From 28b0982ea70c21841fb23802d38f6b424f8200e1 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Wed, 10 Nov 2021 12:34:50 -0600
Subject: [PATCH 001/230] Refactored her[2]k/syr[2]k in terms of gemmt. (#531)

Details:
- Renamed herk macrokernels and supporting files and functions to gemmt,
  which is possible since at the macrokernel level they are identical.
  Then recast herk/her2k/syrk/syr2k in terms of gemmt within the expert
  level-3 oapi (bli_l3_oapi_ex.c) while also redefining them as literal
  functions rather than cpp macros that instantiate multiple functions.
  Thanks to Devin Matthews for his efforts on this issue (#531).
- Check that the maximum stack buffer size is sufficiently large
  relative to the register blocksizes for each datatype, and do so when
  the context is initialized rather than when an operation is called.
  Note that with this change, users who pass their own contexts into
  the expert interfaces currently will *not* have any checks performed.
  Thanks to Devin Matthews for suggesting this change.
---
 config/zen/bli_family_zen.h                   |   4 +-
 config/zen2/bli_family_zen2.h                 |   4 +-
 frame/3/bli_l3.h                              |   4 -
 frame/3/bli_l3_blocksize.c                    |  12 +-
 frame/3/bli_l3_blocksize.h                    |   6 +-
 frame/3/bli_l3_check.c                        |   5 -
 frame/3/bli_l3_cntl.c                         |   4 +-
 frame/3/bli_l3_direct.c                       |   6 +-
 frame/3/bli_l3_direct.h                       |   2 +-
 frame/3/bli_l3_ind.c                          |   7 +-
 frame/3/bli_l3_ind.h                          |   4 -
 frame/3/bli_l3_oapi_ex.c                      | 773 +++++++++++-------
 frame/3/bli_l3_prune.c                        |   6 +-
 frame/3/bli_l3_prune.h                        |   6 +-
 frame/3/bli_l3_thrinfo.h                      |   8 +-
 frame/3/gemm/bli_gemm_blk_var3.c              |   2 +-
 frame/3/gemm/bli_gemm_cntl.c                  |   6 +-
 frame/3/gemmt/bli_gemmt.h                     |   2 +
 frame/3/gemmt/bli_gemmt_front.c               |   2 +-
 .../bli_gemmt_l_ker_var2.c}                   |  16 +-
 .../bli_gemmt_u_ker_var2.c}                   |  16 +-
 .../bli_herk_var.h => gemmt/bli_gemmt_var.h}  |  16 +-
 .../bli_gemmt_x_ker_var2.c}                   |   4 +-
 .../other/bli_gemmt_l_ker_var2.c}             |  12 +-
 .../other/bli_gemmt_u_ker_var2.c}             |  12 +-
 frame/3/her2k/bli_her2k.h                     |  36 -
 frame/3/her2k/bli_her2k_front.c               | 161 ----
 frame/3/her2k/bli_her2k_front.h               |  45 -
 frame/3/herk/bli_herk.h                       |  38 -
 frame/3/herk/bli_herk_front.c                 | 124 ---
 frame/3/herk/bli_herk_front.h                 |  44 -
 .../herk/other/bli_herk_l_ker_var2.1looprr.c  | 420 ----------
 frame/3/herk/other/bli_herk_l_ker_var2rr.c    | 555 -------------
 frame/3/herk/other/bli_herk_l_ker_var2sl.c    | 556 -------------
 .../herk/other/bli_herk_u_ker_var2.1looprr.c  | 420 ----------
 frame/3/herk/other/bli_herk_u_ker_var2rr.c    | 557 -------------
 frame/3/herk/other/bli_herk_u_ker_var2sl.c    | 558 -------------
 frame/3/syr2k/bli_syr2k.h                     |  36 -
 frame/3/syr2k/bli_syr2k_front.c               | 134 ---
 frame/3/syr2k/bli_syr2k_front.h               |  45 -
 frame/3/syrk/bli_syrk.h                       |  36 -
 frame/3/syrk/bli_syrk_front.c                 | 119 ---
 frame/3/syrk/bli_syrk_front.h                 |  58 --
 frame/base/bli_check.c                        |  30 +-
 frame/base/bli_check.h                        |   2 +-
 frame/base/bli_gks.c                          |   5 +
 frame/base/bli_info.c                         |   9 +-
 frame/base/bli_info.h                         |   1 +
 frame/base/bli_part.c                         |  18 +-
 frame/thread/bli_thread.c                     |   4 +-
 .../3/{bli_syrk_small.c => bli_gemmt_small.c} |  88 +-
 sandbox/gemmlike/bls_gemm_check.c             |   5 -
 52 files changed, 647 insertions(+), 4396 deletions(-)
 rename frame/3/{herk/bli_herk_l_ker_var2.c => gemmt/bli_gemmt_l_ker_var2.c} (97%)
 rename frame/3/{herk/bli_herk_u_ker_var2.c => gemmt/bli_gemmt_u_ker_var2.c} (97%)
 rename frame/3/{herk/bli_herk_var.h => gemmt/bli_gemmt_var.h} (90%)
 rename frame/3/{herk/bli_herk_x_ker_var2.c => gemmt/bli_gemmt_x_ker_var2.c} (97%)
 rename frame/3/{herk/other/bli_herk_l_ker_var2.c => gemmt/other/bli_gemmt_l_ker_var2.c} (97%)
 rename frame/3/{herk/other/bli_herk_u_ker_var2.c => gemmt/other/bli_gemmt_u_ker_var2.c} (97%)
 delete mode 100644 frame/3/her2k/bli_her2k.h
 delete mode 100644 frame/3/her2k/bli_her2k_front.c
 delete mode 100644 frame/3/her2k/bli_her2k_front.h
 delete mode 100644 frame/3/herk/bli_herk.h
 delete mode 100644 frame/3/herk/bli_herk_front.c
 delete mode 100644 frame/3/herk/bli_herk_front.h
 delete mode 100644 frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c
 delete mode 100644 frame/3/herk/other/bli_herk_l_ker_var2rr.c
 delete mode 100644 frame/3/herk/other/bli_herk_l_ker_var2sl.c
 delete mode 100644 frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c
 delete mode 100644 frame/3/herk/other/bli_herk_u_ker_var2rr.c
 delete mode 100644 frame/3/herk/other/bli_herk_u_ker_var2sl.c
 delete mode 100644 frame/3/syr2k/bli_syr2k.h
 delete mode 100644 frame/3/syr2k/bli_syr2k_front.c
 delete mode 100644 frame/3/syr2k/bli_syr2k_front.h
 delete mode 100644 frame/3/syrk/bli_syrk.h
 delete mode 100644 frame/3/syrk/bli_syrk_front.c
 delete mode 100644 frame/3/syrk/bli_syrk_front.h
 rename kernels/zen/3/{bli_syrk_small.c => bli_gemmt_small.c} (99%)

diff --git a/config/zen/bli_family_zen.h b/config/zen/bli_family_zen.h
index c82392b60..d1c4ef828 100644
--- a/config/zen/bli_family_zen.h
+++ b/config/zen/bli_family_zen.h
@@ -52,8 +52,8 @@
 
 #define BLIS_SMALL_MATRIX_THRES_TRSM   32768 //128(128+128) => m*(m+n)
 #define BLIS_SMALL_MATRIX_A_THRES_TRSM	128
-#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK	96
-#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK	128
+#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT	96
+#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT	128
 
 //This macro will enable  BLIS DGEMM to choose block sizes for a  single instance mode
 #define BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES 	0
diff --git a/config/zen2/bli_family_zen2.h b/config/zen2/bli_family_zen2.h
index a0f5b574d..d7adddf3c 100644
--- a/config/zen2/bli_family_zen2.h
+++ b/config/zen2/bli_family_zen2.h
@@ -51,8 +51,8 @@
 
 #define BLIS_SMALL_MATRIX_THRES_TRSM   32768 //128(128+128) => m*(m+n)
 #define BLIS_SMALL_MATRIX_A_THRES_TRSM	128
-#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK	96
-#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK	128
+#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT	96
+#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT	128
 
 #define BLIS_ENABLE_SMALL_MATRIX_ROME
 #define BLIS_SMALL_MATRIX_THRES_ROME       400
diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h
index 94e37fc17..da9348844 100644
--- a/frame/3/bli_l3.h
+++ b/frame/3/bli_l3.h
@@ -84,11 +84,7 @@
 // Operation-specific headers.
 #include "bli_gemm.h"
 #include "bli_hemm.h"
-#include "bli_herk.h"
-#include "bli_her2k.h"
 #include "bli_symm.h"
-#include "bli_syrk.h"
-#include "bli_syr2k.h"
 #include "bli_trmm.h"
 #include "bli_trmm3.h"
 #include "bli_trsm.h"
diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c
index 58b658d1d..1986b3b0f 100644
--- a/frame/3/bli_l3_blocksize.c
+++ b/frame/3/bli_l3_blocksize.c
@@ -51,8 +51,8 @@ dim_t bli_l3_determine_kc
 
 	if      ( family == BLIS_GEMM )
 		return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx );
-	else if ( family == BLIS_HERK )
-		return bli_herk_determine_kc( direct, i, dim, a, b, bszid, cntx );
+	else if ( family == BLIS_GEMMT )
+		return bli_gemmt_determine_kc( direct, i, dim, a, b, bszid, cntx );
 	else if ( family == BLIS_TRMM )
 		return bli_trmm_determine_kc( direct, i, dim, a, b, bszid, cntx );
 	else if ( family == BLIS_TRSM )
@@ -91,7 +91,7 @@ dim_t PASTEMAC0(opname) \
 }
 
 GENFRONT( gemm_determine_kc, gemm )
-GENFRONT( herk_determine_kc, herk )
+GENFRONT( gemmt_determine_kc, gemmt )
 GENFRONT( trmm_determine_kc, trmm )
 GENFRONT( trsm_determine_kc, trsm )
 
@@ -201,7 +201,7 @@ dim_t PASTEMAC0(opname) \
 	b_alg = bli_blksz_get_def( dt, bsize ); \
 	b_max = bli_blksz_get_max( dt, bsize ); \
 \
-	/* Notice that for herk, we do not need to perform any special handling
+	/* Notice that for gemmt, we do not need to perform any special handling
 	   for the default and maximum kc blocksizes vis-a-vis MR or NR. */ \
 \
 	/* Call the bli_determine_blocksize_[fb]_sub() helper routine defined
@@ -211,8 +211,8 @@ dim_t PASTEMAC0(opname) \
 	return b_use; \
 }
 
-GENFRONT( herk_determine_kc_f, f )
-GENFRONT( herk_determine_kc_b, b )
+GENFRONT( gemmt_determine_kc_f, f )
+GENFRONT( gemmt_determine_kc_b, b )
 
 // -----------------------------------------------------------------------------
 
diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h
index c3301ee13..3ea3c5aa0 100644
--- a/frame/3/bli_l3_blocksize.h
+++ b/frame/3/bli_l3_blocksize.h
@@ -60,7 +60,7 @@ dim_t PASTEMAC0(opname) \
       );
 
 GENPROT( gemm_determine_kc )
-GENPROT( herk_determine_kc )
+GENPROT( gemmt_determine_kc )
 GENPROT( trmm_determine_kc )
 GENPROT( trsm_determine_kc )
 
@@ -81,8 +81,8 @@ dim_t PASTEMAC0(opname) \
 GENPROT( gemm_determine_kc_f )
 GENPROT( gemm_determine_kc_b )
 
-GENPROT( herk_determine_kc_f )
-GENPROT( herk_determine_kc_b )
+GENPROT( gemmt_determine_kc_f )
+GENPROT( gemmt_determine_kc_b )
 
 GENPROT( trmm_determine_kc_f )
 GENPROT( trmm_determine_kc_b )
diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c
index 413f6a58d..50da4627c 100644
--- a/frame/3/bli_l3_check.c
+++ b/frame/3/bli_l3_check.c
@@ -597,10 +597,5 @@ void bli_l3_basic_check
 
 	e_val = bli_check_object_buffer( c );
 	bli_check_error_code( e_val );
-
-	// Check for sufficiently sized stack buffers
-
-	e_val = bli_check_sufficient_stack_buf_size( bli_obj_dt( a ), cntx );
-	bli_check_error_code( e_val );
 }
 
diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c
index f6bfbedbb..3cdecfbc2 100644
--- a/frame/3/bli_l3_cntl.c
+++ b/frame/3/bli_l3_cntl.c
@@ -54,7 +54,7 @@ void bli_l3_cntl_create_if
 	if ( cntl_orig == NULL )
 	{
 		if ( family == BLIS_GEMM ||
-		     family == BLIS_HERK ||
+		     family == BLIS_GEMMT ||
 		     family == BLIS_TRMM )
 		{
 			*cntl_use = bli_gemm_cntl_create( rntm, family, schema_a, schema_b );
@@ -97,7 +97,7 @@ void bli_l3_cntl_free
 	opid_t family = bli_cntl_family( cntl_use );
 
 	if ( family == BLIS_GEMM ||
-	     family == BLIS_HERK ||
+	     family == BLIS_GEMMT ||
 	     family == BLIS_TRMM )
 	{
 		bli_gemm_cntl_free( rntm, cntl_use, thread );
diff --git a/frame/3/bli_l3_direct.c b/frame/3/bli_l3_direct.c
index 7baf2d6ef..0d0a71921 100644
--- a/frame/3/bli_l3_direct.c
+++ b/frame/3/bli_l3_direct.c
@@ -46,7 +46,7 @@ dir_t bli_l3_direct
 	opid_t family = bli_cntl_family( cntl );
 
 	if      ( family == BLIS_GEMM ) return bli_gemm_direct( a, b, c );
-	else if ( family == BLIS_HERK ) return bli_herk_direct( a, b, c );
+	else if ( family == BLIS_GEMMT ) return bli_gemmt_direct( a, b, c );
 	else if ( family == BLIS_TRMM ) return bli_trmm_direct( a, b, c );
 	else if ( family == BLIS_TRSM ) return bli_trsm_direct( a, b, c );
 
@@ -68,14 +68,14 @@ dir_t bli_gemm_direct
 	return BLIS_FWD;
 }
 
-dir_t bli_herk_direct
+dir_t bli_gemmt_direct
      (
        obj_t* a,
        obj_t* b,
        obj_t* c
      )
 {
-	// For herk, movement may be forwards (or backwards).
+	// For gemmt, movement may be forwards (or backwards).
 
 	return BLIS_FWD;
 }
diff --git a/frame/3/bli_l3_direct.h b/frame/3/bli_l3_direct.h
index 7383c4a9f..39798407a 100644
--- a/frame/3/bli_l3_direct.h
+++ b/frame/3/bli_l3_direct.h
@@ -53,7 +53,7 @@ dir_t PASTEMAC0(opname) \
       );
 
 GENPROT( gemm_direct )
-GENPROT( herk_direct )
+GENPROT( gemmt_direct )
 GENPROT( trmm_direct )
 GENPROT( trsm_direct )
 
diff --git a/frame/3/bli_l3_ind.c b/frame/3/bli_l3_ind.c
index 7c30f61af..fbf73be60 100644
--- a/frame/3/bli_l3_ind.c
+++ b/frame/3/bli_l3_ind.c
@@ -55,7 +55,8 @@ static bool bli_l3_ind_oper_impl[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] =
 static BLIS_THREAD_LOCAL
 bool bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] =
 {
-        /*   gemm  gemmt  hemm  herk  her2k  symm  syrk  syr2k  trmm3  trmm  trsm  */
+        /*   gemm           gemmt          hemm           herk           her2k          symm
+             syrk           syr2k          trmm3          trmm           trsm  */
         /*    c     z    */
 /* 1m   */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
              {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}  },
@@ -80,11 +81,7 @@ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ) \
 GENFUNC( gemm, BLIS_GEMM )
 GENFUNC( gemmt, BLIS_GEMMT )
 GENFUNC( hemm, BLIS_HEMM )
-GENFUNC( herk, BLIS_HERK )
-GENFUNC( her2k, BLIS_HER2K )
 GENFUNC( symm, BLIS_SYMM )
-GENFUNC( syrk, BLIS_SYRK )
-GENFUNC( syr2k, BLIS_SYR2K )
 GENFUNC( trmm3, BLIS_TRMM3 )
 GENFUNC( trmm, BLIS_TRMM )
 GENFUNC( trsm, BLIS_TRSM )
diff --git a/frame/3/bli_l3_ind.h b/frame/3/bli_l3_ind.h
index f80757eb0..a14ad783c 100644
--- a/frame/3/bli_l3_ind.h
+++ b/frame/3/bli_l3_ind.h
@@ -47,11 +47,7 @@ ind_t   PASTEMAC(opname,ind_find_avail)( num_t dt );
 GENPROT( gemm )
 GENPROT( gemmt )
 GENPROT( hemm )
-GENPROT( herk )
-GENPROT( her2k )
 GENPROT( symm )
-GENPROT( syrk )
-GENPROT( syr2k )
 GENPROT( trmm3 )
 GENPROT( trmm )
 GENPROT( trsm )
diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c
index f6cfd6640..cd0df7017 100644
--- a/frame/3/bli_l3_oapi_ex.c
+++ b/frame/3/bli_l3_oapi_ex.c
@@ -38,301 +38,508 @@
 // Define object-based interfaces (expert).
 //
 
-#undef  GENFRONT
-#define GENFRONT( opname ) \
-\
-void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
-     ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  beta, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
-     ) \
-{ \
-	bli_init_once(); \
-\
-	/* If the rntm is non-NULL, it may indicate that we should forgo sup
-	   handling altogether. */ \
-	bool enable_sup = TRUE; \
-	if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); \
-\
-	if ( enable_sup ) \
-	{ \
-		/* Execute the small/unpacked oapi handler. If it finds that the problem
-		   does not fall within the thresholds that define "small", or for some
-		   other reason decides not to use the small/unpacked implementation,
-		   the function returns with BLIS_FAILURE, which causes execution to
-		   proceed towards the conventional implementation. */ \
-		err_t result = PASTEMAC(opname,sup)( alpha, a, b, beta, c, cntx, rntm ); \
-		if ( result == BLIS_SUCCESS ) \
-		{ \
-			return; \
-		} \
-	} \
-\
-	/* Initialize a local runtime with global settings if necessary. Note
-	   that in the case that a runtime is passed in, we make a local copy. */ \
-	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
-\
-	/* Default to using native execution. */ \
-	num_t dt = bli_obj_dt( c ); \
-	ind_t im = BLIS_NAT; \
-\
-	/* If each matrix operand has a complex storage datatype, try to get an
-	   induced method (if one is available and enabled). NOTE: Allowing
-	   precisions to vary while using 1m, which is what we do here, is unique
-	   to gemm; other level-3 operations use 1m only if all storage datatypes
-	   are equal (and they ignore the computation precision). */ \
-	if ( bli_obj_is_complex( c ) && \
-	     bli_obj_is_complex( a ) && \
-	     bli_obj_is_complex( b ) ) \
-	{ \
-		/* Find the highest priority induced method that is both enabled and
-		   available for the current operation. (If an induced method is
-		   available but not enabled, or simply unavailable, BLIS_NAT will
-		   be returned here.) */ \
-		im = PASTEMAC(opname,ind_find_avail)( dt ); \
-	} \
-\
-	/* If necessary, obtain a valid context from the gks using the induced
-	   method id determined above. */ \
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \
-\
-	/* Check the operands. */ \
-	if ( bli_error_checking_is_enabled() ) \
-		PASTEMAC(opname,_check)( alpha, a, b, beta, c, cntx ); \
-\
-	/* Invoke the operation's front-end and request the default control tree. */ \
-	PASTEMAC(opname,_front)( alpha, a, b, beta, c, cntx, rntm, NULL ); \
-}
-
 // If a sandbox was enabled, we forgo defining bli_gemm_ex() since it will be
 // defined in the sandbox environment.
 #ifndef BLIS_ENABLE_SANDBOX
-GENFRONT( gemm )
+
+void PASTEMAC(gemm,BLIS_OAPI_EX_SUF)
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	bli_init_once();
+
+	// If the rntm is non-NULL, it may indicate that we should forgo sup
+	// handling altogether.
+	bool enable_sup = TRUE;
+	if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm );
+
+	if ( enable_sup )
+	{
+		// Execute the small/unpacked oapi handler. If it finds that the problem
+		// does not fall within the thresholds that define "small", or for some
+		// other reason decides not to use the small/unpacked implementation,
+		// the function returns with BLIS_FAILURE, which causes execution to
+		// proceed towards the conventional implementation.
+		err_t result = bli_gemmsup( alpha, a, b, beta, c, cntx, rntm );
+		if ( result == BLIS_SUCCESS )
+		{
+			return;
+		}
+	}
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_l;
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+
+	// Default to using native execution.
+	num_t dt = bli_obj_dt( c );
+	ind_t im = BLIS_NAT;
+
+	// If each matrix operand has a complex storage datatype, try to get an
+	// induced method (if one is available and enabled). NOTE: Allowing
+	// precisions to vary while using 1m, which is what we do here, is unique
+	// to gemm; other level-3 operations use 1m only if all storage datatypes
+	// are equal (and they ignore the computation precision).
+	if ( bli_obj_is_complex( c ) &&
+	     bli_obj_is_complex( a ) &&
+	     bli_obj_is_complex( b ) )
+	{
+		// Find the highest priority induced method that is both enabled and
+		// available for the current operation. (If an induced method is
+		// available but not enabled, or simply unavailable, BLIS_NAT will
+		// be returned here.)
+		im = bli_gemmind_find_avail( dt );
+	}
+
+	// If necessary, obtain a valid context from the gks using the induced
+	// method id determined above.
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_gemm_check( alpha, a, b, beta, c, cntx );
+
+	// Invoke the operation's front-end and request the default control tree.
+	bli_gemm_front( alpha, a, b, beta, c, cntx, rntm, NULL );
+}
+
 #endif
 
 
-#undef  GENFRONT
-#define GENFRONT( opname ) \
-\
-void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
-     ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  beta, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
-     ) \
-{ \
-	bli_init_once(); \
-\
-	/* Initialize a local runtime with global settings if necessary. Note
-	   that in the case that a runtime is passed in, we make a local copy. */ \
-	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
-\
-	/* Default to using native execution. */ \
-	num_t dt = bli_obj_dt( c ); \
-	ind_t im = BLIS_NAT; \
-\
-	/* If all matrix operands are complex and of the same storage datatype, try
-	   to get an induced method (if one is available and enabled). */ \
-	if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \
-	     bli_obj_dt( b ) == bli_obj_dt( c ) && \
-	     bli_obj_is_complex( c ) ) \
-	{ \
-		/* Find the highest priority induced method that is both enabled and
-		   available for the current operation. (If an induced method is
-		   available but not enabled, or simply unavailable, BLIS_NAT will
-		   be returned here.) */ \
-		im = PASTEMAC(opname,ind_find_avail)( dt ); \
-	} \
-\
-	/* If necessary, obtain a valid context from the gks using the induced
-	   method id determined above. */ \
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \
-\
-	/* Check the operands. */ \
-	if ( bli_error_checking_is_enabled() ) \
-		PASTEMAC(opname,_check)( alpha, a, b, beta, c, cntx ); \
-\
-	/* Invoke the operation's front-end and request the default control tree. */ \
-	PASTEMAC(opname,_front)( alpha, a, b, beta, c, cntx, rntm, NULL ); \
+void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	bli_init_once();
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_l;
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+
+	// Default to using native execution.
+	num_t dt = bli_obj_dt( c );
+	ind_t im = BLIS_NAT;
+
+	// If all matrix operands are complex and of the same storage datatype, try
+	// to get an induced method (if one is available and enabled).
+	if ( bli_obj_dt( a ) == bli_obj_dt( c ) &&
+	     bli_obj_dt( b ) == bli_obj_dt( c ) &&
+	     bli_obj_is_complex( c ) )
+	{
+		// Find the highest priority induced method that is both enabled and
+		// available for the current operation. (If an induced method is
+		// available but not enabled, or simply unavailable, BLIS_NAT will
+		// be returned here.)
+		im = bli_gemmtind_find_avail( dt );
+	}
+
+	// If necessary, obtain a valid context from the gks using the induced
+	// method id determined above.
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_gemmt_check( alpha, a, b, beta, c, cntx );
+
+	// Invoke the operation's front-end and request the default control tree.
+	bli_gemmt_front( alpha, a, b, beta, c, cntx, rntm, NULL );
 }
 
-GENFRONT( gemmt )
-GENFRONT( her2k )
-GENFRONT( syr2k )
-
-
-#undef  GENFRONT
-#define GENFRONT( opname ) \
-\
-void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
-     ( \
-       side_t  side, \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  beta, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
-     ) \
-{ \
-	bli_init_once(); \
-\
-	/* Initialize a local runtime with global settings if necessary. Note
-	   that in the case that a runtime is passed in, we make a local copy. */ \
-	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
-\
-	/* Default to using native execution. */ \
-	num_t dt = bli_obj_dt( c ); \
-	ind_t im = BLIS_NAT; \
-\
-	/* If all matrix operands are complex and of the same storage datatype, try
-	   to get an induced method (if one is available and enabled). */ \
-	if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \
-	     bli_obj_dt( b ) == bli_obj_dt( c ) && \
-	     bli_obj_is_complex( c ) ) \
-	{ \
-		/* Find the highest priority induced method that is both enabled and
-		   available for the current operation. (If an induced method is
-		   available but not enabled, or simply unavailable, BLIS_NAT will
-		   be returned here.) */ \
-		im = PASTEMAC(opname,ind_find_avail)( dt ); \
-	} \
-\
-	/* If necessary, obtain a valid context from the gks using the induced
-	   method id determined above. */ \
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \
-\
-	/* Check the operands. */ \
-	if ( bli_error_checking_is_enabled() ) \
-		PASTEMAC(opname,_check)( side, alpha, a, b, beta, c, cntx ); \
-\
-	/* Invoke the operation's front-end and request the default control tree. */ \
-	PASTEMAC(opname,_front)( side, alpha, a, b, beta, c, cntx, rntm, NULL ); \
+
+void PASTEMAC(her2k,BLIS_OAPI_EX_SUF)
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	bli_init_once();
+
+	obj_t ah;
+	obj_t bh;
+	obj_t alphah;
+
+	// Check parameters.
+	if ( bli_error_checking_is_enabled() )
+		bli_her2k_check( alpha, a, b, beta, c, cntx );
+
+	bli_obj_alias_to( alpha, &alphah );
+	bli_obj_toggle_conj( &alphah );
+
+	bli_obj_alias_to( a, &ah );
+	bli_obj_toggle_trans( &ah );
+	bli_obj_toggle_conj( &ah );
+
+	bli_obj_alias_to( b, &bh );
+	bli_obj_toggle_trans( &bh );
+	bli_obj_toggle_conj( &bh );
+
+	// Invoke gemmt twice, using beta only the first time.
+	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)(   alpha, a, &bh,      beta, c, cntx, rntm );
+	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( &alphah, b, &ah, &BLIS_ONE, c, cntx, rntm );
+
+	// The Hermitian rank-2k product was computed as alpha*A*B'+alpha'*B*A', even for
+	// the diagonal elements. Mathematically, the imaginary components of
+	// diagonal elements of a Hermitian rank-2k product should always be
+	// zero. However, in practice, they sometimes accumulate meaningless
+	// non-zero values. To prevent this, we explicitly set those values
+	// to zero before returning.
+	bli_setid( &BLIS_ZERO, c );
 }
 
-GENFRONT( hemm )
-GENFRONT( symm )
-GENFRONT( trmm3 )
-
-
-#undef  GENFRONT
-#define GENFRONT( opname ) \
-\
-void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
-     ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  beta, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
-     ) \
-{ \
-	bli_init_once(); \
-\
-	/* Initialize a local runtime with global settings if necessary. Note
-	   that in the case that a runtime is passed in, we make a local copy. */ \
-	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
-\
-	/* Default to using native execution. */ \
-	num_t dt = bli_obj_dt( c ); \
-	ind_t im = BLIS_NAT; \
-\
-	/* If all matrix operands are complex and of the same storage datatype, try
-	   to get an induced method (if one is available and enabled). */ \
-	if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \
-	     bli_obj_is_complex( c ) ) \
-	{ \
-		/* Find the highest priority induced method that is both enabled and
-		   available for the current operation. (If an induced method is
-		   available but not enabled, or simply unavailable, BLIS_NAT will
-		   be returned here.) */ \
-		im = PASTEMAC(opname,ind_find_avail)( dt ); \
-	} \
-\
-	/* If necessary, obtain a valid context from the gks using the induced
-	   method id determined above. */ \
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \
-\
-	/* Check the operands. */ \
-	if ( bli_error_checking_is_enabled() ) \
-		PASTEMAC(opname,_check)( alpha, a, beta, c, cntx ); \
-\
-	/* Invoke the operation's front-end and request the default control tree. */ \
-	PASTEMAC(opname,_front)( alpha, a, beta, c, cntx, rntm, NULL ); \
+
+void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF)
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	bli_init_once();
+
+	obj_t at;
+	obj_t bt;
+
+	// Check parameters.
+	if ( bli_error_checking_is_enabled() )
+		bli_syr2k_check( alpha, a, b, beta, c, cntx );
+
+	bli_obj_alias_to( b, &bt );
+	bli_obj_toggle_trans( &bt );
+
+	bli_obj_alias_to( a, &at );
+	bli_obj_toggle_trans( &at );
+
+	// Invoke gemmt twice, using beta only the first time.
+	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &bt,      beta, c, cntx, rntm );
+	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, b, &at, &BLIS_ONE, c, cntx, rntm );
 }
 
-GENFRONT( herk )
-GENFRONT( syrk )
-
-
-#undef  GENFRONT
-#define GENFRONT( opname ) \
-\
-void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
-     ( \
-       side_t  side, \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
-     ) \
-{ \
-	bli_init_once(); \
-\
-	/* Initialize a local runtime with global settings if necessary. Note
-	   that in the case that a runtime is passed in, we make a local copy. */ \
-	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
-\
-	/* Default to using native execution. */ \
-	num_t dt = bli_obj_dt( b ); \
-	ind_t im = BLIS_NAT; \
-\
-	/* If all matrix operands are complex and of the same storage datatype, try
-	   to get an induced method (if one is available and enabled). */ \
-	if ( bli_obj_dt( a ) == bli_obj_dt( b ) && \
-	     bli_obj_is_complex( b ) ) \
-	{ \
-		/* Find the highest priority induced method that is both enabled and
-		   available for the current operation. (If an induced method is
-		   available but not enabled, or simply unavailable, BLIS_NAT will
-		   be returned here.) */ \
-		im = PASTEMAC(opname,ind_find_avail)( dt ); \
-	} \
-\
-	/* If necessary, obtain a valid context from the gks using the induced
-	   method id determined above. */ \
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \
-\
-	/* Check the operands. */ \
-	if ( bli_error_checking_is_enabled() ) \
-		PASTEMAC(opname,_check)( side, alpha, a, b, cntx ); \
-\
-	/* Invoke the operation's front-end and request the default control tree. */ \
-	PASTEMAC(opname,_front)( side, alpha, a, b, cntx, rntm, NULL ); \
+
+void PASTEMAC(hemm,BLIS_OAPI_EX_SUF)
+     (
+       side_t  side,
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	bli_init_once();
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_l;
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+
+	// Default to using native execution.
+	num_t dt = bli_obj_dt( c );
+	ind_t im = BLIS_NAT;
+
+	// If all matrix operands are complex and of the same storage datatype, try
+	// to get an induced method (if one is available and enabled).
+	if ( bli_obj_dt( a ) == bli_obj_dt( c ) &&
+	     bli_obj_dt( b ) == bli_obj_dt( c ) &&
+	     bli_obj_is_complex( c ) )
+	{
+		// Find the highest priority induced method that is both enabled and
+		// available for the current operation. (If an induced method is
+		// available but not enabled, or simply unavailable, BLIS_NAT will
+		// be returned here.)
+		im = bli_hemmind_find_avail( dt );
+	}
+
+	// If necessary, obtain a valid context from the gks using the induced
+	// method id determined above.
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_hemm_check( side, alpha, a, b, beta, c, cntx );
+
+	// Invoke the operation's front-end and request the default control tree.
+	bli_hemm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL );
 }
 
-GENFRONT( trmm )
-GENFRONT( trsm )
 
+void PASTEMAC(symm,BLIS_OAPI_EX_SUF)
+     (
+       side_t  side,
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	// Expert object-API entry point for symm: set up a private runtime, pick
+	// native or induced execution, resolve the context, optionally check the
+	// operands, and then call the symm front-end.
+	bli_init_once();
+
+	// Always work on a local rntm_t: a caller-supplied runtime is copied, and
+	// a NULL runtime is replaced by one initialized from the global settings.
+	rntm_t rntm_local;
+	if ( rntm != NULL ) rntm_local = *rntm;
+	else                bli_rntm_init_from_global( &rntm_local );
+	rntm = &rntm_local;
+
+	// The datatype of C drives the induced-method and context queries below.
+	const num_t dt_c = bli_obj_dt( c );
+
+	// Keep native execution unless A, B, and C share one complex datatype; in
+	// that case ask for the best enabled induced method (BLIS_NAT if none).
+	ind_t method = BLIS_NAT;
+	if ( bli_obj_dt( a ) == dt_c &&
+	     bli_obj_dt( b ) == dt_c &&
+	     bli_obj_is_complex( c ) )
+	{
+		method = bli_symmind_find_avail( dt_c );
+	}
+
+	// A caller-supplied context is used as-is; only a NULL context is replaced
+	// by one queried from the gks for the method and datatype chosen above.
+	if ( cntx == NULL ) { cntx = bli_gks_query_ind_cntx( method, dt_c ); }
+
+	// Validate the operands when error checking is enabled.
+	if ( bli_error_checking_is_enabled() )
+		bli_symm_check( side, alpha, a, b, beta, c, cntx );
+
+	// Hand off to the front-end; the NULL cntl requests the default control tree.
+	bli_symm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL );
+}
+
+
+void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF)
+     (
+       side_t  side,
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	// Expert object-API entry point for trmm3: set up a private runtime, pick
+	// native or induced execution, resolve the context, optionally check the
+	// operands, and then call the trmm3 front-end.
+	bli_init_once();
+
+	// Always work on a local rntm_t: a caller-supplied runtime is copied, and
+	// a NULL runtime is replaced by one initialized from the global settings.
+	rntm_t rntm_local;
+	if ( rntm != NULL ) rntm_local = *rntm;
+	else                bli_rntm_init_from_global( &rntm_local );
+	rntm = &rntm_local;
+
+	// The datatype of C drives the induced-method and context queries below.
+	const num_t dt_c = bli_obj_dt( c );
+
+	// Keep native execution unless A, B, and C share one complex datatype; in
+	// that case ask for the best enabled induced method (BLIS_NAT if none).
+	ind_t method = BLIS_NAT;
+	if ( bli_obj_dt( a ) == dt_c &&
+	     bli_obj_dt( b ) == dt_c &&
+	     bli_obj_is_complex( c ) )
+	{
+		method = bli_trmm3ind_find_avail( dt_c );
+	}
+
+	// A caller-supplied context is used as-is; only a NULL context is replaced
+	// by one queried from the gks for the method and datatype chosen above.
+	if ( cntx == NULL ) { cntx = bli_gks_query_ind_cntx( method, dt_c ); }
+
+	// Validate the operands when error checking is enabled.
+	if ( bli_error_checking_is_enabled() )
+		bli_trmm3_check( side, alpha, a, b, beta, c, cntx );
+
+	// Hand off to the front-end; the NULL cntl requests the default control tree.
+	bli_trmm3_front( side, alpha, a, b, beta, c, cntx, rntm, NULL );
+}
+
+
+void PASTEMAC(herk,BLIS_OAPI_EX_SUF)
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	// Expert object-API entry point for herk, expressed as a gemmt call whose
+	// second operand is an alias of A with trans and conj both toggled.
+	bli_init_once();
+
+	// Validate the operands when error checking is enabled.
+	if ( bli_error_checking_is_enabled() )
+	{
+		bli_herk_check( alpha, a, beta, c, cntx );
+	}
+
+	// Alias A, then toggle the alias's transposition and conjugation.
+	obj_t a_herm;
+	bli_obj_alias_to( a, &a_herm );
+	bli_obj_toggle_trans( &a_herm );
+	bli_obj_toggle_conj( &a_herm );
+
+	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &a_herm, beta, c, cntx, rntm );
+
+	// The product was formed as Re(alpha)*A*A' even on the diagonal, whose
+	// imaginary parts may hold meaningless non-zero values; zero them here.
+	bli_setid( &BLIS_ZERO, c );
+}
+
+
+void PASTEMAC(syrk,BLIS_OAPI_EX_SUF)
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	// Expert object-API syrk: computed by gemmt on A and a trans-toggled alias.
+	bli_init_once();
+
+	// Validate the operands when error checking is enabled.
+	if ( bli_error_checking_is_enabled() )
+		bli_syrk_check( alpha, a, beta, c, cntx );
+
+	// Alias A and toggle the alias's transposition to form the second operand.
+	obj_t a_trans;
+	bli_obj_alias_to( a, &a_trans );
+	bli_obj_toggle_trans( &a_trans );
+	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &a_trans, beta, c, cntx, rntm );
+}
+
+
+void PASTEMAC(trmm,BLIS_OAPI_EX_SUF)
+     (
+       side_t  side,
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	// Expert object-API entry point for trmm: set up a private runtime, pick
+	// native or induced execution, resolve the context, optionally check the
+	// operands, and then call the trmm front-end.
+	bli_init_once();
+
+	// Always work on a local rntm_t: a caller-supplied runtime is copied, and
+	// a NULL runtime is replaced by one initialized from the global settings.
+	rntm_t rntm_local;
+	if ( rntm != NULL ) rntm_local = *rntm;
+	else                bli_rntm_init_from_global( &rntm_local );
+	rntm = &rntm_local;
+
+	// The datatype of B drives the induced-method and context queries below.
+	const num_t dt_b = bli_obj_dt( b );
+
+	// Keep native execution unless A and B share one complex datatype; in
+	// that case ask for the best enabled induced method (BLIS_NAT if none).
+	ind_t method = BLIS_NAT;
+	if ( bli_obj_dt( a ) == dt_b &&
+	     bli_obj_is_complex( b ) )
+	{
+		method = bli_trmmind_find_avail( dt_b );
+	}
+
+	// A caller-supplied context is used as-is; only a NULL context is replaced
+	// by one queried from the gks for the method and datatype chosen above.
+	if ( cntx == NULL ) { cntx = bli_gks_query_ind_cntx( method, dt_b ); }
+
+	// Validate the operands when error checking is enabled.
+	if ( bli_error_checking_is_enabled() )
+		bli_trmm_check( side, alpha, a, b, cntx );
+
+	// Hand off to the front-end; the NULL cntl requests the default control tree.
+	bli_trmm_front( side, alpha, a, b, cntx, rntm, NULL );
+}
+
+
+void PASTEMAC(trsm,BLIS_OAPI_EX_SUF)
+     (
+       side_t  side,
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	// Expert object-API entry point for trsm: set up a private runtime, pick
+	// native or induced execution, resolve the context, optionally check the
+	// operands, and then call the trsm front-end.
+	bli_init_once();
+
+	// Always work on a local rntm_t: a caller-supplied runtime is copied, and
+	// a NULL runtime is replaced by one initialized from the global settings.
+	rntm_t rntm_local;
+	if ( rntm != NULL ) rntm_local = *rntm;
+	else                bli_rntm_init_from_global( &rntm_local );
+	rntm = &rntm_local;
+
+	// The datatype of B drives the induced-method and context queries below.
+	const num_t dt_b = bli_obj_dt( b );
+
+	// Keep native execution unless A and B share one complex datatype; in
+	// that case ask for the best enabled induced method (BLIS_NAT if none).
+	ind_t method = BLIS_NAT;
+	if ( bli_obj_dt( a ) == dt_b &&
+	     bli_obj_is_complex( b ) )
+	{
+		method = bli_trsmind_find_avail( dt_b );
+	}
+
+	// A caller-supplied context is used as-is; only a NULL context is replaced
+	// by one queried from the gks for the method and datatype chosen above.
+	if ( cntx == NULL ) { cntx = bli_gks_query_ind_cntx( method, dt_b ); }
+
+	// Validate the operands when error checking is enabled.
+	if ( bli_error_checking_is_enabled() )
+		bli_trsm_check( side, alpha, a, b, cntx );
+
+	// Hand off to the front-end; the NULL cntl requests the default control tree.
+	bli_trsm_front( side, alpha, a, b, cntx, rntm, NULL );
+}
diff --git a/frame/3/bli_l3_prune.c b/frame/3/bli_l3_prune.c
index fa008fd15..6ca8244cb 100644
--- a/frame/3/bli_l3_prune.c
+++ b/frame/3/bli_l3_prune.c
@@ -47,7 +47,7 @@ void bli_l3_prune_unref_mparts_m
 	opid_t family = bli_cntl_family( cntl );
 
 	if      ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm.
-	else if ( family == BLIS_HERK ) bli_herk_prune_unref_mparts_m( a, b, c );
+	else if ( family == BLIS_GEMMT ) bli_gemmt_prune_unref_mparts_m( a, b, c );
 	else if ( family == BLIS_TRMM ) bli_trmm_prune_unref_mparts_m( a, b, c );
 	else if ( family == BLIS_TRSM ) bli_trsm_prune_unref_mparts_m( a, b, c );
 }
@@ -68,7 +68,7 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \
 	opid_t family = bli_cntl_family( cntl ); \
 \
 	if      ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. */ \
-	else if ( family == BLIS_HERK ) PASTEMAC(herk_prune_unref_mparts_,dim)( a, b, c ); \
+	else if ( family == BLIS_GEMMT ) PASTEMAC(gemmt_prune_unref_mparts_,dim)( a, b, c ); \
 	else if ( family == BLIS_TRMM ) PASTEMAC(trmm_prune_unref_mparts_,dim)( a, b, c ); \
 	else if ( family == BLIS_TRSM ) PASTEMAC(trsm_prune_unref_mparts_,dim)( a, b, c ); \
 }
@@ -152,7 +152,7 @@ void PASTEMAC(opname,_prune_unref_mparts_k) \
 	   for the k dimension. */ \
 }
 
-GENFRONT( herk )
+GENFRONT( gemmt )
 
 // -----------------------------------------------------------------------------
 
diff --git a/frame/3/bli_l3_prune.h b/frame/3/bli_l3_prune.h
index 340ecd4db..ad8f07dc4 100644
--- a/frame/3/bli_l3_prune.h
+++ b/frame/3/bli_l3_prune.h
@@ -64,9 +64,9 @@ GENPROT( gemm, m )
 GENPROT( gemm, n )
 GENPROT( gemm, k )
 
-GENPROT( herk, m )
-GENPROT( herk, n )
-GENPROT( herk, k )
+GENPROT( gemmt, m )
+GENPROT( gemmt, n )
+GENPROT( gemmt, k )
 
 GENPROT( trmm, m )
 GENPROT( trmm, n )
diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h
index 4726e1042..37a3909fd 100644
--- a/frame/3/bli_l3_thrinfo.h
+++ b/frame/3/bli_l3_thrinfo.h
@@ -44,12 +44,12 @@
 #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
 #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
 
-// herk
+// gemmt
 
-// NOTE: The definition of bli_herk_get_next_?_upanel() does not need to
+// NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to
 // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
-#define bli_herk_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
-#define bli_herk_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
+#define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( (a1) + (step) * (inc) )
+#define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( (b1) + (step) * (inc) )
 
 // trmm
 
diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c
index 94f0af409..7883dfd6d 100644
--- a/frame/3/gemm/bli_gemm_blk_var3.c
+++ b/frame/3/gemm/bli_gemm_blk_var3.c
@@ -93,7 +93,7 @@ void bli_gemm_blk_var3
 		// can simply overwrite the internal beta scalar with BLIS_ONE once
 		// it has been used in the first iteration. However...
 
-		// Unlike variant 3 of gemm and herk, which reset the internal scalar
+		// Unlike variant 3 of gemm and gemmt, which reset the internal scalar
 		// on C at the end of the first iteration so that subsequent iterations
 		// do not erroneously apply beta more than once, it is important that
 		// this behavior not be applied to trmm. That is because the order of
diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c
index d7cd0a92c..27678e0bf 100644
--- a/frame/3/gemm/bli_gemm_cntl.c
+++ b/frame/3/gemm/bli_gemm_cntl.c
@@ -63,7 +63,7 @@ cntl_t* bli_gemmbp_cntl_create
 	// Use the function pointers to the macrokernels that use slab
 	// assignment of micropanels to threads in the jr and ir loops.
 	if      ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2;
-	else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2;
+	else if ( family == BLIS_GEMMT ) macro_kernel_fp = bli_gemmt_x_ker_var2;
 	else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2;
 	else /* should never execute */ macro_kernel_fp = NULL;
 
@@ -167,8 +167,8 @@ cntl_t* bli_gemmpb_cntl_create
 {
 	void_fp macro_kernel_p = bli_gemm_ker_var1;
 
-	// Change the macro-kernel if the operation family is herk or trmm.
-	//if      ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2;
+	// Change the macro-kernel if the operation family is gemmt or trmm.
+	//if      ( family == BLIS_GEMMT ) macro_kernel_p = bli_gemmt_x_ker_var2;
 	//else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2;
 
 	// Create two nodes for the macro-kernel.
diff --git a/frame/3/gemmt/bli_gemmt.h b/frame/3/gemmt/bli_gemmt.h
index ed522ee13..32ab3865e 100644
--- a/frame/3/gemmt/bli_gemmt.h
+++ b/frame/3/gemmt/bli_gemmt.h
@@ -34,3 +34,5 @@
 
 #include "bli_gemmt_front.h"
 
+#include "bli_gemmt_var.h"
+
diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c
index 84385bf17..9f18a717d 100644
--- a/frame/3/gemmt/bli_gemmt_front.c
+++ b/frame/3/gemmt/bli_gemmt_front.c
@@ -108,7 +108,7 @@ void bli_gemmt_front
 	bli_l3_thread_decorator
 	(
 	  bli_gemm_int,
-	  BLIS_HERK, // operation family id (gemmt uses 'herk' family)
+	  BLIS_GEMMT, // operation family id
 	  alpha,
 	  &a_local,
 	  &b_local,
diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
similarity index 97%
rename from frame/3/herk/bli_herk_l_ker_var2.c
rename to frame/3/gemmt/bli_gemmt_l_ker_var2.c
index 5a05672d7..a995e6c52 100644
--- a/frame/3/herk/bli_herk_l_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
@@ -35,7 +35,7 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T herk_fp
+#define FUNCPTR_T gemmt_fp
 
 typedef void (*FUNCPTR_T)
      (
@@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T)
        thrinfo_t* thread
      );
 
-static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2);
 
 
-void bli_herk_l_ker_var2
+void bli_gemmt_l_ker_var2
      (
        obj_t*  a,
        obj_t*  b,
@@ -359,11 +359,11 @@ void PASTEMAC(ch,varname) \
 			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
 			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
 			{ \
 				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
 				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
 					b2 = b_cast; \
 			} \
@@ -464,11 +464,11 @@ void PASTEMAC(ch,varname) \
 			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
 			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
 			{ \
 				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
 				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
 					b2 = b_cast; \
 			} \
@@ -551,5 +551,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )
+INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 )
 
diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
similarity index 97%
rename from frame/3/herk/bli_herk_u_ker_var2.c
rename to frame/3/gemmt/bli_gemmt_u_ker_var2.c
index 9e685a944..3115fc67b 100644
--- a/frame/3/herk/bli_herk_u_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
@@ -35,7 +35,7 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T herk_fp
+#define FUNCPTR_T gemmt_fp
 
 typedef void (*FUNCPTR_T)
      (
@@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T)
        thrinfo_t* thread
      );
 
-static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2);
 
 
-void bli_herk_u_ker_var2
+void bli_gemmt_u_ker_var2
      (
        obj_t*  a,
        obj_t*  b,
@@ -359,11 +359,11 @@ void PASTEMAC(ch,varname) \
 			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
 			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
 			{ \
 				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
 				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
 					b2 = b_cast; \
 			} \
@@ -490,11 +490,11 @@ void PASTEMAC(ch,varname) \
 			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
 			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
 			{ \
 				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
 				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
 					b2 = b_cast; \
 			} \
@@ -554,5 +554,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )
+INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 )
 
diff --git a/frame/3/herk/bli_herk_var.h b/frame/3/gemmt/bli_gemmt_var.h
similarity index 90%
rename from frame/3/herk/bli_herk_var.h
rename to frame/3/gemmt/bli_gemmt_var.h
index 00b85fc5c..60c68c9f5 100644
--- a/frame/3/herk/bli_herk_var.h
+++ b/frame/3/gemmt/bli_gemmt_var.h
@@ -52,16 +52,10 @@ void PASTEMAC0(opname) \
        thrinfo_t* thread  \
      );
 
-//GENPROT( herk_blk_var1 )
-//GENPROT( herk_blk_var2 )
-//GENPROT( herk_blk_var3 )
+GENPROT( gemmt_x_ker_var2 )
 
-GENPROT( herk_x_ker_var2 )
-
-GENPROT( herk_l_ker_var2 )
-GENPROT( herk_u_ker_var2 )
-//GENPROT( herk_packa )
-//GENPROT( herk_packb )
+GENPROT( gemmt_l_ker_var2 )
+GENPROT( gemmt_u_ker_var2 )
 
 
 //
@@ -91,6 +85,6 @@ void PASTEMAC(ch,varname) \
        thrinfo_t* thread  \
      );
 
-INSERT_GENTPROT_BASIC0( herk_l_ker_var2 )
-INSERT_GENTPROT_BASIC0( herk_u_ker_var2 )
+INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
+INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )
 
diff --git a/frame/3/herk/bli_herk_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c
similarity index 97%
rename from frame/3/herk/bli_herk_x_ker_var2.c
rename to frame/3/gemmt/bli_gemmt_x_ker_var2.c
index b6769d719..6d24ea496 100644
--- a/frame/3/herk/bli_herk_x_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c
@@ -37,10 +37,10 @@
 
 static gemm_var_oft vars[2] =
 {
-	bli_herk_l_ker_var2, bli_herk_u_ker_var2,
+	bli_gemmt_l_ker_var2, bli_gemmt_u_ker_var2,
 };
 
-void bli_herk_x_ker_var2
+void bli_gemmt_x_ker_var2
      (
        obj_t*  a,
        obj_t*  ah,
diff --git a/frame/3/herk/other/bli_herk_l_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c
similarity index 97%
rename from frame/3/herk/other/bli_herk_l_ker_var2.c
rename to frame/3/gemmt/other/bli_gemmt_l_ker_var2.c
index 22439f5b2..0bf4b1a0f 100644
--- a/frame/3/herk/other/bli_herk_l_ker_var2.c
+++ b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c
@@ -35,7 +35,7 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T herk_fp
+#define FUNCPTR_T gemmt_fp
 
 typedef void (*FUNCPTR_T)
      (
@@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T)
        thrinfo_t* thread
      );
 
-static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2);
 
 
-void bli_herk_l_ker_var2
+void bli_gemmt_l_ker_var2
      (
        obj_t*  a,
        obj_t*  b,
@@ -318,11 +318,11 @@ void PASTEMAC(ch,varname) \
 			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \
+			a2 = bli_gemmt_get_next_a_upanel( caucus, a1, rstep_a ); \
 			if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
 			{ \
 				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \
+				b2 = bli_gemmt_get_next_b_upanel( thread, b1, cstep_b ); \
 				if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
 					b2 = b_cast; \
 			} \
@@ -405,5 +405,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )
+INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 )
 
diff --git a/frame/3/herk/other/bli_herk_u_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c
similarity index 97%
rename from frame/3/herk/other/bli_herk_u_ker_var2.c
rename to frame/3/gemmt/other/bli_gemmt_u_ker_var2.c
index 1aa3ce12d..1655bea55 100644
--- a/frame/3/herk/other/bli_herk_u_ker_var2.c
+++ b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c
@@ -35,7 +35,7 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T herk_fp
+#define FUNCPTR_T gemmt_fp
 
 typedef void (*FUNCPTR_T)
      (
@@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T)
        thrinfo_t* thread
      );
 
-static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2);
 
 
-void bli_herk_u_ker_var2
+void bli_gemmt_u_ker_var2
      (
        obj_t*  a,
        obj_t*  b,
@@ -318,11 +318,11 @@ void PASTEMAC(ch,varname) \
 			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \
+			a2 = bli_gemmt_get_next_a_upanel( caucus, a1, rstep_a ); \
 			if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
 			{ \
 				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \
+				b2 = bli_gemmt_get_next_b_upanel( thread, b1, cstep_b ); \
 				if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
 					b2 = b_cast; \
 			} \
@@ -405,5 +405,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )
+INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 )
 
diff --git a/frame/3/her2k/bli_her2k.h b/frame/3/her2k/bli_her2k.h
deleted file mode 100644
index 02975c2b5..000000000
--- a/frame/3/her2k/bli_her2k.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "bli_her2k_front.h"
-
diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c
deleted file mode 100644
index 459ab05c7..000000000
--- a/frame/3/her2k/bli_her2k_front.c
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_her2k_front
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     )
-{
-	bli_init_once();
-
-	obj_t    alpha_conj;
-	obj_t    c_local;
-	obj_t    a_local;
-	obj_t    bh_local;
-	obj_t    b_local;
-	obj_t    ah_local;
-
-	// If alpha is zero, scale by beta, zero the imaginary components of
-	// the diagonal elements, and return.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
-	{
-		bli_scalm( beta, c );
-		bli_setid( &BLIS_ZERO, c );
-		return;
-	}
-
-	// Alias A, B, and C in case we need to apply transformations.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( b, &b_local );
-	bli_obj_alias_to( c, &c_local );
-	bli_obj_set_as_root( &c_local );
-
-	// For her2k, the first and second right-hand "B" operands are simply B'
-	// and A'.
-	bli_obj_alias_to( b, &bh_local );
-	bli_obj_induce_trans( &bh_local );
-	bli_obj_toggle_conj( &bh_local );
-	bli_obj_alias_to( a, &ah_local );
-	bli_obj_induce_trans( &ah_local );
-	bli_obj_toggle_conj( &ah_local );
-
-	// An optimization: If C is stored by rows and the micro-kernel prefers
-	// contiguous columns, or if C is stored by columns and the micro-kernel
-	// prefers contiguous rows, transpose the entire operation to allow the
-	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
-	{
-		bli_obj_swap( &a_local, &bh_local );
-		bli_obj_swap( &b_local, &ah_local );
-
-		bli_obj_induce_trans( &a_local );
-		bli_obj_induce_trans( &bh_local );
-		bli_obj_induce_trans( &b_local );
-		bli_obj_induce_trans( &ah_local );
-
-		bli_obj_induce_trans( &c_local );
-	}
-
-	// Set the pack schemas within the objects.
-	bli_l3_set_schemas( &a_local, &bh_local, &c_local, cntx );
-	bli_l3_set_schemas( &b_local, &ah_local, &c_local, cntx );
-
-	// Initialize a conjugated copy of alpha.
-	bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ),
-	                                      BLIS_CONJUGATE,
-	                                      alpha,
-	                                      &alpha_conj );
-
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_HER2K,
-	  BLIS_LEFT, // ignored for her[2]k/syr[2]k
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
-	// Invoke herk twice, using beta only the first time.
-
-	// Invoke the internal back-end.
-	bli_l3_thread_decorator
-	(
-	  bli_gemm_int,
-	  BLIS_HERK, // operation family id
-	  alpha,
-	  &a_local,
-	  &bh_local,
-	  beta,
-	  &c_local,
-	  cntx,
-	  rntm,
-	  cntl
-	);
-
-	bli_l3_thread_decorator
-	(
-	  bli_gemm_int,
-	  BLIS_HERK, // operation family id
-	  &alpha_conj,
-	  &b_local,
-	  &ah_local,
-	  &BLIS_ONE,
-	  &c_local,
-	  cntx,
-	  rntm,
-	  cntl
-	);
-
-	// The Hermitian rank-2k product was computed as A*B'+B*A', even for
-	// the diagonal elements. Mathematically, the imaginary components of
-	// diagonal elements of a Hermitian rank-2k product should always be
-	// zero. However, in practice, they sometimes accumulate meaningless
-	// non-zero values. To prevent this, we explicitly set those values
-	// to zero before returning.
-	bli_setid( &BLIS_ZERO, &c_local );
-}
-
diff --git a/frame/3/her2k/bli_her2k_front.h b/frame/3/her2k/bli_her2k_front.h
deleted file mode 100644
index 0efdb86c2..000000000
--- a/frame/3/her2k/bli_her2k_front.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_her2k_front
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     );
diff --git a/frame/3/herk/bli_herk.h b/frame/3/herk/bli_herk.h
deleted file mode 100644
index c43728968..000000000
--- a/frame/3/herk/bli_herk.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "bli_herk_front.h"
-
-#include "bli_herk_var.h"
-
diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c
deleted file mode 100644
index 324e18151..000000000
--- a/frame/3/herk/bli_herk_front.c
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_herk_front
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     )
-{
-	bli_init_once();
-
-	obj_t   a_local;
-	obj_t   ah_local;
-	obj_t   c_local;
-
-	// If alpha is zero, scale by beta, zero the imaginary components of
-	// the diagonal elements, and return.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
-	{
-		bli_scalm( beta, c );
-		bli_setid( &BLIS_ZERO, c );
-		return;
-	}
-
-	// Alias A and C in case we need to apply transformations.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( c, &c_local );
-	bli_obj_set_as_root( &c_local );
-
-	// For herk, the right-hand "B" operand is simply A'.
-	bli_obj_alias_to( a, &ah_local );
-	bli_obj_induce_trans( &ah_local );
-	bli_obj_toggle_conj( &ah_local );
-
-	// An optimization: If C is stored by rows and the micro-kernel prefers
-	// contiguous columns, or if C is stored by columns and the micro-kernel
-	// prefers contiguous rows, transpose the entire operation to allow the
-	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
-	{
-		bli_obj_toggle_conj( &a_local );
-		bli_obj_toggle_conj( &ah_local );
-
-		bli_obj_induce_trans( &c_local );
-	}
-
-	// Set the pack schemas within the objects.
-	bli_l3_set_schemas( &a_local, &ah_local, &c_local, cntx );
-
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_HERK,
-	  BLIS_LEFT, // ignored for her[2]k/syr[2]k
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
-	// Invoke the internal back-end.
-	bli_l3_thread_decorator
-	(
-	  bli_gemm_int,
-	  BLIS_HERK, // operation family id
-	  alpha,
-	  &a_local,
-	  &ah_local,
-	  beta,
-	  &c_local,
-	  cntx,
-	  rntm,
-	  cntl
-	);
-
-	// The Hermitian rank-k product was computed as A*A', even for the
-	// diagonal elements. Mathematically, the imaginary components of
-	// diagonal elements of a Hermitian rank-k product should always be
-	// zero. However, in practice, they sometimes accumulate meaningless
-	// non-zero values. To prevent this, we explicitly set those values
-	// to zero before returning.
-	bli_setid( &BLIS_ZERO, &c_local );
-}
-
diff --git a/frame/3/herk/bli_herk_front.h b/frame/3/herk/bli_herk_front.h
deleted file mode 100644
index 44778a450..000000000
--- a/frame/3/herk/bli_herk_front.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_herk_front
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     );
diff --git a/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c b/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c
deleted file mode 100644
index 8a99a2e24..000000000
--- a/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c
+++ /dev/null
@@ -1,420 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T herk_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffc,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, inc_t is_a,
-                  dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, inc_t is_b,
-                  dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
-
-
-void bli_herk_l_ker_var2
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffc  = bli_obj_diag_offset( c );
-
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
-
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
-
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	inc_t     is_a      = bli_obj_imag_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
-
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	inc_t     is_b      = bli_obj_imag_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
-
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
-
-	// Detach and multiply the scalars attached to A and B.
-	bli_obj_scalar_detach( a, &scalar_a );
-	bli_obj_scalar_detach( b, &scalar_b );
-	bli_mulsc( &scalar_a, &scalar_b );
-
-	// Grab the addresses of the internal scalar buffers for the scalar
-	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffc,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, is_a,
-	          pd_a, ps_a,
-	   buf_b, rs_b, is_b,
-	          pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffc, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	/*const dim_t     PACKMR     = cs_a;*/ \
-	/*const dim_t     PACKNR     = rs_b;*/ \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-\
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffc_ij; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           i, j, ip; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	auxinfo_t       aux; \
-\
-	/*
-	   Assumptions/assertions:
-	     rs_a == 1
-	     cs_a == PACKMR
-	     pd_a == MR
-	     ps_a == stride to next micro-panel of A
-	     rs_b == PACKNR
-	     cs_b == 1
-	     pd_b == NR
-	     ps_b == stride to next micro-panel of B
-	     rs_c == (no assumptions)
-	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of C is entirely above the diagonal,
-	   it is not stored. So we do nothing. */ \
-	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
-\
-	/* If there is a zero region above where the diagonal of C intersects
-	   the left edge of the panel, adjust the pointer to C and A and treat
-	   this case as if the diagonal offset were zero. */ \
-	if ( diagoffc < 0 ) \
-	{ \
-		ip       = -diagoffc / MR; \
-		i        = ip * MR; \
-		m        = m - i; \
-		diagoffc = -diagoffc % MR; \
-		c_cast   = c_cast + (i  )*rs_c; \
-		a_cast   = a_cast + (ip )*ps_a; \
-	} \
-\
-	/* If there is a zero region to the right of where the diagonal
-	   of C intersects the bottom of the panel, shrink it to prevent
-	   "no-op" iterations from executing. */ \
-	if ( diagoffc + m < n ) \
-	{ \
-		n = diagoffc + m; \
-	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( is_a, &aux ); \
-	bli_auxinfo_set_is_b( is_b, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Use interleaved (round robin) assignment of micropanels to threads in
-	   the 2nd and 1st loops. */ \
-	bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* Compute the diagonal offset for the submatrix at (i,j). */ \
-			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly below the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly above the diagonal, we do nothing and
-			   continue. */ \
-			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  zero, \
-				  ct, rs_ct, cs_ct, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				/* Scale C and add the result to only the stored part. */ \
-				PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
-				                          m_cur, n_cur, \
-				                          ct,  rs_ct, cs_ct, \
-				                          beta_cast, \
-				                          c11, rs_c,  cs_c ); \
-			} \
-			else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )
-
diff --git a/frame/3/herk/other/bli_herk_l_ker_var2rr.c b/frame/3/herk/other/bli_herk_l_ker_var2rr.c
deleted file mode 100644
index c78a36b29..000000000
--- a/frame/3/herk/other/bli_herk_l_ker_var2rr.c
+++ /dev/null
@@ -1,555 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T herk_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffc,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, inc_t is_a,
-                  dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, inc_t is_b,
-                  dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2rr);
-
-//
-// -- Macrokernel functions for round-robin partitioning -----------------------
-//
-
-void bli_herk_l_ker_var2rr
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffc  = bli_obj_diag_offset( c );
-
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
-
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
-
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	inc_t     is_a      = bli_obj_imag_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
-
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	inc_t     is_b      = bli_obj_imag_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
-
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
-
-	// Detach and multiply the scalars attached to A and B.
-	bli_obj_scalar_detach( a, &scalar_a );
-	bli_obj_scalar_detach( b, &scalar_b );
-	bli_mulsc( &scalar_a, &scalar_b );
-
-	// Grab the addresses of the internal scalar buffers for the scalar
-	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffc,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, is_a,
-	          pd_a, ps_a,
-	   buf_b, rs_b, is_b,
-	          pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffc, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	/*const dim_t     PACKMR     = cs_a;*/ \
-	/*const dim_t     PACKNR     = rs_b;*/ \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-\
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffc_ij; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           i, j, ip; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	auxinfo_t       aux; \
-\
-	/*
-	   Assumptions/assertions:
-	     rs_a == 1
-	     cs_a == PACKMR
-	     pd_a == MR
-	     ps_a == stride to next micro-panel of A
-	     rs_b == PACKNR
-	     cs_b == 1
-	     pd_b == NR
-	     ps_b == stride to next micro-panel of B
-	     rs_c == (no assumptions)
-	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of C is entirely above the diagonal,
-	   it is not stored. So we do nothing. */ \
-	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
-\
-	/* If there is a zero region above where the diagonal of C intersects
-	   the left edge of the panel, adjust the pointer to C and A and treat
-	   this case as if the diagonal offset were zero. */ \
-	if ( diagoffc < 0 ) \
-	{ \
-		ip       = -diagoffc / MR; \
-		i        = ip * MR; \
-		m        = m - i; \
-		diagoffc = -diagoffc % MR; \
-		c_cast   = c_cast + (i  )*rs_c; \
-		a_cast   = a_cast + (ip )*ps_a; \
-	} \
-\
-	/* If there is a zero region to the right of where the diagonal
-	   of C intersects the bottom of the panel, shrink it to prevent
-	   "no-op" iterations from executing. */ \
-	if ( diagoffc + m < n ) \
-	{ \
-		n = diagoffc + m; \
-	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( is_a, &aux ); \
-	bli_auxinfo_set_is_b( is_b, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Note that we partition the 2nd loop into two regions: the rectangular
-	   part of C, and the triangular portion. */ \
-	dim_t n_iter_rct; \
-	dim_t n_iter_tri; \
-\
-	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \
-	{ \
-		/* If the entire panel of C does not intersect the diagonal, there is
-		   no triangular region, and therefore we can skip the second set of
-		   loops. */ \
-		n_iter_rct = n_iter; \
-		n_iter_tri = 0; \
-	} \
-	else \
-	{ \
-		/* If the panel of C does intersect the diagonal, compute the number of
-		   iterations in the rectangular region by dividing NR into the diagonal
-		   offset. Any remainder from this integer division is discarded, which
-		   is what we want. That is, we want the rectangular region to contain
-		   as many columns of whole microtiles as possible without including any
-		   microtiles that intersect the diagonal. The number of iterations in
-		   the triangular (or trapezoidal) region is computed as the remaining
-		   number of iterations in the n dimension. */ \
-		n_iter_rct = diagoffc / NR; \
-		n_iter_tri = n_iter - n_iter_rct; \
-	} \
-\
-	/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
-	   loops for the initial rectangular region of C (if it exists). */ \
-	bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir_rr( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* No need to compute the diagonal offset for the rectangular
-			   region. */ \
-			/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly below the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly above the diagonal, we do nothing and
-			   continue. */ \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-\
-	/* If there is no triangular region, then we're done. */ \
-	if ( n_iter_tri == 0 ) return; \
-\
-	/* Use round-robin assignment of micropanels to threads in the 2nd and
-	   1st loops for the remaining triangular region of C. */ \
-	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-\
-	/* Advance the start and end iteration offsets for the triangular region
-	   by the number of iterations used for the rectangular region. */ \
-	jr_start += n_iter_rct; \
-	jr_end   += n_iter_rct; \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* Compute the diagonal offset for the submatrix at (i,j). */ \
-			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly below the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly above the diagonal, we do nothing and
-			   continue. */ \
-			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  zero, \
-				  ct, rs_ct, cs_ct, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				/* Scale C and add the result to only the stored part. */ \
-				PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
-				                          m_cur, n_cur, \
-				                          ct,  rs_ct, cs_ct, \
-				                          beta_cast, \
-				                          c11, rs_c,  cs_c ); \
-			} \
-			else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( herk_l_ker_var2rr )
-
diff --git a/frame/3/herk/other/bli_herk_l_ker_var2sl.c b/frame/3/herk/other/bli_herk_l_ker_var2sl.c
deleted file mode 100644
index 17e0b0d0e..000000000
--- a/frame/3/herk/other/bli_herk_l_ker_var2sl.c
+++ /dev/null
@@ -1,556 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T herk_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffc,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, inc_t is_a,
-                  dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, inc_t is_b,
-                  dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2sl);
-
-//
-// -- Macrokernel functions for slab partitioning ------------------------------
-//
-
-void bli_herk_l_ker_var2sl
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffc  = bli_obj_diag_offset( c );
-
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
-
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
-
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	inc_t     is_a      = bli_obj_imag_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
-
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	inc_t     is_b      = bli_obj_imag_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
-
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
-
-	// Detach and multiply the scalars attached to A and B.
-	bli_obj_scalar_detach( a, &scalar_a );
-	bli_obj_scalar_detach( b, &scalar_b );
-	bli_mulsc( &scalar_a, &scalar_b );
-
-	// Grab the addresses of the internal scalar buffers for the scalar
-	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffc,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, is_a,
-	          pd_a, ps_a,
-	   buf_b, rs_b, is_b,
-	          pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffc, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	/*const dim_t     PACKMR     = cs_a;*/ \
-	/*const dim_t     PACKNR     = rs_b;*/ \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-\
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffc_ij; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           i, j, ip; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	auxinfo_t       aux; \
-\
-	/*
-	   Assumptions/assertions:
-	     rs_a == 1
-	     cs_a == PACKMR
-	     pd_a == MR
-	     ps_a == stride to next micro-panel of A
-	     rs_b == PACKNR
-	     cs_b == 1
-	     pd_b == NR
-	     ps_b == stride to next micro-panel of B
-	     rs_c == (no assumptions)
-	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of C is entirely above the diagonal,
-	   it is not stored. So we do nothing. */ \
-	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
-\
-	/* If there is a zero region above where the diagonal of C intersects
-	   the left edge of the panel, adjust the pointer to C and A and treat
-	   this case as if the diagonal offset were zero. */ \
-	if ( diagoffc < 0 ) \
-	{ \
-		ip       = -diagoffc / MR; \
-		i        = ip * MR; \
-		m        = m - i; \
-		diagoffc = -diagoffc % MR; \
-		c_cast   = c_cast + (i  )*rs_c; \
-		a_cast   = a_cast + (ip )*ps_a; \
-	} \
-\
-	/* If there is a zero region to the right of where the diagonal
-	   of C intersects the bottom of the panel, shrink it to prevent
-	   "no-op" iterations from executing. */ \
-	if ( diagoffc + m < n ) \
-	{ \
-		n = diagoffc + m; \
-	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( is_a, &aux ); \
-	bli_auxinfo_set_is_b( is_b, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Note that we partition the 2nd loop into two regions: the rectangular
-	   part of C, and the triangular portion. */ \
-	dim_t n_iter_rct; \
-	dim_t n_iter_tri; \
-\
-	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \
-	{ \
-		/* If the entire panel of C does not intersect the diagonal, there is
-		   no triangular region, and therefore we can skip the second set of
-		   loops. */ \
-		n_iter_rct = n_iter; \
-		n_iter_tri = 0; \
-	} \
-	else \
-	{ \
-		/* If the panel of C does intersect the diagonal, compute the number of
-		   iterations in the rectangular region by dividing NR into the diagonal
-		   offset. Any remainder from this integer division is discarded, which
-		   is what we want. That is, we want the rectangular region to contain
-		   as many columns of whole microtiles as possible without including any
-		   microtiles that intersect the diagonal. The number of iterations in
-		   the triangular (or trapezoidal) region is computed as the remaining
-		   number of iterations in the n dimension. */ \
-		n_iter_rct = diagoffc / NR; \
-		n_iter_tri = n_iter - n_iter_rct; \
-	} \
-\
-	/* Use slab assignment of micropanels to threads in the 2nd and 1st
-	   loops for the initial rectangular region of C (if it exists). */ \
-	bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir_sl( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* No need to compute the diagonal offset for the rectangular
-			   region. */ \
-			/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly below the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly above the diagonal, we do nothing and
-			   continue. */ \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-\
-	/* If there is no triangular region, then we're done. */ \
-	if ( n_iter_tri == 0 ) return; \
-\
-	/* Use round-robin assignment of micropanels to threads in the 2nd
-	   loop and slab partitioning in the 1st loop for the remaining
-	   triangular region of C. */ \
-	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-\
-	/* Advance the start and end iteration offsets for the triangular region
-	   by the number of iterations used for the rectangular region. */ \
-	jr_start += n_iter_rct; \
-	jr_end   += n_iter_rct; \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* Compute the diagonal offset for the submatrix at (i,j). */ \
-			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly below the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly above the diagonal, we do nothing and
-			   continue. */ \
-			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  zero, \
-				  ct, rs_ct, cs_ct, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				/* Scale C and add the result to only the stored part. */ \
-				PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
-				                          m_cur, n_cur, \
-				                          ct,  rs_ct, cs_ct, \
-				                          beta_cast, \
-				                          c11, rs_c,  cs_c ); \
-			} \
-			else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( herk_l_ker_var2sl )
-
diff --git a/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c b/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c
deleted file mode 100644
index 31d8fab62..000000000
--- a/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c
+++ /dev/null
@@ -1,420 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T herk_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffc,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, inc_t is_a,
-                  dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, inc_t is_b,
-                  dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
-
-
-void bli_herk_u_ker_var2
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffc  = bli_obj_diag_offset( c );
-
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
-
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
-
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	inc_t     is_a      = bli_obj_imag_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
-
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	inc_t     is_b      = bli_obj_imag_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
-
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
-
-	// Detach and multiply the scalars attached to A and B.
-	bli_obj_scalar_detach( a, &scalar_a );
-	bli_obj_scalar_detach( b, &scalar_b );
-	bli_mulsc( &scalar_a, &scalar_b );
-
-	// Grab the addresses of the internal scalar buffers for the scalar
-	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffc,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, is_a,
-	          pd_a, ps_a,
-	   buf_b, rs_b, is_b,
-	          pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffc, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	/*const dim_t     PACKMR     = cs_a;*/ \
-	/*const dim_t     PACKNR     = rs_b;*/ \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-\
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffc_ij; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           i, j, jp; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	auxinfo_t       aux; \
-\
-	/*
-	   Assumptions/assertions:
-	     rs_a == 1
-	     cs_a == PACKMR
-	     pd_a == MR
-	     ps_a == stride to next micro-panel of A
-	     rs_b == PACKNR
-	     cs_b == 1
-	     pd_b == NR
-	     ps_b == stride to next micro-panel of B
-	     rs_c == (no assumptions)
-	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of C is entirely below the diagonal,
-	   it is not stored. So we do nothing. */ \
-	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
-\
-	/* If there is a zero region to the left of where the diagonal of C
-	   intersects the top edge of the panel, adjust the pointer to C and B
-	   and treat this case as if the diagonal offset were zero. */ \
-	if ( diagoffc > 0 ) \
-	{ \
-		jp       = diagoffc / NR; \
-		j        = jp * NR; \
-		n        = n - j; \
-		diagoffc = diagoffc % NR; \
-		c_cast   = c_cast + (j  )*cs_c; \
-		b_cast   = b_cast + (jp )*ps_b; \
-	} \
-\
-	/* If there is a zero region below where the diagonal of C intersects
-	   the right edge of the panel, shrink it to prevent "no-op" iterations
-	   from executing. */ \
-	if ( -diagoffc + n < m ) \
-	{ \
-		m = -diagoffc + n; \
-	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( is_a, &aux ); \
-	bli_auxinfo_set_is_b( is_b, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Use interleaved (round robin) assignment of micropanels to threads in
-	   the 2nd and 1st loops. */ \
-	bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* Compute the diagonal offset for the submatrix at (i,j). */ \
-			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly above the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly below the diagonal, we do nothing and
-			   continue. */ \
-			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  zero, \
-				  ct, rs_ct, cs_ct, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				/* Scale C and add the result to only the stored part. */ \
-				PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
-				                          m_cur, n_cur, \
-				                          ct,  rs_ct, cs_ct, \
-				                          beta_cast, \
-				                          c11, rs_c,  cs_c ); \
-			} \
-			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )
-
diff --git a/frame/3/herk/other/bli_herk_u_ker_var2rr.c b/frame/3/herk/other/bli_herk_u_ker_var2rr.c
deleted file mode 100644
index 085ef6308..000000000
--- a/frame/3/herk/other/bli_herk_u_ker_var2rr.c
+++ /dev/null
@@ -1,557 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T herk_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffc,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, inc_t is_a,
-                  dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, inc_t is_b,
-                  dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2rr);
-
-//
-// -- Macrokernel functions for round-robin partitioning -----------------------
-//
-
-void bli_herk_u_ker_var2rr
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffc  = bli_obj_diag_offset( c );
-
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
-
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
-
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	inc_t     is_a      = bli_obj_imag_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
-
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	inc_t     is_b      = bli_obj_imag_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
-
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
-
-	// Detach and multiply the scalars attached to A and B.
-	bli_obj_scalar_detach( a, &scalar_a );
-	bli_obj_scalar_detach( b, &scalar_b );
-	bli_mulsc( &scalar_a, &scalar_b );
-
-	// Grab the addresses of the internal scalar buffers for the scalar
-	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-    f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffc,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, is_a,
-	          pd_a, ps_a,
-	   buf_b, rs_b, is_b,
-	          pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffc, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	/*const dim_t     PACKMR     = cs_a;*/ \
-	/*const dim_t     PACKNR     = rs_b;*/ \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-\
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffc_ij; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           i, j, jp; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	auxinfo_t       aux; \
-\
-	/*
-	   Assumptions/assertions:
-	     rs_a == 1
-	     cs_a == PACKMR
-	     pd_a == MR
-	     ps_a == stride to next micro-panel of A
-	     rs_b == PACKNR
-	     cs_b == 1
-	     pd_b == NR
-	     ps_b == stride to next micro-panel of B
-	     rs_c == (no assumptions)
-	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of C is entirely below the diagonal,
-	   it is not stored. So we do nothing. */ \
-	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
-\
-	/* If there is a zero region to the left of where the diagonal of C
-	   intersects the top edge of the panel, adjust the pointer to C and B
-	   and treat this case as if the diagonal offset were zero.
-	   NOTE: It's possible that after this pruning that the diagonal offset
-	   is still positive (though it is guaranteed to be less than NR). */ \
-	if ( diagoffc > 0 ) \
-	{ \
-		jp       = diagoffc / NR; \
-		j        = jp * NR; \
-		n        = n - j; \
-		diagoffc = diagoffc % NR; \
-		c_cast   = c_cast + (j  )*cs_c; \
-		b_cast   = b_cast + (jp )*ps_b; \
-	} \
-\
-	/* If there is a zero region below where the diagonal of C intersects
-	   the right edge of the panel, shrink it to prevent "no-op" iterations
-	   from executing. */ \
-	if ( -diagoffc + n < m ) \
-	{ \
-		m = -diagoffc + n; \
-	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( is_a, &aux ); \
-	bli_auxinfo_set_is_b( is_b, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Note that we partition the 2nd loop into two regions: the triangular
-	   part of C, and the rectangular portion. */ \
-	dim_t n_iter_tri; \
-	dim_t n_iter_rct; \
-\
-	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \
-	{ \
-		/* If the entire panel of C does not intersect the diagonal, there is
-		   no triangular region, and therefore we can skip the first set of
-		   loops. */ \
-		n_iter_tri = 0; \
-		n_iter_rct = n_iter; \
-	} \
-	else \
-	{ \
-		/* If the panel of C does intersect the diagonal, compute the number of
-		   iterations in the triangular (or trapezoidal) region by dividing NR
-		   into the number of rows in C. A non-zero remainder means we need to
-		   add one additional iteration. That is, we want the triangular region
-		   to contain as few columns of whole microtiles as possible while still
-		   including all microtiles that intersect the diagonal. The number of
-		   iterations in the rectangular region is computed as the remaining
-		   number of iterations in the n dimension. */ \
-		n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \
-		n_iter_rct = n_iter - n_iter_tri; \
-	} \
-\
-	/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
-	   loops for the initial triangular region of C (if it exists). */ \
-	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir_rr( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* Compute the diagonal offset for the submatrix at (i,j). */ \
-			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly above the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly below the diagonal, we do nothing and
-			   continue. */ \
-			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  zero, \
-				  ct, rs_ct, cs_ct, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				/* Scale C and add the result to only the stored part. */ \
-				PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
-				                          m_cur, n_cur, \
-				                          ct,  rs_ct, cs_ct, \
-				                          beta_cast, \
-				                          c11, rs_c,  cs_c ); \
-			} \
-			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-\
-	/* If there is no rectangular region, then we're done. */ \
-	if ( n_iter_rct == 0 ) return; \
-\
-	/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
-	   loops for the remaining triangular region of C. */ \
-	bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-\
-	/* Advance the start and end iteration offsets for the rectangular region
-	   by the number of iterations used for the triangular region. */ \
-	jr_start += n_iter_tri; \
-	jr_end   += n_iter_tri; \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* No need to compute the diagonal offset for the rectangular
-			   region. */ \
-			/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly above the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly below the diagonal, we do nothing and
-			   continue. */ \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( herk_u_ker_var2rr )
-
diff --git a/frame/3/herk/other/bli_herk_u_ker_var2sl.c b/frame/3/herk/other/bli_herk_u_ker_var2sl.c
deleted file mode 100644
index abc6e5188..000000000
--- a/frame/3/herk/other/bli_herk_u_ker_var2sl.c
+++ /dev/null
@@ -1,558 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T herk_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffc,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, inc_t is_a,
-                  dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, inc_t is_b,
-                  dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2sl);
-
-//
-// -- Macrokernel functions for slab partitioning ------------------------------
-//
-
-void bli_herk_u_ker_var2sl
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffc  = bli_obj_diag_offset( c );
-
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
-
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
-
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	inc_t     is_a      = bli_obj_imag_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
-
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	inc_t     is_b      = bli_obj_imag_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
-
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
-
-	// Detach and multiply the scalars attached to A and B.
-	bli_obj_scalar_detach( a, &scalar_a );
-	bli_obj_scalar_detach( b, &scalar_b );
-	bli_mulsc( &scalar_a, &scalar_b );
-
-	// Grab the addresses of the internal scalar buffers for the scalar
-	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-    f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffc,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, is_a,
-	          pd_a, ps_a,
-	   buf_b, rs_b, is_b,
-	          pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffc, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	/*const dim_t     PACKMR     = cs_a;*/ \
-	/*const dim_t     PACKNR     = rs_b;*/ \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-\
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffc_ij; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           i, j, jp; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	auxinfo_t       aux; \
-\
-	/*
-	   Assumptions/assertions:
-	     rs_a == 1
-	     cs_a == PACKMR
-	     pd_a == MR
-	     ps_a == stride to next micro-panel of A
-	     rs_b == PACKNR
-	     cs_b == 1
-	     pd_b == NR
-	     ps_b == stride to next micro-panel of B
-	     rs_c == (no assumptions)
-	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of C is entirely below the diagonal,
-	   it is not stored. So we do nothing. */ \
-	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
-\
-	/* If there is a zero region to the left of where the diagonal of C
-	   intersects the top edge of the panel, adjust the pointer to C and B
-	   and treat this case as if the diagonal offset were zero.
-	   NOTE: It's possible that after this pruning that the diagonal offset
-	   is still positive (though it is guaranteed to be less than NR). */ \
-	if ( diagoffc > 0 ) \
-	{ \
-		jp       = diagoffc / NR; \
-		j        = jp * NR; \
-		n        = n - j; \
-		diagoffc = diagoffc % NR; \
-		c_cast   = c_cast + (j  )*cs_c; \
-		b_cast   = b_cast + (jp )*ps_b; \
-	} \
-\
-	/* If there is a zero region below where the diagonal of C intersects
-	   the right edge of the panel, shrink it to prevent "no-op" iterations
-	   from executing. */ \
-	if ( -diagoffc + n < m ) \
-	{ \
-		m = -diagoffc + n; \
-	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( is_a, &aux ); \
-	bli_auxinfo_set_is_b( is_b, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Note that we partition the 2nd loop into two regions: the triangular
-	   part of C, and the rectangular portion. */ \
-	dim_t n_iter_tri; \
-	dim_t n_iter_rct; \
-\
-	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \
-	{ \
-		/* If the entire panel of C does not intersect the diagonal, there is
-		   no triangular region, and therefore we can skip the first set of
-		   loops. */ \
-		n_iter_tri = 0; \
-		n_iter_rct = n_iter; \
-	} \
-	else \
-	{ \
-		/* If the panel of C does intersect the diagonal, compute the number of
-		   iterations in the triangular (or trapezoidal) region by dividing NR
-		   into the number of rows in C. A non-zero remainder means we need to
-		   add one additional iteration. That is, we want the triangular region
-		   to contain as few columns of whole microtiles as possible while still
-		   including all microtiles that intersect the diagonal. The number of
-		   iterations in the rectangular region is computed as the remaining
-		   number of iterations in the n dimension. */ \
-		n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \
-		n_iter_rct = n_iter - n_iter_tri; \
-	} \
-\
-	/* Use round-robin assignment of micropanels to threads in the 2nd loop
-	   and slab partitioning in the 1st loop for the initial triangular region
-	   of C (if it exists). */ \
-	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir_sl( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* Compute the diagonal offset for the submatrix at (i,j). */ \
-			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly above the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly below the diagonal, we do nothing and
-			   continue. */ \
-			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  zero, \
-				  ct, rs_ct, cs_ct, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				/* Scale C and add the result to only the stored part. */ \
-				PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
-				                          m_cur, n_cur, \
-				                          ct,  rs_ct, cs_ct, \
-				                          beta_cast, \
-				                          c11, rs_c,  cs_c ); \
-			} \
-			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-\
-	/* If there is no rectangular region, then we're done. */ \
-	if ( n_iter_rct == 0 ) return; \
-\
-	/* Use slab assignment of micropanels to threads in the 2nd and 1st loops
-	   loop for the remaining triangular region of C. */ \
-	bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-\
-	/* Advance the start and end iteration offsets for the rectangular region
-	   by the number of iterations used for the triangular region. */ \
-	jr_start += n_iter_tri; \
-	jr_end   += n_iter_tri; \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* No need to compute the diagonal offset for the rectangular
-			   region. */ \
-			/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly above the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly below the diagonal, we do nothing and
-			   continue. */ \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( herk_u_ker_var2sl )
-
diff --git a/frame/3/syr2k/bli_syr2k.h b/frame/3/syr2k/bli_syr2k.h
deleted file mode 100644
index 680e6e399..000000000
--- a/frame/3/syr2k/bli_syr2k.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "bli_syr2k_front.h"
-
diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c
deleted file mode 100644
index 4f30cc3d5..000000000
--- a/frame/3/syr2k/bli_syr2k_front.c
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_syr2k_front
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     )
-{
-	bli_init_once();
-
-	obj_t    c_local;
-	obj_t    a_local;
-	obj_t    bt_local;
-	obj_t    b_local;
-	obj_t    at_local;
-
-	// If alpha is zero, scale by beta and return.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
-	{
-		bli_scalm( beta, c );
-		return;
-	}
-
-	// Alias A, B, and C in case we need to apply transformations.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( b, &b_local );
-	bli_obj_alias_to( c, &c_local );
-	bli_obj_set_as_root( &c_local );
-
-	// For syr2k, the first and second right-hand "B" operands are simply B'
-	// and A'.
-	bli_obj_alias_to( b, &bt_local );
-	bli_obj_induce_trans( &bt_local );
-	bli_obj_alias_to( a, &at_local );
-	bli_obj_induce_trans( &at_local );
-
-	// An optimization: If C is stored by rows and the micro-kernel prefers
-	// contiguous columns, or if C is stored by columns and the micro-kernel
-	// prefers contiguous rows, transpose the entire operation to allow the
-	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
-	{
-		bli_obj_induce_trans( &c_local );
-	}
-
-	// Set the pack schemas within the objects.
-	bli_l3_set_schemas( &a_local, &bt_local, &c_local, cntx );
-	bli_l3_set_schemas( &b_local, &at_local, &c_local, cntx );
-
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_SYR2K,
-	  BLIS_LEFT, // ignored for her[2]k/syr[2]k
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
-	// Invoke herk twice, using beta only the first time.
-
-	// Invoke the internal back-end.
-	bli_l3_thread_decorator
-	(
-	  bli_gemm_int,
-	  BLIS_HERK, // operation family id
-	  alpha,
-	  &a_local,
-	  &bt_local,
-	  beta,
-	  &c_local,
-	  cntx,
-	  rntm,
-	  cntl
-	);
-
-	bli_l3_thread_decorator
-	(
-	  bli_gemm_int,
-	  BLIS_HERK, // operation family id
-	  alpha,
-	  &b_local,
-	  &at_local,
-	  &BLIS_ONE,
-	  &c_local,
-	  cntx,
-	  rntm,
-	  cntl
-	);
-}
-
diff --git a/frame/3/syr2k/bli_syr2k_front.h b/frame/3/syr2k/bli_syr2k_front.h
deleted file mode 100644
index 767bb6ee1..000000000
--- a/frame/3/syr2k/bli_syr2k_front.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_syr2k_front
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     );
diff --git a/frame/3/syrk/bli_syrk.h b/frame/3/syrk/bli_syrk.h
deleted file mode 100644
index 4936fe431..000000000
--- a/frame/3/syrk/bli_syrk.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "bli_syrk_front.h"
-
diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c
deleted file mode 100644
index 819941426..000000000
--- a/frame/3/syrk/bli_syrk_front.c
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_syrk_front
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     )
-{
-	bli_init_once();
-
-	obj_t   a_local;
-	obj_t   at_local;
-	obj_t   c_local;
-
-	// Alias A and C in case we need to apply transformations.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( c, &c_local );
-	bli_obj_set_as_root( &c_local );
-
-	// For syrk, the right-hand "B" operand is simply A^T.
-	bli_obj_alias_to( a, &at_local );
-	bli_obj_induce_trans( &at_local );
-
-#if 0
-#ifdef BLIS_ENABLE_SMALL_MATRIX
-	gint_t status = bli_syrk_small( alpha, &a_local, &at_local, beta, &c_local,
-	                                cntx, cntl );
-	if ( status == BLIS_SUCCESS ) return;
-#endif
-#endif
-
-	// If alpha is zero, scale by beta and return.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
-	{
-		bli_scalm( beta, c );
-		return;
-	}
-
-	// An optimization: If C is stored by rows and the micro-kernel prefers
-	// contiguous columns, or if C is stored by columns and the micro-kernel
-	// prefers contiguous rows, transpose the entire operation to allow the
-	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
-	{
-		bli_obj_induce_trans( &c_local );
-	}
-
-	// Set the pack schemas within the objects.
-	bli_l3_set_schemas( &a_local, &at_local, &c_local, cntx );
-
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_SYRK,
-	  BLIS_LEFT, // ignored for her[2]k/syr[2]k
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
-	// Invoke the internal back-end.
-	bli_l3_thread_decorator
-	(
-	  bli_gemm_int,
-	  BLIS_HERK, // operation family id
-	  alpha,
-	  &a_local,
-	  &at_local,
-	  beta,
-	  &c_local,
-	  cntx,
-	  rntm,
-	  cntl
-	);
-}
-
diff --git a/frame/3/syrk/bli_syrk_front.h b/frame/3/syrk/bli_syrk_front.h
deleted file mode 100644
index bf8d26a52..000000000
--- a/frame/3/syrk/bli_syrk_front.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_syrk_front
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     );
-
-#ifdef BLIS_ENABLE_SMALL_MATRIX
-err_t bli_syrk_small
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl
-     );
-#endif
-
diff --git a/frame/base/bli_check.c b/frame/base/bli_check.c
index 78d139e6b..e76314036 100644
--- a/frame/base/bli_check.c
+++ b/frame/base/bli_check.c
@@ -819,22 +819,26 @@ err_t bli_check_if_exhausted_pool( pool_t* pool )
 	return e_val;
 }
 
-err_t bli_check_sufficient_stack_buf_size( num_t dt, cntx_t* cntx )
+err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx )
 {
 	err_t e_val = BLIS_SUCCESS;
+	num_t dt;
 
-	dim_t mr      = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
-	dim_t nr      = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
-	siz_t dt_size = bli_dt_size( dt );
-
-	// NOTE: For induced methods, we use the size of the complex datatypes
-	// (rather than the size of the native micro-kernels' datatype) because
-	// the macro-kernel needs this larger micro-tile footprint, even if the
-	// virtual micro-kernel implementation will only ever be writing to half
-	// of it (real or imaginary part) at a time.
-
-	if ( mr * nr * dt_size > BLIS_STACK_BUF_MAX_SIZE )
-		e_val = BLIS_INSUFFICIENT_STACK_BUF_SIZE;
+	for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
+	{
+		dim_t mr      = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
+		dim_t nr      = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
+		siz_t dt_size = bli_dt_size( dt );
+
+		// NOTE: For induced methods, we use the size of the complex datatypes
+		// (rather than the size of the native micro-kernels' datatype) because
+		// the macro-kernel needs this larger micro-tile footprint, even if the
+		// virtual micro-kernel implementation will only ever be writing to half
+		// of it (real or imaginary part) at a time.
+
+		if ( mr * nr * dt_size > BLIS_STACK_BUF_MAX_SIZE )
+			e_val = BLIS_INSUFFICIENT_STACK_BUF_SIZE;
+	}
 
 	return e_val;
 }
diff --git a/frame/base/bli_check.h b/frame/base/bli_check.h
index 70ec2fd8f..276d27689 100644
--- a/frame/base/bli_check.h
+++ b/frame/base/bli_check.h
@@ -103,7 +103,7 @@ err_t bli_check_valid_malloc_buf( void* ptr );
 
 err_t bli_check_valid_packbuf( packbuf_t buf_type );
 err_t bli_check_if_exhausted_pool( pool_t* pool );
-err_t bli_check_sufficient_stack_buf_size( num_t dt, cntx_t* cntx );
+err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx );
 err_t bli_check_alignment_is_power_of_two( size_t align_size );
 err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size );
 
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index c250191fc..0a5bcafd4 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -449,6 +449,11 @@ void bli_gks_register_cntx
 	e_val = bli_check_valid_mc_mod_mult( mc, nr ); bli_check_error_code( e_val );
 	e_val = bli_check_valid_nc_mod_mult( nc, mr ); bli_check_error_code( e_val );
 #endif
+
+	// Verify that the maximum stack buffer size defined at configure-time is
+	// sufficiently large relative to the register blocksizes in the context.
+	e_val = bli_check_sufficient_stack_buf_size( gks_id_nat );
+	bli_check_error_code( e_val );
 }
 
 // -----------------------------------------------------------------------------
diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c
index fa7901583..8a3dcd30a 100644
--- a/frame/base/bli_info.c
+++ b/frame/base/bli_info.c
@@ -180,12 +180,13 @@ char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt )
 // -- BLIS implementation query (level-3) --------------------------------------
 
 char* bli_info_get_gemm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_GEMM,  dt ); }
+char* bli_info_get_gemmt_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
 char* bli_info_get_hemm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_HEMM,  dt ); }
-char* bli_info_get_herk_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_HERK,  dt ); }
-char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_HER2K, dt ); }
+char* bli_info_get_herk_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
+char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
 char* bli_info_get_symm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_SYMM,  dt ); }
-char* bli_info_get_syrk_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_SYRK,  dt ); }
-char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYR2K, dt ); }
+char* bli_info_get_syrk_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
+char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
 char* bli_info_get_trmm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM,  dt ); }
 char* bli_info_get_trmm3_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM3, dt ); }
 char* bli_info_get_trsm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_TRSM,  dt ); }
diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h
index d900ca4f5..99c7d000d 100644
--- a/frame/base/bli_info.h
+++ b/frame/base/bli_info.h
@@ -91,6 +91,7 @@ BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t
 // -- BLIS implementation query (level-3) --------------------------------------
 
 BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt );
+BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt );
 BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt );
 BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt );
 BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt );
diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c
index da7643eb6..95587e4a7 100644
--- a/frame/base/bli_part.c
+++ b/frame/base/bli_part.c
@@ -266,7 +266,7 @@ void bli_acquire_mpart_mdim
 	// diagonal, then set the subpartition structure to "general"; otherwise
 	// we let the subpartition inherit the storage structure of its immediate
 	// parent.
-	if ( !bli_obj_root_is_general( sub_obj ) && 
+	if ( !bli_obj_root_is_general( sub_obj ) &&
 	      bli_obj_is_outside_diag( sub_obj ) )
 	{
 		// NOTE: This comment may be out-of-date since we now distinguish
@@ -274,10 +274,10 @@ void bli_acquire_mpart_mdim
 		// Note that we cannot mark the subpartition object as general/dense
 		// here since it makes sense to preserve the existing uplo information
 		// a while longer so that the correct kernels are invoked. (Example:
-		// incremental packing/computing in herk produces subpartitions that
+		// incremental packing/computing in gemmt produces subpartitions that
 		// appear general/dense, but their uplo fields are needed to be either
 		// lower or upper, to determine which macro-kernel gets called in the
-		// herk_int() back-end.)
+		// gemmt_int() back-end.)
 
 		// If the subpartition lies entirely in an "unstored" triangle of the
 		// root matrix, then we need to tweak the subpartition. If the root
@@ -489,7 +489,7 @@ void bli_acquire_mpart_ndim
 	// diagonal), and the subpartition does not intersect the root matrix's
 	// diagonal, then we might need to modify some of the subpartition's
 	// properties, depending on its structure type.
-	if ( !bli_obj_root_is_general( sub_obj ) && 
+	if ( !bli_obj_root_is_general( sub_obj ) &&
 	      bli_obj_is_outside_diag( sub_obj ) )
 	{
 		// NOTE: This comment may be out-of-date since we now distinguish
@@ -497,10 +497,10 @@ void bli_acquire_mpart_ndim
 		// Note that we cannot mark the subpartition object as general/dense
 		// here since it makes sense to preserve the existing uplo information
 		// a while longer so that the correct kernels are invoked. (Example:
-		// incremental packing/computing in herk produces subpartitions that
+		// incremental packing/computing in gemmt produces subpartitions that
 		// appear general/dense, but their uplo fields are needed to be either
 		// lower or upper, to determine which macro-kernel gets called in the
-		// herk_int() back-end.)
+		// gemmt_int() back-end.)
 
 		// If the subpartition lies entirely in an "unstored" triangle of the
 		// root matrix, then we need to tweak the subpartition. If the root
@@ -742,7 +742,7 @@ void bli_acquire_mpart_mndim
 	// diagonal, then set the subpartition structure to "general"; otherwise
 	// we let the subpartition inherit the storage structure of its immediate
 	// parent.
-	if ( !bli_obj_root_is_general( sub_obj ) && 
+	if ( !bli_obj_root_is_general( sub_obj ) &&
 	     req_part != BLIS_SUBPART00 &&
 	     req_part != BLIS_SUBPART11 &&
 	     req_part != BLIS_SUBPART22 )
@@ -762,10 +762,10 @@ void bli_acquire_mpart_mndim
 		// Note that we cannot mark the subpartition object as general/dense
 		// here since it makes sense to preserve the existing uplo information
 		// a while longer so that the correct kernels are invoked. (Example:
-		// incremental packing/computing in herk produces subpartitions that
+		// incremental packing/computing in gemmt produces subpartitions that
 		// appear general/dense, but their uplo fields are needed to be either
 		// lower or upper, to determine which macro-kernel gets called in the
-		// herk_int() back-end.)
+		// gemmt_int() back-end.)
 
 		// If the subpartition lies entirely in an "unstored" triangle of the
 		// root matrix, then we need to tweak the subpartition. If the root
diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c
index 9ebd47de1..6dc4f9141 100644
--- a/frame/thread/bli_thread.c
+++ b/frame/thread/bli_thread.c
@@ -678,7 +678,7 @@ siz_t bli_thread_range_mdim
 	// structured matrix, even though they represent part of that matrix
 	// that will be dense and full (after packing).
 	if      ( family == BLIS_GEMM ) { x = a; use_weighted = FALSE; }
-	else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE;  }
+	else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE;  }
 	else if ( family == BLIS_TRMM ) { x = a; use_weighted = TRUE;  }
 	else    /*family == BLIS_TRSM*/ { x = a; use_weighted = FALSE; }
 
@@ -737,7 +737,7 @@ siz_t bli_thread_range_ndim
 	// structured matrix, even though they represent part of that matrix
 	// that will be dense and full (after packing).
 	if      ( family == BLIS_GEMM ) { x = b; use_weighted = FALSE; }
-	else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE;  }
+	else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE;  }
 	else if ( family == BLIS_TRMM ) { x = b; use_weighted = TRUE;  }
 	else    /*family == BLIS_TRSM*/ { x = b; use_weighted = FALSE; }
 
diff --git a/kernels/zen/3/bli_syrk_small.c b/kernels/zen/3/bli_gemmt_small.c
similarity index 99%
rename from kernels/zen/3/bli_syrk_small.c
rename to kernels/zen/3/bli_gemmt_small.c
index 23d47298c..f2fd88de7 100644
--- a/kernels/zen/3/bli_syrk_small.c
+++ b/kernels/zen/3/bli_gemmt_small.c
@@ -52,9 +52,9 @@ static float C_pack[F_SCRATCH_DIM]  __attribute__((aligned(64)));
 #define D_SCRATCH_DIM (D_BLIS_SMALL_MATRIX_THRES * D_BLIS_SMALL_MATRIX_THRES)
 static double D_A_pack[D_SCRATCH_DIM]  __attribute__((aligned(64)));
 static double D_C_pack[D_SCRATCH_DIM]  __attribute__((aligned(64)));
-#define BLIS_ATBN_M_THRES 40 // Threshold value of M for/below which small matrix code is called. 
-#define AT_MR 4 // The kernel dimension of the A transpose SYRK kernel.(AT_MR * NR).
-static err_t bli_ssyrk_small
+#define BLIS_ATBN_M_THRES 40 // Threshold value of M for/below which small matrix code is called.
+#define AT_MR 4 // The kernel dimension of the A transpose GEMMT kernel.(AT_MR * NR).
+static err_t bli_sgemmt_small
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -65,7 +65,7 @@ static err_t bli_ssyrk_small
        cntl_t* cntl
      );
 
-static err_t bli_dsyrk_small
+static err_t bli_dgemmt_small
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -76,7 +76,7 @@ static err_t bli_dsyrk_small
        cntl_t* cntl
      );
 
-static err_t bli_ssyrk_small_atbn
+static err_t bli_sgemmt_small_atbn
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -87,7 +87,7 @@ static err_t bli_ssyrk_small_atbn
        cntl_t* cntl
      );
 
-static err_t bli_dsyrk_small_atbn
+static err_t bli_dgemmt_small_atbn
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -98,11 +98,11 @@ static err_t bli_dsyrk_small_atbn
        cntl_t* cntl
      );
 /*
-* The bli_syrk_small function will use the
+* The bli_gemmt_small function will use the
 * custom MRxNR kernels, to perform the computation.
 * The custom kernels are used if the [M * N] < 240 * 240
 */
-err_t bli_syrk_small
+err_t bli_gemmt_small
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -113,20 +113,20 @@ err_t bli_syrk_small
        cntl_t* cntl
      )
 {
-	// FGVZ: This code was originally in bli_syrk_front(). However, it really
-	// fits more naturally here within the bli_syrk_small() function. This
+	// FGVZ: This code was originally in bli_gemmt_front(). However, it really
+	// fits more naturally here within the bli_gemmt_small() function. This
 	// becomes a bit more obvious now that the code is here, as it contains
-	// cpp macros such as BLIS_SMALL_MATRIX_A_THRES_M_SYRK, which are specific
+	// cpp macros such as BLIS_SMALL_MATRIX_A_THRES_M_GEMMT, which are specific
 	// to this implementation.
 	if ( bli_obj_has_trans( a ) )
 	{
 		// Continue with small implementation.
 		;
 	}
-	else if ( ( bli_obj_length( a ) <= BLIS_SMALL_MATRIX_A_THRES_M_SYRK &&
-	            bli_obj_width( a )  <  BLIS_SMALL_MATRIX_A_THRES_N_SYRK ) ||
-	          ( bli_obj_length( a ) <  BLIS_SMALL_MATRIX_A_THRES_M_SYRK &&
-	            bli_obj_width( a )  <= BLIS_SMALL_MATRIX_A_THRES_N_SYRK ) )
+	else if ( ( bli_obj_length( a ) <= BLIS_SMALL_MATRIX_A_THRES_M_GEMMT &&
+	            bli_obj_width( a )  <  BLIS_SMALL_MATRIX_A_THRES_N_GEMMT ) ||
+	          ( bli_obj_length( a ) <  BLIS_SMALL_MATRIX_A_THRES_M_GEMMT &&
+	            bli_obj_width( a )  <= BLIS_SMALL_MATRIX_A_THRES_N_GEMMT ) )
 	{
 		// Continue with small implementation.
 		;
@@ -162,11 +162,11 @@ err_t bli_syrk_small
         {
             if (dt == BLIS_FLOAT)
             {
-                return bli_ssyrk_small_atbn(alpha, a, b, beta, c, cntx, cntl);
+                return bli_sgemmt_small_atbn(alpha, a, b, beta, c, cntx, cntl);
             }
             else if (dt == BLIS_DOUBLE)
             {
-                return bli_dsyrk_small_atbn(alpha, a, b, beta, c, cntx, cntl);
+                return bli_dgemmt_small_atbn(alpha, a, b, beta, c, cntx, cntl);
             }
         }
 
@@ -175,19 +175,19 @@ err_t bli_syrk_small
 
     if (dt == BLIS_DOUBLE)
     {
-        return bli_dsyrk_small(alpha, a, b, beta, c, cntx, cntl);
+        return bli_dgemmt_small(alpha, a, b, beta, c, cntx, cntl);
     }
 
     if (dt == BLIS_FLOAT)
     {
-        return bli_ssyrk_small(alpha, a, b, beta, c, cntx, cntl);
+        return bli_sgemmt_small(alpha, a, b, beta, c, cntx, cntl);
     }
 
     return BLIS_NOT_YET_IMPLEMENTED;
 };
 
 
-static err_t bli_ssyrk_small
+static err_t bli_sgemmt_small
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -240,7 +240,7 @@ static err_t bli_ssyrk_small
         beta_cast = (beta->buffer);
         int required_packing_A = 1;
 
-        // when N is equal to 1 call GEMV instead of SYRK
+        // when N is equal to 1 call GEMV instead of GEMMT
         if (N == 1)
         {
             bli_gemv
@@ -1584,7 +1584,7 @@ static err_t bli_ssyrk_small
                 }
             }
         }
-        
+
         //copy/compute sryk values back to C using SIMD
         if ( bli_seq0( *beta_cast ) )
         {//just copy in case of beta = 0
@@ -1673,7 +1673,7 @@ static err_t bli_ssyrk_small
                 _i = 0;
                 for ( _l = 0; _l < k; _l++ )
                 {
-                    ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC));     
+                    ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC));
                     ymm0 = _mm256_loadu_ps((C + _i*rsc));
                     ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0);
                     _mm256_storeu_ps((matCbuf + _i*rs_matC), ymm0);
@@ -1703,11 +1703,11 @@ static err_t bli_ssyrk_small
                     _l = 0;
                     while ( _l < k )
                     {
-                        ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC));       
+                        ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC));
                         ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc));
                         ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0);
                         _mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0);
-                        
+
                         _i += 8;
                         _l++;
                     }
@@ -1729,8 +1729,8 @@ static err_t bli_ssyrk_small
                     _i = 0;
                     _l = 0;
                     while ( _l < k )
-                    {                                   
-                        ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC));       
+                    {
+                        ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC));
                         ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc));
                         ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0);
                         _mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0);
@@ -1747,7 +1747,7 @@ static err_t bli_ssyrk_small
                 }
             }
         }
-        
+
         return BLIS_SUCCESS;
     }
     else
@@ -1756,7 +1756,7 @@ static err_t bli_ssyrk_small
 
 };
 
-static err_t bli_dsyrk_small
+static err_t bli_dgemmt_small
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -1810,7 +1810,7 @@ static err_t bli_dsyrk_small
         beta_cast = (beta->buffer);
         int required_packing_A = 1;
 
-        // when N is equal to 1 call GEMV instead of SYRK
+        // when N is equal to 1 call GEMV instead of GEMMT
         if (N == 1)
         {
             bli_gemv
@@ -3154,7 +3154,7 @@ static err_t bli_dsyrk_small
                 }
             }
         }
-        
+
         //copy/compute sryk values back to C using SIMD
         if ( bli_seq0( *beta_cast ) )
         {//just copy for beta = 0
@@ -3195,7 +3195,7 @@ static err_t bli_dsyrk_small
                     {
                         ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc));
                         _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0);
-                        
+
                         _i += 4;
                         _l++;
                     }
@@ -3243,7 +3243,7 @@ static err_t bli_dsyrk_small
                 _i = 0;
                 for ( _l = 0; _l < k; _l++ )
                 {
-                    ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC));     
+                    ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC));
                     ymm0 = _mm256_loadu_pd((C + _i*rsc));
                     ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0);
                     _mm256_storeu_pd((matCbuf + _i*rs_matC), ymm0);
@@ -3273,7 +3273,7 @@ static err_t bli_dsyrk_small
                     _l = 0;
                     while ( _l < k )
                     {
-                        ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC));       
+                        ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC));
                         ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc));
                         ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0);
                         _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0);
@@ -3299,8 +3299,8 @@ static err_t bli_dsyrk_small
                     _i = 0;
                     _l = 0;
                     while ( _l < k )
-                    {                                   
-                        ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC));       
+                    {
+                        ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC));
                         ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc));
                         ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0);
                         _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0);
@@ -3317,7 +3317,7 @@ static err_t bli_dsyrk_small
                 }
             }
         }
-        
+
         return BLIS_SUCCESS;
     }
     else
@@ -3326,7 +3326,7 @@ static err_t bli_dsyrk_small
 
 };
 
-static err_t bli_ssyrk_small_atbn
+static err_t bli_sgemmt_small_atbn
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -3364,7 +3364,7 @@ static err_t bli_ssyrk_small_atbn
     alpha_cast = (alpha->buffer);
     beta_cast = (beta->buffer);
 
-    // The non-copy version of the A^T SYRK gives better performance for the small M cases.
+    // The non-copy version of the A^T GEMMT gives better performance for the small M cases.
     // The threshold is controlled by BLIS_ATBN_M_THRES
     if (M <= BLIS_ATBN_M_THRES)
     {
@@ -3715,7 +3715,7 @@ static err_t bli_ssyrk_small_atbn
                 }
             }
         }
-        
+
         //copy/compute sryk values back to C
         if ( bli_seq0( *beta_cast ) ) //when beta is 0, just copy result to C
         {
@@ -3774,7 +3774,7 @@ static err_t bli_ssyrk_small_atbn
         return BLIS_NONCONFORMAL_DIMENSIONS;
 }
 
-static err_t bli_dsyrk_small_atbn
+static err_t bli_dgemmt_small_atbn
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -3812,7 +3812,7 @@ static err_t bli_dsyrk_small_atbn
     alpha_cast = (alpha->buffer);
     beta_cast = (beta->buffer);
 
-    // The non-copy version of the A^T SYRK gives better performance for the small M cases.
+    // The non-copy version of the A^T GEMMT gives better performance for the small M cases.
     // The threshold is controlled by BLIS_ATBN_M_THRES
     if (M <= BLIS_ATBN_M_THRES)
     {
@@ -3968,7 +3968,7 @@ static err_t bli_dsyrk_small_atbn
                 result *= (*alpha_cast);
                 tC[3] = result/* + tC[3] * (*beta_cast)*/;
 
-      
+
                 tC += ldc;
                 ymm6 = _mm256_hadd_pd(ymm6, ymm6);
                 _mm256_storeu_pd(scratch, ymm6);
@@ -4199,7 +4199,7 @@ static err_t bli_dsyrk_small_atbn
                 }
             }
         }
-        
+
         return BLIS_SUCCESS;
     }
     else
diff --git a/sandbox/gemmlike/bls_gemm_check.c b/sandbox/gemmlike/bls_gemm_check.c
index bd6c2647e..369017338 100644
--- a/sandbox/gemmlike/bls_gemm_check.c
+++ b/sandbox/gemmlike/bls_gemm_check.c
@@ -99,11 +99,6 @@ void bls_gemm_check
 	e_val = bli_check_object_buffer( c );
 	bli_check_error_code( e_val );
 
-	// Check for sufficiently sized stack buffers
-
-	e_val = bli_check_sufficient_stack_buf_size( bli_obj_dt( a ), cntx );
-	bli_check_error_code( e_val );
-
 	// Check object dimensions.
 
 	e_val = bli_check_level3_dims( a, b, c );

From 7bc8ab485e89cfc6032932e57929e208a28f4be5 Mon Sep 17 00:00:00 2001
From: Meghana-vankadari <74656386+Meghana-vankadari@users.noreply.github.com>
Date: Fri, 12 Nov 2021 04:16:14 +0530
Subject: [PATCH 002/230] Added BLAS/CBLAS APIs for axpby, gemm_batch. (#566)

Details:
- Expanded the BLAS compatibility layer to include support for
  ?axpby_() and ?gemm_batch_(). The former is a straightforward
  BLAS-like interface into the axpbyv operation while the latter
  implements a batched gemm via loops over bli_?gemm(). Also
  expanded the CBLAS compatibility layer to include support for
  cblas_?axpby() and cblas_?gemm_batch(), which serve as wrappers to
  the corresponding (new) BLAS-like APIs. Thanks to Meghana Vankadari
  for submitting these new APIs via #566.
- Fixed a long-standing bug in common.mk that for some reason never
  manifested until now. Previously, CBLAS source files were compiled
  *without* the location of cblas.h being specified via a -I flag.
  I'm not sure why this worked, but it may be due to the fact that
  the cblas.h file resided in the same directory as all of the CBLAS
  source, and perhaps compilers implicitly add a -I flag for the
  directory that corresponds to the location of the source file being
  compiled. This bug only showed up because some CBLAS-like source code
  was moved into an 'extra' subdirectory of the frame/compat/cblas/src
  directory. After moving the code, compilation for those files failed
  (because the cblas.h header file, presumably, could not be found in
  the same location). This bug was fixed within common.mk by explicitly
  adding the cblas.h directory to the list of -I flags passed to the
  compiler.
- Added test_axpbyv.c and test_gemm_batch.c files to 'test' directory,
  and updated test/Makefile to build those drivers.
- Fixed typo in error message string in cblas_sgemm.c.
---
 common.mk                                     |  14 +-
 frame/compat/bli_blas.h                       |   6 +
 frame/compat/cblas/src/cblas.h                |  57 ++
 frame/compat/cblas/src/cblas_f77.h            |  21 +-
 frame/compat/cblas/src/cblas_sgemm.c          |  30 +-
 frame/compat/cblas/src/extra/cblas_caxpby.c   |  27 +
 .../cblas/src/extra/cblas_cgemm_batch.c       | 168 +++++
 frame/compat/cblas/src/extra/cblas_daxpby.c   |  26 +
 .../cblas/src/extra/cblas_dgemm_batch.c       | 168 +++++
 frame/compat/cblas/src/extra/cblas_saxpby.c   |  28 +
 .../cblas/src/extra/cblas_sgemm_batch.c       | 168 +++++
 frame/compat/cblas/src/extra/cblas_zaxpby.c   |  27 +
 .../cblas/src/extra/cblas_zgemm_batch.c       | 168 +++++
 frame/compat/extra/bla_axpby.c                |  89 +++
 frame/compat/extra/bla_axpby.h                |  54 ++
 frame/compat/extra/bla_gemm_batch.c           | 254 ++++++++
 frame/compat/extra/bla_gemm_batch.h           |  61 ++
 test/Makefile                                 |   8 +-
 test/test_axpbyv.c                            | 293 +++++++++
 test/test_gemm_batch.c                        | 584 ++++++++++++++++++
 20 files changed, 2226 insertions(+), 25 deletions(-)
 create mode 100644 frame/compat/cblas/src/extra/cblas_caxpby.c
 create mode 100644 frame/compat/cblas/src/extra/cblas_cgemm_batch.c
 create mode 100644 frame/compat/cblas/src/extra/cblas_daxpby.c
 create mode 100644 frame/compat/cblas/src/extra/cblas_dgemm_batch.c
 create mode 100644 frame/compat/cblas/src/extra/cblas_saxpby.c
 create mode 100644 frame/compat/cblas/src/extra/cblas_sgemm_batch.c
 create mode 100644 frame/compat/cblas/src/extra/cblas_zaxpby.c
 create mode 100644 frame/compat/cblas/src/extra/cblas_zgemm_batch.c
 create mode 100644 frame/compat/extra/bla_axpby.c
 create mode 100644 frame/compat/extra/bla_axpby.h
 create mode 100644 frame/compat/extra/bla_gemm_batch.c
 create mode 100644 frame/compat/extra/bla_gemm_batch.h
 create mode 100644 test/test_axpbyv.c
 create mode 100644 test/test_gemm_batch.c

diff --git a/common.mk b/common.mk
index 2da306d79..90c3da83f 100644
--- a/common.mk
+++ b/common.mk
@@ -1009,9 +1009,11 @@ BLIS_H_FLAT     := $(BASE_INC_PATH)/$(BLIS_H)
 #
 
 # Isolate the path to cblas.h by filtering the file from the list of framework
-# header files.
+# header files, and then strip the filename to obtain the directory in which
+# cblas.h resides.
 CBLAS_H          := cblas.h
 CBLAS_H_SRC_PATH := $(filter %/$(CBLAS_H), $(FRAME_H99_FILES))
+CBLAS_H_DIRPATH  := $(dir $(CBLAS_H_SRC_PATH))
 
 # Construct the path to what will be the intermediate flattened/monolithic
 # cblas.h file.
@@ -1037,7 +1039,8 @@ REF_KER_H_PATHS := $(strip $(foreach header, $(REF_KER_HEADERS), \
                                               $(FRAME_H99_FILES)))))
 
 # Add -I to each header path so we can specify our include search paths to the
-# C compiler. Then add frame/include since it's needed for bli_oapi_w[o]_cntx.h.
+# C compiler. Then add frame/include since it's needed when compiling source
+# files that #include bli_oapi_ba.h or bli_oapi_ex.h.
 REF_KER_I_PATHS := $(strip $(patsubst %, -I%, $(REF_KER_H_PATHS)))
 REF_KER_I_PATHS += -I$(DIST_PATH)/frame/include
 
@@ -1046,6 +1049,13 @@ REF_KER_I_PATHS += -I$(DIST_PATH)/frame/include
 # now #include the monolithic/flattened blis.h instead.
 CINCFLAGS       := -I$(BASE_INC_PATH) $(REF_KER_I_PATHS)
 
+# If CBLAS is enabled, we also include the path to the cblas.h directory so
+# that the compiler will be able to find cblas.h as the CBLAS source code is
+# being compiled.
+ifeq ($(MK_ENABLE_CBLAS),yes)
+CINCFLAGS       += -I$(CBLAS_H_DIRPATH)
+endif
+
 # Obtain a list of header paths in the configured sandbox. Then add -I to each
 # header path.
 CSBOXINCFLAGS   := $(strip $(patsubst %, -I%, $(SANDBOX_HDR_DIRPATHS)))
diff --git a/frame/compat/bli_blas.h b/frame/compat/bli_blas.h
index 1ce976453..a65953c11 100644
--- a/frame/compat/bli_blas.h
+++ b/frame/compat/bli_blas.h
@@ -113,6 +113,7 @@
 #include "bla_amax.h"
 #include "bla_asum.h"
 #include "bla_axpy.h"
+#include "bla_axpby.h"
 #include "bla_copy.h"
 #include "bla_dot.h"
 #include "bla_nrm2.h"
@@ -199,6 +200,11 @@
 #include "bla_trsm_check.h"
 #include "bla_gemmt_check.h"
 
+// -- Batch prototypes --
+
+#include "bla_gemm_batch.h"
+
+
 // -- Fortran-compatible APIs to BLIS functions --
 
 #include "b77_thread.h"
diff --git a/frame/compat/cblas/src/cblas.h b/frame/compat/cblas/src/cblas.h
index 85e24674d..cee74233c 100644
--- a/frame/compat/cblas/src/cblas.h
+++ b/frame/compat/cblas/src/cblas.h
@@ -1,3 +1,4 @@
+
 #ifndef CBLAS_H
 #define CBLAS_H
 #include <stddef.h>
@@ -595,6 +596,62 @@ void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
 
 void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...);
 
+
+/*
+ * ===========================================================================
+ * BLAS Extension prototypes
+ * ===========================================================================
+ */
+
+// -- APIs to operations unique to BLIS --
+
+void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X,
+                 f77_int incX, float beta, float *Y, f77_int incY);
+void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X,
+                 f77_int incX, double beta, double *Y, f77_int incY);
+void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha,
+                const void *X, f77_int incX, const void* beta,
+                void *Y, f77_int incY);
+void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha,
+                const void *X, f77_int incX, const void *beta,
+                void *Y, f77_int incY);
+
+// -- Batch APIs --
+
+void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order,
+                 enum CBLAS_TRANSPOSE *TransA_array,
+                 enum CBLAS_TRANSPOSE *TransB_array,
+                 f77_int *M_array, f77_int *N_array,
+                 f77_int *K_array, const float *alpha_array, const float **A,
+                 f77_int *lda_array, const float **B, f77_int *ldb_array,
+                 const float *beta_array, float **C, f77_int *ldc_array,
+                 f77_int group_count, f77_int *group_size);
+void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order,
+                 enum CBLAS_TRANSPOSE *TransA_array,
+                 enum CBLAS_TRANSPOSE *TransB_array,
+                 f77_int *M_array, f77_int *N_array,
+                 f77_int *K_array, const double *alpha_array,
+                 const double **A,f77_int *lda_array,
+                 const double **B, f77_int *ldb_array,
+                 const double *beta_array, double **C, f77_int *ldc_array,
+                 f77_int group_count, f77_int *group_size);
+void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order,
+                 enum CBLAS_TRANSPOSE *TransA_array,
+                 enum CBLAS_TRANSPOSE *TransB_array,
+                 f77_int *M_array, f77_int *N_array,
+                 f77_int *K_array, const void *alpha_array, const void **A,
+                 f77_int *lda_array, const void **B, f77_int *ldb_array,
+                 const void *beta_array, void **C, f77_int *ldc_array,
+                 f77_int group_count, f77_int *group_size);
+void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order,
+                 enum CBLAS_TRANSPOSE *TransA_array,
+                 enum CBLAS_TRANSPOSE *TransB_array,
+                 f77_int *M_array, f77_int *N_array,
+                 f77_int *K_array, const void *alpha_array, const void **A,
+                 f77_int *lda_array, const void **B, f77_int *ldb_array,
+                 const void *beta_array, void **C, f77_int *ldc_array,
+                 f77_int group_count, f77_int *group_size);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/frame/compat/cblas/src/cblas_f77.h b/frame/compat/cblas/src/cblas_f77.h
index 5e94fdf2c..e534d2054 100644
--- a/frame/compat/cblas/src/cblas_f77.h
+++ b/frame/compat/cblas/src/cblas_f77.h
@@ -14,7 +14,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -200,9 +200,20 @@
 /*
 * BLAS extensions
 */
-#define F77_sgemmt sgemmt_
-#define F77_dgemmt dgemmt_
-#define F77_cgemmt cgemmt_
-#define F77_zgemmt zgemmt_
+#define F77_sgemmt  sgemmt_
+#define F77_dgemmt  dgemmt_
+#define F77_cgemmt  cgemmt_
+#define F77_zgemmt  zgemmt_
+
+#define F77_saxpby  saxpby_
+#define F77_daxpby  daxpby_
+#define F77_caxpby  caxpby_
+#define F77_zaxpby  zaxpby_
+
+#define F77_sgemm_batch  sgemm_batch_
+#define F77_dgemm_batch  dgemm_batch_
+#define F77_cgemm_batch  cgemm_batch_
+#define F77_zgemm_batch  zgemm_batch_
+
 
 #endif /*  CBLAS_F77_H */
diff --git a/frame/compat/cblas/src/cblas_sgemm.c b/frame/compat/cblas/src/cblas_sgemm.c
index 89d0f07a8..bf40b9c0d 100644
--- a/frame/compat/cblas/src/cblas_sgemm.c
+++ b/frame/compat/cblas/src/cblas_sgemm.c
@@ -7,6 +7,8 @@
  * Written by Keita Teranishi
  * 4/8/1998
  *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+ *
  */
 
 #include "cblas.h"
@@ -17,12 +19,12 @@ void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
                  f77_int lda, const float  *B, f77_int ldb,
                  float beta, float  *C, f77_int ldc)
 {
-   char TA, TB;   
+   char TA, TB;
 #ifdef F77_CHAR
    F77_CHAR F77_TA, F77_TB;
 #else
-   #define F77_TA &TA  
-   #define F77_TB &TB  
+   #define F77_TA &TA
+   #define F77_TB &TB
 #endif
 
 #ifdef F77_INT
@@ -36,7 +38,7 @@ void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
    #define F77_ldb ldb
    #define F77_ldc ldc
 #endif
-   
+
    extern int CBLAS_CallFromC;
    extern int RowMajorStrg;
    RowMajorStrg = 0;
@@ -46,9 +48,9 @@ void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
       if(TransA == CblasTrans) TA='T';
       else if ( TransA == CblasConjTrans ) TA='C';
       else if ( TransA == CblasNoTrans )   TA='N';
-      else 
+      else
       {
-         cblas_xerbla(2, "cblas_sgemm", 
+         cblas_xerbla(2, "cblas_sgemm",
                        "Illegal TransA setting, %d\n", TransA);
          CBLAS_CallFromC = 0;
          RowMajorStrg = 0;
@@ -58,9 +60,9 @@ void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
       if(TransB == CblasTrans) TB='T';
       else if ( TransB == CblasConjTrans ) TB='C';
       else if ( TransB == CblasNoTrans )   TB='N';
-      else 
+      else
       {
-         cblas_xerbla(3, "cblas_sgemm", 
+         cblas_xerbla(3, "cblas_sgemm",
                        "Illegal TransB setting, %d\n", TransB);
          CBLAS_CallFromC = 0;
          RowMajorStrg = 0;
@@ -79,9 +81,9 @@ void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
       if(TransA == CblasTrans) TB='T';
       else if ( TransA == CblasConjTrans ) TB='C';
       else if ( TransA == CblasNoTrans )   TB='N';
-      else 
+      else
       {
-         cblas_xerbla(2, "cblas_sgemm", 
+         cblas_xerbla(2, "cblas_sgemm",
                        "Illegal TransA setting, %d\n", TransA);
          CBLAS_CallFromC = 0;
          RowMajorStrg = 0;
@@ -90,10 +92,10 @@ void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
       if(TransB == CblasTrans) TA='T';
       else if ( TransB == CblasConjTrans ) TA='C';
       else if ( TransB == CblasNoTrans )   TA='N';
-      else 
+      else
       {
-         cblas_xerbla(2, "cblas_sgemm", 
-                       "Illegal TransA setting, %d\n", TransA);
+         cblas_xerbla(2, "cblas_sgemm",
+                       "Illegal TransB setting, %d\n", TransB);
          CBLAS_CallFromC = 0;
          RowMajorStrg = 0;
          return;
@@ -104,7 +106,7 @@ void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
       #endif
 
       F77_sgemm(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, &alpha, B, &F77_ldb, A, &F77_lda, &beta, C, &F77_ldc);
-   } else  
+   } else
      cblas_xerbla(1, "cblas_sgemm",
                      "Illegal Order setting, %d\n", Order);
    CBLAS_CallFromC = 0;
diff --git a/frame/compat/cblas/src/extra/cblas_caxpby.c b/frame/compat/cblas/src/extra/cblas_caxpby.c
new file mode 100644
index 000000000..e8400d91b
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_caxpby.c
@@ -0,0 +1,27 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ * cblas_caxpby.c
+ *
+ * The program is a C interface to caxpby.
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc
+ *
+ */
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_caxpby( f77_int N, const void *alpha,
+               const void *X, f77_int incX,
+               const void *beta,
+               void *Y, f77_int incY)
+{  /* C interface to caxpby: integers go by address, data pointers are cast to scomplex. */
+#ifdef F77_INT
+   F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; /* Fortran-typed copies of the integers */
+#else
+   #define F77_N N
+   #define F77_incX incX
+   #define F77_incY incY
+#endif
+   F77_caxpby( &F77_N, (const scomplex*)alpha, (const scomplex*)X, &F77_incX, (const scomplex*)beta, (scomplex*)Y, &F77_incY); /* read-only inputs keep their const */
+}
+#endif
diff --git a/frame/compat/cblas/src/extra/cblas_cgemm_batch.c b/frame/compat/cblas/src/extra/cblas_cgemm_batch.c
new file mode 100644
index 000000000..18dd0bad5
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_cgemm_batch.c
@@ -0,0 +1,168 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ *
+ * cblas_cgemm_batch.c
+ * This program is a C interface to cgemm_batch.
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ */
+
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_cgemm_batch(enum CBLAS_ORDER Order,             /* CblasColMajor or CblasRowMajor */
+                       enum CBLAS_TRANSPOSE *TransA_array, /* one entry per group */
+                       enum CBLAS_TRANSPOSE *TransB_array, /* one entry per group */
+                       f77_int *M_array, f77_int *N_array, /* one entry per group */
+                       f77_int *K_array, const void *alpha_array,
+                       const void  **A_array, f77_int *lda_array,
+                       const void  **B_array, f77_int *ldb_array,
+                       const void *beta_array,
+                       void **C_array, f77_int *ldc_array,
+                       f77_int group_count, f77_int *group_size)
+{   /* C interface to cgemm_batch: checks Order and the per-group Trans values, then calls F77_cgemm_batch. */
+    char TA[group_count], TB[group_count];
+#ifdef F77_CHAR
+    F77_CHAR F77_TA[group_count], F77_TB[group_count];
+#else
+    #define F77_TA TA
+    #define F77_TB TB
+#endif
+    /* NOTE(review): the variable-length arrays in this function require group_count > 0. */
+#ifdef F77_INT
+    F77_INT F77_GRP_COUNT = group_count;
+    F77_INT F77_M[F77_GRP_COUNT], F77_N[F77_GRP_COUNT], F77_K[F77_GRP_COUNT];
+    F77_INT F77_lda[F77_GRP_COUNT], F77_ldb[F77_GRP_COUNT], F77_ldc[F77_GRP_COUNT];
+    F77_INT F77_GRP_SIZE[F77_GRP_COUNT];
+#else
+    #define F77_GRP_COUNT group_count
+    #define F77_M M_array
+    #define F77_N N_array
+    #define F77_K K_array
+    #define F77_lda lda_array
+    #define F77_ldb ldb_array
+    #define F77_ldc ldc_array
+    #define F77_GRP_SIZE group_size
+#endif
+
+    extern int CBLAS_CallFromC;
+    extern int RowMajorStrg;
+    RowMajorStrg = 0;
+    CBLAS_CallFromC = 1;
+
+    dim_t i; /* group index, shared by both storage-order branches */
+    if( Order == CblasColMajor )
+    {
+        for(i = 0; i < group_count; i++)
+        {
+            if(TransA_array[i] == CblasTrans) TA[i]='T';
+            else if ( TransA_array[i] == CblasConjTrans ) TA[i]='C';
+            else if ( TransA_array[i] == CblasNoTrans )   TA[i]='N';
+            else
+            {
+                cblas_xerbla(2, "cblas_cgemm_batch", /* i is cast because dim_t may be wider than the int that %d expects */
+                       "Illegal TransA setting %d for group %d\n", TransA_array[i], (int)i);
+                CBLAS_CallFromC = 0;
+                RowMajorStrg = 0;
+                return;
+            }
+
+            if(TransB_array[i] == CblasTrans) TB[i]='T';
+            else if ( TransB_array[i] == CblasConjTrans ) TB[i]='C';
+            else if ( TransB_array[i] == CblasNoTrans )   TB[i]='N';
+            else
+            {
+                cblas_xerbla(3, "cblas_cgemm_batch",
+                       "Illegal TransB setting %d for group %d\n", TransB_array[i], (int)i);
+                CBLAS_CallFromC = 0;
+                RowMajorStrg = 0;
+                return;
+            }
+
+#ifdef F77_CHAR
+            F77_TA[i] = C2F_CHAR(TA+i);
+            F77_TB[i] = C2F_CHAR(TB+i);
+#endif
+
+#ifdef F77_INT
+            F77_M[i] = M_array[i];
+            F77_N[i] = N_array[i];
+            F77_K[i] = K_array[i];
+            F77_lda[i] = lda_array[i];
+            F77_ldb[i] = ldb_array[i];
+            F77_ldc[i] = ldc_array[i];
+            F77_GRP_SIZE[i] = group_size[i];
+#endif
+        }
+
+        F77_cgemm_batch(F77_TA, F77_TB,
+                        F77_M, F77_N, F77_K,
+                        (const scomplex*)alpha_array,
+                        (const scomplex**)A_array, F77_lda,
+                        (const scomplex**)B_array, F77_ldb,
+                        (const scomplex*)beta_array,
+                        (scomplex**)C_array, F77_ldc,
+                        &F77_GRP_COUNT, F77_GRP_SIZE);
+    }
+    else if (Order == CblasRowMajor)
+    {
+        RowMajorStrg = 1;
+        /* Row-major input goes to the same routine with the operands exchanged:
+           TransA fills TB, TransB fills TA, and M/N and A/B swap places in the call. */
+        for(i = 0; i < group_count; i++)
+        {
+            if(TransA_array[i] == CblasTrans) TB[i]='T';
+            else if ( TransA_array[i] == CblasConjTrans ) TB[i]='C';
+            else if ( TransA_array[i] == CblasNoTrans )   TB[i]='N';
+            else
+            {
+                cblas_xerbla(2, "cblas_cgemm_batch",
+                       "Illegal TransA setting %d for group %d\n", TransA_array[i], (int)i);
+                CBLAS_CallFromC = 0;
+                RowMajorStrg = 0;
+                return;
+            }
+            if(TransB_array[i] == CblasTrans) TA[i]='T';
+            else if ( TransB_array[i] == CblasConjTrans ) TA[i]='C';
+            else if ( TransB_array[i] == CblasNoTrans )   TA[i]='N';
+            else
+            {
+                cblas_xerbla(3, "cblas_cgemm_batch", /* TransB_array is argument 3, as in the column-major branch */
+                       "Illegal TransB setting %d for group %d\n", TransB_array[i], (int)i);
+                CBLAS_CallFromC = 0;
+                RowMajorStrg = 0;
+                return;
+            }
+
+#ifdef F77_CHAR
+            F77_TA[i] = C2F_CHAR(TA+i);
+            F77_TB[i] = C2F_CHAR(TB+i);
+#endif
+
+#ifdef F77_INT
+            F77_M[i] = M_array[i];
+            F77_N[i] = N_array[i];
+            F77_K[i] = K_array[i];
+            F77_lda[i] = lda_array[i];
+            F77_ldb[i] = ldb_array[i];
+            F77_ldc[i] = ldc_array[i];
+            F77_GRP_SIZE[i] = group_size[i];
+#endif
+        }
+
+        F77_cgemm_batch(F77_TA, F77_TB,
+                        F77_N, F77_M, F77_K,
+                        (const scomplex*)alpha_array,
+                        (const scomplex**)B_array, F77_ldb,
+                        (const scomplex**)A_array, F77_lda,
+                        (const scomplex*)beta_array,
+                        (scomplex**)C_array, F77_ldc,
+                        &F77_GRP_COUNT, F77_GRP_SIZE);
+   } else
+     cblas_xerbla(1, "cblas_cgemm_batch",
+                     "Illegal Order setting, %d\n", Order);
+   CBLAS_CallFromC = 0;
+   RowMajorStrg = 0;
+}
+#endif
diff --git a/frame/compat/cblas/src/extra/cblas_daxpby.c b/frame/compat/cblas/src/extra/cblas_daxpby.c
new file mode 100644
index 000000000..8fbea4d5a
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_daxpby.c
@@ -0,0 +1,26 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ * cblas_daxpby.c
+ *
+ * The program is a C interface to daxpby.
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc.
+ */
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_daxpby( f77_int N, double alpha,
+                   const double *X, f77_int incX,
+                   double beta, double *Y, f77_int incY )
+{  /* C interface to daxpby: scalars are passed by address, X and Y are forwarded as given. */
+#ifndef F77_INT
+   #define F77_N N
+   #define F77_incX incX
+   #define F77_incY incY
+#else
+   F77_INT F77_N=N, F77_incX=incX, F77_incY=incY;
+#endif
+   F77_daxpby( &F77_N, &alpha, X, &F77_incX,
+               &beta, Y, &F77_incY );
+}
+#endif
diff --git a/frame/compat/cblas/src/extra/cblas_dgemm_batch.c b/frame/compat/cblas/src/extra/cblas_dgemm_batch.c
new file mode 100644
index 000000000..a2bed3b1a
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_dgemm_batch.c
@@ -0,0 +1,168 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ *
+ * cblas_dgemm_batch.c
+ * This program is a C interface to dgemm_batch.
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ */
+
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_dgemm_batch(enum CBLAS_ORDER Order,             /* CblasColMajor or CblasRowMajor */
+                       enum CBLAS_TRANSPOSE *TransA_array, /* one entry per group */
+                       enum CBLAS_TRANSPOSE *TransB_array, /* one entry per group */
+                       f77_int *M_array, f77_int *N_array, /* one entry per group */
+                       f77_int *K_array, const double *alpha_array,
+                       const double  **A_array, f77_int *lda_array,
+                       const double  **B_array, f77_int *ldb_array,
+                       const double *beta_array,
+                       double **C_array, f77_int *ldc_array,
+                       f77_int group_count, f77_int *group_size)
+{   /* C interface to dgemm_batch: checks Order and the per-group Trans values, then calls F77_dgemm_batch. */
+    char TA[group_count], TB[group_count];
+#ifdef F77_CHAR
+    F77_CHAR F77_TA[group_count], F77_TB[group_count];
+#else
+    #define F77_TA TA
+    #define F77_TB TB
+#endif
+    /* NOTE(review): the variable-length arrays in this function require group_count > 0. */
+#ifdef F77_INT
+    F77_INT F77_GRP_COUNT = group_count;
+    F77_INT F77_M[F77_GRP_COUNT], F77_N[F77_GRP_COUNT], F77_K[F77_GRP_COUNT];
+    F77_INT F77_lda[F77_GRP_COUNT], F77_ldb[F77_GRP_COUNT], F77_ldc[F77_GRP_COUNT];
+    F77_INT F77_GRP_SIZE[F77_GRP_COUNT];
+#else
+    #define F77_GRP_COUNT group_count
+    #define F77_M M_array
+    #define F77_N N_array
+    #define F77_K K_array
+    #define F77_lda lda_array
+    #define F77_ldb ldb_array
+    #define F77_ldc ldc_array
+    #define F77_GRP_SIZE group_size
+#endif
+
+    extern int CBLAS_CallFromC;
+    extern int RowMajorStrg;
+    RowMajorStrg = 0;
+    CBLAS_CallFromC = 1;
+
+    dim_t i; /* group index, shared by both storage-order branches */
+    if( Order == CblasColMajor )
+    {
+        for(i = 0; i < group_count; i++)
+        {
+            if(TransA_array[i] == CblasTrans) TA[i]='T';
+            else if ( TransA_array[i] == CblasConjTrans ) TA[i]='C';
+            else if ( TransA_array[i] == CblasNoTrans )   TA[i]='N';
+            else
+            {
+                cblas_xerbla(2, "cblas_dgemm_batch", /* i is cast because dim_t may be wider than the int that %d expects */
+                       "Illegal TransA setting %d for group %d\n", TransA_array[i], (int)i);
+                CBLAS_CallFromC = 0;
+                RowMajorStrg = 0;
+                return;
+            }
+
+            if(TransB_array[i] == CblasTrans) TB[i]='T';
+            else if ( TransB_array[i] == CblasConjTrans ) TB[i]='C';
+            else if ( TransB_array[i] == CblasNoTrans )   TB[i]='N';
+            else
+            {
+                cblas_xerbla(3, "cblas_dgemm_batch",
+                       "Illegal TransB setting %d for group %d\n", TransB_array[i], (int)i);
+                CBLAS_CallFromC = 0;
+                RowMajorStrg = 0;
+                return;
+            }
+
+#ifdef F77_CHAR
+            F77_TA[i] = C2F_CHAR(TA+i);
+            F77_TB[i] = C2F_CHAR(TB+i);
+#endif
+
+#ifdef F77_INT
+            F77_M[i] = M_array[i];
+            F77_N[i] = N_array[i];
+            F77_K[i] = K_array[i];
+            F77_lda[i] = lda_array[i];
+            F77_ldb[i] = ldb_array[i];
+            F77_ldc[i] = ldc_array[i];
+            F77_GRP_SIZE[i] = group_size[i];
+#endif
+        }
+
+        F77_dgemm_batch(F77_TA, F77_TB,
+                        F77_M, F77_N, F77_K,
+                        alpha_array,
+                        A_array, F77_lda,
+                        B_array, F77_ldb,
+                        beta_array,
+                        C_array, F77_ldc,
+                        &F77_GRP_COUNT, F77_GRP_SIZE);
+    }
+    else if (Order == CblasRowMajor)
+    {
+        RowMajorStrg = 1;
+        /* Row-major input goes to the same routine with the operands exchanged:
+           TransA fills TB, TransB fills TA, and M/N and A/B swap places in the call. */
+        for(i = 0; i < group_count; i++)
+        {
+            if(TransA_array[i] == CblasTrans) TB[i]='T';
+            else if ( TransA_array[i] == CblasConjTrans ) TB[i]='C';
+            else if ( TransA_array[i] == CblasNoTrans )   TB[i]='N';
+            else
+            {
+                cblas_xerbla(2, "cblas_dgemm_batch",
+                       "Illegal TransA setting %d for group %d\n", TransA_array[i], (int)i);
+                CBLAS_CallFromC = 0;
+                RowMajorStrg = 0;
+                return;
+            }
+            if(TransB_array[i] == CblasTrans) TA[i]='T';
+            else if ( TransB_array[i] == CblasConjTrans ) TA[i]='C';
+            else if ( TransB_array[i] == CblasNoTrans )   TA[i]='N';
+            else
+            {
+                cblas_xerbla(3, "cblas_dgemm_batch", /* TransB_array is argument 3, as in the column-major branch */
+                       "Illegal TransB setting %d for group %d\n", TransB_array[i], (int)i);
+                CBLAS_CallFromC = 0;
+                RowMajorStrg = 0;
+                return;
+            }
+
+#ifdef F77_CHAR
+            F77_TA[i] = C2F_CHAR(TA+i);
+            F77_TB[i] = C2F_CHAR(TB+i);
+#endif
+
+#ifdef F77_INT
+            F77_M[i] = M_array[i];
+            F77_N[i] = N_array[i];
+            F77_K[i] = K_array[i];
+            F77_lda[i] = lda_array[i];
+            F77_ldb[i] = ldb_array[i];
+            F77_ldc[i] = ldc_array[i];
+            F77_GRP_SIZE[i] = group_size[i];
+#endif
+        }
+
+        F77_dgemm_batch(F77_TA, F77_TB,
+                        F77_N, F77_M, F77_K,
+                        alpha_array,
+                        B_array, F77_ldb,
+                        A_array, F77_lda,
+                        beta_array,
+                        C_array, F77_ldc,
+                        &F77_GRP_COUNT, F77_GRP_SIZE);
+   } else
+     cblas_xerbla(1, "cblas_dgemm_batch",
+                     "Illegal Order setting, %d\n", Order);
+   CBLAS_CallFromC = 0;
+   RowMajorStrg = 0;
+}
+#endif
diff --git a/frame/compat/cblas/src/extra/cblas_saxpby.c b/frame/compat/cblas/src/extra/cblas_saxpby.c
new file mode 100644
index 000000000..685282123
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_saxpby.c
@@ -0,0 +1,28 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ * cblas_saxpby.c
+ *
+ * The program is a C interface to saxpby.
+ * It forwards the call to the Fortran-style routine saxpby_ (F77_saxpby).
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc.
+ */
+
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_saxpby( f77_int N, float alpha,
+                   const float *X, f77_int incX,
+                   float beta, float *Y, f77_int incY )
+{  /* C interface to saxpby: scalars are passed by address, X and Y are forwarded as given. */
+#ifndef F77_INT
+   #define F77_N N
+   #define F77_incX incX
+   #define F77_incY incY
+#else
+   F77_INT F77_N=N, F77_incX=incX, F77_incY=incY;
+#endif
+   F77_saxpby( &F77_N, &alpha, X, &F77_incX,
+               &beta, Y, &F77_incY );
+}
+#endif
diff --git a/frame/compat/cblas/src/extra/cblas_sgemm_batch.c b/frame/compat/cblas/src/extra/cblas_sgemm_batch.c
new file mode 100644
index 000000000..3e8517db2
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_sgemm_batch.c
@@ -0,0 +1,168 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ *
+ * cblas_sgemm_batch.c
+ * This program is a C interface to sgemm_batch.
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ */
+
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_sgemm_batch(enum CBLAS_ORDER Order,             /* CblasColMajor or CblasRowMajor */
+                       enum CBLAS_TRANSPOSE *TransA_array, /* one entry per group */
+                       enum CBLAS_TRANSPOSE *TransB_array, /* one entry per group */
+                       f77_int *M_array, f77_int *N_array, /* one entry per group */
+                       f77_int *K_array, const float *alpha_array,
+                       const float  **A_array, f77_int *lda_array,
+                       const float  **B_array, f77_int *ldb_array,
+                       const float *beta_array,
+                       float  **C_array, f77_int *ldc_array,
+                       f77_int group_count, f77_int *group_size)
+{   /* C interface to sgemm_batch: checks Order and the per-group Trans values, then calls F77_sgemm_batch. */
+    char TA[group_count], TB[group_count];
+#ifdef F77_CHAR
+    F77_CHAR F77_TA[group_count], F77_TB[group_count];
+#else
+    #define F77_TA TA
+    #define F77_TB TB
+#endif
+    /* NOTE(review): the variable-length arrays in this function require group_count > 0. */
+#ifdef F77_INT
+    F77_INT F77_GRP_COUNT = group_count;
+    F77_INT F77_M[F77_GRP_COUNT], F77_N[F77_GRP_COUNT], F77_K[F77_GRP_COUNT];
+    F77_INT F77_lda[F77_GRP_COUNT], F77_ldb[F77_GRP_COUNT], F77_ldc[F77_GRP_COUNT];
+    F77_INT F77_GRP_SIZE[F77_GRP_COUNT];
+#else
+    #define F77_GRP_COUNT group_count
+    #define F77_M M_array
+    #define F77_N N_array
+    #define F77_K K_array
+    #define F77_lda lda_array
+    #define F77_ldb ldb_array
+    #define F77_ldc ldc_array
+    #define F77_GRP_SIZE group_size
+#endif
+
+    extern int CBLAS_CallFromC;
+    extern int RowMajorStrg;
+    RowMajorStrg = 0;
+    CBLAS_CallFromC = 1;
+
+    dim_t i; /* group index, shared by both storage-order branches */
+    if( Order == CblasColMajor )
+    {
+        for(i = 0; i < group_count; i++)
+        {
+            if(TransA_array[i] == CblasTrans) TA[i]='T';
+            else if ( TransA_array[i] == CblasConjTrans ) TA[i]='C';
+            else if ( TransA_array[i] == CblasNoTrans )   TA[i]='N';
+            else
+            {
+                cblas_xerbla(2, "cblas_sgemm_batch", /* i is cast because dim_t may be wider than the int that %d expects */
+                       "Illegal TransA setting %d for group %d\n", TransA_array[i], (int)i);
+                CBLAS_CallFromC = 0;
+                RowMajorStrg = 0;
+                return;
+            }
+
+            if(TransB_array[i] == CblasTrans) TB[i]='T';
+            else if ( TransB_array[i] == CblasConjTrans ) TB[i]='C';
+            else if ( TransB_array[i] == CblasNoTrans )   TB[i]='N';
+            else
+            {
+                cblas_xerbla(3, "cblas_sgemm_batch",
+                       "Illegal TransB setting %d for group %d\n", TransB_array[i], (int)i);
+                CBLAS_CallFromC = 0;
+                RowMajorStrg = 0;
+                return;
+            }
+
+#ifdef F77_CHAR
+            F77_TA[i] = C2F_CHAR(TA+i);
+            F77_TB[i] = C2F_CHAR(TB+i);
+#endif
+
+#ifdef F77_INT
+            F77_M[i] = M_array[i];
+            F77_N[i] = N_array[i];
+            F77_K[i] = K_array[i];
+            F77_lda[i] = lda_array[i];
+            F77_ldb[i] = ldb_array[i];
+            F77_ldc[i] = ldc_array[i];
+            F77_GRP_SIZE[i] = group_size[i];
+#endif
+        }
+
+        F77_sgemm_batch(F77_TA, F77_TB,
+                        F77_M, F77_N, F77_K,
+                        alpha_array,
+                        A_array, F77_lda,
+                        B_array, F77_ldb,
+                        beta_array,
+                        C_array, F77_ldc,
+                        &F77_GRP_COUNT, F77_GRP_SIZE);
+    }
+    else if (Order == CblasRowMajor)
+    {
+        RowMajorStrg = 1;
+        /* Row-major input goes to the same routine with the operands exchanged:
+           TransA fills TB, TransB fills TA, and M/N and A/B swap places in the call. */
+        for(i = 0; i < group_count; i++)
+        {
+            if(TransA_array[i] == CblasTrans) TB[i]='T';
+            else if ( TransA_array[i] == CblasConjTrans ) TB[i]='C';
+            else if ( TransA_array[i] == CblasNoTrans )   TB[i]='N';
+            else
+            {
+                cblas_xerbla(2, "cblas_sgemm_batch",
+                       "Illegal TransA setting %d for group %d\n", TransA_array[i], (int)i);
+                CBLAS_CallFromC = 0;
+                RowMajorStrg = 0;
+                return;
+            }
+            if(TransB_array[i] == CblasTrans) TA[i]='T';
+            else if ( TransB_array[i] == CblasConjTrans ) TA[i]='C';
+            else if ( TransB_array[i] == CblasNoTrans )   TA[i]='N';
+            else
+            {
+                cblas_xerbla(3, "cblas_sgemm_batch", /* TransB_array is argument 3, as in the column-major branch */
+                       "Illegal TransB setting %d for group %d\n", TransB_array[i], (int)i);
+                CBLAS_CallFromC = 0;
+                RowMajorStrg = 0;
+                return;
+            }
+
+#ifdef F77_CHAR
+            F77_TA[i] = C2F_CHAR(TA+i);
+            F77_TB[i] = C2F_CHAR(TB+i);
+#endif
+
+#ifdef F77_INT
+            F77_M[i] = M_array[i];
+            F77_N[i] = N_array[i];
+            F77_K[i] = K_array[i];
+            F77_lda[i] = lda_array[i];
+            F77_ldb[i] = ldb_array[i];
+            F77_ldc[i] = ldc_array[i];
+            F77_GRP_SIZE[i] = group_size[i];
+#endif
+        }
+
+        F77_sgemm_batch(F77_TA, F77_TB,
+                        F77_N, F77_M, F77_K,
+                        alpha_array,
+                        B_array, F77_ldb,
+                        A_array, F77_lda,
+                        beta_array,
+                        C_array, F77_ldc,
+                        &F77_GRP_COUNT, F77_GRP_SIZE);
+   } else
+     cblas_xerbla(1, "cblas_sgemm_batch",
+                     "Illegal Order setting, %d\n", Order);
+   CBLAS_CallFromC = 0;
+   RowMajorStrg = 0;
+}
+#endif
diff --git a/frame/compat/cblas/src/extra/cblas_zaxpby.c b/frame/compat/cblas/src/extra/cblas_zaxpby.c
new file mode 100644
index 000000000..483607ec9
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_zaxpby.c
@@ -0,0 +1,27 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ * cblas_zaxpby.c
+ *
+ * The program is a C interface to zaxpby.
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc.
+ *
+ */
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_zaxpby( f77_int N, const void *alpha,
+               const void *X, f77_int incX,
+               const void *beta,
+               void *Y, f77_int incY)
+{  /* C interface to zaxpby: integers go by address, data pointers are cast to dcomplex. */
+#ifdef F77_INT
+   F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; /* Fortran-typed copies of the integers */
+#else
+   #define F77_N N
+   #define F77_incX incX
+   #define F77_incY incY
+#endif
+   F77_zaxpby( &F77_N, (const dcomplex*)alpha, (const dcomplex*)X, &F77_incX, (const dcomplex*)beta, (dcomplex*)Y, &F77_incY); /* read-only inputs keep their const */
+}
+#endif
diff --git a/frame/compat/cblas/src/extra/cblas_zgemm_batch.c b/frame/compat/cblas/src/extra/cblas_zgemm_batch.c
new file mode 100644
index 000000000..2d188a9f0
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_zgemm_batch.c
@@ -0,0 +1,168 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ *
+ * cblas_zgemm_batch.c
+ * This program is a C interface to zgemm_batch.
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ */
+
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_zgemm_batch(enum CBLAS_ORDER Order,             /* CblasColMajor or CblasRowMajor */
+                       enum CBLAS_TRANSPOSE *TransA_array, /* one entry per group */
+                       enum CBLAS_TRANSPOSE *TransB_array, /* one entry per group */
+                       f77_int *M_array, f77_int *N_array, /* one entry per group */
+                       f77_int *K_array, const void *alpha_array,
+                       const void  **A_array, f77_int *lda_array,
+                       const void  **B_array, f77_int *ldb_array,
+                       const void *beta_array,
+                       void **C_array, f77_int *ldc_array,
+                       f77_int group_count, f77_int *group_size)
+{   /* C interface to zgemm_batch: checks Order and the per-group Trans values, then calls F77_zgemm_batch. */
+    char TA[group_count], TB[group_count];
+#ifdef F77_CHAR
+    F77_CHAR F77_TA[group_count], F77_TB[group_count];
+#else
+    #define F77_TA TA
+    #define F77_TB TB
+#endif
+    /* NOTE(review): the variable-length arrays in this function require group_count > 0. */
+#ifdef F77_INT
+    F77_INT F77_GRP_COUNT = group_count;
+    F77_INT F77_M[F77_GRP_COUNT], F77_N[F77_GRP_COUNT], F77_K[F77_GRP_COUNT];
+    F77_INT F77_lda[F77_GRP_COUNT], F77_ldb[F77_GRP_COUNT], F77_ldc[F77_GRP_COUNT];
+    F77_INT F77_GRP_SIZE[F77_GRP_COUNT];
+#else
+    #define F77_GRP_COUNT group_count
+    #define F77_M M_array
+    #define F77_N N_array
+    #define F77_K K_array
+    #define F77_lda lda_array
+    #define F77_ldb ldb_array
+    #define F77_ldc ldc_array
+    #define F77_GRP_SIZE group_size
+#endif
+
+    extern int CBLAS_CallFromC;
+    extern int RowMajorStrg;
+    RowMajorStrg = 0;
+    CBLAS_CallFromC = 1;
+
+    dim_t i; /* group index, shared by both storage-order branches */
+    if( Order == CblasColMajor )
+    {
+        for(i = 0; i < group_count; i++)
+        {
+            if(TransA_array[i] == CblasTrans) TA[i]='T';
+            else if ( TransA_array[i] == CblasConjTrans ) TA[i]='C';
+            else if ( TransA_array[i] == CblasNoTrans )   TA[i]='N';
+            else
+            {
+                cblas_xerbla(2, "cblas_zgemm_batch", /* i is cast because dim_t may be wider than the int that %d expects */
+                       "Illegal TransA setting %d for group %d\n", TransA_array[i], (int)i);
+                CBLAS_CallFromC = 0;
+                RowMajorStrg = 0;
+                return;
+            }
+
+            if(TransB_array[i] == CblasTrans) TB[i]='T';
+            else if ( TransB_array[i] == CblasConjTrans ) TB[i]='C';
+            else if ( TransB_array[i] == CblasNoTrans )   TB[i]='N';
+            else
+            {
+                cblas_xerbla(3, "cblas_zgemm_batch",
+                       "Illegal TransB setting %d for group %d\n", TransB_array[i], (int)i);
+                CBLAS_CallFromC = 0;
+                RowMajorStrg = 0;
+                return;
+            }
+
+#ifdef F77_CHAR
+            F77_TA[i] = C2F_CHAR(TA+i);
+            F77_TB[i] = C2F_CHAR(TB+i);
+#endif
+
+#ifdef F77_INT
+            F77_M[i] = M_array[i];
+            F77_N[i] = N_array[i];
+            F77_K[i] = K_array[i];
+            F77_lda[i] = lda_array[i];
+            F77_ldb[i] = ldb_array[i];
+            F77_ldc[i] = ldc_array[i];
+            F77_GRP_SIZE[i] = group_size[i];
+#endif
+        }
+
+        F77_zgemm_batch(F77_TA, F77_TB,
+                        F77_M, F77_N, F77_K,
+                        (const dcomplex*)alpha_array,
+                        (const dcomplex**)A_array, F77_lda,
+                        (const dcomplex**)B_array, F77_ldb,
+                        (const dcomplex*)beta_array,
+                        (dcomplex**)C_array, F77_ldc,
+                        &F77_GRP_COUNT, F77_GRP_SIZE);
+    }
+    else if (Order == CblasRowMajor)
+    {
+        RowMajorStrg = 1;
+        /* Row-major input goes to the same routine with the operands exchanged:
+           TransA fills TB, TransB fills TA, and M/N and A/B swap places in the call. */
+        for(i = 0; i < group_count; i++)
+        {
+            if(TransA_array[i] == CblasTrans) TB[i]='T';
+            else if ( TransA_array[i] == CblasConjTrans ) TB[i]='C';
+            else if ( TransA_array[i] == CblasNoTrans )   TB[i]='N';
+            else
+            {
+                cblas_xerbla(2, "cblas_zgemm_batch",
+                       "Illegal TransA setting %d for group %d\n", TransA_array[i], (int)i);
+                CBLAS_CallFromC = 0;
+                RowMajorStrg = 0;
+                return;
+            }
+            if(TransB_array[i] == CblasTrans) TA[i]='T';
+            else if ( TransB_array[i] == CblasConjTrans ) TA[i]='C';
+            else if ( TransB_array[i] == CblasNoTrans )   TA[i]='N';
+            else
+            {
+                cblas_xerbla(3, "cblas_zgemm_batch", /* TransB_array is argument 3, as in the column-major branch */
+                       "Illegal TransB setting %d for group %d\n", TransB_array[i], (int)i);
+                CBLAS_CallFromC = 0;
+                RowMajorStrg = 0;
+                return;
+            }
+
+#ifdef F77_CHAR
+            F77_TA[i] = C2F_CHAR(TA+i);
+            F77_TB[i] = C2F_CHAR(TB+i);
+#endif
+
+#ifdef F77_INT
+            F77_M[i] = M_array[i];
+            F77_N[i] = N_array[i];
+            F77_K[i] = K_array[i];
+            F77_lda[i] = lda_array[i];
+            F77_ldb[i] = ldb_array[i];
+            F77_ldc[i] = ldc_array[i];
+            F77_GRP_SIZE[i] = group_size[i];
+#endif
+        }
+
+        F77_zgemm_batch(F77_TA, F77_TB,
+                        F77_N, F77_M, F77_K,
+                        (const dcomplex*)alpha_array,
+                        (const dcomplex**)B_array, F77_ldb,
+                        (const dcomplex**)A_array, F77_lda,
+                        (const dcomplex*)beta_array,
+                        (dcomplex**)C_array, F77_ldc,
+                        &F77_GRP_COUNT, F77_GRP_SIZE);
+   } else
+     cblas_xerbla(1, "cblas_zgemm_batch",
+                     "Illegal Order setting, %d\n", Order);
+   CBLAS_CallFromC = 0;
+   RowMajorStrg = 0;
+}
+#endif
diff --git a/frame/compat/extra/bla_axpby.c b/frame/compat/extra/bla_axpby.c
new file mode 100644
index 000000000..d96d75d74
--- /dev/null
+++ b/frame/compat/extra/bla_axpby.c
@@ -0,0 +1,89 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+   
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+// Define BLAS-to-BLIS interfaces. GENTFUNC emits a function named by
+// PASTEF77(ch,blasname) that converts its BLAS-style arguments and then
+// calls the routine named by PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF).
+#undef  GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname ) \
+\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype*   alpha, \
+       const ftype*   x, const f77_int* incx, \
+       const ftype*   beta, \
+             ftype*   y, const f77_int* incy  \
+     ) \
+{ \
+	dim_t  n0;    /* n after conversion by bli_convert_blas_dim1 */ \
+	ftype* x0;    /* x pointer produced by bli_convert_blas_incv */ \
+	ftype* y0;    /* y pointer produced by bli_convert_blas_incv */ \
+	inc_t  incx0; /* x increment produced by bli_convert_blas_incv */ \
+	inc_t  incy0; /* y increment produced by bli_convert_blas_incv */ \
+\
+	/* Initialize BLIS. */ \
+	bli_init_auto(); \
+\
+	/* Convert/typecast negative values of n to zero. */ \
+	bli_convert_blas_dim1( *n, n0 ); \
+\
+	/* If the input increments are negative, adjust the pointers so we can
+	   use positive increments instead. The cast on x drops const because
+	   x0 is declared as a non-const ftype*. */ \
+	bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
+	bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
+\
+	/* Call BLIS interface with the converted length, pointers and increments. */ \
+	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	( \
+	  BLIS_NO_CONJUGATE, /* presumably: use x without conjugation -- confirm */ \
+	  n0, \
+	  (ftype*)alpha, \
+	  x0, incx0, \
+	  (ftype*)beta,  \
+	  y0, incy0, \
+	  NULL, /* NOTE(review): presumably the optional context argument -- confirm */ \
+	  NULL  /* NOTE(review): presumably the optional runtime argument -- confirm */ \
+	); \
+\
+	/* Finalize BLIS. */ \
+	bli_finalize_auto(); \
+}
+
+#ifdef BLIS_ENABLE_BLAS
+INSERT_GENTFUNC_BLAS( axpby, axpbyv ) // presumably expands GENTFUNC once per BLAS datatype -- confirm
+#endif // BLIS_ENABLE_BLAS
diff --git a/frame/compat/extra/bla_axpby.h b/frame/compat/extra/bla_axpby.h
new file mode 100644
index 000000000..ab2952be9
--- /dev/null
+++ b/frame/compat/extra/bla_axpby.h
@@ -0,0 +1,54 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+//
+// Prototype BLAS-to-BLIS interfaces.
+//
+#undef  GENTPROT
+#define GENTPROT( ftype, ch, blasname ) \
+/* Exported prototype for the wrapper named by PASTEF77(ch,blasname). */ \
+BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype*   alpha, \
+       const ftype*   x, const f77_int* incx, \
+       const ftype*   beta, \
+             ftype*   y, const f77_int* incy /* y is the only non-const operand */ \
+     );
+// The prototype is emitted only when BLIS_ENABLE_BLAS is defined.
+#ifdef BLIS_ENABLE_BLAS
+INSERT_GENTPROT_BLAS( axpby )
+#endif
+
diff --git a/frame/compat/extra/bla_gemm_batch.c b/frame/compat/extra/bla_gemm_batch.c
new file mode 100644
index 000000000..be84572a3
--- /dev/null
+++ b/frame/compat/extra/bla_gemm_batch.c
@@ -0,0 +1,254 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+//
+// Define BLAS-to-BLIS interfaces.
+//
+
+#ifdef BLIS_BLAS3_CALLS_TAPI
+
+#undef  GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname ) \
+/* Typed-API variant (BLIS_BLAS3_CALLS_TAPI defined): one PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) call per matrix. */ \
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa_array, \
+       const f77_char* transb_array, \
+       const f77_int*  m_array, \
+       const f77_int*  n_array, \
+       const f77_int*  k_array, \
+       const ftype*    alpha_array, \
+       const ftype**   a_array, const f77_int* lda_array, \
+       const ftype**   b_array, const f77_int* ldb_array, \
+       const ftype*    beta_array, \
+             ftype**   c_array, const f77_int* ldc_array, \
+       const f77_int*  group_count, \
+       const f77_int*  group_size \
+     ) \
+{ \
+	trans_t blis_transa; \
+	trans_t blis_transb; \
+	dim_t   m0, n0, k0; \
+	inc_t   rs_a, cs_a; \
+	inc_t   rs_b, cs_b; \
+	inc_t   rs_c, cs_c; \
+\
+	/* Initialize BLIS. */ \
+	bli_init_auto(); \
+\
+	/* Perform BLAS parameter checking on every group before any computation. */ \
+	for ( f77_int gi = 0; gi < *group_count; gi++ ) \
+	{ \
+		PASTEBLACHK(blisname) \
+		( \
+		  MKSTR(ch), \
+		  MKSTR(blisname), \
+		  transa_array+gi, \
+		  transb_array+gi, \
+		  m_array+gi, \
+		  n_array+gi, \
+		  k_array+gi, \
+		  lda_array+gi, \
+		  ldb_array+gi, \
+		  ldc_array+gi \
+		); \
+	} \
+	/* idx indexes a_array/b_array/c_array and runs across all groups without being reset. */ \
+	f77_int idx = 0; \
+\
+	for ( f77_int i = 0; i < *group_count; i++ ) \
+	{ \
+		/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
+		bli_param_map_netlib_to_blis_trans( transa_array[i], &blis_transa ); \
+		bli_param_map_netlib_to_blis_trans( transb_array[i], &blis_transb ); \
+\
+		/* Typecast BLAS integers to BLIS integers. */ \
+		bli_convert_blas_dim1( m_array[i], m0 ); \
+		bli_convert_blas_dim1( n_array[i], n0 ); \
+		bli_convert_blas_dim1( k_array[i], k0 ); \
+\
+		/* Set the row and column strides of the matrix operands (unit row stride; the leading dimension is the column stride). */ \
+		rs_a = 1; \
+		cs_a = lda_array[i]; \
+		rs_b = 1; \
+		cs_b = ldb_array[i]; \
+		rs_c = 1; \
+		cs_c = ldc_array[i]; \
+		/* Every matrix in group i uses that group's trans values, dimensions, strides and scalars. */ \
+		for ( f77_int j = 0; j < group_size[i]; j++ ) \
+		{ \
+			/* Call BLIS interface. */ \
+			PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+			( \
+			  blis_transa, \
+			  blis_transb, \
+			  m0, \
+			  n0, \
+			  k0, \
+			  (ftype*)(alpha_array + i), \
+			  (ftype*)*(a_array + idx), rs_a, cs_a, \
+			  (ftype*)*(b_array + idx), rs_b, cs_b, \
+			  (ftype*)(beta_array + i), \
+			  (ftype*)*(c_array + idx), rs_c, cs_c, \
+			  NULL, \
+			  NULL  \
+			); \
+\
+			idx++; \
+		} \
+	} \
+	/* Finalize BLIS. */ \
+	bli_finalize_auto(); \
+}
+
+#else
+
+#undef  GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname ) \
+/* Object-API variant (BLIS_BLAS3_CALLS_TAPI not defined): one PASTEMAC(blisname,BLIS_OAPI_EX_SUF) call per matrix. */ \
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa_array, \
+       const f77_char* transb_array, \
+       const f77_int*  m_array, \
+       const f77_int*  n_array, \
+       const f77_int*  k_array, \
+       const ftype*    alpha_array, \
+       const ftype**   a_array, const f77_int* lda_array, \
+       const ftype**   b_array, const f77_int* ldb_array, \
+       const ftype*    beta_array, \
+             ftype**   c_array, const f77_int* ldc_array, \
+       const f77_int* group_count, \
+       const f77_int* group_size ) \
+{ \
+	trans_t blis_transa; \
+	trans_t blis_transb; \
+	dim_t   m0, n0, k0; \
+\
+	/* Initialize BLIS. */ \
+	bli_init_auto(); \
+\
+	/* Perform BLAS parameter checking on every group before any computation. */ \
+	for ( f77_int gi = 0; gi < *group_count; gi++ ) \
+	{ \
+		PASTEBLACHK(blisname) \
+		( \
+		  MKSTR(ch), \
+		  MKSTR(blisname), \
+		  transa_array+gi, \
+		  transb_array+gi, \
+		  m_array+gi, \
+		  n_array+gi, \
+		  k_array+gi, \
+		  lda_array+gi, \
+		  ldb_array+gi, \
+		  ldc_array+gi \
+		); \
+	} \
+\
+	const num_t dt     = PASTEMAC(ch,type); \
+	/* idx indexes a_array/b_array/c_array and runs across all groups without being reset. */ \
+	f77_int idx = 0, i, j; \
+\
+	for ( i = 0; i < *group_count; i++ ) \
+	{ \
+		/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
+		bli_param_map_netlib_to_blis_trans( transa_array[i], &blis_transa ); \
+		bli_param_map_netlib_to_blis_trans( transb_array[i], &blis_transb ); \
+\
+		/* Typecast BLAS integers to BLIS integers. */ \
+		bli_convert_blas_dim1( m_array[i], m0 ); \
+		bli_convert_blas_dim1( n_array[i], n0 ); \
+		bli_convert_blas_dim1( k_array[i], k0 ); \
+\
+		/* Set the row and column strides of the matrix operands (unit row stride; the leading dimension is the column stride). */ \
+		const inc_t rs_a = 1; \
+		const inc_t cs_a = lda_array[i]; \
+		const inc_t rs_b = 1; \
+		const inc_t cs_b = ldb_array[i]; \
+		const inc_t rs_c = 1; \
+		const inc_t cs_c = ldc_array[i]; \
+		/* alpha and beta are wrapped once per group as 1x1 objects and shared by the group's matrices. */ \
+		obj_t       alphao = BLIS_OBJECT_INITIALIZER_1X1; \
+		obj_t       betao  = BLIS_OBJECT_INITIALIZER_1X1; \
+\
+		dim_t       m0_a, n0_a; \
+		dim_t       m0_b, n0_b; \
+		/* Dimensions used to wrap A and B; NOTE(review): presumably swapped when the operand is transposed - confirm. */ \
+		bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \
+		bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); \
+\
+		bli_obj_init_finish_1x1( dt, (ftype*)(alpha_array + i), &alphao ); \
+		bli_obj_init_finish_1x1( dt, (ftype*)(beta_array  + i),  &betao ); \
+\
+		for( j = 0; j < group_size[i]; j++ ) \
+		{ \
+			obj_t       ao     = BLIS_OBJECT_INITIALIZER; \
+			obj_t       bo     = BLIS_OBJECT_INITIALIZER; \
+			obj_t       co     = BLIS_OBJECT_INITIALIZER; \
+			/* Wrap the idx-th A, B and C buffers in objects and attach the group's trans values to A and B. */ \
+			bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)*(a_array + idx), rs_a, cs_a, &ao ); \
+			bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)*(b_array + idx), rs_b, cs_b, &bo ); \
+			bli_obj_init_finish( dt, m0,   n0,   (ftype*)*(c_array + idx), rs_c, cs_c, &co ); \
+			bli_obj_set_conjtrans( blis_transa, &ao ); \
+			bli_obj_set_conjtrans( blis_transb, &bo ); \
+\
+			PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
+			( \
+			  &alphao, \
+			  &ao, \
+			  &bo, \
+			  &betao, \
+			  &co, \
+			  NULL, \
+			  NULL  \
+			); \
+\
+			idx++; \
+		} \
+	} \
+\
+	/* Finalize BLIS. */  \
+	bli_finalize_auto(); \
+}
+
+#endif
+// Whichever variant was selected above is instantiated only when BLIS_ENABLE_BLAS is defined.
+#ifdef BLIS_ENABLE_BLAS
+INSERT_GENTFUNC_BLAS( gemm_batch, gemm )
+#endif
+
diff --git a/frame/compat/extra/bla_gemm_batch.h b/frame/compat/extra/bla_gemm_batch.h
new file mode 100644
index 000000000..f997f4b8e
--- /dev/null
+++ b/frame/compat/extra/bla_gemm_batch.h
@@ -0,0 +1,61 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc. 
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+//
+// Prototype BLAS-to-BLIS interfaces.
+//
+#undef  GENTPROT
+#define GENTPROT( ftype, ch, blasname ) \
+/* Exported prototype for the wrapper named by PASTEF77(ch,blasname). */ \
+BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa_array, \
+       const f77_char* transb_array, \
+       const f77_int*  m_array, \
+       const f77_int*  n_array, \
+       const f77_int*  k_array, \
+       const ftype*    alpha_array, \
+       const ftype**   a_array, const f77_int* lda_array, \
+       const ftype**   b_array, const f77_int* ldb_array, \
+       const ftype*    beta_array, \
+             ftype**   c_array, const f77_int* ldc_array, /* c_array is the only non-const operand */ \
+       const f77_int*  group_count, \
+       const f77_int*  group_size \
+     );
+// The prototype is emitted only when BLIS_ENABLE_BLAS is defined.
+#ifdef BLIS_ENABLE_BLAS
+INSERT_GENTPROT_BLAS( gemm_batch )
+#endif
+
diff --git a/test/Makefile b/test/Makefile
index bbd817f2d..ae998ccde 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -1,11 +1,11 @@
 #
 #
-#  BLIS    
+#  BLIS
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
-#  Copyright (C) 2017 - 2020, Advanced Micro Devices, Inc.
+#  Copyright (C) 2017 - 2020, Advanced Micro Devices, Inc. All rights reserved.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -143,9 +143,9 @@ CFLAGS         += -I$(TEST_SRC_PATH)
 #
 
 # Define the operations we will test.
-TEST_OPS := dotv axpyv \
+TEST_OPS := dotv axpyv axpbyv \
             gemv ger hemv her her2 trmv trsv \
-            gemm hemm herk her2k trmm trsm
+            gemm gemm_batch hemm herk her2k trmm trsm
 
 # Optionally test gemmt, which some libraries might not implement.
 ifeq ($(BUILD_GEMMT),yes)
diff --git a/test/test_axpbyv.c b/test/test_axpbyv.c
new file mode 100644
index 000000000..28be2542c
--- /dev/null
+++ b/test/test_axpbyv.c
@@ -0,0 +1,293 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef WIN32
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+#include "blis.h"
+
+//#define PRINT
+#ifdef BLIS_ENABLE_CBLAS
+//#define CHECK_CBLAS
+#endif
+
+#ifdef CHECK_CBLAS
+#include "cblas.h"
+#endif
+
+/*
+ * BLIS interface API will be called by default.
+ * To call the BLAS API, modify line 159 ('#ifdef BLIS') to '#if 0'.
+ * To call the CBLAS API, modify line 159 to '#if 0' and define the
+ * macro 'CHECK_CBLAS' in line 44.
+ *
+ * Sample prototype for the BLAS interface API is as follows:
+ *                n    alpha     x      incx   beta       y        incy
+ * void daxpby_( int*, double*, double*, int*, double*, double*,   int* );
+ */
+
+int main( int argc, char** argv )
+{
+    obj_t x, y;
+    obj_t y_save;
+    obj_t alpha, beta;
+    dim_t n;
+    dim_t p;
+    dim_t p_begin, p_end, p_inc;
+    int   n_input;
+    num_t dt_x, dt_y;
+    num_t dt_alpha, dt_beta;
+    int   r, n_repeats;
+    num_t dt;
+
+    double dtime;
+    double dtime_save;
+    double gflops;
+    // Benchmark driver: times axpbyv over a sweep of vector lengths and prints one result row per length.
+    bli_init();
+    // Each vector length is timed n_repeats times.
+    n_repeats = 3;
+    // Sweep bounds for p; a negative n_input gives n = p * |n_input|, a non-negative one fixes n.
+#ifndef PRINT
+    p_begin = 40;
+    p_end   = 4000;
+    p_inc   = 40;
+
+    n_input = -1;
+#else
+    p_begin = 16;
+    p_end   = 16;
+    p_inc   = 1;
+
+    n_input = 15;
+#endif
+    // Select the datatype under test by editing the branches below.
+#if 1 
+    dt = BLIS_FLOAT;
+    //dt = BLIS_DOUBLE;
+#else
+    //dt = BLIS_SCOMPLEX;
+    dt = BLIS_DCOMPLEX;
+#endif
+
+    // All operands and both scalars use the same datatype.
+    dt_x = dt_y = dt_alpha = dt_beta = dt;
+
+    // Begin with initializing the last entry to zero so that
+    // matlab allocates space for the entire array once up-front.
+    for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; // empty body: only advances p to the last size of the sweep
+#ifdef BLIS
+    printf( "data_axpbyv_blis" );
+#else
+    printf( "data_axpbyv_%s", BLAS );
+#endif
+    printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
+            ( unsigned long )(p - p_begin)/p_inc + 1,
+            ( unsigned long )0, 0.0 );
+    // The sweep runs from the largest size down to the smallest; the ascending loop is kept for reference.
+    //for ( p = p_begin; p <= p_end; p += p_inc )
+    for ( p = p_end; p_begin <= p; p -= p_inc )
+    {
+
+        if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+        else               n =     ( dim_t )    n_input;
+
+        bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
+        bli_obj_create( dt_beta,  1, 1, 0, 0, &beta  );
+
+        bli_obj_create( dt_x, n, 1, 0, 0, &x );
+        bli_obj_create( dt_y, n, 1, 0, 0, &y );
+        bli_obj_create( dt_y, n, 1, 0, 0, &y_save );
+        // Fill x and y via bli_randm and give alpha and beta fixed values.
+        bli_randm( &x );
+        bli_randm( &y );
+
+        bli_setsc(  (0.9/1.0), 0.2, &alpha );
+        bli_setsc( -(1.1/1.0), 0.3, &beta  );
+        // Keep the initial y so that every repeat starts from the same data.
+        bli_copym( &y, &y_save );
+
+        dtime_save = 1.0e9;
+
+        for ( r = 0; r < n_repeats; ++r )
+        {
+            bli_copym( &y_save, &y );
+
+            dtime = bli_clock();
+
+#ifdef PRINT
+            bli_printm( "alpha", &alpha, "%4.1f", "" );
+            bli_printm( "beta" , &beta,  "%4.1f", "" );
+
+            bli_printm( "x", &x, "%4.1f", "" );
+            bli_printm( "y", &y, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+            // BLIS defined: call the BLIS object API directly.
+            bli_axpbyv( &alpha,
+                        &x,
+                        &beta,
+                        &y );
+#else // BLIS not defined: call the BLAS-style (or, with CHECK_CBLAS, the CBLAS) routine matching dt
+            if ( bli_is_float( dt ) )
+            {
+                f77_int nn     = bli_obj_length( &x );
+                f77_int incx   = bli_obj_vector_inc( &x );
+                f77_int incy   = bli_obj_vector_inc( &y );
+                float   alphap = *(( float * )bli_obj_buffer( &alpha ));
+                float   betap  = *(( float * )bli_obj_buffer( &beta  ));
+                float*  xp     = bli_obj_buffer( &x );
+                float*  yp     = bli_obj_buffer( &y );
+#ifdef CHECK_CBLAS
+                cblas_saxpby( nn,
+                              alphap,
+                              xp, incx,
+                              betap,
+                              yp, incy );
+#else
+                saxpby_( &nn,
+                         &alphap,
+                         xp, &incx,
+                         &betap,
+                         yp, &incy );
+
+#endif
+            }
+            else if ( bli_is_double( dt ) )
+            {
+
+                f77_int  nn     = bli_obj_length( &x );
+                f77_int  incx   = bli_obj_vector_inc( &x );
+                f77_int  incy   = bli_obj_vector_inc( &y );
+                double   alphap = *(( double * )bli_obj_buffer( &alpha ));
+                double   betap  = *(( double * )bli_obj_buffer( &beta  ));
+                double*  xp     = bli_obj_buffer( &x );
+                double*  yp     = bli_obj_buffer( &y );
+#ifdef CHECK_CBLAS
+                cblas_daxpby( nn,
+                              alphap,
+                              xp, incx,
+                              betap,
+                              yp, incy );
+#else
+                daxpby_( &nn,
+                         &alphap,
+                         xp, &incx,
+                         &betap,
+                         yp, &incy );
+#endif
+            }
+            else if ( bli_is_scomplex( dt ) )
+            {
+                f77_int  nn     = bli_obj_length( &x );
+                f77_int  incx   = bli_obj_vector_inc( &x );
+                f77_int  incy   = bli_obj_vector_inc( &y );
+                void*    alphap = bli_obj_buffer( &alpha ); // complex scalars are passed by pointer in both interfaces
+                void*    betap  = bli_obj_buffer( &beta  );
+                void*    xp     = bli_obj_buffer( &x );
+                void*    yp     = bli_obj_buffer( &y );
+#ifdef CHECK_CBLAS
+                cblas_caxpby( nn,
+                              alphap,
+                              xp, incx,
+                              betap,
+                              yp, incy );
+#else
+                caxpby_( &nn,
+                         ( scomplex* )alphap,
+                         ( scomplex* )xp, &incx,
+                         ( scomplex* )betap,
+                         ( scomplex* )yp, &incy );
+#endif
+            }
+            else if ( bli_is_dcomplex( dt ))
+            {
+                f77_int  nn     = bli_obj_length( &x );
+                f77_int  incx   = bli_obj_vector_inc( &x );
+                f77_int  incy   = bli_obj_vector_inc( &y );
+                void*    alphap = bli_obj_buffer( &alpha ); // complex scalars are passed by pointer in both interfaces
+                void*    betap  = bli_obj_buffer( &beta  );
+                void*    xp     = bli_obj_buffer( &x );
+                void*    yp     = bli_obj_buffer( &y );
+#ifdef CHECK_CBLAS
+                cblas_zaxpby( nn,
+                              alphap,
+                              xp, incx,
+                              betap,
+                              yp, incy );
+#else
+                zaxpby_( &nn,
+                         ( dcomplex* )alphap,
+                         ( dcomplex* )xp, &incx,
+                         ( dcomplex* )betap,
+                         ( dcomplex* )yp, &incy );
+#endif
+            }
+#endif // BLIS
+
+#ifdef PRINT
+            bli_printm( "y after", &y, "%4.1f", "" );
+            exit(1); // PRINT mode stops after the first run
+#endif
+
+            // Fold this repeat's timing into dtime_save, the figure reported below.
+            dtime_save = bli_clock_min_diff( dtime_save, dtime );
+        }
+        // The rate assumes 3*n operations per call and is scaled by 1e9.
+        gflops = ( 3.0  * n ) / ( dtime_save * 1.0e9 );
+
+#ifdef BLIS
+        printf( "data_axpbyv_blis" );
+#else
+        printf( "data_axpbyv_%s", BLAS );
+#endif
+        printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
+                ( unsigned long )(p - p_begin)/p_inc + 1,
+                ( unsigned long )n, gflops );
+        // Release this size's objects before moving to the next size.
+        bli_obj_free( &alpha );
+        bli_obj_free( &beta  );
+
+        bli_obj_free( &x );
+        bli_obj_free( &y );
+        bli_obj_free( &y_save );
+    }
+
+    bli_finalize();
+
+    return 0;
+}
diff --git a/test/test_gemm_batch.c b/test/test_gemm_batch.c
new file mode 100644
index 000000000..5660e4150
--- /dev/null
+++ b/test/test_gemm_batch.c
@@ -0,0 +1,584 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef WIN32
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+#include "blis.h"
+
+//#define CHECK_CBLAS
+#ifdef CHECK_CBLAS
+#include "cblas.h"
+#endif
+
+/* Format for FILE input
+ * For each input set, the first line contains 'storage scheme'
+ *  and 'group count' separated by a space.
+ * The following 'group count' lines each contain all the parameters of
+ * one group, separated by spaces, in the following order:
+ *  tA tB m n k lda ldb ldc alpha_r alpha_i beta_r beta_i group_size
+ *
+ * Example:
+ * c 2
+ * n n 4 8 4 4 4 4 1.1 0.0 0.9 0.0 2
+ * n n 3 3 6 3 6 3 1.0 0.0 2.0 0.0 2
+ *
+ */
+
+//#define FILE_IN_OUT
+#ifndef FILE_IN_OUT
+#define GRP_COUNT 2
+#endif
+
+//#define PRINT
+
+int main( int argc, char** argv )
+{
+    num_t dt;
+
+    char stor_scheme;
+    dim_t i, j, idx;
+    dim_t r, n_repeats;
+
+    double dtime;
+    double dtime_save;
+    double gflops;
+
+    dim_t total_count = 0;
+
+#if 1
+    dt = BLIS_FLOAT;
+    //dt = BLIS_DOUBLE;
+#else
+    dt = BLIS_SCOMPLEX;
+    //dt = BLIS_DCOMPLEX;
+#endif
+
+    n_repeats = 1;
+
+#ifdef FILE_IN_OUT
+    FILE* fin = NULL;
+    FILE* fout = NULL;
+
+    if(argc < 3)
+    {
+        printf("Usage: ./test_gemm_batch_XX.x input.csv output.csv\n");
+        exit(1);
+    }
+
+    fin = fopen(argv[1], "r");
+    if( fin == NULL )
+    {
+        printf("Error opening input file %s \n", argv[1]);
+        exit(1);
+    }
+
+    fout = fopen(argv[2], "w");
+    if(fout == NULL)
+    {
+        printf("Error opening output file %s\n",argv[2]);
+        exit(1);
+    }
+
+    dim_t GRP_COUNT;
+
+    fprintf(fout, "m\t n\t k\t lda\t ldb\t ldc\t transa\t transb\t grp_size\n");
+
+    while(fscanf(fin, "%c %ld\n", &stor_scheme, &GRP_COUNT) == 2)
+    {
+        char transa[GRP_COUNT];
+        char transb[GRP_COUNT];
+
+        dim_t m[GRP_COUNT];
+        dim_t n[GRP_COUNT];
+        dim_t k[GRP_COUNT];
+
+        dim_t lda[GRP_COUNT];
+        dim_t ldb[GRP_COUNT];
+        dim_t ldc[GRP_COUNT];
+
+        double alpha_real[GRP_COUNT];
+        double alpha_imag[GRP_COUNT];
+        double beta_real[GRP_COUNT];
+        double beta_imag[GRP_COUNT];
+
+        dim_t group_size[GRP_COUNT];
+        obj_t alpha[GRP_COUNT], beta[GRP_COUNT];
+
+        total_count = 0;
+        for(i = 0; i < GRP_COUNT; i++)
+        {
+            fscanf(fin, "%c %c %ld %ld %ld %ld %ld %ld %lf %lf %lf %lf %ld\n", &transa[i], &transb[i], &m[i], &n[i], &k[i], &lda[i], &ldb[i], &ldc[i], &alpha_real[i], &alpha_imag[i], &beta_real[i], &beta_imag[i], &group_size[i]);
+
+            total_count += group_size[i];
+        }
+#else
+        printf("m\t n\t k\t lda\t ldb\t ldc\t transa\t transb\t grp_size\n");
+
+        stor_scheme = 'c';
+
+        dim_t m[GRP_COUNT] = {4, 3};
+        dim_t n[GRP_COUNT] = {8, 3};
+        dim_t k[GRP_COUNT] = {4, 6};
+
+        dim_t lda[GRP_COUNT] = {4, 3};
+        dim_t ldb[GRP_COUNT] = {4, 6};
+        dim_t ldc[GRP_COUNT] = {4, 3};
+
+        char transa[GRP_COUNT] = {'N', 'N'};
+        char transb[GRP_COUNT] = {'N', 'N'};
+
+        double alpha_real[GRP_COUNT] = {1.1, 1.0};
+        double alpha_imag[GRP_COUNT] = {0.0, 0.0};
+
+        double beta_real[GRP_COUNT] = {0.9, 2.0};
+        double beta_imag[GRP_COUNT] = {0.0, 0.0};
+
+        dim_t group_size[GRP_COUNT] = {2,2};
+
+        obj_t alpha[GRP_COUNT], beta[GRP_COUNT];
+
+        total_count = 0;
+        for(i = 0; i < GRP_COUNT; i++)
+            total_count += group_size[i];
+
+#endif
+        obj_t a[total_count], b[total_count];
+        obj_t c[total_count], c_save[total_count];
+        f77_int f77_m[GRP_COUNT], f77_n[GRP_COUNT], f77_k[GRP_COUNT];
+        f77_int f77_lda[GRP_COUNT], f77_ldb[GRP_COUNT], f77_ldc[GRP_COUNT];
+        f77_int f77_group_size[GRP_COUNT];
+        f77_int f77_group_count = GRP_COUNT;
+#ifdef CHECK_CBLAS
+        enum CBLAS_ORDER cblas_order;
+        enum CBLAS_TRANSPOSE cblas_transa[GRP_COUNT];
+        enum CBLAS_TRANSPOSE cblas_transb[GRP_COUNT];
+
+        if(stor_scheme == 'R' || stor_scheme == 'r')
+            cblas_order = CblasRowMajor;
+        else
+            cblas_order = CblasColMajor;
+
+#else
+        f77_char f77_transa[GRP_COUNT];
+        f77_char f77_transb[GRP_COUNT];
+
+        if(stor_scheme == 'r' || stor_scheme == 'R' )
+        {
+            printf("BLAS Interface doesn't support row-major order\n");
+#ifdef FILE_IN_OUT
+            continue;
+#else
+            exit(1);
+#endif
+        }
+#endif
+
+        idx = 0;
+        for(i = 0; i < GRP_COUNT; i++)
+        {
+            bli_obj_create(dt, 1, 1, 0, 0, &alpha[i]);
+            bli_obj_create(dt, 1, 1, 0, 0, &beta[i] );
+
+            bli_setsc(alpha_real[i], alpha_imag[i], &alpha[i]);
+            bli_setsc(beta_real[i],  beta_imag[i],  &beta[i] );
+
+            trans_t blis_transa, blis_transb;
+            if(transa[i] == 't' || transa[i] == 'T')
+                blis_transa = BLIS_TRANSPOSE;
+            else if (transa[i] == 'c' || transa[i] == 'C')
+                blis_transa = BLIS_CONJ_TRANSPOSE;
+            else if ( transa[i] == 'n' || transa[i] == 'N')
+                    blis_transa = BLIS_NO_TRANSPOSE;
+            else
+            {
+                printf("Illegal transA setting %c for group %ld\n", transa[i], i);
+                exit(1);
+            }
+
+            if(transb[i] == 't' || transb[i] == 'T')
+                blis_transb = BLIS_TRANSPOSE;
+            else if (transb[i] == 'c' || transb[i] == 'C')
+                blis_transb = BLIS_CONJ_TRANSPOSE;
+            else if (transb[i] == 'n' || transb[i] == 'N')
+                blis_transb = BLIS_NO_TRANSPOSE;
+            else
+            {
+                printf("Illegal transB setting %c for group %ld\n", transb[i], i);
+                exit(1);
+            }
+#ifdef CHECK_CBLAS
+            if(bli_is_trans( blis_transa ))
+                cblas_transa[i] = CblasTrans;
+            else if (bli_is_conjtrans( blis_transa ))
+                cblas_transa[i] = CblasConjTrans;
+            else
+                cblas_transa[i] = CblasNoTrans;
+
+            if(bli_is_trans( blis_transb ))
+                cblas_transb[i] = CblasTrans;
+            else if (bli_is_conjtrans( blis_transb ))
+                cblas_transb[i] = CblasConjTrans;
+            else
+                cblas_transb[i] = CblasNoTrans;
+#else
+            bli_param_map_blis_to_netlib_trans( blis_transa, &f77_transa[i]);
+            bli_param_map_blis_to_netlib_trans( blis_transb, &f77_transb[i]);
+
+#endif
+            dim_t m0_a, n0_a;
+            dim_t m0_b, n0_b;
+            bli_set_dims_with_trans( blis_transa, m[i], k[i], &m0_a, &n0_a );
+            bli_set_dims_with_trans( blis_transb, k[i], n[i], &m0_b, &n0_b );
+            if(stor_scheme == 'C' || stor_scheme == 'c')
+            {
+                for(j = 0; j < group_size[i]; j++)
+                {
+                    bli_obj_create(dt, m0_a, n0_a, 1, lda[i], &a[idx]);
+                    bli_obj_create(dt, m0_b, n0_b, 1, ldb[i], &b[idx]);
+                    bli_obj_create(dt, m[i], n[i], 1, ldc[i], &c[idx]);
+                    bli_obj_create(dt, m[i], n[i], 1, ldc[i], &c_save[idx]);
+
+                    bli_randm( &a[idx] );
+                    bli_randm( &b[idx] );
+                    bli_randm( &c[idx] );
+
+                    bli_obj_set_conjtrans(blis_transa, &a[idx]);
+                    bli_obj_set_conjtrans(blis_transb, &b[idx]);
+                    idx++;
+                }
+            }
+            else if(stor_scheme == 'R' || stor_scheme == 'r')
+            {
+                for(j = 0; j < group_size[i]; j++)
+                {
+                    bli_obj_create(dt, m0_a, n0_a, lda[i], 1, &a[idx]);
+                    bli_obj_create(dt, m0_b, n0_b, ldb[i], 1, &b[idx]);
+                    bli_obj_create(dt, m[i], n[i], ldc[i], 1, &c[idx]);
+                    bli_obj_create(dt, m[i], n[i], ldc[i], 1, &c_save[idx]);
+
+                    bli_randm( &a[idx] );
+                    bli_randm( &b[idx] );
+                    bli_randm( &c[idx] );
+
+                    bli_obj_set_conjtrans(blis_transa, &a[idx]);
+                    bli_obj_set_conjtrans(blis_transb, &b[idx]);
+                    idx++;
+                }
+            }
+            f77_m[i] = m[i];
+            f77_n[i] = n[i];
+            f77_k[i] = k[i];
+            f77_lda[i] = lda[i];
+            f77_ldb[i] = ldb[i];
+            f77_ldc[i] = ldc[i];
+            f77_group_size[i] = group_size[i];
+
+        }
+
+        idx = 0;
+         for(i = 0; i < GRP_COUNT; i++)
+            for(j = 0; j < group_size[i]; j++)
+            {
+                bli_copym(&c[idx], &c_save[idx]);
+                idx++;
+            }
+
+        dtime_save = DBL_MAX;
+
+        for( r = 0; r < n_repeats; ++r )
+        {
+            idx = 0;
+            for(i = 0; i < GRP_COUNT; i++)
+                for(j = 0; j < group_size[i]; j++)
+                {
+                    bli_copym( &c_save[idx], &c[idx]);
+                    idx++;
+                }
+
+            dtime = bli_clock();
+
+#ifdef PRINT
+        idx = 0;
+        for(i = 0; i < GRP_COUNT; i++)
+            for(j = 0; j < group_size[i]; j++)
+                {
+                    printf("Group: %ld Member: %ld\n", i, j);
+
+                    bli_printm("a", &a[idx], "%4.1f", "");
+                    bli_printm("b", &b[idx], "%4.1f", "");
+                    bli_printm("c", &c[idx], "%4.1f", "");
+
+                    idx++;
+                }
+#endif
+
+        if(bli_is_float(dt))
+        {
+            const float *ap[total_count], *bp[total_count];
+            float *cp[total_count];
+            float alphap[GRP_COUNT], betap[GRP_COUNT];
+
+            idx = 0;
+            for(i = 0; i < GRP_COUNT; i++)
+            {
+                for(j = 0; j < group_size[i]; j++)
+                {
+                    ap[idx] = bli_obj_buffer( &a[idx] );
+                    bp[idx] = bli_obj_buffer( &b[idx] );
+                    cp[idx] = bli_obj_buffer( &c[idx] );
+
+                    idx++;
+                }
+                alphap[i] = *(float*)bli_obj_buffer_for_1x1(dt, &alpha[i]);
+                betap[i]  = *(float*)bli_obj_buffer_for_1x1(dt, &beta[i] );
+            }
+
+#ifdef CHECK_CBLAS
+            cblas_sgemm_batch( cblas_order,
+                   cblas_transa,
+                   cblas_transb,
+                   f77_m, f77_n, f77_k,
+                   alphap, ap, f77_lda,
+                   bp, f77_ldb,
+                   betap, cp, f77_ldc,
+                   f77_group_count,
+                   f77_group_size
+                );
+#else
+            sgemm_batch_( f77_transa,
+                  f77_transb,
+                  f77_m, f77_n, f77_k,
+                  alphap, ap, f77_lda,
+                  bp, f77_ldb,
+                  betap, cp, f77_ldc,
+                  &f77_group_count,
+                  f77_group_size
+                );
+#endif
+
+        }
+        else if(bli_is_double(dt))
+        {
+            const double *ap[total_count], *bp[total_count];
+            double *cp[total_count];
+            double alphap[GRP_COUNT], betap[GRP_COUNT];
+
+            idx = 0;
+            for(i = 0; i < GRP_COUNT; i++)
+            {
+                for(j = 0; j < group_size[i]; j++)
+                {
+                    ap[idx] = bli_obj_buffer( &a[idx] );
+                    bp[idx] = bli_obj_buffer( &b[idx] );
+                    cp[idx] = bli_obj_buffer( &c[idx] );
+
+                    idx++;
+                }
+                alphap[i] = *(double*)bli_obj_buffer_for_1x1(dt, &alpha[i]);
+                betap[i]  = *(double*)bli_obj_buffer_for_1x1(dt, &beta[i] );
+            }
+#ifdef CHECK_CBLAS
+            cblas_dgemm_batch( cblas_order,
+                   cblas_transa,
+                   cblas_transb,
+                   f77_m, f77_n, f77_k,
+                   alphap, ap, f77_lda,
+                   bp, f77_ldb,
+                   betap, cp, f77_ldc,
+                   f77_group_count,
+                   f77_group_size
+                );
+#else
+            dgemm_batch_( f77_transa,
+                  f77_transb,
+                  f77_m, f77_n, f77_k,
+                  alphap, ap, f77_lda,
+                  bp, f77_ldb,
+                  betap, cp, f77_ldc,
+                  &f77_group_count,
+                  f77_group_size
+                );
+#endif
+
+        }
+        else if(bli_is_scomplex(dt))
+        {
+            const scomplex *ap[total_count], *bp[total_count];
+            scomplex *cp[total_count];
+            scomplex alphap[GRP_COUNT], betap[GRP_COUNT];
+
+            idx = 0;
+            for(i = 0; i < GRP_COUNT; i++)
+            {
+                for(j = 0; j < group_size[i]; j++)
+                {
+                    ap[idx] = bli_obj_buffer( &a[idx] );
+                    bp[idx] = bli_obj_buffer( &b[idx] );
+                    cp[idx] = bli_obj_buffer( &c[idx] );
+
+                    idx++;
+                }
+                alphap[i] = *(scomplex*)bli_obj_buffer_for_1x1(dt, &alpha[i]);
+                betap[i]  = *(scomplex*)bli_obj_buffer_for_1x1(dt, &beta[i] );
+            }
+#ifdef CHECK_CBLAS
+            cblas_cgemm_batch( cblas_order,
+                   cblas_transa,
+                   cblas_transb,
+                   f77_m, f77_n, f77_k,
+                   (const void*)alphap,
+                   (const void**)ap, f77_lda,
+                   (const void**)bp, f77_ldb,
+                   (const void*)betap, (void**)cp, f77_ldc,
+                   f77_group_count,
+                   f77_group_size
+                );
+#else
+            cgemm_batch_( f77_transa,
+                  f77_transb,
+                  f77_m, f77_n, f77_k,
+                  alphap, ap, f77_lda,
+                  bp, f77_ldb,
+                  betap, cp, f77_ldc,
+                  &f77_group_count,
+                  f77_group_size
+                );
+#endif
+        }
+        else if(bli_is_dcomplex(dt))
+        {
+            const dcomplex *ap[total_count], *bp[total_count];
+            dcomplex *cp[total_count];
+            dcomplex alphap[GRP_COUNT], betap[GRP_COUNT];
+
+            idx = 0;
+            for(i = 0; i < GRP_COUNT; i++)
+            {
+                for(j = 0; j < group_size[i]; j++)
+                {
+                    ap[idx] = bli_obj_buffer( &a[idx] );
+                    bp[idx] = bli_obj_buffer( &b[idx] );
+                    cp[idx] = bli_obj_buffer( &c[idx] );
+
+                    idx++;
+                }
+                alphap[i] = *(dcomplex*)bli_obj_buffer_for_1x1(dt, &alpha[i]);
+                betap[i]  = *(dcomplex*)bli_obj_buffer_for_1x1(dt, &beta[i] );
+            }
+
+#ifdef CHECK_CBLAS
+            cblas_zgemm_batch( cblas_order,
+                   cblas_transa,
+                   cblas_transb,
+                   f77_m, f77_n, f77_k,
+                   (const void*)alphap,
+                   (const void**)ap, f77_lda,
+                   (const void**)bp, f77_ldb,
+                   (const void*)betap, (void**)cp, f77_ldc,
+                   f77_group_count,
+                   f77_group_size
+                );
+#else
+            zgemm_batch_( f77_transa,
+                  f77_transb,
+                  f77_m, f77_n, f77_k,
+                  alphap, ap, f77_lda,
+                  bp, f77_ldb,
+                  betap, cp, f77_ldc,
+                  &f77_group_count,
+                  f77_group_size
+                );
+#endif
+        }
+#ifdef PRINT
+        idx = 0;
+        for(i = 0; i < GRP_COUNT; i++)
+            for(j = 0; j < group_size[i]; j++)
+            {
+                printf("Group: %ld Member: %ld\n", i, j);
+                bli_printm("c after", &c[idx], "%4.1f", "");
+
+                idx++;
+            }
+#endif
+            dtime_save = bli_clock_min_diff( dtime_save, dtime );
+        }
+
+        dim_t fp_ops = 0;
+        for(i = 0; i < GRP_COUNT; i++)
+                fp_ops += 2.0 * m[i] * k[i] * n[i] * group_size[i];
+
+        gflops = fp_ops / (dtime_save * 1.0e9 );
+
+        if(bli_is_complex( dt ) ) gflops *= 4.0;
+
+#ifdef FILE_IN_OUT
+        fprintf(fout, "Stor_scheme = %c, group_count = %lu, gflops = %7.2f\n", stor_scheme, GRP_COUNT, gflops);
+        for(i = 0; i < GRP_COUNT; i++)
+            fprintf(fout, "%4lu \t %4lu\t %4lu\t %4lu\t %4lu\t %4lu\t %c\t %c\t %4lu\n", m[i], n[i], k[i], lda[i], ldb[i], ldc[i], transa[i], transb[i], group_size[i]);
+
+        fflush(fout);
+#else
+        printf( "Stor_scheme = %c, group_count = %d, gflops = %7.2f\n", stor_scheme, GRP_COUNT, gflops);
+        for(i = 0; i < GRP_COUNT; i++)
+            printf("%4lu \t %4lu\t %4lu\t %4lu\t %4lu\t %4lu\t %c\t %c\t %4lu\n", m[i], n[i], k[i], lda[i], ldb[i], ldc[i], transa[i], transb[i], group_size[i]);
+
+#endif
+
+    idx = 0;
+    for(i = 0; i < GRP_COUNT; i++)
+    {
+        bli_obj_free( &alpha[i]);
+        bli_obj_free( &beta[i] );
+
+        for(j = 0; j < group_size[i]; j++ )
+        {
+            bli_obj_free( &a[idx]);
+            bli_obj_free( &b[idx]);
+            bli_obj_free( &c[idx]);
+            bli_obj_free( &c_save[idx]);
+
+            idx++;
+        }
+    }
+#ifdef FILE_IN_OUT
+    }
+    fclose(fin);
+    fclose(fout);
+#endif
+    return 0;
+}
+

From 7bde468c6f7ecc4b5322d2ade1ae9c0b88e6b9f3 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Sat, 13 Nov 2021 16:39:37 -0600
Subject: [PATCH 003/230] Added support for addons.

Details:
- Implemented a new feature called addons, which are similar to
  sandboxes except that there is no requirement to define gemm or any
  other particular operation.
- Updated configure to accept --enable-addon=<name> or -a <name> syntax
  for requesting an addon be included within a BLIS build. configure now
  outputs the list of enabled addons into config.mk. It also outputs the
  corresponding #include directives for the addons' headers to a new
  companion to the bli_config.h header file named bli_addon.h. Because
  addons may wish to make use of existing BLIS types within their own
  definitions, the addons' headers must be included sometime after that
  of bli_config.h (which currently is #included before bli_type_defs.h).
  This is why the #include directives needed to go into a new top-level
  header file rather than the existing bli_config.h file.
- Added a markdown document, docs/Addons.md, to explain addons, how to
  build with them, and what assumptions their authors should keep in
  mind as they create them.
- Added a gemmlike-like implementation of sandwich gemm called 'gemmd'
  as an addon in addon/gemmd. The code uses a 'bao_' prefix for local
  functions, including the user-level object and typed APIs.
- Updated .gitignore so that git ignores bli_addon.h files.
---
 .gitignore                                 |   1 +
 Makefile                                   |  48 ++
 addon/gemmd/attic/bli_gemm_ex.c            |  88 +++
 addon/gemmd/bao_gemmd.c                    | 305 +++++++++++
 addon/gemmd/bao_gemmd.h                    | 105 ++++
 addon/gemmd/bao_gemmd_bp_var1.c            | 530 ++++++++++++++++++
 addon/gemmd/bao_gemmd_bp_var2.c            | 602 +++++++++++++++++++++
 addon/gemmd/bao_gemmd_check.c              | 131 +++++
 addon/gemmd/bao_gemmd_check.h              |  50 ++
 addon/gemmd/bao_gemmd_var.h                | 126 +++++
 addon/gemmd/bao_l3_packm_a.c               | 330 +++++++++++
 addon/gemmd/bao_l3_packm_a.h               | 123 +++++
 addon/gemmd/bao_l3_packm_b.c               | 330 +++++++++++
 addon/gemmd/bao_l3_packm_b.h               | 123 +++++
 addon/gemmd/bao_l3_packm_var.h             |  69 +++
 addon/gemmd/bao_l3_packm_var1.c            | 195 +++++++
 addon/gemmd/bao_l3_packm_var2.c            | 245 +++++++++
 addon/gemmd/bao_packm_cxk.c                | 199 +++++++
 addon/gemmd/bao_packm_cxk.h                |  59 ++
 addon/gemmd/gemmd.h                        |  54 ++
 addon/gemmd/thread/bao_l3_decor.h          |  75 +++
 addon/gemmd/thread/bao_l3_decor_openmp.c   | 140 +++++
 addon/gemmd/thread/bao_l3_decor_openmp.h   |  44 ++
 addon/gemmd/thread/bao_l3_decor_pthreads.c | 220 ++++++++
 addon/gemmd/thread/bao_l3_decor_pthreads.h |  47 ++
 addon/gemmd/thread/bao_l3_decor_single.c   | 143 +++++
 addon/gemmd/thread/bao_l3_decor_single.h   |  44 ++
 build/bli_addon.h.in                       |  47 ++
 build/config.mk.in                         |   4 +
 common.mk                                  | 121 ++++-
 configure                                  | 152 +++++-
 docs/Addons.md                             | 231 ++++++++
 frame/include/bli_config_macro_defs.h      |   5 +-
 frame/include/blis.h                       |   8 +
 34 files changed, 4961 insertions(+), 33 deletions(-)
 create mode 100644 addon/gemmd/attic/bli_gemm_ex.c
 create mode 100644 addon/gemmd/bao_gemmd.c
 create mode 100644 addon/gemmd/bao_gemmd.h
 create mode 100644 addon/gemmd/bao_gemmd_bp_var1.c
 create mode 100644 addon/gemmd/bao_gemmd_bp_var2.c
 create mode 100644 addon/gemmd/bao_gemmd_check.c
 create mode 100644 addon/gemmd/bao_gemmd_check.h
 create mode 100644 addon/gemmd/bao_gemmd_var.h
 create mode 100644 addon/gemmd/bao_l3_packm_a.c
 create mode 100644 addon/gemmd/bao_l3_packm_a.h
 create mode 100644 addon/gemmd/bao_l3_packm_b.c
 create mode 100644 addon/gemmd/bao_l3_packm_b.h
 create mode 100644 addon/gemmd/bao_l3_packm_var.h
 create mode 100644 addon/gemmd/bao_l3_packm_var1.c
 create mode 100644 addon/gemmd/bao_l3_packm_var2.c
 create mode 100644 addon/gemmd/bao_packm_cxk.c
 create mode 100644 addon/gemmd/bao_packm_cxk.h
 create mode 100644 addon/gemmd/gemmd.h
 create mode 100644 addon/gemmd/thread/bao_l3_decor.h
 create mode 100644 addon/gemmd/thread/bao_l3_decor_openmp.c
 create mode 100644 addon/gemmd/thread/bao_l3_decor_openmp.h
 create mode 100644 addon/gemmd/thread/bao_l3_decor_pthreads.c
 create mode 100644 addon/gemmd/thread/bao_l3_decor_pthreads.h
 create mode 100644 addon/gemmd/thread/bao_l3_decor_single.c
 create mode 100644 addon/gemmd/thread/bao_l3_decor_single.h
 create mode 100644 build/bli_addon.h.in
 create mode 100644 docs/Addons.md

diff --git a/.gitignore b/.gitignore
index 49b22c2b8..a24fe2b0e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,6 +31,7 @@
 
 config.mk
 bli_config.h
+bli_addon.h
 
 # -- monolithic headers --
 
diff --git a/Makefile b/Makefile
index b5e036744..992983328 100644
--- a/Makefile
+++ b/Makefile
@@ -114,6 +114,7 @@ BASE_OBJ_CONFIG_PATH   := $(BASE_OBJ_PATH)/$(CONFIG_DIR)
 BASE_OBJ_FRAME_PATH    := $(BASE_OBJ_PATH)/$(FRAME_DIR)
 BASE_OBJ_REFKERN_PATH  := $(BASE_OBJ_PATH)/$(REFKERN_DIR)
 BASE_OBJ_KERNELS_PATH  := $(BASE_OBJ_PATH)/$(KERNELS_DIR)
+BASE_OBJ_ADDON_PATH    := $(BASE_OBJ_PATH)/$(ADDON_DIR)
 BASE_OBJ_SANDBOX_PATH  := $(BASE_OBJ_PATH)/$(SANDBOX_DIR)
 
 # --- Define install target names for static libraries ---
@@ -210,6 +211,10 @@ MK_REFKERN_OBJS     := $(foreach arch, $(CONFIG_LIST), \
 # Generate object file paths for all of the portable framework source code.
 MK_FRAME_OBJS       := $(call gen-obj-paths-from-src,$(FRAME_SRC_SUFS),$(MK_FRAME_SRC),$(FRAME_PATH),$(BASE_OBJ_FRAME_PATH))
 
+# Generate object file paths for the addon source code. If no addons were
+# enabled at configure-time, this variable will be empty.
+MK_ADDON_OBJS       := $(call gen-obj-paths-from-src,$(ADDON_SRC_SUFS),$(MK_ADDON_SRC),$(ADDON_PATH),$(BASE_OBJ_ADDON_PATH))
+
 # Generate object file paths for the sandbox source code. If a sandbox was not
 # enabled a configure-time, this variable will we empty.
 MK_SANDBOX_OBJS     := $(call gen-obj-paths-from-src,$(SANDBOX_SRC_SUFS),$(MK_SANDBOX_SRC),$(SANDBOX_PATH),$(BASE_OBJ_SANDBOX_PATH))
@@ -219,6 +224,7 @@ MK_BLIS_OBJS        := $(MK_CONFIG_OBJS) \
                        $(MK_KERNELS_OBJS) \
                        $(MK_REFKERN_OBJS) \
                        $(MK_FRAME_OBJS) \
+                       $(MK_ADDON_OBJS) \
                        $(MK_SANDBOX_OBJS)
 
 # Optionally filter out the BLAS and CBLAS compatibility layer object files.
@@ -549,6 +555,28 @@ else
 endif
 endef
 
+# first argument: a configuration name from the union of config_list and
+# config_name, used to look up the CFLAGS; second argument: a source suffix.
+define make-c99-addon-rule
+$(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(BLIS_H_FLAT) $(ADDON_H99_FILES) $(MAKE_DEFS_MK_PATHS)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CC) $(call get-addon-c99flags-for,$(1)) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-addon-c99text-for,$(1))
+	@$(CC) $(call get-addon-c99flags-for,$(1)) -c $$< -o $$@
+endif
+endef
+# C++ counterpart of make-c99-addon-rule: same two arguments, built with CXX.
+define make-cxx-addon-rule
+$(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(BLIS_H_FLAT) $(ADDON_HXX_FILES) $(MAKE_DEFS_MK_PATHS)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CXX) $(call get-addon-cxxflags-for,$(1)) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-addon-cxxtext-for,$(1))
+	@$(CXX) $(call get-addon-cxxflags-for,$(1)) -c $$< -o $$@
+endif
+endef
+
 # first argument: a configuration name from the union of config_list and
 # config_name, used to look up the CFLAGS to use during compilation.
 define make-c99-sandbox-rule
@@ -601,6 +629,16 @@ $(foreach conf, $(CONFIG_LIST), $(eval $(call make-refkern-rule,$(conf))))
 $(foreach suf, $(KERNELS_SRC_SUFS), \
 $(foreach kset, $(KERNEL_LIST), $(eval $(call make-kernels-rule,$(kset),$(call get-config-for-kset,$(kset)),$(suf)))))
 
+# Instantiate the build rule for C addon files. Use the CFLAGS for the
+# configuration family.
+$(foreach suf, $(ADDON_C99_SUFS), \
+$(foreach conf, $(CONFIG_NAME), $(eval $(call make-c99-addon-rule,$(conf),$(suf)))))
+
+# Instantiate the build rule for C++ addon files. Use the CFLAGS for the
+# configuration family.
+$(foreach suf, $(ADDON_CXX_SUFS), \
+$(foreach conf, $(CONFIG_NAME), $(eval $(call make-cxx-addon-rule,$(conf),$(suf)))))
+
 # Instantiate the build rule for C sandbox files. Use the CFLAGS for the
 # configuration family.
 $(foreach suf, $(SANDBOX_C99_SUFS), \
@@ -1078,6 +1116,9 @@ ifeq ($(ENABLE_VERBOSE),yes)
 	- $(FIND) $(FRAME_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
 	- $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
 	- $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
+ifneq ($(ADDON_LIST),)
+	- $(FIND) $(ADDON_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
+endif
 ifneq ($(SANDBOX),)
 	- $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
 endif
@@ -1090,6 +1131,10 @@ else
 	@- $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
 	@echo "Removing makefile fragments from $(KERNELS_FRAG_PATH)"
 	@- $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
+ifneq ($(ADDON_LIST),)
+	@echo "Removing makefile fragments from $(ADDON_FRAG_PATH)"
+	@- $(FIND) $(ADDON_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
+endif
 ifneq ($(SANDBOX),)
 	@echo "Removing makefile fragments from $(SANDBOX_FRAG_PATH)"
 	@- $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
@@ -1210,6 +1255,7 @@ endif # IS_CONFIGURED
 distclean: cleanmk cleanh cleanlib cleantest
 ifeq ($(IS_CONFIGURED),yes)
 ifeq ($(ENABLE_VERBOSE),yes)
+	- $(RM_F) $(BLIS_ADDON_H)
 	- $(RM_F) $(BLIS_CONFIG_H)
 	- $(RM_F) $(CONFIG_MK_FILE)
 	- $(RM_F) $(PC_OUT_FILE)
@@ -1217,6 +1263,8 @@ ifeq ($(ENABLE_VERBOSE),yes)
 	- $(RM_RF) $(LIB_DIR)
 	- $(RM_RF) $(INCLUDE_DIR)
 else
+	@echo "Removing $(BLIS_ADDON_H)"
+	@$(RM_F) $(BLIS_ADDON_H)
 	@echo "Removing $(BLIS_CONFIG_H)"
 	@$(RM_F) $(BLIS_CONFIG_H)
 	@echo "Removing $(CONFIG_MK_FILE)"
diff --git a/addon/gemmd/attic/bli_gemm_ex.c b/addon/gemmd/attic/bli_gemm_ex.c
new file mode 100644
index 000000000..0f40d1cb3
--- /dev/null
+++ b/addon/gemmd/attic/bli_gemm_ex.c
@@ -0,0 +1,88 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_gemm_ex
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	bli_init_once();
+
+	// A switch to easily toggle whether we use the addon implementation
+	// of bao_gemmd() as the implementation for bli_gemm(). (This allows for
+	// easy testing of bao_gemmd() via the testsuite.)
+	if ( 1 ) // hard-wired on: the gemmd path below is always taken
+	{
+		const dim_t k  = bli_obj_width_after_trans( a ); // width of A with its trans applied
+		const num_t dt = bli_obj_dt( c ); // d is created with C's datatype
+		obj_t       d;
+
+		bli_obj_create( dt, k, 1, 1, k, &d ); // k x 1 vector: row stride 1, column stride k
+		bli_setv( &BLIS_ONE, &d ); // NOTE(review): presumably all-ones d makes gemmd act as plain gemm -- confirm
+		//bli_randv( &d );
+
+		bao_gemmd_ex( alpha, a, &d, b, beta, c, cntx, rntm );
+
+		bli_obj_free( &d ); // d was allocated above, so release it before returning
+		return; // everything below is unreachable while the switch above is 1
+	}
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_l;
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+
+	// Obtain a valid (native) context from the gks if necessary.
+	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_gemm_check( alpha, a, b, beta, c, cntx );
+
+	// Invoke the operation's front end.
+	bli_gemm_front
+	(
+	  alpha, a, b, beta, c, cntx, rntm, NULL
+	);
+}
+
diff --git a/addon/gemmd/bao_gemmd.c b/addon/gemmd/bao_gemmd.c
new file mode 100644
index 000000000..71d49806b
--- /dev/null
+++ b/addon/gemmd/bao_gemmd.c
@@ -0,0 +1,305 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+//
+// -- Define the gemmd operation's object API ----------------------------------
+//
+
+void bao_gemmd
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  d,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c
+     )
+{
+	bao_gemmd_ex // basic API: forward to the expert API with default cntx/rntm
+	(
+	  alpha,
+	  a,
+	  d,
+	  b,
+	  beta,
+	  c,
+	  NULL, // cntx: NULL makes bao_gemmd_ex() query the gks for a context
+	  NULL  // rntm: NULL makes bao_gemmd_ex() initialize one from global settings
+	);
+}
+
+void bao_gemmd_ex
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  d,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	bli_init_once();
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_l;
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+
+	// Obtain a valid (native) context from the gks if necessary.
+	// NOTE: This must be done before calling the _check() function, since
+	// that function assumes the context pointer is valid.
+	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+
+	// Check parameters.
+	if ( bli_error_checking_is_enabled() )
+		bao_gemmd_check( alpha, a, d, b, beta, c, cntx );
+
+	// -- bli_gemmd_front() ----------------------------------------------------
+
+	obj_t a_local;
+	obj_t b_local;
+	obj_t c_local;
+
+	// If C has a zero dimension, return early.
+	if ( bli_obj_has_zero_dim( c ) )
+	{
+		return; // nothing to compute; C is left untouched
+	}
+
+	// If alpha is zero, or if A or B has a zero dimension, scale C by beta
+	// and return early.
+	if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
+	     bli_obj_has_zero_dim( a ) ||
+	     bli_obj_has_zero_dim( b ) )
+	{
+		bli_scalm( beta, c );
+		return;
+	}
+
+	// Alias A, B, and C in case we need to apply transformations.
+	bli_obj_alias_to( a, &a_local ); // the transformations below touch only the aliases
+	bli_obj_alias_to( b, &b_local );
+	bli_obj_alias_to( c, &c_local );
+
+	// Induce a transposition of A if it has its transposition property set.
+	// Then clear the transposition bit in the object.
+	if ( bli_obj_has_trans( &a_local ) )
+	{
+		bli_obj_induce_trans( &a_local );
+		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
+	}
+
+	// Induce a transposition of B if it has its transposition property set.
+	// Then clear the transposition bit in the object.
+	if ( bli_obj_has_trans( &b_local ) )
+	{
+		bli_obj_induce_trans( &b_local );
+		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &b_local );
+	}
+
+	// An optimization: If C is stored by rows and the micro-kernel prefers
+	// contiguous columns, or if C is stored by columns and the micro-kernel
+	// prefers contiguous rows, transpose the entire operation to allow the
+	// micro-kernel to access elements of C in its preferred manner.
+	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
+	{
+		bli_obj_swap( &a_local, &b_local ); // operands trade places in the transposed operation
+
+		bli_obj_induce_trans( &a_local );
+		bli_obj_induce_trans( &b_local );
+		bli_obj_induce_trans( &c_local );
+	}
+
+	// Parse and interpret the contents of the rntm_t object to properly
+	// set the ways of parallelism for each loop, and then make any
+	// additional modifications necessary for the current operation.
+	bli_rntm_set_ways_for_op
+	(
+	  BLIS_GEMM,
+	  BLIS_LEFT, // ignored for gemm/hemm/symm
+	  bli_obj_length( &c_local ), // m, taken after any transposition induced above
+	  bli_obj_width( &c_local ), // n, taken after any transposition induced above
+	  bli_obj_width( &a_local ), // k, taken after any swap/transposition induced above
+	  rntm
+	);
+
+	// Spawn threads (if applicable), where bao_gemmd_int() is the thread entry
+	// point function for each thread. This also begins the process of creating
+	// the thrinfo_t tree, which contains thread communicators.
+	bao_l3_thread_decorator
+	(
+	  bao_gemmd_int,
+	  BLIS_GEMM, // operation family id
+	  alpha,
+	  &a_local,
+	  d, // forwarded unmodified: d is never aliased, transposed, or swapped above
+	  &b_local,
+	  beta,
+	  &c_local,
+	  cntx,
+	  rntm
+	);
+}
+
+//
+// -- Define the gemmd operation's thread entry point --------------------------
+//
+
+void bao_gemmd_int
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  d,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     )
+{
+	// In this function, we choose the gemmd implementation that is executed
+	// on each thread.
+
+#if 1 // compile-time switch: 1 selects bao_gemmd_bp_var1(), 0 selects bao_gemmd_bp_var2()
+	// Call the block-panel algorithm that calls the kernel directly, which
+	// exposes edge-case handling.
+	bao_gemmd_bp_var1
+	(
+	  alpha,
+	  a,
+	  d,
+	  b,
+	  beta,
+	  c,
+	  cntx,
+	  rntm,
+	  thread
+	);
+#else // currently disabled alternative; it takes the same argument list as var1
+	// Call the block-panel algorithm that calls the kernel indirectly via a
+	// wrapper function, which hides edge-case handling.
+	bao_gemmd_bp_var2
+	(
+	  alpha,
+	  a,
+	  d,
+	  b,
+	  beta,
+	  c,
+	  cntx,
+	  rntm,
+	  thread
+	);
+#endif
+}
+
+//
+// -- Define the gemmd operation's typed API -----------------------------------
+//
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+\
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       trans_t transa, \
+       trans_t transb, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       ctype*  alpha, \
+       ctype*  a, inc_t rs_a, inc_t cs_a, \
+       ctype*  d, inc_t incd, \
+       ctype*  b, inc_t rs_b, inc_t cs_b, \
+       ctype*  beta, \
+       ctype*  c, inc_t rs_c, inc_t cs_c  \
+     ) \
+{ \
+	bli_init_once(); \
+\
+	/* Determine the datatype (e.g. BLIS_FLOAT, BLIS_DOUBLE, etc.) based on
+	   the macro parameter 'ch' (e.g. s, d, etc). */ \
+	const num_t dt = PASTEMAC(ch,type); \
+\
+	obj_t       alphao, ao, dd, bo, betao, co; /* object wrappers around the caller's raw buffers */ \
+\
+	dim_t       m_a, n_a; \
+	dim_t       m_b, n_b; \
+\
+	/* Adjust the dimensions of matrices A and B according to the transa and
+	   transb parameters. */ \
+	bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
+	bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \
+\
+	/* Create bufferless scalar objects and attach the provided scalar pointers
+	   to those scalar objects. */ \
+	bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, beta,  &betao  ); \
+\
+	/* Create bufferless matrix objects and attach the provided matrix pointers
+	   to those matrix objects. */ \
+	bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
+	bli_obj_create_with_attached_buffer( dt, k,   1,   d, incd, k,    &dd ); /* d: k x 1 vector, row stride incd */ \
+	bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
+	bli_obj_create_with_attached_buffer( dt, m,   n,   c, rs_c, cs_c, &co ); \
+\
+	/* Set the transposition/conjugation properties of the objects for matrices
+	   A and B. */ \
+	bli_obj_set_conjtrans( transa, &ao ); \
+	bli_obj_set_conjtrans( transb, &bo ); \
+\
+	/* Call the object interface. */ \
+	PASTECH(bao_,opname) /* for opname gemmd this is bao_gemmd(), which takes no cntx/rntm */ \
+	( \
+	  &alphao, \
+	  &ao, \
+	  &dd, \
+	  &bo, \
+	  &betao, \
+	  &co  \
+	); \
+}
+
+//INSERT_GENTFUNC_BASIC0( gemmd )
+GENTFUNC( float,    s, gemmd ) // defines bao_sgemmd()
+GENTFUNC( double,   d, gemmd ) // defines bao_dgemmd()
+GENTFUNC( scomplex, c, gemmd ) // defines bao_cgemmd()
+GENTFUNC( dcomplex, z, gemmd ) // defines bao_zgemmd()
+
diff --git a/addon/gemmd/bao_gemmd.h b/addon/gemmd/bao_gemmd.h
new file mode 100644
index 000000000..7c7466494
--- /dev/null
+++ b/addon/gemmd/bao_gemmd.h
@@ -0,0 +1,105 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//
+// -- Prototype the gemmd operation's object API -------------------------------
+//
+
+BLIS_EXPORT_ADDON void bao_gemmd
+     (
+       obj_t*  alpha, // scalar; the typed API attaches a 1x1 object here
+       obj_t*  a,     // matrix A; trans/conj is carried on the object
+       obj_t*  d,     // vector d; the typed API wraps it as k x 1
+       obj_t*  b,     // matrix B; trans/conj is carried on the object
+       obj_t*  beta,  // scalar; the typed API attaches a 1x1 object here
+       obj_t*  c      // matrix C (m x n in the typed API)
+     );
+
+BLIS_EXPORT_ADDON void bao_gemmd_ex // bao_gemmd() operands plus cntx and rntm
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  d,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx, // context; a cntx_t supplies blocksizes and microkernels
+       rntm_t* rntm  // runtime; a rntm_t is used for thrinfo growth and packm
+     );
+
+//
+// -- Prototype the gemmd operation's thread entry point -----------------------
+//
+
+void bao_gemmd_int // not BLIS_EXPORT_ADDON, unlike the object and typed APIs
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  d,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread // presumably the calling thread's thrinfo_t -- TODO confirm
+     );
+
+//
+// -- Prototype the gemmd operation's typed API --------------------------------
+//
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Typed API prototype: operands are raw ctype buffers with explicit strides. */ \
+BLIS_EXPORT_ADDON void PASTECH2(bao_,ch,opname) \
+     ( \
+       trans_t transa, /* trans/conj setting recorded on the object for a */ \
+       trans_t transb, /* trans/conj setting recorded on the object for b */ \
+       dim_t   m, /* c is m x n */ \
+       dim_t   n, \
+       dim_t   k, /* d has k elements */ \
+       ctype*  alpha, \
+       ctype*  a, inc_t rs_a, inc_t cs_a, /* dims: m x k, adjusted by transa */ \
+       ctype*  d, inc_t incd, \
+       ctype*  b, inc_t rs_b, inc_t cs_b, /* dims: k x n, adjusted by transb */ \
+       ctype*  beta, \
+       ctype*  c, inc_t rs_c, inc_t cs_c  \
+     );
+
+// One prototype per datatype, written out in place of INSERT_GENTPROT_BASIC0.
+GENTPROT( float,    s, gemmd )
+GENTPROT( double,   d, gemmd )
+GENTPROT( scomplex, c, gemmd )
+GENTPROT( dcomplex, z, gemmd )
+
diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/gemmd/bao_gemmd_bp_var1.c
new file mode 100644
index 000000000..e042f1fd8
--- /dev/null
+++ b/addon/gemmd/bao_gemmd_bp_var1.c
@@ -0,0 +1,530 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemmd_fp
+// Signature of the typed bao_?gemmd_bp_var1() functions; buffers are void*.
+typedef void (*FUNCPTR_T)
+     (
+       conj_t           conja,
+       conj_t           conjb,
+       dim_t            m,
+       dim_t            n,
+       dim_t            k,
+       void*   restrict alpha,
+       void*   restrict a, inc_t rs_a, inc_t cs_a,
+       void*   restrict d, inc_t incd,
+       void*   restrict b, inc_t rs_b, inc_t cs_b,
+       void*   restrict beta,
+       void*   restrict c, inc_t rs_c, inc_t cs_c,
+       cntx_t* restrict cntx,
+       rntm_t* restrict rntm,
+       thrinfo_t* restrict thread
+     );
+
+//
+// -- gemmd-like block-panel algorithm (object interface) ----------------------
+//
+
+// Define the function pointer array ftypes, indexed by datatype (num_t), and
+// initialize it with the typed functions defined below, bao_?gemmd_bp_var1().
+static FUNCPTR_T GENARRAY_PREF(ftypes,bao_,gemmd_bp_var1);
+
+void bao_gemmd_bp_var1
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  d,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     )
+{
+	// Unpack the operand objects into raw buffers, strides, and flags, then
+	// hand them to the typed implementation selected by the datatype of C.
+	const num_t    c_dt      = bli_obj_dt( c );
+
+	// Conjugation flags recorded on A and B.
+	const conj_t   a_conj    = bli_obj_conj_status( a );
+	const conj_t   b_conj    = bli_obj_conj_status( b );
+
+	// m and n are taken from C; k is the width of A.
+	const dim_t    m_dim     = bli_obj_length( c );
+	const dim_t    n_dim     = bli_obj_width( c );
+	const dim_t    k_dim     = bli_obj_width( a );
+
+	// Buffer pointer and stride(s) of each operand, in the order A, d, B, C.
+	void* restrict a_buf     = bli_obj_buffer_at_off( a );
+	const inc_t    a_rs      = bli_obj_row_stride( a );
+	const inc_t    a_cs      = bli_obj_col_stride( a );
+	void* restrict d_buf     = bli_obj_buffer_at_off( d );
+	const inc_t    d_inc     = bli_obj_vector_inc( d );
+	void* restrict b_buf     = bli_obj_buffer_at_off( b );
+	const inc_t    b_rs      = bli_obj_row_stride( b );
+	const inc_t    b_cs      = bli_obj_col_stride( b );
+	void* restrict c_buf     = bli_obj_buffer_at_off( c );
+	const inc_t    c_rs      = bli_obj_row_stride( c );
+	const inc_t    c_cs      = bli_obj_col_stride( c );
+
+	// Scalar buffers for alpha and beta, requested in the datatype of C.
+	void* restrict alpha_buf = bli_obj_buffer_for_1x1( c_dt, alpha );
+	void* restrict beta_buf  = bli_obj_buffer_for_1x1( c_dt, beta );
+
+	// ftypes is indexed by datatype: pick the typed function for c_dt and call
+	// it with the unpacked operands, forwarding cntx, rntm, and thread as-is.
+	ftypes[c_dt]
+	(
+	  a_conj,
+	  b_conj,
+	  m_dim,
+	  n_dim,
+	  k_dim,
+	  alpha_buf,
+	  a_buf, a_rs, a_cs,
+	  d_buf, d_inc,
+	  b_buf, b_rs, b_cs,
+	  beta_buf,
+	  c_buf, c_rs, c_cs,
+	  cntx,
+	  rntm,
+	  thread
+	);
+}
+
+//
+// -- gemmd-like block-panel algorithm (typed interface) -----------------------
+//
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+/* Typed implementation; instantiated below for the s, d, c, and z datatypes. */ \
+void PASTECH2(bao_,ch,varname) \
+     ( \
+       conj_t           conja, \
+       conj_t           conjb, \
+       dim_t            m, \
+       dim_t            n, \
+       dim_t            k, \
+       void*   restrict alpha, \
+       void*   restrict a, inc_t rs_a, inc_t cs_a, \
+       void*   restrict d, inc_t incd, \
+       void*   restrict b, inc_t rs_b, inc_t cs_b, \
+       void*   restrict beta, \
+       void*   restrict c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
+       thrinfo_t* restrict thread  \
+     ) \
+{ \
+	const num_t dt = PASTEMAC(ch,type); \
+\
+	/* Query the context for various blocksizes. */ \
+	const dim_t NR  = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
+	const dim_t MR  = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
+	const dim_t NC  = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
+	const dim_t MC  = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
+	const dim_t KC  = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
+\
+	/* Query the context for the microkernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+               gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer follow the gemm microkernel's storage preference
+	   (col_pref), not the storage of C: ct is column-stored (rs_ct = 1) if
+	   the microkernel prefers columns, and row-stored (cs_ct = 1) otherwise. */ \
+	ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                / sizeof( ctype ) ] \
+	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const inc_t rs_ct   = ( col_pref ? 1 : NR ); \
+	const inc_t cs_ct   = ( col_pref ? MR : 1 ); \
+\
+	/* Compute partitioning step values for each matrix of each loop. */ \
+	const inc_t jcstep_c = cs_c; \
+	const inc_t jcstep_b = cs_b; \
+\
+	const inc_t pcstep_a = cs_a; \
+	const inc_t pcstep_d = incd; \
+	const inc_t pcstep_b = rs_b; \
+\
+	const inc_t icstep_c = rs_c; \
+	const inc_t icstep_a = rs_a; \
+\
+	const inc_t jrstep_c = cs_c * NR; \
+\
+	const inc_t irstep_c = rs_c * MR; \
+\
+	ctype* restrict a_00       = a; \
+	ctype* restrict d_00       = d; \
+	ctype* restrict b_00       = b; \
+	ctype* restrict c_00       = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+\
+	/* Make local copies of the scalars to prevent any unnecessary sharing of
+	   cache lines between the cores' caches. */ \
+	ctype           alpha_local = *alpha_cast; \
+	ctype           beta_local  = *beta_cast; \
+	ctype           one_local   = *PASTEMAC(ch,1); \
+	ctype           zero_local  = *PASTEMAC(ch,0); \
+\
+	auxinfo_t       aux; \
+\
+	/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
+	   needed for the matrix we will be packing (if any), but we do it
+	   unconditionally to be safe. */ \
+	mem_t mem_a = BLIS_MEM_INITIALIZER; \
+	mem_t mem_b = BLIS_MEM_INITIALIZER; \
+\
+	/* Define an array of bszid_t ids, which will act as our substitute for
+	   the cntl_t tree. */ \
+	bszid_t bszids[8] = { BLIS_NC,      /* 5th loop */ \
+	                      BLIS_KC,      /* 4th loop */ \
+	                      BLIS_NO_PART, /* pack B */ \
+	                      BLIS_MC,      /* 3rd loop */ \
+	                      BLIS_NO_PART, /* pack A */ \
+	                      BLIS_NR,      /* 2nd loop */ \
+	                      BLIS_MR,      /* 1st loop */ \
+	                      BLIS_KR };    /* microkernel loop */  \
+\
+	bszid_t* restrict bszids_jc = &bszids[0]; \
+	bszid_t* restrict bszids_pc = &bszids[1]; \
+	/*bszid_t* restrict bszids_pb = &bszids[2];*/ \
+	bszid_t* restrict bszids_ic = &bszids[3]; \
+	/*bszid_t* restrict bszids_pa = &bszids[4];*/ \
+	bszid_t* restrict bszids_jr = &bszids[5]; \
+	/*bszid_t* restrict bszids_ir = &bszids[6];*/ \
+\
+	thrinfo_t* restrict thread_jc = NULL; \
+	thrinfo_t* restrict thread_pc = NULL; \
+	thrinfo_t* restrict thread_pb = NULL; \
+	thrinfo_t* restrict thread_ic = NULL; \
+	thrinfo_t* restrict thread_pa = NULL; \
+	thrinfo_t* restrict thread_jr = NULL; \
+	thrinfo_t* restrict thread_ir = NULL; \
+\
+	/* Identify the current thrinfo_t node and then grow the tree. */ \
+	thread_jc = thread; \
+	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
+\
+	/* Compute the JC loop thread range for the current thread. */ \
+	dim_t jc_start, jc_end; \
+	bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
+	const dim_t n_local = jc_end - jc_start; \
+\
+	/* Compute number of primary and leftover components of the JC loop. */ \
+	/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
+	const dim_t jc_left =   n_local % NC; \
+\
+	/* Loop over the n dimension (NC columns of B and C at a time). */ \
+	for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
+	{ \
+		/* Calculate the thread's current JC block dimension. */ \
+		const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
+\
+		ctype* restrict b_jc = b_00 + jj * jcstep_b; \
+		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
+\
+		/* Identify the current thrinfo_t node and then grow the tree. */ \
+		thread_pc = bli_thrinfo_sub_node( thread_jc ); \
+		bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
+\
+		/* The PC loop is not partitioned: every thread covers all of k. */ \
+		const dim_t pc_start = 0, pc_end = k; \
+		const dim_t k_local = k; \
+\
+		/* Compute number of primary and leftover components of the PC loop. */ \
+		/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
+		const dim_t pc_left =   k_local % KC; \
+\
+		/* Loop over the k dimension (KC columns of A / rows of B at a time). */ \
+		for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
+		{ \
+			/* Calculate the thread's current PC block dimension. */ \
+			const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
+\
+			ctype* restrict a_pc = a_00 + pp * pcstep_a; \
+			ctype* restrict d_pc = d_00 + pp * pcstep_d; \
+			ctype* restrict b_pc = b_jc + pp * pcstep_b; \
+\
+			/* Only apply beta to the first iteration of the pc loop. */ \
+			ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
+\
+			ctype* b_use; \
+			inc_t  rs_b_use, cs_b_use, ps_b_use; \
+\
+			/* Identify the current thrinfo_t node. Note that the thrinfo_t
+			   node will have already been created by a previous call to
+			   bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
+			   cause the tree to grow by two (i.e. to the next bszid that is
+			   a normal bszid_t value). */ \
+			thread_pb = bli_thrinfo_sub_node( thread_pc ); \
+			/*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \
+\
+			/* Determine the packing buffer and related parameters for matrix
+			   B. Then call the packm implementation. */ \
+			PASTECH2(bao_,ch,packm_b) \
+			( \
+			  conjb, \
+			  KC,     NC, \
+			  kc_cur, nc_cur, NR, \
+			  &one_local, \
+			  d_pc,   incd, \
+			  b_pc,   rs_b,      cs_b, \
+			  &b_use, &rs_b_use, &cs_b_use, \
+			                     &ps_b_use, \
+			  cntx, \
+			  rntm, \
+			  &mem_b, \
+			  thread_pb  \
+			); \
+\
+			/* Alias b_use so that it's clear this is our current block of
+			   matrix B. */ \
+			ctype* restrict b_pc_use = b_use; \
+\
+			/* Identify the current thrinfo_t node and then grow the tree. */ \
+			thread_ic = bli_thrinfo_sub_node( thread_pb ); \
+			bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
+\
+			/* Compute the IC loop thread range for the current thread. */ \
+			dim_t ic_start, ic_end; \
+			bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
+			const dim_t m_local = ic_end - ic_start; \
+\
+			/* Compute number of primary and leftover components of the IC loop. */ \
+			/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
+			const dim_t ic_left =   m_local % MC; \
+\
+			/* Loop over the m dimension (MC rows at a time). */ \
+			for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
+			{ \
+				/* Calculate the thread's current IC block dimension. */ \
+				const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
+\
+				ctype* restrict a_ic = a_pc + ii * icstep_a; \
+				ctype* restrict c_ic = c_jc + ii * icstep_c; \
+\
+				ctype* a_use; \
+				inc_t  rs_a_use, cs_a_use, ps_a_use; \
+\
+				/* Identify the current thrinfo_t node. Note that the thrinfo_t
+				   node will have already been created by a previous call to
+				   bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
+				   cause the tree to grow by two (i.e. to the next bszid that is
+				   a normal bszid_t value). */ \
+				thread_pa = bli_thrinfo_sub_node( thread_ic ); \
+				/*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \
+				/* NOTE(review): packm_b above also receives d_pc/incd; confirm both uses. */ \
+				/* Determine the packing buffer and related parameters for matrix
+				   A. Then call the packm implementation. */ \
+				PASTECH2(bao_,ch,packm_a) \
+				( \
+				  conja, \
+				  MC,     KC, \
+				  mc_cur, kc_cur, MR, \
+				  &one_local, \
+				  d_pc,   incd, \
+				  a_ic,   rs_a,      cs_a, \
+				  &a_use, &rs_a_use, &cs_a_use, \
+				                     &ps_a_use, \
+				  cntx, \
+				  rntm, \
+				  &mem_a, \
+				  thread_pa  \
+				); \
+\
+				/* Alias a_use so that it's clear this is our current block of
+				   matrix A. */ \
+				ctype* restrict a_ic_use = a_use; \
+\
+				/* Identify the current thrinfo_t node and then grow the tree. */ \
+				thread_jr = bli_thrinfo_sub_node( thread_pa ); \
+				bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
+\
+				/* Query the number of threads and thread ids for the JR loop.
+				   NOTE: These values are only needed when computing the next
+				   micropanel of B. */ \
+				const dim_t jr_nt  = bli_thread_n_way( thread_jr ); \
+				const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
+\
+				/* Compute number of primary and leftover components of the JR loop. */ \
+				dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
+				dim_t jr_left =   nc_cur % NR; \
+\
+				/* Compute the JR loop thread range for the current thread. */ \
+				dim_t jr_start, jr_end; \
+				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
+\
+				/* Loop over the n dimension (NR columns at a time). */ \
+				for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
+				{ \
+					const dim_t nr_cur \
+					= ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
+\
+					ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
+					ctype* restrict c_jr = c_ic     + j * jrstep_c; \
+\
+					/* Assume for now that the next panel of B is the current panel
+					   of B. */ \
+					ctype* restrict b2 = b_jr; \
+\
+					/* Identify the current thrinfo_t node. */ \
+					thread_ir = bli_thrinfo_sub_node( thread_jr ); \
+\
+					/* Query the number of threads and thread ids for the IR loop.
+					   NOTE: These values are only needed when computing the next
+					   micropanel of A. */ \
+					const dim_t ir_nt  = bli_thread_n_way( thread_ir ); \
+					const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
+\
+					/* Compute number of primary and leftover components of the IR loop. */ \
+					dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
+					dim_t ir_left =   mc_cur % MR; \
+\
+					/* Compute the IR loop thread range for the current thread. */ \
+					dim_t ir_start, ir_end; \
+					bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \
+\
+					/* Loop over the m dimension (MR rows at a time). */ \
+					for ( dim_t i = ir_start; i < ir_end; i += 1 ) \
+					{ \
+						const dim_t mr_cur \
+						= ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
+\
+						ctype* restrict a_ir = a_ic_use + i * ps_a_use; \
+						ctype* restrict c_ir = c_jr     + i * irstep_c; \
+\
+						ctype* restrict a2; \
+\
+						/* Compute the addresses of the next micropanels of A and B. */ \
+						a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \
+						if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
+						{ \
+							a2 = a_ic_use; \
+							b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \
+							if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
+								b2 = b_pc_use; \
+						} \
+\
+						/* Save the addresses of next micropanels of A and B to the
+						   auxinfo_t object. */ \
+						bli_auxinfo_set_next_a( a2, &aux ); \
+						bli_auxinfo_set_next_b( b2, &aux ); \
+\
+						/* Handle interior and edge cases separately. */ \
+						if ( mr_cur == MR && nr_cur == NR ) \
+						{ \
+							/* Full MR x NR tile: invoke the gemm microkernel on C. */ \
+							gemm_ukr \
+							( \
+							  kc_cur, \
+							  &alpha_local, \
+							  a_ir, \
+							  b_jr, \
+							  beta_use, \
+							  c_ir, rs_c, cs_c, \
+							  &aux, \
+							  cntx  \
+							); \
+						} \
+						else \
+						{ \
+							/* Edge tile: invoke the gemm microkernel on ct with beta = 0. */ \
+							gemm_ukr \
+							( \
+							  kc_cur, \
+							  &alpha_local, \
+							  a_ir, \
+							  b_jr, \
+							  &zero_local, \
+							  ct, rs_ct, cs_ct, \
+							  &aux, \
+							  cntx  \
+							); \
+\
+							/* Scale the mr_cur x nr_cur edge tile of C by beta_use and add ct. */ \
+							PASTEMAC(ch,xpbys_mxn) \
+							( \
+							  mr_cur, \
+							  nr_cur, \
+							  ct,   rs_ct, cs_ct, \
+							  beta_use, \
+							  c_ir, rs_c,  cs_c \
+							); \
+						} \
+					} \
+				} \
+			} \
+\
+			/* This barrier is needed to prevent threads from starting to pack
+			   the next row panel of B before the current row panel is fully
+			   computed upon. */ \
+			bli_thread_barrier( thread_pb ); \
+		} \
+	} \
+	/* NOTE: thread_pa and thread_pb are still NULL if the loops above never ran. */ \
+	/* Release any memory that was acquired for packing matrices A and B. */ \
+	PASTECH2(bao_,ch,packm_finalize_mem_a) \
+	( \
+	  rntm, \
+	  &mem_a, \
+	  thread_pa  \
+	); \
+	PASTECH2(bao_,ch,packm_finalize_mem_b) \
+	( \
+	  rntm, \
+	  &mem_b, \
+	  thread_pb  \
+	); \
+\
+/*
+PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \
+*/ \
+}
+
+// One instantiation per datatype, in place of INSERT_GENTFUNC_BASIC0.
+GENTFUNC( float,    s, gemmd_bp_var1 )
+GENTFUNC( double,   d, gemmd_bp_var1 )
+GENTFUNC( scomplex, c, gemmd_bp_var1 )
+GENTFUNC( dcomplex, z, gemmd_bp_var1 )
+
diff --git a/addon/gemmd/bao_gemmd_bp_var2.c b/addon/gemmd/bao_gemmd_bp_var2.c
new file mode 100644
index 000000000..a0040fec0
--- /dev/null
+++ b/addon/gemmd/bao_gemmd_bp_var2.c
@@ -0,0 +1,602 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemmd_fp
+// Signature of the typed bao_?gemmd_bp_var2() functions; buffers are void*.
+typedef void (*FUNCPTR_T)
+     (
+       conj_t           conja,
+       conj_t           conjb,
+       dim_t            m,
+       dim_t            n,
+       dim_t            k,
+       void*   restrict alpha,
+       void*   restrict a, inc_t rs_a, inc_t cs_a,
+       void*   restrict d, inc_t incd,
+       void*   restrict b, inc_t rs_b, inc_t cs_b,
+       void*   restrict beta,
+       void*   restrict c, inc_t rs_c, inc_t cs_c,
+       cntx_t* restrict cntx,
+       rntm_t* restrict rntm,
+       thrinfo_t* restrict thread
+     );
+
+//
+// -- gemmd-like block-panel algorithm (object interface) ----------------------
+//
+
+// Define the function pointer array ftypes, indexed by datatype (num_t), and
+// initialize it with the typed functions defined below, bao_?gemmd_bp_var2().
+static FUNCPTR_T GENARRAY_PREF(ftypes,bao_,gemmd_bp_var2);
+
+void bao_gemmd_bp_var2
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  d,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     )
+{
+	// Unpack the operand objects into raw buffers, strides, and flags, then
+	// hand them to the typed implementation selected by the datatype of C.
+	const num_t    c_dt      = bli_obj_dt( c );
+
+	// Conjugation flags recorded on A and B.
+	const conj_t   a_conj    = bli_obj_conj_status( a );
+	const conj_t   b_conj    = bli_obj_conj_status( b );
+
+	// m and n are taken from C; k is the width of A.
+	const dim_t    m_dim     = bli_obj_length( c );
+	const dim_t    n_dim     = bli_obj_width( c );
+	const dim_t    k_dim     = bli_obj_width( a );
+
+	// Buffer pointer and stride(s) of each operand, in the order A, d, B, C.
+	void* restrict a_buf     = bli_obj_buffer_at_off( a );
+	const inc_t    a_rs      = bli_obj_row_stride( a );
+	const inc_t    a_cs      = bli_obj_col_stride( a );
+	void* restrict d_buf     = bli_obj_buffer_at_off( d );
+	const inc_t    d_inc     = bli_obj_vector_inc( d );
+	void* restrict b_buf     = bli_obj_buffer_at_off( b );
+	const inc_t    b_rs      = bli_obj_row_stride( b );
+	const inc_t    b_cs      = bli_obj_col_stride( b );
+	void* restrict c_buf     = bli_obj_buffer_at_off( c );
+	const inc_t    c_rs      = bli_obj_row_stride( c );
+	const inc_t    c_cs      = bli_obj_col_stride( c );
+
+	// Scalar buffers for alpha and beta, requested in the datatype of C.
+	void* restrict alpha_buf = bli_obj_buffer_for_1x1( c_dt, alpha );
+	void* restrict beta_buf  = bli_obj_buffer_for_1x1( c_dt, beta );
+
+	// ftypes is indexed by datatype: pick the typed function for c_dt and call
+	// it with the unpacked operands, forwarding cntx, rntm, and thread as-is.
+	ftypes[c_dt]
+	(
+	  a_conj,
+	  b_conj,
+	  m_dim,
+	  n_dim,
+	  k_dim,
+	  alpha_buf,
+	  a_buf, a_rs, a_cs,
+	  d_buf, d_inc,
+	  b_buf, b_rs, b_cs,
+	  beta_buf,
+	  c_buf, c_rs, c_cs,
+	  cntx,
+	  rntm,
+	  thread
+	);
+}
+
+//
+// -- gemmd-like block-panel algorithm (typed interface) -----------------------
+//
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTECH2(bao_,ch,varname) \
+     ( \
+       conj_t           conja, \
+       conj_t           conjb, \
+       dim_t            m, \
+       dim_t            n, \
+       dim_t            k, \
+       void*   restrict alpha, \
+       void*   restrict a, inc_t rs_a, inc_t cs_a, \
+       void*   restrict d, inc_t incd, \
+       void*   restrict b, inc_t rs_b, inc_t cs_b, \
+       void*   restrict beta, \
+       void*   restrict c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
+       thrinfo_t* restrict thread  \
+     ) \
+{ \
+	const num_t dt = PASTEMAC(ch,type); \
+\
+	/* Query the context for various blocksizes. */ \
+	const dim_t NR  = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
+	const dim_t MR  = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
+	const dim_t NC  = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
+	const dim_t MC  = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
+	const dim_t KC  = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
+\
+	/* Query the context for the microkernel address and cast it to its
+	   function pointer type. */ \
+	/*
+	PASTECH(ch,gemm_ukr_ft) \
+               gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	*/ \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer are set so that they match the storage of the
+	   original C matrix. For example, if C is column-stored, ct will be
+	   column-stored as well. */ \
+	/*
+	ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                / sizeof( ctype ) ] \
+	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const inc_t rs_ct   = ( col_pref ? 1 : NR ); \
+	const inc_t cs_ct   = ( col_pref ? MR : 1 ); \
+	*/ \
+\
+	/* Compute partitioning step values for each matrix of each loop. */ \
+	const inc_t jcstep_c = cs_c; \
+	const inc_t jcstep_b = cs_b; \
+\
+	const inc_t pcstep_a = cs_a; \
+	const inc_t pcstep_d = incd; \
+	const inc_t pcstep_b = rs_b; \
+\
+	const inc_t icstep_c = rs_c; \
+	const inc_t icstep_a = rs_a; \
+\
+	const inc_t jrstep_c = cs_c * NR; \
+\
+	const inc_t irstep_c = rs_c * MR; \
+\
+	ctype* restrict a_00       = a; \
+	ctype* restrict d_00       = d; \
+	ctype* restrict b_00       = b; \
+	ctype* restrict c_00       = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+\
+	/* Make local copies of the scalars to prevent any unnecessary sharing of
+	   cache lines between the cores' caches. */ \
+	ctype           alpha_local = *alpha_cast; \
+	ctype           beta_local  = *beta_cast; \
+	ctype           one_local   = *PASTEMAC(ch,1); \
+	/*ctype           zero_local  = *PASTEMAC(ch,0);*/ \
+\
+	auxinfo_t       aux; \
+\
+	/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
+	   needed for the matrix we will be packing (if any), but we do it
+	   unconditionally to be safe. */ \
+	mem_t mem_a = BLIS_MEM_INITIALIZER; \
+	mem_t mem_b = BLIS_MEM_INITIALIZER; \
+\
+	/* Define an array of bszid_t ids, which will act as our substitute for
+	   the cntl_t tree. */ \
+	bszid_t bszids[8] = { BLIS_NC,      /* 5th loop */ \
+	                      BLIS_KC,      /* 4th loop */ \
+	                      BLIS_NO_PART, /* pack B */ \
+	                      BLIS_MC,      /* 3rd loop */ \
+	                      BLIS_NO_PART, /* pack A */ \
+	                      BLIS_NR,      /* 2nd loop */ \
+	                      BLIS_MR,      /* 1st loop */ \
+	                      BLIS_KR };    /* microkernel loop */  \
+\
+	bszid_t* restrict bszids_jc = &bszids[0]; \
+	bszid_t* restrict bszids_pc = &bszids[1]; \
+	/*bszid_t* restrict bszids_pb = &bszids[2];*/ \
+	bszid_t* restrict bszids_ic = &bszids[3]; \
+	/*bszid_t* restrict bszids_pa = &bszids[4];*/ \
+	bszid_t* restrict bszids_jr = &bszids[5]; \
+	/*bszid_t* restrict bszids_ir = &bszids[6];*/ \
+\
+	thrinfo_t* restrict thread_jc = NULL; \
+	thrinfo_t* restrict thread_pc = NULL; \
+	thrinfo_t* restrict thread_pb = NULL; \
+	thrinfo_t* restrict thread_ic = NULL; \
+	thrinfo_t* restrict thread_pa = NULL; \
+	thrinfo_t* restrict thread_jr = NULL; \
+	thrinfo_t* restrict thread_ir = NULL; \
+\
+	/* Identify the current thrinfo_t node and then grow the tree. */ \
+	thread_jc = thread; \
+	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
+\
+	/* Compute the JC loop thread range for the current thread. */ \
+	dim_t jc_start, jc_end; \
+	bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
+	const dim_t n_local = jc_end - jc_start; \
+\
+	/* Compute number of primary and leftover components of the JC loop. */ \
+	/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
+	const dim_t jc_left =   n_local % NC; \
+\
+	/* Loop over the n dimension (NC rows/columns at a time). */ \
+	for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
+	{ \
+		/* Calculate the thread's current JC block dimension. */ \
+		const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
+\
+		ctype* restrict b_jc = b_00 + jj * jcstep_b; \
+		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
+\
+		/* Identify the current thrinfo_t node and then grow the tree. */ \
+		thread_pc = bli_thrinfo_sub_node( thread_jc ); \
+		bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
+\
+		/* Compute the PC loop thread range for the current thread. */ \
+		const dim_t pc_start = 0, pc_end = k; \
+		const dim_t k_local = k; \
+\
+		/* Compute number of primary and leftover components of the PC loop. */ \
+		/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
+		const dim_t pc_left =   k_local % KC; \
+\
+		/* Loop over the k dimension (KC rows/columns at a time). */ \
+		for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
+		{ \
+			/* Calculate the thread's current PC block dimension. */ \
+			const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
+\
+			ctype* restrict a_pc = a_00 + pp * pcstep_a; \
+			ctype* restrict d_pc = d_00 + pp * pcstep_d; \
+			ctype* restrict b_pc = b_jc + pp * pcstep_b; \
+\
+			/* Only apply beta to the first iteration of the pc loop. */ \
+			ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
+\
+			ctype* b_use; \
+			inc_t  rs_b_use, cs_b_use, ps_b_use; \
+\
+			/* Identify the current thrinfo_t node. Note that the thrinfo_t
+			   node will have already been created by a previous call to
+			   bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
+			   cause the tree to grow by two (e.g. to the next bszid that is
+			   a normal bszid_t value). */ \
+			thread_pb = bli_thrinfo_sub_node( thread_pc ); \
+			/*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \
+\
+			/* Determine the packing buffer and related parameters for matrix
+			   B. Then call the packm implementation. */ \
+			PASTECH2(bao_,ch,packm_b) \
+			( \
+			  conjb, \
+			  KC,     NC, \
+			  kc_cur, nc_cur, NR, \
+			  &one_local, \
+			  d_pc,   incd, \
+			  b_pc,   rs_b,      cs_b, \
+			  &b_use, &rs_b_use, &cs_b_use, \
+			                     &ps_b_use, \
+			  cntx, \
+			  rntm, \
+			  &mem_b, \
+			  thread_pb  \
+			); \
+\
+			/* Alias b_use so that it's clear this is our current block of
+			   matrix B. */ \
+			ctype* restrict b_pc_use = b_use; \
+\
+			/* Identify the current thrinfo_t node and then grow the tree. */ \
+			thread_ic = bli_thrinfo_sub_node( thread_pb ); \
+			bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
+\
+			/* Compute the IC loop thread range for the current thread. */ \
+			dim_t ic_start, ic_end; \
+			bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
+			const dim_t m_local = ic_end - ic_start; \
+\
+			/* Compute number of primary and leftover components of the IC loop. */ \
+			/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
+			const dim_t ic_left =   m_local % MC; \
+\
+			/* Loop over the m dimension (MC rows at a time). */ \
+			for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
+			{ \
+				/* Calculate the thread's current IC block dimension. */ \
+				const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
+\
+				ctype* restrict a_ic = a_pc + ii * icstep_a; \
+				ctype* restrict c_ic = c_jc + ii * icstep_c; \
+\
+				ctype* a_use; \
+				inc_t  rs_a_use, cs_a_use, ps_a_use; \
+\
+				/* Identify the current thrinfo_t node. Note that the thrinfo_t
+				   node will have already been created by a previous call to
+				   bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
+				   cause the tree to grow by two (e.g. to the next bszid that is
+				   a normal bszid_t value). */ \
+				thread_pa = bli_thrinfo_sub_node( thread_ic ); \
+				/*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \
+\
+				/* Determine the packing buffer and related parameters for matrix
+				   A. Then call the packm implementation. */ \
+				PASTECH2(bao_,ch,packm_a) \
+				( \
+				  conja, \
+				  MC,     KC, \
+				  mc_cur, kc_cur, MR, \
+				  &one_local, \
+				  d_pc,   incd, \
+				  a_ic,   rs_a,      cs_a, \
+				  &a_use, &rs_a_use, &cs_a_use, \
+				                     &ps_a_use, \
+				  cntx, \
+				  rntm, \
+				  &mem_a, \
+				  thread_pa  \
+				); \
+\
+				/* Alias a_use so that it's clear this is our current block of
+				   matrix A. */ \
+				ctype* restrict a_ic_use = a_use; \
+\
+				/* Identify the current thrinfo_t node and then grow the tree. */ \
+				thread_jr = bli_thrinfo_sub_node( thread_pa ); \
+				bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
+\
+				/* Query the number of threads and thread ids for the JR loop.
+				   NOTE: These values are only needed when computing the next
+				   micropanel of B. */ \
+				const dim_t jr_nt  = bli_thread_n_way( thread_jr ); \
+				const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
+\
+				/* Compute number of primary and leftover components of the JR loop. */ \
+				dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
+				dim_t jr_left =   nc_cur % NR; \
+\
+				/* Compute the JR loop thread range for the current thread. */ \
+				dim_t jr_start, jr_end; \
+				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
+\
+				/* Loop over the n dimension (NR columns at a time). */ \
+				for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
+				{ \
+					const dim_t nr_cur \
+					= ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
+\
+					ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
+					ctype* restrict c_jr = c_ic     + j * jrstep_c; \
+\
+					/* Assume for now that our next panel of B to be the current panel
+					   of B. */ \
+					ctype* restrict b2 = b_jr; \
+\
+					/* Identify the current thrinfo_t node. */ \
+					thread_ir = bli_thrinfo_sub_node( thread_jr ); \
+\
+					/* Query the number of threads and thread ids for the IR loop.
+					   NOTE: These values are only needed when computing the next
+					   micropanel of A. */ \
+					const dim_t ir_nt  = bli_thread_n_way( thread_ir ); \
+					const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
+\
+					/* Compute number of primary and leftover components of the IR loop. */ \
+					dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
+					dim_t ir_left =   mc_cur % MR; \
+\
+					/* Compute the IR loop thread range for the current thread. */ \
+					dim_t ir_start, ir_end; \
+					bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \
+\
+					/* Loop over the m dimension (MR rows at a time). */ \
+					for ( dim_t i = ir_start; i < ir_end; i += 1 ) \
+					{ \
+						const dim_t mr_cur \
+						= ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
+\
+						ctype* restrict a_ir = a_ic_use + i * ps_a_use; \
+						ctype* restrict c_ir = c_jr     + i * irstep_c; \
+\
+						ctype* restrict a2; \
+\
+						/* Compute the addresses of the next micropanels of A and B. */ \
+						a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \
+						if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
+						{ \
+							a2 = a_ic_use; \
+							b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \
+							if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
+								b2 = b_pc_use; \
+						} \
+\
+						/* Save the addresses of next micropanels of A and B to the
+						   auxinfo_t object. */ \
+						bli_auxinfo_set_next_a( a2, &aux ); \
+						bli_auxinfo_set_next_b( b2, &aux ); \
+\
+						/* Call a wrapper to the kernel (which handles edge cases). */ \
+						PASTECH2(bao_,ch,gemm_kernel) \
+						( \
+						  MR, \
+						  NR, \
+						  mr_cur, \
+						  nr_cur, \
+						  kc_cur, \
+						  &alpha_local, \
+						  a_ir, rs_a_use, cs_a_use, \
+						  b_jr, rs_b_use, cs_b_use, \
+						  beta_use, \
+						  c_ir, rs_c,     cs_c, \
+						  &aux, \
+						  cntx  \
+						); \
+					} \
+				} \
+			} \
+\
+			/* This barrier is needed to prevent threads from starting to pack
+			   the next row panel of B before the current row panel is fully
+			   computed upon. */ \
+			bli_thread_barrier( thread_pb ); \
+		} \
+	} \
+\
+	/* Release any memory that was acquired for packing matrices A and B. */ \
+	PASTECH2(bao_,ch,packm_finalize_mem_a) \
+	( \
+	  rntm, \
+	  &mem_a, \
+	  thread_pa  \
+	); \
+	PASTECH2(bao_,ch,packm_finalize_mem_b) \
+	( \
+	  rntm, \
+	  &mem_b, \
+	  thread_pb  \
+	); \
+\
+/*
+PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \
+*/ \
+}
+
+//INSERT_GENTFUNC_BASIC0( gemmd_bp_var2 )
+GENTFUNC( float,    s, gemmd_bp_var2 )
+GENTFUNC( double,   d, gemmd_bp_var2 )
+GENTFUNC( scomplex, c, gemmd_bp_var2 )
+GENTFUNC( dcomplex, z, gemmd_bp_var2 )
+
+//
+// -- gemm-like microkernel wrapper --------------------------------------------
+//
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTECH2(bao_,ch,varname) \
+     ( \
+       const dim_t         MR, \
+       const dim_t         NR, \
+       dim_t               mr_cur, \
+       dim_t               nr_cur, \
+       dim_t               kc_cur, \
+       ctype*     restrict alpha, \
+       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
+       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
+       ctype*     restrict beta, \
+       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
+       auxinfo_t* restrict aux, \
+       cntx_t*    restrict cntx  \
+     ) \
+{ \
+	/* Infer the datatype from the ctype. */ \
+	const num_t dt = PASTEMAC(ch,type); \
+\
+	/* Query the context for the microkernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+               gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer follow the storage preference the context reports
+	   for the gemm microkernel: if the microkernel prefers columns, ct is
+	   column-stored (rs = 1, cs = MR); otherwise it is row-stored. */ \
+	ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                / sizeof( ctype ) ] \
+	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const inc_t rs_ct   = ( col_pref ? 1 : NR ); \
+	const inc_t cs_ct   = ( col_pref ? MR : 1 ); \
+\
+	ctype       zero    = *PASTEMAC(ch,0); \
+\
+	/* Handle interior and edge cases separately. */ \
+	if ( mr_cur == MR && nr_cur == NR ) \
+	{ \
+		/* Interior case (full MR x NR tile): invoke the microkernel on C. */ \
+		gemm_ukr \
+		( \
+		  kc_cur, \
+		  alpha, \
+		  a, \
+		  b, \
+		  beta, \
+		  c, rs_c, cs_c, \
+		  aux, \
+		  cntx  \
+		); \
+	} \
+	else \
+	{ \
+		/* Edge case: invoke the microkernel on ct, passing zero as beta. */ \
+		gemm_ukr \
+		( \
+		  kc_cur, \
+		  alpha, \
+		  a, \
+		  b, \
+		  &zero, \
+		  ct, rs_ct, cs_ct, \
+		  aux, \
+		  cntx  \
+		); \
+\
+		/* Scale the mr_cur x nr_cur edge of C by beta and add the result in ct. */ \
+		PASTEMAC(ch,xpbys_mxn) \
+		( \
+		  mr_cur, \
+		  nr_cur, \
+		  ct, rs_ct, cs_ct, \
+		  beta, \
+		  c,  rs_c,  cs_c \
+		); \
+	} \
+}
+
+//INSERT_GENTFUNC_BASIC0( gemm_kernel )
+GENTFUNC( float,    s, gemm_kernel )
+GENTFUNC( double,   d, gemm_kernel )
+GENTFUNC( scomplex, c, gemm_kernel )
+GENTFUNC( dcomplex, z, gemm_kernel )
+
diff --git a/addon/gemmd/bao_gemmd_check.c b/addon/gemmd/bao_gemmd_check.c
new file mode 100644
index 000000000..864e9a1ac
--- /dev/null
+++ b/addon/gemmd/bao_gemmd_check.c
@@ -0,0 +1,131 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bao_gemmd_check
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  d,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx
+     )
+{
+	err_t e_val;
+
+	// Validate the gemmd operands (cntx is not referenced in this function).
+	// Check object datatypes.
+	e_val = bli_check_noninteger_object( alpha );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_noninteger_object( beta );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_floating_object( a );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_floating_object( d );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_floating_object( b );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_floating_object( c );
+	bli_check_error_code( e_val );
+
+	// Check scalar/vector/matrix type (d is a vector; a, b, c are matrices).
+
+	e_val = bli_check_scalar_object( alpha );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_scalar_object( beta );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_matrix_object( a );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_vector_object( d );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_matrix_object( b );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_matrix_object( c );
+	bli_check_error_code( e_val );
+
+	// Check object buffers (for non-NULLness).
+
+	e_val = bli_check_object_buffer( alpha );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_object_buffer( a );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_object_buffer( d );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_object_buffer( b );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_object_buffer( beta );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_object_buffer( c );
+	bli_check_error_code( e_val );
+
+	// Check object dimensions (d must match the width of a after transposition).
+
+	e_val = bli_check_level3_dims( a, b, c );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_vector_dim_equals( d, bli_obj_width_after_trans( a ) );
+	bli_check_error_code( e_val );
+
+	// Check for consistent datatypes: a, d, and b are each compared against c.
+	// NOTE: These tests are performed unconditionally in this function; they
+	// are not guarded on whether mixed datatype support is enabled.
+
+	e_val = bli_check_consistent_object_datatypes( c, a );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_consistent_object_datatypes( c, d );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_consistent_object_datatypes( c, b );
+	bli_check_error_code( e_val );
+}
+
diff --git a/addon/gemmd/bao_gemmd_check.h b/addon/gemmd/bao_gemmd_check.h
new file mode 100644
index 000000000..243ec70c8
--- /dev/null
+++ b/addon/gemmd/bao_gemmd_check.h
@@ -0,0 +1,50 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+//
+// Prototype the object-based check function for the gemmd operation.
+//
+
+void bao_gemmd_check
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  d,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx
+    );
+
diff --git a/addon/gemmd/bao_gemmd_var.h b/addon/gemmd/bao_gemmd_var.h
new file mode 100644
index 000000000..5c6674727
--- /dev/null
+++ b/addon/gemmd/bao_gemmd_var.h
@@ -0,0 +1,126 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+//
+// Prototype the object-based variant interfaces (gemmd_bp_var1, gemmd_bp_var2).
+//
+
+#undef  GENPROT
+#define GENPROT( opname ) \
+\
+void PASTECH(bao_,opname) \
+     ( \
+       obj_t*  alpha, \
+       obj_t*  a, \
+       obj_t*  d, \
+       obj_t*  b, \
+       obj_t*  beta, \
+       obj_t*  c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     );
+
+GENPROT( gemmd_bp_var1 )
+GENPROT( gemmd_bp_var2 )
+
+
+//
+// Prototype the typed variant interfaces (instantiated for s, d, c, and z).
+//
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTECH2(bao_,ch,varname) \
+     ( \
+       conj_t           conja, \
+       conj_t           conjb, \
+       dim_t            m, \
+       dim_t            n, \
+       dim_t            k, \
+       void*   restrict alpha, \
+       void*   restrict a, inc_t rs_a, inc_t cs_a, \
+       void*   restrict d, inc_t incd, \
+       void*   restrict b, inc_t rs_b, inc_t cs_b, \
+       void*   restrict beta, \
+       void*   restrict c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
+       thrinfo_t* restrict thread  \
+     );
+
+//INSERT_GENTPROT_BASIC0( gemmd_bp_var1 )
+GENTPROT( float,    s, gemmd_bp_var1 )
+GENTPROT( double,   d, gemmd_bp_var1 )
+GENTPROT( scomplex, c, gemmd_bp_var1 )
+GENTPROT( dcomplex, z, gemmd_bp_var1 )
+
+//INSERT_GENTPROT_BASIC0( gemmd_bp_var2 )
+GENTPROT( float,    s, gemmd_bp_var2 )
+GENTPROT( double,   d, gemmd_bp_var2 )
+GENTPROT( scomplex, c, gemmd_bp_var2 )
+GENTPROT( dcomplex, z, gemmd_bp_var2 )
+
+
+//
+// Prototype the typed kernel interfaces (gemm_kernel, for s, d, c, and z).
+//
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTECH2(bao_,ch,varname) \
+     ( \
+       const dim_t         MR, \
+       const dim_t         NR, \
+       dim_t               mr_cur, \
+       dim_t               nr_cur, \
+       dim_t               k, \
+       ctype*     restrict alpha, \
+       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
+       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
+       ctype*     restrict beta, \
+       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
+       auxinfo_t* restrict aux, \
+       cntx_t*    restrict cntx  \
+     );
+
+//INSERT_GENTPROT_BASIC0( gemm_kernel )
+GENTPROT( float,    s, gemm_kernel )
+GENTPROT( double,   d, gemm_kernel )
+GENTPROT( scomplex, c, gemm_kernel )
+GENTPROT( dcomplex, z, gemm_kernel )
+
diff --git a/addon/gemmd/bao_l3_packm_a.c b/addon/gemmd/bao_l3_packm_a.c
new file mode 100644
index 000000000..49bb34664
--- /dev/null
+++ b/addon/gemmd/bao_l3_packm_a.c
@@ -0,0 +1,330 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+\
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       dim_t            m, \
+       dim_t            k, \
+       dim_t            mr, \
+       cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ) \
+{ \
+	/* Ensure mem holds a block large enough to pack an m x k block of A. Set
+	   the pack buffer type so that blocks come from the pool for blocks of A. */ \
+	const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_A_BLOCK; \
+\
+	/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
+	   we NEED that last micropanel to have the same ldim (cs_p) as the other
+	   micropanels. Why? Because the microkernel assumes that the register (MR,
+	   NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
+	const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
+	const dim_t k_pack = k; \
+\
+	/* Barrier to make sure all threads are caught up and ready to begin the
+	   packm stage. */ \
+	bli_thread_barrier( thread ); \
+\
+	/* Compute the size of the memory block needed. */ \
+	siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
+\
+	/* Check the mem_t entry provided by the caller. If it is unallocated,
+	   then we need to acquire a block from the packed block allocator. */ \
+	if ( bli_mem_is_unalloc( mem ) ) \
+	{ \
+		if ( bli_thread_am_ochief( thread ) ) \
+		{ \
+			/* Acquire directly to the chief thread's mem_t that was passed in.
+			   It needs to be that mem_t struct, and not a local (temporary)
+			   mem_t, since there is no barrier until after packing is finished,
+			   which could allow a race condition whereby the chief thread exits
+			   the current function before the other threads have a chance to
+			   copy from it. (A barrier would fix that race condition, but then
+			   again, I prefer to keep barriers to a minimum.) */ \
+			bli_pba_acquire_m \
+			( \
+			  rntm, \
+			  size_needed, \
+			  pack_buf_type, \
+			  mem  \
+			); \
+		} \
+\
+		/* Broadcast the address of the chief thread's passed-in mem_t to all
+		   threads. */ \
+		mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
+\
+		/* Non-chief threads: Copy the contents of the chief thread's
+		   passed-in mem_t to the passed-in mem_t for this thread. (The
+		   chief thread already has the mem_t, so it does not need to
+		   perform any copy.) */ \
+		if ( !bli_thread_am_ochief( thread ) ) \
+		{ \
+			*mem = *mem_p; \
+		} \
+	} \
+	else /* if ( bli_mem_is_alloc( mem ) ) */ \
+	{ \
+		/* If the mem_t entry provided by the caller does NOT contain a NULL
+		   buffer, then a block has already been acquired from the packed
+		   block allocator and cached by the caller. */ \
+\
+		/* As a sanity check, we should make sure that the mem_t object isn't
+		   associated with a block that is too small compared to the size of
+		   the packed matrix buffer that is needed, according to the value
+		   computed above. */ \
+		siz_t mem_size = bli_mem_size( mem ); \
+\
+		if ( mem_size < size_needed ) \
+		{ \
+			if ( bli_thread_am_ochief( thread ) ) \
+			{ \
+				/* The chief thread releases the existing block associated
+				   with the mem_t, and then re-acquires a new block, saving
+				   the associated mem_t to its passed-in mem_t. (See comment
+				   above for why the acquisition needs to be directly to
+				   the chief thread's passed-in mem_t and not a local
+				   (temporary) mem_t.) */ \
+				bli_pba_release \
+				( \
+				  rntm, \
+				  mem \
+				); \
+				bli_pba_acquire_m \
+				( \
+				  rntm, \
+				  size_needed, \
+				  pack_buf_type, \
+				  mem \
+				); \
+			} \
+\
+			/* Broadcast the address of the chief thread's passed-in mem_t
+			   to all threads. */ \
+			mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
+\
+			/* Non-chief threads: Copy the contents of the chief thread's
+			   passed-in mem_t to the passed-in mem_t for this thread. (The
+			   chief thread already has the mem_t, so it does not need to
+			   perform any copy.) */ \
+			if ( !bli_thread_am_ochief( thread ) ) \
+			{ \
+				*mem = *mem_p; \
+			} \
+		} \
+		else \
+		{ \
+			/* If the mem_t entry is already allocated and sufficiently large,
+			   then we use it as-is. No action is needed. */ \
+		} \
+	} \
+}
+
+//INSERT_GENTFUNC_BASIC0( packm_init_mem_a )
+GENTFUNC( float,    s, packm_init_mem_a )
+GENTFUNC( double,   d, packm_init_mem_a )
+GENTFUNC( scomplex, c, packm_init_mem_a )
+GENTFUNC( dcomplex, z, packm_init_mem_a )
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+\
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       rntm_t* restrict rntm, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ) \
+{ \
+	if ( thread != NULL ) \
+	if ( bli_thread_am_ochief( thread ) ) \
+	{ \
+		/* Only the chief thread of a non-NULL thrinfo_t releases the block,
+		   and only if the mem_t entry is allocated, which it should be. */ \
+		if ( bli_mem_is_alloc( mem ) ) \
+		{ \
+			bli_pba_release \
+			( \
+			  rntm, \
+			  mem \
+			); \
+		} \
+	} \
+}
+
+//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_a )
+GENTFUNC( float,    s, packm_finalize_mem_a )
+GENTFUNC( double,   d, packm_finalize_mem_a )
+GENTFUNC( scomplex, c, packm_finalize_mem_a )
+GENTFUNC( dcomplex, z, packm_finalize_mem_a )
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+\
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       pack_t* restrict schema, \
+       dim_t            m, \
+       dim_t            k, \
+       dim_t            mr, \
+       dim_t*  restrict m_max, \
+       dim_t*  restrict k_max, \
+       ctype**          p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
+                           dim_t* restrict pd_p, inc_t* restrict ps_p, \
+       mem_t*  restrict mem  \
+     ) \
+{ \
+	/* Compute the packed dimensions of A. NOTE: Rounding m up to a multiple of
+	   mr is absolutely necessary since we NEED the last micropanel to have the
+	   same ldim (cs_p) as the other micropanels: the microkernel assumes that
+	   the register (MR, NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
+	*m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
+	*k_max = k; \
+\
+	/* Determine the strides, panel dimension, and panel stride for packed A. */ \
+	{ \
+		/* Pack A to column-stored row-panels. */ \
+		*rs_p = 1; \
+		*cs_p = mr; \
+\
+		*pd_p = mr; \
+		*ps_p = mr * k; \
+\
+		/* Set the schema to "packed row panels" to indicate packing to
+		   conventional column-stored row panels. */ \
+		*schema = BLIS_PACKED_ROW_PANELS; \
+	} \
+\
+	/* Set the buffer address provided by the caller to point to the memory
+	   associated with the mem_t entry acquired from the memory pool. */ \
+	*p = bli_mem_buffer( mem ); \
+}
+
+//INSERT_GENTFUNC_BASIC0( packm_init_a )
+GENTFUNC( float,    s, packm_init_a )
+GENTFUNC( double,   d, packm_init_a )
+GENTFUNC( scomplex, c, packm_init_a )
+GENTFUNC( dcomplex, z, packm_init_a )
+
+
+//
+// Define the typed driver that sets up the pack buffer for A and then packs A.
+//
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+\
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       conj_t           conj, \
+       dim_t            m_alloc, \
+       dim_t            k_alloc, \
+       dim_t            m, \
+       dim_t            k, \
+       dim_t            mr, \
+       ctype*  restrict kappa, \
+       ctype*  restrict d, inc_t           incd, \
+       ctype*  restrict a, inc_t           rs_a, inc_t           cs_a, \
+       ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
+                                                 inc_t* restrict ps_p, \
+       cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ) \
+{ \
+	pack_t schema; \
+	dim_t  m_max; \
+	dim_t  k_max; \
+	dim_t  pd_p; \
+\
+	/* Prepare the packing destination buffer, sized from m_alloc and k_alloc. */ \
+	PASTECH2(bao_,ch,packm_init_mem_a) \
+	( \
+	  m_alloc, k_alloc, mr, \
+	  cntx, \
+	  rntm, \
+	  mem, \
+	  thread  \
+	); \
+\
+	/* Determine the packing buffer and related parameters for matrix A. */ \
+	PASTECH2(bao_,ch,packm_init_a) \
+	( \
+	  &schema, \
+	  m, k, mr, \
+	  &m_max, &k_max, \
+	  p, rs_p,  cs_p, \
+	     &pd_p, ps_p, \
+	  mem  \
+	); \
+\
+	/* Pack matrix A to the destination buffer chosen above. Here, the packed
+	   matrix is stored to column-stored MR x k micropanels. */ \
+	PASTECH2(bao_,ch,packm_var1) \
+	( \
+	  conj, \
+	  schema, \
+	  m, \
+	  k, \
+	  m_max, \
+	  k_max, \
+	  kappa, \
+	  d,  incd, \
+	  a,  rs_a,  cs_a, \
+	  *p, *rs_p, *cs_p, \
+		  pd_p,  *ps_p, \
+	  cntx, \
+	  thread  \
+	); \
+\
+	/* Barrier so that packing is done before computation. */ \
+	bli_thread_barrier( thread ); \
+}
+
+//INSERT_GENTFUNC_BASIC0( packm_a )
+GENTFUNC( float,    s, packm_a )
+GENTFUNC( double,   d, packm_a )
+GENTFUNC( scomplex, c, packm_a )
+GENTFUNC( dcomplex, z, packm_a )
+
diff --git a/addon/gemmd/bao_l3_packm_a.h b/addon/gemmd/bao_l3_packm_a.h
new file mode 100644
index 000000000..b683b79d4
--- /dev/null
+++ b/addon/gemmd/bao_l3_packm_a.h
@@ -0,0 +1,123 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares the routine that bao_?packm_a() calls to prepare the packing destination buffer for A. */ \
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       dim_t            m, /* bao_?packm_a() passes its m_alloc here */ \
+       dim_t            k, /* bao_?packm_a() passes its k_alloc here */ \
+       dim_t            mr, \
+       cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ); \
+
+//INSERT_GENTPROT_BASIC0( packm_init_mem_a )
+GENTPROT( float,    s, packm_init_mem_a )
+GENTPROT( double,   d, packm_init_mem_a )
+GENTPROT( scomplex, c, packm_init_mem_a )
+GENTPROT( dcomplex, z, packm_init_mem_a )
+
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* NOTE(review): presumably releases the A pack buffer, mirroring packm_finalize_mem_b -- confirm. */ \
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       rntm_t* restrict rntm, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ); \
+
+//INSERT_GENTPROT_BASIC0( packm_finalize_mem_a )
+GENTPROT( float,    s, packm_finalize_mem_a )
+GENTPROT( double,   d, packm_finalize_mem_a )
+GENTPROT( scomplex, c, packm_finalize_mem_a )
+GENTPROT( dcomplex, z, packm_finalize_mem_a )
+
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares the routine that bao_?packm_a() calls to determine the packing buffer and related parameters for A. */ \
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       pack_t* restrict schema, \
+       dim_t            m, \
+       dim_t            k, \
+       dim_t            mr, \
+       dim_t*  restrict m_max, \
+       dim_t*  restrict k_max, \
+       ctype**          p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
+                           dim_t* restrict pd_p, inc_t* restrict ps_p, \
+       mem_t*  restrict mem  \
+     ); \
+
+//INSERT_GENTPROT_BASIC0( packm_init_a )
+GENTPROT( float,    s, packm_init_a )
+GENTPROT( double,   d, packm_init_a )
+GENTPROT( scomplex, c, packm_init_a )
+GENTPROT( dcomplex, z, packm_init_a )
+
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares bao_?packm_a(): prepares the buffer, computes packing parameters, packs A via packm_var1, then barriers. */ \
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       conj_t           conj, \
+       dim_t            m_alloc, /* forwarded to packm_init_mem_a */ \
+       dim_t            k_alloc, /* forwarded to packm_init_mem_a */ \
+       dim_t            m, /* forwarded to packm_init_a and packm_var1 */ \
+       dim_t            k, /* forwarded to packm_init_a and packm_var1 */ \
+       dim_t            mr, \
+       ctype*  restrict kappa, \
+       ctype*  restrict d, inc_t           incd, \
+       ctype*  restrict a, inc_t           rs_a, inc_t           cs_a, \
+       ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
+                                                 inc_t* restrict ps_p, \
+       cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ); \
+
+//INSERT_GENTPROT_BASIC0( packm_a )
+GENTPROT( float,    s, packm_a )
+GENTPROT( double,   d, packm_a )
+GENTPROT( scomplex, c, packm_a )
+GENTPROT( dcomplex, z, packm_a )
+
diff --git a/addon/gemmd/bao_l3_packm_b.c b/addon/gemmd/bao_l3_packm_b.c
new file mode 100644
index 000000000..c41b062b6
--- /dev/null
+++ b/addon/gemmd/bao_l3_packm_b.c
@@ -0,0 +1,330 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+/* Ensure mem holds a block large enough for a packed k x n panel of B, with n rounded up to a multiple of nr. */ \
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       dim_t            k, \
+       dim_t            n, \
+       dim_t            nr, \
+       cntx_t* restrict cntx, /* not referenced in this function body */ \
+       rntm_t* restrict rntm, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ) \
+{ \
+	/* Set the pack buffer type so that we are obtaining memory blocks from
+	   the pool dedicated to panels of B. */ \
+	const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_B_PANEL; \
+\
+	/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
+	   we NEED that last micropanel to have the same ldim (cs_p) as the other
+	   micropanels. Why? Because the microkernel assumes that the register (MR,
+	   NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
+	const dim_t k_pack = k; \
+	const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \
+\
+	/* Barrier to make sure all threads are caught up and ready to begin the
+	   packm stage. */ \
+	bli_thread_barrier( thread ); \
+\
+	/* Compute the size of the memory block needed. */ \
+	siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
+\
+	/* Check the mem_t entry provided by the caller. If it is unallocated,
+	   then we need to acquire a block from the packed block allocator. */ \
+	if ( bli_mem_is_unalloc( mem ) ) \
+	{ \
+		if ( bli_thread_am_ochief( thread ) ) \
+		{ \
+			/* Acquire directly to the chief thread's mem_t that was passed in.
+			   It needs to be that mem_t struct, and not a local (temporary)
+			   mem_t, since there is no barrier until after packing is finished,
+			   which could allow a race condition whereby the chief thread exits
+			   the current function before the other threads have a chance to
+			   copy from it. (A barrier would fix that race condition, but then
+			   again, I prefer to keep barriers to a minimum.) */ \
+			bli_pba_acquire_m \
+			( \
+			  rntm, \
+			  size_needed, \
+			  pack_buf_type, \
+			  mem  \
+			); \
+		} \
+\
+		/* Broadcast the address of the chief thread's passed-in mem_t to all
+		   threads. */ \
+		mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
+\
+		/* Non-chief threads: Copy the contents of the chief thread's
+		   passed-in mem_t to the passed-in mem_t for this thread. (The
+		   chief thread already has the mem_t, so it does not need to
+		   perform any copy.) */ \
+		if ( !bli_thread_am_ochief( thread ) ) \
+		{ \
+			*mem = *mem_p; \
+		} \
+	} \
+	else /* if ( bli_mem_is_alloc( mem ) ) */ \
+	{ \
+		/* If the mem_t entry provided by the caller does NOT contain a NULL
+		   buffer, then a block has already been acquired from the packed
+		   block allocator and cached by the caller. */ \
+\
+		/* As a sanity check, we should make sure that the mem_t object isn't
+		   associated with a block that is too small compared to the size of
+		   the packed matrix buffer that is needed, according to the value
+		   computed above. */ \
+		siz_t mem_size = bli_mem_size( mem ); \
+\
+		if ( mem_size < size_needed ) \
+		{ \
+			if ( bli_thread_am_ochief( thread ) ) \
+			{ \
+				/* The chief thread releases the existing block associated
+				   with the mem_t, and then re-acquires a new block, saving
+				   the associated mem_t to its passed-in mem_t. (See comment
+				   above for why the acquisition needs to be directly to
+				   the chief thread's passed-in mem_t and not a local
+				   (temporary) mem_t.) */ \
+				bli_pba_release \
+				( \
+				  rntm, \
+				  mem \
+				); \
+				bli_pba_acquire_m \
+				( \
+				  rntm, \
+				  size_needed, \
+				  pack_buf_type, \
+				  mem \
+				); \
+			} \
+\
+			/* Broadcast the address of the chief thread's passed-in mem_t
+			   to all threads. */ \
+			mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
+\
+			/* Non-chief threads: Copy the contents of the chief thread's
+			   passed-in mem_t to the passed-in mem_t for this thread. (The
+			   chief thread already has the mem_t, so it does not need to
+			   perform any copy.) */ \
+			if ( !bli_thread_am_ochief( thread ) ) \
+			{ \
+				*mem = *mem_p; \
+			} \
+		} \
+		else \
+		{ \
+			/* If the mem_t entry is already allocated and sufficiently large,
+			   then we use it as-is. No action is needed. */ \
+		} \
+	} \
+}
+
+//INSERT_GENTFUNC_BASIC0( packm_init_mem_b )
+GENTFUNC( float,    s, packm_init_mem_b )
+GENTFUNC( double,   d, packm_init_mem_b )
+GENTFUNC( scomplex, c, packm_init_mem_b )
+GENTFUNC( dcomplex, z, packm_init_mem_b )
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+/* Release the block held by mem; done only by the chief thread, and skipped entirely when thread is NULL. */ \
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       rntm_t* restrict rntm, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ) \
+{ \
+	if ( thread != NULL ) \
+	if ( bli_thread_am_ochief( thread ) ) \
+	{ \
+		/* Check the mem_t entry provided by the caller. Only proceed if it
+		   is allocated, which it should be. */ \
+		if ( bli_mem_is_alloc( mem ) ) \
+		{ \
+			bli_pba_release \
+			( \
+			  rntm, \
+			  mem \
+			); \
+		} \
+	} \
+}
+
+//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_b )
+GENTFUNC( float,    s, packm_finalize_mem_b )
+GENTFUNC( double,   d, packm_finalize_mem_b )
+GENTFUNC( scomplex, c, packm_finalize_mem_b )
+GENTFUNC( dcomplex, z, packm_finalize_mem_b )
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+/* Output the padded dims, strides, panel dim/stride and schema for packed B, and point *p at the buffer in mem. */ \
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       pack_t* restrict schema, \
+       dim_t            k, \
+       dim_t            n, \
+       dim_t            nr, \
+       dim_t*  restrict k_max, \
+       dim_t*  restrict n_max, \
+       ctype**          p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
+                           dim_t* restrict pd_p, inc_t* restrict ps_p, \
+       mem_t*  restrict mem  \
+     ) \
+{ \
+	/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
+	   we NEED that last micropanel to have the same ldim (cs_p) as the other
+	   micropanels. Why? Because the microkernel assumes that the register (MR,
+	   NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
+	*k_max = k; \
+	*n_max = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \
+\
+	/* Determine the dimensions and strides for the packed matrix B. */ \
+	{ \
+		/* Pack B to row-stored column-panels. */ \
+		*rs_p = nr; \
+		*cs_p = 1; \
+\
+		*pd_p = nr; /* panel dimension: each micropanel is nr wide */ \
+		*ps_p = k * nr; /* panel stride: elements from one micropanel to the next */ \
+\
+		/* Set the schema to "packed column panels" to indicate packing to
+		   conventional row-stored column panels. */ \
+		*schema = BLIS_PACKED_COL_PANELS; \
+	} \
+\
+	/* Set the buffer address provided by the caller to point to the memory
+	   associated with the mem_t entry acquired from the memory pool. */ \
+	*p = bli_mem_buffer( mem ); \
+}
+
+//INSERT_GENTFUNC_BASIC0( packm_init_b )
+GENTFUNC( float,    s, packm_init_b )
+GENTFUNC( double,   d, packm_init_b )
+GENTFUNC( scomplex, c, packm_init_b )
+GENTFUNC( dcomplex, z, packm_init_b )
+
+
+//
+// Define the BLAS-like interface that packs matrix B (via packm_var1).
+//
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+/* Pack B: prepare the buffer, compute packing parameters, pack via packm_var1, then barrier. */ \
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       conj_t           conj, \
+       dim_t            k_alloc, /* used only to size the pack buffer */ \
+       dim_t            n_alloc, /* used only to size the pack buffer */ \
+       dim_t            k, \
+       dim_t            n, \
+       dim_t            nr, \
+       ctype*  restrict kappa, \
+       ctype*  restrict d, inc_t           incd, \
+       ctype*  restrict b, inc_t           rs_b, inc_t           cs_b, \
+       ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
+                                                 inc_t* restrict ps_p, \
+       cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ) \
+{ \
+	pack_t schema; \
+	dim_t  k_max; \
+	dim_t  n_max; \
+	dim_t  pd_p; \
+\
+	/* Prepare the packing destination buffer. */ \
+	PASTECH2(bao_,ch,packm_init_mem_b) \
+	( \
+	  k_alloc, n_alloc, nr, \
+	  cntx, \
+	  rntm, \
+	  mem, \
+	  thread  \
+	); \
+\
+	/* Determine the packing buffer and related parameters for matrix B. */ \
+	PASTECH2(bao_,ch,packm_init_b) \
+	( \
+	  &schema, \
+	  k, n, nr, \
+	  &k_max, &n_max, \
+	  p, rs_p,  cs_p, \
+	     &pd_p, ps_p, \
+	  mem  \
+	); \
+\
+	/* Pack matrix B to the destination buffer chosen above. Here, the packed
+	   matrix is stored to row-stored k x NR micropanels. */ \
+	PASTECH2(bao_,ch,packm_var1) \
+	( \
+	  conj, \
+	  schema, \
+	  k, \
+	  n, \
+	  k_max, \
+	  n_max, \
+	  kappa, \
+	  d,  incd, \
+	  b,  rs_b,  cs_b, \
+	  *p, *rs_p, *cs_p, \
+		  pd_p,  *ps_p, \
+	  cntx, \
+	  thread  \
+	); \
+\
+	/* Barrier so that packing is done before computation. */ \
+	bli_thread_barrier( thread ); \
+}
+
+//INSERT_GENTFUNC_BASIC0( packm_b )
+GENTFUNC( float,    s, packm_b )
+GENTFUNC( double,   d, packm_b )
+GENTFUNC( scomplex, c, packm_b )
+GENTFUNC( dcomplex, z, packm_b )
+
diff --git a/addon/gemmd/bao_l3_packm_b.h b/addon/gemmd/bao_l3_packm_b.h
new file mode 100644
index 000000000..9161604ce
--- /dev/null
+++ b/addon/gemmd/bao_l3_packm_b.h
@@ -0,0 +1,123 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares the routine that ensures mem holds a block big enough for a packed k x n panel of B. */ \
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       dim_t            k, \
+       dim_t            n, /* rounded up to a multiple of nr when sizing the block */ \
+       dim_t            nr, \
+       cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ); \
+
+//INSERT_GENTPROT_BASIC0( packm_init_mem_b )
+GENTPROT( float,    s, packm_init_mem_b )
+GENTPROT( double,   d, packm_init_mem_b )
+GENTPROT( scomplex, c, packm_init_mem_b )
+GENTPROT( dcomplex, z, packm_init_mem_b )
+
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares the routine in which the chief thread releases the block held by mem, if allocated. */ \
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       rntm_t* restrict rntm, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ); \
+
+//INSERT_GENTPROT_BASIC0( packm_finalize_mem_b )
+GENTPROT( float,    s, packm_finalize_mem_b )
+GENTPROT( double,   d, packm_finalize_mem_b )
+GENTPROT( scomplex, c, packm_finalize_mem_b )
+GENTPROT( dcomplex, z, packm_finalize_mem_b )
+
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares the routine that outputs the schema, padded dims, strides and buffer address for packed B. */ \
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       pack_t* restrict schema, \
+       dim_t            k, \
+       dim_t            n, \
+       dim_t            nr, \
+       dim_t*  restrict k_max, \
+       dim_t*  restrict n_max, \
+       ctype**          p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
+                           dim_t* restrict pd_p, inc_t* restrict ps_p, \
+       mem_t*  restrict mem  \
+     ); \
+
+//INSERT_GENTPROT_BASIC0( packm_init_b )
+GENTPROT( float,    s, packm_init_b )
+GENTPROT( double,   d, packm_init_b )
+GENTPROT( scomplex, c, packm_init_b )
+GENTPROT( dcomplex, z, packm_init_b )
+
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares bao_?packm_b(): prepares the buffer, computes packing parameters, packs B via packm_var1, then barriers. */ \
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       conj_t           conj, \
+       dim_t            k_alloc, /* forwarded to packm_init_mem_b */ \
+       dim_t            n_alloc, /* forwarded to packm_init_mem_b */ \
+       dim_t            k, /* forwarded to packm_init_b and packm_var1 */ \
+       dim_t            n, /* forwarded to packm_init_b and packm_var1 */ \
+       dim_t            nr, \
+       ctype*  restrict kappa, \
+       ctype*  restrict d, inc_t           incd, \
+       ctype*  restrict b, inc_t           rs_b, inc_t           cs_b, \
+       ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
+                                                 inc_t* restrict ps_p, \
+       cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ); \
+
+//INSERT_GENTPROT_BASIC0( packm_b )
+GENTPROT( float,    s, packm_b )
+GENTPROT( double,   d, packm_b )
+GENTPROT( scomplex, c, packm_b )
+GENTPROT( dcomplex, z, packm_b )
+
diff --git a/addon/gemmd/bao_l3_packm_var.h b/addon/gemmd/bao_l3_packm_var.h
new file mode 100644
index 000000000..063e59e5f
--- /dev/null
+++ b/addon/gemmd/bao_l3_packm_var.h
@@ -0,0 +1,69 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//
+// Prototype BLAS-like interfaces to the variants.
+//
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+/* Common signature for the packm variants; instantiated below for packm_var1 and packm_var2. */ \
+void PASTECH2(bao_,ch,varname) \
+     ( \
+       trans_t          transc, \
+       pack_t           schema, \
+       dim_t            m, \
+       dim_t            n, \
+       dim_t            m_max, \
+       dim_t            n_max, \
+       ctype*  restrict kappa, \
+       ctype*  restrict d, inc_t incd, \
+       ctype*  restrict c, inc_t rs_c, inc_t cs_c, \
+       ctype*  restrict p, inc_t rs_p, inc_t cs_p, \
+                           dim_t pd_p, inc_t ps_p, \
+       cntx_t* restrict cntx, \
+       thrinfo_t* restrict thread  \
+     );
+
+//INSERT_GENTPROT_BASIC0( packm_var1 )
+GENTPROT( float,    s, packm_var1 )
+GENTPROT( double,   d, packm_var1 )
+GENTPROT( scomplex, c, packm_var1 )
+GENTPROT( dcomplex, z, packm_var1 )
+
+//INSERT_GENTPROT_BASIC0( packm_var2 )
+GENTPROT( float,    s, packm_var2 )
+GENTPROT( double,   d, packm_var2 )
+GENTPROT( scomplex, c, packm_var2 )
+GENTPROT( dcomplex, z, packm_var2 )
diff --git a/addon/gemmd/bao_l3_packm_var1.c b/addon/gemmd/bao_l3_packm_var1.c
new file mode 100644
index 000000000..24c0a2cc1
--- /dev/null
+++ b/addon/gemmd/bao_l3_packm_var1.c
@@ -0,0 +1,195 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+//
+// Variant 1 provides basic support for packing by calling packm_cxk().
+//
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+/* Pack the m x n matrix c into micropanels in p, one packm_cxk call per micropanel, split among threads. */ \
+void PASTECH2(bao_,ch,varname) \
+     ( \
+       trans_t          transc, /* only the conjugation bit is extracted and used */ \
+       pack_t           schema, \
+       dim_t            m, \
+       dim_t            n, \
+       dim_t            m_max, \
+       dim_t            n_max, \
+       ctype*  restrict kappa, \
+       ctype*  restrict d, inc_t incd, \
+       ctype*  restrict c, inc_t rs_c, inc_t cs_c, \
+       ctype*  restrict p, inc_t rs_p, inc_t cs_p, \
+                           dim_t pd_p, inc_t ps_p, \
+       cntx_t* restrict cntx, \
+       thrinfo_t* restrict thread  \
+     ) \
+{ \
+	ctype* restrict kappa_cast = kappa; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict p_cast     = p; \
+\
+	dim_t           iter_dim; \
+	dim_t           n_iter; \
+	dim_t           it, ic; \
+	dim_t           ic0; \
+	doff_t          ic_inc; \
+	dim_t           panel_len; \
+	dim_t           panel_len_max; \
+	dim_t           panel_dim; \
+	dim_t           panel_dim_max; \
+	inc_t           incc; \
+	inc_t           ldc; \
+	inc_t           ldp; \
+	conj_t          conjc; \
+\
+\
+	/* Extract the conjugation bit from the transposition argument. */ \
+	conjc = bli_extract_conj( transc ); \
+\
+	/* Create flags to indicate row or column storage. Note that the
+	   schema bit that encodes row or column is describing the form of
+	   micro-panel, not the storage in the micro-panel. Hence the
+	   mismatch in "row" and "column" semantics. */ \
+	bool row_stored = bli_is_col_packed( schema ); \
+	/*bool col_stored = bli_is_row_packed( schema );*/ \
+\
+	/* If the flag indicates row storage, then we are packing to
+	   row-stored column panels; otherwise, we are packing to
+	   column-stored row panels. */ \
+	if ( row_stored ) \
+	{ \
+		/* Prepare to pack to row-stored column panels. */ \
+		iter_dim       = n; \
+		panel_len      = m; \
+		panel_len_max  = m_max; \
+		panel_dim_max  = pd_p; \
+		incc           = cs_c; \
+		ldc            = rs_c; \
+		ldp            = rs_p; \
+	} \
+	else /* if ( col_stored ) */ \
+	{ \
+		/* Prepare to pack to column-stored row panels. */ \
+		iter_dim       = m; \
+		panel_len      = n; \
+		panel_len_max  = n_max; \
+		panel_dim_max  = pd_p; \
+		incc           = rs_c; \
+		ldc            = cs_c; \
+		ldp            = cs_p; \
+	} \
+\
+	/* Compute the total number of iterations we'll need. */ \
+	n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
+\
+	/* Set the initial value and increment for the index into C. (Only
+	   forward iteration is implemented in this variant.) */ \
+	{ \
+		ic0    = 0; \
+		ic_inc = panel_dim_max; \
+	} \
+\
+	ctype* restrict p_begin = p_cast; \
+\
+	/* Query the number of threads and thread ids from the current thread's
+	   packm thrinfo_t node. */ \
+	const dim_t nt  = bli_thread_n_way( thread ); \
+	const dim_t tid = bli_thread_work_id( thread ); \
+\
+	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
+	( void )nt; \
+	( void )tid; \
+\
+	dim_t it_start, it_end, it_inc; \
+\
+	/* Determine the thread range and increment using the current thread's
+	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
+	   will depend on whether slab or round-robin partitioning was requested
+	   at configure-time. */ \
+	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
+\
+	/* Iterate over every logical micropanel in the source matrix. */ \
+	for ( ic  = ic0,    it  = 0; it < n_iter; \
+	      ic += ic_inc, it += 1 ) \
+	{ \
+		panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \
+\
+		ctype* restrict c_begin = c_cast   + (ic  )*incc; \
+\
+		ctype* restrict c_use = c_begin; \
+		ctype* restrict p_use = p_begin; \
+\
+		/* The definition of bli_packm_my_iter() will depend on whether slab
+		   or round-robin partitioning was requested at configure-time. (The
+		   default is slab.) */ \
+		if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
+		{ \
+			PASTECH2(bao_,ch,packm_cxk) \
+			( \
+			  conjc, \
+			  schema, \
+			  panel_dim, \
+			  panel_dim_max, \
+			  panel_len, \
+			  panel_len_max, \
+			  kappa_cast, \
+			  d,     incd, \
+			  c_use, incc, ldc, \
+			  p_use,       ldp, \
+			  cntx  \
+			); \
+		} \
+\
+/*
+if ( !row_stored ) \
+PASTEMAC(ch,fprintm)( stdout, "packm_var1: a packed", panel_dim_max, panel_len_max, \
+                               p_use, rs_p, cs_p, "%5.2f", "" ); \
+else \
+PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_max, \
+                               p_use, rs_p, cs_p, "%5.2f", "" ); \
+*/ \
+\
+		p_begin += ps_p; /* advance on every iteration, packed or not */ \
+	} \
+}
+
+//INSERT_GENTFUNC_BASIC0( packm_var1 )
+GENTFUNC( float,    s, packm_var1 )
+GENTFUNC( double,   d, packm_var1 )
+GENTFUNC( scomplex, c, packm_var1 )
+GENTFUNC( dcomplex, z, packm_var1 )
+
diff --git a/addon/gemmd/bao_l3_packm_var2.c b/addon/gemmd/bao_l3_packm_var2.c
new file mode 100644
index 000000000..830e499b3
--- /dev/null
+++ b/addon/gemmd/bao_l3_packm_var2.c
@@ -0,0 +1,287 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+//
+// Variant 2 is similar to variant 1, but inlines the contents of packm_cxk().
+// As in packm_cxk(), a non-NULL d is applied while packing: element l of d
+// (read with stride incd) is combined with every source element at offset l
+// along the panel length. If d is NULL, the source is copied unscaled.
+//
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTECH2(bao_,ch,varname) \
+     ( \
+       trans_t          transc, \
+       pack_t           schema, \
+       dim_t            m, \
+       dim_t            n, \
+       dim_t            m_max, \
+       dim_t            n_max, \
+       ctype*  restrict kappa, \
+       ctype*  restrict d, inc_t incd, \
+       ctype*  restrict c, inc_t rs_c, inc_t cs_c, \
+       ctype*  restrict p, inc_t rs_p, inc_t cs_p, \
+                           dim_t pd_p, inc_t ps_p, \
+       cntx_t* restrict cntx, \
+       thrinfo_t* restrict thread  \
+     ) \
+{ \
+	ctype* restrict kappa_cast = kappa; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict p_cast     = p; \
+\
+	dim_t           iter_dim; \
+	dim_t           n_iter; \
+	dim_t           it, ic; \
+	dim_t           ic0; \
+	doff_t          ic_inc; \
+	dim_t           panel_len; \
+	dim_t           panel_len_max; \
+	dim_t           panel_dim; \
+	dim_t           panel_dim_max; \
+	inc_t           incc; \
+	inc_t           ldc; \
+	inc_t           ldp; \
+	conj_t          conjc; \
+\
+\
+	/* Extract the conjugation bit from the transposition argument. */ \
+	conjc = bli_extract_conj( transc ); \
+\
+	/* Create flags to indicate row or column storage. Note that the
+	   schema bit that encodes row or column is describing the form of
+	   micro-panel, not the storage in the micro-panel. Hence the
+	   mismatch in "row" and "column" semantics. */ \
+	bool row_stored = bli_is_col_packed( schema ); \
+	/*bool col_stored = bli_is_row_packed( schema );*/ \
+\
+	/* If the row storage flag indicates row storage, then we are packing
+	   to column panels; otherwise, if the strides indicate column storage,
+	   we are packing to row panels. */ \
+	if ( row_stored ) \
+	{ \
+		/* Prepare to pack to row-stored column panels. */ \
+		iter_dim       = n; \
+		panel_len      = m; \
+		panel_len_max  = m_max; \
+		panel_dim_max  = pd_p; \
+		incc           = cs_c; \
+		ldc            = rs_c; \
+		ldp            = rs_p; \
+	} \
+	else /* if ( col_stored ) */ \
+	{ \
+		/* Prepare to pack to column-stored row panels. */ \
+		iter_dim       = m; \
+		panel_len      = n; \
+		panel_len_max  = n_max; \
+		panel_dim_max  = pd_p; \
+		incc           = rs_c; \
+		ldc            = cs_c; \
+		ldp            = cs_p; \
+	} \
+\
+	/* Compute the total number of iterations we'll need. */ \
+	n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
+\
+	/* Set the initial index and increment used to step through C. Only
+	   forward iteration is implemented here. */ \
+	{ \
+		ic0    = 0; \
+		ic_inc = panel_dim_max; \
+	} \
+\
+	ctype* restrict p_begin = p_cast; \
+\
+	/* Query the number of threads and thread ids from the current thread's
+	   packm thrinfo_t node. */ \
+	const dim_t nt  = bli_thread_n_way( thread ); \
+	const dim_t tid = bli_thread_work_id( thread ); \
+\
+	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
+	( void )nt; \
+	( void )tid; \
+\
+	dim_t it_start, it_end, it_inc; \
+\
+	/* Determine the thread range and increment using the current thread's
+	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
+	   will depend on whether slab or round-robin partitioning was requested
+	   at configure-time. */ \
+	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
+\
+	/* Iterate over every logical micropanel in the source matrix. */ \
+	for ( ic  = ic0,    it  = 0; it < n_iter; \
+	      ic += ic_inc, it += 1 ) \
+	{ \
+		panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \
+\
+		ctype* restrict c_begin = c_cast   + (ic  )*incc; \
+\
+		ctype* restrict c_use = c_begin; \
+		ctype* restrict p_use = p_begin; \
+\
+		/* The definition of bli_packm_my_iter() will depend on whether slab
+		   or round-robin partitioning was requested at configure-time. (The
+		   default is slab.) */ \
+		if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
+		{ \
+			/* NOTE: We assume here that kappa = 1 and therefore ignore it. If
+			   we're wrong, this will get someone's attention. */ \
+			if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) \
+				bli_abort(); \
+\
+			if ( d == NULL ) \
+			{ \
+				/* Perform the packing, taking conjc into account. */ \
+				if ( bli_is_conj( conjc ) ) \
+				{ \
+					for ( dim_t l = 0; l < panel_len; ++l ) \
+					{ \
+						for ( dim_t i = 0; i < panel_dim; ++i ) \
+						{ \
+							ctype* cli = c_use + (l  )*ldc + (i  )*incc; \
+							ctype* pli = p_use + (l  )*ldp + (i  )*1; \
+\
+							PASTEMAC(ch,copyjs)( *cli, *pli ); \
+						} \
+					} \
+				} \
+				else \
+				{ \
+					for ( dim_t l = 0; l < panel_len; ++l ) \
+					{ \
+						for ( dim_t i = 0; i < panel_dim; ++i ) \
+						{ \
+							ctype* cli = c_use + (l  )*ldc + (i  )*incc; \
+							ctype* pli = p_use + (l  )*ldp + (i  )*1; \
+\
+							PASTEMAC(ch,copys)( *cli, *pli ); \
+						} \
+					} \
+				} \
+			} \
+			else /* if ( d != NULL ) */ \
+			{ \
+				/* Perform the packing with d applied, taking conjc into
+				   account. This mirrors packm_cxk(). The loop index is named
+				   i (not d) so that it does not shadow the vector d. */ \
+				if ( bli_is_conj( conjc ) ) \
+				{ \
+					for ( dim_t l = 0; l < panel_len; ++l ) \
+					{ \
+						for ( dim_t i = 0; i < panel_dim; ++i ) \
+						{ \
+							ctype* cli = c_use + (l  )*ldc + (i  )*incc; \
+							ctype* dl  = d     + (l  )*incd; \
+							ctype* pli = p_use + (l  )*ldp + (i  )*1; \
+\
+							/* Note that cli must be the second operand here since
+							   that is what is conjugated by scal2js. */ \
+							PASTEMAC(ch,scal2js)( *dl, *cli, *pli ); \
+						} \
+					} \
+				} \
+				else \
+				{ \
+					for ( dim_t l = 0; l < panel_len; ++l ) \
+					{ \
+						for ( dim_t i = 0; i < panel_dim; ++i ) \
+						{ \
+							ctype* cli = c_use + (l  )*ldc + (i  )*incc; \
+							ctype* dl  = d     + (l  )*incd; \
+							ctype* pli = p_use + (l  )*ldp + (i  )*1; \
+\
+							PASTEMAC(ch,scal2s)( *cli, *dl, *pli ); \
+						} \
+					} \
+				} \
+			} \
+\
+			/* If panel_dim < panel_dim_max, then we zero those unused rows. */ \
+			if ( panel_dim < panel_dim_max ) \
+			{ \
+				const dim_t     i      = panel_dim; \
+				const dim_t     m_edge = panel_dim_max - panel_dim; \
+				const dim_t     n_edge = panel_len_max; \
+				ctype* restrict p_edge = p_use + (i  )*1; \
+\
+				PASTEMAC(ch,set0s_mxn) \
+				( \
+				  m_edge, \
+				  n_edge, \
+				  p_edge, 1, ldp  \
+				); \
+			} \
+\
+			/* If panel_len < panel_len_max, then we zero those unused columns. */ \
+			if ( panel_len < panel_len_max ) \
+			{ \
+				const dim_t     j      = panel_len; \
+				const dim_t     m_edge = panel_dim_max; \
+				const dim_t     n_edge = panel_len_max - panel_len; \
+				ctype* restrict p_edge = p_use + (j  )*ldp; \
+\
+				PASTEMAC(ch,set0s_mxn) \
+				( \
+				  m_edge, \
+				  n_edge, \
+				  p_edge, 1, ldp  \
+				); \
+			} \
+		} \
+\
+/*
+if ( !row_stored ) \
+PASTEMAC(ch,fprintm)( stdout, "packm_var2: a packed", panel_dim_max, panel_len_max, \
+                               p_use, rs_p, cs_p, "%5.2f", "" ); \
+else \
+PASTEMAC(ch,fprintm)( stdout, "packm_var2: b packed", panel_len_max, panel_dim_max, \
+                               p_use, rs_p, cs_p, "%5.2f", "" ); \
+*/ \
+\
+		p_begin += ps_p; \
+	} \
+}
+
+//INSERT_GENTFUNC_BASIC0( packm_var2 )
+GENTFUNC( float,    s, packm_var2 )
+GENTFUNC( double,   d, packm_var2 )
+GENTFUNC( scomplex, c, packm_var2 )
+GENTFUNC( dcomplex, z, packm_var2 )
+
diff --git a/addon/gemmd/bao_packm_cxk.c b/addon/gemmd/bao_packm_cxk.c
new file mode 100644
index 000000000..645f09d79
--- /dev/null
+++ b/addon/gemmd/bao_packm_cxk.c
@@ -0,0 +1,199 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+/* Pack one micropanel of a into p, applying d when it is non-NULL. */ \
+void PASTECH2(bao_,ch,opname) \
+     ( \
+       conj_t  conja, \
+       pack_t  schema, \
+       dim_t   panel_dim, \
+       dim_t   panel_dim_max, \
+       dim_t   panel_len, \
+       dim_t   panel_len_max, \
+       ctype*  kappa, \
+       ctype*  d, inc_t incd, \
+       ctype*  a, inc_t inca, inc_t lda, \
+       ctype*  p,             inc_t ldp, \
+       cntx_t* cntx  \
+     ) \
+{ \
+	/* Note that we use panel_dim_max, not panel_dim, to query the packm
+	   kernel function pointer. This means that we always use the same
+	   kernel, even for edge cases. */ \
+	num_t     dt     = PASTEMAC(ch,type); \
+	l1mkr_t   ker_id = panel_dim_max; \
+\
+	PASTECH2(ch,opname,_ker_ft) f; \
+\
+	/* Query the context for the packm kernel corresponding to the current
+	   panel dimension, or kernel id. If the id is invalid, the function will
+	   return NULL. */ \
+	f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
+\
+	/* If there exists a kernel implementation for the micro-panel dimension
+	   provided, we invoke the implementation. Otherwise, we pack inline. */ \
+	/* NOTE: We've disabled calling packm micro-kernels from the context for
+	   this implementation, so the inline path below is always taken. To
+	   re-enable, change FALSE to TRUE in the conditional below. */ \
+	if ( f != NULL && FALSE ) \
+	{ \
+		f \
+		( \
+		  conja, \
+		  schema, \
+		  panel_dim, \
+		  panel_len, \
+		  panel_len_max, \
+		  kappa, \
+		  a, inca, lda, \
+		  p,       ldp, \
+		  cntx  \
+		); \
+	} \
+	else \
+	{ \
+		/* NOTE: We assume here that kappa = 1 and therefore ignore it. If
+		   we're wrong, this will get someone's attention. */ \
+		if ( !PASTEMAC(ch,eq1)( *kappa ) ) \
+			bli_abort(); \
+\
+		if ( d == NULL ) \
+		{ \
+			/* No d given: copy a into p, taking conja into account. */ \
+			if ( bli_is_conj( conja ) ) \
+			{ \
+				for ( dim_t l = 0; l < panel_len; ++l ) \
+				{ \
+					for ( dim_t i = 0; i < panel_dim; ++i ) \
+					{ \
+						ctype* ali = a + (l  )*lda + (i  )*inca; \
+						ctype* pli = p + (l  )*ldp + (i  )*1; \
+\
+						PASTEMAC(ch,copyjs)( *ali, *pli ); \
+					} \
+				} \
+			} \
+			else \
+			{ \
+				for ( dim_t l = 0; l < panel_len; ++l ) \
+				{ \
+					for ( dim_t i = 0; i < panel_dim; ++i ) \
+					{ \
+						ctype* ali = a + (l  )*lda + (i  )*inca; \
+						ctype* pli = p + (l  )*ldp + (i  )*1; \
+\
+						PASTEMAC(ch,copys)( *ali, *pli ); \
+					} \
+				} \
+			} \
+		} \
+		else /* if ( d != NULL ) */ \
+		{ \
+			/* Pack with element dl of d applied via scal2[j]s, taking conja into account. */ \
+			if ( bli_is_conj( conja ) ) \
+			{ \
+				for ( dim_t l = 0; l < panel_len; ++l ) \
+				{ \
+					for ( dim_t i = 0; i < panel_dim; ++i ) \
+					{ \
+						ctype* ali = a + (l  )*lda + (i  )*inca; \
+						ctype* dl  = d + (l  )*incd; \
+						ctype* pli = p + (l  )*ldp + (i  )*1; \
+\
+						/* Note that ali must be the second operand here since
+						   that is what is conjugated by scal2js. */ \
+						PASTEMAC(ch,scal2js)( *dl, *ali, *pli ); \
+					} \
+				} \
+			} \
+			else \
+			{ \
+				for ( dim_t l = 0; l < panel_len; ++l ) \
+				{ \
+					for ( dim_t i = 0; i < panel_dim; ++i ) \
+					{ \
+						ctype* ali = a + (l  )*lda + (i  )*inca; \
+						ctype* dl  = d + (l  )*incd; \
+						ctype* pli = p + (l  )*ldp + (i  )*1; \
+\
+						PASTEMAC(ch,scal2s)( *ali, *dl, *pli ); \
+					} \
+				} \
+			} \
+		} \
+\
+		/* If panel_dim < panel_dim_max, then we zero those unused rows. */ \
+		if ( panel_dim < panel_dim_max ) \
+		{ \
+			const dim_t     i      = panel_dim; \
+			const dim_t     m_edge = panel_dim_max - panel_dim; \
+			const dim_t     n_edge = panel_len_max; \
+			ctype* restrict p_edge = p + (i  )*1; \
+\
+			PASTEMAC(ch,set0s_mxn) \
+			( \
+			  m_edge, \
+			  n_edge, \
+			  p_edge, 1, ldp  \
+			); \
+		} \
+\
+		/* If panel_len < panel_len_max, then we zero those unused columns. */ \
+		if ( panel_len < panel_len_max ) \
+		{ \
+			const dim_t     j      = panel_len; \
+			const dim_t     m_edge = panel_dim_max; \
+			const dim_t     n_edge = panel_len_max - panel_len; \
+			ctype* restrict p_edge = p + (j  )*ldp; \
+\
+			PASTEMAC(ch,set0s_mxn) \
+			( \
+			  m_edge, \
+			  n_edge, \
+			  p_edge, 1, ldp  \
+			); \
+		} \
+	} \
+}
+
+//INSERT_GENTFUNC_BASIC0( packm_cxk )
+GENTFUNC( float,    s, packm_cxk )
+GENTFUNC( double,   d, packm_cxk )
+GENTFUNC( scomplex, c, packm_cxk )
+GENTFUNC( dcomplex, z, packm_cxk )
+
diff --git a/addon/gemmd/bao_packm_cxk.h b/addon/gemmd/bao_packm_cxk.h
new file mode 100644
index 000000000..3e977a7cc
--- /dev/null
+++ b/addon/gemmd/bao_packm_cxk.h
@@ -0,0 +1,59 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+/* Prototype for bao_?packm_cxk(); instantiated below for s, d, c, and z. */ \
+void PASTECH2(bao_,ch,varname) \
+     ( \
+       conj_t  conja, \
+       pack_t  schema, \
+       dim_t   panel_dim, \
+       dim_t   panel_dim_max, \
+       dim_t   panel_len, \
+       dim_t   panel_len_max, \
+       ctype*  kappa, \
+       ctype*  d, inc_t incd, \
+       ctype*  a, inc_t inca, inc_t lda, \
+       ctype*  p,             inc_t ldp, \
+       cntx_t* cntx  \
+     );
+
+//INSERT_GENTPROT_BASIC0( packm_cxk )
+GENTPROT( float,    s, packm_cxk )
+GENTPROT( double,   d, packm_cxk )
+GENTPROT( scomplex, c, packm_cxk )
+GENTPROT( dcomplex, z, packm_cxk )
+
diff --git a/addon/gemmd/gemmd.h b/addon/gemmd/gemmd.h
new file mode 100644
index 000000000..cab61bd18
--- /dev/null
+++ b/addon/gemmd/gemmd.h
@@ -0,0 +1,54 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef GEMMD_H
+#define GEMMD_H
+
+// This header should contain (or #include) any definitions that must be
+// folded into blis.h.
+
+#include "bao_gemmd.h"
+#include "bao_gemmd_check.h"
+#include "bao_gemmd_var.h"
+
+#include "bao_l3_packm_a.h"
+#include "bao_l3_packm_b.h"
+#include "bao_l3_packm_var.h"
+
+#include "bao_packm_cxk.h"
+
+#include "bao_l3_decor.h"
+
+
+#endif
diff --git a/addon/gemmd/thread/bao_l3_decor.h b/addon/gemmd/thread/bao_l3_decor.h
new file mode 100644
index 000000000..b4fd2b9b7
--- /dev/null
+++ b/addon/gemmd/thread/bao_l3_decor.h
@@ -0,0 +1,75 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_SBX_L3_DECOR_H
+#define BLIS_SBX_L3_DECOR_H
+
+// -- gemmd addon thread decorator definitions ---------------------------------
+
+// Type of the internal function invoked by the decorator (note operand d).
+typedef void (*l3sbxint_t)
+     (
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     d,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm,
+       thrinfo_t* thread
+     );
+
+// Thread decorator prototype for the gemmd addon.
+void bao_l3_thread_decorator
+     (
+       l3sbxint_t func,
+       opid_t     family,
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     d,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm
+     );
+
+// Include definitions specific to the method of multithreading.
+#include "bao_l3_decor_single.h"
+#include "bao_l3_decor_openmp.h"
+#include "bao_l3_decor_pthreads.h"
+
+#endif
+
diff --git a/addon/gemmd/thread/bao_l3_decor_openmp.c b/addon/gemmd/thread/bao_l3_decor_openmp.c
new file mode 100644
index 000000000..1aca8de27
--- /dev/null
+++ b/addon/gemmd/thread/bao_l3_decor_openmp.c
@@ -0,0 +1,140 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_OPENMP
+
+// Define a dummy thread entry function, which is needed in the pthreads
+// version, so that when building Windows DLLs (with OpenMP enabled or with
+// no multithreading) we don't risk having an unresolved symbol.
+void* bao_l3_thread_entry( void* data_void ) { return NULL; }
+
+//#define PRINT_THRINFO
+
+void bao_l3_thread_decorator
+     (
+       l3sbxint_t func,
+       opid_t     family,
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     d,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm
+     )
+{
+	// Query the total number of threads from the rntm_t object.
+	const dim_t n_threads = bli_rntm_num_threads( rntm );
+
+	// NOTE: The sba was initialized in bli_init().
+
+	// Check out an array_t from the small block allocator. This is done
+	// with an internal lock to ensure only one application thread accesses
+	// the sba at a time. bli_sba_checkout_array() will also automatically
+	// resize the array_t, if necessary.
+	array_t* restrict array = bli_sba_checkout_array( n_threads );
+
+	// Access the pool_t* for thread 0 and embed it into the rntm. We do
+	// this up-front only so that we have the rntm_t.sba_pool field
+	// initialized and ready for the global communicator creation below.
+	bli_sba_rntm_set_pool( 0, array, rntm );
+
+	// Set the packing block allocator field of the rntm. This will be
+	// inherited by all of the child threads when they make local copies of
+	// the rntm below.
+	bli_pba_rntm_set_pba( rntm );
+
+	// Allocate a global communicator for the root thrinfo_t structures.
+	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
+
+
+	_Pragma( "omp parallel num_threads(n_threads)" )
+	{
+		// Create a thread-local copy of the master thread's rntm_t. This is
+		// necessary since we want each thread to be able to track its own
+		// small block pool_t as it executes down the function stack.
+		rntm_t           rntm_l = *rntm;
+		rntm_t* restrict rntm_p = &rntm_l;
+
+		// Query the thread's id from OpenMP.
+		const dim_t tid = omp_get_thread_num();
+
+		// Check for a somewhat obscure OpenMP thread-mismatch issue.
+		// NOTE: This calls the same function used for the conventional/large
+		// code path.
+		bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
+
+		// Use the thread id to access the appropriate pool_t* within the
+		// array_t, and use it to set the sba_pool field within the rntm_t.
+		// If the pool_t* element within the array_t is NULL, it will first
+		// be allocated/initialized.
+		bli_sba_rntm_set_pool( tid, array, rntm_p );
+
+		thrinfo_t* thread = NULL;
+
+		// Create the root node of the thread's thrinfo_t structure.
+		bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
+
+		func
+		(
+		  alpha,
+		  a,
+		  d,
+		  b,
+		  beta,
+		  c,
+		  cntx,
+		  rntm_p,
+		  thread
+		);
+
+		// Free the current thread's thrinfo_t structure.
+		bli_l3_sup_thrinfo_free( rntm_p, thread );
+	}
+
+	// We shouldn't free the global communicator here; presumably it was
+	// already freed by the communicator's chief thread inside
+	// bli_l3_sup_thrinfo_free() above (NOTE(review): confirm).
+
+	// Check the array_t back into the small block allocator. Similar to the
+	// check-out, this is done using a lock embedded within the sba to ensure
+	// mutual exclusion.
+	bli_sba_checkin_array( array );
+}
+
+#endif
+
diff --git a/addon/gemmd/thread/bao_l3_decor_openmp.h b/addon/gemmd/thread/bao_l3_decor_openmp.h
new file mode 100644
index 000000000..9c956d7c3
--- /dev/null
+++ b/addon/gemmd/thread/bao_l3_decor_openmp.h
@@ -0,0 +1,44 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_SBX_L3_DECOR_OPENMP_H
+#define BLIS_SBX_L3_DECOR_OPENMP_H
+
+// Definitions specific to situations when OpenMP multithreading is enabled.
+#ifdef BLIS_ENABLE_OPENMP
+
+#endif
+
+#endif
+
diff --git a/addon/gemmd/thread/bao_l3_decor_pthreads.c b/addon/gemmd/thread/bao_l3_decor_pthreads.c
new file mode 100644
index 000000000..587b8400f
--- /dev/null
+++ b/addon/gemmd/thread/bao_l3_decor_pthreads.c
@@ -0,0 +1,220 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_PTHREADS
+
+// Per-thread argument bundle passed to bao_l3_thread_entry() via its void* parameter.
+typedef struct thread_data
+{
+	l3sbxint_t func;      // function each thread invokes on the operands below
+	opid_t     family;    // operation family id; carried along but unused by the entry point
+	obj_t*     alpha;     // operand forwarded unchanged to func
+	obj_t*     a;         // operand forwarded unchanged to func
+	obj_t*     d;         // operand forwarded unchanged to func
+	obj_t*     b;         // operand forwarded unchanged to func
+	obj_t*     beta;      // operand forwarded unchanged to func
+	obj_t*     c;         // operand forwarded unchanged to func
+	cntx_t*    cntx;      // context forwarded unchanged to func
+	rntm_t*    rntm;      // master thread's rntm_t; each thread makes a local copy of it
+	dim_t      tid;       // id of the thread that owns this bundle (0 is the chief)
+	thrcomm_t* gl_comm;   // global communicator used to create the root thrinfo_t
+	array_t*   array;     // array_t checked out from the sba; indexed by tid for the pool_t
+} thread_data_t;
+
+// Thread entry point for the pthreads decorator. Every thread, including the
+// chief thread (which calls this function directly rather than through
+// bli_pthread_create()), receives a pointer to its own thread_data_t, runs
+// the decorated function with thread-local runtime state, and returns NULL.
+void* bao_l3_thread_entry( void* data_void )
+{
+	// Recover the typed argument bundle prepared by the thread decorator.
+	thread_data_t* args = data_void;
+
+	// Pull out the per-thread values needed to set up local state.
+	const dim_t    my_id = args->tid;
+	thrcomm_t*     comm  = args->gl_comm;
+	array_t*       pools = args->array;
+
+	// The operation family is carried in the bundle but is not needed here.
+	( void )args->family;
+
+	// Give this thread a private copy of the master thread's rntm_t so that
+	// it can track its own small block pool_t while executing down the
+	// function stack.
+	rntm_t           my_rntm  = *( args->rntm );
+	rntm_t* restrict rntm_loc = &my_rntm;
+
+	// Use the thread id to select this thread's pool_t* within the array_t
+	// and store it in the sba_pool field of the local rntm_t. If that
+	// pool_t* element is still NULL, it will first be
+	// allocated/initialized.
+	bli_sba_rntm_set_pool( my_id, pools, rntm_loc );
+
+	// Build the root node of this thread's thrinfo_t structure.
+	thrinfo_t* root = NULL;
+	bli_l3_sup_thrinfo_create_root( my_id, comm, rntm_loc, &root );
+
+	// Invoke the decorated function on the shared operands and context,
+	// substituting the thread-local rntm_t and this thread's thrinfo_t for
+	// the final two arguments.
+	args->func
+	(
+	  args->alpha,
+	  args->a,
+	  args->d,
+	  args->b,
+	  args->beta,
+	  args->c,
+	  args->cntx,
+	  rntm_loc,
+	  root
+	);
+
+	// Release this thread's thrinfo_t structure.
+	bli_l3_sup_thrinfo_free( rntm_loc, root );
+
+	// Nothing is reported back through the return value; the decorator
+	// joins its threads with a NULL result pointer.
+	return NULL;
+}
+
+// Pthreads thread decorator: spawns n_threads - 1 threads (the caller acts as
+// thread 0), runs func on the given operands in each, then joins them all.
+void bao_l3_thread_decorator
+     (
+       l3sbxint_t func,
+       opid_t     family,
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     d,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm
+     )
+{
+	err_t r_val;
+
+	// Query the total number of threads from the rntm_t object.
+	const dim_t n_threads = bli_rntm_num_threads( rntm );
+
+	// NOTE: The sba was initialized in bli_init().
+
+	// Check out an array_t from the small block allocator. This is done
+	// with an internal lock to ensure only one application thread accesses
+	// the sba at a time. bli_sba_checkout_array() will also automatically
+	// resize the array_t, if necessary.
+	array_t* restrict array = bli_sba_checkout_array( n_threads );
+
+	// Access the pool_t* for thread 0 and embed it into the rntm. We do
+	// this up-front only so that we have the rntm_t.sba_pool field
+	// initialized and ready for the global communicator creation below.
+	bli_sba_rntm_set_pool( 0, array, rntm );
+
+	// Set the packing block allocator field of the rntm. This will be
+	// inherited by all of the child threads when they make local copies of
+	// the rntm below.
+	bli_pba_rntm_set_pba( rntm );
+
+	// Allocate a global communicator for the root thrinfo_t structures.
+	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
+
+	// Allocate the pthread objects and the per-thread data for the entry function.
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bao_l3_thread_decorator().pth: " );
+	#endif
+	bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val );
+
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bao_l3_thread_decorator().pth: " );
+	#endif
+	thread_data_t* datas    = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val );
+
+	// NOTE: We must iterate backwards so that the chief thread (thread id 0)
+	// can spawn all other threads before proceeding with its own computation.
+	for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
+	{
+		// Set up the thread data for this thread id (thread 0 included).
+		datas[tid].func     = func;
+		datas[tid].family   = family;
+		datas[tid].alpha    = alpha;
+		datas[tid].a        = a;
+		datas[tid].d        = d;
+		datas[tid].b        = b;
+		datas[tid].beta     = beta;
+		datas[tid].c        = c;
+		datas[tid].cntx     = cntx;
+		datas[tid].rntm     = rntm;
+		datas[tid].tid      = tid;
+		datas[tid].gl_comm  = gl_comm;
+		datas[tid].array    = array;
+
+		// Spawn a new thread for every nonzero id; thread 0 runs the entry inline.
+		if ( tid != 0 )
+			bli_pthread_create( &pthreads[tid], NULL, &bao_l3_thread_entry, &datas[tid] );
+		else
+			bao_l3_thread_entry( ( void* )(&datas[0]) );
+	}
+
+	// We shouldn't free the global communicator since it was already freed
+	// by the global communicator's chief thread in bli_l3_sup_thrinfo_free()
+	// (called from the thread entry function).
+
+	// Thread 0 waits for additional threads to finish.
+	for ( dim_t tid = 1; tid < n_threads; tid++ )
+	{
+		bli_pthread_join( pthreads[tid], NULL );
+	}
+
+	// Check the array_t back into the small block allocator. Similar to the
+	// check-out, this is done using a lock embedded within the sba to ensure
+	// mutual exclusion.
+	bli_sba_checkin_array( array );
+
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bao_l3_thread_decorator().pth: " );
+	#endif
+	bli_free_intl( pthreads );
+
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bao_l3_thread_decorator().pth: " );
+	#endif
+	bli_free_intl( datas );
+}
+
+#endif
+
diff --git a/addon/gemmd/thread/bao_l3_decor_pthreads.h b/addon/gemmd/thread/bao_l3_decor_pthreads.h
new file mode 100644
index 000000000..69adec45e
--- /dev/null
+++ b/addon/gemmd/thread/bao_l3_decor_pthreads.h
@@ -0,0 +1,47 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_BAO_L3_DECOR_PTHREADS_H
+#define BLIS_BAO_L3_DECOR_PTHREADS_H
+
+// Definitions specific to situations when POSIX multithreading is enabled.
+#ifdef BLIS_ENABLE_PTHREADS
+
+// Prototype of the entry point executed by each thread of the pthreads decorator.
+void* bao_l3_thread_entry( void* data_void );
+
+#endif
+
+#endif
+
diff --git a/addon/gemmd/thread/bao_l3_decor_single.c b/addon/gemmd/thread/bao_l3_decor_single.c
new file mode 100644
index 000000000..d60891d65
--- /dev/null
+++ b/addon/gemmd/thread/bao_l3_decor_single.c
@@ -0,0 +1,143 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifndef BLIS_ENABLE_MULTITHREADING
+
+#define SKIP_THRINFO_TREE
+
+void bao_l3_thread_decorator
+     (
+       l3sbxint_t func,
+       opid_t     family,   // not referenced in this single-threaded variant
+       //pack_t     schema_a,
+       //pack_t     schema_b,
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     d,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm
+     )
+{
+	// Sequential decorator: func is run exactly once, on the calling thread.
+	const dim_t n_threads = 1;
+
+	// NOTE: The sba was initialized in bli_init().
+
+	// Check out an array_t from the small block allocator. This is done
+	// with an internal lock to ensure only one application thread accesses
+	// the sba at a time. bli_sba_checkout_array() will also automatically
+	// resize the array_t, if necessary.
+	array_t* restrict array = bli_sba_checkout_array( n_threads );
+
+	// Access the pool_t* for thread 0 and embed it into the rntm.
+	bli_sba_rntm_set_pool( 0, array, rntm );
+
+	// Set the packing block allocator field of the rntm.
+	bli_pba_rntm_set_pba( rntm );
+
+#ifndef SKIP_THRINFO_TREE
+	// Allocate a global communicator for the root thrinfo_t structures.
+	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
+#endif
+
+
+	{
+		// NOTE: We don't need to create another copy of the rntm_t since
+		// it was already copied in one of the high-level oapi functions.
+		rntm_t* restrict rntm_p = rntm;
+
+		// There is only one thread id (for the chief thread).
+		const dim_t tid = 0;
+
+		// Use the thread id to access the appropriate pool_t* within the
+		// array_t, and use it to set the sba_pool field within the rntm_t.
+		// If the pool_t* element within the array_t is NULL, it will first
+		// be allocated/initialized.
+		// NOTE: This is commented out because, in the single-threaded case,
+		// this is redundant since it's already been done above.
+		//bli_sba_rntm_set_pool( tid, array, rntm_p );
+
+#ifndef SKIP_THRINFO_TREE
+		thrinfo_t* thread = NULL;
+
+		// Create the root node of the thread's thrinfo_t structure.
+		bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
+#else
+		// This optimization allows us to use one of the global thrinfo_t
+		// objects for single-threaded execution rather than grow one from
+		// scratch. The key is that bli_thrinfo_sup_grow(), which is called
+		// from within the variants, will immediately return if it detects
+		// that the thrinfo_t* passed into it is either
+		// &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED.
+		thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED;
+
+		( void )tid;
+#endif
+
+		func
+		(
+		  alpha,
+		  a,
+		  d,
+		  b,
+		  beta,
+		  c,
+		  cntx,
+		  rntm_p,
+		  thread
+		);
+
+#ifndef SKIP_THRINFO_TREE
+		// Free the current thread's thrinfo_t structure.
+		bli_l3_sup_thrinfo_free( rntm_p, thread );
+#endif
+	}
+
+	// NOTE: No global communicator is freed here. When SKIP_THRINFO_TREE is
+	// defined none is created; otherwise it is presumably released by the
+	// bli_l3_sup_thrinfo_free() call above -- TODO confirm.
+
+	// Check the array_t back into the small block allocator. Similar to the
+	// check-out, this is done using a lock embedded within the sba to ensure
+	// mutual exclusion.
+	bli_sba_checkin_array( array );
+}
+
+#endif
+
diff --git a/addon/gemmd/thread/bao_l3_decor_single.h b/addon/gemmd/thread/bao_l3_decor_single.h
new file mode 100644
index 000000000..211a43a89
--- /dev/null
+++ b/addon/gemmd/thread/bao_l3_decor_single.h
@@ -0,0 +1,44 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_BAO_L3_DECOR_SINGLE_H
+#define BLIS_BAO_L3_DECOR_SINGLE_H
+
+// Definitions specific to situations when multithreading is disabled.
+#ifndef BLIS_ENABLE_MULTITHREADING
+
+#endif
+
+#endif
+
diff --git a/build/bli_addon.h.in b/build/bli_addon.h.in
new file mode 100644
index 000000000..36a8e29bd
--- /dev/null
+++ b/build/bli_addon.h.in
@@ -0,0 +1,47 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_ADDON_H
+#define BLIS_ADDON_H
+
+#if @enable_addons@
+#define BLIS_ENABLE_ADDONS
+#else
+#define BLIS_DISABLE_ADDONS
+#endif
+
+// Enabled addons
+@addon_list_includes@
+
+#endif
diff --git a/build/config.mk.in b/build/config.mk.in
index 7533d1acb..63cd53e28 100644
--- a/build/config.mk.in
+++ b/build/config.mk.in
@@ -185,6 +185,10 @@ MK_ENABLE_CBLAS   := @enable_cblas@
 # Whether libblis will depend on libmemkind for certain memory allocations.
 MK_ENABLE_MEMKIND := @enable_memkind@
 
+# The names of the addons to include when building BLIS. If empty, no addons
+# will be included.
+ADDON_LIST        := @addon_list@
+
 # The name of a sandbox defining an alternative gemm implementation. If empty,
 # no sandbox will be used and the conventional gemm implementation will remain
 # enabled.
diff --git a/common.mk b/common.mk
index 90c3da83f..5f2d30c9b 100644
--- a/common.mk
+++ b/common.mk
@@ -152,18 +152,35 @@ get-kernel-cflags-for    = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
 
 # When compiling sandboxes, we use flags similar to those of general framework
 # source. This ensures that the same code can be linked and run across various
-# sub-configurations. (If we switch to using refkern/kernel flags, we should
-# prevent enabling sandboxes for umbrella families by verifying that
-# config_list == config_name if --enable-sandbox is given.)
+# sub-configurations.
+get-addon-c99flags-for   = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
+                                   $(call get-noopt-cflags-for,$(1)) \
+                                   $(CADDONINCFLAGS) \
+                                   $(BUILD_CPPFLAGS) \
+                                   $(BUILD_SYMFLAGS) \
+                            )
+get-addon-cxxflags-for   = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
+                                   $(call get-noopt-cxxflags-for,$(1)) \
+                                   $(CADDONINCFLAGS) \
+                                   $(BUILD_CPPFLAGS) \
+                                   $(BUILD_SYMFLAGS) \
+                            )
+
+# When compiling sandboxes, we use flags similar to those of general framework
+# source. This ensures that the same code can be linked and run across various
+# sub-configurations. (NOTE: If we ever switch to using refkernel or kernel
+# flags, we should prevent enabling sandboxes for umbrella families by verifying
+# that config_list == config_name if --enable-sandbox is given. THIS ALSO
+# APPLIES TO ADDONS ABOVE.)
 get-sandbox-c99flags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
                                    $(call get-noopt-cflags-for,$(1)) \
-                                   $(CSBOXINCFLAGS) \
+                                   $(CSANDINCFLAGS) \
                                    $(BUILD_CPPFLAGS) \
                                    $(BUILD_SYMFLAGS) \
                             )
 get-sandbox-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
                                    $(call get-noopt-cxxflags-for,$(1)) \
-                                   $(CSBOXINCFLAGS) \
+                                   $(CSANDINCFLAGS) \
                                    $(BUILD_CPPFLAGS) \
                                    $(BUILD_SYMFLAGS) \
                             )
@@ -188,6 +205,8 @@ get-refkern-text-for    = "('$(1)' CFLAGS for ref. kernels)"
 get-config-text-for     = "('$(1)' CFLAGS for config code)"
 get-frame-text-for      = "('$(1)' CFLAGS for framework code)"
 get-kernel-text-for     = "('$(1)' CFLAGS for kernels)"
+get-addon-c99text-for   = "('$(1)' CFLAGS for addons)"
+get-addon-cxxtext-for   = "('$(1)' CXXFLAGS for addons)"
 get-sandbox-c99text-for = "('$(1)' CFLAGS for sandboxes)"
 get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)"
 
@@ -202,6 +221,10 @@ get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)"
 files-that-contain      = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),$(f),)))
 files-that-dont-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),,$(f))))
 
+# Define a function that removes duplicate strings *without* using the sort
+# function.
+rm-dups = $(if $1,$(firstword $1) $(call rm-dups,$(filter-out $(firstword $1),$1)))
+
 
 #
 # --- Include makefile configuration file --------------------------------------
@@ -286,6 +309,7 @@ CONFIG_DIR         := config
 FRAME_DIR          := frame
 REFKERN_DIR        := ref_kernels
 KERNELS_DIR        := kernels
+ADDON_DIR          := addon
 SANDBOX_DIR        := sandbox
 OBJ_DIR            := obj
 LIB_DIR            := lib
@@ -302,11 +326,13 @@ REFNM              := ref
 
 # Source suffixes.
 CONFIG_SRC_SUFS    := c
-
 KERNELS_SRC_SUFS   := c s S
-
 FRAME_SRC_SUFS     := c
 
+ADDON_C99_SUFS     := c
+ADDON_CXX_SUFS     := cc cpp cxx
+ADDON_SRC_SUFS     := $(ADDON_C99_SUFS) $(ADDON_CXX_SUFS)
+
 SANDBOX_C99_SUFS   := c
 SANDBOX_CXX_SUFS   := cc cpp cxx
 SANDBOX_SRC_SUFS   := $(SANDBOX_C99_SUFS) $(SANDBOX_CXX_SUFS)
@@ -314,15 +340,21 @@ SANDBOX_SRC_SUFS   := $(SANDBOX_C99_SUFS) $(SANDBOX_CXX_SUFS)
 # Header suffixes.
 FRAME_HDR_SUFS     := h
 
+ADDON_H99_SUFS     := h
+ADDON_HXX_SUFS     := hh hpp hxx
+ADDON_HDR_SUFS     := $(ADDON_H99_SUFS) $(ADDON_HXX_SUFS)
+
 SANDBOX_H99_SUFS   := h
 SANDBOX_HXX_SUFS   := hh hpp hxx
 SANDBOX_HDR_SUFS   := $(SANDBOX_H99_SUFS) $(SANDBOX_HXX_SUFS)
 
 # Combine all header suffixes and remove duplicates via sort().
 ALL_HDR_SUFS       := $(sort $(FRAME_HDR_SUFS) \
+                             $(ADDON_HDR_SUFS) \
                              $(SANDBOX_HDR_SUFS) )
 
 ALL_H99_SUFS       := $(sort $(FRAME_HDR_SUFS) \
+                             $(ADDON_H99_SUFS) \
                              $(SANDBOX_H99_SUFS) )
 
 # The names of scripts that check output from the BLAS test drivers and
@@ -349,11 +381,13 @@ SHELL              := bash
 
 # Construct paths to the four primary directories of source code:
 # the config directory, general framework code, reference kernel code,
-# and optimized kernel code.
+# and optimized kernel code. Also process paths for addon and sandbox
+# directories.
 CONFIG_PATH        := $(DIST_PATH)/$(CONFIG_DIR)
 FRAME_PATH         := $(DIST_PATH)/$(FRAME_DIR)
 REFKERN_PATH       := $(DIST_PATH)/$(REFKERN_DIR)
 KERNELS_PATH       := $(DIST_PATH)/$(KERNELS_DIR)
+ADDON_PATH         := $(DIST_PATH)/$(ADDON_DIR)
 SANDBOX_PATH       := $(DIST_PATH)/$(SANDBOX_DIR)
 
 # Construct paths to some optional C++ template headers contributed by AMD.
@@ -367,6 +401,7 @@ CONFIG_FRAG_PATH   := ./obj/$(CONFIG_NAME)/$(CONFIG_DIR)
 FRAME_FRAG_PATH    := ./obj/$(CONFIG_NAME)/$(FRAME_DIR)
 REFKERN_FRAG_PATH  := ./obj/$(CONFIG_NAME)/$(REFKERN_DIR)
 KERNELS_FRAG_PATH  := ./obj/$(CONFIG_NAME)/$(KERNELS_DIR)
+ADDON_FRAG_PATH    := ./obj/$(CONFIG_NAME)/$(ADDON_DIR)
 SANDBOX_FRAG_PATH  := ./obj/$(CONFIG_NAME)/$(SANDBOX_DIR)
 
 
@@ -855,6 +890,7 @@ MK_CONFIG_SRC      :=
 MK_KERNELS_SRC     :=
 MK_REFKERN_SRC     :=
 MK_FRAME_SRC       :=
+MK_ADDON_SRC       :=
 MK_SANDBOX_SRC     :=
 
 # -- config --
@@ -905,6 +941,24 @@ PARENT_PATH        := $(OBJ_DIR)/$(CONFIG_NAME)
 -include $(addsuffix /$(FRAGMENT_MK), $(REFKERN_FRAG_PATH))
 -include $(addsuffix /$(FRAGMENT_MK), $(FRAME_FRAG_PATH))
 
+# -- addon --
+
+# Construct paths to each addon.
+# NOTE: If $(ADDON_LIST) is empty (because no addon was enabled at configure-
+# time) then $(ADDON_PATHS) will also be empty, which will cause no fragments
+# to be included.
+ADDON_PATHS        := $(addprefix $(ADDON_FRAG_PATH)/, $(ADDON_LIST))
+
+# This variable is used by the include statements as they recursively include
+# one another. For the 'addon' directory, we initialize it to that directory
+# in preparation to include the fragments in each addon sub-directory.
+PARENT_SRC_PATH    := $(ADDON_PATH)
+PARENT_PATH        := $(ADDON_FRAG_PATH)
+
+# Recursively include the makefile fragments in each of the addon sub-
+# directories.
+-include $(addsuffix /$(FRAGMENT_MK), $(ADDON_PATHS))
+
 # -- sandbox --
 
 # Construct paths to each sandbox. (At present, there can be only one.)
@@ -922,6 +976,8 @@ PARENT_PATH        := $(SANDBOX_FRAG_PATH)
 # Recursively include the makefile fragments in the sandbox sub-directory.
 -include $(addsuffix /$(FRAGMENT_MK), $(SANDBOX_PATHS))
 
+# -- post-processing --
+
 # Create a list of the makefile fragments using the variable into which each
 # of the above include statements accumulated their directory paths.
 MAKEFILE_FRAGMENTS := $(addsuffix /$(FRAGMENT_MK), $(FRAGMENT_DIR_PATHS))
@@ -940,14 +996,14 @@ endif
 #
 
 # Define a function that will expand all of the directory paths given in $(1)
-# to actual filepaths using the list of suffixes provided $(2).
+# to actual filepaths using the list of suffixes provided in $(2).
 get-filepaths = $(strip $(foreach path, $(1), \
                             $(foreach suf, $(2), \
                                 $(wildcard $(path)/*.$(suf)) \
                  )       )   )
 
 # Define a function that will expand all of the directory paths given in $(1)
-# to actual filepaths using the list of suffixes provided $(2), taking only
+# to actual filepaths using the list of suffixes provided in $(2), taking only
 # the first expansion from each directory with at least one file matching
 # the current suffix. Finally, strip the filenames from all resulting files,
 # returning only the directory paths.
@@ -957,20 +1013,29 @@ get-dirpaths  = $(dir $(foreach path, $(1), \
                                   $(wildcard $(path)/*.$(suf)) \
                  )     )   )   )
 
-# We'll use two directory lists. The first is a list of all of the directories
-# in which makefile fragments were generated (plus the current directory). The
-# second is the subset of the first that begins with the sandbox root path.
+# We'll use three directory lists. The first is a list of all of the directories
+# in which makefile fragments were generated, plus the current directory. (The
+# current directory is needed so we include bli_config.h and bli_addon.h in the
+# processing of header files.) The second and third are subsets of the first
+# that begin with the addon and sandbox root paths, respectively.
 ALLFRAG_DIR_PATHS := . $(FRAGMENT_DIR_PATHS)
+ADDON_DIR_PATHS   := $(filter $(ADDON_PATH)/%,$(ALLFRAG_DIR_PATHS))
 SANDBOX_DIR_PATHS := $(filter $(SANDBOX_PATH)/%,$(ALLFRAG_DIR_PATHS))
 
 ALL_H99_FILES     := $(call get-filepaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS))
-FRAME_H99_FILES   := $(filter-out $(SANDBOX_PATH)/%,$(ALL_H99_FILES))
+FRAME_H99_FILES   := $(filter-out $(ADDON_PATH)/%, \
+                        $(filter-out $(SANDBOX_PATH)/%, \
+                                    $(ALL_H99_FILES) \
+                      )  )
 
-ALL_H99_DIRPATHS  := $(call get-dirpaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS))
+ALL_H99_DIRPATHS     := $(call get-dirpaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS))
 
-SANDBOX_H99_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_H99_SUFS))
-SANDBOX_HXX_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_HXX_SUFS))
+ADDON_H99_FILES      := $(call get-filepaths,$(ADDON_DIR_PATHS),$(ADDON_H99_SUFS))
+ADDON_HXX_FILES      := $(call get-filepaths,$(ADDON_DIR_PATHS),$(ADDON_HXX_SUFS))
+ADDON_HDR_DIRPATHS   := $(call get-dirpaths,$(ADDON_DIR_PATHS),$(ALL_HDR_SUFS))
 
+SANDBOX_H99_FILES    := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_H99_SUFS))
+SANDBOX_HXX_FILES    := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_HXX_SUFS))
 SANDBOX_HDR_DIRPATHS := $(call get-dirpaths,$(SANDBOX_DIR_PATHS),$(ALL_HDR_SUFS))
 
 
@@ -1025,8 +1090,8 @@ CBLAS_H_FLAT    := $(BASE_INC_PATH)/$(CBLAS_H)
 #
 
 # Obtain a list of header files #included inside of the bli_cntx_ref.c file.
-# Paths to these files will be needed when compiling with the monolithic
-# header.
+# Due to the way that bli_cntx_ref.c uses headers and macros, paths to these
+# files will be needed when compiling bli_cntx_ref.c with the monolithic header.
 ifeq ($(strip $(SHARE_PATH)),.)
 REF_KER_SRC     := $(DIST_PATH)/$(REFKERN_DIR)/bli_cntx_ref.c
 REF_KER_HEADERS := $(shell $(GREP) "\#include" $(REF_KER_SRC) | sed -e "s/\#include [\"<]\([a-zA-Z0-9\_\.\/\-]*\)[\">].*/\1/g" | $(GREP) -v $(BLIS_H))
@@ -1034,9 +1099,10 @@ endif
 
 # Match each header found above with the path to that header, and then strip
 # leading, trailing, and internal whitespace.
-REF_KER_H_PATHS := $(strip $(foreach header, $(REF_KER_HEADERS), \
-                               $(dir $(filter %/$(header), \
-                                              $(FRAME_H99_FILES)))))
+REF_KER_H_PATHS := $(call rm-dups,$(strip \
+                                  $(foreach header, $(REF_KER_HEADERS), \
+                                      $(dir $(filter %/$(header), \
+                                                     $(FRAME_H99_FILES))))))
 
 # Add -I to each header path so we can specify our include search paths to the
 # C compiler. Then add frame/include since it's needed when compiling source
@@ -1056,17 +1122,22 @@ ifeq ($(MK_ENABLE_CBLAS),yes)
 CINCFLAGS       += -I$(CBLAS_H_DIRPATH)
 endif
 
+# Obtain a list of header paths in the configured addons. Then add -I to each
+# header path.
+CADDONINCFLAGS  := $(strip $(patsubst %, -I%, $(ADDON_HDR_DIRPATHS)))
+
 # Obtain a list of header paths in the configured sandbox. Then add -I to each
 # header path.
-CSBOXINCFLAGS   := $(strip $(patsubst %, -I%, $(SANDBOX_HDR_DIRPATHS)))
+CSANDINCFLAGS   := $(strip $(patsubst %, -I%, $(SANDBOX_HDR_DIRPATHS)))
 
 
 #
 # --- BLIS configuration header definitions ------------------------------------
 #
 
-# This file was created by configure, but we need to define it here so we can
-# remove it as part of the clean targets.
+# These files were created by configure, but we need to define them here so we
+# can remove them as part of the clean targets.
+BLIS_ADDON_H    := ./bli_addon.h
 BLIS_CONFIG_H   := ./bli_config.h
 
 
diff --git a/configure b/configure
index 3c865dad9..15577eb22 100755
--- a/configure
+++ b/configure
@@ -270,6 +270,15 @@ print_usage()
 	echo "                 \"small\" depends on thresholds that may vary by sub-"
 	echo "                 configuration."
 	echo " "
+	echo "   -a NAME --enable-addon=NAME"
+	echo " "
+	echo "                 Enable the code provided by an addon. An addon consists"
+	echo "                 of a separate directory of code that provides additional"
+	echo "                 APIs, implementations, and/or operations that would"
+	echo "                 otherwise not be present within a build of BLIS. This"
+	echo "                 option may be used multiple times to specify the inclusion"
+	echo "                 of multiple addons. By default, no addons are enabled."
+	echo " "
 	echo "   -s NAME --enable-sandbox=NAME"
 	echo " "
 	echo "                 Enable a separate sandbox implementation of gemm. This"
@@ -973,6 +982,18 @@ canonicalize_ws()
 	echo "${str}"
 }
 
+rm_duplicate_words_simple()
+{
+	local str revstr revres res
+
+	str="$1"
+	# Remove duplicate words from str, keeping only the first occurrence of
+	# each. awk prints FS after every kept word, so res ends with a separator.
+	res=$(echo "${str}" | awk '{for (i=1;i<=NF;i++) if (!a[$i]++) printf("%s%s",$i,FS)}{printf("\n")}')
+
+	echo "${res}"
+}
+
 rm_duplicate_words()
 {
 	local str revstr revres res
@@ -1958,6 +1979,13 @@ main()
 	bli_config_h_in_path="${build_dirpath}/${bli_config_h_in}"
 	bli_config_h_out_path="${cur_dirpath}/${bli_config_h_out}"
 
+	# The names/paths for the template bli_addon.h.in and its instantiated
+	# counterpart.
+	bli_addon_h_in='bli_addon.h.in'
+	bli_addon_h_out='bli_addon.h'
+	bli_addon_h_in_path="${build_dirpath}/${bli_addon_h_in}"
+	bli_addon_h_out_path="${cur_dirpath}/${bli_addon_h_out}"
+
 	# Path to 'mirror-tree.sh' script.
 	mirror_tree_sh="${build_dirpath}/mirror-tree.sh"
 
@@ -1981,6 +2009,10 @@ main()
 	frame_dir='frame'
 	frame_dirpath="${dist_path}/${frame_dir}"
 
+	# The name of the addons directory.
+	addon_dir='addon'
+	addon_dirpath="${dist_path}/${addon_dir}"
+
 	# The name of the sandbox directory.
 	sandbox_dir='sandbox'
 	sandbox_dirpath="${dist_path}/${sandbox_dir}"
@@ -2088,6 +2120,10 @@ main()
 	force_version='no'
 	complex_return='default'
 
+	# The addon flag and names.
+	addon_flag=''
+	addon_list=''
+
 	# The sandbox flag and name.
 	sandbox_flag=''
 	sandbox=''
@@ -2132,7 +2168,7 @@ main()
 
 		# Process our command line options.
 		unset OPTIND
-		while getopts ":hp:d:e:s:t:r:qci:b:-:" opt; do
+		while getopts ":hp:d:e:a:s:t:r:qci:b:-:" opt; do
 			case $opt in
 				-)
 					case "$OPTARG" in
@@ -2239,12 +2275,21 @@ main()
 						disable-mem-tracing)
 							enable_mem_tracing='no'
 							;;
+						enable-addon=*)
+							addon_flag=1
+							addon_name=${OPTARG#*=}
+							# Append the addon name to the list.
+							addon_list="${addon_list} ${addon_name}"
+							;;
+						disable-addon)
+							addon_flag=''
+							;;
 						enable-sandbox=*)
 							sandbox_flag=1
 							sandbox=${OPTARG#*=}
 							;;
 						disable-sandbox)
-							sandbox_flag=0
+							sandbox_flag=''
 							;;
 						int-size=*)
 							int_type_size=${OPTARG#*=}
@@ -2321,6 +2366,12 @@ main()
 				e)
 					export_shared=$OPTARG
 					;;
+				a)
+					addon_flag=1
+					addon_name=$OPTARG
+					# Append the addon name to the list.
+					addon_list="${addon_list} ${addon_name}"
+					;;
 				s)
 					sandbox_flag=1
 					sandbox=$OPTARG
@@ -3168,6 +3219,34 @@ main()
 		exit 1
 	fi
 
+	# Check if addons were given.
+	if [ -n "${addon_flag}" ]; then
+
+		# Remove duplicates in the addon list, if they exist.
+		addon_list=$(rm_duplicate_words_simple "${addon_list}")
+
+		echo "${script_name}: configuring with addons:"
+
+		for addon in ${addon_list}; do
+
+			echo "${script_name}:   ${addon_dir}/${addon}"
+
+			addon_fullpath="${addon_dirpath}/${addon}"
+
+			if [ ! -d "${addon_fullpath}" ]; then
+				echo "${script_name}: requested addon sub-directory does not exist! Cannot continue."
+				echo "${script_name}: *** Please verify addon existence and name."
+				exit 1
+			fi
+		done
+
+		enable_addons_01=1
+	else
+		echo "${script_name}: configuring with no addons."
+
+		enable_addons_01=0
+	fi
+
 	# Check if a sandbox was given.
 	if [ -n "${sandbox_flag}" ]; then
 
@@ -3306,6 +3385,15 @@ main()
 		kernel_list_defines="${kernel_list_defines}#define ${kernel_define}\n"
 	done
 
+	# Create a list of #includes, one for each addon in addon_list.
+	addon_list_includes=""
+	for addon in ${addon_list}; do
+
+		# Create a #include directive and add it to the running list.
+		addon_header="\"${addon}.h\""
+		addon_list_includes="${addon_list_includes}#include ${addon_header}\n"
+	done
+
 
 	# -- Determine whether we are performing an out-of-tree build --------------
 
@@ -3333,7 +3421,7 @@ main()
 	fi
 
 
-	# -- Instantiate config.mk, bli_config.h files from templates --------------
+	# -- Instantiate config.mk file from template ------------------------------
 
 	# Begin substituting information into the config_mk_in file, outputting
 	# to config_mk_out.
@@ -3380,9 +3468,11 @@ main()
 		| sed -e "s/@enable_cblas@/${enable_cblas}/g" \
 		| sed -e "s/@enable_memkind@/${enable_memkind}/g" \
 		| sed -e "s/@pragma_omp_simd@/${pragma_omp_simd}/g" \
+		| sed -e "s/@addon_list@/${addon_list}/g" \
 		| sed -e "s/@sandbox@/${sandbox}/g" \
 		> "${config_mk_out_path}"
 
+	# -- Instantiate bli_config.h file from template ---------------------------
 
 	# Begin substituting information into the bli_config_h_in file, outputting
 	# to bli_config_h_out. NOTE: We use perl instead of sed because the version
@@ -3417,6 +3507,17 @@ main()
 		| sed   -e "s/@complex_return_intel@/${complex_return_intel01}/g" \
 		> "${bli_config_h_out_path}"
 
+	# -- Instantiate bli_addon.h file from template ----------------------------
+
+	# Begin substituting information into the bli_addon_h_in file, outputting
+	# to bli_addon_h_out. NOTE: We use perl instead of sed because the version
+	# of sed used on OS X is old and does not handle the '\n' character
+	# intuitively, which was used when constructing ${addon_list_includes}.
+	echo "${script_name}: creating ${bli_addon_h_out_path} from ${bli_addon_h_in_path}"
+	cat "${bli_addon_h_in_path}" \
+		| perl -pe "s/\@addon_list_includes\@/${addon_list_includes}/g" \
+		| sed   -e "s/@enable_addons@/${enable_addons_01}/g" \
+		> "${bli_addon_h_out_path}"
 
 	# -- Create top-level object directories -----------------------------------
 
@@ -3429,7 +3530,6 @@ main()
 
 	obj_config_dirpath="${base_obj_dirpath}/${config_dir}"
 
-	#echo "${script_name}: creating ${obj_config_dirpath}"
 	mkdir -p ${obj_config_dirpath}
 	for conf in ${config_list}; do
 		echo "${script_name}: creating ${obj_config_dirpath}/${conf}"
@@ -3439,7 +3539,6 @@ main()
 
 	obj_kernels_dirpath="${base_obj_dirpath}/${kernels_dir}"
 
-	#echo "${script_name}: creating ${obj_kernels_dirpath}"
 	mkdir -p ${obj_kernels_dirpath}
 	for kern in ${kernel_list}; do
 		echo "${script_name}: creating ${obj_kernels_dirpath}/${kern}"
@@ -3449,7 +3548,6 @@ main()
 
 	obj_refkern_dirpath="${base_obj_dirpath}/${refkern_dir}"
 
-	#echo "${script_name}: creating ${obj_refkern_dirpath}"
 	mkdir -p ${obj_refkern_dirpath}
 	for conf in ${config_list}; do
 		echo "${script_name}: creating ${obj_refkern_dirpath}/${conf}"
@@ -3462,6 +3560,18 @@ main()
 	echo "${script_name}: creating ${obj_frame_dirpath}"
 	mkdir -p ${obj_frame_dirpath}
 
+
+	if [ -n "${addon_flag}" ]; then
+
+		obj_addon_dirpath="${base_obj_dirpath}/${addon_dir}"
+
+		for addon in ${addon_list}; do
+			echo "${script_name}: creating ${obj_addon_dirpath}/${addon}"
+			mkdir -p ${obj_addon_dirpath}/${addon}
+		done
+	fi
+
+
 	if [ -n "${sandbox_flag}" ]; then
 
 		obj_sandbox_dirpath="${base_obj_dirpath}/${sandbox_dir}"
@@ -3489,6 +3599,7 @@ main()
 	echo "${script_name}: creating ${base_lib_dirpath}"
 	mkdir -p ${base_lib_dirpath}
 
+
 	# Create include directory (if it does not already exist).
 	base_include_dirpath="${include_dirpath}/${config_name}"
 
@@ -3543,6 +3654,16 @@ main()
 	echo "${script_name}: mirroring ${frame_dirpath} to ${obj_frame_dirpath}"
 	${mirror_tree_sh} ${frame_dirpath} ${obj_frame_dirpath}
 
+	# Mirror each chosen addon source tree to its object sub-directory.
+	if [ -n "${addon_flag}" ]; then
+
+		for addon in ${addon_list}; do
+
+			echo "${script_name}: mirroring ${addon_dirpath}/${addon} to ${obj_addon_dirpath}/${addon}"
+			${mirror_tree_sh} "${addon_dirpath}/${addon}" "${obj_addon_dirpath}/${addon}"
+		done
+	fi
+
 	# Mirror the chosen sandbox source tree to its object sub-directory.
 	if [ -n "${sandbox_flag}" ]; then
 
@@ -3629,6 +3750,25 @@ main()
 			 ${gen_make_frags_dirpath}/suffix_list \
 			 ${gen_make_frags_dirpath}/ignore_list
 
+	# Generate makefile fragments in each addon sub-directory.
+	if [ -n "${addon_flag}" ]; then
+
+		for addon in ${addon_list}; do
+
+			echo "${script_name}: creating makefile fragments in ${obj_addon_dirpath}/${addon}"
+			${gen_make_frags_sh} \
+					 -h -r -v0 \
+					 -o ${script_name} \
+					 -p 'ADDON' \
+					 ${addon_dirpath}/${addon} \
+					 ${obj_addon_dirpath}/${addon} \
+					 ${gen_make_frags_dirpath}/fragment.mk \
+					 ${gen_make_frags_dirpath}/suffix_list \
+					 ${gen_make_frags_dirpath}/ignore_list
+		done
+	fi
+
+
 	# Generate makefile fragments in the sandbox sub-directory.
 	if [ -n "${sandbox_flag}" ]; then
 
diff --git a/docs/Addons.md b/docs/Addons.md
new file mode 100644
index 000000000..595cebfa4
--- /dev/null
+++ b/docs/Addons.md
@@ -0,0 +1,231 @@
+## Contents
+
+* **[Introduction](Addons.md#introduction)**
+* **[Enabling an addon](Addons.md#enabling-an-addon)**
+* **[Addon rules](Addons.md#addon-rules)**
+* **[Caveats](Addons.md#caveats)**
+* **[Known issues](Addons.md#known-issues)**
+* **[Conclusion](Addons.md#conclusion)**
+
+
+## Introduction
+
+This file briefly describes the requirements for building a custom BLIS
+*addon*.
+
+Simply put, an addon in BLIS provides additional APIs, operations, and/or
+implementations that may be useful to certain users. An addon can be
+thought of as a standalone extension of BLIS that does not depend on any
+other addon, although addons may utilize existing functionality or kernels
+within the core framework.
+
+By definition, an addon should *never* provide APIs that conflict with
+the interfaces that belong to either the [typed API](BLISTypedAPI.md) or the
+[object API](BLISObjectAPI.md). Thus, you'll never have to worry about a
+properly constructed (and properly functioning) addon interfering with or
+otherwise changing core BLIS functionality.
+
+How does an addon differ from a [sandbox](Sandboxes.md)? Great question!
+Sometimes you want to include additional BLIS-like functionality that does
+not relate directly to `gemm` or any other BLIS operation.
+(By contrast, a sandbox requires you to implement `gemm` whether you want
+to or not.)
+Furthermore, you may wish to enable multiple addons simultaneously.
+(By contrast, only one sandbox may be enabled at a time.)
+Thus, the addon feature provides additional flexibility to some
+users in a way that sandboxes cannot, while still providing many of the
+conveniences of sandboxes.
+
+## Enabling an addon
+
+To enable an existing addon at configure-time, you simply specify it as an
+option to `configure`. Either of the following usages is accepted:
+```
+$ ./configure --enable-addon=foobar auto
+$ ./configure -a foobar auto
+```
+Here, we tell `configure` that we want to use the `foobar` addon, which
+corresponds to a subdirectory of the `addon` directory named `foobar`.
+(Reminder: the `auto` argument is the configuration target and
+unrelated to addons.)
+
+You may also enable multiple addons within the same build of BLIS:
+```
+$ ./configure -a foobar -a thing1 -a thing2 auto
+```
+Note that the default behavior of `configure` is that no addons are enabled.
+
+As `configure` runs, you should get output that includes lines
+similar to:
+```
+configure: configuring with addons:
+configure:   addon/foobar
+configure:   addon/thing1
+configure:   addon/thing2
+```
+And when you build BLIS, the addon source code will be among the last files to
+be compiled:
+```
+Compiling obj/haswell/addon/foobar/foobar.o ('haswell' CFLAGS for addons)
+Compiling obj/haswell/addon/thing1/thing1.o ('haswell' CFLAGS for addons)
+Compiling obj/haswell/addon/thing1/thing1_api.o ('haswell' CFLAGS for addons)
+Compiling obj/haswell/addon/thing2/thing2_api.o ('haswell' CFLAGS for addons)
+...
+```
+That's it! After the BLIS library is built, it will contain your chosen
+addons. You can always confirm this by using `nm` to confirm the presence
+of your API symbols:
+```
+$ nm lib/haswell/libblis.a | grep foobar
+foobar.o:
+0000000000000000 T foobar
+```
+
+## Addon rules
+
+Please follow these guidelines for the best developer experience when
+creating addons.
+
+1. As with sandboxes, you don't need to worry about creating makefiles. The
+BLIS build system will take care of this for you. :) By configuring BLIS with
+an addon enabled, `make` will scan your addon subdirectory and compile
+all of its source code using similar compilation rules as were used for the rest
+of the framework. In addition, the compilation command line will automatically
+contain one `-I<includepath>` option for every subdirectory in your addon,
+so it doesn't matter where in your addon directory hierarchy you place your
+header files -- they will be found!
+
+2. We recommend that you write your addon in C99. While you *may* use C++11
+to implement your addon, you should provide a C99 wrapper API to your
+implementation so that others can interface with it. There is no guarantee
+that the end-user will be using a C++11 compiler, and therefore you should
+limit the definitions in your addon header to those that are C99 compliant.
+If you write your addon in C++11, you must use one of the BLIS-approved file
+extensions for your source files (`.cc`, `.cpp`, `.cxx`) and your local
+header files (`.hh`, `.hpp`, `.hxx`).
+Note that `blis.h` already contains all of its definitions inside of an
+`extern "C"` block, so you should be able to `#include "blis.h"` from your
+C++11 source code without any issues.
+
+3. All of your code related to the addon should reside within the named
+addon directory, or some subdirectory therein. If your addon requires
+new kernels, you should add kernel source code to an appropriate
+microarchitecture-specific subdirectory within the top-level `kernels`
+directory so that they are compiled with the correct
+microarchitecture-specific optimization flags.
+
+4. If your addon is named `foobar`, the BLIS build system will expect to
+find a header called `foobar.h` somewhere in the `addon/foobar` directory
+(or one of its subdirectories). This `foobar.h` header will automatically
+be inlined into the monolithic `blis.h` header that is produced by the
+BLIS build system. `foobar.h` may `#include` other local headers, each of
+which will also (recursively) get inlined into `blis.h`. However, you may
+choose to omit some local addon headers from `foobar.h`. You might do this,
+for example, because those headers define things that are not needed in
+order for the end user to call your addon code.
+
+5. Your addon APIs will always be available within static library builds of
+BLIS, but if you want your addon APIs to be exported as public APIs within
+*shared* library builds of BLIS, you'll need to annotate the prototypes
+accordingly. (BLIS makes its shared library symbols private by default; this
+allows us to export only those functions that we consider to be part of the
+public APIs.) This annotation can be done by prefixing function prototypes
+with the `BLIS_EXPORT_ADDON` macro as follows:
+```c
+BLIS_EXPORT_ADDON void foobar_calc( void* a, void* b );
+```
+
+6. Do not define any symbols in your addon that conflict with any symbols within
+the core framework. For example, don't define a function called `bli_copym()`
+in your addon since that function is already defined within BLIS.
+
+7. Do not define any symbols in your addon that conflict with any symbols within
+the C99 standard libraries/headers. For example, don't define a function called
+`printf()` since that function is already defined within the C99 standard library.
+
+8. *Try* to not define any symbols in your addon that conflict with symbols in any
+other addon, unless your addon is meant to serve as an alternative to the
+conflicting addon, in which case conflicting symbol names are okay (since you
+will presumably never build with both addons enabled).
+
+9. When choosing names for your addon files, avoid source filenames that already
+exist within BLIS. For example, don't name one of your files `bli_obj.c`
+since that file would compile into `bli_obj.o`, which will have already been
+placed into the library by the build system.
+
+10. Similarly, avoid header filenames that already exist within BLIS or C99.
+For example, don't name one of your header files `bli_obj.h` since that file
+already exists in BLIS. Also, don't name one of your header files `math.h`
+since that name would conflict with the `math.h` defined by C99. (This also
+means you shouldn't name your addon `math` since normally that name would
+require that you provide a `math.h` header inside the addon directory.)
+
+If you follow these rules, you will be much more likely to have a pleasant
+experience integrating your BLIS addon into the larger framework.
+
+## Caveats
+
+Notice that the BLIS addons are limited in what they can accomplish. Generally
+speaking, addons cannot change existing implementations within BLIS. Instead,
+addons aim to provide a way to quickly augment BLIS with additional bundles of
+code that extend BLIS's set of functionality in some interesting way. If you
+want to define new BLAS-like functions, but don't know where to start, creating
+a new addon is an appropriate place to start experimenting. If you want to
+change or refactor existing BLIS code, an addon is probably not suited for your
+needs.
+
+Another important limitation is the fact that the build system currently uses
+"framework `CFLAGS`" when compiling the addon source files. These are the same
+`CFLAGS` used when compiling general framework source code,
+```
+# Example framework CFLAGS used by 'haswell' sub-configuration
+-O2 -Wall -Wno-unused-function -Wfatal-errors -fPIC -std=c99
+-D_POSIX_C_SOURCE=200112L -Iinclude/haswell -I./frame/3/
+-I./frame/1m/ -I./frame/1f/ -I./frame/1/ -I./frame/include
+-DBLIS_VERSION_STRING=\"0.8.1-195\" -fvisibility=hidden
+```
+which are likely more general-purpose than the `CFLAGS` used for, say,
+optimized kernels or even reference kernels:
+```
+# Example optimized kernel CFLAGS used by 'haswell' sub-configuration
+-O3 -fomit-frame-pointer -mavx2 -mfma -mfpmath=sse -march=haswell -Wall
+-Wno-unused-function -Wfatal-errors -fPIC -std=c99 -D_POSIX_C_SOURCE=200112L
+-Iinclude/haswell -I./frame/3/ -I./frame/1m/ -I./frame/1f/ -I./frame/1/
+-I./frame/include -DBLIS_VERSION_STRING=\"0.8.1-195\" -fvisibility=hidden
+```
+(To see precisely which flags are being employed for any given file, enable
+verbosity at compile-time via `make V=1`.) Compiling addons with these more
+versatile `CFLAGS` compiler options means that we only need to compile one
+instance of each addon source file, even when targeting multiple
+configurations (for example, via `./configure x86_64`). However, it also means
+that addons are not ideal for microkernels, as they sometimes need additional
+compiler flags in order to
+yield the highest performance. If you have a new microkernel you would like to
+use within an addon, you can always develop it within that addon. However,
+once it is stable and ready for use by others, it's best to move the kernel(s)
+to the appropriate microarchitecture-specific subdirectory of the `kernels`
+directory. This will allow the kernel(s) to be compiled with the
+appropriate microarchitecture-specific compiler flags.
+Please see the
+[Configuration Guide](ConfigurationHowTo.md)
+for more details, and when in doubt, please don't be shy about seeking
+guidance from BLIS developers by opening a
+[new issue](https://github.com/flame/blis/issues) or sending a message to the
+[blis-devel](http://groups.google.com/d/forum/blis-devel) mailing list.
+
+Notwithstanding these limitations, hopefully you still find BLIS addons
+useful!
+
+## Known issues
+
+* None yet.
+
+## Conclusion
+
+If you encounter any problems, please open
+a new [issue on GitHub](https://github.com/flame/blis/issues).
+
+If you are unsure about how something works, you can still open an issue. Or, you
+can send a message to the
+[blis-devel](https://groups.google.com/d/forum/blis-devel) mailing list.
+
diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h
index 86f23df6e..5a4c8a15d 100644
--- a/frame/include/bli_config_macro_defs.h
+++ b/frame/include/bli_config_macro_defs.h
@@ -231,8 +231,9 @@
   #endif
 #endif
 
-#define BLIS_EXPORT_BLIS BLIS_EXPORT
-#define BLIS_EXPORT_BLAS BLIS_EXPORT
+#define BLIS_EXPORT_BLIS  BLIS_EXPORT
+#define BLIS_EXPORT_BLAS  BLIS_EXPORT
+#define BLIS_EXPORT_ADDON BLIS_EXPORT
 
 
 // -- STATIC INLINE FUNCTIONS --------------------------------------------------
diff --git a/frame/include/blis.h b/frame/include/blis.h
index b374e8539..98ebee878 100644
--- a/frame/include/blis.h
+++ b/frame/include/blis.h
@@ -196,6 +196,14 @@ extern "C" {
 #include "bli_util.h"
 
 
+// -- addon definitions --
+
+// NOTE: These definitions should not be included much earlier since an addon
+// may wish to utilize other types and definitions provided by BLIS.
+
+#include "bli_addon.h"
+
+
 // -- sandbox implementation --
 
 #include "bli_sbox.h"

From 78cd1b045155ddf0b9ec6e2ab815f2b216ad9a9e Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 16 Nov 2021 15:53:40 -0600
Subject: [PATCH 004/230] Added 'Example Code' section to README.md.

Details:
- Inserted a new 'Example Code' section into the README.md immediately
  after the 'Getting Started' section. Thanks to Devin Matthews for
  recommending this addition.
- Moved the 'Performance' section of the README down slightly so that it
  appears after the 'Documentation' section.
---
 README.md | 77 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 55 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index f4ec4acb3..2abe79400 100644
--- a/README.md
+++ b/README.md
@@ -13,8 +13,9 @@ Contents
 * **[Key Features](#key-features)**
 * **[How to Download BLIS](#how-to-download-blis)**
 * **[Getting Started](#getting-started)**
-* **[Performance](#performance)**
+* **[Example Code](#example-code)**
 * **[Documentation](#documentation)**
+* **[Performance](#performance)**
 * **[External Packages](#external-packages)**
 * **[Discussion](#discussion)**
 * **[Contributing](#contributing)**
@@ -394,23 +395,41 @@ If/when you have time, we *strongly* encourage you to read the detailed
 walkthrough of the build system found in our [Build System](docs/BuildSystem.md)
 guide.
 
-Performance
------------
-
-We provide graphs that report performance of several implementations across a
-range of hardware types, multithreading configurations, problem sizes,
-operations, and datatypes. These pages also document most of the details needed
-to reproduce these experiments.
+Example Code
+------------
 
- * **[Performance](docs/Performance.md).** This document reports empirically
-measured performance of a representative set of level-3 operations on a variety
-of hardware architectures, as implemented within BLIS and other BLAS libraries
-for all four of the standard floating-point datatypes.
+The BLIS source distribution provides example code in the `examples` directory.
+Example code focuses on using BLIS APIs (not BLAS or CBLAS), and resides in
+two subdirectories: [examples/oapi](examples/oapi) (which demonstrates the
+[object API](docs/BLISObjectAPI.md)) and [examples/tapi](examples/tapi) (which
+demonstrates the [typed API](docs/BLISTypedAPI.md)).
+
+Either directory contains several files, each containing various pieces of
+code that exercise core functionality of the BLIS API in question (object or
+typed). These example files should be thought of collectively like a tutorial,
+and therefore it is recommended to start from the beginning (the file that
+starts in `00`).
+
+You can build all of the examples by simply running `make` from either example
+subdirectory (`examples/oapi` or `examples/tapi`). (You can also run
+`make clean`.) The local `Makefile` assumes that you've already configured and
+built (but not necessarily installed) BLIS two directories up, in `../..`. If
+you have already installed BLIS to some permanent directory, you may refer to
+that installation by setting the environment variable `BLIS_INSTALL_PATH` prior
+to running make:
+```
+export BLIS_INSTALL_PATH=/usr/local; make
+```
+or by setting the same variable as part of the make command:
+```
+make BLIS_INSTALL_PATH=/usr/local
+```
+**Once the executable files have been built, we recommend reading the code and
+the corresponding executable output side by side. This will help you see the
+effects of each section of code.**
 
- * **[PerformanceSmall](docs/PerformanceSmall.md).** This document reports
-empirically measured performance of `gemm` on select hardware architectures
-within BLIS and other BLAS libraries when performing matrix problems where one
-or two dimensions is exceedingly small.
+This tutorial is not exhaustive or complete; several object API functions were
+omitted (mostly for brevity's sake) and thus more examples could be written.
 
 Documentation
 -------------
@@ -432,16 +451,12 @@ included BLAS test drivers.
 
  * **[BLIS Typed API Reference](docs/BLISTypedAPI.md).** Here we document the
 so-called "typed" (or BLAS-like) API. This is the API that many users who are
-already familiar with the BLAS will likely want to use. You can find lots of
-example code for the typed API in the [examples/tapi](examples/tapi) directory
-included in the BLIS source distribution.
+already familiar with the BLAS will likely want to use.
 
  * **[BLIS Object API Reference](docs/BLISObjectAPI.md).** Here we document
 the object API. This is API abstracts away properties of vectors and matrices
 within `obj_t` structs that can be queried with accessor functions. Many
-developers and experts prefer this API over the typed API. You can find lots of
-example code for the object API in the [examples/oapi](examples/oapi) directory
-included in the BLIS source distribution.
+developers and experts prefer this API over the typed API.
 
  * **[Hardware Support](docs/HardwareSupport.md).** This document maintains a
 table of supported microarchitectures.
@@ -501,6 +516,24 @@ please read this thorough walkthrough of the configuration system.
 about using sandboxes in BLIS--that is, providing alternative implementations
 of the `gemm` operation--please read this document.
 
+Performance
+-----------
+
+We provide graphs that report performance of several implementations across a
+range of hardware types, multithreading configurations, problem sizes,
+operations, and datatypes. These pages also document most of the details needed
+to reproduce these experiments.
+
+ * **[Performance](docs/Performance.md).** This document reports empirically
+measured performance of a representative set of level-3 operations on a variety
+of hardware architectures, as implemented within BLIS and other BLAS libraries
+for all four of the standard floating-point datatypes.
+
+ * **[PerformanceSmall](docs/PerformanceSmall.md).** This document reports
+empirically measured performance of `gemm` on select hardware architectures
+within BLIS and other BLAS libraries when performing matrix problems where one
+or two dimensions is exceedingly small.
+
 External Packages
 -----------------
 

From cbc88feb51b949ce562d044cf9f99c4e46bb8a39 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 16 Nov 2021 16:02:39 -0600
Subject: [PATCH 005/230] Marked some markdown shell code blocks as 'bash'.

Details:
- Annotated the code blocks that represent shell commands and output as
  'bash' in README.md and BuildSystem.md.
---
 README.md           | 12 ++++++------
 docs/BuildSystem.md | 40 ++++++++++++++++++++--------------------
 2 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index 2abe79400..21bfe10d3 100644
--- a/README.md
+++ b/README.md
@@ -337,7 +337,7 @@ slightly out of date.)
 URL by clicking on the green button above the file/directory listing near the
 top of this page (as rendered by GitHub). Generally speaking, it will amount
 to executing the following command in your terminal shell:
-   ```
+   ```bash
    git clone https://github.com/flame/blis.git
    ```
 
@@ -375,18 +375,18 @@ as discussed in [the previous section](#how-to-download-blis).*
 If you just want to build a sequential (not parallelized) version of BLIS
 in a hurry and come back and explore other topics later, you can configure
 and build BLIS as follows:
-```
+```bash
 $ ./configure auto
 $ make [-j]
 ```
 You can then verify your build by running BLAS- and BLIS-specific test
 drivers via `make check`:
-```
+```bash
 $ make check [-j]
 ```
 And if you would like to install BLIS to the directory specified to `configure`
 via the `--prefix` option, run the `install` target:
-```
+```bash
 $ make install
 ```
 Please read the output of `./configure --help` for a full list of configure-time
@@ -417,11 +417,11 @@ built (but not necessarily installed) BLIS two directories up, in `../..`. If
 you have already installed BLIS to some permanent directory, you may refer to
 that installation by setting the environment variable `BLIS_INSTALL_PATH` prior
 to running make:
-```
+```bash
 export BLIS_INSTALL_PATH=/usr/local; make
 ```
 or by setting the same variable as part of the make command:
-```
+```bash
 make BLIS_INSTALL_PATH=/usr/local
 ```
 **Once the executable files have been built, we recommend reading the code and
diff --git a/docs/BuildSystem.md b/docs/BuildSystem.md
index 5e290d9bb..60fd541d6 100644
--- a/docs/BuildSystem.md
+++ b/docs/BuildSystem.md
@@ -40,14 +40,14 @@ Finally, we also require various other shell utilities that are so ubiquitous th
 Before starting, you must obtain a copy of BLIS.
 
 If you are an end-user (i.e., not a developer), you can download a tarball or zip file of the latest tagged version by returning to the main [BLIS homepage](https://github.com/flame/blis) and clicking on the [releases](https://github.com/flame/blis/releases) link. **However**, we highly recommend that you instead clone a copy using the command:
-```
+```bash
 $ git clone https://github.com/flame/blis.git
 ```
 
 Cloning a repository allows users and developers alike to quickly and easily pull in new commits as they are available, including commits that occur **between** tagged releases.
 
 Once you download the BLIS distribution, the top-level directory should look something like:
-```
+```bash
 $ ls
 CHANGELOG  Makefile      common.mk        configure  mpi_test     testsuite
 CREDITS    README.md     config           frame      obj          version
@@ -63,7 +63,7 @@ The first step is to choose how to configure BLIS. Specifically, a user must dec
 Configurations are described in detail in the [Configuration Guide](ConfigurationHowTo.md).
 
 Generally speaking, a configuration consists of several files that reside in a sub-directory of the `config` directory. To see a list of the available configurations, you may inspect this directory, or run `configure` with no arguments. Here are the current (as of this writing) contents of the `config` directory:
-```
+```bash
 $ ls config
 amd64      cortexa15  excavator  intel64  old         power7       template
 bgq        cortexa57  generic    knc      penryn      sandybridge  zen
@@ -85,19 +85,19 @@ Multithreading in BLIS is disabled by default. For more information on enabling
 ## Step 2: Running `configure`
 
 This step should be somewhat familiar to many people who use open source software. To configure the build system, simply run:
-```
+```bash
 $ ./configure <configname>
 ```
 where `<configname>` is the configuration sub-directory name you chose in [Step 1](BuildSystem.md#step-1-choose-a-framework-configuration) above. If `<configname>` is not given, a helpful message is printed reminding you to explicit specify a configuration name along with a list of valid configuration families and their implied sub-configurations. For more information on sub-configurations and families, please see the BLIS [Configuration Guide](ConfigurationHowTo.md).
 
 Alternatively, `configure` can automatically select a configuration based on your hardware:
-```
+```bash
 $ ./configure auto
 ```
 However, as of this writing, BLIS lacks support for automatically detecting some architectures. If the `configure` script is not able to detect your architecture, the `generic` configuration will be used.
 
 Upon running configure, you will get output similar to the following. The exact output will depend on whether you cloned BLIS from a `git` repository or whether you obtained BLIS via a downloadable tarball from the [releases](https://github.com/flame/blis/releases) page.
-```
+```bash
 $ ./configure --prefix=$HOME/blis haswell
 configure: using 'gcc' compiler.
 configure: found gcc version 5.4.0 (maj: 5, min: 4, rev: 0).
@@ -174,17 +174,17 @@ configure: creating makefile fragments in ./frame
 configure: configured to build within top-level directory of source distribution.
 ```
 The installation prefix can be specified via the `--prefix=PREFIX` option:
-```
+```bash
 $ ./configure --prefix=/usr <configname>
 ```
 This will cause libraries to eventually be installed (via `make install`) to `PREFIX/lib` and development headers to be installed to `PREFIX/include`. (The default value of `PREFIX` is `/usr/local`.) You can also specify the library install directory separately from the development header install directory with the `--libdir=LIBDIR` and `--includedir=INCDIR` options, respectively:
-```
+```bash
 $ ./configure --libdir=/usr/lib --includedir=/usr/include <configname>
 ```
 The `--libdir=LIBDIR` and `--includedir=INCDIR` options will override any path implied by `PREFIX`, whether it was specified explicitly via `--prefix` or implicitly (via the default). That is, `LIBDIR` defaults to `EXECPREFIX/lib` (where `EXECPREFIX`, set via `--exec-prefix=EXECPREFIX`, defaults to `PREFIX`) and `INCDIR` defaults to `PREFIX/include`, but `LIBDIR` and `INCDIR` will each be overriden by their respective `--libdir`/`--includedir` options. There is a third related option, `--sharedir=SHAREDIR`, where `SHAREDIR` defaults to `PREFIX/share`. This option specifies the installation directory for certain makefile fragments that contain variables determined by `configure` (e.g. `CC`, `CFLAGS`, `LDFLAGS`, etc.). These files allow certain BLIS makefiles, such as those in the `examples` or `testsuite` directories, to operate on an installed copy of BLIS rather than a local (and possibly uninstalled) copy.
 
 For a complete list of supported `configure` options and arguments, run `configure` with the `-h` option:
-```
+```bash
 $ ./configure -h
 ```
 The output from this invocation of `configure` should give you an up-to-date list of options and their descriptions.
@@ -192,7 +192,7 @@ The output from this invocation of `configure` should give you an up-to-date lis
 ## Step 3: Compilation
 
 Once `configure` is finished, you are ready to instantiate (compile) BLIS into a library by running `make`. Running `make` will result in output similar to:
-```
+```bash
 $ make
 Generating monolithic blis.h.........................................................
 .....................................................................................
@@ -209,11 +209,11 @@ Compiling obj/haswell/kernels/zen/1/bli_dotv_zen_int.o ('haswell' CFLAGS for ker
 Compiling obj/haswell/kernels/zen/1/bli_dotv_zen_int10.o ('haswell' CFLAGS for kernels)
 ```
 If you want to see the individual command line invocations of the compiler, you can run `make` as follows:
-```
+```bash
 $ make V=1
 ```
 Also, if you are compiling on a multicore system, you can get parallelism via:
-```
+```bash
 $ make -j<n>
 ```
 where `<n>` is the number of jobs `make` is allowed to run simultaneously. Generally, you should limit `<n>` to p+1, where p is the number of processor cores on your system.
@@ -236,7 +236,7 @@ The archiver and/or linker should no longer choke when creating the libraries.
 ## Step 3b: Testing (optional)
 
 If you would like to run some ready-made tests that exercise BLIS in a number of ways, including through its BLAS compatibility layer, run `make check`:
-```
+```bash
 $ make check
 ```
 Watch the output near the end. You should see the following messages, though not necessarily in immediate succession:
@@ -263,7 +263,7 @@ Archiving lib/haswell/libblis.a
 Dynamically linking lib/haswell/libblis.so
 ```
 Now you have a BLIS library (in static and shared forms) residing in the `lib/<configname>/` directory. To install the libraries and the header files associated with it, simply execute:
-```
+```bash
 $ make install
 ```
 This installs copies of the libraries and header files, and also creates conventional symbolic links of shared libraries:
@@ -275,7 +275,7 @@ Installing symlink libblis.so.0 into /u/field/blis/lib/
 Installing blis.h into /u/field/blis/include/blis/
 ```
 This results in your `PREFIX` directory looking like:
-```
+```bash
 # Check the contents of 'PREFIX'.
 $ ls -l $HOME/blis
 drwxr-xr-x 3 field dept 4096 May 10 17:36 include
@@ -296,14 +296,14 @@ lrwxrwxrwx 1 field dept      16 May 10 17:42 libblis.so.0 -> libblis.so.0.0.0
 ## Cleaning out build products
 
 If you want to remove various build products, you can use one of the `make` targets already defined for you in the BLIS Makefile:
-```
+```bash
 $ make clean
 Removing flattened header files from ./include/haswell.
 Removing object files from ./obj/haswell.
 Removing libraries from ./lib/haswell.
 ```
 Executing the `clean` target will remove all binary object files and library builds from the `obj` and `lib` directories, as well as any flattened header files. Any other configurations' build products are left untouched.
-```
+```bash
 $ make cleanmk
 Removing makefile fragments from ./config.
 Removing makefile fragments from ./frame.
@@ -311,7 +311,7 @@ Removing makefile fragments from ./ref_kernels.
 Removing makefile fragments from ./kernels.
 ```
 The `cleanmk` target results in removal of all makefile fragments from the framework source tree. (Makefile fragments are named `.fragment.mk` and are generated at configure-time.)
-```
+```bash
 $ make distclean
 Removing makefile fragments from ./config.
 Removing makefile fragments from ./frame.
@@ -357,7 +357,7 @@ If the BLAS compatibility layer was enabled at configure-time (as it is by defau
 ### Disabling BLAS prototypes
 
 Some applications already `#include` a header that contains BLAS prototypes. This can cause problems if those applications also try to `#include` the BLIS header file, as shown above. Suppose for a moment that `otherstuff.h` in the example above already provides BLAS prototypes.
-```
+```bash
 $ gcc -I/path/to/blis -I/path/to/otherstuff -c main.c -o main.o
 In file included from main.c:41:0:
 /path/to/blis/blis.h:36900:111: error: conflicting declaration of C function ‘int xerbla_(const bla_character*, const bla_integer*, ftnlen)’
@@ -413,7 +413,7 @@ The makefile shown above a very simple example. If you need help linking your ap
 ## Uninstalling
 
 If you decide that you want to uninstall BLIS, simply run `make uninstall`
-```
+```bash
 $ make uninstall
 Uninstalling libraries libblis.a libblis.so.0.0.0 from /u/field/blis/lib/.
 Uninstalling symlinks libblis.so libblis.so.0 from /u/field/blis/lib/.

From 74c0c622216aba0c24aa2c3a923811366a160cf5 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 16 Nov 2021 16:06:33 -0600
Subject: [PATCH 006/230] Reverted cbc88fe.

Details:
- Reverted the annotation of some markdown code blocks with 'bash'
  after realizing that the in-browser syntax highlighting was not
  worthwhile.
---
 README.md           | 12 ++++++------
 docs/BuildSystem.md | 40 ++++++++++++++++++++--------------------
 2 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index 21bfe10d3..2abe79400 100644
--- a/README.md
+++ b/README.md
@@ -337,7 +337,7 @@ slightly out of date.)
 URL by clicking on the green button above the file/directory listing near the
 top of this page (as rendered by GitHub). Generally speaking, it will amount
 to executing the following command in your terminal shell:
-   ```bash
+   ```
    git clone https://github.com/flame/blis.git
    ```
 
@@ -375,18 +375,18 @@ as discussed in [the previous section](#how-to-download-blis).*
 If you just want to build a sequential (not parallelized) version of BLIS
 in a hurry and come back and explore other topics later, you can configure
 and build BLIS as follows:
-```bash
+```
 $ ./configure auto
 $ make [-j]
 ```
 You can then verify your build by running BLAS- and BLIS-specific test
 drivers via `make check`:
-```bash
+```
 $ make check [-j]
 ```
 And if you would like to install BLIS to the directory specified to `configure`
 via the `--prefix` option, run the `install` target:
-```bash
+```
 $ make install
 ```
 Please read the output of `./configure --help` for a full list of configure-time
@@ -417,11 +417,11 @@ built (but not necessarily installed) BLIS two directories up, in `../..`. If
 you have already installed BLIS to some permanent directory, you may refer to
 that installation by setting the environment variable `BLIS_INSTALL_PATH` prior
 to running make:
-```bash
+```
 export BLIS_INSTALL_PATH=/usr/local; make
 ```
 or by setting the same variable as part of the make command:
-```bash
+```
 make BLIS_INSTALL_PATH=/usr/local
 ```
 **Once the executable files have been built, we recommend reading the code and
diff --git a/docs/BuildSystem.md b/docs/BuildSystem.md
index 60fd541d6..5e290d9bb 100644
--- a/docs/BuildSystem.md
+++ b/docs/BuildSystem.md
@@ -40,14 +40,14 @@ Finally, we also require various other shell utilities that are so ubiquitous th
 Before starting, you must obtain a copy of BLIS.
 
 If you are an end-user (i.e., not a developer), you can download a tarball or zip file of the latest tagged version by returning to the main [BLIS homepage](https://github.com/flame/blis) and clicking on the [releases](https://github.com/flame/blis/releases) link. **However**, we highly recommend that you instead clone a copy using the command:
-```bash
+```
 $ git clone https://github.com/flame/blis.git
 ```
 
 Cloning a repository allows users and developers alike to quickly and easily pull in new commits as they are available, including commits that occur **between** tagged releases.
 
 Once you download the BLIS distribution, the top-level directory should look something like:
-```bash
+```
 $ ls
 CHANGELOG  Makefile      common.mk        configure  mpi_test     testsuite
 CREDITS    README.md     config           frame      obj          version
@@ -63,7 +63,7 @@ The first step is to choose how to configure BLIS. Specifically, a user must dec
 Configurations are described in detail in the [Configuration Guide](ConfigurationHowTo.md).
 
 Generally speaking, a configuration consists of several files that reside in a sub-directory of the `config` directory. To see a list of the available configurations, you may inspect this directory, or run `configure` with no arguments. Here are the current (as of this writing) contents of the `config` directory:
-```bash
+```
 $ ls config
 amd64      cortexa15  excavator  intel64  old         power7       template
 bgq        cortexa57  generic    knc      penryn      sandybridge  zen
@@ -85,19 +85,19 @@ Multithreading in BLIS is disabled by default. For more information on enabling
 ## Step 2: Running `configure`
 
 This step should be somewhat familiar to many people who use open source software. To configure the build system, simply run:
-```bash
+```
 $ ./configure <configname>
 ```
 where `<configname>` is the configuration sub-directory name you chose in [Step 1](BuildSystem.md#step-1-choose-a-framework-configuration) above. If `<configname>` is not given, a helpful message is printed reminding you to explicit specify a configuration name along with a list of valid configuration families and their implied sub-configurations. For more information on sub-configurations and families, please see the BLIS [Configuration Guide](ConfigurationHowTo.md).
 
 Alternatively, `configure` can automatically select a configuration based on your hardware:
-```bash
+```
 $ ./configure auto
 ```
 However, as of this writing, BLIS lacks support for automatically detecting some architectures. If the `configure` script is not able to detect your architecture, the `generic` configuration will be used.
 
 Upon running configure, you will get output similar to the following. The exact output will depend on whether you cloned BLIS from a `git` repository or whether you obtained BLIS via a downloadable tarball from the [releases](https://github.com/flame/blis/releases) page.
-```bash
+```
 $ ./configure --prefix=$HOME/blis haswell
 configure: using 'gcc' compiler.
 configure: found gcc version 5.4.0 (maj: 5, min: 4, rev: 0).
@@ -174,17 +174,17 @@ configure: creating makefile fragments in ./frame
 configure: configured to build within top-level directory of source distribution.
 ```
 The installation prefix can be specified via the `--prefix=PREFIX` option:
-```bash
+```
 $ ./configure --prefix=/usr <configname>
 ```
 This will cause libraries to eventually be installed (via `make install`) to `PREFIX/lib` and development headers to be installed to `PREFIX/include`. (The default value of `PREFIX` is `/usr/local`.) You can also specify the library install directory separately from the development header install directory with the `--libdir=LIBDIR` and `--includedir=INCDIR` options, respectively:
-```bash
+```
 $ ./configure --libdir=/usr/lib --includedir=/usr/include <configname>
 ```
 The `--libdir=LIBDIR` and `--includedir=INCDIR` options will override any path implied by `PREFIX`, whether it was specified explicitly via `--prefix` or implicitly (via the default). That is, `LIBDIR` defaults to `EXECPREFIX/lib` (where `EXECPREFIX`, set via `--exec-prefix=EXECPREFIX`, defaults to `PREFIX`) and `INCDIR` defaults to `PREFIX/include`, but `LIBDIR` and `INCDIR` will each be overriden by their respective `--libdir`/`--includedir` options. There is a third related option, `--sharedir=SHAREDIR`, where `SHAREDIR` defaults to `PREFIX/share`. This option specifies the installation directory for certain makefile fragments that contain variables determined by `configure` (e.g. `CC`, `CFLAGS`, `LDFLAGS`, etc.). These files allow certain BLIS makefiles, such as those in the `examples` or `testsuite` directories, to operate on an installed copy of BLIS rather than a local (and possibly uninstalled) copy.
 
 For a complete list of supported `configure` options and arguments, run `configure` with the `-h` option:
-```bash
+```
 $ ./configure -h
 ```
 The output from this invocation of `configure` should give you an up-to-date list of options and their descriptions.
@@ -192,7 +192,7 @@ The output from this invocation of `configure` should give you an up-to-date lis
 ## Step 3: Compilation
 
 Once `configure` is finished, you are ready to instantiate (compile) BLIS into a library by running `make`. Running `make` will result in output similar to:
-```bash
+```
 $ make
 Generating monolithic blis.h.........................................................
 .....................................................................................
@@ -209,11 +209,11 @@ Compiling obj/haswell/kernels/zen/1/bli_dotv_zen_int.o ('haswell' CFLAGS for ker
 Compiling obj/haswell/kernels/zen/1/bli_dotv_zen_int10.o ('haswell' CFLAGS for kernels)
 ```
 If you want to see the individual command line invocations of the compiler, you can run `make` as follows:
-```bash
+```
 $ make V=1
 ```
 Also, if you are compiling on a multicore system, you can get parallelism via:
-```bash
+```
 $ make -j<n>
 ```
 where `<n>` is the number of jobs `make` is allowed to run simultaneously. Generally, you should limit `<n>` to p+1, where p is the number of processor cores on your system.
@@ -236,7 +236,7 @@ The archiver and/or linker should no longer choke when creating the libraries.
 ## Step 3b: Testing (optional)
 
 If you would like to run some ready-made tests that exercise BLIS in a number of ways, including through its BLAS compatibility layer, run `make check`:
-```bash
+```
 $ make check
 ```
 Watch the output near the end. You should see the following messages, though not necessarily in immediate succession:
@@ -263,7 +263,7 @@ Archiving lib/haswell/libblis.a
 Dynamically linking lib/haswell/libblis.so
 ```
 Now you have a BLIS library (in static and shared forms) residing in the `lib/<configname>/` directory. To install the libraries and the header files associated with it, simply execute:
-```bash
+```
 $ make install
 ```
 This installs copies of the libraries and header files, and also creates conventional symbolic links of shared libraries:
@@ -275,7 +275,7 @@ Installing symlink libblis.so.0 into /u/field/blis/lib/
 Installing blis.h into /u/field/blis/include/blis/
 ```
 This results in your `PREFIX` directory looking like:
-```bash
+```
 # Check the contents of 'PREFIX'.
 $ ls -l $HOME/blis
 drwxr-xr-x 3 field dept 4096 May 10 17:36 include
@@ -296,14 +296,14 @@ lrwxrwxrwx 1 field dept      16 May 10 17:42 libblis.so.0 -> libblis.so.0.0.0
 ## Cleaning out build products
 
 If you want to remove various build products, you can use one of the `make` targets already defined for you in the BLIS Makefile:
-```bash
+```
 $ make clean
 Removing flattened header files from ./include/haswell.
 Removing object files from ./obj/haswell.
 Removing libraries from ./lib/haswell.
 ```
 Executing the `clean` target will remove all binary object files and library builds from the `obj` and `lib` directories, as well as any flattened header files. Any other configurations' build products are left untouched.
-```bash
+```
 $ make cleanmk
 Removing makefile fragments from ./config.
 Removing makefile fragments from ./frame.
@@ -311,7 +311,7 @@ Removing makefile fragments from ./ref_kernels.
 Removing makefile fragments from ./kernels.
 ```
 The `cleanmk` target results in removal of all makefile fragments from the framework source tree. (Makefile fragments are named `.fragment.mk` and are generated at configure-time.)
-```bash
+```
 $ make distclean
 Removing makefile fragments from ./config.
 Removing makefile fragments from ./frame.
@@ -357,7 +357,7 @@ If the BLAS compatibility layer was enabled at configure-time (as it is by defau
 ### Disabling BLAS prototypes
 
 Some applications already `#include` a header that contains BLAS prototypes. This can cause problems if those applications also try to `#include` the BLIS header file, as shown above. Suppose for a moment that `otherstuff.h` in the example above already provides BLAS prototypes.
-```bash
+```
 $ gcc -I/path/to/blis -I/path/to/otherstuff -c main.c -o main.o
 In file included from main.c:41:0:
 /path/to/blis/blis.h:36900:111: error: conflicting declaration of C function ‘int xerbla_(const bla_character*, const bla_integer*, ftnlen)’
@@ -413,7 +413,7 @@ The makefile shown above a very simple example. If you need help linking your ap
 ## Uninstalling
 
 If you decide that you want to uninstall BLIS, simply run `make uninstall`
-```bash
+```
 $ make uninstall
 Uninstalling libraries libblis.a libblis.so.0.0.0 from /u/field/blis/lib/.
 Uninstalling symlinks libblis.so libblis.so.0 from /u/field/blis/lib/.

From 26e4b6b29312b472c3cadf95ccdf5240764777f4 Mon Sep 17 00:00:00 2001
From: Dipal M Zambare <71366780+dzambare@users.noreply.github.com>
Date: Thu, 18 Nov 2021 00:32:00 +0530
Subject: [PATCH 007/230] Added support for AMD's Zen3 microarchitecture.

Details:
- Added a new 'zen3' subconfiguration targeting support for the AMD Zen3
  microarchitecture (#561). Thanks to AMD for this contribution.
- Restructured clang and AOCC support for zen, zen2, and zen3
  make_defs.mk files. The clang and AOCC version detection now happens
  in configure, not in the subconfigurations' makefile fragments. That
  is, we've added logic to configure that detects the version of
  clang/AOCC, outputs an appropriate variable to config.mk
  (i.e., CLANG_OT_*, AOCC_OT_*), and then checks for it within the
  makefile fragment (as is currently done for the GCC_OT_* variables).
- Added configure support for a GCC_OT_10_1_0 variable (and associated
  substitution anchor) to communicate whether the gcc version is older
  than 10.1.0, and use this variable to check for recent enough versions
  of gcc to use -march=znver3 in the zen3 subconfig.
- Inlined the contents of config/zen/amd_config.mk into the zen and zen2
  make_defs.mk so that the files are self-contained, harmonizing the
  format of all three Zen-based subconfigurations' make_defs.mk files.
- Added indenting (with spaces) of GNU make conditionals for easier
  reading in zen, zen2, and zen3 make_defs.mk files.
- Adjusted the range of models checked by bli_cpuid_is_zen() (which was
  previously 0x00 ~ 0xff and is now 0x00 ~ 0x2f) so that it is
  completely disjoint from the models checked by bli_cpuid_is_zen2()
  (0x30 ~ 0xff). This is normally necessary because Zen and Zen2
  microarchitectures share the same family (23, or 0x17), and so the
  model code is the only way to differentiate the two. But in our case,
  fixing the model range for zen *wasn't* actually necessary since we
  checked for zen2 first, and therefore the wide zen range acted like
  the 'else' of an 'if-else' statement. That said, the change helps
  improve clarity for the reader by encoding useful knowledge, which
  was obtained from https://en.wikichip.org/wiki/amd/cpuid .
- Added zen2.def and zen3.def files to the collection in travis/cpuid.
  Note that support for zen, zen2, and zen3 is now present, and while
  all three microarchitectures have identical instruction sets from
  the perspective of BLIS microkernels, they each correspond to
  different subconfigurations and therefore merit separate testing.
  Thanks to Devin Matthews for his guidance in hacking these files as
  slight modifications of zen.def.
- Enabled testing of zen2 and zen3 via the SDE in travis/do_sde.sh.
  Now, zen, zen2, and zen3 are tested through the SDE via Travis CI
  builds.
- Updated travis/do_sde.sh to grab the SDE tarball from a new ci-utils
  repository on GitHub rather than on Intel's website. This change was
  made in an attempt to circumvent recent troubles with Travis CI not
  being able to download the SDE directly from Intel's website via curl.
  Thanks to Devin Matthews for suggesting the idea.
- Updated travis/do_sde.sh to grab the latest version (8.69.1) of the
  Intel SDE from the flame/ci-utils repository.
- Updated .travis.yml to use gcc 9. The file was previously using gcc 8,
  which did not support -march=znver2.
- Created amd64_legacy umbrella family in config_registry for targeting
  older (bulldozer, piledriver, steamroller, and excavator)
  microarchitectures and moved those same subconfigs out of the amd64
  umbrella family. However, x86_64 retains amd64_legacy as a constituent
  member.
- Fixed a bug in configure related to the building of the so-called
  config list. When processing the contents of config_registry,
  configure creates a series of structures and lists that allow for
  various mappings related to configuration families, subconfigs, and
  kernel sets. Two of those lists are built via substitution of
  umbrella families with their subconfig members, and one of those
  lists was improperly performing the substitution in a way that would
  erroneously match on partial umbrella family names. That code was
  changed to match the code that was already doing the substitution
  properly, via substitute_words(). Also added comments noting the
  importance of using substitute_words() in both instances.
- Comment updates.
---
 .travis.yml                                   |   8 +-
 build/config.mk.in                            |   5 +
 config/amd64/bli_family_amd64.h               |  19 +-
 config/amd64/make_defs.mk                     |  29 +-
 config/amd64_legacy/bli_family_amd64_legacy.h |  42 +++
 config/amd64_legacy/make_defs.mk              |  70 ++++
 config/zen/make_defs.mk                       |  67 ++--
 config/zen/make_defs.mk.old                   |  84 +++++
 config/zen2/make_defs.mk                      |  81 +++--
 config/zen2/make_defs.mk.old                  |  94 ++++++
 config/zen3/bli_cntx_init_zen3.c              | 298 ++++++++++++++++++
 config/zen3/bli_family_zen3.h                 |  94 ++++++
 config/zen3/make_defs.mk                      | 113 +++++++
 config/zen3/make_defs.mk.old                  | 137 ++++++++
 config_registry                               |  12 +-
 configure                                     | 128 +++++++-
 frame/base/bli_arch.c                         |   8 +-
 frame/base/bli_cpuid.c                        |  52 ++-
 frame/base/bli_cpuid.h                        |   3 +-
 frame/base/bli_error.h                        |   1 +
 frame/base/bli_gks.c                          |   7 +-
 frame/include/bli_arch_config.h               |  12 +
 frame/include/bli_type_defs.h                 |   1 +
 kernels/zen3/.gitignore                       |   4 +
 travis/cpuid/zen2.def                         |  87 +++++
 travis/cpuid/zen3.def                         |  87 +++++
 travis/do_sde.sh                              |   4 +-
 27 files changed, 1423 insertions(+), 124 deletions(-)
 create mode 100644 config/amd64_legacy/bli_family_amd64_legacy.h
 create mode 100644 config/amd64_legacy/make_defs.mk
 create mode 100644 config/zen/make_defs.mk.old
 create mode 100644 config/zen2/make_defs.mk.old
 create mode 100644 config/zen3/bli_cntx_init_zen3.c
 create mode 100644 config/zen3/bli_family_zen3.h
 create mode 100644 config/zen3/make_defs.mk
 create mode 100644 config/zen3/make_defs.mk.old
 create mode 100644 kernels/zen3/.gitignore
 create mode 100644 travis/cpuid/zen2.def
 create mode 100644 travis/cpuid/zen3.def

diff --git a/.travis.yml b/.travis.yml
index 555e9a11a..6603ca2f3 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,17 +12,17 @@ matrix:
   - os: linux
     compiler: gcc
     env: OOT=1 TEST=ALL SDE=1 THR="none" CONF="x86_64" \
-      PACKAGES="gcc-8 binutils"
+      PACKAGES="gcc-9 binutils"
   # openmp build
   - os: linux
     compiler: gcc
     env: OOT=0 TEST=FAST SDE=0 THR="openmp" CONF="auto" \
-      PACKAGES="gcc-8 binutils"
+      PACKAGES="gcc-9 binutils"
   # pthreads build
   - os: linux
     compiler: gcc
     env: OOT=0 TEST=FAST SDE=0 THR="pthreads" CONF="auto" \
-      PACKAGES="gcc-8 binutils"
+      PACKAGES="gcc-9 binutils"
   # clang build
   - os: linux
     compiler: clang
@@ -63,7 +63,7 @@ matrix:
       PACKAGES="gcc-10-aarch64-linux-gnu g++-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \
       TESTSUITE_WRAPPER="qemu-aarch64 -cpu max,sve=true,sve512=true -L /usr/aarch64-linux-gnu/"
 install:
-- if [ "$CC" = "gcc"  ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-8"; fi
+- if [ "$CC" = "gcc"  ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-9"; fi
 - if [ -n "$PACKAGES" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y $PACKAGES; fi
 script:
 - export DIST_PATH=.
diff --git a/build/config.mk.in b/build/config.mk.in
index 7533d1acb..1032ce8e7 100644
--- a/build/config.mk.in
+++ b/build/config.mk.in
@@ -93,6 +93,11 @@ CC                := @CC@
 GCC_OT_4_9_0      := @gcc_older_than_4_9_0@
 GCC_OT_6_1_0      := @gcc_older_than_6_1_0@
 GCC_OT_9_1_0      := @gcc_older_than_9_1_0@
+GCC_OT_10_1_0     := @gcc_older_than_10_1_0@
+CLANG_OT_9_0_0    := @clang_older_than_9_0_0@
+CLANG_OT_12_0_0   := @clang_older_than_12_0_0@
+AOCC_OT_2_0_0     := @aocc_older_than_2_0_0@
+AOCC_OT_3_0_0     := @aocc_older_than_3_0_0@
 
 # The C++ compiler. NOTE: A C++ is typically not needed.
 CXX               := @CXX@
diff --git a/config/amd64/bli_family_amd64.h b/config/amd64/bli_family_amd64.h
index 278c22818..ac10789aa 100644
--- a/config/amd64/bli_family_amd64.h
+++ b/config/amd64/bli_family_amd64.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,15 +32,14 @@
 
 */
 
-//#ifndef BLIS_FAMILY_H
-//#define BLIS_FAMILY_H
+#ifndef BLIS_FAMILY_AMD64_H
+#define BLIS_FAMILY_AMD64_H
 
+// Enable framework optimizations for EPYC family processors.
+// With this macro defined, we can call kernels directly from
+// BLAS interfaces for levels 1 & 2.
+// This macro needs to be defined for all EPYC configurations.
+#define BLIS_CONFIG_EPYC
 
-// -- MEMORY ALLOCATION --------------------------------------------------------
-
-#define BLIS_SIMD_ALIGN_SIZE 16
-
-
-
-//#endif
+#endif
 
diff --git a/config/amd64/make_defs.mk b/config/amd64/make_defs.mk
index b9232ac6c..ebb7a569f 100644
--- a/config/amd64/make_defs.mk
+++ b/config/amd64/make_defs.mk
@@ -1,10 +1,10 @@
 #
 #
-#  BLIS    
+#  BLIS
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
 #
-#  Copyright (C) 2014, The University of Texas at Austin
+#  Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -60,29 +60,8 @@ else
 COPTFLAGS      := -O2
 endif
 
-# Flags specific to optimized kernels.
-CKOPTFLAGS     := $(COPTFLAGS) -O3
-ifeq ($(CC_VENDOR),gcc)
-CKVECFLAGS     := -mfpmath=sse -mavx -mfma -march=bdver2
-else
-ifeq ($(CC_VENDOR),clang)
-CKVECFLAGS     := -mfpmath=sse -mavx -mfma -march=bdver2
-else
-$(error gcc or clang are required for this configuration.)
-endif
-endif
-
-# Flags specific to reference kernels.
-CROPTFLAGS     := $(CKOPTFLAGS)
-ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
-else
-ifeq ($(CC_VENDOR),clang)
-CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
-else
-CRVECFLAGS     := $(CKVECFLAGS)
-endif
-endif
+# Setting for reference and optimized kernels are taken from individual
+# subconfiguration makefile fragments in this family.
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/amd64_legacy/bli_family_amd64_legacy.h b/config/amd64_legacy/bli_family_amd64_legacy.h
new file mode 100644
index 000000000..c4f84885f
--- /dev/null
+++ b/config/amd64_legacy/bli_family_amd64_legacy.h
@@ -0,0 +1,42 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_FAMILY_AMD64_LEGACY_H
+#define BLIS_FAMILY_AMD64_LEGACY_H
+
+// Placeholder for bundle configuration.
+
+#endif
+
diff --git a/config/amd64_legacy/make_defs.mk b/config/amd64_legacy/make_defs.mk
new file mode 100644
index 000000000..37ccbdae2
--- /dev/null
+++ b/config/amd64_legacy/make_defs.mk
@@ -0,0 +1,70 @@
+#
+#
+#  BLIS    
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#  Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+
+# Declare the name of the current configuration and add it to the
+# running list of configurations included by common.mk.
+THIS_CONFIG    := amd64_legacy
+#CONFIGS_INCL   += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    :=
+CMISCFLAGS     :=
+CPICFLAGS      :=
+CWARNFLAGS     :=
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O2
+endif
+
+# Settings for reference and optimized kernels are taken from individual
+# subconfiguration makefile fragments in this family.
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
diff --git a/config/zen/make_defs.mk b/config/zen/make_defs.mk
index 8f975d5bc..8bdafd5ca 100644
--- a/config/zen/make_defs.mk
+++ b/config/zen/make_defs.mk
@@ -1,11 +1,10 @@
 #
 #
-#  BLIS    
+#  BLIS
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
 #
-#  Copyright (C) 2014, The University of Texas at Austin
-#  Copyright (C) 2019, Advanced Micro Devices, Inc.
+#  Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -33,9 +32,6 @@
 #
 #
 
-# FLAGS that are specific to the 'zen' architecture are added here.
-# FLAGS that are common for all the AMD architectures are present in
-# amd_config.mk.
 
 # Declare the name of the current configuration and add it to the
 # running list of configurations included by common.mk.
@@ -46,37 +42,50 @@ THIS_CONFIG    := zen
 # --- Determine the C compiler and related flags ---
 #
 
-# Include the file containing common flags for all AMD architectures.
-AMD_CONFIG_FILE := amd_config.mk
-AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
--include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    :=
+CMISCFLAGS     :=
+CPICFLAGS      :=
+CWARNFLAGS     :=
 
-ifeq ($(CC_VENDOR),gcc)
-# If gcc is older than 6.1.0, we must use -march=bdver4 and then remove the
-# Bulldozer instruction sets that were omitted from Zen.
-# Additionally, if gcc is 4.9 (clang 3.5?) or newer, we may want to add
-# Zen-specific instructions back into the mix:
-# -mclzero -madx -mrdseed -mmwaitx -msha -mxsavec -mxsaves -mclflushopt -mpopcnt
-ifeq ($(GCC_OT_6_1_0),yes)
-CRVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
-CKVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
 else
-# If gcc is at least 6.1.0, then we can specify the microarchitecture using
-# the preferred option.
-CRVECFLAGS += -march=znver1
-CKVECFLAGS += -march=znver1
+COPTFLAGS      := -O2 -fomit-frame-pointer
 endif
+
+# Flags specific to optimized and reference kernels.
+# NOTE: The -fomit-frame-pointer option is needed for some kernels because
+# they make explicit use of the rbp register.
+CKOPTFLAGS         := $(COPTFLAGS) -O3
+CROPTFLAGS         := $(CKOPTFLAGS)
+CKVECFLAGS         := -mavx2 -mfma -mfpmath=sse
+CRVECFLAGS         := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+ifeq ($(CC_VENDOR),gcc)
+  ifeq ($(GCC_OT_6_1_0),yes)  # gcc versions older than 6.1.
+    CVECFLAGS_VER  := -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
+  else
+    CVECFLAGS_VER  := -march=znver1 -mno-avx256-split-unaligned-store
+  endif
 else
 ifeq ($(CC_VENDOR),clang)
-# I couldn't find which versions of clang added support for -march=znver1,
-# so we don't even bother attempting the differentiation that appears in the
-# gcc branch above.
-CRVECFLAGS += -march=znver1
-CKVECFLAGS += -march=znver1
+  CVECFLAGS_VER    := -march=znver1
+else
+ifeq ($(CC_VENDOR),aocc)
+  CVECFLAGS_VER    := -march=znver1 -mllvm -disable-licm-vrp
 else
-$(error gcc or clang are required for this configuration.)
+  $(error gcc, clang, or aocc is required for this configuration.)
+endif
 endif
 endif
+CKVECFLAGS         += $(CVECFLAGS_VER)
+CRVECFLAGS         += $(CVECFLAGS_VER)
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/zen/make_defs.mk.old b/config/zen/make_defs.mk.old
new file mode 100644
index 000000000..44c2ad18d
--- /dev/null
+++ b/config/zen/make_defs.mk.old
@@ -0,0 +1,84 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#  Copyright (C) 2019, Advanced Micro Devices, Inc.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+# FLAGS that are specific to the 'zen' architecture are added here.
+# FLAGS that are common for all the AMD architectures are present in
+# amd_config.mk.
+
+# Declare the name of the current configuration and add it to the
+# running list of configurations included by common.mk.
+THIS_CONFIG    := zen
+#CONFIGS_INCL   += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# Include the file containing common flags for all AMD architectures.
+AMD_CONFIG_FILE := amd_config.mk
+AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
+-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
+
+ifeq ($(CC_VENDOR),gcc)
+# If gcc is older than 6.1.0, we must use -march=bdver4 and then remove the
+# Bulldozer instruction sets that were omitted from Zen.
+# Additionally, if gcc is 4.9 (clang 3.5?) or newer, we may want to add
+# Zen-specific instructions back into the mix:
+# -mclzero -madx -mrdseed -mmwaitx -msha -mxsavec -mxsaves -mclflushopt -mpopcnt
+ifeq ($(GCC_OT_6_1_0),yes)
+CRVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
+CKVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
+else
+# If gcc is at least 6.1.0, then we can specify the microarchitecture using
+# the preferred option.
+CRVECFLAGS += -march=znver1
+CKVECFLAGS += -march=znver1
+endif
+else
+ifeq ($(CC_VENDOR),clang)
+# I couldn't find which versions of clang added support for -march=znver1,
+# so we don't even bother attempting the differentiation that appears in the
+# gcc branch above.
+CRVECFLAGS += -march=znver1
+CKVECFLAGS += -march=znver1
+else
+$(error gcc or clang are required for this configuration.)
+endif
+endif
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
diff --git a/config/zen2/make_defs.mk b/config/zen2/make_defs.mk
index 7d3ccb4bf..c14b8cba0 100644
--- a/config/zen2/make_defs.mk
+++ b/config/zen2/make_defs.mk
@@ -1,11 +1,10 @@
 #
 #
-#  BLIS    
+#  BLIS
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
 #
-#  Copyright (C) 2014, The University of Texas at Austin
-#  Copyright (C) 2019, Advanced Micro Devices, Inc.
+#  Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -33,9 +32,6 @@
 #
 #
 
-# FLAGS that are specific to the 'zen2' architecture are added here.
-# FLAGS that are common for all the AMD architectures are present in
-# config/zen/amd_config.mk.
 
 # Declare the name of the current configuration and add it to the
 # running list of configurations included by common.mk.
@@ -46,41 +42,62 @@ THIS_CONFIG    := zen2
 # --- Determine the C compiler and related flags ---
 #
 
-# Include file containing common flags for all AMD architectures.
-AMD_CONFIG_FILE := amd_config.mk
-AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
--include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    :=
+CMISCFLAGS     :=
+CPICFLAGS      :=
+CWARNFLAGS     :=
 
-ifeq ($(CC_VENDOR),gcc)
-ifeq ($(GCC_OT_9_1_0),yes)
-ifeq ($(GCC_OT_6_1_0),yes)
-# If gcc is older than 6.1.0, we must use -march=bdver4 and then remove the
-# Bulldozer instruction sets that were omitted from Zen.
-CRVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
-CKVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
-else
-# If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1
-# as the fallback option.
-CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
-CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
 endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
 else
-# If gcc is at least 9.1.0, then we can specify the microarchitecture using
-# the preferred option.
-CRVECFLAGS += -march=znver2
-CKVECFLAGS += -march=znver2
+COPTFLAGS      := -O2 -fomit-frame-pointer
 endif
+
+# Flags specific to optimized and reference kernels.
+# NOTE: The -fomit-frame-pointer option is needed for some kernels because
+# they make explicit use of the rbp register.
+CKOPTFLAGS         := $(COPTFLAGS) -O3
+CROPTFLAGS         := $(CKOPTFLAGS)
+CKVECFLAGS         := -mavx2 -mfma -mfpmath=sse
+CRVECFLAGS         := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+ifeq ($(CC_VENDOR),gcc)
+  ifeq ($(GCC_OT_6_1_0),yes)  # gcc versions older than 6.1.
+    CVECFLAGS_VER  := -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
+  else
+  ifeq ($(GCC_OT_9_1_0),yes)  # gcc versions 6.1 or newer, but older than 9.1.
+    CVECFLAGS_VER  := -march=znver1 -mno-avx256-split-unaligned-store
+  else                        # gcc versions 9.1 or newer.
+    CVECFLAGS_VER  := -march=znver2
+  endif
+  endif
 else
 ifeq ($(CC_VENDOR),clang)
-# I couldn't find which versions of clang added support for -march=znver1
-# or -march=znver2, so we don't even bother attempting the differentiation
-# that appears in the gcc branch above.
-CRVECFLAGS += -march=znver1
-CKVECFLAGS += -march=znver1
+  ifeq ($(CLANG_OT_9_0_0),yes)  # clang versions older than 9.0.
+    CVECFLAGS_VER  := -march=znver1
+  else                          # clang versions 9.0 or newer.
+    CVECFLAGS_VER  := -march=znver2
+  endif
+else
+ifeq ($(CC_VENDOR),aocc)
+  ifeq ($(AOCC_OT_2_0_0),yes)   # aocc versions older than 2.0.
+    CVECFLAGS_VER  := -march=znver1 -mllvm -disable-licm-vrp
+  else                          # aocc versions 2.0 or newer.
+    CVECFLAGS_VER  := -march=znver2
+  endif
 else
-$(error gcc or clang are required for this configuration.)
+  $(error gcc, clang, or aocc is required for this configuration.)
+endif
 endif
 endif
+CKVECFLAGS         += $(CVECFLAGS_VER)
+CRVECFLAGS         += $(CVECFLAGS_VER)
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/zen2/make_defs.mk.old b/config/zen2/make_defs.mk.old
new file mode 100644
index 000000000..9f0370376
--- /dev/null
+++ b/config/zen2/make_defs.mk.old
@@ -0,0 +1,94 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#  Copyright (C) 2019, Advanced Micro Devices, Inc.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+# FLAGS that are specific to the 'zen2' architecture are added here.
+# FLAGS that are common for all the AMD architectures are present in
+# config/zen/amd_config.mk.
+
+# Declare the name of the current configuration and add it to the
+# running list of configurations included by common.mk.
+THIS_CONFIG    := zen2
+#CONFIGS_INCL   += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# Include file containing common flags for all AMD architectures.
+AMD_CONFIG_FILE := amd_config.mk
+AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
+-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
+
+ifeq ($(CC_VENDOR),gcc)
+  ifeq ($(GCC_OT_9_1_0),yes)
+    ifeq ($(GCC_OT_6_1_0),yes)
+    # If gcc is older than 6.1.0, we must use -march=bdver4 and then remove the
+    # Bulldozer instruction sets that were omitted from Zen.
+    CRVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
+    CKVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
+    else
+    # If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1
+    # as the fallback option.
+    CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
+    CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
+    endif
+  else
+  # If gcc is at least 9.1.0, then we can specify the microarchitecture using
+  # the preferred option.
+  CRVECFLAGS += -march=znver2
+  CKVECFLAGS += -march=znver2
+  endif
+  else
+  ifeq ($(CC_VENDOR),clang)
+    ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1)
+    CKVECFLAGS += -march=znver2
+    else
+    #if compiling with clang
+    VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*'))
+    CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1))
+    #clang 9.0 or later:
+    ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0)
+    CKVECFLAGS += -march=znver2
+    else
+    CKVECFLAGS += -march=znver1
+    endif # ge 9
+    endif # AOCC 2
+  endif # Clang
+endif # gcc
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c
new file mode 100644
index 000000000..b5bbb05ed
--- /dev/null
+++ b/config/zen3/bli_cntx_init_zen3.c
@@ -0,0 +1,298 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_cntx_init_zen3( cntx_t* cntx )
+{
+	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
+	blksz_t thresh[ BLIS_NUM_THRESH ];
+
+	// Set default kernel blocksizes and functions.
+	bli_cntx_init_zen3_ref( cntx );
+
+	// -------------------------------------------------------------------------
+
+	// Update the context with optimized native gemm micro-kernels and
+	// their storage preferences.
+	bli_cntx_set_l3_nat_ukrs
+	(
+	  8,
+	  // gemm
+	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,       TRUE,
+	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,        TRUE,
+	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,        TRUE,
+	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,        TRUE,
+
+	  // gemmtrsm_l
+	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_haswell_asm_6x8,  TRUE,
+
+	  // gemmtrsm_u
+	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,  TRUE,
+	  cntx
+	);
+
+#if 0
+	// AMD: This will be enabled in other PRs.
+	// packm kernels
+	bli_cntx_set_packm_kers
+	(
+	  2,
+	  BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen,
+	  BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen,
+	  cntx
+	);
+#else
+	// Update the context with optimized packm kernels.
+	bli_cntx_set_packm_kers
+	(
+	  8,
+	  BLIS_PACKM_6XK_KER,  BLIS_FLOAT,    bli_spackm_haswell_asm_6xk,
+	  BLIS_PACKM_16XK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_16xk,
+	  BLIS_PACKM_6XK_KER,  BLIS_DOUBLE,   bli_dpackm_haswell_asm_6xk,
+	  BLIS_PACKM_8XK_KER,  BLIS_DOUBLE,   bli_dpackm_haswell_asm_8xk,
+	  BLIS_PACKM_3XK_KER,  BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
+	  BLIS_PACKM_8XK_KER,  BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
+	  BLIS_PACKM_3XK_KER,  BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
+	  BLIS_PACKM_4XK_KER,  BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
+	  cntx
+	);
+#endif
+
+	// Update the context with optimized level-1f kernels.
+	bli_cntx_set_l1f_kers
+	(
+	  4,
+	  // axpyf
+	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_5,
+	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_5,
+	  // dotxf
+	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
+	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
+	  cntx
+	);
+
+	// Update the context with optimized level-1v kernels.
+	bli_cntx_set_l1v_kers
+	(
+	  16,
+
+	  // amaxv
+	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
+	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
+
+	  // axpyv
+
+	  // axpyv
+	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int10,
+	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int10,
+
+	  // dotv
+	  BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int10,
+	  BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int10,
+
+	  // dotxv
+	  BLIS_DOTXV_KER,  BLIS_FLOAT,  bli_sdotxv_zen_int,
+	  BLIS_DOTXV_KER,  BLIS_DOUBLE, bli_ddotxv_zen_int,
+
+	  // scalv
+	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
+	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
+
+	  // swapv
+	  BLIS_SWAPV_KER, BLIS_FLOAT,   bli_sswapv_zen_int8,
+	  BLIS_SWAPV_KER, BLIS_DOUBLE,  bli_dswapv_zen_int8,
+
+	  // copyv
+	  BLIS_COPYV_KER,  BLIS_FLOAT,  bli_scopyv_zen_int,
+	  BLIS_COPYV_KER,  BLIS_DOUBLE, bli_dcopyv_zen_int,
+
+	  // setv
+	  BLIS_SETV_KER,  BLIS_FLOAT,  bli_ssetv_zen_int,
+	  BLIS_SETV_KER,  BLIS_DOUBLE, bli_dsetv_zen_int,
+
+	  cntx
+	);
+
+	// Initialize level-3 blocksize objects with architecture-specific values.
+	//
+	// These are reference block sizes and may be overridden based on
+	// number of threads used at runtime.
+	//                                           s      d      c      z
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     6,     6,     3,     3 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,     8,     4 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    72,    72,    36 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   256,   256,   256 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,  4080,  4080,  4080 );
+
+	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     5,     5,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,    -1,    -1 );
+
+	// Update the context with the current architecture's register and cache
+	// blocksizes (and multiples) for native execution.
+	bli_cntx_set_blkszs
+	(
+	  BLIS_NAT, 7,
+	  // level-3
+	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
+	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
+	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
+	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
+	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+	  // level-1f
+	  BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
+	  BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
+	  cntx
+	);
+
+// -------------------------------------------------------------------------
+
+	// Initialize sup thresholds with architecture-appropriate values.
+	//                                          s     d     c     z
+	bli_blksz_init_easy( &thresh[ BLIS_MT ],  512,  256,   -1,   -1 );
+	bli_blksz_init_easy( &thresh[ BLIS_NT ],  200,  256,   -1,   -1 );
+	bli_blksz_init_easy( &thresh[ BLIS_KT ],  240,  220,   -1,   -1 );
+
+	// Initialize the context with the sup thresholds.
+	bli_cntx_set_l3_sup_thresh
+	(
+	  3,
+	  BLIS_MT, &thresh[ BLIS_MT ],
+	  BLIS_NT, &thresh[ BLIS_NT ],
+	  BLIS_KT, &thresh[ BLIS_KT ],
+	  cntx
+	);
+
+#if 0
+	// Initialize the context with the sup handlers.
+	bli_cntx_set_l3_sup_handlers
+	(
+	  2,
+	  BLIS_GEMM, bli_gemmsup_ref,
+	  BLIS_GEMMT, bli_gemmtsup_ref,
+	  cntx
+	);
+#endif
+
+#if 0
+	// AMD: This should be enabled in the PR which has added these kernels
+	// Update the context with optimized small/unpacked gemm kernels.
+	bli_cntx_set_l3_sup_kers
+	(
+	  28,
+	  //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
+	  BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
+	  BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
+	  BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
+	  BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
+	  BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
+	  BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
+	  BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
+	  BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
+	  BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
+	  BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
+	  BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
+	  BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
+	  BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
+	  BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
+	  BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
+	  BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
+	  BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
+	  BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
+	  BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
+	  BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
+	  BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
+	  BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
+	  BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+	  BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+	  BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+	  BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
+	  BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
+	  BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
+	  cntx
+	);
+#else
+	// Update the context with optimized small/unpacked gemm kernels.
+	bli_cntx_set_l3_sup_kers
+	(
+	  16,
+	  //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
+	  BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
+	  BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
+	  BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
+	  BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
+	  BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
+	  BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
+	  BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
+	  BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
+
+	  BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
+	  BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
+	  BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
+	  BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
+	  BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
+	  BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
+	  BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
+	  BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
+	  cntx
+	);
+	
+#endif
+	
+	// Initialize level-3 sup blocksize objects with architecture-specific
+	// values.
+	//                                           s      d      c      z
+	bli_blksz_init     ( &blkszs[ BLIS_MR ],     6,     6,     3,     3,
+	                                             9,     9,     3,     3 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,     8,     4 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    72,    72,    36 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   512,   256,   128,    64 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  8160,  4080,  2040,  1020 );
+
+	// Update the context with the current architecture's register and cache
+	// blocksizes for small/unpacked level-3 problems.
+	bli_cntx_set_l3_sup_blkszs
+	(
+	  5,
+	  BLIS_NC, &blkszs[ BLIS_NC ],
+	  BLIS_KC, &blkszs[ BLIS_KC ],
+	  BLIS_MC, &blkszs[ BLIS_MC ],
+	  BLIS_NR, &blkszs[ BLIS_NR ],
+	  BLIS_MR, &blkszs[ BLIS_MR ],
+	  cntx
+	);
+}
+
diff --git a/config/zen3/bli_family_zen3.h b/config/zen3/bli_family_zen3.h
new file mode 100644
index 000000000..918e919ae
--- /dev/null
+++ b/config/zen3/bli_family_zen3.h
@@ -0,0 +1,94 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLI_FAMILY_ZEN3_
+#define BLI_FAMILY_ZEN3_
+
+// By default, it is effective to parallelize the outer loops.
+// Setting these macros to 1 will force JR and IR inner loops
+// to not be parallelized.
+//
+
+#define BLIS_THREAD_MAX_IR      1
+#define BLIS_THREAD_MAX_JR      1
+
+
+// To enable framework optimizations for zen3 platform
+// All zen3 specific code should be included in this macro
+#define BLIS_CONFIG_ZEN3
+
+// To enable framework optimizations for zen3 platform
+// All zen3 specific code should be included in this macro
+#define BLIS_CONFIG_ZEN3
+
+#define BLIS_ENABLE_SMALL_MATRIX
+#define BLIS_ENABLE_SMALL_MATRIX_TRSM
+
+
+// This will select the threshold below which small matrix code will be called.
+#define BLIS_SMALL_MATRIX_THRES        700
+#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
+#define BLIS_SMALL_K_RECT_MATRIX_THRES 128
+
+#define BLIS_SMALL_MATRIX_THRES_TRSM   32768 //128(128+128) => m*(m+n)
+#define BLIS_SMALL_MATRIX_A_THRES_TRSM  128
+#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK    96
+#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK    128
+
+#define BLIS_ENABLE_SMALL_MATRIX_ROME
+#define BLIS_SMALL_MATRIX_THRES_ROME       400
+
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10
+
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130
+
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100
+
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30
+
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50
+  
+#endif
diff --git a/config/zen3/make_defs.mk b/config/zen3/make_defs.mk
new file mode 100644
index 000000000..5c68855db
--- /dev/null
+++ b/config/zen3/make_defs.mk
@@ -0,0 +1,113 @@
+#
+#
+#  BLIS    
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+
+# Declare the name of the current configuration and add it to the
+# running list of configurations included by common.mk.
+THIS_CONFIG    := zen3 
+#CONFIGS_INCL   += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    :=
+CMISCFLAGS     :=
+CPICFLAGS      :=
+CWARNFLAGS     :=
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O3
+endif
+
+# Flags specific to optimized and reference kernels.
+# NOTE: The -fomit-frame-pointer option is needed for some kernels because
+# they make explicit use of the rbp register.
+CKOPTFLAGS         := $(COPTFLAGS) -fomit-frame-pointer
+CROPTFLAGS         := $(CKOPTFLAGS)
+CKVECFLAGS         := -mavx2 -mfma -mfpmath=sse
+CRVECFLAGS         := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+ifeq ($(CC_VENDOR),gcc)
+  ifeq ($(GCC_OT_9_1_0),yes)  # gcc versions older than 9.1.
+    CVECFLAGS_VER  := -march=znver1 -mno-avx256-split-unaligned-store
+  else
+  ifeq ($(GCC_OT_10_1_0),yes) # gcc versions 9.1 or newer, but older than 10.1.
+    CVECFLAGS_VER  := -march=znver2
+  else                        # gcc versions 10.1 or newer.
+    CVECFLAGS_VER  := -march=znver3
+  endif
+  endif
+else
+ifeq ($(CC_VENDOR),clang)
+  ifeq ($(CLANG_OT_9_0_0),yes)  # clang versions older than 9.0.
+    CVECFLAGS_VER  := -march=znver1
+  else
+  ifeq ($(CLANG_OT_12_0_0),yes) # clang versions 9.0 or newer, but older than 12.0.
+    CVECFLAGS_VER  := -march=znver2
+  else                          # clang versions 12.0 or newer.
+    CVECFLAGS_VER  := -march=znver3
+  endif
+  endif
+else
+ifeq ($(CC_VENDOR),aocc)
+  ifeq ($(AOCC_OT_2_0_0),yes)   # aocc versions older than 2.0.
+    CVECFLAGS_VER  := -march=znver1
+  else
+  ifeq ($(AOCC_OT_3_0_0),yes)   # aocc versions 2.0 or newer, but older than 3.0.
+    CVECFLAGS_VER  := -march=znver2
+  else                          # aocc versions 3.0 or newer.
+    CVECFLAGS_VER  := -march=znver3
+  endif
+  endif
+else
+  $(error gcc, clang, or aocc is required for this configuration.)
+endif
+endif
+endif
+CKVECFLAGS         += $(CVECFLAGS_VER)
+CRVECFLAGS         += $(CVECFLAGS_VER)
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
diff --git a/config/zen3/make_defs.mk.old b/config/zen3/make_defs.mk.old
new file mode 100644
index 000000000..e0794ab0c
--- /dev/null
+++ b/config/zen3/make_defs.mk.old
@@ -0,0 +1,137 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+# FLAGS that are specific to the 'zen3' architecture are added here.
+# FLAGS that are common for all the AMD architectures are present in
+# config/zen/amd_config.mk.
+
+# Declare the name of the current configuration and add it to the
+# running list of configurations included by common.mk.
+THIS_CONFIG    := zen3
+#CONFIGS_INCL   += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    :=
+CMISCFLAGS     :=
+CPICFLAGS      :=
+CWARNFLAGS     :=
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+#frame pointers are needed for execution tracing
+ifeq ($(ETRACE_ENABLE),1)
+COPTFLAGS      := -O3
+else
+COPTFLAGS      := -O3 -fomit-frame-pointer
+endif
+endif
+
+
+#
+# --- Enable ETRACE across the library if enabled ETRACE_ENABLE=[0,1] -----------------------
+#
+
+ifeq ($(ETRACE_ENABLE),1)
+CDBGFLAGS += -pg -finstrument-functions -DAOCL_DTL_AUTO_TRACE_ENABLE
+LDFLAGS += -ldl
+endif
+
+# Flags specific to optimized kernels.
+CKOPTFLAGS     := $(COPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))
+#gcc or clang version must be at least 4.0
+# gcc 9.0 or later:
+ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
+CKVECFLAGS     += -march=znver2
+else
+# If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1
+# as the fallback option.
+CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
+CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
+endif
+else
+ifeq ($(CC_VENDOR),clang)
+
+# AOCC clang has various formats for the version line
+
+# AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19)
+# AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12)
+# AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0)
+# AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0)
+# AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0)
+
+# For our purpose we just want to know if it is version 2x or 3x
+
+# for version 3x we will enable znver3
+ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1)
+CKVECFLAGS += -march=znver3
+else
+# for version 2x we will enable znver2
+ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1)
+CKVECFLAGS += -march=znver2
+else
+#if compiling with clang
+VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*'))
+CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1))
+#clang 9.0 or later:
+ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0)
+CKVECFLAGS += -march=znver2
+else
+CKVECFLAGS += -march=znver1
+endif # ge 9
+endif # aocc 2
+endif # aocc 3
+endif # clang
+endif # gcc
+
+# Flags specific to reference kernels.
+CROPTFLAGS     := $(CKOPTFLAGS)
+CRVECFLAGS     := $(CKVECFLAGS)
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
diff --git a/config_registry b/config_registry
index bdd3d2228..d472325c7 100644
--- a/config_registry
+++ b/config_registry
@@ -8,11 +8,12 @@
 #
 
 # Processor families.
-x86_64:      intel64 amd64
-intel64:     skx knl haswell sandybridge penryn generic
-amd64:       zen2 zen excavator steamroller piledriver bulldozer generic
-arm64:       firestorm thunderx2 cortexa57 cortexa53 generic
-arm32:       cortexa15 cortexa9 generic
+x86_64:         intel64 amd64 amd64_legacy
+intel64:        skx knl haswell sandybridge penryn generic
+amd64_legacy:   excavator steamroller piledriver bulldozer generic
+amd64:          zen3 zen2 zen generic
+arm64:          firestorm thunderx2 cortexa57 cortexa53 generic
+arm32:          cortexa15 cortexa9 generic
 
 # Intel architectures.
 skx:         skx/skx/haswell/zen
@@ -22,6 +23,7 @@ sandybridge: sandybridge
 penryn:      penryn
 
 # AMD architectures.
+zen3:        zen3/zen3/zen2/zen/haswell
 zen2:        zen2/zen2/zen/haswell
 zen:         zen/zen/haswell
 excavator:   excavator/piledriver
diff --git a/configure b/configure
index 3c865dad9..447b0791e 100755
--- a/configure
+++ b/configure
@@ -1434,22 +1434,80 @@ get_compiler_version()
 	# The last part ({ read first rest ; echo $first ; }) is a workaround
 	# to OS X's egrep only returning the first match.
 	cc_vendor=$(echo "${vendor_string}" | egrep -o 'icc|gcc|clang|emcc|pnacl|IBM|oneAPI|crosstool-NG' | { read first rest ; echo $first ; })
+
+	# AOCC version strings contain both "clang" and "AOCC" substrings, and
+	# so we have to perform a follow-up check to make sure cc_vendor gets set
+	# correctly.
+	aocc_grep=$(echo "${vendor_string}" | grep 'AOCC')
+	if [ -n "${aocc_grep}" ]; then
+		cc_vendor="aocc"
+	fi
+
+	# Begin parsing cc_vendor for the version string.
+
 	if [ "${cc_vendor}" = "crosstool-NG" ]; then
 	     # Treat compilers built by crosstool-NG (for eg: conda) as gcc.
 	     cc_vendor="gcc"
 	fi
 	if [ "${cc_vendor}" = "icc" -o \
 	     "${cc_vendor}" = "gcc" ]; then
+
 		cc_version=$(${cc} -dumpversion)
-	# If compiler is AOCC, first grep for clang and then the version number.
+
 	elif [ "${cc_vendor}" = "clang" ]; then
-		cc_version=$(echo "${vendor_string}" | egrep -o '(clang|LLVM) version [0-9]+\.[0-9]+\.?[0-9]*' | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*')
+
+		cc_version=$(echo "${vendor_string}" \
+		             | egrep -o '(clang|LLVM) version [0-9]+\.[0-9]+\.?[0-9]*' \
+		             | egrep -o                      '[0-9]+\.[0-9]+\.?[0-9]*')
+
+	elif [ "${cc_vendor}" = "aocc" ]; then
+
+		aocc_ver21=$(echo "${vendor_string}" | grep 'AOCC.LLVM.2')
+
+		# Versions 2.0 and 2.1 had different version string formats from
+		# 2.2 and later, so we have to handle them separately.
+		# Examples:
+		# AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19)
+		# AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12)
+		# AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0)
+		# AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0)
+		# AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0)
+
+		if [ -n "${aocc_ver21}" ]; then
+
+			# Grep for the AOCC.LLVM.x.y.z substring first, and then isolate the
+			# version number. Also, the string may contain multiple instances of
+			# the version number, so only use the first occurrence.
+			cc_version=$(echo "${vendor_string}" \
+			             | egrep -o 'AOCC.LLVM.[0-9]+\.[0-9]+\.?[0-9]*' \
+			             | egrep -o           '[0-9]+\.[0-9]+\.?[0-9]*' \
+		                 | { read first rest ; echo $first ; })
+		else
+
+			# Grep for the AOCC_x.y.z substring first, and then isolate the
+			# version number. As of this writing, these version strings don't
+			# include multiple instances of the version, but we nonetheless
+			# take only the first occurrence as a future-oriented safety
+			# measure.
+			cc_version=$(echo "${vendor_string}" \
+			             | egrep -o 'AOCC_[0-9]+\.[0-9]+\.?[0-9]*' \
+			             | egrep -o      '[0-9]+\.[0-9]+\.?[0-9]*' \
+		                 | { read first rest ; echo $first ; })
+		fi
+
 	elif [ "${cc_vendor}" = "oneAPI" ]; then
+
 		# Treat Intel oneAPI's clang as clang, not icc.
 		cc_vendor="clang"
-		cc_version=$(echo "${vendor_string}" | egrep -o '[0-9]+\.[0-9]+\.[0-9]+\.?[0-9]*' | { read first rest ; echo ${first} ; })
+		cc_version=$(echo "${vendor_string}" \
+		             | egrep -o '[0-9]+\.[0-9]+\.[0-9]+\.?[0-9]*' \
+		             | { read first rest ; echo ${first} ; })
+
 	else
-		cc_version=$(echo "${vendor_string}" | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' | { read first rest ; echo ${first} ; })
+
+		cc_version=$(echo "${vendor_string}" \
+		             | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' \
+		             | { read first rest ; echo ${first} ; })
 	fi
 
 	# Parse the version number into its major, minor, and revision
@@ -1500,6 +1558,8 @@ check_compiler()
 	#   penryn: any
 	#
 	#   zen: gcc 6.0+[1], clang 4.0+
+	#   zen2: gcc 6.0+[1], clang 4.0+
+	#   zen3: gcc 6.0+[1], clang 4.0+
 	#   excavator: gcc 4.9+, clang 3.5+
 	#   steamroller: any
 	#   piledriver: any
@@ -1683,12 +1743,30 @@ check_compiler_version_ranges()
 	#   Newer versions of gcc support Zen2 via the '-march=znver2' option [6].
 	#
 	#   [5] https://gcc.gnu.org/onlinedocs/gcc-8.3.0/gcc/x86-Options.html#x86-Options
-	#   [6] https://gcc.gnu.org/onlinedocs/gcc-9.1.0/gcc/x86-Options.html#x86-Options
+	#   [6] https://gcc.gnu.org/onlinedocs/gcc-9.4.0/gcc/x86-Options.html#x86-Options
+	#
+	# range: gcc < 10.1 (ie: 9.4 or older)
+	# variable: gcc_older_than_10_1_0
+	# comments:
+	#   These older versions of gcc do not explicitly support the Zen3
+	#   microarchitecture; the newest microarchitectural value understood by
+	#   these versions is '-march=znver2' (if !gcc_older_than_9_1_0) [7].
+	#   Newer versions of gcc support Zen3 via the '-march=znver3' option [8].
+	#
+	#   [7] https://gcc.gnu.org/onlinedocs/gcc-9.4.0/gcc/x86-Options.html#x86-Options
+	#   [8] https://gcc.gnu.org/onlinedocs/gcc-10.3.0/gcc/x86-Options.html#x86-Options
 	#
 
 	gcc_older_than_4_9_0='no'
 	gcc_older_than_6_1_0='no'
 	gcc_older_than_9_1_0='no'
+	gcc_older_than_10_1_0='no'
+
+	clang_older_than_9_0_0='no'
+	clang_older_than_12_0_0='no'
+
+	aocc_older_than_2_0_0='no'
+	aocc_older_than_3_0_0='no'
 
 	echo "${script_name}: checking ${cc} ${cc_version} against known consequential version ranges."
 
@@ -1714,6 +1792,12 @@ check_compiler_version_ranges()
 			echo "${script_name}: note: found ${cc} version older than 9.1."
 			gcc_older_than_9_1_0='yes'
 		fi
+
+		# Check for gcc < 10.1.0 (ie: 9.4 or older).
+		if [ ${cc_major} -lt 10 ]; then
+			echo "${script_name}: note: found ${cc} version older than 10.1."
+			gcc_older_than_10_1_0='yes'
+		fi
 	fi
 
 	# icc
@@ -1723,7 +1807,34 @@ check_compiler_version_ranges()
 
 	# clang
 	if [ "x${cc_vendor}" = "xclang" ]; then
-		:
+
+		# Check for clang < 9.0.0.
+		if [ ${cc_major} -lt 9 ]; then
+			echo "${script_name}: note: found ${cc} version older than 9.0."
+			clang_older_than_9_0_0='yes'
+		fi
+
+		# Check for clang < 12.0.0.
+		if [ ${cc_major} -lt 12 ]; then
+			echo "${script_name}: note: found ${cc} version older than 12.0."
+			clang_older_than_12_0_0='yes'
+		fi
+	fi
+
+	# aocc
+	if [ "x${cc_vendor}" = "xaocc" ]; then
+
+		# Check for aocc < 2.0.0.
+		if [ ${cc_major} -lt 2 ]; then
+			echo "${script_name}: note: found ${cc} version older than 2.0."
+			aocc_older_than_2_0_0='yes'
+		fi
+
+		# Check for aocc < 3.0.0.
+		if [ ${cc_major} -lt 3 ]; then
+			echo "${script_name}: note: found ${cc} version older than 3.0."
+			aocc_older_than_3_0_0='yes'
+		fi
 	fi
 }
 
@@ -3353,6 +3464,11 @@ main()
 		| sed -e "s/@gcc_older_than_4_9_0@/${gcc_older_than_4_9_0}/g" \
 		| sed -e "s/@gcc_older_than_6_1_0@/${gcc_older_than_6_1_0}/g" \
 		| sed -e "s/@gcc_older_than_9_1_0@/${gcc_older_than_9_1_0}/g" \
+		| sed -e "s/@gcc_older_than_10_1_0@/${gcc_older_than_10_1_0}/g" \
+		| sed -e "s/@clang_older_than_9_0_0@/${clang_older_than_9_0_0}/g" \
+		| sed -e "s/@clang_older_than_12_0_0@/${clang_older_than_12_0_0}/g" \
+		| sed -e "s/@aocc_older_than_2_0_0@/${aocc_older_than_2_0_0}/g" \
+		| sed -e "s/@aocc_older_than_3_0_0@/${aocc_older_than_3_0_0}/g" \
 		| sed -e "s/@CC@/${cc_esc}/g" \
 		| sed -e "s/@CXX@/${cxx_esc}/g" \
 		| sed -e "s/@RANLIB@/${ranlib_esc}/g" \
diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c
index c8d8eec79..54aa64d42 100644
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018-2020, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -171,6 +171,9 @@ void bli_arch_set_id( void )
 		#endif
 
 		// AMD microarchitectures.
+		#ifdef BLIS_FAMILY_ZEN3
+		id = BLIS_ARCH_ZEN3;
+		#endif
 		#ifdef BLIS_FAMILY_ZEN2
 		id = BLIS_ARCH_ZEN2;
 		#endif
@@ -259,6 +262,7 @@ static char* config_name[ BLIS_NUM_ARCHS ] =
     "sandybridge",
     "penryn",
 
+    "zen3",
     "zen2",
     "zen",
     "excavator",
@@ -279,7 +283,7 @@ static char* config_name[ BLIS_NUM_ARCHS ] =
     "power9",
     "power7",
     "bgq",
-
+    
     "generic"
 };
 
diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c
index c7ceb8d7c..ff0f386e6 100644
--- a/frame/base/bli_cpuid.c
+++ b/frame/base/bli_cpuid.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018-2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2019, Dave Love, University of Manchester
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -131,6 +132,10 @@ arch_t bli_cpuid_query_id( void )
 
 		// Check for each AMD configuration that is enabled, check for that
 		// microarchitecture. We check from most recent to most dated.
+#ifdef BLIS_CONFIG_ZEN3
+		if ( bli_cpuid_is_zen3( family, model, features ) )
+			return BLIS_ARCH_ZEN3;
+#endif
 #ifdef BLIS_CONFIG_ZEN2
 		if ( bli_cpuid_is_zen2( family, model, features ) )
 			return BLIS_ARCH_ZEN2;
@@ -278,6 +283,35 @@ bool bli_cpuid_is_penryn
 
 // -----------------------------------------------------------------------------
 
+bool bli_cpuid_is_zen3
+     (
+       uint32_t family,
+       uint32_t model,
+       uint32_t features
+     )
+{
+	// Check for expected CPU features.
+	const uint32_t expected = FEATURE_AVX  |
+	                          FEATURE_FMA3 |
+	                          FEATURE_AVX2;
+
+	if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
+
+	// All Zen3 cores have a family of 0x19.
+	if ( family != 0x19 ) return FALSE;
+
+	// Finally, check for specific models:
+	// - 0x00 ~ 0xff
+	// NOTE: We accept any model because the family 25 (0x19) is unique.
+	const bool is_arch
+	=
+	( 0x00 <= model && model <= 0xff );
+
+	if ( !is_arch ) return FALSE;
+
+	return TRUE;
+}
+
 bool bli_cpuid_is_zen2
      (
        uint32_t family,
@@ -296,7 +330,9 @@ bool bli_cpuid_is_zen2
 	if ( family != 0x17 ) return FALSE;
 
 	// Finally, check for specific models:
-	// - 0x30-0xff (THIS NEEDS UPDATING)
+	// - 0x30 ~ 0xff
+	// NOTE: We must check model because the family 23 (0x17) is shared with
+	// zen.
 	const bool is_arch
 	=
 	( 0x30 <= model && model <= 0xff );
@@ -324,10 +360,12 @@ bool bli_cpuid_is_zen
 	if ( family != 0x17 ) return FALSE;
 
 	// Finally, check for specific models:
-	// - 0x00-0xff (THIS NEEDS UPDATING)
+	// - 0x00 ~ 0x2f
+	// NOTE: We must check model because the family 23 (0x17) is shared with
+	// zen2.
 	const bool is_arch
 	=
-	( 0x00 <= model && model <= 0xff );
+	( 0x00 <= model && model <= 0x2f );
 
 	if ( !is_arch ) return FALSE;
 
@@ -352,7 +390,7 @@ bool bli_cpuid_is_excavator
 	if ( family != 0x15 ) return FALSE;
 
 	// Finally, check for specific models:
-	// - 0x60-0x7f
+	// - 0x60 ~ 0x7f
 	const bool is_arch
 	=
 	( 0x60 <= model && model <= 0x7f );
@@ -380,7 +418,7 @@ bool bli_cpuid_is_steamroller
 	if ( family != 0x15 ) return FALSE;
 
 	// Finally, check for specific models:
-	// - 0x30-0x3f
+	// - 0x30 ~ 0x3f
 	const bool is_arch
 	=
 	( 0x30 <= model && model <= 0x3f );
@@ -409,7 +447,7 @@ bool bli_cpuid_is_piledriver
 
 	// Finally, check for specific models:
 	// - 0x02
-	// - 0x10-0x1f
+	// - 0x10 ~ 0x1f
 	const bool is_arch
 	=
 	model == 0x02 || ( 0x10 <= model && model <= 0x1f );
diff --git a/frame/base/bli_cpuid.h b/frame/base/bli_cpuid.h
index d8e597aee..3fea78e5a 100644
--- a/frame/base/bli_cpuid.h
+++ b/frame/base/bli_cpuid.h
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018-2020, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -61,6 +61,7 @@ bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t feature
 bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features );
 
 // AMD
+bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features );
 bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features );
 bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features );
 bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features );
diff --git a/frame/base/bli_error.h b/frame/base/bli_error.h
index 8c2971781..e6e6f35dd 100644
--- a/frame/base/bli_error.h
+++ b/frame/base/bli_error.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index 0a5bcafd4..cc17b33ff 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018-2020, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -97,6 +97,11 @@ void bli_gks_init( void )
 #endif
 
 		// AMD architectures
+#ifdef BLIS_CONFIG_ZEN3
+		bli_gks_register_cntx( BLIS_ARCH_ZEN3,        bli_cntx_init_zen3,
+		                                              bli_cntx_init_zen3_ref,
+		                                              bli_cntx_init_zen3_ind );
+#endif
 #ifdef BLIS_CONFIG_ZEN2
 		bli_gks_register_cntx( BLIS_ARCH_ZEN2,        bli_cntx_init_zen2,
 		                                              bli_cntx_init_zen2_ref,
diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h
index b0d23419f..f804d3003 100644
--- a/frame/include/bli_arch_config.h
+++ b/frame/include/bli_arch_config.h
@@ -42,6 +42,7 @@
 //
 
 // -- Intel64 architectures --
+
 #ifdef BLIS_CONFIG_SKX
 CNTX_INIT_PROTS( skx )
 #endif
@@ -62,6 +63,10 @@ CNTX_INIT_PROTS( penryn )
 #endif
 
 // -- AMD64 architectures --
+
+#ifdef BLIS_CONFIG_ZEN3
+CNTX_INIT_PROTS( zen3 )
+#endif
 #ifdef BLIS_CONFIG_ZEN2
 CNTX_INIT_PROTS( zen2 )
 #endif
@@ -145,11 +150,15 @@ CNTX_INIT_PROTS( generic )
 #ifdef BLIS_FAMILY_AMD64
 #include "bli_family_amd64.h"
 #endif
+#ifdef BLIS_FAMILY_AMD64_LEGACY
+#include "bli_family_amd64_legacy.h"
+#endif
 #ifdef BLIS_FAMILY_X86_64
 #include "bli_family_x86_64.h"
 #endif
 
 // -- Intel64 architectures --
+
 #ifdef BLIS_FAMILY_SKX
 #include "bli_family_skx.h"
 #endif
@@ -171,6 +180,9 @@ CNTX_INIT_PROTS( generic )
 
 // -- AMD64 architectures --
 
+#ifdef BLIS_FAMILY_ZEN3
+#include "bli_family_zen3.h"
+#endif
 #ifdef BLIS_FAMILY_ZEN2
 #include "bli_family_zen2.h"
 #endif
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index c2db052e5..f1a7e8f8d 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -939,6 +939,7 @@ typedef enum
 	BLIS_ARCH_PENRYN,
 
 	// AMD
+	BLIS_ARCH_ZEN3,
 	BLIS_ARCH_ZEN2,
 	BLIS_ARCH_ZEN,
 	BLIS_ARCH_EXCAVATOR,
diff --git a/kernels/zen3/.gitignore b/kernels/zen3/.gitignore
new file mode 100644
index 000000000..5e7d2734c
--- /dev/null
+++ b/kernels/zen3/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
diff --git a/travis/cpuid/zen2.def b/travis/cpuid/zen2.def
new file mode 100644
index 000000000..1e2cc6390
--- /dev/null
+++ b/travis/cpuid/zen2.def
@@ -0,0 +1,87 @@
+#
+#   BLIS
+#   An object-based framework for developing high-performance BLAS-like
+#   libraries.
+#
+#   Copyright (C) 2018, The University of Texas at Austin
+#
+#   Redistribution and use in source and binary forms, with or without
+#   modification, are permitted provided that the following conditions are
+#   met:
+#    - Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    - Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in the
+#      documentation and/or other materials provided with the distribution.
+#    - Neither the name(s) of the copyright holder(s) nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# CPU: AMD EPYC 7742
+# NOTE: This file was copied from zen.def and then the appropriate bits
+# in the first field (eax) of leaf 1 were updated to reflect the Zen2
+# "Rome" processor. See [1] for details.
+# [1] https://en.wikichip.org/wiki/amd/cpuid
+#
+00000000 ******** => 0000000D 68747541 444D4163 69746E65
+00000001 ******** => 00830F12 00400800 7ED8320B 178BFBFF
+00000002 ******** => 00000000 00000000 00000000 00000000
+00000003 ******** => 00000000 00000000 00000000 00000000
+00000005 ******** => 00000040 00000040 00000003 00000011
+00000006 ******** => 00000004 00000000 00000001 00000000
+00000007 ******** => 00000000 209C01A9 00000000 00000000
+00000008 ******** => 00000000 00000000 00000000 00000000
+00000009 ******** => 00000000 00000000 00000000 00000000
+0000000A ******** => 00000000 00000000 00000000 00000000
+0000000C ******** => 00000000 00000000 00000000 00000000
+0000000D 00000000 => 00000007 00000340 00000340 00000000
+0000000D 00000001 => 0000000F 00000340 00000000 00000000
+0000000D 00000002 => 00000100 00000240 00000000 00000000
+80000000 ******** => 8000001F 68747541 444D4163 69746E65
+80000001 ******** => 00800F12 40000000 35C233FF 2FD3FBFF
+80000002 ******** => 20444D41 43595045 35353720 33205031
+80000003 ******** => 6F432D32 50206572 65636F72 726F7373
+80000004 ******** => 20202020 20202020 20202020 00202020
+80000005 ******** => FF40FF40 FF40FF40 20080140 40040140
+80000006 ******** => 36006400 56006400 02006140 0200C140
+80000007 ******** => 00000000 0000001B 00000000 00006799
+80000008 ******** => 00003030 00000007 0000603F 00000000
+80000009 ******** => 00000000 00000000 00000000 00000000
+8000000A ******** => 00000001 00008000 00000000 0001BCFF
+8000000B ******** => 00000000 00000000 00000000 00000000
+8000000C ******** => 00000000 00000000 00000000 00000000
+8000000D ******** => 00000000 00000000 00000000 00000000
+8000000E ******** => 00000000 00000000 00000000 00000000
+8000000F ******** => 00000000 00000000 00000000 00000000
+80000010 ******** => 00000000 00000000 00000000 00000000
+80000011 ******** => 00000000 00000000 00000000 00000000
+80000012 ******** => 00000000 00000000 00000000 00000000
+80000013 ******** => 00000000 00000000 00000000 00000000
+80000014 ******** => 00000000 00000000 00000000 00000000
+80000015 ******** => 00000000 00000000 00000000 00000000
+80000016 ******** => 00000000 00000000 00000000 00000000
+80000017 ******** => 00000000 00000000 00000000 00000000
+80000018 ******** => 00000000 00000000 00000000 00000000
+80000019 ******** => F040F040 00000000 00000000 00000000
+8000001A ******** => 00000003 00000000 00000000 00000000
+8000001B ******** => 000003FF 00000000 00000000 00000000
+8000001C ******** => 00000000 00000000 00000000 00000000
+8000001D 00000000 => 00004121 01C0003F 0000003F 00000000
+8000001D 00000001 => 00004122 00C0003F 000000FF 00000000
+8000001D 00000002 => 00004143 01C0003F 000003FF 00000002
+8000001D 00000003 => 0001C163 03C0003F 00001FFF 00000001
+8000001E ******** => 00000000 00000100 00000300 00000000
+8000001F ******** => 0000000F 0000016F 0000000F 00000001
+8FFFFFFF ******** => 00000000 00000000 00000000 00000000
diff --git a/travis/cpuid/zen3.def b/travis/cpuid/zen3.def
new file mode 100644
index 000000000..ed791813e
--- /dev/null
+++ b/travis/cpuid/zen3.def
@@ -0,0 +1,87 @@
+#
+#   BLIS
+#   An object-based framework for developing high-performance BLAS-like
+#   libraries.
+#
+#   Copyright (C) 2018, The University of Texas at Austin
+#
+#   Redistribution and use in source and binary forms, with or without
+#   modification, are permitted provided that the following conditions are
+#   met:
+#    - Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    - Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in the
+#      documentation and/or other materials provided with the distribution.
+#    - Neither the name(s) of the copyright holder(s) nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# CPU: AMD EPYC 7xxx
+# NOTE: This file was copied from zen.def and then the appropriate bits
+# in the first field (eax) of leaf 1 were updated to reflect the Zen3
+# "Milan" processor. See [1] for details.
+# [1] https://en.wikichip.org/wiki/amd/cpuid
+#
+00000000 ******** => 0000000D 68747541 444D4163 69746E65
+00000001 ******** => 00A00F12 00400800 7ED8320B 178BFBFF
+00000002 ******** => 00000000 00000000 00000000 00000000
+00000003 ******** => 00000000 00000000 00000000 00000000
+00000005 ******** => 00000040 00000040 00000003 00000011
+00000006 ******** => 00000004 00000000 00000001 00000000
+00000007 ******** => 00000000 209C01A9 00000000 00000000
+00000008 ******** => 00000000 00000000 00000000 00000000
+00000009 ******** => 00000000 00000000 00000000 00000000
+0000000A ******** => 00000000 00000000 00000000 00000000
+0000000C ******** => 00000000 00000000 00000000 00000000
+0000000D 00000000 => 00000007 00000340 00000340 00000000
+0000000D 00000001 => 0000000F 00000340 00000000 00000000
+0000000D 00000002 => 00000100 00000240 00000000 00000000
+80000000 ******** => 8000001F 68747541 444D4163 69746E65
+80000001 ******** => 00800F12 40000000 35C233FF 2FD3FBFF
+80000002 ******** => 20444D41 43595045 35353720 33205031
+80000003 ******** => 6F432D32 50206572 65636F72 726F7373
+80000004 ******** => 20202020 20202020 20202020 00202020
+80000005 ******** => FF40FF40 FF40FF40 20080140 40040140
+80000006 ******** => 36006400 56006400 02006140 0200C140
+80000007 ******** => 00000000 0000001B 00000000 00006799
+80000008 ******** => 00003030 00000007 0000603F 00000000
+80000009 ******** => 00000000 00000000 00000000 00000000
+8000000A ******** => 00000001 00008000 00000000 0001BCFF
+8000000B ******** => 00000000 00000000 00000000 00000000
+8000000C ******** => 00000000 00000000 00000000 00000000
+8000000D ******** => 00000000 00000000 00000000 00000000
+8000000E ******** => 00000000 00000000 00000000 00000000
+8000000F ******** => 00000000 00000000 00000000 00000000
+80000010 ******** => 00000000 00000000 00000000 00000000
+80000011 ******** => 00000000 00000000 00000000 00000000
+80000012 ******** => 00000000 00000000 00000000 00000000
+80000013 ******** => 00000000 00000000 00000000 00000000
+80000014 ******** => 00000000 00000000 00000000 00000000
+80000015 ******** => 00000000 00000000 00000000 00000000
+80000016 ******** => 00000000 00000000 00000000 00000000
+80000017 ******** => 00000000 00000000 00000000 00000000
+80000018 ******** => 00000000 00000000 00000000 00000000
+80000019 ******** => F040F040 00000000 00000000 00000000
+8000001A ******** => 00000003 00000000 00000000 00000000
+8000001B ******** => 000003FF 00000000 00000000 00000000
+8000001C ******** => 00000000 00000000 00000000 00000000
+8000001D 00000000 => 00004121 01C0003F 0000003F 00000000
+8000001D 00000001 => 00004122 00C0003F 000000FF 00000000
+8000001D 00000002 => 00004143 01C0003F 000003FF 00000002
+8000001D 00000003 => 0001C163 03C0003F 00001FFF 00000001
+8000001E ******** => 00000000 00000100 00000300 00000000
+8000001F ******** => 0000000F 0000016F 0000000F 00000001
+8FFFFFFF ******** => 00000000 00000000 00000000 00000000
diff --git a/travis/do_sde.sh b/travis/do_sde.sh
index c8eb5aa58..de1545886 100755
--- a/travis/do_sde.sh
+++ b/travis/do_sde.sh
@@ -3,7 +3,7 @@
 set -e
 set -x
 
-SDE_VERSION=sde-external-8.63.0-2021-01-18-lin
+SDE_VERSION=sde-external-8.69.1-2021-07-18-lin
 SDE_TARBALL=$SDE_VERSION.tar.bz2
 SDE=$SDE_VERSION/sde64
 
@@ -46,7 +46,7 @@ for LIB in $LD_SO $LIBC_SO $LIBM_SO; do
 done
 
 #for ARCH in penryn sandybridge haswell skx knl piledriver steamroller excavator zen; do
-for ARCH in penryn sandybridge haswell skx knl zen; do
+for ARCH in penryn sandybridge haswell skx knl zen zen2 zen3; do
     if [ "$ARCH" = "knl" ]; then
         $SDE -knl -- ./test_libblis.x > output.testsuite
     else

From 9be97c150e19fa58bca30cb993a6509ae21e2025 Mon Sep 17 00:00:00 2001
From: Madan mohan Manokar <86282872+madanm3@users.noreply.github.com>
Date: Thu, 18 Nov 2021 00:46:46 +0530
Subject: [PATCH 008/230] Support all four dts in test/test_her[2][k].c (#578)

Details:
- Replaced the hard-coded calls to double-precision real syr, syr2,
  syrk, and syr2k in the corresponding standalone test drivers in the
  'test' directory with conditional branches that will call the
  appropriate BLAS interface depending on which datatype is enabled.
  Thanks to Madan mohan Manokar for this improvement.
- CREDITS file update.
---
 CREDITS           |   1 +
 test/test_her.c   |  93 ++++++++++++++++++++++++++++-----------
 test/test_her2.c  | 110 ++++++++++++++++++++++++++++++++++------------
 test/test_her2k.c |  90 +++++++++++++++++++------------------
 test/test_herk.c  |  75 +++++++++++++++----------------
 5 files changed, 230 insertions(+), 139 deletions(-)

diff --git a/CREDITS b/CREDITS
index df088c746..81fc9bec5 100644
--- a/CREDITS
+++ b/CREDITS
@@ -58,6 +58,7 @@ but many others have contributed code and feedback, including
   Tze Meng Low                                 (The University of Texas at Austin)
   Ye Luo                   @ye-luo             (Argonne National Laboratory)
   Ricardo Magana           @magania            (Hewlett Packard Enterprise)
+  Madan mohan Manokar      @madanm3            (AMD)
   Giorgos Margaritis
   Bryan Marker             @bamarker           (The University of Texas at Austin)
   Simon Lukas Märtens      @ACSimon33          (RWTH Aachen University)
diff --git a/test/test_her.c b/test/test_her.c
index 341b8a5fc..267e1bfe0 100644
--- a/test/test_her.c
+++ b/test/test_her.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -81,11 +82,8 @@ int main( int argc, char** argv )
 	m_input = 6;
 #endif
 
-#if 1
-	dt_alpha = dt_x = dt_a = BLIS_DOUBLE;
-#else
+	// her supports single- and double-precision complex
 	dt_alpha = dt_x = dt_a = BLIS_DCOMPLEX;
-#endif
 
 	uplo = BLIS_LOWER;
 
@@ -127,7 +125,7 @@ int main( int argc, char** argv )
 
 
 		bli_copym( &a, &a_save );
-	
+
 		dtime_save = DBL_MAX;
 
 		for ( r = 0; r < n_repeats; ++r )
@@ -143,33 +141,76 @@ int main( int argc, char** argv )
 #endif
 
 #ifdef BLIS
-			//bli_obj_toggle_conj( &x );
 
-			//bli_syr( &alpha,
 			bli_her( &alpha,
 			         &x,
 			         &a );
 
 #else
-
-			f77_char uplo   = 'L';
-			f77_int  mm     = bli_obj_length( &a );
-			f77_int  incx   = bli_obj_vector_inc( &x );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			double*  alphap = bli_obj_buffer( &alpha );
-			double*  xp     = bli_obj_buffer( &x );
-			double*  ap     = bli_obj_buffer( &a );
-/*
-			dcomplex* xp   = bli_obj_buffer( x );
-			dcomplex* ap   = bli_obj_buffer( &a );
-*/
-
-			dsyr_( &uplo,
-			//zher_( &uplo,
-			       &mm,
-			       alphap,
-			       xp, &incx,
-			       ap, &lda );
+			if ( bli_is_float( dt_a ) )
+			{
+				f77_char uplo   = 'L';
+				f77_int  mm     = bli_obj_length( &a );
+				f77_int  incx   = bli_obj_vector_inc( &x );
+				f77_int  lda    = bli_obj_col_stride( &a );
+				float*   alphap = bli_obj_buffer( &alpha );
+				float*   xp     = bli_obj_buffer( &x );
+				float*   ap     = bli_obj_buffer( &a );
+
+				ssyr_( &uplo,
+				       &mm,
+				       alphap,
+				       xp, &incx,
+				       ap, &lda );
+			}
+			else if ( bli_is_double( dt_a ) )
+			{
+				f77_char uplo   = 'L';
+				f77_int  mm     = bli_obj_length( &a );
+				f77_int  incx   = bli_obj_vector_inc( &x );
+				f77_int  lda    = bli_obj_col_stride( &a );
+				double*  alphap = bli_obj_buffer( &alpha );
+				double*  xp     = bli_obj_buffer( &x );
+				double*  ap     = bli_obj_buffer( &a );
+
+				dsyr_( &uplo,
+				       &mm,
+				       alphap,
+				       xp, &incx,
+				       ap, &lda );
+			}
+			else if ( bli_is_scomplex( dt_a ) )
+			{
+				f77_char  uplo   = 'L';
+				f77_int   mm     = bli_obj_length( &a );
+				f77_int   incx   = bli_obj_vector_inc( &x );
+				f77_int   lda    = bli_obj_col_stride( &a );
+				float*    alphap = bli_obj_buffer( &alpha );
+				scomplex* xp     = bli_obj_buffer( &x );
+				scomplex* ap     = bli_obj_buffer( &a );
+
+				cher_( &uplo,
+				       &mm,
+				       alphap,
+				       xp, &incx,
+				       ap, &lda );
+			}
+			else if ( bli_is_dcomplex( dt_a ) )
+			{
+				f77_char  uplo   = 'L';
+				f77_int   mm     = bli_obj_length( &a );
+				f77_int   incx   = bli_obj_vector_inc( &x );
+				f77_int   lda    = bli_obj_col_stride( &a );
+				double*   alphap = bli_obj_buffer( &alpha );
+				dcomplex* xp     = bli_obj_buffer( &x );
+				dcomplex* ap     = bli_obj_buffer( &a );
+
+				zher_( &uplo,
+				       &mm,
+				       alphap,
+				       xp, &incx,
+				       ap, &lda );
+			}
 #endif
 
 #ifdef PRINT
diff --git a/test/test_her2.c b/test/test_her2.c
index 7e6a61602..3672051dd 100644
--- a/test/test_her2.c
+++ b/test/test_her2.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -41,7 +42,7 @@
 
 //           uplo   m     alpha    x        incx  y        incy  a        lda
 //void dsyr2_( char*, int*, double*, double*, int*, double*, int*, double*, int* );
- 
+
 //#define PRINT
 
 int main( int argc, char** argv )
@@ -80,11 +81,8 @@ int main( int argc, char** argv )
 	m_input = 6;
 #endif
 
-#if 1
-	dt_alpha = dt_x = dt_y = dt_a = BLIS_DOUBLE;
-#else
-	dt_alpha = dt_x = dt_y = dt_a = BLIS_DCOMPLEX;
-#endif
+	// her2 supports single- and double-precision complex
+	dt_alpha = dt_x = dt_y = dt_a = BLIS_SCOMPLEX;
 
 	uplo = BLIS_LOWER;
 
@@ -128,7 +126,7 @@ int main( int argc, char** argv )
 
 
 		bli_copym( &a, &a_save );
-	
+
 		dtime_save = DBL_MAX;
 
 		for ( r = 0; r < n_repeats; ++r )
@@ -142,37 +140,93 @@ int main( int argc, char** argv )
 			bli_printm( "x", &x, "%4.1f", "" );
 			bli_printm( "y", &y, "%4.1f", "" );
 			bli_printm( "a", &a, "%4.1f", "" );
-#endif 
+#endif
 
 #ifdef BLIS
 
-			//bli_obj_toggle_conj( &x );
-			//bli_obj_toggle_conj( &y );
-
-			//bli_syr2( &alpha,
 			bli_her2( &alpha,
 			          &x,
 			          &y,
 			          &a );
 
 #else
+			if ( bli_is_float( dt_a ) )
+			{
+				f77_char uplo   = 'L';
+				f77_int  mm     = bli_obj_length( &a );
+				f77_int  incx   = bli_obj_vector_inc( &x );
+				f77_int  incy   = bli_obj_vector_inc( &y );
+				f77_int  lda    = bli_obj_col_stride( &a );
+				float*   alphap = bli_obj_buffer( &alpha );
+				float*   xp     = bli_obj_buffer( &x );
+				float*   yp     = bli_obj_buffer( &y );
+				float*   ap     = bli_obj_buffer( &a );
+
+				ssyr2_( &uplo,
+				        &mm,
+				        alphap,
+				        xp, &incx,
+				        yp, &incy,
+				        ap, &lda );
+			}
+			else if ( bli_is_double( dt_a ) )
+			{
+				f77_char uplo   = 'L';
+				f77_int  mm     = bli_obj_length( &a );
+				f77_int  incx   = bli_obj_vector_inc( &x );
+				f77_int  incy   = bli_obj_vector_inc( &y );
+				f77_int  lda    = bli_obj_col_stride( &a );
+				double*  alphap = bli_obj_buffer( &alpha );
+				double*  xp     = bli_obj_buffer( &x );
+				double*  yp     = bli_obj_buffer( &y );
+				double*  ap     = bli_obj_buffer( &a );
+
+				dsyr2_( &uplo,
+				        &mm,
+				        alphap,
+				        xp, &incx,
+				        yp, &incy,
+				        ap, &lda );
+			}
+			else if ( bli_is_scomplex( dt_a ) )
+			{
+				f77_char  uplo   = 'L';
+				f77_int   mm     = bli_obj_length( &a );
+				f77_int   incx   = bli_obj_vector_inc( &x );
+				f77_int   incy   = bli_obj_vector_inc( &y );
+				f77_int   lda    = bli_obj_col_stride( &a );
+				scomplex* alphap = bli_obj_buffer( &alpha );
+				scomplex* xp     = bli_obj_buffer( &x );
+				scomplex* yp     = bli_obj_buffer( &y );
+				scomplex* ap     = bli_obj_buffer( &a );
+
+				cher2_( &uplo,
+				        &mm,
+				        alphap,
+				        xp, &incx,
+				        yp, &incy,
+				        ap, &lda );
+			}
+			else if ( bli_is_dcomplex( dt_a ) )
+			{
+				f77_char  uplo   = 'L';
+				f77_int   mm     = bli_obj_length( &a );
+				f77_int   incx   = bli_obj_vector_inc( &x );
+				f77_int   incy   = bli_obj_vector_inc( &y );
+				f77_int   lda    = bli_obj_col_stride( &a );
+				dcomplex* alphap = bli_obj_buffer( &alpha );
+				dcomplex* xp     = bli_obj_buffer( &x );
+				dcomplex* yp     = bli_obj_buffer( &y );
+				dcomplex* ap     = bli_obj_buffer( &a );
+
+				zher2_( &uplo,
+				        &mm,
+				        alphap,
+				        xp, &incx,
+				        yp, &incy,
+				        ap, &lda );
+			}
 
-			f77_char uplo   = 'L';
-			f77_int  mm     = bli_obj_length( &a );
-			f77_int  incx   = bli_obj_vector_inc( &x );
-			f77_int  incy   = bli_obj_vector_inc( &y );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			double*  alphap = bli_obj_buffer( &alpha );
-			double*  xp     = bli_obj_buffer( &x );
-			double*  yp     = bli_obj_buffer( &y );
-			double*  ap     = bli_obj_buffer( &a );
-
-			dsyr2_( &uplo,
-			        &mm,
-			        alphap,
-			        xp, &incx,
-			        yp, &incy,
-			        ap, &lda );
 #endif
 
 #ifdef PRINT
diff --git a/test/test_her2k.c b/test/test_her2k.c
index 85dabc98d..7e8a7b8fe 100644
--- a/test/test_her2k.c
+++ b/test/test_her2k.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -84,13 +85,10 @@ int main( int argc, char** argv )
 	k_input = 1;
 #endif
 
-#if 1
-	//dt = BLIS_FLOAT;
-	dt = BLIS_DOUBLE;
-#else
+	// her2k supports single- and double-precision complex
 	//dt = BLIS_SCOMPLEX;
 	dt = BLIS_DCOMPLEX;
-#endif
+
 
 	uploc = BLIS_LOWER;
 	//uploc = BLIS_UPPER;
@@ -153,7 +151,7 @@ int main( int argc, char** argv )
 
 
 		bli_copym( &c, &c_save );
-	
+
 		dtime_save = DBL_MAX;
 
 		for ( r = 0; r < n_repeats; ++r )
@@ -181,16 +179,16 @@ int main( int argc, char** argv )
 #else
 		if ( bli_is_float( dt ) )
 		{
-			f77_int  mm     = bli_obj_length( &c );
-			f77_int  kk     = bli_obj_width_after_trans( &a );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			f77_int  ldb    = bli_obj_col_stride( &b );
-			f77_int  ldc    = bli_obj_col_stride( &c );
-			float*   alphap = bli_obj_buffer( &alpha );
-			float*   ap     = bli_obj_buffer( &a );
-			float*   bp     = bli_obj_buffer( &b );
-			float*   betap  = bli_obj_buffer( &beta );
-			float*   cp     = bli_obj_buffer( &c );
+			f77_int mm     = bli_obj_length( &c );
+			f77_int kk     = bli_obj_width_after_trans( &a );
+			f77_int lda    = bli_obj_col_stride( &a );
+			f77_int ldb    = bli_obj_col_stride( &b );
+			f77_int ldc    = bli_obj_col_stride( &c );
+			float*  alphap = bli_obj_buffer( &alpha );
+			float*  ap     = bli_obj_buffer( &a );
+			float*  bp     = bli_obj_buffer( &b );
+			float*  betap  = bli_obj_buffer( &beta );
+			float*  cp     = bli_obj_buffer( &c );
 
 			ssyr2k_( &f77_uploc,
 			         &f77_transa,
@@ -204,16 +202,16 @@ int main( int argc, char** argv )
 		}
 		else if ( bli_is_double( dt ) )
 		{
-			f77_int  mm     = bli_obj_length( &c );
-			f77_int  kk     = bli_obj_width_after_trans( &a );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			f77_int  ldb    = bli_obj_col_stride( &b );
-			f77_int  ldc    = bli_obj_col_stride( &c );
-			double*  alphap = bli_obj_buffer( &alpha );
-			double*  ap     = bli_obj_buffer( &a );
-			double*  bp     = bli_obj_buffer( &b );
-			double*  betap  = bli_obj_buffer( &beta );
-			double*  cp     = bli_obj_buffer( &c );
+			f77_int mm     = bli_obj_length( &c );
+			f77_int kk     = bli_obj_width_after_trans( &a );
+			f77_int lda    = bli_obj_col_stride( &a );
+			f77_int ldb    = bli_obj_col_stride( &b );
+			f77_int ldc    = bli_obj_col_stride( &c );
+			double* alphap = bli_obj_buffer( &alpha );
+			double* ap     = bli_obj_buffer( &a );
+			double* bp     = bli_obj_buffer( &b );
+			double* betap  = bli_obj_buffer( &beta );
+			double* cp     = bli_obj_buffer( &c );
 
 			dsyr2k_( &f77_uploc,
 			         &f77_transa,
@@ -227,16 +225,16 @@ int main( int argc, char** argv )
 		}
 		else if ( bli_is_scomplex( dt ) )
 		{
-			f77_int  mm     = bli_obj_length( &c );
-			f77_int  kk     = bli_obj_width_after_trans( &a );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			f77_int  ldb    = bli_obj_col_stride( &b );
-			f77_int  ldc    = bli_obj_col_stride( &c );
-			scomplex*  alphap = bli_obj_buffer( &alpha );
-			scomplex*  ap     = bli_obj_buffer( &a );
-			scomplex*  bp     = bli_obj_buffer( &b );
-			float*     betap  = bli_obj_buffer( &beta );
-			scomplex*  cp     = bli_obj_buffer( &c );
+			f77_int   mm     = bli_obj_length( &c );
+			f77_int   kk     = bli_obj_width_after_trans( &a );
+			f77_int   lda    = bli_obj_col_stride( &a );
+			f77_int   ldb    = bli_obj_col_stride( &b );
+			f77_int   ldc    = bli_obj_col_stride( &c );
+			scomplex* alphap = bli_obj_buffer( &alpha );
+			scomplex* ap     = bli_obj_buffer( &a );
+			scomplex* bp     = bli_obj_buffer( &b );
+			float*    betap  = bli_obj_buffer( &beta );
+			scomplex* cp     = bli_obj_buffer( &c );
 
 			cher2k_( &f77_uploc,
 			         &f77_transa,
@@ -250,16 +248,16 @@ int main( int argc, char** argv )
 		}
 		else if ( bli_is_dcomplex( dt ) )
 		{
-			f77_int  mm     = bli_obj_length( &c );
-			f77_int  kk     = bli_obj_width_after_trans( &a );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			f77_int  ldb    = bli_obj_col_stride( &b );
-			f77_int  ldc    = bli_obj_col_stride( &c );
-			dcomplex*  alphap = bli_obj_buffer( &alpha );
-			dcomplex*  ap     = bli_obj_buffer( &a );
-			dcomplex*  bp     = bli_obj_buffer( &b );
-			double*    betap  = bli_obj_buffer( &beta );
-			dcomplex*  cp     = bli_obj_buffer( &c );
+			f77_int   mm     = bli_obj_length( &c );
+			f77_int   kk     = bli_obj_width_after_trans( &a );
+			f77_int   lda    = bli_obj_col_stride( &a );
+			f77_int   ldb    = bli_obj_col_stride( &b );
+			f77_int   ldc    = bli_obj_col_stride( &c );
+			dcomplex* alphap = bli_obj_buffer( &alpha );
+			dcomplex* ap     = bli_obj_buffer( &a );
+			dcomplex* bp     = bli_obj_buffer( &b );
+			double*   betap  = bli_obj_buffer( &beta );
+			dcomplex* cp     = bli_obj_buffer( &c );
 
 			zher2k_( &f77_uploc,
 			         &f77_transa,
diff --git a/test/test_herk.c b/test/test_herk.c
index dc5725612..cbf963a33 100644
--- a/test/test_herk.c
+++ b/test/test_herk.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -83,14 +84,10 @@ int main( int argc, char** argv )
 	m_input = 3;
 	k_input = 1;
 #endif
-
-#if 1
-	//dt = BLIS_FLOAT;
-	dt = BLIS_DOUBLE;
-#else
+	
+	// herk supports single- and double-precision complex
 	//dt = BLIS_SCOMPLEX;
 	dt = BLIS_DCOMPLEX;
-#endif
 
 	uploc = BLIS_LOWER;
 	//uploc = BLIS_UPPER;
@@ -145,7 +142,7 @@ int main( int argc, char** argv )
 
 
 		bli_copym( &c, &c_save );
-	
+
 		dtime_save = DBL_MAX;
 
 		for ( r = 0; r < n_repeats; ++r )
@@ -171,14 +168,14 @@ int main( int argc, char** argv )
 #else
 		if ( bli_is_float( dt ) )
 		{
-			f77_int  mm     = bli_obj_length( &c );
-			f77_int  kk     = bli_obj_width_after_trans( &a );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			f77_int  ldc    = bli_obj_col_stride( &c );
-			float*   alphap = bli_obj_buffer( &alpha );
-			float*   ap     = bli_obj_buffer( &a );
-			float*   betap  = bli_obj_buffer( &beta );
-			float*   cp     = bli_obj_buffer( &c );
+			f77_int mm     = bli_obj_length( &c );
+			f77_int kk     = bli_obj_width_after_trans( &a );
+			f77_int lda    = bli_obj_col_stride( &a );
+			f77_int ldc    = bli_obj_col_stride( &c );
+			float*  alphap = bli_obj_buffer( &alpha );
+			float*  ap     = bli_obj_buffer( &a );
+			float*  betap  = bli_obj_buffer( &beta );
+			float*  cp     = bli_obj_buffer( &c );
 
 			ssyrk_( &f77_uploc,
 			        &f77_transa,
@@ -191,14 +188,14 @@ int main( int argc, char** argv )
 		}
 		else if ( bli_is_double( dt ) )
 		{
-			f77_int  mm     = bli_obj_length( &c );
-			f77_int  kk     = bli_obj_width_after_trans( &a );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			f77_int  ldc    = bli_obj_col_stride( &c );
-			double*  alphap = bli_obj_buffer( &alpha );
-			double*  ap     = bli_obj_buffer( &a );
-			double*  betap  = bli_obj_buffer( &beta );
-			double*  cp     = bli_obj_buffer( &c );
+			f77_int mm     = bli_obj_length( &c );
+			f77_int kk     = bli_obj_width_after_trans( &a );
+			f77_int lda    = bli_obj_col_stride( &a );
+			f77_int ldc    = bli_obj_col_stride( &c );
+			double* alphap = bli_obj_buffer( &alpha );
+			double* ap     = bli_obj_buffer( &a );
+			double* betap  = bli_obj_buffer( &beta );
+			double* cp     = bli_obj_buffer( &c );
 
 			dsyrk_( &f77_uploc,
 			        &f77_transa,
@@ -211,14 +208,14 @@ int main( int argc, char** argv )
 		}
 		else if ( bli_is_scomplex( dt ) )
 		{
-			f77_int  mm     = bli_obj_length( &c );
-			f77_int  kk     = bli_obj_width_after_trans( &a );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			f77_int  ldc    = bli_obj_col_stride( &c );
-			float*     alphap = bli_obj_buffer( &alpha );
-			scomplex*  ap     = bli_obj_buffer( &a );
-			float*     betap  = bli_obj_buffer( &beta );
-			scomplex*  cp     = bli_obj_buffer( &c );
+			f77_int   mm     = bli_obj_length( &c );
+			f77_int   kk     = bli_obj_width_after_trans( &a );
+			f77_int   lda    = bli_obj_col_stride( &a );
+			f77_int   ldc    = bli_obj_col_stride( &c );
+			float*    alphap = bli_obj_buffer( &alpha );
+			scomplex* ap     = bli_obj_buffer( &a );
+			float*    betap  = bli_obj_buffer( &beta );
+			scomplex* cp     = bli_obj_buffer( &c );
 
 			cherk_( &f77_uploc,
 			        &f77_transa,
@@ -231,14 +228,14 @@ int main( int argc, char** argv )
 		}
 		else if ( bli_is_dcomplex( dt ) )
 		{
-			f77_int  mm     = bli_obj_length( &c );
-			f77_int  kk     = bli_obj_width_after_trans( &a );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			f77_int  ldc    = bli_obj_col_stride( &c );
-			double*    alphap = bli_obj_buffer( &alpha );
-			dcomplex*  ap     = bli_obj_buffer( &a );
-			double*    betap  = bli_obj_buffer( &beta );
-			dcomplex*  cp     = bli_obj_buffer( &c );
+			f77_int   mm     = bli_obj_length( &c );
+			f77_int   kk     = bli_obj_width_after_trans( &a );
+			f77_int   lda    = bli_obj_col_stride( &a );
+			f77_int   ldc    = bli_obj_col_stride( &c );
+			double*   alphap = bli_obj_buffer( &alpha );
+			dcomplex* ap     = bli_obj_buffer( &a );
+			double*   betap  = bli_obj_buffer( &beta );
+			dcomplex* cp     = bli_obj_buffer( &c );
 
 			zherk_( &f77_uploc,
 			        &f77_transa,

From a4bc03b990fe0572001eb6409efd12cd70677dcf Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 19 Nov 2021 13:29:00 -0600
Subject: [PATCH 009/230] Brief mention/link to Addons.md in README.md.

Details:
- Add a blurb about the new addons feature to the "Documentation for
  BLIS developers" section of the README.md, which also links to the
  Addons.md document.
---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index 2abe79400..372b6857c 100644
--- a/README.md
+++ b/README.md
@@ -512,6 +512,11 @@ learn how to add new sub-configurations or configuration families, or are simply
 interested in learning how BLIS organizes its configurations and kernel sets,
 please read this thorough walkthrough of the configuration system.
 
+ * **[Addon Guide](docs/Addons.md).** If you are interested in learning
+about using BLIS addons--that is, enabling existing (or creating new) bundles
+of operation or API code that are built into a BLIS library--please read this
+document.
+
  * **[Sandbox Guide](docs/Sandboxes.md).** If you are interested in learning
 about using sandboxes in BLIS--that is, providing alternative implementations
 of the `gemm` operation--please read this document.

From 12c66a4acc77bf4927b01e2358e2ac10b61e0a53 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 19 Nov 2021 14:43:53 -0600
Subject: [PATCH 010/230] Minor updates to README.md, docs/Addons.md.

Details:
- Add additional mentions of addons to README.md, including in the
  "What's New" section.
- Removed mention of sandboxes from the long list of advantages
  provided by BLIS.
- Very minor description update to opening line of Addons.md.
---
 README.md      | 25 +++++++++++++++----------
 docs/Addons.md |  4 ++--
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 372b6857c..211ebd6d5 100644
--- a/README.md
+++ b/README.md
@@ -95,6 +95,16 @@ all of which are available for free via the [edX platform](http://www.edx.org/).
 What's New
 ----------
 
+ * **Addons feature now available!** Have you ever wanted to quickly extend BLIS's
+operation support or define new custom BLIS APIs for your application, but were
+unsure of how to add your source code to BLIS? Do you want to isolate your custom
+code so that it only gets enabled when the user requests it? Do you like
+[sandboxes](docs/Sandboxes.md), but wish you didn't have to provide an
+implementation of `gemm`? If so, you should check out our new
+[addons](docs/Addons.md) feature. Addons act like optional extensions that can be
+created, enabled, and combined to suit your application's needs, all without
+formally integrating your code into the core BLIS framework.
+
  * **Multithreaded small/skinny matrix support for sgemm now available!** Thanks to
 funding and hardware support from Oracle, we have now accelerated `gemm` for
 single-precision real matrix problems where one or two dimensions is exceedingly
@@ -265,20 +275,13 @@ many will find BLIS's object-based APIs a delight to use when customizing
 or writing their own BLIS operations. (Objects are relatively lightweight
 `structs` and passed by address, which helps tame function calling overhead.)
 
- * **Multilayered API, exposed kernels, and sandboxes.** The BLIS framework
-exposes its
+ * **Multilayered API and exposed kernels.** The BLIS framework exposes its
 implementations in various layers, allowing expert developers to access exactly
 the functionality desired. This layered interface includes that of the
 lowest-level kernels, for those who wish to bypass the bulk of the framework.
 Optimizations can occur at various levels, in part thanks to exposed packing
 and unpacking facilities, which by default are highly parameterized and
-flexible. And more recently, BLIS introduced sandboxes--a way to provide
-alternative implementations of `gemm` that do not use any more of the BLIS
-infrastructure than is desired. Sandboxes provide a convenient and
-straightforward way of modifying the `gemm` implementation without disrupting
-any other level-3 operation or any other part of the framework. This works
-especially well when the developer wants to experiment with new optimizations
-or try a different algorithm.
+flexible.
 
  * **Functionality that grows with the community's needs.** As its name
 suggests, the BLIS framework is not a single library or static API, but rather
@@ -286,7 +289,9 @@ a nearly-complete template for instantiating high-performance BLAS-like
 libraries. Furthermore, the framework is extensible, allowing developers to
 leverage existing components to support new operations as they are identified.
 If such operations require new kernels for optimal efficiency, the framework
-and its APIs will be adjusted and extended accordingly.
+and its APIs will be adjusted and extended accordingly. Community developers
+who wish to experiment with creating new operations or APIs in BLIS can quickly
+and easily do so via the [Addons](docs/Addons.md) feature.
 
  * **Code re-use.** Auto-generation approaches to achieving the aforementioned
 goals tend to quickly lead to code bloat due to the multiple dimensions of
diff --git a/docs/Addons.md b/docs/Addons.md
index 595cebfa4..bd4799fb7 100644
--- a/docs/Addons.md
+++ b/docs/Addons.md
@@ -10,8 +10,8 @@
 
 ## Introduction
 
-This file briefly describes the requirements for building a custom BLIS
-*addon*.
+This file briefly describes the requirements for enabling or creating a
+custom BLIS *addon*.
 
 Simply put, an addon in BLIS provides additional APIs, operations, and/or
 implementations that may be useful to certain users. An addon can be

From e229e049ca08dfbd45794669df08a71dba892925 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 1 Dec 2021 17:36:22 -0600
Subject: [PATCH 011/230] Added recu-sed.sh script to 'build' directory.

Details:
- Added a recursive sed script to the 'build' directory.
---
 build/recu-sed.sh | 488 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 488 insertions(+)
 create mode 100755 build/recu-sed.sh

diff --git a/build/recu-sed.sh b/build/recu-sed.sh
new file mode 100755
index 000000000..e7a1d43db
--- /dev/null
+++ b/build/recu-sed.sh
@@ -0,0 +1,488 @@
+#!/bin/bash
+
+#
+# recu-sed.sh
+#
+# Field G. Van Zee
+#
+
+print_usage()
+{
+	# Print the help text to stdout, then exit with status 1 (never returns).
+	echo " "
+	echo " ${script_name}"
+	echo " "
+	echo " Field G. Van Zee"
+	echo " "
+	echo " Recursively descend a directory tree and perform sed commands, either on"
+	echo " the filename or the file contents, or both."
+	echo " "
+	echo " Usage:"
+	echo "   ${script_name} [options]"
+	echo " "
+	echo " The following options are accepted:"
+	echo " "
+	echo "   -d "
+	echo "                 Dry run. Go through all the motions, but don't actually"
+	echo "                 apply any of the sed expressions to file names or contents."
+	echo "   -N "
+	echo "                 Do not proceed recursively into subdirectories; consider"
+	echo "                 only the files within the current directory. Default"
+	echo "                 behavior is to act recursively."
+	echo "   -h "
+	echo "                 Consider hidden files and directories. Default behavior is"
+	echo "                 to ignore them."
+	echo "   -n "
+	echo "                 Use svn mv instead of mv when renaming the file."
+	echo "                 Notice that this only applies if the filename changes."
+	echo "   -p pattern "
+	echo "                 Specifies the filename pattern, as would be given to the"
+	echo "                 ls utility, to limit which files are affected. Default is"
+	echo "                 to consider all files present."
+	echo "   -r dir"
+	echo "                 The root directory for the recursive action to be performed."
+	echo "                 Default is to use the current working directory."
+	echo "   -v [0|1|2]"
+	echo "                 verboseness level"
+	echo "                 level 0: silent  (no output)"
+	echo "                 level 1: default (one line per directory; suppress ls stderr)"
+	echo "                 level 2: verbose (one line per directory; show ls stderr)"
+	echo " "
+	echo " At least one of the following option-argument pairs is required:"
+	echo " "
+	echo "   -f sed_expr "
+	echo "                 Specifies the sed expression that will be applied to the"
+	echo "                 filenames of the files touched by the script. This expression"
+	echo "                 must be a search-and-replace pattern."
+	echo "   -c sed_expr "
+	echo "                 Specifies the sed expression that will be applied to the"
+	echo "                 contents of the files touched by the script. This expression"
+	echo "                 should be a search-and-replace pattern."
+	echo "   -s sed_script"
+	echo "                 Specifies an arbitrary sed script that will be applied to the"
+	echo "                 file contents of the files touched by the script."
+	echo " "
+	echo " Note: -c and -s options are mutually exclusive."
+	echo " "
+
+	# Exit with non-zero exit status
+	exit 1
+}
+
+
+
+
+perform_sed()
+{
+	# Apply the requested filename/contents sed actions to each regular file
+	# found directly inside the directory $1 (no recursion happens here).
+	# Reads the option globals set by main(); exits with status 1 on bad input.
+	local exist_dir="$1"
+
+	# The suffix used to create temporary files
+	local temp_file_suffix="sed_temp"
+
+	# Check that exist_dir actually exists and is a directory
+	if [ ! -d "${exist_dir}" ]; then
+		echo "${script_name}: ${exist_dir} does not seem to be a valid directory."
+		exit 1
+	fi
+
+	# Check that the filename sed expression, if given, begins with an 's'.
+	if [ -n "$filename_sed_expr" ]; then
+
+		# If it's a valid search-and-replace expression, this should return an 's'.
+		filename_sed_char=${filename_sed_expr%%/*}
+
+		if [ "$filename_sed_char" != "s" ]; then
+			echo "${script_name}: sed expression given with -f must be search-and-replace."
+			exit 1
+		fi
+	fi
+
+	# Check that the sed script, if given, exists.
+	if [ -n "$contents_sed_script" ]; then
+
+		if [ ! -f "${contents_sed_script}" ]; then
+			echo "${script_name}: ${contents_sed_script} is not a regular file or does not exist."
+			exit 1
+		fi
+	fi
+
+	# Assume that the sed expression is a search-and-replace. Extract the patterns
+	# to match on. (Arbitrary sed expressions should be applied through a sed script.)
+	if [ "$filename_sed_expr" != "" ]; then
+		filename_sed_match=${filename_sed_expr#s/}
+		filename_sed_match=${filename_sed_match%%/*}
+	fi
+
+
+	# Per-file state. These are assigned inside the loop below, which runs in
+	# a subshell (it is the right-hand side of a pipeline), so the values never
+	# outlive this call. The two inputs to the progress report start out empty
+	# so that an option that was not given counts as "no match".
+	local old_filepath old_filename old_dirpath old_perms
+	local new_filename new_filepath
+	local grep_filename="" file_diff=""
+	local which_matches file_touched rel_old_filepath
+
+	# NOTE(review): find's -name test also matches names that begin with '.',
+	# so hidden files in this directory are processed whether or not -h was
+	# given. Paths are read with 'IFS= read -r' so that backslashes and
+	# leading/trailing blanks in file names are preserved.
+
+	# Find all files that match the pattern in the current directory.
+	find "${exist_dir}" -maxdepth 1 -name "${filename_pattern}" -print | while IFS= read -r old_filepath
+	do
+		#echo "old_filepath: $old_filepath"
+
+		# Skip the current directory.
+		if [ "${old_filepath}" == "${exist_dir}" ]; then
+			continue
+		fi
+
+		# Skip any non-regular files.
+		if [ ! -f "$old_filepath" ]; then
+
+			# And say we are doing so if verboseness was requested.
+			if [ "$verbose_level" = "2" ]; then
+				echo "${script_name}: Ignoring $old_filepath"
+			fi
+			continue
+		fi
+
+		# Strip the directory path from old_filepath to leave the filename.
+		old_filename=${old_filepath##*/}
+
+		# Strip the filename from old_filepath to leave the directory path.
+		old_dirpath=${old_filepath%/*}
+
+		# Create a new filename from the old one. If a filename sed expression was given,
+		# it will be applied now.
+		if [ "$filename_sed_expr" != "" ]; then
+			new_filename=$(echo "${old_filename}" | sed "${filename_sed_expr}")
+		else
+			new_filename="${old_filename}"
+		fi
+
+		#echo "new_filename: $new_filename"
+
+		# Create the filepath to the new file location.
+		new_filepath="${old_dirpath}/${new_filename}"
+		#echo "new_filepath: $new_filepath"
+
+		# Grep for the filename pattern within the filename of the current file.
+		if [ "$filename_sed_expr" != "" ]; then
+			grep_filename=$(echo "${old_filename}" | grep -e "${filename_sed_match}")
+		fi
+
+
+		# If we are not performing a dry run, proceed.
+		if [ -z "$dry_run_flag" ]; then
+
+			# Save the old file permissions so we can re-apply them to the
+			# new file if its contents change (ie: if it's not just a 'mv',
+			# which inherently preserves file permissions).
+			old_perms=$(stat -c %a "${old_filepath}")
+
+			# If the old and new filepaths are different, then we start off by
+			# renaming the file. (Otherwise, if the old and new filepaths are
+			# identical, then we don't need to do anything to the file.) If
+			# the user requested that we use svn mv, then do that, otherwise we
+			# use regular mv.
+			if [ "${old_filepath}" != "${new_filepath}" ]; then
+
+				if [ -n "$use_svn_mv_flag" ]; then
+
+					svn mv "${old_filepath}" "${new_filepath}"
+				else
+
+					mv -f "${old_filepath}" "${new_filepath}"
+				fi
+			fi
+		else
+
+			# A dry run still needs to act upon the "new" file, but the file
+			# was never renamed. So if the filepaths are different, simply set
+			# the new filepath to the old one; otherwise the contents step
+			# below would try to read a file that does not exist.
+			if [ "${old_filepath}" != "${new_filepath}" ]; then
+				new_filepath="${old_filepath}"
+			fi
+		fi
+
+		# Handle the cases that might change the contents of the file.
+		if [ "$contents_sed_expr" != "" ] ||
+		   [ "$contents_sed_script" != "" ]; then
+
+			# Execute the sed command based on whether the sed action was given
+			# as a command line expression or a script residing in a file.
+			if   [ "$contents_sed_script" != "" ]; then
+
+				# Perform the action, saving the result to a temporary file.
+				sed -f "${contents_sed_script}" "${new_filepath}" \
+				    > "${new_filepath}.${temp_file_suffix}"
+
+			elif [ "$contents_sed_expr" != "" ]; then
+
+				# Perform the action, saving the result to a temporary file.
+				sed -e "${contents_sed_expr}" "${new_filepath}" \
+				    > "${new_filepath}.${temp_file_suffix}"
+			fi
+
+			# Check the difference.
+			file_diff=$(diff "${new_filepath}" "${new_filepath}.${temp_file_suffix}")
+
+
+			# If we are not performing a dry run, proceed.
+			if [ -z "$dry_run_flag" ]; then
+
+				# If the file contents change.
+				if [ -n "$file_diff" ]; then
+
+					# Apply the old file permissions to the new file (before we
+					# potentially overwrite the old file with the new one).
+					chmod "${old_perms}" "${new_filepath}.${temp_file_suffix}"
+
+					# Apply the file contents changes to the new filepath (which may
+					# or may not be the same as the old filepath).
+					mv -f "${new_filepath}.${temp_file_suffix}" "${new_filepath}"
+
+				else
+					# Otherwise remove the new temporary file since it is identical
+					# to the original.
+					rm -f "${new_filepath}.${temp_file_suffix}"
+				fi
+			else
+				# Simply remove the file since we are only performing a dry run.
+				rm -f "${new_filepath}.${temp_file_suffix}"
+			fi
+
+		fi
+
+		# Check for dos2unix. If it's not here, we'll just substitute cat.
+		#type_dos2unix=$(type -path dos2unix)
+		#if [ -n "$type_dos2unix" ]; then
+		#	dos2unix -q ${new_filepath}
+		#fi
+
+		# Create a string that indicates what we are changing. We'll use this in
+		# the verbose progress echo to indicate how the file is or would be changed.
+		if   [ -n "$grep_filename" ] && [ -n "$file_diff" ]; then
+			which_matches="filename/contents"
+			file_touched="yes"
+		elif [ -n "$grep_filename" ] && [ -z "$file_diff" ]; then
+			which_matches="filename         "
+			file_touched="yes"
+		elif [ -z "$grep_filename" ] && [ -n "$file_diff" ]; then
+			which_matches="         contents"
+			file_touched="yes"
+		else
+			which_matches=""
+			file_touched="no"
+		fi
+
+		# Be verbose, if requested, about which file we're looking at.
+		if [ "$verbose_level" != "0" ]; then
+
+			# But we only need to output a line if the file was touched.
+			if [ "$file_touched" != "no" ]; then
+
+				# Construct a relative filepath by stripping the initial root
+				# directory so that the output does not span as many columns on
+				# the terminal. (Quoted so the root is matched literally.)
+				rel_old_filepath=${old_filepath#"${initial_root_dir}"/}
+
+				# Add a "dry run" condition to the output if we're doing a dry-run
+				# so that the user knows we didn't really change anything.
+				if [ -z "$dry_run_flag" ]; then
+					echo "$script_name: Changing [${which_matches}] of ${rel_old_filepath}"
+				else
+					echo "$script_name: Changing (dry run) [${which_matches}] of ${rel_old_filepath}"
+				fi
+			fi
+		fi
+
+	done
+
+	# Exit peacefully.
+	return 0
+}
+
+
+
+
+recursive_sed()
+{
+	# Run perform_sed() on the directory $1 and then, if recursion is enabled,
+	# on each of its subdirectories in turn. Hidden subdirectories (names that
+	# begin with '.') are only entered when -h was given. Always returns 0.
+	local item item_name curr_dir
+
+
+	# Extract our argument
+	curr_dir="$1"
+
+
+	# Call our function to perform the sed operations on the files in the
+	# directory given.
+	perform_sed "${curr_dir}"
+
+
+	# If we were asked to act recursively, then continue processing
+	# curr_dir's contents.
+	if [ "$recursive_flag" = "1" ]; then
+
+		# Descend into the contents of curr_dir, calling recursive_sed on
+		# any items that are directories. The listing produced by find also
+		# includes entries whose names begin with a '.', so the hidden
+		# files/directories flag is honored explicitly inside the loop.
+		# Paths are read with 'IFS= read -r' so that backslashes and leading
+		# or trailing blanks in directory names are preserved.
+		find "${curr_dir}" -maxdepth 1 -name "*" -print | while IFS= read -r item
+		do
+
+			# Skip the current directory.
+			if [ "${item}" == "${curr_dir}" ]; then
+				continue
+			fi
+
+			# Skip anything that is not a directory.
+			if [ ! -d "$item" ]; then
+				continue
+			fi
+
+			# Strip the leading path to leave just the directory's name.
+			item_name=${item##*/}
+
+			# Skip hidden directories unless the user asked us to consider
+			# them (via -h).
+			if [ -z "$hidden_files_dirs_flag" ]; then
+				case "$item_name" in
+					.* ) continue ;;
+				esac
+			fi
+
+			# Descend into the subdirectory.
+			recursive_sed "$item"
+
+		done
+
+	fi
+
+
+	# Return peacefully
+	return 0
+}
+
+
+
+
+main()
+{
+	# Entry point: parse the command line options into the globals below,
+	# validate them, and then start the (optionally recursive) sed pass.
+	# Variables set by getopts.
+	dry_run_flag=""
+	hidden_files_dirs_flag=""
+	use_svn_mv_flag=""
+	filename_pattern=""
+	root_dir=""
+	initial_root_dir=""
+	verbose_level=""
+	filename_sed_expr=""
+	contents_sed_expr=""
+	contents_sed_script=""
+	recursive_flag="1"
+
+	# Get the script name
+	script_name=${0##*/}
+
+
+	# Process our command line options. The option string starts with ':',
+	# which puts getopts in silent mode: an unknown option is reported as
+	# '?' and an option that is missing its required argument is reported
+	# as ':'. Both cases print the usage text and exit, so that a forgotten
+	# argument (e.g. to -r) is not silently ignored.
+	while getopts ":c:df:hp:r:s:nNv:" opt; do
+		case $opt in
+			d  ) dry_run_flag="1" ;;
+			h  ) hidden_files_dirs_flag="1" ;;
+			n  ) use_svn_mv_flag="1" ;;
+			N  ) recursive_flag="0" ;;
+			v  ) verbose_level="$OPTARG" ;;
+			p  ) filename_pattern="$OPTARG" ;;
+			r  ) root_dir="$OPTARG" ;;
+			f  ) filename_sed_expr="$OPTARG" ;;
+			c  ) contents_sed_expr="$OPTARG" ;;
+			s  ) contents_sed_script="$OPTARG" ;;
+			\? | : ) print_usage ;;
+		esac
+	done
+	shift $(($OPTIND - 1))
+
+
+	# Make sure we've parsed all command line arguments by now.
+	if [ "$#" != "0" ]; then
+		echo "${script_name}: Unparsed command line arguments! Try running with no arguments for help."
+		exit 1
+	fi
+
+
+	# Make sure we received at least one of the required options.
+	if [ -z "$filename_sed_expr" ] &&
+	   [ -z "$contents_sed_expr" ] &&
+	   [ -z "$contents_sed_script" ]; then
+		print_usage
+	fi
+
+
+	# Make sure that both a file contents sed expression and sed script were
+	# not given.
+	if [ "$contents_sed_expr"   != "" ] &&
+	   [ "$contents_sed_script" != "" ] ; then
+		echo "${script_name}: The -c and -s options may not be used at the same time."
+		exit 1
+	fi
+
+
+	# Make sure that verboseness level is valid.
+	if [ "$verbose_level" != "0" ] &&
+	   [ "$verbose_level" != "1" ] &&
+	   [ "$verbose_level" != "2" ]; then
+		verbose_level="1"
+	fi
+
+	# Prepare the filename pattern arguments to perform_sed().
+	if [ "$filename_pattern" = "" ] ; then
+		filename_pattern='*'
+	fi
+
+	# Prepare the directory arguments to perform_sed().
+	if [ "$root_dir" != "" ] ; then
+		# Strip / from end of directory paths, if there is one. (The filesystem
+		# root itself is left alone so that it does not become an empty string.)
+		[ "$root_dir" = "/" ] || root_dir=${root_dir%/}
+	else
+		root_dir=$PWD
+	fi
+	initial_root_dir=${root_dir}
+
+
+	#echo "root_dir: $root_dir"
+
+
+	# Begin recursing on the root directory.
+	recursive_sed "$root_dir"
+
+
+	# Exit peacefully
+	return 0
+}
+
+
+
+
+# The script's main entry point, passing all parameters given.
+main "$@"
+

From cf7d616a2fd58e293b496770654040818bf5609c Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Thu, 2 Dec 2021 17:10:03 -0600
Subject: [PATCH 012/230] Enable user-customized packm ukernel/variant. (#549)

Details:
- Added four new fields to obj_t: .pack_fn, .pack_params, .ker_fn, and
  .ker_params. These fields store pointers to functions and data that
  will allow the user to more flexibly create custom operations while
  recycling BLIS's existing partitioning infrastructure.
- Updated typed API to packm variant and structure-aware kernels to
  replace the diagonal offset with panel offsets, and changed strides
  of both C and P to inc/ldim semantics. Updated object API to the packm
  variant to include rntm_t*.
- Removed the packm variant function pointer from the packm cntl_t node
  definition since it has been replaced by the .pack_fn pointer in the
  obj_t.
- Updated bli_packm_int() to read the new packm variant function pointer
  from the obj_t and call it instead of from the cntl_t node.
- Moved some of the logic of bli_l3_packm.c to a new file,
  bli_packm_alloc.c.
- Rewrote bli_packm_blk_var1.c so that it uses byte (char*) pointers
  instead of typed pointers, allowing a single function to be used
  regardless of datatype. This obviated having a separate implementation
  in bli_packm_blk_var1_md.c. Also relegated handling of scalars to a
  new function, bli_packm_scalar().
- Employed a new standard whereby right-hand matrix operands ("B") are
  always packed as column-stored row panels -- that is, identically to
  that of left-hand matrix operands ("A"). This means that while we pack
  matrix A normally, we actually pack B in a transposed state. This
  allowed us to simplify a lot of code throughout the framework, and
  also affected some of the logic in bli_l3_packa() and _packb().
- Simplified bli_packm_init.c in light of the new B^T convention
  described above. bli_packm_init()--which is now called from within
  bli_packm_blk_var1()--also now calls bli_packm_alloc() and returns
  a bool that indicates whether packing should be performed (or
  skipped).
- Consolidated bli_gemm_int() and bli_trsm_int() into a bli_l3_int(),
  which, among other things, defaults the new .pack_fn field of the
  obj_t to bli_packm_blk_var1() if the field is NULL.
- Defined a new function, bli_obj_reset_origin(), which permanently
  refocuses the view of an object so that it "forgets" any offsets from
  its original pointer. This function also sets the object's root field
  to itself. Calls to bli_obj_reset_origin() for each matrix operand
  appear in the _front() functions, after the obj_t's are aliased. This
  resetting of the underlying matrices' origins is needed in preparation
  for more advanced features from within custom packm kernels.
- Redefined bli_pba_rntm_set_pba() from a regular function to a static
  inline function.
- Updated gemm_ukr, gemmtrsm_ukr, and trsm_ukr testsuite modules to use
  libblis_test_pobj_create() to create local packed objects. Previously,
  these packed objects were created by calling lower-level functions.
---
 build/libblis-symbols.def                     |   1 -
 frame/1m/bli_l1m_ft_ker.h                     |  18 +-
 frame/1m/bli_l1m_oft_var.h                    |   1 +
 frame/1m/packm/bli_packm.h                    |   8 +-
 .../{bli_packm_var.h => bli_packm_alloc.c}    | 139 ++-
 .../packm/bli_packm_alloc.h}                  |  17 +-
 frame/1m/packm/bli_packm_blk_var1.c           | 824 +++++-------------
 frame/1m/packm/bli_packm_blk_var1.h           |  59 ++
 frame/1m/packm/bli_packm_blk_var1_md.c        | 344 --------
 frame/1m/packm/bli_packm_blk_var1_md.h        |  67 --
 frame/1m/packm/bli_packm_cntl.c               |   4 +-
 frame/1m/packm/bli_packm_cntl.h               |   7 -
 frame/1m/packm/bli_packm_init.c               | 437 ++--------
 frame/1m/packm/bli_packm_init.h               |  19 +-
 frame/1m/packm/bli_packm_int.c                |  56 +-
 frame/1m/packm/bli_packm_int.h                |   1 +
 .../packm/bli_packm_scalar.c}                 | 106 +--
 .../{bli_packm_md.h => bli_packm_scalar.h}    |   3 +-
 frame/1m/packm/bli_packm_struc_cxk.c          | 327 +++----
 frame/1m/packm/bli_packm_struc_cxk.h          |  73 +-
 frame/1m/packm/bli_packm_struc_cxk_1er.c      | 335 +++----
 frame/1m/packm/bli_packm_struc_cxk_1er.h      |  76 +-
 frame/1m/packm/bli_packm_struc_cxk_md.c       |  59 +-
 frame/1m/packm/bli_packm_struc_cxk_md.h       |  21 +-
 frame/1m/packm/bli_packm_unb_var1.c           | 297 -------
 frame/1m/packm/bli_packm_unb_var1.h           |  66 --
 frame/1m/unpackm/bli_unpackm.h                |   2 -
 frame/1m/unpackm/bli_unpackm_unb_var1.c       | 131 ---
 frame/1m/unpackm/bli_unpackm_unb_var1.h       |  60 --
 frame/3/bli_l3.h                              |   3 +-
 frame/3/bli_l3_check.c                        |   6 +-
 frame/3/{trsm/bli_trsm_int.c => bli_l3_int.c} |  77 +-
 frame/3/{gemm/bli_gemm_int.h => bli_l3_int.h} |   2 +-
 frame/3/bli_l3_oft_var.h                      |  19 +-
 .../bli_gemm_packab.c => bli_l3_packab.c}     |  45 +-
 .../{trsm/bli_trsm_int.h => bli_l3_packab.h}  |  15 +-
 frame/3/bli_l3_packm.c                        | 187 ----
 frame/3/gemm/bli_gemm.h                       |   1 -
 frame/3/gemm/bli_gemm_blk_var1.c              |   2 +-
 frame/3/gemm/bli_gemm_blk_var2.c              |   2 +-
 frame/3/gemm/bli_gemm_blk_var3.c              |   2 +-
 frame/3/gemm/bli_gemm_cntl.c                  |  15 +-
 frame/3/gemm/bli_gemm_front.c                 |  17 +-
 frame/3/gemm/bli_gemm_int.c                   | 127 ---
 frame/3/gemm/bli_gemm_var.h                   |   3 -
 frame/3/gemmt/bli_gemmt_front.c               |  11 +-
 frame/3/gemmt/bli_gemmt_x_ker_var2.c          |   6 +-
 frame/3/hemm/bli_hemm_front.c                 |  17 +-
 frame/3/symm/bli_symm_front.c                 |  17 +-
 frame/3/trmm/bli_trmm_front.c                 |  17 +-
 frame/3/trmm/bli_trmm_xx_ker_var2.c           |   8 +-
 frame/3/trmm3/bli_trmm3_front.c               |  17 +-
 frame/3/trsm/bli_trsm.h                       |   2 -
 frame/3/trsm/bli_trsm_blk_var1.c              |   6 +-
 frame/3/trsm/bli_trsm_blk_var2.c              |   4 +-
 frame/3/trsm/bli_trsm_blk_var3.c              |   2 +-
 frame/3/trsm/bli_trsm_cntl.c                  |  27 +-
 frame/3/trsm/bli_trsm_front.c                 |  17 +-
 frame/3/trsm/bli_trsm_var.h                   |   2 -
 frame/3/trsm/bli_trsm_xx_ker_var2.c           |   8 +-
 frame/base/bli_obj.c                          |   9 +-
 frame/base/bli_pba.c                          |  11 -
 frame/base/bli_pba.h                          |  11 +-
 frame/base/bli_sba.c                          |  84 +-
 frame/include/bli_obj_macro_defs.h            |  60 +-
 frame/include/bli_type_defs.h                 | 240 +++--
 testsuite/src/test_gemm_ukr.c                 |  78 +-
 testsuite/src/test_gemmtrsm_ukr.c             | 128 ++-
 testsuite/src/test_libblis.c                  |  41 +-
 testsuite/src/test_libblis.h                  |   2 +-
 testsuite/src/test_trsm_ukr.c                 |  98 +--
 71 files changed, 1290 insertions(+), 3714 deletions(-)
 rename frame/1m/packm/{bli_packm_var.h => bli_packm_alloc.c} (50%)
 rename frame/{3/bli_l3_packm.h => 1m/packm/bli_packm_alloc.h} (88%)
 create mode 100644 frame/1m/packm/bli_packm_blk_var1.h
 delete mode 100644 frame/1m/packm/bli_packm_blk_var1_md.c
 delete mode 100644 frame/1m/packm/bli_packm_blk_var1_md.h
 rename frame/{3/trsm/bli_trsm_packab.c => 1m/packm/bli_packm_scalar.c} (53%)
 rename frame/1m/packm/{bli_packm_md.h => bli_packm_scalar.h} (95%)
 delete mode 100644 frame/1m/packm/bli_packm_unb_var1.c
 delete mode 100644 frame/1m/packm/bli_packm_unb_var1.h
 delete mode 100644 frame/1m/unpackm/bli_unpackm_unb_var1.c
 delete mode 100644 frame/1m/unpackm/bli_unpackm_unb_var1.h
 rename frame/3/{trsm/bli_trsm_int.c => bli_l3_int.c} (74%)
 rename frame/3/{gemm/bli_gemm_int.h => bli_l3_int.h} (99%)
 rename frame/3/{gemm/bli_gemm_packab.c => bli_l3_packab.c} (80%)
 rename frame/3/{trsm/bli_trsm_int.h => bli_l3_packab.h} (90%)
 delete mode 100644 frame/3/bli_l3_packm.c
 delete mode 100644 frame/3/gemm/bli_gemm_int.c

diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def
index 97146a786..8d29d73b2 100644
--- a/build/libblis-symbols.def
+++ b/build/libblis-symbols.def
@@ -1307,7 +1307,6 @@ bli_pba_init_pools
 bli_pba_pool_size
 bli_pba_query
 bli_pba_release
-bli_pba_rntm_set_pba
 bli_memsys_finalize
 bli_memsys_init
 bli_mkherm
diff --git a/frame/1m/bli_l1m_ft_ker.h b/frame/1m/bli_l1m_ft_ker.h
index 1146ca7d2..2e813cf4a 100644
--- a/frame/1m/bli_l1m_ft_ker.h
+++ b/frame/1m/bli_l1m_ft_ker.h
@@ -50,21 +50,23 @@
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
        struc_t         strucc, \
-       doff_t          diagoffc, \
        diag_t          diagc, \
        uplo_t          uploc, \
        conj_t          conjc, \
        pack_t          schema, \
        bool            invdiag, \
-       dim_t           m_panel, \
-       dim_t           n_panel, \
-       dim_t           m_panel_max, \
-       dim_t           n_panel_max, \
+       dim_t           panel_dim, \
+       dim_t           panel_len, \
+       dim_t           panel_dim_max, \
+       dim_t           panel_len_max, \
+       dim_t           panel_dim_off, \
+       dim_t           panel_len_off, \
        ctype* restrict kappa, \
-       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
-       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
+       ctype* restrict c, inc_t incc, inc_t ldc, \
+       ctype* restrict p,             inc_t ldp, \
                           inc_t is_p, \
-       cntx_t*         cntx  \
+       cntx_t*         cntx, \
+       void*           params \
      );
 
 INSERT_GENTDEF( packm )
diff --git a/frame/1m/bli_l1m_oft_var.h b/frame/1m/bli_l1m_oft_var.h
index 15e9dae6f..0b60d4e2f 100644
--- a/frame/1m/bli_l1m_oft_var.h
+++ b/frame/1m/bli_l1m_oft_var.h
@@ -48,6 +48,7 @@ typedef void (*PASTECH(opname,_var_oft)) \
   obj_t*  a, \
   obj_t*  p, \
   cntx_t* cntx, \
+  rntm_t* rntm, \
   cntl_t* cntl, \
   thrinfo_t* thread  \
 );
diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h
index e8aa36328..88657a712 100644
--- a/frame/1m/packm/bli_packm.h
+++ b/frame/1m/packm/bli_packm.h
@@ -33,15 +33,15 @@
 
 */
 
+#include "bli_packm_alloc.h"
 #include "bli_packm_cntl.h"
 #include "bli_packm_check.h"
 #include "bli_packm_init.h"
 #include "bli_packm_int.h"
+#include "bli_packm_scalar.h"
 
 #include "bli_packm_part.h"
 
-#include "bli_packm_var.h"
-
 #include "bli_packm_struc_cxk.h"
 #include "bli_packm_struc_cxk_1er.h"
 
@@ -50,6 +50,8 @@
 
 // Mixed datatype support.
 #ifdef BLIS_ENABLE_GEMM_MD
-#include "bli_packm_md.h"
+#include "bli_packm_struc_cxk_md.h"
 #endif
 
+#include "bli_packm_blk_var1.h"
+
diff --git a/frame/1m/packm/bli_packm_var.h b/frame/1m/packm/bli_packm_alloc.c
similarity index 50%
rename from frame/1m/packm/bli_packm_var.h
rename to frame/1m/packm/bli_packm_alloc.c
index 723e6fdb4..df6750d7a 100644
--- a/frame/1m/packm/bli_packm_var.h
+++ b/frame/1m/packm/bli_packm_alloc.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2016, Hewlett Packard Enterprise Development LP
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -33,78 +33,67 @@
 
 */
 
-//
-// Prototype object-based interfaces.
-//
-
-#undef  GENPROT
-#define GENPROT( opname ) \
-\
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
-     ( \
-       obj_t*   c, \
-       obj_t*   p, \
-       cntx_t*  cntx, \
-       cntl_t*  cntl, \
-       thrinfo_t* t  \
-     );
-
-GENPROT( packm_unb_var1 )
-GENPROT( packm_blk_var1 )
-
-//
-// Prototype BLAS-like interfaces with void pointer operands.
-//
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       struc_t strucc, \
-       doff_t  diagoffc, \
-       diag_t  diagc, \
-       uplo_t  uploc, \
-       trans_t transc, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   m_max, \
-       dim_t   n_max, \
-       void*   kappa, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       void*   p, inc_t rs_p, inc_t cs_p, \
-       cntx_t* cntx  \
-     );
-
-INSERT_GENTPROT_BASIC0( packm_unb_var1 )
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       struc_t strucc, \
-       doff_t  diagoffc, \
-       diag_t  diagc, \
-       uplo_t  uploc, \
-       trans_t transc, \
-       pack_t  schema, \
-       bool    invdiag, \
-       bool    revifup, \
-       bool    reviflo, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   m_max, \
-       dim_t   n_max, \
-       void*   kappa, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       void*   p, inc_t rs_p, inc_t cs_p, \
-                  inc_t is_p, \
-                  dim_t pd_p, inc_t ps_p, \
-       void_fp packm_ker, \
-       cntx_t* cntx, \
-       thrinfo_t* thread  \
-     );
-
-INSERT_GENTPROT_BASIC0( packm_blk_var1 )
+#include "blis.h"
+
+void* bli_packm_alloc
+      (
+        siz_t      size_needed,
+        rntm_t*    rntm,
+        cntl_t*    cntl,
+        thrinfo_t* thread
+      )
+{
+	// Query the pack buffer type from the control tree node.
+	packbuf_t pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl );
+
+	// Query the address of the mem_t entry within the control tree node.
+	mem_t* cntl_mem_p = bli_cntl_pack_mem( cntl );
+
+	mem_t* local_mem_p;
+	mem_t  local_mem_s;
+
+	siz_t cntl_mem_size = 0;
+
+	if ( bli_mem_is_alloc( cntl_mem_p ) )
+        cntl_mem_size = bli_mem_size( cntl_mem_p );
+
+	if ( cntl_mem_size < size_needed )
+	{
+		if ( bli_thread_am_ochief( thread ) )
+		{
+			// The chief thread releases the existing block associated with
+			// the mem_t entry in the control tree, and then re-acquires a
+			// new block, saving the associated mem_t entry to local_mem_s.
+	        if ( bli_mem_is_alloc( cntl_mem_p ) )
+            {
+    			bli_pba_release
+    			(
+    			  rntm,
+    			  cntl_mem_p
+    			);
+            }
+			bli_pba_acquire_m
+			(
+			  rntm,
+			  size_needed,
+			  pack_buf_type,
+			  &local_mem_s
+			);
+		}
+
+		// Broadcast the address of the chief thread's local mem_t entry to
+		// all threads.
+		local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
+
+		// Save the chief thread's local mem_t entry to the mem_t field in
+		// this thread's control tree node.
+		*cntl_mem_p = *local_mem_p;
+
+        // Barrier so that the chief thread doesn't return from the function
+        // before every thread has copied its stack-local mem_t entry.
+	    bli_thread_barrier( thread );
+	}
+
+    return bli_mem_buffer( cntl_mem_p );
+}
 
diff --git a/frame/3/bli_l3_packm.h b/frame/1m/packm/bli_packm_alloc.h
similarity index 88%
rename from frame/3/bli_l3_packm.h
rename to frame/1m/packm/bli_packm_alloc.h
index 696dabf59..b433be350 100644
--- a/frame/3/bli_l3_packm.h
+++ b/frame/1m/packm/bli_packm_alloc.h
@@ -5,7 +5,6 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -33,13 +32,11 @@
 
 */
 
-void bli_l3_packm
-     (
-       obj_t*  x,
-       obj_t*  x_pack,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     );
+BLIS_EXPORT_BLIS  void* bli_packm_alloc
+      (
+        siz_t      size_needed,
+        rntm_t*    rntm,
+        cntl_t*    cntl,
+        thrinfo_t* thread
+      );
 
diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c
index 5073f7812..edeeae2b9 100644
--- a/frame/1m/packm/bli_packm_blk_var1.c
+++ b/frame/1m/packm/bli_packm_blk_var1.c
@@ -35,35 +35,6 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T packm_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       struc_t strucc,
-       doff_t  diagoffc,
-       diag_t  diagc,
-       uplo_t  uploc,
-       trans_t transc,
-       pack_t  schema,
-       bool    invdiag,
-       bool    revifup,
-       bool    reviflo,
-       dim_t   m,
-       dim_t   n,
-       dim_t   m_max,
-       dim_t   n_max,
-       void*   kappa,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       void*   p, inc_t rs_p, inc_t cs_p,
-                  inc_t is_p,
-                  dim_t pd_p, inc_t ps_p,
-       void_fp packm_ker,
-       cntx_t* cntx,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
-
 
 static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
 {
@@ -79,614 +50,265 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
         NULL,                      bli_zpackm_struc_cxk_1er,  } },
 };
 
+static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md);
 
 void bli_packm_blk_var1
      (
        obj_t*   c,
        obj_t*   p,
        cntx_t*  cntx,
+       rntm_t*  rntm,
        cntl_t*  cntl,
-       thrinfo_t* t
+       thrinfo_t* thread
      )
 {
-#ifdef BLIS_ENABLE_GEMM_MD
-	// Call a different packm implementation when the storage and target
-	// datatypes differ.
-	if ( bli_obj_dt( c ) != bli_obj_target_dt( c ) )
-	{
-		bli_packm_blk_var1_md( c, p, cntx, cntl, t );
+	// Extract various fields from the control tree.
+	pack_t schema  = bli_cntl_packm_params_pack_schema( cntl );
+	bool   invdiag = bli_cntl_packm_params_does_invert_diag( cntl );
+	bool   revifup = bli_cntl_packm_params_rev_iter_if_upper( cntl );
+	bool   reviflo = bli_cntl_packm_params_rev_iter_if_lower( cntl );
+
+	// Every thread initializes p and determines the size of memory
+	// block needed (which gets embedded into the otherwise "blank" mem_t
+	// entry in the control tree node). Return early if no packing is required.
+	if ( !bli_packm_init( c, p, cntx, rntm, cntl, thread ) )
 		return;
-	}
-#endif
 
-	num_t     dt_p       = bli_obj_dt( p );
+	// Check parameters.
+	if ( bli_error_checking_is_enabled() )
+		bli_packm_int_check( c, p, cntx );
 
-	struc_t   strucc     = bli_obj_struc( c );
-	doff_t    diagoffc   = bli_obj_diag_offset( c );
-	diag_t    diagc      = bli_obj_diag( c );
-	uplo_t    uploc      = bli_obj_uplo( c );
-	trans_t   transc     = bli_obj_conjtrans_status( c );
-	pack_t    schema     = bli_obj_pack_schema( p );
-	bool      invdiag    = bli_obj_has_inverted_diag( p );
-	bool      revifup    = bli_obj_is_pack_rev_if_upper( p );
-	bool      reviflo    = bli_obj_is_pack_rev_if_lower( p );
+	num_t   dt_c           = bli_obj_dt( c );
+	dim_t   dt_c_size      = bli_dt_size( dt_c );
 
-	dim_t     m_p        = bli_obj_length( p );
-	dim_t     n_p        = bli_obj_width( p );
-	dim_t     m_max_p    = bli_obj_padded_length( p );
-	dim_t     n_max_p    = bli_obj_padded_width( p );
+	num_t   dt_p           = bli_obj_dt( p );
+	dim_t   dt_p_size      = bli_dt_size( dt_p );
 
-	void*     buf_c      = bli_obj_buffer_at_off( c );
-	inc_t     rs_c       = bli_obj_row_stride( c );
-	inc_t     cs_c       = bli_obj_col_stride( c );
+	struc_t strucc         = bli_obj_struc( c );
+	doff_t  diagoffc       = bli_obj_diag_offset( c );
+	diag_t  diagc          = bli_obj_diag( c );
+	uplo_t  uploc          = bli_obj_uplo( c );
+	conj_t  conjc          = bli_obj_conj_status( c );
 
-	void*     buf_p      = bli_obj_buffer_at_off( p );
-	inc_t     rs_p       = bli_obj_row_stride( p );
-	inc_t     cs_p       = bli_obj_col_stride( p );
-	inc_t     is_p       = bli_obj_imag_stride( p );
-	dim_t     pd_p       = bli_obj_panel_dim( p );
-	inc_t     ps_p       = bli_obj_panel_stride( p );
+	dim_t   iter_dim       = bli_obj_length( p );
+	dim_t   panel_len_full = bli_obj_width( p );
+	dim_t   panel_len_max  = bli_obj_padded_width( p );
 
-	obj_t     kappa;
-	void*     buf_kappa;
+	char*   c_cast         = bli_obj_buffer_at_off( c );
+	inc_t   incc           = bli_obj_row_stride( c );
+	inc_t   ldc            = bli_obj_col_stride( c );
+	dim_t   panel_dim_off  = bli_obj_row_off( c );
+	dim_t   panel_len_off  = bli_obj_col_off( c );
 
-	func_t*   packm_kers;
-	void_fp   packm_ker;
+	char*   p_cast         = bli_obj_buffer( p );
+	inc_t   ldp            = bli_obj_col_stride( p );
+	inc_t   is_p           = bli_obj_imag_stride( p );
+	dim_t   panel_dim_max  = bli_obj_panel_dim( p );
+	inc_t   ps_p           = bli_obj_panel_stride( p );
 
-	FUNCPTR_T f;
+	doff_t  diagoffc_inc   = ( doff_t )panel_dim_max;
 
+	obj_t   kappa_local;
+	char*   kappa_cast     = bli_packm_scalar( &kappa_local, p );
 
-	// Treatment of kappa (ie: packing during scaling) depends on
-	// whether we are executing an induced method.
-	if ( bli_is_nat_packed( schema ) )
-	{
-		// This branch is for native execution, where we assume that
-		// the micro-kernel will always apply the alpha scalar of the
-		// higher-level operation. Thus, we use BLIS_ONE for kappa so
-		// that the underlying packm implementation does not perform
-		// any scaling during packing.
-		buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE );
-	}
-	else // if ( bli_is_ind_packed( schema ) )
-	{
-		obj_t* kappa_p;
-
-		// The value for kappa we use will depend on whether the scalar
-		// attached to A has a nonzero imaginary component. If it does,
-		// then we will apply the scalar during packing to facilitate
-		// implementing induced complex domain algorithms in terms of
-		// real domain micro-kernels. (In the aforementioned situation,
-		// applying a real scalar is easy, but applying a complex one is
-		// harder, so we avoid the need altogether with the code below.)
-		if ( bli_obj_scalar_has_nonzero_imag( p ) )
-		{
-			//printf( "applying non-zero imag kappa\n" );
+	// Use the default lookup table to determine the right func_t for the
+	// current pack schema.
+	func_t* packm_kers = &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ];
 
-			// Detach the scalar.
-			bli_obj_scalar_detach( p, &kappa );
+	// Query the datatype-specific function pointer from the func_t object.
+	packm_ker_vft packm_ker_cast = bli_func_get_dt( dt_p, packm_kers );
 
-			// Reset the attached scalar (to 1.0).
-			bli_obj_scalar_reset( p );
+	// For mixed-precision gemm, select the proper kernel (only dense panels).
+	if ( dt_c != dt_p )
+	{
+		packm_ker_cast = packm_struc_cxk_md[ dt_c ][ dt_p ];
+	}
 
-			kappa_p = &kappa;
-		}
-		else
-		{
-			// If the internal scalar of A has only a real component, then
-			// we will apply it later (in the micro-kernel), and so we will
-			// use BLIS_ONE to indicate no scaling during packing.
-			kappa_p = &BLIS_ONE;
-		}
+	// Query the address of the packm params field of the obj_t. The user might
+	// have set this field in order to specify a custom packm kernel.
+	packm_blk_var1_params_t* params = bli_obj_pack_params( c );
 
-		// Acquire the buffer to the kappa chosen above.
-		buf_kappa = bli_obj_buffer_for_1x1( dt_p, kappa_p );
+	if ( params && params->ukr_fn[ dt_c ][ dt_p ] )
+	{
+		// Query the user-provided packing kernel from the obj_t. If provided,
+		// this overrides the kernel determined above.
+		packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ];
 	}
 
+	/* Compute the total number of iterations we'll need. */
+	dim_t n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 );
 
-	// The original idea here was to read the packm_ukr from the context
-	// if it is non-NULL. The problem is, it requires that we be able to
-	// assume that the packm_ukr field is initialized to NULL, which it
-	// currently is not.
-
-	//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
+	/* Set the initial values and increments for indices related to C and P
+	   based on whether reverse iteration was requested. */
+	dim_t  ic0, ip0;
+	doff_t ic_inc, ip_inc;
 
-	//if ( bli_func_is_null_dt( dt_c, cntx_packm_kers ) )
+	if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) ||
+	     ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) )
 	{
-		// If the packm structure-aware kernel func_t in the context is
-		// NULL (which is the default value after the context is created),
-		// we use the default lookup table to determine the right func_t
-		// for the current schema.
-		const dim_t i = bli_pack_schema_index( schema );
-
-		packm_kers = &packm_struc_cxk_kers[ i ];
+		ic0    = (n_iter - 1) * panel_dim_max;
+		ic_inc = -panel_dim_max;
+		ip0    = n_iter - 1;
+		ip_inc = -1;
 	}
-#if 0
-	else // cntx's packm func_t overrides
+	else
 	{
-		// If the packm structure-aware kernel func_t in the context is
-		// non-NULL (ie: assumed to be valid), we use that instead.
-		//packm_kers = bli_cntx_packm_ukrs( cntx );
-		packm_kers = cntx_packm_kers;
+		ic0    = 0;
+		ic_inc = panel_dim_max;
+		ip0    = 0;
+		ip_inc = 1;
 	}
-#endif
 
-	// Query the datatype-specific function pointer from the func_t object.
-	packm_ker = bli_func_get_dt( dt_p, packm_kers );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_p];
-
-	// Invoke the function.
-	f( strucc,
-	   diagoffc,
-	   diagc,
-	   uploc,
-	   transc,
-	   schema,
-	   invdiag,
-	   revifup,
-	   reviflo,
-	   m_p,
-	   n_p,
-	   m_max_p,
-	   n_max_p,
-	   buf_kappa,
-	   buf_c, rs_c, cs_c,
-	   buf_p, rs_p, cs_p,
-	          is_p,
-	          pd_p, ps_p,
-	   packm_ker,
-	   cntx,
-	   t );
-}
+	// Query the number of threads and thread ids from the current thread's
+	// packm thrinfo_t node.
+	const dim_t nt  = bli_thread_n_way( thread );
+	const dim_t tid = bli_thread_work_id( thread );
 
+	// Determine the thread range and increment using the current thread's
+	// packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
+	// will depend on whether slab or round-robin partitioning was requested
+	// at configure-time.
+	dim_t it_start, it_end, it_inc;
+	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc );
 
-#undef  GENTFUNCR
-#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       struc_t strucc, \
-       doff_t  diagoffc, \
-       diag_t  diagc, \
-       uplo_t  uploc, \
-       trans_t transc, \
-       pack_t  schema, \
-       bool    invdiag, \
-       bool    revifup, \
-       bool    reviflo, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   m_max, \
-       dim_t   n_max, \
-       void*   kappa, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       void*   p, inc_t rs_p, inc_t cs_p, \
-                  inc_t is_p, \
-                  dim_t pd_p, inc_t ps_p, \
-       void_fp packm_ker, \
-       cntx_t* cntx, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
-\
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict p_cast     = p; \
-	ctype* restrict c_begin; \
-	ctype* restrict p_begin; \
-\
-	dim_t           iter_dim; \
-	dim_t           n_iter; \
-	dim_t           it, ic, ip; \
-	dim_t           ic0, ip0; \
-	doff_t          ic_inc, ip_inc; \
-	doff_t          diagoffc_i; \
-	doff_t          diagoffc_inc; \
-	dim_t           panel_len_full; \
-	dim_t           panel_len_i; \
-	dim_t           panel_len_max; \
-	dim_t           panel_len_max_i; \
-	dim_t           panel_dim_i; \
-	dim_t           panel_dim_max; \
-	dim_t           panel_off_i; \
-	inc_t           vs_c; \
-	inc_t           ldc; \
-	inc_t           ldp, p_inc; \
-	dim_t*          m_panel_full; \
-	dim_t*          n_panel_full; \
-	dim_t*          m_panel_use; \
-	dim_t*          n_panel_use; \
-	dim_t*          m_panel_max; \
-	dim_t*          n_panel_max; \
-	conj_t          conjc; \
-	bool            row_stored; \
-	bool            col_stored; \
-	inc_t           is_p_use; \
-\
-	ctype* restrict c_use; \
-	ctype* restrict p_use; \
-	doff_t          diagoffp_i; \
-\
-\
-	/* If C is zeros and part of a triangular matrix, then we don't need
-	   to pack it. */ \
-	if ( bli_is_zeros( uploc ) && \
-	     bli_is_triangular( strucc ) ) return; \
-\
-	/* Extract the conjugation bit from the transposition argument. */ \
-	conjc = bli_extract_conj( transc ); \
-\
-	/* If c needs a transposition, induce it so that we can more simply
-	   express the remaining parameters and code. */ \
-	if ( bli_does_trans( transc ) ) \
-	{ \
-		bli_swap_incs( &rs_c, &cs_c ); \
-		bli_negate_diag_offset( &diagoffc ); \
-		bli_toggle_uplo( &uploc ); \
-		bli_toggle_trans( &transc ); \
-	} \
-\
-	/* Create flags to incidate row or column storage. Note that the
-	   schema bit that encodes row or column is describing the form of
-	   micro-panel, not the storage in the micro-panel. Hence the
-	   mismatch in "row" and "column" semantics. */ \
-	row_stored = bli_is_col_packed( schema ); \
-	col_stored = bli_is_row_packed( schema ); \
-\
-	/* If the row storage flag indicates row storage, then we are packing
-	   to column panels; otherwise, if the strides indicate column storage,
-	   we are packing to row panels. */ \
-	if ( row_stored ) \
-	{ \
-		/* Prepare to pack to row-stored column panels. */ \
-		iter_dim       = n; \
-		panel_len_full = m; \
-		panel_len_max  = m_max; \
-		panel_dim_max  = pd_p; \
-		ldc            = rs_c; \
-		vs_c           = cs_c; \
-		diagoffc_inc   = -( doff_t )panel_dim_max; \
-		ldp            = rs_p; \
-		m_panel_full   = &m; \
-		n_panel_full   = &panel_dim_i; \
-		m_panel_use    = &panel_len_i; \
-		n_panel_use    = &panel_dim_i; \
-		m_panel_max    = &panel_len_max_i; \
-		n_panel_max    = &panel_dim_max; \
-	} \
-	else /* if ( col_stored ) */ \
-	{ \
-		/* Prepare to pack to column-stored row panels. */ \
-		iter_dim       = m; \
-		panel_len_full = n; \
-		panel_len_max  = n_max; \
-		panel_dim_max  = pd_p; \
-		ldc            = cs_c; \
-		vs_c           = rs_c; \
-		diagoffc_inc   = ( doff_t )panel_dim_max; \
-		ldp            = cs_p; \
-		m_panel_full   = &panel_dim_i; \
-		n_panel_full   = &n; \
-		m_panel_use    = &panel_dim_i; \
-		n_panel_use    = &panel_len_i; \
-		m_panel_max    = &panel_dim_max; \
-		n_panel_max    = &panel_len_max_i; \
-	} \
-\
-	/* Compute the total number of iterations we'll need. */ \
-	n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
-\
-	/* Set the initial values and increments for indices related to C and P
-	   based on whether reverse iteration was requested. */ \
-	if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
-	     ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
-	{ \
-		ic0    = (n_iter - 1) * panel_dim_max; \
-		ic_inc = -panel_dim_max; \
-		ip0    = n_iter - 1; \
-		ip_inc = -1; \
-	} \
-	else \
-	{ \
-		ic0    = 0; \
-		ic_inc = panel_dim_max; \
-		ip0    = 0; \
-		ip_inc = 1; \
-	} \
-\
-	p_begin = p_cast; \
-\
-	/* Query the number of threads and thread ids from the current thread's
-	   packm thrinfo_t node. */ \
-	const dim_t nt  = bli_thread_n_way( thread ); \
-	const dim_t tid = bli_thread_work_id( thread ); \
-\
-	dim_t it_start, it_end, it_inc; \
-\
-	/* Determine the thread range and increment using the current thread's
-	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
-	   will depend on whether slab or round-robin partitioning was requested
-	   at configure-time. */ \
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
-\
-	/* Iterate over every logical micropanel in the source matrix. */ \
-	for ( ic  = ic0,    ip  = ip0,    it  = 0; it < n_iter; \
-	      ic += ic_inc, ip += ip_inc, it += 1 ) \
-	{ \
-		panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
-\
-		diagoffc_i  = diagoffc + (ip  )*diagoffc_inc; \
-		c_begin     = c_cast   + (ic  )*vs_c; \
-\
-		if ( bli_is_triangular( strucc ) &&  \
-		     bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
-		{ \
-			/* This case executes if the panel belongs to a triangular
-			   matrix AND is completely unstored (ie: zero). If the panel
-			   is unstored, we do nothing. (Notice that we don't even
-			   increment p_begin.) */ \
-\
-			continue; \
-		} \
-		else if ( bli_is_triangular( strucc ) &&  \
-		          bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
-		{ \
-			/* This case executes if the panel belongs to a triangular
-			   matrix AND is diagonal-intersecting. Notice that we
-			   cannot bury the following conditional logic into
-			   packm_struc_cxk() because we need to know the value of
-			   panel_len_max_i so we can properly increment p_inc. */ \
-\
-			/* Sanity check. Diagonals should not intersect the short end of
-			   a micro-panel. If they do, then somehow the constraints on
-			   cache blocksizes being a whole multiple of the register
-			   blocksizes was somehow violated. */ \
-			if ( ( col_stored && diagoffc_i < 0 ) || \
-			     ( row_stored && diagoffc_i > 0 ) ) \
-				bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
-\
-			if      ( ( row_stored && bli_is_upper( uploc ) ) || \
-			          ( col_stored && bli_is_lower( uploc ) ) )  \
-			{ \
-				panel_off_i     = 0; \
-				panel_len_i     = bli_abs( diagoffc_i ) + panel_dim_i; \
-				panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
-				                           panel_len_max ); \
-				diagoffp_i      = diagoffc_i; \
-			} \
-			else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
-			             ( col_stored && bli_is_upper( uploc ) ) )  */ \
-			{ \
-				panel_off_i     = bli_abs( diagoffc_i ); \
-				panel_len_i     = panel_len_full - panel_off_i; \
-				panel_len_max_i = panel_len_max  - panel_off_i; \
-				diagoffp_i      = 0; \
-			} \
-\
-			c_use = c_begin + (panel_off_i  )*ldc; \
-			p_use = p_begin; \
-\
-			/* We need to re-compute the imaginary stride as a function of
-			   panel_len_max_i since triangular packed matrices have panels
-			   of varying lengths. NOTE: This imaginary stride value is
-			   only referenced by the packm kernels for induced methods. */ \
-			is_p_use  = ldp * panel_len_max_i; \
-\
-			/* We nudge the imaginary stride up by one if it is odd. */ \
-			is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
-\
-			/* NOTE: We MUST use round-robin partitioning when packing
-			   micropanels of a triangular matrix. Hermitian/symmetric
-			   and general packing may use slab or round-robin, depending
-			   on which was selected at configure-time. */ \
-			if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
-			{ \
-				packm_ker_cast( strucc, \
-				                diagoffp_i, \
-				                diagc, \
-				                uploc, \
-				                conjc, \
-				                schema, \
-				                invdiag, \
-				                *m_panel_use, \
-				                *n_panel_use, \
-				                *m_panel_max, \
-				                *n_panel_max, \
-				                kappa_cast, \
-				                c_use, rs_c, cs_c, \
-				                p_use, rs_p, cs_p, \
-			                           is_p_use, \
-				                cntx ); \
-			} \
-\
-			/* NOTE: This value is usually LESS than ps_p because triangular
-			   matrices usually have several micro-panels that are shorter
-			   than a "full" micro-panel. */ \
-			p_inc = is_p_use; \
-		} \
-		else if ( bli_is_herm_or_symm( strucc ) ) \
-		{ \
-			/* This case executes if the panel belongs to a Hermitian or
-			   symmetric matrix, which includes stored, unstored, and
-			   diagonal-intersecting panels. */ \
-\
-			c_use = c_begin; \
-			p_use = p_begin; \
-\
-			panel_len_i     = panel_len_full; \
-			panel_len_max_i = panel_len_max; \
-\
-			is_p_use = is_p; \
-\
-			/* The definition of bli_packm_my_iter() will depend on whether slab
-			   or round-robin partitioning was requested at configure-time. */ \
-			if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
-			{ \
-				packm_ker_cast( strucc, \
-				                diagoffc_i, \
-				                diagc, \
-				                uploc, \
-				                conjc, \
-				                schema, \
-				                invdiag, \
-				                *m_panel_use, \
-				                *n_panel_use, \
-				                *m_panel_max, \
-				                *n_panel_max, \
-				                kappa_cast, \
-				                c_use, rs_c, cs_c, \
-				                p_use, rs_p, cs_p, \
-			                           is_p_use, \
-				                cntx ); \
-			} \
-\
-			p_inc = ps_p; \
-		} \
-		else \
-		{ \
-			/* This case executes if the panel is general, or, if the
-			   panel is part of a triangular matrix and is neither unstored
-			   (ie: zero) nor diagonal-intersecting. */ \
-\
-			c_use = c_begin; \
-			p_use = p_begin; \
-\
-			panel_len_i     = panel_len_full; \
-			panel_len_max_i = panel_len_max; \
-\
-			is_p_use = is_p; \
-\
-			/* The definition of bli_packm_my_iter() will depend on whether slab
-			   or round-robin partitioning was requested at configure-time. */ \
-			if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
-			{ \
-				packm_ker_cast( BLIS_GENERAL, \
-				                0, \
-				                diagc, \
-				                BLIS_DENSE, \
-				                conjc, \
-				                schema, \
-				                invdiag, \
-				                *m_panel_use, \
-				                *n_panel_use, \
-				                *m_panel_max, \
-				                *n_panel_max, \
-				                kappa_cast, \
-				                c_use, rs_c, cs_c, \
-				                p_use, rs_p, cs_p, \
-			                           is_p_use, \
-				                cntx ); \
-			} \
-\
-			/* NOTE: This value is equivalent to ps_p. */ \
-			p_inc = ps_p; \
-		} \
-\
-		p_begin += p_inc; \
-\
-	} \
-}
+	char* p_begin = p_cast;
 
-INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1 )
+	// Iterate over every logical micropanel in the source matrix.
+	for ( dim_t ic  = ic0,    ip  = ip0,    it  = 0; it < n_iter;
+	            ic += ic_inc, ip += ip_inc, it += 1 )
+	{
+		dim_t  panel_dim_i     = bli_min( panel_dim_max, iter_dim - ic );
+		dim_t  panel_dim_off_i = panel_dim_off + ic;
+
+		doff_t diagoffc_i      = diagoffc + (ip  )*diagoffc_inc;
+		char*  c_begin         = c_cast   + (ic  )*incc*dt_c_size;
+
+		inc_t  p_inc           = ps_p;
+
+		// NOTE: We MUST use round-robin partitioning when packing
+		// micropanels of a triangular matrix, hence the explicit call to
+		// bli_packm_my_iter_rr() in that case. Hermitian/symmetric and
+		// general packing go through bli_packm_my_iter(), whose definition
+		// depends on whether slab or round-robin partitioning was requested
+		// at configure-time.
+		bool   my_iter         = bli_is_triangular( strucc )
+		    ? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt )
+		    : bli_packm_my_iter   ( it, it_start, it_end, tid, nt );
+
+		if ( bli_is_triangular( strucc ) &&
+		     bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len_full ) )
+		{
+			// This case executes if the panel belongs to a triangular
+			// matrix AND is completely unstored (ie: zero). If the panel
+			// is unstored, we do nothing. (Notice that we don't even
+			// increment p_begin.)
 
+			continue;
+		}
+		else if ( bli_is_triangular( strucc ) &&
+		          bli_intersects_diag_n( diagoffc_i, panel_dim_i, panel_len_full ) )
+		{
+			// This case executes if the panel belongs to a triangular
+			// matrix AND is diagonal-intersecting. Notice that we
+			// cannot bury the following conditional logic into
+			// packm_struc_cxk() because we need to know the value of
+			// panel_len_max_i so we can properly increment p_inc.
+
+			// Sanity check. Diagonals should not intersect the short end of
+			// a micro-panel. If they do, then the constraint that cache
+			// blocksizes be whole multiples of the register blocksizes was
+			// somehow violated.
+			if ( diagoffc_i < 0 )
+				bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+
+			dim_t  panel_off_i;
+			dim_t  panel_len_i;
+			dim_t  panel_len_max_i;
+
+			if ( bli_is_lower( uploc ) )
+			{
+				panel_off_i     = 0;
+				panel_len_i     = bli_abs( diagoffc_i ) + panel_dim_i;
+				panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max,
+				                           panel_len_max );
+			}
+			else // if ( bli_is_upper( uploc ) )
+			{
+				panel_off_i     = bli_abs( diagoffc_i );
+				panel_len_i     = panel_len_full - panel_off_i;
+				panel_len_max_i = panel_len_max  - panel_off_i;
+			}
+
+			dim_t panel_len_off_i = panel_off_i + panel_len_off;
+
+			char* c_use           = c_begin + (panel_off_i  )*ldc*dt_c_size;
+			char* p_use           = p_begin;
+
+			// We need to re-compute the imaginary stride as a function of
+			// panel_len_max_i since triangular packed matrices have panels
+			// of varying lengths. NOTE: This imaginary stride value is
+			// only referenced by the packm kernels for induced methods.
+			inc_t is_p_use = ldp * panel_len_max_i;
+
+			// We nudge the imaginary stride up by one if it is odd.
+			is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 );
+
+			if ( my_iter )
+			{
+				packm_ker_cast( strucc,
+				                diagc,
+				                uploc,
+				                conjc,
+				                schema,
+				                invdiag,
+				                panel_dim_i,
+				                panel_len_i,
+				                panel_dim_max,
+				                panel_len_max_i,
+				                panel_dim_off_i,
+				                panel_len_off_i,
+				                kappa_cast,
+				                c_use, incc, ldc,
+				                p_use,       ldp,
+				                       is_p_use,
+				                cntx,
+				                params );
+			}
+
+			// NOTE: This value is usually LESS than ps_p because triangular
+			// matrices usually have several micro-panels that are shorter
+			// than a "full" micro-panel.
+			p_inc = is_p_use;
+		}
+		else
+		{
+			// This case executes if the panel is general, is a fully stored
+			// panel of a triangular matrix, or belongs to a Hermitian or symmetric
+			// matrix (whether stored, unstored, or diagonal-intersecting).
+
+			if ( my_iter )
+			{
+				packm_ker_cast( bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc,
+				                diagc,
+				                uploc,
+				                conjc,
+				                schema,
+				                invdiag,
+				                panel_dim_i,
+				                panel_len_full,
+				                panel_dim_max,
+				                panel_len_max,
+				                panel_dim_off_i,
+				                panel_len_off,
+				                kappa_cast,
+				                c_begin, incc, ldc,
+				                p_begin,       ldp, is_p,
+				                cntx,
+				                params );
+			}
+		}
 
+		p_begin += p_inc*dt_p_size;
+	}
+}
 
-/*
-if ( row_stored ) \
-PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
-                      c_cast,        rs_c, cs_c, "%4.1f", "" ); \
-if ( col_stored ) \
-PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
-                      c_cast,        rs_c, cs_c, "%4.1f", "" ); \
-*/
-/*
-if ( row_stored ) \
-PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, \
-                               p_use, rs_p, cs_p, "%5.2f", "" ); \
-else \
-PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel_max, \
-                               p_use, rs_p, cs_p, "%5.2f", "" ); \
-*/ \
-\
-/*
-if ( col_stored ) { \
-	if ( bli_thread_work_id( thread ) == 0 ) \
-	{ \
-	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
-	fflush( stdout ); \
-	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
-	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
-	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
-	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
-	fflush( stdout ); \
-	} \
-bli_thread_barrier( thread ); \
-	if ( bli_thread_work_id( thread ) == 1 ) \
-	{ \
-	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
-	fflush( stdout ); \
-	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
-	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
-	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
-	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
-	fflush( stdout ); \
-	} \
-bli_thread_barrier( thread ); \
-} \
-else { \
-	if ( bli_thread_work_id( thread ) == 0 ) \
-	{ \
-	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
-	fflush( stdout ); \
-	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
-	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
-	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
-	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
-	fflush( stdout ); \
-	} \
-bli_thread_barrier( thread ); \
-	if ( bli_thread_work_id( thread ) == 1 ) \
-	{ \
-	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
-	fflush( stdout ); \
-	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
-	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
-	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
-	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
-	fflush( stdout ); \
-	} \
-bli_thread_barrier( thread ); \
-} \
-*/
-/*
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
-*/
-/*
-		if ( row_stored ) { \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )c_use,        2*rs_c, 2*cs_c, "%4.1f", "" ); \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
-		                       (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
-		inc_t is_b = rs_p * *m_panel_max; \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
-		} \
-*/
-/*
-		if ( col_stored ) { \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )c_use,        2*rs_c, 2*cs_c, "%4.1f", "" ); \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
-		                       (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
-		} \
-*/
diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h
new file mode 100644
index 000000000..9cda5828b
--- /dev/null
+++ b/frame/1m/packm/bli_packm_blk_var1.h
@@ -0,0 +1,59 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//
+// packm params types.
+//
+
+typedef struct
+{
+    //                   Type of C          Type of P
+    packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
+} packm_blk_var1_params_t;
+
+//
+// Prototype object-based interfaces.
+//
+
+BLIS_EXPORT_BLIS void bli_packm_blk_var1
+     (
+       obj_t*   c,
+       obj_t*   p,
+       cntx_t*  cntx,
+       rntm_t*  rntm,
+       cntl_t*  cntl,
+       thrinfo_t* t
+     );
+
diff --git a/frame/1m/packm/bli_packm_blk_var1_md.c b/frame/1m/packm/bli_packm_blk_var1_md.c
deleted file mode 100644
index a7c694e4f..000000000
--- a/frame/1m/packm/bli_packm_blk_var1_md.c
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#ifdef BLIS_ENABLE_GEMM_MD
-
-#define FUNCPTR_T packm_fp
-
-typedef void (*FUNCPTR_T)(
-                           trans_t transc,
-                           pack_t  schema,
-                           dim_t   m,
-                           dim_t   n,
-                           dim_t   m_max,
-                           dim_t   n_max,
-                           void*   kappa,
-                           void*   c, inc_t rs_c, inc_t cs_c,
-                           void*   p, inc_t rs_p, inc_t cs_p,
-                                      inc_t is_p,
-                                      dim_t pd_p, inc_t ps_p,
-                           cntx_t* cntx,
-                           thrinfo_t* thread
-                         );
-
-static FUNCPTR_T GENARRAY2_ALL(ftypes,packm_blk_var1_md);
-
-
-void bli_packm_blk_var1_md
-     (
-       obj_t*   c,
-       obj_t*   p,
-       cntx_t*  cntx,
-       cntl_t*  cntl,
-       thrinfo_t* t
-     )
-{
-	num_t     dt_c       = bli_obj_dt( c );
-	num_t     dt_p       = bli_obj_dt( p );
-
-	trans_t   transc     = bli_obj_conjtrans_status( c );
-	pack_t    schema     = bli_obj_pack_schema( p );
-
-	dim_t     m_p        = bli_obj_length( p );
-	dim_t     n_p        = bli_obj_width( p );
-	dim_t     m_max_p    = bli_obj_padded_length( p );
-	dim_t     n_max_p    = bli_obj_padded_width( p );
-
-	void*     buf_c      = bli_obj_buffer_at_off( c );
-	inc_t     rs_c       = bli_obj_row_stride( c );
-	inc_t     cs_c       = bli_obj_col_stride( c );
-
-	void*     buf_p      = bli_obj_buffer_at_off( p );
-	inc_t     rs_p       = bli_obj_row_stride( p );
-	inc_t     cs_p       = bli_obj_col_stride( p );
-	inc_t     is_p       = bli_obj_imag_stride( p );
-	dim_t     pd_p       = bli_obj_panel_dim( p );
-	inc_t     ps_p       = bli_obj_panel_stride( p );
-
-	obj_t     kappa;
-	void*     buf_kappa;
-
-	FUNCPTR_T f;
-
-
-	// Treatment of kappa (ie: packing during scaling) depends on
-	// whether we are executing an induced method.
-	if ( bli_is_nat_packed( schema ) )
-	{
-		// This branch is for native execution, where we assume that
-		// the micro-kernel will always apply the alpha scalar of the
-		// higher-level operation. Thus, we use BLIS_ONE for kappa so
-		// that the underlying packm implementation does not perform
-		// any scaling during packing.
-		buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE );
-	}
-	else // if ( bli_is_ind_packed( schema ) )
-	{
-		obj_t* kappa_p;
-
-		// The value for kappa we use will depend on whether the scalar
-		// attached to A has a nonzero imaginary component. If it does,
-		// then we will apply the scalar during packing to facilitate
-		// implementing induced complex domain algorithms in terms of
-		// real domain micro-kernels. (In the aforementioned situation,
-		// applying a real scalar is easy, but applying a complex one is
-		// harder, so we avoid the need altogether with the code below.)
-		if ( bli_obj_scalar_has_nonzero_imag( p ) )
-		{
-			// Detach the scalar.
-			bli_obj_scalar_detach( p, &kappa );
-
-			// Reset the attached scalar (to 1.0).
-			bli_obj_scalar_reset( p );
-
-			kappa_p = &kappa;
-		}
-		else
-		{
-			// If the internal scalar of A has only a real component, then
-			// we will apply it later (in the micro-kernel), and so we will
-			// use BLIS_ONE to indicate no scaling during packing.
-			kappa_p = &BLIS_ONE;
-		}
-
-		// Acquire the buffer to the kappa chosen above.
-		buf_kappa = bli_obj_buffer_for_1x1( dt_p, kappa_p );
-	}
-
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_c][dt_p];
-
-	// Invoke the function.
-	f(
-	   transc,
-	   schema,
-	   m_p,
-	   n_p,
-	   m_max_p,
-	   n_max_p,
-	   buf_kappa,
-	   buf_c, rs_c, cs_c,
-	   buf_p, rs_p, cs_p,
-	          is_p,
-	          pd_p, ps_p,
-	   cntx,
-	   t );
-}
-
-
-#undef  GENTFUNC2
-#define GENTFUNC2( ctype_c, ctype_p, chc, chp, varname ) \
-\
-void PASTEMAC2(chc,chp,varname) \
-     ( \
-       trans_t transc, \
-       pack_t  schema, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   m_max, \
-       dim_t   n_max, \
-       void*   kappa, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       void*   p, inc_t rs_p, inc_t cs_p, \
-                  inc_t is_p, \
-                  dim_t pd_p, inc_t ps_p, \
-       cntx_t* cntx, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	ctype_p* restrict kappa_cast = kappa; \
-	ctype_c* restrict c_cast     = c; \
-	ctype_p* restrict p_cast     = p; \
-	ctype_c* restrict c_begin; \
-	ctype_p* restrict p_begin; \
-\
-	dim_t             iter_dim; \
-	dim_t             n_iter; \
-	dim_t             it, ic, ip; \
-	doff_t            ic_inc, ip_inc; \
-	dim_t             panel_len_full; \
-	dim_t             panel_len_i; \
-	dim_t             panel_len_max; \
-	dim_t             panel_len_max_i; \
-	dim_t             panel_dim_i; \
-	dim_t             panel_dim_max; \
-	inc_t             vs_c; \
-	inc_t             p_inc; \
-	dim_t*            m_panel_use; \
-	dim_t*            n_panel_use; \
-	dim_t*            m_panel_max; \
-	dim_t*            n_panel_max; \
-	conj_t            conjc; \
-	bool              row_stored; \
-	bool              col_stored; \
-\
-	ctype_c* restrict c_use; \
-	ctype_p* restrict p_use; \
-\
-\
-	/* Extract the conjugation bit from the transposition argument. */ \
-	conjc = bli_extract_conj( transc ); \
-\
-	/* If c needs a transposition, induce it so that we can more simply
-	   express the remaining parameters and code. */ \
-	if ( bli_does_trans( transc ) ) \
-	{ \
-		bli_swap_incs( &rs_c, &cs_c ); \
-		bli_toggle_trans( &transc ); \
-	} \
-\
-	/* Create flags to incidate row or column storage. Note that the
-	   schema bit that encodes row or column is describing the form of
-	   micro-panel, not the storage in the micro-panel. Hence the
-	   mismatch in "row" and "column" semantics. */ \
-	row_stored = bli_is_col_packed( schema ); \
-	col_stored = bli_is_row_packed( schema ); \
-\
-	( void )col_stored; \
-\
-	/* If the row storage flag indicates row storage, then we are packing
-	   to column panels; otherwise, if the strides indicate column storage,
-	   we are packing to row panels. */ \
-	if ( row_stored ) \
-	{ \
-		/* Prepare to pack to row-stored column panels. */ \
-		iter_dim       = n; \
-		panel_len_full = m; \
-		panel_len_max  = m_max; \
-		panel_dim_max  = pd_p; \
-		vs_c           = cs_c; \
-		m_panel_use    = &panel_len_i; \
-		n_panel_use    = &panel_dim_i; \
-		m_panel_max    = &panel_len_max_i; \
-		n_panel_max    = &panel_dim_max; \
-	} \
-	else /* if ( col_stored ) */ \
-	{ \
-		/* Prepare to pack to column-stored row panels. */ \
-		iter_dim       = m; \
-		panel_len_full = n; \
-		panel_len_max  = n_max; \
-		panel_dim_max  = pd_p; \
-		vs_c           = rs_c; \
-		m_panel_use    = &panel_dim_i; \
-		n_panel_use    = &panel_len_i; \
-		m_panel_max    = &panel_dim_max; \
-		n_panel_max    = &panel_len_max_i; \
-	} \
-\
-	/* Compute the total number of iterations we'll need. */ \
-	n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
-\
-	{ \
-		ic_inc = panel_dim_max; \
-		ip_inc = 1; \
-	} \
-\
-	p_begin = p_cast; \
-\
-	/* Query the number of threads and thread ids from the current thread's
-	   packm thrinfo_t node. */ \
-	const dim_t nt  = bli_thread_n_way( thread ); \
-	const dim_t tid = bli_thread_work_id( thread ); \
-\
-	/* Suppress unused variable warnings when slab partitioning is enabled,
-	   since the slab-based definition of bli_packm_my_iter() does not
-	   actually use tid or nt. */ \
-	( void )nt; ( void )tid; \
-\
-	dim_t it_start, it_end, it_inc; \
-\
-	/* Determine the thread range and increment using the current thread's
-	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
-	   will depend on whether slab or round-robin partitioning was requested
-	   at configure-time. */ \
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
-\
-	for ( ic  = 0,      ip  = 0,      it  = 0; it < n_iter; \
-	      ic += ic_inc, ip += ip_inc, it += 1 ) \
-	{ \
-		panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
-\
-		c_begin     = c_cast + (ic  )*vs_c; \
-\
-		{ \
-			c_use = c_begin; \
-			p_use = p_begin; \
-\
-			panel_len_i     = panel_len_full; \
-			panel_len_max_i = panel_len_max; \
-\
-			if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
-			{ \
-				PASTEMAC2(chc,chp,packm_struc_cxk_md) \
-				( \
-				  conjc, \
-				  schema, \
-				  *m_panel_use, \
-				  *n_panel_use, \
-				  *m_panel_max, \
-				  *n_panel_max, \
-				  kappa_cast, \
-				  c_use, rs_c, cs_c, \
-				  p_use, rs_p, cs_p, \
-			             is_p, \
-				  cntx \
-				); \
-			} \
-\
-			p_inc = ps_p; \
-		} \
-\
-/*
-if ( row_stored ) \
-PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: b packed", *m_panel_max, *n_panel_max, \
-                                p_use, rs_p, cs_p, "%5.2f", "" ); \
-else \
-PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: a packed", *m_panel_max, *n_panel_max, \
-                                p_use, rs_p, cs_p, "%5.2f", "" ); \
-*/ \
-\
-		p_begin += p_inc; \
-\
-	} \
-}
-
-INSERT_GENTFUNC2_BASIC0( packm_blk_var1_md )
-INSERT_GENTFUNC2_MIXDP0( packm_blk_var1_md )
-
-#endif
diff --git a/frame/1m/packm/bli_packm_blk_var1_md.h b/frame/1m/packm/bli_packm_blk_var1_md.h
deleted file mode 100644
index e6bf151d0..000000000
--- a/frame/1m/packm/bli_packm_blk_var1_md.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_packm_blk_var1_md
-     (
-       obj_t*   c,
-       obj_t*   p,
-       cntx_t*  cntx,
-       cntl_t*  cntl,
-       thrinfo_t* t
-     );
-
-
-#undef  GENTPROT2
-#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
-\
-void PASTEMAC2(chc,chp,varname) \
-     ( \
-       trans_t transc, \
-       pack_t  schema, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   m_max, \
-       dim_t   n_max, \
-       void*   kappa, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       void*   p, inc_t rs_p, inc_t cs_p, \
-                  inc_t is_p, \
-                  dim_t pd_p, inc_t ps_p, \
-       cntx_t* cntx, \
-       thrinfo_t* thread  \
-     );
-
-INSERT_GENTPROT2_BASIC0( packm_blk_var1_md )
-INSERT_GENTPROT2_MIXDP0( packm_blk_var1_md )
-
diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c
index fc6ba8052..e99ed9cf3 100644
--- a/frame/1m/packm/bli_packm_cntl.c
+++ b/frame/1m/packm/bli_packm_cntl.c
@@ -35,11 +35,10 @@
 
 #include "blis.h"
 
-cntl_t* bli_packm_cntl_create_node
+BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
      (
        rntm_t*   rntm,
        void_fp   var_func,
-       void_fp   packm_var_func,
        bszid_t   bmid_m,
        bszid_t   bmid_n,
        bool      does_invert_diag,
@@ -62,7 +61,6 @@ cntl_t* bli_packm_cntl_create_node
 
 	// Initialize the packm_params_t struct.
 	params->size              = sizeof( packm_params_t );
-	params->var_func          = packm_var_func;
 	params->bmid_m            = bmid_m;
 	params->bmid_n            = bmid_n;
 	params->does_invert_diag  = does_invert_diag;
diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h
index 17aa196e8..14bfe1ce8 100644
--- a/frame/1m/packm/bli_packm_cntl.h
+++ b/frame/1m/packm/bli_packm_cntl.h
@@ -36,7 +36,6 @@
 struct packm_params_s
 {
 	uint64_t      size; // size field must be present and come first.
-	packm_var_oft var_func;
 	bszid_t       bmid_m;
 	bszid_t       bmid_n;
 	bool          does_invert_diag;
@@ -47,11 +46,6 @@ struct packm_params_s
 };
 typedef struct packm_params_s packm_params_t;
 
-BLIS_INLINE packm_var_oft bli_cntl_packm_params_var_func( cntl_t* cntl )
-{
-	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->var_func;
-}
-
 BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl )
 {
 	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m;
@@ -93,7 +87,6 @@ cntl_t* bli_packm_cntl_create_node
      (
        rntm_t*   rntm,
        void_fp   var_func,
-       void_fp   packm_var_func,
        bszid_t   bmid_m,
        bszid_t   bmid_n,
        bool      does_invert_diag,
diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c
index 739fd5f1d..5a7d716fe 100644
--- a/frame/1m/packm/bli_packm_init.c
+++ b/frame/1m/packm/bli_packm_init.c
@@ -35,12 +35,14 @@
 
 #include "blis.h"
 
-siz_t bli_packm_init
+bool bli_packm_init
      (
-       obj_t*  a,
+       obj_t*  c,
        obj_t*  p,
        cntx_t* cntx,
-       cntl_t* cntl
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
      )
 {
 	bli_init_once();
@@ -51,139 +53,27 @@ siz_t bli_packm_init
 	// suitable block of memory from the memory allocator (if such a block
 	// of memory has not already been allocated previously).
 
-	bszid_t   bmult_id_m;
-	bszid_t   bmult_id_n;
-	bool      does_invert_diag;
-	bool      rev_iter_if_upper;
-	bool      rev_iter_if_lower;
-	pack_t    schema;
-	//packbuf_t pack_buf_type;
-	siz_t     size_needed;
-
 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
-		bli_packm_init_check( a, p, cntx );
-
-	// Extract various fields from the control tree.
-	bmult_id_m        = bli_cntl_packm_params_bmid_m( cntl );
-	bmult_id_n        = bli_cntl_packm_params_bmid_n( cntl );
-	does_invert_diag  = bli_cntl_packm_params_does_invert_diag( cntl );
-	rev_iter_if_upper = bli_cntl_packm_params_rev_iter_if_upper( cntl );
-	rev_iter_if_lower = bli_cntl_packm_params_rev_iter_if_lower( cntl );
-	schema            = bli_cntl_packm_params_pack_schema( cntl );
-	//pack_buf_type     = bli_cntl_packm_params_pack_buf_type( cntl );
-
-#if 0
-	// Let us now check to see if the object has already been packed. First
-	// we check if it has been packed to an unspecified (row or column)
-	// format, in which case we can alias the object and return.
-	// NOTE: The reason we don't need to even look at the control tree in
-	// this case is as follows: an object's pack status is only set to
-	// BLIS_PACKED_UNSPEC for situations when the actual format used is
-	// not important, as long as its packed into contiguous rows or
-	// contiguous columns. A good example of this is packing for matrix
-	// operands in the level-2 operations.
-	if ( bli_obj_pack_schema( a ) == BLIS_PACKED_UNSPEC )
-	{
-		bli_obj_alias_to( a, p );
-		return 0;
-	}
+		bli_packm_init_check( c, p, cntx );
 
-	// Now we check if the object has already been packed to the desired
-	// schema (as encoded in the control tree). If so, we can alias and
-	// return 0.
-	// NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED
-	// and thus packing will be called for (but in some cases packing has
-	// already taken place, or does not need to take place, and so that will
-	// be indicated by the pack status). Also, not all combinations of
-	// current pack status and desired pack schema are valid.
-	if ( bli_obj_pack_schema( a ) == pack_schema )
-	{
-		bli_obj_alias_to( a, p );
-		return 0;
-	}
-#endif
+	// We begin by copying the fields of C.
+	bli_obj_alias_to( c, p );
 
 	// If the object is marked as being filled with zeros, then we can skip
 	// the packm operation entirely and alias.
-	if ( bli_obj_is_zeros( a ) )
-	{
-		bli_obj_alias_to( a, p );
-		return 0;
-	}
-
-	// Prepare a few other variables based on properties of the control
-	// tree.
-
-	invdiag_t invert_diag;
-	packord_t pack_ord_if_up;
-	packord_t pack_ord_if_lo;
-
-	if ( does_invert_diag ) invert_diag = BLIS_INVERT_DIAG;
-	else                    invert_diag = BLIS_NO_INVERT_DIAG;
-
-	if ( rev_iter_if_upper ) pack_ord_if_up = BLIS_PACK_REV_IF_UPPER;
-	else                     pack_ord_if_up = BLIS_PACK_FWD_IF_UPPER;
-
-	if ( rev_iter_if_lower ) pack_ord_if_lo = BLIS_PACK_REV_IF_LOWER;
-	else                     pack_ord_if_lo = BLIS_PACK_FWD_IF_LOWER;
-
-	// Initialize object p for the final packed matrix.
-	size_needed
-	=
-	bli_packm_init_pack
-	(
-	  invert_diag,
-	  schema,
-	  pack_ord_if_up,
-	  pack_ord_if_lo,
-	  bmult_id_m,
-	  bmult_id_n,
-	  a,
-	  p,
-	  cntx
-	);
-
-	// Return the size needed for memory allocation of the packed buffer.
-	return size_needed;
-}
+	if ( bli_obj_is_zeros( c ) )
+		return false;
 
-
-siz_t bli_packm_init_pack
-     (
-       invdiag_t invert_diag,
-       pack_t    schema,
-       packord_t pack_ord_if_up,
-       packord_t pack_ord_if_lo,
-       bszid_t   bmult_id_m,
-       bszid_t   bmult_id_n,
-       obj_t*    a,
-       obj_t*    p,
-       cntx_t*   cntx
-     )
-{
-	bli_init_once();
-
-	num_t     dt_tar       = bli_obj_target_dt( a );
-	num_t     dt_scalar    = bli_obj_scalar_dt( a );
-	trans_t   transa       = bli_obj_onlytrans_status( a );
-	dim_t     m_a          = bli_obj_length( a );
-	dim_t     n_a          = bli_obj_width( a );
-	dim_t     bmult_m_def  = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx );
-	dim_t     bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx );
-	dim_t     bmult_n_def  = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx );
-	dim_t     bmult_n_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_n, cntx );
-
-	dim_t     m_p, n_p;
-	dim_t     m_p_pad, n_p_pad;
-	siz_t     size_p;
-	siz_t     elem_size_p;
-	inc_t     rs_p, cs_p;
-	inc_t     is_p;
-
-
-	// We begin by copying the fields of A.
-	bli_obj_alias_to( a, p );
+	// Extract various fields from the control tree.
+	bszid_t bmult_id_m   = bli_cntl_packm_params_bmid_m( cntl );
+	bszid_t bmult_id_n   = bli_cntl_packm_params_bmid_n( cntl );
+	pack_t  schema       = bli_cntl_packm_params_pack_schema( cntl );
+	num_t   dt_tar       = bli_obj_target_dt( c );
+	num_t   dt_scalar    = bli_obj_scalar_dt( c );
+	dim_t   bmult_m_def  = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx );
+	dim_t   bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx );
+	dim_t   bmult_n_def  = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx );
 
 	// Typecast the internal scalar value to the target datatype.
 	// Note that if the typecasting is needed, this must happen BEFORE we
@@ -195,51 +85,21 @@ siz_t bli_packm_init_pack
 
 	// Update the storage datatype of P to be the target datatype of A.
 	bli_obj_set_dt( dt_tar, p );
+	bli_obj_set_elem_size( bli_dt_size( dt_tar ), p );
 
-	// Update the dimension fields to explicitly reflect a transposition,
-	// if needed.
-	// Then, clear the conjugation and transposition fields from the object
-	// since matrix packing in BLIS is deemed to take care of all conjugation
-	// and transposition necessary.
-	// Then, we adjust the properties of P when A needs a transposition.
-	// We negate the diagonal offset, and if A is upper- or lower-stored,
-	// we either toggle the uplo of P.
-	// Finally, if we mark P as dense since we assume that all matrices,
-	// regardless of structure, will be densified.
-	bli_obj_set_dims_with_trans( transa, m_a, n_a, p );
-	bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, p );
-	if ( bli_does_trans( transa ) )
-	{
-		bli_obj_negate_diag_offset( p );
-		if ( bli_obj_is_upper_or_lower( a ) )
-			bli_obj_toggle_uplo( p );
-	}
+	// Store the pack schema to the object.
+	bli_obj_set_pack_schema( schema, p );
 
-	// If we are packing micropanels, mark P as dense. Otherwise, we are
-	// probably being called in the context of a level-2 operation, in
-	// which case we do not want to overwrite the uplo field of P (inherited
-	// from A) with BLIS_DENSE because that information may be needed by
-	// the level-2 operation's unblocked variant to decide whether to
-	// execute a "lower" or "upper" branch of code.
-	if ( bli_is_panel_packed( schema ) )
-	{
-		bli_obj_set_uplo( BLIS_DENSE, p );
-	}
+	// Clear the conjugation field from the object since matrix packing
+	// in BLIS is deemed to take care of all conjugation necessary.
+	bli_obj_set_conj( BLIS_NO_CONJUGATE, p );
+
+	// Since we are packing micropanels, mark P as dense.
+	bli_obj_set_uplo( BLIS_DENSE, p );
 
 	// Reset the view offsets to (0,0).
 	bli_obj_set_offs( 0, 0, p );
 
-	// Set the invert diagonal field.
-	bli_obj_set_invert_diag( invert_diag, p );
-
-	// Set the pack status of P to the pack schema prescribed in the control
-	// tree node.
-	bli_obj_set_pack_schema( schema, p );
-
-	// Set the packing order bits.
-	bli_obj_set_pack_order_if_upper( pack_ord_if_up, p );
-	bli_obj_set_pack_order_if_lower( pack_ord_if_lo, p );
-
 	// Compute the dimensions padded by the dimension multiples. These
 	// dimensions will be the dimensions of the packed matrices, including
 	// zero-padding, and will be used by the macro- and micro-kernels.
@@ -247,10 +107,10 @@ siz_t bli_packm_init_pack
 	// in P) and aligning them to the dimension multiples (typically equal
 	// to register blocksizes). This does waste a little bit of space for
 	// level-2 operations, but that's okay with us.
-	m_p     = bli_obj_length( p );
-	n_p     = bli_obj_width( p );
-	m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def );
-	n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def );
+	dim_t m_p     = bli_obj_length( p );
+	dim_t n_p     = bli_obj_width( p );
+	dim_t m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def );
+	dim_t n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def );
 
 	// Save the padded dimensions into the packed object. It is important
 	// to save these dimensions since they represent the actual dimensions
@@ -258,177 +118,70 @@ siz_t bli_packm_init_pack
 	bli_obj_set_padded_dims( m_p_pad, n_p_pad, p );
 
 	// Now we prepare to compute strides, align them, and compute the
-	// total number of bytes needed for the packed buffer. The caller
-	// will then use that value to acquire an appropriate block of memory
-	// from the memory allocator.
+	// total number of bytes needed for the packed buffer. Then we use
+	// that value to acquire an appropriate block of memory from the
+	// memory allocator.
 
 	// Extract the element size for the packed object.
-	elem_size_p = bli_obj_elem_size( p );
-
-	// Set the row and column strides of p based on the pack schema.
-	if      ( bli_is_row_packed( schema ) &&
-	          !bli_is_panel_packed( schema ) )
-	{
-		// For regular row storage, the padded width of our matrix
-		// should be used for the row stride, with the column stride set
-		// to one. By using the WIDTH of the mem_t region, we allow for
-		// zero-padding (if necessary/desired) along the right edge of
-		// the matrix.
-		rs_p = n_p_pad;
-		cs_p = 1;
-
-		// Align the leading dimension according to the heap stride
-		// alignment size so that the second, third, etc rows begin at
-		// aligned addresses.
-		rs_p = bli_align_dim_to_size( rs_p, elem_size_p,
-		                              BLIS_HEAP_STRIDE_ALIGN_SIZE );
-
-		// Store the strides in P.
-		bli_obj_set_strides( rs_p, cs_p, p );
-
-		// Compute the size of the packed buffer.
-		size_p = m_p_pad * rs_p * elem_size_p;
-	}
-	else if ( bli_is_col_packed( schema ) &&
-	          !bli_is_panel_packed( schema ) )
-	{
-		// For regular column storage, the padded length of our matrix
-		// should be used for the column stride, with the row stride set
-		// to one. By using the LENGTH of the mem_t region, we allow for
-		// zero-padding (if necessary/desired) along the bottom edge of
-		// the matrix.
-		cs_p = m_p_pad;
-		rs_p = 1;
-
-		// Align the leading dimension according to the heap stride
-		// alignment size so that the second, third, etc columns begin at
-		// aligned addresses.
-		cs_p = bli_align_dim_to_size( cs_p, elem_size_p,
-		                              BLIS_HEAP_STRIDE_ALIGN_SIZE );
-
-		// Store the strides in P.
-		bli_obj_set_strides( rs_p, cs_p, p );
-
-		// Compute the size of the packed buffer.
-		size_p = cs_p * n_p_pad * elem_size_p;
-	}
-	else if ( bli_is_row_packed( schema ) &&
-	          bli_is_panel_packed( schema ) )
-	{
-		dim_t m_panel;
-		dim_t ps_p;
-
-		// The panel dimension (for each datatype) should be equal to the
-		// default (logical) blocksize multiple in the m dimension.
-		m_panel = bmult_m_def;
-
-		// The "column stride" of a row-micropanel packed object is interpreted
-		// as the column stride WITHIN a micropanel. Thus, this is equal to the
-		// packing (storage) blocksize multiple, which may be equal to the
-		// default (logical) blocksize multiple).
-		cs_p = bmult_m_pack;
-
-		// The "row stride" of a row-micropanel packed object is interpreted
-		// as the row stride WITHIN a micropanel. Thus, it is unit.
-		rs_p = 1;
-
-		// The "panel stride" of a micropanel packed object is interpreted as
-		// the distance between the (0,0) element of panel k and the (0,0)
-		// element of panel k+1. We use the padded width computed above to
-		// allow for zero-padding (if necessary/desired) along the far end
-		// of each micropanel (ie: the right edge of the matrix). Zero-padding
-		// can also occur along the long edge of the last micropanel if the m
-		// dimension of the matrix is not a whole multiple of MR.
-		ps_p = cs_p * n_p_pad;
-
-		// As a general rule, we don't want micropanel strides to be odd.
-		// NOTE: This safety feature *may* not be necessary anymore, but was
-		// definitely needed to support certain variations of the 3m method.
-		if ( bli_is_odd( ps_p ) ) ps_p += 1;
-
-		// Set the imaginary stride (in units of fundamental elements).
-		// This is the number of real elements that must be traversed before
-		// reaching the imaginary part of the packed micropanel. NOTE: the
-		// imaginary stride is mostly vestigial and left over from the 3m
-		// and 4m implementations.
-		is_p = 1;
-
-		// Store the strides and panel dimension in P.
-		bli_obj_set_strides( rs_p, cs_p, p );
-		bli_obj_set_imag_stride( is_p, p );
-		bli_obj_set_panel_dim( m_panel, p );
-		bli_obj_set_panel_stride( ps_p, p );
-		bli_obj_set_panel_length( m_panel, p );
-		bli_obj_set_panel_width( n_p, p );
-
-		// Compute the size of the packed buffer.
-		size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p;
-	}
-	else if ( bli_is_col_packed( schema ) &&
-	          bli_is_panel_packed( schema ) )
-	{
-		dim_t n_panel;
-		dim_t ps_p;
-
-		// The panel dimension (for each datatype) should be equal to the
-		// default (logical) blocksize multiple in the n dimension.
-		n_panel = bmult_n_def;
-
-		// The "row stride" of a column-micropanel packed object is interpreted
-		// as the row stride WITHIN a micropanel. Thus, this is equal to the
-		// packing (storage) blocksize multiple (which may be equal to the
-		// default (logical) blocksize multiple.
-		rs_p = bmult_n_pack;
-
-		// The "column stride" of a column-micropanel packed object is
-		// interpreted as the column stride WITHIN a micropanel. Thus, it is
-		// unit.
-		cs_p = 1;
-
-		// The "panel stride" of a micropanel packed object is interpreted as
-		// the distance between the (0,0) element of panel k and the (0,0)
-		// element of panel k+1. We use the padded length computed above to
-		// allow for zero-padding (if necessary/desired) along the far end
-		// of each micropanel (ie: the bottom edge of the matrix). Zero-padding
-		// can also occur along the long edge of the last micropanel if the n
-		// dimension of the matrix is not a whole multiple of NR.
-		ps_p = m_p_pad * rs_p;
-
-		// As a general rule, we don't want micropanel strides to be odd.
-		// NOTE: This safety feature *may* not be necessary anymore, but was
-		// definitely needed to support certain variations of the 3m method.
-		if ( bli_is_odd( ps_p ) ) ps_p += 1;
-
-		// Set the imaginary stride (in units of fundamental elements).
-		// This is the number of real elements that must be traversed before
-		// reaching the imaginary part of the packed micropanel. NOTE: the
-		// imaginary stride is mostly vestigial and left over from the 3m
-		// and 4m implementations.
-		is_p = 1;
-
-		// Store the strides and panel dimension in P.
-		bli_obj_set_strides( rs_p, cs_p, p );
-		bli_obj_set_imag_stride( is_p, p );
-		bli_obj_set_panel_dim( n_panel, p );
-		bli_obj_set_panel_stride( ps_p, p );
-		bli_obj_set_panel_length( m_p, p );
-		bli_obj_set_panel_width( n_panel, p );
-
-		// Compute the size of the packed buffer.
-		size_p = ps_p * ( n_p_pad / n_panel ) * elem_size_p;
-	}
-	else
-	{
-		// NOTE: When implementing block storage, we only need to implement
-		// the following two cases:
-		// - row-stored blocks in row-major order
-		// - column-stored blocks in column-major order
-		// The other two combinations coincide with that of packed row-panel
-		// and packed column- panel storage.
-
-		size_p = 0;
-	}
-
-	return size_p;
+	siz_t elem_size_p = bli_obj_elem_size( p );
+
+	// The panel dimension (for each datatype) should be equal to the
+	// default (logical) blocksize multiple in the m dimension.
+	dim_t m_panel = bmult_m_def;
+
+	// The "column stride" of a row-micropanel packed object is interpreted
+	// as the column stride WITHIN a micropanel. Thus, this is equal to the
+	// packing (storage) blocksize multiple, which may be equal to the
+	// default (logical) blocksize multiple.
+	inc_t cs_p = bmult_m_pack;
+
+	// The "row stride" of a row-micropanel packed object is interpreted
+	// as the row stride WITHIN a micropanel. Thus, it is unit.
+	inc_t rs_p = 1;
+
+	// The "panel stride" of a micropanel packed object is interpreted as
+	// the distance between the (0,0) element of panel k and the (0,0)
+	// element of panel k+1. We use the padded width computed above to
+	// allow for zero-padding (if necessary/desired) along the far end
+	// of each micropanel (ie: the right edge of the matrix). Zero-padding
+	// can also occur along the long edge of the last micropanel if the m
+	// dimension of the matrix is not a whole multiple of MR.
+	inc_t ps_p = cs_p * n_p_pad;
+
+	// As a general rule, we don't want micropanel strides to be odd. There
+	// are very few instances where this can happen, but we've seen it happen
+	// more than zero times (such as for certain small problems), and so we
+	// check for it here.
+	if ( bli_is_odd( ps_p ) ) ps_p += 1;
+
+	// Set the imaginary stride (in units of fundamental elements).
+	// This is the number of real elements that must be traversed before
+	// reaching the imaginary part of the packed micropanel. NOTE: the
+	// imaginary stride is mostly vestigial and left over from the 3m
+	// and 4m implementations.
+	inc_t is_p = 1;
+
+	// Store the strides and panel dimension in P.
+	bli_obj_set_strides( rs_p, cs_p, p );
+	bli_obj_set_imag_stride( is_p, p );
+	bli_obj_set_panel_dim( m_panel, p );
+	bli_obj_set_panel_stride( ps_p, p );
+	bli_obj_set_panel_length( m_panel, p );
+	bli_obj_set_panel_width( n_p, p );
+
+	// Compute the size of the packed buffer.
+	siz_t size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p;
+
+	// If the requested size is zero, then we don't need to do any allocation.
+	if ( size_p == 0 )
+		return false;
+
+	// Update the buffer address in p to point to the buffer associated
+	// with the mem_t entry acquired from the memory broker (now cached in
+	// the control tree node).
+	void* buffer = bli_packm_alloc( size_p, rntm, cntl, thread );
+	bli_obj_set_buffer( buffer, p );
+
+	return true;
 }
 
diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h
index 9365a131e..152c6f15c 100644
--- a/frame/1m/packm/bli_packm_init.h
+++ b/frame/1m/packm/bli_packm_init.h
@@ -32,24 +32,13 @@
 
 */
 
-siz_t bli_packm_init
+BLIS_EXPORT_BLIS bool bli_packm_init
      (
        obj_t*  a,
        obj_t*  p,
        cntx_t* cntx,
-       cntl_t* cntl
-     );
-
-BLIS_EXPORT_BLIS siz_t bli_packm_init_pack
-     (
-       invdiag_t invert_diag,
-       pack_t    schema,
-       packord_t pack_ord_if_up,
-       packord_t pack_ord_if_lo,
-       bszid_t   bmult_id_m,
-       bszid_t   bmult_id_n,
-       obj_t*    a,
-       obj_t*    p,
-       cntx_t*   cntx
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
      );
 
diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c
index 6dc9ec85a..c9a2bb9db 100644
--- a/frame/1m/packm/bli_packm_int.c
+++ b/frame/1m/packm/bli_packm_int.c
@@ -39,59 +39,19 @@ void bli_packm_int
        obj_t*  a,
        obj_t*  p,
        cntx_t* cntx,
+       rntm_t* rntm,
        cntl_t* cntl,
        thrinfo_t* thread
      )
 {
 	bli_init_once();
 
-	packm_var_oft f;
+	// Extract the function pointer from the object.
+	packm_var_oft f = bli_obj_pack_fn( a );
 
-	// Check parameters.
-	if ( bli_error_checking_is_enabled() )
-		bli_packm_int_check( a, p, cntx );
-
-	// Sanity check; A should never have a zero dimension. If we must support
-	// it, then we should fold it into the next alias-and-early-exit block.
-	//if ( bli_obj_has_zero_dim( a ) ) bli_abort();
-
-	// Let us now check to see if the object has already been packed. First
-	// we check if it has been packed to an unspecified (row or column)
-	// format, in which case we can return, since by now aliasing has already
-	// taken place in packm_init().
-	// NOTE: The reason we don't need to even look at the control tree in
-	// this case is as follows: an object's pack status is only set to
-	// BLIS_PACKED_UNSPEC for situations when the actual format used is
-	// not important, as long as its packed into contiguous rows or
-	// contiguous columns. A good example of this is packing for matrix
-	// operands in the level-2 operations.
-	if ( bli_obj_pack_schema( a ) == BLIS_PACKED_UNSPEC )
-	{
-		return;
-	}
-
-	// At this point, we can be assured that cntl is not NULL. Now we check
-	// if the object has already been packed to the desired schema (as en-
-	// coded in the control tree). If so, we can return, as above.
-	// NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED
-	// and thus packing will be called for (but in some cases packing has
-	// already taken place, or does not need to take place, and so that will
-	// be indicated by the pack status). Also, not all combinations of
-	// current pack status and desired pack schema are valid.
-	if ( bli_obj_pack_schema( a ) == bli_cntl_packm_params_pack_schema( cntl ) )
-	{
-		return;
-	}
-
-	// If the object is marked as being filled with zeros, then we can skip
-	// the packm operation entirely.
-	if ( bli_obj_is_zeros( a ) )
-	{
-		return;
-	}
-
-	// Extract the function pointer from the current control tree node.
-	f = bli_cntl_packm_params_var_func( cntl );
+	// Barrier so that we know threads are done with previous computation
+	// with the same packing buffer before starting to pack.
+	bli_thread_barrier( thread );
 
 	// Invoke the variant with kappa_use.
 	f
@@ -99,8 +59,12 @@ void bli_packm_int
 	  a,
 	  p,
 	  cntx,
+	  rntm,
 	  cntl,
 	  thread
 	);
+
+	// Barrier so that packing is done before computation.
+	bli_thread_barrier( thread );
 }
 
diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h
index 573a299d6..16a5c2c34 100644
--- a/frame/1m/packm/bli_packm_int.h
+++ b/frame/1m/packm/bli_packm_int.h
@@ -37,6 +37,7 @@ void bli_packm_int
        obj_t*  a,
        obj_t*  p,
        cntx_t* cntx,
+       rntm_t* rntm,
        cntl_t* cntl,
        thrinfo_t* thread
      );
diff --git a/frame/3/trsm/bli_trsm_packab.c b/frame/1m/packm/bli_packm_scalar.c
similarity index 53%
rename from frame/3/trsm/bli_trsm_packab.c
rename to frame/1m/packm/bli_packm_scalar.c
index 841230d80..f613028c9 100644
--- a/frame/3/trsm/bli_trsm_packab.c
+++ b/frame/1m/packm/bli_packm_scalar.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2016, Hewlett Packard Enterprise Development LP
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,83 +35,42 @@
 
 #include "blis.h"
 
-void bli_trsm_packa
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
+void* bli_packm_scalar( obj_t* kappa, obj_t* p )
 {
-	obj_t a_pack;
+	num_t  dt_p   = bli_obj_dt( p );
+	pack_t schema = bli_obj_pack_schema( p );
 
-	// Pack matrix A according to the control tree node.
-	bli_l3_packm
-	(
-	  a,
-	  &a_pack,
-	  cntx,
-	  rntm,
-	  cntl,
-	  thread
-	);
+	// The value for kappa we use will depend on whether the scalar
+	// attached to A has a nonzero imaginary component. If it does,
+	// then we will apply the scalar during packing to facilitate
+	// implementing induced complex domain algorithms in terms of
+	// real domain micro-kernels. (In the aforementioned situation,
+	// applying a real scalar is easy, but applying a complex one is
+	// harder, so we avoid the need altogether with the code below.)
+	if ( bli_obj_scalar_has_nonzero_imag( p ) &&
+	     !bli_is_nat_packed( schema ) )
+	{
+		//printf( "applying non-zero imag kappa\n" );
 
-	// Proceed with execution using packed matrix A.
-	bli_trsm_int
-	(
-	  &BLIS_ONE,
-	  &a_pack,
-	  b,
-	  &BLIS_ONE,
-	  c,
-	  cntx,
-	  rntm,
-	  bli_cntl_sub_node( cntl ),
-	  bli_thrinfo_sub_node( thread )
-	);
-}
-
-// -----------------------------------------------------------------------------
-
-void bli_trsm_packb
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	obj_t b_pack;
+		// Detach the scalar.
+		bli_obj_scalar_detach( p, kappa );
 
-	// Pack matrix B according to the control tree node.
-	bli_l3_packm
-	(
-	  b,
-	  &b_pack,
-	  cntx,
-	  rntm,
-	  cntl,
-	  thread
-	);
+		// Reset the attached scalar (to 1.0).
+		bli_obj_scalar_reset( p );
 
-	// Proceed with execution using packed matrix B.
-	bli_trsm_int
-	(
-	  &BLIS_ONE,
-	  a,
-	  &b_pack,
-	  &BLIS_ONE,
-	  c,
-	  cntx,
-	  rntm,
-	  bli_cntl_sub_node( cntl ),
-	  bli_thrinfo_sub_node( thread )
-	);
+		return bli_obj_buffer_for_1x1( dt_p, kappa );
+	}
+	// This branch is also for native execution, where we assume that
+	// the micro-kernel will always apply the alpha scalar of the
+	// higher-level operation. Thus, we use BLIS_ONE for kappa so
+	// that the underlying packm implementation does not perform
+	// any scaling during packing.
+	else
+	{
+		// If the internal scalar of A has only a real component, then
+		// we will apply it later (in the micro-kernel), and so we will
+		// use BLIS_ONE to indicate no scaling during packing.
+		return bli_obj_buffer_for_1x1( dt_p, &BLIS_ONE );
+	}
 }
 
diff --git a/frame/1m/packm/bli_packm_md.h b/frame/1m/packm/bli_packm_scalar.h
similarity index 95%
rename from frame/1m/packm/bli_packm_md.h
rename to frame/1m/packm/bli_packm_scalar.h
index bb9d6d613..3745accf9 100644
--- a/frame/1m/packm/bli_packm_md.h
+++ b/frame/1m/packm/bli_packm_scalar.h
@@ -32,6 +32,5 @@
 
 */
 
-#include "bli_packm_blk_var1_md.h"
-#include "bli_packm_struc_cxk_md.h"
+BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p );
 
diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c
index a3b2d66e6..2a52c42de 100644
--- a/frame/1m/packm/bli_packm_struc_cxk.c
+++ b/frame/1m/packm/bli_packm_struc_cxk.c
@@ -40,57 +40,24 @@
 void PASTEMAC(ch,varname) \
      ( \
        struc_t         strucc, \
-       doff_t          diagoffc, \
        diag_t          diagc, \
        uplo_t          uploc, \
        conj_t          conjc, \
        pack_t          schema, \
        bool            invdiag, \
-       dim_t           m_panel, \
-       dim_t           n_panel, \
-       dim_t           m_panel_max, \
-       dim_t           n_panel_max, \
+       dim_t           panel_dim, \
+       dim_t           panel_len, \
+       dim_t           panel_dim_max, \
+       dim_t           panel_len_max, \
+       dim_t           panel_dim_off, \
+       dim_t           panel_len_off, \
        ctype* restrict kappa, \
-       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
-       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
+       ctype* restrict c, inc_t incc, inc_t ldc, \
+       ctype* restrict p,             inc_t ldp, \
                           inc_t is_p, \
        cntx_t*         cntx  \
      ) \
 { \
-	dim_t  panel_dim; \
-	dim_t  panel_dim_max; \
-	dim_t  panel_len; \
-	dim_t  panel_len_max; \
-	inc_t  incc, ldc; \
-	inc_t        ldp; \
-\
-\
-	/* Determine the dimensions and relative strides of the micro-panel
-	   based on its pack schema. */ \
-	if ( bli_is_col_packed( schema ) ) \
-	{ \
-		/* Prepare to pack to row-stored column panel. */ \
-		panel_dim     = n_panel; \
-		panel_dim_max = n_panel_max; \
-		panel_len     = m_panel; \
-		panel_len_max = m_panel_max; \
-		incc          = cs_c; \
-		ldc           = rs_c; \
-		ldp           = rs_p; \
-	} \
-	else /* if ( bli_is_row_packed( schema ) ) */ \
-	{ \
-		/* Prepare to pack to column-stored row panel. */ \
-		panel_dim     = m_panel; \
-		panel_dim_max = m_panel_max; \
-		panel_len     = n_panel; \
-		panel_len_max = n_panel_max; \
-		incc          = rs_c; \
-		ldc           = cs_c; \
-		ldp           = cs_p; \
-	} \
-\
-\
 	/* Handle micro-panel packing based on the structure of the matrix
 	   being packed. */ \
 	if      ( bli_is_general( strucc ) ) \
@@ -118,23 +85,21 @@ void PASTEMAC(ch,varname) \
 		PASTEMAC(ch,packm_herm_cxk) \
 		( \
 		  strucc, \
-		  diagoffc, \
+		  diagc, \
 		  uploc, \
 		  conjc, \
 		  schema, \
-		  m_panel, \
-		  n_panel, \
-		  m_panel_max, \
-		  n_panel_max, \
+		  invdiag, \
 		  panel_dim, \
-		  panel_dim_max, \
 		  panel_len, \
+		  panel_dim_max, \
 		  panel_len_max, \
+		  panel_dim_off, \
+		  panel_len_off, \
 		  kappa, \
-		  c, rs_c, cs_c, \
-		     incc, ldc, \
-		  p, rs_p, cs_p, \
-		           ldp, \
+		  c, incc, ldc, \
+		  p,       ldp, \
+		     is_p, \
 		  cntx  \
 		); \
 	} \
@@ -145,130 +110,24 @@ void PASTEMAC(ch,varname) \
 		PASTEMAC(ch,packm_tri_cxk) \
 		( \
 		  strucc, \
-		  diagoffc, \
 		  diagc, \
 		  uploc, \
 		  conjc, \
 		  schema, \
 		  invdiag, \
-		  m_panel, \
-		  n_panel, \
-		  m_panel_max, \
-		  n_panel_max, \
 		  panel_dim, \
-		  panel_dim_max, \
 		  panel_len, \
+		  panel_dim_max, \
 		  panel_len_max, \
+		  panel_dim_off, \
+		  panel_len_off, \
 		  kappa, \
-		  c, rs_c, cs_c, \
-		     incc, ldc, \
-		  p, rs_p, cs_p, \
-		           ldp, \
+		  c, incc, ldc, \
+		  p,       ldp, \
+		     is_p, \
 		  cntx  \
 		); \
 	} \
-\
-\
-	/* If m_panel < m_panel_max, or n_panel < n_panel_max, we would normally
-	   fill the edge region (the bottom m_panel_max - m_panel rows or right-
-	   side n_panel_max - n_panel columns) of the micropanel with zeros.
-	   However, this responsibility has been moved to the packm microkernel.
-	   This change allows experts to use custom kernels that pack to custom
-	   packing formats when the problem size is not a nice multiple of the
-	   register blocksize. */ \
-\
-/*
-	if ( m_panel != m_panel_max ) \
-	{ \
-		ctype* restrict zero   = PASTEMAC(ch,0); \
-		dim_t           i      = m_panel; \
-		dim_t           m_edge = m_panel_max - i; \
-		dim_t           n_edge = n_panel_max; \
-		ctype*          p_edge = p + (i  )*rs_p; \
-\
-		PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
-		( \
-		  BLIS_NO_CONJUGATE, \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  m_edge, \
-		  n_edge, \
-		  zero, \
-		  p_edge, rs_p, cs_p, \
-		  cntx, \
-		  NULL  \
-		); \
-	} \
-\
-	if ( n_panel != n_panel_max ) \
-	{ \
-		ctype* restrict zero   = PASTEMAC(ch,0); \
-		dim_t           j      = n_panel; \
-		dim_t           m_edge = m_panel_max; \
-		dim_t           n_edge = n_panel_max - j; \
-		ctype*          p_edge = p + (j  )*cs_p; \
-\
-		PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
-		( \
-		  BLIS_NO_CONJUGATE, \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  m_edge, \
-		  n_edge, \
-		  zero, \
-		  p_edge, rs_p, cs_p, \
-		  cntx, \
-		  NULL  \
-		); \
-	} \
-*/ \
-\
-\
-	if ( bli_is_triangular( strucc ) ) \
-	{ \
-		/* If this panel is an edge case in both panel dimension and length,
-		   then it must be a bottom-right corner case. Set the part of the
-		   diagonal that extends into the zero-padded region to identity.
-		   NOTE: This is actually only necessary when packing for trsm, as
-		   it helps prevent NaNs and Infs from creeping into the computation.
-		   However, we set the region to identity for trmm as well. Those
-		   1.0's end up getting muliplied by the 0.0's in the zero-padded
-		   region of the other matrix, so there is no harm in this. */ \
-		if ( m_panel != m_panel_max && \
-		     n_panel != n_panel_max ) \
-		{ \
-			ctype* restrict one    = PASTEMAC(ch,1); \
-			dim_t           i      = m_panel; \
-			dim_t           j      = n_panel; \
-			dim_t           m_br   = m_panel_max - i; \
-			dim_t           n_br   = n_panel_max - j; \
-			ctype*          p_br   = p + (i  )*rs_p + (j  )*cs_p; \
-\
-			PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
-			( \
-			  BLIS_NO_CONJUGATE, \
-			  0, \
-			  m_br, \
-			  n_br, \
-			  one, \
-			  p_br, rs_p, cs_p, \
-			  cntx, \
-			  NULL  \
-			); \
-		} \
-	} \
-\
-\
-/*
-	if ( bli_is_col_packed( schema ) ) \
-	PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: bp copied", m_panel_max, n_panel_max, \
-	                      p, rs_p, cs_p, "%4.1f", "" ); \
-	else if ( bli_is_row_packed( schema ) ) \
-	PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: ap copied", m_panel_max, n_panel_max, \
-	                      p, rs_p, cs_p, "%4.1f", "" ); \
-*/ \
 }
 
 INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk )
@@ -282,42 +141,31 @@ INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk )
 void PASTEMAC(ch,varname) \
      ( \
        struc_t         strucc, \
-       doff_t          diagoffc, \
+       diag_t          diagc, \
        uplo_t          uploc, \
        conj_t          conjc, \
        pack_t          schema, \
-       dim_t           m_panel, \
-       dim_t           n_panel, \
-       dim_t           m_panel_max, \
-       dim_t           n_panel_max, \
+       bool            invdiag, \
        dim_t           panel_dim, \
-       dim_t           panel_dim_max, \
        dim_t           panel_len, \
+       dim_t           panel_dim_max, \
        dim_t           panel_len_max, \
+       dim_t           panel_dim_off, \
+       dim_t           panel_len_off, \
        ctype* restrict kappa, \
-       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
-                          inc_t incc, inc_t ldc, \
-       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
-                                      inc_t ldp, \
+       ctype* restrict c, inc_t incc, inc_t ldc, \
+       ctype* restrict p,             inc_t ldp, \
+                          inc_t is_p, \
        cntx_t*         cntx  \
      ) \
 { \
-	doff_t  diagoffc_abs; \
-	dim_t   i, j; \
-	bool    row_stored; \
-	bool    col_stored; \
-\
-\
-	/* Create flags to incidate row or column storage. Note that the
-	   schema bit that encodes row or column is describing the form of
-	   micro-panel, not the storage in the micro-panel. Hence the
-	   mismatch in "row" and "column" semantics. */ \
-	row_stored = bli_is_col_packed( schema ); \
-	col_stored = bli_is_row_packed( schema ); \
+	doff_t diagoffc = panel_dim_off - panel_len_off; \
+	doff_t diagoffc_abs; \
+	dim_t  i, j; \
 \
 	/* Handle the case where the micro-panel does NOT intersect the
 	   diagonal separately from the case where it does intersect. */ \
-	if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
+	if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \
 	{ \
 		/* If the current panel is unstored, we need to make a few
 		   adjustments so we refer to the data where it is actually
@@ -325,10 +173,10 @@ void PASTEMAC(ch,varname) \
 		   implicitly assumes we are operating on a dense panel
 		   within a larger symmetric or Hermitian matrix, since a
 		   general matrix would not contain any unstored region.) */ \
-		if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
+		if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \
 		{ \
-			c = c + diagoffc * ( doff_t )cs_c + \
-			       -diagoffc * ( doff_t )rs_c;  \
+			c = c + diagoffc * ( doff_t )ldc + \
+			       -diagoffc * ( doff_t )incc;  \
 			bli_swap_incs( &incc, &ldc ); \
 \
 			if ( bli_is_hermitian( strucc ) ) \
@@ -350,7 +198,7 @@ void PASTEMAC(ch,varname) \
 		  cntx  \
 		); \
 	} \
-	else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
+	else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \
 	{ \
 		ctype* restrict c10; \
 		ctype* restrict p10; \
@@ -370,14 +218,12 @@ void PASTEMAC(ch,varname) \
 		   a micro-panel. If they do, then somehow the constraints on
 		   cache blocksizes being a whole multiple of the register
 		   blocksizes was somehow violated. */ \
-		if ( ( col_stored && diagoffc < 0 ) || \
-		     ( row_stored && diagoffc > 0 ) ) \
+		if ( diagoffc < 0 ) \
 			bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
 \
 		diagoffc_abs = bli_abs( diagoffc ); \
 \
-		if      ( ( row_stored && bli_is_upper( uploc ) ) || \
-		          ( col_stored && bli_is_lower( uploc ) ) ) \
+		if      ( bli_is_lower( uploc ) ) \
 		{ \
 			p10_dim    = panel_dim; \
 			p10_len    = diagoffc_abs; \
@@ -393,8 +239,8 @@ void PASTEMAC(ch,varname) \
 			diagoffc12 = diagoffc_abs - j; \
 			p12        = p + (j  )*ldp; \
 			c12        = c + (j  )*ldc; \
-			c12        = c12 + diagoffc12 * ( doff_t )cs_c + \
-			                  -diagoffc12 * ( doff_t )rs_c;  \
+			c12        = c12 + diagoffc12 * ( doff_t )ldc + \
+			                  -diagoffc12 * ( doff_t )incc;  \
 			incc12     = ldc; \
 			ldc12      = incc; \
 			conjc12    = conjc; \
@@ -402,16 +248,15 @@ void PASTEMAC(ch,varname) \
 			if ( bli_is_hermitian( strucc ) ) \
 				bli_toggle_conj( &conjc12 ); \
 		} \
-		else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
-		             ( col_stored && bli_is_upper( uploc ) ) ) */ \
+		else /* if ( bli_is_upper( uploc ) ) */ \
 		{ \
 			p10_dim    = panel_dim; \
 			p10_len    = diagoffc_abs + panel_dim; \
 			diagoffc10 = diagoffc; \
 			p10        = p; \
 			c10        = c; \
-			c10        = c10 + diagoffc10 * ( doff_t )cs_c + \
-			                  -diagoffc10 * ( doff_t )rs_c;  \
+			c10        = c10 + diagoffc10 * ( doff_t )ldc + \
+			                  -diagoffc10 * ( doff_t )incc;  \
 			incc10     = ldc; \
 			ldc10      = incc; \
 			conjc10    = conjc; \
@@ -486,8 +331,8 @@ void PASTEMAC(ch,varname) \
 			  transc, \
 			  p11_m, \
 			  p11_n, \
-			  c11, rs_c, cs_c, \
-			  p11, rs_p, cs_p, \
+			  c11, incc, ldc, \
+			  p11,    1, ldp, \
 			  cntx, \
 			  NULL  \
 			); \
@@ -503,7 +348,7 @@ void PASTEMAC(ch,varname) \
 				{ \
 					PASTEMAC(ch,seti0s)( *pi11 ); \
 \
-					pi11 += rs_p + cs_p; \
+					pi11 += 1 + ldp; \
 				} \
 			} \
 \
@@ -519,7 +364,7 @@ void PASTEMAC(ch,varname) \
 			  p11_m, \
 			  p11_n, \
 			  kappa, \
-			  p11, rs_p, cs_p, \
+			  p11, 1, ldp, \
 			  cntx, \
 			  NULL  \
 			); \
@@ -539,28 +384,26 @@ INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk )
 void PASTEMAC(ch,varname) \
      ( \
        struc_t         strucc, \
-       doff_t          diagoffp, \
        diag_t          diagc, \
        uplo_t          uploc, \
        conj_t          conjc, \
        pack_t          schema, \
        bool            invdiag, \
-       dim_t           m_panel, \
-       dim_t           n_panel, \
-       dim_t           m_panel_max, \
-       dim_t           n_panel_max, \
        dim_t           panel_dim, \
-       dim_t           panel_dim_max, \
        dim_t           panel_len, \
+       dim_t           panel_dim_max, \
        dim_t           panel_len_max, \
+       dim_t           panel_dim_off, \
+       dim_t           panel_len_off, \
        ctype* restrict kappa, \
-       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
-                          inc_t incc, inc_t ldc, \
-       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
-                                      inc_t ldp, \
+       ctype* restrict c, inc_t incc, inc_t ldc, \
+       ctype* restrict p,             inc_t ldp, \
+                          inc_t is_p, \
        cntx_t*         cntx  \
      ) \
 { \
+	doff_t diagoffc = panel_dim_off - panel_len_off; \
+\
 	/* Pack the panel. */ \
 	PASTEMAC(ch,kername) \
 	( \
@@ -584,11 +427,11 @@ void PASTEMAC(ch,varname) \
 		PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
-		  diagoffp, \
-		  m_panel, \
-		  n_panel, \
+		  diagoffc, \
+		  panel_dim, \
+		  panel_len, \
 		  kappa, \
-		  p, rs_p, cs_p, \
+		  p, 1, ldp, \
 		  cntx, \
 		  NULL  \
 		); \
@@ -599,10 +442,10 @@ void PASTEMAC(ch,varname) \
 	{ \
 		PASTEMAC2(ch,invertd,BLIS_TAPI_EX_SUF) \
 		( \
-		  diagoffp, \
-		  m_panel, \
-		  n_panel, \
-		  p, rs_p, cs_p, \
+		  diagoffc, \
+		  panel_dim, \
+		  panel_len, \
+		  p, 1, ldp, \
 		  cntx, \
 		  NULL  \
 		); \
@@ -621,23 +464,53 @@ void PASTEMAC(ch,varname) \
 		uplo_t          uplop = uploc; \
 \
 		bli_toggle_uplo( &uplop ); \
-		bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \
+		bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \
 \
 		PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
-		  diagoffp, \
+		  diagoffc, \
 		  BLIS_NONUNIT_DIAG, \
 		  uplop, \
-		  m_panel, \
-		  n_panel, \
+		  panel_dim, \
+		  panel_len, \
 		  zero, \
-		  p, rs_p, cs_p, \
+		  p, 1, ldp, \
 		  cntx, \
 		  NULL  \
 		); \
 	} \
 \
+	/* If this panel is an edge case in both panel dimension and length,
+	   then it must be a bottom-right corner case. Set the part of the
+	   diagonal that extends into the zero-padded region to identity.
+	   NOTE: This is actually only necessary when packing for trsm, as
+	   it helps prevent NaNs and Infs from creeping into the computation.
+	   However, we set the region to identity for trmm as well. Those
+	   1.0's end up getting multiplied by the 0.0's in the zero-padded
+	   region of the other matrix, so there is no harm in this. */ \
+	if ( panel_dim != panel_dim_max && \
+	     panel_len != panel_len_max ) \
+	{ \
+		ctype* restrict one    = PASTEMAC(ch,1); \
+		dim_t           i      = panel_dim; \
+		dim_t           j      = panel_len; \
+		dim_t           m_br   = panel_dim_max - i; \
+		dim_t           n_br   = panel_len_max - j; \
+		ctype*          p_br   = p + (i  ) + (j  )*ldp; \
+\
+		PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
+		( \
+		  BLIS_NO_CONJUGATE, \
+		  0, \
+		  m_br, \
+		  n_br, \
+		  one, \
+		  p_br, 1, ldp, \
+		  cntx, \
+		  NULL  \
+		); \
+	} \
 }
 
 INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk )
diff --git a/frame/1m/packm/bli_packm_struc_cxk.h b/frame/1m/packm/bli_packm_struc_cxk.h
index 08afb19bd..973a02612 100644
--- a/frame/1m/packm/bli_packm_struc_cxk.h
+++ b/frame/1m/packm/bli_packm_struc_cxk.h
@@ -38,84 +38,25 @@
 void PASTEMAC(ch,varname) \
      ( \
        struc_t         strucc, \
-       doff_t          diagoffp, \
        diag_t          diagc, \
        uplo_t          uploc, \
        conj_t          conjc, \
        pack_t          schema, \
        bool            invdiag, \
-       dim_t           m_panel, \
-       dim_t           n_panel, \
-       dim_t           m_panel_max, \
-       dim_t           n_panel_max, \
-       ctype* restrict kappa, \
-       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
-       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
-                          inc_t is_p, \
-       cntx_t*         cntx  \
-     );
-
-INSERT_GENTPROT_BASIC0( packm_struc_cxk )
-
-
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       struc_t         strucc, \
-       doff_t          diagoffc, \
-       uplo_t          uploc, \
-       conj_t          conjc, \
-       pack_t          schema, \
-       dim_t           m_panel, \
-       dim_t           n_panel, \
-       dim_t           m_panel_max, \
-       dim_t           n_panel_max, \
        dim_t           panel_dim, \
-       dim_t           panel_dim_max, \
        dim_t           panel_len, \
-       dim_t           panel_len_max, \
-       ctype* restrict kappa, \
-       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
-                          inc_t incc, inc_t ldc, \
-       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
-                                      inc_t ldp, \
-       cntx_t*         cntx  \
-     );
-
-INSERT_GENTPROT_BASIC0( packm_herm_cxk )
-
-
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       struc_t         strucc, \
-       doff_t          diagoffc, \
-       diag_t          diagc, \
-       uplo_t          uploc, \
-       conj_t          conjc, \
-       pack_t          schema, \
-       bool            invdiag, \
-       dim_t           m_panel, \
-       dim_t           n_panel, \
-       dim_t           m_panel_max, \
-       dim_t           n_panel_max, \
-       dim_t           panel_dim, \
        dim_t           panel_dim_max, \
-       dim_t           panel_len, \
        dim_t           panel_len_max, \
+       dim_t           panel_dim_off, \
+       dim_t           panel_len_off, \
        ctype* restrict kappa, \
-       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
-                          inc_t incc, inc_t ldc, \
-       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
-                                      inc_t ldp, \
+       ctype* restrict c, inc_t incc, inc_t ldc, \
+       ctype* restrict p,             inc_t ldp, \
+                          inc_t is_p, \
        cntx_t*         cntx  \
      );
 
+INSERT_GENTPROT_BASIC0( packm_struc_cxk )
+INSERT_GENTPROT_BASIC0( packm_herm_cxk )
 INSERT_GENTPROT_BASIC0( packm_tri_cxk )
 
diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.c b/frame/1m/packm/bli_packm_struc_cxk_1er.c
index a66ba5ff6..b3be9dff9 100644
--- a/frame/1m/packm/bli_packm_struc_cxk_1er.c
+++ b/frame/1m/packm/bli_packm_struc_cxk_1er.c
@@ -40,57 +40,25 @@
 void PASTEMAC(ch,varname) \
      ( \
        struc_t         strucc, \
-       doff_t          diagoffc, \
        diag_t          diagc, \
        uplo_t          uploc, \
        conj_t          conjc, \
        pack_t          schema, \
        bool            invdiag, \
-       dim_t           m_panel, \
-       dim_t           n_panel, \
-       dim_t           m_panel_max, \
-       dim_t           n_panel_max, \
+       dim_t           panel_dim, \
+       dim_t           panel_len, \
+       dim_t           panel_dim_max, \
+       dim_t           panel_len_max, \
+       dim_t           panel_dim_off, \
+       dim_t           panel_len_off, \
        ctype* restrict kappa, \
-       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
-       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
+       ctype* restrict c, inc_t incc, inc_t ldc, \
+       ctype* restrict p,             inc_t ldp, \
                           inc_t is_p, \
-       cntx_t*         cntx  \
+       cntx_t*         cntx, \
+       void*           params \
      ) \
 { \
-	dim_t  panel_dim; \
-	dim_t  panel_dim_max; \
-	dim_t  panel_len; \
-	dim_t  panel_len_max; \
-	inc_t  incc, ldc; \
-	inc_t  ldp; \
-\
-\
-	/* Determine the dimensions and relative strides of the micro-panel
-	   based on its pack schema. */ \
-	if ( bli_is_col_packed( schema ) ) \
-	{ \
-		/* Prepare to pack to row-stored column panel. */ \
-		panel_dim     = n_panel; \
-		panel_dim_max = n_panel_max; \
-		panel_len     = m_panel; \
-		panel_len_max = m_panel_max; \
-		incc          = cs_c; \
-		ldc           = rs_c; \
-		ldp           = rs_p; \
-	} \
-	else /* if ( bli_is_row_packed( schema ) ) */ \
-	{ \
-		/* Prepare to pack to column-stored row panel. */ \
-		panel_dim     = m_panel; \
-		panel_dim_max = m_panel_max; \
-		panel_len     = n_panel; \
-		panel_len_max = n_panel_max; \
-		incc          = rs_c; \
-		ldc           = cs_c; \
-		ldp           = cs_p; \
-	} \
-\
-\
 	/* Handle micro-panel packing based on the structure of the matrix
 	   being packed. */ \
 	if      ( bli_is_general( strucc ) ) \
@@ -108,7 +76,7 @@ void PASTEMAC(ch,varname) \
 		  kappa, \
 		  c, incc, ldc, \
 		  p,       ldp, \
-		  cntx  \
+		  cntx \
 		); \
 	} \
 	else if ( bli_is_herm_or_symm( strucc ) ) \
@@ -118,24 +86,23 @@ void PASTEMAC(ch,varname) \
 		PASTEMAC(ch,packm_herm_cxk_1er) \
 		( \
 		  strucc, \
-		  diagoffc, \
+		  diagc, \
 		  uploc, \
 		  conjc, \
 		  schema, \
-		  m_panel, \
-		  n_panel, \
-		  m_panel_max, \
-		  n_panel_max, \
+		  invdiag, \
 		  panel_dim, \
-		  panel_dim_max, \
 		  panel_len, \
+		  panel_dim_max, \
 		  panel_len_max, \
+		  panel_dim_off, \
+		  panel_len_off, \
 		  kappa, \
-		  c, rs_c, cs_c, \
-		     incc, ldc, \
-		  p, rs_p, cs_p, \
-		           ldp, \
-		  cntx  \
+		  c, incc, ldc, \
+		  p,       ldp, \
+		     is_p, \
+		  cntx, \
+		  params \
 		); \
 	} \
 	else /* ( bli_is_triangular( strucc ) ) */ \
@@ -145,125 +112,25 @@ void PASTEMAC(ch,varname) \
 		PASTEMAC(ch,packm_tri_cxk_1er) \
 		( \
 		  strucc, \
-		  diagoffc, \
 		  diagc, \
 		  uploc, \
 		  conjc, \
 		  schema, \
 		  invdiag, \
-		  m_panel, \
-		  n_panel, \
-		  m_panel_max, \
-		  n_panel_max, \
 		  panel_dim, \
-		  panel_dim_max, \
 		  panel_len, \
+		  panel_dim_max, \
 		  panel_len_max, \
+		  panel_dim_off, \
+		  panel_len_off, \
 		  kappa, \
-		  c, rs_c, cs_c, \
-		     incc, ldc, \
-		  p, rs_p, cs_p, \
-		           ldp, \
-		  cntx  \
-		); \
-	} \
-\
-\
-	/* If m_panel < m_panel_max, or n_panel < n_panel_max, we would normally
-	   fill the edge region (the bottom m_panel_max - m_panel rows or right-
-	   side n_panel_max - n_panel columns) of the micropanel with zeros.
-	   However, this responsibility has been moved to the packm microkernel.
-	   This change allows experts to use custom kernels that pack to custom
-	   packing formats when the problem size is not a nice multiple of the
-	   register blocksize. */ \
-/*
-	if ( m_panel != m_panel_max ) \
-	{ \
-		ctype* restrict zero   = PASTEMAC(ch,0); \
-		dim_t           offm   = m_panel; \
-		dim_t           offn   = 0; \
-		dim_t           m_edge = m_panel_max - m_panel; \
-		dim_t           n_edge = n_panel_max; \
-\
-		PASTEMAC(ch,set1ms_mxn) \
-		( \
-		  schema, \
-		  offm, \
-		  offn, \
-		  m_edge, \
-		  n_edge, \
-		  zero, \
-		  p, rs_p, cs_p, ldp  \
-		); \
-	} \
-\
-	if ( n_panel != n_panel_max ) \
-	{ \
-		ctype* restrict zero   = PASTEMAC(ch,0); \
-		dim_t           offm   = 0; \
-		dim_t           offn   = n_panel; \
-		dim_t           m_edge = m_panel_max; \
-		dim_t           n_edge = n_panel_max - n_panel; \
-\
-		PASTEMAC(ch,set1ms_mxn) \
-		( \
-		  schema, \
-		  offm, \
-		  offn, \
-		  m_edge, \
-		  n_edge, \
-		  zero, \
-		  p, rs_p, cs_p, ldp  \
+		  c, incc, ldc, \
+		  p,       ldp, \
+		     is_p, \
+		  cntx, \
+		  params \
 		); \
 	} \
-*/ \
-\
-	if ( bli_is_triangular( strucc ) ) \
-	{ \
-		/* If this micro-panel is an edge case in both panel dimension and
-		   length, then it must be a bottom-right corner case, which
-		   typically only happens for micro-panels being packed for trsm.
-		   (It also happens for trmm if kr > 1.) Here, we set the part of
-		   the diagonal that extends into the zero-padded region to
-		   identity. This prevents NaNs and Infs from creeping into the
-		   computation. If this code does execute for trmm, it is okay,
-		   because those 1.0's that extend into the bottom-right region
-		   end up getting muliplied by the 0.0's in the zero-padded region
-		   of the other matrix. */ \
-		if ( m_panel != m_panel_max && \
-		     n_panel != n_panel_max ) \
-		{ \
-			ctype* restrict one    = PASTEMAC(ch,1); \
-			dim_t           offm   = m_panel; \
-			dim_t           offn   = n_panel; \
-			dim_t           m_edge = m_panel_max - m_panel; \
-			dim_t           n_edge = n_panel_max - n_panel; \
-\
-			PASTEMAC(ch,set1ms_mxn_diag) \
-			( \
-			  schema, \
-			  offm, \
-			  offn, \
-			  m_edge, \
-			  n_edge, \
-			  one, \
-			  p, rs_p, cs_p, ldp  \
-			); \
-		} \
-	} \
-\
-\
-/*
-	if ( bli_is_1r_packed( schema ) ) { \
-	PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1r): bp", m_panel_max, 2*n_panel_max, \
-	                       ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \
-	} \
- \
-	if ( bli_is_1e_packed( schema ) ) { \
-	PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1e): ap", 2*m_panel_max, 2*n_panel_max, \
-	                       ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \
-	} \
-*/ \
 }
 
 INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er )
@@ -277,42 +144,32 @@ INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er )
 void PASTEMAC(ch,varname) \
      ( \
        struc_t         strucc, \
-       doff_t          diagoffc, \
+       diag_t          diagc, \
        uplo_t          uploc, \
        conj_t          conjc, \
        pack_t          schema, \
-       dim_t           m_panel, \
-       dim_t           n_panel, \
-       dim_t           m_panel_max, \
-       dim_t           n_panel_max, \
+       bool            invdiag, \
        dim_t           panel_dim, \
-       dim_t           panel_dim_max, \
        dim_t           panel_len, \
+       dim_t           panel_dim_max, \
        dim_t           panel_len_max, \
+       dim_t           panel_dim_off, \
+       dim_t           panel_len_off, \
        ctype* restrict kappa, \
-       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
-                          inc_t incc, inc_t ldc, \
-       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
-                                      inc_t ldp, \
-       cntx_t*         cntx  \
+       ctype* restrict c, inc_t incc, inc_t ldc, \
+       ctype* restrict p,             inc_t ldp, \
+                          inc_t is_p, \
+       cntx_t*         cntx, \
+       void*           params \
      ) \
 { \
-	doff_t  diagoffc_abs; \
-	dim_t   j; \
-	bool    row_stored; \
-	bool    col_stored; \
-\
-\
-	/* Create flags to incidate row or column storage. Note that the
-	   schema bit that encodes row or column is describing the form of
-	   micro-panel, not the storage in the micro-panel. Hence the
-	   mismatch in "row" and "column" semantics. */ \
-	row_stored = bli_is_col_packed( schema ); \
-	col_stored = bli_is_row_packed( schema ); \
+	doff_t diagoffc = panel_dim_off - panel_len_off; \
+	doff_t diagoffc_abs; \
+	dim_t  j; \
 \
 	/* Handle the case where the micro-panel does NOT intersect the
 	   diagonal separately from the case where it does intersect. */ \
-	if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
+	if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \
 	{ \
 		/* If the current panel is unstored, we need to make a few
 		   adjustments so we refer to the data where it is actually
@@ -320,10 +177,10 @@ void PASTEMAC(ch,varname) \
 		   implicitly assumes we are operating on a dense panel
 		   within a larger symmetric or Hermitian matrix, since a
 		   general matrix would not contain any unstored region.) */ \
-		if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
+		if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \
 		{ \
-			c = c + diagoffc * ( doff_t )cs_c + \
-			       -diagoffc * ( doff_t )rs_c;  \
+			c = c + diagoffc * ( doff_t )ldc + \
+			       -diagoffc * ( doff_t )incc;  \
 			bli_swap_incs( &incc, &ldc ); \
 \
 			if ( bli_is_hermitian( strucc ) ) \
@@ -345,7 +202,7 @@ void PASTEMAC(ch,varname) \
 		  cntx  \
 		); \
 	} \
-	else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
+	else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \
 	{ \
 		ctype* restrict c10; \
 		ctype* restrict p10; \
@@ -366,14 +223,12 @@ void PASTEMAC(ch,varname) \
 		   a micro-panel. If they do, then somehow the constraints on
 		   cache blocksizes being a whole multiple of the register
 		   blocksizes was somehow violated. */ \
-		if ( ( col_stored && diagoffc < 0 ) || \
-		     ( row_stored && diagoffc > 0 ) ) \
+		if ( diagoffc < 0 ) \
 			bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
 \
 		diagoffc_abs = bli_abs( diagoffc ); \
 \
-		if      ( ( row_stored && bli_is_upper( uploc ) ) || \
-		          ( col_stored && bli_is_lower( uploc ) ) ) \
+		if      ( bli_is_lower( uploc ) ) \
 		{ \
 			p10_dim    = panel_dim; \
 			p10_len    = diagoffc_abs; \
@@ -389,8 +244,8 @@ void PASTEMAC(ch,varname) \
 			diagoffc12 = diagoffc_abs - j; \
 			p12        = p + (j  )*ldp; \
 			c12        = c + (j  )*ldc; \
-			c12        = c12 + diagoffc12 * ( doff_t )cs_c + \
-			                  -diagoffc12 * ( doff_t )rs_c;  \
+			c12        = c12 + diagoffc12 * ( doff_t )ldc + \
+			                  -diagoffc12 * ( doff_t )incc;  \
 			incc12     = ldc; \
 			ldc12      = incc; \
 			conjc12    = conjc; \
@@ -398,16 +253,15 @@ void PASTEMAC(ch,varname) \
 			if ( bli_is_hermitian( strucc ) ) \
 				bli_toggle_conj( &conjc12 ); \
 		} \
-		else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
-		             ( col_stored && bli_is_upper( uploc ) ) ) */ \
+		else /* if ( bli_is_upper( uploc ) ) */ \
 		{ \
 			p10_dim    = panel_dim; \
 			p10_len    = diagoffc_abs + panel_dim; \
 			diagoffc10 = diagoffc; \
 			p10        = p; \
 			c10        = c; \
-			c10        = c10 + diagoffc10 * ( doff_t )cs_c + \
-			                  -diagoffc10 * ( doff_t )rs_c;  \
+			c10        = c10 + diagoffc10 * ( doff_t )ldc + \
+			                  -diagoffc10 * ( doff_t )incc;  \
 			incc10     = ldc; \
 			ldc10      = incc; \
 			conjc10    = conjc; \
@@ -478,8 +332,8 @@ void PASTEMAC(ch,varname) \
 			  conjc, \
 			  panel_dim, \
 			  kappa, \
-			  c11, rs_c, cs_c, \
-			  p11, rs_p, cs_p, ldp  \
+			  c11, incc, ldc, \
+			  p11,    1, ldp, ldp  \
 			); \
 \
 			/* If we are packing a micro-panel with Hermitian structure,
@@ -495,8 +349,8 @@ void PASTEMAC(ch,varname) \
 			if ( bli_is_hermitian( strucc ) ) \
 			{ \
 				ctype_r* restrict c11_r = ( ctype_r* )c11; \
-				const dim_t       rs_c2 = 2*rs_c; \
-				const dim_t       cs_c2 = 2*cs_c; \
+				const dim_t       incc2 = 2*incc; \
+				const dim_t       ldc2 = 2*ldc; \
 \
 				PASTEMAC3(ch,chr,ch,scal21ms_mxn_diag) \
 				( \
@@ -504,8 +358,8 @@ void PASTEMAC(ch,varname) \
 				  panel_dim, \
 				  panel_dim, \
 				  kappa, \
-				  c11_r, rs_c2, cs_c2, \
-				  p11,   rs_p, cs_p, ldp  \
+				  c11_r, incc2, ldc2, \
+				  p11,   1,     ldp, ldp  \
 				); \
 			} \
 		} \
@@ -523,30 +377,28 @@ INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_1er, packm_cxk_1er )
 void PASTEMAC(ch,varname) \
      ( \
        struc_t         strucc, \
-       doff_t          diagoffp, \
        diag_t          diagc, \
        uplo_t          uploc, \
        conj_t          conjc, \
        pack_t          schema, \
        bool            invdiag, \
-       dim_t           m_panel, \
-       dim_t           n_panel, \
-       dim_t           m_panel_max, \
-       dim_t           n_panel_max, \
        dim_t           panel_dim, \
-       dim_t           panel_dim_max, \
        dim_t           panel_len, \
+       dim_t           panel_dim_max, \
        dim_t           panel_len_max, \
+       dim_t           panel_dim_off, \
+       dim_t           panel_len_off, \
        ctype* restrict kappa, \
-       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
-                          inc_t incc, inc_t ldc, \
-       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
-                                      inc_t ldp, \
-       cntx_t*         cntx  \
+       ctype* restrict c, inc_t incc, inc_t ldc, \
+       ctype* restrict p,             inc_t ldp, \
+                          inc_t is_p, \
+       cntx_t*         cntx, \
+       void*           params \
      ) \
 { \
-	doff_t diagoffp_abs = bli_abs( diagoffp ); \
-	ctype* p11          = p + (diagoffp_abs  )*ldp; \
+	doff_t diagoffc     = panel_dim_off - panel_len_off; \
+	doff_t diagoffc_abs = bli_abs( diagoffc ); \
+	ctype* p11          = p + (diagoffc_abs  )*ldp; \
 \
 \
 	/* Pack the panel. */ \
@@ -579,7 +431,7 @@ void PASTEMAC(ch,varname) \
 			  panel_dim, \
 			  panel_dim, \
 			  kappa, \
-			  p11, rs_p, cs_p, ldp  \
+			  p11, 1, ldp, ldp  \
 			); \
 		} \
 \
@@ -594,7 +446,7 @@ void PASTEMAC(ch,varname) \
 			  0, \
 			  panel_dim, \
 			  panel_dim, \
-			  p11, rs_p, cs_p, ldp  \
+			  p11, 1, ldp, ldp  \
 			); \
 		} \
 \
@@ -610,11 +462,11 @@ void PASTEMAC(ch,varname) \
 		{ \
 			ctype* restrict zero         = PASTEMAC(ch,0); \
 			uplo_t          uplop        = uploc; \
-			doff_t          diagoffp11_0 = 0; \
+			doff_t          diagoffc11_0 = 0; \
 			dim_t           p11_0_dim    = panel_dim - 1; \
 \
 			bli_toggle_uplo( &uplop ); \
-			bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp11_0 ); \
+			bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc11_0 ); \
 \
 			/* Note that this macro works a little differently than the setm
 			   operation. Here, we pass in the dimensions of only p11, rather
@@ -622,20 +474,51 @@ void PASTEMAC(ch,varname) \
 			   "shrunken" dimensions of p11, corresponding to the toggling
 			   and shrinking of the diagonal above. The macro will do the
 			   right thing, incrementing the pointer to p11 by the appropriate
-			   leading dimension (cs_p or rs_p), and setting only the lower
+			   leading dimension (ldp), and setting only the lower
 			   or upper triangle to zero. */ \
 			PASTEMAC(ch,set1ms_mxn_uplo) \
 			( \
 			  schema, \
-			  diagoffp11_0, \
+			  diagoffc11_0, \
 			  uplop, \
 			  p11_0_dim, \
 			  p11_0_dim, \
 			  zero, \
-			  p11, rs_p, cs_p, ldp  \
+			  p11, 1, ldp, ldp  \
 			); \
 		} \
 	} \
+\
+	/* If this micro-panel is an edge case in both panel dimension and
+	   length, then it must be a bottom-right corner case, which
+	   typically only happens for micro-panels being packed for trsm.
+	   (It also happens for trmm if kr > 1.) Here, we set the part of
+	   the diagonal that extends into the zero-padded region to
+	   identity. This prevents NaNs and Infs from creeping into the
+	   computation. If this code does execute for trmm, it is okay,
+	   because those 1.0's that extend into the bottom-right region
+	   end up getting multiplied by the 0.0's in the zero-padded region
+	   of the other matrix. */ \
+	if ( panel_dim != panel_dim_max && \
+	     panel_len != panel_len_max ) \
+	{ \
+		ctype* restrict one    = PASTEMAC(ch,1); \
+		dim_t           offm   = panel_dim; \
+		dim_t           offn   = panel_len; \
+		dim_t           m_edge = panel_dim_max - panel_dim; \
+		dim_t           n_edge = panel_len_max - panel_len; \
+\
+		PASTEMAC(ch,set1ms_mxn_diag) \
+		( \
+		  schema, \
+		  offm, \
+		  offn, \
+		  m_edge, \
+		  n_edge, \
+		  one, \
+		  p, 1, ldp, ldp  \
+		); \
+	} \
 }
 
 INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_1er, packm_cxk_1er )
diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.h b/frame/1m/packm/bli_packm_struc_cxk_1er.h
index 6e62d8f69..a953e9367 100644
--- a/frame/1m/packm/bli_packm_struc_cxk_1er.h
+++ b/frame/1m/packm/bli_packm_struc_cxk_1er.h
@@ -38,84 +38,26 @@
 void PASTEMAC(ch,varname) \
      ( \
        struc_t         strucc, \
-       doff_t          diagoffp, \
        diag_t          diagc, \
        uplo_t          uploc, \
        conj_t          conjc, \
        pack_t          schema, \
        bool            invdiag, \
-       dim_t           m_panel, \
-       dim_t           n_panel, \
-       dim_t           m_panel_max, \
-       dim_t           n_panel_max, \
-       ctype* restrict kappa, \
-       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
-       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
-                          inc_t is_p, \
-       cntx_t*         cntx  \
-     );
-
-INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er )
-
-
-
-#undef  GENTPROTCO
-#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       struc_t         strucc, \
-       doff_t          diagoffc, \
-       uplo_t          uploc, \
-       conj_t          conjc, \
-       pack_t          schema, \
-       dim_t           m_panel, \
-       dim_t           n_panel, \
-       dim_t           m_panel_max, \
-       dim_t           n_panel_max, \
        dim_t           panel_dim, \
-       dim_t           panel_dim_max, \
        dim_t           panel_len, \
-       dim_t           panel_len_max, \
-       ctype* restrict kappa, \
-       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
-                          inc_t incc, inc_t ldc, \
-       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
-                                      inc_t ldp, \
-       cntx_t*         cntx  \
-     );
-
-INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er )
-
-
-
-#undef  GENTPROTCO
-#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       struc_t         strucc, \
-       doff_t          diagoffc, \
-       diag_t          diagc, \
-       uplo_t          uploc, \
-       conj_t          conjc, \
-       pack_t          schema, \
-       bool            invdiag, \
-       dim_t           m_panel, \
-       dim_t           n_panel, \
-       dim_t           m_panel_max, \
-       dim_t           n_panel_max, \
-       dim_t           panel_dim, \
        dim_t           panel_dim_max, \
-       dim_t           panel_len, \
        dim_t           panel_len_max, \
+       dim_t           panel_dim_off, \
+       dim_t           panel_len_off, \
        ctype* restrict kappa, \
-       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
-                          inc_t incc, inc_t ldc, \
-       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
-                                      inc_t ldp, \
-       cntx_t*         cntx  \
+       ctype* restrict c, inc_t incc, inc_t ldc, \
+       ctype* restrict p,             inc_t ldp, \
+                          inc_t is_p, \
+       cntx_t*         cntx, \
+       void*           params \
      );
 
+INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er )
+INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er )
 INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er )
 
diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.c b/frame/1m/packm/bli_packm_struc_cxk_md.c
index 52a1f9817..650b6178c 100644
--- a/frame/1m/packm/bli_packm_struc_cxk_md.c
+++ b/frame/1m/packm/bli_packm_struc_cxk_md.c
@@ -41,53 +41,26 @@
 \
 void PASTEMAC2(chc,chp,varname) \
      ( \
+       struc_t           strucc, \
+       diag_t            diagc, \
+       uplo_t            uploc, \
        conj_t            conjc, \
        pack_t            schema, \
-       dim_t             m_panel, \
-       dim_t             n_panel, \
-       dim_t             m_panel_max, \
-       dim_t             n_panel_max, \
+       bool              invdiag, \
+       dim_t             panel_dim, \
+       dim_t             panel_len, \
+       dim_t             panel_dim_max, \
+       dim_t             panel_len_max, \
+       dim_t             panel_dim_off, \
+       dim_t             panel_len_off, \
        ctype_p* restrict kappa, \
-       ctype_c* restrict c, inc_t rs_c, inc_t cs_c, \
-       ctype_p* restrict p, inc_t rs_p, inc_t cs_p, \
+       ctype_c* restrict c, inc_t incc, inc_t ldc, \
+       ctype_p* restrict p,             inc_t ldp, \
                             inc_t is_p, \
-       cntx_t*           cntx  \
+       cntx_t*           cntx, \
+       void*             params \
      ) \
 { \
-	dim_t  panel_dim; \
-	dim_t  panel_dim_max; \
-	dim_t  panel_len; \
-	dim_t  panel_len_max; \
-	inc_t  incc, ldc; \
-	inc_t        ldp; \
-\
-\
-	/* Determine the dimensions and relative strides of the micro-panel
-	   based on its pack schema. */ \
-	if ( bli_is_col_packed( schema ) ) \
-	{ \
-		/* Prepare to pack to row-stored column panel. */ \
-		panel_dim     = n_panel; \
-		panel_dim_max = n_panel_max; \
-		panel_len     = m_panel; \
-		panel_len_max = m_panel_max; \
-		incc          = cs_c; \
-		ldc           = rs_c; \
-		ldp           = rs_p; \
-	} \
-	else /* if ( bli_is_row_packed( schema ) ) */ \
-	{ \
-		/* Prepare to pack to column-stored row panel. */ \
-		panel_dim     = m_panel; \
-		panel_dim_max = m_panel_max; \
-		panel_len     = n_panel; \
-		panel_len_max = n_panel_max; \
-		incc          = rs_c; \
-		ldc           = cs_c; \
-		ldp           = cs_p; \
-	} \
-\
-\
 	if ( bli_is_nat_packed( schema ) ) \
 	{ \
 		/* Sanity check: Make sure that kappa is 1.0. Mixed-datatype alpha
@@ -318,7 +291,7 @@ void PASTEMAC2(cha,chp,opname) \
        conj_t            conja, \
        dim_t             m, \
        dim_t             n, \
-	   ctype_p* restrict kappa, \
+       ctype_p* restrict kappa, \
        ctype_a* restrict a, inc_t inca, inc_t lda, \
        ctype_p* restrict p,             inc_t ldp  \
      ) \
@@ -445,7 +418,7 @@ void PASTEMAC2(cha,chp,opname) \
        conj_t            conja, \
        dim_t             m, \
        dim_t             n, \
-	   ctype_p* restrict kappa, \
+       ctype_p* restrict kappa, \
        ctype_a* restrict a, inc_t inca, inc_t lda, \
        ctype_p* restrict p,             inc_t ldp  \
      ) \
diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.h b/frame/1m/packm/bli_packm_struc_cxk_md.h
index 72ca67937..f493838b3 100644
--- a/frame/1m/packm/bli_packm_struc_cxk_md.h
+++ b/frame/1m/packm/bli_packm_struc_cxk_md.h
@@ -37,17 +37,24 @@
 \
 void PASTEMAC2(chc,chp,varname) \
      ( \
+       struc_t           strucc, \
+       diag_t            diagc, \
+       uplo_t            uploc, \
        conj_t            conjc, \
        pack_t            schema, \
-       dim_t             m_panel, \
-       dim_t             n_panel, \
-       dim_t             m_panel_max, \
-       dim_t             n_panel_max, \
+       bool              invdiag, \
+       dim_t             panel_dim, \
+       dim_t             panel_len, \
+       dim_t             panel_dim_max, \
+       dim_t             panel_len_max, \
+       dim_t             panel_dim_off, \
+       dim_t             panel_len_off, \
        ctype_p* restrict kappa, \
-       ctype_c* restrict c, inc_t rs_c, inc_t cs_c, \
-       ctype_p* restrict p, inc_t rs_p, inc_t cs_p, \
+       ctype_c* restrict c, inc_t incc, inc_t ldc, \
+       ctype_p* restrict p,             inc_t ldp, \
                             inc_t is_p, \
-       cntx_t*           cntx  \
+       cntx_t*           cntx, \
+       void*             params \
      );
 
 INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
diff --git a/frame/1m/packm/bli_packm_unb_var1.c b/frame/1m/packm/bli_packm_unb_var1.c
deleted file mode 100644
index 6e72b3e9d..000000000
--- a/frame/1m/packm/bli_packm_unb_var1.c
+++ /dev/null
@@ -1,297 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T packm_fp
-
-typedef void (*FUNCPTR_T)(
-                           struc_t strucc,
-                           doff_t  diagoffc,
-                           diag_t  diagc,
-                           uplo_t  uploc,
-                           trans_t transc,
-                           dim_t   m,
-                           dim_t   n,
-                           dim_t   m_max,
-                           dim_t   n_max,
-                           void*   kappa,
-                           void*   c, inc_t rs_c, inc_t cs_c,
-                           void*   p, inc_t rs_p, inc_t cs_p,
-                           cntx_t* cntx
-                         );
-
-static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1);
-
-
-void bli_packm_unb_var1
-     (
-       obj_t*  c,
-       obj_t*  p,
-       cntx_t* cntx,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	num_t     dt_cp     = bli_obj_dt( c );
-
-	struc_t   strucc    = bli_obj_struc( c );
-	doff_t    diagoffc  = bli_obj_diag_offset( c );
-	diag_t    diagc     = bli_obj_diag( c );
-	uplo_t    uploc     = bli_obj_uplo( c );
-	trans_t   transc    = bli_obj_conjtrans_status( c );
-
-	dim_t     m_p       = bli_obj_length( p );
-	dim_t     n_p       = bli_obj_width( p );
-	dim_t     m_max_p   = bli_obj_padded_length( p );
-	dim_t     n_max_p   = bli_obj_padded_width( p );
-
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	void*     buf_p     = bli_obj_buffer_at_off( p );
-	inc_t     rs_p      = bli_obj_row_stride( p );
-	inc_t     cs_p      = bli_obj_col_stride( p );
-
-	void*     buf_kappa;
-
-	FUNCPTR_T f;
-
-
-	// This variant assumes that the computational kernel will always apply
-	// the alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
-	// for kappa so that the underlying packm implementation does not scale
-	// during packing.
-	buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_cp];
-
-    if( bli_thread_am_ochief( thread ) ) {
-        // Invoke the function.
-        f
-		(
-		  strucc,
-          diagoffc,
-          diagc,
-          uploc,
-          transc,
-          m_p,
-          n_p,
-          m_max_p,
-          n_max_p,
-          buf_kappa,
-          buf_c, rs_c, cs_c,
-          buf_p, rs_p, cs_p,
-		  cntx
-		);
-    }
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       struc_t strucc, \
-       doff_t  diagoffc, \
-       diag_t  diagc, \
-       uplo_t  uploc, \
-       trans_t transc, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   m_max, \
-       dim_t   n_max, \
-       void*   kappa, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       void*   p, inc_t rs_p, inc_t cs_p, \
-       cntx_t* cntx  \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict p_cast     = p; \
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-\
-	/* We begin by packing the region indicated by the parameters. If
-	   matrix c is dense (either because the structure is general or
-	   because the structure has already been "densified"), this ends
-	   up being the only action we take. Note that if kappa is unit,
-	   the data is simply copied (rather than scaled by one). */ \
-	PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
-	( \
-	  diagoffc, \
-	  diagc, \
-	  uploc, \
-	  transc, \
-	  m, \
-	  n, \
-	  kappa_cast, \
-	  c_cast, rs_c, cs_c, \
-	  p_cast, rs_p, cs_p, \
-	  cntx, \
-	  NULL  \
-	); \
-\
-	/* If uploc is upper or lower, then the structure of c is necessarily
-	   non-dense (ie: Hermitian, symmetric, or triangular, where part of the
-	   matrix is unstored). In these cases, we want to fill in the unstored
-	   part of the matrix. How this is done depends on the structure of c. */ \
-	if ( bli_is_upper_or_lower( uploc ) ) \
-	{ \
-		/* The Hermitian and symmetric cases are almost identical, so we
-		   handle them in one conditional block. */ \
-		if ( bli_is_hermitian( strucc ) || bli_is_symmetric( strucc ) ) \
-		{ \
-			/* First we must reflect the region referenced to the opposite
-			   side of the diagonal. */ \
-			c_cast = c_cast + diagoffc * ( doff_t )cs_c + \
-			                 -diagoffc * ( doff_t )rs_c; \
-			bli_negate_diag_offset( &diagoffc ); \
-			bli_toggle_trans( &transc ); \
-			if      ( bli_is_upper( uploc ) ) diagoffc += 1; \
-			else if ( bli_is_lower( uploc ) ) diagoffc -= 1; \
-\
-			/* If c is Hermitian, we need to apply a conjugation when
-			   copying the region opposite the diagonal. */ \
-			if ( bli_is_hermitian( strucc ) ) \
-				transc = bli_trans_toggled_conj( transc ); \
-\
-			/* Copy the data from the region opposite the diagonal of c
-			   (as specified by the original value of diagoffc). Notice
-			   that we use a diag parameter of non-unit since we can
-			   assume nothing about the neighboring off-diagonal. */ \
-			PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
-			( \
-			  diagoffc, \
-			  BLIS_NONUNIT_DIAG, \
-			  uploc, \
-			  transc, \
-			  m, \
-			  n, \
-			  kappa_cast, \
-			  c_cast, rs_c, cs_c, \
-			  p_cast, rs_p, cs_p, \
-			  cntx, \
-			  NULL  \
-			); \
-		} \
-		else /* if ( bli_is_triangular( strucc ) ) */ \
-		{ \
-			doff_t diagoffp = diagoffc; \
-			uplo_t uplop    = uploc; \
-\
-			/* For this step we need the uplo and diagonal offset of p, which
-			   we can derive from the parameters given. */ \
-			if ( bli_does_trans( transc ) ) \
-			{ \
-				bli_negate_diag_offset( &diagoffp ); \
-				bli_toggle_uplo( &uplop ); \
-			} \
-\
-			/* For triangular matrices, we wish to reference the region
-			   strictly opposite the diagonal of C. This amounts to 
-			   toggling uploc and then shifting the diagonal offset to
-			   shrink the stored region (by one diagonal). */ \
-			bli_toggle_uplo( &uplop ); \
-			bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \
-\
-			/* Set the region opposite the diagonal of p to zero. */ \
-			PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
-			( \
-			  BLIS_NO_CONJUGATE, \
-			  diagoffp, \
-			  BLIS_NONUNIT_DIAG, \
-			  uplop, \
-			  m, \
-			  n, \
-			  zero, \
-			  p_cast, rs_p, cs_p, \
-			  cntx, \
-			  NULL  \
-			); \
-		} \
-	} \
-\
-	/* The packed memory region was acquired/allocated with "aligned"
-	   dimensions (ie: dimensions that were possibly inflated up to a
-	   multiple). When these dimension are inflated, it creates empty
-	   regions along the bottom and/or right edges of the matrix. If
-	   eithe region exists, we set them to zero. This simplifies the
-	   register level micro kernel in that it does not need to support
-	   different register blockings for the edge cases. */ \
-	if ( m != m_max ) \
-	{ \
-		ctype* p_edge = p_cast + (m  )*rs_p; \
-\
-		PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
-		( \
-		  BLIS_NO_CONJUGATE, \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  m_max - m, \
-		  n_max, \
-		  zero, \
-		  p_edge, rs_p, cs_p, \
-		  cntx, \
-		  NULL  \
-		); \
-	} \
-\
-	if ( n != n_max ) \
-	{ \
-		ctype* p_edge = p_cast + (n  )*cs_p; \
-\
-		PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
-		( \
-		  BLIS_NO_CONJUGATE, \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  m_max, \
-		  n_max - n, \
-		  zero, \
-		  p_edge, rs_p, cs_p, \
-		  cntx, \
-		  NULL  \
-		); \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( packm_unb_var1 )
-
diff --git a/frame/1m/packm/bli_packm_unb_var1.h b/frame/1m/packm/bli_packm_unb_var1.h
deleted file mode 100644
index 8960c8661..000000000
--- a/frame/1m/packm/bli_packm_unb_var1.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_packm_unb_var1
-     (
-       obj_t*  c,
-       obj_t*  p,
-       cntx_t* cntx,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     );
-
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       struc_t strucc, \
-       doff_t  diagoffc, \
-       diag_t  diagc, \
-       uplo_t  uploc, \
-       trans_t transc, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   m_max, \
-       dim_t   n_max, \
-       void*   kappa, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       void*   p, inc_t rs_p, inc_t cs_p, \
-       cntx_t* cntx  \
-     );
-
-INSERT_GENTPROT_BASIC0( packm_unb_var1 )
-
diff --git a/frame/1m/unpackm/bli_unpackm.h b/frame/1m/unpackm/bli_unpackm.h
index b32d02d9b..5e4542841 100644
--- a/frame/1m/unpackm/bli_unpackm.h
+++ b/frame/1m/unpackm/bli_unpackm.h
@@ -36,8 +36,6 @@
 #include "bli_unpackm_check.h"
 #include "bli_unpackm_int.h"
 
-#include "bli_unpackm_unb_var1.h"
-
 #include "bli_unpackm_blk_var1.h"
 
 #include "bli_unpackm_cxk.h"
diff --git a/frame/1m/unpackm/bli_unpackm_unb_var1.c b/frame/1m/unpackm/bli_unpackm_unb_var1.c
deleted file mode 100644
index c1033c2cb..000000000
--- a/frame/1m/unpackm/bli_unpackm_unb_var1.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T unpackm_fp
-
-typedef void (*FUNCPTR_T)(
-                           doff_t  diagoffp,
-                           uplo_t  uplop,
-                           trans_t transp,
-                           dim_t   m,
-                           dim_t   n,
-                           void*   p, inc_t rs_p, inc_t cs_p,
-                           void*   c, inc_t rs_c, inc_t cs_c,
-                           cntx_t* cntx
-                         );
-
-static FUNCPTR_T GENARRAY(ftypes,unpackm_unb_var1);
-
-
-void bli_unpackm_unb_var1
-     (
-       obj_t*  p,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	num_t     dt_pc     = bli_obj_dt( p );
-
-	doff_t    diagoffp  = bli_obj_diag_offset( p );
-	uplo_t    uplop     = bli_obj_uplo( p );
-	trans_t   transc    = bli_obj_onlytrans_status( c );
-
-	dim_t     m_c       = bli_obj_length( c );
-	dim_t     n_c       = bli_obj_width( c );
-
-	void*     buf_p     = bli_obj_buffer_at_off( p );
-	inc_t     rs_p      = bli_obj_row_stride( p );
-	inc_t     cs_p      = bli_obj_col_stride( p );
-
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	FUNCPTR_T f;
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_pc];
-
-	// Invoke the function.
-	f( diagoffp,
-	   uplop,
-	   transc,
-	   m_c,
-	   n_c,
-	   buf_p, rs_p, cs_p,
-	   buf_c, rs_c, cs_c,
-	   cntx
-	);
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, varname ) \
-\
-void PASTEMAC(ch,varname)( \
-                           doff_t  diagoffp, \
-                           uplo_t  uplop, \
-                           trans_t transp, \
-                           dim_t   m, \
-                           dim_t   n, \
-                           void*   p, inc_t rs_p, inc_t cs_p, \
-                           void*   c, inc_t rs_c, inc_t cs_c, \
-                           cntx_t* cntx  \
-                         ) \
-{ \
-	ctype* p_cast = p; \
-	ctype* c_cast = c; \
-\
-	PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
-	( \
-	  diagoffp,\
-	  BLIS_NONUNIT_DIAG, \
-	  uplop, \
-	  transp, \
-	  m, \
-	  n, \
-	  p_cast, rs_p, cs_p, \
-	  c_cast, rs_c, cs_c, \
-	  cntx, \
-	  NULL  \
-	); \
-}
-
-INSERT_GENTFUNC_BASIC( unpackm, unpackm_unb_var1 )
-
diff --git a/frame/1m/unpackm/bli_unpackm_unb_var1.h b/frame/1m/unpackm/bli_unpackm_unb_var1.h
deleted file mode 100644
index 5119aaa7f..000000000
--- a/frame/1m/unpackm/bli_unpackm_unb_var1.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_unpackm_unb_var1
-     (
-       obj_t*  p,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     );
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffp, \
-       uplo_t  uplop, \
-       trans_t transp, \
-       dim_t   m, \
-       dim_t   n, \
-       void*   p, inc_t rs_p, inc_t cs_p, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx  \
-     );
-
-INSERT_GENTPROT_BASIC0( unpackm_unb_var1 )
-
diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h
index da9348844..4dc1a9d54 100644
--- a/frame/3/bli_l3.h
+++ b/frame/3/bli_l3.h
@@ -35,6 +35,8 @@
 
 #include "bli_l3_cntl.h"
 #include "bli_l3_check.h"
+#include "bli_l3_int.h"
+#include "bli_l3_packab.h"
 
 // Define function types.
 //#include "bli_l3_ft_ex.h"
@@ -45,7 +47,6 @@
 #include "bli_l3_blocksize.h"
 #include "bli_l3_direct.h"
 #include "bli_l3_prune.h"
-#include "bli_l3_packm.h"
 #include "bli_l3_schema.h"
 
 // Prototype object APIs (basic and expert).
diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c
index 50da4627c..3e7882bc3 100644
--- a/frame/3/bli_l3_check.c
+++ b/frame/3/bli_l3_check.c
@@ -53,7 +53,7 @@ void bli_gemm_check
 	// Check object structure.
 
 	// NOTE: Can't perform these checks as long as bli_gemm_check() is called
-	// from bli_gemm_int(), which is in the execution path for structured
+	// from bli_l3_int(), which is in the execution path for structured
 	// level-3 operations such as hemm.
 
 	//e_val = bli_check_general_object( a );
@@ -109,7 +109,7 @@ void bli_hemm_check
 }
 
 void bli_herk_check
-     ( 
+     (
        obj_t*  alpha,
        obj_t*  a,
        obj_t*  beta,
@@ -197,7 +197,7 @@ void bli_symm_check
 }
 
 void bli_syrk_check
-     ( 
+     (
        obj_t*  alpha,
        obj_t*  a,
        obj_t*  beta,
diff --git a/frame/3/trsm/bli_trsm_int.c b/frame/3/bli_l3_int.c
similarity index 74%
rename from frame/3/trsm/bli_trsm_int.c
rename to frame/3/bli_l3_int.c
index 53a22c355..d4b974030 100644
--- a/frame/3/trsm/bli_trsm_int.c
+++ b/frame/3/bli_l3_int.c
@@ -34,7 +34,7 @@
 
 #include "blis.h"
 
-void bli_trsm_int
+void bli_l3_int
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -47,10 +47,9 @@ void bli_trsm_int
        thrinfo_t* thread
      )
 {
-	obj_t        a_local;
-	obj_t        b_local;
-	obj_t        c_local;
-	trsm_var_oft f;
+	obj_t a_local;
+	obj_t b_local;
+	obj_t c_local;
 
 	// Return early if the current control tree node is NULL.
 	if ( bli_cntl_is_null( cntl ) ) return;
@@ -60,72 +59,82 @@ void bli_trsm_int
 		bli_gemm_basic_check( alpha, a, b, beta, c, cntx );
 
 	// If C has a zero dimension, return early.
-	if ( bli_obj_has_zero_dim( c ) ) return;
+	if ( bli_obj_has_zero_dim( c ) )
+	{
+		return;
+	}
 
 	// If A or B has a zero dimension, scale C by beta and return early.
 	if ( bli_obj_has_zero_dim( a ) ||
 	     bli_obj_has_zero_dim( b ) )
 	{
 		if ( bli_thread_am_ochief( thread ) )
-		    bli_scalm( beta, c );
+			bli_scalm( beta, c );
 		bli_thread_barrier( thread );
 		return;
 	}
 
-	// Alias A and B in case we need to update attached scalars.
+	// If A or B is marked as being filled with zeros, scale C by beta and
+	// return early.
+	if ( bli_obj_is_zeros( a ) ||
+	     bli_obj_is_zeros( b ) )
+	{
+		// This should never execute.
+		bli_abort();
+
+		if ( bli_thread_am_ochief( thread ) )
+			bli_scalm( beta, c );
+		bli_thread_barrier( thread );
+		return;
+	}
+
+	// Alias A, B, and C in case we need to update attached scalars.
 	bli_obj_alias_to( a, &a_local );
 	bli_obj_alias_to( b, &b_local );
-
-	// Alias C in case we need to induce a transposition.
 	bli_obj_alias_to( c, &c_local );
 
+	// Ensure that a valid packing function is set on A and B.
+	if ( !bli_obj_pack_fn( &a_local ) )
+		bli_obj_set_pack_fn( bli_packm_blk_var1, &a_local );
+
+	if ( !bli_obj_pack_fn( &b_local ) )
+		bli_obj_set_pack_fn( bli_packm_blk_var1, &b_local );
+
 	// If we are about to call a leaf-level implementation, and matrix C
 	// still needs a transposition, then we must induce one by swapping the
 	// strides and dimensions. Note that this transposition would normally
 	// be handled explicitly in the packing of C, but if C is not being
 	// packed, this is our last chance to handle the transposition.
-	if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) )
+	//if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) )
+	if ( bli_obj_has_trans( c ) )
 	{
 		bli_obj_induce_trans( &c_local );
 		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local );
 	}
 
-	// If beta is non-unit, apply it to the scalar attached to C.
-	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
+	// If alpha is non-unit, typecast and apply it to the scalar attached
+	// to B, unless B's root object is triangular, in which case apply it to A.
+	if ( bli_obj_root_is_triangular( b ) )
 	{
-		bli_obj_scalar_apply_scalar( beta, &c_local );
-	}
-
-	// Set two bools: one based on the implied side parameter (the structure
-	// of the root object) and one based on the uplo field of the triangular
-	// matrix's root object (whether that is matrix A or matrix B).
-	if ( bli_obj_root_is_triangular( a ) )
-	{
-		// If alpha is non-unit, typecast and apply it to the scalar
-		// attached to B (the non-triangular matrix).
 		if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
-		{
-			bli_obj_scalar_apply_scalar( alpha, &b_local );
-		}
+			bli_obj_scalar_apply_scalar( alpha, &a_local );
 	}
 	else // if ( bli_obj_root_is_triangular( b ) )
 	{
-		// If alpha is non-unit, typecast and apply it to the scalar
-		// attached to A (the non-triangular matrix).
 		if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
-		{
-            bli_obj_scalar_apply_scalar( alpha, &a_local );
-		}
+			bli_obj_scalar_apply_scalar( alpha, &b_local );
 	}
 
-	// FGVZ->TMS: Is this barrier still needed?
-	bli_thread_barrier( thread );
+	// If beta is non-unit, typecast and apply it to the scalar attached
+	// to C.
+	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
+		bli_obj_scalar_apply_scalar( beta, &c_local );
 
 	// Create the next node in the thrinfo_t structure.
 	bli_thrinfo_grow( rntm, cntl, thread );
 
 	// Extract the function pointer from the current control tree node.
-	f = bli_cntl_var_func( cntl );
+	l3_var_oft f = bli_cntl_var_func( cntl );
 
 	// Invoke the variant.
 	f
diff --git a/frame/3/gemm/bli_gemm_int.h b/frame/3/bli_l3_int.h
similarity index 99%
rename from frame/3/gemm/bli_gemm_int.h
rename to frame/3/bli_l3_int.h
index 2bbe5480a..d76b0ac3e 100644
--- a/frame/3/gemm/bli_gemm_int.h
+++ b/frame/3/bli_l3_int.h
@@ -32,7 +32,7 @@
 
 */
 
-void bli_gemm_int
+void bli_l3_int
      (
        obj_t*  alpha,
        obj_t*  a,
diff --git a/frame/3/bli_l3_oft_var.h b/frame/3/bli_l3_oft_var.h
index 1456f8eff..ea10d8090 100644
--- a/frame/3/bli_l3_oft_var.h
+++ b/frame/3/bli_l3_oft_var.h
@@ -54,24 +54,7 @@ typedef void (*PASTECH(opname,_var_oft)) \
   thrinfo_t* thread  \
 );
 
-GENTDEF( gemm )
-
-
-#undef  GENTDEF
-#define GENTDEF( opname ) \
-\
-typedef void (*PASTECH(opname,_var_oft)) \
-( \
-  obj_t*  a, \
-  obj_t*  b, \
-  obj_t*  c, \
-  cntx_t* cntx, \
-  rntm_t* rntm, \
-  cntl_t* cntl, \
-  thrinfo_t* thread  \
-);
-
-GENTDEF( trsm )
+GENTDEF( l3 )
 
 
diff --git a/frame/3/gemm/bli_gemm_packab.c b/frame/3/bli_l3_packab.c
similarity index 80%
rename from frame/3/gemm/bli_gemm_packab.c
rename to frame/3/bli_l3_packab.c
index a15192994..d91181942 100644
--- a/frame/3/gemm/bli_gemm_packab.c
+++ b/frame/3/bli_l3_packab.c
@@ -34,7 +34,7 @@
 
 #include "blis.h"
 
-void bli_gemm_packa
+void bli_l3_packa
      (
        obj_t*  a,
        obj_t*  b,
@@ -45,12 +45,19 @@ void bli_gemm_packa
        thrinfo_t* thread
      )
 {
-	obj_t a_pack;
+	obj_t a_local, a_pack;
+
+	bli_obj_alias_to( a, &a_local );
+	if ( bli_obj_has_trans( a ) )
+	{
+		bli_obj_induce_trans( &a_local );
+		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
+	}
 
 	// Pack matrix A according to the control tree node.
-	bli_l3_packm
+	bli_packm_int
 	(
-	  a,
+	  &a_local,
 	  &a_pack,
 	  cntx,
 	  rntm,
@@ -59,7 +66,7 @@ void bli_gemm_packa
 	);
 
 	// Proceed with execution using packed matrix A.
-	bli_gemm_int
+	bli_l3_int
 	(
 	  &BLIS_ONE,
 	  &a_pack,
@@ -75,7 +82,7 @@ void bli_gemm_packa
 
 // -----------------------------------------------------------------------------
 
-void bli_gemm_packb
+void bli_l3_packb
      (
        obj_t*  a,
        obj_t*  b,
@@ -86,25 +93,39 @@ void bli_gemm_packb
        thrinfo_t* thread
      )
 {
-	obj_t b_pack;
+	obj_t bt_local, bt_pack;
+
+	// We always pass B^T to bli_packm_int.
+	bli_obj_alias_to( b, &bt_local );
+	if ( bli_obj_has_trans( b ) )
+	{
+		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &bt_local );
+	}
+	else
+	{
+		bli_obj_induce_trans( &bt_local );
+	}
 
 	// Pack matrix B according to the control tree node.
-	bli_l3_packm
+	bli_packm_int
 	(
-	  b,
-	  &b_pack,
+	  &bt_local,
+	  &bt_pack,
 	  cntx,
 	  rntm,
 	  cntl,
 	  thread
 	);
 
+	// Transpose packed object back to B.
+	bli_obj_induce_trans( &bt_pack );
+
 	// Proceed with execution using packed matrix B.
-	bli_gemm_int
+	bli_l3_int
 	(
 	  &BLIS_ONE,
 	  a,
-	  &b_pack,
+	  &bt_pack,
 	  &BLIS_ONE,
 	  c,
 	  cntx,
diff --git a/frame/3/trsm/bli_trsm_int.h b/frame/3/bli_l3_packab.h
similarity index 90%
rename from frame/3/trsm/bli_trsm_int.h
rename to frame/3/bli_l3_packab.h
index aabb2a8aa..380ca7212 100644
--- a/frame/3/trsm/bli_trsm_int.h
+++ b/frame/3/bli_l3_packab.h
@@ -32,12 +32,21 @@
 
 */
 
-void bli_trsm_int
+void bli_l3_packa
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     );
+
+void bli_l3_packb
      (
-       obj_t*  alpha,
        obj_t*  a,
        obj_t*  b,
-       obj_t*  beta,
        obj_t*  c,
        cntx_t* cntx,
        rntm_t* rntm,
diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c
deleted file mode 100644
index 48f55c360..000000000
--- a/frame/3/bli_l3_packm.c
+++ /dev/null
@@ -1,187 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_l3_packm
-     (
-       obj_t*  x,
-       obj_t*  x_pack,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	packbuf_t pack_buf_type;
-	mem_t*    cntl_mem_p;
-	siz_t     size_needed;
-
-	// FGVZ: Not sure why we need this barrier, but we do.
-	bli_thread_barrier( thread );
-
-	// Every thread initializes x_pack and determines the size of memory
-	// block needed (which gets embedded into the otherwise "blank" mem_t
-	// entry in the control tree node).
-	size_needed
-	=
-	bli_packm_init
-	(
-	  x,
-	  x_pack,
-	  cntx,
-	  cntl
-	);
-
-	// If zero was returned, no memory needs to be allocated and so we can
-	// return early.
-	if ( size_needed == 0 ) return;
-
-	// Query the pack buffer type from the control tree node.
-	pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl );
-
-	// Query the address of the mem_t entry within the control tree node.
-	cntl_mem_p = bli_cntl_pack_mem( cntl );
-
-	// Check the mem_t field in the control tree. If it is unallocated, then
-	// we need to acquire a block from the memory broker and broadcast it to
-	// all threads in the chief's thread group.
-	if ( bli_mem_is_unalloc( cntl_mem_p ) )
-	{
-		mem_t* local_mem_p;
-		mem_t  local_mem_s;
-
-		if ( bli_thread_am_ochief( thread ) )
-		{
-			#ifdef BLIS_ENABLE_MEM_TRACING
-			printf( "bli_l3_packm(): acquiring mem pool block\n" );
-			#endif
-
-			// The chief thread acquires a block from the memory broker
-			// and saves the associated mem_t entry to local_mem_s.
-			bli_pba_acquire_m
-			(
-			  rntm,
-			  size_needed,
-			  pack_buf_type,
-			  &local_mem_s
-			);
-		}
-
-		// Broadcast the address of the chief thread's local mem_t entry to
-		// all threads.
-		local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
-
-		// Save the contents of the chief thread's local mem_t entry to the
-		// mem_t field in this thread's control tree node.
-		*cntl_mem_p = *local_mem_p;
-	}
-	else // ( bli_mem_is_alloc( cntl_mem_p ) )
-	{
-		mem_t* local_mem_p;
-		mem_t  local_mem_s;
-
-		// If the mem_t entry in the control tree does NOT contain a NULL
-		// buffer, then a block has already been acquired from the memory
-		// broker and cached in the control tree.
-
-		// As a sanity check, we should make sure that the mem_t object isn't
-		// associated with a block that is too small compared to the size of
-		// the packed matrix buffer that is needed, according to the return
-		// value from packm_init().
-		siz_t cntl_mem_size = bli_mem_size( cntl_mem_p );
-
-		if ( cntl_mem_size < size_needed )
-		{
-			if ( bli_thread_am_ochief( thread ) )
-			{
-				// The chief thread releases the existing block associated with
-				// the mem_t entry in the control tree, and then re-acquires a
-				// new block, saving the associated mem_t entry to local_mem_s.
-				bli_pba_release
-				(
-				  rntm,
-				  cntl_mem_p
-				);
-				bli_pba_acquire_m
-				(
-				  rntm,
-				  size_needed,
-				  pack_buf_type,
-				  &local_mem_s
-				);
-			}
-
-			// Broadcast the address of the chief thread's local mem_t entry to
-			// all threads.
-			local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
-
-			// Save the chief thread's local mem_t entry to the mem_t field in
-			// this thread's control tree node.
-			*cntl_mem_p = *local_mem_p;
-		}
-		else
-		{
-			// If the mem_t entry is already allocated and sufficiently large,
-			// then we use it as-is. No action is needed, because all threads
-			// will already have the cached values in their local control
-			// trees' mem_t entries, currently pointed to by cntl_mem_p.
-
-			bli_thread_barrier( thread );
-		}
-	}
-
-
-	// Update the buffer address in x_pack to point to the buffer associated
-	// with the mem_t entry acquired from the memory broker (now cached in
-	// the control tree node).
-	void* buf = bli_mem_buffer( cntl_mem_p );
-    bli_obj_set_buffer( buf, x_pack );
-
-
-	// Pack the contents of object x to object x_pack.
-	bli_packm_int
-	(
-	  x,
-	  x_pack,
-	  cntx,
-	  cntl,
-	  thread
-	);
-
-	// Barrier so that packing is done before computation.
-	bli_thread_barrier( thread );
-}
-
diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h
index a6f8b4e1e..ddd88e163 100644
--- a/frame/3/gemm/bli_gemm.h
+++ b/frame/3/gemm/bli_gemm.h
@@ -34,7 +34,6 @@
 
 #include "bli_gemm_cntl.h"
 #include "bli_gemm_front.h"
-#include "bli_gemm_int.h"
 
 #include "bli_gemm_var.h"
 
diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c
index 3b7634338..de077e5ad 100644
--- a/frame/3/gemm/bli_gemm_blk_var1.c
+++ b/frame/3/gemm/bli_gemm_blk_var1.c
@@ -77,7 +77,7 @@ void bli_gemm_blk_var1
 		                        i, b_alg, c, &c1 );
 
 		// Perform gemm subproblem.
-		bli_gemm_int
+		bli_l3_int
 		(
 		  &BLIS_ONE,
 		  &a1,
diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c
index d89a71053..53943e47c 100644
--- a/frame/3/gemm/bli_gemm_blk_var2.c
+++ b/frame/3/gemm/bli_gemm_blk_var2.c
@@ -77,7 +77,7 @@ void bli_gemm_blk_var2
 		                        i, b_alg, c, &c1 );
 
 		// Perform gemm subproblem.
-		bli_gemm_int
+		bli_l3_int
 		(
 		  &BLIS_ONE,
 		  a,
diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c
index 7883dfd6d..28029777d 100644
--- a/frame/3/gemm/bli_gemm_blk_var3.c
+++ b/frame/3/gemm/bli_gemm_blk_var3.c
@@ -71,7 +71,7 @@ void bli_gemm_blk_var3
 		                        i, b_alg, b, &b1 );
 
 		// Perform gemm subproblem.
-		bli_gemm_int
+		bli_l3_int
 		(
 		  &BLIS_ONE,
 		  &a1,
diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c
index 27678e0bf..72d78efe1 100644
--- a/frame/3/gemm/bli_gemm_cntl.c
+++ b/frame/3/gemm/bli_gemm_cntl.c
@@ -57,8 +57,6 @@ cntl_t* bli_gemmbp_cntl_create
      )
 {
 	void_fp macro_kernel_fp;
-	void_fp packa_fp;
-	void_fp packb_fp;
 
 	// Use the function pointers to the macrokernels that use slab
 	// assignment of micropanels to threads in the jr and ir loops.
@@ -67,9 +65,6 @@ cntl_t* bli_gemmbp_cntl_create
 	else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2;
 	else /* should never execute */ macro_kernel_fp = NULL;
 
-	packa_fp = bli_packm_blk_var1;
-	packb_fp = bli_packm_blk_var1;
-
 	// Create two nodes for the macro-kernel.
 	cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node
 	(
@@ -93,8 +88,7 @@ cntl_t* bli_gemmbp_cntl_create
 	cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
 	(
 	  rntm,
-	  bli_gemm_packa,  // pack the left-hand operand
-	  packa_fp,
+	  bli_l3_packa,  // pack the left-hand operand
 	  BLIS_MR,
 	  BLIS_KR,
 	  FALSE,   // do NOT invert diagonal
@@ -119,10 +113,9 @@ cntl_t* bli_gemmbp_cntl_create
 	cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node
 	(
 	  rntm,
-	  bli_gemm_packb,  // pack the right-hand operand
-	  packb_fp,
-	  BLIS_KR,
+	  bli_l3_packb,  // pack the right-hand operand
 	  BLIS_NR,
+	  BLIS_KR,
 	  FALSE,   // do NOT invert diagonal
 	  FALSE,   // reverse iteration if upper?
 	  FALSE,   // reverse iteration if lower?
@@ -194,8 +187,8 @@ cntl_t* bli_gemmpb_cntl_create
 	(
 	  bli_gemm_packb,  // pack the right-hand operand
 	  bli_packm_blk_var1,
-	  BLIS_KR,
 	  BLIS_MR,
+	  BLIS_KR,
 	  FALSE,   // do NOT invert diagonal
 	  FALSE,   // reverse iteration if upper?
 	  FALSE,   // reverse iteration if lower?
diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c
index 792d69af5..a9ea21dc4 100644
--- a/frame/3/gemm/bli_gemm_front.c
+++ b/frame/3/gemm/bli_gemm_front.c
@@ -87,13 +87,14 @@ void bli_gemm_front
 	bli_obj_alias_to( b, &b_local );
 	bli_obj_alias_to( c, &c_local );
 
-#ifdef BLIS_ENABLE_GEMM_MD
-	// Don't perform the following optimization for ccr or crc cases, as
-	// those cases are sensitive to the ukernel storage preference (ie:
-	// transposing the operation would break them).
-	if ( !bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
-	     !bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) )
-#endif
+	// Set the obj_t buffer field to the location currently implied by the row
+	// and column offsets and then zero the offsets. If any of the original
+	// obj_t's were views into larger matrices, this step effectively makes
+	// those obj_t's "forget" their lineage.
+	bli_obj_reset_origin( &a_local );
+	bli_obj_reset_origin( &b_local );
+	bli_obj_reset_origin( &c_local );
+
 	// An optimization: If C is stored by rows and the micro-kernel prefers
 	// contiguous columns, or if C is stored by columns and the micro-kernel
 	// prefers contiguous rows, transpose the entire operation to allow the
@@ -251,7 +252,7 @@ void bli_gemm_front
 	// Invoke the internal back-end via the thread handler.
 	bli_l3_thread_decorator
 	(
-	  bli_gemm_int,
+	  bli_l3_int,
 	  BLIS_GEMM, // operation family id
 	  alpha,
 	  &a_local,
diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c
deleted file mode 100644
index 208e9bdca..000000000
--- a/frame/3/gemm/bli_gemm_int.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_gemm_int
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	obj_t        a_local;
-	obj_t        b_local;
-	obj_t        c_local;
-	gemm_var_oft f;
-
-	// Check parameters.
-	if ( bli_error_checking_is_enabled() )
-		bli_gemm_basic_check( alpha, a, b, beta, c, cntx );
-
-	// If C has a zero dimension, return early.
-	if ( bli_obj_has_zero_dim( c ) )
-	{
-		return;
-	}
-
-	// If A or B has a zero dimension, scale C by beta and return early.
-	if ( bli_obj_has_zero_dim( a ) ||
-	     bli_obj_has_zero_dim( b ) )
-	{
-		if ( bli_thread_am_ochief( thread ) )
-			bli_scalm( beta, c );
-		bli_thread_barrier( thread );
-		return;
-	}
-
-	// If A or B is marked as being filled with zeros, scale C by beta and
-	// return early.
-	if ( bli_obj_is_zeros( a ) ||
-	     bli_obj_is_zeros( b ) )
-	{
-		// This should never execute.
-		bli_abort();
-
-		if ( bli_thread_am_ochief( thread ) )
-			bli_scalm( beta, c );
-		bli_thread_barrier( thread );
-		return;
-	}
-
-	// Alias A, B, and C in case we need to update attached scalars.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( b, &b_local );
-	bli_obj_alias_to( c, &c_local );
-
-	// If alpha is non-unit, typecast and apply it to the scalar attached
-	// to B.
-	if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
-	{
-		bli_obj_scalar_apply_scalar( alpha, &b_local );
-	}
-
-	// If beta is non-unit, typecast and apply it to the scalar attached
-	// to C.
-	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
-	{
-		bli_obj_scalar_apply_scalar( beta, &c_local );
-	}
-
-	// Create the next node in the thrinfo_t structure.
-	bli_thrinfo_grow( rntm, cntl, thread );
-
-	// Extract the function pointer from the current control tree node.
-	f = bli_cntl_var_func( cntl );
-
-	// Invoke the variant.
-	f
-	(
-	  &a_local,
-	  &b_local,
-	  &c_local,
-	  cntx,
-	  rntm,
-	  cntl,
-	  thread
-	);
-}
-
diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h
index 7bcc8a013..e7befc5b4 100644
--- a/frame/3/gemm/bli_gemm_var.h
+++ b/frame/3/gemm/bli_gemm_var.h
@@ -55,11 +55,8 @@ void PASTEMAC0(opname) \
 GENPROT( gemm_blk_var1 )
 GENPROT( gemm_blk_var2 )
 GENPROT( gemm_blk_var3 )
-GENPROT( gemm_packa )
-GENPROT( gemm_packb )
 
 GENPROT( gemm_ker_var1 )
-
 GENPROT( gemm_ker_var2 )
 
 
diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c
index 9f18a717d..2a9d91759 100644
--- a/frame/3/gemmt/bli_gemmt_front.c
+++ b/frame/3/gemmt/bli_gemmt_front.c
@@ -73,7 +73,14 @@ void bli_gemmt_front
 	bli_obj_alias_to( a, &a_local );
 	bli_obj_alias_to( b, &b_local );
 	bli_obj_alias_to( c, &c_local );
-	bli_obj_set_as_root( &c_local );
+
+	// Set the obj_t buffer field to the location currently implied by the row
+	// and column offsets and then zero the offsets. If any of the original
+	// obj_t's were views into larger matrices, this step effectively makes
+	// those obj_t's "forget" their lineage.
+	bli_obj_reset_origin( &a_local );
+	bli_obj_reset_origin( &b_local );
+	bli_obj_reset_origin( &c_local );
 
 	// An optimization: If C is stored by rows and the micro-kernel prefers
 	// contiguous columns, or if C is stored by columns and the micro-kernel
@@ -107,7 +114,7 @@ void bli_gemmt_front
 	// Invoke the internal back-end via the thread handler.
 	bli_l3_thread_decorator
 	(
-	  bli_gemm_int,
+	  bli_l3_int,
 	  BLIS_GEMMT, // operation family id
 	  alpha,
 	  &a_local,
diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c
index 6d24ea496..3a1d681c3 100644
--- a/frame/3/gemmt/bli_gemmt_x_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c
@@ -35,7 +35,7 @@
 
 #include "blis.h"
 
-static gemm_var_oft vars[2] =
+static l3_var_oft vars[2] =
 {
 	bli_gemmt_l_ker_var2, bli_gemmt_u_ker_var2,
 };
@@ -51,8 +51,8 @@ void bli_gemmt_x_ker_var2
        thrinfo_t* thread
      )
 {
-	dim_t        uplo;
-	gemm_var_oft f;
+	dim_t      uplo;
+	l3_var_oft f;
 
 	// Set a bool based on the uplo field of C's root object.
 	if ( bli_obj_root_is_lower( c ) ) uplo = 0;
diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c
index 7869f800a..9835de9c1 100644
--- a/frame/3/hemm/bli_hemm_front.c
+++ b/frame/3/hemm/bli_hemm_front.c
@@ -65,6 +65,14 @@ void bli_hemm_front
 	bli_obj_alias_to( b, &b_local );
 	bli_obj_alias_to( c, &c_local );
 
+	// Set the obj_t buffer field to the location currently implied by the row
+	// and column offsets and then zero the offsets. If any of the original
+	// obj_t's were views into larger matrices, this step effectively makes
+	// those obj_t's "forget" their lineage.
+	bli_obj_reset_origin( &a_local );
+	bli_obj_reset_origin( &b_local );
+	bli_obj_reset_origin( &c_local );
+
 #ifdef BLIS_DISABLE_HEMM_RIGHT
 	// NOTE: This case casts right-side hemm in terms of left side. This is
 	// necessary when the current subconfiguration uses a gemm microkernel
@@ -129,13 +137,6 @@ void bli_hemm_front
 	// Set the pack schemas within the objects.
 	bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
 
-	// Set each alias as the root object.
-	// NOTE: We MUST wait until we are done potentially swapping the objects
-	// before setting the root fields!
-	bli_obj_set_as_root( &a_local );
-	bli_obj_set_as_root( &b_local );
-	bli_obj_set_as_root( &c_local );
-
 	// Parse and interpret the contents of the rntm_t object to properly
 	// set the ways of parallelism for each loop, and then make any
 	// additional modifications necessary for the current operation.
@@ -152,7 +153,7 @@ void bli_hemm_front
 	// Invoke the internal back-end.
 	bli_l3_thread_decorator
 	(
-	  bli_gemm_int,
+	  bli_l3_int,
 	  BLIS_GEMM, // operation family id
 	  alpha,
 	  &a_local,
diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c
index 52ef4cf36..be94c44c1 100644
--- a/frame/3/symm/bli_symm_front.c
+++ b/frame/3/symm/bli_symm_front.c
@@ -65,6 +65,14 @@ void bli_symm_front
 	bli_obj_alias_to( b, &b_local );
 	bli_obj_alias_to( c, &c_local );
 
+	// Set the obj_t buffer field to the location currently implied by the row
+	// and column offsets and then zero the offsets. If any of the original
+	// obj_t's were views into larger matrices, this step effectively makes
+	// those obj_t's "forget" their lineage.
+	bli_obj_reset_origin( &a_local );
+	bli_obj_reset_origin( &b_local );
+	bli_obj_reset_origin( &c_local );
+
 #ifdef BLIS_DISABLE_SYMM_RIGHT
 	// NOTE: This case casts right-side symm in terms of left side. This is
 	// necessary when the current subconfiguration uses a gemm microkernel
@@ -128,13 +136,6 @@ void bli_symm_front
 	// Set the pack schemas within the objects.
 	bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
 
-	// Set each alias as the root object.
-	// NOTE: We MUST wait until we are done potentially swapping the objects
-	// before setting the root fields!
-	bli_obj_set_as_root( &a_local );
-	bli_obj_set_as_root( &b_local );
-	bli_obj_set_as_root( &c_local );
-
 	// Parse and interpret the contents of the rntm_t object to properly
 	// set the ways of parallelism for each loop, and then make any
 	// additional modifications necessary for the current operation.
@@ -151,7 +152,7 @@ void bli_symm_front
 	// Invoke the internal back-end.
 	bli_l3_thread_decorator
 	(
-	  bli_gemm_int,
+	  bli_l3_int,
 	  BLIS_GEMM, // operation family id
 	  alpha,
 	  &a_local,
diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c
index fac7349f5..1de28958e 100644
--- a/frame/3/trmm/bli_trmm_front.c
+++ b/frame/3/trmm/bli_trmm_front.c
@@ -64,6 +64,14 @@ void bli_trmm_front
 	bli_obj_alias_to( b, &b_local );
 	bli_obj_alias_to( b, &c_local );
 
+	// Set the obj_t buffer field to the location currently implied by the row
+	// and column offsets and then zero the offsets. If any of the original
+	// obj_t's were views into larger matrices, this step effectively makes
+	// those obj_t's "forget" their lineage.
+	bli_obj_reset_origin( &a_local );
+	bli_obj_reset_origin( &b_local );
+	bli_obj_reset_origin( &c_local );
+
 	// We do not explicitly implement the cases where A is transposed.
 	// However, we can still handle them. Specifically, if A is marked as
 	// needing a transposition, we simply induce a transposition. This
@@ -147,13 +155,6 @@ void bli_trmm_front
 	// Set the pack schemas within the objects.
 	bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
 
-	// Set each alias as the root object.
-	// NOTE: We MUST wait until we are done potentially swapping the objects
-	// before setting the root fields!
-	bli_obj_set_as_root( &a_local );
-	bli_obj_set_as_root( &b_local );
-	bli_obj_set_as_root( &c_local );
-
 	// Parse and interpret the contents of the rntm_t object to properly
 	// set the ways of parallelism for each loop, and then make any
 	// additional modifications necessary for the current operation.
@@ -170,7 +171,7 @@ void bli_trmm_front
 	// Invoke the internal back-end.
 	bli_l3_thread_decorator
 	(
-	  bli_gemm_int,
+	  bli_l3_int,
 	  BLIS_TRMM, // operation family id
 	  alpha,
 	  &a_local,
diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c
index b9c176d97..898cfe242 100644
--- a/frame/3/trmm/bli_trmm_xx_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c
@@ -35,7 +35,7 @@
 
 #include "blis.h"
 
-static gemm_var_oft vars[2][2] =
+static l3_var_oft vars[2][2] =
 {
 	{ bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 },
 	{ bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 }
@@ -52,9 +52,9 @@ void bli_trmm_xx_ker_var2
        thrinfo_t* thread
      )
 {
-	dim_t        side;
-	dim_t        uplo;
-	gemm_var_oft f;
+	dim_t      side;
+	dim_t      uplo;
+	l3_var_oft f;
 
 	// Set two bools: one based on the implied side parameter (the structure
 	// of the root object) and one based on the uplo field of the triangular
diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c
index 0ce961d1c..3b9753960 100644
--- a/frame/3/trmm3/bli_trmm3_front.c
+++ b/frame/3/trmm3/bli_trmm3_front.c
@@ -65,6 +65,14 @@ void bli_trmm3_front
 	bli_obj_alias_to( b, &b_local );
 	bli_obj_alias_to( c, &c_local );
 
+	// Set the obj_t buffer field to the location currently implied by the row
+	// and column offsets and then zero the offsets. If any of the original
+	// obj_t's were views into larger matrices, this step effectively makes
+	// those obj_t's "forget" their lineage.
+	bli_obj_reset_origin( &a_local );
+	bli_obj_reset_origin( &b_local );
+	bli_obj_reset_origin( &c_local );
+
 	// We do not explicitly implement the cases where A is transposed.
 	// However, we can still handle them. Specifically, if A is marked as
 	// needing a transposition, we simply induce a transposition. This
@@ -139,13 +147,6 @@ void bli_trmm3_front
 	// Set the pack schemas within the objects.
 	bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
 
-	// Set each alias as the root object.
-	// NOTE: We MUST wait until we are done potentially swapping the objects
-	// before setting the root fields!
-	bli_obj_set_as_root( &a_local );
-	bli_obj_set_as_root( &b_local );
-	bli_obj_set_as_root( &c_local );
-
 	// Parse and interpret the contents of the rntm_t object to properly
 	// set the ways of parallelism for each loop, and then make any
 	// additional modifications necessary for the current operation.
@@ -162,7 +163,7 @@ void bli_trmm3_front
 	// Invoke the internal back-end.
 	bli_l3_thread_decorator
 	(
-	  bli_gemm_int,
+	  bli_l3_int,
 	  BLIS_TRMM, // operation family id
 	  alpha,
 	  &a_local,
diff --git a/frame/3/trsm/bli_trsm.h b/frame/3/trsm/bli_trsm.h
index 00b604de6..964422d01 100644
--- a/frame/3/trsm/bli_trsm.h
+++ b/frame/3/trsm/bli_trsm.h
@@ -34,7 +34,5 @@
 
 #include "bli_trsm_cntl.h"
 #include "bli_trsm_front.h"
-#include "bli_trsm_int.h"
-
 #include "bli_trsm_var.h"
 
diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c
index 578c37c32..30bf6921c 100644
--- a/frame/3/trsm/bli_trsm_blk_var1.c
+++ b/frame/3/trsm/bli_trsm_blk_var1.c
@@ -58,7 +58,7 @@ void bli_trsm_blk_var1
 	bli_l3_prune_unref_mparts_m( a, b, c, cntl );
 
 	// Isolate the diagonal block A11 and its corresponding row panel C1.
-	const dim_t kc = bli_obj_width( a );
+	const dim_t kc = bli_obj_width_after_trans( a );
 	obj_t a11, c1;
 	bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
 	                        0, kc, a, &a11 );
@@ -96,7 +96,7 @@ void bli_trsm_blk_var1
 #endif
 
 		// Perform trsm subproblem.
-		bli_trsm_int
+		bli_l3_int
 		(
 		  &BLIS_ONE,
 		  &a11_1,
@@ -169,7 +169,7 @@ void bli_trsm_blk_var1
 
 		// Perform gemm subproblem. (Note that we use the same backend
 		// function as before, since we're calling the same macrokernel.)
-		bli_trsm_int
+		bli_l3_int
 		(
 		  &BLIS_ONE,
 		  &a11,
diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c
index 23fd3ed4c..5691c964a 100644
--- a/frame/3/trsm/bli_trsm_blk_var2.c
+++ b/frame/3/trsm/bli_trsm_blk_var2.c
@@ -60,7 +60,7 @@ void bli_trsm_blk_var2
 	bli_thread_range_ndim
 	(
 	  direct, thread, a, b, c, cntl, cntx,
-      &my_start, &my_end
+	  &my_start, &my_end
 	);
 
 	// Partition along the n dimension.
@@ -77,7 +77,7 @@ void bli_trsm_blk_var2
 		                        i, b_alg, c, &c1 );
 
 		// Perform trsm subproblem.
-		bli_trsm_int
+		bli_l3_int
 		(
 		  &BLIS_ONE,
 		  a,
diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c
index a68cc853b..43fc25f16 100644
--- a/frame/3/trsm/bli_trsm_blk_var3.c
+++ b/frame/3/trsm/bli_trsm_blk_var3.c
@@ -71,7 +71,7 @@ void bli_trsm_blk_var3
 		                        i, b_alg, b, &b1 );
 
 		// Perform trsm subproblem.
-		bli_trsm_int
+		bli_l3_int
 		(
 		  &BLIS_ONE,
 		  &a1,
diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c
index 4a7a4de8f..a8196ebb9 100644
--- a/frame/3/trsm/bli_trsm_cntl.c
+++ b/frame/3/trsm/bli_trsm_cntl.c
@@ -57,16 +57,11 @@ cntl_t* bli_trsm_l_cntl_create
      )
 {
 	void_fp macro_kernel_p;
-	void_fp packa_fp;
-	void_fp packb_fp;
 
 	// Use the function pointer to the macrokernels that use slab
 	// assignment of micropanels to threads in the jr and ir loops.
 	macro_kernel_p = bli_trsm_xx_ker_var2;
 
-	packa_fp = bli_packm_blk_var1;
-	packb_fp = bli_packm_blk_var1;
-
 	const opid_t family = BLIS_TRSM;
 
 	//
@@ -95,8 +90,7 @@ cntl_t* bli_trsm_l_cntl_create
 	cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
 	(
 	  rntm,
-	  bli_trsm_packa, // trsm operation's packm function for A.
-	  packa_fp,
+	  bli_l3_packa, // trsm operation's packm function for A.
 	  BLIS_MR,
 	  BLIS_MR,
 	  FALSE,   // do NOT invert diagonal
@@ -133,8 +127,7 @@ cntl_t* bli_trsm_l_cntl_create
 	cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
 	(
 	  rntm,
-	  bli_trsm_packa, // trsm operation's packm function for A.
-	  packa_fp,
+	  bli_l3_packa, // trsm operation's packm function for A.
 	  BLIS_MR,
 	  BLIS_MR,
 #ifdef BLIS_ENABLE_TRSM_PREINVERSION
@@ -171,10 +164,9 @@ cntl_t* bli_trsm_l_cntl_create
 	cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
 	(
 	  rntm,
-	  bli_trsm_packb,
-	  packb_fp,
-	  BLIS_MR,
+	  bli_l3_packb,
 	  BLIS_NR,
+	  BLIS_MR,
 	  FALSE,   // do NOT invert diagonal
 	  FALSE,   // reverse iteration if upper?
 	  FALSE,   // reverse iteration if lower?
@@ -208,7 +200,7 @@ cntl_t* bli_trsm_l_cntl_create
 
 cntl_t* bli_trsm_r_cntl_create
      (
-	   rntm_t* rntm,
+       rntm_t* rntm,
        pack_t  schema_a,
        pack_t  schema_b
      )
@@ -216,9 +208,6 @@ cntl_t* bli_trsm_r_cntl_create
 	// NOTE: trsm macrokernels are presently disabled for right-side execution.
 	void_fp macro_kernel_p = bli_trsm_xx_ker_var2;
 
-	void_fp packa_fp = bli_packm_blk_var1;
-	void_fp packb_fp = bli_packm_blk_var1;
-
 	const opid_t family = BLIS_TRSM;
 
 	// Create two nodes for the macro-kernel.
@@ -244,8 +233,7 @@ cntl_t* bli_trsm_r_cntl_create
 	cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
 	(
 	  rntm,
-	  bli_trsm_packa,
-	  packa_fp,
+	  bli_l3_packa,
 	  BLIS_NR,
 	  BLIS_MR,
 	  FALSE,   // do NOT invert diagonal
@@ -270,8 +258,7 @@ cntl_t* bli_trsm_r_cntl_create
 	cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
 	(
 	  rntm,
-	  bli_trsm_packb,
-	  packb_fp,
+	  bli_l3_packb,
 	  BLIS_MR,
 	  BLIS_MR,
 	  TRUE,    // do NOT invert diagonal
diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c
index 68a60b5bd..7f3d17aef 100644
--- a/frame/3/trsm/bli_trsm_front.c
+++ b/frame/3/trsm/bli_trsm_front.c
@@ -71,6 +71,14 @@ void bli_trsm_front
 	bli_obj_alias_to( b, &b_local );
 	bli_obj_alias_to( b, &c_local );
 
+	// Set the obj_t buffer field to the location currently implied by the row
+	// and column offsets and then zero the offsets. If any of the original
+	// obj_t's were views into larger matrices, this step effectively makes
+	// those obj_t's "forget" their lineage.
+	bli_obj_reset_origin( &a_local );
+	bli_obj_reset_origin( &b_local );
+	bli_obj_reset_origin( &c_local );
+
 	// We do not explicitly implement the cases where A is transposed.
 	// However, we can still handle them. Specifically, if A is marked as
 	// needing a transposition, we simply induce a transposition. This
@@ -121,13 +129,6 @@ void bli_trsm_front
 	// Set the pack schemas within the objects.
 	bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
 
-	// Set each alias as the root object.
-	// NOTE: We MUST wait until we are done potentially swapping the objects
-	// before setting the root fields!
-	bli_obj_set_as_root( &a_local );
-	bli_obj_set_as_root( &b_local );
-	bli_obj_set_as_root( &c_local );
-
 	// Parse and interpret the contents of the rntm_t object to properly
 	// set the ways of parallelism for each loop, and then make any
 	// additional modifications necessary for the current operation.
@@ -144,7 +145,7 @@ void bli_trsm_front
 	// Invoke the internal back-end.
 	bli_l3_thread_decorator
 	(
-	  bli_trsm_int,
+	  bli_l3_int,
 	  BLIS_TRSM, // operation family id
 	  alpha,
 	  &a_local,
diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h
index de7c65936..8322a8b5b 100644
--- a/frame/3/trsm/bli_trsm_var.h
+++ b/frame/3/trsm/bli_trsm_var.h
@@ -55,8 +55,6 @@ void PASTEMAC0(opname) \
 GENPROT( trsm_blk_var1 )
 GENPROT( trsm_blk_var2 )
 GENPROT( trsm_blk_var3 )
-GENPROT( trsm_packa )
-GENPROT( trsm_packb )
 
 GENPROT( trsm_xx_ker_var2 )
 
diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c
index e30e6d751..c30a5828a 100644
--- a/frame/3/trsm/bli_trsm_xx_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c
@@ -35,7 +35,7 @@
 
 #include "blis.h"
 
-static trsm_var_oft vars[2][2] =
+static l3_var_oft vars[2][2] =
 {
 	{ bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 },
 	{ bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 }
@@ -52,9 +52,9 @@ void bli_trsm_xx_ker_var2
        thrinfo_t* thread
      )
 {
-	dim_t        side;
-	dim_t        uplo;
-	trsm_var_oft f;
+	dim_t      side;
+	dim_t      uplo;
+	l3_var_oft f;
 
 	// Set two bools: one based on the implied side parameter (the structure
 	// of the root object) and one based on the uplo field of the triangular
diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c
index 43e5101b5..23fbb4cd1 100644
--- a/frame/base/bli_obj.c
+++ b/frame/base/bli_obj.c
@@ -118,6 +118,11 @@ void bli_obj_create_without_buffer
 	bli_obj_set_offs( 0, 0, obj );
 	bli_obj_set_diag_offset( 0, obj );
 
+	bli_obj_set_pack_fn( NULL, obj );
+	bli_obj_set_pack_params( NULL, obj );
+	bli_obj_set_ker_fn( NULL, obj );
+	bli_obj_set_ker_params( NULL, obj );
+
 	// Set the internal scalar to 1.0.
 	bli_obj_set_scalar_dt( dt, obj );
 	s = bli_obj_internal_scalar_buffer( obj );
@@ -356,7 +361,7 @@ void bli_obj_free
 
 	buf_a = bli_obj_buffer_at_off( a );
 
-	bli_zzsets( 0.0, 0.0, value ); 
+	bli_zzsets( 0.0, 0.0, value );
 
 	if ( bli_obj_is_float( a ) )
 	{
@@ -500,7 +505,7 @@ void bli_adjust_strides
 			// Set the column stride to indicate that this is a column vector
 			// stored in column-major order. This is done for legacy reasons,
 			// because we at one time we had to satisify the error checking
-			// in the underlying BLAS library, which expects the leading 
+			// in the underlying BLAS library, which expects the leading
 			// dimension to be set to at least m, even if it will never be
 			// used for indexing since it is a vector and thus only has one
 			// column of data.
diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c
index a924bbefc..f8835e5de 100644
--- a/frame/base/bli_pba.c
+++ b/frame/base/bli_pba.c
@@ -282,17 +282,6 @@ void bli_pba_acquire_v
 #endif
 
 
-void bli_pba_rntm_set_pba
-     (
-       rntm_t* rntm
-     )
-{
-	pba_t* pba = bli_pba_query();
-
-	bli_rntm_set_pba( pba, rntm );
-}
-
-
 siz_t bli_pba_pool_size
      (
        pba_t*    pba,
diff --git a/frame/base/bli_pba.h b/frame/base/bli_pba.h
index ce19991f5..6431607ec 100644
--- a/frame/base/bli_pba.h
+++ b/frame/base/bli_pba.h
@@ -119,7 +119,7 @@ BLIS_INLINE void bli_pba_unlock( pba_t* pba )
 
 // -----------------------------------------------------------------------------
 
-pba_t* bli_pba_query( void );
+BLIS_EXPORT_BLIS pba_t* bli_pba_query( void );
 
 void bli_pba_init
      (
@@ -144,10 +144,15 @@ void bli_pba_release
        mem_t*  mem
      );
 
-void bli_pba_rntm_set_pba
+BLIS_INLINE void bli_pba_rntm_set_pba
      (
        rntm_t* rntm
-     );
+     )
+{
+	pba_t* pba = bli_pba_query();
+
+	bli_rntm_set_pba( pba, rntm );
+}
 
 siz_t bli_pba_pool_size
      (
diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c
index 1da6723c7..5b6ff6a0f 100644
--- a/frame/base/bli_sba.c
+++ b/frame/base/bli_sba.c
@@ -76,24 +76,39 @@ void* bli_sba_acquire
 		// Query the small block pool from the rntm.
 		pool_t* restrict pool = bli_rntm_sba_pool( rntm );
 
-		// Query the block_size of the pool_t so that we can request the exact
-		// size present.
-		const siz_t block_size = bli_pool_block_size( pool );
-
-		// Sanity check: Make sure the requested size is no larger than the
-		// block_size field of the pool.
-		if ( block_size < req_size )
+		// We don't expect NULL sba_pool pointers in the normal course of BLIS
+		// operation. However, there are rare instances where it is convenient
+		// to support use of bli_sba_acquire() without having to pass in a valid
+		// sba pool data structure. The case that inspired this branch was the
+		// gemm_ukr and related test modules in the BLIS testsuite. (There, it
+		// is convenient to not have to check out an array_t from the sba, and it
+		// does no harm since the malloc() happens outside of the region that
+		// would be timed.)
+		if ( pool == NULL )
 		{
-			printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n",
-			        ( int )block_size, ( int )req_size );
-			bli_abort();
+		    block = bli_malloc_intl( req_size, &r_val );
+		}
+		else
+		{
+			// Query the block_size of the pool_t so that we can request the exact
+			// size present.
+			const siz_t block_size = bli_pool_block_size( pool );
+
+			// Sanity check: Make sure the requested size is no larger than the
+			// block_size field of the pool.
+			if ( block_size < req_size )
+			{
+				printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n",
+				        ( int )block_size, ( int )req_size );
+				bli_abort();
+			}
+
+			// Check out a block using the block_size queried above.
+			bli_pool_checkout_block( block_size, &pblk, pool );
+
+			// The block address is stored within the pblk_t.
+			block = bli_pblk_buf( &pblk );
 		}
-
-		// Check out a block using the block_size queried above.
-		bli_pool_checkout_block( block_size, &pblk, pool );
-
-		// The block address is stored within the pblk_t.
-		block = bli_pblk_buf( &pblk );
 	}
 #else
 
@@ -123,21 +138,28 @@ void bli_sba_release
 		// Query the small block pool from the rntm.
 		pool_t* restrict pool = bli_rntm_sba_pool( rntm );
 
-		// Query the block_size field from the pool. This is not super-important
-		// for this particular application of the pool_t (that is, the "leaf"
-		// component of the sba), but it seems like good housekeeping to maintain
-		// the block_size field of the pblk_t in case its ever needed/read.
-		const siz_t block_size = bli_pool_block_size( pool );
-
-		// Embed the block's memory address into a pblk_t, along with the
-		// block_size queried from the pool.
-		bli_pblk_set_buf( block, &pblk );
-		bli_pblk_set_block_size( block_size, &pblk );
-
-		// Check the pblk_t back into the pool_t. (It's okay that the pblk_t is
-		// a local variable since its contents are copied into the pool's internal
-		// data structure--an array of pblk_t.)
-		bli_pool_checkin_block( &pblk, pool );
+		if ( pool == NULL )
+		{
+		    bli_free_intl( block );
+		}
+		else
+		{
+			// Query the block_size field from the pool. This is not super-important
+			// for this particular application of the pool_t (that is, the "leaf"
+			// component of the sba), but it seems like good housekeeping to maintain
+			// the block_size field of the pblk_t in case it's ever needed/read.
+			const siz_t block_size = bli_pool_block_size( pool );
+
+			// Embed the block's memory address into a pblk_t, along with the
+			// block_size queried from the pool.
+			bli_pblk_set_buf( block, &pblk );
+			bli_pblk_set_block_size( block_size, &pblk );
+
+			// Check the pblk_t back into the pool_t. (It's okay that the pblk_t is
+			// a local variable since its contents are copied into the pool's internal
+			// data structure--an array of pblk_t.)
+			bli_pool_checkin_block( &pblk, pool );
+		}
 	}
 #else
 
diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h
index 84c977289..fe174202c 100644
--- a/frame/include/bli_obj_macro_defs.h
+++ b/frame/include/bli_obj_macro_defs.h
@@ -1189,52 +1189,48 @@ BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b )
 
 // -- User-provided information macros --
 
-// User data query
-
-BLIS_INLINE void* bli_obj_user_data( obj_t* obj )
-{
-	return obj->user_data;
-}
-
-// User data modification
+// Function pointer query
 
-BLIS_INLINE void bli_obj_set_user_data( void* data, obj_t* obj )
+BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj )
 {
-	obj->user_data = data;
+	return obj->pack_fn;
 }
 
-// Function pointer query
-
-BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj )
+BLIS_INLINE void* bli_obj_pack_params( obj_t* obj )
 {
-	return obj->pack;
+	return obj->pack_params;
 }
 
 BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj )
 {
-	return obj->ker;
+	return obj->ker_fn;
 }
 
-BLIS_INLINE obj_ukr_fn_t bli_obj_ukr_fn( obj_t* obj )
+BLIS_INLINE void* bli_obj_ker_params( obj_t* obj )
 {
-	return obj->ukr;
+	return obj->ker_params;
 }
 
 // Function pointer modification
 
-BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack, obj_t* obj )
+BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj )
 {
-	obj->pack = pack;
+	obj->pack_fn = pack_fn;
 }
 
-BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker, obj_t* obj )
+BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj )
 {
-	obj->ker = ker;
+	obj->pack_params = params;
 }
 
-BLIS_INLINE void bli_obj_set_ukr_fn( obj_ukr_fn_t ukr, obj_t* obj )
+BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj )
 {
-	obj->ukr = ukr;
+	obj->ker_fn = ker_fn;
+}
+
+BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj )
+{
+	obj->ker_params = params;
 }
 
 
@@ -1357,6 +1353,18 @@ BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj )
 	       );
 }
 
+// Adjust the pointer based on current offsets, zero the offsets, and then
+// set the current object as the root. For obj_t's with at least one non-zero
+// offset, this effectively makes the obj_t "forget" that it was ever a view
+// into a larger matrix.
+
+BLIS_INLINE void bli_obj_reset_origin( obj_t* obj )
+{
+    bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj );
+    bli_obj_set_offs( 0, 0, obj );
+	bli_obj_set_as_root( obj );
+}
+
 // Make a full alias (shallow copy).
 
 BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b )
@@ -1482,7 +1490,13 @@ BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t*
 
 BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b )
 {
+	bool a_root_is_self = ( bli_obj_root( a ) == a );
+	bool b_root_is_self = ( bli_obj_root( b ) == b );
+
 	obj_t t = *b; *b = *a; *a = t;
+
+	if ( a_root_is_self ) bli_obj_set_as_root( b );
+	if ( b_root_is_self ) bli_obj_set_as_root( a );
 }
 
 // Swap object pack schemas.
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index f1a7e8f8d..5be0ceeb4 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -1174,12 +1174,11 @@ struct thrinfo_s;
 
 typedef void (*obj_pack_fn_t)
     (
-      mdim_t            mat,
-      mem_t*            mem,
       struct obj_s*     a,
       struct obj_s*     ap,
       struct cntx_s*    cntx,
       struct rntm_s*    rntm,
+      struct cntl_s*    cntl,
       struct thrinfo_s* thread
     );
 
@@ -1190,23 +1189,10 @@ typedef void (*obj_ker_fn_t)
       struct obj_s*     c,
       struct cntx_s*    cntx,
       struct rntm_s*    rntm,
+      struct cntl_s*    cntl,
       struct thrinfo_s* thread
     );
 
-typedef void (*obj_ukr_fn_t)
-    (
-      dim_t                   m,
-      dim_t                   n,
-      dim_t                   k,
-      void*          restrict alpha,
-      void*          restrict a, inc_t rs_a, inc_t cs_a,
-      void*          restrict b, inc_t rs_b, inc_t cs_b,
-      void*          restrict beta,
-      void*          restrict c, inc_t rs_c, inc_t cs_c,
-      auxinfo_t*     restrict data,
-      struct cntx_s* restrict cntx
-    );
-
 typedef struct obj_s
 {
 	// Basic fields
@@ -1237,13 +1223,11 @@ typedef struct obj_s
 	dim_t         m_panel;  // m dimension of a "full" panel
 	dim_t         n_panel;  // n dimension of a "full" panel
 
-	// User data pointer
-	void*         user_data;
-
-	// Function pointers
-	obj_pack_fn_t pack;
-	obj_ker_fn_t  ker;
-	obj_ukr_fn_t  ukr;
+	// User-customizable fields
+	obj_pack_fn_t pack_fn;
+	void*         pack_params;
+	obj_ker_fn_t  ker_fn;
+	void*         ker_params;
 
 } obj_t;
 
@@ -1258,70 +1242,68 @@ typedef struct obj_s
 
 #define BLIS_OBJECT_INITIALIZER \
 { \
-	.root      = NULL, \
+	.root        = NULL, \
 \
-	.off       = { 0, 0 }, \
-	.dim       = { 0, 0 }, \
-	.diag_off  = 0, \
+	.off         = { 0, 0 }, \
+	.dim         = { 0, 0 }, \
+	.diag_off    = 0, \
 \
-	.info      = 0x0 | BLIS_BITVAL_DENSE      | \
-	                   BLIS_BITVAL_GENERAL, \
-	.info2     = 0x0, \
-	.elem_size = sizeof( float ), /* this is changed later. */ \
+	.info        = 0x0 | BLIS_BITVAL_DENSE      | \
+	                     BLIS_BITVAL_GENERAL, \
+	.info2       = 0x0, \
+	.elem_size   = sizeof( float ), /* this is changed later. */ \
 \
-	.buffer    = NULL, \
-	.rs        = 0, \
-	.cs        = 0, \
-	.is        = 1,  \
+	.buffer      = NULL, \
+	.rs          = 0, \
+	.cs          = 0, \
+	.is          = 1,  \
 \
-	.scalar    = { 0.0, 0.0 }, \
+	.scalar      = { 0.0, 0.0 }, \
 \
-	.m_padded  = 0, \
-	.n_padded  = 0, \
-	.ps        = 0, \
-	.pd        = 0, \
-	.m_panel   = 0, \
-	.n_panel   = 0, \
+	.m_padded    = 0, \
+	.n_padded    = 0, \
+	.ps          = 0, \
+	.pd          = 0, \
+	.m_panel     = 0, \
+	.n_panel     = 0, \
 \
-	.user_data = NULL, \
-\
-	.pack      = NULL, \
-	.ker       = NULL, \
-	.ukr       = NULL  \
+	.pack_fn     = NULL, \
+	.pack_params = NULL, \
+	.ker_fn      = NULL, \
+	.ker_params  = NULL  \
 }
 
 #define BLIS_OBJECT_INITIALIZER_1X1 \
 { \
-	.root      = NULL, \
-\
-	.off       = { 0, 0 }, \
-	.dim       = { 1, 1 }, \
-	.diag_off  = 0, \
+	.root        = NULL, \
 \
-	.info      = 0x0 | BLIS_BITVAL_DENSE      | \
-	                   BLIS_BITVAL_GENERAL, \
-	.info2     = 0x0, \
-	.elem_size = sizeof( float ), /* this is changed later. */ \
+	.off         = { 0, 0 }, \
+	.dim         = { 1, 1 }, \
+	.diag_off    = 0, \
 \
-	.buffer    = NULL, \
-	.rs        = 0, \
-	.cs        = 0, \
-	.is        = 1,  \
+	.info        = 0x0 | BLIS_BITVAL_DENSE      | \
+	                     BLIS_BITVAL_GENERAL, \
+	.info2       = 0x0, \
+	.elem_size   = sizeof( float ), /* this is changed later. */ \
 \
-	.scalar    = { 0.0, 0.0 }, \
+	.buffer      = NULL, \
+	.rs          = 0, \
+	.cs          = 0, \
+	.is          = 1,  \
 \
-	.m_padded  = 0, \
-	.n_padded  = 0, \
-	.ps        = 0, \
-	.pd        = 0, \
-	.m_panel   = 0, \
-	.n_panel   = 0, \
+	.scalar      = { 0.0, 0.0 }, \
 \
-	.user_data = NULL, \
+	.m_padded    = 0, \
+	.n_padded    = 0, \
+	.ps          = 0, \
+	.pd          = 0, \
+	.m_panel     = 0, \
+	.n_panel     = 0, \
 \
-	.pack      = NULL, \
-	.ker       = NULL, \
-	.ukr       = NULL  \
+	.pack_fn     = NULL, \
+	.pack_params = NULL, \
+	.ker_fn      = NULL, \
+	.ker_params  = NULL  \
 }
 
 // Define these macros here since they must be updated if contents of
@@ -1329,77 +1311,75 @@ typedef struct obj_s
 
 BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
 {
-	b->root      = a->root;
-
-	b->off[0]    = a->off[0];
-	b->off[1]    = a->off[1];
-	b->dim[0]    = a->dim[0];
-	b->dim[1]    = a->dim[1];
-	b->diag_off  = a->diag_off;
-
-	b->info      = a->info;
-	b->info2     = a->info2;
-	b->elem_size = a->elem_size;
-
-	b->buffer    = a->buffer;
-	b->rs        = a->rs;
-	b->cs        = a->cs;
-	b->is        = a->is;
-
-	b->scalar    = a->scalar;
-
-	//b->pack_mem  = a->pack_mem;
-	b->m_padded  = a->m_padded;
-	b->n_padded  = a->n_padded;
-	b->ps        = a->ps;
-	b->pd        = a->pd;
-	b->m_panel   = a->m_panel;
-	b->n_panel   = a->n_panel;
-
-	b->user_data = a->user_data;
-
-	b->pack      = a->pack;
-	b->ker       = a->ker;
-	b->ukr       = a->ukr;
+	b->root        = a->root;
+
+	b->off[0]      = a->off[0];
+	b->off[1]      = a->off[1];
+	b->dim[0]      = a->dim[0];
+	b->dim[1]      = a->dim[1];
+	b->diag_off    = a->diag_off;
+
+	b->info        = a->info;
+	b->info2       = a->info2;
+	b->elem_size   = a->elem_size;
+
+	b->buffer      = a->buffer;
+	b->rs          = a->rs;
+	b->cs          = a->cs;
+	b->is          = a->is;
+
+	b->scalar      = a->scalar;
+
+	//b->pack_mem    = a->pack_mem;
+	b->m_padded    = a->m_padded;
+	b->n_padded    = a->n_padded;
+	b->ps          = a->ps;
+	b->pd          = a->pd;
+	b->m_panel     = a->m_panel;
+	b->n_panel     = a->n_panel;
+
+	b->pack_fn     = a->pack_fn;
+	b->pack_params = a->pack_params;
+	b->ker_fn      = a->ker_fn;
+	b->ker_params  = a->ker_params;
 }
 
 BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
 {
-	b->root      = a->root;
+	b->root        = a->root;
 
-	b->off[0]    = a->off[0];
-	b->off[1]    = a->off[1];
+	b->off[0]      = a->off[0];
+	b->off[1]      = a->off[1];
 	// Avoid copying m and n since they will be overwritten.
-	//b->dim[0]    = a->dim[0];
-	//b->dim[1]    = a->dim[1];
-	b->diag_off  = a->diag_off;
+	//b->dim[0]      = a->dim[0];
+	//b->dim[1]      = a->dim[1];
+	b->diag_off    = a->diag_off;
 
-	b->info      = a->info;
-	b->info2     = a->info2;
-	b->elem_size = a->elem_size;
+	b->info        = a->info;
+	b->info2       = a->info2;
+	b->elem_size   = a->elem_size;
 
-	b->buffer    = a->buffer;
-	b->rs        = a->rs;
-	b->cs        = a->cs;
-	b->is        = a->is;
+	b->buffer      = a->buffer;
+	b->rs          = a->rs;
+	b->cs          = a->cs;
+	b->is          = a->is;
 
-	b->scalar    = a->scalar;
+	b->scalar      = a->scalar;
 
 	// Avoid copying pack_mem entry.
 	// FGVZ: You should probably make sure this is right.
-	//b->pack_mem  = a->pack_mem;
-	b->m_padded  = a->m_padded;
-	b->n_padded  = a->n_padded;
-	b->ps        = a->ps;
-	b->pd        = a->pd;
-	b->m_panel   = a->m_panel;
-	b->n_panel   = a->n_panel;
-
-	b->user_data = a->user_data;
-
-	b->pack      = a->pack;
-	b->ker       = a->ker;
-	b->ukr       = a->ukr;
+	//b->pack_mem    = a->pack_mem;
+	b->m_padded    = a->m_padded;
+	b->n_padded    = a->n_padded;
+	b->ps          = a->ps;
+	b->pd          = a->pd;
+	b->m_panel     = a->m_panel;
+	b->n_panel     = a->n_panel;
+
+	b->pack_fn     = a->pack_fn;
+	b->pack_params = a->pack_params;
+	b->ker_fn      = a->ker_fn;
+	b->ker_params  = a->ker_params;
 }
 
 // Initializors for global scalar constants.
diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c
index 48996f28e..d37005b28 100644
--- a/testsuite/src/test_gemm_ukr.c
+++ b/testsuite/src/test_gemm_ukr.c
@@ -169,7 +169,6 @@ void libblis_test_gemm_ukr_experiment
 	num_t        datatype;
 
 	dim_t        m, n, k;
-	inc_t        ldap, ldbp;
 
 	char         sc_a = 'c';
 	char         sc_b = 'r';
@@ -194,11 +193,6 @@ void libblis_test_gemm_ukr_experiment
 	m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx );
 	n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx );
 
-	// Also query PACKMR and PACKNR as the leading dimensions to ap and bp,
-	// respectively.
-	ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx );
-	ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx );
-
 	// Store the register blocksizes so that the driver can retrieve the
 	// values later when printing results.
 	op->dim_aux[0] = m;
@@ -237,7 +231,13 @@ void libblis_test_gemm_ukr_experiment
 	libblis_test_mobj_randomize( params, TRUE, &c );
 	bli_copym( &c, &c_save );
 
-#if 0
+	rntm_t rntm;
+	bli_rntm_init( &rntm );
+	bli_pba_rntm_set_pba( &rntm );
+
+	// Transpose B to B^T for packing.
+	bli_obj_induce_trans( &b );
+
 	// Create pack objects for a and b, and pack them to ap and bp,
 	// respectively.
 	cntl_t* cntl_a = libblis_test_pobj_create
@@ -248,56 +248,26 @@ void libblis_test_gemm_ukr_experiment
 	  BLIS_PACKED_ROW_PANELS,
 	  BLIS_BUFFER_FOR_A_BLOCK,
 	  &a, &ap,
-	  cntx
+	  cntx,
+	  &rntm
 	);
 	cntl_t* cntl_b = libblis_test_pobj_create
 	(
-	  BLIS_KR,
 	  BLIS_NR,
+	  BLIS_KR,
 	  BLIS_NO_INVERT_DIAG,
 	  BLIS_PACKED_COL_PANELS,
 	  BLIS_BUFFER_FOR_B_PANEL,
 	  &b, &bp,
-	  cntx
+	  cntx,
+	  &rntm
 	);
-#endif
-
-	// Create the packed objects. Use packmr and packnr as the leading
-	// dimensions of ap and bp, respectively. Note that we use the ldims
-	// instead of the matrix dimensions for allocation purposes here.
-	// This is a little hacky and was prompted when trying to support
-	// configurations such as power9 that employ duplication/broadcasting
-	// of elements in one of the packed matrix objects. Thankfully, packm
-	// doesn't care about those dimensions and instead relies on
-	// information taken from the source object. Thus, this is merely
-	// about coaxing bli_obj_create() in allocating enough space for our
-	// purposes.
-	bli_obj_create( datatype, ldap, k, 1, ldap, &ap );
-	bli_obj_create( datatype, k, ldbp, ldbp, 1, &bp );
-
-	// Set up the objects for packing. Calling packm_init_pack() does everything
-	// except checkout a memory pool block and save its address to the obj_t's.
-	// However, it does overwrite the buffer field of packed object with that of
-	// the source object (as a side-effect of bli_obj_alias_to(); that buffer
-	// field would normally be overwritten yet again by the address from the
-	// memory pool block). So, we have to save the buffer address that was
-	// allocated so we can re-store it to the object afterward.
-	void* buf_ap = bli_obj_buffer( &ap );
-	void* buf_bp = bli_obj_buffer( &bp );
-	bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_ROW_PANELS,
-	                     BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
-	                     BLIS_MR, BLIS_KR, &a, &ap, cntx );
-	bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS,
-	                     BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
-	                     BLIS_KR, BLIS_NR, &b, &bp, cntx );
-	bli_obj_set_buffer( buf_ap, &ap );
-	bli_obj_set_buffer( buf_bp, &bp );
-
-	// Pack the data from the source objects.
-	bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
-	bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
-
-	// Repeat the experiment n_repeats times and record results. 
+
+	// Transpose B^T back to B and Bp^T back to Bp.
+	bli_obj_induce_trans( &b );
+	bli_obj_induce_trans( &bp );
+
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copym( &c_save, &c );
@@ -321,16 +291,10 @@ void libblis_test_gemm_ukr_experiment
 	// Zero out performance and residual if output matrix is empty.
 	libblis_test_check_empty_problem( &c, perf, resid );
 
-#if 0
 	// Free the control tree nodes and release their cached mem_t entries
-	// back to the memory broker.
-	bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED );
-	bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED );
-#endif
-
-	// Free the packed objects.
-	bli_obj_free( &ap );
-	bli_obj_free( &bp );
+	// back to the pba.
+	bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED );
+	bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
 
 	// Free the test objects.
 	bli_obj_free( &a );
diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c
index b3916db6a..48fcb78db 100644
--- a/testsuite/src/test_gemmtrsm_ukr.c
+++ b/testsuite/src/test_gemmtrsm_ukr.c
@@ -283,7 +283,10 @@ void libblis_test_gemmtrsm_ukr_experiment
 	bli_copym( &b11, &c11 );
 	bli_copym( &c11, &c11_save );
 
-#if 0
+	rntm_t rntm;
+	bli_rntm_init( &rntm );
+	bli_pba_rntm_set_pba( &rntm );
+
 	// Create pack objects for a and b, and pack them to ap and bp,
 	// respectively.
 	cntl_t* cntl_a = libblis_test_pobj_create
@@ -294,59 +297,9 @@ void libblis_test_gemmtrsm_ukr_experiment
 	  BLIS_PACKED_ROW_PANELS,
 	  BLIS_BUFFER_FOR_A_BLOCK,
 	  &a, &ap,
-	  &cntx
-	);
-	cntl_t* cntl_b = libblis_test_pobj_create
-	(
-	  BLIS_MR,
-	  BLIS_NR,
-	  BLIS_NO_INVERT_DIAG,
-	  BLIS_PACKED_COL_PANELS,
-	  BLIS_BUFFER_FOR_B_PANEL,
-	  &b, &bp,
-	  &cntx
+	  cntx,
+	  &rntm
 	);
-#endif
-
-	// Create the packed objects. Use packmr and packnr as the leading
-	// dimensions of ap and bp, respectively. Note that we use the ldims
-	// instead of the matrix dimensions for allocation purposes here.
-	// This is a little hacky and was prompted when trying to support
-	// configurations such as power9 that employ duplication/broadcasting
-	// of elements in one of the packed matrix objects. Thankfully, packm
-	// doesn't care about those dimensions and instead relies on
-	// information taken from the source object. Thus, this is merely
-	// about coaxing bli_obj_create() in allocating enough space for our
-	// purposes.
-	bli_obj_create( datatype, ldap, k+m, 1, ldap, &ap );
-	bli_obj_create( datatype, k+m, ldbp, ldbp, 1, &bp );
-
-	// We overwrite the m dimension of ap and n dimension of bp with
-	// m and n, respectively, so that these objects contain the correct
-	// logical dimensions. Recall that ldap and ldbp were used only to
-	// induce bli_obj_create() to allocate sufficient memory for the
-	// duplication in rare instances where the subconfig uses a gemm
-	// ukernel that duplicates elements in one of the operands.
-	bli_obj_set_length( m, &ap );
-	bli_obj_set_width( n, &bp );
-
-	// Set up the objects for packing. Calling packm_init_pack() does everything
-	// except checkout a memory pool block and save its address to the obj_t's.
-	// However, it does overwrite the buffer field of packed object with that of
-	// the source object (as a side-effect of bli_obj_alias_to(); that buffer
-	// field would normally be overwritten yet again by the address from the
-	// memory pool block). So, we have to save the buffer address that was
-	// allocated so we can re-store it to the object afterward.
-	void* buf_ap = bli_obj_buffer( &ap );
-	void* buf_bp = bli_obj_buffer( &bp );
-	bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS,
-	                     BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
-	                     BLIS_MR, BLIS_KR, &a, &ap, cntx );
-	bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS,
-	                     BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
-	                     BLIS_KR, BLIS_NR, &b, &bp, cntx );
-	bli_obj_set_buffer( buf_ap, &ap );
-	bli_obj_set_buffer( buf_bp, &bp );
 
 	// Set the diagonal offset of ap.
 	if ( bli_is_lower( uploa ) ) { bli_obj_set_diag_offset( k, &ap ); }
@@ -357,32 +310,45 @@ void libblis_test_gemmtrsm_ukr_experiment
 	// to know how to initialize the subpartitions.
 	bli_obj_set_uplo( uploa, &ap );
 
-	// Pack the data from the source objects.
-	bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
-	bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
-
-	// Create subpartitions from the a and b panels.
-	bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp,
-	                                &a1xp, &a11p, &bx1p, &b11p );
-
-	// Set the uplo field of a11p since the default for packed objects is
-	// BLIS_DENSE, and the _ukernel() wrapper needs this information to
-	// know which set of micro-kernels (lower or upper) to choose from.
-	bli_obj_set_uplo( uploa, &a11p );
-
 #if 0
 bli_printm( "a", &a, "%5.2f", "" );
 bli_printm( "ap", &ap, "%5.2f", "" );
 #endif
 
-	// Repeat the experiment n_repeats times and record results. 
+	cntl_t* cntl_b = NULL;
+
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copym( &c11_save, &c11 );
 
-		// Re-pack (restore) the contents of b to bp.
-		//bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
-		bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
+		// Transpose B to B^T for packing.
+		bli_obj_induce_trans( &b );
+
+		cntl_b = libblis_test_pobj_create
+		(
+		  BLIS_NR,
+		  BLIS_MR,
+		  BLIS_NO_INVERT_DIAG,
+		  BLIS_PACKED_COL_PANELS,
+		  BLIS_BUFFER_FOR_B_PANEL,
+		  &b, &bp,
+		  cntx,
+		  &rntm
+		);
+
+		// Transpose B^T back to B and Bp^T back to Bp.
+		bli_obj_induce_trans( &b );
+		bli_obj_induce_trans( &bp );
+
+		// Create subpartitions from the a and b panels.
+		bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp,
+		                                &a1xp, &a11p, &bx1p, &b11p );
+
+		// Set the uplo field of a11p since the default for packed objects is
+		// BLIS_DENSE, and the _ukernel() wrapper needs this information to
+		// know which set of micro-kernels (lower or upper) to choose from.
+		bli_obj_set_uplo( uploa, &a11p );
 
 		time = bli_clock();
 
@@ -391,6 +357,15 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 		                                cntx );
 
 		time_min = bli_clock_min_diff( time_min, time );
+
+		// On the last pass, we must keep the packed B buffer checked out in order
+		// to perform the correctness check later.
+		if ( i < n_repeats - 1 )
+		{
+			// Free the control tree nodes and release their cached mem_t entries
+			// back to the pba.
+			bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
+		}
 	}
 
 	// Estimate the performance of the best experiment repeat.
@@ -426,16 +401,11 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 	// Zero out performance and residual if output matrix is empty.
 	//libblis_test_check_empty_problem( &c11, perf, resid );
 
-#if 0
 	// Free the control tree nodes and release their cached mem_t entries
-	// back to the memory broker.
-	bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED );
-	bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED );
-#endif
-
-	// Free the packed objects.
-	bli_obj_free( &ap );
-	bli_obj_free( &bp );
+	// back to the pba.
+	bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED );
+	if ( cntl_b )
+	    bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
 
 	// Free the test objects.
 	bli_obj_free( &a_big );
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index bbfd0ac63..edab9796d 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -636,7 +636,7 @@ void libblis_test_read_op_info( test_ops_t*  ops,
 	int   i, p;
 
 	// Initialize the operation type field.
-	op->opid = opid; 
+	op->opid = opid;
 
 	// Read the line for the overall operation switch.
 	libblis_test_read_next_line( buffer, input_stream );
@@ -671,7 +671,7 @@ void libblis_test_read_op_info( test_ops_t*  ops,
 			//printf( "buffer[p]:       %s\n", &buffer[p] );
 
 			// Advance until we hit non-whitespace (ie: the next number).
-			for ( ; isspace( buffer[p] ); ++p ) ; 
+			for ( ; isspace( buffer[p] ); ++p ) ;
 
 			//printf( "buffer[p] after: %s\n", &buffer[p] );
 
@@ -680,7 +680,7 @@ void libblis_test_read_op_info( test_ops_t*  ops,
 			//printf( "dim[%d] = %d\n", i, op->dim_spec[i] );
 
 			// Advance until we hit whitespace (ie: the space before the next number).
-			for ( ; !isspace( buffer[p] ); ++p ) ; 
+			for ( ; !isspace( buffer[p] ); ++p ) ;
 		}
 	}
 
@@ -778,11 +778,11 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	// convert these values into strings, with "unset" being used if the
 	// value returned was -1 (indicating the environment variable was unset).
 	dim_t nt    = bli_thread_get_num_threads();
-	dim_t jc_nt = bli_thread_get_jc_nt(); 
-	dim_t pc_nt = bli_thread_get_pc_nt(); 
-	dim_t ic_nt = bli_thread_get_ic_nt(); 
-	dim_t jr_nt = bli_thread_get_jr_nt(); 
-	dim_t ir_nt = bli_thread_get_ir_nt(); 
+	dim_t jc_nt = bli_thread_get_jc_nt();
+	dim_t pc_nt = bli_thread_get_pc_nt();
+	dim_t ic_nt = bli_thread_get_ic_nt();
+	dim_t jr_nt = bli_thread_get_jr_nt();
+	dim_t ir_nt = bli_thread_get_ir_nt();
 
 	if (    nt == -1 ) sprintf(    nt_str, "unset" );
 	else               sprintf(    nt_str, "%d", ( int )   nt );
@@ -1739,7 +1739,7 @@ void libblis_test_op_driver
 				= ( char* ) malloc( ( n_operands + 1 ) * sizeof( char ) );
 
 				for ( o = 0; o < n_operands; ++o )
-				{ 
+				{
 					unsigned int ij;
 					operand_t    operand_type
 					= libblis_test_get_operand_type_for_char( o_types[o] );
@@ -2181,7 +2181,7 @@ void libblis_test_op_driver
 				ind_str = bli_ind_oper_get_avail_impl_string( op->opid, datatype );
 
 				// Loop over the requested parameter combinations.
-				for ( pci = 0; pci < n_param_combos; ++pci )	
+				for ( pci = 0; pci < n_param_combos; ++pci )
 				{
 					// Loop over the requested problem sizes.
 					for ( p_cur = p_first, pi = 1; p_cur <= p_max; p_cur += p_inc, ++pi )
@@ -2403,7 +2403,7 @@ void libblis_test_build_function_string
 	if ( strlen( funcname_str ) > MAX_FUNC_STRING_LENGTH )
 		libblis_test_printf_error( "Function name string length (%d) exceeds maximum (%d).\n",
 		                           strlen( funcname_str ), MAX_FUNC_STRING_LENGTH );
-		
+
 }
 
 
@@ -2545,7 +2545,7 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c
 	dim_t  n_trans   = n;
 	dim_t  rs        = 1; // Initialization avoids a compiler warning.
 	dim_t  cs        = 1; // Initialization avoids a compiler warning.
-	
+
 	// Apply the trans parameter to the dimensions (if needed).
 	bli_set_dims_with_trans( trans, m, n, &m_trans, &n_trans );
 
@@ -2591,12 +2591,9 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c
 }
 
 
-
-#if 0
-cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx )
+cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm )
 {
 	bool   does_inv_diag;
-	rntm_t rntm;
 
 	if ( inv_diag == BLIS_NO_INVERT_DIAG ) does_inv_diag = FALSE;
 	else                                   does_inv_diag = TRUE;
@@ -2606,7 +2603,6 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia
 	(
 	  NULL, // we don't need the small block allocator from the runtime.
 	  NULL, // func ptr is not referenced b/c we don't call via l3 _int().
-	  bli_packm_blk_var1,
 	  bmult_id_m,
 	  bmult_id_n,
 	  does_inv_diag,
@@ -2617,20 +2613,13 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia
 	  NULL  // no child node needed
 	);
 
-	// Initialize a local-to-BLIS rntm_t. This is simply so we have something
-	// to pass into bli_l3_packm(). The function doesn't (currently) use the
-	// runtime object, and even if it did, one with default values would work
-	// fine here.
-	bli_rntm_init( &rntm );
-
 	// Pack the contents of A to P.
-	bli_l3_packm( a, p, cntx, &rntm, cntl, &BLIS_PACKM_SINGLE_THREADED );
+	bli_packm_blk_var1( a, p, cntx, rntm, cntl, &BLIS_PACKM_SINGLE_THREADED );
 
 	// Return the control tree pointer so the caller can free the cntl_t and its
 	// mem_t entry later on.
 	return cntl;
 }
-#endif
 
 
 void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x )
@@ -2975,7 +2964,7 @@ void libblis_test_parse_message( FILE* output_stream, char* message, va_list arg
 	char*         the_string;
 	char          the_char;
 
-	// Begin looping over message to insert variables wherever there are 
+	// Begin looping over message to insert variables wherever there are
 	// format specifiers.
 	for ( c = 0; message[c] != '\0'; )
 	{
diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h
index 786f82b30..cdb3c6dac 100644
--- a/testsuite/src/test_libblis.h
+++ b/testsuite/src/test_libblis.h
@@ -418,7 +418,7 @@ void fill_string_with_n_spaces( char* str, unsigned int n_spaces );
 // --- Create object ---
 
 void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, char storage, dim_t m, dim_t n, obj_t* a );
-cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx );
+cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm );
 void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x );
 
 // --- Randomize/initialize object ---
diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c
index 6366e5fc3..b07da91cc 100644
--- a/testsuite/src/test_trsm_ukr.c
+++ b/testsuite/src/test_trsm_ukr.c
@@ -171,7 +171,6 @@ void libblis_test_trsm_ukr_experiment
 	num_t        datatype;
 
 	dim_t        m, n;
-	inc_t        ldap, ldbp;
 
 	char         sc_a = 'c';
 	char         sc_b = 'r';
@@ -196,11 +195,6 @@ void libblis_test_trsm_ukr_experiment
 	m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx );
 	n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx );
 
-	// Also query PACKMR and PACKNR as the leading dimensions to ap and bp,
-	// respectively.
-	ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx );
-	ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx );
-
 	// Store the register blocksizes so that the driver can retrieve the
 	// values later when printing results.
 	op->dim_aux[0] = m;
@@ -238,7 +232,10 @@ void libblis_test_trsm_ukr_experiment
 	libblis_test_mobj_randomize( params, TRUE, &c );
 	bli_copym( &c, &c_save );
 
-#if 0
+	rntm_t rntm;
+	bli_rntm_init( &rntm );
+	bli_pba_rntm_set_pba( &rntm );
+
 	// Create pack objects for a and b, and pack them to ap and bp,
 	// respectively.
 	cntl_t* cntl_a = libblis_test_pobj_create
@@ -249,50 +246,9 @@ void libblis_test_trsm_ukr_experiment
 	  BLIS_PACKED_ROW_PANELS,
 	  BLIS_BUFFER_FOR_A_BLOCK,
 	  &a, &ap,
-	  cntx
+	  cntx,
+	  &rntm
 	);
-	cntl_t* cntl_b = libblis_test_pobj_create
-	(
-	  BLIS_MR,
-	  BLIS_NR,
-	  BLIS_NO_INVERT_DIAG,
-	  BLIS_PACKED_COL_PANELS,
-	  BLIS_BUFFER_FOR_B_PANEL,
-	  &b, &bp,
-	  cntx
-	);
-#endif
-
-	// Create the packed objects. Use packmr and packnr as the leading
-	// dimensions of ap and bp, respectively. Note that we use the ldims
-	// instead of the matrix dimensions for allocation purposes here.
-	// This is a little hacky and was prompted when trying to support
-	// configurations such as power9 that employ duplication/broadcasting
-	// of elements in one of the packed matrix objects. Thankfully, packm
-	// doesn't care about those dimensions and instead relies on
-	// information taken from the source object. Thus, this is merely
-	// about coaxing bli_obj_create() in allocating enough space for our
-	// purposes.
-	bli_obj_create( datatype, ldap, m, 1, ldap, &ap );
-	bli_obj_create( datatype, m, ldbp, ldbp, 1, &bp );
-
-	// Set up the objects for packing. Calling packm_init_pack() does everything
-	// except checkout a memory pool block and save its address to the obj_t's.
-	// However, it does overwrite the buffer field of packed object with that of
-	// the source object (as a side-effect of bli_obj_alias_to(); that buffer
-	// field would normally be overwritten yet again by the address from the
-	// memory pool block). So, we have to save the buffer address that was
-	// allocated so we can re-store it to the object afterward.
-	void* buf_ap = bli_obj_buffer( &ap );
-	void* buf_bp = bli_obj_buffer( &bp );
-	bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS,
-	                     BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
-	                     BLIS_MR, BLIS_KR, &a, &ap, cntx );
-	bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS,
-	                     BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
-	                     BLIS_KR, BLIS_NR, &b, &bp, cntx );
-	bli_obj_set_buffer( buf_ap, &ap );
-	bli_obj_set_buffer( buf_bp, &bp );
 
 	// Set the diagonal offset of ap.
 	bli_obj_set_diag_offset( 0, &ap );
@@ -302,24 +258,35 @@ void libblis_test_trsm_ukr_experiment
 	// know which set of micro-kernels (lower or upper) to choose from.
 	bli_obj_set_uplo( uploa, &ap );
 
-	// Pack the data from the source objects.
-	bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
-	bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
-
 #if 0
 bli_printm( "a", &a, "%5.2f", "" );
 bli_printm( "ap", &ap, "%5.2f", "" );
 #endif
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
-		// Re-pack the contents of b to bp.
-		//bli_packm_blk_var1( &b, &bp, cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
-		bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
-
 		bli_copym( &c_save, &c );
 
+		// Transpose B to B^T for packing.
+		bli_obj_induce_trans( &b );
+
+		cntl_t* cntl_b = libblis_test_pobj_create
+		(
+		  BLIS_NR,
+		  BLIS_MR,
+		  BLIS_NO_INVERT_DIAG,
+		  BLIS_PACKED_COL_PANELS,
+		  BLIS_BUFFER_FOR_B_PANEL,
+		  &b, &bp,
+		  cntx,
+		  &rntm
+		);
+
+		// Transpose B^T back to B and Bp^T back to Bp.
+		bli_obj_induce_trans( &b );
+		bli_obj_induce_trans( &bp );
+
 		time = bli_clock();
 
 		libblis_test_trsm_ukr_impl( iface, side,
@@ -327,6 +294,10 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 		                            cntx );
 
 		time_min = bli_clock_min_diff( time_min, time );
+
+		// Free the control tree nodes and release their cached mem_t entries
+		// back to the memory broker.
+		bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
 	}
 
 	// Estimate the performance of the best experiment repeat.
@@ -339,16 +310,9 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 	// Zero out performance and residual if output matrix is empty.
 	//libblis_test_check_empty_problem( &c, perf, resid );
 
-#if 0
 	// Free the control tree nodes and release their cached mem_t entries
 	// back to the memory broker.
-	bli_cntl_free( NULL, cntl_a, &BLIS_PACKM_SINGLE_THREADED );
-	bli_cntl_free( NULL, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
-#endif
-
-	// Free the packed objects.
-	bli_obj_free( &ap );
-	bli_obj_free( &bp );
+	bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED );
 
 	// Free the test objects.
 	bli_obj_free( &a );

From 961d9d509dd94f3a66f7095057e3dc8eb6d89839 Mon Sep 17 00:00:00 2001
From: Kiran <kiran.varaganti@amd.com>
Date: Wed, 8 Dec 2021 03:00:38 +0530
Subject: [PATCH 013/230] Re-add BLIS_ENABLE_ZEN_BLOCK_SIZES macro for 'zen'.

Details:
- Added previously-deleted cpp macro block to bli_cntx_init_zen.c
  targeting the Naples microarchitecture that enabled different cache
  blocksizes when the number of threads exceeds 16. This commit
  represents PR #573.
---
 config/zen/bli_cntx_init_zen.c | 15 +++++++++++----
 config/zen/bli_family_zen.h    |  1 +
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c
index ed7287cee..615a31a04 100644
--- a/config/zen/bli_cntx_init_zen.c
+++ b/config/zen/bli_cntx_init_zen.c
@@ -173,15 +173,22 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	    mc = 510, kc = 1024 and nc = 4080
 */
 
+#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES
+	// Zen optmized level 3 cache block sizes
 	#if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,   510,   144,    72 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,  1024,   256,   256 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,  4080,  4080,  4080 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],  1020,   510,   510,   255 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],  1024,  1024,  1024,  1024 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  8160,  4080,  4080,  3056 );
 	#else
 	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,   240,   144,    72 );
 	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   512,   256,   256 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,  2040,  4080,  4080 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,  2040,  2040,  1528 );
 	#endif
+#else
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    72,   144,    72 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   256,   256,   256 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  8160,  4080,  4080,  3056 );
+#endif
 	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,     8,    -1,    -1 );
 	bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,    -1,    -1 );
 
diff --git a/config/zen/bli_family_zen.h b/config/zen/bli_family_zen.h
index d1c4ef828..da03bd7e4 100644
--- a/config/zen/bli_family_zen.h
+++ b/config/zen/bli_family_zen.h
@@ -39,6 +39,7 @@
 #define BLIS_THREAD_MAX_IR      1
 #define BLIS_THREAD_MAX_JR      1
 
+#define BLIS_ENABLE_ZEN_BLOCK_SIZES
 
 // Vanilla BLIS disables AMD's small matrix handling by default.
 #if 0

From 54fa28bd847b389215cffb57a83dc9b3dce79c86 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Fri, 24 Dec 2021 08:00:33 -0600
Subject: [PATCH 014/230] Move edge cases to gemm ukr; more user-custom mods.
 (#583)

Details:
- Moved edge-case handling into the gemm microkernel. This required
  changing the microkernel API to take m and n dimension parameters.
  This required updating all existing gemm microkernel function pointer
  types, function signatures, and related definitions to take m and n
  dimensions. We also updated all existing kernels in the 'kernels'
  directory to take m and n dimensions, and implemented edge-case
  handling within those microkernels via a collection of new C
  preprocessor macros defined within bli_edge_case_macro_defs.h. Also
  removed the assembly code that formerly would handle general stride
  IO on the microtile, since this can now be handled by the same code
  that does edge cases.
- Pass the obj_t.ker_fn (of matrix C) into bli_gemm_cntl_create() and
  bli_trsm_cntl_create(), where this function pointer is used in lieu of
  the default macrokernel when it is non-NULL, and ignored when it is
  NULL.
- Re-implemented macrokernel in bli_gemm_ker_var2.c to be a single
  function using byte pointers rather that one function for each
  floating-point datatype. Also, obtain the microkernel function pointer
  from the .ukr field of the params struct embedded within the obj_t
  for matrix C (assuming params is non-NULL and contains a non-NULL
  value in the .ukr field). Communicate both the gemm microkernel
  pointer to use as well as the params struct to the microkernel via
  the auxinfo_t struct.
- Defined gemm_ker_params_t type (for the aforementioned obj_t.params
  struct) in bli_gemm_var.h.
- Retired the separate _md macrokernel for mixed datatype computation.
  We now use the reimplemented bli_gemm_ker_var2() instead.
- Updated gemmt macrokernels to pass m and n dimensions into microkernel
  calls.
- Removed edge-case handling from trmm and trsm macrokernels.
- Moved most of bli_packm_alloc() code into a new helper function,
  bli_packm_alloc_ex().
- Fixed a typo bug in bli_gemmtrsm_u_template_noopt_mxn.c.
- Added test/syrk_diagonal and test/tensor_contraction directories with
  associated code to test those operations.
---
 .../kernels/3/bli_gemm_template_noopt_mxn.c   |   13 +-
 .../3/bli_gemmtrsm_l_template_noopt_mxn.c     |    4 +
 .../3/bli_gemmtrsm_u_template_noopt_mxn.c     |    8 +-
 frame/1m/packm/bli_packm_alloc.c              |   58 +-
 frame/1m/packm/bli_packm_alloc.h              |   23 +-
 frame/3/bli_l3_cntl.c                         |   18 +-
 frame/3/bli_l3_ft_ukr.h                       |    2 +
 frame/3/bli_l3_ukr_oapi.c                     |    4 +
 frame/3/bli_l3_ukr_prot.h                     |    2 +
 frame/3/bli_l3_ukr_tapi.c                     |   63 +-
 frame/3/gemm/bli_gemm_cntl.c                  |   15 +-
 frame/3/gemm/bli_gemm_cntl.h                  |    6 +-
 frame/3/gemm/bli_gemm_front.c                 |   87 -
 frame/3/gemm/bli_gemm_ker_var2.c              |  570 ++-
 frame/3/gemm/bli_gemm_ker_var2_md.c           |  406 --
 frame/3/gemm/bli_gemm_md.h                    |   61 +-
 frame/3/gemm/bli_gemm_md_c2r_ref.c            |   49 +-
 frame/3/gemm/bli_gemm_var.h                   |   39 +-
 frame/3/gemm/ind/bli_gemm_ind_opt.h           |    2 +
 frame/3/gemmt/bli_gemmt_l_ker_var2.c          |  107 +-
 frame/3/gemmt/bli_gemmt_u_ker_var2.c          |  107 +-
 frame/3/trmm/bli_trmm_ll_ker_var2.c           |  125 +-
 frame/3/trmm/bli_trmm_lu_ker_var2.c           |  121 +-
 frame/3/trmm/bli_trmm_rl_ker_var2.c           |  127 +-
 frame/3/trmm/bli_trmm_ru_ker_var2.c           |  129 +-
 frame/3/trsm/bli_trsm_cntl.c                  |   21 +-
 frame/3/trsm/bli_trsm_cntl.h                  |    9 +-
 frame/3/trsm/bli_trsm_ll_ker_var2.c           |   52 +-
 frame/3/trsm/bli_trsm_lu_ker_var2.c           |   52 +-
 frame/3/trsm/bli_trsm_rl_ker_var2.c           |   52 +-
 frame/3/trsm/bli_trsm_ru_ker_var2.c           |   52 +-
 frame/base/bli_auxinfo.h                      |   20 +-
 frame/include/bli_edge_case_macro_defs.h      |  109 +
 frame/include/bli_macro_defs.h                |    1 +
 frame/include/bli_type_defs.h                 |    7 +
 .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c  |   15 +-
 .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c  |   15 +-
 .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c  |   15 +-
 .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c  |   15 +-
 .../3/bli_gemm_armsve_asm_z2vx7_unindexed.c   |   15 +-
 .../3/bli_gemm_armsve_asm_z2vx8_unindexed.c   |   15 +-
 kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c   |   48 +-
 kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c   |  115 +-
 kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c   | 3948 +++++++++--------
 kernels/bgq/3/bli_gemm_bgq_int_8x8.c          |   12 +
 .../3/bli_gemm_bulldozer_asm_d4x6_fma4.c      | 2073 +++------
 kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c | 1940 +++-----
 kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c | 1898 +++-----
 kernels/knc/3/bli_dgemm_knc_asm_30x8.c        |  127 +-
 kernels/knc/3/bli_sgemm_knc_asm_30x16.c       |  129 +-
 kernels/knl/3/bli_dgemm_knl_asm_24x8.c        |   85 +-
 kernels/knl/3/bli_sgemm_knl_asm_24x16.c       |   85 +-
 kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c   | 1300 ++----
 .../3/bli_gemm_piledriver_asm_d8x3.c          | 2029 +++------
 kernels/power10/3/bli_dgemm_power10_mma.c     |   43 +-
 kernels/power10/3/bli_i16gemm_power10_mma.c   |   10 +-
 kernels/power10/3/bli_i16sgemm_power10_mma.c  |   10 +-
 kernels/power10/3/bli_i4gemm_power10_mma.c    |   40 +-
 kernels/power10/3/bli_i8gemm_power10_mma.c    |   38 +-
 kernels/power10/3/bli_sbgemm_power10_mma.c    |   18 +-
 kernels/power10/3/bli_sgemm_power10_mma.c     |   24 +-
 kernels/power10/3/bli_shgemm_power10_mma.c    |   18 +-
 kernels/power7/3/bli_gemm_power7_int_8x4.c    |  368 +-
 .../power7/3/test/bli_gemm_power7_int_8x4.h   |    8 +
 kernels/power9/3/bli_gemm_power9_asm_d12x6.c  |  238 +-
 .../3/bli_gemm_sandybridge_asm_d8x4.c         | 3030 ++++---------
 .../3/bli_gemm_sandybridge_int_d8x4.c         |  361 +-
 kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c    |  105 +-
 kernels/skx/3/bli_dgemm_skx_asm_16x14.c       |  183 +-
 kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c    |  111 +-
 ref_kernels/3/bb/bli_gemmbb_ref.c             |    5 +-
 ref_kernels/3/bb/bli_gemmtrsmbb_ref.c         |    2 +
 ref_kernels/3/bli_gemm_ref.c                  |   23 +-
 ref_kernels/3/bli_gemmtrsm_ref.c              |    4 +
 ref_kernels/ind/bli_gemm1m_ref.c              |   26 +-
 ref_kernels/ind/bli_gemmtrsm1m_ref.c          |    2 +
 test/syrk_diagonal/complex_math.hpp           |  267 ++
 test/syrk_diagonal/syrk_diagonal_example.c    |  186 +
 test/syrk_diagonal/syrk_diagonal_example.cxx  |  220 +
 test/syrk_diagonal/syrk_diagonal_example2.c   |  354 ++
 test/syrk_diagonal/syrk_diagonal_example2.cxx |  338 ++
 test/syrk_diagonal/syrk_diagonal_ref.cxx      |  102 +
 test/syrk_diagonal/syrk_diagonal_ref.h        |    8 +
 test/tensor_contraction/complex_math.hpp      |  267 ++
 test/tensor_contraction/tcontract_example.cxx |  988 +++++
 test/tensor_contraction/tcontract_ref.cxx     |   67 +
 test/tensor_contraction/tcontract_ref.hpp     |  100 +
 87 files changed, 10458 insertions(+), 13506 deletions(-)
 delete mode 100644 frame/3/gemm/bli_gemm_ker_var2_md.c
 create mode 100644 frame/include/bli_edge_case_macro_defs.h
 create mode 100644 test/syrk_diagonal/complex_math.hpp
 create mode 100644 test/syrk_diagonal/syrk_diagonal_example.c
 create mode 100644 test/syrk_diagonal/syrk_diagonal_example.cxx
 create mode 100644 test/syrk_diagonal/syrk_diagonal_example2.c
 create mode 100644 test/syrk_diagonal/syrk_diagonal_example2.cxx
 create mode 100644 test/syrk_diagonal/syrk_diagonal_ref.cxx
 create mode 100644 test/syrk_diagonal/syrk_diagonal_ref.h
 create mode 100644 test/tensor_contraction/complex_math.hpp
 create mode 100644 test/tensor_contraction/tcontract_example.cxx
 create mode 100644 test/tensor_contraction/tcontract_ref.cxx
 create mode 100644 test/tensor_contraction/tcontract_ref.hpp

diff --git a/config/template/kernels/3/bli_gemm_template_noopt_mxn.c b/config/template/kernels/3/bli_gemm_template_noopt_mxn.c
index b7a13f3b6..06f25a0e9 100644
--- a/config/template/kernels/3/bli_gemm_template_noopt_mxn.c
+++ b/config/template/kernels/3/bli_gemm_template_noopt_mxn.c
@@ -37,6 +37,8 @@
 
 void bli_zgemm_template_noopt
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        dcomplex*  restrict alpha,
        dcomplex*  restrict a1,
@@ -88,8 +90,7 @@ void bli_zgemm_template_noopt
 
 	dim_t              l, j, i;
 
-	dcomplex           ab[ bli_zmr *
-	                       bli_znr ];
+	dcomplex           ab[ mr * nr ];
 	dcomplex*          abij;
 	dcomplex           ai, bj;
 
@@ -137,16 +138,16 @@ void bli_zgemm_template_noopt
 	if ( bli_zeq0( *beta ) )
 	{
 		/* c11 := ab */
-		bli_zcopys_mxn( mr,
-		                nr,
+		bli_zcopys_mxn( m,
+		                n,
 		                ab,  rs_ab, cs_ab,
 		                c11, rs_c,  cs_c );
 	}
 	else
 	{
 		/* c11 := beta * c11 + ab */
-		bli_zxpbys_mxn( mr,
-		                nr,
+		bli_zxpbys_mxn( m,
+		                n,
 		                ab,  rs_ab, cs_ab,
 		                beta,
 		                c11, rs_c,  cs_c );
diff --git a/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c b/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c
index da0cd3110..87c21f7ed 100644
--- a/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c
+++ b/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c
@@ -74,6 +74,8 @@ void bli_zgemmtrsm_l_template_noopt
 */
 	const num_t        dt        = BLIS_DCOMPLEX;
 
+	const inc_t        mr        = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
+	const inc_t        nr        = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
 	const inc_t        packnr    = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx );
 
 	const inc_t        rs_b      = packnr;
@@ -84,6 +86,8 @@ void bli_zgemmtrsm_l_template_noopt
 	/* b11 = alpha * b11 - a10 * b01; */
 	bli_zgemm_template_noopt
 	(
+	  mr,
+	  nr,
 	  k,
 	  minus_one,
 	  a10,
diff --git a/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c b/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c
index 09b3af9ce..0b4544ae1 100644
--- a/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c
+++ b/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c
@@ -74,6 +74,8 @@ void bli_zgemmtrsm_u_template_noopt
 */
 	const num_t        dt        = BLIS_DCOMPLEX;
 
+	const inc_t        mr        = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
+	const inc_t        nr        = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
 	const inc_t        packnr    = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx );
 
 	const inc_t        rs_b      = packnr;
@@ -84,10 +86,12 @@ void bli_zgemmtrsm_u_template_noopt
 	/* b11 = alpha * b11 - a12 * b21; */
 	bli_zgemm_template_noopt
 	(
+	  mr,
+	  nr,
 	  k,
 	  minus_one,
-	  a12,
-	  b21,
+	  a10,
+	  b01,
 	  alpha,
 	  b11, rs_b, cs_b,
 	  data
diff --git a/frame/1m/packm/bli_packm_alloc.c b/frame/1m/packm/bli_packm_alloc.c
index df6750d7a..b12a93ddc 100644
--- a/frame/1m/packm/bli_packm_alloc.c
+++ b/frame/1m/packm/bli_packm_alloc.c
@@ -36,16 +36,35 @@
 #include "blis.h"
 
 void* bli_packm_alloc
-      (
-        siz_t      size_needed,
-        rntm_t*    rntm,
-        cntl_t*    cntl,
-        thrinfo_t* thread
-      )
+     (
+       siz_t      size_needed,
+       rntm_t*    rntm,
+       cntl_t*    cntl,
+       thrinfo_t* thread
+     )
 {
 	// Query the pack buffer type from the control tree node.
 	packbuf_t pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl );
 
+	return bli_packm_alloc_ex
+	(
+	  size_needed,
+	  pack_buf_type,
+	  rntm,
+	  cntl,
+	  thread
+	);
+}
+
+void* bli_packm_alloc_ex
+     (
+       siz_t      size_needed,
+       packbuf_t  pack_buf_type,
+       rntm_t*    rntm,
+       cntl_t*    cntl,
+       thrinfo_t* thread
+     )
+{
 	// Query the address of the mem_t entry within the control tree node.
 	mem_t* cntl_mem_p = bli_cntl_pack_mem( cntl );
 
@@ -55,7 +74,7 @@ void* bli_packm_alloc
 	siz_t cntl_mem_size = 0;
 
 	if ( bli_mem_is_alloc( cntl_mem_p ) )
-        cntl_mem_size = bli_mem_size( cntl_mem_p );
+		cntl_mem_size = bli_mem_size( cntl_mem_p );
 
 	if ( cntl_mem_size < size_needed )
 	{
@@ -64,14 +83,15 @@ void* bli_packm_alloc
 			// The chief thread releases the existing block associated with
 			// the mem_t entry in the control tree, and then re-acquires a
 			// new block, saving the associated mem_t entry to local_mem_s.
-	        if ( bli_mem_is_alloc( cntl_mem_p ) )
-            {
-    			bli_pba_release
-    			(
-    			  rntm,
-    			  cntl_mem_p
-    			);
-            }
+			if ( bli_mem_is_alloc( cntl_mem_p ) )
+			{
+				bli_pba_release
+				(
+				  rntm,
+				  cntl_mem_p
+				);
+			}
+
 			bli_pba_acquire_m
 			(
 			  rntm,
@@ -89,11 +109,11 @@ void* bli_packm_alloc
 		// this thread's control tree node.
 		*cntl_mem_p = *local_mem_p;
 
-        // Barrier so that the master thread doesn't return from the function
-        // before we are done reading.
-	    bli_thread_barrier( thread );
+		// Barrier so that the master thread doesn't return from the function
+		// before we are done reading.
+		bli_thread_barrier( thread );
 	}
 
-    return bli_mem_buffer( cntl_mem_p );
+	return bli_mem_buffer( cntl_mem_p );
 }
 
diff --git a/frame/1m/packm/bli_packm_alloc.h b/frame/1m/packm/bli_packm_alloc.h
index b433be350..5a5cf126b 100644
--- a/frame/1m/packm/bli_packm_alloc.h
+++ b/frame/1m/packm/bli_packm_alloc.h
@@ -32,11 +32,20 @@
 
 */
 
-BLIS_EXPORT_BLIS  void* bli_packm_alloc
-      (
-        siz_t      size_needed,
-        rntm_t*    rntm,
-        cntl_t*    cntl,
-        thrinfo_t* thread
-      );
+BLIS_EXPORT_BLIS void* bli_packm_alloc
+     (
+       siz_t      size_needed,
+       rntm_t*    rntm,
+       cntl_t*    cntl,
+       thrinfo_t* thread
+     );
+
+BLIS_EXPORT_BLIS void* bli_packm_alloc_ex
+     (
+       siz_t      size_needed,
+       packbuf_t  pack_buf_type,
+       rntm_t*    rntm,
+       cntl_t*    cntl,
+       thrinfo_t* thread
+     );
 
diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c
index 3cdecfbc2..83ff8e5af 100644
--- a/frame/3/bli_l3_cntl.c
+++ b/frame/3/bli_l3_cntl.c
@@ -57,7 +57,14 @@ void bli_l3_cntl_create_if
 		     family == BLIS_GEMMT ||
 		     family == BLIS_TRMM )
 		{
-			*cntl_use = bli_gemm_cntl_create( rntm, family, schema_a, schema_b );
+			*cntl_use = bli_gemm_cntl_create
+			(
+			  rntm,
+			  family,
+			  schema_a,
+			  schema_b,
+			  bli_obj_ker_fn( c )
+			);
 		}
 		else // if ( family == BLIS_TRSM )
 		{
@@ -66,7 +73,14 @@ void bli_l3_cntl_create_if
 			if ( bli_obj_is_triangular( a ) ) side = BLIS_LEFT;
 			else                              side = BLIS_RIGHT;
 
-			*cntl_use = bli_trsm_cntl_create( rntm, side, schema_a, schema_b );
+			*cntl_use = bli_trsm_cntl_create
+			(
+			  rntm,
+			  side,
+			  schema_a,
+			  schema_b,
+			  bli_obj_ker_fn( c )
+			);
 		}
 	}
 	else
diff --git a/frame/3/bli_l3_ft_ukr.h b/frame/3/bli_l3_ft_ukr.h
index 4249dcbd6..561c8264f 100644
--- a/frame/3/bli_l3_ft_ukr.h
+++ b/frame/3/bli_l3_ft_ukr.h
@@ -47,6 +47,8 @@
 \
 typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
      ( \
+       dim_t               m, \
+       dim_t               n, \
        dim_t               k, \
        ctype*     restrict alpha, \
        ctype*     restrict a, \
diff --git a/frame/3/bli_l3_ukr_oapi.c b/frame/3/bli_l3_ukr_oapi.c
index 33262b0bb..b8f2e00e6 100644
--- a/frame/3/bli_l3_ukr_oapi.c
+++ b/frame/3/bli_l3_ukr_oapi.c
@@ -51,6 +51,8 @@ void PASTEMAC0(opname) \
 \
 	num_t     dt        = bli_obj_dt( c ); \
 \
+	dim_t     m         = bli_obj_length( c ); \
+	dim_t     n         = bli_obj_width( c ); \
 	dim_t     k         = bli_obj_width( a ); \
 	void*     buf_a     = bli_obj_buffer_at_off( a ); \
 	void*     buf_b     = bli_obj_buffer_at_off( b ); \
@@ -75,6 +77,8 @@ void PASTEMAC0(opname) \
 \
 	f \
 	( \
+	  m, \
+	  n, \
 	  k, \
 	  buf_alpha, \
 	  buf_a, \
diff --git a/frame/3/bli_l3_ukr_prot.h b/frame/3/bli_l3_ukr_prot.h
index ca523b1d7..f68973ff5 100644
--- a/frame/3/bli_l3_ukr_prot.h
+++ b/frame/3/bli_l3_ukr_prot.h
@@ -42,6 +42,8 @@
 \
 void PASTEMAC(ch,opname) \
      ( \
+       dim_t               m, \
+       dim_t               n, \
        dim_t               k, \
        ctype_out* restrict alpha, \
        ctype_in*  restrict a, \
diff --git a/frame/3/bli_l3_ukr_tapi.c b/frame/3/bli_l3_ukr_tapi.c
index 67e33175b..ab745d12b 100644
--- a/frame/3/bli_l3_ukr_tapi.c
+++ b/frame/3/bli_l3_ukr_tapi.c
@@ -39,6 +39,8 @@
 \
 void PASTEMAC(ch,opname) \
      ( \
+       dim_t               m, \
+       dim_t               n, \
        dim_t               k, \
        ctype*     restrict alpha, \
        ctype*     restrict a, \
@@ -58,16 +60,19 @@ void PASTEMAC(ch,opname) \
 	PASTECH2(ch,tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the typed function for the given datatype. */ \
-	f( \
-	   k, \
-	   alpha, \
-	   a, \
-	   b, \
-	   beta, \
-	   c, rs_c, cs_c, \
-	   data, \
-	   cntx  \
-	 ); \
+	f \
+	( \
+	  m, \
+	  n, \
+	  k, \
+	  alpha, \
+	  a, \
+	  b, \
+	  beta, \
+	  c, rs_c, cs_c, \
+	  data, \
+	  cntx  \
+	); \
 } \
 
 INSERT_GENTFUNC_BASIC2( gemm_ukernel, gemm, BLIS_GEMM_UKR )
@@ -98,17 +103,18 @@ void PASTEMAC(ch,opname) \
 	PASTECH2(ch,tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the typed function for the given datatype. */ \
-	f( \
-	   k, \
-	   alpha, \
-	   a1x, \
-	   a11, \
-	   bx1, \
-	   b11, \
-	   c11, rs_c, cs_c, \
-	   data, \
-	   cntx  \
-	 ); \
+	f \
+	( \
+	  k, \
+	  alpha, \
+	  a1x, \
+	  a11, \
+	  bx1, \
+	  b11, \
+	  c11, rs_c, cs_c, \
+	  data, \
+	  cntx  \
+	); \
 } \
 
 INSERT_GENTFUNC_BASIC2( gemmtrsm_l_ukernel, gemmtrsm, BLIS_GEMMTRSM_L_UKR )
@@ -136,13 +142,14 @@ void PASTEMAC(ch,opname) \
 	PASTECH2(ch,tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the typed function for the given datatype. */ \
-	f( \
-	   a, \
-	   b, \
-	   c, rs_c, cs_c, \
-	   data, \
-	   cntx  \
-	 ); \
+	f \
+	( \
+	  a, \
+	  b, \
+	  c, rs_c, cs_c, \
+	  data, \
+	  cntx  \
+	); \
 } \
 
 INSERT_GENTFUNC_BASIC2( trsm_l_ukernel, trsm, BLIS_TRSM_L_UKR )
diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c
index 72d78efe1..052c812a3 100644
--- a/frame/3/gemm/bli_gemm_cntl.c
+++ b/frame/3/gemm/bli_gemm_cntl.c
@@ -40,10 +40,11 @@ cntl_t* bli_gemm_cntl_create
        rntm_t* rntm,
        opid_t  family,
        pack_t  schema_a,
-       pack_t  schema_b
+       pack_t  schema_b,
+       void_fp ker
      )
 {
-	return bli_gemmbp_cntl_create( rntm, family, schema_a, schema_b );
+	return bli_gemmbp_cntl_create( rntm, family, schema_a, schema_b, ker );
 }
 
 // -----------------------------------------------------------------------------
@@ -53,18 +54,22 @@ cntl_t* bli_gemmbp_cntl_create
        rntm_t* rntm,
        opid_t  family,
        pack_t  schema_a,
-       pack_t  schema_b
+       pack_t  schema_b,
+       void_fp ker
      )
 {
 	void_fp macro_kernel_fp;
 
-	// Use the function pointers to the macrokernels that use slab
-	// assignment of micropanels to threads in the jr and ir loops.
+	// Choose the default macrokernel based on the operation family...
 	if      ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2;
 	else if ( family == BLIS_GEMMT ) macro_kernel_fp = bli_gemmt_x_ker_var2;
 	else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2;
 	else /* should never execute */ macro_kernel_fp = NULL;
 
+	// ...unless a non-NULL kernel function pointer is passed in, in which
+	// case we use that instead.
+	if ( ker ) macro_kernel_fp = ker;
+
 	// Create two nodes for the macro-kernel.
 	cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node
 	(
diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h
index bff91b58a..5fa213ac4 100644
--- a/frame/3/gemm/bli_gemm_cntl.h
+++ b/frame/3/gemm/bli_gemm_cntl.h
@@ -38,7 +38,8 @@ cntl_t* bli_gemm_cntl_create
        rntm_t* rntm,
        opid_t  family,
        pack_t  schema_a,
-       pack_t  schema_b
+       pack_t  schema_b,
+       void_fp ker
      );
 
 // -----------------------------------------------------------------------------
@@ -48,7 +49,8 @@ cntl_t* bli_gemmbp_cntl_create
        rntm_t* rntm,
        opid_t  family,
        pack_t  schema_a,
-       pack_t  schema_b
+       pack_t  schema_b,
+       void_fp ker
      );
 
 #if 0
diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c
index a9ea21dc4..4ff45036f 100644
--- a/frame/3/gemm/bli_gemm_front.c
+++ b/frame/3/gemm/bli_gemm_front.c
@@ -283,90 +283,3 @@ void bli_gemm_front
 #endif
 }
 
-// -----------------------------------------------------------------------------
-
-#if 0
-	if ( bli_obj_dt( a ) != bli_obj_dt( b ) ||
-	     bli_obj_dt( a ) != bli_obj_dt( c ) ||
-	     bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
-	{
-		const bool a_is_real = bli_obj_is_real( a );
-		const bool a_is_comp = bli_obj_is_complex( a );
-		const bool b_is_real = bli_obj_is_real( b );
-		const bool b_is_comp = bli_obj_is_complex( b );
-		const bool c_is_real = bli_obj_is_real( c );
-		const bool c_is_comp = bli_obj_is_complex( c );
-
-		const bool a_is_single = bli_obj_is_single_prec( a );
-		const bool a_is_double = bli_obj_is_double_prec( a );
-		const bool b_is_single = bli_obj_is_single_prec( b );
-		const bool b_is_double = bli_obj_is_double_prec( b );
-		const bool c_is_single = bli_obj_is_single_prec( c );
-		const bool c_is_double = bli_obj_is_double_prec( c );
-
-		const bool comp_single = bli_obj_comp_prec( c ) == BLIS_SINGLE_PREC;
-		const bool comp_double = bli_obj_comp_prec( c ) == BLIS_DOUBLE_PREC;
-
-		const bool mixeddomain = bli_obj_domain( c ) != bli_obj_domain( a ) ||
-		                         bli_obj_domain( c ) != bli_obj_domain( b );
-
-		( void )a_is_real; ( void )a_is_comp;
-		( void )b_is_real; ( void )b_is_comp;
-		( void )c_is_real; ( void )c_is_comp;
-		( void )a_is_single; ( void )a_is_double;
-		( void )b_is_single; ( void )b_is_double;
-		( void )c_is_single; ( void )c_is_double;
-		( void )comp_single; ( void )comp_double;
-
-		if (
-		     //( c_is_comp && a_is_comp && b_is_real ) ||
-		     //( c_is_comp && a_is_real && b_is_comp ) ||
-		     //( c_is_real && a_is_comp && b_is_comp ) ||
-		     //( c_is_comp && a_is_real && b_is_real ) ||
-		     //( c_is_real && a_is_comp && b_is_real ) ||
-		     //( c_is_real && a_is_real && b_is_comp ) ||
-		     //FALSE
-		     TRUE
-		   )
-		{
-			if (
-			     ( c_is_single && a_is_single && b_is_single && mixeddomain ) ||
-			     ( c_is_single && a_is_single && b_is_single && comp_single ) ||
-			     ( c_is_single && a_is_single && b_is_single && comp_double ) ||
-			     ( c_is_single && a_is_single && b_is_double                ) ||
-			     ( c_is_single && a_is_double && b_is_single                ) ||
-			     ( c_is_double && a_is_single && b_is_single                ) ||
-			     ( c_is_single && a_is_double && b_is_double                ) ||
-			     ( c_is_double && a_is_single && b_is_double                ) ||
-			     ( c_is_double && a_is_double && b_is_single                ) ||
-			     ( c_is_double && a_is_double && b_is_double && comp_single ) ||
-			     ( c_is_double && a_is_double && b_is_double && comp_double ) ||
-			     ( c_is_double && a_is_double && b_is_double && mixeddomain ) ||
-			     FALSE
-			   )
-				bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl );
-			else
-				bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl );
-		}
-		else
-			bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl );
-		return;
-	}
-#else
-#if 0
-	// If any of the storage datatypes differ, or if the execution precision
-	// differs from the storage precision of C, utilize the mixed datatype
-	// code path.
-	// NOTE: We could check the exec dt against the storage dt of C, but for
-	// now we don't support the caller setting the execution domain
-	// explicitly.
-	if ( bli_obj_dt( a ) != bli_obj_dt( b ) ||
-	     bli_obj_dt( a ) != bli_obj_dt( c ) ||
-	     bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
-	{
-		bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl );
-		return;
-	}
-#endif
-#endif
-
diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c
index 0c9060552..6de361194 100644
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -35,28 +35,44 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T gemm_fp
+typedef void (*xpbys_mxn_vft)
+    (
+      dim_t m,
+      dim_t n,
+      void* x, inc_t rs_x, inc_t cs_x,
+      void* b,
+      void* y, inc_t rs_y, inc_t cs_y
+    );
 
-typedef void (*FUNCPTR_T)
-     (
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, inc_t is_a,
-                  dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, inc_t is_b,
-                  dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
+#undef GENTFUNC2
+#define GENTFUNC2(ctypex,ctypey,chx,chy,op) \
+\
+void PASTEMAC2(chx,chy,op) \
+    ( \
+      dim_t m, \
+      dim_t n, \
+      void* x, inc_t rs_x, inc_t cs_x, \
+      void* b, \
+      void* y, inc_t rs_y, inc_t cs_y \
+    ) \
+{ \
+	ctypex* restrict x_cast = x; \
+	ctypey* restrict b_cast = b; \
+	ctypey* restrict y_cast = y; \
+\
+	PASTEMAC3(chx,chy,chy,xpbys_mxn) \
+	( \
+	  m, n, \
+	  x_cast, rs_x, cs_x, \
+	  b_cast, \
+	  y_cast, rs_y,  cs_y \
+	); \
+}
 
-static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
+INSERT_GENTFUNC2_BASIC0(xbpys_mxn_fn);
+INSERT_GENTFUNC2_MIXDP0(xbpys_mxn_fn);
+
+static xpbys_mxn_vft GENARRAY2_ALL(xbpys_mxn, xbpys_mxn_fn);
 
 
 void bli_gemm_ker_var2
@@ -70,23 +86,8 @@ void bli_gemm_ker_var2
        thrinfo_t* thread
      )
 {
-#ifdef BLIS_ENABLE_GEMM_MD
-	// By now, A and B have been packed and cast to the execution precision.
-	// In most cases, such as when storage precision of C differs from the
-	// execution precision, we utilize the mixed datatype code path. However,
-	// a few cases still fall within this kernel, such as mixed domain with
-	// equal precision (ccr, crc, rcc), hence those expressions being disabled
-	// in the conditional below.
-	if ( //( bli_obj_domain( c ) != bli_obj_domain( a ) ) ||
-	     //( bli_obj_domain( c ) != bli_obj_domain( b ) ) ||
-	     ( bli_obj_dt( c ) != bli_obj_exec_dt( c ) ) )
-	{
-		bli_gemm_ker_var2_md( a, b, c, cntx, rntm, cntl, thread );
-		return;
-	}
-#endif
-
 	num_t     dt_exec   = bli_obj_exec_dt( c );
+	num_t     dt_c      = bli_obj_dt( c );
 
 	pack_t    schema_a  = bli_obj_pack_schema( a );
 	pack_t    schema_b  = bli_obj_pack_schema( b );
@@ -95,50 +96,55 @@ void bli_gemm_ker_var2
 	dim_t     n         = bli_obj_width( c );
 	dim_t     k         = bli_obj_width( a );
 
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
+	char*     a_cast    = bli_obj_buffer_at_off( a );
 	inc_t     is_a      = bli_obj_imag_stride( a );
 	dim_t     pd_a      = bli_obj_panel_dim( a );
 	inc_t     ps_a      = bli_obj_panel_stride( a );
 
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
+	char*     b_cast    = bli_obj_buffer_at_off( b );
 	inc_t     is_b      = bli_obj_imag_stride( b );
 	dim_t     pd_b      = bli_obj_panel_dim( b );
 	inc_t     ps_b      = bli_obj_panel_stride( b );
 
-	void*     buf_c     = bli_obj_buffer_at_off( c );
+	char*     c_cast    = bli_obj_buffer_at_off( c );
 	inc_t     rs_c      = bli_obj_row_stride( c );
 	inc_t     cs_c      = bli_obj_col_stride( c );
 
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
 
 	// Detach and multiply the scalars attached to A and B.
+	// NOTE: We know that the internal scalars of A and B are already of the
+	// target datatypes because the necessary typecasting would have already
+	// taken place during bli_packm_init().
+	obj_t     scalar_a;
+	obj_t     scalar_b;
 	bli_obj_scalar_detach( a, &scalar_a );
 	bli_obj_scalar_detach( b, &scalar_b );
 	bli_mulsc( &scalar_a, &scalar_b );
 
 	// Grab the addresses of the internal scalar buffers for the scalar
 	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
+	// NOTE: We know that scalar_b is of type dt_exec due to the above code
+	// that casts the scalars of A and B to dt_exec via scalar_a and scalar_b,
+	// and we know that the internal scalar in C is already of the type dt_c
+	// due to the casting in the implementation of bli_obj_scalar_attach().
+	char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b );
+	char* beta_cast  = bli_obj_internal_scalar_buffer( c );
 
 	// If 1m is being employed on a column- or row-stored matrix with a
 	// real-valued beta, we can use the real domain macro-kernel, which
 	// eliminates a little overhead associated with the 1m virtual
 	// micro-kernel.
+	// Only employ this optimization if the storage datatype of C is
+	// equal to the execution/computation datatype.
 #if 1
 	if ( bli_cntx_method( cntx ) == BLIS_1M )
 	{
 		bli_gemm_ind_recast_1m_params
 		(
 		  &dt_exec,
+		  &dt_c,
 		  schema_a,
 		  c,
 		  &m, &n, &k,
@@ -151,273 +157,211 @@ void bli_gemm_ker_var2
 
 #ifdef BLIS_ENABLE_GEMM_MD
 	// Tweak parameters in select mixed domain cases (rcc, crc, ccr).
-	bli_gemm_md_ker_var2_recast
-	(
-	  &dt_exec,
-	  bli_obj_dt( a ),
-	  bli_obj_dt( b ),
-	  bli_obj_dt( c ),
-	  &m, &n, &k,
-	  &pd_a, &ps_a,
-	  &pd_b, &ps_b,
-	  c,
-	  &rs_c, &cs_c
-	);
+	if ( bli_cntx_method( cntx ) == BLIS_NAT )
+	{
+		bli_gemm_md_ker_var2_recast
+		(
+		  &dt_exec,
+		  bli_obj_dt( a ),
+		  bli_obj_dt( b ),
+		  &dt_c,
+		  &m, &n, &k,
+		  &pd_a, &ps_a,
+		  &pd_b, &ps_b,
+		  c,
+		  &rs_c, &cs_c
+		);
+	}
 #endif
 
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, is_a,
-	          pd_a, ps_a,
-	   buf_b, rs_b, is_b,
-	          pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
-}
+	siz_t        dt_size   = bli_dt_size( dt_exec );
+	siz_t        dt_c_size = bli_dt_size( dt_c );
 
+	// Alias some constants to simpler names.
+	const dim_t  MR        = pd_a;
+	const dim_t  NR        = pd_b;
+	//const dim_t PACKMR     = cs_a;
+	//const dim_t PACKNR     = rs_b;
+
+	// Query the context for the micro-kernel address and cast it to its
+	// function pointer type.
+	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
+
+	// Query the params field from the obj_t. If it is non-NULL, grab the ukr
+	// field of the params struct. If that function pointer is non-NULL, use it
+	// as our microkernel instead of the default microkernel queried from the
+	// cntx above.
+	gemm_ker_params_t* params = bli_obj_ker_params( c );
+	gemm_ukr_vft user_ukr = params ? params->ukr : NULL;
+	if ( user_ukr ) gemm_ukr = user_ukr;
+
+	// Temporary microtile buffer, used below only when dt_exec != dt_c. Note
+	// that its strides follow the storage preference of the gemm microkernel
+	// (col_pref), not the storage of C: column-stored (1, MR) if the
+	// microkernel prefers columns, and row-stored (NR, 1) otherwise.
+	char        ct[ BLIS_STACK_BUF_MAX_SIZE ]
+	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
+	const bool  col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_UKR, cntx );
+	const inc_t rs_ct       = ( col_pref ? 1 : NR );
+	const inc_t cs_ct       = ( col_pref ? MR : 1 );
+	char*       zero        = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO );
+
+	//
+	// Assumptions/assertions:
+	//   rs_a == 1
+	//   cs_a == PACKMR
+	//   pd_a == MR
+	//   ps_a == stride to next micro-panel of A
+	//   rs_b == PACKNR
+	//   cs_b == 1
+	//   pd_b == NR
+	//   ps_b == stride to next micro-panel of B
+	//   rs_c == (no assumptions)
+	//   cs_c == (no assumptions)
+	//
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	dim_t n_iter = n / NR;
+	dim_t n_left = n % NR;
+
+	dim_t m_iter = m / MR;
+	dim_t m_left = m % MR;
+
+	if ( n_left ) ++n_iter;
+	if ( m_left ) ++m_iter;
+
+	// Determine some increments used to step through A, B, and C.
+	inc_t rstep_a = ps_a * dt_size;
+
+	inc_t cstep_b = ps_b * dt_size;
+
+	inc_t rstep_c = rs_c * MR * dt_c_size;
+	inc_t cstep_c = cs_c * NR * dt_c_size;
+
+	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	// Save the imaginary stride of A and B to the auxinfo_t object.
+	bli_auxinfo_set_is_a( is_a, &aux );
+	bli_auxinfo_set_is_b( is_b, &aux );
+
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
+
+	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	// loop around the microkernel. Here we query the thrinfo_t node for the
+	// 1st (ir) loop around the microkernel.
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+
+	// Query the number of threads and thread ids for each loop.
+	dim_t jr_nt  = bli_thread_n_way( thread );
+	dim_t jr_tid = bli_thread_work_id( thread );
+	dim_t ir_nt  = bli_thread_n_way( caucus );
+	dim_t ir_tid = bli_thread_work_id( caucus );
+
+	dim_t jr_start, jr_end;
+	dim_t ir_start, ir_end;
+	dim_t jr_inc,   ir_inc;
+
+	// Determine the thread range and increment for the 2nd and 1st loops.
+	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// slab or round-robin partitioning was requested at configure-time.
+	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
+	{
+		char* b1 = b_cast + j * cstep_b;
+		char* c1 = c_cast + j * cstep_c;
+
+		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		char* b2 = b1;
+
+		// Loop over the m dimension (MR rows at a time).
+		for ( dim_t i = ir_start; i < ir_end; i += ir_inc )
+		{
+			char* a1  = a_cast + i * rstep_a;
+			char* c11 = c1     + i * rstep_c;
+
+			dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+			// Compute the addresses of the next panels of A and B.
+			char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc );
+			if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) )
+			{
+				a2 = a_cast;
+				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc );
+				if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) )
+					b2 = b_cast;
+			}
+
+			// Save addresses of next panels of A and B to the auxinfo_t
+			// object.
+			bli_auxinfo_set_next_a( a2, &aux );
+			bli_auxinfo_set_next_b( b2, &aux );
+
+			// Edge case handling now occurs within the microkernel itself, but
+			// we must still explicitly accumulate to a temporary microtile in
+			// situations where a virtual microkernel is being used, such as
+			// during the 1m method or some cases of mixed datatypes.
+			if ( dt_exec == dt_c )
+			{
+				// Types match: the microkernel updates the m_cur x n_cur tile of C directly.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  alpha_cast,
+				  a1,
+				  b1,
+				  beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  cntx
+				);
+			}
+			else
+			{
+				// Types differ: compute a full MR x NR microtile into ct, using beta = 0.
+				gemm_ukr
+				(
+				  MR,
+				  NR,
+				  k,
+				  alpha_cast,
+				  a1,
+				  b1,
+				  zero,
+				  &ct, rs_ct, cs_ct,
+				  &aux,
+				  cntx
+				);
+
+				// Scale C by beta and add the m_cur x n_cur part of ct, with type-casting.
+				xbpys_mxn[ dt_exec ][ dt_c ]
+				(
+				    m_cur, n_cur,
+				    &ct, rs_ct, cs_ct,
+				    beta_cast,
+				    c11, rs_c, cs_c
+				);
+			}
+		}
+	}
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	/*const dim_t     PACKMR     = cs_a;*/ \
-	/*const dim_t     PACKNR     = rs_b;*/ \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. Note that the virtual gemm ukernel is queried
-	   instead of the native gemm ukernel. This is needed for certain
-	   situations for the 1m method that require an extra layer of logic
-	   to allow for handling (for example) complex values of beta. Also
-	   note that under certain circumstances, the real-domain version of
-	   this macrokernel will be called for 1m (NOT the complex version)
-	   as an optimization. In these cases, the corresponding real-domain
-	   slots within the cntx_t's virtual gemm ukernel func_t will contain
-	   pointers to the *native* gemm ukernel, thanks to logic in the
-	   context initialization function for the induced method (defined
-	   in bli_cntx_ref.c). */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-\
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           i, j; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	auxinfo_t       aux; \
-\
-	/*
-	   Assumptions/assertions:
-	     rs_a == 1
-	     cs_a == PACKMR
-	     pd_a == MR
-	     ps_a == stride to next micro-panel of A
-	     rs_b == PACKNR
-	     cs_b == 1
-	     pd_b == NR
-	     ps_b == stride to next micro-panel of B
-	     rs_c == (no assumptions)
-	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( is_a, &aux ); \
-	bli_auxinfo_set_is_b( is_b, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Determine the thread range and increment for the 2nd and 1st loops.
-	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
-	   slab or round-robin partitioning was requested at configure-time. */ \
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* Handle interior and edge cases separately. */ \
-			if ( m_cur == MR && n_cur == NR ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  beta_cast, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-			} \
-			else \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  zero, \
-				  ct, rs_ct, cs_ct, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				/* Scale the bottom edge of C and add the result from above. */ \
-				PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-				                        ct,  rs_ct, cs_ct, \
-				                        beta_cast, \
-				                        c11, rs_c,  cs_c ); \
-			} \
-		} \
-	} \
-\
 /*
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
-*/ \
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" );
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" );
+*/
 }
 
-INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )
-
diff --git a/frame/3/gemm/bli_gemm_ker_var2_md.c b/frame/3/gemm/bli_gemm_ker_var2_md.c
deleted file mode 100644
index 09c279d14..000000000
--- a/frame/3/gemm/bli_gemm_ker_var2_md.c
+++ /dev/null
@@ -1,406 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#ifdef BLIS_ENABLE_GEMM_MD
-
-#define FUNCPTR_T gemm_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, inc_t is_a,
-                  dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, inc_t is_b,
-                  dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY2_ALL(ftypes,gemm_ker_var2_md);
-
-
-void bli_gemm_ker_var2_md
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-	num_t     dt_c      = bli_obj_dt( c );
-
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
-
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
-
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	inc_t     is_a      = bli_obj_imag_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
-
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	inc_t     is_b      = bli_obj_imag_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
-
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
-
-	// Detach and multiply the scalars attached to A and B.
-	// NOTE: We know that the internal scalars of A and B are already of the
-	// target datatypes because the necessary typecasting would have already
-	// taken place during bli_packm_init().
-	bli_obj_scalar_detach( a, &scalar_a );
-	bli_obj_scalar_detach( b, &scalar_b );
-	bli_mulsc( &scalar_a, &scalar_b );
-
-	// Grab the addresses of the internal scalar buffers for the scalar
-	// merged above and the scalar attached to C.
-	// NOTE: We know that scalar_b is of type dt_exec due to the above code
-	// that casts the scalars of A and B to dt_exec via scalar_a and scalar_b,
-	// and we know that the internal scalar in C is already of the type dt_c
-	// due to the casting in the implementation of bli_obj_scalar_attach().
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
-
-#if 0
-	// NOTE: Turns out that this optimization will never be employed since
-	// currently bli_gemm_ker_var2_md() is only called when the storage
-	// datatype of C differs from the execution/computation datatype, and
-	// this optimization would only make sense if they are equal.
-
-	// If 1m is being employed on a column- or row-stored matrix with a
-	// real-valued beta, we can use the real domain macro-kernel, which
-	// eliminates a little overhead associated with the 1m virtual
-	// micro-kernel.
-	if ( bli_cntx_method( cntx ) == BLIS_1M )
-	{
-		// Only employ this optimization if the storage datatype of C is
-		// equal to the execution/computation datatype.
-		if ( dt_c == dt_exec )
-		{
-			bli_gemm_ind_recast_1m_params
-			(
-			  &dt_exec,
-			  schema_a,
-			  c,
-			  &m, &n, &k,
-			  &pd_a, &ps_a,
-			  &pd_b, &ps_b,
-			  &rs_c, &cs_c
-			);
-		}
-	}
-#endif
-
-	// Tweak parameters in select mixed domain cases (rcc, crc, ccr).
-	bli_gemm_md_ker_var2_recast
-	(
-	  &dt_exec,
-	  bli_obj_dt( a ),
-	  bli_obj_dt( b ),
-	  bli_obj_dt( c ),
-	  &m, &n, &k,
-	  &pd_a, &ps_a,
-	  &pd_b, &ps_b,
-	  c,
-	  &rs_c, &cs_c
-	);
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_c][dt_exec];
-
-	// Invoke the function.
-	f( schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, is_a,
-	          pd_a, ps_a,
-	   buf_b, rs_b, is_b,
-	          pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
-}
-
-
-#undef  GENTFUNC2
-#define GENTFUNC2( ctype_c, ctype_e, chc, che, varname ) \
-\
-void PASTEMAC2(chc,che,varname) \
-     ( \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dte        = PASTEMAC(che,type); \
-	/*const num_t     dtc        = PASTEMAC(chc,type);*/ \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	/*const dim_t     PACKMR     = cs_a;*/ \
-	/*const dim_t     PACKNR     = rs_b;*/ \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(che,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dte, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype_e         ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype_e ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dte, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-\
-	ctype_e* restrict zero       = PASTEMAC(che,0); \
-	ctype_e* restrict a_cast     = a; \
-	ctype_e* restrict b_cast     = b; \
-	ctype_c* restrict c_cast     = c; \
-	ctype_e* restrict alpha_cast = alpha; \
-	ctype_c* restrict beta_cast  = beta; \
-	ctype_e* restrict b1; \
-	ctype_c* restrict c1; \
-\
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           i, j; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	auxinfo_t       aux; \
-\
-	/*
-	   Assumptions/assertions:
-	     rs_a == 1
-	     cs_a == PACKMR
-	     pd_a == MR
-	     ps_a == stride to next micro-panel of A
-	     rs_b == PACKNR
-	     cs_b == 1
-	     pd_b == NR
-	     ps_b == stride to next micro-panel of B
-	     rs_c == (no assumptions)
-	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(che,set0s_mxn)( MR, NR, \
-	                         ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( is_a, &aux ); \
-	bli_auxinfo_set_is_b( is_b, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Determine the thread range and increment for the 2nd and 1st loops.
-	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
-	   slab or round-robin partitioning was requested at configure-time. */ \
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype_e* restrict a1; \
-		ctype_c* restrict c11; \
-		ctype_e* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype_e* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* Always save the micropanel product to the local microtile and
-			   then accumulate it into C via the xpbys_mxn macro. */ \
-			/*if ( 1 )*/ \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  zero, \
-				  ct, rs_ct, cs_ct, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				/* Scale the microtile of C and add the result from above. */ \
-				PASTEMAC3(che,chc,chc,xpbys_mxn) \
-				( \
-				  m_cur, n_cur, \
-				  ct,  rs_ct, cs_ct, \
-				  beta_cast, \
-				  c11, rs_c,  cs_c \
-				); \
-			} \
-		} \
-	} \
-\
-/*
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
-*/ \
-}
-
-INSERT_GENTFUNC2_BASIC0( gemm_ker_var2_md )
-INSERT_GENTFUNC2_MIXDP0( gemm_ker_var2_md )
-
-#endif
diff --git a/frame/3/gemm/bli_gemm_md.h b/frame/3/gemm/bli_gemm_md.h
index 8fcf6bd21..751e271ea 100644
--- a/frame/3/gemm/bli_gemm_md.h
+++ b/frame/3/gemm/bli_gemm_md.h
@@ -154,7 +154,7 @@ BLIS_INLINE void bli_gemm_md_ker_var2_recast
        num_t* dt_comp,
        num_t  dt_a,
        num_t  dt_b,
-       num_t  dt_c,
+       num_t* dt_c,
        dim_t* m,
        dim_t* n,
        dim_t* k,
@@ -164,7 +164,7 @@ BLIS_INLINE void bli_gemm_md_ker_var2_recast
        inc_t* rs_c, inc_t* cs_c
      )
 {
-	if      ( bli_is_real( dt_c )    &&
+	if      ( bli_is_real( *dt_c )    &&
 	          bli_is_complex( dt_a ) &&
 	          bli_is_complex( dt_b ) )
 	{
@@ -177,7 +177,7 @@ BLIS_INLINE void bli_gemm_md_ker_var2_recast
 		*ps_a *= 2;
 		*ps_b *= 2;
 	}
-	else if ( bli_is_complex( dt_c ) &&
+	else if ( bli_is_complex( *dt_c ) &&
 	          bli_is_real( dt_a )    &&
 	          bli_is_complex( dt_b ) )
 	{
@@ -197,6 +197,7 @@ BLIS_INLINE void bli_gemm_md_ker_var2_recast
 			// to the real virtual microkernel slots of the context) instead of
 			// the complex macrokernel and c2r virtual microkernel.
 			*dt_comp = bli_dt_proj_to_real( *dt_comp );
+			*dt_c = bli_dt_proj_to_real( *dt_c );
 			*n *= 2;
 			*pd_b *= 2; *ps_b *= 2;
 			*rs_c *= 2;
@@ -211,7 +212,7 @@ BLIS_INLINE void bli_gemm_md_ker_var2_recast
 			*ps_a /= 2;
 		}
 	}
-	else if ( bli_is_complex( dt_c ) &&
+	else if ( bli_is_complex( *dt_c ) &&
 	          bli_is_complex( dt_a ) &&
 	          bli_is_real( dt_b ) )
 	{
@@ -231,6 +232,7 @@ BLIS_INLINE void bli_gemm_md_ker_var2_recast
 			// to the real virtual microkernel slots of the context) instead of
 			// the complex macrokernel and c2r virtual microkernel.
 			*dt_comp = bli_dt_proj_to_real( *dt_comp );
+			*dt_c = bli_dt_proj_to_real( *dt_c );
 			*m *= 2;
 			*pd_a *= 2; *ps_a *= 2;
 			*cs_c *= 2;
@@ -274,54 +276,3 @@ BLIS_INLINE void bli_gemm_md_ker_var2_recast
 #endif
 }
 
-// -----------------------------------------------------------------------------
-
-//
-// Prototype object-based interfaces.
-//
-
-#undef  GENPROT
-#define GENPROT( opname ) \
-\
-void PASTEMAC0(opname) \
-     ( \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       cntl_t* cntl, \
-       thrinfo_t* thread  \
-     );
-
-GENPROT( gemm_ker_var2_md )
-
-//
-// Prototype BLAS-like interfaces with void pointer operands.
-//
-
-#undef  GENTPROT2
-#define GENTPROT2( ctype_c, ctype_e, chc, che, varname ) \
-\
-void PASTEMAC2(chc,che,varname) \
-     ( \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     );
-
-INSERT_GENTPROT2_BASIC0( gemm_ker_var2_md )
-INSERT_GENTPROT2_MIXDP0( gemm_ker_var2_md )
-
diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.c b/frame/3/gemm/bli_gemm_md_c2r_ref.c
index 0bfb59630..bbd9190a9 100644
--- a/frame/3/gemm/bli_gemm_md_c2r_ref.c
+++ b/frame/3/gemm/bli_gemm_md_c2r_ref.c
@@ -41,6 +41,8 @@
 \
 void PASTEMAC2(ch,opname,suf) \
      ( \
+       dim_t               m, \
+       dim_t               n, \
        dim_t               k, \
        ctype*     restrict alpha, \
        ctype*     restrict a, \
@@ -61,6 +63,9 @@ void PASTEMAC2(ch,opname,suf) \
 \
 	const dim_t       mr        = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
 	const dim_t       nr        = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
+\
+	dim_t             mr_r      = mr; \
+	dim_t             nr_r      = nr; \
 \
 	ctype             ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                      / sizeof( ctype_r ) ] \
@@ -81,6 +86,9 @@ void PASTEMAC2(ch,opname,suf) \
 \
 	ctype_r* restrict beta_r    = &PASTEMAC(ch,real)( *beta ); \
 	ctype_r* restrict beta_i    = &PASTEMAC(ch,imag)( *beta ); \
+\
+	dim_t             m_use; \
+	dim_t             n_use; \
 \
 	ctype_r*          c_use; \
 	inc_t             rs_c_use; \
@@ -146,17 +154,16 @@ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \
 		rs_c_use = rs_ct; \
 		cs_c_use = cs_ct; \
 \
-		/* Convert the strides from being in units of complex elements to
-		   be in units of real elements. Note that we don't need to check for
-		   general storage here because that case corresponds to the scenario
-		   where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
-		if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
-		else                                           rs_c_use *= 2; \
-\
+		/* Convert the strides and corresponding microtile dimension from being
+		   in units of complex elements to be in units of real elements. */ \
+		if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) { cs_c_use *= 2; mr_r *= 2; } \
+		else                                           { rs_c_use *= 2; nr_r *= 2; }\
 \
 		/* c = beta * c + alpha_r * a * b; */ \
 		rgemm_ukr \
 		( \
+		  mr_r, \
+		  nr_r, \
 		  k, \
 		  alpha_r, \
 		  a_r, \
@@ -166,14 +173,12 @@ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \
 		  data, \
 		  cntx  \
 		); \
-\
-		dim_t i, j; \
 \
 		/* Accumulate the final result in ct back to c. */ \
 		if ( PASTEMAC(ch,eq1)( *beta ) ) \
 		{ \
-			for ( j = 0; j < nr; ++j ) \
-			for ( i = 0; i < mr; ++i ) \
+			for ( dim_t j = 0; j < n; ++j ) \
+			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
 				PASTEMAC(ch,adds)( *(ct + i*rs_ct + j*cs_ct), \
 				                   *(c  + i*rs_c  + j*cs_c ) ); \
@@ -181,8 +186,8 @@ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \
 		} \
 		else if ( PASTEMAC(ch,eq0)( *beta ) ) \
 		{ \
-			for ( j = 0; j < nr; ++j ) \
-			for ( i = 0; i < mr; ++i ) \
+			for ( dim_t j = 0; j < n; ++j ) \
+			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
 				PASTEMAC(ch,copys)( *(ct + i*rs_ct + j*cs_ct), \
 				                    *(c  + i*rs_c  + j*cs_c ) ); \
@@ -190,8 +195,8 @@ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \
 		} \
 		else \
 		{ \
-			for ( j = 0; j < nr; ++j ) \
-			for ( i = 0; i < mr; ++i ) \
+			for ( dim_t j = 0; j < n; ++j ) \
+			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
 				PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \
 				                    *beta, \
@@ -207,17 +212,19 @@ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \
 		c_use    = ( ctype_r* )c; \
 		rs_c_use = rs_c; \
 		cs_c_use = cs_c; \
+		m_use    = m; \
+		n_use    = n; \
 \
-		/* Convert the strides from being in units of complex elements to
-		   be in units of real elements. Note that we don't need to check for
-		   general storage here because that case corresponds to the scenario
-		   where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
-		if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
-		else                                           rs_c_use *= 2; \
+		/* Convert the strides and corresponding microtile dimension from being
+		   in units of complex elements to be in units of real elements. */ \
+		if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) { cs_c_use *= 2; m_use *= 2; } \
+		else                                           { rs_c_use *= 2; n_use *= 2; } \
 \
 		/* c = beta * c + alpha_r * a * b; */ \
 		rgemm_ukr \
 		( \
+		  m_use, \
+		  n_use, \
 		  k, \
 		  alpha_r, \
 		  a_r, \
diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h
index e7befc5b4..888181bad 100644
--- a/frame/3/gemm/bli_gemm_var.h
+++ b/frame/3/gemm/bli_gemm_var.h
@@ -34,6 +34,16 @@
 */
 
 
+//
+// gemm kernel parameter struct. It has a single member, ukr, of type gemm_ukr_vft.
+//
+
+typedef struct
+{
+	gemm_ukr_vft ukr; // NOTE(review): presumably the gemm microkernel function pointer to invoke -- confirm at use sites.
+} gemm_ker_params_t;
+
+
 //
 // Prototype object-based interfaces.
 //
@@ -59,32 +69,3 @@ GENPROT( gemm_blk_var3 )
 GENPROT( gemm_ker_var1 )
 GENPROT( gemm_ker_var2 )
 
-
-//
-// Prototype BLAS-like interfaces with void pointer operands.
-//
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     );
-
-INSERT_GENTPROT_BASIC0( gemm_ker_var2 )
-
diff --git a/frame/3/gemm/ind/bli_gemm_ind_opt.h b/frame/3/gemm/ind/bli_gemm_ind_opt.h
index 7528c4f03..52ea81a5e 100644
--- a/frame/3/gemm/ind/bli_gemm_ind_opt.h
+++ b/frame/3/gemm/ind/bli_gemm_ind_opt.h
@@ -35,6 +35,7 @@
 BLIS_INLINE void bli_gemm_ind_recast_1m_params
      (
        num_t* dt_exec,
+       num_t* dt_c,
        pack_t schema_a,
        obj_t* c,
        dim_t* m,
@@ -57,6 +58,7 @@ BLIS_INLINE void bli_gemm_ind_recast_1m_params
 	     !bli_is_gen_stored( *rs_c, *cs_c ) )
 	{
 		*dt_exec = bli_dt_proj_to_real( *dt_exec );
+		*dt_c    = bli_dt_proj_to_real( *dt_c );
 
 		if ( bli_is_1e_packed( schema_a ) )
 		{
diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
index a995e6c52..fea4efec0 100644
--- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
@@ -279,6 +279,9 @@ void PASTEMAC(ch,varname) \
 	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
 	bli_auxinfo_set_is_a( is_a, &aux ); \
 	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	/* Disabled for now: would save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
 \
 	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	   loop around the microkernel. Here we query the thrinfo_t node for the
@@ -381,43 +384,20 @@ void PASTEMAC(ch,varname) \
 			   And if we're strictly above the diagonal, we do nothing and
 			   continue. */ \
 			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 			} \
 		} \
 	} \
@@ -490,6 +470,8 @@ void PASTEMAC(ch,varname) \
 				/* Invoke the gemm micro-kernel. */ \
 				gemm_ukr \
 				( \
+				  MR, \
+				  NR, \
 				  k, \
 				  alpha_cast, \
 				  a1, \
@@ -509,43 +491,20 @@ void PASTEMAC(ch,varname) \
 			} \
 			else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
 			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 			} \
 		} \
 	} \
diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
index 3115fc67b..4b849bbc6 100644
--- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
@@ -281,6 +281,9 @@ void PASTEMAC(ch,varname) \
 	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
 	bli_auxinfo_set_is_a( is_a, &aux ); \
 	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	/* Disabled for now: would save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
 \
 	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	   loop around the microkernel. Here we query the thrinfo_t node for the
@@ -385,6 +388,8 @@ void PASTEMAC(ch,varname) \
 				/* Invoke the gemm micro-kernel. */ \
 				gemm_ukr \
 				( \
+				  MR, \
+				  NR, \
 				  k, \
 				  alpha_cast, \
 				  a1, \
@@ -404,43 +409,20 @@ void PASTEMAC(ch,varname) \
 			} \
 			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
 			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 			} \
 		} \
 	} \
@@ -512,43 +494,20 @@ void PASTEMAC(ch,varname) \
 			   And if we're strictly below the diagonal, we do nothing and
 			   continue. */ \
 			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 			} \
 		} \
 	} \
diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c
index 792281b53..646287f93 100644
--- a/frame/3/trmm/bli_trmm_ll_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c
@@ -167,20 +167,8 @@ void PASTEMAC(ch,varname) \
 	   function pointer type. */ \
 	PASTECH(ch,gemm_ukr_ft) \
 	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
 	ctype* restrict one        = PASTEMAC(ch,1); \
-	ctype* restrict zero       = PASTEMAC(ch,0); \
 	ctype* restrict a_cast     = a; \
 	ctype* restrict b_cast     = b; \
 	ctype* restrict c_cast     = c; \
@@ -254,10 +242,6 @@ void PASTEMAC(ch,varname) \
 		diagoffa = 0; \
 		c_cast   = c_cast + (i  )*rs_c; \
 	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
 \
 	/* Compute number of primary and leftover components of the m and n
 	   dimensions. */ \
@@ -307,8 +291,8 @@ void PASTEMAC(ch,varname) \
 	dim_t jr_inc; \
 \
 	/* Determine the thread range and increment for the 2nd loop.
-       NOTE: The definition of bli_thread_range_jrir() will depend on whether
-       slab or round-robin partitioning was requested at configure-time. \
+	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	   slab or round-robin partitioning was requested at configure-time. \
 	   NOTE: Parallelism in the 1st loop is disabled for now. */ \
 	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
 	/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
@@ -379,47 +363,20 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_next_a( a2, &aux ); \
 				bli_auxinfo_set_next_b( b2, &aux ); \
 \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k_a1011, \
-					  alpha_cast, \
-					  a1, \
-					  b1_i, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Copy edge elements of C to the temporary buffer. */ \
-					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
-					                        c11, rs_c,  cs_c, \
-					                        ct,  rs_ct, cs_ct ); \
-\
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k_a1011, \
-					  alpha_cast, \
-					  a1, \
-					  b1_i, \
-					  beta_cast, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Copy the result to the edge of C. */ \
-					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        c11, rs_c,  cs_c ); \
-				} \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k_a1011, \
+				  alpha_cast, \
+				  a1, \
+				  b1_i, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 				/*}*/ \
 \
 				a1 += ps_a_cur; \
@@ -446,42 +403,20 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_next_a( a2, &aux ); \
 				bli_auxinfo_set_next_b( b2, &aux ); \
 \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  one, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Add the result to the edge of C. */ \
-					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
-					                       ct,  rs_ct, cs_ct, \
-					                       c11, rs_c,  cs_c ); \
-				} \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  one, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 				/*}*/ \
 \
 				a1 += rstep_a; \
diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c
index 69498540b..9ef2a475d 100644
--- a/frame/3/trmm/bli_trmm_lu_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c
@@ -167,20 +167,8 @@ void PASTEMAC(ch,varname) \
 	   function pointer type. */ \
 	PASTECH(ch,gemm_ukr_ft) \
 	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
 	ctype* restrict one        = PASTEMAC(ch,1); \
-	ctype* restrict zero       = PASTEMAC(ch,0); \
 	ctype* restrict a_cast     = a; \
 	ctype* restrict b_cast     = b; \
 	ctype* restrict c_cast     = c; \
@@ -261,10 +249,6 @@ void PASTEMAC(ch,varname) \
 	{ \
 		m = -diagoffa + k; \
 	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
 \
 	/* Compute number of primary and leftover components of the m and n
 	   dimensions. */ \
@@ -386,47 +370,20 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_next_a( a2, &aux ); \
 				bli_auxinfo_set_next_b( b2, &aux ); \
 \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k_a1112, \
-					  alpha_cast, \
-					  a1, \
-					  b1_i, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Copy edge elements of C to the temporary buffer. */ \
-					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
-					                        c11, rs_c,  cs_c, \
-					                        ct,  rs_ct, cs_ct ); \
-\
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k_a1112, \
-					  alpha_cast, \
-					  a1, \
-					  b1_i, \
-					  beta_cast, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Copy the result to the edge of C. */ \
-					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        c11, rs_c,  cs_c ); \
-				} \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k_a1112, \
+				  alpha_cast, \
+				  a1, \
+				  b1_i, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 				/*}*/ \
 \
 				a1 += ps_a_cur; \
@@ -453,42 +410,20 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_next_a( a2, &aux ); \
 				bli_auxinfo_set_next_b( b2, &aux ); \
 \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  one, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Add the result to the edge of C. */ \
-					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
-					                       ct,  rs_ct, cs_ct, \
-					                       c11, rs_c,  cs_c ); \
-				} \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  one, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 				/*}*/ \
 \
 				a1 += rstep_a; \
diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c
index 03e3f1e53..f6b20af2e 100644
--- a/frame/3/trmm/bli_trmm_rl_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c
@@ -167,20 +167,8 @@ void PASTEMAC(ch,varname) \
 	   function pointer type. */ \
 	PASTECH(ch,gemm_ukr_ft) \
 	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
 	ctype* restrict one        = PASTEMAC(ch,1); \
-	ctype* restrict zero       = PASTEMAC(ch,0); \
 	ctype* restrict a_cast     = a; \
 	ctype* restrict b_cast     = b; \
 	ctype* restrict c_cast     = c; \
@@ -261,10 +249,6 @@ void PASTEMAC(ch,varname) \
 	{ \
 		n = diagoffb + k; \
 	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
 \
 	/* Compute number of primary and leftover components of the m and n
 	   dimensions. */ \
@@ -335,9 +319,9 @@ void PASTEMAC(ch,varname) \
 \
 	/* Determine the thread range and increment for the 2nd and 1st loops for
 	   the initial rectangular region of B (if it exists).
-       NOTE: The definition of bli_thread_range_jrir() will depend on whether
-       slab or round-robin partitioning was requested at configure-time. \
-       NOTE: Parallelism in the 1st loop is disabled for now. */ \
+	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	   slab or round-robin partitioning was requested at configure-time. \
+	   NOTE: Parallelism in the 1st loop is disabled for now. */ \
 	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
 	bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
 \
@@ -382,42 +366,20 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_next_a( a2, &aux ); \
 				bli_auxinfo_set_next_b( b2, &aux ); \
 \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  one, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Add the result to the edge of C. */ \
-					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
-					                       ct,  rs_ct, cs_ct, \
-					                       c11, rs_c,  cs_c ); \
-				} \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  one, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 			} \
 		} \
 	} \
@@ -501,47 +463,20 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_next_a( a2, &aux ); \
 				bli_auxinfo_set_next_b( b2, &aux ); \
 \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k_b1121, \
-					  alpha_cast, \
-					  a1_i, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Copy edge elements of C to the temporary buffer. */ \
-					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
-					                        c11, rs_c,  cs_c, \
-					                        ct,  rs_ct, cs_ct ); \
-\
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k_b1121, \
-					  alpha_cast, \
-					  a1_i, \
-					  b1, \
-					  beta_cast, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Copy the result to the edge of C. */ \
-					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        c11, rs_c,  cs_c ); \
-				} \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k_b1121, \
+				  alpha_cast, \
+				  a1_i, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 				} \
 \
 				a1  += rstep_a; \
diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c
index 5d63bd46d..f71fb3c4d 100644
--- a/frame/3/trmm/bli_trmm_ru_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c
@@ -167,20 +167,8 @@ void PASTEMAC(ch,varname) \
 	   function pointer type. */ \
 	PASTECH(ch,gemm_ukr_ft) \
 	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
 	ctype* restrict one        = PASTEMAC(ch,1); \
-	ctype* restrict zero       = PASTEMAC(ch,0); \
 	ctype* restrict a_cast     = a; \
 	ctype* restrict b_cast     = b; \
 	ctype* restrict c_cast     = c; \
@@ -262,10 +250,6 @@ void PASTEMAC(ch,varname) \
 	{ \
 		k = -diagoffb + n; \
 	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
 \
 	/* Compute number of primary and leftover components of the m and n
 	   dimensions. */ \
@@ -410,47 +394,20 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_next_a( a2, &aux ); \
 				bli_auxinfo_set_next_b( b2, &aux ); \
 \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k_b0111, \
-					  alpha_cast, \
-					  a1_i, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Copy edge elements of C to the temporary buffer. */ \
-					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
-					                        c11, rs_c,  cs_c, \
-					                        ct,  rs_ct, cs_ct ); \
-\
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k_b0111, \
-					  alpha_cast, \
-					  a1_i, \
-					  b1, \
-					  beta_cast, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Copy the result to the edge of C. */ \
-					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        c11, rs_c,  cs_c ); \
-				} \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k_b0111, \
+				  alpha_cast, \
+				  a1_i, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 				} \
 \
 				a1  += rstep_a; \
@@ -476,9 +433,9 @@ void PASTEMAC(ch,varname) \
 	bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
 \
 	/* Advance the start and end iteration offsets for the rectangular region
-       by the number of iterations used for the triangular region. */ \
-    jr_start += n_iter_tri; \
-    jr_end   += n_iter_tri; \
+	   by the number of iterations used for the triangular region. */ \
+	jr_start += n_iter_tri; \
+	jr_end   += n_iter_tri; \
 	jb0       = n_iter_tri; \
 \
 	/* Save the resulting value of b1 from the previous loop since it represents
@@ -496,7 +453,7 @@ void PASTEMAC(ch,varname) \
 		   the starting address of the rectangular region (which is already
 		   n_iter_tri logical iterations through B). */ \
 		b1 = b_cast + (j-jb0) * cstep_b; \
-        c1 = c_cast +  j      * cstep_c; \
+		c1 = c_cast +  j      * cstep_c; \
 \
 		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
 \
@@ -533,42 +490,20 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_next_a( a2, &aux ); \
 				bli_auxinfo_set_next_b( b2, &aux ); \
 \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  one, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Add the result to the edge of C. */ \
-					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
-					                       ct,  rs_ct, cs_ct, \
-					                       c11, rs_c,  cs_c ); \
-				} \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  one, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 			} \
 		} \
 	} \
diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c
index a8196ebb9..0a3be87f7 100644
--- a/frame/3/trsm/bli_trsm_cntl.c
+++ b/frame/3/trsm/bli_trsm_cntl.c
@@ -40,27 +40,30 @@ cntl_t* bli_trsm_cntl_create
        rntm_t* rntm,
        side_t  side,
        pack_t  schema_a,
-       pack_t  schema_b
+       pack_t  schema_b,
+       void_fp ker
      )
 {
 	if ( bli_is_left( side ) )
-		return bli_trsm_l_cntl_create( rntm, schema_a, schema_b );
+		return bli_trsm_l_cntl_create( rntm, schema_a, schema_b, ker );
 	else
-		return bli_trsm_r_cntl_create( rntm, schema_a, schema_b );
+		return bli_trsm_r_cntl_create( rntm, schema_a, schema_b, ker );
 }
 
 cntl_t* bli_trsm_l_cntl_create
      (
        rntm_t* rntm,
        pack_t  schema_a,
-       pack_t  schema_b
+       pack_t  schema_b,
+       void_fp ker
      )
 {
 	void_fp macro_kernel_p;
 
-	// Use the function pointer to the macrokernels that use slab
-	// assignment of micropanels to threads in the jr and ir loops.
+	// Set the default macrokernel. If a non-NULL kernel function pointer is
+	// passed in, we use that instead.
 	macro_kernel_p = bli_trsm_xx_ker_var2;
+	if ( ker ) macro_kernel_p = ker;
 
 	const opid_t family = BLIS_TRSM;
 
@@ -202,11 +205,15 @@ cntl_t* bli_trsm_r_cntl_create
      (
        rntm_t* rntm,
        pack_t  schema_a,
-       pack_t  schema_b
+       pack_t  schema_b,
+       void_fp ker
      )
 {
 	// NOTE: trsm macrokernels are presently disabled for right-side execution.
+	// Set the default macrokernel. If a non-NULL kernel function pointer is
+	// passed in, we use that instead.
 	void_fp macro_kernel_p = bli_trsm_xx_ker_var2;
+	if ( ker ) macro_kernel_p = ker;
 
 	const opid_t family = BLIS_TRSM;
 
diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h
index 7fdb1fc4f..86f4a29b2 100644
--- a/frame/3/trsm/bli_trsm_cntl.h
+++ b/frame/3/trsm/bli_trsm_cntl.h
@@ -38,21 +38,24 @@ cntl_t* bli_trsm_cntl_create
        rntm_t* rntm,
        side_t  side,
        pack_t  schema_a,
-       pack_t  schema_b
+       pack_t  schema_b,
+       void_fp ker
      );
 
 cntl_t* bli_trsm_l_cntl_create
      (
        rntm_t* rntm,
        pack_t  schema_a,
-       pack_t  schema_b
+       pack_t  schema_b,
+       void_fp ker // optional macrokernel; used in place of the default when non-NULL
      );
 
 cntl_t* bli_trsm_r_cntl_create
      (
        rntm_t* rntm,
        pack_t  schema_a,
-       pack_t  schema_b
+       pack_t  schema_b,
+       void_fp ker // optional macrokernel; used in place of the default when non-NULL
      );
 
 void bli_trsm_cntl_free
diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c
index dec41301a..b503efa5b 100644
--- a/frame/3/trsm/bli_trsm_ll_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c
@@ -183,7 +183,6 @@ void PASTEMAC(ch,varname) \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
-	ctype* restrict zero        = PASTEMAC(ch,0); \
 	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
 	ctype* restrict a_cast      = a; \
 	ctype* restrict b_cast      = b; \
@@ -470,43 +469,20 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_next_a( a2, &aux ); \
 				bli_auxinfo_set_next_b( b2, &aux ); \
 \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  minus_one, \
-					  a1, \
-					  b1, \
-					  alpha2_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  minus_one, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Add the result to the edge of C. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        alpha2_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  minus_one, \
+				  a1, \
+				  b1, \
+				  alpha2_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 \
 				a1 += rstep_a; \
 			} \
diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c
index 1627a12a3..55ceafb91 100644
--- a/frame/3/trsm/bli_trsm_lu_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c
@@ -183,7 +183,6 @@ void PASTEMAC(ch,varname) \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
-	ctype* restrict zero        = PASTEMAC(ch,0); \
 	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
 	ctype* restrict a_cast      = a; \
 	ctype* restrict b_cast      = b; \
@@ -480,43 +479,20 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_next_a( a2, &aux ); \
 				bli_auxinfo_set_next_b( b2, &aux ); \
 \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  minus_one, \
-					  a1, \
-					  b1, \
-					  alpha2_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  minus_one, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Add the result to the edge of C. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        alpha2_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  minus_one, \
+				  a1, \
+				  b1, \
+				  alpha2_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 \
 				a1 += rstep_a; \
 			} \
diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c
index 8cbc26b36..23d4dd728 100644
--- a/frame/3/trsm/bli_trsm_rl_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c
@@ -188,7 +188,6 @@ void PASTEMAC(ch,varname) \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
-	ctype* restrict zero        = PASTEMAC(ch,0); \
 	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
 	ctype* restrict a_cast      = a; \
 	ctype* restrict b_cast      = b; \
@@ -499,43 +498,20 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_next_a( b2, &aux ); \
 				bli_auxinfo_set_next_b( a2, &aux ); \
 \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  minus_one, \
-					  b1, \
-					  a1, \
-					  alpha2_cast, \
-					  c11, cs_c, rs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  minus_one, \
-					  b1, \
-					  a1, \
-					  zero, \
-					  ct, cs_ct, rs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Add the result to the edge of C. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        alpha2_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  minus_one, \
+				  b1, \
+				  a1, \
+				  alpha2_cast, \
+				  c11, cs_c, rs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 				} \
 \
 				a1  += rstep_a; \
diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c
index 97399d0ae..71381707c 100644
--- a/frame/3/trsm/bli_trsm_ru_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c
@@ -188,7 +188,6 @@ void PASTEMAC(ch,varname) \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
-	ctype* restrict zero        = PASTEMAC(ch,0); \
 	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
 	ctype* restrict a_cast      = a; \
 	ctype* restrict b_cast      = b; \
@@ -492,43 +491,20 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_next_a( b2, &aux ); \
 				bli_auxinfo_set_next_b( a2, &aux ); \
 \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  minus_one, \
-					  b1, \
-					  a1, \
-					  alpha2_cast, \
-					  c11, cs_c, rs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  minus_one, \
-					  b1, \
-					  a1, \
-					  zero, \
-					  ct, cs_ct, rs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Add the result to the edge of C. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        alpha2_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  minus_one, \
+				  b1, \
+				  a1, \
+				  alpha2_cast, \
+				  c11, cs_c, rs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 				} \
 \
 				a1  += rstep_a; \
diff --git a/frame/base/bli_auxinfo.h b/frame/base/bli_auxinfo.h
index 68b6cc7cd..d8c6cbb13 100644
--- a/frame/base/bli_auxinfo.h
+++ b/frame/base/bli_auxinfo.h
@@ -74,6 +74,15 @@ BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai )
 	return ai->ps_b;
 }
 
+BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) // query the stored ukr function pointer
+{
+	return ai->ukr;
+}
+BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) // query the stored params pointer
+{
+	return ai->params;
+}
+
 
 // auxinfo_t field modification
 
@@ -118,5 +127,14 @@ BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai )
 	ai->ps_b = ps;
 }
 
-#endif 
+BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) // store ukr in the auxinfo_t
+{
+	ai->ukr = ukr;
+}
+BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) // store params in the auxinfo_t
+{
+	ai->params = params;
+}
+
+#endif
 
diff --git a/frame/include/bli_edge_case_macro_defs.h b/frame/include/bli_edge_case_macro_defs.h
new file mode 100644
index 000000000..242045a02
--- /dev/null
+++ b/frame/include/bli_edge_case_macro_defs.h
@@ -0,0 +1,109 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_EDGE_CASE_MACRO_DEFS_H
+#define BLIS_EDGE_CASE_MACRO_DEFS_H
+
+
+// Helper macros for edge-case handling within gemm microkernels.
+
+#define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major) \
+	/* Save the caller's beta, c, rs_c, cs_c; declare an aligned scratch tile _ct of BLIS_STACK_BUF_MAX_SIZE bytes. */ \
+	PASTEMAC(ch,ctype)* restrict _beta   = beta; \
+	PASTEMAC(ch,ctype)* restrict _c      = c; \
+	const inc_t                  _rs_c   = rs_c; \
+	const inc_t                  _cs_c   = cs_c; \
+	PASTEMAC(ch,ctype)           _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,ctype) ) ] \
+	                                  __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const inc_t                  _rs_ct  = row_major ? nr :  1; \
+	const inc_t                  _cs_ct  = row_major ?  1 : mr;
+
+#define GEMM_UKR_SETUP_CT_POST(ch) \
+	/* When _use_ct is set, redirect c, rs_c, cs_c to the scratch tile _ct and point beta at a local zero. */ \
+	PASTEMAC(ch,ctype) _zero; \
+	PASTEMAC(ch,set0s)( _zero ); \
+	\
+	if ( _use_ct ) \
+	{ \
+		c = _ct; \
+		rs_c = _rs_ct; \
+		cs_c = _cs_ct; \
+		beta = &_zero; \
+	}
+
+#define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \
+	/* Use the scratch tile when m != mr, n != nr, or the stride picked by row_major (cs_c if set, else rs_c) is not 1. */ \
+	GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major); \
+	const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \
+	                     m != mr || n != nr; \
+	GEMM_UKR_SETUP_CT_POST(ch);
+
+#define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \
+	/* Use the scratch tile when m != mr, n != nr, or neither rs_c nor cs_c is 1; row_major only sets the _ct layout. */ \
+	GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major); \
+	const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \
+	                     m != mr || n != nr; \
+	GEMM_UKR_SETUP_CT_POST(ch);
+
+#define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \
+	/* Use the scratch tile only when m != mr or n != nr; the strides of c are not examined. */ \
+	GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major); \
+	const bool _use_ct = m != mr || n != nr; \
+	GEMM_UKR_SETUP_CT_POST(ch);
+
+#define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \
+	/* Use the scratch tile for edge cases, a non-unit stride, or when c or its other stride (in bytes) is not a multiple of alignment. */ \
+	GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major); \
+	const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \
+	                     m != mr || n != nr || \
+	                     ( (uintptr_t)_c % (alignment) ) || \
+	                     ( ( ( row_major ? _rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % (alignment) ); \
+	GEMM_UKR_SETUP_CT_POST(ch);
+
+#define GEMM_UKR_FLUSH_CT(ch) \
+	/* When the scratch tile was used, pass the m x n tile _ct to xpbys_mxn with the saved beta, c, and strides. */ \
+	if ( _use_ct ) \
+	{ \
+		PASTEMAC(ch,xpbys_mxn) \
+		( \
+		  m, n, \
+		  _ct, _rs_ct, _cs_ct, \
+		  _beta, \
+		  _c,  _rs_c,  _cs_c \
+		); \
+	}
+
+
+#endif
+
diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h
index 03451d440..be45a12e3 100644
--- a/frame/include/bli_macro_defs.h
+++ b/frame/include/bli_macro_defs.h
@@ -98,6 +98,7 @@
 #include "bli_gentprot_macro_defs.h"
 
 #include "bli_misc_macro_defs.h"
+#include "bli_edge_case_macro_defs.h"
 #include "bli_param_macro_defs.h"
 #include "bli_obj_macro_defs.h"
 #include "bli_complex_macro_defs.h"
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index 5be0ceeb4..c66505bde 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -1144,6 +1144,13 @@ typedef struct
 	inc_t  ps_a;
 	inc_t  ps_b;
 
+	// The type to convert to on output.
+	//num_t  dt_on_output;
+
+	// (Virtual) microkernel address and additional parameters.
+	void_fp ukr;
+	void*   params;
+
 } auxinfo_t;
 
 
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
index 66337e0b7..913abd1f6 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
@@ -42,9 +42,13 @@
 // 2vx10 microkernels.
 #include "armsve_asm_2vx10cmplx.h"
 
+#include "arm_sve.h"
+
 void bli_cgemm_armsve_asm_2vx10_unindexed
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        scomplex*  restrict alpha,
        scomplex*  restrict a,
        scomplex*  restrict b,
@@ -59,12 +63,15 @@ void bli_cgemm_armsve_asm_2vx10_unindexed
 
   // Typecast local copies of integers in case dim_t and inc_t are a
   // different size than is expected by load instructions.
-  uint64_t k_mker = k0 / 4;
-  uint64_t k_left = k0 % 4;
+  uint64_t k_mker = k / 4;
+  uint64_t k_left = k % 4;
   uint64_t rs_c   = rs_c0;
   uint64_t cs_c   = cs_c0;
   uint64_t info   = 0;
 
+  uint64_t mr = svcntw();
+  GEMM_UKR_SETUP_CT( c, mr, 10, false );
+
   __asm__ volatile (
 // " ldr             x0, %[a]                        \n\t"
 // " ldr             x1, %[b]                        \n\t"
@@ -310,5 +317,7 @@ GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16)
   "z24","z25","z26","z27",
   "z28","z29","z30","z31"
   );
+
+  GEMM_UKR_FLUSH_CT( c );
 }
 
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
index e5b78a592..9730fb8ce 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
@@ -42,9 +42,13 @@
 // 2vx10 microkernels.
 #include "armsve_asm_2vx10.h"
 
+#include "arm_sve.h"
+
 void bli_dgemm_armsve_asm_2vx10_unindexed
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        double*    restrict alpha,
        double*    restrict a,
        double*    restrict b,
@@ -59,11 +63,14 @@ void bli_dgemm_armsve_asm_2vx10_unindexed
 
   // Typecast local copies of integers in case dim_t and inc_t are a
   // different size than is expected by load instructions.
-  uint64_t k_mker = k0 / 4;
-  uint64_t k_left = k0 % 4;
+  uint64_t k_mker = k / 4;
+  uint64_t k_left = k % 4;
   uint64_t rs_c   = rs_c0;
   uint64_t cs_c   = cs_c0;
 
+  uint64_t mr = 2*svcntd();
+  GEMM_UKR_SETUP_CT( d, mr, 10, false );
+
   __asm__ volatile (
 " ldr             x0, %[a]                        \n\t"
 " ldr             x1, %[b]                        \n\t"
@@ -324,5 +331,7 @@ GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x
   "z24","z25","z26","z27",
   "z28","z29","z30","z31"
    );
+
+  GEMM_UKR_FLUSH_CT( d );
 }
 
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
index 00b3f20b4..74c4779d7 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
@@ -42,9 +42,13 @@
 // 2vx10 microkernels.
 #include "armsve_asm_2vx10.h"
 
+#include "arm_sve.h"
+
 void bli_sgemm_armsve_asm_2vx10_unindexed
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        float*     restrict alpha,
        float*     restrict a,
        float*     restrict b,
@@ -59,11 +63,14 @@ void bli_sgemm_armsve_asm_2vx10_unindexed
 
   // Typecast local copies of integers in case dim_t and inc_t are a
   // different size than is expected by load instructions.
-  uint64_t k_mker = k0 / 4;
-  uint64_t k_left = k0 % 4;
+  uint64_t k_mker = k / 4;
+  uint64_t k_left = k % 4;
   uint64_t rs_c   = rs_c0;
   uint64_t cs_c   = cs_c0;
 
+  uint64_t mr = 2*svcntw();
+  GEMM_UKR_SETUP_CT( s, mr, 10, false );
+
   __asm__ volatile (
 " ldr             x0, %[a]                        \n\t"
 " ldr             x1, %[b]                        \n\t"
@@ -310,5 +317,7 @@ GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x
   "z24","z25","z26","z27",
   "z28","z29","z30","z31"
    );
+
+   GEMM_UKR_FLUSH_CT( s );
 }
 
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
index 2fa37664a..ee041b3c4 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
@@ -42,9 +42,13 @@
 // 2vx10 microkernels.
 #include "armsve_asm_2vx10cmplx.h"
 
+#include "arm_sve.h"
+
 void bli_zgemm_armsve_asm_2vx10_unindexed
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        dcomplex*  restrict alpha,
        dcomplex*  restrict a,
        dcomplex*  restrict b,
@@ -59,12 +63,15 @@ void bli_zgemm_armsve_asm_2vx10_unindexed
 
   // Typecast local copies of integers in case dim_t and inc_t are a
   // different size than is expected by load instructions.
-  uint64_t k_mker = k0 / 4;
-  uint64_t k_left = k0 % 4;
+  uint64_t k_mker = k / 4;
+  uint64_t k_left = k % 4;
   uint64_t rs_c   = rs_c0;
   uint64_t cs_c   = cs_c0;
   uint64_t info   = 0;
 
+  uint64_t mr = svcntd();
+  GEMM_UKR_SETUP_CT( z, mr, 10, false );
+
   __asm__ volatile (
 // " ldr             x0, %[a]                        \n\t"
 // " ldr             x1, %[b]                        \n\t"
@@ -309,5 +316,7 @@ GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16)
   "z24","z25","z26","z27",
   "z28","z29","z30","z31"
   );
+
+  GEMM_UKR_FLUSH_CT( z );
 }
 
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c
index 3d25719d9..641944ecd 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c
@@ -42,9 +42,13 @@
 // 2vx7 microkernels.
 #include "armsve_asm_2vx7cmplx.h"
 
+#include "arm_sve.h"
+
 void bli_zgemm_armsve_asm_2vx7_unindexed
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        dcomplex*  restrict alpha,
        dcomplex*  restrict a,
        dcomplex*  restrict b,
@@ -59,12 +63,15 @@ void bli_zgemm_armsve_asm_2vx7_unindexed
 
   // Typecast local copies of integers in case dim_t and inc_t are a
   // different size than is expected by load instructions.
-  uint64_t k_mker = k0 / 4;
-  uint64_t k_left = k0 % 4;
+  uint64_t k_mker = k / 4;
+  uint64_t k_left = k % 4;
   uint64_t rs_c   = rs_c0;
   uint64_t cs_c   = cs_c0;
   uint64_t info   = 0;
 
+  uint64_t mr = svcntd();
+  GEMM_UKR_SETUP_CT( z, mr, 7, false );
+
   __asm__ volatile (
 // " ldr             x0, %[a]                        \n\t"
 // " ldr             x1, %[b]                        \n\t"
@@ -261,6 +268,8 @@ GEMM_CCMPLX_STORE_COL7_G(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27
   "z24","z25","z26","z27",
   "z28","z29","z30","z31"
   );
+
+  GEMM_UKR_FLUSH_CT( z );
 }
 
 
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c
index d0eef4a8c..4272f72c0 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c
@@ -42,9 +42,13 @@
 // 2vx8 microkernels.
 #include "armsve_asm_2vx8cmplx.h"
 
+#include "arm_sve.h"
+
 void bli_zgemm_armsve_asm_2vx8_unindexed
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        dcomplex*  restrict alpha,
        dcomplex*  restrict a,
        dcomplex*  restrict b,
@@ -59,12 +63,15 @@ void bli_zgemm_armsve_asm_2vx8_unindexed
 
   // Typecast local copies of integers in case dim_t and inc_t are a
   // different size than is expected by load instructions.
-  uint64_t k_mker = k0 / 6;
-  uint64_t k_left = k0 % 6;
+  uint64_t k_mker = k / 6;
+  uint64_t k_left = k % 6;
   uint64_t rs_c   = rs_c0;
   uint64_t cs_c   = cs_c0;
   uint64_t info   = 0;
 
+  uint64_t mr = svcntd();
+  GEMM_UKR_SETUP_CT( z, mr, 8, false );
+
   __asm__ volatile (
 // " ldr             x0, %[a]                        \n\t"
 // " ldr             x1, %[b]                        \n\t"
@@ -286,5 +293,7 @@ GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z16,%2,%4,x16)
   "z24","z25","z26","z27",
   "z28","z29","z30","z31"
   );
+
+  GEMM_UKR_FLUSH_CT( z );
 }
 
diff --git a/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c
index b526cd095..c248285c3 100644
--- a/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c
+++ b/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c
@@ -48,23 +48,23 @@ void bli_sgemm_armv7a_ker_4x4
 
 void bli_sgemm_armv7a_asm_4x4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        float*     restrict alpha,
        float*     restrict a,
        float*     restrict b,
        float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
+       float*     restrict c, inc_t rs_c, inc_t cs_c,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint32_t k    = k0;
-	uint32_t rs_c = rs_c0;
-	uint32_t cs_c = cs_c0;
-
+	GEMM_UKR_SETUP_CT_ANY( s, 4, 4, false ); // when m != 4 or n != 4, compute into a column-major 4x4 scratch tile
 	bli_sgemm_armv7a_ker_4x4( k, alpha, a, b, beta, c, rs_c, cs_c, data );
+	GEMM_UKR_FLUSH_CT( s ); // if the scratch tile was used, write its m x n part back to the caller's c
 }
 
 
@@ -83,23 +83,23 @@ void bli_dgemm_armv7a_ker_4x4
 
 void bli_dgemm_armv7a_asm_4x4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        double*    restrict alpha,
        double*    restrict a,
        double*    restrict b,
        double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       double*    restrict c, inc_t rs_c, inc_t cs_c,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint32_t k    = k0;
-	uint32_t rs_c = rs_c0;
-	uint32_t cs_c = cs_c0;
-
+	GEMM_UKR_SETUP_CT_ANY( d, 4, 4, false ); // when m != 4 or n != 4, compute into a column-major 4x4 scratch tile
 	bli_dgemm_armv7a_ker_4x4( k, alpha, a, b, beta, c, rs_c, cs_c, data );
+	GEMM_UKR_FLUSH_CT( d ); // if the scratch tile was used, write its m x n part back to the caller's c
 }
 
 
@@ -118,23 +118,23 @@ void bli_cgemm_armv7a_ker_2x2
 
 void bli_cgemm_armv7a_asm_2x2
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        scomplex*  restrict alpha,
        scomplex*  restrict a,
        scomplex*  restrict b,
        scomplex*  restrict beta,
-       scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
+       scomplex*  restrict c, inc_t rs_c, inc_t cs_c,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint32_t k    = k0;
-	uint32_t rs_c = rs_c0;
-	uint32_t cs_c = cs_c0;
-
+	GEMM_UKR_SETUP_CT_ANY( c, 2, 2, false ); // when m != 2 or n != 2, compute into a column-major 2x2 scratch tile
 	bli_cgemm_armv7a_ker_2x2( k, alpha, a, b, beta, c, rs_c, cs_c, data );
+	GEMM_UKR_FLUSH_CT( c ); // if the scratch tile was used, write its m x n part back to the caller's c
 }
 
 
@@ -153,22 +153,22 @@ void bli_zgemm_armv7a_ker_2x2
 
 void bli_zgemm_armv7a_asm_2x2
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        dcomplex*  restrict alpha,
        dcomplex*  restrict a,
        dcomplex*  restrict b,
        dcomplex*  restrict beta,
-       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
+       dcomplex*  restrict c, inc_t rs_c, inc_t cs_c,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint32_t k    = k0;
-	uint32_t rs_c = rs_c0;
-	uint32_t cs_c = cs_c0;
-
+	GEMM_UKR_SETUP_CT_ANY( z, 2, 2, false ); // when m != 2 or n != 2, compute into a column-major 2x2 scratch tile
 	bli_zgemm_armv7a_ker_2x2( k, alpha, a, b, beta, c, rs_c, cs_c, data );
+	GEMM_UKR_FLUSH_CT( z ); // if the scratch tile was used, write its m x n part back to the caller's c
 }
 
diff --git a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
index b9db58726..06f36a346 100644
--- a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
+++ b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
@@ -37,7 +37,9 @@
 
 void bli_sgemm_armv7a_int_4x4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        float*     restrict alpha,
        float*     restrict a,
        float*     restrict b,
@@ -49,12 +51,14 @@ void bli_sgemm_armv7a_int_4x4
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint32_t k_iter = k0 / 4;
-	uint32_t k_left = k0 % 4;
+	uint32_t k_iter = k / 4;
+	uint32_t k_left = k % 4;
 	uint32_t rs_c   = rs_c0;
 	uint32_t cs_c   = cs_c0;
 	uint32_t i;
 
+    GEMM_UKR_SETUP_CT( s, 4, 4, false );
+
 	void* a_next = bli_auxinfo_next_a( data );
 	void* b_next = bli_auxinfo_next_b( data );
 
@@ -82,47 +86,17 @@ void bli_sgemm_armv7a_int_4x4
 
 	if ( *beta != 0.0F )
 	{
-		if ( rs_c == 1 )
-		{
-			// Load column 0
-			cv0 = vld1q_f32( c + 0*rs_c + 0*cs_c );
-
-			// Load column 1
-			cv1 = vld1q_f32( c + 0*rs_c + 1*cs_c );
-
-			// Load column 2
-			cv2 = vld1q_f32( c + 0*rs_c + 2*cs_c );
-
-			// Load column 3
-			cv3 = vld1q_f32( c + 0*rs_c + 3*cs_c );
-		}
-		else
-		{
-			// Load column 0
-			cv0 = vld1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0);
-			cv0 = vld1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1);
-			cv0 = vld1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2);
-			cv0 = vld1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3);
-
-			// Load column 1
-			cv1 = vld1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0);
-			cv1 = vld1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1);
-			cv1 = vld1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2);
-			cv1 = vld1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3);
-
-			// Load column 2
-			cv2 = vld1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0);
-			cv2 = vld1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1);
-			cv2 = vld1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2);
-			cv2 = vld1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3);
-
-			// Load column 3
-			cv3 = vld1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0);
-			cv3 = vld1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1);
-			cv3 = vld1q_lane_f32( c + 2*rs_c + 3*cs_c, cv3, 2);
-			cv3 = vld1q_lane_f32( c + 3*rs_c + 3*cs_c, cv3, 3);
-
-		}
+		// Load column 0
+		cv0 = vld1q_f32( c + 0*cs_c );
+
+		// Load column 1
+		cv1 = vld1q_f32( c + 1*cs_c );
+
+		// Load column 2
+		cv2 = vld1q_f32( c + 2*cs_c );
+
+		// Load column 3
+		cv3 = vld1q_f32( c + 3*cs_c );
 	}
 	else
 	{
@@ -255,47 +229,22 @@ void bli_sgemm_armv7a_int_4x4
 		cv3 = vmlaq_f32( cv3, abv3, alphav );
 	}
 
-	if ( rs_c == 1 )
-	{
-		// Store column 0
-		vst1q_f32( c + 0*rs_c + 0*cs_c, cv0 );
-		// Store column 1
-		vst1q_f32( c + 0*rs_c + 1*cs_c, cv1 );
-		// Store column 2
-		vst1q_f32( c + 0*rs_c + 2*cs_c, cv2 );
-		// Store column 3
-		vst1q_f32( c + 0*rs_c + 3*cs_c, cv3 );
-	}
-	else
-	{
-		// Store column 0
-		vst1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0);
-		vst1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1);
-		vst1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2);
-		vst1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3);
-
-		// Store column 1
-		vst1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0);
-		vst1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1);
-		vst1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2);
-		vst1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3);
-
-		// Store column 2
-		vst1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0);
-		vst1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1);
-		vst1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2);
-		vst1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3);
-
-		// Store column 3
-		vst1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0);
-		vst1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1);
-		vst1q_lane_f32( c + 2*rs_c + 3*cs_c, cv3, 2);
-		vst1q_lane_f32( c + 3*rs_c + 3*cs_c, cv3, 3);
-	}
+	// Store column 0
+	vst1q_f32( c + 0*cs_c, cv0 );
+	// Store column 1
+	vst1q_f32( c + 1*cs_c, cv1 );
+	// Store column 2
+	vst1q_f32( c + 2*cs_c, cv2 );
+	// Store column 3
+	vst1q_f32( c + 3*cs_c, cv3 );
+
+    GEMM_UKR_FLUSH_CT( s );
 }
 
 void bli_dgemm_armv7a_int_4x4
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        double*    restrict alpha,
        double*    restrict a,
@@ -314,6 +263,8 @@ void bli_dgemm_armv7a_int_4x4
 	uint32_t cs_c   = cs_c0;
 	uint32_t i;
 
+    GEMM_UKR_SETUP_CT_ANY( d, 4, 4, false );
+
 	//void* a_next = bli_auxinfo_next_a( data );
 	//void* b_next = bli_auxinfo_next_b( data );
 
@@ -568,5 +519,7 @@ void bli_dgemm_armv7a_int_4x4
     	*c23 += ab23 * *alpha;
     	*c33 += ab33 * *alpha;
     }
+
+    GEMM_UKR_FLUSH_CT( d );
 }
 
diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
index dfdda863b..7b420f202 100644
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
@@ -1,4 +1,4 @@
-    /* 
+    /*
 
    BLIS
    An object-based framework for developing high-performance BLAS-like
@@ -40,20 +40,22 @@
    o 4x4 Single precision micro-kernel fully functional.
    o Runnable on ARMv8, compiled with aarch64 GCC.
    o Use it together with the armv8 BLIS configuration.
-   o Tested on Juno board. Around 7.3 GFLOPS @ 1.1 GHz. 
+   o Tested on Juno board. Around 7.3 GFLOPS @ 1.1 GHz.
 
    December 2014.
- 
+
  * UPDATE NOVEMBER 2015
  * Micro-kernel changed to 8x12
  * Tested on Juno Board. Around  8.1 GFLOPS, 1 x A57 core  @ 1.1 GHz.
  * Tested on Juno Board. Around 15.9 GFLOPS, 2 x A57 cores @ 1.1 GHz.
- * Tested on Juno board. Around  3.1 GFLOPS, 1 x A53 core  @ 850 MHz. 
+ * Tested on Juno board. Around  3.1 GFLOPS, 1 x A53 core  @ 850 MHz.
  * Tested on Juno board. Around 12   GFLOPS, 4 x A53 cores @ 850 MHz.
 */
 void bli_sgemm_armv8a_asm_8x12
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        float*     restrict alpha,
        float*     restrict a,
        float*     restrict b,
@@ -68,1020 +70,1023 @@ void bli_sgemm_armv8a_asm_8x12
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT( s, 8, 12, false );
 
-__asm__ volatile 
-(
-"                                            \n\t"
-"                                            \n\t"
-" ldr x0,%[aaddr]                            \n\t" // Load address of A. 
-" ldr x1,%[baddr]                            \n\t" // Load address of B.
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
-"                                            \n\t"
-" ldr x5,%[k_iter]                           \n\t" // Number of unrolled iterations (k_iter).
-" ldr x6,%[k_left]                           \n\t" // Number of remaining iterations (k_left).
-"                                            \n\t" 
-" ldr x10,%[cs_c]                            \n\t" // Load cs_c.
-" lsl x10,x10,#2                             \n\t" // cs_c * sizeof(float) -- AUX.
-"                                            \n\t" 
-" ldr x14,%[rs_c]                            \n\t" // Load rs_c.
-" lsl x14,x14,#2                             \n\t" // rs_c * sizeof(float).
-"                                            \n\t"
-" add x16,x2,x10                             \n\t" //Load address Column 1 of C
-" add x17,x16,x10                            \n\t" //Load address Column 2 of C
-" add x19,x17,x10                            \n\t" //Load address Column 3 of C
-" add x20,x19,x10                            \n\t" //Load address Column 4 of C
-" add x21,x20,x10                            \n\t" //Load address Column 5 of C
-" add x22,x21,x10                            \n\t" //Load address Column 6 of C
-" add x23,x22,x10                            \n\t" //Load address Column 7 of C
-" add x24,x23,x10                            \n\t" //Load address Column 8 of C
-" add x25,x24,x10                            \n\t" //Load address Column 9 of C
-" add x26,x25,x10                            \n\t" //Load address Column 10 of C
-" add x27,x26,x10                            \n\t" //Load address Column 11 of C
-"                                            \n\t"
-" prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
-" prfm pldl1keep,[x16]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x17]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x19]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x20]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x21]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x22]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x23]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x24]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x27]                       \n\t" // Prefetch c.
-"                                            \n\t"
-" dup  v8.4s, wzr                            \n\t" // Vector for accummulating column 0
-" prfm    PLDL1KEEP, [x1, #192]              \n\t" 
-" dup  v9.4s, wzr                            \n\t" // Vector for accummulating column 0
-" prfm    PLDL1KEEP, [x1, #256]              \n\t"
-" dup  v10.4s, wzr                           \n\t" // Vector for accummulating column 1
-" prfm    PLDL1KEEP, [x1, #320]              \n\t"
-" dup  v11.4s, wzr                           \n\t" // Vector for accummulating column 1
-" dup  v12.4s, wzr                           \n\t" // Vector for accummulating column 2 
-" dup  v13.4s, wzr                           \n\t" // Vector for accummulating column 2
-"                                            \n\t"
-" dup  v14.4s, wzr                           \n\t" // Vector for accummulating column 3
-" prfm    PLDL1KEEP, [x0, #128]              \n\t"
-" dup  v15.4s, wzr                           \n\t" // Vector for accummulating column 3
-" prfm    PLDL1KEEP, [x0, #192]              \n\t"
-" dup  v16.4s, wzr                           \n\t" // Vector for accummulating column 4
-" dup  v17.4s, wzr                           \n\t" // Vector for accummulating column 4
-" dup  v18.4s, wzr                           \n\t" // Vector for accummulating column 5 
-" dup  v19.4s, wzr                           \n\t" // Vector for accummulating column 5
-"                                            \n\t"
-" dup  v20.4s, wzr                           \n\t" // Vector for accummulating column 6 
-" dup  v21.4s, wzr                           \n\t" // Vector for accummulating column 6
-" dup  v22.4s, wzr                           \n\t" // Vector for accummulating column 7
-" dup  v23.4s, wzr                           \n\t" // Vector for accummulating column 7
-" dup  v24.4s, wzr                           \n\t" // Vector for accummulating column 8 
-" dup  v25.4s, wzr                           \n\t" // Vector for accummulating column 8
-"                                            \n\t"
-" dup  v26.4s, wzr                           \n\t" // Vector for accummulating column 9 
-" dup  v27.4s, wzr                           \n\t" // Vector for accummulating column 9
-" dup  v28.4s, wzr                           \n\t" // Vector for accummulating column 10
-" dup  v29.4s, wzr                           \n\t" // Vector for accummulating column 10
-" dup  v30.4s, wzr                           \n\t" // Vector for accummulating column 11 
-" dup  v31.4s, wzr                           \n\t" // Vector for accummulating column 11
-"                                            \n\t"
-" cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
-BEQ(SCONSIDERKLEFT)
-"                                            \n\t"
-" ldr q0, [x0]                               \n\t"
-" ldr q1, [x0, #16]                          \n\t" // Load a
-"                                            \n\t"
-" ldr q2, [x1]                               \n\t" // Load b
-" ldr q3, [x1, #16]                          \n\t"
-" ldr q4, [x1, #32]                          \n\t"
-"                                            \n\t"
-" add x0, x0, #32                            \n\t" //update address of A
-" add x1, x1, #48                            \n\t" //update address of B
-"                                            \n\t"
-" cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one. 
-BEQ(SLASTITER)                                     // (as loop is do-while-like).
-"                                            \n\t"
-LABEL(SLOOPKITER)                                  // Body of the k_iter loop.
-"                                            \n\t"
-" ldr q5, [x0]                               \n\t"
-" fmla v8.4s, v0.4s,v2.s[0]                  \n\t" // Accummulate.
-" fmla v9.4s, v1.4s,v2.s[0]                  \n\t" // Accummulate.
-" ldr q6, [x0, #16]                          \n\t"
-" fmla v10.4s,v0.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v11.4s,v1.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v12.4s,v0.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v13.4s,v1.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v14.4s,v0.4s,v2.s[3]                  \n\t" // Accummulate.
-" fmla v15.4s,v1.4s,v2.s[3]                  \n\t" // Accummulate.
-" ldr q2, [x1]                               \n\t"
-"                                            \n\t"
-" fmla v16.4s,v0.4s,v3.s[0]                  \n\t" // Accummulate.
-" prfm    PLDL1KEEP, [x1, #336]              \n\t" 
-" fmla v17.4s,v1.4s,v3.s[0]                  \n\t" // Accummulate.
-" prfm    PLDL1KEEP, [x1, #400]              \n\t" 
-" fmla v18.4s,v0.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v19.4s,v1.4s,v3.s[1]                  \n\t" // Accummulate.
-" prfm    PLDL1KEEP, [x1, #464]              \n\t" 
-" fmla v20.4s,v0.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v21.4s,v1.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v22.4s,v0.4s,v3.s[3]                  \n\t" // Accummulate.
-" fmla v23.4s,v1.4s,v3.s[3]                  \n\t" // Accummulate.
-"                                            \n\t"
-" fmla v24.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v26.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v28.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v30.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
-" ldr q3, [x1, #16]                          \n\t"
-"                                            \n\t"
-" fmla v25.4s,v1.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v27.4s,v1.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
-" ldr q4, [x1, #32]                          \n\t"
-"                                            \n\t" //End It 1
-"                                            \n\t"
-" ldr q0, [x0, #32]                          \n\t"
-" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
-" fmla v9.4s,v6.4s,v2.s[0]                   \n\t" // Accummulate.
-" ldr q1, [x0, #48]                          \n\t"
-" fmla v10.4s,v5.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v11.4s,v6.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v12.4s,v5.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v13.4s,v6.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v14.4s,v5.4s,v2.s[3]                  \n\t" // Accummulate.
-" fmla v15.4s,v6.4s,v2.s[3]                  \n\t" // Accummulate.
-" ldr q2, [x1, #48]                          \n\t"
-"                                            \n\t"
-" fmla v16.4s,v5.4s,v3.s[0]                  \n\t" // Accummulate.
-" prfm    PLDL1KEEP, [x0, #224]              \n\t"
-" fmla v17.4s,v6.4s,v3.s[0]                  \n\t" // Accummulate.
-" prfm    PLDL1KEEP, [x0, #288]              \n\t"
-" fmla v18.4s,v5.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v19.4s,v6.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v20.4s,v5.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v21.4s,v6.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v22.4s,v5.4s,v3.s[3]                  \n\t" // Accummulate.
-" fmla v23.4s,v6.4s,v3.s[3]                  \n\t" // Accummulate.
-"                                            \n\t"
-" fmla v24.4s,v5.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v26.4s,v5.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v28.4s,v5.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v30.4s,v5.4s,v4.s[3]                  \n\t" // Accummulate.
-" ldr q3, [x1, #64]                          \n\t"
-"                                            \n\t"
-" fmla v25.4s,v6.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v27.4s,v6.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v29.4s,v6.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
-" ldr q4, [x1, #80]                          \n\t"
-"                                            \n\t" //End It 2
-"                                            \n\t"
-" ldr q5, [x0, #64]                          \n\t"
-" fmla v8.4s,v0.4s,v2.s[0]                   \n\t" // Accummulate.
-" fmla v9.4s,v1.4s,v2.s[0]                   \n\t" // Accummulate.
-" ldr q6, [x0, #80]                          \n\t"
-" fmla v10.4s,v0.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v11.4s,v1.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v12.4s,v0.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v13.4s,v1.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v14.4s,v0.4s,v2.s[3]                  \n\t" // Accummulate.
-" fmla v15.4s,v1.4s,v2.s[3]                  \n\t" // Accummulate.
-" ldr q2, [x1, #96]                          \n\t"
-"                                            \n\t"
-" fmla v16.4s,v0.4s,v3.s[0]                  \n\t" // Accummulate.
-" fmla v17.4s,v1.4s,v3.s[0]                  \n\t" // Accummulate.
-" fmla v18.4s,v0.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v19.4s,v1.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v20.4s,v0.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v21.4s,v1.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v22.4s,v0.4s,v3.s[3]                  \n\t" // Accummulate.
-" fmla v23.4s,v1.4s,v3.s[3]                  \n\t" // Accummulate.
-"                                            \n\t"
-" fmla v24.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v26.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v28.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v30.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
-" ldr q3, [x1, #112]                         \n\t"
-"                                            \n\t"
-" fmla v25.4s,v1.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v27.4s,v1.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
-" ldr q4, [x1, #128]                         \n\t"
-"                                            \n\t" //End It 3
-"                                            \n\t"
-" ldr q0, [x0, #96]                          \n\t"
-" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
-" fmla v9.4s,v6.4s,v2.s[0]                   \n\t" // Accummulate.
-" ldr q1, [x0, #112]                         \n\t"
-" fmla v10.4s,v5.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v11.4s,v6.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v12.4s,v5.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v13.4s,v6.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v14.4s,v5.4s,v2.s[3]                  \n\t" // Accummulate.
-" fmla v15.4s,v6.4s,v2.s[3]                  \n\t" // Accummulate.
-" ldr q2, [x1, #144]                         \n\t"
-"                                            \n\t"
-" fmla v16.4s,v5.4s,v3.s[0]                  \n\t" // Accummulate.
-" fmla v17.4s,v6.4s,v3.s[0]                  \n\t" // Accummulate.
-" fmla v18.4s,v5.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v19.4s,v6.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v20.4s,v5.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v21.4s,v6.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v22.4s,v5.4s,v3.s[3]                  \n\t" // Accummulate.
-" fmla v23.4s,v6.4s,v3.s[3]                  \n\t" // Accummulate.
-"                                            \n\t"
-" fmla v24.4s,v5.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v26.4s,v5.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v28.4s,v5.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v30.4s,v5.4s,v4.s[3]                  \n\t" // Accummulate.
-" ldr q3, [x1, #160]                         \n\t"
-"                                            \n\t"
-" fmla v25.4s,v6.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v27.4s,v6.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v29.4s,v6.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
-" ldr q4, [x1, #176]                         \n\t"
-" add x1, x1, #192                           \n\t"
-" add x0, x0, #128                           \n\t"
-"                                            \n\t" //End It 4
-" sub x5,x5,1                                \n\t" // i-=1.
-" cmp x5,1                                   \n\t" // Iterate again if we are not in k_iter == 1.
-BNE(SLOOPKITER)
-"                                            \n\t" 
-LABEL(SLASTITER)                                   // Last iteration of k_iter loop.
-"                                            \n\t" 
-"                                            \n\t"
-" ldr q5, [x0]                               \n\t"
-" fmla v8.4s,v0.4s,v2.s[0]                   \n\t" // Accummulate.
-" fmla v9.4s,v1.4s,v2.s[0]                   \n\t" // Accummulate.
-" ldr q6, [x0, #16]                          \n\t"
-" fmla v10.4s,v0.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v11.4s,v1.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v12.4s,v0.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v13.4s,v1.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v14.4s,v0.4s,v2.s[3]                  \n\t" // Accummulate.
-" fmla v15.4s,v1.4s,v2.s[3]                  \n\t" // Accummulate.
-" ldr q2, [x1]                               \n\t"
-"                                            \n\t"
-" fmla v16.4s,v0.4s,v3.s[0]                  \n\t" // Accummulate.
-" fmla v17.4s,v1.4s,v3.s[0]                  \n\t" // Accummulate.
-" fmla v18.4s,v0.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v19.4s,v1.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v20.4s,v0.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v21.4s,v1.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v22.4s,v0.4s,v3.s[3]                  \n\t" // Accummulate.
-" fmla v23.4s,v1.4s,v3.s[3]                  \n\t" // Accummulate.
-"                                            \n\t"
-" fmla v24.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v26.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v28.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v30.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
-" ldr q3, [x1, #16]                          \n\t"
-"                                            \n\t"
-" fmla v25.4s,v1.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v27.4s,v1.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
-" ldr q4, [x1, #32]                          \n\t"
-"                                            \n\t" //End It 1
-"                                            \n\t"
-" ldr q0, [x0, #32]                          \n\t"
-" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
-" fmla v9.4s,v6.4s,v2.s[0]                   \n\t" // Accummulate.
-" ldr q1, [x0, #48]                          \n\t"
-" fmla v10.4s,v5.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v11.4s,v6.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v12.4s,v5.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v13.4s,v6.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v14.4s,v5.4s,v2.s[3]                  \n\t" // Accummulate.
-" fmla v15.4s,v6.4s,v2.s[3]                  \n\t" // Accummulate.
-" ldr q2, [x1, #48]                          \n\t"
-"                                            \n\t"
-" fmla v16.4s,v5.4s,v3.s[0]                  \n\t" // Accummulate.
-" fmla v17.4s,v6.4s,v3.s[0]                  \n\t" // Accummulate.
-" fmla v18.4s,v5.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v19.4s,v6.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v20.4s,v5.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v21.4s,v6.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v22.4s,v5.4s,v3.s[3]                  \n\t" // Accummulate.
-" fmla v23.4s,v6.4s,v3.s[3]                  \n\t" // Accummulate.
-"                                            \n\t"
-" fmla v24.4s,v5.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v26.4s,v5.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v28.4s,v5.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v30.4s,v5.4s,v4.s[3]                  \n\t" // Accummulate.
-" ldr q3, [x1, #64]                          \n\t"
-"                                            \n\t"
-" fmla v25.4s,v6.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v27.4s,v6.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v29.4s,v6.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
-" ldr q4, [x1, #80]                          \n\t"
-"                                            \n\t" //End It 2
-"                                            \n\t"
-" ldr q5, [x0, #64]                          \n\t"
-" fmla v8.4s,v0.4s,v2.s[0]                   \n\t" // Accummulate.
-" fmla v9.4s,v1.4s,v2.s[0]                   \n\t" // Accummulate.
-" ldr q6, [x0, #80]                          \n\t"
-" fmla v10.4s,v0.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v11.4s,v1.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v12.4s,v0.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v13.4s,v1.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v14.4s,v0.4s,v2.s[3]                  \n\t" // Accummulate.
-" fmla v15.4s,v1.4s,v2.s[3]                  \n\t" // Accummulate.
-" ldr q2, [x1, #96]                          \n\t"
-"                                            \n\t"
-" fmla v16.4s,v0.4s,v3.s[0]                  \n\t" // Accummulate.
-" fmla v17.4s,v1.4s,v3.s[0]                  \n\t" // Accummulate.
-" fmla v18.4s,v0.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v19.4s,v1.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v20.4s,v0.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v21.4s,v1.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v22.4s,v0.4s,v3.s[3]                  \n\t" // Accummulate.
-" fmla v23.4s,v1.4s,v3.s[3]                  \n\t" // Accummulate.
-"                                            \n\t"
-" fmla v24.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v26.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v28.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v30.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
-" ldr q3, [x1, #112]                         \n\t"
-"                                            \n\t"
-" fmla v25.4s,v1.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v27.4s,v1.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
-" ldr q4, [x1, #128]                         \n\t"
-"                                            \n\t" //End It 3
-"                                            \n\t"
-" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
-" fmla v9.4s,v6.4s,v2.s[0]                   \n\t" // Accummulate.
-" fmla v10.4s,v5.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v11.4s,v6.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v12.4s,v5.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v13.4s,v6.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v14.4s,v5.4s,v2.s[3]                  \n\t" // Accummulate.
-" fmla v15.4s,v6.4s,v2.s[3]                  \n\t" // Accummulate.
-"                                            \n\t"
-" fmla v16.4s,v5.4s,v3.s[0]                  \n\t" // Accummulate.
-" fmla v17.4s,v6.4s,v3.s[0]                  \n\t" // Accummulate.
-" fmla v18.4s,v5.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v19.4s,v6.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v20.4s,v5.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v21.4s,v6.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v22.4s,v5.4s,v3.s[3]                  \n\t" // Accummulate.
-" fmla v23.4s,v6.4s,v3.s[3]                  \n\t" // Accummulate.
-"                                            \n\t"
-" fmla v24.4s,v5.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v26.4s,v5.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v28.4s,v5.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v30.4s,v5.4s,v4.s[3]                  \n\t" // Accummulate.
-"                                            \n\t"
-" fmla v25.4s,v6.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v27.4s,v6.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v29.4s,v6.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
-" add x1, x1, #144                           \n\t"
-" add x0, x0, #96                            \n\t"
-"                                            \n\t" //End It 4
-"                                            \n\t"
-LABEL(SCONSIDERKLEFT)
-" cmp x6,0                                   \n\t" // If k_left == 0, we are done.
-BEQ(SPOSTACCUM)                                    // else, we enter the k_left loop.
-"                                            \n\t"
-LABEL(SLOOPKLEFT)                                  // Body of the left iterations
-"                                            \n\t"
-" ldr q0, [x0],#16                           \n\t"
-" ldr q1, [x0],#16                           \n\t" // Load a
-"                                            \n\t"
-" ldr q2, [x1],#16                           \n\t" // Load b
-" ldr q3, [x1],#16                           \n\t"
-" ldr q4, [x1],#16                           \n\t"
-"                                            \n\t"
-" sub x6,x6,1                                \n\t" // i = i-1.
-"                                            \n\t"
-" fmla v8.4s,v0.4s,v2.s[0]                   \n\t" // Accummulate.
-" fmla v9.4s,v1.4s,v2.s[0]                   \n\t" // Accummulate.
-" fmla v10.4s,v0.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v11.4s,v1.4s,v2.s[1]                  \n\t" // Accummulate.
-" fmla v12.4s,v0.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v13.4s,v1.4s,v2.s[2]                  \n\t" // Accummulate.
-" fmla v14.4s,v0.4s,v2.s[3]                  \n\t" // Accummulate.
-" fmla v15.4s,v1.4s,v2.s[3]                  \n\t" // Accummulate.
-"                                            \n\t"
-" fmla v16.4s,v0.4s,v3.s[0]                  \n\t" // Accummulate.
-" fmla v17.4s,v1.4s,v3.s[0]                  \n\t" // Accummulate.
-" fmla v18.4s,v0.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v19.4s,v1.4s,v3.s[1]                  \n\t" // Accummulate.
-" fmla v20.4s,v0.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v21.4s,v1.4s,v3.s[2]                  \n\t" // Accummulate.
-" fmla v22.4s,v0.4s,v3.s[3]                  \n\t" // Accummulate.
-" fmla v23.4s,v1.4s,v3.s[3]                  \n\t" // Accummulate.
-"                                            \n\t"
-" fmla v24.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v26.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v28.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v30.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
-" fmla v25.4s,v1.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v27.4s,v1.4s,v4.s[1]                  \n\t" // Accummulate.
-" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
-"                                            \n\t"
-" cmp x6,0                                   \n\t" // Iterate again.
-BNE(SLOOPKLEFT)                                    // if i!=0.
-"                                            \n\t"
-LABEL(SPOSTACCUM)
-"                                            \n\t"
-" ldr x0,%[alpha]                            \n\t" // Alpha address.
-" ldr x1,%[beta]                             \n\t" // Beta address.
-"                                            \n\t"
-" ld1r {v6.4s},[x0]                          \n\t" // Load alpha.
-" ld1r {v7.4s},[x1]                          \n\t" // Load beta
-"                                            \n\t"
-" ldr x0,%[a_next]                           \n\t" // Pointer to next block of A.
-" ldr x1,%[b_next]                           \n\t" // Pointer to next pointer of B.
-"                                            \n\t"
-" cmp x14,#4                                 \n\t" // If rs_c != 1 (column-major)
-BNE(SGENSTORED)
-"                                            \n\t"
-LABEL(SCOLSTORED)                                  // C is column-major.
-"                                            \n\t"
-" dup  v0.4s, wzr                            \n\t"
-" dup  v1.4s, wzr                            \n\t"
-" dup  v2.4s, wzr                            \n\t"
-" dup  v3.4s, wzr                            \n\t"
-" dup  v4.4s, wzr                            \n\t"
-" dup  v5.4s, wzr                            \n\t"
-"                                            \n\t"
-" fcmp s7,#0.0                               \n\t"
-BEQ(SBETAZEROCOLSTOREDS1)                          // Taking care of the beta==0 case.
-"                                            \n\t"
-" ldr q0, [x2]                               \n\t" //Load column 0 of C
-" ldr q1, [x2, #16]                          \n\t"
-" ldr q2, [x16]                              \n\t" //Load column 1 of C
-" ldr q3, [x16, #16]                         \n\t"
-" ldr q4, [x17]                              \n\t" //Load column 2 of C
-" ldr q5, [x17, #16]                         \n\t"
-"                                            \n\t"
-" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v2.4s,v2.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v3.4s,v3.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
-"                                            \n\t"
-LABEL(SBETAZEROCOLSTOREDS1)
-"                                            \n\t"
-" fmla v0.4s,v8.4s,v6.s[0]                   \n\t" // Scale by alpha
-" fmla v1.4s,v9.4s,v6.s[0]                   \n\t" // Scale by alpha
-" fmla v2.4s,v10.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v3.4s,v11.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v4.4s,v12.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v5.4s,v13.4s,v6.s[0]                  \n\t" // Scale by alpha
-"                                            \n\t"
-" str q0, [x2]                               \n\t" //Store column 0 of C
-" str q1, [x2, #16]                          \n\t"
-" str q2, [x16]                              \n\t" //Store column 1 of C
-" str q3, [x16, #16]                         \n\t"
-" str q4, [x17]                              \n\t" //Store column 2 of C
-" str q5, [x17, #16]                         \n\t"
-"                                            \n\t"
-" dup  v8.4s, wzr                            \n\t"
-" dup  v9.4s, wzr                            \n\t"
-" dup  v10.4s, wzr                           \n\t"
-" dup  v11.4s, wzr                           \n\t"
-" dup  v12.4s, wzr                           \n\t"
-" dup  v13.4s, wzr                           \n\t"
-"                                            \n\t"
-" fcmp s7,#0.0                               \n\t"
-BEQ(SBETAZEROCOLSTOREDS2)                          // Taking care of the beta==0 case.
-"                                            \n\t"
-" ldr q8, [x19]                              \n\t" //Load column 3 of C
-" ldr q9, [x19, #16]                         \n\t"
-" ldr q10, [x20]                             \n\t" //Load column 4 of C
-" ldr q11, [x20, #16]                        \n\t"
-" ldr q12, [x21]                             \n\t" //Load column 5 of C
-" ldr q13, [x21, #16]                        \n\t"
-"                                            \n\t"
-" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
-" fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
-" fmul v10.4s,v10.4s,v7.s[0]                 \n\t" // Scale by beta
-" fmul v11.4s,v11.4s,v7.s[0]                 \n\t" // Scale by beta
-" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
-" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
-"                                            \n\t"
-LABEL(SBETAZEROCOLSTOREDS2)
-"                                            \n\t"
-" fmla v8.4s, v14.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v9.4s, v15.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v10.4s,v16.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v11.4s,v17.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v12.4s,v18.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v13.4s,v19.4s,v6.s[0]                 \n\t" // Scale by alpha
-"                                            \n\t"
-" str q8, [x19]                              \n\t" //Store column 3 of C
-" str q9, [x19, #16]                         \n\t"
-" str q10, [x20]                             \n\t" //Store column 4 of C
-" str q11, [x20, #16]                        \n\t"
-" str q12, [x21]                             \n\t" //Store column 5 of C
-" str q13, [x21, #16]                        \n\t"
-"                                            \n\t"
-" dup  v0.4s, wzr                            \n\t"
-" dup  v1.4s, wzr                            \n\t"
-" dup  v2.4s, wzr                            \n\t"
-" dup  v3.4s, wzr                            \n\t"
-" dup  v4.4s, wzr                            \n\t"
-" dup  v5.4s, wzr                            \n\t"
-"                                            \n\t"
-" fcmp s7,#0.0                               \n\t"
-BEQ(SBETAZEROCOLSTOREDS3)                          // Taking care of the beta==0 case.
-"                                            \n\t"
-" ldr q0, [x22]                              \n\t" //Load column 6 of C
-" ldr q1, [x22, #16]                         \n\t"
-" ldr q2, [x23]                              \n\t" //Load column 7 of C
-" ldr q3, [x23, #16]                         \n\t"
-" ldr q4, [x24]                              \n\t" //Load column 8 of C
-" ldr q5, [x24, #16]                         \n\t"
-"                                            \n\t"
-" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v2.4s,v2.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v3.4s,v3.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
-"                                            \n\t"
-LABEL(SBETAZEROCOLSTOREDS3)
-"                                            \n\t"
-" fmla v0.4s,v20.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v1.4s,v21.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v2.4s,v22.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v3.4s,v23.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v4.4s,v24.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v5.4s,v25.4s,v6.s[0]                  \n\t" // Scale by alpha
-"                                            \n\t"
-" str q0, [x22]                              \n\t" //Store column 6 of C
-" str q1, [x22, #16]                         \n\t"
-" str q2, [x23]                              \n\t" //Store column 7 of C
-" str q3, [x23, #16]                         \n\t"
-" str q4, [x24]                              \n\t" //Store column 8 of C
-" str q5, [x24, #16]                         \n\t"
-"                                            \n\t"
-" dup  v8.4s, wzr                            \n\t"
-" dup  v9.4s, wzr                            \n\t"
-" dup  v10.4s, wzr                            \n\t"
-" dup  v11.4s, wzr                            \n\t"
-" dup  v12.4s, wzr                            \n\t"
-" dup  v13.4s, wzr                            \n\t"
-"                                            \n\t"
-" fcmp s7,#0.0                               \n\t"
-BEQ(SBETAZEROCOLSTOREDS4)                          // Taking care of the beta==0 case.
-"                                            \n\t"
-" ldr q8, [x25]                              \n\t" //Load column 9 of C
-" ldr q9, [x25, #16]                         \n\t"
-" ldr q10, [x26]                             \n\t" //Load column 10 of C
-" ldr q11, [x26, #16]                        \n\t"
-" ldr q12, [x27]                             \n\t" //Load column 11 of C
-" ldr q13, [x27, #16]                        \n\t"
-"                                            \n\t"
-" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
-" fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
-" fmul v10.4s,v10.4s,v7.s[0]                 \n\t" // Scale by beta
-" fmul v11.4s,v11.4s,v7.s[0]                 \n\t" // Scale by beta
-" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
-" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
-"                                            \n\t"
-LABEL(SBETAZEROCOLSTOREDS4)
-"                                            \n\t"
-" prfm pldl2keep,[x0]                        \n\t"
-" prfm pldl2keep,[x1]                        \n\t"
-"                                            \n\t"
-" fmla v8.4s, v26.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v9.4s, v27.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v10.4s,v28.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v11.4s,v29.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v12.4s,v30.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v13.4s,v31.4s,v6.s[0]                 \n\t" // Scale by alpha
-"                                            \n\t"
-" str q8, [x25]                              \n\t" //Store column 9 of C
-" str q9, [x25, #16]                         \n\t"
-" str q10, [x26]                             \n\t" //Store column 10 of C
-" str q11, [x26, #16]                        \n\t"
-" str q12, [x27]                             \n\t" //Store column 11 of C
-" str q13, [x27, #16]                        \n\t"
-"                                            \n\t"
-"                                            \n\t"
-BRANCH(SEND)                                       // Done.
-"                                            \n\t"
-"                                            \n\t"
-LABEL(SGENSTORED)                                  // C is general-stride stored.
-"                                            \n\t"
-"                                            \n\t"
-" dup  v0.4s, wzr                            \n\t"
-" dup  v1.4s, wzr                            \n\t"
-" dup  v2.4s, wzr                            \n\t"
-" dup  v3.4s, wzr                            \n\t"
-" dup  v4.4s, wzr                            \n\t"
-" dup  v5.4s, wzr                            \n\t"
-"                                            \n\t"
-" fcmp s7,#0.0                               \n\t"
-BEQ(SBETAZEROGENSTOREDS1)                          // Taking care of the beta==0 case.
-"                                            \n\t"
-" mov x5, x2                                 \n\t"
-"                                            \n\t"
-" ld1 {v0.s}[0],[x5],x14                     \n\t" // Load c00  into quad and increment by rs_c.
-" ld1 {v0.s}[1],[x5],x14                     \n\t" // Load c01  into quad and increment by rs_c.
-" ld1 {v0.s}[2],[x5],x14                     \n\t" // Load c02  into quad and increment by rs_c.
-" ld1 {v0.s}[3],[x5],x14                     \n\t" // Load c03  into quad and increment by rs_c.
-" ld1 {v1.s}[0],[x5],x14                     \n\t" // Load c04  into quad and increment by rs_c.
-" ld1 {v1.s}[1],[x5],x14                     \n\t" // Load c05  into quad and increment by rs_c.
-" ld1 {v1.s}[2],[x5],x14                     \n\t" // Load c06  into quad and increment by rs_c.
-" ld1 {v1.s}[3],[x5],x14                     \n\t" // Load c07  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x5, x16                                \n\t"
-"                                            \n\t"
-" ld1 {v2.s}[0],[x5],x14                     \n\t" // Load c10  into quad and increment by rs_c.
-" ld1 {v2.s}[1],[x5],x14                     \n\t" // Load c11  into quad and increment by rs_c.
-" ld1 {v2.s}[2],[x5],x14                     \n\t" // Load c12  into quad and increment by rs_c.
-" ld1 {v2.s}[3],[x5],x14                     \n\t" // Load c13  into quad and increment by rs_c.
-" ld1 {v3.s}[0],[x5],x14                     \n\t" // Load c14  into quad and increment by rs_c.
-" ld1 {v3.s}[1],[x5],x14                     \n\t" // Load c15  into quad and increment by rs_c.
-" ld1 {v3.s}[2],[x5],x14                     \n\t" // Load c16  into quad and increment by rs_c.
-" ld1 {v3.s}[3],[x5],x14                     \n\t" // Load c17  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x5, x17                                \n\t"
-"                                            \n\t"
-" ld1 {v4.s}[0],[x5],x14                     \n\t" // Load c20  into quad and increment by rs_c.
-" ld1 {v4.s}[1],[x5],x14                     \n\t" // Load c21  into quad and increment by rs_c.
-" ld1 {v4.s}[2],[x5],x14                     \n\t" // Load c22  into quad and increment by rs_c.
-" ld1 {v4.s}[3],[x5],x14                     \n\t" // Load c23  into quad and increment by rs_c.
-" ld1 {v5.s}[0],[x5],x14                     \n\t" // Load c24  into quad and increment by rs_c.
-" ld1 {v5.s}[1],[x5],x14                     \n\t" // Load c25  into quad and increment by rs_c.
-" ld1 {v5.s}[2],[x5],x14                     \n\t" // Load c26  into quad and increment by rs_c.
-" ld1 {v5.s}[3],[x5],x14                     \n\t" // Load c27  into quad and increment by rs_c.
-"                                            \n\t"
-" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v2.4s,v2.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v3.4s,v3.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
-"                                            \n\t"
-LABEL(SBETAZEROGENSTOREDS1)
-"                                            \n\t"
-" fmla v0.4s, v8.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v1.4s, v9.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v2.4s,v10.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v3.4s,v11.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v4.4s,v12.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v5.4s,v13.4s,v6.s[0]                  \n\t" // Scale by alpha
-"                                            \n\t"
-" mov x5, x2                                 \n\t"
-"                                            \n\t"
-" st1 {v0.s}[0],[x5],x14                     \n\t" // Store c00  into quad and increment by rs_c.
-" st1 {v0.s}[1],[x5],x14                     \n\t" // Store c01  into quad and increment by rs_c.
-" st1 {v0.s}[2],[x5],x14                     \n\t" // Store c02  into quad and increment by rs_c.
-" st1 {v0.s}[3],[x5],x14                     \n\t" // Store c03  into quad and increment by rs_c.
-" st1 {v1.s}[0],[x5],x14                     \n\t" // Store c04  into quad and increment by rs_c.
-" st1 {v1.s}[1],[x5],x14                     \n\t" // Store c05  into quad and increment by rs_c.
-" st1 {v1.s}[2],[x5],x14                     \n\t" // Store c06  into quad and increment by rs_c.
-" st1 {v1.s}[3],[x5],x14                     \n\t" // Store c07  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x5, x16                                \n\t"
-"                                            \n\t"
-" st1 {v2.s}[0],[x5],x14                     \n\t" // Store c10  into quad and increment by rs_c.
-" st1 {v2.s}[1],[x5],x14                     \n\t" // Store c11  into quad and increment by rs_c.
-" st1 {v2.s}[2],[x5],x14                     \n\t" // Store c12  into quad and increment by rs_c.
-" st1 {v2.s}[3],[x5],x14                     \n\t" // Store c13  into quad and increment by rs_c.
-" st1 {v3.s}[0],[x5],x14                     \n\t" // Store c14  into quad and increment by rs_c.
-" st1 {v3.s}[1],[x5],x14                     \n\t" // Store c15  into quad and increment by rs_c.
-" st1 {v3.s}[2],[x5],x14                     \n\t" // Store c16  into quad and increment by rs_c.
-" st1 {v3.s}[3],[x5],x14                     \n\t" // Store c17  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x5, x17                                \n\t"
-"                                            \n\t"
-" st1 {v4.s}[0],[x5],x14                     \n\t" // Store c20  into quad and increment by rs_c.
-" st1 {v4.s}[1],[x5],x14                     \n\t" // Store c21  into quad and increment by rs_c.
-" st1 {v4.s}[2],[x5],x14                     \n\t" // Store c22  into quad and increment by rs_c.
-" st1 {v4.s}[3],[x5],x14                     \n\t" // Store c23  into quad and increment by rs_c.
-" st1 {v5.s}[0],[x5],x14                     \n\t" // Store c24  into quad and increment by rs_c.
-" st1 {v5.s}[1],[x5],x14                     \n\t" // Store c25  into quad and increment by rs_c.
-" st1 {v5.s}[2],[x5],x14                     \n\t" // Store c26  into quad and increment by rs_c.
-" st1 {v5.s}[3],[x5],x14                     \n\t" // Store c27  into quad and increment by rs_c.
-"                                            \n\t"
-" dup  v8.4s, wzr                            \n\t"
-" dup  v9.4s, wzr                            \n\t"
-" dup  v10.4s, wzr                           \n\t"
-" dup  v11.4s, wzr                           \n\t"
-" dup  v12.4s, wzr                           \n\t"
-" dup  v13.4s, wzr                           \n\t"
-"                                            \n\t"
-" fcmp s7,#0.0                               \n\t"
-BEQ(SBETAZEROGENSTOREDS2)                          // Taking care of the beta==0 case.
-"                                            \n\t"
-" mov x5, x19                                \n\t"
-"                                            \n\t"
-" ld1 {v8.s}[0],[x5],x14                     \n\t" // Load c30  into quad and increment by rs_c.
-" ld1 {v8.s}[1],[x5],x14                     \n\t" // Load c31  into quad and increment by rs_c.
-" ld1 {v8.s}[2],[x5],x14                     \n\t" // Load c32  into quad and increment by rs_c.
-" ld1 {v8.s}[3],[x5],x14                     \n\t" // Load c33  into quad and increment by rs_c.
-" ld1 {v9.s}[0],[x5],x14                     \n\t" // Load c34  into quad and increment by rs_c.
-" ld1 {v9.s}[1],[x5],x14                     \n\t" // Load c35  into quad and increment by rs_c.
-" ld1 {v9.s}[2],[x5],x14                     \n\t" // Load c36  into quad and increment by rs_c.
-" ld1 {v9.s}[3],[x5],x14                     \n\t" // Load c37  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x5, x20                                \n\t"
-"                                            \n\t"
-" ld1 {v10.s}[0],[x5],x14                    \n\t" // Load c40  into quad and increment by rs_c.
-" ld1 {v10.s}[1],[x5],x14                    \n\t" // Load c41  into quad and increment by rs_c.
-" ld1 {v10.s}[2],[x5],x14                    \n\t" // Load c42  into quad and increment by rs_c.
-" ld1 {v10.s}[3],[x5],x14                    \n\t" // Load c43  into quad and increment by rs_c.
-" ld1 {v11.s}[0],[x5],x14                    \n\t" // Load c44  into quad and increment by rs_c.
-" ld1 {v11.s}[1],[x5],x14                    \n\t" // Load c45  into quad and increment by rs_c.
-" ld1 {v11.s}[2],[x5],x14                    \n\t" // Load c46  into quad and increment by rs_c.
-" ld1 {v11.s}[3],[x5],x14                    \n\t" // Load c47  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x5, x21                                \n\t"
-"                                            \n\t"
-" ld1 {v12.s}[0],[x5],x14                    \n\t" // Load c50  into quad and increment by rs_c.
-" ld1 {v12.s}[1],[x5],x14                    \n\t" // Load c51  into quad and increment by rs_c.
-" ld1 {v12.s}[2],[x5],x14                    \n\t" // Load c52  into quad and increment by rs_c.
-" ld1 {v12.s}[3],[x5],x14                    \n\t" // Load c53  into quad and increment by rs_c.
-" ld1 {v13.s}[0],[x5],x14                    \n\t" // Load c54  into quad and increment by rs_c.
-" ld1 {v13.s}[1],[x5],x14                    \n\t" // Load c55  into quad and increment by rs_c.
-" ld1 {v13.s}[2],[x5],x14                    \n\t" // Load c56  into quad and increment by rs_c.
-" ld1 {v13.s}[3],[x5],x14                    \n\t" // Load c57  into quad and increment by rs_c.
-"                                            \n\t"
-" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
-" fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
-" fmul v10.4s,v10.4s,v7.s[0]                 \n\t" // Scale by beta
-" fmul v11.4s,v11.4s,v7.s[0]                 \n\t" // Scale by beta
-" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
-" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
-"                                            \n\t"
-LABEL(SBETAZEROGENSTOREDS2)
-"                                            \n\t"
-" fmla v8.4s, v14.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v9.4s, v15.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v10.4s,v16.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v11.4s,v17.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v12.4s,v18.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v13.4s,v19.4s,v6.s[0]                 \n\t" // Scale by alpha
-"                                            \n\t"
-" mov x5, x19                                \n\t"
-"                                            \n\t"
-" st1 {v8.s}[0],[x5],x14                     \n\t" // Store c30  into quad and increment by rs_c.
-" st1 {v8.s}[1],[x5],x14                     \n\t" // Store c31  into quad and increment by rs_c.
-" st1 {v8.s}[2],[x5],x14                     \n\t" // Store c32  into quad and increment by rs_c.
-" st1 {v8.s}[3],[x5],x14                     \n\t" // Store c33  into quad and increment by rs_c.
-" st1 {v9.s}[0],[x5],x14                     \n\t" // Store c34  into quad and increment by rs_c.
-" st1 {v9.s}[1],[x5],x14                     \n\t" // Store c35  into quad and increment by rs_c.
-" st1 {v9.s}[2],[x5],x14                     \n\t" // Store c36  into quad and increment by rs_c.
-" st1 {v9.s}[3],[x5],x14                     \n\t" // Store c37  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x5, x20                                \n\t"
-"                                            \n\t"
-" st1 {v10.s}[0],[x5],x14                    \n\t" // Store c40  into quad and increment by rs_c.
-" st1 {v10.s}[1],[x5],x14                    \n\t" // Store c41  into quad and increment by rs_c.
-" st1 {v10.s}[2],[x5],x14                    \n\t" // Store c42  into quad and increment by rs_c.
-" st1 {v10.s}[3],[x5],x14                    \n\t" // Store c43  into quad and increment by rs_c.
-" st1 {v11.s}[0],[x5],x14                    \n\t" // Store c44  into quad and increment by rs_c.
-" st1 {v11.s}[1],[x5],x14                    \n\t" // Store c45  into quad and increment by rs_c.
-" st1 {v11.s}[2],[x5],x14                    \n\t" // Store c46  into quad and increment by rs_c.
-" st1 {v11.s}[3],[x5],x14                    \n\t" // Store c47  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x5, x21                                \n\t"
-"                                            \n\t"
-" st1 {v12.s}[0],[x5],x14                    \n\t" // Store c50  into quad and increment by rs_c.
-" st1 {v12.s}[1],[x5],x14                    \n\t" // Store c51  into quad and increment by rs_c.
-" st1 {v12.s}[2],[x5],x14                    \n\t" // Store c52  into quad and increment by rs_c.
-" st1 {v12.s}[3],[x5],x14                    \n\t" // Store c53  into quad and increment by rs_c.
-" st1 {v13.s}[0],[x5],x14                    \n\t" // Store c54  into quad and increment by rs_c.
-" st1 {v13.s}[1],[x5],x14                    \n\t" // Store c55  into quad and increment by rs_c.
-" st1 {v13.s}[2],[x5],x14                    \n\t" // Store c56  into quad and increment by rs_c.
-" st1 {v13.s}[3],[x5],x14                    \n\t" // Store c57  into quad and increment by rs_c.
-"                                            \n\t"
-" dup  v0.4s, wzr                            \n\t"
-" dup  v1.4s, wzr                            \n\t"
-" dup  v2.4s, wzr                            \n\t"
-" dup  v3.4s, wzr                            \n\t"
-" dup  v4.4s, wzr                            \n\t"
-" dup  v5.4s, wzr                            \n\t"
-"                                            \n\t"
-" fcmp s7,#0.0                               \n\t"
-BEQ(SBETAZEROGENSTOREDS3)                          // Taking care of the beta==0 case.
-"                                            \n\t"
-" mov x5, x22                                \n\t"
-"                                            \n\t"
-" ld1 {v0.s}[0],[x5],x14                     \n\t" // Load c60  into quad and increment by rs_c.
-" ld1 {v0.s}[1],[x5],x14                     \n\t" // Load c61  into quad and increment by rs_c.
-" ld1 {v0.s}[2],[x5],x14                     \n\t" // Load c62  into quad and increment by rs_c.
-" ld1 {v0.s}[3],[x5],x14                     \n\t" // Load c63  into quad and increment by rs_c.
-" ld1 {v1.s}[0],[x5],x14                     \n\t" // Load c64  into quad and increment by rs_c.
-" ld1 {v1.s}[1],[x5],x14                     \n\t" // Load c65  into quad and increment by rs_c.
-" ld1 {v1.s}[2],[x5],x14                     \n\t" // Load c66  into quad and increment by rs_c.
-" ld1 {v1.s}[3],[x5],x14                     \n\t" // Load c67  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x5, x23                                \n\t"
-"                                            \n\t"
-" ld1 {v2.s}[0],[x5],x14                     \n\t" // Load c70  into quad and increment by rs_c.
-" ld1 {v2.s}[1],[x5],x14                     \n\t" // Load c71  into quad and increment by rs_c.
-" ld1 {v2.s}[2],[x5],x14                     \n\t" // Load c72  into quad and increment by rs_c.
-" ld1 {v2.s}[3],[x5],x14                     \n\t" // Load c73  into quad and increment by rs_c.
-" ld1 {v3.s}[0],[x5],x14                     \n\t" // Load c74  into quad and increment by rs_c.
-" ld1 {v3.s}[1],[x5],x14                     \n\t" // Load c75  into quad and increment by rs_c.
-" ld1 {v3.s}[2],[x5],x14                     \n\t" // Load c76  into quad and increment by rs_c.
-" ld1 {v3.s}[3],[x5],x14                     \n\t" // Load c77  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x5, x24                                \n\t"
-"                                            \n\t"
-" ld1 {v4.s}[0],[x5],x14                     \n\t" // Load c80  into quad and increment by rs_c.
-" ld1 {v4.s}[1],[x5],x14                     \n\t" // Load c81  into quad and increment by rs_c.
-" ld1 {v4.s}[2],[x5],x14                     \n\t" // Load c82  into quad and increment by rs_c.
-" ld1 {v4.s}[3],[x5],x14                     \n\t" // Load c83  into quad and increment by rs_c.
-" ld1 {v5.s}[0],[x5],x14                     \n\t" // Load c84  into quad and increment by rs_c.
-" ld1 {v5.s}[1],[x5],x14                     \n\t" // Load c85  into quad and increment by rs_c.
-" ld1 {v5.s}[2],[x5],x14                     \n\t" // Load c86  into quad and increment by rs_c.
-" ld1 {v5.s}[3],[x5],x14                     \n\t" // Load c87  into quad and increment by rs_c.
-"                                            \n\t"
-" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v2.4s,v2.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v3.4s,v3.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
-" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
-"                                            \n\t"
-LABEL(SBETAZEROGENSTOREDS3)
-"                                            \n\t"
-" fmla v0.4s,v20.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v1.4s,v21.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v2.4s,v22.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v3.4s,v23.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v4.4s,v24.4s,v6.s[0]                  \n\t" // Scale by alpha
-" fmla v5.4s,v25.4s,v6.s[0]                  \n\t" // Scale by alpha
-"                                            \n\t"
-" mov x5, x22                                \n\t"
-"                                            \n\t"
-" st1 {v0.s}[0],[x5],x14                     \n\t" // Store c60  into quad and increment by rs_c.
-" st1 {v0.s}[1],[x5],x14                     \n\t" // Store c61  into quad and increment by rs_c.
-" st1 {v0.s}[2],[x5],x14                     \n\t" // Store c62  into quad and increment by rs_c.
-" st1 {v0.s}[3],[x5],x14                     \n\t" // Store c63  into quad and increment by rs_c.
-" st1 {v1.s}[0],[x5],x14                     \n\t" // Store c64  into quad and increment by rs_c.
-" st1 {v1.s}[1],[x5],x14                     \n\t" // Store c65  into quad and increment by rs_c.
-" st1 {v1.s}[2],[x5],x14                     \n\t" // Store c66  into quad and increment by rs_c.
-" st1 {v1.s}[3],[x5],x14                     \n\t" // Store c67  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x5, x23                                \n\t"
-"                                            \n\t"
-" st1 {v2.s}[0],[x5],x14                     \n\t" // Store c70  into quad and increment by rs_c.
-" st1 {v2.s}[1],[x5],x14                     \n\t" // Store c71  into quad and increment by rs_c.
-" st1 {v2.s}[2],[x5],x14                     \n\t" // Store c72  into quad and increment by rs_c.
-" st1 {v2.s}[3],[x5],x14                     \n\t" // Store c73  into quad and increment by rs_c.
-" st1 {v3.s}[0],[x5],x14                     \n\t" // Store c74  into quad and increment by rs_c.
-" st1 {v3.s}[1],[x5],x14                     \n\t" // Store c75  into quad and increment by rs_c.
-" st1 {v3.s}[2],[x5],x14                     \n\t" // Store c76  into quad and increment by rs_c.
-" st1 {v3.s}[3],[x5],x14                     \n\t" // Store c77  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x5, x24                                \n\t"
-"                                            \n\t"
-" st1 {v4.s}[0],[x5],x14                     \n\t" // Store c80  into quad and increment by rs_c.
-" st1 {v4.s}[1],[x5],x14                     \n\t" // Store c81  into quad and increment by rs_c.
-" st1 {v4.s}[2],[x5],x14                     \n\t" // Store c82  into quad and increment by rs_c.
-" st1 {v4.s}[3],[x5],x14                     \n\t" // Store c83  into quad and increment by rs_c.
-" st1 {v5.s}[0],[x5],x14                     \n\t" // Store c84  into quad and increment by rs_c.
-" st1 {v5.s}[1],[x5],x14                     \n\t" // Store c85  into quad and increment by rs_c.
-" st1 {v5.s}[2],[x5],x14                     \n\t" // Store c86  into quad and increment by rs_c.
-" st1 {v5.s}[3],[x5],x14                     \n\t" // Store c87  into quad and increment by rs_c.
-"                                            \n\t"
-" dup  v8.4s, wzr                            \n\t"
-" dup  v9.4s, wzr                            \n\t"
-" dup  v10.4s, wzr                           \n\t"
-" dup  v11.4s, wzr                           \n\t"
-" dup  v12.4s, wzr                           \n\t"
-" dup  v13.4s, wzr                           \n\t"
-"                                            \n\t"
-" fcmp s7,#0.0                               \n\t"
-BEQ(SBETAZEROGENSTOREDS4)                          // Taking care of the beta==0 case.
-"                                            \n\t"
-" mov x5, x25                                \n\t"
-"                                            \n\t"
-" ld1 {v8.s}[0],[x5],x14                     \n\t" // Load c90  into quad and increment by rs_c.
-" ld1 {v8.s}[1],[x5],x14                     \n\t" // Load c91  into quad and increment by rs_c.
-" ld1 {v8.s}[2],[x5],x14                     \n\t" // Load c92  into quad and increment by rs_c.
-" ld1 {v8.s}[3],[x5],x14                     \n\t" // Load c93  into quad and increment by rs_c.
-" ld1 {v9.s}[0],[x5],x14                     \n\t" // Load c94  into quad and increment by rs_c.
-" ld1 {v9.s}[1],[x5],x14                     \n\t" // Load c95  into quad and increment by rs_c.
-" ld1 {v9.s}[2],[x5],x14                     \n\t" // Load c96  into quad and increment by rs_c.
-" ld1 {v9.s}[3],[x5],x14                     \n\t" // Load c97  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x5, x26                                \n\t"
-"                                            \n\t"
-" ld1 {v10.s}[0],[x5],x14                    \n\t" // Load c100  into quad and increment by rs_c.
-" ld1 {v10.s}[1],[x5],x14                    \n\t" // Load c101  into quad and increment by rs_c.
-" ld1 {v10.s}[2],[x5],x14                    \n\t" // Load c102  into quad and increment by rs_c.
-" ld1 {v10.s}[3],[x5],x14                    \n\t" // Load c103  into quad and increment by rs_c.
-" ld1 {v11.s}[0],[x5],x14                    \n\t" // Load c104  into quad and increment by rs_c.
-" ld1 {v11.s}[1],[x5],x14                    \n\t" // Load c105  into quad and increment by rs_c.
-" ld1 {v11.s}[2],[x5],x14                    \n\t" // Load c106  into quad and increment by rs_c.
-" ld1 {v11.s}[3],[x5],x14                    \n\t" // Load c107  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x5, x27                                \n\t"
-"                                            \n\t"
-" ld1 {v12.s}[0],[x5],x14                    \n\t" // Load c110  into quad and increment by rs_c.
-" ld1 {v12.s}[1],[x5],x14                    \n\t" // Load c111  into quad and increment by rs_c.
-" ld1 {v12.s}[2],[x5],x14                    \n\t" // Load c112  into quad and increment by rs_c.
-" ld1 {v12.s}[3],[x5],x14                    \n\t" // Load c113  into quad and increment by rs_c.
-" ld1 {v13.s}[0],[x5],x14                    \n\t" // Load c114  into quad and increment by rs_c.
-" ld1 {v13.s}[1],[x5],x14                    \n\t" // Load c115  into quad and increment by rs_c.
-" ld1 {v13.s}[2],[x5],x14                    \n\t" // Load c116  into quad and increment by rs_c.
-" ld1 {v13.s}[3],[x5],x14                    \n\t" // Load c117  into quad and increment by rs_c.
-"                                            \n\t"
-" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
-" fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
-" fmul v10.4s,v10.4s,v7.s[0]                 \n\t" // Scale by beta
-" fmul v11.4s,v11.4s,v7.s[0]                 \n\t" // Scale by beta
-" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
-" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
-"                                            \n\t"
-LABEL(SBETAZEROGENSTOREDS4)
-"                                            \n\t"
-" prfm pldl2keep,[x0]                        \n\t"
-" prfm pldl2keep,[x1]                        \n\t"
-"                                            \n\t"
-" fmla v8.4s, v26.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v9.4s, v27.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v10.4s,v28.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v11.4s,v29.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v12.4s,v30.4s,v6.s[0]                 \n\t" // Scale by alpha
-" fmla v13.4s,v31.4s,v6.s[0]                 \n\t" // Scale by alpha
-"                                            \n\t"
-" mov x5, x25                                \n\t"
-"                                            \n\t"
-" st1 {v8.s}[0],[x5],x14                     \n\t" // Store c90  into quad and increment by rs_c.
-" st1 {v8.s}[1],[x5],x14                     \n\t" // Store c91  into quad and increment by rs_c.
-" st1 {v8.s}[2],[x5],x14                     \n\t" // Store c92  into quad and increment by rs_c.
-" st1 {v8.s}[3],[x5],x14                     \n\t" // Store c93  into quad and increment by rs_c.
-" st1 {v9.s}[0],[x5],x14                     \n\t" // Store c94  into quad and increment by rs_c.
-" st1 {v9.s}[1],[x5],x14                     \n\t" // Store c95  into quad and increment by rs_c.
-" st1 {v9.s}[2],[x5],x14                     \n\t" // Store c96  into quad and increment by rs_c.
-" st1 {v9.s}[3],[x5],x14                     \n\t" // Store c97  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x5, x26                                \n\t"
-"                                            \n\t"
-" st1 {v10.s}[0],[x5],x14                    \n\t" // Store c100  into quad and increment by rs_c.
-" st1 {v10.s}[1],[x5],x14                    \n\t" // Store c101  into quad and increment by rs_c.
-" st1 {v10.s}[2],[x5],x14                    \n\t" // Store c102  into quad and increment by rs_c.
-" st1 {v10.s}[3],[x5],x14                    \n\t" // Store c103  into quad and increment by rs_c.
-" st1 {v11.s}[0],[x5],x14                    \n\t" // Store c104  into quad and increment by rs_c.
-" st1 {v11.s}[1],[x5],x14                    \n\t" // Store c105  into quad and increment by rs_c.
-" st1 {v11.s}[2],[x5],x14                    \n\t" // Store c106  into quad and increment by rs_c.
-" st1 {v11.s}[3],[x5],x14                    \n\t" // Store c107  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x5, x27                                \n\t"
-"                                            \n\t"
-" st1 {v12.s}[0],[x5],x14                    \n\t" // Store c110  into quad and increment by rs_c.
-" st1 {v12.s}[1],[x5],x14                    \n\t" // Store c111  into quad and increment by rs_c.
-" st1 {v12.s}[2],[x5],x14                    \n\t" // Store c112  into quad and increment by rs_c.
-" st1 {v12.s}[3],[x5],x14                    \n\t" // Store c113  into quad and increment by rs_c.
-" st1 {v13.s}[0],[x5],x14                    \n\t" // Store c114  into quad and increment by rs_c.
-" st1 {v13.s}[1],[x5],x14                    \n\t" // Store c115  into quad and increment by rs_c.
-" st1 {v13.s}[2],[x5],x14                    \n\t" // Store c116  into quad and increment by rs_c.
-" st1 {v13.s}[3],[x5],x14                    \n\t" // Store c147  into quad and increment by rs_c.
-"                                            \n\t"
-LABEL(SEND)                                        // Done!
-"                                            \n\t"
-:// output operands (none)
-:// input operands
- [aaddr]  "m" (a),      // 0
- [baddr]  "m" (b),      // 1
- [caddr]  "m" (c),      // 2
- [k_iter] "m" (k_iter), // 3
- [k_left] "m" (k_left), // 4
- [alpha]  "m" (alpha),  // 5
- [beta]   "m" (beta),   // 6
- [rs_c]   "m" (rs_c),   // 7
- [cs_c]   "m" (cs_c),   // 8
- [a_next] "m" (a_next), // 9
- [b_next] "m" (b_next) // 10
-:// Register clobber list
- "x0", "x1", "x2",
- "x5", "x6", "x10","x14",
- "x16","x17","x19","x20",
- "x21","x22","x23","x24",
- "x25","x26","x27",
- "v0", "v1", "v2", "v3",
- "v4", "v5", "v6", "v7",
- "v8", "v9", "v10","v11",
- "v12","v13","v14","v15",
- "v16","v17","v18","v19",
- "v20","v21","v22","v23",
- "v24","v25","v26","v27",
- "v28","v29","v30","v31"
-);
 
+	__asm__ volatile
+	(
+	"                                            \n\t"
+	"                                            \n\t"
+	" ldr x0,%[aaddr]                            \n\t" // Load address of A.
+	" ldr x1,%[baddr]                            \n\t" // Load address of B.
+	" ldr x2,%[caddr]                            \n\t" // Load address of C.
+	"                                            \n\t"
+	" ldr x5,%[k_iter]                           \n\t" // Number of unrolled iterations (k_iter).
+	" ldr x6,%[k_left]                           \n\t" // Number of remaining iterations (k_left).
+	"                                            \n\t"
+	" ldr x10,%[cs_c]                            \n\t" // Load cs_c.
+	" lsl x10,x10,#2                             \n\t" // cs_c * sizeof(float) -- AUX.
+	"                                            \n\t"
+	" ldr x14,%[rs_c]                            \n\t" // Load rs_c.
+	" lsl x14,x14,#2                             \n\t" // rs_c * sizeof(float).
+	"                                            \n\t"
+	" add x16,x2,x10                             \n\t" //Load address Column 1 of C
+	" add x17,x16,x10                            \n\t" //Load address Column 2 of C
+	" add x19,x17,x10                            \n\t" //Load address Column 3 of C
+	" add x20,x19,x10                            \n\t" //Load address Column 4 of C
+	" add x21,x20,x10                            \n\t" //Load address Column 5 of C
+	" add x22,x21,x10                            \n\t" //Load address Column 6 of C
+	" add x23,x22,x10                            \n\t" //Load address Column 7 of C
+	" add x24,x23,x10                            \n\t" //Load address Column 8 of C
+	" add x25,x24,x10                            \n\t" //Load address Column 9 of C
+	" add x26,x25,x10                            \n\t" //Load address Column 10 of C
+	" add x27,x26,x10                            \n\t" //Load address Column 11 of C
+	"                                            \n\t"
+	" prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
+	" prfm pldl1keep,[x16]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x17]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x19]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x20]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x21]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x22]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x23]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x24]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x27]                       \n\t" // Prefetch c.
+	"                                            \n\t"
+	" dup  v8.4s, wzr                            \n\t" // Vector for accummulating column 0
+	" prfm    PLDL1KEEP, [x1, #192]              \n\t"
+	" dup  v9.4s, wzr                            \n\t" // Vector for accummulating column 0
+	" prfm    PLDL1KEEP, [x1, #256]              \n\t"
+	" dup  v10.4s, wzr                           \n\t" // Vector for accummulating column 1
+	" prfm    PLDL1KEEP, [x1, #320]              \n\t"
+	" dup  v11.4s, wzr                           \n\t" // Vector for accummulating column 1
+	" dup  v12.4s, wzr                           \n\t" // Vector for accummulating column 2
+	" dup  v13.4s, wzr                           \n\t" // Vector for accummulating column 2
+	"                                            \n\t"
+	" dup  v14.4s, wzr                           \n\t" // Vector for accummulating column 3
+	" prfm    PLDL1KEEP, [x0, #128]              \n\t"
+	" dup  v15.4s, wzr                           \n\t" // Vector for accummulating column 3
+	" prfm    PLDL1KEEP, [x0, #192]              \n\t"
+	" dup  v16.4s, wzr                           \n\t" // Vector for accummulating column 4
+	" dup  v17.4s, wzr                           \n\t" // Vector for accummulating column 4
+	" dup  v18.4s, wzr                           \n\t" // Vector for accummulating column 5
+	" dup  v19.4s, wzr                           \n\t" // Vector for accummulating column 5
+	"                                            \n\t"
+	" dup  v20.4s, wzr                           \n\t" // Vector for accummulating column 6
+	" dup  v21.4s, wzr                           \n\t" // Vector for accummulating column 6
+	" dup  v22.4s, wzr                           \n\t" // Vector for accummulating column 7
+	" dup  v23.4s, wzr                           \n\t" // Vector for accummulating column 7
+	" dup  v24.4s, wzr                           \n\t" // Vector for accummulating column 8
+	" dup  v25.4s, wzr                           \n\t" // Vector for accummulating column 8
+	"                                            \n\t"
+	" dup  v26.4s, wzr                           \n\t" // Vector for accummulating column 9
+	" dup  v27.4s, wzr                           \n\t" // Vector for accummulating column 9
+	" dup  v28.4s, wzr                           \n\t" // Vector for accummulating column 10
+	" dup  v29.4s, wzr                           \n\t" // Vector for accummulating column 10
+	" dup  v30.4s, wzr                           \n\t" // Vector for accummulating column 11
+	" dup  v31.4s, wzr                           \n\t" // Vector for accummulating column 11
+	"                                            \n\t"
+	" cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
+	BEQ(SCONSIDERKLEFT)
+	"                                            \n\t"
+	" ldr q0, [x0]                               \n\t"
+	" ldr q1, [x0, #16]                          \n\t" // Load a
+	"                                            \n\t"
+	" ldr q2, [x1]                               \n\t" // Load b
+	" ldr q3, [x1, #16]                          \n\t"
+	" ldr q4, [x1, #32]                          \n\t"
+	"                                            \n\t"
+	" add x0, x0, #32                            \n\t" //update address of A
+	" add x1, x1, #48                            \n\t" //update address of B
+	"                                            \n\t"
+	" cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one.
+	BEQ(SLASTITER)                                     // (as loop is do-while-like).
+	"                                            \n\t"
+	LABEL(SLOOPKITER)                                  // Body of the k_iter loop.
+	"                                            \n\t"
+	" ldr q5, [x0]                               \n\t"
+	" fmla v8.4s, v0.4s,v2.s[0]                  \n\t" // Accummulate.
+	" fmla v9.4s, v1.4s,v2.s[0]                  \n\t" // Accummulate.
+	" ldr q6, [x0, #16]                          \n\t"
+	" fmla v10.4s,v0.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v11.4s,v1.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v12.4s,v0.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v13.4s,v1.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v14.4s,v0.4s,v2.s[3]                  \n\t" // Accummulate.
+	" fmla v15.4s,v1.4s,v2.s[3]                  \n\t" // Accummulate.
+	" ldr q2, [x1]                               \n\t"
+	"                                            \n\t"
+	" fmla v16.4s,v0.4s,v3.s[0]                  \n\t" // Accummulate.
+	" prfm    PLDL1KEEP, [x1, #336]              \n\t"
+	" fmla v17.4s,v1.4s,v3.s[0]                  \n\t" // Accummulate.
+	" prfm    PLDL1KEEP, [x1, #400]              \n\t"
+	" fmla v18.4s,v0.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v19.4s,v1.4s,v3.s[1]                  \n\t" // Accummulate.
+	" prfm    PLDL1KEEP, [x1, #464]              \n\t"
+	" fmla v20.4s,v0.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v21.4s,v1.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v22.4s,v0.4s,v3.s[3]                  \n\t" // Accummulate.
+	" fmla v23.4s,v1.4s,v3.s[3]                  \n\t" // Accummulate.
+	"                                            \n\t"
+	" fmla v24.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v26.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v28.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v30.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
+	" ldr q3, [x1, #16]                          \n\t"
+	"                                            \n\t"
+	" fmla v25.4s,v1.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v27.4s,v1.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
+	" ldr q4, [x1, #32]                          \n\t"
+	"                                            \n\t" //End It 1
+	"                                            \n\t"
+	" ldr q0, [x0, #32]                          \n\t"
+	" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
+	" fmla v9.4s,v6.4s,v2.s[0]                   \n\t" // Accummulate.
+	" ldr q1, [x0, #48]                          \n\t"
+	" fmla v10.4s,v5.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v11.4s,v6.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v12.4s,v5.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v13.4s,v6.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v14.4s,v5.4s,v2.s[3]                  \n\t" // Accummulate.
+	" fmla v15.4s,v6.4s,v2.s[3]                  \n\t" // Accummulate.
+	" ldr q2, [x1, #48]                          \n\t"
+	"                                            \n\t"
+	" fmla v16.4s,v5.4s,v3.s[0]                  \n\t" // Accummulate.
+	" prfm    PLDL1KEEP, [x0, #224]              \n\t"
+	" fmla v17.4s,v6.4s,v3.s[0]                  \n\t" // Accummulate.
+	" prfm    PLDL1KEEP, [x0, #288]              \n\t"
+	" fmla v18.4s,v5.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v19.4s,v6.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v20.4s,v5.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v21.4s,v6.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v22.4s,v5.4s,v3.s[3]                  \n\t" // Accummulate.
+	" fmla v23.4s,v6.4s,v3.s[3]                  \n\t" // Accummulate.
+	"                                            \n\t"
+	" fmla v24.4s,v5.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v26.4s,v5.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v28.4s,v5.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v30.4s,v5.4s,v4.s[3]                  \n\t" // Accummulate.
+	" ldr q3, [x1, #64]                          \n\t"
+	"                                            \n\t"
+	" fmla v25.4s,v6.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v27.4s,v6.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v29.4s,v6.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
+	" ldr q4, [x1, #80]                          \n\t"
+	"                                            \n\t" //End It 2
+	"                                            \n\t"
+	" ldr q5, [x0, #64]                          \n\t"
+	" fmla v8.4s,v0.4s,v2.s[0]                   \n\t" // Accummulate.
+	" fmla v9.4s,v1.4s,v2.s[0]                   \n\t" // Accummulate.
+	" ldr q6, [x0, #80]                          \n\t"
+	" fmla v10.4s,v0.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v11.4s,v1.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v12.4s,v0.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v13.4s,v1.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v14.4s,v0.4s,v2.s[3]                  \n\t" // Accummulate.
+	" fmla v15.4s,v1.4s,v2.s[3]                  \n\t" // Accummulate.
+	" ldr q2, [x1, #96]                          \n\t"
+	"                                            \n\t"
+	" fmla v16.4s,v0.4s,v3.s[0]                  \n\t" // Accummulate.
+	" fmla v17.4s,v1.4s,v3.s[0]                  \n\t" // Accummulate.
+	" fmla v18.4s,v0.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v19.4s,v1.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v20.4s,v0.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v21.4s,v1.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v22.4s,v0.4s,v3.s[3]                  \n\t" // Accummulate.
+	" fmla v23.4s,v1.4s,v3.s[3]                  \n\t" // Accummulate.
+	"                                            \n\t"
+	" fmla v24.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v26.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v28.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v30.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
+	" ldr q3, [x1, #112]                         \n\t"
+	"                                            \n\t"
+	" fmla v25.4s,v1.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v27.4s,v1.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
+	" ldr q4, [x1, #128]                         \n\t"
+	"                                            \n\t" //End It 3
+	"                                            \n\t"
+	" ldr q0, [x0, #96]                          \n\t"
+	" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
+	" fmla v9.4s,v6.4s,v2.s[0]                   \n\t" // Accummulate.
+	" ldr q1, [x0, #112]                         \n\t"
+	" fmla v10.4s,v5.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v11.4s,v6.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v12.4s,v5.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v13.4s,v6.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v14.4s,v5.4s,v2.s[3]                  \n\t" // Accummulate.
+	" fmla v15.4s,v6.4s,v2.s[3]                  \n\t" // Accummulate.
+	" ldr q2, [x1, #144]                         \n\t"
+	"                                            \n\t"
+	" fmla v16.4s,v5.4s,v3.s[0]                  \n\t" // Accummulate.
+	" fmla v17.4s,v6.4s,v3.s[0]                  \n\t" // Accummulate.
+	" fmla v18.4s,v5.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v19.4s,v6.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v20.4s,v5.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v21.4s,v6.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v22.4s,v5.4s,v3.s[3]                  \n\t" // Accummulate.
+	" fmla v23.4s,v6.4s,v3.s[3]                  \n\t" // Accummulate.
+	"                                            \n\t"
+	" fmla v24.4s,v5.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v26.4s,v5.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v28.4s,v5.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v30.4s,v5.4s,v4.s[3]                  \n\t" // Accummulate.
+	" ldr q3, [x1, #160]                         \n\t"
+	"                                            \n\t"
+	" fmla v25.4s,v6.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v27.4s,v6.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v29.4s,v6.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
+	" ldr q4, [x1, #176]                         \n\t"
+	" add x1, x1, #192                           \n\t"
+	" add x0, x0, #128                           \n\t"
+	"                                            \n\t" //End It 4
+	" sub x5,x5,1                                \n\t" // i-=1.
+	" cmp x5,1                                   \n\t" // Iterate again if we are not in k_iter == 1.
+	BNE(SLOOPKITER)
+	"                                            \n\t"
+	LABEL(SLASTITER)                                   // Last iteration of k_iter loop.
+	"                                            \n\t"
+	"                                            \n\t"
+	" ldr q5, [x0]                               \n\t"
+	" fmla v8.4s,v0.4s,v2.s[0]                   \n\t" // Accummulate.
+	" fmla v9.4s,v1.4s,v2.s[0]                   \n\t" // Accummulate.
+	" ldr q6, [x0, #16]                          \n\t"
+	" fmla v10.4s,v0.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v11.4s,v1.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v12.4s,v0.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v13.4s,v1.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v14.4s,v0.4s,v2.s[3]                  \n\t" // Accummulate.
+	" fmla v15.4s,v1.4s,v2.s[3]                  \n\t" // Accummulate.
+	" ldr q2, [x1]                               \n\t"
+	"                                            \n\t"
+	" fmla v16.4s,v0.4s,v3.s[0]                  \n\t" // Accummulate.
+	" fmla v17.4s,v1.4s,v3.s[0]                  \n\t" // Accummulate.
+	" fmla v18.4s,v0.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v19.4s,v1.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v20.4s,v0.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v21.4s,v1.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v22.4s,v0.4s,v3.s[3]                  \n\t" // Accummulate.
+	" fmla v23.4s,v1.4s,v3.s[3]                  \n\t" // Accummulate.
+	"                                            \n\t"
+	" fmla v24.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v26.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v28.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v30.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
+	" ldr q3, [x1, #16]                          \n\t"
+	"                                            \n\t"
+	" fmla v25.4s,v1.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v27.4s,v1.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
+	" ldr q4, [x1, #32]                          \n\t"
+	"                                            \n\t" //End It 1
+	"                                            \n\t"
+	" ldr q0, [x0, #32]                          \n\t"
+	" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
+	" fmla v9.4s,v6.4s,v2.s[0]                   \n\t" // Accummulate.
+	" ldr q1, [x0, #48]                          \n\t"
+	" fmla v10.4s,v5.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v11.4s,v6.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v12.4s,v5.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v13.4s,v6.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v14.4s,v5.4s,v2.s[3]                  \n\t" // Accummulate.
+	" fmla v15.4s,v6.4s,v2.s[3]                  \n\t" // Accummulate.
+	" ldr q2, [x1, #48]                          \n\t"
+	"                                            \n\t"
+	" fmla v16.4s,v5.4s,v3.s[0]                  \n\t" // Accummulate.
+	" fmla v17.4s,v6.4s,v3.s[0]                  \n\t" // Accummulate.
+	" fmla v18.4s,v5.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v19.4s,v6.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v20.4s,v5.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v21.4s,v6.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v22.4s,v5.4s,v3.s[3]                  \n\t" // Accummulate.
+	" fmla v23.4s,v6.4s,v3.s[3]                  \n\t" // Accummulate.
+	"                                            \n\t"
+	" fmla v24.4s,v5.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v26.4s,v5.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v28.4s,v5.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v30.4s,v5.4s,v4.s[3]                  \n\t" // Accummulate.
+	" ldr q3, [x1, #64]                          \n\t"
+	"                                            \n\t"
+	" fmla v25.4s,v6.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v27.4s,v6.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v29.4s,v6.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
+	" ldr q4, [x1, #80]                          \n\t"
+	"                                            \n\t" //End It 2
+	"                                            \n\t"
+	" ldr q5, [x0, #64]                          \n\t"
+	" fmla v8.4s,v0.4s,v2.s[0]                   \n\t" // Accummulate.
+	" fmla v9.4s,v1.4s,v2.s[0]                   \n\t" // Accummulate.
+	" ldr q6, [x0, #80]                          \n\t"
+	" fmla v10.4s,v0.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v11.4s,v1.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v12.4s,v0.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v13.4s,v1.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v14.4s,v0.4s,v2.s[3]                  \n\t" // Accummulate.
+	" fmla v15.4s,v1.4s,v2.s[3]                  \n\t" // Accummulate.
+	" ldr q2, [x1, #96]                          \n\t"
+	"                                            \n\t"
+	" fmla v16.4s,v0.4s,v3.s[0]                  \n\t" // Accummulate.
+	" fmla v17.4s,v1.4s,v3.s[0]                  \n\t" // Accummulate.
+	" fmla v18.4s,v0.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v19.4s,v1.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v20.4s,v0.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v21.4s,v1.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v22.4s,v0.4s,v3.s[3]                  \n\t" // Accummulate.
+	" fmla v23.4s,v1.4s,v3.s[3]                  \n\t" // Accummulate.
+	"                                            \n\t"
+	" fmla v24.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v26.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v28.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v30.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
+	" ldr q3, [x1, #112]                         \n\t"
+	"                                            \n\t"
+	" fmla v25.4s,v1.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v27.4s,v1.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
+	" ldr q4, [x1, #128]                         \n\t"
+	"                                            \n\t" //End It 3
+	"                                            \n\t"
+	" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
+	" fmla v9.4s,v6.4s,v2.s[0]                   \n\t" // Accummulate.
+	" fmla v10.4s,v5.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v11.4s,v6.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v12.4s,v5.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v13.4s,v6.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v14.4s,v5.4s,v2.s[3]                  \n\t" // Accummulate.
+	" fmla v15.4s,v6.4s,v2.s[3]                  \n\t" // Accummulate.
+	"                                            \n\t"
+	" fmla v16.4s,v5.4s,v3.s[0]                  \n\t" // Accummulate.
+	" fmla v17.4s,v6.4s,v3.s[0]                  \n\t" // Accummulate.
+	" fmla v18.4s,v5.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v19.4s,v6.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v20.4s,v5.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v21.4s,v6.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v22.4s,v5.4s,v3.s[3]                  \n\t" // Accummulate.
+	" fmla v23.4s,v6.4s,v3.s[3]                  \n\t" // Accummulate.
+	"                                            \n\t"
+	" fmla v24.4s,v5.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v26.4s,v5.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v28.4s,v5.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v30.4s,v5.4s,v4.s[3]                  \n\t" // Accummulate.
+	"                                            \n\t"
+	" fmla v25.4s,v6.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v27.4s,v6.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v29.4s,v6.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
+	" add x1, x1, #144                           \n\t"
+	" add x0, x0, #96                            \n\t"
+	"                                            \n\t" //End It 4
+	"                                            \n\t"
+	LABEL(SCONSIDERKLEFT)
+	" cmp x6,0                                   \n\t" // If k_left == 0, we are done.
+	BEQ(SPOSTACCUM)                                    // else, we enter the k_left loop.
+	"                                            \n\t"
+	LABEL(SLOOPKLEFT)                                  // Body of the left iterations
+	"                                            \n\t"
+	" ldr q0, [x0],#16                           \n\t"
+	" ldr q1, [x0],#16                           \n\t" // Load a
+	"                                            \n\t"
+	" ldr q2, [x1],#16                           \n\t" // Load b
+	" ldr q3, [x1],#16                           \n\t"
+	" ldr q4, [x1],#16                           \n\t"
+	"                                            \n\t"
+	" sub x6,x6,1                                \n\t" // i = i-1.
+	"                                            \n\t"
+	" fmla v8.4s,v0.4s,v2.s[0]                   \n\t" // Accummulate.
+	" fmla v9.4s,v1.4s,v2.s[0]                   \n\t" // Accummulate.
+	" fmla v10.4s,v0.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v11.4s,v1.4s,v2.s[1]                  \n\t" // Accummulate.
+	" fmla v12.4s,v0.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v13.4s,v1.4s,v2.s[2]                  \n\t" // Accummulate.
+	" fmla v14.4s,v0.4s,v2.s[3]                  \n\t" // Accummulate.
+	" fmla v15.4s,v1.4s,v2.s[3]                  \n\t" // Accummulate.
+	"                                            \n\t"
+	" fmla v16.4s,v0.4s,v3.s[0]                  \n\t" // Accummulate.
+	" fmla v17.4s,v1.4s,v3.s[0]                  \n\t" // Accummulate.
+	" fmla v18.4s,v0.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v19.4s,v1.4s,v3.s[1]                  \n\t" // Accummulate.
+	" fmla v20.4s,v0.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v21.4s,v1.4s,v3.s[2]                  \n\t" // Accummulate.
+	" fmla v22.4s,v0.4s,v3.s[3]                  \n\t" // Accummulate.
+	" fmla v23.4s,v1.4s,v3.s[3]                  \n\t" // Accummulate.
+	"                                            \n\t"
+	" fmla v24.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v26.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v28.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v30.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
+	" fmla v25.4s,v1.4s,v4.s[0]                  \n\t" // Accummulate.
+	" fmla v27.4s,v1.4s,v4.s[1]                  \n\t" // Accummulate.
+	" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
+	" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
+	"                                            \n\t"
+	" cmp x6,0                                   \n\t" // Iterate again.
+	BNE(SLOOPKLEFT)                                    // if i!=0.
+	"                                            \n\t"
+	LABEL(SPOSTACCUM)
+	"                                            \n\t"
+	" ldr x0,%[alpha]                            \n\t" // Alpha address.
+	" ldr x1,%[beta]                             \n\t" // Beta address.
+	"                                            \n\t"
+	" ld1r {v6.4s},[x0]                          \n\t" // Load alpha.
+	" ld1r {v7.4s},[x1]                          \n\t" // Load beta
+	"                                            \n\t"
+	" ldr x0,%[a_next]                           \n\t" // Pointer to next block of A.
+	" ldr x1,%[b_next]                           \n\t" // Pointer to next block of B.
+	"                                            \n\t"
+	" cmp x14,#4                                 \n\t" // Is C column-major (rs_c == 1 element, i.e. 4 bytes)? If not, take the general-stride path.
+	BNE(SGENSTORED)
+	"                                            \n\t"
+	LABEL(SCOLSTORED)                                  // C is column-major.
+	"                                            \n\t"
+	" dup  v0.4s, wzr                            \n\t"
+	" dup  v1.4s, wzr                            \n\t"
+	" dup  v2.4s, wzr                            \n\t"
+	" dup  v3.4s, wzr                            \n\t"
+	" dup  v4.4s, wzr                            \n\t"
+	" dup  v5.4s, wzr                            \n\t"
+	"                                            \n\t"
+	" fcmp s7,#0.0                               \n\t"
+	BEQ(SBETAZEROCOLSTOREDS1)                          // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" ldr q0, [x2]                               \n\t" //Load column 0 of C
+	" ldr q1, [x2, #16]                          \n\t"
+	" ldr q2, [x16]                              \n\t" //Load column 1 of C
+	" ldr q3, [x16, #16]                         \n\t"
+	" ldr q4, [x17]                              \n\t" //Load column 2 of C
+	" ldr q5, [x17, #16]                         \n\t"
+	"                                            \n\t"
+	" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v2.4s,v2.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v3.4s,v3.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL(SBETAZEROCOLSTOREDS1)
+	"                                            \n\t"
+	" fmla v0.4s,v8.4s,v6.s[0]                   \n\t" // Scale by alpha
+	" fmla v1.4s,v9.4s,v6.s[0]                   \n\t" // Scale by alpha
+	" fmla v2.4s,v10.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v3.4s,v11.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v4.4s,v12.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v5.4s,v13.4s,v6.s[0]                  \n\t" // Scale by alpha
+	"                                            \n\t"
+	" str q0, [x2]                               \n\t" //Store column 0 of C
+	" str q1, [x2, #16]                          \n\t"
+	" str q2, [x16]                              \n\t" //Store column 1 of C
+	" str q3, [x16, #16]                         \n\t"
+	" str q4, [x17]                              \n\t" //Store column 2 of C
+	" str q5, [x17, #16]                         \n\t"
+	"                                            \n\t"
+	" dup  v8.4s, wzr                            \n\t"
+	" dup  v9.4s, wzr                            \n\t"
+	" dup  v10.4s, wzr                           \n\t"
+	" dup  v11.4s, wzr                           \n\t"
+	" dup  v12.4s, wzr                           \n\t"
+	" dup  v13.4s, wzr                           \n\t"
+	"                                            \n\t"
+	" fcmp s7,#0.0                               \n\t"
+	BEQ(SBETAZEROCOLSTOREDS2)                          // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" ldr q8, [x19]                              \n\t" //Load column 3 of C
+	" ldr q9, [x19, #16]                         \n\t"
+	" ldr q10, [x20]                             \n\t" //Load column 4 of C
+	" ldr q11, [x20, #16]                        \n\t"
+	" ldr q12, [x21]                             \n\t" //Load column 5 of C
+	" ldr q13, [x21, #16]                        \n\t"
+	"                                            \n\t"
+	" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
+	" fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
+	" fmul v10.4s,v10.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v11.4s,v11.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL(SBETAZEROCOLSTOREDS2)
+	"                                            \n\t"
+	" fmla v8.4s, v14.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v9.4s, v15.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v10.4s,v16.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v11.4s,v17.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v12.4s,v18.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v13.4s,v19.4s,v6.s[0]                 \n\t" // Scale by alpha
+	"                                            \n\t"
+	" str q8, [x19]                              \n\t" //Store column 3 of C
+	" str q9, [x19, #16]                         \n\t"
+	" str q10, [x20]                             \n\t" //Store column 4 of C
+	" str q11, [x20, #16]                        \n\t"
+	" str q12, [x21]                             \n\t" //Store column 5 of C
+	" str q13, [x21, #16]                        \n\t"
+	"                                            \n\t"
+	" dup  v0.4s, wzr                            \n\t"
+	" dup  v1.4s, wzr                            \n\t"
+	" dup  v2.4s, wzr                            \n\t"
+	" dup  v3.4s, wzr                            \n\t"
+	" dup  v4.4s, wzr                            \n\t"
+	" dup  v5.4s, wzr                            \n\t"
+	"                                            \n\t"
+	" fcmp s7,#0.0                               \n\t"
+	BEQ(SBETAZEROCOLSTOREDS3)                          // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" ldr q0, [x22]                              \n\t" //Load column 6 of C
+	" ldr q1, [x22, #16]                         \n\t"
+	" ldr q2, [x23]                              \n\t" //Load column 7 of C
+	" ldr q3, [x23, #16]                         \n\t"
+	" ldr q4, [x24]                              \n\t" //Load column 8 of C
+	" ldr q5, [x24, #16]                         \n\t"
+	"                                            \n\t"
+	" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v2.4s,v2.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v3.4s,v3.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL(SBETAZEROCOLSTOREDS3)
+	"                                            \n\t"
+	" fmla v0.4s,v20.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v1.4s,v21.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v2.4s,v22.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v3.4s,v23.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v4.4s,v24.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v5.4s,v25.4s,v6.s[0]                  \n\t" // Scale by alpha
+	"                                            \n\t"
+	" str q0, [x22]                              \n\t" //Store column 6 of C
+	" str q1, [x22, #16]                         \n\t"
+	" str q2, [x23]                              \n\t" //Store column 7 of C
+	" str q3, [x23, #16]                         \n\t"
+	" str q4, [x24]                              \n\t" //Store column 8 of C
+	" str q5, [x24, #16]                         \n\t"
+	"                                            \n\t"
+	" dup  v8.4s, wzr                            \n\t"
+	" dup  v9.4s, wzr                            \n\t"
+	" dup  v10.4s, wzr                            \n\t"
+	" dup  v11.4s, wzr                            \n\t"
+	" dup  v12.4s, wzr                            \n\t"
+	" dup  v13.4s, wzr                            \n\t"
+	"                                            \n\t"
+	" fcmp s7,#0.0                               \n\t"
+	BEQ(SBETAZEROCOLSTOREDS4)                          // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" ldr q8, [x25]                              \n\t" //Load column 9 of C
+	" ldr q9, [x25, #16]                         \n\t"
+	" ldr q10, [x26]                             \n\t" //Load column 10 of C
+	" ldr q11, [x26, #16]                        \n\t"
+	" ldr q12, [x27]                             \n\t" //Load column 11 of C
+	" ldr q13, [x27, #16]                        \n\t"
+	"                                            \n\t"
+	" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
+	" fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
+	" fmul v10.4s,v10.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v11.4s,v11.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL(SBETAZEROCOLSTOREDS4)
+	"                                            \n\t"
+	" prfm pldl2keep,[x0]                        \n\t"
+	" prfm pldl2keep,[x1]                        \n\t"
+	"                                            \n\t"
+	" fmla v8.4s, v26.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v9.4s, v27.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v10.4s,v28.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v11.4s,v29.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v12.4s,v30.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v13.4s,v31.4s,v6.s[0]                 \n\t" // Scale by alpha
+	"                                            \n\t"
+	" str q8, [x25]                              \n\t" //Store column 9 of C
+	" str q9, [x25, #16]                         \n\t"
+	" str q10, [x26]                             \n\t" //Store column 10 of C
+	" str q11, [x26, #16]                        \n\t"
+	" str q12, [x27]                             \n\t" //Store column 11 of C
+	" str q13, [x27, #16]                        \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	BRANCH(SEND)                                       // Done.
+	"                                            \n\t"
+	"                                            \n\t"
+	LABEL(SGENSTORED)                                  // C is general-stride stored.
+	"                                            \n\t"
+	"                                            \n\t"
+	" dup  v0.4s, wzr                            \n\t"
+	" dup  v1.4s, wzr                            \n\t"
+	" dup  v2.4s, wzr                            \n\t"
+	" dup  v3.4s, wzr                            \n\t"
+	" dup  v4.4s, wzr                            \n\t"
+	" dup  v5.4s, wzr                            \n\t"
+	"                                            \n\t"
+	" fcmp s7,#0.0                               \n\t"
+	BEQ(SBETAZEROGENSTOREDS1)                          // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" mov x5, x2                                 \n\t"
+	"                                            \n\t"
+	" ld1 {v0.s}[0],[x5],x14                     \n\t" // Load c00  into quad and increment by rs_c.
+	" ld1 {v0.s}[1],[x5],x14                     \n\t" // Load c01  into quad and increment by rs_c.
+	" ld1 {v0.s}[2],[x5],x14                     \n\t" // Load c02  into quad and increment by rs_c.
+	" ld1 {v0.s}[3],[x5],x14                     \n\t" // Load c03  into quad and increment by rs_c.
+	" ld1 {v1.s}[0],[x5],x14                     \n\t" // Load c04  into quad and increment by rs_c.
+	" ld1 {v1.s}[1],[x5],x14                     \n\t" // Load c05  into quad and increment by rs_c.
+	" ld1 {v1.s}[2],[x5],x14                     \n\t" // Load c06  into quad and increment by rs_c.
+	" ld1 {v1.s}[3],[x5],x14                     \n\t" // Load c07  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x16                                \n\t"
+	"                                            \n\t"
+	" ld1 {v2.s}[0],[x5],x14                     \n\t" // Load c10  into quad and increment by rs_c.
+	" ld1 {v2.s}[1],[x5],x14                     \n\t" // Load c11  into quad and increment by rs_c.
+	" ld1 {v2.s}[2],[x5],x14                     \n\t" // Load c12  into quad and increment by rs_c.
+	" ld1 {v2.s}[3],[x5],x14                     \n\t" // Load c13  into quad and increment by rs_c.
+	" ld1 {v3.s}[0],[x5],x14                     \n\t" // Load c14  into quad and increment by rs_c.
+	" ld1 {v3.s}[1],[x5],x14                     \n\t" // Load c15  into quad and increment by rs_c.
+	" ld1 {v3.s}[2],[x5],x14                     \n\t" // Load c16  into quad and increment by rs_c.
+	" ld1 {v3.s}[3],[x5],x14                     \n\t" // Load c17  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x17                                \n\t"
+	"                                            \n\t"
+	" ld1 {v4.s}[0],[x5],x14                     \n\t" // Load c20  into quad and increment by rs_c.
+	" ld1 {v4.s}[1],[x5],x14                     \n\t" // Load c21  into quad and increment by rs_c.
+	" ld1 {v4.s}[2],[x5],x14                     \n\t" // Load c22  into quad and increment by rs_c.
+	" ld1 {v4.s}[3],[x5],x14                     \n\t" // Load c23  into quad and increment by rs_c.
+	" ld1 {v5.s}[0],[x5],x14                     \n\t" // Load c24  into quad and increment by rs_c.
+	" ld1 {v5.s}[1],[x5],x14                     \n\t" // Load c25  into quad and increment by rs_c.
+	" ld1 {v5.s}[2],[x5],x14                     \n\t" // Load c26  into quad and increment by rs_c.
+	" ld1 {v5.s}[3],[x5],x14                     \n\t" // Load c27  into quad and increment by rs_c.
+	"                                            \n\t"
+	" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v2.4s,v2.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v3.4s,v3.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL(SBETAZEROGENSTOREDS1)
+	"                                            \n\t"
+	" fmla v0.4s, v8.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v1.4s, v9.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v2.4s,v10.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v3.4s,v11.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v4.4s,v12.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v5.4s,v13.4s,v6.s[0]                  \n\t" // Scale by alpha
+	"                                            \n\t"
+	" mov x5, x2                                 \n\t"
+	"                                            \n\t"
+	" st1 {v0.s}[0],[x5],x14                     \n\t" // Store c00  into quad and increment by rs_c.
+	" st1 {v0.s}[1],[x5],x14                     \n\t" // Store c01  into quad and increment by rs_c.
+	" st1 {v0.s}[2],[x5],x14                     \n\t" // Store c02  into quad and increment by rs_c.
+	" st1 {v0.s}[3],[x5],x14                     \n\t" // Store c03  into quad and increment by rs_c.
+	" st1 {v1.s}[0],[x5],x14                     \n\t" // Store c04  into quad and increment by rs_c.
+	" st1 {v1.s}[1],[x5],x14                     \n\t" // Store c05  into quad and increment by rs_c.
+	" st1 {v1.s}[2],[x5],x14                     \n\t" // Store c06  into quad and increment by rs_c.
+	" st1 {v1.s}[3],[x5],x14                     \n\t" // Store c07  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x16                                \n\t"
+	"                                            \n\t"
+	" st1 {v2.s}[0],[x5],x14                     \n\t" // Store c10  into quad and increment by rs_c.
+	" st1 {v2.s}[1],[x5],x14                     \n\t" // Store c11  into quad and increment by rs_c.
+	" st1 {v2.s}[2],[x5],x14                     \n\t" // Store c12  into quad and increment by rs_c.
+	" st1 {v2.s}[3],[x5],x14                     \n\t" // Store c13  into quad and increment by rs_c.
+	" st1 {v3.s}[0],[x5],x14                     \n\t" // Store c14  into quad and increment by rs_c.
+	" st1 {v3.s}[1],[x5],x14                     \n\t" // Store c15  into quad and increment by rs_c.
+	" st1 {v3.s}[2],[x5],x14                     \n\t" // Store c16  into quad and increment by rs_c.
+	" st1 {v3.s}[3],[x5],x14                     \n\t" // Store c17  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x17                                \n\t"
+	"                                            \n\t"
+	" st1 {v4.s}[0],[x5],x14                     \n\t" // Store c20  into quad and increment by rs_c.
+	" st1 {v4.s}[1],[x5],x14                     \n\t" // Store c21  into quad and increment by rs_c.
+	" st1 {v4.s}[2],[x5],x14                     \n\t" // Store c22  into quad and increment by rs_c.
+	" st1 {v4.s}[3],[x5],x14                     \n\t" // Store c23  into quad and increment by rs_c.
+	" st1 {v5.s}[0],[x5],x14                     \n\t" // Store c24  into quad and increment by rs_c.
+	" st1 {v5.s}[1],[x5],x14                     \n\t" // Store c25  into quad and increment by rs_c.
+	" st1 {v5.s}[2],[x5],x14                     \n\t" // Store c26  into quad and increment by rs_c.
+	" st1 {v5.s}[3],[x5],x14                     \n\t" // Store c27  into quad and increment by rs_c.
+	"                                            \n\t"
+	" dup  v8.4s, wzr                            \n\t"
+	" dup  v9.4s, wzr                            \n\t"
+	" dup  v10.4s, wzr                           \n\t"
+	" dup  v11.4s, wzr                           \n\t"
+	" dup  v12.4s, wzr                           \n\t"
+	" dup  v13.4s, wzr                           \n\t"
+	"                                            \n\t"
+	" fcmp s7,#0.0                               \n\t"
+	BEQ(SBETAZEROGENSTOREDS2)                          // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" mov x5, x19                                \n\t"
+	"                                            \n\t"
+	" ld1 {v8.s}[0],[x5],x14                     \n\t" // Load c30  into quad and increment by rs_c.
+	" ld1 {v8.s}[1],[x5],x14                     \n\t" // Load c31  into quad and increment by rs_c.
+	" ld1 {v8.s}[2],[x5],x14                     \n\t" // Load c32  into quad and increment by rs_c.
+	" ld1 {v8.s}[3],[x5],x14                     \n\t" // Load c33  into quad and increment by rs_c.
+	" ld1 {v9.s}[0],[x5],x14                     \n\t" // Load c34  into quad and increment by rs_c.
+	" ld1 {v9.s}[1],[x5],x14                     \n\t" // Load c35  into quad and increment by rs_c.
+	" ld1 {v9.s}[2],[x5],x14                     \n\t" // Load c36  into quad and increment by rs_c.
+	" ld1 {v9.s}[3],[x5],x14                     \n\t" // Load c37  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x20                                \n\t"
+	"                                            \n\t"
+	" ld1 {v10.s}[0],[x5],x14                    \n\t" // Load c40  into quad and increment by rs_c.
+	" ld1 {v10.s}[1],[x5],x14                    \n\t" // Load c41  into quad and increment by rs_c.
+	" ld1 {v10.s}[2],[x5],x14                    \n\t" // Load c42  into quad and increment by rs_c.
+	" ld1 {v10.s}[3],[x5],x14                    \n\t" // Load c43  into quad and increment by rs_c.
+	" ld1 {v11.s}[0],[x5],x14                    \n\t" // Load c44  into quad and increment by rs_c.
+	" ld1 {v11.s}[1],[x5],x14                    \n\t" // Load c45  into quad and increment by rs_c.
+	" ld1 {v11.s}[2],[x5],x14                    \n\t" // Load c46  into quad and increment by rs_c.
+	" ld1 {v11.s}[3],[x5],x14                    \n\t" // Load c47  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x21                                \n\t"
+	"                                            \n\t"
+	" ld1 {v12.s}[0],[x5],x14                    \n\t" // Load c50  into quad and increment by rs_c.
+	" ld1 {v12.s}[1],[x5],x14                    \n\t" // Load c51  into quad and increment by rs_c.
+	" ld1 {v12.s}[2],[x5],x14                    \n\t" // Load c52  into quad and increment by rs_c.
+	" ld1 {v12.s}[3],[x5],x14                    \n\t" // Load c53  into quad and increment by rs_c.
+	" ld1 {v13.s}[0],[x5],x14                    \n\t" // Load c54  into quad and increment by rs_c.
+	" ld1 {v13.s}[1],[x5],x14                    \n\t" // Load c55  into quad and increment by rs_c.
+	" ld1 {v13.s}[2],[x5],x14                    \n\t" // Load c56  into quad and increment by rs_c.
+	" ld1 {v13.s}[3],[x5],x14                    \n\t" // Load c57  into quad and increment by rs_c.
+	"                                            \n\t"
+	" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
+	" fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
+	" fmul v10.4s,v10.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v11.4s,v11.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL(SBETAZEROGENSTOREDS2)
+	"                                            \n\t"
+	" fmla v8.4s, v14.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v9.4s, v15.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v10.4s,v16.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v11.4s,v17.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v12.4s,v18.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v13.4s,v19.4s,v6.s[0]                 \n\t" // Scale by alpha
+	"                                            \n\t"
+	" mov x5, x19                                \n\t"
+	"                                            \n\t"
+	" st1 {v8.s}[0],[x5],x14                     \n\t" // Store c30  into quad and increment by rs_c.
+	" st1 {v8.s}[1],[x5],x14                     \n\t" // Store c31  into quad and increment by rs_c.
+	" st1 {v8.s}[2],[x5],x14                     \n\t" // Store c32  into quad and increment by rs_c.
+	" st1 {v8.s}[3],[x5],x14                     \n\t" // Store c33  into quad and increment by rs_c.
+	" st1 {v9.s}[0],[x5],x14                     \n\t" // Store c34  into quad and increment by rs_c.
+	" st1 {v9.s}[1],[x5],x14                     \n\t" // Store c35  into quad and increment by rs_c.
+	" st1 {v9.s}[2],[x5],x14                     \n\t" // Store c36  into quad and increment by rs_c.
+	" st1 {v9.s}[3],[x5],x14                     \n\t" // Store c37  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x20                                \n\t"
+	"                                            \n\t"
+	" st1 {v10.s}[0],[x5],x14                    \n\t" // Store c40  into quad and increment by rs_c.
+	" st1 {v10.s}[1],[x5],x14                    \n\t" // Store c41  into quad and increment by rs_c.
+	" st1 {v10.s}[2],[x5],x14                    \n\t" // Store c42  into quad and increment by rs_c.
+	" st1 {v10.s}[3],[x5],x14                    \n\t" // Store c43  into quad and increment by rs_c.
+	" st1 {v11.s}[0],[x5],x14                    \n\t" // Store c44  into quad and increment by rs_c.
+	" st1 {v11.s}[1],[x5],x14                    \n\t" // Store c45  into quad and increment by rs_c.
+	" st1 {v11.s}[2],[x5],x14                    \n\t" // Store c46  into quad and increment by rs_c.
+	" st1 {v11.s}[3],[x5],x14                    \n\t" // Store c47  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x21                                \n\t"
+	"                                            \n\t"
+	" st1 {v12.s}[0],[x5],x14                    \n\t" // Store c50  into quad and increment by rs_c.
+	" st1 {v12.s}[1],[x5],x14                    \n\t" // Store c51  into quad and increment by rs_c.
+	" st1 {v12.s}[2],[x5],x14                    \n\t" // Store c52  into quad and increment by rs_c.
+	" st1 {v12.s}[3],[x5],x14                    \n\t" // Store c53  into quad and increment by rs_c.
+	" st1 {v13.s}[0],[x5],x14                    \n\t" // Store c54  into quad and increment by rs_c.
+	" st1 {v13.s}[1],[x5],x14                    \n\t" // Store c55  into quad and increment by rs_c.
+	" st1 {v13.s}[2],[x5],x14                    \n\t" // Store c56  into quad and increment by rs_c.
+	" st1 {v13.s}[3],[x5],x14                    \n\t" // Store c57  into quad and increment by rs_c.
+	"                                            \n\t"
+	" dup  v0.4s, wzr                            \n\t"
+	" dup  v1.4s, wzr                            \n\t"
+	" dup  v2.4s, wzr                            \n\t"
+	" dup  v3.4s, wzr                            \n\t"
+	" dup  v4.4s, wzr                            \n\t"
+	" dup  v5.4s, wzr                            \n\t"
+	"                                            \n\t"
+	" fcmp s7,#0.0                               \n\t"
+	BEQ(SBETAZEROGENSTOREDS3)                          // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" mov x5, x22                                \n\t"
+	"                                            \n\t"
+	" ld1 {v0.s}[0],[x5],x14                     \n\t" // Load c60  into quad and increment by rs_c.
+	" ld1 {v0.s}[1],[x5],x14                     \n\t" // Load c61  into quad and increment by rs_c.
+	" ld1 {v0.s}[2],[x5],x14                     \n\t" // Load c62  into quad and increment by rs_c.
+	" ld1 {v0.s}[3],[x5],x14                     \n\t" // Load c63  into quad and increment by rs_c.
+	" ld1 {v1.s}[0],[x5],x14                     \n\t" // Load c64  into quad and increment by rs_c.
+	" ld1 {v1.s}[1],[x5],x14                     \n\t" // Load c65  into quad and increment by rs_c.
+	" ld1 {v1.s}[2],[x5],x14                     \n\t" // Load c66  into quad and increment by rs_c.
+	" ld1 {v1.s}[3],[x5],x14                     \n\t" // Load c67  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x23                                \n\t"
+	"                                            \n\t"
+	" ld1 {v2.s}[0],[x5],x14                     \n\t" // Load c70  into quad and increment by rs_c.
+	" ld1 {v2.s}[1],[x5],x14                     \n\t" // Load c71  into quad and increment by rs_c.
+	" ld1 {v2.s}[2],[x5],x14                     \n\t" // Load c72  into quad and increment by rs_c.
+	" ld1 {v2.s}[3],[x5],x14                     \n\t" // Load c73  into quad and increment by rs_c.
+	" ld1 {v3.s}[0],[x5],x14                     \n\t" // Load c74  into quad and increment by rs_c.
+	" ld1 {v3.s}[1],[x5],x14                     \n\t" // Load c75  into quad and increment by rs_c.
+	" ld1 {v3.s}[2],[x5],x14                     \n\t" // Load c76  into quad and increment by rs_c.
+	" ld1 {v3.s}[3],[x5],x14                     \n\t" // Load c77  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x24                                \n\t"
+	"                                            \n\t"
+	" ld1 {v4.s}[0],[x5],x14                     \n\t" // Load c80  into quad and increment by rs_c.
+	" ld1 {v4.s}[1],[x5],x14                     \n\t" // Load c81  into quad and increment by rs_c.
+	" ld1 {v4.s}[2],[x5],x14                     \n\t" // Load c82  into quad and increment by rs_c.
+	" ld1 {v4.s}[3],[x5],x14                     \n\t" // Load c83  into quad and increment by rs_c.
+	" ld1 {v5.s}[0],[x5],x14                     \n\t" // Load c84  into quad and increment by rs_c.
+	" ld1 {v5.s}[1],[x5],x14                     \n\t" // Load c85  into quad and increment by rs_c.
+	" ld1 {v5.s}[2],[x5],x14                     \n\t" // Load c86  into quad and increment by rs_c.
+	" ld1 {v5.s}[3],[x5],x14                     \n\t" // Load c87  into quad and increment by rs_c.
+	"                                            \n\t"
+	" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v2.4s,v2.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v3.4s,v3.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL(SBETAZEROGENSTOREDS3)
+	"                                            \n\t"
+	" fmla v0.4s,v20.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v1.4s,v21.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v2.4s,v22.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v3.4s,v23.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v4.4s,v24.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v5.4s,v25.4s,v6.s[0]                  \n\t" // Scale by alpha
+	"                                            \n\t"
+	" mov x5, x22                                \n\t"
+	"                                            \n\t"
+	" st1 {v0.s}[0],[x5],x14                     \n\t" // Store c60  into quad and increment by rs_c.
+	" st1 {v0.s}[1],[x5],x14                     \n\t" // Store c61  into quad and increment by rs_c.
+	" st1 {v0.s}[2],[x5],x14                     \n\t" // Store c62  into quad and increment by rs_c.
+	" st1 {v0.s}[3],[x5],x14                     \n\t" // Store c63  into quad and increment by rs_c.
+	" st1 {v1.s}[0],[x5],x14                     \n\t" // Store c64  into quad and increment by rs_c.
+	" st1 {v1.s}[1],[x5],x14                     \n\t" // Store c65  into quad and increment by rs_c.
+	" st1 {v1.s}[2],[x5],x14                     \n\t" // Store c66  into quad and increment by rs_c.
+	" st1 {v1.s}[3],[x5],x14                     \n\t" // Store c67  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x23                                \n\t"
+	"                                            \n\t"
+	" st1 {v2.s}[0],[x5],x14                     \n\t" // Store c70  into quad and increment by rs_c.
+	" st1 {v2.s}[1],[x5],x14                     \n\t" // Store c71  into quad and increment by rs_c.
+	" st1 {v2.s}[2],[x5],x14                     \n\t" // Store c72  into quad and increment by rs_c.
+	" st1 {v2.s}[3],[x5],x14                     \n\t" // Store c73  into quad and increment by rs_c.
+	" st1 {v3.s}[0],[x5],x14                     \n\t" // Store c74  into quad and increment by rs_c.
+	" st1 {v3.s}[1],[x5],x14                     \n\t" // Store c75  into quad and increment by rs_c.
+	" st1 {v3.s}[2],[x5],x14                     \n\t" // Store c76  into quad and increment by rs_c.
+	" st1 {v3.s}[3],[x5],x14                     \n\t" // Store c77  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x24                                \n\t"
+	"                                            \n\t"
+	" st1 {v4.s}[0],[x5],x14                     \n\t" // Store c80  into quad and increment by rs_c.
+	" st1 {v4.s}[1],[x5],x14                     \n\t" // Store c81  into quad and increment by rs_c.
+	" st1 {v4.s}[2],[x5],x14                     \n\t" // Store c82  into quad and increment by rs_c.
+	" st1 {v4.s}[3],[x5],x14                     \n\t" // Store c83  into quad and increment by rs_c.
+	" st1 {v5.s}[0],[x5],x14                     \n\t" // Store c84  into quad and increment by rs_c.
+	" st1 {v5.s}[1],[x5],x14                     \n\t" // Store c85  into quad and increment by rs_c.
+	" st1 {v5.s}[2],[x5],x14                     \n\t" // Store c86  into quad and increment by rs_c.
+	" st1 {v5.s}[3],[x5],x14                     \n\t" // Store c87  into quad and increment by rs_c.
+	"                                            \n\t"
+	" dup  v8.4s, wzr                            \n\t"
+	" dup  v9.4s, wzr                            \n\t"
+	" dup  v10.4s, wzr                           \n\t"
+	" dup  v11.4s, wzr                           \n\t"
+	" dup  v12.4s, wzr                           \n\t"
+	" dup  v13.4s, wzr                           \n\t"
+	"                                            \n\t"
+	" fcmp s7,#0.0                               \n\t"
+	BEQ(SBETAZEROGENSTOREDS4)                          // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" mov x5, x25                                \n\t"
+	"                                            \n\t"
+	" ld1 {v8.s}[0],[x5],x14                     \n\t" // Load c90  into quad and increment by rs_c.
+	" ld1 {v8.s}[1],[x5],x14                     \n\t" // Load c91  into quad and increment by rs_c.
+	" ld1 {v8.s}[2],[x5],x14                     \n\t" // Load c92  into quad and increment by rs_c.
+	" ld1 {v8.s}[3],[x5],x14                     \n\t" // Load c93  into quad and increment by rs_c.
+	" ld1 {v9.s}[0],[x5],x14                     \n\t" // Load c94  into quad and increment by rs_c.
+	" ld1 {v9.s}[1],[x5],x14                     \n\t" // Load c95  into quad and increment by rs_c.
+	" ld1 {v9.s}[2],[x5],x14                     \n\t" // Load c96  into quad and increment by rs_c.
+	" ld1 {v9.s}[3],[x5],x14                     \n\t" // Load c97  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x26                                \n\t"
+	"                                            \n\t"
+	" ld1 {v10.s}[0],[x5],x14                    \n\t" // Load c100  into quad and increment by rs_c.
+	" ld1 {v10.s}[1],[x5],x14                    \n\t" // Load c101  into quad and increment by rs_c.
+	" ld1 {v10.s}[2],[x5],x14                    \n\t" // Load c102  into quad and increment by rs_c.
+	" ld1 {v10.s}[3],[x5],x14                    \n\t" // Load c103  into quad and increment by rs_c.
+	" ld1 {v11.s}[0],[x5],x14                    \n\t" // Load c104  into quad and increment by rs_c.
+	" ld1 {v11.s}[1],[x5],x14                    \n\t" // Load c105  into quad and increment by rs_c.
+	" ld1 {v11.s}[2],[x5],x14                    \n\t" // Load c106  into quad and increment by rs_c.
+	" ld1 {v11.s}[3],[x5],x14                    \n\t" // Load c107  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x27                                \n\t"
+	"                                            \n\t"
+	" ld1 {v12.s}[0],[x5],x14                    \n\t" // Load c110  into quad and increment by rs_c.
+	" ld1 {v12.s}[1],[x5],x14                    \n\t" // Load c111  into quad and increment by rs_c.
+	" ld1 {v12.s}[2],[x5],x14                    \n\t" // Load c112  into quad and increment by rs_c.
+	" ld1 {v12.s}[3],[x5],x14                    \n\t" // Load c113  into quad and increment by rs_c.
+	" ld1 {v13.s}[0],[x5],x14                    \n\t" // Load c114  into quad and increment by rs_c.
+	" ld1 {v13.s}[1],[x5],x14                    \n\t" // Load c115  into quad and increment by rs_c.
+	" ld1 {v13.s}[2],[x5],x14                    \n\t" // Load c116  into quad and increment by rs_c.
+	" ld1 {v13.s}[3],[x5],x14                    \n\t" // Load c117  into quad and increment by rs_c.
+	"                                            \n\t"
+	" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
+	" fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
+	" fmul v10.4s,v10.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v11.4s,v11.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL(SBETAZEROGENSTOREDS4)
+	"                                            \n\t"
+	" prfm pldl2keep,[x0]                        \n\t"
+	" prfm pldl2keep,[x1]                        \n\t"
+	"                                            \n\t"
+	" fmla v8.4s, v26.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v9.4s, v27.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v10.4s,v28.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v11.4s,v29.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v12.4s,v30.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v13.4s,v31.4s,v6.s[0]                 \n\t" // Scale by alpha
+	"                                            \n\t"
+	" mov x5, x25                                \n\t"
+	"                                            \n\t"
+	" st1 {v8.s}[0],[x5],x14                     \n\t" // Store c90  into quad and increment by rs_c.
+	" st1 {v8.s}[1],[x5],x14                     \n\t" // Store c91  into quad and increment by rs_c.
+	" st1 {v8.s}[2],[x5],x14                     \n\t" // Store c92  into quad and increment by rs_c.
+	" st1 {v8.s}[3],[x5],x14                     \n\t" // Store c93  into quad and increment by rs_c.
+	" st1 {v9.s}[0],[x5],x14                     \n\t" // Store c94  into quad and increment by rs_c.
+	" st1 {v9.s}[1],[x5],x14                     \n\t" // Store c95  into quad and increment by rs_c.
+	" st1 {v9.s}[2],[x5],x14                     \n\t" // Store c96  into quad and increment by rs_c.
+	" st1 {v9.s}[3],[x5],x14                     \n\t" // Store c97  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x26                                \n\t"
+	"                                            \n\t"
+	" st1 {v10.s}[0],[x5],x14                    \n\t" // Store c100  into quad and increment by rs_c.
+	" st1 {v10.s}[1],[x5],x14                    \n\t" // Store c101  into quad and increment by rs_c.
+	" st1 {v10.s}[2],[x5],x14                    \n\t" // Store c102  into quad and increment by rs_c.
+	" st1 {v10.s}[3],[x5],x14                    \n\t" // Store c103  into quad and increment by rs_c.
+	" st1 {v11.s}[0],[x5],x14                    \n\t" // Store c104  into quad and increment by rs_c.
+	" st1 {v11.s}[1],[x5],x14                    \n\t" // Store c105  into quad and increment by rs_c.
+	" st1 {v11.s}[2],[x5],x14                    \n\t" // Store c106  into quad and increment by rs_c.
+	" st1 {v11.s}[3],[x5],x14                    \n\t" // Store c107  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x27                                \n\t"
+	"                                            \n\t"
+	" st1 {v12.s}[0],[x5],x14                    \n\t" // Store c110  into quad and increment by rs_c.
+	" st1 {v12.s}[1],[x5],x14                    \n\t" // Store c111  into quad and increment by rs_c.
+	" st1 {v12.s}[2],[x5],x14                    \n\t" // Store c112  into quad and increment by rs_c.
+	" st1 {v12.s}[3],[x5],x14                    \n\t" // Store c113  into quad and increment by rs_c.
+	" st1 {v13.s}[0],[x5],x14                    \n\t" // Store c114  into quad and increment by rs_c.
+	" st1 {v13.s}[1],[x5],x14                    \n\t" // Store c115  into quad and increment by rs_c.
+	" st1 {v13.s}[2],[x5],x14                    \n\t" // Store c116  into quad and increment by rs_c.
+	" st1 {v13.s}[3],[x5],x14                     \n\t" // Store c117  into quad and increment by rs_c.
+	"                                            \n\t"
+	LABEL(SEND)                                        // Done!
+	"                                            \n\t"
+	:// output operands (none)
+	:// input operands
+	 [aaddr]  "m" (a),      // 0
+	 [baddr]  "m" (b),      // 1
+	 [caddr]  "m" (c),      // 2
+	 [k_iter] "m" (k_iter), // 3
+	 [k_left] "m" (k_left), // 4
+	 [alpha]  "m" (alpha),  // 5
+	 [beta]   "m" (beta),   // 6
+	 [rs_c]   "m" (rs_c),   // 7
+	 [cs_c]   "m" (cs_c),   // 8
+	 [a_next] "m" (a_next), // 9
+	 [b_next] "m" (b_next) // 10
+	:// Register clobber list
+	 "x0", "x1", "x2",
+	 "x5", "x6", "x10","x14",
+	 "x16","x17","x19","x20",
+	 "x21","x22","x23","x24",
+	 "x25","x26","x27",
+	 "v0", "v1", "v2", "v3",
+	 "v4", "v5", "v6", "v7",
+	 "v8", "v9", "v10","v11",
+	 "v12","v13","v14","v15",
+	 "v16","v17","v18","v19",
+	 "v20","v21","v22","v23",
+	 "v24","v25","v26","v27",
+	 "v28","v29","v30","v31"
+	);
+
+	GEMM_UKR_FLUSH_CT( s );
 }
 
 
@@ -1089,24 +1094,26 @@ LABEL(SEND)                                        // Done!
    o 4x4 Double precision micro-kernel NOT fully functional yet.
    o Runnable on ARMv8, compiled with aarch64 GCC.
    o Use it together with the armv8 BLIS configuration.
-   o Tested on Juno board. Around 3 GFLOPS @ 1.1 GHz. 
+   o Tested on Juno board. Around 3 GFLOPS @ 1.1 GHz.
 
    December 2014.
-  
+
  * UPDATE OCTOBER 2015: Now is fully functional.
  * Tested on Juno board. Around 5.6 GFLOPS, 2 A57 cores @ 1.1 GHz.
  * Tested on Juno board. Around 4 GFLOPS, 4 A53 cores @ 850 MHz.
- 
+
  * UPDATE NOVEMBER 2015
  * Micro-kernel changed to 6x8
  * Tested on Juno Board. Around 4   GFLOPS, 1 x A57 core  @ 1.1 GHz.
  * Tested on Juno Board. Around 7.6 GFLOPS, 2 x A57 cores @ 1.1 GHz.
- * Tested on Juno board. Around 1.5 GFLOPS, 1 x A53 core  @ 850 MHz. 
+ * Tested on Juno board. Around 1.5 GFLOPS, 1 x A53 core  @ 850 MHz.
  * Tested on Juno board. Around 5.5 GFLOPS, 4 x A53 cores @ 850 MHz.
 */
 void bli_dgemm_armv8a_asm_6x8
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        double*    restrict alpha,
        double*    restrict a,
        double*    restrict b,
@@ -1121,966 +1128,969 @@ void bli_dgemm_armv8a_asm_6x8
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
-__asm__ volatile
-(
-"                                            \n\t" 
-" ldr x0,%[aaddr]                            \n\t" // Load address of A 
-" ldr x1,%[baddr]                            \n\t" // Load address of B
-" ldr x2,%[caddr]                            \n\t" // Load address of C
-"                                            \n\t"
-" ldr x5,%[k_iter]                           \n\t" // Init guard (k_iter)
-" ldr x6,%[k_left]                           \n\t" // Init guard (k_iter)
-"                                            \n\t" 
-" ldr x10,%[cs_c]                            \n\t" // Load cs_c
-" lsl x10,x10,#3                             \n\t" // cs_c * sizeof(double)
-"                                            \n\t"
-" ldr x14,%[rs_c]                            \n\t" // Load rs_c.
-" lsl x14,x14,#3                             \n\t" // rs_c * sizeof(double). 
-"                                            \n\t"
-" add x20,x2,x10                             \n\t" //Load address Column 1 of C
-" add x21,x20,x10                            \n\t" //Load address Column 2 of C
-" add x22,x21,x10                            \n\t" //Load address Column 3 of C
-" add x23,x22,x10                            \n\t" //Load address Column 4 of C
-" add x24,x23,x10                            \n\t" //Load address Column 5 of C
-" add x25,x24,x10                            \n\t" //Load address Column 6 of C
-" add x26,x25,x10                            \n\t" //Load address Column 7 of C
-"                                            \n\t"
-" prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
-" prfm pldl1keep,[x20]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x21]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x22]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x23]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x24]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
-"                                            \n\t"
-" dup  v8.2d, xzr                            \n\t" // Vector for accummulating column 0
-" prfm    PLDL1KEEP, [x1, #256]              \n\t" 
-" dup  v9.2d, xzr                            \n\t" // Vector for accummulating column 0
-" prfm    PLDL1KEEP, [x1, #320]              \n\t"
-" dup  v10.2d, xzr                           \n\t" // Vector for accummulating column 0
-" prfm    PLDL1KEEP, [x1, #384]              \n\t"
-" dup  v11.2d, xzr                           \n\t" // Vector for accummulating column 1
-" prfm    PLDL1KEEP, [x1, #448]              \n\t"
-" dup  v12.2d, xzr                           \n\t" // Vector for accummulating column 1 
-" dup  v13.2d, xzr                           \n\t" // Vector for accummulating column 1
-"                                            \n\t"
-" dup  v14.2d, xzr                           \n\t" // Vector for accummulating column 2
-" prfm    PLDL1KEEP, [x0, #192]              \n\t"
-" dup  v15.2d, xzr                           \n\t" // Vector for accummulating column 2
-" prfm    PLDL1KEEP, [x0, #256]              \n\t"
-" dup  v16.2d, xzr                           \n\t" // Vector for accummulating column 2
-" prfm    PLDL1KEEP, [x0, #320]              \n\t"
-" dup  v17.2d, xzr                           \n\t" // Vector for accummulating column 3
-" dup  v18.2d, xzr                           \n\t" // Vector for accummulating column 3 
-" dup  v19.2d, xzr                           \n\t" // Vector for accummulating column 3
-"                                            \n\t"
-" dup  v20.2d, xzr                           \n\t" // Vector for accummulating column 4 
-" dup  v21.2d, xzr                           \n\t" // Vector for accummulating column 4
-" dup  v22.2d, xzr                           \n\t" // Vector for accummulating column 4
-" dup  v23.2d, xzr                           \n\t" // Vector for accummulating column 5
-" dup  v24.2d, xzr                           \n\t" // Vector for accummulating column 5 
-" dup  v25.2d, xzr                           \n\t" // Vector for accummulating column 5
-"                                            \n\t"
-" dup  v26.2d, xzr                           \n\t" // Vector for accummulating column 6 
-" dup  v27.2d, xzr                           \n\t" // Vector for accummulating column 6
-" dup  v28.2d, xzr                           \n\t" // Vector for accummulating column 6
-" dup  v29.2d, xzr                           \n\t" // Vector for accummulating column 7
-" dup  v30.2d, xzr                           \n\t" // Vector for accummulating column 7 
-" dup  v31.2d, xzr                           \n\t" // Vector for accummulating column 7
-"                                            \n\t"
-"                                            \n\t"
-" cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
-BEQ(DCONSIDERKLEFT)
-"                                            \n\t"
-" ldr q0, [x0]                               \n\t" // Load a
-" ldr q1, [x0, #16]                          \n\t"
-" ldr q2, [x0, #32]                          \n\t"
-"                                            \n\t"
-" ldr q3, [x1]                               \n\t" // Load b
-" ldr q4, [x1, #16]                          \n\t"
-" ldr q5, [x1, #32]                          \n\t"
-" ldr q6, [x1, #48]                          \n\t"
-"                                            \n\t"
-" add x0, x0, #48                            \n\t" //update address of A
-" add x1, x1, #64                            \n\t" //update address of B
-"                                            \n\t"
-" cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one. 
-BEQ(DLASTITER)                                     // (as loop is do-while-like).
-"                                            \n\t"
-LABEL(DLOOP)                                       // Body
-"                                            \n\t"
-" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
-" prfm    PLDL1KEEP, [x1, #448]              \n\t" //512-64=448
-" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
-" prfm    PLDL1KEEP, [x1, #512]              \n\t"
-" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
-" prfm    PLDL1KEEP, [x1, #576]              \n\t"
-"                                            \n\t"
-" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v13.2d,v2.2d,v3.d[1]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v16.2d,v2.2d,v4.d[0]                  \n\t" // Accummulate
-" ldr q3, [x1]                               \n\t"
-"                                            \n\t"
-" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v19.2d,v2.2d,v4.d[1]                  \n\t" // Accummulate
-" ldr q7, [x0, #32]                          \n\t"
-"                                            \n\t"
-" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v22.2d,v2.2d,v5.d[0]                  \n\t" // Accummulate
-" ldr q4, [x1, #16]                          \n\t"
-"                                            \n\t"
-" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v25.2d,v2.2d,v5.d[1]                  \n\t" // Accummulate
-" ldr q5, [x1, #32]                          \n\t"
-"                                            \n\t"
-" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q0, [x0]                               \n\t"
-"                                            \n\t"
-" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q1, [x0, #16]                          \n\t"
-"                                            \n\t"
-" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q6, [x1, #48]                          \n\t"
-"                                            \n\t"                  // End it 1
-" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
-" prfm    PLDL1KEEP, [x1, #640]              \n\t"
-" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
-" prfm    PLDL1KEEP, [x0, #336]              \n\t"
-" fmla v10.2d,v7.2d,v3.d[0]                  \n\t" // Accummulate
-" prfm    PLDL1KEEP, [x0, #400]              \n\t"
-"                                            \n\t"
-" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v13.2d,v7.2d,v3.d[1]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v16.2d,v7.2d,v4.d[0]                  \n\t" // Accummulate
-" ldr q3, [x1, #64]                          \n\t"
-"                                            \n\t"
-" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v19.2d,v7.2d,v4.d[1]                  \n\t" // Accummulate
-" ldr q2, [x0, #80]                          \n\t"
-"                                            \n\t"
-" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v22.2d,v7.2d,v5.d[0]                  \n\t" // Accummulate
-" ldr q4, [x1, #80]                          \n\t"
-"                                            \n\t"
-" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v25.2d,v7.2d,v5.d[1]                  \n\t" // Accummulate
-" ldr q5, [x1, #96]                          \n\t"
-"                                            \n\t"
-" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q0, [x0, #48]                          \n\t"
-"                                            \n\t"
-" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q1, [x0, #64]                          \n\t"
-"                                            \n\t"
-" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q6, [x1, #112]                         \n\t"
-"                                            \n\t"                  //End it 2
-" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
-" prfm    PLDL1KEEP, [x0, #464]              \n\t"
-" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
-" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v13.2d,v2.2d,v3.d[1]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v16.2d,v2.2d,v4.d[0]                  \n\t" // Accummulate
-" ldr q3, [x1, #128]                         \n\t"
-"                                            \n\t"
-" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v19.2d,v2.2d,v4.d[1]                  \n\t" // Accummulate
-" ldr q7, [x0, #128]                         \n\t"
-"                                            \n\t"
-" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v22.2d,v2.2d,v5.d[0]                  \n\t" // Accummulate
-" ldr q4, [x1, #144]                         \n\t"
-"                                            \n\t"
-" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v25.2d,v2.2d,v5.d[1]                  \n\t" // Accummulate
-" ldr q5, [x1, #160]                         \n\t"
-"                                            \n\t"
-" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q0, [x0, #96]                          \n\t"
-"                                            \n\t"
-" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q1, [x0, #112]                         \n\t"
-"                                            \n\t"
-" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q6, [x1, #176]                         \n\t"
-"                                            \n\t"                  // End it 3
-" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
-" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
-" fmla v10.2d,v7.2d,v3.d[0]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v13.2d,v7.2d,v3.d[1]                  \n\t" // Accummulate
-" ldr q3, [x1, #192]                         \n\t"
-"                                            \n\t"
-" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v16.2d,v7.2d,v4.d[0]                  \n\t" // Accummulate
-" ldr q2, [x0, #176]                         \n\t"
-"                                            \n\t"
-" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v19.2d,v7.2d,v4.d[1]                  \n\t" // Accummulate
-" ldr q4, [x1, #208]                         \n\t"
-"                                            \n\t"
-" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v22.2d,v7.2d,v5.d[0]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v25.2d,v7.2d,v5.d[1]                  \n\t" // Accummulate
-" ldr q5, [x1, #224]                         \n\t"
-"                                            \n\t"
-" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q0, [x0, #144]                         \n\t"
-"                                            \n\t"
-" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q1, [x0, #160]                         \n\t"
-"                                            \n\t"
-" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q6, [x1, #240]                         \n\t"
-"                                            \n\t"                  //End it 4
-" add x0, x0, #192                           \n\t"
-" add x1, x1, #256                           \n\t"
-"                                            \n\t"
-" sub x5,x5,1                                \n\t" // i-=1
-" cmp x5,1                                   \n\t" // Iterate again if we are not in k_iter == 1.
-BNE(DLOOP)
-"                                            \n\t"
-LABEL(DLASTITER)
-"                                            \n\t"
-" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
-" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
-" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v13.2d,v2.2d,v3.d[1]                  \n\t" // Accummulate
-" ldr q3, [x1]                               \n\t"
-"                                            \n\t"
-" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v16.2d,v2.2d,v4.d[0]                  \n\t" // Accummulate
-" ldr q7, [x0, #32]                          \n\t"
-"                                            \n\t"
-" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v19.2d,v2.2d,v4.d[1]                  \n\t" // Accummulate
-" ldr q4, [x1, #16]                          \n\t"
-"                                            \n\t"
-" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v22.2d,v2.2d,v5.d[0]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v25.2d,v2.2d,v5.d[1]                  \n\t" // Accummulate
-" ldr q5, [x1, #32]                          \n\t"
-"                                            \n\t"
-" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q0, [x0]                               \n\t"
-"                                            \n\t"
-" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q1, [x0, #16]                          \n\t"
-"                                            \n\t"
-" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q6, [x1, #48]                          \n\t"
-"                                            \n\t"                  // End it 1
-" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
-" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
-" fmla v10.2d,v7.2d,v3.d[0]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v13.2d,v7.2d,v3.d[1]                  \n\t" // Accummulate
-" ldr q3, [x1, #64]                          \n\t"
-"                                            \n\t"
-" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v16.2d,v7.2d,v4.d[0]                  \n\t" // Accummulate
-" ldr q2, [x0, #80]                          \n\t"
-"                                            \n\t"
-" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v19.2d,v7.2d,v4.d[1]                  \n\t" // Accummulate
-" ldr q4, [x1, #80]                          \n\t"
-"                                            \n\t"
-" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v22.2d,v7.2d,v5.d[0]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v25.2d,v7.2d,v5.d[1]                  \n\t" // Accummulate
-" ldr q5, [x1, #96]                          \n\t"
-"                                            \n\t"
-" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q0, [x0, #48]                          \n\t"
-"                                            \n\t"
-" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q1, [x0, #64]                          \n\t"
-"                                            \n\t"
-" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q6, [x1, #112]                         \n\t"
-"                                            \n\t"                  //End it 2
-" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
-" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
-" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v13.2d,v2.2d,v3.d[1]                  \n\t" // Accummulate
-" ldr q3, [x1, #128]                         \n\t"
-"                                            \n\t"
-" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v16.2d,v2.2d,v4.d[0]                  \n\t" // Accummulate
-" ldr q7, [x0, #128]                         \n\t"
-"                                            \n\t"
-" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v19.2d,v2.2d,v4.d[1]                  \n\t" // Accummulate
-" ldr q4, [x1, #144]                         \n\t"
-"                                            \n\t"
-" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v22.2d,v2.2d,v5.d[0]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v25.2d,v2.2d,v5.d[1]                  \n\t" // Accummulate
-" ldr q5, [x1, #160]                         \n\t"
-"                                            \n\t"
-" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q0, [x0, #96]                          \n\t"
-"                                            \n\t"
-" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q1, [x0, #112]                         \n\t"
-"                                            \n\t"
-" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
-" ldr q6, [x1, #176]                         \n\t"
-"                                            \n\t"                  // End it 3
-" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
-" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
-" fmla v10.2d,v7.2d,v3.d[0]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v13.2d,v7.2d,v3.d[1]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v16.2d,v7.2d,v4.d[0]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v19.2d,v7.2d,v4.d[1]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v22.2d,v7.2d,v5.d[0]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v25.2d,v7.2d,v5.d[1]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
-" add x1, x1, #192                           \n\t"
-" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
-"                                            \n\t"                  //End it 4
-" add x0, x0, #144                           \n\t"
-"                                            \n\t"
-LABEL(DCONSIDERKLEFT)
-" cmp x6,0                                   \n\t" // If k_left == 0, we are done.
-BEQ(DPOSTACCUM)                                    // else, we enter the k_left loop.
-"                                            \n\t"
-LABEL(DLOOPKLEFT)
-"                                            \n\t"
-" ldr q0, [x0],#16                           \n\t"
-" ldr q1, [x0],#16                           \n\t" // Load a
-" ldr q2, [x0],#16                           \n\t"
-"                                            \n\t"
-" ldr q3, [x1],#16                           \n\t" // Load b
-" ldr q4, [x1],#16                           \n\t"
-" ldr q5, [x1],#16                           \n\t"
-" ldr q6, [x1],#16                           \n\t"
-"                                            \n\t"
-" sub x6,x6,1                                \n\t"
-"                                            \n\t"
-" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
-" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
-" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
-" fmla v13.2d,v2.2d,v3.d[1]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v16.2d,v2.2d,v4.d[0]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
-" fmla v19.2d,v2.2d,v4.d[1]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v22.2d,v2.2d,v5.d[0]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
-" fmla v25.2d,v2.2d,v5.d[1]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
-"                                            \n\t"
-" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
-"                                            \n\t"
-" cmp x6,0                                   \n\t" // Iterate again.
-BNE(DLOOPKLEFT)                                    // if i!=0.
-"                                            \n\t"
-LABEL(DPOSTACCUM)
-"                                            \n\t"
-" ldr x0,%[alpha]                            \n\t" // Alpha address      
-" ldr x1,%[beta]                             \n\t" // Beta address      
-"                                            \n\t" 
-" ld1r {v6.2d},[x0]                          \n\t" // Load alpha.
-" ld1r {v7.2d},[x1]                          \n\t" // Load beta
-"                                            \n\t"
-" ldr x0,%[a_next]                           \n\t" // Next A address for later use.
-" ldr x1,%[b_next]                           \n\t" // Next B address for later use.
-"                                            \n\t"
-" cmp x14,#8                                 \n\t" // If rs_c != 1 (column-major)
-BNE(DGENSTORED)
-"                                            \n\t"
-LABEL(DCOLSTORED)                                  // C is column-major.
-"                                            \n\t"
-" dup  v0.2d, xzr                            \n\t"
-" dup  v1.2d, xzr                            \n\t"
-" dup  v2.2d, xzr                            \n\t"
-" dup  v3.2d, xzr                            \n\t"
-" dup  v4.2d, xzr                            \n\t"
-" dup  v5.2d, xzr                            \n\t"
-"                                            \n\t"
-" fcmp d7,#0.0                               \n\t"
-BEQ(DBETAZEROCOLSTOREDS1)                          // Taking care of the beta==0 case.
-"                                            \n\t"
-" ldr q0, [x2]                               \n\t" //Load column 0 of C
-" ldr q1, [x2, #16]                          \n\t"
-" ldr q2, [x2, #32]                          \n\t"
-"                                            \n\t"
-" ldr q3, [x20]                              \n\t" //Load column 1 of C
-" ldr q4, [x20, #16]                         \n\t"
-" ldr q5, [x20, #32]                         \n\t"
-"                                            \n\t"
-" fmul v0.2d,v0.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v1.2d,v1.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v2.2d,v2.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v3.2d,v3.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
-"                                            \n\t"
-LABEL(DBETAZEROCOLSTOREDS1)
-"                                            \n\t"
-" fmla v0.2d,v8.2d,v6.d[0]                   \n\t" // Scale by alpha
-" fmla v1.2d,v9.2d,v6.d[0]                   \n\t" // Scale by alpha
-" fmla v2.2d,v10.2d,v6.d[0]                  \n\t" // Scale by alpha
-" fmla v3.2d,v11.2d,v6.d[0]                  \n\t" // Scale by alpha
-" fmla v4.2d,v12.2d,v6.d[0]                  \n\t" // Scale by alpha
-" fmla v5.2d,v13.2d,v6.d[0]                  \n\t" // Scale by alpha
-"                                            \n\t"
-" str q0, [x2]                               \n\t" //Store column 0 of C
-" str q1, [x2, #16]                          \n\t"
-" str q2, [x2, #32]                          \n\t"
-"                                            \n\t"
-" str q3, [x20]                              \n\t" //Store column 1 of C
-" str q4, [x20, #16]                         \n\t"
-" str q5, [x20, #32]                         \n\t"
-"                                            \n\t"
-" dup  v8.2d, xzr                            \n\t"
-" dup  v9.2d, xzr                            \n\t"
-" dup  v10.2d, xzr                           \n\t"
-" dup  v11.2d, xzr                           \n\t"
-" dup  v12.2d, xzr                           \n\t"
-" dup  v13.2d, xzr                           \n\t"
-"                                            \n\t"
-" fcmp d7,#0.0                               \n\t"
-BEQ(DBETAZEROCOLSTOREDS2)                          // Taking care of the beta==0 case.
-"                                            \n\t"
-" ldr q8, [x21]                              \n\t" //Load column 2 of C
-" ldr q9, [x21, #16]                         \n\t"
-" ldr q10, [x21, #32]                        \n\t"
-"                                            \n\t"
-" ldr q11, [x22]                             \n\t" //Load column 3 of C
-" ldr q12, [x22, #16]                        \n\t"
-" ldr q13, [x22, #32]                        \n\t"
-"                                            \n\t"
-" fmul v8.2d, v8.2d, v7.d[0]                 \n\t" // Scale by beta
-" fmul v9.2d, v9.2d, v7.d[0]                 \n\t" // Scale by beta
-" fmul v10.2d,v10.2d,v7.d[0]                 \n\t" // Scale by beta
-" fmul v11.2d,v11.2d,v7.d[0]                 \n\t" // Scale by beta
-" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
-" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
-"                                            \n\t"
-LABEL(DBETAZEROCOLSTOREDS2)
-"                                            \n\t"
-" fmla v8.2d, v14.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v9.2d, v15.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v10.2d,v16.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v11.2d,v17.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v12.2d,v18.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v13.2d,v19.2d,v6.d[0]                 \n\t" // Scale by alpha
-"                                            \n\t"
-" str q8, [x21]                              \n\t" //Store column 2 of C
-" str q9, [x21, #16]                         \n\t"
-" str q10, [x21, #32]                        \n\t"
-"                                            \n\t"
-" str q11, [x22]                             \n\t" //Store column 3 of C
-" str q12, [x22, #16]                        \n\t"
-" str q13, [x22, #32]                        \n\t"
-"                                            \n\t"
-" dup  v0.2d, xzr                            \n\t"
-" dup  v1.2d, xzr                            \n\t"
-" dup  v2.2d, xzr                            \n\t"
-" dup  v3.2d, xzr                            \n\t"
-" dup  v4.2d, xzr                            \n\t"
-" dup  v5.2d, xzr                            \n\t"
-"                                            \n\t"
-" fcmp d7,#0.0                               \n\t"
-BEQ(DBETAZEROCOLSTOREDS3)                          // Taking care of the beta==0 case.
-"                                            \n\t"
-" ldr q0, [x23]                              \n\t" //Load column 4 of C
-" ldr q1, [x23, #16]                         \n\t"
-" ldr q2, [x23, #32]                         \n\t"
-"                                            \n\t"
-" ldr q3, [x24]                              \n\t" //Load column 5 of C
-" ldr q4, [x24, #16]                         \n\t"
-" ldr q5, [x24, #32]                         \n\t"
-"                                            \n\t"
-" fmul v0.2d,v0.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v1.2d,v1.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v2.2d,v2.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v3.2d,v3.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
-"                                            \n\t"
-LABEL(DBETAZEROCOLSTOREDS3)
-"                                            \n\t"
-" fmla v0.2d,v20.2d,v6.d[0]                  \n\t" // Scale by alpha
-" fmla v1.2d,v21.2d,v6.d[0]                  \n\t" // Scale by alpha
-" fmla v2.2d,v22.2d,v6.d[0]                  \n\t" // Scale by alpha
-" fmla v3.2d,v23.2d,v6.d[0]                  \n\t" // Scale by alpha
-" fmla v4.2d,v24.2d,v6.d[0]                  \n\t" // Scale by alpha
-" fmla v5.2d,v25.2d,v6.d[0]                  \n\t" // Scale by alpha
-"                                            \n\t"
-" str q0, [x23]                              \n\t" //Store column 4 of C
-" str q1, [x23, #16]                         \n\t"
-" str q2, [x23, #32]                         \n\t"
-"                                            \n\t"
-" str q3, [x24]                              \n\t" //Store column 5 of C
-" str q4, [x24, #16]                         \n\t"
-" str q5, [x24, #32]                         \n\t"
-"                                            \n\t"
-" dup  v8.2d, xzr                            \n\t"
-" dup  v9.2d, xzr                            \n\t"
-" dup  v10.2d, xzr                           \n\t"
-" dup  v11.2d, xzr                           \n\t"
-" dup  v12.2d, xzr                           \n\t"
-" dup  v13.2d, xzr                           \n\t"
-"                                            \n\t"
-" fcmp d7,#0.0                               \n\t"
-BEQ(DBETAZEROCOLSTOREDS4)                          // Taking care of the beta==0 case.
-"                                            \n\t"
-" ldr q8, [x25]                              \n\t" //Load column 6 of C
-" ldr q9, [x25, #16]                         \n\t"
-" ldr q10, [x25, #32]                        \n\t"
-"                                            \n\t"
-" ldr q11, [x26]                             \n\t" //Load column 7 of C
-" ldr q12, [x26, #16]                        \n\t"
-" ldr q13, [x26, #32]                        \n\t"
-"                                            \n\t"
-" fmul v8.2d, v8.2d, v7.d[0]                 \n\t" // Scale by beta
-" fmul v9.2d, v9.2d, v7.d[0]                 \n\t" // Scale by beta
-" fmul v10.2d,v10.2d,v7.d[0]                 \n\t" // Scale by beta
-" fmul v11.2d,v11.2d,v7.d[0]                 \n\t" // Scale by beta
-" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
-" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
-"                                            \n\t"
-LABEL(DBETAZEROCOLSTOREDS4)
-"                                            \n\t"
-" prfm pldl2keep,[x0]                        \n\t"
-" prfm pldl2keep,[x1]                        \n\t"
-"                                            \n\t"
-" fmla v8.2d, v26.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v9.2d, v27.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v10.2d,v28.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v11.2d,v29.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v12.2d,v30.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v13.2d,v31.2d,v6.d[0]                 \n\t" // Scale by alpha
-"                                            \n\t"
-" str q8, [x25]                              \n\t" //Store column 6 of C
-" str q9, [x25, #16]                         \n\t"
-" str q10, [x25, #32]                        \n\t"
-"                                            \n\t"
-" str q11, [x26]                             \n\t" //Store column 7 of C
-" str q12, [x26, #16]                        \n\t"
-" str q13, [x26, #32]                        \n\t"
-"                                            \n\t"
-BRANCH(DEND)
-"                                            \n\t"
-LABEL(DGENSTORED)                                  // C is general-stride stored.
-"                                            \n\t"
-" dup  v0.2d, xzr                            \n\t"
-" dup  v1.2d, xzr                            \n\t"
-" dup  v2.2d, xzr                            \n\t"
-" dup  v3.2d, xzr                            \n\t"
-" dup  v4.2d, xzr                            \n\t"
-" dup  v5.2d, xzr                            \n\t"
-"                                            \n\t"
-" fcmp d7,#0.0                               \n\t"
-BEQ(DBETAZEROGENSTOREDS1)                          // Taking care of the beta==0 case.
-"                                            \n\t"
-" mov x27, x2                                \n\t"
-"                                            \n\t" // Load address of C.
-" ld1 {v0.d}[0],[x27],x14                    \n\t" // Load c00  into quad and increment by rs_c.
-" ld1 {v0.d}[1],[x27],x14                    \n\t" // Load c01  into quad and increment by rs_c.
-" ld1 {v1.d}[0],[x27],x14                    \n\t" // Load c02  into quad and increment by rs_c.
-" ld1 {v1.d}[1],[x27],x14                    \n\t" // Load c03  into quad and increment by rs_c.
-" ld1 {v2.d}[0],[x27],x14                    \n\t" // Load c04  into quad and increment by rs_c.
-" ld1 {v2.d}[1],[x27],x14                    \n\t" // Load c05  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x27, x20                               \n\t" // Load address of C.
-"                                            \n\t"
-" ld1 {v3.d}[0],[x27],x14                    \n\t" // Load c10  into quad and increment by rs_c.
-" ld1 {v3.d}[1],[x27],x14                    \n\t" // Load c11  into quad and increment by rs_c.
-" ld1 {v4.d}[0],[x27],x14                    \n\t" // Load c12  into quad and increment by rs_c.
-" ld1 {v4.d}[1],[x27],x14                    \n\t" // Load c13  into quad and increment by rs_c.
-" ld1 {v5.d}[0],[x27],x14                    \n\t" // Load c14  into quad and increment by rs_c.
-" ld1 {v5.d}[1],[x27],x14                    \n\t" // Load c15  into quad and increment by rs_c.
-"                                            \n\t"
-" fmul v0.2d,v0.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v1.2d,v1.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v2.2d,v2.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v3.2d,v3.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
-"                                            \n\t"
-LABEL(DBETAZEROGENSTOREDS1)
-"                                            \n\t"
-" fmla v0.2d,v8.2d,v6.d[0]                   \n\t" // Scale by alpha
-" fmla v1.2d,v9.2d,v6.d[0]                   \n\t" // Scale by alpha
-" fmla v2.2d,v10.2d,v6.d[0]                  \n\t" // Scale by alpha
-" fmla v3.2d,v11.2d,v6.d[0]                  \n\t" // Scale by alpha
-" fmla v4.2d,v12.2d,v6.d[0]                  \n\t" // Scale by alpha
-" fmla v5.2d,v13.2d,v6.d[0]                  \n\t" // Scale by alpha
-"                                            \n\t"
-" mov x27, x2                                \n\t" // Load address of C.
-"                                            \n\t"
-" st1 {v0.d}[0],[x27],x14                    \n\t" // Store c00  into quad and increment by rs_c.
-" st1 {v0.d}[1],[x27],x14                    \n\t" // Store c01  into quad and increment by rs_c.
-" st1 {v1.d}[0],[x27],x14                    \n\t" // Store c02  into quad and increment by rs_c.
-" st1 {v1.d}[1],[x27],x14                    \n\t" // Store c03  into quad and increment by rs_c.
-" st1 {v2.d}[0],[x27],x14                    \n\t" // Store c04  into quad and increment by rs_c.
-" st1 {v2.d}[1],[x27],x14                    \n\t" // Store c05  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x27, x20                               \n\t" // Load address of C.
-"                                            \n\t"
-" st1 {v3.d}[0],[x27],x14                    \n\t" // Store c10  into quad and increment by rs_c.
-" st1 {v3.d}[1],[x27],x14                    \n\t" // Store c11  into quad and increment by rs_c.
-" st1 {v4.d}[0],[x27],x14                    \n\t" // Store c12  into quad and increment by rs_c.
-" st1 {v4.d}[1],[x27],x14                    \n\t" // Store c13  into quad and increment by rs_c.
-" st1 {v5.d}[0],[x27],x14                    \n\t" // Store c14  into quad and increment by rs_c.
-" st1 {v5.d}[1],[x27],x14                    \n\t" // Store c15  into quad and increment by rs_c.
-"                                            \n\t"
-" dup  v8.2d, xzr                            \n\t"
-" dup  v9.2d, xzr                            \n\t"
-" dup  v10.2d, xzr                           \n\t"
-" dup  v11.2d, xzr                           \n\t"
-" dup  v12.2d, xzr                           \n\t"
-" dup  v13.2d, xzr                           \n\t"
-"                                            \n\t"
-" fcmp d7,#0.0                               \n\t"
-BEQ(DBETAZEROGENSTOREDS2)                          // Taking care of the beta==0 case.
-"                                            \n\t"
-" mov x27, x21                               \n\t" // Load address of C.
-"                                            \n\t"
-" ld1 {v8.d}[0], [x27],x14                   \n\t" // Load c20  into quad and increment by rs_c.
-" ld1 {v8.d}[1], [x27],x14                   \n\t" // Load c21  into quad and increment by rs_c.
-" ld1 {v9.d}[0], [x27],x14                   \n\t" // Load c22  into quad and increment by rs_c.
-" ld1 {v9.d}[1], [x27],x14                   \n\t" // Load c23  into quad and increment by rs_c.
-" ld1 {v10.d}[0],[x27],x14                   \n\t" // Load c24  into quad and increment by rs_c.
-" ld1 {v10.d}[1],[x27],x14                   \n\t" // Load c25  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x27, x22                               \n\t" // Load address of C.
-"                                            \n\t"
-" ld1 {v11.d}[0],[x27],x14                   \n\t" // Load c30  into quad and increment by rs_c.
-" ld1 {v11.d}[1],[x27],x14                   \n\t" // Load c31  into quad and increment by rs_c.
-" ld1 {v12.d}[0],[x27],x14                   \n\t" // Load c32  into quad and increment by rs_c.
-" ld1 {v12.d}[1],[x27],x14                   \n\t" // Load c33  into quad and increment by rs_c.
-" ld1 {v13.d}[0],[x27],x14                   \n\t" // Load c34  into quad and increment by rs_c.
-" ld1 {v13.d}[1],[x27],x14                   \n\t" // Load c35  into quad and increment by rs_c.
-"                                            \n\t"
-" fmul v8.2d, v8.2d, v7.d[0]                 \n\t" // Scale by beta
-" fmul v9.2d, v9.2d, v7.d[0]                 \n\t" // Scale by beta
-" fmul v10.2d,v10.2d,v7.d[0]                 \n\t" // Scale by beta
-" fmul v11.2d,v11.2d,v7.d[0]                 \n\t" // Scale by beta
-" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
-" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
-"                                            \n\t"
-LABEL(DBETAZEROGENSTOREDS2)
-"                                            \n\t"
-" fmla v8.2d, v14.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v9.2d, v15.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v10.2d,v16.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v11.2d,v17.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v12.2d,v18.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v13.2d,v19.2d,v6.d[0]                 \n\t" // Scale by alpha
-"                                            \n\t"
-" mov x27, x21                               \n\t" // Load address of C.
-"                                            \n\t"
-" st1 {v8.d}[0], [x27],x14                   \n\t" // Store c20  into quad and increment by rs_c.
-" st1 {v8.d}[1], [x27],x14                   \n\t" // Store c21  into quad and increment by rs_c.
-" st1 {v9.d}[0], [x27],x14                   \n\t" // Store c22  into quad and increment by rs_c.
-" st1 {v9.d}[1], [x27],x14                   \n\t" // Store c23  into quad and increment by rs_c.
-" st1 {v10.d}[0],[x27],x14                   \n\t" // Store c24  into quad and increment by rs_c.
-" st1 {v10.d}[1],[x27],x14                   \n\t" // Store c25  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x27, x22                               \n\t" // Load address of C.
-"                                            \n\t"
-" st1 {v11.d}[0],[x27],x14                   \n\t" // Store c30  into quad and increment by rs_c.
-" st1 {v11.d}[1],[x27],x14                   \n\t" // Store c31  into quad and increment by rs_c.
-" st1 {v12.d}[0],[x27],x14                   \n\t" // Store c32  into quad and increment by rs_c.
-" st1 {v12.d}[1],[x27],x14                   \n\t" // Store c33  into quad and increment by rs_c.
-" st1 {v13.d}[0],[x27],x14                   \n\t" // Store c34  into quad and increment by rs_c.
-" st1 {v13.d}[1],[x27],x14                   \n\t" // Store c35  into quad and increment by rs_c.
-"                                            \n\t"
-" dup  v0.2d, xzr                            \n\t"
-" dup  v1.2d, xzr                            \n\t"
-" dup  v2.2d, xzr                            \n\t"
-" dup  v3.2d, xzr                            \n\t"
-" dup  v4.2d, xzr                            \n\t"
-" dup  v5.2d, xzr                            \n\t"
-"                                            \n\t"
-" fcmp d7,#0.0                               \n\t"
-BEQ(DBETAZEROGENSTOREDS3)                          // Taking care of the beta==0 case.
-"                                            \n\t"
-" mov x27, x23                               \n\t" // Load address of C.
-"                                            \n\t"
-" ld1 {v0.d}[0],[x27],x14                    \n\t" // Load c40  into quad and increment by rs_c.
-" ld1 {v0.d}[1],[x27],x14                    \n\t" // Load c41  into quad and increment by rs_c.
-" ld1 {v1.d}[0],[x27],x14                    \n\t" // Load c42  into quad and increment by rs_c.
-" ld1 {v1.d}[1],[x27],x14                    \n\t" // Load c43  into quad and increment by rs_c.
-" ld1 {v2.d}[0],[x27],x14                    \n\t" // Load c44  into quad and increment by rs_c.
-" ld1 {v2.d}[1],[x27],x14                    \n\t" // Load c45  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x27, x24                               \n\t" // Load address of C.
-"                                            \n\t"
-" ld1 {v3.d}[0],[x27],x14                    \n\t" // Load c50  into quad and increment by rs_c.
-" ld1 {v3.d}[1],[x27],x14                    \n\t" // Load c51  into quad and increment by rs_c.
-" ld1 {v4.d}[0],[x27],x14                    \n\t" // Load c52  into quad and increment by rs_c.
-" ld1 {v4.d}[1],[x27],x14                    \n\t" // Load c53  into quad and increment by rs_c.
-" ld1 {v5.d}[0],[x27],x14                    \n\t" // Load c54  into quad and increment by rs_c.
-" ld1 {v5.d}[1],[x27],x14                    \n\t" // Load c55  into quad and increment by rs_c.
-"                                            \n\t"
-" fmul v0.2d,v0.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v1.2d,v1.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v2.2d,v2.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v3.2d,v3.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
-" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
-"                                            \n\t"
-LABEL(DBETAZEROGENSTOREDS3)
-"                                            \n\t"
-" fmla v0.2d,v20.2d,v6.d[0]                  \n\t" // Scale by alpha
-" fmla v1.2d,v21.2d,v6.d[0]                  \n\t" // Scale by alpha
-" fmla v2.2d,v22.2d,v6.d[0]                  \n\t" // Scale by alpha
-" fmla v3.2d,v23.2d,v6.d[0]                  \n\t" // Scale by alpha
-" fmla v4.2d,v24.2d,v6.d[0]                  \n\t" // Scale by alpha
-" fmla v5.2d,v25.2d,v6.d[0]                  \n\t" // Scale by alpha
-"                                            \n\t"
-" mov x27, x23                               \n\t" // Load address of C.
-"                                            \n\t"
-" st1 {v0.d}[0],[x27],x14                    \n\t" // Store c40  into quad and increment by rs_c.
-" st1 {v0.d}[1],[x27],x14                    \n\t" // Store c41  into quad and increment by rs_c.
-" st1 {v1.d}[0],[x27],x14                    \n\t" // Store c42  into quad and increment by rs_c.
-" st1 {v1.d}[1],[x27],x14                    \n\t" // Store c43  into quad and increment by rs_c.
-" st1 {v2.d}[0],[x27],x14                    \n\t" // Store c44  into quad and increment by rs_c.
-" st1 {v2.d}[1],[x27],x14                    \n\t" // Store c45  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x27, x24                               \n\t" // Load address of C.
-"                                            \n\t"
-" st1 {v3.d}[0],[x27],x14                    \n\t" // Store c50  into quad and increment by rs_c.
-" st1 {v3.d}[1],[x27],x14                    \n\t" // Store c51  into quad and increment by rs_c.
-" st1 {v4.d}[0],[x27],x14                    \n\t" // Store c52  into quad and increment by rs_c.
-" st1 {v4.d}[1],[x27],x14                    \n\t" // Store c53  into quad and increment by rs_c.
-" st1 {v5.d}[0],[x27],x14                    \n\t" // Store c54  into quad and increment by rs_c.
-" st1 {v5.d}[1],[x27],x14                    \n\t" // Store c55  into quad and increment by rs_c.
-"                                            \n\t"
-" dup  v8.2d, xzr                            \n\t"
-" dup  v9.2d, xzr                            \n\t"
-" dup  v10.2d, xzr                           \n\t"
-" dup  v11.2d, xzr                           \n\t"
-" dup  v12.2d, xzr                           \n\t"
-" dup  v13.2d, xzr                           \n\t"
-"                                            \n\t"
-" fcmp d7,#0.0                               \n\t"
-BEQ(DBETAZEROGENSTOREDS4)                          // Taking care of the beta==0 case.
-"                                            \n\t"
-" mov x27, x25                               \n\t"
-"                                            \n\t"
-" ld1 {v8.d}[0], [x27],x14                   \n\t" // Load c60  into quad and increment by rs_c.
-" ld1 {v8.d}[1], [x27],x14                   \n\t" // Load c61  into quad and increment by rs_c.
-" ld1 {v9.d}[0], [x27],x14                   \n\t" // Load c62  into quad and increment by rs_c.
-" ld1 {v9.d}[1], [x27],x14                   \n\t" // Load c63  into quad and increment by rs_c.
-" ld1 {v10.d}[0],[x27],x14                   \n\t" // Load c64  into quad and increment by rs_c.
-" ld1 {v10.d}[1],[x27],x14                   \n\t" // Load c65  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x27, x26                               \n\t" // Load address of C.
-"                                            \n\t"
-" ld1 {v11.d}[0],[x27],x14                   \n\t" // Load c70  into quad and increment by rs_c.
-" ld1 {v11.d}[1],[x27],x14                   \n\t" // Load c71  into quad and increment by rs_c.
-" ld1 {v12.d}[0],[x27],x14                   \n\t" // Load c72  into quad and increment by rs_c.
-" ld1 {v12.d}[1],[x27],x14                   \n\t" // Load c73  into quad and increment by rs_c.
-" ld1 {v13.d}[0],[x27],x14                   \n\t" // Load c74  into quad and increment by rs_c.
-" ld1 {v13.d}[1],[x27],x14                   \n\t" // Load c75  into quad and increment by rs_c.
-"                                            \n\t"
-" fmul v8.2d, v8.2d, v7.d[0]                 \n\t" // Scale by beta
-" fmul v9.2d, v9.2d, v7.d[0]                 \n\t" // Scale by beta
-" fmul v10.2d,v10.2d,v7.d[0]                 \n\t" // Scale by beta
-" fmul v11.2d,v11.2d,v7.d[0]                 \n\t" // Scale by beta
-" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
-" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
-"                                            \n\t"
-LABEL(DBETAZEROGENSTOREDS4)
-"                                            \n\t"
-" prfm pldl2keep,[x0]                        \n\t"
-" prfm pldl2keep,[x1]                        \n\t"
-"                                            \n\t"
-" fmla v8.2d, v26.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v9.2d, v27.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v10.2d,v28.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v11.2d,v29.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v12.2d,v30.2d,v6.d[0]                 \n\t" // Scale by alpha
-" fmla v13.2d,v31.2d,v6.d[0]                 \n\t" // Scale by alpha
-"                                            \n\t"
-" mov x27, x25                               \n\t" // Load address of C.
-"                                            \n\t"
-" st1 {v8.d}[0], [x27],x14                   \n\t" // Store c60  into quad and increment by rs_c.
-" st1 {v8.d}[1], [x27],x14                   \n\t" // Store c61  into quad and increment by rs_c.
-" st1 {v9.d}[0], [x27],x14                   \n\t" // Store c62  into quad and increment by rs_c.
-" st1 {v9.d}[1], [x27],x14                   \n\t" // Store c63  into quad and increment by rs_c.
-" st1 {v10.d}[0],[x27],x14                   \n\t" // Store c64  into quad and increment by rs_c.
-" st1 {v10.d}[1],[x27],x14                   \n\t" // Store c65  into quad and increment by rs_c.
-"                                            \n\t"
-" mov x27, x26                               \n\t" // Load address of C.
-"                                            \n\t"
-" st1 {v11.d}[0],[x27],x14                   \n\t" // Store c70  into quad and increment by rs_c.
-" st1 {v11.d}[1],[x27],x14                   \n\t" // Store c71  into quad and increment by rs_c.
-" st1 {v12.d}[0],[x27],x14                   \n\t" // Store c72  into quad and increment by rs_c.
-" st1 {v12.d}[1],[x27],x14                   \n\t" // Store c73  into quad and increment by rs_c.
-" st1 {v13.d}[0],[x27],x14                   \n\t" // Store c74  into quad and increment by rs_c.
-" st1 {v13.d}[1],[x27],x14                   \n\t" // Store c75  into quad and increment by rs_c.
-"                                            \n\t"
-LABEL(DEND)                                        // Done!
-"                                            \n\t"
-:// output operands (none)
-:// input operands
- [aaddr]  "m" (a),      // 0
- [baddr]  "m" (b),      // 1
- [caddr]  "m" (c),      // 2
- [k_iter] "m" (k_iter), // 3
- [k_left] "m" (k_left), // 4
- [alpha]  "m" (alpha),  // 5
- [beta]   "m" (beta),   // 6
- [rs_c]   "m" (rs_c),   // 6
- [cs_c]   "m" (cs_c),   // 7
- [a_next] "m" (a_next), // 8
- [b_next] "m" (b_next)  // 9
-:// Register clobber list
- "x0","x1","x2",
- "x5","x6","x10",
- "x14","x16","x17",
- "x20","x21","x22","x23","x24","x25","x26","x27",
- "v0","v1","v2",
- "v3","v4","v5",
- "v6","v7","v8",
- "v9","v10","v11",
- "v12","v13","v14",
- "v15","v16","v17","v18","v19",
- "v20","v21","v22","v23",
- "v24","v25","v26","v27",
- "v28","v29","v30","v31"
-);
-
+	GEMM_UKR_SETUP_CT( d, 6, 8, false );
 
+	__asm__ volatile
+	(
+	"                                            \n\t"
+	" ldr x0,%[aaddr]                            \n\t" // Load address of A
+	" ldr x1,%[baddr]                            \n\t" // Load address of B
+	" ldr x2,%[caddr]                            \n\t" // Load address of C
+	"                                            \n\t"
+	" ldr x5,%[k_iter]                           \n\t" // Init guard (k_iter)
+	" ldr x6,%[k_left]                           \n\t" // Init guard (k_iter)
+	"                                            \n\t"
+	" ldr x10,%[cs_c]                            \n\t" // Load cs_c
+	" lsl x10,x10,#3                             \n\t" // cs_c * sizeof(double)
+	"                                            \n\t"
+	" ldr x14,%[rs_c]                            \n\t" // Load rs_c.
+	" lsl x14,x14,#3                             \n\t" // rs_c * sizeof(double).
+	"                                            \n\t"
+	" add x20,x2,x10                             \n\t" //Load address Column 1 of C
+	" add x21,x20,x10                            \n\t" //Load address Column 2 of C
+	" add x22,x21,x10                            \n\t" //Load address Column 3 of C
+	" add x23,x22,x10                            \n\t" //Load address Column 4 of C
+	" add x24,x23,x10                            \n\t" //Load address Column 5 of C
+	" add x25,x24,x10                            \n\t" //Load address Column 6 of C
+	" add x26,x25,x10                            \n\t" //Load address Column 7 of C
+	"                                            \n\t"
+	" prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
+	" prfm pldl1keep,[x20]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x21]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x22]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x23]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x24]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
+	"                                            \n\t"
+	" dup  v8.2d, xzr                            \n\t" // Vector for accummulating column 0
+	" prfm    PLDL1KEEP, [x1, #256]              \n\t"
+	" dup  v9.2d, xzr                            \n\t" // Vector for accummulating column 0
+	" prfm    PLDL1KEEP, [x1, #320]              \n\t"
+	" dup  v10.2d, xzr                           \n\t" // Vector for accummulating column 0
+	" prfm    PLDL1KEEP, [x1, #384]              \n\t"
+	" dup  v11.2d, xzr                           \n\t" // Vector for accummulating column 1
+	" prfm    PLDL1KEEP, [x1, #448]              \n\t"
+	" dup  v12.2d, xzr                           \n\t" // Vector for accummulating column 1
+	" dup  v13.2d, xzr                           \n\t" // Vector for accummulating column 1
+	"                                            \n\t"
+	" dup  v14.2d, xzr                           \n\t" // Vector for accummulating column 2
+	" prfm    PLDL1KEEP, [x0, #192]              \n\t"
+	" dup  v15.2d, xzr                           \n\t" // Vector for accummulating column 2
+	" prfm    PLDL1KEEP, [x0, #256]              \n\t"
+	" dup  v16.2d, xzr                           \n\t" // Vector for accummulating column 2
+	" prfm    PLDL1KEEP, [x0, #320]              \n\t"
+	" dup  v17.2d, xzr                           \n\t" // Vector for accummulating column 3
+	" dup  v18.2d, xzr                           \n\t" // Vector for accummulating column 3
+	" dup  v19.2d, xzr                           \n\t" // Vector for accummulating column 3
+	"                                            \n\t"
+	" dup  v20.2d, xzr                           \n\t" // Vector for accummulating column 4
+	" dup  v21.2d, xzr                           \n\t" // Vector for accummulating column 4
+	" dup  v22.2d, xzr                           \n\t" // Vector for accummulating column 4
+	" dup  v23.2d, xzr                           \n\t" // Vector for accummulating column 5
+	" dup  v24.2d, xzr                           \n\t" // Vector for accummulating column 5
+	" dup  v25.2d, xzr                           \n\t" // Vector for accummulating column 5
+	"                                            \n\t"
+	" dup  v26.2d, xzr                           \n\t" // Vector for accummulating column 6
+	" dup  v27.2d, xzr                           \n\t" // Vector for accummulating column 6
+	" dup  v28.2d, xzr                           \n\t" // Vector for accummulating column 6
+	" dup  v29.2d, xzr                           \n\t" // Vector for accummulating column 7
+	" dup  v30.2d, xzr                           \n\t" // Vector for accummulating column 7
+	" dup  v31.2d, xzr                           \n\t" // Vector for accummulating column 7
+	"                                            \n\t"
+	"                                            \n\t"
+	" cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
+	BEQ(DCONSIDERKLEFT)
+	"                                            \n\t"
+	" ldr q0, [x0]                               \n\t" // Load a
+	" ldr q1, [x0, #16]                          \n\t"
+	" ldr q2, [x0, #32]                          \n\t"
+	"                                            \n\t"
+	" ldr q3, [x1]                               \n\t" // Load b
+	" ldr q4, [x1, #16]                          \n\t"
+	" ldr q5, [x1, #32]                          \n\t"
+	" ldr q6, [x1, #48]                          \n\t"
+	"                                            \n\t"
+	" add x0, x0, #48                            \n\t" //update address of A
+	" add x1, x1, #64                            \n\t" //update address of B
+	"                                            \n\t"
+	" cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one.
+	BEQ(DLASTITER)                                     // (as loop is do-while-like).
+	"                                            \n\t"
+	LABEL(DLOOP)                                       // Body
+	"                                            \n\t"
+	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+	" prfm    PLDL1KEEP, [x1, #448]              \n\t" //512-64=448
+	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+	" prfm    PLDL1KEEP, [x1, #512]              \n\t"
+	" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
+	" prfm    PLDL1KEEP, [x1, #576]              \n\t"
+	"                                            \n\t"
+	" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v13.2d,v2.2d,v3.d[1]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v16.2d,v2.2d,v4.d[0]                  \n\t" // Accummulate
+	" ldr q3, [x1]                               \n\t"
+	"                                            \n\t"
+	" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v19.2d,v2.2d,v4.d[1]                  \n\t" // Accummulate
+	" ldr q7, [x0, #32]                          \n\t"
+	"                                            \n\t"
+	" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v22.2d,v2.2d,v5.d[0]                  \n\t" // Accummulate
+	" ldr q4, [x1, #16]                          \n\t"
+	"                                            \n\t"
+	" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v25.2d,v2.2d,v5.d[1]                  \n\t" // Accummulate
+	" ldr q5, [x1, #32]                          \n\t"
+	"                                            \n\t"
+	" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q0, [x0]                               \n\t"
+	"                                            \n\t"
+	" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q1, [x0, #16]                          \n\t"
+	"                                            \n\t"
+	" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q6, [x1, #48]                          \n\t"
+	"                                            \n\t"                  // End it 1
+	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+	" prfm    PLDL1KEEP, [x1, #640]              \n\t"
+	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+	" prfm    PLDL1KEEP, [x0, #336]              \n\t"
+	" fmla v10.2d,v7.2d,v3.d[0]                  \n\t" // Accummulate
+	" prfm    PLDL1KEEP, [x0, #400]              \n\t"
+	"                                            \n\t"
+	" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v13.2d,v7.2d,v3.d[1]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v16.2d,v7.2d,v4.d[0]                  \n\t" // Accummulate
+	" ldr q3, [x1, #64]                          \n\t"
+	"                                            \n\t"
+	" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v19.2d,v7.2d,v4.d[1]                  \n\t" // Accummulate
+	" ldr q2, [x0, #80]                          \n\t"
+	"                                            \n\t"
+	" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v22.2d,v7.2d,v5.d[0]                  \n\t" // Accummulate
+	" ldr q4, [x1, #80]                          \n\t"
+	"                                            \n\t"
+	" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v25.2d,v7.2d,v5.d[1]                  \n\t" // Accummulate
+	" ldr q5, [x1, #96]                          \n\t"
+	"                                            \n\t"
+	" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q0, [x0, #48]                          \n\t"
+	"                                            \n\t"
+	" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q1, [x0, #64]                          \n\t"
+	"                                            \n\t"
+	" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q6, [x1, #112]                         \n\t"
+	"                                            \n\t"                  //End it 2
+	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+	" prfm    PLDL1KEEP, [x0, #464]              \n\t"
+	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+	" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v13.2d,v2.2d,v3.d[1]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v16.2d,v2.2d,v4.d[0]                  \n\t" // Accummulate
+	" ldr q3, [x1, #128]                         \n\t"
+	"                                            \n\t"
+	" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v19.2d,v2.2d,v4.d[1]                  \n\t" // Accummulate
+	" ldr q7, [x0, #128]                         \n\t"
+	"                                            \n\t"
+	" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v22.2d,v2.2d,v5.d[0]                  \n\t" // Accummulate
+	" ldr q4, [x1, #144]                         \n\t"
+	"                                            \n\t"
+	" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v25.2d,v2.2d,v5.d[1]                  \n\t" // Accummulate
+	" ldr q5, [x1, #160]                         \n\t"
+	"                                            \n\t"
+	" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q0, [x0, #96]                          \n\t"
+	"                                            \n\t"
+	" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q1, [x0, #112]                         \n\t"
+	"                                            \n\t"
+	" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q6, [x1, #176]                         \n\t"
+	"                                            \n\t"                  // End it 3
+	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+	" fmla v10.2d,v7.2d,v3.d[0]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v13.2d,v7.2d,v3.d[1]                  \n\t" // Accummulate
+	" ldr q3, [x1, #192]                         \n\t"
+	"                                            \n\t"
+	" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v16.2d,v7.2d,v4.d[0]                  \n\t" // Accummulate
+	" ldr q2, [x0, #176]                         \n\t"
+	"                                            \n\t"
+	" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v19.2d,v7.2d,v4.d[1]                  \n\t" // Accummulate
+	" ldr q4, [x1, #208]                         \n\t"
+	"                                            \n\t"
+	" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v22.2d,v7.2d,v5.d[0]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v25.2d,v7.2d,v5.d[1]                  \n\t" // Accummulate
+	" ldr q5, [x1, #224]                         \n\t"
+	"                                            \n\t"
+	" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q0, [x0, #144]                         \n\t"
+	"                                            \n\t"
+	" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q1, [x0, #160]                         \n\t"
+	"                                            \n\t"
+	" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q6, [x1, #240]                         \n\t"
+	"                                            \n\t"                  //End it 4
+	" add x0, x0, #192                           \n\t"
+	" add x1, x1, #256                           \n\t"
+	"                                            \n\t"
+	" sub x5,x5,1                                \n\t" // i-=1
+	" cmp x5,1                                   \n\t" // Iterate again if we are not in k_iter == 1.
+	BNE(DLOOP)
+	"                                            \n\t"
+	LABEL(DLASTITER)
+	"                                            \n\t"
+	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+	" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v13.2d,v2.2d,v3.d[1]                  \n\t" // Accummulate
+	" ldr q3, [x1]                               \n\t"
+	"                                            \n\t"
+	" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v16.2d,v2.2d,v4.d[0]                  \n\t" // Accummulate
+	" ldr q7, [x0, #32]                          \n\t"
+	"                                            \n\t"
+	" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v19.2d,v2.2d,v4.d[1]                  \n\t" // Accummulate
+	" ldr q4, [x1, #16]                          \n\t"
+	"                                            \n\t"
+	" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v22.2d,v2.2d,v5.d[0]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v25.2d,v2.2d,v5.d[1]                  \n\t" // Accummulate
+	" ldr q5, [x1, #32]                          \n\t"
+	"                                            \n\t"
+	" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q0, [x0]                               \n\t"
+	"                                            \n\t"
+	" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q1, [x0, #16]                          \n\t"
+	"                                            \n\t"
+	" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q6, [x1, #48]                          \n\t"
+	"                                            \n\t"                  // End it 1
+	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+	" fmla v10.2d,v7.2d,v3.d[0]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v13.2d,v7.2d,v3.d[1]                  \n\t" // Accummulate
+	" ldr q3, [x1, #64]                          \n\t"
+	"                                            \n\t"
+	" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v16.2d,v7.2d,v4.d[0]                  \n\t" // Accummulate
+	" ldr q2, [x0, #80]                          \n\t"
+	"                                            \n\t"
+	" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v19.2d,v7.2d,v4.d[1]                  \n\t" // Accummulate
+	" ldr q4, [x1, #80]                          \n\t"
+	"                                            \n\t"
+	" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v22.2d,v7.2d,v5.d[0]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v25.2d,v7.2d,v5.d[1]                  \n\t" // Accummulate
+	" ldr q5, [x1, #96]                          \n\t"
+	"                                            \n\t"
+	" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q0, [x0, #48]                          \n\t"
+	"                                            \n\t"
+	" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q1, [x0, #64]                          \n\t"
+	"                                            \n\t"
+	" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q6, [x1, #112]                         \n\t"
+	"                                            \n\t"                  //End it 2
+	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+	" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v13.2d,v2.2d,v3.d[1]                  \n\t" // Accummulate
+	" ldr q3, [x1, #128]                         \n\t"
+	"                                            \n\t"
+	" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v16.2d,v2.2d,v4.d[0]                  \n\t" // Accummulate
+	" ldr q7, [x0, #128]                         \n\t"
+	"                                            \n\t"
+	" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v19.2d,v2.2d,v4.d[1]                  \n\t" // Accummulate
+	" ldr q4, [x1, #144]                         \n\t"
+	"                                            \n\t"
+	" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v22.2d,v2.2d,v5.d[0]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v25.2d,v2.2d,v5.d[1]                  \n\t" // Accummulate
+	" ldr q5, [x1, #160]                         \n\t"
+	"                                            \n\t"
+	" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q0, [x0, #96]                          \n\t"
+	"                                            \n\t"
+	" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q1, [x0, #112]                         \n\t"
+	"                                            \n\t"
+	" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
+	" ldr q6, [x1, #176]                         \n\t"
+	"                                            \n\t"                  // End it 3
+	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+	" fmla v10.2d,v7.2d,v3.d[0]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v13.2d,v7.2d,v3.d[1]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v16.2d,v7.2d,v4.d[0]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v19.2d,v7.2d,v4.d[1]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v22.2d,v7.2d,v5.d[0]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v25.2d,v7.2d,v5.d[1]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+	" add x1, x1, #192                           \n\t"
+	" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
+	"                                            \n\t"                  //End it 4
+	" add x0, x0, #144                           \n\t"
+	"                                            \n\t"
+	LABEL(DCONSIDERKLEFT)
+	" cmp x6,0                                   \n\t" // If k_left == 0, we are done.
+	BEQ(DPOSTACCUM)                                    // else, we enter the k_left loop.
+	"                                            \n\t"
+	LABEL(DLOOPKLEFT)
+	"                                            \n\t"
+	" ldr q0, [x0],#16                           \n\t"
+	" ldr q1, [x0],#16                           \n\t" // Load a
+	" ldr q2, [x0],#16                           \n\t"
+	"                                            \n\t"
+	" ldr q3, [x1],#16                           \n\t" // Load b
+	" ldr q4, [x1],#16                           \n\t"
+	" ldr q5, [x1],#16                           \n\t"
+	" ldr q6, [x1],#16                           \n\t"
+	"                                            \n\t"
+	" sub x6,x6,1                                \n\t"
+	"                                            \n\t"
+	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+	" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+	" fmla v13.2d,v2.2d,v3.d[1]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+	" fmla v16.2d,v2.2d,v4.d[0]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+	" fmla v19.2d,v2.2d,v4.d[1]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+	" fmla v22.2d,v2.2d,v5.d[0]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+	" fmla v25.2d,v2.2d,v5.d[1]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
+	" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
+	"                                            \n\t"
+	" cmp x6,0                                   \n\t" // Iterate again.
+	BNE(DLOOPKLEFT)                                    // if i!=0.
+	"                                            \n\t"
+	LABEL(DPOSTACCUM)
+	"                                            \n\t"
+	" ldr x0,%[alpha]                            \n\t" // Alpha address
+	" ldr x1,%[beta]                             \n\t" // Beta address
+	"                                            \n\t"
+	" ld1r {v6.2d},[x0]                          \n\t" // Load alpha.
+	" ld1r {v7.2d},[x1]                          \n\t" // Load beta
+	"                                            \n\t"
+	" ldr x0,%[a_next]                           \n\t" // Next A address for later use.
+	" ldr x1,%[b_next]                           \n\t" // Next B address for later use.
+	"                                            \n\t"
+	" cmp x14,#8                                 \n\t" // If rs_c != 1 (column-major)
+	BNE(DGENSTORED)
+	"                                            \n\t"
+	LABEL(DCOLSTORED)                                  // C is column-major.
+	"                                            \n\t"
+	" dup  v0.2d, xzr                            \n\t"
+	" dup  v1.2d, xzr                            \n\t"
+	" dup  v2.2d, xzr                            \n\t"
+	" dup  v3.2d, xzr                            \n\t"
+	" dup  v4.2d, xzr                            \n\t"
+	" dup  v5.2d, xzr                            \n\t"
+	"                                            \n\t"
+	" fcmp d7,#0.0                               \n\t"
+	BEQ(DBETAZEROCOLSTOREDS1)                          // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" ldr q0, [x2]                               \n\t" //Load column 0 of C
+	" ldr q1, [x2, #16]                          \n\t"
+	" ldr q2, [x2, #32]                          \n\t"
+	"                                            \n\t"
+	" ldr q3, [x20]                              \n\t" //Load column 1 of C
+	" ldr q4, [x20, #16]                         \n\t"
+	" ldr q5, [x20, #32]                         \n\t"
+	"                                            \n\t"
+	" fmul v0.2d,v0.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v1.2d,v1.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v2.2d,v2.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v3.2d,v3.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL(DBETAZEROCOLSTOREDS1)
+	"                                            \n\t"
+	" fmla v0.2d,v8.2d,v6.d[0]                   \n\t" // Scale by alpha
+	" fmla v1.2d,v9.2d,v6.d[0]                   \n\t" // Scale by alpha
+	" fmla v2.2d,v10.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v3.2d,v11.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v4.2d,v12.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v5.2d,v13.2d,v6.d[0]                  \n\t" // Scale by alpha
+	"                                            \n\t"
+	" str q0, [x2]                               \n\t" //Store column 0 of C
+	" str q1, [x2, #16]                          \n\t"
+	" str q2, [x2, #32]                          \n\t"
+	"                                            \n\t"
+	" str q3, [x20]                              \n\t" //Store column 1 of C
+	" str q4, [x20, #16]                         \n\t"
+	" str q5, [x20, #32]                         \n\t"
+	"                                            \n\t"
+	" dup  v8.2d, xzr                            \n\t"
+	" dup  v9.2d, xzr                            \n\t"
+	" dup  v10.2d, xzr                           \n\t"
+	" dup  v11.2d, xzr                           \n\t"
+	" dup  v12.2d, xzr                           \n\t"
+	" dup  v13.2d, xzr                           \n\t"
+	"                                            \n\t"
+	" fcmp d7,#0.0                               \n\t"
+	BEQ(DBETAZEROCOLSTOREDS2)                          // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" ldr q8, [x21]                              \n\t" //Load column 2 of C
+	" ldr q9, [x21, #16]                         \n\t"
+	" ldr q10, [x21, #32]                        \n\t"
+	"                                            \n\t"
+	" ldr q11, [x22]                             \n\t" //Load column 3 of C
+	" ldr q12, [x22, #16]                        \n\t"
+	" ldr q13, [x22, #32]                        \n\t"
+	"                                            \n\t"
+	" fmul v8.2d, v8.2d, v7.d[0]                 \n\t" // Scale by beta
+	" fmul v9.2d, v9.2d, v7.d[0]                 \n\t" // Scale by beta
+	" fmul v10.2d,v10.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v11.2d,v11.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL(DBETAZEROCOLSTOREDS2)
+	"                                            \n\t"
+	" fmla v8.2d, v14.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v9.2d, v15.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v10.2d,v16.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v11.2d,v17.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v12.2d,v18.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v13.2d,v19.2d,v6.d[0]                 \n\t" // Scale by alpha
+	"                                            \n\t"
+	" str q8, [x21]                              \n\t" //Store column 2 of C
+	" str q9, [x21, #16]                         \n\t"
+	" str q10, [x21, #32]                        \n\t"
+	"                                            \n\t"
+	" str q11, [x22]                             \n\t" //Store column 3 of C
+	" str q12, [x22, #16]                        \n\t"
+	" str q13, [x22, #32]                        \n\t"
+	"                                            \n\t"
+	" dup  v0.2d, xzr                            \n\t"
+	" dup  v1.2d, xzr                            \n\t"
+	" dup  v2.2d, xzr                            \n\t"
+	" dup  v3.2d, xzr                            \n\t"
+	" dup  v4.2d, xzr                            \n\t"
+	" dup  v5.2d, xzr                            \n\t"
+	"                                            \n\t"
+	" fcmp d7,#0.0                               \n\t"
+	BEQ(DBETAZEROCOLSTOREDS3)                          // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" ldr q0, [x23]                              \n\t" //Load column 4 of C
+	" ldr q1, [x23, #16]                         \n\t"
+	" ldr q2, [x23, #32]                         \n\t"
+	"                                            \n\t"
+	" ldr q3, [x24]                              \n\t" //Load column 5 of C
+	" ldr q4, [x24, #16]                         \n\t"
+	" ldr q5, [x24, #32]                         \n\t"
+	"                                            \n\t"
+	" fmul v0.2d,v0.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v1.2d,v1.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v2.2d,v2.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v3.2d,v3.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL(DBETAZEROCOLSTOREDS3)
+	"                                            \n\t"
+	" fmla v0.2d,v20.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v1.2d,v21.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v2.2d,v22.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v3.2d,v23.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v4.2d,v24.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v5.2d,v25.2d,v6.d[0]                  \n\t" // Scale by alpha
+	"                                            \n\t"
+	" str q0, [x23]                              \n\t" //Store column 4 of C
+	" str q1, [x23, #16]                         \n\t"
+	" str q2, [x23, #32]                         \n\t"
+	"                                            \n\t"
+	" str q3, [x24]                              \n\t" //Store column 5 of C
+	" str q4, [x24, #16]                         \n\t"
+	" str q5, [x24, #32]                         \n\t"
+	"                                            \n\t"
+	" dup  v8.2d, xzr                            \n\t"
+	" dup  v9.2d, xzr                            \n\t"
+	" dup  v10.2d, xzr                           \n\t"
+	" dup  v11.2d, xzr                           \n\t"
+	" dup  v12.2d, xzr                           \n\t"
+	" dup  v13.2d, xzr                           \n\t"
+	"                                            \n\t"
+	" fcmp d7,#0.0                               \n\t"
+	BEQ(DBETAZEROCOLSTOREDS4)                          // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" ldr q8, [x25]                              \n\t" //Load column 6 of C
+	" ldr q9, [x25, #16]                         \n\t"
+	" ldr q10, [x25, #32]                        \n\t"
+	"                                            \n\t"
+	" ldr q11, [x26]                             \n\t" //Load column 7 of C
+	" ldr q12, [x26, #16]                        \n\t"
+	" ldr q13, [x26, #32]                        \n\t"
+	"                                            \n\t"
+	" fmul v8.2d, v8.2d, v7.d[0]                 \n\t" // Scale by beta
+	" fmul v9.2d, v9.2d, v7.d[0]                 \n\t" // Scale by beta
+	" fmul v10.2d,v10.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v11.2d,v11.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL(DBETAZEROCOLSTOREDS4)
+	"                                            \n\t"
+	" prfm pldl2keep,[x0]                        \n\t"
+	" prfm pldl2keep,[x1]                        \n\t"
+	"                                            \n\t"
+	" fmla v8.2d, v26.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v9.2d, v27.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v10.2d,v28.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v11.2d,v29.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v12.2d,v30.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v13.2d,v31.2d,v6.d[0]                 \n\t" // Scale by alpha
+	"                                            \n\t"
+	" str q8, [x25]                              \n\t" //Store column 6 of C
+	" str q9, [x25, #16]                         \n\t"
+	" str q10, [x25, #32]                        \n\t"
+	"                                            \n\t"
+	" str q11, [x26]                             \n\t" //Store column 7 of C
+	" str q12, [x26, #16]                        \n\t"
+	" str q13, [x26, #32]                        \n\t"
+	"                                            \n\t"
+	BRANCH(DEND)
+	"                                            \n\t"
+	LABEL(DGENSTORED)                                  // C is general-stride stored.
+	"                                            \n\t"
+	" dup  v0.2d, xzr                            \n\t"
+	" dup  v1.2d, xzr                            \n\t"
+	" dup  v2.2d, xzr                            \n\t"
+	" dup  v3.2d, xzr                            \n\t"
+	" dup  v4.2d, xzr                            \n\t"
+	" dup  v5.2d, xzr                            \n\t"
+	"                                            \n\t"
+	" fcmp d7,#0.0                               \n\t"
+	BEQ(DBETAZEROGENSTOREDS1)                          // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" mov x27, x2                                \n\t"
+	"                                            \n\t" // Load address of C.
+	" ld1 {v0.d}[0],[x27],x14                    \n\t" // Load c00  into quad and increment by rs_c.
+	" ld1 {v0.d}[1],[x27],x14                    \n\t" // Load c01  into quad and increment by rs_c.
+	" ld1 {v1.d}[0],[x27],x14                    \n\t" // Load c02  into quad and increment by rs_c.
+	" ld1 {v1.d}[1],[x27],x14                    \n\t" // Load c03  into quad and increment by rs_c.
+	" ld1 {v2.d}[0],[x27],x14                    \n\t" // Load c04  into quad and increment by rs_c.
+	" ld1 {v2.d}[1],[x27],x14                    \n\t" // Load c05  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x27, x20                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" ld1 {v3.d}[0],[x27],x14                    \n\t" // Load c10  into quad and increment by rs_c.
+	" ld1 {v3.d}[1],[x27],x14                    \n\t" // Load c11  into quad and increment by rs_c.
+	" ld1 {v4.d}[0],[x27],x14                    \n\t" // Load c12  into quad and increment by rs_c.
+	" ld1 {v4.d}[1],[x27],x14                    \n\t" // Load c13  into quad and increment by rs_c.
+	" ld1 {v5.d}[0],[x27],x14                    \n\t" // Load c14  into quad and increment by rs_c.
+	" ld1 {v5.d}[1],[x27],x14                    \n\t" // Load c15  into quad and increment by rs_c.
+	"                                            \n\t"
+	" fmul v0.2d,v0.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v1.2d,v1.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v2.2d,v2.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v3.2d,v3.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL(DBETAZEROGENSTOREDS1)
+	"                                            \n\t"
+	" fmla v0.2d,v8.2d,v6.d[0]                   \n\t" // Scale by alpha
+	" fmla v1.2d,v9.2d,v6.d[0]                   \n\t" // Scale by alpha
+	" fmla v2.2d,v10.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v3.2d,v11.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v4.2d,v12.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v5.2d,v13.2d,v6.d[0]                  \n\t" // Scale by alpha
+	"                                            \n\t"
+	" mov x27, x2                                \n\t" // Load address of C.
+	"                                            \n\t"
+	" st1 {v0.d}[0],[x27],x14                    \n\t" // Store c00  into quad and increment by rs_c.
+	" st1 {v0.d}[1],[x27],x14                    \n\t" // Store c01  into quad and increment by rs_c.
+	" st1 {v1.d}[0],[x27],x14                    \n\t" // Store c02  into quad and increment by rs_c.
+	" st1 {v1.d}[1],[x27],x14                    \n\t" // Store c03  into quad and increment by rs_c.
+	" st1 {v2.d}[0],[x27],x14                    \n\t" // Store c04  into quad and increment by rs_c.
+	" st1 {v2.d}[1],[x27],x14                    \n\t" // Store c05  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x27, x20                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" st1 {v3.d}[0],[x27],x14                    \n\t" // Store c10  into quad and increment by rs_c.
+	" st1 {v3.d}[1],[x27],x14                    \n\t" // Store c11  into quad and increment by rs_c.
+	" st1 {v4.d}[0],[x27],x14                    \n\t" // Store c12  into quad and increment by rs_c.
+	" st1 {v4.d}[1],[x27],x14                    \n\t" // Store c13  into quad and increment by rs_c.
+	" st1 {v5.d}[0],[x27],x14                    \n\t" // Store c14  into quad and increment by rs_c.
+	" st1 {v5.d}[1],[x27],x14                    \n\t" // Store c15  into quad and increment by rs_c.
+	"                                            \n\t"
+	" dup  v8.2d, xzr                            \n\t"
+	" dup  v9.2d, xzr                            \n\t"
+	" dup  v10.2d, xzr                           \n\t"
+	" dup  v11.2d, xzr                           \n\t"
+	" dup  v12.2d, xzr                           \n\t"
+	" dup  v13.2d, xzr                           \n\t"
+	"                                            \n\t"
+	" fcmp d7,#0.0                               \n\t"
+	BEQ(DBETAZEROGENSTOREDS2)                          // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" mov x27, x21                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" ld1 {v8.d}[0], [x27],x14                   \n\t" // Load c20  into quad and increment by rs_c.
+	" ld1 {v8.d}[1], [x27],x14                   \n\t" // Load c21  into quad and increment by rs_c.
+	" ld1 {v9.d}[0], [x27],x14                   \n\t" // Load c22  into quad and increment by rs_c.
+	" ld1 {v9.d}[1], [x27],x14                   \n\t" // Load c23  into quad and increment by rs_c.
+	" ld1 {v10.d}[0],[x27],x14                   \n\t" // Load c24  into quad and increment by rs_c.
+	" ld1 {v10.d}[1],[x27],x14                   \n\t" // Load c25  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x27, x22                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" ld1 {v11.d}[0],[x27],x14                   \n\t" // Load c30  into quad and increment by rs_c.
+	" ld1 {v11.d}[1],[x27],x14                   \n\t" // Load c31  into quad and increment by rs_c.
+	" ld1 {v12.d}[0],[x27],x14                   \n\t" // Load c32  into quad and increment by rs_c.
+	" ld1 {v12.d}[1],[x27],x14                   \n\t" // Load c33  into quad and increment by rs_c.
+	" ld1 {v13.d}[0],[x27],x14                   \n\t" // Load c34  into quad and increment by rs_c.
+	" ld1 {v13.d}[1],[x27],x14                   \n\t" // Load c35  into quad and increment by rs_c.
+	"                                            \n\t"
+	" fmul v8.2d, v8.2d, v7.d[0]                 \n\t" // Scale by beta
+	" fmul v9.2d, v9.2d, v7.d[0]                 \n\t" // Scale by beta
+	" fmul v10.2d,v10.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v11.2d,v11.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL(DBETAZEROGENSTOREDS2)
+	"                                            \n\t"
+	" fmla v8.2d, v14.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v9.2d, v15.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v10.2d,v16.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v11.2d,v17.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v12.2d,v18.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v13.2d,v19.2d,v6.d[0]                 \n\t" // Scale by alpha
+	"                                            \n\t"
+	" mov x27, x21                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" st1 {v8.d}[0], [x27],x14                   \n\t" // Store c20  into quad and increment by rs_c.
+	" st1 {v8.d}[1], [x27],x14                   \n\t" // Store c21  into quad and increment by rs_c.
+	" st1 {v9.d}[0], [x27],x14                   \n\t" // Store c22  into quad and increment by rs_c.
+	" st1 {v9.d}[1], [x27],x14                   \n\t" // Store c23  into quad and increment by rs_c.
+	" st1 {v10.d}[0],[x27],x14                   \n\t" // Store c24  into quad and increment by rs_c.
+	" st1 {v10.d}[1],[x27],x14                   \n\t" // Store c25  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x27, x22                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" st1 {v11.d}[0],[x27],x14                   \n\t" // Store c30  into quad and increment by rs_c.
+	" st1 {v11.d}[1],[x27],x14                   \n\t" // Store c31  into quad and increment by rs_c.
+	" st1 {v12.d}[0],[x27],x14                   \n\t" // Store c32  into quad and increment by rs_c.
+	" st1 {v12.d}[1],[x27],x14                   \n\t" // Store c33  into quad and increment by rs_c.
+	" st1 {v13.d}[0],[x27],x14                   \n\t" // Store c34  into quad and increment by rs_c.
+	" st1 {v13.d}[1],[x27],x14                   \n\t" // Store c35  into quad and increment by rs_c.
+	"                                            \n\t"
+	" dup  v0.2d, xzr                            \n\t"
+	" dup  v1.2d, xzr                            \n\t"
+	" dup  v2.2d, xzr                            \n\t"
+	" dup  v3.2d, xzr                            \n\t"
+	" dup  v4.2d, xzr                            \n\t"
+	" dup  v5.2d, xzr                            \n\t"
+	"                                            \n\t"
+	" fcmp d7,#0.0                               \n\t"
+	BEQ(DBETAZEROGENSTOREDS3)                          // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" mov x27, x23                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" ld1 {v0.d}[0],[x27],x14                    \n\t" // Load c40  into quad and increment by rs_c.
+	" ld1 {v0.d}[1],[x27],x14                    \n\t" // Load c41  into quad and increment by rs_c.
+	" ld1 {v1.d}[0],[x27],x14                    \n\t" // Load c42  into quad and increment by rs_c.
+	" ld1 {v1.d}[1],[x27],x14                    \n\t" // Load c43  into quad and increment by rs_c.
+	" ld1 {v2.d}[0],[x27],x14                    \n\t" // Load c44  into quad and increment by rs_c.
+	" ld1 {v2.d}[1],[x27],x14                    \n\t" // Load c45  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x27, x24                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" ld1 {v3.d}[0],[x27],x14                    \n\t" // Load c50  into quad and increment by rs_c.
+	" ld1 {v3.d}[1],[x27],x14                    \n\t" // Load c51  into quad and increment by rs_c.
+	" ld1 {v4.d}[0],[x27],x14                    \n\t" // Load c52  into quad and increment by rs_c.
+	" ld1 {v4.d}[1],[x27],x14                    \n\t" // Load c53  into quad and increment by rs_c.
+	" ld1 {v5.d}[0],[x27],x14                    \n\t" // Load c54  into quad and increment by rs_c.
+	" ld1 {v5.d}[1],[x27],x14                    \n\t" // Load c55  into quad and increment by rs_c.
+	"                                            \n\t"
+	" fmul v0.2d,v0.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v1.2d,v1.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v2.2d,v2.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v3.2d,v3.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL(DBETAZEROGENSTOREDS3)
+	"                                            \n\t"
+	" fmla v0.2d,v20.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v1.2d,v21.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v2.2d,v22.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v3.2d,v23.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v4.2d,v24.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v5.2d,v25.2d,v6.d[0]                  \n\t" // Scale by alpha
+	"                                            \n\t"
+	" mov x27, x23                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" st1 {v0.d}[0],[x27],x14                    \n\t" // Store c40  into quad and increment by rs_c.
+	" st1 {v0.d}[1],[x27],x14                    \n\t" // Store c41  into quad and increment by rs_c.
+	" st1 {v1.d}[0],[x27],x14                    \n\t" // Store c42  into quad and increment by rs_c.
+	" st1 {v1.d}[1],[x27],x14                    \n\t" // Store c43  into quad and increment by rs_c.
+	" st1 {v2.d}[0],[x27],x14                    \n\t" // Store c44  into quad and increment by rs_c.
+	" st1 {v2.d}[1],[x27],x14                    \n\t" // Store c45  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x27, x24                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" st1 {v3.d}[0],[x27],x14                    \n\t" // Store c50  into quad and increment by rs_c.
+	" st1 {v3.d}[1],[x27],x14                    \n\t" // Store c51  into quad and increment by rs_c.
+	" st1 {v4.d}[0],[x27],x14                    \n\t" // Store c52  into quad and increment by rs_c.
+	" st1 {v4.d}[1],[x27],x14                    \n\t" // Store c53  into quad and increment by rs_c.
+	" st1 {v5.d}[0],[x27],x14                    \n\t" // Store c54  into quad and increment by rs_c.
+	" st1 {v5.d}[1],[x27],x14                    \n\t" // Store c55  into quad and increment by rs_c.
+	"                                            \n\t"
+	" dup  v8.2d, xzr                            \n\t"
+	" dup  v9.2d, xzr                            \n\t"
+	" dup  v10.2d, xzr                           \n\t"
+	" dup  v11.2d, xzr                           \n\t"
+	" dup  v12.2d, xzr                           \n\t"
+	" dup  v13.2d, xzr                           \n\t"
+	"                                            \n\t"
+	" fcmp d7,#0.0                               \n\t"
+	BEQ(DBETAZEROGENSTOREDS4)                          // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" mov x27, x25                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" ld1 {v8.d}[0], [x27],x14                   \n\t" // Load c60  into quad and increment by rs_c.
+	" ld1 {v8.d}[1], [x27],x14                   \n\t" // Load c61  into quad and increment by rs_c.
+	" ld1 {v9.d}[0], [x27],x14                   \n\t" // Load c62  into quad and increment by rs_c.
+	" ld1 {v9.d}[1], [x27],x14                   \n\t" // Load c63  into quad and increment by rs_c.
+	" ld1 {v10.d}[0],[x27],x14                   \n\t" // Load c64  into quad and increment by rs_c.
+	" ld1 {v10.d}[1],[x27],x14                   \n\t" // Load c65  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x27, x26                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" ld1 {v11.d}[0],[x27],x14                   \n\t" // Load c70  into quad and increment by rs_c.
+	" ld1 {v11.d}[1],[x27],x14                   \n\t" // Load c71  into quad and increment by rs_c.
+	" ld1 {v12.d}[0],[x27],x14                   \n\t" // Load c72  into quad and increment by rs_c.
+	" ld1 {v12.d}[1],[x27],x14                   \n\t" // Load c73  into quad and increment by rs_c.
+	" ld1 {v13.d}[0],[x27],x14                   \n\t" // Load c74  into quad and increment by rs_c.
+	" ld1 {v13.d}[1],[x27],x14                   \n\t" // Load c75  into quad and increment by rs_c.
+	"                                            \n\t"
+	" fmul v8.2d, v8.2d, v7.d[0]                 \n\t" // Scale by beta
+	" fmul v9.2d, v9.2d, v7.d[0]                 \n\t" // Scale by beta
+	" fmul v10.2d,v10.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v11.2d,v11.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL(DBETAZEROGENSTOREDS4)
+	"                                            \n\t"
+	" prfm pldl2keep,[x0]                        \n\t"
+	" prfm pldl2keep,[x1]                        \n\t"
+	"                                            \n\t"
+	" fmla v8.2d, v26.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v9.2d, v27.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v10.2d,v28.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v11.2d,v29.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v12.2d,v30.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v13.2d,v31.2d,v6.d[0]                 \n\t" // Scale by alpha
+	"                                            \n\t"
+	" mov x27, x25                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" st1 {v8.d}[0], [x27],x14                   \n\t" // Store c60  into quad and increment by rs_c.
+	" st1 {v8.d}[1], [x27],x14                   \n\t" // Store c61  into quad and increment by rs_c.
+	" st1 {v9.d}[0], [x27],x14                   \n\t" // Store c62  into quad and increment by rs_c.
+	" st1 {v9.d}[1], [x27],x14                   \n\t" // Store c63  into quad and increment by rs_c.
+	" st1 {v10.d}[0],[x27],x14                   \n\t" // Store c64  into quad and increment by rs_c.
+	" st1 {v10.d}[1],[x27],x14                   \n\t" // Store c65  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x27, x26                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" st1 {v11.d}[0],[x27],x14                   \n\t" // Store c70  into quad and increment by rs_c.
+	" st1 {v11.d}[1],[x27],x14                   \n\t" // Store c71  into quad and increment by rs_c.
+	" st1 {v12.d}[0],[x27],x14                   \n\t" // Store c72  into quad and increment by rs_c.
+	" st1 {v12.d}[1],[x27],x14                   \n\t" // Store c73  into quad and increment by rs_c.
+	" st1 {v13.d}[0],[x27],x14                   \n\t" // Store c74  into quad and increment by rs_c.
+	" st1 {v13.d}[1],[x27],x14                   \n\t" // Store c75  into quad and increment by rs_c.
+	"                                            \n\t"
+	LABEL(DEND)                                        // Done!
+	"                                            \n\t"
+	:// output operands (none)
+	:// input operands
+	 [aaddr]  "m" (a),      // 0
+	 [baddr]  "m" (b),      // 1
+	 [caddr]  "m" (c),      // 2
+	 [k_iter] "m" (k_iter), // 3
+	 [k_left] "m" (k_left), // 4
+	 [alpha]  "m" (alpha),  // 5
+	 [beta]   "m" (beta),   // 6
+	 [rs_c]   "m" (rs_c),   // 7
+	 [cs_c]   "m" (cs_c),   // 8
+	 [a_next] "m" (a_next), // 9
+	 [b_next] "m" (b_next)  // 10
+	:// Register clobber list
+	 "x0","x1","x2",
+	 "x5","x6","x10",
+	 "x14","x16","x17",
+	 "x20","x21","x22","x23","x24","x25","x26","x27",
+	 "v0","v1","v2",
+	 "v3","v4","v5",
+	 "v6","v7","v8",
+	 "v9","v10","v11",
+	 "v12","v13","v14",
+	 "v15","v16","v17","v18","v19",
+	 "v20","v21","v22","v23",
+	 "v24","v25","v26","v27",
+	 "v28","v29","v30","v31"
+	);
 
+	GEMM_UKR_FLUSH_CT( d );
 }
 
 
 #if 0
 void bli_cgemm_armv8a_opt_4x4
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        scomplex*  restrict alpha,
        scomplex*  restrict a,
@@ -2095,6 +2105,8 @@ void bli_cgemm_armv8a_opt_4x4
 
 void bli_zgemm_armv8a_opt_4x4
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        dcomplex*  restrict alpha,
        dcomplex*  restrict a,
diff --git a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c
index 1612e69b0..15e3e072f 100644
--- a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c
+++ b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c
@@ -56,6 +56,8 @@
 
 void bli_dgemm_bgq_int_8x8
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        double*    restrict alpha,
        double*    restrict a,
@@ -66,6 +68,8 @@ void bli_dgemm_bgq_int_8x8
        cntx_t*    restrict cntx
      )
 {
+    GEMM_UKR_SETUP_CT_ANY( d, 8, 8, false );
+
     //Registers for storing C.
     //4 4x4 subblocks of C, c00, c01, c10, c11
     //4 registers per subblock: a, b, c, d
@@ -201,6 +205,8 @@ void bli_dgemm_bgq_int_8x8
     UPDATE( AB, c, 0 );
     AB = vec_perm( c11d, c11d, pattern );
     UPDATE( AB, c, 4 );
+
+    GEMM_UKR_FLUSH_CT( d );
 }
 
 void printvec(vector4double v)
@@ -214,6 +220,8 @@ void printvec(vector4double v)
 
 void bli_zgemm_bgq_int_4x4
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        dcomplex*  restrict alpha,
        dcomplex*  restrict a,
@@ -224,6 +232,8 @@ void bli_zgemm_bgq_int_4x4
        cntx_t*    restrict cntx
      )
 {
+    GEMM_UKR_SETUP_CT_ANY( z, 4, 4, false );
+
     double* a_d = ( double* )a;
     double* b_d = ( double* )b;
     double* c_d = ( double* )c;
@@ -368,4 +378,6 @@ void bli_zgemm_bgq_int_4x4
     c_d += 2*cs_c;
     ZUPDATE( c03a, c03b, c_d, 0 );
     ZUPDATE( c13a, c13b, c_d, 4 );
+
+    GEMM_UKR_FLUSH_CT( z );
 }
diff --git a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
index 403aaaaee..3a75d61d7 100644
--- a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
+++ b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
@@ -90,7 +90,9 @@
 
 void bli_sgemm_bulldozer_asm_8x8_fma4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        float*     restrict alpha,
        float*     restrict a,
        float*     restrict b,
@@ -102,25 +104,27 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT_ALIGNED( s, 8, 8, false, 32 );
+
 	begin_asm()
-	
+
 	mov(var(a), rax) // load address of a.
 	mov(var(b), rbx) // load address of b.
-	
+
 	vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
 	vmovsldup(mem(rbx, 0*32), ymm2) // elements of a and b.
 	vpermilps(imm(0x4e), ymm2, ymm3)
-	
+
 	mov(var(c), rcx) // load address of c
 	mov(var(cs_c), rdi) // load cs_c
 	lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
 	lea(mem(rcx, rdi, 4), r10) // load address of c + 4*cs_c;
-	
+
 	lea(mem(rdi, rdi, 2), r14) // r14 = 3*cs_c;
 	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*cs_c
@@ -130,7 +134,7 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
 	prefetch(0, mem(r10, rdi, 1, 7*8)) // prefetch c + 5*cs_c
 	prefetch(0, mem(r10, rdi, 2, 7*8)) // prefetch c + 6*cs_c
 	prefetch(0, mem(r10, r14, 1, 7*8)) // prefetch c + 7*cs_c
-	
+
 	vxorps(ymm8, ymm8, ymm8)
 	vxorps(ymm9, ymm9, ymm9)
 	vxorps(ymm10, ymm10, ymm10)
@@ -139,15 +143,15 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
 	vxorps(ymm13, ymm13, ymm13)
 	vxorps(ymm14, ymm14, ymm14)
 	vxorps(ymm15, ymm15, ymm15)
-	
-	
+
+
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.SCONSIDKLEFT) // if i == 0, jump to code that
 	 // contains the k_left loop.
-	
+
 	label(.SLOOPKITER) // MAIN LOOP
-	
+
 	 // iteration 0
 	prefetch(0, mem(rax, 16*32))
 	vfmaddps(ymm15, ymm0, ymm2, ymm15)
@@ -155,44 +159,44 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
 	vmovshdup(mem(rbx, 0*32), ymm2)
 	vfmaddps(ymm13, ymm0, ymm3, ymm13)
 	vperm2f128(imm(0x03), ymm3, ymm3, ymm5)
-	
+
 	vmovaps(mem(rax, 1*32), ymm1)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vfmaddps(ymm11, ymm0, ymm4, ymm11)
 	vfmaddps(ymm9, ymm0, ymm5, ymm9)
-	
+
 	vfmaddps(ymm14, ymm0, ymm2, ymm14)
 	vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
 	vmovsldup(mem(rbx, 1*32), ymm2)
 	vfmaddps(ymm12, ymm0, ymm3, ymm12)
 	vperm2f128(imm(0x03), ymm3, ymm3, ymm5)
-	
+
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vfmaddps(ymm10, ymm0, ymm4, ymm10)
 	vfmaddps(ymm8, ymm0, ymm5, ymm8)
-	
+
 	 // iteration 1
 	vfmaddps(ymm15, ymm1, ymm2, ymm15)
 	vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
 	vmovshdup(mem(rbx, 1*32), ymm2)
 	vfmaddps(ymm13, ymm1, ymm3, ymm13)
 	vperm2f128(imm(0x03), ymm3, ymm3, ymm5)
-	
+
 	vmovaps(mem(rax, 2*32), ymm0)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vfmaddps(ymm11, ymm1, ymm4, ymm11)
 	vfmaddps(ymm9, ymm1, ymm5, ymm9)
-	
+
 	vfmaddps(ymm14, ymm1, ymm2, ymm14)
 	vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
 	vmovsldup(mem(rbx, 2*32), ymm2)
 	vfmaddps(ymm12, ymm1, ymm3, ymm12)
 	vperm2f128(imm(0x03), ymm3, ymm3, ymm5)
-	
+
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vfmaddps(ymm10, ymm1, ymm4, ymm10)
 	vfmaddps(ymm8, ymm1, ymm5, ymm8)
-	
+
 	 // iteration 2
 	prefetch(0, mem(rax, 18*32))
 	vfmaddps(ymm15, ymm0, ymm2, ymm15)
@@ -200,23 +204,23 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
 	vmovshdup(mem(rbx, 2*32), ymm2)
 	vfmaddps(ymm13, ymm0, ymm3, ymm13)
 	vperm2f128(imm(0x03), ymm3, ymm3, ymm5)
-	
+
 	vmovaps(mem(rax, 3*32), ymm1)
 	add(imm(4*8*4), rax) // a += 4*8 (unroll x mr)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vfmaddps(ymm11, ymm0, ymm4, ymm11)
 	vfmaddps(ymm9, ymm0, ymm5, ymm9)
-	
+
 	vfmaddps(ymm14, ymm0, ymm2, ymm14)
 	vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
 	vmovsldup(mem(rbx, 3*32), ymm2)
 	vfmaddps(ymm12, ymm0, ymm3, ymm12)
 	vperm2f128(imm(0x03), ymm3, ymm3, ymm5)
-	
+
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vfmaddps(ymm10, ymm0, ymm4, ymm10)
 	vfmaddps(ymm8, ymm0, ymm5, ymm8)
-	
+
 	 // iteration 3
 	vfmaddps(ymm15, ymm1, ymm2, ymm15)
 	vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
@@ -224,134 +228,134 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
 	add(imm(4*8*4), rbx) // b += 4*8 (unroll x nr)
 	vfmaddps(ymm13, ymm1, ymm3, ymm13)
 	vperm2f128(imm(0x03), ymm3, ymm3, ymm5)
-	
+
 	vmovaps(mem(rax, 0*32), ymm0)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vfmaddps(ymm11, ymm1, ymm4, ymm11)
 	vfmaddps(ymm9, ymm1, ymm5, ymm9)
-	
+
 	vfmaddps(ymm14, ymm1, ymm2, ymm14)
 	vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
 	vmovsldup(mem(rbx, 0*32), ymm2)
 	vfmaddps(ymm12, ymm1, ymm3, ymm12)
 	vperm2f128(imm(0x03), ymm3, ymm3, ymm5)
-	
+
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vfmaddps(ymm10, ymm1, ymm4, ymm10)
 	vfmaddps(ymm8, ymm1, ymm5, ymm8)
-	
-	
-	
+
+
+
 	dec(rsi) // i -= 1;
 	jne(.SLOOPKITER) // iterate again if i != 0.
-	
-	
-	
-	
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
 	 // else, we prepare to enter k_left loop.
-	
+
 	label(.SLOOPKLEFT) // EDGE LOOP
-	
+
 	prefetch(0, mem(rax, 16*32))
 	vfmaddps(ymm15, ymm0, ymm2, ymm15)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vmovshdup(mem(rbx, 0*32), ymm2)
 	vfmaddps(ymm13, ymm0, ymm3, ymm13)
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
-	
+
 	vmovaps(mem(rax, 1*32), ymm1)
 	add(imm(8*1*4), rax) // a += 8 (1 x mr)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vfmaddps(ymm11, ymm0, ymm4, ymm11)
 	vfmaddps(ymm9, ymm0, ymm5, ymm9)
-	
- 	vfmaddps(ymm14, ymm0, ymm2, ymm14)
+
+	vfmaddps(ymm14, ymm0, ymm2, ymm14)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vmovsldup(mem(rbx, 1*32), ymm2)
 	add(imm(8*1*4), rbx) // b += 8 (1 x nr)
- 	vfmaddps(ymm12, ymm0, ymm3, ymm12)
+	vfmaddps(ymm12, ymm0, ymm3, ymm12)
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
-	
+
 	vpermilps(imm(0x4e), ymm2, ymm3)
- 	vfmaddps(ymm10, ymm0, ymm4, ymm10)
- 	vfmaddps(ymm8, ymm0, ymm5, ymm8)
+	vfmaddps(ymm10, ymm0, ymm4, ymm10)
+	vfmaddps(ymm8, ymm0, ymm5, ymm8)
 	vmovaps(ymm1, ymm0)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jne(.SLOOPKLEFT) // iterate again if i != 0.
-	
-	
+
+
 	label(.SPOSTACCUM)
 	 // ymm15:  ymm13:  ymm11:  ymm9:
 	 // ( ab00  ( ab02  ( ab04  ( ab06
-	 //   ab10    ab12    ab14    ab16  
+	 //   ab10    ab12    ab14    ab16
 	 //   ab22    ab20    ab26    ab24
 	 //   ab32    ab30    ab36    ab34
 	 //   ab44    ab46    ab40    ab42
-	 //   ab54    ab56    ab50    ab52  
+	 //   ab54    ab56    ab50    ab52
 	 //   ab66    ab64    ab62    ab60
 	 //   ab76 )  ab74 )  ab72 )  ab70 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
 	 // ( ab01  ( ab03  ( ab05  ( ab07
-	 //   ab11    ab13    ab15    ab17  
+	 //   ab11    ab13    ab15    ab17
 	 //   ab23    ab21    ab27    ab25
 	 //   ab33    ab31    ab37    ab35
 	 //   ab45    ab47    ab41    ab43
-	 //   ab55    ab57    ab51    ab53  
+	 //   ab55    ab57    ab51    ab53
 	 //   ab67    ab65    ab63    ab61
 	 //   ab77 )  ab75 )  ab73 )  ab71 )
 	GROUP_YMM_BY_4
 	 // ymm15:  ymm13:  ymm11:  ymm9:
 	 // ( ab00  ( ab02  ( ab04  ( ab06
-	 //   ab10    ab12    ab14    ab16  
+	 //   ab10    ab12    ab14    ab16
 	 //   ab20    ab22    ab24    ab26
 	 //   ab30    ab32    ab34    ab36
 	 //   ab44    ab46    ab40    ab42
-	 //   ab54    ab56    ab50    ab52  
+	 //   ab54    ab56    ab50    ab52
 	 //   ab64    ab66    ab60    ab62
 	 //   ab74 )  ab76 )  ab70 )  ab72 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
 	 // ( ab01  ( ab03  ( ab05  ( ab07
-	 //   ab11    ab13    ab15    ab17  
+	 //   ab11    ab13    ab15    ab17
 	 //   ab21    ab23    ab25    ab27
 	 //   ab31    ab33    ab35    ab37
 	 //   ab45    ab47    ab41    ab43
-	 //   ab55    ab57    ab51    ab53  
+	 //   ab55    ab57    ab51    ab53
 	 //   ab65    ab67    ab61    ab63
 	 //   ab75 )  ab77 )  ab71 )  ab73 )
 	 // ymm15:  ymm13:  ymm11:  ymm9:
 	 // ( ab00  ( ab02  ( ab04  ( ab06
-	 //   ab10    ab12    ab14    ab16  
+	 //   ab10    ab12    ab14    ab16
 	 //   ab20    ab22    ab24    ab26
 	 //   ab30    ab32    ab34    ab36
 	 //   ab40    ab42    ab44    ab46
-	 //   ab50    ab52    ab54    ab56  
+	 //   ab50    ab52    ab54    ab56
 	 //   ab60    ab62    ab64    ab66
 	 //   ab70 )  ab72 )  ab74 )  ab76 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
 	 // ( ab01  ( ab03  ( ab05  ( ab07
-	 //   ab11    ab13    ab15    ab17  
+	 //   ab11    ab13    ab15    ab17
 	 //   ab21    ab23    ab25    ab27
 	 //   ab31    ab33    ab35    ab37
 	 //   ab41    ab43    ab45    ab47
-	 //   ab51    ab53    ab55    ab57  
+	 //   ab51    ab53    ab55    ab57
 	 //   ab61    ab63    ab65    ab67
 	 //   ab71 )  ab73 )  ab75 )  ab77 )
-	
+
 	mov(var(alpha), rax) // load address of alpha
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm4) // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm8, ymm8) // scale by alpha
 	vmulps(ymm0, ymm9, ymm9)
 	vmulps(ymm0, ymm10, ymm10)
@@ -360,401 +364,115 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
 	vmulps(ymm0, ymm13, ymm13)
 	vmulps(ymm0, ymm14, ymm14)
 	vmulps(ymm0, ymm15, ymm15)
-	
-	
-	
-	
-	mov(var(rs_c), rsi) // load rs_c
-	lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
-	
-	lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
-	
-	lea(mem(, rsi, 2), r12) // r12 = 2*rs_c;
-	lea(mem(r12, rsi, 1), r13) // r13 = 3*rs_c;
-	
-	
-	 // determine if
-	 //    c    % 32 == 0, AND
-	 //  4*cs_c % 32 == 0, AND
-	 //    rs_c      == 1
-	 // ie: aligned, ldim aligned, and
-	 // column-stored
-	
-	cmp(imm(4), rsi) // set ZF if (4*rs_c) == 4.
-	sete(bl) // bl = ( ZF == 1 ? 1 : 0 );
-	test(imm(31), rcx) // set ZF if c & 32 is zero.
-	setz(bh) // bh = ( ZF == 0 ? 1 : 0 );
-	test(imm(31), rdi) // set ZF if (4*cs_c) & 32 is zero.
-	setz(al) // al = ( ZF == 0 ? 1 : 0 );
-	 // and(bl,bh) followed by
-	 // and(bh,al) will reveal result
-	
+
+
 	 // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
 	vucomiss(xmm0, xmm4) // set ZF if beta == 0.
 	je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case
-	
-	
-	 // check if aligned/column-stored
-	and(bl, bh) // set ZF if bl & bh == 1.
-	and(bh, al) // set ZF if bh & al == 1.
-	jne(.SCOLSTORED) // jump to column storage case
-	
-	
-	label(.SGENSTORED)
-	 // update c00:c70
-	vmovlps(mem(rcx), xmm0, xmm0)
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmovlps(mem(rdx), xmm2, xmm2)
-	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
-	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
-	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
-	vshufps(imm(0x88), xmm3, xmm2, xmm2)
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
-	
-	vfmaddps(ymm15, ymm0, ymm4, ymm0)	// scale by beta and add the gemm result,
-	
-	STORE_SS
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c01:c71
-	vmovlps(mem(rcx), xmm0, xmm0)
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmovlps(mem(rdx), xmm2, xmm2)
-	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
-	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
-	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
-	vshufps(imm(0x88), xmm3, xmm2, xmm2)
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
-	
-//	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-//	vaddps(ymm14, ymm0, ymm0) // add the gemm result,
-	vfmaddps(ymm14, ymm0, ymm4, ymm0)	// scale by beta and add the gemm result,
-	
-	STORE_SS
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c02:c72
-	vmovlps(mem(rcx), xmm0, xmm0)
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmovlps(mem(rdx), xmm2, xmm2)
-	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
-	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
-	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
-	vshufps(imm(0x88), xmm3, xmm2, xmm2)
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
-	
-//	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-//	vaddps(ymm13, ymm0, ymm0) // add the gemm result,
-	vfmaddps(ymm13, ymm0, ymm4, ymm0)	// scale by beta and add the gemm result,
-	
-	STORE_SS
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c03:c73
-	vmovlps(mem(rcx), xmm0, xmm0)
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmovlps(mem(rdx), xmm2, xmm2)
-	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
-	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
-	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
-	vshufps(imm(0x88), xmm3, xmm2, xmm2)
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
-	
-//	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-//	vaddps(ymm12, ymm0, ymm0) // add the gemm result,
-	vfmaddps(ymm12, ymm0, ymm4, ymm0)	// scale by beta and add the gemm result,
-	
-	STORE_SS
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c04:c74
-	vmovlps(mem(rcx), xmm0, xmm0)
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmovlps(mem(rdx), xmm2, xmm2)
-	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
-	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
-	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
-	vshufps(imm(0x88), xmm3, xmm2, xmm2)
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
-	
-//	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-//	vaddps(ymm11, ymm0, ymm0) // add the gemm result,
-	vfmaddps(ymm11, ymm0, ymm4, ymm0)	// scale by beta and add the gemm result,
-	
-	STORE_SS
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c05:c75
-	vmovlps(mem(rcx), xmm0, xmm0)
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmovlps(mem(rdx), xmm2, xmm2)
-	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
-	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
-	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
-	vshufps(imm(0x88), xmm3, xmm2, xmm2)
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
-	
-//	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-//	vaddps(ymm10, ymm0, ymm0) // add the gemm result,
-	vfmaddps(ymm10, ymm0, ymm4, ymm0)	// scale by beta and add the gemm result,
-	
-	STORE_SS
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c06:c76
-	vmovlps(mem(rcx), xmm0, xmm0)
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmovlps(mem(rdx), xmm2, xmm2)
-	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
-	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
-	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
-	vshufps(imm(0x88), xmm3, xmm2, xmm2)
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
-	
-//	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-//	vaddps(ymm9, ymm0, ymm0) // add the gemm result,
-	vfmaddps(ymm9, ymm0, ymm4, ymm0)	// scale by beta and add the gemm result,
-	
-	STORE_SS
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c07:c77
-	vmovlps(mem(rcx), xmm0, xmm0)
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmovlps(mem(rdx), xmm2, xmm2)
-	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
-	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
-	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
-	vshufps(imm(0x88), xmm3, xmm2, xmm2)
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
-	
-//	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-//	vaddps(ymm8, ymm0, ymm0) // add the gemm result,
-	vfmaddps(ymm8, ymm0, ymm4, ymm0)	// scale by beta and add the gemm result,
-	
-	STORE_SS
-	
-	
-	
-	jmp(.SDONE) // jump to end.
-	
-	
-	
-	label(.SCOLSTORED)
-	
-	
-	vmovaps(mem(rcx), ymm0) // load c00:c70,
-//	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-//	vaddps(ymm15, ymm0, ymm0) // add the gemm result,
-	vfmaddps(ymm15, ymm0, ymm4, ymm0)	// scale by beta and add the gemm result,
-	vmovaps(ymm0, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(mem(rcx), ymm1) // load c01:c71,
-//	vmulps(ymm4, ymm1, ymm1) // scale by beta,
-//	vaddps(ymm14, ymm1, ymm1) // add the gemm result,
-	vfmaddps(ymm14, ymm1, ymm4, ymm1)	// scale by beta and add the gemm result,
-	vmovaps(ymm1, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(mem(rcx), ymm0) // load c02:c72,
-//	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-//	vaddps(ymm13, ymm0, ymm0) // add the gemm result,
-	vfmaddps(ymm13, ymm0, ymm4, ymm0)	// scale by beta and add the gemm result,
-	vmovaps(ymm0, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(mem(rcx), ymm1) // load c03:c73,
-//	vmulps(ymm4, ymm1, ymm1) // scale by beta,
-//	vaddps(ymm12, ymm1, ymm1) // add the gemm result,
-	vfmaddps(ymm12, ymm1, ymm4, ymm1)	// scale by beta and add the gemm result,
-	vmovaps(ymm1, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(mem(rcx), ymm0) // load c04:c74,
-//	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-//	vaddps(ymm11, ymm0, ymm0) // add the gemm result,
-	vfmaddps(ymm11, ymm0, ymm4, ymm0)	// scale by beta and add the gemm result,
-	vmovaps(ymm0, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(mem(rcx), ymm1) // load c05:c75,
-//	vmulps(ymm4, ymm1, ymm1) // scale by beta,
-//	vaddps(ymm10, ymm1, ymm1) // add the gemm result,
-	vfmaddps(ymm10, ymm1, ymm4, ymm1)	// scale by beta and add the gemm result,
-	vmovaps(ymm1, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(mem(rcx), ymm0) // load c06:c76,
-//	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-//	vaddps(ymm9, ymm0, ymm0) // add the gemm result,
-	vfmaddps(ymm9, ymm0, ymm4, ymm0)	// scale by beta and add the gemm result,
-	vmovaps(ymm0, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(mem(rcx), ymm1) // load c07:c77,
-//	vmulps(ymm4, ymm1, ymm1) // scale by beta,
-//	vaddps(ymm8, ymm1, ymm1) // add the gemm result,
-	vfmaddps(ymm8, ymm1, ymm4, ymm1)	// scale by beta and add the gemm result,
-	vmovaps(ymm1, mem(rcx)) // and store back to memory.
-	
-	jmp(.SDONE) // jump to end.
-	
-	
+
+		vmovaps(mem(rcx), ymm0) // load c00:c70,
+		//vmulps(ymm4, ymm0, ymm0) // scale by beta,
+		//vaddps(ymm15, ymm0, ymm0) // add the gemm result,
+		vfmaddps(ymm15, ymm0, ymm4, ymm0)	// scale by beta and add the gemm result,
+		vmovaps(ymm0, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(mem(rcx), ymm1) // load c01:c71,
+		//vmulps(ymm4, ymm1, ymm1) // scale by beta,
+		//vaddps(ymm14, ymm1, ymm1) // add the gemm result,
+		vfmaddps(ymm14, ymm1, ymm4, ymm1)	// scale by beta and add the gemm result,
+		vmovaps(ymm1, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(mem(rcx), ymm0) // load c02:c72,
+		//vmulps(ymm4, ymm0, ymm0) // scale by beta,
+		//vaddps(ymm13, ymm0, ymm0) // add the gemm result,
+		vfmaddps(ymm13, ymm0, ymm4, ymm0)	// scale by beta and add the gemm result,
+		vmovaps(ymm0, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(mem(rcx), ymm1) // load c03:c73,
+		//vmulps(ymm4, ymm1, ymm1) // scale by beta,
+		//vaddps(ymm12, ymm1, ymm1) // add the gemm result,
+		vfmaddps(ymm12, ymm1, ymm4, ymm1)	// scale by beta and add the gemm result,
+		vmovaps(ymm1, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(mem(rcx), ymm0) // load c04:c74,
+		//vmulps(ymm4, ymm0, ymm0) // scale by beta,
+		//vaddps(ymm11, ymm0, ymm0) // add the gemm result,
+		vfmaddps(ymm11, ymm0, ymm4, ymm0)	// scale by beta and add the gemm result,
+		vmovaps(ymm0, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(mem(rcx), ymm1) // load c05:c75,
+		//vmulps(ymm4, ymm1, ymm1) // scale by beta,
+		//vaddps(ymm10, ymm1, ymm1) // add the gemm result,
+		vfmaddps(ymm10, ymm1, ymm4, ymm1)	// scale by beta and add the gemm result,
+		vmovaps(ymm1, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(mem(rcx), ymm0) // load c06:c76,
+		//vmulps(ymm4, ymm0, ymm0) // scale by beta,
+		//vaddps(ymm9, ymm0, ymm0) // add the gemm result,
+		vfmaddps(ymm9, ymm0, ymm4, ymm0)	// scale by beta and add the gemm result,
+		vmovaps(ymm0, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(mem(rcx), ymm1) // load c07:c77,
+		//vmulps(ymm4, ymm1, ymm1) // scale by beta,
+		//vaddps(ymm8, ymm1, ymm1) // add the gemm result,
+		vfmaddps(ymm8, ymm1, ymm4, ymm1)	// scale by beta and add the gemm result,
+		vmovaps(ymm1, mem(rcx)) // and store back to memory.
+
+		jmp(.SDONE) // jump to end.
+
 	label(.SBETAZERO)
-	 // check if aligned/column-stored
-	and(bl, bh) // set ZF if bl & bh == 1.
-	and(bh, al) // set ZF if bh & al == 1.
-	jne(.SCOLSTORBZ) // jump to column storage case
-	
-	
-	label(.SGENSTORBZ)
-	 // update c00:c70
-	vmovapd(ymm15, ymm0)
-	STORE_SS
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c01:c71
-	vmovapd(ymm14, ymm0)
-	STORE_SS
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c02:c72
-	vmovapd(ymm13, ymm0)
-	STORE_SS
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c03:c73
-	vmovapd(ymm12, ymm0)
-	STORE_SS
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c04:c74
-	vmovapd(ymm11, ymm0)
-	STORE_SS
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c05:c75
-	vmovapd(ymm10, ymm0)
-	STORE_SS
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c06:c76
-	vmovapd(ymm9, ymm0)
-	STORE_SS
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c07:c77
-	vmovapd(ymm8, ymm0)
-	STORE_SS
-	
-	jmp(.SDONE) // jump to end.
-	
-	
-	label(.SCOLSTORBZ)
-	
-	vmovaps(ymm15, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(ymm14, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(ymm13, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(ymm12, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(ymm11, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(ymm10, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(ymm9, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(ymm8, mem(rcx)) // and store back to memory.
-	
+
+		vmovaps(ymm15, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(ymm14, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(ymm13, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(ymm12, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(ymm11, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(ymm10, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(ymm9, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(ymm8, mem(rcx)) // and store back to memory.
+
 	label(.SDONE)
-	
 
-    end_asm(
+
+	end_asm(
 	: // output operands (none)
 	: // input operands
-      [k_iter]  "m" (k_iter), // 0
-      [k_left]  "m" (k_left), // 1
-      [a]       "m" (a),      // 2
-      [b]       "m" (b),      // 3
-      [alpha]   "m" (alpha),  // 4
-      [beta]    "m" (beta),   // 5
-      [c]       "m" (c),      // 6
-      [rs_c]    "m" (rs_c),   // 7
-      [cs_c]    "m" (cs_c)/*,   // 8
-      [b_next]  "m" (b_next), // 9
-      [a_next]  "m" (a_next)*/  // 10
+	  [k_iter]  "m" (k_iter), // 0
+	  [k_left]  "m" (k_left), // 1
+	  [a]       "m" (a),      // 2
+	  [b]       "m" (b),      // 3
+	  [alpha]   "m" (alpha),  // 4
+	  [beta]    "m" (beta),   // 5
+	  [c]       "m" (c),      // 6
+	  [rs_c]    "m" (rs_c),   // 7
+	  [cs_c]    "m" (cs_c)/*,   // 8
+	  [b_next]  "m" (b_next), // 9
+	  [a_next]  "m" (a_next)*/  // 10
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
@@ -762,6 +480,8 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( s );
 }
 
 #undef KERNEL4x6_1
@@ -862,7 +582,9 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
 
 void bli_dgemm_bulldozer_asm_4x6_fma4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        double*    restrict alpha,
        double*    restrict a,
        double*    restrict b,
@@ -874,66 +596,68 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 12;
-	uint64_t k_left = k0 % 12;
+	uint64_t k_iter = k / 12;
+	uint64_t k_left = k % 12;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT_ANY( d, 4, 6, false );
+
 	begin_asm()
-		
-		
+
+
 		vzeroall()
 		mov(var(b), rbx) // load address of b.
 		mov(var(a), rax) // load address of a.
 		prefetch(0, mem(rax, 64))
-		
-		
+
+
 		vmovaps(mem(rbx, 0*8), xmm1)
 		vmovaps(mem(rbx, 2*8), xmm2)
 		vmovaps(mem(rbx, 4*8), xmm3)
 		add(imm(12*8), rbx)
 		add(imm(8*8), rax)
-		
+
 		mov(var(k_iter), rsi) // i = k_iter; notice var(k_iter) not $0
 		test(rsi, rsi)
 		je(.CONSIDERKLEFT)
-		
+
 		ALIGN32
 		label(.LOOPKITER) // MAIN LOOP
-		
-        KERNEL4x6_1(xx)
-        KERNEL4x6_2(xx)
-        KERNEL4x6_3(xx)
-        KERNEL4x6_4(xx)
-        KERNEL4x6_1(xx)
-        KERNEL4x6_2(xx)
-        KERNEL4x6_3(xx)
-        KERNEL4x6_4(xx)
-        KERNEL4x6_1(xx)
-        KERNEL4x6_2(xx)
-        KERNEL4x6_3(xx)
-        KERNEL4x6_4(xx)
-		
+
+		KERNEL4x6_1(xx)
+		KERNEL4x6_2(xx)
+		KERNEL4x6_3(xx)
+		KERNEL4x6_4(xx)
+		KERNEL4x6_1(xx)
+		KERNEL4x6_2(xx)
+		KERNEL4x6_3(xx)
+		KERNEL4x6_4(xx)
+		KERNEL4x6_1(xx)
+		KERNEL4x6_2(xx)
+		KERNEL4x6_3(xx)
+		KERNEL4x6_4(xx)
+
 		dec(rsi)
 		jne(.LOOPKITER)
-		
+
 		label(.CONSIDERKLEFT)
-		
+
 		mov(var(k_left), rsi)
-		test(rsi, rsi) 
+		test(rsi, rsi)
 		label(.LOOPKLEFT)
 		je(.POSTACCUM)
-		
-        KERNEL4x6_1(xx)
+
+		KERNEL4x6_1(xx)
 		add(imm(6*8), rbx)
 		add(imm(4*8), rax)
-		
+
 		dec(rsi)
 		jmp(.LOOPKLEFT) // iterate again if i != 0.
-		
+
 		label(.POSTACCUM)
-		
-		
+
+
 		mov(var(rs_c), rsi) // load cs_c
 		mov(var(cs_c), rdi) // load rs_c
 		vmovddup(mem(var(alpha)), xmm2) //load alpha
@@ -942,32 +666,32 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
 		sal(imm(3), rsi) // cs_c *= sizeof(double)
 		sal(imm(3), rdi) // rs_c *= sizeof(double)
 		lea(mem(rcx, rdi, 2), rdx)
-		
-		vmovlpd(mem(rcx), xmm0, xmm0) 		
-		vmovlpd(mem(rdx), xmm1, xmm1) 			
+
+		vmovlpd(mem(rcx), xmm0, xmm0)
+		vmovlpd(mem(rdx), xmm1, xmm1)
 		vmovhpd(mem(rcx, rdi, 1), xmm0, xmm0)
 		vmovhpd(mem(rdx, rdi, 1), xmm1, xmm1)
 		lea(mem(rdx, rdi, 2), r8)
 		vmulpd(xmm2, xmm4, xmm4)			// scale by alpha,
 		vmulpd(xmm2, xmm5, xmm5)			// scale by alpha,
 		vfmaddpd(xmm4, xmm0, xmm3, xmm4)	// scale by beta, and add the gemm result
-		vmovlpd(mem(r8), xmm0, xmm0) 			
+		vmovlpd(mem(r8), xmm0, xmm0)
 		vfmaddpd(xmm5, xmm1, xmm3, xmm5)	// scale by beta, and add the gemm result
 		vmovhpd(mem(r8, rdi, 1), xmm0, xmm0)
 		vmovlpd(xmm4, mem(rcx)) 			// and store back to memory.
 		vmovlpd(xmm5, mem(rdx)) 			// and store back to memory.
 		vmovhpd(xmm4, mem(rcx, rdi, 1))
-		add(rsi, rcx) 
+		add(rsi, rcx)
 		vmovhpd(xmm5, mem(rdx, rdi, 1))
-		add(rsi, rdx) 
-		
+		add(rsi, rdx)
+
 		vmulpd(xmm2, xmm6, xmm6)			// scale by alpha,
 		vfmaddpd(xmm6, xmm0, xmm3, xmm6)	// scale by beta, and add the gemm result
 		vmovlpd(xmm6, mem(r8)) 			// and store back to memory.
 		vmovhpd(xmm6, mem(r8, rdi, 1))
-		add(rsi, r8) 
-		
-		
+		add(rsi, r8)
+
+
 		vmovlpd(mem(rcx), xmm0, xmm0)
 		vmovlpd(mem(rdx), xmm1, xmm1)
 		vmovlpd(mem(r8), xmm4, xmm4)
@@ -984,13 +708,13 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
 		vmovlpd(xmm8, mem(rdx)) 			// and store back to memory.
 		vmovlpd(xmm9, mem(r8)) 			// and store back to memory.
 		vmovhpd(xmm7, mem(rcx, rdi, 1))
-		add(rsi, rcx) 
+		add(rsi, rcx)
 		vmovhpd(xmm8, mem(rdx, rdi, 1))
-		add(rsi, rdx) 
+		add(rsi, rdx)
 		vmovhpd(xmm9, mem(r8, rdi, 1))
-		add(rsi, r8) 
-		
-		
+		add(rsi, r8)
+
+
 		vmovlpd(mem(rcx), xmm0, xmm0)
 		vmovlpd(mem(rdx), xmm1, xmm1)
 		vmovlpd(mem(r8), xmm4, xmm4)
@@ -1007,13 +731,13 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
 		vmovlpd(xmm11, mem(rdx)) 			// and store back to memory.
 		vmovlpd(xmm12, mem(r8)) 			// and store back to memory.
 		vmovhpd(xmm10, mem(rcx, rdi, 1))
-		add(rsi, rcx) 
+		add(rsi, rcx)
 		vmovhpd(xmm11, mem(rdx, rdi, 1))
-		add(rsi, rdx) 
+		add(rsi, rdx)
 		vmovhpd(xmm12, mem(r8, rdi, 1))
-		add(rsi, r8) 
-		
-		
+		add(rsi, r8)
+
+
 		vmovlpd(mem(rcx), xmm0, xmm0)
 		vmovlpd(mem(rdx), xmm1, xmm1)
 		vmovlpd(mem(r8), xmm4, xmm4)
@@ -1031,30 +755,32 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
 		vmovlpd(xmm15, mem(r8)) 			// and store back to memory.
 		vmovhpd(xmm13, mem(rcx, rdi, 1))
 		vmovhpd(xmm14, mem(rdx, rdi, 1))
-		vmovhpd(xmm15, mem(r8, rdi, 1)) 
-
-    end_asm(
-		: // output operands (none)
-		: // input operands
-	      [k_iter]  "r" (k_iter), // 0
-	      [k_left]  "r" (k_left), // 1
-	      [a]       "r" (a),      // 2
-	      [b]       "r" (b),      // 3
-	      [alpha]   "r" (alpha),  // 4
-	      [beta]    "r" (beta),   // 5
-	      [c]       "r" (c),      // 6
-	      [rs_c]    "m" (rs_c),   // 7
-	      [cs_c]    "m" (cs_c)/*,   // 8
-	      [b_next]  "m" (b_next), // 9
-	      [a_next]  "m" (a_next)*/  // 10
-		: // register clobber list
-		  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8",
-		  "xmm0", "xmm1", "xmm2", "xmm3",
-		  "xmm4", "xmm5", "xmm6", "xmm7",
-		  "xmm8", "xmm9", "xmm10", "xmm11",
-		  "xmm12", "xmm13", "xmm14", "xmm15",
-		  "memory"
+		vmovhpd(xmm15, mem(r8, rdi, 1))
+
+	end_asm(
+	: // output operands (none)
+	: // input operands
+	  [k_iter]  "r" (k_iter), // 0
+	  [k_left]  "r" (k_left), // 1
+	  [a]       "r" (a),      // 2
+	  [b]       "r" (b),      // 3
+	  [alpha]   "r" (alpha),  // 4
+	  [beta]    "r" (beta),   // 5
+	  [c]       "r" (c),      // 6
+	  [rs_c]    "m" (rs_c),   // 7
+	  [cs_c]    "m" (cs_c)/*,   // 8
+	  [b_next]  "m" (b_next), // 9
+	  [a_next]  "m" (a_next)*/  // 10
+	: // register clobber list
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8",
+	  "xmm0", "xmm1", "xmm2", "xmm3",
+	  "xmm4", "xmm5", "xmm6", "xmm7",
+	  "xmm8", "xmm9", "xmm10", "xmm11",
+	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( d );
 }
 //The parameter "i" is the iteration number, i.e. the B values to read
 #define MADD_TO_YMM(i) \
@@ -1076,7 +802,9 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
 
 void bli_cgemm_bulldozer_asm_8x4_fma4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        scomplex*  restrict alpha,
        scomplex*  restrict a,
        scomplex*  restrict b,
@@ -1091,33 +819,35 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT_ALIGNED( c, 8, 4, false, 32 );
+
 	begin_asm()
-	
+
 	mov(var(a), rax) // load address of a.
 	mov(var(b), rbx) // load address of b.
 	mov(var(b_next), r15) // load address of b_next.
 	//mov(var(a_next), r14) // load address of a_next.
 	sub(imm(4*64), r15)
-	
+
 	vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
 	vmovsldup(mem(rbx, 0*32), ymm2)
 	vpermilps(imm(0x4e), ymm2, ymm3)
-	
+
 	mov(var(c), rcx) // load address of c
 	mov(var(cs_c), rdi) // load cs_c
 	lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex)
 	lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
-	
+
 	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c
 	prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c
 	prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c
-	
+
 	vxorps(ymm8, ymm8, ymm8)
 	vxorps(ymm9, ymm9, ymm9)
 	vxorps(ymm10, ymm10, ymm10)
@@ -1126,343 +856,312 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
 	vxorps(ymm13, ymm13, ymm13)
 	vxorps(ymm14, ymm14, ymm14)
 	vxorps(ymm15, ymm15, ymm15)
-	
+
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.CCONSIDKLEFT) // if i == 0, jump to code that
 	 // contains the k_left loop.
-	
+
 	label(.CLOOPKITER) // MAIN LOOP
-	
+
 	add(imm(4*4*8), r15) // b_next += 4*4 (unroll x nr)
-	
+
 	 // iteration 0
 	prefetch(0, mem(rax, 8*32))
 	vmovaps(mem(rax, 1*32), ymm1)
 	MADD_TO_YMM(0)
-	
+
 	vpermilps(imm(0xb1), ymm1, ymm1)
 	vmulps(ymm0, ymm2, ymm6)
 	vaddsubps(ymm6, ymm15, ymm15)
 	vmulps(ymm0, ymm3, ymm7)
 	vaddsubps(ymm7, ymm13, ymm13)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vmovsldup(mem(rbx, 1*32), ymm2)
 	vmulps(ymm1, ymm3, ymm7)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vaddsubps(ymm6, ymm14, ymm14)
 	vaddsubps(ymm7, ymm12, ymm12)
-	
+
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vmovaps(mem(rax, 2*32), ymm0)
 	vaddsubps(ymm6, ymm11, ymm11)
 	vaddsubps(ymm7, ymm9, ymm9)
-	
+
 	vmulps(ymm1, ymm4, ymm6)
 	vmulps(ymm1, ymm5, ymm7)
 	vaddsubps(ymm6, ymm10, ymm10)
 	vaddsubps(ymm7, ymm8, ymm8)
-	
-	
+
+
 	 // iteration 1
 	prefetch(0, mem(rax, 10*32))
 	vmovaps(mem(rax, 3*32), ymm1)
 	MADD_TO_YMM(1)
-	
+
 	vpermilps(imm(0xb1), ymm1, ymm1)
 	vmulps(ymm0, ymm2, ymm6)
 	vmulps(ymm0, ymm3, ymm7)
 	vaddsubps(ymm6, ymm15, ymm15)
 	vaddsubps(ymm7, ymm13, ymm13)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vmovsldup(mem(rbx, 2*32), ymm2)
 	vmulps(ymm1, ymm3, ymm7)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vaddsubps(ymm6, ymm14, ymm14)
 	vaddsubps(ymm7, ymm12, ymm12)
-	
+
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vmovaps(mem(rax, 4*32), ymm0)
 	vaddsubps(ymm6, ymm11, ymm11)
 	vaddsubps(ymm7, ymm9, ymm9)
-	
+
 	vmulps(ymm1, ymm4, ymm6)
 	vmulps(ymm1, ymm5, ymm7)
 	vaddsubps(ymm6, ymm10, ymm10)
 	vaddsubps(ymm7, ymm8, ymm8)
-	
+
 	 // iteration 2
 	prefetch(0, mem(rax, 12*32))
 	vmovaps(mem(rax, 5*32), ymm1)
 	MADD_TO_YMM(2)
 	prefetch(0, mem(r15, 2*32)) // prefetch b_next[2*4]
-	
+
 	vpermilps(imm(0xb1), ymm1, ymm1)
 	vmulps(ymm0, ymm2, ymm6)
 	vmulps(ymm0, ymm3, ymm7)
 	vaddsubps(ymm6, ymm15, ymm15)
 	vaddsubps(ymm7, ymm13, ymm13)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vmovsldup(mem(rbx, 3*32), ymm2)
 	vmulps(ymm1, ymm3, ymm7)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vaddsubps(ymm6, ymm14, ymm14)
 	vaddsubps(ymm7, ymm12, ymm12)
-	
+
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vmovaps(mem(rax, 6*32), ymm0)
 	vaddsubps(ymm6, ymm11, ymm11)
 	vaddsubps(ymm7, ymm9, ymm9)
-	
+
 	vmulps(ymm1, ymm4, ymm6)
 	vmulps(ymm1, ymm5, ymm7)
 	vaddsubps(ymm6, ymm10, ymm10)
 	vaddsubps(ymm7, ymm8, ymm8)
-	
-	
+
+
 	 // iteration 3
 	prefetch(0, mem(rax, 14*32))
 	vmovaps(mem(rax, 7*32), ymm1)
 	MADD_TO_YMM(3)
-	
+
 	vpermilps(imm(0xb1), ymm1, ymm1)
 	vmulps(ymm0, ymm2, ymm6)
 	vmulps(ymm0, ymm3, ymm7)
 	vaddsubps(ymm6, ymm15, ymm15)
 	vaddsubps(ymm7, ymm13, ymm13)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vmovsldup(mem(rbx, 4*32), ymm2)
 	vmulps(ymm1, ymm3, ymm7)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vaddsubps(ymm6, ymm14, ymm14)
 	vaddsubps(ymm7, ymm12, ymm12)
-	
+
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vmovaps(mem(rax, 8*32), ymm0)
 	vaddsubps(ymm6, ymm11, ymm11)
 	vaddsubps(ymm7, ymm9, ymm9)
-	
+
 	vmulps(ymm1, ymm4, ymm6)
 	vmulps(ymm1, ymm5, ymm7)
 	vaddsubps(ymm6, ymm10, ymm10)
 	vaddsubps(ymm7, ymm8, ymm8)
-	
-	
+
+
 	add(imm(8*4*8), rax) // a += 8*4 (unroll x mr)
 	add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jne(.CLOOPKITER) // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.CCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
 	 // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.CLOOPKLEFT) // EDGE LOOP
-	
+
 	 // iteration 0
 	prefetch(0, mem(rax, 8*32))
 	vmovaps(mem(rax, 1*32), ymm1)
 	MADD_TO_YMM(0)
-	
+
 	vpermilps(imm(0xb1), ymm1, ymm1)
 	vmulps(ymm0, ymm2, ymm6)
 	vmulps(ymm0, ymm3, ymm7)
 	vaddsubps(ymm6, ymm15, ymm15)
 	vaddsubps(ymm7, ymm13, ymm13)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vmovsldup(mem(rbx, 1*32), ymm2)
 	vmulps(ymm1, ymm3, ymm7)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vaddsubps(ymm6, ymm14, ymm14)
 	vaddsubps(ymm7, ymm12, ymm12)
-	
+
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vmovaps(mem(rax, 2*32), ymm0)
 	vaddsubps(ymm6, ymm11, ymm11)
 	vaddsubps(ymm7, ymm9, ymm9)
-	
+
 	vmulps(ymm1, ymm4, ymm6)
 	vmulps(ymm1, ymm5, ymm7)
 	vaddsubps(ymm6, ymm10, ymm10)
 	vaddsubps(ymm7, ymm8, ymm8)
-	
-	
+
+
 	add(imm(8*1*8), rax) // a += 8 (1 x mr)
 	add(imm(4*1*8), rbx) // b += 4 (1 x nr)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jne(.CLOOPKLEFT) // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.CPOSTACCUM)
-	
+
 	 // ymm15:  ymm13:  ymm11:  ymm9:
-	 // ( ab00  ( ab01  ( ab02  ( ab03 
-	 //   ab10    ab11    ab12    ab13 
-	 //   ab21    ab20    ab23    ab22 
-	 //   ab31    ab30    ab33    ab32 
-	 //   ab42    ab43    ab40    ab41 
-	 //   ab52    ab53    ab50    ab51 
-	 //   ab63    ab62    ab61    ab60 
+	 // ( ab00  ( ab01  ( ab02  ( ab03
+	 //   ab10    ab11    ab12    ab13
+	 //   ab21    ab20    ab23    ab22
+	 //   ab31    ab30    ab33    ab32
+	 //   ab42    ab43    ab40    ab41
+	 //   ab52    ab53    ab50    ab51
+	 //   ab63    ab62    ab61    ab60
 	 //   ab73 )  ab72 )  ab71 )  ab70 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
-	 // ( ab80  ( ab81  ( ab82  ( ab83 
-	 //   ab90    ab91    ab92    ab93 
-	 //   aba1    aba0    aba3    aba2 
-	 //   abb1    abb0    abb3    abb2 
-	 //   abc2    abc3    abc0    abc1 
-	 //   abd2    abd3    abd0    abd1 
-	 //   abe3    abe2    abe1    abe0 
+	 // ( ab80  ( ab81  ( ab82  ( ab83
+	 //   ab90    ab91    ab92    ab93
+	 //   aba1    aba0    aba3    aba2
+	 //   abb1    abb0    abb3    abb2
+	 //   abc2    abc3    abc0    abc1
+	 //   abd2    abd3    abd0    abd1
+	 //   abe3    abe2    abe1    abe0
 	 //   abf3    abf2    abf1    abf0 )
 	GROUP_YMM_BY_4
 	 // ymm15:  ymm13:  ymm11:  ymm9:
-	 // ( ab00  ( ab01  ( ab02  ( ab03 
-	 //   ab10    ab11    ab12    ab13 
-	 //   ab20    ab21    ab22    ab23 
-	 //   ab30    ab31    ab32    ab33 
-	 //   ab42    ab43    ab40    ab41 
-	 //   ab52    ab53    ab50    ab51 
-	 //   ab62    ab63    ab60    ab61 
+	 // ( ab00  ( ab01  ( ab02  ( ab03
+	 //   ab10    ab11    ab12    ab13
+	 //   ab20    ab21    ab22    ab23
+	 //   ab30    ab31    ab32    ab33
+	 //   ab42    ab43    ab40    ab41
+	 //   ab52    ab53    ab50    ab51
+	 //   ab62    ab63    ab60    ab61
 	 //   ab72 )  ab73 )  ab70 )  ab71 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
-	 // ( ab80  ( ab81  ( ab82  ( ab83 
-	 //   ab90    ab91    ab92    ab93 
-	 //   aba0    aba1    aba2    aba3 
-	 //   abb0    abb1    abb2    abb3 
-	 //   abc2    abc3    abc0    abc1 
-	 //   abd2    abd3    abd0    abd1 
-	 //   abe2    abe3    abe0    abe1 
+	 // ( ab80  ( ab81  ( ab82  ( ab83
+	 //   ab90    ab91    ab92    ab93
+	 //   aba0    aba1    aba2    aba3
+	 //   abb0    abb1    abb2    abb3
+	 //   abc2    abc3    abc0    abc1
+	 //   abd2    abd3    abd0    abd1
+	 //   abe2    abe3    abe0    abe1
 	 //   abf2 )  abf3 )  abf0 )  abf1 )
-	
+
 	 // ymm15:  ymm13:  ymm11:  ymm9:
-	 // ( ab00  ( ab01  ( ab02  ( ab03 
-	 //   ab10    ab11    ab12    ab13 
-	 //   ab20    ab21    ab22    ab23 
-	 //   ab30    ab31    ab32    ab33 
-	 //   ab40    ab41    ab42    ab43 
-	 //   ab50    ab51    ab52    ab53 
-	 //   ab60    ab61    ab62    ab63 
+	 // ( ab00  ( ab01  ( ab02  ( ab03
+	 //   ab10    ab11    ab12    ab13
+	 //   ab20    ab21    ab22    ab23
+	 //   ab30    ab31    ab32    ab33
+	 //   ab40    ab41    ab42    ab43
+	 //   ab50    ab51    ab52    ab53
+	 //   ab60    ab61    ab62    ab63
 	 //   ab70 )  ab71 )  ab72 )  ab73 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
-	 // ( ab80  ( ab81  ( ab82  ( ab83 
-	 //   ab90    ab91    ab92    ab93 
-	 //   aba0    aba1    aba2    aba3 
-	 //   abb0    abb1    abb2    abb3 
-	 //   abc0    abc1    abc2    abc3 
-	 //   abd0    abd1    abd2    abd3 
-	 //   abe0    abe1    abe2    abe3 
+	 // ( ab80  ( ab81  ( ab82  ( ab83
+	 //   ab90    ab91    ab92    ab93
+	 //   aba0    aba1    aba2    aba3
+	 //   abb0    abb1    abb2    abb3
+	 //   abc0    abc1    abc2    abc3
+	 //   abd0    abd1    abd2    abd3
+	 //   abe0    abe1    abe2    abe3
 	 //   abf0 )  abf1 )  abf2 )  abf3 )
-	
+
 	 // scale by alpha
-	
+
 	mov(var(alpha), rax) // load address of alpha
 	vbroadcastss(mem(rax), ymm7) // load alpha_r and duplicate
 	vbroadcastss(mem(rax, 4), ymm6) // load alpha_i and duplicate
-	
+
 	vpermilps(imm(0xb1), ymm15, ymm3)
 	vmulps(ymm7, ymm15, ymm15)
 	vmulps(ymm6, ymm3, ymm3)
 	vaddsubps(ymm3, ymm15, ymm15)
-	
+
 	vpermilps(imm(0xb1), ymm14, ymm2)
 	vmulps(ymm7, ymm14, ymm14)
 	vmulps(ymm6, ymm2, ymm2)
 	vaddsubps(ymm2, ymm14, ymm14)
-	
+
 	vpermilps(imm(0xb1), ymm13, ymm1)
 	vmulps(ymm7, ymm13, ymm13)
 	vmulps(ymm6, ymm1, ymm1)
 	vaddsubps(ymm1, ymm13, ymm13)
-	
+
 	vpermilps(imm(0xb1), ymm12, ymm0)
 	vmulps(ymm7, ymm12, ymm12)
 	vmulps(ymm6, ymm0, ymm0)
 	vaddsubps(ymm0, ymm12, ymm12)
-	
+
 	vpermilps(imm(0xb1), ymm11, ymm3)
 	vmulps(ymm7, ymm11, ymm11)
 	vmulps(ymm6, ymm3, ymm3)
 	vaddsubps(ymm3, ymm11, ymm11)
-	
+
 	vpermilps(imm(0xb1), ymm10, ymm2)
 	vmulps(ymm7, ymm10, ymm10)
 	vmulps(ymm6, ymm2, ymm2)
 	vaddsubps(ymm2, ymm10, ymm10)
-	
+
 	vpermilps(imm(0xb1), ymm9, ymm1)
 	vmulps(ymm7, ymm9, ymm9)
 	vmulps(ymm6, ymm1, ymm1)
 	vaddsubps(ymm1, ymm9, ymm9)
-	
+
 	vpermilps(imm(0xb1), ymm8, ymm0)
 	vmulps(ymm7, ymm8, ymm8)
 	vmulps(ymm6, ymm0, ymm0)
 	vaddsubps(ymm0, ymm8, ymm8)
-	
-	
-	
-	
+
+
+
+
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastss(mem(rbx), ymm7) // load beta_r and duplicate
 	vbroadcastss(mem(rbx, 4), ymm6) // load beta_i and duplicate
-	
-	
-	
-	
-	
-	
-	
-	mov(var(rs_c), rsi) // load rs_c
-	lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex)
-	
-	lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
-	
-	lea(mem(, rsi, 2), r12) // r12 = 2*rs_c;
-	lea(mem(r12, rsi, 1), r13) // r13 = 3*rs_c;
-	
-	
-	
-	 // determine if
-	 //    c    % 32 == 0, AND
-	 //  8*cs_c % 32 == 0, AND
-	 //    rs_c      == 1
-	 // ie: aligned, ldim aligned, and
-	 // column-stored
-	
-	cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8.
-	sete(bl) // bl = ( ZF == 1 ? 1 : 0 );
-	test(imm(31), rcx) // set ZF if c & 32 is zero.
-	setz(bh) // bh = ( ZF == 0 ? 1 : 0 );
-	test(imm(31), rdi) // set ZF if (8*cs_c) & 32 is zero.
-	setz(al) // al = ( ZF == 0 ? 1 : 0 );
-	 // and(bl,bh) followed by
-	 // and(bh,al) will reveal result
-	
+
+
 	 // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
 	vucomiss(xmm0, xmm7) // set ZF if beta_r == 0.
 	sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 );
@@ -1470,388 +1169,126 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
 	sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 );
 	and(r8b, r9b) // set ZF if r8b & r9b == 1.
 	jne(.CBETAZERO) // if ZF = 0, jump to beta == 0 case
-	
-	
-	 // check if aligned/column-stored
-	and(bl, bh) // set ZF if bl & bh == 1.
-	and(bh, al) // set ZF if bh & al == 1.
-	jne(.CCOLSTORED) // jump to column storage case
-	
-	
-	
-	label(.CGENSTORED)
-	
-	 // update c00:c70
-	
-	vmovlpd(mem(rcx), xmm0, xmm0) // load (c00,10) into xmm0[0:1]
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c20,30) into xmm0[2:3]
-	vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c40,50) into xmm2[0:1]
-	vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c60,70) into xmm2[2:3]
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm0, mem(rcx)) // store (c00,c10)
-	vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c20,c30)
-	vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c40,c50)
-	vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c60,c70)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c80:cf0
-	
-	vmovlpd(mem(rdx), xmm0, xmm0) // load (c80,90) into xmm0[0:1]
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca0,b0) into xmm0[2:3]
-	vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc0,d0) into xmm2[0:1]
-	vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce0,f0) into xmm2[2:3]
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm0, mem(rdx)) // store (c80,c90)
-	vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca0,cb0)
-	vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc0,cd0)
-	vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce0,cf0)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c01:c71
-	
-	vmovlpd(mem(rcx), xmm0, xmm0) // load (c01,11) into xmm0[0:1]
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c21,31) into xmm0[2:3]
-	vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c41,51) into xmm2[0:1]
-	vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c61,71) into xmm2[2:3]
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm0, mem(rcx)) // store (c01,c11)
-	vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c21,c31)
-	vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c41,c51)
-	vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c61,c71)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c81:cf1
-	
-	vmovlpd(mem(rdx), xmm0, xmm0) // load (c81,91) into xmm0[0:1]
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca1,b1) into xmm0[2:3]
-	vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc1,d1) into xmm2[0:1]
-	vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce1,f1) into xmm2[2:3]
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm0, mem(rdx)) // store (c81,c91)
-	vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca1,cb1)
-	vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc1,cd1)
-	vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce1,cf1)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c02:c72
-	
-	vmovlpd(mem(rcx), xmm0, xmm0) // load (c02,12) into xmm0[0:1]
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c22,32) into xmm0[2:3]
-	vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c42,52) into xmm2[0:1]
-	vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c62,72) into xmm2[2:3]
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm0, mem(rcx)) // store (c02,c12)
-	vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c22,c32)
-	vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c42,c52)
-	vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c62,c72)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c82:cf2
-	
-	vmovlpd(mem(rdx), xmm0, xmm0) // load (c82,92) into xmm0[0:1]
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca2,b2) into xmm0[2:3]
-	vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc2,d2) into xmm2[0:1]
-	vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce2,f2) into xmm2[2:3]
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm0, mem(rdx)) // store (c82,c92)
-	vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca2,cb2)
-	vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc2,cd2)
-	vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce2,cf2)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c03:c73
-	
-	vmovlpd(mem(rcx), xmm0, xmm0) // load (c03,13) into xmm0[0:1]
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c23,33) into xmm0[2:3]
-	vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c43,53) into xmm2[0:1]
-	vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c63,73) into xmm2[2:3]
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm0, mem(rcx)) // store (c03,c13)
-	vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c23,c33)
-	vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c43,c53)
-	vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c63,c73)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c83:cf3
-	
-	vmovlpd(mem(rdx), xmm0, xmm0) // load (c83,93) into xmm0[0:1]
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca3,b3) into xmm0[2:3]
-	vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc3,d3) into xmm2[0:1]
-	vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce3,f3) into xmm2[2:3]
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm0, mem(rdx)) // store (c83,c93)
-	vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca3,cb3)
-	vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc3,cd3)
-	vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce3,cf3)
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	
-	jmp(.CDONE) // jump to end.
-	
-	
-	
-	label(.CCOLSTORED)
-	
-	 // update c00:c70
-	
-	vmovaps(mem(rcx), ymm0) // load c00:c70 into ymm0
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0
-	vmovaps(ymm0, mem(rcx)) // store c00:c70
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c80:cf0
-	
-	vmovaps(mem(rdx), ymm0) // load c80:f0 into ymm0
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0
-	vmovaps(ymm0, mem(rdx)) // store c80:cf0
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c00:c70
-	
-	vmovaps(mem(rcx), ymm0) // load c01:c71 into ymm0
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0
-	vmovaps(ymm0, mem(rcx)) // store c01:c71
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c81:cf1
-	
-	vmovaps(mem(rdx), ymm0) // load c81:f1 into ymm0
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0
-	vmovaps(ymm0, mem(rdx)) // store c81:cf1
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c02:c72
-	vmovaps(mem(rcx), ymm0) // load c02:c72 into ymm0
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0
-	vmovaps(ymm0, mem(rcx)) // store c02:c72
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c82:cf2
-	vmovaps(mem(rdx), ymm0) // load c82:f2 into ymm0
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0
-	vmovaps(ymm0, mem(rdx)) // store c82:cf2
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c03:c73
-	vmovaps(mem(rcx), ymm0) // load c03:c73 into ymm0
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0
-	vmovaps(ymm0, mem(rcx)) // store c03:c73
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c83:cf3
-	vmovaps(mem(rdx), ymm0) // load c83:f3 into ymm0
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0
-	vmovaps(ymm0, mem(rdx)) // store c83:cf3
-	add(rdi, rdx) // c += cs_c;
-	
-	jmp(.CDONE) // jump to end.
-	
-	
+
+		 // update c00:c70
+
+		vmovaps(mem(rcx), ymm0) // load c00:c70 into ymm0
+		vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+		vmulps(ymm7, ymm0, ymm0)
+		vmulps(ymm6, ymm2, ymm2)
+		vaddsubps(ymm2, ymm0, ymm0)
+		vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0
+		vmovaps(ymm0, mem(rcx)) // store c00:c70
+
+		 // update c80:cf0
+
+		vmovaps(mem(rcx,32), ymm0) // load c80:f0 into ymm0
+		vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+		vmulps(ymm7, ymm0, ymm0)
+		vmulps(ymm6, ymm2, ymm2)
+		vaddsubps(ymm2, ymm0, ymm0)
+		vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0
+		vmovaps(ymm0, mem(rcx,32)) // store c80:cf0
+		add(rdi, rcx) // c += cs_c;
+
+		 // update c01:c71
+
+		vmovaps(mem(rcx), ymm0) // load c01:c71 into ymm0
+		vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+		vmulps(ymm7, ymm0, ymm0)
+		vmulps(ymm6, ymm2, ymm2)
+		vaddsubps(ymm2, ymm0, ymm0)
+		vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0
+		vmovaps(ymm0, mem(rcx)) // store c01:c71
+
+		 // update c81:cf1
+
+		vmovaps(mem(rcx,32), ymm0) // load c81:f1 into ymm0
+		vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+		vmulps(ymm7, ymm0, ymm0)
+		vmulps(ymm6, ymm2, ymm2)
+		vaddsubps(ymm2, ymm0, ymm0)
+		vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0
+		vmovaps(ymm0, mem(rcx,32)) // store c81:cf1
+		add(rdi, rcx) // c += cs_c;
+
+		 // update c02:c72
+		vmovaps(mem(rcx), ymm0) // load c02:c72 into ymm0
+		vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+		vmulps(ymm7, ymm0, ymm0)
+		vmulps(ymm6, ymm2, ymm2)
+		vaddsubps(ymm2, ymm0, ymm0)
+		vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0
+		vmovaps(ymm0, mem(rcx)) // store c02:c72
+
+		 // update c82:cf2
+		vmovaps(mem(rcx,32), ymm0) // load c82:f2 into ymm0
+		vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+		vmulps(ymm7, ymm0, ymm0)
+		vmulps(ymm6, ymm2, ymm2)
+		vaddsubps(ymm2, ymm0, ymm0)
+		vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0
+		vmovaps(ymm0, mem(rcx,32)) // store c82:cf2
+		add(rdi, rcx) // c += cs_c;
+
+		 // update c03:c73
+		vmovaps(mem(rcx), ymm0) // load c03:c73 into ymm0
+		vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+		vmulps(ymm7, ymm0, ymm0)
+		vmulps(ymm6, ymm2, ymm2)
+		vaddsubps(ymm2, ymm0, ymm0)
+		vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0
+		vmovaps(ymm0, mem(rcx)) // store c03:c73
+
+		 // update c83:cf3
+		vmovaps(mem(rcx,32), ymm0) // load c83:f3 into ymm0
+		vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+		vmulps(ymm7, ymm0, ymm0)
+		vmulps(ymm6, ymm2, ymm2)
+		vaddsubps(ymm2, ymm0, ymm0)
+		vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0
+		vmovaps(ymm0, mem(rcx,32)) // store c83:cf3
+		//add(rdi, rcx) // c += cs_c;
+
+		jmp(.CDONE) // jump to end.
+
 	label(.CBETAZERO)
-	 // check if aligned/column-stored
-	 // check if aligned/column-stored
-	and(bl, bh) // set ZF if bl & bh == 1.
-	and(bh, al) // set ZF if bh & al == 1.
-	jne(.CCOLSTORBZ) // jump to column storage case
-	
-	
-	label(.CGENSTORBZ)
-	 // update c00:c70
-	vextractf128(imm(1), ymm15, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm15, mem(rcx)) // store (c00,c10)
-	vmovhpd(xmm15, mem(rcx, rsi, 1)) // store (c20,c30)
-	vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c40,c50)
-	vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c60,c70)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c80:cf0
-	vextractf128(imm(1), ymm14, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm14, mem(rdx)) // store (c80,c90)
-	vmovhpd(xmm14, mem(rdx, rsi, 1)) // store (ca0,cb0)
-	vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc0,cd0)
-	vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce0,cf0)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c01:c71
-	vextractf128(imm(1), ymm13, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm13, mem(rcx)) // store (c01,c11)
-	vmovhpd(xmm13, mem(rcx, rsi, 1)) // store (c21,c31)
-	vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c41,c51)
-	vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c61,c71)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c81:cf1
-	vextractf128(imm(1), ymm12, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm12, mem(rdx)) // store (c81,c91)
-	vmovhpd(xmm12, mem(rdx, rsi, 1)) // store (ca1,cb1)
-	vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc1,cd1)
-	vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce1,cf1)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c02:c72
-	vextractf128(imm(1), ymm11, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm11, mem(rcx)) // store (c02,c12)
-	vmovhpd(xmm11, mem(rcx, rsi, 1)) // store (c22,c32)
-	vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c42,c52)
-	vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c62,c72)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c82:cf2
-	vextractf128(imm(1), ymm10, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm10, mem(rdx)) // store (c82,c92)
-	vmovhpd(xmm10, mem(rdx, rsi, 1)) // store (ca2,cb2)
-	vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc2,cd2)
-	vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce2,cf2)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c03:c73
-	vextractf128(imm(1), ymm9, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm9, mem(rcx)) // store (c03,c13)
-	vmovhpd(xmm9, mem(rcx, rsi, 1)) // store (c23,c33)
-	vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c43,c53)
-	vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c63,c73)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c83:cf3
-	vextractf128(imm(1), ymm8, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm8, mem(rdx)) // store (c83,c93)
-	vmovhpd(xmm8, mem(rdx, rsi, 1)) // store (ca3,cb3)
-	vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc3,cd3)
-	vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce3,cf3)
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	jmp(.CDONE) // jump to end.
-	
-	
-	label(.CCOLSTORBZ)
-	
-	vmovaps(ymm15, mem(rcx)) // store c00:c70
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(ymm14, mem(rdx)) // store c80:cf0
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovaps(ymm13, mem(rcx)) // store c01:c71
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(ymm12, mem(rdx)) // store c81:cf1
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovaps(ymm11, mem(rcx)) // store c02:c72
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(ymm10, mem(rdx)) // store c82:cf2
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovaps(ymm9, mem(rcx)) // store c03:c73
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovaps(ymm8, mem(rdx)) // store c83:cf3
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	
+
+		vmovaps(ymm15, mem(rcx)) // store c00:c70
+		vmovaps(ymm14, mem(rcx,32)) // store c80:cf0
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(ymm13, mem(rcx)) // store c01:c71
+		vmovaps(ymm12, mem(rcx,32)) // store c81:cf1
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(ymm11, mem(rcx)) // store c02:c72
+		vmovaps(ymm10, mem(rcx,32)) // store c82:cf2
+		add(rdi, rcx) // c += cs_c;
+
+		vmovaps(ymm9, mem(rcx)) // store c03:c73
+		vmovaps(ymm8, mem(rcx,32)) // store c83:cf3
+		add(rdi, rcx) // c += cs_c;
+
 	label(.CDONE)
-	
 
-    end_asm(
+
+	end_asm(
 	: // output operands (none)
 	: // input operands
-      [k_iter]  "m" (k_iter), // 0
-      [k_left]  "m" (k_left), // 1
-      [a]       "m" (a),      // 2
-      [b]       "m" (b),      // 3
-      [alpha]   "m" (alpha),  // 4
-      [beta]    "m" (beta),   // 5
-      [c]       "m" (c),      // 6
-      [rs_c]    "m" (rs_c),   // 7
-      [cs_c]    "m" (cs_c),   // 8
-      [b_next]  "m" (b_next)/*, // 9
-      [a_next]  "m" (a_next)*/  // 10
+	  [k_iter]  "m" (k_iter), // 0
+	  [k_left]  "m" (k_left), // 1
+	  [a]       "m" (a),      // 2
+	  [b]       "m" (b),      // 3
+	  [alpha]   "m" (alpha),  // 4
+	  [beta]    "m" (beta),   // 5
+	  [c]       "m" (c),      // 6
+	  [rs_c]    "m" (rs_c),   // 7
+	  [cs_c]    "m" (cs_c),   // 8
+	  [b_next]  "m" (b_next)/*, // 9
+	  [a_next]  "m" (a_next)*/  // 10
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "ymm0", "ymm1", "ymm2", "ymm3",
 	  "ymm4", "ymm5", "ymm6", "ymm7",
@@ -1859,6 +1296,8 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
 	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( c );
 }
 
 #define MADDSUBPD_TO_YMM \
@@ -1883,11 +1322,13 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
 	vmulpd(ymm7,  ymm(i), ymm(i))\
 	vmulpd(ymm6,  ymm(j),  ymm(j))\
 	vaddsubpd(ymm(j),  ymm(i), ymm(i))\
-	
+
 
 void bli_zgemm_bulldozer_asm_4x4_fma4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        dcomplex*  restrict alpha,
        dcomplex*  restrict a,
        dcomplex*  restrict b,
@@ -1902,34 +1343,36 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT_ALIGNED( z, 4, 4, false, 32 );
+
 	begin_asm()
-	
-	
+
+
 	mov(var(a), rax) // load address of a.
 	mov(var(b), rbx) // load address of b.
 	//mov(var(b_next), r15) // load address of b_next.
 	//mov(var(a_next), r14) // load address of a_next.
-	
+
 	vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
 	vmovddup(mem(rbx, 0+0*32), ymm2)
 	vmovddup(mem(rbx, 0+1*32), ymm3)
-	
+
 	mov(var(c), rcx) // load address of c
 	mov(var(cs_c), rdi) // load cs_c
 	lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex)
 	lea(mem(, rdi, 2), rdi)
 	lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
-	
+
 	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c
 	prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c
 	prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c
-	
+
 	vxorpd(ymm8, ymm8, ymm8)
 	vxorpd(ymm9, ymm9, ymm9)
 	vxorpd(ymm10, ymm10, ymm10)
@@ -1938,28 +1381,28 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
 	vxorpd(ymm13, ymm13, ymm13)
 	vxorpd(ymm14, ymm14, ymm14)
 	vxorpd(ymm15, ymm15, ymm15)
-	
-	
+
+
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.ZCONSIDKLEFT) // if i == 0, jump to code that
 	 // contains the k_left loop.
-	
+
 	label(.ZLOOPKITER) // MAIN LOOP
-	
+
 	 // iteration 0
 	vmovapd(mem(rax, 1*32), ymm1)
 	vfmaddpd(ymm15, ymm0, ymm2, ymm15)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vfmaddpd(ymm11, ymm0, ymm3, ymm11)
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
-	
+
 	prefetch(0, mem(rax, 16*32))
 	vfmaddpd(ymm14, ymm1, ymm2, ymm14)
 	vmovddup(mem(rbx, 8+0*32), ymm2)
 	vfmaddpd(ymm10, ymm1, ymm3, ymm10)
 	vmovddup(mem(rbx, 8+1*32), ymm3)
-	
+
 	MADDSUBPD_TO_YMM
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovddup(mem(rbx, 0+2*32), ymm2)
@@ -1967,31 +1410,31 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
 	vmovddup(mem(rbx, 0+3*32), ymm3)
 	vaddsubpd(ymm6, ymm14, ymm14)
 	vaddsubpd(ymm7, ymm10, ymm10)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vmovapd(mem(rax, 2*32), ymm0)
 	vaddsubpd(ymm6, ymm13, ymm13)
 	vaddsubpd(ymm7, ymm9, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vmulpd(ymm1, ymm5, ymm7)
 	vaddsubpd(ymm6, ymm12, ymm12)
 	vaddsubpd(ymm7, ymm8, ymm8)
-	
+
 	 // iteration 1
 	vmovapd(mem(rax, 3*32), ymm1)
 	vfmaddpd(ymm15, ymm0, ymm2, ymm15)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vfmaddpd(ymm11, ymm0, ymm3, ymm11)
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
-	
+
 	prefetch(0, mem(rax, 18*32))
 	vfmaddpd(ymm14, ymm1, ymm2, ymm14)
 	vmovddup(mem(rbx, 8+2*32), ymm2)
 	vfmaddpd(ymm10, ymm1, ymm3, ymm10)
 	vmovddup(mem(rbx, 8+3*32), ymm3)
-	
+
 	MADDSUBPD_TO_YMM
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovddup(mem(rbx, 0+4*32), ymm2)
@@ -1999,31 +1442,31 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
 	vmovddup(mem(rbx, 0+5*32), ymm3)
 	vaddsubpd(ymm6, ymm14, ymm14)
 	vaddsubpd(ymm7, ymm10, ymm10)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vmovapd(mem(rax, 4*32), ymm0)
 	vaddsubpd(ymm6, ymm13, ymm13)
 	vaddsubpd(ymm7, ymm9, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vmulpd(ymm1, ymm5, ymm7)
 	vaddsubpd(ymm6, ymm12, ymm12)
 	vaddsubpd(ymm7, ymm8, ymm8)
-	
+
 	 // iteration 2
 	vmovapd(mem(rax, 5*32), ymm1)
 	vfmaddpd(ymm15, ymm0, ymm2, ymm15)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vfmaddpd(ymm11, ymm0, ymm3, ymm11)
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
-	
+
 	prefetch(0, mem(rax, 20*32))
 	vfmaddpd(ymm14, ymm1, ymm2, ymm14)
 	vmovddup(mem(rbx, 8+4*32), ymm2)
 	vfmaddpd(ymm10, ymm1, ymm3, ymm10)
 	vmovddup(mem(rbx, 8+5*32), ymm3)
-	
+
 	MADDSUBPD_TO_YMM
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovddup(mem(rbx, 0+6*32), ymm2)
@@ -2031,31 +1474,31 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
 	vmovddup(mem(rbx, 0+7*32), ymm3)
 	vaddsubpd(ymm6, ymm14, ymm14)
 	vaddsubpd(ymm7, ymm10, ymm10)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vmovapd(mem(rax, 6*32), ymm0)
 	vaddsubpd(ymm6, ymm13, ymm13)
 	vaddsubpd(ymm7, ymm9, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vmulpd(ymm1, ymm5, ymm7)
 	vaddsubpd(ymm6, ymm12, ymm12)
 	vaddsubpd(ymm7, ymm8, ymm8)
-	
+
 	 // iteration 3
 	vmovapd(mem(rax, 7*32), ymm1)
 	vfmaddpd(ymm15, ymm0, ymm2, ymm15)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vfmaddpd(ymm11, ymm0, ymm3, ymm11)
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
-	
+
 	prefetch(0, mem(rax, 22*32))
 	vfmaddpd(ymm14, ymm1, ymm2, ymm14)
 	vmovddup(mem(rbx, 8+6*32), ymm2)
 	vfmaddpd(ymm10, ymm1, ymm3, ymm10)
 	vmovddup(mem(rbx, 8+7*32), ymm3)
-	
+
 	MADDSUBPD_TO_YMM
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovddup(mem(rbx, 0+8*32), ymm2)
@@ -2063,48 +1506,48 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
 	vmovddup(mem(rbx, 0+9*32), ymm3)
 	vaddsubpd(ymm6, ymm14, ymm14)
 	vaddsubpd(ymm7, ymm10, ymm10)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vmovapd(mem(rax, 8*32), ymm0)
 	vaddsubpd(ymm6, ymm13, ymm13)
 	vaddsubpd(ymm7, ymm9, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vmulpd(ymm1, ymm5, ymm7)
 	vaddsubpd(ymm6, ymm12, ymm12)
 	vaddsubpd(ymm7, ymm8, ymm8)
-	
+
 	add(imm(4*4*16), rbx) // b += 4*4 (unroll x nr)
 	add(imm(4*4*16), rax) // a += 4*4 (unroll x mr)
-	
+
 	dec(rsi) // i -= 1;
 	jne(.ZLOOPKITER) // iterate again if i != 0.
-	
-	
+
+
 	label(.ZCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
 	 // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.ZLOOPKLEFT) // EDGE LOOP
-	
+
 	 // iteration 0
 	vmovapd(mem(rax, 1*32), ymm1)
 	vfmaddpd(ymm15, ymm0, ymm2, ymm15)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vfmaddpd(ymm11, ymm0, ymm3, ymm11)
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
-	
+
 	prefetch(0, mem(rax, 16*32))
 	vfmaddpd(ymm14, ymm1, ymm2, ymm14)
 	vmovddup(mem(rbx, 8+0*32), ymm2)
 	vfmaddpd(ymm10, ymm1, ymm3, ymm10)
 	vmovddup(mem(rbx, 8+1*32), ymm3)
-	
+
 	MADDSUBPD_TO_YMM
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovddup(mem(rbx, 0+2*32), ymm2)
@@ -2112,75 +1555,75 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
 	vmovddup(mem(rbx, 0+3*32), ymm3)
 	vaddsubpd(ymm6, ymm14, ymm14)
 	vaddsubpd(ymm7, ymm10, ymm10)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vmovapd(mem(rax, 2*32), ymm0)
 	vaddsubpd(ymm6, ymm13, ymm13)
 	vaddsubpd(ymm7, ymm9, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vmulpd(ymm1, ymm5, ymm7)
 	vaddsubpd(ymm6, ymm12, ymm12)
 	vaddsubpd(ymm7, ymm8, ymm8)
-	
-	
+
+
 	add(imm(4*1*16), rax) // a += 4 (1 x mr)
 	add(imm(4*1*16), rbx) // b += 4 (1 x nr)
-	
+
 	dec(rsi) // i -= 1;
 	jne(.ZLOOPKLEFT) // iterate again if i != 0.
-	
-	
+
+
 	label(.ZPOSTACCUM)
 	 // ymm15:  ymm13:  ymm11:  ymm9:
 	 // ( ab00  ( ab01  ( ab02  ( ab03
-	 //   ab10    ab11    ab12    ab13  
+	 //   ab10    ab11    ab12    ab13
 	 //   ab21    ab20    ab23    ab22
 	 //   ab31 )  ab30 )  ab33 )  ab32 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
 	 // ( ab40  ( ab41  ( ab42  ( ab43
-	 //   ab50    ab51    ab52    ab53  
+	 //   ab50    ab51    ab52    ab53
 	 //   ab61    ab60    ab63    ab62
 	 //   ab71 )  ab70 )  ab73 )  ab72 )
-	
+
 	vmovapd(ymm15, ymm7)
 	vperm2f128(imm(0x12), ymm15, ymm13, ymm15)
 	vperm2f128(imm(0x30), ymm7, ymm13, ymm13)
-	
+
 	vmovapd(ymm11, ymm7)
 	vperm2f128(imm(0x12), ymm11, ymm9, ymm11)
 	vperm2f128(imm(0x30), ymm7, ymm9, ymm9)
-	
+
 	vmovapd(ymm14, ymm7)
 	vperm2f128(imm(0x12), ymm14, ymm12, ymm14)
 	vperm2f128(imm(0x30), ymm7, ymm12, ymm12)
-	
+
 	vmovapd(ymm10, ymm7)
 	vperm2f128(imm(0x12), ymm10, ymm8, ymm10)
 	vperm2f128(imm(0x30), ymm7, ymm8, ymm8)
-	
-	
+
+
 	 // ymm15:  ymm13:  ymm11:  ymm9:
 	 // ( ab00  ( ab01  ( ab02  ( ab03
-	 //   ab10    ab11    ab12    ab13  
+	 //   ab10    ab11    ab12    ab13
 	 //   ab20    ab21    ab22    ab23
 	 //   ab30 )  ab31 )  ab32 )  ab33 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
 	 // ( ab40  ( ab41  ( ab42  ( ab43
-	 //   ab50    ab51    ab52    ab53  
+	 //   ab50    ab51    ab52    ab53
 	 //   ab60    ab61    ab62    ab63
 	 //   ab70 )  ab71 )  ab72 )  ab73 )
-	
-	
+
+
 	 // scale by alpha
-	
+
 	mov(var(alpha), rax) // load address of alpha
 	vbroadcastsd(mem(rax), ymm7) // load alpha_r and duplicate
 	vbroadcastsd(mem(rax, 8), ymm6) // load alpha_i and duplicate
-	
+
 	Z_ALPHA(15, 3)
 	Z_ALPHA(14, 2)
 	Z_ALPHA(13, 1)
@@ -2190,38 +1633,14 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
 	Z_ALPHA(10, 2)
 	Z_ALPHA(9, 1)
 	Z_ALPHA(8, 0)
-	
+
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastsd(mem(rbx), ymm7) // load beta_r and duplicate
 	vbroadcastsd(mem(rbx, 8), ymm6) // load beta_i and duplicate
-	
-	
-	
-	mov(var(rs_c), rsi) // load rs_c
-	lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex)
-	lea(mem(, rsi, 2), rsi)
-	lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c;
-	
-	
-	
-	 // determine if
-	 //    c    % 32 == 0, AND
-	 // 16*cs_c % 32 == 0, AND
-	 //    rs_c      == 1
-	 // ie: aligned, ldim aligned, and
-	 // column-stored
-	
-	cmp(imm(16), rsi) // set ZF if (16*rs_c) == 16.
-	sete(bl) // bl = ( ZF == 1 ? 1 : 0 );
-	test(imm(31), rcx) // set ZF if c & 32 is zero.
-	setz(bh) // bh = ( ZF == 0 ? 1 : 0 );
-	test(imm(31), rdi) // set ZF if (16*cs_c) & 32 is zero.
-	setz(al) // al = ( ZF == 0 ? 1 : 0 );
-	 // and(bl,bh) followed by
-	 // and(bh,al) will reveal result
-	
+
+
 	 // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
 	vucomisd(xmm0, xmm7) // set ZF if beta_r == 0.
 	sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 );
@@ -2229,287 +1648,91 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
 	sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 );
 	and(r8b, r9b) // set ZF if r8b & r9b == 1.
 	jne(.ZBETAZERO) // if ZF = 0, jump to beta == 0 case
-	
-	
-	 // check if aligned/column-stored
-	and(bl, bh) // set ZF if bl & bh == 1.
-	and(bh, al) // set ZF if bh & al == 1.
-	jne(.ZCOLSTORED) // jump to column storage case
-	
-	
-	
-	label(.ZGENSTORED)
-	 // update c00:c30
-	
-	vmovupd(mem(rcx), xmm0) // load (c00,c10) into xmm0
-	vmovupd(mem(rcx, rsi, 1), xmm2) // load (c20,c30) into xmm2
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
-	Z_ALPHA(0, 2)									// scale ymm0 by beta
-	vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
-	vmovupd(xmm0, mem(rcx)) // store (c00,c10)
-	vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c20,c30)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c40:c70
-	
-	vmovupd(mem(rdx), xmm0) // load (c40,c50) into xmm0
-	vmovupd(mem(rdx, rsi, 1), xmm2) // load (c60,c70) into xmm2
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
-	Z_ALPHA(0, 2)									// scale ymm0 by beta
-	vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
-	vmovupd(xmm0, mem(rdx)) // store (c40,c50)
-	vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c60,c70)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c01:c31
-	
-	vmovupd(mem(rcx), xmm0) // load (c01,c11) into xmm0
-	vmovupd(mem(rcx, rsi, 1), xmm2) // load (c21,c31) into xmm2
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
-	Z_ALPHA(0, 2)									// scale ymm0 by beta
-	vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
-	vmovupd(xmm0, mem(rcx)) // store (c01,c11)
-	vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c21,c31)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c41:c71
-	
-	vmovupd(mem(rdx), xmm0) // load (c41,c51) into xmm0
-	vmovupd(mem(rdx, rsi, 1), xmm2) // load (c61,c71) into xmm2
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
-	Z_ALPHA(0, 2)									// scale ymm0 by beta
-	vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
-	vmovupd(xmm0, mem(rdx)) // store (c41,c51)
-	vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c61,c71)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c02:c32
-	
-	vmovupd(mem(rcx), xmm0) // load (c02,c12) into xmm0
-	vmovupd(mem(rcx, rsi, 1), xmm2) // load (c22,c32) into xmm2
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
-	Z_ALPHA(0, 2)									// scale ymm0 by beta
-	vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
-	vmovupd(xmm0, mem(rcx)) // store (c02,c12)
-	vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c22,c32)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c42:c72
-	
-	vmovupd(mem(rdx), xmm0) // load (c42,c52) into xmm0
-	vmovupd(mem(rdx, rsi, 1), xmm2) // load (c62,c72) into xmm2
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
-	Z_ALPHA(0, 2)									// scale ymm0 by beta
-	vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
-	vmovupd(xmm0, mem(rdx)) // store (c42,c52)
-	vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c62,c72)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c03:c33
-	
-	vmovupd(mem(rcx), xmm0) // load (c03,c13) into xmm0
-	vmovupd(mem(rcx, rsi, 1), xmm2) // load (c23,c33) into xmm2
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
-	Z_ALPHA(0, 2)									// scale ymm0 by beta
-	vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
-	vmovupd(xmm0, mem(rcx)) // store (c03,c13)
-	vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c23,c33)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c43:c73
-	
-	vmovupd(mem(rdx), xmm0) // load (c43,c53) into xmm0
-	vmovupd(mem(rdx, rsi, 1), xmm2) // load (c63,c73) into xmm2
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
-	Z_ALPHA(0, 2)									// scale ymm0 by beta
-	vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
-	vmovupd(xmm0, mem(rdx)) // store (c43,c53)
-	vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c63,c73)
-	
-	
-	
-	jmp(.ZDONE) // jump to end.
-	
-	
-	
-	label(.ZCOLSTORED)
-	 // update c00:c30
-	
-	vmovapd(mem(rcx), ymm0) // load c00:c30 into ymm0
-	Z_ALPHA(0, 2)									// scale ymm0 by beta
-	vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0
-	vmovapd(ymm0, mem(rcx)) // store c00:c30
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c40:c70
-	
-	vmovapd(mem(rdx), ymm0) // load c40:c70 into ymm0
-	Z_ALPHA(0, 2)									// scale ymm0 by beta
-	vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0
-	vmovapd(ymm0, mem(rdx)) // store c40:c70
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c01:c31
-	
-	vmovapd(mem(rcx), ymm0) // load c01:c31 into ymm0
-	Z_ALPHA(0, 2)									// scale ymm0 by beta
-	vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0
-	vmovapd(ymm0, mem(rcx)) // store c01:c31
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c41:c71
-	
-	vmovapd(mem(rdx), ymm0) // load c41:c71 into ymm0
-	Z_ALPHA(0, 2)									// scale ymm0 by beta
-	vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0
-	vmovapd(ymm0, mem(rdx)) // store c41:c71
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c02:c32
-	
-	vmovapd(mem(rcx), ymm0) // load c02:c32 into ymm0
-	Z_ALPHA(0, 2)									// scale ymm0 by beta
-	vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0
-	vmovapd(ymm0, mem(rcx)) // store c02:c32
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c42:c72
-	
-	vmovapd(mem(rdx), ymm0) // load c42:c72 into ymm0
-	Z_ALPHA(0, 2)									// scale ymm0 by beta
-	vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0
-	vmovapd(ymm0, mem(rdx)) // store c42:c72
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c03:c33
-	
-	vmovapd(mem(rcx), ymm0) // load c03:c33 into ymm0
-	Z_ALPHA(0, 2)									// scale ymm0 by beta
-	vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0
-	vmovapd(ymm0, mem(rcx)) // store c03:c33
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c43:c73
-	
-	vmovapd(mem(rdx), ymm0) // load c43:c73 into ymm0
-	Z_ALPHA(0, 2)									// scale ymm0 by beta
-	vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0
-	vmovapd(ymm0, mem(rdx)) // store c43:c73
-	
-	
-	
-	jmp(.ZDONE) // jump to end.
-	
-	
-	
+
+		 // update c00:c30
+
+		vmovapd(mem(rcx), ymm0) // load c00:c30 into ymm0
+		Z_ALPHA(0, 2)									// scale ymm0 by beta
+		vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0
+		vmovapd(ymm0, mem(rcx)) // store c00:c30
+
+		 // update c40:c70
+
+		vmovapd(mem(rcx,32), ymm0) // load c40:c70 into ymm0
+		Z_ALPHA(0, 2)									// scale ymm0 by beta
+		vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0
+		vmovapd(ymm0, mem(rcx,32)) // store c40:c70
+		add(rdi, rcx) // c += cs_c;
+
+		 // update c01:c31
+
+		vmovapd(mem(rcx), ymm0) // load c01:c31 into ymm0
+		Z_ALPHA(0, 2)									// scale ymm0 by beta
+		vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0
+		vmovapd(ymm0, mem(rcx)) // store c01:c31
+
+		 // update c41:c71
+
+		vmovapd(mem(rcx,32), ymm0) // load c41:c71 into ymm0
+		Z_ALPHA(0, 2)									// scale ymm0 by beta
+		vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0
+		vmovapd(ymm0, mem(rcx,32)) // store c41:c71
+		add(rdi, rcx) // c += cs_c;
+
+		 // update c02:c32
+
+		vmovapd(mem(rcx), ymm0) // load c02:c32 into ymm0
+		Z_ALPHA(0, 2)									// scale ymm0 by beta
+		vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0
+		vmovapd(ymm0, mem(rcx)) // store c02:c32
+
+		 // update c42:c72
+
+		vmovapd(mem(rcx,32), ymm0) // load c42:c72 into ymm0
+		Z_ALPHA(0, 2)									// scale ymm0 by beta
+		vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0
+		vmovapd(ymm0, mem(rcx,32)) // store c42:c72
+		add(rdi, rcx) // c += cs_c;
+
+		 // update c03:c33
+
+		vmovapd(mem(rcx), ymm0) // load c03:c33 into ymm0
+		Z_ALPHA(0, 2)									// scale ymm0 by beta
+		vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0
+		vmovapd(ymm0, mem(rcx)) // store c03:c33
+
+		 // update c43:c73
+
+		vmovapd(mem(rcx,32), ymm0) // load c43:c73 into ymm0
+		Z_ALPHA(0, 2)									// scale ymm0 by beta
+		vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0
+		vmovapd(ymm0, mem(rcx,32)) // store c43:c73
+		add(rdi, rcx) // c += cs_c;
+
+		jmp(.ZDONE) // jump to end.
+
 	label(.ZBETAZERO)
-	 // check if aligned/column-stored
-	 // check if aligned/column-stored
-	and(bl, bh) // set ZF if bl & bh == 1.
-	and(bh, al) // set ZF if bh & al == 1.
-	jne(.ZCOLSTORBZ) // jump to column storage case
-	
-	
-	
-	label(.ZGENSTORBZ)
-	 // update c00:c30
-	
-	vextractf128(imm(1), ymm15, xmm2)
-	vmovupd(xmm15, mem(rcx)) // store (c00,c10)
-	vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c20,c30)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c40:c70
-	
-	vextractf128(imm(1), ymm14, xmm2)
-	vmovupd(xmm14, mem(rdx)) // store (c40,c50)
-	vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c60,c70)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c01:c31
-	
-	vextractf128(imm(1), ymm13, xmm2)
-	vmovupd(xmm13, mem(rcx)) // store (c01,c11)
-	vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c21,c31)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c41:c71
-	
-	vextractf128(imm(1), ymm12, xmm2)
-	vmovupd(xmm12, mem(rdx)) // store (c41,c51)
-	vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c61,c71)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c02:c32
-	
-	vextractf128(imm(1), ymm11, xmm2)
-	vmovupd(xmm11, mem(rcx)) // store (c02,c12)
-	vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c22,c32)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c42:c72
-	
-	vextractf128(imm(1), ymm10, xmm2)
-	vmovupd(xmm10, mem(rdx)) // store (c42,c52)
-	vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c62,c72)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c03:c33
-	
-	vextractf128(imm(1), ymm9, xmm2)
-	vmovupd(xmm9, mem(rcx)) // store (c03,c13)
-	vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c23,c33)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c43:c73
-	
-	vextractf128(imm(1), ymm8, xmm2)
-	vmovupd(xmm8, mem(rdx)) // store (c43,c53)
-	vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c63,c73)
-	
-	
-	jmp(.ZDONE) // jump to end.
-	
-	
-	label(.ZCOLSTORBZ)
-	
-	
-	vmovapd(ymm15, mem(rcx)) // store c00:c30
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovapd(ymm14, mem(rdx)) // store c40:c70
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovapd(ymm13, mem(rcx)) // store c01:c31
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovapd(ymm12, mem(rdx)) // store c41:c71
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovapd(ymm11, mem(rcx)) // store c02:c32
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovapd(ymm10, mem(rdx)) // store c42:c72
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovapd(ymm9, mem(rcx)) // store c03:c33
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovapd(ymm8, mem(rdx)) // store c43:c73
-	
-	
+
+		vmovapd(ymm15, mem(rcx)) // store c00:c30
+		vmovapd(ymm14, mem(rcx,32)) // store c40:c70
+		add(rdi, rcx) // c += cs_c;
+
+		vmovapd(ymm13, mem(rcx)) // store c01:c31
+		vmovapd(ymm12, mem(rcx,32)) // store c41:c71
+		add(rdi, rcx) // c += cs_c;
+
+		vmovapd(ymm11, mem(rcx)) // store c02:c32
+		vmovapd(ymm10, mem(rcx,32)) // store c42:c72
+		add(rdi, rcx) // c += cs_c;
+
+		vmovapd(ymm9, mem(rcx)) // store c03:c33
+		vmovapd(ymm8, mem(rcx,32)) // store c43:c73
+		//add(rdi, rcx) // c += cs_c;
+
 	label(.ZDONE)
-	
 
-    end_asm(
+
+	end_asm(
 	: // output operands (none)
 	: // input operands
 	  [k_iter]  "m" (k_iter), // 0
@@ -2524,7 +1747,7 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
 	  [b_next]  "m" (b_next), // 9
 	  [a_next]  "m" (a_next)*/  // 10
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "ymm0", "ymm1", "ymm2", "ymm3",
 	  "ymm4", "ymm5", "ymm6", "ymm7",
@@ -2532,5 +1755,7 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
 	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( z );
 }
 
diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
index 7907bd901..d0e793867 100644
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
@@ -79,7 +79,9 @@
 
 void bli_sgemm_haswell_asm_6x16
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        float*     restrict alpha,
        float*     restrict a,
        float*     restrict b,
@@ -94,11 +96,13 @@ void bli_sgemm_haswell_asm_6x16
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT_AMBI( s, 6, 16, true );
+
 	begin_asm()
 
 	vzeroall() // zero all xmm/ymm registers.
@@ -109,36 +113,65 @@ void bli_sgemm_haswell_asm_6x16
 	//mov(%9, r15) // load address of b_next.
 
 	add(imm(32*4), rbx)
-	 // initialize loop by pre-loading
+	// initialize loop by pre-loading
 	vmovaps(mem(rbx, -4*32), ymm0)
 	vmovaps(mem(rbx, -3*32), ymm1)
 
 	mov(var(c), rcx) // load address of c
 	mov(var(rs_c), rdi) // load rs_c
 	lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)
+	mov(var(cs_c), rsi) // load cs_c
+	lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float)
 
-	lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c;
-	lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c
-	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c
-
-
-
+	cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4, i.e. C is column-stored.
+	jz(.SCOLPREFETCH) // jump to column prefetch case
+
+		lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c;
+		lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c;
+		prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
+		prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
+		prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
+		prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c
+		prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
+		prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c
+
+		jmp(.SPREFETCHDONE)
+
+	label(.SCOLPREFETCH)
+
+		lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c;
+		lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*cs_c;
+		prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c
+		prefetch(0, mem(rcx, rsi, 1, 7*8)) // prefetch c + 1*cs_c
+		prefetch(0, mem(rcx, rsi, 2, 7*8)) // prefetch c + 2*cs_c
+		prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*cs_c
+		prefetch(0, mem(rdx, rsi, 1, 7*8)) // prefetch c + 4*cs_c
+		prefetch(0, mem(rdx, rsi, 2, 7*8)) // prefetch c + 5*cs_c
+		prefetch(0, mem(rdx, r13, 1, 7*8)) // prefetch c + 6*cs_c
+		prefetch(0, mem(rdx, rsi, 4, 7*8)) // prefetch c + 7*cs_c
+		lea(mem(rcx, rsi, 8), r14) // r14 = c + 8*cs_c;
+		lea(mem(r14, r13, 1), rdx) // rdx = c + 11*cs_c;
+		prefetch(0, mem(r14, 7*8)) // prefetch c + 8*cs_c
+		prefetch(0, mem(r14, rsi, 1, 7*8)) // prefetch c + 9*cs_c
+		prefetch(0, mem(r14, rsi, 2, 7*8)) // prefetch c + 10*cs_c
+		prefetch(0, mem(rdx, 7*8)) // prefetch c + 11*cs_c
+		prefetch(0, mem(rdx, rsi, 1, 7*8)) // prefetch c + 12*cs_c
+		prefetch(0, mem(rdx, rsi, 2, 7*8)) // prefetch c + 13*cs_c
+		prefetch(0, mem(rdx, r13, 1, 7*8)) // prefetch c + 14*cs_c
+		prefetch(0, mem(rdx, rsi, 4, 7*8)) // prefetch c + 15*cs_c
+
+	label(.SPREFETCHDONE)
 
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.SCONSIDKLEFT) // if i == 0, jump to code that
-	 // contains the k_left loop.
+	// contains the k_left loop.
 
 
 	label(.SLOOPKITER) // MAIN LOOP
 
 
-	 // iteration 0
+	// iteration 0
 	prefetch(0, mem(rax, 64*4))
 
 	vbroadcastss(mem(rax, 0*4), ymm2)
@@ -165,7 +198,7 @@ void bli_sgemm_haswell_asm_6x16
 	vmovaps(mem(rbx, -2*32), ymm0)
 	vmovaps(mem(rbx, -1*32), ymm1)
 
-	 // iteration 1
+	// iteration 1
 	vbroadcastss(mem(rax, 6*4), ymm2)
 	vbroadcastss(mem(rax, 7*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
@@ -190,7 +223,7 @@ void bli_sgemm_haswell_asm_6x16
 	vmovaps(mem(rbx, 0*32), ymm0)
 	vmovaps(mem(rbx, 1*32), ymm1)
 
-	 // iteration 2
+	// iteration 2
 	prefetch(0, mem(rax, 76*4))
 
 	vbroadcastss(mem(rax, 12*4), ymm2)
@@ -217,7 +250,7 @@ void bli_sgemm_haswell_asm_6x16
 	vmovaps(mem(rbx, 2*32), ymm0)
 	vmovaps(mem(rbx, 3*32), ymm1)
 
-	 // iteration 3
+	// iteration 3
 	vbroadcastss(mem(rax, 18*4), ymm2)
 	vbroadcastss(mem(rax, 19*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
@@ -259,7 +292,7 @@ void bli_sgemm_haswell_asm_6x16
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
-	 // else, we prepare to enter k_left loop.
+	// else, we prepare to enter k_left loop.
 
 
 	label(.SLOOPKLEFT) // EDGE LOOP
@@ -338,533 +371,330 @@ void bli_sgemm_haswell_asm_6x16
 	lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c;
 
 
-	 // now avoid loading C if beta == 0
+	// now avoid loading C if beta == 0
 
 	vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
 	vucomiss(xmm0, xmm3) // set ZF if beta == 0.
 	je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case
 
-
-	cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4.
-	jz(.SROWSTORED) // jump to row storage case
-
-
-	cmp(imm(4), rdi) // set ZF if (4*cs_c) == 4.
-	jz(.SCOLSTORED) // jump to column storage case
-
-
-
-	label(.SGENSTORED)
-
-
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm4, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm6, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm8, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm10, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm12, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm14, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	//add(rdi, rcx) // c += rs_c;
-
-
-	mov(rdx, rcx) // rcx = c + 8*cs_c
-
-
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm5, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm7, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm9, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm11, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm13, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm15, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	//add(rdi, rcx) // c += rs_c;
-
-
-
-	jmp(.SDONE) // jump to end.
-
-
-
-	label(.SROWSTORED)
-
-
-	vfmadd231ps(mem(rcx), ymm3, ymm4)
-	vmovups(ymm4, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231ps(mem(rdx), ymm3, ymm5)
-	vmovups(ymm5, mem(rdx))
-	add(rdi, rdx)
-
-
-	vfmadd231ps(mem(rcx), ymm3, ymm6)
-	vmovups(ymm6, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231ps(mem(rdx), ymm3, ymm7)
-	vmovups(ymm7, mem(rdx))
-	add(rdi, rdx)
-
-
-	vfmadd231ps(mem(rcx), ymm3, ymm8)
-	vmovups(ymm8, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231ps(mem(rdx), ymm3, ymm9)
-	vmovups(ymm9, mem(rdx))
-	add(rdi, rdx)
-
-
-	vfmadd231ps(mem(rcx), ymm3, ymm10)
-	vmovups(ymm10, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231ps(mem(rdx), ymm3, ymm11)
-	vmovups(ymm11, mem(rdx))
-	add(rdi, rdx)
-
-
-	vfmadd231ps(mem(rcx), ymm3, ymm12)
-	vmovups(ymm12, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231ps(mem(rdx), ymm3, ymm13)
-	vmovups(ymm13, mem(rdx))
-	add(rdi, rdx)
-
-
-	vfmadd231ps(mem(rcx), ymm3, ymm14)
-	vmovups(ymm14, mem(rcx))
-	//add(rdi, rcx)
-	vfmadd231ps(mem(rdx), ymm3, ymm15)
-	vmovups(ymm15, mem(rdx))
-	//add(rdi, rdx)
-
-
-
-	jmp(.SDONE) // jump to end.
-
-
-
-	label(.SCOLSTORED)
-
-
-	vbroadcastss(mem(rbx), ymm3)
-
-	vunpcklps(ymm6, ymm4, ymm0)
-	vunpcklps(ymm10, ymm8, ymm1)
-	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
-	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
-	vblendps(imm(0x33), ymm2, ymm1, ymm1)
-
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vfmadd231ps(mem(rcx), xmm3, xmm0)
-	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
-	vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 )
-	vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 )
-
-	vextractf128(imm(0x1), ymm1, xmm2)
-	vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
-	vfmadd231ps(mem(rcx, r15, 1), xmm3, xmm2)
-	vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 )
-	vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 )
-
-
-	vunpckhps(ymm6, ymm4, ymm0)
-	vunpckhps(ymm10, ymm8, ymm1)
-	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
-	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
-	vblendps(imm(0x33), ymm2, ymm1, ymm1)
-
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0)
-	vfmadd231ps(mem(rcx, r13, 2), xmm3, xmm2)
-	vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 )
-	vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 )
-
-	vextractf128(imm(0x1), ymm1, xmm2)
-	vfmadd231ps(mem(rcx, r13, 1), xmm3, xmm1)
-	vfmadd231ps(mem(rcx, r10, 1), xmm3, xmm2)
-	vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 )
-	vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 )
-
-	lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c
-
-	vunpcklps(ymm14, ymm12, ymm0)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vmovlpd(mem(r14), xmm1, xmm1)
-	vmovhpd(mem(r14, rsi, 1), xmm1, xmm1)
-	vfmadd231ps(xmm1, xmm3, xmm0)
-	vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 )
-	vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 )
-	vmovlpd(mem(r14, rsi, 4), xmm1, xmm1)
-	vmovhpd(mem(r14, r15, 1), xmm1, xmm1)
-	vfmadd231ps(xmm1, xmm3, xmm2)
-	vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 )
-	vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 )
-
-	vunpckhps(ymm14, ymm12, ymm0)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vmovlpd(mem(r14, rsi, 2), xmm1, xmm1)
-	vmovhpd(mem(r14, r13, 1), xmm1, xmm1)
-	vfmadd231ps(xmm1, xmm3, xmm0)
-	vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 )
-	vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 )
-	vmovlpd(mem(r14, r13, 2), xmm1, xmm1)
-	vmovhpd(mem(r14, r10, 1), xmm1, xmm1)
-	vfmadd231ps(xmm1, xmm3, xmm2)
-	vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 )
-	vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 )
-
-	lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c
-
-
-
-	vunpcklps(ymm7, ymm5, ymm0)
-	vunpcklps(ymm11, ymm9, ymm1)
-	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
-	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
-	vblendps(imm(0x33), ymm2, ymm1, ymm1)
-
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vfmadd231ps(mem(rcx), xmm3, xmm0)
-	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
-	vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 )
-	vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 )
-
-	vextractf128(imm(0x1), ymm1, xmm2)
-	vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
-	vfmadd231ps(mem(rcx, r15, 1), xmm3, xmm2)
-	vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 )
-	vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 )
-
-
-	vunpckhps(ymm7, ymm5, ymm0)
-	vunpckhps(ymm11, ymm9, ymm1)
-	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
-	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
-	vblendps(imm(0x33), ymm2, ymm1, ymm1)
-
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0)
-	vfmadd231ps(mem(rcx, r13, 2), xmm3, xmm2)
-	vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 )
-	vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 )
-
-	vextractf128(imm(0x1), ymm1, xmm2)
-	vfmadd231ps(mem(rcx, r13, 1), xmm3, xmm1)
-	vfmadd231ps(mem(rcx, r10, 1), xmm3, xmm2)
-	vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 )
-	vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 )
-
-	//lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c
-
-	vunpcklps(ymm15, ymm13, ymm0)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vmovlpd(mem(r14), xmm1, xmm1)
-	vmovhpd(mem(r14, rsi, 1), xmm1, xmm1)
-	vfmadd231ps(xmm1, xmm3, xmm0)
-	vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 )
-	vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 )
-	vmovlpd(mem(r14, rsi, 4), xmm1, xmm1)
-	vmovhpd(mem(r14, r15, 1), xmm1, xmm1)
-	vfmadd231ps(xmm1, xmm3, xmm2)
-	vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 )
-	vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 )
-
-	vunpckhps(ymm15, ymm13, ymm0)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vmovlpd(mem(r14, rsi, 2), xmm1, xmm1)
-	vmovhpd(mem(r14, r13, 1), xmm1, xmm1)
-	vfmadd231ps(xmm1, xmm3, xmm0)
-	vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 )
-	vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 )
-	vmovlpd(mem(r14, r13, 2), xmm1, xmm1)
-	vmovhpd(mem(r14, r10, 1), xmm1, xmm1)
-	vfmadd231ps(xmm1, xmm3, xmm2)
-	vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 )
-	vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 )
-
-	//lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c
-
-
-
-	jmp(.SDONE) // jump to end.
-
-
+		cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+		jz(.SCOLSTORED) // jump to column storage case
+
+			vfmadd231ps(mem(rcx), ymm3, ymm4)
+			vmovups(ymm4, mem(rcx))
+			vfmadd231ps(mem(rcx,32), ymm3, ymm5)
+			vmovups(ymm5, mem(rcx,32))
+			add(rdi, rcx)
+
+
+			vfmadd231ps(mem(rcx), ymm3, ymm6)
+			vmovups(ymm6, mem(rcx))
+			vfmadd231ps(mem(rcx,32), ymm3, ymm7)
+			vmovups(ymm7, mem(rcx,32))
+			add(rdi, rcx)
+
+
+			vfmadd231ps(mem(rcx), ymm3, ymm8)
+			vmovups(ymm8, mem(rcx))
+			vfmadd231ps(mem(rcx,32), ymm3, ymm9)
+			vmovups(ymm9, mem(rcx,32))
+			add(rdi, rcx)
+
+
+			vfmadd231ps(mem(rcx), ymm3, ymm10)
+			vmovups(ymm10, mem(rcx))
+			vfmadd231ps(mem(rcx,32), ymm3, ymm11)
+			vmovups(ymm11, mem(rcx,32))
+			add(rdi, rcx)
+
+
+			vfmadd231ps(mem(rcx), ymm3, ymm12)
+			vmovups(ymm12, mem(rcx))
+			vfmadd231ps(mem(rcx,32), ymm3, ymm13)
+			vmovups(ymm13, mem(rcx,32))
+			add(rdi, rcx)
+
+
+			vfmadd231ps(mem(rcx), ymm3, ymm14)
+			vmovups(ymm14, mem(rcx))
+			vfmadd231ps(mem(rcx,32), ymm3, ymm15)
+			vmovups(ymm15, mem(rcx,32))
+			//add(rdi, rcx)
+
+			jmp(.SDONE) // jump to end.
+
+		label(.SCOLSTORED)
+
+			vunpcklps(ymm6, ymm4, ymm0)
+			vunpcklps(ymm10, ymm8, ymm1)
+			vshufps(imm(0x4e), ymm1, ymm0, ymm2)
+			vblendps(imm(0xcc), ymm2, ymm0, ymm0)
+			vblendps(imm(0x33), ymm2, ymm1, ymm1)
+
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vfmadd231ps(mem(rcx), xmm3, xmm0)
+			vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
+			vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 )
+			vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 )
+
+			vextractf128(imm(0x1), ymm1, xmm2)
+			vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
+			vfmadd231ps(mem(rcx, r15, 1), xmm3, xmm2)
+			vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 )
+			vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 )
+
+
+			vunpckhps(ymm6, ymm4, ymm0)
+			vunpckhps(ymm10, ymm8, ymm1)
+			vshufps(imm(0x4e), ymm1, ymm0, ymm2)
+			vblendps(imm(0xcc), ymm2, ymm0, ymm0)
+			vblendps(imm(0x33), ymm2, ymm1, ymm1)
+
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0)
+			vfmadd231ps(mem(rcx, r13, 2), xmm3, xmm2)
+			vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 )
+			vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 )
+
+			vextractf128(imm(0x1), ymm1, xmm2)
+			vfmadd231ps(mem(rcx, r13, 1), xmm3, xmm1)
+			vfmadd231ps(mem(rcx, r10, 1), xmm3, xmm2)
+			vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 )
+			vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 )
+
+			lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c
+
+			vunpcklps(ymm14, ymm12, ymm0)
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vmovlpd(mem(r14), xmm1, xmm1)
+			vmovhpd(mem(r14, rsi, 1), xmm1, xmm1)
+			vfmadd231ps(xmm1, xmm3, xmm0)
+			vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 )
+			vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 )
+			vmovlpd(mem(r14, rsi, 4), xmm1, xmm1)
+			vmovhpd(mem(r14, r15, 1), xmm1, xmm1)
+			vfmadd231ps(xmm1, xmm3, xmm2)
+			vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 )
+			vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 )
+
+			vunpckhps(ymm14, ymm12, ymm0)
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vmovlpd(mem(r14, rsi, 2), xmm1, xmm1)
+			vmovhpd(mem(r14, r13, 1), xmm1, xmm1)
+			vfmadd231ps(xmm1, xmm3, xmm0)
+			vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 )
+			vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 )
+			vmovlpd(mem(r14, r13, 2), xmm1, xmm1)
+			vmovhpd(mem(r14, r10, 1), xmm1, xmm1)
+			vfmadd231ps(xmm1, xmm3, xmm2)
+			vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 )
+			vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 )
+
+			lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c
+
+
+
+			vunpcklps(ymm7, ymm5, ymm0)
+			vunpcklps(ymm11, ymm9, ymm1)
+			vshufps(imm(0x4e), ymm1, ymm0, ymm2)
+			vblendps(imm(0xcc), ymm2, ymm0, ymm0)
+			vblendps(imm(0x33), ymm2, ymm1, ymm1)
+
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vfmadd231ps(mem(rcx), xmm3, xmm0)
+			vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
+			vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 )
+			vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 )
+
+			vextractf128(imm(0x1), ymm1, xmm2)
+			vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
+			vfmadd231ps(mem(rcx, r15, 1), xmm3, xmm2)
+			vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 )
+			vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 )
+
+
+			vunpckhps(ymm7, ymm5, ymm0)
+			vunpckhps(ymm11, ymm9, ymm1)
+			vshufps(imm(0x4e), ymm1, ymm0, ymm2)
+			vblendps(imm(0xcc), ymm2, ymm0, ymm0)
+			vblendps(imm(0x33), ymm2, ymm1, ymm1)
+
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0)
+			vfmadd231ps(mem(rcx, r13, 2), xmm3, xmm2)
+			vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 )
+			vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 )
+
+			vextractf128(imm(0x1), ymm1, xmm2)
+			vfmadd231ps(mem(rcx, r13, 1), xmm3, xmm1)
+			vfmadd231ps(mem(rcx, r10, 1), xmm3, xmm2)
+			vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 )
+			vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 )
+
+			//lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c
+
+			vunpcklps(ymm15, ymm13, ymm0)
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vmovlpd(mem(r14), xmm1, xmm1)
+			vmovhpd(mem(r14, rsi, 1), xmm1, xmm1)
+			vfmadd231ps(xmm1, xmm3, xmm0)
+			vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 )
+			vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 )
+			vmovlpd(mem(r14, rsi, 4), xmm1, xmm1)
+			vmovhpd(mem(r14, r15, 1), xmm1, xmm1)
+			vfmadd231ps(xmm1, xmm3, xmm2)
+			vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 )
+			vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 )
+
+			vunpckhps(ymm15, ymm13, ymm0)
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vmovlpd(mem(r14, rsi, 2), xmm1, xmm1)
+			vmovhpd(mem(r14, r13, 1), xmm1, xmm1)
+			vfmadd231ps(xmm1, xmm3, xmm0)
+			vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 )
+			vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 )
+			vmovlpd(mem(r14, r13, 2), xmm1, xmm1)
+			vmovhpd(mem(r14, r10, 1), xmm1, xmm1)
+			vfmadd231ps(xmm1, xmm3, xmm2)
+			vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 )
+			vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 )
+
+			//lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c
+
+			jmp(.SDONE) // jump to end.
 
 	label(.SBETAZERO)
 
-	cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4.
-	jz(.SROWSTORBZ) // jump to row storage case
-
-	cmp(imm(4), rdi) // set ZF if (4*cs_c) == 4.
-	jz(.SCOLSTORBZ) // jump to column storage case
-
-
-
-	label(.SGENSTORBZ)
-
-
-	vmovaps(ymm4, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	vmovaps(ymm6, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	vmovaps(ymm8, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	vmovaps(ymm10, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	vmovaps(ymm12, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	vmovaps(ymm14, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	//add(rdi, rcx) // c += rs_c;
-
-
-	mov(rdx, rcx) // rcx = c + 8*cs_c
-
-
-	vmovaps(ymm5, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	vmovaps(ymm7, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
+		cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+		jz(.SCOLSTORBZ) // jump to column storage case
 
-	vmovaps(ymm9, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
+			vmovups(ymm4, mem(rcx))
+			vmovups(ymm5, mem(rcx,32))
+			add(rdi, rcx)
 
+			vmovups(ymm6, mem(rcx))
+			vmovups(ymm7, mem(rcx,32))
+			add(rdi, rcx)
 
-	vmovaps(ymm11, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
 
+			vmovups(ymm8, mem(rcx))
+			vmovups(ymm9, mem(rcx,32))
+			add(rdi, rcx)
 
-	vmovaps(ymm13, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
 
+			vmovups(ymm10, mem(rcx))
+			vmovups(ymm11, mem(rcx,32))
+			add(rdi, rcx)
 
-	vmovaps(ymm15, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	//add(rdi, rcx) // c += rs_c;
 
+			vmovups(ymm12, mem(rcx))
+			vmovups(ymm13, mem(rcx,32))
+			add(rdi, rcx)
 
 
-	jmp(.SDONE) // jump to end.
+			vmovups(ymm14, mem(rcx))
+			vmovups(ymm15, mem(rcx,32))
+			//add(rdi, rcx)
 
+			jmp(.SDONE) // jump to end.
 
+		label(.SCOLSTORBZ)
 
-	label(.SROWSTORBZ)
+			vunpcklps(ymm6, ymm4, ymm0)
+			vunpcklps(ymm10, ymm8, ymm1)
+			vshufps(imm(0x4e), ymm1, ymm0, ymm2)
+			vblendps(imm(0xcc), ymm2, ymm0, ymm0)
+			vblendps(imm(0x33), ymm2, ymm1, ymm1)
 
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 )
+			vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 )
 
-	vmovups(ymm4, mem(rcx))
-	add(rdi, rcx)
-	vmovups(ymm5, mem(rdx))
-	add(rdi, rdx)
+			vextractf128(imm(0x1), ymm1, xmm2)
+			vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 )
+			vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 )
 
-	vmovups(ymm6, mem(rcx))
-	add(rdi, rcx)
-	vmovups(ymm7, mem(rdx))
-	add(rdi, rdx)
 
+			vunpckhps(ymm6, ymm4, ymm0)
+			vunpckhps(ymm10, ymm8, ymm1)
+			vshufps(imm(0x4e), ymm1, ymm0, ymm2)
+			vblendps(imm(0xcc), ymm2, ymm0, ymm0)
+			vblendps(imm(0x33), ymm2, ymm1, ymm1)
 
-	vmovups(ymm8, mem(rcx))
-	add(rdi, rcx)
-	vmovups(ymm9, mem(rdx))
-	add(rdi, rdx)
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 )
+			vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 )
 
+			vextractf128(imm(0x1), ymm1, xmm2)
+			vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 )
+			vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 )
 
-	vmovups(ymm10, mem(rcx))
-	add(rdi, rcx)
-	vmovups(ymm11, mem(rdx))
-	add(rdi, rdx)
+			lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c
 
+			vunpcklps(ymm14, ymm12, ymm0)
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 )
+			vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 )
+			vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 )
+			vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 )
 
-	vmovups(ymm12, mem(rcx))
-	add(rdi, rcx)
-	vmovups(ymm13, mem(rdx))
-	add(rdi, rdx)
+			vunpckhps(ymm14, ymm12, ymm0)
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 )
+			vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 )
+			vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 )
+			vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 )
 
+			lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c
 
-	vmovups(ymm14, mem(rcx))
-	//add(rdi, rcx)
-	vmovups(ymm15, mem(rdx))
-	//add(rdi, rdx)
 
 
+			vunpcklps(ymm7, ymm5, ymm0)
+			vunpcklps(ymm11, ymm9, ymm1)
+			vshufps(imm(0x4e), ymm1, ymm0, ymm2)
+			vblendps(imm(0xcc), ymm2, ymm0, ymm0)
+			vblendps(imm(0x33), ymm2, ymm1, ymm1)
 
-	jmp(.SDONE) // jump to end.
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 )
+			vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 )
 
+			vextractf128(imm(0x1), ymm1, xmm2)
+			vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 )
+			vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 )
 
 
-	label(.SCOLSTORBZ)
+			vunpckhps(ymm7, ymm5, ymm0)
+			vunpckhps(ymm11, ymm9, ymm1)
+			vshufps(imm(0x4e), ymm1, ymm0, ymm2)
+			vblendps(imm(0xcc), ymm2, ymm0, ymm0)
+			vblendps(imm(0x33), ymm2, ymm1, ymm1)
 
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 )
+			vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 )
 
-	vunpcklps(ymm6, ymm4, ymm0)
-	vunpcklps(ymm10, ymm8, ymm1)
-	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
-	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
-	vblendps(imm(0x33), ymm2, ymm1, ymm1)
+			vextractf128(imm(0x1), ymm1, xmm2)
+			vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 )
+			vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 )
 
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 )
-	vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 )
-
-	vextractf128(imm(0x1), ymm1, xmm2)
-	vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 )
-	vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 )
-
-
-	vunpckhps(ymm6, ymm4, ymm0)
-	vunpckhps(ymm10, ymm8, ymm1)
-	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
-	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
-	vblendps(imm(0x33), ymm2, ymm1, ymm1)
-
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 )
-	vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 )
-
-	vextractf128(imm(0x1), ymm1, xmm2)
-	vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 )
-	vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 )
-
-	lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c
-
-	vunpcklps(ymm14, ymm12, ymm0)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 )
-	vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 )
-	vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 )
-	vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 )
-
-	vunpckhps(ymm14, ymm12, ymm0)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 )
-	vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 )
-	vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 )
-	vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 )
-
-	lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c
-
-
-
-	vunpcklps(ymm7, ymm5, ymm0)
-	vunpcklps(ymm11, ymm9, ymm1)
-	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
-	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
-	vblendps(imm(0x33), ymm2, ymm1, ymm1)
-
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 )
-	vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 )
-
-	vextractf128(imm(0x1), ymm1, xmm2)
-	vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 )
-	vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 )
-
-
-	vunpckhps(ymm7, ymm5, ymm0)
-	vunpckhps(ymm11, ymm9, ymm1)
-	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
-	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
-	vblendps(imm(0x33), ymm2, ymm1, ymm1)
-
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 )
-	vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 )
-
-	vextractf128(imm(0x1), ymm1, xmm2)
-	vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 )
-	vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 )
-
-	//lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c
-
-	vunpcklps(ymm15, ymm13, ymm0)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 )
-	vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 )
-	vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 )
-	vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 )
-
-	vunpckhps(ymm15, ymm13, ymm0)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 )
-	vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 )
-	vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 )
-	vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 )
-
-	//lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c
+			//lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c
 
+			vunpcklps(ymm15, ymm13, ymm0)
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 )
+			vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 )
+			vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 )
+			vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 )
 
+			vunpckhps(ymm15, ymm13, ymm0)
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 )
+			vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 )
+			vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 )
+			vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 )
 
+			//lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c
 
 	label(.SDONE)
 
@@ -896,6 +726,8 @@ void bli_sgemm_haswell_asm_6x16
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( s );
 }
 
 
@@ -927,7 +759,9 @@ void bli_sgemm_haswell_asm_6x16
 
 void bli_dgemm_haswell_asm_6x8
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        double*    restrict alpha,
        double*    restrict a,
        double*    restrict b,
@@ -942,11 +776,13 @@ void bli_dgemm_haswell_asm_6x8
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT_AMBI( d, 6, 8, true );
+
 	begin_asm()
 
 	vzeroall() // zero all xmm/ymm registers.
@@ -957,36 +793,56 @@ void bli_dgemm_haswell_asm_6x8
 	//mov(%9, r15) // load address of b_next.
 
 	add(imm(32*4), rbx)
-	 // initialize loop by pre-loading
+	// initialize loop by pre-loading
 	vmovapd(mem(rbx, -4*32), ymm0)
 	vmovapd(mem(rbx, -3*32), ymm1)
 
 	mov(var(c), rcx) // load address of c
 	mov(var(rs_c), rdi) // load rs_c
 	lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)
+	mov(var(cs_c), rsi) // load cs_c
+	lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
 
-	lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c;
-	lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c
-	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c
+	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8, i.e. C is column-stored.
+	jz(.DCOLPREFETCH) // jump to column prefetch case
+
+		lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c;
+		lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c;
+		prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
+		prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
+		prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
+		prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c
+		prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
+		prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c
+
+		jmp(.DPREFETCHDONE) // skip the column-stored prefetch sequence
+
+	label(.DCOLPREFETCH) // 'D' prefix keeps labels distinct from the sgemm kernel
 
+		lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c;
+		lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*cs_c;
+		prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c
+		prefetch(0, mem(rcx, rsi, 1, 7*8)) // prefetch c + 1*cs_c
+		prefetch(0, mem(rcx, rsi, 2, 7*8)) // prefetch c + 2*cs_c
+		prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*cs_c
+		prefetch(0, mem(rdx, rsi, 1, 7*8)) // prefetch c + 4*cs_c
+		prefetch(0, mem(rdx, rsi, 2, 7*8)) // prefetch c + 5*cs_c
+		prefetch(0, mem(rdx, r13, 1, 7*8)) // prefetch c + 6*cs_c
+		prefetch(0, mem(rdx, rsi, 4, 7*8)) // prefetch c + 7*cs_c
 
+	label(.DPREFETCHDONE)
 
 
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.DCONSIDKLEFT) // if i == 0, jump to code that
-	 // contains the k_left loop.
+	// contains the k_left loop.
 
 
 	label(.DLOOPKITER) // MAIN LOOP
 
 
-	 // iteration 0
+	// iteration 0
 	prefetch(0, mem(rax, 64*8))
 
 	vbroadcastsd(mem(rax, 0*8), ymm2)
@@ -1013,7 +869,7 @@ void bli_dgemm_haswell_asm_6x8
 	vmovapd(mem(rbx, -2*32), ymm0)
 	vmovapd(mem(rbx, -1*32), ymm1)
 
-	 // iteration 1
+	// iteration 1
 	prefetch(0, mem(rax, 72*8))
 
 	vbroadcastsd(mem(rax, 6*8), ymm2)
@@ -1040,7 +896,7 @@ void bli_dgemm_haswell_asm_6x8
 	vmovapd(mem(rbx, 0*32), ymm0)
 	vmovapd(mem(rbx, 1*32), ymm1)
 
-	 // iteration 2
+	// iteration 2
 	prefetch(0, mem(rax, 80*8))
 
 	vbroadcastsd(mem(rax, 12*8), ymm2)
@@ -1067,7 +923,7 @@ void bli_dgemm_haswell_asm_6x8
 	vmovapd(mem(rbx, 2*32), ymm0)
 	vmovapd(mem(rbx, 3*32), ymm1)
 
-	 // iteration 3
+	// iteration 3
 	vbroadcastsd(mem(rax, 18*8), ymm2)
 	vbroadcastsd(mem(rax, 19*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
@@ -1109,7 +965,7 @@ void bli_dgemm_haswell_asm_6x8
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
-	 // else, we prepare to enter k_left loop.
+	// else, we prepare to enter k_left loop.
 
 
 	label(.DLOOPKLEFT) // EDGE LOOP
@@ -1188,428 +1044,232 @@ void bli_dgemm_haswell_asm_6x8
 	//lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c;
 
 
-	 // now avoid loading C if beta == 0
+	// now avoid loading C if beta == 0
 
 	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
 	vucomisd(xmm0, xmm3) // set ZF if beta == 0.
 	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
 
+		cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
+		jz(.DCOLSTORED) // jump to column storage case
+
+			vfmadd231pd(mem(rcx), ymm3, ymm4)
+			vmovupd(ymm4, mem(rcx))
+			vfmadd231pd(mem(rcx,32), ymm3, ymm5)
+			vmovupd(ymm5, mem(rcx,32))
+			add(rdi, rcx)
 
-	cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8.
-	jz(.DROWSTORED) // jump to row storage case
-
-
-	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED) // jump to column storage case
-
-
-
-	label(.DGENSTORED)
-
-
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm4, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm6, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm8, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm10, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm12, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm14, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-
-
-	mov(rdx, rcx) // rcx = c + 4*cs_c
-
-
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm5, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm7, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm9, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm11, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm13, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm15, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-
-
-
-	jmp(.DDONE) // jump to end.
-
-
-
-	label(.DROWSTORED)
-
-
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231pd(mem(rdx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rdx))
-	add(rdi, rdx)
-
-
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231pd(mem(rdx), ymm3, ymm7)
-	vmovupd(ymm7, mem(rdx))
-	add(rdi, rdx)
-
-
-	vfmadd231pd(mem(rcx), ymm3, ymm8)
-	vmovupd(ymm8, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231pd(mem(rdx), ymm3, ymm9)
-	vmovupd(ymm9, mem(rdx))
-	add(rdi, rdx)
-
-
-	vfmadd231pd(mem(rcx), ymm3, ymm10)
-	vmovupd(ymm10, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231pd(mem(rdx), ymm3, ymm11)
-	vmovupd(ymm11, mem(rdx))
-	add(rdi, rdx)
-
-
-	vfmadd231pd(mem(rcx), ymm3, ymm12)
-	vmovupd(ymm12, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231pd(mem(rdx), ymm3, ymm13)
-	vmovupd(ymm13, mem(rdx))
-	add(rdi, rdx)
-
-
-	vfmadd231pd(mem(rcx), ymm3, ymm14)
-	vmovupd(ymm14, mem(rcx))
-	//add(rdi, rcx)
-	vfmadd231pd(mem(rdx), ymm3, ymm15)
-	vmovupd(ymm15, mem(rdx))
-	//add(rdi, rdx)
-
-
-
-	jmp(.DDONE) // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
-	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
-	vfmadd231pd(mem(rcx, r13, 1), ymm3, ymm10)
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, r13, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)
-
-	vunpcklpd(ymm14, ymm12, ymm0)
-	vunpckhpd(ymm14, ymm12, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vfmadd231pd(mem(r14), xmm3, xmm0)
-	vfmadd231pd(mem(r14, rsi, 1), xmm3, xmm1)
-	vfmadd231pd(mem(r14, rsi, 2), xmm3, xmm2)
-	vfmadd231pd(mem(r14, r13, 1), xmm3, xmm4)
-	vmovupd(xmm0, mem(r14))
-	vmovupd(xmm1, mem(r14, rsi, 1))
-	vmovupd(xmm2, mem(r14, rsi, 2))
-	vmovupd(xmm4, mem(r14, r13, 1))
-
-	lea(mem(r14, rsi, 4), r14)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7)
-	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9)
-	vfmadd231pd(mem(rcx, r13, 1), ymm3, ymm11)
-	vmovupd(ymm5, mem(rcx))
-	vmovupd(ymm7, mem(rcx, rsi, 1))
-	vmovupd(ymm9, mem(rcx, rsi, 2))
-	vmovupd(ymm11, mem(rcx, r13, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vunpcklpd(ymm15, ymm13, ymm0)
-	vunpckhpd(ymm15, ymm13, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vfmadd231pd(mem(r14), xmm3, xmm0)
-	vfmadd231pd(mem(r14, rsi, 1), xmm3, xmm1)
-	vfmadd231pd(mem(r14, rsi, 2), xmm3, xmm2)
-	vfmadd231pd(mem(r14, r13, 1), xmm3, xmm4)
-	vmovupd(xmm0, mem(r14))
-	vmovupd(xmm1, mem(r14, rsi, 1))
-	vmovupd(xmm2, mem(r14, rsi, 2))
-	vmovupd(xmm4, mem(r14, r13, 1))
-
-	//lea(mem(r14, rsi, 4), r14)
-
-
-
-	jmp(.DDONE) // jump to end.
 
+			vfmadd231pd(mem(rcx), ymm3, ymm6)
+			vmovupd(ymm6, mem(rcx))
+			vfmadd231pd(mem(rcx,32), ymm3, ymm7)
+			vmovupd(ymm7, mem(rcx,32))
+			add(rdi, rcx)
+
+
+			vfmadd231pd(mem(rcx), ymm3, ymm8)
+			vmovupd(ymm8, mem(rcx))
+			vfmadd231pd(mem(rcx,32), ymm3, ymm9)
+			vmovupd(ymm9, mem(rcx,32))
+			add(rdi, rcx)
+
+
+			vfmadd231pd(mem(rcx), ymm3, ymm10)
+			vmovupd(ymm10, mem(rcx))
+			vfmadd231pd(mem(rcx,32), ymm3, ymm11)
+			vmovupd(ymm11, mem(rcx,32))
+			add(rdi, rcx)
+
+
+			vfmadd231pd(mem(rcx), ymm3, ymm12)
+			vmovupd(ymm12, mem(rcx))
+			vfmadd231pd(mem(rcx,32), ymm3, ymm13)
+			vmovupd(ymm13, mem(rcx,32))
+			add(rdi, rcx)
+
+
+			vfmadd231pd(mem(rcx), ymm3, ymm14)
+			vmovupd(ymm14, mem(rcx))
+			vfmadd231pd(mem(rcx,32), ymm3, ymm15)
+			vmovupd(ymm15, mem(rcx,32))
+			//add(rdi, rcx)
+
+			jmp(.DDONE) // jump to end.
+
+		label(.DCOLSTORED)
+
+			vunpcklpd(ymm6, ymm4, ymm0)
+			vunpckhpd(ymm6, ymm4, ymm1)
+			vunpcklpd(ymm10, ymm8, ymm2)
+			vunpckhpd(ymm10, ymm8, ymm3)
+			vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
+			vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
+			vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
+			vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
+
+			vbroadcastsd(mem(rbx), ymm3)
+
+			vfmadd231pd(mem(rcx), ymm3, ymm4)
+			vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
+			vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
+			vfmadd231pd(mem(rcx, r13, 1), ymm3, ymm10)
+			vmovupd(ymm4, mem(rcx))
+			vmovupd(ymm6, mem(rcx, rsi, 1))
+			vmovupd(ymm8, mem(rcx, rsi, 2))
+			vmovupd(ymm10, mem(rcx, r13, 1))
+
+			lea(mem(rcx, rsi, 4), rcx)
+
+			vunpcklpd(ymm14, ymm12, ymm0)
+			vunpckhpd(ymm14, ymm12, ymm1)
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vextractf128(imm(0x1), ymm1, xmm4)
+
+			vfmadd231pd(mem(r14), xmm3, xmm0)
+			vfmadd231pd(mem(r14, rsi, 1), xmm3, xmm1)
+			vfmadd231pd(mem(r14, rsi, 2), xmm3, xmm2)
+			vfmadd231pd(mem(r14, r13, 1), xmm3, xmm4)
+			vmovupd(xmm0, mem(r14))
+			vmovupd(xmm1, mem(r14, rsi, 1))
+			vmovupd(xmm2, mem(r14, rsi, 2))
+			vmovupd(xmm4, mem(r14, r13, 1))
+
+			lea(mem(r14, rsi, 4), r14)
+
+
+			vunpcklpd(ymm7, ymm5, ymm0)
+			vunpckhpd(ymm7, ymm5, ymm1)
+			vunpcklpd(ymm11, ymm9, ymm2)
+			vunpckhpd(ymm11, ymm9, ymm3)
+			vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
+			vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
+			vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
+			vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
+
+			vbroadcastsd(mem(rbx), ymm3)
+
+			vfmadd231pd(mem(rcx), ymm3, ymm5)
+			vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7)
+			vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9)
+			vfmadd231pd(mem(rcx, r13, 1), ymm3, ymm11)
+			vmovupd(ymm5, mem(rcx))
+			vmovupd(ymm7, mem(rcx, rsi, 1))
+			vmovupd(ymm9, mem(rcx, rsi, 2))
+			vmovupd(ymm11, mem(rcx, r13, 1))
+
+			//lea(mem(rcx, rsi, 4), rcx)
 
+			vunpcklpd(ymm15, ymm13, ymm0)
+			vunpckhpd(ymm15, ymm13, ymm1)
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vextractf128(imm(0x1), ymm1, xmm4)
+
+			vfmadd231pd(mem(r14), xmm3, xmm0)
+			vfmadd231pd(mem(r14, rsi, 1), xmm3, xmm1)
+			vfmadd231pd(mem(r14, rsi, 2), xmm3, xmm2)
+			vfmadd231pd(mem(r14, r13, 1), xmm3, xmm4)
+			vmovupd(xmm0, mem(r14))
+			vmovupd(xmm1, mem(r14, rsi, 1))
+			vmovupd(xmm2, mem(r14, rsi, 2))
+			vmovupd(xmm4, mem(r14, r13, 1))
+
+			//lea(mem(r14, rsi, 4), r14)
+
+			jmp(.DDONE) // jump to end.
 
 	label(.DBETAZERO)
 
-	cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8.
-	jz(.DROWSTORBZ) // jump to row storage case
-
-	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ) // jump to column storage case
-
-
-
-	label(.DGENSTORBZ)
-
-
-	vmovapd(ymm4, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	vmovapd(ymm6, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	vmovapd(ymm8, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	vmovapd(ymm10, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	vmovapd(ymm12, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
-
-	vmovapd(ymm14, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-
-
-	mov(rdx, rcx) // rcx = c + 4*cs_c
-
-
-	vmovapd(ymm5, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
-
+		cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
+		jz(.DCOLSTORBZ) // jump to column storage case
 
-	vmovapd(ymm7, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
+			vmovupd(ymm4, mem(rcx))
+			vmovupd(ymm5, mem(rcx,32))
+			add(rdi, rcx)
 
+			vmovupd(ymm6, mem(rcx))
+			vmovupd(ymm7, mem(rcx,32))
+			add(rdi, rcx)
 
-	vmovapd(ymm9, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
 
+			vmovupd(ymm8, mem(rcx))
+			vmovupd(ymm9, mem(rcx,32))
+			add(rdi, rcx)
 
-	vmovapd(ymm11, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
 
+			vmovupd(ymm10, mem(rcx))
+			vmovupd(ymm11, mem(rcx,32))
+			add(rdi, rcx)
 
-	vmovapd(ymm13, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += rs_c;
 
+			vmovupd(ymm12, mem(rcx))
+			vmovupd(ymm13, mem(rcx,32))
+			add(rdi, rcx)
 
-	vmovapd(ymm15, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
 
+			vmovupd(ymm14, mem(rcx))
+			vmovupd(ymm15, mem(rcx,32))
+			//add(rdi, rcx)
 
+			jmp(.DDONE) // jump to end.
 
-	jmp(.DDONE) // jump to end.
+		label(.DCOLSTORBZ)
 
+			vunpcklpd(ymm6, ymm4, ymm0)
+			vunpckhpd(ymm6, ymm4, ymm1)
+			vunpcklpd(ymm10, ymm8, ymm2)
+			vunpckhpd(ymm10, ymm8, ymm3)
+			vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
+			vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
+			vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
+			vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
 
+			vmovupd(ymm4, mem(rcx))
+			vmovupd(ymm6, mem(rcx, rsi, 1))
+			vmovupd(ymm8, mem(rcx, rsi, 2))
+			vmovupd(ymm10, mem(rcx, r13, 1))
 
-	label(.DROWSTORBZ)
+			lea(mem(rcx, rsi, 4), rcx)
 
+			vunpcklpd(ymm14, ymm12, ymm0)
+			vunpckhpd(ymm14, ymm12, ymm1)
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vextractf128(imm(0x1), ymm1, xmm4)
 
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	vmovupd(ymm5, mem(rdx))
-	add(rdi, rdx)
+			vmovupd(xmm0, mem(r14))
+			vmovupd(xmm1, mem(r14, rsi, 1))
+			vmovupd(xmm2, mem(r14, rsi, 2))
+			vmovupd(xmm4, mem(r14, r13, 1))
 
-	vmovupd(ymm6, mem(rcx))
-	add(rdi, rcx)
-	vmovupd(ymm7, mem(rdx))
-	add(rdi, rdx)
+			lea(mem(r14, rsi, 4), r14)
 
 
-	vmovupd(ymm8, mem(rcx))
-	add(rdi, rcx)
-	vmovupd(ymm9, mem(rdx))
-	add(rdi, rdx)
+			vunpcklpd(ymm7, ymm5, ymm0)
+			vunpckhpd(ymm7, ymm5, ymm1)
+			vunpcklpd(ymm11, ymm9, ymm2)
+			vunpckhpd(ymm11, ymm9, ymm3)
+			vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
+			vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
+			vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
+			vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
 
+			vmovupd(ymm5, mem(rcx))
+			vmovupd(ymm7, mem(rcx, rsi, 1))
+			vmovupd(ymm9, mem(rcx, rsi, 2))
+			vmovupd(ymm11, mem(rcx, r13, 1))
 
-	vmovupd(ymm10, mem(rcx))
-	add(rdi, rcx)
-	vmovupd(ymm11, mem(rdx))
-	add(rdi, rdx)
-
-
-	vmovupd(ymm12, mem(rcx))
-	add(rdi, rcx)
-	vmovupd(ymm13, mem(rdx))
-	add(rdi, rdx)
-
-
-	vmovupd(ymm14, mem(rcx))
-	//add(rdi, rcx)
-	vmovupd(ymm15, mem(rdx))
-	//add(rdi, rdx)
-
-
-	jmp(.DDONE) // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, r13, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)
-
-	vunpcklpd(ymm14, ymm12, ymm0)
-	vunpckhpd(ymm14, ymm12, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vmovupd(xmm0, mem(r14))
-	vmovupd(xmm1, mem(r14, rsi, 1))
-	vmovupd(xmm2, mem(r14, rsi, 2))
-	vmovupd(xmm4, mem(r14, r13, 1))
-
-	lea(mem(r14, rsi, 4), r14)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vmovupd(ymm5, mem(rcx))
-	vmovupd(ymm7, mem(rcx, rsi, 1))
-	vmovupd(ymm9, mem(rcx, rsi, 2))
-	vmovupd(ymm11, mem(rcx, r13, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vunpcklpd(ymm15, ymm13, ymm0)
-	vunpckhpd(ymm15, ymm13, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vmovupd(xmm0, mem(r14))
-	vmovupd(xmm1, mem(r14, rsi, 1))
-	vmovupd(xmm2, mem(r14, rsi, 2))
-	vmovupd(xmm4, mem(r14, r13, 1))
-
-	//lea(mem(r14, rsi, 4), r14)
+			//lea(mem(rcx, rsi, 4), rcx)
 
+			vunpcklpd(ymm15, ymm13, ymm0)
+			vunpckhpd(ymm15, ymm13, ymm1)
+			vextractf128(imm(0x1), ymm0, xmm2)
+			vextractf128(imm(0x1), ymm1, xmm4)
 
+			vmovupd(xmm0, mem(r14))
+			vmovupd(xmm1, mem(r14, rsi, 1))
+			vmovupd(xmm2, mem(r14, rsi, 2))
+			vmovupd(xmm4, mem(r14, r13, 1))
 
+			//lea(mem(r14, rsi, 4), r14)
 
 	label(.DDONE)
 
@@ -1641,45 +1301,26 @@ void bli_dgemm_haswell_asm_6x8
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( d );
 }
 
 
-// assumes beta.r, beta.i have been broadcast into ymm1, ymm2.
-// outputs to ymm0
-#define CGEMM_INPUT_SCALE_GS_BETA_NZ \
-	vmovlpd(mem(rcx), xmm0, xmm0) \
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) \
-	vmovlpd(mem(rcx, rsi, 2), xmm3, xmm3) \
-	vmovhpd(mem(rcx, r13, 1), xmm3, xmm3) \
-	vinsertf128(imm(1), xmm3, ymm0, ymm0) \
-	vpermilps(imm(0xb1), ymm0, ymm3) \
-	vmulps(ymm1, ymm0, ymm0) \
-	vmulps(ymm2, ymm3, ymm3) \
-	vaddsubps(ymm3, ymm0, ymm0)
 
-// assumes values to output are in ymm0
-#define CGEMM_OUTPUT_GS \
-	vextractf128(imm(1), ymm0, xmm3) \
-	vmovlpd(xmm0, mem(rcx)) \
-	vmovhpd(xmm0, mem(rcx, rsi, 1)) \
-	vmovlpd(xmm3, mem(rcx, rsi, 2)) \
-	vmovhpd(xmm3, mem(rcx, r13, 1))
-
-#define CGEMM_INPUT_SCALE_RS_BETA_NZ \
-	vmovups(mem(rcx), ymm0) \
+#define CGEMM_INPUT_SCALE_RS_BETA_NZ(where) \
+	vmovups(where, ymm0) \
 	vpermilps(imm(0xb1), ymm0, ymm3) \
 	vmulps(ymm1, ymm0, ymm0) \
 	vmulps(ymm2, ymm3, ymm3) \
 	vaddsubps(ymm3, ymm0, ymm0)
 
-#define CGEMM_OUTPUT_RS \
-	vmovups(ymm0, mem(rcx)) \
-
 void bli_cgemm_haswell_asm_3x8
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        scomplex*  restrict alpha,
        scomplex*  restrict a,
        scomplex*  restrict b,
@@ -1694,11 +1335,13 @@ void bli_cgemm_haswell_asm_3x8
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT( c, 3, 8, true );
+
 	begin_asm()
 
 	vzeroall() // zero all xmm/ymm registers.
@@ -1709,7 +1352,7 @@ void bli_cgemm_haswell_asm_3x8
 	//mov(%9, r15) // load address of b_next.
 
 	add(imm(32*4), rbx)
-	 // initialize loop by pre-loading
+	// initialize loop by pre-loading
 	vmovaps(mem(rbx, -4*32), ymm0)
 	vmovaps(mem(rbx, -3*32), ymm1)
 
@@ -1730,13 +1373,13 @@ void bli_cgemm_haswell_asm_3x8
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.CCONSIDKLEFT) // if i == 0, jump to code that
-	 // contains the k_left loop.
+	// contains the k_left loop.
 
 
 	label(.CLOOPKITER) // MAIN LOOP
 
 
-	 // iteration 0
+	// iteration 0
 	prefetch(0, mem(rax, 32*8))
 
 	vbroadcastss(mem(rax, 0*4), ymm2)
@@ -1763,7 +1406,7 @@ void bli_cgemm_haswell_asm_3x8
 	vmovaps(mem(rbx, -2*32), ymm0)
 	vmovaps(mem(rbx, -1*32), ymm1)
 
-	 // iteration 1
+	// iteration 1
 	vbroadcastss(mem(rax, 6*4), ymm2)
 	vbroadcastss(mem(rax, 7*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
@@ -1788,7 +1431,7 @@ void bli_cgemm_haswell_asm_3x8
 	vmovaps(mem(rbx, 0*32), ymm0)
 	vmovaps(mem(rbx, 1*32), ymm1)
 
-	 // iteration 2
+	// iteration 2
 	prefetch(0, mem(rax, 38*8))
 
 	vbroadcastss(mem(rax, 12*4), ymm2)
@@ -1815,7 +1458,7 @@ void bli_cgemm_haswell_asm_3x8
 	vmovaps(mem(rbx, 2*32), ymm0)
 	vmovaps(mem(rbx, 3*32), ymm1)
 
-	 // iteration 3
+	// iteration 3
 	vbroadcastss(mem(rax, 18*4), ymm2)
 	vbroadcastss(mem(rax, 19*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
@@ -1857,7 +1500,7 @@ void bli_cgemm_haswell_asm_3x8
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
-	 // else, we prepare to enter k_left loop.
+	// else, we prepare to enter k_left loop.
 
 
 	label(.CLOOPKLEFT) // EDGE LOOP
@@ -1900,8 +1543,8 @@ void bli_cgemm_haswell_asm_3x8
 	label(.CPOSTACCUM)
 
 
-	 // permute even and odd elements
-	 // of ymm6/7, ymm10/11, ymm/14/15
+	// permute even and odd elements
+	// of ymm6/7, ymm10/11, ymm14/15
 	vpermilps(imm(0xb1), ymm6, ymm6)
 	vpermilps(imm(0xb1), ymm7, ymm7)
 	vpermilps(imm(0xb1), ymm10, ymm10)
@@ -1910,7 +1553,7 @@ void bli_cgemm_haswell_asm_3x8
 	vpermilps(imm(0xb1), ymm15, ymm15)
 
 
-	 // subtract/add even/odd elements
+	// subtract/add even/odd elements
 	vaddsubps(ymm6, ymm4, ymm4)
 	vaddsubps(ymm7, ymm5, ymm5)
 
@@ -1969,16 +1612,7 @@ void bli_cgemm_haswell_asm_3x8
 	vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate
 
 
-
-
-	mov(var(cs_c), rsi) // load cs_c
-	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(scomplex)
-	lea(mem(, rsi, 4), rdx) // rdx = 4*cs_c;
-	lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c;
-
-
-
-	 // now avoid loading C if beta == 0
+	// now avoid loading C if beta == 0
 	vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
 	vucomiss(xmm0, xmm1) // set ZF if beta_r == 0.
 	sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 );
@@ -1987,162 +1621,49 @@ void bli_cgemm_haswell_asm_3x8
 	and(r8b, r9b) // set ZF if r8b & r9b == 1.
 	jne(.CBETAZERO) // if ZF = 1, jump to beta == 0 case
 
+		CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(rcx))
+		vaddps(ymm4, ymm0, ymm0)
+		vmovups(ymm0, mem(rcx))
 
-	cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8.
-	jz(.CROWSTORED) // jump to row storage case
-
-
-
-	label(.CGENSTORED)
-
-
-	CGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddps(ymm4, ymm0, ymm0)
-	CGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 4*cs_c;
-
-
-	CGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddps(ymm5, ymm0, ymm0)
-	CGEMM_OUTPUT_GS
-	mov(r11, rcx) // rcx = c + 1*rs_c
 
+		CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(rcx,32))
+		vaddps(ymm5, ymm0, ymm0)
+		vmovups(ymm0, mem(rcx,32))
 
 
-	CGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddps(ymm8, ymm0, ymm0)
-	CGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 4*cs_c;
 
+		CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r11))
+		vaddps(ymm8, ymm0, ymm0)
+		vmovups(ymm0, mem(r11))
 
-	CGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddps(ymm9, ymm0, ymm0)
-	CGEMM_OUTPUT_GS
-	mov(r12, rcx) // rcx = c + 2*rs_c
 
+		CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r11,32))
+		vaddps(ymm9, ymm0, ymm0)
+		vmovups(ymm0, mem(r11,32))
 
 
-	CGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddps(ymm12, ymm0, ymm0)
-	CGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 4*cs_c;
 
+		CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r12))
+		vaddps(ymm12, ymm0, ymm0)
+		vmovups(ymm0, mem(r12))
 
-	CGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddps(ymm13, ymm0, ymm0)
-	CGEMM_OUTPUT_GS
-
-
-
-	jmp(.CDONE) // jump to end.
-
-
-
-	label(.CROWSTORED)
-
-
-	CGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddps(ymm4, ymm0, ymm0)
-	CGEMM_OUTPUT_RS
-	add(rdx, rcx) // c += 4*cs_c;
-
-
-	CGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddps(ymm5, ymm0, ymm0)
-	CGEMM_OUTPUT_RS
-	mov(r11, rcx) // rcx = c + 1*rs_c
-
-
-
-	CGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddps(ymm8, ymm0, ymm0)
-	CGEMM_OUTPUT_RS
-	add(rdx, rcx) // c += 4*cs_c;
-
-
-	CGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddps(ymm9, ymm0, ymm0)
-	CGEMM_OUTPUT_RS
-	mov(r12, rcx) // rcx = c + 2*rs_c
-
-
-
-	CGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddps(ymm12, ymm0, ymm0)
-	CGEMM_OUTPUT_RS
-	add(rdx, rcx) // c += 4*cs_c;
-
-
-	CGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddps(ymm13, ymm0, ymm0)
-	CGEMM_OUTPUT_RS
-
-
-
-	jmp(.CDONE) // jump to end.
 
+		CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r12,32))
+		vaddps(ymm13, ymm0, ymm0)
+		vmovups(ymm0, mem(r12,32))
 
+		jmp(.CDONE) // jump to end.
 
 	label(.CBETAZERO)
 
-	cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8.
-	jz(.CROWSTORBZ) // jump to row storage case
-
-
-
-	label(.CGENSTORBZ)
-
-
-	vmovaps(ymm4, ymm0)
-	CGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 2*cs_c;
-
-
-	vmovaps(ymm5, ymm0)
-	CGEMM_OUTPUT_GS
-	mov(r11, rcx) // rcx = c + 1*rs_c
-
-
-
-	vmovaps(ymm8, ymm0)
-	CGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 2*cs_c;
-
-
-	vmovaps(ymm9, ymm0)
-	CGEMM_OUTPUT_GS
-	mov(r12, rcx) // rcx = c + 2*rs_c
-
-
-
-	vmovaps(ymm12, ymm0)
-	CGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 2*cs_c;
-
-
-	vmovaps(ymm13, ymm0)
-	CGEMM_OUTPUT_GS
-
-
-
-	jmp(.CDONE) // jump to end.
-
-
-
-	label(.CROWSTORBZ)
-
-
-	vmovups(ymm4, mem(rcx))
-	vmovups(ymm5, mem(rcx, rdx, 1))
-
-	vmovups(ymm8, mem(r11))
-	vmovups(ymm9, mem(r11, rdx, 1))
-
-	vmovups(ymm12, mem(r12))
-	vmovups(ymm13, mem(r12, rdx, 1))
-
+		vmovups(ymm4, mem(rcx))
+		vmovups(ymm5, mem(rcx,32))
 
+		vmovups(ymm8, mem(r11))
+		vmovups(ymm9, mem(r11,32))
 
+		vmovups(ymm12, mem(r12))
+		vmovups(ymm13, mem(r12,32))
 
 	label(.CDONE)
 
@@ -2174,41 +1695,25 @@ void bli_cgemm_haswell_asm_3x8
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( c );
 }
 
 
-// assumes beta.r, beta.i have been broadcast into ymm1, ymm2.
-// outputs to ymm0
-#define ZGEMM_INPUT_SCALE_GS_BETA_NZ \
-	vmovupd(mem(rcx), xmm0) \
-	vmovupd(mem(rcx, rsi, 1), xmm3) \
-	vinsertf128(imm(1), xmm3, ymm0, ymm0) \
+#define ZGEMM_INPUT_SCALE_RS_BETA_NZ(where) \
+	vmovupd(where, ymm0) \
 	vpermilpd(imm(0x5), ymm0, ymm3) \
 	vmulpd(ymm1, ymm0, ymm0) \
 	vmulpd(ymm2, ymm3, ymm3) \
 	vaddsubpd(ymm3, ymm0, ymm0)
 
-// assumes values to output are in ymm0
-#define ZGEMM_OUTPUT_GS \
-	vextractf128(imm(1), ymm0, xmm3) \
-	vmovupd(xmm0, mem(rcx)) \
-	vmovupd(xmm3, mem(rcx, rsi, 1)) \
-
-#define ZGEMM_INPUT_SCALE_RS_BETA_NZ \
-	vmovupd(mem(rcx), ymm0) \
-	vpermilpd(imm(0x5), ymm0, ymm3) \
-	vmulpd(ymm1, ymm0, ymm0) \
-	vmulpd(ymm2, ymm3, ymm3) \
-	vaddsubpd(ymm3, ymm0, ymm0)
-
-#define ZGEMM_OUTPUT_RS \
-	vmovupd(ymm0, mem(rcx)) \
-
 void bli_zgemm_haswell_asm_3x4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        dcomplex*  restrict alpha,
        dcomplex*  restrict a,
        dcomplex*  restrict b,
@@ -2223,11 +1728,13 @@ void bli_zgemm_haswell_asm_3x4
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT( z, 3, 4, true );
+
 	begin_asm()
 
 	vzeroall() // zero all xmm/ymm registers.
@@ -2238,7 +1745,7 @@ void bli_zgemm_haswell_asm_3x4
 	//mov(%9, r15) // load address of b_next.
 
 	add(imm(32*4), rbx)
-	 // initialize loop by pre-loading
+	// initialize loop by pre-loading
 	vmovapd(mem(rbx, -4*32), ymm0)
 	vmovapd(mem(rbx, -3*32), ymm1)
 
@@ -2260,13 +1767,13 @@ void bli_zgemm_haswell_asm_3x4
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.ZCONSIDKLEFT) // if i == 0, jump to code that
-	 // contains the k_left loop.
+	// contains the k_left loop.
 
 
 	label(.ZLOOPKITER) // MAIN LOOP
 
 
-	 // iteration 0
+	// iteration 0
 	prefetch(0, mem(rax, 32*16))
 
 	vbroadcastsd(mem(rax, 0*8), ymm2)
@@ -2293,7 +1800,7 @@ void bli_zgemm_haswell_asm_3x4
 	vmovapd(mem(rbx, -2*32), ymm0)
 	vmovapd(mem(rbx, -1*32), ymm1)
 
-	 // iteration 1
+	// iteration 1
 	prefetch(0, mem(rax, 36*16))
 
 	vbroadcastsd(mem(rax, 6*8), ymm2)
@@ -2320,7 +1827,7 @@ void bli_zgemm_haswell_asm_3x4
 	vmovapd(mem(rbx, 0*32), ymm0)
 	vmovapd(mem(rbx, 1*32), ymm1)
 
-	 // iteration 2
+	// iteration 2
 	prefetch(0, mem(rax, 40*16))
 
 	vbroadcastsd(mem(rax, 12*8), ymm2)
@@ -2347,7 +1854,7 @@ void bli_zgemm_haswell_asm_3x4
 	vmovapd(mem(rbx, 2*32), ymm0)
 	vmovapd(mem(rbx, 3*32), ymm1)
 
-	 // iteration 3
+	// iteration 3
 	vbroadcastsd(mem(rax, 18*8), ymm2)
 	vbroadcastsd(mem(rax, 19*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
@@ -2389,7 +1896,7 @@ void bli_zgemm_haswell_asm_3x4
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
-	 // else, we prepare to enter k_left loop.
+	// else, we prepare to enter k_left loop.
 
 
 	label(.ZLOOPKLEFT) // EDGE LOOP
@@ -2431,8 +1938,8 @@ void bli_zgemm_haswell_asm_3x4
 
 	label(.ZPOSTACCUM)
 
-	 // permute even and odd elements
-	 // of ymm6/7, ymm10/11, ymm/14/15
+	// permute even and odd elements
+	// of ymm6/7, ymm10/11, ymm14/15
 	vpermilpd(imm(0x5), ymm6, ymm6)
 	vpermilpd(imm(0x5), ymm7, ymm7)
 	vpermilpd(imm(0x5), ymm10, ymm10)
@@ -2441,7 +1948,7 @@ void bli_zgemm_haswell_asm_3x4
 	vpermilpd(imm(0x5), ymm15, ymm15)
 
 
-	 // subtract/add even/odd elements
+	// subtract/add even/odd elements
 	vaddsubpd(ymm6, ymm4, ymm4)
 	vaddsubpd(ymm7, ymm5, ymm5)
 
@@ -2501,15 +2008,7 @@ void bli_zgemm_haswell_asm_3x4
 
 
-
-	mov(var(cs_c), rsi) // load cs_c
-	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dcomplex)
-	lea(mem(, rsi, 2), rsi)
-	lea(mem(, rsi, 2), rdx) // rdx = 2*cs_c;
-
-
-
-	 // now avoid loading C if beta == 0
+	// now avoid loading C if beta == 0
 	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
 	vucomisd(xmm0, xmm1) // set ZF if beta_r == 0.
 	sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 );
@@ -2518,162 +2017,49 @@ void bli_zgemm_haswell_asm_3x4
 	and(r8b, r9b) // set ZF if r8b & r9b == 1.
 	jne(.ZBETAZERO) // if ZF = 1, jump to beta == 0 case
 
-
-	cmp(imm(16), rsi) // set ZF if (16*cs_c) == 16.
-	jz(.ZROWSTORED) // jump to row storage case
-
-
-
-	label(.ZGENSTORED)
-
-
-	ZGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddpd(ymm4, ymm0, ymm0)
-	ZGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 2*cs_c;
-
-
-	ZGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddpd(ymm5, ymm0, ymm0)
-	ZGEMM_OUTPUT_GS
-	mov(r11, rcx) // rcx = c + 1*rs_c
-
+		ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(rcx))
+		vaddpd(ymm4, ymm0, ymm0)
+		vmovupd(ymm0, mem(rcx))
 
 
-	ZGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddpd(ymm8, ymm0, ymm0)
-	ZGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 2*cs_c;
+		ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(rcx,32))
+		vaddpd(ymm5, ymm0, ymm0)
+		vmovupd(ymm0, mem(rcx,32))
 
 
-	ZGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddpd(ymm9, ymm0, ymm0)
-	ZGEMM_OUTPUT_GS
-	mov(r12, rcx) // rcx = c + 2*rs_c
 
+		ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r11))
+		vaddpd(ymm8, ymm0, ymm0)
+		vmovupd(ymm0, mem(r11))
 
 
-	ZGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddpd(ymm12, ymm0, ymm0)
-	ZGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 2*cs_c;
+		ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r11,32))
+		vaddpd(ymm9, ymm0, ymm0)
+		vmovupd(ymm0, mem(r11,32))
 
 
-	ZGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddpd(ymm13, ymm0, ymm0)
-	ZGEMM_OUTPUT_GS
 
+		ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r12))
+		vaddpd(ymm12, ymm0, ymm0)
+		vmovupd(ymm0, mem(r12))
 
 
-	jmp(.ZDONE) // jump to end.
-
-
-
-	label(.ZROWSTORED)
-
-
-	ZGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddpd(ymm4, ymm0, ymm0)
-	ZGEMM_OUTPUT_RS
-	add(rdx, rcx) // c += 2*cs_c;
-
-
-	ZGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddpd(ymm5, ymm0, ymm0)
-	ZGEMM_OUTPUT_RS
-	mov(r11, rcx) // rcx = c + 1*rs_c
-
-
-
-	ZGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddpd(ymm8, ymm0, ymm0)
-	ZGEMM_OUTPUT_RS
-	add(rdx, rcx) // c += 2*cs_c;
-
-
-	ZGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddpd(ymm9, ymm0, ymm0)
-	ZGEMM_OUTPUT_RS
-	mov(r12, rcx) // rcx = c + 2*rs_c
-
-
-
-	ZGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddpd(ymm12, ymm0, ymm0)
-	ZGEMM_OUTPUT_RS
-	add(rdx, rcx) // c += 2*cs_c;
-
-
-	ZGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddpd(ymm13, ymm0, ymm0)
-	ZGEMM_OUTPUT_RS
-
-
-
-	jmp(.ZDONE) // jump to end.
-
+		ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r12,32))
+		vaddpd(ymm13, ymm0, ymm0)
+		vmovupd(ymm0, mem(r12,32))
 
+		jmp(.ZDONE) // jump to end.
 
 	label(.ZBETAZERO)
 
-	cmp(imm(16), rsi) // set ZF if (16*cs_c) == 16.
-	jz(.ZROWSTORBZ) // jump to row storage case
-
-
-
-	label(.ZGENSTORBZ)
-
-
-	vmovapd(ymm4, ymm0)
-	ZGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 2*cs_c;
-
-
-	vmovapd(ymm5, ymm0)
-	ZGEMM_OUTPUT_GS
-	mov(r11, rcx) // rcx = c + 1*rs_c
-
-
-
-	vmovapd(ymm8, ymm0)
-	ZGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 2*cs_c;
-
-
-	vmovapd(ymm9, ymm0)
-	ZGEMM_OUTPUT_GS
-	mov(r12, rcx) // rcx = c + 2*rs_c
-
-
-
-	vmovapd(ymm12, ymm0)
-	ZGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 2*cs_c;
-
-
-	vmovapd(ymm13, ymm0)
-	ZGEMM_OUTPUT_GS
-
-
-
-	jmp(.ZDONE) // jump to end.
-
-
-
-	label(.ZROWSTORBZ)
-
-
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm5, mem(rcx, rdx, 1))
-
-	vmovupd(ymm8, mem(r11))
-	vmovupd(ymm9, mem(r11, rdx, 1))
-
-	vmovupd(ymm12, mem(r12))
-	vmovupd(ymm13, mem(r12, rdx, 1))
-
+		vmovupd(ymm4, mem(rcx))
+		vmovupd(ymm5, mem(rcx,32))
 
+		vmovupd(ymm8, mem(r11))
+		vmovupd(ymm9, mem(r11,32))
 
+		vmovupd(ymm12, mem(r12))
+		vmovupd(ymm13, mem(r12,32))
 
 	label(.ZDONE)
 
@@ -2705,6 +2091,8 @@ void bli_zgemm_haswell_asm_3x4
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( z );
 }
 
 
diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
index b074da965..a3a8b0b09 100644
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
@@ -78,7 +78,9 @@
 
 void bli_sgemm_haswell_asm_16x6
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        float*     restrict alpha,
        float*     restrict a,
        float*     restrict b,
@@ -93,29 +95,31 @@ void bli_sgemm_haswell_asm_16x6
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT( s, 16, 6, false ); // same flag as the d8x6/c8x3 kernels in this file, which share this column-wise store of C
+
 	begin_asm()
-	
+
 	vzeroall() // zero all xmm/ymm registers.
-	
-	
+
+
 	mov(var(a), rax) // load address of a.
 	mov(var(b), rbx) // load address of b.
 	//mov(%9, r15) // load address of b_next.
-	
+
 	add(imm(32*4), rax)
 	 // initialize loop by pre-loading
 	vmovaps(mem(rax, -4*32), ymm0)
 	vmovaps(mem(rax, -3*32), ymm1)
-	
+
 	mov(var(c), rcx) // load address of c
 	mov(var(cs_c), rdi) // load cs_c
 	lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
-	
+
 	lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c;
 	lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*cs_c;
 	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c
@@ -124,46 +128,46 @@ void bli_sgemm_haswell_asm_16x6
 	prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*cs_c
 	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*cs_c
 	prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*cs_c
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.SCONSIDKLEFT) // if i == 0, jump to code that
 	 // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER) // MAIN LOOP
-	
-	
+
+
 	 // iteration 0
 	prefetch(0, mem(rax, 128*4))
-	
+
 	vbroadcastss(mem(rbx, 0*4), ymm2)
 	vbroadcastss(mem(rbx, 1*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rbx, 2*4), ymm2)
 	vbroadcastss(mem(rbx, 3*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rbx, 4*4), ymm2)
 	vbroadcastss(mem(rbx, 5*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
+
 	vmovaps(mem(rax, -2*32), ymm0)
 	vmovaps(mem(rax, -1*32), ymm1)
-	
+
 	 // iteration 1
 	vbroadcastss(mem(rbx, 6*4), ymm2)
 	vbroadcastss(mem(rbx, 7*4), ymm3)
@@ -171,51 +175,51 @@ void bli_sgemm_haswell_asm_16x6
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rbx, 8*4), ymm2)
 	vbroadcastss(mem(rbx, 9*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rbx, 10*4), ymm2)
 	vbroadcastss(mem(rbx, 11*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
+
 	vmovaps(mem(rax, 0*32), ymm0)
 	vmovaps(mem(rax, 1*32), ymm1)
-	
+
 	 // iteration 2
 	prefetch(0, mem(rax, 152*4))
-	
+
 	vbroadcastss(mem(rbx, 12*4), ymm2)
 	vbroadcastss(mem(rbx, 13*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rbx, 14*4), ymm2)
 	vbroadcastss(mem(rbx, 15*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rbx, 16*4), ymm2)
 	vbroadcastss(mem(rbx, 17*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
+
 	vmovaps(mem(rax, 2*32), ymm0)
 	vmovaps(mem(rax, 3*32), ymm1)
-	
+
 	 // iteration 3
 	vbroadcastss(mem(rbx, 18*4), ymm2)
 	vbroadcastss(mem(rbx, 19*4), ymm3)
@@ -223,91 +227,91 @@ void bli_sgemm_haswell_asm_16x6
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rbx, 20*4), ymm2)
 	vbroadcastss(mem(rbx, 21*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rbx, 22*4), ymm2)
 	vbroadcastss(mem(rbx, 23*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
+
 	add(imm(4*16*4), rax) // a += 4*16 (unroll x mr)
 	add(imm(4*6*4), rbx) // b += 4*6  (unroll x nr)
-	
+
 	vmovaps(mem(rax, -4*32), ymm0)
 	vmovaps(mem(rax, -3*32), ymm1)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jne(.SLOOPKITER) // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
 	 // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT) // EDGE LOOP
-	
+
 	prefetch(0, mem(rax, 128*4))
-	
+
 	vbroadcastss(mem(rbx, 0*4), ymm2)
 	vbroadcastss(mem(rbx, 1*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rbx, 2*4), ymm2)
 	vbroadcastss(mem(rbx, 3*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rbx, 4*4), ymm2)
 	vbroadcastss(mem(rbx, 5*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
+
 	add(imm(1*16*4), rax) // a += 1*16 (unroll x mr)
 	add(imm(1*6*4), rbx) // b += 1*6  (unroll x nr)
-	
+
 	vmovaps(mem(rax, -4*32), ymm0)
 	vmovaps(mem(rax, -3*32), ymm1)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jne(.SLOOPKLEFT) // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
-	
-	
-	
-	
+
+
+
+
 	mov(var(alpha), rax) // load address of alpha
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3) // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4) // scale by alpha
 	vmulps(ymm0, ymm5, ymm5)
 	vmulps(ymm0, ymm6, ymm6)
@@ -320,315 +324,107 @@ void bli_sgemm_haswell_asm_16x6
 	vmulps(ymm0, ymm13, ymm13)
 	vmulps(ymm0, ymm14, ymm14)
 	vmulps(ymm0, ymm15, ymm15)
-	
-	
-	
-	
-	
-	
-	mov(var(rs_c), rsi) // load rs_c
-	lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
-	
-	lea(mem(rcx, rsi, 8), rdx) // load address of c +  8*rs_c;
-	
-	lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c;
-	lea(mem(rsi, rsi, 4), r15) // r15 = 5*rs_c;
-	lea(mem(r13, rsi, 4), r10) // r10 = 7*rs_c;
-	
-	
+
+
 	 // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
 	vucomiss(xmm0, xmm3) // set ZF if beta == 0.
 	je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case
-	
-	
-	cmp(imm(4), rsi) // set ZF if (4*rs_c) == 4.
-	jz(.SCOLSTORED) // jump to column storage case
-	
-	
-	
-	label(.SGENSTORED)
-	
-	
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm4, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm6, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm8, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm10, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm12, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm14, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	//add(rdi, rcx) // c += cs_c;
-	
-	
-	mov(rdx, rcx) // rcx = c + 8*rs_c
-	
-	
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm5, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm7, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm9, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm11, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm13, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	SGEMM_INPUT_GS_BETA_NZ
-	vfmadd213ps(ymm15, ymm3, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	//add(rdi, rcx) // c += cs_c;
-	
-	
-	
-	jmp(.SDONE) // jump to end.
-	
-	
-	
-	label(.SCOLSTORED)
-	
-	
-	vfmadd231ps(mem(rcx), ymm3, ymm4)
-	vmovups(ymm4, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231ps(mem(rdx), ymm3, ymm5)
-	vmovups(ymm5, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vfmadd231ps(mem(rcx), ymm3, ymm6)
-	vmovups(ymm6, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231ps(mem(rdx), ymm3, ymm7)
-	vmovups(ymm7, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vfmadd231ps(mem(rcx), ymm3, ymm8)
-	vmovups(ymm8, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231ps(mem(rdx), ymm3, ymm9)
-	vmovups(ymm9, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vfmadd231ps(mem(rcx), ymm3, ymm10)
-	vmovups(ymm10, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231ps(mem(rdx), ymm3, ymm11)
-	vmovups(ymm11, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vfmadd231ps(mem(rcx), ymm3, ymm12)
-	vmovups(ymm12, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231ps(mem(rdx), ymm3, ymm13)
-	vmovups(ymm13, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vfmadd231ps(mem(rcx), ymm3, ymm14)
-	vmovups(ymm14, mem(rcx))
-	//add(rdi, rcx)
-	vfmadd231ps(mem(rdx), ymm3, ymm15)
-	vmovups(ymm15, mem(rdx))
-	//add(rdi, rdx)
-	
-	
-	
-	
+
+		vfmadd231ps(mem(rcx), ymm3, ymm4)
+		vmovups(ymm4, mem(rcx))
+		vfmadd231ps(mem(rcx,32), ymm3, ymm5)
+		vmovups(ymm5, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vfmadd231ps(mem(rcx), ymm3, ymm6)
+		vmovups(ymm6, mem(rcx))
+		vfmadd231ps(mem(rcx,32), ymm3, ymm7)
+		vmovups(ymm7, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vfmadd231ps(mem(rcx), ymm3, ymm8)
+		vmovups(ymm8, mem(rcx))
+		vfmadd231ps(mem(rcx,32), ymm3, ymm9)
+		vmovups(ymm9, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vfmadd231ps(mem(rcx), ymm3, ymm10)
+		vmovups(ymm10, mem(rcx))
+		vfmadd231ps(mem(rcx,32), ymm3, ymm11)
+		vmovups(ymm11, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vfmadd231ps(mem(rcx), ymm3, ymm12)
+		vmovups(ymm12, mem(rcx))
+		vfmadd231ps(mem(rcx,32), ymm3, ymm13)
+		vmovups(ymm13, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vfmadd231ps(mem(rcx), ymm3, ymm14)
+		vmovups(ymm14, mem(rcx))
+		vfmadd231ps(mem(rcx,32), ymm3, ymm15)
+		vmovups(ymm15, mem(rcx,32))
+		//add(rdi, rcx)
+
 	jmp(.SDONE) // jump to end.
-	
-	
-	
+
 	label(.SBETAZERO)
-	
-	cmp(imm(4), rsi) // set ZF if (4*rs_c) == 4.
-	jz(.SCOLSTORBZ) // jump to column storage case
-	
-	
-	
-	label(.SGENSTORBZ)
-	
-	
-	vmovaps(ymm4, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovaps(ymm6, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovaps(ymm8, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovaps(ymm10, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovaps(ymm12, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovaps(ymm14, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	//add(rdi, rcx) // c += cs_c;
-	
-	
-	mov(rdx, rcx) // rcx = c + 8*rs_c
-	
-	
-	vmovaps(ymm5, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovaps(ymm7, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovaps(ymm9, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovaps(ymm11, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovaps(ymm13, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovaps(ymm15, ymm0)
-	SGEMM_OUTPUT_GS_BETA_NZ
-	//add(rdi, rcx) // c += cs_c;
-	
-	
-	
-	jmp(.SDONE) // jump to end.
-	
-	
-	
-	label(.SCOLSTORBZ)
-	
-	
-	vmovups(ymm4, mem(rcx))
-	add(rdi, rcx)
-	vmovups(ymm5, mem(rdx))
-	add(rdi, rdx)
-	
-	vmovups(ymm6, mem(rcx))
-	add(rdi, rcx)
-	vmovups(ymm7, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vmovups(ymm8, mem(rcx))
-	add(rdi, rcx)
-	vmovups(ymm9, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vmovups(ymm10, mem(rcx))
-	add(rdi, rcx)
-	vmovups(ymm11, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vmovups(ymm12, mem(rcx))
-	add(rdi, rcx)
-	vmovups(ymm13, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vmovups(ymm14, mem(rcx))
-	//add(rdi, rcx)
-	vmovups(ymm15, mem(rdx))
-	//add(rdi, rdx)
-	
-	
-	
-	
-	
-	
-	
+
+		vmovups(ymm4, mem(rcx))
+		vmovups(ymm5, mem(rcx,32))
+		add(rdi, rcx)
+
+		vmovups(ymm6, mem(rcx))
+		vmovups(ymm7, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vmovups(ymm8, mem(rcx))
+		vmovups(ymm9, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vmovups(ymm10, mem(rcx))
+		vmovups(ymm11, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vmovups(ymm12, mem(rcx))
+		vmovups(ymm13, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vmovups(ymm14, mem(rcx))
+		vmovups(ymm15, mem(rcx,32))
+		//add(rdi, rcx)
+
 	label(.SDONE)
-	
-	
+
+
 
 	end_asm(
 	: // output operands (none)
 	: // input operands
-      [k_iter] "m" (k_iter), // 0
-      [k_left] "m" (k_left), // 1
-      [a]      "m" (a),      // 2
-      [b]      "m" (b),      // 3
-      [alpha]  "m" (alpha),  // 4
-      [beta]   "m" (beta),   // 5
-      [c]      "m" (c),      // 6
-      [rs_c]   "m" (rs_c),   // 7
-      [cs_c]   "m" (cs_c)/*,   // 8
-      [b_next] "m" (b_next), // 9
-      [a_next] "m" (a_next)*/  // 10
+	  [k_iter] "m" (k_iter), // 0
+	  [k_left] "m" (k_left), // 1
+	  [a]      "m" (a),      // 2
+	  [b]      "m" (b),      // 3
+	  [alpha]  "m" (alpha),  // 4
+	  [beta]   "m" (beta),   // 5
+	  [c]      "m" (c),      // 6
+	  [rs_c]   "m" (rs_c),   // 7
+	  [cs_c]   "m" (cs_c)/*,   // 8
+	  [b_next] "m" (b_next), // 9
+	  [a_next] "m" (a_next)*/  // 10
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
@@ -636,6 +432,8 @@ void bli_sgemm_haswell_asm_16x6
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( s );
 }
 
 #define DGEMM_INPUT_GS_BETA_NZ \
@@ -664,7 +462,9 @@ void bli_sgemm_haswell_asm_16x6
 
 void bli_dgemm_haswell_asm_8x6
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        double*    restrict alpha,
        double*    restrict a,
        double*    restrict b,
@@ -679,29 +479,31 @@ void bli_dgemm_haswell_asm_8x6
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT( d, 8, 6, false );
+
 	begin_asm()
-	
+
 	vzeroall() // zero all xmm/ymm registers.
-	
-	
+
+
 	mov(var(a), rax) // load address of a.
 	mov(var(b), rbx) // load address of b.
 	//mov(%9, r15) // load address of b_next.
-	
+
 	add(imm(32*4), rax)
 	 // initialize loop by pre-loading
 	vmovapd(mem(rax, -4*32), ymm0)
 	vmovapd(mem(rax, -3*32), ymm1)
-	
+
 	mov(var(c), rcx) // load address of c
 	mov(var(cs_c), rdi) // load cs_c
 	lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
-	
+
 	lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c;
 	lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*cs_c;
 	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c
@@ -710,46 +512,46 @@ void bli_dgemm_haswell_asm_8x6
 	prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*cs_c
 	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*cs_c
 	prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*cs_c
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.DCONSIDKLEFT) // if i == 0, jump to code that
 	 // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER) // MAIN LOOP
-	
-	
+
+
 	 // iteration 0
 	prefetch(0, mem(rax, 64*8))
-	
+
 	vbroadcastsd(mem(rbx, 0*8), ymm2)
 	vbroadcastsd(mem(rbx, 1*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rbx, 2*8), ymm2)
 	vbroadcastsd(mem(rbx, 3*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rbx, 4*8), ymm2)
 	vbroadcastsd(mem(rbx, 5*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
+
 	vmovapd(mem(rax, -2*32), ymm0)
 	vmovapd(mem(rax, -1*32), ymm1)
-	
+
 	 // iteration 1
 	vbroadcastsd(mem(rbx, 6*8), ymm2)
 	vbroadcastsd(mem(rbx, 7*8), ymm3)
@@ -757,51 +559,51 @@ void bli_dgemm_haswell_asm_8x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rbx, 8*8), ymm2)
 	vbroadcastsd(mem(rbx, 9*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rbx, 10*8), ymm2)
 	vbroadcastsd(mem(rbx, 11*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
+
 	vmovapd(mem(rax, 0*32), ymm0)
 	vmovapd(mem(rax, 1*32), ymm1)
-	
+
 	 // iteration 2
 	prefetch(0, mem(rax, 76*8))
-	
+
 	vbroadcastsd(mem(rbx, 12*8), ymm2)
 	vbroadcastsd(mem(rbx, 13*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rbx, 14*8), ymm2)
 	vbroadcastsd(mem(rbx, 15*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rbx, 16*8), ymm2)
 	vbroadcastsd(mem(rbx, 17*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
+
 	vmovapd(mem(rax, 2*32), ymm0)
 	vmovapd(mem(rax, 3*32), ymm1)
-	
+
 	 // iteration 3
 	vbroadcastsd(mem(rbx, 18*8), ymm2)
 	vbroadcastsd(mem(rbx, 19*8), ymm3)
@@ -809,91 +611,91 @@ void bli_dgemm_haswell_asm_8x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rbx, 20*8), ymm2)
 	vbroadcastsd(mem(rbx, 21*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rbx, 22*8), ymm2)
 	vbroadcastsd(mem(rbx, 23*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
+
 	add(imm(4*8*8), rax) // a += 4*8 (unroll x mr)
 	add(imm(4*6*8), rbx) // b += 4*6 (unroll x nr)
-	
+
 	vmovapd(mem(rax, -4*32), ymm0)
 	vmovapd(mem(rax, -3*32), ymm1)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jne(.DLOOPKITER) // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
 	 // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT) // EDGE LOOP
-	
+
 	prefetch(0, mem(rax, 64*8))
-	
+
 	vbroadcastsd(mem(rbx, 0*8), ymm2)
 	vbroadcastsd(mem(rbx, 1*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rbx, 2*8), ymm2)
 	vbroadcastsd(mem(rbx, 3*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rbx, 4*8), ymm2)
 	vbroadcastsd(mem(rbx, 5*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
+
 	add(imm(1*8*8), rax) // a += 1*8 (unroll x mr)
 	add(imm(1*6*8), rbx) // b += 1*6 (unroll x nr)
-	
+
 	vmovapd(mem(rax, -4*32), ymm0)
 	vmovapd(mem(rax, -3*32), ymm1)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jne(.DLOOPKLEFT) // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
-	
-	
-	
-	
+
+
+
+
 	mov(var(alpha), rax) // load address of alpha
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4) // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
@@ -906,314 +708,107 @@ void bli_dgemm_haswell_asm_8x6
 	vmulpd(ymm0, ymm13, ymm13)
 	vmulpd(ymm0, ymm14, ymm14)
 	vmulpd(ymm0, ymm15, ymm15)
-	
-	
-	
-	
-	
-	
-	mov(var(rs_c), rsi) // load rs_c
-	lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double)
-	
-	lea(mem(rcx, rsi, 4), rdx) // load address of c +  4*rs_c;
-	
-	lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c;
-	//lea(mem(rsi, rsi, 4), r15) // r15 = 5*rs_c;
-	//lea(mem(r13, rsi, 4), r10) // r10 = 7*rs_c;
-	
-	
+
+
 	 // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
 	vucomisd(xmm0, xmm3) // set ZF if beta == 0.
 	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
-	
-	
-	cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED) // jump to column storage case
-	
-	
-	
-	label(.DGENSTORED)
-	
-	
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm4, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm6, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm8, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm10, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm12, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm14, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	//add(rdi, rcx) // c += cs_c;
-	
-	
-	mov(rdx, rcx) // rcx = c + 4*rs_c
-	
-	
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm5, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm7, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm9, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm11, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm13, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	DGEMM_INPUT_GS_BETA_NZ
-	vfmadd213pd(ymm15, ymm3, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	//add(rdi, rcx) // c += cs_c;
-	
-	
-	
-	jmp(.DDONE) // jump to end.
-	
-	
-	
-	label(.DCOLSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231pd(mem(rdx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231pd(mem(rdx), ymm3, ymm7)
-	vmovupd(ymm7, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm8)
-	vmovupd(ymm8, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231pd(mem(rdx), ymm3, ymm9)
-	vmovupd(ymm9, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm10)
-	vmovupd(ymm10, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231pd(mem(rdx), ymm3, ymm11)
-	vmovupd(ymm11, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm12)
-	vmovupd(ymm12, mem(rcx))
-	add(rdi, rcx)
-	vfmadd231pd(mem(rdx), ymm3, ymm13)
-	vmovupd(ymm13, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm14)
-	vmovupd(ymm14, mem(rcx))
-	//add(rdi, rcx)
-	vfmadd231pd(mem(rdx), ymm3, ymm15)
-	vmovupd(ymm15, mem(rdx))
-	//add(rdi, rdx)
-	
-	
-	
-	jmp(.DDONE) // jump to end.
-	
-	
-	
+
+		vfmadd231pd(mem(rcx), ymm3, ymm4)
+		vmovupd(ymm4, mem(rcx))
+		vfmadd231pd(mem(rcx,32), ymm3, ymm5)
+		vmovupd(ymm5, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vfmadd231pd(mem(rcx), ymm3, ymm6)
+		vmovupd(ymm6, mem(rcx))
+		vfmadd231pd(mem(rcx,32), ymm3, ymm7)
+		vmovupd(ymm7, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vfmadd231pd(mem(rcx), ymm3, ymm8)
+		vmovupd(ymm8, mem(rcx))
+		vfmadd231pd(mem(rcx,32), ymm3, ymm9)
+		vmovupd(ymm9, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vfmadd231pd(mem(rcx), ymm3, ymm10)
+		vmovupd(ymm10, mem(rcx))
+		vfmadd231pd(mem(rcx,32), ymm3, ymm11)
+		vmovupd(ymm11, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vfmadd231pd(mem(rcx), ymm3, ymm12)
+		vmovupd(ymm12, mem(rcx))
+		vfmadd231pd(mem(rcx,32), ymm3, ymm13)
+		vmovupd(ymm13, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vfmadd231pd(mem(rcx), ymm3, ymm14)
+		vmovupd(ymm14, mem(rcx))
+		vfmadd231pd(mem(rcx,32), ymm3, ymm15)
+		vmovupd(ymm15, mem(rcx,32))
+		//add(rdi, rcx)
+
+		jmp(.DDONE) // jump to end.
+
 	label(.DBETAZERO)
-	
-	cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ) // jump to column storage case
-	
-	
-	
-	label(.DGENSTORBZ)
-	
-	
-	vmovapd(ymm4, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovapd(ymm6, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovapd(ymm8, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovapd(ymm10, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovapd(ymm12, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovapd(ymm14, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	//add(rdi, rcx) // c += cs_c;
-	
-	
-	mov(rdx, rcx) // rcx = c + 4*rs_c
-	
-	
-	vmovapd(ymm5, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovapd(ymm7, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovapd(ymm9, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovapd(ymm11, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovapd(ymm13, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	add(rdi, rcx) // c += cs_c;
-	
-	
-	vmovapd(ymm15, ymm0)
-	DGEMM_OUTPUT_GS_BETA_NZ
-	//add(rdi, rcx) // c += cs_c;
-	
-	
-	
-	jmp(.DDONE) // jump to end.
-	
-	
-	
-	label(.DCOLSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	vmovupd(ymm5, mem(rdx))
-	add(rdi, rdx)
-	
-	vmovupd(ymm6, mem(rcx))
-	add(rdi, rcx)
-	vmovupd(ymm7, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vmovupd(ymm8, mem(rcx))
-	add(rdi, rcx)
-	vmovupd(ymm9, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vmovupd(ymm10, mem(rcx))
-	add(rdi, rcx)
-	vmovupd(ymm11, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vmovupd(ymm12, mem(rcx))
-	add(rdi, rcx)
-	vmovupd(ymm13, mem(rdx))
-	add(rdi, rdx)
-	
-	
-	vmovupd(ymm14, mem(rcx))
-	//add(rdi, rcx)
-	vmovupd(ymm15, mem(rdx))
-	//add(rdi, rdx)
-	
-	
-	
-	
-	
-	
-	
+
+		vmovupd(ymm4, mem(rcx))
+		vmovupd(ymm5, mem(rcx,32))
+		add(rdi, rcx)
+
+		vmovupd(ymm6, mem(rcx))
+		vmovupd(ymm7, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vmovupd(ymm8, mem(rcx))
+		vmovupd(ymm9, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vmovupd(ymm10, mem(rcx))
+		vmovupd(ymm11, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vmovupd(ymm12, mem(rcx))
+		vmovupd(ymm13, mem(rcx,32))
+		add(rdi, rcx)
+
+
+		vmovupd(ymm14, mem(rcx))
+		vmovupd(ymm15, mem(rcx,32))
+		//add(rdi, rcx)
+
 	label(.DDONE)
-	
-	
 
-    end_asm(
+
+
+	end_asm(
 	: // output operands (none)
 	: // input operands
-      [k_iter] "m" (k_iter), // 0
-      [k_left] "m" (k_left), // 1
-      [a]      "m" (a),      // 2
-      [b]      "m" (b),      // 3
-      [alpha]  "m" (alpha),  // 4
-      [beta]   "m" (beta),   // 5
-      [c]      "m" (c),      // 6
-      [rs_c]   "m" (rs_c),   // 7
-      [cs_c]   "m" (cs_c)/*,   // 8
-      [b_next] "m" (b_next), // 9
-      [a_next] "m" (a_next)*/  // 10
+	  [k_iter] "m" (k_iter), // 0
+	  [k_left] "m" (k_left), // 1
+	  [a]      "m" (a),      // 2
+	  [b]      "m" (b),      // 3
+	  [alpha]  "m" (alpha),  // 4
+	  [beta]   "m" (beta),   // 5
+	  [c]      "m" (c),      // 6
+	  [rs_c]   "m" (rs_c),   // 7
+	  [cs_c]   "m" (cs_c)/*,   // 8
+	  [b_next] "m" (b_next), // 9
+	  [a_next] "m" (a_next)*/  // 10
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
@@ -1221,45 +816,25 @@ void bli_dgemm_haswell_asm_8x6
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( d );
 }
 
 
-// assumes beta.r, beta.i have been broadcast into ymm1, ymm2.
-// outputs to ymm0
-#define CGEMM_INPUT_SCALE_GS_BETA_NZ \
-	vmovlpd(mem(rcx), xmm0, xmm0) \
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) \
-	vmovlpd(mem(rcx, rsi, 2), xmm3, xmm3) \
-	vmovhpd(mem(rcx, r13, 1), xmm3, xmm3) \
-	vinsertf128(imm(1), xmm3, ymm0, ymm0) \
+#define CGEMM_INPUT_SCALE_CS_BETA_NZ(where) \
+	vmovups(where, ymm0) \
 	vpermilps(imm(0xb1), ymm0, ymm3) \
 	vmulps(ymm1, ymm0, ymm0) \
 	vmulps(ymm2, ymm3, ymm3) \
 	vaddsubps(ymm3, ymm0, ymm0)
 
-// assumes values to output are in ymm0
-#define CGEMM_OUTPUT_GS \
-	vextractf128(imm(1), ymm0, xmm3) \
-	vmovlpd(xmm0, mem(rcx)) \
-	vmovhpd(xmm0, mem(rcx, rsi, 1)) \
-	vmovlpd(xmm3, mem(rcx, rsi, 2)) \
-	vmovhpd(xmm3, mem(rcx, r13, 1))
-
-#define CGEMM_INPUT_SCALE_CS_BETA_NZ \
-	vmovups(mem(rcx), ymm0) \
-	vpermilps(imm(0xb1), ymm0, ymm3) \
-	vmulps(ymm1, ymm0, ymm0) \
-	vmulps(ymm2, ymm3, ymm3) \
-	vaddsubps(ymm3, ymm0, ymm0)
-	
-#define CGEMM_OUTPUT_CS \
-	vmovups(ymm0, mem(rcx)) \
-
 void bli_cgemm_haswell_asm_8x3
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        scomplex*  restrict alpha,
        scomplex*  restrict a,
        scomplex*  restrict b,
@@ -1274,75 +849,77 @@ void bli_cgemm_haswell_asm_8x3
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT( c, 8, 3, false );
+
 	begin_asm()
-	
+
 	vzeroall() // zero all xmm/ymm registers.
-	
-	
+
+
 	mov(var(a), rax) // load address of a.
 	mov(var(b), rbx) // load address of b.
 	//mov(%9, r15) // load address of b_next.
-	
+
 	add(imm(32*4), rax)
 	 // initialize loop by pre-loading
 	vmovaps(mem(rax, -4*32), ymm0)
 	vmovaps(mem(rax, -3*32), ymm1)
-	
+
 	mov(var(c), rcx) // load address of c
 	mov(var(cs_c), rdi) // load cs_c
 	lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex)
-	
+
 	lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*cs_c;
 	lea(mem(rcx, rdi, 2), r12) // r12 = c + 2*cs_c;
-	
+
 	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(r11, 7*8)) // prefetch c + 1*cs_c
 	prefetch(0, mem(r12, 7*8)) // prefetch c + 2*cs_c
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.CCONSIDKLEFT) // if i == 0, jump to code that
 	 // contains the k_left loop.
-	
-	
+
+
 	label(.CLOOPKITER) // MAIN LOOP
-	
-	
+
+
 	 // iteration 0
 	prefetch(0, mem(rax, 32*8))
-	
+
 	vbroadcastss(mem(rbx, 0*4), ymm2)
 	vbroadcastss(mem(rbx, 1*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rbx, 2*4), ymm2)
 	vbroadcastss(mem(rbx, 3*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rbx, 4*4), ymm2)
 	vbroadcastss(mem(rbx, 5*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
+
 	vmovaps(mem(rax, -2*32), ymm0)
 	vmovaps(mem(rax, -1*32), ymm1)
-	
+
 	 // iteration 1
 	vbroadcastss(mem(rbx, 6*4), ymm2)
 	vbroadcastss(mem(rbx, 7*4), ymm3)
@@ -1350,51 +927,51 @@ void bli_cgemm_haswell_asm_8x3
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rbx, 8*4), ymm2)
 	vbroadcastss(mem(rbx, 9*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rbx, 10*4), ymm2)
 	vbroadcastss(mem(rbx, 11*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
+
 	vmovaps(mem(rax, 0*32), ymm0)
 	vmovaps(mem(rax, 1*32), ymm1)
-	
+
 	 // iteration 2
 	prefetch(0, mem(rax, 38*8))
-	
+
 	vbroadcastss(mem(rbx, 12*4), ymm2)
 	vbroadcastss(mem(rbx, 13*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rbx, 14*4), ymm2)
 	vbroadcastss(mem(rbx, 15*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rbx, 16*4), ymm2)
 	vbroadcastss(mem(rbx, 17*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
+
 	vmovaps(mem(rax, 2*32), ymm0)
 	vmovaps(mem(rax, 3*32), ymm1)
-	
+
 	 // iteration 3
 	vbroadcastss(mem(rbx, 18*4), ymm2)
 	vbroadcastss(mem(rbx, 19*4), ymm3)
@@ -1402,84 +979,84 @@ void bli_cgemm_haswell_asm_8x3
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rbx, 20*4), ymm2)
 	vbroadcastss(mem(rbx, 21*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rbx, 22*4), ymm2)
 	vbroadcastss(mem(rbx, 23*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
+
 	add(imm(4*8*8), rax) // a += 4*8  (unroll x mr)
 	add(imm(4*3*8), rbx) // b += 4*3  (unroll x nr)
-	
+
 	vmovaps(mem(rax, -4*32), ymm0)
 	vmovaps(mem(rax, -3*32), ymm1)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jne(.CLOOPKITER) // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.CCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
 	 // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.CLOOPKLEFT) // EDGE LOOP
-	
+
 	prefetch(0, mem(rax, 32*8))
-	
+
 	vbroadcastss(mem(rbx, 0*4), ymm2)
 	vbroadcastss(mem(rbx, 1*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rbx, 2*4), ymm2)
 	vbroadcastss(mem(rbx, 3*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rbx, 4*4), ymm2)
 	vbroadcastss(mem(rbx, 5*4), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
+
 	add(imm(1*8*8), rax) // a += 1*8  (unroll x mr)
 	add(imm(1*3*8), rbx) // b += 1*3  (unroll x nr)
-	
+
 	vmovaps(mem(rax, -4*32), ymm0)
 	vmovaps(mem(rax, -3*32), ymm1)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jne(.CLOOPKLEFT) // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.CPOSTACCUM)
-	
-	
+
+
 	 // permute even and odd elements
 	 // of ymm6/7, ymm10/11, ymm/14/15
 	vpermilps(imm(0xb1), ymm6, ymm6)
@@ -1488,76 +1065,68 @@ void bli_cgemm_haswell_asm_8x3
 	vpermilps(imm(0xb1), ymm11, ymm11)
 	vpermilps(imm(0xb1), ymm14, ymm14)
 	vpermilps(imm(0xb1), ymm15, ymm15)
-	
-	
+
+
 	 // subtract/add even/odd elements
 	vaddsubps(ymm6, ymm4, ymm4)
 	vaddsubps(ymm7, ymm5, ymm5)
-	
+
 	vaddsubps(ymm10, ymm8, ymm8)
 	vaddsubps(ymm11, ymm9, ymm9)
-	
+
 	vaddsubps(ymm14, ymm12, ymm12)
 	vaddsubps(ymm15, ymm13, ymm13)
-	
-	
-	
-	
+
+
+
+
 	mov(var(alpha), rax) // load address of alpha
 	vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate
 	vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate
-	
-	
+
+
 	vpermilps(imm(0xb1), ymm4, ymm3)
 	vmulps(ymm0, ymm4, ymm4)
 	vmulps(ymm1, ymm3, ymm3)
 	vaddsubps(ymm3, ymm4, ymm4)
-	
+
 	vpermilps(imm(0xb1), ymm5, ymm3)
 	vmulps(ymm0, ymm5, ymm5)
 	vmulps(ymm1, ymm3, ymm3)
 	vaddsubps(ymm3, ymm5, ymm5)
-	
-	
+
+
 	vpermilps(imm(0xb1), ymm8, ymm3)
 	vmulps(ymm0, ymm8, ymm8)
 	vmulps(ymm1, ymm3, ymm3)
 	vaddsubps(ymm3, ymm8, ymm8)
-	
+
 	vpermilps(imm(0xb1), ymm9, ymm3)
 	vmulps(ymm0, ymm9, ymm9)
 	vmulps(ymm1, ymm3, ymm3)
 	vaddsubps(ymm3, ymm9, ymm9)
-	
-	
+
+
 	vpermilps(imm(0xb1), ymm12, ymm3)
 	vmulps(ymm0, ymm12, ymm12)
 	vmulps(ymm1, ymm3, ymm3)
 	vaddsubps(ymm3, ymm12, ymm12)
-	
+
 	vpermilps(imm(0xb1), ymm13, ymm3)
 	vmulps(ymm0, ymm13, ymm13)
 	vmulps(ymm1, ymm3, ymm3)
 	vaddsubps(ymm3, ymm13, ymm13)
-	
-	
-	
-	
-	
+
+
+
+
+
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate
 	vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate
-	
-	
-	
-	
-	mov(var(rs_c), rsi) // load rs_c
-	lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex)
-	lea(mem(, rsi, 4), rdx) // rdx = 4*rs_c;
-	lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c;
-	
-	
-	
+
+
+
 	 // now avoid loading C if beta == 0
 	vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
 	vucomiss(xmm0, xmm1) // set ZF if beta_r == 0.
@@ -1566,186 +1135,71 @@ void bli_cgemm_haswell_asm_8x3
 	sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 );
 	and(r8b, r9b) // set ZF if r8b & r9b == 1.
 	jne(.CBETAZERO) // if ZF = 1, jump to beta == 0 case
-	
-	
-	cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8.
-	jz(.CCOLSTORED) // jump to row storage case
-	
-	
-	
-	label(.CGENSTORED)
-	
-	
-	CGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddps(ymm4, ymm0, ymm0)
-	CGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 4*rs_c;
-	
-	
-	CGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddps(ymm5, ymm0, ymm0)
-	CGEMM_OUTPUT_GS
-	mov(r11, rcx) // rcx = c + 1*cs_c
-	
-	
-	
-	CGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddps(ymm8, ymm0, ymm0)
-	CGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 4*rs_c;
-	
-	
-	CGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddps(ymm9, ymm0, ymm0)
-	CGEMM_OUTPUT_GS
-	mov(r12, rcx) // rcx = c + 2*cs_c
-	
-	
-	
-	CGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddps(ymm12, ymm0, ymm0)
-	CGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 4*rs_c;
-	
-	
-	CGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddps(ymm13, ymm0, ymm0)
-	CGEMM_OUTPUT_GS
-	
-	
-	
-	jmp(.CDONE) // jump to end.
-	
-	
-	
-	label(.CCOLSTORED)
-	
-	
-	CGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddps(ymm4, ymm0, ymm0)
-	CGEMM_OUTPUT_CS
-	add(rdx, rcx) // c += 4*rs_c;
-	
-	
-	CGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddps(ymm5, ymm0, ymm0)
-	CGEMM_OUTPUT_CS
-	mov(r11, rcx) // rcx = c + 1*cs_c
-	
-	
-	
-	CGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddps(ymm8, ymm0, ymm0)
-	CGEMM_OUTPUT_CS
-	add(rdx, rcx) // c += 4*rs_c;
-	
-	
-	CGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddps(ymm9, ymm0, ymm0)
-	CGEMM_OUTPUT_CS
-	mov(r12, rcx) // rcx = c + 2*cs_c
-	
-	
-	
-	CGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddps(ymm12, ymm0, ymm0)
-	CGEMM_OUTPUT_CS
-	add(rdx, rcx) // c += 4*rs_c;
-	
-	
-	CGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddps(ymm13, ymm0, ymm0)
-	CGEMM_OUTPUT_CS
-	
-	
-	
-	jmp(.CDONE) // jump to end.
-	
-	
-	
+
+		CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(rcx))
+		vaddps(ymm4, ymm0, ymm0)
+		vmovups(ymm0, mem(rcx))
+
+
+		CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(rcx,32))
+		vaddps(ymm5, ymm0, ymm0)
+		vmovups(ymm0, mem(rcx,32))
+
+
+
+		CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r11))
+		vaddps(ymm8, ymm0, ymm0)
+		vmovups(ymm0, mem(r11))
+
+
+		CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r11,32))
+		vaddps(ymm9, ymm0, ymm0)
+		vmovups(ymm0, mem(r11,32))
+
+
+
+		CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r12))
+		vaddps(ymm12, ymm0, ymm0)
+		vmovups(ymm0, mem(r12))
+
+
+		CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r12,32))
+		vaddps(ymm13, ymm0, ymm0)
+		vmovups(ymm0, mem(r12,32))
+
+		jmp(.CDONE) // jump to end.
+
 	label(.CBETAZERO)
-	
-	cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8.
-	jz(.CCOLSTORBZ) // jump to row storage case
-	
-	
-	
-	label(.CGENSTORBZ)
-	
-	
-	vmovaps(ymm4, ymm0)
-	CGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 4*rs_c;
-	
-	
-	vmovaps(ymm5, ymm0)
-	CGEMM_OUTPUT_GS
-	mov(r11, rcx) // rcx = c + 1*cs_c
-	
-	
-	
-	vmovaps(ymm8, ymm0)
-	CGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 4*rs_c;
-	
-	
-	vmovaps(ymm9, ymm0)
-	CGEMM_OUTPUT_GS
-	mov(r12, rcx) // rcx = c + 2*cs_c
-	
-	
-	
-	vmovaps(ymm12, ymm0)
-	CGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 4*rs_c;
-	
-	
-	vmovaps(ymm13, ymm0)
-	CGEMM_OUTPUT_GS
-	
-	
-	
-	jmp(.CDONE) // jump to end.
-	
-	
-	
-	label(.CCOLSTORBZ)
-	
-	
-	vmovups(ymm4, mem(rcx))
-	vmovups(ymm5, mem(rcx, rdx, 1))
-	
-	vmovups(ymm8, mem(r11))
-	vmovups(ymm9, mem(r11, rdx, 1))
-	
-	vmovups(ymm12, mem(r12))
-	vmovups(ymm13, mem(r12, rdx, 1))
-	
-	
-	
-	
-	
-	
+
+		vmovups(ymm4, mem(rcx))
+		vmovups(ymm5, mem(rcx,32))
+
+		vmovups(ymm8, mem(r11))
+		vmovups(ymm9, mem(r11,32))
+
+		vmovups(ymm12, mem(r12))
+		vmovups(ymm13, mem(r12,32))
+
 	label(.CDONE)
-	
-	
 
-    end_asm(
+
+
+	end_asm(
 	: // output operands (none)
 	: // input operands
-      [k_iter] "m" (k_iter), // 0
-      [k_left] "m" (k_left), // 1
-      [a]      "m" (a),      // 2
-      [b]      "m" (b),      // 3
-      [alpha]  "m" (alpha),  // 4
-      [beta]   "m" (beta),   // 5
-      [c]      "m" (c),      // 6
-      [rs_c]   "m" (rs_c),   // 7
-      [cs_c]   "m" (cs_c)/*,   // 8
-      [b_next] "m" (b_next), // 9
-      [a_next] "m" (a_next)*/  // 10
+	  [k_iter] "m" (k_iter), // 0
+	  [k_left] "m" (k_left), // 1
+	  [a]      "m" (a),      // 2
+	  [b]      "m" (b),      // 3
+	  [alpha]  "m" (alpha),  // 4
+	  [beta]   "m" (beta),   // 5
+	  [c]      "m" (c),      // 6
+	  [rs_c]   "m" (rs_c),   // 7
+	  [cs_c]   "m" (cs_c)/*,   // 8
+	  [b_next] "m" (b_next), // 9
+	  [a_next] "m" (a_next)*/  // 10
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
@@ -1753,41 +1207,25 @@ void bli_cgemm_haswell_asm_8x3
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( c );
 }
 
 
-// assumes beta.r, beta.i have been broadcast into ymm1, ymm2.
-// outputs to ymm0
-#define ZGEMM_INPUT_SCALE_GS_BETA_NZ \
-	vmovupd(mem(rcx), xmm0) \
-	vmovupd(mem(rcx, rsi, 1), xmm3) \
-	vinsertf128(imm(1), xmm3, ymm0, ymm0) \
+#define ZGEMM_INPUT_SCALE_CS_BETA_NZ(where) /* ymm0 = beta * (dcomplex pair loaded from 'where'); expects beta.r broadcast in ymm1 and beta.i in ymm2; clobbers ymm3 */ \
+	vmovups(where, ymm0) \
 	vpermilpd(imm(0x5), ymm0, ymm3) \
 	vmulpd(ymm1, ymm0, ymm0) \
 	vmulpd(ymm2, ymm3, ymm3) \
 	vaddsubpd(ymm3, ymm0, ymm0)
-	
-// assumes values to output are in ymm0
-#define ZGEMM_OUTPUT_GS \
-	vextractf128(imm(1), ymm0, xmm3) \
-	vmovupd(xmm0, mem(rcx)) \
-	vmovupd(xmm3, mem(rcx, rsi, 1)) \
-
-#define ZGEMM_INPUT_SCALE_CS_BETA_NZ \
-	vmovups(mem(rcx), ymm0) \
-	vpermilpd(imm(0x5), ymm0, ymm3) \
-	vmulpd(ymm1, ymm0, ymm0) \
-	vmulpd(ymm2, ymm3, ymm3) \
-	vaddsubpd(ymm3, ymm0, ymm0)
-	
-#define ZGEMM_OUTPUT_CS \
-	vmovupd(ymm0, mem(rcx)) \
 
 void bli_zgemm_haswell_asm_4x3
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        dcomplex*  restrict alpha,
        dcomplex*  restrict a,
        dcomplex*  restrict b,
@@ -1802,76 +1240,78 @@ void bli_zgemm_haswell_asm_4x3
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT( z, 4, 3, false );
+
 	begin_asm()
-	
+
 	vzeroall() // zero all xmm/ymm registers.
-	
-	
+
+
 	mov(var(a), rax) // load address of a.
 	mov(var(b), rbx) // load address of b.
 	//mov(%9, r15) // load address of b_next.
-	
+
 	add(imm(32*4), rax)
 	 // initialize loop by pre-loading
 	vmovapd(mem(rax, -4*32), ymm0)
 	vmovapd(mem(rax, -3*32), ymm1)
-	
+
 	mov(var(c), rcx) // load address of c
 	mov(var(cs_c), rdi) // load cs_c
 	lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex)
 	lea(mem(, rdi, 2), rdi)
-	
+
 	lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*cs_c;
 	lea(mem(rcx, rdi, 2), r12) // r12 = c + 2*cs_c;
-	
+
 	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(r11, 7*8)) // prefetch c + 1*cs_c
 	prefetch(0, mem(r12, 7*8)) // prefetch c + 2*cs_c
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.ZCONSIDKLEFT) // if i == 0, jump to code that
 	 // contains the k_left loop.
-	
-	
+
+
 	label(.ZLOOPKITER) // MAIN LOOP
-	
-	
+
+
 	 // iteration 0
 	prefetch(0, mem(rax, 32*16))
-	
+
 	vbroadcastsd(mem(rbx, 0*8), ymm2)
 	vbroadcastsd(mem(rbx, 1*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rbx, 2*8), ymm2)
 	vbroadcastsd(mem(rbx, 3*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rbx, 4*8), ymm2)
 	vbroadcastsd(mem(rbx, 5*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
+
 	vmovapd(mem(rax, -2*32), ymm0)
 	vmovapd(mem(rax, -1*32), ymm1)
-	
+
 	 // iteration 1
 	vbroadcastsd(mem(rbx, 6*8), ymm2)
 	vbroadcastsd(mem(rbx, 7*8), ymm3)
@@ -1879,51 +1319,51 @@ void bli_zgemm_haswell_asm_4x3
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rbx, 8*8), ymm2)
 	vbroadcastsd(mem(rbx, 9*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rbx, 10*8), ymm2)
 	vbroadcastsd(mem(rbx, 11*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
+
 	vmovapd(mem(rax, 0*32), ymm0)
 	vmovapd(mem(rax, 1*32), ymm1)
-	
+
 	 // iteration 2
 	prefetch(0, mem(rax, 38*16))
-	
+
 	vbroadcastsd(mem(rbx, 12*8), ymm2)
 	vbroadcastsd(mem(rbx, 13*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rbx, 14*8), ymm2)
 	vbroadcastsd(mem(rbx, 15*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rbx, 16*8), ymm2)
 	vbroadcastsd(mem(rbx, 17*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
+
 	vmovapd(mem(rax, 2*32), ymm0)
 	vmovapd(mem(rax, 3*32), ymm1)
-	
+
 	 // iteration 3
 	vbroadcastsd(mem(rbx, 18*8), ymm2)
 	vbroadcastsd(mem(rbx, 19*8), ymm3)
@@ -1931,83 +1371,83 @@ void bli_zgemm_haswell_asm_4x3
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rbx, 20*8), ymm2)
 	vbroadcastsd(mem(rbx, 21*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rbx, 22*8), ymm2)
 	vbroadcastsd(mem(rbx, 23*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
+
 	add(imm(4*4*16), rax) // a += 4*4 (unroll x mr)
 	add(imm(4*3*16), rbx) // b += 4*3 (unroll x nr)
-	
+
 	vmovapd(mem(rax, -4*32), ymm0)
 	vmovapd(mem(rax, -3*32), ymm1)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jne(.ZLOOPKITER) // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.ZCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
 	 // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.ZLOOPKLEFT) // EDGE LOOP
-	
+
 	prefetch(0, mem(rax, 32*16))
-	
+
 	vbroadcastsd(mem(rbx, 0*8), ymm2)
 	vbroadcastsd(mem(rbx, 1*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rbx, 2*8), ymm2)
 	vbroadcastsd(mem(rbx, 3*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rbx, 4*8), ymm2)
 	vbroadcastsd(mem(rbx, 5*8), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
+
 	add(imm(1*4*16), rax) // a += 1*4 (unroll x mr)
 	add(imm(1*3*16), rbx) // b += 1*3 (unroll x nr)
-	
+
 	vmovapd(mem(rax, -4*32), ymm0)
 	vmovapd(mem(rax, -3*32), ymm1)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jne(.ZLOOPKLEFT) // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.ZPOSTACCUM)
-	
+
 	 // permute even and odd elements
 	 // of ymm6/7, ymm10/11, ymm/14/15
 	vpermilpd(imm(0x5), ymm6, ymm6)
@@ -2016,76 +1456,69 @@ void bli_zgemm_haswell_asm_4x3
 	vpermilpd(imm(0x5), ymm11, ymm11)
 	vpermilpd(imm(0x5), ymm14, ymm14)
 	vpermilpd(imm(0x5), ymm15, ymm15)
-	
-	
+
+
 	 // subtract/add even/odd elements
 	vaddsubpd(ymm6, ymm4, ymm4)
 	vaddsubpd(ymm7, ymm5, ymm5)
-	
+
 	vaddsubpd(ymm10, ymm8, ymm8)
 	vaddsubpd(ymm11, ymm9, ymm9)
-	
+
 	vaddsubpd(ymm14, ymm12, ymm12)
 	vaddsubpd(ymm15, ymm13, ymm13)
-	
-	
-	
-	
+
+
+
+
 	mov(var(alpha), rax) // load address of alpha
 	vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate
 	vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate
-	
-	
+
+
 	vpermilpd(imm(0x5), ymm4, ymm3)
 	vmulpd(ymm0, ymm4, ymm4)
 	vmulpd(ymm1, ymm3, ymm3)
 	vaddsubpd(ymm3, ymm4, ymm4)
-	
+
 	vpermilpd(imm(0x5), ymm5, ymm3)
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm1, ymm3, ymm3)
 	vaddsubpd(ymm3, ymm5, ymm5)
-	
-	
+
+
 	vpermilpd(imm(0x5), ymm8, ymm3)
 	vmulpd(ymm0, ymm8, ymm8)
 	vmulpd(ymm1, ymm3, ymm3)
 	vaddsubpd(ymm3, ymm8, ymm8)
-	
+
 	vpermilpd(imm(0x5), ymm9, ymm3)
 	vmulpd(ymm0, ymm9, ymm9)
 	vmulpd(ymm1, ymm3, ymm3)
 	vaddsubpd(ymm3, ymm9, ymm9)
-	
-	
+
+
 	vpermilpd(imm(0x5), ymm12, ymm3)
 	vmulpd(ymm0, ymm12, ymm12)
 	vmulpd(ymm1, ymm3, ymm3)
 	vaddsubpd(ymm3, ymm12, ymm12)
-	
+
 	vpermilpd(imm(0x5), ymm13, ymm3)
 	vmulpd(ymm0, ymm13, ymm13)
 	vmulpd(ymm1, ymm3, ymm3)
 	vaddsubpd(ymm3, ymm13, ymm13)
-	
-	
-	
-	
-	
+
+
+
+
+
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
 	vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
-	
-	
-	
-	
-	mov(var(rs_c), rsi) // load rs_c
-	lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex)
-	lea(mem(, rsi, 2), rsi)
-	lea(mem(, rsi, 2), rdx) // rdx = 2*rs_c;
-	
-	
-	
+
+
+
+
 	 // now avoid loading C if beta == 0
 	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
 	vucomisd(xmm0, xmm1) // set ZF if beta_r == 0.
@@ -2094,171 +1527,56 @@ void bli_zgemm_haswell_asm_4x3
 	sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 );
 	and(r8b, r9b) // set ZF if r8b & r9b == 1.
 	jne(.ZBETAZERO) // if ZF = 1, jump to beta == 0 case
-	
-	
-	cmp(imm(16), rsi) // set ZF if (16*rs_c) == 16.
-	jz(.ZCOLSTORED) // jump to row storage case
-	
-	
-	
-	label(.ZGENSTORED)
-	
-	
-	ZGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddpd(ymm4, ymm0, ymm0)
-	ZGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 2*rs_c;
-	
-	
-	ZGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddpd(ymm5, ymm0, ymm0)
-	ZGEMM_OUTPUT_GS
-	mov(r11, rcx) // rcx = c + 1*cs_c
-	
-	
-	
-	ZGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddpd(ymm8, ymm0, ymm0)
-	ZGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 2*rs_c;
-	
-	
-	ZGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddpd(ymm9, ymm0, ymm0)
-	ZGEMM_OUTPUT_GS
-	mov(r12, rcx) // rcx = c + 2*cs_c
-	
-	
-	
-	ZGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddpd(ymm12, ymm0, ymm0)
-	ZGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 2*rs_c;
-	
-	
-	ZGEMM_INPUT_SCALE_GS_BETA_NZ
-	vaddpd(ymm13, ymm0, ymm0)
-	ZGEMM_OUTPUT_GS
-	
-	
-	
-	jmp(.ZDONE) // jump to end.
-	
-	
-	
-	label(.ZCOLSTORED)
-	
-	
-	ZGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddpd(ymm4, ymm0, ymm0)
-	ZGEMM_OUTPUT_CS
-	add(rdx, rcx) // c += 2*rs_c;
-	
-	
-	ZGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddpd(ymm5, ymm0, ymm0)
-	ZGEMM_OUTPUT_CS
-	mov(r11, rcx) // rcx = c + 1*cs_c
-	
-	
-	
-	ZGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddpd(ymm8, ymm0, ymm0)
-	ZGEMM_OUTPUT_CS
-	add(rdx, rcx) // c += 2*rs_c;
-	
-	
-	ZGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddpd(ymm9, ymm0, ymm0)
-	ZGEMM_OUTPUT_CS
-	mov(r12, rcx) // rcx = c + 2*cs_c
-	
-	
-	
-	ZGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddpd(ymm12, ymm0, ymm0)
-	ZGEMM_OUTPUT_CS
-	add(rdx, rcx) // c += 2*rs_c;
-	
-	
-	ZGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddpd(ymm13, ymm0, ymm0)
-	ZGEMM_OUTPUT_CS
-	
-	
-	
-	jmp(.ZDONE) // jump to end.
-	
-	
-	
+
+		ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(rcx))
+		vaddpd(ymm4, ymm0, ymm0)
+		vmovupd(ymm0, mem(rcx))
+
+
+		ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(rcx,32))
+		vaddpd(ymm5, ymm0, ymm0)
+		vmovupd(ymm0, mem(rcx,32))
+
+
+
+		ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r11))
+		vaddpd(ymm8, ymm0, ymm0)
+		vmovupd(ymm0, mem(r11))
+
+
+		ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r11,32))
+		vaddpd(ymm9, ymm0, ymm0)
+		vmovupd(ymm0, mem(r11,32))
+
+
+
+		ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r12))
+		vaddpd(ymm12, ymm0, ymm0)
+		vmovupd(ymm0, mem(r12))
+
+
+		ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r12,32))
+		vaddpd(ymm13, ymm0, ymm0)
+		vmovupd(ymm0, mem(r12,32))
+
+		jmp(.ZDONE) // jump to end.
+
 	label(.ZBETAZERO)
-	
-	cmp(imm(16), rsi) // set ZF if (16*rs_c) == 16.
-	jz(.ZCOLSTORBZ) // jump to row storage case
-	
-	
-	
-	label(.ZGENSTORBZ)
-	
-	
-	vmovapd(ymm4, ymm0)
-	ZGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 2*rs_c;
-	
-	
-	vmovapd(ymm5, ymm0)
-	ZGEMM_OUTPUT_GS
-	mov(r11, rcx) // rcx = c + 1*cs_c
-	
-	
-	
-	vmovapd(ymm8, ymm0)
-	ZGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 2*rs_c;
-	
-	
-	vmovapd(ymm9, ymm0)
-	ZGEMM_OUTPUT_GS
-	mov(r12, rcx) // rcx = c + 2*cs_c
-	
-	
-	
-	vmovapd(ymm12, ymm0)
-	ZGEMM_OUTPUT_GS
-	add(rdx, rcx) // c += 2*rs_c;
-	
-	
-	vmovapd(ymm13, ymm0)
-	ZGEMM_OUTPUT_GS
-	
-	
-	
-	jmp(.ZDONE) // jump to end.
-	
-	
-	
-	label(.ZCOLSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm5, mem(rcx, rdx, 1))
-	
-	vmovupd(ymm8, mem(r11))
-	vmovupd(ymm9, mem(r11, rdx, 1))
-	
-	vmovupd(ymm12, mem(r12))
-	vmovupd(ymm13, mem(r12, rdx, 1))
-	
-	
-	
-	
-	
-	
+
+		vmovupd(ymm4, mem(rcx))
+		vmovupd(ymm5, mem(rcx,32))
+
+		vmovupd(ymm8, mem(r11))
+		vmovupd(ymm9, mem(r11,32))
+
+		vmovupd(ymm12, mem(r12))
+		vmovupd(ymm13, mem(r12,32))
+
 	label(.ZDONE)
-	
-	
 
-    end_asm(
+
+
+	end_asm(
 	: // output operands (none)
 	: // input operands
 	  [k_iter] "m" (k_iter), // 0
@@ -2273,7 +1591,7 @@ void bli_zgemm_haswell_asm_4x3
 	  [b_next] "m" (b_next), // 9
 	  [a_next] "m" (a_next)*/  // 10
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
@@ -2281,6 +1599,8 @@ void bli_zgemm_haswell_asm_4x3
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( z );
 }
 
 
diff --git a/kernels/knc/3/bli_dgemm_knc_asm_30x8.c b/kernels/knc/3/bli_dgemm_knc_asm_30x8.c
index 880632ae0..f20e43f7c 100644
--- a/kernels/knc/3/bli_dgemm_knc_asm_30x8.c
+++ b/kernels/knc/3/bli_dgemm_knc_asm_30x8.c
@@ -256,6 +256,8 @@ extern int offsets[16];
 //#define LOOPMON
 void bli_dgemm_knc_asm_30x8
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        double*    restrict alpha,
        double*    restrict a,
@@ -273,80 +275,82 @@ void bli_dgemm_knc_asm_30x8
 
     uint64_t k64 = k;
 
+    GEMM_UKR_SETUP_CT( d, 30, 8, true );
+
 #ifdef MONITORS
     int toph, topl, both, botl, midl, midh, mid2l, mid2h;
 #endif
 #ifdef LOOPMON
     int tlooph, tloopl, blooph, bloopl;
 #endif
-    
+
     __asm
     {
 #ifdef MONITORS
         rdtsc
         mov topl, eax
-        mov toph, edx 
+        mov toph, edx
 #endif
         vpxord  zmm0,  zmm0, zmm0
         vmovaps zmm1,  zmm0  //clear out registers
-        vmovaps zmm2,  zmm0 
+        vmovaps zmm2,  zmm0
         mov rsi, k64    //loop index
-        vmovaps zmm3,  zmm0 
+        vmovaps zmm3,  zmm0
 
         mov r11, rs_c           //load row stride
-        vmovaps zmm4,  zmm0 
+        vmovaps zmm4,  zmm0
         sal r11, 3              //scale row stride
-        vmovaps zmm5,  zmm0 
+        vmovaps zmm5,  zmm0
         mov r15, a              //load address of a
-        vmovaps zmm6,  zmm0 
+        vmovaps zmm6,  zmm0
         mov rbx, b              //load address of b
-        vmovaps zmm7,  zmm0 
+        vmovaps zmm7,  zmm0
 
-        vmovaps zmm8,  zmm0 
+        vmovaps zmm8,  zmm0
         lea r10, [r11 + 2*r11 + 0] //r10 has 3 * r11
         vmovaps zmm9,  zmm0
-        vmovaps zmm10, zmm0 
-        mov rdi, r11    
-        vmovaps zmm11, zmm0 
+        vmovaps zmm10, zmm0
+        mov rdi, r11
+        vmovaps zmm11, zmm0
         sal rdi, 2              //rdi has 4*r11
 
-        vmovaps zmm12, zmm0 
+        vmovaps zmm12, zmm0
         mov rcx, c              //load address of c for prefetching
-        vmovaps zmm13, zmm0 
-        vmovaps zmm14, zmm0 
+        vmovaps zmm13, zmm0
+        vmovaps zmm14, zmm0
         mov r8, k64
-        vmovaps zmm15, zmm0 
+        vmovaps zmm15, zmm0
 
         vmovaps zmm16, zmm0
         vmovaps zmm17, zmm0
         mov r13, L2_PREFETCH_DIST*8*8
-        vmovaps zmm18, zmm0 
+        vmovaps zmm18, zmm0
         mov r14, L2_PREFETCH_DIST*8*32
-        vmovaps zmm19, zmm0 
-        vmovaps zmm20, zmm0 
-        vmovaps zmm21, zmm0 
-        vmovaps zmm22, zmm0 
+        vmovaps zmm19, zmm0
+        vmovaps zmm20, zmm0
+        vmovaps zmm21, zmm0
+        vmovaps zmm22, zmm0
 
-        vmovaps zmm23, zmm0 
+        vmovaps zmm23, zmm0
         sub r8, 30 + L2_PREFETCH_DIST       //Check if we have over 40 operations to do.
-        vmovaps zmm24, zmm0 
+        vmovaps zmm24, zmm0
         mov r8, 30
-        vmovaps zmm25, zmm0 
+        vmovaps zmm25, zmm0
         mov r9, 8*8                         //amount to increment b* by each iteration
-        vmovaps zmm26, zmm0 
+        vmovaps zmm26, zmm0
         mov r12, 32*8                       //amount to increment a* by each iteration
-        vmovaps zmm27, zmm0 
-        vmovaps zmm28, zmm0 
-        vmovaps zmm29, zmm0 
+        vmovaps zmm27, zmm0
+        vmovaps zmm28, zmm0
+        vmovaps zmm29, zmm0
 
 #ifdef MONITORS
         rdtsc
         mov midl, eax
-        mov midh, edx 
+        mov midh, edx
 #endif
         jle CONSIDER_UNDER_40
         sub rsi, 30 + L2_PREFETCH_DIST
-        
+
         //First 30 iterations
         LOOPREFECHCL2:
             ONE_ITER_PC_L2(rcx)
@@ -357,26 +361,26 @@ void bli_dgemm_knc_asm_30x8
         LOOPMAIN:
             ONE_ITER_MAIN_LOOP(rcx, rsi)
         jne LOOPMAIN
-        
+
         //Penultimate 22 iterations.
         //Break these off from the main loop to avoid prefetching extra shit.
         mov r14, a_next
         mov r13, b_next
         sub r14, r15
         sub r13, rbx
-        
+
         mov rsi, L2_PREFETCH_DIST-10
         LOOPMAIN2:
             ONE_ITER_MAIN_LOOP(rcx, rsi)
         jne LOOPMAIN2
-        
-        
+
+
         //Last 10 iterations
         mov r8, 10
         LOOPREFETCHCL1:
             ONE_ITER_PC_L1(rcx)
         jne LOOPREFETCHCL1
-       
+
 
         jmp POSTACCUM
 
@@ -403,14 +407,8 @@ void bli_dgemm_knc_asm_30x8
         mov r9, c               //load address of c for update
         mov r12, alpha          //load address of alpha
 
-        // Check if C is row stride. If not, jump to the slow scattered update
-        mov r14, cs_c
-        dec r14
-        jne SCATTEREDUPDATE
-
         mov r14, beta
-        vbroadcastsd zmm31, 0[r14] 
-
+        vbroadcastsd zmm31, 0[r14]
 
         vmulpd zmm0, zmm0, 0[r12]{1to8}
         vmulpd zmm1, zmm1, 0[r12]{1to8}
@@ -467,7 +465,7 @@ void bli_dgemm_knc_asm_30x8
         vmovapd [r9+2*r11+0], zmm14
         vmovapd [r9+r10+0], zmm15
         add r9, rdi
-        
+
         vmulpd zmm16, zmm16, 0[r12]{1to8}
         vmulpd zmm17, zmm17, 0[r12]{1to8}
         vmulpd zmm18, zmm18, 0[r12]{1to8}
@@ -516,47 +514,6 @@ void bli_dgemm_knc_asm_30x8
         vfmadd231pd zmm29, zmm31, [r9+r11+0]
         vmovapd [r9+0], zmm28
         vmovapd [r9+r11+0], zmm29
-        
-        jmp END
-        
-        SCATTEREDUPDATE:
-        mov r10, offsetPtr 
-        vmovapd zmm31, 0[r10] 
-        vpbroadcastd zmm30, cs_c 
-        mov r13, beta
-        vpmulld zmm30, zmm31, zmm30 
-
-        mov ebx, 255 
-        UPDATE_C_ROW_SCATTERED(zmm0, 0, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm1, 1, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm2, 2, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm3, 3, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm4, 4, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm5, 5, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm6, 6, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm7, 7, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm8, 8, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm9, 9, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm10, 10, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm11, 11, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm12, 12, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm13, 13, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm14, 14, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm15, 15, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm16, 16, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm17, 17, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm18, 18, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm19, 19, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm20, 20, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm21, 21, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm22, 22, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm23, 23, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm24, 24, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm25, 25, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm26, 26, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm27, 27, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm28, 28, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm29, 29, r9)
 
         END:
 #ifdef MONITORS
@@ -566,6 +523,8 @@ void bli_dgemm_knc_asm_30x8
 #endif
     }
 
+    GEMM_UKR_FLUSH_CT( d );
+
 #ifdef LOOPMON
     printf("looptime = \t%d\n", bloopl - tloopl);
 #endif
diff --git a/kernels/knc/3/bli_sgemm_knc_asm_30x16.c b/kernels/knc/3/bli_sgemm_knc_asm_30x16.c
index 866cb62ec..18a8e5e2e 100644
--- a/kernels/knc/3/bli_sgemm_knc_asm_30x16.c
+++ b/kernels/knc/3/bli_sgemm_knc_asm_30x16.c
@@ -256,6 +256,8 @@ int offsets[16] __attribute__((aligned(0x1000))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9
 //#define LOOPMON
 void bli_sgemm_knc_asm_30x16
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        float*     restrict alpha,
        float*     restrict a,
@@ -273,80 +275,82 @@ void bli_sgemm_knc_asm_30x16
 
     uint64_t k64 = k;
 
+    GEMM_UKR_SETUP_CT( s, 30, 16, true );
+
 #ifdef MONITORS
     int toph, topl, both, botl, midl, midh, mid2l, mid2h;
 #endif
 #ifdef LOOPMON
     int tlooph, tloopl, blooph, bloopl;
 #endif
-    
+
     __asm
     {
 #ifdef MONITORS
         rdtsc
         mov topl, eax
-        mov toph, edx 
+        mov toph, edx
 #endif
         vpxord  zmm0,  zmm0, zmm0
         vmovaps zmm1,  zmm0  //clear out registers
-        vmovaps zmm2,  zmm0 
+        vmovaps zmm2,  zmm0
         mov rsi, k64  //loop index
-        vmovaps zmm3,  zmm0 
+        vmovaps zmm3,  zmm0
 
         mov r11, rs_c           //load row stride
-        vmovaps zmm4,  zmm0 
+        vmovaps zmm4,  zmm0
         sal r11, 2              //scale row stride
-        vmovaps zmm5,  zmm0 
+        vmovaps zmm5,  zmm0
         mov r15, a              //load address of a
-        vmovaps zmm6,  zmm0 
+        vmovaps zmm6,  zmm0
         mov rbx, b              //load address of b
-        vmovaps zmm7,  zmm0 
+        vmovaps zmm7,  zmm0
 
-        vmovaps zmm8,  zmm0 
+        vmovaps zmm8,  zmm0
         lea r10, [r11 + 2*r11 + 0] //r10 has 3 * r11
         vmovaps zmm9,  zmm0
-        vmovaps zmm10, zmm0 
-        mov rdi, r11    
-        vmovaps zmm11, zmm0 
+        vmovaps zmm10, zmm0
+        mov rdi, r11
+        vmovaps zmm11, zmm0
         sal rdi, 2              //rdi has 4*r11
 
-        vmovaps zmm12, zmm0 
+        vmovaps zmm12, zmm0
         mov rcx, c              //load address of c for prefetching
-        vmovaps zmm13, zmm0 
-        vmovaps zmm14, zmm0 
+        vmovaps zmm13, zmm0
+        vmovaps zmm14, zmm0
         mov r8, k64
-        vmovaps zmm15, zmm0 
+        vmovaps zmm15, zmm0
 
         vmovaps zmm16, zmm0
         vmovaps zmm17, zmm0
         mov r13, L2_PREFETCH_DIST*4*16
-        vmovaps zmm18, zmm0 
+        vmovaps zmm18, zmm0
         mov r14, L2_PREFETCH_DIST*4*32
-        vmovaps zmm19, zmm0 
-        vmovaps zmm20, zmm0 
-        vmovaps zmm21, zmm0 
-        vmovaps zmm22, zmm0 
+        vmovaps zmm19, zmm0
+        vmovaps zmm20, zmm0
+        vmovaps zmm21, zmm0
+        vmovaps zmm22, zmm0
 
-        vmovaps zmm23, zmm0 
+        vmovaps zmm23, zmm0
         sub r8, 30 + L2_PREFETCH_DIST       //Check if we have over 40 operations to do.
-        vmovaps zmm24, zmm0 
+        vmovaps zmm24, zmm0
         mov r8, 30
-        vmovaps zmm25, zmm0 
+        vmovaps zmm25, zmm0
         mov r9, 16*4                         //amount to increment b* by each iteration
-        vmovaps zmm26, zmm0 
+        vmovaps zmm26, zmm0
         mov r12, 32*4                       //amount to increment a* by each iteration
-        vmovaps zmm27, zmm0 
-        vmovaps zmm28, zmm0 
-        vmovaps zmm29, zmm0 
+        vmovaps zmm27, zmm0
+        vmovaps zmm28, zmm0
+        vmovaps zmm29, zmm0
 
 #ifdef MONITORS
         rdtsc
         mov midl, eax
-        mov midh, edx 
+        mov midh, edx
 #endif
         jle CONSIDER_UNDER_40
         sub rsi, 30 + L2_PREFETCH_DIST
-        
+
         //First 30 iterations
         LOOPREFECHCL2:
             ONE_ITER_PC_L2(rcx)
@@ -357,26 +361,26 @@ void bli_sgemm_knc_asm_30x16
         LOOPMAIN:
             ONE_ITER_MAIN_LOOP(rcx, rsi)
         jne LOOPMAIN
-        
+
         //Penultimate 22 iterations.
         //Break these off from the main loop to avoid prefetching extra shit.
         mov r14, a_next
         mov r13, b_next
         sub r14, r15
         sub r13, rbx
-        
+
         mov rsi, L2_PREFETCH_DIST-10
         LOOPMAIN2:
             ONE_ITER_MAIN_LOOP(rcx, rsi)
         jne LOOPMAIN2
-        
-        
+
+
         //Last 10 iterations
         mov r8, 10
         LOOPREFETCHCL1:
             ONE_ITER_PC_L1(rcx)
         jne LOOPREFETCHCL1
-       
+
 
         jmp POSTACCUM
 
@@ -384,7 +388,7 @@ void bli_sgemm_knc_asm_30x16
         //Used when <= 40 iterations
         CONSIDER_UNDER_40:
         mov rsi, k64
-        test rsi, rsi 
+        test rsi, rsi
         je POSTACCUM
         LOOP_UNDER_40:
             ONE_ITER_MAIN_LOOP(rcx, rsi)
@@ -403,13 +407,8 @@ void bli_sgemm_knc_asm_30x16
         mov r9, c               //load address of c for update
         mov r12, alpha          //load address of alpha
 
-        // Check if C is row stride. If not, jump to the slow scattered update
-        mov r14, cs_c
-        dec r14
-        jne SCATTEREDUPDATE
-
         mov r14, beta
-        vbroadcastss zmm31, 0[r14] 
+        vbroadcastss zmm31, 0[r14]
 
 
         vmulps zmm0, zmm0, 0[r12]{1to16}
@@ -467,7 +466,7 @@ void bli_sgemm_knc_asm_30x16
         vmovaps [r9+2*r11+0], zmm14
         vmovaps [r9+r10+0], zmm15
         add r9, rdi
-        
+
         vmulps zmm16, zmm16, 0[r12]{1to16}
         vmulps zmm17, zmm17, 0[r12]{1to16}
         vmulps zmm18, zmm18, 0[r12]{1to16}
@@ -516,48 +515,6 @@ void bli_sgemm_knc_asm_30x16
         vfmadd231ps zmm29, zmm31, [r9+r11+0]
         vmovaps [r9+0], zmm28
         vmovaps [r9+r11+0], zmm29
-        
-        jmp END
-        
-        SCATTEREDUPDATE:
-        
-        mov r10, offsetPtr 
-        vmovaps zmm31, 0[r10] 
-        vpbroadcastd zmm30, cs_c 
-        mov r13, beta
-        vpmulld zmm30, zmm31, zmm30 
-
-        mov ebx, 0xFFFF
-        UPDATE_C_ROW_SCATTERED(zmm0, 0, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm1, 1, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm2, 2, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm3, 3, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm4, 4, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm5, 5, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm6, 6, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm7, 7, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm8, 8, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm9, 9, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm10, 10, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm11, 11, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm12, 12, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm13, 13, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm14, 14, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm15, 15, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm16, 16, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm17, 17, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm18, 18, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm19, 19, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm20, 20, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm21, 21, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm22, 22, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm23, 23, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm24, 24, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm25, 25, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm26, 26, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm27, 27, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm28, 28, r9) 
-        UPDATE_C_ROW_SCATTERED(zmm29, 29, r9)
 
         END:
 #ifdef MONITORS
@@ -567,6 +524,8 @@ void bli_sgemm_knc_asm_30x16
 #endif
     }
 
+    GEMM_UKR_FLUSH_CT( s );
+
 #ifdef LOOPMON
     printf("looptime = \t%d\n", bloopl - tloopl);
 #endif
diff --git a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
index b794e7c05..a7f860ae0 100644
--- a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
+++ b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
@@ -185,6 +185,8 @@ static int32_t offsets[32] __attribute__((aligned(64))) =
 //#define LOOPMON
 void bli_dgemm_knl_asm_24x8
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k_,
        double*    restrict alpha,
        double*    restrict a,
@@ -201,10 +203,12 @@ void bli_dgemm_knl_asm_24x8
     const double * a_next = bli_auxinfo_next_a( data );
     const double * b_next = bli_auxinfo_next_b( data );
 
-    const int32_t * offsetPtr = &offsets[0];
-    const int64_t k = k_;
-    const int64_t rs_c = rs_c_;
-    const int64_t cs_c = cs_c_;
+    int32_t * offsetPtr = &offsets[0];
+    int64_t k = k_;
+    int64_t rs_c = rs_c_;
+    int64_t cs_c = cs_c_;
+
+    GEMM_UKR_SETUP_CT( d, 24, 8, true );
 
 #ifdef MONITORS
     int toph, topl, both, botl, midl, midh, mid2l, mid2h;
@@ -565,10 +569,7 @@ void bli_dgemm_knl_asm_24x8
     // Check if C is row stride. If not, jump to the slow scattered update
     MOV(RAX, VAR(rs_c))
     LEA(RAX, MEM(,RAX,8))
-    MOV(RBX, VAR(cs_c))
     LEA(RDI, MEM(RAX,RAX,2))
-    CMP(RBX, IMM(1))
-    JNE(SCATTEREDUPDATE)
 
     VMOVQ(RDX, XMM(1))
     SAL(RDX) //shift out sign bit
@@ -592,74 +593,6 @@ void bli_dgemm_knl_asm_24x8
     UPDATE_C_BZ_FOUR_ROWS(24,25,26,27)
     UPDATE_C_BZ_FOUR_ROWS(28,29,30,31)
 
-    JMP(END)
-
-    LABEL(SCATTEREDUPDATE)
-
-    MOV(RDI, VAR(offsetPtr))
-    VMOVAPS(ZMM(2), MEM(RDI))
-    /* Note that this ignores the upper 32 bits in cs_c */
-    VPBROADCASTD(ZMM(3), EBX)
-    VPMULLD(ZMM(2), ZMM(3), ZMM(2))
-
-    VMOVQ(RDX, XMM(1))
-    SAL(RDX) //shift out sign bit
-    JZ(SCATTERBZ)
-
-    UPDATE_C_ROW_SCATTERED( 8)
-    UPDATE_C_ROW_SCATTERED( 9)
-    UPDATE_C_ROW_SCATTERED(10)
-    UPDATE_C_ROW_SCATTERED(11)
-    UPDATE_C_ROW_SCATTERED(12)
-    UPDATE_C_ROW_SCATTERED(13)
-    UPDATE_C_ROW_SCATTERED(14)
-    UPDATE_C_ROW_SCATTERED(15)
-    UPDATE_C_ROW_SCATTERED(16)
-    UPDATE_C_ROW_SCATTERED(17)
-    UPDATE_C_ROW_SCATTERED(18)
-    UPDATE_C_ROW_SCATTERED(19)
-    UPDATE_C_ROW_SCATTERED(20)
-    UPDATE_C_ROW_SCATTERED(21)
-    UPDATE_C_ROW_SCATTERED(22)
-    UPDATE_C_ROW_SCATTERED(23)
-    UPDATE_C_ROW_SCATTERED(24)
-    UPDATE_C_ROW_SCATTERED(25)
-    UPDATE_C_ROW_SCATTERED(26)
-    UPDATE_C_ROW_SCATTERED(27)
-    UPDATE_C_ROW_SCATTERED(28)
-    UPDATE_C_ROW_SCATTERED(29)
-    UPDATE_C_ROW_SCATTERED(30)
-    UPDATE_C_ROW_SCATTERED(31)
-
-    JMP(END)
-
-    LABEL(SCATTERBZ)
-
-    UPDATE_C_BZ_ROW_SCATTERED( 8)
-    UPDATE_C_BZ_ROW_SCATTERED( 9)
-    UPDATE_C_BZ_ROW_SCATTERED(10)
-    UPDATE_C_BZ_ROW_SCATTERED(11)
-    UPDATE_C_BZ_ROW_SCATTERED(12)
-    UPDATE_C_BZ_ROW_SCATTERED(13)
-    UPDATE_C_BZ_ROW_SCATTERED(14)
-    UPDATE_C_BZ_ROW_SCATTERED(15)
-    UPDATE_C_BZ_ROW_SCATTERED(16)
-    UPDATE_C_BZ_ROW_SCATTERED(17)
-    UPDATE_C_BZ_ROW_SCATTERED(18)
-    UPDATE_C_BZ_ROW_SCATTERED(19)
-    UPDATE_C_BZ_ROW_SCATTERED(20)
-    UPDATE_C_BZ_ROW_SCATTERED(21)
-    UPDATE_C_BZ_ROW_SCATTERED(22)
-    UPDATE_C_BZ_ROW_SCATTERED(23)
-    UPDATE_C_BZ_ROW_SCATTERED(24)
-    UPDATE_C_BZ_ROW_SCATTERED(25)
-    UPDATE_C_BZ_ROW_SCATTERED(26)
-    UPDATE_C_BZ_ROW_SCATTERED(27)
-    UPDATE_C_BZ_ROW_SCATTERED(28)
-    UPDATE_C_BZ_ROW_SCATTERED(29)
-    UPDATE_C_BZ_ROW_SCATTERED(30)
-    UPDATE_C_BZ_ROW_SCATTERED(31)
-
     LABEL(END)
 
 #ifdef MONITORS
@@ -701,6 +634,8 @@ void bli_dgemm_knl_asm_24x8
       "zmm30", "zmm31", "memory"
     )
 
+    GEMM_UKR_FLUSH_CT( d );
+
 #ifdef LOOPMON
     printf("looptime = \t%d\n", bloopl - tloopl);
 #endif
diff --git a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
index 6d485b530..64feba09f 100644
--- a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
+++ b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
@@ -182,6 +182,8 @@ static int32_t offsets[32] __attribute__((aligned(64))) =
 //#define LOOPMON
 void bli_sgemm_knl_asm_24x16
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k_,
        float*     restrict alpha,
        float*     restrict a,
@@ -198,10 +200,12 @@ void bli_sgemm_knl_asm_24x16
     const double * a_next = bli_auxinfo_next_a( data );
     const double * b_next = bli_auxinfo_next_b( data );
 
-    const int32_t * offsetPtr = &offsets[0];
-    const int64_t k = k_;
-    const int64_t rs_c = rs_c_;
-    const int64_t cs_c = cs_c_;
+    int32_t * offsetPtr = &offsets[0];
+    int64_t k = k_;
+    int64_t rs_c = rs_c_;
+    int64_t cs_c = cs_c_;
+
+    GEMM_UKR_SETUP_CT( s, 24, 16, true );
 
 #ifdef MONITORS
     int toph, topl, both, botl, midl, midh, mid2l, mid2h;
@@ -562,10 +566,7 @@ void bli_sgemm_knl_asm_24x16
     // Check if C is row stride. If not, jump to the slow scattered update
     MOV(RAX, VAR(rs_c))
     LEA(RAX, MEM(,RAX,4))
-    MOV(RBX, VAR(cs_c))
     LEA(RDI, MEM(RAX,RAX,2))
-    CMP(RBX, IMM(1))
-    JNE(SCATTEREDUPDATE)
 
     VMOVD(EDX, XMM(1))
     SAL(EDX) //shift out sign bit
@@ -589,74 +590,6 @@ void bli_sgemm_knl_asm_24x16
     UPDATE_C_BZ_FOUR_ROWS(24,25,26,27)
     UPDATE_C_BZ_FOUR_ROWS(28,29,30,31)
 
-    JMP(END)
-
-    LABEL(SCATTEREDUPDATE)
-
-    MOV(RDI, VAR(offsetPtr))
-    VMOVAPS(ZMM(2), MEM(RDI))
-    /* Note that this ignores the upper 32 bits in cs_c */
-    VPBROADCASTD(ZMM(3), EBX)
-    VPMULLD(ZMM(2), ZMM(3), ZMM(2))
-
-    VMOVD(EDX, XMM(1))
-    SAL(EDX) //shift out sign bit
-    JZ(SCATTERBZ)
-
-    UPDATE_C_ROW_SCATTERED( 8)
-    UPDATE_C_ROW_SCATTERED( 9)
-    UPDATE_C_ROW_SCATTERED(10)
-    UPDATE_C_ROW_SCATTERED(11)
-    UPDATE_C_ROW_SCATTERED(12)
-    UPDATE_C_ROW_SCATTERED(13)
-    UPDATE_C_ROW_SCATTERED(14)
-    UPDATE_C_ROW_SCATTERED(15)
-    UPDATE_C_ROW_SCATTERED(16)
-    UPDATE_C_ROW_SCATTERED(17)
-    UPDATE_C_ROW_SCATTERED(18)
-    UPDATE_C_ROW_SCATTERED(19)
-    UPDATE_C_ROW_SCATTERED(20)
-    UPDATE_C_ROW_SCATTERED(21)
-    UPDATE_C_ROW_SCATTERED(22)
-    UPDATE_C_ROW_SCATTERED(23)
-    UPDATE_C_ROW_SCATTERED(24)
-    UPDATE_C_ROW_SCATTERED(25)
-    UPDATE_C_ROW_SCATTERED(26)
-    UPDATE_C_ROW_SCATTERED(27)
-    UPDATE_C_ROW_SCATTERED(28)
-    UPDATE_C_ROW_SCATTERED(29)
-    UPDATE_C_ROW_SCATTERED(30)
-    UPDATE_C_ROW_SCATTERED(31)
-
-    JMP(END)
-
-    LABEL(SCATTERBZ)
-
-    UPDATE_C_BZ_ROW_SCATTERED( 8)
-    UPDATE_C_BZ_ROW_SCATTERED( 9)
-    UPDATE_C_BZ_ROW_SCATTERED(10)
-    UPDATE_C_BZ_ROW_SCATTERED(11)
-    UPDATE_C_BZ_ROW_SCATTERED(12)
-    UPDATE_C_BZ_ROW_SCATTERED(13)
-    UPDATE_C_BZ_ROW_SCATTERED(14)
-    UPDATE_C_BZ_ROW_SCATTERED(15)
-    UPDATE_C_BZ_ROW_SCATTERED(16)
-    UPDATE_C_BZ_ROW_SCATTERED(17)
-    UPDATE_C_BZ_ROW_SCATTERED(18)
-    UPDATE_C_BZ_ROW_SCATTERED(19)
-    UPDATE_C_BZ_ROW_SCATTERED(20)
-    UPDATE_C_BZ_ROW_SCATTERED(21)
-    UPDATE_C_BZ_ROW_SCATTERED(22)
-    UPDATE_C_BZ_ROW_SCATTERED(23)
-    UPDATE_C_BZ_ROW_SCATTERED(24)
-    UPDATE_C_BZ_ROW_SCATTERED(25)
-    UPDATE_C_BZ_ROW_SCATTERED(26)
-    UPDATE_C_BZ_ROW_SCATTERED(27)
-    UPDATE_C_BZ_ROW_SCATTERED(28)
-    UPDATE_C_BZ_ROW_SCATTERED(29)
-    UPDATE_C_BZ_ROW_SCATTERED(30)
-    UPDATE_C_BZ_ROW_SCATTERED(31)
-
     LABEL(END)
 
 #ifdef MONITORS
@@ -698,6 +631,8 @@ void bli_sgemm_knl_asm_24x16
       "zmm30", "zmm31", "memory"
     )
 
+    GEMM_UKR_FLUSH_CT( s );
+
 #ifdef LOOPMON
     printf("looptime = \t%d\n", bloopl - tloopl);
 #endif
diff --git a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c
index e52cc9e0e..a3e39c3ac 100644
--- a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c
@@ -39,7 +39,9 @@
 
 void bli_sgemm_penryn_asm_8x4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        float*     restrict alpha,
        float*     restrict a,
        float*     restrict b,
@@ -54,38 +56,40 @@ void bli_sgemm_penryn_asm_8x4
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT_ALIGNED( s, 8, 4, false, 16 );
+
 	begin_asm()
-		
-		
+
+
 		mov(var(a), rax) // load address of a.
 		mov(var(b), rbx) // load address of b.
 		mov(var(b_next), r9) // load address of b_next.
-		
+
 		sub(imm(0-8*16), rax) // increment pointers to allow byte
 		sub(imm(0-8*16), rbx) // offsets in the unrolled iterations.
-		
+
 		movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements
 		movaps(mem(rax, -7*16), xmm1) // of a and b.
 		movaps(mem(rbx, -8*16), xmm2)
-		
+
 		mov(var(c), rcx) // load address of c
 		mov(var(cs_c), rdi) // load cs_c
 		lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
 		mov(rdi, r12) // make a copy of cs_c (in bytes)
 		lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
-		
+
 		prefetch(2, mem(r9, 0*4)) // prefetch b_next
-		
+
 		xorps(xmm3, xmm3)
 		xorps(xmm4, xmm4)
 		xorps(xmm5, xmm5)
 		xorps(xmm6, xmm6)
-		
+
 		prefetch(2, mem(rcx, 6*4)) // prefetch c + 0*cs_c
 		xorps(xmm8, xmm8)
 		xorps(xmm9, xmm9)
@@ -98,33 +102,33 @@ void bli_sgemm_penryn_asm_8x4
 		prefetch(2, mem(r10, rdi, 1, 6*4)) // prefetch c + 3*cs_c
 		xorps(xmm14, xmm14)
 		xorps(xmm15, xmm15)
-		
-		
-		
+
+
+
 		mov(var(k_iter), rsi) // i = k_iter;
 		test(rsi, rsi) // check i via logical AND.
 		je(.SCONSIDKLEFT) // if i == 0, jump to code that
 		 // contains the k_left loop.
-		
-		
+
+
 		label(.SLOOPKITER) // MAIN LOOP
-		
+
 		prefetch(0, mem(rax, (4*35+1)*8))
-		
+
 		addps(xmm6, xmm10) // iteration 0
 		addps(xmm3, xmm14)
 		movaps(xmm2, xmm3)
 		pshufd(imm(0x39), xmm2, xmm7)
 		mulps(xmm0, xmm2)
 		mulps(xmm1, xmm3)
-		
+
 		addps(xmm4, xmm11)
 		addps(xmm5, xmm15)
 		movaps(xmm7, xmm5)
 		pshufd(imm(0x39), xmm7, xmm6)
 		mulps(xmm0, xmm7)
 		mulps(xmm1, xmm5)
-		
+
 		addps(xmm2, xmm8)
 		movaps(mem(rbx, -7*16), xmm2)
 		addps(xmm3, xmm12)
@@ -132,7 +136,7 @@ void bli_sgemm_penryn_asm_8x4
 		pshufd(imm(0x39), xmm6, xmm4)
 		mulps(xmm0, xmm6)
 		mulps(xmm1, xmm3)
-		
+
 		addps(xmm7, xmm9)
 		addps(xmm5, xmm13)
 		movaps(xmm4, xmm5)
@@ -140,22 +144,22 @@ void bli_sgemm_penryn_asm_8x4
 		movaps(mem(rax, -6*16), xmm0)
 		mulps(xmm1, xmm5)
 		movaps(mem(rax, -5*16), xmm1)
-		
-		
+
+
 		addps(xmm6, xmm10) // iteration 1
 		addps(xmm3, xmm14)
 		movaps(xmm2, xmm3)
 		pshufd(imm(0x39), xmm2, xmm7)
 		mulps(xmm0, xmm2)
 		mulps(xmm1, xmm3)
-		
+
 		addps(xmm4, xmm11)
 		addps(xmm5, xmm15)
 		movaps(xmm7, xmm5)
 		pshufd(imm(0x39), xmm7, xmm6)
 		mulps(xmm0, xmm7)
 		mulps(xmm1, xmm5)
-		
+
 		addps(xmm2, xmm8)
 		movaps(mem(rbx, -6*16), xmm2)
 		addps(xmm3, xmm12)
@@ -163,7 +167,7 @@ void bli_sgemm_penryn_asm_8x4
 		pshufd(imm(0x39), xmm6, xmm4)
 		mulps(xmm0, xmm6)
 		mulps(xmm1, xmm3)
-		
+
 		addps(xmm7, xmm9)
 		addps(xmm5, xmm13)
 		movaps(xmm4, xmm5)
@@ -171,22 +175,22 @@ void bli_sgemm_penryn_asm_8x4
 		movaps(mem(rax, -4*16), xmm0)
 		mulps(xmm1, xmm5)
 		movaps(mem(rax, -3*16), xmm1)
-		
-		
+
+
 		addps(xmm6, xmm10) // iteration 2
 		addps(xmm3, xmm14)
 		movaps(xmm2, xmm3)
 		pshufd(imm(0x39), xmm2, xmm7)
 		mulps(xmm0, xmm2)
 		mulps(xmm1, xmm3)
-		
+
 		addps(xmm4, xmm11)
 		addps(xmm5, xmm15)
 		movaps(xmm7, xmm5)
 		pshufd(imm(0x39), xmm7, xmm6)
 		mulps(xmm0, xmm7)
 		mulps(xmm1, xmm5)
-		
+
 		addps(xmm2, xmm8)
 		movaps(mem(rbx, -5*16), xmm2)
 		addps(xmm3, xmm12)
@@ -194,7 +198,7 @@ void bli_sgemm_penryn_asm_8x4
 		pshufd(imm(0x39), xmm6, xmm4)
 		mulps(xmm0, xmm6)
 		mulps(xmm1, xmm3)
-		
+
 		addps(xmm7, xmm9)
 		addps(xmm5, xmm13)
 		movaps(xmm4, xmm5)
@@ -202,26 +206,26 @@ void bli_sgemm_penryn_asm_8x4
 		movaps(mem(rax, -2*16), xmm0)
 		mulps(xmm1, xmm5)
 		movaps(mem(rax, -1*16), xmm1)
-		
-		
+
+
 		addps(xmm6, xmm10) // iteration 3
 		addps(xmm3, xmm14)
 		movaps(xmm2, xmm3)
 		pshufd(imm(0x39), xmm2, xmm7)
 		mulps(xmm0, xmm2)
 		mulps(xmm1, xmm3)
-		
+
 		sub(imm(0-4*8*4), rax) // a += 4*8 (unroll x mr)
-		
+
 		addps(xmm4, xmm11)
 		addps(xmm5, xmm15)
 		movaps(xmm7, xmm5)
 		pshufd(imm(0x39), xmm7, xmm6)
 		mulps(xmm0, xmm7)
 		mulps(xmm1, xmm5)
-		
+
 		sub(imm(0-4*4*4), r9) // b_next += 4*4 (unroll x nr)
-		
+
 		addps(xmm2, xmm8)
 		movaps(mem(rbx, -4*16), xmm2)
 		addps(xmm3, xmm12)
@@ -229,9 +233,9 @@ void bli_sgemm_penryn_asm_8x4
 		pshufd(imm(0x39), xmm6, xmm4)
 		mulps(xmm0, xmm6)
 		mulps(xmm1, xmm3)
-		
+
 		sub(imm(0-4*4*4), rbx) // b += 4*4 (unroll x nr)
-		
+
 		addps(xmm7, xmm9)
 		addps(xmm5, xmm13)
 		movaps(xmm4, xmm5)
@@ -239,40 +243,40 @@ void bli_sgemm_penryn_asm_8x4
 		movaps(mem(rax, -8*16), xmm0)
 		mulps(xmm1, xmm5)
 		movaps(mem(rax, -7*16), xmm1)
-		
+
 		prefetch(2, mem(r9, 0*4)) // prefetch b_next[0]
 		prefetch(2, mem(r9, 16*4)) // prefetch b_next[16]
-		
-		
+
+
 		dec(rsi) // i -= 1;
 		jne(.SLOOPKITER) // iterate again if i != 0.
-		
-		
-		
+
+
+
 		label(.SCONSIDKLEFT)
-		
+
 		mov(var(k_left), rsi) // i = k_left;
 		test(rsi, rsi) // check i via logical AND.
 		je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
 		 // else, we prepare to enter k_left loop.
-		
-		
+
+
 		label(.SLOOPKLEFT) // EDGE LOOP
-		
+
 		addps(xmm6, xmm10) // iteration 0
 		addps(xmm3, xmm14)
 		movaps(xmm2, xmm3)
 		pshufd(imm(0x39), xmm2, xmm7)
 		mulps(xmm0, xmm2)
 		mulps(xmm1, xmm3)
-		
+
 		addps(xmm4, xmm11)
 		addps(xmm5, xmm15)
 		movaps(xmm7, xmm5)
 		pshufd(imm(0x39), xmm7, xmm6)
 		mulps(xmm0, xmm7)
 		mulps(xmm1, xmm5)
-		
+
 		addps(xmm2, xmm8)
 		movaps(mem(rbx, -7*16), xmm2)
 		addps(xmm3, xmm12)
@@ -280,7 +284,7 @@ void bli_sgemm_penryn_asm_8x4
 		pshufd(imm(0x39), xmm6, xmm4)
 		mulps(xmm0, xmm6)
 		mulps(xmm1, xmm3)
-		
+
 		addps(xmm7, xmm9)
 		addps(xmm5, xmm13)
 		movaps(xmm4, xmm5)
@@ -288,40 +292,40 @@ void bli_sgemm_penryn_asm_8x4
 		movaps(mem(rax, -6*16), xmm0)
 		mulps(xmm1, xmm5)
 		movaps(mem(rax, -5*16), xmm1)
-		
+
 		sub(imm(0-1*8*4), rax) // a += 8 (1 x mr)
 		sub(imm(0-1*4*4), rbx) // b += 4 (1 x nr)
-		
-		
+
+
 		dec(rsi) // i -= 1;
 		jne(.SLOOPKLEFT) // iterate again if i != 0.
-		
-		
-		
+
+
+
 		label(.SPOSTACCUM)
-		
+
 		addps(xmm6, xmm10)
 		addps(xmm3, xmm14)
 		addps(xmm4, xmm11)
 		addps(xmm5, xmm15)
-		
-		
+
+
 		mov(var(alpha), rax) // load address of alpha
 		mov(var(beta), rbx) // load address of beta
 		movss(mem(rax), xmm6) // load alpha to bottom 4 bytes of xmm6
 		movss(mem(rbx), xmm7) // load beta to bottom 4 bytes of xmm7
 		pshufd(imm(0x00), xmm6, xmm6) // populate xmm6 with four alphas
 		pshufd(imm(0x00), xmm7, xmm7) // populate xmm7 with four betas
-		
-		
+
+
 		mov(var(rs_c), rsi) // load rs_c
 		mov(rsi, r8) // make a copy of rs_c
-		
+
 		lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
 		lea(mem(rsi, rsi, 2), r11) // r11 = 3*(rs_c * sizeof(float))
-		
+
 		lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
-		
+
 		 // xmm8:   xmm9:   xmm10:  xmm11:
 		 // ( ab00  ( ab01  ( ab02  ( ab03
 		 //   ab11    ab12    ab13    ab10
@@ -338,20 +342,20 @@ void bli_sgemm_penryn_asm_8x4
 		shufps(imm(0xd8), xmm11, xmm8)
 		shufps(imm(0xd8), xmm10, xmm11)
 		shufps(imm(0xd8), xmm4, xmm10)
-		
+
 		movaps(xmm8, xmm4)
 		shufps(imm(0xd8), xmm10, xmm8)
 		shufps(imm(0xd8), xmm4, xmm10)
 		movaps(xmm9, xmm5)
 		shufps(imm(0xd8), xmm11, xmm9)
 		shufps(imm(0xd8), xmm5, xmm11)
-		
+
 		movaps(xmm13, xmm4)
 		shufps(imm(0xd8), xmm12, xmm13)
 		shufps(imm(0xd8), xmm15, xmm12)
 		shufps(imm(0xd8), xmm14, xmm15)
 		shufps(imm(0xd8), xmm4, xmm14)
-		
+
 		movaps(xmm12, xmm4)
 		shufps(imm(0xd8), xmm14, xmm12)
 		shufps(imm(0xd8), xmm4, xmm14)
@@ -369,471 +373,133 @@ void bli_sgemm_penryn_asm_8x4
 		 //   ab50    ab51    ab52    ab53
 		 //   ab60    ab61    ab62    ab63
 		 //   ab70 )  ab71 )  ab72 )  ab73 )
-		
-		
-		
-		 // determine if
-		 //   c      % 16 == 0, AND
-		 //   8*cs_c % 16 == 0, AND
-		 //   rs_c        == 1
-		 // ie: aligned, ldim aligned, and
-		 // column-stored
-		
-		cmp(imm(1), r8) // set ZF if rs_c == 1.
-		sete(bl) // bl = ( ZF == 1 ? 1 : 0 );
-		test(imm(15), rcx) // set ZF if c & 16 is zero.
-		setz(bh) // bh = ( ZF == 1 ? 1 : 0 );
-		test(imm(15), r12) // set ZF if (4*cs_c) & 16 is zero.
-		setz(al) // al = ( ZF == 1 ? 1 : 0 );
-		 // and(bl,bh) followed by
-		 // and(bh,al) will reveal result
-		
+
 		 // now avoid loading C if beta == 0
-		
+
 		xorpd(xmm0, xmm0) // set xmm0 to zero.
 		ucomisd(xmm0, xmm7) // check if beta == 0.
 		je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case
-		
-		
-		 // check if aligned/column-stored
-		and(bl, bh) // set ZF if bl & bh == 1.
-		and(bh, al) // set ZF if bh & al == 1.
-		jne(.SCOLSTORED) // jump to column storage case
-		
-		
-		
-		label(.SGENSTORED)
-		
-		movlps(mem(rcx), xmm0) // load c00 ~ c30
-		movhps(mem(rcx, rsi, 1), xmm0)
-		movlps(mem(rcx, rsi, 2), xmm1)
-		movhps(mem(rcx, r11, 1), xmm1)
-		shufps(imm(0x88), xmm1, xmm0)
-		
-		mulps(xmm6, xmm8) // scale by alpha,
-		mulps(xmm7, xmm0) // scale by beta,
-		addps(xmm8, xmm0) // add the gemm result,
-		
-		movss(xmm0, mem(rcx)) // and store back to memory.
-		pshufd(imm(0x39), xmm0, xmm1)
-		movss(xmm1, mem(rcx, rsi, 1))
-		pshufd(imm(0x39), xmm1, xmm2)
-		movss(xmm2, mem(rcx, rsi, 2))
-		pshufd(imm(0x39), xmm2, xmm3)
-		movss(xmm3, mem(rcx, r11, 1))
-		
-		add(rdi, rcx)
-		
-		
-		movlps(mem(rdx), xmm0) // load c40 ~ c70
-		movhps(mem(rdx, rsi, 1), xmm0)
-		movlps(mem(rdx, rsi, 2), xmm1)
-		movhps(mem(rdx, r11, 1), xmm1)
-		shufps(imm(0x88), xmm1, xmm0)
-		
-		mulps(xmm6, xmm12) // scale by alpha,
-		mulps(xmm7, xmm0) // scale by beta,
-		addps(xmm12, xmm0) // add the gemm result,
-		
-		movss(xmm0, mem(rdx)) // and store back to memory.
-		pshufd(imm(0x39), xmm0, xmm1)
-		movss(xmm1, mem(rdx, rsi, 1))
-		pshufd(imm(0x39), xmm1, xmm2)
-		movss(xmm2, mem(rdx, rsi, 2))
-		pshufd(imm(0x39), xmm2, xmm3)
-		movss(xmm3, mem(rdx, r11, 1))
-		
-		add(rdi, rdx)
-		
-		
-		movlps(mem(rcx), xmm0) // load c01 ~ c31
-		movhps(mem(rcx, rsi, 1), xmm0)
-		movlps(mem(rcx, rsi, 2), xmm1)
-		movhps(mem(rcx, r11, 1), xmm1)
-		shufps(imm(0x88), xmm1, xmm0)
-		
-		mulps(xmm6, xmm9) // scale by alpha,
-		mulps(xmm7, xmm0) // scale by beta,
-		addps(xmm9, xmm0) // add the gemm result,
-		
-		movss(xmm0, mem(rcx)) // and store back to memory.
-		pshufd(imm(0x39), xmm0, xmm1)
-		movss(xmm1, mem(rcx, rsi, 1))
-		pshufd(imm(0x39), xmm1, xmm2)
-		movss(xmm2, mem(rcx, rsi, 2))
-		pshufd(imm(0x39), xmm2, xmm3)
-		movss(xmm3, mem(rcx, r11, 1))
-		
-		add(rdi, rcx)
-		
-		
-		movlps(mem(rdx), xmm0) // load c41 ~ c71
-		movhps(mem(rdx, rsi, 1), xmm0)
-		movlps(mem(rdx, rsi, 2), xmm1)
-		movhps(mem(rdx, r11, 1), xmm1)
-		shufps(imm(0x88), xmm1, xmm0)
-		
-		mulps(xmm6, xmm13) // scale by alpha,
-		mulps(xmm7, xmm0) // scale by beta,
-		addps(xmm13, xmm0) // add the gemm result,
-		
-		movss(xmm0, mem(rdx)) // and store back to memory.
-		pshufd(imm(0x39), xmm0, xmm1)
-		movss(xmm1, mem(rdx, rsi, 1))
-		pshufd(imm(0x39), xmm1, xmm2)
-		movss(xmm2, mem(rdx, rsi, 2))
-		pshufd(imm(0x39), xmm2, xmm3)
-		movss(xmm3, mem(rdx, r11, 1))
-		
-		add(rdi, rdx)
-		
-		
-		movlps(mem(rcx), xmm0) // load c02 ~ c32
-		movhps(mem(rcx, rsi, 1), xmm0)
-		movlps(mem(rcx, rsi, 2), xmm1)
-		movhps(mem(rcx, r11, 1), xmm1)
-		shufps(imm(0x88), xmm1, xmm0)
-		
-		mulps(xmm6, xmm10) // scale by alpha,
-		mulps(xmm7, xmm0) // scale by beta,
-		addps(xmm10, xmm0) // add the gemm result,
-		
-		movss(xmm0, mem(rcx)) // and store back to memory.
-		pshufd(imm(0x39), xmm0, xmm1)
-		movss(xmm1, mem(rcx, rsi, 1))
-		pshufd(imm(0x39), xmm1, xmm2)
-		movss(xmm2, mem(rcx, rsi, 2))
-		pshufd(imm(0x39), xmm2, xmm3)
-		movss(xmm3, mem(rcx, r11, 1))
-		
-		add(rdi, rcx)
-		
-		
-		movlps(mem(rdx), xmm0) // load c42 ~ c72
-		movhps(mem(rdx, rsi, 1), xmm0)
-		movlps(mem(rdx, rsi, 2), xmm1)
-		movhps(mem(rdx, r11, 1), xmm1)
-		shufps(imm(0x88), xmm1, xmm0)
-		
-		mulps(xmm6, xmm14) // scale by alpha,
-		mulps(xmm7, xmm0) // scale by beta,
-		addps(xmm14, xmm0) // add the gemm result,
-		
-		movss(xmm0, mem(rdx)) // and store back to memory.
-		pshufd(imm(0x39), xmm0, xmm1)
-		movss(xmm1, mem(rdx, rsi, 1))
-		pshufd(imm(0x39), xmm1, xmm2)
-		movss(xmm2, mem(rdx, rsi, 2))
-		pshufd(imm(0x39), xmm2, xmm3)
-		movss(xmm3, mem(rdx, r11, 1))
-		
-		add(rdi, rdx)
-		
-		
-		movlps(mem(rcx), xmm0) // load c03 ~ c33
-		movhps(mem(rcx, rsi, 1), xmm0)
-		movlps(mem(rcx, rsi, 2), xmm1)
-		movhps(mem(rcx, r11, 1), xmm1)
-		shufps(imm(0x88), xmm1, xmm0)
-		
-		mulps(xmm6, xmm11) // scale by alpha,
-		mulps(xmm7, xmm0) // scale by beta,
-		addps(xmm11, xmm0) // add the gemm result,
-		
-		movss(xmm0, mem(rcx)) // and store back to memory.
-		pshufd(imm(0x39), xmm0, xmm1)
-		movss(xmm1, mem(rcx, rsi, 1))
-		pshufd(imm(0x39), xmm1, xmm2)
-		movss(xmm2, mem(rcx, rsi, 2))
-		pshufd(imm(0x39), xmm2, xmm3)
-		movss(xmm3, mem(rcx, r11, 1))
-		
-		
-		
-		
-		movlps(mem(rdx), xmm0) // load c43 ~ c73
-		movhps(mem(rdx, rsi, 1), xmm0)
-		movlps(mem(rdx, rsi, 2), xmm1)
-		movhps(mem(rdx, r11, 1), xmm1)
-		shufps(imm(0x88), xmm1, xmm0)
-		
-		mulps(xmm6, xmm15) // scale by alpha,
-		mulps(xmm7, xmm0) // scale by beta,
-		addps(xmm15, xmm0) // add the gemm result,
-		
-		movss(xmm0, mem(rdx)) // and store back to memory.
-		pshufd(imm(0x39), xmm0, xmm1)
-		movss(xmm1, mem(rdx, rsi, 1))
-		pshufd(imm(0x39), xmm1, xmm2)
-		movss(xmm2, mem(rdx, rsi, 2))
-		pshufd(imm(0x39), xmm2, xmm3)
-		movss(xmm3, mem(rdx, r11, 1))
-		
-		
-		
-		
-		jmp(.SDONE) // jump to end.
-		
-		
-		
-		label(.SCOLSTORED)
-		
-		movaps(mem(rcx), xmm0) // load c00 ~ c30,
-		mulps(xmm6, xmm8) // scale by alpha,
-		mulps(xmm7, xmm0) // scale by beta,
-		addps(xmm8, xmm0) // add the gemm result,
-		movaps(xmm0, mem(rcx)) // and store back to memory.
-		add(rdi, rcx)
-		
-		movaps(mem(rdx), xmm1) // load c40 ~ c70,
-		mulps(xmm6, xmm12) // scale by alpha,
-		mulps(xmm7, xmm1) // scale by beta,
-		addps(xmm12, xmm1) // add the gemm result,
-		movaps(xmm1, mem(rdx)) // and store back to memory.
-		add(rdi, rdx)
-		
-		
-		
-		movaps(mem(rcx), xmm0) // load c01 ~ c31,
-		mulps(xmm6, xmm9) // scale by alpha,
-		mulps(xmm7, xmm0) // scale by beta,
-		addps(xmm9, xmm0) // add the gemm result,
-		movaps(xmm0, mem(rcx)) // and store back to memory.
-		add(rdi, rcx)
-		
-		movaps(mem(rdx), xmm1) // load c41 ~ c71,
-		mulps(xmm6, xmm13) // scale by alpha,
-		mulps(xmm7, xmm1) // scale by beta,
-		addps(xmm13, xmm1) // add the gemm result,
-		movaps(xmm1, mem(rdx)) // and store back to memory.
-		add(rdi, rdx)
-		
-		
-		
-		movaps(mem(rcx), xmm0) // load c02 ~ c32,
-		mulps(xmm6, xmm10) // scale by alpha,
-		mulps(xmm7, xmm0) // scale by beta,
-		addps(xmm10, xmm0) // add the gemm result,
-		movaps(xmm0, mem(rcx)) // and store back to memory.
-		add(rdi, rcx)
-		
-		movaps(mem(rdx), xmm1) // load c42 ~ c72,
-		mulps(xmm6, xmm14) // scale by alpha,
-		mulps(xmm7, xmm1) // scale by beta,
-		addps(xmm14, xmm1) // add the gemm result,
-		movaps(xmm1, mem(rdx)) // and store back to memory.
-		add(rdi, rdx)
-		
-		
-		
-		movaps(mem(rcx), xmm0) // load c03 ~ c33,
-		mulps(xmm6, xmm11) // scale by alpha,
-		mulps(xmm7, xmm0) // scale by beta,
-		addps(xmm11, xmm0) // add the gemm result,
-		movaps(xmm0, mem(rcx)) // and store back to memory.
-		
-		
-		movaps(mem(rdx), xmm1) // load c43 ~ c73,
-		mulps(xmm6, xmm15) // scale by alpha,
-		mulps(xmm7, xmm1) // scale by beta,
-		addps(xmm15, xmm1) // add the gemm result,
-		movaps(xmm1, mem(rdx)) // and store back to memory.
-		
-		jmp(.SDONE) // jump to end.
-		
-		
-		
-		
+
+			movaps(mem(rcx), xmm0) // load c00 ~ c30,
+			mulps(xmm6, xmm8) // scale by alpha,
+			mulps(xmm7, xmm0) // scale by beta,
+			addps(xmm8, xmm0) // add the gemm result,
+			movaps(xmm0, mem(rcx)) // and store back to memory.
+			add(rdi, rcx)
+
+			movaps(mem(rdx), xmm1) // load c40 ~ c70,
+			mulps(xmm6, xmm12) // scale by alpha,
+			mulps(xmm7, xmm1) // scale by beta,
+			addps(xmm12, xmm1) // add the gemm result,
+			movaps(xmm1, mem(rdx)) // and store back to memory.
+			add(rdi, rdx)
+
+
+
+			movaps(mem(rcx), xmm0) // load c01 ~ c31,
+			mulps(xmm6, xmm9) // scale by alpha,
+			mulps(xmm7, xmm0) // scale by beta,
+			addps(xmm9, xmm0) // add the gemm result,
+			movaps(xmm0, mem(rcx)) // and store back to memory.
+			add(rdi, rcx)
+
+			movaps(mem(rdx), xmm1) // load c41 ~ c71,
+			mulps(xmm6, xmm13) // scale by alpha,
+			mulps(xmm7, xmm1) // scale by beta,
+			addps(xmm13, xmm1) // add the gemm result,
+			movaps(xmm1, mem(rdx)) // and store back to memory.
+			add(rdi, rdx)
+
+
+
+			movaps(mem(rcx), xmm0) // load c02 ~ c32,
+			mulps(xmm6, xmm10) // scale by alpha,
+			mulps(xmm7, xmm0) // scale by beta,
+			addps(xmm10, xmm0) // add the gemm result,
+			movaps(xmm0, mem(rcx)) // and store back to memory.
+			add(rdi, rcx)
+
+			movaps(mem(rdx), xmm1) // load c42 ~ c72,
+			mulps(xmm6, xmm14) // scale by alpha,
+			mulps(xmm7, xmm1) // scale by beta,
+			addps(xmm14, xmm1) // add the gemm result,
+			movaps(xmm1, mem(rdx)) // and store back to memory.
+			add(rdi, rdx)
+
+
+
+			movaps(mem(rcx), xmm0) // load c03 ~ c33,
+			mulps(xmm6, xmm11) // scale by alpha,
+			mulps(xmm7, xmm0) // scale by beta,
+			addps(xmm11, xmm0) // add the gemm result,
+			movaps(xmm0, mem(rcx)) // and store back to memory.
+
+
+			movaps(mem(rdx), xmm1) // load c43 ~ c73,
+			mulps(xmm6, xmm15) // scale by alpha,
+			mulps(xmm7, xmm1) // scale by beta,
+			addps(xmm15, xmm1) // add the gemm result,
+			movaps(xmm1, mem(rdx)) // and store back to memory.
+
+			jmp(.SDONE) // jump to end.
+
 		label(.SBETAZERO)
-		 // check if aligned/column-stored
-		and(bl, bh) // set ZF if bl & bh == 1.
-		and(bh, al) // set ZF if bh & al == 1.
-		jne(.SCOLSTORBZ) // jump to column storage case
-		
-		
-		
-		label(.SGENSTORBZ)
-		
-		mulps(xmm6, xmm8) // scale by alpha,
-		movaps(xmm8, xmm0)
-		
-		movss(xmm0, mem(rcx)) // and store back to memory.
-		pshufd(imm(0x39), xmm0, xmm1)
-		movss(xmm1, mem(rcx, rsi, 1))
-		pshufd(imm(0x39), xmm1, xmm2)
-		movss(xmm2, mem(rcx, rsi, 2))
-		pshufd(imm(0x39), xmm2, xmm3)
-		movss(xmm3, mem(rcx, r11, 1))
-		
-		add(rdi, rcx)
-		
-		
-		mulps(xmm6, xmm12) // scale by alpha,
-		movaps(xmm12, xmm0)
-		
-		movss(xmm0, mem(rdx)) // and store back to memory.
-		pshufd(imm(0x39), xmm0, xmm1)
-		movss(xmm1, mem(rdx, rsi, 1))
-		pshufd(imm(0x39), xmm1, xmm2)
-		movss(xmm2, mem(rdx, rsi, 2))
-		pshufd(imm(0x39), xmm2, xmm3)
-		movss(xmm3, mem(rdx, r11, 1))
-		
-		add(rdi, rdx)
-		
-		
-		mulps(xmm6, xmm9) // scale by alpha,
-		movaps(xmm9, xmm0)
-		
-		movss(xmm0, mem(rcx)) // and store back to memory.
-		pshufd(imm(0x39), xmm0, xmm1)
-		movss(xmm1, mem(rcx, rsi, 1))
-		pshufd(imm(0x39), xmm1, xmm2)
-		movss(xmm2, mem(rcx, rsi, 2))
-		pshufd(imm(0x39), xmm2, xmm3)
-		movss(xmm3, mem(rcx, r11, 1))
-		
-		add(rdi, rcx)
-		
-		
-		mulps(xmm6, xmm13) // scale by alpha,
-		movaps(xmm13, xmm0)
-		
-		movss(xmm0, mem(rdx)) // and store back to memory.
-		pshufd(imm(0x39), xmm0, xmm1)
-		movss(xmm1, mem(rdx, rsi, 1))
-		pshufd(imm(0x39), xmm1, xmm2)
-		movss(xmm2, mem(rdx, rsi, 2))
-		pshufd(imm(0x39), xmm2, xmm3)
-		movss(xmm3, mem(rdx, r11, 1))
-		
-		add(rdi, rdx)
-		
-		
-		mulps(xmm6, xmm10) // scale by alpha,
-		movaps(xmm10, xmm0)
-		
-		movss(xmm0, mem(rcx)) // and store back to memory.
-		pshufd(imm(0x39), xmm0, xmm1)
-		movss(xmm1, mem(rcx, rsi, 1))
-		pshufd(imm(0x39), xmm1, xmm2)
-		movss(xmm2, mem(rcx, rsi, 2))
-		pshufd(imm(0x39), xmm2, xmm3)
-		movss(xmm3, mem(rcx, r11, 1))
-		
-		add(rdi, rcx)
-		
-		
-		mulps(xmm6, xmm14) // scale by alpha,
-		movaps(xmm14, xmm0)
-		
-		movss(xmm0, mem(rdx)) // and store back to memory.
-		pshufd(imm(0x39), xmm0, xmm1)
-		movss(xmm1, mem(rdx, rsi, 1))
-		pshufd(imm(0x39), xmm1, xmm2)
-		movss(xmm2, mem(rdx, rsi, 2))
-		pshufd(imm(0x39), xmm2, xmm3)
-		movss(xmm3, mem(rdx, r11, 1))
-		
-		add(rdi, rdx)
-		
-		
-		mulps(xmm6, xmm11) // scale by alpha,
-		movaps(xmm11, xmm0)
-		
-		movss(xmm0, mem(rcx)) // and store back to memory.
-		pshufd(imm(0x39), xmm0, xmm1)
-		movss(xmm1, mem(rcx, rsi, 1))
-		pshufd(imm(0x39), xmm1, xmm2)
-		movss(xmm2, mem(rcx, rsi, 2))
-		pshufd(imm(0x39), xmm2, xmm3)
-		movss(xmm3, mem(rcx, r11, 1))
-		
-		
-		
-		
-		mulps(xmm6, xmm15) // scale by alpha,
-		movaps(xmm15, xmm0)
-		
-		movss(xmm0, mem(rdx)) // and store back to memory.
-		pshufd(imm(0x39), xmm0, xmm1)
-		movss(xmm1, mem(rdx, rsi, 1))
-		pshufd(imm(0x39), xmm1, xmm2)
-		movss(xmm2, mem(rdx, rsi, 2))
-		pshufd(imm(0x39), xmm2, xmm3)
-		movss(xmm3, mem(rdx, r11, 1))
-		
-		
-		
-		
-		jmp(.SDONE) // jump to end.
-		
-		
-		
-		label(.SCOLSTORBZ)
-		
-		 // skip loading c00 ~ c30,
-		mulps(xmm6, xmm8) // scale by alpha,
-		movaps(xmm8, mem(rcx)) // and store back to memory.
-		add(rdi, rcx)
-		 // skip loading c40 ~ c70,
-		mulps(xmm6, xmm12) // scale by alpha,
-		movaps(xmm12, mem(rdx)) // and store back to memory.
-		add(rdi, rdx)
-		
-		
-		 // skip loading c01 ~ c31,
-		mulps(xmm6, xmm9) // scale by alpha,
-		movaps(xmm9, mem(rcx)) // and store back to memory.
-		add(rdi, rcx)
-		 // skip loading c41 ~ c71,
-		mulps(xmm6, xmm13) // scale by alpha,
-		movaps(xmm13, mem(rdx)) // and store back to memory.
-		add(rdi, rdx)
-		
-		
-		 // skip loading c02 ~ c32,
-		mulps(xmm6, xmm10) // scale by alpha,
-		movaps(xmm10, mem(rcx)) // and store back to memory.
-		add(rdi, rcx)
-		 // skip loading c42 ~ c72,
-		mulps(xmm6, xmm14) // scale by alpha,
-		movaps(xmm14, mem(rdx)) // and store back to memory.
-		add(rdi, rdx)
-		
-		
-		 // skip loading c03 ~ c33,
-		mulps(xmm6, xmm11) // scale by alpha,
-		movaps(xmm11, mem(rcx)) // and store back to memory.
-		
-		 // skip loading c43 ~ c73,
-		mulps(xmm6, xmm15) // scale by alpha,
-		movaps(xmm15, mem(rdx)) // and store back to memory.
-		
-		
-		
-		
-		
-		
-		
-		
+
+			 // skip loading c00 ~ c30,
+			mulps(xmm6, xmm8) // scale by alpha,
+			movaps(xmm8, mem(rcx)) // and store back to memory.
+			add(rdi, rcx)
+			 // skip loading c40 ~ c70,
+			mulps(xmm6, xmm12) // scale by alpha,
+			movaps(xmm12, mem(rdx)) // and store back to memory.
+			add(rdi, rdx)
+
+
+			 // skip loading c01 ~ c31,
+			mulps(xmm6, xmm9) // scale by alpha,
+			movaps(xmm9, mem(rcx)) // and store back to memory.
+			add(rdi, rcx)
+			 // skip loading c41 ~ c71,
+			mulps(xmm6, xmm13) // scale by alpha,
+			movaps(xmm13, mem(rdx)) // and store back to memory.
+			add(rdi, rdx)
+
+
+			 // skip loading c02 ~ c32,
+			mulps(xmm6, xmm10) // scale by alpha,
+			movaps(xmm10, mem(rcx)) // and store back to memory.
+			add(rdi, rcx)
+			 // skip loading c42 ~ c72,
+			mulps(xmm6, xmm14) // scale by alpha,
+			movaps(xmm14, mem(rdx)) // and store back to memory.
+			add(rdi, rdx)
+
+
+			 // skip loading c03 ~ c33,
+			mulps(xmm6, xmm11) // scale by alpha,
+			movaps(xmm11, mem(rcx)) // and store back to memory.
+
+			 // skip loading c43 ~ c73,
+			mulps(xmm6, xmm15) // scale by alpha,
+			movaps(xmm15, mem(rdx)) // and store back to memory.
+
 		label(.SDONE)
-		
 
-    end_asm(
+
+	end_asm(
 		: // output operands (none)
 		: // input operands
-	      [k_iter] "m" (k_iter), // 0
-	      [k_left] "m" (k_left), // 1
-	      [a]      "m" (a),      // 2
-	      [b]      "m" (b),      // 3
-	      [alpha]  "m" (alpha),  // 4
-	      [beta]   "m" (beta),   // 5
-	      [c]      "m" (c),      // 6
-	      [rs_c]   "m" (rs_c),   // 7
-	      [cs_c]   "m" (cs_c),   // 8
-	      [b_next] "m" (b_next)/*, // 9
-	      [a_next] "m" (a_next)*/  // 10
+		  [k_iter] "m" (k_iter), // 0
+		  [k_left] "m" (k_left), // 1
+		  [a]      "m" (a),      // 2
+		  [b]      "m" (b),      // 3
+		  [alpha]  "m" (alpha),  // 4
+		  [beta]   "m" (beta),   // 5
+		  [c]      "m" (c),      // 6
+		  [rs_c]   "m" (rs_c),   // 7
+		  [cs_c]   "m" (cs_c),   // 8
+		  [b_next] "m" (b_next)/*, // 9
+		  [a_next] "m" (a_next)*/  // 10
 		: // register clobber list
 		  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12",
 		  "xmm0", "xmm1", "xmm2", "xmm3",
@@ -842,11 +508,15 @@ void bli_sgemm_penryn_asm_8x4
 		  "xmm12", "xmm13", "xmm14", "xmm15",
 		  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( s );
 }
 
 void bli_dgemm_penryn_asm_4x4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        double*    restrict alpha,
        double*    restrict a,
        double*    restrict b,
@@ -861,39 +531,41 @@ void bli_dgemm_penryn_asm_4x4
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT_ALIGNED( d, 4, 4, false, 16 );
+
 	begin_asm()
-		
-		
+
+
 		mov(var(a), rax) // load address of a.
 		mov(var(b), rbx) // load address of b.
 		mov(var(b_next), r9) // load address of b_next.
 		mov(var(a_next), r11) // load address of a_next.
-		
+
 		sub(imm(0-8*16), rax) // increment pointers to allow byte
 		sub(imm(0-8*16), rbx) // offsets in the unrolled iterations.
-		
+
 		movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements
 		movaps(mem(rax, -7*16), xmm1) // of a and b.
 		movaps(mem(rbx, -8*16), xmm2)
-		
+
 		mov(var(c), rcx) // load address of c
 		mov(var(cs_c), rdi) // load cs_c
 		lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
 		mov(rdi, r12) // make a copy of cs_c (in bytes)
 		lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
-		
+
 		prefetch(2, mem(r9, 0*8)) // prefetch b_next
-		
+
 		xorpd(xmm3, xmm3)
 		xorpd(xmm4, xmm4)
 		xorpd(xmm5, xmm5)
 		xorpd(xmm6, xmm6)
-		
+
 		prefetch(2, mem(rcx, 3*8)) // prefetch c + 0*cs_c
 		xorpd(xmm8, xmm8)
 		xorpd(xmm9, xmm9)
@@ -906,22 +578,22 @@ void bli_dgemm_penryn_asm_4x4
 		prefetch(2, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c
 		xorpd(xmm14, xmm14)
 		xorpd(xmm15, xmm15)
-		
-		
-		
+
+
+
 		mov(var(k_iter), rsi) // i = k_iter;
 		test(rsi, rsi) // check i via logical AND.
 		je(.DCONSIDKLEFT) // if i == 0, jump to code that
 		 // contains the k_left loop.
-		
-		
+
+
 		label(.DLOOPKITER) // MAIN LOOP
-		
+
 		prefetch(0, mem(rax, (4*35+1)*8))
 		//prefetch(0, mem(rax, (8*97+4)*8))
-		
+
 		//prefetch(0, mem(r11, 67*4*8)) // prefetch a_next[0]
-		
+
 		addpd(xmm3, xmm11) // iteration 0
 		movaps(mem(rbx, -7*16), xmm3)
 		addpd(xmm4, xmm15)
@@ -929,13 +601,13 @@ void bli_dgemm_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm2, xmm7)
 		mulpd(xmm0, xmm2)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
 		movaps(xmm7, xmm6)
 		mulpd(xmm0, xmm7)
 		mulpd(xmm1, xmm6)
-		
+
 		addpd(xmm2, xmm9)
 		movaps(mem(rbx, -6*16), xmm2)
 		addpd(xmm4, xmm13)
@@ -943,7 +615,7 @@ void bli_dgemm_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm3, xmm5)
 		mulpd(xmm0, xmm3)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm7, xmm8)
 		addpd(xmm6, xmm12)
 		movaps(xmm5, xmm6)
@@ -951,9 +623,9 @@ void bli_dgemm_penryn_asm_4x4
 		movaps(mem(rax, -6*16), xmm0)
 		mulpd(xmm1, xmm6)
 		movaps(mem(rax, -5*16), xmm1)
-		
-		
-		
+
+
+
 		addpd(xmm3, xmm11) // iteration 1
 		movaps(mem(rbx, -5*16), xmm3)
 		addpd(xmm4, xmm15)
@@ -961,13 +633,13 @@ void bli_dgemm_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm2, xmm7)
 		mulpd(xmm0, xmm2)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
 		movaps(xmm7, xmm6)
 		mulpd(xmm0, xmm7)
 		mulpd(xmm1, xmm6)
-		
+
 		addpd(xmm2, xmm9)
 		movaps(mem(rbx, -4*16), xmm2)
 		addpd(xmm4, xmm13)
@@ -975,7 +647,7 @@ void bli_dgemm_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm3, xmm5)
 		mulpd(xmm0, xmm3)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm7, xmm8)
 		addpd(xmm6, xmm12)
 		movaps(xmm5, xmm6)
@@ -983,16 +655,16 @@ void bli_dgemm_penryn_asm_4x4
 		movaps(mem(rax, -4*16), xmm0)
 		mulpd(xmm1, xmm6)
 		movaps(mem(rax, -3*16), xmm1)
-		
-		
+
+
 		prefetch(0, mem(rax, (4*37+1)*8))
 		//prefetch(0, mem(rax, (8*97+12)*8))
-		
+
 		//prefetch(0, mem(r11, 69*4*8)) // prefetch a_next[8]
 		//sub(imm(-4*4*8), r11) // a_next += 4*4 (unroll x mr)
-		
-		
-		
+
+
+
 		addpd(xmm3, xmm11) // iteration 2
 		movaps(mem(rbx, -3*16), xmm3)
 		addpd(xmm4, xmm15)
@@ -1000,13 +672,13 @@ void bli_dgemm_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm2, xmm7)
 		mulpd(xmm0, xmm2)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
 		movaps(xmm7, xmm6)
 		mulpd(xmm0, xmm7)
 		mulpd(xmm1, xmm6)
-		
+
 		addpd(xmm2, xmm9)
 		movaps(mem(rbx, -2*16), xmm2)
 		addpd(xmm4, xmm13)
@@ -1014,8 +686,8 @@ void bli_dgemm_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm3, xmm5)
 		mulpd(xmm0, xmm3)
 		mulpd(xmm1, xmm4)
-		
-		
+
+
 		addpd(xmm7, xmm8)
 		addpd(xmm6, xmm12)
 		movaps(xmm5, xmm6)
@@ -1023,9 +695,9 @@ void bli_dgemm_penryn_asm_4x4
 		movaps(mem(rax, -2*16), xmm0)
 		mulpd(xmm1, xmm6)
 		movaps(mem(rax, -1*16), xmm1)
-		
-		
-		
+
+
+
 		addpd(xmm3, xmm11) // iteration 3
 		movaps(mem(rbx, -1*16), xmm3)
 		addpd(xmm4, xmm15)
@@ -1033,17 +705,17 @@ void bli_dgemm_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm2, xmm7)
 		mulpd(xmm0, xmm2)
 		mulpd(xmm1, xmm4)
-		
+
 		sub(imm(0-4*4*8), rax) // a += 4*4 (unroll x mr)
-		
+
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
 		movaps(xmm7, xmm6)
 		mulpd(xmm0, xmm7)
 		mulpd(xmm1, xmm6)
-		
+
 		sub(imm(0-4*4*8), r9) // b_next += 4*4 (unroll x nr)
-		
+
 		addpd(xmm2, xmm9)
 		movaps(mem(rbx, 0*16), xmm2)
 		addpd(xmm4, xmm13)
@@ -1051,9 +723,9 @@ void bli_dgemm_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm3, xmm5)
 		mulpd(xmm0, xmm3)
 		mulpd(xmm1, xmm4)
-		
+
 		sub(imm(0-4*4*8), rbx) // b += 4*4 (unroll x nr)
-		
+
 		addpd(xmm7, xmm8)
 		addpd(xmm6, xmm12)
 		movaps(xmm5, xmm6)
@@ -1061,29 +733,29 @@ void bli_dgemm_penryn_asm_4x4
 		movaps(mem(rax, -8*16), xmm0)
 		mulpd(xmm1, xmm6)
 		movaps(mem(rax, -7*16), xmm1)
-		
+
 		prefetch(2, mem(r9, 0*8)) // prefetch b_next[0]
 		prefetch(2, mem(r9, 8*8)) // prefetch b_next[8]
-		
+
 		dec(rsi) // i -= 1;
 		jne(.DLOOPKITER) // iterate again if i != 0.
-		
-		
-		
+
+
+
 		//prefetch(2, mem(r9, -8*8)) // prefetch b_next[-8]
-		
-		
-		
+
+
+
 		label(.DCONSIDKLEFT)
-		
+
 		mov(var(k_left), rsi) // i = k_left;
 		test(rsi, rsi) // check i via logical AND.
 		je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
 		 // else, we prepare to enter k_left loop.
-		
-		
+
+
 		label(.DLOOPKLEFT) // EDGE LOOP
-		
+
 		addpd(xmm3, xmm11) // iteration 0
 		movaps(mem(rbx, -7*16), xmm3)
 		addpd(xmm4, xmm15)
@@ -1091,13 +763,13 @@ void bli_dgemm_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm2, xmm7)
 		mulpd(xmm0, xmm2)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
 		movaps(xmm7, xmm6)
 		mulpd(xmm0, xmm7)
 		mulpd(xmm1, xmm6)
-		
+
 		addpd(xmm2, xmm9)
 		movaps(mem(rbx, -6*16), xmm2)
 		addpd(xmm4, xmm13)
@@ -1105,7 +777,7 @@ void bli_dgemm_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm3, xmm5)
 		mulpd(xmm0, xmm3)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm7, xmm8)
 		addpd(xmm6, xmm12)
 		movaps(xmm5, xmm6)
@@ -1113,38 +785,38 @@ void bli_dgemm_penryn_asm_4x4
 		movaps(mem(rax, -6*16), xmm0)
 		mulpd(xmm1, xmm6)
 		movaps(mem(rax, -5*16), xmm1)
-		
-		
+
+
 		sub(imm(0-4*1*8), rax) // a += 4 (1 x mr)
 		sub(imm(0-4*1*8), rbx) // b += 4 (1 x nr)
-		
-		
+
+
 		dec(rsi) // i -= 1;
 		jne(.DLOOPKLEFT) // iterate again if i != 0.
-		
-		
-		
+
+
+
 		label(.DPOSTACCUM)
-		
+
 		addpd(xmm3, xmm11)
 		addpd(xmm4, xmm15)
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
-		
-		
+
+
 		mov(var(alpha), rax) // load address of alpha
 		mov(var(beta), rbx) // load address of beta
 		movddup(mem(rax), xmm6) // load alpha and duplicate
 		movddup(mem(rbx), xmm7) // load beta and duplicate
-		
-		
+
+
 		mov(var(rs_c), rsi) // load rs_c
 		mov(rsi, r8) // make a copy of rs_c
-		
+
 		lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double)
-		
+
 		lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c;
-		
+
 		 // xmm8:   xmm9:   xmm10:  xmm11:
 		 // ( ab01  ( ab00  ( ab03  ( ab02
 		 //   ab10 )  ab11 )  ab12 )  ab13 )
@@ -1155,15 +827,15 @@ void bli_dgemm_penryn_asm_4x4
 		movaps(xmm8, xmm0)
 		movsd(xmm9, xmm8)
 		movsd(xmm0, xmm9)
-		
+
 		movaps(xmm10, xmm0)
 		movsd(xmm11, xmm10)
 		movsd(xmm0, xmm11)
-		
+
 		movaps(xmm12, xmm0)
 		movsd(xmm13, xmm12)
 		movsd(xmm0, xmm13)
-		
+
 		movaps(xmm14, xmm0)
 		movsd(xmm15, xmm14)
 		movsd(xmm0, xmm15)
@@ -1174,313 +846,133 @@ void bli_dgemm_penryn_asm_4x4
 		 // xmm12:  xmm13:  xmm14:  xmm15:
 		 // ( ab20  ( ab21  ( ab22  ( ab23
 		 //   ab30 )  ab31 )  ab32 )  ab33 )
-		
-		
-		
-		 // determine if
-		 //   c      % 16 == 0, AND
-		 //   8*cs_c % 16 == 0, AND
-		 //   rs_c        == 1
-		 // ie: aligned, ldim aligned, and
-		 // column-stored
-		
-		cmp(imm(1), r8) // set ZF if rs_c == 1.
-		sete(bl) // bl = ( ZF == 1 ? 1 : 0 );
-		test(imm(15), rcx) // set ZF if c & 16 is zero.
-		setz(bh) // bh = ( ZF == 1 ? 1 : 0 );
-		test(imm(15), r12) // set ZF if (8*cs_c) & 16 is zero.
-		setz(al) // al = ( ZF == 1 ? 1 : 0 );
-		 // and(bl,bh) followed by
-		 // and(bh,al) will reveal result
-		
+
 		 // now avoid loading C if beta == 0
-		
+
 		xorpd(xmm0, xmm0) // set xmm0 to zero.
 		ucomisd(xmm0, xmm7) // check if beta == 0.
 		je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
-		
-		
-		 // check if aligned/column-stored
-		and(bl, bh) // set ZF if bl & bh == 1.
-		and(bh, al) // set ZF if bh & al == 1.
-		jne(.DCOLSTORED) // jump to column storage case
-		
-		
-		
-		label(.DGENSTORED)
-		
-		movlpd(mem(rcx), xmm0) // load c00 and c10,
-		movhpd(mem(rcx, rsi, 1), xmm0)
-		mulpd(xmm6, xmm8) // scale by alpha,
-		mulpd(xmm7, xmm0) // scale by beta,
-		addpd(xmm8, xmm0) // add the gemm result,
-		movlpd(xmm0, mem(rcx)) // and store back to memory.
-		movhpd(xmm0, mem(rcx, rsi, 1))
-		add(rdi, rcx)
-		
-		movlpd(mem(rdx), xmm1) // load c20 and c30,
-		movhpd(mem(rdx, rsi, 1), xmm1)
-		mulpd(xmm6, xmm12) // scale by alpha,
-		mulpd(xmm7, xmm1) // scale by beta,
-		addpd(xmm12, xmm1) // add the gemm result,
-		movlpd(xmm1, mem(rdx)) // and store back to memory.
-		movhpd(xmm1, mem(rdx, rsi, 1))
-		add(rdi, rdx)
-		
-		
-		
-		movlpd(mem(rcx), xmm0) // load c01 and c11,
-		movhpd(mem(rcx, rsi, 1), xmm0)
-		mulpd(xmm6, xmm9) // scale by alpha,
-		mulpd(xmm7, xmm0) // scale by beta,
-		addpd(xmm9, xmm0) // add the gemm result,
-		movlpd(xmm0, mem(rcx)) // and store back to memory.
-		movhpd(xmm0, mem(rcx, rsi, 1))
-		add(rdi, rcx)
-		
-		movlpd(mem(rdx), xmm1) // load c21 and c31,
-		movhpd(mem(rdx, rsi, 1), xmm1)
-		mulpd(xmm6, xmm13) // scale by alpha,
-		mulpd(xmm7, xmm1) // scale by beta,
-		addpd(xmm13, xmm1) // add the gemm result,
-		movlpd(xmm1, mem(rdx)) // and store back to memory.
-		movhpd(xmm1, mem(rdx, rsi, 1))
-		add(rdi, rdx)
-		
-		
-		
-		movlpd(mem(rcx), xmm0) // load c02 and c12,
-		movhpd(mem(rcx, rsi, 1), xmm0)
-		mulpd(xmm6, xmm10) // scale by alpha,
-		mulpd(xmm7, xmm0) // scale by beta,
-		addpd(xmm10, xmm0) // add the gemm result,
-		movlpd(xmm0, mem(rcx)) // and store back to memory.
-		movhpd(xmm0, mem(rcx, rsi, 1))
-		add(rdi, rcx)
-		
-		movlpd(mem(rdx), xmm1) // load c22 and c32,
-		movhpd(mem(rdx, rsi, 1), xmm1)
-		mulpd(xmm6, xmm14) // scale by alpha,
-		mulpd(xmm7, xmm1) // scale by beta,
-		addpd(xmm14, xmm1) // add the gemm result,
-		movlpd(xmm1, mem(rdx)) // and store back to memory.
-		movhpd(xmm1, mem(rdx, rsi, 1))
-		add(rdi, rdx)
-		
-		
-		
-		movlpd(mem(rcx), xmm0) // load c03 and c13,
-		movhpd(mem(rcx, rsi, 1), xmm0)
-		mulpd(xmm6, xmm11) // scale by alpha,
-		mulpd(xmm7, xmm0) // scale by beta,
-		addpd(xmm11, xmm0) // add the gemm result,
-		movlpd(xmm0, mem(rcx)) // and store back to memory.
-		movhpd(xmm0, mem(rcx, rsi, 1))
-		
-		
-		movlpd(mem(rdx), xmm1) // load c23 and c33,
-		movhpd(mem(rdx, rsi, 1), xmm1)
-		mulpd(xmm6, xmm15) // scale by alpha,
-		mulpd(xmm7, xmm1) // scale by beta,
-		addpd(xmm15, xmm1) // add the gemm result,
-		movlpd(xmm1, mem(rdx)) // and store back to memory.
-		movhpd(xmm1, mem(rdx, rsi, 1))
-		
-		jmp(.DDONE) // jump to end.
-		
-		
-		
-		label(.DCOLSTORED)
-		
-		movaps(mem(rcx), xmm0) // load c00 and c10,
-		mulpd(xmm6, xmm8) // scale by alpha,
-		mulpd(xmm7, xmm0) // scale by beta,
-		addpd(xmm8, xmm0) // add the gemm result,
-		movaps(xmm0, mem(rcx)) // and store back to memory.
-		add(rdi, rcx)
-		
-		movaps(mem(rdx), xmm1) // load c20 and c30,
-		mulpd(xmm6, xmm12) // scale by alpha,
-		mulpd(xmm7, xmm1) // scale by beta,
-		addpd(xmm12, xmm1) // add the gemm result,
-		movaps(xmm1, mem(rdx)) // and store back to memory.
-		add(rdi, rdx)
-		
-		
-		
-		movaps(mem(rcx), xmm0) // load c01 and c11,
-		mulpd(xmm6, xmm9) // scale by alpha,
-		mulpd(xmm7, xmm0) // scale by beta,
-		addpd(xmm9, xmm0) // add the gemm result,
-		movaps(xmm0, mem(rcx)) // and store back to memory.
-		add(rdi, rcx)
-		
-		movaps(mem(rdx), xmm1) // load c21 and c31,
-		mulpd(xmm6, xmm13) // scale by alpha,
-		mulpd(xmm7, xmm1) // scale by beta,
-		addpd(xmm13, xmm1) // add the gemm result,
-		movaps(xmm1, mem(rdx)) // and store back to memory.
-		add(rdi, rdx)
-		
-		
-		
-		movaps(mem(rcx), xmm0) // load c02 and c12,
-		mulpd(xmm6, xmm10) // scale by alpha,
-		mulpd(xmm7, xmm0) // scale by beta,
-		addpd(xmm10, xmm0) // add the gemm result,
-		movaps(xmm0, mem(rcx)) // and store back to memory.
-		add(rdi, rcx)
-		
-		movaps(mem(rdx), xmm1) // load c22 and c32,
-		mulpd(xmm6, xmm14) // scale by alpha,
-		mulpd(xmm7, xmm1) // scale by beta,
-		addpd(xmm14, xmm1) // add the gemm result,
-		movaps(xmm1, mem(rdx)) // and store back to memory.
-		add(rdi, rdx)
-		
-		
-		
-		movaps(mem(rcx), xmm0) // load c03 and c13,
-		mulpd(xmm6, xmm11) // scale by alpha,
-		mulpd(xmm7, xmm0) // scale by beta,
-		addpd(xmm11, xmm0) // add the gemm result,
-		movaps(xmm0, mem(rcx)) // and store back to memory.
-		
-		
-		movaps(mem(rdx), xmm1) // load c23 and c33,
-		mulpd(xmm6, xmm15) // scale by alpha,
-		mulpd(xmm7, xmm1) // scale by beta,
-		addpd(xmm15, xmm1) // add the gemm result,
-		movaps(xmm1, mem(rdx)) // and store back to memory.
-		
-		jmp(.DDONE) // jump to end.
-		
-		
-		
-		
+
+			movaps(mem(rcx), xmm0) // load c00 and c10,
+			mulpd(xmm6, xmm8) // scale by alpha,
+			mulpd(xmm7, xmm0) // scale by beta,
+			addpd(xmm8, xmm0) // add the gemm result,
+			movaps(xmm0, mem(rcx)) // and store back to memory.
+			add(rdi, rcx)
+
+			movaps(mem(rdx), xmm1) // load c20 and c30,
+			mulpd(xmm6, xmm12) // scale by alpha,
+			mulpd(xmm7, xmm1) // scale by beta,
+			addpd(xmm12, xmm1) // add the gemm result,
+			movaps(xmm1, mem(rdx)) // and store back to memory.
+			add(rdi, rdx)
+
+
+
+			movaps(mem(rcx), xmm0) // load c01 and c11,
+			mulpd(xmm6, xmm9) // scale by alpha,
+			mulpd(xmm7, xmm0) // scale by beta,
+			addpd(xmm9, xmm0) // add the gemm result,
+			movaps(xmm0, mem(rcx)) // and store back to memory.
+			add(rdi, rcx)
+
+			movaps(mem(rdx), xmm1) // load c21 and c31,
+			mulpd(xmm6, xmm13) // scale by alpha,
+			mulpd(xmm7, xmm1) // scale by beta,
+			addpd(xmm13, xmm1) // add the gemm result,
+			movaps(xmm1, mem(rdx)) // and store back to memory.
+			add(rdi, rdx)
+
+
+
+			movaps(mem(rcx), xmm0) // load c02 and c12,
+			mulpd(xmm6, xmm10) // scale by alpha,
+			mulpd(xmm7, xmm0) // scale by beta,
+			addpd(xmm10, xmm0) // add the gemm result,
+			movaps(xmm0, mem(rcx)) // and store back to memory.
+			add(rdi, rcx)
+
+			movaps(mem(rdx), xmm1) // load c22 and c32,
+			mulpd(xmm6, xmm14) // scale by alpha,
+			mulpd(xmm7, xmm1) // scale by beta,
+			addpd(xmm14, xmm1) // add the gemm result,
+			movaps(xmm1, mem(rdx)) // and store back to memory.
+			add(rdi, rdx)
+
+
+
+			movaps(mem(rcx), xmm0) // load c03 and c13,
+			mulpd(xmm6, xmm11) // scale by alpha,
+			mulpd(xmm7, xmm0) // scale by beta,
+			addpd(xmm11, xmm0) // add the gemm result,
+			movaps(xmm0, mem(rcx)) // and store back to memory.
+
+
+			movaps(mem(rdx), xmm1) // load c23 and c33,
+			mulpd(xmm6, xmm15) // scale by alpha,
+			mulpd(xmm7, xmm1) // scale by beta,
+			addpd(xmm15, xmm1) // add the gemm result,
+			movaps(xmm1, mem(rdx)) // and store back to memory.
+
+			jmp(.DDONE) // jump to end.
+
 		label(.DBETAZERO)
-		 // check if aligned/column-stored
-		and(bl, bh) // set ZF if bl & bh == 1.
-		and(bh, al) // set ZF if bh & al == 1.
-		jne(.DCOLSTORBZ) // jump to column storage case
-		
-		
-		
-		label(.DGENSTORBZ)
-		 // skip loading c00 and c10,
-		mulpd(xmm6, xmm8) // scale by alpha,
-		movlpd(xmm8, mem(rcx)) // and store back to memory.
-		movhpd(xmm8, mem(rcx, rsi, 1))
-		add(rdi, rcx)
-		 // skip loading c20 and c30,
-		mulpd(xmm6, xmm12) // scale by alpha,
-		movlpd(xmm12, mem(rdx)) // and store back to memory.
-		movhpd(xmm12, mem(rdx, rsi, 1))
-		add(rdi, rdx)
-		
-		
-		 // skip loading c01 and c11,
-		mulpd(xmm6, xmm9) // scale by alpha,
-		movlpd(xmm9, mem(rcx)) // and store back to memory.
-		movhpd(xmm9, mem(rcx, rsi, 1))
-		add(rdi, rcx)
-		 // skip loading c21 and c31,
-		mulpd(xmm6, xmm13) // scale by alpha,
-		movlpd(xmm13, mem(rdx)) // and store back to memory.
-		movhpd(xmm13, mem(rdx, rsi, 1))
-		add(rdi, rdx)
-		
-		
-		 // skip loading c02 and c12,
-		mulpd(xmm6, xmm10) // scale by alpha,
-		movlpd(xmm10, mem(rcx)) // and store back to memory.
-		movhpd(xmm10, mem(rcx, rsi, 1))
-		add(rdi, rcx)
-		 // skip loading c22 and c32,
-		mulpd(xmm6, xmm14) // scale by alpha,
-		movlpd(xmm14, mem(rdx)) // and store back to memory.
-		movhpd(xmm14, mem(rdx, rsi, 1))
-		add(rdi, rdx)
-		
-		
-		 // skip loading c03 and c13,
-		mulpd(xmm6, xmm11) // scale by alpha,
-		movlpd(xmm11, mem(rcx)) // and store back to memory.
-		movhpd(xmm11, mem(rcx, rsi, 1))
-		
-		 // skip loading c23 and c33,
-		mulpd(xmm6, xmm15) // scale by alpha,
-		movlpd(xmm15, mem(rdx)) // and store back to memory.
-		movhpd(xmm15, mem(rdx, rsi, 1))
-		
-		jmp(.DDONE) // jump to end.
-		
-		
-		
-		label(.DCOLSTORBZ)
-		
-		 // skip loading c00 and c10,
-		mulpd(xmm6, xmm8) // scale by alpha,
-		movaps(xmm8, mem(rcx)) // and store back to memory.
-		add(rdi, rcx)
-		 // skip loading c20 and c30,
-		mulpd(xmm6, xmm12) // scale by alpha,
-		movaps(xmm12, mem(rdx)) // and store back to memory.
-		add(rdi, rdx)
-		
-		
-		 // skip loading c01 and c11,
-		mulpd(xmm6, xmm9) // scale by alpha,
-		movaps(xmm9, mem(rcx)) // and store back to memory.
-		add(rdi, rcx)
-		 // skip loading c21 and c31,
-		mulpd(xmm6, xmm13) // scale by alpha,
-		movaps(xmm13, mem(rdx)) // and store back to memory.
-		add(rdi, rdx)
-		
-		
-		 // skip loading c02 and c12,
-		mulpd(xmm6, xmm10) // scale by alpha,
-		movaps(xmm10, mem(rcx)) // and store back to memory.
-		add(rdi, rcx)
-		 // skip loading c22 and c32,
-		mulpd(xmm6, xmm14) // scale by alpha,
-		movaps(xmm14, mem(rdx)) // and store back to memory.
-		add(rdi, rdx)
-		
-		
-		 // skip loading c03 and c13,
-		mulpd(xmm6, xmm11) // scale by alpha,
-		movaps(xmm11, mem(rcx)) // and store back to memory.
-		
-		 // skip loading c23 and c33,
-		mulpd(xmm6, xmm15) // scale by alpha,
-		movaps(xmm15, mem(rdx)) // and store back to memory.
-		
-		
-		
-		
-		
-		
-		
-		
+
+			 // skip loading c00 and c10,
+			mulpd(xmm6, xmm8) // scale by alpha,
+			movaps(xmm8, mem(rcx)) // and store back to memory.
+			add(rdi, rcx)
+			 // skip loading c20 and c30,
+			mulpd(xmm6, xmm12) // scale by alpha,
+			movaps(xmm12, mem(rdx)) // and store back to memory.
+			add(rdi, rdx)
+
+
+			 // skip loading c01 and c11,
+			mulpd(xmm6, xmm9) // scale by alpha,
+			movaps(xmm9, mem(rcx)) // and store back to memory.
+			add(rdi, rcx)
+			 // skip loading c21 and c31,
+			mulpd(xmm6, xmm13) // scale by alpha,
+			movaps(xmm13, mem(rdx)) // and store back to memory.
+			add(rdi, rdx)
+
+
+			 // skip loading c02 and c12,
+			mulpd(xmm6, xmm10) // scale by alpha,
+			movaps(xmm10, mem(rcx)) // and store back to memory.
+			add(rdi, rcx)
+			 // skip loading c22 and c32,
+			mulpd(xmm6, xmm14) // scale by alpha,
+			movaps(xmm14, mem(rdx)) // and store back to memory.
+			add(rdi, rdx)
+
+
+			 // skip loading c03 and c13,
+			mulpd(xmm6, xmm11) // scale by alpha,
+			movaps(xmm11, mem(rcx)) // and store back to memory.
+
+			 // skip loading c23 and c33,
+			mulpd(xmm6, xmm15) // scale by alpha,
+			movaps(xmm15, mem(rdx)) // and store back to memory.
+
 		label(.DDONE)
-		
 
-    end_asm(
+
+	end_asm(
 		: // output operands (none)
 		: // input operands
-	      [k_iter] "m" (k_iter), // 0
-	      [k_left] "m" (k_left), // 1
-	      [a]      "m" (a),      // 2
-	      [b]      "m" (b),      // 3
-	      [alpha]  "m" (alpha),  // 4
-	      [beta]   "m" (beta),   // 5
-	      [c]      "m" (c),      // 6
-	      [rs_c]   "m" (rs_c),   // 7
-	      [cs_c]   "m" (cs_c),   // 8
-	      [b_next] "m" (b_next), // 9
-	      [a_next] "m" (a_next)  // 10
+		  [k_iter] "m" (k_iter), // 0
+		  [k_left] "m" (k_left), // 1
+		  [a]      "m" (a),      // 2
+		  [b]      "m" (b),      // 3
+		  [alpha]  "m" (alpha),  // 4
+		  [beta]   "m" (beta),   // 5
+		  [c]      "m" (c),      // 6
+		  [rs_c]   "m" (rs_c),   // 7
+		  [cs_c]   "m" (cs_c),   // 8
+		  [b_next] "m" (b_next), // 9
+		  [a_next] "m" (a_next)  // 10
 		: // register clobber list
 		  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12",
 		  "xmm0", "xmm1", "xmm2", "xmm3",
@@ -1489,6 +981,8 @@ void bli_dgemm_penryn_asm_4x4
 		  "xmm12", "xmm13", "xmm14", "xmm15",
 		  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( d );
 }
 
 
diff --git a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
index 5963dabee..e65ce7178 100644
--- a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
+++ b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
@@ -42,7 +42,9 @@
 
 void bli_sgemm_piledriver_asm_16x3
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        float*     restrict alpha,
        float*     restrict a,
        float*     restrict b,
@@ -57,36 +59,38 @@ void bli_sgemm_piledriver_asm_16x3
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 8;
-	uint64_t k_left = k0 % 8;
+	uint64_t k_iter = k / 8;
+	uint64_t k_left = k % 8;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT( s, 16, 3, false );
+
 	begin_asm()
-	
-	
+
+
 	mov(var(a), rax) // load address of a.
 	mov(var(b), rbx) // load address of b.
 	mov(var(b_next), r15) // load address of b_next.
 	mov(var(a_next), r14) // load address of a_next.
-	
+
 	prefetch(0, mem(rbx, 128)) // prefetch b
 	prefetch(0, mem(rbx, 64+128)) // prefetch b
 	prefetch(0, mem(rbx, 128+128)) // prefetch b
-	
+
 	add(imm(32*4), rax)
 	add(imm(12*4), rbx)
-	
+
 	mov(var(c), rcx) // load address of c
 	mov(var(cs_c), rdi) // load cs_c
 	lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
 	lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c;
 	lea(mem(rcx, rdi, 2), r11) // load address of c + 2*cs_c;
-	
+
 	vbroadcastss(mem(rbx, -12*4), xmm1)
 	vbroadcastss(mem(rbx, -11*4), xmm2)
 	vbroadcastss(mem(rbx, -10*4), xmm3)
-	
+
 	vxorps(xmm4, xmm4, xmm4)
 	vxorps(xmm5, xmm5, xmm5)
 	vxorps(xmm6, xmm6, xmm6)
@@ -99,23 +103,23 @@ void bli_sgemm_piledriver_asm_16x3
 	vxorps(xmm13, xmm13, xmm13)
 	vxorps(xmm14, xmm14, xmm14)
 	vxorps(xmm15, xmm15, xmm15)
-	
-	
-	
+
+
+
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.SCONSIDKLEFT) // if i == 0, jump to code that
 	 // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER) // MAIN LOOP
-	
-	
+
+
 	je(.SCONSIDKLEFT) // if i == 0, jump to k_left code.
-	
-	
+
+
 	prefetch(0, mem(rbx, 16+192)) // prefetch b
-	
+
 	 // iteration 0
 	vmovaps(mem(rax, -32*4), xmm0)
 	prefetch(0, mem(rax, 384))
@@ -136,7 +140,7 @@ void bli_sgemm_piledriver_asm_16x3
 	vfmadd231ps(xmm2, xmm0, xmm14)
 	vbroadcastss(mem(rbx, -8*4), xmm2)
 	vfmadd231ps(xmm3, xmm0, xmm15)
-	
+
 	 // iteration 1
 	vmovaps(mem(rax, -16*4), xmm0)
 	vbroadcastss(mem(rbx, -7*4), xmm3)
@@ -158,7 +162,7 @@ void bli_sgemm_piledriver_asm_16x3
 	vfmadd231ps(xmm2, xmm0, xmm14)
 	vbroadcastss(mem(rbx, -5*4), xmm2)
 	vfmadd231ps(xmm3, xmm0, xmm15)
-	
+
 	 // iteration 2
 	vmovaps(mem(rax, 0*4), xmm0)
 	vbroadcastss(mem(rbx, -4*4), xmm3)
@@ -180,7 +184,7 @@ void bli_sgemm_piledriver_asm_16x3
 	vfmadd231ps(xmm2, xmm0, xmm14)
 	vbroadcastss(mem(rbx, -2*4), xmm2)
 	vfmadd231ps(xmm3, xmm0, xmm15)
-	
+
 	 // iteration 3
 	vmovaps(mem(rax, 16*4), xmm0)
 	vbroadcastss(mem(rbx, -1*4), xmm3)
@@ -202,10 +206,10 @@ void bli_sgemm_piledriver_asm_16x3
 	vfmadd231ps(xmm2, xmm0, xmm14)
 	vbroadcastss(mem(rbx, 1*4), xmm2)
 	vfmadd231ps(xmm3, xmm0, xmm15)
-	
-	
+
+
 	add(imm(4*16*4), rax) // a += 4*16 (unroll x mr)
-	
+
 	 // iteration 4
 	vmovaps(mem(rax, -32*4), xmm0)
 	vbroadcastss(mem(rbx, 2*4), xmm3)
@@ -227,9 +231,9 @@ void bli_sgemm_piledriver_asm_16x3
 	vfmadd231ps(xmm2, xmm0, xmm14)
 	vbroadcastss(mem(rbx, 4*4), xmm2)
 	vfmadd231ps(xmm3, xmm0, xmm15)
-	
+
 	prefetch(0, mem(rbx, 80+192)) // prefetch b
-	
+
 	 // iteration 5
 	vmovaps(mem(rax, -16*4), xmm0)
 	vbroadcastss(mem(rbx, 5*4), xmm3)
@@ -251,7 +255,7 @@ void bli_sgemm_piledriver_asm_16x3
 	vfmadd231ps(xmm2, xmm0, xmm14)
 	vbroadcastss(mem(rbx, 7*4), xmm2)
 	vfmadd231ps(xmm3, xmm0, xmm15)
-	
+
 	 // iteration 6
 	vmovaps(mem(rax, 0*4), xmm0)
 	vbroadcastss(mem(rbx, 8*4), xmm3)
@@ -273,7 +277,7 @@ void bli_sgemm_piledriver_asm_16x3
 	vfmadd231ps(xmm2, xmm0, xmm14)
 	vbroadcastss(mem(rbx, 10*4), xmm2)
 	vfmadd231ps(xmm3, xmm0, xmm15)
-	
+
 	 // iteration 7
 	vmovaps(mem(rax, 16*4), xmm0)
 	vbroadcastss(mem(rbx, 11*4), xmm3)
@@ -298,34 +302,34 @@ void bli_sgemm_piledriver_asm_16x3
 	vbroadcastss(mem(rbx, -11*4), xmm2)
 	vfmadd231ps(xmm3, xmm0, xmm15)
 	vbroadcastss(mem(rbx, -10*4), xmm3)
-	
-	
-	
-	
+
+
+
+
 	dec(rsi) // i -= 1;
 	jmp(.SLOOPKITER) // jump to beginning of loop.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
 	 // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT) // EDGE LOOP
-	
-	
+
+
 	je(.SPOSTACCUM) // if i == 0, we're done.
-	
-	
+
+
 	prefetch(0, mem(rbx, 16+192)) // prefetch b
-	
+
 	 // iteration 0
 	vmovaps(mem(rax, -32*4), xmm0)
 	prefetch(0, mem(rax, 384))
@@ -347,56 +351,56 @@ void bli_sgemm_piledriver_asm_16x3
 	vbroadcastss(mem(rbx, -8*4), xmm2)
 	vfmadd231ps(xmm3, xmm0, xmm15)
 	vbroadcastss(mem(rbx, -7*4), xmm3)
-	
-	
+
+
 	add(imm(1*16*4), rax) // a += 4*16 (unroll x mr)
 	add(imm(1*3*4), rbx) // a += 4*3  (unroll x nr)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jmp(.SLOOPKLEFT) // jump to beginning of loop.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
-	
-	
+
+
 	prefetchw0(mem(rcx, 0*8)) // prefetch c + 0*cs_c
 	prefetchw0(mem(r10, 0*8)) // prefetch c + 1*cs_c
 	prefetchw0(mem(r11, 0*8)) // prefetch c + 2*cs_c
-	
-	
-	 // xmm4:   xmm5:   xmm6: 
+
+
+	 // xmm4:   xmm5:   xmm6:
 	 // ( ab00  ( ab01  ( ab02
-	 //   ab10    ab11    ab12  
+	 //   ab10    ab11    ab12
 	 //   ab20    ab21    ab22
 	 //   ab30 )  ab31 )  ab32 )
-	
-	 // xmm7:   xmm8:   xmm9: 
+
+	 // xmm7:   xmm8:   xmm9:
 	 // ( ab40  ( ab41  ( ab42
-	 //   ab50    ab51    ab52  
+	 //   ab50    ab51    ab52
 	 //   ab60    ab61    ab62
 	 //   ab70 )  ab71 )  ab72 )
-	
+
 	 // xmm10:  xmm11:  xmm12:
 	 // ( ab80  ( ab01  ( ab02
-	 //   ab90    ab11    ab12  
+	 //   ab90    ab11    ab12
 	 //   abA0    abA1    abA2
 	 //   abB0 )  abB1 )  abB2 )
-	
+
 	 // xmm13:  xmm14:  xmm15:
 	 // ( abC0  ( abC1  ( abC2
-	 //   abD0    abD1    abD2  
+	 //   abD0    abD1    abD2
 	 //   abE0    abE1    abE2
 	 //   abF0 )  abF1 )  abF2 )
-	
-	
-	
+
+
+
 	mov(var(alpha), rax) // load address of alpha
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastss(mem(rax), xmm0) // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm2) // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4) // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(xmm0, xmm6, xmm6)
@@ -409,32 +413,32 @@ void bli_sgemm_piledriver_asm_16x3
 	vmulps(xmm0, xmm13, xmm13)
 	vmulps(xmm0, xmm14, xmm14)
 	vmulps(xmm0, xmm15, xmm15)
-	
-	
-	
+
+
+
 	prefetch(0, mem(r14)) // prefetch a_next
 	prefetch(0, mem(r14, 64)) // prefetch a_next
-	
-	
-	
-	
+
+
+
+
 	mov(var(rs_c), rsi) // load rs_c
 	lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
-	
+
 	lea(mem(, rsi, 2), r12) // r12 = 2*rs_c;
 	lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c;
-	
-	
-	
+
+
+
 	 // determine if
 	 //    c    % 32 == 0, AND
 	 //  4*cs_c % 32 == 0, AND
 	 //    rs_c      == 1
 	 // ie: aligned, ldim aligned, and
 	 // column-stored
-	
+
 	cmp(imm(4), rsi) // set ZF if (4*rs_c) == 4.
 	sete(bl) // bl = ( ZF == 1 ? 1 : 0 );
 	test(imm(31), rcx) // set ZF if c & 32 is zero.
@@ -443,465 +447,69 @@ void bli_sgemm_piledriver_asm_16x3
 	setz(al) // al = ( ZF == 0 ? 1 : 0 );
 	 // and(bl,bh) followed by
 	 // and(bh,al) will reveal result
-	
+
 	prefetch(0, mem(r15)) // prefetch b_next
 	prefetch(0, mem(r15, 64)) // prefetch b_next
-	
+
 	 // now avoid loading C if beta == 0
-	
+
 	vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero.
 	vucomiss(xmm0, xmm2) // set ZF if beta == 0.
 	je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case
-	
-	
-	 // check if aligned/column-stored
-	and(bl, bh) // set ZF if bl & bh == 1.
-	and(bh, al) // set ZF if bh & al == 1.
-	jne(.SCOLSTORED) // jump to column storage case
-	
-	
-	
-	label(.SGENSTORED)
-	
-	
-	vmovlps(mem(rcx), xmm0, xmm0) // load c00:c30
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmulps(xmm2, xmm0, xmm0)
-	vaddps(xmm4, xmm0, xmm0)
-	vmovss(xmm0, mem(rcx)) // store c00:c30
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, r13, 1))
-	lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c;
-	
-	
-	vmovlps(mem(rcx), xmm0, xmm0) // load c40:c70
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmulps(xmm2, xmm0, xmm0)
-	vaddps(xmm7, xmm0, xmm0)
-	vmovss(xmm0, mem(rcx)) // store c40:c70
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, r13, 1))
-	lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c;
-	
-	
-	vmovlps(mem(rcx), xmm0, xmm0) // load c80:cB0
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmulps(xmm2, xmm0, xmm0)
-	vaddps(xmm10, xmm0, xmm0)
-	vmovss(xmm0, mem(rcx)) // store c80:cB0
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, r13, 1))
-	lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c;
-	
-	
-	vmovlps(mem(rcx), xmm0, xmm0) // load cC0:cF0
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmulps(xmm2, xmm0, xmm0)
-	vaddps(xmm13, xmm0, xmm0)
-	vmovss(xmm0, mem(rcx)) // store cC0:cF0
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, r13, 1))
-	lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c;
-	
-	
-	vmovlps(mem(r10), xmm0, xmm0) // load c01:c31
-	vmovhps(mem(r10, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(r10, r12, 1), xmm1, xmm1)
-	vmovhps(mem(r10, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmulps(xmm2, xmm0, xmm0)
-	vaddps(xmm5, xmm0, xmm0)
-	vmovss(xmm0, mem(r10)) // store c01:c31
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, r13, 1))
-	lea(mem(r10, rsi, 4), r10) // c += 4*rs_c;
-	
-	
-	vmovlps(mem(r10), xmm0, xmm0) // load c41:c71
-	vmovhps(mem(r10, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(r10, r12, 1), xmm1, xmm1)
-	vmovhps(mem(r10, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmulps(xmm2, xmm0, xmm0)
-	vaddps(xmm8, xmm0, xmm0)
-	vmovss(xmm0, mem(r10)) // store c41:c71
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, r13, 1))
-	lea(mem(r10, rsi, 4), r10) // c += 4*rs_c;
-	
-	
-	vmovlps(mem(r10), xmm0, xmm0) // load c81:cB1
-	vmovhps(mem(r10, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(r10, r12, 1), xmm1, xmm1)
-	vmovhps(mem(r10, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmulps(xmm2, xmm0, xmm0)
-	vaddps(xmm11, xmm0, xmm0)
-	vmovss(xmm0, mem(r10)) // store c81:cB1
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, r13, 1))
-	lea(mem(r10, rsi, 4), r10) // c += 4*rs_c;
-	
-	
-	vmovlps(mem(r10), xmm0, xmm0) // load cC1:cF1
-	vmovhps(mem(r10, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(r10, r12, 1), xmm1, xmm1)
-	vmovhps(mem(r10, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmulps(xmm2, xmm0, xmm0)
-	vaddps(xmm14, xmm0, xmm0)
-	vmovss(xmm0, mem(r10)) // store cC1:cF1
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, r13, 1))
-	lea(mem(r10, rsi, 4), r10) // c += 4*rs_c;
-	
-	
-	vmovlps(mem(r11), xmm0, xmm0) // load c02:c32
-	vmovhps(mem(r11, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(r11, r12, 1), xmm1, xmm1)
-	vmovhps(mem(r11, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmulps(xmm2, xmm0, xmm0)
-	vaddps(xmm6, xmm0, xmm0)
-	vmovss(xmm0, mem(r11)) // store c02:c32
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, r13, 1))
-	lea(mem(r11, rsi, 4), r11) // c += 4*rs_c;
-	
-	
-	vmovlps(mem(r11), xmm0, xmm0) // load c42:c72
-	vmovhps(mem(r11, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(r11, r12, 1), xmm1, xmm1)
-	vmovhps(mem(r11, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmulps(xmm2, xmm0, xmm0)
-	vaddps(xmm9, xmm0, xmm0)
-	vmovss(xmm0, mem(r11)) // store c42:c72
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, r13, 1))
-	lea(mem(r11, rsi, 4), r11) // c += 4*rs_c;
-	
-	
-	vmovlps(mem(r11), xmm0, xmm0) // load c82:cB2
-	vmovhps(mem(r11, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(r11, r12, 1), xmm1, xmm1)
-	vmovhps(mem(r11, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmulps(xmm2, xmm0, xmm0)
-	vaddps(xmm12, xmm0, xmm0)
-	vmovss(xmm0, mem(r11)) // store c82:cB2
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, r13, 1))
-	lea(mem(r11, rsi, 4), r11) // c += 4*rs_c;
-	
-	
-	vmovlps(mem(r11), xmm0, xmm0) // load cC2:cF2
-	vmovhps(mem(r11, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(r11, r12, 1), xmm1, xmm1)
-	vmovhps(mem(r11, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmulps(xmm2, xmm0, xmm0)
-	vaddps(xmm15, xmm0, xmm0)
-	vmovss(xmm0, mem(r11)) // store cC2:cF1
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, r13, 1))
-	lea(mem(r11, rsi, 4), r11) // c += 4*rs_c;
-	
-	
-	
-	jmp(.SDONE) // jump to end.
-	
-	
-	
-	label(.SCOLSTORED)
-	
-	
-	vfmadd231ps(mem(rcx, 0*16), xmm2, xmm4)
-	vfmadd231ps(mem(rcx, 1*16), xmm2, xmm7)
-	vfmadd231ps(mem(rcx, 2*16), xmm2, xmm10)
-	vfmadd231ps(mem(rcx, 3*16), xmm2, xmm13)
-	
-	vmovups(xmm4, mem(rcx, 0*16))
-	vmovups(xmm7, mem(rcx, 1*16))
-	vmovups(xmm10, mem(rcx, 2*16))
-	vmovups(xmm13, mem(rcx, 3*16))
-	
-	vfmadd231ps(mem(r10, 0*16), xmm2, xmm5)
-	vfmadd231ps(mem(r10, 1*16), xmm2, xmm8)
-	vfmadd231ps(mem(r10, 2*16), xmm2, xmm11)
-	vfmadd231ps(mem(r10, 3*16), xmm2, xmm14)
-	
-	vmovups(xmm5, mem(r10, 0*16))
-	vmovups(xmm8, mem(r10, 1*16))
-	vmovups(xmm11, mem(r10, 2*16))
-	vmovups(xmm14, mem(r10, 3*16))
-	
-	vfmadd231ps(mem(r11, 0*16), xmm2, xmm6)
-	vfmadd231ps(mem(r11, 1*16), xmm2, xmm9)
-	vfmadd231ps(mem(r11, 2*16), xmm2, xmm12)
-	vfmadd231ps(mem(r11, 3*16), xmm2, xmm15)
-	
-	vmovups(xmm6, mem(r11, 0*16))
-	vmovups(xmm9, mem(r11, 1*16))
-	vmovups(xmm12, mem(r11, 2*16))
-	vmovups(xmm15, mem(r11, 3*16))
-	
-	
-	
-	jmp(.SDONE) // jump to end.
-	
-	
-	
+
+		vfmadd231ps(mem(rcx, 0*16), xmm2, xmm4)
+		vfmadd231ps(mem(rcx, 1*16), xmm2, xmm7)
+		vfmadd231ps(mem(rcx, 2*16), xmm2, xmm10)
+		vfmadd231ps(mem(rcx, 3*16), xmm2, xmm13)
+
+		vfmadd231ps(mem(r10, 0*16), xmm2, xmm5)
+		vfmadd231ps(mem(r10, 1*16), xmm2, xmm8)
+		vfmadd231ps(mem(r10, 2*16), xmm2, xmm11)
+		vfmadd231ps(mem(r10, 3*16), xmm2, xmm14)
+
+		vfmadd231ps(mem(r11, 0*16), xmm2, xmm6)
+		vfmadd231ps(mem(r11, 1*16), xmm2, xmm9)
+		vfmadd231ps(mem(r11, 2*16), xmm2, xmm12)
+		vfmadd231ps(mem(r11, 3*16), xmm2, xmm15)
+
+		 // fall through
+
 	label(.SBETAZERO)
-	 // check if aligned/column-stored
-	and(bl, bh) // set ZF if bl & bh == 1.
-	and(bh, al) // set ZF if bh & al == 1.
-	jne(.SCOLSTORBZ) // jump to column storage case
-	
-	
-	
-	label(.SGENSTORBZ)
-	
-	
-	vmovaps(xmm4, xmm0)
-	vmovss(xmm0, mem(rcx)) // store c00:c30
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, r13, 1))
-	lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c;
-	
-	
-	vmovaps(xmm7, xmm0)
-	vmovss(xmm0, mem(rcx)) // store c40:c70
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, r13, 1))
-	lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c;
-	
-	
-	vmovaps(xmm10, xmm0)
-	vmovss(xmm0, mem(rcx)) // store c80:cB0
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, r13, 1))
-	lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c;
-	
-	
-	vmovaps(xmm13, xmm0)
-	vmovss(xmm0, mem(rcx)) // store cC0:cF0
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(rcx, r13, 1))
-	lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c;
-	
-	
-	vmovaps(xmm5, xmm0)
-	vmovss(xmm0, mem(r10)) // store c01:c31
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, r13, 1))
-	lea(mem(r10, rsi, 4), r10) // c += 4*rs_c;
-	
-	
-	vmovaps(xmm8, xmm0)
-	vmovss(xmm0, mem(r10)) // store c41:c71
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, r13, 1))
-	lea(mem(r10, rsi, 4), r10) // c += 4*rs_c;
-	
-	
-	vmovaps(xmm11, xmm0)
-	vmovss(xmm0, mem(r10)) // store c81:cB1
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, r13, 1))
-	lea(mem(r10, rsi, 4), r10) // c += 4*rs_c;
-	
-	
-	vmovaps(xmm14, xmm0)
-	vmovss(xmm0, mem(r10)) // store cC1:cF1
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r10, r13, 1))
-	lea(mem(r10, rsi, 4), r10) // c += 4*rs_c;
-	
-	
-	vmovaps(xmm6, xmm0)
-	vmovss(xmm0, mem(r11)) // store c02:c32
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, r13, 1))
-	lea(mem(r11, rsi, 4), r11) // c += 4*rs_c;
-	
-	
-	vmovaps(xmm9, xmm0)
-	vmovss(xmm0, mem(r11)) // store c42:c72
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, r13, 1))
-	lea(mem(r11, rsi, 4), r11) // c += 4*rs_c;
-	
-	
-	vmovaps(xmm12, xmm0)
-	vmovss(xmm0, mem(r11)) // store c82:cB2
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, r13, 1))
-	lea(mem(r11, rsi, 4), r11) // c += 4*rs_c;
-	
-	
-	vmovaps(xmm15, xmm0)
-	vmovss(xmm0, mem(r11)) // store cC2:cF1
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, rsi, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm0)
-	vmovss(xmm0, mem(r11, r13, 1))
-	lea(mem(r11, rsi, 4), r11) // c += 4*rs_c;
-	
-	
-	
-	jmp(.SDONE) // jump to end.
-	
-	
-	
-	label(.SCOLSTORBZ)
-	
-	
-	vmovups(xmm4, mem(rcx, 0*16))
-	vmovups(xmm7, mem(rcx, 1*16))
-	vmovups(xmm10, mem(rcx, 2*16))
-	vmovups(xmm13, mem(rcx, 3*16))
-	
-	vmovups(xmm5, mem(r10, 0*16))
-	vmovups(xmm8, mem(r10, 1*16))
-	vmovups(xmm11, mem(r10, 2*16))
-	vmovups(xmm14, mem(r10, 3*16))
-	
-	vmovups(xmm6, mem(r11, 0*16))
-	vmovups(xmm9, mem(r11, 1*16))
-	vmovups(xmm12, mem(r11, 2*16))
-	vmovups(xmm15, mem(r11, 3*16))
-	
-	
-	
-	
-	
-	
+
+		vmovups(xmm4, mem(rcx, 0*16))
+		vmovups(xmm7, mem(rcx, 1*16))
+		vmovups(xmm10, mem(rcx, 2*16))
+		vmovups(xmm13, mem(rcx, 3*16))
+
+		vmovups(xmm5, mem(r10, 0*16))
+		vmovups(xmm8, mem(r10, 1*16))
+		vmovups(xmm11, mem(r10, 2*16))
+		vmovups(xmm14, mem(r10, 3*16))
+
+		vmovups(xmm6, mem(r11, 0*16))
+		vmovups(xmm9, mem(r11, 1*16))
+		vmovups(xmm12, mem(r11, 2*16))
+		vmovups(xmm15, mem(r11, 3*16))
+
 	label(.SDONE)
-	
 
-    end_asm(
+
+	end_asm(
 	: // output operands (none)
 	: // input operands
-      [k_iter] "m" (k_iter), // 0
-      [k_left] "m" (k_left), // 1
-      [a]      "m" (a),      // 2
-      [b]      "m" (b),      // 3
-      [alpha]  "m" (alpha),  // 4
-      [beta]   "m" (beta),   // 5
-      [c]      "m" (c),      // 6
-      [rs_c]   "m" (rs_c),   // 7
-      [cs_c]   "m" (cs_c),   // 8
-      [b_next] "m" (b_next), // 9
-      [a_next] "m" (a_next)  // 10
+	  [k_iter] "m" (k_iter), // 0
+	  [k_left] "m" (k_left), // 1
+	  [a]      "m" (a),      // 2
+	  [b]      "m" (b),      // 3
+	  [alpha]  "m" (alpha),  // 4
+	  [beta]   "m" (beta),   // 5
+	  [c]      "m" (c),      // 6
+	  [rs_c]   "m" (rs_c),   // 7
+	  [cs_c]   "m" (cs_c),   // 8
+	  [b_next] "m" (b_next), // 9
+	  [a_next] "m" (a_next)  // 10
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
@@ -909,11 +517,15 @@ void bli_sgemm_piledriver_asm_16x3
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( s );
 }
 
 void bli_dgemm_piledriver_asm_8x3
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        double*    restrict alpha,
        double*    restrict a,
        double*    restrict b,
@@ -928,36 +540,38 @@ void bli_dgemm_piledriver_asm_8x3
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 8;
-	uint64_t k_left = k0 % 8;
+	uint64_t k_iter = k / 8;
+	uint64_t k_left = k % 8;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT( d, 8, 3, false );
+
 	begin_asm()
-	
-	
+
+
 	mov(var(a), rax) // load address of a.
 	mov(var(b), rbx) // load address of b.
 	mov(var(b_next), r15) // load address of b_next.
 	mov(var(a_next), r14) // load address of a_next.
-	
+
 	prefetch(0, mem(rbx, 128)) // prefetch b
 	prefetch(0, mem(rbx, 64+128)) // prefetch b
 	prefetch(0, mem(rbx, 128+128)) // prefetch b
-	
+
 	add(imm(16*8), rax)
 	add(imm(12*8), rbx)
-	
+
 	mov(var(c), rcx) // load address of c
 	mov(var(cs_c), rdi) // load cs_c
 	lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
 	lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c;
 	lea(mem(rcx, rdi, 2), r11) // load address of c + 2*cs_c;
-	
+
 	vmovddup(mem(rbx, -12*8), xmm1)
 	vmovddup(mem(rbx, -11*8), xmm2)
 	vmovddup(mem(rbx, -10*8), xmm3)
-	
+
 	vxorpd(xmm4, xmm4, xmm4)
 	vxorpd(xmm5, xmm5, xmm5)
 	vxorpd(xmm6, xmm6, xmm6)
@@ -970,24 +584,24 @@ void bli_dgemm_piledriver_asm_8x3
 	vxorpd(xmm13, xmm13, xmm13)
 	vxorpd(xmm14, xmm14, xmm14)
 	vxorpd(xmm15, xmm15, xmm15)
-	
-	
-	
+
+
+
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.DCONSIDKLEFT) // if i == 0, jump to code that
 	 // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER) // MAIN LOOP
-	
-	
+
+
 	je(.DCONSIDKLEFT) // if i == 0, jump to k_left code.
-	
-	
+
+
 	prefetch(0, mem(rbx, -32+256)) // prefetch b
 	prefetch(0, mem(rbx, 32+256)) // prefetch b
-	
+
 	 // iteration 0
 	vmovaps(mem(rax, -8*16), xmm0)
 	prefetch(0, mem(rax, 384)) // prefetch a
@@ -1008,7 +622,7 @@ void bli_dgemm_piledriver_asm_8x3
 	vfmadd231pd(xmm2, xmm0, xmm14)
 	vmovddup(mem(rbx, -8*8), xmm2)
 	vfmadd231pd(xmm3, xmm0, xmm15)
-	
+
 	 // iteration 1
 	vmovaps(mem(rax, -4*16), xmm0)
 	prefetch(0, mem(rax, 64+384)) // prefetch a
@@ -1030,7 +644,7 @@ void bli_dgemm_piledriver_asm_8x3
 	vfmadd231pd(xmm2, xmm0, xmm14)
 	vmovddup(mem(rbx, -5*8), xmm2)
 	vfmadd231pd(xmm3, xmm0, xmm15)
-	
+
 	 // iteration 2
 	vmovaps(mem(rax, 0*16), xmm0)
 	prefetch(0, mem(rax, 128+384)) // prefetch a
@@ -1052,7 +666,7 @@ void bli_dgemm_piledriver_asm_8x3
 	vfmadd231pd(xmm2, xmm0, xmm14)
 	vmovddup(mem(rbx, -2*8), xmm2)
 	vfmadd231pd(xmm3, xmm0, xmm15)
-	
+
 	 // iteration 3
 	vmovaps(mem(rax, 4*16), xmm0)
 	prefetch(0, mem(rax, 192+384)) // prefetch a
@@ -1075,7 +689,7 @@ void bli_dgemm_piledriver_asm_8x3
 	vfmadd231pd(xmm2, xmm0, xmm14)
 	vmovddup(mem(rbx, 1*8), xmm2)
 	vfmadd231pd(xmm3, xmm0, xmm15)
-	
+
 	 // iteration 4
 	vmovaps(mem(rax, -8*16), xmm0)
 	prefetch(0, mem(rax, 384)) // prefetch a
@@ -1097,9 +711,9 @@ void bli_dgemm_piledriver_asm_8x3
 	vfmadd231pd(xmm2, xmm0, xmm14)
 	vmovddup(mem(rbx, 4*8), xmm2)
 	vfmadd231pd(xmm3, xmm0, xmm15)
-	
+
 	prefetch(0, mem(rbx, 96+256)) // prefetch b
-	
+
 	 // iteration 5
 	vmovaps(mem(rax, -4*16), xmm0)
 	prefetch(0, mem(rax, 64+384)) // prefetch a
@@ -1121,8 +735,8 @@ void bli_dgemm_piledriver_asm_8x3
 	vfmadd231pd(xmm2, xmm0, xmm14)
 	vmovddup(mem(rbx, 7*8), xmm2)
 	vfmadd231pd(xmm3, xmm0, xmm15)
-	
-	
+
+
 	 // iteration 6
 	vmovaps(mem(rax, 0*16), xmm0)
 	prefetch(0, mem(rax, 128+384)) // prefetch a
@@ -1144,7 +758,7 @@ void bli_dgemm_piledriver_asm_8x3
 	vfmadd231pd(xmm2, xmm0, xmm14)
 	vmovddup(mem(rbx, 10*8), xmm2)
 	vfmadd231pd(xmm3, xmm0, xmm15)
-	
+
 	 // iteration 7
 	vmovaps(mem(rax, 4*16), xmm0)
 	prefetch(0, mem(rax, 192+384)) // prefetch a
@@ -1169,31 +783,31 @@ void bli_dgemm_piledriver_asm_8x3
 	vmovddup(mem(rbx, -11*8), xmm2)
 	vfmadd231pd(xmm3, xmm0, xmm15)
 	vmovddup(mem(rbx, -10*8), xmm3)
-	
-	
-	
+
+
+
 	dec(rsi) // i -= 1;
 	jmp(.DLOOPKITER) // jump to beginning of loop.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.DPOSTACCUM) // if i == 0, we're done.
 	 // else, we prepare to
 	 // enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT) // EDGE LOOP
-	
-	
+
+
 	je(.DPOSTACCUM) // if i == 0, we're done.
-	
+
 	 // iteration 0
 	vmovaps(mem(rax, -8*16), xmm0)
 	prefetch(0, mem(rax, 512)) // prefetch a
@@ -1215,48 +829,48 @@ void bli_dgemm_piledriver_asm_8x3
 	vmovddup(mem(rbx, -8*8), xmm2)
 	vfmadd231pd(xmm3, xmm0, xmm15)
 	vmovddup(mem(rbx, -7*8), xmm3)
-	
-	
+
+
 	add(imm(1*8*8), rax) // a += 1*8 (1 x mr)
 	add(imm(1*3*8), rbx) // b += 1*3 (1 x nr)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jmp(.DLOOPKLEFT) // jump to beginning of loop.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
-	
+
 	prefetchw0(mem(rcx, 0*8)) // prefetch c + 0*cs_c
 	prefetchw0(mem(r10, 0*8)) // prefetch c + 1*cs_c
 	prefetchw0(mem(r11, 0*8)) // prefetch c + 2*cs_c
-	
-	
-	 // xmm4:   xmm5:   xmm6:   
-	 // ( ab00  ( ab01  ( ab02  
+
+
+	 // xmm4:   xmm5:   xmm6:
+	 // ( ab00  ( ab01  ( ab02
 	 //   ab10 )  ab11 )  ab12 )
 	 //
-	 // xmm7:   xmm8:   xmm9:   
-	 // ( ab20  ( ab21  ( ab22  
+	 // xmm7:   xmm8:   xmm9:
+	 // ( ab20  ( ab21  ( ab22
 	 //   ab30 )  ab31 )  ab32 )
 	 //
-	 // xmm10:  xmm11:  xmm12:  
-	 // ( ab40  ( ab41  ( ab42  
+	 // xmm10:  xmm11:  xmm12:
+	 // ( ab40  ( ab41  ( ab42
 	 //   ab50 )  ab51 )  ab52 )
 	 //
-	 // xmm13:  xmm14:  xmm15:  
-	 // ( ab60  ( ab61  ( ab62  
+	 // xmm13:  xmm14:  xmm15:
+	 // ( ab60  ( ab61  ( ab62
 	 //   ab70 )  ab71 )  ab72 )
-	
-	
-	
-	
+
+
+
+
 	mov(var(alpha), rax) // load address of alpha
 	mov(var(beta), rbx) // load address of beta
 	vmovddup(mem(rax), xmm0) // load alpha and duplicate
 	vmovddup(mem(rbx), xmm2) // load beta and duplicate
-	
+
 	vmulpd(xmm0, xmm4, xmm4) // scale by alpha
 	vmulpd(xmm0, xmm5, xmm5)
 	vmulpd(xmm0, xmm6, xmm6)
@@ -1269,358 +883,89 @@ void bli_dgemm_piledriver_asm_8x3
 	vmulpd(xmm0, xmm13, xmm13)
 	vmulpd(xmm0, xmm14, xmm14)
 	vmulpd(xmm0, xmm15, xmm15)
-	
-	
+
+
 	prefetch(0, mem(r14)) // prefetch a_next
 	prefetch(0, mem(r14, 64)) // prefetch a_next
-	
-	
-	
-	
-	mov(var(rs_c), rsi) // load rs_c
-	lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double)
-	
-	lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
-	
-	lea(mem(, rsi, 2), r12) // r12 = 2*rs_c;
-	lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c;
-	
-	
-	
-	 // determine if
-	 //    c    % 32 == 0, AND
-	 //  8*cs_c % 32 == 0, AND
-	 //    rs_c      == 1
-	 // ie: aligned, ldim aligned, and
-	 // column-stored
-	
-	cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8.
-	sete(bl) // bl = ( ZF == 1 ? 1 : 0 );
-	test(imm(31), rcx) // set ZF if c & 32 is zero.
-	setz(bh) // bh = ( ZF == 0 ? 1 : 0 );
-	test(imm(31), rdi) // set ZF if (8*cs_c) & 32 is zero.
-	setz(al) // al = ( ZF == 0 ? 1 : 0 );
-	 // and(bl,bh) followed by
-	 // and(bh,al) will reveal result
-	
+
 	prefetch(0, mem(r15)) // prefetch b_next
 	prefetch(0, mem(r15, 64)) // prefetch b_next
-	
+
 	 // now avoid loading C if beta == 0
-	
+
 	vxorpd(xmm0, xmm0, xmm0) // set xmm0 to zero.
 	vucomisd(xmm0, xmm2) // set ZF if beta == 0.
 	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
-	
-	
-	 // check if aligned/column-stored
-	and(bl, bh) // set ZF if bl & bh == 1.
-	and(bh, al) // set ZF if bh & al == 1.
-	je(.DGENSTORED) // jump to column storage case
-	
-	
-	
-	label(.DCOLSTORED)
-	
-	 // xmm4:   xmm5:   xmm6:   
-	 // ( ab00  ( ab01  ( ab02  
-	 //   ab10 )  ab11 )  ab12 )
-	 //
-	 // xmm7:   xmm8:   xmm9:   
-	 // ( ab20  ( ab21  ( ab22  
-	 //   ab30 )  ab31 )  ab32 )
-	 //
-	 // xmm10:  xmm11:  xmm12:  
-	 // ( ab40  ( ab41  ( ab42  
-	 //   ab50 )  ab51 )  ab52 )
-	 //
-	 // xmm13:  xmm14:  xmm15:  
-	 // ( ab60  ( ab61  ( ab62  
-	 //   ab70 )  ab71 )  ab72 )
-	
-	
-	vfmadd231pd(mem(rcx, 0*16), xmm2, xmm4)
-	vfmadd231pd(mem(rcx, 1*16), xmm2, xmm7)
-	vfmadd231pd(mem(rcx, 2*16), xmm2, xmm10)
-	vfmadd231pd(mem(rcx, 3*16), xmm2, xmm13)
-	
-	vfmadd231pd(mem(r10, 0*16), xmm2, xmm5)
-	vfmadd231pd(mem(r10, 1*16), xmm2, xmm8)
-	vfmadd231pd(mem(r10, 2*16), xmm2, xmm11)
-	vfmadd231pd(mem(r10, 3*16), xmm2, xmm14)
-	
-	vfmadd231pd(mem(r11, 0*16), xmm2, xmm6)
-	vfmadd231pd(mem(r11, 1*16), xmm2, xmm9)
-	vfmadd231pd(mem(r11, 2*16), xmm2, xmm12)
-	vfmadd231pd(mem(r11, 3*16), xmm2, xmm15)
-	
-	
-	vmovups(xmm4, mem(rcx, 0*16))
-	vmovups(xmm7, mem(rcx, 1*16))
-	vmovups(xmm10, mem(rcx, 2*16))
-	vmovups(xmm13, mem(rcx, 3*16))
-	
-	vmovups(xmm5, mem(r10, 0*16))
-	vmovups(xmm8, mem(r10, 1*16))
-	vmovups(xmm11, mem(r10, 2*16))
-	vmovups(xmm14, mem(r10, 3*16))
-	
-	vmovups(xmm6, mem(r11, 0*16))
-	vmovups(xmm9, mem(r11, 1*16))
-	vmovups(xmm12, mem(r11, 2*16))
-	vmovups(xmm15, mem(r11, 3*16))
-	
-	
-	
-	
-/*
-	vmovupd(mem(rcx), xmm0) // load c00:c10
-	vmovupd(mem(rcx, r12, 1), xmm1) // load c20:c30
-	vfmadd231pd(xmm2, xmm0, xmm4)
-	vfmadd231pd(xmm2, xmm1, xmm7)
-	vmovupd(xmm4, mem(rcx)) // store c00:c10
-	vmovupd(xmm7, mem(rcx, r12, 1)) // store c20:c30
-	add(rdi, rcx)
-	
-	vmovupd(mem(rdx), xmm0) // load c40:c50
-	vmovupd(mem(rdx, r12, 1), xmm1) // load c60:c70
-	vfmadd213pd(xmm10, xmm2, xmm0)
-	vfmadd213pd(xmm13, xmm2, xmm1)
-	vmovupd(xmm0, mem(rdx)) // store c40:c50
-	vmovupd(xmm1, mem(rdx, r12, 1)) // store c60:c70
-	add(rdi, rdx)
-	
-	
-	vmovupd(mem(rcx), xmm0) // load c01:c11
-	vmovupd(mem(rcx, r12, 1), xmm1) // load c21:c31
-	vfmadd213pd(xmm5, xmm2, xmm0)
-	vfmadd213pd(xmm8, xmm2, xmm1)
-	vmovupd(xmm0, mem(rcx)) // store c01:c11
-	vmovupd(xmm1, mem(rcx, r12, 1)) // store c21:c31
-	add(rdi, rcx)
-	
-	vmovupd(mem(rdx), xmm0) // load c41:c51
-	vmovupd(mem(rdx, r12, 1), xmm1) // load c61:c71
-	vfmadd213pd(xmm11, xmm2, xmm0)
-	vfmadd213pd(xmm14, xmm2, xmm1)
-	vmovupd(xmm0, mem(rdx)) // store c41:c51
-	vmovupd(xmm1, mem(rdx, r12, 1)) // store c61:c71
-	add(rdi, rdx)
-	
-	
-	vmovupd(mem(rcx), xmm0) // load c02:c12
-	vmovupd(mem(rcx, r12, 1), xmm1) // load c22:c32
-	vfmadd213pd(xmm6, xmm2, xmm0)
-	vfmadd213pd(xmm9, xmm2, xmm1)
-	vmovupd(xmm0, mem(rcx)) // store c02:c12
-	vmovupd(xmm1, mem(rcx, r12, 1)) // store c22:c32
-	
-	vmovupd(mem(rdx), xmm0) // load c42:c52
-	vmovupd(mem(rdx, r12, 1), xmm1) // load c62:c72
-	vfmadd213pd(xmm12, xmm2, xmm0)
-	vfmadd213pd(xmm15, xmm2, xmm1)
-	vmovupd(xmm0, mem(rdx)) // store c42:c52
-	vmovupd(xmm1, mem(rdx, r12, 1)) // store c62:c72
-*/
-	
-	
-	
-	jmp(.DDONE) // jump to end.
-	
-	
-	
-	label(.DGENSTORED)
-	
-	
-	vmovlpd(mem(rcx), xmm0, xmm0) // load c00:c10
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0)
-	vaddpd(xmm4, xmm0, xmm0)
-	vmovlpd(xmm0, mem(rcx)) // store c00:c10
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c20:c30
-	vmovhpd(mem(rcx, r13, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0)
-	vaddpd(xmm7, xmm0, xmm0)
-	vmovlpd(xmm0, mem(rcx, r12, 1)) // store c20:c30
-	vmovhpd(xmm0, mem(rcx, r13, 1))
-	add(rdi, rcx)
-	
-	vmovlpd(mem(rdx), xmm0, xmm0) // load c40:c50
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0)
-	vaddpd(xmm10, xmm0, xmm0)
-	vmovlpd(xmm0, mem(rdx)) // store c40:c50
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-	vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c60:c70
-	vmovhpd(mem(rdx, r13, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0)
-	vaddpd(xmm13, xmm0, xmm0)
-	vmovlpd(xmm0, mem(rdx, r12, 1)) // store c60:c70
-	vmovhpd(xmm0, mem(rdx, r13, 1))
-	add(rdi, rdx)
-	
-	
-	vmovlpd(mem(rcx), xmm0, xmm0) // load c01:c11
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0)
-	vaddpd(xmm5, xmm0, xmm0)
-	vmovlpd(xmm0, mem(rcx)) // store c01:c11
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c21:c31
-	vmovhpd(mem(rcx, r13, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0)
-	vaddpd(xmm8, xmm0, xmm0)
-	vmovlpd(xmm0, mem(rcx, r12, 1)) // store c21:c31
-	vmovhpd(xmm0, mem(rcx, r13, 1))
-	add(rdi, rcx)
-	
-	vmovlpd(mem(rdx), xmm0, xmm0) // load c41:c51
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0)
-	vaddpd(xmm11, xmm0, xmm0)
-	vmovlpd(xmm0, mem(rdx)) // store c41:c51
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-	vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c61:c71
-	vmovhpd(mem(rdx, r13, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0)
-	vaddpd(xmm14, xmm0, xmm0)
-	vmovlpd(xmm0, mem(rdx, r12, 1)) // store c61:c71
-	vmovhpd(xmm0, mem(rdx, r13, 1))
-	add(rdi, rdx)
-	
-	
-	vmovlpd(mem(rcx), xmm0, xmm0) // load c02:c12
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0)
-	vaddpd(xmm6, xmm0, xmm0)
-	vmovlpd(xmm0, mem(rcx)) // store c02:c12
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c22:c32
-	vmovhpd(mem(rcx, r13, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0)
-	vaddpd(xmm9, xmm0, xmm0)
-	vmovlpd(xmm0, mem(rcx, r12, 1)) // store c22:c32
-	vmovhpd(xmm0, mem(rcx, r13, 1))
-	add(rdi, rcx)
-	
-	vmovlpd(mem(rdx), xmm0, xmm0) // load c42:c52
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0)
-	vaddpd(xmm12, xmm0, xmm0)
-	vmovlpd(xmm0, mem(rdx)) // store c42:c52
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-	vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c62:c72
-	vmovhpd(mem(rdx, r13, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0)
-	vaddpd(xmm15, xmm0, xmm0)
-	vmovlpd(xmm0, mem(rdx, r12, 1)) // store c62:c72
-	vmovhpd(xmm0, mem(rdx, r13, 1))
-	add(rdi, rdx)
-	
-	
-	
-	jmp(.DDONE) // jump to end.
-	
-	
-	
+
+		 // xmm4:   xmm5:   xmm6:
+		 // ( ab00  ( ab01  ( ab02
+		 //   ab10 )  ab11 )  ab12 )
+		 //
+		 // xmm7:   xmm8:   xmm9:
+		 // ( ab20  ( ab21  ( ab22
+		 //   ab30 )  ab31 )  ab32 )
+		 //
+		 // xmm10:  xmm11:  xmm12:
+		 // ( ab40  ( ab41  ( ab42
+		 //   ab50 )  ab51 )  ab52 )
+		 //
+		 // xmm13:  xmm14:  xmm15:
+		 // ( ab60  ( ab61  ( ab62
+		 //   ab70 )  ab71 )  ab72 )
+
+		vfmadd231pd(mem(rcx, 0*16), xmm2, xmm4)
+		vfmadd231pd(mem(rcx, 1*16), xmm2, xmm7)
+		vfmadd231pd(mem(rcx, 2*16), xmm2, xmm10)
+		vfmadd231pd(mem(rcx, 3*16), xmm2, xmm13)
+
+		vfmadd231pd(mem(r10, 0*16), xmm2, xmm5)
+		vfmadd231pd(mem(r10, 1*16), xmm2, xmm8)
+		vfmadd231pd(mem(r10, 2*16), xmm2, xmm11)
+		vfmadd231pd(mem(r10, 3*16), xmm2, xmm14)
+
+		vfmadd231pd(mem(r11, 0*16), xmm2, xmm6)
+		vfmadd231pd(mem(r11, 1*16), xmm2, xmm9)
+		vfmadd231pd(mem(r11, 2*16), xmm2, xmm12)
+		vfmadd231pd(mem(r11, 3*16), xmm2, xmm15)
+
+		 // fall through
+
 	label(.DBETAZERO)
-	 // check if aligned/column-stored
-	and(bl, bh) // set ZF if bl & bh == 1.
-	and(bh, al) // set ZF if bh & al == 1.
-	jne(.DCOLSTORBZ) // jump to column storage case
-	
-	
-	
-	label(.DGENSTORBZ)
-	
-	
-	vmovlpd(xmm4, mem(rcx))
-	vmovhpd(xmm4, mem(rcx, rsi, 1))
-	vmovlpd(xmm7, mem(rcx, r12, 1))
-	vmovhpd(xmm7, mem(rcx, r13, 1))
-	add(rdi, rcx)
-	vmovlpd(xmm10, mem(rdx))
-	vmovhpd(xmm10, mem(rdx, rsi, 1))
-	vmovlpd(xmm13, mem(rdx, r12, 1))
-	vmovhpd(xmm13, mem(rdx, r13, 1))
-	add(rdi, rdx)
-	
-	vmovlpd(xmm5, mem(rcx))
-	vmovhpd(xmm5, mem(rcx, rsi, 1))
-	vmovlpd(xmm8, mem(rcx, r12, 1))
-	vmovhpd(xmm8, mem(rcx, r13, 1))
-	add(rdi, rcx)
-	vmovlpd(xmm11, mem(rdx))
-	vmovhpd(xmm11, mem(rdx, rsi, 1))
-	vmovlpd(xmm14, mem(rdx, r12, 1))
-	vmovhpd(xmm14, mem(rdx, r13, 1))
-	add(rdi, rdx)
-	
-	vmovlpd(xmm6, mem(rcx))
-	vmovhpd(xmm6, mem(rcx, rsi, 1))
-	vmovlpd(xmm9, mem(rcx, r12, 1))
-	vmovhpd(xmm9, mem(rcx, r13, 1))
-	add(rdi, rcx)
-	vmovlpd(xmm12, mem(rdx))
-	vmovhpd(xmm12, mem(rdx, rsi, 1))
-	vmovlpd(xmm15, mem(rdx, r12, 1))
-	vmovhpd(xmm15, mem(rdx, r13, 1))
-	add(rdi, rdx)
-	
-	
-	
-	jmp(.DDONE) // jump to end.
-	
-	
-	
-	label(.DCOLSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	vmovupd(xmm7, mem(rcx, r12, 1))
-	add(rdi, rcx)
-	vmovupd(xmm10, mem(rdx))
-	vmovupd(xmm13, mem(rdx, r12, 1))
-	add(rdi, rdx)
-	
-	vmovupd(xmm5, mem(rcx))
-	vmovupd(xmm8, mem(rcx, r12, 1))
-	add(rdi, rcx)
-	vmovupd(xmm11, mem(rdx))
-	vmovupd(xmm14, mem(rdx, r12, 1))
-	add(rdi, rdx)
-	
-	vmovupd(xmm6, mem(rcx))
-	vmovupd(xmm9, mem(rcx, r12, 1))
-	add(rdi, rcx)
-	vmovupd(xmm12, mem(rdx))
-	vmovupd(xmm15, mem(rdx, r12, 1))
-	add(rdi, rdx)
-	
-	
-	
-	
-	
+
+		vmovups(xmm4, mem(rcx, 0*16))
+		vmovups(xmm7, mem(rcx, 1*16))
+		vmovups(xmm10, mem(rcx, 2*16))
+		vmovups(xmm13, mem(rcx, 3*16))
+
+		vmovups(xmm5, mem(r10, 0*16))
+		vmovups(xmm8, mem(r10, 1*16))
+		vmovups(xmm11, mem(r10, 2*16))
+		vmovups(xmm14, mem(r10, 3*16))
+
+		vmovups(xmm6, mem(r11, 0*16))
+		vmovups(xmm9, mem(r11, 1*16))
+		vmovups(xmm12, mem(r11, 2*16))
+		vmovups(xmm15, mem(r11, 3*16))
+
 	label(.DDONE)
-	
 
-    end_asm(
+
+	end_asm(
 	: // output operands (none)
 	: // input operands
-      [k_iter] "m" (k_iter), // 0
-      [k_left] "m" (k_left), // 1
-      [a]      "m" (a),      // 2
-      [b]      "m" (b),      // 3
-      [alpha]  "m" (alpha),  // 4
-      [beta]   "m" (beta),   // 5
-      [c]      "m" (c),      // 6
-      [rs_c]   "m" (rs_c),   // 7
-      [cs_c]   "m" (cs_c),   // 8
-      [b_next] "m" (b_next), // 9
-      [a_next] "m" (a_next)  // 10
+	  [k_iter] "m" (k_iter), // 0
+	  [k_left] "m" (k_left), // 1
+	  [a]      "m" (a),      // 2
+	  [b]      "m" (b),      // 3
+	  [alpha]  "m" (alpha),  // 4
+	  [beta]   "m" (beta),   // 5
+	  [c]      "m" (c),      // 6
+	  [rs_c]   "m" (rs_c),   // 7
+	  [cs_c]   "m" (cs_c),   // 8
+	  [b_next] "m" (b_next), // 9
+	  [a_next] "m" (a_next)  // 10
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
@@ -1628,11 +973,15 @@ void bli_dgemm_piledriver_asm_8x3
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( d );
 }
 
 void bli_cgemm_piledriver_asm_4x2
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        scomplex*  restrict alpha,
        scomplex*  restrict a,
        scomplex*  restrict b,
@@ -1647,28 +996,30 @@ void bli_cgemm_piledriver_asm_4x2
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 8;
-	uint64_t k_left = k0 % 8;
+	uint64_t k_iter = k / 8;
+	uint64_t k_left = k % 8;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT( c, 4, 2, false );
+
 	begin_asm()
-	
-	
+
+
 	mov(var(a), rax) // load address of a.
 	mov(var(b), rbx) // load address of b.
 	mov(var(b_next), r15) // load address of b_next.
 	mov(var(a_next), r14) // load address of a_next.
-	
+
 	mov(var(c), rcx) // load address of c
 	mov(var(cs_c), rdi) // load cs_c
 	lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex)
 	lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c;
-	
+
 	add(imm(32*4), rax)
 	add(imm(16*4), rbx)
-	
-	
+
+
 	vxorps(xmm8, xmm8, xmm8)
 	vxorps(xmm9, xmm9, xmm9)
 	vxorps(xmm10, xmm10, xmm10)
@@ -1678,24 +1029,24 @@ void bli_cgemm_piledriver_asm_4x2
 	vxorps(xmm14, xmm14, xmm14)
 	vxorps(xmm15, xmm15, xmm15)
 	//vzeroall()
-	
-	
-	
+
+
+
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.CCONSIDKLEFT) // if i == 0, jump to code that
 	 // contains the k_left loop.
-	
-	
+
+
 	label(.CLOOPKITER) // MAIN LOOP
-	
-	
+
+
 	je(.CCONSIDKLEFT) // if i == 0, jump to k_left code.
-	
-	
+
+
 	prefetch(0, mem(rbx, 256))
 	prefetch(0, mem(rax, 512))
-	
+
 	 // iteration 0
 	vmovaps(mem(rax, -32*4), xmm0)
 	vbroadcastss(mem(rbx, -16*4), xmm4)
@@ -1711,7 +1062,7 @@ void bli_cgemm_piledriver_asm_4x2
 	vbroadcastss(mem(rbx, -13*4), xmm7)
 	vfmadd231ps(xmm0, xmm7, xmm11)
 	vfmadd231ps(xmm1, xmm7, xmm15)
-	
+
 	 // iteration 1
 	vmovaps(mem(rax, -24*4), xmm0)
 	vbroadcastss(mem(rbx, -12*4), xmm4)
@@ -1727,10 +1078,10 @@ void bli_cgemm_piledriver_asm_4x2
 	vbroadcastss(mem(rbx, -9*4), xmm7)
 	vfmadd231ps(xmm0, xmm7, xmm11)
 	vfmadd231ps(xmm1, xmm7, xmm15)
-	
+
 	prefetch(0, mem(rbx, 64+256))
 	prefetch(0, mem(rax, 64+512))
-	
+
 	 // iteration 2
 	vmovaps(mem(rax, -16*4), xmm0)
 	vbroadcastss(mem(rbx, -8*4), xmm4)
@@ -1746,7 +1097,7 @@ void bli_cgemm_piledriver_asm_4x2
 	vbroadcastss(mem(rbx, -5*4), xmm7)
 	vfmadd231ps(xmm0, xmm7, xmm11)
 	vfmadd231ps(xmm1, xmm7, xmm15)
-	
+
 	 // iteration 3
 	vmovaps(mem(rax, -8*4), xmm0)
 	vbroadcastss(mem(rbx, -4*4), xmm4)
@@ -1762,10 +1113,10 @@ void bli_cgemm_piledriver_asm_4x2
 	vbroadcastss(mem(rbx, -1*4), xmm7)
 	vfmadd231ps(xmm0, xmm7, xmm11)
 	vfmadd231ps(xmm1, xmm7, xmm15)
-	
+
 	prefetch(0, mem(rbx, 128+256))
 	prefetch(0, mem(rax, 128+512))
-	
+
 	 // iteration 4
 	vmovaps(mem(rax, 0*4), xmm0)
 	vbroadcastss(mem(rbx, 0*4), xmm4)
@@ -1781,7 +1132,7 @@ void bli_cgemm_piledriver_asm_4x2
 	vbroadcastss(mem(rbx, 3*4), xmm7)
 	vfmadd231ps(xmm0, xmm7, xmm11)
 	vfmadd231ps(xmm1, xmm7, xmm15)
-	
+
 	 // iteration 5
 	vmovaps(mem(rax, 8*4), xmm0)
 	vbroadcastss(mem(rbx, 4*4), xmm4)
@@ -1797,10 +1148,10 @@ void bli_cgemm_piledriver_asm_4x2
 	vbroadcastss(mem(rbx, 7*4), xmm7)
 	vfmadd231ps(xmm0, xmm7, xmm11)
 	vfmadd231ps(xmm1, xmm7, xmm15)
-	
+
 	prefetch(0, mem(rbx, 128+256))
 	prefetch(0, mem(rax, 128+512))
-	
+
 	 // iteration 6
 	vmovaps(mem(rax, 16*4), xmm0)
 	vbroadcastss(mem(rbx, 8*4), xmm4)
@@ -1816,7 +1167,7 @@ void bli_cgemm_piledriver_asm_4x2
 	vbroadcastss(mem(rbx, 11*4), xmm7)
 	vfmadd231ps(xmm0, xmm7, xmm11)
 	vfmadd231ps(xmm1, xmm7, xmm15)
-	
+
 	 // iteration 7
 	vmovaps(mem(rax, 24*4), xmm0)
 	vbroadcastss(mem(rbx, 12*4), xmm4)
@@ -1834,33 +1185,33 @@ void bli_cgemm_piledriver_asm_4x2
 	add(imm(8*2*8), rbx) // b += 8*2 (unroll x nr)
 	vfmadd231ps(xmm0, xmm7, xmm11)
 	vfmadd231ps(xmm1, xmm7, xmm15)
-	
-	
-	
+
+
+
 	dec(rsi) // i -= 1;
 	jmp(.CLOOPKITER) // jump to beginning of loop.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.CCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
 	 // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.CLOOPKLEFT) // EDGE LOOP
-	
-	
+
+
 	je(.CPOSTACCUM) // if i == 0, we're done.
-	
+
 	prefetch(0, mem(rbx, 256))
 	prefetch(0, mem(rax, 512))
-	
+
 	 // iteration 0
 	vmovaps(mem(rax, -32*4), xmm0)
 	vbroadcastss(mem(rbx, -16*4), xmm4)
@@ -1876,123 +1227,88 @@ void bli_cgemm_piledriver_asm_4x2
 	vbroadcastss(mem(rbx, -13*4), xmm7)
 	vfmadd231ps(xmm0, xmm7, xmm11)
 	vfmadd231ps(xmm1, xmm7, xmm15)
-	
-	
+
+
 	add(imm(1*4*8), rax) // a += 1*2 (1 x mr)
 	add(imm(1*2*8), rbx) // b += 1*2 (1 x nr)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jmp(.CLOOPKLEFT) // jump to beginning of loop.
-	
-	
-	
+
+
+
 	label(.CPOSTACCUM)
-	
-	
+
+
 	prefetchw0(mem(rcx, 0*8)) // prefetch c + 0*cs_c
 	prefetchw0(mem(r10, 0*8)) // prefetch c + 1*cs_c
-	
-	
+
+
 	vpermilps(imm(0xb1), xmm9, xmm9)
 	vpermilps(imm(0xb1), xmm11, xmm11)
 	vpermilps(imm(0xb1), xmm13, xmm13)
 	vpermilps(imm(0xb1), xmm15, xmm15)
-	
+
 	vaddsubps(xmm9, xmm8, xmm8)
 	vaddsubps(xmm11, xmm10, xmm10)
 	vaddsubps(xmm13, xmm12, xmm12)
 	vaddsubps(xmm15, xmm14, xmm14)
-	
-	
+
+
 	 // xmm8:   xmm10:
 	 // ( ab00  ( ab01
 	 //   ab10    ab11
 	 //   ab20    ab21
 	 //   ab30 )  ab31 )
-	
+
 	 // xmm12:  xmm14:
 	 // ( ab40  ( ab41
 	 //   ab50    ab51
 	 //   ab60    ab61
 	 //   ab70 )  ab71 )
-	
-	
+
+
 	prefetch(0, mem(r14)) // prefetch a_next
 	prefetch(0, mem(r14, 64)) // prefetch a_next
-	
-	
+
+
 	 // scale by alpha
-	
+
 	mov(var(alpha), rax) // load address of alpha
 	vbroadcastss(mem(rax), xmm0) // load alpha_r and duplicate
 	vbroadcastss(mem(rax, 4), xmm1) // load alpha_i and duplicate
-	
+
 	vpermilps(imm(0xb1), xmm8, xmm9)
 	vpermilps(imm(0xb1), xmm10, xmm11)
 	vpermilps(imm(0xb1), xmm12, xmm13)
 	vpermilps(imm(0xb1), xmm14, xmm15)
-	
+
 	vmulps(xmm8, xmm0, xmm8)
 	vmulps(xmm10, xmm0, xmm10)
 	vmulps(xmm12, xmm0, xmm12)
 	vmulps(xmm14, xmm0, xmm14)
-	
+
 	vmulps(xmm9, xmm1, xmm9)
 	vmulps(xmm11, xmm1, xmm11)
 	vmulps(xmm13, xmm1, xmm13)
 	vmulps(xmm15, xmm1, xmm15)
-	
+
 	vaddsubps(xmm9, xmm8, xmm8)
 	vaddsubps(xmm11, xmm10, xmm10)
 	vaddsubps(xmm13, xmm12, xmm12)
 	vaddsubps(xmm15, xmm14, xmm14)
-	
-	
-	
-	
+
+
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastss(mem(rbx), xmm6) // load beta_r and duplicate
 	vbroadcastss(mem(rbx, 4), xmm7) // load beta_i and duplicate
-	
-	
-	
-	
-	
-	
-	
-	mov(var(rs_c), rsi) // load rs_c
-	lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex)
-	
-	
-	lea(mem(, rsi, 2), r12) // r12 = 2*rs_c;
-	lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c;
-	
-	
-	
+
 	prefetch(0, mem(r15)) // prefetch b_next
 	prefetch(0, mem(r15, 64)) // prefetch b_next
-	
-	
-	
-	 // determine if
-	 //    c    % 32 == 0, AND
-	 //  8*cs_c % 32 == 0, AND
-	 //    rs_c      == 1
-	 // ie: aligned, ldim aligned, and
-	 // column-stored
-	
-	cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8.
-	sete(bl) // bl = ( ZF == 1 ? 1 : 0 );
-	test(imm(31), rcx) // set ZF if c & 32 is zero.
-	setz(bh) // bh = ( ZF == 0 ? 1 : 0 );
-	test(imm(31), rdi) // set ZF if (8*cs_c) & 32 is zero.
-	setz(al) // al = ( ZF == 0 ? 1 : 0 );
-	 // and(bl,bh) followed by
-	 // and(bh,al) will reveal result
-	
+
 	 // now avoid loading C if beta == 0
-	
+
 	vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero.
 	vucomiss(xmm0, xmm6) // set ZF if beta_r == 0.
 	sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 );
@@ -2000,175 +1316,66 @@ void bli_cgemm_piledriver_asm_4x2
 	sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 );
 	and(r8b, r9b) // set ZF if r8b & r9b == 1.
 	jne(.CBETAZERO) // if ZF = 0, jump to beta == 0 case
-	
-	
-	 // check if aligned/column-stored
-	and(bl, bh) // set ZF if bl & bh == 1.
-	and(bh, al) // set ZF if bh & al == 1.
-	jne(.CCOLSTORED) // jump to column storage case
-	
-	
-	
-	label(.CGENSTORED)
-	
-	
-	vmovlps(mem(rcx), xmm0, xmm0) // load c00:c10
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm2, xmm2) // load c20:c30
-	vmovhps(mem(rcx, r13, 1), xmm2, xmm2)
-	vpermilps(imm(0xb1), xmm0, xmm1)
-	vpermilps(imm(0xb1), xmm2, xmm3)
-	
-	vmulps(xmm6, xmm0, xmm0)
-	vmulps(xmm7, xmm1, xmm1)
-	vaddsubps(xmm1, xmm0, xmm0)
-	vaddps(xmm8, xmm0, xmm0)
-	vmovlps(xmm0, mem(rcx)) // store c00:c10
-	vmovhps(xmm0, mem(rcx, rsi, 1))
-	
-	vmulps(xmm6, xmm2, xmm2)
-	vmulps(xmm7, xmm3, xmm3)
-	vaddsubps(xmm3, xmm2, xmm2)
-	vaddps(xmm12, xmm2, xmm2)
-	vmovlps(xmm2, mem(rcx, r12, 1)) // store c20:c30
-	vmovhps(xmm2, mem(rcx, r13, 1))
-	
-	
-	
-	vmovlps(mem(r10), xmm0, xmm0) // load c01:c11
-	vmovhps(mem(r10, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(r10, r12, 1), xmm2, xmm2) // load c21:c31
-	vmovhps(mem(r10, r13, 1), xmm2, xmm2)
-	vpermilps(imm(0xb1), xmm0, xmm1)
-	vpermilps(imm(0xb1), xmm2, xmm3)
-	
-	vmulps(xmm6, xmm0, xmm0)
-	vmulps(xmm7, xmm1, xmm1)
-	vaddsubps(xmm1, xmm0, xmm0)
-	vaddps(xmm10, xmm0, xmm0)
-	vmovlps(xmm0, mem(r10)) // store c01:c11
-	vmovhps(xmm0, mem(r10, rsi, 1))
-	
-	vmulps(xmm6, xmm2, xmm2)
-	vmulps(xmm7, xmm3, xmm3)
-	vaddsubps(xmm3, xmm2, xmm2)
-	vaddps(xmm14, xmm2, xmm2)
-	vmovlps(xmm2, mem(r10, r12, 1)) // store c21:c31
-	vmovhps(xmm2, mem(r10, r13, 1))
-	
-	
-	
-	jmp(.CDONE) // jump to end.
-	
-	
-	
-	label(.CCOLSTORED)
-	
-	
-	vmovups(mem(rcx), xmm0) // load c00:c10
-	vmovups(mem(rcx, 16), xmm2) // load c20:c30
-	vpermilps(imm(0xb1), xmm0, xmm1)
-	vpermilps(imm(0xb1), xmm2, xmm3)
-	
-	vmulps(xmm6, xmm0, xmm0)
-	vmulps(xmm7, xmm1, xmm1)
-	vaddsubps(xmm1, xmm0, xmm0)
-	vaddps(xmm8, xmm0, xmm0)
-	vmovups(xmm0, mem(rcx)) // store c00:c10
-	
-	vmulps(xmm6, xmm2, xmm2)
-	vmulps(xmm7, xmm3, xmm3)
-	vaddsubps(xmm3, xmm2, xmm2)
-	vaddps(xmm12, xmm2, xmm2)
-	vmovups(xmm2, mem(rcx, 16)) // store c20:c30
-	
-	
-	
-	vmovups(mem(r10), xmm0) // load c01:c11
-	vmovups(mem(r10, 16), xmm2) // load c21:c31
-	vpermilps(imm(0xb1), xmm0, xmm1)
-	vpermilps(imm(0xb1), xmm2, xmm3)
-	
-	vmulps(xmm6, xmm0, xmm0)
-	vmulps(xmm7, xmm1, xmm1)
-	vaddsubps(xmm1, xmm0, xmm0)
-	vaddps(xmm10, xmm0, xmm0)
-	vmovups(xmm0, mem(r10)) // store c01:c11
-	
-	vmulps(xmm6, xmm2, xmm2)
-	vmulps(xmm7, xmm3, xmm3)
-	vaddsubps(xmm3, xmm2, xmm2)
-	vaddps(xmm14, xmm2, xmm2)
-	vmovups(xmm2, mem(r10, 16)) // store c21:c31
-	
-	
-	
-	jmp(.CDONE) // jump to end.
-	
-	
-	
+		 // beta != 0: add beta * C into the accumulators that .CBETAZERO stores.
+		vmovups(mem(rcx), xmm0) // load c00:c10
+		vmovups(mem(rcx, 16), xmm2) // load c20:c30
+		vpermilps(imm(0xb1), xmm0, xmm1) // swap real/imag of each element
+		vpermilps(imm(0xb1), xmm2, xmm3)
+
+		vmulps(xmm6, xmm0, xmm0) // c * beta_r
+		vmulps(xmm7, xmm1, xmm1) // swapped c * beta_i
+		vaddsubps(xmm1, xmm0, xmm0) // xmm0 = beta * c00:c10
+		vaddps(xmm8, xmm0, xmm8) // xmm8 += beta * c00:c10
+
+		vmulps(xmm6, xmm2, xmm2)
+		vmulps(xmm7, xmm3, xmm3)
+		vaddsubps(xmm3, xmm2, xmm2) // xmm2 = beta * c20:c30
+		vaddps(xmm12, xmm2, xmm12) // xmm12 += beta * c20:c30
+
+		vmovups(mem(r10), xmm0) // load c01:c11
+		vmovups(mem(r10, 16), xmm2) // load c21:c31
+		vpermilps(imm(0xb1), xmm0, xmm1)
+		vpermilps(imm(0xb1), xmm2, xmm3)
+
+		vmulps(xmm6, xmm0, xmm0)
+		vmulps(xmm7, xmm1, xmm1)
+		vaddsubps(xmm1, xmm0, xmm0) // xmm0 = beta * c01:c11
+		vaddps(xmm10, xmm0, xmm10) // xmm10 += beta * c01:c11
+
+		vmulps(xmm6, xmm2, xmm2)
+		vmulps(xmm7, xmm3, xmm3)
+		vaddsubps(xmm3, xmm2, xmm2) // xmm2 = beta * c21:c31
+		vaddps(xmm14, xmm2, xmm14) // xmm14 += beta * c21:c31
+
+		 // fall through: the stores below write xmm8/xmm12/xmm10/xmm14
+
 	label(.CBETAZERO)
-	 // check if aligned/column-stored
-	 // check if aligned/column-stored
-	and(bl, bh) // set ZF if bl & bh == 1.
-	and(bh, al) // set ZF if bh & al == 1.
-	jne(.CCOLSTORBZ) // jump to column storage case
-	
-	
-	
-	label(.CGENSTORBZ)
-	
-	
-	vmovlps(xmm8, mem(rcx)) // store c00:c10
-	vmovhps(xmm8, mem(rcx, rsi, 1))
-	
-	vmovlps(xmm12, mem(rcx, r12, 1)) // store c20:c30
-	vmovhps(xmm12, mem(rcx, r13, 1))
-	
-	vmovlps(xmm10, mem(r10)) // store c01:c11
-	vmovhps(xmm10, mem(r10, rsi, 1))
-	
-	vmovlps(xmm14, mem(r10, r12, 1)) // store c21:c31
-	vmovhps(xmm14, mem(r10, r13, 1))
-	
-	
-	
-	jmp(.CDONE) // jump to end.
-	
-	
-	
-	label(.CCOLSTORBZ)
-	
-	
-	vmovups(xmm8, mem(rcx)) // store c00:c10
-	vmovups(xmm12, mem(rcx, 16)) // store c20:c30
-	
-	vmovups(xmm10, mem(r10)) // store c01:c11
-	vmovups(xmm14, mem(r10, 16)) // store c21:c31
-	
-	
-	
-	
-	
+
+		vmovups(xmm8, mem(rcx)) // store c00:c10
+		vmovups(xmm12, mem(rcx, 16)) // store c20:c30
+
+		vmovups(xmm10, mem(r10)) // store c01:c11
+		vmovups(xmm14, mem(r10, 16)) // store c21:c31
+
 	label(.CDONE)
-	
 
-    end_asm(
+
+	end_asm(
 	: // output operands (none)
 	: // input operands
-      [k_iter] "m" (k_iter), // 0
-      [k_left] "m" (k_left), // 1
-      [a]      "m" (a),      // 2
-      [b]      "m" (b),      // 3
-      [alpha]  "m" (alpha),  // 4
-      [beta]   "m" (beta),   // 5
-      [c]      "m" (c),      // 6
-      [rs_c]   "m" (rs_c),   // 7
-      [cs_c]   "m" (cs_c),   // 8
-      [b_next] "m" (b_next), // 9
-      [a_next] "m" (a_next)  // 10
+	  [k_iter] "m" (k_iter), // 0
+	  [k_left] "m" (k_left), // 1
+	  [a]      "m" (a),      // 2
+	  [b]      "m" (b),      // 3
+	  [alpha]  "m" (alpha),  // 4
+	  [beta]   "m" (beta),   // 5
+	  [c]      "m" (c),      // 6
+	  [rs_c]   "m" (rs_c),   // 7
+	  [cs_c]   "m" (cs_c),   // 8
+	  [b_next] "m" (b_next), // 9
+	  [a_next] "m" (a_next)  // 10
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
@@ -2176,11 +1383,15 @@ void bli_cgemm_piledriver_asm_4x2
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( c );
 }
 
 void bli_zgemm_piledriver_asm_2x2
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        dcomplex*  restrict alpha,
        dcomplex*  restrict a,
        dcomplex*  restrict b,
@@ -2195,28 +1406,30 @@ void bli_zgemm_piledriver_asm_2x2
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 8;
-	uint64_t k_left = k0 % 8;
+	uint64_t k_iter = k / 8;
+	uint64_t k_left = k % 8;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT( z, 2, 2, false );
+
 	begin_asm()
-	
-	
+
+
 	mov(var(a), rax) // load address of a.
 	mov(var(b), rbx) // load address of b.
 	mov(var(b_next), r15) // load address of b_next.
 	mov(var(a_next), r14) // load address of a_next.
-	
+
 	mov(var(c), rcx) // load address of c
 	mov(var(cs_c), rdi) // load cs_c
 	lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex)
 	lea(mem(, rdi, 2), rdi)
 	lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c;
-	
+
 	add(imm(16*8), rax)
 	add(imm(16*8), rbx)
-	
+
 	vxorpd(xmm8, xmm8, xmm8)
 	vxorpd(xmm9, xmm9, xmm9)
 	vxorpd(xmm10, xmm10, xmm10)
@@ -2225,25 +1438,25 @@ void bli_zgemm_piledriver_asm_2x2
 	vxorpd(xmm13, xmm13, xmm13)
 	vxorpd(xmm14, xmm14, xmm14)
 	vxorpd(xmm15, xmm15, xmm15)
-	
-	
-	
+
+
+
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.ZCONSIDKLEFT) // if i == 0, jump to code that
 	 // contains the k_left loop.
-	
-	
+
+
 	label(.ZLOOPKITER) // MAIN LOOP
-	
-	
+
+
 	je(.ZCONSIDKLEFT) // if i == 0, jump to k_left code.
-	
-	
+
+
 	prefetch(0, mem(rbx, 256))
-	
+
 	prefetch(0, mem(rax, 512))
-	
+
 	 // iteration 0
 	vmovaps(mem(rax, -16*8), xmm0)
 	vmovddup(mem(rbx, -16*8), xmm4)
@@ -2261,7 +1474,7 @@ void bli_zgemm_piledriver_asm_2x2
 	vmovaps(mem(rax, -12*8), xmm0)
 	vmovddup(mem(rbx, -12*8), xmm4)
 	vfmadd231pd(xmm1, xmm7, xmm15)
-	
+
 	 // iteration 1
 	vfmadd231pd(xmm0, xmm4, xmm8)
 	vmovaps(mem(rax, -10*8), xmm1)
@@ -2277,11 +1490,11 @@ void bli_zgemm_piledriver_asm_2x2
 	vmovaps(mem(rax, -8*8), xmm0)
 	vmovddup(mem(rbx, -8*8), xmm4)
 	vfmadd231pd(xmm1, xmm7, xmm15)
-	
+
 	prefetch(0, mem(rbx, 64+256))
-	
+
 	prefetch(0, mem(rax, 64+512))
-	
+
 	 // iteration 2
 	vfmadd231pd(xmm0, xmm4, xmm8)
 	vmovaps(mem(rax, -6*8), xmm1)
@@ -2297,7 +1510,7 @@ void bli_zgemm_piledriver_asm_2x2
 	vmovaps(mem(rax, -4*8), xmm0)
 	vmovddup(mem(rbx, -4*8), xmm4)
 	vfmadd231pd(xmm1, xmm7, xmm15)
-	
+
 	 // iteration 3
 	vfmadd231pd(xmm0, xmm4, xmm8)
 	vmovaps(mem(rax, -2*8), xmm1)
@@ -2313,11 +1526,11 @@ void bli_zgemm_piledriver_asm_2x2
 	vmovaps(mem(rax, 0*8), xmm0)
 	vmovddup(mem(rbx, 0*8), xmm4)
 	vfmadd231pd(xmm1, xmm7, xmm15)
-	
+
 	prefetch(0, mem(rbx, 128+256))
-	
+
 	prefetch(0, mem(rax, 128+512))
-	
+
 	 // iteration 4
 	vfmadd231pd(xmm0, xmm4, xmm8)
 	vmovaps(mem(rax, 2*8), xmm1)
@@ -2333,7 +1546,7 @@ void bli_zgemm_piledriver_asm_2x2
 	vmovaps(mem(rax, 4*8), xmm0)
 	vmovddup(mem(rbx, 4*8), xmm4)
 	vfmadd231pd(xmm1, xmm7, xmm15)
-	
+
 	 // iteration 5
 	vfmadd231pd(xmm0, xmm4, xmm8)
 	vmovaps(mem(rax, 6*8), xmm1)
@@ -2349,11 +1562,11 @@ void bli_zgemm_piledriver_asm_2x2
 	vmovaps(mem(rax, 8*8), xmm0)
 	vmovddup(mem(rbx, 8*8), xmm4)
 	vfmadd231pd(xmm1, xmm7, xmm15)
-	
+
 	prefetch(0, mem(rbx, 128+256))
-	
+
 	prefetch(0, mem(rax, 128+512))
-	
+
 	 // iteration 6
 	vfmadd231pd(xmm0, xmm4, xmm8)
 	vmovaps(mem(rax, 10*8), xmm1)
@@ -2369,7 +1582,7 @@ void bli_zgemm_piledriver_asm_2x2
 	vmovaps(mem(rax, 12*8), xmm0)
 	vmovddup(mem(rbx, 12*8), xmm4)
 	vfmadd231pd(xmm1, xmm7, xmm15)
-	
+
 	 // iteration 7
 	vfmadd231pd(xmm0, xmm4, xmm8)
 	vmovaps(mem(rax, 14*8), xmm1)
@@ -2385,34 +1598,34 @@ void bli_zgemm_piledriver_asm_2x2
 	add(imm(8*2*16), rbx) // b += 8*2 (unroll x nr)
 	vfmadd231pd(xmm0, xmm7, xmm11)
 	vfmadd231pd(xmm1, xmm7, xmm15)
-	
-	
-	
+
+
+
 	dec(rsi) // i -= 1;
 	jmp(.ZLOOPKITER) // jump to beginning of loop.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.ZCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
 	 // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.ZLOOPKLEFT) // EDGE LOOP
-	
-	
+
+
 	je(.ZPOSTACCUM) // if i == 0, we're done.
-	
+
 	prefetch(0, mem(rbx, 256))
-	
+
 	prefetch(0, mem(rax, 512))
-	
+
 	 // iteration 0
 	vmovaps(mem(rax, -16*8), xmm0)
 	vmovddup(mem(rbx, -16*8), xmm4)
@@ -2428,119 +1641,86 @@ void bli_zgemm_piledriver_asm_2x2
 	vmovddup(mem(rbx, -13*8), xmm7)
 	vfmadd231pd(xmm0, xmm7, xmm11)
 	vfmadd231pd(xmm1, xmm7, xmm15)
-	
-	
+
+
 	add(imm(1*2*16), rax) // a += 1*2 (1 x mr)
 	add(imm(1*2*16), rbx) // b += 1*2 (1 x nr)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jmp(.ZLOOPKLEFT) // jump to beginning of loop.
-	
-	
-	
+
+
+
 	label(.ZPOSTACCUM)
-	
-	
+
+
 	prefetchw0(mem(rcx, 0*8)) // prefetch c + 0*cs_c
 	prefetchw0(mem(r10, 0*8)) // prefetch c + 1*cs_c
-	
-	
+
+
 	vpermilpd(imm(0x1), xmm9, xmm9)
 	vpermilpd(imm(0x1), xmm11, xmm11)
 	vpermilpd(imm(0x1), xmm13, xmm13)
 	vpermilpd(imm(0x1), xmm15, xmm15)
-	
+
 	vaddsubpd(xmm9, xmm8, xmm8)
 	vaddsubpd(xmm11, xmm10, xmm10)
 	vaddsubpd(xmm13, xmm12, xmm12)
 	vaddsubpd(xmm15, xmm14, xmm14)
-	
-	
+
+
 	 // xmm8:   xmm10:
 	 // ( ab00  ( ab01
 	 //   ab10 )  ab11 )
-	
+
 	 // xmm12:  xmm14:
 	 // ( ab20  ( ab21
 	 //   ab30 )  ab31 )
-	
-	
+
+
 	prefetch(0, mem(r14)) // prefetch a_next
 	prefetch(0, mem(r14, 64)) // prefetch a_next
-	
-	
+
+
 	 // scale by alpha
-	
+
 	mov(var(alpha), rax) // load address of alpha
 	vmovddup(mem(rax), xmm0) // load alpha_r and duplicate
 	vmovddup(mem(rax, 8), xmm1) // load alpha_i and duplicate
-	
+
 	vpermilpd(imm(0x1), xmm8, xmm9)
 	vpermilpd(imm(0x1), xmm10, xmm11)
 	vpermilpd(imm(0x1), xmm12, xmm13)
 	vpermilpd(imm(0x1), xmm14, xmm15)
-	
+
 	vmulpd(xmm8, xmm0, xmm8)
 	vmulpd(xmm10, xmm0, xmm10)
 	vmulpd(xmm12, xmm0, xmm12)
 	vmulpd(xmm14, xmm0, xmm14)
-	
+
 	vmulpd(xmm9, xmm1, xmm9)
 	vmulpd(xmm11, xmm1, xmm11)
 	vmulpd(xmm13, xmm1, xmm13)
 	vmulpd(xmm15, xmm1, xmm15)
-	
+
 	vaddsubpd(xmm9, xmm8, xmm8)
 	vaddsubpd(xmm11, xmm10, xmm10)
 	vaddsubpd(xmm13, xmm12, xmm12)
 	vaddsubpd(xmm15, xmm14, xmm14)
-	
-	
-	
-	
+
+
+
+
 	mov(var(beta), rbx) // load address of beta
 	vmovddup(mem(rbx), xmm6) // load beta_r and duplicate
 	vmovddup(mem(rbx, 8), xmm7) // load beta_i and duplicate
-	
-	
-	
-	
-	
-	
-	
-	mov(var(rs_c), rsi) // load rs_c
-	lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex)
-	lea(mem(, rsi, 2), rsi)
-	//lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c;
-	
-	
-	
-	
-	
+
 	prefetch(0, mem(r15)) // prefetch b_next
 	prefetch(0, mem(r15, 64)) // prefetch b_next
-	
-	
-	
-	 // determine if
-	 //    c    % 32 == 0, AND
-	 // 16*cs_c % 32 == 0, AND
-	 //    rs_c      == 1
-	 // ie: aligned, ldim aligned, and
-	 // column-stored
-	
-	cmp(imm(16), rsi) // set ZF if (16*rs_c) == 16.
-	sete(bl) // bl = ( ZF == 1 ? 1 : 0 );
-	test(imm(31), rcx) // set ZF if c & 32 is zero.
-	setz(bh) // bh = ( ZF == 0 ? 1 : 0 );
-	test(imm(31), rdi) // set ZF if (16*cs_c) & 32 is zero.
-	setz(al) // al = ( ZF == 0 ? 1 : 0 );
-	 // and(bl,bh) followed by
-	 // and(bh,al) will reveal result
-	
+
 	 // now avoid loading C if beta == 0
-	
+
 	vxorpd(xmm0, xmm0, xmm0) // set xmm0 to zero.
 	vucomisd(xmm0, xmm6) // set ZF if beta_r == 0.
 	sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 );
@@ -2548,161 +1728,66 @@ void bli_zgemm_piledriver_asm_2x2
 	sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 );
 	and(r8b, r9b) // set ZF if r8b & r9b == 1.
 	jne(.ZBETAZERO) // if ZF = 0, jump to beta == 0 case
-	
-	
-	 // check if aligned/column-stored
-	and(bl, bh) // set ZF if bl & bh == 1.
-	and(bh, al) // set ZF if bh & al == 1.
-	jne(.ZCOLSTORED) // jump to column storage case
-	
-	
-	
-	label(.ZGENSTORED)
-	
-	
-	vmovups(mem(rcx), xmm0) // load c00
-	vmovups(mem(rcx, rsi, 1), xmm2) // load c10
-	vpermilpd(imm(0x1), xmm0, xmm1)
-	vpermilpd(imm(0x1), xmm2, xmm3)
-	
-	vmulpd(xmm6, xmm0, xmm0)
-	vmulpd(xmm7, xmm1, xmm1)
-	vaddsubpd(xmm1, xmm0, xmm0)
-	vaddpd(xmm8, xmm0, xmm0)
-	vmovups(xmm0, mem(rcx)) // store c00
-	
-	vmulpd(xmm6, xmm2, xmm2)
-	vmulpd(xmm7, xmm3, xmm3)
-	vaddsubpd(xmm3, xmm2, xmm2)
-	vaddpd(xmm12, xmm2, xmm2)
-	vmovups(xmm2, mem(rcx, rsi, 1)) // store c10
-	
-	
-	
-	vmovups(mem(r10), xmm0) // load c01
-	vmovups(mem(r10, rsi, 1), xmm2) // load c11
-	vpermilpd(imm(0x1), xmm0, xmm1)
-	vpermilpd(imm(0x1), xmm2, xmm3)
-	
-	vmulpd(xmm6, xmm0, xmm0)
-	vmulpd(xmm7, xmm1, xmm1)
-	vaddsubpd(xmm1, xmm0, xmm0)
-	vaddpd(xmm10, xmm0, xmm0)
-	vmovups(xmm0, mem(r10)) // store c01
-	
-	vmulpd(xmm6, xmm2, xmm2)
-	vmulpd(xmm7, xmm3, xmm3)
-	vaddsubpd(xmm3, xmm2, xmm2)
-	vaddpd(xmm14, xmm2, xmm2)
-	vmovups(xmm2, mem(r10, rsi, 1)) // store c11
-	
-	
-	
-	jmp(.ZDONE) // jump to end.
-	
-	
-	
-	label(.ZCOLSTORED)
-	
-	
-	vmovups(mem(rcx), xmm0) // load c00
-	vmovups(mem(rcx, 16), xmm2) // load c10
-	vpermilpd(imm(0x1), xmm0, xmm1)
-	vpermilpd(imm(0x1), xmm2, xmm3)
-	
-	vmulpd(xmm6, xmm0, xmm0)
-	vmulpd(xmm7, xmm1, xmm1)
-	vaddsubpd(xmm1, xmm0, xmm0)
-	vaddpd(xmm8, xmm0, xmm0)
-	vmovups(xmm0, mem(rcx)) // store c00
-	
-	vmulpd(xmm6, xmm2, xmm2)
-	vmulpd(xmm7, xmm3, xmm3)
-	vaddsubpd(xmm3, xmm2, xmm2)
-	vaddpd(xmm12, xmm2, xmm2)
-	vmovups(xmm2, mem(rcx, 16)) // store c10
-	
-	
-	
-	vmovups(mem(r10), xmm0) // load c01
-	vmovups(mem(r10, 16), xmm2) // load c11
-	vpermilpd(imm(0x1), xmm0, xmm1)
-	vpermilpd(imm(0x1), xmm2, xmm3)
-	
-	vmulpd(xmm6, xmm0, xmm0)
-	vmulpd(xmm7, xmm1, xmm1)
-	vaddsubpd(xmm1, xmm0, xmm0)
-	vaddpd(xmm10, xmm0, xmm0)
-	vmovups(xmm0, mem(r10)) // store c01
-	
-	vmulpd(xmm6, xmm2, xmm2)
-	vmulpd(xmm7, xmm3, xmm3)
-	vaddsubpd(xmm3, xmm2, xmm2)
-	vaddpd(xmm14, xmm2, xmm2)
-	vmovups(xmm2, mem(r10, 16)) // store c11
-	
-	
-	
-	jmp(.ZDONE) // jump to end.
-	
-	
-	
+		 // beta != 0: add beta * C into the accumulators that .ZBETAZERO stores.
+		vmovups(mem(rcx), xmm0) // load c00
+		vmovups(mem(rcx, 16), xmm2) // load c10
+		vpermilpd(imm(0x1), xmm0, xmm1) // swap real/imag
+		vpermilpd(imm(0x1), xmm2, xmm3)
+
+		vmulpd(xmm6, xmm0, xmm0) // c * beta_r
+		vmulpd(xmm7, xmm1, xmm1) // swapped c * beta_i
+		vaddsubpd(xmm1, xmm0, xmm0) // xmm0 = beta * c00
+		vaddpd(xmm8, xmm0, xmm8) // xmm8 += beta * c00
+
+		vmulpd(xmm6, xmm2, xmm2)
+		vmulpd(xmm7, xmm3, xmm3)
+		vaddsubpd(xmm3, xmm2, xmm2) // xmm2 = beta * c10
+		vaddpd(xmm12, xmm2, xmm12) // xmm12 += beta * c10
+
+		vmovups(mem(r10), xmm0) // load c01
+		vmovups(mem(r10, 16), xmm2) // load c11
+		vpermilpd(imm(0x1), xmm0, xmm1)
+		vpermilpd(imm(0x1), xmm2, xmm3)
+
+		vmulpd(xmm6, xmm0, xmm0)
+		vmulpd(xmm7, xmm1, xmm1)
+		vaddsubpd(xmm1, xmm0, xmm0) // xmm0 = beta * c01
+		vaddpd(xmm10, xmm0, xmm10) // xmm10 += beta * c01
+
+		vmulpd(xmm6, xmm2, xmm2)
+		vmulpd(xmm7, xmm3, xmm3)
+		vaddsubpd(xmm3, xmm2, xmm2) // xmm2 = beta * c11
+		vaddpd(xmm14, xmm2, xmm14) // xmm14 += beta * c11
+
+		 // fall through: the stores below write xmm8/xmm12/xmm10/xmm14
+
 	label(.ZBETAZERO)
-	 // check if aligned/column-stored
-	 // check if aligned/column-stored
-	and(bl, bh) // set ZF if bl & bh == 1.
-	and(bh, al) // set ZF if bh & al == 1.
-	jne(.ZCOLSTORBZ) // jump to column storage case
-	
-	
-	
-	label(.ZGENSTORBZ)
-	
-	
-	vmovups(xmm8, mem(rcx)) // store c00
-	vmovups(xmm12, mem(rcx, rsi, 1)) // store c10
-	
-	vmovups(xmm10, mem(r10)) // store c01
-	vmovups(xmm14, mem(r10, rsi, 1)) // store c11
-	
-	
-	
-	jmp(.ZDONE) // jump to end.
-	
-	
-	
-	label(.ZCOLSTORBZ)
-	
-	
-	vmovups(xmm8, mem(rcx)) // store c00
-	vmovups(xmm12, mem(rcx, 16)) // store c10
-	
-	vmovups(xmm10, mem(r10)) // store c01
-	vmovups(xmm14, mem(r10, 16)) // store c11
-	
-	
-	
-	
-	
+
+		vmovups(xmm8, mem(rcx)) // store c00
+		vmovups(xmm12, mem(rcx, 16)) // store c10
+
+		vmovups(xmm10, mem(r10)) // store c01
+		vmovups(xmm14, mem(r10, 16)) // store c11
+
 	label(.ZDONE)
-	
 
-    end_asm(
+
+	end_asm(
 	: // output operands (none)
 	: // input operands
-      [k_iter] "m" (k_iter), // 0
-      [k_left] "m" (k_left), // 1
-      [a]      "m" (a),      // 2
-      [b]      "m" (b),      // 3
-      [alpha]  "m" (alpha),  // 4
-      [beta]   "m" (beta),   // 5
-      [c]      "m" (c),      // 6
-      [rs_c]   "m" (rs_c),   // 7
-      [cs_c]   "m" (cs_c),   // 8
-      [b_next] "m" (b_next), // 9
-      [a_next] "m" (a_next)  // 10
+	  [k_iter] "m" (k_iter), // 0
+	  [k_left] "m" (k_left), // 1
+	  [a]      "m" (a),      // 2
+	  [b]      "m" (b),      // 3
+	  [alpha]  "m" (alpha),  // 4
+	  [beta]   "m" (beta),   // 5
+	  [c]      "m" (c),      // 6
+	  [rs_c]   "m" (rs_c),   // 7
+	  [cs_c]   "m" (cs_c),   // 8
+	  [b_next] "m" (b_next), // 9
+	  [a_next] "m" (a_next)  // 10
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
@@ -2710,6 +1795,8 @@ void bli_zgemm_piledriver_asm_2x2
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( z );
 }
 
 
diff --git a/kernels/power10/3/bli_dgemm_power10_mma.c b/kernels/power10/3/bli_dgemm_power10_mma.c
index 396824986..84e7d16d3 100644
--- a/kernels/power10/3/bli_dgemm_power10_mma.c
+++ b/kernels/power10/3/bli_dgemm_power10_mma.c
@@ -37,7 +37,7 @@
 
 #define D_ASSEMBLE_VEC_PAIR \
         __builtin_mma_assemble_pair (&colA_1, ca[1], ca[0]); \
-        __builtin_mma_assemble_pair (&colA_2, ca[3], ca[2]); 
+        __builtin_mma_assemble_pair (&colA_2, ca[3], ca[2]);
 
 #define D_ACCUMULATE \
         __builtin_mma_xvf64gerpp (&acc0, colA_1, rb[0]); \
@@ -47,7 +47,7 @@
         __builtin_mma_xvf64gerpp (&acc4, colA_2, rb[0]); \
         __builtin_mma_xvf64gerpp (&acc5, colA_2, rb[1]); \
         __builtin_mma_xvf64gerpp (&acc6, colA_2, rb[2]); \
-        __builtin_mma_xvf64gerpp (&acc7, colA_2, rb[3]); 
+        __builtin_mma_xvf64gerpp (&acc7, colA_2, rb[3]);
 
 #define D_INCREMENT \
         A0+=8; \
@@ -57,17 +57,19 @@
         LOAD_VECTORS \
         D_ASSEMBLE_VEC_PAIR \
         D_INCREMENT \
-        D_ACCUMULATE 
+        D_ACCUMULATE
 
 
 void bli_dgemm_power10_mma_8x8
     (
-        dim_t               k0,
+        dim_t               m,
+        dim_t               n,
+        dim_t               k,
         double*    restrict alpha,
         double*    restrict a,
         double*    restrict b,
         double*    restrict beta,
-        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+        double*    restrict c, inc_t rs_c0, inc_t cs_c,
         auxinfo_t* restrict data,
         cntx_t*    restrict cntx
     )
@@ -76,11 +78,13 @@ void bli_dgemm_power10_mma_8x8
     // Typecast local copies of integers in case dim_t and inc_t are a
     // different size than is expected by load instructions.
     // (1 is subtracted from k0 because 1 iteration of the k loop is pulled out)
-    uint64_t k_iter = (k0-1) / 4;
-    uint64_t k_left = (k0-1) % 4;
+    uint64_t k_iter = (k-1) / 4;
+    uint64_t k_left = (k-1) % 4;
 
     uint64_t rs_c   = rs_c0;
 
+    GEMM_UKR_SETUP_CT( d, 8, 8, true );
+
     double* restrict A0 = a;
     double* restrict B0 = b;
     double* restrict C0 = c;
@@ -92,23 +96,23 @@ void bli_dgemm_power10_mma_8x8
     dv4sf_t *rowC;
 
     /* 8 accumulator registers that will be used to store the result.
-       
+
        Each accumulator register is mapped to 4 vector registers.
        Illustration:
-                      
+
             acc0 = [  vs0
                       vs1
                       vs3
                       vs4  ]
 
-        These registers are used to store the result of an outer product 
+        These registers are used to store the result of an outer product
         instruction (general outer product instruction syntax: xv???ger??). */
-    __vector_quad acc0, acc1, acc2, acc3, 
+    __vector_quad acc0, acc1, acc2, acc3,
                   acc4, acc5, acc6, acc7;
 
-    /* 2 vector pairs are necessary for a double precision outer product 
+    /* 2 vector pairs are necessary for a double precision outer product
        instruction. */
-    __vector_pair colA_1, 
+    __vector_pair colA_1,
                   colA_2;
 
     /* Prefetch C so that it stays in cache */
@@ -123,17 +127,17 @@ void bli_dgemm_power10_mma_8x8
 
     /* Load elements into vector registers */
     vec_t *ca = (vec_t *) A0;
-    vec_t *rb = (vec_t *) B0; 
+    vec_t *rb = (vec_t *) B0;
 
-    /* Each accumulator represents a matrix of size 
+    /* Each accumulator represents a matrix of size
        4 x ( 16 / (datatype size in bytes) )  (vector register size = 16B)
 
-       Thus in the case of double, the accumulate registers represent a 4x2 
+       Thus in the case of double, the accumulate registers represent a 4x2
        matrix. However, a vector register can hold at most 2 doubles. Thus, if
-       we performed an outer product using 2 vector register, we can only get a 
+       we performed an outer product using 2 vector registers, we can only get a
        2x2 matrix. Therefore, we must create a vector register pair in order
        to get the desired 4x2 matrix.
-    
+
     */
     D_ASSEMBLE_VEC_PAIR
 
@@ -158,7 +162,7 @@ void bli_dgemm_power10_mma_8x8
         D_AB_PRODUCT
         D_AB_PRODUCT
     }
-    
+
     // edge loop
     for (int k = 0; k<k_left; k++)
     {
@@ -189,4 +193,5 @@ void bli_dgemm_power10_mma_8x8
         SAVE_ACC_bz(dv4sf_t, &acc7, rs_c, 6+4*rs_c);
     }
 
+    GEMM_UKR_FLUSH_CT( d );
 }
diff --git a/kernels/power10/3/bli_i16gemm_power10_mma.c b/kernels/power10/3/bli_i16gemm_power10_mma.c
index 1e52805fe..c7f81dc7d 100644
--- a/kernels/power10/3/bli_i16gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i16gemm_power10_mma.c
@@ -55,7 +55,9 @@
 
 void bli_i16gemm_power10_mma_8x16
     (
-        dim_t               k0,
+        dim_t               m,
+        dim_t               n,
+        dim_t               k,
         int32_t*       restrict alpha,
         short*     restrict a,
         short*     restrict b,
@@ -66,8 +68,8 @@ void bli_i16gemm_power10_mma_8x16
     )
 {
 
-    uint64_t k_iter = (k0-1) / 4;
-    uint64_t k_left = (k0-1) % 4;
+    uint64_t k_iter = (k-1) / 4;
+    uint64_t k_left = (k-1) % 4;
 
     uint64_t rs_c   = rs_c0;
 
@@ -82,7 +84,7 @@ void bli_i16gemm_power10_mma_8x16
     iv4sf_t *rowC;
 
     // accumulators that will hold the matrix product
-    __vector_quad acc0, acc1, acc2, acc3, 
+    __vector_quad acc0, acc1, acc2, acc3,
                   acc4, acc5, acc6, acc7;
 
     vec_t *ca = (vec_t *) A0;
diff --git a/kernels/power10/3/bli_i16sgemm_power10_mma.c b/kernels/power10/3/bli_i16sgemm_power10_mma.c
index 1ddafd722..9e8d99c13 100644
--- a/kernels/power10/3/bli_i16sgemm_power10_mma.c
+++ b/kernels/power10/3/bli_i16sgemm_power10_mma.c
@@ -55,7 +55,9 @@
 
 void bli_i16sgemm_power10_mma_8x16
     (
-        dim_t               k0,
+        dim_t               m,
+        dim_t               n,
+        dim_t               k,
         int32_t*       restrict alpha,
         short*     restrict a,
         short*     restrict b,
@@ -66,8 +68,8 @@ void bli_i16sgemm_power10_mma_8x16
     )
 {
 
-    uint64_t k_iter = (k0-1) / 4;
-    uint64_t k_left = (k0-1) % 4;
+    uint64_t k_iter = (k-1) / 4;
+    uint64_t k_left = (k-1) % 4;
 
     uint64_t rs_c   = rs_c0;
 
@@ -82,7 +84,7 @@ void bli_i16sgemm_power10_mma_8x16
     iv4sf_t *rowC;
 
     // accumulators that will hold the matrix product
-    __vector_quad acc0, acc1, acc2, acc3, 
+    __vector_quad acc0, acc1, acc2, acc3,
                   acc4, acc5, acc6, acc7;
 
     vec_t *ca = (vec_t *) A0;
diff --git a/kernels/power10/3/bli_i4gemm_power10_mma.c b/kernels/power10/3/bli_i4gemm_power10_mma.c
index c06c88a6e..7527f271f 100644
--- a/kernels/power10/3/bli_i4gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i4gemm_power10_mma.c
@@ -55,7 +55,9 @@
 
 void bli_i4gemm_power10_mma_8x16
     (
-        dim_t               k0,
+        dim_t               m,
+        dim_t               n,
+        dim_t               k,
         int32_t*       restrict alpha,
         nibbles*   restrict a,
         nibbles*   restrict b,
@@ -66,8 +68,8 @@ void bli_i4gemm_power10_mma_8x16
     )
 {
 
-    uint64_t k_iter = (k0-1) / 4;
-	uint64_t k_left = (k0-1) % 4;
+    uint64_t k_iter = (k-1) / 4;
+    uint64_t k_left = (k-1) % 4;
 
     uint64_t rs_c   = rs_c0;
 
@@ -82,11 +84,11 @@ void bli_i4gemm_power10_mma_8x16
     iv4sf_t *rowC;
 
     // accumulators that will hold the matrix product
-    __vector_quad acc0, acc1, acc2, acc3, 
+    __vector_quad acc0, acc1, acc2, acc3,
                   acc4, acc5, acc6, acc7;
 
     vec_t *ca = (vec_t *) A0;
-    vec_t *rb = (vec_t *) B0;        
+    vec_t *rb = (vec_t *) B0;
 
     __builtin_mma_xvi4ger8 (&acc0, ca[0], rb[0]);
     __builtin_mma_xvi4ger8 (&acc1, ca[0], rb[1]);
@@ -96,23 +98,23 @@ void bli_i4gemm_power10_mma_8x16
     __builtin_mma_xvi4ger8 (&acc5, ca[1], rb[1]);
     __builtin_mma_xvi4ger8 (&acc6, ca[1], rb[2]);
     __builtin_mma_xvi4ger8 (&acc7, ca[1], rb[3]);
-    
+
     I4_INCREMENT
 
     // k loop (unrolled by 4)
-	for (int k = 0; k<k_iter; k++)
-	{
-		I4_AB_PRODUCT
-		I4_AB_PRODUCT
-		I4_AB_PRODUCT
-		I4_AB_PRODUCT
-	}
-	
-	// edge loop
-	for (int k = 0; k<k_left; k++)
-	{
-		I4_AB_PRODUCT
-	}
+    for (int k = 0; k<k_iter; k++)
+    {
+        I4_AB_PRODUCT
+        I4_AB_PRODUCT
+        I4_AB_PRODUCT
+        I4_AB_PRODUCT
+    }
+
+    // edge loop
+    for (int k = 0; k<k_left; k++)
+    {
+        I4_AB_PRODUCT
+    }
 
     // handle beta cases
     if (beta_ != 0.0)
diff --git a/kernels/power10/3/bli_i8gemm_power10_mma.c b/kernels/power10/3/bli_i8gemm_power10_mma.c
index 0d6b62e84..037a28595 100644
--- a/kernels/power10/3/bli_i8gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i8gemm_power10_mma.c
@@ -55,7 +55,9 @@
 
 void bli_i8gemm_power10_mma_8x16
     (
-        dim_t               k0,
+        dim_t               m,
+        dim_t               n,
+        dim_t               k,
         int32_t*       restrict alpha,
         int8_t*    restrict a,
         int8_t*    restrict b,
@@ -65,8 +67,8 @@ void bli_i8gemm_power10_mma_8x16
         cntx_t*    restrict cntx
     )
 {
-    uint64_t k_iter = (k0-1) / 4;
-	uint64_t k_left = (k0-1) % 4;
+    uint64_t k_iter = (k-1) / 4;
+    uint64_t k_left = (k-1) % 4;
 
     uint64_t rs_c   = rs_c0;
 
@@ -81,11 +83,11 @@ void bli_i8gemm_power10_mma_8x16
     iv4sf_t *rowC;
 
     // accumulators that will hold the matrix product
-    __vector_quad acc0, acc1, acc2, acc3, 
+    __vector_quad acc0, acc1, acc2, acc3,
                   acc4, acc5, acc6, acc7;
 
     vec_t *ca = (vec_t *) A0;
-    vec_t *rb = (vec_t *) B0;        
+    vec_t *rb = (vec_t *) B0;
 
     __builtin_mma_xvi8ger4 (&acc0, ca[0], rb[0]);
     __builtin_mma_xvi8ger4 (&acc1, ca[0], rb[1]);
@@ -99,19 +101,19 @@ void bli_i8gemm_power10_mma_8x16
     I8_INCREMENT
 
     // k loop (unrolled by 4)
-	for (int k = 0; k<k_iter; k++)
-	{
-		I8_AB_PRODUCT
-		I8_AB_PRODUCT
-		I8_AB_PRODUCT
-		I8_AB_PRODUCT
-	}
-	
-	// edge loop
-	for (int k = 0; k<k_left; k++)
-	{
-		I8_AB_PRODUCT
-	}
+    for (int k = 0; k<k_iter; k++)
+    {
+        I8_AB_PRODUCT
+        I8_AB_PRODUCT
+        I8_AB_PRODUCT
+        I8_AB_PRODUCT
+    }
+
+    // edge loop
+    for (int k = 0; k<k_left; k++)
+    {
+        I8_AB_PRODUCT
+    }
 
     // handle beta cases
     if (beta_ != 0.0)
diff --git a/kernels/power10/3/bli_sbgemm_power10_mma.c b/kernels/power10/3/bli_sbgemm_power10_mma.c
index 7a65157ae..b37a0c7ce 100644
--- a/kernels/power10/3/bli_sbgemm_power10_mma.c
+++ b/kernels/power10/3/bli_sbgemm_power10_mma.c
@@ -42,21 +42,23 @@
     __builtin_mma_xvbf16ger2pp (&acc4, ca[1], rb[0]); \
     __builtin_mma_xvbf16ger2pp (&acc5, ca[1], rb[1]); \
     __builtin_mma_xvbf16ger2pp (&acc6, ca[1], rb[2]); \
-    __builtin_mma_xvbf16ger2pp (&acc7, ca[1], rb[3]); 
+    __builtin_mma_xvbf16ger2pp (&acc7, ca[1], rb[3]);
 
 #define B_INCREMENT \
     A0+=16; \
-    B0+=32; 
-    
+    B0+=32;
+
 #define B_AB_PRODUCT \
     LOAD_VECTORS \
     B_INCREMENT \
-    B_ACCUMULATE 
+    B_ACCUMULATE
 
 
 void bli_sbgemm_power10_mma_8x16
     (
-        dim_t               k0,
+        dim_t               m,
+        dim_t               n,
+        dim_t               k,
         float*     restrict alpha,
         bfloat16*  restrict a,
         bfloat16*  restrict b,
@@ -67,8 +69,8 @@ void bli_sbgemm_power10_mma_8x16
     )
 {
 
-    uint64_t k_iter = (k0-1)/4;
-    uint64_t k_left = (k0-1)%4;
+    uint64_t k_iter = (k-1)/4;
+    uint64_t k_left = (k-1)%4;
 
     uint64_t rs_c   = rs_c0;
 
@@ -83,7 +85,7 @@ void bli_sbgemm_power10_mma_8x16
     fv4sf_t *rowC;
 
     // accumulators that will hold the matrix product
-    __vector_quad acc0, acc1, acc2, acc3, 
+    __vector_quad acc0, acc1, acc2, acc3,
                   acc4, acc5, acc6, acc7;
 
     vec_t *ca = (vec_t *) A0;
diff --git a/kernels/power10/3/bli_sgemm_power10_mma.c b/kernels/power10/3/bli_sgemm_power10_mma.c
index 196bc085f..42bbaa916 100644
--- a/kernels/power10/3/bli_sgemm_power10_mma.c
+++ b/kernels/power10/3/bli_sgemm_power10_mma.c
@@ -42,7 +42,7 @@
         __builtin_mma_xvf32gerpp (&acc4, ca[1], rb[0]); \
         __builtin_mma_xvf32gerpp (&acc5, ca[1], rb[1]); \
         __builtin_mma_xvf32gerpp (&acc6, ca[1], rb[2]); \
-        __builtin_mma_xvf32gerpp (&acc7, ca[1], rb[3]); 
+        __builtin_mma_xvf32gerpp (&acc7, ca[1], rb[3]);
 
 #define S_INCREMENT \
         A0+=8; \
@@ -51,16 +51,18 @@
 #define S_AB_PRODUCT \
         LOAD_VECTORS \
         S_INCREMENT \
-        S_ACCUMULATE 
+        S_ACCUMULATE
 
 void bli_sgemm_power10_mma_8x16
     (
-        dim_t               k0,
+        dim_t               m,
+        dim_t               n,
+        dim_t               k,
         float*     restrict alpha,
         float*     restrict a,
         float*     restrict b,
         float*     restrict beta,
-        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
+        float*     restrict c, inc_t rs_c0, inc_t cs_c,
         auxinfo_t* restrict data,
         cntx_t*    restrict cntx
     )
@@ -68,16 +70,18 @@ void bli_sgemm_power10_mma_8x16
     // Typecast local copies of integers in case dim_t and inc_t are a
     // different size than is expected by load instructions.
     // (1 is subtracted from k0 because 1 iteration of the k loop is pulled out)
-    uint64_t k_iter = (k0-1) / 4;
-    uint64_t k_left = (k0-1) % 4;
-    
+    uint64_t k_iter = (k-1) / 4;
+    uint64_t k_left = (k-1) % 4;
+
     uint64_t rs_c   = rs_c0;
 
+    GEMM_UKR_SETUP_CT( s, 8, 16, true );
+
     fv4sf_t result[4];
       fv4sf_t *rowC;
 
     // accumulators that will hold the matrix product
-    __vector_quad acc0, acc1, acc2, acc3, 
+    __vector_quad acc0, acc1, acc2, acc3,
                   acc4, acc5, acc6, acc7;
 
     float* restrict A0 = a;
@@ -111,7 +115,7 @@ void bli_sgemm_power10_mma_8x16
         S_AB_PRODUCT
         S_AB_PRODUCT
     }
-    
+
     // edge loop
     for (int k = 0; k<k_left; k++)
     {
@@ -141,4 +145,6 @@ void bli_sgemm_power10_mma_8x16
         SAVE_ACC_bz(fv4sf_t, &acc6, rs_c,  8+4*rs_c);
         SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
     }
+
+    GEMM_UKR_FLUSH_CT( s );
 }
\ No newline at end of file
diff --git a/kernels/power10/3/bli_shgemm_power10_mma.c b/kernels/power10/3/bli_shgemm_power10_mma.c
index 8a16fdc06..0e80735df 100644
--- a/kernels/power10/3/bli_shgemm_power10_mma.c
+++ b/kernels/power10/3/bli_shgemm_power10_mma.c
@@ -42,21 +42,23 @@
     __builtin_mma_xvf16ger2pp (&acc4, ca[1], rb[0]); \
     __builtin_mma_xvf16ger2pp (&acc5, ca[1], rb[1]); \
     __builtin_mma_xvf16ger2pp (&acc6, ca[1], rb[2]); \
-    __builtin_mma_xvf16ger2pp (&acc7, ca[1], rb[3]); 
+    __builtin_mma_xvf16ger2pp (&acc7, ca[1], rb[3]);
 
 #define H_INCREMENT \
     A0+=16; \
-    B0+=32; 
-    
+    B0+=32;
+
 #define H_AB_PRODUCT \
     LOAD_VECTORS \
     H_INCREMENT \
-    H_ACCUMULATE 
+    H_ACCUMULATE
 
 
 void bli_shgemm_power10_mma_8x16
     (
-        dim_t               k0,
+        dim_t              m,
+        dim_t              n,
+        dim_t              k,
         float*     restrict alpha,
         float16*  restrict a,
         float16*  restrict b,
@@ -67,8 +69,8 @@ void bli_shgemm_power10_mma_8x16
     )
 {
 
-    uint64_t k_iter = (k0-1)/4;
-    uint64_t k_left = (k0-1)%4;
+    uint64_t k_iter = (k-1)/4;
+    uint64_t k_left = (k-1)%4;
 
     uint64_t rs_c   = rs_c0;
 
@@ -83,7 +85,7 @@ void bli_shgemm_power10_mma_8x16
     fv4sf_t *rowC;
 
     // accumulators that will hold the matrix product
-    __vector_quad acc0, acc1, acc2, acc3, 
+    __vector_quad acc0, acc1, acc2, acc3,
                   acc4, acc5, acc6, acc7;
 
     vec_t *ca = (vec_t *) A0;
diff --git a/kernels/power7/3/bli_gemm_power7_int_8x4.c b/kernels/power7/3/bli_gemm_power7_int_8x4.c
index eb6b42ba7..b9ce85f72 100644
--- a/kernels/power7/3/bli_gemm_power7_int_8x4.c
+++ b/kernels/power7/3/bli_gemm_power7_int_8x4.c
@@ -50,32 +50,28 @@
  */
 void bli_sgemm_power7_int_8x4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        float*     restrict alpha,
        float*     restrict a,
        float*     restrict b,
        float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
+       float*     restrict c, inc_t rs_c, inc_t cs_c,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
      )
 {
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k      = k0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
 #if 1 || defined(UTEST)
     const long MR = BLIS_DEFAULT_MR_S, NR = BLIS_DEFAULT_NR_S;
     const long LDA = MR, LDB = NR;
     long i, j, kk;
     float c00;
 
-    for (i=0; i < MR; i++) {
-        for (j=0; j < NR; j++) {
+    for (i=0; i < m; i++) {
+        for (j=0; j < n; j++) {
             c00 = c[BLIS_INDEX(i,j,rs_c,cs_c)] * *beta;
-            for (kk=0; kk < k; kk++) 
+            for (kk=0; kk < k; kk++)
                 c00 += *alpha * (a[COLMAJ_INDEX(i,kk,LDA)] * b[ROWMAJ_INDEX(kk,j,LDB)]);
             c[BLIS_INDEX(i,j,rs_c,cs_c)] = c00;
         }
@@ -96,24 +92,160 @@ void bli_sgemm_power7_int_8x4
  */
 void bli_dgemm_power7_int_8x4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        double*    restrict alpha,
        double*    restrict a,
        double*    restrict b,
        double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       double*    restrict c, inc_t rs_c, inc_t cs_c,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
      )
 {
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k      = k0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-#if 1
-    if (rs_c == 1) {
+    if ( cs_c == 1 )
+    {
+        // Optimized code for case where C rows are contiguous (i.e. C is row-major)
+
+        vector double vzero = vec_splats( 0.0 );
+
+        vector double vc00_01 = vzero;
+        vector double vc02_03 = vzero;
+        vector double vc10_11 = vzero;
+        vector double vc12_13 = vzero;
+        vector double vc20_21 = vzero;
+        vector double vc22_23 = vzero;
+        vector double vc30_31 = vzero;
+        vector double vc32_33 = vzero;
+        vector double vc40_41 = vzero;
+        vector double vc42_43 = vzero;
+        vector double vc50_51 = vzero;
+        vector double vc52_53 = vzero;
+        vector double vc60_61 = vzero;
+        vector double vc62_63 = vzero;
+        vector double vc70_71 = vzero;
+        vector double vc72_73 = vzero;
+
+        unsigned long long pa = (unsigned long long)a;
+        unsigned long long pb = (unsigned long long)b;
+
+#if 0
+        unsigned long long d1 = 1*sizeof(double);
+        unsigned long long d2 = 2*sizeof(double);
+        unsigned long long d3 = 3*sizeof(double);
+        unsigned long long d4 = 4*sizeof(double);
+        unsigned long long d6 = 6*sizeof(double);
+#else
+        // ppc64 linux abi: r14-r31   Nonvolatile registers used for local variables
+        register unsigned long long d1 __asm ("r21") = 1*sizeof(double);
+        register unsigned long long d2 __asm ("r22") = 2*sizeof(double);
+        register unsigned long long d3 __asm ("r23") = 3*sizeof(double);
+        register unsigned long long d4 __asm ("r24") = 4*sizeof(double);
+        register unsigned long long d5 __asm ("r25") = 5*sizeof(double);
+        register unsigned long long d6 __asm ("r26") = 6*sizeof(double);
+        register unsigned long long d7 __asm ("r27") = 7*sizeof(double);
+
+        __asm__ volatile (";" : "=r" (d1) : "r" (d1) );
+        __asm__ volatile (";" : "=r" (d2) : "r" (d2) );
+        __asm__ volatile (";" : "=r" (d3) : "r" (d3) );
+        __asm__ volatile (";" : "=r" (d4) : "r" (d4) );
+        __asm__ volatile (";" : "=r" (d5) : "r" (d5) );
+        __asm__ volatile (";" : "=r" (d6) : "r" (d6) );
+        __asm__ volatile (";" : "=r" (d7) : "r" (d7) );
+#endif
+
+        int kk;
+        for (kk=k; kk > 0; kk--) {
+            vector double va00 = vec_splats( *(double *)( pa+0 ) );
+            vector double va10 = vec_splats( *(double *)( pa+d1 ) );
+            vector double va20 = vec_splats( *(double *)( pa+d2 ) );
+            vector double va30 = vec_splats( *(double *)( pa+d3 ) );
+            vector double va40 = vec_splats( *(double *)( pa+d4 ) );
+            vector double va50 = vec_splats( *(double *)( pa+d5 ) );
+            vector double va60 = vec_splats( *(double *)( pa+d6 ) );
+            vector double va70 = vec_splats( *(double *)( pa+d7 ) );
+            pa += 8*sizeof(double);
+
+            vector double vb00_01 = *(vector double *)( pb+0 );
+            vector double vb02_03 = *(vector double *)( pb+d2 );
+            pb += 4*sizeof(double);
+
+            vc00_01 = vec_madd(va00, vb00_01, vc00_01);
+            vc02_03 = vec_madd(va00, vb02_03, vc02_03);
+            vc10_11 = vec_madd(va10, vb00_01, vc10_11);
+            vc12_13 = vec_madd(va10, vb02_03, vc12_13);
+            vc20_21 = vec_madd(va20, vb00_01, vc20_21);
+            vc22_23 = vec_madd(va20, vb02_03, vc22_23);
+            vc30_31 = vec_madd(va30, vb00_01, vc30_31);
+            vc32_33 = vec_madd(va30, vb02_03, vc32_33);
+            vc40_41 = vec_madd(va40, vb00_01, vc40_41);
+            vc42_43 = vec_madd(va40, vb02_03, vc42_43);
+            vc50_51 = vec_madd(va50, vb00_01, vc50_51);
+            vc52_53 = vec_madd(va50, vb02_03, vc52_53);
+            vc60_61 = vec_madd(va60, vb00_01, vc60_61);
+            vc62_63 = vec_madd(va60, vb02_03, vc62_63);
+            vc70_71 = vec_madd(va70, vb00_01, vc70_71);
+            vc72_73 = vec_madd(va70, vb02_03, vc72_73);
+        }
+
+        vector double valpha = vec_splats( *alpha );
+        vector double vbeta  = (vector double) { *beta, *beta };
+
+        vector double *pc = (vector double *)c;
+
+        vc00_01 = vec_mul(valpha, vc00_01);
+        vc02_03 = vec_mul(valpha, vc02_03);
+        pc[0] = vec_madd( pc[0], vbeta, vc00_01);
+        pc[1] = vec_madd( pc[1], vbeta, vc02_03);
+        pc += rs_c/2;
+
+        vc10_11 = vec_mul(valpha, vc10_11);
+        vc12_13 = vec_mul(valpha, vc12_13);
+        pc[0] = vec_madd( pc[0], vbeta, vc10_11);
+        pc[1] = vec_madd( pc[1], vbeta, vc12_13);
+        pc += rs_c/2;
+
+        vc20_21 = vec_mul(valpha, vc20_21);
+        vc22_23 = vec_mul(valpha, vc22_23);
+        pc[0] = vec_madd( pc[0], vbeta, vc20_21);
+        pc[1] = vec_madd( pc[1], vbeta, vc22_23);
+        pc += rs_c/2;
+
+        vc30_31 = vec_mul(valpha, vc30_31);
+        vc32_33 = vec_mul(valpha, vc32_33);
+        pc[0] = vec_madd( pc[0], vbeta, vc30_31);
+        pc[1] = vec_madd( pc[1], vbeta, vc32_33);
+        pc += rs_c/2;
+
+        vc40_41 = vec_mul(valpha, vc40_41);
+        vc42_43 = vec_mul(valpha, vc42_43);
+        pc[0] = vec_madd( pc[0], vbeta, vc40_41);
+        pc[1] = vec_madd( pc[1], vbeta, vc42_43);
+        pc += rs_c/2;
+
+        vc50_51 = vec_mul(valpha, vc50_51);
+        vc52_53 = vec_mul(valpha, vc52_53);
+        pc[0] = vec_madd( pc[0], vbeta, vc50_51);
+        pc[1] = vec_madd( pc[1], vbeta, vc52_53);
+        pc += rs_c/2;
+
+        vc60_61 = vec_mul(valpha, vc60_61);
+        vc62_63 = vec_mul(valpha, vc62_63);
+        pc[0] = vec_madd( pc[0], vbeta, vc60_61);
+        pc[1] = vec_madd( pc[1], vbeta, vc62_63);
+        pc += rs_c/2;
+
+        vc70_71 = vec_mul(valpha, vc70_71);
+        vc72_73 = vec_mul(valpha, vc72_73);
+        pc[0] = vec_madd( pc[0], vbeta, vc70_71);
+        pc[1] = vec_madd( pc[1], vbeta, vc72_73);
+        pc += rs_c/2;
+    }
+    else
+    {
+        GEMM_UKR_SETUP_CT( d, 8, 4, false );
+
         // Optimized code for case where C columns are contiguous (column-major C)
         vector double vzero = vec_splats( 0.0 );
 
@@ -301,168 +433,8 @@ void bli_dgemm_power7_int_8x4
         pc[1] = vec_madd( pc[1], vbeta, vc23_33);
         pc[2] = vec_madd( pc[2], vbeta, vc43_53);
         pc[3] = vec_madd( pc[3], vbeta, vc63_73);
-    }
-    else
-#endif
-#if 1
-    if ( cs_c == 1 ) {
-        // Optimized code for case where C rows are contiguous (i.e. C is row-major)
-
-        vector double vzero = vec_splats( 0.0 );
-
-        vector double vc00_01 = vzero;
-        vector double vc02_03 = vzero;
-        vector double vc10_11 = vzero;
-        vector double vc12_13 = vzero;
-        vector double vc20_21 = vzero;
-        vector double vc22_23 = vzero;
-        vector double vc30_31 = vzero;
-        vector double vc32_33 = vzero;
-        vector double vc40_41 = vzero;
-        vector double vc42_43 = vzero;
-        vector double vc50_51 = vzero;
-        vector double vc52_53 = vzero;
-        vector double vc60_61 = vzero;
-        vector double vc62_63 = vzero;
-        vector double vc70_71 = vzero;
-        vector double vc72_73 = vzero;
-
-        unsigned long long pa = (unsigned long long)a;
-        unsigned long long pb = (unsigned long long)b;
-
-#if 0
-        unsigned long long d1 = 1*sizeof(double);
-        unsigned long long d2 = 2*sizeof(double);
-        unsigned long long d3 = 3*sizeof(double);
-        unsigned long long d4 = 4*sizeof(double);
-        unsigned long long d6 = 6*sizeof(double);
-#else
-        // ppc64 linux abi: r14-r31   Nonvolatile registers used for local variables
-        register unsigned long long d1 __asm ("r21") = 1*sizeof(double);
-        register unsigned long long d2 __asm ("r22") = 2*sizeof(double);
-        register unsigned long long d3 __asm ("r23") = 3*sizeof(double);
-        register unsigned long long d4 __asm ("r24") = 4*sizeof(double);
-        register unsigned long long d5 __asm ("r25") = 5*sizeof(double);
-        register unsigned long long d6 __asm ("r26") = 6*sizeof(double);
-        register unsigned long long d7 __asm ("r27") = 7*sizeof(double);
-
-        __asm__ volatile (";" : "=r" (d1) : "r" (d1) );
-        __asm__ volatile (";" : "=r" (d2) : "r" (d2) );
-        __asm__ volatile (";" : "=r" (d3) : "r" (d3) );
-        __asm__ volatile (";" : "=r" (d4) : "r" (d4) );
-        __asm__ volatile (";" : "=r" (d5) : "r" (d5) );
-        __asm__ volatile (";" : "=r" (d6) : "r" (d6) );
-        __asm__ volatile (";" : "=r" (d7) : "r" (d7) );
-#endif
-
-        int kk;
-        for (kk=k; kk > 0; kk--) {
-            vector double va00 = vec_splats( *(double *)( pa+0 ) ); 
-            vector double va10 = vec_splats( *(double *)( pa+d1 ) );
-            vector double va20 = vec_splats( *(double *)( pa+d2 ) );
-            vector double va30 = vec_splats( *(double *)( pa+d3 ) );
-            vector double va40 = vec_splats( *(double *)( pa+d4 ) );
-            vector double va50 = vec_splats( *(double *)( pa+d5 ) );
-            vector double va60 = vec_splats( *(double *)( pa+d6 ) );
-            vector double va70 = vec_splats( *(double *)( pa+d7 ) );
-            pa += 8*sizeof(double);
-
-            vector double vb00_01 = *(vector double *)( pb+0 ); 
-            vector double vb02_03 = *(vector double *)( pb+d2 );
-            pb += 4*sizeof(double);
-
-            vc00_01 = vec_madd(va00, vb00_01, vc00_01);
-            vc02_03 = vec_madd(va00, vb02_03, vc02_03);
-            vc10_11 = vec_madd(va10, vb00_01, vc10_11);
-            vc12_13 = vec_madd(va10, vb02_03, vc12_13);
-            vc20_21 = vec_madd(va20, vb00_01, vc20_21);
-            vc22_23 = vec_madd(va20, vb02_03, vc22_23);
-            vc30_31 = vec_madd(va30, vb00_01, vc30_31);
-            vc32_33 = vec_madd(va30, vb02_03, vc32_33);
-            vc40_41 = vec_madd(va40, vb00_01, vc40_41);
-            vc42_43 = vec_madd(va40, vb02_03, vc42_43);
-            vc50_51 = vec_madd(va50, vb00_01, vc50_51);
-            vc52_53 = vec_madd(va50, vb02_03, vc52_53);
-            vc60_61 = vec_madd(va60, vb00_01, vc60_61);
-            vc62_63 = vec_madd(va60, vb02_03, vc62_63);
-            vc70_71 = vec_madd(va70, vb00_01, vc70_71);
-            vc72_73 = vec_madd(va70, vb02_03, vc72_73);
-        }
-
-        vector double valpha = vec_splats( *alpha );
-        vector double vbeta  = (vector double) { *beta, *beta };
-
-        vector double *pc = (vector double *)c;
-
-        vc00_01 = vec_mul(valpha, vc00_01);
-        vc02_03 = vec_mul(valpha, vc02_03);
-        pc[0] = vec_madd( pc[0], vbeta, vc00_01);
-        pc[1] = vec_madd( pc[1], vbeta, vc02_03);
-        pc += rs_c/2;
-
-        vc10_11 = vec_mul(valpha, vc10_11);
-        vc12_13 = vec_mul(valpha, vc12_13);
-        pc[0] = vec_madd( pc[0], vbeta, vc10_11);
-        pc[1] = vec_madd( pc[1], vbeta, vc12_13);
-        pc += rs_c/2;
-
-        vc20_21 = vec_mul(valpha, vc20_21);
-        vc22_23 = vec_mul(valpha, vc22_23);
-        pc[0] = vec_madd( pc[0], vbeta, vc20_21);
-        pc[1] = vec_madd( pc[1], vbeta, vc22_23);
-        pc += rs_c/2;
-
-        vc30_31 = vec_mul(valpha, vc30_31);
-        vc32_33 = vec_mul(valpha, vc32_33);
-        pc[0] = vec_madd( pc[0], vbeta, vc30_31);
-        pc[1] = vec_madd( pc[1], vbeta, vc32_33);
-        pc += rs_c/2;
-
-        vc40_41 = vec_mul(valpha, vc40_41);
-        vc42_43 = vec_mul(valpha, vc42_43);
-        pc[0] = vec_madd( pc[0], vbeta, vc40_41);
-        pc[1] = vec_madd( pc[1], vbeta, vc42_43);
-        pc += rs_c/2;
-
-        vc50_51 = vec_mul(valpha, vc50_51);
-        vc52_53 = vec_mul(valpha, vc52_53);
-        pc[0] = vec_madd( pc[0], vbeta, vc50_51);
-        pc[1] = vec_madd( pc[1], vbeta, vc52_53);
-        pc += rs_c/2;
-
-        vc60_61 = vec_mul(valpha, vc60_61);
-        vc62_63 = vec_mul(valpha, vc62_63);
-        pc[0] = vec_madd( pc[0], vbeta, vc60_61);
-        pc[1] = vec_madd( pc[1], vbeta, vc62_63);
-        pc += rs_c/2;
-
-        vc70_71 = vec_mul(valpha, vc70_71);
-        vc72_73 = vec_mul(valpha, vc72_73);
-        pc[0] = vec_madd( pc[0], vbeta, vc70_71);
-        pc[1] = vec_madd( pc[1], vbeta, vc72_73);
-        pc += rs_c/2;
 
-    }
-    else
-#endif
-    { /* General case. Just do it right.  */
-#if 1 || defined(UTEST)
-        const long MR = BLIS_DEFAULT_MR_D, NR = BLIS_DEFAULT_NR_D;
-        const long LDA = MR, LDB = NR;
-        int i, j, kk;
-        double c00;
-
-        for (i=0; i < MR; i++) {
-            for (j=0; j < NR; j++) {
-                c00 = c[BLIS_INDEX(i,j,rs_c,cs_c)] * *beta;
-                for (kk=0; kk < k; kk++) 
-                    c00 += *alpha * (a[COLMAJ_INDEX(i,kk,LDA)] * b[ROWMAJ_INDEX(kk,j,LDB)]);
-                c[BLIS_INDEX(i,j,rs_c,cs_c)] = c00;
-            }
-        }
-#else
-		//BLIS_DGEMM_UKERNEL_REF(k, alpha, a, b, beta, c, rs_c, cs_c, data);
-#endif
+        GEMM_UKR_FLUSH_CT( d );
     }
 }
 
@@ -477,30 +449,26 @@ void bli_dgemm_power7_int_8x4
  */
 void bli_cgemm_power7_int_8x4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        scomplex*  restrict alpha,
        scomplex*  restrict a,
        scomplex*  restrict b,
        scomplex*  restrict beta,
-       scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
+       scomplex*  restrict c, inc_t rs_c, inc_t cs_c,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
      )
 {
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k      = k0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
 #if 1 || defined(UTEST)
     const long MR = BLIS_DEFAULT_MR_C, NR = BLIS_DEFAULT_NR_C;
     const long LDA = MR, LDB = NR;
     int i, j, kk;
     scomplex c00;
 
-    for (i=0; i < MR; i++) {
-        for (j=0; j < NR; j++) {
+    for (i=0; i < m; i++) {
+        for (j=0; j < n; j++) {
             scomplex tmpc, tmpa, tmpb, tmp;
             //c00 = c[BLIS_INDEX(i,j,rs_c,cs_c)] * *beta;
             tmpc = c[BLIS_INDEX(i,j,rs_c,cs_c)];
@@ -534,30 +502,26 @@ void bli_cgemm_power7_int_8x4
  */
 void bli_zgemm_power7_int_8x4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        scomplex*  restrict alpha,
        scomplex*  restrict a,
        scomplex*  restrict b,
        scomplex*  restrict beta,
-       scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
+       scomplex*  restrict c, inc_t rs_c, inc_t cs_c,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
      )
 {
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k      = k0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
 #if 1 || defined(UTEST)
     const long MR = BLIS_DEFAULT_MR_Z, NR = BLIS_DEFAULT_NR_Z;
     const long LDA = MR, LDB = NR;
     int i, j, kk;
     dcomplex c00;
 
-    for (i=0; i < MR; i++) {
-        for (j=0; j < NR; j++) {
+    for (i=0; i < m; i++) {
+        for (j=0; j < n; j++) {
             dcomplex tmpc, tmpa, tmpb, tmp;
             //c00 = c[BLIS_INDEX(i,j,rs_c,cs_c)] * *beta;
             tmpc = c[BLIS_INDEX(i,j,rs_c,cs_c)];
diff --git a/kernels/power7/3/test/bli_gemm_power7_int_8x4.h b/kernels/power7/3/test/bli_gemm_power7_int_8x4.h
index ef1930907..50984a67d 100644
--- a/kernels/power7/3/test/bli_gemm_power7_int_8x4.h
+++ b/kernels/power7/3/test/bli_gemm_power7_int_8x4.h
@@ -43,6 +43,8 @@
 
 void bli_sgemm_opt_8x4
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        float*     restrict alpha,
        float*     restrict a,
@@ -55,6 +57,8 @@ void bli_sgemm_opt_8x4
 
 void bli_dgemm_opt_8x4
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        double*    restrict alpha,
        double*    restrict a,
@@ -67,6 +71,8 @@ void bli_dgemm_opt_8x4
 
 void bli_cgemm_opt_8x4
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        scomplex*  restrict alpha,
        scomplex*  restrict a,
@@ -79,6 +85,8 @@ void bli_cgemm_opt_8x4
 
 void bli_zgemm_opt_8x4
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        dcomplex*  restrict alpha,
        dcomplex*  restrict a,
diff --git a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c
index ec09f8e38..3e5f0d416 100644
--- a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c
+++ b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c
@@ -37,7 +37,9 @@
 
 void bli_dgemm_power9_asm_12x6
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        double*    restrict alpha,
        double*    restrict a,
        double*    restrict b,
@@ -50,117 +52,91 @@ void bli_dgemm_power9_asm_12x6
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
 
-	uint64_t k_iter = k0 / 16;
-	uint64_t k_left = k0 % 16;
+	uint64_t k_iter = k / 16;
+	uint64_t k_left = k % 16;
 
-  uint64_t rs_c   = rs_c0;
+	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT( d, 12, 6, false );
+
 	__asm__ volatile
-  	(
-  "                                               \n\t"
-  "ld               %%r7,  %2                     \n\t" // load ptr of A
-  "ld               %%r8,  %3                     \n\t" // load ptr of B
-  "ld               %%r16, %6                     \n\t" // load ptr of C
-  "                                               \n\t"
-  "ld               %%r28, %4                     \n\t" // load ptr for alpha
-  "ld               %%r29, %5                     \n\t" // load ptr for beta
-  "                                               \n\t"
-  "ld               %%r11, %0                     \n\t" // load k_iter
-  "ld               %%r12, %1                     \n\t" // load k_left
-  "                                               \n\t"
-  "ld               %%r10, %8                     \n\t" // load cs_c
-  "slwi             %%r10, %%r10, 3               \n\t" // mul by size of elem
-  "                                               \n\t"
-  "ld               %%r9,  %7                     \n\t" // load rs_c
-  "slwi             %%r9,  %%r9, 3                \n\t" // mul by size of elem
-  "                                               \n\t"
-  "ld               %%r26,  0(%%r29)              \n\t" // load val of beta
-  "                                               \n\t"
-  "lxvdsx           %%vs62, 0, %%r28              \n\t" // splat alpha
-  "lxvdsx           %%vs63, 0, %%r29              \n\t" // splat beta
-  "                                               \n\t"
-  "add              %%r17, %%r16, %%r10           \n\t" // addr of col 1 of C
-  "add              %%r18, %%r17, %%r10           \n\t" //         col 2 of C
-  "add              %%r19, %%r18, %%r10           \n\t" //         col 3 of C
-  "add              %%r20, %%r19, %%r10           \n\t" //         col 4 of C
-  "add              %%r21, %%r20, %%r10           \n\t" //         col 5 of C
-  "                                               \n\t"
-  DZERO_OUT_VREG                                         
-  "                                               \n\t"
-  DPRELOAD											                          
-  "                                               \n\t"
-  "addi             %%r8, %%r8, 96                \n\t" // move to next col/row of A/B
-  "addi             %%r7, %%r7, 96                \n\t"
-  "                                               \n\t"
-  DPREFETCH
-  "                                               \n\t"
-  "cmpwi                  %%r11, 0                \n\t" // if k_iter == 0,
-  "beq                    DCONSIDERKLEFT          \n\t" // then jmp to k_left
-  "mtctr            %%r11                         \n\t" // else, do k_iter loop
-  "                                               \n\t"  
-  "DLOOPKITER:                                    \n\t" // k_iter loop
-  "                                               \n\t"
-  A_B_PRODUCT_16									                      // compute A*B 
-  "                                               \n\t"
-  "bdnz             DLOOPKITER                    \n\t"
-  "                                               \n\t"
-  "DCONSIDERKLEFT:                                \n\t"
-  "                                               \n\t"
-  "cmpwi                  %%r12, 0                \n\t" // if k_left == 0,
-  "beq                    DPOSTACCUM              \n\t" // then jmp to post accum
-  "mtctr            %%r12                         \n\t" // else, do k_left loop
-  "                                               \n\t"
-  "DLOOPKLEFT:                                    \n\t" // k_left loop 
-  "                                               \n\t"
-  A_B_PRODUCT_1
-  "                                               \n\t"
-  "bdnz             DLOOPKLEFT                    \n\t" 
-  "                                               \n\t"
-  "DPOSTACCUM:                                    \n\t" 
-  "                                               \n\t"
-  DSCALE_ALPHA											                    
-  "                                               \n\t"
-  "cmpdi                  %%r26, 0                \n\t" // if beta == 0,
-  "beq                    DBETAZERO               \n\t" // then jmp to BZ
-  "                                               \n\t"
-  "cmpwi                  %%r9, 8                 \n\t" // if rs_c == 8
-  "beq              DCOLSTOREDBNZ                 \n\t" // then jmp to col store 
-  "                                               \n\t"
-  "DGENSTOREDBNZ:                                 \n\t" // BNZ gen stored case 
-  "                                               \n\t"
-  DGEN_LOAD_OFS_C                                       
-  "                                              	\n\t"
-  DGEN_SCALE_BETA
-  "                                               \n\t"
-  "b                DGENSTORED                    \n\t"
-  "                                               \n\t"
-  "DCOLSTOREDBNZ:                                 \n\t" // BNZ col stored case
-  "                                               \n\t"
-  DCOL_SCALE_BETA                                       
-  "                                               \n\t"
-  "b                DCOLSTORED                    \n\t"
-  "                                               \n\t"
-  "DBETAZERO:                                     \n\t" // BZ case
-  "                                               \n\t" 
-  "cmpwi                  %%r9, 8                 \n\t" // if rs_c == 8,
-  "beq              DCOLSTORED                    \n\t" // C is col stored
-  "                                               \n\t"
-  "DGENSTORED:                                    \n\t" // BZ gen stored case
-  "                                               \n\t"
-  DGEN_LOAD_OFS_C                                       
-  "                                               \n\t"
-  DGEN_STORE                                            
-  "                                               \n\t"
-  "b               DDONE                          \n\t"
-  "                                               \n\t"
-  "DCOLSTORED:                                    \n\t" // BZ col stored case
-  "                                               \n\t"
-  DCOL_STORE
-  "                                               \n\t"
-  "DDONE:                                         \n\t"  
-  "                                               \n\t"
-  : // output operands (none)
+	(
+	"                                               \n\t"
+	"ld               %%r7,  %2                     \n\t" // load ptr of A
+	"ld               %%r8,  %3                     \n\t" // load ptr of B
+	"ld               %%r16, %6                     \n\t" // load ptr of C
+	"                                               \n\t"
+	"ld               %%r28, %4                     \n\t" // load ptr for alpha
+	"ld               %%r29, %5                     \n\t" // load ptr for beta
+	"                                               \n\t"
+	"ld               %%r11, %0                     \n\t" // load k_iter
+	"ld               %%r12, %1                     \n\t" // load k_left
+	"                                               \n\t"
+	"ld               %%r10, %8                     \n\t" // load cs_c
+	"slwi             %%r10, %%r10, 3               \n\t" // mul by size of elem
+	"                                               \n\t"
+	"ld               %%r9,  %7                     \n\t" // load rs_c
+	"slwi             %%r9,  %%r9, 3                \n\t" // mul by size of elem
+	"                                               \n\t"
+	"ld               %%r26,  0(%%r29)              \n\t" // load val of beta
+	"                                               \n\t"
+	"lxvdsx           %%vs62, 0, %%r28              \n\t" // splat alpha
+	"lxvdsx           %%vs63, 0, %%r29              \n\t" // splat beta
+	"                                               \n\t"
+	"add              %%r17, %%r16, %%r10           \n\t" // addr of col 1 of C
+	"add              %%r18, %%r17, %%r10           \n\t" //         col 2 of C
+	"add              %%r19, %%r18, %%r10           \n\t" //         col 3 of C
+	"add              %%r20, %%r19, %%r10           \n\t" //         col 4 of C
+	"add              %%r21, %%r20, %%r10           \n\t" //         col 5 of C
+	"                                               \n\t"
+	DZERO_OUT_VREG
+	"                                               \n\t"
+	DPRELOAD
+	"                                               \n\t"
+	"addi             %%r8, %%r8, 96                \n\t" // move to next col/row of A/B
+	"addi             %%r7, %%r7, 96                \n\t"
+	"                                               \n\t"
+	DPREFETCH
+	"                                               \n\t"
+	"cmpwi                  %%r11, 0                \n\t" // if k_iter == 0,
+	"beq                    DCONSIDERKLEFT          \n\t" // then jmp to k_left
+	"mtctr            %%r11                         \n\t" // else, do k_iter loop
+	"                                               \n\t"
+	"DLOOPKITER:                                    \n\t" // k_iter loop
+	"                                               \n\t"
+	A_B_PRODUCT_16                                        // compute A*B
+	"                                               \n\t"
+	"bdnz             DLOOPKITER                    \n\t"
+	"                                               \n\t"
+	"DCONSIDERKLEFT:                                \n\t"
+	"                                               \n\t"
+	"cmpwi                  %%r12, 0                \n\t" // if k_left == 0,
+	"beq                    DPOSTACCUM              \n\t" // then jmp to post accum
+	"mtctr            %%r12                         \n\t" // else, do k_left loop
+	"                                               \n\t"
+	"DLOOPKLEFT:                                    \n\t" // k_left loop
+	"                                               \n\t"
+	A_B_PRODUCT_1
+	"                                               \n\t"
+	"bdnz             DLOOPKLEFT                    \n\t"
+	"                                               \n\t"
+	"DPOSTACCUM:                                    \n\t"
+	"                                               \n\t"
+	DSCALE_ALPHA
+	"                                               \n\t"
+	"cmpdi                  %%r26, 0                \n\t" // if beta == 0,
+	"beq                    DBETAZERO               \n\t" // then jmp to BZ
+	"                                               \n\t"
+	DCOL_SCALE_BETA
+	"                                               \n\t"
+	"DBETAZERO:                                     \n\t" // BZ case
+	"                                               \n\t"
+	DCOL_STORE
+	"                                               \n\t"
+	"DDONE:                                         \n\t"
+	"                                               \n\t"
+	: // output operands (none)
 	: // input operands
 	  "m" (k_iter), // 0
 	  "m" (k_left), // 1
@@ -174,28 +150,30 @@ void bli_dgemm_power9_asm_12x6
 	  "m" (b_next), // 9
 	  "m" (a_next)*/  // 10
 	: // register clobber list
-  /* unclobberable regs: r2, r3, r4, r5, r6, r13, r14, r15, r30, r31 */
-  "r0", "r7",  "r8",  "r9",
-  "r10", "r11", "r12", "r16", "r17", "r18", "r19", 
-  "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29" 
+	/* unclobberable regs: r2, r3, r4, r5, r6, r13, r14, r15, r30, r31 */
+	"r0", "r7",  "r8",  "r9",
+	"r10", "r11", "r12", "r16", "r17", "r18", "r19",
+	"r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29"
+
+	#if XLC
+	,"f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9"
+	, "f10", "f11", "f12", "f13", "f14", "f15", "f16", "f17", "f18", "f19"
+	, "f20" ,"f21", "f22", "f23", "f24", "f25", "f26", "f27", "f28", "f29"
+	, "f30" ,"f31"
+	, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"
+	, "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19"
+	, "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29"
+	, "v30", "v31"
+	#else
+	, "vs0", "vs1", "vs2", "vs3", "vs4", "vs5", "vs6", "vs7", "vs8", "vs9"
+	, "vs10", "vs11", "vs12", "vs13", "vs14", "vs15", "vs16", "vs17", "vs18", "vs19"
+	, "vs20", "vs21", "vs22", "vs23", "vs24", "vs25", "vs26", "vs27", "vs28", "vs29"
+	, "vs30", "vs31", "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39"
+	, "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49"
+	, "vs50", "vs51", "vs52", "vs53"
+	#endif
 
-  #if XLC
-  ,"f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9"
-  , "f10", "f11", "f12", "f13", "f14", "f15", "f16", "f17", "f18", "f19"
-  , "f20" ,"f21", "f22", "f23", "f24", "f25", "f26", "f27", "f28", "f29"
-  , "f30" ,"f31"
-  , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"
-  , "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19"
-  , "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29"
-  , "v30", "v31"
-  #else
-  , "vs0", "vs1", "vs2", "vs3", "vs4", "vs5", "vs6", "vs7", "vs8", "vs9"
-  , "vs10", "vs11", "vs12", "vs13", "vs14", "vs15", "vs16", "vs17", "vs18", "vs19"
-  , "vs20", "vs21", "vs22", "vs23", "vs24", "vs25", "vs26", "vs27", "vs28", "vs29"
-  , "vs30", "vs31", "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39"
-  , "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49"
-  , "vs50", "vs51", "vs52", "vs53"
-  #endif
+	);
 
-  );
+	GEMM_UKR_FLUSH_CT( d );
 }
diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c
index a56ef16e5..7890ad347 100644
--- a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c
+++ b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c
@@ -42,7 +42,9 @@
 
 void bli_sgemm_sandybridge_asm_8x8
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        float*     restrict alpha,
        float*     restrict a,
        float*     restrict b,
@@ -57,27 +59,29 @@ void bli_sgemm_sandybridge_asm_8x8
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT( s, 8, 8, false );
+
 	begin_asm()
-	
-	
+
+
 	mov(var(a), rax) // load address of a.
 	mov(var(b), rbx) // load address of b.
 	//mov(var(b_next), r15) // load address of b_next.
-	
+
 	vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
 	vmovsldup(mem(rbx, 0*32), ymm2) // elements of a and b.
 	vpermilps(imm(0x4e), ymm2, ymm3)
-	
+
 	mov(var(c), rcx) // load address of c
 	mov(var(cs_c), rdi) // load cs_c
 	lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
 	lea(mem(rcx, rdi, 4), r10) // load address of c + 4*cs_c;
-	
+
 	lea(mem(rdi, rdi, 2), r14) // r14 = 3*cs_c;
 	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*cs_c
@@ -87,7 +91,7 @@ void bli_sgemm_sandybridge_asm_8x8
 	prefetch(0, mem(r10, rdi, 1, 7*8)) // prefetch c + 5*cs_c
 	prefetch(0, mem(r10, rdi, 2, 7*8)) // prefetch c + 6*cs_c
 	prefetch(0, mem(r10, r14, 1, 7*8)) // prefetch c + 7*cs_c
-	
+
 	vxorps(ymm8, ymm8, ymm8)
 	vxorps(ymm9, ymm9, ymm9)
 	vxorps(ymm10, ymm10, ymm10)
@@ -96,18 +100,18 @@ void bli_sgemm_sandybridge_asm_8x8
 	vxorps(ymm13, ymm13, ymm13)
 	vxorps(ymm14, ymm14, ymm14)
 	vxorps(ymm15, ymm15, ymm15)
-	
-	
-	
+
+
+
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.SCONSIDKLEFT) // if i == 0, jump to code that
 	 // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER) // MAIN LOOP
-	
-	
+
+
 	 // iteration 0
 	prefetch(0, mem(rax, 16*32))
 	vmulps(ymm0, ymm2, ymm6)
@@ -117,14 +121,14 @@ void bli_sgemm_sandybridge_asm_8x8
 	vperm2f128(imm(0x03), ymm3, ymm3, ymm5)
 	vaddps(ymm15, ymm6, ymm15)
 	vaddps(ymm13, ymm7, ymm13)
-	
+
 	vmovaps(mem(rax, 1*32), ymm1)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vaddps(ymm11, ymm6, ymm11)
 	vaddps(ymm9, ymm7, ymm9)
-	
+
 	vmulps(ymm0, ymm2, ymm6)
 	vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
 	vmovsldup(mem(rbx, 1*32), ymm2)
@@ -132,13 +136,13 @@ void bli_sgemm_sandybridge_asm_8x8
 	vperm2f128(imm(0x03), ymm3, ymm3, ymm5)
 	vaddps(ymm14, ymm6, ymm14)
 	vaddps(ymm12, ymm7, ymm12)
-	
+
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vaddps(ymm10, ymm6, ymm10)
 	vaddps(ymm8, ymm7, ymm8)
-	
+
 	 // iteration 1
 	vmulps(ymm1, ymm2, ymm6)
 	vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
@@ -147,14 +151,14 @@ void bli_sgemm_sandybridge_asm_8x8
 	vperm2f128(imm(0x03), ymm3, ymm3, ymm5)
 	vaddps(ymm15, ymm6, ymm15)
 	vaddps(ymm13, ymm7, ymm13)
-	
+
 	vmovaps(mem(rax, 2*32), ymm0)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vmulps(ymm1, ymm4, ymm6)
 	vmulps(ymm1, ymm5, ymm7)
 	vaddps(ymm11, ymm6, ymm11)
 	vaddps(ymm9, ymm7, ymm9)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
 	vmovsldup(mem(rbx, 2*32), ymm2)
@@ -162,14 +166,14 @@ void bli_sgemm_sandybridge_asm_8x8
 	vperm2f128(imm(0x03), ymm3, ymm3, ymm5)
 	vaddps(ymm14, ymm6, ymm14)
 	vaddps(ymm12, ymm7, ymm12)
-	
+
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vmulps(ymm1, ymm4, ymm6)
 	vmulps(ymm1, ymm5, ymm7)
 	vaddps(ymm10, ymm6, ymm10)
 	vaddps(ymm8, ymm7, ymm8)
-	
-	
+
+
 	 // iteration 2
 	prefetch(0, mem(rax, 18*32))
 	vmulps(ymm0, ymm2, ymm6)
@@ -179,7 +183,7 @@ void bli_sgemm_sandybridge_asm_8x8
 	vperm2f128(imm(0x03), ymm3, ymm3, ymm5)
 	vaddps(ymm15, ymm6, ymm15)
 	vaddps(ymm13, ymm7, ymm13)
-	
+
 	vmovaps(mem(rax, 3*32), ymm1)
 	add(imm(4*8*4), rax) // a += 4*8 (unroll x mr)
 	vpermilps(imm(0x4e), ymm2, ymm3)
@@ -187,7 +191,7 @@ void bli_sgemm_sandybridge_asm_8x8
 	vmulps(ymm0, ymm5, ymm7)
 	vaddps(ymm11, ymm6, ymm11)
 	vaddps(ymm9, ymm7, ymm9)
-	
+
 	vmulps(ymm0, ymm2, ymm6)
 	vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
 	vmovsldup(mem(rbx, 3*32), ymm2)
@@ -195,14 +199,14 @@ void bli_sgemm_sandybridge_asm_8x8
 	vperm2f128(imm(0x03), ymm3, ymm3, ymm5)
 	vaddps(ymm14, ymm6, ymm14)
 	vaddps(ymm12, ymm7, ymm12)
-	
+
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vaddps(ymm10, ymm6, ymm10)
 	vaddps(ymm8, ymm7, ymm8)
-	
-	
+
+
 	 // iteration 3
 	vmulps(ymm1, ymm2, ymm6)
 	vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
@@ -212,14 +216,14 @@ void bli_sgemm_sandybridge_asm_8x8
 	vperm2f128(imm(0x03), ymm3, ymm3, ymm5)
 	vaddps(ymm15, ymm6, ymm15)
 	vaddps(ymm13, ymm7, ymm13)
-	
+
 	vmovaps(mem(rax, 0*32), ymm0)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vmulps(ymm1, ymm4, ymm6)
 	vmulps(ymm1, ymm5, ymm7)
 	vaddps(ymm11, ymm6, ymm11)
 	vaddps(ymm9, ymm7, ymm9)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
 	vmovsldup(mem(rbx, 0*32), ymm2)
@@ -227,35 +231,35 @@ void bli_sgemm_sandybridge_asm_8x8
 	vperm2f128(imm(0x03), ymm3, ymm3, ymm5)
 	vaddps(ymm14, ymm6, ymm14)
 	vaddps(ymm12, ymm7, ymm12)
-	
+
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vmulps(ymm1, ymm4, ymm6)
 	vmulps(ymm1, ymm5, ymm7)
 	vaddps(ymm10, ymm6, ymm10)
 	vaddps(ymm8, ymm7, ymm8)
-	
-	
-	
-	
+
+
+
+
 	dec(rsi) // i -= 1;
 	jne(.SLOOPKITER) // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
 	 // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT) // EDGE LOOP
-	
-	
+
+
 	prefetch(0, mem(rax, 16*32))
 	vmulps(ymm0, ymm2, ymm6)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
@@ -264,7 +268,7 @@ void bli_sgemm_sandybridge_asm_8x8
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddps(ymm15, ymm6, ymm15)
 	vaddps(ymm13, ymm7, ymm13)
-	
+
 	vmovaps(mem(rax, 1*32), ymm1)
 	add(imm(8*1*4), rax) // a += 8 (1 x mr)
 	vpermilps(imm(0x4e), ymm2, ymm3)
@@ -272,7 +276,7 @@ void bli_sgemm_sandybridge_asm_8x8
 	vmulps(ymm0, ymm5, ymm7)
 	vaddps(ymm11, ymm6, ymm11)
 	vaddps(ymm9, ymm7, ymm9)
-	
+
 	vmulps(ymm0, ymm2, ymm6)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vmovsldup(mem(rbx, 1*32), ymm2)
@@ -281,122 +285,122 @@ void bli_sgemm_sandybridge_asm_8x8
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddps(ymm14, ymm6, ymm14)
 	vaddps(ymm12, ymm7, ymm12)
-	
+
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vmovaps(ymm1, ymm0)
 	vaddps(ymm10, ymm6, ymm10)
 	vaddps(ymm8, ymm7, ymm8)
-	
-	
-	
+
+
+
 	dec(rsi) // i -= 1;
 	jne(.SLOOPKLEFT) // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
-	
+
 	 // ymm15:  ymm13:  ymm11:  ymm9:
 	 // ( ab00  ( ab02  ( ab04  ( ab06
-	 //   ab10    ab12    ab14    ab16  
+	 //   ab10    ab12    ab14    ab16
 	 //   ab22    ab20    ab26    ab24
 	 //   ab32    ab30    ab36    ab34
 	 //   ab44    ab46    ab40    ab42
-	 //   ab54    ab56    ab50    ab52  
+	 //   ab54    ab56    ab50    ab52
 	 //   ab66    ab64    ab62    ab60
 	 //   ab76 )  ab74 )  ab72 )  ab70 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
 	 // ( ab01  ( ab03  ( ab05  ( ab07
-	 //   ab11    ab13    ab15    ab17  
+	 //   ab11    ab13    ab15    ab17
 	 //   ab23    ab21    ab27    ab25
 	 //   ab33    ab31    ab37    ab35
 	 //   ab45    ab47    ab41    ab43
-	 //   ab55    ab57    ab51    ab53  
+	 //   ab55    ab57    ab51    ab53
 	 //   ab67    ab65    ab63    ab61
 	 //   ab77 )  ab75 )  ab73 )  ab71 )
-	
+
 	vmovaps(ymm15, ymm7)
 	vshufps(imm(0xe4), ymm13, ymm15, ymm15)
 	vshufps(imm(0xe4), ymm7, ymm13, ymm13)
-	
+
 	vmovaps(ymm11, ymm7)
 	vshufps(imm(0xe4), ymm9, ymm11, ymm11)
 	vshufps(imm(0xe4), ymm7, ymm9, ymm9)
-	
+
 	vmovaps(ymm14, ymm7)
 	vshufps(imm(0xe4), ymm12, ymm14, ymm14)
 	vshufps(imm(0xe4), ymm7, ymm12, ymm12)
-	
+
 	vmovaps(ymm10, ymm7)
 	vshufps(imm(0xe4), ymm8, ymm10, ymm10)
 	vshufps(imm(0xe4), ymm7, ymm8, ymm8)
-	
+
 	 // ymm15:  ymm13:  ymm11:  ymm9:
 	 // ( ab00  ( ab02  ( ab04  ( ab06
-	 //   ab10    ab12    ab14    ab16  
+	 //   ab10    ab12    ab14    ab16
 	 //   ab20    ab22    ab24    ab26
 	 //   ab30    ab32    ab34    ab36
 	 //   ab44    ab46    ab40    ab42
-	 //   ab54    ab56    ab50    ab52  
+	 //   ab54    ab56    ab50    ab52
 	 //   ab64    ab66    ab60    ab62
 	 //   ab74 )  ab76 )  ab70 )  ab72 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
 	 // ( ab01  ( ab03  ( ab05  ( ab07
-	 //   ab11    ab13    ab15    ab17  
+	 //   ab11    ab13    ab15    ab17
 	 //   ab21    ab23    ab25    ab27
 	 //   ab31    ab33    ab35    ab37
 	 //   ab45    ab47    ab41    ab43
-	 //   ab55    ab57    ab51    ab53  
+	 //   ab55    ab57    ab51    ab53
 	 //   ab65    ab67    ab61    ab63
 	 //   ab75 )  ab77 )  ab71 )  ab73 )
-	
+
 	vmovaps(ymm15, ymm7)
 	vperm2f128(imm(0x30), ymm11, ymm15, ymm15)
 	vperm2f128(imm(0x12), ymm11, ymm7, ymm11)
-	
+
 	vmovaps(ymm13, ymm7)
 	vperm2f128(imm(0x30), ymm9, ymm13, ymm13)
 	vperm2f128(imm(0x12), ymm9, ymm7, ymm9)
-	
+
 	vmovaps(ymm14, ymm7)
 	vperm2f128(imm(0x30), ymm10, ymm14, ymm14)
 	vperm2f128(imm(0x12), ymm10, ymm7, ymm10)
-	
+
 	vmovaps(ymm12, ymm7)
 	vperm2f128(imm(0x30), ymm8, ymm12, ymm12)
 	vperm2f128(imm(0x12), ymm8, ymm7, ymm8)
-	
+
 	 // ymm15:  ymm13:  ymm11:  ymm9:
 	 // ( ab00  ( ab02  ( ab04  ( ab06
-	 //   ab10    ab12    ab14    ab16  
+	 //   ab10    ab12    ab14    ab16
 	 //   ab20    ab22    ab24    ab26
 	 //   ab30    ab32    ab34    ab36
 	 //   ab40    ab42    ab44    ab46
-	 //   ab50    ab52    ab54    ab56  
+	 //   ab50    ab52    ab54    ab56
 	 //   ab60    ab62    ab64    ab66
 	 //   ab70 )  ab72 )  ab74 )  ab76 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
 	 // ( ab01  ( ab03  ( ab05  ( ab07
-	 //   ab11    ab13    ab15    ab17  
+	 //   ab11    ab13    ab15    ab17
 	 //   ab21    ab23    ab25    ab27
 	 //   ab31    ab33    ab35    ab37
 	 //   ab41    ab43    ab45    ab47
-	 //   ab51    ab53    ab55    ab57  
+	 //   ab51    ab53    ab55    ab57
 	 //   ab61    ab63    ab65    ab67
 	 //   ab71 )  ab73 )  ab75 )  ab77 )
-	
-	
-	
+
+
+
 	mov(var(alpha), rax) // load address of alpha
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm4) // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm8, ymm8) // scale by alpha
 	vmulps(ymm0, ymm9, ymm9)
 	vmulps(ymm0, ymm10, ymm10)
@@ -405,618 +409,118 @@ void bli_sgemm_sandybridge_asm_8x8
 	vmulps(ymm0, ymm13, ymm13)
 	vmulps(ymm0, ymm14, ymm14)
 	vmulps(ymm0, ymm15, ymm15)
-	
-	
-	
-	
-	
-	
+
+
 	mov(var(rs_c), rsi) // load rs_c
 	lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
-	
+
 	lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
-	
+
 	lea(mem(, rsi, 2), r12) // r12 = 2*rs_c;
 	lea(mem(r12, rsi, 1), r13) // r13 = 3*rs_c;
-	
-	
+
+
 	 // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
 	vucomiss(xmm0, xmm4) // set ZF if beta == 0.
 	je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case
-	
-	
-	cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4.
-	jz(.SCOLSTORED) // jump to column storage case
-	
-	
-	
-	label(.SGENSTORED)
-	
-	 // update c00:c70
-	vmovlps(mem(rcx), xmm0, xmm0)
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmovlps(mem(rdx), xmm2, xmm2)
-	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
-	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
-	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
-	vshufps(imm(0x88), xmm3, xmm2, xmm2)
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
-	
-	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-	vaddps(ymm15, ymm0, ymm0) // add the gemm result,
-	
-	vextractf128(imm(1), ymm0, xmm2)
-	vmovss(xmm0, mem(rcx))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm1, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, r13, 1))
-	vmovss(xmm2, mem(rdx))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, rsi, 1))
-	vpermilps(imm(0x39), xmm3, xmm2)
-	vmovss(xmm2, mem(rdx, r12, 1))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, r13, 1))
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c01:c71
-	vmovlps(mem(rcx), xmm0, xmm0)
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmovlps(mem(rdx), xmm2, xmm2)
-	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
-	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
-	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
-	vshufps(imm(0x88), xmm3, xmm2, xmm2)
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
-	
-	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-	vaddps(ymm14, ymm0, ymm0) // add the gemm result,
-	
-	vextractf128(imm(1), ymm0, xmm2)
-	vmovss(xmm0, mem(rcx))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm1, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, r13, 1))
-	vmovss(xmm2, mem(rdx))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, rsi, 1))
-	vpermilps(imm(0x39), xmm3, xmm2)
-	vmovss(xmm2, mem(rdx, r12, 1))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, r13, 1))
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c02:c72
-	vmovlps(mem(rcx), xmm0, xmm0)
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmovlps(mem(rdx), xmm2, xmm2)
-	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
-	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
-	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
-	vshufps(imm(0x88), xmm3, xmm2, xmm2)
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
-	
-	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-	vaddps(ymm13, ymm0, ymm0) // add the gemm result,
-	
-	vextractf128(imm(1), ymm0, xmm2)
-	vmovss(xmm0, mem(rcx))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm1, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, r13, 1))
-	vmovss(xmm2, mem(rdx))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, rsi, 1))
-	vpermilps(imm(0x39), xmm3, xmm2)
-	vmovss(xmm2, mem(rdx, r12, 1))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, r13, 1))
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c03:c73
-	vmovlps(mem(rcx), xmm0, xmm0)
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmovlps(mem(rdx), xmm2, xmm2)
-	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
-	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
-	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
-	vshufps(imm(0x88), xmm3, xmm2, xmm2)
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
-	
-	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-	vaddps(ymm12, ymm0, ymm0) // add the gemm result,
-	
-	vextractf128(imm(1), ymm0, xmm2)
-	vmovss(xmm0, mem(rcx))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm1, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, r13, 1))
-	vmovss(xmm2, mem(rdx))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, rsi, 1))
-	vpermilps(imm(0x39), xmm3, xmm2)
-	vmovss(xmm2, mem(rdx, r12, 1))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, r13, 1))
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c04:c74
-	vmovlps(mem(rcx), xmm0, xmm0)
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmovlps(mem(rdx), xmm2, xmm2)
-	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
-	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
-	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
-	vshufps(imm(0x88), xmm3, xmm2, xmm2)
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
-	
-	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-	vaddps(ymm11, ymm0, ymm0) // add the gemm result,
-	
-	vextractf128(imm(1), ymm0, xmm2)
-	vmovss(xmm0, mem(rcx))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm1, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, r13, 1))
-	vmovss(xmm2, mem(rdx))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, rsi, 1))
-	vpermilps(imm(0x39), xmm3, xmm2)
-	vmovss(xmm2, mem(rdx, r12, 1))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, r13, 1))
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c05:c75
-	vmovlps(mem(rcx), xmm0, xmm0)
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmovlps(mem(rdx), xmm2, xmm2)
-	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
-	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
-	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
-	vshufps(imm(0x88), xmm3, xmm2, xmm2)
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
-	
-	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-	vaddps(ymm10, ymm0, ymm0) // add the gemm result,
-	
-	vextractf128(imm(1), ymm0, xmm2)
-	vmovss(xmm0, mem(rcx))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm1, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, r13, 1))
-	vmovss(xmm2, mem(rdx))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, rsi, 1))
-	vpermilps(imm(0x39), xmm3, xmm2)
-	vmovss(xmm2, mem(rdx, r12, 1))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, r13, 1))
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c06:c76
-	vmovlps(mem(rcx), xmm0, xmm0)
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmovlps(mem(rdx), xmm2, xmm2)
-	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
-	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
-	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
-	vshufps(imm(0x88), xmm3, xmm2, xmm2)
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
-	
-	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-	vaddps(ymm9, ymm0, ymm0) // add the gemm result,
-	
-	vextractf128(imm(1), ymm0, xmm2)
-	vmovss(xmm0, mem(rcx))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm1, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, r13, 1))
-	vmovss(xmm2, mem(rdx))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, rsi, 1))
-	vpermilps(imm(0x39), xmm3, xmm2)
-	vmovss(xmm2, mem(rdx, r12, 1))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, r13, 1))
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c07:c77
-	vmovlps(mem(rcx), xmm0, xmm0)
-	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
-	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
-	vshufps(imm(0x88), xmm1, xmm0, xmm0)
-	vmovlps(mem(rdx), xmm2, xmm2)
-	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
-	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
-	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
-	vshufps(imm(0x88), xmm3, xmm2, xmm2)
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
-	
-	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-	vaddps(ymm8, ymm0, ymm0) // add the gemm result,
-	
-	vextractf128(imm(1), ymm0, xmm2)
-	vmovss(xmm0, mem(rcx))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm1, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, r13, 1))
-	vmovss(xmm2, mem(rdx))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, rsi, 1))
-	vpermilps(imm(0x39), xmm3, xmm2)
-	vmovss(xmm2, mem(rdx, r12, 1))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, r13, 1))
-	
-	
-	
-	jmp(.SDONE) // jump to end.
-	
-	
-	
-	label(.SCOLSTORED)
-	
-	
-	vmovups(mem(rcx), ymm0) // load c00:c70,
-	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-	vaddps(ymm15, ymm0, ymm0) // add the gemm result,
-	vmovups(ymm0, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(mem(rcx), ymm1) // load c01:c71,
-	vmulps(ymm4, ymm1, ymm1) // scale by beta,
-	vaddps(ymm14, ymm1, ymm1) // add the gemm result,
-	vmovups(ymm1, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(mem(rcx), ymm0) // load c02:c72,
-	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-	vaddps(ymm13, ymm0, ymm0) // add the gemm result,
-	vmovups(ymm0, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(mem(rcx), ymm1) // load c03:c73,
-	vmulps(ymm4, ymm1, ymm1) // scale by beta,
-	vaddps(ymm12, ymm1, ymm1) // add the gemm result,
-	vmovups(ymm1, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(mem(rcx), ymm0) // load c04:c74,
-	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-	vaddps(ymm11, ymm0, ymm0) // add the gemm result,
-	vmovups(ymm0, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(mem(rcx), ymm1) // load c05:c75,
-	vmulps(ymm4, ymm1, ymm1) // scale by beta,
-	vaddps(ymm10, ymm1, ymm1) // add the gemm result,
-	vmovups(ymm1, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(mem(rcx), ymm0) // load c06:c76,
-	vmulps(ymm4, ymm0, ymm0) // scale by beta,
-	vaddps(ymm9, ymm0, ymm0) // add the gemm result,
-	vmovups(ymm0, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(mem(rcx), ymm1) // load c07:c77,
-	vmulps(ymm4, ymm1, ymm1) // scale by beta,
-	vaddps(ymm8, ymm1, ymm1) // add the gemm result,
-	vmovups(ymm1, mem(rcx)) // and store back to memory.
-	
-	
-	jmp(.SDONE) // jump to end.
-	
-	
-	
-	
+
+		vmovups(mem(rcx), ymm0) // load c00:c70,
+		vmulps(ymm4, ymm0, ymm0) // scale by beta,
+		vaddps(ymm15, ymm0, ymm0) // add the gemm result,
+		vmovups(ymm0, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(mem(rcx), ymm1) // load c01:c71,
+		vmulps(ymm4, ymm1, ymm1) // scale by beta,
+		vaddps(ymm14, ymm1, ymm1) // add the gemm result,
+		vmovups(ymm1, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(mem(rcx), ymm0) // load c02:c72,
+		vmulps(ymm4, ymm0, ymm0) // scale by beta,
+		vaddps(ymm13, ymm0, ymm0) // add the gemm result,
+		vmovups(ymm0, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(mem(rcx), ymm1) // load c03:c73,
+		vmulps(ymm4, ymm1, ymm1) // scale by beta,
+		vaddps(ymm12, ymm1, ymm1) // add the gemm result,
+		vmovups(ymm1, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(mem(rcx), ymm0) // load c04:c74,
+		vmulps(ymm4, ymm0, ymm0) // scale by beta,
+		vaddps(ymm11, ymm0, ymm0) // add the gemm result,
+		vmovups(ymm0, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(mem(rcx), ymm1) // load c05:c75,
+		vmulps(ymm4, ymm1, ymm1) // scale by beta,
+		vaddps(ymm10, ymm1, ymm1) // add the gemm result,
+		vmovups(ymm1, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(mem(rcx), ymm0) // load c06:c76,
+		vmulps(ymm4, ymm0, ymm0) // scale by beta,
+		vaddps(ymm9, ymm0, ymm0) // add the gemm result,
+		vmovups(ymm0, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(mem(rcx), ymm1) // load c07:c77,
+		vmulps(ymm4, ymm1, ymm1) // scale by beta,
+		vaddps(ymm8, ymm1, ymm1) // add the gemm result,
+		vmovups(ymm1, mem(rcx)) // and store back to memory.
+
+		jmp(.SDONE) // jump to end.
+
 	label(.SBETAZERO)
-	
-	cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4.
-	jz(.SCOLSTORBZ) // jump to column storage case
-	
-	
-	
-	label(.SGENSTORBZ)
-	
-	 // update c00:c70
-	vmovups(ymm15, ymm0)
-	vextractf128(imm(1), ymm0, xmm2)
-	vmovss(xmm0, mem(rcx))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm1, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, r13, 1))
-	vmovss(xmm2, mem(rdx))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, rsi, 1))
-	vpermilps(imm(0x39), xmm3, xmm2)
-	vmovss(xmm2, mem(rdx, r12, 1))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, r13, 1))
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c01:c71
-	vmovups(ymm14, ymm0)
-	vextractf128(imm(1), ymm0, xmm2)
-	vmovss(xmm0, mem(rcx))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm1, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, r13, 1))
-	vmovss(xmm2, mem(rdx))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, rsi, 1))
-	vpermilps(imm(0x39), xmm3, xmm2)
-	vmovss(xmm2, mem(rdx, r12, 1))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, r13, 1))
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c02:c72
-	vmovups(ymm13, ymm0)
-	vextractf128(imm(1), ymm0, xmm2)
-	vmovss(xmm0, mem(rcx))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm1, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, r13, 1))
-	vmovss(xmm2, mem(rdx))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, rsi, 1))
-	vpermilps(imm(0x39), xmm3, xmm2)
-	vmovss(xmm2, mem(rdx, r12, 1))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, r13, 1))
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c03:c73
-	vmovups(ymm12, ymm0)
-	vextractf128(imm(1), ymm0, xmm2)
-	vmovss(xmm0, mem(rcx))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm1, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, r13, 1))
-	vmovss(xmm2, mem(rdx))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, rsi, 1))
-	vpermilps(imm(0x39), xmm3, xmm2)
-	vmovss(xmm2, mem(rdx, r12, 1))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, r13, 1))
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c04:c74
-	vmovups(ymm11, ymm0)
-	vextractf128(imm(1), ymm0, xmm2)
-	vmovss(xmm0, mem(rcx))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm1, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, r13, 1))
-	vmovss(xmm2, mem(rdx))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, rsi, 1))
-	vpermilps(imm(0x39), xmm3, xmm2)
-	vmovss(xmm2, mem(rdx, r12, 1))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, r13, 1))
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c05:c75
-	vmovups(ymm10, ymm0)
-	vextractf128(imm(1), ymm0, xmm2)
-	vmovss(xmm0, mem(rcx))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm1, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, r13, 1))
-	vmovss(xmm2, mem(rdx))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, rsi, 1))
-	vpermilps(imm(0x39), xmm3, xmm2)
-	vmovss(xmm2, mem(rdx, r12, 1))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, r13, 1))
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c06:c76
-	vmovups(ymm9, ymm0)
-	vextractf128(imm(1), ymm0, xmm2)
-	vmovss(xmm0, mem(rcx))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm1, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, r13, 1))
-	vmovss(xmm2, mem(rdx))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, rsi, 1))
-	vpermilps(imm(0x39), xmm3, xmm2)
-	vmovss(xmm2, mem(rdx, r12, 1))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, r13, 1))
-	
-	add(rdi, rcx) // c += cs_c;
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	 // update c07:c77
-	vmovups(ymm8, ymm0)
-	vextractf128(imm(1), ymm0, xmm2)
-	vmovss(xmm0, mem(rcx))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, rsi, 1))
-	vpermilps(imm(0x39), xmm1, xmm0)
-	vmovss(xmm0, mem(rcx, r12, 1))
-	vpermilps(imm(0x39), xmm0, xmm1)
-	vmovss(xmm1, mem(rcx, r13, 1))
-	vmovss(xmm2, mem(rdx))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, rsi, 1))
-	vpermilps(imm(0x39), xmm3, xmm2)
-	vmovss(xmm2, mem(rdx, r12, 1))
-	vpermilps(imm(0x39), xmm2, xmm3)
-	vmovss(xmm3, mem(rdx, r13, 1))
-	
-	
-	jmp(.SDONE) // jump to end.
-	
-	
-	
-	label(.SCOLSTORBZ)
-	
-	
-	vmovups(ymm15, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(ymm14, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(ymm13, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(ymm12, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(ymm11, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(ymm10, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(ymm9, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(ymm8, mem(rcx)) // and store back to memory.
-	
-	
-	
-	
-	
+
+		vmovups(ymm15, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(ymm14, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(ymm13, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(ymm12, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(ymm11, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(ymm10, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(ymm9, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(ymm8, mem(rcx)) // and store back to memory.
+
 	label(.SDONE)
-	
+
 	vzeroupper()
-	
 
-    end_asm(
+
+	end_asm(
 	: // output operands (none)
 	: // input operands
-      [k_iter] "m" (k_iter), // 0
-      [k_left] "m" (k_left), // 1
-      [a]      "m" (a),      // 2
-      [b]      "m" (b),      // 3
-      [alpha]  "m" (alpha),  // 4
-      [beta]   "m" (beta),   // 5
-      [c]      "m" (c),      // 6
-      [rs_c]   "m" (rs_c),   // 7
-      [cs_c]   "m" (cs_c)/*,   // 8
-      [b_next] "m" (b_next), // 9
-      [a_next] "m" (a_next)*/  // 10
+	  [k_iter] "m" (k_iter), // 0
+	  [k_left] "m" (k_left), // 1
+	  [a]      "m" (a),      // 2
+	  [b]      "m" (b),      // 3
+	  [alpha]  "m" (alpha),  // 4
+	  [beta]   "m" (beta),   // 5
+	  [c]      "m" (c),      // 6
+	  [rs_c]   "m" (rs_c),   // 7
+	  [cs_c]   "m" (cs_c)/*,   // 8
+	  [b_next] "m" (b_next), // 9
+	  [a_next] "m" (a_next)*/  // 10
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
@@ -1024,11 +528,15 @@ void bli_sgemm_sandybridge_asm_8x8
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( s );
 }
 
 void bli_dgemm_sandybridge_asm_8x4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        double*    restrict alpha,
        double*    restrict a,
        double*    restrict b,
@@ -1043,34 +551,36 @@ void bli_dgemm_sandybridge_asm_8x4
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT( d, 8, 4, false );
+
 	begin_asm()
-	
-	
+
+
 	mov(var(a), rax) // load address of a.
 	mov(var(b), rbx) // load address of b.
 	mov(var(b_next), r15) // load address of b_next.
 	//mov(var(a_next), r14) // load address of a_next.
 	sub(imm(4*64), r15)
-	
+
 	vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
 	vmovapd(mem(rbx, 0*32), ymm2) // elements of a and b.
 	vpermilpd(imm(0x5), ymm2, ymm3)
-	
+
 	mov(var(c), rcx) // load address of c
 	mov(var(cs_c), rdi) // load cs_c
 	lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
 	lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
-	
+
 	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c
 	prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c
 	prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c
-	
+
 	vxorpd(ymm8, ymm8, ymm8)
 	vxorpd(ymm9, ymm9, ymm9)
 	vxorpd(ymm10, ymm10, ymm10)
@@ -1079,19 +589,19 @@ void bli_dgemm_sandybridge_asm_8x4
 	vxorpd(ymm13, ymm13, ymm13)
 	vxorpd(ymm14, ymm14, ymm14)
 	vxorpd(ymm15, ymm15, ymm15)
-	
-	
-	
+
+
+
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.DCONSIDKLEFT) // if i == 0, jump to code that
 	 // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER) // MAIN LOOP
-	
+
 	add(imm(4*4*8), r15) // b_next += 4*4 (unroll x nr)
-	
+
 	 // iteration 0
 	vmovapd(mem(rax, 1*32), ymm1)
 	vmulpd(ymm0, ymm2, ymm6)
@@ -1100,7 +610,7 @@ void bli_dgemm_sandybridge_asm_8x4
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddpd(ymm15, ymm6, ymm15)
 	vaddpd(ymm13, ymm7, ymm13)
-	
+
 	prefetch(0, mem(rax, 16*32))
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovapd(mem(rbx, 1*32), ymm2)
@@ -1108,20 +618,20 @@ void bli_dgemm_sandybridge_asm_8x4
 	vpermilpd(imm(0x5), ymm2, ymm3)
 	vaddpd(ymm14, ymm6, ymm14)
 	vaddpd(ymm12, ymm7, ymm12)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vmovapd(mem(rax, 2*32), ymm0)
 	vaddpd(ymm11, ymm6, ymm11)
 	vaddpd(ymm9, ymm7, ymm9)
 	prefetch(0, mem(r15, 0*32)) // prefetch b_next[0*4]
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vmulpd(ymm1, ymm5, ymm7)
 	vaddpd(ymm10, ymm6, ymm10)
 	vaddpd(ymm8, ymm7, ymm8)
-	
-	
+
+
 	 // iteration 1
 	vmovapd(mem(rax, 3*32), ymm1)
 	vmulpd(ymm0, ymm2, ymm6)
@@ -1130,7 +640,7 @@ void bli_dgemm_sandybridge_asm_8x4
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddpd(ymm15, ymm6, ymm15)
 	vaddpd(ymm13, ymm7, ymm13)
-	
+
 	prefetch(0, mem(rax, 18*32))
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovapd(mem(rbx, 2*32), ymm2)
@@ -1138,19 +648,19 @@ void bli_dgemm_sandybridge_asm_8x4
 	vpermilpd(imm(0x5), ymm2, ymm3)
 	vaddpd(ymm14, ymm6, ymm14)
 	vaddpd(ymm12, ymm7, ymm12)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vmovapd(mem(rax, 4*32), ymm0)
 	vaddpd(ymm11, ymm6, ymm11)
 	vaddpd(ymm9, ymm7, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vmulpd(ymm1, ymm5, ymm7)
 	vaddpd(ymm10, ymm6, ymm10)
 	vaddpd(ymm8, ymm7, ymm8)
-	
-	
+
+
 	 // iteration 2
 	vmovapd(mem(rax, 5*32), ymm1)
 	vmulpd(ymm0, ymm2, ymm6)
@@ -1159,7 +669,7 @@ void bli_dgemm_sandybridge_asm_8x4
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddpd(ymm15, ymm6, ymm15)
 	vaddpd(ymm13, ymm7, ymm13)
-	
+
 	prefetch(0, mem(rax, 20*32))
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovapd(mem(rbx, 3*32), ymm2)
@@ -1168,20 +678,20 @@ void bli_dgemm_sandybridge_asm_8x4
 	vpermilpd(imm(0x5), ymm2, ymm3)
 	vaddpd(ymm14, ymm6, ymm14)
 	vaddpd(ymm12, ymm7, ymm12)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vmovapd(mem(rax, 6*32), ymm0)
 	vaddpd(ymm11, ymm6, ymm11)
 	vaddpd(ymm9, ymm7, ymm9)
 	prefetch(0, mem(r15, 2*32)) // prefetch b_next[2*4]
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vmulpd(ymm1, ymm5, ymm7)
 	vaddpd(ymm10, ymm6, ymm10)
 	vaddpd(ymm8, ymm7, ymm8)
-	
-	
+
+
 	 // iteration 3
 	vmovapd(mem(rax, 7*32), ymm1)
 	add(imm(4*8*8), rax) // a += 4*8 (unroll x mr)
@@ -1191,7 +701,7 @@ void bli_dgemm_sandybridge_asm_8x4
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddpd(ymm15, ymm6, ymm15)
 	vaddpd(ymm13, ymm7, ymm13)
-	
+
 	//prefetch(0, mem(rax, 22*32))
 	prefetch(0, mem(rax, 14*32))
 	vmulpd(ymm1, ymm2, ymm6)
@@ -1200,41 +710,41 @@ void bli_dgemm_sandybridge_asm_8x4
 	vpermilpd(imm(0x5), ymm2, ymm3)
 	vaddpd(ymm14, ymm6, ymm14)
 	vaddpd(ymm12, ymm7, ymm12)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vmovapd(mem(rax, 0*32), ymm0)
 	vaddpd(ymm11, ymm6, ymm11)
 	vaddpd(ymm9, ymm7, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vmulpd(ymm1, ymm5, ymm7)
 	vaddpd(ymm10, ymm6, ymm10)
 	vaddpd(ymm8, ymm7, ymm8)
-	
-	
-	
+
+
+
 	//add(imm(4*8*8), rax) // a      += 4*8 (unroll x mr)
 	//add(imm(4*4*8), rbx) // b      += 4*4 (unroll x nr)
-	
+
 	dec(rsi) // i -= 1;
 	jne(.DLOOPKITER) // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
 	 // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT) // EDGE LOOP
-	
+
 	vmovapd(mem(rax, 1*32), ymm1)
 	add(imm(8*1*8), rax) // a += 8 (1 x mr)
 	vmulpd(ymm0, ymm2, ymm6)
@@ -1243,7 +753,7 @@ void bli_dgemm_sandybridge_asm_8x4
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddpd(ymm15, ymm6, ymm15)
 	vaddpd(ymm13, ymm7, ymm13)
-	
+
 	prefetch(0, mem(rax, 14*32))
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovapd(mem(rbx, 1*32), ymm2)
@@ -1252,101 +762,101 @@ void bli_dgemm_sandybridge_asm_8x4
 	vpermilpd(imm(0x5), ymm2, ymm3)
 	vaddpd(ymm14, ymm6, ymm14)
 	vaddpd(ymm12, ymm7, ymm12)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vmovapd(mem(rax, 0*32), ymm0)
 	vaddpd(ymm11, ymm6, ymm11)
 	vaddpd(ymm9, ymm7, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vmulpd(ymm1, ymm5, ymm7)
 	vaddpd(ymm10, ymm6, ymm10)
 	vaddpd(ymm8, ymm7, ymm8)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jne(.DLOOPKLEFT) // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
-	
-	
+
+
 	 // ymm15:  ymm13:  ymm11:  ymm9:
 	 // ( ab00  ( ab01  ( ab02  ( ab03
-	 //   ab11    ab10    ab13    ab12  
+	 //   ab11    ab10    ab13    ab12
 	 //   ab22    ab23    ab20    ab21
 	 //   ab33 )  ab32 )  ab31 )  ab30 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
 	 // ( ab40  ( ab41  ( ab42  ( ab43
-	 //   ab51    ab50    ab53    ab52  
+	 //   ab51    ab50    ab53    ab52
 	 //   ab62    ab63    ab60    ab61
 	 //   ab73 )  ab72 )  ab71 )  ab70 )
-	
+
 	vmovapd(ymm15, ymm7)
 	vshufpd(imm(0xa), ymm15, ymm13, ymm15)
 	vshufpd(imm(0xa), ymm13, ymm7, ymm13)
-	
+
 	vmovapd(ymm11, ymm7)
 	vshufpd(imm(0xa), ymm11, ymm9, ymm11)
 	vshufpd(imm(0xa), ymm9, ymm7, ymm9)
-	
+
 	vmovapd(ymm14, ymm7)
 	vshufpd(imm(0xa), ymm14, ymm12, ymm14)
 	vshufpd(imm(0xa), ymm12, ymm7, ymm12)
-	
+
 	vmovapd(ymm10, ymm7)
 	vshufpd(imm(0xa), ymm10, ymm8, ymm10)
 	vshufpd(imm(0xa), ymm8, ymm7, ymm8)
-	
+
 	 // ymm15:  ymm13:  ymm11:  ymm9:
 	 // ( ab01  ( ab00  ( ab03  ( ab02
-	 //   ab11    ab10    ab13    ab12  
+	 //   ab11    ab10    ab13    ab12
 	 //   ab23    ab22    ab21    ab20
 	 //   ab33 )  ab32 )  ab31 )  ab30 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
 	 // ( ab41  ( ab40  ( ab43  ( ab42
-	 //   ab51    ab50    ab53    ab52  
+	 //   ab51    ab50    ab53    ab52
 	 //   ab63    ab62    ab61    ab60
 	 //   ab73 )  ab72 )  ab71 )  ab70 )
-	
+
 	vmovapd(ymm15, ymm7)
 	vperm2f128(imm(0x30), ymm15, ymm11, ymm15)
 	vperm2f128(imm(0x12), ymm7, ymm11, ymm11)
-	
+
 	vmovapd(ymm13, ymm7)
 	vperm2f128(imm(0x30), ymm13, ymm9, ymm13)
 	vperm2f128(imm(0x12), ymm7, ymm9, ymm9)
-	
+
 	vmovapd(ymm14, ymm7)
 	vperm2f128(imm(0x30), ymm14, ymm10, ymm14)
 	vperm2f128(imm(0x12), ymm7, ymm10, ymm10)
-	
+
 	vmovapd(ymm12, ymm7)
 	vperm2f128(imm(0x30), ymm12, ymm8, ymm12)
 	vperm2f128(imm(0x12), ymm7, ymm8, ymm8)
-	
+
 	 // ymm9:   ymm11:  ymm13:  ymm15:
 	 // ( ab00  ( ab01  ( ab02  ( ab03
-	 //   ab10    ab11    ab12    ab13  
+	 //   ab10    ab11    ab12    ab13
 	 //   ab20    ab21    ab22    ab23
 	 //   ab30 )  ab31 )  ab32 )  ab33 )
-	
+
 	 // ymm8:   ymm10:  ymm12:  ymm14:
 	 // ( ab40  ( ab41  ( ab42  ( ab43
-	 //   ab50    ab51    ab52    ab53  
+	 //   ab50    ab51    ab52    ab53
 	 //   ab60    ab61    ab62    ab63
 	 //   ab70 )  ab71 )  ab72 )  ab73 )
-	
-	
+
+
 	mov(var(alpha), rax) // load address of alpha
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm2) // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm8, ymm8) // scale by alpha
 	vmulpd(ymm0, ymm9, ymm9)
 	vmulpd(ymm0, ymm10, ymm10)
@@ -1355,343 +865,124 @@ void bli_dgemm_sandybridge_asm_8x4
 	vmulpd(ymm0, ymm13, ymm13)
 	vmulpd(ymm0, ymm14, ymm14)
 	vmulpd(ymm0, ymm15, ymm15)
-	
-	
-	
-	
-	
-	
+
+
 	mov(var(rs_c), rsi) // load rs_c
 	lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double)
-	
+
 	lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
-	
+
 	lea(mem(, rsi, 2), r12) // r12 = 2*rs_c;
 	lea(mem(r12, rsi, 1), r13) // r13 = 3*rs_c;
-	
-	
+
+
 	 // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
 	vucomisd(xmm0, xmm2) // set ZF if beta == 0.
 	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
-	
-	
-	cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8.
-	jz(.DCOLSTORED) // jump to column storage case
-	
-	
-	
-	label(.DGENSTORED)
-	 // update c00:c33
-	
-	vextractf128(imm(1), ymm9, xmm1)
-	vmovlpd(mem(rcx), xmm0, xmm0) // load c00 and c10,
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0) // scale by beta,
-	vaddpd(xmm9, xmm0, xmm0) // add the gemm result,
-	vmovlpd(xmm0, mem(rcx)) // and store back to memory.
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c20 and c30,
-	vmovhpd(mem(rcx, r13, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0) // scale by beta,
-	vaddpd(xmm1, xmm0, xmm0) // add the gemm result,
-	vmovlpd(xmm0, mem(rcx, r12, 1)) // and store back to memory.
-	vmovhpd(xmm0, mem(rcx, r13, 1))
-	add(rdi, rcx) // c += cs_c;
-	
-	vextractf128(imm(1), ymm11, xmm1)
-	vmovlpd(mem(rcx), xmm0, xmm0) // load c01 and c11,
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0) // scale by beta,
-	vaddpd(xmm11, xmm0, xmm0) // add the gemm result,
-	vmovlpd(xmm0, mem(rcx)) // and store back to memory.
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c21 and c31,
-	vmovhpd(mem(rcx, r13, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0) // scale by beta,
-	vaddpd(xmm1, xmm0, xmm0) // add the gemm result,
-	vmovlpd(xmm0, mem(rcx, r12, 1)) // and store back to memory.
-	vmovhpd(xmm0, mem(rcx, r13, 1))
-	add(rdi, rcx) // c += cs_c;
-	
-	vextractf128(imm(1), ymm13, xmm1)
-	vmovlpd(mem(rcx), xmm0, xmm0) // load c02 and c12,
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0) // scale by beta,
-	vaddpd(xmm13, xmm0, xmm0) // add the gemm result,
-	vmovlpd(xmm0, mem(rcx)) // and store back to memory.
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c22 and c32,
-	vmovhpd(mem(rcx, r13, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0) // scale by beta,
-	vaddpd(xmm1, xmm0, xmm0) // add the gemm result,
-	vmovlpd(xmm0, mem(rcx, r12, 1)) // and store back to memory.
-	vmovhpd(xmm0, mem(rcx, r13, 1))
-	add(rdi, rcx) // c += cs_c;
-	
-	vextractf128(imm(1), ymm15, xmm1)
-	vmovlpd(mem(rcx), xmm0, xmm0) // load c03 and c13,
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0) // scale by beta,
-	vaddpd(xmm15, xmm0, xmm0) // add the gemm result,
-	vmovlpd(xmm0, mem(rcx)) // and store back to memory.
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c23 and c33,
-	vmovhpd(mem(rcx, r13, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0) // scale by beta,
-	vaddpd(xmm1, xmm0, xmm0) // add the gemm result,
-	vmovlpd(xmm0, mem(rcx, r12, 1)) // and store back to memory.
-	vmovhpd(xmm0, mem(rcx, r13, 1))
-	
-	 // update c40:c73
-	
-	vextractf128(imm(1), ymm8, xmm1)
-	vmovlpd(mem(rdx), xmm0, xmm0) // load c40 and c50,
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0) // scale by beta,
-	vaddpd(xmm8, xmm0, xmm0) // add the gemm result,
-	vmovlpd(xmm0, mem(rdx)) // and store back to memory.
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-	vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c60 and c70,
-	vmovhpd(mem(rdx, r13, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0) // scale by beta,
-	vaddpd(xmm1, xmm0, xmm0) // add the gemm result,
-	vmovlpd(xmm0, mem(rdx, r12, 1)) // and store back to memory.
-	vmovhpd(xmm0, mem(rdx, r13, 1))
-	add(rdi, rdx) // c += cs_c;
-	
-	vextractf128(imm(1), ymm10, xmm1)
-	vmovlpd(mem(rdx), xmm0, xmm0) // load c41 and c51,
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0) // scale by beta,
-	vaddpd(xmm10, xmm0, xmm0) // add the gemm result,
-	vmovlpd(xmm0, mem(rdx)) // and store back to memory.
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-	vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c61 and c71,
-	vmovhpd(mem(rdx, r13, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0) // scale by beta,
-	vaddpd(xmm1, xmm0, xmm0) // add the gemm result,
-	vmovlpd(xmm0, mem(rdx, r12, 1)) // and store back to memory.
-	vmovhpd(xmm0, mem(rdx, r13, 1))
-	add(rdi, rdx) // c += cs_c;
-	
-	vextractf128(imm(1), ymm12, xmm1)
-	vmovlpd(mem(rdx), xmm0, xmm0) // load c42 and c52,
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0) // scale by beta,
-	vaddpd(xmm12, xmm0, xmm0) // add the gemm result,
-	vmovlpd(xmm0, mem(rdx)) // and store back to memory.
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-	vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c62 and c72,
-	vmovhpd(mem(rdx, r13, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0) // scale by beta,
-	vaddpd(xmm1, xmm0, xmm0) // add the gemm result,
-	vmovlpd(xmm0, mem(rdx, r12, 1)) // and store back to memory.
-	vmovhpd(xmm0, mem(rdx, r13, 1))
-	add(rdi, rdx) // c += cs_c;
-	
-	vextractf128(imm(1), ymm14, xmm1)
-	vmovlpd(mem(rdx), xmm0, xmm0) // load c43 and c53,
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0) // scale by beta,
-	vaddpd(xmm14, xmm0, xmm0) // add the gemm result,
-	vmovlpd(xmm0, mem(rdx)) // and store back to memory.
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-	vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c63 and c73,
-	vmovhpd(mem(rdx, r13, 1), xmm0, xmm0)
-	vmulpd(xmm2, xmm0, xmm0) // scale by beta,
-	vaddpd(xmm1, xmm0, xmm0) // add the gemm result,
-	vmovlpd(xmm0, mem(rdx, r12, 1)) // and store back to memory.
-	vmovhpd(xmm0, mem(rdx, r13, 1))
-	
-	
-	jmp(.DDONE) // jump to end.
-	
-	
-	
-	label(.DCOLSTORED)
-	 // update c00:c33
-	
-	vmovupd(mem(rcx), ymm0) // load c00:c30,
-	vmulpd(ymm2, ymm0, ymm0) // scale by beta,
-	vaddpd(ymm9, ymm0, ymm0) // add the gemm result,
-	vmovupd(ymm0, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovupd(mem(rcx), ymm0) // load c01:c31,
-	vmulpd(ymm2, ymm0, ymm0) // scale by beta,
-	vaddpd(ymm11, ymm0, ymm0) // add the gemm result,
-	vmovupd(ymm0, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovupd(mem(rcx), ymm0) // load c02:c32,
-	vmulpd(ymm2, ymm0, ymm0) // scale by beta,
-	vaddpd(ymm13, ymm0, ymm0) // add the gemm result,
-	vmovupd(ymm0, mem(rcx)) // and store back to memory.
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovupd(mem(rcx), ymm0) // load c03:c33,
-	vmulpd(ymm2, ymm0, ymm0) // scale by beta,
-	vaddpd(ymm15, ymm0, ymm0) // add the gemm result,
-	vmovupd(ymm0, mem(rcx)) // and store back to memory.
-	
-	 // update c40:c73
-	
-	vmovupd(mem(rdx), ymm0) // load c40:c70,
-	vmulpd(ymm2, ymm0, ymm0) // scale by beta,
-	vaddpd(ymm8, ymm0, ymm0) // add the gemm result,
-	vmovupd(ymm0, mem(rdx)) // and store back to memory.
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovupd(mem(rdx), ymm0) // load c41:c71,
-	vmulpd(ymm2, ymm0, ymm0) // scale by beta,
-	vaddpd(ymm10, ymm0, ymm0) // add the gemm result,
-	vmovupd(ymm0, mem(rdx)) // and store back to memory.
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovupd(mem(rdx), ymm0) // load c42:c72,
-	vmulpd(ymm2, ymm0, ymm0) // scale by beta,
-	vaddpd(ymm12, ymm0, ymm0) // add the gemm result,
-	vmovupd(ymm0, mem(rdx)) // and store back to memory.
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovupd(mem(rdx), ymm0) // load c43:c73,
-	vmulpd(ymm2, ymm0, ymm0) // scale by beta,
-	vaddpd(ymm14, ymm0, ymm0) // add the gemm result,
-	vmovupd(ymm0, mem(rdx)) // and store back to memory.
-	
-	
-	jmp(.DDONE) // jump to end.
-	
-	
-	
-	
+
+		 // update c00:c33
+
+		vmovupd(mem(rcx), ymm0) // load c00:c30,
+		vmulpd(ymm2, ymm0, ymm0) // scale by beta,
+		vaddpd(ymm9, ymm0, ymm0) // add the gemm result,
+		vmovupd(ymm0, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovupd(mem(rcx), ymm0) // load c01:c31,
+		vmulpd(ymm2, ymm0, ymm0) // scale by beta,
+		vaddpd(ymm11, ymm0, ymm0) // add the gemm result,
+		vmovupd(ymm0, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovupd(mem(rcx), ymm0) // load c02:c32,
+		vmulpd(ymm2, ymm0, ymm0) // scale by beta,
+		vaddpd(ymm13, ymm0, ymm0) // add the gemm result,
+		vmovupd(ymm0, mem(rcx)) // and store back to memory.
+		add(rdi, rcx) // c += cs_c;
+
+		vmovupd(mem(rcx), ymm0) // load c03:c33,
+		vmulpd(ymm2, ymm0, ymm0) // scale by beta,
+		vaddpd(ymm15, ymm0, ymm0) // add the gemm result,
+		vmovupd(ymm0, mem(rcx)) // and store back to memory.
+
+		 // update c40:c73
+
+		vmovupd(mem(rdx), ymm0) // load c40:c70,
+		vmulpd(ymm2, ymm0, ymm0) // scale by beta,
+		vaddpd(ymm8, ymm0, ymm0) // add the gemm result,
+		vmovupd(ymm0, mem(rdx)) // and store back to memory.
+		add(rdi, rdx) // c += cs_c;
+
+		vmovupd(mem(rdx), ymm0) // load c41:c71,
+		vmulpd(ymm2, ymm0, ymm0) // scale by beta,
+		vaddpd(ymm10, ymm0, ymm0) // add the gemm result,
+		vmovupd(ymm0, mem(rdx)) // and store back to memory.
+		add(rdi, rdx) // c += cs_c;
+
+		vmovupd(mem(rdx), ymm0) // load c42:c72,
+		vmulpd(ymm2, ymm0, ymm0) // scale by beta,
+		vaddpd(ymm12, ymm0, ymm0) // add the gemm result,
+		vmovupd(ymm0, mem(rdx)) // and store back to memory.
+		add(rdi, rdx) // c += cs_c;
+
+		vmovupd(mem(rdx), ymm0) // load c43:c73,
+		vmulpd(ymm2, ymm0, ymm0) // scale by beta,
+		vaddpd(ymm14, ymm0, ymm0) // add the gemm result,
+		vmovupd(ymm0, mem(rdx)) // and store back to memory.
+
+		jmp(.DDONE) // jump to end.
+
 	label(.DBETAZERO)
-	
-	cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8.
-	jz(.DCOLSTORBZ) // jump to column storage case
-	
-	
-	
-	label(.DGENSTORBZ)
-	 // update c00:c33
-	
-	vextractf128(imm(1), ymm9, xmm1)
-	vmovlpd(xmm9, mem(rcx)) // store to c00:c30
-	vmovhpd(xmm9, mem(rcx, rsi, 1))
-	vmovlpd(xmm1, mem(rcx, r12, 1))
-	vmovhpd(xmm1, mem(rcx, r13, 1))
-	add(rdi, rcx) // c += cs_c;
-	
-	vextractf128(imm(1), ymm11, xmm1)
-	vmovlpd(xmm11, mem(rcx)) // store to c01:c31
-	vmovhpd(xmm11, mem(rcx, rsi, 1))
-	vmovlpd(xmm1, mem(rcx, r12, 1))
-	vmovhpd(xmm1, mem(rcx, r13, 1))
-	add(rdi, rcx) // c += cs_c;
-	
-	vextractf128(imm(1), ymm13, xmm1)
-	vmovlpd(xmm13, mem(rcx)) // store to c02:c32
-	vmovhpd(xmm13, mem(rcx, rsi, 1))
-	vmovlpd(xmm1, mem(rcx, r12, 1))
-	vmovhpd(xmm1, mem(rcx, r13, 1))
-	add(rdi, rcx) // c += cs_c;
-	
-	vextractf128(imm(1), ymm15, xmm1)
-	vmovlpd(xmm15, mem(rcx)) // store to c03:c33
-	vmovhpd(xmm15, mem(rcx, rsi, 1))
-	vmovlpd(xmm1, mem(rcx, r12, 1))
-	vmovhpd(xmm1, mem(rcx, r13, 1))
-	
-	 // update c40:c73
-	
-	vextractf128(imm(1), ymm8, xmm1)
-	vmovlpd(xmm8, mem(rdx)) // store to c40:c70
-	vmovhpd(xmm8, mem(rdx, rsi, 1))
-	vmovlpd(xmm1, mem(rdx, r12, 1))
-	vmovhpd(xmm1, mem(rdx, r13, 1))
-	add(rdi, rdx) // c += cs_c;
-	
-	vextractf128(imm(1), ymm10, xmm1)
-	vmovlpd(xmm10, mem(rdx)) // store to c41:c71
-	vmovhpd(xmm10, mem(rdx, rsi, 1))
-	vmovlpd(xmm1, mem(rdx, r12, 1))
-	vmovhpd(xmm1, mem(rdx, r13, 1))
-	add(rdi, rdx) // c += cs_c;
-	
-	vextractf128(imm(1), ymm12, xmm1)
-	vmovlpd(xmm12, mem(rdx)) // store to c42:c72
-	vmovhpd(xmm12, mem(rdx, rsi, 1))
-	vmovlpd(xmm1, mem(rdx, r12, 1))
-	vmovhpd(xmm1, mem(rdx, r13, 1))
-	add(rdi, rdx) // c += cs_c;
-	
-	vextractf128(imm(1), ymm14, xmm1)
-	vmovlpd(xmm14, mem(rdx)) // store to c43:c73
-	vmovhpd(xmm14, mem(rdx, rsi, 1))
-	vmovlpd(xmm1, mem(rdx, r12, 1))
-	vmovhpd(xmm1, mem(rdx, r13, 1))
-	
-	
-	jmp(.DDONE) // jump to end.
-	
-	
-	
-	label(.DCOLSTORBZ)
-	 // update c00:c33
-	
-	vmovupd(ymm9, mem(rcx)) // store c00:c30
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovupd(ymm11, mem(rcx)) // store c01:c31
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovupd(ymm13, mem(rcx)) // store c02:c32
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovupd(ymm15, mem(rcx)) // store c03:c33
-	
-	 // update c40:c73
-	
-	vmovupd(ymm8, mem(rdx)) // store c40:c70
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovupd(ymm10, mem(rdx)) // store c41:c71
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovupd(ymm12, mem(rdx)) // store c42:c72
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovupd(ymm14, mem(rdx)) // store c43:c73
-	
-	
-	
-	
-	
+
+		 // update c00:c33
+
+		vmovupd(ymm9, mem(rcx)) // store c00:c30
+		add(rdi, rcx) // c += cs_c;
+
+		vmovupd(ymm11, mem(rcx)) // store c01:c31
+		add(rdi, rcx) // c += cs_c;
+
+		vmovupd(ymm13, mem(rcx)) // store c02:c32
+		add(rdi, rcx) // c += cs_c;
+
+		vmovupd(ymm15, mem(rcx)) // store c03:c33
+
+		 // update c40:c73
+
+		vmovupd(ymm8, mem(rdx)) // store c40:c70
+		add(rdi, rdx) // c += cs_c;
+
+		vmovupd(ymm10, mem(rdx)) // store c41:c71
+		add(rdi, rdx) // c += cs_c;
+
+		vmovupd(ymm12, mem(rdx)) // store c42:c72
+		add(rdi, rdx) // c += cs_c;
+
+		vmovupd(ymm14, mem(rdx)) // store c43:c73
+
 	label(.DDONE)
-    
-    vzeroupper()
-	
+
 	vzeroupper()
-	
 
-    end_asm(
+
+	end_asm(
 	: // output operands (none)
 	: // input operands
-      [k_iter] "m" (k_iter), // 0
-      [k_left] "m" (k_left), // 1
-      [a]      "m" (a),      // 2
-      [b]      "m" (b),      // 3
-      [alpha]  "m" (alpha),  // 4
-      [beta]   "m" (beta),   // 5
-      [c]      "m" (c),      // 6
-      [rs_c]   "m" (rs_c),   // 7
-      [cs_c]   "m" (cs_c),   // 8
-      [b_next] "m" (b_next)/*, // 9
-      [a_next] "m" (a_next)*/  // 10
+	  [k_iter] "m" (k_iter), // 0
+	  [k_left] "m" (k_left), // 1
+	  [a]      "m" (a),      // 2
+	  [b]      "m" (b),      // 3
+	  [alpha]  "m" (alpha),  // 4
+	  [beta]   "m" (beta),   // 5
+	  [c]      "m" (c),      // 6
+	  [rs_c]   "m" (rs_c),   // 7
+	  [cs_c]   "m" (cs_c),   // 8
+	  [b_next] "m" (b_next)/*, // 9
+	  [a_next] "m" (a_next)*/  // 10
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
@@ -1699,11 +990,15 @@ void bli_dgemm_sandybridge_asm_8x4
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( d );
 }
 
 void bli_cgemm_sandybridge_asm_8x4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        scomplex*  restrict alpha,
        scomplex*  restrict a,
        scomplex*  restrict b,
@@ -1718,34 +1013,36 @@ void bli_cgemm_sandybridge_asm_8x4
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT( c, 8, 4, false );
+
 	begin_asm()
-	
-	
+
+
 	mov(var(a), rax) // load address of a.
 	mov(var(b), rbx) // load address of b.
 	mov(var(b_next), r15) // load address of b_next.
 	//mov(var(a_next), r14) // load address of a_next.
 	sub(imm(4*64), r15)
-	
+
 	vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
 	vmovsldup(mem(rbx, 0*32), ymm2)
 	vpermilps(imm(0x4e), ymm2, ymm3)
-	
+
 	mov(var(c), rcx) // load address of c
 	mov(var(cs_c), rdi) // load cs_c
 	lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex)
 	lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
-	
+
 	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c
 	prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c
 	prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c
-	
+
 	vxorps(ymm8, ymm8, ymm8)
 	vxorps(ymm9, ymm9, ymm9)
 	vxorps(ymm10, ymm10, ymm10)
@@ -1754,19 +1051,19 @@ void bli_cgemm_sandybridge_asm_8x4
 	vxorps(ymm13, ymm13, ymm13)
 	vxorps(ymm14, ymm14, ymm14)
 	vxorps(ymm15, ymm15, ymm15)
-	
-	
-	
+
+
+
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.CCONSIDKLEFT) // if i == 0, jump to code that
 	 // contains the k_left loop.
-	
-	
+
+
 	label(.CLOOPKITER) // MAIN LOOP
-	
+
 	add(imm(4*4*8), r15) // b_next += 4*4 (unroll x nr)
-	
+
 	 // iteration 0
 	prefetch(0, mem(rax, 8*32))
 	vmovaps(mem(rax, 1*32), ymm1)
@@ -1776,20 +1073,20 @@ void bli_cgemm_sandybridge_asm_8x4
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddps(ymm6, ymm15, ymm15)
 	vaddps(ymm7, ymm13, ymm13)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vmovshdup(mem(rbx, 0*32), ymm2)
 	vmulps(ymm1, ymm3, ymm7)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vaddps(ymm6, ymm14, ymm14)
 	vaddps(ymm7, ymm12, ymm12)
-	
+
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vpermilps(imm(0xb1), ymm0, ymm0)
 	vaddps(ymm6, ymm11, ymm11)
 	vaddps(ymm7, ymm9, ymm9)
-	
+
 	vmulps(ymm1, ymm4, ymm6)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vmulps(ymm1, ymm5, ymm7)
@@ -1797,32 +1094,32 @@ void bli_cgemm_sandybridge_asm_8x4
 	vaddps(ymm6, ymm10, ymm10)
 	vaddps(ymm7, ymm8, ymm8)
 	prefetch(0, mem(r15, 0*32)) // prefetch b_next[0*4]
-	
+
 	vpermilps(imm(0xb1), ymm1, ymm1)
 	vmulps(ymm0, ymm2, ymm6)
 	vmulps(ymm0, ymm3, ymm7)
 	vaddsubps(ymm6, ymm15, ymm15)
 	vaddsubps(ymm7, ymm13, ymm13)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vmovsldup(mem(rbx, 1*32), ymm2)
 	vmulps(ymm1, ymm3, ymm7)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vaddsubps(ymm6, ymm14, ymm14)
 	vaddsubps(ymm7, ymm12, ymm12)
-	
+
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vmovaps(mem(rax, 2*32), ymm0)
 	vaddsubps(ymm6, ymm11, ymm11)
 	vaddsubps(ymm7, ymm9, ymm9)
-	
+
 	vmulps(ymm1, ymm4, ymm6)
 	vmulps(ymm1, ymm5, ymm7)
 	vaddsubps(ymm6, ymm10, ymm10)
 	vaddsubps(ymm7, ymm8, ymm8)
-	
-	
+
+
 	 // iteration 1
 	prefetch(0, mem(rax, 10*32))
 	vmovaps(mem(rax, 3*32), ymm1)
@@ -1832,52 +1129,52 @@ void bli_cgemm_sandybridge_asm_8x4
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddps(ymm6, ymm15, ymm15)
 	vaddps(ymm7, ymm13, ymm13)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vmovshdup(mem(rbx, 1*32), ymm2)
 	vmulps(ymm1, ymm3, ymm7)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vaddps(ymm6, ymm14, ymm14)
 	vaddps(ymm7, ymm12, ymm12)
-	
+
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vpermilps(imm(0xb1), ymm0, ymm0)
 	vaddps(ymm6, ymm11, ymm11)
 	vaddps(ymm7, ymm9, ymm9)
-	
+
 	vmulps(ymm1, ymm4, ymm6)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vmulps(ymm1, ymm5, ymm7)
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddps(ymm6, ymm10, ymm10)
 	vaddps(ymm7, ymm8, ymm8)
-	
+
 	vpermilps(imm(0xb1), ymm1, ymm1)
 	vmulps(ymm0, ymm2, ymm6)
 	vmulps(ymm0, ymm3, ymm7)
 	vaddsubps(ymm6, ymm15, ymm15)
 	vaddsubps(ymm7, ymm13, ymm13)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vmovsldup(mem(rbx, 2*32), ymm2)
 	vmulps(ymm1, ymm3, ymm7)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vaddsubps(ymm6, ymm14, ymm14)
 	vaddsubps(ymm7, ymm12, ymm12)
-	
+
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vmovaps(mem(rax, 4*32), ymm0)
 	vaddsubps(ymm6, ymm11, ymm11)
 	vaddsubps(ymm7, ymm9, ymm9)
-	
+
 	vmulps(ymm1, ymm4, ymm6)
 	vmulps(ymm1, ymm5, ymm7)
 	vaddsubps(ymm6, ymm10, ymm10)
 	vaddsubps(ymm7, ymm8, ymm8)
-	
-	
+
+
 	 // iteration 2
 	prefetch(0, mem(rax, 12*32))
 	vmovaps(mem(rax, 5*32), ymm1)
@@ -1887,20 +1184,20 @@ void bli_cgemm_sandybridge_asm_8x4
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddps(ymm6, ymm15, ymm15)
 	vaddps(ymm7, ymm13, ymm13)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vmovshdup(mem(rbx, 2*32), ymm2)
 	vmulps(ymm1, ymm3, ymm7)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vaddps(ymm6, ymm14, ymm14)
 	vaddps(ymm7, ymm12, ymm12)
-	
+
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vpermilps(imm(0xb1), ymm0, ymm0)
 	vaddps(ymm6, ymm11, ymm11)
 	vaddps(ymm7, ymm9, ymm9)
-	
+
 	vmulps(ymm1, ymm4, ymm6)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vmulps(ymm1, ymm5, ymm7)
@@ -1908,32 +1205,32 @@ void bli_cgemm_sandybridge_asm_8x4
 	vaddps(ymm6, ymm10, ymm10)
 	vaddps(ymm7, ymm8, ymm8)
 	prefetch(0, mem(r15, 2*32)) // prefetch b_next[2*4]
-	
+
 	vpermilps(imm(0xb1), ymm1, ymm1)
 	vmulps(ymm0, ymm2, ymm6)
 	vmulps(ymm0, ymm3, ymm7)
 	vaddsubps(ymm6, ymm15, ymm15)
 	vaddsubps(ymm7, ymm13, ymm13)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vmovsldup(mem(rbx, 3*32), ymm2)
 	vmulps(ymm1, ymm3, ymm7)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vaddsubps(ymm6, ymm14, ymm14)
 	vaddsubps(ymm7, ymm12, ymm12)
-	
+
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vmovaps(mem(rax, 6*32), ymm0)
 	vaddsubps(ymm6, ymm11, ymm11)
 	vaddsubps(ymm7, ymm9, ymm9)
-	
+
 	vmulps(ymm1, ymm4, ymm6)
 	vmulps(ymm1, ymm5, ymm7)
 	vaddsubps(ymm6, ymm10, ymm10)
 	vaddsubps(ymm7, ymm8, ymm8)
-	
-	
+
+
 	 // iteration 3
 	prefetch(0, mem(rax, 14*32))
 	vmovaps(mem(rax, 7*32), ymm1)
@@ -1943,74 +1240,74 @@ void bli_cgemm_sandybridge_asm_8x4
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddps(ymm6, ymm15, ymm15)
 	vaddps(ymm7, ymm13, ymm13)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vmovshdup(mem(rbx, 3*32), ymm2)
 	vmulps(ymm1, ymm3, ymm7)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vaddps(ymm6, ymm14, ymm14)
 	vaddps(ymm7, ymm12, ymm12)
-	
+
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vpermilps(imm(0xb1), ymm0, ymm0)
 	vaddps(ymm6, ymm11, ymm11)
 	vaddps(ymm7, ymm9, ymm9)
-	
+
 	vmulps(ymm1, ymm4, ymm6)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vmulps(ymm1, ymm5, ymm7)
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddps(ymm6, ymm10, ymm10)
 	vaddps(ymm7, ymm8, ymm8)
-	
+
 	vpermilps(imm(0xb1), ymm1, ymm1)
 	vmulps(ymm0, ymm2, ymm6)
 	vmulps(ymm0, ymm3, ymm7)
 	vaddsubps(ymm6, ymm15, ymm15)
 	vaddsubps(ymm7, ymm13, ymm13)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vmovsldup(mem(rbx, 4*32), ymm2)
 	vmulps(ymm1, ymm3, ymm7)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vaddsubps(ymm6, ymm14, ymm14)
 	vaddsubps(ymm7, ymm12, ymm12)
-	
+
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vmovaps(mem(rax, 8*32), ymm0)
 	vaddsubps(ymm6, ymm11, ymm11)
 	vaddsubps(ymm7, ymm9, ymm9)
-	
+
 	vmulps(ymm1, ymm4, ymm6)
 	vmulps(ymm1, ymm5, ymm7)
 	vaddsubps(ymm6, ymm10, ymm10)
 	vaddsubps(ymm7, ymm8, ymm8)
-	
-	
+
+
 	add(imm(8*4*8), rax) // a += 8*4 (unroll x mr)
 	add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jne(.CLOOPKITER) // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.CCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
 	 // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.CLOOPKLEFT) // EDGE LOOP
-	
+
 	 // iteration 0
 	prefetch(0, mem(rax, 8*32))
 	vmovaps(mem(rax, 1*32), ymm1)
@@ -2020,228 +1317,228 @@ void bli_cgemm_sandybridge_asm_8x4
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddps(ymm6, ymm15, ymm15)
 	vaddps(ymm7, ymm13, ymm13)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vmovshdup(mem(rbx, 0*32), ymm2)
 	vmulps(ymm1, ymm3, ymm7)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vaddps(ymm6, ymm14, ymm14)
 	vaddps(ymm7, ymm12, ymm12)
-	
+
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vpermilps(imm(0xb1), ymm0, ymm0)
 	vaddps(ymm6, ymm11, ymm11)
 	vaddps(ymm7, ymm9, ymm9)
-	
+
 	vmulps(ymm1, ymm4, ymm6)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vmulps(ymm1, ymm5, ymm7)
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddps(ymm6, ymm10, ymm10)
 	vaddps(ymm7, ymm8, ymm8)
-	
+
 	vpermilps(imm(0xb1), ymm1, ymm1)
 	vmulps(ymm0, ymm2, ymm6)
 	vmulps(ymm0, ymm3, ymm7)
 	vaddsubps(ymm6, ymm15, ymm15)
 	vaddsubps(ymm7, ymm13, ymm13)
-	
+
 	vmulps(ymm1, ymm2, ymm6)
 	vmovsldup(mem(rbx, 1*32), ymm2)
 	vmulps(ymm1, ymm3, ymm7)
 	vpermilps(imm(0x4e), ymm2, ymm3)
 	vaddsubps(ymm6, ymm14, ymm14)
 	vaddsubps(ymm7, ymm12, ymm12)
-	
+
 	vmulps(ymm0, ymm4, ymm6)
 	vmulps(ymm0, ymm5, ymm7)
 	vmovaps(mem(rax, 2*32), ymm0)
 	vaddsubps(ymm6, ymm11, ymm11)
 	vaddsubps(ymm7, ymm9, ymm9)
-	
+
 	vmulps(ymm1, ymm4, ymm6)
 	vmulps(ymm1, ymm5, ymm7)
 	vaddsubps(ymm6, ymm10, ymm10)
 	vaddsubps(ymm7, ymm8, ymm8)
-	
-	
+
+
 	add(imm(8*1*8), rax) // a += 8 (1 x mr)
 	add(imm(4*1*8), rbx) // b += 4 (1 x nr)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jne(.CLOOPKLEFT) // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.CPOSTACCUM)
-	
+
 	 // ymm15:  ymm13:  ymm11:  ymm9:
-	 // ( ab00  ( ab01  ( ab02  ( ab03 
-	 //   ab10    ab11    ab12    ab13 
-	 //   ab21    ab20    ab23    ab22 
-	 //   ab31    ab30    ab33    ab32 
-	 //   ab42    ab43    ab40    ab41 
-	 //   ab52    ab53    ab50    ab51 
-	 //   ab63    ab62    ab61    ab60 
+	 // ( ab00  ( ab01  ( ab02  ( ab03
+	 //   ab10    ab11    ab12    ab13
+	 //   ab21    ab20    ab23    ab22
+	 //   ab31    ab30    ab33    ab32
+	 //   ab42    ab43    ab40    ab41
+	 //   ab52    ab53    ab50    ab51
+	 //   ab63    ab62    ab61    ab60
 	 //   ab73 )  ab72 )  ab71 )  ab70 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
-	 // ( ab80  ( ab81  ( ab82  ( ab83 
-	 //   ab90    ab91    ab92    ab93 
-	 //   aba1    aba0    aba3    aba2 
-	 //   abb1    abb0    abb3    abb2 
-	 //   abc2    abc3    abc0    abc1 
-	 //   abd2    abd3    abd0    abd1 
-	 //   abe3    abe2    abe1    abe0 
+	 // ( ab80  ( ab81  ( ab82  ( ab83
+	 //   ab90    ab91    ab92    ab93
+	 //   aba1    aba0    aba3    aba2
+	 //   abb1    abb0    abb3    abb2
+	 //   abc2    abc3    abc0    abc1
+	 //   abd2    abd3    abd0    abd1
+	 //   abe3    abe2    abe1    abe0
 	 //   abf3    abf2    abf1    abf0 )
-	
+
 	vmovaps(ymm15, ymm7)
 	vshufps(imm(0xe4), ymm13, ymm15, ymm15)
 	vshufps(imm(0xe4), ymm7, ymm13, ymm13)
-	
+
 	vmovaps(ymm11, ymm7)
 	vshufps(imm(0xe4), ymm9, ymm11, ymm11)
 	vshufps(imm(0xe4), ymm7, ymm9, ymm9)
-	
+
 	vmovaps(ymm14, ymm7)
 	vshufps(imm(0xe4), ymm12, ymm14, ymm14)
 	vshufps(imm(0xe4), ymm7, ymm12, ymm12)
-	
+
 	vmovaps(ymm10, ymm7)
 	vshufps(imm(0xe4), ymm8, ymm10, ymm10)
 	vshufps(imm(0xe4), ymm7, ymm8, ymm8)
-	
+
 	 // ymm15:  ymm13:  ymm11:  ymm9:
-	 // ( ab00  ( ab01  ( ab02  ( ab03 
-	 //   ab10    ab11    ab12    ab13 
-	 //   ab20    ab21    ab22    ab23 
-	 //   ab30    ab31    ab32    ab33 
-	 //   ab42    ab43    ab40    ab41 
-	 //   ab52    ab53    ab50    ab51 
-	 //   ab62    ab63    ab60    ab61 
+	 // ( ab00  ( ab01  ( ab02  ( ab03
+	 //   ab10    ab11    ab12    ab13
+	 //   ab20    ab21    ab22    ab23
+	 //   ab30    ab31    ab32    ab33
+	 //   ab42    ab43    ab40    ab41
+	 //   ab52    ab53    ab50    ab51
+	 //   ab62    ab63    ab60    ab61
 	 //   ab72 )  ab73 )  ab70 )  ab71 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
-	 // ( ab80  ( ab81  ( ab82  ( ab83 
-	 //   ab90    ab91    ab92    ab93 
-	 //   aba0    aba1    aba2    aba3 
-	 //   abb0    abb1    abb2    abb3 
-	 //   abc2    abc3    abc0    abc1 
-	 //   abd2    abd3    abd0    abd1 
-	 //   abe2    abe3    abe0    abe1 
+	 // ( ab80  ( ab81  ( ab82  ( ab83
+	 //   ab90    ab91    ab92    ab93
+	 //   aba0    aba1    aba2    aba3
+	 //   abb0    abb1    abb2    abb3
+	 //   abc2    abc3    abc0    abc1
+	 //   abd2    abd3    abd0    abd1
+	 //   abe2    abe3    abe0    abe1
 	 //   abf2 )  abf3 )  abf0 )  abf1 )
-	
+
 	vmovaps(ymm15, ymm7)
 	vperm2f128(imm(0x12), ymm15, ymm11, ymm15)
 	vperm2f128(imm(0x30), ymm7, ymm11, ymm11)
-	
+
 	vmovaps(ymm13, ymm7)
 	vperm2f128(imm(0x12), ymm13, ymm9, ymm13)
 	vperm2f128(imm(0x30), ymm7, ymm9, ymm9)
-	
+
 	vmovaps(ymm14, ymm7)
 	vperm2f128(imm(0x12), ymm14, ymm10, ymm14)
 	vperm2f128(imm(0x30), ymm7, ymm10, ymm10)
-	
+
 	vmovaps(ymm12, ymm7)
 	vperm2f128(imm(0x12), ymm12, ymm8, ymm12)
 	vperm2f128(imm(0x30), ymm7, ymm8, ymm8)
-	
+
 	 // ymm15:  ymm13:  ymm11:  ymm9:
-	 // ( ab00  ( ab01  ( ab02  ( ab03 
-	 //   ab10    ab11    ab12    ab13 
-	 //   ab20    ab21    ab22    ab23 
-	 //   ab30    ab31    ab32    ab33 
-	 //   ab40    ab41    ab42    ab43 
-	 //   ab50    ab51    ab52    ab53 
-	 //   ab60    ab61    ab62    ab63 
+	 // ( ab00  ( ab01  ( ab02  ( ab03
+	 //   ab10    ab11    ab12    ab13
+	 //   ab20    ab21    ab22    ab23
+	 //   ab30    ab31    ab32    ab33
+	 //   ab40    ab41    ab42    ab43
+	 //   ab50    ab51    ab52    ab53
+	 //   ab60    ab61    ab62    ab63
 	 //   ab70 )  ab71 )  ab72 )  ab73 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
-	 // ( ab80  ( ab81  ( ab82  ( ab83 
-	 //   ab90    ab91    ab92    ab93 
-	 //   aba0    aba1    aba2    aba3 
-	 //   abb0    abb1    abb2    abb3 
-	 //   abc0    abc1    abc2    abc3 
-	 //   abd0    abd1    abd2    abd3 
-	 //   abe0    abe1    abe2    abe3 
+	 // ( ab80  ( ab81  ( ab82  ( ab83
+	 //   ab90    ab91    ab92    ab93
+	 //   aba0    aba1    aba2    aba3
+	 //   abb0    abb1    abb2    abb3
+	 //   abc0    abc1    abc2    abc3
+	 //   abd0    abd1    abd2    abd3
+	 //   abe0    abe1    abe2    abe3
 	 //   abf0 )  abf1 )  abf2 )  abf3 )
-	
-	
-	
-	
+
+
+
+
 	 // scale by alpha
-	
+
 	mov(var(alpha), rax) // load address of alpha
 	vbroadcastss(mem(rax), ymm7) // load alpha_r and duplicate
 	vbroadcastss(mem(rax, 4), ymm6) // load alpha_i and duplicate
-	
+
 	vpermilps(imm(0xb1), ymm15, ymm3)
 	vmulps(ymm7, ymm15, ymm15)
 	vmulps(ymm6, ymm3, ymm3)
 	vaddsubps(ymm3, ymm15, ymm15)
-	
+
 	vpermilps(imm(0xb1), ymm14, ymm2)
 	vmulps(ymm7, ymm14, ymm14)
 	vmulps(ymm6, ymm2, ymm2)
 	vaddsubps(ymm2, ymm14, ymm14)
-	
+
 	vpermilps(imm(0xb1), ymm13, ymm1)
 	vmulps(ymm7, ymm13, ymm13)
 	vmulps(ymm6, ymm1, ymm1)
 	vaddsubps(ymm1, ymm13, ymm13)
-	
+
 	vpermilps(imm(0xb1), ymm12, ymm0)
 	vmulps(ymm7, ymm12, ymm12)
 	vmulps(ymm6, ymm0, ymm0)
 	vaddsubps(ymm0, ymm12, ymm12)
-	
+
 	vpermilps(imm(0xb1), ymm11, ymm3)
 	vmulps(ymm7, ymm11, ymm11)
 	vmulps(ymm6, ymm3, ymm3)
 	vaddsubps(ymm3, ymm11, ymm11)
-	
+
 	vpermilps(imm(0xb1), ymm10, ymm2)
 	vmulps(ymm7, ymm10, ymm10)
 	vmulps(ymm6, ymm2, ymm2)
 	vaddsubps(ymm2, ymm10, ymm10)
-	
+
 	vpermilps(imm(0xb1), ymm9, ymm1)
 	vmulps(ymm7, ymm9, ymm9)
 	vmulps(ymm6, ymm1, ymm1)
 	vaddsubps(ymm1, ymm9, ymm9)
-	
+
 	vpermilps(imm(0xb1), ymm8, ymm0)
 	vmulps(ymm7, ymm8, ymm8)
 	vmulps(ymm6, ymm0, ymm0)
 	vaddsubps(ymm0, ymm8, ymm8)
-	
-	
-	
-	
+
+
+
+
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastss(mem(rbx), ymm7) // load beta_r and duplicate
 	vbroadcastss(mem(rbx, 4), ymm6) // load beta_i and duplicate
-	
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
+
 	mov(var(rs_c), rsi) // load rs_c
 	lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex)
-	
+
 	lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
-	
+
 	lea(mem(, rsi, 2), r12) // r12 = 2*rs_c;
 	lea(mem(r12, rsi, 1), r13) // r13 = 3*rs_c;
-	
-	
+
+
 	 // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
 	vucomiss(xmm0, xmm7) // set ZF if beta_r == 0.
 	sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 );
@@ -2249,410 +1546,144 @@ void bli_cgemm_sandybridge_asm_8x4
 	sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 );
 	and(r8b, r9b) // set ZF if r8b & r9b == 1.
 	jne(.CBETAZERO) // if ZF = 0, jump to beta == 0 case
-	
-	
-	cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8.
-	jz(.CCOLSTORED) // jump to column storage case
-	
-	
-	
-	label(.CGENSTORED)
-	
-	 // update c00:c70
-	
-	vmovlpd(mem(rcx), xmm0, xmm0) // load (c00,10) into xmm0[0:1]
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c20,30) into xmm0[2:3]
-	vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c40,50) into xmm2[0:1]
-	vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c60,70) into xmm2[2:3]
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm0, mem(rcx)) // store (c00,c10)
-	vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c20,c30)
-	vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c40,c50)
-	vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c60,c70)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c80:cf0
-	
-	vmovlpd(mem(rdx), xmm0, xmm0) // load (c80,90) into xmm0[0:1]
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca0,b0) into xmm0[2:3]
-	vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc0,d0) into xmm2[0:1]
-	vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce0,f0) into xmm2[2:3]
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm0, mem(rdx)) // store (c80,c90)
-	vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca0,cb0)
-	vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc0,cd0)
-	vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce0,cf0)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c01:c71
-	
-	vmovlpd(mem(rcx), xmm0, xmm0) // load (c01,11) into xmm0[0:1]
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c21,31) into xmm0[2:3]
-	vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c41,51) into xmm2[0:1]
-	vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c61,71) into xmm2[2:3]
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm0, mem(rcx)) // store (c01,c11)
-	vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c21,c31)
-	vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c41,c51)
-	vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c61,c71)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c81:cf1
-	
-	vmovlpd(mem(rdx), xmm0, xmm0) // load (c81,91) into xmm0[0:1]
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca1,b1) into xmm0[2:3]
-	vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc1,d1) into xmm2[0:1]
-	vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce1,f1) into xmm2[2:3]
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm0, mem(rdx)) // store (c81,c91)
-	vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca1,cb1)
-	vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc1,cd1)
-	vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce1,cf1)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c02:c72
-	
-	vmovlpd(mem(rcx), xmm0, xmm0) // load (c02,12) into xmm0[0:1]
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c22,32) into xmm0[2:3]
-	vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c42,52) into xmm2[0:1]
-	vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c62,72) into xmm2[2:3]
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm0, mem(rcx)) // store (c02,c12)
-	vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c22,c32)
-	vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c42,c52)
-	vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c62,c72)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c82:cf2
-	
-	vmovlpd(mem(rdx), xmm0, xmm0) // load (c82,92) into xmm0[0:1]
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca2,b2) into xmm0[2:3]
-	vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc2,d2) into xmm2[0:1]
-	vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce2,f2) into xmm2[2:3]
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm0, mem(rdx)) // store (c82,c92)
-	vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca2,cb2)
-	vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc2,cd2)
-	vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce2,cf2)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c03:c73
-	
-	vmovlpd(mem(rcx), xmm0, xmm0) // load (c03,13) into xmm0[0:1]
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c23,33) into xmm0[2:3]
-	vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c43,53) into xmm2[0:1]
-	vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c63,73) into xmm2[2:3]
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm0, mem(rcx)) // store (c03,c13)
-	vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c23,c33)
-	vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c43,c53)
-	vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c63,c73)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c83:cf3
-	
-	vmovlpd(mem(rdx), xmm0, xmm0) // load (c83,93) into xmm0[0:1]
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca3,b3) into xmm0[2:3]
-	vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc3,d3) into xmm2[0:1]
-	vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce3,f3) into xmm2[2:3]
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm0, mem(rdx)) // store (c83,c93)
-	vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca3,cb3)
-	vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc3,cd3)
-	vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce3,cf3)
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	
-	jmp(.CDONE) // jump to end.
-	
-	
-	
-	label(.CCOLSTORED)
-	
-	 // update c00:c70
-	
-	vmovups(mem(rcx), ymm0) // load c00:c70 into ymm0
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0
-	vmovups(ymm0, mem(rcx)) // store c00:c70
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c80:cf0
-	
-	vmovups(mem(rdx), ymm0) // load c80:f0 into ymm0
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0
-	vmovups(ymm0, mem(rdx)) // store c80:cf0
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c00:c70
-	
-	vmovups(mem(rcx), ymm0) // load c01:c71 into ymm0
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0
-	vmovups(ymm0, mem(rcx)) // store c01:c71
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c81:cf1
-	
-	vmovups(mem(rdx), ymm0) // load c81:f1 into ymm0
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0
-	vmovups(ymm0, mem(rdx)) // store c81:cf1
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c02:c72
-	
-	vmovups(mem(rcx), ymm0) // load c02:c72 into ymm0
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0
-	vmovups(ymm0, mem(rcx)) // store c02:c72
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c82:cf2
-	
-	vmovups(mem(rdx), ymm0) // load c82:f2 into ymm0
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0
-	vmovups(ymm0, mem(rdx)) // store c82:cf2
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c03:c73
-	
-	vmovups(mem(rcx), ymm0) // load c03:c73 into ymm0
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0
-	vmovups(ymm0, mem(rcx)) // store c03:c73
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c83:cf3
-	
-	vmovups(mem(rdx), ymm0) // load c83:f3 into ymm0
-	vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
-	vmulps(ymm7, ymm0, ymm0)
-	vmulps(ymm6, ymm2, ymm2)
-	vaddsubps(ymm2, ymm0, ymm0)
-	vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0
-	vmovups(ymm0, mem(rdx)) // store c83:cf3
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	
-	jmp(.CDONE) // jump to end.
-	
-	
-	
+
+		 // update c00:c70
+
+		vmovups(mem(rcx), ymm0) // load c00:c70 into ymm0
+		vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+		vmulps(ymm7, ymm0, ymm0)
+		vmulps(ymm6, ymm2, ymm2)
+		vaddsubps(ymm2, ymm0, ymm0)
+		vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0
+		vmovups(ymm0, mem(rcx)) // store c00:c70
+		add(rdi, rcx) // c += cs_c;
+
+		 // update c80:cf0
+
+		vmovups(mem(rdx), ymm0) // load c80:f0 into ymm0
+		vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+		vmulps(ymm7, ymm0, ymm0)
+		vmulps(ymm6, ymm2, ymm2)
+		vaddsubps(ymm2, ymm0, ymm0)
+		vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0
+		vmovups(ymm0, mem(rdx)) // store c80:cf0
+		add(rdi, rdx) // c += cs_c;
+
+		 // update c01:c71
+
+		vmovups(mem(rcx), ymm0) // load c01:c71 into ymm0
+		vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+		vmulps(ymm7, ymm0, ymm0)
+		vmulps(ymm6, ymm2, ymm2)
+		vaddsubps(ymm2, ymm0, ymm0)
+		vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0
+		vmovups(ymm0, mem(rcx)) // store c01:c71
+		add(rdi, rcx) // c += cs_c;
+
+		 // update c81:cf1
+
+		vmovups(mem(rdx), ymm0) // load c81:f1 into ymm0
+		vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+		vmulps(ymm7, ymm0, ymm0)
+		vmulps(ymm6, ymm2, ymm2)
+		vaddsubps(ymm2, ymm0, ymm0)
+		vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0
+		vmovups(ymm0, mem(rdx)) // store c81:cf1
+		add(rdi, rdx) // c += cs_c;
+
+		 // update c02:c72
+
+		vmovups(mem(rcx), ymm0) // load c02:c72 into ymm0
+		vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+		vmulps(ymm7, ymm0, ymm0)
+		vmulps(ymm6, ymm2, ymm2)
+		vaddsubps(ymm2, ymm0, ymm0)
+		vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0
+		vmovups(ymm0, mem(rcx)) // store c02:c72
+		add(rdi, rcx) // c += cs_c;
+
+		 // update c82:cf2
+
+		vmovups(mem(rdx), ymm0) // load c82:f2 into ymm0
+		vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+		vmulps(ymm7, ymm0, ymm0)
+		vmulps(ymm6, ymm2, ymm2)
+		vaddsubps(ymm2, ymm0, ymm0)
+		vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0
+		vmovups(ymm0, mem(rdx)) // store c82:cf2
+		add(rdi, rdx) // c += cs_c;
+
+		 // update c03:c73
+
+		vmovups(mem(rcx), ymm0) // load c03:c73 into ymm0
+		vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+		vmulps(ymm7, ymm0, ymm0)
+		vmulps(ymm6, ymm2, ymm2)
+		vaddsubps(ymm2, ymm0, ymm0)
+		vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0
+		vmovups(ymm0, mem(rcx)) // store c03:c73
+		add(rdi, rcx) // c += cs_c;
+
+		 // update c83:cf3
+
+		vmovups(mem(rdx), ymm0) // load c83:f3 into ymm0
+		vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+		vmulps(ymm7, ymm0, ymm0)
+		vmulps(ymm6, ymm2, ymm2)
+		vaddsubps(ymm2, ymm0, ymm0)
+		vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0
+		vmovups(ymm0, mem(rdx)) // store c83:cf3
+		add(rdi, rdx) // c += cs_c;
+
+		jmp(.CDONE) // jump to end.
+
 	label(.CBETAZERO)
-	
-	cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8.
-	jz(.CCOLSTORBZ) // jump to column storage case
-	
-	
-	
-	label(.CGENSTORBZ)
-	
-	 // update c00:c70
-	
-	vextractf128(imm(1), ymm15, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm15, mem(rcx)) // store (c00,c10)
-	vmovhpd(xmm15, mem(rcx, rsi, 1)) // store (c20,c30)
-	vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c40,c50)
-	vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c60,c70)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c80:cf0
-	
-	vextractf128(imm(1), ymm14, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm14, mem(rdx)) // store (c80,c90)
-	vmovhpd(xmm14, mem(rdx, rsi, 1)) // store (ca0,cb0)
-	vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc0,cd0)
-	vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce0,cf0)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c01:c71
-	
-	vextractf128(imm(1), ymm13, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm13, mem(rcx)) // store (c01,c11)
-	vmovhpd(xmm13, mem(rcx, rsi, 1)) // store (c21,c31)
-	vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c41,c51)
-	vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c61,c71)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c81:cf1
-	
-	vextractf128(imm(1), ymm12, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm12, mem(rdx)) // store (c81,c91)
-	vmovhpd(xmm12, mem(rdx, rsi, 1)) // store (ca1,cb1)
-	vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc1,cd1)
-	vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce1,cf1)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c02:c72
-	
-	vextractf128(imm(1), ymm11, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm11, mem(rcx)) // store (c02,c12)
-	vmovhpd(xmm11, mem(rcx, rsi, 1)) // store (c22,c32)
-	vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c42,c52)
-	vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c62,c72)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c82:cf2
-	
-	vextractf128(imm(1), ymm10, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm10, mem(rdx)) // store (c82,c92)
-	vmovhpd(xmm10, mem(rdx, rsi, 1)) // store (ca2,cb2)
-	vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc2,cd2)
-	vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce2,cf2)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c03:c73
-	
-	vextractf128(imm(1), ymm9, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm9, mem(rcx)) // store (c03,c13)
-	vmovhpd(xmm9, mem(rcx, rsi, 1)) // store (c23,c33)
-	vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c43,c53)
-	vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c63,c73)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c83:cf3
-	
-	vextractf128(imm(1), ymm8, xmm2) // xmm2 := ymm0[4:7]
-	vmovlpd(xmm8, mem(rdx)) // store (c83,c93)
-	vmovhpd(xmm8, mem(rdx, rsi, 1)) // store (ca3,cb3)
-	vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc3,cd3)
-	vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce3,cf3)
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	
-	jmp(.CDONE) // jump to end.
-	
-	
-	
-	label(.CCOLSTORBZ)
-	
-	
-	vmovups(ymm15, mem(rcx)) // store c00:c70
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(ymm14, mem(rdx)) // store c80:cf0
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovups(ymm13, mem(rcx)) // store c01:c71
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(ymm12, mem(rdx)) // store c81:cf1
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovups(ymm11, mem(rcx)) // store c02:c72
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(ymm10, mem(rdx)) // store c82:cf2
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovups(ymm9, mem(rcx)) // store c03:c73
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovups(ymm8, mem(rdx)) // store c83:cf3
-	add(rdi, rdx) // c += cs_c;
-	
-	
-	
-	
-	
+
+		vmovups(ymm15, mem(rcx)) // store c00:c70
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(ymm14, mem(rdx)) // store c80:cf0
+		add(rdi, rdx) // c += cs_c;
+
+		vmovups(ymm13, mem(rcx)) // store c01:c71
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(ymm12, mem(rdx)) // store c81:cf1
+		add(rdi, rdx) // c += cs_c;
+
+		vmovups(ymm11, mem(rcx)) // store c02:c72
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(ymm10, mem(rdx)) // store c82:cf2
+		add(rdi, rdx) // c += cs_c;
+
+		vmovups(ymm9, mem(rcx)) // store c03:c73
+		add(rdi, rcx) // c += cs_c;
+
+		vmovups(ymm8, mem(rdx)) // store c83:cf3
+		add(rdi, rdx) // c += cs_c;
+
 	label(.CDONE)
-    
-    vzeroupper()
-	
+
 	vzeroupper()
-	
 
-    end_asm(
+
+	end_asm(
 	: // output operands (none)
 	: // input operands
-      [k_iter] "m" (k_iter), // 0
-      [k_left] "m" (k_left), // 1
-      [a]      "m" (a),      // 2
-      [b]      "m" (b),      // 3
-      [alpha]  "m" (alpha),  // 4
-      [beta]   "m" (beta),   // 5
-      [c]      "m" (c),      // 6
-      [rs_c]   "m" (rs_c),   // 7
-      [cs_c]   "m" (cs_c),   // 8
-      [b_next] "m" (b_next)/*, // 9
-      [a_next] "m" (a_next)*/  // 10
+	  [k_iter] "m" (k_iter), // 0
+	  [k_left] "m" (k_left), // 1
+	  [a]      "m" (a),      // 2
+	  [b]      "m" (b),      // 3
+	  [alpha]  "m" (alpha),  // 4
+	  [beta]   "m" (beta),   // 5
+	  [c]      "m" (c),      // 6
+	  [rs_c]   "m" (rs_c),   // 7
+	  [cs_c]   "m" (cs_c),   // 8
+	  [b_next] "m" (b_next)/*, // 9
+	  [a_next] "m" (a_next)*/  // 10
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
@@ -2660,13 +1691,17 @@ void bli_cgemm_sandybridge_asm_8x4
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( c );
 }
 
 
 void bli_zgemm_sandybridge_asm_4x4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        dcomplex*  restrict alpha,
        dcomplex*  restrict a,
        dcomplex*  restrict b,
@@ -2681,34 +1716,36 @@ void bli_zgemm_sandybridge_asm_4x4
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = k / 4;
+	uint64_t k_left = k % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMM_UKR_SETUP_CT( z, 4, 4, false );
+
 	begin_asm()
-	
-	
+
+
 	mov(var(a), rax) // load address of a.
 	mov(var(b), rbx) // load address of b.
 	//mov(var(b_next), r15) // load address of b_next.
 	//mov(var(a_next), r14) // load address of a_next.
-	
+
 	vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
 	vmovddup(mem(rbx, 0+0*32), ymm2)
 	vmovddup(mem(rbx, 0+1*32), ymm3)
-	
+
 	mov(var(c), rcx) // load address of c
 	mov(var(cs_c), rdi) // load cs_c
 	lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex)
 	lea(mem(, rdi, 2), rdi)
 	lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
-	
+
 	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c
 	prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c
 	prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c
-	
+
 	vxorpd(ymm8, ymm8, ymm8)
 	vxorpd(ymm9, ymm9, ymm9)
 	vxorpd(ymm10, ymm10, ymm10)
@@ -2717,18 +1754,18 @@ void bli_zgemm_sandybridge_asm_4x4
 	vxorpd(ymm13, ymm13, ymm13)
 	vxorpd(ymm14, ymm14, ymm14)
 	vxorpd(ymm15, ymm15, ymm15)
-	
-	
-	
+
+
+
 	mov(var(k_iter), rsi) // i = k_iter;
 	test(rsi, rsi) // check i via logical AND.
 	je(.ZCONSIDKLEFT) // if i == 0, jump to code that
 	 // contains the k_left loop.
-	
-	
+
+
 	label(.ZLOOPKITER) // MAIN LOOP
-	
-	
+
+
 	 // iteration 0
 	vmovapd(mem(rax, 1*32), ymm1)
 	vmulpd(ymm0, ymm2, ymm6)
@@ -2737,7 +1774,7 @@ void bli_zgemm_sandybridge_asm_4x4
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddpd(ymm6, ymm15, ymm15)
 	vaddpd(ymm7, ymm11, ymm11)
-	
+
 	prefetch(0, mem(rax, 16*32))
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovddup(mem(rbx, 8+0*32), ymm2)
@@ -2745,45 +1782,45 @@ void bli_zgemm_sandybridge_asm_4x4
 	vmovddup(mem(rbx, 8+1*32), ymm3)
 	vaddpd(ymm6, ymm14, ymm14)
 	vaddpd(ymm7, ymm10, ymm10)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vpermilpd(imm(0x5), ymm0, ymm0)
 	vaddpd(ymm6, ymm13, ymm13)
 	vaddpd(ymm7, ymm9, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vmulpd(ymm1, ymm5, ymm7)
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddpd(ymm6, ymm12, ymm12)
 	vaddpd(ymm7, ymm8, ymm8)
-	
+
 	vpermilpd(imm(0x5), ymm1, ymm1)
 	vmulpd(ymm0, ymm2, ymm6)
 	vmulpd(ymm0, ymm3, ymm7)
 	vaddsubpd(ymm6, ymm15, ymm15)
 	vaddsubpd(ymm7, ymm11, ymm11)
-	
+
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovddup(mem(rbx, 0+2*32), ymm2)
 	vmulpd(ymm1, ymm3, ymm7)
 	vmovddup(mem(rbx, 0+3*32), ymm3)
 	vaddsubpd(ymm6, ymm14, ymm14)
 	vaddsubpd(ymm7, ymm10, ymm10)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vmovapd(mem(rax, 2*32), ymm0)
 	vaddsubpd(ymm6, ymm13, ymm13)
 	vaddsubpd(ymm7, ymm9, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vmulpd(ymm1, ymm5, ymm7)
 	vaddsubpd(ymm6, ymm12, ymm12)
 	vaddsubpd(ymm7, ymm8, ymm8)
-	
-	
+
+
 	 // iteration 1
 	vmovapd(mem(rax, 3*32), ymm1)
 	vmulpd(ymm0, ymm2, ymm6)
@@ -2792,7 +1829,7 @@ void bli_zgemm_sandybridge_asm_4x4
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddpd(ymm6, ymm15, ymm15)
 	vaddpd(ymm7, ymm11, ymm11)
-	
+
 	prefetch(0, mem(rax, 18*32))
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovddup(mem(rbx, 8+2*32), ymm2)
@@ -2800,45 +1837,45 @@ void bli_zgemm_sandybridge_asm_4x4
 	vmovddup(mem(rbx, 8+3*32), ymm3)
 	vaddpd(ymm6, ymm14, ymm14)
 	vaddpd(ymm7, ymm10, ymm10)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vpermilpd(imm(0x5), ymm0, ymm0)
 	vaddpd(ymm6, ymm13, ymm13)
 	vaddpd(ymm7, ymm9, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vmulpd(ymm1, ymm5, ymm7)
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddpd(ymm6, ymm12, ymm12)
 	vaddpd(ymm7, ymm8, ymm8)
-	
+
 	vpermilpd(imm(0x5), ymm1, ymm1)
 	vmulpd(ymm0, ymm2, ymm6)
 	vmulpd(ymm0, ymm3, ymm7)
 	vaddsubpd(ymm6, ymm15, ymm15)
 	vaddsubpd(ymm7, ymm11, ymm11)
-	
+
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovddup(mem(rbx, 0+4*32), ymm2)
 	vmulpd(ymm1, ymm3, ymm7)
 	vmovddup(mem(rbx, 0+5*32), ymm3)
 	vaddsubpd(ymm6, ymm14, ymm14)
 	vaddsubpd(ymm7, ymm10, ymm10)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vmovapd(mem(rax, 4*32), ymm0)
 	vaddsubpd(ymm6, ymm13, ymm13)
 	vaddsubpd(ymm7, ymm9, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vmulpd(ymm1, ymm5, ymm7)
 	vaddsubpd(ymm6, ymm12, ymm12)
 	vaddsubpd(ymm7, ymm8, ymm8)
-	
-	
+
+
 	 // iteration 2
 	vmovapd(mem(rax, 5*32), ymm1)
 	vmulpd(ymm0, ymm2, ymm6)
@@ -2847,7 +1884,7 @@ void bli_zgemm_sandybridge_asm_4x4
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddpd(ymm6, ymm15, ymm15)
 	vaddpd(ymm7, ymm11, ymm11)
-	
+
 	prefetch(0, mem(rax, 20*32))
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovddup(mem(rbx, 8+4*32), ymm2)
@@ -2855,45 +1892,45 @@ void bli_zgemm_sandybridge_asm_4x4
 	vmovddup(mem(rbx, 8+5*32), ymm3)
 	vaddpd(ymm6, ymm14, ymm14)
 	vaddpd(ymm7, ymm10, ymm10)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vpermilpd(imm(0x5), ymm0, ymm0)
 	vaddpd(ymm6, ymm13, ymm13)
 	vaddpd(ymm7, ymm9, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vmulpd(ymm1, ymm5, ymm7)
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddpd(ymm6, ymm12, ymm12)
 	vaddpd(ymm7, ymm8, ymm8)
-	
+
 	vpermilpd(imm(0x5), ymm1, ymm1)
 	vmulpd(ymm0, ymm2, ymm6)
 	vmulpd(ymm0, ymm3, ymm7)
 	vaddsubpd(ymm6, ymm15, ymm15)
 	vaddsubpd(ymm7, ymm11, ymm11)
-	
+
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovddup(mem(rbx, 0+6*32), ymm2)
 	vmulpd(ymm1, ymm3, ymm7)
 	vmovddup(mem(rbx, 0+7*32), ymm3)
 	vaddsubpd(ymm6, ymm14, ymm14)
 	vaddsubpd(ymm7, ymm10, ymm10)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vmovapd(mem(rax, 6*32), ymm0)
 	vaddsubpd(ymm6, ymm13, ymm13)
 	vaddsubpd(ymm7, ymm9, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vmulpd(ymm1, ymm5, ymm7)
 	vaddsubpd(ymm6, ymm12, ymm12)
 	vaddsubpd(ymm7, ymm8, ymm8)
-	
-	
+
+
 	 // iteration 3
 	vmovapd(mem(rax, 7*32), ymm1)
 	vmulpd(ymm0, ymm2, ymm6)
@@ -2902,7 +1939,7 @@ void bli_zgemm_sandybridge_asm_4x4
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddpd(ymm6, ymm15, ymm15)
 	vaddpd(ymm7, ymm11, ymm11)
-	
+
 	prefetch(0, mem(rax, 22*32))
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovddup(mem(rbx, 8+6*32), ymm2)
@@ -2910,67 +1947,67 @@ void bli_zgemm_sandybridge_asm_4x4
 	vmovddup(mem(rbx, 8+7*32), ymm3)
 	vaddpd(ymm6, ymm14, ymm14)
 	vaddpd(ymm7, ymm10, ymm10)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vpermilpd(imm(0x5), ymm0, ymm0)
 	vaddpd(ymm6, ymm13, ymm13)
 	vaddpd(ymm7, ymm9, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vmulpd(ymm1, ymm5, ymm7)
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddpd(ymm6, ymm12, ymm12)
 	vaddpd(ymm7, ymm8, ymm8)
-	
+
 	vpermilpd(imm(0x5), ymm1, ymm1)
 	vmulpd(ymm0, ymm2, ymm6)
 	vmulpd(ymm0, ymm3, ymm7)
 	vaddsubpd(ymm6, ymm15, ymm15)
 	vaddsubpd(ymm7, ymm11, ymm11)
-	
+
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovddup(mem(rbx, 0+8*32), ymm2)
 	vmulpd(ymm1, ymm3, ymm7)
 	vmovddup(mem(rbx, 0+9*32), ymm3)
 	vaddsubpd(ymm6, ymm14, ymm14)
 	vaddsubpd(ymm7, ymm10, ymm10)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vmovapd(mem(rax, 8*32), ymm0)
 	vaddsubpd(ymm6, ymm13, ymm13)
 	vaddsubpd(ymm7, ymm9, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vmulpd(ymm1, ymm5, ymm7)
 	vaddsubpd(ymm6, ymm12, ymm12)
 	vaddsubpd(ymm7, ymm8, ymm8)
-	
-	
+
+
 	add(imm(4*4*16), rbx) // b += 4*4 (unroll x nr)
 	add(imm(4*4*16), rax) // a += 4*4 (unroll x mr)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jne(.ZLOOPKITER) // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.ZCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi) // i = k_left;
 	test(rsi, rsi) // check i via logical AND.
 	je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
 	 // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.ZLOOPKLEFT) // EDGE LOOP
-	
+
 	 // iteration 0
 	vmovapd(mem(rax, 1*32), ymm1)
 	vmulpd(ymm0, ymm2, ymm6)
@@ -2979,7 +2016,7 @@ void bli_zgemm_sandybridge_asm_4x4
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddpd(ymm6, ymm15, ymm15)
 	vaddpd(ymm7, ymm11, ymm11)
-	
+
 	prefetch(0, mem(rax, 16*32))
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovddup(mem(rbx, 8+0*32), ymm2)
@@ -2987,166 +2024,166 @@ void bli_zgemm_sandybridge_asm_4x4
 	vmovddup(mem(rbx, 8+1*32), ymm3)
 	vaddpd(ymm6, ymm14, ymm14)
 	vaddpd(ymm7, ymm10, ymm10)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vpermilpd(imm(0x5), ymm0, ymm0)
 	vaddpd(ymm6, ymm13, ymm13)
 	vaddpd(ymm7, ymm9, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
 	vmulpd(ymm1, ymm5, ymm7)
 	vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
 	vaddpd(ymm6, ymm12, ymm12)
 	vaddpd(ymm7, ymm8, ymm8)
-	
+
 	vpermilpd(imm(0x5), ymm1, ymm1)
 	vmulpd(ymm0, ymm2, ymm6)
 	vmulpd(ymm0, ymm3, ymm7)
 	vaddsubpd(ymm6, ymm15, ymm15)
 	vaddsubpd(ymm7, ymm11, ymm11)
-	
+
 	vmulpd(ymm1, ymm2, ymm6)
 	vmovddup(mem(rbx, 0+2*32), ymm2)
 	vmulpd(ymm1, ymm3, ymm7)
 	vmovddup(mem(rbx, 0+3*32), ymm3)
 	vaddsubpd(ymm6, ymm14, ymm14)
 	vaddsubpd(ymm7, ymm10, ymm10)
-	
+
 	vmulpd(ymm0, ymm4, ymm6)
 	vmulpd(ymm0, ymm5, ymm7)
 	vmovapd(mem(rax, 2*32), ymm0)
 	vaddsubpd(ymm6, ymm13, ymm13)
 	vaddsubpd(ymm7, ymm9, ymm9)
-	
+
 	vmulpd(ymm1, ymm4, ymm6)
 	vmulpd(ymm1, ymm5, ymm7)
 	vaddsubpd(ymm6, ymm12, ymm12)
 	vaddsubpd(ymm7, ymm8, ymm8)
-	
-	
+
+
 	add(imm(4*1*16), rax) // a += 4 (1 x mr)
 	add(imm(4*1*16), rbx) // b += 4 (1 x nr)
-	
-	
+
+
 	dec(rsi) // i -= 1;
 	jne(.ZLOOPKLEFT) // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.ZPOSTACCUM)
-	
+
 	 // ymm15:  ymm13:  ymm11:  ymm9:
 	 // ( ab00  ( ab01  ( ab02  ( ab03
-	 //   ab10    ab11    ab12    ab13  
+	 //   ab10    ab11    ab12    ab13
 	 //   ab21    ab20    ab23    ab22
 	 //   ab31 )  ab30 )  ab33 )  ab32 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
 	 // ( ab40  ( ab41  ( ab42  ( ab43
-	 //   ab50    ab51    ab52    ab53  
+	 //   ab50    ab51    ab52    ab53
 	 //   ab61    ab60    ab63    ab62
 	 //   ab71 )  ab70 )  ab73 )  ab72 )
-	
-	
+
+
 	vmovapd(ymm15, ymm7)
 	vperm2f128(imm(0x12), ymm15, ymm13, ymm15)
 	vperm2f128(imm(0x30), ymm7, ymm13, ymm13)
-	
+
 	vmovapd(ymm11, ymm7)
 	vperm2f128(imm(0x12), ymm11, ymm9, ymm11)
 	vperm2f128(imm(0x30), ymm7, ymm9, ymm9)
-	
+
 	vmovapd(ymm14, ymm7)
 	vperm2f128(imm(0x12), ymm14, ymm12, ymm14)
 	vperm2f128(imm(0x30), ymm7, ymm12, ymm12)
-	
+
 	vmovapd(ymm10, ymm7)
 	vperm2f128(imm(0x12), ymm10, ymm8, ymm10)
 	vperm2f128(imm(0x30), ymm7, ymm8, ymm8)
-	
-	
+
+
 	 // ymm15:  ymm13:  ymm11:  ymm9:
 	 // ( ab00  ( ab01  ( ab02  ( ab03
-	 //   ab10    ab11    ab12    ab13  
+	 //   ab10    ab11    ab12    ab13
 	 //   ab20    ab21    ab22    ab23
 	 //   ab30 )  ab31 )  ab32 )  ab33 )
-	
+
 	 // ymm14:  ymm12:  ymm10:  ymm8:
 	 // ( ab40  ( ab41  ( ab42  ( ab43
-	 //   ab50    ab51    ab52    ab53  
+	 //   ab50    ab51    ab52    ab53
 	 //   ab60    ab61    ab62    ab63
 	 //   ab70 )  ab71 )  ab72 )  ab73 )
-	
-	
+
+
 	 // scale by alpha
-	
+
 	mov(var(alpha), rax) // load address of alpha
 	vbroadcastsd(mem(rax), ymm7) // load alpha_r and duplicate
 	vbroadcastsd(mem(rax, 8), ymm6) // load alpha_i and duplicate
-	
+
 	vpermilpd(imm(0x5), ymm15, ymm3)
 	vmulpd(ymm7, ymm15, ymm15)
 	vmulpd(ymm6, ymm3, ymm3)
 	vaddsubpd(ymm3, ymm15, ymm15)
-	
+
 	vpermilpd(imm(0x5), ymm14, ymm2)
 	vmulpd(ymm7, ymm14, ymm14)
 	vmulpd(ymm6, ymm2, ymm2)
 	vaddsubpd(ymm2, ymm14, ymm14)
-	
+
 	vpermilpd(imm(0x5), ymm13, ymm1)
 	vmulpd(ymm7, ymm13, ymm13)
 	vmulpd(ymm6, ymm1, ymm1)
 	vaddsubpd(ymm1, ymm13, ymm13)
-	
+
 	vpermilpd(imm(0x5), ymm12, ymm0)
 	vmulpd(ymm7, ymm12, ymm12)
 	vmulpd(ymm6, ymm0, ymm0)
 	vaddsubpd(ymm0, ymm12, ymm12)
-	
+
 	vpermilpd(imm(0x5), ymm11, ymm3)
 	vmulpd(ymm7, ymm11, ymm11)
 	vmulpd(ymm6, ymm3, ymm3)
 	vaddsubpd(ymm3, ymm11, ymm11)
-	
+
 	vpermilpd(imm(0x5), ymm10, ymm2)
 	vmulpd(ymm7, ymm10, ymm10)
 	vmulpd(ymm6, ymm2, ymm2)
 	vaddsubpd(ymm2, ymm10, ymm10)
-	
+
 	vpermilpd(imm(0x5), ymm9, ymm1)
 	vmulpd(ymm7, ymm9, ymm9)
 	vmulpd(ymm6, ymm1, ymm1)
 	vaddsubpd(ymm1, ymm9, ymm9)
-	
+
 	vpermilpd(imm(0x5), ymm8, ymm0)
 	vmulpd(ymm7, ymm8, ymm8)
 	vmulpd(ymm6, ymm0, ymm0)
 	vaddsubpd(ymm0, ymm8, ymm8)
-	
-	
-	
-	
+
+
+
+
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastsd(mem(rbx), ymm7) // load beta_r and duplicate
 	vbroadcastsd(mem(rbx, 8), ymm6) // load beta_i and duplicate
-	
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
+
 	mov(var(rs_c), rsi) // load rs_c
 	lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex)
 	lea(mem(, rsi, 2), rsi)
 	lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c;
-	
-	
+
+
 	 // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
 	vucomisd(xmm0, xmm7) // set ZF if beta_r == 0.
 	sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 );
@@ -3154,355 +2191,142 @@ void bli_zgemm_sandybridge_asm_4x4
 	sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 );
 	and(r8b, r9b) // set ZF if r8b & r9b == 1.
 	jne(.ZBETAZERO) // if ZF = 0, jump to beta == 0 case
-	
-	
-	cmp(imm(16), rsi) // set ZF if (16*cs_c) == 16.
-	jz(.ZCOLSTORED) // jump to column storage case
-	
-	
-	
-	label(.ZGENSTORED)
-	 // update c00:c30
-	
-	vmovupd(mem(rcx), xmm0) // load (c00,c10) into xmm0
-	vmovupd(mem(rcx, rsi, 1), xmm2) // load (c20,c30) into xmm2
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
-	vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
-	vmulpd(ymm7, ymm0, ymm0)
-	vmulpd(ymm6, ymm2, ymm2)
-	vaddsubpd(ymm2, ymm0, ymm0)
-	vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
-	vmovupd(xmm0, mem(rcx)) // store (c00,c10)
-	vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c20,c30)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c40:c70
-	
-	vmovupd(mem(rdx), xmm0) // load (c40,c50) into xmm0
-	vmovupd(mem(rdx, rsi, 1), xmm2) // load (c60,c70) into xmm2
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
-	vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
-	vmulpd(ymm7, ymm0, ymm0)
-	vmulpd(ymm6, ymm2, ymm2)
-	vaddsubpd(ymm2, ymm0, ymm0)
-	vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
-	vmovupd(xmm0, mem(rdx)) // store (c40,c50)
-	vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c60,c70)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c01:c31
-	
-	vmovupd(mem(rcx), xmm0) // load (c01,c11) into xmm0
-	vmovupd(mem(rcx, rsi, 1), xmm2) // load (c21,c31) into xmm2
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
-	vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
-	vmulpd(ymm7, ymm0, ymm0)
-	vmulpd(ymm6, ymm2, ymm2)
-	vaddsubpd(ymm2, ymm0, ymm0)
-	vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
-	vmovupd(xmm0, mem(rcx)) // store (c01,c11)
-	vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c21,c31)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c41:c71
-	
-	vmovupd(mem(rdx), xmm0) // load (c41,c51) into xmm0
-	vmovupd(mem(rdx, rsi, 1), xmm2) // load (c61,c71) into xmm2
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
-	vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
-	vmulpd(ymm7, ymm0, ymm0)
-	vmulpd(ymm6, ymm2, ymm2)
-	vaddsubpd(ymm2, ymm0, ymm0)
-	vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
-	vmovupd(xmm0, mem(rdx)) // store (c41,c51)
-	vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c61,c71)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c02:c32
-	
-	vmovupd(mem(rcx), xmm0) // load (c02,c12) into xmm0
-	vmovupd(mem(rcx, rsi, 1), xmm2) // load (c22,c32) into xmm2
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
-	vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
-	vmulpd(ymm7, ymm0, ymm0)
-	vmulpd(ymm6, ymm2, ymm2)
-	vaddsubpd(ymm2, ymm0, ymm0)
-	vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
-	vmovupd(xmm0, mem(rcx)) // store (c02,c12)
-	vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c22,c32)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c42:c72
-	
-	vmovupd(mem(rdx), xmm0) // load (c42,c52) into xmm0
-	vmovupd(mem(rdx, rsi, 1), xmm2) // load (c62,c72) into xmm2
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
-	vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
-	vmulpd(ymm7, ymm0, ymm0)
-	vmulpd(ymm6, ymm2, ymm2)
-	vaddsubpd(ymm2, ymm0, ymm0)
-	vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
-	vmovupd(xmm0, mem(rdx)) // store (c42,c52)
-	vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c62,c72)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c03:c33
-	
-	vmovupd(mem(rcx), xmm0) // load (c03,c13) into xmm0
-	vmovupd(mem(rcx, rsi, 1), xmm2) // load (c23,c33) into xmm2
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
-	vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
-	vmulpd(ymm7, ymm0, ymm0)
-	vmulpd(ymm6, ymm2, ymm2)
-	vaddsubpd(ymm2, ymm0, ymm0)
-	vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
-	vmovupd(xmm0, mem(rcx)) // store (c03,c13)
-	vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c23,c33)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c43:c73
-	
-	vmovupd(mem(rdx), xmm0) // load (c43,c53) into xmm0
-	vmovupd(mem(rdx, rsi, 1), xmm2) // load (c63,c73) into xmm2
-	vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
-	vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
-	vmulpd(ymm7, ymm0, ymm0)
-	vmulpd(ymm6, ymm2, ymm2)
-	vaddsubpd(ymm2, ymm0, ymm0)
-	vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0
-	vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
-	vmovupd(xmm0, mem(rdx)) // store (c43,c53)
-	vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c63,c73)
-	
-	
-	
-	jmp(.ZDONE) // jump to end.
-	
-	
-	
-	label(.ZCOLSTORED)
-	 // update c00:c30
-	
-	vmovupd(mem(rcx), ymm0) // load c00:c30 into ymm0
-	vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
-	vmulpd(ymm7, ymm0, ymm0)
-	vmulpd(ymm6, ymm2, ymm2)
-	vaddsubpd(ymm2, ymm0, ymm0)
-	vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0
-	vmovupd(ymm0, mem(rcx)) // store c00:c30
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c40:c70
-	
-	vmovupd(mem(rdx), ymm0) // load c40:c70 into ymm0
-	vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
-	vmulpd(ymm7, ymm0, ymm0)
-	vmulpd(ymm6, ymm2, ymm2)
-	vaddsubpd(ymm2, ymm0, ymm0)
-	vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0
-	vmovupd(ymm0, mem(rdx)) // store c40:c70
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c01:c31
-	
-	vmovupd(mem(rcx), ymm0) // load c01:c31 into ymm0
-	vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
-	vmulpd(ymm7, ymm0, ymm0)
-	vmulpd(ymm6, ymm2, ymm2)
-	vaddsubpd(ymm2, ymm0, ymm0)
-	vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0
-	vmovupd(ymm0, mem(rcx)) // store c01:c31
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c41:c71
-	
-	vmovupd(mem(rdx), ymm0) // load c41:c71 into ymm0
-	vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
-	vmulpd(ymm7, ymm0, ymm0)
-	vmulpd(ymm6, ymm2, ymm2)
-	vaddsubpd(ymm2, ymm0, ymm0)
-	vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0
-	vmovupd(ymm0, mem(rdx)) // store c41:c71
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c02:c32
-	
-	vmovupd(mem(rcx), ymm0) // load c02:c32 into ymm0
-	vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
-	vmulpd(ymm7, ymm0, ymm0)
-	vmulpd(ymm6, ymm2, ymm2)
-	vaddsubpd(ymm2, ymm0, ymm0)
-	vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0
-	vmovupd(ymm0, mem(rcx)) // store c02:c32
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c42:c72
-	
-	vmovupd(mem(rdx), ymm0) // load c42:c72 into ymm0
-	vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
-	vmulpd(ymm7, ymm0, ymm0)
-	vmulpd(ymm6, ymm2, ymm2)
-	vaddsubpd(ymm2, ymm0, ymm0)
-	vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0
-	vmovupd(ymm0, mem(rdx)) // store c42:c72
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c03:c33
-	
-	vmovupd(mem(rcx), ymm0) // load c03:c33 into ymm0
-	vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
-	vmulpd(ymm7, ymm0, ymm0)
-	vmulpd(ymm6, ymm2, ymm2)
-	vaddsubpd(ymm2, ymm0, ymm0)
-	vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0
-	vmovupd(ymm0, mem(rcx)) // store c03:c33
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c43:c73
-	
-	vmovupd(mem(rdx), ymm0) // load c43:c73 into ymm0
-	vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
-	vmulpd(ymm7, ymm0, ymm0)
-	vmulpd(ymm6, ymm2, ymm2)
-	vaddsubpd(ymm2, ymm0, ymm0)
-	vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0
-	vmovupd(ymm0, mem(rdx)) // store c43:c73
-	
-	
-	
-	jmp(.ZDONE) // jump to end.
-	
-	
-	
+
+		 // update c00:c30
+
+		vmovupd(mem(rcx), ymm0) // load c00:c30 into ymm0
+		vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
+		vmulpd(ymm7, ymm0, ymm0)
+		vmulpd(ymm6, ymm2, ymm2)
+		vaddsubpd(ymm2, ymm0, ymm0)
+		vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0
+		vmovupd(ymm0, mem(rcx)) // store c00:c30
+		add(rdi, rcx) // c += cs_c;
+
+		 // update c40:c70
+
+		vmovupd(mem(rdx), ymm0) // load c40:c70 into ymm0
+		vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
+		vmulpd(ymm7, ymm0, ymm0)
+		vmulpd(ymm6, ymm2, ymm2)
+		vaddsubpd(ymm2, ymm0, ymm0)
+		vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0
+		vmovupd(ymm0, mem(rdx)) // store c40:c70
+		add(rdi, rdx) // c += cs_c;
+
+		 // update c01:c31
+
+		vmovupd(mem(rcx), ymm0) // load c01:c31 into ymm0
+		vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
+		vmulpd(ymm7, ymm0, ymm0)
+		vmulpd(ymm6, ymm2, ymm2)
+		vaddsubpd(ymm2, ymm0, ymm0)
+		vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0
+		vmovupd(ymm0, mem(rcx)) // store c01:c31
+		add(rdi, rcx) // c += cs_c;
+
+		 // update c41:c71
+
+		vmovupd(mem(rdx), ymm0) // load c41:c71 into ymm0
+		vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
+		vmulpd(ymm7, ymm0, ymm0)
+		vmulpd(ymm6, ymm2, ymm2)
+		vaddsubpd(ymm2, ymm0, ymm0)
+		vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0
+		vmovupd(ymm0, mem(rdx)) // store c41:c71
+		add(rdi, rdx) // c += cs_c;
+
+		 // update c02:c32
+
+		vmovupd(mem(rcx), ymm0) // load c02:c32 into ymm0
+		vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
+		vmulpd(ymm7, ymm0, ymm0)
+		vmulpd(ymm6, ymm2, ymm2)
+		vaddsubpd(ymm2, ymm0, ymm0)
+		vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0
+		vmovupd(ymm0, mem(rcx)) // store c02:c32
+		add(rdi, rcx) // c += cs_c;
+
+		 // update c42:c72
+
+		vmovupd(mem(rdx), ymm0) // load c42:c72 into ymm0
+		vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
+		vmulpd(ymm7, ymm0, ymm0)
+		vmulpd(ymm6, ymm2, ymm2)
+		vaddsubpd(ymm2, ymm0, ymm0)
+		vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0
+		vmovupd(ymm0, mem(rdx)) // store c42:c72
+		add(rdi, rdx) // c += cs_c;
+
+		 // update c03:c33
+
+		vmovupd(mem(rcx), ymm0) // load c03:c33 into ymm0
+		vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
+		vmulpd(ymm7, ymm0, ymm0)
+		vmulpd(ymm6, ymm2, ymm2)
+		vaddsubpd(ymm2, ymm0, ymm0)
+		vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0
+		vmovupd(ymm0, mem(rcx)) // store c03:c33
+		add(rdi, rcx) // c += cs_c;
+
+		 // update c43:c73
+
+		vmovupd(mem(rdx), ymm0) // load c43:c73 into ymm0
+		vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta
+		vmulpd(ymm7, ymm0, ymm0)
+		vmulpd(ymm6, ymm2, ymm2)
+		vaddsubpd(ymm2, ymm0, ymm0)
+		vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0
+		vmovupd(ymm0, mem(rdx)) // store c43:c73
+
+		jmp(.ZDONE) // jump to end.
+
 	label(.ZBETAZERO)
-	
-	cmp(imm(16), rsi) // set ZF if (16*cs_c) == 16.
-	jz(.ZCOLSTORBZ) // jump to column storage case
-	
-	
-	
-	label(.ZGENSTORBZ)
-	 // update c00:c30
-	
-	vextractf128(imm(1), ymm15, xmm2)
-	vmovupd(xmm15, mem(rcx)) // store (c00,c10)
-	vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c20,c30)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c40:c70
-	
-	vextractf128(imm(1), ymm14, xmm2)
-	vmovupd(xmm14, mem(rdx)) // store (c40,c50)
-	vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c60,c70)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c01:c31
-	
-	vextractf128(imm(1), ymm13, xmm2)
-	vmovupd(xmm13, mem(rcx)) // store (c01,c11)
-	vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c21,c31)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c41:c71
-	
-	vextractf128(imm(1), ymm12, xmm2)
-	vmovupd(xmm12, mem(rdx)) // store (c41,c51)
-	vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c61,c71)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c02:c32
-	
-	vextractf128(imm(1), ymm11, xmm2)
-	vmovupd(xmm11, mem(rcx)) // store (c02,c12)
-	vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c22,c32)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c42:c72
-	
-	vextractf128(imm(1), ymm10, xmm2)
-	vmovupd(xmm10, mem(rdx)) // store (c42,c52)
-	vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c62,c72)
-	add(rdi, rdx) // c += cs_c;
-	
-	 // update c03:c33
-	
-	vextractf128(imm(1), ymm9, xmm2)
-	vmovupd(xmm9, mem(rcx)) // store (c03,c13)
-	vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c23,c33)
-	add(rdi, rcx) // c += cs_c;
-	
-	 // update c43:c73
-	
-	vextractf128(imm(1), ymm8, xmm2)
-	vmovupd(xmm8, mem(rdx)) // store (c43,c53)
-	vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c63,c73)
-	
-	
-	
-	jmp(.ZDONE) // jump to end.
-	
-	
-	
-	label(.ZCOLSTORBZ)
-	
-	
-	vmovupd(ymm15, mem(rcx)) // store c00:c30
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovupd(ymm14, mem(rdx)) // store c40:c70
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovupd(ymm13, mem(rcx)) // store c01:c31
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovupd(ymm12, mem(rdx)) // store c41:c71
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovupd(ymm11, mem(rcx)) // store c02:c32
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovupd(ymm10, mem(rdx)) // store c42:c72
-	add(rdi, rdx) // c += cs_c;
-	
-	vmovupd(ymm9, mem(rcx)) // store c03:c33
-	add(rdi, rcx) // c += cs_c;
-	
-	vmovupd(ymm8, mem(rdx)) // store c43:c73
-	
-	
-	
-	
-	
+
+		vmovupd(ymm15, mem(rcx)) // store c00:c30
+		add(rdi, rcx) // c += cs_c;
+
+		vmovupd(ymm14, mem(rdx)) // store c40:c70
+		add(rdi, rdx) // c += cs_c;
+
+		vmovupd(ymm13, mem(rcx)) // store c01:c31
+		add(rdi, rcx) // c += cs_c;
+
+		vmovupd(ymm12, mem(rdx)) // store c41:c71
+		add(rdi, rdx) // c += cs_c;
+
+		vmovupd(ymm11, mem(rcx)) // store c02:c32
+		add(rdi, rcx) // c += cs_c;
+
+		vmovupd(ymm10, mem(rdx)) // store c42:c72
+		add(rdi, rdx) // c += cs_c;
+
+		vmovupd(ymm9, mem(rcx)) // store c03:c33
+		add(rdi, rcx) // c += cs_c;
+
+		vmovupd(ymm8, mem(rdx)) // store c43:c73
+
 	label(.ZDONE)
-    
-    vzeroupper()
-	
+
 	vzeroupper()
-	
+
 
 	end_asm(
 	: // output operands (none)
 	: // input operands
-      [k_iter] "m" (k_iter), // 0
-      [k_left] "m" (k_left), // 1
-      [a]      "m" (a),      // 2
-      [b]      "m" (b),      // 3
-      [alpha]  "m" (alpha),  // 4
-      [beta]   "m" (beta),   // 5
-      [c]      "m" (c),      // 6
-      [rs_c]   "m" (rs_c),   // 7
-      [cs_c]   "m" (cs_c)/*,   // 8
-      [b_next] "m" (b_next), // 9
-      [a_next] "m" (a_next)*/  // 10
+	  [k_iter] "m" (k_iter), // 0
+	  [k_left] "m" (k_left), // 1
+	  [a]      "m" (a),      // 2
+	  [b]      "m" (b),      // 3
+	  [alpha]  "m" (alpha),  // 4
+	  [beta]   "m" (beta),   // 5
+	  [c]      "m" (c),      // 6
+	  [rs_c]   "m" (rs_c),   // 7
+	  [cs_c]   "m" (cs_c)/*,   // 8
+	  [b_next] "m" (b_next), // 9
+	  [a_next] "m" (a_next)*/  // 10
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
@@ -3510,6 +2334,8 @@ void bli_zgemm_sandybridge_asm_4x4
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMM_UKR_FLUSH_CT( z );
 }
 
 
diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c
index 6a1bb04f5..6bf991082 100644
--- a/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c
+++ b/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c
@@ -32,14 +32,17 @@
 
 */
 
-#include <immintrin.h> 
+#include <emmintrin.h>
+#include <immintrin.h>
 #include "blis.h"
 
 
 #if 0
 void bli_sgemm_sandybridge_int_8x8
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        float*     restrict alpha,
        float*     restrict a,
        float*     restrict b,
@@ -52,11 +55,11 @@ void bli_sgemm_sandybridge_int_8x8
 }
 #endif
 
-
-
 void bli_dgemm_sandybridge_int_8x4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        double*    restrict alpha,
        double*    restrict a,
        double*    restrict b,
@@ -66,19 +69,22 @@ void bli_dgemm_sandybridge_int_8x4
        cntx_t*    restrict cntx
      )
 {
+
 	//void* a_next = bli_auxinfo_next_a( data );
 	void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 2;
-	uint64_t k_left = k0 % 2;
+	uint64_t k_iter = k / 2;
+	uint64_t k_left = k % 2;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 	uint64_t i;
 
-        double *c00, *c01, *c02, *c03;
-        double *c40, *c41, *c42, *c43;
+	GEMM_UKR_SETUP_CT( d, 8, 4, false );
+
+	double *c00, *c01, *c02, *c03;
+	double *c40, *c41, *c42, *c43;
 
 	// Quad registers.
 	__m256d va0_3, va4_7;
@@ -87,23 +93,20 @@ void bli_dgemm_sandybridge_int_8x4
 	__m256d vb;
 	__m256d vB0;
 
-	__m256d va0_3b_0, va4_7b_0; 
-	__m256d va0_3b_1, va4_7b_1; 
-	__m256d va0_3b_2, va4_7b_2; 
-	__m256d va0_3b_3, va4_7b_3; 
-
-	__m256d va0_3b0, va4_7b0; 
-	__m256d va0_3b1, va4_7b1; 
-	__m256d va0_3b2, va4_7b2; 
-	__m256d va0_3b3, va4_7b3; 
+	__m256d va0_3b_0, va4_7b_0;
+	__m256d va0_3b_1, va4_7b_1;
+	__m256d va0_3b_2, va4_7b_2;
+	__m256d va0_3b_3, va4_7b_3;
 
+	__m256d va0_3b0, va4_7b0;
+	__m256d va0_3b1, va4_7b1;
+	__m256d va0_3b2, va4_7b2;
+	__m256d va0_3b3, va4_7b3;
 
-	__m256d valpha, vbeta, vtmp; 
+	__m256d valpha, vbeta, vtmp;
 	__m256d vc0_3_0, vc0_3_1, vc0_3_2, vc0_3_3;
 	__m256d vc4_7_0, vc4_7_1, vc4_7_2, vc4_7_3;
 
-	__m128d aa, bb;
-	
 	__asm__ volatile( "prefetcht0 0(%0)          \n\t" : :"r"(a)  );
 	__asm__ volatile( "prefetcht2 0(%0)          \n\t" : :"r"(b_next)  );
 	__asm__ volatile( "prefetcht0 0(%0)          \n\t" : :"r"(c)  );
@@ -129,19 +132,19 @@ void bli_dgemm_sandybridge_int_8x4
 	va4_7b_3 = _mm256_setzero_pd();
 
 	// Load va0_3
- 	va0_3 = _mm256_load_pd( a );
+	va0_3 = _mm256_load_pd( a );
 	// Load va4_7
- 	va4_7 = _mm256_load_pd( a + 4 );
+	va4_7 = _mm256_load_pd( a + 4 );
 
-	// Load vb (b0,b1,b2,b3) 
- 	vb0 = _mm256_load_pd( b );
+	// Load vb (b0,b1,b2,b3)
+	vb0 = _mm256_load_pd( b );
 
 	for( i = 0; i < k_iter; ++i )
 	{
 		__asm__ volatile( "prefetcht0 192(%0)          \n\t" : :"r"(a)  );
 
 		// Load va0_3 (Prefetch)
- 		vA0_3 = _mm256_load_pd( a + 8 );
+		vA0_3 = _mm256_load_pd( a + 8 );
 
 		// Iteration 0.
 		vtmp = _mm256_mul_pd( va0_3, vb0 );
@@ -151,10 +154,10 @@ void bli_dgemm_sandybridge_int_8x4
 		va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );
 
 		// Load va4_7 (Prefetch)
- 		vA4_7 = _mm256_load_pd( a + 12 );
+		vA4_7 = _mm256_load_pd( a + 12 );
 
 		// Shuffle vb (b1,b0,b3,b2)
- 		vb1 = _mm256_shuffle_pd( vb0, vb0, 0x5 );
+		vb1 = _mm256_shuffle_pd( vb0, vb0, 0x5 );
 
 		vtmp = _mm256_mul_pd( va0_3, vb1 );
 		va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );
@@ -163,10 +166,10 @@ void bli_dgemm_sandybridge_int_8x4
 		va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );
 
 		// Permute vb (b3,b2,b1,b0)
- 		vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 );
+		vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 );
 
 		// Load vb (b0,b1,b2,b3) (Prefetch)
- 		vB0 = _mm256_load_pd( b + 4 ); 
+		vB0 = _mm256_load_pd( b + 4 );
 
 		vtmp = _mm256_mul_pd( va0_3, vb2 );
 		va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );
@@ -175,7 +178,7 @@ void bli_dgemm_sandybridge_int_8x4
 		va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );
 
 		// Shuffle vb (b3,b2,b1,b0)
- 		vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 );
+		vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 );
 
 		vtmp = _mm256_mul_pd( va0_3, vb3 );
 		va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );
@@ -186,14 +189,14 @@ void bli_dgemm_sandybridge_int_8x4
 		// Iteration 1.
 
 		__asm__ volatile( "prefetcht0 512(%0)          \n\t" : :"r"(a)  );
-		
+
 		// Load va0_3 (Next iteration)
- 		va0_3 = _mm256_load_pd( a + 16 );
+		va0_3 = _mm256_load_pd( a + 16 );
 
 		vtmp = _mm256_mul_pd( vA0_3, vB0 );
 		va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );
 
- 		vb1 = _mm256_shuffle_pd( vB0, vB0, 0x5 );
+		vb1 = _mm256_shuffle_pd( vB0, vB0, 0x5 );
 
 		vtmp = _mm256_mul_pd( vA4_7, vB0 );
 		va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );
@@ -202,9 +205,9 @@ void bli_dgemm_sandybridge_int_8x4
 		va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );
 
 		// Load va4_7 (Next iteration)
- 		va4_7 = _mm256_load_pd( a + 20 );
+		va4_7 = _mm256_load_pd( a + 20 );
 
- 		vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 );
+		vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 );
 
 		vtmp = _mm256_mul_pd( vA4_7, vb1 );
 		va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );
@@ -212,13 +215,13 @@ void bli_dgemm_sandybridge_int_8x4
 		vtmp = _mm256_mul_pd( vA0_3, vb2 );
 		va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );
 
- 		vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 );
+		vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 );
 
 		vtmp = _mm256_mul_pd( vA4_7, vb2 );
 		va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );
 
 		// Load vb0(Next iteration)
- 		vb0 = _mm256_load_pd( b + 8 ); 
+		vb0 = _mm256_load_pd( b + 8 );
 
 		vtmp = _mm256_mul_pd( vA0_3, vb3 );
 		va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );
@@ -236,12 +239,12 @@ void bli_dgemm_sandybridge_int_8x4
 		// Iteration 0.
 
 		// Load va0_3
- 		va0_3 = _mm256_load_pd( a );
+		va0_3 = _mm256_load_pd( a );
 		// Load va4_7
- 		va4_7 = _mm256_load_pd( a + 4 );
+		va4_7 = _mm256_load_pd( a + 4 );
 
-		// Load vb (b0,b1,b2,b3) 
- 		vb = _mm256_load_pd( b );
+		// Load vb (b0,b1,b2,b3)
+		vb = _mm256_load_pd( b );
 
 		vtmp = _mm256_mul_pd( va0_3, vb );
 		va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );
@@ -250,7 +253,7 @@ void bli_dgemm_sandybridge_int_8x4
 		va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );
 
 		// Shuffle vb (b1,b0,b3,b2)
- 		vb = _mm256_shuffle_pd( vb, vb, 0x5 );
+		vb = _mm256_shuffle_pd( vb, vb, 0x5 );
 
 		vtmp = _mm256_mul_pd( va0_3, vb );
 		va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );
@@ -259,7 +262,7 @@ void bli_dgemm_sandybridge_int_8x4
 		va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );
 
 		// Permute vb (b3,b2,b1,b0)
- 		vb = _mm256_permute2f128_pd( vb, vb, 0x1 );
+		vb = _mm256_permute2f128_pd( vb, vb, 0x1 );
 
 		vtmp = _mm256_mul_pd( va0_3, vb );
 		va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );
@@ -268,7 +271,7 @@ void bli_dgemm_sandybridge_int_8x4
 		va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );
 
 		// Shuffle vb (b3,b2,b1,b0)
- 		vb = _mm256_shuffle_pd( vb, vb, 0x5 );
+		vb = _mm256_shuffle_pd( vb, vb, 0x5 );
 
 		vtmp = _mm256_mul_pd( va0_3, vb );
 		va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );
@@ -309,131 +312,73 @@ void bli_dgemm_sandybridge_int_8x4
 	va4_7b1 = _mm256_permute2f128_pd( vtmpa_4_7b_1, vtmpa_4_7b_3, 0x30 );
 	va4_7b2 = _mm256_permute2f128_pd( vtmpa_4_7b_3, vtmpa_4_7b_1, 0x30 );
 
-	if( rs_c == 1 )
+	__m128d vzero = _mm_setzero_pd( );
+
+	if( _mm_comieq_sd( _mm256_castpd256_pd128(vbeta), vzero ) )
 	{
 		// Calculate address
-		c00 = ( c + 0*rs_c + 0*cs_c );
-		// Load
-		//vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c  );
-		vc0_3_0 = _mm256_load_pd( c00  );
+		c00 = ( c + 0 + 0*cs_c );
 		// Scale by alpha
 		vtmp = _mm256_mul_pd( valpha, va0_3b0);
-		// Scale by beta
-		vc0_3_0 = _mm256_mul_pd( vbeta, vc0_3_0 );
-		// Add gemm result
-		vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp );
 		// Store back to memory
-		_mm256_store_pd( c00, vc0_3_0 );
-	
+		_mm256_store_pd( c00, vtmp );
+
 		// Calculate address
-		c40 = ( c + 4*rs_c + 0*cs_c );
-		// Load
-		//vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c  );
-		vc4_7_0 = _mm256_load_pd( c40  );
+		c40 = ( c + 4 + 0*cs_c );
 		// Scale by alpha
 		vtmp = _mm256_mul_pd( valpha, va4_7b0);
-		// Scale by beta
-		vc4_7_0 = _mm256_mul_pd( vbeta, vc4_7_0 );
-		// Add gemm result
-		vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp );
 		// Store back to memory
-		_mm256_store_pd( c40, vc4_7_0 );
-	
+		_mm256_store_pd( c40, vtmp );
+
 		// Calculate address
-		c01 = ( c + 0*rs_c + 1*cs_c );
-		// Load
-		//vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c  );
-		vc0_3_1 = _mm256_load_pd( c01  );
+		c01 = ( c + 0 + 1*cs_c );
 		// Scale by alpha
 		vtmp = _mm256_mul_pd( valpha, va0_3b1);
-		// Scale by beta
-		vc0_3_1 = _mm256_mul_pd( vbeta, vc0_3_1 );
-		// Add gemm result
-		vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp );
 		// Store back to memory
-		_mm256_store_pd( c01, vc0_3_1 );
-	
+		_mm256_store_pd( c01, vtmp );
+
 		// Calculate address
-		c41 = ( c + 4*rs_c + 1*cs_c );
-		// Load
-		//vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c  );
-		vc4_7_1 = _mm256_load_pd( c41  );
+		c41 = ( c + 4 + 1*cs_c );
 		// Scale by alpha
 		vtmp = _mm256_mul_pd( valpha, va4_7b1);
-		// Scale by beta
-		vc4_7_1 = _mm256_mul_pd( vbeta, vc4_7_1 );
-		// Add gemm result
-		vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp );
 		// Store back to memory
-		_mm256_store_pd( c41, vc4_7_1 );
-	
+		_mm256_store_pd( c41, vtmp );
+
 		// Calculate address
-		c02 = ( c + 0*rs_c + 2*cs_c );
-		// Load
-		//vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c  );
-		vc0_3_2 = _mm256_load_pd( c02 );
+		c02 = ( c + 0 + 2*cs_c );
 		// Scale by alpha
 		vtmp = _mm256_mul_pd( valpha, va0_3b2);
-		// Scale by beta
-		vc0_3_2 = _mm256_mul_pd( vbeta, vc0_3_2 );
-		// Add gemm result
-		vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp );
 		// Store back to memory
-		_mm256_store_pd( c02, vc0_3_2 );
-	
+		_mm256_store_pd( c02, vtmp );
+
 		// Calculate address
-		c42 = ( c + 4*rs_c + 2*cs_c );
-		// Load
-		//vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c  );
-		vc4_7_2 = _mm256_load_pd( c42 );
+		c42 = ( c + 4 + 2*cs_c );
 		// Scale by alpha
 		vtmp = _mm256_mul_pd( valpha, va4_7b2);
-		// Scale by beta
-		vc4_7_2 = _mm256_mul_pd( vbeta, vc4_7_2 );
-		// Add gemm result
-		vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp );
 		// Store back to memory
-		_mm256_store_pd( c42, vc4_7_2 );
-		
+		_mm256_store_pd( c42, vtmp );
+
 		// Calculate address
-		c03 = ( c + 0*rs_c + 3*cs_c );
-		// Load
-		//vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c  );
-		vc0_3_3 = _mm256_load_pd( c03 );
+		c03 = ( c + 0 + 3*cs_c );
 		// Scale by alpha
 		vtmp = _mm256_mul_pd( valpha, va0_3b3);
-		// Scale by beta
-		vc0_3_3 = _mm256_mul_pd( vbeta, vc0_3_3 );
-		// Add gemm result
-		vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp );
 		// Store back to memory
-		_mm256_store_pd( c03, vc0_3_3 );
-	
+		_mm256_store_pd( c03, vtmp );
+
 		// Calculate address
-		c43 = ( c + 4*rs_c + 3*cs_c );
-		// Load
-		//vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c  );
-		vc4_7_3 = _mm256_load_pd( c43 );
+		c43 = ( c + 4 + 3*cs_c );
 		// Scale by alpha
 		vtmp = _mm256_mul_pd( valpha, va4_7b3);
-		// Scale by beta
-		vc4_7_3 = _mm256_mul_pd( vbeta, vc4_7_3 );
-		// Add gemm result
-		vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp );
 		// Store back to memory
-		_mm256_store_pd( c43, vc4_7_3 );
-	
+		_mm256_store_pd( c43, vtmp );
 	}
 	else
 	{
 		// Calculate address
-		c00 = ( c + 0*rs_c + 0*cs_c );
+		c00 = ( c + 0 + 0*cs_c );
 		// Load
-		//vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c  );
-		vc0_3_0 = _mm256_set_pd( *(c + 3*rs_c + 0*cs_c ),  
-                                         *(c + 2*rs_c + 0*cs_c ), 
-                                         *(c + 1*rs_c + 0*cs_c ), 
-                                         *(c + 0*rs_c + 0*cs_c ) );
+		//vc0_3_0 = _mm256_load_pd( c + 0 + 0*cs_c  );
+		vc0_3_0 = _mm256_load_pd( c00  );
 		// Scale by alpha
 		vtmp = _mm256_mul_pd( valpha, va0_3b0);
 		// Scale by beta
@@ -441,24 +386,13 @@ void bli_dgemm_sandybridge_int_8x4
 		// Add gemm result
 		vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp );
 		// Store back to memory
-		//_mm256_store_pd( c00, vc0_3_0 );
-	
-		aa = _mm256_extractf128_pd( vc0_3_0, 0 ) ;
-		bb = _mm256_extractf128_pd( vc0_3_0, 1 ) ;
-
-		_mm_storel_pd( c + 0*rs_c + 0*cs_c, aa );
-		_mm_storeh_pd( c + 1*rs_c + 0*cs_c, aa );
-		_mm_storel_pd( c + 2*rs_c + 0*cs_c, bb );
-		_mm_storeh_pd( c + 3*rs_c + 0*cs_c, bb );
+		_mm256_store_pd( c00, vc0_3_0 );
 
 		// Calculate address
-		c40 = ( c + 4*rs_c + 0*cs_c );
+		c40 = ( c + 4 + 0*cs_c );
 		// Load
-		//vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c  );
-		vc4_7_0 = _mm256_set_pd( *(c + 7*rs_c + 0*cs_c ),  
-                                         *(c + 6*rs_c + 0*cs_c ), 
-                                         *(c + 5*rs_c + 0*cs_c ), 
-                                         *(c + 4*rs_c + 0*cs_c ) );
+		//vc4_7_0 = _mm256_load_pd( c + 4 + 0*cs_c  );
+		vc4_7_0 = _mm256_load_pd( c40  );
 		// Scale by alpha
 		vtmp = _mm256_mul_pd( valpha, va4_7b0);
 		// Scale by beta
@@ -466,24 +400,13 @@ void bli_dgemm_sandybridge_int_8x4
 		// Add gemm result
 		vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp );
 		// Store back to memory
-		//_mm256_store_pd( c40, vc4_7_0 );
-	
-		aa = _mm256_extractf128_pd( vc4_7_0, 0 ) ;
-		bb = _mm256_extractf128_pd( vc4_7_0, 1 ) ;
-
-		_mm_storel_pd( c + 4*rs_c + 0*cs_c, aa );
-		_mm_storeh_pd( c + 5*rs_c + 0*cs_c, aa );
-		_mm_storel_pd( c + 6*rs_c + 0*cs_c, bb );
-		_mm_storeh_pd( c + 7*rs_c + 0*cs_c, bb );
+		_mm256_store_pd( c40, vc4_7_0 );
 
 		// Calculate address
-		c01 = ( c + 0*rs_c + 1*cs_c );
+		c01 = ( c + 0 + 1*cs_c );
 		// Load
-		//vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c  );
-		vc0_3_1 = _mm256_set_pd( *(c + 3*rs_c + 1*cs_c ),  
-                                         *(c + 2*rs_c + 1*cs_c ), 
-                                         *(c + 1*rs_c + 1*cs_c ), 
-                                         *(c + 0*rs_c + 1*cs_c ) );
+		//vc0_3_1 = _mm256_load_pd( c + 0 + 1*cs_c  );
+		vc0_3_1 = _mm256_load_pd( c01  );
 		// Scale by alpha
 		vtmp = _mm256_mul_pd( valpha, va0_3b1);
 		// Scale by beta
@@ -491,24 +414,13 @@ void bli_dgemm_sandybridge_int_8x4
 		// Add gemm result
 		vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp );
 		// Store back to memory
-		//_mm256_store_pd( c01, vc0_3_1 );
-	
-		aa = _mm256_extractf128_pd( vc0_3_1, 0 ) ;
-		bb = _mm256_extractf128_pd( vc0_3_1, 1 ) ;
-
-		_mm_storel_pd( c + 0*rs_c + 1*cs_c, aa );
-		_mm_storeh_pd( c + 1*rs_c + 1*cs_c, aa );
-		_mm_storel_pd( c + 2*rs_c + 1*cs_c, bb );
-		_mm_storeh_pd( c + 3*rs_c + 1*cs_c, bb );
+		_mm256_store_pd( c01, vc0_3_1 );
 
 		// Calculate address
-		c41 = ( c + 4*rs_c + 1*cs_c );
+		c41 = ( c + 4 + 1*cs_c );
 		// Load
-		//vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c  );
-		vc4_7_1 = _mm256_set_pd( *(c + 7*rs_c + 1*cs_c ),  
-                                         *(c + 6*rs_c + 1*cs_c ), 
-                                         *(c + 5*rs_c + 1*cs_c ), 
-                                         *(c + 4*rs_c + 1*cs_c ) );
+		//vc4_7_1 = _mm256_load_pd( c + 4 + 1*cs_c  );
+		vc4_7_1 = _mm256_load_pd( c41  );
 		// Scale by alpha
 		vtmp = _mm256_mul_pd( valpha, va4_7b1);
 		// Scale by beta
@@ -516,24 +428,13 @@ void bli_dgemm_sandybridge_int_8x4
 		// Add gemm result
 		vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp );
 		// Store back to memory
-		//_mm256_store_pd( c41, vc4_7_1 );
-	
-		aa = _mm256_extractf128_pd( vc4_7_1, 0 ) ;
-		bb = _mm256_extractf128_pd( vc4_7_1, 1 ) ;
-
-		_mm_storel_pd( c + 4*rs_c + 1*cs_c, aa );
-		_mm_storeh_pd( c + 5*rs_c + 1*cs_c, aa );
-		_mm_storel_pd( c + 6*rs_c + 1*cs_c, bb );
-		_mm_storeh_pd( c + 7*rs_c + 1*cs_c, bb );
+		_mm256_store_pd( c41, vc4_7_1 );
 
 		// Calculate address
-		c02 = ( c + 0*rs_c + 2*cs_c );
+		c02 = ( c + 0 + 2*cs_c );
 		// Load
-		//vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c  );
-		vc0_3_2 = _mm256_set_pd( *(c + 3*rs_c + 2*cs_c ),  
-                                         *(c + 2*rs_c + 2*cs_c ), 
-                                         *(c + 1*rs_c + 2*cs_c ), 
-                                         *(c + 0*rs_c + 2*cs_c ) );
+		//vc0_3_2 = _mm256_load_pd( c + 0 + 2*cs_c  );
+		vc0_3_2 = _mm256_load_pd( c02 );
 		// Scale by alpha
 		vtmp = _mm256_mul_pd( valpha, va0_3b2);
 		// Scale by beta
@@ -541,24 +442,13 @@ void bli_dgemm_sandybridge_int_8x4
 		// Add gemm result
 		vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp );
 		// Store back to memory
-		//_mm256_store_pd( c02, vc0_3_2 );
-	
-		aa = _mm256_extractf128_pd( vc0_3_2, 0 ) ;
-		bb = _mm256_extractf128_pd( vc0_3_2, 1 ) ;
-
-		_mm_storel_pd( c + 0*rs_c + 2*cs_c, aa );
-		_mm_storeh_pd( c + 1*rs_c + 2*cs_c, aa );
-		_mm_storel_pd( c + 2*rs_c + 2*cs_c, bb );
-		_mm_storeh_pd( c + 3*rs_c + 2*cs_c, bb );
+		_mm256_store_pd( c02, vc0_3_2 );
 
 		// Calculate address
-		c42 = ( c + 4*rs_c + 2*cs_c );
+		c42 = ( c + 4 + 2*cs_c );
 		// Load
-		//vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c  );
-		vc4_7_2 = _mm256_set_pd( *(c + 7*rs_c + 2*cs_c ),  
-                                         *(c + 6*rs_c + 2*cs_c ), 
-                                         *(c + 5*rs_c + 2*cs_c ), 
-                                         *(c + 4*rs_c + 2*cs_c ) );
+		//vc4_7_2 = _mm256_load_pd( c + 4 + 2*cs_c  );
+		vc4_7_2 = _mm256_load_pd( c42 );
 		// Scale by alpha
 		vtmp = _mm256_mul_pd( valpha, va4_7b2);
 		// Scale by beta
@@ -566,24 +456,13 @@ void bli_dgemm_sandybridge_int_8x4
 		// Add gemm result
 		vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp );
 		// Store back to memory
-		//_mm256_store_pd( c42, vc4_7_2 );
-		
-		aa = _mm256_extractf128_pd( vc4_7_2, 0 ) ;
-		bb = _mm256_extractf128_pd( vc4_7_2, 1 ) ;
-
-		_mm_storel_pd( c + 4*rs_c + 2*cs_c, aa );
-		_mm_storeh_pd( c + 5*rs_c + 2*cs_c, aa );
-		_mm_storel_pd( c + 6*rs_c + 2*cs_c, bb );
-		_mm_storeh_pd( c + 7*rs_c + 2*cs_c, bb );
+		_mm256_store_pd( c42, vc4_7_2 );
 
 		// Calculate address
-		c03 = ( c + 0*rs_c + 3*cs_c );
+		c03 = ( c + 0 + 3*cs_c );
 		// Load
-		//vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c  );
-		vc0_3_3 = _mm256_set_pd( *(c + 3*rs_c + 3*cs_c ),  
-                                         *(c + 2*rs_c + 3*cs_c ), 
-                                         *(c + 1*rs_c + 3*cs_c ), 
-                                         *(c + 0*rs_c + 3*cs_c ) );
+		//vc0_3_3 = _mm256_load_pd( c + 0 + 3*cs_c  );
+		vc0_3_3 = _mm256_load_pd( c03 );
 		// Scale by alpha
 		vtmp = _mm256_mul_pd( valpha, va0_3b3);
 		// Scale by beta
@@ -591,24 +470,13 @@ void bli_dgemm_sandybridge_int_8x4
 		// Add gemm result
 		vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp );
 		// Store back to memory
-		//_mm256_store_pd( c03, vc0_3_3 );
-	
-		aa = _mm256_extractf128_pd( vc0_3_3, 0 ) ;
-		bb = _mm256_extractf128_pd( vc0_3_3, 1 ) ;
-
-		_mm_storel_pd( c + 0*rs_c + 3*cs_c, aa );
-		_mm_storeh_pd( c + 1*rs_c + 3*cs_c, aa );
-		_mm_storel_pd( c + 2*rs_c + 3*cs_c, bb );
-		_mm_storeh_pd( c + 3*rs_c + 3*cs_c, bb );
+		_mm256_store_pd( c03, vc0_3_3 );
 
 		// Calculate address
-		c43 = ( c + 4*rs_c + 3*cs_c );
+		c43 = ( c + 4 + 3*cs_c );
 		// Load
-		//vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c  );
-		vc4_7_3 = _mm256_set_pd( *(c + 7*rs_c + 3*cs_c ),  
-                                         *(c + 6*rs_c + 3*cs_c ), 
-                                         *(c + 5*rs_c + 3*cs_c ), 
-                                         *(c + 4*rs_c + 3*cs_c ) );
+		//vc4_7_3 = _mm256_load_pd( c + 4 + 3*cs_c  );
+		vc4_7_3 = _mm256_load_pd( c43 );
 		// Scale by alpha
 		vtmp = _mm256_mul_pd( valpha, va4_7b3);
 		// Scale by beta
@@ -616,17 +484,10 @@ void bli_dgemm_sandybridge_int_8x4
 		// Add gemm result
 		vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp );
 		// Store back to memory
-		//_mm256_store_pd( c43, vc4_7_3 );
-
-		aa = _mm256_extractf128_pd( vc4_7_3, 0 ) ;
-		bb = _mm256_extractf128_pd( vc4_7_3, 1 ) ;
-
-		_mm_storel_pd( c + 4*rs_c + 3*cs_c, aa );
-		_mm_storeh_pd( c + 5*rs_c + 3*cs_c, aa );
-		_mm_storel_pd( c + 6*rs_c + 3*cs_c, bb );
-		_mm_storeh_pd( c + 7*rs_c + 3*cs_c, bb );
+		_mm256_store_pd( c43, vc4_7_3 );
 	}
 
+	GEMM_UKR_FLUSH_CT( d );
 }
 
 
@@ -634,7 +495,9 @@ void bli_dgemm_sandybridge_int_8x4
 #if 0
 void bli_cgemm_sandybridge_int_8x4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        scomplex*  restrict alpha,
        scomplex*  restrict a,
        scomplex*  restrict b,
@@ -652,7 +515,9 @@ void bli_cgemm_sandybridge_int_8x4
 #if 0
 void bli_zgemm_sandybridge_int_4x4
      (
-       dim_t               k0,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
        dcomplex*  restrict alpha,
        dcomplex*  restrict a,
        dcomplex*  restrict b,
diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
index 3a20cd861..9943a170b 100644
--- a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
+++ b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
@@ -287,24 +287,28 @@ static int64_t offsets[16] __attribute__((aligned(64))) =
     { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15};
 
 
-void bli_dgemm_skx_asm_16x12_l2(
-                             dim_t            k_,
-                             double* restrict alpha,
-                             double* restrict a,
-                             double* restrict b,
-                             double* restrict beta,
-                             double* restrict c, inc_t rs_c_, inc_t cs_c_,
-                             auxinfo_t*       data,
-                             cntx_t* restrict cntx
-                           )
+void bli_dgemm_skx_asm_16x12_l2
+     (
+       dim_t            m,
+       dim_t            n,
+       dim_t            k_,
+       double* restrict alpha,
+       double* restrict a,
+       double* restrict b,
+       double* restrict beta,
+       double* restrict c, inc_t rs_c_, inc_t cs_c_,
+       auxinfo_t*       data,
+       cntx_t* restrict cntx
+     )
 {
     (void)data;
     (void)cntx;
 
-    const int64_t* offsetPtr = &offsets[0];
-    const int64_t k = k_;
-    const int64_t rs_c = rs_c_;
-    const int64_t cs_c = cs_c_;
+    int64_t k = k_;
+    int64_t rs_c = rs_c_;
+    int64_t cs_c = cs_c_;
+
+    GEMM_UKR_SETUP_CT( d, 16, 12, false );
 
     BEGIN_ASM()
 
@@ -464,62 +468,26 @@ void bli_dgemm_skx_asm_16x12_l2(
 
     MOV(RAX, VAR(cs_c))
     LEA(RAX, MEM(,RAX,8))
-    MOV(RBX, VAR(rs_c))
-    LEA(RBX, MEM(,RBX,8))
-
-    // Check if C is column stride. If not, jump to the slow scattered update
-    CMP(RBX, IMM(1))
-    JNE(SCATTEREDUPDATE)
-
-        VCOMISD(XMM(1), XMM(7))
-        JE(COLSTORBZ)
 
-            UPDATE_C( 8, 9,10,11)
-            UPDATE_C(12,13,14,15)
-            UPDATE_C(16,17,18,19)
-            UPDATE_C(20,21,22,23)
-            UPDATE_C(24,25,26,27)
-            UPDATE_C(28,29,30,31)
+    VCOMISD(XMM(1), XMM(7))
+    JE(COLSTORBZ)
 
-        JMP(END)
-        LABEL(COLSTORBZ)
-
-            UPDATE_C_BZ( 8, 9,10,11)
-            UPDATE_C_BZ(12,13,14,15)
-            UPDATE_C_BZ(16,17,18,19)
-            UPDATE_C_BZ(20,21,22,23)
-            UPDATE_C_BZ(24,25,26,27)
-            UPDATE_C_BZ(28,29,30,31)
+        UPDATE_C( 8, 9,10,11)
+        UPDATE_C(12,13,14,15)
+        UPDATE_C(16,17,18,19)
+        UPDATE_C(20,21,22,23)
+        UPDATE_C(24,25,26,27)
+        UPDATE_C(28,29,30,31)
 
     JMP(END)
-    LABEL(SCATTEREDUPDATE)
-
-        MOV(RDI, VAR(offsetPtr))
-        VMOVDQA64(ZMM(2), MEM(RDI,0*64))
-        VMOVDQA64(ZMM(3), MEM(RDI,1*64))
-        VPBROADCASTQ(ZMM(6), RBX)
-        VPMULLQ(ZMM(2), ZMM(6), ZMM(2))
-        VPMULLQ(ZMM(3), ZMM(6), ZMM(3))
-
-        VCOMISD(XMM(1), XMM(7))
-        JE(SCATTERBZ)
-
-            UPDATE_C_ROW_SCATTERED( 8, 9,10,11)
-            UPDATE_C_ROW_SCATTERED(12,13,14,15)
-            UPDATE_C_ROW_SCATTERED(16,17,18,19)
-            UPDATE_C_ROW_SCATTERED(20,21,22,23)
-            UPDATE_C_ROW_SCATTERED(24,25,26,27)
-            UPDATE_C_ROW_SCATTERED(28,29,30,31)
-
-        JMP(END)
-        LABEL(SCATTERBZ)
-
-            UPDATE_C_BZ_ROW_SCATTERED( 8, 9,10,11)
-            UPDATE_C_BZ_ROW_SCATTERED(12,13,14,15)
-            UPDATE_C_BZ_ROW_SCATTERED(16,17,18,19)
-            UPDATE_C_BZ_ROW_SCATTERED(20,21,22,23)
-            UPDATE_C_BZ_ROW_SCATTERED(24,25,26,27)
-            UPDATE_C_BZ_ROW_SCATTERED(28,29,30,31)
+    LABEL(COLSTORBZ)
+
+        UPDATE_C_BZ( 8, 9,10,11)
+        UPDATE_C_BZ(12,13,14,15)
+        UPDATE_C_BZ(16,17,18,19)
+        UPDATE_C_BZ(20,21,22,23)
+        UPDATE_C_BZ(24,25,26,27)
+        UPDATE_C_BZ(28,29,30,31)
 
     LABEL(END)
 
@@ -535,8 +503,7 @@ void bli_dgemm_skx_asm_16x12_l2(
       [beta]      "m" (beta),
       [c]         "m" (c),
       [rs_c]      "m" (rs_c),
-      [cs_c]      "m" (cs_c),
-      [offsetPtr] "m" (offsetPtr)
+      [cs_c]      "m" (cs_c)
     : // register clobber list
       "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
       "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
@@ -545,4 +512,6 @@ void bli_dgemm_skx_asm_16x12_l2(
       "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
       "zmm30", "zmm31", "memory"
     )
+
+    GEMM_UKR_FLUSH_CT( d );
 }
diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c
index 136f31532..e3bc52041 100644
--- a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c
+++ b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c
@@ -153,24 +153,28 @@
 static int64_t offsets[16] __attribute__((aligned(64))) =
     { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15};
 
-void bli_dgemm_skx_asm_16x14(
-                              dim_t            k_,
-                              double* restrict alpha,
-                              double* restrict a,
-                              double* restrict b,
-                              double* restrict beta,
-                              double* restrict c, inc_t rs_c_, inc_t cs_c_,
-                              auxinfo_t*       data,
-                              cntx_t* restrict cntx
-                            )
+void bli_dgemm_skx_asm_16x14
+     (
+       dim_t            m,
+       dim_t            n,
+       dim_t            k_,
+       double* restrict alpha,
+       double* restrict a,
+       double* restrict b,
+       double* restrict beta,
+       double* restrict c, inc_t rs_c_, inc_t cs_c_,
+       auxinfo_t*       data,
+       cntx_t* restrict cntx
+     )
 {
     (void)data;
     (void)cntx;
 
-    const int64_t* offsetPtr = &offsets[0];
-    const int64_t k = k_;
-    const int64_t rs_c = rs_c_*8;
-    const int64_t cs_c = cs_c_*8;
+    int64_t k = k_;
+    int64_t rs_c = rs_c_;
+    int64_t cs_c = cs_c_;
+
+    GEMM_UKR_SETUP_CT( d, 16, 14, false );
 
     BEGIN_ASM()
 
@@ -220,6 +224,8 @@ void bli_dgemm_skx_asm_16x14(
 
     MOV(R12, VAR(rs_c))
     MOV(R10, VAR(cs_c))
+    LEA(R12, MEM(,R12,8))
+    LEA(R10, MEM(,R10,8))
 
     MOV(RDI, RSI)
     AND(RSI, IMM(3))
@@ -320,119 +326,41 @@ void bli_dgemm_skx_asm_16x14(
     MOV(RAX, R12)
     MOV(RBX, R10)
 
-    // Check if C is column stride.
-    CMP(RAX, IMM(8))
-    JNE(SCATTEREDUPDATE)
-
-        VCOMISD(XMM(1), XMM(2))
-        JE(COLSTORBZ)
-
-            UPDATE_C( 4, 5)
-            UPDATE_C( 6, 7)
-            UPDATE_C( 8, 9)
-            UPDATE_C(10,11)
-            UPDATE_C(12,13)
-            UPDATE_C(14,15)
-            UPDATE_C(16,17)
-            UPDATE_C(18,19)
-            UPDATE_C(20,21)
-            UPDATE_C(22,23)
-            UPDATE_C(24,25)
-            UPDATE_C(26,27)
-            UPDATE_C(28,29)
-            UPDATE_C(30,31)
-
-        JMP(END)
-        LABEL(COLSTORBZ)
-
-            UPDATE_C_BZ( 4, 5)
-            UPDATE_C_BZ( 6, 7)
-            UPDATE_C_BZ( 8, 9)
-            UPDATE_C_BZ(10,11)
-            UPDATE_C_BZ(12,13)
-            UPDATE_C_BZ(14,15)
-            UPDATE_C_BZ(16,17)
-            UPDATE_C_BZ(18,19)
-            UPDATE_C_BZ(20,21)
-            UPDATE_C_BZ(22,23)
-            UPDATE_C_BZ(24,25)
-            UPDATE_C_BZ(26,27)
-            UPDATE_C_BZ(28,29)
-            UPDATE_C_BZ(30,31)
+    VCOMISD(XMM(1), XMM(2))
+    JE(COLSTORBZ)
+
+        UPDATE_C( 4, 5)
+        UPDATE_C( 6, 7)
+        UPDATE_C( 8, 9)
+        UPDATE_C(10,11)
+        UPDATE_C(12,13)
+        UPDATE_C(14,15)
+        UPDATE_C(16,17)
+        UPDATE_C(18,19)
+        UPDATE_C(20,21)
+        UPDATE_C(22,23)
+        UPDATE_C(24,25)
+        UPDATE_C(26,27)
+        UPDATE_C(28,29)
+        UPDATE_C(30,31)
 
     JMP(END)
-    LABEL(SCATTEREDUPDATE)
-
-        VMULPD(ZMM( 4), ZMM( 4), ZMM(0))
-        VMULPD(ZMM( 5), ZMM( 5), ZMM(0))
-        VMULPD(ZMM( 6), ZMM( 6), ZMM(0))
-        VMULPD(ZMM( 7), ZMM( 7), ZMM(0))
-        VMULPD(ZMM( 8), ZMM( 8), ZMM(0))
-        VMULPD(ZMM( 9), ZMM( 9), ZMM(0))
-        VMULPD(ZMM(10), ZMM(10), ZMM(0))
-        VMULPD(ZMM(11), ZMM(11), ZMM(0))
-        VMULPD(ZMM(12), ZMM(12), ZMM(0))
-        VMULPD(ZMM(13), ZMM(13), ZMM(0))
-        VMULPD(ZMM(14), ZMM(14), ZMM(0))
-        VMULPD(ZMM(15), ZMM(15), ZMM(0))
-        VMULPD(ZMM(16), ZMM(16), ZMM(0))
-        VMULPD(ZMM(17), ZMM(17), ZMM(0))
-        VMULPD(ZMM(18), ZMM(18), ZMM(0))
-        VMULPD(ZMM(19), ZMM(19), ZMM(0))
-        VMULPD(ZMM(20), ZMM(20), ZMM(0))
-        VMULPD(ZMM(21), ZMM(21), ZMM(0))
-        VMULPD(ZMM(22), ZMM(22), ZMM(0))
-        VMULPD(ZMM(23), ZMM(23), ZMM(0))
-        VMULPD(ZMM(24), ZMM(24), ZMM(0))
-        VMULPD(ZMM(25), ZMM(25), ZMM(0))
-        VMULPD(ZMM(26), ZMM(26), ZMM(0))
-        VMULPD(ZMM(27), ZMM(27), ZMM(0))
-        VMULPD(ZMM(28), ZMM(28), ZMM(0))
-        VMULPD(ZMM(29), ZMM(29), ZMM(0))
-        VMULPD(ZMM(30), ZMM(30), ZMM(0))
-        VMULPD(ZMM(31), ZMM(31), ZMM(0))
-
-        VCOMISD(XMM(1), XMM(2))
-
-        MOV(RDI, VAR(offsetPtr))
-        VPBROADCASTQ(ZMM(0), RAX)
-        VPMULLQ(ZMM(2), ZMM(0), MEM(RDI))
-        VPMULLQ(ZMM(3), ZMM(0), MEM(RDI,64))
-
-        JE(SCATTERBZ)
-
-            UPDATE_C_COL_SCATTERED( 4, 5)
-            UPDATE_C_COL_SCATTERED( 6, 7)
-            UPDATE_C_COL_SCATTERED( 8, 9)
-            UPDATE_C_COL_SCATTERED(10,11)
-            UPDATE_C_COL_SCATTERED(12,13)
-            UPDATE_C_COL_SCATTERED(14,15)
-            UPDATE_C_COL_SCATTERED(16,17)
-            UPDATE_C_COL_SCATTERED(18,19)
-            UPDATE_C_COL_SCATTERED(20,21)
-            UPDATE_C_COL_SCATTERED(22,23)
-            UPDATE_C_COL_SCATTERED(24,25)
-            UPDATE_C_COL_SCATTERED(26,27)
-            UPDATE_C_COL_SCATTERED(28,29)
-            UPDATE_C_COL_SCATTERED(30,31)
-
-        JMP(END)
-        LABEL(SCATTERBZ)
-
-            UPDATE_C_BZ_COL_SCATTERED( 4, 5)
-            UPDATE_C_BZ_COL_SCATTERED( 6, 7)
-            UPDATE_C_BZ_COL_SCATTERED( 8, 9)
-            UPDATE_C_BZ_COL_SCATTERED(10,11)
-            UPDATE_C_BZ_COL_SCATTERED(12,13)
-            UPDATE_C_BZ_COL_SCATTERED(14,15)
-            UPDATE_C_BZ_COL_SCATTERED(16,17)
-            UPDATE_C_BZ_COL_SCATTERED(18,19)
-            UPDATE_C_BZ_COL_SCATTERED(20,21)
-            UPDATE_C_BZ_COL_SCATTERED(22,23)
-            UPDATE_C_BZ_COL_SCATTERED(24,25)
-            UPDATE_C_BZ_COL_SCATTERED(26,27)
-            UPDATE_C_BZ_COL_SCATTERED(28,29)
-            UPDATE_C_BZ_COL_SCATTERED(30,31)
+    LABEL(COLSTORBZ)
+
+        UPDATE_C_BZ( 4, 5)
+        UPDATE_C_BZ( 6, 7)
+        UPDATE_C_BZ( 8, 9)
+        UPDATE_C_BZ(10,11)
+        UPDATE_C_BZ(12,13)
+        UPDATE_C_BZ(14,15)
+        UPDATE_C_BZ(16,17)
+        UPDATE_C_BZ(18,19)
+        UPDATE_C_BZ(20,21)
+        UPDATE_C_BZ(22,23)
+        UPDATE_C_BZ(24,25)
+        UPDATE_C_BZ(26,27)
+        UPDATE_C_BZ(28,29)
+        UPDATE_C_BZ(30,31)
 
     LABEL(END)
 
@@ -449,8 +377,7 @@ void bli_dgemm_skx_asm_16x14(
           [beta]      "m" (beta),
           [c]         "m" (c),
           [rs_c]      "m" (rs_c),
-          [cs_c]      "m" (cs_c),
-          [offsetPtr] "m" (offsetPtr)
+          [cs_c]      "m" (cs_c)
         : // register clobber list
           "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
           "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
@@ -459,4 +386,6 @@ void bli_dgemm_skx_asm_16x14(
           "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
           "zmm30", "zmm31", "memory"
     )
+
+    GEMM_UKR_FLUSH_CT( d );
 }
diff --git a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
index 40af49614..8808449b6 100644
--- a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
+++ b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
@@ -317,24 +317,28 @@ ahead*/
 static int64_t offsets[16] __attribute__((aligned(64))) =
     { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15};
 
-void bli_sgemm_skx_asm_32x12_l2(
-                             dim_t            k_,
-                             float* restrict alpha,
-                             float* restrict a,
-                             float* restrict b,
-                             float* restrict beta,
-                             float* restrict c, inc_t rs_c_, inc_t cs_c_,
-                             auxinfo_t*       data,
-                             cntx_t* restrict cntx
-                           )
+void bli_sgemm_skx_asm_32x12_l2
+     (
+       dim_t            m,
+       dim_t            n,
+       dim_t            k_,
+       float* restrict alpha,
+       float* restrict a,
+       float* restrict b,
+       float* restrict beta,
+       float* restrict c, inc_t rs_c_, inc_t cs_c_,
+       auxinfo_t*       data,
+       cntx_t* restrict cntx
+     )
 {
     (void)data;
     (void)cntx;
 
-    const int64_t* offsetPtr = &offsets[0];
-    const int64_t k = k_;
-    const int64_t rs_c = rs_c_;
-    const int64_t cs_c = cs_c_;
+    int64_t k = k_;
+    int64_t rs_c = rs_c_;
+    int64_t cs_c = cs_c_;
+
+    GEMM_UKR_SETUP_CT( s, 32, 12, false );
 
     BEGIN_ASM()
 
@@ -381,7 +385,7 @@ void bli_sgemm_skx_asm_32x12_l2(
 #endif
 
 #ifdef PREFETCH_B_BEFORE
-	/* Prefetching 3 cachlines of B (4 iterations worth of data
+    /* Prefetching 3 cachelines of B (4 iterations worth of data
        (12 (NR) x 4 (sizeof(float)) x 4 iter /64 = 3 cachelines) */
     PREFETCH(0, MEM(RBX,0*64))
     PREFETCH(0, MEM(RBX,1*64))
@@ -485,66 +489,26 @@ void bli_sgemm_skx_asm_32x12_l2(
 
     MOV(RAX, VAR(cs_c))
     LEA(RAX, MEM(,RAX,4))
-    MOV(RBX, VAR(rs_c))
-    LEA(RBX, MEM(,RBX,4))
-
-
-    // Check if C is column major (rs_c = 1). If not, jump to the slow scattered update
-    CMP(RBX, IMM(4))
-    JNE(SCATTEREDUPDATE)
-
-        VCOMISS(XMM(1), XMM(7))
-        JE(COLSTORBZ)
 
-            UPDATE_C( 8, 9,10,11)
-            UPDATE_C(12,13,14,15)
-            UPDATE_C(16,17,18,19)
-            UPDATE_C(20,21,22,23)
-            UPDATE_C(24,25,26,27)
-            UPDATE_C(28,29,30,31)
+    VCOMISS(XMM(1), XMM(7))
+    JE(COLSTORBZ)
 
-        JMP(END)
-        LABEL(COLSTORBZ)
-
-            UPDATE_C_BZ( 8, 9,10,11)
-            UPDATE_C_BZ(12,13,14,15)
-            UPDATE_C_BZ(16,17,18,19)
-            UPDATE_C_BZ(20,21,22,23)
-            UPDATE_C_BZ(24,25,26,27)
-            UPDATE_C_BZ(28,29,30,31)
+        UPDATE_C( 8, 9,10,11)
+        UPDATE_C(12,13,14,15)
+        UPDATE_C(16,17,18,19)
+        UPDATE_C(20,21,22,23)
+        UPDATE_C(24,25,26,27)
+        UPDATE_C(28,29,30,31)
 
     JMP(END)
-    LABEL(SCATTEREDUPDATE)
-
-        LEA(RDX, MEM(RCX,RBX,8))
-        LEA(RDX, MEM(RDX,RBX,8))
-
-        MOV(RDI, VAR(offsetPtr))
-        VMOVDQA64(ZMM(2), MEM(RDI,0*64))
-        VMOVDQA64(ZMM(3), MEM(RDI,1*64))
-        VPBROADCASTQ(ZMM(6), RBX)
-        VPMULLQ(ZMM(2), ZMM(6), ZMM(2))
-        VPMULLQ(ZMM(3), ZMM(6), ZMM(3))
-
-        VCOMISS(XMM(1), XMM(7))
-        JE(SCATTERBZ)
-
-            UPDATE_C_ROW_SCATTERED( 8, 9,10,11)
-            UPDATE_C_ROW_SCATTERED(12,13,14,15)
-            UPDATE_C_ROW_SCATTERED(16,17,18,19)
-            UPDATE_C_ROW_SCATTERED(20,21,22,23)
-            UPDATE_C_ROW_SCATTERED(24,25,26,27)
-            UPDATE_C_ROW_SCATTERED(28,29,30,31)
-
-        JMP(END)
-        LABEL(SCATTERBZ)
-
-            UPDATE_C_BZ_ROW_SCATTERED( 8, 9,10,11)
-            UPDATE_C_BZ_ROW_SCATTERED(12,13,14,15)
-            UPDATE_C_BZ_ROW_SCATTERED(16,17,18,19)
-            UPDATE_C_BZ_ROW_SCATTERED(20,21,22,23)
-            UPDATE_C_BZ_ROW_SCATTERED(24,25,26,27)
-            UPDATE_C_BZ_ROW_SCATTERED(28,29,30,31)
+    LABEL(COLSTORBZ)
+
+        UPDATE_C_BZ( 8, 9,10,11)
+        UPDATE_C_BZ(12,13,14,15)
+        UPDATE_C_BZ(16,17,18,19)
+        UPDATE_C_BZ(20,21,22,23)
+        UPDATE_C_BZ(24,25,26,27)
+        UPDATE_C_BZ(28,29,30,31)
 
     LABEL(END)
 
@@ -560,8 +524,7 @@ void bli_sgemm_skx_asm_32x12_l2(
       [beta]      "m" (beta),
       [c]         "m" (c),
       [rs_c]      "m" (rs_c),
-      [cs_c]      "m" (cs_c),
-      [offsetPtr] "m" (offsetPtr)
+      [cs_c]      "m" (cs_c)
     : // register clobber list
       "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
       "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
@@ -570,4 +533,6 @@ void bli_sgemm_skx_asm_32x12_l2(
       "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
       "zmm30", "zmm31", "memory"
     )
+
+    GEMM_UKR_FLUSH_CT( s );
 }
diff --git a/ref_kernels/3/bb/bli_gemmbb_ref.c b/ref_kernels/3/bb/bli_gemmbb_ref.c
index b45718d45..4c75c064c 100644
--- a/ref_kernels/3/bb/bli_gemmbb_ref.c
+++ b/ref_kernels/3/bb/bli_gemmbb_ref.c
@@ -42,6 +42,8 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
+       dim_t               m, \
+       dim_t               n, \
        dim_t               k, \
        ctype*     restrict alpha, \
        ctype*     restrict a, \
@@ -59,9 +61,6 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 	const inc_t     packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
 	const inc_t     packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
-\
-	const dim_t     m      = mr; \
-	const dim_t     n      = nr; \
 \
 	const inc_t     cs_a   = packmr; \
 \
diff --git a/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c b/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c
index 681b740b5..dd4e1f153 100644
--- a/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c
+++ b/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c
@@ -87,6 +87,8 @@ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \
 	/* upper: b11 = alpha * b11 - a12 * b21; */ \
 	gemm_ukr \
 	( \
+	  mr, \
+	  nr, \
 	  k, \
 	  minus_one, \
 	  a1x, \
diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c
index 931fe994b..51ff9df4b 100644
--- a/ref_kernels/3/bli_gemm_ref.c
+++ b/ref_kernels/3/bli_gemm_ref.c
@@ -44,6 +44,8 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
+       dim_t               m, \
+       dim_t               n, \
        dim_t               k, \
        ctype*     restrict alpha, \
        ctype*     restrict a, \
@@ -107,8 +109,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 		if ( PASTEMAC(ch,eq0)( *beta ) ) \
 		{ \
-			for ( dim_t i = 0; i < mr; ++i ) \
-			for ( dim_t j = 0; j < nr; ++j ) \
+			for ( dim_t i = 0; i < m; ++i ) \
+			for ( dim_t j = 0; j < n; ++j ) \
 			PASTEMAC(ch,copys) \
 			( \
 			  ab[ i*rs_ab + j*cs_ab ], \
@@ -117,8 +119,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		} \
 		else \
 		{ \
-			for ( dim_t i = 0; i < mr; ++i ) \
-			for ( dim_t j = 0; j < nr; ++j ) \
+			for ( dim_t i = 0; i < m; ++i ) \
+			for ( dim_t j = 0; j < n; ++j ) \
 			PASTEMAC(ch,xpbys) \
 			( \
 			  ab[ i*rs_ab + j*cs_ab ], \
@@ -133,8 +135,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 		if ( PASTEMAC(ch,eq0)( *beta ) ) \
 		{ \
-			for ( dim_t j = 0; j < nr; ++j ) \
-			for ( dim_t i = 0; i < mr; ++i ) \
+			for ( dim_t j = 0; j < n; ++j ) \
+			for ( dim_t i = 0; i < m; ++i ) \
 			PASTEMAC(ch,copys) \
 			( \
 			  ab[ i*rs_ab + j*cs_ab ], \
@@ -143,8 +145,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		} \
 		else \
 		{ \
-			for ( dim_t j = 0; j < nr; ++j ) \
-			for ( dim_t i = 0; i < mr; ++i ) \
+			for ( dim_t j = 0; j < n; ++j ) \
+			for ( dim_t i = 0; i < m; ++i ) \
 			PASTEMAC(ch,xpbys) \
 			( \
 			  ab[ i*rs_ab + j*cs_ab ], \
@@ -171,6 +173,8 @@ GENTFUNC( dcomplex, z, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 4 )
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
+       dim_t               m, \
+       dim_t               n, \
        dim_t               k, \
        ctype*     restrict alpha, \
        ctype*     restrict a, \
@@ -188,9 +192,6 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 	const inc_t     packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
 	const inc_t     packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
-\
-	const dim_t     m      = mr; \
-	const dim_t     n      = nr; \
 \
 	const inc_t     cs_a   = packmr; \
 \
diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c
index 2b756963e..2b260c881 100644
--- a/ref_kernels/3/bli_gemmtrsm_ref.c
+++ b/ref_kernels/3/bli_gemmtrsm_ref.c
@@ -52,6 +52,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 { \
 	const num_t     dt     = PASTEMAC(ch,type); \
 \
+	const inc_t     mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
+	const inc_t     nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
 	const inc_t     packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
 \
 	const inc_t     rs_b   = packnr; \
@@ -68,6 +70,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	/* upper: b11 = alpha * b11 - a12 * b21; */ \
 	gemm_ukr \
 	( \
+	  mr, \
+	  nr, \
 	  k, \
 	  minus_one, \
 	  a1x, \
diff --git a/ref_kernels/ind/bli_gemm1m_ref.c b/ref_kernels/ind/bli_gemm1m_ref.c
index 6d2464de9..fbd15d695 100644
--- a/ref_kernels/ind/bli_gemm1m_ref.c
+++ b/ref_kernels/ind/bli_gemm1m_ref.c
@@ -39,6 +39,8 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
+       dim_t               m, \
+       dim_t               n, \
        dim_t               k, \
        ctype*     restrict alpha, \
        ctype*     restrict a, \
@@ -59,6 +61,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 	const dim_t       mr        = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
 	const dim_t       nr        = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
+\
+	const dim_t       mr_r      = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \
+	const dim_t       nr_r      = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \
 \
 	const dim_t       k2        = 2 * k; \
 \
@@ -118,6 +123,11 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	else if ( bli_is_gen_stored( rs_c, cs_c ) )             using_ct = TRUE; \
 	else                                                    using_ct = FALSE; \
 \
+\
+	/* If we are not computing a full micro-tile, then we must write to
+	   ct and then accumulate to c afterwards. */ \
+	if ( mr != m || nr != n ) using_ct = TRUE; \
+\
 \
 	if ( using_ct ) \
 	{ \
@@ -149,6 +159,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		/* c = beta * c + alpha_r * a * b; */ \
 		rgemm_ukr \
 		( \
+		  mr_r, \
+		  nr_r, \
 		  k2, \
 		  alpha_r, \
 		  a_r, \
@@ -164,8 +176,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		/* Accumulate the final result in ct back to c. */ \
 		if ( PASTEMAC(ch,eq1)( *beta ) ) \
 		{ \
-			for ( j = 0; j < nr; ++j ) \
-			for ( i = 0; i < mr; ++i ) \
+			for ( j = 0; j < n; ++j ) \
+			for ( i = 0; i < m; ++i ) \
 			{ \
 				PASTEMAC(ch,adds)( *(ct + i*rs_ct + j*cs_ct), \
 				                   *(c  + i*rs_c  + j*cs_c ) ); \
@@ -173,8 +185,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		} \
 		else if ( PASTEMAC(ch,eq0)( *beta ) ) \
 		{ \
-			for ( j = 0; j < nr; ++j ) \
-			for ( i = 0; i < mr; ++i ) \
+			for ( j = 0; j < n; ++j ) \
+			for ( i = 0; i < m; ++i ) \
 			{ \
 				PASTEMAC(ch,copys)( *(ct + i*rs_ct + j*cs_ct), \
 				                    *(c  + i*rs_c  + j*cs_c ) ); \
@@ -182,8 +194,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		} \
 		else \
 		{ \
-			for ( j = 0; j < nr; ++j ) \
-			for ( i = 0; i < mr; ++i ) \
+			for ( j = 0; j < n; ++j ) \
+			for ( i = 0; i < m; ++i ) \
 			{ \
 				PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \
 				                    *beta, \
@@ -215,6 +227,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		/* c = beta * c + alpha_r * a * b; */ \
 		rgemm_ukr \
 		( \
+		  mr_r, \
+		  nr_r, \
 		  k2, \
 		  alpha_r, \
 		  a_r, \
diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
index 5cfaee9ec..96f5a16fe 100644
--- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c
+++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
@@ -153,6 +153,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	   upper: bt = -1.0 * a12 * b21; */ \
 	rgemm_ukr \
 	( \
+	  mr_r, \
+	  nr_r, \
 	  k2, \
 	  minus_one_r, \
 	  a1x_r, \
diff --git a/test/syrk_diagonal/complex_math.hpp b/test/syrk_diagonal/complex_math.hpp
new file mode 100644
index 000000000..9c68e730a
--- /dev/null
+++ b/test/syrk_diagonal/complex_math.hpp
@@ -0,0 +1,267 @@
+#include <cmath>
+#include <algorithm>
+#include <type_traits>
+
+#include "blis.h"
+
+template <typename T>
+struct is_complex : std::false_type {};
+
+template <>
+struct is_complex<scomplex> : std::true_type {};
+
+template <>
+struct is_complex<dcomplex> : std::true_type {};
+
+template <typename T>
+struct is_real : std::integral_constant<bool,!is_complex<T>::value> {};
+
+template <typename T> struct make_complex;
+
+template <> struct make_complex<float   > { using type = scomplex; };
+template <> struct make_complex<double  > { using type = dcomplex; };
+template <> struct make_complex<scomplex> { using type = scomplex; };
+template <> struct make_complex<dcomplex> { using type = dcomplex; };
+
+template <typename T>
+using make_complex_t = typename make_complex<T>::type;
+
+template <typename T> struct make_real;
+
+template <> struct make_real<float   > { using type = float; };
+template <> struct make_real<double  > { using type = double; };
+template <> struct make_real<scomplex> { using type = float; };
+template <> struct make_real<dcomplex> { using type = double; };
+
+template <typename T>
+using make_real_t = typename make_real<T>::type;
+
+template <typename T, bool Cond>
+struct make_complex_if : std::conditional<Cond,make_complex_t<T>,make_real_t<T>> {};
+
+template <typename T, bool Cond>
+using make_complex_if_t = typename make_complex_if<T,Cond>::type;
+
+template <typename T>
+struct real_imag_part
+{
+    real_imag_part& operator=(T) { return *this; }
+
+    operator T() const { return T(); }
+};
+
+template <typename T>
+std::enable_if_t<std::is_arithmetic<typename std::remove_cv<T>::type>::value,T&> real(T& x) { return x; }
+
+template <typename T>
+std::enable_if_t<std::is_arithmetic<T>::value,real_imag_part<T>> imag(T x) { return {}; }
+
+inline float& real(scomplex& x) { return x.real; }
+
+inline float& imag(scomplex& x) { return x.imag; }
+
+inline double& real(dcomplex& x) { return x.real; }
+
+inline double& imag(dcomplex& x) { return x.imag; }
+
+inline const float& real(const scomplex& x) { return x.real; }
+
+inline const float& imag(const scomplex& x) { return x.imag; }
+
+inline const double& real(const dcomplex& x) { return x.real; }
+
+inline const double& imag(const dcomplex& x) { return x.imag; }
+
+template <typename T>
+std::enable_if_t<is_real<T>::value,T> conj(T x) { return x; }
+
+template <typename T>
+std::enable_if_t<is_complex<T>::value,T> conj(const T& x) { return {x.real, -x.imag}; }
+
+template <typename T, typename U, typename=void>
+struct convert_impl;
+
+template <typename T, typename U>
+struct convert_impl<T, U, std::enable_if_t<is_real<T>::value && is_real<U>::value>>
+{
+    void operator()(T x, U& y) const { y = x; }
+};
+
+template <typename T, typename U>
+struct convert_impl<T, U, std::enable_if_t<is_real<T>::value && is_complex<U>::value>>
+{
+    void operator()(T x, U& y) const { y.real = x; y.imag = 0; }
+};
+
+template <typename T, typename U>
+struct convert_impl<T, U, std::enable_if_t<is_complex<T>::value && is_real<U>::value>>
+{
+    void operator()(T x, U& y) const { y = x.real; }
+};
+
+template <typename T, typename U>
+struct convert_impl<T, U, std::enable_if_t<is_complex<T>::value && is_complex<U>::value>>
+{
+    void operator()(T x, U& y) const { y.real = x.real; y.imag = x.imag; }
+};
+
+template <typename U, typename T>
+U convert(T x)
+{
+    U y;
+    convert_impl<T,U>{}(x,y);
+    return y;
+}
+
+template <typename U, typename T>
+auto convert_prec(T x) -> make_complex_if_t<U,is_complex<T>::value>
+{
+    return convert<make_complex_if_t<U,is_complex<T>::value>>(x);
+}
+
+#define COMPLEX_MATH_OPS(rtype, ctype) \
+\
+inline bool operator==(rtype x, ctype y) \
+{ \
+    return x == y.real && y.imag == 0; \
+} \
+\
+inline bool operator==(ctype x, rtype y) \
+{ \
+    return y == x.real && x.imag == 0; \
+} \
+\
+inline bool operator==(ctype x, ctype y) \
+{ \
+    return x.real == y.real && \
+           x.imag == y.imag; \
+ } \
+ \
+inline ctype operator-(ctype x) \
+{ \
+    return {-x.real, -x.imag}; \
+} \
+\
+inline ctype operator+(rtype x, ctype y) \
+{ \
+    return {x+y.real, y.imag}; \
+} \
+\
+inline ctype operator+(ctype x, rtype y) \
+{ \
+    return {y+x.real, x.imag}; \
+} \
+\
+inline ctype operator+(ctype x, ctype y) \
+{ \
+    return {x.real+y.real, x.imag+y.imag}; \
+} \
+\
+inline ctype operator-(rtype x, ctype y) \
+{ \
+    return {x-y.real, -y.imag}; \
+} \
+\
+inline ctype operator-(ctype x, rtype y) \
+{ \
+    return {x.real-y, x.imag}; \
+} \
+\
+inline ctype operator-(ctype x, ctype y) \
+{ \
+    return {x.real-y.real, x.imag-y.imag}; \
+} \
+\
+inline ctype operator*(rtype x, ctype y) \
+{ \
+    return {x*y.real, x*y.imag}; \
+} \
+\
+inline ctype operator*(ctype x, rtype y) \
+{ \
+    return {y*x.real, y*x.imag}; \
+} \
+\
+inline ctype operator*(ctype x, ctype y) \
+{ \
+    return {x.real*y.real - x.imag*y.imag, \
+            x.real*y.imag + x.imag*y.real}; \
+} \
+\
+inline ctype operator/(rtype x, ctype y) \
+{ \
+    auto scale = std::max(std::abs(y.real), std::abs(y.imag)); \
+    auto n = std::ilogb(scale); \
+    auto yrs = std::scalbn(y.real, -n); \
+    auto yis = std::scalbn(y.imag, -n); \
+    auto denom = y.real*yrs + y.imag*yis; \
+    return {x*yrs/denom, -x*yis/denom}; \
+} \
+\
+inline ctype operator/(ctype x, rtype y) \
+{ \
+    return {x.real/y, x.imag/y}; \
+} \
+\
+inline ctype operator/(ctype x, ctype y) \
+{ \
+    auto scale = std::max(std::abs(y.real), std::abs(y.imag)); \
+    auto n = std::ilogb(scale); \
+    auto yrs = std::scalbn(y.real, -n); \
+    auto yis = std::scalbn(y.imag, -n); \
+    auto denom = y.real*yrs + y.imag*yis; \
+    return {(x.real*yrs + x.imag*yis)/denom, \
+            (x.imag*yrs - x.real*yis)/denom}; \
+} \
+\
+inline ctype& operator+=(ctype& x, rtype y) \
+{ \
+    x.real += y; \
+    return x; \
+} \
+\
+inline ctype& operator+=(ctype& x, ctype y) \
+{ \
+    x.real += y.real; x.imag += y.imag; \
+    return x; \
+} \
+\
+inline ctype& operator-=(ctype& x, rtype y) \
+{ \
+    x.real -= y; \
+    return x; \
+} \
+\
+inline ctype& operator-=(ctype& x, ctype y) \
+{ \
+    x.real -= y.real; x.imag -= y.imag; \
+    return x; \
+} \
+\
+inline ctype& operator*=(ctype& x, rtype y) \
+{ \
+    x.real *= y; x.imag *= y; \
+    return x; \
+} \
+\
+inline ctype& operator*=(ctype& x, ctype y) \
+{ \
+    x = x * y; \
+    return x; \
+} \
+\
+inline ctype& operator/=(ctype& x, rtype y) \
+{ \
+    x.real /= y; x.imag /= y; \
+    return x; \
+} \
+\
+inline ctype& operator/=(ctype& x, ctype y) \
+{ \
+    x = x / y; \
+    return x; \
+}
+
+COMPLEX_MATH_OPS(float,  scomplex);
+COMPLEX_MATH_OPS(double, dcomplex);
+
diff --git a/test/syrk_diagonal/syrk_diagonal_example.c b/test/syrk_diagonal/syrk_diagonal_example.c
new file mode 100644
index 000000000..c2bfd8fa1
--- /dev/null
+++ b/test/syrk_diagonal/syrk_diagonal_example.c
@@ -0,0 +1,186 @@
+#include "syrk_diagonal_ref.h"
+
+/*
+ * Structure which includes all additional information beyond what is
+ * already stored in the obj_t structure.
+ *
+ * This structure is **read-only** during the operation!
+ */
+typedef struct packm_diag_params_t
+{
+	packm_blk_var1_params_t super;
+	void* d;
+	inc_t incd;
+} packm_diag_params_t;
+
+/*
+ * Declare the pack kernel type and set up an array of
+ * packing kernels, one for each data type.
+ */
+#undef GENTFUNC
+#define GENTFUNC(ctype,ch,op) \
+void PASTEMAC(ch,op) \
+    ( \
+       struc_t        struca, \
+       diag_t         diaga, \
+       uplo_t         uploa, \
+       conj_t         conja, \
+       pack_t         schema, \
+       bool           invdiag, \
+       dim_t          panel_dim, \
+       dim_t          panel_len, \
+       dim_t          panel_dim_max, \
+       dim_t          panel_len_max, \
+       dim_t          panel_dim_off, \
+       dim_t          panel_len_off, \
+       void* restrict kappa, \
+       void* restrict a, inc_t inca, inc_t lda, \
+       void* restrict p,             inc_t ldp, \
+                         inc_t is_p, \
+       cntx_t*        cntx, \
+       void*          params \
+    ) \
+{ \
+	packm_diag_params_t* params_cast = params; \
+	ctype* restrict      a_cast      = a; \
+	ctype* restrict      p_cast      = p; \
+	ctype* restrict      d_cast      = params_cast->d; \
+	inc_t                incd        = params_cast->incd; \
+	ctype                kappa_cast  = *( ctype* )kappa; \
+\
+	if ( schema != BLIS_PACKED_ROW_PANELS && \
+		 schema != BLIS_PACKED_COL_PANELS ) \
+		bli_abort(); \
+\
+	/* Apply the offset */ \
+	d_cast += panel_len_off * incd; \
+\
+	if ( conja ) \
+	{ \
+		for ( dim_t j = 0; j < panel_len; j++ ) \
+		{ \
+			ctype kappa_d; \
+			PASTEMAC(ch,scal2s)( kappa_cast, d_cast[ j*incd ], kappa_d ); \
+\
+			for (dim_t i = 0;i < panel_dim;i++) \
+				PASTEMAC(ch,scal2js)( kappa_d, a_cast[ i*inca + j*lda ], p_cast[ i + j*ldp ] ); \
+\
+			for (dim_t i = panel_dim;i < panel_dim_max;i++) \
+				PASTEMAC(ch,set0s)( p_cast[ i + j*ldp ] ); \
+		} \
+	} \
+	else \
+	{ \
+		for ( dim_t j = 0; j < panel_len; j++ ) \
+		{ \
+			ctype kappa_d; \
+			PASTEMAC(ch,scal2s)( kappa_cast, d_cast[ j*incd ], kappa_d ); \
+\
+			for (dim_t i = 0;i < panel_dim;i++) \
+				PASTEMAC(ch,scal2s)( kappa_d, a_cast[ i*inca + j*lda ], p_cast[ i + j*ldp ] ); \
+\
+			for (dim_t i = panel_dim;i < panel_dim_max;i++) \
+				PASTEMAC(ch,set0s)( p_cast[ i + j*ldp ] ); \
+		} \
+	} \
+\
+	for (dim_t j = panel_len;j < panel_len_max;j++) \
+		for (dim_t i = 0;i < panel_dim_max;i++) \
+			PASTEMAC(ch,set0s)( p_cast[ i + j*ldp ] ); \
+}
+
+INSERT_GENTFUNC_BASIC0(packm_diag_ukr);
+
+static packm_ker_vft GENARRAY( packm_diag_ukrs, packm_diag_ukr );
+
+/*
+ * Modify the object A to include information about the diagonal D,
+ * and imbue it with special function pointers which will take care
+ * of the actual work of forming (D * A^T)
+ */
+void attach_diagonal_factor( packm_diag_params_t* params, obj_t* d, obj_t* a )
+{
+	memset( params, 0, sizeof(*params) );
+
+	// Assumes D is a column vector
+	params->d = bli_obj_buffer_at_off( d );
+	params->incd = bli_obj_row_stride( d );
+
+	for ( int i = BLIS_DT_LO; i <= BLIS_DT_HI; i++ )
+		params->super.ukr_fn[i][i] = packm_diag_ukrs[i];
+
+	// Attach the parameters to the A object.
+	bli_obj_set_pack_params( params, a );
+}
+
+/*
+ * Implements C := alpha * A * D * A^T + beta * C
+ *
+ * where D is a diagonal matrix with elements taken from the "d" vector.
+ */
+void syrk_diag( obj_t* alpha, obj_t* a, obj_t* d, obj_t* beta, obj_t* c )
+{
+	obj_t ad; // this is (D * A^T)
+	packm_diag_params_t params;
+
+	bli_obj_alias_to( a, &ad );
+	bli_obj_toggle_trans( &ad ); // because gemmt is A*B instead of A*B^T
+	attach_diagonal_factor( &params, d, &ad );
+
+	// Does C := alpha * A * B + beta * C using B = (D * A^T)
+	bli_gemmtnat( alpha, a, &ad, beta, c, NULL, NULL );
+}
+
+int main( void )
+{
+	obj_t a;
+	obj_t d;
+	obj_t c;
+	obj_t c_copy;
+	obj_t norm;
+
+	dim_t m = 10;
+	dim_t k = 10;
+
+	for ( int dt_ = BLIS_DT_LO; dt_ <= BLIS_DT_HI; dt_++ )
+	for ( int upper = 0; upper <= 1; upper++ )
+	for ( int transa = 0; transa <= 1; transa++ )
+	for ( int transc = 0; transc <= 1; transc++ )
+	{
+		num_t dt = dt_;
+		uplo_t uplo = upper ? BLIS_UPPER : BLIS_LOWER;
+
+		bli_obj_create( dt, m, k, transa ? k : 1, transa ? 1 : m, &a );
+		bli_obj_create( dt, k, 1,              1,          1,     &d );
+		bli_obj_create( dt, m, m, transc ? m : 1, transc ? 1 : m, &c );
+		bli_obj_create( dt, m, m, transc ? m : 1, transc ? 1 : m, &c_copy );
+		bli_obj_set_struc( BLIS_SYMMETRIC , &c );
+		bli_obj_set_struc( BLIS_SYMMETRIC , &c_copy );
+		bli_obj_set_uplo( uplo , &c );
+		bli_obj_set_uplo( uplo , &c_copy );
+		bli_obj_create_1x1( bli_dt_proj_to_real( dt ), &norm );
+
+		bli_randm( &a );
+		bli_randm( &d );
+		bli_randm( &c );
+		bli_copym( &c, &c_copy );
+
+		syrk_diag( &BLIS_ONE, &a, &d, &BLIS_ONE, &c );
+		syrk_diag_ref( &BLIS_ONE, &a, &d, &BLIS_ONE, &c_copy );
+
+		bli_subm( &c_copy, &c );
+		bli_normfm( &c, &norm );
+
+		double normr, normi;
+		bli_getsc( &norm, &normr, &normi );
+
+		printf( "dt: %d, upper: %d, transa: %d, transc: %d, norm: %g\n",
+		        dt, upper, transa, transc, normr );
+
+		bli_obj_free( &a );
+		bli_obj_free( &d );
+		bli_obj_free( &c );
+		bli_obj_free( &c_copy );
+		bli_obj_free( &norm );
+	}
+}
diff --git a/test/syrk_diagonal/syrk_diagonal_example.cxx b/test/syrk_diagonal/syrk_diagonal_example.cxx
new file mode 100644
index 000000000..1c269d5c4
--- /dev/null
+++ b/test/syrk_diagonal/syrk_diagonal_example.cxx
@@ -0,0 +1,220 @@
+#include "syrk_diagonal_ref.h"
+
+/*
+ * Forward-declare the pack kernel type and set up an array of
+ * packing kernels, one for each data type.
+ */
+template <typename T>
+void packm_diag_ukr
+    (
+       struc_t        /*struca*/,
+       diag_t         /*diaga*/,
+       uplo_t         /*uploa*/,
+       conj_t         conja,
+       pack_t         schema,
+       bool           /*invdiag*/,
+       dim_t          panel_dim,
+       dim_t          panel_len,
+       dim_t          panel_dim_max,
+       dim_t          panel_len_max,
+       dim_t          /*panel_dim_off*/,
+       dim_t          panel_len_off,
+       void* restrict kappa,
+       void* restrict a, inc_t inca, inc_t lda,
+       void* restrict p,             inc_t ldp,
+                         inc_t /*is_p*/,
+       cntx_t*        /*cntx*/,
+       void*          params
+    );
+
+#undef GENTFUNC
+#define GENTFUNC(ctype,ch,op) \
+static auto PASTEMAC(ch,op) = &packm_diag_ukr<ctype>;
+// Each GENTFUNC expansion defines one static function pointer bound to packm_diag_ukr<ctype>.
+INSERT_GENTFUNC_BASIC0(packm_diag_ukr);
+// Table of kernel pointers; it is indexed by datatype where it is used below.
+static packm_ker_vft GENARRAY( packm_diag_ukrs, packm_diag_ukr );
+
+/*
+ * Structure which includes all additional information beyond what is
+ * already stored in the obj_t structure.
+ *
+ * This structure is **read-only** during the operation!
+ */
+struct packm_diag_params_t  : packm_blk_var1_params_t
+{
+    void* d;    // buffer holding the diagonal entries
+    inc_t incd; // stride, in elements, between consecutive entries of d
+
+    packm_diag_params_t() {} // leaves d and incd uninitialized
+
+    packm_diag_params_t( void* d, inc_t incd )
+    : d(d), incd(incd)
+    {
+        for ( int i = BLIS_DT_LO; i <= BLIS_DT_HI; i++ ) // one kernel per datatype
+            ukr_fn[i][i] = packm_diag_ukrs[i]; // only the same-type (i,i) slots are set; ukr_fn is presumably inherited from the base struct
+    }
+};
+
+/*
+ * Selecting a different kernel based on the current architecture is
+ * currently not possible, but is something we plan to support.
+ */
+template <typename T>
+void packm_diag_ukr
+    (
+       struc_t        /*struca*/,
+       diag_t         /*diaga*/,
+       uplo_t         /*uploa*/,
+       conj_t         conja,
+       pack_t         schema,
+       bool           /*invdiag*/,
+       dim_t          panel_dim,
+       dim_t          panel_len,
+       dim_t          panel_dim_max,
+       dim_t          panel_len_max,
+       dim_t          /*panel_dim_off*/,
+       dim_t          panel_len_off,
+       void* restrict kappa,
+       void* restrict a, inc_t inca, inc_t lda,
+       void* restrict p,             inc_t ldp,
+                         inc_t /*is_p*/,
+       cntx_t*        /*cntx*/,
+       void*          params
+    )
+{ // Packs p(i,j) = kappa * d(j) * a(i,j), conjugating a when conja is nonzero, then zero-pads p to panel_dim_max x panel_len_max.
+    auto        params_cast = ( packm_diag_params_t* )params;
+    T* restrict a_cast      = ( T* )a;
+    T* restrict p_cast      = ( T* )p;
+    T* restrict d_cast      = ( T* )params_cast->d;
+    auto        incd        = params_cast->incd;
+    auto        kappa_cast  = *( T* )kappa;
+    // Only plain row- or column-panel schemas are handled; anything else aborts.
+    if ( schema != BLIS_PACKED_ROW_PANELS &&
+         schema != BLIS_PACKED_COL_PANELS )
+       bli_abort();
+
+    /* Skip the first panel_len_off entries of d so that d_cast[ j*incd ] pairs with index j of this panel. */
+    d_cast += panel_len_off * incd;
+
+    if ( conja )
+    {
+        for ( dim_t j = 0; j < panel_len; j++ )
+        {
+            auto kappa_d = kappa_cast * d_cast[ j*incd ]; // combined scale factor for index j
+
+            for (dim_t i = 0;i < panel_dim;i++)
+                p_cast[ i + j*ldp ] = kappa_d * conj( a_cast[ i*inca + j*lda ] );
+
+            for (dim_t i = panel_dim;i < panel_dim_max;i++) // zero-pad i in [panel_dim, panel_dim_max)
+                p_cast[ i + j*ldp ] = convert<T>(0.0);
+        }
+    }
+    else
+    {
+        for ( dim_t j = 0; j < panel_len; j++ )
+        {
+            auto kappa_d = kappa_cast * d_cast[ j*incd ]; // combined scale factor for index j
+
+            for (dim_t i = 0;i < panel_dim;i++)
+                p_cast[ i + j*ldp ] = kappa_d * a_cast[ i*inca + j*lda ];
+
+            for (dim_t i = panel_dim;i < panel_dim_max;i++) // zero-pad i in [panel_dim, panel_dim_max)
+                p_cast[ i + j*ldp ] = convert<T>(0.0);
+        }
+    }
+    // Zero-pad the remaining j in [panel_len, panel_len_max) across the full panel_dim_max.
+    for (dim_t j = panel_len;j < panel_len_max;j++)
+        for (dim_t i = 0;i < panel_dim_max;i++)
+            p_cast[ i + j*ldp ] = convert<T>(0.0);
+}
+
+/*
+ * Modify the object A to include information about the diagonal D,
+ * and imbue it with special function pointers which will take care
+ * of the actual work of forming (D * A^T)
+ */
+void attach_diagonal_factor( packm_diag_params_t* params, obj_t* d, obj_t* a )
+{
+    // Assumes D is a column vector, so its row stride is the distance between entries.
+    new (params) packm_diag_params_t // placement new: construct in the caller-provided storage
+    (
+      bli_obj_buffer_at_off( d ),
+      bli_obj_row_stride( d )
+    );
+
+    // Attach the parameters to the A object. NOTE(review): presumably only the pointer is stored, so *params should outlive the operation - confirm.
+    bli_obj_set_pack_params( params, a );
+}
+
+/*
+ * Implements C := alpha * A * D * A^T + beta * C
+ *
+ * where D is a diagonal matrix with elements taken from the "d" vector.
+ */
+void syrk_diag( obj_t* alpha, obj_t* a, obj_t* d, obj_t* beta, obj_t* c )
+{
+    obj_t ad; // alias of A that stands for (D * A^T) once the diagonal factor is attached
+    packm_diag_params_t params; // lives on this stack frame for the duration of the gemmt call below
+
+    bli_obj_alias_to( a, &ad );
+    bli_obj_toggle_trans( &ad ); // because gemmt is A*B instead of A*B^T
+    attach_diagonal_factor( &params, d, &ad );
+
+    // Does C := alpha * A * B + beta * C using B = (D * A^T)
+    bli_gemmtnat( alpha, a, &ad, beta, c, NULL, NULL );
+}
+
+int main()
+{
+    obj_t a;      // m x k input matrix A
+    obj_t d;      // k x 1 vector holding the diagonal entries of D
+    obj_t c;      // m x m matrix updated by syrk_diag()
+    obj_t c_copy; // m x m matrix updated by syrk_diag_ref()
+    obj_t norm;   // 1x1 real scalar receiving the norm of the difference
+
+    auto m = 10;
+    auto k = 10;
+    // Sweep every datatype, both triangles of C, and both storage orders of A and C.
+    for ( int dt_ = BLIS_DT_LO; dt_ <= BLIS_DT_HI; dt_++ )
+    for ( int upper = 0; upper <= 1; upper++ )
+    for ( int transa = 0; transa <= 1; transa++ )
+    for ( int transc = 0; transc <= 1; transc++ )
+    {
+        auto dt = ( num_t )dt_;
+        auto uplo = upper ? BLIS_UPPER : BLIS_LOWER;
+        // transa/transc select the (row stride, column stride) pair: row-major when set, column-major otherwise.
+        bli_obj_create( dt, m, k, transa ? k : 1, transa ? 1 : m, &a );
+        bli_obj_create( dt, k, 1,              1,              1, &d );
+        bli_obj_create( dt, m, m, transc ? m : 1, transc ? 1 : m, &c );
+        bli_obj_create( dt, m, m, transc ? m : 1, transc ? 1 : m, &c_copy );
+        bli_obj_set_struc( BLIS_SYMMETRIC , &c );
+        bli_obj_set_struc( BLIS_SYMMETRIC , &c_copy );
+        bli_obj_set_uplo( uplo , &c );
+        bli_obj_set_uplo( uplo , &c_copy );
+        bli_obj_create_1x1( bli_dt_proj_to_real( dt ), &norm );
+
+        bli_randm( &a );
+        bli_randm( &d );
+        bli_randm( &c );
+        bli_copym( &c, &c_copy ); // both implementations start from the same C
+
+        syrk_diag( &BLIS_ONE, &a, &d, &BLIS_ONE, &c );
+        syrk_diag_ref( &BLIS_ONE, &a, &d, &BLIS_ONE, &c_copy );
+        // Overwrite C with (C - C_copy) and reduce the difference to a single norm.
+        bli_subm( &c_copy, &c );
+        bli_normfm( &c, &norm );
+
+        double normr, normi;
+        bli_getsc( &norm, &normr, &normi );
+        // Only the real part is printed; norm was created with a real datatype.
+        printf("dt: %d, upper: %d, transa: %d, transc: %d, norm: %g\n",
+               dt, upper, transa, transc, normr);
+
+        bli_obj_free( &a );
+        bli_obj_free( &d );
+        bli_obj_free( &c );
+        bli_obj_free( &c_copy );
+        bli_obj_free( &norm );
+    }
+}
diff --git a/test/syrk_diagonal/syrk_diagonal_example2.c b/test/syrk_diagonal/syrk_diagonal_example2.c
new file mode 100644
index 000000000..92371f48b
--- /dev/null
+++ b/test/syrk_diagonal/syrk_diagonal_example2.c
@@ -0,0 +1,354 @@
+#include "syrk_diagonal_ref.h"
+
+/*
+ * Structure which includes all additional information beyond what is
+ * already stored in the obj_t structure.
+ *
+ * This structure is **read-only** during the operation!
+ */
+typedef struct packm_diag_params_t
+{
+    void* d;    // buffer holding the diagonal entries
+    inc_t incd; // stride, in elements, between consecutive entries of d
+} packm_diag_params_t;
+
+typedef void (*packm_diag_ukr_vft) // signature shared by the per-datatype diagonal-scaling pack kernels
+    (
+       bool           conja,         // conjugate a while packing
+       dim_t          panel_dim,     // number of i indices copied from a
+       dim_t          panel_len,     // number of j indices copied from a
+       dim_t          panel_dim_max, // i extent of p after zero-padding
+       dim_t          panel_len_max, // j extent of p after zero-padding
+       void* restrict kappa,         // points to one scalar of the kernel's datatype
+       void* restrict d, inc_t incd, // diagonal entries, indexed by j
+       void* restrict a, inc_t inca, inc_t lda, // source; inca steps i, lda steps j
+       void* restrict p,             inc_t ldp  // destination; unit stride in i, ldp steps j
+    );
+
+/*
+ * Declare the pack kernel type and set up an array of
+ * packing kernels, one for each data type.
+ */
+#undef GENTFUNC
+#define GENTFUNC(ctype,ch,op) \
+void PASTEMAC(ch,op) \
+    ( \
+       bool           conja, \
+       dim_t          panel_dim, \
+       dim_t          panel_len, \
+       dim_t          panel_dim_max, \
+       dim_t          panel_len_max, \
+       void* restrict kappa, \
+       void* restrict d, inc_t incd, \
+       void* restrict a, inc_t inca, inc_t lda, \
+       void* restrict p,             inc_t ldp \
+    ) \
+{ \
+	ctype* restrict a_cast     = a; \
+	ctype* restrict p_cast     = p; \
+	ctype* restrict d_cast     = d; \
+	ctype           kappa_cast = *( ctype* )kappa; \
+	/* Both branches store kappa * d(j) * a(i,j) into p(i,j); the first one conjugates a. */ \
+	if ( conja ) \
+	{ \
+		for ( dim_t j = 0; j < panel_len; j++ ) \
+		{ \
+			ctype kappa_d; /* kappa * d(j) */ \
+			PASTEMAC(ch,scal2s)( kappa_cast, d_cast[ j*incd ], kappa_d ); \
+\
+			for (dim_t i = 0;i < panel_dim;i++) \
+				PASTEMAC(ch,scal2js)( kappa_d, a_cast[ i*inca + j*lda ], p_cast[ i + j*ldp ] ); \
+\
+			for (dim_t i = panel_dim;i < panel_dim_max;i++) /* zero-pad i in [panel_dim, panel_dim_max) */ \
+				PASTEMAC(ch,set0s)( p_cast[ i + j*ldp ] ); \
+		} \
+	} \
+	else \
+	{ \
+		for ( dim_t j = 0; j < panel_len; j++ ) \
+		{ \
+			ctype kappa_d; /* kappa * d(j) */ \
+			PASTEMAC(ch,scal2s)( kappa_cast, d_cast[ j*incd ], kappa_d ); \
+\
+			for (dim_t i = 0;i < panel_dim;i++) \
+				PASTEMAC(ch,scal2s)( kappa_d, a_cast[ i*inca + j*lda ], p_cast[ i + j*ldp ] ); \
+\
+			for (dim_t i = panel_dim;i < panel_dim_max;i++) /* zero-pad i in [panel_dim, panel_dim_max) */ \
+				PASTEMAC(ch,set0s)( p_cast[ i + j*ldp ] ); \
+		} \
+	} \
+	/* Zero-pad the remaining j in [panel_len, panel_len_max) across the full panel_dim_max. */ \
+	for (dim_t j = panel_len;j < panel_len_max;j++) \
+		for (dim_t i = 0;i < panel_dim_max;i++) \
+			PASTEMAC(ch,set0s)( p_cast[ i + j*ldp ] ); \
+}
+// Expand GENTFUNC to define the kernel functions.
+INSERT_GENTFUNC_BASIC0(packm_diag_ukr);
+// Table of the kernels defined above; packm_diag() indexes it by datatype.
+static packm_diag_ukr_vft GENARRAY( packm_diag_ukrs, packm_diag_ukr );
+
+void packm_diag
+     (
+       obj_t*   a,
+       obj_t*   p,
+       cntx_t*  cntx,
+       rntm_t*  rntm,
+       cntl_t*  cntl,
+       thrinfo_t* thread
+     )
+{ // Custom pack function: sets up the packed object p for a and fills it with kappa * a * diag(d), one micropanel at a time.
+#if 1 // appears to be a hand-written expansion of the bli_packm_init() path kept in the #else branch
+
+	// We begin by copying the fields of A.
+	bli_obj_alias_to( a, p );
+
+	// Get information about data types.
+	num_t dt        = bli_obj_dt( a );
+	num_t dt_tar    = bli_obj_target_dt( a );
+	num_t dt_scalar = bli_obj_scalar_dt( a );
+	dim_t dt_size   = bli_dt_size( dt );
+	// Mixed-datatype packing is not supported by this example.
+	if ( dt_scalar != dt || dt_tar != dt )
+		bli_abort();
+
+	// Extract various fields from the control tree.
+	bszid_t bmult_id_m   = bli_cntl_packm_params_bmid_m( cntl );
+	bszid_t bmult_id_n   = bli_cntl_packm_params_bmid_n( cntl );
+	pack_t  schema       = bli_cntl_packm_params_pack_schema( cntl );
+	dim_t   bmult_m_def  = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx );
+	dim_t   bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx );
+	dim_t   bmult_n_def  = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx );
+	// Only plain row- or column-panel schemas are handled; anything else aborts.
+	if ( schema != BLIS_PACKED_ROW_PANELS &&
+	     schema != BLIS_PACKED_COL_PANELS )
+		bli_abort();
+
+	// Store the pack schema to the object.
+	bli_obj_set_pack_schema( schema, p );
+
+	// Clear the conjugation field from the object since matrix packing
+	// in BLIS is deemed to take care of all conjugation necessary.
+	bli_obj_set_conj( BLIS_NO_CONJUGATE, p );
+
+	// If we are packing micropanels, mark P as dense.
+	bli_obj_set_uplo( BLIS_DENSE, p );
+
+	// Reset the view offsets to (0,0).
+	bli_obj_set_offs( 0, 0, p );
+
+	// Compute the dimensions padded by the dimension multiples. These
+	// dimensions will be the dimensions of the packed matrices, including
+	// zero-padding, and will be used by the macro- and micro-kernels.
+	// We compute them by starting with the effective dimensions of A (now
+	// in P) and aligning them to the dimension multiples (typically equal
+	// to register blocksizes). This does waste a little bit of space for
+	// level-2 operations, but that's okay with us.
+	dim_t m_p     = bli_obj_length( p );
+	dim_t n_p     = bli_obj_width( p );
+	dim_t m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def );
+	dim_t n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def );
+
+	// Save the padded dimensions into the packed object. It is important
+	// to save these dimensions since they represent the actual dimensions
+	// of the zero-padded matrix.
+	bli_obj_set_padded_dims( m_p_pad, n_p_pad, p );
+
+	// The "panel stride" of a micropanel packed object is interpreted as
+	// the distance between the (0,0) element of panel k and the (0,0)
+	// element of panel k+1. We use the padded width computed above to
+	// allow for zero-padding (if necessary/desired) along the far end
+	// of each micropanel (ie: the right edge of the matrix). Zero-padding
+	// can also occur along the long edge of the last micropanel if the m
+	// dimension of the matrix is not a whole multiple of MR.
+	inc_t ps_p = bmult_m_pack * n_p_pad;
+
+	/* Compute the total number of iterations we'll need. */
+	dim_t n_iter = m_p_pad / bmult_m_def;
+
+	// Store the strides and panel dimension in P.
+	bli_obj_set_strides( 1, bmult_m_pack, p );
+	bli_obj_set_imag_stride( 1, p );
+	bli_obj_set_panel_dim( bmult_m_def, p );
+	bli_obj_set_panel_stride( ps_p, p );
+	bli_obj_set_panel_length( bmult_m_def, p );
+	bli_obj_set_panel_width( n_p, p );
+
+	// Compute the size of the packed buffer.
+	siz_t size_p = ps_p * n_iter * dt_size;
+	if ( size_p == 0 ) return; // nothing to pack
+
+	// Update the buffer address in p to point to the buffer associated
+	// with the mem_t entry acquired from the memory broker (now cached in
+	// the control tree node).
+	char*   p_cast         = (char*)bli_packm_alloc( size_p, rntm, cntl, thread );
+	bli_obj_set_buffer( p_cast, p );
+
+#else // disabled alternative that lets bli_packm_init() set up p
+
+	// Every thread initializes p and determines the size of memory
+	// block needed (which gets embedded into the otherwise "blank" mem_t
+	// entry in the control tree node). Return early if no packing is required.
+	if ( !bli_packm_init( a, p, cntx, rntm, cntl, thread ) )
+		return;
+
+	num_t dt       = bli_obj_dt( a );
+	dim_t dt_size  = bli_dt_size( dt );
+
+	bszid_t bmult_id_m   = bli_cntl_packm_params_bmid_m( cntl );
+	dim_t   bmult_m_def  = bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx );
+	dim_t   bmult_m_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx );
+
+	dim_t m_p     = bli_obj_length( p );
+	dim_t n_p     = bli_obj_width( p );
+	dim_t m_p_pad = bli_obj_padded_length( p );
+	dim_t n_p_pad = bli_obj_padded_width( p );
+	dim_t n_iter  = m_p_pad / bmult_m_def;
+
+	char* p_cast = bli_obj_buffer( p );
+	inc_t ps_p   = bli_obj_panel_stride( p );
+
+#endif
+	// All buffers are handled as char* below, so offsets are scaled by dt_size.
+	char*   a_cast         = bli_obj_buffer_at_off( a );
+	inc_t   inca           = bli_obj_row_stride( a );
+	inc_t   lda            = bli_obj_col_stride( a );
+	dim_t   panel_len_off  = bli_obj_col_off( a );
+	conj_t  conja          = bli_obj_conj_status( a );
+
+	packm_diag_params_t* params = bli_obj_pack_params( a ); // attached by attach_diagonal_factor()
+	char*   d_cast         = params->d;
+	inc_t   incd           = params->incd;
+
+	obj_t   kappa_local;
+	char*   kappa_cast     = bli_packm_scalar( &kappa_local, p );
+
+	packm_diag_ukr_vft packm_ker_cast = packm_diag_ukrs[ dt ];
+
+	/* Query the number of threads and thread ids from the current thread's
+	   packm thrinfo_t node. */
+	const dim_t nt  = bli_thread_n_way( thread );
+	const dim_t tid = bli_thread_work_id( thread );
+
+	/* Determine the thread range and increment using the current thread's
+	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
+	   will depend on whether slab or round-robin partitioning was requested
+	   at configure-time. */
+	dim_t it_start, it_end, it_inc; // it_inc is not used below
+	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc );
+
+	/* Iterate over every logical micropanel in the source matrix; bli_packm_my_iter() picks the ones this thread packs. */
+	for ( dim_t it  = 0; it < n_iter; it += 1 )
+	{
+		dim_t panel_dim_i = bli_min( bmult_m_def, m_p - it*bmult_m_def ); // the last panel may be short
+
+		char* d_begin     = d_cast +    panel_len_off*incd*dt_size; // same for every iteration
+		char* a_begin     = a_cast + it*  bmult_m_def*inca*dt_size;
+		char* p_begin     = p_cast + it*              ps_p*dt_size;
+
+		if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) )
+		{
+			packm_ker_cast
+			(
+			  conja,
+			  panel_dim_i,
+			  n_p,
+			  bmult_m_def,
+			  n_p_pad,
+			  kappa_cast,
+			  d_begin, incd,
+			  a_begin, inca, lda,
+			  p_begin, bmult_m_pack
+			);
+		}
+	}
+}
+
+/*
+ * Modify the object A to include information about the diagonal D,
+ * and imbue it with special function pointers which will take care
+ * of the actual work of forming (D * A^T)
+ */
+void attach_diagonal_factor( packm_diag_params_t* params, obj_t* d, obj_t* a )
+{
+	// Assumes D is a column vector, so its row stride is the distance between entries.
+	params->d    = bli_obj_buffer_at_off( d );
+	params->incd = bli_obj_row_stride( d );
+
+	// Set the custom pack function, so that packing a goes through packm_diag().
+	bli_obj_set_pack_fn( packm_diag, a );
+
+	// Attach the parameters to the A object; packm_diag() reads them back via bli_obj_pack_params().
+	bli_obj_set_pack_params( params, a );
+}
+
+/*
+ * Implements C := alpha * A * D * A^T + beta * C
+ *
+ * where D is a diagonal matrix with elements taken from the "d" vector.
+ */
+void syrk_diag( obj_t* alpha, obj_t* a, obj_t* d, obj_t* beta, obj_t* c )
+{
+	obj_t ad; // alias of A that stands for (D * A^T) once the diagonal factor is attached
+	packm_diag_params_t params; // lives on this stack frame for the duration of the gemmt call below
+
+	bli_obj_alias_to( a, &ad );
+	bli_obj_toggle_trans( &ad ); // because gemmt is A*B instead of A*B^T
+	attach_diagonal_factor( &params, d, &ad );
+
+	// Does C := alpha * A * B + beta * C using B = (D * A^T)
+	bli_gemmt( alpha, a, &ad, beta, c );
+}
+
+int main( void )
+{
+	obj_t a;      // m x k input matrix A
+	obj_t d;      // k x 1 vector holding the diagonal entries of D
+	obj_t c;      // m x m matrix updated by syrk_diag()
+	obj_t c_copy; // m x m matrix updated by syrk_diag_ref()
+	obj_t norm;   // 1x1 real scalar receiving the norm of the difference
+
+	dim_t m = 10;
+	dim_t k = 10;
+	// Sweep every datatype, both triangles of C, and both storage orders of A and C.
+	for ( int dt_ = BLIS_DT_LO; dt_ <= BLIS_DT_HI; dt_++ )
+	for ( int upper = 0; upper <= 1; upper++ )
+	for ( int transa = 0; transa <= 1; transa++ )
+	for ( int transc = 0; transc <= 1; transc++ )
+	{
+		num_t dt = dt_;
+		uplo_t uplo = upper ? BLIS_UPPER : BLIS_LOWER;
+		// transa/transc select the (row stride, column stride) pair: row-major when set, column-major otherwise.
+		bli_obj_create( dt, m, k, transa ? k : 1, transa ? 1 : m, &a );
+		bli_obj_create( dt, k, 1,              1,          1,     &d );
+		bli_obj_create( dt, m, m, transc ? m : 1, transc ? 1 : m, &c );
+		bli_obj_create( dt, m, m, transc ? m : 1, transc ? 1 : m, &c_copy );
+		bli_obj_set_struc( BLIS_SYMMETRIC , &c );
+		bli_obj_set_struc( BLIS_SYMMETRIC , &c_copy );
+		bli_obj_set_uplo( uplo , &c );
+		bli_obj_set_uplo( uplo , &c_copy );
+		bli_obj_create_1x1( bli_dt_proj_to_real( dt ), &norm );
+
+		bli_randm( &a );
+		bli_randm( &d );
+		bli_randm( &c );
+		bli_copym( &c, &c_copy ); // both implementations start from the same C
+
+		syrk_diag( &BLIS_ONE, &a, &d, &BLIS_ONE, &c );
+		syrk_diag_ref( &BLIS_ONE, &a, &d, &BLIS_ONE, &c_copy );
+		// Overwrite C with (C - C_copy) and reduce the difference to a single norm.
+		bli_subm( &c_copy, &c );
+		bli_normfm( &c, &norm );
+
+		double normr, normi;
+		bli_getsc( &norm, &normr, &normi );
+		// Only the real part is printed; norm was created with a real datatype.
+		printf( "dt: %d, upper: %d, transa: %d, transc: %d, norm: %g\n",
+		        dt, upper, transa, transc, normr );
+
+		bli_obj_free( &a );
+		bli_obj_free( &d );
+		bli_obj_free( &c );
+		bli_obj_free( &c_copy );
+		bli_obj_free( &norm );
+	}
+}
diff --git a/test/syrk_diagonal/syrk_diagonal_example2.cxx b/test/syrk_diagonal/syrk_diagonal_example2.cxx
new file mode 100644
index 000000000..8312a07ee
--- /dev/null
+++ b/test/syrk_diagonal/syrk_diagonal_example2.cxx
@@ -0,0 +1,338 @@
+#include "syrk_diagonal_ref.h"
+
+/*
+ * Forward-declare the pack kernel type and set up an array of
+ * packing kernels, one for each data type.
+ */
+template <typename T>
+void packm_diag_ukr
+    (
+       bool           conja,
+       dim_t          panel_dim,
+       dim_t          panel_len,
+       dim_t          panel_dim_max,
+       dim_t          panel_len_max,
+       void* restrict kappa,
+       void* restrict d, inc_t incd,
+       void* restrict a, inc_t inca, inc_t lda,
+       void* restrict p,             inc_t ldp
+    );
+
+#undef GENTFUNC
+#define GENTFUNC(ctype,ch,op) \
+static auto PASTEMAC(ch,op) = &packm_diag_ukr<ctype>;
+// Each GENTFUNC expansion defines one static function pointer bound to packm_diag_ukr<ctype>.
+INSERT_GENTFUNC_BASIC0(packm_diag_ukr);
+// The kernel signature does not depend on T, so any instantiation (here void) names the pointer type.
+using packm_diag_ukr_vft = decltype(&packm_diag_ukr<void>);
+static packm_diag_ukr_vft GENARRAY( packm_diag_ukrs, packm_diag_ukr ); // table indexed by datatype in packm_diag()
+
+/*
+ * Structure which includes all additional information beyond what is
+ * already stored in the obj_t structure.
+ *
+ * This structure is **read-only** during the operation!
+ */
+struct packm_diag_params_t
+{
+    void* d;    // buffer holding the diagonal entries
+    inc_t incd; // stride, in elements, between consecutive entries of d
+
+    packm_diag_params_t() {} // leaves d and incd uninitialized
+
+    packm_diag_params_t( void* d, inc_t incd )
+    : d(d), incd(incd) {}
+};
+
+/*
+ * Selecting a different kernel based on the current architecture is
+ * currently not possible, but is something we plan to support.
+ */
+template <typename T>
+void packm_diag_ukr
+    (
+       bool           conja,
+       dim_t          panel_dim,
+       dim_t          panel_len,
+       dim_t          panel_dim_max,
+       dim_t          panel_len_max,
+       void* restrict kappa,
+       void* restrict d, inc_t incd,
+       void* restrict a, inc_t inca, inc_t lda,
+       void* restrict p,             inc_t ldp
+    )
+{ // Packs p(i,j) = kappa * d(j) * a(i,j), conjugating a when conja is set, then zero-pads p to panel_dim_max x panel_len_max.
+    T* restrict a_cast     = ( T* )a;
+    T* restrict p_cast     = ( T* )p;
+    T* restrict d_cast     = ( T* )d;
+    auto        kappa_cast = *( T* )kappa;
+
+    if ( conja )
+    {
+        for ( dim_t j = 0; j < panel_len; j++ )
+        {
+            auto kappa_d = kappa_cast * d_cast[ j*incd ]; // combined scale factor for index j
+
+            for (dim_t i = 0;i < panel_dim;i++)
+                p_cast[ i + j*ldp ] = kappa_d * conj( a_cast[ i*inca + j*lda ] );
+
+            for (dim_t i = panel_dim;i < panel_dim_max;i++) // zero-pad i in [panel_dim, panel_dim_max)
+                p_cast[ i + j*ldp ] = convert<T>(0.0);
+        }
+    }
+    else
+    {
+        for ( dim_t j = 0; j < panel_len; j++ )
+        {
+            auto kappa_d = kappa_cast * d_cast[ j*incd ]; // combined scale factor for index j
+
+            for (dim_t i = 0;i < panel_dim;i++)
+                p_cast[ i + j*ldp ] = kappa_d * a_cast[ i*inca + j*lda ];
+
+            for (dim_t i = panel_dim;i < panel_dim_max;i++) // zero-pad i in [panel_dim, panel_dim_max)
+                p_cast[ i + j*ldp ] = convert<T>(0.0);
+        }
+    }
+    // Zero-pad the remaining j in [panel_len, panel_len_max) across the full panel_dim_max.
+    for (dim_t j = panel_len;j < panel_len_max;j++)
+        for (dim_t i = 0;i < panel_dim_max;i++)
+            p_cast[ i + j*ldp ] = convert<T>(0.0);
+}
+
+void packm_diag
+     (
+       obj_t*   a,
+       obj_t*   p,
+       cntx_t*  cntx,
+       rntm_t*  rntm,
+       cntl_t*  cntl,
+       thrinfo_t* thread
+     )
+{ // Custom pack function: sets up the packed object p for a and fills it with kappa * a * diag(d), one micropanel at a time.
+	// We begin by copying the fields of A.
+	bli_obj_alias_to( a, p );
+
+    // Get information about data types.
+	num_t dt        = bli_obj_dt( a );
+	num_t dt_tar    = bli_obj_target_dt( a );
+	num_t dt_scalar = bli_obj_scalar_dt( a );
+	dim_t dt_size   = bli_dt_size( dt );
+	// Mixed-datatype packing is not supported by this example.
+	if ( dt_scalar != dt || dt_tar != dt )
+       bli_abort();
+
+	// Extract various fields from the control tree.
+	bszid_t bmult_id_m   = bli_cntl_packm_params_bmid_m( cntl );
+	bszid_t bmult_id_n   = bli_cntl_packm_params_bmid_n( cntl );
+	pack_t  schema       = bli_cntl_packm_params_pack_schema( cntl );
+	dim_t   bmult_m_def  = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx );
+	dim_t   bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx );
+	dim_t   bmult_n_def  = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx );
+	// Only plain row- or column-panel schemas are handled; anything else aborts.
+    if ( schema != BLIS_PACKED_ROW_PANELS &&
+         schema != BLIS_PACKED_COL_PANELS )
+       bli_abort();
+
+	// Store the pack schema to the object.
+	bli_obj_set_pack_schema( schema, p );
+
+	// Clear the conjugation field from the object since matrix packing
+	// in BLIS is deemed to take care of all conjugation necessary.
+	bli_obj_set_conj( BLIS_NO_CONJUGATE, p );
+
+	// If we are packing micropanels, mark P as dense.
+	bli_obj_set_uplo( BLIS_DENSE, p );
+
+	// Reset the view offsets to (0,0).
+	bli_obj_set_offs( 0, 0, p );
+
+	// Compute the dimensions padded by the dimension multiples. These
+	// dimensions will be the dimensions of the packed matrices, including
+	// zero-padding, and will be used by the macro- and micro-kernels.
+	// We compute them by starting with the effective dimensions of A (now
+	// in P) and aligning them to the dimension multiples (typically equal
+	// to register blocksizes). This does waste a little bit of space for
+	// level-2 operations, but that's okay with us.
+	dim_t m_p     = bli_obj_length( p );
+	dim_t n_p     = bli_obj_width( p );
+	dim_t m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def );
+	dim_t n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def );
+
+	// Save the padded dimensions into the packed object. It is important
+	// to save these dimensions since they represent the actual dimensions
+	// of the zero-padded matrix.
+	bli_obj_set_padded_dims( m_p_pad, n_p_pad, p );
+
+	// The "panel stride" of a micropanel packed object is interpreted as
+	// the distance between the (0,0) element of panel k and the (0,0)
+	// element of panel k+1. We use the padded width computed above to
+	// allow for zero-padding (if necessary/desired) along the far end
+	// of each micropanel (ie: the right edge of the matrix). Zero-padding
+	// can also occur along the long edge of the last micropanel if the m
+	// dimension of the matrix is not a whole multiple of MR.
+	inc_t ps_p = bmult_m_pack * n_p_pad;
+
+	/* Compute the total number of iterations we'll need. */
+	dim_t n_iter = m_p_pad / bmult_m_def;
+
+	// Store the strides and panel dimension in P.
+	bli_obj_set_strides( 1, bmult_m_pack, p );
+	bli_obj_set_imag_stride( 1, p );
+	bli_obj_set_panel_dim( bmult_m_def, p );
+	bli_obj_set_panel_stride( ps_p, p );
+	bli_obj_set_panel_length( bmult_m_def, p );
+	bli_obj_set_panel_width( n_p, p );
+
+	// Compute the size of the packed buffer.
+	siz_t size_p = ps_p * n_iter * dt_size;
+	if ( size_p == 0 ) return; // nothing to pack
+
+	// Update the buffer address in p to point to the buffer associated
+	// with the mem_t entry acquired from the memory broker (now cached in
+	// the control tree node).
+	char*   p_cast         = (char*)bli_packm_alloc( size_p, rntm, cntl, thread );
+	bli_obj_set_buffer( p_cast, p );
+	// All buffers are handled as char* below, so offsets are scaled by dt_size.
+	char*   a_cast         = (char*)bli_obj_buffer_at_off( a );
+	inc_t   inca           = bli_obj_row_stride( a );
+	inc_t   lda            = bli_obj_col_stride( a );
+	dim_t   panel_len_off  = bli_obj_col_off( a );
+	conj_t  conja          = bli_obj_conj_status( a );
+
+    auto    params         = (packm_diag_params_t*)bli_obj_pack_params( a ); // attached by attach_diagonal_factor()
+    char*   d_cast         = (char*)params->d;
+    inc_t   incd           = params->incd;
+
+	obj_t   kappa_local;
+	char*   kappa_cast     = (char*)bli_packm_scalar( &kappa_local, p );
+
+	auto    packm_ker_cast = packm_diag_ukrs[ dt ];
+
+	/* Query the number of threads and thread ids from the current thread's
+	   packm thrinfo_t node. */
+	const dim_t nt  = bli_thread_n_way( thread );
+	const dim_t tid = bli_thread_work_id( thread );
+
+	/* Determine the thread range and increment using the current thread's
+	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
+	   will depend on whether slab or round-robin partitioning was requested
+	   at configure-time. */
+	dim_t it_start, it_end, it_inc; // it_inc is not used below
+	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc );
+
+	/* Iterate over every logical micropanel in the source matrix; bli_packm_my_iter() picks the ones this thread packs. */
+	for ( dim_t it  = 0; it < n_iter; it += 1 )
+	{
+		dim_t panel_dim_i = bli_min( bmult_m_def, m_p - it*bmult_m_def ); // the last panel may be short
+
+        char* d_begin     = d_cast +    panel_len_off*incd*dt_size; // same for every iteration
+		char* a_begin     = a_cast + it*  bmult_m_def*inca*dt_size;
+	    char* p_begin     = p_cast + it*              ps_p*dt_size;
+
+		if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) )
+		{
+    		packm_ker_cast( conja,
+                            panel_dim_i,
+    		                n_p,
+    		                bmult_m_def,
+    		                n_p_pad,
+    		                kappa_cast,
+                            d_begin, incd,
+    		                a_begin, inca, lda,
+    		                p_begin, bmult_m_pack );
+        }
+	}
+}
+
+/*
+ * Modify the object A to include information about the diagonal D,
+ * and imbue it with special function pointers which will take care
+ * of the actual work of forming (D * A^T)
+ */
+void attach_diagonal_factor( packm_diag_params_t* params, obj_t* d, obj_t* a )
+{
+    // Assumes D is a column vector, so its row stride is the distance between entries.
+    new (params) packm_diag_params_t // placement new: construct in the caller-provided storage
+    (
+      bli_obj_buffer_at_off( d ),
+      bli_obj_row_stride( d )
+    );
+
+    // Set the custom pack function, so that packing a goes through packm_diag().
+    bli_obj_set_pack_fn( packm_diag, a );
+
+    // Attach the parameters to the A object; packm_diag() reads them back via bli_obj_pack_params().
+    bli_obj_set_pack_params( params, a );
+}
+
+/*
+ * Implements C := alpha * A * D * A^T + beta * C
+ *
+ * where D is a diagonal matrix with elements taken from the "d" vector.
+ */
+void syrk_diag( obj_t* alpha, obj_t* a, obj_t* d, obj_t* beta, obj_t* c )
+{
+    obj_t ad; // alias of A that stands for (D * A^T) once the diagonal factor is attached
+    packm_diag_params_t params; // lives on this stack frame for the duration of the gemmt call below
+
+    bli_obj_alias_to( a, &ad );
+    bli_obj_toggle_trans( &ad ); // because gemmt is A*B instead of A*B^T
+    attach_diagonal_factor( &params, d, &ad );
+
+    // Does C := alpha * A * B + beta * C using B = (D * A^T)
+    bli_gemmt( alpha, a, &ad, beta, c );
+}
+
+int main()
+{
+    obj_t a;      // m x k input matrix A
+    obj_t d;      // k x 1 vector holding the diagonal entries of D
+    obj_t c;      // m x m matrix updated by syrk_diag()
+    obj_t c_copy; // m x m matrix updated by syrk_diag_ref()
+    obj_t norm;   // 1x1 real scalar receiving the norm of the difference
+
+    auto m = 10;
+    auto k = 10;
+    // Sweep every datatype, both triangles of C, and both storage orders of A and C.
+    for ( int dt_ = BLIS_DT_LO; dt_ <= BLIS_DT_HI; dt_++ )
+    for ( int upper = 0; upper <= 1; upper++ )
+    for ( int transa = 0; transa <= 1; transa++ )
+    for ( int transc = 0; transc <= 1; transc++ )
+    {
+        auto dt = ( num_t )dt_;
+        auto uplo = upper ? BLIS_UPPER : BLIS_LOWER;
+        // transa/transc select the (row stride, column stride) pair: row-major when set, column-major otherwise.
+        bli_obj_create( dt, m, k, transa ? k : 1, transa ? 1 : m, &a );
+        bli_obj_create( dt, k, 1,              1,              1, &d );
+        bli_obj_create( dt, m, m, transc ? m : 1, transc ? 1 : m, &c );
+        bli_obj_create( dt, m, m, transc ? m : 1, transc ? 1 : m, &c_copy );
+        bli_obj_set_struc( BLIS_SYMMETRIC , &c );
+        bli_obj_set_struc( BLIS_SYMMETRIC , &c_copy );
+        bli_obj_set_uplo( uplo , &c );
+        bli_obj_set_uplo( uplo , &c_copy );
+        bli_obj_create_1x1( bli_dt_proj_to_real( dt ), &norm );
+
+        bli_randm( &a );
+        bli_randm( &d );
+        bli_randm( &c );
+        bli_copym( &c, &c_copy ); // both implementations start from the same C
+
+        syrk_diag( &BLIS_ONE, &a, &d, &BLIS_ONE, &c );
+        syrk_diag_ref( &BLIS_ONE, &a, &d, &BLIS_ONE, &c_copy );
+        // Overwrite C with (C - C_copy) and reduce the difference to a single norm.
+        bli_subm( &c_copy, &c );
+        bli_normfm( &c, &norm );
+
+        double normr, normi;
+        bli_getsc( &norm, &normr, &normi );
+        // Only the real part is printed; norm was created with a real datatype.
+        printf("dt: %d, upper: %d, transa: %d, transc: %d, norm: %g\n",
+               dt, upper, transa, transc, normr);
+
+        bli_obj_free( &a );
+        bli_obj_free( &d );
+        bli_obj_free( &c );
+        bli_obj_free( &c_copy );
+        bli_obj_free( &norm );
+    }
+}
diff --git a/test/syrk_diagonal/syrk_diagonal_ref.cxx b/test/syrk_diagonal/syrk_diagonal_ref.cxx
new file mode 100644
index 000000000..1d7c5d96e
--- /dev/null
+++ b/test/syrk_diagonal/syrk_diagonal_ref.cxx
@@ -0,0 +1,102 @@
+#include "syrk_diagonal_ref.h"
+#include "complex_math.hpp"
+
+typedef void (*syrk_diag_ref_vft) // type-erased signature shared by every syrk_diag_ref<T> instantiation
+    (
+        uplo_t uplo,                     // which triangle of c is updated
+        dim_t m,                         // order of c; first dimension of a
+        dim_t k,                         // second dimension of a; length of d
+        void* alpha,                     // points to one scalar of the kernel's datatype
+        void* a, inc_t rs_a, inc_t cs_a, // a(i,p) is read at i*rs_a + p*cs_a
+        void* d, inc_t incd,             // d(p) is read at p*incd
+        void* beta,                      // points to one scalar of the kernel's datatype
+        void* c, inc_t rs_c, inc_t cs_c  // c(i,j) lives at i*rs_c + j*cs_c
+    );
+
+template <typename T>
+void syrk_diag_ref
+    (
+        uplo_t uplo,
+        dim_t m,
+        dim_t k,
+        void* alpha,
+        void* a, inc_t rs_a, inc_t cs_a,
+        void* d, inc_t incd,
+        void* beta,
+        void* c, inc_t rs_c, inc_t cs_c
+    )
+{ // Straightforward triple loop: c(i,j) = alpha * sum_p a(i,p)*d(p)*a(j,p) + beta * c(i,j) over one triangle of c.
+    auto alpha_cast = *( T* )alpha;
+    auto beta_cast  = *( T* )beta;
+    auto a_cast     = ( T* )a;
+    auto d_cast     = ( T* )d;
+    auto c_cast     = ( T* )c;
+
+    for ( dim_t i = 0; i < m; i++ )
+    {
+        dim_t j_min = uplo == BLIS_UPPER ? i : 0;   // upper: j in [i, m)
+        dim_t j_max = uplo == BLIS_UPPER ? m : i+1; // otherwise: j in [0, i]
+
+        for ( dim_t j = j_min; j < j_max; j++ )
+        {
+            auto ada = convert<T>(0.0); // accumulates (A * D * A^T)(i,j)
+
+            for ( dim_t p = 0; p < k; p++ )
+            {
+                ada += a_cast[ i*rs_a + p*cs_a ] *
+                       d_cast[          p*incd ] *
+                       a_cast[ j*rs_a + p*cs_a ];
+            }
+
+            if ( beta_cast == convert<T>(0.0) ) // beta == 0: overwrite c without reading it
+            {
+                c_cast[ i*rs_c + j*cs_c ] = alpha_cast * ada;
+            }
+            else
+            {
+                c_cast[ i*rs_c + j*cs_c ] = alpha_cast * ada +
+                                             beta_cast * c_cast[ i*rs_c + j*cs_c ];
+            }
+        }
+    }
+}
+
+#undef GENTFUNC
+#define GENTFUNC(ctype,ch,op) \
+static auto PASTEMAC(ch,op) = &syrk_diag_ref<ctype>;
+// Each GENTFUNC expansion defines one static function pointer bound to syrk_diag_ref<ctype>.
+INSERT_GENTFUNC_BASIC0(syrk_diag_ref);
+// Table of the pointers above; the object-based wrapper below indexes it by datatype.
+static syrk_diag_ref_vft GENARRAY( syrk_diag_ref_impl, syrk_diag_ref );
+
+void syrk_diag_ref( obj_t* alpha, obj_t* a, obj_t* d, obj_t* beta, obj_t* c )
+{ // Object-based entry point: unpacks the obj_t arguments and dispatches to the typed reference kernel.
+    num_t dt = bli_obj_dt( a ); // the datatype of a selects the kernel and the scalar buffers
+
+    dim_t m = bli_obj_length_after_trans( a );
+    dim_t k = bli_obj_width_after_trans( a );
+
+    inc_t rs_a = bli_obj_row_stride( a );
+    inc_t cs_a = bli_obj_col_stride( a );
+    inc_t rs_c = bli_obj_row_stride( c );
+    inc_t cs_c = bli_obj_col_stride( c );
+    inc_t incd = bli_obj_row_stride( d ); // d is treated as a column vector
+    // A transposed object is handled by swapping its strides, so the kernel needs no transpose logic.
+    if ( bli_obj_has_trans( a ) )
+        bli_swap_incs( &rs_a, &cs_a );
+
+    if ( bli_obj_has_trans( c ) )
+        bli_swap_incs( &rs_c, &cs_c );
+
+    syrk_diag_ref_impl[ dt ]
+    (
+      bli_obj_uplo( c ),
+      m, k,
+      bli_obj_buffer_for_1x1( dt, alpha ),
+      bli_obj_buffer_at_off( a ), rs_a, cs_a,
+      bli_obj_buffer_at_off( d ), incd,
+      bli_obj_buffer_for_1x1( dt, beta ),
+      bli_obj_buffer_at_off( c ), rs_c, cs_c
+    );
+}
+
diff --git a/test/syrk_diagonal/syrk_diagonal_ref.h b/test/syrk_diagonal/syrk_diagonal_ref.h
new file mode 100644
index 000000000..a6864caec
--- /dev/null
+++ b/test/syrk_diagonal/syrk_diagonal_ref.h
@@ -0,0 +1,8 @@
+#include "blis.h"
+
+#ifdef __cplusplus
+#include "complex_math.hpp"
+extern "C"
+#endif
+void syrk_diag_ref( obj_t* alpha, obj_t* a, obj_t* d, obj_t* beta, obj_t* c );
+
diff --git a/test/tensor_contraction/complex_math.hpp b/test/tensor_contraction/complex_math.hpp
new file mode 100644
index 000000000..9c68e730a
--- /dev/null
+++ b/test/tensor_contraction/complex_math.hpp
@@ -0,0 +1,267 @@
+#include <cmath>
+#include <algorithm>
+#include <type_traits>
+
+#include "blis.h"
+
+// True exactly for the two BLIS complex structs, scomplex and dcomplex.
+template <typename T>
+struct is_complex
+: std::integral_constant<bool, std::is_same<T,scomplex>::value ||
+                               std::is_same<T,dcomplex>::value>
+{};
+
+// "Real" means "not is_complex": true for every other type, arithmetic or not.
+template <typename T>
+struct is_real
+: std::integral_constant<bool, !is_complex<T>::value> {};
+
+template <typename T> struct make_complex;
+// Maps a real or complex scalar type to the complex struct of the same precision.
+template <> struct make_complex<float   > { using type = scomplex; };
+template <> struct make_complex<double  > { using type = dcomplex; };
+template <> struct make_complex<scomplex> { using type = scomplex; };
+template <> struct make_complex<dcomplex> { using type = dcomplex; };
+// Shorthand for make_complex<T>::type; only the four specializations above are defined.
+template <typename T>
+using make_complex_t = typename make_complex<T>::type;
+
+template <typename T> struct make_real;
+// Maps a real or complex scalar type to the real type of the same precision.
+template <> struct make_real<float   > { using type = float; };
+template <> struct make_real<double  > { using type = double; };
+template <> struct make_real<scomplex> { using type = float; };
+template <> struct make_real<dcomplex> { using type = double; };
+// Shorthand for make_real<T>::type; only the four specializations above are defined.
+template <typename T>
+using make_real_t = typename make_real<T>::type;
+
+template <typename T, bool Cond>
+struct make_complex_if : std::conditional<Cond,make_complex_t<T>,make_real_t<T>> {}; // complex form of T when Cond is true, real form otherwise
+// Shorthand for make_complex_if<T,Cond>::type.
+template <typename T, bool Cond>
+using make_complex_if_t = typename make_complex_if<T,Cond>::type;
+
+template <typename T>
+struct real_imag_part // placeholder object; it is the return type of imag() for arithmetic T
+{
+    real_imag_part& operator=(T) { return *this; } // assigning to it discards the value
+
+    operator T() const { return T(); } // reading it yields a value-initialized T
+};
+
+template <typename T>
+std::enable_if_t<std::is_arithmetic<typename std::remove_cv<T>::type>::value,T&> real(T& x) { return x; } // an arithmetic value is its own real part
+// imag() of an arithmetic value returns a real_imag_part<T> placeholder by value, not a reference into x.
+template <typename T>
+std::enable_if_t<std::is_arithmetic<T>::value,real_imag_part<T>> imag(T x) { return {}; }
+// Mutable access to the .real / .imag members of the BLIS complex structs.
+inline float& real(scomplex& x) { return x.real; }
+
+inline float& imag(scomplex& x) { return x.imag; }
+
+inline double& real(dcomplex& x) { return x.real; }
+
+inline double& imag(dcomplex& x) { return x.imag; }
+// Read-only counterparts for const complex values.
+inline const float& real(const scomplex& x) { return x.real; }
+
+inline const float& imag(const scomplex& x) { return x.imag; }
+
+inline const double& real(const dcomplex& x) { return x.real; }
+
+inline const double& imag(const dcomplex& x) { return x.imag; }
+
+template <typename T>
+std::enable_if_t<is_real<T>::value,T> conj(T x) { return x; } // conjugation is the identity for non-complex types
+// Complex overload: same real part, negated imaginary part.
+template <typename T>
+std::enable_if_t<is_complex<T>::value,T> conj(const T& x) { return {x.real, -x.imag}; }
+
+template <typename T, typename U, typename=void>
+struct convert_impl; // functor writing a T value into a U; one partial specialization per real/complex pairing
+// real -> real: plain assignment.
+template <typename T, typename U>
+struct convert_impl<T, U, std::enable_if_t<is_real<T>::value && is_real<U>::value>>
+{
+    void operator()(T x, U& y) const { y = x; }
+};
+// real -> complex: the imaginary part is set to zero.
+template <typename T, typename U>
+struct convert_impl<T, U, std::enable_if_t<is_real<T>::value && is_complex<U>::value>>
+{
+    void operator()(T x, U& y) const { y.real = x; y.imag = 0; }
+};
+// complex -> real: the imaginary part is dropped.
+template <typename T, typename U>
+struct convert_impl<T, U, std::enable_if_t<is_complex<T>::value && is_real<U>::value>>
+{
+    void operator()(T x, U& y) const { y = x.real; }
+};
+// complex -> complex: member-wise copy, converting each part to U's member type.
+template <typename T, typename U>
+struct convert_impl<T, U, std::enable_if_t<is_complex<T>::value && is_complex<U>::value>>
+{
+    void operator()(T x, U& y) const { y.real = x.real; y.imag = x.imag; }
+};
+// convert<U>(x): return x converted to U; U is given explicitly and T is deduced from the argument.
+template <typename U, typename T>
+U convert(T x)
+{
+    U y; // left uninitialized here; convert_impl assigns it below
+    convert_impl<T,U>{}(x,y);
+    return y;
+}
+
+template <typename U, typename T>
+auto convert_prec(T x) -> make_complex_if_t<U,is_complex<T>::value> // changes precision to that of U while keeping x's real/complex domain
+{
+    return convert<make_complex_if_t<U,is_complex<T>::value>>(x);
+}
+
+#define COMPLEX_MATH_OPS(rtype, ctype) \
+/* Comparison, arithmetic and compound-assignment operators mixing the complex struct ctype (.real/.imag) with its real type rtype. */ \
+inline bool operator==(rtype x, ctype y) \
+{ \
+    return x == y.real && y.imag == 0; \
+} \
+\
+inline bool operator==(ctype x, rtype y) \
+{ \
+    return y == x.real && x.imag == 0; \
+} \
+\
+inline bool operator==(ctype x, ctype y) \
+{ \
+    return x.real == y.real && \
+           x.imag == y.imag; \
+} \
+/* Unary negation. */ \
+inline ctype operator-(ctype x) \
+{ \
+    return {-x.real, -x.imag}; \
+} \
+/* Addition: a real operand only affects the real part. */ \
+inline ctype operator+(rtype x, ctype y) \
+{ \
+    return {x+y.real, y.imag}; \
+} \
+\
+inline ctype operator+(ctype x, rtype y) \
+{ \
+    return {y+x.real, x.imag}; \
+} \
+\
+inline ctype operator+(ctype x, ctype y) \
+{ \
+    return {x.real+y.real, x.imag+y.imag}; \
+} \
+/* Subtraction. */ \
+inline ctype operator-(rtype x, ctype y) \
+{ \
+    return {x-y.real, -y.imag}; \
+} \
+\
+inline ctype operator-(ctype x, rtype y) \
+{ \
+    return {x.real-y, x.imag}; \
+} \
+\
+inline ctype operator-(ctype x, ctype y) \
+{ \
+    return {x.real-y.real, x.imag-y.imag}; \
+} \
+/* Multiplication. */ \
+inline ctype operator*(rtype x, ctype y) \
+{ \
+    return {x*y.real, x*y.imag}; \
+} \
+\
+inline ctype operator*(ctype x, rtype y) \
+{ \
+    return {y*x.real, y*x.imag}; \
+} \
+\
+inline ctype operator*(ctype x, ctype y) \
+{ \
+    return {x.real*y.real - x.imag*y.imag, \
+            x.real*y.imag + x.imag*y.real}; \
+} \
+/* Division by a complex y: conj(y) is pre-scaled by 2^-n, n being the binary exponent of y's larger component, before dividing by the equally scaled |y|^2. ilogb() returns FP_ILOGB0 / FP_ILOGBNAN (possibly INT_MIN) for a zero or NaN argument, so n falls back to 0 there to keep -n well defined. */ \
+inline ctype operator/(rtype x, ctype y) \
+{ \
+    auto scale = std::max(std::abs(y.real), std::abs(y.imag)); \
+    auto n = (scale > 0 && std::isfinite(scale)) ? std::ilogb(scale) : 0; \
+    auto yrs = std::scalbn(y.real, -n); \
+    auto yis = std::scalbn(y.imag, -n); \
+    auto denom = y.real*yrs + y.imag*yis; \
+    return {x*yrs/denom, -x*yis/denom}; \
+} \
+\
+inline ctype operator/(ctype x, rtype y) \
+{ \
+    return {x.real/y, x.imag/y}; \
+} \
+/* Same scaling and the same zero/NaN guard as the rtype / ctype overload. */ \
+inline ctype operator/(ctype x, ctype y) \
+{ \
+    auto scale = std::max(std::abs(y.real), std::abs(y.imag)); \
+    auto n = (scale > 0 && std::isfinite(scale)) ? std::ilogb(scale) : 0; \
+    auto yrs = std::scalbn(y.real, -n); \
+    auto yis = std::scalbn(y.imag, -n); \
+    auto denom = y.real*yrs + y.imag*yis; \
+    return {(x.real*yrs + x.imag*yis)/denom, \
+            (x.imag*yrs - x.real*yis)/denom}; \
+} \
+/* Compound assignments; each returns a reference to the updated left operand. */ \
+inline ctype& operator+=(ctype& x, rtype y) \
+{ \
+    x.real += y; \
+    return x; \
+} \
+\
+inline ctype& operator+=(ctype& x, ctype y) \
+{ \
+    x.real += y.real; x.imag += y.imag; \
+    return x; \
+} \
+\
+inline ctype& operator-=(ctype& x, rtype y) \
+{ \
+    x.real -= y; \
+    return x; \
+} \
+\
+inline ctype& operator-=(ctype& x, ctype y) \
+{ \
+    x.real -= y.real; x.imag -= y.imag; \
+    return x; \
+} \
+\
+inline ctype& operator*=(ctype& x, rtype y) \
+{ \
+    x.real *= y; x.imag *= y; \
+    return x; \
+} \
+\
+inline ctype& operator*=(ctype& x, ctype y) \
+{ \
+    x = x * y; \
+    return x; \
+} \
+\
+inline ctype& operator/=(ctype& x, rtype y) \
+{ \
+    x.real /= y; x.imag /= y; \
+    return x; \
+} \
+\
+inline ctype& operator/=(ctype& x, ctype y) \
+{ \
+    x = x / y; \
+    return x; \
+}
+// Instantiate the operator set for both precisions.
+COMPLEX_MATH_OPS(float,  scomplex);
+COMPLEX_MATH_OPS(double, dcomplex);
+
diff --git a/test/tensor_contraction/tcontract_example.cxx b/test/tensor_contraction/tcontract_example.cxx
new file mode 100644
index 000000000..0b935c54d
--- /dev/null
+++ b/test/tensor_contraction/tcontract_example.cxx
@@ -0,0 +1,988 @@
+
+#include "tcontract_ref.hpp"
+
+#include <algorithm>
+#include <numeric>
+
+static constexpr dim_t BS_K = 8; // block size of the column block-scatter vectors: packm_ckx_nb consumes bsa/scata in steps of BS_K
+
+struct packm_tensor_params_t // dimension count plus length/stride arrays for the m side and the n side of an operand
+{
+    gint_t ndim_m = 0, ndim_n = 0;
+    const dim_t *len_m = nullptr, *len_n = nullptr;
+    const inc_t *stride_m = nullptr, *stride_n = nullptr;
+    // Default state is "no dimensions, null arrays" instead of indeterminate members.
+    packm_tensor_params_t() = default;
+    // Non-owning: only the pointers are stored, so the arrays must outlive this object.
+    packm_tensor_params_t( gint_t ndim_m, const dim_t* len_m, const inc_t* stride_m,
+                           gint_t ndim_n, const dim_t* len_n, const inc_t* stride_n )
+    : ndim_m(ndim_m), ndim_n(ndim_n),
+      len_m(len_m), len_n(len_n),
+      stride_m(stride_m), stride_n(stride_n) {}
+};
+
+using gemm_tensor_params_t = packm_tensor_params_t;
+
+template <typename T>
+void packm_ckx_nb // pack one micropanel whose source columns are located through block-scatter vectors
+    (
+       bool  conja,          // conjugate each source element with conj() while copying
+       dim_t panel_dim,      // elements copied per packed column
+       dim_t panel_len,      // source columns to pack
+       dim_t panel_dim_max,  // each packed column is zero-filled from panel_dim up to this index
+       dim_t panel_len_max,  // columns panel_len .. panel_len_max-1 are written as zeros
+       void* kappa,          // points to a T; every copied element is multiplied by it
+       void* a, inc_t inca, inc_t* bsa, inc_t* scata, // source base, element stride within a column, per-block stride and per-column offset arrays
+       void* p, inc_t ldp    // destination base and distance between packed columns
+    )
+{
+    T* restrict a_cast     = ( T* )a;
+    T* restrict p_cast     = ( T* )p;
+    auto        kappa_cast = *( T* )kappa;
+    // Loop counters are dim_t, matching the bounds and the sibling packm_ckx_ss kernel. All offsets are in units of T.
+    if ( conja )
+    {
+        for ( dim_t j0 = 0; j0 < panel_len; j0 += BS_K, bsa += BS_K, scata += BS_K )
+        {
+            auto lda = *bsa;
+            auto panel_len_j = std::min<dim_t>( panel_len-j0, BS_K );
+            // Nonzero block stride: the block's columns are at scata[0] + j*lda.
+            if ( lda )
+            {
+                T* restrict aj = a_cast + *scata;
+
+                for ( dim_t j = 0; j < panel_len_j; j++ )
+                {
+                    for ( dim_t i = 0; i < panel_dim; i++ )
+                        p_cast[ i ] = kappa_cast * conj( aj[ i*inca + j*lda ] );
+
+                    for ( dim_t i = panel_dim; i < panel_dim_max; i++ )
+                        p_cast[ i ] = convert<T>(0.0);
+
+                    p_cast += ldp;
+                }
+            }
+            else // zero block stride: each column has its own offset scata[j]
+            {
+                for ( dim_t j = 0; j < panel_len_j; j++ )
+                {
+                    for ( dim_t i = 0; i < panel_dim; i++ )
+                        p_cast[ i ] = kappa_cast * conj( a_cast[ i*inca + scata[j] ] );
+
+                    for ( dim_t i = panel_dim; i < panel_dim_max; i++ )
+                        p_cast[ i ] = convert<T>(0.0);
+
+                    p_cast += ldp;
+                }
+            }
+        }
+    }
+    else // same traversal without conjugation
+    {
+        for ( dim_t j0 = 0; j0 < panel_len; j0 += BS_K, bsa += BS_K, scata += BS_K )
+        {
+            auto lda = *bsa;
+            auto panel_len_j = std::min<dim_t>( panel_len-j0, BS_K );
+
+            if ( lda )
+            {
+                T* restrict aj = a_cast + *scata;
+
+                for ( dim_t j = 0; j < panel_len_j; j++ )
+                {
+                    for ( dim_t i = 0; i < panel_dim; i++ )
+                        p_cast[ i ] = kappa_cast * aj[ i*inca + j*lda ];
+
+                    for ( dim_t i = panel_dim; i < panel_dim_max; i++ )
+                        p_cast[ i ] = convert<T>(0.0);
+
+                    p_cast += ldp;
+                }
+            }
+            else
+            {
+                for ( dim_t j = 0; j < panel_len_j; j++ )
+                {
+                    for ( dim_t i = 0; i < panel_dim; i++ )
+                        p_cast[ i ] = kappa_cast * a_cast[ i*inca + scata[j] ];
+
+                    for ( dim_t i = panel_dim; i < panel_dim_max; i++ )
+                        p_cast[ i ] = convert<T>(0.0);
+
+                    p_cast += ldp;
+                }
+            }
+        }
+    }
+    // Zero-fill the remaining columns up to panel_len_max.
+    for ( dim_t j = panel_len; j < panel_len_max; j++ )
+    {
+        for ( dim_t i = 0; i < panel_dim_max; i++ )
+            p_cast[ i ] = convert<T>(0.0);
+
+        p_cast += ldp;
+    }
+}
+
+template <typename T>
+void packm_ckx_ss // pack one micropanel whose source rows AND columns are both located through scatter offsets
+    (
+       bool  conja,          // conjugate each source element with conj() while copying
+       dim_t panel_dim,      // elements copied per packed column
+       dim_t panel_len,      // source columns to pack
+       dim_t panel_dim_max,  // each packed column is zero-filled from panel_dim up to this index
+       dim_t panel_len_max,  // columns panel_len .. panel_len_max-1 are written as zeros
+       void* kappa,          // points to a T; every copied element is multiplied by it
+       void* a, inc_t* inca, inc_t* scata, // source base, per-row offsets (an array here, not a stride), per-column offsets
+       void* p, inc_t ldp    // destination base and distance between packed columns
+    )
+{
+    T* restrict a_cast     = ( T* )a;
+    T* restrict p_cast     = ( T* )p;
+    auto        kappa_cast = *( T* )kappa;
+    // Element (i,j) of the panel is read from a_cast[ inca[i] + scata[j] ]; offsets are in units of T.
+    if ( conja )
+    {
+        for (dim_t j = 0;j < panel_len;j++)
+        {
+            for (dim_t i = 0;i < panel_dim;i++)
+                p_cast[ i ] = kappa_cast * conj( a_cast[ inca[i] + scata[j] ] );
+
+            for (dim_t i = panel_dim;i < panel_dim_max;i++)
+                p_cast[ i ] = convert<T>(0.0);
+
+            p_cast += ldp;
+        }
+    }
+    else
+    {
+        for (dim_t j = 0;j < panel_len;j++)
+        {
+            for (dim_t i = 0;i < panel_dim;i++)
+                p_cast[ i ] = kappa_cast * a_cast[ inca[i] + scata[j] ];
+
+            for (dim_t i = panel_dim;i < panel_dim_max;i++)
+                p_cast[ i ] = convert<T>(0.0);
+
+            p_cast += ldp;
+        }
+    }
+    // Zero-fill the remaining columns up to panel_len_max.
+    for (dim_t j = panel_len;j < panel_len_max;j++)
+    {
+        for (dim_t i = 0;i < panel_dim_max;i++)
+            p_cast[ i ] = convert<T>(0.0);
+
+        p_cast += ldp;
+    }
+}
+
+#undef GENTFUNC
+#define GENTFUNC(ctype,ch,op) \
+static auto PASTEMAC(ch,op) = &packm_ckx_nb<ctype>;
+// One pointer to packm_ckx_nb<ctype> per expanded datatype.
+INSERT_GENTFUNC_BASIC0(packm_ckx_nb);
+
+#undef GENTFUNC
+#define GENTFUNC(ctype,ch,op) \
+static auto PASTEMAC(ch,op) = &packm_ckx_ss<ctype>;
+// One pointer to packm_ckx_ss<ctype> per expanded datatype.
+INSERT_GENTFUNC_BASIC0(packm_ckx_ss);
+// Kernel tables indexed by num_t in packm_tensor(). The kernels' signatures do not mention T, so <void> names the common pointer type.
+static decltype(&packm_ckx_nb<void>) GENARRAY( packm_ckx_nb_ukrs, packm_ckx_nb );
+static decltype(&packm_ckx_ss<void>) GENARRAY( packm_ckx_ss_ukrs, packm_ckx_ss );
+
+static void fill_scatter // build the scatter (offset) and block-stride vectors for `size` consecutive elements of a multi-dimensional index
+            (
+              gint_t                ndim,   // number of entries in len/stride
+              const dim_t* restrict len,    // extent of each dimension; dimension 0 varies fastest
+              const inc_t* restrict stride, // stride of each dimension
+              dim_t                 BS,     // block size for the block-stride vector
+              inc_t                 off,    // linear index of the first element to describe
+              dim_t                 size,   // number of elements to describe
+              inc_t* restrict       scat,   // out: scat[i] = offset of element off+i
+              inc_t* restrict       bs      // out: bs[idx] for idx = 0, BS, 2*BS, ... = common stride of that block, or 0 if irregular
+            )
+{
+    if ( size == 0 ) return;
+
+    if ( ndim == 0 )
+    {
+        *scat = 0;
+        *bs = 0;
+        return;
+    }
+
+    if ( ndim == 1 ) // one dimension: a plain strided range; write exactly `size` entries, starting at `off`
+    {
+        assert(off >= 0 && size >= 0 && off+size <= *len);
+        auto s = *stride;
+        for ( dim_t i = 0; i < size; i++ )
+        {
+            scat[i] = (off+i)*s;
+            bs[i] = s;
+        }
+        return; // previously fell through after looping over all of *len, overrunning scat/bs when size < *len
+    }
+    dim_t tot_len = 1;
+    for ( dim_t i = 0; i < ndim; i++ )
+        tot_len *= len[i];
+
+    assert(off >= 0);
+    assert(size >= 0);
+    assert(off+size <= tot_len);
+    // Split the start index into a position along dimension 0 (off0) and a linear index over the remaining dimensions (off1).
+    auto len0 = len[0];
+    auto stride0 = stride[0];
+    auto off0 = off % len0;
+    auto off1 = off / len0;
+    auto size1 = ( size + off0 + len0 - 1) / len0;
+
+    inc_t pos1 = 0;
+    inc_t idx = 0;
+    for_each( ndim-1, len+1, off1, size1, pos1, stride+1,
+    [&]
+    {
+        auto pos = pos1 + off0 * stride0;
+        auto len_i = std::min( len0-off0, size-idx );
+        for ( dim_t i = 0; i < len_i; i++ )
+        {
+            scat[idx++] = pos;
+            pos += stride0;
+        }
+        off0 = 0; // only the first run along dimension 0 starts mid-way
+    });
+    assert(idx == size);
+    // A block keeps stride0 as its block stride only if all of its consecutive offsets differ by stride0.
+    for ( idx = 0; idx < size; idx += BS )
+    {
+        auto len_i = std::min( BS, size-idx );
+        auto s = stride0;
+
+        for ( auto i = idx; i < idx+len_i-1; i++)
+        {
+            if (scat[i+1]-scat[i] != s)
+            {
+                s = 0;
+                break;
+            }
+        }
+
+        bs[idx] = s;
+    }
+}
+
+void packm_tensor // packing routine for operands described by packm_tensor_params_t: sets up P, builds scatter vectors, packs each micropanel
+     (
+       obj_t*   a,      // source object; its pack params must point to a packm_tensor_params_t
+       obj_t*   p,      // receives the packed-object description and buffer
+       cntx_t*  cntx,
+       rntm_t*  rntm,
+       cntl_t*  cntl,
+       thrinfo_t* thread
+     )
+{
+	// We begin by copying the fields of A.
+	bli_obj_alias_to( a, p );
+
+    // Get information about data types.
+	auto dt        = bli_obj_dt( a );
+	auto dt_tar    = bli_obj_target_dt( a );
+	auto dt_scalar = bli_obj_scalar_dt( a );
+	auto dt_size   = bli_dt_size( dt );
+	// Storage, target and scalar datatypes must all agree; anything else aborts.
+	if ( dt_scalar != dt || dt_tar != dt )
+       bli_abort();
+
+	// Extract various fields from the control tree.
+	auto bmult_id_m   = bli_cntl_packm_params_bmid_m( cntl );
+	auto bmult_id_n   = bli_cntl_packm_params_bmid_n( cntl );
+	auto schema       = bli_cntl_packm_params_pack_schema( cntl );
+	auto bmult_m_def  = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx );
+	auto bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx );
+	auto bmult_n_def  = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx );
+	// Only the plain row-panel and column-panel schemas are handled.
+    if ( schema != BLIS_PACKED_ROW_PANELS &&
+         schema != BLIS_PACKED_COL_PANELS )
+       bli_abort();
+
+	// Store the pack schema to the object.
+	bli_obj_set_pack_schema( schema, p );
+
+	// Clear the conjugation field from the object since matrix packing
+	// in BLIS is deemed to take care of all conjugation necessary.
+	bli_obj_set_conj( BLIS_NO_CONJUGATE, p );
+
+	// If we are packing micropanels, mark P as dense.
+	bli_obj_set_uplo( BLIS_DENSE, p );
+
+	// Reset the view offsets to (0,0).
+	bli_obj_set_offs( 0, 0, p );
+
+	// Compute the dimensions padded by the dimension multiples. These
+	// dimensions will be the dimensions of the packed matrices, including
+	// zero-padding, and will be used by the macro- and micro-kernels.
+	// We compute them by starting with the effective dimensions of A (now
+	// in P) and aligning them to the dimension multiples (typically equal
+	// to register blocksizes). This does waste a little bit of space for
+	// level-2 operations, but that's okay with us.
+	auto m_p     = bli_obj_length( p );
+	auto n_p     = bli_obj_width( p );
+	auto m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def );
+	auto n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def );
+
+	// Save the padded dimensions into the packed object. It is important
+	// to save these dimensions since they represent the actual dimensions
+	// of the zero-padded matrix.
+	bli_obj_set_padded_dims( m_p_pad, n_p_pad, p );
+
+	// The "panel stride" of a micropanel packed object is interpreted as
+	// the distance between the (0,0) element of panel k and the (0,0)
+	// element of panel k+1. We use the padded width computed above to
+	// allow for zero-padding (if necessary/desired) along the far end
+	// of each micropanel (ie: the right edge of the matrix). Zero-padding
+	// can also occur along the long edge of the last micropanel if the m
+	// dimension of the matrix is not a whole multiple of MR.
+	auto ps_p = bmult_m_pack * n_p_pad;
+
+	/* Compute the total number of iterations we'll need. */
+	auto n_iter = m_p_pad / bmult_m_def;
+
+	// Store the strides and panel dimension in P.
+	bli_obj_set_strides( 1, bmult_m_pack, p );
+	bli_obj_set_imag_stride( 1, p );
+	bli_obj_set_panel_dim( bmult_m_def, p );
+	bli_obj_set_panel_stride( ps_p, p );
+	bli_obj_set_panel_length( bmult_m_def, p );
+	bli_obj_set_panel_width( n_p, p );
+
+	// Compute the size of the packed buffer.
+	auto size_p = ps_p * n_iter * dt_size;
+	if ( size_p == 0 ) return;
+
+    // Compute the size of the scatter and block-scatter vectors (one of each
+    // for the rows and for the columns), to be added to the allocation below.
+    // The author's note: no alignment padding is needed because
+    // 1) ps_p is always even, 2) dt_size is a power of two >= 4, and
+    // 3) the alignment of the scatter vectors is at most 8.
+    auto scat_size = 2 * (m_p + n_p) * sizeof(inc_t);
+
+	// Update the buffer address in p to point to the buffer associated
+	// with the mem_t entry acquired from the memory broker (now cached in
+	// the control tree node).
+	auto p_cast = (char*)bli_packm_alloc( size_p + scat_size, rntm, cntl, thread );
+	bli_obj_set_buffer( p_cast, p );
+
+    // Get the addresses of the scatter and block-scatter vectors. These are
+    // placed directly after the packed matrix buffer.
+    auto  rscat          = (inc_t*)(p_cast + size_p);
+    auto  rbs            = rscat + m_p;
+    auto  cscat          = rbs + m_p;
+    auto  cbs            = cscat + n_p;
+
+	auto  a_cast         = (char*)bli_obj_buffer_at_off( a );
+	auto  panel_dim_off  = bli_obj_row_off( a );
+	auto  panel_len_off  = bli_obj_col_off( a );
+	auto  conja          = bli_obj_conj_status( a );
+
+    auto  params         = (packm_tensor_params_t*)bli_obj_pack_params( a );
+    auto  ndim_m         = params->ndim_m;
+    auto  ndim_n         = params->ndim_n;
+    auto  len_m          = params->len_m;
+    auto  len_n          = params->len_n;
+    auto  stride_m       = params->stride_m;
+    auto  stride_n       = params->stride_n;
+
+	obj_t kappa_local;
+	auto  kappa_cast     = (char*)bli_packm_scalar( &kappa_local, p );
+
+	auto  packm_nb_ker   = packm_ckx_nb_ukrs[ dt ];
+	auto  packm_ss_ker   = packm_ckx_ss_ukrs[ dt ];
+    // The scatter vectors are built starting at the view offsets, so move a_cast back by them (presumably undoing bli_obj_buffer_at_off()).
+    a_cast -= ( panel_dim_off * stride_m[0] +
+                panel_len_off * stride_n[0] ) * dt_size;
+
+    /* Fill in the scatter and block-scatter vectors. This is done single-threaded for now. */
+    if ( bli_thread_am_ochief( thread ) )
+    {
+        fill_scatter
+        (
+          ndim_m,
+          len_m,
+          stride_m,
+          bmult_m_def,
+          panel_dim_off,
+          m_p,
+          rscat,
+          rbs
+        );
+
+        fill_scatter
+        (
+          ndim_n,
+          len_n,
+          stride_n,
+          BS_K,
+          panel_len_off,
+          n_p,
+          cscat,
+          cbs
+        );
+    }
+
+    /* Wait for the scatter vectors to be done. */
+    bli_thread_barrier( thread );
+
+	/* Query the number of threads and thread ids from the current thread's
+	   packm thrinfo_t node. */
+	auto nt  = bli_thread_n_way( thread );
+	auto tid = bli_thread_work_id( thread );
+
+	/* Determine the thread range and increment using the current thread's
+	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
+	   will depend on whether slab or round-robin partitioning was requested
+	   at configure-time. */
+	dim_t it_start, it_end, it_inc;
+	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc );
+
+	/* Iterate over every logical micropanel in the source matrix. */
+	for ( auto it  = 0; it < n_iter; it += 1 )
+	if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) )
+	{
+		auto panel_dim_i = bli_min( bmult_m_def, m_p - it*bmult_m_def );
+
+	    auto p_begin     = p_cast + it*ps_p*dt_size;
+        auto inca        = rbs[ it*bmult_m_def ];
+        // Nonzero row block stride: the panel's rows are regularly strided, so only the columns need scatter offsets.
+        if ( inca )
+        {
+	        auto a_begin = a_cast + rscat[ it*bmult_m_def ]*dt_size;
+
+    		packm_nb_ker( conja,
+                          panel_dim_i,
+		                  n_p,
+		                  bmult_m_def,
+		                  n_p_pad,
+		                  kappa_cast,
+		                  a_begin, inca, cbs, cscat,
+		                  p_begin, bmult_m_pack );
+        }
+        else // irregular rows: pass this panel's row offsets and use the fully scattered kernel
+        {
+	        auto a_begin   = a_cast;
+            auto rscat_use = rscat + it*bmult_m_def;
+
+    		packm_ss_ker( conja,
+                          panel_dim_i,
+		                  n_p,
+		                  bmult_m_def,
+		                  n_p_pad,
+		                  kappa_cast,
+		                  a_begin, rscat_use, cscat,
+		                  p_begin, bmult_m_pack );
+        }
+    }
+}
+
+#undef GENTFUNC
+#define GENTFUNC(ctype,ch,op) \
+void PASTEMAC(ch,op) \
+    ( \
+      dim_t m, \
+      dim_t n, \
+      void* x, inc_t rs_x, inc_t cs_x, /* strided m x n source */ \
+      void* b, /* points to the scalar applied to the existing y */ \
+      void* y, inc_t* rs_y, inc_t* cs_y /* destination base with per-row and per-column offset arrays */ \
+    ) \
+{ \
+    ctype* restrict x_cast =  (ctype*)x; \
+    ctype           b_cast = *(ctype*)b; \
+    ctype* restrict y_cast =  (ctype*)y; \
+/* Element (i,j) of y lives at y_cast[ rs_y[i] + cs_y[j] ]. With b == 0, y is overwritten without being read. */ \
+    if ( PASTEMAC(ch,eq0)( b_cast ) ) \
+    { \
+        for ( auto i = 0; i < m; i++ ) \
+        for ( auto j = 0; j < n; j++ ) \
+            PASTEMAC(ch,copys)( x_cast[ i*rs_x + j*cs_x ], y_cast[ rs_y[i] + cs_y[j] ] ); \
+    } \
+    else \
+    { \
+        for ( auto i = 0; i < m; i++ ) \
+        for ( auto j = 0; j < n; j++ ) \
+            PASTEMAC(ch,xpbys)( x_cast[ i*rs_x + j*cs_x ], b_cast, y_cast[ rs_y[i] + cs_y[j] ] ); \
+    } \
+}
+// Instantiate one scatter_mxn function per datatype.
+INSERT_GENTFUNC_BASIC0(scatter_mxn);
+// Table of the instantiations, indexed by num_t in gemm_tensor(); bli_sscatter_mxn only supplies the pointer type.
+static decltype(&bli_sscatter_mxn) GENARRAY(scatter_mxn, scatter_mxn);
+
+void gemm_tensor // macrokernel: multiplies packed A and B micropanels and writes into a C addressed through scatter vectors
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,      // its ker params must point to a gemm_tensor_params_t
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	auto dt       = bli_obj_dt( c );
+    auto dt_size  = bli_dt_size( dt );
+
+	auto m        = bli_obj_length( c );
+	auto n        = bli_obj_width( c );
+	auto k        = bli_obj_width( a );
+
+	auto a_cast   = (char*)bli_obj_buffer_at_off( a );
+	auto pd_a     = bli_obj_panel_dim( a );
+	auto ps_a     = bli_obj_panel_stride( a );
+
+	auto b_cast   = (char*)bli_obj_buffer_at_off( b );
+	auto pd_b     = bli_obj_panel_dim( b );
+	auto ps_b     = bli_obj_panel_stride( b );
+
+	auto c_cast   = (char*)bli_obj_buffer_at_off( c );
+	auto rs_c0    = bli_obj_row_stride( c );
+	auto cs_c0    = bli_obj_col_stride( c );
+	auto off_m    = bli_obj_row_off( c );
+	auto off_n    = bli_obj_col_off( c );
+
+    auto params   = (gemm_tensor_params_t*)bli_obj_ker_params( c );
+    auto ndim_m   = params->ndim_m;
+    auto ndim_n   = params->ndim_n;
+    auto len_m    = params->len_m;
+    auto len_n    = params->len_n;
+    auto stride_m = params->stride_m;
+    auto stride_n = params->stride_n;
+    // If C's matrix strides differ from the leading tensor strides, use the m and n descriptions exchanged (presumably C was transposed upstream - TODO confirm).
+    if ( rs_c0 != stride_m[0] || cs_c0 != stride_n[0] )
+    {
+        std::swap( ndim_m, ndim_n );
+        std::swap( len_m, len_n );
+        std::swap( stride_m, stride_n );
+    }
+
+	/* If any dimension is zero, return immediately. */
+	if ( bli_zero_dim3( m, n, k ) ) return;
+    // The scatter vectors are built starting at the view offsets, so move c_cast back by them (presumably undoing bli_obj_buffer_at_off()).
+    c_cast -= ( off_m * stride_m[0] +
+                off_n * stride_n[0] ) * dt_size;
+
+	// Detach and multiply the scalars attached to A and B.
+	// NOTE: We know that the internal scalars of A and B are already of the
+	// target datatypes because the necessary typecasting would have already
+	// taken place during bli_packm_init().
+	obj_t scalar_a;
+	obj_t scalar_b;
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	// NOTE: We know that scalar_b is of type dt due to the above code
+	// that casts the scalars of A and B to dt via scalar_a and scalar_b,
+	// and we know that the internal scalar in C is already of the type dt
+	// due to the casting in the implementation of bli_obj_scalar_attach().
+	auto alpha_cast = (char*)bli_obj_internal_scalar_buffer( &scalar_b );
+	auto beta_cast  = (char*)bli_obj_internal_scalar_buffer( c );
+
+	/* Alias some constants to simpler names. */
+	auto MR = pd_a;
+	auto NR = pd_b;
+
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */
+	auto gemm_ukr = (gemm_ukr_vft)bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+
+	/* Temporary C buffer for microtiles that cannot be written to C with
+	   regular strides. Its strides follow the microkernel's storage
+	   preference: column-stored (1, MR) if the microkernel prefers
+	   columns, row-stored (NR, 1) otherwise. */
+	char ct[ BLIS_STACK_BUF_MAX_SIZE ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
+	auto col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx );
+	auto rs_ct    = ( col_pref ? 1 : NR );
+	auto cs_ct    = ( col_pref ? MR : 1 );
+	auto zero     = (char*)bli_obj_buffer_for_const( dt, &BLIS_ZERO );
+
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/
+	// One allocation holds four vectors: row scatter, row block-stride, column scatter, column block-stride.
+    auto scat_size = 2 * (m + n) * sizeof(inc_t);
+    auto rscat_c   = (inc_t*)bli_packm_alloc_ex( scat_size, BLIS_BUFFER_FOR_GEN_USE, rntm, cntl, thread );
+    auto rbs_c     = rscat_c + m;
+    auto cscat_c   = rbs_c + m;
+    auto cbs_c     = cscat_c + n;
+
+    /* Fill in the scatter and block-scatter vectors. This is done single-threaded for now. */
+    if ( bli_thread_am_ochief( thread ) )
+    {
+        fill_scatter
+        (
+          ndim_m,
+          len_m,
+          stride_m,
+          MR,
+          off_m,
+          m,
+          rscat_c,
+          rbs_c
+        );
+
+        fill_scatter
+        (
+          ndim_n,
+          len_n,
+          stride_n,
+          NR,
+          off_n,
+          n,
+          cscat_c,
+          cbs_c
+        );
+    }
+
+    /* Wait for the scatter vectors to be done. */
+    bli_thread_barrier( thread );
+
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */
+	auto n_iter = n / NR;
+	auto n_left = n % NR;
+
+	auto m_iter = m / MR;
+	auto m_left = m % MR;
+
+	if ( n_left ) ++n_iter;
+	if ( m_left ) ++m_iter;
+
+	/* Determine some increments used to step through A, B, and C. */
+	auto rstep_a = ps_a * dt_size;
+	auto cstep_b = ps_b * dt_size;
+
+    /* Save the virtual microkernel address and the params. */
+	auxinfo_t aux;
+    bli_auxinfo_set_ukr( (void*)gemm_ukr, &aux );
+    bli_auxinfo_set_params( params, &aux );
+
+	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	   loop around the microkernel. Here we query the thrinfo_t node for the
+	   1st (ir) loop around the microkernel. */
+	auto caucus = bli_thrinfo_sub_node( thread );
+
+	/* Query the number of threads and thread ids for each loop. */
+	auto jr_nt  = bli_thread_n_way( thread );
+	auto jr_tid = bli_thread_work_id( thread );
+	auto ir_nt  = bli_thread_n_way( caucus );
+	auto ir_tid = bli_thread_work_id( caucus );
+
+	/* Determine the thread range and increment for the 2nd and 1st loops.
+	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	   slab or round-robin partitioning was requested at configure-time. */
+	dim_t jr_start, jr_end;
+	dim_t ir_start, ir_end;
+	dim_t jr_inc,   ir_inc;
+	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
+
+	/* Loop over the n dimension (NR columns at a time). */
+	for ( auto j = jr_start; j < jr_end; j += jr_inc )
+	{
+		auto b1    = b_cast + j * cstep_b;
+
+		auto n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		/* Initialize our next panel of B to be the current panel of B. */
+		auto b2 = b1;
+
+		/* Loop over the m dimension (MR rows at a time). */
+		for ( auto i = ir_start; i < ir_end; i += ir_inc )
+		{
+			auto a1       = a_cast  + i * rstep_a;
+            auto rscat_c1 = rscat_c + i * MR;
+            auto rbs_c1   = rbs_c   + i * MR;
+            auto cscat_c1 = cscat_c + j * NR;
+            auto cbs_c1   = cbs_c   + j * NR;
+
+			auto m_cur    = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+			/* Compute the addresses of the next panels of A and B. */
+			auto a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc );
+			if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) )
+			{
+				a2 = a_cast;
+				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc );
+				if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) )
+					b2 = b_cast;
+			}
+
+			/* Save addresses of next panels of A and B to the auxinfo_t
+			   object. */
+			bli_auxinfo_set_next_a( a2, &aux );
+			bli_auxinfo_set_next_b( b2, &aux );
+
+            auto rs_c = *rbs_c1;
+            auto cs_c = *cbs_c1;
+            // fill_scatter() stores 0 as the block stride of a block whose offsets are not regularly strided.
+            if ( rs_c && cs_c )
+            {
+			    auto c11 = c_cast + ( *rscat_c1 + *cscat_c1 ) * dt_size;
+
+    			/* Invoke the gemm micro-kernel. */
+    			gemm_ukr
+    			(
+                  m_cur,
+                  n_cur,
+    			  k,
+    			  alpha_cast,
+    			  a1,
+    			  b1,
+    			  beta_cast,
+    			  c11, rs_c, cs_c,
+    			  &aux,
+    			  cntx
+    			);
+            }
+            else
+            {
+    			/* Compute a full MR x NR tile into ct, with zero as beta. */
+    			gemm_ukr
+    			(
+                  MR,
+                  NR,
+    			  k,
+    			  alpha_cast,
+    			  a1,
+    			  b1,
+    			  zero,
+    			  &ct, rs_ct, cs_ct,
+    			  &aux,
+    			  cntx
+    			);
+
+    			/* Scatter to C. */
+                scatter_mxn[ dt ]
+                (
+                    m_cur, n_cur,
+                    &ct, rs_ct, cs_ct,
+                    beta_cast,
+                    c_cast, rscat_c1, cscat_c1
+                );
+            }
+		}
+	}
+}
+
+static bool has_unit_stride( const std::vector<inc_t>& stride )
+{
+    // True when at least one entry of `stride` equals 1; false for an empty vector.
+    const auto first = stride.begin();
+    const auto last  = stride.end();
+    return std::find( first, last, inc_t( 1 ) ) != last;
+}
+
+// Contracts tensors A and B into C (C := alpha*A*B + beta*C) via bli_gemm_ex().
+// m, n and k hold the extents of the dimension groups whose products form the
+// matrix dimensions; rs_x/cs_x hold the matching per-dimension strides of
+// operand x (by value: they are reordered here). Stride vectors whose size
+// differs from their extent vector are reported via bli_check_error_code().
+void tcontract( num_t dt, const std::vector<dim_t>& m, const std::vector<dim_t>& n, const std::vector<dim_t>& k,
+                const void* alpha, const void* a, std::vector<inc_t> rs_a, std::vector<inc_t> cs_a,
+                                   const void* b, std::vector<inc_t> rs_b, std::vector<inc_t> cs_b,
+                const void*  beta,       void* c, std::vector<inc_t> rs_c, std::vector<inc_t> cs_c )
+{
+    if ( rs_a.size() != m.size() ||
+         rs_b.size() != k.size() ||
+         rs_c.size() != m.size() )
+        bli_check_error_code( BLIS_INVALID_ROW_STRIDE );
+
+    if ( cs_a.size() != k.size() ||
+         cs_b.size() != n.size() ||
+         cs_c.size() != n.size() )
+        bli_check_error_code( BLIS_INVALID_COL_STRIDE );
+
+    dim_t m_mat = 1;
+    dim_t n_mat = 1;
+    dim_t k_mat = 1;
+    for ( auto& i : m ) m_mat *= i;
+    for ( auto& i : n ) n_mat *= i;
+    for ( auto& i : k ) k_mat *= i;
+
+    // Sort each dimension group by ascending stride, keyed on `s0` when it
+    // has a unit stride and on `s1` otherwise. Extents are permuted together
+    // with both stride vectors (on private copies, since m/n/k are const) so
+    // that every extent stays paired with its own dimension's strides.
+    std::vector<dim_t> len_m( m ), len_n( n ), len_k( k );
+    auto sort_group = []( std::vector<dim_t>& len, std::vector<inc_t>& s0, std::vector<inc_t>& s1 )
+    {
+        auto& key = has_unit_stride( s0 ) ? s0 : s1;
+        for ( std::size_t i = 1; i < len.size(); i++ )
+        for ( std::size_t j = 0; j + i < len.size(); j++ )
+        if ( key[j] > key[j+1] )
+        {
+            std::swap( len[j], len[j+1] );
+            std::swap( s0[j], s0[j+1] );
+            std::swap( s1[j], s1[j+1] );
+        }
+    };
+    sort_group( len_m, rs_c, rs_a );
+    sort_group( len_n, cs_c, cs_b );
+    sort_group( len_k, cs_a, rs_b );
+
+    // An empty group still needs one stride entry: element [0] is read below.
+    if ( rs_a.empty() ) rs_a.push_back( 1 );
+    if ( cs_a.empty() ) cs_a.push_back( 1 );
+    if ( rs_b.empty() ) rs_b.push_back( 1 );
+    if ( cs_b.empty() ) cs_b.push_back( 1 );
+    if ( rs_c.empty() ) rs_c.push_back( 1 );
+    if ( cs_c.empty() ) cs_c.push_back( 1 );
+
+    obj_t a_o, b_o, c_o;
+    bli_obj_create_with_attached_buffer( dt, m_mat, k_mat, const_cast<void*>(a), rs_a[0], cs_a[0], &a_o );
+    bli_obj_create_with_attached_buffer( dt, k_mat, n_mat, const_cast<void*>(b), rs_b[0], cs_b[0], &b_o );
+    bli_obj_create_with_attached_buffer( dt, m_mat, n_mat,                   c , rs_c[0], cs_c[0], &c_o );
+
+    packm_tensor_params_t params_a( len_m.size(), len_m.data(), rs_a.data(),
+                                    len_k.size(), len_k.data(), cs_a.data() );
+    packm_tensor_params_t params_b( len_n.size(), len_n.data(), cs_b.data(),
+                                    len_k.size(), len_k.data(), rs_b.data() );
+    gemm_tensor_params_t params_c( len_m.size(), len_m.data(), rs_c.data(),
+                                   len_n.size(), len_n.data(), cs_c.data() );
+
+    bli_obj_set_pack_fn( packm_tensor, &a_o );
+    bli_obj_set_pack_fn( packm_tensor, &b_o );
+    bli_obj_set_ker_fn( gemm_tensor, &c_o );
+    bli_obj_set_pack_params( &params_a, &a_o );
+    bli_obj_set_pack_params( &params_b, &b_o );
+    bli_obj_set_ker_params( &params_c, &c_o );
+
+    obj_t alpha_o, beta_o;
+    bli_obj_create_1x1_with_attached_buffer( dt, const_cast<void*>(alpha), &alpha_o );
+    bli_obj_create_1x1_with_attached_buffer( dt, const_cast<void*>(beta), &beta_o );
+
+    rntm_t rntm;
+    bli_rntm_init_from_global( &rntm );
+    bli_rntm_disable_l3_sup( &rntm );
+
+    bli_gemm_ex( &alpha_o, &a_o, &b_o, &beta_o, &c_o, NULL, &rntm );
+}
+
+// Test driver: for each datatype from BLIS_DT_LO to BLIS_DT_HI, contracts
+// random 4-D tensors of extent N with tcontract() and tcontract_ref(), then
+// prints the norm of the difference divided by sqrt(number of elements of C).
+int main()
+{
+    auto N = 5;
+    gint_t ndim_a = 4, ndim_b = 4, ndim_c = 4;
+
+    std::vector<dim_t> len_a(ndim_a, N);
+    std::vector<dim_t> len_b(ndim_b, N);
+    std::vector<dim_t> len_c(ndim_c, N);
+
+    std::vector<inc_t> stride_a(ndim_a, 1);
+    std::vector<inc_t> stride_b(ndim_b, 1);
+    std::vector<inc_t> stride_c(ndim_c, 1);
+    for ( gint_t i = 1; i < ndim_a; i++ )
+        stride_a[i] = stride_a[i-1] * len_a[i - 1];
+    for ( gint_t i = 1; i < ndim_b; i++ )
+        stride_b[i] = stride_b[i-1] * len_b[i - 1];
+    for ( gint_t i = 1; i < ndim_c; i++ )
+        stride_c[i] = stride_c[i-1] * len_c[i - 1];
+
+    std::vector<int> dim_a(ndim_a);
+    std::vector<int> dim_b(ndim_b);
+    std::vector<int> dim_c(ndim_c);
+    std::iota(dim_a.begin(), dim_a.end(), 0);
+    std::iota(dim_b.begin(), dim_b.end(), 0);
+    std::iota(dim_c.begin(), dim_c.end(), 0);
+
+    // NOTE(review): dim_a/dim_b/dim_c are only permuted and printed; they are
+    // never applied to the extents or strides, so every permutation appears
+    // to rerun the same contraction -- confirm whether that is intended.
+    for ( int dt_ = BLIS_DT_LO; dt_ <= BLIS_DT_HI; dt_++ )
+    do
+    do
+    do
+    {
+        auto dt = ( num_t )dt_;
+        auto ndim_m = (ndim_a + ndim_c - ndim_b)/2;
+        auto ndim_k = (ndim_a + ndim_b - ndim_c)/2;
+
+        std::vector<dim_t> m(len_a.begin(), len_a.begin()+ndim_m);
+        std::vector<dim_t> n(len_b.begin()+ndim_k, len_b.end());
+        std::vector<dim_t> k(len_b.begin(), len_b.begin()+ndim_k);
+
+        std::vector<inc_t> rs_a(stride_a.begin(), stride_a.begin()+ndim_m);
+        std::vector<inc_t> cs_a(stride_a.begin()+ndim_m, stride_a.end());
+        std::vector<inc_t> rs_b(stride_b.begin(), stride_b.begin()+ndim_k);
+        std::vector<inc_t> cs_b(stride_b.begin()+ndim_k, stride_b.end());
+        std::vector<inc_t> rs_c(stride_c.begin(), stride_c.begin()+ndim_m);
+        std::vector<inc_t> cs_c(stride_c.begin()+ndim_m, stride_c.end());
+
+        dim_t m_tot = 1, n_tot = 1, k_tot = 1;
+        for ( auto i : m ) m_tot *= i;
+        for ( auto i : n ) n_tot *= i;
+        for ( auto i : k ) k_tot *= i;
+
+        obj_t a, b, c, c_ref, norm;
+        bli_obj_create( dt, m_tot*k_tot, 1, 1, 1, &a );
+        bli_obj_create( dt, k_tot*n_tot, 1, 1, 1, &b );
+        bli_obj_create( dt, m_tot*n_tot, 1, 1, 1, &c );
+        bli_obj_create( dt, m_tot*n_tot, 1, 1, 1, &c_ref );
+        bli_obj_create_1x1( bli_dt_proj_to_real( dt ), &norm );
+
+        bli_randv( &a );
+        bli_randv( &b );
+        bli_randv( &c );
+        bli_copyv( &c, &c_ref );
+
+        tcontract( dt, m, n, k,
+                   bli_obj_buffer_for_const( dt, &BLIS_ONE ),
+                   bli_obj_buffer( &a ), rs_a, cs_a,
+                   bli_obj_buffer( &b ), rs_b, cs_b,
+                   bli_obj_buffer_for_const( dt, &BLIS_ZERO ),
+                   bli_obj_buffer( &c ), rs_c, cs_c );
+
+        tcontract_ref( dt, m, n, k,
+                       bli_obj_buffer_for_const( dt, &BLIS_ONE ),
+                       bli_obj_buffer( &a ), rs_a, cs_a,
+                       bli_obj_buffer( &b ), rs_b, cs_b,
+                       bli_obj_buffer_for_const( dt, &BLIS_ZERO ),
+                       bli_obj_buffer( &c_ref ), rs_c, cs_c );
+
+        bli_subv( &c_ref, &c );
+        bli_normfv( &c, &norm );
+        double normr, normi;
+        bli_getsc( &norm, &normr, &normi );
+
+        printf("dt: %d, dim_a: [%d,%d,%d,%d], dim_b: [%d,%d,%d,%d], dim_c: [%d,%d,%d,%d], norm: %g\n",
+               dt, dim_a[0], dim_a[1], dim_a[2], dim_a[3],
+                   dim_b[0], dim_b[1], dim_b[2], dim_b[3],
+                   dim_c[0], dim_c[1], dim_c[2], dim_c[3],
+               normr / std::sqrt( bli_obj_vector_dim( &c ) ) );
+
+        // norm is created by bli_obj_create_1x1() every iteration; free it as well.
+        bli_obj_free( &a );
+        bli_obj_free( &b );
+        bli_obj_free( &c );
+        bli_obj_free( &c_ref );
+        bli_obj_free( &norm );
+    }
+    while (std::next_permutation(dim_a.begin(), dim_a.end()));
+    while (std::next_permutation(dim_b.begin(), dim_b.end()));
+    while (std::next_permutation(dim_c.begin(), dim_c.end()));
+}
+
diff --git a/test/tensor_contraction/tcontract_ref.cxx b/test/tensor_contraction/tcontract_ref.cxx
new file mode 100644
index 000000000..b4cd07f90
--- /dev/null
+++ b/test/tensor_contraction/tcontract_ref.cxx
@@ -0,0 +1,67 @@
+#include "tcontract_ref.hpp"
+
+// Reference contraction C := alpha*A*B + beta*C for element type T.
+template <typename T>
+void tcontract_ref( const std::vector<dim_t>& m, const std::vector<dim_t>& n, const std::vector<dim_t>& k,
+                    const void* alpha, const void* a, const std::vector<inc_t>& rs_a, const std::vector<inc_t>& cs_a,
+                                       const void* b, const std::vector<inc_t>& rs_b, const std::vector<inc_t>& cs_b,
+                    const void*  beta,       void* c, const std::vector<inc_t>& rs_c, const std::vector<inc_t>& cs_c )
+{
+    // Typed views of the type-erased arguments. for_each() advances the
+    // element pointers in place; the asserts below check that each one is
+    // back at its starting address after its traversal has finished.
+    auto zero     = convert<T>(0.0);
+    auto scale_ab = *( T* )alpha;
+    auto scale_c  = *( T* )beta;
+    auto pa       = ( T* )a;
+    auto pb       = ( T* )b;
+    auto pc       = ( T* )c;
+
+    // Outer traversal: every m index, stepping A and C by their row strides.
+    for_each(m.size(), m.data(), pa, rs_a.data(), pc, rs_c.data(),
+    [&]
+    {
+        // Middle traversal: every n index, stepping B and C by column strides.
+        for_each(n.size(), n.data(), pb, cs_b.data(), pc, cs_c.data(),
+        [&]
+        {
+            // Inner traversal: accumulate the product sum over every k index.
+            auto acc = zero;
+            for_each(k.size(), k.data(), pa, cs_a.data(), pb, rs_b.data(),
+            [&] { acc += (*pa) * (*pb); });
+
+            // A zero beta overwrites C without reading its previous value.
+            *pc = ( scale_c == zero )
+                ? scale_ab * acc
+                : scale_ab * acc + scale_c * (*pc);
+        });
+
+        assert(pb == b);
+    });
+
+    assert(pa == a);
+    assert(pc == c);
+}
+
+#undef GENTFUNC // drop any earlier GENTFUNC definition before redefining it below
+#define GENTFUNC(ctype,ch,op) \
+static auto PASTEMAC(ch,op) = &tcontract_ref<ctype>;
+
+INSERT_GENTFUNC_BASIC0(tcontract_ref); // presumably expands GENTFUNC once per BLIS datatype -- confirm against blis.h
+
+static decltype(&tcontract_ref<void>) GENARRAY( tcontract_ref_impl, tcontract_ref ); // function-pointer table, indexed by num_t in the dispatcher below
+
+void tcontract_ref( num_t dt, const std::vector<dim_t>& m, const std::vector<dim_t>& n, const std::vector<dim_t>& k,
+                    const void* alpha, const void* a, const std::vector<inc_t>& rs_a, const std::vector<inc_t>& cs_a,
+                                       const void* b, const std::vector<inc_t>& rs_b, const std::vector<inc_t>& cs_b,
+                    const void*  beta,       void* c, const std::vector<inc_t>& rs_c, const std::vector<inc_t>& cs_c )
+{
+    // Pick the entry of tcontract_ref_impl selected by dt and forward every
+    // remaining argument to it unchanged.
+    auto impl = tcontract_ref_impl[ dt ];
+    impl( m, n, k,
+          alpha, a, rs_a, cs_a,
+                 b, rs_b, cs_b,
+           beta, c, rs_c, cs_c );
+}
+
diff --git a/test/tensor_contraction/tcontract_ref.hpp b/test/tensor_contraction/tcontract_ref.hpp
new file mode 100644
index 000000000..99d4380dc
--- /dev/null
+++ b/test/tensor_contraction/tcontract_ref.hpp
@@ -0,0 +1,100 @@
+#include "blis.h"
+#include "complex_math.hpp"
+
+#include <vector>
+#include <array>
+#include <cassert>
+
+inline void increment(inc_t, gint_t) {} // recursion terminator: no (offset, stride) pairs left
+
+template <typename T, typename... Args>
+void increment(inc_t n, gint_t i, T& off, const inc_t* s, Args&... args)
+{
+    off += s[i]*n; // move this operand by n steps along dimension i
+    increment(n, i, args...); // then apply the same step to the remaining (offset, stride) pairs
+}
+
+template <typename Body, typename... Args>
+void for_each_impl(gint_t ndim, const dim_t* n,
+                   dim_t off, dim_t len,
+                   Body& body,
+                   Args&... args)
+{
+    // Odometer-style walk over the index space n[0..ndim), dimension 0
+    // fastest: starting at linear position `off`, call body() `len` times,
+    // keeping each (offset, stride) pair in `args` in step with the index.
+    std::array<dim_t,8> idx = {};
+    assert( ndim <= idx.size() );
+
+    if ( off != 0 )
+        for ( gint_t d = 0; d < ndim; d++ )
+        {
+            idx[d] = off % n[d];
+            off   /= n[d];
+            increment(idx[d], d, args...);
+        }
+
+    for ( dim_t count = len; count > 0; count-- )
+    {
+        body();
+
+        // Advance to the next multi-index, carrying into higher dimensions.
+        for ( gint_t d = 0; d < ndim; d++ )
+        {
+            if ( idx[d] != n[d]-1 )
+            {
+                idx[d]++;
+                increment(1, d, args...);
+                break;
+            }
+            // Dimension d wrapped around: rewind it and keep carrying.
+            increment(-idx[d], d, args...);
+            idx[d] = 0;
+        }
+    }
+}
+
+template <typename T, typename Body>
+void for_each(gint_t ndim, const dim_t* n,
+              dim_t off, dim_t len,
+              T& a, const inc_t* s_a,
+              Body&& body)
+{
+    for_each_impl( ndim, n, off, len, body, a, s_a ); // one operand; caller supplies start position `off` and iteration count `len`
+}
+
+template <typename T, typename Body>
+void for_each(gint_t ndim, const dim_t* n,
+              dim_t off, dim_t len,
+              T& a, const inc_t* s_a,
+              T& b, const inc_t* s_b,
+              Body&& body)
+{
+    for_each_impl( ndim, n, off, len, body, a, s_a, b, s_b ); // two operands stepped together; caller supplies `off` and `len`
+}
+
+template <typename T, typename Body>
+void for_each(gint_t ndim, const dim_t* n,
+              T& a, const inc_t* s_a,
+              Body&& body)
+{
+    dim_t total = 1; // full traversal: iteration count is the product of all extents
+    for ( gint_t d = 0; d < ndim; ++d ) total *= n[d];
+    for_each_impl( ndim, n, 0, total, body, a, s_a );
+}
+
+template <typename T, typename Body>
+void for_each(gint_t ndim, const dim_t* n,
+              T& a, const inc_t* s_a,
+              T& b, const inc_t* s_b,
+              Body&& body)
+{
+    dim_t total = 1; // full traversal of both operands: product of all extents
+    for ( gint_t d = 0; d < ndim; ++d ) total *= n[d];
+    for_each_impl( ndim, n, 0, total, body, a, s_a, b, s_b );
+}
+
+void tcontract_ref( num_t dt, const std::vector<dim_t>& m, const std::vector<dim_t>& n, const std::vector<dim_t>& k,
+                    const void* alpha, const void* a, const std::vector<inc_t>& rs_a, const std::vector<inc_t>& cs_a,
+                                       const void* b, const std::vector<inc_t>& rs_b, const std::vector<inc_t>& cs_b,
+                    const void*  beta,       void* c, const std::vector<inc_t>& rs_c, const std::vector<inc_t>& cs_c ); // datatype-dispatched reference contraction; defined in tcontract_ref.cxx

From 08174a2f6ebbd8ed5aa2bc4edc45da80962f06bb Mon Sep 17 00:00:00 2001
From: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
Date: Sat, 1 Jan 2022 21:35:19 +0900
Subject: [PATCH 015/230] Evict <arm_sve.h> Requirement for SVE GEMM

For compatibility with GCC versions >= 8 and < 10.
---
 config/armsve/bli_cntx_init_armsve.c                        | 1 -
 .../armsve/3/bli_armsve_utils.c                             | 6 +++---
 .../armsve/3/bli_armsve_utils.h                             | 2 +-
 kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c     | 3 +--
 kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c     | 3 +--
 kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c     | 3 +--
 kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c     | 3 +--
 kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c      | 3 +--
 kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c      | 4 +---
 kernels/armsve/bli_kernels_armsve.h                         | 1 +
 10 files changed, 11 insertions(+), 18 deletions(-)
 rename config/armsve/bli_armsve_config_utils.c => kernels/armsve/3/bli_armsve_utils.c (97%)
 rename config/armsve/bli_armsve_config_utils.h => kernels/armsve/3/bli_armsve_utils.h (98%)

diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c
index fafed2229..cd07924a7 100644
--- a/config/armsve/bli_cntx_init_armsve.c
+++ b/config/armsve/bli_cntx_init_armsve.c
@@ -33,7 +33,6 @@
 */
 
 #include "blis.h"
-#include "bli_armsve_config_utils.h"
 
 void bli_cntx_init_armsve( cntx_t* cntx )
 {
diff --git a/config/armsve/bli_armsve_config_utils.c b/kernels/armsve/3/bli_armsve_utils.c
similarity index 97%
rename from config/armsve/bli_armsve_config_utils.c
rename to kernels/armsve/3/bli_armsve_utils.c
index 70501e39d..1e3256d34 100644
--- a/config/armsve/bli_armsve_config_utils.c
+++ b/kernels/armsve/3/bli_armsve_utils.c
@@ -35,7 +35,7 @@
 */
 #include "blis.h"
 
-dim_t bli_vl_bits_armsve(void)
+dim_t bli_vl_bytes_armsve(void)
 { \
     uint64_t vl = 0;
     __asm__ (
@@ -43,7 +43,7 @@ dim_t bli_vl_bits_armsve(void)
       " incb x0        \n\t"
       " mov  %[vl], x0 \n\t"
     : [vl] "=r" (vl)
-    : 
+    :
     : "x0"
      );
     return vl;
@@ -64,7 +64,7 @@ void PASTEMAC(ch, _blksz_armsve) (dim_t *m_r_, dim_t *n_r_, \
     dim_t N_L3 = bli_env_get_var("BLIS_SVE_N_L3", N_L3_SVE_DEFAULT); \
     dim_t C_L3 = bli_env_get_var("BLIS_SVE_C_L3", C_L3_SVE_DEFAULT); \
 \
-    dim_t vl_b = bli_vl_bits_armsve(); \
+    dim_t vl_b = bli_vl_bytes_armsve(); \
     dim_t vl = vl_b / S_Data; \
     dim_t m_r = 2 * vl; \
     dim_t n_r = 10; \
diff --git a/config/armsve/bli_armsve_config_utils.h b/kernels/armsve/3/bli_armsve_utils.h
similarity index 98%
rename from config/armsve/bli_armsve_config_utils.h
rename to kernels/armsve/3/bli_armsve_utils.h
index 87bba73ed..6d3aab05d 100644
--- a/config/armsve/bli_armsve_config_utils.h
+++ b/kernels/armsve/3/bli_armsve_utils.h
@@ -35,7 +35,7 @@
 */
 #include "blis.h"
 
-dim_t bli_vl_bits_armsve(void);
+dim_t bli_vl_bytes_armsve(void);
 
 void bli_s_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
 void bli_d_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
index 913abd1f6..c84a59f07 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
@@ -42,7 +42,6 @@
 // 2vx10 microkernels.
 #include "armsve_asm_2vx10cmplx.h"
 
-#include "arm_sve.h"
 
 void bli_cgemm_armsve_asm_2vx10_unindexed
      (
@@ -69,7 +68,7 @@ void bli_cgemm_armsve_asm_2vx10_unindexed
   uint64_t cs_c   = cs_c0;
   uint64_t info   = 0;
 
-  uint64_t mr = svcntw();
+  uint64_t mr = bli_vl_bytes_armsve() * 2 / 8;
   GEMM_UKR_SETUP_CT( c, mr, 10, false );
 
   __asm__ volatile (
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
index 9730fb8ce..5a662df4e 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
@@ -42,7 +42,6 @@
 // 2vx10 microkernels.
 #include "armsve_asm_2vx10.h"
 
-#include "arm_sve.h"
 
 void bli_dgemm_armsve_asm_2vx10_unindexed
      (
@@ -68,7 +67,7 @@ void bli_dgemm_armsve_asm_2vx10_unindexed
   uint64_t rs_c   = rs_c0;
   uint64_t cs_c   = cs_c0;
 
-  uint64_t mr = 2*svcntd();
+  uint64_t mr = bli_vl_bytes_armsve() * 2 / 8;
   GEMM_UKR_SETUP_CT( d, mr, 10, false );
 
   __asm__ volatile (
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
index 74c4779d7..caa70a5e5 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
@@ -42,7 +42,6 @@
 // 2vx10 microkernels.
 #include "armsve_asm_2vx10.h"
 
-#include "arm_sve.h"
 
 void bli_sgemm_armsve_asm_2vx10_unindexed
      (
@@ -68,7 +67,7 @@ void bli_sgemm_armsve_asm_2vx10_unindexed
   uint64_t rs_c   = rs_c0;
   uint64_t cs_c   = cs_c0;
 
-  uint64_t mr = 2*svcntw();
+  uint64_t mr = bli_vl_bytes_armsve() * 2 / 4;
   GEMM_UKR_SETUP_CT( s, mr, 10, false );
 
   __asm__ volatile (
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
index ee041b3c4..25084af35 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
@@ -42,7 +42,6 @@
 // 2vx10 microkernels.
 #include "armsve_asm_2vx10cmplx.h"
 
-#include "arm_sve.h"
 
 void bli_zgemm_armsve_asm_2vx10_unindexed
      (
@@ -69,7 +68,7 @@ void bli_zgemm_armsve_asm_2vx10_unindexed
   uint64_t cs_c   = cs_c0;
   uint64_t info   = 0;
 
-  uint64_t mr = svcntd();
+  uint64_t mr = bli_vl_bytes_armsve() * 2 / 16;
   GEMM_UKR_SETUP_CT( z, mr, 10, false );
 
   __asm__ volatile (
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c
index 641944ecd..ca62f9db1 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c
@@ -42,7 +42,6 @@
 // 2vx7 microkernels.
 #include "armsve_asm_2vx7cmplx.h"
 
-#include "arm_sve.h"
 
 void bli_zgemm_armsve_asm_2vx7_unindexed
      (
@@ -69,7 +68,7 @@ void bli_zgemm_armsve_asm_2vx7_unindexed
   uint64_t cs_c   = cs_c0;
   uint64_t info   = 0;
 
-  uint64_t mr = svcntd();
+  uint64_t mr = bli_vl_bytes_armsve() * 2 / 16;
   GEMM_UKR_SETUP_CT( z, mr, 7, false );
 
   __asm__ volatile (
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c
index 4272f72c0..4a910baac 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c
@@ -42,8 +42,6 @@
 // 2vx8 microkernels.
 #include "armsve_asm_2vx8cmplx.h"
 
-#include "arm_sve.h"
-
 void bli_zgemm_armsve_asm_2vx8_unindexed
      (
        dim_t               m,
@@ -69,7 +67,7 @@ void bli_zgemm_armsve_asm_2vx8_unindexed
   uint64_t cs_c   = cs_c0;
   uint64_t info   = 0;
 
-  uint64_t mr = svcntd();
+  uint64_t mr = bli_vl_bytes_armsve() * 2 / 16;
   GEMM_UKR_SETUP_CT( z, mr, 8, false );
 
   __asm__ volatile (
diff --git a/kernels/armsve/bli_kernels_armsve.h b/kernels/armsve/bli_kernels_armsve.h
index 0d5c5dc47..408300308 100644
--- a/kernels/armsve/bli_kernels_armsve.h
+++ b/kernels/armsve/bli_kernels_armsve.h
@@ -31,6 +31,7 @@
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 */
+#include "./3/bli_armsve_utils.h"
 
 GEMM_UKR_PROT( double,   d, gemm_armsve256_asm_8x8 )
 GEMM_UKR_PROT( double,   d, gemm_armsve_asm_2vx10_unindexed )

From 466b68a3ad118342dc49a8130b7b02f5e7748521 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Sun, 2 Jan 2022 14:59:41 -0600
Subject: [PATCH 016/230] Add unique tag to branch labels for Apple ARM64.

Add `%=` tag to branch labels, which expands to a unique identifier for each inline assembly block. This prevents duplicate symbol errors on Apple Silicon (#594). Fixes #594. [ci skip] since we can't test Apple Silicon anyways...
---
 kernels/armv8a/3/armv8a_asm_utils.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernels/armv8a/3/armv8a_asm_utils.h b/kernels/armv8a/3/armv8a_asm_utils.h
index 5cb0bad69..6ef6a3fb0 100644
--- a/kernels/armv8a/3/armv8a_asm_utils.h
+++ b/kernels/armv8a/3/armv8a_asm_utils.h
@@ -36,10 +36,10 @@
 
 // Apple's local label requirements.
 #if defined(__APPLE__)
-#define LABEL(str) "   L" #str": \n\t"
-#define BEQ(str) "b.eq L" #str"  \n\t"
-#define BNE(str) "b.ne L" #str"  \n\t"
-#define BRANCH(str) "b L" #str"  \n\t"
+#define LABEL(str) "   L" #str"%=: \n\t"
+#define BEQ(str) "b.eq L" #str"%=  \n\t"
+#define BNE(str) "b.ne L" #str"%=  \n\t"
+#define BRANCH(str) "b L" #str"%=  \n\t"
 #else
 #define LABEL(str) "   ." #str": \n\t"
 #define BEQ(str) "b.eq ." #str"  \n\t"

From 864bfab4486ac910ef9a366e9ade4b45a39747fc Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 4 Jan 2022 15:10:34 -0600
Subject: [PATCH 017/230] CREDITS file update.

---
 CREDITS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CREDITS b/CREDITS
index 81fc9bec5..7dd452daa 100644
--- a/CREDITS
+++ b/CREDITS
@@ -70,6 +70,7 @@ but many others have contributed code and feedback, including
   Stepan Nassyr            @stepannassyr       (Jülich Supercomputing Centre)
   Nisanth Padinharepatt                        (AMD)
   Ajay Panyala             @ajaypanyala
+  Marc-Antoine Parent      @maparent           (Conversence)
   Devangi Parikh           @dnparikh           (The University of Texas at Austin)
   Elmar Peise              @elmar-peise        (RWTH-Aachen)
   Clément Pernet           @ClementPernet

From 3f2440b0226d5e23a43d12105d74aa917cd6c610 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 6 Jan 2022 14:57:36 -0600
Subject: [PATCH 018/230] Added m, n dims to gemmd/gemmlike ukernel calls.

Details:
- Updated the gemmd addon and the gemmlike sandbox code to use the new
  microkernel calling sequence, which now includes m and n dimensions so
  that the microkernel has all the information necessary to handle edge
  cases. Thanks to Jeff Diamond for catching this, which ideally would
  have been included in commit 54fa28b.
- Retired var2 of both gemmd and gemmlike to 'attic' directories and
  removed their corresponding prototypes. In both cases, var2 was a
  variant of the block-panel algorithm where edge-case handling was
  abstracted away to a microkernel wrapper. (Since this is now the
  official behavior of BLIS microkernels, I saw no need to have it
  included as a separate code path.)
- Comment updates.
---
 addon/gemmd/{ => attic}/bao_gemmd_bp_var2.c   |  0
 addon/gemmd/bao_gemmd.c                       | 20 +-----
 addon/gemmd/bao_gemmd_bp_var1.c               | 67 ++++---------------
 addon/gemmd/bao_gemmd_var.h                   |  7 --
 .../gemmlike/{ => attic}/bls_gemm_bp_var2.c   |  0
 sandbox/gemmlike/bls_gemm.c                   | 19 +-----
 sandbox/gemmlike/bls_gemm_bp_var1.c           | 67 ++++---------------
 sandbox/gemmlike/bls_gemm_var.h               |  7 --
 8 files changed, 30 insertions(+), 157 deletions(-)
 rename addon/gemmd/{ => attic}/bao_gemmd_bp_var2.c (100%)
 rename sandbox/gemmlike/{ => attic}/bls_gemm_bp_var2.c (100%)

diff --git a/addon/gemmd/bao_gemmd_bp_var2.c b/addon/gemmd/attic/bao_gemmd_bp_var2.c
similarity index 100%
rename from addon/gemmd/bao_gemmd_bp_var2.c
rename to addon/gemmd/attic/bao_gemmd_bp_var2.c
diff --git a/addon/gemmd/bao_gemmd.c b/addon/gemmd/bao_gemmd.c
index 71d49806b..fadc52691 100644
--- a/addon/gemmd/bao_gemmd.c
+++ b/addon/gemmd/bao_gemmd.c
@@ -197,9 +197,7 @@ void bao_gemmd_int
 	// In this function, we choose the gemmd implementation that is executed
 	// on each thread.
 
-#if 1
-	// Call the block-panel algorithm that calls the kernel directly, which
-	// exposes edge-case handling.
+	// Call the block-panel algorithm.
 	bao_gemmd_bp_var1
 	(
 	  alpha,
@@ -212,22 +210,6 @@ void bao_gemmd_int
 	  rntm,
 	  thread
 	);
-#else
-	// Call the block-panel algorithm that calls the kernel indirectly via a
-	// wrapper function, which hides edge-case handling.
-	bao_gemmd_bp_var2
-	(
-	  alpha,
-	  a,
-	  d,
-	  b,
-	  beta,
-	  c,
-	  cntx,
-	  rntm,
-	  thread
-	);
-#endif
 }
 
 //
diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/gemmd/bao_gemmd_bp_var1.c
index e042f1fd8..09e4df09e 100644
--- a/addon/gemmd/bao_gemmd_bp_var1.c
+++ b/addon/gemmd/bao_gemmd_bp_var1.c
@@ -164,17 +164,6 @@ void PASTECH2(bao_,ch,varname) \
 	   function pointer type. */ \
 	PASTECH(ch,gemm_ukr_ft) \
                gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                / sizeof( ctype ) ] \
-	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t rs_ct   = ( col_pref ? 1 : NR ); \
-	const inc_t cs_ct   = ( col_pref ? MR : 1 ); \
 \
 	/* Compute partitioning step values for each matrix of each loop. */ \
 	const inc_t jcstep_c = cs_c; \
@@ -203,7 +192,6 @@ void PASTECH2(bao_,ch,varname) \
 	ctype           alpha_local = *alpha_cast; \
 	ctype           beta_local  = *beta_cast; \
 	ctype           one_local   = *PASTEMAC(ch,1); \
-	ctype           zero_local  = *PASTEMAC(ch,0); \
 \
 	auxinfo_t       aux; \
 \
@@ -449,47 +437,20 @@ void PASTECH2(bao_,ch,varname) \
 						bli_auxinfo_set_next_a( a2, &aux ); \
 						bli_auxinfo_set_next_b( b2, &aux ); \
 \
-						/* Handle interior and edge cases separately. */ \
-						if ( mr_cur == MR && nr_cur == NR ) \
-						{ \
-							/* Invoke the gemm microkernel. */ \
-							gemm_ukr \
-							( \
-							  kc_cur, \
-							  &alpha_local, \
-							  a_ir, \
-							  b_jr, \
-							  beta_use, \
-							  c_ir, rs_c, cs_c, \
-							  &aux, \
-							  cntx  \
-							); \
-						} \
-						else \
-						{ \
-							/* Invoke the gemm microkernel. */ \
-							gemm_ukr \
-							( \
-							  kc_cur, \
-							  &alpha_local, \
-							  a_ir, \
-							  b_jr, \
-							  &zero_local, \
-							  ct, rs_ct, cs_ct, \
-							  &aux, \
-							  cntx  \
-							); \
-\
-							/* Scale the bottom edge of C and add the result from above. */ \
-							PASTEMAC(ch,xpbys_mxn) \
-							( \
-							  mr_cur, \
-							  nr_cur, \
-							  ct,   rs_ct, cs_ct, \
-							  beta_use, \
-							  c_ir, rs_c,  cs_c \
-							); \
-						} \
+						/* Invoke the gemm microkernel. */ \
+						gemm_ukr \
+						( \
+						  mr_cur, \
+						  nr_cur, \
+						  kc_cur, \
+						  &alpha_local, \
+						  a_ir, \
+						  b_jr, \
+						  beta_use, \
+						  c_ir, rs_c, cs_c, \
+						  &aux, \
+						  cntx  \
+						); \
 					} \
 				} \
 			} \
diff --git a/addon/gemmd/bao_gemmd_var.h b/addon/gemmd/bao_gemmd_var.h
index 5c6674727..05ec45e07 100644
--- a/addon/gemmd/bao_gemmd_var.h
+++ b/addon/gemmd/bao_gemmd_var.h
@@ -54,7 +54,6 @@ void PASTECH(bao_,opname) \
      );
 
 GENPROT( gemmd_bp_var1 )
-GENPROT( gemmd_bp_var2 )
 
 
 //
@@ -88,12 +87,6 @@ GENTPROT( double,   d, gemmd_bp_var1 )
 GENTPROT( scomplex, c, gemmd_bp_var1 )
 GENTPROT( dcomplex, z, gemmd_bp_var1 )
 
-//INSERT_GENTPROT_BASIC0( gemmd_bp_var2 )
-GENTPROT( float,    s, gemmd_bp_var2 )
-GENTPROT( double,   d, gemmd_bp_var2 )
-GENTPROT( scomplex, c, gemmd_bp_var2 )
-GENTPROT( dcomplex, z, gemmd_bp_var2 )
-
 
 //
 // Prototype the typed kernel interfaces.
diff --git a/sandbox/gemmlike/bls_gemm_bp_var2.c b/sandbox/gemmlike/attic/bls_gemm_bp_var2.c
similarity index 100%
rename from sandbox/gemmlike/bls_gemm_bp_var2.c
rename to sandbox/gemmlike/attic/bls_gemm_bp_var2.c
diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c
index 0b15f2197..f2f8b7e25 100644
--- a/sandbox/gemmlike/bls_gemm.c
+++ b/sandbox/gemmlike/bls_gemm.c
@@ -192,9 +192,7 @@ void bls_gemm_int
 	// In this function, we choose the gemm implementation that is executed
 	// on each thread.
 
-#if 1
-	// Call the block-panel algorithm that calls the kernel directly, which
-	// exposes edge-case handling.
+	// Call the block-panel algorithm.
 	bls_gemm_bp_var1
 	(
 	  alpha,
@@ -206,21 +204,6 @@ void bls_gemm_int
 	  rntm,
 	  thread
 	);
-#else
-	// Call the block-panel algorithm that calls the kernel indirectly via a
-	// wrapper function, which hides edge-case handling.
-	bls_gemm_bp_var2
-	(
-	  alpha,
-	  a,
-	  b,
-	  beta,
-	  c,
-	  cntx,
-	  rntm,
-	  thread
-	);
-#endif
 }
 
 //
diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c
index ae695ce34..62dc462d5 100644
--- a/sandbox/gemmlike/bls_gemm_bp_var1.c
+++ b/sandbox/gemmlike/bls_gemm_bp_var1.c
@@ -157,17 +157,6 @@ void PASTECH2(bls_,ch,varname) \
 	   function pointer type. */ \
 	PASTECH(ch,gemm_ukr_ft) \
                gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                / sizeof( ctype ) ] \
-	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t rs_ct   = ( col_pref ? 1 : NR ); \
-	const inc_t cs_ct   = ( col_pref ? MR : 1 ); \
 \
 	/* Compute partitioning step values for each matrix of each loop. */ \
 	const inc_t jcstep_c = cs_c; \
@@ -194,7 +183,6 @@ void PASTECH2(bls_,ch,varname) \
 	ctype           alpha_local = *alpha_cast; \
 	ctype           beta_local  = *beta_cast; \
 	ctype           one_local   = *PASTEMAC(ch,1); \
-	ctype           zero_local  = *PASTEMAC(ch,0); \
 \
 	auxinfo_t       aux; \
 \
@@ -437,47 +425,20 @@ void PASTECH2(bls_,ch,varname) \
 						bli_auxinfo_set_next_a( a2, &aux ); \
 						bli_auxinfo_set_next_b( b2, &aux ); \
 \
-						/* Handle interior and edge cases separately. */ \
-						if ( mr_cur == MR && nr_cur == NR ) \
-						{ \
-							/* Invoke the gemm microkernel. */ \
-							gemm_ukr \
-							( \
-							  kc_cur, \
-							  &alpha_local, \
-							  a_ir, \
-							  b_jr, \
-							  beta_use, \
-							  c_ir, rs_c, cs_c, \
-							  &aux, \
-							  cntx  \
-							); \
-						} \
-						else \
-						{ \
-							/* Invoke the gemm microkernel. */ \
-							gemm_ukr \
-							( \
-							  kc_cur, \
-							  &alpha_local, \
-							  a_ir, \
-							  b_jr, \
-							  &zero_local, \
-							  ct, rs_ct, cs_ct, \
-							  &aux, \
-							  cntx  \
-							); \
-\
-							/* Scale the bottom edge of C and add the result from above. */ \
-							PASTEMAC(ch,xpbys_mxn) \
-							( \
-							  mr_cur, \
-							  nr_cur, \
-							  ct,   rs_ct, cs_ct, \
-							  beta_use, \
-							  c_ir, rs_c,  cs_c \
-							); \
-						} \
+						/* Invoke the gemm microkernel. */ \
+						gemm_ukr \
+						( \
+						  mr_cur, \
+						  nr_cur, \
+						  kc_cur, \
+						  &alpha_local, \
+						  a_ir, \
+						  b_jr, \
+						  beta_use, \
+						  c_ir, rs_c, cs_c, \
+						  &aux, \
+						  cntx  \
+						); \
 					} \
 				} \
 			} \
diff --git a/sandbox/gemmlike/bls_gemm_var.h b/sandbox/gemmlike/bls_gemm_var.h
index 025b54a06..7c515f8c3 100644
--- a/sandbox/gemmlike/bls_gemm_var.h
+++ b/sandbox/gemmlike/bls_gemm_var.h
@@ -53,7 +53,6 @@ void PASTECH(bls_,opname) \
      );
 
 GENPROT( gemm_bp_var1 )
-GENPROT( gemm_bp_var2 )
 
 
 //
@@ -86,12 +85,6 @@ GENTPROT( double,   d, gemm_bp_var1 )
 GENTPROT( scomplex, c, gemm_bp_var1 )
 GENTPROT( dcomplex, z, gemm_bp_var1 )
 
-//INSERT_GENTPROT_BASIC0( gemm_bp_var2 )
-GENTPROT( float,    s, gemm_bp_var2 )
-GENTPROT( double,   d, gemm_bp_var2 )
-GENTPROT( scomplex, c, gemm_bp_var2 )
-GENTPROT( dcomplex, z, gemm_bp_var2 )
-
 
 //
 // Prototype the typed kernel interfaces.

From 268ce1f29a717d18304713ecc25a2eafe41838c7 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Mon, 10 Jan 2022 10:17:17 -0600
Subject: [PATCH 019/230] Relax alignment constraints

Remove alignment of temporary AB buffer in edge case handling macros unless alignment is specifically requested (e.g. Core2, SDB/IVB). Fixes #595.
---
 frame/include/bli_edge_case_macro_defs.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/frame/include/bli_edge_case_macro_defs.h b/frame/include/bli_edge_case_macro_defs.h
index 242045a02..4a1fba7ac 100644
--- a/frame/include/bli_edge_case_macro_defs.h
+++ b/frame/include/bli_edge_case_macro_defs.h
@@ -38,14 +38,14 @@
 
 // Helper macros for edge-case handling within gemm microkernels.
 
-#define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major) \
+#define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \
 \
 	PASTEMAC(ch,ctype)* restrict _beta   = beta; \
 	PASTEMAC(ch,ctype)* restrict _c      = c; \
 	const inc_t                  _rs_c   = rs_c; \
 	const inc_t                  _cs_c   = cs_c; \
 	PASTEMAC(ch,ctype)           _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \
-	                                  __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	                                  __attribute__((aligned(alignment))); \
 	const inc_t                  _rs_ct  = row_major ? nr :  1; \
 	const inc_t                  _cs_ct  = row_major ?  1 : mr;
 
@@ -64,27 +64,27 @@
 
 #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \
 \
-	GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major); \
+	GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \
 	const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \
 	                     m != mr || n != nr; \
 	GEMM_UKR_SETUP_CT_POST(ch);
 
 #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \
 \
-	GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major); \
+	GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \
 	const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \
 	                     m != mr || n != nr; \
 	GEMM_UKR_SETUP_CT_POST(ch);
 
 #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \
 \
-	GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major); \
+	GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \
 	const bool _use_ct = m != mr || n != nr; \
 	GEMM_UKR_SETUP_CT_POST(ch);
 
 #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \
 \
-	GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major); \
+	GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \
 	const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \
 	                     m != mr || n != nr || \
 	                     ( (uintptr_t)_c % alignment ) || \

From 81f93be0561c705ae6823d19e40849facc40bef7 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Mon, 10 Jan 2022 10:19:47 -0600
Subject: [PATCH 020/230] Fix row-/column-major pref. in 16x6 haswell sgemm ukr
 (unused)

---
 kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
index a3a8b0b09..dd9526d56 100644
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
@@ -100,7 +100,7 @@ void bli_sgemm_haswell_asm_16x6
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
-	GEMM_UKR_SETUP_CT( s, 16, 6, true );
+	GEMM_UKR_SETUP_CT( s, 16, 6, false );
 
 	begin_asm()
 

From 0ab20c0e72402ba0b17fe2c3ed3e16bf2ace0fd3 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jehammond@nvidia.com>
Date: Thu, 13 Jan 2022 07:29:56 -0800
Subject: [PATCH 021/230] the Apple local label thing is required by Clang in
 general

@egaudry and I both saw this issue on Linux with Clang 10.

```
Compiling obj/thunderx2/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.o ('thunderx2' CFLAGS for kernels)
kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c:171:49: fatal error: invalid symbol redefinition
        "                                            \n\t"
                                                       ^
<inline asm>:90:5: note: instantiated into assembly here
           .SLOOPKITER:
           ^
1 error generated.
```

Signed-off-by: Jeff Hammond <jehammond@nvidia.com>
---
 kernels/armv8a/3/armv8a_asm_utils.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernels/armv8a/3/armv8a_asm_utils.h b/kernels/armv8a/3/armv8a_asm_utils.h
index 6ef6a3fb0..465950999 100644
--- a/kernels/armv8a/3/armv8a_asm_utils.h
+++ b/kernels/armv8a/3/armv8a_asm_utils.h
@@ -35,7 +35,7 @@
 */
 
 // Apple's local label requirements.
-#if defined(__APPLE__)
+#if defined(__APPLE__) || defined(__clang__)
 #define LABEL(str) "   L" #str"%=: \n\t"
 #define BEQ(str) "b.eq L" #str"%=  \n\t"
 #define BNE(str) "b.ne L" #str"%=  \n\t"

From 0be9282cdccf73342d8571d3f7971a9b0af72363 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 26 Jan 2022 17:46:24 -0600
Subject: [PATCH 022/230] Updated zen3 macro constant names.

Details:
- In config/zen3/bli_family_zen3.h, renamed:
    BLIS_SMALL_MATRIX_A_THRES_M_SYRK -> _M_GEMMT
    BLIS_SMALL_MATRIX_A_THRES_N_SYRK -> _N_GEMMT
  Thanks to Jeff Diamond for helping spot the stale _SYRK naming.
---
 config/zen3/bli_family_zen3.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/zen3/bli_family_zen3.h b/config/zen3/bli_family_zen3.h
index 918e919ae..8487a7277 100644
--- a/config/zen3/bli_family_zen3.h
+++ b/config/zen3/bli_family_zen3.h
@@ -63,8 +63,8 @@
 
 #define BLIS_SMALL_MATRIX_THRES_TRSM   32768 //128(128+128) => m*(m+n)
 #define BLIS_SMALL_MATRIX_A_THRES_TRSM  128
-#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK    96
-#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK    128
+#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT    96
+#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT    128
 
 #define BLIS_ENABLE_SMALL_MATRIX_ROME
 #define BLIS_SMALL_MATRIX_THRES_ROME       400

From 35195bb5cea5d99eb3eaf41e3815137d14ceb52d Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Mon, 31 Jan 2022 10:29:50 -0600
Subject: [PATCH 023/230] Add armclang detection to configure.

armclang is treated as regular clang. Fixes #606. [ci skip]
---
 configure | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/configure b/configure
index 1abe7610e..95a97c6b1 100755
--- a/configure
+++ b/configure
@@ -1464,17 +1464,31 @@ get_compiler_version()
 		cc_vendor="aocc"
 	fi
 
+	# Detect armclang, which doesn't have a nice, unambiguous, one-word tag
+	armclang_grep=$(echo "${vendor_string}" | grep 'Arm C/C++/Fortran Compiler')
+	if [ -n "${armclang_grep}" ]; then
+		cc_vendor="armclang"
+	fi
+
 	# Begin parsing cc_vendor for the version string.
 
 	if [ "${cc_vendor}" = "crosstool-NG" ]; then
 	     # Treat compilers built by crosstool-NG (for eg: conda) as gcc.
-	     cc_vendor="gcc"
+		cc_vendor="gcc"
 	fi
 	if [ "${cc_vendor}" = "icc" -o \
 	     "${cc_vendor}" = "gcc" ]; then
 
 		cc_version=$(${cc} -dumpversion)
 
+	elif [ "${cc_vendor}" = "armclang" ]; then
+
+		# Treat armclang as regular clang.
+		cc_vendor="clang"
+		cc_version=$(echo "${vendor_string}" \
+		             | egrep -o 'based on LLVM [0-9]+\.[0-9]+\.?[0-9]*' \
+		             | egrep -o               '[0-9]+\.[0-9]+\.?[0-9]*')
+
 	elif [ "${cc_vendor}" = "clang" ]; then
 
 		cc_version=$(echo "${vendor_string}" \

From b5df1811f1bc8212b2cda6bb97b79819afe236a8 Mon Sep 17 00:00:00 2001
From: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
Date: Thu, 3 Feb 2022 02:31:29 +0900
Subject: [PATCH 024/230] Armv8a, ArmSVE: Simplify Gen-C

---
 .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c  |  74 +-
 .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c  |  68 +-
 .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c  |  62 +-
 .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c  |  74 +-
 .../armsve/3/{ => old}/armsve_asm_2vx7cmplx.h |   0
 .../armsve/3/{ => old}/armsve_asm_2vx8cmplx.h |   0
 .../3/{ => old}/armsve_asm_macros_half.h      |   0
 .../bli_gemm_armsve_asm_z2vx7_unindexed.c     |   0
 .../bli_gemm_armsve_asm_z2vx8_unindexed.c     |   0
 kernels/armsve/bli_kernels_armsve.h           |   6 +-
 kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c   | 659 +-----------------
 11 files changed, 154 insertions(+), 789 deletions(-)
 rename kernels/armsve/3/{ => old}/armsve_asm_2vx7cmplx.h (100%)
 rename kernels/armsve/3/{ => old}/armsve_asm_2vx8cmplx.h (100%)
 rename kernels/armsve/3/{ => old}/armsve_asm_macros_half.h (100%)
 rename kernels/armsve/3/{ => old}/bli_gemm_armsve_asm_z2vx7_unindexed.c (100%)
 rename kernels/armsve/3/{ => old}/bli_gemm_armsve_asm_z2vx8_unindexed.c (100%)

diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
index c84a59f07..60a64515f 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
@@ -118,8 +118,8 @@ void bli_cgemm_armsve_asm_2vx10_unindexed
 GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 "                                                 \n\t"
 " CCOL_PRFM:                                      \n\t"
-" cmp             %3, #1                          \n\t"
-" b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
+// " cmp             %3, #1                          \n\t"
+// " b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
 " mov             x16, %2                         \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, %4                    \n\t"
@@ -140,7 +140,7 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, %4                    \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
-" END_CCOL_PRFM:                                  \n\t"
+// " END_CCOL_PRFM:                                  \n\t"
 "                                                 \n\t"
 CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
 "                                                 \n\t"
@@ -233,8 +233,8 @@ MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19)
 " WRITE_MEM_EXEC:                                 \n\t"
 " mov             x9, %2                          \n\t" // C address for loading.
 "                                                 \n\t" // C address for storing is %2 itself.
-" cmp             %3, #1                          \n\t"
-" b.ne            WRITE_MEM_G                     \n\t"
+// " cmp             %3, #1                          \n\t"
+// " b.ne            WRITE_MEM_G                     \n\t"
 "                                                 \n\t"
 " WRITE_MEM_C:                                    \n\t"
 " fmov            s29, wzr                        \n\t"
@@ -260,38 +260,38 @@ GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31)
 GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4)
 GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4)
 GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4)
-" b               END_WRITE_MEM                   \n\t"
-"                                                 \n\t"
-" WRITE_MEM_G:                                    \n\t"
-" add             %3, %3, %3                      \n\t" // Skips passed to index is multiplied by 2,
-" mov             x3, %3                          \n\t" //  s.t. 2*sizeof(float) = 2*4 = 8.
-" index           z28.s, wzr, w3                  \n\t"
-" fmov            s29, wzr                        \n\t"
-" fcmp            s31, #0.0                       \n\t" // Whether Imag(beta) == 0.
-" fccmp           s30, s29, 0, eq                 \n\t" // Whether Real(beta) == 0.
-" b.eq            ZERO_BETA_G_0_1_2_3             \n\t"
-GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
-GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
-GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31)
-GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31)
-" ZERO_BETA_G_0_1_2_3:                            \n\t"
-GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16)
-GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16)
-"                                                 \n\t"
-" b.eq            ZERO_BETA_G_4_5_6_7_8_9         \n\t"
-GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
-GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
-GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16)
-GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31)
-GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31)
-GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31)
-" ZERO_BETA_G_4_5_6_7_8_9:                        \n\t"
-GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16)
-GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16)
-GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16)
-"                                                 \n\t"
-" END_WRITE_MEM:                                  \n\t"
-" b               END_EXEC                        \n\t"
+// " b               END_WRITE_MEM                   \n\t"
+// "                                                 \n\t"
+// " WRITE_MEM_G:                                    \n\t"
+// " add             %3, %3, %3                      \n\t" // Skips passed to index is multiplied by 2,
+// " mov             x3, %3                          \n\t" //  s.t. 2*sizeof(float) = 2*4 = 8.
+// " index           z28.s, wzr, w3                  \n\t"
+// " fmov            s29, wzr                        \n\t"
+// " fcmp            s31, #0.0                       \n\t" // Whether Imag(beta) == 0.
+// " fccmp           s30, s29, 0, eq                 \n\t" // Whether Real(beta) == 0.
+// " b.eq            ZERO_BETA_G_0_1_2_3             \n\t"
+// GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
+// GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
+// GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31)
+// GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31)
+// " ZERO_BETA_G_0_1_2_3:                            \n\t"
+// GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16)
+// GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16)
+// "                                                 \n\t"
+// " b.eq            ZERO_BETA_G_4_5_6_7_8_9         \n\t"
+// GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
+// GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
+// GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16)
+// GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31)
+// GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31)
+// GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31)
+// " ZERO_BETA_G_4_5_6_7_8_9:                        \n\t"
+// GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16)
+// GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16)
+// GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16)
+// "                                                 \n\t"
+// " END_WRITE_MEM:                                  \n\t"
+// " b               END_EXEC                        \n\t"
 "                                                 \n\t"
 " END_EXEC:                                       \n\t"
 " mov             %11, #0                         \n\t" // Return normal.
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
index 5a662df4e..7136104b5 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
@@ -78,7 +78,7 @@ void bli_dgemm_armsve_asm_2vx10_unindexed
 " mov             x3, #10                         \n\t" // Row-skip of B.
 "                                                 \n\t"
 " ldr             x5, %[c]                        \n\t"
-" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
+// " ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
 " ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
 #ifdef _A64FX
 " mov             x8, 0x3                         \n\t" // Tag C address.
@@ -117,8 +117,8 @@ void bli_dgemm_armsve_asm_2vx10_unindexed
 GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
 "                                                 \n\t"
 " CCOL_PRFM:                                      \n\t"
-" cmp             x6, #1                          \n\t"
-" b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
+// " cmp             x6, #1                          \n\t"
+// " b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
 " mov             x16, x5                         \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, x7                    \n\t"
@@ -139,7 +139,7 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, x7                    \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
-" END_CCOL_PRFM:                                  \n\t"
+// " END_CCOL_PRFM:                                  \n\t"
 "                                                 \n\t"
 CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
 "                                                 \n\t"
@@ -253,8 +253,8 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
 "                                                 \n\t"
 " mov             x9, x5                          \n\t" // C address for loading.
 "                                                 \n\t" // C address for storing is x5 itself.
-" cmp             x6, #1                          \n\t" // Preload first half of C for contiguous case.
-" b.ne            WRITE_MEM                       \n\t"
+// " cmp             x6, #1                          \n\t" // Preload first half of C for contiguous case.
+// " b.ne            WRITE_MEM                       \n\t"
 GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
 "                                                 \n\t"
 " WRITE_MEM:                                      \n\t"
@@ -265,8 +265,8 @@ GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
 SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
 "                                                 \n\t"
 " UNIT_ALPHA:                                     \n\t"
-" cmp             x6, #1                          \n\t"
-" b.ne            WRITE_MEM_G                     \n\t"
+// " cmp             x6, #1                          \n\t"
+// " b.ne            WRITE_MEM_G                     \n\t"
 "                                                 \n\t"
 " WRITE_MEM_C:                                    \n\t" // Available scratch: Z[20-30].
 "                                                 \n\t" // Here used scratch: Z[20-29].
@@ -281,32 +281,32 @@ GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,
 " BETA_ZERO_C:                                    \n\t"
 GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
 GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7)
-" b               END_WRITE_MEM                   \n\t"
-"                                                 \n\t"
-" WRITE_MEM_G:                                    \n\t" // Available scratch: Z[20-30].
-"                                                 \n\t" // Here used scratch: Z[20-30] - Z30 as index.
-" mov             x8, xzr                         \n\t"
-" incb            x8                              \n\t"
-" madd            x8, x8, x6, xzr                 \n\t" // C-column's logical 1-vector skip.
-" index           z30.d, xzr, x6                  \n\t" // Skips passed to index is not multiplied by 8.
-"                                                 \n\t"
-" fcmp            d31, #0.0                       \n\t" // Skip loading if *beta == 0 to override NaN.
-" b.eq            BETA_ZERO_G                     \n\t"
-"                                                 \n\t"
-GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
-GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
-GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
-GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
-"                                                 \n\t"
-" BETA_ZERO_G:                                    \n\t"
-GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
-GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16)
-"                                                 \n\t"
-" END_WRITE_MEM:                                  \n\t"
-" b               END_EXEC                        \n\t"
-"                                                 \n\t"
-" END_ERROR:                                      \n\t"
-" mov             x0, #1                          \n\t" // Return error.
+// " b               END_WRITE_MEM                   \n\t"
+// "                                                 \n\t"
+// " WRITE_MEM_G:                                    \n\t" // Available scratch: Z[20-30].
+// "                                                 \n\t" // Here used scratch: Z[20-30] - Z30 as index.
+// " mov             x8, xzr                         \n\t"
+// " incb            x8                              \n\t"
+// " madd            x8, x8, x6, xzr                 \n\t" // C-column's logical 1-vector skip.
+// " index           z30.d, xzr, x6                  \n\t" // Skips passed to index is not multiplied by 8.
+// "                                                 \n\t"
+// " fcmp            d31, #0.0                       \n\t" // Skip loading if *beta == 0 to override NaN.
+// " b.eq            BETA_ZERO_G                     \n\t"
+// "                                                 \n\t"
+// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
+// GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
+// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
+// GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
+// "                                                 \n\t"
+// " BETA_ZERO_G:                                    \n\t"
+// GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
+// GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16)
+// "                                                 \n\t"
+// " END_WRITE_MEM:                                  \n\t"
+// " b               END_EXEC                        \n\t"
+// "                                                 \n\t"
+// " END_ERROR:                                      \n\t"
+// " mov             x0, #1                          \n\t" // Return error.
 " END_EXEC:                                       \n\t"
 " mov             x0, #0                          \n\t" // Return normal.
 :
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
index caa70a5e5..20841891b 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
@@ -78,7 +78,7 @@ void bli_sgemm_armsve_asm_2vx10_unindexed
 " mov             x3, #10                         \n\t" // Row-skip of B.
 "                                                 \n\t"
 " ldr             x5, %[c]                        \n\t"
-" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
+// " ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
 " ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
 #ifdef _A64FX
 " mov             x8, 0x3                         \n\t" // Tag C address.
@@ -117,8 +117,8 @@ void bli_sgemm_armsve_asm_2vx10_unindexed
 GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
 "                                                 \n\t"
 " CCOL_PRFM:                                      \n\t"
-" cmp             x6, #1                          \n\t"
-" b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
+// " cmp             x6, #1                          \n\t"
+// " b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
 " mov             x16, x5                         \n\t"
 " prfm            PLDL1STRM, [x16]                \n\t"
 " add             x16, x16, x7                    \n\t"
@@ -139,7 +139,7 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
 " prfm            PLDL1STRM, [x16]                \n\t"
 " add             x16, x16, x7                    \n\t"
 " prfm            PLDL1STRM, [x16]                \n\t"
-" END_CCOL_PRFM:                                  \n\t"
+// " END_CCOL_PRFM:                                  \n\t"
 "                                                 \n\t"
 CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
 "                                                 \n\t"
@@ -253,8 +253,8 @@ SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1
 " UNIT_ALPHA:                                     \n\t"
 " mov             x9, x5                          \n\t" // C address for loading.
 "                                                 \n\t" // C address for storing is x5 itself.
-" cmp             x6, #1                          \n\t"
-" b.ne            WRITE_MEM_G                     \n\t"
+// " cmp             x6, #1                          \n\t"
+// " b.ne            WRITE_MEM_G                     \n\t"
 "                                                 \n\t"
 " WRITE_MEM_C:                                    \n\t" // Available scratch: Z[20-30].
 "                                                 \n\t" // Here used scratch: Z[20-29].
@@ -268,31 +268,31 @@ GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,
 " BETA_ZERO_C:                                    \n\t"
 GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
 GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7)
-" b               END_WRITE_MEM                   \n\t"
-"                                                 \n\t"
-" WRITE_MEM_G:                                    \n\t" // Available scratch: Z[20-30].
-"                                                 \n\t" // Here used scratch: Z[20-30] - Z30 as index.
-" mov             x8, xzr                         \n\t"
-" incb            x8                              \n\t"
-" madd            x8, x8, x6, xzr                 \n\t" // C-column's logical 1-vector skip.
-" index           z30.s, wzr, w6                  \n\t" // Skips passed to index is not multiplied by 8.
-"                                                 \n\t"
-" fcmp            s31, #0.0                       \n\t"
-" b.eq            BETA_ZERO_G                     \n\t"
-GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
-GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
-GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
-GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
-"                                                 \n\t"
-" BETA_ZERO_G:                                    \n\t"
-GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
-GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16)
-"                                                 \n\t"
-" END_WRITE_MEM:                                  \n\t"
-" b               END_EXEC                        \n\t"
-"                                                 \n\t"
-" END_ERROR:                                      \n\t"
-" mov             x0, #1                          \n\t" // Return error.
+// " b               END_WRITE_MEM                   \n\t"
+// "                                                 \n\t"
+// " WRITE_MEM_G:                                    \n\t" // Available scratch: Z[20-30].
+// "                                                 \n\t" // Here used scratch: Z[20-30] - Z30 as index.
+// " mov             x8, xzr                         \n\t"
+// " incb            x8                              \n\t"
+// " madd            x8, x8, x6, xzr                 \n\t" // C-column's logical 1-vector skip.
+// " index           z30.s, wzr, w6                  \n\t" // Skips passed to index is not multiplied by 8.
+// "                                                 \n\t"
+// " fcmp            s31, #0.0                       \n\t"
+// " b.eq            BETA_ZERO_G                     \n\t"
+// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
+// GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
+// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
+// GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
+// "                                                 \n\t"
+// " BETA_ZERO_G:                                    \n\t"
+// GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
+// GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16)
+// "                                                 \n\t"
+// " END_WRITE_MEM:                                  \n\t"
+// " b               END_EXEC                        \n\t"
+// "                                                 \n\t"
+// " END_ERROR:                                      \n\t"
+// " mov             x0, #1                          \n\t" // Return error.
 " END_EXEC:                                       \n\t"
 " mov             x0, #0                          \n\t" // Return normal.
 :
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
index 25084af35..7e630894f 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
@@ -118,8 +118,8 @@ void bli_zgemm_armsve_asm_2vx10_unindexed
 GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 "                                                 \n\t"
 " CCOL_PRFM:                                      \n\t"
-" cmp             %3, #1                          \n\t"
-" b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
+// " cmp             %3, #1                          \n\t"
+// " b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
 " mov             x16, %2                         \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, %4                    \n\t"
@@ -140,7 +140,7 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, %4                    \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
-" END_CCOL_PRFM:                                  \n\t"
+// " END_CCOL_PRFM:                                  \n\t"
 "                                                 \n\t"
 CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
 "                                                 \n\t"
@@ -233,8 +233,8 @@ MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19)
 " WRITE_MEM_EXEC:                                 \n\t"
 " mov             x9, %2                          \n\t" // C address for loading.
 "                                                 \n\t" // C address for storing is %2 itself.
-" cmp             %3, #1                          \n\t"
-" b.ne            WRITE_MEM_G                     \n\t"
+// " cmp             %3, #1                          \n\t"
+// " b.ne            WRITE_MEM_G                     \n\t"
 "                                                 \n\t"
 " WRITE_MEM_C:                                    \n\t"
 " fmov            d29, xzr                        \n\t"
@@ -260,38 +260,38 @@ GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31)
 GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4)
 GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4)
 GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4)
-" b               END_WRITE_MEM                   \n\t"
-"                                                 \n\t"
-" WRITE_MEM_G:                                    \n\t"
-" add             %3, %3, %3                      \n\t" // Skips passed to index is multiplied by 2,
-" index           z28.d, xzr, %3                  \n\t" //  s.t. 2*sizeof(double) = 2*8 = 16.
-" fmov            d29, xzr                        \n\t"
-" fcmp            d31, #0.0                       \n\t" // Whether Imag(beta) == 0.
-" fccmp           d30, d29, 0, eq                 \n\t" // Whether Real(beta) == 0.
-" b.eq            ZERO_BETA_G_0_1_2_3             \n\t"
-GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
-GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
-GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31)
-GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31)
-" ZERO_BETA_G_0_1_2_3:                            \n\t"
-GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16)
-GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16)
-"                                                 \n\t"
-" b.eq            ZERO_BETA_G_4_5_6_7_8_9         \n\t"
-GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
-GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
-GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16)
-GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31)
-GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31)
-GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31)
-" ZERO_BETA_G_4_5_6_7_8_9:                        \n\t"
-GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16)
-GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16)
-GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16)
-"                                                 \n\t"
-" END_WRITE_MEM:                                  \n\t"
-" b               END_EXEC                        \n\t"
-"                                                 \n\t"
+// " b               END_WRITE_MEM                   \n\t"
+// "                                                 \n\t"
+// " WRITE_MEM_G:                                    \n\t"
+// " add             %3, %3, %3                      \n\t" // Skips passed to index is multiplied by 2,
+// " index           z28.d, xzr, %3                  \n\t" //  s.t. 2*sizeof(double) = 2*8 = 16.
+// " fmov            d29, xzr                        \n\t"
+// " fcmp            d31, #0.0                       \n\t" // Whether Imag(beta) == 0.
+// " fccmp           d30, d29, 0, eq                 \n\t" // Whether Real(beta) == 0.
+// " b.eq            ZERO_BETA_G_0_1_2_3             \n\t"
+// GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
+// GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
+// GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31)
+// GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31)
+// " ZERO_BETA_G_0_1_2_3:                            \n\t"
+// GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16)
+// GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16)
+// "                                                 \n\t"
+// " b.eq            ZERO_BETA_G_4_5_6_7_8_9         \n\t"
+// GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
+// GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
+// GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16)
+// GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31)
+// GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31)
+// GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31)
+// " ZERO_BETA_G_4_5_6_7_8_9:                        \n\t"
+// GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16)
+// GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16)
+// GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16)
+// "                                                 \n\t"
+// " END_WRITE_MEM:                                  \n\t"
+// " b               END_EXEC                        \n\t"
+// "                                                 \n\t"
 " END_EXEC:                                       \n\t"
 " mov             %11, #0                         \n\t" // Return normal.
 : "+r" (a),      // %0
diff --git a/kernels/armsve/3/armsve_asm_2vx7cmplx.h b/kernels/armsve/3/old/armsve_asm_2vx7cmplx.h
similarity index 100%
rename from kernels/armsve/3/armsve_asm_2vx7cmplx.h
rename to kernels/armsve/3/old/armsve_asm_2vx7cmplx.h
diff --git a/kernels/armsve/3/armsve_asm_2vx8cmplx.h b/kernels/armsve/3/old/armsve_asm_2vx8cmplx.h
similarity index 100%
rename from kernels/armsve/3/armsve_asm_2vx8cmplx.h
rename to kernels/armsve/3/old/armsve_asm_2vx8cmplx.h
diff --git a/kernels/armsve/3/armsve_asm_macros_half.h b/kernels/armsve/3/old/armsve_asm_macros_half.h
similarity index 100%
rename from kernels/armsve/3/armsve_asm_macros_half.h
rename to kernels/armsve/3/old/armsve_asm_macros_half.h
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c b/kernels/armsve/3/old/bli_gemm_armsve_asm_z2vx7_unindexed.c
similarity index 100%
rename from kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c
rename to kernels/armsve/3/old/bli_gemm_armsve_asm_z2vx7_unindexed.c
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c b/kernels/armsve/3/old/bli_gemm_armsve_asm_z2vx8_unindexed.c
similarity index 100%
rename from kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c
rename to kernels/armsve/3/old/bli_gemm_armsve_asm_z2vx8_unindexed.c
diff --git a/kernels/armsve/bli_kernels_armsve.h b/kernels/armsve/bli_kernels_armsve.h
index 408300308..39daf30c6 100644
--- a/kernels/armsve/bli_kernels_armsve.h
+++ b/kernels/armsve/bli_kernels_armsve.h
@@ -33,13 +33,13 @@
 */
 #include "./3/bli_armsve_utils.h"
 
-GEMM_UKR_PROT( double,   d, gemm_armsve256_asm_8x8 )
+// GEMM_UKR_PROT( double,   d, gemm_armsve256_asm_8x8 )
 GEMM_UKR_PROT( double,   d, gemm_armsve_asm_2vx10_unindexed )
 GEMM_UKR_PROT( float,    s, gemm_armsve_asm_2vx10_unindexed )
 GEMM_UKR_PROT( scomplex, c, gemm_armsve_asm_2vx10_unindexed )
 GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx10_unindexed )
-GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx8_unindexed )
-GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx7_unindexed )
+// GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx8_unindexed )
+// GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx7_unindexed )
 //GEMMSUP_KER_PROT( double,   d, gemmsup_rv_armsve_2vx10_unindexed )
 //GEMMSUP_KER_PROT( double,   d, gemmsup_cv_armsve_2vx10_unindexed )
 //GEMMSUP_KER_PROT( double,   d, gemmsup_rv_armsve_10x2v_unindexed )
diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
index 7b420f202..4d9a88817 100644
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
@@ -92,8 +92,8 @@ void bli_sgemm_armv8a_asm_8x12
 	" ldr x10,%[cs_c]                            \n\t" // Load cs_c.
 	" lsl x10,x10,#2                             \n\t" // cs_c * sizeof(float) -- AUX.
 	"                                            \n\t"
-	" ldr x14,%[rs_c]                            \n\t" // Load rs_c.
-	" lsl x14,x14,#2                             \n\t" // rs_c * sizeof(float).
+	// " ldr x14,%[rs_c]                            \n\t" // Load rs_c.
+	// " lsl x14,x14,#2                             \n\t" // rs_c * sizeof(float).
 	"                                            \n\t"
 	" add x16,x2,x10                             \n\t" //Load address Column 1 of C
 	" add x17,x16,x10                            \n\t" //Load address Column 2 of C
@@ -509,9 +509,6 @@ void bli_sgemm_armv8a_asm_8x12
 	" ldr x0,%[a_next]                           \n\t" // Pointer to next block of A.
 	" ldr x1,%[b_next]                           \n\t" // Pointer to next pointer of B.
 	"                                            \n\t"
-	" cmp x14,#4                                 \n\t" // If rs_c != 1 (column-major)
-	BNE(SGENSTORED)
-	"                                            \n\t"
 	LABEL(SCOLSTORED)                                  // C is column-major.
 	"                                            \n\t"
 	" dup  v0.4s, wzr                            \n\t"
@@ -678,384 +675,8 @@ void bli_sgemm_armv8a_asm_8x12
 	" str q13, [x27, #16]                        \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	BRANCH(SEND)                                       // Done.
-	"                                            \n\t"
-	"                                            \n\t"
-	LABEL(SGENSTORED)                                  // C is general-stride stored.
-	"                                            \n\t"
-	"                                            \n\t"
-	" dup  v0.4s, wzr                            \n\t"
-	" dup  v1.4s, wzr                            \n\t"
-	" dup  v2.4s, wzr                            \n\t"
-	" dup  v3.4s, wzr                            \n\t"
-	" dup  v4.4s, wzr                            \n\t"
-	" dup  v5.4s, wzr                            \n\t"
-	"                                            \n\t"
-	" fcmp s7,#0.0                               \n\t"
-	BEQ(SBETAZEROGENSTOREDS1)                          // Taking care of the beta==0 case.
-	"                                            \n\t"
-	" mov x5, x2                                 \n\t"
-	"                                            \n\t"
-	" ld1 {v0.s}[0],[x5],x14                     \n\t" // Load c00  into quad and increment by rs_c.
-	" ld1 {v0.s}[1],[x5],x14                     \n\t" // Load c01  into quad and increment by rs_c.
-	" ld1 {v0.s}[2],[x5],x14                     \n\t" // Load c02  into quad and increment by rs_c.
-	" ld1 {v0.s}[3],[x5],x14                     \n\t" // Load c03  into quad and increment by rs_c.
-	" ld1 {v1.s}[0],[x5],x14                     \n\t" // Load c04  into quad and increment by rs_c.
-	" ld1 {v1.s}[1],[x5],x14                     \n\t" // Load c05  into quad and increment by rs_c.
-	" ld1 {v1.s}[2],[x5],x14                     \n\t" // Load c06  into quad and increment by rs_c.
-	" ld1 {v1.s}[3],[x5],x14                     \n\t" // Load c07  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x5, x16                                \n\t"
-	"                                            \n\t"
-	" ld1 {v2.s}[0],[x5],x14                     \n\t" // Load c10  into quad and increment by rs_c.
-	" ld1 {v2.s}[1],[x5],x14                     \n\t" // Load c11  into quad and increment by rs_c.
-	" ld1 {v2.s}[2],[x5],x14                     \n\t" // Load c12  into quad and increment by rs_c.
-	" ld1 {v2.s}[3],[x5],x14                     \n\t" // Load c13  into quad and increment by rs_c.
-	" ld1 {v3.s}[0],[x5],x14                     \n\t" // Load c14  into quad and increment by rs_c.
-	" ld1 {v3.s}[1],[x5],x14                     \n\t" // Load c15  into quad and increment by rs_c.
-	" ld1 {v3.s}[2],[x5],x14                     \n\t" // Load c16  into quad and increment by rs_c.
-	" ld1 {v3.s}[3],[x5],x14                     \n\t" // Load c17  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x5, x17                                \n\t"
-	"                                            \n\t"
-	" ld1 {v4.s}[0],[x5],x14                     \n\t" // Load c20  into quad and increment by rs_c.
-	" ld1 {v4.s}[1],[x5],x14                     \n\t" // Load c21  into quad and increment by rs_c.
-	" ld1 {v4.s}[2],[x5],x14                     \n\t" // Load c22  into quad and increment by rs_c.
-	" ld1 {v4.s}[3],[x5],x14                     \n\t" // Load c23  into quad and increment by rs_c.
-	" ld1 {v5.s}[0],[x5],x14                     \n\t" // Load c24  into quad and increment by rs_c.
-	" ld1 {v5.s}[1],[x5],x14                     \n\t" // Load c25  into quad and increment by rs_c.
-	" ld1 {v5.s}[2],[x5],x14                     \n\t" // Load c26  into quad and increment by rs_c.
-	" ld1 {v5.s}[3],[x5],x14                     \n\t" // Load c27  into quad and increment by rs_c.
-	"                                            \n\t"
-	" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
-	" fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
-	" fmul v2.4s,v2.4s,v7.s[0]                   \n\t" // Scale by beta
-	" fmul v3.4s,v3.4s,v7.s[0]                   \n\t" // Scale by beta
-	" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
-	" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
-	"                                            \n\t"
-	LABEL(SBETAZEROGENSTOREDS1)
-	"                                            \n\t"
-	" fmla v0.4s, v8.4s,v6.s[0]                  \n\t" // Scale by alpha
-	" fmla v1.4s, v9.4s,v6.s[0]                  \n\t" // Scale by alpha
-	" fmla v2.4s,v10.4s,v6.s[0]                  \n\t" // Scale by alpha
-	" fmla v3.4s,v11.4s,v6.s[0]                  \n\t" // Scale by alpha
-	" fmla v4.4s,v12.4s,v6.s[0]                  \n\t" // Scale by alpha
-	" fmla v5.4s,v13.4s,v6.s[0]                  \n\t" // Scale by alpha
-	"                                            \n\t"
-	" mov x5, x2                                 \n\t"
-	"                                            \n\t"
-	" st1 {v0.s}[0],[x5],x14                     \n\t" // Store c00  into quad and increment by rs_c.
-	" st1 {v0.s}[1],[x5],x14                     \n\t" // Store c01  into quad and increment by rs_c.
-	" st1 {v0.s}[2],[x5],x14                     \n\t" // Store c02  into quad and increment by rs_c.
-	" st1 {v0.s}[3],[x5],x14                     \n\t" // Store c03  into quad and increment by rs_c.
-	" st1 {v1.s}[0],[x5],x14                     \n\t" // Store c04  into quad and increment by rs_c.
-	" st1 {v1.s}[1],[x5],x14                     \n\t" // Store c05  into quad and increment by rs_c.
-	" st1 {v1.s}[2],[x5],x14                     \n\t" // Store c06  into quad and increment by rs_c.
-	" st1 {v1.s}[3],[x5],x14                     \n\t" // Store c07  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x5, x16                                \n\t"
-	"                                            \n\t"
-	" st1 {v2.s}[0],[x5],x14                     \n\t" // Store c10  into quad and increment by rs_c.
-	" st1 {v2.s}[1],[x5],x14                     \n\t" // Store c11  into quad and increment by rs_c.
-	" st1 {v2.s}[2],[x5],x14                     \n\t" // Store c12  into quad and increment by rs_c.
-	" st1 {v2.s}[3],[x5],x14                     \n\t" // Store c13  into quad and increment by rs_c.
-	" st1 {v3.s}[0],[x5],x14                     \n\t" // Store c14  into quad and increment by rs_c.
-	" st1 {v3.s}[1],[x5],x14                     \n\t" // Store c15  into quad and increment by rs_c.
-	" st1 {v3.s}[2],[x5],x14                     \n\t" // Store c16  into quad and increment by rs_c.
-	" st1 {v3.s}[3],[x5],x14                     \n\t" // Store c17  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x5, x17                                \n\t"
-	"                                            \n\t"
-	" st1 {v4.s}[0],[x5],x14                     \n\t" // Store c20  into quad and increment by rs_c.
-	" st1 {v4.s}[1],[x5],x14                     \n\t" // Store c21  into quad and increment by rs_c.
-	" st1 {v4.s}[2],[x5],x14                     \n\t" // Store c22  into quad and increment by rs_c.
-	" st1 {v4.s}[3],[x5],x14                     \n\t" // Store c23  into quad and increment by rs_c.
-	" st1 {v5.s}[0],[x5],x14                     \n\t" // Store c24  into quad and increment by rs_c.
-	" st1 {v5.s}[1],[x5],x14                     \n\t" // Store c25  into quad and increment by rs_c.
-	" st1 {v5.s}[2],[x5],x14                     \n\t" // Store c26  into quad and increment by rs_c.
-	" st1 {v5.s}[3],[x5],x14                     \n\t" // Store c27  into quad and increment by rs_c.
-	"                                            \n\t"
-	" dup  v8.4s, wzr                            \n\t"
-	" dup  v9.4s, wzr                            \n\t"
-	" dup  v10.4s, wzr                           \n\t"
-	" dup  v11.4s, wzr                           \n\t"
-	" dup  v12.4s, wzr                           \n\t"
-	" dup  v13.4s, wzr                           \n\t"
-	"                                            \n\t"
-	" fcmp s7,#0.0                               \n\t"
-	BEQ(SBETAZEROGENSTOREDS2)                          // Taking care of the beta==0 case.
-	"                                            \n\t"
-	" mov x5, x19                                \n\t"
-	"                                            \n\t"
-	" ld1 {v8.s}[0],[x5],x14                     \n\t" // Load c30  into quad and increment by rs_c.
-	" ld1 {v8.s}[1],[x5],x14                     \n\t" // Load c31  into quad and increment by rs_c.
-	" ld1 {v8.s}[2],[x5],x14                     \n\t" // Load c32  into quad and increment by rs_c.
-	" ld1 {v8.s}[3],[x5],x14                     \n\t" // Load c33  into quad and increment by rs_c.
-	" ld1 {v9.s}[0],[x5],x14                     \n\t" // Load c34  into quad and increment by rs_c.
-	" ld1 {v9.s}[1],[x5],x14                     \n\t" // Load c35  into quad and increment by rs_c.
-	" ld1 {v9.s}[2],[x5],x14                     \n\t" // Load c36  into quad and increment by rs_c.
-	" ld1 {v9.s}[3],[x5],x14                     \n\t" // Load c37  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x5, x20                                \n\t"
-	"                                            \n\t"
-	" ld1 {v10.s}[0],[x5],x14                    \n\t" // Load c40  into quad and increment by rs_c.
-	" ld1 {v10.s}[1],[x5],x14                    \n\t" // Load c41  into quad and increment by rs_c.
-	" ld1 {v10.s}[2],[x5],x14                    \n\t" // Load c42  into quad and increment by rs_c.
-	" ld1 {v10.s}[3],[x5],x14                    \n\t" // Load c43  into quad and increment by rs_c.
-	" ld1 {v11.s}[0],[x5],x14                    \n\t" // Load c44  into quad and increment by rs_c.
-	" ld1 {v11.s}[1],[x5],x14                    \n\t" // Load c45  into quad and increment by rs_c.
-	" ld1 {v11.s}[2],[x5],x14                    \n\t" // Load c46  into quad and increment by rs_c.
-	" ld1 {v11.s}[3],[x5],x14                    \n\t" // Load c47  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x5, x21                                \n\t"
-	"                                            \n\t"
-	" ld1 {v12.s}[0],[x5],x14                    \n\t" // Load c50  into quad and increment by rs_c.
-	" ld1 {v12.s}[1],[x5],x14                    \n\t" // Load c51  into quad and increment by rs_c.
-	" ld1 {v12.s}[2],[x5],x14                    \n\t" // Load c52  into quad and increment by rs_c.
-	" ld1 {v12.s}[3],[x5],x14                    \n\t" // Load c53  into quad and increment by rs_c.
-	" ld1 {v13.s}[0],[x5],x14                    \n\t" // Load c54  into quad and increment by rs_c.
-	" ld1 {v13.s}[1],[x5],x14                    \n\t" // Load c55  into quad and increment by rs_c.
-	" ld1 {v13.s}[2],[x5],x14                    \n\t" // Load c56  into quad and increment by rs_c.
-	" ld1 {v13.s}[3],[x5],x14                    \n\t" // Load c57  into quad and increment by rs_c.
-	"                                            \n\t"
-	" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
-	" fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
-	" fmul v10.4s,v10.4s,v7.s[0]                 \n\t" // Scale by beta
-	" fmul v11.4s,v11.4s,v7.s[0]                 \n\t" // Scale by beta
-	" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
-	" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
-	"                                            \n\t"
-	LABEL(SBETAZEROGENSTOREDS2)
-	"                                            \n\t"
-	" fmla v8.4s, v14.4s,v6.s[0]                 \n\t" // Scale by alpha
-	" fmla v9.4s, v15.4s,v6.s[0]                 \n\t" // Scale by alpha
-	" fmla v10.4s,v16.4s,v6.s[0]                 \n\t" // Scale by alpha
-	" fmla v11.4s,v17.4s,v6.s[0]                 \n\t" // Scale by alpha
-	" fmla v12.4s,v18.4s,v6.s[0]                 \n\t" // Scale by alpha
-	" fmla v13.4s,v19.4s,v6.s[0]                 \n\t" // Scale by alpha
-	"                                            \n\t"
-	" mov x5, x19                                \n\t"
-	"                                            \n\t"
-	" st1 {v8.s}[0],[x5],x14                     \n\t" // Store c30  into quad and increment by rs_c.
-	" st1 {v8.s}[1],[x5],x14                     \n\t" // Store c31  into quad and increment by rs_c.
-	" st1 {v8.s}[2],[x5],x14                     \n\t" // Store c32  into quad and increment by rs_c.
-	" st1 {v8.s}[3],[x5],x14                     \n\t" // Store c33  into quad and increment by rs_c.
-	" st1 {v9.s}[0],[x5],x14                     \n\t" // Store c34  into quad and increment by rs_c.
-	" st1 {v9.s}[1],[x5],x14                     \n\t" // Store c35  into quad and increment by rs_c.
-	" st1 {v9.s}[2],[x5],x14                     \n\t" // Store c36  into quad and increment by rs_c.
-	" st1 {v9.s}[3],[x5],x14                     \n\t" // Store c37  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x5, x20                                \n\t"
-	"                                            \n\t"
-	" st1 {v10.s}[0],[x5],x14                    \n\t" // Store c40  into quad and increment by rs_c.
-	" st1 {v10.s}[1],[x5],x14                    \n\t" // Store c41  into quad and increment by rs_c.
-	" st1 {v10.s}[2],[x5],x14                    \n\t" // Store c42  into quad and increment by rs_c.
-	" st1 {v10.s}[3],[x5],x14                    \n\t" // Store c43  into quad and increment by rs_c.
-	" st1 {v11.s}[0],[x5],x14                    \n\t" // Store c44  into quad and increment by rs_c.
-	" st1 {v11.s}[1],[x5],x14                    \n\t" // Store c45  into quad and increment by rs_c.
-	" st1 {v11.s}[2],[x5],x14                    \n\t" // Store c46  into quad and increment by rs_c.
-	" st1 {v11.s}[3],[x5],x14                    \n\t" // Store c47  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x5, x21                                \n\t"
-	"                                            \n\t"
-	" st1 {v12.s}[0],[x5],x14                    \n\t" // Store c50  into quad and increment by rs_c.
-	" st1 {v12.s}[1],[x5],x14                    \n\t" // Store c51  into quad and increment by rs_c.
-	" st1 {v12.s}[2],[x5],x14                    \n\t" // Store c52  into quad and increment by rs_c.
-	" st1 {v12.s}[3],[x5],x14                    \n\t" // Store c53  into quad and increment by rs_c.
-	" st1 {v13.s}[0],[x5],x14                    \n\t" // Store c54  into quad and increment by rs_c.
-	" st1 {v13.s}[1],[x5],x14                    \n\t" // Store c55  into quad and increment by rs_c.
-	" st1 {v13.s}[2],[x5],x14                    \n\t" // Store c56  into quad and increment by rs_c.
-	" st1 {v13.s}[3],[x5],x14                    \n\t" // Store c57  into quad and increment by rs_c.
-	"                                            \n\t"
-	" dup  v0.4s, wzr                            \n\t"
-	" dup  v1.4s, wzr                            \n\t"
-	" dup  v2.4s, wzr                            \n\t"
-	" dup  v3.4s, wzr                            \n\t"
-	" dup  v4.4s, wzr                            \n\t"
-	" dup  v5.4s, wzr                            \n\t"
-	"                                            \n\t"
-	" fcmp s7,#0.0                               \n\t"
-	BEQ(SBETAZEROGENSTOREDS3)                          // Taking care of the beta==0 case.
-	"                                            \n\t"
-	" mov x5, x22                                \n\t"
-	"                                            \n\t"
-	" ld1 {v0.s}[0],[x5],x14                     \n\t" // Load c60  into quad and increment by rs_c.
-	" ld1 {v0.s}[1],[x5],x14                     \n\t" // Load c61  into quad and increment by rs_c.
-	" ld1 {v0.s}[2],[x5],x14                     \n\t" // Load c62  into quad and increment by rs_c.
-	" ld1 {v0.s}[3],[x5],x14                     \n\t" // Load c63  into quad and increment by rs_c.
-	" ld1 {v1.s}[0],[x5],x14                     \n\t" // Load c64  into quad and increment by rs_c.
-	" ld1 {v1.s}[1],[x5],x14                     \n\t" // Load c65  into quad and increment by rs_c.
-	" ld1 {v1.s}[2],[x5],x14                     \n\t" // Load c66  into quad and increment by rs_c.
-	" ld1 {v1.s}[3],[x5],x14                     \n\t" // Load c67  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x5, x23                                \n\t"
-	"                                            \n\t"
-	" ld1 {v2.s}[0],[x5],x14                     \n\t" // Load c70  into quad and increment by rs_c.
-	" ld1 {v2.s}[1],[x5],x14                     \n\t" // Load c71  into quad and increment by rs_c.
-	" ld1 {v2.s}[2],[x5],x14                     \n\t" // Load c72  into quad and increment by rs_c.
-	" ld1 {v2.s}[3],[x5],x14                     \n\t" // Load c73  into quad and increment by rs_c.
-	" ld1 {v3.s}[0],[x5],x14                     \n\t" // Load c74  into quad and increment by rs_c.
-	" ld1 {v3.s}[1],[x5],x14                     \n\t" // Load c75  into quad and increment by rs_c.
-	" ld1 {v3.s}[2],[x5],x14                     \n\t" // Load c76  into quad and increment by rs_c.
-	" ld1 {v3.s}[3],[x5],x14                     \n\t" // Load c77  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x5, x24                                \n\t"
-	"                                            \n\t"
-	" ld1 {v4.s}[0],[x5],x14                     \n\t" // Load c80  into quad and increment by rs_c.
-	" ld1 {v4.s}[1],[x5],x14                     \n\t" // Load c81  into quad and increment by rs_c.
-	" ld1 {v4.s}[2],[x5],x14                     \n\t" // Load c82  into quad and increment by rs_c.
-	" ld1 {v4.s}[3],[x5],x14                     \n\t" // Load c83  into quad and increment by rs_c.
-	" ld1 {v5.s}[0],[x5],x14                     \n\t" // Load c84  into quad and increment by rs_c.
-	" ld1 {v5.s}[1],[x5],x14                     \n\t" // Load c85  into quad and increment by rs_c.
-	" ld1 {v5.s}[2],[x5],x14                     \n\t" // Load c86  into quad and increment by rs_c.
-	" ld1 {v5.s}[3],[x5],x14                     \n\t" // Load c87  into quad and increment by rs_c.
-	"                                            \n\t"
-	" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
-	" fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
-	" fmul v2.4s,v2.4s,v7.s[0]                   \n\t" // Scale by beta
-	" fmul v3.4s,v3.4s,v7.s[0]                   \n\t" // Scale by beta
-	" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
-	" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
-	"                                            \n\t"
-	LABEL(SBETAZEROGENSTOREDS3)
-	"                                            \n\t"
-	" fmla v0.4s,v20.4s,v6.s[0]                  \n\t" // Scale by alpha
-	" fmla v1.4s,v21.4s,v6.s[0]                  \n\t" // Scale by alpha
-	" fmla v2.4s,v22.4s,v6.s[0]                  \n\t" // Scale by alpha
-	" fmla v3.4s,v23.4s,v6.s[0]                  \n\t" // Scale by alpha
-	" fmla v4.4s,v24.4s,v6.s[0]                  \n\t" // Scale by alpha
-	" fmla v5.4s,v25.4s,v6.s[0]                  \n\t" // Scale by alpha
-	"                                            \n\t"
-	" mov x5, x22                                \n\t"
-	"                                            \n\t"
-	" st1 {v0.s}[0],[x5],x14                     \n\t" // Store c60  into quad and increment by rs_c.
-	" st1 {v0.s}[1],[x5],x14                     \n\t" // Store c61  into quad and increment by rs_c.
-	" st1 {v0.s}[2],[x5],x14                     \n\t" // Store c62  into quad and increment by rs_c.
-	" st1 {v0.s}[3],[x5],x14                     \n\t" // Store c63  into quad and increment by rs_c.
-	" st1 {v1.s}[0],[x5],x14                     \n\t" // Store c64  into quad and increment by rs_c.
-	" st1 {v1.s}[1],[x5],x14                     \n\t" // Store c65  into quad and increment by rs_c.
-	" st1 {v1.s}[2],[x5],x14                     \n\t" // Store c66  into quad and increment by rs_c.
-	" st1 {v1.s}[3],[x5],x14                     \n\t" // Store c67  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x5, x23                                \n\t"
-	"                                            \n\t"
-	" st1 {v2.s}[0],[x5],x14                     \n\t" // Store c70  into quad and increment by rs_c.
-	" st1 {v2.s}[1],[x5],x14                     \n\t" // Store c71  into quad and increment by rs_c.
-	" st1 {v2.s}[2],[x5],x14                     \n\t" // Store c72  into quad and increment by rs_c.
-	" st1 {v2.s}[3],[x5],x14                     \n\t" // Store c73  into quad and increment by rs_c.
-	" st1 {v3.s}[0],[x5],x14                     \n\t" // Store c74  into quad and increment by rs_c.
-	" st1 {v3.s}[1],[x5],x14                     \n\t" // Store c75  into quad and increment by rs_c.
-	" st1 {v3.s}[2],[x5],x14                     \n\t" // Store c76  into quad and increment by rs_c.
-	" st1 {v3.s}[3],[x5],x14                     \n\t" // Store c77  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x5, x24                                \n\t"
-	"                                            \n\t"
-	" st1 {v4.s}[0],[x5],x14                     \n\t" // Store c80  into quad and increment by rs_c.
-	" st1 {v4.s}[1],[x5],x14                     \n\t" // Store c81  into quad and increment by rs_c.
-	" st1 {v4.s}[2],[x5],x14                     \n\t" // Store c82  into quad and increment by rs_c.
-	" st1 {v4.s}[3],[x5],x14                     \n\t" // Store c83  into quad and increment by rs_c.
-	" st1 {v5.s}[0],[x5],x14                     \n\t" // Store c84  into quad and increment by rs_c.
-	" st1 {v5.s}[1],[x5],x14                     \n\t" // Store c85  into quad and increment by rs_c.
-	" st1 {v5.s}[2],[x5],x14                     \n\t" // Store c86  into quad and increment by rs_c.
-	" st1 {v5.s}[3],[x5],x14                     \n\t" // Store c87  into quad and increment by rs_c.
-	"                                            \n\t"
-	" dup  v8.4s, wzr                            \n\t"
-	" dup  v9.4s, wzr                            \n\t"
-	" dup  v10.4s, wzr                           \n\t"
-	" dup  v11.4s, wzr                           \n\t"
-	" dup  v12.4s, wzr                           \n\t"
-	" dup  v13.4s, wzr                           \n\t"
-	"                                            \n\t"
-	" fcmp s7,#0.0                               \n\t"
-	BEQ(SBETAZEROGENSTOREDS4)                          // Taking care of the beta==0 case.
-	"                                            \n\t"
-	" mov x5, x25                                \n\t"
-	"                                            \n\t"
-	" ld1 {v8.s}[0],[x5],x14                     \n\t" // Load c90  into quad and increment by rs_c.
-	" ld1 {v8.s}[1],[x5],x14                     \n\t" // Load c91  into quad and increment by rs_c.
-	" ld1 {v8.s}[2],[x5],x14                     \n\t" // Load c92  into quad and increment by rs_c.
-	" ld1 {v8.s}[3],[x5],x14                     \n\t" // Load c93  into quad and increment by rs_c.
-	" ld1 {v9.s}[0],[x5],x14                     \n\t" // Load c94  into quad and increment by rs_c.
-	" ld1 {v9.s}[1],[x5],x14                     \n\t" // Load c95  into quad and increment by rs_c.
-	" ld1 {v9.s}[2],[x5],x14                     \n\t" // Load c96  into quad and increment by rs_c.
-	" ld1 {v9.s}[3],[x5],x14                     \n\t" // Load c97  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x5, x26                                \n\t"
-	"                                            \n\t"
-	" ld1 {v10.s}[0],[x5],x14                    \n\t" // Load c100  into quad and increment by rs_c.
-	" ld1 {v10.s}[1],[x5],x14                    \n\t" // Load c101  into quad and increment by rs_c.
-	" ld1 {v10.s}[2],[x5],x14                    \n\t" // Load c102  into quad and increment by rs_c.
-	" ld1 {v10.s}[3],[x5],x14                    \n\t" // Load c103  into quad and increment by rs_c.
-	" ld1 {v11.s}[0],[x5],x14                    \n\t" // Load c104  into quad and increment by rs_c.
-	" ld1 {v11.s}[1],[x5],x14                    \n\t" // Load c105  into quad and increment by rs_c.
-	" ld1 {v11.s}[2],[x5],x14                    \n\t" // Load c106  into quad and increment by rs_c.
-	" ld1 {v11.s}[3],[x5],x14                    \n\t" // Load c107  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x5, x27                                \n\t"
-	"                                            \n\t"
-	" ld1 {v12.s}[0],[x5],x14                    \n\t" // Load c110  into quad and increment by rs_c.
-	" ld1 {v12.s}[1],[x5],x14                    \n\t" // Load c111  into quad and increment by rs_c.
-	" ld1 {v12.s}[2],[x5],x14                    \n\t" // Load c112  into quad and increment by rs_c.
-	" ld1 {v12.s}[3],[x5],x14                    \n\t" // Load c113  into quad and increment by rs_c.
-	" ld1 {v13.s}[0],[x5],x14                    \n\t" // Load c114  into quad and increment by rs_c.
-	" ld1 {v13.s}[1],[x5],x14                    \n\t" // Load c115  into quad and increment by rs_c.
-	" ld1 {v13.s}[2],[x5],x14                    \n\t" // Load c116  into quad and increment by rs_c.
-	" ld1 {v13.s}[3],[x5],x14                    \n\t" // Load c117  into quad and increment by rs_c.
-	"                                            \n\t"
-	" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
-	" fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
-	" fmul v10.4s,v10.4s,v7.s[0]                 \n\t" // Scale by beta
-	" fmul v11.4s,v11.4s,v7.s[0]                 \n\t" // Scale by beta
-	" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
-	" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
-	"                                            \n\t"
-	LABEL(SBETAZEROGENSTOREDS4)
-	"                                            \n\t"
-	" prfm pldl2keep,[x0]                        \n\t"
-	" prfm pldl2keep,[x1]                        \n\t"
-	"                                            \n\t"
-	" fmla v8.4s, v26.4s,v6.s[0]                 \n\t" // Scale by alpha
-	" fmla v9.4s, v27.4s,v6.s[0]                 \n\t" // Scale by alpha
-	" fmla v10.4s,v28.4s,v6.s[0]                 \n\t" // Scale by alpha
-	" fmla v11.4s,v29.4s,v6.s[0]                 \n\t" // Scale by alpha
-	" fmla v12.4s,v30.4s,v6.s[0]                 \n\t" // Scale by alpha
-	" fmla v13.4s,v31.4s,v6.s[0]                 \n\t" // Scale by alpha
-	"                                            \n\t"
-	" mov x5, x25                                \n\t"
-	"                                            \n\t"
-	" st1 {v8.s}[0],[x5],x14                     \n\t" // Store c90  into quad and increment by rs_c.
-	" st1 {v8.s}[1],[x5],x14                     \n\t" // Store c91  into quad and increment by rs_c.
-	" st1 {v8.s}[2],[x5],x14                     \n\t" // Store c92  into quad and increment by rs_c.
-	" st1 {v8.s}[3],[x5],x14                     \n\t" // Store c93  into quad and increment by rs_c.
-	" st1 {v9.s}[0],[x5],x14                     \n\t" // Store c94  into quad and increment by rs_c.
-	" st1 {v9.s}[1],[x5],x14                     \n\t" // Store c95  into quad and increment by rs_c.
-	" st1 {v9.s}[2],[x5],x14                     \n\t" // Store c96  into quad and increment by rs_c.
-	" st1 {v9.s}[3],[x5],x14                     \n\t" // Store c97  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x5, x26                                \n\t"
-	"                                            \n\t"
-	" st1 {v10.s}[0],[x5],x14                    \n\t" // Store c100  into quad and increment by rs_c.
-	" st1 {v10.s}[1],[x5],x14                    \n\t" // Store c101  into quad and increment by rs_c.
-	" st1 {v10.s}[2],[x5],x14                    \n\t" // Store c102  into quad and increment by rs_c.
-	" st1 {v10.s}[3],[x5],x14                    \n\t" // Store c103  into quad and increment by rs_c.
-	" st1 {v11.s}[0],[x5],x14                    \n\t" // Store c104  into quad and increment by rs_c.
-	" st1 {v11.s}[1],[x5],x14                    \n\t" // Store c105  into quad and increment by rs_c.
-	" st1 {v11.s}[2],[x5],x14                    \n\t" // Store c106  into quad and increment by rs_c.
-	" st1 {v11.s}[3],[x5],x14                    \n\t" // Store c107  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x5, x27                                \n\t"
-	"                                            \n\t"
-	" st1 {v12.s}[0],[x5],x14                    \n\t" // Store c110  into quad and increment by rs_c.
-	" st1 {v12.s}[1],[x5],x14                    \n\t" // Store c111  into quad and increment by rs_c.
-	" st1 {v12.s}[2],[x5],x14                    \n\t" // Store c112  into quad and increment by rs_c.
-	" st1 {v12.s}[3],[x5],x14                    \n\t" // Store c113  into quad and increment by rs_c.
-	" st1 {v13.s}[0],[x5],x14                    \n\t" // Store c114  into quad and increment by rs_c.
-	" st1 {v13.s}[1],[x5],x14                    \n\t" // Store c115  into quad and increment by rs_c.
-	" st1 {v13.s}[2],[x5],x14                    \n\t" // Store c116  into quad and increment by rs_c.
-	" st1 {v13.s}[3],[x5],x14                    \n\t" // Store c147  into quad and increment by rs_c.
-	"                                            \n\t"
-	LABEL(SEND)                                        // Done!
+	// BRANCH(SEND)                                       // Done.
+	// LABEL(SEND)                                        // Done!
 	"                                            \n\t"
 	:// output operands (none)
 	:// input operands
@@ -1072,7 +693,7 @@ void bli_sgemm_armv8a_asm_8x12
 	 [b_next] "m" (b_next) // 10
 	:// Register clobber list
 	 "x0", "x1", "x2",
-	 "x5", "x6", "x10","x14",
+	 "x5", "x6", "x10",
 	 "x16","x17","x19","x20",
 	 "x21","x22","x23","x24",
 	 "x25","x26","x27",
@@ -1148,8 +769,8 @@ void bli_dgemm_armv8a_asm_6x8
 	" ldr x10,%[cs_c]                            \n\t" // Load cs_c
 	" lsl x10,x10,#3                             \n\t" // cs_c * sizeof(double)
 	"                                            \n\t"
-	" ldr x14,%[rs_c]                            \n\t" // Load rs_c.
-	" lsl x14,x14,#3                             \n\t" // rs_c * sizeof(double).
+	// " ldr x14,%[rs_c]                            \n\t" // Load rs_c.
+	// " lsl x14,x14,#3                             \n\t" // rs_c * sizeof(double).
 	"                                            \n\t"
 	" add x20,x2,x10                             \n\t" //Load address Column 1 of C
 	" add x21,x20,x10                            \n\t" //Load address Column 2 of C
@@ -1620,9 +1241,6 @@ void bli_dgemm_armv8a_asm_6x8
 	" ldr x0,%[a_next]                           \n\t" // Next A address for later use.
 	" ldr x1,%[b_next]                           \n\t" // Next B address for later use.
 	"                                            \n\t"
-	" cmp x14,#8                                 \n\t" // If rs_c != 1 (column-major)
-	BNE(DGENSTORED)
-	"                                            \n\t"
 	LABEL(DCOLSTORED)                                  // C is column-major.
 	"                                            \n\t"
 	" dup  v0.2d, xzr                            \n\t"
@@ -1796,262 +1414,8 @@ void bli_dgemm_armv8a_asm_6x8
 	" str q12, [x26, #16]                        \n\t"
 	" str q13, [x26, #32]                        \n\t"
 	"                                            \n\t"
-	BRANCH(DEND)
-	"                                            \n\t"
-	LABEL(DGENSTORED)                                  // C is general-stride stored.
-	"                                            \n\t"
-	" dup  v0.2d, xzr                            \n\t"
-	" dup  v1.2d, xzr                            \n\t"
-	" dup  v2.2d, xzr                            \n\t"
-	" dup  v3.2d, xzr                            \n\t"
-	" dup  v4.2d, xzr                            \n\t"
-	" dup  v5.2d, xzr                            \n\t"
-	"                                            \n\t"
-	" fcmp d7,#0.0                               \n\t"
-	BEQ(DBETAZEROGENSTOREDS1)                          // Taking care of the beta==0 case.
-	"                                            \n\t"
-	" mov x27, x2                                \n\t"
-	"                                            \n\t" // Load address of C.
-	" ld1 {v0.d}[0],[x27],x14                    \n\t" // Load c00  into quad and increment by rs_c.
-	" ld1 {v0.d}[1],[x27],x14                    \n\t" // Load c01  into quad and increment by rs_c.
-	" ld1 {v1.d}[0],[x27],x14                    \n\t" // Load c02  into quad and increment by rs_c.
-	" ld1 {v1.d}[1],[x27],x14                    \n\t" // Load c03  into quad and increment by rs_c.
-	" ld1 {v2.d}[0],[x27],x14                    \n\t" // Load c04  into quad and increment by rs_c.
-	" ld1 {v2.d}[1],[x27],x14                    \n\t" // Load c05  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x27, x20                               \n\t" // Load address of C.
-	"                                            \n\t"
-	" ld1 {v3.d}[0],[x27],x14                    \n\t" // Load c10  into quad and increment by rs_c.
-	" ld1 {v3.d}[1],[x27],x14                    \n\t" // Load c11  into quad and increment by rs_c.
-	" ld1 {v4.d}[0],[x27],x14                    \n\t" // Load c12  into quad and increment by rs_c.
-	" ld1 {v4.d}[1],[x27],x14                    \n\t" // Load c13  into quad and increment by rs_c.
-	" ld1 {v5.d}[0],[x27],x14                    \n\t" // Load c14  into quad and increment by rs_c.
-	" ld1 {v5.d}[1],[x27],x14                    \n\t" // Load c15  into quad and increment by rs_c.
-	"                                            \n\t"
-	" fmul v0.2d,v0.2d,v7.d[0]                   \n\t" // Scale by beta
-	" fmul v1.2d,v1.2d,v7.d[0]                   \n\t" // Scale by beta
-	" fmul v2.2d,v2.2d,v7.d[0]                   \n\t" // Scale by beta
-	" fmul v3.2d,v3.2d,v7.d[0]                   \n\t" // Scale by beta
-	" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
-	" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
-	"                                            \n\t"
-	LABEL(DBETAZEROGENSTOREDS1)
-	"                                            \n\t"
-	" fmla v0.2d,v8.2d,v6.d[0]                   \n\t" // Scale by alpha
-	" fmla v1.2d,v9.2d,v6.d[0]                   \n\t" // Scale by alpha
-	" fmla v2.2d,v10.2d,v6.d[0]                  \n\t" // Scale by alpha
-	" fmla v3.2d,v11.2d,v6.d[0]                  \n\t" // Scale by alpha
-	" fmla v4.2d,v12.2d,v6.d[0]                  \n\t" // Scale by alpha
-	" fmla v5.2d,v13.2d,v6.d[0]                  \n\t" // Scale by alpha
-	"                                            \n\t"
-	" mov x27, x2                                \n\t" // Load address of C.
-	"                                            \n\t"
-	" st1 {v0.d}[0],[x27],x14                    \n\t" // Store c00  into quad and increment by rs_c.
-	" st1 {v0.d}[1],[x27],x14                    \n\t" // Store c01  into quad and increment by rs_c.
-	" st1 {v1.d}[0],[x27],x14                    \n\t" // Store c02  into quad and increment by rs_c.
-	" st1 {v1.d}[1],[x27],x14                    \n\t" // Store c03  into quad and increment by rs_c.
-	" st1 {v2.d}[0],[x27],x14                    \n\t" // Store c04  into quad and increment by rs_c.
-	" st1 {v2.d}[1],[x27],x14                    \n\t" // Store c05  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x27, x20                               \n\t" // Load address of C.
-	"                                            \n\t"
-	" st1 {v3.d}[0],[x27],x14                    \n\t" // Store c10  into quad and increment by rs_c.
-	" st1 {v3.d}[1],[x27],x14                    \n\t" // Store c11  into quad and increment by rs_c.
-	" st1 {v4.d}[0],[x27],x14                    \n\t" // Store c12  into quad and increment by rs_c.
-	" st1 {v4.d}[1],[x27],x14                    \n\t" // Store c13  into quad and increment by rs_c.
-	" st1 {v5.d}[0],[x27],x14                    \n\t" // Store c14  into quad and increment by rs_c.
-	" st1 {v5.d}[1],[x27],x14                    \n\t" // Store c15  into quad and increment by rs_c.
-	"                                            \n\t"
-	" dup  v8.2d, xzr                            \n\t"
-	" dup  v9.2d, xzr                            \n\t"
-	" dup  v10.2d, xzr                           \n\t"
-	" dup  v11.2d, xzr                           \n\t"
-	" dup  v12.2d, xzr                           \n\t"
-	" dup  v13.2d, xzr                           \n\t"
-	"                                            \n\t"
-	" fcmp d7,#0.0                               \n\t"
-	BEQ(DBETAZEROGENSTOREDS2)                          // Taking care of the beta==0 case.
-	"                                            \n\t"
-	" mov x27, x21                               \n\t" // Load address of C.
-	"                                            \n\t"
-	" ld1 {v8.d}[0], [x27],x14                   \n\t" // Load c20  into quad and increment by rs_c.
-	" ld1 {v8.d}[1], [x27],x14                   \n\t" // Load c21  into quad and increment by rs_c.
-	" ld1 {v9.d}[0], [x27],x14                   \n\t" // Load c22  into quad and increment by rs_c.
-	" ld1 {v9.d}[1], [x27],x14                   \n\t" // Load c23  into quad and increment by rs_c.
-	" ld1 {v10.d}[0],[x27],x14                   \n\t" // Load c24  into quad and increment by rs_c.
-	" ld1 {v10.d}[1],[x27],x14                   \n\t" // Load c25  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x27, x22                               \n\t" // Load address of C.
-	"                                            \n\t"
-	" ld1 {v11.d}[0],[x27],x14                   \n\t" // Load c30  into quad and increment by rs_c.
-	" ld1 {v11.d}[1],[x27],x14                   \n\t" // Load c31  into quad and increment by rs_c.
-	" ld1 {v12.d}[0],[x27],x14                   \n\t" // Load c32  into quad and increment by rs_c.
-	" ld1 {v12.d}[1],[x27],x14                   \n\t" // Load c33  into quad and increment by rs_c.
-	" ld1 {v13.d}[0],[x27],x14                   \n\t" // Load c34  into quad and increment by rs_c.
-	" ld1 {v13.d}[1],[x27],x14                   \n\t" // Load c35  into quad and increment by rs_c.
-	"                                            \n\t"
-	" fmul v8.2d, v8.2d, v7.d[0]                 \n\t" // Scale by beta
-	" fmul v9.2d, v9.2d, v7.d[0]                 \n\t" // Scale by beta
-	" fmul v10.2d,v10.2d,v7.d[0]                 \n\t" // Scale by beta
-	" fmul v11.2d,v11.2d,v7.d[0]                 \n\t" // Scale by beta
-	" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
-	" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
-	"                                            \n\t"
-	LABEL(DBETAZEROGENSTOREDS2)
-	"                                            \n\t"
-	" fmla v8.2d, v14.2d,v6.d[0]                 \n\t" // Scale by alpha
-	" fmla v9.2d, v15.2d,v6.d[0]                 \n\t" // Scale by alpha
-	" fmla v10.2d,v16.2d,v6.d[0]                 \n\t" // Scale by alpha
-	" fmla v11.2d,v17.2d,v6.d[0]                 \n\t" // Scale by alpha
-	" fmla v12.2d,v18.2d,v6.d[0]                 \n\t" // Scale by alpha
-	" fmla v13.2d,v19.2d,v6.d[0]                 \n\t" // Scale by alpha
-	"                                            \n\t"
-	" mov x27, x21                               \n\t" // Load address of C.
-	"                                            \n\t"
-	" st1 {v8.d}[0], [x27],x14                   \n\t" // Store c20  into quad and increment by rs_c.
-	" st1 {v8.d}[1], [x27],x14                   \n\t" // Store c21  into quad and increment by rs_c.
-	" st1 {v9.d}[0], [x27],x14                   \n\t" // Store c22  into quad and increment by rs_c.
-	" st1 {v9.d}[1], [x27],x14                   \n\t" // Store c23  into quad and increment by rs_c.
-	" st1 {v10.d}[0],[x27],x14                   \n\t" // Store c24  into quad and increment by rs_c.
-	" st1 {v10.d}[1],[x27],x14                   \n\t" // Store c25  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x27, x22                               \n\t" // Load address of C.
-	"                                            \n\t"
-	" st1 {v11.d}[0],[x27],x14                   \n\t" // Store c30  into quad and increment by rs_c.
-	" st1 {v11.d}[1],[x27],x14                   \n\t" // Store c31  into quad and increment by rs_c.
-	" st1 {v12.d}[0],[x27],x14                   \n\t" // Store c32  into quad and increment by rs_c.
-	" st1 {v12.d}[1],[x27],x14                   \n\t" // Store c33  into quad and increment by rs_c.
-	" st1 {v13.d}[0],[x27],x14                   \n\t" // Store c34  into quad and increment by rs_c.
-	" st1 {v13.d}[1],[x27],x14                   \n\t" // Store c35  into quad and increment by rs_c.
-	"                                            \n\t"
-	" dup  v0.2d, xzr                            \n\t"
-	" dup  v1.2d, xzr                            \n\t"
-	" dup  v2.2d, xzr                            \n\t"
-	" dup  v3.2d, xzr                            \n\t"
-	" dup  v4.2d, xzr                            \n\t"
-	" dup  v5.2d, xzr                            \n\t"
-	"                                            \n\t"
-	" fcmp d7,#0.0                               \n\t"
-	BEQ(DBETAZEROGENSTOREDS3)                          // Taking care of the beta==0 case.
-	"                                            \n\t"
-	" mov x27, x23                               \n\t" // Load address of C.
-	"                                            \n\t"
-	" ld1 {v0.d}[0],[x27],x14                    \n\t" // Load c40  into quad and increment by rs_c.
-	" ld1 {v0.d}[1],[x27],x14                    \n\t" // Load c41  into quad and increment by rs_c.
-	" ld1 {v1.d}[0],[x27],x14                    \n\t" // Load c42  into quad and increment by rs_c.
-	" ld1 {v1.d}[1],[x27],x14                    \n\t" // Load c43  into quad and increment by rs_c.
-	" ld1 {v2.d}[0],[x27],x14                    \n\t" // Load c44  into quad and increment by rs_c.
-	" ld1 {v2.d}[1],[x27],x14                    \n\t" // Load c45  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x27, x24                               \n\t" // Load address of C.
-	"                                            \n\t"
-	" ld1 {v3.d}[0],[x27],x14                    \n\t" // Load c50  into quad and increment by rs_c.
-	" ld1 {v3.d}[1],[x27],x14                    \n\t" // Load c51  into quad and increment by rs_c.
-	" ld1 {v4.d}[0],[x27],x14                    \n\t" // Load c52  into quad and increment by rs_c.
-	" ld1 {v4.d}[1],[x27],x14                    \n\t" // Load c53  into quad and increment by rs_c.
-	" ld1 {v5.d}[0],[x27],x14                    \n\t" // Load c54  into quad and increment by rs_c.
-	" ld1 {v5.d}[1],[x27],x14                    \n\t" // Load c55  into quad and increment by rs_c.
-	"                                            \n\t"
-	" fmul v0.2d,v0.2d,v7.d[0]                   \n\t" // Scale by beta
-	" fmul v1.2d,v1.2d,v7.d[0]                   \n\t" // Scale by beta
-	" fmul v2.2d,v2.2d,v7.d[0]                   \n\t" // Scale by beta
-	" fmul v3.2d,v3.2d,v7.d[0]                   \n\t" // Scale by beta
-	" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
-	" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
-	"                                            \n\t"
-	LABEL(DBETAZEROGENSTOREDS3)
-	"                                            \n\t"
-	" fmla v0.2d,v20.2d,v6.d[0]                  \n\t" // Scale by alpha
-	" fmla v1.2d,v21.2d,v6.d[0]                  \n\t" // Scale by alpha
-	" fmla v2.2d,v22.2d,v6.d[0]                  \n\t" // Scale by alpha
-	" fmla v3.2d,v23.2d,v6.d[0]                  \n\t" // Scale by alpha
-	" fmla v4.2d,v24.2d,v6.d[0]                  \n\t" // Scale by alpha
-	" fmla v5.2d,v25.2d,v6.d[0]                  \n\t" // Scale by alpha
-	"                                            \n\t"
-	" mov x27, x23                               \n\t" // Load address of C.
-	"                                            \n\t"
-	" st1 {v0.d}[0],[x27],x14                    \n\t" // Store c40  into quad and increment by rs_c.
-	" st1 {v0.d}[1],[x27],x14                    \n\t" // Store c41  into quad and increment by rs_c.
-	" st1 {v1.d}[0],[x27],x14                    \n\t" // Store c42  into quad and increment by rs_c.
-	" st1 {v1.d}[1],[x27],x14                    \n\t" // Store c43  into quad and increment by rs_c.
-	" st1 {v2.d}[0],[x27],x14                    \n\t" // Store c44  into quad and increment by rs_c.
-	" st1 {v2.d}[1],[x27],x14                    \n\t" // Store c45  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x27, x24                               \n\t" // Load address of C.
-	"                                            \n\t"
-	" st1 {v3.d}[0],[x27],x14                    \n\t" // Store c50  into quad and increment by rs_c.
-	" st1 {v3.d}[1],[x27],x14                    \n\t" // Store c51  into quad and increment by rs_c.
-	" st1 {v4.d}[0],[x27],x14                    \n\t" // Store c52  into quad and increment by rs_c.
-	" st1 {v4.d}[1],[x27],x14                    \n\t" // Store c53  into quad and increment by rs_c.
-	" st1 {v5.d}[0],[x27],x14                    \n\t" // Store c54  into quad and increment by rs_c.
-	" st1 {v5.d}[1],[x27],x14                    \n\t" // Store c55  into quad and increment by rs_c.
-	"                                            \n\t"
-	" dup  v8.2d, xzr                            \n\t"
-	" dup  v9.2d, xzr                            \n\t"
-	" dup  v10.2d, xzr                           \n\t"
-	" dup  v11.2d, xzr                           \n\t"
-	" dup  v12.2d, xzr                           \n\t"
-	" dup  v13.2d, xzr                           \n\t"
-	"                                            \n\t"
-	" fcmp d7,#0.0                               \n\t"
-	BEQ(DBETAZEROGENSTOREDS4)                          // Taking care of the beta==0 case.
-	"                                            \n\t"
-	" mov x27, x25                               \n\t"
-	"                                            \n\t"
-	" ld1 {v8.d}[0], [x27],x14                   \n\t" // Load c60  into quad and increment by rs_c.
-	" ld1 {v8.d}[1], [x27],x14                   \n\t" // Load c61  into quad and increment by rs_c.
-	" ld1 {v9.d}[0], [x27],x14                   \n\t" // Load c62  into quad and increment by rs_c.
-	" ld1 {v9.d}[1], [x27],x14                   \n\t" // Load c63  into quad and increment by rs_c.
-	" ld1 {v10.d}[0],[x27],x14                   \n\t" // Load c64  into quad and increment by rs_c.
-	" ld1 {v10.d}[1],[x27],x14                   \n\t" // Load c65  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x27, x26                               \n\t" // Load address of C.
-	"                                            \n\t"
-	" ld1 {v11.d}[0],[x27],x14                   \n\t" // Load c70  into quad and increment by rs_c.
-	" ld1 {v11.d}[1],[x27],x14                   \n\t" // Load c71  into quad and increment by rs_c.
-	" ld1 {v12.d}[0],[x27],x14                   \n\t" // Load c72  into quad and increment by rs_c.
-	" ld1 {v12.d}[1],[x27],x14                   \n\t" // Load c73  into quad and increment by rs_c.
-	" ld1 {v13.d}[0],[x27],x14                   \n\t" // Load c74  into quad and increment by rs_c.
-	" ld1 {v13.d}[1],[x27],x14                   \n\t" // Load c75  into quad and increment by rs_c.
-	"                                            \n\t"
-	" fmul v8.2d, v8.2d, v7.d[0]                 \n\t" // Scale by beta
-	" fmul v9.2d, v9.2d, v7.d[0]                 \n\t" // Scale by beta
-	" fmul v10.2d,v10.2d,v7.d[0]                 \n\t" // Scale by beta
-	" fmul v11.2d,v11.2d,v7.d[0]                 \n\t" // Scale by beta
-	" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
-	" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
-	"                                            \n\t"
-	LABEL(DBETAZEROGENSTOREDS4)
-	"                                            \n\t"
-	" prfm pldl2keep,[x0]                        \n\t"
-	" prfm pldl2keep,[x1]                        \n\t"
-	"                                            \n\t"
-	" fmla v8.2d, v26.2d,v6.d[0]                 \n\t" // Scale by alpha
-	" fmla v9.2d, v27.2d,v6.d[0]                 \n\t" // Scale by alpha
-	" fmla v10.2d,v28.2d,v6.d[0]                 \n\t" // Scale by alpha
-	" fmla v11.2d,v29.2d,v6.d[0]                 \n\t" // Scale by alpha
-	" fmla v12.2d,v30.2d,v6.d[0]                 \n\t" // Scale by alpha
-	" fmla v13.2d,v31.2d,v6.d[0]                 \n\t" // Scale by alpha
-	"                                            \n\t"
-	" mov x27, x25                               \n\t" // Load address of C.
-	"                                            \n\t"
-	" st1 {v8.d}[0], [x27],x14                   \n\t" // Store c60  into quad and increment by rs_c.
-	" st1 {v8.d}[1], [x27],x14                   \n\t" // Store c61  into quad and increment by rs_c.
-	" st1 {v9.d}[0], [x27],x14                   \n\t" // Store c62  into quad and increment by rs_c.
-	" st1 {v9.d}[1], [x27],x14                   \n\t" // Store c63  into quad and increment by rs_c.
-	" st1 {v10.d}[0],[x27],x14                   \n\t" // Store c64  into quad and increment by rs_c.
-	" st1 {v10.d}[1],[x27],x14                   \n\t" // Store c65  into quad and increment by rs_c.
-	"                                            \n\t"
-	" mov x27, x26                               \n\t" // Load address of C.
-	"                                            \n\t"
-	" st1 {v11.d}[0],[x27],x14                   \n\t" // Store c70  into quad and increment by rs_c.
-	" st1 {v11.d}[1],[x27],x14                   \n\t" // Store c71  into quad and increment by rs_c.
-	" st1 {v12.d}[0],[x27],x14                   \n\t" // Store c72  into quad and increment by rs_c.
-	" st1 {v12.d}[1],[x27],x14                   \n\t" // Store c73  into quad and increment by rs_c.
-	" st1 {v13.d}[0],[x27],x14                   \n\t" // Store c74  into quad and increment by rs_c.
-	" st1 {v13.d}[1],[x27],x14                   \n\t" // Store c75  into quad and increment by rs_c.
-	"                                            \n\t"
-	LABEL(DEND)                                        // Done!
+	// BRANCH(DEND)
+	// LABEL(DEND)                                        // Done!
 	"                                            \n\t"
 	:// output operands (none)
 	:// input operands
@@ -2069,8 +1433,9 @@ void bli_dgemm_armv8a_asm_6x8
 	:// Register clobber list
 	 "x0","x1","x2",
 	 "x5","x6","x10",
-	 "x14","x16","x17",
-	 "x20","x21","x22","x23","x24","x25","x26","x27",
+	 "x16","x17","x20",
+	 "x21","x22","x23",
+	 "x24","x25","x26","x27",
 	 "v0","v1","v2",
 	 "v3","v4","v5",
 	 "v6","v7","v8",

From 9cc897f37455d52fbba752e3801f1a9d4a5bfdc1 Mon Sep 17 00:00:00 2001
From: Ruqing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
Date: Thu, 3 Feb 2022 16:40:02 +0000
Subject: [PATCH 025/230] Fix SVE Compil.

---
 config/a64fx/bli_family_a64fx.h                        | 10 ++++++++++
 .../armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c    |  2 +-
 .../armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c    |  2 +-
 .../armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c    |  2 +-
 .../armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c    |  2 +-
 5 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/config/a64fx/bli_family_a64fx.h b/config/a64fx/bli_family_a64fx.h
index 5e3f29fd4..b67ae7c60 100644
--- a/config/a64fx/bli_family_a64fx.h
+++ b/config/a64fx/bli_family_a64fx.h
@@ -41,6 +41,16 @@
 #define BLIS_SIMD_ALIGN_SIZE    256
 #define BLIS_SIMD_NUM_REGISTERS 32
 
+// SVE-specific configs.
+#define N_L1_SVE_DEFAULT 64
+#define W_L1_SVE_DEFAULT 4
+#define C_L1_SVE_DEFAULT 256
+#define N_L2_SVE_DEFAULT 2048
+#define W_L2_SVE_DEFAULT 16
+#define C_L2_SVE_DEFAULT 256
+#define N_L3_SVE_DEFAULT 8192
+#define W_L3_SVE_DEFAULT 16
+#define C_L3_SVE_DEFAULT 256
 
 //#endif
 
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
index 60a64515f..0327f6dbc 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
@@ -140,7 +140,7 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, %4                    \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
-// " END_CCOL_PRFM:                                  \n\t"
+" END_CCOL_PRFM:                                  \n\t"
 "                                                 \n\t"
 CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
 "                                                 \n\t"
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
index 7136104b5..e92eba9d6 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
@@ -139,7 +139,7 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, x7                    \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
-// " END_CCOL_PRFM:                                  \n\t"
+" END_CCOL_PRFM:                                  \n\t"
 "                                                 \n\t"
 CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
 "                                                 \n\t"
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
index 20841891b..deb01f9fe 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
@@ -139,7 +139,7 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
 " prfm            PLDL1STRM, [x16]                \n\t"
 " add             x16, x16, x7                    \n\t"
 " prfm            PLDL1STRM, [x16]                \n\t"
-// " END_CCOL_PRFM:                                  \n\t"
+" END_CCOL_PRFM:                                  \n\t"
 "                                                 \n\t"
 CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
 "                                                 \n\t"
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
index 7e630894f..e941f5abd 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
@@ -140,7 +140,7 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, %4                    \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
-// " END_CCOL_PRFM:                                  \n\t"
+" END_CCOL_PRFM:                                  \n\t"
 "                                                 \n\t"
 CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
 "                                                 \n\t"

From 72089bb2917b78d99cf4f27c69125bf213ee54e6 Mon Sep 17 00:00:00 2001
From: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
Date: Sat, 5 Feb 2022 16:56:04 +0900
Subject: [PATCH 026/230] ArmSVE Use Predicate in M-Direction

No need to query MR during kernel runtime.
---
 .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c  |  7 ++-
 .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c  | 53 +++++++------------
 .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c  | 50 +++++++----------
 .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c  |  7 ++-
 4 files changed, 43 insertions(+), 74 deletions(-)

diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
index 0327f6dbc..c24384b02 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
@@ -68,10 +68,10 @@ void bli_cgemm_armsve_asm_2vx10_unindexed
   uint64_t cs_c   = cs_c0;
   uint64_t info   = 0;
 
-  uint64_t mr = bli_vl_bytes_armsve() * 2 / 8;
-  GEMM_UKR_SETUP_CT( c, mr, 10, false );
+  GEMM_UKR_SETUP_CT( c, m, 10, false );
 
   __asm__ volatile (
+" whilelo         p0.s, xzr, %12                  \n\t"
 // " ldr             x0, %[a]                        \n\t"
 // " ldr             x1, %[b]                        \n\t"
 " mov             x2, xzr                         \n\t"
@@ -97,7 +97,6 @@ void bli_cgemm_armsve_asm_2vx10_unindexed
 " madd            x2, x16, x2, xzr                \n\t" // cs_a
 " madd            x3, x16, x3, xzr                \n\t" // rs_b
 " madd            %4, x16, %4, xzr                \n\t" // cs_c
-" ptrue           p0.s                            \n\t"
 "                                                 \n\t"
 // " ldr             x5, %[k_mker]                   \n\t" // Number of loops.
 // " ldr             x6, %[k_left]                   \n\t"
@@ -307,7 +306,7 @@ GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4)
   "+r" (a_next), // %9
   "+r" (b_next), // %10
   "=r" (info)    // %11
-:
+: "r"  (m)       // %12
 : "x2","x3","x9","x16",
   "z0","z1","z2","z3","z4","z5","z6","z7",
   "z8","z9","z10","z11","z12","z13","z14","z15",
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
index e92eba9d6..1c2c37208 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
@@ -67,10 +67,14 @@ void bli_dgemm_armsve_asm_2vx10_unindexed
   uint64_t rs_c   = rs_c0;
   uint64_t cs_c   = cs_c0;
 
-  uint64_t mr = bli_vl_bytes_armsve() * 2 / 8;
-  GEMM_UKR_SETUP_CT( d, mr, 10, false );
+  GEMM_UKR_SETUP_CT( d, m, 10, false );
 
   __asm__ volatile (
+" mov             x0, xzr                         \n\t"
+" ldr             x1, %[m]                        \n\t"
+" whilelo         p0.d, x0, x1 \n\t" " incd x0    \n\t"
+" whilelo         p1.d, x0, x1                    \n\t"
+"                                                 \n\t"
 " ldr             x0, %[a]                        \n\t"
 " ldr             x1, %[b]                        \n\t"
 " mov             x2, xzr                         \n\t"
@@ -96,7 +100,6 @@ void bli_dgemm_armsve_asm_2vx10_unindexed
 " madd            x2, x8, x2, xzr                 \n\t" // cs_a
 " madd            x3, x8, x3, xzr                 \n\t" // rs_b
 " madd            x7, x8, x7, xzr                 \n\t" // cs_c
-" ptrue           p0.d                            \n\t"
 "                                                 \n\t"
 " ldr             x4, %[k_mker]                   \n\t" // Number of loops.
 " ldr             x8, %[k_left]                   \n\t"
@@ -114,7 +117,7 @@ void bli_dgemm_armsve_asm_2vx10_unindexed
 " ld1rd           z26.d, p0/z, [x1, 48]           \n\t"
 " ld1rd           z27.d, p0/z, [x1, 56]           \n\t"
 "                                                 \n\t"
-GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0)
 "                                                 \n\t"
 " CCOL_PRFM:                                      \n\t"
 // " cmp             x6, #1                          \n\t"
@@ -149,22 +152,22 @@ CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1
 " K_MKER_LOOP:                                    \n\t"
 "                                                 \n\t"
 " add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
-GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0)
 GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
 "                                                 \n\t"
 " add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
-GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0)
 GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
 "                                                 \n\t"
 " add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
-GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0)
 GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
 "                                                 \n\t"
 " subs            x4, x4, #1                      \n\t" // Decrease counter before final replica.
 " b.eq            FIN_MKER_LOOP                   \n\t" // Branch early to avoid reading excess mem.
 "                                                 \n\t"
 " add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
-GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0)
 GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
 " b               K_MKER_LOOP                     \n\t"
 "                                                 \n\t"
@@ -176,7 +179,7 @@ GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3
 " cmp             x8, #0                          \n\t" // End of execution.
 " b.eq            WRITE_MEM_PREP                  \n\t"
 "                                                 \n\t"
-GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0)
 " ld1rd           z20.d, p0/z, [x1]               \n\t" // Load 8/10 of first B row.
 " ld1rd           z21.d, p0/z, [x1, 8]            \n\t"
 " ld1rd           z22.d, p0/z, [x1, 16]           \n\t"
@@ -255,7 +258,7 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
 "                                                 \n\t" // C address for storing is x5 itself.
 // " cmp             x6, #1                          \n\t" // Preload first half of C for contiguous case.
 // " b.ne            WRITE_MEM                       \n\t"
-GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
+GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7)
 "                                                 \n\t"
 " WRITE_MEM:                                      \n\t"
 "                                                 \n\t"
@@ -273,35 +276,16 @@ SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1
 " fcmp            d31, #0.0                       \n\t" // Skip loading if *beta == 0 to override NaN.
 " b.eq            BETA_ZERO_C                     \n\t"
 // First half of C is already loaded in this case.
-// GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7)
+// GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7)
 GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
-GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
+GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7)
 GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
 "                                                 \n\t"
 " BETA_ZERO_C:                                    \n\t"
-GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
-GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7)
+GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p1,x5,x7)
+GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p1,x5,x7)
 // " b               END_WRITE_MEM                   \n\t"
 // "                                                 \n\t"
-// " WRITE_MEM_G:                                    \n\t" // Available scratch: Z[20-30].
-// "                                                 \n\t" // Here used scratch: Z[20-30] - Z30 as index.
-// " mov             x8, xzr                         \n\t"
-// " incb            x8                              \n\t"
-// " madd            x8, x8, x6, xzr                 \n\t" // C-column's logical 1-vector skip.
-// " index           z30.d, xzr, x6                  \n\t" // Skips passed to index is not multiplied by 8.
-// "                                                 \n\t"
-// " fcmp            d31, #0.0                       \n\t" // Skip loading if *beta == 0 to override NaN.
-// " b.eq            BETA_ZERO_G                     \n\t"
-// "                                                 \n\t"
-// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
-// GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
-// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
-// GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
-// "                                                 \n\t"
-// " BETA_ZERO_G:                                    \n\t"
-// GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
-// GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16)
-// "                                                 \n\t"
 // " END_WRITE_MEM:                                  \n\t"
 // " b               END_EXEC                        \n\t"
 // "                                                 \n\t"
@@ -310,7 +294,8 @@ GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7)
 " END_EXEC:                                       \n\t"
 " mov             x0, #0                          \n\t" // Return normal.
 :
-: [a]      "m" (a),
+: [m]      "m" (m),
+  [a]      "m" (a),
   [b]      "m" (b),
   [c]      "m" (c),
   [rs_c]   "m" (rs_c),
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
index deb01f9fe..7dad6953f 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
@@ -67,10 +67,14 @@ void bli_sgemm_armsve_asm_2vx10_unindexed
   uint64_t rs_c   = rs_c0;
   uint64_t cs_c   = cs_c0;
 
-  uint64_t mr = bli_vl_bytes_armsve() * 2 / 4;
-  GEMM_UKR_SETUP_CT( s, mr, 10, false );
+  GEMM_UKR_SETUP_CT( s, m, 10, false );
 
   __asm__ volatile (
+" mov             x0, xzr                         \n\t"
+" ldr             x1, %[m]                        \n\t"
+" whilelo         p0.s, x0, x1 \n\t" " incw x0    \n\t"
+" whilelo         p1.s, x0, x1                    \n\t"
+"                                                 \n\t"
 " ldr             x0, %[a]                        \n\t"
 " ldr             x1, %[b]                        \n\t"
 " mov             x2, xzr                         \n\t"
@@ -96,7 +100,6 @@ void bli_sgemm_armsve_asm_2vx10_unindexed
 " madd            x2, x8, x2, xzr                 \n\t" // cs_a
 " madd            x3, x8, x3, xzr                 \n\t" // rs_b
 " madd            x7, x8, x7, xzr                 \n\t" // cs_c
-" ptrue           p0.s                            \n\t"
 "                                                 \n\t"
 " ldr             x4, %[k_mker]                   \n\t" // Number of loops.
 " ldr             x8, %[k_left]                   \n\t"
@@ -114,7 +117,7 @@ void bli_sgemm_armsve_asm_2vx10_unindexed
 " ld1rw           z26.s, p0/z, [x1, 24]           \n\t"
 " ld1rw           z27.s, p0/z, [x1, 28]           \n\t"
 "                                                 \n\t"
-GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0)
 "                                                 \n\t"
 " CCOL_PRFM:                                      \n\t"
 // " cmp             x6, #1                          \n\t"
@@ -149,22 +152,22 @@ CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1
 " K_MKER_LOOP:                                    \n\t"
 "                                                 \n\t"
 " add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
-GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0)
 GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
 "                                                 \n\t"
 " add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
-GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0)
 GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
 "                                                 \n\t"
 " add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
-GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0)
 GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
 "                                                 \n\t"
 " subs            x4, x4, #1                      \n\t" // Decrease counter before final replica.
 " b.eq            FIN_MKER_LOOP                   \n\t" // Branch early to avoid reading excess mem.
 "                                                 \n\t"
 " add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
-GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0)
 GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
 " b               K_MKER_LOOP                     \n\t"
 "                                                 \n\t"
@@ -176,7 +179,7 @@ GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3
 " cmp             x8, #0                          \n\t" // End of execution.
 " b.eq            WRITE_MEM_PREP                  \n\t"
 "                                                 \n\t"
-GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0)
 " ld1rw           z20.s, p0/z, [x1]               \n\t" // Load 8/10 of first B row.
 " ld1rw           z21.s, p0/z, [x1, 4]            \n\t"
 " ld1rw           z22.s, p0/z, [x1, 8]            \n\t"
@@ -260,34 +263,16 @@ SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1
 "                                                 \n\t" // Here used scratch: Z[20-29].
 " fcmp            s31, #0.0                       \n\t"
 " b.eq            BETA_ZERO_C                     \n\t"
-GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
+GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7)
 GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
-GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
+GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7)
 GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
 "                                                 \n\t"
 " BETA_ZERO_C:                                    \n\t"
-GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
-GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7)
+GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p1,x5,x7)
+GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p1,x5,x7)
 // " b               END_WRITE_MEM                   \n\t"
 // "                                                 \n\t"
-// " WRITE_MEM_G:                                    \n\t" // Available scratch: Z[20-30].
-// "                                                 \n\t" // Here used scratch: Z[20-30] - Z30 as index.
-// " mov             x8, xzr                         \n\t"
-// " incb            x8                              \n\t"
-// " madd            x8, x8, x6, xzr                 \n\t" // C-column's logical 1-vector skip.
-// " index           z30.s, wzr, w6                  \n\t" // Skips passed to index is not multiplied by 8.
-// "                                                 \n\t"
-// " fcmp            s31, #0.0                       \n\t"
-// " b.eq            BETA_ZERO_G                     \n\t"
-// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
-// GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
-// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
-// GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
-// "                                                 \n\t"
-// " BETA_ZERO_G:                                    \n\t"
-// GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
-// GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16)
-// "                                                 \n\t"
 // " END_WRITE_MEM:                                  \n\t"
 // " b               END_EXEC                        \n\t"
 // "                                                 \n\t"
@@ -296,7 +281,8 @@ GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7)
 " END_EXEC:                                       \n\t"
 " mov             x0, #0                          \n\t" // Return normal.
 :
-: [a]      "m" (a),
+: [m]      "m" (m),
+  [a]      "m" (a),
   [b]      "m" (b),
   [c]      "m" (c),
   [rs_c]   "m" (rs_c),
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
index e941f5abd..42b1345ff 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
@@ -68,10 +68,10 @@ void bli_zgemm_armsve_asm_2vx10_unindexed
   uint64_t cs_c   = cs_c0;
   uint64_t info   = 0;
 
-  uint64_t mr = bli_vl_bytes_armsve() * 2 / 16;
-  GEMM_UKR_SETUP_CT( z, mr, 10, false );
+  GEMM_UKR_SETUP_CT( z, m, 10, false );
 
   __asm__ volatile (
+" whilelo         p0.d, xzr, %12                  \n\t"
 // " ldr             x0, %[a]                        \n\t"
 // " ldr             x1, %[b]                        \n\t"
 " mov             x2, xzr                         \n\t"
@@ -97,7 +97,6 @@ void bli_zgemm_armsve_asm_2vx10_unindexed
 " madd            x2, x16, x2, xzr                \n\t" // cs_a
 " madd            x3, x16, x3, xzr                \n\t" // rs_b
 " madd            %4, x16, %4, xzr                \n\t" // cs_c
-" ptrue           p0.d                            \n\t"
 "                                                 \n\t"
 // " ldr             x5, %[k_mker]                   \n\t" // Number of loops.
 // " ldr             x6, %[k_left]                   \n\t"
@@ -306,7 +305,7 @@ GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4)
   "+r" (a_next), // %9
   "+r" (b_next), // %10
   "=r" (info)    // %11
-:
+: "r"  (m)       // %12
 : "x2","x3","x9","x16",
   "z0","z1","z2","z3","z4","z5","z6","z7",
   "z8","z9","z10","z11","z12","z13","z14","z15",

From 2f3872e01d51545c687ae2c8b2650e00552111a7 Mon Sep 17 00:00:00 2001
From: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
Date: Mon, 7 Feb 2022 17:14:49 +0900
Subject: [PATCH 027/230] ArmSVE Adopts Label Wrapper

For clang (& armclang?) compilation.

Hopefully solves #609.
---
 .../armsve/1m/bli_dpackm_armsve512_asm_10xk.c | 37 +++++-----
 .../armsve/1m/bli_dpackm_armsve512_asm_16xk.c | 37 +++++-----
 kernels/armsve/3/armsve_asm_macros.h          | 13 ++++
 .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c  | 70 +++++++++----------
 .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c  | 57 +++++++--------
 .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c  | 55 ++++++++-------
 .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c  | 70 +++++++++----------
 kernels/armv8a/3/armv8a_asm_utils.h           |  2 +-
 8 files changed, 179 insertions(+), 162 deletions(-)

diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
index 44718fa57..a086b3a76 100644
--- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
@@ -36,6 +36,7 @@
 #include "blis.h"
 #include "armsve512_asm_transpose_d8x8.h"
 #include "armsve512_asm_transpose_d8x2.h"
+#include "../3/armsve_asm_macros.h"
 
 // assumption:
 //   SVE vector length = 512 bits.
@@ -93,9 +94,9 @@ void bli_dpackm_armsve512_asm_10xk
             "mov  x8, %[n_mker] \n\t"
             "mov  x9, %[n_left] \n\t"
             "ptrue p0.d \n\t"
-            "b.ne .AROWSTOR \n\t"
+            BNE(AROWSTOR)
             // A stored in columns.
-            " .ACOLSTOR: \n\t"
+            LABEL(ACOLSTOR)
             // Prefetch distance.
             "mov  x17, #8 \n\t"
             "madd x17, x17, x3, xzr \n\t"
@@ -105,9 +106,9 @@ void bli_dpackm_armsve512_asm_10xk
             "lsl  x16, x16, #60 \n\t"
             "orr  x0, x0, x16 \n\t"
 #endif
-            " .ACOLSTORMKER: \n\t"
+            LABEL(ACOLSTORMKER)
             "cmp  x8, xzr \n\t"
-            "b.eq .ACOLSTORMKEREND \n\t"
+            BEQ(ACOLSTORMKEREND)
             "add  x5, x0, x3 \n\t"
             "add  x6, x5, x3 \n\t"
             "add  x7, x6, x3 \n\t"
@@ -201,11 +202,11 @@ void bli_dpackm_armsve512_asm_10xk
             // "add  x1, x1, #320 \n\t"
             "add  x0, x7, x3 \n\t"
             "sub  x8, x8, #1 \n\t"
-            "b    .ACOLSTORMKER \n\t"
-            " .ACOLSTORMKEREND: \n\t"
-            " .ACOLSTORLEFT: \n\t"
+            BRANCH(ACOLSTORMKER)
+            LABEL(ACOLSTORMKEREND)
+            LABEL(ACOLSTORLEFT)
             "cmp  x9, xzr \n\t"
-            "b.eq .UNITKDONE \n\t"
+            BEQ(UNITKDONE)
             "ld1d z0.d, p0/z, [x0] \n\t"
             "ldr  q1, [x0, #64] \n\t"
             "st1d z0.d, p0, [x1] \n\t"
@@ -213,14 +214,14 @@ void bli_dpackm_armsve512_asm_10xk
             "add  x0, x0, x3 \n\t"
             "add  x1, x1, x2 \n\t"
             "sub  x9, x9, #1 \n\t"
-            "b    .ACOLSTORLEFT \n\t"
+            BRANCH(ACOLSTORLEFT)
             // A stored in rows.
-            " .AROWSTOR: \n\t"
+            LABEL(AROWSTOR)
             // Prepare predicates for in-reg transpose.
             SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
-            " .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful.
+            LABEL(AROWSTORMKER) // X[10-16] for A here not P. Be careful.
             "cmp  x8, xzr \n\t"
-            "b.eq .AROWSTORMKEREND \n\t"
+            BEQ(AROWSTORMKEREND)
             "add  x10, x0, x4 \n\t"
             "add  x11, x10, x4 \n\t"
             "add  x12, x11, x4 \n\t"
@@ -271,15 +272,15 @@ void bli_dpackm_armsve512_asm_10xk
             "add  x1, x16, x2 \n\t"
             "add  x0, x0, #64 \n\t"
             "sub  x8, x8, #1 \n\t"
-            "b    .AROWSTORMKER \n\t"
-            " .AROWSTORMKEREND: \n\t"
+            BRANCH(AROWSTORMKER)
+            LABEL(AROWSTORMKEREND)
             "mov  x4, %[inca] \n\t" // Restore unshifted inca.
             "index z30.d, xzr, x4 \n\t" // Generate index.
             "lsl  x4, x4, #3 \n\t" // Shift again.
             "lsl  x5, x4, #3 \n\t" // Virtual column vl.
-            " .AROWSTORLEFT: \n\t"
+            LABEL(AROWSTORLEFT)
             "cmp  x9, xzr \n\t"
-            "b.eq .UNITKDONE \n\t"
+            BEQ(UNITKDONE)
             "add  x6, x0, x5 \n\t"
             "add  x7, x6, x4 \n\t"
             "ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
@@ -291,8 +292,8 @@ void bli_dpackm_armsve512_asm_10xk
             "add  x1, x1, x2 \n\t"
             "add  x0, x0, #8 \n\t"
             "sub  x9, x9, #1 \n\t"
-            "b    .AROWSTORLEFT \n\t"
-            " .UNITKDONE: \n\t"
+            BRANCH(AROWSTORLEFT)
+            LABEL(UNITKDONE)
             "mov  x0, #0 \n\t"
             :
             : [a]      "r" (a),
diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
index f02b87a7a..aeb323c0c 100644
--- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
@@ -35,6 +35,7 @@
 
 #include "blis.h"
 #include "armsve512_asm_transpose_d8x8.h"
+#include "../3/armsve_asm_macros.h"
 
 // assumption:
 //   SVE vector length = 512 bits.
@@ -99,9 +100,9 @@ void bli_dpackm_armsve512_asm_16xk
             "mov  x8, %[n_mker] \n\t"
             "mov  x9, %[n_left] \n\t"
             "ptrue p0.d \n\t"
-            "b.ne .AROWSTOR \n\t"
+            BNE(AROWSTOR)
             // A stored in columns.
-            " .ACOLSTOR: \n\t"
+            LABEL(ACOLSTOR)
             // Prefetch distance.
             "mov  x17, #8 \n\t"
             "madd x17, x17, x3, xzr \n\t"
@@ -125,9 +126,9 @@ void bli_dpackm_armsve512_asm_16xk
             // "prfm PLDL1STRM, [x5] \n\t"
             // "prfm PLDL1STRM, [x6] \n\t"
             // "prfm PLDL1STRM, [x7] \n\t"
-            " .ACOLSTORMKER: \n\t"
+            LABEL(ACOLSTORMKER)
             "cmp  x8, xzr \n\t"
-            "b.eq .ACOLSTORMKEREND \n\t"
+            BEQ(ACOLSTORMKEREND)
             "add  x5, x0, x3 \n\t"
             "add  x6, x5, x3 \n\t"
             "add  x7, x6, x3 \n\t"
@@ -193,11 +194,11 @@ void bli_dpackm_armsve512_asm_16xk
             "add  x0, x7, x3 \n\t"
             "add  x1, x16, x2 \n\t"
             "sub  x8, x8, #1 \n\t"
-            "b    .ACOLSTORMKER \n\t"
-            " .ACOLSTORMKEREND: \n\t"
-            " .ACOLSTORLEFT: \n\t"
+            BRANCH(ACOLSTORMKER)
+            LABEL(ACOLSTORMKEREND)
+            LABEL(ACOLSTORLEFT)
             "cmp  x9, xzr \n\t"
-            "b.eq .UNITKDONE \n\t"
+            BEQ(UNITKDONE)
             "ld1d z0.d, p0/z, [x0] \n\t"
             "ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t"
             "st1d z0.d, p0, [x1] \n\t"
@@ -205,14 +206,14 @@ void bli_dpackm_armsve512_asm_16xk
             "add  x0, x0, x3 \n\t"
             "add  x1, x1, x2 \n\t"
             "sub  x9, x9, #1 \n\t"
-            "b    .ACOLSTORLEFT \n\t"
+            BRANCH(ACOLSTORLEFT)
             // A stored in rows.
-            " .AROWSTOR: \n\t"
+            LABEL(AROWSTOR)
             // Prepare predicates for in-reg transpose.
             SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
-            " .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful.
+            LABEL(AROWSTORMKER) // X[10-16] for A here not P. Be careful.
             "cmp  x8, xzr \n\t"
-            "b.eq .AROWSTORMKEREND \n\t"
+            BEQ(AROWSTORMKEREND)
             "add  x10, x0, x4 \n\t"
             "add  x11, x10, x4 \n\t"
             "add  x12, x11, x4 \n\t"
@@ -274,15 +275,15 @@ void bli_dpackm_armsve512_asm_16xk
             "add  x0, x0, #64 \n\t"
             "add  x1, x16, x2 \n\t"
             "sub  x8, x8, #1 \n\t"
-            "b    .AROWSTORMKER \n\t"
-            " .AROWSTORMKEREND: \n\t"
+            BRANCH(AROWSTORMKER)
+            LABEL(AROWSTORMKEREND)
             "mov  x4, %[inca] \n\t" // Restore unshifted inca.
             "index z30.d, xzr, x4 \n\t" // Generate index.
             "lsl  x4, x4, #3 \n\t" // Shift again.
             "lsl  x5, x4, #3 \n\t" // Virtual column vl.
-            " .AROWSTORLEFT: \n\t"
+            LABEL(AROWSTORLEFT)
             "cmp  x9, xzr \n\t"
-            "b.eq .UNITKDONE \n\t"
+            BEQ(UNITKDONE)
             "add  x6, x0, x5 \n\t"
             "ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
             "ld1d z1.d, p0/z, [x6, z30.d, lsl #3] \n\t"
@@ -291,8 +292,8 @@ void bli_dpackm_armsve512_asm_16xk
             "add  x1, x1, x2 \n\t"
             "add  x0, x0, #8 \n\t"
             "sub  x9, x9, #1 \n\t"
-            "b    .AROWSTORLEFT \n\t"
-            " .UNITKDONE: \n\t"
+            BRANCH(AROWSTORLEFT)
+            LABEL(UNITKDONE)
             "mov  x0, #0 \n\t"
             :
             : [a]      "r" (a),
diff --git a/kernels/armsve/3/armsve_asm_macros.h b/kernels/armsve/3/armsve_asm_macros.h
index 5e8eb3c62..9cbbeab92 100644
--- a/kernels/armsve/3/armsve_asm_macros.h
+++ b/kernels/armsve/3/armsve_asm_macros.h
@@ -33,6 +33,19 @@
 
 
 */
+// Inline-asm label/branch string helpers. Clang build: "L"-prefixed names plus "%=" (a number unique to each asm statement).
+#if defined(__clang__)
+#define LABEL(str) "   L" #str"%=: \n\t"
+#define BEQ(str) "b.eq L" #str"%=  \n\t"
+#define BNE(str) "b.ne L" #str"%=  \n\t"
+#define BRANCH(str) "b L" #str"%=  \n\t"
+#else // Other compilers: "."-prefixed names with no "%=" suffix.
+#define LABEL(str) "   ." #str": \n\t"
+#define BEQ(str) "b.eq ." #str"  \n\t"
+#define BNE(str) "b.ne ." #str"  \n\t"
+#define BRANCH(str) "b ." #str"  \n\t"
+#endif
+
 #define CLEAR_COL2(Z0,Z1) \
 " dup  "#Z0"."DT", #0 \n\t" \
 " dup  "#Z1"."DT", #0 \n\t"
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
index c24384b02..098d5d4b5 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
@@ -101,9 +101,9 @@ void bli_cgemm_armsve_asm_2vx10_unindexed
 // " ldr             x5, %[k_mker]                   \n\t" // Number of loops.
 // " ldr             x6, %[k_left]                   \n\t"
 "                                                 \n\t"
-" LOAD_ABC:                                       \n\t"
+LABEL(LOAD_ABC)
 " cmp             %5, #0                          \n\t" // Don't preload if no microkernel there.
-" b.eq            END_CCOL_PRFM                   \n\t"
+BEQ(END_CCOL_PRFM)
 "                                                 \n\t"
 " ld1rw           z20.s, p0/z, [%1, 4*0]          \n\t" // Load B's real 8/10, no imaginary.
 " ld1rw           z21.s, p0/z, [%1, 4*2]          \n\t"
@@ -116,9 +116,9 @@ void bli_cgemm_armsve_asm_2vx10_unindexed
 "                                                 \n\t"
 GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 "                                                 \n\t"
-" CCOL_PRFM:                                      \n\t"
+LABEL(CCOL_PRFM)
 // " cmp             %3, #1                          \n\t"
-// " b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
+// BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage.
 " mov             x16, %2                         \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, %4                    \n\t"
@@ -139,14 +139,14 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, %4                    \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
-" END_CCOL_PRFM:                                  \n\t"
+LABEL(END_CCOL_PRFM)
 "                                                 \n\t"
 CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
 "                                                 \n\t"
 " cmp             %5, #0                          \n\t" // If no 4-microkernel can be applied.
-" b.eq            K_LEFT_LOOP                     \n\t"
+BEQ(K_LEFT_LOOP)
 "                                                 \n\t"
-" K_MKER_LOOP:                                    \n\t"
+LABEL(K_MKER_LOOP)
 "                                                 \n\t"
 GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2)
 GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3)
@@ -158,18 +158,18 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2)
 GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3)
 "                                                 \n\t"
 " subs            %5, %5, #1                      \n\t" // Decrease counter before final replica.
-" b.eq            FIN_MKER_LOOP                   \n\t" // Branch early to avoid reading excess mem.
+BEQ(FIN_MKER_LOOP) // Branch early to avoid reading excess mem.
 "                                                 \n\t"
 GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3)
-" b               K_MKER_LOOP                     \n\t"
+BRANCH(K_MKER_LOOP)
 "                                                 \n\t"
-" FIN_MKER_LOOP:                                  \n\t"
+LABEL(FIN_MKER_LOOP)
 GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3)
 "                                                 \n\t"
-" K_LEFT_LOOP:                                    \n\t"
+LABEL(K_LEFT_LOOP)
 " cmp             %6, #0                          \n\t" // End of execution.
-" b.eq            WRITE_MEM_PREP                  \n\t"
+BEQ(WRITE_MEM_PREP)
 "                                                 \n\t"
 GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 " ld1rw           z20.s, p0/z, [%1, 4*0]          \n\t" // Load B's real 8/10, no imaginary.
@@ -182,9 +182,9 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 " ld1rw           z27.s, p0/z, [%1, 4*14]         \n\t"
 GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3)
 " sub             %6, %6, #1                      \n\t"
-" b               K_LEFT_LOOP                     \n\t" // Next column / row.
+BRANCH(K_LEFT_LOOP)
 "                                                 \n\t"
-" WRITE_MEM_PREP:                                 \n\t"
+LABEL(WRITE_MEM_PREP)
 "                                                 \n\t"
 // " ldr             x7, %[alpha]                    \n\t" // Load alpha & beta (address).
 // " ldr             x8, %[beta]                     \n\t"
@@ -193,7 +193,7 @@ GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,
 " ld1rw           z30.s, p0/z, [%8]               \n\t" // Real(beta).
 " ld1rw           z31.s, p0/z, [%8, 4]            \n\t" // Imag(beta).
 "                                                 \n\t"
-" PREFETCH_ABNEXT:                                \n\t"
+LABEL(PREFETCH_ABNEXT)
 // " ldr             x9,  %[a_next]                  \n\t"
 // " ldr             x10, %[b_next]                  \n\t"
 #ifdef _A64FX
@@ -209,90 +209,90 @@ GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,
 " prfm            PLDL1STRM, [%10]                \n\t"
 " prfm            PLDL1STRM, [%10, 256*1]         \n\t"
 "                                                 \n\t"
-" WRITE_MEM:                                      \n\t"
+LABEL(WRITE_MEM)
 " fmov            s27, #1.0                       \n\t"
 " fcmp            s29, #0.0                       \n\t" // Whether Imag(alpha) == 0.
 " fccmp           s28, s27, 0, eq                 \n\t" // Whether Real(alpha) == 1.
-" b.eq            UNIT_ALPHA                      \n\t"
+BEQ(UNIT_ALPHA)
 "                                                 \n\t"
 GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z28,z29)
 GEMM_FMULCMPLX_COL2(z24,z25,z26,z27,p0,z4 ,z5 ,z6 ,z7 ,z28,z29)
 GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z8, z9, z10,z11,z28,z29)
 GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z28,z29)
 GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z16,z17,z18,z19,z28,z29)
-" b               WRITE_MEM_EXEC                  \n\t"
+BRANCH(WRITE_MEM_EXEC)
 "                                                 \n\t"
-" UNIT_ALPHA:                                     \n\t"
+LABEL(UNIT_ALPHA)
 MOV_COL2(z20,z21,z22,z23,z0 ,z1 ,z2 ,z3 )
 MOV_COL2(z24,z25,z26,z27,z4 ,z5 ,z6 ,z7 )
 MOV_COL2(z0 ,z1 ,z2 ,z3 ,z8, z9, z10,z11)
 MOV_COL2(z4 ,z5 ,z6 ,z7 ,z12,z13,z14,z15)
 MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19)
 "                                                 \n\t"
-" WRITE_MEM_EXEC:                                 \n\t"
+LABEL(WRITE_MEM_EXEC)
 " mov             x9, %2                          \n\t" // C address for loading.
 "                                                 \n\t" // C address for storing is %2 itself.
 // " cmp             %3, #1                          \n\t"
-// " b.ne            WRITE_MEM_G                     \n\t"
+// BNE(WRITE_MEM_G)
 "                                                 \n\t"
-" WRITE_MEM_C:                                    \n\t"
+LABEL(WRITE_MEM_C)
 " fmov            s29, wzr                        \n\t"
 " fcmp            s31, #0.0                       \n\t" // Whether Imag(beta) == 0.
 " fccmp           s30, s29, 0, eq                 \n\t" // Whether Real(beta) == 0.
-" b.eq            ZERO_BETA_C_0_1_2_3             \n\t"
+BEQ(ZERO_BETA_C_0_1_2_3)
 GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4)
 GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4)
 GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31)
 GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31)
-" ZERO_BETA_C_0_1_2_3:                            \n\t"
+LABEL(ZERO_BETA_C_0_1_2_3)
 GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4)
 GEMM_CCMPLX_STORE_COL2_C(z24,z25,z26,z27,p0,%2,%4)
 "                                                 \n\t"
-" b.eq            ZERO_BETA_C_4_5_6_7_8_9         \n\t"
+BEQ(ZERO_BETA_C_4_5_6_7_8_9)
 GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4)
 GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4)
 GEMM_CCMPLX_LOAD_COL2_C(z20,z21,z22,z23,p0,x9,%4)
 GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31)
 GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31)
 GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31)
-" ZERO_BETA_C_4_5_6_7_8_9:                        \n\t"
+LABEL(ZERO_BETA_C_4_5_6_7_8_9)
 GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4)
 GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4)
 GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4)
-// " b               END_WRITE_MEM                   \n\t"
+// BRANCH(END_WRITE_MEM)
 // "                                                 \n\t"
-// " WRITE_MEM_G:                                    \n\t"
+// LABEL(WRITE_MEM_G)
 // " add             %3, %3, %3                      \n\t" // Skips passed to index is multiplied by 2,
 // " mov             x3, %3                          \n\t" //  s.t. 2*sizeof(float) = 2*4 = 8.
 // " index           z28.s, wzr, w3                  \n\t"
 // " fmov            s29, wzr                        \n\t"
 // " fcmp            s31, #0.0                       \n\t" // Whether Imag(beta) == 0.
 // " fccmp           s30, s29, 0, eq                 \n\t" // Whether Real(beta) == 0.
-// " b.eq            ZERO_BETA_G_0_1_2_3             \n\t"
+// BEQ(ZERO_BETA_G_0_1_2_3)
 // GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
 // GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
 // GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31)
 // GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31)
-// " ZERO_BETA_G_0_1_2_3:                            \n\t"
+// LABEL(ZERO_BETA_G_0_1_2_3)
 // GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16)
 // GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16)
 // "                                                 \n\t"
-// " b.eq            ZERO_BETA_G_4_5_6_7_8_9         \n\t"
+// BEQ(ZERO_BETA_G_4_5_6_7_8_9)
 // GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
 // GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
 // GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16)
 // GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31)
 // GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31)
 // GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31)
-// " ZERO_BETA_G_4_5_6_7_8_9:                        \n\t"
+// LABEL(ZERO_BETA_G_4_5_6_7_8_9)
 // GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16)
 // GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16)
 // GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16)
 // "                                                 \n\t"
-// " END_WRITE_MEM:                                  \n\t"
-// " b               END_EXEC                        \n\t"
+// LABEL(END_WRITE_MEM)
+// BRANCH(END_EXEC)
 "                                                 \n\t"
-" END_EXEC:                                       \n\t"
+LABEL(END_EXEC)
 " mov             %11, #0                         \n\t" // Return normal.
 : "+r" (a),      // %0
   "+r" (b),      // %1
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
index 1c2c37208..0ee470f24 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
@@ -104,9 +104,9 @@ void bli_dgemm_armsve_asm_2vx10_unindexed
 " ldr             x4, %[k_mker]                   \n\t" // Number of loops.
 " ldr             x8, %[k_left]                   \n\t"
 "                                                 \n\t"
-" LOAD_ABC:                                       \n\t"
+LABEL(LOAD_ABC)
 " cmp             x4, #0                          \n\t" // Don't preload if no microkernel there.
-" b.eq            END_CCOL_PRFM                   \n\t"
+BEQ(END_CCOL_PRFM)
 
 " ld1rd           z20.d, p0/z, [x1]               \n\t" // Load 8/10 of first B row.
 " ld1rd           z21.d, p0/z, [x1, 8]            \n\t"
@@ -119,9 +119,9 @@ void bli_dgemm_armsve_asm_2vx10_unindexed
 "                                                 \n\t"
 GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0)
 "                                                 \n\t"
-" CCOL_PRFM:                                      \n\t"
+LABEL(CCOL_PRFM)
 // " cmp             x6, #1                          \n\t"
-// " b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
+// BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage.
 " mov             x16, x5                         \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, x7                    \n\t"
@@ -142,14 +142,14 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0)
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, x7                    \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
-" END_CCOL_PRFM:                                  \n\t"
+LABEL(END_CCOL_PRFM)
 "                                                 \n\t"
 CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
 "                                                 \n\t"
 " cmp             x4, #0                          \n\t" // If no 4-microkernel can be applied
-" b.eq            K_LEFT_LOOP                     \n\t"
+BEQ(K_LEFT_LOOP)
 "                                                 \n\t"
-" K_MKER_LOOP:                                    \n\t"
+LABEL(K_MKER_LOOP)
 "                                                 \n\t"
 " add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
 GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0)
@@ -164,20 +164,20 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0)
 GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
 "                                                 \n\t"
 " subs            x4, x4, #1                      \n\t" // Decrease counter before final replica.
-" b.eq            FIN_MKER_LOOP                   \n\t" // Branch early to avoid reading excess mem.
+BEQ(FIN_MKER_LOOP) // Branch early to avoid reading excess mem.
 "                                                 \n\t"
 " add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
 GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0)
 GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
-" b               K_MKER_LOOP                     \n\t"
+BRANCH(K_MKER_LOOP)
 "                                                 \n\t"
-" FIN_MKER_LOOP:                                  \n\t"
+LABEL(FIN_MKER_LOOP)
 GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
 " add             x0, x0, x2                      \n\t" // Forward A to fill the blank.
 "                                                 \n\t"
-" K_LEFT_LOOP:                                    \n\t"
+LABEL(K_LEFT_LOOP)
 " cmp             x8, #0                          \n\t" // End of execution.
-" b.eq            WRITE_MEM_PREP                  \n\t"
+BEQ(WRITE_MEM_PREP)
 "                                                 \n\t"
 GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0)
 " ld1rd           z20.d, p0/z, [x1]               \n\t" // Load 8/10 of first B row.
@@ -203,9 +203,9 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
 " add             x0, x0, x2                      \n\t" // Forward A.
 " add             x1, x1, x3                      \n\t" // Forward B.
 " sub             x8, x8, #1                      \n\t"
-" b               K_LEFT_LOOP                     \n\t" // Next column / row.
+BRANCH(K_LEFT_LOOP)
 "                                                 \n\t"
-" WRITE_MEM_PREP:                                 \n\t"
+LABEL(WRITE_MEM_PREP)
 "                                                 \n\t"
 " ldr             x4, %[alpha]                    \n\t" // Load alpha & beta (address).
 " ldr             x8, %[beta]                     \n\t"
@@ -216,7 +216,7 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
 " fmov            d28, #1.0                       \n\t" // Prepare FP 1.0.
 " fmov            x16, d28                        \n\t"
 "                                                 \n\t"
-" PREFETCH_ABNEXT:                                \n\t"
+LABEL(PREFETCH_ABNEXT)
 " ldr             x0, %[a_next]                   \n\t"
 " ldr             x1, %[b_next]                   \n\t"
 #ifdef _A64FX
@@ -257,41 +257,42 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
 " mov             x9, x5                          \n\t" // C address for loading.
 "                                                 \n\t" // C address for storing is x5 itself.
 // " cmp             x6, #1                          \n\t" // Preload first half of C for contiguous case.
-// " b.ne            WRITE_MEM                       \n\t"
+// BNE(WRITE_MEM)
 GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7)
 "                                                 \n\t"
-" WRITE_MEM:                                      \n\t"
+LABEL(WRITE_MEM)
 "                                                 \n\t"
 " cmp             x16, x4                         \n\t"
-" b.eq            UNIT_ALPHA                      \n\t"
+BEQ(UNIT_ALPHA)
 "                                                 \n\t"
 SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
 "                                                 \n\t"
-" UNIT_ALPHA:                                     \n\t"
+LABEL(UNIT_ALPHA)
 // " cmp             x6, #1                          \n\t"
-// " b.ne            WRITE_MEM_G                     \n\t"
+// BNE(WRITE_MEM_G)
 "                                                 \n\t"
-" WRITE_MEM_C:                                    \n\t" // Available scratch: Z[20-30].
+LABEL(WRITE_MEM_C)
+"                                                 \n\t" // Available scratch: Z[20-30].
 "                                                 \n\t" // Here used scratch: Z[20-29].
 " fcmp            d31, #0.0                       \n\t" // Skip loading if *beta == 0 to override NaN.
-" b.eq            BETA_ZERO_C                     \n\t"
+BEQ(BETA_ZERO_C)
 // First half of C is already loaded in this case.
 // GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7)
 GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
 GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7)
 GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
 "                                                 \n\t"
-" BETA_ZERO_C:                                    \n\t"
+LABEL(BETA_ZERO_C)
 GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p1,x5,x7)
 GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p1,x5,x7)
-// " b               END_WRITE_MEM                   \n\t"
+// BRANCH(END_WRITE_MEM)
 // "                                                 \n\t"
-// " END_WRITE_MEM:                                  \n\t"
-// " b               END_EXEC                        \n\t"
+// LABEL(END_WRITE_MEM)
+// BRANCH(END_EXEC)
 // "                                                 \n\t"
-// " END_ERROR:                                      \n\t"
+// LABEL(END_ERROR)
 // " mov             x0, #1                          \n\t" // Return error.
-" END_EXEC:                                       \n\t"
+LABEL(END_EXEC)
 " mov             x0, #0                          \n\t" // Return normal.
 :
 : [m]      "m" (m),
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
index 7dad6953f..d03af5923 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
@@ -104,9 +104,9 @@ void bli_sgemm_armsve_asm_2vx10_unindexed
 " ldr             x4, %[k_mker]                   \n\t" // Number of loops.
 " ldr             x8, %[k_left]                   \n\t"
 "                                                 \n\t"
-" LOAD_ABC:                                       \n\t"
+LABEL(LOAD_ABC)
 " cmp             x4, #0                          \n\t" // Don't preload if no microkernel there.
-" b.eq            END_CCOL_PRFM                   \n\t"
+BEQ(END_CCOL_PRFM)
 
 " ld1rw           z20.s, p0/z, [x1]               \n\t" // Load 8/10 of first B row.
 " ld1rw           z21.s, p0/z, [x1, 4]            \n\t"
@@ -119,9 +119,9 @@ void bli_sgemm_armsve_asm_2vx10_unindexed
 "                                                 \n\t"
 GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0)
 "                                                 \n\t"
-" CCOL_PRFM:                                      \n\t"
+LABEL(CCOL_PRFM)
 // " cmp             x6, #1                          \n\t"
-// " b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
+// BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage.
 " mov             x16, x5                         \n\t"
 " prfm            PLDL1STRM, [x16]                \n\t"
 " add             x16, x16, x7                    \n\t"
@@ -142,14 +142,14 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0)
 " prfm            PLDL1STRM, [x16]                \n\t"
 " add             x16, x16, x7                    \n\t"
 " prfm            PLDL1STRM, [x16]                \n\t"
-" END_CCOL_PRFM:                                  \n\t"
+LABEL(END_CCOL_PRFM)
 "                                                 \n\t"
 CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
 "                                                 \n\t"
 " cmp             x4, #0                          \n\t" // If no 4-microkernel can be applied
-" b.eq            K_LEFT_LOOP                     \n\t"
+BEQ(K_LEFT_LOOP)
 "                                                 \n\t"
-" K_MKER_LOOP:                                    \n\t"
+LABEL(K_MKER_LOOP)
 "                                                 \n\t"
 " add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
 GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0)
@@ -164,20 +164,20 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0)
 GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
 "                                                 \n\t"
 " subs            x4, x4, #1                      \n\t" // Decrease counter before final replica.
-" b.eq            FIN_MKER_LOOP                   \n\t" // Branch early to avoid reading excess mem.
+BEQ(FIN_MKER_LOOP) // Branch early to avoid reading excess mem.
 "                                                 \n\t"
 " add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
 GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0)
 GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
-" b               K_MKER_LOOP                     \n\t"
+BRANCH(K_MKER_LOOP)
 "                                                 \n\t"
-" FIN_MKER_LOOP:                                  \n\t"
+LABEL(FIN_MKER_LOOP)
 GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
 " add             x0, x0, x2                      \n\t" // Forward A to fill the blank.
 "                                                 \n\t"
-" K_LEFT_LOOP:                                    \n\t"
+LABEL(K_LEFT_LOOP)
 " cmp             x8, #0                          \n\t" // End of execution.
-" b.eq            WRITE_MEM_PREP                  \n\t"
+BEQ(WRITE_MEM_PREP)
 "                                                 \n\t"
 GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0)
 " ld1rw           z20.s, p0/z, [x1]               \n\t" // Load 8/10 of first B row.
@@ -203,9 +203,9 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
 " add             x0, x0, x2                      \n\t" // Forward A.
 " add             x1, x1, x3                      \n\t" // Forward B.
 " sub             x8, x8, #1                      \n\t"
-" b               K_LEFT_LOOP                     \n\t" // Next column / row.
+BRANCH(K_LEFT_LOOP)
 "                                                 \n\t"
-" WRITE_MEM_PREP:                                 \n\t"
+LABEL(WRITE_MEM_PREP)
 "                                                 \n\t"
 " ldr             x4, %[alpha]                    \n\t" // Load alpha & beta (address).
 " ldr             x8, %[beta]                     \n\t"
@@ -214,7 +214,7 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
 " dup             z30.s, w4                       \n\t" // Broadcast alpha & beta into vectors.
 " dup             z31.s, w8                       \n\t"
 "                                                 \n\t"
-" PREFETCH_ABNEXT:                                \n\t"
+LABEL(PREFETCH_ABNEXT)
 " ldr             x0, %[a_next]                   \n\t"
 " ldr             x1, %[b_next]                   \n\t"
 " prfm            PLDL2KEEP, [x0]                 \n\t"
@@ -244,41 +244,42 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
 " prfm            PLDL2KEEP, [x1, 256*8]          \n\t"
 " prfm            PLDL2KEEP, [x1, 256*9]          \n\t"
 "                                                 \n\t"
-" WRITE_MEM:                                      \n\t"
+LABEL(WRITE_MEM)
 "                                                 \n\t"
 " fmov            s28, #1.0                       \n\t"
 " fmov            w16, s28                        \n\t"
 " cmp             w16, w4                         \n\t"
-" b.eq            UNIT_ALPHA                      \n\t"
+BEQ(UNIT_ALPHA)
 "                                                 \n\t"
 SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
 "                                                 \n\t"
-" UNIT_ALPHA:                                     \n\t"
+LABEL(UNIT_ALPHA)
 " mov             x9, x5                          \n\t" // C address for loading.
 "                                                 \n\t" // C address for storing is x5 itself.
 // " cmp             x6, #1                          \n\t"
-// " b.ne            WRITE_MEM_G                     \n\t"
+// BNE(WRITE_MEM_G)
 "                                                 \n\t"
-" WRITE_MEM_C:                                    \n\t" // Available scratch: Z[20-30].
+LABEL(WRITE_MEM_C)
+"                                                 \n\t" // Available scratch: Z[20-30].
 "                                                 \n\t" // Here used scratch: Z[20-29].
 " fcmp            s31, #0.0                       \n\t"
-" b.eq            BETA_ZERO_C                     \n\t"
+BEQ(BETA_ZERO_C)
 GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7)
 GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
 GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7)
 GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
 "                                                 \n\t"
-" BETA_ZERO_C:                                    \n\t"
+LABEL(BETA_ZERO_C)
 GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p1,x5,x7)
 GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p1,x5,x7)
-// " b               END_WRITE_MEM                   \n\t"
+// BRANCH(END_WRITE_MEM)
 // "                                                 \n\t"
-// " END_WRITE_MEM:                                  \n\t"
-// " b               END_EXEC                        \n\t"
+// LABEL(END_WRITE_MEM)
+// BRANCH(END_EXEC)
 // "                                                 \n\t"
-// " END_ERROR:                                      \n\t"
+// LABEL(END_ERROR)
 // " mov             x0, #1                          \n\t" // Return error.
-" END_EXEC:                                       \n\t"
+LABEL(END_EXEC)
 " mov             x0, #0                          \n\t" // Return normal.
 :
 : [m]      "m" (m),
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
index 42b1345ff..8636a527b 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
@@ -101,9 +101,9 @@ void bli_zgemm_armsve_asm_2vx10_unindexed
 // " ldr             x5, %[k_mker]                   \n\t" // Number of loops.
 // " ldr             x6, %[k_left]                   \n\t"
 "                                                 \n\t"
-" LOAD_ABC:                                       \n\t"
+LABEL(LOAD_ABC)
 " cmp             %5, #0                          \n\t" // Don't preload if no microkernel there.
-" b.eq            END_CCOL_PRFM                   \n\t"
+BEQ(END_CCOL_PRFM)
 "                                                 \n\t"
 " ld1rd           z20.d, p0/z, [%1, 8*0]          \n\t" // Load B's real 8/10, no imaginary.
 " ld1rd           z21.d, p0/z, [%1, 8*2]          \n\t"
@@ -116,9 +116,9 @@ void bli_zgemm_armsve_asm_2vx10_unindexed
 "                                                 \n\t"
 GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 "                                                 \n\t"
-" CCOL_PRFM:                                      \n\t"
+LABEL(CCOL_PRFM)
 // " cmp             %3, #1                          \n\t"
-// " b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
+// BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage.
 " mov             x16, %2                         \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, %4                    \n\t"
@@ -139,14 +139,14 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, %4                    \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
-" END_CCOL_PRFM:                                  \n\t"
+LABEL(END_CCOL_PRFM)
 "                                                 \n\t"
 CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
 "                                                 \n\t"
 " cmp             %5, #0                          \n\t" // If no 4-microkernel can be applied.
-" b.eq            K_LEFT_LOOP                     \n\t"
+BEQ(K_LEFT_LOOP)
 "                                                 \n\t"
-" K_MKER_LOOP:                                    \n\t"
+LABEL(K_MKER_LOOP)
 "                                                 \n\t"
 GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2)
 GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3)
@@ -158,18 +158,18 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2)
 GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3)
 "                                                 \n\t"
 " subs            %5, %5, #1                      \n\t" // Decrease counter before final replica.
-" b.eq            FIN_MKER_LOOP                   \n\t" // Branch early to avoid reading excess mem.
+BEQ(FIN_MKER_LOOP) // Branch early to avoid reading excess mem.
 "                                                 \n\t"
 GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3)
-" b               K_MKER_LOOP                     \n\t"
+BRANCH(K_MKER_LOOP)
 "                                                 \n\t"
-" FIN_MKER_LOOP:                                  \n\t"
+LABEL(FIN_MKER_LOOP)
 GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3)
 "                                                 \n\t"
-" K_LEFT_LOOP:                                    \n\t"
+LABEL(K_LEFT_LOOP)
 " cmp             %6, #0                          \n\t" // End of execution.
-" b.eq            WRITE_MEM_PREP                  \n\t"
+BEQ(WRITE_MEM_PREP)
 "                                                 \n\t"
 GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 " ld1rd           z20.d, p0/z, [%1, 8*0]          \n\t" // Load B's real 8/10, no imaginary.
@@ -182,9 +182,9 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 " ld1rd           z27.d, p0/z, [%1, 8*14]         \n\t"
 GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3)
 " sub             %6, %6, #1                      \n\t"
-" b               K_LEFT_LOOP                     \n\t" // Next column / row.
+BRANCH(K_LEFT_LOOP)
 "                                                 \n\t"
-" WRITE_MEM_PREP:                                 \n\t"
+LABEL(WRITE_MEM_PREP)
 "                                                 \n\t"
 // " ldr             x7, %[alpha]                    \n\t" // Load alpha & beta (address).
 // " ldr             x8, %[beta]                     \n\t"
@@ -193,7 +193,7 @@ GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,
 " ld1rd           z30.d, p0/z, [%8]               \n\t" // Real(beta).
 " ld1rd           z31.d, p0/z, [%8, 8]            \n\t" // Imag(beta).
 "                                                 \n\t"
-" PREFETCH_ABNEXT:                                \n\t"
+LABEL(PREFETCH_ABNEXT)
 // " ldr             x9,  %[a_next]                  \n\t"
 // " ldr             x10, %[b_next]                  \n\t"
 #ifdef _A64FX
@@ -209,89 +209,89 @@ GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,
 " prfm            PLDL1STRM, [%10]                \n\t"
 " prfm            PLDL1STRM, [%10, 256*1]         \n\t"
 "                                                 \n\t"
-" WRITE_MEM:                                      \n\t"
+LABEL(WRITE_MEM)
 " fmov            d27, #1.0                       \n\t"
 " fcmp            d29, #0.0                       \n\t" // Whether Imag(alpha) == 0.
 " fccmp           d28, d27, 0, eq                 \n\t" // Whether Real(alpha) == 1.
-" b.eq            UNIT_ALPHA                      \n\t"
+BEQ(UNIT_ALPHA)
 "                                                 \n\t"
 GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z28,z29)
 GEMM_FMULCMPLX_COL2(z24,z25,z26,z27,p0,z4 ,z5 ,z6 ,z7 ,z28,z29)
 GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z8, z9, z10,z11,z28,z29)
 GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z28,z29)
 GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z16,z17,z18,z19,z28,z29)
-" b               WRITE_MEM_EXEC                  \n\t"
+BRANCH(WRITE_MEM_EXEC)
 "                                                 \n\t"
-" UNIT_ALPHA:                                     \n\t"
+LABEL(UNIT_ALPHA)
 MOV_COL2(z20,z21,z22,z23,z0 ,z1 ,z2 ,z3 )
 MOV_COL2(z24,z25,z26,z27,z4 ,z5 ,z6 ,z7 )
 MOV_COL2(z0 ,z1 ,z2 ,z3 ,z8, z9, z10,z11)
 MOV_COL2(z4 ,z5 ,z6 ,z7 ,z12,z13,z14,z15)
 MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19)
 "                                                 \n\t"
-" WRITE_MEM_EXEC:                                 \n\t"
+LABEL(WRITE_MEM_EXEC)
 " mov             x9, %2                          \n\t" // C address for loading.
 "                                                 \n\t" // C address for storing is %2 itself.
 // " cmp             %3, #1                          \n\t"
-// " b.ne            WRITE_MEM_G                     \n\t"
+// BNE(WRITE_MEM_G)
 "                                                 \n\t"
-" WRITE_MEM_C:                                    \n\t"
+LABEL(WRITE_MEM_C)
 " fmov            d29, xzr                        \n\t"
 " fcmp            d31, #0.0                       \n\t" // Whether Imag(beta) == 0.
 " fccmp           d30, d29, 0, eq                 \n\t" // Whether Real(beta) == 0.
-" b.eq            ZERO_BETA_C_0_1_2_3             \n\t"
+BEQ(ZERO_BETA_C_0_1_2_3)
 GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4)
 GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4)
 GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31)
 GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31)
-" ZERO_BETA_C_0_1_2_3:                            \n\t"
+LABEL(ZERO_BETA_C_0_1_2_3)
 GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4)
 GEMM_CCMPLX_STORE_COL2_C(z24,z25,z26,z27,p0,%2,%4)
 "                                                 \n\t"
-" b.eq            ZERO_BETA_C_4_5_6_7_8_9         \n\t"
+BEQ(ZERO_BETA_C_4_5_6_7_8_9)
 GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4)
 GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4)
 GEMM_CCMPLX_LOAD_COL2_C(z20,z21,z22,z23,p0,x9,%4)
 GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31)
 GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31)
 GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31)
-" ZERO_BETA_C_4_5_6_7_8_9:                        \n\t"
+LABEL(ZERO_BETA_C_4_5_6_7_8_9)
 GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4)
 GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4)
 GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4)
-// " b               END_WRITE_MEM                   \n\t"
+// BRANCH(END_WRITE_MEM)
 // "                                                 \n\t"
-// " WRITE_MEM_G:                                    \n\t"
+// LABEL(WRITE_MEM_G)
 // " add             %3, %3, %3                      \n\t" // Skips passed to index is multiplied by 2,
 // " index           z28.d, xzr, %3                  \n\t" //  s.t. 2*sizeof(double) = 2*8 = 16.
 // " fmov            d29, xzr                        \n\t"
 // " fcmp            d31, #0.0                       \n\t" // Whether Imag(beta) == 0.
 // " fccmp           d30, d29, 0, eq                 \n\t" // Whether Real(beta) == 0.
-// " b.eq            ZERO_BETA_G_0_1_2_3             \n\t"
+// BEQ(ZERO_BETA_G_0_1_2_3)
 // GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
 // GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
 // GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31)
 // GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31)
-// " ZERO_BETA_G_0_1_2_3:                            \n\t"
+// LABEL(ZERO_BETA_G_0_1_2_3)
 // GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16)
 // GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16)
 // "                                                 \n\t"
-// " b.eq            ZERO_BETA_G_4_5_6_7_8_9         \n\t"
+// BEQ(ZERO_BETA_G_4_5_6_7_8_9)
 // GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
 // GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
 // GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16)
 // GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31)
 // GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31)
 // GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31)
-// " ZERO_BETA_G_4_5_6_7_8_9:                        \n\t"
+// LABEL(ZERO_BETA_G_4_5_6_7_8_9)
 // GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16)
 // GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16)
 // GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16)
 // "                                                 \n\t"
-// " END_WRITE_MEM:                                  \n\t"
-// " b               END_EXEC                        \n\t"
+// LABEL(END_WRITE_MEM)
+// BRANCH(END_EXEC)
 // "                                                 \n\t"
-" END_EXEC:                                       \n\t"
+LABEL(END_EXEC)
 " mov             %11, #0                         \n\t" // Return normal.
 : "+r" (a),      // %0
   "+r" (b),      // %1
diff --git a/kernels/armv8a/3/armv8a_asm_utils.h b/kernels/armv8a/3/armv8a_asm_utils.h
index 465950999..0c405dfd2 100644
--- a/kernels/armv8a/3/armv8a_asm_utils.h
+++ b/kernels/armv8a/3/armv8a_asm_utils.h
@@ -34,7 +34,7 @@
 
 */
 
-// Apple's local label requirements.
+// Apple/Clang's local label requirements.
 #if defined(__APPLE__) || defined(__clang__)
 #define LABEL(str) "   L" #str"%=: \n\t"
 #define BEQ(str) "b.eq L" #str"%=  \n\t"

From 26742910a087947780a089360e2baf82ea109e01 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Sun, 13 Feb 2022 16:53:45 -0600
Subject: [PATCH 028/230] Update CC_VENDOR logic

Look for `GCC` in addition to `gcc` to handle weird conda version strings. [ci skip]
---
 configure | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/configure b/configure
index 95a97c6b1..c03df26cd 100755
--- a/configure
+++ b/configure
@@ -1454,7 +1454,7 @@ get_compiler_version()
 	# isolate the version number.
 	# The last part ({ read first rest ; echo $first ; }) is a workaround
 	# to OS X's egrep only returning the first match.
-	cc_vendor=$(echo "${vendor_string}" | egrep -o 'icc|gcc|clang|emcc|pnacl|IBM|oneAPI|crosstool-NG' | { read first rest ; echo $first ; })
+	cc_vendor=$(echo "${vendor_string}" | egrep -o 'icc|gcc|clang|emcc|pnacl|IBM|oneAPI|crosstool-NG|GCC' | { read first rest ; echo $first ; })
 
 	# AOCC version strings contain both "clang" and "AOCC" substrings, and
 	# so we have perform a follow-up check to make sure cc_vendor gets set
@@ -1472,6 +1472,10 @@ get_compiler_version()
 
 	# Begin parsing cc_vendor for the version string.
 
+	if [ "${cc_vendor}" = "GCC" ]; then
+	     # Conda gcc sometimes has GCC (all caps) in the version string
+		cc_vendor="gcc"
+	fi
 	if [ "${cc_vendor}" = "crosstool-NG" ]; then
 	     # Treat compilers built by crosstool-NG (for eg: conda) as gcc.
 		cc_vendor="gcc"

From 5a4d3f5208d3d8cc1827f8cc90414c764b7ebab3 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Sun, 13 Feb 2022 17:28:30 -0600
Subject: [PATCH 029/230] Use -flat_namespace option to link on macOS

Fixes #611.
---
 common.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common.mk b/common.mk
index 5f2d30c9b..b9e3d97a7 100644
--- a/common.mk
+++ b/common.mk
@@ -552,7 +552,7 @@ endif
 # NOTE: The flag for creating shared objects is different for Linux and OS X.
 ifeq ($(OS_NAME),Darwin)
 # OS X shared library link flags.
-SOFLAGS    := -dynamiclib
+SOFLAGS    := -dynamiclib -Wl,-flat_namespace
 ifeq ($(MK_ENABLE_RPATH),yes)
 SOFLAGS    += -Wl,-install_name,@rpath/$(LIBBLIS_SONAME)
 else

From 25061593460767221e1066f9d720fa6676bbed8f Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Sun, 13 Feb 2022 20:11:55 -0600
Subject: [PATCH 030/230] Don't use `-Wl,-flat_namespace`.

Flat namespaces can cause problems due to conflicting system libraries,
etc., so just mark `xerbla_` as a weak symbol on macOS instead.
---
 common.mk                             |  2 +-
 frame/compat/f2c/bla_xerbla.h         |  2 +-
 frame/include/bli_config_macro_defs.h | 16 ++++++++++++++++
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/common.mk b/common.mk
index b9e3d97a7..5f2d30c9b 100644
--- a/common.mk
+++ b/common.mk
@@ -552,7 +552,7 @@ endif
 # NOTE: The flag for creating shared objects is different for Linux and OS X.
 ifeq ($(OS_NAME),Darwin)
 # OS X shared library link flags.
-SOFLAGS    := -dynamiclib -Wl,-flat_namespace
+SOFLAGS    := -dynamiclib
 ifeq ($(MK_ENABLE_RPATH),yes)
 SOFLAGS    += -Wl,-install_name,@rpath/$(LIBBLIS_SONAME)
 else
diff --git a/frame/compat/f2c/bla_xerbla.h b/frame/compat/f2c/bla_xerbla.h
index 44c168e58..f9f0a4641 100644
--- a/frame/compat/f2c/bla_xerbla.h
+++ b/frame/compat/f2c/bla_xerbla.h
@@ -34,6 +34,6 @@
 
 #ifdef BLIS_ENABLE_BLAS
 
-BLIS_EXPORT_BLAS int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);
+BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);
 
 #endif
diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h
index 5a4c8a15d..0c75fb639 100644
--- a/frame/include/bli_config_macro_defs.h
+++ b/frame/include/bli_config_macro_defs.h
@@ -236,6 +236,22 @@
 #define BLIS_EXPORT_ADDON BLIS_EXPORT
 
 
+// -- OVERRIDABLE (WEAK) SYMBOLS -----------------------------------------------
+
+// On Linux, functions called from a shared library can be overridden by the main
+// program simply by providing a new definition. However, macOS uses a "two-level
+// namespace" which causes calls to shared library functions to be tied to the
+// library and not overridable. As a workaround, certain symbols can be defined
+// as "weak" and are given lower preference during linking.
+#ifndef BLIS_OVERRIDABLE
+#if BLIS_OS_OSX
+#define BLIS_OVERRIDABLE __attribute__((weak))
+#else
+#define BLIS_OVERRIDABLE
+#endif
+#endif
+
+
 // -- STATIC INLINE FUNCTIONS --------------------------------------------------
 
 // C and C++ have different semantics for defining "inline" functions. In C,

From ee9ff988c49f16696679d4c6cd3dcfcac7295be7 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 15 Feb 2022 15:01:51 -0600
Subject: [PATCH 031/230] Move edge cases to gemmtrsm ukrs; doc updates.

Details:
- Moved edge-case handling into the gemmtrsm microkernel. This required
  changing the microkernel API to take m and n dimension parameters as
  well as updating all existing gemmtrsm microkernel function pointer
  types, function signatures, and related definitions to take m and n
  dimensions. Also updated all existing gemmtrsm kernels in the
  'kernels' directory (which for now is limited to haswell and penryn
  kernel sets, plus native and 1m-based reference kernels in
  'ref_kernels') to take m and n dimensions, and implemented edge-case
  handling within those microkernels via a collection of new C
  preprocessor macros defined within bli_edge_case_macro_defs.h. Note
  that the edge-case handling for gemm-like operations had already
  been relocated into the gemm microkernel in 54fa28b.
- Added descriptive comments to GEMM_UKR_SETUP_CT() and related macros in
  bli_edge_case_macro_defs.h to allow for easier reading.
- Updated docs/KernelsHowTo.md to reflect above changes. Also cleaned up
  the bullet under "Implementation Notes for gemm" that covers alignment
  issues. (Thanks to Ivan Korostelev for pointing out the confusing and
  outdated language in issue #591.)
- Other minor tweaks to KernelsHowTo.md.
---
 docs/KernelsHowTo.md                          |  64 ++++++----
 frame/3/bli_l3_ft_ukr.h                       |   2 +
 frame/3/bli_l3_ind_ukr.h                      |   4 +
 frame/3/bli_l3_ukr_oapi.c                     |   6 +
 frame/3/bli_l3_ukr_prot.h                     |   2 +
 frame/3/bli_l3_ukr_tapi.c                     |   4 +
 frame/3/trsm/bli_trsm_ll_ker_var2.c           |  58 +++------
 frame/3/trsm/bli_trsm_lu_ker_var2.c           |  58 +++------
 frame/3/trsm/bli_trsm_rl_ker_var2.c           |  60 +++-------
 frame/3/trsm/bli_trsm_ru_ker_var2.c           |  60 +++-------
 frame/include/bli_edge_case_macro_defs.h      | 110 +++++++++++++++++-
 .../3/bli_gemmtrsm_l_haswell_asm_d6x8.c       |  12 ++
 .../3/bli_gemmtrsm_u_haswell_asm_d6x8.c       |  12 ++
 .../penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c |   5 +
 .../penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c |   5 +
 ref_kernels/3/bli_gemmtrsm_ref.c              |  45 ++++++-
 ref_kernels/ind/bli_gemmtrsm1m_ref.c          |  50 ++++++--
 17 files changed, 352 insertions(+), 205 deletions(-)

diff --git a/docs/KernelsHowTo.md b/docs/KernelsHowTo.md
index 302b1c75d..6e84db8e7 100644
--- a/docs/KernelsHowTo.md
+++ b/docs/KernelsHowTo.md
@@ -113,7 +113,7 @@ Note that all kernels, whether they be reference implementations or based on ful
 
 The first step is to obtain a valid context. Contexts store all of the information
 specific to a particular sub-configuration (usually loosely specific to a
-microarchitecture or group of closely-related microarchitectuers). If a context is
+microarchitecture or group of closely-related microarchitectures). If a context is
 not already available in your current scope, a default context for the hardware
 for which BLIS was configured (or, in the case of multi-configuration builds, the
 hardware on which BLIS is currently running) may be queried via:
@@ -229,7 +229,7 @@ This section seeks to provide developers with a complete reference for each of t
 
 The function prototypes in this section follow the same guidelines as those listed in the [BLIS typed API reference](BLISTypedAPI.md#Notes_for_using_this_reference). Namely:
   * Any occurrence of `?` should be replaced with `s`, `d`, `c`, or `z` to form an actual function name.
-  * Any occurrence of `ctype` should be replaced with the actual C type corresponding to the datatype instance in question.
+  * Any occurrence of `ctype` should be replaced with the actual C99 language type corresponding to the datatype instance in question.
   * Some matrix arguments have associated row and column strides arguments that proceed them, typically listed as `rsX` and `csX` for a given matrix `X`. Row strides are always listed first, and column strides are always listed second. The semantic meaning of a row stride is "the distance, in units of elements, from any given element to the corresponding element (within the same column) of the next row," and the meaning of a column stride is "the distance, in units of elements, from any given element to the corresponding element (within the same row) of the next column." Thus, unit row stride implies column-major storage and unit column stride implies row-major storage.
   * All occurrences of `alpha` and `beta` parameters are scalars.
 
@@ -248,6 +248,8 @@ This section describes in detail the various level-3 microkernels supported by B
 ```c
 void bli_?gemm_<suffix>
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        ctype*     restrict alpha,
        ctype*     restrict a1,
@@ -264,6 +266,8 @@ where `<suffix>` is implementation-dependent. (Recall that the precise `<suffix>
 ```c
 void bli_?gemm_ukernel
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        ctype*     restrict alpha,
        ctype*     restrict a1,
@@ -274,6 +278,7 @@ void bli_?gemm_ukernel
        cntx_t*    restrict cntx
      );
 ```
+This function simply queries a microkernel function pointer from the context specified by `cntx`. Note that in the case of either method of calling the microkernel, `cntx` must be a valid pointer. (Passing in `NULL` will *not* result in a default context being used.)
 
 The `gemm` microkernel, sometimes simply referred to as "the BLIS microkernel" or "the microkernel", performs the following operation:
 
@@ -281,16 +286,20 @@ The `gemm` microkernel, sometimes simply referred to as "the BLIS microkernel" o
   C11 := beta * C11 + alpha * A1 * B1
 ```
 
-where `A1` is an _MR x k_ "micropanel" matrix stored in packed (column-wise) format, `B1` is a _k x NR_ "micropanel" matrix stored in packed (row-wise) format, `C11` is an _MR x NR_ general matrix stored according to its row and column strides `rsc` and `csc`, and `alpha` and beta are scalars.
+where `A1` is an _m x k_ "micropanel" matrix stored in packed (column-wise) format, `B1` is a _k x n_ "micropanel" matrix stored in packed (row-wise) format, `C11` is an _m x n_ "microtile" matrix stored according to its row and column strides `rsc` and `csc`, and `alpha` and `beta` are scalars.
 
-_MR_ and _NR_ are the register blocksizes associated with the microkernel. They are chosen by the developer when the microkernel is written and then encoded into a BLIS configuration, which will reference the microkernel when the BLIS framework is instantiated into a library. For more information on setting register blocksizes and related constants, please see the [BLIS developer configuration guide](ConfigurationHowTo.md).
+Here, _m <= MR_ and _n <= NR_, where _MR_ and _NR_ are the register blocksizes associated with the microkernel. They are chosen by the developer when the microkernel is written and then encoded into a BLIS configuration, which will reference the microkernel when the BLIS framework is instantiated into a library. For more information on setting register blocksizes and related constants, please see the [BLIS developer configuration guide](ConfigurationHowTo.md).
+
+**Note:** For many years, BLIS defined its microkernel to operate on microtiles whose dimensions were *exactly* _MR x NR_. However, as of commit 54fa28b, we have augmented the `gemm` microkernel API to pass in _m_ and _n_ dimensions as well as _k_. This change was made as part of our decision to move edge-case handling into the microkernel, whereas previously it was handled outside of the microkernel, within the portable parts of the BLIS framework. And while this does mean additional complexity for microkernel authors, adding generic edge-case handling can be done in a relatively painless manner by employing some pre-defined preprocessor macros (which are defined in `bli_edge_case_macro_defs.h`). For examples of how to use these macros, please see the beginning and end of existing microkernel functions residing within the `kernels` directory.
 
 Parameters:
 
+  * `m`:      The number of rows of `C11` and `A1`.
+  * `n`:      The number of columns of `C11` and `B1`.
   * `k`:      The number of columns of `A1` and rows of `B1`.
   * `alpha`:  The address of a scalar to the `A1 * B1` product.
-  * `a1`:     The address of a micropanel of matrix `A` of dimension _MR x k_, stored by columns with leading dimension _PACKMR_, where typically _PACKMR_ = _MR_. (See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKMR_.)
-  * `b1`:     The address of a micropanel of matrix `B` of dimension _k x NR_, stored by rows with leading dimension _PACKNR_, where typically _PACKNR_ = _NR_. (See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKNR_.)
+  * `a1`:     The address of a micropanel of matrix `A` of dimension _m x k_ (where _m <= MR_), stored by columns with leading dimension _PACKMR_, where typically _PACKMR_ = _MR_. (See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKMR_.)
+  * `b1`:     The address of a micropanel of matrix `B` of dimension _k x n_ (where _n <= NR_), stored by rows with leading dimension _PACKNR_, where typically _PACKNR_ = _NR_. (See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKNR_.)
   * `beta`:   The address of a scalar to the input value of matrix `C11`.
   * `c11`:    The address of a matrix `C11` of dimension _MR x NR_, stored according to `rsc` and `csc`.
   * `rsc`:    The row stride of matrix `C11` (ie: the distance to the next row, in units of matrix elements).
@@ -321,24 +330,24 @@ The diagram below shows the packed micropanel operands and how elements of each
 
 #### Implementation Notes for gemm
 
-  * **Register blocksizes.** The register blocksizes `MR` and `NR`, corresponding to the number of *logical* rows in `a1` and columns in `b1`, respectively, are defined in the context and may be queried via `bli_cntx_get_blksz_def_dt()`. However, you shouldn't need to query these values since the implementation inherently "knows" them already.
-  * **Leading dimensions of `a1` and `b1`: _PACKMR_ and _PACKNR_.** The packed micropanels `a1` and `b1` are simply stored in column-major and row-major order, respectively. Usually, the width of either micropanel (ie: the number of logical rows of `a1`, or _MR_, and the number of columns of `b1`, or _NR_) is equal to that micropanel's so-called "leading dimension", or number of *physical* rows. Sometimes, it may be beneficial to specify a leading dimension that is larger than the panel width. This may be desirable because it allows each column of `a1` or row of `b1` to maintain a certain alignment in memory that would not otherwise be maintained by _MR_ and/or _NR_. In this case, you should index through `a1` and `b1` using the values _PACKMR_ and _PACKNR_, respectively (which are stored in the context as the blocksize "maximums" associated with the `bszid_t` values `BLIS_MR` and `BLIS_NR`). These values are defined in the context and may be queried via `bli_cntx_get_blksz_max_dt()`. However, you shouldn't need to query these values since the implementation inherently "knows" them already.
+  * **Register blocksizes.** The register blocksizes `MR` and `NR`, corresponding to the maximum number of *logical* rows in `a1` and columns in `b1`, respectively, are defined in the context and may be queried via `bli_cntx_get_blksz_def_dt()`. However, you shouldn't need to query these values since the implementation inherently "knows" them already.
+  * **Leading dimensions of `a1` and `b1`: _PACKMR_ and _PACKNR_.** The packed micropanels `a1` and `b1` are simply stored in column-major and row-major order, respectively. Usually, the width of either micropanel (ie: the number of *logical* rows of `a1` and the number of columns of `b1`) is equal to that micropanel's so-called "leading dimension", or number of *physical* rows. Sometimes, it may be beneficial to specify a leading dimension that is larger than the panel width. This may be desirable because it allows each column of `a1` or row of `b1` to maintain a certain alignment in memory that would not otherwise be maintained by _MR_ and/or _NR_, which would otherwise serve as the maximum value for each micropanel, respectively. If you want your microkernel to support _MR < PACKMR_ or _NR < PACKNR_, you should index through columns of `a1` and rows of `b1` using the values _PACKMR_ and _PACKNR_, respectively (which are stored in the context as the blocksize "maximums" associated with the `bszid_t` values `BLIS_MR` and `BLIS_NR`). These values are defined in the context and may be queried via `bli_cntx_get_blksz_max_dt()`. However, you shouldn't need to query these values since the microkernel implementation inherently must "know" them already.
   * **Storage preference of `c11`.** Usually, an optimized `gemm` microkernel will have a "preferred" storage format for `C11`--typically either contiguous row-storage (i.e. `cs_c` = 1) or contiguous column-storage (i.e. `rs_c` = 1). This preference comes from how the microkernel is most efficiently able to load/store elements of `C11` from/to memory. Most microkernels use vector instructions to access contiguous columns (or column segments) of `C11`. However, the developer may decide that accessing contiguous rows (or row segments) is more desirable. If this is the case, this preference should be indicated via the `bool` argument when registering microkernels via `bli_cntx_set_l3_nat_ukrs()`--`TRUE` indicating a row preference and `FALSE` indicating a column preference. Properly setting this property allows the framework to perform a runtime optimization that will ensure the microkernel preference is honored, if at all possible.
-  * **Edge cases in _MR_, _NR_ dimensions.** Sometimes the microkernel will be called with micropanels `a1` and `b1` that correspond to edge cases, where only partial results are needed. Zero-padding is handled automatically by the packing function to facilitate reuse of the same microkernel. Similarly, the logic for computing to temporary storage and then saving only the elements that correspond to elements of `C11` that exist (at the edges) is handled automatically within the macrokernel.
-  * **Alignment of `a1` and `b1`.** By default, the alignment of addresses `a1` and `b1` are aligned only to `sizeof(type)`. If `BLIS_POOL_ADDR_ALIGN_SIZE` is set to some larger multiple of `sizeof(type)`, such as the page size, then the *first* `a1` and `b1` micropanels will be aligned to that value, but subsequent micropanels will only be aligned to `sizeof(type)`, or, if `BLIS_POOL_ADDR_ALIGN_SIZE` is a multiple of `PACKMR` and `PACKNR`, then subsequent micropanels `a1` and `b1` will be aligned to `PACKMR * sizeof(type)` and `PACKNR * sizeof(type)`, respectively.
-  * **Unrolling loops.** As a general rule of thumb, the loop over _k_ is sometimes moderately unrolled; for example, in our experience, an unrolling factor of _u_ = 4 is fairly common. If unrolling is applied in the _k_ dimension, edge cases must be handled to support values of _k_ that are not multiples of _u_. It is nearly universally true that there should be no loops in the _MR_ or _NR_ directions; in other words, iteration over these dimensions should always be fully unrolled (within the loop over _k_).
+  * **Edge cases in _MR_, _NR_ dimensions.** Sometimes the microkernel will be called with micropanels `a1` and `b1` that correspond to edge cases, where only partial results are needed. This edge-case handling was once performed by the framework automatically. However, as of commit 54fa28b, edge-case handling is the responsibility of the microkernel. This means that the kernel author will need to handle all possible values of _m_ and _n_ that are equal to **or** less than _MR_ and _NR_, respectively. Fortunately, this can be implemented outside of the assembly region of the microkernel with preprocessor macros. Please reference the existing microkernels in the `kernels` directory for examples of how this is done. (The macros that are now employed by most of BLIS's microkernels are defined in `bli_edge_case_macro_defs.h`.)
+  * **Alignment of `a1` and `b1`.** By default, addresses `a1` and `b1` are aligned to the page size (4096 bytes). These alignment factors are set by `BLIS_POOL_ADDR_ALIGN_SIZE_A` and `BLIS_POOL_ADDR_ALIGN_SIZE_B`, respectively. Note that these alignment factors control only the alignment of the *first* micropanel within a given packed block of matrix `A` or packed row-panel of matrix `B`. Subsequent micropanels will only be aligned to `sizeof(type)`, or, if `BLIS_POOL_ADDR_ALIGN_SIZE_A` is a multiple of `PACKMR` and/or `BLIS_POOL_ADDR_ALIGN_SIZE_B` is a multiple of `PACKNR`, then subsequent micropanels `a1` and/or `b1` will be aligned to `PACKMR * sizeof(type)` and/or `PACKNR * sizeof(type)`, respectively.
+  * **Unrolling loops.** As a general rule of thumb, the loop over _k_ is sometimes moderately unrolled; for example, in our experience, an unrolling factor of _u_ = 4 is fairly common. If unrolling is applied in the _k_ dimension, edge cases must be handled to support values of _k_ that are not multiples of _u_. It is nearly universally true that the microkernel should not contain loops in the _m_ or _n_ directions; in other words, iteration over these dimensions should always be fully unrolled (within the loop over _k_).
   * **Zero `beta`.** If `beta` = 0.0 (or 0.0 + 0.0i for complex datatypes), then the microkernel should NOT use it explicitly, as `C11` may contain uninitialized memory (including elements containing `NaN` or `Inf`). This case should be detected and handled separately by overwriting `C11` with the `alpha * A1 * B1` product.
 
 #### Using the auxinfo\_t object
 
-Each microkernel ([gemm](KernelsHowTo.md#gemm-microkernel), [trsm](KernelsHowTo.md#trsm_microkernels), and [gemmtrsm](KernelsHowTo.md#gemmtrsm-microkernels)) takes as its last argument a pointer of type `auxinfo_t`. This BLIS-defined type is defined as a `struct` whose fields contain auxiliary values that may be useful to some microkernel authors, particularly when implementing certain optimization techniques. BLIS provides kernel authors access to the fields of the `auxinfo_t` object via the following function-like preprocessor macros. Each macro takes a single argument, the `auxinfo_t` pointer, and returns one of the values stored within the object.
+Each microkernel ([gemm](KernelsHowTo.md#gemm-microkernel), [trsm](KernelsHowTo.md#trsm_microkernels), and [gemmtrsm](KernelsHowTo.md#gemmtrsm-microkernels)) takes as its last argument a pointer of type `auxinfo_t`. This BLIS-defined type is defined as a `struct` whose fields contain auxiliary values that may be useful to some microkernel authors, particularly when implementing certain optimization techniques. BLIS provides kernel authors access to the fields of the `auxinfo_t` object via the following static inline functions. Each function takes a single argument, the `auxinfo_t` pointer, and returns one of the values stored within the object.
 
   * `bli_auxinfo_next_a()`. Returns the address (`void*`) of the micropanel of `A` that will be used the next time the microkernel will be called.
   * `bli_auxinfo_next_b()`. Returns the address (`void*`) of the micropanel of `B` that will be used the next time the microkernel will be called.
   * `bli_auxinfo_ps_a()`. Returns the panel stride (`inc_t`) of the current micropanel of `A`.
   * `bli_auxinfo_ps_b()`. Returns the panel stride (`inc_t`) of the current micropanel of `B`.
 
-The addresses of the next micropanels of `A` and `B` may be used by the microkernel to perform prefetching, if prefetching is supported by the architecture. Similarly, it may be useful to know the precise distance in memory to the next micropanel. (Note that sometimes the next micropanel to be used is **not** the same as the next micropanel in memory.)
+The addresses of the next micropanels of `A` and `B` may be used by the microkernel to perform prefetching, if prefetching is supported by the architecture. Similarly, it may be useful to know the precise distance in memory to the next micropanel. (Note that occasionally the next micropanel to be used is **not** the same as the next micropanel in memory.)
 
 Any and all of these values may be safely ignored; they are completely optional. However, BLIS guarantees that all values accessed via the macros listed above will **always** be initialized and meaningful, for every invocation of each microkernel (`gemm`, `trsm`, and `gemmtrsm`).
 
@@ -348,8 +357,7 @@ Any and all of these values may be safely ignored; they are completely optional.
 An example implementation of the `gemm` microkernel may be found in the `template` configuration directory in:
   * [config/template/kernels/3/bli\_gemm_opt\_mxn.c](https://github.com/flame/blis/tree/master/config/template/kernels/3/bli_gemm_opt_mxn.c)
 
-
-Note that this implementation is coded in C99 and lacks several kinds of optimization that are typical of real-world optimized microkernels, such as vector instructions (or intrinsics) and loop unrolling in _MR_ or _NR_. It is meant to serve only as a starting point for a microkernel developer.
+Note that this implementation is coded in C99 and lacks several kinds of optimization that are typical of real-world optimized microkernels, such as vector instructions (or intrinsics) and loop unrolling in the _m_ or _n_ dimensions. It is meant to serve only as a starting point for a microkernel developer.
 
 
@@ -411,6 +419,8 @@ where `A11` is _MR x MR_ and lower (`trsm_l`) or upper (`trsm_u`) triangular, `B
 
 _MR_ and _NR_ are the register blocksizes associated with the microkernel. They are chosen by the developer when the microkernel is written and then encoded into a BLIS configuration, which will reference the microkernel when the BLIS framework is instantiated into a library. For more information on setting register blocksizes and related constants, please see the [BLIS developer configuration guide](ConfigurationHowTo.md).
 
+**Note:** Although the `gemm` microkernel must handle edge-cases, and therefore must take _m_ and _n_ parameters, the `trsm` microkernels are simpler in that they still assume _m = MR_ and _n = NR_, and therefore do not need these _m_ and _n_ parameters passed in.
+
 Parameters:
 
   * `a11`:    The address of `A11`, which is the _MR x MR_ lower (`trsm_l`) or upper (`trsm_u`) triangular submatrix within the packed micropanel of matrix `A`. `A11` is stored by columns with leading dimension _PACKMR_, where typically _PACKMR_ = _MR_. (See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKMR_.) Note that `A11` contains elements in both triangles, though elements in the unstored triangle are not guaranteed to be zero and thus should not be referenced.
@@ -454,6 +464,8 @@ Note that these implementations are coded in C99 and lack several kinds of optim
 ```c
 void bli_?gemmtrsm_l_<suffix>
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        ctype*     restrict alpha,
        ctype*     restrict a10,
@@ -467,6 +479,8 @@ void bli_?gemmtrsm_l_<suffix>
 
 void bli_?gemmtrsm_u_<suffix>
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        ctype*     restrict alpha,
        ctype*     restrict a12,
@@ -484,6 +498,8 @@ where `<suffix>` is implementation-dependent. (Recall that the precise `<suffix>
 ```c
 void bli_?gemmtrsm_l_ukernel
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        ctype*     restrict alpha,
        ctype*     restrict a10,
@@ -497,6 +513,8 @@ void bli_?gemmtrsm_l_ukernel
 
 void bli_?gemmtrsm_u_ukernel
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k,
        ctype*     restrict alpha,
        ctype*     restrict a12,
@@ -517,7 +535,7 @@ The `gemmtrsm_l` microkernel performs the following compound operation:
   C11 := B11
 ```
 
-where `A11` is _MR_ x _MR_ and lower triangular, `A10` is _MR_ x _k_, and `B01` is _k_ x _NR_.
+where `A11` is _MR x MR_ and lower triangular, `A10` is _MR x k_, and `B01` is _k x NR_.
 The `gemmtrsm_u` microkernel performs:
 
 ```
@@ -526,20 +544,22 @@ The `gemmtrsm_u` microkernel performs:
   C11 := B11
 ```
 
-where `A11` is _MR_ x _MR_ and upper triangular, `A12` is _MR_ x _k_, and `B21` is _k_ x _NR_.
-In both cases, `B11` is _MR_ x _NR_ and `alpha` is a scalar. Here, `inv()` denotes matrix inverse.
+where `A11` is _MR x MR_ and upper triangular, `A12` is _MR x k_, and `B21` is _k x NR_.
+In both cases, `B11` is _MR x NR_ and `alpha` is a scalar. However, `C11` is _m x n_, and therefore the `C11 := B11` statements amount to a copy of only the top-leftmost _m x n_ elements of `B11`. (Recall that `A11` and `B11` are packed and therefore guaranteed to reside within fully-sized micropanels, whereas `C11` exists in the caller-provided output matrix and may represent a bottom-right edge case.) Here, `inv()` denotes matrix inverse.
 
 _MR_ and _NR_ are the register blocksizes associated with the microkernel. They are chosen by the developer when the microkernel is written and then encoded into a BLIS configuration, which will reference the microkernel when the BLIS framework is instantiated into a library. For more information on setting register blocksizes and related constants, please see the [BLIS developer configuration guide](ConfigurationHowTo.md).
 
 Parameters:
 
+  * `m`:      The number of rows of `C11`.
+  * `n`:      The number of columns of `C11`.
   * `k`:      The number of columns of `A10` and rows of `B01` (`trsm_l`); the number of columns of `A12` and rows of `B21` (`trsm_u`).
   * `alpha`:  The address of a scalar to be applied to `B11`.
   * `a10`, `a12`:    The address of `A10` or `A12`, which is the _MR x k_ submatrix of the packed micropanel of `A` that is situated to the left (`trsm_l`) or right (`trsm_u`) of the _MR x MR_ triangular submatrix `A11`. `A10` and `A12` are stored by columns with leading dimension _PACKMR_, where typically _PACKMR_ = _MR_. (See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKMR_.)
   * `a11`:    The address of `A11`, which is the _MR x MR_ lower (`trsm_l`) or upper (`trsm_u`) triangular submatrix within the packed micropanel of matrix `A` that is situated to the right of `A10` (`trsm_l`) or the left of `A12` (`trsm_u`). `A11` is stored by columns with leading dimension _PACKMR_, where typically _PACKMR_ = _MR_. (See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKMR_.) Note that `A11` contains elements in both triangles, though elements in the unstored triangle are not guaranteed to be zero and thus should not be referenced.
   * `b01`, `b21`:   The address of `B01` and `B21`, which is the _k x NR_ submatrix of the packed micropanel of `B` that is situated above (`trsm_l`) or below (`trsm_u`) the _MR x NR_ block `B11`. `B01` and `B21` are stored by rows with leading dimension _PACKNR_, where typically _PACKNR_ = _NR_. (See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKNR_.)
   * `b11`:    The address of `B11`, which is the _MR x NR_ submatrix of the packed micropanel of `B`, situated below `B01` (`trsm_l`) or above `B21` (`trsm_u`). `B11` is stored by rows with leading dimension _PACKNR_, where typically _PACKNR_ = _NR_. (See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKNR_.)
-  * `c11`:    The address of `C11`, which is an _MR x NR_ submatrix of matrix `C`, stored according to `rsc` and `csc`. `C11` is the submatrix within `C` that corresponds to the elements which were packed into `B11`. Thus, `C` is the original input matrix `B` to the overall `trsm` operation.
+  * `c11`:    The address of `C11`, which is an _m x n_ submatrix of matrix `C`, stored according to `rsc` and `csc`, where _m <= MR_ and _n <= NR_. `C11` is the submatrix within `C` that corresponds to the elements which were packed into `B11`. Thus, `C` is the original input matrix `B` to the overall `trsm` operation.
   * `rsc`:    The row stride of matrix `C11` (ie: the distance to the next row, in units of matrix elements).
   * `csc`:    The column stride of matrix `C11` (ie: the distance to the next column, in units of matrix elements).
   * `data`:   The address of an `auxinfo_t` object that contains auxiliary information that may be useful when optimizing the `gemmtrsm` microkernel implementation. (See [Using the auxinfo\_t object](KernelsHowTo.md#Using_the_auxinfo_t_object) for a discussion of the kinds of values available via `auxinfo_t`, and also [Implementation Notes for gemmtrsm](KernelsHowTo.md#implementation-notes-for-gemmtrsm) for caveats.)
@@ -690,7 +710,7 @@ This kernel performs the following operation:
 ```
   y := y + alpha * conja(a) * conjy(x)
 ```
-where `a` is an _m_ x _b_ matrix, `x` is a vector of length _b_, and `y` is a vector of length _m_. Vectors `x` and `y` are stored with strides `incx` and `incy`, respectively. Matrix `a` is stored with row stride `inca` and column stride `lda`, though `inca` is most often (in practice) unit. This kernel is typically implemented as a fused series of _b_ `axpyv` operations updating the same vector `y` (with the elements of `x` serving as the scalars and the columns of `a` serving as the vectors to be scaled).
+where `a` is an _m x b_ matrix, `x` is a vector of length _b_, and `y` is a vector of length _m_. Vectors `x` and `y` are stored with strides `incx` and `incy`, respectively. Matrix `a` is stored with row stride `inca` and column stride `lda`, though `inca` is most often (in practice) unit. This kernel is typically implemented as a fused series of _b_ `axpyv` operations updating the same vector `y` (with the elements of `x` serving as the scalars and the columns of `a` serving as the vectors to be scaled).
 
 ---
 
@@ -714,7 +734,7 @@ This kernel performs the following operation:
 ```
   y := beta * y + alpha * conjat(a)^T conjx(x)
 ```
-where `a` is an _m_ x _b_ matrix, where `w` is a vector of length _m_, `y` is a vector of length _b_, and `alpha` is a scalar.
+where `a` is an _m x b_ matrix, `x` is a vector of length _m_, `y` is a vector of length _b_, and `alpha` and `beta` are scalars.
 Vectors `x` and `y` are stored with strides `incx` and `incy`, respectively. Matrix `a` is stored with row stride `inca` and column stride `lda`, though `inca` is most often (in practice) unit.
 This kernel is typically implemented as a series of _b_ `dotxv` operations with the same right-hand operand vector `x` (contracted with the rows of `a^T` and accumulating to the corresponding elements of vector `y`).
 
@@ -745,7 +765,7 @@ This kernel performs the following operation:
   y := beta * y + alpha * conjat(a)^T conjw(w)
   z :=        z + alpha *  conja(a)   conjx(x)
 ```
-where `a` is an _m_ x _b_ matrix, `w` and `z` are vectors of length _m_, `x` and `y` are vectors of length _b_, and `alpha` and `beta` are scalars.
+where `a` is an _m x b_ matrix, `w` and `z` are vectors of length _m_, `x` and `y` are vectors of length _b_, and `alpha` and `beta` are scalars.
 Vectors `w`, `z`, `x` and `y` are stored with strides `incw`, `incz`, `incx`, and `incy`, respectively. Matrix `a` is stored with row stride `inca` and column stride `lda`, though `inca` is most often (in practice) unit.
 This kernel is typically implemented as a series of _b_ `dotxv` operations with the same right-hand operand vector `w` fused with a series of _b_ `axpyv` operations updating the same vector `z`.
 
diff --git a/frame/3/bli_l3_ft_ukr.h b/frame/3/bli_l3_ft_ukr.h
index 561c8264f..28065c208 100644
--- a/frame/3/bli_l3_ft_ukr.h
+++ b/frame/3/bli_l3_ft_ukr.h
@@ -69,6 +69,8 @@ INSERT_GENTDEF( gemm )
 \
 typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
      ( \
+       dim_t               m, \
+       dim_t               n, \
        dim_t               k, \
        ctype*     restrict alpha, \
        ctype*     restrict a1x, \
diff --git a/frame/3/bli_l3_ind_ukr.h b/frame/3/bli_l3_ind_ukr.h
index f73a6ad90..6f24e71fc 100644
--- a/frame/3/bli_l3_ind_ukr.h
+++ b/frame/3/bli_l3_ind_ukr.h
@@ -43,6 +43,8 @@
 \
 void PASTEMAC(ch,opname) \
      ( \
+       dim_t               m, \
+       dim_t               n, \
        dim_t               k, \
        ctype*     restrict alpha, \
        ctype*     restrict a, \
@@ -61,6 +63,8 @@ INSERT_GENTPROT_BASIC0( gemm1m_ukr_name )
 \
 void PASTEMAC(ch,opname) \
      ( \
+       dim_t               m, \
+       dim_t               n, \
        dim_t               k, \
        ctype*     restrict alpha, \
        ctype*     restrict a1x, \
diff --git a/frame/3/bli_l3_ukr_oapi.c b/frame/3/bli_l3_ukr_oapi.c
index b8f2e00e6..e500bab71 100644
--- a/frame/3/bli_l3_ukr_oapi.c
+++ b/frame/3/bli_l3_ukr_oapi.c
@@ -111,6 +111,8 @@ void PASTEMAC0(opname) \
 \
 	num_t     dt        = bli_obj_dt( c11 ); \
 \
+	dim_t     m         = bli_obj_length( c11 ); \
+	dim_t     n         = bli_obj_width( c11 ); \
 	dim_t     k         = bli_obj_width( a1x ); \
 	void*     buf_a1x   = bli_obj_buffer_at_off( a1x ); \
 	void*     buf_a11   = bli_obj_buffer_at_off( a11 ); \
@@ -140,6 +142,8 @@ void PASTEMAC0(opname) \
 \
 		f \
 		( \
+		  m, \
+		  n, \
 		  k, \
 		  buf_alpha, \
 		  buf_a1x, \
@@ -160,6 +164,8 @@ void PASTEMAC0(opname) \
 \
 		f \
 		( \
+		  m, \
+		  n, \
 		  k, \
 		  buf_alpha, \
 		  buf_a1x, \
diff --git a/frame/3/bli_l3_ukr_prot.h b/frame/3/bli_l3_ukr_prot.h
index f68973ff5..677afc020 100644
--- a/frame/3/bli_l3_ukr_prot.h
+++ b/frame/3/bli_l3_ukr_prot.h
@@ -59,6 +59,8 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
+       dim_t               m, \
+       dim_t               n, \
        dim_t               k, \
        ctype*     restrict alpha, \
        ctype*     restrict a1x, \
diff --git a/frame/3/bli_l3_ukr_tapi.c b/frame/3/bli_l3_ukr_tapi.c
index ab745d12b..56eaf3f4c 100644
--- a/frame/3/bli_l3_ukr_tapi.c
+++ b/frame/3/bli_l3_ukr_tapi.c
@@ -83,6 +83,8 @@ INSERT_GENTFUNC_BASIC2( gemm_ukernel, gemm, BLIS_GEMM_UKR )
 \
 void PASTEMAC(ch,opname) \
      ( \
+       dim_t               m, \
+       dim_t               n, \
        dim_t               k, \
        ctype*     restrict alpha, \
        ctype*     restrict a1x, \
@@ -105,6 +107,8 @@ void PASTEMAC(ch,opname) \
 	/* Invoke the typed function for the given datatype. */ \
 	f \
 	( \
+	  m, \
+	  n, \
 	  k, \
 	  alpha, \
 	  a1x, \
diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c
index b503efa5b..f50f739e7 100644
--- a/frame/3/trsm/bli_trsm_ll_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c
@@ -176,12 +176,14 @@ void PASTEMAC(ch,varname) \
 	   temporary buffer are set so that they match the storage of the
 	   original C matrix. For example, if C is column-stored, ct will be
 	   column-stored as well. */ \
+/*
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+*/ \
 \
 	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
 	ctype* restrict a_cast      = a; \
@@ -276,10 +278,6 @@ void PASTEMAC(ch,varname) \
 	   know that the underlying buffer was already allocated to have an m
 	   dimension that is a multiple of PACKMR, with the region between the
 	   last row and the next multiple of MR zero-padded accordingly. */ \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
 \
 	/* Compute number of primary and leftover components of the m and n
        dimensions. */ \
@@ -409,44 +407,20 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_next_a( a2, &aux ); \
 				bli_auxinfo_set_next_b( b2, &aux ); \
 \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the fused gemm/trsm micro-kernel. */ \
-					gemmtrsm_ukr \
-					( \
-					  k_a10, \
-					  alpha1_cast, \
-					  a10, \
-					  a11, \
-					  b01, \
-					  b11, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the fused gemm/trsm micro-kernel. */ \
-					gemmtrsm_ukr \
-					( \
-					  k_a10, \
-					  alpha1_cast, \
-					  a10, \
-					  a11, \
-					  b01, \
-					  b11, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Copy the result to the bottom edge of C. */ \
-					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        c11, rs_c,  cs_c ); \
-				} \
+				gemmtrsm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k_a10, \
+				  alpha1_cast, \
+				  a10, \
+				  a11, \
+				  b01, \
+				  b11, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 \
 				a1 += ps_a_cur; \
 			} \
diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c
index 55ceafb91..4f3514143 100644
--- a/frame/3/trsm/bli_trsm_lu_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c
@@ -176,12 +176,14 @@ void PASTEMAC(ch,varname) \
 	   temporary buffer are set so that they match the storage of the
 	   original C matrix. For example, if C is column-stored, ct will be
 	   column-stored as well. */ \
+/*
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+*/ \
 \
 	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
 	ctype* restrict a_cast      = a; \
@@ -284,10 +286,6 @@ void PASTEMAC(ch,varname) \
 	   know that the underlying buffer was already allocated to have an m
 	   dimension that is a multiple of PACKMR, with the region between the
 	   last row and the next multiple of MR zero-padded accordingly. */ \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
 \
 	/* Compute number of primary and leftover components of the m and n
        dimensions. */ \
@@ -419,44 +417,20 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_next_a( a2, &aux ); \
 				bli_auxinfo_set_next_b( b2, &aux ); \
 \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the fused gemm/trsm micro-kernel. */ \
-					gemmtrsm_ukr \
-					( \
-					  k_a12, \
-					  alpha1_cast, \
-					  a12, \
-					  a11, \
-					  b21, \
-					  b11, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the fused gemm/trsm micro-kernel. */ \
-					gemmtrsm_ukr \
-					( \
-					  k_a12, \
-					  alpha1_cast, \
-					  a12, \
-					  a11, \
-					  b21, \
-					  b11, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Copy the result to the bottom edge of C. */ \
-					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        c11, rs_c,  cs_c ); \
-				} \
+				gemmtrsm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k_a12, \
+				  alpha1_cast, \
+				  a12, \
+				  a11, \
+				  b21, \
+				  b11, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
 \
 				a1 += ps_a_cur; \
 			} \
diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c
index 23d4dd728..b4937134f 100644
--- a/frame/3/trsm/bli_trsm_rl_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c
@@ -181,12 +181,14 @@ void PASTEMAC(ch,varname) \
 	   temporary buffer are set so that they match the storage of the
 	   original C matrix. For example, if C is column-stored, ct will be
 	   column-stored as well. */ \
+/*
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+*/ \
 \
 	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
 	ctype* restrict a_cast      = a; \
@@ -302,10 +304,6 @@ void PASTEMAC(ch,varname) \
 	   know that the underlying buffer was already allocated to have an n
 	   dimension that is a multiple of PACKNR, with the region between the
 	   last column and the next multiple of NR zero-padded accordingly. */ \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
 \
 	/* Compute number of primary and leftover components of the m and n
        dimensions. */ \
@@ -424,44 +422,21 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_next_a( b2, &aux ); \
 				bli_auxinfo_set_next_b( a2, &aux ); \
 \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the fused gemm/trsm micro-kernel. */ \
-					gemmtrsm_ukr \
-					( \
-					  k_b21, \
-					  alpha1_cast, \
-					  b21, \
-					  b11, \
-					  a12, \
-					  a11, \
-					  c11, cs_c, rs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the fused gemm/trsm micro-kernel. */ \
-					gemmtrsm_ukr \
-					( \
-					  k_b21, \
-					  alpha1_cast, \
-					  b21, \
-					  b11, \
-					  a12, \
-					  a11, \
-					  ct, cs_ct, rs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Copy the result to the bottom edge of C. */ \
-					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        c11, rs_c,  cs_c ); \
-				} \
+				gemmtrsm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k_b21, \
+				  alpha1_cast, \
+				  b21, \
+				  b11, \
+				  a12, \
+				  a11, \
+				  c11, cs_c, rs_c, \
+				  &aux, \
+				  cntx  \
+				); \
+\
 				} \
 \
 				a1  += rstep_a; \
@@ -512,6 +487,7 @@ void PASTEMAC(ch,varname) \
 				  &aux, \
 				  cntx  \
 				); \
+\
 				} \
 \
 				a1  += rstep_a; \
diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c
index 71381707c..09942d311 100644
--- a/frame/3/trsm/bli_trsm_ru_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c
@@ -181,12 +181,14 @@ void PASTEMAC(ch,varname) \
 	   temporary buffer are set so that they match the storage of the
 	   original C matrix. For example, if C is column-stored, ct will be
 	   column-stored as well. */ \
+/*
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+*/ \
 \
 	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
 	ctype* restrict a_cast      = a; \
@@ -297,10 +299,6 @@ void PASTEMAC(ch,varname) \
 	   know that the underlying buffer was already allocated to have an n
 	   dimension that is a multiple of PACKNR, with the region between the
 	   last column and the next multiple of NR zero-padded accordingly. */ \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
 \
 	/* Compute number of primary and leftover components of the m and n
        dimensions. */ \
@@ -417,44 +415,21 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_next_a( b2, &aux ); \
 				bli_auxinfo_set_next_b( a2, &aux ); \
 \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the fused gemm/trsm micro-kernel. */ \
-					gemmtrsm_ukr \
-					( \
-					  k_b01, \
-					  alpha1_cast, \
-					  b01, \
-					  b11, \
-					  a10, \
-					  a11, \
-					  c11, cs_c, rs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the fused gemm/trsm micro-kernel. */ \
-					gemmtrsm_ukr \
-					( \
-					  k_b01, \
-					  alpha1_cast, \
-					  b01, \
-					  b11, \
-					  a10, \
-					  a11, \
-					  ct, cs_ct, rs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Copy the result to the bottom edge of C. */ \
-					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        c11, rs_c,  cs_c ); \
-				} \
+				gemmtrsm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k_b01, \
+				  alpha1_cast, \
+				  b01, \
+				  b11, \
+				  a10, \
+				  a11, \
+				  c11, cs_c, rs_c, \
+				  &aux, \
+				  cntx  \
+				); \
+\
 				} \
 \
 				a1  += rstep_a; \
@@ -505,6 +480,7 @@ void PASTEMAC(ch,varname) \
 				  &aux, \
 				  cntx  \
 				); \
+\
 				} \
 \
 				a1  += rstep_a; \
diff --git a/frame/include/bli_edge_case_macro_defs.h b/frame/include/bli_edge_case_macro_defs.h
index 4a1fba7ac..70d97d5d1 100644
--- a/frame/include/bli_edge_case_macro_defs.h
+++ b/frame/include/bli_edge_case_macro_defs.h
@@ -35,8 +35,11 @@
 #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H
 #define BLIS_EDGE_CASE_MACRO_DEFS_H
 
+//
+// Macros for edge-case handling within gemm microkernels.
+//
 
-// Helper macros for edge-case handling within gemm microkernels.
+// -- Setup helper macros --
 
 #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \
 \
@@ -62,8 +65,14 @@
 		beta = &_zero; \
 	}
 
+// -- Setup macros --
+
 #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \
 \
+	/* Scenario 1: the ukernel contains assembly-level support only for its
+	   IO preference (e.g. only row-oriented or only column-oriented IO).
+	   Use a temporary microtile for the other two cases as well as edge
+	   cases. */ \
 	GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \
 	const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \
 	                     m != mr || n != nr; \
@@ -71,6 +80,10 @@
 
 #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \
 \
+	/* Scenario 2: the ukernel contains assembly-level support for its IO
+	   preference as well as its opposite via in-register transpose
+	   (e.g. both row- and column-oriented IO). Use a temporary microtile
+	   for the general stride case as well as edge cases. */ \
 	GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \
 	const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \
 	                     m != mr || n != nr; \
@@ -78,12 +91,16 @@
 
 #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \
 \
+	/* Scenario 3: Similar to (2) where the assembly region also supports
+	   general stride IO. Use a temporary microtile only for edge cases. */ \
 	GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \
-	const bool _use_ct = m != mr || n != nr; \
+	const bool _use_ct = ( m != mr || n != nr ); \
 	GEMM_UKR_SETUP_CT_POST(ch);
 
 #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \
 \
+	/* Scenario 4: Similar to (1), but uses temporary microtile to handle
+	   cases where the pointer to the C microtile is not aligned. */ \
 	GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \
 	const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \
 	                     m != mr || n != nr || \
@@ -91,8 +108,12 @@
 	                     ( ( ( row_major ? _rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \
 	GEMM_UKR_SETUP_CT_POST(ch);
 
+// -- Flush macros --
+
 #define GEMM_UKR_FLUSH_CT(ch) \
 \
+	/* If we actually used the temporary microtile, accumulate it to the output
+	   microtile. */ \
 	if ( _use_ct ) \
 	{ \
 		PASTEMAC(ch,xpbys_mxn) \
@@ -105,5 +126,90 @@
 	} \
 
 
+//
+// Macros for edge-case handling within gemmtrsm microkernels.
+//
+
+// -- Setup helper macros --
+
+#define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \
+\
+	PASTEMAC(ch,ctype)* restrict _c      = c11; \
+	const inc_t                  _rs_c   = rs_c; \
+	const inc_t                  _cs_c   = cs_c; \
+	PASTEMAC(ch,ctype)           _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \
+	                                  __attribute__((aligned(alignment))); \
+	const inc_t                  _rs_ct  = row_major ? nr :  1; \
+	const inc_t                  _cs_ct  = row_major ?  1 : mr;
+
+#define GEMMTRSM_UKR_SETUP_CT_POST(ch) \
+\
+	if ( _use_ct ) \
+	{ \
+		c11 = _ct; \
+		rs_c = _rs_ct; \
+		cs_c = _cs_ct; \
+	}
+
+// -- Setup macros --
+
+#define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \
+\
+	/* Scenario 1: the ukernel contains assembly-level support only for its
+	   IO preference (e.g. only row-oriented or only column-oriented IO).
+	   Use a temporary microtile for the other two cases as well as edge
+	   cases. */ \
+	GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \
+	const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \
+	                     m != mr || n != nr; \
+	GEMMTRSM_UKR_SETUP_CT_POST(ch);
+
+#define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \
+\
+	/* Scenario 2: the ukernel contains assembly-level support for its IO
+	   preference as well as its opposite via in-register transpose
+	   (e.g. both row- and column-oriented IO). Use a temporary microtile
+	   for the general stride case as well as edge cases. */ \
+	GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \
+	const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \
+	                     m != mr || n != nr; \
+	GEMMTRSM_UKR_SETUP_CT_POST(ch);
+
+#define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \
+\
+	/* Scenario 3: Similar to (2) where the assembly region also supports
+	   general stride IO. Use a temporary microtile only for edge cases. */ \
+	GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \
+	const bool _use_ct = ( m != mr || n != nr ); \
+	GEMMTRSM_UKR_SETUP_CT_POST(ch);
+
+#define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \
+\
+	/* Scenario 4: Similar to (1), but uses temporary microtile to handle
+	   cases where the pointer to the C microtile is not aligned. */ \
+	GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \
+	const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \
+	                     m != mr || n != nr || \
+	                     ( (uintptr_t)_c % alignment ) || \
+	                     ( ( ( row_major ? _rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \
+	GEMMTRSM_UKR_SETUP_CT_POST(ch);
+
+// -- Flush macros --
+
+#define GEMMTRSM_UKR_FLUSH_CT(ch) \
+\
+	/* If we actually used the temporary microtile, use it to overwrite the
+	   output microtile. Used by trsm. */ \
+	if ( _use_ct ) \
+	{ \
+		PASTEMAC(ch,copys_mxn) \
+		( \
+		  m, n, \
+		  _ct, _rs_ct, _cs_ct, \
+		  _c,  _rs_c,  _cs_c \
+		); \
+	} \
+
+
 #endif
 
diff --git a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
index aead3ea9f..d0d0ff211 100644
--- a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
@@ -58,6 +58,8 @@
 
 void bli_sgemmtrsm_l_haswell_asm_6x16
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k0,
        float*     restrict alpha,
        float*     restrict a10,
@@ -81,6 +83,8 @@ void bli_sgemmtrsm_l_haswell_asm_6x16
 
 	float*   beta   = bli_sm1;
 
+	GEMMTRSM_UKR_SETUP_CT_ANY( s, 6, 16, true );
+
 	begin_asm()
 
 	vzeroall() // zero all xmm/ymm registers.
@@ -825,6 +829,8 @@ void bli_sgemmtrsm_l_haswell_asm_6x16
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMMTRSM_UKR_FLUSH_CT( s );
 }
 
 
@@ -843,6 +849,8 @@ void bli_sgemmtrsm_l_haswell_asm_6x16
 
 void bli_dgemmtrsm_l_haswell_asm_6x8
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k0,
        double*    restrict alpha,
        double*    restrict a10,
@@ -866,6 +874,8 @@ void bli_dgemmtrsm_l_haswell_asm_6x8
 
 	double*  beta   = bli_dm1;
 
+	GEMMTRSM_UKR_SETUP_CT_ANY( d, 6, 8, true );
+
 	begin_asm()
 
 	vzeroall() // zero all xmm/ymm registers.
@@ -1572,6 +1582,8 @@ void bli_dgemmtrsm_l_haswell_asm_6x8
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMMTRSM_UKR_FLUSH_CT( d );
 }
 
 
diff --git a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
index 2849e6994..68a8c069b 100644
--- a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
@@ -58,6 +58,8 @@
 
 void bli_sgemmtrsm_u_haswell_asm_6x16
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k0,
        float*     restrict alpha,
        float*     restrict a10,
@@ -81,6 +83,8 @@ void bli_sgemmtrsm_u_haswell_asm_6x16
 
 	float*   beta   = bli_sm1;
 
+	GEMMTRSM_UKR_SETUP_CT_ANY( s, 6, 16, true );
+
 	begin_asm()
 
 	vzeroall() // zero all xmm/ymm registers.
@@ -830,6 +834,8 @@ void bli_sgemmtrsm_u_haswell_asm_6x16
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMMTRSM_UKR_FLUSH_CT( s );
 }
 
 
@@ -848,6 +854,8 @@ void bli_sgemmtrsm_u_haswell_asm_6x16
 
 void bli_dgemmtrsm_u_haswell_asm_6x8
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k0,
        double*    restrict alpha,
        double*    restrict a10,
@@ -871,6 +879,8 @@ void bli_dgemmtrsm_u_haswell_asm_6x8
 
 	double*  beta   = bli_dm1;
 
+	GEMMTRSM_UKR_SETUP_CT_ANY( d, 6, 8, true );
+
 	begin_asm()
 
 	vzeroall() // zero all xmm/ymm registers.
@@ -1583,6 +1593,8 @@ void bli_dgemmtrsm_u_haswell_asm_6x8
 	  "xmm12", "xmm13", "xmm14", "xmm15",
 	  "memory"
 	)
+
+	GEMMTRSM_UKR_FLUSH_CT( d );
 }
 
 
diff --git a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c
index 56afcf08c..7bef618fa 100644
--- a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c
@@ -56,6 +56,8 @@ void bli_sgemmtrsm_l_penryn_asm_8x4
 
 void bli_dgemmtrsm_l_penryn_asm_4x4
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k0,
        double*    restrict alpha,
        double*    restrict a10,
@@ -76,6 +78,8 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMMTRSM_UKR_SETUP_CT( d, 4, 4, false );
+
 	begin_asm()
 		
 		mov(var(a10), rax) // load address of a10.
@@ -561,6 +565,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		  "memory"
 	)
 
+	GEMMTRSM_UKR_FLUSH_CT( d );
 }
 
 
diff --git a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c
index 9811e0e32..add12ea24 100644
--- a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c
@@ -56,6 +56,8 @@ void bli_sgemmtrsm_u_penryn_asm_8x4
 
 void bli_dgemmtrsm_u_penryn_asm_4x4
      (
+       dim_t               m,
+       dim_t               n,
        dim_t               k0,
        double*    restrict alpha,
        double*    restrict a12,
@@ -76,6 +78,8 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
+	GEMMTRSM_UKR_SETUP_CT( d, 4, 4, false );
+
 	begin_asm()
 		
 		mov(var(a12), rax) // load address of a12.
@@ -546,6 +550,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		  "memory"
 	)
 
+	GEMMTRSM_UKR_FLUSH_CT( d );
 }
 
 
diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c
index 2b260c881..30fc3fcd6 100644
--- a/ref_kernels/3/bli_gemmtrsm_ref.c
+++ b/ref_kernels/3/bli_gemmtrsm_ref.c
@@ -39,6 +39,8 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
+       dim_t               m, \
+       dim_t               n, \
        dim_t               k, \
        ctype*     restrict alpha, \
        ctype*     restrict a1x, \
@@ -52,8 +54,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
 { \
 	const num_t     dt     = PASTEMAC(ch,type); \
 \
-	const inc_t     mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	const inc_t     nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
+	const dim_t     mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
+	const dim_t     nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
+\
 	const inc_t     packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
 \
 	const inc_t     rs_b   = packnr; \
@@ -65,13 +68,35 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	              gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	PASTECH(ch,trsm_ukr_ft) \
 	              trsm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \
+\
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	/* FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR
+	   instead? */ \
+	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : nr ); \
+	const inc_t     cs_ct       = ( col_pref ? mr : 1 ); \
+\
+	const bool      use_ct      = ( m < mr || n < nr ); \
+\
+	ctype* restrict c11_use     = c11; \
+	inc_t           rs_c_use    = rs_c; \
+	inc_t           cs_c_use    = cs_c; \
+\
+	if ( use_ct ) \
+	{ \
+		c11_use  = ct; \
+		rs_c_use = rs_ct; \
+		cs_c_use = cs_ct; \
+	} \
 \
 	/* lower: b11 = alpha * b11 - a10 * b01; */ \
 	/* upper: b11 = alpha * b11 - a12 * b21; */ \
 	gemm_ukr \
 	( \
-	  mr, \
-	  nr, \
+	  m, \
+	  n, \
 	  k, \
 	  minus_one, \
 	  a1x, \
@@ -88,10 +113,20 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	( \
 	  a11, \
 	  b11, \
-	  c11, rs_c, cs_c, \
+	  c11_use, rs_c_use, cs_c_use, \
 	  data, \
 	  cntx  \
 	); \
+\
+	if ( use_ct ) \
+	{ \
+		PASTEMAC(ch,copys_mxn) \
+		( \
+		  m, n, \
+		  ct,  rs_ct, cs_ct, \
+		  c11, rs_c,  cs_c  \
+		); \
+	} \
 \
 /*
 PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_r after", k+3, 8, \
diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
index 96f5a16fe..08823f073 100644
--- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c
+++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
@@ -39,6 +39,8 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
+       dim_t               m, \
+       dim_t               n, \
        dim_t               k, \
        ctype*     restrict alpha, \
        ctype*     restrict a1x, \
@@ -59,7 +61,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	PASTECH(ch,trsm_ukr_ft) \
 	                ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \
 \
-	const bool        col_pref    = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
+	const bool        col_pref_r  = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
 \
 	const dim_t       mr          = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
 	const dim_t       nr          = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
@@ -98,6 +100,28 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	ctype_r*          b_use; \
 	inc_t             rs_b_use; \
 	inc_t             cs_b_use; \
+\
+	ctype             ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                      / sizeof( ctype ) ] \
+	                      __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	/* FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR
+	   instead? */ \
+	const bool        col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const inc_t       rs_ct       = ( col_pref ? 1 : nr ); \
+	const inc_t       cs_ct       = ( col_pref ? mr : 1 ); \
+\
+	const bool        use_ct      = ( m < mr || n < nr ); \
+\
+	ctype* restrict   c11_use     = c11; \
+	inc_t             rs_c_use    = rs_c; \
+	inc_t             cs_c_use    = cs_c; \
+\
+	if ( use_ct ) \
+	{ \
+		c11_use  = ct; \
+		rs_c_use = rs_ct; \
+		cs_c_use = cs_ct; \
+	} \
 \
 \
 	/* Handle alphas with non-zero imaginary components. */ \
@@ -113,7 +137,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	{ \
 		bli_abort(); \
 \
-/*
+		/*
 		ctype_r* restrict one_r = PASTEMAC(chr,1); \
 \
 		const inc_t ld_b = rs_b; \
@@ -125,17 +149,17 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		                          b11, rs_b, cs_b, ld_b ); \
 \
 		alpha_r = *one_r; \
-*/ \
+		*/ \
 	} \
 \
 \
 	{ \
 		/* Set the strides for the temporary bt matrix based on the native
 		   real domain micro-kernel storage preferences. */ \
-		if ( col_pref ) { rs_bt   = 1;    cs_bt   = mr;     \
-		                  rs_bt_r = 1;    cs_bt_r = mr_r; } \
-		else            { rs_bt   = nr;   cs_bt   = 1;      \
-		                  rs_bt_r = nr_r; cs_bt_r = 1;    } \
+		if ( col_pref_r ) { rs_bt   = 1;    cs_bt   = mr;     \
+		                    rs_bt_r = 1;    cs_bt_r = mr_r; } \
+		else              { rs_bt   = nr;   cs_bt   = 1;      \
+		                    rs_bt_r = nr_r; cs_bt_r = 1;    } \
 \
 		b_use    = ( ctype_r* )bt; \
 		rs_b_use = rs_bt_r; \
@@ -241,10 +265,20 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	( \
 	  a11, \
 	  b11, \
-	  c11, rs_c, cs_c, \
+	  c11_use, rs_c_use, cs_c_use, \
 	  data, \
 	  cntx  \
 	); \
+\
+	if ( use_ct ) \
+	{ \
+		PASTEMAC(ch,copys_mxn) \
+		( \
+		  m, n, \
+		  ct,  rs_ct, cs_ct, \
+		  c11, rs_c,  cs_c  \
+		); \
+	} \
 }
 
 INSERT_GENTFUNCCO_BASIC3( gemmtrsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR )

From c9700f369aa84fc00f36c4b817ffb7dab72b865d Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 15 Feb 2022 15:36:52 -0600
Subject: [PATCH 032/230] Renamed SIMD-related macro constants for clarity.

Details:
- Renamed the following macros defined in bli_kernel_macro_defs.h:

    BLIS_SIMD_NUM_REGISTERS -> BLIS_SIMD_MAX_NUM_REGISTERS
    BLIS_SIMD_SIZE          -> BLIS_SIMD_MAX_SIZE

  Also updated all instances of these macros elsewhere, including
  subconfigurations, source code, and documentation. Thanks to Devin
  Matthews for suggesting this change.
---
 config/a64fx/bli_family_a64fx.h       |  4 ++--
 config/armsve/bli_family_armsve.h     |  4 ++--
 config/knc/bli_family_knc.h           |  4 ++--
 config/knl/bli_family_knl.h           |  4 ++--
 config/skx/bli_family_skx.h           |  4 ++--
 docs/ConfigurationHowTo.md            | 23 +++++++++++++----------
 docs/Testsuite.md                     |  2 +-
 frame/base/bli_info.c                 |  4 ++--
 frame/include/bli_kernel_macro_defs.h | 14 +++++++-------
 9 files changed, 33 insertions(+), 30 deletions(-)

diff --git a/config/a64fx/bli_family_a64fx.h b/config/a64fx/bli_family_a64fx.h
index b67ae7c60..f2837459d 100644
--- a/config/a64fx/bli_family_a64fx.h
+++ b/config/a64fx/bli_family_a64fx.h
@@ -38,8 +38,8 @@
 
 // -- MEMORY ALLOCATION --------------------------------------------------------
 
-#define BLIS_SIMD_ALIGN_SIZE    256
-#define BLIS_SIMD_NUM_REGISTERS 32
+#define BLIS_SIMD_ALIGN_SIZE        256
+#define BLIS_SIMD_MAX_NUM_REGISTERS 32
 
 // SVE-specific configs.
 #define N_L1_SVE_DEFAULT 64
diff --git a/config/armsve/bli_family_armsve.h b/config/armsve/bli_family_armsve.h
index b67ae7c60..f2837459d 100644
--- a/config/armsve/bli_family_armsve.h
+++ b/config/armsve/bli_family_armsve.h
@@ -38,8 +38,8 @@
 
 // -- MEMORY ALLOCATION --------------------------------------------------------
 
-#define BLIS_SIMD_ALIGN_SIZE    256
-#define BLIS_SIMD_NUM_REGISTERS 32
+#define BLIS_SIMD_ALIGN_SIZE        256
+#define BLIS_SIMD_MAX_NUM_REGISTERS 32
 
 // SVE-specific configs.
 #define N_L1_SVE_DEFAULT 64
diff --git a/config/knc/bli_family_knc.h b/config/knc/bli_family_knc.h
index 6f9e03e8f..b968b0c9a 100644
--- a/config/knc/bli_family_knc.h
+++ b/config/knc/bli_family_knc.h
@@ -46,8 +46,8 @@
 
 #define BLIS_SIMD_ALIGN_SIZE           64
 
-#define BLIS_SIMD_SIZE                 64
-#define BLIS_SIMD_NUM_REGISTERS        32
+#define BLIS_SIMD_MAX_SIZE             64
+#define BLIS_SIMD_MAX_NUM_REGISTERS    32
 
 
 #if 0
diff --git a/config/knl/bli_family_knl.h b/config/knl/bli_family_knl.h
index 64994cd9d..98d3fe8d7 100644
--- a/config/knl/bli_family_knl.h
+++ b/config/knl/bli_family_knl.h
@@ -52,8 +52,8 @@
 
 #define BLIS_SIMD_ALIGN_SIZE             64
 
-#define BLIS_SIMD_SIZE                   64
-#define BLIS_SIMD_NUM_REGISTERS          32
+#define BLIS_SIMD_MAX_SIZE               64
+#define BLIS_SIMD_MAX_NUM_REGISTERS      32
 
 /*
 #ifdef BLIS_NO_HBWMALLOC
diff --git a/config/skx/bli_family_skx.h b/config/skx/bli_family_skx.h
index ac9478f8b..d698f12b4 100644
--- a/config/skx/bli_family_skx.h
+++ b/config/skx/bli_family_skx.h
@@ -47,8 +47,8 @@
 
 #define BLIS_SIMD_ALIGN_SIZE             64
 
-#define BLIS_SIMD_SIZE                   64
-#define BLIS_SIMD_NUM_REGISTERS          32
+#define BLIS_SIMD_MAX_SIZE               64
+#define BLIS_SIMD_MAX_NUM_REGISTERS      32
 
 //#include <stdlib.h>
 
diff --git a/docs/ConfigurationHowTo.md b/docs/ConfigurationHowTo.md
index 08eaf8027..dcec7754c 100644
--- a/docs/ConfigurationHowTo.md
+++ b/docs/ConfigurationHowTo.md
@@ -212,32 +212,35 @@ Furthermore, if a header file needs to be included, such as `my_malloc.h`, it sh
 
 _**SIMD register file.**_ BLIS allows you to specify the _maximum_ number of SIMD registers available for use by your kernels, as well as the _maximum_ size (in bytes) of those registers. These values default to:
 ```c
-#define BLIS_SIMD_NUM_REGISTERS  32
-#define BLIS_SIMD_SIZE           64
+#define BLIS_SIMD_MAX_NUM_REGISTERS  32
+#define BLIS_SIMD_MAX_SIZE           64
 ```
 These macros are used in computing the maximum amount of temporary storage (typically allocated statically, on the function stack) that will be needed to hold a single micro-tile of any datatype (and for any induced method):
 ```c
-#define BLIS_STACK_BUF_MAX_SIZE  ( BLIS_SIMD_NUM_REGISTERS * BLIS_SIMD_SIZE * 2 )
+#define BLIS_STACK_BUF_MAX_SIZE  ( BLIS_SIMD_MAX_NUM_REGISTERS * BLIS_SIMD_MAX_SIZE * 2 )
 ```
-These temporary buffers are used when handling edge cases (m % _MR_ != 0 || n % _NR_ != 0) within the level-3 macrokernels, and also in the virtual microkernels of various implementations of induced methods for complex matrix multiplication. It is **very important** that these values be set correctly; otherwise, you may experience undefined behavior as stack data is overwritten at run-time. A kernel developer may set `BLIS_SIMD_NUM_REGISTERS` and `BLIS_SIMD_SIZE`, which will indirectly affect `BLIS_STACK_BUF_MAX_SIZE`, or he may set `BLIS_STACK_BUF_MAX_SIZE` directly. Notice that the default values are already set to work with modern x86_64 systems.
+These temporary buffers are used when handling edge cases (m % _MR_ != 0 || n % _NR_ != 0) within the level-3 macrokernels, and also in the virtual microkernels of various implementations of induced methods for complex matrix multiplication. It is **very important** that these values be set correctly; otherwise, you may experience undefined behavior as stack data is overwritten at run-time. A kernel developer may set `BLIS_SIMD_MAX_NUM_REGISTERS` and `BLIS_SIMD_MAX_SIZE`, which will indirectly affect `BLIS_STACK_BUF_MAX_SIZE`, or he may set `BLIS_STACK_BUF_MAX_SIZE` directly. Notice that the default values are already set to work with modern x86_64 systems.
 
 _**Memory alignment.**_ BLIS implements memory alignment internally, rather than relying on a function such as `posix_memalign()`, and thus it can provide aligned memory even with functions that adhere to the `malloc()` and `free()` API in the standard C library.
 ```c
-#define BLIS_SIMD_ALIGN_SIZE             BLIS_SIMD_SIZE
+#define BLIS_SIMD_ALIGN_SIZE             BLIS_SIMD_MAX_SIZE
 #define BLIS_PAGE_SIZE                   4096
 
 #define BLIS_STACK_BUF_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
 #define BLIS_HEAP_ADDR_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
 #define BLIS_HEAP_STRIDE_ALIGN_SIZE      BLIS_SIMD_ALIGN_SIZE
-#define BLIS_POOL_ADDR_ALIGN_SIZE        BLIS_PAGE_SIZE
+#define BLIS_POOL_ADDR_ALIGN_SIZE_A      BLIS_PAGE_SIZE
+#define BLIS_POOL_ADDR_ALIGN_SIZE_B      BLIS_PAGE_SIZE
+#define BLIS_POOL_ADDR_ALIGN_SIZE_C      BLIS_PAGE_SIZE
+#define BLIS_POOL_ADDR_ALIGN_SIZE_GEN    BLIS_PAGE_SIZE
 ```
-The value `BLIS_STACK_BUF_ALIGN_SIZE` defines the alignment of stack memory used as temporary internal buffers, such as for output matrices to the microkernel when computing edge cases. (See [implementation notes](KernelsHowTo#implementation-notes-for-gemm) for the `gemm` microkernel for details.) This value defaults to `BLIS_SIMD_ALIGN_SIZE`, which defaults to `BLIS_SIMD_SIZE`.
+The value `BLIS_STACK_BUF_ALIGN_SIZE` defines the alignment of stack memory used as temporary internal buffers, such as for output matrices to the microkernel when computing edge cases. (See [implementation notes](KernelsHowTo#implementation-notes-for-gemm) for the `gemm` microkernel for details.) This value defaults to `BLIS_SIMD_ALIGN_SIZE`, which defaults to `BLIS_SIMD_MAX_SIZE`.
 
 The value `BLIS_HEAP_ADDR_ALIGN_SIZE` defines the alignment used when allocating memory via the `malloc()` function defined by `BLIS_MALLOC_USER`. Setting this value to `BLIS_SIMD_ALIGN_SIZE` may speed up certain level-1v and -1f kernels. 
 
 The value `BLIS_HEAP_STRIDE_ALIGN_SIZE` defines the alignment used for so-called "leading dimensions" (i.e. column strides for column-stored matrices, and row strides for row-stored matrices) when creating BLIS matrices via the object-based API (e.g. `bli_obj_create()`). While setting `BLIS_HEAP_ADDR_ALIGN_SIZE` guarantees alignment for the first column (or row), creating a matrix with certain dimension values (_m_ and _n_) may cause subsequent columns (or rows) to be misaligned. Setting this value to `BLIS_SIMD_ALIGN_SIZE` is usually desirable. Additional alignment may or may not be beneficial.
 
-The value `BLIS_POOL_ADDR_ALIGN_SIZE` defines the alignment used when allocating blocks to the memory pools used to manage internal packing buffers. Any block of memory returned by the memory allocator is guaranteed to be aligned to this value. Aligning these blocks to the virtual memory page size (usually 4096 bytes) is standard practice.
+The values `BLIS_POOL_ADDR_ALIGN_SIZE_*` define the alignments used when allocating blocks to the memory pools used to manage internal packing buffers for matrices A, B, C, and for general use. Any block of memory returned by the memory allocator is guaranteed to be aligned to the corresponding value. Aligning these blocks to the virtual memory page size (usually 4096 bytes) is standard practice.
 
 
@@ -635,8 +638,8 @@ Adding support for a new-subconfiguration to BLIS is similar to adding support f
    ```
    and while we're editing the file, we can make any other changes to compiler flags we wish (if any). Similarly, the `bli_family_knl.h` header file should be updated as needed. Since the number of vector registers and the vector register size on `knl` differ from the defaults, we must explicitly set them. (The role of these parameters was explained in a [previous section](ConfigurationHowTo.md#bli_family_h).) Furthermore, provided that a macro `BLIS_NO_HBWMALLOC` is not set, we use a different implementation of `malloc()` and `free()` and `#include` that implementation's header file. 
    ```c
-   #define BLIS_SIMD_NUM_REGISTERS  32
-   #define BLIS_SIMD_SIZE           64
+   #define BLIS_SIMD_MAX_NUM_REGISTERS  32
+   #define BLIS_SIMD_MAX_SIZE           64
 
    #ifdef BLIS_NO_HBWMALLOC
      #include <stdlib.h>
diff --git a/docs/Testsuite.md b/docs/Testsuite.md
index d34955f0a..7c4893d04 100644
--- a/docs/Testsuite.md
+++ b/docs/Testsuite.md
@@ -150,7 +150,7 @@ _**Vector storage scheme.**_ Similar to the matrix storage scheme string, this s
 
 _**Test all combinations of storage schemes?**_ Enabling this option causes all combinations of storage schemes to be tested. For example, if the option is disabled, a matrix storage scheme string of `cr` would cause the `gemm` test module to test execution where all matrix operands are column-stored, and then where all matrix operands are row-stored. Enabling this option with the same matrix storage string (`cr`) would cause the test suite to test `gemm` under all eight scenarios where the three `gemm` matrix operands are either column-stored or row-stored.
 
-_**Perform all tests with alignment?**_ Disabling this option causes the leading dimension (row or column stride) of test matrices to **not** be aligned according to `BLIS_HEAP_STRIDE_ALIGN_SIZE`, which defaults to `BLIS_SIMD_ALIGN_SIZE`, which defaults to `BLIS_SIMD_SIZE`, which defaults to 64 (bytes). (If any of these values is set to a non-default value, it would be in `bli_family_<arch>.h` where `<arch>` is the configuration family.) Sometimes it's useful to disable leading dimension alignment in order to test certain aspects of BLIS that need to handle computing with unaligned user data, such as level-1v and level-1f kernels.
+_**Perform all tests with alignment?**_ Disabling this option causes the leading dimension (row or column stride) of test matrices to **not** be aligned according to `BLIS_HEAP_STRIDE_ALIGN_SIZE`, which defaults to `BLIS_SIMD_ALIGN_SIZE`, which defaults to `BLIS_SIMD_MAX_SIZE`, which defaults to 64 (bytes). (If any of these values is set to a non-default value, it would be in `bli_family_<arch>.h` where `<arch>` is the configuration family.) Sometimes it's useful to disable leading dimension alignment in order to test certain aspects of BLIS that need to handle computing with unaligned user data, such as level-1v and level-1f kernels.
 
 _**Randomize vectors and matrices.**_ The default randomization method uses real values on the interval [-1,1]. However, we offer an alternate randomization using powers of two in a narrow precision range, which is more likely to result in test residuals exactly equal to zero. This method is somewhat niche/experimental and most people should use random values on the [-1,1] interval.
 
diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c
index 8a3dcd30a..bfa5ca9a3 100644
--- a/frame/base/bli_info.c
+++ b/frame/base/bli_info.c
@@ -54,8 +54,8 @@ gint_t bli_info_get_int_type_size( void )             { return BLIS_INT_TYPE_SIZ
 gint_t bli_info_get_num_fp_types( void )              { return BLIS_NUM_FP_TYPES; }
 gint_t bli_info_get_max_type_size( void )             { return BLIS_MAX_TYPE_SIZE; }
 gint_t bli_info_get_page_size( void )                 { return BLIS_PAGE_SIZE; }
-gint_t bli_info_get_simd_num_registers( void )        { return BLIS_SIMD_NUM_REGISTERS; }
-gint_t bli_info_get_simd_size( void )                 { return BLIS_SIMD_SIZE; }
+gint_t bli_info_get_simd_num_registers( void )        { return BLIS_SIMD_MAX_NUM_REGISTERS; }
+gint_t bli_info_get_simd_size( void )                 { return BLIS_SIMD_MAX_SIZE; }
 gint_t bli_info_get_simd_align_size( void )           { return BLIS_SIMD_ALIGN_SIZE; }
 gint_t bli_info_get_stack_buf_max_size( void )        { return BLIS_STACK_BUF_MAX_SIZE; }
 gint_t bli_info_get_stack_buf_align_size( void )      { return BLIS_STACK_BUF_ALIGN_SIZE; }
diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h
index d2487584e..4de624f98 100644
--- a/frame/include/bli_kernel_macro_defs.h
+++ b/frame/include/bli_kernel_macro_defs.h
@@ -163,21 +163,21 @@
 // When configuring with umbrella configuration families, this should be
 // set to the maximum number of registers across all sub-configurations in
 // the family.
-#ifndef BLIS_SIMD_NUM_REGISTERS
-#define BLIS_SIMD_NUM_REGISTERS          32
+#ifndef BLIS_SIMD_MAX_NUM_REGISTERS
+#define BLIS_SIMD_MAX_NUM_REGISTERS      32
 #endif
 
 // The maximum size (in bytes) of each SIMD vector.
 // When configuring with umbrella configuration families, this should be
 // set to the maximum SIMD size across all sub-configurations in the family.
-#ifndef BLIS_SIMD_SIZE
-#define BLIS_SIMD_SIZE                   64
+#ifndef BLIS_SIMD_MAX_SIZE
+#define BLIS_SIMD_MAX_SIZE               64
 #endif
 
 // Alignment size (in bytes) needed by the instruction set for aligned
 // SIMD/vector instructions.
 #ifndef BLIS_SIMD_ALIGN_SIZE
-#define BLIS_SIMD_ALIGN_SIZE             BLIS_SIMD_SIZE
+#define BLIS_SIMD_ALIGN_SIZE             BLIS_SIMD_MAX_SIZE
 #endif
 
 // The maximum size in bytes of local stack buffers within macro-kernel
@@ -188,8 +188,8 @@
 // micro-tile footprint, even though the virtual micro-kernels will only
 // ever be writing to half (real or imaginary part) at a time.
 #ifndef BLIS_STACK_BUF_MAX_SIZE
-#define BLIS_STACK_BUF_MAX_SIZE          ( BLIS_SIMD_NUM_REGISTERS * \
-                                           BLIS_SIMD_SIZE * 2 )
+#define BLIS_STACK_BUF_MAX_SIZE          ( BLIS_SIMD_MAX_NUM_REGISTERS * \
+                                           BLIS_SIMD_MAX_SIZE * 2 )
 #endif
 
 // Alignment size used to align local stack buffers within macro-kernel

From 4d8352309784403ed6719528968531ffb4483947 Mon Sep 17 00:00:00 2001
From: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
Date: Wed, 23 Feb 2022 01:03:47 +0900
Subject: [PATCH 033/230] Add armsve to arm64 Metaconfig (#614)

Availability of the `armsve` subconfig is controlled by the compiler version (gcc/clang). Tested for SVE and non-SVE. Fixes #612.
---
 config/arm64/bli_family_arm64.h                     | 13 ++++++++++++-
 config/armsve/bli_cntx_init_armsve.c                |  8 ++++++++
 config_registry                                     |  2 +-
 kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c    |  2 +-
 .../armsve/1m/old/bli_dpackm_armsve512_int_12xk.c   |  2 +-
 kernels/armsve/bli_kernels_armsve.h                 |  2 +-
 6 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/config/arm64/bli_family_arm64.h b/config/arm64/bli_family_arm64.h
index 278c22818..b242d7049 100644
--- a/config/arm64/bli_family_arm64.h
+++ b/config/arm64/bli_family_arm64.h
@@ -39,7 +39,18 @@
 // -- MEMORY ALLOCATION --------------------------------------------------------
 
 #define BLIS_SIMD_ALIGN_SIZE 16
-
+#define BLIS_SIMD_MAX_NUM_REGISTERS 32
+
+// SVE-specific configs.
+#define N_L1_SVE_DEFAULT 64
+#define W_L1_SVE_DEFAULT 4
+#define C_L1_SVE_DEFAULT 256
+#define N_L2_SVE_DEFAULT 2048
+#define W_L2_SVE_DEFAULT 16
+#define C_L2_SVE_DEFAULT 256
+#define N_L3_SVE_DEFAULT 8192
+#define W_L3_SVE_DEFAULT 16
+#define C_L3_SVE_DEFAULT 256
 
 
 //#endif
diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c
index cd07924a7..ad0e68219 100644
--- a/config/armsve/bli_cntx_init_armsve.c
+++ b/config/armsve/bli_cntx_init_armsve.c
@@ -33,9 +33,17 @@
 */
 
 #include "blis.h"
+#include <sys/auxv.h>
+
+#ifndef HWCAP_SVE
+#define HWCAP_SVE (1 << 22)
+#endif
 
 void bli_cntx_init_armsve( cntx_t* cntx )
 {
+	if (!(getauxval( AT_HWCAP ) & HWCAP_SVE))
+		return;
+
 	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
 #if 0
 	blksz_t thresh[ BLIS_NUM_THRESH ];
diff --git a/config_registry b/config_registry
index d472325c7..93cff1523 100644
--- a/config_registry
+++ b/config_registry
@@ -12,7 +12,7 @@ x86_64:         intel64 amd64 amd64_legacy
 intel64:        skx knl haswell sandybridge penryn generic
 amd64_legacy:   excavator steamroller piledriver bulldozer generic
 amd64:          zen3 zen2 zen generic
-arm64:          firestorm thunderx2 cortexa57 cortexa53 generic
+arm64:          armsve firestorm thunderx2 cortexa57 cortexa53 generic
 arm32:          cortexa15 cortexa9 generic
 
 # Intel architectures.
diff --git a/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c b/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c
index 85dfaa9c0..7171347bf 100644
--- a/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c
@@ -35,7 +35,7 @@
 
 #include "blis.h"
 
-#if (defined(BLIS_FAMILY_ARMSVE) && !defined(BLIS_FAMILY_A64FX))
+#if !defined(BLIS_FAMILY_A64FX)
 #include <arm_sve.h>
 
 // assumption:
diff --git a/kernels/armsve/1m/old/bli_dpackm_armsve512_int_12xk.c b/kernels/armsve/1m/old/bli_dpackm_armsve512_int_12xk.c
index 966b0c134..47b15b437 100644
--- a/kernels/armsve/1m/old/bli_dpackm_armsve512_int_12xk.c
+++ b/kernels/armsve/1m/old/bli_dpackm_armsve512_int_12xk.c
@@ -36,7 +36,7 @@
 #include "blis.h"
 #include <stdio.h>
 
-#if (defined(BLIS_FAMILY_ARMSVE) && !defined(BLIS_FAMILY_A64FX))
+#if !defined(BLIS_FAMILY_A64FX)
 #include <arm_sve.h>
 
 // assumption:
diff --git a/kernels/armsve/bli_kernels_armsve.h b/kernels/armsve/bli_kernels_armsve.h
index 39daf30c6..00e1f0455 100644
--- a/kernels/armsve/bli_kernels_armsve.h
+++ b/kernels/armsve/bli_kernels_armsve.h
@@ -45,7 +45,7 @@ GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx10_unindexed )
 //GEMMSUP_KER_PROT( double,   d, gemmsup_rv_armsve_10x2v_unindexed )
 
 // Use SVE intrinsics only for referred cases.
-#if (defined(BLIS_FAMILY_ARMSVE) && !defined(BLIS_FAMILY_A64FX))
+#if !defined(BLIS_FAMILY_A64FX)
 PACKM_KER_PROT( double,   d, packm_armsve256_int_8xk )
 PACKM_KER_PROT( double,   d, packm_armsve512_int_12xk )
 #endif

From d5146582b1f1bcdccefe23925d3b114d40cd7e31 Mon Sep 17 00:00:00 2001
From: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
Date: Wed, 23 Feb 2022 03:35:46 +0900
Subject: [PATCH 034/230] ArmSVE Ensure Non-zero Block Size (#615)

Fixes #613. There are several macros/environment variables which need to be tuned to get good cache block sizes. It would be nice to have a way of getting values automatically.
---
 kernels/armsve/3/bli_armsve_utils.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernels/armsve/3/bli_armsve_utils.c b/kernels/armsve/3/bli_armsve_utils.c
index 1e3256d34..2ebafa655 100644
--- a/kernels/armsve/3/bli_armsve_utils.c
+++ b/kernels/armsve/3/bli_armsve_utils.c
@@ -79,6 +79,11 @@ void PASTEMAC(ch, _blksz_armsve) (dim_t *m_r_, dim_t *n_r_, \
     dim_t C_Bc = W_L3 - 1 - ceil( (2.0 * k_c * m_c * S_Data)/(C_L3 * N_L3) ); \
     dim_t n_c = C_Bc * (N_L3 * C_L3)/(k_c * S_Data); \
     n_c -= n_c % n_r; \
+\
+    /* Ensure non-zero block sizes. */ \
+    m_c = bli_max(m_c, m_r); \
+    n_c = bli_max(n_c, n_r); \
+    k_c = bli_max(k_c, 128); \
 \
     *m_r_ = m_r; \
     *n_r_ = n_r; \

From 84732bf95634ac606c5f2661d9474318e366c386 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 28 Feb 2022 12:19:31 -0600
Subject: [PATCH 035/230] Revamp how tools are handled/checked by configure.

Details:
- Consolidate handling of tools that are specifiable via CC, CXX, FC,
  PYTHON, AR, and RANLIB into one bash function, select_tool_w_env().
  - If the user specifies a tool via an environment variable (e.g.
    CC=gcc) and that tool does not seem valid, print an error message
    and abort configure, unless the tool is optional (e.g. CXX or FC),
    in which case a warning message is printed instead.
  - The definition of "seems valid" above amounts to:
    - responding to at least one of a basic set of command line options
      (e.g. --version, -V, -h) if the os_name is Linux (since GNU tools
      tend to respond to flags such as --version) or if the tool in
      question is CC, CXX, FC, or PYTHON (which tend to respond to the
      expected flags regardless of OS)
    - the binary merely existing for AR and RANLIB on Darwin/OSX/BSD.
      (These OSes tend to have non-GNU versions of ar and ranlib, which
      typically do not respond to --version and friends.)
- This PR addresses #584. Thanks to Devin Matthews for suggesting some
  of the changes in this commit.
---
 configure              | 440 ++++++++++++++++++++++++++++++-----------
 frame/compat/bla_dot.c |   5 +-
 2 files changed, 333 insertions(+), 112 deletions(-)

diff --git a/configure b/configure
index c03df26cd..5f3e83eaa 100755
--- a/configure
+++ b/configure
@@ -360,15 +360,18 @@ print_usage()
 	echo "   CC            Specifies the C compiler to use."
 	echo "   CXX           Specifies the C++ compiler to use (sandbox only)."
 	echo "   FC            Specifies the Fortran compiler to use (only to determine --complex-return)."
-	echo "   RANLIB        Specifies the ranlib executable to use."
-	echo "   AR            Specifies the archiver to use."
+	echo "   AR            Specifies the static library archiver to use."
+	echo "   RANLIB        Specifies the ranlib (library indexer) executable to use."
+	echo "   PYTHON        Specifies the python interpreter to use."
 	echo "   CFLAGS        Specifies additional compiler flags to use (prepended)."
 	echo "   LDFLAGS       Specifies additional linker flags to use (prepended)."
 	echo "   LIBPTHREAD    Pthreads library to use."
-	echo "   PYTHON        Specifies the python interpreter to use."
 	echo " "
-	echo "   Environment variables may also be specified as command line"
-	echo "   options, e.g.:"
+	echo "   Environment variables are traditionally set prior to running configure:"
+	echo " "
+	echo "     CC=gcc ./configure [options] haswell"
+	echo " "
+	echo "   However, they may also be specified as command line options, e.g.:"
 	echo " "
 	echo "     ./configure [options] CC=gcc haswell"
 	echo " "
@@ -418,10 +421,10 @@ assign_key_value()
 #	# found in a blacklist.
 #
 #	# Note: $2 can actually be a list of items.
-#	dlist=\$"$1"
-#	ditem=\$"$2"
+#	ditem=\$"$1"
+#	dlist=\$"$2"
 #
-#	# Acquire the contents of $list and $item and store them in list_c
+#	# Acquire the contents of $dlist and $ditem and store them in list_c
 #	# and item_c, respectively.
 #	list_c=$(eval "expr \"$dlist\" ")
 #	item_c=$(eval "expr \"$ditem\" ")
@@ -438,7 +441,7 @@ assign_key_value()
 #	done
 #
 #	# Update the argument.
-#	eval "$1=\"${list_c}\""
+#	eval "$2=\"${list_c}\""
 #}
 
 pass_config_kernel_registries()
@@ -1049,42 +1052,31 @@ get_cxx_search_list()
 	echo "${list}"
 }
 
-select_tool()
+get_fc_search_list()
 {
-	local search_list CC_env the_cc cc
+	local list
 
-	# This is the list of compilers/tools to search for, and the order in
-	# which to search for them.
-	search_list=$1
+	list="gfortran ifort"
 
-	# The environment variable associated with the compiler/tool type we
-	# are searching (e.g. CC, CXX, PYTHON).
-	CC_env=$2
+	echo "${list}"
+}
 
-	# If CC_env contains something, add it to the beginning of our default
-	# search list.
-	if [ -n "${CC_env}" ]; then
-		search_list="${CC_env} ${search_list}"
-	fi
+get_ar_search_list()
+{
+	local list
 
-	# Initialize our selected compiler/tool to empty.
-	the_cc=""
+	list="ar"
 
-	# Try each compiler/tool in the list and select the first one we find that
-	# works.
-	for cc in ${search_list}; do
+	echo "${list}"
+}
 
-		# See if the current compiler/tool works and/or is present.
-		${cc} --version > /dev/null 2>&1
+get_ranlib_search_list()
+{
+	local list
 
-		if [ "$?" == 0 ]; then
-			the_cc=${cc}
-			break
-		fi
-	done
+	list="ranlib"
 
-	# Return the selected compiler/tool.
-	echo "${the_cc}"
+	echo "${list}"
 }
 
 auto_detect()
@@ -2054,6 +2046,223 @@ set_default_version()
 	fi
 }
 
+select_tool_w_env()
+{
+	local search_list env_var env_str tool_str is_required found_var
+	local _the_tool
+
+	# Selects a tool (from env_var if it is set, otherwise from search_list)
+	# and stores its name in the caller's variable named by found_var, e.g.:
+	#
+	#  select_tool_w_env "${cc_search_list}" "${CC}" "CC" "C compiler" "yes" found_cc
+
+	search_list="$1" # the tool's default search list.
+	env_var="$2"     # the value of the environment variable for this tool.
+	env_str="$3"     # a string naming the source of env_var.
+	tool_str="$4"    # a human-readable string identifying the tool.
+	is_required="$5" # is it fatal if env_var doesn't exist/work? (yes or no)
+	found_var="$6"   # the variable into which to save the selected tool.
+
+	# If the environment variable contains something, verify that it exists. If
+	# it is unset or empty, we proceed with the default search list.
+	if [ -n "${env_var}" ]; then
+
+		echo "${script_name}: user specified a ${tool_str} via ${env_str} (${env_var})."
+
+		# See if the binary specified by env_var exists.
+		_the_tool=$(select_tool "${env_var}" "${env_str}")
+
+		# Copy the result into the variable specified by found_var.
+		eval "${found_var}=\"${_the_tool}\""
+
+		# If the tool specified by env_var doesn't exist, throw a tantrum.
+		if [ -z "${_the_tool}" ]; then
+
+			echo "${script_name}: *** Could not find the ${tool_str} specified via ${env_str} ('${env_var}')."
+
+			# Whether the tantrum is fatal depends on the is_required argument.
+			if [ "${is_required}" == "yes" ]; then
+				echo "${script_name}: *** A working ${tool_str} is required. Please set ${env_str}"
+				echo "${script_name}: *** to a ${tool_str} that exists (or unset ${env_str})."
+				exit 1
+			else
+				echo "${script_name}: *** Note that a ${tool_str} will not be available."
+
+				# Set the found_var variable to *something* so that the output
+				# makefile fragment contains a record that the tool wasn't found.
+				eval "${found_var}=\"${env_str}-not-found\""
+			fi
+		else
+			# The user-specified tool was found.
+			echo "${script_name}: ${_the_tool} exists and appears to work."
+			echo "${script_name}: using '${_the_tool}' as ${tool_str}."
+		fi
+
+	else
+
+		echo "${script_name}: ${tool_str} search list is: ${search_list}."
+
+		# Search for a working tool from the search list.
+		_the_tool=$(select_tool "${search_list}" "${env_str}")
+
+		# Copy the result into the variable specified by found_var.
+		eval "${found_var}=\"${_the_tool}\""
+
+		# If we didn't find a working tool from the search list, throw a tantrum.
+		if [ -z "${_the_tool}" ]; then
+
+			echo "${script_name}: *** Could not find a ${tool_str} from the search list."
+
+			# Whether the tantrum is fatal depends on the is_required argument.
+			if [ "${is_required}" == "yes" ]; then
+				echo "${script_name}: *** A working ${tool_str} is required. Cannot continue."
+				exit 1
+			else
+				echo "${script_name}: *** Note that a ${tool_str} will not be available."
+
+				# Set the found_var variable to *something* so that the output
+				# makefile fragment contains a record that the tool wasn't found.
+				eval "${found_var}=\"${env_str}-not-found\""
+			fi
+		else
+			# A tool from the search list was found.
+			echo "${script_name}: found '${_the_tool}'."
+			echo "${script_name}: using '${_the_tool}' as ${tool_str}."
+		fi
+	fi
+}
+
+select_tool()
+{
+	local search_list env_str
+	local the_tool tool the_flags rval
+
+	# This is the list of tools to search for, and the order in which
+	# to search for them.
+	search_list="$1"
+
+	# This is the name of the environment variable associated with the tool. For
+	# example, if search_list is a list of C compilers, env_str will be "CC".
+	env_str="$2"
+
+	# Initialize our selected tool to empty.
+	the_tool=""
+
+	# Map the tool (via its canonical environment variable form) to the set of
+	# options used to check that it works. (Loop-invariant, so query it once.)
+	the_flags=$(get_tool_checkflags "${env_str}")
+
+	# Try each tool in the list and select the first one we find that works.
+	for tool in ${search_list}; do
+
+		# Check that the tool works with at least one of the flags in
+		# the_flags (or, if the_flags is empty, check that the tool exists).
+		rval=$(check_tool "${tool}" "${the_flags}")
+
+		# If check_tool() returns 0, we're done.
+		if [ "${rval}" == "0" ]; then
+			the_tool=${tool}
+			break
+		fi
+	done
+
+	# Return the selected tool.
+	echo "${the_tool}"
+}
+
+get_tool_checkflags()
+{
+	local tool_env
+	local std_flags chosen_flags
+
+	# The first argument identifies the tool indirectly: it is not the tool
+	# itself but the name of its equivalent environment variable (e.g. "CC",
+	# "AR", "RANLIB").
+	tool_env="${1}"
+
+	# The default list of flags, which most tools respond to in most
+	# circumstances.
+	std_flags="--version -V -h"
+
+	# Begin with the standard flags. This is the answer for every tool on
+	# Linux, where all the tools very likely respond to at least one of them,
+	# and for CC, CXX, FC, PYTHON on any other OS.
+	chosen_flags="${std_flags}"
+
+	# On Darwin/OSX/BSD or something else, some tools skip the flag checks.
+	# (An empty flag list means the tool is tested for existence instead.)
+	if [ "${os_name}" != "Linux" ]; then
+
+		case "${tool_env}" in
+			AR|RANLIB)
+				# These may not respond to the normal flags on
+				# Darwin/OSX/BSD, so all we can do is check existence.
+				chosen_flags=""
+				;;
+			*)
+				# Everything else is expected to respond to the typical
+				# flag checklist; keep the standard flags.
+				;;
+		esac
+
+	fi
+
+	echo "${chosen_flags}"
+}
+
+check_tool()
+{
+	local tool the_flags
+	local rval opt toolpath
+
+	# This is the name, or filepath, of the tool to check for.
+	tool="$1"
+
+	# Some command line options to try to determine that the tool works.
+	the_flags="$2"
+
+	# Start by assuming that the tool doesn't work/exist.
+	rval=1
+
+	if [ -n "${the_flags}" ]; then
+
+		# If the list of flags to check is non-empty, we will iterate through
+		# the list in search of a flag that works. Failure to find one that
+		# works means the tool doesn't work (or, if the user specified the
+		# tool via its environment variable, failure might mean that the tool
+		# doesn't even exist).
+
+		# Try each flag in the list of flags.
+		for opt in ${the_flags}; do
+
+			# See if the tool responds to the current flag.
+			${tool} ${opt} > /dev/null 2>&1
+
+			# If the tool responded to the flag with a nominal error code of
+			# 0, we found one that works and set rval accordingly.
+			if [ "$?" == 0 ]; then
+				rval=0
+				break
+			fi
+		done
+	else
+
+		# If the list of flags to check is empty, we interpret this as a
+		# request to instead check for the existence of the tool.
+
+		# Use the 'command -v' builtin (rather than external 'which') to look it up.
+		toolpath="$(command -v "${tool}" 2> /dev/null)"
+
+		# If the lookup printed something, the tool exists; set rval accordingly.
+		if [ -n "${toolpath}" ]; then
+			rval=0
+		fi
+	fi
+
+	# Return the error code.
+	echo "${rval}"
+}
+
 
 
 #
@@ -2568,24 +2777,13 @@ main()
 
 	# -- Find a python interpreter ---------------------------------------------
 
-	# Acquire the python search order. This may vary based on the os found
-	# above.
+	# Acquire the default python search order.
 	python_search_list=$(get_python_search_list)
 
-	echo "${script_name}: python interpeter search list is: ${python_search_list}."
-
-	# Find a working python interpreter.
-	found_python=$(select_tool "${python_search_list}" "${PYTHON}")
-
-	# If we didn't find any working python interpreters, we print an error
-	# message.
-	if [ -z "${found_python}" ]; then
-		echo "${script_name}: *** Could not find working python interperter! Cannot continue."
-		exit 1
-	fi
-
-	echo "${script_name}: using '${found_python}' python interpreter."
-
+	# Select a python interpreter from the default list, or from PYTHON if it
+	# refers to a valid binary.
+	select_tool_w_env "${python_search_list}" "${PYTHON}" "PYTHON" \
+	                  "python interpreter" "yes" found_python
 
 	# -- Check the python version ----------------------------------------------
 
@@ -2596,22 +2794,13 @@ main()
 
 	# -- Find a C compiler -----------------------------------------------------
 
-	# Acquire the compiler search order. This will vary based on the os found
-	# above.
+	# Acquire the default compiler search order. This will vary based on os_name.
 	cc_search_list=$(get_cc_search_list)
 
-	echo "${script_name}: C compiler search list is: ${cc_search_list}."
-
-	# Find a working C compiler.
-	found_cc=$(select_tool "${cc_search_list}" "${CC}")
-
-	# If we didn't find any working C compilers, we print an error message.
-	if [ -z "${found_cc}" ]; then
-		echo "${script_name}: *** Could not find working C compiler! Cannot continue."
-		exit 1
-	fi
-
-	echo "${script_name}: using '${found_cc}' C compiler."
+	# Select a C compiler from the default list, or from CC if it refers to a
+	# valid binary.
+	select_tool_w_env "${cc_search_list}" "${CC}" "CC" \
+	                  "C compiler" "yes" found_cc
 
 	# Also check the compiler to see if we are (cross-)compiling for Windows
 	if ${found_cc} -dM -E - < /dev/null 2> /dev/null | grep -q _WIN32; then
@@ -2619,27 +2808,6 @@ main()
 	fi
 
 
-	# -- Find a C++ compiler ---------------------------------------------------
-
-	# Acquire the compiler search order. This will vary based on the os
-	# found above.
-	cxx_search_list=$(get_cxx_search_list)
-
-	echo "${script_name}: C++ compiler search list is: ${cxx_search_list}."
-
-	# Find a working C++ compiler. NOTE: We can reuse the select_tool()
-	# function since it is written in a way that is general-purpose.
-	found_cxx=$(select_tool "${cxx_search_list}" "${CXX}")
-
-	# If we didn't find any working C++ compilers, we print an error message.
-	if [ -z "${found_cxx}" ]; then
-		echo "${script_name}: Could not find working C++ compiler! C++ will not be available in sandbox."
-		found_cxx="c++notfound"
-	fi
-
-	echo "${script_name}: using '${found_cxx}' C++ compiler (for sandbox only)."
-
-
 	# -- Check the compiler version --------------------------------------------
 
 	# Initialize the blacklist to empty.
@@ -2670,6 +2838,57 @@ main()
 	fi
 
 
+	# -- Find a C++ compiler ---------------------------------------------------
+
+	# Acquire the default C++ compiler search order. This will vary based on
+	# os_name.
+	cxx_search_list=$(get_cxx_search_list)
+
+	# Select a C++ compiler from the default list, or from CXX if it refers to
+	# a valid binary.
+	select_tool_w_env "${cxx_search_list}" "${CXX}" "CXX" \
+	                  "C++ compiler" "no" found_cxx
+
+
+	# -- Find a Fortran compiler -----------------------------------------------
+
+	# Acquire the default Fortran compiler search order.
+	fc_search_list=$(get_fc_search_list)
+
+	# Select a Fortran compiler from the default list, or from FC if it refers
+	# to a valid binary.
+	# NOTE: A Fortran compiler is not necessary for building BLIS. The only
+	# reason we might want to query it is to detect the style of returning
+	# complex values from functions. The 'gnu' style returns complex values
+	# from functions normally, via the C language return statement, while the
+	# 'intel' style returns them in a "hidden" parameter (inserted by the
+	# compiler) that precedes all other function parameters.
+	select_tool_w_env "${fc_search_list}" "${FC}" "FC" \
+	                  "Fortran compiler" "no" found_fc
+
+
+	# -- Find a static library archiver ----------------------------------------
+
+	# Acquire the default archiver search order.
+	ar_search_list=$(get_ar_search_list)
+
+	# Select an archiver from the default list, or from AR if it refers
+	# to a valid binary.
+	select_tool_w_env "${ar_search_list}" "${AR}" "AR" \
+	                  "library archiver" "yes" found_ar
+
+
+	# -- Find an archive indexer -----------------------------------------------
+
+	# Acquire the default archive indexer search order.
+	ranlib_search_list=$(get_ranlib_search_list)
+
+	# Select an archive indexer from the default list, or from RANLIB if it
+	# refers to a valid binary.
+	select_tool_w_env "${ranlib_search_list}" "${RANLIB}" "RANLIB" \
+	                  "archive indexer" "yes" found_ranlib
+
+
 	# -- Read the configuration registry ---------------------------------------
 
 	# Make sure the config registry file exists and can be opened.
@@ -3399,10 +3618,16 @@ main()
 		enable_sandbox_01=0
 	fi
 
-	# Check the method used for returning complex numbers
+	# Check the method used for returning complex numbers.
 	if [ "x${complex_return}" = "xdefault" ]; then
-		if [ -n "${FC}" ]; then
-			# Determine the complex return type from the given Fortran compiler
+
+		# If we previously found a Fortran compiler, let's query it to see what
+		# kind of complex return type it uses (gnu or intel). The 'gnu' style
+		# returns complex values from functions normally, via the C language
+		# return statement, while the 'intel' style returns them in a "hidden"
+		# parameter (inserted by the compiler) that precedes all other function
+		# parameters.
+		if [ -n "${found_fc}" ]; then
 
 			# Query the full vendor version string output. This includes the
 			# version number along with (potentially) a bunch of other textual
@@ -3411,8 +3636,7 @@ main()
 			# stdout. But it works for now.
 			vendor_string="$(${FC} --version 2>/dev/null)"
 
-			# Query the compiler "vendor" (ie: the compiler's simple name) and
-			# isolate the version number.
+			# Query the compiler "vendor" (ie: the compiler's simple name).
 			# The last part ({ read first rest ; echo $first ; }) is a workaround
 			# to OS X's egrep only returning the first match.
 			fc_vendor=$(echo "${vendor_string}" | egrep -o 'ifort|GNU' | { read first rest ; echo $first ; })
@@ -3445,23 +3669,19 @@ main()
 	# Variables that may contain forward slashes, such as paths, need extra
 	# escaping when used in sed commands. We insert those extra escape
 	# characters here so that the sed commands below do the right thing.
-	os_name_esc=$(echo     "${os_name}"     | sed 's/\//\\\//g')
-	prefix_esc=$(echo      "${prefix}"      | sed 's/\//\\\//g')
-	exec_prefix_esc=$(echo "${exec_prefix}" | sed 's/\//\\\//g')
-	libdir_esc=$(echo      "${libdir}"      | sed 's/\//\\\//g')
-	includedir_esc=$(echo  "${includedir}"  | sed 's/\//\\\//g')
-	sharedir_esc=$(echo    "${sharedir}"    | sed 's/\//\\\//g')
-	dist_path_esc=$(echo   "${dist_path}"   | sed 's/\//\\\//g')
-	cc_esc=$(echo          "${found_cc}"    | sed 's/\//\\\//g')
-	cxx_esc=$(echo         "${found_cxx}"   | sed 's/\//\\\//g')
-	python_esc=$(echo      "${found_python}"    | sed 's/\//\\\//g')
-	#sandbox_relpath_esc=$(echo "${sandbox_relpath}" | sed 's/\//\\\//g')
-
-	# For RANLIB, if the variable is not set, we use a default value of
-	# 'ranlib'.
-	ranlib_esc=$(echo "${RANLIB:-ranlib}" | sed 's/\//\\\//g')
-	# For AR, if the variable is not set, we use a default value of 'ar'.
-	ar_esc=$(echo "${AR:-ar}" | sed 's/\//\\\//g')
+	os_name_esc=$(echo     "${os_name}"      | sed 's/\//\\\//g')
+	prefix_esc=$(echo      "${prefix}"       | sed 's/\//\\\//g')
+	exec_prefix_esc=$(echo "${exec_prefix}"  | sed 's/\//\\\//g')
+	libdir_esc=$(echo      "${libdir}"       | sed 's/\//\\\//g')
+	includedir_esc=$(echo  "${includedir}"   | sed 's/\//\\\//g')
+	sharedir_esc=$(echo    "${sharedir}"     | sed 's/\//\\\//g')
+	dist_path_esc=$(echo   "${dist_path}"    | sed 's/\//\\\//g')
+	cc_esc=$(echo          "${found_cc}"     | sed 's/\//\\\//g')
+	cxx_esc=$(echo         "${found_cxx}"    | sed 's/\//\\\//g')
+	ar_esc=$(echo          "${found_ar}"     | sed 's/\//\\\//g')
+	ranlib_esc=$(echo      "${found_ranlib}" | sed 's/\//\\\//g')
+	python_esc=$(echo      "${found_python}" | sed 's/\//\\\//g')
+
 	libpthread_esc=$(echo "${LIBPTHREAD--lpthread}" | sed 's/\//\\\//g')
 	cflags_preset_esc=$(echo "${cflags_preset}" | sed 's/\//\\\//g')
 	ldflags_preset_esc=$(echo "${ldflags_preset}" | sed 's/\//\\\//g')
@@ -3577,8 +3797,8 @@ main()
 		| sed -e "s/@aocc_older_than_3_0_0@/${aocc_older_than_3_0_0}/g" \
 		| sed -e "s/@CC@/${cc_esc}/g" \
 		| sed -e "s/@CXX@/${cxx_esc}/g" \
-		| sed -e "s/@RANLIB@/${ranlib_esc}/g" \
 		| sed -e "s/@AR@/${ar_esc}/g" \
+		| sed -e "s/@RANLIB@/${ranlib_esc}/g" \
 		| sed -e "s/@PYTHON@/${python_esc}/g" \
 		| sed -e "s/@libpthread@/${libpthread_esc}/g" \
 		| sed -e "s/@cflags_preset@/${cflags_preset_esc}/g" \
diff --git a/frame/compat/bla_dot.c b/frame/compat/bla_dot.c
index 0699cb22f..f5396b190 100644
--- a/frame/compat/bla_dot.c
+++ b/frame/compat/bla_dot.c
@@ -92,9 +92,10 @@ INSERT_GENTFUNCDOTR_BLAS( dot, dotv )
 
 INSERT_GENTFUNCDOTC_BLAS( dot, dotv )
 
-#else
+#else // #ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL
 
-// For the "intel" complex return type, use a hidden parameter to return the result
+// For the "intel" complex return type, use a hidden preceding parameter to
+// return the result rather than an actual return value.
 #undef  GENTFUNCDOT
 #define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
 \

From 71851a0549276b17db18a0a0c8ab4f54493bf033 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 8 Mar 2022 17:38:09 -0600
Subject: [PATCH 036/230] Fixed level-3 performance bug in haswell ukernels.

Details:
- Fixed a performance regression affecting nearly all level-3 operations
  that use the 'haswell' sgemm and dgemm microkernels. This regression
  was introduced in 54fa28b, caused by an ill-formed conditional
  expression in the assembly code that controls whether cache lines of C
  should be prefetched as rows or as columns. Essentially, the two
  branches were reversed, causing incomplete prefetching to occur for
  both row- and column-stored instances of matrix C. Thanks to Devin
  Matthews for his help finding and fixing this bug.
---
 kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
index d0e793867..70ea4ccd7 100644
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
@@ -123,7 +123,7 @@ void bli_sgemm_haswell_asm_6x16
 	mov(var(cs_c), rsi) // load cs_c
 	lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float)
 
-	cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4.
+	cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
 	jz(.SCOLPREFETCH) // jump to column prefetch case
 
 		lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c;
@@ -803,8 +803,8 @@ void bli_dgemm_haswell_asm_6x8
 	mov(var(cs_c), rsi) // load cs_c
 	lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
 
-	cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8.
-	jz(.SCOLPREFETCH) // jump to column prefetch case
+	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
+	jz(.DCOLPREFETCH) // jump to column prefetch case
 
 		lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c;
 		lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c;
@@ -815,9 +815,9 @@ void bli_dgemm_haswell_asm_6x8
 		prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
 		prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c
 
-		jmp(.SPREFETCHDONE)
+		jmp(.DPREFETCHDONE)
 
-	label(.SCOLPREFETCH)
+	label(.DCOLPREFETCH)
 
 		lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c;
 		lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*cs_c;
@@ -830,7 +830,7 @@ void bli_dgemm_haswell_asm_6x8
 		prefetch(0, mem(rdx, r13, 1, 7*8)) // prefetch c + 6*cs_c
 		prefetch(0, mem(rdx, rsi, 4, 7*8)) // prefetch c + 7*cs_c
 
-	label(.SPREFETCHDONE)
+	label(.DPREFETCHDONE)
 
 
 	mov(var(k_iter), rsi) // i = k_iter;

From cad10410b2305bc0e328c5f2517ab02593b53428 Mon Sep 17 00:00:00 2001
From: Ivan Korostelev <ivan23kor@gmail.com>
Date: Thu, 10 Mar 2022 09:58:14 -0600
Subject: [PATCH 037/230] POWER10: edge cases in microkernel (#620)

Use new API for POWER10 gemm microkernel
---
 sandbox/power10/gemm_template.h | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/sandbox/power10/gemm_template.h b/sandbox/power10/gemm_template.h
index 6f9b23032..eb0ef24bb 100644
--- a/sandbox/power10/gemm_template.h
+++ b/sandbox/power10/gemm_template.h
@@ -80,9 +80,6 @@ void GEMM_FUNC_NAME(ch) \
         DTYPE_OUT*  c, inc_t rsc, inc_t csc \
     ) \
 { \
-    DTYPE_OUT zero  = 0.0; \
-    DTYPE_OUT beta_  = *beta; \
-    \
     DTYPE_IN * restrict btilde_sys = ( DTYPE_IN *) aligned_alloc( P10_PG_SIZE, B_ALIGN + KC * NC * sizeof( DTYPE_IN ) ); \
     DTYPE_IN * restrict atilde_sys = ( DTYPE_IN *) aligned_alloc( P10_PG_SIZE, A_ALIGN + MC * KC * sizeof( DTYPE_IN ) ); \
     \
@@ -104,10 +101,6 @@ void GEMM_FUNC_NAME(ch) \
     DTYPE_OUT * restrict cblock = c; \
     DTYPE_IN  * restrict bblock = b; \
     \
-    DTYPE_OUT tmp_cmicrotile[MR*NR];  \
-    int   rsct = ( rsc == 1 ? 1 : NR ); \
-    int   csct = ( rsc == 1 ? MR : 1 ); \
-    \
     for ( int jc=0; jc<n; jc+=NC ) \
     { \
         int jb = bli_min( NC, n-jc ); \
@@ -146,18 +139,11 @@ void GEMM_FUNC_NAME(ch) \
                     {    \
                         int irb = bli_min( MR, ib-ir ); \
                         \
-                        if (jrb == NR && irb == MR) \
-                            MICROKERNEL (new_pb, alpha, amicropanel, bmicropanel, beta, cmicrotile, rsc, csc, NULL, NULL); \
-                        else \
-                        { \
-                            MICROKERNEL (new_pb, alpha, amicropanel, bmicropanel, &zero, tmp_cmicrotile, rsct, csct, NULL, NULL); \
-                            \
-                            for (int j=0; j<jrb;j++) \
-                                for (int i=0; i<irb;i++)  \
-                                    cmicrotile[i*rsc + j*csc] = \
-                                        beta_ * cmicrotile[i*rsc + j*csc] + \
-                                        tmp_cmicrotile[i*rsct + j*csct]; \
-                        } \
+                        MICROKERNEL \
+                        ( \
+                          irb, jrb, new_pb, alpha, amicropanel, bmicropanel, \
+                          beta, cmicrotile, rsc, csc, NULL, NULL \
+                        ); \
                         amicropanel += a_ps; \
                         cmicrotile += rstep_mt_c; \
                     } \

From 7c07b477e432adbbce5812ed9341ba3092b03976 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 11 Mar 2022 13:28:50 -0600
Subject: [PATCH 038/230] Avoid gemmsup barriers when not packing A or B.
 (#622)

Details:
- Implemented a multithreaded optimization for the special (and common)
  case of employing the gemmsup code path when the user requests
  (implicitly or explicitly) that neither A nor B be packed during
  computation. This optimization takes the form of a greatly reduced
  code branch in bli_thrinfo_sup_create_for_cntl(), which avoids a
  broadcast and two barriers, and results in higher performance when
  obtaining two-way or higher parallelism within BLIS. Thanks to
  Bhaskar Nallani of AMD for proposing this change via issue #605.
- Added an early return branch to bli_thrinfo_create_for_cntl() that
  detects and quickly handles cases where no parallelism is being
  obtained within BLIS (i.e., single-threaded execution). Note that
  this special case handling was/is already present in
  bli_thrinfo_sup_create_for_cntl().
- CREDITS file update.
---
 CREDITS                        |   1 +
 frame/thread/bli_thrinfo.c     |  18 ++++
 frame/thread/bli_thrinfo_sup.c | 181 ++++++++++++++++++++-------------
 3 files changed, 131 insertions(+), 69 deletions(-)

diff --git a/CREDITS b/CREDITS
index 7dd452daa..85ed97c6a 100644
--- a/CREDITS
+++ b/CREDITS
@@ -64,6 +64,7 @@ but many others have contributed code and feedback, including
   Simon Lukas Märtens      @ACSimon33          (RWTH Aachen University)
   Devin Matthews           @devinamatthews     (The University of Texas at Austin)
   Stefanos Mavros          @smavros
+  Mithun Mohan             @MithunMohanKadavil (AMD)
   Ilknur Mustafazade       @Runkli
                            @nagsingh
   Bhaskar Nallani          @BhaskarNallani     (AMD)
diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c
index f9cd5ce74..0282be170 100644
--- a/frame/thread/bli_thrinfo.c
+++ b/frame/thread/bli_thrinfo.c
@@ -298,6 +298,24 @@ thrinfo_t* bli_thrinfo_create_for_cntl
        thrinfo_t* thread_par
      )
 {
+	// If we are running with a single thread, all of the code can be reduced
+	// and simplified to this.
+	if ( bli_rntm_calc_num_threads( rntm ) == 1 )
+	{
+		thrinfo_t* thread_chl = bli_thrinfo_create
+		(
+		  rntm,                        // rntm
+		  &BLIS_SINGLE_COMM,           // ocomm
+		  0,                           // ocomm_id
+		  1,                           // n_way
+		  0,                           // work_id
+		  FALSE,                       // free_comm
+		  BLIS_NO_PART,                // bszid
+		  NULL                         // sub_node
+		);
+		return thread_chl;
+	}
+
 	thrcomm_t*  static_comms[ BLIS_NUM_STATIC_COMMS ];
 	thrcomm_t** new_comms = NULL;
 
diff --git a/frame/thread/bli_thrinfo_sup.c b/frame/thread/bli_thrinfo_sup.c
index ab28b7160..984820f39 100644
--- a/frame/thread/bli_thrinfo_sup.c
+++ b/frame/thread/bli_thrinfo_sup.c
@@ -145,7 +145,6 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
        thrinfo_t* thread_par
      )
 {
-#if 1
 	// If we are running with a single thread, all of the code can be reduced
 	// and simplified to this.
 	if ( bli_rntm_calc_num_threads( rntm ) == 1 )
@@ -163,84 +162,128 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
 		);
 		return thread_chl;
 	}
-#endif
 
-	thrcomm_t*  static_comms[ BLIS_NUM_STATIC_COMMS ];
-	thrcomm_t** new_comms = NULL;
+	// The remainder of this function handles the cases involving the use of
+	// multiple BLIS threads.
 
-	const dim_t parent_nt_in   = bli_thread_num_threads( thread_par );
-	const dim_t parent_n_way   = bli_thread_n_way( thread_par );
-	const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par );
-	const dim_t parent_work_id = bli_thread_work_id( thread_par );
-
-	// Sanity check: make sure the number of threads in the parent's
-	// communicator is divisible by the number of new sub-groups.
-	if ( parent_nt_in % parent_n_way != 0 )
+	if ( bli_rntm_pack_a( rntm ) == FALSE &&
+	     bli_rntm_pack_b( rntm ) == FALSE )
 	{
-		printf( "Assertion failed: parent_nt_in <mod> parent_n_way != 0\n" );
-		bli_abort();
+		// If we are packing neither A nor B, there are no broadcasts or barriers
+		// needed to synchronize threads (since all threads can work completely
+		// independently). In this special case situation, the thrinfo_t can be
+		// created with much simpler logic.
+
+		const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par );
+
+		// Compute:
+		// - the number of threads inside the new child comm,
+		// - the current thread's id within the new communicator,
+		// - the current thread's work id, given the ways of parallelism
+		//   to be obtained within the next loop.
+		const dim_t child_nt_in   = bli_rntm_calc_num_threads_in( bszid_chl, rntm );
+		const dim_t child_n_way   = bli_rntm_ways_for( *bszid_chl, rntm );
+		const dim_t child_comm_id = parent_comm_id % child_nt_in;
+		const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );
+
+		// All threads create a new thrinfo_t node using the communicator
+		// that was created by their chief, as identified by parent_work_id.
+		thrinfo_t* thread_chl = bli_thrinfo_create
+		(
+		  rntm,                        // rntm
+		  NULL,                        // ocomm
+		  child_comm_id,               // ocomm_id
+		  child_n_way,                 // n_way
+		  child_work_id,               // work_id
+		  TRUE,                        // free_comm
+		  *bszid_chl,                  // bszid
+		  NULL                         // sub_node
+		);
+
+		return thread_chl;
 	}
+	else
+	{
+		// If we are packing at least one of A or B, then we use the general
+		// approach that employs broadcasts and barriers.
+
+		thrcomm_t*  static_comms[ BLIS_NUM_STATIC_COMMS ];
+		thrcomm_t** new_comms = NULL;
 
-	// Compute:
-	// - the number of threads inside the new child comm,
-	// - the current thread's id within the new communicator,
-	// - the current thread's work id, given the ways of parallelism
-	//   to be obtained within the next loop.
-	const dim_t child_nt_in   = bli_rntm_calc_num_threads_in( bszid_chl, rntm );
-	const dim_t child_n_way   = bli_rntm_ways_for( *bszid_chl, rntm );
-	const dim_t child_comm_id = parent_comm_id % child_nt_in;
-	const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );
+		const dim_t parent_nt_in   = bli_thread_num_threads( thread_par );
+		const dim_t parent_n_way   = bli_thread_n_way( thread_par );
+		const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par );
+		const dim_t parent_work_id = bli_thread_work_id( thread_par );
+
+		// Sanity check: make sure the number of threads in the parent's
+		// communicator is divisible by the number of new sub-groups.
+		if ( parent_nt_in % parent_n_way != 0 )
+		{
+			printf( "Assertion failed: parent_nt_in <mod> parent_n_way != 0\n" );
+			bli_abort();
+		}
+
+		// Compute:
+		// - the number of threads inside the new child comm,
+		// - the current thread's id within the new communicator,
+		// - the current thread's work id, given the ways of parallelism
+		//   to be obtained within the next loop.
+		const dim_t child_nt_in   = bli_rntm_calc_num_threads_in( bszid_chl, rntm );
+		const dim_t child_n_way   = bli_rntm_ways_for( *bszid_chl, rntm );
+		const dim_t child_comm_id = parent_comm_id % child_nt_in;
+		const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );
 
 //printf( "thread %d: child_n_way = %d child_nt_in = %d parent_n_way = %d (bszid = %d->%d)\n", (int)child_comm_id, (int)child_nt_in, (int)child_n_way, (int)parent_n_way, (int)bli_cntl_bszid( cntl_par ), (int)bszid_chl );
 
-	// The parent's chief thread creates a temporary array of thrcomm_t
-	// pointers.
-	if ( bli_thread_am_ochief( thread_par ) )
-	{
-		err_t r_val;
+		// The parent's chief thread creates a temporary array of thrcomm_t
+		// pointers.
+		if ( bli_thread_am_ochief( thread_par ) )
+		{
+			err_t r_val;
 
-		if ( parent_n_way > BLIS_NUM_STATIC_COMMS )
-			new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val );
-		else
-			new_comms = static_comms;
-	}
+			if ( parent_n_way > BLIS_NUM_STATIC_COMMS )
+				new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val );
+			else
+				new_comms = static_comms;
+		}
 
-	// Broadcast the temporary array to all threads in the parent's
-	// communicator.
-	new_comms = bli_thread_broadcast( thread_par, new_comms );
-
-	// Chiefs in the child communicator allocate the communicator
-	// object and store it in the array element corresponding to the
-	// parent's work id.
-	if ( child_comm_id == 0 )
-		new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );
-
-	bli_thread_barrier( thread_par );
-
-	// All threads create a new thrinfo_t node using the communicator
-	// that was created by their chief, as identified by parent_work_id.
-	thrinfo_t* thread_chl = bli_thrinfo_create
-	(
-	  rntm,                        // rntm
-	  new_comms[ parent_work_id ], // ocomm
-	  child_comm_id,               // ocomm_id
-	  child_n_way,                 // n_way
-	  child_work_id,               // work_id
-	  TRUE,                        // free_comm
-	  *bszid_chl,                  // bszid
-	  NULL                         // sub_node
-	);
-
-	bli_thread_barrier( thread_par );
-
-	// The parent's chief thread frees the temporary array of thrcomm_t
-	// pointers.
-	if ( bli_thread_am_ochief( thread_par ) )
-	{
-		if ( parent_n_way > BLIS_NUM_STATIC_COMMS )
-			bli_free_intl( new_comms );
-	}
+		// Broadcast the temporary array to all threads in the parent's
+		// communicator.
+		new_comms = bli_thread_broadcast( thread_par, new_comms );
+
+		// Chiefs in the child communicator allocate the communicator
+		// object and store it in the array element corresponding to the
+		// parent's work id.
+		if ( child_comm_id == 0 )
+			new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );
+
+		bli_thread_barrier( thread_par );
+
+		// All threads create a new thrinfo_t node using the communicator
+		// that was created by their chief, as identified by parent_work_id.
+		thrinfo_t* thread_chl = bli_thrinfo_create
+		(
+		  rntm,                        // rntm
+		  new_comms[ parent_work_id ], // ocomm
+		  child_comm_id,               // ocomm_id
+		  child_n_way,                 // n_way
+		  child_work_id,               // work_id
+		  TRUE,                        // free_comm
+		  *bszid_chl,                  // bszid
+		  NULL                         // sub_node
+		);
 
-	return thread_chl;
+		bli_thread_barrier( thread_par );
+
+		// The parent's chief thread frees the temporary array of thrcomm_t
+		// pointers.
+		if ( bli_thread_am_ochief( thread_par ) )
+		{
+			if ( parent_n_way > BLIS_NUM_STATIC_COMMS )
+				bli_free_intl( new_comms );
+		}
+
+		return thread_chl;
+	}
 }
 

From f1dbb0e514f53a3240d3a6cbdc3306b01a2206f5 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 11 Mar 2022 13:38:28 -0600
Subject: [PATCH 039/230] Trivial whitespace change; commit log addendum.

Details:
- A co-attribution to Mithun Mohan was inadvertently omitted from the
  commit log for the headline change in the previous commit, 7c07b47.
---
 frame/thread/bli_thrinfo_sup.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/frame/thread/bli_thrinfo_sup.c b/frame/thread/bli_thrinfo_sup.c
index 984820f39..881990f78 100644
--- a/frame/thread/bli_thrinfo_sup.c
+++ b/frame/thread/bli_thrinfo_sup.c
@@ -160,6 +160,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
 		  BLIS_NO_PART,                // bszid
 		  NULL                         // sub_node
 		);
+
 		return thread_chl;
 	}
 

From d6810000e961fe807dc5a7db81180a8355f3eac0 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Mon, 14 Mar 2022 10:29:54 -0500
Subject: [PATCH 040/230] Update Multithreading.md

Add notes about `BLIS_IR_NT` (should typically be 1) and `BLIS_JR_NT` (should typically be small, e.g. <= 4). [ci skip]
---
 docs/Multithreading.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/Multithreading.md b/docs/Multithreading.md
index b50db5b70..48fbc8ca1 100644
--- a/docs/Multithreading.md
+++ b/docs/Multithreading.md
@@ -142,13 +142,13 @@ The manual way of specifying parallelism involves communicating which loops with
 
 The below chart describes the five loops used in BLIS's matrix multiplication operations.
 
-| Loop around microkernel  | Environment variable | Direction | Notes       |
-|:-------------------------|:---------------------|:----------|:------------|
-| 5th loop                 | `BLIS_JC_NT`         | `n`       |             |
-| 4th loop                 | _N/A_                | `k`       | Not enabled |
-| 3rd loop                 | `BLIS_IC_NT`         | `m`       |             |
-| 2nd loop                 | `BLIS_JR_NT`         | `n`       |             |
-| 1st loop                 | `BLIS_IR_NT`         | `m`       |             |
+| Loop around microkernel  | Environment variable | Direction | Notes          |
+|:-------------------------|:---------------------|:----------|:---------------|
+| 5th loop                 | `BLIS_JC_NT`         | `n`       |                |
+| 4th loop                 | _N/A_                | `k`       | Not enabled    |
+| 3rd loop                 | `BLIS_IC_NT`         | `m`       |                |
+| 2nd loop                 | `BLIS_JR_NT`         | `n`       | Typically <= 4 |
+| 1st loop                 | `BLIS_IR_NT`         | `m`       | Typically 1    |
 
 **Note**: Parallelization of the 4th loop is not currently enabled because each iteration of the loop updates the same part of the output matrix C. Thus, to safely parallelize it requires either a reduction or mutex locks when updating C.
 
@@ -161,7 +161,7 @@ In general, the way to choose how to set these environment variables is as follo
 Next, which combinations of loops to parallelize depends on which caches are shared. Here are some of the more common scenarios:
  * When compute resources have private L3 caches (example: multi-socket systems), try parallelizing  the `JC` loop. This means threads (or thread groups) will pack and compute with different row panels from matrix B.
  * For compute resources that have private L2 caches but that share an L3 cache (example: cores on a socket), try parallelizing the `IC` loop. In this situation, threads will share the same packed row panel from matrix B, but pack and compute with different blocks of matrix A.
- * If compute resources share an L2 cache but have private L1 caches (example: pairs of cores), try parallelizing the `JR` loop. Here, threads share the same packed block of matrix A but read different packed micropanels of B into their private L1 caches. In some situations, parallelizing the `IR` loop may also be effective.
+ * If compute resources share an L2 cache but have private L1 caches (example: pairs of cores), try parallelizing the `JR` loop. Here, threads share the same packed block of matrix A but read different packed micropanels of B into their private L1 caches. In some situations, *lightly* parallelizing the `IR` loop may also be effective.
 
 ![The primary algorithm for level-3 operations in BLIS](http://www.cs.utexas.edu/users/field/mm_algorithm_color.png)
 

From 0db2bd5341c5c3ed5f1cc2bffa90952735efa45f Mon Sep 17 00:00:00 2001
From: Bhaskar Nallani <Nallani.Bhaskar@amd.com>
Date: Fri, 25 Mar 2022 05:11:55 +0530
Subject: [PATCH 041/230] Added BLAS/CBLAS APIs for gemm3m. (#590)

Details:
- Created ?gemm3m_() and cblas_?gemm3m() APIs that (for now) simply
  invoke the 1m implementation unconditionally. (Note that these APIs
  bypass sup handling.)
- Added BLAS prototypes for gemm3m in frame/compat/bla_gemm3m.h.
- Added CBLAS prototypes for gemm3m in frame/compat/cblas/src/cblas.h.
- Relocated:
    frame/compat/cblas/src/cblas_?gemmt.c
  files into
    frame/compat/cblas/src/extra/
- Relocated frame/compat/bla_gemmt.? into frame/compat/extra/ .
- Minor reorganization of prototypes and cpp macro directives in
  bli_blas.h, cblas.h, and cblas_f77.h.
- Trivial whitespace change to cblas_zgemm.c.
---
 frame/compat/bli_blas.h                       |  20 +-
 frame/compat/cblas/src/cblas.h                |  56 ++-
 frame/compat/cblas/src/cblas_f77.h            |  13 +-
 frame/compat/cblas/src/cblas_zgemm.c          |   2 +-
 frame/compat/cblas/src/extra/cblas_cgemm3m.c  | 115 ++++++
 .../cblas/src/{ => extra}/cblas_cgemmt.c      |   0
 .../cblas/src/{ => extra}/cblas_dgemmt.c      |   0
 .../cblas/src/{ => extra}/cblas_sgemmt.c      |   0
 frame/compat/cblas/src/extra/cblas_zgemm3m.c  | 113 ++++++
 .../cblas/src/{ => extra}/cblas_zgemmt.c      |   0
 frame/compat/check/bla_gemm3m_check.h         |  89 +++++
 frame/compat/extra/bla_gemm3m.c               | 259 +++++++++++++
 frame/compat/extra/bla_gemm3m.h               |  59 +++
 frame/compat/extra/bla_gemm_batch.c           |  15 +-
 frame/compat/{ => extra}/bla_gemmt.c          |  15 +-
 frame/compat/{ => extra}/bla_gemmt.h          |   0
 test/Makefile                                 |   2 +-
 test/test_gemm3m.c                            | 352 ++++++++++++++++++
 18 files changed, 1062 insertions(+), 48 deletions(-)
 create mode 100644 frame/compat/cblas/src/extra/cblas_cgemm3m.c
 rename frame/compat/cblas/src/{ => extra}/cblas_cgemmt.c (100%)
 rename frame/compat/cblas/src/{ => extra}/cblas_dgemmt.c (100%)
 rename frame/compat/cblas/src/{ => extra}/cblas_sgemmt.c (100%)
 create mode 100644 frame/compat/cblas/src/extra/cblas_zgemm3m.c
 rename frame/compat/cblas/src/{ => extra}/cblas_zgemmt.c (100%)
 create mode 100644 frame/compat/check/bla_gemm3m_check.h
 create mode 100644 frame/compat/extra/bla_gemm3m.c
 create mode 100644 frame/compat/extra/bla_gemm3m.h
 rename frame/compat/{ => extra}/bla_gemmt.c (97%)
 rename frame/compat/{ => extra}/bla_gemmt.h (100%)
 create mode 100644 test/test_gemm3m.c

diff --git a/frame/compat/bli_blas.h b/frame/compat/bli_blas.h
index a65953c11..c88a2e3c3 100644
--- a/frame/compat/bli_blas.h
+++ b/frame/compat/bli_blas.h
@@ -113,7 +113,6 @@
 #include "bla_amax.h"
 #include "bla_asum.h"
 #include "bla_axpy.h"
-#include "bla_axpby.h"
 #include "bla_copy.h"
 #include "bla_dot.h"
 #include "bla_nrm2.h"
@@ -187,7 +186,6 @@
 #include "bla_syr2k.h"
 #include "bla_trmm.h"
 #include "bla_trsm.h"
-#include "bla_gemmt.h"
 
 #include "bla_gemm_check.h"
 #include "bla_hemm_check.h"
@@ -198,12 +196,28 @@
 #include "bla_syr2k_check.h"
 #include "bla_trmm_check.h"
 #include "bla_trsm_check.h"
+
+
+// -- BLAS extension prototypes --
+
+// unique to BLIS
+
+#include "bla_axpby.h"
+
+// level-3
+
+#include "bla_gemmt.h"
 #include "bla_gemmt_check.h"
 
-// -- Batch prototypes --
+// batch
 
 #include "bla_gemm_batch.h"
 
+// 3m
+
+#include "bla_gemm3m.h"
+#include "bla_gemm3m_check.h"
+
 
 // -- Fortran-compatible APIs to BLIS functions --
 
diff --git a/frame/compat/cblas/src/cblas.h b/frame/compat/cblas/src/cblas.h
index cee74233c..22399ac8d 100644
--- a/frame/compat/cblas/src/cblas.h
+++ b/frame/compat/cblas/src/cblas.h
@@ -449,11 +449,6 @@ void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
                  enum CBLAS_DIAG Diag, f77_int M, f77_int N,
                  float alpha, const float *A, f77_int lda,
                  float *B, f77_int ldb);
-void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
-                 enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
-                 f77_int N, f77_int K, float alpha, const float *A,
-                 f77_int lda, const float *B, f77_int ldb,
-                 float beta, float *C, f77_int ldc);
 
 void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
@@ -484,11 +479,6 @@ void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
                  enum CBLAS_DIAG Diag, f77_int M, f77_int N,
                  double alpha, const double *A, f77_int lda,
                  double *B, f77_int ldb);
-void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
-                 enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
-                 f77_int N, f77_int K, double alpha, const double *A,
-                 f77_int lda, const double *B, f77_int ldb,
-                 double beta, double *C, f77_int ldc);
 
 void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
@@ -519,11 +509,6 @@ void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
                  enum CBLAS_DIAG Diag, f77_int M, f77_int N,
                  const void *alpha, const void *A, f77_int lda,
                  void *B, f77_int ldb);
-void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
-                 enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
-                 f77_int N, f77_int K, const void *alpha, const void *A,
-                 f77_int lda, const void *B, f77_int ldb,
-                 const void *beta, void *C, f77_int ldc);
 
 void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
@@ -554,11 +539,6 @@ void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
                  enum CBLAS_DIAG Diag, f77_int M, f77_int N,
                  const void *alpha, const void *A, f77_int lda,
                  void *B, f77_int ldb);
-void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
-                 enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
-                 f77_int N, f77_int K, const void *alpha, const void *A,
-                 f77_int lda, const void *B, f77_int ldb,
-                 const void *beta, void *C, f77_int ldc);
 
 
 /* 
@@ -616,6 +596,29 @@ void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha,
                 const void *X, f77_int incX, const void *beta,
                 void *Y, f77_int incY);
 
+// -- APIs to level-3-like operations --
+
+void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                 enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
+                 f77_int N, f77_int K, float alpha, const float *A,
+                 f77_int lda, const float *B, f77_int ldb,
+                 float beta, float *C, f77_int ldc);
+void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                 enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
+                 f77_int N, f77_int K, double alpha, const double *A,
+                 f77_int lda, const double *B, f77_int ldb,
+                 double beta, double *C, f77_int ldc);
+void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                 enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
+                 f77_int N, f77_int K, const void *alpha, const void *A,
+                 f77_int lda, const void *B, f77_int ldb,
+                 const void *beta, void *C, f77_int ldc);
+void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                 enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
+                 f77_int N, f77_int K, const void *alpha, const void *A,
+                 f77_int lda, const void *B, f77_int ldb,
+                 const void *beta, void *C, f77_int ldc);
+
 // -- Batch APIs --
 
 void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order,
@@ -652,6 +655,19 @@ void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order,
                  const void *beta_array, void **C, f77_int *ldc_array,
                  f77_int group_count, f77_int *group_size);
 
+// -- 3m APIs --
+
+void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
+                 enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
+                 f77_int K, const void *alpha, const void *A,
+                 f77_int lda, const void *B, f77_int ldb,
+                 const void *beta, void *C, f77_int ldc);
+void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
+                 enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
+                 f77_int K, const void *alpha, const void *A,
+                 f77_int lda, const void *B, f77_int ldb,
+                 const void *beta, void *C, f77_int ldc);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/frame/compat/cblas/src/cblas_f77.h b/frame/compat/cblas/src/cblas_f77.h
index e534d2054..acb354aaf 100644
--- a/frame/compat/cblas/src/cblas_f77.h
+++ b/frame/compat/cblas/src/cblas_f77.h
@@ -200,20 +200,23 @@
 /*
 * BLAS extensions
 */
-#define F77_sgemmt  sgemmt_
-#define F77_dgemmt  dgemmt_
-#define F77_cgemmt  cgemmt_
-#define F77_zgemmt  zgemmt_
-
 #define F77_saxpby  saxpby_
 #define F77_daxpby  daxpby_
 #define F77_caxpby  caxpby_
 #define F77_zaxpby  zaxpby_
 
+#define F77_sgemmt  sgemmt_
+#define F77_dgemmt  dgemmt_
+#define F77_cgemmt  cgemmt_
+#define F77_zgemmt  zgemmt_
+
 #define F77_sgemm_batch  sgemm_batch_
 #define F77_dgemm_batch  dgemm_batch_
 #define F77_cgemm_batch  cgemm_batch_
 #define F77_zgemm_batch  zgemm_batch_
 
+#define F77_cgemm3m    cgemm3m_
+#define F77_zgemm3m    zgemm3m_
+
 
 #endif /*  CBLAS_F77_H */
diff --git a/frame/compat/cblas/src/cblas_zgemm.c b/frame/compat/cblas/src/cblas_zgemm.c
index e50de2205..8e08c2031 100644
--- a/frame/compat/cblas/src/cblas_zgemm.c
+++ b/frame/compat/cblas/src/cblas_zgemm.c
@@ -104,7 +104,7 @@ void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
       F77_zgemm(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, (dcomplex*)alpha, (dcomplex*)B,
                   &F77_ldb, (dcomplex*)A, &F77_lda, (dcomplex*)beta, (dcomplex*)C, &F77_ldc);
    } 
-   else  cblas_xerbla(1, "cblas_zgemm", "Illegal Order setting, %d\n", Order);
+   else cblas_xerbla(1, "cblas_zgemm", "Illegal Order setting, %d\n", Order);
    CBLAS_CallFromC = 0;
    RowMajorStrg = 0;
    return;
diff --git a/frame/compat/cblas/src/extra/cblas_cgemm3m.c b/frame/compat/cblas/src/extra/cblas_cgemm3m.c
new file mode 100644
index 000000000..514e52545
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_cgemm3m.c
@@ -0,0 +1,115 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ *
+ * cblas_cgemm3m.c
+ *
+ * This program is a C interface to cgemm3m.
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ */
+
+#include "cblas.h"
+#include "cblas_f77.h"
+/* C interface to cgemm3m: maps CBLAS enums/layout to the F77-style call. */
+void cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
+                 enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K,
+                 const void *alpha, const void *A, f77_int lda,
+                 const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc)
+{
+   char TA, TB;
+#ifdef F77_CHAR
+   F77_CHAR F77_TA, F77_TB;
+#else
+   #define F77_TA &TA
+   #define F77_TB &TB
+#endif
+
+#ifdef F77_INT
+   F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb;
+   F77_INT F77_ldc=ldc;
+#else
+   #define F77_M M
+   #define F77_N N
+   #define F77_K K
+   #define F77_lda lda
+   #define F77_ldb ldb
+   #define F77_ldc ldc
+#endif
+   /* Flags defined elsewhere; both are reset to 0 on every exit path below. */
+   extern int CBLAS_CallFromC;
+   extern int RowMajorStrg;
+   RowMajorStrg = 0;
+   CBLAS_CallFromC = 1;
+
+   /* Column-major: operands are forwarded in the order they were given. */
+   if( Order == CblasColMajor )
+   {
+      if(TransA == CblasTrans) TA='T';
+      else if ( TransA == CblasConjTrans ) TA='C';
+      else if ( TransA == CblasNoTrans )   TA='N';
+      else
+      {
+         cblas_xerbla(2, "cblas_cgemm3m", "Illegal TransA setting, %d\n", TransA);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      if(TransB == CblasTrans) TB='T';
+      else if ( TransB == CblasConjTrans ) TB='C';
+      else if ( TransB == CblasNoTrans )   TB='N';
+      else
+      {
+         cblas_xerbla(3, "cblas_cgemm3m", "Illegal TransB setting, %d\n", TransB);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      #ifdef F77_CHAR
+         F77_TA = C2F_CHAR(&TA);
+         F77_TB = C2F_CHAR(&TB);
+      #endif
+
+      F77_cgemm3m(F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, (scomplex*)alpha, (scomplex*)A,
+                     &F77_lda, (scomplex*)B, &F77_ldb, (scomplex*)beta, (scomplex*)C, &F77_ldc);
+   } else if (Order == CblasRowMajor)
+   {
+      RowMajorStrg = 1;  /* TransA's letter goes to TB and TransB's to TA below. */
+      if(TransA == CblasTrans) TB='T';
+      else if ( TransA == CblasConjTrans ) TB='C';
+      else if ( TransA == CblasNoTrans )   TB='N';
+      else
+      {
+         cblas_xerbla(2, "cblas_cgemm3m", "Illegal TransA setting, %d\n", TransA);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+      if(TransB == CblasTrans) TA='T';
+      else if ( TransB == CblasConjTrans ) TA='C';
+      else if ( TransB == CblasNoTrans )   TA='N';
+      else
+      {
+         /* TransB is the third argument of this function, as in the column-major branch. */
+         cblas_xerbla(3, "cblas_cgemm3m", "Illegal TransB setting, %d\n", TransB);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+      #ifdef F77_CHAR
+         F77_TA = C2F_CHAR(&TA);
+         F77_TB = C2F_CHAR(&TB);
+      #endif
+      /* Row-major: B is passed before A and N before M, so the
+         column-major routine is asked for the transposed product. */
+      F77_cgemm3m(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, (scomplex*)alpha, (scomplex*)B,
+                  &F77_ldb, (scomplex*)A, &F77_lda, (scomplex*)beta, (scomplex*)C, &F77_ldc);
+   }
+   else cblas_xerbla(1, "cblas_cgemm3m", "Illegal Order setting, %d\n", Order);
+   CBLAS_CallFromC = 0;
+   RowMajorStrg = 0;
+   return;
+}
+#endif
diff --git a/frame/compat/cblas/src/cblas_cgemmt.c b/frame/compat/cblas/src/extra/cblas_cgemmt.c
similarity index 100%
rename from frame/compat/cblas/src/cblas_cgemmt.c
rename to frame/compat/cblas/src/extra/cblas_cgemmt.c
diff --git a/frame/compat/cblas/src/cblas_dgemmt.c b/frame/compat/cblas/src/extra/cblas_dgemmt.c
similarity index 100%
rename from frame/compat/cblas/src/cblas_dgemmt.c
rename to frame/compat/cblas/src/extra/cblas_dgemmt.c
diff --git a/frame/compat/cblas/src/cblas_sgemmt.c b/frame/compat/cblas/src/extra/cblas_sgemmt.c
similarity index 100%
rename from frame/compat/cblas/src/cblas_sgemmt.c
rename to frame/compat/cblas/src/extra/cblas_sgemmt.c
diff --git a/frame/compat/cblas/src/extra/cblas_zgemm3m.c b/frame/compat/cblas/src/extra/cblas_zgemm3m.c
new file mode 100644
index 000000000..8be4278b4
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_zgemm3m.c
@@ -0,0 +1,113 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ *
+ * cblas_zgemm3m.c
+ *
+ * This program is a C interface to zgemm3m.
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ */
+
+#include "cblas.h"
+#include "cblas_f77.h"
+/* C interface to zgemm3m: maps CBLAS enums/layout to the F77-style call. */
+void cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
+                 enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K,
+                 const void *alpha, const void *A, f77_int lda,
+                 const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc)
+{
+   char TA, TB;
+#ifdef F77_CHAR
+   F77_CHAR F77_TA, F77_TB;
+#else
+   #define F77_TA &TA
+   #define F77_TB &TB
+#endif
+
+#ifdef F77_INT
+   F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb;
+   F77_INT F77_ldc=ldc;
+#else
+   #define F77_M M
+   #define F77_N N
+   #define F77_K K
+   #define F77_lda lda
+   #define F77_ldb ldb
+   #define F77_ldc ldc
+#endif
+   /* Flags defined elsewhere; both are reset to 0 on every exit path below. */
+   extern int CBLAS_CallFromC;
+   extern int RowMajorStrg;
+   RowMajorStrg = 0;
+   CBLAS_CallFromC = 1;
+
+   /* Column-major: operands are forwarded in the order they were given. */
+   if( Order == CblasColMajor )
+   {
+      if(TransA == CblasTrans) TA='T';
+      else if ( TransA == CblasConjTrans ) TA='C';
+      else if ( TransA == CblasNoTrans )   TA='N';
+      else
+      {
+         cblas_xerbla(2, "cblas_zgemm3m", "Illegal TransA setting, %d\n", TransA);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      if(TransB == CblasTrans) TB='T';
+      else if ( TransB == CblasConjTrans ) TB='C';
+      else if ( TransB == CblasNoTrans )   TB='N';
+      else
+      {
+         cblas_xerbla(3, "cblas_zgemm3m", "Illegal TransB setting, %d\n", TransB);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      #ifdef F77_CHAR
+         F77_TA = C2F_CHAR(&TA);
+         F77_TB = C2F_CHAR(&TB);
+      #endif
+      F77_zgemm3m(F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, (dcomplex*)alpha, (dcomplex*)A,
+                     &F77_lda, (dcomplex*)B, &F77_ldb, (dcomplex*)beta, (dcomplex*)C, &F77_ldc);
+   } else if (Order == CblasRowMajor)
+   {
+      RowMajorStrg = 1;  /* TransA's letter goes to TB and TransB's to TA below. */
+      if(TransA == CblasTrans) TB='T';
+      else if ( TransA == CblasConjTrans ) TB='C';
+      else if ( TransA == CblasNoTrans )   TB='N';
+      else
+      {
+         cblas_xerbla(2, "cblas_zgemm3m", "Illegal TransA setting, %d\n", TransA);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+      if(TransB == CblasTrans) TA='T';
+      else if ( TransB == CblasConjTrans ) TA='C';
+      else if ( TransB == CblasNoTrans )   TA='N';
+      else
+      {
+         /* TransB is the third argument of this function, as in the column-major branch. */
+         cblas_xerbla(3, "cblas_zgemm3m", "Illegal TransB setting, %d\n", TransB);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+      #ifdef F77_CHAR
+         F77_TA = C2F_CHAR(&TA);
+         F77_TB = C2F_CHAR(&TB);
+      #endif
+      /* Row-major: B is passed before A and N before M (transposed product). */
+      F77_zgemm3m(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, (dcomplex*)alpha, (dcomplex*)B,
+                  &F77_ldb, (dcomplex*)A, &F77_lda, (dcomplex*)beta, (dcomplex*)C, &F77_ldc);
+   }
+   else cblas_xerbla(1, "cblas_zgemm3m", "Illegal Order setting, %d\n", Order);
+   CBLAS_CallFromC = 0;
+   RowMajorStrg = 0;
+   return;
+}
+#endif
diff --git a/frame/compat/cblas/src/cblas_zgemmt.c b/frame/compat/cblas/src/extra/cblas_zgemmt.c
similarity index 100%
rename from frame/compat/cblas/src/cblas_zgemmt.c
rename to frame/compat/cblas/src/extra/cblas_zgemmt.c
diff --git a/frame/compat/check/bla_gemm3m_check.h b/frame/compat/check/bla_gemm3m_check.h
new file mode 100644
index 000000000..f565b5d29
--- /dev/null
+++ b/frame/compat/check/bla_gemm3m_check.h
@@ -0,0 +1,89 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef BLIS_ENABLE_BLAS
+
+#define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
+{ \
+	f77_int info = 0; \
+	f77_int nota,  notb; \
+	f77_int conja, conjb; \
+	f77_int ta,    tb; \
+	f77_int nrowa, nrowb; \
+	/* Classify each trans argument by comparing it to "N", "C" and "T" via lsame. */ \
+	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
+	/* Row counts used for the lda/ldb lower bounds; they depend on nota/notb. */ \
+	if ( nota ) { nrowa = *m; } \
+	else        { nrowa = *k; } \
+	if ( notb ) { nrowb = *k; } \
+	else        { nrowb = *n; } \
+	/* Only the first failing test sets info (1,2,3,4,5,8,10,13 as coded below). */ \
+	if      ( !nota && !conja && !ta ) \
+		info = 1; \
+	else if ( !notb && !conjb && !tb ) \
+		info = 2; \
+	else if ( *m < 0 ) \
+		info = 3; \
+	else if ( *n < 0 ) \
+		info = 4; \
+	else if ( *k < 0 ) \
+		info = 5; \
+	else if ( *lda < bli_max( 1, nrowa ) ) \
+		info = 8; \
+	else if ( *ldb < bli_max( 1, nrowb ) ) \
+		info = 10; \
+	else if ( *ldc < bli_max( 1, *m    ) ) \
+		info = 13; \
+	/* On failure: build the routine name, call xerbla, and return from the expanding function. */ \
+	if ( info != 0 ) \
+	{ \
+		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
+		/* NOTE(review): with op_str "gemm3m" the name needs 8 bytes incl. NUL - confirm the buffer size. */ \
+		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
+\
+		bli_string_mkupper( func_str ); \
+		/* NOTE(review): (ftnlen)6 is shorter than a 7-character name such as "CGEMM3M" - confirm intent. */ \
+		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+\
+		return; \
+	} \
+}
+
+#endif
diff --git a/frame/compat/extra/bla_gemm3m.c b/frame/compat/extra/bla_gemm3m.c
new file mode 100644
index 000000000..11d542e69
--- /dev/null
+++ b/frame/compat/extra/bla_gemm3m.c
@@ -0,0 +1,259 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+//
+// Define BLAS-to-BLIS interfaces.
+//
+
+#ifdef BLIS_BLAS3_CALLS_TAPI
+
+#undef  GENTFUNCCO
+#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
+/* Typed-API (BLIS_BLAS3_CALLS_TAPI) variant of the BLAS gemm3m wrapper. */ \
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	trans_t blis_transa; \
+	trans_t blis_transb; \
+	dim_t   m0, n0, k0; \
+\
+	/* Initialize BLIS. */ \
+	bli_init_auto(); \
+\
+	/* Perform BLAS parameter checking. */ \
+	PASTEBLACHK(blasname) \
+	( \
+	  MKSTR(ch), \
+	  MKSTR(blasname), \
+	  transa, \
+	  transb, \
+	  m, \
+	  n, \
+	  k, \
+	  lda, \
+	  ldb, \
+	  ldc  \
+	); \
+\
+	/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
+	bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
+	bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
+\
+	/* Typecast BLAS integers to BLIS integers. */ \
+	bli_convert_blas_dim1( *m, m0 ); \
+	bli_convert_blas_dim1( *n, n0 ); \
+	bli_convert_blas_dim1( *k, k0 ); \
+\
+	/* Set the row and column strides of the matrix operands. */ \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
+	const inc_t rs_b = 1; \
+	const inc_t cs_b = *ldb; \
+	const inc_t rs_c = 1; \
+	const inc_t cs_c = *ldc; \
+\
+	/* As a placeholder, invoke 1m since BLIS no longer contains an
+	   official 3m implementation. This variant does so by calling the
+	   expert typed API with a context queried for BLIS_1M and a local
+	   rntm_t on which sup handling has been disabled. */ \
+	{ \
+		const num_t dt   = PASTEMAC(ch,type); \
+		cntx_t*     cntx = bli_gks_query_ind_cntx( BLIS_1M, dt ); \
+		rntm_t  rntm_l; \
+		rntm_t* rntm = &rntm_l; \
+		bli_rntm_init_from_global( rntm ); \
+\
+		/* Note that we MUST disable sup handling since it could redirect
+		   execution for some problem sizes to a non-3m implementation. */ \
+		bli_rntm_disable_l3_sup( rntm ); \
+\
+		/* Call BLIS interface. */ \
+		PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+		( \
+		  blis_transa, \
+		  blis_transb, \
+		  m0, \
+		  n0, \
+		  k0, \
+		  (ftype*)alpha, \
+		  (ftype*)a, rs_a, cs_a, \
+		  (ftype*)b, rs_b, cs_b, \
+		  (ftype*)beta, \
+		  (ftype*)c, rs_c, cs_c, \
+		  cntx, \
+		  rntm  \
+		); \
+	} \
+\
+	/* Finalize BLIS. */ \
+	bli_finalize_auto(); \
+}
+
+#else
+
+#undef  GENTFUNCCO
+#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
+/* Object-API variant of the BLAS gemm3m wrapper (used when BLIS_BLAS3_CALLS_TAPI is not defined). */ \
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	trans_t blis_transa; \
+	trans_t blis_transb; \
+	dim_t   m0, n0, k0; \
+\
+	/* Initialize BLIS. */ \
+	bli_init_auto(); \
+\
+	/* Perform BLAS parameter checking. */ \
+	PASTEBLACHK(blasname) \
+	( \
+	  MKSTR(ch), \
+	  MKSTR(blasname), \
+	  transa, \
+	  transb, \
+	  m, \
+	  n, \
+	  k, \
+	  lda, \
+	  ldb, \
+	  ldc  \
+	); \
+\
+	/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
+	bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
+	bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
+\
+	/* Typecast BLAS integers to BLIS integers. */ \
+	bli_convert_blas_dim1( *m, m0 ); \
+	bli_convert_blas_dim1( *n, n0 ); \
+	bli_convert_blas_dim1( *k, k0 ); \
+\
+	/* Set the row and column strides of the matrix operands. */ \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
+	const inc_t rs_b = 1; \
+	const inc_t cs_b = *ldb; \
+	const inc_t rs_c = 1; \
+	const inc_t cs_c = *ldc; \
+\
+	const num_t dt     = PASTEMAC(ch,type); \
+	/* Wrap the scalars and matrix buffers in obj_t objects for the object API. */ \
+	obj_t       alphao = BLIS_OBJECT_INITIALIZER_1X1; \
+	obj_t       ao     = BLIS_OBJECT_INITIALIZER; \
+	obj_t       bo     = BLIS_OBJECT_INITIALIZER; \
+	obj_t       betao  = BLIS_OBJECT_INITIALIZER_1X1; \
+	obj_t       co     = BLIS_OBJECT_INITIALIZER; \
+\
+	dim_t       m0_a, n0_a; \
+	dim_t       m0_b, n0_b; \
+\
+	bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \
+	bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); \
+\
+	bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \
+	bli_obj_init_finish_1x1( dt, (ftype*)beta,  &betao  ); \
+\
+	bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \
+	bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \
+	bli_obj_init_finish( dt, m0,   n0,   (ftype*)c, rs_c, cs_c, &co ); \
+\
+	bli_obj_set_conjtrans( blis_transa, &ao ); \
+	bli_obj_set_conjtrans( blis_transb, &bo ); \
+\
+	/* As a placeholder, invoke 1m since BLIS no longer contains an
+	   official 3m implementation. Note that we do this by inlining an
+	   abbreviated version of bli_gemm_ex() so that we can bypass
+	   consideration of sup, which doesn't make sense in this context. */ \
+	{ \
+		cntx_t* cntx = bli_gks_query_ind_cntx( BLIS_1M, dt ); \
+\
+		rntm_t  rntm_l; \
+		rntm_t* rntm = &rntm_l; \
+		bli_rntm_init_from_global( &rntm_l ); \
+\
+		/* This is probably not needed given that we performed BLAS-style
+		   parameter checking above, but bli_gemm_check() is normally called
+		   in the normal course of bli_gemm_ex(). */ \
+		if ( bli_error_checking_is_enabled() ) \
+			bli_gemm_check( &alphao, &ao, &bo, &betao, &co, cntx ); \
+\
+		PASTEMAC(blisname,_front) \
+		( \
+		  &alphao, \
+		  &ao, \
+		  &bo, \
+		  &betao, \
+		  &co, \
+		  cntx, \
+		  rntm, \
+		  NULL  \
+		); \
+	} \
+\
+	/* Finalize BLIS. */ \
+	bli_finalize_auto(); \
+}
+
+#endif
+
+#ifdef BLIS_ENABLE_BLAS
+INSERT_GENTFUNCCO_BLAS( gemm3m, gemm )
+#endif
+
diff --git a/frame/compat/extra/bla_gemm3m.h b/frame/compat/extra/bla_gemm3m.h
new file mode 100644
index 000000000..86b7277c8
--- /dev/null
+++ b/frame/compat/extra/bla_gemm3m.h
@@ -0,0 +1,59 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+//
+// Prototype BLAS-to-BLIS interfaces.
+//
+#undef  GENTPROTCO
+#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
+\
+BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     );
+
+#ifdef BLIS_ENABLE_BLAS
+INSERT_GENTPROTCO_BLAS( gemm3m )
+#endif
+
diff --git a/frame/compat/extra/bla_gemm_batch.c b/frame/compat/extra/bla_gemm_batch.c
index be84572a3..4b2597e19 100644
--- a/frame/compat/extra/bla_gemm_batch.c
+++ b/frame/compat/extra/bla_gemm_batch.c
@@ -63,9 +63,6 @@ void PASTEF77(ch,blasname) \
 	trans_t blis_transa; \
 	trans_t blis_transb; \
 	dim_t   m0, n0, k0; \
-	inc_t   rs_a, cs_a; \
-	inc_t   rs_b, cs_b; \
-	inc_t   rs_c, cs_c; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -102,12 +99,12 @@ void PASTEF77(ch,blasname) \
 		bli_convert_blas_dim1( k_array[i], k0 ); \
 \
 		/* Set the row and column strides of the matrix operands. */ \
-		rs_a = 1; \
-		cs_a = lda_array[i]; \
-		rs_b = 1; \
-		cs_b = ldb_array[i]; \
-		rs_c = 1; \
-		cs_c = ldc_array[i]; \
+		const inc_t rs_a = 1; \
+		const inc_t cs_a = lda_array[i]; \
+		const inc_t rs_b = 1; \
+		const inc_t cs_b = ldb_array[i]; \
+		const inc_t rs_c = 1; \
+		const inc_t cs_c = ldc_array[i]; \
 \
 		for ( f77_int j = 0; j < group_size[i]; j++ ) \
 		{ \
diff --git a/frame/compat/bla_gemmt.c b/frame/compat/extra/bla_gemmt.c
similarity index 97%
rename from frame/compat/bla_gemmt.c
rename to frame/compat/extra/bla_gemmt.c
index 6f2439e9f..101cc6d13 100644
--- a/frame/compat/bla_gemmt.c
+++ b/frame/compat/extra/bla_gemmt.c
@@ -63,9 +63,6 @@ void PASTEF77(ch,blasname) \
 	trans_t blis_transa; \
 	trans_t blis_transb; \
 	dim_t   m0, k0; \
-	inc_t   rs_a, cs_a; \
-	inc_t   rs_b, cs_b; \
-	inc_t   rs_c, cs_c; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -95,12 +92,12 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_dim1( *k, k0 ); \
 \
 	/* Set the row and column strides of the matrix operands. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
-	rs_b = 1; \
-	cs_b = *ldb; \
-	rs_c = 1; \
-	cs_c = *ldc; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
+	const inc_t rs_b = 1; \
+	const inc_t cs_b = *ldb; \
+	const inc_t rs_c = 1; \
+	const inc_t cs_c = *ldc; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_gemmt.h b/frame/compat/extra/bla_gemmt.h
similarity index 100%
rename from frame/compat/bla_gemmt.h
rename to frame/compat/extra/bla_gemmt.h
diff --git a/test/Makefile b/test/Makefile
index ae998ccde..361cd2ff8 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -145,7 +145,7 @@ CFLAGS         += -I$(TEST_SRC_PATH)
 # Define the operations we will test.
 TEST_OPS := dotv axpyv axpbyv\
             gemv ger hemv her her2 trmv trsv \
-            gemm gemm_batch hemm herk her2k trmm trsm
+            gemm gemm3m gemm_batch hemm herk her2k trmm trsm
 
 # Optionally test gemmt, which some libraries might not implement.
 ifeq ($(BUILD_GEMMT),yes)
diff --git a/test/test_gemm3m.c b/test/test_gemm3m.c
new file mode 100644
index 000000000..8e7042901
--- /dev/null
+++ b/test/test_gemm3m.c
@@ -0,0 +1,352 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef WIN32
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+#include "blis.h"
+#include "cblas.h"
+
+#define CBLAS
+//#define FILE_IN_OUT
+//#define PRINT
+#define MATRIX_INITIALISATION
+
+int main( int argc, char** argv )
+{
+	obj_t a, b, c;
+	obj_t c_save;
+	obj_t alpha, beta;
+	dim_t m, n, k;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int   m_input, n_input, k_input;
+	num_t dt;
+	int   r, n_repeats;
+	trans_t  transa;
+	trans_t  transb;
+	f77_char f77_transa;
+	f77_char f77_transb;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+#ifdef FILE_IN_OUT
+	FILE* fin  = NULL;
+	FILE* fout = NULL;
+#endif
+	//bli_init();
+	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+	n_repeats = 3;
+
+#ifndef PRINT
+	p_begin = 200;
+	p_end   = 2000;
+	p_inc   = 100;
+
+	m_input = -1;
+	n_input = -1;
+	k_input = -1;
+#else
+	p_begin = 16;
+	p_end   = 16;
+	p_inc   = 1;
+
+	m_input = 5;
+	k_input = 6;
+	n_input = 4;
+#endif
+
+	dt = BLIS_SCOMPLEX;
+	//dt = BLIS_DCOMPLEX;
+
+	transa = BLIS_NO_TRANSPOSE;
+	transb = BLIS_NO_TRANSPOSE;
+
+	bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
+	bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
+
+	// printf("BLIS Library version is : %s\n", bli_info_get_version_str());
+
+#ifdef FILE_IN_OUT
+	if ( argc < 3 )
+	{
+		printf( "Usage: ./test_gemm_XX.x input.csv output.csv\n" );
+		exit(1);
+	}
+	fin = fopen( argv[1], "r" );
+	if ( fin == NULL )
+	{
+		printf( "Error opening the file %s\n", argv[1] );
+		exit(1);
+	}
+	fout = fopen( argv[2], "w" );
+	if ( fout == NULL )
+	{
+		printf( "Error opening output file %s\n", argv[2] );
+		exit(1);
+	}
+
+	fprintf( fout, "m\t k\t n\t cs_a\t cs_b\t cs_c\t gflops\t GEMM_Algo\n" );
+	printf( "~~~~~~~~~~_BLAS\t m\t k\t n\t cs_a\t cs_b\t cs_c \t gflops\t GEMM_Algo\n" );
+
+	inc_t cs_a;
+	inc_t cs_b;
+	inc_t cs_c;
+
+	while ( fscanf(fin, "%lld %lld %lld %lld %lld %lld\n", &m, &k, &n, &cs_a, &cs_b, &cs_c) == 6 )
+	{
+		if ( ( m > cs_a ) ||
+		     ( k > cs_b ) ||
+		     ( m > cs_c ) ) continue; // leading dimension should be greater than number of rows
+
+		bli_obj_create( dt, 1, 1, 0, 0, &alpha);
+		bli_obj_create( dt, 1, 1, 0, 0, &beta );
+
+		bli_obj_create( dt, m, k, 1, cs_a, &a );
+		bli_obj_create( dt, k, n, 1, cs_b, &b );
+		bli_obj_create( dt, m, n, 1, cs_c, &c );
+		bli_obj_create( dt, m, n, 1, cs_c, &c_save );
+#ifdef MATRIX_INITIALISATION
+		bli_randm( &a );
+		bli_randm( &b );
+		bli_randm( &c );
+#endif
+		bli_obj_set_conjtrans( transa, &a);
+		bli_obj_set_conjtrans( transb, &b);
+
+		//bli_setsc( 0.0, -1, &alpha );
+		//bli_setsc( 0.0, 1, &beta );
+
+		bli_setsc( -1, 0.0, &alpha );
+		bli_setsc( 1, 0.0, &beta );
+
+#else
+	for ( p = p_begin; p <= p_end; p += p_inc )
+	{
+		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+		else               m =     ( dim_t )    m_input;
+		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+		else               n =     ( dim_t )    n_input;
+		if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
+		else               k =     ( dim_t )    k_input;
+
+		bli_obj_create( dt, 1, 1, 0, 0, &alpha );
+		bli_obj_create( dt, 1, 1, 0, 0, &beta );
+
+		bli_obj_create( dt, m, k, 0, 0, &a );
+		bli_obj_create( dt, k, n, 0, 0, &b );
+		bli_obj_create( dt, m, n, 0, 0, &c );
+		bli_obj_create( dt, m, n, 0, 0, &c_save );
+#ifdef MATRIX_INITIALISATION
+
+		bli_randm( &a );
+		bli_randm( &b );
+		bli_randm( &c );
+#endif
+		bli_obj_set_conjtrans( transa, &a );
+		bli_obj_set_conjtrans( transb, &b );
+
+		bli_setsc(  (0.9/1.0), 0.2, &alpha );
+		bli_setsc( -(1.1/1.0), 0.3, &beta );
+
+#endif
+		bli_copym( &c, &c_save );
+
+		dtime_save = DBL_MAX;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );
+
+			dtime = bli_clock();
+
+
+#ifdef PRINT
+			bli_printm( "a", &a, "%4.1f", "" );
+			bli_printm( "b", &b, "%4.1f", "" );
+			bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#ifndef CBLAS
+    
+			if ( bli_is_scomplex( dt ) )
+			{
+				f77_int  mm     = bli_obj_length( &c );
+				f77_int  kk     = bli_obj_width_after_trans( &a );
+				f77_int  nn     = bli_obj_width( &c );
+				f77_int  lda    = bli_obj_col_stride( &a );
+				f77_int  ldb    = bli_obj_col_stride( &b );
+				f77_int  ldc    = bli_obj_col_stride( &c );
+				scomplex*  alphap = bli_obj_buffer( &alpha );
+				scomplex*  ap     = bli_obj_buffer( &a );
+				scomplex*  bp     = bli_obj_buffer( &b );
+				scomplex*  betap  = bli_obj_buffer( &beta );
+				scomplex*  cp     = bli_obj_buffer( &c );
+
+				cgemm3m_( &f77_transa,
+				        &f77_transb,
+				        &mm,
+				        &nn,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        bp, &ldb,
+				        betap,
+				        cp, &ldc );
+			}
+			else if ( bli_is_dcomplex( dt ) )
+			{
+				f77_int  mm     = bli_obj_length( &c );
+				f77_int  kk     = bli_obj_width_after_trans( &a );
+				f77_int  nn     = bli_obj_width( &c );
+				f77_int  lda    = bli_obj_col_stride( &a );
+				f77_int  ldb    = bli_obj_col_stride( &b );
+				f77_int  ldc    = bli_obj_col_stride( &c );
+				dcomplex*  alphap = bli_obj_buffer( &alpha );
+				dcomplex*  ap     = bli_obj_buffer( &a );
+				dcomplex*  bp     = bli_obj_buffer( &b );
+				dcomplex*  betap  = bli_obj_buffer( &beta );
+				dcomplex*  cp     = bli_obj_buffer( &c );
+
+				zgemm3m_( &f77_transa,
+				        &f77_transb,
+				        &mm,
+				        &nn,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        bp, &ldb,
+				        betap,
+				        cp, &ldc );
+			}
+#else
+			if ( bli_is_scomplex( dt ) )
+			{
+				scomplex*   ap     = bli_obj_buffer( &a );
+				scomplex*   bp     = bli_obj_buffer( &b );
+				scomplex*   cp     = bli_obj_buffer( &c );    
+				scomplex*   alphap = bli_obj_buffer( &alpha );
+				scomplex*   betap  = bli_obj_buffer( &beta );
+				cblas_cgemm3m( CblasColMajor,
+				               CblasNoTrans,
+				               CblasNoTrans,
+				               m,
+				               n,
+				               k,
+				               (const void*)alphap,
+				               ap, m,
+				               bp, k,
+				               (const void*)betap,
+				               cp, m );
+			}
+			else if (bli_is_dcomplex(dt))
+			{
+				dcomplex*   ap     = bli_obj_buffer( &a );
+				dcomplex*   bp     = bli_obj_buffer( &b );
+				dcomplex*   cp     = bli_obj_buffer( &c );    
+				dcomplex*    alphap = bli_obj_buffer( &alpha );
+				dcomplex*    betap  = bli_obj_buffer( &beta );
+				cblas_zgemm3m( CblasColMajor,
+				               CblasNoTrans,
+				               CblasNoTrans,
+				               m,
+				               n,
+				               k,
+				               (const void*)alphap,
+				               ap, m,
+				               bp, k,
+				               (const void*)betap,
+				               cp, m );
+			}
+#endif    
+
+#ifdef PRINT
+			bli_printm( "c after", &c, "%4.6f", "" );
+			exit(1);
+#endif
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
+
+		gflops *= 4.0; //to represent complex ops in gflops
+
+#ifdef BLIS
+		printf( "data_gemm_blis" );
+#else
+		printf( "data_gemm_%s", BLAS );
+#endif
+
+#ifdef FILE_IN_OUT
+
+		printf("%6lu \t %4lu \t %4lu \t %4lu \t %4lu \t %4lu \t %6.3f\n", \
+		        ( unsigned long )m,
+		        ( unsigned long )k,
+		       ( unsigned long )n, (unsigned long)cs_a, (unsigned long)cs_b, (unsigned long)cs_c,  gflops);
+
+
+		fprintf(fout, "%6lu \t %4lu \t %4lu \t %4lu \t %4lu \t %4lu \t %6.3f \n", \
+		        ( unsigned long )m,
+		        ( unsigned long )k,
+		        ( unsigned long )n, (unsigned long)cs_a, (unsigned long)cs_b, (unsigned long)cs_c,  gflops);
+		fflush(fout);
+
+#else
+		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )k,
+		        ( unsigned long )n, gflops );
+#endif
+		bli_obj_free( &alpha );
+		bli_obj_free( &beta );
+
+		bli_obj_free( &a );
+		bli_obj_free( &b );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	//bli_finalize();
+#ifdef FILE_IN_OUT
+	fclose( fin );
+	fclose( fout );
+#endif
+	return 0;
+}

From 1ec020b33ece1681c0041e2549eed2bd4c6cf356 Mon Sep 17 00:00:00 2001
From: Dipal M Zambare <71366780+dzambare@users.noreply.github.com>
Date: Wed, 30 Mar 2022 02:45:36 +0530
Subject: [PATCH 042/230] AMD kernel updates; frame-specific AMD updates.
 (#597)

Details:
- Allow building BLIS with certain framework files (each with the '_amd'
  suffix) that have been customized by AMD for Zen-based hardware. These
  customized files were derived from portable versions of the same files
  (i.e., those without the '_amd' suffix). Whether the portable or AMD-
  specific files are compiled is now controlled by a new configure
  option, --[en|dis]able-amd-frame-tweaks. This option is disabled by
  default in vanilla BLIS, though AMD may choose to enable it by default
  in their fork. For now, the added AMD-specific files are:
  - bli_gemv_unf_var2_amd.c
  - bla_copy_amd.c
  - bla_gemv_amd.c
  These files reside in 'amd' subdirectories found within the directory
  housing their generic counterparts.
- Register optimized real-domain copyv, setv, and swapv kernels in
  bli_cntx_init_zen.c.
- Various minor updates to level-1v kernels in 'zen' kernel set.
- Added caxpyf kernel as well as saxpyf and multiple daxpyf kernels to
  the 'zen' kernel set.
- If the problem passed to ?gemm_() in bla_gemm.c has a unit m or n dim,
  call gemv instead and return early.
- Combined variable declarations with their initialization in various
  level-2 and level-3 BLAS compatibility files, and also inserted
  'const' qualifier in those same declaration statements.
- Moved frame/compat/bla_gemmt.c and .h to frame/compat/extra/ .
- Added copyv and swapv test drivers to 'test' directory.
- Whitespace, comment changes.
---
 Makefile                                 |   23 +
 build/config.mk.in                       |    4 +
 config/amd64/bli_family_amd64.h          |    6 -
 config/zen/bli_cntx_init_zen.c           |   11 +-
 config/zen2/bli_cntx_init_zen2.c         |    5 +-
 config/zen3/bli_cntx_init_zen3.c         |    9 +-
 config/zen3/bli_family_zen3.h            |    5 +-
 configure                                |   50 +-
 frame/2/gemv/amd/bli_gemv_unf_var2_amd.c |  222 ++++
 frame/compat/amd/bla_copy_amd.c          |  147 +++
 frame/compat/amd/bla_gemv_amd.c          |  172 +++
 frame/compat/bla_gemm.c                  |   59 +-
 frame/compat/bla_gemv.c                  |   27 +-
 frame/compat/bla_ger.c                   |    5 +-
 frame/compat/bla_hemm.c                  |   15 +-
 frame/compat/bla_hemv.c                  |    5 +-
 frame/compat/bla_her.c                   |    5 +-
 frame/compat/bla_her2.c                  |    5 +-
 frame/compat/bla_her2k.c                 |   15 +-
 frame/compat/bla_herk.c                  |   10 +-
 frame/compat/bla_symm.c                  |   15 +-
 frame/compat/bla_symv.c                  |    5 +-
 frame/compat/bla_syr.c                   |    5 +-
 frame/compat/bla_syr2.c                  |    5 +-
 frame/compat/bla_syr2k.c                 |   15 +-
 frame/compat/bla_syrk.c                  |   10 +-
 frame/compat/bla_trmm.c                  |   10 +-
 frame/compat/bla_trmv.c                  |    5 +-
 frame/compat/bla_trsm.c                  |   10 +-
 frame/compat/bla_trsv.c                  |    5 +-
 kernels/zen/1/bli_scalv_zen_int10.c      |   62 +-
 kernels/zen/1f/bli_axpyf_zen_int_4.c     |  277 +++++
 kernels/zen/1f/bli_axpyf_zen_int_5.c     | 1231 ++++++++++++++++++++++
 kernels/zen/bli_kernels_zen.h            |   14 +-
 kernels/zen2/1f/bli_axpyf_zen_int_5.c    |  599 -----------
 test/test_copyv.c                        |  218 ++++
 test/test_swapv.c                        |  180 ++++
 37 files changed, 2716 insertions(+), 750 deletions(-)
 create mode 100644 frame/2/gemv/amd/bli_gemv_unf_var2_amd.c
 create mode 100644 frame/compat/amd/bla_copy_amd.c
 create mode 100644 frame/compat/amd/bla_gemv_amd.c
 create mode 100644 kernels/zen/1f/bli_axpyf_zen_int_4.c
 create mode 100644 kernels/zen/1f/bli_axpyf_zen_int_5.c
 delete mode 100644 kernels/zen2/1f/bli_axpyf_zen_int_5.c
 create mode 100644 test/test_copyv.c
 create mode 100644 test/test_swapv.c

diff --git a/Makefile b/Makefile
index 992983328..5605dd8fc 100644
--- a/Makefile
+++ b/Makefile
@@ -5,6 +5,7 @@
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
+#  Copyright (C) 2022, Advanced Micro Devices, Inc.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -219,6 +220,28 @@ MK_ADDON_OBJS       := $(call gen-obj-paths-from-src,$(ADDON_SRC_SUFS),$(MK_ADDO
 # enabled a configure-time, this variable will we empty.
 MK_SANDBOX_OBJS     := $(call gen-obj-paths-from-src,$(SANDBOX_SRC_SUFS),$(MK_SANDBOX_SRC),$(SANDBOX_PATH),$(BASE_OBJ_SANDBOX_PATH))
 
+# AMD has chosen to introduce AOCL-specific optimizations to certain BLIS
+# framework files that are otherwise intended to remain generic. Upstream
+# developers of vanilla BLIS have agreed to integrate some of these
+# optimizations, but in a way that keeps the AOCL-specific code segregated
+# in separate files containing the suffix '_amd'. For example, the BLAS
+# compatibility layer in vanilla BLIS contains a generic file named
+# 'bla_gemm.c'. AMD's version of this file is named 'bla_gemm_amd.c'.
+# Only one or the other is ever built and included in libblis. Currently,
+# these files are chosen automatically based on the target configuration.
+ifeq ($(ENABLE_AMD_FRAME_TWEAKS),yes)
+# Build is being done for AMD platforms; remove the objects which DO NOT have
+# an "_amd" suffix.
+MK_FRAME_AMD_OBJS  := $(filter $(BASE_OBJ_FRAME_PATH)/%amd.o, $(MK_FRAME_OBJS))
+FILES_TO_REMOVE := $(subst _amd.o,.o, $(MK_FRAME_AMD_OBJS))
+MK_FRAME_OBJS := $(filter-out $(FILES_TO_REMOVE), $(MK_FRAME_OBJS))
+else
+# Build is being done for non-AMD platforms; remove the objects which DO have
+# an "_amd" suffix.
+MK_FRAME_AMD_OBJS  := $(filter $(BASE_OBJ_FRAME_PATH)/%amd.o, $(MK_FRAME_OBJS))
+MK_FRAME_OBJS := $(filter-out $(MK_FRAME_AMD_OBJS), $(MK_FRAME_OBJS))
+endif
+
 # Combine all of the object files into some readily-accessible variables.
 MK_BLIS_OBJS        := $(MK_CONFIG_OBJS) \
                        $(MK_KERNELS_OBJS) \
diff --git a/build/config.mk.in b/build/config.mk.in
index 79ecea653..56d6211c2 100644
--- a/build/config.mk.in
+++ b/build/config.mk.in
@@ -5,6 +5,7 @@
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
+#  Copyright (C) 2022, Advanced Micro Devices, Inc.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -203,5 +204,8 @@ SANDBOX           := @sandbox@
 # variable is set to the empty value.
 LIBPTHREAD        := @libpthread@
 
+# Whether we should use AMD-customized versions of certain framework files.
+ENABLE_AMD_FRAME_TWEAKS := @enable_amd_frame_tweaks@
+
 # end of ifndef CONFIG_MK_INCLUDED conditional block
 endif
diff --git a/config/amd64/bli_family_amd64.h b/config/amd64/bli_family_amd64.h
index ac10789aa..4791cceeb 100644
--- a/config/amd64/bli_family_amd64.h
+++ b/config/amd64/bli_family_amd64.h
@@ -35,11 +35,5 @@
 #ifndef BLIS_FAMILY_AMD64_H
 #define BLIS_FAMILY_AMD64_H
 
-// Enable framework optimizations for EPYC family processors.
-// With this macro defined, we can call kernels directly from
-// BLAS interfaces for levels 1 & 2.
-// This macro needs to be defined for all EPYC configurations.
-#define BLIS_CONFIG_EPYC
-
 #endif
 
diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c
index 615a31a04..1b16cd06f 100644
--- a/config/zen/bli_cntx_init_zen.c
+++ b/config/zen/bli_cntx_init_zen.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2020-2022, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -66,6 +66,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	  // gemmtrsm_u
 	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
 	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,  TRUE,
+
 	  cntx
 	);
 
@@ -98,13 +99,14 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	  // dotxf
 	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
 	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
+
 	  cntx
 	);
 
 	// Update the context with optimized level-1v kernels.
 	bli_cntx_set_l1v_kers
 	(
-	  10,
+	  16,
 
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
@@ -119,7 +121,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int10,
 #endif
 
-#if 0
+#if 1
 	  // copyv
 	  BLIS_COPYV_KER,  BLIS_FLOAT,  bli_scopyv_zen_int,
 	  BLIS_COPYV_KER,  BLIS_DOUBLE, bli_dcopyv_zen_int,
@@ -142,7 +144,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
 #endif
 
-#if 0
+#if 1
 	  // setv
 	  BLIS_SETV_KER,  BLIS_FLOAT,  bli_ssetv_zen_int,
 	  BLIS_SETV_KER,  BLIS_DOUBLE, bli_dsetv_zen_int,
@@ -151,6 +153,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	  BLIS_SWAPV_KER,  BLIS_FLOAT,  bli_sswapv_zen_int8,
 	  BLIS_SWAPV_KER,  BLIS_DOUBLE, bli_dswapv_zen_int8,
 #endif
+ 
 	  cntx
 	);
 
diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c
index 0964ce463..ba728602b 100644
--- a/config/zen2/bli_cntx_init_zen2.c
+++ b/config/zen2/bli_cntx_init_zen2.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2020-2022, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -64,6 +64,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	  // gemmtrsm_u
 	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
 	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,  TRUE,
+
 	  cntx
 	);
 
@@ -96,6 +97,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	  // dotxf
 	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
 	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
+
 	  cntx
 	);
 
@@ -135,6 +137,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	  //set
 	  BLIS_SETV_KER,  BLIS_FLOAT,  bli_ssetv_zen_int,
 	  BLIS_SETV_KER,  BLIS_DOUBLE, bli_dsetv_zen_int,
+
 	  cntx
 	);
 
diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c
index b5bbb05ed..0336ddc95 100644
--- a/config/zen3/bli_cntx_init_zen3.c
+++ b/config/zen3/bli_cntx_init_zen3.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -49,6 +49,7 @@ void bli_cntx_init_zen3( cntx_t* cntx )
 	bli_cntx_set_l3_nat_ukrs
 	(
 	  8,
+
 	  // gemm
 	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,       TRUE,
 	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,        TRUE,
@@ -62,6 +63,7 @@ void bli_cntx_init_zen3( cntx_t* cntx )
 	  // gemmtrsm_u
 	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
 	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,  TRUE,
+
 	  cntx
 	);
 
@@ -96,12 +98,15 @@ void bli_cntx_init_zen3( cntx_t* cntx )
 	bli_cntx_set_l1f_kers
 	(
 	  4,
+
 	  // axpyf
 	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_5,
 	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_5,
+
 	  // dotxf
 	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
 	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
+
 	  cntx
 	);
 
@@ -114,8 +119,6 @@ void bli_cntx_init_zen3( cntx_t* cntx )
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
 
-	  // axpyv
-
 	  // axpyv
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int10,
 	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int10,
diff --git a/config/zen3/bli_family_zen3.h b/config/zen3/bli_family_zen3.h
index 8487a7277..661313ca9 100644
--- a/config/zen3/bli_family_zen3.h
+++ b/config/zen3/bli_family_zen3.h
@@ -63,8 +63,9 @@
 
 #define BLIS_SMALL_MATRIX_THRES_TRSM   32768 //128(128+128) => m*(m+n)
 #define BLIS_SMALL_MATRIX_A_THRES_TRSM  128
-#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT    96
-#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT    128
+
+#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96
+#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128
 
 #define BLIS_ENABLE_SMALL_MATRIX_ROME
 #define BLIS_SMALL_MATRIX_THRES_ROME       400
diff --git a/configure b/configure
index 5f3e83eaa..f64aac705 100755
--- a/configure
+++ b/configure
@@ -5,7 +5,7 @@
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
-#  Copyright (C) 2020, Advanced Micro Devices, Inc.
+#  Copyright (C) 2020-2022, Advanced Micro Devices, Inc.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -270,6 +270,21 @@ print_usage()
 	echo "                 \"small\" depends on thresholds that may vary by sub-"
 	echo "                 configuration."
 	echo " "
+	echo "   --enable-amd-frame-tweaks, --disable-amd-frame-tweaks"
+	echo " "
+	echo "                 Enable building with certain framework files that have"
+	echo "                 been customized by AMD for Zen-based microarchitectures."
+	echo "                 The default counterparts of these files must be portable,"
+	echo "                 and so these customized files may provide some (typically"
+	echo "                 modest) performance improvement for some select operations"
+	echo "                 and/or APIs, though there may a few (tiny dimension) cases"
+	echo "                 where the improvement is more pronounced. Note that the"
+	echo "                 target configuration must be Zen-based (or 'amd64') for"
+	echo "                 this option to have any effect. (Also note that this"
+	echo "                 option is NOT to be confused with enabling AMD *kernels*,"
+	echo "                 which are determined by the BLIS subconfiguration used at"
+	echo "                 runtime.) By default, these customized files are disabled."
+	echo " "
 	echo "   -a NAME --enable-addon=NAME"
 	echo " "
 	echo "                 Enable the code provided by an addon. An addon consists"
@@ -2453,6 +2468,7 @@ main()
 	enable_mixed_dt='yes'
 	enable_mixed_dt_extra_mem='yes'
 	enable_sup_handling='yes'
+	enable_amd_frame_tweaks='no'
 	enable_memkind='' # The default memkind value is determined later on.
 	enable_trsm_preinversion='yes'
 	force_version='no'
@@ -2665,6 +2681,12 @@ main()
 						disable-sup-handling)
 							enable_sup_handling='no'
 							;;
+						enable-amd-frame-tweaks)
+							enable_amd_frame_tweaks='yes'
+							;;
+						disable-amd-frame-tweaks)
+							enable_amd_frame_tweaks='no'
+							;;
 						with-memkind)
 							enable_memkind='yes'
 							;;
@@ -3567,6 +3589,29 @@ main()
 		exit 1
 	fi
 
+	# Check whether we should use AMD-customized versions of certain framework
+	# files.
+	if [ "x${enable_amd_frame_tweaks}" = "xyes" ]; then
+
+		echo "${script_name}: AMD-specific framework files will be considered."
+		echo "${script_name}:   checking eligibility of target configuration."
+
+		# Make sure we are targeting either one of the zen subconfigs or the
+		# amd64 umbrella family.
+		uconf=$(echo ${config_name} | grep -c 'zen\|amd64')
+
+		if [[ $uconf == 0 ]]; then
+			echo "${script_name}:   target configuration '${config_name}' is not eligible."
+			echo "${script_name}:   disabling AMD-specific framework files."
+			enable_amd_frame_tweaks='no'
+		else
+			echo "${script_name}:   target configuration '${config_name}' is eligible."
+			echo "${script_name}:   enabling AMD-specific framework files."
+		fi
+	else
+		echo "${script_name}: AMD-specific framework files will not be considered."
+	fi
+
 	# Check if addons were given.
 	if [ -n "${addon_flag}" ]; then
 
@@ -3709,7 +3754,7 @@ main()
 	# Create a #define for the configuration family (config_name).
 	uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]')
 	config_name_define="#define BLIS_FAMILY_${uconf}\n"
-
+	
 	# Create a list of #defines, one for each configuration in config_list.
 	config_list_defines=""
 	for conf in ${config_list}; do
@@ -3820,6 +3865,7 @@ main()
 		| sed -e "s/@export_shared@/${export_shared}/g" \
 		| sed -e "s/@enable_blas@/${enable_blas}/g" \
 		| sed -e "s/@enable_cblas@/${enable_cblas}/g" \
+		| sed -e "s/@enable_amd_frame_tweaks@/${enable_amd_frame_tweaks}/g" \
 		| sed -e "s/@enable_memkind@/${enable_memkind}/g" \
 		| sed -e "s/@pragma_omp_simd@/${pragma_omp_simd}/g" \
 		| sed -e "s/@addon_list@/${addon_list}/g" \
diff --git a/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c b/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c
new file mode 100644
index 000000000..8f0f31479
--- /dev/null
+++ b/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c
@@ -0,0 +1,222 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2022, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname, scalvsuf, axpyfsuf, fusefac ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       trans_t transa, \
+       conj_t  conjx, \
+       dim_t   m, \
+       dim_t   n, \
+       ctype*  alpha, \
+       ctype*  a, inc_t rs_a, inc_t cs_a, \
+       ctype*  x, inc_t incx, \
+       ctype*  beta, \
+       ctype*  y, inc_t incy, \
+       cntx_t* cntx  \
+     ) \
+{ \
+	/*const num_t dt = PASTEMAC(ch,type);*/ \
+\
+	ctype*  A1; \
+	ctype*  x1; \
+	ctype*  y1; \
+	dim_t   i; \
+	dim_t   b_fuse, f; \
+	dim_t   n_elem, n_iter; \
+	inc_t   rs_at, cs_at; \
+	conj_t  conja; \
+\
+	bli_set_dims_incs_with_trans( transa, \
+	                              m, n, rs_a, cs_a, \
+	                              &n_elem, &n_iter, &rs_at, &cs_at ); \
+\
+	conja = bli_extract_conj( transa ); \
+\
+	/* y = beta * y; */ \
+	/* NOTE: We don't explicitly handle the case where beta == 0 here
+	   since that behavior is handled within the scalv kernel itself. */ \
+	PASTEMAC2(ch,scalv,scalvsuf) \
+	( \
+	  BLIS_NO_CONJUGATE, \
+	  n_elem, \
+	  beta, \
+	  y, incy, \
+	  cntx  \
+	); \
+\
+	/* If alpha == 0, then we are done. */ \
+	if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \
+\
+	/*PASTECH(ch,axpyf_ker_ft) kfp_af;*/ \
+\
+	/* Query the context for the kernel function pointer and fusing factor. */ \
+	/*kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );*/ \
+	/*b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );*/ \
+	b_fuse = fusefac; \
+\
+	for ( i = 0; i < n_iter; i += f ) \
+	{ \
+		f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
+\
+		A1 = a + (0  )*rs_at + (i  )*cs_at; \
+		x1 = x + (i  )*incx; \
+		y1 = y + (0  )*incy; \
+\
+		/* y = y + alpha * A1 * x1; */ \
+		/*kfp_af*/ \
+		PASTEMAC2(ch,axpyf,axpyfsuf) \
+		( \
+		  conja, \
+		  conjx, \
+		  n_elem, \
+		  f, \
+		  alpha, \
+		  A1, rs_at, cs_at, \
+		  x1, incx, \
+		  y1, incy, \
+		  cntx  \
+		); \
+	} \
+}
+
+//INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
+GENTFUNC( float,    s, gemv_unf_var2, _zen_int10, _zen_int_5,    5 )
+GENTFUNC( double,   d, gemv_unf_var2, _zen_int10, _zen_int_16x4, 4 )
+GENTFUNC( scomplex, c, gemv_unf_var2, _zen_int10, _zen_int_4,    4 )
+//GENTFUNC( dcomplex, z, gemv_unf_var2, _zen_int10, _ex,           1 )
+
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       trans_t transa, \
+       conj_t  conjx, \
+       dim_t   m, \
+       dim_t   n, \
+       ctype*  alpha, \
+       ctype*  a, inc_t rs_a, inc_t cs_a, \
+       ctype*  x, inc_t incx, \
+       ctype*  beta, \
+       ctype*  y, inc_t incy, \
+       cntx_t* cntx  \
+     ) \
+{ \
+	const num_t dt = PASTEMAC(ch,type); \
+\
+	ctype*  zero       = PASTEMAC(ch,0); \
+	ctype*  A1; \
+	ctype*  x1; \
+	ctype*  y1; \
+	dim_t   i; \
+	dim_t   b_fuse, f; \
+	dim_t   n_elem, n_iter; \
+	inc_t   rs_at, cs_at; \
+	conj_t  conja; \
+\
+	bli_set_dims_incs_with_trans( transa, \
+	                              m, n, rs_a, cs_a, \
+	                              &n_elem, &n_iter, &rs_at, &cs_at ); \
+\
+	conja = bli_extract_conj( transa ); \
+\
+	/* If beta is zero, use setv. Otherwise, scale by beta. */ \
+	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	{ \
+		/* y = 0; */ \
+		PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
+		( \
+		  BLIS_NO_CONJUGATE, \
+		  n_elem, \
+		  zero, \
+		  y, incy, \
+		  cntx, \
+		  NULL  \
+		); \
+	} \
+	else \
+	{ \
+		/* y = beta * y; */ \
+		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+		( \
+		  BLIS_NO_CONJUGATE, \
+		  n_elem, \
+		  beta, \
+		  y, incy, \
+		  cntx, \
+		  NULL  \
+		); \
+	} \
+\
+	PASTECH(ch,axpyf_ker_ft) kfp_af; \
+\
+	/* Query the context for the kernel function pointer and fusing factor. */ \
+	kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
+	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
+\
+	for ( i = 0; i < n_iter; i += f ) \
+	{ \
+		f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
+\
+		A1 = a + (0  )*rs_at + (i  )*cs_at; \
+		x1 = x + (i  )*incx; \
+		y1 = y + (0  )*incy; \
+\
+		/* y = y + alpha * A1 * x1; */ \
+		kfp_af \
+		( \
+		  conja, \
+		  conjx, \
+		  n_elem, \
+		  f, \
+		  alpha, \
+		  A1, rs_at, cs_at, \
+		  x1, incx, \
+		  y1, incy, \
+		  cntx  \
+		); \
+	} \
+}
+
+//INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
+GENTFUNC( dcomplex, z, gemv_unf_var2 )
+
diff --git a/frame/compat/amd/bla_copy_amd.c b/frame/compat/amd/bla_copy_amd.c
new file mode 100644
index 000000000..6780b555e
--- /dev/null
+++ b/frame/compat/amd/bla_copy_amd.c
@@ -0,0 +1,147 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2022, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+//
+// Define BLAS-to-BLIS interfaces.
+//
+#undef  GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname, isuf ) \
+\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype*   x, const f77_int* incx, \
+             ftype*   y, const f77_int* incy  \
+     ) \
+{ \
+	dim_t  n0; \
+	ftype* x0; \
+	ftype* y0; \
+	inc_t  incx0; \
+	inc_t  incy0; \
+\
+	/* Initialize BLIS. */ \
+	/*bli_init_auto()*/; \
+\
+	/* Convert/typecast negative values of n to zero. */ \
+	bli_convert_blas_dim1( *n, n0 ); \
+\
+	/* If the input increments are negative, adjust the pointers so we can
+	   use positive increments instead. */ \
+	bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
+	bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
+\
+	/* Call BLIS interface. */ \
+	/* NOTE: While we skip explicit initialization for real domain instances
+	   since we call the microkernel directly, the complex domain instances
+	   still need initialization so that they can query valid contexts from
+	   gks. However, the expert API will self-initialize before attempting
+	   to query a context, so the complex domain cases should work fine. */ \
+	PASTEMAC2(ch,blisname,isuf) \
+	( \
+	  BLIS_NO_CONJUGATE, \
+	  n0, \
+	  x0, incx0, \
+	  y0, incy0, \
+	  NULL  \
+	); \
+\
+	/* Finalize BLIS. */ \
+	/*bli_finalize_auto();*/ \
+}
+
+#ifdef BLIS_ENABLE_BLAS
+//INSERT_GENTFUNC_BLAS( copy, copyv )
+GENTFUNC( float,    s, copy, copyv, _zen_int )
+GENTFUNC( double,   d, copy, copyv, _zen_int )
+#endif
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname, isuf ) \
+\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype*   x, const f77_int* incx, \
+             ftype*   y, const f77_int* incy  \
+     ) \
+{ \
+	dim_t  n0; \
+	ftype* x0; \
+	ftype* y0; \
+	inc_t  incx0; \
+	inc_t  incy0; \
+\
+	/* Initialize BLIS. */ \
+	/*bli_init_auto()*/; \
+\
+	/* Convert/typecast negative values of n to zero. */ \
+	bli_convert_blas_dim1( *n, n0 ); \
+\
+	/* If the input increments are negative, adjust the pointers so we can
+	   use positive increments instead. */ \
+	bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
+	bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
+\
+	/* Call BLIS interface. */ \
+	/* NOTE: While we skip explicit initialization for real domain instances
+	   since we call the microkernel directly, the complex domain instances
+	   still need initialization so that they can query valid contexts from
+	   gks. However, the expert API will self-initialize before attempting
+	   to query a context, so the complex domain cases should work fine. */ \
+	PASTEMAC2(ch,blisname,isuf) \
+	( \
+	  BLIS_NO_CONJUGATE, \
+	  n0, \
+	  x0, incx0, \
+	  y0, incy0, \
+	  NULL, \
+	  NULL  \
+	); \
+\
+	/* Finalize BLIS. */ \
+	/*bli_finalize_auto();*/ \
+}
+
+#ifdef BLIS_ENABLE_BLAS
+//INSERT_GENTFUNC_BLAS( copy, copyv )
+GENTFUNC( scomplex, c, copy, copyv, _ex )
+GENTFUNC( dcomplex, z, copy, copyv, _ex )
+#endif
+
diff --git a/frame/compat/amd/bla_gemv_amd.c b/frame/compat/amd/bla_gemv_amd.c
new file mode 100644
index 000000000..398d1bf2c
--- /dev/null
+++ b/frame/compat/amd/bla_gemv_amd.c
@@ -0,0 +1,172 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2022, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+// Define the BLAS ?gemv interfaces. These wrappers emulate the netlib BLAS
+// early-return behavior and call the unblocked fused gemv variants directly
+// instead of going through gemv_ex().
+#undef  GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname ) \
+\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    x, const f77_int* incx, \
+       const ftype*    beta, \
+             ftype*    y, const f77_int* incy  \
+     ) \
+{ \
+	trans_t blis_transa; \
+	dim_t   m0, n0; \
+	dim_t   m_y, n_x; \
+	ftype*  x0; \
+	ftype*  y0; \
+	inc_t   incx0; \
+	inc_t   incy0; \
+\
+	/* Initialize BLIS (the explicit call is disabled in this wrapper). */ \
+	/*bli_init_auto();*/ \
+\
+	/* Perform BLAS parameter checking. */ \
+	PASTEBLACHK(blasname) \
+	( \
+	  MKSTR(ch), \
+	  MKSTR(blasname), \
+	  transa, \
+	  m, \
+	  n, \
+	  lda, \
+	  incx, \
+	  incy  \
+	); \
+\
+	/* BLAS handles cases where y has no elements as well as those where x has
+	   no elements. In the case of the former, it cannot do any work since
+	   the output vector is empty; but in the latter case, BLAS has peculiar
+	   semantics. When x has no elements (and transa(A) has no columns), BLAS
+	   returns immediately without performing any computation even if the
+	   number of elements of y (and rows of transa(A)) is non-zero, in which
+	   case any sane interpretation of gemv would have the operation
+	   reduce to y := beta * y. Here, we emulate the BLAS exactly so as to
+	   provide "bug-for-bug" compatibility. Note that this extreme level of
+	   compatibility would not be contemplated if it weren't for the fact
+	   that some BLAS unit tests actually check for this behavior. Also, it
+	   should be emphasized that BLIS, when called natively, does NOT exhibit
+	   this quirky behavior; it will scale y by beta as one would expect. */ \
+	if ( *m == 0 || *n == 0 ) \
+	{ \
+		/* Finalize BLIS. */ \
+		/*bli_finalize_auto();*/ \
+\
+		return; \
+	} \
+\
+	/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
+	bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
+\
+	/* Convert/typecast negative values of m and n to zero. */ \
+	bli_convert_blas_dim1( *m, m0 ); \
+	bli_convert_blas_dim1( *n, n0 ); \
+\
+	/* Determine the dimensions of x and y so we can adjust the increments,
+	   if necessary. */ \
+	bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \
+\
+	/* If the input increments are negative, adjust the pointers so we can
+	   use positive increments instead. */ \
+	bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \
+	bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \
+\
+	/* If alpha is zero, scale y by beta (via the expert scalv API) and return early. */ \
+	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
+	{ \
+		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+		( \
+		  BLIS_NO_CONJUGATE, \
+		  m_y, \
+		  ( ftype* )beta, \
+		  ( ftype* )y0, incy0, \
+		  NULL, \
+		  NULL  \
+		); \
+		return; \
+	} \
+\
+	/* Set the row and column strides of A (column-major, leading dimension lda). */ \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
+\
+	/* Declare a function pointer of the unblocked-variant type for the current operation. */ \
+	PASTECH2(ch,blisname,_unb_ft) f; \
+\
+	/* Choose the underlying implementation. */ \
+	if         ( bli_does_notrans( blis_transa ) )  f = PASTEMAC(ch,gemv_unf_var2); \
+	else /* if ( bli_does_trans( blis_transa ) ) */ f = PASTEMAC(ch,gemv_unf_var1); \
+\
+	/* Obtain a valid context from the gks. This is needed because these
+	   implementations of ?gemv_() skip calling gemv_ex() and instead
+	   call the unblocked fused variants directly. */ \
+	cntx_t* cntx = bli_gks_query_cntx(); \
+\
+	/* Invoke the variant chosen above, which loops over a level-1v or
+	   level-1f kernel to implement the current operation. */ \
+	f \
+	( \
+	  blis_transa, \
+	  BLIS_NO_CONJUGATE, \
+	  m0, \
+	  n0, \
+	  (ftype*)alpha, \
+	  (ftype*)a, rs_a, cs_a, \
+	  x0, incx0, \
+	  (ftype*)beta, \
+	  y0, incy0, \
+	  cntx  \
+	); \
+\
+	/* Finalize BLIS. */ \
+	/*bli_finalize_auto();*/ \
+}
+
+#ifdef BLIS_ENABLE_BLAS
+INSERT_GENTFUNC_BLAS( gemv, gemv )
+#endif
+
diff --git a/frame/compat/bla_gemm.c b/frame/compat/bla_gemm.c
index e04e48cf5..e71d4e2fc 100644
--- a/frame/compat/bla_gemm.c
+++ b/frame/compat/bla_gemm.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2022, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -62,9 +62,6 @@ void PASTEF77(ch,blasname) \
 	trans_t blis_transa; \
 	trans_t blis_transb; \
 	dim_t   m0, n0, k0; \
-	inc_t   rs_a, cs_a; \
-	inc_t   rs_b, cs_b; \
-	inc_t   rs_c, cs_c; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -94,12 +91,12 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_dim1( *k, k0 ); \
 \
 	/* Set the row and column strides of the matrix operands. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
-	rs_b = 1; \
-	cs_b = *ldb; \
-	rs_c = 1; \
-	cs_c = *ldc; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
+	const inc_t rs_b = 1; \
+	const inc_t cs_b = *ldb; \
+	const inc_t rs_c = 1; \
+	const inc_t cs_c = *ldc; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
@@ -179,6 +176,48 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_b = *ldb; \
 	const inc_t rs_c = 1; \
 	const inc_t cs_c = *ldc; \
+\
+	/* Handle special cases of m == 1 or n == 1 via gemv. */ \
+	if ( n0 == 1 ) \
+	{ \
+		dim_t m0t, k0t; \
+		bli_set_dims_with_trans( blis_transa, m0, k0, &m0t, &k0t ); \
+\
+		PASTEMAC2(ch,gemv,BLIS_TAPI_EX_SUF) \
+		( \
+		  blis_transa, \
+		  bli_extract_conj( blis_transb ), \
+		  m0t, k0t, \
+		  ( ftype* )alpha, \
+		  ( ftype* )a, rs_a, cs_a, \
+		  ( ftype* )b, ( bli_does_notrans( blis_transb ) ? rs_b : cs_b ), \
+		  ( ftype* )beta, \
+		            c, rs_c, \
+		  NULL, \
+		  NULL  \
+		); \
+		return; \
+	} \
+	else if ( m0 == 1 ) \
+	{ \
+		dim_t n0t, k0t; \
+		bli_set_dims_with_trans( blis_transb, n0, k0, &n0t, &k0t ); \
+\
+		PASTEMAC2(ch,gemv,BLIS_TAPI_EX_SUF) \
+		( \
+		  blis_transb, \
+		  bli_extract_conj( blis_transa ), \
+		  n0t, k0t, \
+		  ( ftype* )alpha, \
+		  ( ftype* )b, cs_b, rs_b, \
+		  ( ftype* )a, ( bli_does_notrans( blis_transa ) ? cs_a : rs_a ), \
+		  ( ftype* )beta, \
+		            c, cs_c, \
+		  NULL, \
+		  NULL  \
+		); \
+		return; \
+	} \
 \
 	const num_t dt     = PASTEMAC(ch,type); \
 \
diff --git a/frame/compat/bla_gemv.c b/frame/compat/bla_gemv.c
index 85c65dde4..8d730edd9 100644
--- a/frame/compat/bla_gemv.c
+++ b/frame/compat/bla_gemv.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2022, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -60,7 +61,6 @@ void PASTEF77(ch,blasname) \
 	ftype*  y0; \
 	inc_t   incx0; \
 	inc_t   incy0; \
-	inc_t   rs_a, cs_a; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -89,16 +89,19 @@ void PASTEF77(ch,blasname) \
 	   if necessary.*/ \
 	bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \
 \
-	/* BLAS handles cases where trans(A) has no columns, and x has no elements,
-	   in a peculiar way. In these situations, BLAS returns without performing
-	   any action, even though most sane interpretations of gemv would have the
-	   the operation reduce to y := beta * y. Here, we catch those cases that
-	   BLAS would normally mishandle and emulate the BLAS exactly so as to
+	/* BLAS handles cases where y has no elements as well as those where x has
+	   no elements. In the case of the former, it cannot do any work since
+	   the output vector is empty; but in the latter case, BLAS has peculiar
+	   semantics. When x has no elements (and transa(A) has no columns), BLAS
+	   returns immediately without performing any computation even if the
+	   number of elements of y (and rows of transa(A)) is non-zero, in which
+	   case any sane interpretation of gemv would have the operation
+	   reduce to y := beta * y. Here, we emulate the BLAS exactly so as to
 	   provide "bug-for-bug" compatibility. Note that this extreme level of
-	   compatibility would not be as much of an issue if it weren't for the
-	   fact that some BLAS test suites actually test for these cases. Also, it
-	   should be emphasized that BLIS, if called natively, does NOT exhibit
-	   this quirky behavior; it will scale y by beta, as one would expect. */ \
+	   compatibility would not be contemplated if it weren't for the fact
+	   that some BLAS unit tests actually check for this behavior. Also, it
+	   should be emphasized that BLIS, when called natively, does NOT exhibit
+	   this quirky behavior; it will scale y by beta as one would expect. */ \
 	if ( m_y > 0 && n_x == 0 ) \
 	{ \
 		/* Finalize BLIS. */ \
@@ -113,8 +116,8 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \
 \
 	/* Set the row and column strides of A. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_ger.c b/frame/compat/bla_ger.c
index db4f76f18..b558bfd94 100644
--- a/frame/compat/bla_ger.c
+++ b/frame/compat/bla_ger.c
@@ -56,7 +56,6 @@ void PASTEF772(ch,blasname,chc) \
 	ftype*  y0; \
 	inc_t   incx0; \
 	inc_t   incy0; \
-	inc_t   rs_a, cs_a; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -84,8 +83,8 @@ void PASTEF772(ch,blasname,chc) \
 	bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
 \
 	/* Set the row and column strides of A. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_hemm.c b/frame/compat/bla_hemm.c
index 6bfb13e18..9a4484a09 100644
--- a/frame/compat/bla_hemm.c
+++ b/frame/compat/bla_hemm.c
@@ -61,9 +61,6 @@ void PASTEF77(ch,blasname) \
 	side_t  blis_side; \
 	uplo_t  blis_uploa; \
 	dim_t   m0, n0; \
-	inc_t   rs_a, cs_a; \
-	inc_t   rs_b, cs_b; \
-	inc_t   rs_c, cs_c; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -91,12 +88,12 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_dim1( *n, n0 ); \
 \
 	/* Set the row and column strides of the matrix operands. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
-	rs_b = 1; \
-	cs_b = *ldb; \
-	rs_c = 1; \
-	cs_c = *ldc; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
+	const inc_t rs_b = 1; \
+	const inc_t cs_b = *ldb; \
+	const inc_t rs_c = 1; \
+	const inc_t cs_c = *ldc; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_hemv.c b/frame/compat/bla_hemv.c
index 944468278..d036c10e3 100644
--- a/frame/compat/bla_hemv.c
+++ b/frame/compat/bla_hemv.c
@@ -58,7 +58,6 @@ void PASTEF77(ch,blasname) \
 	ftype*  y0; \
 	inc_t   incx0; \
 	inc_t   incy0; \
-	inc_t   rs_a, cs_a; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -87,8 +86,8 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_incv( m0, (ftype*)y, *incy, y0, incy0 ); \
 \
 	/* Set the row and column strides of A. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_her.c b/frame/compat/bla_her.c
index ade3cbdda..512081d89 100644
--- a/frame/compat/bla_her.c
+++ b/frame/compat/bla_her.c
@@ -54,7 +54,6 @@ void PASTEF77(ch,blasname) \
 	dim_t   m0; \
 	ftype*  x0; \
 	inc_t   incx0; \
-	inc_t   rs_a, cs_a; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -81,8 +80,8 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \
 \
 	/* Set the row and column strides of A. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_her2.c b/frame/compat/bla_her2.c
index e3ed4ce31..7d99a6378 100644
--- a/frame/compat/bla_her2.c
+++ b/frame/compat/bla_her2.c
@@ -57,7 +57,6 @@ void PASTEF77(ch,blasname) \
 	ftype*  y0; \
 	inc_t   incx0; \
 	inc_t   incy0; \
-	inc_t   rs_a, cs_a; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -86,8 +85,8 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_incv( m0, (ftype*)y, *incy, y0, incy0 ); \
 \
 	/* Set the row and column strides of A. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_her2k.c b/frame/compat/bla_her2k.c
index df5121975..2a058dc02 100644
--- a/frame/compat/bla_her2k.c
+++ b/frame/compat/bla_her2k.c
@@ -61,9 +61,6 @@ void PASTEF77(ch,blasname) \
 	uplo_t  blis_uploc; \
 	trans_t blis_transa; \
 	dim_t   m0, k0; \
-	inc_t   rs_a, cs_a; \
-	inc_t   rs_b, cs_b; \
-	inc_t   rs_c, cs_c; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -108,12 +105,12 @@ void PASTEF77(ch,blasname) \
 	} \
 \
 	/* Set the row and column strides of the matrix operands. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
-	rs_b = 1; \
-	cs_b = *ldb; \
-	rs_c = 1; \
-	cs_c = *ldc; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
+	const inc_t rs_b = 1; \
+	const inc_t cs_b = *ldb; \
+	const inc_t rs_c = 1; \
+	const inc_t cs_c = *ldc; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_herk.c b/frame/compat/bla_herk.c
index d9c47f5af..8236e2032 100644
--- a/frame/compat/bla_herk.c
+++ b/frame/compat/bla_herk.c
@@ -60,8 +60,6 @@ void PASTEF77(ch,blasname) \
 	uplo_t  blis_uploc; \
 	trans_t blis_transa; \
 	dim_t   m0, k0; \
-	inc_t   rs_a, cs_a; \
-	inc_t   rs_c, cs_c; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -105,10 +103,10 @@ void PASTEF77(ch,blasname) \
 	} \
 \
 	/* Set the row and column strides of the matrix operands. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
-	rs_c = 1; \
-	cs_c = *ldc; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
+	const inc_t rs_c = 1; \
+	const inc_t cs_c = *ldc; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_symm.c b/frame/compat/bla_symm.c
index b4f0b66d0..098beb472 100644
--- a/frame/compat/bla_symm.c
+++ b/frame/compat/bla_symm.c
@@ -61,9 +61,6 @@ void PASTEF77(ch,blasname) \
 	side_t  blis_side; \
 	uplo_t  blis_uploa; \
 	dim_t   m0, n0; \
-	inc_t   rs_a, cs_a; \
-	inc_t   rs_b, cs_b; \
-	inc_t   rs_c, cs_c; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -91,12 +88,12 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_dim1( *n, n0 ); \
 \
 	/* Set the row and column strides of the matrix operands. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
-	rs_b = 1; \
-	cs_b = *ldb; \
-	rs_c = 1; \
-	cs_c = *ldc; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
+	const inc_t rs_b = 1; \
+	const inc_t cs_b = *ldb; \
+	const inc_t rs_c = 1; \
+	const inc_t cs_c = *ldc; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_symv.c b/frame/compat/bla_symv.c
index 79076194c..c5b5ebda3 100644
--- a/frame/compat/bla_symv.c
+++ b/frame/compat/bla_symv.c
@@ -58,7 +58,6 @@ void PASTEF77(ch,blasname) \
 	ftype*  y0; \
 	inc_t   incx0; \
 	inc_t   incy0; \
-	inc_t   rs_a, cs_a; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -87,8 +86,8 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_incv( m0, (ftype*)y, *incy, y0, incy0 ); \
 \
 	/* Set the row and column strides of A. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_syr.c b/frame/compat/bla_syr.c
index 0ed4aebb1..6732a75cf 100644
--- a/frame/compat/bla_syr.c
+++ b/frame/compat/bla_syr.c
@@ -54,7 +54,6 @@ void PASTEF77(ch,blasname) \
 	dim_t   m0; \
 	ftype*  x0; \
 	inc_t   incx0; \
-	inc_t   rs_a, cs_a; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -81,8 +80,8 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \
 \
 	/* Set the row and column strides of A. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_syr2.c b/frame/compat/bla_syr2.c
index dbae67027..7050c0488 100644
--- a/frame/compat/bla_syr2.c
+++ b/frame/compat/bla_syr2.c
@@ -57,7 +57,6 @@ void PASTEF77(ch,blasname) \
 	ftype*  y0; \
 	inc_t   incx0; \
 	inc_t   incy0; \
-	inc_t   rs_a, cs_a; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -87,8 +86,8 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_incv( m0, (ftype*)y, *incy, y0, incy0 ); \
 \
 	/* Set the row and column strides of A. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_syr2k.c b/frame/compat/bla_syr2k.c
index 35cfca9a3..2b26171b6 100644
--- a/frame/compat/bla_syr2k.c
+++ b/frame/compat/bla_syr2k.c
@@ -61,9 +61,6 @@ void PASTEF77(ch,blasname) \
 	uplo_t  blis_uploc; \
 	trans_t blis_transa; \
 	dim_t   m0, k0; \
-	inc_t   rs_a, cs_a; \
-	inc_t   rs_b, cs_b; \
-	inc_t   rs_c, cs_c; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -100,12 +97,12 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_dim1( *k, k0 ); \
 \
 	/* Set the row and column strides of the matrix operands. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
-	rs_b = 1; \
-	cs_b = *ldb; \
-	rs_c = 1; \
-	cs_c = *ldc; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
+	const inc_t rs_b = 1; \
+	const inc_t cs_b = *ldb; \
+	const inc_t rs_c = 1; \
+	const inc_t cs_c = *ldc; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_syrk.c b/frame/compat/bla_syrk.c
index 82ce2f166..4f3f15367 100644
--- a/frame/compat/bla_syrk.c
+++ b/frame/compat/bla_syrk.c
@@ -60,8 +60,6 @@ void PASTEF77(ch,blasname) \
 	uplo_t  blis_uploc; \
 	trans_t blis_transa; \
 	dim_t   m0, k0; \
-	inc_t   rs_a, cs_a; \
-	inc_t   rs_c, cs_c; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -97,10 +95,10 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_dim1( *k, k0 ); \
 \
 	/* Set the row and column strides of the matrix operands. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
-	rs_c = 1; \
-	cs_c = *ldc; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
+	const inc_t rs_c = 1; \
+	const inc_t cs_c = *ldc; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_trmm.c b/frame/compat/bla_trmm.c
index ce099dc59..b77a60dd6 100644
--- a/frame/compat/bla_trmm.c
+++ b/frame/compat/bla_trmm.c
@@ -63,8 +63,6 @@ void PASTEF77(ch,blasname) \
 	trans_t blis_transa; \
 	diag_t  blis_diaga; \
 	dim_t   m0, n0; \
-	inc_t   rs_a, cs_a; \
-	inc_t   rs_b, cs_b; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -95,10 +93,10 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_dim1( *n, n0 ); \
 \
 	/* Set the row and column strides of the matrix operands. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
-	rs_b = 1; \
-	cs_b = *ldb; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
+	const inc_t rs_b = 1; \
+	const inc_t cs_b = *ldb; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_trmv.c b/frame/compat/bla_trmv.c
index ffb31b12f..2821d4bfa 100644
--- a/frame/compat/bla_trmv.c
+++ b/frame/compat/bla_trmv.c
@@ -57,7 +57,6 @@ void PASTEF77(ch,blasname) \
 	dim_t   m0; \
 	ftype*  x0; \
 	inc_t   incx0; \
-	inc_t   rs_a, cs_a; \
 	ftype*  one_p; \
 \
 	/* Initialize BLIS. */ \
@@ -89,8 +88,8 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \
 \
 	/* Set the row and column strides of A. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
 \
 	/* Acquire a pointer to the global scalar constant BLIS_ONE. */ \
 	one_p = PASTEMAC(ch,1); \
diff --git a/frame/compat/bla_trsm.c b/frame/compat/bla_trsm.c
index c0d8e4b3e..9af008090 100644
--- a/frame/compat/bla_trsm.c
+++ b/frame/compat/bla_trsm.c
@@ -63,8 +63,6 @@ void PASTEF77(ch,blasname) \
 	trans_t blis_transa; \
 	diag_t  blis_diaga; \
 	dim_t   m0, n0; \
-	inc_t   rs_a, cs_a; \
-	inc_t   rs_b, cs_b; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
@@ -95,10 +93,10 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_dim1( *n, n0 ); \
 \
 	/* Set the row and column strides of the matrix operands. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
-	rs_b = 1; \
-	cs_b = *ldb; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
+	const inc_t rs_b = 1; \
+	const inc_t cs_b = *ldb; \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/bla_trsv.c b/frame/compat/bla_trsv.c
index 445059720..91132934e 100644
--- a/frame/compat/bla_trsv.c
+++ b/frame/compat/bla_trsv.c
@@ -57,7 +57,6 @@ void PASTEF77(ch,blasname) \
 	dim_t   m0; \
 	ftype*  x0; \
 	inc_t   incx0; \
-	inc_t   rs_a, cs_a; \
 	ftype*  one_p; \
 \
 	/* Initialize BLIS. */ \
@@ -89,8 +88,8 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \
 \
 	/* Set the row and column strides of A. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
 \
 	/* Acquire a pointer to the global scalar constant BLIS_ONE. */ \
 	one_p = PASTEMAC(ch,1); \
diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c
index c4096cbbc..c8488890f 100644
--- a/kernels/zen/1/bli_scalv_zen_int10.c
+++ b/kernels/zen/1/bli_scalv_zen_int10.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2017 - 2022, Advanced Micro Devices, Inc.
    Copyright (C) 2018, The University of Texas at Austin
 
    Redistribution and use in source and binary forms, with or without
@@ -81,16 +81,9 @@ void bli_sscalv_zen_int10
 	if ( PASTEMAC(s,eq0)( *alpha ) )
 	{
 		float* zero = bli_s0;
-#ifdef BLIS_CONFIG_ZEN2
-		bli_ssetv_zen_int
-		(
-		  BLIS_NO_CONJUGATE,
-		  n,
-		  zero,
-		  x, incx,
-		  cntx
-		);
-#else
+
+		if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+
 		ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
 		f
 		(
@@ -100,7 +93,7 @@ void bli_sscalv_zen_int10
 		  x, incx,
 		  cntx
 		);
-#endif
+		
 		return;
 	}
 
@@ -281,16 +274,9 @@ void bli_dscalv_zen_int10
 	if ( PASTEMAC(d,eq0)( *alpha ) )
 	{
 		double* zero = bli_d0;
-#ifdef BLIS_CONFIG_ZEN2
-		bli_dsetv_zen_int
-		(
-		  BLIS_NO_CONJUGATE,
-		  n,
-		  zero,
-		  x, incx,
-		  cntx
-		);
-#else
+
+		if( cntx == NULL ) cntx = bli_gks_query_cntx();
+
 		dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
 
 		f
@@ -301,7 +287,7 @@ void bli_dscalv_zen_int10
 		  x, incx,
 		  cntx
 		);
-#endif
+		
 		return;
 	}
 
@@ -454,3 +440,33 @@ void bli_dscalv_zen_int10
 	}
 }
 
+// -----------------------------------------------------------------------------
+
+// NOTE: This function definition is provided as a placeholder in order to allow
+// function names of scalv kernels to be hard-coded in bli_gemv_unf_var2_amd.c.
+// It forwards to the scomplex scalv kernel registered in cntx. A NULL cntx is
+// replaced by the context queried from the gks, as in the s/d kernels above.
+
+void bli_cscalv_zen_int10
+     (
+       conj_t             conjalpha,
+       dim_t              n,
+       scomplex* restrict alpha,
+       scomplex* restrict x, inc_t incx,
+       cntx_t*   restrict cntx
+     )
+{
+	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+
+	cscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_SCOMPLEX, BLIS_SCALV_KER, cntx );
+
+	f
+	(
+	  conjalpha,
+	  n,
+	  alpha,
+	  x, incx,
+	  cntx
+	);
+}
+
diff --git a/kernels/zen/1f/bli_axpyf_zen_int_4.c b/kernels/zen/1f/bli_axpyf_zen_int_4.c
new file mode 100644
index 000000000..5ddb56ac5
--- /dev/null
+++ b/kernels/zen/1f/bli_axpyf_zen_int_4.c
@@ -0,0 +1,277 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "immintrin.h"
+#include "blis.h"
+
+// caxpyf kernel (fusing factor 4): y := y + alpha * conja(A) * conjx(x), where A is m x b_n.
+ void bli_caxpyf_zen_int_4
+     (
+       conj_t           conja,
+       conj_t           conjx,
+       dim_t            m,
+       dim_t            b_n,
+       scomplex* restrict alpha,
+       scomplex* restrict a, inc_t inca, inc_t lda,
+       scomplex* restrict x, inc_t incx,
+       scomplex* restrict y, inc_t incy,
+       cntx_t* restrict cntx
+     )
+{
+    inc_t fuse_fac = 4;
+    inc_t i;
+
+    __m256 ymm0, ymm1, ymm2, ymm3;
+    __m256 ymm4, ymm5, ymm6, ymm7;
+    __m256 ymm8,       ymm10;
+    __m256 ymm12, ymm13;
+
+    float* ap[4];
+    float* y0 = (float*)y;
+
+    scomplex            chi0;
+    scomplex            chi1;
+    scomplex            chi2;
+    scomplex            chi3;
+
+    // Sign applied to imag(A) in the scalar loops: +1, or -1 when conja requests conjugation.
+    dim_t setPlusOne = 1;
+
+    if ( bli_is_conj(conja) )
+    {
+        setPlusOne = -1;
+    }
+    // If either dimension is zero, or if alpha is zero, return early.
+    if ( bli_zero_dim2( m, b_n ) || bli_ceq0( *alpha ) ) return;
+
+    // If b_n is not equal to the fusing factor, then perform the entire
+    // operation as a loop over axpyv.
+    if ( b_n != fuse_fac )
+    {
+	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+
+        caxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx );
+
+        for ( i = 0; i < b_n; ++i )
+        {
+            scomplex* a1   = a + (0  )*inca + (i  )*lda;
+            scomplex* chi1 = x + (i  )*incx;
+            scomplex* y1   = y + (0  )*incy;
+            scomplex  alpha_chi1;
+            // alpha_chi1 = alpha * conjx( chi1 ): the scalar handed to axpyv for column i.
+            bli_ccopycjs( conjx, *chi1, alpha_chi1 );
+            bli_cscals( *alpha, alpha_chi1 );
+
+            f
+            (
+              conja,
+              m,
+              &alpha_chi1,
+              a1, inca,
+              y1, incy,
+              cntx
+            );
+        }
+
+        return;
+    }
+
+
+    // At this point, we know that b_n is exactly equal to the fusing factor.
+    if(bli_is_noconj(conjx))
+    {
+        chi0 = *( x + 0*incx );
+        chi1 = *( x + 1*incx );
+        chi2 = *( x + 2*incx );
+        chi3 = *( x + 3*incx );
+    }
+    else
+    {
+        scomplex *pchi0 = x + 0*incx ;
+        scomplex *pchi1 = x + 1*incx ;
+        scomplex *pchi2 = x + 2*incx ;
+        scomplex *pchi3 = x + 3*incx ;
+
+        bli_ccopycjs( conjx, *pchi0, chi0 );
+        bli_ccopycjs( conjx, *pchi1, chi1 );
+        bli_ccopycjs( conjx, *pchi2, chi2 );
+        bli_ccopycjs( conjx, *pchi3, chi3 );
+    }
+
+    // Scale each chi scalar by alpha.
+    bli_cscals( *alpha, chi0 );
+    bli_cscals( *alpha, chi1 );
+    bli_cscals( *alpha, chi2 );
+    bli_cscals( *alpha, chi3 );
+    // From here on strides are counted in floats (two per scomplex), since A and y are walked via float pointers.
+    lda *= 2;
+    incx *= 2;
+    incy *= 2;
+    inca *= 2;
+
+    ap[0] = (float*)a;
+    ap[1] = (float*)a + lda;
+    ap[2] = ap[1] + lda;
+    ap[3] = ap[2] + lda;
+    // Vectorized path: the columns of A and the vector y are contiguous (unit stride in scomplex elements).
+    if( inca == 2 && incy == 2 )
+    {
+        inc_t n1 = m/4;
+        inc_t n2 = m%4;
+        // n1 vector iterations of 4 scomplex (8 floats) each; n2 leftover elements handled by scalar code.
+        ymm12 = _mm256_setzero_ps();
+        ymm13 = _mm256_setzero_ps();
+
+        // broadcast real & imag parts of the 4 alpha-scaled elements of x
+        ymm0 = _mm256_broadcast_ss(&chi0.real); // real part of x0
+        ymm1 = _mm256_broadcast_ss(&chi0.imag); // imag part of x0
+        ymm2 = _mm256_broadcast_ss(&chi1.real); // real part of x1
+        ymm3 = _mm256_broadcast_ss(&chi1.imag); // imag part of x1
+        ymm4 = _mm256_broadcast_ss(&chi2.real); // real part of x2
+        ymm5 = _mm256_broadcast_ss(&chi2.imag); // imag part of x2
+        ymm6 = _mm256_broadcast_ss(&chi3.real); // real part of x3
+        ymm7 = _mm256_broadcast_ss(&chi3.imag); // imag part of x3
+
+        for(i = 0; i < n1; i++)
+        {
+            //load first two columns of A
+     	    ymm8  = _mm256_loadu_ps(ap[0] + 0);
+            ymm10 = _mm256_loadu_ps(ap[1] + 0);
+            // ymm12 accumulates A * real(chi), ymm13 accumulates A * imag(chi), over the four columns.
+            ymm12 = _mm256_mul_ps(ymm8, ymm0);
+            ymm13 = _mm256_mul_ps(ymm8, ymm1);
+
+            ymm12 = _mm256_fmadd_ps(ymm10, ymm2, ymm12);
+            ymm13 = _mm256_fmadd_ps(ymm10, ymm3, ymm13);
+
+            //load 3rd and 4th columns of A
+            ymm8  = _mm256_loadu_ps(ap[2] + 0);
+            ymm10 = _mm256_loadu_ps(ap[3] + 0);
+
+            ymm12 = _mm256_fmadd_ps(ymm8, ymm4, ymm12);
+            ymm13 = _mm256_fmadd_ps(ymm8, ymm5, ymm13);
+
+            ymm12 = _mm256_fmadd_ps(ymm10, ymm6, ymm12);
+            ymm13 = _mm256_fmadd_ps(ymm10, ymm7, ymm13);
+
+            //load Y vector
+            ymm10 = _mm256_loadu_ps(y0 + 0);
+
+            if(bli_is_noconj(conja))
+            {
+                // noconj: swap the re/im lanes of ymm13, then addsub gives (ar*cr - ai*ci, ai*cr + ar*ci)
+                ymm13 = _mm256_permute_ps(ymm13, 0xB1);
+                ymm8 = _mm256_addsub_ps(ymm12, ymm13);
+            }
+            else
+            {
+                ymm12 = _mm256_permute_ps(ymm12, 0xB1);
+                ymm8 = _mm256_addsub_ps(ymm13, ymm12);
+                ymm8 = _mm256_permute_ps(ymm8, 0xB1);
+            }
+            // Add the four-column product to the loaded y values and store back.
+            ymm12 = _mm256_add_ps(ymm8, ymm10);
+
+            _mm256_storeu_ps((float*)(y0), ymm12);
+
+            y0 += 8;
+            ap[0] += 8;
+            ap[1] += 8;
+            ap[2] += 8;
+            ap[3] += 8;
+        }
+
+        // If there are leftover iterations, perform them with scalar code.
+
+        for ( i = 0; (i + 0) < n2 ; ++i )
+        {
+
+	    scomplex       y0c = *(scomplex*)y0;
+
+            const scomplex a0c = *(scomplex*)ap[0];
+            const scomplex a1c = *(scomplex*)ap[1];
+            const scomplex a2c = *(scomplex*)ap[2];
+            const scomplex a3c = *(scomplex*)ap[3];
+
+            y0c.real += chi0.real * a0c.real - chi0.imag * a0c.imag * setPlusOne;
+            y0c.real += chi1.real * a1c.real - chi1.imag * a1c.imag * setPlusOne;
+            y0c.real += chi2.real * a2c.real - chi2.imag * a2c.imag * setPlusOne;
+            y0c.real += chi3.real * a3c.real - chi3.imag * a3c.imag * setPlusOne;
+
+            y0c.imag += chi0.imag * a0c.real + chi0.real * a0c.imag * setPlusOne;
+            y0c.imag += chi1.imag * a1c.real + chi1.real * a1c.imag * setPlusOne;
+            y0c.imag += chi2.imag * a2c.real + chi2.real * a2c.imag * setPlusOne;
+            y0c.imag += chi3.imag * a3c.real + chi3.real * a3c.imag * setPlusOne;
+
+            *(scomplex*)y0 = y0c;
+
+            ap[0] += 2;
+            ap[1] += 2;
+            ap[2] += 2;
+            ap[3] += 2;
+            y0 += 2;
+        }
+    //PASTEMAC(c,fprintm)(stdout, "Y after A*x in axpyf",m, 1, (scomplex*)y, 1, 1, "%4.1f", "");
+
+    }
+    else // general-stride path: scalar loop over all m elements
+    {
+        for (i = 0 ; (i + 0) < m ; ++i )
+        {
+            scomplex       y0c = *(scomplex*)y0;
+            const scomplex a0c = *(scomplex*)ap[0];
+            const scomplex a1c = *(scomplex*)ap[1];
+            const scomplex a2c = *(scomplex*)ap[2];
+            const scomplex a3c = *(scomplex*)ap[3];
+
+            y0c.real += chi0.real * a0c.real - chi0.imag * a0c.imag * setPlusOne;
+            y0c.real += chi1.real * a1c.real - chi1.imag * a1c.imag * setPlusOne;
+            y0c.real += chi2.real * a2c.real - chi2.imag * a2c.imag * setPlusOne;
+            y0c.real += chi3.real * a3c.real - chi3.imag * a3c.imag * setPlusOne;
+
+            y0c.imag += chi0.imag * a0c.real + chi0.real * a0c.imag * setPlusOne;
+            y0c.imag += chi1.imag * a1c.real + chi1.real * a1c.imag * setPlusOne;
+            y0c.imag += chi2.imag * a2c.real + chi2.real * a2c.imag * setPlusOne;
+            y0c.imag += chi3.imag * a3c.real + chi3.real * a3c.imag * setPlusOne;
+
+            *(scomplex*)y0 = y0c;
+
+            ap[0] += inca;
+            ap[1] += inca;
+            ap[2] += inca;
+            ap[3] += inca;
+            y0 += incy;
+        }
+    }
+}
diff --git a/kernels/zen/1f/bli_axpyf_zen_int_5.c b/kernels/zen/1f/bli_axpyf_zen_int_5.c
new file mode 100644
index 000000000..15a64d596
--- /dev/null
+++ b/kernels/zen/1f/bli_axpyf_zen_int_5.c
@@ -0,0 +1,1231 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "immintrin.h"
+#include "blis.h"
+
+/* Union giving whole-register and per-element access to one AVX register.
+   One 256-bit AVX register holds 8 SP elements. */
+typedef union
+{
+    __m256  v;                                  // whole register, the form passed to the _mm256_*_ps intrinsics
+    float   f[8] __attribute__((aligned(64)));  // the same 8 floats, individually addressable; member requests 64-byte alignment
+} v8sf_t;
+
+/* Union giving whole-register, half-register and per-element access to one
+   AVX register. One 256-bit AVX register holds 4 DP elements. */
+typedef union
+{
+    __m256d v;                                  // whole register, the form passed to the _mm256_*_pd intrinsics
+    __m128d xmm[2];                             // the same storage viewed as two 128-bit halves; xmm[0] overlays d[0..1]
+    double  d[4] __attribute__((aligned(64)));  // the same 4 doubles, individually addressable; member requests 64-byte alignment
+} v4df_t;
+
+typedef union                                   // 128-bit counterpart of v4df_t: one __m128d holds 2 DP elements
+{
+    __m128d v;                                  // whole register, the form passed to the _mm_*_pd intrinsics
+    double  d[2] __attribute__((aligned(64)));  // the same 2 doubles, individually addressable; member requests 64-byte alignment
+} v2df_t;
+
+
+void bli_saxpyf_zen_int_5   // single-precision fused axpy: y += alpha * ( a(:,0..b_n-1) * x )
+     (
+       conj_t           conja,                     // only forwarded to the axpyv kernel in the fallback path
+       conj_t           conjx,                     // only applied (bli_scopycjs) in the fallback path
+       dim_t            m,                         // number of elements of y (rows of a) to update
+       dim_t            b_n,                       // number of columns of a / elements of x
+       float* restrict alpha,                      // scalar applied to every element of x
+       float* restrict a, inc_t inca, inc_t lda,   // column j starts at a + j*lda; rows advance by inca
+       float* restrict x, inc_t incx,              // element j is read from x + j*incx
+       float* restrict y, inc_t incy,              // updated in place, rows advance by incy
+       cntx_t* restrict cntx                       // NULL is replaced by bli_gks_query_cntx() in the fallback path
+     )
+{
+    const dim_t      fuse_fac       = 5;           // number of columns fused by the fast path
+
+    const dim_t      n_elem_per_reg = 8;           // floats per 256-bit register
+    const dim_t      n_iter_unroll  = 2;           // registers of y handled per main-loop iteration
+
+    dim_t            i;
+
+    float* restrict a0;
+    float* restrict a1;
+    float* restrict a2;
+    float* restrict a3;
+    float* restrict a4;
+
+    float* restrict y0;
+
+    v8sf_t           chi0v, chi1v, chi2v, chi3v;
+    v8sf_t           chi4v;
+
+    v8sf_t           a00v, a01v, a02v, a03v;
+    v8sf_t           a04v;
+
+    v8sf_t           a10v, a11v, a12v, a13v;
+    v8sf_t           a14v;
+
+    v8sf_t           y0v, y1v;
+
+    float           chi0, chi1, chi2, chi3;
+    float           chi4;
+
+    // If either dimension is zero, or if alpha is zero, return early.
+    if ( bli_zero_dim2( m, b_n ) || bli_seq0( *alpha ) ) return;
+
+    // If b_n is not equal to the fusing factor, then perform the entire
+    // operation as a loop over axpyv.
+    if ( b_n != fuse_fac )
+    {
+        if(cntx == NULL) cntx = bli_gks_query_cntx();   // callers may pass no context
+        saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
+
+        for ( i = 0; i < b_n; ++i )
+        {
+            float* a1   = a + (0  )*inca + (i  )*lda;   // column i of a (shadows the outer a1)
+            float* chi1 = x + (i  )*incx;               // element i of x
+            float* y1   = y + (0  )*incy;
+            float  alpha_chi1;
+
+            bli_scopycjs( conjx, *chi1, alpha_chi1 );
+            bli_sscals( *alpha, alpha_chi1 );           // alpha_chi1 = alpha * chi1
+
+            f
+            (
+              conja,
+              m,
+              &alpha_chi1,
+              a1, inca,
+              y1, incy,
+              cntx
+            );
+        }
+        
+        return;
+    }
+
+    // At this point, we know that b_n is exactly equal to the fusing factor.
+
+    a0   = a + 0*lda;
+    a1   = a + 1*lda;
+    a2   = a + 2*lda;
+    a3   = a + 3*lda;
+    a4   = a + 4*lda;
+    y0   = y;
+
+    chi0 = *( x + 0*incx );
+    chi1 = *( x + 1*incx );
+    chi2 = *( x + 2*incx );
+    chi3 = *( x + 3*incx );
+    chi4 = *( x + 4*incx );
+
+    // Note: conja and conjx are not consulted anywhere on this fused path.
+    // Scale each chi scalar by alpha.
+    bli_sscals( *alpha, chi0 );
+    bli_sscals( *alpha, chi1 );
+    bli_sscals( *alpha, chi2 );
+    bli_sscals( *alpha, chi3 );
+    bli_sscals( *alpha, chi4 );
+
+    // Broadcast the (alpha*chi?) scalars to all elements of vector registers.
+    chi0v.v = _mm256_broadcast_ss( &chi0 );
+    chi1v.v = _mm256_broadcast_ss( &chi1 );
+    chi2v.v = _mm256_broadcast_ss( &chi2 );
+    chi3v.v = _mm256_broadcast_ss( &chi3 );
+    chi4v.v = _mm256_broadcast_ss( &chi4 );
+
+    // With unit strides in both a and y, process 16, then 8, elements per
+    // iteration with vector instructions; other strides use the scalar loop.
+    if ( inca == 1 && incy == 1 )
+    {
+        for ( i = 0; (i + 15) < m; i += 16 )
+        {
+            // Load 16 elements of y and of each of the five columns of a.
+            y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_ps( a0 + 1*n_elem_per_reg );
+
+            a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_ps( a1 + 1*n_elem_per_reg );
+
+            a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg );
+            a12v.v = _mm256_loadu_ps( a2 + 1*n_elem_per_reg );
+
+            a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg );
+            a13v.v = _mm256_loadu_ps( a3 + 1*n_elem_per_reg );
+
+            a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg );
+            a14v.v = _mm256_loadu_ps( a4 + 1*n_elem_per_reg );
+
+            // perform : y += a(:,j) * (alpha*chi_j), for j = 0..4
+            y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v );
+            y1v.v = _mm256_fmadd_ps( a10v.v, chi0v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v );
+            y1v.v = _mm256_fmadd_ps( a11v.v, chi1v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v );
+            y1v.v = _mm256_fmadd_ps( a12v.v, chi2v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v );
+            y1v.v = _mm256_fmadd_ps( a13v.v, chi3v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v );
+            y1v.v = _mm256_fmadd_ps( a14v.v, chi4v.v, y1v.v );
+
+
+            // Store the output.
+            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), y1v.v );
+
+            y0 += n_iter_unroll * n_elem_per_reg;
+            a0 += n_iter_unroll * n_elem_per_reg;
+            a1 += n_iter_unroll * n_elem_per_reg;
+            a2 += n_iter_unroll * n_elem_per_reg;
+            a3 += n_iter_unroll * n_elem_per_reg;
+            a4 += n_iter_unroll * n_elem_per_reg;
+        }
+
+        for( ; (i + 7) < m; i += 8 )
+        {
+            // Load 8 elements of y and of each of the five columns of a.
+            y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
+            a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg );
+            a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg );
+            a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg );
+            a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg );
+
+
+            // perform : y += a(:,j) * (alpha*chi_j), for j = 0..4
+            y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v );
+            y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v );
+            y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v );
+            y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v );
+            y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v );
+
+            // Store the output.
+            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
+
+            y0 += n_elem_per_reg;
+            a0 += n_elem_per_reg;
+            a1 += n_elem_per_reg;
+            a2 += n_elem_per_reg;
+            a3 += n_elem_per_reg;
+            a4 += n_elem_per_reg;
+        }
+
+        // If there are leftover iterations, perform them with scalar code.
+        for ( ; (i + 0) < m ; ++i )
+        {
+            double       y0c = *y0;   // NOTE(review): double accumulator in the float kernel - confirm intended
+
+            const float a0c = *a0;
+            const float a1c = *a1;
+            const float a2c = *a2;
+            const float a3c = *a3;
+            const float a4c = *a4;
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+            y0c += chi2 * a2c;
+            y0c += chi3 * a3c;
+            y0c += chi4 * a4c;
+
+            *y0 = y0c;                // converted back to float on store
+
+            a0 += 1;
+            a1 += 1;
+            a2 += 1;
+            a3 += 1;
+            a4 += 1;
+            y0 += 1;
+        }
+    }
+    else
+    {
+        for ( i = 0; (i + 0) < m ; ++i )   // general-stride path: one element of y per iteration
+        {
+            double       y0c = *y0;   // NOTE(review): double accumulator in the float kernel - confirm intended
+
+            const float a0c = *a0;
+            const float a1c = *a1;
+            const float a2c = *a2;
+            const float a3c = *a3;
+            const float a4c = *a4;
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+            y0c += chi2 * a2c;
+            y0c += chi3 * a3c;
+            y0c += chi4 * a4c;
+
+            *y0 = y0c;                // converted back to float on store
+
+            a0 += inca;
+            a1 += inca;
+            a2 += inca;
+            a3 += inca;
+            a4 += inca;
+            y0 += incy;
+        }
+
+    }
+}
+
+
+// -----------------------------------------------------------------------------
+
+// Double-precision fused axpy, y += alpha * ( a(:,0..b_n-1) * x ), fused over
+// five columns of a; any other b_n is handled as b_n separate axpyv calls.
+void bli_daxpyf_zen_int_5
+     (
+       conj_t           conja,                      // only forwarded to the axpyv kernel in the fallback path
+       conj_t           conjx,                      // only applied (bli_dcopycjs) in the fallback path
+       dim_t            m,                          // number of elements of y (rows of a) to update
+       dim_t            b_n,                        // number of columns of a / elements of x
+       double* restrict alpha,                      // scalar applied to every element of x
+       double* restrict a, inc_t inca, inc_t lda,   // column j starts at a + j*lda; rows advance by inca
+       double* restrict x, inc_t incx,              // element j is read from x + j*incx
+       double* restrict y, inc_t incy,              // updated in place, rows advance by incy
+       cntx_t* restrict cntx                        // may be NULL; resolved in the fallback path
+     )
+{
+    const dim_t      fuse_fac       = 5;            // number of columns fused by the fast path
+
+    const dim_t      n_elem_per_reg = 4;            // doubles per 256-bit register
+    const dim_t      n_iter_unroll  = 2;            // registers of y handled per main-loop iteration
+
+    dim_t            i;
+
+    double* restrict a0;
+    double* restrict a1;
+    double* restrict a2;
+    double* restrict a3;
+    double* restrict a4;
+
+    double* restrict y0;
+
+    v4df_t           chi0v, chi1v, chi2v, chi3v;
+    v4df_t           chi4v;
+
+    v4df_t           a00v, a01v, a02v, a03v;
+    v4df_t           a04v;
+
+    v4df_t           a10v, a11v, a12v, a13v;
+    v4df_t           a14v;
+
+    v4df_t           y0v, y1v;
+
+    double           chi0, chi1, chi2, chi3;
+    double           chi4;
+
+    // If either dimension is zero, or if alpha is zero, return early.
+    if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return;
+
+    // If b_n is not equal to the fusing factor, then perform the entire
+    // operation as a loop over axpyv.
+    if ( b_n != fuse_fac )
+    {
+        // Resolve a missing context before it is dereferenced by the query.
+        if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+        daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+
+        for ( i = 0; i < b_n; ++i )
+        {
+            double* a1   = a + (0  )*inca + (i  )*lda;   // column i of a (shadows the outer a1)
+            double* chi1 = x + (i  )*incx;               // element i of x
+            double* y1   = y + (0  )*incy;
+            double  alpha_chi1;
+
+            bli_dcopycjs( conjx, *chi1, alpha_chi1 );
+            bli_dscals( *alpha, alpha_chi1 );            // alpha_chi1 = alpha * chi1
+
+            f
+            (
+              conja,
+              m,
+              &alpha_chi1,
+              a1, inca,
+              y1, incy,
+              cntx
+            );
+        }
+
+        return;
+    }
+
+    // At this point, we know that b_n is exactly equal to the fusing factor.
+
+    a0   = a + 0*lda;
+    a1   = a + 1*lda;
+    a2   = a + 2*lda;
+    a3   = a + 3*lda;
+    a4   = a + 4*lda;
+    y0   = y;
+
+    chi0 = *( x + 0*incx );
+    chi1 = *( x + 1*incx );
+    chi2 = *( x + 2*incx );
+    chi3 = *( x + 3*incx );
+    chi4 = *( x + 4*incx );
+
+    // Scale each chi scalar by alpha.
+    bli_dscals( *alpha, chi0 );
+    bli_dscals( *alpha, chi1 );
+    bli_dscals( *alpha, chi2 );
+    bli_dscals( *alpha, chi3 );
+    bli_dscals( *alpha, chi4 );
+
+    // Broadcast the (alpha*chi?) scalars to all elements of vector registers.
+    chi0v.v = _mm256_broadcast_sd( &chi0 );
+    chi1v.v = _mm256_broadcast_sd( &chi1 );
+    chi2v.v = _mm256_broadcast_sd( &chi2 );
+    chi3v.v = _mm256_broadcast_sd( &chi3 );
+    chi4v.v = _mm256_broadcast_sd( &chi4 );
+
+    // With unit strides in both a and y, process 8, then 4, elements per
+    // iteration with vector instructions; other strides use the scalar loop.
+    if ( inca == 1 && incy == 1 )
+    {
+        for ( i = 0; (i + 7) < m; i += 8 )
+        {
+            // Load 8 elements of y and of each of the five columns of a.
+            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
+
+            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
+
+            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
+            a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg );
+
+            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
+            a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg );
+
+            a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
+            a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg );
+
+            // perform : y += a(:,j) * (alpha*chi_j), for j = 0..4
+            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a14v.v, chi4v.v, y1v.v );
+
+            // Store the output.
+            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v );
+
+            y0 += n_iter_unroll * n_elem_per_reg;
+            a0 += n_iter_unroll * n_elem_per_reg;
+            a1 += n_iter_unroll * n_elem_per_reg;
+            a2 += n_iter_unroll * n_elem_per_reg;
+            a3 += n_iter_unroll * n_elem_per_reg;
+            a4 += n_iter_unroll * n_elem_per_reg;
+        }
+
+        for( ; (i + 3) < m; i += 4 )
+        {
+            // Load 4 elements of y and of each of the five columns of a.
+            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
+            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
+            a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
+
+            // perform : y += a(:,j) * (alpha*chi_j), for j = 0..4
+            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
+            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
+            y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
+            y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
+            y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
+
+            // Store the output.
+            _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
+
+            y0 += n_elem_per_reg;
+            a0 += n_elem_per_reg;
+            a1 += n_elem_per_reg;
+            a2 += n_elem_per_reg;
+            a3 += n_elem_per_reg;
+            a4 += n_elem_per_reg;
+        }
+
+        // If there are leftover iterations, perform them with scalar code.
+        for ( ; (i + 0) < m ; ++i )
+        {
+            double       y0c = *y0;
+
+            const double a0c = *a0;
+            const double a1c = *a1;
+            const double a2c = *a2;
+            const double a3c = *a3;
+            const double a4c = *a4;
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+            y0c += chi2 * a2c;
+            y0c += chi3 * a3c;
+            y0c += chi4 * a4c;
+
+            *y0 = y0c;
+
+            a0 += 1;
+            a1 += 1;
+            a2 += 1;
+            a3 += 1;
+            a4 += 1;
+            y0 += 1;
+        }
+    }
+    else
+    {
+        for ( i = 0; (i + 0) < m ; ++i )   // general-stride path: one element of y per iteration
+        {
+            double       y0c = *y0;
+
+            const double a0c = *a0;
+            const double a1c = *a1;
+            const double a2c = *a2;
+            const double a3c = *a3;
+            const double a4c = *a4;
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+            y0c += chi2 * a2c;
+            y0c += chi3 * a3c;
+            y0c += chi4 * a4c;
+
+            *y0 = y0c;
+
+            a0 += inca;
+            a1 += inca;
+            a2 += inca;
+            a3 += inca;
+            a4 += inca;
+            y0 += incy;
+        }
+
+    }
+}
+
+// -----------------------------------------------------------------------------
+
+// Double-precision fused axpy, y += alpha * ( a(:,0..b_n-1) * x ), fused over
+// two columns of a; any other b_n is handled as b_n separate axpyv calls.
+static void bli_daxpyf_zen_int_16x2
+     (
+       conj_t           conja,                      // only forwarded to the axpyv kernel in the fallback path
+       conj_t           conjx,                      // only applied (bli_dcopycjs) in the fallback path
+       dim_t            m,                          // number of elements of y (rows of a) to update
+       dim_t            b_n,                        // number of columns of a / elements of x
+       double* restrict alpha,                      // scalar applied to every element of x
+       double* restrict a, inc_t inca, inc_t lda,   // column j starts at a + j*lda; rows advance by inca
+       double* restrict x, inc_t incx,              // element j is read from x + j*incx
+       double* restrict y, inc_t incy,              // updated in place, rows advance by incy
+       cntx_t* restrict cntx                        // may be NULL; resolved in the fallback path
+     )
+{
+    const dim_t      fuse_fac       = 2;            // number of columns fused by the fast path
+
+    const dim_t      n_elem_per_reg = 4;            // doubles per 256-bit register
+    const dim_t      n_iter_unroll  = 4;            // registers of y handled per main-loop iteration
+
+    dim_t            i;
+
+    double* restrict a0;
+    double* restrict a1;
+
+    double* restrict y0;
+
+    v4df_t           chi0v, chi1v;
+
+    v4df_t           a00v, a01v;
+    v4df_t           a10v, a11v;
+    v4df_t           a20v, a21v;
+    v4df_t           a30v, a31v;
+
+    v4df_t           y0v, y1v, y2v, y3v;
+
+    double           chi0, chi1;
+
+    v2df_t           a40v, a41v;                    // 128-bit operands for the 2-element loop
+    v2df_t           y4v;
+
+    // If either dimension is zero, or if alpha is zero, return early.
+    if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return;
+
+    // If b_n is not equal to the fusing factor, then perform the entire
+    // operation as a loop over axpyv.
+    if ( b_n != fuse_fac )
+    {
+        // Resolve a missing context before it is dereferenced by the query.
+        if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+        daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+
+        for ( i = 0; i < b_n; ++i )
+        {
+            double* a1   = a + (0  )*inca + (i  )*lda;   // column i of a (shadows the outer a1)
+            double* chi1 = x + (i  )*incx;               // element i of x
+            double* y1   = y + (0  )*incy;
+            double  alpha_chi1;
+
+            bli_dcopycjs( conjx, *chi1, alpha_chi1 );
+            bli_dscals( *alpha, alpha_chi1 );            // alpha_chi1 = alpha * chi1
+
+            f
+            (
+              conja,
+              m,
+              &alpha_chi1,
+              a1, inca,
+              y1, incy,
+              cntx
+            );
+        }
+
+        return;
+    }
+
+    // At this point, we know that b_n is exactly equal to the fusing factor.
+
+    a0   = a + 0*lda;
+    a1   = a + 1*lda;
+
+    y0   = y;
+
+    chi0 = *( x + 0*incx );
+    chi1 = *( x + 1*incx );
+
+
+    // Scale each chi scalar by alpha.
+    bli_dscals( *alpha, chi0 );
+    bli_dscals( *alpha, chi1 );
+
+    // Broadcast the (alpha*chi?) scalars to all elements of vector registers.
+    chi0v.v = _mm256_broadcast_sd( &chi0 );
+    chi1v.v = _mm256_broadcast_sd( &chi1 );
+
+    // With unit strides in both a and y, process 16, 12, 8, 4 and then 2
+    // elements per iteration with vector instructions; else use scalar code.
+    if ( inca == 1 && incy == 1 )
+    {
+        for ( i = 0; (i + 15) < m; i += 16 )
+        {
+            // Load 16 elements of y and of both columns of a.
+            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+            y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+            y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
+            a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg );
+            a30v.v = _mm256_loadu_pd( a0 + 3*n_elem_per_reg );
+
+            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
+            a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg );
+            a31v.v = _mm256_loadu_pd( a1 + 3*n_elem_per_reg );
+
+            // perform : y += a(:,j) * (alpha*chi_j), for j = 0..1
+            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v );
+            y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v );
+            y3v.v = _mm256_fmadd_pd( a30v.v, chi0v.v, y3v.v );
+
+            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v );
+            y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v );
+            y3v.v = _mm256_fmadd_pd( a31v.v, chi1v.v, y3v.v );
+
+            // Store the output.
+            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v );
+            _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v );
+            _mm256_storeu_pd( (double *)(y0 + 3*n_elem_per_reg), y3v.v );
+
+            y0 += n_iter_unroll * n_elem_per_reg;
+            a0 += n_iter_unroll * n_elem_per_reg;
+            a1 += n_iter_unroll * n_elem_per_reg;
+        }
+
+        for ( ; (i + 11) < m; i += 12 )
+        {
+            // Load 12 elements of y and of both columns of a.
+            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+            y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
+            a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg );
+
+            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
+            a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg );
+
+            // perform : y += a(:,j) * (alpha*chi_j), for j = 0..1
+            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v );
+            y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v );
+
+            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v );
+            y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v );
+
+            // Store the output.
+            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v );
+            _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v );
+
+            y0 += 3 * n_elem_per_reg;
+            a0 += 3 * n_elem_per_reg;
+            a1 += 3 * n_elem_per_reg;
+        }
+        for ( ; (i + 7) < m; i += 8 )
+        {
+            // Load 8 elements of y and of both columns of a.
+            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
+
+            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
+
+            // perform : y += a(:,j) * (alpha*chi_j), for j = 0..1
+            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v );
+
+            // Store the output.
+            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v );
+
+            y0 += 2 * n_elem_per_reg;
+            a0 += 2 * n_elem_per_reg;
+            a1 += 2 * n_elem_per_reg;
+        }
+
+        for ( ; (i + 3) < m; i += 4 )
+        {
+            // Load 4 elements of y and of both columns of a.
+            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+
+            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+
+            // perform : y += a(:,j) * (alpha*chi_j), for j = 0..1
+            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
+
+            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
+
+            // Store the output.
+            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
+
+            y0 += n_elem_per_reg;
+            a0 += n_elem_per_reg;
+            a1 += n_elem_per_reg;
+        }
+
+        for ( ; (i + 1) < m; i += 2 )
+        {
+            // Load 2 elements of y and of both columns of a (128-bit loads).
+            y4v.v = _mm_loadu_pd( y0 + 0*n_elem_per_reg );
+
+            a40v.v = _mm_loadu_pd( a0 + 0*n_elem_per_reg );
+
+            a41v.v = _mm_loadu_pd( a1 + 0*n_elem_per_reg );
+
+            // perform : y += a(:,j) * (alpha*chi_j), using the low half of each chi
+            y4v.v = _mm_fmadd_pd( a40v.v, chi0v.xmm[0], y4v.v );
+
+            y4v.v = _mm_fmadd_pd( a41v.v, chi1v.xmm[0], y4v.v );
+
+            // Store the output.
+            _mm_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y4v.v );
+
+            y0 += 2;
+            a0 += 2;
+            a1 += 2;
+        }
+
+        // If there are leftover iterations, perform them with scalar code.
+        for ( ; (i + 0) < m ; ++i )
+        {
+            double       y0c = *y0;
+
+            const double a0c = *a0;
+            const double a1c = *a1;
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+
+            *y0 = y0c;
+
+            a0 += 1;
+            a1 += 1;
+            y0 += 1;
+        }
+    }
+    else
+    {
+        for ( i = 0; (i + 0) < m ; ++i )   // general-stride path: one element of y per iteration
+        {
+            double       y0c = *y0;
+
+            const double a0c = *a0;
+            const double a1c = *a1;
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+
+            *y0 = y0c;
+
+            a0 += inca;
+            a1 += inca;
+            y0 += incy;
+        }
+
+    }
+}
+
+// -----------------------------------------------------------------------------
+void bli_daxpyf_zen_int_16x4
+     (
+       conj_t           conja,
+       conj_t           conjx,
+       dim_t            m,
+       dim_t            b_n,
+       double* restrict alpha,
+       double* restrict a, inc_t inca, inc_t lda,
+       double* restrict x, inc_t incx,
+       double* restrict y, inc_t incy,
+       cntx_t* restrict cntx
+     )
+{
+    const dim_t      fuse_fac       = 4;
+
+    const dim_t      n_elem_per_reg = 4;
+    const dim_t      n_iter_unroll  = 4;
+
+    dim_t            i;
+
+    double* restrict a0;
+    double* restrict a1;
+    double* restrict a2;
+    double* restrict a3;
+
+    double* restrict y0;
+
+    v4df_t           chi0v, chi1v, chi2v, chi3v;
+
+    v4df_t           a00v, a01v, a02v, a03v;
+
+    v4df_t           a10v, a11v, a12v, a13v;
+
+    v4df_t           a20v, a21v, a22v, a23v;
+
+    v4df_t           a30v, a31v, a32v, a33v;
+
+    v4df_t           y0v, y1v, y2v, y3v;
+
+    double           chi0, chi1, chi2, chi3;
+
+    v2df_t           y4v;
+
+    v2df_t           a40v, a41v, a42v, a43v;
+
+    // If either dimension is zero, or if alpha is zero, return early.
+    if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return;
+
+    // If b_n is not equal to the fusing factor, then perform the entire
+    // operation as a loop over axpyv.
+    if ( b_n != fuse_fac )
+    {
+        if(cntx == NULL) cntx = bli_gks_query_cntx();
+        daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+
+        for ( i = 0; i < b_n; ++i )
+        {
+            double* a1   = a + (0  )*inca + (i  )*lda;
+            double* chi1 = x + (i  )*incx;
+            double* y1   = y + (0  )*incy;
+            double  alpha_chi1;
+
+            bli_dcopycjs( conjx, *chi1, alpha_chi1 );
+            bli_dscals( *alpha, alpha_chi1 );
+
+            f
+            (
+              conja,
+              m,
+              &alpha_chi1,
+              a1, inca,
+              y1, incy,
+              cntx
+            );
+        }
+
+        return;
+    }
+
+    // At this point, we know that b_n is exactly equal to the fusing factor.
+
+    a0   = a + 0*lda;
+    a1   = a + 1*lda;
+    a2   = a + 2*lda;
+    a3   = a + 3*lda;
+
+    y0   = y;
+
+    chi0 = *( x + 0*incx );
+    chi1 = *( x + 1*incx );
+    chi2 = *( x + 2*incx );
+    chi3 = *( x + 3*incx );
+
+    // Scale each chi scalar by alpha.
+    bli_dscals( *alpha, chi0 );
+    bli_dscals( *alpha, chi1 );
+    bli_dscals( *alpha, chi2 );
+    bli_dscals( *alpha, chi3 );
+
+    // Broadcast the (alpha*chi?) scalars to all elements of vector registers.
+    chi0v.v = _mm256_broadcast_sd( &chi0 );
+    chi1v.v = _mm256_broadcast_sd( &chi1 );
+    chi2v.v = _mm256_broadcast_sd( &chi2 );
+    chi3v.v = _mm256_broadcast_sd( &chi3 );
+
+    // If there are vectorized iterations, perform them with vector
+    // instructions.
+    if ( inca == 1 && incy == 1 )
+    {
+        for ( i = 0; (i + 15) < m; i += 16 )
+        {
+            // Load the input values.
+            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+            y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+            y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
+            a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg );
+            a30v.v = _mm256_loadu_pd( a0 + 3*n_elem_per_reg );
+
+            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
+            a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg );
+            a31v.v = _mm256_loadu_pd( a1 + 3*n_elem_per_reg );
+
+            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
+            a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg );
+            a22v.v = _mm256_loadu_pd( a2 + 2*n_elem_per_reg );
+            a32v.v = _mm256_loadu_pd( a2 + 3*n_elem_per_reg );
+
+            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
+            a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg );
+            a23v.v = _mm256_loadu_pd( a3 + 2*n_elem_per_reg );
+            a33v.v = _mm256_loadu_pd( a3 + 3*n_elem_per_reg );
+
+        // perform : y += alpha * x;
+            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v );
+            y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v );
+            y3v.v = _mm256_fmadd_pd( a30v.v, chi0v.v, y3v.v );
+
+            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v );
+            y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v );
+            y3v.v = _mm256_fmadd_pd( a31v.v, chi1v.v, y3v.v );
+
+            y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v );
+            y2v.v = _mm256_fmadd_pd( a22v.v, chi2v.v, y2v.v );
+            y3v.v = _mm256_fmadd_pd( a32v.v, chi2v.v, y3v.v );
+
+            y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v );
+            y2v.v = _mm256_fmadd_pd( a23v.v, chi3v.v, y2v.v );
+            y3v.v = _mm256_fmadd_pd( a33v.v, chi3v.v, y3v.v );
+
+            // Store the output.
+            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v );
+            _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v );
+            _mm256_storeu_pd( (double *)(y0 + 3*n_elem_per_reg), y3v.v );
+
+            y0 += n_iter_unroll * n_elem_per_reg;
+            a0 += n_iter_unroll * n_elem_per_reg;
+            a1 += n_iter_unroll * n_elem_per_reg;
+            a2 += n_iter_unroll * n_elem_per_reg;
+            a3 += n_iter_unroll * n_elem_per_reg;
+        }
+
+        for ( ; (i + 11) < m; i += 12 )
+        {
+            // Load the input values.
+            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+            y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
+            a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg );
+
+            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
+            a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg );
+
+            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
+            a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg );
+            a22v.v = _mm256_loadu_pd( a2 + 2*n_elem_per_reg );
+
+            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
+            a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg );
+            a23v.v = _mm256_loadu_pd( a3 + 2*n_elem_per_reg );
+
+            // perform : y += a0*chi0 + a1*chi1 + a2*chi2 + a3*chi3;
+            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v );
+            y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v );
+
+            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v );
+            y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v );
+
+            y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v );
+            y2v.v = _mm256_fmadd_pd( a22v.v, chi2v.v, y2v.v );
+
+            y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v );
+            y2v.v = _mm256_fmadd_pd( a23v.v, chi3v.v, y2v.v );
+
+            // Store the output.
+            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v );
+            _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v );
+
+            y0 += 3 * n_elem_per_reg;
+            a0 += 3 * n_elem_per_reg;
+            a1 += 3 * n_elem_per_reg;
+            a2 += 3 * n_elem_per_reg;
+            a3 += 3 * n_elem_per_reg;
+        }
+
+        for ( ; (i + 7) < m; i += 8 )
+        {
+            // Load the input values.
+            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
+
+            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
+
+            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
+            a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg );
+
+            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
+            a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg );
+
+            // perform : y += a0*chi0 + a1*chi1 + a2*chi2 + a3*chi3;
+            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v );
+
+            // Store the output.
+            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v );
+
+            y0 += 2 * n_elem_per_reg;
+            a0 += 2 * n_elem_per_reg;
+            a1 += 2 * n_elem_per_reg;
+            a2 += 2 * n_elem_per_reg;
+            a3 += 2 * n_elem_per_reg;
+        }
+
+
+        for ( ; (i + 3) < m; i += 4)
+        {
+            // Load the input values.
+            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+
+            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+
+            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
+
+            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
+
+            // perform : y += a0*chi0 + a1*chi1 + a2*chi2 + a3*chi3;
+            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
+
+            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
+
+            y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
+
+            y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
+
+            // Store the output.
+            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
+
+            y0 += n_elem_per_reg;
+            a0 += n_elem_per_reg;
+            a1 += n_elem_per_reg;
+            a2 += n_elem_per_reg;
+            a3 += n_elem_per_reg;
+        }
+#if 1
+        for ( ; (i + 1) < m; i += 2)
+        {
+
+	    // Load the input values.
+            y4v.v  = _mm_loadu_pd( y0 + 0*n_elem_per_reg );
+
+            a40v.v = _mm_loadu_pd( a0 + 0*n_elem_per_reg );
+
+            a41v.v = _mm_loadu_pd( a1 + 0*n_elem_per_reg );
+
+            a42v.v = _mm_loadu_pd( a2 + 0*n_elem_per_reg );
+
+            a43v.v = _mm_loadu_pd( a3 + 0*n_elem_per_reg );
+
+            // perform : y += a0*chi0 + a1*chi1 + a2*chi2 + a3*chi3;
+            y4v.v = _mm_fmadd_pd( a40v.v, chi0v.xmm[0], y4v.v );
+
+            y4v.v = _mm_fmadd_pd( a41v.v, chi1v.xmm[0], y4v.v );
+
+            y4v.v = _mm_fmadd_pd( a42v.v, chi2v.xmm[0], y4v.v );
+
+            y4v.v = _mm_fmadd_pd( a43v.v, chi3v.xmm[0], y4v.v );
+
+            // Store the output.
+            _mm_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y4v.v );
+
+            y0 += 2;
+            a0 += 2;
+            a1 += 2;
+            a2 += 2;
+            a3 += 2;
+        }
+#endif
+        // If there are leftover iterations, perform them with scalar code.
+        for ( ; (i + 0) < m ; ++i )
+        {
+            double       y0c = *y0;
+
+            const double a0c = *a0;
+            const double a1c = *a1;
+            const double a2c = *a2;
+            const double a3c = *a3;
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+            y0c += chi2 * a2c;
+            y0c += chi3 * a3c;
+
+            *y0 = y0c;
+
+            a0 += 1;
+            a1 += 1;
+            a2 += 1;
+            a3 += 1;
+
+            y0 += 1;
+        }
+    }
+    else
+    {
+        for ( i = 0; (i + 0) < m ; ++i )
+        {
+            double       y0c = *y0;
+
+            const double a0c = *a0;
+            const double a1c = *a1;
+            const double a2c = *a2;
+            const double a3c = *a3;
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+            y0c += chi2 * a2c;
+            y0c += chi3 * a3c;
+
+            *y0 = y0c;
+
+            a0 += inca;
+            a1 += inca;
+            a2 += inca;
+            a3 += inca;
+
+	    y0 += incy;
+        }
+
+    }
+}
+
+
diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h
index 161bcef1a..c9651554d 100644
--- a/kernels/zen/bli_kernels_zen.h
+++ b/kernels/zen/bli_kernels_zen.h
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2020 - 2022, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -73,6 +73,7 @@ SCALV_KER_PROT( double,   d, scalv_zen_int )
 // scalv (intrinsics unrolled x10)
 SCALV_KER_PROT( float,    s, scalv_zen_int10 )
 SCALV_KER_PROT( double,   d, scalv_zen_int10 )
+SCALV_KER_PROT( scomplex, c, scalv_zen_int10 )
 
 // swapv (intrinsics)
 SWAPV_KER_PROT(float,    s, swapv_zen_int8 )
@@ -86,11 +87,21 @@ COPYV_KER_PROT( double,   d, copyv_zen_int )
 SETV_KER_PROT(float,    s, setv_zen_int)
 SETV_KER_PROT(double,   d, setv_zen_int)
 
+// swapv (intrinsics)
+SWAPV_KER_PROT(float, 	s, swapv_zen_int8 )
+SWAPV_KER_PROT(double,	d, swapv_zen_int8 )
+
+
 // -- level-1f --
 
 // axpyf (intrinsics)
 AXPYF_KER_PROT( float,    s, axpyf_zen_int_8 )
 AXPYF_KER_PROT( double,   d, axpyf_zen_int_8 )
+AXPYF_KER_PROT( float,    s, axpyf_zen_int_5 )
+AXPYF_KER_PROT( double,   d, axpyf_zen_int_5 )
+
+AXPYF_KER_PROT( double,   d, axpyf_zen_int_16x4 )
+AXPYF_KER_PROT( scomplex, c, axpyf_zen_int_4 )
 
 // dotxf (intrinsics)
 DOTXF_KER_PROT( float,    s, dotxf_zen_int_8 )
@@ -199,3 +210,4 @@ GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_2x4n )
 GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_1x4n )
 GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_3x2 )
 GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_3x1 )
+
diff --git a/kernels/zen2/1f/bli_axpyf_zen_int_5.c b/kernels/zen2/1f/bli_axpyf_zen_int_5.c
deleted file mode 100644
index 5a919b622..000000000
--- a/kernels/zen2/1f/bli_axpyf_zen_int_5.c
+++ /dev/null
@@ -1,599 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "immintrin.h"
-#include "blis.h"
-
-/* Union data structure to access AVX registers
-   One 256-bit AVX register holds 8 SP elements. */
-typedef union
-{
-    __m256  v;
-    float   f[8] __attribute__((aligned(64)));
-} v8sf_t;
-
-/* Union data structure to access AVX registers
-*  One 256-bit AVX register holds 4 DP elements. */
-typedef union
-{
-    __m256d v;
-    double  d[4] __attribute__((aligned(64)));
-} v4df_t;
-
-
-void bli_saxpyf_zen_int_5
-     (
-       conj_t           conja,
-       conj_t           conjx,
-       dim_t            m,
-       dim_t            b_n,
-       float* restrict alpha,
-       float* restrict a, inc_t inca, inc_t lda,
-       float* restrict x, inc_t incx,
-       float* restrict y, inc_t incy,
-       cntx_t* restrict cntx
-     )
-{
-    const dim_t      fuse_fac       = 5;
-
-    const dim_t      n_elem_per_reg = 8;
-    const dim_t      n_iter_unroll  = 2;
-
-    dim_t            i;
-
-    float* restrict a0;
-    float* restrict a1;
-    float* restrict a2;
-    float* restrict a3;
-    float* restrict a4;
-
-    float* restrict y0;
-
-    v8sf_t           chi0v, chi1v, chi2v, chi3v;
-    v8sf_t           chi4v;
-
-    v8sf_t           a00v, a01v, a02v, a03v;
-    v8sf_t           a04v;
-
-    v8sf_t           a10v, a11v, a12v, a13v;
-    v8sf_t           a14v;
-
-    v8sf_t           y0v, y1v;
-
-    float           chi0, chi1, chi2, chi3;
-    float           chi4;
-
-    // If either dimension is zero, or if alpha is zero, return early.
-    if ( bli_zero_dim2( m, b_n ) || bli_seq0( *alpha ) ) return;
-
-    // If b_n is not equal to the fusing factor, then perform the entire
-    // operation as a loop over axpyv.
-    if ( b_n != fuse_fac )
-    {
-#ifdef BLIS_CONFIG_ZEN2
-        for ( i = 0; i < b_n; ++i )
-        {
-            float* a1   = a + (0  )*inca + (i  )*lda;
-            float* chi1 = x + (i  )*incx;
-            float* y1   = y + (0  )*incy;
-            float  alpha_chi1;
-
-            bli_scopycjs( conjx, *chi1, alpha_chi1 );
-            bli_sscals( *alpha, alpha_chi1 );
-
-            bli_saxpyv_zen_int10
-            (
-              conja,
-              m,
-              &alpha_chi1,
-              a1, inca,
-              y1, incy,
-              cntx
-            );
-        }
-
-#else
-        saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
-
-        for ( i = 0; i < b_n; ++i )
-        {
-            float* a1   = a + (0  )*inca + (i  )*lda;
-            float* chi1 = x + (i  )*incx;
-            float* y1   = y + (0  )*incy;
-            float  alpha_chi1;
-
-            bli_scopycjs( conjx, *chi1, alpha_chi1 );
-            bli_sscals( *alpha, alpha_chi1 );
-
-            f
-            (
-              conja,
-              m,
-              &alpha_chi1,
-              a1, inca,
-              y1, incy,
-              cntx
-            );
-        }
-
-#endif
-        return;
-    }
-
-    // At this point, we know that b_n is exactly equal to the fusing factor.
-
-    a0   = a + 0*lda;
-    a1   = a + 1*lda;
-    a2   = a + 2*lda;
-    a3   = a + 3*lda;
-    a4   = a + 4*lda;
-    y0   = y;
-
-    chi0 = *( x + 0*incx );
-    chi1 = *( x + 1*incx );
-    chi2 = *( x + 2*incx );
-    chi3 = *( x + 3*incx );
-    chi4 = *( x + 4*incx );
-
-
-    // Scale each chi scalar by alpha.
-    bli_sscals( *alpha, chi0 );
-    bli_sscals( *alpha, chi1 );
-    bli_sscals( *alpha, chi2 );
-    bli_sscals( *alpha, chi3 );
-    bli_sscals( *alpha, chi4 );
-
-    // Broadcast the (alpha*chi?) scalars to all elements of vector registers.
-    chi0v.v = _mm256_broadcast_ss( &chi0 );
-    chi1v.v = _mm256_broadcast_ss( &chi1 );
-    chi2v.v = _mm256_broadcast_ss( &chi2 );
-    chi3v.v = _mm256_broadcast_ss( &chi3 );
-    chi4v.v = _mm256_broadcast_ss( &chi4 );
-
-    // If there are vectorized iterations, perform them with vector
-    // instructions.
-    if ( inca == 1 && incy == 1 )
-    {
-        for ( i = 0; (i + 15) < m; i += 16 )
-        {
-            // Load the input values.
-            y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-            y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
-
-            a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
-            a10v.v = _mm256_loadu_ps( a0 + 1*n_elem_per_reg );
-
-            a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg );
-            a11v.v = _mm256_loadu_ps( a1 + 1*n_elem_per_reg );
-
-            a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg );
-            a12v.v = _mm256_loadu_ps( a2 + 1*n_elem_per_reg );
-
-            a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg );
-            a13v.v = _mm256_loadu_ps( a3 + 1*n_elem_per_reg );
-
-            a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg );
-            a14v.v = _mm256_loadu_ps( a4 + 1*n_elem_per_reg );
-
-            // perform : y += alpha * x;
-            y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v );
-            y1v.v = _mm256_fmadd_ps( a10v.v, chi0v.v, y1v.v );
-
-            y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v );
-            y1v.v = _mm256_fmadd_ps( a11v.v, chi1v.v, y1v.v );
-
-            y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v );
-            y1v.v = _mm256_fmadd_ps( a12v.v, chi2v.v, y1v.v );
-
-            y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v );
-            y1v.v = _mm256_fmadd_ps( a13v.v, chi3v.v, y1v.v );
-
-            y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v );
-            y1v.v = _mm256_fmadd_ps( a14v.v, chi4v.v, y1v.v );
-
-
-            // Store the output.
-            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
-            _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), y1v.v );
-
-            y0 += n_iter_unroll * n_elem_per_reg;
-            a0 += n_iter_unroll * n_elem_per_reg;
-            a1 += n_iter_unroll * n_elem_per_reg;
-            a2 += n_iter_unroll * n_elem_per_reg;
-            a3 += n_iter_unroll * n_elem_per_reg;
-            a4 += n_iter_unroll * n_elem_per_reg;
-        }
-
-        for( ; (i + 7) < m; i += 8 )
-        {
-            // Load the input values.
-            y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-
-            a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
-            a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg );
-            a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg );
-            a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg );
-            a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg );
-
-
-            // perform : y += alpha * x;
-            y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v );
-            y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v );
-            y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v );
-            y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v );
-            y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v );
-
-            // Store the output.
-            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
-
-            y0 += n_elem_per_reg;
-            a0 += n_elem_per_reg;
-            a1 += n_elem_per_reg;
-            a2 += n_elem_per_reg;
-            a3 += n_elem_per_reg;
-            a4 += n_elem_per_reg;
-        }
-    
-        // If there are leftover iterations, perform them with scalar code.
-        for ( ; (i + 0) < m ; ++i )
-        {
-            double       y0c = *y0;
-
-            const float a0c = *a0;
-            const float a1c = *a1;
-            const float a2c = *a2;
-            const float a3c = *a3;
-            const float a4c = *a4;
-
-            y0c += chi0 * a0c;
-            y0c += chi1 * a1c;
-            y0c += chi2 * a2c;
-            y0c += chi3 * a3c;
-            y0c += chi4 * a4c;
-
-            *y0 = y0c;
-
-            a0 += 1;
-            a1 += 1;
-            a2 += 1;
-            a3 += 1;
-            a4 += 1;
-            y0 += 1;
-        }
-    }
-    else
-    {
-        for ( i = 0; (i + 0) < m ; ++i )
-        {
-            double       y0c = *y0;
-
-            const float a0c = *a0;
-            const float a1c = *a1;
-            const float a2c = *a2;
-            const float a3c = *a3;
-            const float a4c = *a4;
-
-            y0c += chi0 * a0c;
-            y0c += chi1 * a1c;
-            y0c += chi2 * a2c;
-            y0c += chi3 * a3c;
-            y0c += chi4 * a4c;
-
-            *y0 = y0c;
-
-            a0 += inca;
-            a1 += inca;
-            a2 += inca;
-            a3 += inca;
-            a4 += inca; 
-            y0 += incy;
-        }
-
-    }
-}
-
-
-// -----------------------------------------------------------------------------
-
-void bli_daxpyf_zen_int_5
-     (
-       conj_t           conja,
-       conj_t           conjx,
-       dim_t            m,
-       dim_t            b_n,
-       double* restrict alpha,
-       double* restrict a, inc_t inca, inc_t lda,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       cntx_t* restrict cntx
-     )
-{
-    const dim_t      fuse_fac       = 5;
-
-    const dim_t      n_elem_per_reg = 4;
-    const dim_t      n_iter_unroll  = 2;
-
-    dim_t            i;
-
-    double* restrict a0;
-    double* restrict a1;
-    double* restrict a2;
-    double* restrict a3;
-    double* restrict a4;
-
-    double* restrict y0;
-
-    v4df_t           chi0v, chi1v, chi2v, chi3v;
-    v4df_t           chi4v;
-
-    v4df_t           a00v, a01v, a02v, a03v;
-    v4df_t           a04v;
-
-    v4df_t           a10v, a11v, a12v, a13v;
-    v4df_t           a14v;
-
-    v4df_t           y0v, y1v;
-
-    double           chi0, chi1, chi2, chi3;
-    double           chi4;
-
-    // If either dimension is zero, or if alpha is zero, return early.
-    if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return;
-
-    // If b_n is not equal to the fusing factor, then perform the entire
-    // operation as a loop over axpyv.
-    if ( b_n != fuse_fac )
-    {
-#ifdef BLIS_CONFIG_ZEN2
-        for ( i = 0; i < b_n; ++i )
-        {
-            double* a1   = a + (0  )*inca + (i  )*lda;
-            double* chi1 = x + (i  )*incx;
-            double* y1   = y + (0  )*incy;
-            double  alpha_chi1;
-
-            bli_dcopycjs( conjx, *chi1, alpha_chi1 );
-            bli_dscals( *alpha, alpha_chi1 );
-
-            bli_daxpyv_zen_int10
-            (
-              conja,
-              m,
-              &alpha_chi1,
-              a1, inca,
-              y1, incy,
-              cntx
-            );
-        }
-
-#else
-        daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
-
-        for ( i = 0; i < b_n; ++i )
-        {
-            double* a1   = a + (0  )*inca + (i  )*lda;
-            double* chi1 = x + (i  )*incx;
-            double* y1   = y + (0  )*incy;
-            double  alpha_chi1;
-
-            bli_dcopycjs( conjx, *chi1, alpha_chi1 );
-            bli_dscals( *alpha, alpha_chi1 );
-
-            f
-            (
-              conja,
-              m,
-              &alpha_chi1,
-              a1, inca,
-              y1, incy,
-              cntx
-            );
-        }
-
-#endif
-        return;
-    }
-
-    // At this point, we know that b_n is exactly equal to the fusing factor.
-
-    a0   = a + 0*lda;
-    a1   = a + 1*lda;
-    a2   = a + 2*lda;
-    a3   = a + 3*lda;
-    a4   = a + 4*lda;
-    y0   = y;
-
-    chi0 = *( x + 0*incx );
-    chi1 = *( x + 1*incx );
-    chi2 = *( x + 2*incx );
-    chi3 = *( x + 3*incx );
-    chi4 = *( x + 4*incx );
-
-
-    // Scale each chi scalar by alpha.
-    bli_dscals( *alpha, chi0 );
-    bli_dscals( *alpha, chi1 );
-    bli_dscals( *alpha, chi2 );
-    bli_dscals( *alpha, chi3 );
-    bli_dscals( *alpha, chi4 );
-
-    // Broadcast the (alpha*chi?) scalars to all elements of vector registers.
-    chi0v.v = _mm256_broadcast_sd( &chi0 );
-    chi1v.v = _mm256_broadcast_sd( &chi1 );
-    chi2v.v = _mm256_broadcast_sd( &chi2 );
-    chi3v.v = _mm256_broadcast_sd( &chi3 );
-    chi4v.v = _mm256_broadcast_sd( &chi4 );
-
-    // If there are vectorized iterations, perform them with vector
-    // instructions.
-    if ( inca == 1 && incy == 1 )
-    {
-        for ( i = 0; (i + 7) < m; i += 8 )
-        {
-            // Load the input values.
-            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
-
-            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
-            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
-
-            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
-            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
-
-            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
-            a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg );
-
-            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
-            a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg );
-
-            a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
-            a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg );
-
-            // perform : y += alpha * x;
-            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
-            y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v );
-
-            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
-            y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v );
-
-            y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
-            y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v );
-
-            y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
-            y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v );
-
-            y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
-            y1v.v = _mm256_fmadd_pd( a14v.v, chi4v.v, y1v.v );
-
-
-            // Store the output.
-            _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
-            _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), y1v.v );
-
-            y0 += n_iter_unroll * n_elem_per_reg;
-            a0 += n_iter_unroll * n_elem_per_reg;
-            a1 += n_iter_unroll * n_elem_per_reg;
-            a2 += n_iter_unroll * n_elem_per_reg;
-            a3 += n_iter_unroll * n_elem_per_reg;
-            a4 += n_iter_unroll * n_elem_per_reg;
-        }
-
-        for( ; (i + 3) < m; i += 4 )
-        {
-            // Load the input values.
-            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-
-            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
-            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
-            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
-            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
-            a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
-
-
-            // perform : y += alpha * x;
-            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
-            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
-            y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
-            y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
-            y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
-
-            // Store the output.
-            _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
-
-            y0 += n_elem_per_reg;
-            a0 += n_elem_per_reg;
-            a1 += n_elem_per_reg;
-            a2 += n_elem_per_reg;
-            a3 += n_elem_per_reg;
-            a4 += n_elem_per_reg;
-        }
-    
-        // If there are leftover iterations, perform them with scalar code.
-        for ( ; (i + 0) < m ; ++i )
-        {
-            double       y0c = *y0;
-
-            const double a0c = *a0;
-            const double a1c = *a1;
-            const double a2c = *a2;
-            const double a3c = *a3;
-            const double a4c = *a4;
-
-            y0c += chi0 * a0c;
-            y0c += chi1 * a1c;
-            y0c += chi2 * a2c;
-            y0c += chi3 * a3c;
-            y0c += chi4 * a4c;
-
-            *y0 = y0c;
-
-            a0 += 1;
-            a1 += 1;
-            a2 += 1;
-            a3 += 1;
-            a4 += 1;
-            y0 += 1;
-        }
-    }
-    else
-    {
-        for ( i = 0; (i + 0) < m ; ++i )
-        {
-            double       y0c = *y0;
-
-            const double a0c = *a0;
-            const double a1c = *a1;
-            const double a2c = *a2;
-            const double a3c = *a3;
-            const double a4c = *a4;
-
-            y0c += chi0 * a0c;
-            y0c += chi1 * a1c;
-            y0c += chi2 * a2c;
-            y0c += chi3 * a3c;
-            y0c += chi4 * a4c;
-
-            *y0 = y0c;
-
-            a0 += inca;
-            a1 += inca;
-            a2 += inca;
-            a3 += inca;
-            a4 += inca; 
-            y0 += incy;
-        }
-
-    }
-}
-
diff --git a/test/test_copyv.c b/test/test_copyv.c
new file mode 100644
index 000000000..a85004f12
--- /dev/null
+++ b/test/test_copyv.c
@@ -0,0 +1,218 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <unistd.h>
+#include "blis.h"
+
+//#define BLIS_ACCURACY_TEST
+#ifdef BLIS_ACCURACY_TEST
+
+bool scompare_result( int n, float *x, int incx, float *y, int incy )
+{
+	for ( int i = 0; i < n; i++ )
+	{
+		if ( (*x) != (*y) )
+		{
+			printf( "%4f != %4f at location %d\n", *x, *y, i );
+			return FALSE;
+		}
+		x += incx;
+		y += incy;
+	}
+	return TRUE;
+}
+
+bool dcompare_result( int n, double *x, int incx, double *y, int incy )
+{
+	for ( int i = 0; i < n; i++ )
+	{
+		if ( (*x) != (*y) )
+		{
+			printf( "%4f != %4f at location %d\n", *x, *y, i );
+			return FALSE;
+		}
+		x += incx;
+		y += incy;
+	}
+	return TRUE;
+}
+
+#endif
+
+
+int main( int argc, char** argv )
+{
+	obj_t x, y;
+	dim_t n;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int   n_input, sizeof_dt;
+	int   r, n_repeats;
+	num_t dt;
+
+	double dtime;
+	double dtime_save;
+	double gbps;
+
+	//bli_init();
+
+	n_repeats = 100000;
+
+#ifndef PRINT
+	p_begin = 200;
+	p_end = 100000;
+	p_inc = 200;
+
+	n_input = -1;
+#else
+	p_begin = 16;
+	p_end = 16;
+	p_inc = 1;
+
+	n_input = 16;
+#endif
+
+#if 1
+	 // dt = BLIS_FLOAT;
+	dt = BLIS_DOUBLE;
+#else
+	//dt = BLIS_SCOMPLEX;
+	dt = BLIS_DCOMPLEX;
+#endif
+
+	if      ( dt == BLIS_FLOAT  ) sizeof_dt = sizeof( float );
+	else if ( dt == BLIS_DOUBLE ) sizeof_dt = sizeof( double );
+
+	printf( "executable\t n\t GBs per sec\n" );
+
+	for ( p = p_begin; p <= p_end; p += p_inc )
+	{
+
+		if ( n_input < 0 ) n = p * ( dim_t )abs( n_input );
+		else               n =     ( dim_t )     n_input;
+
+		bli_obj_create( dt, n, 1, 0, 0, &x );
+		bli_obj_create( dt, n, 1, 0, 0, &y );
+
+		bli_randm( &x );
+
+		dtime_save = DBL_MAX;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+
+			dtime = bli_clock();
+
+#ifdef BLIS
+			bli_copyv( &x,
+			           &y );
+#else
+			if ( bli_is_float( dt ) )
+			{
+				f77_int nn   = bli_obj_length( &x );
+				f77_int incx = bli_obj_vector_inc( &x );
+				float*  xp   = bli_obj_buffer( &x );
+				f77_int incy = bli_obj_vector_inc( &y );
+				float*  yp   = bli_obj_buffer( &y );
+
+				scopy_( &nn,
+				        xp, &incx,
+				        yp, &incy );
+
+			}
+			else if ( bli_is_double( dt ) )
+			{
+
+				f77_int nn   = bli_obj_length( &x );
+				f77_int incx = bli_obj_vector_inc( &x );
+				double* xp   = bli_obj_buffer( &x );
+				f77_int incy = bli_obj_vector_inc( &y );
+				double* yp   = bli_obj_buffer( &y );
+
+				dcopy_( &nn,
+				        xp, &incx,
+				        yp, &incy );
+			}
+#endif
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+
+#ifdef BLIS_ACCURACY_TEST
+			if ( dt == BLIS_FLOAT )
+			{
+				int     nn   = bli_obj_length( &x );
+				int     incx = bli_obj_vector_inc( &x );
+				float*  xp   = bli_obj_buffer( &x );
+				int     incy = bli_obj_vector_inc( &y );
+				float*  yp   = bli_obj_buffer( &y );
+				if ( scompare_result( nn, xp, incx, yp, incy ) )
+					printf( "Copy Successful\n" );
+				else
+					printf( "ALERT!!! Copy Failed\n" );
+			}
+			if ( dt == BLIS_DOUBLE )
+			{
+				int     nn   = bli_obj_length( &x );
+				int     incx = bli_obj_vector_inc( &x );
+				double* xp   = bli_obj_buffer( &x );
+				int     incy = bli_obj_vector_inc( &y );
+				double* yp   = bli_obj_buffer( &y );
+				if ( dcompare_result( nn, xp, incx, yp, incy ) )
+					printf( "Copy Successful\n" );
+				else
+					printf( "ALERT!!! Copy Failed\n" );
+			}
+#endif
+		}
+
+		// The size increment grows to 1000, then 10000, to test a wide range of inputs.
+		if ( p >= 1000  ) p_inc = 1000;
+		if ( p >= 10000 ) p_inc = 10000;
+		gbps = ( n * sizeof_dt ) / ( dtime_save * 1.0e9 );
+
+#ifdef BLIS
+		printf( "data_copyv_blis\t" );
+#else
+		printf( "data_copyv_%s\t", BLAS );
+#endif
+		printf( "%4lu\t %7.2f\n",
+		        ( unsigned long )n, gbps );
+
+		bli_obj_free( &x );
+		bli_obj_free( &y );
+	}
+
+	//bli_finalize();
+
+	return 0;
+}
diff --git a/test/test_swapv.c b/test/test_swapv.c
new file mode 100644
index 000000000..4d8d35eac
--- /dev/null
+++ b/test/test_swapv.c
@@ -0,0 +1,180 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020-2022, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <unistd.h>
+#include "blis.h"
+
+//               n     x      incx      y        incy
+//void  dswap_( int*, double*, int*, double*,   int* );
+//#define PRINT
+
+int main( int argc, char** argv )
+{ // Driver that times a vector swap (bli_swapv, or sswap_/dswap_ when BLIS is not defined) over a sweep of lengths.
+	obj_t x, y;
+	dim_t n;                     // vector length used for the current iteration
+	dim_t p;                     // current point in the problem-size sweep
+	dim_t p_begin, p_end, p_inc; // sweep bounds and step
+	int   n_input;               // negative: n = p * |n_input|; otherwise n is fixed to n_input
+	int   r, n_repeats;
+	num_t dt;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+
+	bli_init();
+
+	n_repeats = 3; // number of timed repetitions per problem size
+
+#ifndef PRINT
+	p_begin = 40;
+	p_end   = 8000;
+	p_inc   = 40;
+
+	n_input = -1;
+#else
+	p_begin = 16;
+	p_end   = 16;
+	p_inc   = 1;
+
+	n_input = -1;
+#endif
+
+#if 1
+	dt = BLIS_FLOAT;
+	//dt = BLIS_DOUBLE;
+#else
+	//dt = BLIS_SCOMPLEX;
+	dt = BLIS_DCOMPLEX;
+#endif
+
+	// Print a placeholder (all-zero) entry for the last row first so that
+	// matlab allocates space for the entire array once up-front.
+	for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; // empty body: leaves p at the last size the sweep reaches
+#ifdef BLIS
+	printf( "data_swapv_blis" );
+#else
+	printf( "data_swapv_%s", BLAS );
+#endif
+	printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
+			( unsigned long )(p - p_begin)/p_inc + 1,
+			( unsigned long )0, 0.0 );
+
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc ) // sweep from the largest size down to the smallest
+	{
+
+		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+		else               n =     ( dim_t )    n_input;
+
+		bli_obj_create( dt, n, 1, 0, 0, &x ); // n x 1 object of type dt; the two zero arguments are presumably default strides -- TODO confirm
+		bli_obj_create( dt, n, 1, 0, 0, &y );
+
+		bli_randm( &x ); // presumably fills the object with random values -- TODO confirm
+		bli_randm( &y );
+
+		dtime_save = 1.0e9; // large initial value so the first measurement replaces it
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+
+			dtime = bli_clock(); // start timestamp; note the PRINT-mode output below falls inside the timed region
+
+#ifdef PRINT
+			bli_printm( "x", &x, "%4.1f", "" );
+			bli_printm( "y", &y, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+
+			bli_swapv( &x,
+			           &y );
+#else
+			if ( bli_is_float( dt ) )
+			{
+				f77_int nn     = bli_obj_length( &x );
+				f77_int incx   = bli_obj_vector_inc( &x );
+				f77_int incy   = bli_obj_vector_inc( &y );
+				float*  xp     = bli_obj_buffer( &x );
+				float*  yp     = bli_obj_buffer( &y );
+
+				sswap_( &nn,
+				        xp, &incx,
+				        yp, &incy );
+
+			}
+			else if ( bli_is_double( dt ) ) // any other datatype performs no swap in this non-BLIS branch
+			{
+
+				f77_int  nn     = bli_obj_length( &x );
+				f77_int  incx   = bli_obj_vector_inc( &x );
+				f77_int  incy   = bli_obj_vector_inc( &y );
+				double*  xp     = bli_obj_buffer( &x );
+				double*  yp     = bli_obj_buffer( &y );
+
+				dswap_( &nn,
+				        xp, &incx,
+				        yp, &incy );
+			}
+#endif
+
+#ifdef PRINT
+			bli_printm( "X after", &x, "%4.1f", "" );
+			bli_printm( "Y after", &y, "%4.1f", "" );
+
+			exit(1); // PRINT mode stops after the first swap; nothing below runs
+#endif
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime ); // presumably keeps the smaller of dtime_save and the time elapsed since dtime -- TODO confirm
+		}
+
+		gflops = ( n ) / ( dtime_save * 1.0e9 ); // vector length over (saved time * 1e9); counts elements, not floating-point operations, despite the name
+
+#ifdef BLIS
+		printf( "data_swapv_blis" );
+#else
+		printf( "data_swapv_%s", BLAS );
+#endif
+		printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
+		        ( unsigned long )n, gflops );
+
+		bli_obj_free( &x );
+		bli_obj_free( &y );
+	}
+
+	bli_finalize();
+
+	return 0;
+}

From cf06364327bd2d21d606392371ff3c5962bee5ba Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 29 Mar 2022 16:18:25 -0500
Subject: [PATCH 043/230] Fixed typo in BLAS gemm3m call to _check().

Details:
- Fixed an unresolved symbol issue leftover from #590 whereby ?gemm3m_()
  as defined in bla_gemm3m.c was referencing bla_gemm3m_check(), which
  does not exist. It should have simply called the _check() function for
  gemm.
---
 frame/compat/extra/bla_gemm3m.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/frame/compat/extra/bla_gemm3m.c b/frame/compat/extra/bla_gemm3m.c
index 11d542e69..4533375f0 100644
--- a/frame/compat/extra/bla_gemm3m.c
+++ b/frame/compat/extra/bla_gemm3m.c
@@ -67,7 +67,7 @@ void PASTEF77(ch,blasname) \
 	bli_init_auto(); \
 \
 	/* Perform BLAS parameter checking. */ \
-	PASTEBLACHK(blasname) \
+	PASTEBLACHK(blisname) \
 	( \
 	  MKSTR(ch), \
 	  MKSTR(blasname), \
@@ -162,7 +162,7 @@ void PASTEF77(ch,blasname) \
 	bli_init_auto(); \
 \
 	/* Perform BLAS parameter checking. */ \
-	PASTEBLACHK(blasname) \
+	PASTEBLACHK(blisname) \
 	( \
 	  MKSTR(ch), \
 	  MKSTR(blasname), \

From bee7678b2558a691ac850819dbe33fefe4fdbee3 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 31 Mar 2022 14:09:39 -0500
Subject: [PATCH 044/230] CREDITS file update.

---
 CREDITS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CREDITS b/CREDITS
index 85ed97c6a..b701598cf 100644
--- a/CREDITS
+++ b/CREDITS
@@ -105,6 +105,7 @@ but many others have contributed code and feedback, including
   Meghana Vankadari        @Meghana-vankadari  (AMD)
   Kiran Varaganti          @kvaragan           (AMD)
   Natalia Vassilieva                           (Hewlett Packard Enterprise)
+                           @h-vetinari
   Andrew Wildman           @awild82            (University of Washington)
   Zhang Xianyi             @xianyi             (Chinese Academy of Sciences)
   Benda Xu                 @heroxbd

From 99bb9002f1aff598d347eae2821a3f7bdd1f48e8 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 1 Apr 2022 08:10:59 -0500
Subject: [PATCH 045/230] ReleaseNotes.md update in advance of next version.

---
 docs/ReleaseNotes.md | 110 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)

diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md
index ce6f29a1c..ccb4d9f0e 100644
--- a/docs/ReleaseNotes.md
+++ b/docs/ReleaseNotes.md
@@ -4,6 +4,7 @@
 
 ## Contents
 
+* [Changes in 0.9.0](ReleaseNotes.md#changes-in-090)
 * [Changes in 0.8.1](ReleaseNotes.md#changes-in-081)
 * [Changes in 0.8.0](ReleaseNotes.md#changes-in-080)
 * [Changes in 0.7.0](ReleaseNotes.md#changes-in-070)
@@ -39,6 +40,115 @@
 * [Changes in 0.0.2](ReleaseNotes.md#changes-in-002)
 * [Changes in 0.0.1](ReleaseNotes.md#changes-in-001)
 
+## Changes in 0.9.0
+April 1, 2022
+
+Improvements present in 0.9.0:
+
+Framework:
+- Added various fields to `obj_t` that relate to storing function pointers to custom `packm` kernels, microkernels, etc as well as accessor functions to set and query those fields. (Devin Matthews)
+- Enabled user-customized `packm` microkernels and variants via the aforementioned new `obj_t` fields. (Devin Matthews)
+- Moved edge-case handling out of the macrokernel and into the `gemm` and `gemmtrsm` microkernels. This also required updating of APIs and definitions of all existing microkernels in `kernels` directory. Edge-case handling functionality is now facilitated via new preprocessor macros found in `bli_edge_case_macro_defs.h`. (Devin Matthews)
+- Avoid `gemmsup` thread barriers when not packing A or B. This boosts performance for many small multithreaded problems. (Field Van Zee, AMD)
+- Allow the 1m method to operate normally when single and double real-domain microkernels mix row and column I/O preference. (Field Van Zee, Devin Matthews, RuQing Xu)
+- Removed support for execution of complex-domain level-3 operations via the 3m and 4m methods.
+- Refactored `herk`, `her2k`, `syrk`, `syr2k` in terms of `gemmt`. (Devin Matthews)
+- Defined `setijv` and `getijv` to set/get vector elements.
+- Defined `eqsc`, `eqv`, and `eqm` operations to test equality between two scalars, vectors, or matrices.
+- Added new bounds checking to `setijm` and `getijm` to prevent use of negative indices.
+- Renamed `membrk` files/variables/functions to `pba`.
+- Store error-checking level as a thread-local variable. (Devin Matthews)
+- Add `err_t*` "return" parameter to `bli_malloc_*()` and friends.
+- Switched internal mutexes of the `sba` and `pba` to static initialization.
+- Changed return value method of `bli_pack_get_pack_a()`, `bli_pack_get_pack_b()`.
+- Fixed a bug that prevented `bli_init()` from being called more than once without segfaulting. (@lschork2, Minh Quan Ho, Devin Matthews)
+- Removed a sanity check in `bli_pool_finalize()` that prevented BLIS from being re-initialized. (AMD)
+- Fixed insufficient `pool_t`-growing logic in `bli_pool.c`, and always allocate at least one element in `.block_ptrs` array. (Minh Quan Ho)
+- Cleanups related to the error message array in `bli_error.c`. (Minh Quan Ho)
+- Moved language-related definitions from `bli_macro_defs.h` to a new header, `bli_lang_defs.h`.
+- Renamed `BLIS_SIMD_NUM_REGISTERS` to `BLIS_SIMD_MAX_NUM_REGISTERS` and `BLIS_SIMD_SIZE` to `BLIS_SIMD_MAX_SIZE` for improved clarity. (Devin Matthews)
+- Many minor bugfixes.
+- Many cleanups, including removal of old and commented-out code.
+
+Compatibility:
+- Expanded BLAS layer to include support for `?axpby_()` and `?gemm_batch_()`. (Meghana Vankadari, AMD)
+- Added `gemm3m` APIs to BLAS and CBLAS layers. (Bhaskar Nallani, AMD)
+- Handle `?gemm_()` invocations where m or n is unit by calling `?gemv_()`. (Dipal M Zambare, AMD)
+- Removed option to finalize BLIS after every BLAS call.
+- Updated default definitions of `bli_slamch()` and `bli_dlamch()` to use constants from standard C library rather than values computed at runtime. (Devin Matthews)
+
+Kernels:
+- Added 512-bit SVE-based `a64fx` subconfiguration that uses empirically-tuned blocksizes. (Stepan Nassyr, RuQing Xu)
+- Added a vector-length agnostic `armsve` subconfig that computes blocksizes via an analytical model. (Stepan Nassyr)
+- Added vector-length agnostic d/s/sh `gemm` kernels for Arm SVE. (Stepan Nassyr)
+- Added `gemmsup` kernels to the `armv8a` kernel set for use in new Apple Firestorm subconfiguration. (RuQing Xu)
+- Added 512-bit SVE `dpackm` kernels (16xk and 10xk) with in-register transpose. (RuQing Xu)
+- Extended 256-bit SVE `dpackm` kernels by Linaro Ltd. to 512-bit for size 12xk. (RuQing Xu)
+- Reorganized register usage in `bli_gemm_armv8a_asm_d6x8.c` to accommodate clang. (RuQing Xu)
+- Added `saxpyf`/`daxpyf`/`caxpyf` kernels to `zen` kernel set. (Dipal M Zambare, AMD)
+- Added `vzeroupper` instruction to `haswell` microkernels. (Devin Matthews)
+- Added explicit `beta == 0` handling in s/d `armsve` and `armv7a` `gemm` microkernels. (Devin Matthews)
+- Added a unique tag to branch labels to accommodate clang. (Devin Matthews, Jeff Hammond)
+- Fixed a copy-paste bug in the loading of `kappa_i` in the two assembly `cpackm` kernels in `haswell` kernel set. (Devin Matthews)
+- Fixed a bug in Mx1 `gemmsup` `haswell` kernels whereby the `vhaddpd` instruction is used with uninitialized registers. (Devin Matthews)
+- Fixed a bug in the `power10` microkernel I/O. (Nicholai Tukanov)
+- Many other Arm kernel updates and fixes. (RuQing Xu)
+
+Extras:
+- Added support for addons, which are similar to sandboxes but do not require the user to implement any particular operation.
+- Added a new `gemmlike` sandbox to allow rapid prototyping of `gemm`-like operations.
+- Various updates and improvements to the `power10` sandbox, including a new testsuite. (Nicholai Tukanov)
+
+Build system:
+- Added explicit support for AMD's Zen3 microarchitecture. (Dipal M Zambare, AMD, Field Van Zee)
+- Added runtime microarchitecture detection for Arm. (Dave Love, RuQing Xu, Devin Matthews)
+- Added a new `configure` option `--[en|dis]able-amd-frame-tweaks` that allows BLIS to compile certain framework files (each with the `_amd` suffix) that have been customized by AMD for improved performance (provided that the targeted configuration is eligible). By default, the more portable counterparts to these files are compiled. (Field Van Zee, AMD)
+- Added an explicit compiler predicate (`is_win`) for Windows in `configure`. (Devin Matthews)
+- Use `-march=haswell` instead of `-march=skylake-avx512` on Windows. (Devin Matthews, @h-vetinari)
+- Fixed `configure` breakage on MacOSX by accepting either `clang` or `LLVM` in vendor string. (Devin Matthews)
+- Blacklist clang10/gcc9 and older for `armsve` subconfig.
+- Added a `configure` option to control whether or not to use `@rpath`. (Devin Matthews)
+- Added armclang detection to `configure`. (Devin Matthews)
+- Use `@rpath`-based install name on MacOSX and use relocatable `RPATH` entries for testsuite binaries. (Devin Matthews)
+- For environment variables `CC`, `CXX`, `FC`, `PYTHON`, `AR`, and `RANLIB`, `configure` will now print an error message and abort if a user specifies a specific tool and that tool is not found. (Field Van Zee, Devin Matthews)
+- Added symlink to `blis.pc.in` for out-of-tree builds. (Andrew Wildman)
+- Register optimized real-domain `copyv`, `setv`, and `swapv` kernels in `zen` subconfig. (Dipal M Zambare, AMD)
+- Added Apple Firestorm (A14/M1) subconfiguration, `firestorm`. (RuQing Xu)
+- Added `armsve` subconfig to `arm64` configuration family. (RuQing Xu)
+- Allow using clang with the `thunderx2` subconfiguration. (Devin Matthews)
+- Fixed a subtle substitution bug in `configure`. (Chengguo Sun)
+- Updated top-level Makefile to reflect a dependency on the "flat" `blis.h` file for the BLIS and BLAS testsuite objects. (Devin Matthews)
+- Mark `xerbla_()` as a "weak" symbol on MacOSX. (Devin Matthews)
+- Fixed a long-standing bug in `common.mk` whereby the header path to `cblas.h` was omitted from the compiler flags when compiling CBLAS files within BLIS.
+- Added a custom-made recursive `sed` script to `build` directory.
+- Minor cleanups and fixes to `configure`, `common.mk`, and others.
+
+Testing:
+- Fixed a race condition in the testsuite when the SALT option (simulate application-level threading) is enabled. (Devin Matthews)
+- Test 1m method execution during `make check`. (Devin Matthews)
+- Test `make install` in Travis CI. (Devin Matthews)
+- Test C++ in Travis CI to make sure `blis.h` is C++-compatible. (Devin Matthews)
+- Disabled SDE testing of pre-Zen microarchitectures via Travis CI.
+- Added Travis CI support for testing Arm SVE. (RuQing Xu)
+- Updated SDE usage so that it is downloaded from a separate repository (ci-utils) in our GitHub organization. (Field Van Zee, Devin Matthews)
+- Updated octave scripts in `test/3` to be robust against missing datasets as well as to fix a few minor issues.
+- Added `test_axpbyv.c` and `test_gemm_batch.c` test driver files to `test` directory. (Meghana Vankadari, AMD)
+- Support all four datatypes in `her`, `her2`, `herk`, and `her2k` drivers in `test` directory. (Madan mohan Manokar, AMD)
+
+Documentation:
+- Added documentation for: `setijv`, `getijv`, `eqsc`, `eqv`, `eqm`.
+- Added `docs/Addons.md`.
+- Added dedicated "Performance" and "Example Code" sections to `README.md`.
+- Updated `README.md`.
+- Updated `docs/Sandboxes.md`.
+- Updated `docs/Multithreading.md`. (Devin Matthews)
+- Updated `docs/KernelsHowTo.md`.
+- Updated `docs/Performance.md` to report Fujitsu A64fx (512-bit SVE) results. (RuQing Xu)
+- Updated `docs/Performance.md` to report Graviton2 Neoverse N1 results. (Nicholai Tukanov)
+- Updated `docs/FAQ.md` with new questions.
+- Fixed typos in `docs/FAQ.md`. (Gaëtan Cassiers)
+- Various other minor fixes.
+
 ## Changes in 0.8.1
 March 22, 2021
 

From 14c86f66b20901b60ee276da355c1b62642c18d2 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 1 Apr 2022 08:12:06 -0500
Subject: [PATCH 046/230] Version file update (0.9.0)

---
 version | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version b/version
index 6f4eebdf6..ac39a106c 100644
--- a/version
+++ b/version
@@ -1 +1 @@
-0.8.1
+0.9.0

From 88cab8383ca90ddbb4cf13e69b7d44a1663a4425 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 1 Apr 2022 08:12:06 -0500
Subject: [PATCH 047/230] CHANGELOG update (0.9.0)

---
 CHANGELOG | 2886 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 2882 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 13eaa52ca..27bb039b5 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,10 +1,2888 @@
-commit 8535b3e11d2297854991c4272932ce4974dda629 (HEAD -> master, tag: 0.8.1)
+commit 14c86f66b20901b60ee276da355c1b62642c18d2 (HEAD -> master, tag: 0.9.0)
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Apr 1 08:12:06 2022 -0500
+
+    Version file update (0.9.0)
+
+commit 99bb9002f1aff598d347eae2821a3f7bdd1f48e8 (origin/master, origin/HEAD)
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Apr 1 08:10:59 2022 -0500
+
+    ReleaseNotes.md update in advance of next version.
+
+commit bee7678b2558a691ac850819dbe33fefe4fdbee3 (origin/dev, origin/amd, dev, amd)
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Mar 31 14:09:39 2022 -0500
+
+    CREDITS file update.
+
+commit cf06364327bd2d21d606392371ff3c5962bee5ba
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Mar 29 16:18:25 2022 -0500
+
+    Fixed typo in BLAS gemm3m call to _check().
+    
+    Details:
+    - Fixed an unresolved symbol issue leftover from #590 whereby ?gemm3m_()
+      as defined in bla_gemm3m.c was referencing bla_gemm3m_check(), which
+      does not exist. It should have simply called the _check() function for
+      gemm.
+
+commit 1ec020b33ece1681c0041e2549eed2bd4c6cf356
+Author: Dipal M Zambare <71366780+dzambare@users.noreply.github.com>
+Date:   Wed Mar 30 02:45:36 2022 +0530
+
+    AMD kernel updates; frame-specific AMD updates. (#597)
+    
+    Details:
+    - Allow building BLIS with certain framework files (each with the '_amd'
+      suffix) that have been customized by AMD for Zen-based hardware. These
+      customized files were derived from portable versions of the same files
+      (i.e., those without the '_amd' suffix). Whether the portable or AMD-
+      specific files are compiled is now controlled by a new configure
+      option, --[en|dis]able-amd-frame-tweaks. This option is disabled by
+      default in vanilla BLIS, though AMD may choose to enable it by default
+      in their fork. For now, the added AMD-specific files are:
+      - bli_gemv_unf_var2_amd.c
+      - bla_copy_amd.c
+      - bla_gemv_amd.c
+      These files reside in 'amd' subdirectories found within the directory
+      housing their generic counterparts.
+    - Register optimized real-domain copyv, setv, and swapv kernels in
+      bli_cntx_init_zen.c.
+    - Various minor updates to level-1v kernels in 'zen' kernel set.
+    - Added caxpyf kernel as well as saxpyf and multiple daxpyf kernels to
+      the 'zen' kernel set
+    - If the problem passed to ?gemm_() in bla_gemm.c has a unit m or n dim,
+      call gemv instead and return early.
+    - Combined variable declarations with their initialization in various
+      level-2 and level-3 BLAS compatibility files, and also inserted
+      'const' qualifier in those same declaration statements.
+    - Moved frame/compat/bla_gemmt.c and .h to frame/compat/extra/ .
+    - Added copyv and swapv test drivers to 'test' directory.
+    - Whitespace, comment changes.
+
+commit 0db2bd5341c5c3ed5f1cc2bffa90952735efa45f
+Author: Bhaskar Nallani <Nallani.Bhaskar@amd.com>
+Date:   Fri Mar 25 05:11:55 2022 +0530
+
+    Added BLAS/CBLAS APIs for gemm3m. (#590)
+    
+    Details:
+    - Created ?gemm3m_() and cblas_?gemm3m() APIs that (for now) simply
+      invoke the 1m implementation unconditionally. (Note that these APIs
+      bypass sup handling.)
+    - Added BLAS prototypes for gemm3m in frame/compat/bla_gemm3m.h.
+    - Added CBLAS prototypes for gemm3m in frame/compat/cblas/src/cblas.h.
+    - Relocated:
+        frame/compat/cblas/src/cblas_?gemmt.c
+      files into
+        frame/compat/cblas/src/extra/
+    - Relocated frame/compat/bla_gemmt.? into frame/compat/extra/ .
+    - Minor reorganization of prototypes and cpp macro directives in
+      bli_blas.h, cblas.h, and cblas_f77.h.
+    - Trival whitespace change to cblas_zgemm.c.
+
+commit d6810000e961fe807dc5a7db81180a8355f3eac0
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Mar 14 10:29:54 2022 -0500
+
+    Update Multithreading.md
+    
+    Add notes about `BLIS_IR_NT` (should typically be 1) and `BLIS_JR_NT` (should typically be small, e.g. <= 4). [ci skip]
+
+commit f1dbb0e514f53a3240d3a6cbdc3306b01a2206f5
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Mar 11 13:38:28 2022 -0600
+
+    Trival whitespace change; commit log addendum.
+    
+    Details:
+    - A co-attribution to Mithun Mohan was inadvertently omitted from the
+      commit log for headline change in the previous commit, 7c07b47.
+
+commit 7c07b477e432adbbce5812ed9341ba3092b03976
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Mar 11 13:28:50 2022 -0600
+
+    Avoid gemmsup barriers when not packing A or B. (#622)
+    
+    Details:
+    - Implemented a multithreaded optimization for the special (and common)
+      case of employing the gemmsup code path when the user requests
+      (implicitly or explicitly) that neither A nor B be packed during
+      computation. This optimization takes the form of a greatly reduced
+      code branch in bli_thrinfo_sup_create_for_cntl(), which avoids a
+      broadcast and two barriers, and results in higher performance when
+      obtaining two-way or higher parallelism within BLIS. Thanks to
+      Bhaskar Nallani of AMD for proposing this change via issue #605.
+    - Added an early return branch to bli_thrinfo_create_for_cntl() that
+      detects and quickly handles cases where no parallelism is being
+      obtained within BLIS (i.e., single-threaded execution). Note that
+      this special case handling was/is already present in
+      bli_thrinfo_sup_create_for_cntl().
+    - CREDITS file update.
+
+commit cad10410b2305bc0e328c5f2517ab02593b53428
+Author: Ivan Korostelev <ivan23kor@gmail.com>
+Date:   Thu Mar 10 09:58:14 2022 -0600
+
+    POWER10: edge cases in microkernel (#620)
+    
+    Use new API for POWER10 gemm microkernel
+
+commit 71851a0549276b17db18a0a0c8ab4f54493bf033
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Mar 8 17:38:09 2022 -0600
+
+    Fixed level-3 performance bug in haswell ukernels.
+    
+    Details:
+    - Fixed a performance regression affecting nearly all level-3 operations
+      that use the 'haswell' sgemm and dgemm microkernels. This regression
+      was introduced in 54fa28b, caused by an ill-formed conditional
+      expression in the assembly code that controls whether cache lines of C
+      should be prefetched as rows or as columns. Essentially, the two
+      branches were reversed, causing incomplete prefetching to occur for
+      both row- and column-stored instances of matrix C. Thanks to Devin
+      Matthews for his help finding and fixing this bug.
+
+commit 84732bf95634ac606c5f2661d9474318e366c386
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Feb 28 12:19:31 2022 -0600
+
+    Revamp how tools are handled/checked by configure.
+    
+    Details:
+    - Consolidate handling of tools that are specifiable via CC, CXX, FC,
+      PYTHON, AR, and RANLIB into one bash function, select_tool_w_env().
+      - If the user specifies a tool via an environment variable (e.g.
+        CC=gcc) and that tool does not seem valid, print an error message
+        and abort configure, unless the tool is optional (e.g. CXX or FC),
+        in which case a warning message is printed instead.
+      - The definition of "seems valid" above amounts to:
+        - responding to at least one of a basic set of command line options
+          (e.g. --version, -V, -h) if the os_name is Linux (since GNU tools
+          tend to respond to flags such as --version) or if the tool in
+          question is CC, CXX, FC, or PYTHON (which tend to respond to the
+          expected flags regardless of OS)
+        - the binary merely existing for AR and RANLIB on Darwin/OSX/BSD.
+          (These OSes tend to have non-GNU versions of ar and ranlib, which
+          typically do not respond to --version and friends.)
+    - This PR addresses #584. Thanks to Devin Matthews for suggesting some
+      of the changes in this commit.
+
+commit d5146582b1f1bcdccefe23925d3b114d40cd7e31
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Wed Feb 23 03:35:46 2022 +0900
+
+    ArmSVE Ensure Non-zero Block Size (#615)
+    
+    Fixes #613. There are several macros/environment variables which need to be tuned to get good cache block sizes. It would be nice to have a way of getting values automatically.
+
+commit 4d8352309784403ed6719528968531ffb4483947
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Wed Feb 23 01:03:47 2022 +0900
+
+    Add armsve to arm64 Metaconfig (#614)
+    
+    Availability of the `armsve` subconfig is controlled by the compiler version (gcc/clang). Tested for SVE and non-SVE. Fixes #612.
+
+commit c9700f369aa84fc00f36c4b817ffb7dab72b865d
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Feb 15 15:36:52 2022 -0600
+
+    Renamed SIMD-related macro constants for clarity.
+    
+    Details:
+    - Renamed the following macros defined in bli_kernel_macro_defs.h:
+    
+        BLIS_SIMD_NUM_REGISTERS -> BLIS_SIMD_MAX_NUM_REGISTERS
+        BLIS_SIMD_SIZE          -> BLIS_SIMD_MAX_SIZE
+    
+      Also updated all instances of these macros elsewhere, including
+      subconfigurations, source code, and documentation. Thanks to Devin
+      Matthews for suggesting this change.
+
+commit ee9ff988c49f16696679d4c6cd3dcfcac7295be7
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Feb 15 15:01:51 2022 -0600
+
+    Move edge cases to gemmtrsm ukrs; doc updates.
+    
+    Details:
+    - Moved edge-case handling into the gemmtrsm microkernel. This required
+      changing the microkernel API to take m and n dimension parameters as
+      well as updating all existing gemmtrsm microkernel function pointer
+      types, function signatures, and related definitions to take m and n
+      dimensions. Also updated all existing gemmtrsm kernels in the
+      'kernels' directory (which for now is limited to haswell and penryn
+      kernel sets, plus native and 1m-based reference kernels in
+      'ref_kernels') to take m and n dimensions, and implemented edge-case
+      handling within those microkernels via a collection of new C
+      preprocessor macros defined within bli_edge_case_macro_defs.h. Note
+      that the edge-case handling for gemm-like operations had already
+      been relocated into the gemm microkernel in 54fa28b.
+    - Added descriptive comments to GEMM_UKR_SETUP_CT() and related macros in
+      bli_edge_case_macro_defs.h to allow for easier reading.
+    - Updated docs/KernelsHowTo.md to reflect above changes. Also cleaned up
+      the bullet under "Implementation Notes for gemm" that covers alignment
+      issues. (Thanks to Ivan Korostelev for pointing out the confusing and
+      outdated language in issue #591.)
+    - Other minor tweaks to KernelsHowTo.md.
+
+commit 25061593460767221e1066f9d720fa6676bbed8f
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sun Feb 13 20:11:55 2022 -0600
+
+    Don't use `-Wl,-flat-namespace`.
+    
+    Flat namespaces can cause problems due to conflicting system libraries,
+    etc., so just mark `xerbla_` as a weak symbol on macOS instead.
+
+commit 5a4d3f5208d3d8cc1827f8cc90414c764b7ebab3
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sun Feb 13 17:28:30 2022 -0600
+
+    Use -flat_namespace option to link on macOS
+    
+    Fixes #611.
+
+commit 26742910a087947780a089360e2baf82ea109e01
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sun Feb 13 16:53:45 2022 -0600
+
+    Update CC_VENDOR logic
+    
+    Look for `GCC` in addition to `gcc` to handle weird conda version strings. [ci skip]
+
+commit 2f3872e01d51545c687ae2c8b2650e00552111a7
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Mon Feb 7 17:14:49 2022 +0900
+
+    ArmSVE Adopts Label Wrapper
+    
+    For clang (& armclang?) compilation.
+    
+    Hopefully solves #609 .
+
+commit 72089bb2917b78d99cf4f27c69125bf213ee54e6
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sat Feb 5 16:56:04 2022 +0900
+
+    ArmSVE Use Predicate in M-Direction
+    
+    No need to query MR during kernel runtime.
+
+commit 9cc897f37455d52fbba752e3801f1a9d4a5bfdc1
+Author: Ruqing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Feb 3 16:40:02 2022 +0000
+
+    Fix SVE Compil.
+
+commit b5df1811f1bc8212b2cda6bb97b79819afe236a8
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Feb 3 02:31:29 2022 +0900
+
+    Armv8a, ArmSVE: Simplify Gen-C
+
+commit 35195bb5cea5d99eb3eaf41e3815137d14ceb52d
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Jan 31 10:29:50 2022 -0600
+
+    Add armclang detection to configure.
+    
+    armclang is treated as regular clang. Fixes #606. [ci skip]
+
+commit 0be9282cdccf73342d8571d3f7971a9b0af72363
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Jan 26 17:46:24 2022 -0600
+
+    Updated zen3 macro constant names.
+    
+    Details:
+    - In config/zen3/bli_family_zen3.h, renamed:
+        BLIS_SMALL_MATRIX_A_THRES_M_GEMMT -> _M_SYRK
+        BLIS_SMALL_MATRIX_A_THRES_N_GEMMT -> _N_SYRK
+      Thanks to Jeff Diamond for helping spot the stale _SYRK naming.
+
+commit 0ab20c0e72402ba0b17fe2c3ed3e16bf2ace0fd3
+Author: Jeff Hammond <jehammond@nvidia.com>
+Date:   Thu Jan 13 07:29:56 2022 -0800
+
+    the Apple local label thing is required by Clang in general
+    
+    @egaudry and I both saw this issue on Linux with Clang 10.
+    
+    ```
+    Compiling obj/thunderx2/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.o ('thunderx2' CFLAGS for kernels)
+    kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c:171:49: fatal error: invalid symbol redefinition
+            "                                            \n\t"
+                                                           ^
+    <inline asm>:90:5: note: instantiated into assembly here
+               .SLOOPKITER:
+               ^
+    1 error generated.
+    ```
+    
+    Signed-off-by: Jeff Hammond <jehammond@nvidia.com>
+
+commit 81f93be0561c705ae6823d19e40849facc40bef7
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Jan 10 10:19:47 2022 -0600
+
+    Fix row-/column-major pref. in 16x8 haswell sgemm ukr (unused)
+
+commit 268ce1f29a717d18304713ecc25a2eafe41838c7
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Jan 10 10:17:17 2022 -0600
+
+    Relax alignment constraints
+    
+    Remove alignment of temporary AB buffer in edge case handling macros unless alignment is specifically requested (e.g. Core2, SDB/IVB). Fixes #595.
+
+commit 3f2440b0226d5e23a43d12105d74aa917cd6c610
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Jan 6 14:57:36 2022 -0600
+
+    Added m, n dims to gemmd/gemmlike ukernel calls.
+    
+    Details:
+    - Updated the gemmd addon and the gemmlike sandbox code to use the new
+      microkernel calling sequence, which now includes m and n dimensions so
+      that the microkernel has all the information necessary to handle edge
+      cases. Thanks to Jeff Diamond for catching this, which ideally would
+      have been included in commit 54fa28b.
+    - Retired var2 of both gemmd and gemmlike to 'attic' directories and
+      removed their corresponding prototypes. In both cases, var2 was a
+      variant of the block-panel algorithm where edge-case handling was
+      abstracted away to a microkernel wrapper. (Since this is now the
+      official behavior of BLIS microkernels, I saw no need to have it
+      included as a separate code path.)
+    - Comment updates.
+
+commit 864bfab4486ac910ef9a366e9ade4b45a39747fc
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Jan 4 15:10:34 2022 -0600
+
+    CREDITS file update.
+
+commit 466b68a3ad118342dc49a8130b7b02f5e7748521
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sun Jan 2 14:59:41 2022 -0600
+
+    Add unique tag to branch labels for Apple ARM64.
+    
+    Add `%=` tag to branch labels, which expands to a unique identifier for each inline assembly block. This prevents duplicate symbol errors on Apple Silicon (#594). Fixes #594. [ci skip] since we can't test Apple Silicon anyways...
+
+commit 08174a2f6ebbd8ed5aa2bc4edc45da80962f06bb
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sat Jan 1 21:35:19 2022 +0900
+
+    Evict <arm_sve.h> Requirement for SVE GEMM
+    
+    For 8<= GCC < 10 compatibility.
+
+commit 54fa28bd847b389215cffb57a83dc9b3dce79c86
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri Dec 24 08:00:33 2021 -0600
+
+    Move edge cases to gemm ukr; more user-custom mods. (#583)
+    
+    Details:
+    - Moved edge-case handling into the gemm microkernel. This required
+      changing the microkernel API to take m and n dimension parameters.
+      This required updating all existing gemm microkernel function pointer
+      types, function signatures, and related definitions to take m and n
+      dimensions. We also updated all existing kernels in the 'kernels'
+      directory to take m and n dimensions, and implemented edge-case
+      handling within those microkernels via a collection of new C
+      preprocessor macros defined within bli_edge_case_macro_defs.h. Also
+      removed the assembly code that formerly would handle general stride
+      IO on the microtile, since this can now be handled by the same code
+      that does edge cases.
+    - Pass the obj_t.ker_fn (of matrix C) into bli_gemm_cntl_create() and
+      bli_trsm_cntl_create(), where this function pointer is used in lieu of
+      the default macrokernel when it is non-NULL, and ignored when it is
+      NULL.
+    - Re-implemented macrokernel in bli_gemm_ker_var2.c to be a single
+      function using byte pointers rather than one function for each
+      floating-point datatype. Also, obtain the microkernel function pointer
+      from the .ukr field of the params struct embedded within the obj_t
+      for matrix C (assuming params is non-NULL and contains a non-NULL
+      value in the .ukr field). Communicate both the gemm microkernel
+      pointer to use as well as the params struct to the microkernel via
+      the auxinfo_t struct.
+    - Defined gemm_ker_params_t type (for the aforementioned obj_t.params
+      struct) in bli_gemm_var.h.
+    - Retired the separate _md macrokernel for mixed datatype computation.
+      We now use the reimplemented bli_gemm_ker_var2() instead.
+    - Updated gemmt macrokernels to pass m and n dimensions into microkernel
+      calls.
+    - Removed edge-case handling from trmm and trsm macrokernels.
+    - Moved most of bli_packm_alloc() code into a new helper function,
+      bli_packm_alloc_ex().
+    - Fixed a typo bug in bli_gemmtrsm_u_template_noopt_mxn.c.
+    - Added test/syrk_diagonal and test/tensor_contraction directories with
+      associated code to test those operations.
+
+commit 961d9d509dd94f3a66f7095057e3dc8eb6d89839
+Author: Kiran <kiran.varaganti@amd.com>
+Date:   Wed Dec 8 03:00:38 2021 +0530
+
+    Re-add BLIS_ENABLE_ZEN_BLOCK_SIZES macro for 'zen'.
+    
+    Details:
+    - Added previously-deleted cpp macro block to bli_cntx_init_zen.c
+      targeting the Naples microarchitecture that enabled different cache
+      blocksizes when the number of threads exceeds 16. This commit
+      represents PR #573.
+
+commit cf7d616a2fd58e293b496770654040818bf5609c
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Thu Dec 2 17:10:03 2021 -0600
+
+    Enable user-customized packm ukernel/variant. (#549)
+    
+    Details:
+    - Added four new fields to obj_t: .pack_fn, .pack_params, .ker_fn, and
+      .ker_params. These fields store pointers to functions and data that
+      will allow the user to more flexibly create custom operations while
+      recycling BLIS's existing partitioning infrastructure.
+    - Updated typed API to packm variant and structure-aware kernels to
+      replace the diagonal offset with panel offsets, and changed strides
+      of both C and P to inc/ldim semantics. Updated object API to the packm
+      variant to include rntm_t*.
+    - Removed the packm variant function pointer from the packm cntl_t node
+      definition since it has been replaced by the .pack_fn pointer in the
+      obj_t.
+    - Updated bli_packm_int() to read the new packm variant function pointer
+      from the obj_t and call it instead of from the cntl_t node.
+    - Moved some of the logic of bli_l3_packm.c to a new file,
+      bli_packm_alloc.c.
+    - Rewrote bli_packm_blk_var1.c so that it uses byte (char*) pointers
+      instead of typed pointers, allowing a single function to be used
+      regardless of datatype. This obviated having a separate implementation
+      in bli_packm_blk_var1_md.c. Also relegated handling of scalars to a
+      new function, bli_packm_scalar().
+    - Employed a new standard whereby right-hand matrix operands ("B") are
+      always packed as column-stored row panels -- that is, identically to
+      that of left-hand matrix operands ("A"). This means that while we pack
+      matrix A normally, we actually pack B in a transposed state. This
+      allowed us to simplify a lot of code throughout the framework, and
+      also affected some of the logic in bli_l3_packa() and _packb().
+    - Simplified bli_packm_init.c in light of the new B^T convention
+      described above. bli_packm_init()--which is now called from within
+      bli_packm_blk_var1()--also now calls bli_packm_alloc() and returns
+      a bool that indicates whether packing should be performed (or
+      skipped).
+    - Consolidated bli_gemm_int() and bli_trsm_int() into a bli_l3_int(),
+      which, among other things, defaults the new .pack_fn field of the
+      obj_t to bli_packm_blk_var1() if the field is NULL.
+    - Defined a new function, bli_obj_reset_origin(), which permanently
+      refocuses the view of an object so that it "forgets" any offsets from
+      its original pointer. This function also sets the object's root field
+      to itself. Calls to bli_obj_reset_origin() for each matrix operand
+      appear in the _front() functions, after the obj_t's are aliased. This
+      resetting of the underlying matrices' origins is needed in preparation
+      for more advanced features from within custom packm kernels.
+    - Redefined bli_pba_rntm_set_pba() from a regular function to a static
+      inline function.
+    - Updated gemm_ukr, gemmtrsm_ukr, and trsm_ukr testsuite modules to use
+      libblis_test_pobj_create() to create local packed objects. Previously,
+      these packed objects were created by calling lower-level functions.
+
+commit e229e049ca08dfbd45794669df08a71dba892925
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Dec 1 17:36:22 2021 -0600
+
+    Added recu-sed.sh script to 'build' directory.
+    
+    Details:
+    - Added a recursive sed script to the 'build' directory.
+
+commit 12c66a4acc77bf4927b01e2358e2ac10b61e0a53
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Nov 19 14:43:53 2021 -0600
+
+    Minor updates to README.md, docs/Addons.md.
+    
+    Details:
+    - Add additional mentions of addons to README.md, including in the
+      "What's New" section.
+    - Removed mention of sandboxes from the long list of advantages
+      provided by BLIS.
+    - Very minor description update to opening line of Addons.md.
+
+commit a4bc03b990fe0572001eb6409efd12cd70677dcf
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Nov 19 13:29:00 2021 -0600
+
+    Brief mention/link to Addons.md in README.md.
+    
+    Details:
+    - Add a blurb about the new addons feature to the "Documentation for
+      BLIS developers" section of the README.md, which also links to the
+      Addons.md document.
+
+commit b727645eb7a8df39dee74068f734da66322fe0b3
+Merge: 9be97c15 7bde468c
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Nov 19 13:22:09 2021 -0600
+
+    Merge branch 'dev'
+
+commit 9be97c150e19fa58bca30cb993a6509ae21e2025
+Author: Madan mohan Manokar <86282872+madanm3@users.noreply.github.com>
+Date:   Thu Nov 18 00:46:46 2021 +0530
+
+    Support all four dts in test/test_her[2][k].c (#578)
+    
+    Details:
+    - Replaced the hard-coded calls to double-precision real syr, syr2,
+      syrk, and syr2k in the corresponding standalone test drivers in the
+      'test' directory with conditional branches that will call the
+      appropriate BLAS interface depending on which datatype is enabled.
+      Thanks to Madan mohan Manokar for this improvement.
+    - CREDITS file update.
+
+commit 26e4b6b29312b472c3cadf95ccdf5240764777f4
+Author: Dipal M Zambare <71366780+dzambare@users.noreply.github.com>
+Date:   Thu Nov 18 00:32:00 2021 +0530
+
+    Added support for AMD's Zen3 microarchitecture.
+    
+    Details:
+    - Added a new 'zen3' subconfiguration targeting support for the AMD Zen3
+      microarchitecture (#561). Thanks to AMD for this contribution.
+    - Restructured clang and AOCC support for zen, zen2, and zen3
+      make_defs.mk files. The clang and AOCC version detection now happens
+      in configure, not in the subconfigurations' makefile fragments. That
+      is, we've added logic to configure that detects the version of
+      clang/AOCC, outputs an appropriate variable to config.mk
+      (ie: CLANG_OT_*, AOCC_OT_*), and then checks for it within the
+      makefile fragment (as is currently done for the GCC_OT_* variables).
+    - Added configure support for a GCC_OT_10_1_0 variable (and associated
+      substitution anchor) to communicate whether the gcc version is older
+      than 10.1.0, and use this variable to check for recent enough versions
+      of gcc to use -march=znver3 in the zen3 subconfig.
+    - Inlined the contents of config/zen/amd_config.mk into the zen and zen2
+      make_defs.mk so that the files are self-contained, harmonizing the
+      format of all three Zen-based subconfigurations' make_defs.mk files.
+    - Added indenting (with spaces) of GNU make conditionals for easier
+      reading in zen, zen2, and zen3 make_defs.mk files.
+    - Adjusted the range of models checked by bli_cpuid_is_zen() (which was
+      previously 0x00 ~ 0xff and is now 0x00 ~ 0x2f) so that it is
+      completely disjoint from the models checked by bli_cpuid_is_zen2()
+      (0x30 ~ 0xff). This is normally necessary because Zen and Zen2
+      microarchitectures share the same family (23, or 0x17), and so the
+      model code is the only way to differentiate the two. But in our case,
+      fixing the model range for zen *wasn't* actually necessary since we
+      checked for zen2 first, and therefore the wide zen range acted like
+      the 'else' of an 'if-else' statement. That said, the change helps
+      improve clarity for the reader by encoding useful knowledge, which
+      was obtained from https://en.wikichip.org/wiki/amd/cpuid .
+    - Added zen2.def and zen3.def files to the collection in travis/cpuid.
+      Note that support for zen, zen2, and zen3 is now present, and while
+      all the three microarchitectures have identical instruction sets from
+      the perspective of BLIS microkernels, they each correspond to
+      different subconfigurations and therefore merit separate testing.
+      Thanks to Devin Matthews for his guidance in hacking these files as
+      slight modifications of zen.def.
+    - Enabled testing of zen2 and zen3 via the SDE in travis/do_sde.sh.
+      Now, zen, zen2, and zen3 are tested through the SDE via Travis CI
+      builds.
+    - Updated travis/do_sde.sh to grab the SDE tarball from a new ci-utils
+      repository on GitHub rather than on Intel's website. This change was
+      made in an attempt to circumvent recent troubles with Travis CI not
+      being able to download the SDE directly from Intel's website via curl.
+      Thanks to Devin Matthews for suggesting the idea.
+    - Updated travis/do_sde.sh to grab the latest version (8.69.1) of the
+      Intel SDE from the flame/ci-utils repository.
+    - Updated .travis.yml to use gcc 9. The file was previously using gcc 8,
+      which did not support -march=znver2.
+    - Created amd64_legacy umbrella family in config_registry for targeting
+      older (bulldozer, piledriver, steamroller, and excavator)
+      microarchitectures and moved those same subconfigs out of the amd64
+      umbrella family. However, x86_64 retains amd64_legacy as a constituent
+      member.
+    - Fixed a bug in configure related to the building of the so-called
+      config list. When processing the contents of config_registry,
+      configure creates a series of structures and lists that allow for
+      various mappings related to configuration families, subconfigs, and
+      kernel sets. Two of those lists are built via substitution of
+      umbrella families with their subconfig members, and one of those
+      lists was improperly performing the substitution in a way that would
+      erroneously match on partial umbrella family names. That code was
+      changed to match the code that was already doing the substitution
+      properly, via substitute_words(). Also added comments noting the
+      importance of using substitute_words() in both instances.
+    - Comment updates.
+
+commit 74c0c622216aba0c24aa2c3a923811366a160cf5
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Nov 16 16:06:33 2021 -0600
+
+    Reverted cbc88fe.
+    
+    Details:
+    - Reverted the annotation of some markdown code blocks with 'bash'
+      after realizing that the in-browser syntax highlighting was not
+      worthwhile.
+
+commit cbc88feb51b949ce562d044cf9f99c4e46bb8a39
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Nov 16 16:02:39 2021 -0600
+
+    Marked some markdown shell code blocks as 'bash'.
+    
+    Details:
+    - Annotated the code blocks that represent shell commands and output as
+      'bash' in README.md and BuildSystem.md.
+
+commit 78cd1b045155ddf0b9ec6e2ab815f2b216ad9a9e
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Nov 16 15:53:40 2021 -0600
+
+    Added 'Example Code' section to README.md.
+    
+    Details:
+    - Inserted a new 'Example Code' section into the README.md immediately
+      after the 'Getting Started' section. Thanks to Devin Matthews for
+      recommending this addition.
+    - Moved the 'Performance' section of the README down slightly so that it
+      appears after the 'Documentation' section.
+
+commit 7bde468c6f7ecc4b5322d2ade1ae9c0b88e6b9f3
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Sat Nov 13 16:39:37 2021 -0600
+
+    Added support for addons.
+    
+    Details:
+    - Implemented a new feature called addons, which are similar to
+      sandboxes except that there is no requirement to define gemm or any
+      other particular operation.
+    - Updated configure to accept --enable-addon=<name> or -a <name> syntax
+      for requesting an addon be included within a BLIS build. configure now
+      outputs the list of enabled addons into config.mk. It also outputs the
+      corresponding #include directives for the addons' headers to a new
+      companion to the bli_config.h header file named bli_addon.h. Because
+      addons may wish to make use of existing BLIS types within their own
+      definitions, the addons' headers must be included sometime after that
+      of bli_config.h (which currently is #included before bli_type_defs.h).
+      This is why the #include directives needed to go into a new top-level
+      header file rather than the existing bli_config.h file.
+    - Added a markdown document, docs/Addons.md, to explain addons, how to
+      build with them, and what assumptions their authors should keep in
+      mind as they create them.
+    - Added a gemmlike-like implementation of sandwich gemm called 'gemmd'
+      as an addon in addon/gemmd. The code uses a 'bao_' prefix for local
+      functions, including the user-level object and typed APIs.
+    - Updated .gitignore so that git ignores bli_addon.h files.
+
+commit 7bc8ab485e89cfc6032932e57929e208a28f4be5
+Author: Meghana-vankadari <74656386+Meghana-vankadari@users.noreply.github.com>
+Date:   Fri Nov 12 04:16:14 2021 +0530
+
+    Added BLAS/CBLAS APIs for axpby, gemm_batch. (#566)
+    
+    Details:
+    - Expanded the BLAS compatibility layer to include support for
+      ?axpby_() and ?gemm_batch_(). The former is a straightforward
+      BLAS-like interface into the axpbyv operation while the latter
+      implements a batched gemm via loops over bli_?gemm(). Also
+      expanded the CBLAS compatibility layer to include support for
+      cblas_?axpby() and cblas_?gemm_batch(), which serve as wrappers to
+      the corresponding (new) BLAS-like APIs. Thanks to Meghana Vankadari
+      for submitting these new APIs via #566.
+    - Fixed a long-standing bug in common.mk that for some reason never
+      manifested until now. Previously, CBLAS source files were compiled
+      *without* the location of cblas.h being specified via a -I flag.
+      I'm not sure why this worked, but it may be due to the fact that
+      the cblas.h file resided in the same directory as all of the CBLAS
+      source, and perhaps compilers implicitly add a -I flag for the
+      directory that corresponds to the location of the source file being
+      compiled. This bug only showed up because some CBLAS-like source code
+      was moved into an 'extra' subdirectory of that frame/compat/cblas/src
+      directory. After moving the code, compilation for those files failed
+      (because the cblas.h header file, presumably, could not be found in
+      the same location). This bug was fixed within common.mk by explicitly
+      adding the cblas.h directory to the list of -I flags passed to the
+      compiler.
+    - Added test_axpbyv.c and test_gemm_batch.c files to 'test' directory,
+      and updated test/Makefile to build those drivers.
+    - Fixed typo in error message string in cblas_sgemm.c.
+
+commit 28b0982ea70c21841fb23802d38f6b424f8200e1
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Nov 10 12:34:50 2021 -0600
+
+    Refactored her[2]k/syr[2]k in terms of gemmt. (#531)
+    
+    Details:
+    - Renamed herk macrokernels and supporting files and functions to gemmt,
+      which is possible since at the macrokernel level they are identical.
+      Then recast herk/her2k/syrk/syr2k in terms of gemmt within the expert
+      level-3 oapi (bli_l3_oapi_ex.c) while also redefining them as literal
+      functions rather than cpp macros that instantiate multiple functions.
+      Thanks to Devin Matthews for his efforts on this issue (#531).
+    - Check that the maximum stack buffer size is sufficiently large
+      relative to the register blocksizes for each datatype, and do so when
+      the context is initialized rather than when an operation is called.
+      Note that with this change, users who pass in their own contexts into
+      the expert interfaces currently will *not* have any checks performed.
+      Thanks to Devin Matthews for suggesting this change.
+
+commit cfa3db3f3465dc58dbbd842f4462e4b49e7768b4
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Nov 3 18:13:56 2021 -0500
+
+    Fixed bug in mixed-dt gemm introduced in e9da642.
+    
+    Details:
+    - Fixed a bug that broke certain mixed-datatype gemm behavior. This
+      bug was introduced recently in e9da642 when the code that performs
+      the operation transposition (for microkernel IO preference purposes)
+      was moved up so that it occurred sooner. However, when I moved that
+      code, I failed to notice that there was a cpp-protected "if"
+      conditional that applied to the entire code block that was moved. Once
+      the code block was relocated, the orphaned if-statement was now
+      (erroneously) glomming on to the next thing that happened to be in the
+      function, which happened to be the call to bli_rntm_set_ways_for_op(),
+      causing a rather odd memory exhaustion error in the sba due to the
+      num_threads field of the rntm_t still being -1 (because the rntm_t
+      fields were never processed as they should have been). Thanks to
+      @ArcadioN09 (Snehith) for reporting this error and helpfully including
+      relevant memory trace output.
+
+commit f065a8070f187739ec2b34417b8ab864a7de5d7e
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Oct 28 16:05:43 2021 -0500
+
+    Removed support for 3m, 4m induced methods.
+    
+    Details:
+    - Removed support for all induced methods except for 1m. This included
+      removing code related to 3mh, 3m1, 4mh, 4m1a, and 4m1b as well as any
+      code that existed only to support those implementations. These
+      implementations were rarely used and posed code maintenance challenges
+      for BLIS's maintainers going forward.
+    - Removed reference kernels for packm that pack 3m and 4m micropanels,
+      and removed 3m/4m-related code from bli_cntx_ref.c.
+    - Removed support for 3m/4m from the code in frame/ind, then reorganized
+      and streamlined the remaining code in that directory. The *ind(),
+      *nat(), and *1m() APIs were all removed. (These additional API layers
+      no longer made as much sense with only one induced method (1m) being
+      supported.) The bli_ind.c file (and header) were moved to frame/base
+      and bli_l3_ind.c (and header) and bli_l3_ind_tapi.h were moved to
+      frame/3.
+    - Removed 3m/4m support from the code in frame/1m/packm.
+    - Removed 3m/4m support from trmm/trsm macrokernels and simplified some
+      pointer arithmetic that was previously expressed in terms of the
+      bli_ptr_inc_by_frac() static inline function (whose definition was
+      also removed).
+    - Removed the following subdirectories of level-0 macro headers from
+      frame/include/level0: ri3, rih, ri, ro, rpi. The level-0 scalar macros
+      defined in these directories were used exclusively for 3m and 4m
+      method codes.
+    - Simplified bli_cntx_set_blkszs() and bli_cntx_set_ind_blkszs() in
+      light of 1m being the only induced method left within BLIS.
+    - Removed dt_on_output field within auxinfo_t and its associated
+      accessor functions.
+    - Re-indexed the 1e/1r pack schemas after removing those associated with
+      variants of the 3m and 4m methods. This leaves two bits unused within
+      the pack format portion of the schema bitfield. (See bli_type_defs.h
+      for more info.)
+    - Spun off the basic and expert interfaces to the object and typed APIs
+      into separate files: bli_l3_oapi.c and bli_l3_oapi_ex.c; bli_l3_tapi.c
+      and bli_l3_tapi_ex.c.
+    - Moved the level-3 operation-specific _check function calls from the
+      operations' _front() functions to the corresponding _ex() function of
+      the object API. (This change roughly maintains where the _check()
+      functions are called in the call stack but lays the groundwork for
+      future changes that may come to the level-3 object APIs.) Minor
+      modifications to bli_l3_check.c to allow the check() functions to be
+      called from the expert interface APIs.
+    - Removed support within the testsuite for testing the aforementioned
+      induced methods, and updated the standalone test drivers in the 'test'
+      directory to reflect the retirement of those induced methods.
+    - Modified the sandbox contract so that the user is obliged to define
+      bli_gemm_ex() instead of bli_gemmnat(). (This change was made in light
+      of the *nat() functions no longer existing.) Also updated the existing
+      'power10' and 'gemmlike' sandboxes to come into compliance with the
+      new sandbox rules.
+    - Updated BLISObjectAPI.md, BLISTypedAPI.md, Testsuite.md documentation
+      to reflect the retirement of 3m/4m, and also modified Sandboxes.md to
+      bring the document into alignment with new conventions.
+    - Updated various comments; removed segments of commented-out code.
+
+commit e8caf200a908859fa5f5ea2049911a9bdaa3d270
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Oct 18 13:04:15 2021 -0500
+
+    Updated do_sde.sh to get SDE from GitHub.
+    
+    Details:
+    - Updated travis/do_sde.sh so that the script downloads the SDE tarball
+      from a new ci-utils repository on GitHub rather than from Intel's
+      website. This change is being made in an attempt to circumvent Travis
+      CI's recent troubles with downloading the SDE from Intel's website via
+      curl. Thanks to Devin Matthews for suggesting the idea.
+
+commit 290ff4b1c26737b074d5abbf76966bc22af8c562
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Oct 14 16:09:43 2021 -0500
+
+    Disable SDE testing of old AMD microarchitectures.
+    
+    Details:
+    - Skip testing on piledriver, steamroller, and excavator platforms
+      in travis/do_sde.sh.
+
+commit 514fd101742dee557e5eb43d0023a221ae8a7172
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Oct 14 13:50:28 2021 -0500
+
+    Fixed substitution bug in configure.
+    
+    Details:
+    - Fixed a bug in configure related to the building of the so-called
+      config list. When processing the contents of config_registry,
+      configure creates a series of structures and lists that allow for
+      various mappings related to configuration families, subconfigs,
+      and kernel sets. Two of those lists are built via substitution
+      of umbrella families with their subconfig members, and one of
+      those lists was improperly performing the substitution in a way
+      that would erroneously match on partial umbrella family names.
+      That code was changed to match the code that was already doing
+      the substitution properly, via substitute_words().
+    - Added comments noting the importance of using substitute_words()
+      in both instances.
+
+commit e9da6425e27a9d63c9fef92afc2dd750c601ccd7
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Oct 13 14:15:38 2021 -0500
+
+    Allow use of 1m with mixing of row/col-pref ukrs.
+    
+    Details:
+    - Fixed a bug that broke the use of 1m for dcomplex when the single-
+      precision real and double-precision real ukernels had opposing I/O
+      preferences (row-preferential sgemm ukernel + column-preferential
+      dgemm ukernel, or vice versa). The fix involved adjusting the API
+      to bli_cntx_set_ind_blkszs() so that the induced method context init
+      function (e.g., bli_cntx_init_<subconfig>_ind()) could call that
+      function for only one datatype at a time. This allowed the blocksize
+      scaling (which varies depending on whether we're doing 1m_r or 1m_c)
+      to happen on a per-datatype basis. This fixes issue #557. Thanks to
+      Devin Matthews and RuQing Xu for helping discover and report this bug.
+    - The aforementioned 1m fix required moving the 1m_r/1m_c logic from
+      bli_cntx_ref.c into a new function, bli_l3_set_schemas(), which is
+      called from each level-3 _front() function. The pack_t schemas in the
+      cntx_t were also removed entirely, along with the associated accessor
+      functions. This in turn required updating the trsm1m-related virtual
+      ukernels to read the pack schema for B from the auxinfo_t struct
+      rather than the context. This also required slight tweaks to
+      bli_gemm_md.c.
+    - Repositioned the logic for transposing the operation to accommodate
+      the microkernel IO preference. This mostly only affects gemm. Thanks
+      to Devin Matthews for his help with this.
+    - Updated dpackm pack ukernels in the 'armsve' kernel set to avoid
+      querying pack_t schemas from the context.
+    - Removed the num_t dt argument from the ind_cntx_init_ft type defined
+      in bli_gks.c. The context initialization functions for induced methods
+      were previously passed a dt argument, but I can no longer figure out
+      *why* they were passed this value. To reduce confusion, I've removed
+      the dt argument (including also from the function definition +
+      prototype).
+    - Commented out setting of cntx_t schemas in bli_cntx_ind_stage.c. This
+      breaks high-level implementations of 3m and 4m, but this is okay since
+      those implementations will be removed very soon.
+    - Removed some older blocks of preprocessor-disabled code.
+    - Comment update to test_libblis.c.
+
+commit 81e103463214d589071ccbe2d90b8d7c19a186e4
+Author: Minh Quan Ho <1337056+hominhquan@users.noreply.github.com>
+Date:   Wed Oct 13 20:28:02 2021 +0200
+
+    Alloc at least 1 elem in pool_t block_ptrs. (#560)
+    
+    Details:
+    - Previously, the block_ptrs field of the pool_t was allowed to be
+      initialized as any unsigned integer, including 0. However, a length of
+      0 could be problematic given that malloc(0) is undefined and therefore
+      variable across implementations. As a safety measure, we check for
+      block_ptrs array lengths of 0 and, in that case, increase them to 1.
+    - Co-authored-by: Minh Quan Ho <minh-quan.ho@kalray.eu>
+
+commit 327481a4b0acf485d0cbdd8635dd9b886ba3f2a7
+Author: Minh Quan Ho <1337056+hominhquan@users.noreply.github.com>
+Date:   Tue Oct 12 19:53:04 2021 +0200
+
+    Fix insufficient pool-growing logic in bli_pool.c. (#559)
+    
+    Details:
+    - The current mechanism for growing a pool_t doubles the length of the
+      block_ptrs array every time the array length needs to be increased
+      due to new blocks being added. However, that logic did not take into
+      account the new total number of blocks, and the fact that the caller
+      may be requesting more blocks than would fit even after doubling the
+      current length of block_ptrs. The code comments now contain two
+      illustrating examples that show why, even after doubling, we must
+      always have at least enough room to fit all of the old blocks plus
+      the newly requested blocks.
+    - This commit also happens to fix a memory corruption issue that stems
+      from growing any pool_t that is initialized with a block_ptrs length
+      of 0. (Previously, the memory pool for packed buffers of C was
+      initialized with a block_ptrs length of 0, but because it is unused
+      this bug did not manifest by default.)
+    - Co-authored-by: Minh Quan Ho <minh-quan.ho@kalray.eu>
+
+commit 32a6d93ef6e2af5e486dfd5e46f8272153d3d53d
+Merge: 408906fd 2604f407
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sat Oct 9 15:53:54 2021 -0500
+
+    Merge pull request #543 from xrq-phys/armsve-packm-fix
+    
+    ARMSVE Block SVE-Intrinsic Kernels for GCC 8-9
+
+commit 408906fdd8892032aa11bd061b7971128f453bef
+Merge: 4277fec0 ccf16289
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sat Oct 9 15:50:25 2021 -0500
+
+    Merge pull request #542 from xrq-phys/armsve-zgemm
+    
+    Arm SVE CGEMM / ZGEMM Natural Kernels
+
+commit ccf16289d2e71fd9511ccf2d13dcebbfa29deabc
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Fri Oct 8 12:34:14 2021 +0900
+
+    Arm SVE C/ZGEMM Fix FMOV 0 Mistake
+    
+    FMOV [hsd]M, #imm does not allow zero immediate.
+    Use wzr, xzr instead.
+
+commit 82b61283b2005f900101056e6df2a108258db602
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Fri Oct 8 12:17:29 2021 +0900
+
+    SH Kernel Unused Either
+
+commit 1749dfa493054abd2e4ddba7cb21278d337e4f74
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Fri Oct 8 12:11:53 2021 +0900
+
+    Arm SVE C/ZGEMM Support *beta==0
+
+commit 4b648e47daad256ab8ab698173a97f71ab9f75eb
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Wed Sep 22 16:42:09 2021 +0900
+
+    Arm SVE Config armsve Use ZGEMM/CGEMM
+
+commit f76ea905e216cf640975e6319c6d2f54aeafed2e
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Tue Sep 21 20:38:44 2021 +0900
+
+    Arm SVE: Update Perf. Graph
+    
+    Pic. size seems a bit different from upstream.
+    Generated w/ MATLAB. Open to any change.
+
+commit 66a018e6ad00d9e8967b67e1aa3e23b20a7efdfe
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Mon Sep 20 00:16:11 2021 +0900
+
+    Arm SVE CGEMM 2Vx10 Unindex Process Alpha=1.0
+
+commit 9e1e781cb59f8fadb2a10a02376d3feac17ce38d
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sun Sep 19 23:30:42 2021 +0900
+
+    Arm SVE ZGEMM 2Vx10 Unindex Process Alpha=1.0
+
+commit f7c6c2b119423e7ba7a24ae2156790e076071cba
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Sep 16 01:47:42 2021 +0900
+
+    A64FX Config Use ZGEMM/CGEMM
+
+commit e4cabb977d038688688aca39b366f98f9c36b7eb
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Sep 16 01:34:26 2021 +0900
+
+    Arm SVE Typo Fix ZGEMM/CGEMM C Prefetch Reg
+
+commit b677e0d61b23f26d9536e5c363fd6bbab6ee1540
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Sep 16 01:18:54 2021 +0900
+
+    Arm SVE Add SGEMM 2Vx10 Unindexed
+
+commit 3f68e8309f2c5b31e25c0964395a180a80014d36
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Sep 16 01:00:54 2021 +0900
+
+    Arm SVE ZGEMM Support Gather Load / Scatt. St.
+
+commit c19db2ff826e2ea6ac54569e8aa37e91bdf7cabe
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Wed Sep 15 23:39:53 2021 +0900
+
+    Arm SVE Add ZGEMM 2Vx10 Unindexed
+
+commit e13abde30b9e0e381c730c496e74bc7ae062a674
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Wed Sep 15 04:19:45 2021 +0900
+
+    Arm SVE Add ZGEMM 2Vx7 Unindexed
+
+commit 49b9d7998eb86f340ae7b26af3e5a135d6a8feee
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Tue Sep 14 04:02:47 2021 +0900
+
+    Arm SVE Add ZGEMM 2Vx8 Unindexed
+
+commit 4277fec0d0293400497ae8bcfc32be5e62319ae9
+Merge: 2329d990 f44149f7
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Thu Oct 7 13:47:22 2021 -0500
+
+    Merge pull request #533 from xrq-phys/arm64-hi-bw
+    
+    ARMv8 PACKM and GEMMSUP Kernels + Apple Firestorm Subconfig
+
+commit 2329d99016fe1aeb86da4552295f497543cea311 (origin/1m_row_col_problem)
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Thu Oct 7 12:37:58 2021 -0500
+
+    Update Travis CI badge
+    
+    [ci skip]
+
+commit f44149f787ae3d4b53d9c4d8e6f23b2818b7770d
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Fri Oct 8 02:35:58 2021 +0900
+
+    Armv8 Trash New Bulk Kernels
+    
+    - They didn't bring much improvement.
+    - Can't register row-preferral and column-preferral ukrs at the same time.
+      Will break 1m.
+
+commit 70b52cadc5ef4c16431e1876b407019e6286614e
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Thu Oct 7 12:34:35 2021 -0500
+
+    Enable testing 1m in `make check`.
+
+commit 2604f4071300d109f28c8438be845aeaf3ec44e4
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Oct 7 02:39:00 2021 +0900
+
+    Config ArmSVE Unregister 12xk. Move 12xk to Old
+
+commit 1e3200326be9109eb0f8c7b9e4f952e45700cbba
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Oct 7 02:37:14 2021 +0900
+
+    Revert __has_include(). Distinguish w/ BLIS_FAMILY_**
+
+commit a4066f278a5c06f73b16ded25f115ca4b7728ecb
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Oct 7 02:26:05 2021 +0900
+
+    Register firestorm into arm64 Metaconfig
+
+commit d7a3372247c37568d142110a1537632b34b8f2ff
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Oct 7 02:25:14 2021 +0900
+
+    Armv8 DGEMMSUP Fix Edge 6x4 Switch Case Typo
+
+commit 2920dde5ac52e09f84aa42990aab8340421522ce
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Oct 7 02:01:45 2021 +0900
+
+    Armv8 DGEMMSUP Fix 8x4m Store Inst. Typo
+
+commit 14b13583f1802c002e195b3b48874b3ebadbeb20
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Oct 6 10:22:34 2021 -0500
+
+    Add test for Apple M1 (firestorm)
+    
+    This test will run on Linux, but all the kernels should run just fine. This does not test autodetection but then none of the other ARM tests do either.
+
+commit a024715065532400da6257b8b3124ca5aecda405
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Oct 7 00:15:54 2021 +0900
+
+    Firestorm CPUID Dispatcher
+    
+    Commenting out <sys/sysctl.h> due to possibly a Xcode bug.
+
+commit b9da6d55fec447d05c8b67f34ce83617123d8357
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Wed Oct 6 12:25:54 2021 +0900
+
+    Armv8 GEMMSUP Edge Cases Require Signed Ints
+    
+    Fix a bug in bli_gemmsup_rd_armv8a_asm_d6x8m.c.
+    For safety upon similar strategies in the future,
+     change all [mn]_[iter/left] into signed ints.
+
+commit 34919de3df5dda7a06fc09dcec12ca46dc8b26f4
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sat Oct 2 18:48:50 2021 -0500
+
+    Make error checking level a thread-local variable.
+    
+    Previously, this was a global variable. Setting the value was synchronized via a mutex but reading the value was not. Of course, these accesses are almost certainly atomic, but there is still the possibility of one thread attempting to set the value and then reading the value set by another thread. For correct operation under user threading (e.g. pthreads), this should probably be thread-local with no mutex.
+
+commit c3024993c3d50236fad112822215f066496c5831
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Tue Oct 5 15:20:27 2021 -0500
+
+    Fix data race in testsuite.
+
+commit 353a0d82572f26e78102cee25693130ce6e0ea5b
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Tue Oct 5 14:24:17 2021 -0500
+
+    Update .appveyor.yml
+    
+    [ci skip]
+
+commit 4bfadf9b561d4ebe0bbaf8b6d332f07ff531d618
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Wed Oct 6 01:51:26 2021 +0900
+
+    Firestorm Block Size Fixes
+
+commit 40baf83f0ea2749199b93b5a8ac45c01794b008c
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Wed Oct 6 01:00:52 2021 +0900
+
+    Armv8 Handle *beta == 0 for GEMMSUP ??r Case.
+
+commit 079fbd42ce8cf7ea67a939b0f80f488de5821319
+Merge: f5c03e9f 9905f443
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Oct 4 17:21:48 2021 -0500
+
+    Merge branch 'master' into arm64-hi-bw
+
+commit 9905f44347eea4c57ef4927b81f1c63e76a92739
+Merge: 6d3036e3 64a421f6
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Oct 4 15:58:59 2021 -0500
+
+    Merge pull request #553 from flame/rpath-fix
+    
+    Add an option to use an @rpath-dependent install_name on macOS
+
+commit 6d3036e31d8a2c1acbc1260489eeb8f535a8f97a
+Merge: 53377fcc eaa554aa
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Oct 4 15:58:43 2021 -0500
+
+    Merge pull request #545 from hominhquan/clean_error
+    
+    bli_error: more cleanup on the error strings array
+
+commit 53377fcca91e595787b38e2a47780ac0c35a7e7c
+Merge: d0a0b4b8 80c5366e
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Oct 4 15:45:53 2021 -0500
+
+    Merge pull request #554 from flame/armsve-cleanup
+    
+    Move unused ARM SVE kernels to "old" directory.
+
+commit 80c5366e4a9b8b72d97fba1eab89bab8989c44f4
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Oct 4 15:40:28 2021 -0500
+
+    Move unused ARM SVE kernels to "old" directory.
+
+commit 64a421f6983ab5bc0b55df30a2ddcfff5bfd73be
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Oct 4 13:40:43 2021 -0500
+
+    Add an option to control whether or not to use @rpath.
+    
+    Adds `--enable-rpath/--disable-rpath` (default disabled) to use an install_name starting with @rpath/. Otherwise, set the install_name to the absolute path of the install library, which was the previous behavior.
+
+commit c4a31683dd6f4da3065d86c11dd998da5192740a
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Oct 4 13:27:10 2021 -0500
+
+    Fix $ORIGIN usage on linux.
+
+commit d0a0b4b841fce56b7b2d3c03c5d93ad173ce2b97
+Author: Dave Love <dave.love@manchester.ac.uk>
+Date:   Mon Oct 4 18:03:04 2021 +0000
+
+    Arm micro-architecture dispatch (#344)
+    
+    Details:
+    - Reworked support for ARM hardware detection in bli_cpuid.c to parse
+      the result of a CPUID-like instruction.
+    - Added a64fx support to bli_gks.c.
+    - #include arm64 and arm32 family headers from bli_arch_config.h.
+    - Fix the ordering of the "armsve" and "a64fx" strings in the
+      config_name string array in bli_arch.c. The ordering did not match
+      the ordering of the corresponding arch_t values in bli_type_defs.h,
+      as it should have all along.
+    - Added clang support to make_defs.mk in arm64, cortexa53, cortexa57
+      subconfigs.
+    - Updated arm64 and arm32 families in config_registry.
+    - Updated docs/HardwareSupport.md to reflect added ARM support.
+    - Thanks to Dave Love, RuQing Xu, and Devin Matthews for their
+      contributions in this PR (#344).
+
+commit 91408d161a2b80871463ffb6f34c455bdfb72492
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Oct 4 11:37:48 2021 -0500
+
+    Use @rpath-based install name on MacOS and use relocatable RPATH entries for testsuite binaries.
+    
+    - RPATH entries (and DYLD_LIBRARY_PATH) do nothing on macOS unless the install_name of the library starts with @rpath/. While the install_name can be set to the absolute install path, this makes the installation non-relocatable. When using @rpath in the install_name, install paths within the normal DYLD_LIBRARY_PATH work with no changes on the user side, but for install paths off the beaten track, users must specify an RPATH entry when linking (or modify DYLD_LIBRARY_PATH at runtime). Perhaps this could be made into a configure-time option.
+    - Having relocatable testsuite binaries is not necessarily a priority but it is easy to do with @executable_path (macOS) or $ORIGIN (linux/BSD).
+
+commit f5c03e9fe808f9bd8a3e0c62786334e13c46b0fc
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sun Oct 3 16:51:51 2021 +0900
+
+    Armv8 Handle *beta == 0 for GEMMSUP ?rc Case.
+
+commit abc648352c591e26ceee436bd3a45400115b70c5
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sun Oct 3 13:14:19 2021 +0900
+
+    Armv8 Fix 6x8 Row-Maj Ukr
+    
+    - Fixed for 6x8 only, 4x4 & 4x8 pending;
+    - Installed to config firestorm as benchmark seems to show better perf:
+       Old:
+    blis_dgemm_ukr_c                     6     8   320    36.87   2.43e-17   PASS
+    blis_dgemm_ukr_c                     6     8   352    40.55   1.04e-17   PASS
+    blis_dgemm_ukr_c                     6     8   384    44.24   5.68e-17   PASS
+    blis_dgemm_ukr_c                     6     8   416    41.67   3.51e-17   PASS
+    blis_dgemm_ukr_c                     6     8   448    34.41   2.94e-17   PASS
+    blis_dgemm_ukr_c                     6     8   480    42.53   2.35e-17   PASS
+    
+       New:
+    blis_dgemm_ukr_r                     6     8   352    50.69   1.59e-17   PASS
+    blis_dgemm_ukr_r                     6     8   384    49.15   5.55e-17   PASS
+    blis_dgemm_ukr_r                     6     8   416    50.44   2.86e-17   PASS
+    blis_dgemm_ukr_r                     6     8   448    46.92   3.12e-17   PASS
+    blis_dgemm_ukr_r                     6     8   480    48.08   4.08e-17   PASS
+
+commit 0a45bc0fbc7aee3876c315ed567fc37f19cdc57f
+Merge: 5013a6cb 13dbd5b5
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sat Oct 2 18:59:43 2021 -0500
+
+    Merge pull request #552 from flame/armsve_beta_0
+    
+    Add explicit handling for beta == 0 in armsve sd and armv7a d gemm ukrs.
+
+commit 13dbd5b5d3dbf27e33ecf0e98d43c97019a6339d
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sat Oct 2 20:40:25 2021 +0000
+
+    Apply patch from @xrq-phys.
+
+commit ae0eeeaf77c77892db17027cef10b95ec97c904f
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Sep 29 16:42:33 2021 -0500
+
+    Add explicit handling for beta == 0 in armsve sd and armv7a d gemm ukrs.
+
+commit 5013a6cb7110746c417da96e4a1308ef681b0b88
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Sep 29 10:38:50 2021 -0500
+
+    More edits and fixes to docs/FAQ.md.
+
+commit b36fb0fbc5fda13d9a52cc64953341d3d53067ee
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Sep 28 18:47:45 2021 -0500
+
+    Fixed newly broken link to CREDITS in FAQ.md.
+
+commit 3442d4002b3bfffd8848f72103b30691df2b19b1
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Sep 28 18:43:23 2021 -0500
+
+    More minor fixes to FAQ.md and Sandboxes.md.
+
+commit 89aaf00650d6cc19b83af2aea6c8d04ddd3769cb
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Sep 28 18:34:33 2021 -0500
+
+    Updates to FAQ.md, Sandboxes.md, and README.md.
+    
+    Details:
+    - Updated FAQ.md to include two new questions, reordered an existing
+      question, and also removed an outdated and redundant question about
+      BLIS vs. AMD BLIS.
+    - Updated Sandboxes.md to use 'gemmlike' as its main example, along with
+      other smaller details.
+    - Added ARM as a funder to README.md.
+
+commit c52c43115ec2264fda9380c48d9e6bb1e1ea2ead
+Merge: 1fc23d21 1f527a93
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Sun Sep 26 15:56:54 2021 -0500
+
+    Merge branch 'dev'
+
+commit 1fc23d2141189c7b583a5bff2cffd87fd5261444
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Sep 21 14:54:20 2021 -0500
+
+    Safelist 'master', 'dev', 'amd' branches.
+    
+    Details:
+    - Modified .travis.yml so that only commits to 'master', 'dev', and
+      'amd' branches get built by Travis CI. Thanks to Devin Matthews for
+      helping to track down the syntax for this change.
+
+commit 1f527a93b996093e06ef7a8e94fb47ee7e690ce0
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Sep 20 17:56:36 2021 -0500
+
+    Re-enable and fix fb93d24.
+    
+    Details:
+    - Re-enabled the changes made in fb93d24.
+    - Defined BLIS_ENABLE_SYSTEM in bli_arch.c, bli_cpuid.c, and bli_env.c,
+      all of which needed the definition (in addition to config_detect.c) in
+      order for the configure-time hardware detection binary to be compiled
+      properly. Thanks to Minh Quan Ho for helping identify these additional
+      files as needing to be updated.
+    - Added additional comments to all four source files, most notably to
+      prompt the reader to remember to update all of the files when updating
+      any of the files. Also made the cpp code in each of the files as
+      consistent/similar as possible.
+    - Refer to issues #532 and PR #546 for more history.
+
+commit 7b39c1492067de941f81b49a3b6c1583290336fd
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Sep 20 16:13:50 2021 -0500
+
+    Reverted fb93d24.
+    
+    Details:
+    - The latest changes in fb93d24 are still causing problems. Reverting
+      and preparing to move them to a branch.
+
+commit fb93d242a4fef4694ce2680436da23087bbdd5fe
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Sep 20 15:42:08 2021 -0500
+
+    Re-enable and fix 8e0c425 (BLIS_ENABLE_SYSTEM).
+    
+    Details:
+    - Re-enable the changes originally made in 8e0c425 but quickly reverted
+      in 2be78fc.
+    - Moved the #include of bli_config.h so that it occurs before the
+      #include of bli_system.h. This allows the #define BLIS_ENABLE_SYSTEM
+      or #define BLIS_DISABLE_SYSTEM in bli_config.h to be processed by the
+      time it is needed in bli_system.h. This change should have been
+      in the original 8e0c425, but was accidentally omitted. Thanks to Minh
+      Quan Ho for catching this.
+    - Add #define BLIS_ENABLE_SYSTEM to config_detect.c so that the proper
+      cpp conditional branch executes in bli_system.h when compiling the
+      hardware detection binary. The changes made in 8e0c425 were an attempt
+      to support the definition of BLIS_OS_NONE when configuring with
+      --disable-system (in issue #532).  That commit failed because, aside
+      from the required but omitted header reordering (second bullet above),
+      AppVeyor was unable to compile the hardware detection binary as a
+      result of missing Windows headers. This commit, which builds on PR
+      #546, should help fix that issue. Thanks to Minh Quan Ho for his
+      assistance and patience on this matter.
+
+commit eaa554aa52b879d181fdc87ba0bfad3ab6131517
+Author: Minh Quan HO <minh-quan.ho@kalray.eu>
+Date:   Wed Sep 15 15:39:36 2021 +0200
+
+    bli_error: more cleanup on the error strings array
+    
+    - There was redundancy between the macro BLIS_MAX_NUM_ERR_MSGS (=200) and
+      the enum BLIS_ERROR_CODE_MAX (-170), while they both mean the same thing:
+      the maximal number of error codes/messages.
+    - The previous initialization of error messages at compile time ignored that
+      the 'bli_error_string' array still occupies useless memory due to 2D char[][]
+      declaration. Instead, it should be just an array of pointers, pointing at
+      strings in .rodata section.
+    - This commit does the two modifications:
+       * retired macros BLIS_MAX_NUM_ERR_MSGS and BLIS_MAX_ERR_MSG_LENGTH everywhere
+       * switch bli_error_string from char[][] to char *[] to reduce its footprint
+         from 40KB (200*200) to 1.3KB (170*sizeof(char*)).
+         (No problem to use the enum BLIS_ERROR_CODE_MAX at compile-time,
+         since compiler is smart enough to determine its value is 170.)
+
+commit 52f29f739dbbb878c4cde36dbe26b82847acd4e9
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Sep 17 08:38:29 2021 -0500
+
+    Removed last vestige of #define BLIS_NUM_ARCHS.
+    
+    Details:
+    - Removed the commented-out #define BLIS_NUM_ARCHS in bli_type_defs.h
+      and its associated (now outdated) comments. BLIS_NUM_ARCHS has been
+      part of the arch_t enum for some time now, and so this change is
+      mostly about removing any opportunity for confusion for people who
+      may be reading the code. Thanks to Minh Quan Ho for leading me to
+      this cleanup.
+
+commit 849aae09f4fbf8d7abf11f4df1471f1d057e874b
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Sep 16 14:47:45 2021 -0500
+
+    Added new packm var3 to 'gemmlike'.
+    
+    Details:
+    - Defined a new packm variant for the 'gemmlike' sandbox. This new
+      variant (bls_l3_packm_var3.c) parallelizes the packing operation over
+      the k dimension rather than the m or n dimensions. Note that the
+      gemmlike implementation still uses var1 by default, and use of the new
+      code would require changing bls_l3_packm_a.c and/or bls_l3_packm_b.c
+      so that var3 is called instead. Thanks to Jeff Diamond for proposing
+      this (perhaps NUMA-friendly) solution.
+
+commit b6f71fd378b7cd0cdc5c780e0b8c975a7abde998
+Merge: 9293a68e e3dc1954
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Thu Sep 16 12:24:33 2021 -0500
+
+    Merge pull request #544 from flame/haswell-gemmsup-fpe
+    
+    Fix more copy-paste errors in the haswell gemmsup code.
+
+commit e3dc1954ffb5eee2a8b41fce85ba589f75770eea
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Thu Sep 16 10:59:37 2021 -0500
+
+    Fix problem where uninitialized registers are included in vhaddpd in the Mx1 gemmsup kernels for haswell.
+    
+    The fix is to use the same (valid) source register twice in the horizontal addition.
+
+commit 5191c43faccf45975f577c60b9089abee25722c9
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Thu Sep 16 10:16:17 2021 -0500
+
+    Fix more copy-paste errors in the haswell gemmsup code.
+    
+    Fixes #486.
+
+commit 30c29b256ef13f0141ca9e9169cbdc7a45ce3a61
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Sep 16 05:01:03 2021 +0900
+
+    Arm SVE Exclude SVE-Intrinsic Kernels for GCC 8-9
+    
+    Affected configs: a64fx.
+
+commit bffa85be59dece8e756b9444e762f18892c06ee1
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Sep 16 04:31:45 2021 +0900
+
+    Arm SVE: Correct PACKM Ker Name: Intrinsic Kers
+    
+    SVE-Intrinsic-based kernels ought not to use asm in their names.
+
+commit 9293a68eb6557a9ea43a846435908c3d52d4218b
+Merge: ade10f42 98ce6e8b
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri Sep 10 14:13:29 2021 -0500
+
+    Merge pull request #534 from flame/cxx_test
+    
+    Add test to Travis using C++ compiler to make sure blis.h is C++-compatible
+
+commit 98ce6e8bc916e952510872caa60d818d62a31e69
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri Sep 10 14:12:13 2021 -0500
+
+    Do a fast test on OSX. [ci skip]
+
+commit c76fcad0c2836e7140b6bef3942e0a632a5f2cda
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri Sep 10 13:57:02 2021 -0500
+
+    Fix AArch64 tests and consolidate some other tests.
+
+commit e486d666ffefee790d5e39895222b575886ac1ea
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri Sep 10 13:50:16 2021 -0500
+
+    Use C++ cross-compiler for ARM tests.
+
+commit fbb3560cb8e2aeab205c47c2b096d4fa306d93db
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri Sep 10 13:38:27 2021 -0500
+
+    Attempt to fix cxx-test for OOT builds.
+
+commit 9c0064f3f67d59263c62d57ae19605562bb87cc2
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri Sep 10 10:39:04 2021 -0500
+
+    Fix config_name in bli_arch.c
+
+commit ade10f427835d5274411cafc9618ac12966eb1e7
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Aug 27 12:47:12 2021 -0500
+
+    Updated travis-ci.org link in README.md to .com.
+
+commit 2be78fc97777148c83d20b8509e38aa1fc1b4540
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Aug 27 12:17:26 2021 -0500
+
+    Disabled (at least temporarily) commit 8e0c425.
+    
+    Details:
+    - Reverted changes in 8e0c425 due to AppVeyor build failures that we do
+      not yet understand.
+
+commit 820f11a4694aee5f234e24277aecca40885ae9d4
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Fri Aug 27 13:40:26 2021 +0900
+
+    Arm Whole GEMMSUP Call Route is Asm/Int Optimized
+    
+    - `ref2` call in `bli_gemmsup_rv_armv8a_asm_d6x8m.c` is commented out.
+    - `bli_gemmsup_rv_armv8a_asm_d4x8m.c` contains a tail `ref2` call but
+      it's not called by any upper routine.
+
+commit 8e0c4255de52a0a5cffecbebf6314aa52120ebe4
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Aug 26 15:29:18 2021 -0500
+
+    Define BLIS_OS_NONE when using --disable-system.
+    
+    Details:
+    - Modified bli_system.h so that the cpp macro BLIS_OS_NONE is defined
+      when BLIS_DISABLE_SYSTEM is defined. Otherwise, the previous OS-
+      detecting macro conditionals are considered. This change is to
+      accommodate a solution to a cross-compilation issue described in
+      #532.
+
+commit d6eb70fbc382ad7732dedb4afa01cf9f53e3e027
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Aug 26 13:12:39 2021 -0500
+
+    Updated stale calls to malloc_intl() in gemmlike.
+    
+    Details:
+    - Updated two out-of-date calls to bli_malloc_intl() within the gemmlike
+      sandbox. These calls to malloc_intl(), which resided in
+      bls_l3_decor_pthreads.c, were missing the err_t argument that the
+      function uses to report errors. Thanks to Jeff Diamond for helping
+      isolate this issue.
+
+commit 2f7325b2b770a15ff8aaaecc087b22238f0c67b7
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Aug 23 15:04:05 2021 -0500
+
+    Blacklist clang10/gcc9 and older for 'armsve'.
+    
+    Details:
+    - Prohibit use of clang 10.x and older or gcc 9.x and older for the
+      'armsve' subconfiguration. Addresses issue #535.
+
+commit 7e2951e61fda1c325d6a76ca9956253482d84924
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Mon Aug 23 17:06:44 2021 +0900
+
+    Arm: DGEMMSUP `Macro' Edge Cases Stop Calling Ref
+    
+    Ref cannot handle panel strides (packed cases) thus cannot be called
+    from the beginning of `gemmsup` (i.e. cannot be dispatch target of
+    gemmsup to other sizes.)
+
+commit 4fd82b0e9348553d83e258bd4969e49a81f8fcf0
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Mon Aug 23 05:18:32 2021 +0900
+
+    Header Typo
+
+commit 35409ebe67557c0e7cf5ced138c8166c9c1c909f
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Mon Aug 23 04:51:47 2021 +0900
+
+    Arm: DGEMMSUP ??r(rv) Invoke Edge Size
+    
+    Plus some fix at edges.
+    
+    TODO: Should ensure that no ref kernel appears at the beginning of gemmsup
+    kernels, as ref does not recognise panel stride.
+
+commit a361492c24fdd919ee037763fc6523e8d7d2967a
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Mon Aug 23 01:13:39 2021 +0900
+
+    Arm: DGEMMSUP ?rc(rd) Invoke Edge Size
+
+commit eaea67401c2ab31f2e51eede59725f64c1a21785
+Merge: 5fc65cdd e320ec6d
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sat Aug 21 16:09:31 2021 -0500
+
+    Merge branch 'master' into cxx_test
+
+commit 5fc65cdd9e4134c5dcb16d21cd4a79ff426ca9f3
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sat Aug 21 15:59:27 2021 -0500
+
+    Add test to Travis using C++ compiler to make sure blis.h is C++-compatible.
+
+commit e320ec6d5cd44e03cb2e2faa1d7625e84f76d668
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Aug 20 17:15:20 2021 -0500
+
+    Moved lang defs from _macro_def.h to _lang_defs.h.
+    
+    Details:
+    - Moved miscellaneous language-related definitions, including defs
+      related to the handling of the 'restrict' keyword, from the top half
+      of bli_macro_defs.h into a new file, bli_lang_defs.h, which is now
+      #included immediately after "bli_system.h" in blis.h. This change is
+      an attempt to fix a report of recent breakage of C++ compilers due
+      to the recent introduction of 'restrict' in bli_type_defs.h (which
+      previously was being included *before* bli_macro_defs.h and its
+      restrict handling therein). Thanks to Ivan Korostelev for reporting
+      this issue in #527.
+    - CREDITS file update.
+
+commit e6799b26a6ecf1e80661a77d857d1c9e9adf50dc
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sat Aug 21 02:39:38 2021 +0900
+
+    Arm: Implement GEMMSUP Fallback Method
+    
+    bli_dgemmsup_rv_armv8a_int_6x4mn
+
+commit 7d5903d8d7570090eb37c592094424d1c64805d1
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sat Aug 21 01:55:50 2021 +0900
+
+    Arm64 Fix: Support Alpha/Beta in GEMMSUP Intrin
+    
+    Forgot to support `alpha`/`beta` in gemmsup_armv8a_int.
+
+commit 3b275f810b2479eb5d6cf2296e97a658cf1bb769
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Aug 19 16:06:46 2021 -0500
+
+    Minor tweaks to gemmlike sandbox.
+    
+    Details:
+    - In the gemmlike sandbox, changed the loop index variable of inner
+      loop of packm_cxk() from 'd' to 'i' (and likewise for the
+      corresponding inlined code within packm_var2()).
+    - Pack matrices A and B using packm_var1() instead of packm_var2().
+
+commit 3eccfd456e7e84052c9a429dcde1183a7ecfaa48
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Aug 19 13:22:10 2021 -0500
+
+    Added local _check() code to gemmlike sandbox.
+    
+    Details:
+    - Added code to the gemmlike sandbox that handles parameter checking.
+      Previously, the gemmlike implementation called bli_gemm_check(), which
+      resides within the BLIS framework proper. Certain modifications that a
+      user may wish to perform on the sandbox, such as adding a new matrix
+      or vector operand, would have required additional checks, and so these
+      changes make it easier for such a person to implement those checks for
+      their custom gemm-like operation.
+
+commit 7144230cdb0653b70035ddd91f7f41e06ad8d011
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Aug 18 13:25:39 2021 -0500
+
+    README.md citation updates (e.g. BLIS7 bibtex).
+
+commit 4a955e939044cfd2048cf9f3e33024e3ad1fbe00
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Aug 16 13:49:27 2021 -0500
+
+    Tweaks to gemmlike to facilitate 3rd party mods.
+    
+    Details:
+    - Changed the implementation in the 'gemmlike' sandbox to more easily
+      allow others to provide custom implementations of packm. These changes
+      include:
+      - Calling a local version of packm_cxk() that can be modified. This
+        version of packm_cxk() uses inlined loops in packm_cxk() rather
+        than querying the context for packm kernels (or even using scal2m).
+      - Providing two variants of packm, one of which calls the
+        aforementioned packm_cxk(), the other of which inlines the contents
+        of packm_cxk() into the variant itself, making it self-contained.
+        To switch from one to the other, simply change which function gets
+        called within bls_packm_a() and bls_packm_b().
+      - Simplified and cleaned up some variant names in both variants of
+        packm, relative to their parent code.
+
+commit 2c0b4150e40c83ea814f69ca766da74c19ed0a58
+Merge: c99fae50 4b8ed99d
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sat Aug 14 18:41:35 2021 -0500
+
+    Merge pull request #527 from flame/obj_t_makeover
+    
+    Implement proposed new function pointer fields for obj_t.
+
+commit 4b8ed99d926876fbf54c15468feae4637268eb6b
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Aug 13 15:31:10 2021 -0500
+
+    Whitespace tweaks.
+
+commit c99fae50ac3de0b5380a085aeebebfe67a645407
+Merge: e6d68bc4 4f70eb79
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri Aug 13 14:48:00 2021 -0500
+
+    Merge pull request #530 from flame/fix_clang_warnings
+    
+    Clean up some warnings that show up on clang/OSX.
+
+commit e6d68bc4fd0981bea90d7f045779cacfe53f6ae8
+Merge: 20a1c401 ec06b6a5
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri Aug 13 14:47:46 2021 -0500
+
+    Merge pull request #529 from flame/fix_make_check_dependencies
+    
+    Add dependency on the "flat" blis.h file for the BLIS and BLAS testsuite objects.
+
+commit 1772db029e10e0075b5a59d3fb098487b1ad542a
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri Aug 13 14:46:35 2021 -0500
+
+    Add row- and column-strides for A/B in obj_ukr_fn_t.
+
+commit 4f70eb7913ad3ded193870361b6da62b20ec3823
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri Aug 13 11:12:43 2021 -0500
+
+    Clean up some warnings that show up on clang/OSX.
+
+commit 3cddce1e2a021be6064b90af30022b99cbfea986
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Thu Aug 12 22:32:34 2021 -0500
+
+    Remove schema field on obj_t (redundant) and add new API functions.
+
+commit ec06b6a503a203fa0cdb23273af3c0e3afeae7fa
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Thu Aug 12 19:27:31 2021 -0500
+
+    Add dependency on the "flat" blis.h file for the BLIS and BLAS testsuite objects.
+    
+    This fixes a bug where "make -j<N> check" may fail after a change to one or more header files, or where testsuite code doesn't get properly recompiled after internal changes.
+
+commit 20a1c4014c999063e6bc1cfa605b152454c5cbf4
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Aug 12 14:44:04 2021 -0500
+
+    Disabled sanity check in bli_pool_finalize().
+    
+    Details:
+    - Disabled a sanity check in bli_pool_finalize() that was meant to alert
+      the user if a pool_t was being finalized while some blocks were still
+      checked out. However, this is exactly the situation that might happen
+      when a pool_t is re-initialized for a larger blocksize, and currently
+      bli_pool_reinit() is implemented as _finalize() followed by _init().
+      So, this sanity check is not universally appropriate. Thanks to
+      AMD-India for reporting this issue.
+
+commit e366665cd2b5ae8d7683f5ba2de345df0a41096f
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Aug 12 14:06:53 2021 -0500
+
+    Fixed stale API calls to membrk API in gemmlike.
+    
+    Details:
+    - Updated stale calls to the bli_membrk API within the 'gemmlike'
+      sandbox. This API is now called bli_pba (packed block allocator).
+      Ideally, this forgotten update would have been included as part of
+      21911d6, which is when the branch where the membrk->pba changes were
+      introduced was merged into 'master'.
+    - Comment updates.
+
+commit e38ca28689f31c5e5bd2347704dc33042e5ea176
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Fri Aug 13 03:21:19 2021 +0900
+
+    Added Apple Firestorm (A14/M1) Subconfig
+    
+    - Use the same bulk kernel as Cortex-A53 / ThunderX2;
+    - Larger block size;
+    - Use gemmsup kernels for double precision.
+
+commit 3df0e9b653fbb1293cad93010273eea579e753d9
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sat Jul 17 04:21:53 2021 +0900
+
+    Arm64 8x4 Kernel Use Less Regs
+
+commit 4e7e225057a05b9722ce65ddf75a9c31af9fbf36
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Wed Jun 9 15:46:36 2021 +0900
+
+    Armv8-A Supplementary GEMMSUP Sizes for RD
+
+commit c792d506ba09530395c439051727631fd164f59a
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sat Jun 5 04:20:24 2021 +0900
+
+    Armv8-A Fix GEMMSUP-RD Kernels on GNU Asm
+    
+    Suffixed NEON opcode is not supported by GNU assembler
+
+commit ce4473520975c2c8790c82c65a69d75f8ad758ea
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sat Jun 5 04:08:14 2021 +0900
+
+    Armv8-A Adjust Types for PACKM Kernels
+    
+    GCC does not have full NEON intrinsics support.
+
+commit 8a32d19af85b61af92fcab1c316fb3be1a8d42ce
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sat Jun 5 03:31:30 2021 +0900
+
+    Armv8-A GEMMSUP-RD 6x8m
+    
+    Armv8-A now has a complete set of GEMMSUP kernels.
+
+commit afd0fa6ad1889ed073f781c8aa8635f99e76b601
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sat Jun 5 01:19:01 2021 +0900
+
+    Armv8-A GEMMSUP-RD 6x8n
+
+commit 3c5f7405148ab142dee565d00da331d95a7a07b9
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Fri Jun 4 21:50:51 2021 +0900
+
+    Armv8-A s/d Packing Kernels Fix Typo
+    
+    For GCC.
+
+commit 49b05df7929ec3abc0d27b475d2d406116fe2682
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Fri Jun 4 18:04:59 2021 +0900
+
+    Armv8-A Introduced s/d Packing Kernels
+    
+    Sizes according to the 2014 kernels.
+
+commit c3faf93168c3371ff48a2d40d597bdb27021cad4
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Jun 3 23:09:05 2021 +0900
+
+    Armv8-A DGEMMSUP 6x8m Kernel
+    
+    Recommended kernels set:
+      ...
+      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
+      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
+      BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
+      BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
+      BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
+      BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
+      ...
+      bli_blksz_init     ( &blkszs[ BLIS_MR ],    -1,     6,    -1,    -1,
+                                                  -1,     8,    -1,    -1 );
+      bli_blksz_init_easy( &blkszs[ BLIS_NR ],    -1,     8,    -1,    -1 );
+      ...
+
+commit 3efe707b5500954941061d4c2363d6ed41d17233
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Jun 3 17:20:57 2021 +0900
+
+    Armv8-A DGEMMSUP Adjustments
+
+commit 8ed8f5e625de9b77a0f14883283effe79af01771
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Jun 3 16:37:37 2021 +0900
+
+    Armv8-A Add More DGEMMSUP
+    
+    - Add 6x8 GEMMSUP.
+    - Adjust prefetching.
+    - Workaround for Clang's inability to handle reg clobbering.
+    - Subproduct 6x8 row-major GEMM <- incomplete.
+
+commit a9ba79ea14de3b5a271e5970cb473d3c52e2fa5f
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Wed Jun 2 15:04:29 2021 +0900
+
+    Armv8-A Add GEMMSUP 4x8n Kernel
+    
+    - Compile w/ both GCC & Clang.
+    - Edge cases use ref-kernels.
+    - Can give performance boost in some contexts.
+
+commit df40efe8fbfd399d76c6000ec03791a9b76ffbdf
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Wed Jun 2 00:04:20 2021 +0900
+
+    Armv8-A Add Part of GEMMSUP 8x4m Kernel
+    
+    - Compile w/ both GCC & Clang
+    - Only block part is implemented. Edge cases WIP
+    - Not Optimal kernel scheme. Should do 4x8 instead
+
+commit 66399992881316514f64d68ec9eb60a87d53f674
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sat May 29 05:52:05 2021 +0900
+
+    Armv8A DGEMM 4x4 Kernel WIP. Slow
+    
+    Quite slow.
+
+commit a29c16394ccef02d29141c79b71fb408e20073e6
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sat May 29 04:58:45 2021 +0900
+
+    Armv8-A Add 8x4 Kernel WIP
+    
+    Test result: a bit lower GFLOPs than 6x8.
+
+commit 64a1f786d58001284aa4f7faf9fae17f0be7a018
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Aug 11 17:53:12 2021 -0500
+
+    Implement proposed new function pointer fields for obj_t.
+    
+    The added fields:
+    1. `pack_t schema`: storing the pack schema on the object allows the macrokernel to act accordingly without side-channel information from the rntm_t and cntx_t. The pack schema and "pack_[ab]" fields could be removed from those structs.
+    2. `void* user_data`: this field can be used to store any sort of additional information provided by the user. The pointer is propagated to submatrix objects and copies, but is otherwise ignored by the framework and the default implementations of the following three fields. User-specified pack, kernel, or ukr functions can do whatever they want with the data, and the user is 100% responsible for allocating, assigning, and freeing this buffer.
+    3. `obj_pack_fn_t pack`: the function called when a matrix is packed. This function receives the expected arguments, as well as a mdim_t and mem_t* as memory must be allocated inside this function, and behavior may differ based on which matrix is being packed (i.e. transposition for B). This could also be achieved by passing a desired pack schema, but this would require additional information to travel down the control tree.
+    4. `obj_ker_fn_t ker`: the function called when we get to the "second loop", or the macro-kernel. Behavior may depend on the pack schemas of the input matrices. The default implementation would perform the inner two loops around the ukr, and then call either the default ukr or a user-supplied one (next field).
+    5. `obj_ukr_fn_t ukr`: the function called by the default macrokernel. This would replace the various current "virtual" microkernels, and could also be used to supply user-defined behavior. Users could supply both a custom kernel (above) and microkernel, although the user-specified kernel does **not** necessarily have to call the ukr function specified on the obj_t.
+    
+    Note that no macros or functions for accessing these new fields have been defined yet. That is next once these are finalized. Addresses https://github.com/flame/blis/projects/1#card-62357687.
+
+commit a32257eeab2e9946e71546a05a1847a39341ec6b
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Aug 5 16:23:02 2021 -0500
+
+    Fixed bli_init.c compile-time error on OSX clang.
+    
+    Details:
+    - Fixed a compile-time error in bli_init.c when compiling with OSX's
+      clang. This error was introduced in 868b901, which introduced a
+      post-declaration struct assignment where the RHS was a struct
+      initialization expression (i.e. { ... }). This use of struct
+      initializer expressions apparently works with gcc despite it not
+      being strict C99. The fix included in this commit declares a temporary
+      variable for the purposes of being initialized to the desired value,
+      via the struct initializer, and then copies the temporary struct (via
+      '=' struct assignment) to the persistent struct. Thanks to Devin
+      Matthews for his help with this.
+
+commit c8728cfbd19ecde9d43af05829e00bcfe7d86eed
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Aug 5 15:17:09 2021 -0500
+
+    Fixed configure breakage on OSX clang.
+    
+    Details:
+    - Accept either 'clang' or 'LLVM' in vendor string when grepping for
+      the version number (after determining that we're working with clang).
+      Thanks to Devin Matthews for this fix.
+
+commit 868b90138e64c873c780d9df14150d2a370a7a42
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Aug 4 18:31:01 2021 -0500
+
+    Fixed one-time use property of bli_init() (#525).
+    
+    Details:
+    - Fixes a rather obvious bug that resulted in segmentation fault
+      whenever the calling application tried to re-initialize BLIS after
+      its first init/finalize cycle. The bug resulted from the fact that
+      the bli_init.c APIs made no effort to allow bli_init() to be called
+      subsequent times at all due to it, and bli_finalize(), being
+      implemented in terms of pthread_once(). This has been fixed by
+      resetting the pthread_once_t control variable for initialization
+      at the end of bli_finalize_apis(), and by resetting the control
+      variable for finalization at the end of bli_init_apis(). Thanks to
+      @lschork2 for reporting this issue (#525), and to Minh Quan Ho and
+      Devin Matthews for suggesting the chosen solution.
+    - CREDITS file update.
+
+commit 8dba1e752c6846a85dea50907135bbc5cbc54ee5
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Jul 27 12:38:24 2021 -0500
+
+    CREDITS file update.
+
+commit cc9206df667b7c710b57b190b8ad351176de53b8
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Jul 16 15:48:37 2021 -0500
+
+    Added Graviton2 Neoverse N1 performance results.
+    
+    Details:
+    - Added single-threaded and multithreaded performance results to
+      docs/Performance.md. These results were gathered on a Graviton2
+      Neoverse N1 server. Special thanks to Nicholai Tukanov for
+      collecting these results via the Arm-HPC/AWS hackathon.
+    - Corrected what was supposed to be a temporary tweak to the legend
+      labels in test/3/octave/plot_l3_perf.m.
+
+commit fab5c86d68137b59800715efb69214c0a7e458a7
+Merge: 84f9dcd4 d073fc9a
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Tue Jul 13 16:46:21 2021 -0500
+
+    Merge pull request #516 from nicholaiTukanov/p10-sandbox-rework
+    
+    P10 sandbox rework
+
+commit 84f9dcd449fa7a4cf4087fca8ec4ca0d10e9b801
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Tue Jul 13 16:45:44 2021 -0500
+
+    Remove unnecessary windows/zen2 directory.
+
+commit 21911d6ed3438ca4ba942d05851ba5d7e9835586
+Merge: 17729cf4 689fa0f4
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Jul 9 18:10:46 2021 -0500
+
+    Merge branch 'dev'
+
+commit 17729cf449919d1db9777cea5b65d2efc77e2692
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri Jul 9 14:59:48 2021 -0500
+
+    Add vzeroupper to Haswell microkernels. (#524)
+    
+    Details:
+    - Added vzeroupper instruction to the end of all 'gemm' and 'gemmtrsm'
+      microkernels so as to avoid a performance penalty when mixing AVX
+      and SSE instructions. These vzeroupper instructions were once part
+      of the haswell kernels, but were inadvertently removed during a source
+      code shuffle some time ago when we were managing duplicate 'haswell'
+      and 'zen' kernel sets. Thanks to Devin Matthews for tracking this down
+      and re-inserting the missing instructions.
+
+commit c9a7f59aa84daa54d8f8c771f1f1ef2bd8730da2
+Merge: 75f03907 9a8e649c
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Thu Jul 8 14:00:38 2021 -0500
+
+    Merge pull request #522 from flame/windows-avx512
+    
+    Fix Win64 AVX512 bug.
+
+commit 9a8e649c5ac89eba951bbee7136ca28aeb24d731
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Jul 7 15:23:57 2021 -0500
+
+    Fix Win64 AVX512 bug.
+    
+    Use `-march=haswell` for kernels. Fixes #514.
+
+commit 75f03907c58385b656c8bd35d111db245814a9f3
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Jul 7 15:44:11 2021 -0500
+
+    Add comment about make checkblas on Windows
+    
+    [ci skip]
+
+commit 4651583b1204a965e4aa672c7ad6de60f3ab1600
+Merge: 69205ac2 174f7fc9
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Jul 7 01:11:20 2021 -0500
+
+    Merge pull request #520 from flame/travis-ci-install
+    
+    Test installation in Travis CI
+
+commit 69205ac266947723ad4d7bb028b7521fe5c76991
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Jul 6 20:39:22 2021 -0500
+
+    CREDITS file update.
+    
+    Details:
+    - Thanks to Chengguo Sun for submitting #515 (5ef7f68).
+    - Thanks to Andrew Wildman for submitting #519 (551c6b4).
+    - Whitespace update to configure (spaces to tabs).
+
+commit 174f7fc9a11712c7bd1a61510bdc5c262b3e8e1f
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Tue Jul 6 19:35:55 2021 -0500
+
+    Test installation in Travis CI
+
+commit 551c6b4ee8cd9dd2e1d1b46c8dde09eb50b91b2c
+Merge: 78eac6a0 f648df4e
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Tue Jul 6 19:32:53 2021 -0500
+
+    Merge pull request #519 from awild82/oot_build_bugfix
+    
+    Fix installation from out-of-tree builds
+
+commit f648df4e5588f069b2db96f8be320ead0c1967ef
+Author: Andrew Wildman <apw4@uw.edu>
+Date:   Tue Jul 6 16:35:12 2021 -0700
+
+    Add symlink to blis.pc.in for out-of-tree builds
+
+commit 78eac6a0ab78c995c3f4e46a9e87388b5c3e1af6
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Tue Jul 6 11:05:43 2021 -0500
+
+    Revert "Always run `make check`."
+    
+    This reverts commit a201a53440c51244739aaee20e3309b50121cc68.
+
+commit a201a53440c51244739aaee20e3309b50121cc68
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Jul 5 21:39:18 2021 -0500
+
+    Always run `make check`.
+    
+    I'm concerned that problems may lurk for `x86_64` builds on Windows which may be uncovered by a fuller `make check`.
+
+commit 5ef7f684dc75fc707c82f919e0836615f90a2627
+Merge: aaa10c87 ad6231cc
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Jul 5 21:35:07 2021 -0500
+
+    Merge pull request #515 from chengguosun/bug-fix
+    
+    Fixed configure script bug.
+
+commit ad6231cca3fc1e477752ecd31b1ee2323398a642
+Author: sunchengguo <sunchengguo@higon.com>
+Date:   Tue Jul 6 07:30:00 2021 -0400
+
+    Fixed configure script bug.
+    Details:
+    - Fixed kernel list string substitution error by adding function substitute_words in configure script.
+      If the string contains zen and zen2, and zen needs to be replaced with another string, then zen2
+      would also be incorrectly replaced.
+
+commit d073fc9acac9d702556cab9fbbb3a253eeb1f998
+Author: nicholaiTukanov <nicholaitukanov@gmail.com>
+Date:   Fri Jul 2 19:54:33 2021 -0500
+
+    Update POWER10.md
+
+commit 907226c0af4afb6323b4e02be4f73f5fb89cddaf
+Author: nicholaiTukanov <nicholaitukanov@gmail.com>
+Date:   Fri Jul 2 19:47:18 2021 -0500
+
+    Rework POWER10 sandbox
+    
+    - Add a testsuite for gathering performance (in GFLOPs) and measuring correctness for the POWER10 GEMM reduced precision/integer kernels.
+    - Reworked GENERIC_GEMM template to hardcode the cache parameters.
+    - Remove kernel wrapper that only allowed matrices that weren't transposed or conjugated. However, the kernels still assume the matrices are not transposed. This wrapper was removed for performance reasons.
+    - Renamed and restructured files and functions for clarity.
+    - Edited the POWER10 document to reflect new changes.
+
+commit aaa10c87e19449674a4ca30fa3b6392bb22c3a66
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Jun 21 17:53:52 2021 -0500
+
+    Skip clearing temp microtile in gemmlike sandbox.
+    
+    Details:
+    - Removed code from gemmlike sandbox files bls_gemm_bp_var1.c and
+      bls_gemm_bp_var2.c that initializes the elements of the temporary
+      microtile to zero. This code, introduced recently in 7f7d726, did
+      not actually fix any bug (despite that commit's log entry). The
+      microtile does not need to be initialized because it is completely
+      overwritten by a "beta = 0" invocation of gemm prior to it being
+      read. Any NaNs or Infs present at the outset would have no impact
+      on the output matrix C. Thanks to Devin Matthews for reminding me
+      of this.
+
+commit bc10a3f2ff518360c32bea825b3eb62a9e4c8a77
+Merge: bf727636 6548ceba
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri Jun 18 19:01:08 2021 -0500
+
+    Merge pull request #492 from flame/thunderx2-clang
+    
+    Allow clang for ThunderX2 config
+
+commit bf727636632a368f3247dc8ab1d4b6119e9c511a
+Merge: e28f2a2d 5fc93e28
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri Jun 18 18:59:43 2021 -0500
+
+    Merge pull request #506 from xrq-phys/arm64-mac
+    
+    BLIS on Darwin_Aarch64
+
+commit e28f2a2dfcff14e7094fce0b279b3a917b3ab98c
+Merge: d10e05bb 56ffca6a
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Tue Jun 15 19:35:07 2021 -0500
+
+    Merge pull request #513 from nicholaiTukanov/asm_warning_p9_fix
+    
+    Fix assembler warning in POWER9 DGEMM
+
+commit 56ffca6a9bc67432a7894298739895f406e5f467
+Author: nicholai <nicholai@ibm.com>
+Date:   Tue Jun 15 18:17:39 2021 -0500
+
+    Fix asm warning
+
+commit 689fa0f40399bde1acc5367d6dd4e8fc4eb6f3ea
+Merge: b683d01b d10e05bb
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Sun Jun 13 19:44:14 2021 -0500
+
+    Merge branch 'master' into dev
+
+commit d10e05bbd1ce45ce2c0dfe5c64daae2633357b3f
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Sun Jun 13 19:36:16 2021 -0500
+
+    Sandbox header edits trigger full library rebuild.
+    
+    Details:
+    - Adjusted the top-level Makefile so that any change to a sandbox header
+      file will result in blis.h being regenerated along with a full
+      recompilation of the library. Previously, sandbox files were omitted
+      from the list of header files that, when touched, could trigger a full
+      rebuild. Why was it like that previously? Because originally we only
+      envisioned using sandboxes to *replace* gemm, not augment the library
+      with new functionality. When replacing gemm, blis.h does not need to
+      contain any local sandbox definitions in order for the user to be able
+      to (indirectly) use that sandbox. But if you are adding functions to
+      the library, those functions need to be prototyped so the compiler
+      can perform type checking against the user's invocation of those new
+      functions. Thanks to Jeff Diamond for helping us discover this
+      deficiency in the build system.
+
+commit 7c3eb44efaa762088c190bb820ef6a3c87db8f65
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Jun 2 11:28:22 2021 -0500
+
+    Add vhsubpd/vhsubpd.
+    
+    Horizontal subtraction instructions added to bli_x86_asm_macros.h, currently unused [ci skip].
+
+commit 7f7d72610c25f511ba8cd2a53be7b59bdb80f3f3
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon May 31 16:50:18 2021 -0500
+
+    Fixed bugs in cpackm kernels, gemmlike code.
+    
+    Details:
+    - Fixed intermittent bugs in bli_packm_haswell_asm_c3xk.c and
+      bli_packm_haswell_asm_c8xk.c whereby the imaginary component of the
+      kappa scalar was incorrectly loaded at an offset of 8 bytes (instead
+      of 4 bytes) from the real component. This was almost certainly a copy-
+      paste bug carried over from the corresponding zpackm kernels. Thanks to
+      Devin Matthews for bringing this to my attention.
+    - Added missing code to gemmlike sandbox files bls_gemm_bp_var1.c and
+      bls_gemm_bp_var2.c that initializes the elements of the temporary
+      microtile to zero. (This bug was never observed in output but rather
+      noticed analytically. It probably would have also manifested as
+      intermittent failures, this time involving edge cases.)
+    - Minor commented-out/disabled changes to testsuite/src/test_gemm.c
+      relating to debugging.
+
+commit 5fc93e280614b4a21a9cff36cf873b4b9407285b
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sat May 29 18:44:47 2021 +0900
+
+    Armv8A Rename Regs for Safe Darwin Compile
+    
+    Avoid x18 use in FP32 kernel:
+    - C address lines x[18-26] renamed to x[19-27] (reg index +1)
+    - Original role of x27 fulfilled by x5 which is free after k-loop pert.
+    
+    FP64 does not require changing since x18 is not used there.
+
+commit 9f4a4a3cfb2244e4024445e127dafd2a11f39fc5
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sat May 29 17:21:28 2021 +0900
+
+    Armv8A Rename Regs for Clang Compile: FP32 Part
+    
+    Roughly the same as 916e1fa , additionally with x15 clobbering removed.
+    - x15: Not used at all.
+    
+    Compilation w/ Clang shows warning about x18 reservation, but
+    compilation itself is OK and all tests got passed.
+
+commit 916e1fa8be3cea0e3e2a4a7e8b00027ac2ee7780
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sat May 29 16:46:52 2021 +0900
+
+    Armv8A Rename Regs for Clang Compile: FP64 Part
+    
+    - x7, x8: Used to store address for Alpha and Beta.
+      As Alpha & Beta was not used in k-loops, use x0, x1 to load
+      Alpha & Beta's addresses after k-loops are completed, since A & B's
+      addresses are no longer needed there.
+      This "ldr [addr]; -> ldr val, [addr]" would not cause much performance
+      drawback since it is done outside k-loops and there are plenty of
+      instructions between Alpha & Beta's loading and usage.
+    - x9: Used to store cs_c. x9 is multiplied by 8 into x10 and not used
+      any longer. Directly loading cs_c into x10 and scaling by 8 spares
+      x9 straightforwardly.
+    - x11, x12: Not used at all. Simply remove from clobber list.
+    - x13: Alike x9, loaded and scaled by 8 into x14, except that x13 is
+      also used in a conditional branch so that "cmp x13, #1" needs to be
+      modified into "cmp x14, #8" to completely free x13.
+    - x3, x4: Used to store next_a & next_b. Untouched in k-loops. Load
+      these addresses into x0 and x1 after Alpha & Beta are both loaded,
+      since then neither address of A/B nor address of Alpha/Beta is needed.
+
+commit 7fabd896af773623ed01820a71bbff432e8a7d25
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sat May 29 16:28:03 2021 +0900
+
+    Asm Flag Mingling for Darwin_Aarch64
+    
+    Apple+Arm64 requires additional "tagging" of local symbols.
+
+commit 213dce32d2eed8b7a38c6a3f6112072b0a89ecd0
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri May 28 14:49:57 2021 -0500
+
+    Added a new 'gemmlike' sandbox.
+    
+    Details:
+    - Added a new sandbox called 'gemmlike', which implements sequential and
+      multithreaded gemm in the style of gemmsup but also unconditionally
+      employs packing. The purpose of this sandbox is to
+      (1) avoid select abstractions, such as objects and control trees, in
+          order to allow readers to better understand how a real-world
+          implementation of high-performance gemm can be constructed;
+      (2) provide a starting point for expert users who wish to build
+          something that is gemm-like without "reinventing the wheel."
+      Thanks to Jeff Diamond, Tze Meng Low, Nicholai Tukanov, and Devangi
+      Parikh for requesting and inspiring this work.
+    - The functions defined in this sandbox currently use the "bls_" prefix
+      instead of "bli_" in order to avoid any symbol collisions in the main
+      library.
+    - The sandbox contains two variants, each of which implements gemm via a
+      block-panel algorithm. The only difference between the two is that
+      variant 1 calls the microkernel directly while variant 2 calls the
+      microkernel indirectly, via a function wrapper, which allows the edge
+      case handling to be abstracted away from the classic five loops.
+    - This sandbox implementation utilizes the conventional gemm microkernel
+      (not the skinny/unpacked gemmsup kernels).
+    - Updated some typos in the comments of a few files in the main
+      framework.
+
+commit 82af05f54c34526a60fd2ec46656f13e1ac8f719
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue May 25 15:25:08 2021 -0500
+
+    Updated Fugaku (a64fx) performance results.
+    
+    Details:
+    - Updated the performance graphs (pdfs and pngs) for the Fugaku/a64fx
+      entry within Performance.md, and also updated the experiment details
+      accordingly. Thanks to RuQing Xu for re-running the BLIS and SSL2
+      experiments reflected in this commit.
+    - In Performance.md, added an English translation of the project name
+      under which the Fugaku results were gathered, courtesy of RuQing Xu.
+
+commit e5c85da3763f73854ecd739ba3008bb467ed77c3
+Merge: cbd8d393 5feb04e2
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon May 24 16:56:22 2021 -0500
+
+    Merge pull request #503 from flame/windows-compiler-check
+    
+    Add explicit compiler check for Windows.
+
+commit cbd8d3932599485727204479fded66ac19186db4
+Merge: 6d4ab022 932dfe6a
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon May 24 16:32:42 2021 -0500
+
+    Merge pull request #500 from xrq-phys/armsve+travis
+    
+    Upgrade Travis CI for Arm SVE
+
+commit 5feb04e233e1e6f81c727578ad9eae1367a2562f
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sun May 23 18:46:56 2021 -0500
+
+    Add explicit compiler check for Windows.
+    
+    Check the C compiler for a predefined macro `_WIN32` to indicate (cross-)compilation for Windows. Fixes #463.
+
+commit 6d4ab0223d9014ac2a66d66759536aa305be5867
+Merge: 61584ded 859fb77a
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sun May 23 18:39:53 2021 -0500
+
+    Merge pull request #502 from flame/rm-rm-dupls
+    
+    Remove `rm-dupls` function in common.mk.
+
+commit 859fb77a320a3ace71d25a8885c23639b097a1b6
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sun May 23 18:15:23 2021 -0500
+
+    Remove `rm-dupls` function in common.mk.
+    
+    AMD requested removal due to unclear licensing terms; original code was from stackoverflow. The function is unused but could easily be replaced by new implementation.
+
+commit 932dfe6abb9617223bd26a249e53447169033f8c
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu May 20 02:07:31 2021 +0900
+
+    Travis CI Revert Unnecessary Extras from 91d3636
+    
+    - Removed `V=1` in make line
+    - Removed `CFLAGS` in configure line
+    - Restored `pwd` surrounding OOT line
+
+commit bd156a210d347a073a6939cc4adab3d9256c2e2b
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sun May 16 02:56:14 2021 +0900
+
+    Adjust TravisCI
+    
+    - ArmSVE don't test gemmt (seems Qemu-only problem);
+    - Clang use TravisCI-provided version instead of fixing to clang-8
+      due to that clang-8 seems conflicting with TravisCI's clang-7.
+
+commit 91d3636031021af3712d14c9fcb1eb34b6fe2a31
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Sat May 15 17:05:16 2021 +0900
+
+    Travis Support Arm SVE
+    
+    - Updated distro to 20.04 focal aarch64-gcc-10.
+      This is minimal version required by aarch64-gcc-10.
+      SVE intrinsics would not compile without GCC >=10.
+    - x86 toolchains use official repo instead of ubuntu-toolchain-r/test.
+      20.04 focal is not supported by that PPA at the moment.
+    - Add extra configuration-time options to .travis.yml.
+    - Add Arm SVE entry to .travis.yml.
+
+commit 61584deddf9b3af6d11a811e6e04328d22390202
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Wed May 19 23:52:29 2021 +0900
+
+    Added 512b SVE-based a64fx subconfig + SVE kernels.
+    
+    Details:
+    - Added 512-bit specific 'a64fx' subconfiguration that uses empirically
+      tuned block size by Stepan Nassyr. This subconfig also sets the sector
+      cache size and enables memory-tagging code in SVE gemm kernels. This
+      subconfig utilizes (16, k) and (10, k) DPACKM kernels.
+    - Added a vector-length agnostic 'armsve' subconfiguration that computes
+      blocksizes according to the analytical model. This part is ported from
+      Stepan Nassyr's repository.
+    - Implemented vector-length-agnostic [d/s/sh] gemm kernels for Arm SVE
+      at size (2*VL, 10). These kernels use unindexed FMLA instructions
+      because indexed FMLA takes 2 FMA units in many implementations.
+      PS: There are indexed-FMLA kernels in Stepan Nassyr's repository.
+    - Implemented 512-bit SVE dpackm kernels with in-register transpose
+      support for sizes (16, k) and (10, k).
+    - Extended 256-bit SVE dpackm kernels by Linaro Ltd. to 512-bit for
+      size (12, k). This dpackm kernel is not currently used by any
+      subconfiguration.
+    - Implemented several experimental dgemmsup kernels which would
+      improve performance in a few cases. However, those dgemmsup kernels
+      generally underperform hence they are not currently used in any
+      subconfig.
+    - Note: This commit squashes several commits submitted by RuQing Xu via
+      PR #424.
+
+commit b683d01b9c4ea5f64c8031bda816beccfbf806a0
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu May 13 15:23:22 2021 -0500
+
+    Use extra #undef when including ba/ex API headers.
+    
+    Details:
+    - Inserted a "#include bli_xapi_undef.h" after each usage of the basic
+      and expert API macro setup headers: bli_oapi_ba.h, bli_oapi_ex.h,
+      bli_tapi_ba.h, and bli_tapi_ex.h. This is functionally equivalent to
+      the previous status quo, in which each header made minimal #undef
+      prior to its own definitions and then a single instance of
+      "#include bli_xapi_undef.h" cleaned up any remaining macro defs after
+      all other headers were used. This commit will guarantee that macro
+      defs from the setup of one header (say, bli_oapi_ex.h) don't "infect"
+      the definitions made in a subsequent header. As with this previous
+      commit, this change does not fix any issue but rather attempts to
+      avoid creating orphaned macro definitions that are only needed within
+      a very limited scope.
+    - Removed minimal #undef from bli_?api_[ba|ex].h.
+    - Removed old commented-out lines from bli_?api_[ba|ex].h.
+
+commit d4427a5b2f5cab5d2a64c58d87416628867c2b4a
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu May 13 13:55:11 2021 -0500
+
+    Minor preprocessor/header cleanup.
+    
+    Details:
+    - Added frame/include/bli_xapi_undef.h, which explicitly undefines all
+      macros defined in bli_oapi_ba.h, bli_oapi_ex.h, bli_tapi_ba.h, and
+      bli_tapi_ex.h. (This is for safety and good cpp coding practice, not
+      because it fixes anything.)
+    - Added #include "bli_xapi_undef.h" to bli_l1v.h, bli_l1d.h, bli_l1f.h,
+      bli_l1m.h, bli_l2.h, bli_l3.h, and bli_util.h.
+    - Comment updates to bli_oapi_ba.h, bli_oapi_ex.h, bli_tapi_ba.h, and
+      bli_tapi_ex.h.
+    - Moved frame/3/bli_l3_ft_ex.h to local 'old' directory after realizing
+      that nothing in BLIS used those function pointer types. Also commented
+      out the "#include bli_l3_ft_ex.h" directive in frame/3/bli_l3.h.
+
+commit 5aa63cd927b22a04e581b07d0b68ef391f4f9b1f
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed May 12 19:53:35 2021 -0500
+
+    Fixed typo in cpp guard in bli_util_ft.h.
+    
+    Details:
+    - Changed #ifdef BLIS_OAPI_BASIC to #ifdef BLIS_TAPI_BASIC in
+      bli_util_ft.h. This typo was causing some types to be redefined when
+      they weren't supposed to be.
+
+commit f0e8634775094584e89f1b03811ee192f2aaf67f
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed May 12 18:45:32 2021 -0500
+
+    Defined eqsc, eqv, eqm to test object equality.
+    
+    Details:
+    - Defined eqsc, eqv, and eqm operations, which set a bool depending on
+      whether the two scalars, two vectors, or two matrix operands are equal
+      (element-wise). eqsc and eqv support implicit conjugation and eqm
+      supports diagonal offset, diag, uplo, and trans parameters (in a
+      manner consistent with other level-1m operations). These operations
+      are currently housed under frame/util, at least for now, because they
+      are not computational in nature.
+    - Redefined bli_obj_equals() in terms of eqsc, eqv, and eqm.
+    - Documented eqsc, eqv, and eqm in BLISObjectAPI.md and BLISTypedAPI.md.
+      Also:
+      - Documented getsc and setsc in both docs.
+      - Reordered entry for setijv in BLISTypedAPI.md, and added separator
+        bars to both docs.
+      - Added missing "Observed object properties" clauses to various
+        level-1v entries in BLISObjectAPI.md.
+    - Defined bli_apply_trans() in bli_param_macro_defs.h.
+    - Defined supporting _check() function, bli_l0_xxbsc_check(), in
+      bli_l0_check.c for eqsc.
+    - Programming style and whitespace updates to bli_l1m_unb_var1.c.
+    - Whitespace updates to bli_l0_oapi.c, bli_l1m_oapi.c
+    - Consolidated redundant macro redefinition for copym function pointer
+      type in bli_l1m_ft.h.
+    - Added macros to bli_oapi_ba.h, _ex.h, and bli_tapi_ba.h, _ex.h that
+      allow oapi and tapi source files to forego defining certain expert
+      functions. (Certain operations such as printv and printm do not need
+      to have both basic and expert interfaces. This also includes eqsc, eqv,
+      and eqm.)
+
+commit 5d46dbee4a06ba5a422e19817836976f8574cb4f
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed May 12 18:42:09 2021 -0500
+
+    Replace bli_dlamch with something less archaic (#498)
+    
+    Details:
+    - Added new implementations of bli_slamch() and bli_dlamch() that use
+      constants from the standard C library in lieu of dynamically-computed
+      values (via code inherited from netlib). The previous implementation
+      is still available when the cpp macro BLIS_ENABLE_LEGACY_LAMCH is
+      defined by the subconfiguration at compile-time. Thanks to Devin
+      Matthews for providing this patch, and to Stefano Zampini for
+      reporting the issue (#497) that prompted Devin to propose the patch.
+
+commit 6a89c7d8f9ac3f51b5b4d8ccb2630d908d951e6f
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Sat May 1 18:54:48 2021 -0500
+
+    Defined setijv, getijv to set/get vector elements.
+    
+    Details:
+    - Defined getijv, setijv operations to get and set elements of a vector,
+      in bli_setgetijv.c and .h.
+    - Renamed bli_setgetij.c and .h to bli_setgetijm.c and .h, respectively.
+    - Added additional bounds checking to getijm and setijm to prevent
+      actions with negative indices.
+    - Added documentation to BLISObjectAPI.md and BLISTypedAPI.md for getijv
+      and setijv.
+    - Added documentation to BLISTypedAPI.md for getijm and setijm, which
+      were inadvertently missing.
+    - Added a new entry to the FAQ titled "Why does BLIS have vector
+      (level-1v) and matrix (level-1m) variations of most level-1
+      operations?"
+    - Comment updates.
+
+commit 4534daffd13ed7a8983c681d3f5e9de17c9f0b96
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Apr 27 18:16:44 2021 -0500
+
+    Minor API breakage in bli_pack API.
+    
+    Details:
+    - Changed bli_pack_get_pack_a() and bli_pack_get_pack_b() so that
+      instead of returning a bool, they set a bool that is passed in by
+      address. This does break the public exported API, but I expect very
+      few users actually use this function. (This change is being made in
+      preparation for a much more extensive commit relating to error
+      checking.)
+
+commit 6a4aa986ffc060d3e64ed230afe318b82630f8b2
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Apr 23 13:10:01 2021 -0500
+
+    Fixed typo in Table of Contents.
+
+commit f6424b5b82160d346a09a0fbb526981ecf66cdb3
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Apr 23 13:08:06 2021 -0500
+
+    Added dedicated Performance section to README.md.
+    
+    Details:
+    - Spun off the Performance.md and PerformanceSmall.md links in the
+      Documentation section into a new Performance section dedicated to
+      those two links. (The previous entries remain redundantly listed
+      within Documentation section.) Thanks to Robert van de Geijn for
+      suggesting this change.
+
+commit 40ce5fd241b9ad140bf57278d440f0598d7f15d8
+Merge: 6280757b 1f3461a5
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Apr 21 09:54:25 2021 -0500
+
+    Merge pull request #493 from cassiersg/patch-1
+    
+    Fix typo in FAQ.md
+
+commit 1f3461a5a5a88510f913451a93e3190ec1556f39
+Author: Gaëtan Cassiers <cassiersg@users.noreply.github.com>
+Date:   Wed Apr 21 16:49:05 2021 +0200
+
+    Fix typo in FAQ.md
+
+commit 6548cebaf55a1f9bdb8417cc89dd0444d8f9c2e4
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Apr 14 13:00:42 2021 -0500
+
+    Allow clang for ThunderX2 config
+    
+    Needed for compiling on e.g. Mac M1. AFAIK clang supports the same -mcpu flag for ThunderX2 as gcc.
+
+commit 6280757be32f90fd77d8dd9357b07d9306e6f80d
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Apr 7 13:03:56 2021 -0500
+
+    Minor updates to a64fx section of Performance.md.
+
+commit 1e6ed823c6cd11f9b671779f3c8bdbd2bbb40f34
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Thu Apr 8 02:59:26 2021 +0900
+
+    Additional A64fx Comments (#490)
+    
+    * Performance.md Update A64fx Comments
+    
+    - Reason for ARMPL's missing data;
+    - Additional envs / flags for kernel selection;
+    - Update BLIS SRC commit.
+    
+    * Include Another Fix in armsve-cfg-vendor
+    
+    A prototype was forgotten, causing the void* pointer to not be fully returned.
+
+commit 2688f21a5b073950f6f187c95917fdbb5aac234a
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Apr 6 19:02:37 2021 -0500
+
+    Added Fujitsu A64fx (512-bit SVE) perf results.
+    
+    Details:
+    - Added single-threaded and multithreaded performance results to
+      docs/Performance.md. These results were gathered on the "Fugaku"
+      Fujitsu A64fx supercomputer at the RIKEN Center for Computational
+      Science in Kobe, Japan. Special thanks to RuQing Xu and Stepan
+      Nassyr for their work in developing and optimizing A64fx support in
+      BLIS and RuQing for gathering the performance data that is reflected
+      in these new graphs.
+
+commit ba3ba8da83d48397162139e11337c036a631ba79
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Apr 6 18:39:58 2021 -0500
+
+    Minor updates and fixes to test/3/octave scripts.
+    
+    Details:
+    - Fixed an issue where the wrong string was being passed in for the
+      vendor legend string.
+    - Changed the graph in which the legends appear.
+    - Updates to runthese.m.
+
+commit 09bd4f4f12311131938baa9f75d27e92b664d681
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Mar 31 17:09:36 2021 -0500
+
+    Add err_t* "return" parameter to malloc functions.
+    
+    Details:
+    - Added an err_t* parameter to memory allocation functions including
+      bli_malloc_intl(), bli_calloc_intl(), bli_malloc_user(),
+      bli_fmalloc_align(), and bli_fmalloc_noalign(). Since these functions
+      already use the return value to return the allocated memory address,
+      they can't communicate errors to the caller through the return value.
+      This commit does not employ any error checking within these functions
+      or their callers, but this sets up BLIS for a more comprehensive
+      commit that moves in that direction.
+    - Moved the typedefs for malloc_ft and free_ft from bli_malloc.h to
+      bli_type_defs.h. This was done so that what remains of bli_malloc.h
+      can be included after the definition of the err_t enum. (This ordering
+      was needed because bli_malloc.h now contains function prototypes that
+      use err_t.)
+    - Defined bli_is_success() and bli_is_failure() static functions in
+      bli_param_macro_defs.h. These functions provide easy checks for error
+      codes and will be used more heavily in future commits.
+    - Unfortunately, the additional err_t* argument discussed above breaks
+      the API for bli_malloc_user(), which is an exported symbol in the
+      shared library. However, it's quite possible that the only application
+      that calls bli_malloc_user()--indeed, the reason it was marked for
+      symbol exporting to begin with--is the BLIS testsuite. And if that's
+      the case, this breakage won't affect anyone. Nonetheless, the "major"
+      part of the so_version file has been updated accordingly to 4.0.0.
+
+commit f9ad55ce7e12f59930605753959fcfd41a218d8d
+Merge: 04502492 90508192
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Mar 31 14:20:19 2021 -0500
+
+    Merge branch 'master' into dev
+
+commit 90508192f2d6ae95adc2a3ba9f4e5bad2c8d6fd2
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Tue Mar 30 21:16:44 2021 -0500
+
+    Update do_sde.sh (#489)
+    
+    Update to a newer version of SDE, and do a direct download as it seems you don't have to click-through the license anymore.
+
+commit 22c6b5dc4c9cc21942f8ccc30891f9b4385a9504
+Author: Nicholai Tukanov <nicholaitukanov@gmail.com>
+Date:   Tue Mar 30 19:07:42 2021 -0500
+
+    Fixed bug in power10 microkernel I/O. (#488)
+    
+    Details:
+    - Fixed a bug in the POWER10 DGEMM kernel whereby the microkernel did
+      not store the microtile result correctly due to incorrect indices
+      calculations. (The error was introduced when I reorganized the
+      'kernels/power10/3' directory.)
+
+commit 04502492671456b94bcdee60b9de347b6763a32d
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Sun Mar 28 19:11:43 2021 -0500
+
+    Always stay initialized after BLAS compat calls.
+    
+    Details:
+    - Removed the option to finalize BLIS after every BLAS call, which also
+      means that BLIS would initialize at the beginning of every BLAS call.
+      This option never really made sense and wasn't even implemented
+      properly to begin with. (Because bli_init_auto() and _finalize_auto()
+      were implemented in terms of bli_init_once() and _finalize_once(),
+      respectively, the application would have only been able to call one
+      BLAS routine before BLIS would find itself in an unusable, permanently
+      uninitialized state.) Because this option was never meant for regular
+      use, it never made it into configure as an actual configure-time
+      option, and therefore this commit only removes parts of the code
+      affected by the cpp macro guard BLIS_ENABLE_STAY_AUTO_INITIALIZED.
+
+commit 3a6f41afb8197e831b6ce2f1ae7f63735685fa0a
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Sat Mar 27 17:22:14 2021 -0500
+
+    Renamed membrk files/vars/functions to pba.
+    
+    Details:
+    - Renamed the files, variables, and functions relating to the packing
+      block allocator from its legacy name (membrk) to its current name
+      (pba). This more clearly contrasts the packing block allocator with
+      the small block allocator (sba).
+    - Fixed a typo in bli_pack_set_pack_b(), defined in bli_pack.c, that
+      caused the function to erroneously change the value of the pack_a
+      field of the global rntm_t instead of the pack_b field. (Apparently
+      nobody has used this API yet.)
+    - Comment updates.
+
+commit 36cb4116d15cfef2d42ec4a834efd4a958f261b5
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Sat Mar 27 15:15:09 2021 -0500
+
+    Switch allocator mutexes to static initialization.
+    
+    Details:
+    - Switched the small block allocator (sba), as defined in bli_sba.c and
+      bli_apool.c, to static initialization of its internal mutex. Did a
+      similar thing for the packing block allocator (pba), which appears as
+      global_membrk in bli_membrk.c.
+    - Commented out bli_membrk_init_mutex() and bli_membrk_finalize_mutex()
+      to ensure they won't be used in the future.
+    - In bli_thrcomm_pthreads.c and .h, removed old, commented-out cpp
+      blocks guarded by BLIS_USE_PTHREAD_MUTEX.
+
+commit 159ca6f01a5f91b93513134c9470b69ff78f5354
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Mar 24 15:57:32 2021 -0500
+
+    Made test/3/octave scripts robust to missing data.
+    
+    Details:
+    - Modified the octave scripts in test/3 so that the script does not
+      choke when one or more of the expected OpenBLAS, Eigen, or vendor data
+      files is missing. (The BLIS data set, however, must be complete.) When
+      a file is missing, that data series is simply not included on that
+      particular graph. Also factored out a lot of the redundant logic from
+      plot_panel_4x5.m into a separate function in read_data.m.
+
+commit 545e6c2f6d09d023b353002a9a43b11aa0c1d701
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Mar 22 17:42:33 2021 -0500
+
+    CHANGELOG update (0.8.1)
+
+commit 8535b3e11d2297854991c4272932ce4974dda629 (tag: 0.8.1)
 Author: Field G. Van Zee <field@cs.utexas.edu>
 Date:   Mon Mar 22 17:42:33 2021 -0500
 
     Version file update (0.8.1)
 
-commit e56d9f2d94ed247696dda2cbf94d2ca05c7fc089 (origin/master, origin/HEAD)
+commit e56d9f2d94ed247696dda2cbf94d2ca05c7fc089
 Author: Field G. Van Zee <field@cs.utexas.edu>
 Date:   Mon Mar 22 17:40:50 2021 -0500
 
@@ -163,7 +3041,7 @@ Date:   Fri Mar 5 13:53:43 2021 -0600
       information, refer to the POWER10.md document that is included in
       'sandbox/power10'.
 
-commit b8dcc5bc75a746807d6f8fa22dc2123c98396bf5 (origin/dev, origin/amd, dev, amd)
+commit b8dcc5bc75a746807d6f8fa22dc2123c98396bf5
 Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
 Date:   Tue Mar 2 06:58:24 2021 +0800
 
@@ -6796,7 +9674,7 @@ Date:   Mon Oct 15 16:37:39 2018 -0500
     - Updated frame/include/bli_x86_asm_macros.h with additional macros
       (courtsey of Devin Matthews).
 
-commit 3612ecac98a9d36c3fcd64154121d420bb69febd (origin/nested-omp-patch)
+commit 3612ecac98a9d36c3fcd64154121d420bb69febd
 Author: Field G. Van Zee <field@cs.utexas.edu>
 Date:   Thu Oct 11 15:16:41 2018 -0500
 

From 69fa915464c52f09a5971a60f521900d31a34e69 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 1 Apr 2022 08:47:46 -0500
Subject: [PATCH 048/230] Fixed broken "tagged releases" link in README.md.

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 211ebd6d5..8b355470c 100644
--- a/README.md
+++ b/README.md
@@ -357,7 +357,7 @@ This should reveal a link for downloading the zip file.
 3. **Download a source release via a tarball/zip file.**
 Alternatively, if you would like to stick to the code that is included in
 official releases, you may download either a tarball or zip file of any of
-BLIS's previous [tagged releases](https://github.com/flame/blis/releases).
+BLIS's previous [tagged releases](https://github.com/flame/blis/tags).
 We consider this option to be less than ideal for most people since it will
 likely mean you miss out on the latest bugfix or feature commits (in contrast
 to Options 1 or 2), and you also will not be able to update your code with a

From b3e674db3c05ca586b159a71deb1b61d701ae5c9 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 4 Apr 2022 17:31:02 -0500
Subject: [PATCH 049/230] README.md update to link to releases page.

---
 README.md | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8b355470c..3803acdca 100644
--- a/README.md
+++ b/README.md
@@ -345,6 +345,18 @@ to executing the following command in your terminal shell:
    ```
    git clone https://github.com/flame/blis.git
    ```
+   At this point, you will have the latest commit of the `master` branch
+checked out. If you wish to check out a particular version x.y.z, execute
+the following:
+   ```
+   git checkout x.y.z
+   ```
+   `git` will then transform your working copy to match the state of the
+commit associated with version x.y.z. You can view a list of tags at any
+time by executing:
+   ```
+   git tag --list
+   ```
 
 2. **Download a source repository via a zip file.**
 If you are uncomfortable with using `git` but would still like the latest
@@ -356,8 +368,11 @@ This should reveal a link for downloading the zip file.
 
 3. **Download a source release via a tarball/zip file.**
 Alternatively, if you would like to stick to the code that is included in
-official releases, you may download either a tarball or zip file of any of
-BLIS's previous [tagged releases](https://github.com/flame/blis/tags).
+official releases, you may download either a tarball or zip file of BLIS's
+latest [release](https://github.com/flame/blis/releases). Some older releases
+are only available as [tagged](https://github.com/flame/blis/tags) commits.
+(Note: downloading release x.y.z is equivalent to downloading, or checking out,
+tag `x.y.z`.)
 We consider this option to be less than ideal for most people since it will
 likely mean you miss out on the latest bugfix or feature commits (in contrast
 to Options 1 or 2), and you also will not be able to update your code with a

From ae10d9495486f589ed0320f0151b2d195574f1cf Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Wed, 6 Apr 2022 20:31:11 -0500
Subject: [PATCH 050/230] Simplify and rewrite reference packm kernels. (#610)

Details:
- Reorganized the way kernels are stored within the cntx_t structure so
  that rather than having a function pointer for every supported size of
  unrolled packm kernel (2xk, 3xk, 4xk, etc.), we store only two packm
  kernels per datatype: one to pack MRxk micropanels and one to pack
  NRxk micropanels.
  - NOTE: The "bb" (broadcast B) reference kernels have been merged into
    the "standard" kernels (packm [including 1er and unpackm], gemm,
    trsm, gemmtrsm). This replication factor is controlled by
    BLIS_BB[MN]_[sdcz] etc. Power9/10 needs testing since only a
    replication factor of 1 has been tested. armsve also needs testing
    since the MR value isn't available as a macro.
- Simplified the bli_cntx_*() APIs to conform to the new unified kernel
  array within the cntx_t. Updated existing bli_cntx_init_<subconfig>()
  function definitions for all subconfigurations.
- Consolidated all kernel id types (e.g. l1vkr_t, l1mkr_t, l3ukr_t,
  etc.) into one kernel id type: ukr_t.
- Various edits, updates, and rewrites of reference kernels pursuant to
  the aforementioned changes.
- Define compile-time macro constants (BLIS_MR_[sdcz], BLIS_NR_[sdcz],
  and friends) in bli_kernel_macro_defs.h, but only when the macro
  BLIS_IN_REF_KERNEL is defined by the build system.
- Loose ends:
  - Still need to update documentation, including:
    - docs/ConfigurationHowTo.md
    - docs/KernelsHowTo.md
    to reflect changes made in this commit.
---
 addon/gemmd/attic/bao_gemmd_bp_var2.c         |    8 +-
 addon/gemmd/bao_gemmd.c                       |    2 +-
 addon/gemmd/bao_gemmd_bp_var1.c               |    2 +-
 addon/gemmd/bao_packm_cxk.c                   |    6 +-
 common.mk                                     |    4 +
 config/a64fx/bli_cntx_init_a64fx.c            |  100 +-
 config/a64fx/bli_kernel_defs_a64fx.h          |   52 +
 config/armsve/bli_cntx_init_armsve.c          |  119 +-
 config/armsve/bli_kernel_defs_armsve.h        |   58 +
 config/bgq/bli_cntx_init_bgq.c                |   35 +-
 config/bgq/bli_kernel_defs_bgq.h              |   48 +
 config/bulldozer/bli_cntx_init_bulldozer.c    |   41 +-
 config/bulldozer/bli_kernel_defs_bulldozer.h  |   52 +
 config/cortexa15/bli_cntx_init_cortexa15.c    |   35 +-
 config/cortexa15/bli_kernel_defs_cortexa15.h  |   48 +
 config/cortexa53/bli_cntx_init_cortexa53.c    |   35 +-
 config/cortexa53/bli_kernel_defs_cortexa53.h  |   48 +
 config/cortexa57/bli_cntx_init_cortexa57.c    |   35 +-
 config/cortexa57/bli_kernel_defs_cortexa57.h  |   48 +
 config/cortexa9/bli_cntx_init_cortexa9.c      |   35 +-
 config/cortexa9/bli_kernel_defs_cortexa9.h    |   48 +
 config/excavator/bli_cntx_init_excavator.c    |   41 +-
 config/excavator/bli_kernel_defs_excavator.h  |   52 +
 config/firestorm/bli_cntx_init_firestorm.c    |  143 +-
 config/firestorm/bli_kernel_defs_firestorm.h  |   48 +
 config/generic/bli_kernel_defs_generic.h      |   42 +
 config/haswell/bli_cntx_init_haswell.c        |  247 +-
 config/haswell/bli_kernel_defs_haswell.h      |   52 +
 config/knc/bli_cntx_init_knc.c                |   34 +-
 config/knc/bli_kernel_defs_knc.h              |   48 +
 config/knl/bli_cntx_init_knl.c                |   71 +-
 config/knl/bli_kernel_defs_knl.h              |   48 +
 config/old/armv7a/bli_cntx_init_armv7a.c      |    2 +-
 config/old/haswellbb/bli_cntx_init_haswell.c  |    2 +-
 config/penryn/bli_cntx_init_penryn.c          |   47 +-
 config/penryn/bli_kernel_defs_penryn.h        |   48 +
 config/piledriver/bli_cntx_init_piledriver.c  |   41 +-
 .../piledriver/bli_kernel_defs_piledriver.h   |   52 +
 config/power10/bli_cntx_init_power10.c        |  102 +-
 config/power10/bli_kernel_defs_power10.h      |   51 +
 config/power7/bli_cntx_init_power7.c          |   32 +-
 config/power7/bli_kernel_defs_power7.h        |   46 +
 config/power9/bli_cntx_init_power9.c          |  101 +-
 config/power9/bli_kernel_defs_power9.h        |   49 +
 .../sandybridge/bli_cntx_init_sandybridge.c   |   41 +-
 .../sandybridge/bli_kernel_defs_sandybridge.h |   52 +
 config/skx/bli_cntx_init_skx.c                |   53 +-
 config/skx/bli_kernel_defs_skx.h              |   48 +
 .../steamroller/bli_cntx_init_steamroller.c   |   41 +-
 .../steamroller/bli_kernel_defs_steamroller.h |   52 +
 config/template/bli_cntx_init_template.c      |   53 +-
 config/template/bli_kernel_defs_template.h    |   60 +
 config/thunderx2/bli_cntx_init_thunderx2.c    |   35 +-
 config/thunderx2/bli_kernel_defs_thunderx2.h  |   48 +
 config/zen/bli_cntx_init_zen.c                |  337 +--
 config/zen/bli_kernel_defs_zen.h              |   52 +
 config/zen2/bli_cntx_init_zen2.c              |  309 +--
 config/zen2/bli_kernel_defs_zen2.h            |   52 +
 config/zen3/bli_cntx_init_zen3.c              |  344 +--
 config/zen3/bli_kernel_defs_zen3.h            |   52 +
 docs/ConfigurationHowTo.md                    |   24 +-
 frame/1/bli_l1v_tapi.c                        |  132 +-
 frame/1/other/packv/bli_packv_unb_var1.c      |    2 +-
 frame/1/other/unpackv/bli_unpackv_unb_var1.c  |    2 +-
 frame/1d/bli_l1d_tapi.c                       |  171 +-
 frame/1f/bli_l1f_tapi.c                       |  114 +-
 frame/1m/bli_l1m_ft_ker.h                     |   17 +-
 frame/1m/bli_l1m_ker.h                        |   54 +-
 frame/1m/bli_l1m_ker_prot.h                   |   19 +-
 frame/1m/bli_l1m_unb_var1.c                   |    8 +-
 frame/1m/{packm => other}/bli_packm_cxk.c     |   35 +-
 frame/1m/{packm => other}/bli_packm_cxk.h     |    0
 frame/1m/{packm => other}/bli_packm_cxk_1er.c |    7 +-
 frame/1m/{packm => other}/bli_packm_cxk_1er.h |    0
 .../bli_packm_struc_cxk_1er.c                 |    0
 .../bli_packm_struc_cxk_1er.h                 |    0
 frame/1m/{unpackm => other}/bli_unpackm_cxk.c |    8 +-
 frame/1m/{unpackm => other}/bli_unpackm_cxk.h |    1 +
 frame/1m/packm/bli_packm.h                    |    4 -
 frame/1m/packm/bli_packm_blk_var1.c           |    8 +-
 frame/1m/packm/bli_packm_struc_cxk.c          |  599 ++---
 frame/1m/unpackm/bli_unpackm.h                |    2 -
 frame/1m/unpackm/bli_unpackm_blk_var1.c       |   51 +-
 frame/2/gemv/bli_gemv_unb_var1.c              |    2 +-
 frame/2/gemv/bli_gemv_unb_var2.c              |    2 +-
 frame/2/gemv/bli_gemv_unf_var1.c              |    2 +-
 frame/2/gemv/bli_gemv_unf_var2.c              |    2 +-
 frame/2/ger/bli_ger_unb_var1.c                |    2 +-
 frame/2/ger/bli_ger_unb_var2.c                |    2 +-
 frame/2/hemv/bli_hemv_unb_var1.c              |    4 +-
 frame/2/hemv/bli_hemv_unb_var2.c              |    2 +-
 frame/2/hemv/bli_hemv_unb_var3.c              |    4 +-
 frame/2/hemv/bli_hemv_unb_var4.c              |    2 +-
 frame/2/hemv/bli_hemv_unf_var1.c              |    2 +-
 frame/2/hemv/bli_hemv_unf_var1a.c             |    2 +-
 frame/2/hemv/bli_hemv_unf_var3.c              |    2 +-
 frame/2/hemv/bli_hemv_unf_var3a.c             |    2 +-
 frame/2/her/bli_her_unb_var1.c                |    2 +-
 frame/2/her/bli_her_unb_var2.c                |    2 +-
 frame/2/her2/bli_her2_unb_var1.c              |    2 +-
 frame/2/her2/bli_her2_unb_var2.c              |    2 +-
 frame/2/her2/bli_her2_unb_var3.c              |    2 +-
 frame/2/her2/bli_her2_unb_var4.c              |    2 +-
 frame/2/her2/bli_her2_unf_var1.c              |    2 +-
 frame/2/her2/bli_her2_unf_var4.c              |    2 +-
 frame/2/trmv/bli_trmv_unb_var1.c              |    2 +-
 frame/2/trmv/bli_trmv_unb_var2.c              |    2 +-
 frame/2/trmv/bli_trmv_unf_var1.c              |    2 +-
 frame/2/trmv/bli_trmv_unf_var2.c              |    2 +-
 frame/2/trsv/bli_trsv_unb_var1.c              |    2 +-
 frame/2/trsv/bli_trsv_unb_var2.c              |    2 +-
 frame/2/trsv/bli_trsv_unf_var1.c              |    2 +-
 frame/2/trsv/bli_trsv_unf_var2.c              |    2 +-
 frame/3/bli_l3_schema.c                       |    2 +-
 frame/3/bli_l3_sup.c                          |    2 +-
 frame/3/bli_l3_sup_int.c                      |    4 +-
 frame/3/bli_l3_sup_packm_var.c                |   11 +-
 frame/3/bli_l3_sup_vars.h                     |    2 +-
 frame/3/gemm/bli_gemm_front.c                 |    2 +-
 frame/3/gemm/bli_gemm_ker_var2.c              |    2 +-
 frame/3/gemm/bli_gemm_md.c                    |   22 +-
 frame/3/gemm/bli_gemm_md_c2r_ref.c            |    4 +-
 frame/3/gemm/other/bli_gemm_ker_var2.c        |    2 +-
 frame/3/gemm/other/bli_gemm_ker_var2rr.c      |    2 +-
 frame/3/gemm/other/bli_gemm_ker_var2sl.c      |    2 +-
 frame/3/gemmt/bli_gemmt_front.c               |    2 +-
 frame/3/gemmt/bli_gemmt_l_ker_var2.c          |    2 +-
 frame/3/gemmt/bli_gemmt_u_ker_var2.c          |    2 +-
 frame/3/gemmt/other/bli_gemmt_l_ker_var2.c    |    2 +-
 frame/3/gemmt/other/bli_gemmt_u_ker_var2.c    |    2 +-
 frame/3/hemm/bli_hemm_front.c                 |    2 +-
 frame/3/symm/bli_symm_front.c                 |    2 +-
 frame/3/trmm/bli_trmm_front.c                 |    2 +-
 frame/3/trmm/other/bli_trmm_ll_ker_var2.c     |    2 +-
 frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c   |    4 +-
 frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c   |    2 +-
 frame/3/trmm/other/bli_trmm_lu_ker_var2.c     |    2 +-
 frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c   |    2 +-
 frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c   |    2 +-
 frame/3/trmm/other/bli_trmm_rl_ker_var2.c     |    2 +-
 frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c   |    2 +-
 frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c   |    2 +-
 frame/3/trmm/other/bli_trmm_ru_ker_var2.c     |    2 +-
 frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c   |    2 +-
 frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c   |    2 +-
 frame/3/trmm3/bli_trmm3_front.c               |    2 +-
 frame/3/trsm/bli_trsm_ll_ker_var2.c           |    2 +-
 frame/3/trsm/bli_trsm_lu_ker_var2.c           |    2 +-
 frame/3/trsm/bli_trsm_rl_ker_var2.c           |    2 +-
 frame/3/trsm/bli_trsm_ru_ker_var2.c           |    2 +-
 frame/3/trsm/other/bli_trsm_ll_ker_var2.c     |    2 +-
 frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c   |    2 +-
 frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c   |    2 +-
 frame/3/trsm/other/bli_trsm_lu_ker_var2.c     |    2 +-
 frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c   |    2 +-
 frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c   |    2 +-
 frame/3/trsm/other/bli_trsm_rl_ker_var2.c     |    2 +-
 frame/3/trsm/other/bli_trsm_ru_ker_var2.c     |    2 +-
 frame/base/bli_cntx.c                         | 1586 ++----------
 frame/base/bli_cntx.h                         |  558 +----
 frame/base/bli_gks.c                          |   42 +-
 frame/base/bli_gks.h                          |    8 +-
 frame/include/bli_gentfunc_macro_defs.h       |    7 +
 frame/include/bli_kernel_macro_defs.h         |  104 +
 frame/include/bli_misc_macro_defs.h           |    6 +
 frame/include/bli_param_macro_defs.h          |   26 +-
 frame/include/bli_scalar_macro_defs.h         |    7 +-
 frame/include/bli_type_defs.h                 |  227 +-
 frame/include/level0/bli_set0s_edge.h         |   79 +
 kernels/penryn/1/bli_axpyv_penryn_int.c       |    2 +-
 kernels/penryn/1/bli_dotv_penryn_int.c        |    2 +-
 kernels/penryn/1f/bli_axpy2v_penryn_int.c     |    2 +-
 kernels/penryn/1f/bli_axpyf_penryn_int.c      |    2 +-
 kernels/penryn/1f/bli_dotaxpyv_penryn_int.c   |    2 +-
 kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c  |    4 +-
 kernels/penryn/1f/bli_dotxf_penryn_int.c      |    4 +-
 kernels/zen/1/bli_scalv_zen_int.c             |    4 +-
 kernels/zen/1/bli_scalv_zen_int10.c           |   14 +-
 kernels/zen/1f/bli_axpyf_zen_int_4.c          |    4 +-
 kernels/zen/1f/bli_axpyf_zen_int_5.c          |   23 +-
 kernels/zen/1f/bli_axpyf_zen_int_8.c          |    4 +-
 kernels/zen/1f/bli_dotxf_zen_int_8.c          |   12 +-
 ref_kernels/1/bli_axpbyv_ref.c                |   14 +-
 ref_kernels/1/bli_axpyv_ref.c                 |    4 +-
 ref_kernels/1/bli_scal2v_ref.c                |    4 +-
 ref_kernels/1/bli_scalv_ref.c                 |    2 +-
 ref_kernels/1/bli_xpbyv_ref.c                 |    4 +-
 ref_kernels/1f/bli_axpy2v_ref.c               |    2 +-
 ref_kernels/1f/bli_axpyf_ref.c                |    2 +-
 ref_kernels/1f/bli_dotaxpyv_ref.c             |    4 +-
 ref_kernels/1f/bli_dotxaxpyf_ref.c            |    4 +-
 ref_kernels/1f/bli_dotxf_ref.c                |    2 +-
 ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c  |    4 +-
 ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c   |  336 +++
 ref_kernels/1m/bli_packm_cxc_diag_ref.c       |  173 ++
 ref_kernels/1m/bli_packm_cxk_1er_ref.c        | 2195 +----------------
 ref_kernels/1m/bli_packm_cxk_bb_ref.c         |  656 -----
 ref_kernels/1m/bli_packm_cxk_ref.c            | 1679 +------------
 ref_kernels/1m/bli_unpackm_cxk_ref.c          |  814 +-----
 ref_kernels/3/bb/bli_gemmbb_ref.c             |  141 --
 ref_kernels/3/bb/bli_gemmtrsmbb_ref.c         |  140 --
 ref_kernels/3/bb/bli_trsmbb_ref.c             |  214 --
 ref_kernels/3/bli_gemm_ref.c                  |  256 +-
 ref_kernels/3/bli_gemmtrsm_ref.c              |   49 +-
 ref_kernels/3/bli_trsm_ref.c                  |   43 +-
 ref_kernels/bli_cntx_ref.c                    |  433 ++--
 ref_kernels/ind/bli_gemm1m_ref.c              |    4 +-
 ref_kernels/ind/bli_gemmtrsm1m_ref.c          |   56 +-
 ref_kernels/ind/bli_trsm1m_ref.c              |  116 +-
 sandbox/gemmlike/attic/bls_gemm_bp_var2.c     |    8 +-
 sandbox/gemmlike/bls_gemm.c                   |    2 +-
 sandbox/gemmlike/bls_gemm_bp_var1.c           |    2 +-
 sandbox/gemmlike/bls_packm_cxk.c              |    7 +-
 testsuite/src/test_trsm_ukr.c                 |    4 +-
 214 files changed, 5180 insertions(+), 10257 deletions(-)
 create mode 100644 config/a64fx/bli_kernel_defs_a64fx.h
 create mode 100644 config/armsve/bli_kernel_defs_armsve.h
 create mode 100644 config/bgq/bli_kernel_defs_bgq.h
 create mode 100644 config/bulldozer/bli_kernel_defs_bulldozer.h
 create mode 100644 config/cortexa15/bli_kernel_defs_cortexa15.h
 create mode 100644 config/cortexa53/bli_kernel_defs_cortexa53.h
 create mode 100644 config/cortexa57/bli_kernel_defs_cortexa57.h
 create mode 100644 config/cortexa9/bli_kernel_defs_cortexa9.h
 create mode 100644 config/excavator/bli_kernel_defs_excavator.h
 create mode 100644 config/firestorm/bli_kernel_defs_firestorm.h
 create mode 100644 config/generic/bli_kernel_defs_generic.h
 create mode 100644 config/haswell/bli_kernel_defs_haswell.h
 create mode 100644 config/knc/bli_kernel_defs_knc.h
 create mode 100644 config/knl/bli_kernel_defs_knl.h
 create mode 100644 config/penryn/bli_kernel_defs_penryn.h
 create mode 100644 config/piledriver/bli_kernel_defs_piledriver.h
 create mode 100644 config/power10/bli_kernel_defs_power10.h
 create mode 100644 config/power7/bli_kernel_defs_power7.h
 create mode 100644 config/power9/bli_kernel_defs_power9.h
 create mode 100644 config/sandybridge/bli_kernel_defs_sandybridge.h
 create mode 100644 config/skx/bli_kernel_defs_skx.h
 create mode 100644 config/steamroller/bli_kernel_defs_steamroller.h
 create mode 100644 config/template/bli_kernel_defs_template.h
 create mode 100644 config/thunderx2/bli_kernel_defs_thunderx2.h
 create mode 100644 config/zen/bli_kernel_defs_zen.h
 create mode 100644 config/zen2/bli_kernel_defs_zen2.h
 create mode 100644 config/zen3/bli_kernel_defs_zen3.h
 rename frame/1m/{packm => other}/bli_packm_cxk.c (84%)
 rename frame/1m/{packm => other}/bli_packm_cxk.h (100%)
 rename frame/1m/{packm => other}/bli_packm_cxk_1er.c (94%)
 rename frame/1m/{packm => other}/bli_packm_cxk_1er.h (100%)
 rename frame/1m/{packm => other}/bli_packm_struc_cxk_1er.c (100%)
 rename frame/1m/{packm => other}/bli_packm_struc_cxk_1er.h (100%)
 rename frame/1m/{unpackm => other}/bli_unpackm_cxk.c (92%)
 rename frame/1m/{unpackm => other}/bli_unpackm_cxk.h (98%)
 create mode 100644 frame/include/level0/bli_set0s_edge.h
 create mode 100644 ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
 create mode 100644 ref_kernels/1m/bli_packm_cxc_diag_ref.c
 delete mode 100644 ref_kernels/1m/bli_packm_cxk_bb_ref.c
 delete mode 100644 ref_kernels/3/bb/bli_gemmbb_ref.c
 delete mode 100644 ref_kernels/3/bb/bli_gemmtrsmbb_ref.c
 delete mode 100644 ref_kernels/3/bb/bli_trsmbb_ref.c

diff --git a/addon/gemmd/attic/bao_gemmd_bp_var2.c b/addon/gemmd/attic/bao_gemmd_bp_var2.c
index a0040fec0..9139e89b1 100644
--- a/addon/gemmd/attic/bao_gemmd_bp_var2.c
+++ b/addon/gemmd/attic/bao_gemmd_bp_var2.c
@@ -164,7 +164,7 @@ void PASTECH2(bao_,ch,varname) \
 	   function pointer type. */ \
 	/*
 	PASTECH(ch,gemm_ukr_ft) \
-               gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+               gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	*/ \
 \
 	/* Temporary C buffer for edge cases. Note that the strides of this
@@ -175,7 +175,7 @@ void PASTECH2(bao_,ch,varname) \
 	ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                / sizeof( ctype ) ] \
 	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	const inc_t rs_ct   = ( col_pref ? 1 : NR ); \
 	const inc_t cs_ct   = ( col_pref ? MR : 1 ); \
 	*/ \
@@ -536,7 +536,7 @@ void PASTECH2(bao_,ch,varname) \
 	/* Query the context for the microkernel address and cast it to its
 	   function pointer type. */ \
 	PASTECH(ch,gemm_ukr_ft) \
-               gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+               gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
 \
 	/* Temporary C buffer for edge cases. Note that the strides of this
 	   temporary buffer are set so that they match the storage of the
@@ -545,7 +545,7 @@ void PASTECH2(bao_,ch,varname) \
 	ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                / sizeof( ctype ) ] \
 	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	const inc_t rs_ct   = ( col_pref ? 1 : NR ); \
 	const inc_t cs_ct   = ( col_pref ? MR : 1 ); \
 \
diff --git a/addon/gemmd/bao_gemmd.c b/addon/gemmd/bao_gemmd.c
index fadc52691..01185a9d7 100644
--- a/addon/gemmd/bao_gemmd.c
+++ b/addon/gemmd/bao_gemmd.c
@@ -137,7 +137,7 @@ void bao_gemmd_ex
 	// contiguous columns, or if C is stored by columns and the micro-kernel
 	// prefers contiguous rows, transpose the entire operation to allow the
 	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
+	if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
 	{
 		bli_obj_swap( &a_local, &b_local );
 
diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/gemmd/bao_gemmd_bp_var1.c
index 09e4df09e..689471367 100644
--- a/addon/gemmd/bao_gemmd_bp_var1.c
+++ b/addon/gemmd/bao_gemmd_bp_var1.c
@@ -163,7 +163,7 @@ void PASTECH2(bao_,ch,varname) \
 	/* Query the context for the microkernel address and cast it to its
 	   function pointer type. */ \
 	PASTECH(ch,gemm_ukr_ft) \
-               gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+               gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
 \
 	/* Compute partitioning step values for each matrix of each loop. */ \
 	const inc_t jcstep_c = cs_c; \
diff --git a/addon/gemmd/bao_packm_cxk.c b/addon/gemmd/bao_packm_cxk.c
index 645f09d79..8680c5332 100644
--- a/addon/gemmd/bao_packm_cxk.c
+++ b/addon/gemmd/bao_packm_cxk.c
@@ -55,15 +55,15 @@ void PASTECH2(bao_,ch,opname) \
 	/* Note that we use panel_dim_max, not panel_dim, to query the packm
 	   kernel function pointer. This means that we always use the same
 	   kernel, even for edge cases. */ \
-	num_t     dt     = PASTEMAC(ch,type); \
-	l1mkr_t   ker_id = panel_dim_max; \
+	num_t dt     = PASTEMAC(ch,type); \
+	ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \
 \
 	PASTECH2(ch,opname,_ker_ft) f; \
 \
 	/* Query the context for the packm kernel corresponding to the current
 	   panel dimension, or kernel id. If the id is invalid, the function will
 	   return NULL. */ \
-	f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
+	f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
 \
 	/* If there exists a kernel implementation for the micro-panel dimension
 	   provided, we invoke the implementation. Otherwise, we use scal2m. */ \
diff --git a/common.mk b/common.mk
index 5f2d30c9b..a93f8ab24 100644
--- a/common.mk
+++ b/common.mk
@@ -120,6 +120,8 @@ get-refinit-cflags-for   = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
                                    -DBLIS_CNAME=$(1) \
                                    $(BUILD_CPPFLAGS) \
                                    $(BUILD_SYMFLAGS) \
+                                   -DBLIS_IN_REF_KERNEL=1 \
+                                   -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \
                             )
 
 get-refkern-cflags-for   = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
@@ -129,6 +131,8 @@ get-refkern-cflags-for   = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
                                    -DBLIS_CNAME=$(1) \
                                    $(BUILD_CPPFLAGS) \
                                    $(BUILD_SYMFLAGS) \
+                                   -DBLIS_IN_REF_KERNEL=1 \
+                                   -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \
                             )
 
 get-config-cflags-for    = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
diff --git a/config/a64fx/bli_cntx_init_a64fx.c b/config/a64fx/bli_cntx_init_a64fx.c
index 5132b2824..dd920bcec 100644
--- a/config/a64fx/bli_cntx_init_a64fx.c
+++ b/config/a64fx/bli_cntx_init_a64fx.c
@@ -38,34 +38,42 @@
 void bli_cntx_init_a64fx( cntx_t* cntx )
 {
 	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
-	blksz_t thresh[ BLIS_NUM_THRESH ];
 
 	// Set default kernel blocksizes and functions.
 	bli_cntx_init_a64fx_ref( cntx );
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  4,
-	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
-	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_armsve_asm_2vx10_unindexed,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_armsve_asm_2vx10_unindexed,
+	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed,
+	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed,
+
+	  // packm
+	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
+
+	  BLIS_VA_END
 	);
 
-	// Set SVE-512 packing routine.
-	bli_cntx_set_packm_kers
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
 	(
-	  2,
-	  BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
-	  // 12xk is not used and disabled for GCC 8-9 compatibility.
-	  // BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_int_12xk,
-	  BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,    FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE,   FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -80,66 +88,18 @@ void bli_cntx_init_a64fx( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
-	);
-
-#if 0
-	// Initialize sup thresholds with architecture-appropriate values.
-	//                                          s     d     c     z
-	bli_blksz_init_easy( &thresh[ BLIS_MT ],   -1,   65,   -1,   -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_NT ],   -1,   65,   -1,   -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_KT ],   -1,   65,   -1,   -1 );
 
-	// Initialize the context with the sup thresholds.
-	bli_cntx_set_l3_sup_thresh
-	(
-	  3,
-	  BLIS_MT, &thresh[ BLIS_MT ],
-	  BLIS_NT, &thresh[ BLIS_NT ],
-	  BLIS_KT, &thresh[ BLIS_KT ],
-	  cntx
+	  BLIS_VA_END
 	);
 
-	// Update the context with optimized small/unpacked gemm kernels.
-	bli_cntx_set_l3_sup_kers
-	(
-	  4,
-	  BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
-	  BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
-	  BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
-	  BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
-	  cntx
-	);
-
-	// Initialize level-3 sup blocksize objects with architecture-specific
-	// values.
-	//                                           s      d      c      z
-	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    -1,    10,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    -1,    16,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],    -1,   120,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],    -1,   256,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1,  4080,    -1,    -1 );
-
-	// Update the context with the current architecture's register and cache
-	// blocksizes for small/unpacked level-3 problems.
-	bli_cntx_set_l3_sup_blkszs
-	(
-	  5,
-	  BLIS_NC, &blkszs[ BLIS_NC ],
-	  BLIS_KC, &blkszs[ BLIS_KC ],
-	  BLIS_MC, &blkszs[ BLIS_MC ],
-	  BLIS_NR, &blkszs[ BLIS_NR ],
-	  BLIS_MR, &blkszs[ BLIS_MR ],
-	  cntx
-	);
-#endif
-
 	// Set A64FX cache sector sizes for each PE/CMG
 	// SC Fugaku might disable users' setting cache sizes.
 #if !defined(CACHE_SECTOR_SIZE_READONLY)
diff --git a/config/a64fx/bli_kernel_defs_a64fx.h b/config/a64fx/bli_kernel_defs_a64fx.h
new file mode 100644
index 000000000..2c5c97204
--- /dev/null
+++ b/config/a64fx/bli_kernel_defs_a64fx.h
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   32
+#define BLIS_MR_d   16
+#define BLIS_MR_c   16
+#define BLIS_MR_z   8
+
+#define BLIS_NR_s   10
+#define BLIS_NR_d   10
+#define BLIS_NR_c   10
+#define BLIS_NR_z   10
+
+//#endif
+
diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c
index ad0e68219..6339ba381 100644
--- a/config/armsve/bli_cntx_init_armsve.c
+++ b/config/armsve/bli_cntx_init_armsve.c
@@ -45,9 +45,6 @@ void bli_cntx_init_armsve( cntx_t* cntx )
 		return;
 
 	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
-#if 0
-	blksz_t thresh[ BLIS_NUM_THRESH ];
-#endif
 
 	// Set default kernel blocksizes and functions.
 	bli_cntx_init_armsve_ref( cntx );
@@ -64,35 +61,55 @@ void bli_cntx_init_armsve( cntx_t* cntx )
 	bli_c_blksz_armsve(&m_r_c, &n_r_c, &k_c_c, &m_c_c, &n_c_c);
 	bli_z_blksz_armsve(&m_r_z, &n_r_z, &k_c_z, &m_c_z, &n_c_z);
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  4,
+	  cntx,
+
+	  // level-3
 	  // These are vector-length agnostic kernels. Yet knowing mr is required at runtime.
-	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
-	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE,
-	  cntx
+	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_armsve_asm_2vx10_unindexed,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_armsve_asm_2vx10_unindexed,
+	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed,
+	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,    FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE,   FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Set VL-specific packing routines if applicable.
-	if (m_r_d==16)
-	  bli_cntx_set_packm_kers
+	if ( m_r_d == 16 )
+	{
+	  bli_cntx_set_ukrs
 	  (
-		2,
-		BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
-		BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
-		cntx
+		cntx,
+		BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
+		BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
+		BLIS_VA_END
 	  );
-	else if (m_r_d==8)
-	  bli_cntx_set_packm_kers
+	}
+	else if ( m_r_d == 8 )
+	{
+	  bli_cntx_set_ukrs
 	  (
-		1,
-		BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk,
-		cntx
+		cntx,
+		BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk,
+		BLIS_VA_END
 	  );
+	}
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
 	//                                           s      d      c      z
@@ -106,64 +123,16 @@ void bli_cntx_init_armsve( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
-	);
-
-#if 0
-	// Initialize sup thresholds with architecture-appropriate values.
-	//                                          s     d     c     z
-	bli_blksz_init_easy( &thresh[ BLIS_MT ],   -1,  101,   -1,   -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_NT ],   -1,  101,   -1,   -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_KT ],   -1,  101,   -1,   -1 );
 
-	// Initialize the context with the sup thresholds.
-	bli_cntx_set_l3_sup_thresh
-	(
-	  3,
-	  BLIS_MT, &thresh[ BLIS_MT ],
-	  BLIS_NT, &thresh[ BLIS_NT ],
-	  BLIS_KT, &thresh[ BLIS_KT ],
-	  cntx
-	);
-
-	// Update the context with optimized small/unpacked gemm kernels.
-	bli_cntx_set_l3_sup_kers
-	(
-	  4,
-	  BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
-	  BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
-	  BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
-	  BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
-	  cntx
+	  BLIS_VA_END
 	);
-
-	// Initialize level-3 sup blocksize objects with architecture-specific
-	// values.
-	//                                           s      d      c      z
-	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    -1, n_r_d,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    -1, m_r_d,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],    -1,   120,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],    -1,   256,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1,  2048,    -1,    -1 );
-
-	// Update the context with the current architecture's register and cache
-	// blocksizes for small/unpacked level-3 problems.
-	bli_cntx_set_l3_sup_blkszs
-	(
-	  5,
-	  BLIS_NC, &blkszs[ BLIS_NC ],
-	  BLIS_KC, &blkszs[ BLIS_KC ],
-	  BLIS_MC, &blkszs[ BLIS_MC ],
-	  BLIS_NR, &blkszs[ BLIS_NR ],
-	  BLIS_MR, &blkszs[ BLIS_MR ],
-	  cntx
-	);
-#endif
 }
 
diff --git a/config/armsve/bli_kernel_defs_armsve.h b/config/armsve/bli_kernel_defs_armsve.h
new file mode 100644
index 000000000..8c9c0b0dd
--- /dev/null
+++ b/config/armsve/bli_kernel_defs_armsve.h
@@ -0,0 +1,58 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+//
+// The armsve configuration handles both 256-bit and 512-bit SVE vectors,
+// so it is not possible to define specific register block sizes. Thus,
+// armsve can't use reference kernels!
+//
+
+#define BLIS_MR_s   -1
+#define BLIS_MR_d   -1
+#define BLIS_MR_c   -1
+#define BLIS_MR_z   -1
+
+#define BLIS_NR_s   10
+#define BLIS_NR_d   10
+#define BLIS_NR_c   10
+#define BLIS_NR_z   10
+
+//#endif
+
diff --git a/config/bgq/bli_cntx_init_bgq.c b/config/bgq/bli_cntx_init_bgq.c
index 782c441b9..d3871d8f7 100644
--- a/config/bgq/bli_cntx_init_bgq.c
+++ b/config/bgq/bli_cntx_init_bgq.c
@@ -43,14 +43,28 @@ void bli_cntx_init_bgq( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  2,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_bgq_int_8x8, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4, FALSE,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_bgq_int_8x8,
+	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE,   FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -65,13 +79,16 @@ void bli_cntx_init_bgq( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/bgq/bli_kernel_defs_bgq.h b/config/bgq/bli_kernel_defs_bgq.h
new file mode 100644
index 000000000..bd3962e45
--- /dev/null
+++ b/config/bgq/bli_kernel_defs_bgq.h
@@ -0,0 +1,48 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_d   8
+#define BLIS_MR_z   4
+
+#define BLIS_NR_d   8
+#define BLIS_NR_z   4
+
+//#endif
+
diff --git a/config/bulldozer/bli_cntx_init_bulldozer.c b/config/bulldozer/bli_cntx_init_bulldozer.c
index 9f6e83d6b..5b056f591 100644
--- a/config/bulldozer/bli_cntx_init_bulldozer.c
+++ b/config/bulldozer/bli_cntx_init_bulldozer.c
@@ -43,16 +43,32 @@ void bli_cntx_init_bulldozer( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  4,
-	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_bulldozer_asm_8x8_fma4, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_bulldozer_asm_4x6_fma4, FALSE,
-	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4, FALSE,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_bulldozer_asm_8x8_fma4,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_bulldozer_asm_4x6_fma4,
+	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4,
+	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,    FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE,   FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -67,13 +83,16 @@ void bli_cntx_init_bulldozer( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/bulldozer/bli_kernel_defs_bulldozer.h b/config/bulldozer/bli_kernel_defs_bulldozer.h
new file mode 100644
index 000000000..ea1e58e66
--- /dev/null
+++ b/config/bulldozer/bli_kernel_defs_bulldozer.h
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   8
+#define BLIS_MR_d   4
+#define BLIS_MR_c   8
+#define BLIS_MR_z   4
+
+#define BLIS_NR_s   8
+#define BLIS_NR_d   6
+#define BLIS_NR_c   4
+#define BLIS_NR_z   4
+
+//#endif
+
diff --git a/config/cortexa15/bli_cntx_init_cortexa15.c b/config/cortexa15/bli_cntx_init_cortexa15.c
index 7c6134ff0..28ebdef71 100644
--- a/config/cortexa15/bli_cntx_init_cortexa15.c
+++ b/config/cortexa15/bli_cntx_init_cortexa15.c
@@ -43,14 +43,28 @@ void bli_cntx_init_cortexa15( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  2,
-	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_armv7a_int_4x4, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_armv7a_int_4x4, FALSE,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armv7a_int_4x4,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,  FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -73,13 +87,16 @@ void bli_cntx_init_cortexa15( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/cortexa15/bli_kernel_defs_cortexa15.h b/config/cortexa15/bli_kernel_defs_cortexa15.h
new file mode 100644
index 000000000..9c413f7f8
--- /dev/null
+++ b/config/cortexa15/bli_kernel_defs_cortexa15.h
@@ -0,0 +1,48 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   4
+#define BLIS_MR_d   4
+
+#define BLIS_NR_s   4
+#define BLIS_NR_d   4
+
+//#endif
+
diff --git a/config/cortexa53/bli_cntx_init_cortexa53.c b/config/cortexa53/bli_cntx_init_cortexa53.c
index d7d786f8c..4957de04e 100644
--- a/config/cortexa53/bli_cntx_init_cortexa53.c
+++ b/config/cortexa53/bli_cntx_init_cortexa53.c
@@ -43,14 +43,28 @@ void bli_cntx_init_cortexa53( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  2,
-	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_armv8a_asm_8x12, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_armv8a_asm_6x8,  FALSE,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armv8a_asm_8x12,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,  FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -65,13 +79,16 @@ void bli_cntx_init_cortexa53( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/cortexa53/bli_kernel_defs_cortexa53.h b/config/cortexa53/bli_kernel_defs_cortexa53.h
new file mode 100644
index 000000000..60292099c
--- /dev/null
+++ b/config/cortexa53/bli_kernel_defs_cortexa53.h
@@ -0,0 +1,48 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   8
+#define BLIS_MR_d   6
+
+#define BLIS_NR_s   12
+#define BLIS_NR_d   8
+
+//#endif
+
diff --git a/config/cortexa57/bli_cntx_init_cortexa57.c b/config/cortexa57/bli_cntx_init_cortexa57.c
index 57d18792d..28558bc52 100644
--- a/config/cortexa57/bli_cntx_init_cortexa57.c
+++ b/config/cortexa57/bli_cntx_init_cortexa57.c
@@ -43,14 +43,28 @@ void bli_cntx_init_cortexa57( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  2,
-	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_armv8a_asm_8x12, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_armv8a_asm_6x8,  FALSE,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armv8a_asm_8x12,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,  FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -65,13 +79,16 @@ void bli_cntx_init_cortexa57( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/cortexa57/bli_kernel_defs_cortexa57.h b/config/cortexa57/bli_kernel_defs_cortexa57.h
new file mode 100644
index 000000000..60292099c
--- /dev/null
+++ b/config/cortexa57/bli_kernel_defs_cortexa57.h
@@ -0,0 +1,48 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   8
+#define BLIS_MR_d   6
+
+#define BLIS_NR_s   12
+#define BLIS_NR_d   8
+
+//#endif
+
diff --git a/config/cortexa9/bli_cntx_init_cortexa9.c b/config/cortexa9/bli_cntx_init_cortexa9.c
index d38e12ebb..6af3ff91c 100644
--- a/config/cortexa9/bli_cntx_init_cortexa9.c
+++ b/config/cortexa9/bli_cntx_init_cortexa9.c
@@ -43,14 +43,28 @@ void bli_cntx_init_cortexa9( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  2,
-	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_armv7a_int_4x4, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_armv7a_int_4x4, FALSE,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armv7a_int_4x4,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,  FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -65,13 +79,16 @@ void bli_cntx_init_cortexa9( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/cortexa9/bli_kernel_defs_cortexa9.h b/config/cortexa9/bli_kernel_defs_cortexa9.h
new file mode 100644
index 000000000..9c413f7f8
--- /dev/null
+++ b/config/cortexa9/bli_kernel_defs_cortexa9.h
@@ -0,0 +1,48 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   4
+#define BLIS_MR_d   4
+
+#define BLIS_NR_s   4
+#define BLIS_NR_d   4
+
+//#endif
+
diff --git a/config/excavator/bli_cntx_init_excavator.c b/config/excavator/bli_cntx_init_excavator.c
index adae152d5..d36865b21 100644
--- a/config/excavator/bli_cntx_init_excavator.c
+++ b/config/excavator/bli_cntx_init_excavator.c
@@ -43,16 +43,32 @@ void bli_cntx_init_excavator( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  4,
-	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_piledriver_asm_16x3, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_piledriver_asm_8x3,  FALSE,
-	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2,  FALSE,
-	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2,  FALSE,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_piledriver_asm_16x3,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_piledriver_asm_8x3,
+	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2,
+	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,    FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE,   FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -67,13 +83,16 @@ void bli_cntx_init_excavator( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/excavator/bli_kernel_defs_excavator.h b/config/excavator/bli_kernel_defs_excavator.h
new file mode 100644
index 000000000..df4a8c411
--- /dev/null
+++ b/config/excavator/bli_kernel_defs_excavator.h
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   16
+#define BLIS_MR_d   8
+#define BLIS_MR_c   4
+#define BLIS_MR_z   2
+
+#define BLIS_NR_s   3
+#define BLIS_NR_d   3
+#define BLIS_NR_c   2
+#define BLIS_NR_z   2
+
+//#endif
+
diff --git a/config/firestorm/bli_cntx_init_firestorm.c b/config/firestorm/bli_cntx_init_firestorm.c
index a15ce0344..8e4d0088d 100644
--- a/config/firestorm/bli_cntx_init_firestorm.c
+++ b/config/firestorm/bli_cntx_init_firestorm.c
@@ -37,32 +37,60 @@
 void bli_cntx_init_firestorm( cntx_t* cntx )
 {
 	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
-	blksz_t thresh[ BLIS_NUM_THRESH ];
 
 	// Set default kernel blocksizes and functions.
 	bli_cntx_init_firestorm_ref( cntx );
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  2,
-	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_armv8a_asm_8x12, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_armv8a_asm_6x8,  FALSE,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armv8a_asm_8x12,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,
+
+	  // packm
+	  BLIS_PACKM_MRXK_KER, BLIS_FLOAT,  bli_spackm_armv8a_int_8xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_FLOAT,  bli_spackm_armv8a_int_12xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk,
+
+	  // gemmsup
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m,
+	  BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m,
+	  BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n,
+
+	  BLIS_VA_END
 	);
 
-	// Update the context with optimized packm kernels.
-	bli_cntx_set_packm_kers
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
 	(
-	  4,
-	  BLIS_PACKM_8XK_KER,  BLIS_FLOAT,    bli_spackm_armv8a_int_8xk,
-	  BLIS_PACKM_12XK_KER, BLIS_FLOAT,    bli_spackm_armv8a_int_12xk,
-	  BLIS_PACKM_6XK_KER,  BLIS_DOUBLE,   bli_dpackm_armv8a_int_6xk,
-	  BLIS_PACKM_8XK_KER,  BLIS_DOUBLE,   bli_dpackm_armv8a_int_8xk,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,  FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
+
+	  // gemmsup
+	  BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -73,72 +101,47 @@ void bli_cntx_init_firestorm( cntx_t* cntx )
 	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   640,  3072,    -1,    -1 );
 	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  3072,  8192,    -1,    -1 );
 
+	// Initialize sup thresholds with architecture-appropriate values.
+	//                                          s     d     c     z
+	bli_blksz_init_easy( &blkszs[ BLIS_MT ],   -1,   99,   -1,   -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NT ],   -1,   99,   -1,   -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KT ],   -1,   99,   -1,   -1 );
+
+	// Initialize level-3 sup blocksize objects with architecture-specific
+	// values.
+	//                                               s      d      c      z
+	bli_blksz_init_easy( &blkszs[ BLIS_MR_SUP ],    -1,     6,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ],    -1,     8,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ],    -1,   240,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ],    -1,  1024,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ],    -1,  3072,    -1,    -1 );
+
 	// Update the context with the current architecture's register and cache
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
-	);
 
-	// -------------------------------------------------------------------------
+	  // sup thresholds
+	  BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT,
+	  BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT,
+	  BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT,
 
-	// Initialize sup thresholds with architecture-appropriate values.
-	//                                          s     d     c     z
-	bli_blksz_init_easy( &thresh[ BLIS_MT ],   -1,   99,   -1,   -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_NT ],   -1,   99,   -1,   -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_KT ],   -1,   99,   -1,   -1 );
+	  // level-3 sup
+	  BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP,
+	  BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP,
+	  BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP,
+	  BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP,
+	  BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP,
 
-	// Initialize the context with the sup thresholds.
-	bli_cntx_set_l3_sup_thresh
-	(
-	  3,
-	  BLIS_MT, &thresh[ BLIS_MT ],
-	  BLIS_NT, &thresh[ BLIS_NT ],
-	  BLIS_KT, &thresh[ BLIS_KT ],
-	  cntx
-	);
-
-	// Update the context with optimized small/unpacked gemm kernels.
-	bli_cntx_set_l3_sup_kers
-	(
-	  8,
-	  BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
-	  BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m, TRUE,
-	  BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
-	  BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
-	  BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
-	  BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n, TRUE,
-	  BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
-	  BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
-	  cntx
-	);
-
-	// Initialize level-3 sup blocksize objects with architecture-specific
-	// values.
-	//                                           s      d      c      z
-	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    -1,     6,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    -1,     8,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],    -1,   240,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],    -1,  1024,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1,  3072,    -1,    -1 );
-
-	// Update the context with the current architecture's register and cache
-	// blocksizes for small/unpacked level-3 problems.
-	bli_cntx_set_l3_sup_blkszs
-	(
-	  5,
-	  BLIS_NC, &blkszs[ BLIS_NC ],
-	  BLIS_KC, &blkszs[ BLIS_KC ],
-	  BLIS_MC, &blkszs[ BLIS_MC ],
-	  BLIS_NR, &blkszs[ BLIS_NR ],
-	  BLIS_MR, &blkszs[ BLIS_MR ],
-	  cntx
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/firestorm/bli_kernel_defs_firestorm.h b/config/firestorm/bli_kernel_defs_firestorm.h
new file mode 100644
index 000000000..60292099c
--- /dev/null
+++ b/config/firestorm/bli_kernel_defs_firestorm.h
@@ -0,0 +1,48 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   8
+#define BLIS_MR_d   6
+
+#define BLIS_NR_s   12
+#define BLIS_NR_d   8
+
+//#endif
+
diff --git a/config/generic/bli_kernel_defs_generic.h b/config/generic/bli_kernel_defs_generic.h
new file mode 100644
index 000000000..db2f32947
--- /dev/null
+++ b/config/generic/bli_kernel_defs_generic.h
@@ -0,0 +1,42 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+//#endif
+
diff --git a/config/haswell/bli_cntx_init_haswell.c b/config/haswell/bli_cntx_init_haswell.c
index f2dc900ea..fe3b45147 100644
--- a/config/haswell/bli_cntx_init_haswell.c
+++ b/config/haswell/bli_cntx_init_haswell.c
@@ -35,79 +35,58 @@
 
 #include "blis.h"
 
-//GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
-
 void bli_cntx_init_haswell( cntx_t* cntx )
 {
 	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
-	blksz_t thresh[ BLIS_NUM_THRESH ];
 
 	// Set default kernel blocksizes and functions.
 	bli_cntx_init_haswell_ref( cntx );
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  8,
+	  cntx,
+
 	  // gemm
 #if 1
-	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,       TRUE,
-	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,        TRUE,
-	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,        TRUE,
-	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,        TRUE,
+	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,
+	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,
+	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,
+	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,
 #else
-	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_16x6,       FALSE,
-	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_8x6,        FALSE,
-	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3,        FALSE,
-	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3,        FALSE,
+	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_16x6,
+	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_8x6,
+	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3,
+	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3,
 #endif
 	  // gemmtrsm_l
-	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_haswell_asm_6x8,  TRUE,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_haswell_asm_6x8,
 
 	  // gemmtrsm_u
-	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,  TRUE,
-	  cntx
-	);
+	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,
 
 #if 1
-	// Update the context with optimized packm kernels.
-	bli_cntx_set_packm_kers
-	(
-	  8,
-	  BLIS_PACKM_6XK_KER,  BLIS_FLOAT,    bli_spackm_haswell_asm_6xk,
-	  BLIS_PACKM_16XK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_16xk,
-	  BLIS_PACKM_6XK_KER,  BLIS_DOUBLE,   bli_dpackm_haswell_asm_6xk,
-	  BLIS_PACKM_8XK_KER,  BLIS_DOUBLE,   bli_dpackm_haswell_asm_8xk,
-	  BLIS_PACKM_3XK_KER,  BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
-	  BLIS_PACKM_8XK_KER,  BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
-	  BLIS_PACKM_3XK_KER,  BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
-	  BLIS_PACKM_4XK_KER,  BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
-	  cntx
-	);
+	  // packm
+	  BLIS_PACKM_MRXK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_6xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_16xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_6xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_8xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
 #endif
 
-	// Update the context with optimized level-1f kernels.
-	bli_cntx_set_l1f_kers
-	(
-	  4,
 	  // axpyf
 	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_8,
 	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_8,
 	  // dotxf
 	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
 	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
-	  cntx
-	);
-
-	// Update the context with optimized level-1v kernels.
-	bli_cntx_set_l1v_kers
-	(
-	  10,
 
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
@@ -137,7 +116,74 @@ void bli_cntx_init_haswell( cntx_t* cntx )
 	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
 	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
 #endif
-	  cntx
+
+	  // gemmsup
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
+
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
+	  BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
+	  BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // gemm
+#if 1
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_FLOAT,    TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_DOUBLE,   TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_DCOMPLEX, TRUE,
+#else
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_FLOAT,    FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_DOUBLE,   FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_SCOMPLEX, FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_DCOMPLEX, FALSE,
+#endif
+	  // gemmtrsm_l
+	  BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT,    TRUE,
+	  BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE,   TRUE,
+
+	  // gemmtrsm_u
+	  BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT,    TRUE,
+	  BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE,   TRUE,
+
+	  // gemmsup
+	  BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+
+	  BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
+	  BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
+	  BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
+	  BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
+	  BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
+	  BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
+	  BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
+	  BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -161,97 +207,54 @@ void bli_cntx_init_haswell( cntx_t* cntx )
 	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,     8,     8,     8 );
 	bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,     8,     8 );
 
+	// -------------------------------------------------------------------------
+
+	// Initialize sup thresholds with architecture-appropriate values.
+	//                                          s     d     c     z
+	bli_blksz_init_easy( &blkszs[ BLIS_MT ],  201,  201,   -1,   -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NT ],  201,  201,   -1,   -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KT ],  201,  201,   -1,   -1 );
+
+	// Initialize level-3 sup blocksize objects with architecture-specific
+	// values.
+	//                                           s      d      c      z
+	bli_blksz_init     ( &blkszs[ BLIS_MR_SUP ],     6,     6,    -1,    -1,
+	                                                 9,     9,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ],    16,     8,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ],   168,    72,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ],   256,   256,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ],  4080,  4080,    -1,    -1 );
+
 	// Update the context with the current architecture's register and cache
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 7,
+	  cntx,
+
 	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+
 	  // level-1f
 	  BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
 	  BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
-	  cntx
-	);
 
-	// -------------------------------------------------------------------------
+	  // gemmsup thresholds
+	  BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT,
+	  BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT,
+	  BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT,
 
-	// Initialize sup thresholds with architecture-appropriate values.
-	//                                          s     d     c     z
-	bli_blksz_init_easy( &thresh[ BLIS_MT ],  201,  201,   -1,   -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_NT ],  201,  201,   -1,   -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_KT ],  201,  201,   -1,   -1 );
-
-	// Initialize the context with the sup thresholds.
-	bli_cntx_set_l3_sup_thresh
-	(
-	  3,
-	  BLIS_MT, &thresh[ BLIS_MT ],
-	  BLIS_NT, &thresh[ BLIS_NT ],
-	  BLIS_KT, &thresh[ BLIS_KT ],
-	  cntx
-	);
+	  // level-3 sup
+	  BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP,
+	  BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP,
+	  BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP,
+	  BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP,
+	  BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP,
 
-#if 0
-	// Initialize the context with the sup handlers.
-	bli_cntx_set_l3_sup_handlers
-	(
-	  1,
-	  BLIS_GEMM, bli_gemmsup_ref,
-	  cntx
-	);
-#endif
-
-	// Update the context with optimized small/unpacked gemm kernels.
-	bli_cntx_set_l3_sup_kers
-	(
-	  16,
-	  //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
-	  BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-	  BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
-	  BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-	  BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-	  BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-	  BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
-	  BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-	  BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-
-	  BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
-	  BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
-	  BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
-	  BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
-	  BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
-	  BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
-	  BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
-	  BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
-	  cntx
-	);
-
-	// Initialize level-3 sup blocksize objects with architecture-specific
-	// values.
-	//                                           s      d      c      z
-	bli_blksz_init     ( &blkszs[ BLIS_MR ],     6,     6,    -1,    -1,
-	                                             9,     9,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   168,    72,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   256,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,  4080,    -1,    -1 );
-
-	// Update the context with the current architecture's register and cache
-	// blocksizes for small/unpacked level-3 problems.
-	bli_cntx_set_l3_sup_blkszs
-	(
-	  5,
-	  BLIS_NC, &blkszs[ BLIS_NC ],
-	  BLIS_KC, &blkszs[ BLIS_KC ],
-	  BLIS_MC, &blkszs[ BLIS_MC ],
-	  BLIS_NR, &blkszs[ BLIS_NR ],
-	  BLIS_MR, &blkszs[ BLIS_MR ],
-	  cntx
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/haswell/bli_kernel_defs_haswell.h b/config/haswell/bli_kernel_defs_haswell.h
new file mode 100644
index 000000000..c5bc8d63f
--- /dev/null
+++ b/config/haswell/bli_kernel_defs_haswell.h
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   6
+#define BLIS_MR_d   6
+#define BLIS_MR_c   3
+#define BLIS_MR_z   3
+
+#define BLIS_NR_s   16
+#define BLIS_NR_d   8
+#define BLIS_NR_c   8
+#define BLIS_NR_z   4
+
+//#endif
+
diff --git a/config/knc/bli_cntx_init_knc.c b/config/knc/bli_cntx_init_knc.c
index 198f08827..8f615588c 100644
--- a/config/knc/bli_cntx_init_knc.c
+++ b/config/knc/bli_cntx_init_knc.c
@@ -43,13 +43,26 @@ void bli_cntx_init_knc( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  1,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_knc_asm_30x8, TRUE,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knc_asm_30x8,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -58,7 +71,7 @@ void bli_cntx_init_knc( cntx_t* cntx )
 	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     0,     8,     0,     0 );
 	bli_blksz_init_easy( &blkszs[ BLIS_MC ],     0,   120,     0,     0,
 	                                             0,   160,     0,     0 );
-	bli_blksz_init     ( &blkszs[ BLIS_KC ],     0,   240,     0,     0,  
+	bli_blksz_init     ( &blkszs[ BLIS_KC ],     0,   240,     0,     0,
 	                                             0,   300,     0,     0 );
 	bli_blksz_init_easy( &blkszs[ BLIS_NC ],     0, 14400,     0,     0 );
 
@@ -66,13 +79,16 @@ void bli_cntx_init_knc( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/knc/bli_kernel_defs_knc.h b/config/knc/bli_kernel_defs_knc.h
new file mode 100644
index 000000000..0ae6d1b75
--- /dev/null
+++ b/config/knc/bli_kernel_defs_knc.h
@@ -0,0 +1,48 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_d   30
+
+#define BLIS_NR_d   8
+
+#define BLIS_PACKMR_d   32
+
+//#endif
+
diff --git a/config/knl/bli_cntx_init_knl.c b/config/knl/bli_cntx_init_knl.c
index 6da3b7a3a..87fa3176a 100644
--- a/config/knl/bli_cntx_init_knl.c
+++ b/config/knl/bli_cntx_init_knl.c
@@ -43,47 +43,33 @@ void bli_cntx_init_knl( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native kernels (gemm, packm, level-1f, level-1v).
+	bli_cntx_set_ukrs
 	(
-	  2,
-	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_knl_asm_24x16, FALSE,
-	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_knl_asm_24x8,  FALSE,
-	  cntx
-	);
+	  cntx,
 
-	// Update the context with optimized packm kernels.
-	bli_cntx_set_packm_kers
-	(
-	  2,
-	  BLIS_PACKM_8XK_KER,  BLIS_DOUBLE, bli_dpackm_knl_asm_8xk,
-	  BLIS_PACKM_24XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk,
-	  cntx
-	);
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_knl_asm_24x16,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8,
+
+	  // packm
+	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_8xk,
 
-	// Update the context with optimized level-1f kernels.
-	bli_cntx_set_l1f_kers
-	(
-	  4,
 	  // axpyf
-	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_8,
-	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_8,
+	  BLIS_AXPYF_KER, BLIS_FLOAT,  bli_saxpyf_zen_int_8,
+	  BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
+
 	  // dotxf
-	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
-	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
-	  cntx
-	);
+	  BLIS_DOTXF_KER, BLIS_FLOAT,  bli_sdotxf_zen_int_8,
+	  BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
 
-	// Update the context with optimized level-1v kernels.
-	bli_cntx_set_l1v_kers
-	(
-	  10,
 #if 1
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
 #endif
+
 	  // axpyv
 #if 0
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int,
@@ -92,12 +78,15 @@ void bli_cntx_init_knl( cntx_t* cntx )
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int10,
 	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int10,
 #endif
+
 	  // dotv
 	  BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int,
 	  BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int,
+
 	  // dotxv
 	  BLIS_DOTXV_KER,  BLIS_FLOAT,  bli_sdotxv_zen_int,
 	  BLIS_DOTXV_KER,  BLIS_DOUBLE, bli_ddotxv_zen_int,
+
 	  // scalv
 #if 0
 	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int,
@@ -106,7 +95,20 @@ void bli_cntx_init_knl( cntx_t* cntx )
 	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
 	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
 #endif
-	  cntx
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,  FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -125,17 +127,20 @@ void bli_cntx_init_knl( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 7,
+	  cntx,
+
 	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+
 	  // level-1f
 	  BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
 	  BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/knl/bli_kernel_defs_knl.h b/config/knl/bli_kernel_defs_knl.h
new file mode 100644
index 000000000..ce514bb21
--- /dev/null
+++ b/config/knl/bli_kernel_defs_knl.h
@@ -0,0 +1,48 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   24
+#define BLIS_MR_d   24
+
+#define BLIS_NR_s   16
+#define BLIS_NR_d   8
+
+//#endif
+
diff --git a/config/old/armv7a/bli_cntx_init_armv7a.c b/config/old/armv7a/bli_cntx_init_armv7a.c
index d4cc9e91d..acd8e6c18 100644
--- a/config/old/armv7a/bli_cntx_init_armv7a.c
+++ b/config/old/armv7a/bli_cntx_init_armv7a.c
@@ -66,7 +66,7 @@ void bli_cntx_init_armv7a( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  5,
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
diff --git a/config/old/haswellbb/bli_cntx_init_haswell.c b/config/old/haswellbb/bli_cntx_init_haswell.c
index 9e1d03503..88bd14a07 100644
--- a/config/old/haswellbb/bli_cntx_init_haswell.c
+++ b/config/old/haswellbb/bli_cntx_init_haswell.c
@@ -203,7 +203,7 @@ void bli_cntx_init_haswell( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 7,
+	  7,
 	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
diff --git a/config/penryn/bli_cntx_init_penryn.c b/config/penryn/bli_cntx_init_penryn.c
index 1576bf944..964438e83 100644
--- a/config/penryn/bli_cntx_init_penryn.c
+++ b/config/penryn/bli_cntx_init_penryn.c
@@ -43,18 +43,36 @@ void bli_cntx_init_penryn( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  4,
-	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_penryn_asm_8x4, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_penryn_asm_4x4, FALSE,
-	  //BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4, FALSE,
-	  //BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_penryn_asm_4x4, FALSE,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_penryn_asm_4x4, FALSE,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_penryn_asm_4x4, FALSE,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_penryn_asm_8x4,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_penryn_asm_4x4,
+	  //BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4,
+	  //BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_penryn_asm_4x4,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_penryn_asm_4x4,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_penryn_asm_4x4,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,  FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
+	  //BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
+	  //BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+	  BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
+	  BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -69,13 +87,16 @@ void bli_cntx_init_penryn( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/penryn/bli_kernel_defs_penryn.h b/config/penryn/bli_kernel_defs_penryn.h
new file mode 100644
index 000000000..f1e483646
--- /dev/null
+++ b/config/penryn/bli_kernel_defs_penryn.h
@@ -0,0 +1,48 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   8
+#define BLIS_MR_d   4
+
+#define BLIS_NR_s   4
+#define BLIS_NR_d   4
+
+//#endif
+
diff --git a/config/piledriver/bli_cntx_init_piledriver.c b/config/piledriver/bli_cntx_init_piledriver.c
index 4ed15e322..1c9a96fd9 100644
--- a/config/piledriver/bli_cntx_init_piledriver.c
+++ b/config/piledriver/bli_cntx_init_piledriver.c
@@ -43,16 +43,32 @@ void bli_cntx_init_piledriver( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  4,
-	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_piledriver_asm_16x3, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_piledriver_asm_8x3,  FALSE,
-	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2,  FALSE,
-	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2,  FALSE,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_piledriver_asm_16x3,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_piledriver_asm_8x3,
+	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2,
+	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,    FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE,   FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -67,13 +83,16 @@ void bli_cntx_init_piledriver( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/piledriver/bli_kernel_defs_piledriver.h b/config/piledriver/bli_kernel_defs_piledriver.h
new file mode 100644
index 000000000..df4a8c411
--- /dev/null
+++ b/config/piledriver/bli_kernel_defs_piledriver.h
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   16
+#define BLIS_MR_d   8
+#define BLIS_MR_c   4
+#define BLIS_MR_z   2
+
+#define BLIS_NR_s   3
+#define BLIS_NR_d   3
+#define BLIS_NR_c   2
+#define BLIS_NR_z   2
+
+//#endif
+
diff --git a/config/power10/bli_cntx_init_power10.c b/config/power10/bli_cntx_init_power10.c
index 14c940f99..12d9f51c6 100644
--- a/config/power10/bli_cntx_init_power10.c
+++ b/config/power10/bli_cntx_init_power10.c
@@ -34,35 +34,6 @@
 
 #include "blis.h"
 
-// Instantiate prototypes for packm kernels.
-PACKM_KER_PROT(    float,  s, packm_6xk_bb4_power10_ref )
-PACKM_KER_PROT(    double, d, packm_6xk_bb2_power10_ref )
-
-// Instantiate prototypes for level-3 kernels.
-GEMM_UKR_PROT(     float,  s, gemmbb_power10_ref )
-GEMMTRSM_UKR_PROT( float,  s, gemmtrsmbb_l_power10_ref )
-GEMMTRSM_UKR_PROT( float,  s, gemmtrsmbb_u_power10_ref )
-TRSM_UKR_PROT(     float,  s, trsmbb_l_power10_ref )
-TRSM_UKR_PROT(     float,  s, trsmbb_u_power10_ref )
-
-GEMM_UKR_PROT(     double, d, gemmbb_power10_ref )
-GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power10_ref )
-GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power10_ref )
-TRSM_UKR_PROT(     double, d, trsmbb_l_power10_ref )
-TRSM_UKR_PROT(     double, d, trsmbb_u_power10_ref )
-
-GEMM_UKR_PROT(     scomplex, c, gemmbb_power10_ref )
-GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power10_ref )
-GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power10_ref )
-TRSM_UKR_PROT(     scomplex, c, trsmbb_l_power10_ref )
-TRSM_UKR_PROT(     scomplex, c, trsmbb_u_power10_ref )
-
-GEMM_UKR_PROT(     dcomplex, z, gemmbb_power10_ref )
-GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power10_ref )
-GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power10_ref )
-TRSM_UKR_PROT(     dcomplex, z, trsmbb_l_power10_ref )
-TRSM_UKR_PROT(     dcomplex, z, trsmbb_u_power10_ref )
-
 void bli_cntx_init_power10( cntx_t* cntx )
 {
 	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
@@ -72,51 +43,38 @@ void bli_cntx_init_power10( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  12,
-	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_power10_mma_8x16,     TRUE,
-
-	  BLIS_TRSM_L_UKR,     BLIS_FLOAT,    bli_strsmbb_l_power10_ref,      FALSE,
-	  BLIS_TRSM_U_UKR,     BLIS_FLOAT,    bli_strsmbb_u_power10_ref,      FALSE,
-	  
-	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_power10_mma_8x8,      TRUE,  
-	  
-	  BLIS_TRSM_L_UKR,     BLIS_DOUBLE,   bli_dtrsmbb_l_power10_ref,      FALSE,
-	  BLIS_TRSM_U_UKR,     BLIS_DOUBLE,   bli_dtrsmbb_u_power10_ref,      FALSE,
-	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemmbb_power10_ref,        FALSE,
-	  BLIS_TRSM_L_UKR,     BLIS_SCOMPLEX, bli_ctrsmbb_l_power10_ref,      FALSE,
-	  BLIS_TRSM_U_UKR,     BLIS_SCOMPLEX, bli_ctrsmbb_u_power10_ref,      FALSE,
-	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemmbb_power10_ref,        FALSE,
-	  BLIS_TRSM_L_UKR,     BLIS_DCOMPLEX, bli_ztrsmbb_l_power10_ref,      FALSE,
-	  BLIS_TRSM_U_UKR,     BLIS_DCOMPLEX, bli_ztrsmbb_u_power10_ref,      FALSE,
-	  cntx
-	);
+	  cntx,
 
-	// Update the context with customized virtual [gemm]trsm micro-kernels.
-	bli_cntx_set_l3_vir_ukrs
-	(
-	  8,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsmbb_l_power10_ref,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsmbb_u_power10_ref,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsmbb_l_power10_ref,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsmbb_u_power10_ref,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power10_ref,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power10_ref,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power10_ref,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power10_ref,
-	  cntx
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_power10_mma_8x16,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8,
+
+	  BLIS_VA_END
 	);
 
-	// Update the context with optimized packm kernels.
-	bli_cntx_set_packm_kers
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
 	(
-	  2,
-	  BLIS_PACKM_6XK_KER,  BLIS_FLOAT,    bli_spackm_6xk_bb4_power10_ref,
-	  BLIS_PACKM_6XK_KER,  BLIS_DOUBLE,   bli_dpackm_6xk_bb2_power10_ref,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF,   BLIS_FLOAT,    TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,   BLIS_DOUBLE,   TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,   BLIS_SCOMPLEX, FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF,   BLIS_DCOMPLEX, FALSE,
+	  BLIS_TRSM_L_UKR_ROW_PREF, BLIS_FLOAT,    FALSE,
+	  BLIS_TRSM_U_UKR_ROW_PREF, BLIS_FLOAT,    FALSE,
+	  BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DOUBLE,   FALSE,
+	  BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DOUBLE,   FALSE,
+	  BLIS_TRSM_L_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
+	  BLIS_TRSM_U_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
+	  BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+	  BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	//                                           s      d      c      z
@@ -131,14 +89,16 @@ void bli_cntx_init_power10( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
 	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
+
+	  BLIS_VA_END
 	);
 
 }
diff --git a/config/power10/bli_kernel_defs_power10.h b/config/power10/bli_kernel_defs_power10.h
new file mode 100644
index 000000000..4e32f1173
--- /dev/null
+++ b/config/power10/bli_kernel_defs_power10.h
@@ -0,0 +1,51 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   8
+#define BLIS_MR_d   8
+
+#define BLIS_NR_s   16
+#define BLIS_NR_d   8
+
+#define BLIS_BBN_s   4
+#define BLIS_BBN_d   2
+
+//#endif
+
diff --git a/config/power7/bli_cntx_init_power7.c b/config/power7/bli_cntx_init_power7.c
index c9caf62a6..d5ffe7dcf 100644
--- a/config/power7/bli_cntx_init_power7.c
+++ b/config/power7/bli_cntx_init_power7.c
@@ -43,13 +43,26 @@ void bli_cntx_init_power7( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  1,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_power7_int_8x4,  FALSE,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -64,13 +77,16 @@ void bli_cntx_init_power7( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/power7/bli_kernel_defs_power7.h b/config/power7/bli_kernel_defs_power7.h
new file mode 100644
index 000000000..ceec01df3
--- /dev/null
+++ b/config/power7/bli_kernel_defs_power7.h
@@ -0,0 +1,46 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_d   8
+
+#define BLIS_NR_d   4
+
+//#endif
+
diff --git a/config/power9/bli_cntx_init_power9.c b/config/power9/bli_cntx_init_power9.c
index 4370ce26c..9f2d67632 100644
--- a/config/power9/bli_cntx_init_power9.c
+++ b/config/power9/bli_cntx_init_power9.c
@@ -34,35 +34,6 @@
 
 #include "blis.h"
 
-// Instantiate prototypes for packm kernels.
-PACKM_KER_PROT(    float,  s, packm_6xk_bb4_power9_ref )
-PACKM_KER_PROT(    double, d, packm_6xk_bb2_power9_ref )
-
-// Instantiate prototypes for level-3 kernels.
-GEMM_UKR_PROT(     float,  s, gemmbb_power9_ref )
-GEMMTRSM_UKR_PROT( float,  s, gemmtrsmbb_l_power9_ref )
-GEMMTRSM_UKR_PROT( float,  s, gemmtrsmbb_u_power9_ref )
-TRSM_UKR_PROT(     float,  s, trsmbb_l_power9_ref )
-TRSM_UKR_PROT(     float,  s, trsmbb_u_power9_ref )
-
-GEMM_UKR_PROT(     double, d, gemmbb_power9_ref )
-GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power9_ref )
-GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power9_ref )
-TRSM_UKR_PROT(     double, d, trsmbb_l_power9_ref )
-TRSM_UKR_PROT(     double, d, trsmbb_u_power9_ref )
-
-GEMM_UKR_PROT(     scomplex, c, gemmbb_power9_ref )
-GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power9_ref )
-GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power9_ref )
-TRSM_UKR_PROT(     scomplex, c, trsmbb_l_power9_ref )
-TRSM_UKR_PROT(     scomplex, c, trsmbb_u_power9_ref )
-
-GEMM_UKR_PROT(     dcomplex, z, gemmbb_power9_ref )
-GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power9_ref )
-GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power9_ref )
-TRSM_UKR_PROT(     dcomplex, z, trsmbb_l_power9_ref )
-TRSM_UKR_PROT(     dcomplex, z, trsmbb_u_power9_ref )
-
 void bli_cntx_init_power9( cntx_t* cntx )
 {
 	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
@@ -72,50 +43,37 @@ void bli_cntx_init_power9( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  12,
-	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemmbb_power9_ref,        FALSE,
-	  BLIS_TRSM_L_UKR,     BLIS_FLOAT,    bli_strsmbb_l_power9_ref,      FALSE,
-	  BLIS_TRSM_U_UKR,     BLIS_FLOAT,    bli_strsmbb_u_power9_ref,      FALSE,
-
-	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_power9_asm_12x6,     FALSE,
-	  
-	  BLIS_TRSM_L_UKR,     BLIS_DOUBLE,   bli_dtrsmbb_l_power9_ref,      FALSE,
-	  BLIS_TRSM_U_UKR,     BLIS_DOUBLE,   bli_dtrsmbb_u_power9_ref,      FALSE,
-	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemmbb_power9_ref,        FALSE,
-	  BLIS_TRSM_L_UKR,     BLIS_SCOMPLEX, bli_ctrsmbb_l_power9_ref,      FALSE,
-	  BLIS_TRSM_U_UKR,     BLIS_SCOMPLEX, bli_ctrsmbb_u_power9_ref,      FALSE,
-	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemmbb_power9_ref,        FALSE,
-	  BLIS_TRSM_L_UKR,     BLIS_DCOMPLEX, bli_ztrsmbb_l_power9_ref,      FALSE,
-	  BLIS_TRSM_U_UKR,     BLIS_DCOMPLEX, bli_ztrsmbb_u_power9_ref,      FALSE,
-	  cntx
-	);
+	  cntx,
 
-	// Update the context with customized virtual [gemm]trsm micro-kernels.
-	bli_cntx_set_l3_vir_ukrs
-	(
-	  8,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsmbb_l_power9_ref,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsmbb_u_power9_ref,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsmbb_l_power9_ref,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsmbb_u_power9_ref,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power9_ref,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power9_ref,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power9_ref,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power9_ref,
-	  cntx
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6,
+
+	  BLIS_VA_END
 	);
 
-	// Update the context with optimized packm kernels.
-	bli_cntx_set_packm_kers
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
 	(
-	  2,
-	  BLIS_PACKM_6XK_KER,  BLIS_FLOAT,    bli_spackm_6xk_bb4_power9_ref,
-	  BLIS_PACKM_6XK_KER,  BLIS_DOUBLE,   bli_dpackm_6xk_bb2_power9_ref,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF,   BLIS_FLOAT,    FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF,   BLIS_DOUBLE,   FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF,   BLIS_SCOMPLEX, FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF,   BLIS_DCOMPLEX, FALSE,
+	  BLIS_TRSM_L_UKR_ROW_PREF, BLIS_FLOAT,    FALSE,
+	  BLIS_TRSM_U_UKR_ROW_PREF, BLIS_FLOAT,    FALSE,
+	  BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DOUBLE,   FALSE,
+	  BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DOUBLE,   FALSE,
+	  BLIS_TRSM_L_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
+	  BLIS_TRSM_U_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
+	  BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+	  BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+
+	  BLIS_VA_END
 	);
 
 
@@ -131,14 +89,15 @@ void bli_cntx_init_power9( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
 	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
-	);
 
+	  BLIS_VA_END
+	);
 }
diff --git a/config/power9/bli_kernel_defs_power9.h b/config/power9/bli_kernel_defs_power9.h
new file mode 100644
index 000000000..debfeac5f
--- /dev/null
+++ b/config/power9/bli_kernel_defs_power9.h
@@ -0,0 +1,49 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_d   12
+
+#define BLIS_NR_d   6
+
+#define BLIS_BBN_s   4
+#define BLIS_BBN_d   2
+
+//#endif
+
diff --git a/config/sandybridge/bli_cntx_init_sandybridge.c b/config/sandybridge/bli_cntx_init_sandybridge.c
index 1ffa5bf8b..0697a3351 100644
--- a/config/sandybridge/bli_cntx_init_sandybridge.c
+++ b/config/sandybridge/bli_cntx_init_sandybridge.c
@@ -43,16 +43,32 @@ void bli_cntx_init_sandybridge( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  4,
-	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_sandybridge_asm_8x8, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_sandybridge_asm_8x4, FALSE,
-	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, FALSE,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_sandybridge_asm_8x8,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_sandybridge_asm_8x4,
+	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4,
+	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,    FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE,   FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -67,13 +83,16 @@ void bli_cntx_init_sandybridge( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/sandybridge/bli_kernel_defs_sandybridge.h b/config/sandybridge/bli_kernel_defs_sandybridge.h
new file mode 100644
index 000000000..dc1b843f6
--- /dev/null
+++ b/config/sandybridge/bli_kernel_defs_sandybridge.h
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   8
+#define BLIS_MR_d   8
+#define BLIS_MR_c   8
+#define BLIS_MR_z   4
+
+#define BLIS_NR_s   8
+#define BLIS_NR_d   4
+#define BLIS_NR_c   4
+#define BLIS_NR_z   4
+
+//#endif
+
diff --git a/config/skx/bli_cntx_init_skx.c b/config/skx/bli_cntx_init_skx.c
index f18503a7a..3af58b38d 100644
--- a/config/skx/bli_cntx_init_skx.c
+++ b/config/skx/bli_cntx_init_skx.c
@@ -43,39 +43,29 @@ void bli_cntx_init_skx( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized gemm micro-kernels and level-1f/1v kernels.
+	bli_cntx_set_ukrs
 	(
-	  2,
-	  // gemm
-	  BLIS_GEMM_UKR,       BLIS_FLOAT ,   bli_sgemm_skx_asm_32x12_l2,   FALSE,
-	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_skx_asm_16x14,      FALSE,
-	  cntx
-	);
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR,       BLIS_FLOAT ,   bli_sgemm_skx_asm_32x12_l2,
+	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_skx_asm_16x14,
 
-	// Update the context with optimized level-1f kernels.
-	bli_cntx_set_l1f_kers
-	(
-	  4,
 	  // axpyf
 	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_8,
 	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_8,
+
 	  // dotxf
 	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
 	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
-	  cntx
-	);
 
-	// Update the context with optimized level-1v kernels.
-	bli_cntx_set_l1v_kers
-	(
-	  10,
 #if 1
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
 #endif
+
 	  // axpyv
 #if 0
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int,
@@ -84,12 +74,15 @@ void bli_cntx_init_skx( cntx_t* cntx )
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int10,
 	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int10,
 #endif
+
 	  // dotv
 	  BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int,
 	  BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int,
+
 	  // dotxv
 	  BLIS_DOTXV_KER,  BLIS_FLOAT,  bli_sdotxv_zen_int,
 	  BLIS_DOTXV_KER,  BLIS_DOUBLE, bli_ddotxv_zen_int,
+
 	  // scalv
 #if 0
 	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int,
@@ -98,7 +91,20 @@ void bli_cntx_init_skx( cntx_t* cntx )
 	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
 	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
 #endif
-	  cntx
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT , FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -116,17 +122,20 @@ void bli_cntx_init_skx( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 7,
+	  cntx,
+
 	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+
 	  // level-1f
 	  BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
 	  BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/skx/bli_kernel_defs_skx.h b/config/skx/bli_kernel_defs_skx.h
new file mode 100644
index 000000000..2aaf477ad
--- /dev/null
+++ b/config/skx/bli_kernel_defs_skx.h
@@ -0,0 +1,48 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   32
+#define BLIS_MR_d   16
+
+#define BLIS_NR_s   12
+#define BLIS_NR_d   14
+
+//#endif
+
diff --git a/config/steamroller/bli_cntx_init_steamroller.c b/config/steamroller/bli_cntx_init_steamroller.c
index 13e7f6495..4b4ecdf4e 100644
--- a/config/steamroller/bli_cntx_init_steamroller.c
+++ b/config/steamroller/bli_cntx_init_steamroller.c
@@ -43,16 +43,32 @@ void bli_cntx_init_steamroller( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  4,
-	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_piledriver_asm_16x3, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_piledriver_asm_8x3,  FALSE,
-	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2,  FALSE,
-	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2,  FALSE,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_piledriver_asm_16x3,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_piledriver_asm_8x3,
+	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2,
+	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,    FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE,   FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -67,13 +83,16 @@ void bli_cntx_init_steamroller( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/steamroller/bli_kernel_defs_steamroller.h b/config/steamroller/bli_kernel_defs_steamroller.h
new file mode 100644
index 000000000..df4a8c411
--- /dev/null
+++ b/config/steamroller/bli_kernel_defs_steamroller.h
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   16
+#define BLIS_MR_d   8
+#define BLIS_MR_c   4
+#define BLIS_MR_z   2
+
+#define BLIS_NR_s   3
+#define BLIS_NR_d   3
+#define BLIS_NR_c   2
+#define BLIS_NR_z   2
+
+//#endif
+
diff --git a/config/template/bli_cntx_init_template.c b/config/template/bli_cntx_init_template.c
index f2b1c8d17..4bacc5d63 100644
--- a/config/template/bli_cntx_init_template.c
+++ b/config/template/bli_cntx_init_template.c
@@ -45,34 +45,44 @@ void bli_cntx_init_template( cntx_t* cntx )
 
 	// Update the context with optimized native gemm micro-kernels and
 	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	bli_cntx_set_ukrs
 	(
-	  5,
-	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_template_noopt,       FALSE,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt, FALSE,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt, FALSE,
-	  BLIS_TRSM_L_UKR,     BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt,     FALSE,
-	  BLIS_TRSM_U_UKR,     BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt,     FALSE,
-	  cntx
-	);
+	  cntx,
 
-	// Update the context with optimized level-1f kernels.
-	bli_cntx_set_l1f_kers
-	(
+	  // level-3
+	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_template_noopt,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt,
+	  BLIS_TRSM_L_UKR,     BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt,
+	  BLIS_TRSM_U_UKR,     BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt,
+
+	  // level-1f
 	  BLIS_AXPY2V_KER,    BLIS_DCOMPLEX, bli_zaxpy2v_template_noopt,
 	  BLIS_DOTAXPYV_KER,  BLIS_DCOMPLEX, bli_zdotaxpyv_template_noopt,
 	  BLIS_AXPYF_KER,     BLIS_DCOMPLEX, bli_zaxpyf_template_noopt,
 	  BLIS_DOTXF_KER,     BLIS_DCOMPLEX, bli_zdotxf_template_noopt,
 	  BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_template_noopt,
-	  cntx
-	);
 
-	// Update the context with optimized level-1v kernels.
-	bli_cntx_set_l1v_kers
-	(
+	  // level-1v
 	  BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_template_noopt,
 	  BLIS_DOTV_KER,  BLIS_DCOMPLEX, bli_zdotv_template_noopt,
-	  cntx
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_DCOMPLEX, FALSE,
+	  BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+	  BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+	  BLIS_TRSM_L_UKR_ROW_PREF,     BLIS_DCOMPLEX, FALSE,
+	  BLIS_TRSM_U_UKR_ROW_PREF,     BLIS_DCOMPLEX, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -87,13 +97,16 @@ void bli_cntx_init_template( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/template/bli_kernel_defs_template.h b/config/template/bli_kernel_defs_template.h
new file mode 100644
index 000000000..86a33d8d8
--- /dev/null
+++ b/config/template/bli_kernel_defs_template.h
@@ -0,0 +1,60 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+//
+// Define register block sizes here only for the datatypes whose values differ
+// from the defaults (i.e. datatypes for which an optimized kernel is provided).
+//
+
+#define BLIS_MR_z   4
+
+#define BLIS_NR_z   4
+
+//
+// PACKMR/PACKNR do not need to be defined unless they are different from the
+// "normal" MR/NR.
+//
+
+//#define BLIS_PACKMR_z   4
+
+//#define BLIS_PACKNR_z   4
+
+//#endif
+
diff --git a/config/thunderx2/bli_cntx_init_thunderx2.c b/config/thunderx2/bli_cntx_init_thunderx2.c
index f2b7b633d..9d1af2c99 100644
--- a/config/thunderx2/bli_cntx_init_thunderx2.c
+++ b/config/thunderx2/bli_cntx_init_thunderx2.c
@@ -43,14 +43,28 @@ void bli_cntx_init_thunderx2( cntx_t* cntx )
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  2,
-	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_armv8a_asm_8x12, FALSE,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_armv8a_asm_6x8,  FALSE,
-	  cntx
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armv8a_asm_8x12,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,  FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -65,13 +79,16 @@ void bli_cntx_init_thunderx2( cntx_t* cntx )
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  cntx,
+
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  cntx
+
+	  BLIS_VA_END
 	);
 }
 
diff --git a/config/thunderx2/bli_kernel_defs_thunderx2.h b/config/thunderx2/bli_kernel_defs_thunderx2.h
new file mode 100644
index 000000000..60292099c
--- /dev/null
+++ b/config/thunderx2/bli_kernel_defs_thunderx2.h
@@ -0,0 +1,48 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   8
+#define BLIS_MR_d   6
+
+#define BLIS_NR_s   12
+#define BLIS_NR_d   8
+
+//#endif
+
diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c
index 1b16cd06f..a10986b23 100644
--- a/config/zen/bli_cntx_init_zen.c
+++ b/config/zen/bli_cntx_init_zen.c
@@ -40,92 +40,107 @@
 void bli_cntx_init_zen( cntx_t* cntx )
 {
 	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
-	blksz_t thresh[ BLIS_NUM_THRESH ];
 
 	// Set default kernel blocksizes and functions.
 	bli_cntx_init_zen_ref( cntx );
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized kernels (level-3, sup, packm, level-1f, level-1v).
+	bli_cntx_set_ukrs
 	(
-	  8,
+	  cntx,
 
 	  // gemm
-	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,       TRUE,
-	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,        TRUE,
-	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,        TRUE,
-	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,        TRUE,
+	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,
+	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,
+	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,
+	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,
 
 	  // gemmtrsm_l
-	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_haswell_asm_6x8,  TRUE,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_haswell_asm_6x8,
 
 	  // gemmtrsm_u
-	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,  TRUE,
 
-	  cntx
-	);
+	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,
+
+	  // gemmsup
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
+
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
+	  BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
+	  BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
+#if 0
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
+	  BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
+	  BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
+#endif
 
-#if 1
-	// Update the context with optimized packm kernels.
-	bli_cntx_set_packm_kers
-	(
-	  8,
-	  BLIS_PACKM_6XK_KER,  BLIS_FLOAT,    bli_spackm_haswell_asm_6xk,
-	  BLIS_PACKM_16XK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_16xk,
-	  BLIS_PACKM_6XK_KER,  BLIS_DOUBLE,   bli_dpackm_haswell_asm_6xk,
-	  BLIS_PACKM_8XK_KER,  BLIS_DOUBLE,   bli_dpackm_haswell_asm_8xk,
-	  BLIS_PACKM_3XK_KER,  BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
-	  BLIS_PACKM_8XK_KER,  BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
-	  BLIS_PACKM_3XK_KER,  BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
-	  BLIS_PACKM_4XK_KER,  BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
-	  cntx
-	);
+#if 0
+	  // NOTE: This set of kernels is likely broken and therefore disabled.
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
+
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
 #endif
 
-	// Update the context with optimized level-1f kernels.
-	bli_cntx_set_l1f_kers
-	(
-	  4,
+	  // packm
+	  BLIS_PACKM_MRXK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_6xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_16xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_6xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_8xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
 
 	  // axpyf
-	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_8,
-	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_8,
+	  BLIS_AXPYF_KER,  BLIS_FLOAT,  bli_saxpyf_zen_int_8,
+	  BLIS_AXPYF_KER,  BLIS_DOUBLE, bli_daxpyf_zen_int_8,
 
 	  // dotxf
-	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
-	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
-
-	  cntx
-	);
-
-	// Update the context with optimized level-1v kernels.
-	bli_cntx_set_l1v_kers
-	(
-	  16,
+	  BLIS_DOTXF_KER,  BLIS_FLOAT,  bli_sdotxf_zen_int_8,
+	  BLIS_DOTXF_KER,  BLIS_DOUBLE, bli_ddotxf_zen_int_8,
 
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
 
 	  // axpyv
-#if 0
-	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int,
-	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int,
-#else
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int10,
 	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int10,
-#endif
 
-#if 1
 	  // copyv
 	  BLIS_COPYV_KER,  BLIS_FLOAT,  bli_scopyv_zen_int,
 	  BLIS_COPYV_KER,  BLIS_DOUBLE, bli_dcopyv_zen_int,
-#endif
 
 	  // dotv
 	  BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int,
@@ -136,25 +151,76 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	  BLIS_DOTXV_KER,  BLIS_DOUBLE, bli_ddotxv_zen_int,
 
 	  // scalv
-#if 0
-	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int,
-	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int,
-#else
 	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
 	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
-#endif
 
-#if 1
 	  // setv
-	  BLIS_SETV_KER,  BLIS_FLOAT,  bli_ssetv_zen_int,
-	  BLIS_SETV_KER,  BLIS_DOUBLE, bli_dsetv_zen_int,
+	  BLIS_SETV_KER,   BLIS_FLOAT,  bli_ssetv_zen_int,
+	  BLIS_SETV_KER,   BLIS_DOUBLE, bli_dsetv_zen_int,
 
 	  // swapv
 	  BLIS_SWAPV_KER,  BLIS_FLOAT,  bli_sswapv_zen_int8,
 	  BLIS_SWAPV_KER,  BLIS_DOUBLE, bli_dswapv_zen_int8,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // gemm
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_FLOAT,    TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_DOUBLE,   TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_DCOMPLEX, TRUE,
+
+	  // gemmtrsm_l
+	  BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT,    TRUE,
+	  BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE,   TRUE,
+
+	  // gemmtrsm_u
+	  BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT,    TRUE,
+	  BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE,   TRUE,
+
+	  // gemmsup
+	  BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+
+	  BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+
+#if 0
+	  // NOTE: This set of kernels is likely broken and therefore disabled.
+	  BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
+
+	  BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
 #endif
- 
-	  cntx
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -195,131 +261,74 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,     8,    -1,    -1 );
 	bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,    -1,    -1 );
 
+	// Initialize sup thresholds with architecture-appropriate values.
+	//                                           s      d      c      z
+	bli_blksz_init_easy( &blkszs[ BLIS_MT ],   512,   256,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NT ],   512,   256,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KT ],   440,   220,    -1,    -1 );
+
+	// Initialize level-3 sup blocksize objects with architecture-specific
+	// values.
+	//                                               s      d      c      z
+	bli_blksz_init     ( &blkszs[ BLIS_MR_SUP ],     6,     6,    -1,    -1,
+	                                                 9,     9,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ],    16,     8,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ],   144,    72,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ],   256,   256,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ],  8160,  4080,    -1,    -1 );
+#if 0
+	bli_blksz_init     ( &blkszs[ BLIS_MR_SUP ],     6,     6,     3,     3,
+	                                                 9,     9,     3,     3 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ],    16,     8,     8,     4 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ],   144,    72,    72,    36 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ],   512,   256,   128,    64 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ],  8160,  4080,  2040,  1020 );
+#endif
+
 	// Update the context with the current architecture's register and cache
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 7,
+	  cntx,
+
 	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+
 	  // level-1f
 	  BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
 	  BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
-	  cntx
-	);
 
-	// -------------------------------------------------------------------------
+	  // sup thresholds
+	  BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT,
+	  BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT,
+	  BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT,
 
-	// Initialize sup thresholds with architecture-appropriate values.
-	//                                           s      d      c      z
-	bli_blksz_init_easy( &thresh[ BLIS_MT ],   512,   256,    -1,    -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_NT ],   512,   256,    -1,    -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_KT ],   440,   220,    -1,    -1 );
+	  // gemmsup
+	  BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP,
+	  BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP,
+	  BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP,
+	  BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP,
+	  BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP,
 
-	// Initialize the context with the sup thresholds.
-	bli_cntx_set_l3_sup_thresh
-	(
-	  3,
-	  BLIS_MT, &thresh[ BLIS_MT ],
-	  BLIS_NT, &thresh[ BLIS_NT ],
-	  BLIS_KT, &thresh[ BLIS_KT ],
-	  cntx
+	  BLIS_VA_END
 	);
 
+	// -------------------------------------------------------------------------
+
+#if 0
 	// Initialize the context with the sup handlers.
 	bli_cntx_set_l3_sup_handlers
 	(
-	  1,
+	  cntx,
+	  
 	  BLIS_GEMM, bli_gemmsup_ref,
 	  //BLIS_GEMMT, bli_gemmtsup_ref,
-	  cntx
-	);
 
-	// Update the context with optimized small/unpacked gemm kernels.
-	bli_cntx_set_l3_sup_kers
-	(
-	  16,
-	  //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
-	  BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-	  BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
-	  BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-	  BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-	  BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-	  BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
-	  BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-	  BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-
-	  BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
-	  BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
-	  BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
-	  BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
-	  BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
-	  BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
-	  BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
-	  BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
-#if 0
-	  BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
-	  BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
-	  BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
-	  BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
-	  BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
-	  BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
-	  BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
-	  BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
-#endif
-
-#if 0
-	  // NOTE: This set of kernels is likely broken and therefore disabled.
-	  BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
-	  BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
-	  BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
-	  BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
-	  BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
-	  BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
-
-	  BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
-	  BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
-	  BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
-	  BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
-	  BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
-	  BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
-#endif
-	  cntx
+	  BLIS_VA_END
 	);
-
-	// Initialize level-3 sup blocksize objects with architecture-specific
-	// values.
-	//                                           s      d      c      z
-	bli_blksz_init     ( &blkszs[ BLIS_MR ],     6,     6,    -1,    -1,
-	                                             9,     9,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    72,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   256,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  8160,  4080,    -1,    -1 );
-#if 0
-	bli_blksz_init     ( &blkszs[ BLIS_MR ],     6,     6,     3,     3,
-	                                             9,     9,     3,     3 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,     8,     4 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    72,    72,    36 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   512,   256,   128,    64 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  8160,  4080,  2040,  1020 );
 #endif
-
-	// Update the context with the current architecture's register and cache
-	// blocksizes for small/unpacked level-3 problems.
-	bli_cntx_set_l3_sup_blkszs
-	(
-	  5,
-	  BLIS_NC, &blkszs[ BLIS_NC ],
-	  BLIS_KC, &blkszs[ BLIS_KC ],
-	  BLIS_MC, &blkszs[ BLIS_MC ],
-	  BLIS_NR, &blkszs[ BLIS_NR ],
-	  BLIS_MR, &blkszs[ BLIS_MR ],
-	  cntx
-	);
 }
-
diff --git a/config/zen/bli_kernel_defs_zen.h b/config/zen/bli_kernel_defs_zen.h
new file mode 100644
index 000000000..c5bc8d63f
--- /dev/null
+++ b/config/zen/bli_kernel_defs_zen.h
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   6
+#define BLIS_MR_d   6
+#define BLIS_MR_c   3
+#define BLIS_MR_z   3
+
+#define BLIS_NR_s   16
+#define BLIS_NR_d   8
+#define BLIS_NR_c   8
+#define BLIS_NR_z   4
+
+//#endif
+
diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c
index ba728602b..c7e40b4d0 100644
--- a/config/zen2/bli_cntx_init_zen2.c
+++ b/config/zen2/bli_cntx_init_zen2.c
@@ -38,73 +38,94 @@
 void bli_cntx_init_zen2( cntx_t* cntx )
 {
 	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
-	blksz_t thresh[ BLIS_NUM_THRESH ];
 
 	// Set default kernel blocksizes and functions.
 	bli_cntx_init_zen2_ref( cntx );
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized kernels (l3 ukrs, sup, packm, l1f, l1v).
+	bli_cntx_set_ukrs
 	(
-	  8,
+	  cntx,
 
 	  // gemm
-	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,       TRUE,
-	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,        TRUE,
-	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,        TRUE,
-	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,        TRUE,
+	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,
+	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,
+	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,
+	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,
 
 	  // gemmtrsm_l
-	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_haswell_asm_6x8,  TRUE,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_haswell_asm_6x8,
 
 	  // gemmtrsm_u
-	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,  TRUE,
-
-	  cntx
-	);
+	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,
+
+	  // level-3 sup
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
+
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
+	  BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
+	  BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
+#if 0
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
+	  BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
+	  BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
+#endif
 
-#if 1
-	// Update the context with optimized packm kernels.
-	bli_cntx_set_packm_kers
-	(
-	  8,
-	  BLIS_PACKM_6XK_KER,  BLIS_FLOAT,    bli_spackm_haswell_asm_6xk,
-	  BLIS_PACKM_16XK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_16xk,
-	  BLIS_PACKM_6XK_KER,  BLIS_DOUBLE,   bli_dpackm_haswell_asm_6xk,
-	  BLIS_PACKM_8XK_KER,  BLIS_DOUBLE,   bli_dpackm_haswell_asm_8xk,
-	  BLIS_PACKM_3XK_KER,  BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
-	  BLIS_PACKM_8XK_KER,  BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
-	  BLIS_PACKM_3XK_KER,  BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
-	  BLIS_PACKM_4XK_KER,  BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
-	  cntx
-	);
+#if 0
+	  // NOTE: This set of kernels is likely broken and therefore disabled.
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
+
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
 #endif
 
-	// Update the context with optimized level-1f kernels.
-	bli_cntx_set_l1f_kers
-	(
-	  4,
+	  // packm
+	  BLIS_PACKM_MRXK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_6xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_16xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_6xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_8xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
 
 	  // axpyf
-	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_5,
-	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_5,
+	  BLIS_AXPYF_KER,  BLIS_FLOAT,  bli_saxpyf_zen_int_5,
+	  BLIS_AXPYF_KER,  BLIS_DOUBLE, bli_daxpyf_zen_int_5,
 
 	  // dotxf
-	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
-	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
-
-	  cntx
-	);
-
-	// Update the context with optimized level-1v kernels.
-	bli_cntx_set_l1v_kers
-	(
-	  16,
+	  BLIS_DOTXF_KER,  BLIS_FLOAT,  bli_sdotxf_zen_int_8,
+	  BLIS_DOTXF_KER,  BLIS_DOUBLE, bli_ddotxf_zen_int_8,
 
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
@@ -127,18 +148,59 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
 
 	  //swap
-	  BLIS_SWAPV_KER, BLIS_FLOAT,   bli_sswapv_zen_int8,
-	  BLIS_SWAPV_KER, BLIS_DOUBLE,  bli_dswapv_zen_int8,
+	  BLIS_SWAPV_KER,  BLIS_FLOAT,  bli_sswapv_zen_int8,
+	  BLIS_SWAPV_KER,  BLIS_DOUBLE, bli_dswapv_zen_int8,
 
 	  //copy
 	  BLIS_COPYV_KER,  BLIS_FLOAT,  bli_scopyv_zen_int,
 	  BLIS_COPYV_KER,  BLIS_DOUBLE, bli_dcopyv_zen_int,
 
 	  //set
-	  BLIS_SETV_KER,  BLIS_FLOAT,  bli_ssetv_zen_int,
-	  BLIS_SETV_KER,  BLIS_DOUBLE, bli_dsetv_zen_int,
+	  BLIS_SETV_KER,   BLIS_FLOAT,  bli_ssetv_zen_int,
+	  BLIS_SETV_KER,   BLIS_DOUBLE, bli_dsetv_zen_int,
 
-	  cntx
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // gemm
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_FLOAT,    TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_DOUBLE,   TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_DCOMPLEX, TRUE,
+
+	  // gemmtrsm_l
+	  BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT,    TRUE,
+	  BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE,   TRUE,
+
+	  // gemmtrsm_u
+	  BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT,    TRUE,
+	  BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE,   TRUE,
+
+	  // level-3 sup
+	  BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+
+	  BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -158,130 +220,73 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     5,     5,    -1,    -1 );
 	bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,    -1,    -1 );
 
+	// Initialize sup thresholds with architecture-appropriate values.
+	//                                          s     d     c     z
+#if 1
+	bli_blksz_init_easy( &blkszs[ BLIS_MT ],  500,  249,   -1,   -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NT ],  500,  249,   -1,   -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KT ],  500,  249,   -1,   -1 );
+#else
+	bli_blksz_init_easy( &blkszs[ BLIS_MT ], 100000, 100000,   -1,   -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NT ], 100000, 100000,   -1,   -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KT ], 100000, 100000,   -1,   -1 );
+#endif
+
+	// Initialize level-3 sup blocksize objects with architecture-specific
+	// values.
+	//                                               s      d      c      z
+	bli_blksz_init     ( &blkszs[ BLIS_MR_SUP ],     6,     6,    -1,    -1,
+	                                                 9,     9,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ],    16,     8,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ],   168,    72,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ],   256,   256,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ],  4080,  4080,    -1,    -1 );
+
 	// Update the context with the current architecture's register and cache
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 7,
+	  cntx,
+
 	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+
 	  // level-1f
 	  BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
 	  BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
-	  cntx
-	);
 
-	// -------------------------------------------------------------------------
+	  // sup thresholds
+	  BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT,
+	  BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT,
+	  BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT,
 
-	// Initialize sup thresholds with architecture-appropriate values.
-	//                                          s     d     c     z
-#if 1
-	bli_blksz_init_easy( &thresh[ BLIS_MT ],  500,  249,   -1,   -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_NT ],  500,  249,   -1,   -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_KT ],  500,  249,   -1,   -1 );
-#else
-	bli_blksz_init_easy( &thresh[ BLIS_MT ], 100000, 100000,   -1,   -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_NT ], 100000, 100000,   -1,   -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_KT ], 100000, 100000,   -1,   -1 );
-#endif
+	  // level-3 sup
+	  BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NC_SUP,
+	  BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KC_SUP,
+	  BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MC_SUP,
+	  BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP,
+	  BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP,
 
-	// Initialize the context with the sup thresholds.
-	bli_cntx_set_l3_sup_thresh
-	(
-	  3,
-	  BLIS_MT, &thresh[ BLIS_MT ],
-	  BLIS_NT, &thresh[ BLIS_NT ],
-	  BLIS_KT, &thresh[ BLIS_KT ],
-	  cntx
+	  BLIS_VA_END
 	);
 
+	// -------------------------------------------------------------------------
+
 #if 0
 	// Initialize the context with the sup handlers.
 	bli_cntx_set_l3_sup_handlers
 	(
-	  1,
+	  cntx,
+	  
 	  BLIS_GEMM, bli_gemmsup_ref,
-	  cntx
-	);
-#endif
+	  //BLIS_GEMMT, bli_gemmtsup_ref,
 
-	// Update the context with optimized small/unpacked gemm kernels.
-	bli_cntx_set_l3_sup_kers
-	(
-	  16,
-	  //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
-	  BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-	  BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
-	  BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-	  BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-	  BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-	  BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
-	  BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-	  BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-
-	  BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
-	  BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
-	  BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
-	  BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
-	  BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
-	  BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
-	  BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
-	  BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
-#if 0
-	  BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
-	  BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
-	  BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
-	  BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
-	  BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
-	  BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
-	  BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
-	  BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
-#endif
-
-#if 0
-	  // NOTE: This set of kernels is likely broken and therefore disabled.
-	  BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
-	  BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
-	  BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
-	  BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
-	  BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
-	  BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
-
-	  BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
-	  BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
-	  BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
-	  BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
-	  BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
-	  BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
-#endif
-	  cntx
-	);
-
-	// Initialize level-3 sup blocksize objects with architecture-specific
-	// values.
-	//                                           s      d      c      z
-	bli_blksz_init     ( &blkszs[ BLIS_MR ],     6,     6,    -1,    -1,
-	                                             9,     9,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   168,    72,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   256,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,  4080,    -1,    -1 );
-
-	// Update the context with the current architecture's register and cache
-	// blocksizes for small/unpacked level-3 problems.
-	bli_cntx_set_l3_sup_blkszs
-	(
-	  5,
-	  BLIS_NC, &blkszs[ BLIS_NC ],
-	  BLIS_KC, &blkszs[ BLIS_KC ],
-	  BLIS_MC, &blkszs[ BLIS_MC ],
-	  BLIS_NR, &blkszs[ BLIS_NR ],
-	  BLIS_MR, &blkszs[ BLIS_MR ],
-	  cntx
+	  BLIS_VA_END
 	);
+#endif
 }
 
diff --git a/config/zen2/bli_kernel_defs_zen2.h b/config/zen2/bli_kernel_defs_zen2.h
new file mode 100644
index 000000000..c5bc8d63f
--- /dev/null
+++ b/config/zen2/bli_kernel_defs_zen2.h
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   6
+#define BLIS_MR_d   6
+#define BLIS_MR_c   3
+#define BLIS_MR_z   3
+
+#define BLIS_NR_s   16
+#define BLIS_NR_d   8
+#define BLIS_NR_c   8
+#define BLIS_NR_z   4
+
+//#endif
+
diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c
index 0336ddc95..3ee385ed6 100644
--- a/config/zen3/bli_cntx_init_zen3.c
+++ b/config/zen3/bli_cntx_init_zen3.c
@@ -37,83 +37,106 @@
 void bli_cntx_init_zen3( cntx_t* cntx )
 {
 	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
-	blksz_t thresh[ BLIS_NUM_THRESH ];
 
 	// Set default kernel blocksizes and functions.
 	bli_cntx_init_zen3_ref( cntx );
 
 	// -------------------------------------------------------------------------
 
-	// Update the context with optimized native gemm micro-kernels and
-	// their storage preferences.
-	bli_cntx_set_l3_nat_ukrs
+	// Update the context with optimized kernels (l3 ukrs, sup, packm, l1f, l1v).
+	bli_cntx_set_ukrs
 	(
-	  8,
+	  cntx,
 
 	  // gemm
-	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,       TRUE,
-	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,        TRUE,
-	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,        TRUE,
-	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,        TRUE,
+	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,
+	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,
+	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,
+	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,
 
 	  // gemmtrsm_l
-	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_haswell_asm_6x8,  TRUE,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_haswell_asm_6x8,
 
 	  // gemmtrsm_u
-	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,  TRUE,
-
-	  cntx
-	);
+	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,
 
+	  // gemmsup
 #if 0
-	// AMD: This will be enabled in other PRs.
-	// packm kernels
-	bli_cntx_set_packm_kers
-	(
-	  2,
-	  BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen,
-	  BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen,
-	  cntx
-	);
+	  // AMD: This should be enabled in the PR which has added these kernels
+	  // Update the context with optimized small/unpacked gemm kernels.
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
+	  BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
+	  BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
 #else
-	// Update the context with optimized packm kernels.
-	bli_cntx_set_packm_kers
-	(
-	  8,
-	  BLIS_PACKM_6XK_KER,  BLIS_FLOAT,    bli_spackm_haswell_asm_6xk,
-	  BLIS_PACKM_16XK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_16xk,
-	  BLIS_PACKM_6XK_KER,  BLIS_DOUBLE,   bli_dpackm_haswell_asm_6xk,
-	  BLIS_PACKM_8XK_KER,  BLIS_DOUBLE,   bli_dpackm_haswell_asm_8xk,
-	  BLIS_PACKM_3XK_KER,  BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
-	  BLIS_PACKM_8XK_KER,  BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
-	  BLIS_PACKM_3XK_KER,  BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
-	  BLIS_PACKM_4XK_KER,  BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
-	  cntx
-	);
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
+	  BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
+
+	  BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
+	  BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m,
+	  BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
+	  BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
+	  BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
+	  BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n,
+	  BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
+	  BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
 #endif
 
-	// Update the context with optimized level-1f kernels.
-	bli_cntx_set_l1f_kers
-	(
-	  4,
+	  // packm
+#if 0
+	  // AMD: This will be enabled in other PRs.
+	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen,
+	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen,
+#else
+	  BLIS_PACKM_MRXK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_6xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_16xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_6xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_8xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
+#endif
 
 	  // axpyf
-	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_5,
-	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_5,
+	  BLIS_AXPYF_KER,  BLIS_FLOAT,  bli_saxpyf_zen_int_5,
+	  BLIS_AXPYF_KER,  BLIS_DOUBLE, bli_daxpyf_zen_int_5,
 
 	  // dotxf
-	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
-	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
-
-	  cntx
-	);
-
-	// Update the context with optimized level-1v kernels.
-	bli_cntx_set_l1v_kers
-	(
-	  16,
+	  BLIS_DOTXF_KER,  BLIS_FLOAT,  bli_sdotxf_zen_int_8,
+	  BLIS_DOTXF_KER,  BLIS_DOUBLE, bli_ddotxf_zen_int_8,
 
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
@@ -135,19 +158,75 @@ void bli_cntx_init_zen3( cntx_t* cntx )
 	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
 	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
 
-	  //swap
-	  BLIS_SWAPV_KER, BLIS_FLOAT,   bli_sswapv_zen_int8,
-	  BLIS_SWAPV_KER, BLIS_DOUBLE,  bli_dswapv_zen_int8,
+	  // swapv
+	  BLIS_SWAPV_KER,  BLIS_FLOAT,  bli_sswapv_zen_int8,
+	  BLIS_SWAPV_KER,  BLIS_DOUBLE, bli_dswapv_zen_int8,
 
-	  //copy
+	  // copyv
 	  BLIS_COPYV_KER,  BLIS_FLOAT,  bli_scopyv_zen_int,
 	  BLIS_COPYV_KER,  BLIS_DOUBLE, bli_dcopyv_zen_int,
 
-	  //set
+	  // setv
 	  BLIS_SETV_KER,  BLIS_FLOAT,  bli_ssetv_zen_int,
 	  BLIS_SETV_KER,  BLIS_DOUBLE, bli_dsetv_zen_int,
 
-	  cntx
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // gemm
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_FLOAT,    TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_DOUBLE,   TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,       BLIS_DCOMPLEX, TRUE,
+
+	  // gemmtrsm_l
+	  BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT,    TRUE,
+	  BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE,   TRUE,
+
+	  // gemmtrsm_u
+	  BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT,    TRUE,
+	  BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE,   TRUE,
+
+	  // gemmsup
+	  BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+	  BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+#if 0
+	  // AMD: This should be enabled in the PR which has added these kernels
+	  // Row-storage preferences for the complex (c/z) small/unpacked gemm kernels.
+	  BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
+	  BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
+#endif
+
+	  BLIS_VA_END
 	);
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
@@ -164,138 +243,67 @@ void bli_cntx_init_zen3( cntx_t* cntx )
 	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     5,     5,    -1,    -1 );
 	bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,    -1,    -1 );
 
+	// Initialize sup thresholds with architecture-appropriate values.
+	//                                          s     d     c     z
+	bli_blksz_init_easy( &blkszs[ BLIS_MT ],  512,  256,   -1,   -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NT ],  200,  256,   -1,   -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KT ],  240,  220,   -1,   -1 );
+
+	// Initialize level-3 sup blocksize objects with architecture-specific
+	// values.
+	//                                               s      d      c      z
+	bli_blksz_init     ( &blkszs[ BLIS_MR_SUP ],     6,     6,     3,     3,
+	                                                 9,     9,     3,     3 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ],    16,     8,     8,     4 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ],   144,    72,    72,    36 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ],   512,   256,   128,    64 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ],  8160,  4080,  2040,  1020 );
+
 	// Update the context with the current architecture's register and cache
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 7,
+	  cntx,
+
 	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+
 	  // level-1f
 	  BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
 	  BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
-	  cntx
-	);
 
-// -------------------------------------------------------------------------
+	  // sup thresholds
+	  BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT,
+	  BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT,
+	  BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT,
 
-	// Initialize sup thresholds with architecture-appropriate values.
-	//                                          s     d     c     z
-	bli_blksz_init_easy( &thresh[ BLIS_MT ],  512,  256,   -1,   -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_NT ],  200,  256,   -1,   -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_KT ],  240,  220,   -1,   -1 );
+	  // gemmsup
+	  BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP,
+	  BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP,
+	  BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP,
+	  BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP,
+	  BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP,
 
-	// Initialize the context with the sup thresholds.
-	bli_cntx_set_l3_sup_thresh
-	(
-	  3,
-	  BLIS_MT, &thresh[ BLIS_MT ],
-	  BLIS_NT, &thresh[ BLIS_NT ],
-	  BLIS_KT, &thresh[ BLIS_KT ],
-	  cntx
+	  BLIS_VA_END
 	);
 
+	// -------------------------------------------------------------------------
+
 #if 0
 	// Initialize the context with the sup handlers.
 	bli_cntx_set_l3_sup_handlers
 	(
-	  2,
+	  cntx,
+	  
 	  BLIS_GEMM, bli_gemmsup_ref,
-	  BLIS_GEMMT, bli_gemmtsup_ref,
-	  cntx
-	);
-#endif
+	  //BLIS_GEMMT, bli_gemmtsup_ref,
 
-#if 0
-	// AMD: This should be enabled in the PR which has added these kernels
-	// Update the context with optimized small/unpacked gemm kernels.
-	bli_cntx_set_l3_sup_kers
-	(
-	  28,
-	  //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
-	  BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-	  BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
-	  BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-	  BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-	  BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-	  BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
-	  BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-	  BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-	  BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
-	  BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
-	  BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
-	  BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
-	  BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
-	  BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
-	  BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
-	  BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
-	  BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
-	  BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
-	  BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
-	  BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
-	  BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
-	  BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
-	  BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
-	  BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
-	  BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
-	  BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
-	  BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
-	  BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
-	  cntx
+	  BLIS_VA_END
 	);
-#else
-	// Update the context with optimized small/unpacked gemm kernels.
-	bli_cntx_set_l3_sup_kers
-	(
-	  16,
-	  //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
-	  BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-	  BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
-	  BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-	  BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-	  BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-	  BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
-	  BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-	  BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-
-	  BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
-	  BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
-	  BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
-	  BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
-	  BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
-	  BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
-	  BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
-	  BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
-	  cntx
-	);
-	
 #endif
-	
-	// Initialize level-3 sup blocksize objects with architecture-specific
-	// values.
-	//                                           s      d      c      z
-	bli_blksz_init     ( &blkszs[ BLIS_MR ],     6,     6,     3,     3,
-	                                             9,     9,     3,     3 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,     8,     4 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    72,    72,    36 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   512,   256,   128,    64 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  8160,  4080,  2040,  1020 );
-
-	// Update the context with the current architecture's register and cache
-	// blocksizes for small/unpacked level-3 problems.
-	bli_cntx_set_l3_sup_blkszs
-	(
-	  5,
-	  BLIS_NC, &blkszs[ BLIS_NC ],
-	  BLIS_KC, &blkszs[ BLIS_KC ],
-	  BLIS_MC, &blkszs[ BLIS_MC ],
-	  BLIS_NR, &blkszs[ BLIS_NR ],
-	  BLIS_MR, &blkszs[ BLIS_MR ],
-	  cntx
-	);
 }
 
diff --git a/config/zen3/bli_kernel_defs_zen3.h b/config/zen3/bli_kernel_defs_zen3.h
new file mode 100644
index 000000000..c5bc8d63f
--- /dev/null
+++ b/config/zen3/bli_kernel_defs_zen3.h
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   6
+#define BLIS_MR_d   6
+#define BLIS_MR_c   3
+#define BLIS_MR_z   3
+
+#define BLIS_NR_s   16
+#define BLIS_NR_d   8
+#define BLIS_NR_c   8
+#define BLIS_NR_z   4
+
+//#endif
+
diff --git a/docs/ConfigurationHowTo.md b/docs/ConfigurationHowTo.md
index dcec7754c..cc1224182 100644
--- a/docs/ConfigurationHowTo.md
+++ b/docs/ConfigurationHowTo.md
@@ -47,7 +47,7 @@ $ ls config/haswell
 bli_cntx_init_haswell.c  bli_family_haswell.h  make_defs.mk
 ```
 A sub-configuration (`haswell`, in this case) usually contains just three files:
-  * `bli_cntx_init_haswell.c`. This file contains the initialization function for a context targeting the hardware in question, in this case, Intel Haswell. A context, or `cntx_t` object, in BLIS encapsulates all of the hardware-specific information--including kernel function pointers and cache and register blocksizes--necessary to support all of the main computational operations in BLIS. The initialization function inside this file should be named the same as the filename (excluding `.c` suffix), which should begin with prefix `bli_cntx_init_` and end with the (lowercase) name of the sub-configuration. The context initialization function (in this case, `bli_cntx_init_haswell()`) is used internally by BLIS when setting up the global kernel structure--a mechanism for managing and supporting multiple microarchitectures simultaneously, so that the choice of which context to use can be deferred until the computation is ready to execute. 
+  * `bli_cntx_init_haswell.c`. This file contains the initialization function for a context targeting the hardware in question, in this case, Intel Haswell. A context, or `cntx_t` object, in BLIS encapsulates all of the hardware-specific information--including kernel function pointers and cache and register blocksizes--necessary to support all of the main computational operations in BLIS. The initialization function inside this file should be named the same as the filename (excluding `.c` suffix), which should begin with prefix `bli_cntx_init_` and end with the (lowercase) name of the sub-configuration. The context initialization function (in this case, `bli_cntx_init_haswell()`) is used internally by BLIS when setting up the global kernel structure--a mechanism for managing and supporting multiple microarchitectures simultaneously, so that the choice of which context to use can be deferred until the computation is ready to execute.
   * `bli_family_haswell.h`. This header file is `#included` when the configuration in question, in this case `haswell`, was the target to `./configure`. This is where you would specify certain global parameters and settings. For example, if you wanted to specify custom implementations of `malloc()` and `free()`, this is where you would specify them. The file is oftentimes empty. (In the case of configuration families, the definitions in this file apply to the _entire_ build, and not any specific sub-configuration, but for consistency we support them for all configuration targets, whether they be singleton sub-configurations or configuration families.)
   * `make_defs.mk`. This makefile fragment defines the compiler and compiler flags to use during compilation. Specifically, the values defined in this file are used whenever compiling source code specific to the sub-configuration (i.e., reference kernels and optimized kernels). If the sub-configuration is the target of `configure`, then these flags are also used to compile general framework code.
 
@@ -127,7 +127,7 @@ void bli_cntx_init_fooarch( cntx_t* cntx )
     // blocksizes (and multiples) for native execution.
     bli_cntx_set_blkszs
     (
-      BLIS_NAT, 5,
+      5,
       BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
       BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
       BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
@@ -143,7 +143,7 @@ _**Blocksize object array.**_ The `blkszs` array declaration is needed later in
 
 _**Reference initialization.**_ The first function call, `bli_cntx_init_fooarch_ref()`, initializes the context `cntx` with function pointers to reference implementations of all of the kernels supported by BLIS (as well as cache and register blocksizes, and other fields). This function is automatically generated by BLIS for every sub-configuration enabled at configure-time. The function prototype is generated by a preprocessor macro in `frame/include/bli_arch_config.h`.
 
-_**Level-3 microkernels.**_ The second function call is to a variable argument function, `bli_cntx_set_l3_nat_ukrs()`, which updates `cntx` with five optimized double-precision complex level-3 microkernels. The first argument encodes the number of individual kernels being registered into the context. Every subsequent line, except for the last line, is associated with the registration of a single kernel, and each of these lines is independent of one another and can occur in any order, provided that the kernel parameters of each line occur in the same order--kernel ID, followed by datatype, followed by function name, followed by storage preference boolean (i.e., whether the microkernel prefers row storage). The last argument of the function call is the address of the context being updated, `cntx`. Notice that we are registering microkernels written for another type of hardware, `bararch`, because in our hypothetical universe `bararch` is very similar to `fooarch` and so we recycle the code between the two configurations. After the function returns, the context contains pointers to optimized double-precision level-3 real microkernels. Note that the context will still contain reference microkernels for single-precision real and complex, and double-precision complex computation, as those kernels were not updated. 
+_**Level-3 microkernels.**_ The second function call is to a variable argument function, `bli_cntx_set_l3_nat_ukrs()`, which updates `cntx` with five optimized double-precision real level-3 microkernels. The first argument encodes the number of individual kernels being registered into the context. Every subsequent line, except for the last line, is associated with the registration of a single kernel, and each of these lines is independent of one another and can occur in any order, provided that the kernel parameters of each line occur in the same order--kernel ID, followed by datatype, followed by function name, followed by storage preference boolean (i.e., whether the microkernel prefers row storage). The last argument of the function call is the address of the context being updated, `cntx`. Notice that we are registering microkernels written for another type of hardware, `bararch`, because in our hypothetical universe `bararch` is very similar to `fooarch` and so we recycle the code between the two configurations. After the function returns, the context contains pointers to optimized double-precision level-3 real microkernels. Note that the context will still contain reference microkernels for single-precision real and complex, and double-precision complex computation, as those kernels were not updated.
 
 _Note:_ Currently, BLIS only allows the kernel developer to signal a preference (row or column) for `gemm` microkernels. The preference of the `gemmtrsm` and `trsm` microkernels can (and must) be set, but are ignored by the framework during execution.
 
@@ -236,7 +236,7 @@ _**Memory alignment.**_ BLIS implements memory alignment internally, rather than
 ```
 The value `BLIS_STACK_BUF_ALIGN_SIZE` defines the alignment of stack memory used as temporary internal buffers, such as for output matrices to the microkernel when computing edge cases. (See [implementation notes](KernelsHowTo#implementation-notes-for-gemm) for the `gemm` microkernel for details.) This value defaults to `BLIS_SIMD_ALIGN_SIZE`, which defaults to `BLIS_SIMD_MAX_SIZE`.
 
-The value `BLIS_HEAP_ADDR_ALIGN_SIZE` defines the alignment used when allocating memory via the `malloc()` function defined by `BLIS_MALLOC_USER`. Setting this value to `BLIS_SIMD_ALIGN_SIZE` may speed up certain level-1v and -1f kernels. 
+The value `BLIS_HEAP_ADDR_ALIGN_SIZE` defines the alignment used when allocating memory via the `malloc()` function defined by `BLIS_MALLOC_USER`. Setting this value to `BLIS_SIMD_ALIGN_SIZE` may speed up certain level-1v and -1f kernels.
 
 The value `BLIS_HEAP_STRIDE_ALIGN_SIZE` defines the alignment used for so-called "leading dimensions" (i.e. column strides for column-stored matrices, and row strides for row-stored matrices) when creating BLIS matrices via the object-based API (e.g. `bli_obj_create()`). While setting `BLIS_HEAP_ADDR_ALIGN_SIZE` guarantees alignment for the first column (or row), creating a matrix with certain dimension values (_m_ and _n_) may cause subsequent columns (or rows) to be misaligned. Setting this value to `BLIS_SIMD_ALIGN_SIZE` is usually desirable. Additional alignment may or may not be beneficial.
 
@@ -246,7 +246,7 @@ The value `BLIS_POOL_ADDR_ALIGN_SIZE_*` define the alignments used when allocati
 
 ### make_defs.mk
 
-The `make_defs.mk` file primarily contains compiler and compiler flag definitions used by `make` when building a BLIS library. 
+The `make_defs.mk` file primarily contains compiler and compiler flag definitions used by `make` when building a BLIS library.
 
 The format of the file is mostly self-explanatory. However, we will expound on the contents here, using the `make_defs.mk` file for the `haswell` configuration as an example:
 ```make
@@ -304,7 +304,7 @@ _**Debugging flags.**_ The `CDBGFLAGS` variable should be assigned to contain fl
 
 _**Optimization flags.**_ The `COPTFLAGS` variable should be assigned any flags relating to general compiler optimization. Usually this takes the form of `-O2` or `-O3`, but more specific optimization flags may be included as well, such as `-fomit-frame-pointer`. Note that, as with `CDBGFLAGS`, `COPTFLAGS` is conditionally assigned based on the value of `$(DEBUG_TYPE)`. A separate `CKOPTFLAGS` variable tracks optimizations flags used when compiling kernels. For most configurations, `CKOPTFLAGS` is assigned as a copy of `COPTFLAGS`, but if the kernel developer needs different optimization flags to be applied when compiling kernel source code, `CKOPTFLAGS` should be set accordingly.
 
-_**Vectorization flags.**_ The second-to-last block sets the `CVECFLAGS`, which should be assigned any flags that must be given to the compiler in order to enable use of a vector instruction set needed or assumed by the kernel source code. Also, if you wish to enable automatic use of certain instruction sets (e.g. `-mfpmath=sse` for many Intel architectures), this is where you should set those flags. These flags often differ among compiler families, especially between `icc` and `gcc`/`clang`. 
+_**Vectorization flags.**_ The second-to-last block sets the `CVECFLAGS`, which should be assigned any flags that must be given to the compiler in order to enable use of a vector instruction set needed or assumed by the kernel source code. Also, if you wish to enable automatic use of certain instruction sets (e.g. `-mfpmath=sse` for many Intel architectures), this is where you should set those flags. These flags often differ among compiler families, especially between `icc` and `gcc`/`clang`.
 
 _**Variable storage/renaming.**_ Finally, the last statement commits the variables defined in the file to "storage". That is, they are copied to variable names that contain `THIS_CONFIG` as a suffix. This allows the variables for one configuration to co-exist with variables of another configuration.
 
@@ -406,7 +406,7 @@ Some sub-configurations, for various reasons, do not rely on their own set of ke
 excavator:   excavator/piledriver
 steamroller: steamroller/piledriver
 ```
-Here, the first line (reading from left-to-right) defines the `excavator` singleton family as containing only itself, the `excavator` sub-configuration, and also specifies that this sub-configuration must have access to the `piledriver` kernel set. The second line defines the `steamroller` singleton family in a similar manner. 
+Here, the first line (reading from left-to-right) defines the `excavator` singleton family as containing only itself, the `excavator` sub-configuration, and also specifies that this sub-configuration must have access to the `piledriver` kernel set. The second line defines the `steamroller` singleton family in a similar manner.
 
 **Note:** Specifying non-native kernel sets via the `/` character is only allowed when defining singleton configuration families. They may NOT appear in the definitions of umbrella families! When an umbrella family includes a singleton family that is defined to require non-native kernels, this will be accounted for during the parsing of the `config_registry` file.
 
@@ -467,7 +467,7 @@ configure:   skx: skx
 configure:   steamroller: steamroller
 configure:   x86_64: haswell sandybridge penryn zen excavator steamroller piledriver bulldozer generic
 ```
-This simply lists the sub-configurations associated with each defined configuration family (singleton or umbrella). Note that they are sorted alphabetically. 
+This simply lists the sub-configurations associated with each defined configuration family (singleton or umbrella). Note that they are sorted alphabetically.
 
 Next, the kernel list (actually, all kernel lists) is printed:
 ```
@@ -549,7 +549,7 @@ Adding support for a new set of kernels in BLIS is easy and can be done via the
 
 2. _**Add support within the framework source code.**_ We also need to make a minor update to the framework to support the new kernels--specifically, to pull in the kernels' function prototypes.
 
-   **`frame/include/bli_arch_config.h`**. When adding support for the `knl` kernel set to the framework, we must modify this file to `#include` the `bli_kernels_knl.h` header file: 
+   **`frame/include/bli_arch_config.h`**. When adding support for the `knl` kernel set to the framework, we must modify this file to `#include` the `bli_kernels_knl.h` header file:
    ```c
    #ifdef BLIS_KERNELS_KNL
    #include "bli_kernels_knl.h"
@@ -560,7 +560,7 @@ Adding support for a new set of kernels in BLIS is easy and can be done via the
 
 ## Adding a new configuration family
 
-Adding support for a new umbrella configuration family in BLIS is fairly straightforward and can be done via the following steps. The hypothetical examples used in these steps assume you are trying to create a new configuration family `intelavx` that supports only Intel microarchitectures that support the Intel AVX instruction set. 
+Adding support for a new umbrella configuration family in BLIS is fairly straightforward and can be done via the following steps. The hypothetical examples used in these steps assume you are trying to create a new configuration family `intelavx` that supports only Intel microarchitectures that support the Intel AVX instruction set.
 
 
@@ -636,7 +636,7 @@ Adding support for a new-subconfiguration to BLIS is similar to adding support f
    ```
    THIS_CONFIG    := knl
    ```
-   and while we're editing the file, we can make any other changes to compiler flags we wish (if any). Similarly, the `bli_family_knl.h` header file should be updated as needed. Since the number of vector registers and the vector register size on `knl` differ from the defaults, we must explicitly set them. (The role of these parameters was explained in a [previous section](ConfigurationHowTo.md#bli_family_h).) Furthermore, provided that a macro `BLIS_NO_HBWMALLOC` is not set, we use a different implementation of `malloc()` and `free()` and `#include` that implementation's header file. 
+   and while we're editing the file, we can make any other changes to compiler flags we wish (if any). Similarly, the `bli_family_knl.h` header file should be updated as needed. Since the number of vector registers and the vector register size on `knl` differ from the defaults, we must explicitly set them. (The role of these parameters was explained in a [previous section](ConfigurationHowTo.md#bli_family_h).) Furthermore, provided that a macro `BLIS_NO_HBWMALLOC` is not set, we use a different implementation of `malloc()` and `free()` and `#include` that implementation's header file.
    ```c
    #define BLIS_SIMD_MAX_NUM_REGISTERS  32
    #define BLIS_SIMD_MAX_SIZE           64
@@ -714,7 +714,7 @@ Adding support for a new-subconfiguration to BLIS is similar to adding support f
       #include "bli_family_knl.h"
       #endif
       ```
-      As before with umbrella families, the `BLIS_FAMILY_KNL` macro is automatically defined by the build system for whatever family was targeted by `configure`. (That is, if the user runs `./configure foobar`, the C preprocessor macro `BLIS_FAMILY_FOOBAR` will be defined.) 
+      As before with umbrella families, the `BLIS_FAMILY_KNL` macro is automatically defined by the build system for whatever family was targeted by `configure`. (That is, if the user runs `./configure foobar`, the C preprocessor macro `BLIS_FAMILY_FOOBAR` will be defined.)
 
 
diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c
index 5fdfdb91e..1d12b42eb 100644
--- a/frame/1/bli_l1v_tapi.c
+++ b/frame/1/bli_l1v_tapi.c
@@ -61,15 +61,15 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
-	   conjx, \
-	   n, \
-	   x, incx, \
-	   y, incy, \
-	   cntx  \
+	  conjx, \
+	  n, \
+	  x, incx, \
+	  y, incy, \
+	  cntx  \
 	); \
 }
 
@@ -98,14 +98,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
-	   n, \
-	   x, incx, \
-	   index, \
-	   cntx  \
+	  n, \
+	  x, incx, \
+	  index, \
+	  cntx  \
 	); \
 }
 
@@ -135,17 +135,17 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
-	   conjx, \
-	   n, \
-	   alpha, \
-	   x, incx, \
-	   beta, \
-	   y, incy, \
-	   cntx  \
+	  conjx, \
+	  n, \
+	  alpha, \
+	  x, incx, \
+	  beta, \
+	  y, incy, \
+	  cntx  \
 	); \
 }
 
@@ -175,16 +175,16 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	if ( cntx == NULL ) \
 		cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
-	   conjx, \
-	   n, \
-	   alpha, \
-	   x, incx, \
-	   y, incy, \
-	   cntx  \
+	  conjx, \
+	  n, \
+	  alpha, \
+	  x, incx, \
+	  y, incy, \
+	  cntx  \
 	); \
 }
 
@@ -215,17 +215,17 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
-	   conjx, \
-	   conjy, \
-	   n, \
-	   x, incx, \
-	   y, incy, \
-	   rho, \
-	   cntx  \
+	  conjx, \
+	  conjy, \
+	  n, \
+	  x, incx, \
+	  y, incy, \
+	  rho, \
+	  cntx  \
 	); \
 }
 
@@ -257,19 +257,19 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
-	   conjx, \
-	   conjy, \
-	   n, \
-	   alpha, \
-	   x, incx, \
-	   y, incy, \
-	   beta, \
-	   rho, \
-	   cntx  \
+	  conjx, \
+	  conjy, \
+	  n, \
+	  alpha, \
+	  x, incx, \
+	  y, incy, \
+	  beta, \
+	  rho, \
+	  cntx  \
 	); \
 }
 
@@ -295,13 +295,13 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
-	   n, \
-	   x, incx, \
-	   cntx  \
+	  n, \
+	  x, incx, \
+	  cntx  \
 	); \
 }
 
@@ -329,15 +329,15 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
-	   conjalpha, \
-	   n, \
-	   alpha, \
-	   x, incx, \
-	   cntx  \
+	  conjalpha, \
+	  n, \
+	  alpha, \
+	  x, incx, \
+	  cntx  \
 	); \
 }
 
@@ -365,14 +365,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
-	   n, \
-	   x, incx, \
-	   y, incy, \
-	   cntx  \
+	  n, \
+	  x, incx, \
+	  y, incy, \
+	  cntx  \
 	); \
 }
 
@@ -400,16 +400,16 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
-	   conjx, \
-	   n, \
-	   x, incx, \
-	   beta, \
-	   y, incy, \
-	   cntx  \
+	  conjx, \
+	  n, \
+	  x, incx, \
+	  beta, \
+	  y, incy, \
+	  cntx  \
 	); \
 }
 
diff --git a/frame/1/other/packv/bli_packv_unb_var1.c b/frame/1/other/packv/bli_packv_unb_var1.c
index 23b370949..ca1323b58 100644
--- a/frame/1/other/packv/bli_packv_unb_var1.c
+++ b/frame/1/other/packv/bli_packv_unb_var1.c
@@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \
 { \
 	const num_t dt  = PASTEMAC(ch,type); \
 \
-	PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \
+	PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \
 \
 	copyv_p \
 	( \
diff --git a/frame/1/other/unpackv/bli_unpackv_unb_var1.c b/frame/1/other/unpackv/bli_unpackv_unb_var1.c
index 5dc1101b6..43c9a266c 100644
--- a/frame/1/other/unpackv/bli_unpackv_unb_var1.c
+++ b/frame/1/other/unpackv/bli_unpackv_unb_var1.c
@@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \
 { \
 	const num_t dt  = PASTEMAC(ch,type); \
 \
-	PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \
+	PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \
 \
 	copyv_p \
 	( \
diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c
index a8f9e844a..cfaf5150f 100644
--- a/frame/1d/bli_l1d_tapi.c
+++ b/frame/1d/bli_l1d_tapi.c
@@ -85,32 +85,33 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	if ( bli_is_nonunit_diag( diagx ) ) \
 	{ \
-	    x1   = x + offx; \
-	    y1   = y + offy; \
+		x1   = x + offx; \
+		y1   = y + offy; \
 	} \
 	else /* if ( bli_is_unit_diag( diagx ) ) */ \
 	{ \
-	    /* Simulate a unit diagonal for x with a zero increment over a unit
-	       scalar. */ \
-	    x1   = PASTEMAC(ch,1); \
-	    incx = 0; \
-	    y1   = y + offy; \
+		/* Simulate a unit diagonal for x with a zero increment over a unit
+		   scalar. */ \
+		x1   = PASTEMAC(ch,1); \
+		incx = 0; \
+		y1   = y + offy; \
 	} \
 \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Query the context for the operation's kernel address. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the kernel with the appropriate parameters. */ \
-	f( \
-	   conjx, \
-	   n_elem, \
-	   x1, incx, \
-	   y1, incy, \
-	   cntx  \
-	 ); \
+	f \
+	( \
+	  conjx, \
+	  n_elem, \
+	  x1, incx, \
+	  y1, incy, \
+	  cntx  \
+	); \
 }
 
 INSERT_GENTFUNC_BASIC2( addd,  addv,  BLIS_ADDV_KER )
@@ -164,33 +165,34 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	if ( bli_is_nonunit_diag( diagx ) ) \
 	{ \
-	    x1   = x + offx; \
-	    y1   = y + offy; \
+		x1   = x + offx; \
+		y1   = y + offy; \
 	} \
 	else /* if ( bli_is_unit_diag( diagx ) ) */ \
 	{ \
-	    /* Simulate a unit diagonal for x with a zero increment over a unit
-	       scalar. */ \
-	    x1   = PASTEMAC(ch,1); \
-	    incx = 0; \
-	    y1   = y + offy; \
+		/* Simulate a unit diagonal for x with a zero increment over a unit
+		   scalar. */ \
+		x1   = PASTEMAC(ch,1); \
+		incx = 0; \
+		y1   = y + offy; \
 	} \
 \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Query the context for the operation's kernel address. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the kernel with the appropriate parameters. */ \
-	f( \
-	   conjx, \
-	   n_elem, \
-	   alpha, \
-	   x1, incx, \
-	   y1, incy, \
-	   cntx  \
-	 ); \
+	f \
+	( \
+	  conjx, \
+	  n_elem, \
+	  alpha, \
+	  x1, incx, \
+	  y1, incy, \
+	  cntx  \
+	); \
 }
 
 INSERT_GENTFUNC_BASIC2( axpyd,  axpyv,  BLIS_AXPYV_KER )
@@ -233,20 +235,21 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  &offx, &n_elem, &incx \
 	); \
 \
-    x1 = x + offx; \
+	x1 = x + offx; \
 \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Query the context for the operation's kernel address. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the kernel with the appropriate parameters. */ \
-	f( \
-	   n_elem, \
-	   x1, incx, \
-	   cntx  \
-	 ); \
+	f \
+	( \
+	  n_elem, \
+	  x1, incx, \
+	  cntx  \
+	); \
 }
 
 INSERT_GENTFUNC_BASIC2( invertd, invertv, BLIS_INVERTV_KER )
@@ -290,22 +293,23 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  &offx, &n_elem, &incx \
 	); \
 \
-    x1 = x + offx; \
+	x1 = x + offx; \
 \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Query the context for the operation's kernel address. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the kernel with the appropriate parameters. */ \
-	f( \
-	   conjalpha, \
-	   n_elem, \
-	   alpha, \
-	   x1, incx, \
-	   cntx  \
-	 ); \
+	f \
+	( \
+	  conjalpha, \
+	  n_elem, \
+	  alpha, \
+	  x1, incx, \
+	  cntx  \
+	); \
 }
 
 INSERT_GENTFUNC_BASIC2( scald, scalv, BLIS_SCALV_KER )
@@ -361,27 +365,28 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 		PASTEMAC(ch,setis)( *alpha, *chi11 ); \
 	} */ \
 \
-	/* Acquire the addres of the imaginary component of the first element,
+	/* Acquire the address of the imaginary component of the first element,
 	   and scale the increment for use in the real domain. Note that the
 	   indexing into the imaginary field only needs to work for complex
 	   datatypes since we return early for real domain types. */ \
-    x1   = ( ctype_r* )( x + offx ) + 1; \
+	x1   = ( ctype_r* )( x + offx ) + 1; \
 	incx = 2*incx; \
 \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Query the context for the operation's kernel address. */ \
-	PASTECH2(chr,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt_r, kerid, cntx ); \
+	PASTECH2(chr,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt_r, kerid, cntx ); \
 \
 	/* Invoke the kernel with the appropriate parameters. */ \
-	f( \
-	   BLIS_NO_CONJUGATE, \
-	   n_elem, \
-	   alpha, \
-	   x1, incx, \
-	   cntx  \
-	 ); \
+	f \
+	( \
+	  BLIS_NO_CONJUGATE, \
+	  n_elem, \
+	  alpha, \
+	  x1, incx, \
+	  cntx  \
+	); \
 }
 
 INSERT_GENTFUNCR_BASIC2( setid, setv, BLIS_SETV_KER )
@@ -424,22 +429,23 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  &offx, &n_elem, &incx \
 	); \
 \
-    x1 = x + offx; \
+	x1 = x + offx; \
 \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Query the context for the operation's kernel address. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the kernel with the appropriate parameters. */ \
-	f( \
-	   BLIS_NO_CONJUGATE, \
-	   n_elem, \
-	   alpha, 0, \
-	   x1, incx, \
-	   cntx  \
-	 ); \
+	f \
+	( \
+	  BLIS_NO_CONJUGATE, \
+	  n_elem, \
+	  alpha, 0, \
+	  x1, incx, \
+	  cntx  \
+	); \
 }
 
 INSERT_GENTFUNC_BASIC2( shiftd, addv, BLIS_ADDV_KER )
@@ -491,33 +497,34 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	if ( bli_is_nonunit_diag( diagx ) ) \
 	{ \
-	    x1   = x + offx; \
-	    y1   = y + offy; \
+		x1   = x + offx; \
+		y1   = y + offy; \
 	} \
 	else /* if ( bli_is_unit_diag( diagx ) ) */ \
 	{ \
-	    /* Simulate a unit diagonal for x with a zero increment over a unit
-	       scalar. */ \
-	    x1   = PASTEMAC(ch,1); \
-	    incx = 0; \
-	    y1   = y + offy; \
+		/* Simulate a unit diagonal for x with a zero increment over a unit
+		   scalar. */ \
+		x1   = PASTEMAC(ch,1); \
+		incx = 0; \
+		y1   = y + offy; \
 	} \
 \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Query the context for the operation's kernel address. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the kernel with the appropriate parameters. */ \
-	f( \
-	   conjx, \
-	   n_elem, \
-	   x1, incx, \
-	   beta, \
-	   y1, incy, \
-	   cntx  \
-	 ); \
+	f \
+	( \
+	  conjx, \
+	  n_elem, \
+	  x1, incx, \
+	  beta, \
+	  y1, incy, \
+	  cntx  \
+	); \
 }
 
 INSERT_GENTFUNC_BASIC2( xpbyd,  xpbyv,  BLIS_XPBYV_KER )
diff --git a/frame/1f/bli_l1f_tapi.c b/frame/1f/bli_l1f_tapi.c
index 332ff5af2..a54379299 100644
--- a/frame/1f/bli_l1f_tapi.c
+++ b/frame/1f/bli_l1f_tapi.c
@@ -65,19 +65,19 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
-	   conjx, \
-	   conjy, \
-	   n, \
-	   alphax, \
-	   alphay, \
-	   x, incx, \
-	   y, incy, \
-	   z, incz, \
-	   cntx  \
+	  conjx, \
+	  conjy, \
+	  n, \
+	  alphax, \
+	  alphay, \
+	  x, incx, \
+	  y, incy, \
+	  z, incz, \
+	  cntx  \
 	); \
 }
 
@@ -109,19 +109,19 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
-	   conja, \
-	   conjx, \
-	   m, \
-	   b_n, \
-	   alpha, \
-	   a, inca, lda, \
-	   x, incx, \
-	   y, incy, \
-	   cntx  \
+	  conja, \
+	  conjx, \
+	  m, \
+	  b_n, \
+	  alpha, \
+	  a, inca, lda, \
+	  x, incx, \
+	  y, incy, \
+	  cntx  \
 	); \
 }
 
@@ -154,20 +154,20 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
-	   conjxt, \
-	   conjx, \
-	   conjy, \
-	   n, \
-	   alpha, \
-	   x, incx, \
-	   y, incy, \
-	   rho, \
-	   z, incz, \
-	   cntx  \
+	  conjxt, \
+	  conjx, \
+	  conjy, \
+	  n, \
+	  alpha, \
+	  x, incx, \
+	  y, incy, \
+	  rho, \
+	  z, incz, \
+	  cntx  \
 	); \
 }
 
@@ -204,24 +204,24 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
-	   conjat, \
-	   conja, \
-	   conjw, \
-	   conjx, \
-	   m, \
-	   b_n, \
-	   alpha, \
-	   a, inca, lda, \
-	   w, incw, \
-	   x, incx, \
-	   beta, \
-	   y, incy, \
-	   z, incz, \
-	   cntx  \
+	  conjat, \
+	  conja, \
+	  conjw, \
+	  conjx, \
+	  m, \
+	  b_n, \
+	  alpha, \
+	  a, inca, lda, \
+	  w, incw, \
+	  x, incx, \
+	  beta, \
+	  y, incy, \
+	  z, incz, \
+	  cntx  \
 	); \
 }
 
@@ -254,20 +254,20 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
-	   conjat, \
-	   conjx, \
-	   m, \
-	   b_n, \
-	   alpha, \
-	   a, inca, lda, \
-	   x, incx, \
-	   beta, \
-	   y, incy, \
-	   cntx  \
+	  conjat, \
+	  conjx, \
+	  m, \
+	  b_n, \
+	  alpha, \
+	  a, inca, lda, \
+	  x, incx, \
+	  beta, \
+	  y, incy, \
+	  cntx  \
 	); \
 }
 
diff --git a/frame/1m/bli_l1m_ft_ker.h b/frame/1m/bli_l1m_ft_ker.h
index 2e813cf4a..41d80e217 100644
--- a/frame/1m/bli_l1m_ft_ker.h
+++ b/frame/1m/bli_l1m_ft_ker.h
@@ -102,35 +102,40 @@ INSERT_GENTDEF( packm_cxk )
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       conj_t           conjp, \
+       conj_t           conja, \
+       pack_t           schema, \
+       dim_t            cdim, \
        dim_t            n, \
        ctype*  restrict kappa, \
        ctype*  restrict p,             inc_t ldp, \
        ctype*  restrict a, inc_t inca, inc_t lda, \
-       cntx_t* restrict cntx  \
+       cntx_t* restrict cntx \
      );
 
 INSERT_GENTDEF( unpackm_cxk )
 
-// packm_1er_ker
+// packm_diag_ker
 
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
+       struc_t          struca, \
+       diag_t           diaga, \
+       uplo_t           uploa, \
        conj_t           conja, \
        pack_t           schema, \
+       bool             invdiag, \
        dim_t            cdim, \
-       dim_t            n, \
        dim_t            n_max, \
        ctype*  restrict kappa, \
        ctype*  restrict a, inc_t inca, inc_t lda, \
        ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx  \
+       cntx_t* restrict cntx \
      );
 
-INSERT_GENTDEF( packm_cxk_1er )
+INSERT_GENTDEF( packm_cxc_diag )
 
 
 #endif
diff --git a/frame/1m/bli_l1m_ker.h b/frame/1m/bli_l1m_ker.h
index 76d51af2b..970c5f040 100644
--- a/frame/1m/bli_l1m_ker.h
+++ b/frame/1m/bli_l1m_ker.h
@@ -47,16 +47,8 @@
 #undef  GENTPROT
 #define GENTPROT PACKM_KER_PROT
 
-INSERT_GENTPROT_BASIC0( packm_2xk_ker_name )
-INSERT_GENTPROT_BASIC0( packm_3xk_ker_name )
-INSERT_GENTPROT_BASIC0( packm_4xk_ker_name )
-INSERT_GENTPROT_BASIC0( packm_6xk_ker_name )
-INSERT_GENTPROT_BASIC0( packm_8xk_ker_name )
-INSERT_GENTPROT_BASIC0( packm_10xk_ker_name )
-INSERT_GENTPROT_BASIC0( packm_12xk_ker_name )
-INSERT_GENTPROT_BASIC0( packm_14xk_ker_name )
-INSERT_GENTPROT_BASIC0( packm_16xk_ker_name )
-INSERT_GENTPROT_BASIC0( packm_24xk_ker_name )
+INSERT_GENTPROT_BASIC0( packm_mrxk_ker_name )
+INSERT_GENTPROT_BASIC0( packm_nrxk_ker_name )
 
 
 // native unpackm kernels
@@ -64,27 +56,33 @@ INSERT_GENTPROT_BASIC0( packm_24xk_ker_name )
 #undef  GENTPROT
 #define GENTPROT UNPACKM_KER_PROT
 
-INSERT_GENTPROT_BASIC0( unpackm_2xk_ker_name )
-INSERT_GENTPROT_BASIC0( unpackm_4xk_ker_name )
-INSERT_GENTPROT_BASIC0( unpackm_6xk_ker_name )
-INSERT_GENTPROT_BASIC0( unpackm_8xk_ker_name )
-INSERT_GENTPROT_BASIC0( unpackm_10xk_ker_name )
-INSERT_GENTPROT_BASIC0( unpackm_12xk_ker_name )
-INSERT_GENTPROT_BASIC0( unpackm_14xk_ker_name )
-INSERT_GENTPROT_BASIC0( unpackm_16xk_ker_name )
+INSERT_GENTPROT_BASIC0( unpackm_mrxk_ker_name )
+INSERT_GENTPROT_BASIC0( unpackm_nrxk_ker_name )
 
 
 // 1e/1r packm kernels
 
 #undef  GENTPROT
-#define GENTPROT PACKM_1ER_KER_PROT
-
-INSERT_GENTPROT_BASIC0( packm_2xk_1er_ker_name )
-INSERT_GENTPROT_BASIC0( packm_4xk_1er_ker_name )
-INSERT_GENTPROT_BASIC0( packm_6xk_1er_ker_name )
-INSERT_GENTPROT_BASIC0( packm_8xk_1er_ker_name )
-INSERT_GENTPROT_BASIC0( packm_10xk_1er_ker_name )
-INSERT_GENTPROT_BASIC0( packm_12xk_1er_ker_name )
-INSERT_GENTPROT_BASIC0( packm_14xk_1er_ker_name )
-INSERT_GENTPROT_BASIC0( packm_16xk_1er_ker_name )
+#define GENTPROT PACKM_KER_PROT
+
+INSERT_GENTPROT_BASIC0( packm_mrxk_1er_ker_name )
+INSERT_GENTPROT_BASIC0( packm_nrxk_1er_ker_name )
+
+
+// packm kernels for diagonal blocks
+
+#undef  GENTPROT
+#define GENTPROT PACKM_DIAG_KER_PROT
+
+INSERT_GENTPROT_BASIC0( packm_mrxmr_diag_ker_name )
+INSERT_GENTPROT_BASIC0( packm_nrxnr_diag_ker_name )
+
+
+// 1e/1r packm kernels for diagonal blocks
+
+#undef  GENTPROT
+#define GENTPROT PACKM_DIAG_KER_PROT
+
+INSERT_GENTPROT_BASIC0( packm_mrxmr_diag_1er_ker_name )
+INSERT_GENTPROT_BASIC0( packm_nrxnr_diag_1er_ker_name )
 
diff --git a/frame/1m/bli_l1m_ker_prot.h b/frame/1m/bli_l1m_ker_prot.h
index 02d329622..80284ea22 100644
--- a/frame/1m/bli_l1m_ker_prot.h
+++ b/frame/1m/bli_l1m_ker_prot.h
@@ -37,7 +37,7 @@
 // Define template prototypes for level-1m kernels.
 //
 
-// native packm kernels
+// packm kernels
 
 #define PACKM_KER_PROT( ctype, ch, varname ) \
 \
@@ -55,35 +55,40 @@ void PASTEMAC(ch,varname) \
      );
 
 
-// native unpackm kernels
+// unpackm kernels
 
 #define UNPACKM_KER_PROT( ctype, ch, varname ) \
 \
 void PASTEMAC(ch,varname) \
      ( \
        conj_t           conja, \
+       pack_t           schema, \
+       dim_t            cdim, \
        dim_t            n, \
        ctype*  restrict kappa, \
        ctype*  restrict p,             inc_t ldp, \
        ctype*  restrict a, inc_t inca, inc_t lda, \
-       cntx_t* restrict cntx  \
+       cntx_t* restrict cntx \
      );
 
 
-// 1e/1r packm kernels
+// packm kernels for diagonal blocks
 
-#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \
+#define PACKM_DIAG_KER_PROT( ctype, ch, varname ) \
 \
 void PASTEMAC(ch,varname) \
      ( \
+       struc_t          struca, \
+       diag_t           diaga, \
+       uplo_t           uploa, \
        conj_t           conja, \
        pack_t           schema, \
+       bool             invdiag, \
        dim_t            cdim, \
-       dim_t            n, \
        dim_t            n_max, \
        ctype*  restrict kappa, \
        ctype*  restrict a, inc_t inca, inc_t lda, \
        ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx  \
+       cntx_t* restrict cntx \
      );
 
diff --git a/frame/1m/bli_l1m_unb_var1.c b/frame/1m/bli_l1m_unb_var1.c
index f2ce3c8d7..c979f082a 100644
--- a/frame/1m/bli_l1m_unb_var1.c
+++ b/frame/1m/bli_l1m_unb_var1.c
@@ -80,7 +80,7 @@ void PASTEMAC(ch,opname) \
 	conjx = bli_extract_conj( transx ); \
 \
 	/* Query the kernel needed for this operation. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Handle dense and upper/lower storage cases separately. */ \
 	if ( bli_is_dense( uplox_eff ) ) \
@@ -197,7 +197,7 @@ void PASTEMAC(ch,opname) \
 	conjx = bli_extract_conj( transx ); \
 \
 	/* Query the kernel needed for this operation. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Handle dense and upper/lower storage cases separately. */ \
 	if ( bli_is_dense( uplox_eff ) ) \
@@ -310,7 +310,7 @@ void PASTEMAC(ch,opname) \
 	if ( bli_is_zeros( uplox_eff ) ) return; \
 \
 	/* Query the kernel needed for this operation. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Handle dense and upper/lower storage cases separately. */ \
 	if ( bli_is_dense( uplox_eff ) ) \
@@ -423,7 +423,7 @@ void PASTEMAC(ch,opname) \
 	conjx = bli_extract_conj( transx ); \
 \
 	/* Query the kernel needed for this operation. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Handle dense and upper/lower storage cases separately. */ \
 	if ( bli_is_dense( uplox_eff ) ) \
diff --git a/frame/1m/packm/bli_packm_cxk.c b/frame/1m/other/bli_packm_cxk.c
similarity index 84%
rename from frame/1m/packm/bli_packm_cxk.c
rename to frame/1m/other/bli_packm_cxk.c
index ea0418cae..612b37f78 100644
--- a/frame/1m/packm/bli_packm_cxk.c
+++ b/frame/1m/other/bli_packm_cxk.c
@@ -54,15 +54,16 @@ void PASTEMAC(ch,opname) \
 	/* Note that we use panel_dim_max, not panel_dim, to query the packm
 	   kernel function pointer. This means that we always use the same
 	   kernel, even for edge cases. */ \
-	num_t     dt     = PASTEMAC(ch,type); \
-	l1mkr_t   ker_id = panel_dim_max; \
+	num_t dt     = PASTEMAC(ch,type); \
+	ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \
+	                                           : BLIS_PACKM_MRXK_KER; \
 \
 	PASTECH2(ch,opname,_ker_ft) f; \
 \
 	/* Query the context for the packm kernel corresponding to the current
 	   panel dimension, or kernel id. If the id is invalid, the function will
 	   return NULL. */ \
-	f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
+	f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
 \
 	/* If there exists a kernel implementation for the micro-panel dimension
 	   provided, we invoke the implementation. Otherwise, we use scal2m. */ \
@@ -91,30 +92,30 @@ void PASTEMAC(ch,opname) \
 		   that happens, the packm kernel must have set the 0's added in
 		   step (3) below.
 
-             packm kernel     packm kernel     packm kernel     packm_tri_cxk
+		     packm kernel     packm kernel     packm kernel     packm_tri_cxk
 		     step 1:          step 2:          step 3:          step 4:
 
-             x x x x . .      x x x x . .      x x x x 0 0      x x x x 0 0
-             ? x x x . .      ? x x x . .      ? x x x 0 0      ? x x x 0 0
-             ? ? x x . .  ->  ? ? x x . .  ->  ? ? x x 0 0  ->  ? ? x x 0 0
-             ? ? ? x . .      ? ? ? x . .      ? ? ? x 0 0      ? ? ? x 0 0
-             . . . . . .      0 0 0 0 0 0      0 0 0 0 0 0      0 0 0 0 1 0
-             . . . . . .      0 0 0 0 0 0      0 0 0 0 0 0      0 0 0 0 0 1
+		     x x x x . .      x x x x . .      x x x x 0 0      x x x x 0 0
+		     ? x x x . .      ? x x x . .      ? x x x 0 0      ? x x x 0 0
+		     ? ? x x . .  ->  ? ? x x . .  ->  ? ? x x 0 0  ->  ? ? x x 0 0
+		     ? ? ? x . .      ? ? ? x . .      ? ? ? x 0 0      ? ? ? x 0 0
+		     . . . . . .      0 0 0 0 0 0      0 0 0 0 0 0      0 0 0 0 1 0
+		     . . . . . .      0 0 0 0 0 0      0 0 0 0 0 0      0 0 0 0 0 1
 
 		     x  Copied from A; valid element.
-             ?  Copied from A, but value is unknown and unused.
+		     ?  Copied from A, but value is unknown and unused.
 		     .  Uninitialized.
-             0  Initialized to zero.
-             1  Initialized to one.
+		     0  Initialized to zero.
+		     1  Initialized to one.
 
 		     NOTE: In step 5 (not shown), bli_packm_tri_cxk() sets the ?'s
 		     to zero. This is not needed to support trsm, but rather to
 		     support trmm. (Both use the same packing format and code.)
 
-           In this case, panel_dim will be 4 because four rows of data are
-           copied from A, panel_len will be 4 because those four rows span
-           four columns of A, and panel_len_max will be 6 because there are a
-           total of 6 columns that can be written to in the packed micropanel,
+		   In this case, panel_dim will be 4 because four rows of data are
+		   copied from A, panel_len will be 4 because those four rows span
+		   four columns of A, and panel_len_max will be 6 because there are a
+		   total of 6 columns that can be written to in the packed micropanel,
 		   2 of which lie beyond the values copied from A. */ \
 		f \
 		( \
diff --git a/frame/1m/packm/bli_packm_cxk.h b/frame/1m/other/bli_packm_cxk.h
similarity index 100%
rename from frame/1m/packm/bli_packm_cxk.h
rename to frame/1m/other/bli_packm_cxk.h
diff --git a/frame/1m/packm/bli_packm_cxk_1er.c b/frame/1m/other/bli_packm_cxk_1er.c
similarity index 94%
rename from frame/1m/packm/bli_packm_cxk_1er.c
rename to frame/1m/other/bli_packm_cxk_1er.c
index e583c8a82..22598dbac 100644
--- a/frame/1m/packm/bli_packm_cxk_1er.c
+++ b/frame/1m/other/bli_packm_cxk_1er.c
@@ -54,15 +54,16 @@ void PASTEMAC(ch,opname) \
 	/* Note that we use panel_dim_max, not panel_dim, to query the packm
 	   kernel function pointer. This means that we always use the same
 	   kernel, even for edge cases. */ \
-	num_t     dt     = PASTEMAC(ch,type); \
-	l1mkr_t   ker_id = panel_dim_max; \
+	num_t dt     = PASTEMAC(ch,type); \
+	ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_1ER_KER \
+	                                           : BLIS_PACKM_MRXK_1ER_KER; \
 \
 	PASTECH2(ch,opname,_ker_ft) f; \
 \
 	/* Query the context for the packm kernel corresponding to the current
 	   panel dimension, or kernel id. If the id is invalid, the function will
 	   return NULL. */ \
-	f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
+	f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
 \
 	/* If there exists a kernel implementation for the micro-panel dimension
 	   provided, we invoke the implementation. Otherwise, we use scal2m. */ \
diff --git a/frame/1m/packm/bli_packm_cxk_1er.h b/frame/1m/other/bli_packm_cxk_1er.h
similarity index 100%
rename from frame/1m/packm/bli_packm_cxk_1er.h
rename to frame/1m/other/bli_packm_cxk_1er.h
diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.c b/frame/1m/other/bli_packm_struc_cxk_1er.c
similarity index 100%
rename from frame/1m/packm/bli_packm_struc_cxk_1er.c
rename to frame/1m/other/bli_packm_struc_cxk_1er.c
diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.h b/frame/1m/other/bli_packm_struc_cxk_1er.h
similarity index 100%
rename from frame/1m/packm/bli_packm_struc_cxk_1er.h
rename to frame/1m/other/bli_packm_struc_cxk_1er.h
diff --git a/frame/1m/unpackm/bli_unpackm_cxk.c b/frame/1m/other/bli_unpackm_cxk.c
similarity index 92%
rename from frame/1m/unpackm/bli_unpackm_cxk.c
rename to frame/1m/other/bli_unpackm_cxk.c
index 4423c41a2..4b7977e86 100644
--- a/frame/1m/unpackm/bli_unpackm_cxk.c
+++ b/frame/1m/other/bli_unpackm_cxk.c
@@ -40,6 +40,7 @@
 void PASTEMAC(ch,opname) \
      ( \
        conj_t  conjp, \
+       pack_t  schema, \
        dim_t   panel_dim, \
        dim_t   panel_len, \
        ctype*  kappa, \
@@ -48,15 +49,16 @@ void PASTEMAC(ch,opname) \
        cntx_t* cntx  \
      ) \
 { \
-	num_t     dt     = PASTEMAC(ch,type); \
-	l1mkr_t   ker_id = panel_dim; \
+	num_t dt     = PASTEMAC(ch,type); \
+	ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER \
+	                                           : BLIS_UNPACKM_MRXK_KER; \
 \
 	PASTECH2(ch,opname,_ker_ft) f; \
 \
 	/* Query the context for the unpackm kernel corresponding to the current
 	   panel dimension, or kernel id. If the id is invalid, the function will
 	   return NULL. */ \
-	f = bli_cntx_get_unpackm_ker_dt( dt, ker_id, cntx ); \
+	f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
 \
 	/* If there exists a kernel implementation for the micro-panel dimension
 	   provided, we invoke the implementation. Otherwise, we use scal2m. */ \
diff --git a/frame/1m/unpackm/bli_unpackm_cxk.h b/frame/1m/other/bli_unpackm_cxk.h
similarity index 98%
rename from frame/1m/unpackm/bli_unpackm_cxk.h
rename to frame/1m/other/bli_unpackm_cxk.h
index 53c3c0c44..d828a9b8e 100644
--- a/frame/1m/unpackm/bli_unpackm_cxk.h
+++ b/frame/1m/other/bli_unpackm_cxk.h
@@ -39,6 +39,7 @@
 void PASTEMAC(ch,varname) \
      ( \
        conj_t  conjp, \
+       pack_t  schema, \
        dim_t   panel_dim, \
        dim_t   panel_len, \
        ctype*  kappa, \
diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h
index 88657a712..7d73bf903 100644
--- a/frame/1m/packm/bli_packm.h
+++ b/frame/1m/packm/bli_packm.h
@@ -43,10 +43,6 @@
 #include "bli_packm_part.h"
 
 #include "bli_packm_struc_cxk.h"
-#include "bli_packm_struc_cxk_1er.h"
-
-#include "bli_packm_cxk.h"
-#include "bli_packm_cxk_1er.h"
 
 // Mixed datatype support.
 #ifdef BLIS_ENABLE_GEMM_MD
diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c
index edeeae2b9..e13391151 100644
--- a/frame/1m/packm/bli_packm_blk_var1.c
+++ b/frame/1m/packm/bli_packm_blk_var1.c
@@ -43,11 +43,11 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
     { { bli_spackm_struc_cxk,      bli_cpackm_struc_cxk,
         bli_dpackm_struc_cxk,      bli_zpackm_struc_cxk,      } },
 // 0001 row/col panels: 1m-expanded (1e)
-    { { NULL,                      bli_cpackm_struc_cxk_1er,
-        NULL,                      bli_zpackm_struc_cxk_1er,  } },
+    { { NULL,                      bli_cpackm_struc_cxk,
+        NULL,                      bli_zpackm_struc_cxk,  } },
 // 0010 row/col panels: 1m-reordered (1r)
-    { { NULL,                      bli_cpackm_struc_cxk_1er,
-        NULL,                      bli_zpackm_struc_cxk_1er,  } },
+    { { NULL,                      bli_cpackm_struc_cxk,
+        NULL,                      bli_zpackm_struc_cxk,  } },
 };
 
 static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md);
diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c
index 2a52c42de..dbdaf4738 100644
--- a/frame/1m/packm/bli_packm_struc_cxk.c
+++ b/frame/1m/packm/bli_packm_struc_cxk.c
@@ -34,8 +34,8 @@
 
 #include "blis.h"
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname, kername ) \
+#undef  GENTFUNCR
+#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, cxk_kername, cxc_kername ) \
 \
 void PASTEMAC(ch,varname) \
      ( \
@@ -58,138 +58,38 @@ void PASTEMAC(ch,varname) \
        cntx_t*         cntx  \
      ) \
 { \
-	/* Handle micro-panel packing based on the structure of the matrix
-	   being packed. */ \
-	if      ( bli_is_general( strucc ) ) \
-	{ \
-		/* For micro-panels of general matrices, we can call the pack
-		   kernel front-end directly. */ \
-		PASTEMAC(ch,kername) \
-		( \
-		  conjc, \
-		  schema, \
-		  panel_dim, \
-		  panel_dim_max, \
-		  panel_len, \
-		  panel_len_max, \
-		  kappa, \
-		  c, incc, ldc, \
-		  p,       ldp, \
-		  cntx  \
-		); \
-	} \
-	else if ( bli_is_herm_or_symm( strucc ) ) \
-	{ \
-		/* Call a helper function for micro-panels of Hermitian/symmetric
-		   matrices. */ \
-		PASTEMAC(ch,packm_herm_cxk) \
-		( \
-		  strucc, \
-		  diagc, \
-		  uploc, \
-		  conjc, \
-		  schema, \
-		  invdiag, \
-		  panel_dim, \
-		  panel_len, \
-		  panel_dim_max, \
-		  panel_len_max, \
-		  panel_dim_off, \
-		  panel_len_off, \
-		  kappa, \
-		  c, incc, ldc, \
-		  p,       ldp, \
-		     is_p, \
-		  cntx  \
-		); \
-	} \
-	else /* ( bli_is_triangular( strucc ) ) */ \
-	{ \
-		/* Call a helper function for micro-panels of triangular
-		   matrices. */ \
-		PASTEMAC(ch,packm_tri_cxk) \
-		( \
-		  strucc, \
-		  diagc, \
-		  uploc, \
-		  conjc, \
-		  schema, \
-		  invdiag, \
-		  panel_dim, \
-		  panel_len, \
-		  panel_dim_max, \
-		  panel_len_max, \
-		  panel_dim_off, \
-		  panel_len_off, \
-		  kappa, \
-		  c, incc, ldc, \
-		  p,       ldp, \
-		     is_p, \
-		  cntx  \
-		); \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk )
-
-
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname, kername ) \
+	num_t   dt            = PASTEMAC(ch,type); \
+	num_t   dt_r          = PASTEMAC(chr,type); \
+	dim_t   panel_len_pad = panel_len_max - panel_len; \
 \
-void PASTEMAC(ch,varname) \
-     ( \
-       struc_t         strucc, \
-       diag_t          diagc, \
-       uplo_t          uploc, \
-       conj_t          conjc, \
-       pack_t          schema, \
-       bool            invdiag, \
-       dim_t           panel_dim, \
-       dim_t           panel_len, \
-       dim_t           panel_dim_max, \
-       dim_t           panel_len_max, \
-       dim_t           panel_dim_off, \
-       dim_t           panel_len_off, \
-       ctype* restrict kappa, \
-       ctype* restrict c, inc_t incc, inc_t ldc, \
-       ctype* restrict p,             inc_t ldp, \
-                          inc_t is_p, \
-       cntx_t*         cntx  \
-     ) \
-{ \
-	doff_t diagoffc = panel_dim_off - panel_len_off; \
-	doff_t diagoffc_abs; \
-	dim_t  i, j; \
+	bszid_t bsz_id        = bli_is_col_packed( schema ) ? BLIS_NR : BLIS_MR; \
+	dim_t   packmrnr      = bli_cntx_get_blksz_max_dt( dt, bsz_id, cntx ); \
+	dim_t   packmrnr_r    = bli_cntx_get_blksz_max_dt( dt_r, bsz_id, cntx ); \
 \
-	/* Handle the case where the micro-panel does NOT intersect the
-	   diagonal separately from the case where it does intersect. */ \
-	if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \
+	ukr_t   cxk_ker_id    = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \
+	                                                    : BLIS_PACKM_MRXK_KER; \
+	ukr_t   cxc_ker_id    = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_KER \
+	                                                    : BLIS_PACKM_MRXMR_DIAG_KER; \
+\
+	if ( bli_is_1m_packed( schema ) ) \
 	{ \
-		/* If the current panel is unstored, we need to make a few
-		   adjustments so we refer to the data where it is actually
-		   stored, also taking conjugation into account. (Note this
-		   implicitly assumes we are operating on a dense panel
-		   within a larger symmetric or Hermitian matrix, since a
-		   general matrix would not contain any unstored region.) */ \
-		if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \
-		{ \
-			c = c + diagoffc * ( doff_t )ldc + \
-			       -diagoffc * ( doff_t )incc;  \
-			bli_swap_incs( &incc, &ldc ); \
+		cxk_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_1ER_KER \
+		                                         : BLIS_PACKM_MRXK_1ER_KER; \
+		cxc_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_1ER_KER \
+		                                         : BLIS_PACKM_MRXMR_DIAG_1ER_KER; \
+	} \
 \
-			if ( bli_is_hermitian( strucc ) ) \
-				bli_toggle_conj( &conjc ); \
-		} \
+	PASTECH2(ch,cxk_kername,_ker_ft) f_cxk = bli_cntx_get_ukr_dt( dt, cxk_ker_id, cntx ); \
+	PASTECH2(ch,cxc_kername,_ker_ft) f_cxc = bli_cntx_get_ukr_dt( dt, cxc_ker_id, cntx ); \
 \
-		/* Pack the full panel. */ \
-		PASTEMAC(ch,kername) \
+	/* For general matrices, pack and return early */ \
+	if ( bli_is_general( strucc ) ) \
+	{ \
+		f_cxk \
 		( \
 		  conjc, \
 		  schema, \
 		  panel_dim, \
-		  panel_dim_max, \
 		  panel_len, \
 		  panel_len_max, \
 		  kappa, \
@@ -197,321 +97,210 @@ void PASTEMAC(ch,varname) \
 		  p,       ldp, \
 		  cntx  \
 		); \
+		return; \
 	} \
-	else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \
-	{ \
-		ctype* restrict c10; \
-		ctype* restrict p10; \
-		dim_t           p10_dim, p10_len; \
-		inc_t           incc10, ldc10; \
-		doff_t          diagoffc10; \
-		conj_t          conjc10; \
 \
-		ctype* restrict c12; \
-		ctype* restrict p12; \
-		dim_t           p12_dim, p12_len; \
-		inc_t           incc12, ldc12; \
-		doff_t          diagoffc12; \
-		conj_t          conjc12; \
+	/* Sanity check. Diagonals should not intersect the short end of
+	   a micro-panel. If they do, then somehow the constraints on
+	   cache blocksizes being a whole multiple of the register
+	   blocksizes was somehow violated. */ \
+	doff_t diagoffc = panel_dim_off - panel_len_off; \
+	if ( (          -panel_dim < diagoffc && diagoffc <         0 ) || \
+		 ( panel_len-panel_dim < diagoffc && diagoffc < panel_len ) ) \
+		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
 \
-		/* Sanity check. Diagonals should not intersect the short end of
-		   a micro-panel. If they do, then somehow the constraints on
-		   cache blocksizes being a whole multiple of the register
-		   blocksizes was somehow violated. */ \
-		if ( diagoffc < 0 ) \
-			bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
+	/* For triangular, symmetric, and hermitian matrices we need to consider
+	   three parts. */ \
 \
-		diagoffc_abs = bli_abs( diagoffc ); \
+	/* Pack to p10. */ \
+	if ( 0 < diagoffc ) \
+	{ \
+		dim_t  p10_dim     = panel_dim; \
+		dim_t  p10_len     = bli_min( diagoffc, panel_len ); \
+		dim_t  p10_len_max = p10_len == panel_len ? panel_len_max : p10_len; \
+		ctype* p10         = p; \
+		conj_t conjc10     = conjc; \
+		ctype* c10         = c; \
+		inc_t  incc10      = incc; \
+		inc_t  ldc10       = ldc; \
 \
-		if      ( bli_is_lower( uploc ) ) \
+		if ( bli_is_upper( uploc ) ) \
 		{ \
-			p10_dim    = panel_dim; \
-			p10_len    = diagoffc_abs; \
-			p10        = p; \
-			c10        = c; \
-			incc10     = incc; \
-			ldc10      = ldc; \
-			conjc10    = conjc; \
-\
-			p12_dim    = panel_dim; \
-			p12_len    = panel_len - p10_len; \
-			j          = p10_len; \
-			diagoffc12 = diagoffc_abs - j; \
-			p12        = p + (j  )*ldp; \
-			c12        = c + (j  )*ldc; \
-			c12        = c12 + diagoffc12 * ( doff_t )ldc + \
-			                  -diagoffc12 * ( doff_t )incc;  \
-			incc12     = ldc; \
-			ldc12      = incc; \
-			conjc12    = conjc; \
-\
-			if ( bli_is_hermitian( strucc ) ) \
-				bli_toggle_conj( &conjc12 ); \
-		} \
-		else /* if ( bli_is_upper( uploc ) ) */ \
-		{ \
-			p10_dim    = panel_dim; \
-			p10_len    = diagoffc_abs + panel_dim; \
-			diagoffc10 = diagoffc; \
-			p10        = p; \
-			c10        = c; \
-			c10        = c10 + diagoffc10 * ( doff_t )ldc + \
-			                  -diagoffc10 * ( doff_t )incc;  \
-			incc10     = ldc; \
-			ldc10      = incc; \
-			conjc10    = conjc; \
-\
-			p12_dim    = panel_dim; \
-			p12_len    = panel_len - p10_len; \
-			j          = p10_len; \
-			p12        = p + (j  )*ldp; \
-			c12        = c + (j  )*ldc; \
-			incc12     = incc; \
-			ldc12      = ldc; \
-			conjc12    = conjc; \
+			bli_reflect_to_stored_part( diagoffc, c10, incc10, ldc10 ); \
 \
 			if ( bli_is_hermitian( strucc ) ) \
 				bli_toggle_conj( &conjc10 ); \
 		} \
 \
-		/* Pack to p10. For upper storage, this includes the unstored
-		   triangle of c11. */ \
-		/* NOTE: Since we're only packing partial panels here, we pass in
-		   p1x_len as panel_len_max; otherwise, the packm kernel will zero-
-		   fill the columns up to panel_len_max, which is not what we need
-		   or want to happen. */ \
-		PASTEMAC(ch,kername) \
-		( \
-		  conjc10, \
-		  schema, \
-		  p10_dim, \
-		  panel_dim_max, \
-		  p10_len, \
-		  p10_len, \
-		  kappa, \
-		  c10, incc10, ldc10, \
-		  p10,         ldp, \
-		  cntx  \
-		); \
-\
-		/* Pack to p12. For lower storage, this includes the unstored
-		   triangle of c11. */ \
-		/* NOTE: Since we're only packing partial panels here, we pass in
-		   p1x_len as panel_len_max; otherwise, the packm kernel will zero-
-		   fill the columns up to panel_len_max, which is not what we need
-		   or want to happen. */ \
-		PASTEMAC(ch,kername) \
-		( \
-		  conjc12, \
-		  schema, \
-		  p12_dim, \
-		  panel_dim_max, \
-		  p12_len, \
-		  p12_len, \
-		  kappa, \
-		  c12, incc12, ldc12, \
-		  p12,         ldp, \
-		  cntx  \
-		); \
-\
-		/* Pack the stored triangle of c11 to p11. */ \
+		/* If we are referencing the unstored part of a triangular matrix,
+		   explicitly store zeros */ \
+		if ( bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) \
 		{ \
-			dim_t           p11_m  = panel_dim; \
-			dim_t           p11_n  = panel_dim; \
-			dim_t           j2     = diagoffc_abs; \
-			ctype* restrict c11    = c + (j2 )*ldc; \
-			ctype* restrict p11    = p + (j2 )*ldp; \
-			trans_t         transc = ( trans_t )conjc; \
-\
-			PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
-			( \
-			  0, \
-			  BLIS_NONUNIT_DIAG, \
-			  uploc, \
-			  transc, \
-			  p11_m, \
-			  p11_n, \
-			  c11, incc, ldc, \
-			  p11,    1, ldp, \
-			  cntx, \
-			  NULL  \
-			); \
-\
-			/* If source matrix c is Hermitian, we have to zero out the
-			   imaginary components of the diagonal of p11 in case the
-			   corresponding elements in c11 were not already zero. */ \
-			if ( bli_is_hermitian( strucc ) ) \
+			if ( bli_is_1m_packed( schema ) ) \
 			{ \
-				ctype* restrict pi11 = p11; \
-\
-				for ( i = 0; i < p11_m; ++i ) \
-				{ \
-					PASTEMAC(ch,seti0s)( *pi11 ); \
+				ctype_r* restrict zero = PASTEMAC(chr,0); \
 \
-					pi11 += 1 + ldp; \
-				} \
+				PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \
+				( \
+				  BLIS_NO_CONJUGATE, \
+				  0, \
+				  BLIS_NONUNIT_DIAG, \
+				  BLIS_DENSE, \
+				  packmrnr_r, \
+				  p10_len_max * 2, \
+				  zero, \
+				  ( ctype_r* )p10, 1, ldp, \
+				  cntx, \
+				  NULL  \
+				); \
 			} \
+			else \
+			{ \
+				ctype* restrict zero = PASTEMAC(ch,0); \
 \
-			/* Now that the diagonal has been made explicitly Hermitian
-			   (if applicable), we can now safely scale the stored
-			   triangle specified by uploc. */ \
-			PASTEMAC2(ch,scalm,BLIS_TAPI_EX_SUF) \
+				PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
+				( \
+				  BLIS_NO_CONJUGATE, \
+				  0, \
+				  BLIS_NONUNIT_DIAG, \
+				  BLIS_DENSE, \
+				  packmrnr, \
+				  p10_len_max, \
+				  zero, \
+				  p10, 1, ldp, \
+				  cntx, \
+				  NULL  \
+				); \
+			} \
+		} \
+		else \
+		{ \
+			f_cxk \
 			( \
-			  BLIS_NO_CONJUGATE, \
-			  0, \
-			  BLIS_NONUNIT_DIAG, \
-			  uploc, \
-			  p11_m, \
-			  p11_n, \
+			  conjc10, \
+			  schema, \
+			  p10_dim, \
+			  p10_len, \
+			  p10_len_max, \
 			  kappa, \
-			  p11, 1, ldp, \
-			  cntx, \
-			  NULL  \
+			  c10, incc10, ldc10, \
+			  p10,         ldp, \
+			  cntx  \
 			); \
 		} \
 	} \
-}
-
-INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk )
-
-
-
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname, kername ) \
 \
-void PASTEMAC(ch,varname) \
-     ( \
-       struc_t         strucc, \
-       diag_t          diagc, \
-       uplo_t          uploc, \
-       conj_t          conjc, \
-       pack_t          schema, \
-       bool            invdiag, \
-       dim_t           panel_dim, \
-       dim_t           panel_len, \
-       dim_t           panel_dim_max, \
-       dim_t           panel_len_max, \
-       dim_t           panel_dim_off, \
-       dim_t           panel_len_off, \
-       ctype* restrict kappa, \
-       ctype* restrict c, inc_t incc, inc_t ldc, \
-       ctype* restrict p,             inc_t ldp, \
-                          inc_t is_p, \
-       cntx_t*         cntx  \
-     ) \
-{ \
-	doff_t diagoffc = panel_dim_off - panel_len_off; \
-\
-	/* Pack the panel. */ \
-	PASTEMAC(ch,kername) \
-	( \
-	  conjc, \
-	  schema, \
-	  panel_dim, \
-	  panel_dim_max, \
-	  panel_len, \
-	  panel_len_max, \
-	  kappa, \
-	  c, incc, ldc, \
-	  p,       ldp, \
-	  cntx  \
-	); \
-\
-\
-	/* If the diagonal of c is implicitly unit, explicitly set the
-	   the diagonal of the packed panel to kappa. */ \
-	if ( bli_is_unit_diag( diagc ) ) \
+	/* Pack to p11. */ \
+	if ( 0 <= diagoffc && diagoffc + panel_dim <= panel_len ) \
 	{ \
-		PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
+		dim_t  i           = diagoffc; \
+		dim_t  p11_dim     = panel_dim; \
+		dim_t  p11_len_max = panel_dim + ( diagoffc + panel_dim == panel_len \
+		                                   ? panel_len_pad : 0 ); \
+		ctype* p11         = p + i * ldp; \
+		conj_t conjc11     = conjc; \
+		ctype* c11         = c + i * ldc; \
+		inc_t  incc11      = incc; \
+		inc_t  ldc11       = ldc; \
+\
+		f_cxc \
 		( \
-		  BLIS_NO_CONJUGATE, \
-		  diagoffc, \
-		  panel_dim, \
-		  panel_len, \
+		  strucc, \
+		  diagc, \
+		  uploc, \
+		  conjc11, \
+		  schema, \
+		  invdiag, \
+		  p11_dim, \
+		  p11_len_max, \
 		  kappa, \
-		  p, 1, ldp, \
-		  cntx, \
-		  NULL  \
+		  c11, incc11, ldc11, \
+		  p11,         ldp, \
+		  cntx  \
 		); \
 	} \
 \
-	/* If requested, invert the diagonal of the packed panel. */ \
-	if ( invdiag == TRUE ) \
+	/* Pack to p12. */ \
+	if ( diagoffc + panel_dim < panel_len ) \
 	{ \
-		PASTEMAC2(ch,invertd,BLIS_TAPI_EX_SUF) \
-		( \
-		  diagoffc, \
-		  panel_dim, \
-		  panel_len, \
-		  p, 1, ldp, \
-		  cntx, \
-		  NULL  \
-		); \
-	} \
+		dim_t  i           = bli_max( 0, diagoffc + panel_dim ); \
+		dim_t  p12_dim     = panel_dim; \
+		dim_t  p12_len     = panel_len - i; \
+		/* If we are packing p12, then it is always the last partial block \
+		   and so we should make sure to pad with zeros if necessary. */ \
+		dim_t  p12_len_max = p12_len + panel_len_pad; \
+		ctype* p12         = p + i * ldp; \
+		conj_t conjc12     = conjc; \
+		ctype* c12         = c + i * ldc; \
+		inc_t  incc12      = incc; \
+		inc_t  ldc12       = ldc; \
 \
-	/* Set the region opposite the diagonal of p to zero. To do this,
-	   we need to reference the "unstored" region on the other side of
-	   the diagonal. This amounts to toggling uploc and then shifting
-	   the diagonal offset to shrink the newly referenced region (by
-	   one diagonal). Note that this zero-filling is not needed for
-	   trsm, since the unstored region is not referenced by the trsm
-	   micro-kernel; however, zero-filling is needed for trmm, which
-	   uses the gemm micro-kernel.*/ \
-	{ \
-		ctype* restrict zero  = PASTEMAC(ch,0); \
-		uplo_t          uplop = uploc; \
+		if ( bli_is_lower( uploc ) ) \
+		{ \
+			bli_reflect_to_stored_part( diagoffc - i, c12, incc12, ldc12 ); \
 \
-		bli_toggle_uplo( &uplop ); \
-		bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \
+			if ( bli_is_hermitian( strucc ) ) \
+				bli_toggle_conj( &conjc12 ); \
+		} \
 \
-		PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
-		( \
-		  BLIS_NO_CONJUGATE, \
-		  diagoffc, \
-		  BLIS_NONUNIT_DIAG, \
-		  uplop, \
-		  panel_dim, \
-		  panel_len, \
-		  zero, \
-		  p, 1, ldp, \
-		  cntx, \
-		  NULL  \
-		); \
-	} \
+		/* If we are referencing the unstored part of a triangular matrix,
+		   explicitly store zeros */ \
+		if ( bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) \
+		{ \
+			if ( bli_is_1m_packed( schema ) ) \
+			{ \
+			    ctype_r* restrict zero = PASTEMAC(chr,0); \
 \
-	/* If this panel is an edge case in both panel dimension and length,
-	   then it must be a bottom-right corner case. Set the part of the
-	   diagonal that extends into the zero-padded region to identity.
-	   NOTE: This is actually only necessary when packing for trsm, as
-	   it helps prevent NaNs and Infs from creeping into the computation.
-	   However, we set the region to identity for trmm as well. Those
-	   1.0's end up getting muliplied by the 0.0's in the zero-padded
-	   region of the other matrix, so there is no harm in this. */ \
-	if ( panel_dim != panel_dim_max && \
-	     panel_len != panel_len_max ) \
-	{ \
-		ctype* restrict one    = PASTEMAC(ch,1); \
-		dim_t           i      = panel_dim; \
-		dim_t           j      = panel_len; \
-		dim_t           m_br   = panel_dim_max - i; \
-		dim_t           n_br   = panel_len_max - j; \
-		ctype*          p_br   = p + (i  ) + (j  )*ldp; \
+				PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \
+				( \
+				  BLIS_NO_CONJUGATE, \
+				  0, \
+				  BLIS_NONUNIT_DIAG, \
+				  BLIS_DENSE, \
+				  packmrnr_r, \
+				  p12_len_max * 2, \
+				  zero, \
+				  ( ctype_r* )p12, 1, ldp, \
+				  cntx, \
+				  NULL  \
+				); \
+			} \
+			else \
+			{ \
+				ctype* restrict zero = PASTEMAC(ch,0); \
 \
-		PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
-		( \
-		  BLIS_NO_CONJUGATE, \
-		  0, \
-		  m_br, \
-		  n_br, \
-		  one, \
-		  p_br, 1, ldp, \
-		  cntx, \
-		  NULL  \
-		); \
+				PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
+				( \
+				  BLIS_NO_CONJUGATE, \
+				  0, \
+				  BLIS_NONUNIT_DIAG, \
+				  BLIS_DENSE, \
+				  packmrnr, \
+				  p12_len_max, \
+				  zero, \
+				  p12, 1, ldp, \
+				  cntx, \
+				  NULL  \
+				); \
+			} \
+		} \
+		else \
+		{ \
+			f_cxk \
+			( \
+			  conjc12, \
+			  schema, \
+			  p12_dim, \
+			  p12_len, \
+			  p12_len_max, \
+			  kappa, \
+			  c12, incc12, ldc12, \
+			  p12,         ldp, \
+			  cntx  \
+			); \
+		} \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk )
+INSERT_GENTFUNCR_BASIC2( packm_struc_cxk, packm_cxk, packm_cxc_diag )
 
diff --git a/frame/1m/unpackm/bli_unpackm.h b/frame/1m/unpackm/bli_unpackm.h
index 5e4542841..80fa3804a 100644
--- a/frame/1m/unpackm/bli_unpackm.h
+++ b/frame/1m/unpackm/bli_unpackm.h
@@ -37,5 +37,3 @@
 #include "bli_unpackm_int.h"
 
 #include "bli_unpackm_blk_var1.h"
-
-#include "bli_unpackm_cxk.h"
diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c
index b2c862045..b6165f516 100644
--- a/frame/1m/unpackm/bli_unpackm_blk_var1.c
+++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c
@@ -36,21 +36,22 @@
 
 #define FUNCPTR_T unpackm_fp
 
-typedef void (*FUNCPTR_T)(
-                           struc_t strucc,
-                           doff_t  diagoffc,
-                           diag_t  diagc,
-                           uplo_t  uploc,
-                           trans_t transc,
-                           dim_t   m,
-                           dim_t   n,
-                           dim_t   m_panel,
-                           dim_t   n_panel,
-                           void*   p, inc_t rs_p, inc_t cs_p,
-                                      dim_t pd_p, inc_t ps_p,
-                           void*   c, inc_t rs_c, inc_t cs_c,
-                           cntx_t* cntx
-                         );
+typedef void (*FUNCPTR_T)
+     (
+       struc_t strucc,
+       doff_t  diagoffc,
+       diag_t  diagc,
+       uplo_t  uploc,
+       trans_t transc,
+       dim_t   m,
+       dim_t   n,
+       dim_t   m_panel,
+       dim_t   n_panel,
+       void*   p, inc_t rs_p, inc_t cs_p,
+                  dim_t pd_p, inc_t ps_p,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       cntx_t* cntx
+     );
 
 static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var1);
 
@@ -152,10 +153,10 @@ void PASTEMAC(ch,varname) \
 	dim_t           iter_dim; \
 	dim_t           num_iter; \
 	dim_t           it, ic, ip; \
-    dim_t           ic0, ip0; \
+	dim_t           ic0, ip0; \
 	doff_t          ic_inc, ip_inc; \
-    doff_t          diagoffc_i; \
-    doff_t          diagoffc_inc; \
+	doff_t          diagoffc_i; \
+	doff_t          diagoffc_inc; \
 	dim_t           panel_len; \
 	dim_t           panel_dim_i; \
 	dim_t           panel_dim_max; \
@@ -164,6 +165,7 @@ void PASTEMAC(ch,varname) \
 	inc_t           ldp; \
 	dim_t*          m_panel_full; \
 	dim_t*          n_panel_full; \
+	pack_t          schema; \
 \
 \
 	/* If c needs a transposition, induce it so that we can more simply
@@ -182,6 +184,7 @@ void PASTEMAC(ch,varname) \
 	if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
 	{ \
 		/* Prepare to unpack from column panels. */ \
+		schema        = BLIS_PACKED_COL_PANELS; \
 		iter_dim      = n; \
 		panel_len     = m; \
 		panel_dim_max = pd_p; \
@@ -196,6 +199,7 @@ void PASTEMAC(ch,varname) \
 	else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
 	{ \
 		/* Prepare to unpack from row panels. */ \
+		schema        = BLIS_PACKED_ROW_PANELS; \
 		iter_dim      = m; \
 		panel_len     = n; \
 		panel_dim_max = pd_p; \
@@ -207,6 +211,14 @@ void PASTEMAC(ch,varname) \
 		m_panel_full  = &panel_dim_i; \
 		n_panel_full  = &n; \
 	} \
+\
+	num_t dt     = PASTEMAC(ch,type); \
+	ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER \
+	                                           : BLIS_UNPACKM_MRXK_KER; \
+\
+	/* Query the context for the unpackm kernel corresponding to the current
+	   panel dimension, or kernel id. */ \
+	PASTECH2(ch,unpackm_cxk,_ker_ft) f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
 \
 	/* Compute the total number of iterations we'll need. */ \
 	num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
@@ -253,9 +265,10 @@ void PASTEMAC(ch,varname) \
 		else \
 		{ \
 			/* Pack the current panel. */ \
-			PASTEMAC(ch,unpackm_cxk) \
+			f \
 			( \
 			  BLIS_NO_CONJUGATE, \
+			  schema, \
 			  panel_dim_i, \
 			  panel_len, \
 			  one, \
diff --git a/frame/2/gemv/bli_gemv_unb_var1.c b/frame/2/gemv/bli_gemv_unb_var1.c
index 3f5681d2b..840b96901 100644
--- a/frame/2/gemv/bli_gemv_unb_var1.c
+++ b/frame/2/gemv/bli_gemv_unb_var1.c
@@ -70,7 +70,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,dotxv_ker_ft) kfp_dv; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
+	kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
 \
 	for ( i = 0; i < n_iter; ++i ) \
 	{ \
diff --git a/frame/2/gemv/bli_gemv_unb_var2.c b/frame/2/gemv/bli_gemv_unb_var2.c
index 8166aa417..7fc4fcfe4 100644
--- a/frame/2/gemv/bli_gemv_unb_var2.c
+++ b/frame/2/gemv/bli_gemv_unb_var2.c
@@ -100,7 +100,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < n_iter; ++i ) \
 	{ \
diff --git a/frame/2/gemv/bli_gemv_unf_var1.c b/frame/2/gemv/bli_gemv_unf_var1.c
index e392e830e..0dceed4cf 100644
--- a/frame/2/gemv/bli_gemv_unf_var1.c
+++ b/frame/2/gemv/bli_gemv_unf_var1.c
@@ -71,7 +71,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,dotxf_ker_ft) kfp_df; \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
+	kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \
 	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
 \
 	for ( i = 0; i < n_iter; i += f ) \
diff --git a/frame/2/gemv/bli_gemv_unf_var2.c b/frame/2/gemv/bli_gemv_unf_var2.c
index fe7702e4c..4c43657ad 100644
--- a/frame/2/gemv/bli_gemv_unf_var2.c
+++ b/frame/2/gemv/bli_gemv_unf_var2.c
@@ -100,7 +100,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpyf_ker_ft) kfp_af; \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
+	kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \
 	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
 \
 	for ( i = 0; i < n_iter; i += f ) \
diff --git a/frame/2/ger/bli_ger_unb_var1.c b/frame/2/ger/bli_ger_unb_var1.c
index d6cda277e..d8ddd1247 100644
--- a/frame/2/ger/bli_ger_unb_var1.c
+++ b/frame/2/ger/bli_ger_unb_var1.c
@@ -61,7 +61,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/ger/bli_ger_unb_var2.c b/frame/2/ger/bli_ger_unb_var2.c
index 1590bfe5e..9c49e336b 100644
--- a/frame/2/ger/bli_ger_unb_var2.c
+++ b/frame/2/ger/bli_ger_unb_var2.c
@@ -61,7 +61,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( j = 0; j < n; ++j ) \
 	{ \
diff --git a/frame/2/hemv/bli_hemv_unb_var1.c b/frame/2/hemv/bli_hemv_unb_var1.c
index ea5d478be..71c27a326 100644
--- a/frame/2/hemv/bli_hemv_unb_var1.c
+++ b/frame/2/hemv/bli_hemv_unb_var1.c
@@ -122,8 +122,8 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,dotxv_ker_ft) kfp_dv; \
 \
 	/* Query the context for the kernel function pointers. */ \
-	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
-	kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
+	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/hemv/bli_hemv_unb_var2.c b/frame/2/hemv/bli_hemv_unb_var2.c
index 1f7346517..3753c8d3b 100644
--- a/frame/2/hemv/bli_hemv_unb_var2.c
+++ b/frame/2/hemv/bli_hemv_unb_var2.c
@@ -123,7 +123,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,dotxv_ker_ft) kfp_dv; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
+	kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/hemv/bli_hemv_unb_var3.c b/frame/2/hemv/bli_hemv_unb_var3.c
index 6573e59fc..d592251d5 100644
--- a/frame/2/hemv/bli_hemv_unb_var3.c
+++ b/frame/2/hemv/bli_hemv_unb_var3.c
@@ -122,8 +122,8 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,dotxv_ker_ft) kfp_dv; \
 \
 	/* Query the context for the kernel function pointers. */ \
-	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
-	kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
+	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/hemv/bli_hemv_unb_var4.c b/frame/2/hemv/bli_hemv_unb_var4.c
index deabc3ab4..10cf953b6 100644
--- a/frame/2/hemv/bli_hemv_unb_var4.c
+++ b/frame/2/hemv/bli_hemv_unb_var4.c
@@ -122,7 +122,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointers. */ \
-	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/hemv/bli_hemv_unf_var1.c b/frame/2/hemv/bli_hemv_unf_var1.c
index d36dc0098..a449909a5 100644
--- a/frame/2/hemv/bli_hemv_unf_var1.c
+++ b/frame/2/hemv/bli_hemv_unf_var1.c
@@ -130,7 +130,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
+	kfp_xf = bli_cntx_get_ukr_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
 	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
 \
 	for ( i = 0; i < m; i += f ) \
diff --git a/frame/2/hemv/bli_hemv_unf_var1a.c b/frame/2/hemv/bli_hemv_unf_var1a.c
index 31ab1515f..d0af57393 100644
--- a/frame/2/hemv/bli_hemv_unf_var1a.c
+++ b/frame/2/hemv/bli_hemv_unf_var1a.c
@@ -121,7 +121,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_vf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
+	kfp_vf = bli_cntx_get_ukr_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/hemv/bli_hemv_unf_var3.c b/frame/2/hemv/bli_hemv_unf_var3.c
index d8db9bc78..baaff098d 100644
--- a/frame/2/hemv/bli_hemv_unf_var3.c
+++ b/frame/2/hemv/bli_hemv_unf_var3.c
@@ -130,7 +130,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
+	kfp_xf = bli_cntx_get_ukr_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
 	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
 \
 	for ( i = 0; i < m; i += f ) \
diff --git a/frame/2/hemv/bli_hemv_unf_var3a.c b/frame/2/hemv/bli_hemv_unf_var3a.c
index 54ab0f6ce..55c1929ff 100644
--- a/frame/2/hemv/bli_hemv_unf_var3a.c
+++ b/frame/2/hemv/bli_hemv_unf_var3a.c
@@ -121,7 +121,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_vf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
+	kfp_vf = bli_cntx_get_ukr_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/her/bli_her_unb_var1.c b/frame/2/her/bli_her_unb_var1.c
index e7f718680..8cd6bd397 100644
--- a/frame/2/her/bli_her_unb_var1.c
+++ b/frame/2/her/bli_her_unb_var1.c
@@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/her/bli_her_unb_var2.c b/frame/2/her/bli_her_unb_var2.c
index 4b39e1df0..f68798dce 100644
--- a/frame/2/her/bli_her_unb_var2.c
+++ b/frame/2/her/bli_her_unb_var2.c
@@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/her2/bli_her2_unb_var1.c b/frame/2/her2/bli_her2_unb_var1.c
index 37423bfcb..b5c182639 100644
--- a/frame/2/her2/bli_her2_unb_var1.c
+++ b/frame/2/her2/bli_her2_unb_var1.c
@@ -106,7 +106,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/her2/bli_her2_unb_var2.c b/frame/2/her2/bli_her2_unb_var2.c
index 22d6de07a..602e922a8 100644
--- a/frame/2/her2/bli_her2_unb_var2.c
+++ b/frame/2/her2/bli_her2_unb_var2.c
@@ -113,7 +113,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/her2/bli_her2_unb_var3.c b/frame/2/her2/bli_her2_unb_var3.c
index 297b9b702..1d5872d5d 100644
--- a/frame/2/her2/bli_her2_unb_var3.c
+++ b/frame/2/her2/bli_her2_unb_var3.c
@@ -113,7 +113,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/her2/bli_her2_unb_var4.c b/frame/2/her2/bli_her2_unb_var4.c
index 58adb0e70..922fe7db7 100644
--- a/frame/2/her2/bli_her2_unb_var4.c
+++ b/frame/2/her2/bli_her2_unb_var4.c
@@ -114,7 +114,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/her2/bli_her2_unf_var1.c b/frame/2/her2/bli_her2_unf_var1.c
index a0aec48f7..3824880c6 100644
--- a/frame/2/her2/bli_her2_unf_var1.c
+++ b/frame/2/her2/bli_her2_unf_var1.c
@@ -106,7 +106,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpy2v_ker_ft) kfp_2v; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \
+	kfp_2v = bli_cntx_get_ukr_dt( dt, BLIS_AXPY2V_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/her2/bli_her2_unf_var4.c b/frame/2/her2/bli_her2_unf_var4.c
index 3dea31d53..6b2b0e9ac 100644
--- a/frame/2/her2/bli_her2_unf_var4.c
+++ b/frame/2/her2/bli_her2_unf_var4.c
@@ -114,7 +114,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpy2v_ker_ft) kfp_2v; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \
+	kfp_2v = bli_cntx_get_ukr_dt( dt, BLIS_AXPY2V_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/trmv/bli_trmv_unb_var1.c b/frame/2/trmv/bli_trmv_unb_var1.c
index 31bfa6a83..367a34e6c 100644
--- a/frame/2/trmv/bli_trmv_unb_var1.c
+++ b/frame/2/trmv/bli_trmv_unb_var1.c
@@ -83,7 +83,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,dotv_ker_ft) kfp_dv; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \
+	kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
 	if      ( bli_is_upper( uploa_trans ) ) \
diff --git a/frame/2/trmv/bli_trmv_unb_var2.c b/frame/2/trmv/bli_trmv_unb_var2.c
index 00d4d95f3..fa21776b3 100644
--- a/frame/2/trmv/bli_trmv_unb_var2.c
+++ b/frame/2/trmv/bli_trmv_unb_var2.c
@@ -83,7 +83,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
 	if      ( bli_is_upper( uploa_trans ) ) \
diff --git a/frame/2/trmv/bli_trmv_unf_var1.c b/frame/2/trmv/bli_trmv_unf_var1.c
index 6dc3cea36..9e576fc77 100644
--- a/frame/2/trmv/bli_trmv_unf_var1.c
+++ b/frame/2/trmv/bli_trmv_unf_var1.c
@@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,dotxf_ker_ft) kfp_df; \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
+	kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \
 	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
diff --git a/frame/2/trmv/bli_trmv_unf_var2.c b/frame/2/trmv/bli_trmv_unf_var2.c
index 8bbd51820..052595935 100644
--- a/frame/2/trmv/bli_trmv_unf_var2.c
+++ b/frame/2/trmv/bli_trmv_unf_var2.c
@@ -90,7 +90,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpyf_ker_ft) kfp_af; \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
+	kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \
 	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
diff --git a/frame/2/trsv/bli_trsv_unb_var1.c b/frame/2/trsv/bli_trsv_unb_var1.c
index c7493e33d..2f24b10a8 100644
--- a/frame/2/trsv/bli_trsv_unb_var1.c
+++ b/frame/2/trsv/bli_trsv_unb_var1.c
@@ -94,7 +94,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,dotv_ker_ft) kfp_tv; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_tv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \
+	kfp_tv = bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
 	if      ( bli_is_upper( uploa_trans ) ) \
diff --git a/frame/2/trsv/bli_trsv_unb_var2.c b/frame/2/trsv/bli_trsv_unb_var2.c
index a78e7eef0..1a8e81634 100644
--- a/frame/2/trsv/bli_trsv_unb_var2.c
+++ b/frame/2/trsv/bli_trsv_unb_var2.c
@@ -94,7 +94,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
 	if      ( bli_is_upper( uploa_trans ) ) \
diff --git a/frame/2/trsv/bli_trsv_unf_var1.c b/frame/2/trsv/bli_trsv_unf_var1.c
index 3b03b43e5..824f26d15 100644
--- a/frame/2/trsv/bli_trsv_unf_var1.c
+++ b/frame/2/trsv/bli_trsv_unf_var1.c
@@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,dotxf_ker_ft) kfp_df; \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
+	kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \
 	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
diff --git a/frame/2/trsv/bli_trsv_unf_var2.c b/frame/2/trsv/bli_trsv_unf_var2.c
index 10741d291..bd1f8e3b0 100644
--- a/frame/2/trsv/bli_trsv_unf_var2.c
+++ b/frame/2/trsv/bli_trsv_unf_var2.c
@@ -102,7 +102,7 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,axpyf_ker_ft) kfp_af; \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
+	kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \
 	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
diff --git a/frame/3/bli_l3_schema.c b/frame/3/bli_l3_schema.c
index bde30c527..1d4608799 100644
--- a/frame/3/bli_l3_schema.c
+++ b/frame/3/bli_l3_schema.c
@@ -57,7 +57,7 @@ void bli_l3_set_schemas
 		// projection of dt to query the preference of the corresponding native
 		// real-domain microkernel. This is what ultimately determines which
 		// variant of 1m is applicable.
-		if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
+		if ( bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ) )
 		{
 			schema_a = BLIS_PACKED_ROW_PANELS_1E;
 			schema_b = BLIS_PACKED_COL_PANELS_1R;
diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c
index 72ec405ab..7e37e1f22 100644
--- a/frame/3/bli_l3_sup.c
+++ b/frame/3/bli_l3_sup.c
@@ -63,7 +63,7 @@ err_t bli_gemmsup
 	// Return early if a microkernel preference-induced transposition would
 	// have been performed and shifted the dimensions outside of the space
 	// of sup-handled problems.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( c, BLIS_GEMM_UKR, cntx ) )
+	if ( bli_cntx_dislikes_storage_of( c, BLIS_GEMM_VIR_UKR, cntx ) )
 	{
 		const num_t dt = bli_obj_dt( c );
 		const dim_t m  = bli_obj_length( c );
diff --git a/frame/3/bli_l3_sup_int.c b/frame/3/bli_l3_sup_int.c
index e54e01d7c..3da3954fa 100644
--- a/frame/3/bli_l3_sup_int.c
+++ b/frame/3/bli_l3_sup_int.c
@@ -85,7 +85,7 @@ err_t bli_gemmsup_int
 	const bool    is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
 
 	const num_t   dt         = bli_obj_dt( c );
-	const bool    row_pref   = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
+	const bool    row_pref   = bli_cntx_ukr_prefers_rows_dt( dt, bli_stor3_ukr( stor_id ), cntx );
 
 	const bool    is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
 	                                      : is_rcc_crc_ccr_ccc );
@@ -259,7 +259,7 @@ err_t bli_gemmtsup_int
 	const bool    is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
 
 	const num_t   dt         = bli_obj_dt( c );
-	const bool    row_pref   = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
+	const bool    row_pref   = bli_cntx_ukr_prefers_rows_dt( dt, bli_stor3_ukr( stor_id ), cntx );
 
 	const bool    is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
 	                                      : is_rcc_crc_ccr_ccc );
diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c
index 85fb246f0..519dc5ccd 100644
--- a/frame/3/bli_l3_sup_packm_var.c
+++ b/frame/3/bli_l3_sup_packm_var.c
@@ -122,6 +122,14 @@ void PASTEMAC(ch,varname) \
 		ldc            = cs_c; \
 		ldp            = cs_p; \
 	} \
+\
+	num_t dt     = PASTEMAC(ch,type); \
+	ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \
+	                                           : BLIS_PACKM_MRXK_KER; \
+\
+	/* Query the context for the packm kernel corresponding to the current
+	   panel dimension, or kernel id. */ \
+	PASTECH2(ch,packm_cxk,_ker_ft) f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
 \
 	/* Compute the total number of iterations we'll need. */ \
 	n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
@@ -171,12 +179,11 @@ void PASTEMAC(ch,varname) \
 			   or round-robin partitioning was requested at configure-time. */ \
 			if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
 			{ \
-				PASTEMAC(ch,packm_cxk) \
+				f \
 				( \
 				  conjc, \
 				  schema, \
 				  panel_dim_i, \
-				  panel_dim_max, \
 				  panel_len_i, \
 				  panel_len_max_i, \
 				  kappa_cast, \
diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h
index 7c315192d..ead9925e6 100644
--- a/frame/3/bli_l3_sup_vars.h
+++ b/frame/3/bli_l3_sup_vars.h
@@ -127,7 +127,7 @@ BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases
        cntx_t*  cntx
      )
 {
-	const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx );
+	const bool row_pref = bli_cntx_ukr_prefers_rows_dt( dt, bli_stor3_ukr( *eff_id ), cntx );
 
 	// Handle row- and column-preferrential kernels separately.
 	if ( row_pref )
diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c
index 4ff45036f..cd8827bd9 100644
--- a/frame/3/gemm/bli_gemm_front.c
+++ b/frame/3/gemm/bli_gemm_front.c
@@ -99,7 +99,7 @@ void bli_gemm_front
 	// contiguous columns, or if C is stored by columns and the micro-kernel
 	// prefers contiguous rows, transpose the entire operation to allow the
 	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
+	if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
 	{
 		bli_obj_swap( &a_local, &b_local );
 
diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c
index 6de361194..874a12439 100644
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -201,7 +201,7 @@ void bli_gemm_ker_var2
 	// column-stored as well.
 	char        ct[ BLIS_STACK_BUF_MAX_SIZE ]
 	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
-	const bool  col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_UKR, cntx );
+	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx );
 	const inc_t rs_ct       = ( col_pref ? 1 : NR );
 	const inc_t cs_ct       = ( col_pref ? MR : 1 );
 	char*       zero        = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO );
diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c
index e257cdf28..6202cfffd 100644
--- a/frame/3/gemm/bli_gemm_md.c
+++ b/frame/3/gemm/bli_gemm_md.c
@@ -173,7 +173,7 @@ mddm_t bli_gemm_md_ccr
 	// preference.
 	const num_t dt = BLIS_REAL | bli_obj_comp_prec( c );
 	const bool  row_pref
-	      = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, *cntx );
+	      = bli_cntx_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, *cntx );
 
 	// We can only perform this case of mixed-domain gemm, C += A*B where
 	// B is real, if the microkernel prefers column output. If it prefers
@@ -236,8 +236,8 @@ mddm_t bli_gemm_md_ccr
 
 	// Use the default pack schemas in the objects.
 
-	// static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
-	func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx );
+	// static func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx )
+	func_t* l3_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, *cntx );
 
 	// Rather than check which complex datatype dt_comp refers to, we set
 	// the mixed-domain virtual microkernel for both types.
@@ -278,7 +278,7 @@ mddm_t bli_gemm_md_crc
 	// preference.
 	const num_t dt = BLIS_REAL | bli_obj_comp_prec( c );
 	const bool  col_pref
-	      = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, *cntx );
+	      = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, *cntx );
 
 	// We can only perform this case of mixed-domain gemm, C += A*B where
 	// A is real, if the microkernel prefers row output. If it prefers
@@ -341,8 +341,8 @@ mddm_t bli_gemm_md_crc
 
 	// Use the default pack schemas in the objects.
 
-	// static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
-	func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx );
+	// static func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx )
+	func_t* l3_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, *cntx );
 
 	// Rather than check which complex datatype dt_comp refers to, we set
 	// the mixed-domain virtual microkernel for both types.
@@ -430,13 +430,11 @@ mddm_t bli_gemm_md_rcc
 	const num_t dt_complex = bli_obj_dt( a );
 	cntx_t* cntx_1m = bli_gks_query_ind_cntx( BLIS_1M, dt_complex );
 
-	func_t* cntx_funcs    = bli_cntx_packm_kers_buf( *cntx );
-	func_t* cntx_1m_funcs = bli_cntx_packm_kers_buf( cntx_1m );
+	func_t* cntx_funcs    = bli_cntx_ukrs_buf( *cntx );
+	func_t* cntx_1m_funcs = bli_cntx_ukrs_buf( cntx_1m );
 
-	for ( dim_t i = 0; i <= BLIS_PACKM_31XK_KER; ++i )
-	{
-		cntx_funcs[ i ] = cntx_1m_funcs[ i ];
-	}
+	cntx_funcs[ BLIS_PACKM_MRXK_KER ] = cntx_1m_funcs[ BLIS_PACKM_MRXK_KER ];
+	cntx_funcs[ BLIS_PACKM_NRXK_KER ] = cntx_1m_funcs[ BLIS_PACKM_NRXK_KER ];
 
 	// Return the computation and execution domains.
 	return doms;
diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.c b/frame/3/gemm/bli_gemm_md_c2r_ref.c
index bbd9190a9..a4797ad4f 100644
--- a/frame/3/gemm/bli_gemm_md_c2r_ref.c
+++ b/frame/3/gemm/bli_gemm_md_c2r_ref.c
@@ -57,8 +57,8 @@ void PASTEMAC2(ch,opname,suf) \
 	const num_t       dt_r      = PASTEMAC(chr,type); \
 \
 	PASTECH(chr,gemm_ukr_ft) \
-	                  rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
-	const bool        col_pref  = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
+	                  rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
+	const bool        col_pref  = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
 	const bool        row_pref  = !col_pref; \
 \
 	const dim_t       mr        = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
diff --git a/frame/3/gemm/other/bli_gemm_ker_var2.c b/frame/3/gemm/other/bli_gemm_ker_var2.c
index 62d2a9e04..c5cf935b8 100644
--- a/frame/3/gemm/other/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/other/bli_gemm_ker_var2.c
@@ -198,7 +198,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/gemm/other/bli_gemm_ker_var2rr.c b/frame/3/gemm/other/bli_gemm_ker_var2rr.c
index 289e4ddf5..946e3048c 100644
--- a/frame/3/gemm/other/bli_gemm_ker_var2rr.c
+++ b/frame/3/gemm/other/bli_gemm_ker_var2rr.c
@@ -199,7 +199,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/gemm/other/bli_gemm_ker_var2sl.c b/frame/3/gemm/other/bli_gemm_ker_var2sl.c
index d75838fb4..f5159bbb9 100644
--- a/frame/3/gemm/other/bli_gemm_ker_var2sl.c
+++ b/frame/3/gemm/other/bli_gemm_ker_var2sl.c
@@ -199,7 +199,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c
index 2a9d91759..d53838470 100644
--- a/frame/3/gemmt/bli_gemmt_front.c
+++ b/frame/3/gemmt/bli_gemmt_front.c
@@ -86,7 +86,7 @@ void bli_gemmt_front
 	// contiguous columns, or if C is stored by columns and the micro-kernel
 	// prefers contiguous rows, transpose the entire operation to allow the
 	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
+	if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
 	{
 		bli_obj_swap( &a_local, &b_local );
 
diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
index fea4efec0..3aedc6e9a 100644
--- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
@@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
index 4b849bbc6..b3a9fe8a1 100644
--- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
@@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c
index 0bf4b1a0f..ece351ef7 100644
--- a/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c
+++ b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c
@@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c
index 1655bea55..f00e769b5 100644
--- a/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c
+++ b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c
@@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c
index 9835de9c1..15460125d 100644
--- a/frame/3/hemm/bli_hemm_front.c
+++ b/frame/3/hemm/bli_hemm_front.c
@@ -117,7 +117,7 @@ void bli_hemm_front
 	// micro-kernel to access elements of C in its preferred manner.
 	//if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT
 	                                     // be enabled. See issue #342 comments.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
+	if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
 	{
 		bli_toggle_side( &side );
 		bli_obj_toggle_conj( &a_local );
diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c
index be94c44c1..8108b607f 100644
--- a/frame/3/symm/bli_symm_front.c
+++ b/frame/3/symm/bli_symm_front.c
@@ -117,7 +117,7 @@ void bli_symm_front
 	// micro-kernel to access elements of C in its preferred manner.
 	//if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT
 	                                     // be enabled. See issue #342 comments.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
+	if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
 	{
 		bli_toggle_side( &side );
 		bli_obj_induce_trans( &b_local );
diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c
index 1de28958e..d973b6eb6 100644
--- a/frame/3/trmm/bli_trmm_front.c
+++ b/frame/3/trmm/bli_trmm_front.c
@@ -135,7 +135,7 @@ void bli_trmm_front
 	// of row- vs. column storage breaks down.
 	//if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT
 	                                     // be enabled. See issue #342 comments.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
+	if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
 	{
 		bli_toggle_side( &side );
 		bli_obj_induce_trans( &a_local );
diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c
index 9ab64e470..706e14d43 100644
--- a/frame/3/trmm/other/bli_trmm_ll_ker_var2.c
+++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c
@@ -175,7 +175,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c
index 6fef4e0c9..699892635 100644
--- a/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c
+++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c
@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
@@ -337,7 +337,7 @@ void PASTEMAC(ch,varname) \
 	dim_t jr_inc; \
 \
 	/* Use round-robin assignment of micropanels to threads in the 2nd loop for
-	   the initial rectangular region of C (if it exists). 
+	   the initial rectangular region of C (if it exists).
 	   NOTE: Parallelism in the 1st loop is disabled for now. */ \
 	bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
 	/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c
index e0d9cc75f..eb5577593 100644
--- a/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c
+++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c
@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c
index 0abcfd77a..738711f58 100644
--- a/frame/3/trmm/other/bli_trmm_lu_ker_var2.c
+++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c
@@ -175,7 +175,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c
index 8c505f88a..df53b2011 100644
--- a/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c
+++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c
@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c
index 3bb0deaa3..fbcd4f9aa 100644
--- a/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c
+++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c
@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c
index 672caaa05..7775d9217 100644
--- a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c
+++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c
@@ -175,7 +175,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c
index 9d9e3809c..c1354a962 100644
--- a/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c
+++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c
@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c
index 8bac0ec4a..7cf8eeef0 100644
--- a/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c
+++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c
@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c
index fc2991b13..1d0f31708 100644
--- a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c
+++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c
@@ -175,7 +175,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c
index 00a0dc3f0..d8ae4f8bb 100644
--- a/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c
+++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c
@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c
index 889fa49fa..c05a082d4 100644
--- a/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c
+++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c
@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c
index 3b9753960..9cd04963b 100644
--- a/frame/3/trmm3/bli_trmm3_front.c
+++ b/frame/3/trmm3/bli_trmm3_front.c
@@ -127,7 +127,7 @@ void bli_trmm3_front
 	// contiguous columns, or if C is stored by columns and the micro-kernel
 	// prefers contiguous rows, transpose the entire operation to allow the
 	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
+	if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
 	{
 		bli_toggle_side( &side );
 		bli_obj_induce_trans( &a_local );
diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c
index f50f739e7..7b1133c2a 100644
--- a/frame/3/trsm/bli_trsm_ll_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c
@@ -180,7 +180,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 */ \
diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c
index 4f3514143..2059d1c9f 100644
--- a/frame/3/trsm/bli_trsm_lu_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c
@@ -180,7 +180,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 */ \
diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c
index b4937134f..cace3622a 100644
--- a/frame/3/trsm/bli_trsm_rl_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c
@@ -185,7 +185,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 */ \
diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c
index 09942d311..4b0c7f083 100644
--- a/frame/3/trsm/bli_trsm_ru_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c
@@ -185,7 +185,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 */ \
diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c
index dc57eac5f..26da1b004 100644
--- a/frame/3/trsm/other/bli_trsm_ll_ker_var2.c
+++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c
@@ -179,7 +179,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c
index 38768242e..607b40e54 100644
--- a/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c
+++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c
@@ -182,7 +182,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c
index 78ffe1758..3299b5f8e 100644
--- a/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c
+++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c
@@ -182,7 +182,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c
index 7c4cea976..b02ff0955 100644
--- a/frame/3/trsm/other/bli_trsm_lu_ker_var2.c
+++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c
@@ -179,7 +179,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c
index 8d050c62b..e78cef477 100644
--- a/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c
+++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c
@@ -182,7 +182,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c
index b49a1144e..93cac371a 100644
--- a/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c
+++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c
@@ -182,7 +182,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trsm/other/bli_trsm_rl_ker_var2.c b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c
index a11936389..1e903c3c1 100644
--- a/frame/3/trsm/other/bli_trsm_rl_ker_var2.c
+++ b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c
@@ -184,7 +184,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trsm/other/bli_trsm_ru_ker_var2.c b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c
index 7ad1e4271..a44d64f45 100644
--- a/frame/3/trsm/other/bli_trsm_ru_ker_var2.c
+++ b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c
@@ -184,7 +184,7 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c
index 3a698871b..218325d5a 100644
--- a/frame/base/bli_cntx.c
+++ b/frame/base/bli_cntx.c
@@ -43,253 +43,76 @@ void bli_cntx_clear( cntx_t* cntx )
 
 // -----------------------------------------------------------------------------
 
-void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
+void bli_cntx_set_blkszs( cntx_t* cntx, ... )
 {
 	// This function can be called from the bli_cntx_init_*() function for
 	// a particular architecture if the kernel developer wishes to use
 	// non-default blocksizes. It should be called after
-	// bli_cntx_init_defaults() so that the context begins with default
-	// blocksizes across all datatypes.
+	// bli_cntx_init_<subconfig>_ref() so that the context begins with
+	// default blocksizes across all datatypes.
 
 	/* Example prototypes:
 
 	   void bli_cntx_set_blkszs
 	   (
-	     ind_t   method = BLIS_NAT,
-	     dim_t   n_bs,
+	     cntx_t* cntx,
 	     bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id,
 	     bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id,
 	     bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id,
-	     ...
-	     cntx_t* cntx
-	   );
-
-	   void bli_cntx_set_blkszs
-	   (
-	     ind_t   method != BLIS_NAT,
-	     dim_t   n_bs,
-	     bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, dim_t def_scalr0, dim_t max_scalr0,
-	     bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, dim_t def_scalr1, dim_t max_scalr1,
-	     bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, dim_t def_scalr2, dim_t max_scalr2,
-	     ...
-	     cntx_t* cntx
+	     ...,
+	     BLIS_VA_END
 	   );
 	*/
 
-	va_list   args;
-	dim_t     i;
-	err_t     r_val;
-
-	// Allocate some temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_blkszs(): " );
-	#endif
-	bszid_t*  bszids = bli_malloc_intl( n_bs * sizeof( bszid_t  ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_blkszs(): " );
-	#endif
-	blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_blkszs(): " );
-	#endif
-	bszid_t*  bmults = bli_malloc_intl( n_bs * sizeof( bszid_t  ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_blkszs(): " );
-	#endif
-	double*   dsclrs = bli_malloc_intl( n_bs * sizeof( double   ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_blkszs(): " );
-	#endif
-	double*   msclrs = bli_malloc_intl( n_bs * sizeof( double   ), &r_val );
-
-	// -- Begin variable argument section --
-
-	// Initialize variable argument environment.
-	va_start( args, n_bs );
-
-	// Handle native and induced method cases separately.
-	if ( method == BLIS_NAT )
-	{
-		// Process n_bs tuples.
-		for ( i = 0; i < n_bs; ++i )
-		{
-			// Here, we query the variable argument list for:
-			// - the bszid_t of the blocksize we're about to process,
-			// - the address of the blksz_t object,
-			// - the bszid_t of the multiple we need to associate with
-			//   the blksz_t object.
-			bszid_t  bs_id = ( bszid_t  )va_arg( args, bszid_t  );
-			blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* );
-			bszid_t  bm_id = ( bszid_t  )va_arg( args, bszid_t  );
-
-			// Store the values in our temporary arrays.
-			bszids[ i ] = bs_id;
-			blkszs[ i ] = blksz;
-			bmults[ i ] = bm_id;
-		}
-	}
-	else // if induced method execution was indicated
-	{
-		// Process n_bs tuples.
-		for ( i = 0; i < n_bs; ++i )
-		{
-			// Here, we query the variable argument list for:
-			// - the bszid_t of the blocksize we're about to process,
-			// - the address of the blksz_t object,
-			// - the bszid_t of the multiple we  need to associate with
-			//   the blksz_t object,
-			// - the scalars we wish to apply to the real blocksizes to
-			//   come up with the induced complex blocksizes (for default
-			//   and maximum blocksizes).
-			bszid_t  bs_id = ( bszid_t  )va_arg( args, bszid_t  );
-			blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* );
-			bszid_t  bm_id = ( bszid_t  )va_arg( args, bszid_t  );
-			double   dsclr = ( double   )va_arg( args, double   );
-			double   msclr = ( double   )va_arg( args, double   );
-
-			// Store the values in our temporary arrays.
-			bszids[ i ] = bs_id;
-			blkszs[ i ] = blksz;
-			bmults[ i ] = bm_id;
-			dsclrs[ i ] = dsclr;
-			msclrs[ i ] = msclr;
-		}
-	}
-
-	// The last argument should be the context pointer.
-	cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );
-
-	// Shutdown variable argument environment and clean up stack.
-	va_end( args );
-
-	// -- End variable argument section --
-
 	// Save the execution type into the context.
-	bli_cntx_set_method( method, cntx );
+	bli_cntx_set_method( BLIS_NAT, cntx );
 
 	// Query the context for the addresses of:
 	// - the blocksize object array
 	// - the blocksize multiple array
-
 	blksz_t* cntx_blkszs = bli_cntx_blkszs_buf( cntx );
 	bszid_t* cntx_bmults = bli_cntx_bmults_buf( cntx );
 
-	// Now that we have the context address, we want to copy the values
-	// from the temporary buffers into the corresponding buffers in the
-	// context. Notice that the blksz_t* pointers were saved, rather than
-	// the objects themselves, but we copy the contents of the objects
-	// when copying into the context.
+	// Initialize variable argument environment.
+	va_list args;
+	va_start( args, cntx );
 
-	// Handle native and induced method cases separately.
-	if ( method == BLIS_NAT )
+	// Process blocksizes until we get a BLIS_VA_END.
+	while ( true )
 	{
-		// Process each blocksize id tuple provided.
-		for ( i = 0; i < n_bs; ++i )
-		{
-			// Read the current blocksize id, blksz_t* pointer, blocksize
-			// multiple id, and blocksize scalar.
-			bszid_t  bs_id = bszids[ i ];
-			bszid_t  bm_id = bmults[ i ];
-
-			blksz_t* blksz = blkszs[ i ];
+		int bs_id0 = va_arg( args, int );
 
-			blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
+		// If we find a bszid_t id of BLIS_VA_END, then we are done.
+		if ( bs_id0 == BLIS_VA_END ) break;
 
-			// Copy the blksz_t object contents into the appropriate
-			// location within the context's blksz_t array. Do the same
-			// for the blocksize multiple id.
-			//cntx_blkszs[ bs_id ] = *blksz;
-			//bli_blksz_copy( blksz, cntx_blksz );
-			bli_blksz_copy_if_pos( blksz, cntx_blksz );
+		// Here, we query the variable argument list for:
+		// - the bszid_t of the blocksize we're about to process (already done),
+		// - the address of the blksz_t object,
+		// - the bszid_t of the multiple we need to associate with
+		//   the blksz_t object.
+		bszid_t  bs_id = ( bszid_t  )bs_id0;
+		blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* );
+		bszid_t  bm_id = ( bszid_t  )va_arg( args, bszid_t  );
 
-			// Copy the blocksize multiple id into the context.
-			cntx_bmults[ bs_id ] = bm_id;
-		}
-	}
-	else
-	{
-		// Process each blocksize id tuple provided.
-		for ( i = 0; i < n_bs; ++i )
-		{
-			// Read the current blocksize id, blksz_t pointer, blocksize
-			// multiple id, and blocksize scalar.
-			bszid_t  bs_id = bszids[ i ];
-			bszid_t  bm_id = bmults[ i ];
-			double   dsclr = dsclrs[ i ];
-			double   msclr = msclrs[ i ];
-
-			blksz_t* blksz = blkszs[ i ];
-
-			blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
-
-			// Copy the real domain values of the source blksz_t object into
-			// the context, duplicating into the complex domain fields.
-			bli_blksz_copy_dt( BLIS_FLOAT,  blksz, BLIS_FLOAT,    cntx_blksz );
-			bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DOUBLE,   cntx_blksz );
-			bli_blksz_copy_dt( BLIS_FLOAT,  blksz, BLIS_SCOMPLEX, cntx_blksz );
-			bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DCOMPLEX, cntx_blksz );
-
-			// If the default blocksize scalar is non-unit, we need to scale
-			// the complex domain default blocksizes.
-			if ( dsclr != 1.0 )
-			{
-				// Scale the complex domain default blocksize values in the
-				// blocksize object.
-				bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz );
-				bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz );
-			}
-
-			// Similarly, if the maximum blocksize scalar is non-unit, we need
-			// to scale the complex domain maximum blocksizes.
-			if ( msclr != 1.0 )
-			{
-				// Scale the complex domain maximum blocksize values in the
-				// blocksize object.
-				bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz );
-				bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz );
-			}
-
-			// Copy the blocksize multiple id into the context.
-			cntx_bmults[ bs_id ] = bm_id;
-		}
+		// Copy the blksz_t object contents into the appropriate
+		// location within the context's blksz_t array. Do the same
+		// for the blocksize multiple id.
+		//cntx_blkszs[ bs_id ] = *blksz;
+		//bli_blksz_copy( blksz, cntx_blksz );
+		blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
+		bli_blksz_copy_if_pos( blksz, cntx_blksz );
+
+		// Copy the blocksize multiple id into the context.
+		cntx_bmults[ bs_id ] = bm_id;
 	}
 
-	// Free the temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_blkszs(): " );
-	#endif
-	bli_free_intl( blkszs );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_blkszs(): " );
-	#endif
-	bli_free_intl( bszids );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_blkszs(): " );
-	#endif
-	bli_free_intl( bmults );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_blkszs(): " );
-	#endif
-	bli_free_intl( dsclrs );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_blkszs(): " );
-	#endif
-	bli_free_intl( msclrs );
+	// Shutdown variable argument environment and clean up stack.
+	va_end( args );
 }
 
 // -----------------------------------------------------------------------------
 
-void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... )
+void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* cntx, ... )
 {
 	/* Example prototypes:
 
@@ -297,1269 +120,268 @@ void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... )
 	   (
 	     ind_t   method != BLIS_NAT,
 	     num_t   dt,
-	     dim_t   n_bs,
+	     cntx_t* cntx,
 	     bszid_t bs0_id, dim_t def_scalr0, dim_t max_scalr0,
 	     bszid_t bs1_id, dim_t def_scalr1, dim_t max_scalr1,
 	     bszid_t bs2_id, dim_t def_scalr2, dim_t max_scalr2,
-	     ...
-	     cntx_t* cntx
+	     ...,
+	     BLIS_VA_END
 	   );
-	
+
 		NOTE: This function modifies an existing context that is presumed
 		to have been initialized for native execution.
 	*/
 
-	va_list   args;
-	dim_t     i;
-	err_t     r_val;
-
 	// Project the given datatype to the real domain. This will be used later on.
 	num_t dt_real = bli_dt_proj_to_real( dt );
 
 	// Return early if called with BLIS_NAT.
 	if ( method == BLIS_NAT ) return;
 
-	// Allocate some temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_ind_blkszs(): " );
-	#endif
-	bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t  ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_ind_blkszs(): " );
-	#endif
-	double*  dsclrs = bli_malloc_intl( n_bs * sizeof( double   ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_ind_blkszs(): " );
-	#endif
-	double*  msclrs = bli_malloc_intl( n_bs * sizeof( double   ), &r_val );
-
-	// -- Begin variable argument section --
+	// Save the execution type into the context.
+	bli_cntx_set_method( method, cntx );
 
 	// Initialize variable argument environment.
-	va_start( args, n_bs );
+	va_list args;
+	va_start( args, cntx );
 
+	// Process blocksizes until we get a BLIS_VA_END.
+	while ( true )
 	{
-		// Process n_bs tuples.
-		for ( i = 0; i < n_bs; ++i )
-		{
-			// Here, we query the variable argument list for:
-			// - the bszid_t of the blocksize we're about to process,
-			// - the scalars we wish to apply to the real blocksizes to
-			//   come up with the induced complex blocksizes (for default
-			//   and maximum blocksizes).
-			bszid_t  bs_id = ( bszid_t )va_arg( args, bszid_t  );
-			double   dsclr = ( double  )va_arg( args, double   );
-			double   msclr = ( double  )va_arg( args, double   );
-
-			// Store the values in our temporary arrays.
-			bszids[ i ] = bs_id;
-			dsclrs[ i ] = dsclr;
-			msclrs[ i ] = msclr;
-		}
-	}
-
-	// The last argument should be the context pointer.
-	cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );
-
-	// Shutdown variable argument environment and clean up stack.
-	va_end( args );
-
-	// -- End variable argument section --
+		int bs_id0 = va_arg( args, int );
 
-	// Save the execution type into the context.
-	bli_cntx_set_method( method, cntx );
+		// If we find a bszid_t id of BLIS_VA_END, then we are done.
+		if ( bs_id0 == BLIS_VA_END ) break;
 
-	// Now that we have the context address, we want to copy the values
-	// from the temporary buffers into the corresponding buffers in the
-	// context.
-
-	{
-		// Process each blocksize id tuple provided.
-		for ( i = 0; i < n_bs; ++i )
+		// Here, we query the variable argument list for:
+		// - the bszid_t of the blocksize we're about to process (already done),
+		// - the scalars we wish to apply to the real blocksizes to
+		//   come up with the induced complex blocksizes (for default
+		//   and maximum blocksizes).
+		bszid_t bs_id = ( bszid_t )bs_id0;
+		double  dsclr = ( double  )va_arg( args, double );
+		double  msclr = ( double  )va_arg( args, double );
+
+		// Query the context for the blksz_t object associated with the
+		// current blocksize id, and also query the object corresponding
+		// to the blocksize multiple.
+		blksz_t* cntx_blksz = bli_cntx_get_blksz( bs_id, cntx );
+
+		// Copy the real domain value of the blksz_t object into the
+		// corresponding complex domain slot of the same object.
+		bli_blksz_copy_dt( dt_real, cntx_blksz, dt, cntx_blksz );
+
+		// If the default blocksize scalar is non-unit, we need to scale
+		// the complex domain default blocksizes.
+		if ( dsclr != 1.0 )
 		{
-			// Read the current blocksize id, blocksize multiple id,
-			// and blocksize scalar.
-			bszid_t  bs_id = bszids[ i ];
-			double   dsclr = dsclrs[ i ];
-			double   msclr = msclrs[ i ];
-
-			//blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
-
-			// Query the context for the blksz_t object assoicated with the
-			// current blocksize id, and also query the object corresponding
-			// to the blocksize multiple.
-			blksz_t* cntx_blksz = bli_cntx_get_blksz( bs_id, cntx );
-
-			// Copy the real domain value of the blksz_t object into the
-			// corresponding complex domain slot of the same object.
-			bli_blksz_copy_dt( dt_real, cntx_blksz, dt, cntx_blksz );
-
-			// If the default blocksize scalar is non-unit, we need to scale
-			// the complex domain default blocksizes.
-			if ( dsclr != 1.0 )
-			{
-				// Scale the default blocksize value corresponding to the given
-				// datatype.
-				bli_blksz_scale_def( 1, ( dim_t )dsclr, dt, cntx_blksz );
-			}
-
-			// Similarly, if the maximum blocksize scalar is non-unit, we need
-			// to scale the complex domain maximum blocksizes.
-			if ( msclr != 1.0 )
-			{
-				// Scale the maximum blocksize value corresponding to the given
-				// datatype.
-				bli_blksz_scale_max( 1, ( dim_t )msclr, dt, cntx_blksz );
-			}
+			// Scale the default blocksize value corresponding to the given
+			// datatype.
+			bli_blksz_scale_def( 1, ( dim_t )dsclr, dt, cntx_blksz );
 		}
-	}
-
-	// Free the temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_ind_blkszs(): " );
-	#endif
-	bli_free_intl( bszids );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_ind_blkszs(): " );
-	#endif
-	bli_free_intl( dsclrs );
 
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_ind_blkszs(): " );
-	#endif
-	bli_free_intl( msclrs );
-}
-
-// -----------------------------------------------------------------------------
-
-void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... )
-{
-	// This function can be called from the bli_cntx_init_*() function for
-	// a particular architecture if the kernel developer wishes to use
-	// non-default level-3 microkernels. It should be called after
-	// bli_cntx_init_defaults() so that the context begins with default
-	// microkernels across all datatypes.
-
-	/* Example prototypes:
-
-	   void bli_cntx_set_l3_nat_ukrs
-	   (
-	     dim_t   n_ukrs,
-	     l3ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp, bool pref0,
-	     l3ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp, bool pref1,
-	     l3ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp, bool pref2,
-	     ...
-	     cntx_t* cntx
-	   );
-	*/
-
-	va_list   args;
-	dim_t     i;
-	err_t     r_val;
-
-	// Allocate some temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_nat_ukrs(): " );
-	#endif
-	l3ukr_t* ukr_ids   = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_nat_ukrs(): " );
-	#endif
-	num_t*   ukr_dts   = bli_malloc_intl( n_ukrs * sizeof( num_t   ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_nat_ukrs(): " );
-	#endif
-	void_fp* ukr_fps   = bli_malloc_intl( n_ukrs * sizeof( void_fp ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_nat_ukrs(): " );
-	#endif
-	bool*    ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool    ), &r_val );
-
-	// -- Begin variable argument section --
-
-	// Initialize variable argument environment.
-	va_start( args, n_ukrs );
-
-	// Process n_ukrs tuples.
-	for ( i = 0; i < n_ukrs; ++i )
-	{
-		// Here, we query the variable argument list for:
-		// - the l3ukr_t of the kernel we're about to process,
-		// - the datatype of the kernel,
-		// - the kernel function pointer, and
-		// - the kernel function storage preference
-		// that we need to store to the context.
-
-		// NOTE: Though bool_t is no longer used, the following comment is
-		// being kept for historical reasons.
-		// The type that we pass into the va_arg() macro for the ukr
-		// preference matters. Using 'bool_t' may cause breakage on 64-bit
-		// systems that define int as 32 bits and long int and pointers as
-		// 64 bits. The problem is that TRUE or FALSE are defined as 1 and
-		// 0, respectively, and when "passed" into the variadic function
-		// they come with no contextual typecast. Thus, default rules of
-		// argument promotion kick in to treat these integer literals as
-		// being of type int. Thus, we need to let va_arg() treat the TRUE
-		// or FALSE value as an int, even if we cast it to and store it
-		// within a bool_t afterwards.
-		const l3ukr_t  ukr_id   = ( l3ukr_t )va_arg( args, l3ukr_t );
-		const num_t    ukr_dt   = ( num_t   )va_arg( args, num_t   );
-		      void_fp  ukr_fp   = ( void_fp )va_arg( args, void_fp );
-		const bool     ukr_pref = ( bool    )va_arg( args, int     );
-
-		// Store the values in our temporary arrays.
-		ukr_ids[ i ]   = ukr_id;
-		ukr_dts[ i ]   = ukr_dt;
-		ukr_fps[ i ]   = ukr_fp;
-		ukr_prefs[ i ] = ukr_pref;
+		// Similarly, if the maximum blocksize scalar is non-unit, we need
+		// to scale the complex domain maximum blocksizes.
+		if ( msclr != 1.0 )
+		{
+			// Scale the maximum blocksize value corresponding to the given
+			// datatype.
+			bli_blksz_scale_max( 1, ( dim_t )msclr, dt, cntx_blksz );
+		}
 	}
 
-	// The last argument should be the context pointer.
-	cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );
-
 	// Shutdown variable argument environment and clean up stack.
 	va_end( args );
-
-	// -- End variable argument section --
-
-	// Query the context for the addresses of:
-	// - the l3 virtual ukernel func_t array
-	// - the l3 native ukernel func_t array
-	// - the l3 native ukernel preferences array
-	func_t*  cntx_l3_vir_ukrs       = bli_cntx_l3_vir_ukrs_buf( cntx );
-	func_t*  cntx_l3_nat_ukrs       = bli_cntx_l3_nat_ukrs_buf( cntx );
-	mbool_t* cntx_l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );
-
-	// Now that we have the context address, we want to copy the values
-	// from the temporary buffers into the corresponding buffers in the
-	// context.
-
-	// Process each blocksize id tuple provided.
-	for ( i = 0; i < n_ukrs; ++i )
-	{
-		// Read the current ukernel id, ukernel datatype, ukernel function
-		// pointer, and ukernel preference.
-		const l3ukr_t ukr_id   = ukr_ids[ i ];
-		const num_t   ukr_dt   = ukr_dts[ i ];
-		      void_fp ukr_fp   = ukr_fps[ i ];
-		const bool    ukr_pref = ukr_prefs[ i ];
-
-		// Index into the func_t and mbool_t for the current kernel id
-		// being processed.
-		func_t*       vukrs  = &cntx_l3_vir_ukrs[ ukr_id ];
-		func_t*       ukrs   = &cntx_l3_nat_ukrs[ ukr_id ];
-		mbool_t*      prefs  = &cntx_l3_nat_ukrs_prefs[ ukr_id ];
-
-		// Store the ukernel function pointer and preference values into
-		// the context. Notice that we redundantly store the native
-		// ukernel address in both the native and virtual ukernel slots
-		// in the context. This is standard practice when creating a
-		// native context. (Induced method contexts will overwrite the
-		// virtual function pointer with the address of the appropriate
-		// virtual ukernel.)
-		bli_func_set_dt( ukr_fp, ukr_dt, vukrs );
-		bli_func_set_dt( ukr_fp, ukr_dt, ukrs );
-		bli_mbool_set_dt( ukr_pref, ukr_dt, prefs );
-	}
-
-	// Free the temporary local arrays.
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_nat_ukrs(): " );
-	#endif
-	bli_free_intl( ukr_ids );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_nat_ukrs(): " );
-	#endif
-	bli_free_intl( ukr_dts );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_nat_ukrs(): " );
-	#endif
-	bli_free_intl( ukr_fps );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_nat_ukrs(): " );
-	#endif
-	bli_free_intl( ukr_prefs );
 }
 
 // -----------------------------------------------------------------------------
 
-void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... )
+void bli_cntx_set_ukrs( cntx_t* cntx , ... )
 {
 	// This function can be called from the bli_cntx_init_*() function for
 	// a particular architecture if the kernel developer wishes to use
-	// non-default level-3 virtual microkernels. It should be called after
-	// bli_cntx_init_defaults() so that the context begins with default
-	// microkernels across all datatypes.
+	// non-default microkernels. It should be called after
+	// bli_cntx_init_<subconfig>_ref() so that the context begins with
+	// default microkernels across all datatypes.
 
 	/* Example prototypes:
 
-	   void bli_cntx_set_l3_vir_ukrs
+	   void bli_cntx_set_ukrs
 	   (
-	     dim_t   n_ukrs,
-	     l3ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp,
-	     l3ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp,
-	     l3ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp,
-	     ...
-	     cntx_t* cntx
+	     cntx_t* cntx,
+	     ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp,
+	     ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp,
+	     ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp,
+	     ...,
+	     BLIS_VA_END
 	   );
 	*/
 
-	va_list   args;
-	dim_t     i;
-	err_t     r_val;
-
-	// Allocate some temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_vir_ukrs(): " );
-	#endif
-	l3ukr_t* ukr_ids   = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_vir_ukrs(): " );
-	#endif
-	num_t*   ukr_dts   = bli_malloc_intl( n_ukrs * sizeof( num_t   ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_vir_ukrs(): " );
-	#endif
-	void_fp* ukr_fps   = bli_malloc_intl( n_ukrs * sizeof( void_fp ), &r_val );
-
-	// -- Begin variable argument section --
+	// Query the context for the address of the ukernel func_t array
+	func_t*  cntx_ukrs = bli_cntx_ukrs_buf( cntx );
 
 	// Initialize variable argument environment.
-	va_start( args, n_ukrs );
+	va_list   args;
+	va_start( args, cntx );
 
-	// Process n_ukrs tuples.
-	for ( i = 0; i < n_ukrs; ++i )
+	// Process ukernels until BLIS_VA_END is reached.
+	while ( true )
 	{
-		// Here, we query the variable argument list for:
-		// - the l3ukr_t of the kernel we're about to process,
-		// - the datatype of the kernel, and
-		// - the kernel function pointer.
-		// that we need to store to the context.
-		const l3ukr_t  ukr_id   = ( l3ukr_t )va_arg( args, l3ukr_t );
-		const num_t    ukr_dt   = ( num_t   )va_arg( args, num_t   );
-		      void_fp  ukr_fp   = ( void_fp )va_arg( args, void_fp );
-
-		// Store the values in our temporary arrays.
-		ukr_ids[ i ]   = ukr_id;
-		ukr_dts[ i ]   = ukr_dt;
-		ukr_fps[ i ]   = ukr_fp;
-	}
+		const int ukr_id0 = va_arg( args, int );
 
-	// The last argument should be the context pointer.
-	cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );
+		// If we find a ukernel id of BLIS_VA_END, then we are done.
+		if ( ukr_id0 == BLIS_VA_END ) break;
 
-	// Shutdown variable argument environment and clean up stack.
-	va_end( args );
-
-	// -- End variable argument section --
-
-	// Query the context for the addresses of:
-	// - the l3 virtual ukernel func_t array
-	func_t*  cntx_l3_vir_ukrs       = bli_cntx_l3_vir_ukrs_buf( cntx );
-
-	// Now that we have the context address, we want to copy the values
-	// from the temporary buffers into the corresponding buffers in the
-	// context.
-
-	// Process each blocksize id tuple provided.
-	for ( i = 0; i < n_ukrs; ++i )
-	{
-		// Read the current ukernel id, ukernel datatype, ukernel function
-		// pointer, and ukernel preference.
-		const l3ukr_t ukr_id   = ukr_ids[ i ];
-		const num_t   ukr_dt   = ukr_dts[ i ];
-		      void_fp ukr_fp   = ukr_fps[ i ];
+		// Here, we query the variable argument list for:
+		// - the ukr_t of the kernel we're about to process (already done),
+		// - the datatype of the kernel, and
+		// - the kernel function pointer
+		const ukr_t   ukr_id = ( ukr_t   )ukr_id0;
+		const num_t   ukr_dt = ( num_t   )va_arg( args, num_t   );
+		      void_fp ukr_fp = ( void_fp )va_arg( args, void_fp );
 
 		// Index into the func_t and mbool_t for the current kernel id
 		// being processed.
-		func_t*       vukrs  = &cntx_l3_vir_ukrs[ ukr_id ];
+		func_t* ukrs = &cntx_ukrs[ ukr_id ];
 
-		// Store the ukernel function pointer and preference values into
-		// the context. Notice that we redundantly store the native
+		// Store the ukernel function pointer into the context.
+		// Notice that we redundantly store the native
 		// ukernel address in both the native and virtual ukernel slots
 		// in the context. This is standard practice when creating a
 		// native context. (Induced method contexts will overwrite the
 		// virtual function pointer with the address of the appropriate
 		// virtual ukernel.)
-		bli_func_set_dt( ukr_fp, ukr_dt, vukrs );
-	}
-
-	// Free the temporary local arrays.
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_vir_ukrs(): " );
-	#endif
-	bli_free_intl( ukr_ids );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_vir_ukrs(): " );
-	#endif
-	bli_free_intl( ukr_dts );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_vir_ukrs(): " );
-	#endif
-	bli_free_intl( ukr_fps );
-}
-
-// -----------------------------------------------------------------------------
-
-void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... )
-{
-	// This function can be called from the bli_cntx_init_*() function for
-	// a particular architecture if the kernel developer wishes to use
-	// non-default thresholds for small/unpacked matrix handling. It should
-	// be called after bli_cntx_init_defaults() so that the context begins
-	// with default thresholds.
-
-	/* Example prototypes:
-
-	   void bli_cntx_set_l3_sup_thresh
-	   (
-	     dim_t      n_thresh,
-	     threshid_t th0_id, blksz_t* blksz0,
-	     threshid_t th1_id, blksz_t* blksz1,
-	     ...
-	     cntx_t* cntx
-	   );
-
-	*/
-
-	va_list   args;
-	dim_t     i;
-	err_t     r_val;
-
-	// Allocate some temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_sup_thresh(): " );
-	#endif
-	threshid_t* threshids = bli_malloc_intl( n_thresh * sizeof( threshid_t ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_sup_thresh(): " );
-	#endif
-	blksz_t**   threshs = bli_malloc_intl( n_thresh * sizeof( blksz_t* ), &r_val );
-
-	// -- Begin variable argument section --
-
-	// Initialize variable argument environment.
-	va_start( args, n_thresh );
-
-	// Process n_thresh tuples.
-	for ( i = 0; i < n_thresh; ++i )
-	{
-		// Here, we query the variable argument list for:
-		// - the threshid_t of the threshold we're about to process,
-		// - the address of the blksz_t object,
-		threshid_t th_id  = ( threshid_t )va_arg( args, threshid_t );
-		blksz_t*   thresh = ( blksz_t*   )va_arg( args, blksz_t*   );
-
-		// Store the values in our temporary arrays.
-		threshids[ i ] = th_id;
-		threshs[ i ]   = thresh;
-	}
-
-	// The last argument should be the context pointer.
-	cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );
-
-	// Shutdown variable argument environment and clean up stack.
-	va_end( args );
-
-	// -- End variable argument section --
-
-	// Query the context for the addresses of:
-	// - the threshold array
-	blksz_t* cntx_threshs = bli_cntx_l3_sup_thresh_buf( cntx );
-
-	// Now that we have the context address, we want to copy the values
-	// from the temporary buffers into the corresponding buffers in the
-	// context. Notice that the blksz_t* pointers were saved, rather than
-	// the objects themselves, but we copy the contents of the objects
-	// when copying into the context.
-
-	// Process each blocksize id tuple provided.
-	for ( i = 0; i < n_thresh; ++i )
-	{
-		// Read the current blocksize id, blksz_t* pointer, blocksize
-		// multiple id, and blocksize scalar.
-		threshid_t th_id  = threshids[ i ];
-		blksz_t*   thresh = threshs[ i ];
-
-		blksz_t* cntx_thresh = &cntx_threshs[ th_id ];
-
-		// Copy the blksz_t object contents into the appropriate
-		// location within the context's blksz_t array.
-		//cntx_threshs[ th_id ] = *thresh;
-		//bli_blksz_copy( thresh, cntx_thresh );
-		bli_blksz_copy_if_pos( thresh, cntx_thresh );
-	}
-
-	// Free the temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_sup_thresh(): " );
-	#endif
-	bli_free_intl( threshs );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_sup_thresh(): " );
-	#endif
-	bli_free_intl( threshids );
-}
-
-// -----------------------------------------------------------------------------
-
-void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... )
-{
-	// This function can be called from the bli_cntx_init_*() function for
-	// a particular architecture if the kernel developer wishes to use
-	// non-default level-3 operation handler for small/unpacked matrices. It
-	// should be called after bli_cntx_init_defaults() so that the context
-	// begins with default sup handlers across all datatypes.
-
-	/* Example prototypes:
-
-	   void bli_cntx_set_l3_sup_handlers
-	   (
-	     dim_t   n_ops,
-	     opid_t  op0_id, void* handler0_fp,
-	     opid_t  op1_id, void* handler1_fp,
-	     opid_t  op2_id, void* handler2_fp,
-	     ...
-	     cntx_t* cntx
-	   );
-	*/
-
-	va_list   args;
-	dim_t     i;
-	err_t     r_val;
-
-	// Allocate some temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_sup_handlers(): " );
-	#endif
-	opid_t* op_ids = bli_malloc_intl( n_ops * sizeof( opid_t ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_sup_handlers(): " );
-	#endif
-	void**  op_fps = bli_malloc_intl( n_ops * sizeof( void*  ), &r_val );
-
-	// -- Begin variable argument section --
-
-	// Initialize variable argument environment.
-	va_start( args, n_ops );
-
-	// Process n_ukrs tuples.
-	for ( i = 0; i < n_ops; ++i )
-	{
-		// Here, we query the variable argument list for:
-		// - the opid_t of the operation we're about to process,
-		// - the sup handler function pointer
-		// that we need to store to the context.
-		const opid_t op_id = ( opid_t )va_arg( args, opid_t );
-		      void*  op_fp = ( void*  )va_arg( args, void*  );
+		bli_func_set_dt( ukr_fp, ukr_dt, ukrs );
 
-		// Store the values in our temporary arrays.
-		op_ids[ i ] = op_id;
-		op_fps[ i ] = op_fp;
+		// Locate the virtual ukernel func_t pointer that corresponds to the
+		// ukernel id provided by the caller.
+		switch ( ukr_id )
+		{
+			case BLIS_GEMM_UKR:       ukrs = &cntx_ukrs[ BLIS_GEMM_VIR_UKR ]; break;
+			case BLIS_GEMMTRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_L_VIR_UKR ]; break;
+			case BLIS_GEMMTRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_U_VIR_UKR ]; break;
+			case BLIS_TRSM_L_UKR:     ukrs = &cntx_ukrs[ BLIS_TRSM_L_VIR_UKR ]; break;
+			case BLIS_TRSM_U_UKR:     ukrs = &cntx_ukrs[ BLIS_TRSM_U_VIR_UKR ]; break;
+		    default:                  ukrs = NULL; break;
+		};
+
+		if ( ukrs )
+			bli_func_set_dt( ukr_fp, ukr_dt, ukrs );
 	}
 
-	// The last argument should be the context pointer.
-	cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );
-
 	// Shutdown variable argument environment and clean up stack.
 	va_end( args );
-
-	// -- End variable argument section --
-
-	// Query the context for the addresses of:
-	// - the l3 small/unpacked handlers array
-	void** cntx_l3_sup_handlers = bli_cntx_l3_sup_handlers_buf( cntx );
-
-	// Now that we have the context address, we want to copy the values
-	// from the temporary buffers into the corresponding buffers in the
-	// context.
-
-	// Process each operation id tuple provided.
-	for ( i = 0; i < n_ops; ++i )
-	{
-		// Read the current operation id and handler function pointer.
-		const opid_t op_id = op_ids[ i ];
-		      void*  op_fp = op_fps[ i ];
-
-		// Store the sup handler function pointer into the slot for the
-		// specified operation id.
-		cntx_l3_sup_handlers[ op_id ] = op_fp;
-	}
-
-	// Free the temporary local arrays.
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_sup_handlers(): " );
-	#endif
-	bli_free_intl( op_ids );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_sup_handlers(): " );
-	#endif
-	bli_free_intl( op_fps );
 }
 
 // -----------------------------------------------------------------------------
 
-void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... )
+void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... )
 {
 	// This function can be called from the bli_cntx_init_*() function for
 	// a particular architecture if the kernel developer wishes to use
-	// non-default l3 sup blocksizes. It should be called after
-	// bli_cntx_init_defaults() so that the context begins with default
-	// blocksizes across all datatypes.
+	// non-default microkernel preferences. It should be called after
+	// bli_cntx_init_<subconfig>_ref() so that the context begins with
+	// default preferences across all datatypes.
 
 	/* Example prototypes:
 
-	   void bli_cntx_set_blkszs
+	   void bli_cntx_set_ukr_prefs
 	   (
-	     dim_t   n_bs,
-	     bszid_t bs0_id, blksz_t* blksz0,
-	     bszid_t bs1_id, blksz_t* blksz1,
-	     bszid_t bs2_id, blksz_t* blksz2,
-	     ...
-	     cntx_t* cntx
+	     cntx_t* cntx,
+	     ukr_pref_t ukr_pref0_id, num_t dt0, bool ukr_pref0,
+	     ukr_pref_t ukr_pref1_id, num_t dt1, bool ukr_pref1,
+	     ukr_pref_t ukr_pref2_id, num_t dt2, bool ukr_pref2,
+	     ...,
+	     BLIS_VA_END
 	   );
 	*/
 
-	va_list   args;
-	dim_t     i;
-	err_t     r_val;
-
-	// Allocate some temporary local arrays.
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_blkszs(): " );
-	#endif
-	bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t  ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_blkszs(): " );
-	#endif
-	blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ), &r_val );
-
-	// -- Begin variable argument section --
+	// Query the context for the address of the ukernel preference mbool_t array
+	mbool_t* cntx_ukr_prefs = bli_cntx_ukr_prefs_buf( cntx );
 
 	// Initialize variable argument environment.
-	va_start( args, n_bs );
-
-	// Process n_bs tuples.
-	for ( i = 0; i < n_bs; ++i )
-	{
-		// Here, we query the variable argument list for:
-		// - the bszid_t of the blocksize we're about to process,
-		// - the address of the blksz_t object.
-		bszid_t  bs_id = ( bszid_t  )va_arg( args, bszid_t  );
-		blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* );
-
-		// Store the values in our temporary arrays.
-		bszids[ i ] = bs_id;
-		blkszs[ i ] = blksz;
-	}
-
-	// The last argument should be the context pointer.
-	cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );
-
-	// Shutdown variable argument environment and clean up stack.
-	va_end( args );
-
-	// -- End variable argument section --
-
-	// Query the context for the addresses of:
-	// - the blocksize object array
-	blksz_t* cntx_l3_sup_blkszs = bli_cntx_l3_sup_blkszs_buf( cntx );
-
-	// Now that we have the context address, we want to copy the values
-	// from the temporary buffers into the corresponding buffers in the
-	// context. Notice that the blksz_t* pointers were saved, rather than
-	// the objects themselves, but we copy the contents of the objects
-	// when copying into the context.
-
-	// Process each blocksize id tuple provided.
-	for ( i = 0; i < n_bs; ++i )
-	{
-		// Read the current blocksize id, blksz_t* pointer, blocksize
-		// multiple id, and blocksize scalar.
-		bszid_t  bs_id = bszids[ i ];
-		blksz_t* blksz = blkszs[ i ];
-
-		blksz_t* cntx_l3_sup_blksz = &cntx_l3_sup_blkszs[ bs_id ];
-
-		// Copy the blksz_t object contents into the appropriate
-		// location within the context's blksz_t array.
-		//cntx_l3_sup_blkszs[ bs_id ] = *blksz;
-		//bli_blksz_copy( blksz, cntx_l3_sup_blksz );
-		bli_blksz_copy_if_pos( blksz, cntx_l3_sup_blksz );
-	}
-
-	// Free the temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_blkszs(): " );
-	#endif
-	bli_free_intl( blkszs );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_blkszs(): " );
-	#endif
-	bli_free_intl( bszids );
-}
-
-// -----------------------------------------------------------------------------
-
-void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... )
-{
-	// This function can be called from the bli_cntx_init_*() function for
-	// a particular architecture if the kernel developer wishes to use
-	// non-default level-3 microkernels for small/unpacked matrices. It
-	// should be called after bli_cntx_init_defaults() so that the context
-	// begins with default sup micro/millikernels across all datatypes.
-
-	/* Example prototypes:
-
-	   void bli_cntx_set_l3_sup_kers
-	   (
-	     dim_t   n_ukrs,
-	     stor3_t stor_id0, num_t dt0, void* ukr0_fp, bool pref0,
-	     stor3_t stor_id1, num_t dt1, void* ukr1_fp, bool pref1,
-	     stor3_t stor_id2, num_t dt2, void* ukr2_fp, bool pref2,
-	     ...
-	     cntx_t* cntx
-	   );
-	*/
-
 	va_list   args;
-	dim_t     i;
-	err_t     r_val;
-
-	// Allocate some temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_sup_kers(): " );
-	#endif
-	stor3_t* st3_ids   = bli_malloc_intl( n_ukrs * sizeof( stor3_t ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_sup_kers(): " );
-	#endif
-	num_t*   ukr_dts   = bli_malloc_intl( n_ukrs * sizeof( num_t   ), &r_val );
+	va_start( args, cntx );
 
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_sup_kers(): " );
-	#endif
-	void**   ukr_fps   = bli_malloc_intl( n_ukrs * sizeof( void*   ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_sup_kers(): " );
-	#endif
-	bool*    ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool    ), &r_val );
-
-	// -- Begin variable argument section --
-
-	// Initialize variable argument environment.
-	va_start( args, n_ukrs );
-
-	// Process n_ukrs tuples.
-	for ( i = 0; i < n_ukrs; ++i )
+	// Process ukernel preferences until BLIS_VA_END is reached.
+	while ( true )
 	{
-		// Here, we query the variable argument list for:
-		// - the stor3_t storage case being assigned to the kernel we're
-		//   about to process,
-		// - the datatype of the kernel,
-		// - the kernel function pointer, and
-		// - the kernel function storage preference
-		// that we need to store to the context.
-		const stor3_t  st3_id   = ( stor3_t )va_arg( args, stor3_t );
-		const num_t    ukr_dt   = ( num_t   )va_arg( args, num_t   );
-		      void*    ukr_fp   = ( void*   )va_arg( args, void*   );
-		const bool     ukr_pref = ( bool    )va_arg( args, int     );
-
-		// Store the values in our temporary arrays.
-		st3_ids[ i ]   = st3_id;
-		ukr_dts[ i ]   = ukr_dt;
-		ukr_fps[ i ]   = ukr_fp;
-		ukr_prefs[ i ] = ukr_pref;
-	}
-
-	// The last argument should be the context pointer.
-	cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );
-
-	// Shutdown variable argument environment and clean up stack.
-	va_end( args );
-
-	// -- End variable argument section --
-
-	// Query the context for the addresses of:
-	// - the l3 small/unpacked ukernel func_t array
-	// - the l3 small/unpacked ukernel preferences array
-	func_t*  cntx_l3_sup_kers       = bli_cntx_l3_sup_kers_buf( cntx );
-	mbool_t* cntx_l3_sup_kers_prefs = bli_cntx_l3_sup_kers_prefs_buf( cntx );
-
-	// Now that we have the context address, we want to copy the values
-	// from the temporary buffers into the corresponding buffers in the
-	// context.
-
-#if 0
-	dim_t sup_map[ BLIS_NUM_LEVEL3_SUP_UKRS ][2];
-
-	// Create the small/unpacked ukernel mappings:
-	// - rv -> rrr 0, rcr 2
-	// - rg -> rrc 1, rcc 3
-	// - cv -> ccr 6, ccc 7
-	// - cg -> crr 4, crc 5
-	// - rd -> rrc 1
-	// - cd -> crc 5
-	// - rc -> rcc 3
-	// - cr -> crr 4
-	// - gx -> xxx 8
-	// NOTE: We only need to set one slot in the context l3_sup_kers array
-	// for the general-stride/generic ukernel type, but since the loop below
-	// needs to be set up to set two slots to accommodate the RV, RG, CV, and
-	// CG, ukernel types, we will just be okay with the GX ukernel being set
-	// redundantly. (The RD, CD, CR, and RC ukernel types are set redundantly
-	// for the same reason.)
-	sup_map[ BLIS_GEMMSUP_RV_UKR ][0] = BLIS_RRR;
-	sup_map[ BLIS_GEMMSUP_RV_UKR ][1] = BLIS_RCR;
-	sup_map[ BLIS_GEMMSUP_RG_UKR ][0] = BLIS_RRC;
-	sup_map[ BLIS_GEMMSUP_RG_UKR ][1] = BLIS_RCC;
-	sup_map[ BLIS_GEMMSUP_CV_UKR ][0] = BLIS_CCR;
-	sup_map[ BLIS_GEMMSUP_CV_UKR ][1] = BLIS_CCC;
-	sup_map[ BLIS_GEMMSUP_CG_UKR ][0] = BLIS_CRR;
-	sup_map[ BLIS_GEMMSUP_CG_UKR ][1] = BLIS_CRC;
-
-	sup_map[ BLIS_GEMMSUP_RD_UKR ][0] = BLIS_RRC;
-	sup_map[ BLIS_GEMMSUP_RD_UKR ][1] = BLIS_RRC;
-	sup_map[ BLIS_GEMMSUP_CD_UKR ][0] = BLIS_CRC;
-	sup_map[ BLIS_GEMMSUP_CD_UKR ][1] = BLIS_CRC;
-
-	sup_map[ BLIS_GEMMSUP_RC_UKR ][0] = BLIS_RCC;
-	sup_map[ BLIS_GEMMSUP_RC_UKR ][1] = BLIS_RCC;
-	sup_map[ BLIS_GEMMSUP_CR_UKR ][0] = BLIS_CRR;
-	sup_map[ BLIS_GEMMSUP_CR_UKR ][1] = BLIS_CRR;
-
-	sup_map[ BLIS_GEMMSUP_GX_UKR ][0] = BLIS_XXX;
-	sup_map[ BLIS_GEMMSUP_GX_UKR ][1] = BLIS_XXX;
-#endif
-
-	// Process each blocksize id tuple provided.
-	for ( i = 0; i < n_ukrs; ++i )
-	{
-		// Read the current stor3_t id, ukernel datatype, ukernel function
-		// pointer, and ukernel preference.
-		const stor3_t st3_id   = st3_ids[ i ];
-		const num_t   ukr_dt   = ukr_dts[ i ];
-		      void*   ukr_fp   = ukr_fps[ i ];
-		const bool    ukr_pref = ukr_prefs[ i ];
-
-		// Index to the func_t and mbool_t for the current stor3_t id
-		// being processed.
-		func_t*  ukrs   = &cntx_l3_sup_kers[ st3_id ];
-		mbool_t* prefs  = &cntx_l3_sup_kers_prefs[ st3_id ];
-
-		// Store the ukernel function pointer and preference values into
-		// the stor3_t location in the context.
-		bli_func_set_dt( ukr_fp, ukr_dt, ukrs );
-		bli_mbool_set_dt( ukr_pref, ukr_dt, prefs );
-	}
-
-	// Free the temporary local arrays.
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_sup_kers(): " );
-	#endif
-	bli_free_intl( st3_ids );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_sup_kers(): " );
-	#endif
-	bli_free_intl( ukr_dts );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_sup_kers(): " );
-	#endif
-	bli_free_intl( ukr_fps );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l3_sup_kers(): " );
-	#endif
-	bli_free_intl( ukr_prefs );
-}
-
-// -----------------------------------------------------------------------------
-
-void bli_cntx_set_l1f_kers( dim_t n_kers, ... )
-{
-	// This function can be called from the bli_cntx_init_*() function for
-	// a particular architecture if the kernel developer wishes to use
-	// non-default level-1f kernels. It should be called after
-	// bli_cntx_init_defaults() so that the context begins with default l1f
-	// kernels across all datatypes.
-
-	/* Example prototypes:
-
-	   void bli_cntx_set_l1f_kers
-	   (
-	     dim_t   n_ukrs,
-	     l1fkr_t ker0_id, num_t ker0_dt, void_fp ker0_fp,
-	     l1fkr_t ker1_id, num_t ker1_dt, void_fp ker1_fp,
-	     l1fkr_t ker2_id, num_t ker2_dt, void_fp ker2_fp,
-	     ...
-	     cntx_t* cntx
-	   );
-	*/
-
-	va_list   args;
-	dim_t     i;
-	err_t     r_val;
-
-	// Allocate some temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l1f_kers(): " );
-	#endif
-	l1fkr_t* ker_ids   = bli_malloc_intl( n_kers * sizeof( l1fkr_t ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l1f_kers(): " );
-	#endif
-	num_t*   ker_dts   = bli_malloc_intl( n_kers * sizeof( num_t   ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l1f_kers(): " );
-	#endif
-	void_fp* ker_fps   = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val );
-
-	// -- Begin variable argument section --
+		const int ukr_pref_id0 = va_arg( args, int );
 
-	// Initialize variable argument environment.
-	va_start( args, n_kers );
+		// If we find a ukernel pref id of BLIS_VA_END, then we are done.
+		if ( ukr_pref_id0 == BLIS_VA_END ) break;
 
-	// Process n_kers tuples.
-	for ( i = 0; i < n_kers; ++i )
-	{
 		// Here, we query the variable argument list for:
-		// - the l1fkr_t of the kernel we're about to process,
+		// - the ukr_pref_t of the preference we're about to process (already done),
 		// - the datatype of the kernel, and
 		// - the kernel function pointer
-		// that we need to store to the context.
-		const l1fkr_t  ker_id   = ( l1fkr_t )va_arg( args, l1fkr_t );
-		const num_t    ker_dt   = ( num_t   )va_arg( args, num_t   );
-		      void_fp  ker_fp   = ( void_fp )va_arg( args, void_fp );
-
-		// Store the values in our temporary arrays.
-		ker_ids[ i ]   = ker_id;
-		ker_dts[ i ]   = ker_dt;
-		ker_fps[ i ]   = ker_fp;
-	}
-
-	// The last argument should be the context pointer.
-	cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );
-
-	// Shutdown variable argument environment and clean up stack.
-	va_end( args );
-
-	// -- End variable argument section --
-
-	// Query the context for the address of:
-	// - the level-1f kernels func_t array
-	func_t* cntx_l1f_kers = bli_cntx_l1f_kers_buf( cntx );
-
-	// Now that we have the context address, we want to copy the values
-	// from the temporary buffers into the corresponding buffers in the
-	// context.
-
-	// Process each blocksize id tuple provided.
-	for ( i = 0; i < n_kers; ++i )
-	{
-		// Read the current kernel id, kernel datatype, and kernel function
-		// pointer.
-		const l1fkr_t ker_id   = ker_ids[ i ];
-		const num_t   ker_dt   = ker_dts[ i ];
-		      void_fp ker_fp   = ker_fps[ i ];
+		const ukr_pref_t ukr_pref_id = ( ukr_pref_t )ukr_pref_id0;
+		const num_t      ukr_pref_dt = ( num_t      )va_arg( args, num_t   );
+		const bool       ukr_pref    = ( bool       )va_arg( args, int );
 
 		// Index into the func_t and mbool_t for the current kernel id
 		// being processed.
-		func_t*       kers     = &cntx_l1f_kers[ ker_id ];
+		mbool_t* ukr_prefs = &cntx_ukr_prefs[ ukr_pref_id ];
 
-		// Store the ukernel function pointer and preference values into
-		// the context.
-		bli_func_set_dt( ker_fp, ker_dt, kers );
+		// Store the ukernel preference value into the context.
+		bli_mbool_set_dt( ukr_pref, ukr_pref_dt, ukr_prefs );
 	}
 
-	// Free the temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l1f_kers(): " );
-	#endif
-	bli_free_intl( ker_ids );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l1f_kers(): " );
-	#endif
-	bli_free_intl( ker_dts );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l1f_kers(): " );
-	#endif
-	bli_free_intl( ker_fps );
-}
-
-// -----------------------------------------------------------------------------
-
-void bli_cntx_set_l1v_kers( dim_t n_kers, ... )
-{
-	// This function can be called from the bli_cntx_init_*() function for
-	// a particular architecture if the kernel developer wishes to use
-	// non-default level-1v kernels. It should be called after
-	// bli_cntx_init_defaults() so that the context begins with default l1v
-	// kernels across all datatypes.
-
-	/* Example prototypes:
-
-	   void bli_cntx_set_l1v_kers
-	   (
-	     dim_t   n_ukrs,
-	     l1vkr_t ker0_id, num_t ker0_dt, void_fp ker0_fp,
-	     l1vkr_t ker1_id, num_t ker1_dt, void_fp ker1_fp,
-	     l1vkr_t ker2_id, num_t ker2_dt, void_fp ker2_fp,
-	     ...
-	     cntx_t* cntx
-	   );
-	*/
-
-	va_list   args;
-	dim_t     i;
-	err_t     r_val;
-
-	// Allocate some temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l1v_kers(): " );
-	#endif
-	l1vkr_t* ker_ids   = bli_malloc_intl( n_kers * sizeof( l1vkr_t ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l1v_kers(): " );
-	#endif
-	num_t*   ker_dts   = bli_malloc_intl( n_kers * sizeof( num_t   ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l1v_kers(): " );
-	#endif
-	void_fp* ker_fps   = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val );
-
-	// -- Begin variable argument section --
-
-	// Initialize variable argument environment.
-	va_start( args, n_kers );
-
-	// Process n_kers tuples.
-	for ( i = 0; i < n_kers; ++i )
-	{
-		// Here, we query the variable argument list for:
-		// - the l1vkr_t of the kernel we're about to process,
-		// - the datatype of the kernel, and
-		// - the kernel function pointer
-		// that we need to store to the context.
-		const l1vkr_t  ker_id   = ( l1vkr_t )va_arg( args, l1vkr_t );
-		const num_t    ker_dt   = ( num_t   )va_arg( args, num_t   );
-		      void_fp  ker_fp   = ( void_fp )va_arg( args, void_fp );
-
-		// Store the values in our temporary arrays.
-		ker_ids[ i ]   = ker_id;
-		ker_dts[ i ]   = ker_dt;
-		ker_fps[ i ]   = ker_fp;
-	}
-
-	// The last argument should be the context pointer.
-	cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );
-
 	// Shutdown variable argument environment and clean up stack.
 	va_end( args );
-
-	// -- End variable argument section --
-
-	// Query the context for the address of:
-	// - the level-1v kernels func_t array
-	func_t* cntx_l1v_kers = bli_cntx_l1v_kers_buf( cntx );
-
-	// Now that we have the context address, we want to copy the values
-	// from the temporary buffers into the corresponding buffers in the
-	// context.
-
-	// Process each blocksize id tuple provided.
-	for ( i = 0; i < n_kers; ++i )
-	{
-		// Read the current kernel id, kernel datatype, and kernel function
-		// pointer.
-		const l1vkr_t ker_id   = ker_ids[ i ];
-		const num_t   ker_dt   = ker_dts[ i ];
-		      void_fp ker_fp   = ker_fps[ i ];
-
-		// Index into the func_t and mbool_t for the current kernel id
-		// being processed.
-		func_t*       kers     = &cntx_l1v_kers[ ker_id ];
-
-		// Store the ukernel function pointer and preference values into
-		// the context.
-		bli_func_set_dt( ker_fp, ker_dt, kers );
-	}
-
-	// Free the temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l1v_kers(): " );
-	#endif
-	bli_free_intl( ker_ids );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l1v_kers(): " );
-	#endif
-	bli_free_intl( ker_dts );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_l1v_kers(): " );
-	#endif
-	bli_free_intl( ker_fps );
 }
 
 // -----------------------------------------------------------------------------
 
-void bli_cntx_set_packm_kers( dim_t n_kers, ... )
+void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... )
 {
 	// This function can be called from the bli_cntx_init_*() function for
 	// a particular architecture if the kernel developer wishes to use
-	// non-default packing kernels. It should be called after
-	// bli_cntx_init_defaults() so that the context begins with default packm
-	// kernels across all datatypes.
+	// non-default level-3 operation handler for small/unpacked matrices. It
+	// should be called after bli_cntx_init_<subconfig>_ref() so that the
+	// context begins with default sup handlers across all datatypes.
 
 	/* Example prototypes:
 
-	   void bli_cntx_set_packm_kers
+	   void bli_cntx_set_l3_sup_handlers
 	   (
-	     dim_t   n_ukrs,
-	     l1mkr_t ker0_id, num_t ker0_dt, void_fp ker0_fp,
-	     l1mkr_t ker1_id, num_t ker1_dt, void_fp ker1_fp,
-	     l1mkr_t ker2_id, num_t ker2_dt, void_fp ker2_fp,
-	     ...
 	     cntx_t* cntx
+	     opid_t  op0_id, void_fp handler0_fp,
+	     opid_t  op1_id, void_fp handler1_fp,
+	     opid_t  op2_id, void_fp handler2_fp,
+	     ...,
+	     BLIS_VA_END
 	   );
 	*/
 
-	va_list   args;
-	dim_t     i;
-	err_t     r_val;
-
-	// Allocate some temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_packm_kers(): " );
-	#endif
-	l1mkr_t* ker_ids   = bli_malloc_intl( n_kers * sizeof( l1mkr_t ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_packm_kers(): " );
-	#endif
-	num_t*   ker_dts   = bli_malloc_intl( n_kers * sizeof( num_t   ), &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_packm_kers(): " );
-	#endif
-	void_fp* ker_fps   = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val );
-
-	// -- Begin variable argument section --
+	// Query the context for the address of the l3 sup handlers array.
+	void_fp* cntx_l3_sup_handlers = bli_cntx_l3_sup_handlers_buf( cntx );
 
 	// Initialize variable argument environment.
-	va_start( args, n_kers );
+	va_list   args;
+	va_start( args, cntx );
 
-	// Process n_kers tuples.
-	for ( i = 0; i < n_kers; ++i )
+	// Process sup handlers until BLIS_VA_END is reached.
+	while ( true )
 	{
-		// Here, we query the variable argument list for:
-		// - the l1mkr_t of the kernel we're about to process,
-		// - the datatype of the kernel, and
-		// - the kernel function pointer
-		// that we need to store to the context.
-		const l1mkr_t  ker_id   = ( l1mkr_t )va_arg( args, l1mkr_t );
-		const num_t    ker_dt   = ( num_t   )va_arg( args, num_t   );
-		      void_fp  ker_fp   = ( void_fp )va_arg( args, void_fp );
-
-		// Store the values in our temporary arrays.
-		ker_ids[ i ]   = ker_id;
-		ker_dts[ i ]   = ker_dt;
-		ker_fps[ i ]   = ker_fp;
-	}
-
-	// The last argument should be the context pointer.
-	cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );
-
-	// Shutdown variable argument environment and clean up stack.
-	va_end( args );
-
-	// -- End variable argument section --
+		const int op_id0 = va_arg( args, int );
 
-	// Query the context for the address of:
-	// - the packm kernels func_t array
-	func_t* cntx_packm_kers = bli_cntx_packm_kers_buf( cntx );
+		// If we find an operation id of BLIS_VA_END, then we are done.
+		if ( op_id0 == BLIS_VA_END ) break;
 
-	// Now that we have the context address, we want to copy the values
-	// from the temporary buffers into the corresponding buffers in the
-	// context.
-
-	// Process each blocksize id tuple provided.
-	for ( i = 0; i < n_kers; ++i )
-	{
-		// Read the current kernel id, kernel datatype, and kernel function
-		// pointer.
-		const l1mkr_t ker_id   = ker_ids[ i ];
-		const num_t   ker_dt   = ker_dts[ i ];
-		      void_fp ker_fp   = ker_fps[ i ];
-
-		// Index into the func_t and mbool_t for the current kernel id
-		// being processed.
-		func_t*       kers     = &cntx_packm_kers[ ker_id ];
+		// Here, we query the variable argument list for:
+		// - the opid_t of the operation we're about to process,
+		// - the sup handler function pointer
+		const opid_t  op_id = ( opid_t  )op_id0;
+		      void_fp op_fp = ( void_fp )va_arg( args, void_fp );
 
-		// Store the ukernel function pointer and preference values into
-		// the context.
-		bli_func_set_dt( ker_fp, ker_dt, kers );
+		// Store the sup handler function pointer into the slot for the
+		// specified operation id.
+		cntx_l3_sup_handlers[ op_id ] = op_fp;
 	}
 
-	// Free the temporary local arrays.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_packm_kers(): " );
-	#endif
-	bli_free_intl( ker_ids );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_packm_kers(): " );
-	#endif
-	bli_free_intl( ker_dts );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntx_set_packm_kers(): " );
-	#endif
-	bli_free_intl( ker_fps );
+	// Shutdown variable argument environment and clean up stack.
+	va_end( args );
 }
 
 // -----------------------------------------------------------------------------
@@ -1586,11 +408,11 @@ void bli_cntx_print( cntx_t* cntx )
 		      );
 	}
 
-	for ( i = 0; i < BLIS_NUM_LEVEL3_UKRS; ++i )
+	for ( i = 0; i < BLIS_NUM_UKRS; ++i )
 	{
-		func_t* ukr = bli_cntx_get_l3_vir_ukrs( i, cntx );
+		func_t* ukr = bli_cntx_get_ukrs( i, cntx );
 
-		printf( "l3 vir ukr %2lu:  %16p %16p %16p %16p\n",
+		printf( "ukr %2lu:  %16p %16p %16p %16p\n",
 		        ( unsigned long )i,
 		        bli_func_get_dt( BLIS_FLOAT,    ukr ),
 		        bli_func_get_dt( BLIS_DOUBLE,   ukr ),
@@ -1599,42 +421,16 @@ void bli_cntx_print( cntx_t* cntx )
 		      );
 	}
 
-	for ( i = 0; i < BLIS_NUM_3OP_RC_COMBOS; ++i )
-	{
-		func_t* ukr = bli_cntx_get_l3_sup_kers( i, cntx );
-
-		printf( "l3 sup ukr %2lu:  %16p %16p %16p %16p\n",
-		        ( unsigned long )i,
-		        bli_func_get_dt( BLIS_FLOAT,    ukr ),
-		        bli_func_get_dt( BLIS_DOUBLE,   ukr ),
-		        bli_func_get_dt( BLIS_SCOMPLEX, ukr ),
-		        bli_func_get_dt( BLIS_DCOMPLEX, ukr )
-		      );
-	}
-
-	for ( i = 0; i < BLIS_NUM_LEVEL1F_KERS; ++i )
-	{
-		func_t* ker = bli_cntx_get_l1f_kers( i, cntx );
-
-		printf( "l1f ker    %2lu:  %16p %16p %16p %16p\n",
-		        ( unsigned long )i,
-		        bli_func_get_dt( BLIS_FLOAT,    ker ),
-		        bli_func_get_dt( BLIS_DOUBLE,   ker ),
-		        bli_func_get_dt( BLIS_SCOMPLEX, ker ),
-		        bli_func_get_dt( BLIS_DCOMPLEX, ker )
-		      );
-	}
-
-	for ( i = 0; i < BLIS_NUM_LEVEL1V_KERS; ++i )
+	for ( i = 0; i < BLIS_NUM_UKR_PREFS; ++i )
 	{
-		func_t* ker = bli_cntx_get_l1v_kers( i, cntx );
+		mbool_t* ukr_pref = bli_cntx_get_ukr_prefs( i, cntx );
 
-		printf( "l1v ker    %2lu:  %16p %16p %16p %16p\n",
+		printf( "ukr pref %2lu:  %d %d %d %d\n",
 		        ( unsigned long )i,
-		        bli_func_get_dt( BLIS_FLOAT,    ker ),
-		        bli_func_get_dt( BLIS_DOUBLE,   ker ),
-		        bli_func_get_dt( BLIS_SCOMPLEX, ker ),
-		        bli_func_get_dt( BLIS_DCOMPLEX, ker )
+		        bli_mbool_get_dt( BLIS_FLOAT,    ukr_pref ),
+		        bli_mbool_get_dt( BLIS_DOUBLE,   ukr_pref ),
+		        bli_mbool_get_dt( BLIS_SCOMPLEX, ukr_pref ),
+		        bli_mbool_get_dt( BLIS_DCOMPLEX, ukr_pref )
 		      );
 	}
 
diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h
index 76350f6bc..412430e9b 100644
--- a/frame/base/bli_cntx.h
+++ b/frame/base/bli_cntx.h
@@ -43,24 +43,13 @@
 /*
 typedef struct cntx_s
 {
-	blksz_t*  blkszs;
-	bszid_t*  bmults;
+	blksz_t   blkszs[ BLIS_NUM_BLKSZS ];
+	bszid_t   bmults[ BLIS_NUM_BLKSZS ];
 
-	func_t*   l3_vir_ukrs;
-	func_t*   l3_nat_ukrs;
-	mbool_t*  l3_nat_ukrs_prefs;
+	func_t    ukrs[ BLIS_NUM_UKRS ];
+	mbool_t   ukr_prefs[ BLIS_NUM_UKR_PREFS ];
 
-	blksz_t*  l3_sup_thresh;
-	void**    l3_sup_handlers;
-	blksz_t*  l3_sup_blkszs;
-	func_t*   l3_sup_kers;
-	mbool_t*  l3_sup_kers_prefs;
-
-	func_t*   l1f_kers;
-	func_t*   l1v_kers;
-
-	func_t*   packm_kers;
-	func_t*   unpackm_kers;
+	void_fp   l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ];
 
 	ind_t     method;
 
@@ -81,54 +70,18 @@ BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx )
 {
 	return cntx->bmults;
 }
-BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx )
-{
-	return cntx->l3_vir_ukrs;
-}
-BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx )
+BLIS_INLINE func_t* bli_cntx_ukrs_buf( cntx_t* cntx )
 {
-	return cntx->l3_nat_ukrs;
+	return cntx->ukrs;
 }
-BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx )
+BLIS_INLINE mbool_t* bli_cntx_ukr_prefs_buf( cntx_t* cntx )
 {
-	return cntx->l3_nat_ukrs_prefs;
+	return cntx->ukr_prefs;
 }
-BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx )
-{
-	return cntx->l3_sup_thresh;
-}
-BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx )
+BLIS_INLINE void_fp* bli_cntx_l3_sup_handlers_buf( cntx_t* cntx )
 {
 	return cntx->l3_sup_handlers;
 }
-BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx )
-{
-	return cntx->l3_sup_blkszs;
-}
-BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx )
-{
-	return cntx->l3_sup_kers;
-}
-BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx )
-{
-	return cntx->l3_sup_kers_prefs;
-}
-BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx )
-{
-	return cntx->l1f_kers;
-}
-BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx )
-{
-	return cntx->l1v_kers;
-}
-BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx )
-{
-	return cntx->packm_kers;
-}
-BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx )
-{
-	return cntx->unpackm_kers;
-}
 BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx )
 {
 	return cntx->method;
@@ -204,399 +157,144 @@ BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
+BLIS_INLINE func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx )
 {
-	func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx );
+	func_t* funcs = bli_cntx_ukrs_buf( cntx );
 	func_t* func  = &funcs[ ukr_id ];
 
 	return func;
 }
 
-BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
+BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx )
 {
-	func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx );
+	func_t* func = bli_cntx_get_ukrs( ukr_id, cntx );
 
 	return bli_func_get_dt( dt, func );
 }
 
-BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
-{
-	func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx );
-	func_t* func  = &funcs[ ukr_id ];
-
-	return func;
-}
-
-BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
+BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx )
 {
-	func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx );
+	switch ( ukr_id )
+	{
+		case BLIS_GEMM_UKR:       ukr_id = BLIS_GEMM_VIR_UKR; break;
+		case BLIS_TRSM_L_UKR:     ukr_id = BLIS_TRSM_L_VIR_UKR; break;
+		case BLIS_TRSM_U_UKR:     ukr_id = BLIS_TRSM_U_VIR_UKR; break;
+		case BLIS_GEMMTRSM_L_UKR: ukr_id = BLIS_GEMMTRSM_L_VIR_UKR; break;
+		case BLIS_GEMMTRSM_U_UKR: ukr_id = BLIS_GEMMTRSM_U_VIR_UKR; break;
+		default: break;
+	};
 
-	return bli_func_get_dt( dt, func );
+	return bli_cntx_get_ukr_dt( dt, ukr_id, cntx );
 }
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx )
+BLIS_INLINE mbool_t* bli_cntx_get_ukr_prefs( ukr_pref_t ukr_id, cntx_t* cntx )
 {
-	mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );
+	mbool_t* mbools = bli_cntx_ukr_prefs_buf( cntx );
 	mbool_t* mbool  = &mbools[ ukr_id ];
 
 	return mbool;
 }
 
-BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
+BLIS_INLINE bool bli_cntx_get_ukr_prefs_dt( num_t dt, ukr_pref_t ukr_id, cntx_t* cntx )
 {
-	mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx );
+	mbool_t* mbool = bli_cntx_get_ukr_prefs( ukr_id, cntx );
 
 	return ( bool )bli_mbool_get_dt( dt, mbool );
 }
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx )
-{
-	blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx );
-	blksz_t* thresh  = &threshs[ thresh_id ];
-
-	// Return the address of the blksz_t identified by thresh_id.
-	return thresh;
-}
-
-BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx )
-{
-	blksz_t* threshs   = bli_cntx_get_l3_sup_thresh( thresh_id, cntx );
-	dim_t    thresh_dt = bli_blksz_get_def( dt, threshs );
-
-	// Return the main (default) threshold value for the datatype given.
-	return thresh_dt;
-}
-
 BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx )
 {
-	if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE;
-	if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE;
-	if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE;
+	if ( m < bli_cntx_get_blksz_def_dt( dt, BLIS_MT, cntx ) ) return TRUE;
+	if ( n < bli_cntx_get_blksz_def_dt( dt, BLIS_NT, cntx ) ) return TRUE;
+	if ( k < bli_cntx_get_blksz_def_dt( dt, BLIS_KT, cntx ) ) return TRUE;
 
 	return FALSE;
 }
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx )
+BLIS_INLINE void_fp bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx )
 {
-	void** funcs = bli_cntx_l3_sup_handlers_buf( cntx );
-	void*  func  = funcs[ op ];
+	void_fp* funcs = bli_cntx_l3_sup_handlers_buf( cntx );
+	void_fp  func  = funcs[ op ];
 
 	return func;
 }
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx )
-{
-	blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx );
-	blksz_t* blksz  = &blkszs[ bs_id ];
-
-	// Return the address of the blksz_t identified by bs_id.
-	return blksz;
-}
-
-BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
+BLIS_INLINE bool bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx )
 {
-	blksz_t* blksz  = bli_cntx_get_l3_sup_blksz( bs_id, cntx );
-	dim_t    bs_dt  = bli_blksz_get_def( dt, blksz );
+	// This initial value will get overwritten during the switch statement below.
+	ukr_pref_t ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF;
 
-	// Return the main (default) blocksize value for the datatype given.
-	return bs_dt;
-}
-
-BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
-{
-	blksz_t* blksz  = bli_cntx_get_l3_sup_blksz( bs_id, cntx );
-	dim_t    bs_dt  = bli_blksz_get_max( dt, blksz );
-
-	// Return the auxiliary (maximum) blocksize value for the datatype given.
-	return bs_dt;
-}
-
-// -----------------------------------------------------------------------------
-
-BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx )
-{
-	func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx );
-	func_t* func  = &funcs[ stor_id ];
-
-	return func;
-}
-
-BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
-{
-	func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx );
-
-	return bli_func_get_dt( dt, func );
-}
-
-// -----------------------------------------------------------------------------
-
-BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx )
-{
-	mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx );
-	mbool_t* mbool  = &mbools[ stor_id ];
-
-	return mbool;
-}
-
-BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
-{
-	mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx );
-
-	return ( bool )bli_mbool_get_dt( dt, mbool );
-}
-
-// -----------------------------------------------------------------------------
-
-BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx )
-{
-	func_t* funcs = bli_cntx_l1f_kers_buf( cntx );
-	func_t* func  = &funcs[ ker_id ];
-
-	return func;
-}
-
-BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx )
-{
-	func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx );
-
-	return bli_func_get_dt( dt, func );
-}
-
-// -----------------------------------------------------------------------------
-
-BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx )
-{
-	func_t* funcs = bli_cntx_l1v_kers_buf( cntx );
-	func_t* func  = &funcs[ ker_id ];
-
-	return func;
-}
-
-BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* cntx )
-{
-	func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx );
-
-	return bli_func_get_dt( dt, func );
-}
-
-// -----------------------------------------------------------------------------
-
-BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx )
-{
-	func_t* func = NULL;
-
-	// Only index to the requested packm func_t if the packm kernel being
-	// requested is one that is explicitly supported.
-	if ( 0 <= ( gint_t )ker_id &&
-	          ( gint_t )ker_id < BLIS_NUM_PACKM_KERS )
-	{
-		func_t* funcs = bli_cntx_packm_kers_buf( cntx );
-
-		func = &funcs[ ker_id ];
-	}
-
-	return func;
-}
-
-BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx )
-{
-	void_fp fp = NULL;
-
-	// Only query the context for the packm func_t (and then extract the
-	// datatype-specific function pointer) if the packm kernel being
-	// requested is one that is explicitly supported.
-	if ( 0 <= ( gint_t )ker_id &&
-	          ( gint_t )ker_id < BLIS_NUM_PACKM_KERS )
+	// Get the correct preference from the kernel ID.
+	switch ( ukr_id )
 	{
-		func_t* func = bli_cntx_get_packm_kers( ker_id, cntx );
-
-		fp = bli_func_get_dt( dt, func );
+		case BLIS_GEMM_VIR_UKR: // fallthrough
+		case BLIS_GEMM_UKR: ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF; break;
+		case BLIS_TRSM_L_VIR_UKR: // fallthrough
+		case BLIS_TRSM_L_UKR: ukr_pref_id = BLIS_TRSM_L_UKR_ROW_PREF; break;
+		case BLIS_TRSM_U_VIR_UKR: // fallthrough
+		case BLIS_TRSM_U_UKR: ukr_pref_id = BLIS_TRSM_U_UKR_ROW_PREF; break;
+		case BLIS_GEMMTRSM_L_VIR_UKR: // fallthrough
+		case BLIS_GEMMTRSM_L_UKR: ukr_pref_id = BLIS_GEMMTRSM_L_UKR_ROW_PREF; break;
+		case BLIS_GEMMTRSM_U_VIR_UKR: // fallthrough
+		case BLIS_GEMMTRSM_U_UKR: ukr_pref_id = BLIS_GEMMTRSM_U_UKR_ROW_PREF; break;
+		case BLIS_GEMMSUP_RRR_UKR: ukr_pref_id = BLIS_GEMMSUP_RRR_UKR_ROW_PREF; break;
+		case BLIS_GEMMSUP_RRC_UKR: ukr_pref_id = BLIS_GEMMSUP_RRC_UKR_ROW_PREF; break;
+		case BLIS_GEMMSUP_RCR_UKR: ukr_pref_id = BLIS_GEMMSUP_RCR_UKR_ROW_PREF; break;
+		case BLIS_GEMMSUP_RCC_UKR: ukr_pref_id = BLIS_GEMMSUP_RCC_UKR_ROW_PREF; break;
+		case BLIS_GEMMSUP_CRR_UKR: ukr_pref_id = BLIS_GEMMSUP_CRR_UKR_ROW_PREF; break;
+		case BLIS_GEMMSUP_CRC_UKR: ukr_pref_id = BLIS_GEMMSUP_CRC_UKR_ROW_PREF; break;
+		case BLIS_GEMMSUP_CCR_UKR: ukr_pref_id = BLIS_GEMMSUP_CCR_UKR_ROW_PREF; break;
+		case BLIS_GEMMSUP_CCC_UKR: ukr_pref_id = BLIS_GEMMSUP_CCC_UKR_ROW_PREF; break;
+		case BLIS_GEMMSUP_XXX_UKR: ukr_pref_id = BLIS_GEMMSUP_XXX_UKR_ROW_PREF; break;
+		default: break; // TODO: should be an error condition
 	}
 
-	return fp;
-}
-
-BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx )
-{
-	func_t* func = NULL;
-
-	// Only index to the requested unpackm func_t if the unpackm kernel being
-	// requested is one that is explicitly supported.
-	if ( 0 <= ( gint_t )ker_id &&
-	          ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS )
-	{
-		func_t* funcs = bli_cntx_unpackm_kers_buf( cntx );
-
-		func = &funcs[ ker_id ];
-	}
-
-	return func;
-}
-
-BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx )
-{
-	void_fp fp = NULL;
-
-	// Only query the context for the unpackm func_t (and then extract the
-	// datatype-specific function pointer) if the unpackm kernel being
-	// requested is one that is explicitly supported.
-	if ( 0 <= ( gint_t )ker_id &&
-	          ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS )
+	// For virtual ukernels during non-native execution, use the real projection of
+	// the datatype.
+	if ( bli_cntx_method( cntx ) != BLIS_NAT )
 	{
-		func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx );
-
-		fp = bli_func_get_dt( dt, func );
+		switch ( ukr_id )
+		{
+			case BLIS_GEMM_VIR_UKR: // fallthrough
+			case BLIS_TRSM_L_VIR_UKR: // fallthrough
+			case BLIS_TRSM_U_VIR_UKR: // fallthrough
+			case BLIS_GEMMTRSM_L_VIR_UKR: // fallthrough
+			case BLIS_GEMMTRSM_U_VIR_UKR: dt = bli_dt_proj_to_real( dt ); break;
+			default: break;
+		}
 	}
 
-	return fp;
-}
-
-// -----------------------------------------------------------------------------
-
-BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
-{
-	const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx );
-
-	// A ukernel preference of TRUE means the ukernel prefers row storage.
-	return ( bool )
-	       ( prefs == TRUE );
-}
-
-BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
-{
-	const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx );
-
-	// A ukernel preference of FALSE means the ukernel prefers column storage.
-	return ( bool )
-	       ( prefs == FALSE );
-}
-
-BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
-{
-	// Note that we use the computation datatype, which may differ from the
-	// storage datatype of C (when performing a mixed datatype operation).
-	const num_t dt    = bli_obj_comp_dt( obj );
-	const bool  ukr_prefers_rows
-	                  = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx );
-	const bool  ukr_prefers_cols
-	                  = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx );
-	bool        r_val = FALSE;
-
-	if      ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE;
-	else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE;
-
-	return r_val;
-}
-
-BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
-{
-	return ( bool )
-	       !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx );
-}
-
-// -----------------------------------------------------------------------------
-
-BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
-{
-	// For induced methods, return the ukernel storage preferences of the
-	// corresponding real micro-kernel.
-	// NOTE: This projection to real domain becomes unnecessary if you
-	// set the exec_dt for 1m to the real projection of the storage
-	// datatype.
-	if ( bli_cntx_method( cntx ) != BLIS_NAT )
-	    dt = bli_dt_proj_to_real( dt );
-
-	return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx );
-}
-
-BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
-{
-	// For induced methods, return the ukernel storage preferences of the
-	// corresponding real micro-kernel.
-	// NOTE: This projection to real domain becomes unnecessary if you
-	// set the exec_dt for 1m to the real projection of the storage
-	// datatype.
-	if ( bli_cntx_method( cntx ) != BLIS_NAT )
-	    dt = bli_dt_proj_to_real( dt );
-
-	return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx );
-}
-
-BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
-{
-	// Note that we use the computation datatype, which may differ from the
-	// storage datatype of C (when performing a mixed datatype operation).
-	const num_t dt    = bli_obj_comp_dt( obj );
-	const bool  ukr_prefers_rows
-	                  = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx );
-	const bool  ukr_prefers_cols
-	                  = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx );
-	bool        r_val = FALSE;
-
-	if      ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE;
-	else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE;
-
-	return r_val;
+	return bli_cntx_get_ukr_prefs_dt( dt, ukr_pref_id, cntx );
 }
 
-BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
+BLIS_INLINE bool bli_cntx_ukr_prefers_cols_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx )
 {
-	return ( bool )
-	       !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx );
+	return ! bli_cntx_ukr_prefers_rows_dt( dt, ukr_id, cntx );
 }
 
-// -----------------------------------------------------------------------------
-
-BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
-{
-	const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx );
-
-	// A ukernel preference of TRUE means the ukernel prefers row storage.
-	return ( bool )
-	       ( prefs == TRUE );
-}
-
-BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
-{
-	const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx );
-
-	// A ukernel preference of FALSE means the ukernel prefers column storage.
-	return ( bool )
-	       ( prefs == FALSE );
-}
-
-#if 0
-// NOTE: These static functions aren't needed yet.
-
-BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx )
+BLIS_INLINE bool bli_cntx_prefers_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* cntx )
 {
-	const num_t dt    = bli_obj_dt( obj );
-	const bool  ukr_prefers_rows
-	                  = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
-	const bool  ukr_prefers_cols
-	                  = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx );
-	bool        r_val = FALSE;
+	const bool ukr_prefers_rows
+		= bli_cntx_ukr_prefers_rows_dt( bli_obj_dt( obj ), ukr_id, cntx );
 
-	if      ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE;
-	else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE;
+	if      ( bli_obj_is_row_stored( obj ) &&  ukr_prefers_rows ) return TRUE;
+	else if ( bli_obj_is_col_stored( obj ) && !ukr_prefers_rows ) return TRUE;
 
-	return r_val;
+	return FALSE;
 }
 
-BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx )
+BLIS_INLINE bool bli_cntx_dislikes_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* cntx )
 {
-	return ( bool )
-	       !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx );
+	return ! bli_cntx_prefers_storage_of( obj, ukr_id, cntx );
 }
-#endif
 
 // -----------------------------------------------------------------------------
 
@@ -632,67 +330,64 @@ BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, c
 	bli_blksz_set_max( bs, dt, blksz );
 }
 
-BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx )
+BLIS_INLINE void bli_cntx_set_ukr( ukr_t ukr_id, func_t* func, cntx_t* cntx )
 {
-	func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx );
+	func_t* funcs = bli_cntx_ukrs_buf( cntx );
 
 	funcs[ ukr_id ] = *func;
 }
 
-BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx )
+BLIS_INLINE void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, ukr_t ker_id, cntx_t* cntx )
 {
-	func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx );
+	func_t* func = bli_cntx_get_ukrs( ker_id, cntx );
 
-	funcs[ ukr_id ] = *func;
+	bli_func_set_dt( fp, dt, func );
 }
 
-BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx )
+BLIS_INLINE void bli_cntx_set_ukr_pref( ukr_pref_t ukr_id, mbool_t* prefs, cntx_t* cntx )
 {
-	mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );
+	mbool_t* mbools = bli_cntx_ukr_prefs_buf( cntx );
 
 	mbools[ ukr_id ] = *prefs;
 }
 
-BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx )
+BLIS_INLINE void_fp bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
 {
-	func_t* funcs = bli_cntx_l1f_kers_buf( cntx );
+	ukr_t ukr_id = bli_stor3_ukr( stor_id );
 
-	funcs[ ker_id ] = *func;
+	return bli_cntx_get_ukr_dt( dt, ukr_id, cntx );
 }
 
-BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx )
-{
-	func_t* funcs = bli_cntx_l1v_kers_buf( cntx );
-
-	funcs[ ker_id ] = *func;
-}
-
-BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx )
-{
-	func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx );
-
-	funcs[ ker_id ] = *func;
-}
-
-BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx )
-{
-	func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx );
-
-	bli_func_set_dt( fp, dt, func );
-}
-
-BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx )
+BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
 {
-	func_t* funcs = bli_cntx_get_unpackm_kers( ker_id, cntx );
+	switch ( bs_id )
+	{
+		case BLIS_MR: bs_id = BLIS_MR_SUP; break;
+		case BLIS_NR: bs_id = BLIS_NR_SUP; break;
+		case BLIS_KR: bs_id = BLIS_KR_SUP; break;
+		case BLIS_MC: bs_id = BLIS_MC_SUP; break;
+		case BLIS_NC: bs_id = BLIS_NC_SUP; break;
+		case BLIS_KC: bs_id = BLIS_KC_SUP; break;
+		default: break;
+	};
 
-	funcs[ ker_id ] = *func;
+	return bli_cntx_get_blksz_def_dt( dt, bs_id, cntx );
 }
 
-BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx )
+BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
 {
-	func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx );
+	switch ( bs_id )
+	{
+		case BLIS_MR: bs_id = BLIS_MR_SUP; break;
+		case BLIS_NR: bs_id = BLIS_NR_SUP; break;
+		case BLIS_KR: bs_id = BLIS_KR_SUP; break;
+		case BLIS_MC: bs_id = BLIS_MC_SUP; break;
+		case BLIS_NC: bs_id = BLIS_NC_SUP; break;
+		case BLIS_KC: bs_id = BLIS_KC_SUP; break;
+		default: break;
+	};
 
-	bli_func_set_dt( fp, dt, func );
+	return bli_cntx_get_blksz_max_dt( dt, bs_id, cntx );
 }
 
 // -----------------------------------------------------------------------------
@@ -701,24 +396,17 @@ BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_
 
 BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx );
 
-BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... );
+BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( cntx_t* cntx, ... );
 
-BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... );
+BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* cntx, ... );
 
-BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... );
-BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... );
-
-BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... );
-BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... );
-BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... );
-BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... );
-
-BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... );
-BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... );
-BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... );
+BLIS_EXPORT_BLIS void bli_cntx_set_ukrs( cntx_t* cntx, ... );
+BLIS_EXPORT_BLIS void bli_cntx_set_ukr_prefs( cntx_t* cntx, ... );
 
 BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx );
 
+BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... );
+
 
 #endif
 
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index cc17b33ff..1372a055a 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -185,7 +185,7 @@ void bli_gks_init( void )
 		bli_gks_register_cntx( BLIS_ARCH_POWER10,     bli_cntx_init_power10,
 		                                              bli_cntx_init_power10_ref,
 		                                              bli_cntx_init_power10_ind );
-#endif													  
+#endif
 #ifdef BLIS_CONFIG_POWER9
 		bli_gks_register_cntx( BLIS_ARCH_POWER9,      bli_cntx_init_power9,
 		                                              bli_cntx_init_power9_ref,
@@ -267,7 +267,7 @@ void bli_gks_finalize( void )
 void bli_gks_init_index( void )
 {
 	// This function is called by bli_gks_init(). It simply initializes all
-	// architecture id elements of the internal arrays to NULL. 
+	// architecture id elements of the internal arrays to NULL.
 
 	const size_t gks_size = sizeof( cntx_t* ) * BLIS_NUM_ARCHS;
 	const size_t fpa_size = sizeof( void_fp ) * BLIS_NUM_ARCHS;
@@ -382,7 +382,7 @@ void bli_gks_register_cntx
 	// functions for reference kernels and induced method execution. The
 	// former will be used whenever we need to obtain reference kernels and
 	// latter will be used later on if the user calls a level-3 function
-	// with induced execution enabled. 
+	// with induced execution enabled.
 	cntx_ref_init[ id ] = ref_fp;
 	cntx_ind_init[ id ] = ind_fp;
 
@@ -582,7 +582,7 @@ cntx_t* bli_gks_query_ind_cntx
 			// function on the newly allocated structure, we must first copy
 			// over the contents of the native context.
 			*gks_id_ind = *gks_id_nat;
-			
+
 			// Use the architecture id to look up the function pointer to the
 			// context initialization function for induced methods.
 			ind_cntx_init_ft f = cntx_ind_init[ id ];
@@ -635,7 +635,7 @@ void bli_gks_init_ref_cntx
 bool bli_gks_cntx_l3_nat_ukr_is_ref
      (
        num_t   dt,
-       l3ukr_t ukr_id,
+       ukr_t   ukr_id,
        cntx_t* cntx
      )
 {
@@ -647,8 +647,8 @@ bool bli_gks_cntx_l3_nat_ukr_is_ref
 
 	// Query each context for the micro-kernel function pointer for the
 	// specified datatype.
-	void_fp ref_fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr_id, &ref_cntx );
-	void_fp fp     = bli_cntx_get_l3_nat_ukr_dt( dt, ukr_id, cntx );
+	void_fp ref_fp = bli_cntx_get_ukr_dt( dt, ukr_id, &ref_cntx );
+	void_fp fp     = bli_cntx_get_ukr_dt( dt, ukr_id, cntx );
 
 	// Return the result.
 	return fp == ref_fp;
@@ -668,7 +668,7 @@ static char* bli_gks_l3_ukr_impl_str[BLIS_NUM_UKR_IMPL_TYPES] =
 
 // -----------------------------------------------------------------------------
 
-char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt )
+char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt )
 {
 	kimpl_t ki;
 
@@ -676,7 +676,7 @@ char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt )
 	// then query the ukernel function pointer for the given datatype from
 	// that context.
 	cntx_t* cntx  = bli_gks_query_ind_cntx( method, dt );
-	void_fp fp    = bli_cntx_get_l3_vir_ukr_dt( dt, ukr, cntx );
+	void_fp fp    = bli_cntx_get_ukr_dt( dt, ukr, cntx );
 
 	// Check whether the ukernel function pointer is NULL for the given
 	// datatype. If it is NULL, return the string for not applicable.
@@ -691,7 +691,7 @@ char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt )
 }
 
 #if 0
-char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt )
+char* bli_gks_l3_ukr_avail_impl_string( ukr_t ukr, num_t dt )
 {
 	opid_t  oper;
 	ind_t   method;
@@ -716,7 +716,7 @@ char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt )
 }
 #endif
 
-kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt )
+kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt )
 {
 	// If the current available induced method is not native, it
 	// must be virtual.
@@ -731,8 +731,6 @@ kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt )
 		// method to the typed function pointer within the known
 		// reference ukrs object.
 
-		cntx_t ref_cntx_l;
-
 		// Query the architecture id.
 		arch_t id = bli_arch_query_id();
 
@@ -743,23 +741,13 @@ kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt )
 			bli_check_error_code( e_val );
 		}
 
-		// Obtain the function pointer to the context initialization function
-		// for reference kernels.
-		ref_cntx_init_ft f = cntx_ref_init[ id ];
-
-		// Initialize a local context with reference kernels and related values.
-		f( &ref_cntx_l );
-
 		// Query the native context from the gks.
 		cntx_t* nat_cntx = bli_gks_lookup_nat_cntx( id );
 
-		// Query the native ukernel func_t from both the native and reference
-		// contexts.
-		void_fp nat_fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr, nat_cntx );
-		void_fp ref_fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr, &ref_cntx_l );
-
-		if ( nat_fp == ref_fp ) return BLIS_REFERENCE_UKERNEL;
-		else                    return BLIS_OPTIMIZED_UKERNEL;
+		if ( bli_gks_cntx_l3_nat_ukr_is_ref( dt, ukr, nat_cntx ) )
+			return BLIS_REFERENCE_UKERNEL;
+		else
+			return BLIS_OPTIMIZED_UKERNEL;
 	}
 }
 
diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h
index 188dcd507..b8e4c4fe0 100644
--- a/frame/base/bli_gks.h
+++ b/frame/base/bli_gks.h
@@ -54,12 +54,12 @@ BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt );
 
 BLIS_EXPORT_BLIS void    bli_gks_init_ref_cntx( cntx_t* cntx );
 
-bool    bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx );
+bool    bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, ukr_t ukr_id, cntx_t* cntx );
 
-BLIS_EXPORT_BLIS char*    bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt );
-BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt );
+BLIS_EXPORT_BLIS char*   bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt );
+BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt );
 
-//char*   bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt );
+//char*   bli_gks_l3_ukr_avail_impl_string( ukr_t ukr, num_t dt );
 
 #endif
 
diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h
index 011ebcdfb..e863f7dcf 100644
--- a/frame/include/bli_gentfunc_macro_defs.h
+++ b/frame/include/bli_gentfunc_macro_defs.h
@@ -289,6 +289,13 @@ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )
 GENTFUNCCO( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3 ) \
 GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )
 
+// -- (four auxiliary arguments) --
+// Expands GENTFUNCCO for the scomplex/float and dcomplex/double type pairs.
+#define INSERT_GENTFUNCCO_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
+\
+GENTFUNCCO( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
+GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )
+
 
 
 // -- Basic one-operand macro with integer instance --
diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h
index 4de624f98..d273c353a 100644
--- a/frame/include/bli_kernel_macro_defs.h
+++ b/frame/include/bli_kernel_macro_defs.h
@@ -245,7 +245,111 @@
 #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN   0
 #endif
 
+// -- MR and NR blocksizes (only for reference kernels) ------------------------
 
+// The build system defines BLIS_IN_REF_KERNEL, but only when compiling
+// reference kernels. By using compile-time constants for MR and NR, the
+// compiler can perform certain optimizations, such as unrolling and
+// vectorization, that would not otherwise be possible.
+#ifdef BLIS_IN_REF_KERNEL
+// Default MR and NR register blocksizes, used unless already defined.
+#ifndef BLIS_MR_s
+#define BLIS_MR_s 4
+#endif
+
+#ifndef BLIS_MR_d
+#define BLIS_MR_d 4
+#endif
+
+#ifndef BLIS_MR_c
+#define BLIS_MR_c 4
+#endif
+
+#ifndef BLIS_MR_z
+#define BLIS_MR_z 4
+#endif
+
+#ifndef BLIS_NR_s
+#define BLIS_NR_s 16
+#endif
+
+#ifndef BLIS_NR_d
+#define BLIS_NR_d 8
+#endif
+
+#ifndef BLIS_NR_c
+#define BLIS_NR_c 8
+#endif
+
+#ifndef BLIS_NR_z
+#define BLIS_NR_z 4
+#endif
+// Default BBM and BBN broadcast factors, used unless already defined.
+#ifndef BLIS_BBM_s
+#define BLIS_BBM_s 1
+#endif
+
+#ifndef BLIS_BBM_d
+#define BLIS_BBM_d 1
+#endif
+
+#ifndef BLIS_BBM_c
+#define BLIS_BBM_c 1
+#endif
+
+#ifndef BLIS_BBM_z
+#define BLIS_BBM_z 1
+#endif
+
+#ifndef BLIS_BBN_s
+#define BLIS_BBN_s 1
+#endif
+
+#ifndef BLIS_BBN_d
+#define BLIS_BBN_d 1
+#endif
+
+#ifndef BLIS_BBN_c
+#define BLIS_BBN_c 1
+#endif
+
+#ifndef BLIS_BBN_z
+#define BLIS_BBN_z 1
+#endif
+// Default PACKMR/PACKNR values: the register blocksize times its broadcast factor.
+#ifndef BLIS_PACKMR_s
+#define BLIS_PACKMR_s (BLIS_MR_s*BLIS_BBM_s)
+#endif
+
+#ifndef BLIS_PACKMR_d
+#define BLIS_PACKMR_d (BLIS_MR_d*BLIS_BBM_d)
+#endif
+
+#ifndef BLIS_PACKMR_c
+#define BLIS_PACKMR_c (BLIS_MR_c*BLIS_BBM_c)
+#endif
+
+#ifndef BLIS_PACKMR_z
+#define BLIS_PACKMR_z (BLIS_MR_z*BLIS_BBM_z)
+#endif
+
+#ifndef BLIS_PACKNR_s
+#define BLIS_PACKNR_s (BLIS_NR_s*BLIS_BBN_s)
+#endif
+
+#ifndef BLIS_PACKNR_d
+#define BLIS_PACKNR_d (BLIS_NR_d*BLIS_BBN_d)
+#endif
+
+#ifndef BLIS_PACKNR_c
+#define BLIS_PACKNR_c (BLIS_NR_c*BLIS_BBN_c)
+#endif
+
+#ifndef BLIS_PACKNR_z
+#define BLIS_PACKNR_z (BLIS_NR_z*BLIS_BBN_z)
+#endif
+
+#endif // BLIS_IN_REF_KERNEL
 
 #endif
 
diff --git a/frame/include/bli_misc_macro_defs.h b/frame/include/bli_misc_macro_defs.h
index 120338beb..903b4ece6 100644
--- a/frame/include/bli_misc_macro_defs.h
+++ b/frame/include/bli_misc_macro_defs.h
@@ -164,5 +164,11 @@ BLIS_INLINE void bli_toggle_bool( bool* b )
 #define bli_iformatspec() "%6d"
 
 
+// Sentinel value that marks the end of a variadic function's argument list
+// (see bli_cntx.c).
+
+#define BLIS_VA_END  (-1)
+
+
 #endif
 
diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h
index 286e79e2b..1822065da 100644
--- a/frame/include/bli_param_macro_defs.h
+++ b/frame/include/bli_param_macro_defs.h
@@ -754,7 +754,7 @@ BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m,
 	*offm_inc = 0;
 
 	// If the diagonal intersects the right side of the matrix,
-	// ignore the area below that intersection. 
+	// ignore the area below that intersection.
 	if ( *m > -(*diagoff) + *n )
 	{
 		*m = -(*diagoff) + *n;
@@ -777,6 +777,14 @@ BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m
 	bli_toggle_uplo( uplo );
 }
 
+// Implemented as a macro because the pointer type of a is not known here.
+// a, rs_a, and cs_a must be modifiable variables (lvalues), not expressions.
+#define bli_reflect_to_stored_part( diagoff, a, rs_a, cs_a ) \
+do { \
+	a += ( diagoff ) * ( cs_a - rs_a ); \
+	bli_swap_incs( &rs_a, &cs_a ); \
+} while (0)
+
 BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end )
 {
 	dim_t start2 = n - *start;
@@ -858,6 +866,22 @@ BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id )
 #endif
 }
 
+BLIS_INLINE ukr_t bli_stor3_ukr( stor3_t id )
+{
+	// Translate a storage combination id into the matching gemmsup ukernel id.
+	if ( id == BLIS_RRR ) return BLIS_GEMMSUP_RRR_UKR;
+	if ( id == BLIS_RRC ) return BLIS_GEMMSUP_RRC_UKR;
+	if ( id == BLIS_RCR ) return BLIS_GEMMSUP_RCR_UKR;
+	if ( id == BLIS_RCC ) return BLIS_GEMMSUP_RCC_UKR;
+	if ( id == BLIS_CRR ) return BLIS_GEMMSUP_CRR_UKR;
+	if ( id == BLIS_CRC ) return BLIS_GEMMSUP_CRC_UKR;
+	if ( id == BLIS_CCR ) return BLIS_GEMMSUP_CCR_UKR;
+	if ( id == BLIS_CCC ) return BLIS_GEMMSUP_CCC_UKR;
+
+	// Every other id falls back to the XXX ukernel id.
+	return BLIS_GEMMSUP_XXX_UKR;
+}
+
 BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id )
 {
 #if 0
diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h
index 293c80f91..f567e7ef3 100644
--- a/frame/include/bli_scalar_macro_defs.h
+++ b/frame/include/bli_scalar_macro_defs.h
@@ -49,8 +49,8 @@
 // NOTE: These macros are not used by other scalar macros, but they are
 // related to those defined in bli_sets.h, and so we #include them here.
 
-#include "bli_setrs.h"   // sets real component only 
-#include "bli_setis.h"   // sets imaginary component only 
+#include "bli_setrs.h"   // sets real component only
+#include "bli_setis.h"   // sets imaginary component only
 
 // NOTE: This macro also needs to be defined early on since it determines
 // how real and imaginary components are accessed (ie: whether the fields
@@ -194,6 +194,7 @@
 #include "bli_adds_mxn.h"
 #include "bli_adds_mxn_uplo.h"
 #include "bli_set0s_mxn.h"
+#include "bli_set0s_edge.h"
 #include "bli_copys_mxn.h"
 #include "bli_scal2s_mxn.h"
 #include "bli_xpbys_mxn.h"
@@ -230,7 +231,7 @@
 #include "bli_scal21rs.h"
 #include "bli_scal2j1rs.h"
 
-// 1m (1e or 1r) 
+// 1m (1e or 1r)
 #include "bli_invert1ms_mxn_diag.h"
 
 #include "bli_scal1ms_mxn.h"
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index c66505bde..4e64f3711 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -626,7 +626,8 @@ typedef enum
 
 typedef enum
 {
-	BLIS_ADDV_KER  = 0,
+	// l1v kernels
+	BLIS_ADDV_KER,
 	BLIS_AMAXV_KER,
 	BLIS_AXPBYV_KER,
 	BLIS_AXPYV_KER,
@@ -639,108 +640,82 @@ typedef enum
 	BLIS_SETV_KER,
 	BLIS_SUBV_KER,
 	BLIS_SWAPV_KER,
-	BLIS_XPBYV_KER
-} l1vkr_t;
-
-#define BLIS_NUM_LEVEL1V_KERS 14
-
-
-typedef enum
-{
-	BLIS_AXPY2V_KER = 0,
+	BLIS_XPBYV_KER,
+	BLIS_AXPY2V_KER,
 	BLIS_DOTAXPYV_KER,
+
+	// l1f kernels
 	BLIS_AXPYF_KER,
 	BLIS_DOTXF_KER,
-	BLIS_DOTXAXPYF_KER
-} l1fkr_t;
-
-#define BLIS_NUM_LEVEL1F_KERS 5
-
-
-typedef enum
-{
-	BLIS_PACKM_0XK_KER  = 0,
-	BLIS_PACKM_1XK_KER  = 1,
-	BLIS_PACKM_2XK_KER  = 2,
-	BLIS_PACKM_3XK_KER  = 3,
-	BLIS_PACKM_4XK_KER  = 4,
-	BLIS_PACKM_5XK_KER  = 5,
-	BLIS_PACKM_6XK_KER  = 6,
-	BLIS_PACKM_7XK_KER  = 7,
-	BLIS_PACKM_8XK_KER  = 8,
-	BLIS_PACKM_9XK_KER  = 9,
-	BLIS_PACKM_10XK_KER = 10,
-	BLIS_PACKM_11XK_KER = 11,
-	BLIS_PACKM_12XK_KER = 12,
-	BLIS_PACKM_13XK_KER = 13,
-	BLIS_PACKM_14XK_KER = 14,
-	BLIS_PACKM_15XK_KER = 15,
-	BLIS_PACKM_16XK_KER = 16,
-	BLIS_PACKM_17XK_KER = 17,
-	BLIS_PACKM_18XK_KER = 18,
-	BLIS_PACKM_19XK_KER = 19,
-	BLIS_PACKM_20XK_KER = 20,
-	BLIS_PACKM_21XK_KER = 21,
-	BLIS_PACKM_22XK_KER = 22,
-	BLIS_PACKM_23XK_KER = 23,
-	BLIS_PACKM_24XK_KER = 24,
-	BLIS_PACKM_25XK_KER = 25,
-	BLIS_PACKM_26XK_KER = 26,
-	BLIS_PACKM_27XK_KER = 27,
-	BLIS_PACKM_28XK_KER = 28,
-	BLIS_PACKM_29XK_KER = 29,
-	BLIS_PACKM_30XK_KER = 30,
-	BLIS_PACKM_31XK_KER = 31,
-
-	BLIS_UNPACKM_0XK_KER  = 0,
-	BLIS_UNPACKM_1XK_KER  = 1,
-	BLIS_UNPACKM_2XK_KER  = 2,
-	BLIS_UNPACKM_3XK_KER  = 3,
-	BLIS_UNPACKM_4XK_KER  = 4,
-	BLIS_UNPACKM_5XK_KER  = 5,
-	BLIS_UNPACKM_6XK_KER  = 6,
-	BLIS_UNPACKM_7XK_KER  = 7,
-	BLIS_UNPACKM_8XK_KER  = 8,
-	BLIS_UNPACKM_9XK_KER  = 9,
-	BLIS_UNPACKM_10XK_KER = 10,
-	BLIS_UNPACKM_11XK_KER = 11,
-	BLIS_UNPACKM_12XK_KER = 12,
-	BLIS_UNPACKM_13XK_KER = 13,
-	BLIS_UNPACKM_14XK_KER = 14,
-	BLIS_UNPACKM_15XK_KER = 15,
-	BLIS_UNPACKM_16XK_KER = 16,
-	BLIS_UNPACKM_17XK_KER = 17,
-	BLIS_UNPACKM_18XK_KER = 18,
-	BLIS_UNPACKM_19XK_KER = 19,
-	BLIS_UNPACKM_20XK_KER = 20,
-	BLIS_UNPACKM_21XK_KER = 21,
-	BLIS_UNPACKM_22XK_KER = 22,
-	BLIS_UNPACKM_23XK_KER = 23,
-	BLIS_UNPACKM_24XK_KER = 24,
-	BLIS_UNPACKM_25XK_KER = 25,
-	BLIS_UNPACKM_26XK_KER = 26,
-	BLIS_UNPACKM_27XK_KER = 27,
-	BLIS_UNPACKM_28XK_KER = 28,
-	BLIS_UNPACKM_29XK_KER = 29,
-	BLIS_UNPACKM_30XK_KER = 30,
-	BLIS_UNPACKM_31XK_KER = 31
-
-} l1mkr_t;
-
-#define BLIS_NUM_PACKM_KERS   32
-#define BLIS_NUM_UNPACKM_KERS 32
-
-
-typedef enum
-{
-	BLIS_GEMM_UKR = 0,
+	BLIS_DOTXAXPYF_KER,
+
+	// pack kernels
+	BLIS_PACKM_MRXK_KER,
+	BLIS_PACKM_NRXK_KER,
+	BLIS_PACKM_MRXK_1ER_KER,
+	BLIS_PACKM_NRXK_1ER_KER,
+	BLIS_PACKM_MRXMR_DIAG_KER,
+	BLIS_PACKM_NRXNR_DIAG_KER,
+	BLIS_PACKM_MRXMR_DIAG_1ER_KER,
+	BLIS_PACKM_NRXNR_DIAG_1ER_KER,
+
+	// unpack kernels
+	BLIS_UNPACKM_MRXK_KER,
+	BLIS_UNPACKM_NRXK_KER,
+
+	// l3 native kernels
+	BLIS_GEMM_UKR,
 	BLIS_GEMMTRSM_L_UKR,
 	BLIS_GEMMTRSM_U_UKR,
 	BLIS_TRSM_L_UKR,
-	BLIS_TRSM_U_UKR
-} l3ukr_t;
+	BLIS_TRSM_U_UKR,
+
+	// l3 virtual kernels
+	BLIS_GEMM_VIR_UKR,
+	BLIS_GEMMTRSM_L_VIR_UKR,
+	BLIS_GEMMTRSM_U_VIR_UKR,
+	BLIS_TRSM_L_VIR_UKR,
+	BLIS_TRSM_U_VIR_UKR,
+
+	// gemmsup kernels
+	BLIS_GEMMSUP_RRR_UKR,
+	BLIS_GEMMSUP_RRC_UKR,
+	BLIS_GEMMSUP_RCR_UKR,
+	BLIS_GEMMSUP_RCC_UKR,
+	BLIS_GEMMSUP_CRR_UKR,
+	BLIS_GEMMSUP_CRC_UKR,
+	BLIS_GEMMSUP_CCR_UKR,
+	BLIS_GEMMSUP_CCC_UKR,
+	BLIS_GEMMSUP_XXX_UKR,
+
+	// BLIS_NUM_UKRS must be last!
+	BLIS_NUM_UKRS
+} ukr_t;
 
-#define BLIS_NUM_LEVEL3_UKRS 5
+
+typedef enum
+{
+	// l3 kernel row preferences
+	BLIS_GEMM_UKR_ROW_PREF,
+	BLIS_GEMMTRSM_L_UKR_ROW_PREF,
+	BLIS_GEMMTRSM_U_UKR_ROW_PREF,
+	BLIS_TRSM_L_UKR_ROW_PREF,
+	BLIS_TRSM_U_UKR_ROW_PREF,
+
+	// gemmsup kernel row preferences
+	BLIS_GEMMSUP_RRR_UKR_ROW_PREF,
+	BLIS_GEMMSUP_RRC_UKR_ROW_PREF,
+	BLIS_GEMMSUP_RCR_UKR_ROW_PREF,
+	BLIS_GEMMSUP_RCC_UKR_ROW_PREF,
+	BLIS_GEMMSUP_CRR_UKR_ROW_PREF,
+	BLIS_GEMMSUP_CRC_UKR_ROW_PREF,
+	BLIS_GEMMSUP_CCR_UKR_ROW_PREF,
+	BLIS_GEMMSUP_CCC_UKR_ROW_PREF,
+	BLIS_GEMMSUP_XXX_UKR_ROW_PREF,
+
+	// BLIS_NUM_UKR_PREFS must be last! It sizes the ukr_prefs array in cntx_t.
+	BLIS_NUM_UKR_PREFS
+} ukr_pref_t;
 
 
 typedef enum
@@ -884,39 +859,45 @@ typedef enum
 	// NOTE: the level-3 blocksizes MUST be indexed starting at zero.
 	// At one point, we made this assumption in bli_cntx_set_blkszs()
 	// and friends.
-
-	BLIS_KR = 0,
+	BLIS_KR,
 	BLIS_MR,
 	BLIS_NR,
 	BLIS_MC,
 	BLIS_KC,
 	BLIS_NC,
 
+	// broadcast factors for packing
+	BLIS_BBM,
+	BLIS_BBN,
+
+	// level-2 blocksizes
 	BLIS_M2, // level-2 blocksize in m dimension
 	BLIS_N2, // level-2 blocksize in n dimension
 
+	// level-1f blocksizes
 	BLIS_AF, // level-1f axpyf fusing factor
 	BLIS_DF, // level-1f dotxf fusing factor
 	BLIS_XF, // level-1f dotxaxpyf fusing factor
 
-	BLIS_NO_PART  // used as a placeholder when blocksizes are not applicable.
+	// gemmsup thresholds
+	BLIS_MT, // level-3 small/unpacked matrix threshold in m dimension
+	BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension
+	BLIS_KT, // level-3 small/unpacked matrix threshold in k dimension
+
+	// gemmsup block sizes
+	BLIS_KR_SUP,
+	BLIS_MR_SUP,
+	BLIS_NR_SUP,
+	BLIS_MC_SUP,
+	BLIS_KC_SUP,
+	BLIS_NC_SUP,
+
+	// BLIS_NO_PART (= BLIS_NUM_BLKSZS) must be last!
+	BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable,
+	              // such as when characterizing a packm operation.
+	BLIS_NUM_BLKSZS = BLIS_NO_PART
 } bszid_t;
 
-#define BLIS_NUM_BLKSZS 11
-
-
-// -- Threshold ID type --
-
-typedef enum
-{
-	BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension
-	BLIS_NT,     // level-3 small/unpacked matrix threshold in n dimension
-	BLIS_KT      // level-3 small/unpacked matrix threshold in k dimension
-
-} threshid_t;
-
-#define BLIS_NUM_THRESH 3
-
 
 // -- Architecture ID type --
 
@@ -1430,21 +1411,10 @@ typedef struct cntx_s
 	blksz_t   blkszs[ BLIS_NUM_BLKSZS ];
 	bszid_t   bmults[ BLIS_NUM_BLKSZS ];
 
-	func_t    l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ];
-	func_t    l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ];
-	mbool_t   l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ];
-
-	blksz_t   l3_sup_thresh[ BLIS_NUM_THRESH ];
-	void*     l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ];
-	blksz_t   l3_sup_blkszs[ BLIS_NUM_BLKSZS ];
-	func_t    l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ];
-	mbool_t   l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ];
-
-	func_t    l1f_kers[ BLIS_NUM_LEVEL1F_KERS ];
-	func_t    l1v_kers[ BLIS_NUM_LEVEL1V_KERS ];
+	func_t    ukrs[ BLIS_NUM_UKRS ];
+	mbool_t   ukr_prefs[ BLIS_NUM_UKR_PREFS ];
 
-	func_t    packm_kers[ BLIS_NUM_PACKM_KERS ];
-	func_t    unpackm_kers[ BLIS_NUM_UNPACKM_KERS ];
+	void_fp   l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ];
 
 	ind_t     method;
 
@@ -1577,6 +1547,7 @@ typedef enum
 	// Architecture-related errors
 	BLIS_INVALID_ARCH_ID                       = (-150),
 	BLIS_UNINITIALIZED_GKS_CNTX                = (-151),
+	BLIS_INVALID_UKR_ID                        = (-152),
 
 	// Blocksize-related errors
 	BLIS_MC_DEF_NONMULTIPLE_OF_MR              = (-160),
diff --git a/frame/include/level0/bli_set0s_edge.h b/frame/include/level0/bli_set0s_edge.h
new file mode 100644
index 000000000..2c436812e
--- /dev/null
+++ b/frame/include/level0/bli_set0s_edge.h
@@ -0,0 +1,79 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_SET0S_EDGE_H
+#define BLIS_SET0S_EDGE_H
+
+// set0s_edge
+
+// Notes:
+// - The char encodes the type of p.
+// - Zeroes the part of the m x n block at p outside its leading i x j block.
+//   (Assumes set0s_mxn takes m, n, pointer, row stride, column stride.)
+#define GENTFUNC(ctype,ch,op) \
+\
+BLIS_INLINE void PASTEMAC(ch,op) \
+     ( \
+       const dim_t     i, \
+       const dim_t     m, \
+       const dim_t     j, \
+       const dim_t     n, \
+       ctype* restrict p, \
+       const inc_t     ldp \
+     ) \
+{ \
+	if ( i < m ) \
+	{ \
+		PASTEMAC(ch,set0s_mxn) \
+		( \
+		  m - i, \
+		  j, \
+		  p + i*1, 1, ldp \
+		); \
+	} \
+\
+	if ( j < n ) \
+	{ \
+		PASTEMAC(ch,set0s_mxn) \
+		( \
+		  m, \
+		  n - j, \
+		  p + j*ldp, 1, ldp \
+		); \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC0(set0s_edge)
+
+#endif
diff --git a/kernels/penryn/1/bli_axpyv_penryn_int.c b/kernels/penryn/1/bli_axpyv_penryn_int.c
index 53904b645..2dd7c7324 100644
--- a/kernels/penryn/1/bli_axpyv_penryn_int.c
+++ b/kernels/penryn/1/bli_axpyv_penryn_int.c
@@ -102,7 +102,7 @@ void bli_daxpyv_penryn_int
 	// Call the reference implementation if needed.
 	if ( use_ref == TRUE )
 	{
-		daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+		daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
 
 		f
 		(
diff --git a/kernels/penryn/1/bli_dotv_penryn_int.c b/kernels/penryn/1/bli_dotv_penryn_int.c
index 4d39b3641..2e88a577a 100644
--- a/kernels/penryn/1/bli_dotv_penryn_int.c
+++ b/kernels/penryn/1/bli_dotv_penryn_int.c
@@ -104,7 +104,7 @@ void bli_ddotv_penryn_int
 	// Call the reference implementation if needed.
 	if ( use_ref == TRUE )
 	{
-		ddotv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_DOTV_KER, cntx );
+		ddotv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTV_KER, cntx );
 
 		f
 		(
diff --git a/kernels/penryn/1f/bli_axpy2v_penryn_int.c b/kernels/penryn/1f/bli_axpy2v_penryn_int.c
index 5e8a2a9a1..c809ebb41 100644
--- a/kernels/penryn/1f/bli_axpy2v_penryn_int.c
+++ b/kernels/penryn/1f/bli_axpy2v_penryn_int.c
@@ -110,7 +110,7 @@ void bli_daxpy2v_penryn_int
 	// Call the reference implementation if needed.
 	if ( use_ref == TRUE )
 	{
-		daxpy2v_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPY2V_KER, cntx );
+		daxpy2v_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPY2V_KER, cntx );
 
 		f
 		(
diff --git a/kernels/penryn/1f/bli_axpyf_penryn_int.c b/kernels/penryn/1f/bli_axpyf_penryn_int.c
index 66bb88ec6..ce4c4f786 100644
--- a/kernels/penryn/1f/bli_axpyf_penryn_int.c
+++ b/kernels/penryn/1f/bli_axpyf_penryn_int.c
@@ -115,7 +115,7 @@ void bli_daxpyf_penryn_int
 	// Call the reference implementation if needed.
 	if ( use_ref == TRUE )
 	{
-		daxpyf_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx );
+		daxpyf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx );
 
 		f
 		(
diff --git a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c
index 7602a7f28..6b9dab773 100644
--- a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c
+++ b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c
@@ -112,7 +112,7 @@ void bli_ddotaxpyv_penryn_int
 	// Call the reference implementation if needed.
 	if ( use_ref == TRUE )
 	{
-		ddotaxpyv_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_DOTAXPYV_KER, cntx );
+		ddotaxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTAXPYV_KER, cntx );
 
 		f
 		(
diff --git a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c
index 2deb4a457..fe102d427 100644
--- a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c
+++ b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c
@@ -104,7 +104,7 @@ void bli_ddotxaxpyf_penryn_int
 	// If the vector lengths are zero, scale y by beta and return.
 	if ( bli_zero_dim1( m ) )
 	{
-		dscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx );
+		dscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx );
 
 		f
 		(
@@ -149,7 +149,7 @@ void bli_ddotxaxpyf_penryn_int
 
 	if ( use_ref == TRUE )
 	{
-		ddotxaxpyf_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_DOTXAXPYF_KER, cntx );
+		ddotxaxpyf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXAXPYF_KER, cntx );
 		f
 		(
 		  conjat,
diff --git a/kernels/penryn/1f/bli_dotxf_penryn_int.c b/kernels/penryn/1f/bli_dotxf_penryn_int.c
index ad9dc5fbd..ac9887d59 100644
--- a/kernels/penryn/1f/bli_dotxf_penryn_int.c
+++ b/kernels/penryn/1f/bli_dotxf_penryn_int.c
@@ -90,7 +90,7 @@ void bli_ddotxf_penryn_int
 	// If the vector lengths are zero, scale r by beta and return.
 	if ( bli_zero_dim1( m ) )
 	{
-		dscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx );
+		dscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx );
 
 		f
 		(
@@ -134,7 +134,7 @@ void bli_ddotxf_penryn_int
 	// Call the reference implementation if needed.
 	if ( use_ref == TRUE )
 	{
-		ddotxf_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_DOTXF_KER, cntx );
+		ddotxf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXF_KER, cntx );
 
 		f
 		( conjat,
diff --git a/kernels/zen/1/bli_scalv_zen_int.c b/kernels/zen/1/bli_scalv_zen_int.c
index 9f76e88e1..fb17dd4b3 100644
--- a/kernels/zen/1/bli_scalv_zen_int.c
+++ b/kernels/zen/1/bli_scalv_zen_int.c
@@ -83,7 +83,7 @@ void bli_sscalv_zen_int
 	if ( PASTEMAC(s,eq0)( *alpha ) )
 	{
 		float*       zero = bli_s0;
-		ssetv_ker_ft f    = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
+		ssetv_ker_ft f    = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
 
 		f
 		(
@@ -182,7 +182,7 @@ void bli_dscalv_zen_int
 	if ( PASTEMAC(d,eq0)( *alpha ) )
 	{
 		double*      zero = bli_d0;
-		dsetv_ker_ft f    = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
+		dsetv_ker_ft f    = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
 
 		f
 		(
diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c
index c8488890f..9f31b7200 100644
--- a/kernels/zen/1/bli_scalv_zen_int10.c
+++ b/kernels/zen/1/bli_scalv_zen_int10.c
@@ -84,7 +84,8 @@ void bli_sscalv_zen_int10
 
 		if ( cntx == NULL ) cntx = bli_gks_query_cntx();
 
-		ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
+		ssetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
+
 		f
 		(
 		  BLIS_NO_CONJUGATE,
@@ -93,7 +94,7 @@ void bli_sscalv_zen_int10
 		  x, incx,
 		  cntx
 		);
-		
+
 		return;
 	}
 
@@ -275,9 +276,9 @@ void bli_dscalv_zen_int10
 	{
 		double* zero = bli_d0;
 
-		if( cntx == NULL ) cntx = bli_gks_query_cntx();
+		if ( cntx == NULL ) cntx = bli_gks_query_cntx();
 
-		dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
+		dsetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
 
 		f
 		(
@@ -287,7 +288,7 @@ void bli_dscalv_zen_int10
 		  x, incx,
 		  cntx
 		);
-		
+
 		return;
 	}
 
@@ -458,7 +459,7 @@ void bli_cscalv_zen_int10
 {
 	const num_t dt = BLIS_SCOMPLEX;
 
-	cscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCALV_KER, cntx );
+	cscalv_ker_ft f = bli_cntx_get_ukr_dt( dt, BLIS_SCALV_KER, cntx );
 
 	f
 	(
@@ -469,4 +470,3 @@ void bli_cscalv_zen_int10
 	  cntx
 	);
 }
-
diff --git a/kernels/zen/1f/bli_axpyf_zen_int_4.c b/kernels/zen/1f/bli_axpyf_zen_int_4.c
index 5ddb56ac5..0ec5f44f5 100644
--- a/kernels/zen/1f/bli_axpyf_zen_int_4.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_4.c
@@ -36,7 +36,7 @@
 #include "blis.h"
 
 
- void bli_caxpyf_zen_int_4
+void bli_caxpyf_zen_int_4
      (
        conj_t           conja,
        conj_t           conjx,
@@ -81,7 +81,7 @@
     {
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
 
-        caxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx );
+        caxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx );
 
         for ( i = 0; i < b_n; ++i )
         {
diff --git a/kernels/zen/1f/bli_axpyf_zen_int_5.c b/kernels/zen/1f/bli_axpyf_zen_int_5.c
index 15a64d596..1566f9809 100644
--- a/kernels/zen/1f/bli_axpyf_zen_int_5.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_5.c
@@ -108,8 +108,9 @@ void bli_saxpyf_zen_int_5
     // operation as a loop over axpyv.
     if ( b_n != fuse_fac )
     {
-        if(cntx == NULL) cntx = bli_gks_query_cntx();
-        saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
+        if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+
+        saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
 
         for ( i = 0; i < b_n; ++i )
         {
@@ -131,7 +132,7 @@ void bli_saxpyf_zen_int_5
               cntx
             );
         }
-        
+
         return;
     }
 
@@ -359,7 +360,9 @@ void bli_daxpyf_zen_int_5
     // operation as a loop over axpyv.
     if ( b_n != fuse_fac )
     {
-        daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+        if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+
+        daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
 
         for ( i = 0; i < b_n; ++i )
         {
@@ -381,7 +384,7 @@ void bli_daxpyf_zen_int_5
               cntx
             );
         }
-        
+
         return;
     }
 
@@ -559,7 +562,7 @@ void bli_daxpyf_zen_int_5
 
 // -----------------------------------------------------------------------------
 
-static void bli_daxpyf_zen_int_16x2
+void bli_daxpyf_zen_int_16x2
      (
        conj_t           conja,
        conj_t           conjx,
@@ -608,7 +611,7 @@ static void bli_daxpyf_zen_int_16x2
     // operation as a loop over axpyv.
     if ( b_n != fuse_fac )
     {
-        daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+        daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
 
         for ( i = 0; i < b_n; ++i )
         {
@@ -843,6 +846,7 @@ static void bli_daxpyf_zen_int_16x2
 }
 
 // -----------------------------------------------------------------------------
+
 void bli_daxpyf_zen_int_16x4
      (
        conj_t           conja,
@@ -895,8 +899,9 @@ void bli_daxpyf_zen_int_16x4
     // operation as a loop over axpyv.
     if ( b_n != fuse_fac )
     {
-        if(cntx == NULL) cntx = bli_gks_query_cntx();
-        daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+        if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+
+        daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
 
         for ( i = 0; i < b_n; ++i )
         {
diff --git a/kernels/zen/1f/bli_axpyf_zen_int_8.c b/kernels/zen/1f/bli_axpyf_zen_int_8.c
index b958600ce..15fdf4651 100644
--- a/kernels/zen/1f/bli_axpyf_zen_int_8.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_8.c
@@ -104,7 +104,7 @@ void bli_saxpyf_zen_int_8
 	// operation as a loop over axpyv.
 	if ( b_n != fuse_fac )
 	{
-		saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
+		saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
 
 		for ( i = 0; i < b_n; ++i )
 		{
@@ -313,7 +313,7 @@ void bli_daxpyf_zen_int_8
 	// operation as a loop over axpyv.
 	if ( b_n != fuse_fac )
 	{
-		daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+		daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
 
 		for ( i = 0; i < b_n; ++i )
 		{
diff --git a/kernels/zen/1f/bli_dotxf_zen_int_8.c b/kernels/zen/1f/bli_dotxf_zen_int_8.c
index e40c785d8..1f4a671b6 100644
--- a/kernels/zen/1f/bli_dotxf_zen_int_8.c
+++ b/kernels/zen/1f/bli_dotxf_zen_int_8.c
@@ -78,8 +78,8 @@ void bli_sdotxf_zen_int_8
 	// simplifies to updating y.
 	if ( bli_zero_dim1( m ) || PASTEMAC(s,eq0)( *alpha ) )
 	{
-		sscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SCALV_KER, cntx );
-		
+		sscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SCALV_KER, cntx );
+
 		f
 		(
 		  BLIS_NO_CONJUGATE,
@@ -95,7 +95,7 @@ void bli_sdotxf_zen_int_8
 	// operation as a loop over dotxv.
 	if ( b_n != fuse_fac )
 	{
-		sdotxv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_DOTXV_KER, cntx );
+		sdotxv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_DOTXV_KER, cntx );
 
 		for ( dim_t i = 0; i < b_n; ++i )
 		{
@@ -468,8 +468,8 @@ void bli_ddotxf_zen_int_8
 	// simplifies to updating y.
 	if ( bli_zero_dim1( m ) || PASTEMAC(d,eq0)( *alpha ) )
 	{
-		dscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx );
-		
+		dscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx );
+
 		f
 		(
 		  BLIS_NO_CONJUGATE,
@@ -485,7 +485,7 @@ void bli_ddotxf_zen_int_8
 	// operation as a loop over dotxv.
 	if ( b_n != fuse_fac )
 	{
-		ddotxv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_DOTXV_KER, cntx );
+		ddotxv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXV_KER, cntx );
 
 		for ( dim_t i = 0; i < b_n; ++i )
 		{
diff --git a/ref_kernels/1/bli_axpbyv_ref.c b/ref_kernels/1/bli_axpbyv_ref.c
index 2e648bbd6..2da4bc928 100644
--- a/ref_kernels/1/bli_axpbyv_ref.c
+++ b/ref_kernels/1/bli_axpbyv_ref.c
@@ -60,7 +60,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 			/* Query the context for the kernel function pointer. */ \
 			const num_t             dt     = PASTEMAC(ch,type); \
-			PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SETV_KER, cntx ); \
+			PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \
 \
 			setv_p \
 			( \
@@ -83,7 +83,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 			/* Query the context for the kernel function pointer. */ \
 			const num_t              dt      = PASTEMAC(ch,type); \
-			PASTECH(ch,scalv_ker_ft) scalv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCALV_KER, cntx ); \
+			PASTECH(ch,scalv_ker_ft) scalv_p = bli_cntx_get_ukr_dt( dt, BLIS_SCALV_KER, cntx ); \
 \
 			scalv_p \
 			( \
@@ -105,7 +105,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 			/* Query the context for the kernel function pointer. */ \
 			const num_t              dt      = PASTEMAC(ch,type); \
-			PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \
+			PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \
 \
 			copyv_p \
 			( \
@@ -123,7 +123,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 			/* Query the context for the kernel function pointer. */ \
 			const num_t             dt     = PASTEMAC(ch,type); \
-			PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \
+			PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \
 \
 			addv_p \
 			( \
@@ -141,7 +141,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 			/* Query the context for the kernel function pointer. */ \
 			const num_t              dt      = PASTEMAC(ch,type); \
-			PASTECH(ch,xpbyv_ker_ft) xpbyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_XPBYV_KER, cntx ); \
+			PASTECH(ch,xpbyv_ker_ft) xpbyv_p = bli_cntx_get_ukr_dt( dt, BLIS_XPBYV_KER, cntx ); \
 \
 			xpbyv_p \
 			( \
@@ -163,7 +163,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 			/* Query the context for the kernel function pointer. */ \
 			const num_t               dt       = PASTEMAC(ch,type); \
-			PASTECH(ch,scal2v_ker_ft) scal2v_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCAL2V_KER, cntx ); \
+			PASTECH(ch,scal2v_ker_ft) scal2v_p = bli_cntx_get_ukr_dt( dt, BLIS_SCAL2V_KER, cntx ); \
 \
 			scal2v_p \
 			( \
@@ -182,7 +182,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 			/* Query the context for the kernel function pointer. */ \
 			const num_t              dt      = PASTEMAC(ch,type); \
-			PASTECH(ch,axpyv_ker_ft) axpyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+			PASTECH(ch,axpyv_ker_ft) axpyv_p = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 			axpyv_p \
 			( \
diff --git a/ref_kernels/1/bli_axpyv_ref.c b/ref_kernels/1/bli_axpyv_ref.c
index 31fece0a0..30076ddaf 100644
--- a/ref_kernels/1/bli_axpyv_ref.c
+++ b/ref_kernels/1/bli_axpyv_ref.c
@@ -58,7 +58,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	{ \
 		/* Query the context for the kernel function pointer. */ \
 		const num_t             dt     = PASTEMAC(ch,type); \
-		PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \
+		PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \
 \
 		addv_p \
 		( \
@@ -148,7 +148,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	{ \
 		/* Query the context for the kernel function pointer. */ \
 		const num_t             dt     = PASTEMAC(ch,type); \
-		PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \
+		PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \
 \
 		addv_p \
 		( \
diff --git a/ref_kernels/1/bli_scal2v_ref.c b/ref_kernels/1/bli_scal2v_ref.c
index 1dcb03839..ba0595990 100644
--- a/ref_kernels/1/bli_scal2v_ref.c
+++ b/ref_kernels/1/bli_scal2v_ref.c
@@ -57,7 +57,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 		/* Query the context for the kernel function pointer. */ \
 		const num_t             dt     = PASTEMAC(ch,type); \
-		PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SETV_KER, cntx ); \
+		PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \
 \
 		setv_p \
 		( \
@@ -75,7 +75,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 		/* Query the context for the kernel function pointer. */ \
 		const num_t              dt      = PASTEMAC(ch,type); \
-		PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \
+		PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \
 \
 		copyv_p \
 		( \
diff --git a/ref_kernels/1/bli_scalv_ref.c b/ref_kernels/1/bli_scalv_ref.c
index 4945b637b..3e6be7492 100644
--- a/ref_kernels/1/bli_scalv_ref.c
+++ b/ref_kernels/1/bli_scalv_ref.c
@@ -58,7 +58,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 		/* Query the context for the kernel function pointer. */ \
 		const num_t             dt     = PASTEMAC(ch,type); \
-		PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SETV_KER, cntx ); \
+		PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \
 \
 		setv_p \
 		( \
diff --git a/ref_kernels/1/bli_xpbyv_ref.c b/ref_kernels/1/bli_xpbyv_ref.c
index 8101023d4..28286a5f8 100644
--- a/ref_kernels/1/bli_xpbyv_ref.c
+++ b/ref_kernels/1/bli_xpbyv_ref.c
@@ -54,7 +54,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	{ \
 		/* Query the context for the kernel function pointer. */ \
 		const num_t              dt      = PASTEMAC(ch,type); \
-		PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \
+		PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \
 \
 		copyv_p \
 		( \
@@ -71,7 +71,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	{ \
 		/* Query the context for the kernel function pointer. */ \
 		const num_t             dt     = PASTEMAC(ch,type); \
-		PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \
+		PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \
 \
 		addv_p \
 		( \
diff --git a/ref_kernels/1f/bli_axpy2v_ref.c b/ref_kernels/1f/bli_axpy2v_ref.c
index 9c08c96f1..6439ff8b0 100644
--- a/ref_kernels/1f/bli_axpy2v_ref.c
+++ b/ref_kernels/1f/bli_axpy2v_ref.c
@@ -110,7 +110,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		const num_t              dt     = PASTEMAC(ch,type); \
 		PASTECH(ch,axpyv_ker_ft) kfp_av \
 		= \
-		bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+		bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 		kfp_av \
 		( \
diff --git a/ref_kernels/1f/bli_axpyf_ref.c b/ref_kernels/1f/bli_axpyf_ref.c
index f001108e2..5799a03a6 100644
--- a/ref_kernels/1f/bli_axpyf_ref.c
+++ b/ref_kernels/1f/bli_axpyf_ref.c
@@ -97,7 +97,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		const num_t              dt     = PASTEMAC(ch,type); \
 		PASTECH(ch,axpyv_ker_ft) kfp_av \
 		= \
-		bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+		bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 		for ( dim_t i = 0; i < b_n; ++i ) \
 		{ \
diff --git a/ref_kernels/1f/bli_dotaxpyv_ref.c b/ref_kernels/1f/bli_dotaxpyv_ref.c
index faeef5dea..42936c650 100644
--- a/ref_kernels/1f/bli_dotaxpyv_ref.c
+++ b/ref_kernels/1f/bli_dotaxpyv_ref.c
@@ -132,10 +132,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		const num_t              dt     = PASTEMAC(ch,type); \
 		PASTECH(ch,dotv_ker_ft)  kfp_dv \
 		= \
-		bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \
+		bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \
 		PASTECH(ch,axpyv_ker_ft) kfp_av \
 		= \
-		bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+		bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 		kfp_dv \
 		( \
diff --git a/ref_kernels/1f/bli_dotxaxpyf_ref.c b/ref_kernels/1f/bli_dotxaxpyf_ref.c
index c61217941..990133621 100644
--- a/ref_kernels/1f/bli_dotxaxpyf_ref.c
+++ b/ref_kernels/1f/bli_dotxaxpyf_ref.c
@@ -165,10 +165,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		const num_t              dt     = PASTEMAC(ch,type); \
 		PASTECH(ch,dotxf_ker_ft) kfp_df \
 		= \
-		bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
+		bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \
 		PASTECH(ch,axpyf_ker_ft) kfp_af \
 		= \
-		bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
+		bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \
 \
 		kfp_df \
 		( \
diff --git a/ref_kernels/1f/bli_dotxf_ref.c b/ref_kernels/1f/bli_dotxf_ref.c
index 33f5d1ba5..86781fd58 100644
--- a/ref_kernels/1f/bli_dotxf_ref.c
+++ b/ref_kernels/1f/bli_dotxf_ref.c
@@ -113,7 +113,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		const num_t              dt     = PASTEMAC(ch,type); \
 		PASTECH(ch,dotxv_ker_ft) kfp_dv \
 		= \
-		bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
+		bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
 \
 		for ( dim_t i = 0; i < b_n; ++i ) \
 		{ \
diff --git a/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c b/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c
index cc5852b37..e07090754 100644
--- a/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c
+++ b/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c
@@ -67,8 +67,8 @@ void PASTEMAC(ch,varname) \
 \
 	/* Query the context for the kernel function pointer. */ \
 	const num_t          dt     = PASTEMAC(ch,type); \
-	PASTECH(ch,dotxv_ft) kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
-	PASTECH(ch,axpyv_ft) kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	PASTECH(ch,dotxv_ft) kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
+	PASTECH(ch,axpyv_ft) kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	/* A is m x n.                   */ \
 	/* y = beta * y + alpha * A^T w; */ \
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
new file mode 100644
index 000000000..5cee5535b
--- /dev/null
+++ b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
@@ -0,0 +1,336 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define PACKM_SET1_1E( chr, mnk ) \
+do { /* Writes (1,0) through pi1_ri and (0,1) through pi1_ir at diagonal index mnk; pi1_ri, pi1_ir, dfac, d and ldp2 must be in scope at the expansion site. */ \
+	PASTEMAC(chr,set1s)( *(pi1_ri + ((mnk)*2 + 0)*dfac + d + (mnk)*ldp2) ); \
+	PASTEMAC(chr,set0s)( *(pi1_ri + ((mnk)*2 + 1)*dfac + d + (mnk)*ldp2) ); \
+	PASTEMAC(chr,set0s)( *(pi1_ir + ((mnk)*2 + 0)*dfac + d + (mnk)*ldp2) ); \
+	PASTEMAC(chr,set1s)( *(pi1_ir + ((mnk)*2 + 1)*dfac + d + (mnk)*ldp2) ); \
+} while (0)
+
+#define PACKM_SET1_1R( chr, mnk ) \
+do { /* Writes 1 through pi1_r and 0 through pi1_i at diagonal index mnk; pi1_r, pi1_i, dfac, d and ldp2 must be in scope at the expansion site. */ \
+	PASTEMAC(chr,set1s)( *(pi1_r + (mnk)*dfac + d + (mnk)*ldp2) ); \
+	PASTEMAC(chr,set0s)( *(pi1_i + (mnk)*dfac + d + (mnk)*ldp2) ); \
+} while (0)
+
+#define PACKM_SCAL_1E( ch, mn, k, op ) \
+do { /* Applies op to source element (mn,k) twice: with scalar (kappa_r,kappa_i) into pi1_ri and with (-kappa_i,kappa_r) into pi1_ir. Uses kappa_r, kappa_i, alpha1, inca2, lda2, pi1_ri, pi1_ir, dfac, d, ldp2 from the expansion site. */ \
+	PASTEMAC(ch,op)(  kappa_r, kappa_i, *(alpha1 +  (mn)       *inca2 + 0 + (k)*lda2), \
+	                                    *(alpha1 +  (mn)       *inca2 + 1 + (k)*lda2), \
+	                                    *(pi1_ri + ((mn)*2 + 0)*dfac  + d + (k)*ldp2), \
+	                                    *(pi1_ri + ((mn)*2 + 1)*dfac  + d + (k)*ldp2) ); \
+	PASTEMAC(ch,op)( -kappa_i, kappa_r, *(alpha1 +  (mn)       *inca2 + 0 + (k)*lda2), \
+	                                    *(alpha1 +  (mn)       *inca2 + 1 + (k)*lda2), \
+	                                    *(pi1_ir + ((mn)*2 + 0)*dfac  + d + (k)*ldp2), \
+	                                    *(pi1_ir + ((mn)*2 + 1)*dfac  + d + (k)*ldp2) ); \
+} while (0)
+
+#define PACKM_SCAL_1R( ch, mn, k, op ) \
+do { /* Applies op with scalar (kappa_r,kappa_i) to source element (mn,k), storing the two result parts through pi1_r and pi1_i. Uses kappa_r, kappa_i, alpha1, inca2, lda2, pi1_r, pi1_i, dfac, d, ldp2 from the expansion site. */ \
+	PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + (mn)*inca2 + 0 + (k)*lda2), \
+	                                   *(alpha1 + (mn)*inca2 + 1 + (k)*lda2), \
+	                                   *(pi1_r  + (mn)*dfac  + d + (k)*ldp2), \
+	                                   *(pi1_i  + (mn)*dfac  + d + (k)*ldp2) ); \
+} while (0)
+
+#define PACKM_DIAG_1E_BODY( ch, mn_min, mn_max, inca2_lu, lda2_lu, op ) \
+/* Runs PACKM_SCAL_1E with op for 0 <= k < cdim, mn_min <= mn < mn_max, 0 <= d < dfac. mn_min/mn_max are re-evaluated for every k and may mention k. */ \
+do \
+{ \
+	/* PACKM_SCAL_1E assumes inca2 and lda2 are the strides to use. */ \
+	inc_t inca2 = (inca2_lu); \
+	inc_t lda2 = (lda2_lu); \
+	for ( dim_t k = 0; k < cdim; k++ ) \
+	for ( dim_t mn = (mn_min); mn < (mn_max); mn++ ) \
+	for ( dim_t d = 0; d < dfac; d++ ) \
+		PACKM_SCAL_1E( ch, mn, k, op ); \
+} while(0)
+
+#define PACKM_DIAG_BODY_1E_L( ch, op ) /* strictly lower part: mn = k+1 .. cdim-1 for each k, strides inca_l2/lda_l2 */ \
+	PACKM_DIAG_1E_BODY( ch, k+1, cdim, inca_l2, lda_l2, op )
+
+#define PACKM_DIAG_BODY_1E_U( ch, op ) /* strictly upper part: mn = 0 .. k-1 for each k, strides inca_u2/lda_u2 */ \
+	PACKM_DIAG_1E_BODY( ch, 0, k, inca_u2, lda_u2, op )
+
+#define PACKM_DIAG_1R_BODY( ch, mn_min, mn_max, inca2_lu, lda2_lu, op ) \
+/* Runs PACKM_SCAL_1R with op for 0 <= k < cdim, mn_min <= mn < mn_max, 0 <= d < dfac. mn_min/mn_max are re-evaluated for every k and may mention k. */ \
+do \
+{ \
+	/* PACKM_SCAL_1R assumes inca2 and lda2 are the strides to use. */ \
+	inc_t inca2 = (inca2_lu); \
+	inc_t lda2 = (lda2_lu); \
+	for ( dim_t k = 0; k < cdim; k++ ) \
+	for ( dim_t mn = (mn_min); mn < (mn_max); mn++ ) \
+	for ( dim_t d = 0; d < dfac; d++ ) \
+		PACKM_SCAL_1R( ch, mn, k, op ); \
+} while(0)
+
+#define PACKM_DIAG_BODY_1R_L( ch, op ) /* strictly lower part: mn = k+1 .. cdim-1 for each k, strides inca_l2/lda_l2 */ \
+	PACKM_DIAG_1R_BODY( ch, k+1, cdim, inca_l2, lda_l2, op )
+
+#define PACKM_DIAG_BODY_1R_U( ch, op ) /* strictly upper part: mn = 0 .. k-1 for each k, strides inca_u2/lda_u2 */ \
+	PACKM_DIAG_1R_BODY( ch, 0, k, inca_u2, lda_u2, op )
+
+#undef  GENTFUNCCO
+#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr0, bb0, arch, suf ) \
+/* Packs a cdim x cdim block of a (assumed symmetric, hermitian, or triangular) into p in the 1e schema or, otherwise, the 1r schema: zero p, write the kappa-scaled strictly lower/upper parts, write the diagonal, optionally invert it, then pad the diagonal with ones up to min(cdim_max, n_max). */ \
+void PASTEMAC3(ch,opname,arch,suf) \
+     ( \
+       struc_t          struca, \
+       diag_t           diaga, \
+       uplo_t           uploa, \
+       conj_t           conja, \
+       pack_t           schema, \
+       bool             invdiag, \
+       dim_t            cdim, \
+       dim_t            n_max, \
+       ctype*  restrict kappa, \
+       ctype*  restrict a, inc_t inca, inc_t lda, \
+       ctype*  restrict p,             inc_t ldp, \
+       cntx_t* restrict cntx \
+     ) \
+{ \
+	const num_t dt_r      = PASTEMAC(chr,type); \
+	const dim_t cdim_pack = bli_cntx_get_blksz_max_dt( dt_r, mnr0, cntx ); \
+	const dim_t dfac      = bli_cntx_get_blksz_def_dt( dt_r, bb0, cntx ); \
+\
+	/* start by zeroing out the whole block */ \
+	PASTEMAC(chr,set0s_mxn) \
+	( \
+	  cdim_pack, \
+	  2*n_max, \
+	  ( ctype_r* )p, 1, ldp  \
+	); \
+\
+	const inc_t       inca2   = 2 * inca; \
+	const inc_t       lda2    = 2 * lda; \
+	const inc_t       ldp2    = 2 * ldp; \
+\
+	ctype_r           kappa_r = ( ( ctype_r* )kappa )[0]; \
+	ctype_r           kappa_i = ( ( ctype_r* )kappa )[1]; \
+	ctype_r* restrict alpha1  = ( ctype_r* )a; \
+\
+	if ( bli_is_1e_packed( schema ) ) \
+	{ \
+		const dim_t       cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ) / 2; \
+\
+		ctype_r* restrict pi1_ri   = ( ctype_r* )p; \
+		ctype_r* restrict pi1_ir   = ( ctype_r* )p + ldp; \
+\
+		/* write the strictly lower part if it exists */ \
+		if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \
+		{ \
+			inc_t  inca_l2 = inca2; \
+			inc_t  lda_l2  = lda2; \
+			conj_t conja_l = conja; \
+\
+			if ( bli_is_upper( uploa ) ) \
+			{ \
+				bli_swap_incs( &inca_l2, &lda_l2 ); \
+				if ( bli_is_hermitian( struca ) ) \
+				    bli_toggle_conj( &conja_l ); \
+			} \
+\
+			if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1E_L( ch, scal2jris ); \
+			else                          PACKM_DIAG_BODY_1E_L( ch, scal2ris ); \
+		} \
+\
+		/* write the strictly upper part if it exists */ \
+		/* assume either symmetric, hermitian, or triangular */ \
+		if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \
+		{ \
+			inc_t  inca_u2 = inca2; \
+			inc_t  lda_u2  = lda2; \
+			conj_t conja_u = conja; \
+\
+			if ( bli_is_lower( uploa ) ) \
+			{ \
+				bli_swap_incs( &inca_u2, &lda_u2 ); \
+				if ( bli_is_hermitian( struca ) ) \
+				    bli_toggle_conj( &conja_u ); \
+			} \
+\
+			if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1E_U( ch, scal2jris ); \
+			else                          PACKM_DIAG_BODY_1E_U( ch, scal2ris ); \
+		} \
+\
+		/* write the diagonal */ \
+		if ( bli_is_unit_diag( diaga ) ) \
+		{ \
+			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+				PACKM_SET1_1E( chr, mnk ); \
+		} \
+		else if ( bli_is_hermitian( struca ) ) \
+		{ \
+			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+			{ \
+				ctype_r mu_r = *(alpha1 + mnk*(inca2 + lda2)); \
+				PASTEMAC(chr,scal2s)(  kappa_r, mu_r, *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \
+				PASTEMAC(chr,scal2s)(  kappa_i, mu_r, *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \
+				PASTEMAC(chr,scal2s)( -kappa_i, mu_r, *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \
+				PASTEMAC(chr,scal2s)(  kappa_r, mu_r, *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \
+			} \
+		} \
+		else if ( bli_is_conj( conja ) ) \
+		{ \
+			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+				PACKM_SCAL_1E( ch, mnk, mnk, scal2jris ); \
+		} \
+		else \
+		{ \
+			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+				PACKM_SCAL_1E( ch, mnk, mnk, scal2ris ); \
+		} \
+\
+		/* invert the diagonal if requested */ \
+		if ( invdiag ) \
+		{ \
+			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+			{ \
+				PASTEMAC(ch,invertris)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2), \
+				                        *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \
+				PASTEMAC(ch,copyjris)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2), \
+				                       *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2), \
+				                       *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2), \
+				                       *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \
+			} \
+		} \
+\
+		/* if this is an edge case in both directions, extend the diagonal with ones */ \
+		for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \
+		for ( dim_t d = 0; d < dfac; ++d ) \
+			PACKM_SET1_1E( chr, mnk ); \
+	} \
+	else /* bli_is_1r_packed( schema ) */ \
+	{ \
+		const dim_t       cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ); \
+\
+		ctype_r* restrict pi1_r    = ( ctype_r* )p; \
+		ctype_r* restrict pi1_i    = ( ctype_r* )p + ldp; \
+\
+		/* write the strictly lower part if it exists */ \
+		if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \
+		{ \
+			inc_t  inca_l2 = inca2; \
+			inc_t  lda_l2  = lda2; \
+			conj_t conja_l = conja; \
+\
+			if ( bli_is_upper( uploa ) ) \
+			{ \
+				bli_swap_incs( &inca_l2, &lda_l2 ); \
+				if ( bli_is_hermitian( struca ) ) \
+				    bli_toggle_conj( &conja_l ); \
+			} \
+\
+			if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1R_L( ch, scal2jris ); \
+			else                          PACKM_DIAG_BODY_1R_L( ch, scal2ris ); \
+		} \
+\
+		/* write the strictly upper part if it exists */ \
+		/* assume either symmetric, hermitian, or triangular */ \
+		if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \
+		{ \
+			inc_t  inca_u2 = inca2; \
+			inc_t  lda_u2  = lda2; \
+			conj_t conja_u = conja; \
+\
+			if ( bli_is_lower( uploa ) ) \
+			{ \
+				bli_swap_incs( &inca_u2, &lda_u2 ); \
+				if ( bli_is_hermitian( struca ) ) \
+				    bli_toggle_conj( &conja_u ); \
+			} \
+\
+			if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1R_U( ch, scal2jris ); \
+			else                          PACKM_DIAG_BODY_1R_U( ch, scal2ris ); \
+		} \
+\
+		/* write the diagonal */ \
+		if ( bli_is_unit_diag( diaga ) ) \
+		{ \
+			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+				PACKM_SET1_1R( chr, mnk ); \
+		} \
+		else if ( bli_is_hermitian( struca ) ) \
+		{ \
+			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+			{ \
+				ctype_r mu_r = *(alpha1 + mnk*(inca2 + lda2)); \
+				PASTEMAC(chr,scal2s)( kappa_r, mu_r, *(pi1_r + mnk*(dfac + ldp2) + d) ); \
+				PASTEMAC(chr,scal2s)( kappa_i, mu_r, *(pi1_i + mnk*(dfac + ldp2) + d) ); \
+			} \
+		} \
+		else if ( bli_is_conj( conja ) ) \
+		{ \
+			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+				PACKM_SCAL_1R( ch, mnk, mnk, scal2jris ); \
+		} \
+		else \
+		{ \
+			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+				PACKM_SCAL_1R( ch, mnk, mnk, scal2ris ); \
+		} \
+\
+		/* invert the diagonal if requested */ \
+		if ( invdiag ) \
+		{ \
+			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+				PASTEMAC(ch,invertris)( *(pi1_r + mnk*(dfac + ldp2) + d), \
+				                        *(pi1_i + mnk*(dfac + ldp2) + d) ); \
+		} \
+\
+		/* if this is an edge case in both directions, extend the diagonal with ones */ \
+		for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \
+		for ( dim_t d = 0; d < dfac; ++d ) \
+			PACKM_SET1_1R( chr, mnk ); \
+	} \
+}
+
+INSERT_GENTFUNCCO_BASIC4( packm_mrxmr_diag_1er, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNCCO_BASIC4( packm_nrxnr_diag_1er, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ref.c
new file mode 100644
index 000000000..80ffcbc14
--- /dev/null
+++ b/ref_kernels/1m/bli_packm_cxc_diag_ref.c
@@ -0,0 +1,173 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define PACKM_DIAG_BODY( ctype, ch, mn_min, mn_max, inca, lda, op ) \
+/* Applies op( kappa_cast, source(mn,k), packed(mn,d,k) ) for 0 <= k < cdim, mn_min <= mn < mn_max, 0 <= d < dfac. mn_min/mn_max are re-evaluated for every k and may mention k; cdim, dfac, kappa_cast, alpha1, pi1, ldp come from the expansion site. */ \
+do \
+{ \
+	for ( dim_t k = 0; k < cdim; k++ ) \
+	for ( dim_t mn = (mn_min); mn < (mn_max); mn++ ) \
+	for ( dim_t d = 0; d < dfac; d++ ) \
+		PASTEMAC(ch,op)( kappa_cast, *(alpha1 + mn*(inca) + k*(lda)), *(pi1 + mn*dfac + d + k*ldp) ); \
+} while(0)
+
+#define PACKM_DIAG_BODY_L( ctype, ch, op ) /* strictly lower part: mn = k+1 .. cdim-1 for each k, strides inca_l/lda_l */ \
+	PACKM_DIAG_BODY( ctype, ch, k+1, cdim, inca_l, lda_l, op )
+
+#define PACKM_DIAG_BODY_U( ctype, ch, op ) /* strictly upper part: mn = 0 .. k-1 for each k, strides inca_u/lda_u */ \
+	PACKM_DIAG_BODY( ctype, ch, 0, k, inca_u, lda_u, op )
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \
+/* Packs a cdim x cdim block of a (assumed symmetric, hermitian, or triangular) into p: zero p, write the kappa-scaled strictly lower/upper parts (with inca/lda swapped, and conj toggled if hermitian, for the part not named by uploa), write the diagonal, optionally invert it, then pad the diagonal with ones up to min(cdim_max, n_max). */ \
+void PASTEMAC3(ch,opname,arch,suf) \
+     ( \
+       struc_t          struca, \
+       diag_t           diaga, \
+       uplo_t           uploa, \
+       conj_t           conja, \
+       pack_t           schema, \
+       bool             invdiag, \
+       dim_t            cdim, \
+       dim_t            n_max, \
+       ctype*  restrict kappa, \
+       ctype*  restrict a, inc_t inca, inc_t lda, \
+       ctype*  restrict p,             inc_t ldp, \
+       cntx_t* restrict cntx \
+     ) \
+{ \
+	const num_t dt        = PASTEMAC(ch,type); \
+	const dim_t cdim_max  = bli_cntx_get_blksz_def_dt( dt, mnr0, cntx ); \
+	const dim_t cdim_pack = bli_cntx_get_blksz_max_dt( dt, mnr0, cntx ); \
+	const dim_t dfac      = bli_cntx_get_blksz_def_dt( dt, bb0, cntx ); \
+\
+	/* start by zeroing out the whole block */ \
+	PASTEMAC(ch,set0s_mxn) \
+	( \
+	  cdim_pack, \
+	  n_max, \
+	  p, 1, ldp  \
+	); \
+\
+	ctype           kappa_cast = *( ctype* )kappa; \
+	ctype* restrict alpha1     = a; \
+	ctype* restrict pi1        = p; \
+\
+	/* write the strictly lower part if it exists */ \
+	if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \
+	{ \
+		inc_t  inca_l  = inca; \
+		inc_t  lda_l   = lda; \
+		conj_t conja_l = conja; \
+\
+		if ( bli_is_upper( uploa ) ) \
+		{ \
+			bli_swap_incs( &inca_l, &lda_l ); \
+			if ( bli_is_hermitian( struca ) ) \
+				bli_toggle_conj( &conja_l ); \
+		} \
+\
+		if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_L( ctype, ch, scal2js ); \
+		else                          PACKM_DIAG_BODY_L( ctype, ch, scal2s ); \
+	} \
+\
+	/* write the strictly upper part if it exists */ \
+	/* assume either symmetric, hermitian, or triangular */ \
+	if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \
+	{ \
+		inc_t  inca_u  = inca; \
+		inc_t  lda_u   = lda; \
+		conj_t conja_u = conja; \
+\
+		if ( bli_is_lower( uploa ) ) \
+		{ \
+			bli_swap_incs( &inca_u, &lda_u ); \
+			if ( bli_is_hermitian( struca ) ) \
+				bli_toggle_conj( &conja_u ); \
+		} \
+\
+		if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_U( ctype, ch, scal2js ); \
+		else                          PACKM_DIAG_BODY_U( ctype, ch, scal2s ); \
+	} \
+\
+	/* write the diagonal */ \
+	if ( bli_is_unit_diag( diaga ) ) \
+	{ \
+		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+		for ( dim_t d = 0; d < dfac; ++d ) \
+			PASTEMAC(ch,set1s)( *(pi1 + mnk*(dfac + ldp) + d) ); \
+	} \
+	else if ( bli_is_hermitian( struca ) ) \
+	{ \
+		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+		for ( dim_t d = 0; d < dfac; ++d ) \
+		{ \
+			ctype mu; \
+			PASTEMAC(ch,copys)( *(alpha1 + mnk*(inca + lda)), mu ); \
+			PASTEMAC(ch,seti0s)( mu ); \
+			PASTEMAC(ch,scal2s)( kappa_cast, mu, *(pi1 + mnk*(dfac + ldp) + d) ); \
+		} \
+	} \
+	else if ( bli_is_conj( conja ) ) \
+	{ \
+		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+		for ( dim_t d = 0; d < dfac; ++d ) \
+			PASTEMAC(ch,scal2js)( kappa_cast, *(alpha1 + mnk*(inca + lda)), *(pi1 + mnk*(dfac + ldp) + d) ); \
+	} \
+	else \
+	{ \
+		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+		for ( dim_t d = 0; d < dfac; ++d ) \
+			PASTEMAC(ch,scal2s)( kappa_cast, *(alpha1 + mnk*(inca + lda)), *(pi1 + mnk*(dfac + ldp) + d) ); \
+	} \
+\
+	/* invert the diagonal if requested */ \
+	if ( invdiag ) \
+	{ \
+		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+		for ( dim_t d = 0; d < dfac; ++d ) \
+			PASTEMAC(ch,inverts)( *(pi1 + mnk*(dfac + ldp) + d) ); \
+	} \
+\
+	/* if this is an edge case in both directions, extend the diagonal with ones */ \
+	for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \
+	for ( dim_t d = 0; d < dfac; ++d ) \
+		PASTEMAC(ch,set1s)( *(pi1 + mnk*(dfac + ldp) + d) ); \
+}
+
+INSERT_GENTFUNC_BASIC4( packm_mrxmr_diag, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC4( packm_nrxnr_diag, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+
diff --git a/ref_kernels/1m/bli_packm_cxk_1er_ref.c b/ref_kernels/1m/bli_packm_cxk_1er_ref.c
index 03ec46d14..56d8379be 100644
--- a/ref_kernels/1m/bli_packm_cxk_1er_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_1er_ref.c
@@ -34,458 +34,48 @@
 
 #include "blis.h"
 
-#undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \
+#define PACKM_1E_BODY( ctype, ch, pragma, cdim, inca2, op ) /* 1e packing loop nest: calls PASTEMAC(ch,op) twice per source element, writing through pi1_ri and pi1_ir. Besides its parameters it uses n, dfac, kappa_r, kappa_i, alpha1, pi1_ri, pi1_ir, lda2 and ldp2, which must be in scope where it is expanded; ctype is not referenced in the body. */ \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx  \
-     ) \
+do /* do/while(0) wrapper so the expansion behaves as a single statement */ \
 { \
-	if ( cdim == mnr ) \
-	{ \
-		if ( bli_is_1e_packed( schema ) ) \
-		{ \
-			const inc_t       inca1      = inca; \
-			const inc_t       lda1       = lda; \
-			const inc_t       ldp1       = ldp; \
-\
-			ctype*   restrict kappa_cast = ( ctype* )kappa; \
-			ctype*   restrict alpha1_ri  = ( ctype* )a; \
-			ctype*   restrict pi1_ri     = ( ctype* )p; \
-			ctype*   restrict pi1_ir     = ( ctype* )p + ldp1/2; \
-\
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-			} \
-			else \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_1r_packed( schema ) ) */ \
-		{ \
-			const inc_t       inca2      = 2 * inca; \
-			const inc_t       lda2       = 2 * lda; \
-			const inc_t       ldp2       = 2 * ldp; \
-\
-			ctype*            kappa_cast =             kappa; \
-			ctype_r* restrict kappa_r    = ( ctype_r* )kappa; \
-			ctype_r* restrict kappa_i    = ( ctype_r* )kappa + 1; \
-			ctype_r* restrict alpha1_r   = ( ctype_r* )a; \
-			ctype_r* restrict alpha1_i   = ( ctype_r* )a + 1; \
-			ctype_r* restrict pi1_r      = ( ctype_r* )p; \
-			ctype_r* restrict pi1_i      = ( ctype_r* )p + ldp; \
-\
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-			} \
-			else \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
+	for ( dim_t k = n; k != 0; --k ) /* n passes; the pointers are advanced at the bottom of each pass */ \
 	{ \
-		PASTEMAC(ch,scal21ms_mxn) \
-		( \
-		  schema, \
-		  conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p, 1,    ldp, ldp  \
-		); \
-\
-		/* if ( cdim < mnr ) */ \
+		pragma /* caller-supplied pragma token(s), placed directly before the mn loop */ \
+		for ( dim_t mn = 0; mn < cdim; ++mn ) /* mn walks the cdim source elements, inca2 slots apart */ \
+		for ( dim_t d = 0; d < dfac; ++d ) /* each result is stored at dfac consecutive offsets d */ \
 		{ \
-			ctype* restrict zero   = PASTEMAC(ch,0); \
-			const dim_t     offm   = cdim; \
-			const dim_t     offn   = 0; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
-\
-			PASTEMAC(ch,set1ms_mxn) \
-			( \
-			  schema, \
-			  offm, \
-			  offn, \
-			  m_edge, \
-			  n_edge, \
-			  zero, \
-			  p, 1, ldp, ldp  \
-			); \
+			PASTEMAC(ch,op)(  kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \
+			                                    *(pi1_ri + (mn*2 + 0)*dfac + d), *(pi1_ri + (mn*2 + 1)*dfac + d) ); /* op with ( kappa_r, kappa_i ); outputs go to pi1_ri */ \
+			PASTEMAC(ch,op)( -kappa_i, kappa_r, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \
+			                                    *(pi1_ir + (mn*2 + 0)*dfac + d), *(pi1_ir + (mn*2 + 1)*dfac + d) ); /* same source pair with ( -kappa_i, kappa_r ), i.e. the components of i*kappa; outputs go to pi1_ir */ \
 		} \
-	} \
-\
-	if ( n < n_max ) \
-	{ \
-		ctype* restrict zero   = PASTEMAC(ch,0); \
-		const dim_t     offm   = 0; \
-		const dim_t     offn   = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
 \
-		PASTEMAC(ch,set1ms_mxn) \
-		( \
-		  schema, \
-		  offm, \
-		  offn, \
-		  m_edge, \
-		  n_edge, \
-		  zero, \
-		  p, 1, ldp, ldp  \
-		); \
+		alpha1 += lda2; /* step the source pointer by lda2 for the next k */ \
+		pi1_ri += ldp2; /* step both destination pointers by ldp2 */ \
+		pi1_ir += ldp2; \
 	} \
-}
-
-INSERT_GENTFUNCCO_BASIC3( packm_2xk_1er, 2, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
+} while(0) /* no trailing semicolon here: the expansion site supplies it */
 
-#undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \
+#define PACKM_1R_BODY( ctype, ch, pragma, cdim, inca2, op ) /* 1r packing loop nest: calls PASTEMAC(ch,op) once per source element, writing one output through pi1_r and one through pi1_i. Besides its parameters it uses n, dfac, kappa_r, kappa_i, alpha1, pi1_r, pi1_i, lda2 and ldp2, which must be in scope where it is expanded; ctype is not referenced in the body. */ \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx  \
-     ) \
+do /* do/while(0) wrapper so the expansion behaves as a single statement */ \
 { \
-	if ( cdim == mnr ) \
+	for ( dim_t k = n; k != 0; --k ) /* n passes; the pointers are advanced at the bottom of each pass */ \
 	{ \
-		if ( bli_is_1e_packed( schema ) ) \
-		{ \
-			const inc_t       inca1      = inca; \
-			const inc_t       lda1       = lda; \
-			const inc_t       ldp1       = ldp; \
-\
-			ctype*   restrict kappa_cast = ( ctype* )kappa; \
-			ctype*   restrict alpha1_ri  = ( ctype* )a; \
-			ctype*   restrict pi1_ri     = ( ctype* )p; \
-			ctype*   restrict pi1_ir     = ( ctype* )p + ldp1/2; \
-\
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-			} \
-			else \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_1r_packed( schema ) ) */ \
-		{ \
-			const inc_t       inca2      = 2 * inca; \
-			const inc_t       lda2       = 2 * lda; \
-			const inc_t       ldp2       = 2 * ldp; \
+		pragma /* caller-supplied pragma token(s), placed directly before the mn loop */ \
+		for ( dim_t mn = 0; mn < cdim; ++mn ) /* mn walks the cdim source elements, inca2 slots apart */ \
+		for ( dim_t d = 0; d < dfac; ++d ) /* each result is stored at dfac consecutive offsets d */ \
+			PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \
+			                                   *(pi1_r + mn*dfac + d), *(pi1_i + mn*dfac + d) ); /* op with ( kappa_r, kappa_i ); one output goes to pi1_r, the other to pi1_i */ \
 \
-			ctype*            kappa_cast =             kappa; \
-			ctype_r* restrict kappa_r    = ( ctype_r* )kappa; \
-			ctype_r* restrict kappa_i    = ( ctype_r* )kappa + 1; \
-			ctype_r* restrict alpha1_r   = ( ctype_r* )a; \
-			ctype_r* restrict alpha1_i   = ( ctype_r* )a + 1; \
-			ctype_r* restrict pi1_r      = ( ctype_r* )p; \
-			ctype_r* restrict pi1_i      = ( ctype_r* )p + ldp; \
-\
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-			} \
-			else \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
-	{ \
-		PASTEMAC(ch,scal21ms_mxn) \
-		( \
-		  schema, \
-		  conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p, 1,    ldp, ldp  \
-		); \
-\
-		/* if ( cdim < mnr ) */ \
-		{ \
-			ctype* restrict zero   = PASTEMAC(ch,0); \
-			const dim_t     offm   = cdim; \
-			const dim_t     offn   = 0; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
-\
-			PASTEMAC(ch,set1ms_mxn) \
-			( \
-			  schema, \
-			  offm, \
-			  offn, \
-			  m_edge, \
-			  n_edge, \
-			  zero, \
-			  p, 1, ldp, ldp  \
-			); \
-		} \
+		alpha1 += lda2; /* step the source pointer by lda2 for the next k */ \
+		pi1_r  += ldp2; /* step both destination pointers by ldp2 */ \
+		pi1_i  += ldp2; \
 	} \
-\
-	if ( n < n_max ) \
-	{ \
-		ctype* restrict zero   = PASTEMAC(ch,0); \
-		const dim_t     offm   = 0; \
-		const dim_t     offn   = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
-\
-		PASTEMAC(ch,set1ms_mxn) \
-		( \
-		  schema, \
-		  offm, \
-		  offn, \
-		  m_edge, \
-		  n_edge, \
-		  zero, \
-		  p, 1, ldp, ldp  \
-		); \
-	} \
-}
-
-INSERT_GENTFUNCCO_BASIC3( packm_4xk_1er, 4, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
+} while(0) /* no trailing semicolon here: the expansion site supplies it */
 
 #undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \
+#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr0, bb0, arch, suf ) \
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
@@ -500,1719 +90,94 @@ void PASTEMAC3(ch,opname,arch,suf) \
        cntx_t* restrict cntx  \
      ) \
 { \
-	if ( cdim == mnr ) \
-	{ \
-		if ( bli_is_1e_packed( schema ) ) \
-		{ \
-			const inc_t       inca1      = inca; \
-			const inc_t       lda1       = lda; \
-			const inc_t       ldp1       = ldp; \
-\
-			ctype*   restrict kappa_cast = ( ctype* )kappa; \
-			ctype*   restrict alpha1_ri  = ( ctype* )a; \
-			ctype*   restrict pi1_ri     = ( ctype* )p; \
-			ctype*   restrict pi1_ir     = ( ctype* )p + ldp1/2; \
-\
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-			} \
-			else \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_1r_packed( schema ) ) */ \
-		{ \
-			const inc_t       inca2      = 2 * inca; \
-			const inc_t       lda2       = 2 * lda; \
-			const inc_t       ldp2       = 2 * ldp; \
-\
-			ctype*            kappa_cast =             kappa; \
-			ctype_r* restrict kappa_r    = ( ctype_r* )kappa; \
-			ctype_r* restrict kappa_i    = ( ctype_r* )kappa + 1; \
-			ctype_r* restrict alpha1_r   = ( ctype_r* )a; \
-			ctype_r* restrict alpha1_i   = ( ctype_r* )a + 1; \
-			ctype_r* restrict pi1_r      = ( ctype_r* )p; \
-			ctype_r* restrict pi1_i      = ( ctype_r* )p + ldp; \
-\
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-			} \
-			else \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
+	const dim_t dfac = PASTECH2(bb0, _, chr); \
+	const num_t dt_r = PASTEMAC(chr,type); \
 \
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
+	if ( bli_is_1e_packed( schema ) ) \
 	{ \
-		PASTEMAC(ch,scal21ms_mxn) \
-		( \
-		  schema, \
-		  conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p, 1,    ldp, ldp  \
-		); \
-\
-		/* if ( cdim < mnr ) */ \
-		{ \
-			ctype* restrict zero   = PASTEMAC(ch,0); \
-			const dim_t     offm   = cdim; \
-			const dim_t     offn   = 0; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
+		/* cdim and mnr are in units of complex values */ \
+		const dim_t mnr      = PASTECH2(mnr0, _, chr) == -1 ? -1 : PASTECH2(mnr0, _, chr) / 2; \
+		const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ) / 2; \
 \
-			PASTEMAC(ch,set1ms_mxn) \
-			( \
-			  schema, \
-			  offm, \
-			  offn, \
-			  m_edge, \
-			  n_edge, \
-			  zero, \
-			  p, 1, ldp, ldp  \
-			); \
-		} \
-	} \
+		const inc_t       inca2      = 2 * inca; \
+		const inc_t       lda2       = 2 * lda; \
+		const inc_t       ldp2       = 2 * ldp; \
 \
-	if ( n < n_max ) \
-	{ \
-		ctype* restrict zero   = PASTEMAC(ch,0); \
-		const dim_t     offm   = 0; \
-		const dim_t     offn   = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
+		ctype_r           kappa_r    = ( ( ctype_r* )kappa )[0]; \
+		ctype_r           kappa_i    = ( ( ctype_r* )kappa )[1]; \
+		ctype_r* restrict alpha1     = ( ctype_r* )a; \
+		ctype_r* restrict pi1_ri     = ( ctype_r* )p; \
+		ctype_r* restrict pi1_ir     = ( ctype_r* )p + ldp; \
 \
-		PASTEMAC(ch,set1ms_mxn) \
-		( \
-		  schema, \
-		  offm, \
-		  offn, \
-		  m_edge, \
-		  n_edge, \
-		  zero, \
-		  p, 1, ldp, ldp  \
-		); \
-	} \
-}
-
-INSERT_GENTFUNCCO_BASIC3( packm_6xk_1er, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-#undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx  \
-     ) \
-{ \
-	if ( cdim == mnr ) \
-	{ \
-		if ( bli_is_1e_packed( schema ) ) \
+		if ( cdim == mnr && mnr != -1 ) \
 		{ \
-			const inc_t       inca1      = inca; \
-			const inc_t       lda1       = lda; \
-			const inc_t       ldp1       = ldp; \
-\
-			ctype*   restrict kappa_cast = ( ctype* )kappa; \
-			ctype*   restrict alpha1_ri  = ( ctype* )a; \
-			ctype*   restrict pi1_ri     = ( ctype* )p; \
-			ctype*   restrict pi1_ir     = ( ctype* )p + ldp1/2; \
-\
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
+			if ( inca == 1 ) \
 			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
+				if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2jris ); \
+				else                        PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2ris ); \
 			} \
 			else \
 			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
+				if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2jris ); \
+				else                        PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2ris ); \
 			} \
 		} \
-		else /* if ( bli_is_1r_packed( schema ) ) */ \
+		else \
 		{ \
-			const inc_t       inca2      = 2 * inca; \
-			const inc_t       lda2       = 2 * lda; \
-			const inc_t       ldp2       = 2 * ldp; \
-\
-			ctype*            kappa_cast =             kappa; \
-			ctype_r* restrict kappa_r    = ( ctype_r* )kappa; \
-			ctype_r* restrict kappa_i    = ( ctype_r* )kappa + 1; \
-			ctype_r* restrict alpha1_r   = ( ctype_r* )a; \
-			ctype_r* restrict alpha1_i   = ( ctype_r* )a + 1; \
-			ctype_r* restrict pi1_r      = ( ctype_r* )p; \
-			ctype_r* restrict pi1_i      = ( ctype_r* )p + ldp; \
-\
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-			} \
-			else \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-			} \
+			if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctype, ch, , cdim, inca2, scal2jris ); \
+			else                        PACKM_1E_BODY( ctype, ch, , cdim, inca2, scal2ris ); \
 		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
-	{ \
-		PASTEMAC(ch,scal21ms_mxn) \
-		( \
-		  schema, \
-		  conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p, 1,    ldp, ldp  \
-		); \
-\
-		/* if ( cdim < mnr ) */ \
-		{ \
-			ctype* restrict zero   = PASTEMAC(ch,0); \
-			const dim_t     offm   = cdim; \
-			const dim_t     offn   = 0; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
 \
-			PASTEMAC(ch,set1ms_mxn) \
-			( \
-			  schema, \
-			  offm, \
-			  offn, \
-			  m_edge, \
-			  n_edge, \
-			  zero, \
-			  p, 1, ldp, ldp  \
-			); \
-		} \
-	} \
-\
-	if ( n < n_max ) \
-	{ \
-		ctype* restrict zero   = PASTEMAC(ch,0); \
-		const dim_t     offm   = 0; \
-		const dim_t     offn   = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
-\
-		PASTEMAC(ch,set1ms_mxn) \
+		PASTEMAC(chr,set0s_edge) \
 		( \
-		  schema, \
-		  offm, \
-		  offn, \
-		  m_edge, \
-		  n_edge, \
-		  zero, \
-		  p, 1, ldp, ldp  \
+		  2*cdim*dfac, 2*cdim_max*dfac, \
+		  2*n, 2*n_max, \
+		  ( ctype_r* )p, ldp  \
 		); \
 	} \
-}
-
-INSERT_GENTFUNCCO_BASIC3( packm_8xk_1er, 8, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-#undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx  \
-     ) \
-{ \
-	if ( cdim == mnr ) \
+	else /* ( bli_is_1r_packed( schema ) ) */ \
 	{ \
-		if ( bli_is_1e_packed( schema ) ) \
-		{ \
-			const inc_t       inca1      = inca; \
-			const inc_t       lda1       = lda; \
-			const inc_t       ldp1       = ldp; \
-\
-			ctype*   restrict kappa_cast = ( ctype* )kappa; \
-			ctype*   restrict alpha1_ri  = ( ctype* )a; \
-			ctype*   restrict pi1_ri     = ( ctype* )p; \
-			ctype*   restrict pi1_ir     = ( ctype* )p + ldp1/2; \
-\
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \
+		const dim_t mnr      = PASTECH2(mnr0, _, chr); \
+		const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ); \
 \
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-			} \
-			else \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \
+		const inc_t       inca2      = 2 * inca; \
+		const inc_t       lda2       = 2 * lda; \
+		const inc_t       ldp2       = 2 * ldp; \
 \
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \
+		ctype_r           kappa_r    = ( ( ctype_r* )kappa )[0]; \
+		ctype_r           kappa_i    = ( ( ctype_r* )kappa )[1]; \
+		ctype_r* restrict alpha1     = ( ctype_r* )a; \
+		ctype_r* restrict pi1_r      = ( ctype_r* )p; \
+		ctype_r* restrict pi1_i      = ( ctype_r* )p + ldp; \
 \
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_1r_packed( schema ) ) */ \
+		if ( cdim == mnr && mnr != -1 ) \
 		{ \
-			const inc_t       inca2      = 2 * inca; \
-			const inc_t       lda2       = 2 * lda; \
-			const inc_t       ldp2       = 2 * ldp; \
-\
-			ctype*            kappa_cast =             kappa; \
-			ctype_r* restrict kappa_r    = ( ctype_r* )kappa; \
-			ctype_r* restrict kappa_i    = ( ctype_r* )kappa + 1; \
-			ctype_r* restrict alpha1_r   = ( ctype_r* )a; \
-			ctype_r* restrict alpha1_i   = ( ctype_r* )a + 1; \
-			ctype_r* restrict pi1_r      = ( ctype_r* )p; \
-			ctype_r* restrict pi1_i      = ( ctype_r* )p + ldp; \
-\
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
+			if ( inca == 1 ) \
 			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
+				if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2jris ); \
+				else                        PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2ris ); \
 			} \
 			else \
 			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
+				if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2jris ); \
+				else                        PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2ris ); \
 			} \
 		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
-	{ \
-		PASTEMAC(ch,scal21ms_mxn) \
-		( \
-		  schema, \
-		  conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p, 1,    ldp, ldp  \
-		); \
-\
-		/* if ( cdim < mnr ) */ \
+		else \
 		{ \
-			ctype* restrict zero   = PASTEMAC(ch,0); \
-			const dim_t     offm   = cdim; \
-			const dim_t     offn   = 0; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
-\
-			PASTEMAC(ch,set1ms_mxn) \
-			( \
-			  schema, \
-			  offm, \
-			  offn, \
-			  m_edge, \
-			  n_edge, \
-			  zero, \
-			  p, 1, ldp, ldp  \
-			); \
+			if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, , cdim, inca2, scal2jris ); \
+			else                        PACKM_1R_BODY( ctype, ch, , cdim, inca2, scal2ris ); \
 		} \
-	} \
-\
-	if ( n < n_max ) \
-	{ \
-		ctype* restrict zero   = PASTEMAC(ch,0); \
-		const dim_t     offm   = 0; \
-		const dim_t     offn   = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
-\
-		PASTEMAC(ch,set1ms_mxn) \
-		( \
-		  schema, \
-		  offm, \
-		  offn, \
-		  m_edge, \
-		  n_edge, \
-		  zero, \
-		  p, 1, ldp, ldp  \
-		); \
-	} \
-}
-
-INSERT_GENTFUNCCO_BASIC3( packm_10xk_1er, 10, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-#undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx  \
-     ) \
-{ \
-	if ( cdim == mnr ) \
-	{ \
-		if ( bli_is_1e_packed( schema ) ) \
-		{ \
-			const inc_t       inca1      = inca; \
-			const inc_t       lda1       = lda; \
-			const inc_t       ldp1       = ldp; \
-\
-			ctype*   restrict kappa_cast = ( ctype* )kappa; \
-			ctype*   restrict alpha1_ri  = ( ctype* )a; \
-			ctype*   restrict pi1_ri     = ( ctype* )p; \
-			ctype*   restrict pi1_ir     = ( ctype* )p + ldp1/2; \
-\
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-			} \
-			else \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_1r_packed( schema ) ) */ \
-		{ \
-			const inc_t       inca2      = 2 * inca; \
-			const inc_t       lda2       = 2 * lda; \
-			const inc_t       ldp2       = 2 * ldp; \
-\
-			ctype*            kappa_cast =             kappa; \
-			ctype_r* restrict kappa_r    = ( ctype_r* )kappa; \
-			ctype_r* restrict kappa_i    = ( ctype_r* )kappa + 1; \
-			ctype_r* restrict alpha1_r   = ( ctype_r* )a; \
-			ctype_r* restrict alpha1_i   = ( ctype_r* )a + 1; \
-			ctype_r* restrict pi1_r      = ( ctype_r* )p; \
-			ctype_r* restrict pi1_i      = ( ctype_r* )p + ldp; \
-\
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-			} \
-			else \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
-	{ \
-		PASTEMAC(ch,scal21ms_mxn) \
-		( \
-		  schema, \
-		  conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p, 1,    ldp, ldp  \
-		); \
-\
-		/* if ( cdim < mnr ) */ \
-		{ \
-			ctype* restrict zero   = PASTEMAC(ch,0); \
-			const dim_t     offm   = cdim; \
-			const dim_t     offn   = 0; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
-\
-			PASTEMAC(ch,set1ms_mxn) \
-			( \
-			  schema, \
-			  offm, \
-			  offn, \
-			  m_edge, \
-			  n_edge, \
-			  zero, \
-			  p, 1, ldp, ldp  \
-			); \
-		} \
-	} \
-\
-	if ( n < n_max ) \
-	{ \
-		ctype* restrict zero   = PASTEMAC(ch,0); \
-		const dim_t     offm   = 0; \
-		const dim_t     offn   = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
-\
-		PASTEMAC(ch,set1ms_mxn) \
-		( \
-		  schema, \
-		  offm, \
-		  offn, \
-		  m_edge, \
-		  n_edge, \
-		  zero, \
-		  p, 1, ldp, ldp  \
-		); \
-	} \
-}
-
-INSERT_GENTFUNCCO_BASIC3( packm_12xk_1er, 12, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-#undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx  \
-     ) \
-{ \
-	if ( cdim == mnr ) \
-	{ \
-		if ( bli_is_1e_packed( schema ) ) \
-		{ \
-			const inc_t       inca1      = inca; \
-			const inc_t       lda1       = lda; \
-			const inc_t       ldp1       = ldp; \
-\
-			ctype*   restrict kappa_cast = ( ctype* )kappa; \
-			ctype*   restrict alpha1_ri  = ( ctype* )a; \
-			ctype*   restrict pi1_ri     = ( ctype* )p; \
-			ctype*   restrict pi1_ir     = ( ctype* )p + ldp1/2; \
-\
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-			} \
-			else \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_1r_packed( schema ) ) */ \
-		{ \
-			const inc_t       inca2      = 2 * inca; \
-			const inc_t       lda2       = 2 * lda; \
-			const inc_t       ldp2       = 2 * ldp; \
-\
-			ctype*            kappa_cast =             kappa; \
-			ctype_r* restrict kappa_r    = ( ctype_r* )kappa; \
-			ctype_r* restrict kappa_i    = ( ctype_r* )kappa + 1; \
-			ctype_r* restrict alpha1_r   = ( ctype_r* )a; \
-			ctype_r* restrict alpha1_i   = ( ctype_r* )a + 1; \
-			ctype_r* restrict pi1_r      = ( ctype_r* )p; \
-			ctype_r* restrict pi1_i      = ( ctype_r* )p + ldp; \
-\
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-			} \
-			else \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
-	{ \
-		PASTEMAC(ch,scal21ms_mxn) \
-		( \
-		  schema, \
-		  conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p, 1,    ldp, ldp  \
-		); \
-\
-		/* if ( cdim < mnr ) */ \
-		{ \
-			ctype* restrict zero   = PASTEMAC(ch,0); \
-			const dim_t     offm   = cdim; \
-			const dim_t     offn   = 0; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
-\
-			PASTEMAC(ch,set1ms_mxn) \
-			( \
-			  schema, \
-			  offm, \
-			  offn, \
-			  m_edge, \
-			  n_edge, \
-			  zero, \
-			  p, 1, ldp, ldp  \
-			); \
-		} \
-	} \
-\
-	if ( n < n_max ) \
-	{ \
-		ctype* restrict zero   = PASTEMAC(ch,0); \
-		const dim_t     offm   = 0; \
-		const dim_t     offn   = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
-\
-		PASTEMAC(ch,set1ms_mxn) \
-		( \
-		  schema, \
-		  offm, \
-		  offn, \
-		  m_edge, \
-		  n_edge, \
-		  zero, \
-		  p, 1, ldp, ldp  \
-		); \
-	} \
-}
-
-INSERT_GENTFUNCCO_BASIC3( packm_14xk_1er, 14, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-#undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx  \
-     ) \
-{ \
-	if ( cdim == mnr ) \
-	{ \
-		if ( bli_is_1e_packed( schema ) ) \
-		{ \
-			const inc_t       inca1      = inca; \
-			const inc_t       lda1       = lda; \
-			const inc_t       ldp1       = ldp; \
-\
-			ctype*   restrict kappa_cast = ( ctype* )kappa; \
-			ctype*   restrict alpha1_ri  = ( ctype* )a; \
-			ctype*   restrict pi1_ri     = ( ctype* )p; \
-			ctype*   restrict pi1_ir     = ( ctype* )p + ldp1/2; \
-\
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \
-						PASTEMAC(ch,copyj1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \
-						PASTEMAC(ch,copy1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-			} \
-			else \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \
-						PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \
-						PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \
-\
-						alpha1_ri += lda1; \
-						pi1_ri    += ldp1; \
-						pi1_ir    += ldp1; \
-					} \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_1r_packed( schema ) ) */ \
-		{ \
-			const inc_t       inca2      = 2 * inca; \
-			const inc_t       lda2       = 2 * lda; \
-			const inc_t       ldp2       = 2 * ldp; \
-\
-			ctype*            kappa_cast =             kappa; \
-			ctype_r* restrict kappa_r    = ( ctype_r* )kappa; \
-			ctype_r* restrict kappa_i    = ( ctype_r* )kappa + 1; \
-			ctype_r* restrict alpha1_r   = ( ctype_r* )a; \
-			ctype_r* restrict alpha1_i   = ( ctype_r* )a + 1; \
-			ctype_r* restrict pi1_r      = ( ctype_r* )p; \
-			ctype_r* restrict pi1_i      = ( ctype_r* )p + ldp; \
-\
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \
-						PASTEMAC(ch,copyjris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \
-						PASTEMAC(ch,copyris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-			} \
-			else \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \
-						PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-				else \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \
-						PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \
-\
-						alpha1_r += lda2; \
-						alpha1_i += lda2; \
-						pi1_r    += ldp2; \
-						pi1_i    += ldp2; \
-					} \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
-	{ \
-		PASTEMAC(ch,scal21ms_mxn) \
-		( \
-		  schema, \
-		  conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p, 1,    ldp, ldp  \
-		); \
-\
-		/* if ( cdim < mnr ) */ \
-		{ \
-			ctype* restrict zero   = PASTEMAC(ch,0); \
-			const dim_t     offm   = cdim; \
-			const dim_t     offn   = 0; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
-\
-			PASTEMAC(ch,set1ms_mxn) \
-			( \
-			  schema, \
-			  offm, \
-			  offn, \
-			  m_edge, \
-			  n_edge, \
-			  zero, \
-			  p, 1, ldp, ldp  \
-			); \
-		} \
-	} \
-\
-	if ( n < n_max ) \
-	{ \
-		ctype* restrict zero   = PASTEMAC(ch,0); \
-		const dim_t     offm   = 0; \
-		const dim_t     offn   = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
 \
-		PASTEMAC(ch,set1ms_mxn) \
+		PASTEMAC(chr,set0s_edge) \
 		( \
-		  schema, \
-		  offm, \
-		  offn, \
-		  m_edge, \
-		  n_edge, \
-		  zero, \
-		  p, 1, ldp, ldp  \
+		  cdim*dfac, cdim_max*dfac, \
+		  2*n, 2*n_max, \
+		  ( ctype_r* )p, ldp  \
 		); \
 	} \
 }
 
-INSERT_GENTFUNCCO_BASIC3( packm_16xk_1er, 16, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNCCO_BASIC4( packm_mrxk_1er, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNCCO_BASIC4( packm_nrxk_1er, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1m/bli_packm_cxk_bb_ref.c b/ref_kernels/1m/bli_packm_cxk_bb_ref.c
deleted file mode 100644
index e7498a735..000000000
--- a/ref_kernels/1m/bli_packm_cxk_bb_ref.c
+++ /dev/null
@@ -1,656 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-// -- 6xk, duplication factor 2 ------------------------------------------------
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict alpha1     = a; \
-	ctype* restrict pi1        = p; \
-\
-	const dim_t     dfac       = 2; \
-\
-	/* Handle the packing of B (column panel schemas) separately from packing
-	   of A (row panel schemas). */ \
-	if ( bli_is_col_packed( schema ) ) \
-	{ \
-		if ( cdim == mnr ) \
-		{ \
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 +  0) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 +  1) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 +  2) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 +  3) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 +  4) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 +  5) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 +  6) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 +  7) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 +  8) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 +  9) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 10) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 11) ); \
-\
-						alpha1 += lda; \
-						pi1    += ldp; \
-					} \
-				} \
-				else /* if ( bli_is_noconj( conja ) ) */ \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 +  0) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 +  1) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 +  2) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 +  3) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 +  4) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 +  5) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 +  6) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 +  7) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 +  8) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 +  9) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 10) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 11) ); \
-\
-						alpha1 += lda; \
-						pi1    += ldp; \
-					} \
-				} \
-			} \
-			else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 +  0) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 +  1) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 +  2) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 +  3) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 +  4) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 +  5) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 +  6) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 +  7) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 +  8) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 +  9) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 10) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 11) ); \
-\
-						alpha1 += lda; \
-						pi1    += ldp; \
-					} \
-				} \
-				else /* if ( bli_is_noconj( conja ) ) */ \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 +  0) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 +  1) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 +  2) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 +  3) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 +  4) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 +  5) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 +  6) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 +  7) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 +  8) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 +  9) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 10) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 11) ); \
-\
-						alpha1 += lda; \
-						pi1    += ldp; \
-					} \
-				} \
-			} \
-		} \
-		else /* if ( cdim < mnr ) */ \
-		{ \
-			PASTEMAC(ch,scal2bbs_mxn) \
-			( \
-			  conja, \
-			  cdim, \
-			  n, \
-			  kappa, \
-			  a, inca, lda, \
-			  p, dfac, ldp  \
-			); \
-\
-			/* if ( cdim < mnr ) */ \
-			{ \
-				const dim_t     i      = cdim; \
-				const dim_t     m_edge = mnr - cdim; \
-				const dim_t     n_edge = n_max; \
-				ctype* restrict p_cast = p; \
-				ctype* restrict p_edge = p_cast + (i  )*dfac; \
-\
-				PASTEMAC(ch,set0bbs_mxn) \
-				( \
-				  m_edge, \
-				  n_edge, \
-				  p_edge, dfac, ldp  \
-				); \
-			} \
-		} \
-\
-		if ( n < n_max ) \
-		{ \
-			const dim_t     j      = n; \
-			const dim_t     m_edge = mnr; \
-			const dim_t     n_edge = n_max - n; \
-			ctype* restrict p_cast = p; \
-			ctype* restrict p_edge = p_cast + (j  )*ldp; \
-\
-			PASTEMAC(ch,set0bbs_mxn) \
-			( \
-			  m_edge, \
-			  n_edge, \
-			  p_edge, dfac, ldp  \
-			); \
-		} \
-	} \
-	else /* if ( bli_is_row_packed( schema ) ) */ \
-	{ \
-		if ( cdim == mnr ) \
-		{ \
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-\
-						alpha1 += lda; \
-						pi1    += ldp; \
-					} \
-				} \
-				else /* if ( bli_is_noconj( conja ) ) */ \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-\
-						alpha1 += lda; \
-						pi1    += ldp; \
-					} \
-				} \
-			} \
-			else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-\
-						alpha1 += lda; \
-						pi1    += ldp; \
-					} \
-				} \
-				else /* if ( bli_is_noconj( conja ) ) */ \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-\
-						alpha1 += lda; \
-						pi1    += ldp; \
-					} \
-				} \
-			} \
-		} \
-		else /* if ( cdim < mnr ) */ \
-		{ \
-			PASTEMAC(ch,scal2s_mxn) \
-			( \
-			  conja, \
-			  cdim, \
-			  n, \
-			  kappa, \
-			  a, inca, lda, \
-			  p, 1,    ldp  \
-			); \
-\
-			/* if ( cdim < mnr ) */ \
-			{ \
-				const dim_t     i      = cdim; \
-				const dim_t     m_edge = mnr - cdim; \
-				const dim_t     n_edge = n_max; \
-				ctype* restrict p_cast = p; \
-				ctype* restrict p_edge = p_cast + (i  )*1; \
-\
-				PASTEMAC(ch,set0s_mxn) \
-				( \
-				  m_edge, \
-				  n_edge, \
-				  p_edge, 1, ldp  \
-				); \
-			} \
-		} \
-\
-		if ( n < n_max ) \
-		{ \
-			const dim_t     j      = n; \
-			const dim_t     m_edge = mnr; \
-			const dim_t     n_edge = n_max - n; \
-			ctype* restrict p_cast = p; \
-			ctype* restrict p_edge = p_cast + (j  )*ldp; \
-\
-			PASTEMAC(ch,set0s_mxn) \
-			( \
-			  m_edge, \
-			  n_edge, \
-			  p_edge, 1, ldp  \
-			); \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC3( packm_6xk_bb2, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-// -- 6xk, duplication factor 4 ------------------------------------------------
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict alpha1     = a; \
-	ctype* restrict pi1        = p; \
-\
-	const dim_t     dfac       = 4; \
-\
-	/* Handle the packing of B (column panel schemas) separately from packing
-	   of A (row panel schemas). */ \
-	if ( bli_is_col_packed( schema ) ) \
-	{ \
-		if ( cdim == mnr ) \
-		{ \
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 +  0) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 +  1) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 +  2) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 +  3) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 +  4) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 +  5) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 +  6) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 +  7) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 +  8) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 +  9) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 10) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 11) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 12) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 13) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 14) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 15) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 16) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 17) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 18) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 19) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 20) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 21) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 22) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 23) ); \
-\
-						alpha1 += lda; \
-						pi1    += ldp; \
-					} \
-				} \
-				else /* if ( bli_is_noconj( conja ) ) */ \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 +  0) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 +  1) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 +  2) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 +  3) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 +  4) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 +  5) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 +  6) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 +  7) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 +  8) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 +  9) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 10) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 11) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 12) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 13) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 14) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 15) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 16) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 17) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 18) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 19) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 20) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 21) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 22) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 23) ); \
-\
-						alpha1 += lda; \
-						pi1    += ldp; \
-					} \
-				} \
-			} \
-			else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 +  0) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 +  1) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 +  2) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 +  3) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 +  4) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 +  5) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 +  6) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 +  7) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 +  8) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 +  9) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 10) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 11) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 12) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 13) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 14) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 15) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 16) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 17) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 18) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 19) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 20) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 21) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 22) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 23) ); \
-\
-						alpha1 += lda; \
-						pi1    += ldp; \
-					} \
-				} \
-				else /* if ( bli_is_noconj( conja ) ) */ \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 +  0) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 +  1) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 +  2) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 +  3) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 +  4) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 +  5) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 +  6) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 +  7) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 +  8) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 +  9) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 10) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 11) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 12) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 13) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 14) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 15) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 16) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 17) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 18) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 19) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 20) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 21) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 22) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 23) ); \
-\
-						alpha1 += lda; \
-						pi1    += ldp; \
-					} \
-				} \
-			} \
-		} \
-		else /* if ( cdim < mnr ) */ \
-		{ \
-			PASTEMAC(ch,scal2bbs_mxn) \
-			( \
-			  conja, \
-			  cdim, \
-			  n, \
-			  kappa, \
-			  a, inca, lda, \
-			  p, dfac, ldp  \
-			); \
-\
-			/* if ( cdim < mnr ) */ \
-			{ \
-				const dim_t     i      = cdim; \
-				const dim_t     m_edge = mnr - cdim; \
-				const dim_t     n_edge = n_max; \
-				ctype* restrict p_cast = p; \
-				ctype* restrict p_edge = p_cast + (i  )*dfac; \
-\
-				PASTEMAC(ch,set0bbs_mxn) \
-				( \
-				  m_edge, \
-				  n_edge, \
-				  p_edge, dfac, ldp  \
-				); \
-			} \
-		} \
-\
-		if ( n < n_max ) \
-		{ \
-			const dim_t     j      = n; \
-			const dim_t     m_edge = mnr; \
-			const dim_t     n_edge = n_max - n; \
-			ctype* restrict p_cast = p; \
-			ctype* restrict p_edge = p_cast + (j  )*ldp; \
-\
-			PASTEMAC(ch,set0bbs_mxn) \
-			( \
-			  m_edge, \
-			  n_edge, \
-			  p_edge, dfac, ldp  \
-			); \
-		} \
-	} \
-	else /* if ( bli_is_row_packed( schema ) ) */ \
-	{ \
-		if ( cdim == mnr ) \
-		{ \
-			if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-						PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-\
-						alpha1 += lda; \
-						pi1    += ldp; \
-					} \
-				} \
-				else /* if ( bli_is_noconj( conja ) ) */ \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-						PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-\
-						alpha1 += lda; \
-						pi1    += ldp; \
-					} \
-				} \
-			} \
-			else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \
-			{ \
-				if ( bli_is_conj( conja ) ) \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-						PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-\
-						alpha1 += lda; \
-						pi1    += ldp; \
-					} \
-				} \
-				else /* if ( bli_is_noconj( conja ) ) */ \
-				{ \
-					for ( dim_t k = n; k != 0; --k ) \
-					{ \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-						PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-\
-						alpha1 += lda; \
-						pi1    += ldp; \
-					} \
-				} \
-			} \
-		} \
-		else /* if ( cdim < mnr ) */ \
-		{ \
-			PASTEMAC(ch,scal2s_mxn) \
-			( \
-			  conja, \
-			  cdim, \
-			  n, \
-			  kappa, \
-			  a, inca, lda, \
-			  p, 1,    ldp  \
-			); \
-\
-			/* if ( cdim < mnr ) */ \
-			{ \
-				const dim_t     i      = cdim; \
-				const dim_t     m_edge = mnr - cdim; \
-				const dim_t     n_edge = n_max; \
-				ctype* restrict p_cast = p; \
-				ctype* restrict p_edge = p_cast + (i  )*1; \
-\
-				PASTEMAC(ch,set0s_mxn) \
-				( \
-				  m_edge, \
-				  n_edge, \
-				  p_edge, 1, ldp  \
-				); \
-			} \
-		} \
-\
-		if ( n < n_max ) \
-		{ \
-			const dim_t     j      = n; \
-			const dim_t     m_edge = mnr; \
-			const dim_t     n_edge = n_max - n; \
-			ctype* restrict p_cast = p; \
-			ctype* restrict p_edge = p_cast + (j  )*ldp; \
-\
-			PASTEMAC(ch,set0s_mxn) \
-			( \
-			  m_edge, \
-			  n_edge, \
-			  p_edge, 1, ldp  \
-			); \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC3( packm_6xk_bb4, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c
index c98f1b250..eefdb464b 100644
--- a/ref_kernels/1m/bli_packm_cxk_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_ref.c
@@ -34,469 +34,24 @@
 
 #include "blis.h"
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict alpha1     = a; \
-	ctype* restrict pi1        = p; \
-\
-	dim_t           n_iter     = n / 4; \
-	dim_t           n_left     = n % 4; \
-\
-	if ( cdim == mnr ) \
-	{ \
-		if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( ; n_iter != 0; --n_iter ) \
-				{ \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \
-\
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \
-\
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 2*lda), *(pi1 + 0 + 2*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 2*lda), *(pi1 + 1 + 2*ldp) ); \
-\
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 3*lda), *(pi1 + 0 + 3*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 3*lda), *(pi1 + 1 + 3*ldp) ); \
-\
-					alpha1 += 4*lda; \
-					pi1    += 4*ldp; \
-				} \
-\
-				for ( ; n_left != 0; --n_left ) \
-				{ \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
+#define PACKM_BODY( ctype, ch, pragma, cdim, inca, op ) \
 \
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-		else \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
-	{ \
-		PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
-		( \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  ( trans_t )conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p,    1, ldp, \
-		  cntx, \
-		  NULL  \
-		); \
-\
-		/* if ( cdim < mnr ) */ \
-		{ \
-			const dim_t     i      = cdim; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
-			ctype* restrict p_cast = p; \
-			ctype* restrict p_edge = p_cast + (i  )*1; \
-\
-			PASTEMAC(ch,set0s_mxn) \
-			( \
-			  m_edge, \
-			  n_edge, \
-			  p_edge, 1, ldp  \
-			); \
-		} \
-	} \
-\
-	if ( n < n_max ) \
-	{ \
-		const dim_t     j      = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
-		ctype* restrict p_cast = p; \
-		ctype* restrict p_edge = p_cast + (j  )*ldp; \
-\
-		PASTEMAC(ch,set0s_mxn) \
-		( \
-		  m_edge, \
-		  n_edge, \
-		  p_edge, 1, ldp  \
-		); \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC3( packm_2xk, 2, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict alpha1     = a; \
-	ctype* restrict pi1        = p; \
-\
-	dim_t           n_iter     = n / 4; \
-	dim_t           n_left     = n % 4; \
-\
-	if ( cdim == mnr ) \
-	{ \
-		if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( ; n_iter != 0; --n_iter ) \
-				{ \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 0*lda), *(pi1 + 2 + 0*ldp) ); \
-\
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 1*lda), *(pi1 + 2 + 1*ldp) ); \
-\
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 2*lda), *(pi1 + 0 + 2*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 2*lda), *(pi1 + 1 + 2*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 2*lda), *(pi1 + 2 + 2*ldp) ); \
-\
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 3*lda), *(pi1 + 0 + 3*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 3*lda), *(pi1 + 1 + 3*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 3*lda), *(pi1 + 2 + 3*ldp) ); \
-\
-					alpha1 += 4*lda; \
-					pi1    += 4*ldp; \
-				} \
-\
-				for ( ; n_left != 0; --n_left ) \
-				{ \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-		else \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
-	{ \
-		PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
-		( \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  ( trans_t )conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p,    1, ldp, \
-		  cntx, \
-		  NULL  \
-		); \
-\
-		/* if ( cdim < mnr ) */ \
-		{ \
-			const dim_t     i      = cdim; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
-			ctype* restrict p_cast = p; \
-			ctype* restrict p_edge = p_cast + (i  )*1; \
-\
-			PASTEMAC(ch,set0s_mxn) \
-			( \
-			  m_edge, \
-			  n_edge, \
-			  p_edge, 1, ldp  \
-			); \
-		} \
-	} \
-\
-	if ( n < n_max ) \
-	{ \
-		const dim_t     j      = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
-		ctype* restrict p_cast = p; \
-		ctype* restrict p_edge = p_cast + (j  )*ldp; \
-\
-		PASTEMAC(ch,set0s_mxn) \
-		( \
-		  m_edge, \
-		  n_edge, \
-		  p_edge, 1, ldp  \
-		); \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC3( packm_3xk, 3, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx \
-     ) \
+do \
 { \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict alpha1     = a; \
-	ctype* restrict pi1        = p; \
-\
-	dim_t           n_iter     = n / 2; \
-	dim_t           n_left     = n % 2; \
-\
-	if ( cdim == mnr ) \
-	{ \
-		if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( ; n_iter != 0; --n_iter ) \
-				{ \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 0*lda), *(pi1 + 2 + 0*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca + 0*lda), *(pi1 + 3 + 0*ldp) ); \
-\
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 1*lda), *(pi1 + 2 + 1*ldp) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca + 1*lda), *(pi1 + 3 + 1*ldp) ); \
-\
-					alpha1 += 2*lda; \
-					pi1    += 2*ldp; \
-				} \
-\
-				for ( ; n_left != 0; --n_left ) \
-				{ \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-		else \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
+	for ( dim_t k = n; k != 0; --k ) \
 	{ \
-		PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
-		( \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  ( trans_t )conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p,    1, ldp, \
-		  cntx, \
-		  NULL  \
-		); \
+		pragma \
+		for ( dim_t mn = 0; mn < cdim; mn++ ) \
+		for ( dim_t d = 0; d < dfac; d++ ) \
+			PASTEMAC(ch,op)( kappa_cast, *(alpha1 + mn*inca), *(pi1 + mn*dfac + d) ); \
 \
-		/* if ( cdim < mnr ) */ \
-		{ \
-			const dim_t     i      = cdim; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
-			ctype* restrict p_cast = p; \
-			ctype* restrict p_edge = p_cast + (i  )*1; \
-\
-			PASTEMAC(ch,set0s_mxn) \
-			( \
-			  m_edge, \
-			  n_edge, \
-			  p_edge, 1, ldp  \
-			); \
-		} \
-	} \
-\
-	if ( n < n_max ) \
-	{ \
-		const dim_t     j      = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
-		ctype* restrict p_cast = p; \
-		ctype* restrict p_edge = p_cast + (j  )*ldp; \
-\
-		PASTEMAC(ch,set0s_mxn) \
-		( \
-		  m_edge, \
-		  n_edge, \
-		  p_edge, 1, ldp  \
-		); \
+		alpha1 += lda; \
+		pi1    += ldp; \
 	} \
-}
-
-INSERT_GENTFUNC_BASIC3( packm_4xk, 4, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
+} while(0)
 
 #undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
+#define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
@@ -511,1212 +66,42 @@ void PASTEMAC3(ch,opname,arch,suf) \
        cntx_t* restrict cntx \
      ) \
 { \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict alpha1     = a; \
-	ctype* restrict pi1        = p; \
-\
-	if ( cdim == mnr ) \
-	{ \
-		if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-		else \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
-	{ \
-		PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
-		( \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  ( trans_t )conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p,    1, ldp, \
-		  cntx, \
-		  NULL  \
-		); \
+	const dim_t     mnr        = PASTECH2(mnr0, _, ch); \
+	const num_t     dt         = PASTEMAC(ch,type); \
+	const dim_t     cdim_max   = bli_cntx_get_blksz_def_dt( dt, mnr0, cntx ); \
+	const dim_t     dfac       = PASTECH2(bb0, _, ch); \
 \
-		/* if ( cdim < mnr ) */ \
-		{ \
-			const dim_t     i      = cdim; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
-			ctype* restrict p_cast = p; \
-			ctype* restrict p_edge = p_cast + (i  )*1; \
-\
-			PASTEMAC(ch,set0s_mxn) \
-			( \
-			  m_edge, \
-			  n_edge, \
-			  p_edge, 1, ldp  \
-			); \
-		} \
-	} \
-\
-	if ( n < n_max ) \
-	{ \
-		const dim_t     j      = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
-		ctype* restrict p_cast = p; \
-		ctype* restrict p_edge = p_cast + (j  )*ldp; \
-\
-		PASTEMAC(ch,set0s_mxn) \
-		( \
-		  m_edge, \
-		  n_edge, \
-		  p_edge, 1, ldp  \
-		); \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC3( packm_6xk, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
+	ctype           kappa_cast = *( ctype* )kappa; \
 	ctype* restrict alpha1     = a; \
 	ctype* restrict pi1        = p; \
 \
-	dim_t           n_iter     = n / 2; \
-	dim_t           n_left     = n % 2; \
-\
-	if ( cdim == mnr ) \
+	if ( cdim == mnr && mnr != -1 ) \
 	{ \
-		if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
+		if ( inca == 1 ) \
 		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( ; n_iter != 0; --n_iter ) \
-				{ \
-					PASTEMAC(ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 2*inca + 0*lda), *(pi1 + 2 + 0*ldp) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 3*inca + 0*lda), *(pi1 + 3 + 0*ldp) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 4*inca + 0*lda), *(pi1 + 4 + 0*ldp) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 5*inca + 0*lda), *(pi1 + 5 + 0*ldp) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 6*inca + 0*lda), *(pi1 + 6 + 0*ldp) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 7*inca + 0*lda), *(pi1 + 7 + 0*ldp) ); \
-\
-					PASTEMAC(ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 2*inca + 1*lda), *(pi1 + 2 + 1*ldp) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 3*inca + 1*lda), *(pi1 + 3 + 1*ldp) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 4*inca + 1*lda), *(pi1 + 4 + 1*ldp) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 5*inca + 1*lda), *(pi1 + 5 + 1*ldp) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 6*inca + 1*lda), *(pi1 + 6 + 1*ldp) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 7*inca + 1*lda), *(pi1 + 7 + 1*ldp) ); \
-\
-					alpha1 += 2*lda; \
-					pi1    += 2*ldp; \
-				} \
-\
-				for ( ; n_left != 0; --n_left ) \
-				{ \
-					PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
+			if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2js ); \
+			else                        PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2s ); \
 		} \
 		else \
 		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
+			if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2js ); \
+			else                        PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2s ); \
 		} \
 	} \
 	else /* if ( cdim < mnr ) */ \
 	{ \
-		PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
-		( \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  ( trans_t )conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p,    1, ldp, \
-		  cntx, \
-		  NULL  \
-		); \
-\
-		/* if ( cdim < mnr ) */ \
-		{ \
-			const dim_t     i      = cdim; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
-			ctype* restrict p_cast = p; \
-			ctype* restrict p_edge = p_cast + (i  )*1; \
-\
-			PASTEMAC(ch,set0s_mxn) \
-			( \
-			  m_edge, \
-			  n_edge, \
-			  p_edge, 1, ldp  \
-			); \
-		} \
-	} \
-\
-	if ( n < n_max ) \
-	{ \
-		const dim_t     j      = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
-		ctype* restrict p_cast = p; \
-		ctype* restrict p_edge = p_cast + (j  )*ldp; \
-\
-		PASTEMAC(ch,set0s_mxn) \
-		( \
-		  m_edge, \
-		  n_edge, \
-		  p_edge, 1, ldp  \
-		); \
+		if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, , cdim, inca, scal2js ); \
+		else                        PACKM_BODY( ctype, ch, , cdim, inca, scal2s ); \
 	} \
-}
-
-INSERT_GENTFUNC_BASIC3( packm_8xk, 8, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict alpha1     = a; \
-	ctype* restrict pi1        = p; \
-\
-	if ( cdim == mnr ) \
-	{ \
-		if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-		else \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
-	{ \
-		PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
-		( \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  ( trans_t )conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p,    1, ldp, \
-		  cntx, \
-		  NULL  \
-		); \
-\
-		/* if ( cdim < mnr ) */ \
-		{ \
-			const dim_t     i      = cdim; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
-			ctype* restrict p_cast = p; \
-			ctype* restrict p_edge = p_cast + (i  )*1; \
-\
-			PASTEMAC(ch,set0s_mxn) \
-			( \
-			  m_edge, \
-			  n_edge, \
-			  p_edge, 1, ldp  \
-			); \
-		} \
-	} \
-\
-	if ( n < n_max ) \
-	{ \
-		const dim_t     j      = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
-		ctype* restrict p_cast = p; \
-		ctype* restrict p_edge = p_cast + (j  )*ldp; \
-\
-		PASTEMAC(ch,set0s_mxn) \
-		( \
-		  m_edge, \
-		  n_edge, \
-		  p_edge, 1, ldp  \
-		); \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC3( packm_10xk, 10, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict alpha1     = a; \
-	ctype* restrict pi1        = p; \
-\
-	if ( cdim == mnr ) \
-	{ \
-		if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-		else \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
-	{ \
-		PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
-		( \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  ( trans_t )conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p,    1, ldp, \
-		  cntx, \
-		  NULL  \
-		); \
-\
-		/* if ( cdim < mnr ) */ \
-		{ \
-			const dim_t     i      = cdim; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
-			ctype* restrict p_cast = p; \
-			ctype* restrict p_edge = p_cast + (i  )*1; \
-\
-			PASTEMAC(ch,set0s_mxn) \
-			( \
-			  m_edge, \
-			  n_edge, \
-			  p_edge, 1, ldp  \
-			); \
-		} \
-	} \
-\
-	if ( n < n_max ) \
-	{ \
-		const dim_t     j      = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
-		ctype* restrict p_cast = p; \
-		ctype* restrict p_edge = p_cast + (j  )*ldp; \
-\
-		PASTEMAC(ch,set0s_mxn) \
-		( \
-		  m_edge, \
-		  n_edge, \
-		  p_edge, 1, ldp  \
-		); \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC3( packm_12xk, 12, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict alpha1     = a; \
-	ctype* restrict pi1        = p; \
-\
-	if ( cdim == mnr ) \
-	{ \
-		if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-		else \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
-	{ \
-		PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
-		( \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  ( trans_t )conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p,    1, ldp, \
-		  cntx, \
-		  NULL  \
-		); \
-\
-		/* if ( cdim < mnr ) */ \
-		{ \
-			const dim_t     i      = cdim; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
-			ctype* restrict p_cast = p; \
-			ctype* restrict p_edge = p_cast + (i  )*1; \
-\
-			PASTEMAC(ch,set0s_mxn) \
-			( \
-			  m_edge, \
-			  n_edge, \
-			  p_edge, 1, ldp  \
-			); \
-		} \
-	} \
-\
-	if ( n < n_max ) \
-	{ \
-		const dim_t     j      = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
-		ctype* restrict p_cast = p; \
-		ctype* restrict p_edge = p_cast + (j  )*ldp; \
-\
-		PASTEMAC(ch,set0s_mxn) \
-		( \
-		  m_edge, \
-		  n_edge, \
-		  p_edge, 1, ldp  \
-		); \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC3( packm_14xk, 14, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict alpha1     = a; \
-	ctype* restrict pi1        = p; \
-\
-	if ( cdim == mnr ) \
-	{ \
-		if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +14*inca), *(pi1 +14) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +15*inca), *(pi1 +15) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +14*inca), *(pi1 +14) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +15*inca), *(pi1 +15) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-		else \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
-	{ \
-		PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
-		( \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  ( trans_t )conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p,    1, ldp, \
-		  cntx, \
-		  NULL  \
-		); \
-\
-		/* if ( cdim < mnr ) */ \
-		{ \
-			const dim_t     i      = cdim; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
-			ctype* restrict p_cast = p; \
-			ctype* restrict p_edge = p_cast + (i  )*1; \
-\
-			PASTEMAC(ch,set0s_mxn) \
-			( \
-			  m_edge, \
-			  n_edge, \
-			  p_edge, 1, ldp  \
-			); \
-		} \
-	} \
-\
-	if ( n < n_max ) \
-	{ \
-		const dim_t     j      = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
-		ctype* restrict p_cast = p; \
-		ctype* restrict p_edge = p_cast + (j  )*ldp; \
-\
-		PASTEMAC(ch,set0s_mxn) \
-		( \
-		  m_edge, \
-		  n_edge, \
-		  p_edge, 1, ldp  \
-		); \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC3( packm_16xk, 16, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx \
-     ) \
-{ \
-    ctype* restrict kappa_cast = kappa; \
-    ctype* restrict alpha1     = a; \
-    ctype* restrict pi1        = p; \
-\
-	if ( cdim == mnr ) \
-	{ \
-		if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +14*inca), *(pi1 +14) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +15*inca), *(pi1 +15) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +16*inca), *(pi1 +16) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +17*inca), *(pi1 +17) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +18*inca), *(pi1 +18) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +19*inca), *(pi1 +19) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +20*inca), *(pi1 +20) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +21*inca), *(pi1 +21) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +22*inca), *(pi1 +22) ); \
-					PASTEMAC(ch,copyjs)( *(alpha1 +23*inca), *(pi1 +23) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +14*inca), *(pi1 +14) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +15*inca), *(pi1 +15) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +16*inca), *(pi1 +16) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +17*inca), *(pi1 +17) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +18*inca), *(pi1 +18) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +19*inca), *(pi1 +19) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +20*inca), *(pi1 +20) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +21*inca), *(pi1 +21) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +22*inca), *(pi1 +22) ); \
-					PASTEMAC(ch,copys)( *(alpha1 +23*inca), *(pi1 +23) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-		else \
-		{ \
-			if ( bli_is_conj( conja ) ) \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +16*inca), *(pi1 +16) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +17*inca), *(pi1 +17) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +18*inca), *(pi1 +18) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +19*inca), *(pi1 +19) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +20*inca), *(pi1 +20) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +21*inca), *(pi1 +21) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +22*inca), *(pi1 +22) ); \
-					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +23*inca), *(pi1 +23) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t k = n; k != 0; --k ) \
-				{ \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +16*inca), *(pi1 +16) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +17*inca), *(pi1 +17) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +18*inca), *(pi1 +18) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +19*inca), *(pi1 +19) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +20*inca), *(pi1 +20) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +21*inca), *(pi1 +21) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +22*inca), *(pi1 +22) ); \
-					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +23*inca), *(pi1 +23) ); \
-\
-					alpha1 += lda; \
-					pi1    += ldp; \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( cdim < mnr ) */ \
-	{ \
-		PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
-		( \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  ( trans_t )conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p,    1, ldp, \
-		  cntx, \
-		  NULL  \
-		); \
-\
-		/* if ( cdim < mnr ) */ \
-		{ \
-			const dim_t     i      = cdim; \
-			const dim_t     m_edge = mnr - cdim; \
-			const dim_t     n_edge = n_max; \
-			ctype* restrict p_cast = p; \
-			ctype* restrict p_edge = p_cast + (i  )*1; \
-\
-			PASTEMAC(ch,set0s_mxn) \
-			( \
-			  m_edge, \
-			  n_edge, \
-			  p_edge, 1, ldp  \
-			); \
-		} \
-	} \
-\
-	if ( n < n_max ) \
-	{ \
-		const dim_t     j      = n; \
-		const dim_t     m_edge = mnr; \
-		const dim_t     n_edge = n_max - n; \
-		ctype* restrict p_cast = p; \
-		ctype* restrict p_edge = p_cast + (j  )*ldp; \
-\
-		PASTEMAC(ch,set0s_mxn) \
-		( \
-		  m_edge, \
-		  n_edge, \
-		  p_edge, 1, ldp  \
-		); \
-	} \
+	PASTEMAC(ch,set0s_edge) \
+	( \
+	  cdim*dfac, cdim_max*dfac, \
+	  n, n_max, \
+	  p, ldp  \
+	); \
 }
 
-INSERT_GENTFUNC_BASIC3( packm_24xk, 24, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC4( packm_mrxk, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC4( packm_nrxk, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1m/bli_unpackm_cxk_ref.c b/ref_kernels/1m/bli_unpackm_cxk_ref.c
index 00dc02eb4..73d98e268 100644
--- a/ref_kernels/1m/bli_unpackm_cxk_ref.c
+++ b/ref_kernels/1m/bli_unpackm_cxk_ref.c
@@ -34,816 +34,64 @@
 
 #include "blis.h"
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
+#define UNPACKM_BODY( ctype, ch, pragma, cdim, inca, op ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conjp, \
-       dim_t            n, \
-       void*   restrict kappa, \
-       void*   restrict p,             inc_t ldp, \
-       void*   restrict a, inc_t inca, inc_t lda, \
-       cntx_t* restrict cntx  \
-     ) \
+do \
 { \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict pi1        = p; \
-	ctype* restrict alpha1     = a; \
-\
-	if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
+	for ( dim_t k = n; k != 0; --k ) \
 	{ \
-		if ( bli_is_conj( conjp ) ) \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-		else \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \
+		pragma \
+		for ( dim_t mn = 0; mn < cdim; mn++ ) \
+			PASTEMAC(ch,op)( *kappa_cast, *(pi1 + mn*dfac), *(alpha1 + mn*inca) ); \
 \
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
+		alpha1 += lda; \
+		pi1    += ldp; \
 	} \
-	else \
-	{ \
-		if ( bli_is_conj( conjp ) ) \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-		else \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC2( unpackm_2xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
+} while(0)
 
 #undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
+#define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjp, \
+       conj_t           conja, \
+       pack_t           schema, \
+       dim_t            cdim, \
        dim_t            n, \
-       void*   restrict kappa, \
-       void*   restrict p,             inc_t ldp, \
-       void*   restrict a, inc_t inca, inc_t lda, \
-       cntx_t* restrict cntx  \
+       ctype*  restrict kappa, \
+       ctype*  restrict p,             inc_t ldp, \
+       ctype*  restrict a, inc_t inca, inc_t lda, \
+       cntx_t* restrict cntx \
      ) \
 { \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict pi1        = p; \
-	ctype* restrict alpha1     = a; \
-\
-	if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-	{ \
-		if ( bli_is_conj( conjp ) ) \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-		else \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \
+	const dim_t     mnr        = PASTECH2(mnr0, _, ch); \
+    /* It's not clear if unpack needs to care about BB storage... */ \
+	const dim_t     dfac       = PASTECH2(bb0, _, ch); \
 \
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-	} \
-	else \
-	{ \
-		if ( bli_is_conj( conjp ) ) \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-		else \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC2( unpackm_4xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conjp, \
-       dim_t            n, \
-       void*   restrict kappa, \
-       void*   restrict p,             inc_t ldp, \
-       void*   restrict a, inc_t inca, inc_t lda, \
-       cntx_t* restrict cntx  \
-     ) \
-{ \
 	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict pi1        = p; \
 	ctype* restrict alpha1     = a; \
-\
-	if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-	{ \
-		if ( bli_is_conj( conjp ) ) \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-		else \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-	} \
-	else \
-	{ \
-		if ( bli_is_conj( conjp ) ) \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-		else \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC2( unpackm_6xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conjp, \
-       dim_t            n, \
-       void*   restrict kappa, \
-       void*   restrict p,             inc_t ldp, \
-       void*   restrict a, inc_t inca, inc_t lda, \
-       cntx_t* restrict cntx  \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
 	ctype* restrict pi1        = p; \
-	ctype* restrict alpha1     = a; \
 \
-	if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
+	if ( cdim == mnr && mnr != -1 ) \
 	{ \
-		if ( bli_is_conj( conjp ) ) \
+		if ( inca == 1 ) \
 		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
+			if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2js ); \
+			else                        UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2s ); \
 		} \
 		else \
 		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
+			if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2js ); \
+			else                        UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2s ); \
 		} \
 	} \
-	else \
+	else /* if ( cdim < mnr ) */ \
 	{ \
-		if ( bli_is_conj( conjp ) ) \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-		else \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC2( unpackm_8xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conjp, \
-       dim_t            n, \
-       void*   restrict kappa, \
-       void*   restrict p,             inc_t ldp, \
-       void*   restrict a, inc_t inca, inc_t lda, \
-       cntx_t* restrict cntx  \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict pi1        = p; \
-	ctype* restrict alpha1     = a; \
-\
-	if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-	{ \
-		if ( bli_is_conj( conjp ) ) \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-		else \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-	} \
-	else \
-	{ \
-		if ( bli_is_conj( conjp ) ) \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-		else \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC2( unpackm_10xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conjp, \
-       dim_t            n, \
-       void*   restrict kappa, \
-       void*   restrict p,             inc_t ldp, \
-       void*   restrict a, inc_t inca, inc_t lda, \
-       cntx_t* restrict cntx  \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict pi1        = p; \
-	ctype* restrict alpha1     = a; \
-\
-	if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-	{ \
-		if ( bli_is_conj( conjp ) ) \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-		else \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-	} \
-	else \
-	{ \
-		if ( bli_is_conj( conjp ) ) \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-		else \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC2( unpackm_12xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conjp, \
-       dim_t            n, \
-       void*   restrict kappa, \
-       void*   restrict p,             inc_t ldp, \
-       void*   restrict a, inc_t inca, inc_t lda, \
-       cntx_t* restrict cntx  \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict pi1        = p; \
-	ctype* restrict alpha1     = a; \
-\
-	if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-	{ \
-		if ( bli_is_conj( conjp ) ) \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 12), *(alpha1 + 12*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 13), *(alpha1 + 13*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-		else \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 12), *(alpha1 + 12*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 13), *(alpha1 + 13*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-	} \
-	else \
-	{ \
-		if ( bli_is_conj( conjp ) ) \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-		else \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC2( unpackm_14xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conjp, \
-       dim_t            n, \
-       void*   restrict kappa, \
-       void*   restrict p,             inc_t ldp, \
-       void*   restrict a, inc_t inca, inc_t lda, \
-       cntx_t* restrict cntx  \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict pi1        = p; \
-	ctype* restrict alpha1     = a; \
-\
-	if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-	{ \
-		if ( bli_is_conj( conjp ) ) \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 12), *(alpha1 + 12*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 13), *(alpha1 + 13*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 14), *(alpha1 + 14*inca) ); \
-				PASTEMAC2(ch,ch,copyjs)( *(pi1 + 15), *(alpha1 + 15*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-		else \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 12), *(alpha1 + 12*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 13), *(alpha1 + 13*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 14), *(alpha1 + 14*inca) ); \
-				PASTEMAC2(ch,ch,copys)( *(pi1 + 15), *(alpha1 + 15*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-	} \
-	else \
-	{ \
-		if ( bli_is_conj( conjp ) ) \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 14), *(alpha1 + 14*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 15), *(alpha1 + 15*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
-		else \
-		{ \
-			for ( ; n != 0; --n ) \
-			{ \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 14), *(alpha1 + 14*inca) ); \
-				PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 15), *(alpha1 + 15*inca) ); \
-\
-				pi1    += ldp; \
-				alpha1 += lda; \
-			} \
-		} \
+			if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctype, ch, , cdim, inca, scal2js ); \
+			else                        UNPACKM_BODY( ctype, ch, , cdim, inca, scal2s ); \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( unpackm_16xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC4( unpackm_mrxk, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC4( unpackm_nrxk, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/3/bb/bli_gemmbb_ref.c b/ref_kernels/3/bb/bli_gemmbb_ref.c
deleted file mode 100644
index 4c75c064c..000000000
--- a/ref_kernels/3/bb/bli_gemmbb_ref.c
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-// An implementation that indexes through B with the assumption that all
-// elements were broadcast (duplicated) by a factor of NP/NR.
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
-     ) \
-{ \
-	const num_t     dt     = PASTEMAC(ch,type); \
-\
-	const dim_t     mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	const dim_t     nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
-\
-	const inc_t     packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
-	const inc_t     packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
-\
-	const inc_t     cs_a   = packmr; \
-\
-	const inc_t     rs_b   = packnr; \
-\
-	/* Assume that the degree of duplication is equal to packnr / nr. */ \
-	const inc_t     cs_b   = packnr / nr; \
-\
-	ctype           ab[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const inc_t     rs_ab  = 1; \
-	const inc_t     cs_ab  = mr; \
-\
-	dim_t           l, j, i; \
-\
-	ctype           ai; \
-	ctype           bj; \
-\
-\
-	/* Initialize the accumulator elements in ab to zero. */ \
-	for ( i = 0; i < m * n; ++i ) \
-	{ \
-		PASTEMAC(ch,set0s)( *(ab + i) ); \
-	} \
-\
-	/* Perform a series of k rank-1 updates into ab. */ \
-	for ( l = 0; l < k; ++l ) \
-	{ \
-		ctype* restrict abij = ab; \
-\
-		/* In an optimized implementation, these two loops over MR and NR
-		   are typically fully unrolled. */ \
-		for ( j = 0; j < n; ++j ) \
-		{ \
-			bj = *(b + j*cs_b); \
-\
-			for ( i = 0; i < m; ++i ) \
-			{ \
-				ai = *(a + i); \
-\
-				PASTEMAC(ch,dots)( ai, bj, *abij ); \
-\
-				abij += rs_ab; \
-			} \
-		} \
-\
-		a += cs_a; \
-		b += rs_b; \
-	} \
-\
-	/* Scale the result in ab by alpha. */ \
-	for ( i = 0; i < m * n; ++i ) \
-	{ \
-		PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \
-	} \
-\
-	/* If beta is zero, overwrite c with the scaled result in ab. Otherwise,
-	   scale by beta and then add the scaled redult in ab. */ \
-	if ( PASTEMAC(ch,eq0)( *beta ) ) \
-	{ \
-		PASTEMAC(ch,copys_mxn)( m, \
-		                        n, \
-		                        ab, rs_ab, cs_ab, \
-		                        c,  rs_c,  cs_c ); \
-	} \
-	else \
-	{ \
-		PASTEMAC(ch,xpbys_mxn)( m, \
-		                        n, \
-		                        ab, rs_ab, cs_ab, \
-		                        beta, \
-		                        c,  rs_c,  cs_c ); \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC2( gemmbb, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
diff --git a/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c b/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c
deleted file mode 100644
index dd4e1f153..000000000
--- a/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-// An implementation that indexes through B with the assumption that all
-// elements were broadcast (duplicated) by a factor of NP/NR.
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf, trsmkerid ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a1x, \
-       ctype*     restrict a11, \
-       ctype*     restrict bx1, \
-       ctype*     restrict b11, \
-       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
-     ) \
-{ \
-	const num_t     dt     = PASTEMAC(ch,type); \
-\
-	const inc_t     mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	const inc_t     nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
-\
-	const inc_t     packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
-\
-	const inc_t     rs_b   = packnr; \
-\
-	/* Assume that the degree of duplication is equal to packnr / nr. */ \
-	const inc_t     cs_b   = packnr / nr; \
-/*
-printf( "bli_gemmtrsmbb_ref(): cs_b = %d\n", (int)cs_b ); \
-printf( "bli_gemmtrsmbb_ref(): k nr = %d %d\n", (int)k, (int)nr ); \
-*/ \
-\
-	ctype*          minus_one = PASTEMAC(ch,m1); \
-\
-	PASTECH(ch,gemm_ukr_ft) \
-	              gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	PASTECH(ch,trsm_ukr_ft) \
-	              trsm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \
-\
-/*
-PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b01", k, nr, \
-                     (double*)bx1, rs_b, cs_b, "%5.2f", "" ); \
-PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \
-                     (double*)b11, rs_b, 1, "%5.2f", "" ); \
-*/ \
-\
-	/* lower: b11 = alpha * b11 - a10 * b01; */ \
-	/* upper: b11 = alpha * b11 - a12 * b21; */ \
-	gemm_ukr \
-	( \
-	  mr, \
-	  nr, \
-	  k, \
-	  minus_one, \
-	  a1x, \
-	  bx1, \
-	  alpha, \
-	  b11, rs_b, cs_b, \
-	  data, \
-	  cntx  \
-	); \
-/*
-PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after gemm", mr, 2*nr, \
-                     (double*)b11, rs_b, 1, "%5.2f", "" ); \
-*/ \
-\
-	/* b11 = inv(a11) * b11;
-	   c11 = b11; */ \
-	trsm_ukr \
-	( \
-	  a11, \
-	  b11, \
-	  c11, rs_c, cs_c, \
-	  data, \
-	  cntx  \
-	); \
-/*
-PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after trsm", mr, 2*nr, \
-                     (double*)b11, rs_b, 1, "%5.2f", "" ); \
-*/ \
-\
-	/* Broadcast the elements of the updated b11 submatrix to their
-	   duplicated neighbors. */ \
-	PASTEMAC(ch,bcastbbs_mxn) \
-	( \
-	  mr, \
-	  nr, \
-	  b11, rs_b, cs_b  \
-	); \
-\
-/*
-PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_r after", k+3, 8, \
-                     ( double* )b01,     2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \
-PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_i after", k+3, 8, \
-                     ( double* )b01 + 1, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \
-*/ \
-}
-
-INSERT_GENTFUNC_BASIC3( gemmtrsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR )
-INSERT_GENTFUNC_BASIC3( gemmtrsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR )
-
diff --git a/ref_kernels/3/bb/bli_trsmbb_ref.c b/ref_kernels/3/bb/bli_trsmbb_ref.c
deleted file mode 100644
index e3f5500cc..000000000
--- a/ref_kernels/3/bb/bli_trsmbb_ref.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-// An implementation that indexes through B with the assumption that all
-// elements were broadcast (duplicated) by a factor of NP/NR.
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
-     ) \
-{ \
-	const num_t     dt     = PASTEMAC(ch,type); \
-\
-	const dim_t     mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	const dim_t     nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
-\
-	const inc_t     packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
-	const inc_t     packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
-\
-	const dim_t     m      = mr; \
-	const dim_t     n      = nr; \
-\
-	const inc_t     rs_a   = 1; \
-	const inc_t     cs_a   = packmr; \
-\
-	const inc_t     rs_b   = packnr; \
-\
-	/* Assume that the degree of duplication is equal to packnr / nr. */ \
-	const inc_t     cs_b   = packnr / nr; \
-\
-	dim_t           iter, i, j, l; \
-	dim_t           n_behind; \
-\
-	for ( iter = 0; iter < m; ++iter ) \
-	{ \
-		i        = iter; \
-		n_behind = i; \
-\
-		ctype* restrict alpha11  = a + (i  )*rs_a + (i  )*cs_a; \
-		ctype* restrict a10t     = a + (i  )*rs_a + (0  )*cs_a; \
-		ctype* restrict B0       = b + (0  )*rs_b + (0  )*cs_b; \
-		ctype* restrict b1       = b + (i  )*rs_b + (0  )*cs_b; \
-\
-		/* b1 = b1 - a10t * B0; */ \
-		/* b1 = b1 / alpha11; */ \
-		for ( j = 0; j < n; ++j ) \
-		{ \
-			ctype* restrict b01     = B0 + (0  )*rs_b + (j  )*cs_b; \
-			ctype* restrict beta11  = b1 + (0  )*rs_b + (j  )*cs_b; \
-			ctype* restrict gamma11 = c  + (i  )*rs_c + (j  )*cs_c; \
-			ctype           beta11c = *beta11; \
-			ctype           rho11; \
-\
-			/* beta11 = beta11 - a10t * b01; */ \
-			PASTEMAC(ch,set0s)( rho11 ); \
-			for ( l = 0; l < n_behind; ++l ) \
-			{ \
-				ctype* restrict alpha10 = a10t + (l  )*cs_a; \
-				ctype* restrict beta01  = b01  + (l  )*rs_b; \
-\
-				PASTEMAC(ch,axpys)( *alpha10, *beta01, rho11 ); \
-			} \
-			PASTEMAC(ch,subs)( rho11, beta11c ); \
-\
-			/* beta11 = beta11 / alpha11; */ \
-			/* NOTE: When preinversion is enabled, the INVERSE of alpha11
-			   (1.0/alpha11) is stored during packing instead alpha11 so we
-			   can multiply rather than divide. When preinversion is disabled,
-			   alpha11 is stored and division happens below explicitly. */ \
-			PASTEMAC(ch,scals)( *alpha11, beta11c ); \
-\
-			/* Output final result to matrix c. */ \
-			PASTEMAC(ch,copys)( beta11c, *gamma11 ); \
-\
-			/* Store the local value back to b11. */ \
-			PASTEMAC(ch,copys)( beta11c, *beta11 ); \
-		} \
-	} \
-}
-
-#ifdef BLIS_ENABLE_TRSM_PREINVERSION
-INSERT_GENTFUNC_BASIC3( trsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals )
-#else
-INSERT_GENTFUNC_BASIC3( trsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals )
-#endif
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
-     ) \
-{ \
-	const num_t     dt     = PASTEMAC(ch,type); \
-\
-	const dim_t     mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	const dim_t     nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
-\
-	const inc_t     packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
-	const inc_t     packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
-\
-	const dim_t     m      = mr; \
-	const dim_t     n      = nr; \
-\
-	const inc_t     rs_a   = 1; \
-	const inc_t     cs_a   = packmr; \
-\
-	const inc_t     rs_b   = packnr; \
-\
-	/* Assume that the degree of duplication is equal to packnr / nr. */ \
-	const inc_t     cs_b   = packnr / nr; \
-\
-	dim_t           iter, i, j, l; \
-	dim_t           n_behind; \
-\
-	for ( iter = 0; iter < m; ++iter ) \
-	{ \
-		i        = m - iter - 1; \
-		n_behind = iter; \
-\
-		ctype* restrict alpha11  = a + (i  )*rs_a + (i  )*cs_a; \
-		ctype* restrict a12t     = a + (i  )*rs_a + (i+1)*cs_a; \
-		ctype* restrict b1       = b + (i  )*rs_b + (0  )*cs_b; \
-		ctype* restrict B2       = b + (i+1)*rs_b + (0  )*cs_b; \
-\
-		/* b1 = b1 - a12t * B2; */ \
-		/* b1 = b1 / alpha11; */ \
-		for ( j = 0; j < n; ++j ) \
-		{ \
-			ctype* restrict beta11  = b1 + (0  )*rs_b + (j  )*cs_b; \
-			ctype* restrict b21     = B2 + (0  )*rs_b + (j  )*cs_b; \
-			ctype* restrict gamma11 = c  + (i  )*rs_c + (j  )*cs_c; \
-			ctype           beta11c = *beta11; \
-			ctype           rho11; \
-\
-			/* beta11 = beta11 - a12t * b21; */ \
-			PASTEMAC(ch,set0s)( rho11 ); \
-			for ( l = 0; l < n_behind; ++l ) \
-			{ \
-				ctype* restrict alpha12 = a12t + (l  )*cs_a; \
-				ctype* restrict beta21  = b21  + (l  )*rs_b; \
-\
-				PASTEMAC(ch,axpys)( *alpha12, *beta21, rho11 ); \
-			} \
-			PASTEMAC(ch,subs)( rho11, beta11c ); \
-\
-			/* beta11 = beta11 / alpha11; */ \
-			/* NOTE: When preinversion is enabled, the INVERSE of alpha11
-			   (1.0/alpha11) is stored during packing instead alpha11 so we
-			   can multiply rather than divide. When preinversion is disabled,
-			   alpha11 is stored and division happens below explicitly. */ \
-			PASTEMAC(ch,diagop)( *alpha11, beta11c ); \
-\
-			/* Output final result to matrix c. */ \
-			PASTEMAC(ch,copys)( beta11c, *gamma11 ); \
-\
-			/* Store the local value back to b11. */ \
-			PASTEMAC(ch,copys)( beta11c, *beta11 ); \
-		} \
-	} \
-}
-
-#ifdef BLIS_ENABLE_TRSM_PREINVERSION
-INSERT_GENTFUNC_BASIC3( trsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals )
-#else
-INSERT_GENTFUNC_BASIC3( trsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals )
-#endif
-
diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c
index 51ff9df4b..f284acb98 100644
--- a/ref_kernels/3/bli_gemm_ref.c
+++ b/ref_kernels/3/bli_gemm_ref.c
@@ -34,13 +34,114 @@
 
 #include "blis.h"
 
-#if 1
+// Completely generic gemm ukr implementation: m and n are runtime arguments
+// and all strides come from the context. Very slow, but needed in some cases.
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname, arch, suf ) \
+\
+static void PASTEMAC3(ch,opname,arch,suf) \
+     ( \
+       dim_t               m, \
+       dim_t               n, \
+       dim_t               k, \
+       ctype*     restrict alpha, \
+       ctype*     restrict a, \
+       ctype*     restrict b, \
+       ctype*     restrict beta, \
+       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
+       auxinfo_t* restrict data, \
+       cntx_t*    restrict cntx  \
+     ) \
+{ \
+	const num_t     dt     = PASTEMAC(ch,type); \
+\
+	const inc_t     packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
+	const inc_t     packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
+\
+	const inc_t     rs_a   = bli_cntx_get_blksz_def_dt( dt, BLIS_BBM, cntx ); \
+	const inc_t     cs_a   = packmr; \
+\
+	const inc_t     rs_b   = packnr; \
+	const inc_t     cs_b   = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \
+\
+	ctype           ab[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const inc_t     rs_ab  = 1; \
+	const inc_t     cs_ab  = m; /* ab is stored column-major (m x n). */ \
+\
+	dim_t           l, j, i; \
+\
+	ctype           ai; \
+	ctype           bj; \
+\
+\
+	/* Initialize the accumulator elements in ab to zero. */ \
+	for ( i = 0; i < m * n; ++i ) \
+	{ \
+		PASTEMAC(ch,set0s)( *(ab + i) ); \
+	} \
+\
+	/* Perform a series of k rank-1 updates into ab. */ \
+	for ( l = 0; l < k; ++l ) \
+	{ \
+		ctype* restrict abij = ab; /* Rewind to the first element of ab. */ \
+\
+		/* In an optimized implementation, these two loops (over n and m)
+		   are typically fully unrolled. */ \
+		for ( j = 0; j < n; ++j ) \
+		{ \
+			bj = *(b + j*cs_b); \
+\
+			for ( i = 0; i < m; ++i ) \
+			{ \
+				ai = *(a + i*rs_a); \
+\
+				PASTEMAC(ch,dots)( ai, bj, *abij ); \
+\
+				abij += rs_ab; \
+			} \
+		} \
+\
+		a += cs_a; /* Advance a by one column stride. */ \
+		b += rs_b; /* Advance b by one row stride. */ \
+	} \
+\
+	/* Scale the result in ab by alpha. */ \
+	for ( i = 0; i < m * n; ++i ) \
+	{ \
+		PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \
+	} \
+\
+	/* If beta is zero, overwrite c with the scaled result in ab. Otherwise,
+	   scale c by beta and then add the scaled result in ab. */ \
+	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	{ \
+		PASTEMAC(ch,copys_mxn)( m, \
+		                        n, \
+		                        ab, rs_ab, cs_ab, \
+		                        c,  rs_c,  cs_c ); \
+	} \
+	else \
+	{ \
+		PASTEMAC(ch,xpbys_mxn)( m, \
+		                        n, \
+		                        ab, rs_ab, cs_ab, \
+		                        beta, \
+		                        c,  rs_c,  cs_c ); \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC2( gemm_gen, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
 // An implementation that attempts to facilitate emission of vectorized
 // instructions via constant loop bounds + #pragma omp simd directives.
+// If compile-time MR/NR are not available (indicated by BLIS_[MN]R_x = -1),
+// then the non-unrolled version (above) is used.
 
 #undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf, mr, nr ) \
+#define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
@@ -56,14 +157,38 @@ void PASTEMAC3(ch,opname,arch,suf) \
        cntx_t*    restrict cntx  \
      ) \
 { \
-	ctype           ab[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const inc_t     rs_ab  = nr; \
-	const inc_t     cs_ab  = 1; \
 \
-	const inc_t     cs_a   = mr; \
-	const inc_t     rs_b   = nr; \
+	const dim_t     mr = PASTECH(BLIS_MR_,ch); \
+	const dim_t     nr = PASTECH(BLIS_NR_,ch); \
+\
+	if ( mr == -1 || nr == -1 ) \
+	{ \
+		PASTEMAC3(ch,gemm_gen,arch,suf) \
+		( \
+		  m, \
+		  n, \
+		  k, \
+		  alpha, \
+		  a, \
+		  b, \
+		  beta, \
+		  c, rs_c, cs_c, \
+		  data, \
+		  cntx \
+		); \
+		return; \
+	} \
+\
+	ctype       ab[ BLIS_STACK_BUF_MAX_SIZE \
+	                / sizeof( ctype ) ] \
+	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const inc_t rs_ab  = nr; \
+	const inc_t cs_ab  = 1; \
+\
+	const inc_t rs_a   = PASTECH(BLIS_BBM_,ch); \
+	const inc_t cs_a   = PASTECH(BLIS_PACKMR_,ch); \
+	const inc_t rs_b   = PASTECH(BLIS_PACKNR_,ch); \
+	const inc_t cs_b   = PASTECH(BLIS_BBN_,ch); \
 \
 \
 	/* Initialize the accumulator elements in ab to zero. */ \
@@ -83,8 +208,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			{ \
 				PASTEMAC(ch,dots) \
 				( \
-				  a[ i ], \
-				  b[ j ], \
+				  a[ i*rs_a ], \
+				  b[ j*cs_b ], \
 				  ab[ i*rs_ab + j*cs_ab ]  \
 				); \
 			} \
@@ -157,115 +282,6 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-//INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-GENTFUNC( float,    s, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 16 )
-GENTFUNC( double,   d, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 )
-GENTFUNC( scomplex, c, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 )
-GENTFUNC( dcomplex, z, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 4 )
-
-#else
-
-// An implementation that uses variable loop bounds (queried from the context)
-// and makes no use of #pragma omp simd.
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
-     ) \
-{ \
-	const num_t     dt     = PASTEMAC(ch,type); \
-\
-	const dim_t     mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	const dim_t     nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
-\
-	const inc_t     packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
-	const inc_t     packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
-\
-	const inc_t     cs_a   = packmr; \
-\
-	const inc_t     rs_b   = packnr; \
-\
-	ctype           ab[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const inc_t     rs_ab  = 1; \
-	const inc_t     cs_ab  = mr; \
-\
-	dim_t           l, j, i; \
-\
-	ctype           ai; \
-	ctype           bj; \
-\
-\
-	/* Initialize the accumulator elements in ab to zero. */ \
-	for ( i = 0; i < m * n; ++i ) \
-	{ \
-		PASTEMAC(ch,set0s)( *(ab + i) ); \
-	} \
-\
-	/* Perform a series of k rank-1 updates into ab. */ \
-	for ( l = 0; l < k; ++l ) \
-	{ \
-		ctype* restrict abij = ab; \
-\
-		/* In an optimized implementation, these two loops over MR and NR
-		   are typically fully unrolled. */ \
-		for ( j = 0; j < n; ++j ) \
-		{ \
-			bj = *(b + j); \
-\
-			for ( i = 0; i < m; ++i ) \
-			{ \
-				ai = *(a + i); \
-\
-				PASTEMAC(ch,dots)( ai, bj, *abij ); \
-\
-				abij += rs_ab; \
-			} \
-		} \
-\
-		a += cs_a; \
-		b += rs_b; \
-	} \
-\
-	/* Scale the result in ab by alpha. */ \
-	for ( i = 0; i < m * n; ++i ) \
-	{ \
-		PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \
-	} \
-\
-	/* If beta is zero, overwrite c with the scaled result in ab. Otherwise,
-	   scale by beta and then add the scaled redult in ab. */ \
-	if ( PASTEMAC(ch,eq0)( *beta ) ) \
-	{ \
-		PASTEMAC(ch,copys_mxn)( m, \
-		                        n, \
-		                        ab, rs_ab, cs_ab, \
-		                        c,  rs_c,  cs_c ); \
-	} \
-	else \
-	{ \
-		PASTEMAC(ch,xpbys_mxn)( m, \
-		                        n, \
-		                        ab, rs_ab, cs_ab, \
-		                        beta, \
-		                        c,  rs_c,  cs_c ); \
-	} \
-}
-
 INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
-#endif
 
diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c
index 30fc3fcd6..046aa5617 100644
--- a/ref_kernels/3/bli_gemmtrsm_ref.c
+++ b/ref_kernels/3/bli_gemmtrsm_ref.c
@@ -34,6 +34,9 @@
 
 #include "blis.h"
 
+// An implementation that indexes through B with the assumption that all
+// elements were broadcast (duplicated) by the BLIS_BBN factor in the context.
+
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf, trsmkerid ) \
 \
@@ -60,21 +63,38 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	const inc_t     packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
 \
 	const inc_t     rs_b   = packnr; \
-	const inc_t     cs_b   = 1; \
+	const inc_t     cs_b   = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \
+/*
+printf( "bli_gemmtrsm_ref(): cs_b = %d\n", (int)cs_b ); \
+printf( "bli_gemmtrsm_ref(): k nr = %d %d\n", (int)k, (int)nr ); \
+*/ \
 \
 	ctype*          minus_one = PASTEMAC(ch,m1); \
 \
 	PASTECH(ch,gemm_ukr_ft) \
-	              gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	              gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	PASTECH(ch,trsm_ukr_ft) \
-	              trsm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \
+	              trsm_ukr = bli_cntx_get_ukr_dt( dt, trsmkerid, cntx ); \
+\
+/*
+PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b01", k, nr, \
+                     (double*)bx1, rs_b, cs_b, "%5.2f", "" ); \
+PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \
+                     (double*)b11, rs_b, 1, "%5.2f", "" ); \
+*/ \
 \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	/* FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR
-	   instead? */ \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	/* to FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR
+	   instead?
+
+	   to DAM: Given that this reference kernel is implemented in terms of gemm,
+	   I think that is the preference we want to query. There might be other
+	   circumstances where we would want the gemmtrsm_? operations to have
+	   and exercise their own IO preferences -- I'd have to think about it --
+	   but this doesn't seem to be one of them. */ \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : nr ); \
 	const inc_t     cs_ct       = ( col_pref ? mr : 1 ); \
 \
@@ -106,6 +126,19 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	  data, \
 	  cntx  \
 	); \
+/*
+PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after gemm", mr, 2*nr, \
+                     (double*)b11, rs_b, 1, "%5.2f", "" ); \
+*/ \
+\
+	/* Broadcast the elements of the updated b11 submatrix to their
+	   duplicated neighbors. */ \
+	PASTEMAC(ch,bcastbbs_mxn) \
+	( \
+	  m, \
+	  n, \
+	  b11, rs_b, cs_b  \
+	); \
 \
 	/* b11 = inv(a11) * b11;
 	   c11 = b11; */ \
@@ -117,6 +150,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	  data, \
 	  cntx  \
 	); \
+/*
+PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after trsm", mr, 2*nr, \
+                     (double*)b11, rs_b, 1, "%5.2f", "" ); \
+*/ \
 \
 	if ( use_ct ) \
 	{ \
diff --git a/ref_kernels/3/bli_trsm_ref.c b/ref_kernels/3/bli_trsm_ref.c
index 786f1129d..8234a84cc 100644
--- a/ref_kernels/3/bli_trsm_ref.c
+++ b/ref_kernels/3/bli_trsm_ref.c
@@ -34,17 +34,8 @@
 
 #include "blis.h"
 
-#if 0
-
-// An implementation that attempts to facilitate emission of vectorized
-// instructions via constant loop bounds + #pragma omp simd directives.
-
-// (Deleted. See 'old' directory.)
-
-#else
-
-// An implementation that uses variable loop bounds (queried from the context)
-// and makes no use of #pragma omp simd.
+// An implementation that indexes through A and B with the assumption that
+// elements were broadcast (duplicated) by the BLIS_BBM and BLIS_BBN factors.
 
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \
@@ -69,11 +60,11 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	const dim_t     m      = mr; \
 	const dim_t     n      = nr; \
 \
-	const inc_t     rs_a   = 1; \
+	const inc_t     rs_a   = bli_cntx_get_blksz_def_dt( dt, BLIS_BBM, cntx ); \
 	const inc_t     cs_a   = packmr; \
 \
 	const inc_t     rs_b   = packnr; \
-	const inc_t     cs_b   = 1; \
+	const inc_t     cs_b   = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \
 \
 	dim_t           iter, i, j, l; \
 	dim_t           n_behind; \
@@ -114,13 +105,14 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			   (1.0/alpha11) is stored during packing instead alpha11 so we
 			   can multiply rather than divide. When preinversion is disabled,
 			   alpha11 is stored and division happens below explicitly. */ \
 			PASTEMAC(ch,diagop)( *alpha11, beta11c ); \
 \
 			/* Output final result to matrix c. */ \
 			PASTEMAC(ch,copys)( beta11c, *gamma11 ); \
 \
 			/* Store the local value back to b11. */ \
-			PASTEMAC(ch,copys)( beta11c, *beta11 ); \
+			for ( dim_t d = 0; d < cs_b; ++d ) /* incl. duplicated copies */ \
+				PASTEMAC(ch,copys)( beta11c, *(beta11 + d) ); \
 		} \
 	} \
 }
@@ -155,19 +147,16 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	const dim_t     m      = mr; \
 	const dim_t     n      = nr; \
 \
-	const inc_t     rs_a   = 1; \
+	const inc_t     rs_a   = bli_cntx_get_blksz_def_dt( dt, BLIS_BBM, cntx ); \
 	const inc_t     cs_a   = packmr; \
 \
 	const inc_t     rs_b   = packnr; \
-	const inc_t     cs_b   = 1; \
+	const inc_t     cs_b   = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \
 \
-	dim_t           iter, i, j, l; \
-	dim_t           n_behind; \
-\
-	for ( iter = 0; iter < m; ++iter ) \
+	for ( dim_t iter = 0; iter < m; ++iter ) \
 	{ \
-		i        = m - iter - 1; \
-		n_behind = iter; \
+		dim_t i        = m - iter - 1; \
+		dim_t n_behind = iter; \
 \
 		ctype* restrict alpha11  = a + (i  )*rs_a + (i  )*cs_a; \
 		ctype* restrict a12t     = a + (i  )*rs_a + (i+1)*cs_a; \
@@ -176,7 +165,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 		/* b1 = b1 - a12t * B2; */ \
 		/* b1 = b1 / alpha11; */ \
-		for ( j = 0; j < n; ++j ) \
+		for ( dim_t j = 0; j < n; ++j ) \
 		{ \
 			ctype* restrict beta11  = b1 + (0  )*rs_b + (j  )*cs_b; \
 			ctype* restrict b21     = B2 + (0  )*rs_b + (j  )*cs_b; \
@@ -186,7 +175,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 			/* beta11 = beta11 - a12t * b21; */ \
 			PASTEMAC(ch,set0s)( rho11 ); \
-			for ( l = 0; l < n_behind; ++l ) \
+			for ( dim_t l = 0; l < n_behind; ++l ) \
 			{ \
 				ctype* restrict alpha12 = a12t + (l  )*cs_a; \
 				ctype* restrict beta21  = b21  + (l  )*rs_b; \
@@ -206,7 +195,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			PASTEMAC(ch,copys)( beta11c, *gamma11 ); \
 \
 			/* Store the local value back to b11. */ \
-			PASTEMAC(ch,copys)( beta11c, *beta11 ); \
+			for ( dim_t d = 0; d < cs_b; ++d ) \
+				PASTEMAC(ch,copys)( beta11c, *(beta11 + d) ); \
 		} \
 	} \
 }
@@ -217,4 +207,3 @@ INSERT_GENTFUNC_BASIC3( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals )
 INSERT_GENTFUNC_BASIC3( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals )
 #endif
 
-#endif
diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c
index 33e74ecaa..69c546cd4 100644
--- a/ref_kernels/bli_cntx_ref.c
+++ b/ref_kernels/bli_cntx_ref.c
@@ -107,60 +107,30 @@
 
 // -- Level-1m (packm/unpackm) kernel prototype redefinitions ------------------
 
-#undef  packm_2xk_ker_name
-#define packm_2xk_ker_name  GENARNAME(packm_2xk)
-#undef  packm_3xk_ker_name
-#define packm_3xk_ker_name  GENARNAME(packm_3xk)
-#undef  packm_4xk_ker_name
-#define packm_4xk_ker_name  GENARNAME(packm_4xk)
-#undef  packm_6xk_ker_name
-#define packm_6xk_ker_name  GENARNAME(packm_6xk)
-#undef  packm_8xk_ker_name
-#define packm_8xk_ker_name  GENARNAME(packm_8xk)
-#undef  packm_10xk_ker_name
-#define packm_10xk_ker_name GENARNAME(packm_10xk)
-#undef  packm_12xk_ker_name
-#define packm_12xk_ker_name GENARNAME(packm_12xk)
-#undef  packm_14xk_ker_name
-#define packm_14xk_ker_name GENARNAME(packm_14xk)
-#undef  packm_16xk_ker_name
-#define packm_16xk_ker_name GENARNAME(packm_16xk)
-#undef  packm_24xk_ker_name
-#define packm_24xk_ker_name GENARNAME(packm_24xk)
-
-#undef  unpackm_2xk_ker_name
-#define unpackm_2xk_ker_name  GENARNAME(unpackm_2xk)
-#undef  unpackm_4xk_ker_name
-#define unpackm_4xk_ker_name  GENARNAME(unpackm_4xk)
-#undef  unpackm_6xk_ker_name
-#define unpackm_6xk_ker_name  GENARNAME(unpackm_6xk)
-#undef  unpackm_8xk_ker_name
-#define unpackm_8xk_ker_name  GENARNAME(unpackm_8xk)
-#undef  unpackm_10xk_ker_name
-#define unpackm_10xk_ker_name GENARNAME(unpackm_10xk)
-#undef  unpackm_12xk_ker_name
-#define unpackm_12xk_ker_name GENARNAME(unpackm_12xk)
-#undef  unpackm_14xk_ker_name
-#define unpackm_14xk_ker_name GENARNAME(unpackm_14xk)
-#undef  unpackm_16xk_ker_name
-#define unpackm_16xk_ker_name GENARNAME(unpackm_16xk)
-
-#undef  packm_2xk_1er_ker_name
-#define packm_2xk_1er_ker_name  GENARNAME(packm_2xk_1er)
-#undef  packm_4xk_1er_ker_name
-#define packm_4xk_1er_ker_name  GENARNAME(packm_4xk_1er)
-#undef  packm_6xk_1er_ker_name
-#define packm_6xk_1er_ker_name  GENARNAME(packm_6xk_1er)
-#undef  packm_8xk_1er_ker_name
-#define packm_8xk_1er_ker_name  GENARNAME(packm_8xk_1er)
-#undef  packm_10xk_1er_ker_name
-#define packm_10xk_1er_ker_name GENARNAME(packm_10xk_1er)
-#undef  packm_12xk_1er_ker_name
-#define packm_12xk_1er_ker_name GENARNAME(packm_12xk_1er)
-#undef  packm_14xk_1er_ker_name
-#define packm_14xk_1er_ker_name GENARNAME(packm_14xk_1er)
-#undef  packm_16xk_1er_ker_name
-#define packm_16xk_1er_ker_name GENARNAME(packm_16xk_1er)
+#undef  packm_mrxk_ker_name
+#define packm_mrxk_ker_name  GENARNAME(packm_mrxk)
+#undef  packm_nrxk_ker_name
+#define packm_nrxk_ker_name  GENARNAME(packm_nrxk)
+
+#undef  packm_mrxk_1er_ker_name
+#define packm_mrxk_1er_ker_name  GENARNAME(packm_mrxk_1er)
+#undef  packm_nrxk_1er_ker_name
+#define packm_nrxk_1er_ker_name  GENARNAME(packm_nrxk_1er)
+
+#undef  packm_mrxmr_diag_ker_name
+#define packm_mrxmr_diag_ker_name  GENARNAME(packm_mrxmr_diag)
+#undef  packm_nrxnr_diag_ker_name
+#define packm_nrxnr_diag_ker_name  GENARNAME(packm_nrxnr_diag)
+
+#undef  packm_mrxmr_diag_1er_ker_name
+#define packm_mrxmr_diag_1er_ker_name  GENARNAME(packm_mrxmr_diag_1er)
+#undef  packm_nrxnr_diag_1er_ker_name
+#define packm_nrxnr_diag_1er_ker_name  GENARNAME(packm_nrxnr_diag_1er)
+
+#undef  unpackm_mrxk_ker_name
+#define unpackm_mrxk_ker_name  GENARNAME(unpackm_mrxk)
+#undef  unpackm_nrxk_ker_name
+#define unpackm_nrxk_ker_name  GENARNAME(unpackm_nrxk)
 
 // Instantiate prototypes for above functions via the level-1m kernel API
 // template.
@@ -259,11 +229,10 @@ void GENBARNAME(cntx_init)
      )
 {
 	blksz_t  blkszs[ BLIS_NUM_BLKSZS ];
-	blksz_t  thresh[ BLIS_NUM_THRESH ];
 	func_t*  funcs;
 	mbool_t* mbools;
 	dim_t    i;
-	void**   vfuncs;
+	void_fp* vfuncs;
 
 
 	// -- Clear the context ----------------------------------------------------
@@ -273,59 +242,87 @@ void GENBARNAME(cntx_init)
 
 	// -- Set blocksizes -------------------------------------------------------
 
+	// NOTE: The register blocksizes and packm broadcast factors used here come
+	// from the macros defined in bli_kernel_defs_<family>.h or, when that header
+	// does not define them, from the generic defaults in bli_kernel_macro_defs.h.
+	// Configurations should still initialize the blocksizes in the context
+	// explicitly, but using the correct values here helps to prevent accidents.
+	//                                                    s              d              c              z
+	bli_blksz_init_easy( &blkszs[ BLIS_KR  ],             1,             1,             1,             1 );
+	bli_blksz_init     ( &blkszs[ BLIS_MR  ],     BLIS_MR_s,     BLIS_MR_d,     BLIS_MR_c,     BLIS_MR_z,
+	                                          BLIS_PACKMR_s, BLIS_PACKMR_d, BLIS_PACKMR_c, BLIS_PACKMR_z );
+	bli_blksz_init     ( &blkszs[ BLIS_NR  ],     BLIS_NR_s,     BLIS_NR_d,     BLIS_NR_c,     BLIS_NR_z,
+	                                          BLIS_PACKNR_s, BLIS_PACKNR_d, BLIS_PACKNR_c, BLIS_PACKNR_z );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC  ],           256,           128,           128,            64 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC  ],           256,           256,           256,           256 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC  ],          4096,          4096,          4096,          4096 );
+	bli_blksz_init_easy( &blkszs[ BLIS_M2  ],          1000,          1000,          1000,          1000 );
+	bli_blksz_init_easy( &blkszs[ BLIS_N2  ],          1000,          1000,          1000,          1000 );
+	bli_blksz_init_easy( &blkszs[ BLIS_AF  ],             8,             8,             8,             8 );
+	bli_blksz_init_easy( &blkszs[ BLIS_DF  ],             6,             6,             6,             6 );
+	bli_blksz_init_easy( &blkszs[ BLIS_XF  ],             4,             4,             4,             4 );
+	bli_blksz_init_easy( &blkszs[ BLIS_BBM ],    BLIS_BBM_s,    BLIS_BBM_d,    BLIS_BBM_c,    BLIS_BBM_z );
+	bli_blksz_init_easy( &blkszs[ BLIS_BBN ],    BLIS_BBN_s,    BLIS_BBN_d,    BLIS_BBN_c,    BLIS_BBN_z );
+
+	// -- Set level-3 small/unpacked thresholds --------------------------------
+
+	// NOTE: The default thresholds are set to zero so that the sup framework
+	// does not activate by default. Note that the semantic meaning of the
+	// thresholds is that the sup code path is executed if a dimension is
+	// strictly less than its corresponding threshold. So actually, the
+	// thresholds specify the minimum dimension size that will still dispatch
+	// the non-sup/large code path. This "strictly less than" behavior was
+	// chosen over "less than or equal to" so that threshold values of 0 would
+	// effectively disable sup (even for matrix dimensions of 0).
 	//                                          s     d     c     z
-	bli_blksz_init_easy( &blkszs[ BLIS_KR ],    1,    1,    1,    1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    4,    4,    4,    4 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],   16,    8,    8,    4 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],  256,  128,  128,   64 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],  256,  256,  256,  256 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 );
-	bli_blksz_init_easy( &blkszs[ BLIS_M2 ], 1000, 1000, 1000, 1000 );
-	bli_blksz_init_easy( &blkszs[ BLIS_N2 ], 1000, 1000, 1000, 1000 );
-	bli_blksz_init_easy( &blkszs[ BLIS_AF ],    8,    8,    8,    8 );
-	bli_blksz_init_easy( &blkszs[ BLIS_DF ],    6,    6,    6,    6 );
-	bli_blksz_init_easy( &blkszs[ BLIS_XF ],    4,    4,    4,    4 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MT ],    0,    0,    0,    0 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NT ],    0,    0,    0,    0 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KT ],    0,    0,    0,    0 );
 
 	// Initialize the context with the default blocksize objects and their
 	// multiples.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 11,
-	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
-	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
-	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
-	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
-	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-	  BLIS_KR, &blkszs[ BLIS_KR ], BLIS_KR,
-	  BLIS_M2, &blkszs[ BLIS_M2 ], BLIS_M2,
-	  BLIS_N2, &blkszs[ BLIS_N2 ], BLIS_N2,
-	  BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
-	  BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
-	  BLIS_XF, &blkszs[ BLIS_XF ], BLIS_XF,
-	  cntx
+	  cntx,
+	  BLIS_NC,  &blkszs[ BLIS_NC  ], BLIS_NR,
+	  BLIS_KC,  &blkszs[ BLIS_KC  ], BLIS_KR,
+	  BLIS_MC,  &blkszs[ BLIS_MC  ], BLIS_MR,
+	  BLIS_NR,  &blkszs[ BLIS_NR  ], BLIS_NR,
+	  BLIS_MR,  &blkszs[ BLIS_MR  ], BLIS_MR,
+	  BLIS_KR,  &blkszs[ BLIS_KR  ], BLIS_KR,
+	  BLIS_M2,  &blkszs[ BLIS_M2  ], BLIS_M2,
+	  BLIS_N2,  &blkszs[ BLIS_N2  ], BLIS_N2,
+	  BLIS_AF,  &blkszs[ BLIS_AF  ], BLIS_AF,
+	  BLIS_DF,  &blkszs[ BLIS_DF  ], BLIS_DF,
+	  BLIS_XF,  &blkszs[ BLIS_XF  ], BLIS_XF,
+	  BLIS_MT,  &blkszs[ BLIS_MT  ], BLIS_MT,
+	  BLIS_NT,  &blkszs[ BLIS_NT  ], BLIS_NT,
+	  BLIS_KT,  &blkszs[ BLIS_KT  ], BLIS_KT,
+	  BLIS_BBM, &blkszs[ BLIS_BBM ], BLIS_BBM,
+	  BLIS_BBN, &blkszs[ BLIS_BBN ], BLIS_BBN,
+	  BLIS_VA_END
 	);
 
 
 	// -- Set level-3 virtual micro-kernels ------------------------------------
 
-	funcs = bli_cntx_l3_vir_ukrs_buf( cntx );
+	funcs = bli_cntx_ukrs_buf( cntx );
 
 	// NOTE: We set the virtual micro-kernel slots to contain the addresses
 	// of the native micro-kernels. In general, the ukernels in the virtual
 	// ukernel slots are always called, and if the function called happens to
 	// be a virtual micro-kernel, it will then know to find its native ukernel
 	// (i.e., in the native ukernel slots).
-	gen_func_init( &funcs[ BLIS_GEMM_UKR ],       gemm_ukr_name       );
-	gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name );
-	gen_func_init( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name );
-	gen_func_init( &funcs[ BLIS_TRSM_L_UKR ],     trsm_l_ukr_name     );
-	gen_func_init( &funcs[ BLIS_TRSM_U_UKR ],     trsm_u_ukr_name     );
+	gen_func_init( &funcs[ BLIS_GEMM_VIR_UKR ],       gemm_ukr_name       );
+	gen_func_init( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm_l_ukr_name );
+	gen_func_init( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm_u_ukr_name );
+	gen_func_init( &funcs[ BLIS_TRSM_L_VIR_UKR ],     trsm_l_ukr_name     );
+	gen_func_init( &funcs[ BLIS_TRSM_U_VIR_UKR ],     trsm_u_ukr_name     );
 
 
 	// -- Set level-3 native micro-kernels and preferences ---------------------
 
-	funcs  = bli_cntx_l3_nat_ukrs_buf( cntx );
-	mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );
+	mbools = bli_cntx_ukr_prefs_buf( cntx );
 
 	gen_func_init( &funcs[ BLIS_GEMM_UKR ],       gemm_ukr_name       );
 	gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name );
@@ -333,108 +330,47 @@ void GENBARNAME(cntx_init)
 	gen_func_init( &funcs[ BLIS_TRSM_L_UKR ],     trsm_l_ukr_name     );
 	gen_func_init( &funcs[ BLIS_TRSM_U_UKR ],     trsm_u_ukr_name     );
 
-	//                                                  s      d      c      z
-	bli_mbool_init( &mbools[ BLIS_GEMM_UKR ],        TRUE,  TRUE,  TRUE,  TRUE );
-	bli_mbool_init( &mbools[ BLIS_GEMMTRSM_L_UKR ], FALSE, FALSE, FALSE, FALSE );
-	bli_mbool_init( &mbools[ BLIS_GEMMTRSM_U_UKR ], FALSE, FALSE, FALSE, FALSE );
-	bli_mbool_init( &mbools[ BLIS_TRSM_L_UKR ],     FALSE, FALSE, FALSE, FALSE );
-	bli_mbool_init( &mbools[ BLIS_TRSM_U_UKR ],     FALSE, FALSE, FALSE, FALSE );
-
-
-	// -- Set level-3 small/unpacked thresholds --------------------------------
-
-	// NOTE: The default thresholds are set to zero so that the sup framework
-	// does not activate by default. Note that the semantic meaning of the
-	// thresholds is that the sup code path is executed if a dimension is
-	// strictly less than its corresponding threshold. So actually, the
-	// thresholds specify the minimum dimension size that will still dispatch
-	// the non-sup/large code path. This "strictly less than" behavior was
-	// chosen over "less than or equal to" so that threshold values of 0 would
-	// effectively disable sup (even for matrix dimensions of 0).
-	//                                          s     d     c     z
-	bli_blksz_init_easy( &thresh[ BLIS_MT ],    0,    0,    0,    0 );
-	bli_blksz_init_easy( &thresh[ BLIS_NT ],    0,    0,    0,    0 );
-	bli_blksz_init_easy( &thresh[ BLIS_KT ],    0,    0,    0,    0 );
-
-	// Initialize the context with the default thresholds.
-	bli_cntx_set_l3_sup_thresh
-	(
-	  3,
-	  BLIS_MT, &thresh[ BLIS_MT ],
-	  BLIS_NT, &thresh[ BLIS_NT ],
-	  BLIS_KT, &thresh[ BLIS_KT ],
-	  cntx
-	);
-
-
-	// -- Set level-3 small/unpacked handlers ----------------------------------
-
-	vfuncs = bli_cntx_l3_sup_handlers_buf( cntx );
-
-	// Initialize all of the function pointers to NULL;
-	for ( i = 0; i < BLIS_NUM_LEVEL3_OPS; ++i ) vfuncs[ i ] = NULL;
-
-	// The level-3 sup handlers are oapi-based, so we only set one slot per
-	// operation.
-
-	// Set the gemm slot to the default gemm sup handler.
-	vfuncs[ BLIS_GEMM ]  = bli_gemmsup_ref;
-	vfuncs[ BLIS_GEMMT ] = bli_gemmtsup_ref;
+	//                                                           s      d      c      z
+	bli_mbool_init( &mbools[ BLIS_GEMM_UKR_ROW_PREF ],        TRUE,  TRUE,  TRUE,  TRUE );
+	bli_mbool_init( &mbools[ BLIS_GEMMTRSM_L_UKR_ROW_PREF ], FALSE, FALSE, FALSE, FALSE );
+	bli_mbool_init( &mbools[ BLIS_GEMMTRSM_U_UKR_ROW_PREF ], FALSE, FALSE, FALSE, FALSE );
+	bli_mbool_init( &mbools[ BLIS_TRSM_L_UKR_ROW_PREF ],     FALSE, FALSE, FALSE, FALSE );
+	bli_mbool_init( &mbools[ BLIS_TRSM_U_UKR_ROW_PREF ],     FALSE, FALSE, FALSE, FALSE );
 
 
 	// -- Set level-3 small/unpacked micro-kernels and preferences -------------
 
-	funcs  = bli_cntx_l3_sup_kers_buf( cntx );
-	mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx );
-
-#if 0
-	// Adhere to the small/unpacked ukernel mappings:
-	// - rv -> rrr, rcr
-	// - rg -> rrc, rcc
-	// - cv -> ccr, ccc
-	// - cg -> crr, crc
-	gen_sup_func_init( &funcs[ BLIS_RRR ],
-	                   &funcs[ BLIS_RCR ], gemmsup_rv_ukr_name );
-	gen_sup_func_init( &funcs[ BLIS_RRC ],
-	                   &funcs[ BLIS_RCC ], gemmsup_rg_ukr_name );
-	gen_sup_func_init( &funcs[ BLIS_CCR ],
-	                   &funcs[ BLIS_CCC ], gemmsup_cv_ukr_name );
-	gen_sup_func_init( &funcs[ BLIS_CRR ],
-	                   &funcs[ BLIS_CRC ], gemmsup_cg_ukr_name );
-#endif
-	gen_func_init( &funcs[ BLIS_RRR ], gemmsup_rv_ukr_name );
-	gen_func_init( &funcs[ BLIS_RRC ], gemmsup_rv_ukr_name );
-	gen_func_init( &funcs[ BLIS_RCR ], gemmsup_rv_ukr_name );
-	gen_func_init( &funcs[ BLIS_RCC ], gemmsup_rv_ukr_name );
-	gen_func_init( &funcs[ BLIS_CRR ], gemmsup_rv_ukr_name );
-	gen_func_init( &funcs[ BLIS_CRC ], gemmsup_rv_ukr_name );
-	gen_func_init( &funcs[ BLIS_CCR ], gemmsup_rv_ukr_name );
-	gen_func_init( &funcs[ BLIS_CCC ], gemmsup_rv_ukr_name );
+	gen_func_init( &funcs[ BLIS_GEMMSUP_RRR_UKR ], gemmsup_rv_ukr_name );
+	gen_func_init( &funcs[ BLIS_GEMMSUP_RRC_UKR ], gemmsup_rv_ukr_name );
+	gen_func_init( &funcs[ BLIS_GEMMSUP_RCR_UKR ], gemmsup_rv_ukr_name );
+	gen_func_init( &funcs[ BLIS_GEMMSUP_RCC_UKR ], gemmsup_rv_ukr_name );
+	gen_func_init( &funcs[ BLIS_GEMMSUP_CRR_UKR ], gemmsup_rv_ukr_name );
+	gen_func_init( &funcs[ BLIS_GEMMSUP_CRC_UKR ], gemmsup_rv_ukr_name );
+	gen_func_init( &funcs[ BLIS_GEMMSUP_CCR_UKR ], gemmsup_rv_ukr_name );
+	gen_func_init( &funcs[ BLIS_GEMMSUP_CCC_UKR ], gemmsup_rv_ukr_name );
 
 	// Register the general-stride/generic ukernel to the "catch-all" slot
 	// associated with the BLIS_XXX enum value. This slot will be queried if
 	// *any* operand is stored with general stride.
-	gen_func_init( &funcs[ BLIS_XXX ], gemmsup_gx_ukr_name );
+	gen_func_init( &funcs[ BLIS_GEMMSUP_XXX_UKR ], gemmsup_gx_ukr_name );
 
 
 	// Set the l3 sup ukernel storage preferences.
-	//                                       s      d      c      z
-	bli_mbool_init( &mbools[ BLIS_RRR ],  TRUE,  TRUE,  TRUE,  TRUE );
-	bli_mbool_init( &mbools[ BLIS_RRC ],  TRUE,  TRUE,  TRUE,  TRUE );
-	bli_mbool_init( &mbools[ BLIS_RCR ],  TRUE,  TRUE,  TRUE,  TRUE );
-	bli_mbool_init( &mbools[ BLIS_RCC ],  TRUE,  TRUE,  TRUE,  TRUE );
-	bli_mbool_init( &mbools[ BLIS_CRR ],  TRUE,  TRUE,  TRUE,  TRUE );
-	bli_mbool_init( &mbools[ BLIS_CRC ],  TRUE,  TRUE,  TRUE,  TRUE );
-	bli_mbool_init( &mbools[ BLIS_CCR ],  TRUE,  TRUE,  TRUE,  TRUE );
-	bli_mbool_init( &mbools[ BLIS_CCC ],  TRUE,  TRUE,  TRUE,  TRUE );
+	//                                                            s      d      c      z
+	bli_mbool_init( &mbools[ BLIS_GEMMSUP_RRR_UKR_ROW_PREF ],  TRUE,  TRUE,  TRUE,  TRUE );
+	bli_mbool_init( &mbools[ BLIS_GEMMSUP_RRC_UKR_ROW_PREF ],  TRUE,  TRUE,  TRUE,  TRUE );
+	bli_mbool_init( &mbools[ BLIS_GEMMSUP_RCR_UKR_ROW_PREF ],  TRUE,  TRUE,  TRUE,  TRUE );
+	bli_mbool_init( &mbools[ BLIS_GEMMSUP_RCC_UKR_ROW_PREF ],  TRUE,  TRUE,  TRUE,  TRUE );
+	bli_mbool_init( &mbools[ BLIS_GEMMSUP_CRR_UKR_ROW_PREF ],  TRUE,  TRUE,  TRUE,  TRUE );
+	bli_mbool_init( &mbools[ BLIS_GEMMSUP_CRC_UKR_ROW_PREF ],  TRUE,  TRUE,  TRUE,  TRUE );
+	bli_mbool_init( &mbools[ BLIS_GEMMSUP_CCR_UKR_ROW_PREF ],  TRUE,  TRUE,  TRUE,  TRUE );
+	bli_mbool_init( &mbools[ BLIS_GEMMSUP_CCC_UKR_ROW_PREF ],  TRUE,  TRUE,  TRUE,  TRUE );
 
-	bli_mbool_init( &mbools[ BLIS_XXX ],  TRUE,  TRUE,  TRUE,  TRUE );
+	bli_mbool_init( &mbools[ BLIS_GEMMSUP_XXX_UKR_ROW_PREF ],  TRUE,  TRUE,  TRUE,  TRUE );
 
 
 	// -- Set level-1f kernels -------------------------------------------------
 
-	funcs = bli_cntx_l1f_kers_buf( cntx );
-
 	gen_func_init( &funcs[ BLIS_AXPY2V_KER ],    axpy2v_ker_name    );
 	gen_func_init( &funcs[ BLIS_DOTAXPYV_KER ],  dotaxpyv_ker_name  );
 	gen_func_init( &funcs[ BLIS_AXPYF_KER ],     axpyf_ker_name     );
@@ -444,8 +380,6 @@ void GENBARNAME(cntx_init)
 
 	// -- Set level-1v kernels -------------------------------------------------
 
-	funcs = bli_cntx_l1v_kers_buf( cntx );
-
 	gen_func_init( &funcs[ BLIS_ADDV_KER ],    addv_ker_name    );
 	gen_func_init( &funcs[ BLIS_AMAXV_KER ],   amaxv_ker_name   );
 	gen_func_init( &funcs[ BLIS_AXPBYV_KER ],  axpbyv_ker_name  );
@@ -464,41 +398,35 @@ void GENBARNAME(cntx_init)
 
 	// -- Set level-1m (packm/unpackm) kernels ---------------------------------
 
-	funcs = bli_cntx_packm_kers_buf( cntx );
+	gen_func_init( &funcs[ BLIS_PACKM_MRXK_KER ],  packm_mrxk_ker_name );
+	gen_func_init( &funcs[ BLIS_PACKM_NRXK_KER ],  packm_nrxk_ker_name );
 
-	// Initialize all packm kernel func_t entries to NULL.
-	for ( i = BLIS_PACKM_0XK_KER; i <= BLIS_PACKM_31XK_KER; ++i )
-	{
-		bli_func_init_null( &funcs[ i ] );
-	}
+	gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_1ER_KER ],  packm_mrxk_1er_ker_name );
+	gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_1ER_KER ],  packm_nrxk_1er_ker_name );
 
-	gen_func_init( &funcs[ BLIS_PACKM_2XK_KER ],  packm_2xk_ker_name );
-	gen_func_init( &funcs[ BLIS_PACKM_3XK_KER ],  packm_3xk_ker_name );
-	gen_func_init( &funcs[ BLIS_PACKM_4XK_KER ],  packm_4xk_ker_name );
-	gen_func_init( &funcs[ BLIS_PACKM_6XK_KER ],  packm_6xk_ker_name );
-	gen_func_init( &funcs[ BLIS_PACKM_8XK_KER ],  packm_8xk_ker_name );
-	gen_func_init( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_ker_name );
-	gen_func_init( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_ker_name );
-	gen_func_init( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_ker_name );
-	gen_func_init( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_ker_name );
-	gen_func_init( &funcs[ BLIS_PACKM_24XK_KER ], packm_24xk_ker_name );
-
-	funcs = bli_cntx_unpackm_kers_buf( cntx );
-
-	// Initialize all packm kernel func_t entries to NULL.
-	for ( i = BLIS_UNPACKM_0XK_KER; i <= BLIS_UNPACKM_31XK_KER; ++i )
-	{
-		bli_func_init_null( &funcs[ i ] );
-	}
+	gen_func_init( &funcs[ BLIS_PACKM_MRXMR_DIAG_KER ],  packm_mrxmr_diag_ker_name );
+	gen_func_init( &funcs[ BLIS_PACKM_NRXNR_DIAG_KER ],  packm_nrxnr_diag_ker_name );
+
+	gen_func_init_co( &funcs[ BLIS_PACKM_MRXMR_DIAG_1ER_KER ],  packm_mrxmr_diag_1er_ker_name );
+	gen_func_init_co( &funcs[ BLIS_PACKM_NRXNR_DIAG_1ER_KER ],  packm_nrxnr_diag_1er_ker_name );
+
+	gen_func_init( &funcs[ BLIS_UNPACKM_MRXK_KER ],  unpackm_mrxk_ker_name );
+	gen_func_init( &funcs[ BLIS_UNPACKM_NRXK_KER ],  unpackm_nrxk_ker_name );
+
+
+	// -- Set level-3 small/unpacked handlers ----------------------------------
+
+	vfuncs = bli_cntx_l3_sup_handlers_buf( cntx );
 
-	gen_func_init( &funcs[ BLIS_UNPACKM_2XK_KER ],  unpackm_2xk_ker_name );
-	gen_func_init( &funcs[ BLIS_UNPACKM_4XK_KER ],  unpackm_4xk_ker_name );
-	gen_func_init( &funcs[ BLIS_UNPACKM_6XK_KER ],  unpackm_6xk_ker_name );
-	gen_func_init( &funcs[ BLIS_UNPACKM_8XK_KER ],  unpackm_8xk_ker_name );
-	gen_func_init( &funcs[ BLIS_UNPACKM_10XK_KER ], unpackm_10xk_ker_name );
-	gen_func_init( &funcs[ BLIS_UNPACKM_12XK_KER ], unpackm_12xk_ker_name );
-	gen_func_init( &funcs[ BLIS_UNPACKM_14XK_KER ], unpackm_14xk_ker_name );
-	gen_func_init( &funcs[ BLIS_UNPACKM_16XK_KER ], unpackm_16xk_ker_name );
+	// Initialize all of the function pointers to NULL.
+	for ( i = 0; i < BLIS_NUM_LEVEL3_OPS; ++i ) vfuncs[ i ] = NULL;
+
+	// The level-3 sup handlers are oapi-based, so we only set one slot per
+	// operation.
+
+	// Set the gemm slot to the default gemm sup handler.
+	vfuncs[ BLIS_GEMM ]  = bli_gemmsup_ref;
+	vfuncs[ BLIS_GEMMT ] = bli_gemmtsup_ref;
 
 
 	// -- Set miscellaneous fields ---------------------------------------------
@@ -515,7 +443,6 @@ void GENBAINAME(cntx_init)
      )
 {
 	func_t* funcs;
-	dim_t   i;
 
 	// This function is designed to modify a copy of an existing native
 	// context to enable computation via an induced method for complex
@@ -525,23 +452,23 @@ void GENBAINAME(cntx_init)
 
 	// -- Set induced method level-3 virtual micro-kernels ---------------------
 
-	funcs = bli_cntx_l3_vir_ukrs_buf( cntx );
+	funcs = bli_cntx_ukrs_buf( cntx );
 
 	if ( method == BLIS_1M )
 	{
-		gen_func_init_co( &funcs[ BLIS_GEMM_UKR ],       gemm1m_ukr_name       );
-		gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm1m_l_ukr_name );
-		gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm1m_u_ukr_name );
-		gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ],     trsm1m_l_ukr_name     );
-		gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ],     trsm1m_u_ukr_name     );
+		gen_func_init_co( &funcs[ BLIS_GEMM_VIR_UKR ],       gemm1m_ukr_name       );
+		gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm1m_l_ukr_name );
+		gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm1m_u_ukr_name );
+		gen_func_init_co( &funcs[ BLIS_TRSM_L_VIR_UKR ],     trsm1m_l_ukr_name     );
+		gen_func_init_co( &funcs[ BLIS_TRSM_U_VIR_UKR ],     trsm1m_u_ukr_name     );
 	}
 	else // if ( method == BLIS_NAT )
 	{
-		gen_func_init_co( &funcs[ BLIS_GEMM_UKR ],       gemm_ukr_name       );
-		gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name );
-		gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name );
-		gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ],     trsm_l_ukr_name     );
-		gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ],     trsm_u_ukr_name     );
+		gen_func_init_co( &funcs[ BLIS_GEMM_VIR_UKR ],       gemm_ukr_name       );
+		gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm_l_ukr_name );
+		gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm_u_ukr_name );
+		gen_func_init_co( &funcs[ BLIS_TRSM_L_VIR_UKR ],     trsm_l_ukr_name     );
+		gen_func_init_co( &funcs[ BLIS_TRSM_U_VIR_UKR ],     trsm_u_ukr_name     );
 	}
 
 	// For 1m, we employ an optimization which requires that we copy the native
@@ -556,8 +483,8 @@ void GENBAINAME(cntx_init)
 	// beta has a zero imaginary component and C is either row- or column-stored).
 	if ( method == BLIS_1M )
 	{
-		func_t* gemm_nat_ukrs = bli_cntx_get_l3_nat_ukrs( BLIS_GEMM_UKR, cntx );
-		func_t* gemm_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, cntx );
+		func_t* gemm_nat_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_UKR, cntx );
+		func_t* gemm_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, cntx );
 
 		bli_func_copy_dt( BLIS_FLOAT,  gemm_nat_ukrs, BLIS_FLOAT,  gemm_vir_ukrs );
 		bli_func_copy_dt( BLIS_DOUBLE, gemm_nat_ukrs, BLIS_DOUBLE, gemm_vir_ukrs );
@@ -566,39 +493,23 @@ void GENBAINAME(cntx_init)
 
 	// -- Set induced method packm kernels -------------------------------------
 
-	funcs = bli_cntx_packm_kers_buf( cntx );
-
-	// Initialize all packm kernel func_t entries to NULL.
-	for ( i = BLIS_PACKM_0XK_KER; i <= BLIS_PACKM_31XK_KER; ++i )
-	{
-		bli_func_init_null( &funcs[ i ] );
-	}
-
 	if ( method == BLIS_1M )
 	{
-		gen_func_init_co( &funcs[ BLIS_PACKM_2XK_KER ],  packm_2xk_1er_ker_name );
-		gen_func_init_co( &funcs[ BLIS_PACKM_4XK_KER ],  packm_4xk_1er_ker_name );
-		gen_func_init_co( &funcs[ BLIS_PACKM_6XK_KER ],  packm_6xk_1er_ker_name );
-		gen_func_init_co( &funcs[ BLIS_PACKM_8XK_KER ],  packm_8xk_1er_ker_name );
-		gen_func_init_co( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_1er_ker_name );
-		gen_func_init_co( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_1er_ker_name );
-		gen_func_init_co( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_1er_ker_name );
-		gen_func_init_co( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_1er_ker_name );
+		gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_KER ],  packm_mrxk_1er_ker_name );
+		gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_KER ],  packm_nrxk_1er_ker_name );
 	}
 	else // if ( method == BLIS_NAT )
 	{
-		gen_func_init( &funcs[ BLIS_PACKM_2XK_KER ],  packm_2xk_ker_name );
-		gen_func_init( &funcs[ BLIS_PACKM_3XK_KER ],  packm_3xk_ker_name );
-		gen_func_init( &funcs[ BLIS_PACKM_4XK_KER ],  packm_4xk_ker_name );
-		gen_func_init( &funcs[ BLIS_PACKM_6XK_KER ],  packm_6xk_ker_name );
-		gen_func_init( &funcs[ BLIS_PACKM_8XK_KER ],  packm_8xk_ker_name );
-		gen_func_init( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_ker_name );
-		gen_func_init( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_ker_name );
-		gen_func_init( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_ker_name );
-		gen_func_init( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_ker_name );
-		gen_func_init( &funcs[ BLIS_PACKM_24XK_KER ], packm_24xk_ker_name );
+		gen_func_init( &funcs[ BLIS_PACKM_MRXK_KER ],  packm_mrxk_ker_name );
+		gen_func_init( &funcs[ BLIS_PACKM_NRXK_KER ],  packm_nrxk_ker_name );
 	}
 
+	gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_1ER_KER ],  packm_mrxk_1er_ker_name );
+	gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_1ER_KER ],  packm_nrxk_1er_ker_name );
+
+	gen_func_init( &funcs[ BLIS_UNPACKM_MRXK_KER ],  unpackm_mrxk_ker_name );
+	gen_func_init( &funcs[ BLIS_UNPACKM_NRXK_KER ],  unpackm_nrxk_ker_name );
+
 
 	// -- Set induced method cache and register blocksizes ---------------------
 
@@ -628,50 +539,44 @@ void GENBAINAME(cntx_init_blkszs)
        cntx_t* cntx
      )
 {
-	// We MUST set the induced method in the context prior to calling
-	// bli_cntx_l3_vir_ukr_prefers_cols_dt() because that function queries
-	// the induced method. That function needs the induced method value in
-	// order to determine whether to evaluate the "prefers column storage"
-	// predicate using the storage preference of the kernel for dt, or
-	// the storage preference of the kernel for the real projection of
-	// dt. Failing to set the induced method here can lead to strange
-	// undefined behavior at runtime if the native complex kernel's
-	// storage preference happens to not equal that of the native real
-	// kernel.
+	// Set the induced method in the context.
 	bli_cntx_set_method( method, cntx );
 
+	num_t dt_r = bli_dt_proj_to_real( dt );
+
 	// Initialize the blocksizes according to the micro-kernel preference as
 	// well as the algorithm.
-	if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
+	//if ( bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
+	if ( ! bli_cntx_get_ukr_prefs_dt( dt_r, BLIS_GEMM_UKR_ROW_PREF, cntx ) )
 	{
 		// This branch is used for algorithm 1m_c_bp.
 
 		bli_cntx_set_ind_blkszs
 		(
-		  method, dt, 6,
+		  method, dt, cntx,
 		  BLIS_NC, 1.0, 1.0,
 		  BLIS_KC, 2.0, 2.0, // halve kc...
 		  BLIS_MC, 2.0, 2.0, // halve mc...
 		  BLIS_NR, 1.0, 1.0,
 		  BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr)
 		  BLIS_KR, 1.0, 1.0,
-		  cntx
+		  BLIS_VA_END
 		);
 	}
-	else // if ( bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
+	else // if ( bli_cntx_get_ukr_prefs_dt( dt_r, BLIS_GEMM_UKR_ROW_PREF, cntx ) )
 	{
 		// This branch is used for algorithm 1m_r_bp.
 
 		bli_cntx_set_ind_blkszs
 		(
-		  method, dt, 6,
+		  method, dt, cntx,
 		  BLIS_NC, 2.0, 2.0, // halve nc...
 		  BLIS_KC, 2.0, 2.0, // halve kc...
 		  BLIS_MC, 1.0, 1.0,
 		  BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr)
 		  BLIS_MR, 1.0, 1.0,
 		  BLIS_KR, 1.0, 1.0,
-		  cntx
+		  BLIS_VA_END
 		);
 	}
 }
diff --git a/ref_kernels/ind/bli_gemm1m_ref.c b/ref_kernels/ind/bli_gemm1m_ref.c
index fbd15d695..2f0808389 100644
--- a/ref_kernels/ind/bli_gemm1m_ref.c
+++ b/ref_kernels/ind/bli_gemm1m_ref.c
@@ -55,8 +55,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	const num_t       dt_r      = PASTEMAC(chr,type); \
 \
 	PASTECH(chr,gemm_ukr_ft) \
-	                  rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
-	const bool        col_pref  = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
+	                  rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
+	const bool        col_pref  = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
 	const bool        row_pref  = !col_pref; \
 \
 	const dim_t       mr        = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
index 08823f073..6cfb83cae 100644
--- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c
+++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
@@ -56,12 +56,12 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	const num_t       dt_r        = PASTEMAC(chr,type); \
 \
 	PASTECH(chr,gemm_ukr_ft) \
-	                  rgemm_ukr   = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
+	                  rgemm_ukr   = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
 \
 	PASTECH(ch,trsm_ukr_ft) \
 	                ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \
 \
-	const bool        col_pref_r  = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
+	const bool        col_pref_r  = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
 \
 	const dim_t       mr          = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
 	const dim_t       nr          = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
@@ -89,7 +89,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	ctype_r* restrict bx1_r       = ( ctype_r* )bx1; \
 \
 	const inc_t       rs_b        = packnr; \
-	const inc_t       cs_b        = 1; \
+	const inc_t       cs_b        = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBN, cntx ); \
 \
 	ctype_r* restrict zero_r      = PASTEMAC(chr,0); \
 	ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
@@ -106,7 +106,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	                      __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 	/* FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR
 	   instead? */ \
-	const bool        col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool        col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
 	const inc_t       rs_ct       = ( col_pref ? 1 : nr ); \
 	const inc_t       cs_ct       = ( col_pref ? mr : 1 ); \
 \
@@ -192,24 +192,25 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 	if ( bli_is_1e_packed( schema_b ) ) \
 	{ \
-		const inc_t     ld_b = rs_b; \
+		const inc_t       ld_b   =     rs_b; \
+		const inc_t       rs_b2  = 2 * rs_b; \
+		const inc_t       cs_b2  = 2 * cs_b; \
 \
-		ctype* restrict b11_ri = ( ctype* )b11; \
-		ctype* restrict b11_ir = ( ctype* )b11 + ld_b/2; \
-\
-		dim_t i, j; \
+		ctype_r* restrict b11_ri = ( ctype_r* )b11; \
+		ctype_r* restrict b11_ir = ( ctype_r* )b11 + ld_b; \
 \
 		/* b11 = alpha * b11 + bt; */ \
-		for ( j = 0; j < nr; ++j ) \
-		for ( i = 0; i < mr; ++i ) \
+		for ( dim_t j = 0; j < nr; ++j ) \
+		for ( dim_t i = 0; i < mr; ++i ) \
+		for ( dim_t d = 0; d < cs_b; ++d ) \
 		{ \
-			ctype*   restrict beta11t   = bt     + i*rs_bt + j*cs_bt; \
-			ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \
-			ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \
-			ctype*   restrict beta11_ri = b11_ri + i*rs_b  + j*cs_b; \
-			ctype_r* restrict beta11_r  = &PASTEMAC(ch,real)( *beta11_ri ); \
-			ctype_r* restrict beta11_i  = &PASTEMAC(ch,imag)( *beta11_ri ); \
-			ctype*   restrict beta11_ir = b11_ir + i*rs_b  + j*cs_b; \
+			ctype*   restrict beta11t     = bt     + i*rs_bt + j*cs_bt; \
+			ctype_r* restrict beta11t_r   = &PASTEMAC(ch,real)( *beta11t ); \
+			ctype_r* restrict beta11t_i   = &PASTEMAC(ch,imag)( *beta11t ); \
+			ctype_r* restrict beta11_ri_r = b11_ri + i*rs_b2 + j*cs_b2 + 0*cs_b + d; \
+			ctype_r* restrict beta11_ri_i = b11_ri + i*rs_b2 + j*cs_b2 + 1*cs_b + d; \
+			ctype_r* restrict beta11_ir_r = b11_ir + i*rs_b2 + j*cs_b2 + 0*cs_b + d; \
+			ctype_r* restrict beta11_ir_i = b11_ir + i*rs_b2 + j*cs_b2 + 1*cs_b + d; \
 \
 			PASTEMAC3(ch,chr,ch,xpbyris) \
 			( \
@@ -217,12 +218,12 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			  *beta11t_i, \
 			  alpha_r, \
 			  alpha_i, /* alpha_i not referenced */ \
-			  *beta11_r, \
-			  *beta11_i  \
+			  *beta11_ri_r, \
+			  *beta11_ri_i  \
 			); \
 \
-			PASTEMAC(ch,sets)( -*beta11_i, \
-			                    *beta11_r, *beta11_ir ); \
+			PASTEMAC(ch,copyris)( -*beta11_ri_i, *beta11_ri_r, \
+			                       *beta11_ir_r, *beta11_ir_i ); \
 		} \
 	} \
 	else /* if ( bli_is_1r_packed( schema_b ) ) */ \
@@ -233,18 +234,17 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 		ctype_r* restrict b11_r = ( ctype_r* )b11; \
 		ctype_r* restrict b11_i = ( ctype_r* )b11 + ld_b; \
-\
-		dim_t i, j; \
 \
 		/* b11 = alpha * b11 + bt; */ \
-		for ( j = 0; j < nr; ++j ) \
-		for ( i = 0; i < mr; ++i ) \
+		for ( dim_t j = 0; j < nr; ++j ) \
+		for ( dim_t i = 0; i < mr; ++i ) \
+		for ( dim_t d = 0; d < cs_b; ++d ) \
 		{ \
 			ctype*   restrict beta11t   = bt    + i*rs_bt + j*cs_bt; \
 			ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \
 			ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \
-			ctype_r* restrict beta11_r  = b11_r + i*rs_b2 + j*cs_b2; \
-			ctype_r* restrict beta11_i  = b11_i + i*rs_b2 + j*cs_b2; \
+			ctype_r* restrict beta11_r  = b11_r + i*rs_b2 + j*cs_b2 + d; \
+			ctype_r* restrict beta11_i  = b11_i + i*rs_b2 + j*cs_b2 + d; \
 \
 			PASTEMAC3(ch,chr,ch,xpbyris) \
 			( \
diff --git a/ref_kernels/ind/bli_trsm1m_ref.c b/ref_kernels/ind/bli_trsm1m_ref.c
index 68717f7a6..5eda20f20 100644
--- a/ref_kernels/ind/bli_trsm1m_ref.c
+++ b/ref_kernels/ind/bli_trsm1m_ref.c
@@ -48,6 +48,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
      ) \
 { \
 	const num_t       dt     = PASTEMAC(ch,type); \
+	const num_t       dt_r   = PASTEMAC(chr,type); \
 \
 	const dim_t       mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
 	const dim_t       nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
@@ -58,11 +59,11 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	const dim_t       m      = mr; \
 	const dim_t       n      = nr; \
 \
-	const inc_t       rs_a  = 1; \
-	const inc_t       cs_a  = packmr; \
+	const inc_t       rs_a   = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBM, cntx ); \
+	const inc_t       cs_a   = packmr; \
 \
-	const inc_t       rs_b  = packnr; \
-	const inc_t       cs_b  = 1; \
+	const inc_t       rs_b   = packnr; \
+	const inc_t       cs_b   = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBN, cntx ); \
 \
 	const inc_t       ld_a  = cs_a; \
 	const inc_t       ld_b  = rs_b; \
@@ -77,12 +78,14 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	{ \
 		const inc_t       rs_a2 = 1 * rs_a; \
 		const inc_t       cs_a2 = 2 * cs_a; \
+		const inc_t       rs_b2 = 2 * rs_b; \
+		const inc_t       cs_b2 = 2 * cs_b; \
 \
 		ctype_r* restrict a_r   = ( ctype_r* )a; \
 		ctype_r* restrict a_i   = ( ctype_r* )a + ld_a; \
 \
-		ctype*   restrict b_ri  = ( ctype*   )b; \
-		ctype*   restrict b_ir  = ( ctype*   )b + ld_b/2; \
+		ctype_r* restrict b_ri  = ( ctype_r* )b; \
+		ctype_r* restrict b_ir  = ( ctype_r* )b + ld_b; \
 \
 		for ( iter = 0; iter < m; ++iter ) \
 		{ \
@@ -93,20 +96,22 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			ctype_r* restrict alpha11_i  = a_i  + (i  )*rs_a2 + (i  )*cs_a2; \
 			ctype_r* restrict a10t_r     = a_r  + (i  )*rs_a2 + (0  )*cs_a2; \
 			ctype_r* restrict a10t_i     = a_i  + (i  )*rs_a2 + (0  )*cs_a2; \
-			ctype*   restrict b1_ri      = b_ri + (i  )*rs_b  + (0  )*cs_b; \
-			ctype*   restrict b1_ir      = b_ir + (i  )*rs_b  + (0  )*cs_b; \
-			ctype*   restrict B0_ri      = b_ri + (0  )*rs_b  + (0  )*cs_b; \
+			ctype_r* restrict b1_ri      = b_ri + (i  )*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict b1_ir      = b_ir + (i  )*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict B0_ri      = b_ri + (0  )*rs_b2 + (0  )*cs_b2; \
 \
 			/* b1 = b1 - a10t * B0; */ \
 			/* b1 = b1 / alpha11; */ \
 			for ( j = 0; j < n; ++j ) \
 			{ \
-				ctype*   restrict beta11_ri = b1_ri + (0  )*rs_b + (j  )*cs_b; \
-				ctype*   restrict beta11_ir = b1_ir + (0  )*rs_b + (j  )*cs_b; \
-				ctype*   restrict b01_ri    = B0_ri + (0  )*rs_b + (j  )*cs_b; \
-				ctype*   restrict gamma11   = c     + (i  )*rs_c + (j  )*cs_c; \
-				ctype_r           beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \
-				ctype_r           beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \
+				ctype_r* restrict beta11_ri_r = b1_ri + (0  )*rs_b2 + (j  )*cs_b2 + 0*cs_b; \
+				ctype_r* restrict beta11_ri_i = b1_ri + (0  )*rs_b2 + (j  )*cs_b2 + 1*cs_b; \
+				ctype_r* restrict beta11_ir_r = b1_ir + (0  )*rs_b2 + (j  )*cs_b2 + 0*cs_b; \
+				ctype_r* restrict beta11_ir_i = b1_ir + (0  )*rs_b2 + (j  )*cs_b2 + 1*cs_b; \
+				ctype_r* restrict b01_ri      = B0_ri + (0  )*rs_b2 + (j  )*cs_b2; \
+				ctype*   restrict gamma11     = c     + (i  )*rs_c  + (j  )*cs_c; \
+				ctype_r           beta11c_r   = *beta11_ri_r; \
+				ctype_r           beta11c_i   = *beta11_ri_i; \
 				ctype_r           rho11_r; \
 				ctype_r           rho11_i; \
 \
@@ -117,9 +122,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 				{ \
 					ctype_r* restrict alpha10_r = a10t_r  + (l  )*cs_a2; \
 					ctype_r* restrict alpha10_i = a10t_i  + (l  )*cs_a2; \
-					ctype*   restrict beta01_ri = b01_ri  + (l  )*rs_b; \
-					ctype_r* restrict beta01_r  = &PASTEMAC(ch,real)( *beta01_ri ); \
-					ctype_r* restrict beta01_i  = &PASTEMAC(ch,imag)( *beta01_ri ); \
+					ctype_r* restrict beta01_r  = b01_ri  + (l  )*rs_b2 + 0*cs_b; \
+					ctype_r* restrict beta01_i  = b01_ri  + (l  )*rs_b2 + 1*cs_b; \
 \
 					PASTEMAC(ch,axpyris)( *alpha10_r, \
 					                      *alpha10_i, \
@@ -147,8 +151,11 @@ void PASTEMAC3(ch,opname,arch,suf) \
 				PASTEMAC(ch,sets)(  beta11c_r, beta11c_i, *gamma11 ); \
 \
 				/* Store the local values back to b11. */ \
-				PASTEMAC(ch,sets)(  beta11c_r, beta11c_i, *beta11_ri ); \
-				PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \
+				for ( dim_t d = 0; d < cs_b; ++d ) \
+				{ \
+					PASTEMAC(ch,copyris)(  beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \
+					PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \
+                } \
 			} \
 		} \
 	} \
@@ -229,10 +236,11 @@ void PASTEMAC3(ch,opname,arch,suf) \
 				                   beta11c_i, *gamma11 ); \
 \
 				/* Store the local values back to b11. */ \
-				PASTEMAC(ch,copyris)( beta11c_r, \
-				                      beta11c_i, \
-				                      *beta11_r, \
-				                      *beta11_i ); \
+				for ( dim_t d = 0; d < cs_b; ++d ) \
+					PASTEMAC(ch,copyris)( beta11c_r, \
+					                      beta11c_i, \
+					                      *(beta11_r + d), \
+					                      *(beta11_i + d) ); \
 			} \
 		} \
 	} \
@@ -258,6 +266,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
      ) \
 { \
 	const num_t       dt     = PASTEMAC(ch,type); \
+	const num_t       dt_r   = PASTEMAC(chr,type); \
 \
 	const dim_t       mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
 	const dim_t       nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
@@ -268,11 +277,11 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	const dim_t       m      = mr; \
 	const dim_t       n      = nr; \
 \
-	const inc_t       rs_a  = 1; \
-	const inc_t       cs_a  = packmr; \
+	const inc_t       rs_a   = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBM, cntx ); \
+	const inc_t       cs_a   = packmr; \
 \
-	const inc_t       rs_b  = packnr; \
-	const inc_t       cs_b  = 1; \
+	const inc_t       rs_b   = packnr; \
+	const inc_t       cs_b   = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBN, cntx ); \
 \
 	const inc_t       ld_a  = cs_a; \
 	const inc_t       ld_b  = rs_b; \
@@ -287,12 +296,14 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	{ \
 		const inc_t       rs_a2 = 1 * rs_a; \
 		const inc_t       cs_a2 = 2 * cs_a; \
+		const inc_t       rs_b2 = 2 * rs_b; \
+		const inc_t       cs_b2 = 2 * cs_b; \
 \
 		ctype_r* restrict a_r   = ( ctype_r* )a; \
 		ctype_r* restrict a_i   = ( ctype_r* )a + ld_a; \
 \
-		ctype*   restrict b_ri  = ( ctype*   )b; \
-		ctype*   restrict b_ir  = ( ctype*   )b + ld_b/2; \
+		ctype_r* restrict b_ri  = ( ctype_r* )b; \
+		ctype_r* restrict b_ir  = ( ctype_r* )b + ld_b; \
 \
 		for ( iter = 0; iter < m; ++iter ) \
 		{ \
@@ -303,20 +314,22 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			ctype_r* restrict alpha11_i  = a_i  + (i  )*rs_a2 + (i  )*cs_a2; \
 			ctype_r* restrict a12t_r     = a_r  + (i  )*rs_a2 + (i+1)*cs_a2; \
 			ctype_r* restrict a12t_i     = a_i  + (i  )*rs_a2 + (i+1)*cs_a2; \
-			ctype*   restrict b1_ri      = b_ri + (i  )*rs_b  + (0  )*cs_b; \
-			ctype*   restrict b1_ir      = b_ir + (i  )*rs_b  + (0  )*cs_b; \
-			ctype*   restrict B2_ri      = b_ri + (i+1)*rs_b  + (0  )*cs_b; \
+			ctype_r* restrict b1_ri      = b_ri + (i  )*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict b1_ir      = b_ir + (i  )*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict B2_ri      = b_ri + (i+1)*rs_b2 + (0  )*cs_b2; \
 \
 			/* b1 = b1 - a12t * B2; */ \
 			/* b1 = b1 / alpha11; */ \
 			for ( j = 0; j < n; ++j ) \
 			{ \
-				ctype*   restrict beta11_ri = b1_ri + (0  )*rs_b + (j  )*cs_b; \
-				ctype*   restrict beta11_ir = b1_ir + (0  )*rs_b + (j  )*cs_b; \
-				ctype*   restrict b21_ri    = B2_ri + (0  )*rs_b + (j  )*cs_b; \
-				ctype*   restrict gamma11   = c     + (i  )*rs_c + (j  )*cs_c; \
-				ctype_r           beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \
-				ctype_r           beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \
+				ctype_r* restrict beta11_ri_r = b1_ri + (0  )*rs_b2 + (j  )*cs_b2 + 0*cs_b; \
+				ctype_r* restrict beta11_ri_i = b1_ri + (0  )*rs_b2 + (j  )*cs_b2 + 1*cs_b; \
+				ctype_r* restrict beta11_ir_r = b1_ir + (0  )*rs_b2 + (j  )*cs_b2 + 0*cs_b; \
+				ctype_r* restrict beta11_ir_i = b1_ir + (0  )*rs_b2 + (j  )*cs_b2 + 1*cs_b; \
+				ctype_r* restrict b21_ri      = B2_ri + (0  )*rs_b2 + (j  )*cs_b2; \
+				ctype*   restrict gamma11     = c     + (i  )*rs_c + (j  )*cs_c; \
+				ctype_r           beta11c_r   = *beta11_ri_r; \
+				ctype_r           beta11c_i   = *beta11_ri_i; \
 				ctype_r           rho11_r; \
 				ctype_r           rho11_i; \
 \
@@ -325,11 +338,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
 				                      rho11_i ); \
 				for ( l = 0; l < n_behind; ++l ) \
 				{ \
-					ctype_r* restrict alpha12_r = a12t_r  + (l  )*cs_a2; \
-					ctype_r* restrict alpha12_i = a12t_i  + (l  )*cs_a2; \
-					ctype*   restrict beta21_ri = b21_ri  + (l  )*rs_b; \
-					ctype_r* restrict beta21_r  = &PASTEMAC(ch,real)( *beta21_ri ); \
-					ctype_r* restrict beta21_i  = &PASTEMAC(ch,imag)( *beta21_ri ); \
+					ctype_r* restrict alpha12_r = a12t_r + (l  )*cs_a2; \
+					ctype_r* restrict alpha12_i = a12t_i + (l  )*cs_a2; \
+					ctype_r* restrict beta21_r  = b21_ri + (l  )*rs_b2 + 0*cs_b; \
+					ctype_r* restrict beta21_i  = b21_ri + (l  )*rs_b2 + 1*cs_b; \
 \
 					PASTEMAC(ch,axpyris)( *alpha12_r, \
 					                      *alpha12_i, \
@@ -357,8 +369,11 @@ void PASTEMAC3(ch,opname,arch,suf) \
 				PASTEMAC(ch,sets)(  beta11c_r, beta11c_i, *gamma11 ); \
 \
 				/* Store the local values back to b11. */ \
-				PASTEMAC(ch,sets)(  beta11c_r, beta11c_i, *beta11_ri ); \
-				PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \
+				for ( dim_t d = 0; d < cs_b; ++d ) \
+				{ \
+					PASTEMAC(ch,copyris)(  beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \
+					PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \
+                } \
 			} \
 		} \
 	} \
@@ -439,10 +454,11 @@ void PASTEMAC3(ch,opname,arch,suf) \
 				                   beta11c_i, *gamma11 ); \
 \
 				/* Store the local values back to b11. */ \
-				PASTEMAC(ch,copyris)( beta11c_r, \
-				                      beta11c_i, \
-				                      *beta11_r, \
-				                      *beta11_i ); \
+				for ( dim_t d = 0; d < cs_b; ++d ) \
+					PASTEMAC(ch,copyris)( beta11c_r, \
+					                      beta11c_i, \
+					                      *(beta11_r + d), \
+					                      *(beta11_i + d) ); \
 			} \
 		} \
 	} \
diff --git a/sandbox/gemmlike/attic/bls_gemm_bp_var2.c b/sandbox/gemmlike/attic/bls_gemm_bp_var2.c
index 957cd5794..8caccf923 100644
--- a/sandbox/gemmlike/attic/bls_gemm_bp_var2.c
+++ b/sandbox/gemmlike/attic/bls_gemm_bp_var2.c
@@ -157,7 +157,7 @@ void PASTECH2(bls_,ch,varname) \
 	   function pointer type. */ \
 	/*
 	PASTECH(ch,gemm_ukr_ft) \
-               gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+               gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	*/ \
 \
 	/* Temporary C buffer for edge cases. Note that the strides of this
@@ -168,7 +168,7 @@ void PASTECH2(bls_,ch,varname) \
 	ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                / sizeof( ctype ) ] \
 	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	const inc_t rs_ct   = ( col_pref ? 1 : NR ); \
 	const inc_t cs_ct   = ( col_pref ? MR : 1 ); \
 	*/ \
@@ -524,7 +524,7 @@ void PASTECH2(bls_,ch,varname) \
 	/* Query the context for the microkernel address and cast it to its
 	   function pointer type. */ \
 	PASTECH(ch,gemm_ukr_ft) \
-               gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+               gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
 \
 	/* Temporary C buffer for edge cases. Note that the strides of this
 	   temporary buffer are set so that they match the storage of the
@@ -533,7 +533,7 @@ void PASTECH2(bls_,ch,varname) \
 	ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                / sizeof( ctype ) ] \
 	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	const inc_t rs_ct   = ( col_pref ? 1 : NR ); \
 	const inc_t cs_ct   = ( col_pref ? MR : 1 ); \
 \
diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c
index f2f8b7e25..ec5d8d5b1 100644
--- a/sandbox/gemmlike/bls_gemm.c
+++ b/sandbox/gemmlike/bls_gemm.c
@@ -134,7 +134,7 @@ void bls_gemm_ex
 	// contiguous columns, or if C is stored by columns and the micro-kernel
 	// prefers contiguous rows, transpose the entire operation to allow the
 	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
+	if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
 	{
 		bli_obj_swap( &a_local, &b_local );
 
diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c
index 62dc462d5..1e3e5ea03 100644
--- a/sandbox/gemmlike/bls_gemm_bp_var1.c
+++ b/sandbox/gemmlike/bls_gemm_bp_var1.c
@@ -156,7 +156,7 @@ void PASTECH2(bls_,ch,varname) \
 	/* Query the context for the microkernel address and cast it to its
 	   function pointer type. */ \
 	PASTECH(ch,gemm_ukr_ft) \
-               gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+               gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
 \
 	/* Compute partitioning step values for each matrix of each loop. */ \
 	const inc_t jcstep_c = cs_c; \
diff --git a/sandbox/gemmlike/bls_packm_cxk.c b/sandbox/gemmlike/bls_packm_cxk.c
index ca11c207c..2ed178c65 100644
--- a/sandbox/gemmlike/bls_packm_cxk.c
+++ b/sandbox/gemmlike/bls_packm_cxk.c
@@ -54,15 +54,16 @@ void PASTECH2(bls_,ch,opname) \
 	/* Note that we use panel_dim_max, not panel_dim, to query the packm
 	   kernel function pointer. This means that we always use the same
 	   kernel, even for edge cases. */ \
-	num_t     dt     = PASTEMAC(ch,type); \
-	l1mkr_t   ker_id = panel_dim_max; \
+	num_t dt     = PASTEMAC(ch,type); \
+	ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \
+	                                           : BLIS_PACKM_MRXK_KER; \
 \
 	PASTECH2(ch,opname,_ker_ft) f; \
 \
 	/* Query the context for the packm kernel corresponding to the current
 	   panel dimension, or kernel id. If the id is invalid, the function will
 	   return NULL. */ \
-	f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
+	f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
 \
 	/* If there exists a kernel implementation for the micro-panel dimension
 	   provided, we invoke the implementation. Otherwise, we use scal2m. */ \
diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c
index b07da91cc..9568dfee7 100644
--- a/testsuite/src/test_trsm_ukr.c
+++ b/testsuite/src/test_trsm_ukr.c
@@ -305,7 +305,7 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 	if ( bli_obj_is_complex( &b ) ) *perf *= 4.0;
 
 	// Perform checks.
-	libblis_test_trsm_ukr_check( params, side, &ap, &c, &b, resid );
+	libblis_test_trsm_ukr_check( params, side, &a, &c, &b, resid );
 
 	// Zero out performance and residual if output matrix is empty.
 	//libblis_test_check_empty_problem( &c, perf, resid );
@@ -418,9 +418,11 @@ void libblis_test_trsm_ukr_check
 bli_printm( "a11", a, "%5.2f", "" );
 #endif
 
+#if 0
 	// Restore the diagonal of a11 to its original, un-inverted state
 	// (needed for trsv).
 	bli_invertd( a );
+#endif
 
 	if ( bli_is_left( side ) )
 	{

From 9fea633748ed27ef3853bba7cd955690c61092b4 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Wed, 13 Apr 2022 15:59:06 -0500
Subject: [PATCH 051/230] Partial addition of 'const' to all interfaces above
 the (micro)kernels. (#625)

Details:
- Added 'const' qualifier to applicable function arguments wherever the
  pointed-to object is not internally modified. This change affects
  all interfaces that reside above the level of the (micro)kernels.
- Typecast certain function return values to discard 'const' qualifier.
- Removed 'restrict' from various arguments, including cntx_t*,
  auxinfo_t*, rntm_t*, thrinfo_t*, mem_t*, and others.
- Removed parts of some APIs, such as bli_cntx_*(), due to limited use.
- Merged some variable declarations with their corresponding
  initialization statements.
- Whitespace changes.
---
 build/detect/config/config_detect.c           |    4 +-
 .../kernels/1/bli_axpyv_template_noopt_var1.c |    2 +-
 .../kernels/1/bli_dotv_template_noopt_var1.c  |    4 +-
 .../1f/bli_axpy2v_template_noopt_var1.c       |    2 +-
 .../1f/bli_axpyf_template_noopt_var1.c        |    2 +-
 .../1f/bli_dotaxpyv_template_noopt_var1.c     |    2 +-
 .../1f/bli_dotxaxpyf_template_noopt_var1.c    |    2 +-
 .../1f/bli_dotxf_template_noopt_var1.c        |    4 +-
 .../kernels/3/bli_gemm_template_noopt_mxn.c   |    4 +-
 .../3/bli_gemmtrsm_l_template_noopt_mxn.c     |    4 +-
 .../3/bli_gemmtrsm_u_template_noopt_mxn.c     |    4 +-
 .../kernels/3/bli_trsm_l_template_noopt_mxn.c |    4 +-
 .../kernels/3/bli_trsm_u_template_noopt_mxn.c |    4 +-
 frame/0/bli_l0_check.c                        |   50 +-
 frame/0/bli_l0_check.h                        |   50 +-
 frame/0/bli_l0_ft.h                           |   42 +-
 frame/0/bli_l0_oapi.c                         |   64 +-
 frame/0/bli_l0_oapi.h                         |   34 +-
 frame/0/bli_l0_tapi.c                         |   64 +-
 frame/0/bli_l0_tapi.h                         |   54 +-
 frame/0/copysc/bli_copysc.c                   |   32 +-
 frame/0/copysc/bli_copysc.h                   |   10 +-
 frame/1/bli_l1v_check.c                       |   94 +-
 frame/1/bli_l1v_check.h                       |   94 +-
 frame/1/bli_l1v_ft.h                          |   92 +-
 frame/1/bli_l1v_ker_prot.h                    |   28 +-
 frame/1/bli_l1v_oapi.c                        |  194 +--
 frame/1/bli_l1v_oapi.h                        |   68 +-
 frame/1/bli_l1v_tapi.c                        |  158 +--
 frame/1/bli_l1v_tapi.h                        |   92 +-
 frame/1d/bli_l1d_check.c                      |   38 +-
 frame/1d/bli_l1d_check.h                      |   38 +-
 frame/1d/bli_l1d_ft.h                         |   86 +-
 frame/1d/bli_l1d_oapi.c                       |   44 +-
 frame/1d/bli_l1d_oapi.h                       |   30 +-
 frame/1d/bli_l1d_tapi.c                       |  164 +--
 frame/1d/bli_l1d_tapi.h                       |   86 +-
 frame/1f/bli_l1f_check.c                      |   56 +-
 frame/1f/bli_l1f_check.h                      |   56 +-
 frame/1f/bli_l1f_ft.h                         |   94 +-
 frame/1f/bli_l1f_ker_prot.h                   |   10 +-
 frame/1f/bli_l1f_oapi.c                       |   66 +-
 frame/1f/bli_l1f_oapi.h                       |   56 +-
 frame/1f/bli_l1f_tapi.c                       |  156 +-
 frame/1f/bli_l1f_tapi.h                       |   94 +-
 frame/1m/bli_l1m_check.c                      |   34 +-
 frame/1m/bli_l1m_check.h                      |   34 +-
 frame/1m/bli_l1m_ft.h                         |   86 +-
 frame/1m/bli_l1m_ft_ker.h                     |    6 +-
 frame/1m/bli_l1m_ker_prot.h                   |    6 +-
 frame/1m/bli_l1m_oapi.c                       |   40 +-
 frame/1m/bli_l1m_oapi.h                       |   20 +-
 frame/1m/bli_l1m_oft_var.h                    |   22 +-
 frame/1m/bli_l1m_tapi.c                       |  176 +--
 frame/1m/bli_l1m_tapi.h                       |   86 +-
 frame/1m/packm/bli_packm_alloc.c              |   18 +-
 frame/1m/packm/bli_packm_alloc.h              |   18 +-
 frame/1m/packm/bli_packm_blk_var1.c           |   16 +-
 frame/1m/packm/bli_packm_blk_var1.h           |   16 +-
 frame/1m/packm/bli_packm_check.c              |   12 +-
 frame/1m/packm/bli_packm_check.h              |   12 +-
 frame/1m/packm/bli_packm_cntl.h               |   30 +-
 frame/1m/packm/bli_packm_init.c               |   12 +-
 frame/1m/packm/bli_packm_init.h               |   12 +-
 frame/1m/packm/bli_packm_int.c                |   12 +-
 frame/1m/packm/bli_packm_int.h                |   12 +-
 frame/1m/packm/bli_packm_part.c               |   32 +-
 frame/1m/packm/bli_packm_part.h               |   38 +-
 frame/1m/packm/bli_packm_struc_cxk.c          |   34 +-
 frame/1m/packm/bli_packm_struc_cxk.h          |   36 +-
 frame/1m/unpackm/bli_unpackm_blk_var1.c       |   84 +-
 frame/1m/unpackm/bli_unpackm_blk_var1.h       |   10 +-
 frame/1m/unpackm/bli_unpackm_check.c          |    6 +-
 frame/1m/unpackm/bli_unpackm_check.h          |    6 +-
 frame/1m/unpackm/bli_unpackm_int.c            |   22 +-
 frame/1m/unpackm/bli_unpackm_int.h            |   10 +-
 frame/2/bli_l2_check.c                        |   96 +-
 frame/2/bli_l2_check.h                        |   48 +-
 frame/2/bli_l2_ft.h                           |  106 +-
 frame/2/bli_l2_oapi.c                         |  192 +--
 frame/2/bli_l2_oapi.h                         |   30 +-
 frame/2/bli_l2_tapi.c                         |  174 +--
 frame/2/bli_l2_tapi.h                         |  106 +-
 .../gemv/{ => other}/bli_gemv_var_oapi.c.prev |    0
 frame/3/bli_l3_blocksize.c                    |  157 +--
 frame/3/bli_l3_blocksize.h                    |   42 +-
 frame/3/bli_l3_check.c                        |  208 +--
 frame/3/bli_l3_check.h                        |  124 +-
 frame/3/bli_l3_cntl.c                         |   18 +-
 frame/3/bli_l3_cntl.h                         |   18 +-
 frame/3/bli_l3_direct.c                       |   32 +-
 frame/3/bli_l3_direct.h                       |   14 +-
 frame/3/bli_l3_ft_ukr.h                       |   12 +-
 frame/3/bli_l3_ind_ukr.h                      |   12 +-
 frame/3/bli_l3_int.c                          |   18 +-
 frame/3/bli_l3_int.h                          |   18 +-
 frame/3/bli_l3_oapi.c                         |   38 +-
 frame/3/bli_l3_oapi.h                         |   38 +-
 frame/3/bli_l3_oapi_ex.c                      |  152 +-
 frame/3/bli_l3_oapi_ex.h                      |   54 +-
 frame/3/bli_l3_oft.h                          |   54 +-
 frame/3/bli_l3_oft_var.h                      |   14 +-
 frame/3/bli_l3_packab.c                       |   28 +-
 frame/3/bli_l3_packab.h                       |   28 +-
 frame/3/bli_l3_prune.c                        |  246 ++--
 frame/3/bli_l3_prune.h                        |   57 +-
 frame/3/bli_l3_schema.c                       |    8 +-
 frame/3/bli_l3_schema.h                       |    8 +-
 frame/3/bli_l3_sup.c                          |   28 +-
 frame/3/bli_l3_sup.h                          |   28 +-
 frame/3/bli_l3_sup_ft_ker.h                   |    4 +-
 frame/3/bli_l3_sup_int.c                      |   32 +-
 frame/3/bli_l3_sup_int.h                      |   32 +-
 frame/3/bli_l3_sup_ker_prot.h                 |    4 +-
 frame/3/bli_l3_sup_oft.h                      |   14 +-
 frame/3/bli_l3_sup_packm_a.c                  |   96 +-
 frame/3/bli_l3_sup_packm_a.h                  |   88 +-
 frame/3/bli_l3_sup_packm_b.c                  |   96 +-
 frame/3/bli_l3_sup_packm_b.h                  |   88 +-
 frame/3/bli_l3_sup_packm_var.c                |  116 +-
 frame/3/bli_l3_sup_packm_var.h                |   42 +-
 frame/3/bli_l3_sup_ref.c                      |   28 +-
 frame/3/bli_l3_sup_ref.h                      |   28 +-
 frame/3/bli_l3_sup_var12.c                    |  166 +--
 frame/3/bli_l3_sup_var1n2m.c                  |  432 +++---
 frame/3/bli_l3_sup_vars.h                     |   92 +-
 frame/3/bli_l3_tapi.c                         |  176 +--
 frame/3/bli_l3_tapi.h                         |  156 +-
 frame/3/bli_l3_tapi_ex.c                      |  294 ++--
 frame/3/bli_l3_tapi_ex.h                      |  188 +--
 frame/3/bli_l3_ukr_prot.h                     |   12 +-
 frame/3/bli_l3_ukr_tapi.c                     |   12 +-
 frame/3/gemm/bli_gemm_blk_var1.c              |   35 +-
 frame/3/gemm/bli_gemm_blk_var2.c              |   35 +-
 frame/3/gemm/bli_gemm_blk_var3.c              |   38 +-
 frame/3/gemm/bli_gemm_front.c                 |   22 +-
 frame/3/gemm/bli_gemm_front.h                 |   30 +-
 frame/3/gemm/bli_gemm_ker_var2.c              |  110 +-
 frame/3/gemm/bli_gemm_md.c                    |  282 ++--
 frame/3/gemm/bli_gemm_md.h                    |   88 +-
 frame/3/gemm/bli_gemm_md_c2r_ref.c            |    4 +-
 frame/3/gemm/bli_gemm_var.h                   |   14 +-
 frame/3/gemm/ind/bli_gemm_ind_opt.h           |   20 +-
 frame/3/gemmt/bli_gemmt_front.c               |   16 +-
 frame/3/gemmt/bli_gemmt_front.h               |   16 +-
 frame/3/gemmt/bli_gemmt_l_ker_var2.c          |  105 +-
 frame/3/gemmt/bli_gemmt_u_ker_var2.c          |  105 +-
 frame/3/gemmt/bli_gemmt_var.h                 |   14 +-
 frame/3/gemmt/bli_gemmt_x_ker_var2.c          |   14 +-
 frame/3/hemm/bli_hemm_front.c                 |   18 +-
 frame/3/hemm/bli_hemm_front.h                 |   18 +-
 frame/3/symm/bli_symm_front.c                 |   18 +-
 frame/3/symm/bli_symm_front.h                 |   18 +-
 frame/3/trmm/bli_trmm_front.c                 |   14 +-
 frame/3/trmm/bli_trmm_front.h                 |   14 +-
 frame/3/trmm/bli_trmm_ll_ker_var2.c           |   97 +-
 frame/3/trmm/bli_trmm_lu_ker_var2.c           |   97 +-
 frame/3/trmm/bli_trmm_rl_ker_var2.c           |   97 +-
 frame/3/trmm/bli_trmm_ru_ker_var2.c           |   97 +-
 frame/3/trmm/bli_trmm_var.h                   |   14 +-
 frame/3/trmm/bli_trmm_xx_ker_var2.c           |   14 +-
 frame/3/trmm3/bli_trmm3_front.c               |   18 +-
 frame/3/trmm3/bli_trmm3_front.h               |   18 +-
 frame/3/trsm/bli_trsm_blk_var1.c              |   44 +-
 frame/3/trsm/bli_trsm_blk_var2.c              |   35 +-
 frame/3/trsm/bli_trsm_blk_var3.c              |   41 +-
 frame/3/trsm/bli_trsm_front.c                 |   14 +-
 frame/3/trsm/bli_trsm_front.h                 |   14 +-
 frame/3/trsm/bli_trsm_ll_ker_var2.c           |   93 +-
 frame/3/trsm/bli_trsm_lu_ker_var2.c           |   93 +-
 frame/3/trsm/bli_trsm_rl_ker_var2.c           |   93 +-
 frame/3/trsm/bli_trsm_ru_ker_var2.c           |   93 +-
 frame/3/trsm/bli_trsm_var.h                   |   14 +-
 frame/3/trsm/bli_trsm_xx_ker_var2.c           |   14 +-
 frame/base/bli_apool.c                        |   70 +-
 frame/base/bli_apool.h                        |   34 +-
 frame/base/bli_arch.c                         |   12 +-
 frame/base/bli_arch.h                         |    4 +-
 frame/base/bli_array.c                        |   43 +-
 frame/base/bli_array.h                        |   28 +-
 frame/base/bli_auxinfo.h                      |   36 +-
 frame/base/bli_blksz.c                        |   48 +-
 frame/base/bli_blksz.h                        |   60 +-
 frame/base/bli_check.c                        |   84 +-
 frame/base/bli_check.h                        |   86 +-
 frame/base/bli_cntl.c                         |    4 +-
 frame/base/bli_cntl.h                         |   24 +-
 frame/base/bli_cntx.c                         |   20 +-
 frame/base/bli_cntx.h                         |  142 +-
 frame/base/bli_const.c                        |   10 +-
 frame/base/bli_env.c                          |    4 +-
 frame/base/bli_error.c                        |    6 +-
 frame/base/bli_error.h                        |    4 +-
 frame/base/bli_func.c                         |    6 +-
 frame/base/bli_func.h                         |   18 +-
 frame/base/bli_getopt.c                       |    8 +-
 frame/base/bli_getopt.h                       |   10 +-
 frame/base/bli_gks.c                          |   66 +-
 frame/base/bli_gks.h                          |   32 +-
 frame/base/bli_ind.c                          |   12 +-
 frame/base/bli_ind.h                          |   28 +-
 frame/base/bli_info.c                         |   43 +-
 frame/base/bli_info.h                         |   36 +-
 frame/base/bli_mbool.h                        |    2 +-
 frame/base/bli_mem.h                          |   16 +-
 frame/base/bli_memsys.c                       |    2 +-
 frame/base/bli_obj.c                          |   49 +-
 frame/base/bli_obj.h                          |   14 +-
 frame/base/bli_obj_scalar.c                   |   45 +-
 frame/base/bli_obj_scalar.h                   |   26 +-
 frame/base/bli_part.c                         |  153 +-
 frame/base/bli_part.h                         |   58 +-
 frame/base/bli_pba.c                          |  119 +-
 frame/base/bli_pba.h                          |   38 +-
 frame/base/bli_pool.c                         |   84 +-
 frame/base/bli_pool.h                         |   92 +-
 frame/base/bli_query.c                        |    6 +-
 frame/base/bli_query.h                        |    6 +-
 frame/base/bli_rntm.c                         |    6 +-
 frame/base/bli_rntm.h                         |   36 +-
 frame/base/bli_sba.c                          |   22 +-
 frame/base/bli_sba.h                          |   18 +-
 frame/base/bli_setgetijm.c                    |   68 +-
 frame/base/bli_setgetijm.h                    |   40 +-
 frame/base/bli_setgetijv.c                    |   52 +-
 frame/base/bli_setgetijv.h                    |   32 +-
 frame/base/bli_setri.c                        |   16 +-
 frame/base/bli_setri.h                        |   16 +-
 frame/base/cast/bli_castm.c                   |   89 +-
 frame/base/cast/bli_castm.h                   |   18 +-
 frame/base/cast/bli_castnzm.c                 |   89 +-
 frame/base/cast/bli_castnzm.h                 |   18 +-
 frame/base/cast/bli_castv.c                   |   53 +-
 frame/base/cast/bli_castv.h                   |   16 +-
 frame/base/check/bli_obj_check.c              |   47 +-
 frame/base/check/bli_obj_check.h              |   51 +-
 frame/base/check/bli_part_check.c             |   30 +-
 frame/base/check/bli_part_check.h             |   34 +-
 frame/base/proj/bli_projm.c                   |    8 +-
 frame/base/proj/bli_projm.h                   |    8 +-
 frame/base/proj/bli_projv.c                   |    8 +-
 frame/base/proj/bli_projv.h                   |    8 +-
 frame/compat/extra/bla_gemm3m.c               |    4 +-
 frame/include/bli_extern_defs.h               |   10 +-
 frame/include/bli_oapi_ba.h                   |    4 +-
 frame/include/bli_oapi_ex.h                   |    2 +-
 frame/include/bli_obj_macro_defs.h            |  272 ++--
 frame/include/bli_tapi_ba.h                   |    4 +-
 frame/include/bli_tapi_ex.h                   |    2 +-
 frame/include/bli_type_defs.h                 |   44 +-
 frame/thread/bli_l3_decor.h                   |   38 +-
 frame/thread/bli_l3_decor_openmp.c            |   51 +-
 frame/thread/bli_l3_decor_openmp.h            |    2 +-
 frame/thread/bli_l3_decor_pthreads.c          |  118 +-
 frame/thread/bli_l3_decor_single.c            |   44 +-
 frame/thread/bli_l3_sup_decor.h               |   34 +-
 frame/thread/bli_l3_sup_decor_openmp.c        |   26 +-
 frame/thread/bli_l3_sup_decor_pthreads.c      |   74 +-
 frame/thread/bli_l3_sup_decor_single.c        |   27 +-
 frame/thread/bli_thread.c                     |  360 ++---
 frame/thread/bli_thread.h                     |  104 +-
 frame/thread/bli_thrinfo.c                    |   10 +-
 frame/thread/bli_thrinfo.h                    |   28 +-
 frame/thread/bli_thrinfo_sup.c                |   38 +-
 frame/thread/bli_thrinfo_sup.h                |   22 +-
 frame/util/bli_util_check.c                   |   94 +-
 frame/util/bli_util_check.h                   |   80 +-
 frame/util/bli_util_ft.h                      |   96 +-
 frame/util/bli_util_oapi.c                    |  260 ++--
 frame/util/bli_util_oapi.h                    |   75 +-
 frame/util/bli_util_tapi.c                    |  152 +-
 frame/util/bli_util_tapi.h                    |  112 +-
 frame/util/bli_util_unb_var1.c                |  150 +-
 frame/util/bli_util_unb_var1.h                |   34 +-
 .../armsve/1m/bli_dpackm_armsve256_int_8xk.c  |    2 +-
 .../armsve/1m/bli_dpackm_armsve512_asm_10xk.c |    2 +-
 .../armsve/1m/bli_dpackm_armsve512_asm_16xk.c |    2 +-
 .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c  |    4 +-
 .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c  |    4 +-
 .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c  |    4 +-
 .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c  |    4 +-
 kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c   |   24 +-
 kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c   |    8 +-
 kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c |    6 +-
 kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c |    6 +-
 .../armv8a/1m/bli_packm_armv8a_int_s12xk.c    |    6 +-
 kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c |    6 +-
 kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c   |   16 +-
 kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c |    8 +-
 .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c   |    4 +-
 .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c   |    4 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c   |    6 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c   |    6 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c   |    4 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c   |    4 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c   |    4 +-
 .../sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c |    4 +-
 .../sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c |    4 +-
 .../sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c |   12 +-
 .../sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c |    4 +-
 .../d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c   |    4 +-
 .../d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c   |    6 +-
 kernels/bgq/1/bli_axpyv_bgq_int.c             |    8 +-
 kernels/bgq/1/bli_dotv_bgq_int.c              |    4 +-
 kernels/bgq/1f/bli_axpyf_bgq_int.c            |    6 +-
 kernels/bgq/3/bli_gemm_bgq_int_8x8.c          |    8 +-
 .../3/bli_gemm_bulldozer_asm_d4x6_fma4.c      |   16 +-
 .../haswell/1m/bli_packm_haswell_asm_c3xk.c   |    2 +-
 .../haswell/1m/bli_packm_haswell_asm_c8xk.c   |    2 +-
 .../haswell/1m/bli_packm_haswell_asm_d6xk.c   |   26 +-
 .../haswell/1m/bli_packm_haswell_asm_d8xk.c   |   26 +-
 .../haswell/1m/bli_packm_haswell_asm_s16xk.c  |   28 +-
 .../haswell/1m/bli_packm_haswell_asm_s6xk.c   |   28 +-
 .../haswell/1m/bli_packm_haswell_asm_z3xk.c   |    2 +-
 .../haswell/1m/bli_packm_haswell_asm_z4xk.c   |    2 +-
 kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c |   16 +-
 kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c |   16 +-
 .../3/bli_gemmtrsm_l_haswell_asm_d6x8.c       |    8 +-
 .../3/bli_gemmtrsm_u_haswell_asm_d6x8.c       |    8 +-
 .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c  |  476 +++----
 .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c  |  612 ++++----
 .../3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c |  790 +++++------
 .../3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c |  614 ++++----
 .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c  |  828 +++++------
 .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c  | 1050 +++++++-------
 .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c | 1252 ++++++++---------
 .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c | 1048 +++++++-------
 .../sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c |   40 +-
 .../d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c    |   16 +-
 .../d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c    |  600 ++++----
 .../d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c    |   12 +-
 .../d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c    |  460 +++---
 .../d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c    | 1044 +++++++-------
 .../d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c    | 1036 +++++++-------
 .../d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c    | 1018 +++++++-------
 .../d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c    | 1018 +++++++-------
 .../s6x16/bli_gemmsup_r_haswell_ref_sMx1.c    |   44 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c   |  608 ++++----
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c  |  462 +++---
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c  |  462 +++---
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c   |  608 ++++----
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c   |  462 +++---
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c   |  462 +++---
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c  | 1008 ++++++-------
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c  | 1010 ++++++-------
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c   | 1082 +++++++-------
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c   | 1082 +++++++-------
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c   | 1202 ++++++++--------
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c   | 1008 ++++++-------
 kernels/knc/3/bli_dgemm_knc_asm_30x8.c        |    4 +-
 kernels/knc/3/bli_sgemm_knc_asm_30x16.c       |    4 +-
 kernels/knl/1m/bli_dpackm_knl_asm_24x8.c      |    4 +-
 kernels/knl/1m/bli_spackm_knl_asm_24x16.c     |    4 +-
 kernels/knl/3/bli_dgemm_knl_asm_24x8.c        |    4 +-
 kernels/knl/3/bli_sgemm_knl_asm_24x16.c       |    4 +-
 kernels/penryn/1/bli_axpyv_penryn_int.c       |    2 +-
 kernels/penryn/1/bli_dotv_penryn_int.c        |    2 +-
 kernels/penryn/1f/bli_axpy2v_penryn_int.c     |    2 +-
 kernels/penryn/1f/bli_axpyf_penryn_int.c      |    2 +-
 kernels/penryn/1f/bli_dotaxpyv_penryn_int.c   |    2 +-
 kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c  |    2 +-
 kernels/penryn/1f/bli_dotxf_penryn_int.c      |    2 +-
 kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c   |    8 +-
 .../penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c |  214 +--
 .../penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c |  206 +--
 kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c |   76 +-
 kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c |   76 +-
 .../3/bli_gemm_piledriver_asm_d8x3.c          |   16 +-
 kernels/power10/3/bli_dgemm_power10_mma.c     |    4 +-
 kernels/power10/3/bli_i16gemm_power10_mma.c   |    4 +-
 kernels/power10/3/bli_i16sgemm_power10_mma.c  |    4 +-
 kernels/power10/3/bli_i4gemm_power10_mma.c    |    4 +-
 kernels/power10/3/bli_i8gemm_power10_mma.c    |    4 +-
 kernels/power10/3/bli_sbgemm_power10_mma.c    |    4 +-
 kernels/power10/3/bli_sgemm_power10_mma.c     |    4 +-
 kernels/power10/3/bli_shgemm_power10_mma.c    |    4 +-
 kernels/power7/3/bli_gemm_power7_int_8x4.c    |   16 +-
 .../power7/3/test/bli_gemm_power7_int_8x4.h   |   16 +-
 kernels/power9/3/bli_gemm_power9_asm_d12x6.c  |    4 +-
 .../3/bli_gemm_sandybridge_asm_d8x4.c         |   16 +-
 .../3/bli_gemm_sandybridge_int_d8x4.c         |   16 +-
 kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c    |    2 +-
 kernels/skx/3/bli_dgemm_skx_asm_16x14.c       |    2 +-
 kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c    |    2 +-
 kernels/zen/1/bli_amaxv_zen_int.c             |   16 +-
 kernels/zen/1/bli_axpyv_zen_int.c             |    4 +-
 kernels/zen/1/bli_axpyv_zen_int10.c           |    4 +-
 kernels/zen/1/bli_copyv_zen_int.c             |  660 ++++-----
 kernels/zen/1/bli_dotv_zen_int.c              |    4 +-
 kernels/zen/1/bli_dotv_zen_int10.c            |    4 +-
 kernels/zen/1/bli_dotxv_zen_int.c             |    6 +-
 kernels/zen/1/bli_scalv_zen_int.c             |    4 +-
 kernels/zen/1/bli_scalv_zen_int10.c           |    8 +-
 kernels/zen/1/bli_setv_zen_int.c              |    4 +-
 kernels/zen/1/bli_swapv_zen_int8.c            |    4 +-
 kernels/zen/1f/bli_axpyf_zen_int_4.c          |    2 +-
 kernels/zen/1f/bli_axpyf_zen_int_5.c          |    6 +-
 kernels/zen/1f/bli_axpyf_zen_int_8.c          |    4 +-
 kernels/zen/1f/bli_dotxf_zen_int_8.c          |    4 +-
 .../sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c |   45 +-
 .../sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c |   34 +-
 .../sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c  |   56 +-
 .../sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c |   36 +-
 .../sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c |   30 +-
 kernels/zen2/1f/old/bli_axpyf_zen_int_5.c     |  599 ++++++++
 ref_kernels/1/bli_addv_ref.c                  |    2 +-
 ref_kernels/1/bli_amaxv_ref.c                 |    2 +-
 ref_kernels/1/bli_axpbyv_ref.c                |    2 +-
 ref_kernels/1/bli_axpyv_ref.c                 |    4 +-
 ref_kernels/1/bli_copyv_ref.c                 |    2 +-
 ref_kernels/1/bli_dotv_ref.c                  |    2 +-
 ref_kernels/1/bli_dotxv_ref.c                 |    2 +-
 ref_kernels/1/bli_invertv_ref.c               |    2 +-
 ref_kernels/1/bli_scal2v_ref.c                |    2 +-
 ref_kernels/1/bli_scalv_ref.c                 |    2 +-
 ref_kernels/1/bli_setv_ref.c                  |    2 +-
 ref_kernels/1/bli_subv_ref.c                  |    2 +-
 ref_kernels/1/bli_swapv_ref.c                 |    2 +-
 ref_kernels/1/bli_xpbyv_ref.c                 |    2 +-
 ref_kernels/1f/bli_axpy2v_ref.c               |    2 +-
 ref_kernels/1f/bli_axpyf_ref.c                |    2 +-
 ref_kernels/1f/bli_dotaxpyv_ref.c             |    2 +-
 ref_kernels/1f/bli_dotxaxpyf_ref.c            |    2 +-
 ref_kernels/1f/bli_dotxf_ref.c                |    2 +-
 ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c   |    2 +-
 ref_kernels/1m/bli_packm_cxc_diag_ref.c       |    2 +-
 ref_kernels/1m/bli_packm_cxk_1er_ref.c        |   34 +-
 ref_kernels/1m/bli_packm_cxk_ref.c            |    2 +-
 ref_kernels/1m/bli_unpackm_cxk_ref.c          |    2 +-
 ref_kernels/3/bli_gemm_ref.c                  |   66 +-
 ref_kernels/3/bli_gemmsup_ref.c               |   60 +-
 ref_kernels/3/bli_gemmtrsm_ref.c              |   36 +-
 ref_kernels/3/bli_trsm_ref.c                  |   24 +-
 ref_kernels/bli_cntx_ref.c                    |   12 +-
 ref_kernels/ind/bli_gemm1m_ref.c              |    4 +-
 ref_kernels/ind/bli_gemmtrsm1m_ref.c          |    4 +-
 ref_kernels/ind/bli_trsm1m_ref.c              |   22 +-
 testsuite/src/test_axpy2v.c                   |    4 +-
 testsuite/src/test_axpyf.c                    |    4 +-
 testsuite/src/test_dotaxpyv.c                 |    6 +-
 testsuite/src/test_dotxaxpyf.c                |    4 +-
 testsuite/src/test_dotxf.c                    |    4 +-
 testsuite/src/test_gemm_ukr.c                 |    2 +-
 testsuite/src/test_gemmtrsm_ukr.c             |    2 +-
 testsuite/src/test_libblis.c                  |   10 +-
 testsuite/src/test_trsm_ukr.c                 |    2 +-
 446 files changed, 19651 insertions(+), 19345 deletions(-)
 rename frame/2/gemv/{ => other}/bli_gemv_var_oapi.c.prev (100%)
 create mode 100644 kernels/zen2/1f/old/bli_axpyf_zen_int_5.c

diff --git a/build/detect/config/config_detect.c b/build/detect/config/config_detect.c
index 5e29defe1..5f1ea0f42 100644
--- a/build/detect/config/config_detect.c
+++ b/build/detect/config/config_detect.c
@@ -69,8 +69,8 @@
 
 int main( int argc, char** argv )
 {
-	arch_t id = bli_cpuid_query_id();
-	char*  s  = bli_arch_string( id );
+	arch_t id     = bli_cpuid_query_id();
+	const char* s = bli_arch_string( id );
 
 	printf( "%s\n", s );
 
diff --git a/config/template/kernels/1/bli_axpyv_template_noopt_var1.c b/config/template/kernels/1/bli_axpyv_template_noopt_var1.c
index d1918466f..8796bab26 100644
--- a/config/template/kernels/1/bli_axpyv_template_noopt_var1.c
+++ b/config/template/kernels/1/bli_axpyv_template_noopt_var1.c
@@ -42,7 +42,7 @@ void bli_zaxpyv_template_noopt
        dcomplex* restrict alpha,
        dcomplex* restrict x, inc_t incx,
        dcomplex* restrict y, inc_t incy,
-       cntx_t*   restrict cntx
+       cntx_t*            cntx
      )
 {
 /*
diff --git a/config/template/kernels/1/bli_dotv_template_noopt_var1.c b/config/template/kernels/1/bli_dotv_template_noopt_var1.c
index 3761d2e76..90f93b817 100644
--- a/config/template/kernels/1/bli_dotv_template_noopt_var1.c
+++ b/config/template/kernels/1/bli_dotv_template_noopt_var1.c
@@ -43,7 +43,7 @@ void bli_zdotv_template_noopt
        dcomplex* restrict x, inc_t incx,
        dcomplex* restrict y, inc_t incy,
        dcomplex* restrict rho,
-       cntx_t*   restrict cntx
+       cntx_t*            cntx
      )
 {
 /*
@@ -187,7 +187,7 @@ void bli_zdotv_template_noopt
 	// Initialize accumulator to zero.
 	bli_zset0s( dotxy );
 
-	
+
 	conjx_use = conjx;
 
 	// If y must be conjugated, we compute the result indirectly by first
diff --git a/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c b/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c
index 7080abce0..5a12bf761 100644
--- a/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c
+++ b/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c
@@ -45,7 +45,7 @@ void bli_zaxpy2v_template_noopt
        dcomplex* restrict x, inc_t incx,
        dcomplex* restrict y, inc_t incy,
        dcomplex* restrict z, inc_t incz,
-       cntx_t*   restrict cntx
+       cntx_t*            cntx
      )
 {
 /*
diff --git a/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c b/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c
index a0afedfca..f7b492286 100644
--- a/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c
+++ b/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c
@@ -45,7 +45,7 @@ void bli_zaxpyf_template_noopt
        dcomplex* restrict a, inc_t inca, inc_t lda,
        dcomplex* restrict x, inc_t incx,
        dcomplex* restrict y, inc_t incy,
-       cntx_t*   restrict cntx
+       cntx_t*            cntx
      )
 {
 /*
diff --git a/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c b/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c
index 275c39998..31a3097c0 100644
--- a/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c
+++ b/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c
@@ -46,7 +46,7 @@ void bli_zdotaxpyv_template_noopt
        dcomplex* restrict y, inc_t incy,
        dcomplex* restrict rho,
        dcomplex* restrict z, inc_t incz,
-       cntx_t*   restrict cntx
+       cntx_t*            cntx
      )
 {
 /*
diff --git a/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c b/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c
index 6754d86ce..aeb502f35 100644
--- a/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c
+++ b/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c
@@ -50,7 +50,7 @@ void bli_zdotxaxpyf_template_noopt
        dcomplex* restrict beta,
        dcomplex* restrict y, inc_t incy,
        dcomplex* restrict z, inc_t incz,
-       cntx_t*   restrict cntx
+       cntx_t*            cntx
      )
 
 {
diff --git a/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c b/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c
index 430fb277d..650303afe 100644
--- a/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c
+++ b/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c
@@ -46,7 +46,7 @@ void bli_zdotxf_template_noopt
        dcomplex* restrict x, inc_t incx,
        dcomplex* restrict beta,
        dcomplex* restrict y, inc_t incy,
-       cntx_t*   restrict cntx
+       cntx_t*            cntx
      )
 {
 /*
@@ -239,7 +239,7 @@ void bli_zdotxf_template_noopt
 	if ( bli_is_conj( conjx ) )
 		bli_toggle_conj( &conjat_use );
 
-	
+
 	// Iterate over columns of A and rows of x to compute:
 	//   Atx = conjat_use( A^T ) * x;
 	if ( bli_is_noconj( conjat_use ) )
diff --git a/config/template/kernels/3/bli_gemm_template_noopt_mxn.c b/config/template/kernels/3/bli_gemm_template_noopt_mxn.c
index 06f25a0e9..190519fa0 100644
--- a/config/template/kernels/3/bli_gemm_template_noopt_mxn.c
+++ b/config/template/kernels/3/bli_gemm_template_noopt_mxn.c
@@ -45,8 +45,8 @@ void bli_zgemm_template_noopt
        dcomplex*  restrict b1,
        dcomplex*  restrict beta,
        dcomplex*  restrict c11, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 /*
diff --git a/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c b/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c
index 87c21f7ed..d44fa4c1e 100644
--- a/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c
+++ b/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c
@@ -44,8 +44,8 @@ void bli_zgemmtrsm_l_template_noopt
        dcomplex*  restrict b01,
        dcomplex*  restrict b11,
        dcomplex*  restrict c11, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 /*
diff --git a/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c b/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c
index 0b4544ae1..0a3d59622 100644
--- a/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c
+++ b/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c
@@ -44,8 +44,8 @@ void bli_zgemmtrsm_u_template_noopt
        dcomplex*  restrict b01,
        dcomplex*  restrict b11,
        dcomplex*  restrict c11, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 /*
diff --git a/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c b/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c
index ce15798b0..4e6634dea 100644
--- a/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c
+++ b/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c
@@ -40,8 +40,8 @@ void bli_ztrsm_l_template_noopt
        dcomplex*  restrict a11,
        dcomplex*  restrict b11,
        dcomplex*  restrict c11, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 /*
diff --git a/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c b/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c
index 661167c9c..42982459a 100644
--- a/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c
+++ b/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c
@@ -40,8 +40,8 @@ void bli_ztrsm_u_template_noopt
        dcomplex*  restrict a11,
        dcomplex*  restrict b11,
        dcomplex*  restrict c11, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 /*
diff --git a/frame/0/bli_l0_check.c b/frame/0/bli_l0_check.c
index 966f0c6aa..02867a22d 100644
--- a/frame/0/bli_l0_check.c
+++ b/frame/0/bli_l0_check.c
@@ -43,8 +43,8 @@
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  chi, \
-       obj_t*  psi  \
+       const obj_t* chi, \
+       const obj_t* psi  \
      ) \
 { \
 	bli_l0_xxsc_check( chi, psi ); \
@@ -63,7 +63,7 @@ GENFRONT( subsc )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  chi  \
+       const obj_t* chi  \
      ) \
 { \
 	bli_l0_xsc_check( chi ); \
@@ -77,8 +77,8 @@ GENFRONT( invertsc )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  chi, \
-       obj_t*  norm  \
+       const obj_t* chi, \
+       const obj_t* norm  \
      ) \
 { \
 	bli_l0_xx2sc_check( chi, norm ); \
@@ -91,9 +91,9 @@ GENFRONT( normfsc )
 
 void bli_getsc_check
      (
-       obj_t*  chi,
-       double* zeta_r,
-       double* zeta_i 
+       const obj_t*  chi,
+       const double* zeta_r,
+       const double* zeta_i
      )
 {
 	err_t e_val;
@@ -117,9 +117,9 @@ void bli_getsc_check
 
 void bli_setsc_check
      (
-       double  zeta_r,
-       double  zeta_i,
-       obj_t*  chi 
+       double       zeta_r,
+       double       zeta_i,
+       const obj_t* chi
      )
 {
 	err_t e_val;
@@ -143,9 +143,9 @@ void bli_setsc_check
 
 void bli_unzipsc_check
      (
-       obj_t*  chi,
-       obj_t*  zeta_r,
-       obj_t*  zeta_i 
+       const obj_t* chi,
+       const obj_t* zeta_r,
+       const obj_t* zeta_i
      )
 {
 	err_t e_val;
@@ -199,9 +199,9 @@ void bli_unzipsc_check
 
 void bli_zipsc_check
      (
-       obj_t*  zeta_r,
-       obj_t*  zeta_i,
-       obj_t*  chi 
+       const obj_t* zeta_r,
+       const obj_t* zeta_i,
+       const obj_t* chi
      )
 {
 	err_t e_val;
@@ -254,7 +254,7 @@ void bli_zipsc_check
 
 void bli_l0_xsc_check
      (
-       obj_t*  chi
+       const obj_t* chi
      )
 {
 	err_t e_val;
@@ -280,8 +280,8 @@ void bli_l0_xsc_check
 
 void bli_l0_xxsc_check
      (
-       obj_t*  chi,
-       obj_t*  psi 
+       const obj_t* chi,
+       const obj_t* psi
      )
 {
 	err_t e_val;
@@ -316,8 +316,8 @@ void bli_l0_xxsc_check
 
 void bli_l0_xx2sc_check
      (
-       obj_t*  chi,
-       obj_t*  absq 
+       const obj_t* chi,
+       const obj_t* absq
      )
 {
 	err_t e_val;
@@ -355,9 +355,9 @@ void bli_l0_xx2sc_check
 
 void bli_l0_xxbsc_check
      (
-       obj_t*  chi,
-       obj_t*  psi,
-       bool*   is_eq
+       const obj_t* chi,
+       const obj_t* psi,
+       const bool*  is_eq
      )
 {
 	err_t e_val;
diff --git a/frame/0/bli_l0_check.h b/frame/0/bli_l0_check.h
index f495866c6..1bbb4a756 100644
--- a/frame/0/bli_l0_check.h
+++ b/frame/0/bli_l0_check.h
@@ -42,8 +42,8 @@
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  chi, \
-       obj_t*  psi  \
+       const obj_t* chi, \
+       const obj_t* psi  \
      );
 
 GENTPROT( addsc )
@@ -59,7 +59,7 @@ GENTPROT( subsc )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  chi  \
+       const obj_t* chi  \
      );
 
 GENTPROT( invertsc )
@@ -70,8 +70,8 @@ GENTPROT( invertsc )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  chi, \
-       obj_t*  absq  \
+       const obj_t* chi, \
+       const obj_t* absq  \
      );
 
 GENTPROT( absqsc )
@@ -83,9 +83,9 @@ GENTPROT( normfsc )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  chi, \
-       double* zeta_r, \
-       double* zeta_i  \
+       const obj_t*  chi, \
+       const double* zeta_r, \
+       const double* zeta_i  \
      );
 
 GENTPROT( getsc )
@@ -96,9 +96,9 @@ GENTPROT( getsc )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       double  zeta_r, \
-       double  zeta_i, \
-       obj_t*  chi  \
+             double zeta_r, \
+             double zeta_i, \
+       const obj_t* chi  \
      );
 
 GENTPROT( setsc )
@@ -109,9 +109,9 @@ GENTPROT( setsc )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  chi, \
-       obj_t*  zeta_r, \
-       obj_t*  zeta_i  \
+       const obj_t* chi, \
+       const obj_t* zeta_r, \
+       const obj_t* zeta_i  \
      );
 
 GENTPROT( unzipsc )
@@ -122,9 +122,9 @@ GENTPROT( unzipsc )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  zeta_r, \
-       obj_t*  zeta_i, \
-       obj_t*  chi  \
+       const obj_t* zeta_r, \
+       const obj_t* zeta_i, \
+       const obj_t* chi  \
      );
 
 GENTPROT( zipsc )
@@ -133,24 +133,24 @@ GENTPROT( zipsc )
 
 void bli_l0_xsc_check
      (
-       obj_t*  chi
+       const obj_t* chi
      );
 
 void bli_l0_xxsc_check
      (
-       obj_t*  chi,
-       obj_t*  psi 
+       const obj_t* chi,
+       const obj_t* psi
      );
 
 void bli_l0_xx2sc_check
      (
-       obj_t*  chi,
-       obj_t*  norm 
+       const obj_t* chi,
+       const obj_t* norm
      );
 
 void bli_l0_xxbsc_check
      (
-       obj_t*  chi,
-       obj_t*  psi,
-       bool*   is_eq
+       const obj_t* chi,
+       const obj_t* psi,
+       const bool*  is_eq
      );
diff --git a/frame/0/bli_l0_ft.h b/frame/0/bli_l0_ft.h
index b90e35eb5..01d90cc3b 100644
--- a/frame/0/bli_l0_ft.h
+++ b/frame/0/bli_l0_ft.h
@@ -44,9 +44,9 @@
 \
 typedef void (*PASTECH2(ch,opname,tsuf)) \
      ( \
-       conj_t  conjchi, \
-       ctype*  chi, \
-       ctype*  psi  \
+             conj_t conjchi, \
+       const ctype* chi, \
+             ctype* psi  \
      );
 
 INSERT_GENTDEF( addsc )
@@ -73,9 +73,9 @@ INSERT_GENTDEF( invertsc )
 \
 typedef void (*PASTECH2(ch,opname,tsuf)) \
      ( \
-       conj_t  conjchi, \
-       ctype*  chi, \
-       ctype*  psi  \
+             conj_t conjchi, \
+       const ctype* chi, \
+             ctype* psi  \
      );
 
 INSERT_GENTDEF( mulsc )
@@ -87,8 +87,8 @@ INSERT_GENTDEF( mulsc )
 \
 typedef void (*PASTECH2(ch,opname,tsuf)) \
      ( \
-       ctype*   chi, \
-       ctype_r* absq  \
+       const ctype*   chi, \
+             ctype_r* absq  \
      );
 
 INSERT_GENTDEFR( absqsc )
@@ -100,8 +100,8 @@ INSERT_GENTDEFR( absqsc )
 \
 typedef void (*PASTECH2(ch,opname,tsuf)) \
      ( \
-       ctype*   chi, \
-       ctype_r* norm  \
+       const ctype*   chi, \
+             ctype_r* norm  \
      );
 
 INSERT_GENTDEFR( normfsc )
@@ -113,8 +113,8 @@ INSERT_GENTDEFR( normfsc )
 \
 typedef void (*PASTECH2(ch,opname,tsuf)) \
      ( \
-       ctype*  chi, \
-       ctype*  psi  \
+       const ctype* chi, \
+             ctype* psi  \
      );
 
 INSERT_GENTDEF( sqrtsc )
@@ -126,9 +126,9 @@ INSERT_GENTDEF( sqrtsc )
 \
 typedef void (*PASTECH2(ch,opname,tsuf)) \
      ( \
-       ctype*  chi, \
-       double* zeta_r, \
-       double* zeta_i  \
+       const ctype*  chi, \
+             double* zeta_r, \
+             double* zeta_i  \
      );
 
 INSERT_GENTDEF( getsc )
@@ -154,9 +154,9 @@ INSERT_GENTDEF( setsc )
 \
 typedef void (*PASTECH2(ch,opname,tsuf)) \
      ( \
-       ctype*   chi, \
-       ctype_r* zeta_r, \
-       ctype_r* zeta_i  \
+       const ctype*   chi, \
+             ctype_r* zeta_r, \
+             ctype_r* zeta_i  \
      );
 
 INSERT_GENTDEFR( unzipsc )
@@ -168,9 +168,9 @@ INSERT_GENTDEFR( unzipsc )
 \
 typedef void (*PASTECH2(ch,opname,tsuf)) \
      ( \
-       ctype_r* zeta_r, \
-       ctype_r* zeta_i, \
-       ctype*   chi  \
+       const ctype_r* zeta_r, \
+       const ctype_r* zeta_i, \
+             ctype*   chi  \
      );
 
 INSERT_GENTDEFR( zipsc )
diff --git a/frame/0/bli_l0_oapi.c b/frame/0/bli_l0_oapi.c
index ac62530db..0bfdbe3b3 100644
--- a/frame/0/bli_l0_oapi.c
+++ b/frame/0/bli_l0_oapi.c
@@ -43,25 +43,25 @@
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  chi, \
-       obj_t*  absq  \
+       const obj_t* chi, \
+       const obj_t* absq  \
      ) \
 { \
 	bli_init_once(); \
 \
-	num_t     dt_chi; \
-	num_t     dt_absq_c  = bli_obj_dt_proj_to_complex( absq ); \
+	num_t       dt_chi; \
+	num_t       dt_absq_c  = bli_obj_dt_proj_to_complex( absq ); \
 \
-	void*     buf_chi; \
-	void*     buf_absq   = bli_obj_buffer_at_off( absq ); \
+	const void* buf_chi; \
+	void*       buf_absq   = bli_obj_buffer_at_off( absq ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( chi, absq ); \
+		PASTEMAC(opname,_check)( chi, absq ); \
 \
 	/* If chi is a scalar constant, use dt_absq_c to extract the address of the
 	   corresponding constant value; otherwise, use the datatype encoded
 	   within the chi object and extract the buffer at the chi offset. */ \
-	bli_obj_scalar_set_dt_buffer( chi, dt_absq_c, &dt_chi, &buf_chi ); \
+	bli_obj_scalar_set_dt_buffer( chi, dt_absq_c, &dt_chi, ( void** )&buf_chi ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -83,8 +83,8 @@ GENFRONT( normfsc )
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  chi, \
-       obj_t*  psi  \
+       const obj_t* chi, \
+       const obj_t* psi  \
      ) \
 { \
 	bli_init_once(); \
@@ -97,7 +97,7 @@ void PASTEMAC0(opname) \
 	void*     buf_psi   = bli_obj_buffer_at_off( psi ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( chi, psi ); \
+		PASTEMAC(opname,_check)( chi, psi ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -122,7 +122,7 @@ GENFRONT( subsc )
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  chi  \
+       const obj_t* chi  \
      ) \
 { \
 	bli_init_once(); \
@@ -134,7 +134,7 @@ void PASTEMAC0(opname) \
 	void*     buf_chi   = bli_obj_buffer_for_1x1( dt, chi ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( chi ); \
+		PASTEMAC(opname,_check)( chi ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -155,8 +155,8 @@ GENFRONT( invertsc )
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  chi, \
-       obj_t*  psi  \
+       const obj_t* chi, \
+       const obj_t* psi  \
      ) \
 { \
 	bli_init_once(); \
@@ -167,7 +167,7 @@ void PASTEMAC0(opname) \
 	void*     buf_psi   = bli_obj_buffer_at_off( psi ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( chi, psi ); \
+		PASTEMAC(opname,_check)( chi, psi ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -188,9 +188,9 @@ GENFRONT( sqrtsc )
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  chi, \
-       double* zeta_r, \
-       double* zeta_i  \
+       const obj_t*  chi, \
+             double* zeta_r, \
+             double* zeta_i  \
      ) \
 { \
 	bli_init_once(); \
@@ -205,7 +205,7 @@ void PASTEMAC0(opname) \
 	void*     buf_chi   = bli_obj_buffer_for_1x1( dt_def, chi ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \
+		PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \
 \
 	/* The _check() routine prevents integer types, so we know that chi
 	   is either a constant or an actual floating-point type. */ \
@@ -232,9 +232,9 @@ GENFRONT( getsc )
 \
 void PASTEMAC0(opname) \
      ( \
-       double  zeta_r, \
-       double  zeta_i, \
-       obj_t*  chi  \
+             double zeta_r, \
+             double zeta_i, \
+       const obj_t* chi  \
      ) \
 { \
 	bli_init_once(); \
@@ -244,7 +244,7 @@ void PASTEMAC0(opname) \
 	void*     buf_chi   = bli_obj_buffer_at_off( chi ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( zeta_r, zeta_i, chi ); \
+		PASTEMAC(opname,_check)( zeta_r, zeta_i, chi ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -266,9 +266,9 @@ GENFRONT( setsc )
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  chi, \
-       obj_t*  zeta_r, \
-       obj_t*  zeta_i  \
+       const obj_t* chi, \
+       const obj_t* zeta_r, \
+       const obj_t* zeta_i  \
      ) \
 { \
 	bli_init_once(); \
@@ -282,7 +282,7 @@ void PASTEMAC0(opname) \
 	void*     buf_zeta_i  = bli_obj_buffer_at_off( zeta_i ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \
+		PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \
 \
 	/* If chi is a scalar constant, use dt_zeta_c to extract the address of the
 	   corresponding constant value; otherwise, use the datatype encoded
@@ -309,9 +309,9 @@ GENFRONT( unzipsc )
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  zeta_r, \
-       obj_t*  zeta_i, \
-       obj_t*  chi  \
+       const obj_t* zeta_r, \
+       const obj_t* zeta_i, \
+       const obj_t* chi  \
      ) \
 { \
 	bli_init_once(); \
@@ -324,7 +324,7 @@ void PASTEMAC0(opname) \
 	void*     buf_chi     = bli_obj_buffer_at_off( chi ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \
+		PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
diff --git a/frame/0/bli_l0_oapi.h b/frame/0/bli_l0_oapi.h
index 702bb40ea..a34252cf7 100644
--- a/frame/0/bli_l0_oapi.h
+++ b/frame/0/bli_l0_oapi.h
@@ -42,8 +42,8 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
      ( \
-       obj_t*  chi, \
-       obj_t*  absq  \
+       const obj_t* chi, \
+       const obj_t* absq  \
      );
 
 GENPROT( absqsc )
@@ -55,8 +55,8 @@ GENPROT( normfsc )
 \
 BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
      ( \
-       obj_t*  chi, \
-       obj_t*  psi  \
+       const obj_t* chi, \
+       const obj_t* psi  \
      );
 
 GENPROT( addsc )
@@ -71,7 +71,7 @@ GENPROT( subsc )
 \
 BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
      ( \
-       obj_t*  chi  \
+       const obj_t* chi  \
      );
 
 GENPROT( invertsc )
@@ -82,9 +82,9 @@ GENPROT( invertsc )
 \
 BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
      ( \
-       obj_t*  chi, \
-       double* zeta_r, \
-       double* zeta_i  \
+       const obj_t*  chi, \
+             double* zeta_r, \
+             double* zeta_i  \
      );
 
 GENPROT( getsc )
@@ -95,9 +95,9 @@ GENPROT( getsc )
 \
 BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
      ( \
-       double  zeta_r, \
-       double  zeta_i, \
-       obj_t*  chi  \
+             double zeta_r, \
+             double zeta_i, \
+       const obj_t* chi  \
      );
 
 GENPROT( setsc )
@@ -108,9 +108,9 @@ GENPROT( setsc )
 \
 BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
      ( \
-       obj_t*  chi, \
-       obj_t*  zeta_r, \
-       obj_t*  zeta_i  \
+       const obj_t* chi, \
+       const obj_t* zeta_r, \
+       const obj_t* zeta_i  \
      );
 
 GENPROT( unzipsc )
@@ -121,9 +121,9 @@ GENPROT( unzipsc )
 \
 BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
      ( \
-       obj_t*  zeta_r, \
-       obj_t*  zeta_i, \
-       obj_t*  chi  \
+       const obj_t* zeta_r, \
+       const obj_t* zeta_i, \
+       const obj_t* chi  \
      );
 
 GENPROT( zipsc )
diff --git a/frame/0/bli_l0_tapi.c b/frame/0/bli_l0_tapi.c
index 620cad299..e0cdffcf3 100644
--- a/frame/0/bli_l0_tapi.c
+++ b/frame/0/bli_l0_tapi.c
@@ -43,9 +43,9 @@
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t  conjchi, \
-       ctype*  chi, \
-       ctype*  psi  \
+             conj_t conjchi, \
+       const ctype* chi, \
+             ctype* psi  \
      ) \
 { \
 	bli_init_once(); \
@@ -66,8 +66,8 @@ INSERT_GENTFUNC_BASIC( subsc, subs )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t  conjchi, \
-       ctype*  chi  \
+       conj_t conjchi, \
+       ctype* chi  \
      ) \
 { \
 	bli_init_once(); \
@@ -87,9 +87,9 @@ INSERT_GENTFUNC_BASIC( invertsc, inverts )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t  conjchi, \
-       ctype*  chi, \
-       ctype*  psi  \
+             conj_t conjchi, \
+       const ctype* chi, \
+             ctype* psi  \
      ) \
 { \
 	bli_init_once(); \
@@ -116,8 +116,8 @@ INSERT_GENTFUNC_BASIC( mulsc, scals )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       ctype*   chi, \
-       ctype_r* absq  \
+       const ctype*   chi, \
+             ctype_r* absq  \
      ) \
 { \
 	bli_init_once(); \
@@ -145,8 +145,8 @@ INSERT_GENTFUNCR_BASIC0( absqsc )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       ctype*   chi, \
-       ctype_r* norm  \
+       const ctype*   chi, \
+             ctype_r* norm  \
      ) \
 { \
 	bli_init_once(); \
@@ -163,8 +163,8 @@ INSERT_GENTFUNCR_BASIC0( normfsc )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       ctype*  chi, \
-       ctype*  psi  \
+       const ctype* chi, \
+             ctype* psi  \
      ) \
 { \
 	bli_init_once(); \
@@ -181,9 +181,9 @@ INSERT_GENTFUNC_BASIC0( sqrtsc )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       ctype*  chi, \
-       double* zeta_r, \
-       double* zeta_i  \
+       const ctype*  chi, \
+             double* zeta_r, \
+             double* zeta_i  \
      ) \
 { \
 	bli_init_once(); \
@@ -199,9 +199,9 @@ INSERT_GENTFUNC_BASIC0( getsc )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       double  zeta_r, \
-       double  zeta_i, \
-       ctype*  chi  \
+       double zeta_r, \
+       double zeta_i, \
+       ctype* chi  \
      ) \
 { \
 	bli_init_once(); \
@@ -217,9 +217,9 @@ INSERT_GENTFUNC_BASIC0( setsc )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       ctype*   chi, \
-       ctype_r* zeta_r, \
-       ctype_r* zeta_i  \
+       const ctype*   chi, \
+             ctype_r* zeta_r, \
+             ctype_r* zeta_i  \
      ) \
 { \
 	bli_init_once(); \
@@ -235,9 +235,9 @@ INSERT_GENTFUNCR_BASIC0( unzipsc )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       ctype_r* zeta_r, \
-       ctype_r* zeta_i, \
-       ctype*   chi  \
+       const ctype_r* zeta_r, \
+       const ctype_r* zeta_i, \
+             ctype*   chi  \
      ) \
 { \
 	bli_init_once(); \
@@ -251,9 +251,9 @@ INSERT_GENTFUNCR_BASIC0( zipsc )
 
 void bli_igetsc
      (
-       dim_t*  chi,
-       double* zeta_r,
-       double* zeta_i
+       const dim_t*  chi,
+             double* zeta_r,
+             double* zeta_i
      )
 {
 	bli_init_once();
@@ -263,9 +263,9 @@ void bli_igetsc
 
 void bli_isetsc
      (
-       double  zeta_r,
-       double  zeta_i,
-       dim_t*  chi
+       double zeta_r,
+       double zeta_i,
+       dim_t* chi
      )
 {
 	bli_init_once();
diff --git a/frame/0/bli_l0_tapi.h b/frame/0/bli_l0_tapi.h
index c2d600d66..b39303410 100644
--- a/frame/0/bli_l0_tapi.h
+++ b/frame/0/bli_l0_tapi.h
@@ -42,9 +42,9 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       conj_t  conjchi, \
-       ctype*  chi, \
-       ctype*  psi  \
+             conj_t conjchi, \
+       const ctype* chi, \
+             ctype* psi  \
      );
 
 INSERT_GENTPROT_BASIC0( addsc )
@@ -58,8 +58,8 @@ INSERT_GENTPROT_BASIC0( subsc )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       conj_t  conjchi, \
-       ctype*  chi  \
+       conj_t conjchi, \
+       ctype* chi  \
      );
 
 INSERT_GENTPROT_BASIC0( invertsc )
@@ -70,8 +70,8 @@ INSERT_GENTPROT_BASIC0( invertsc )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       ctype*   chi, \
-       ctype_r* absq  \
+       const ctype*   chi, \
+             ctype_r* absq  \
      );
 
 INSERT_GENTPROTR_BASIC0( absqsc )
@@ -83,8 +83,8 @@ INSERT_GENTPROTR_BASIC0( normfsc )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       ctype*  chi, \
-       ctype*  psi  \
+       const ctype* chi, \
+             ctype* psi  \
      );
 
 INSERT_GENTPROT_BASIC0( sqrtsc )
@@ -95,9 +95,9 @@ INSERT_GENTPROT_BASIC0( sqrtsc )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       ctype*  chi, \
-       double* zeta_r, \
-       double* zeta_i  \
+       const ctype*  chi, \
+             double* zeta_r, \
+             double* zeta_i  \
      );
 
 INSERT_GENTPROT_BASIC0( getsc )
@@ -108,9 +108,9 @@ INSERT_GENTPROT_BASIC0( getsc )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       double  zeta_r, \
-       double  zeta_i, \
-       ctype*  chi  \
+       double zeta_r, \
+       double zeta_i, \
+       ctype* chi  \
      );
 
 INSERT_GENTPROT_BASIC0( setsc )
@@ -121,9 +121,9 @@ INSERT_GENTPROT_BASIC0( setsc )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       ctype*   chi, \
-       ctype_r* zeta_r, \
-       ctype_r* zeta_i  \
+       const ctype*   chi, \
+             ctype_r* zeta_r, \
+             ctype_r* zeta_i  \
      );
 
 INSERT_GENTPROTR_BASIC0( unzipsc )
@@ -134,9 +134,9 @@ INSERT_GENTPROTR_BASIC0( unzipsc )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       ctype_r* zeta_r, \
-       ctype_r* zeta_i, \
-       ctype*   chi  \
+       const ctype_r* zeta_r, \
+       const ctype_r* zeta_i, \
+             ctype*   chi  \
      );
 
 INSERT_GENTPROTR_BASIC0( zipsc )
@@ -145,15 +145,15 @@ INSERT_GENTPROTR_BASIC0( zipsc )
 
 BLIS_EXPORT_BLIS void bli_igetsc
      (
-       dim_t*  chi,
-       double* zeta_r,
-       double* zeta_i
+       const dim_t*  chi,
+             double* zeta_r,
+             double* zeta_i
      );
 
 BLIS_EXPORT_BLIS void bli_isetsc
      (
-       double  zeta_r,
-       double  zeta_i,
-       dim_t*  chi
+       double zeta_r,
+       double zeta_i,
+       dim_t* chi
      );
 
diff --git a/frame/0/copysc/bli_copysc.c b/frame/0/copysc/bli_copysc.c
index 3001aa6c7..c2e01d07b 100644
--- a/frame/0/copysc/bli_copysc.c
+++ b/frame/0/copysc/bli_copysc.c
@@ -41,9 +41,9 @@
 
 typedef void (*FUNCPTR_T)
      (
-       conj_t conjchi,
-       void*  chi,
-       void*  psi
+             conj_t conjchi,
+       const void*  chi,
+             void*  psi
      );
 
 static FUNCPTR_T GENARRAY2_ALL(ftypes,copysc);
@@ -57,24 +57,24 @@ static FUNCPTR_T GENARRAY2_ALL(ftypes,copysc);
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  chi, \
-       obj_t*  psi  \
+       const obj_t* chi, \
+       const obj_t* psi  \
      ) \
 { \
 	bli_init_once(); \
 \
-	conj_t    conjchi   = bli_obj_conj_status( chi ); \
+	conj_t conjchi = bli_obj_conj_status( chi ); \
 \
-	num_t     dt_psi    = bli_obj_dt( psi ); \
-	void*     buf_psi   = bli_obj_buffer_at_off( psi ); \
+	num_t  dt_psi  = bli_obj_dt( psi ); \
+	void*  buf_psi = bli_obj_buffer_at_off( psi ); \
 \
-	num_t     dt_chi; \
-	void*     buf_chi; \
+	num_t  dt_chi; \
+	void*  buf_chi; \
 \
 	FUNCPTR_T f; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( chi, psi ); \
+		PASTEMAC(opname,_check)( chi, psi ); \
 \
 	/* If chi is a scalar constant, use dt_psi to extract the address of the
 	   corresponding constant value; otherwise, use the datatype encoded
@@ -105,15 +105,15 @@ GENFRONT( copysc )
 \
 void PASTEMAC2(chx,chy,varname) \
      ( \
-       conj_t conjchi, \
-       void*  chi, \
-       void*  psi \
+             conj_t conjchi, \
+       const void*  chi, \
+             void*  psi \
      ) \
 { \
 	bli_init_once(); \
 \
-	ctype_x* chi_cast = chi; \
-	ctype_y* psi_cast = psi; \
+	const ctype_x* chi_cast = chi; \
+	      ctype_y* psi_cast = psi; \
 \
 	if ( bli_is_conj( conjchi ) ) \
 	{ \
diff --git a/frame/0/copysc/bli_copysc.h b/frame/0/copysc/bli_copysc.h
index 1dfd9d7bc..cd5481e57 100644
--- a/frame/0/copysc/bli_copysc.h
+++ b/frame/0/copysc/bli_copysc.h
@@ -42,8 +42,8 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
      ( \
-       obj_t*  chi, \
-       obj_t*  psi  \
+       const obj_t* chi, \
+       const obj_t* psi  \
      );
 GENFRONT( copysc )
 
@@ -57,9 +57,9 @@ GENFRONT( copysc )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \
      ( \
-       conj_t conjchi, \
-       void*  chi, \
-       void*  psi \
+             conj_t conjchi, \
+       const void*  chi, \
+             void*  psi \
      );
 
 INSERT_GENTPROT2_BASIC0( copysc )
diff --git a/frame/1/bli_l1v_check.c b/frame/1/bli_l1v_check.c
index 74b60febd..8ab470bf4 100644
--- a/frame/1/bli_l1v_check.c
+++ b/frame/1/bli_l1v_check.c
@@ -43,8 +43,8 @@
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* y  \
      ) \
 { \
 	bli_l1v_xy_check( x, y ); \
@@ -61,8 +61,8 @@ GENFRONT( swapv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  index  \
+       const obj_t* x, \
+       const obj_t* index  \
      ) \
 { \
 	bli_l1v_xi_check( x, index ); \
@@ -76,10 +76,10 @@ GENFRONT( amaxv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
      ) \
 { \
 	bli_l1v_axby_check( alpha, x, beta, y ); \
@@ -93,9 +93,9 @@ GENFRONT( axpbyv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* y  \
      ) \
 { \
 	bli_l1v_axy_check( alpha, x, y ); \
@@ -110,9 +110,9 @@ GENFRONT( scal2v )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  rho  \
+       const obj_t* x, \
+       const obj_t* y, \
+       const obj_t* rho  \
      ) \
 { \
 	bli_l1v_dot_check( &BLIS_ONE, x, y, &BLIS_ONE, rho ); \
@@ -126,11 +126,11 @@ GENFRONT( dotv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  beta, \
-       obj_t*  rho  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* y, \
+       const obj_t* beta, \
+       const obj_t* rho  \
      ) \
 { \
 	bli_l1v_dot_check( alpha, x, y, beta, rho ); \
@@ -144,7 +144,7 @@ GENFRONT( dotxv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x  \
+       const obj_t* x  \
      ) \
 { \
 	bli_l1v_x_check( x ); \
@@ -158,8 +158,8 @@ GENFRONT( invertv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x  \
+       const obj_t* alpha, \
+       const obj_t* x  \
      ) \
 { \
 	bli_l1v_ax_check( alpha, x ); \
@@ -174,9 +174,9 @@ GENFRONT( setv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
      ) \
 { \
 	bli_l1v_xby_check( x, beta, y ); \
@@ -189,8 +189,8 @@ GENFRONT( xpbyv )
 
 void bli_l1v_xy_check
      (
-       obj_t*  x,
-       obj_t*  y 
+       const obj_t* x,
+       const obj_t* y
      )
 {
 	err_t e_val;
@@ -230,9 +230,9 @@ void bli_l1v_xy_check
 
 void bli_l1v_axy_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y 
+       const obj_t* alpha,
+       const obj_t* x,
+       const obj_t* y
      )
 {
 	err_t e_val;
@@ -281,9 +281,9 @@ void bli_l1v_axy_check
 
 void bli_l1v_xby_check
      (
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y
+       const obj_t* x,
+       const obj_t* beta,
+       const obj_t* y
      )
 {
 	err_t e_val;
@@ -332,10 +332,10 @@ void bli_l1v_xby_check
 
 void bli_l1v_axby_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y
+       const obj_t* alpha,
+       const obj_t* x,
+       const obj_t* beta,
+       const obj_t* y
      )
 {
 	err_t e_val;
@@ -393,11 +393,11 @@ void bli_l1v_axby_check
 
 void bli_l1v_dot_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y,
-       obj_t*  beta,
-       obj_t*  rho 
+       const obj_t* alpha,
+       const obj_t* x,
+       const obj_t* y,
+       const obj_t* beta,
+       const obj_t* rho
      )
 {
 	err_t e_val;
@@ -467,7 +467,7 @@ void bli_l1v_dot_check
 
 void bli_l1v_x_check
      (
-       obj_t*  x 
+       const obj_t* x
      )
 {
 	err_t e_val;
@@ -490,8 +490,8 @@ void bli_l1v_x_check
 
 void bli_l1v_ax_check
      (
-       obj_t*  alpha,
-       obj_t*  x 
+       const obj_t* alpha,
+       const obj_t* x
      )
 {
 	err_t e_val;
@@ -523,8 +523,8 @@ void bli_l1v_ax_check
 
 void bli_l1v_xi_check
      (
-       obj_t*  x,
-       obj_t*  index
+       const obj_t* x,
+       const obj_t* index
      )
 {
 	err_t e_val;
diff --git a/frame/1/bli_l1v_check.h b/frame/1/bli_l1v_check.h
index 98051d0cd..110b25d55 100644
--- a/frame/1/bli_l1v_check.h
+++ b/frame/1/bli_l1v_check.h
@@ -42,8 +42,8 @@
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* y  \
      );
 
 GENTPROT( addv )
@@ -57,8 +57,8 @@ GENTPROT( swapv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  index  \
+       const obj_t* x, \
+       const obj_t* index  \
      );
 
 GENTPROT( amaxv )
@@ -69,10 +69,10 @@ GENTPROT( amaxv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
      );
 
 GENTPROT( axpbyv )
@@ -83,9 +83,9 @@ GENTPROT( axpbyv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* y  \
      );
 
 GENTPROT( axpyv )
@@ -97,9 +97,9 @@ GENTPROT( scal2v )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  rho  \
+       const obj_t* x, \
+       const obj_t* y, \
+       const obj_t* rho  \
      );
 
 GENTPROT( dotv )
@@ -110,11 +110,11 @@ GENTPROT( dotv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  beta, \
-       obj_t*  rho  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* y, \
+       const obj_t* beta, \
+       const obj_t* rho  \
      );
 
 GENTPROT( dotxv )
@@ -125,7 +125,7 @@ GENTPROT( dotxv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x  \
+       const obj_t* x  \
      );
 
 GENTPROT( invertv )
@@ -136,8 +136,8 @@ GENTPROT( invertv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x  \
+       const obj_t* alpha, \
+       const obj_t* x  \
      );
 
 GENTPROT( scalv )
@@ -149,9 +149,9 @@ GENTPROT( setv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
      );
 
 GENTPROT( xpbyv )
@@ -162,55 +162,55 @@ GENTPROT( xpbyv )
 
 void bli_l1v_xy_check
      (
-       obj_t*  x,
-       obj_t*  y 
+       const obj_t* x,
+       const obj_t* y
      );
 
 void bli_l1v_axy_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y 
+       const obj_t* alpha,
+       const obj_t* x,
+       const obj_t* y
      );
 
 void bli_l1v_xby_check
      (
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y
+       const obj_t* x,
+       const obj_t* beta,
+       const obj_t* y
      );
 
 void bli_l1v_axby_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y
+       const obj_t* alpha,
+       const obj_t* x,
+       const obj_t* beta,
+       const obj_t* y
      );
 
 void bli_l1v_dot_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y,
-       obj_t*  beta,
-       obj_t*  rho 
+       const obj_t* alpha,
+       const obj_t* x,
+       const obj_t* y,
+       const obj_t* beta,
+       const obj_t* rho
      );
 
 void bli_l1v_x_check
      (
-       obj_t*  x 
+       const obj_t* x
      );
 
 void bli_l1v_ax_check
      (
-       obj_t*  alpha,
-       obj_t*  x 
+       const obj_t* alpha,
+       const obj_t* x
      );
 
 void bli_l1v_xi_check
      (
-       obj_t*  x,
-       obj_t*  index
+       const obj_t* x,
+       const obj_t* index
      );
 
diff --git a/frame/1/bli_l1v_ft.h b/frame/1/bli_l1v_ft.h
index 162f1bf60..57f9d223a 100644
--- a/frame/1/bli_l1v_ft.h
+++ b/frame/1/bli_l1v_ft.h
@@ -44,10 +44,10 @@
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       conj_t  conjx, \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy  \
+             conj_t conjx, \
+             dim_t  n, \
+       const ctype* x, inc_t incx, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -62,9 +62,9 @@ INSERT_GENTDEF( subv )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       dim_t*  index  \
+             dim_t  n, \
+       const ctype* x, inc_t incx, \
+             dim_t* index  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -77,12 +77,12 @@ INSERT_GENTDEF( amaxv )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       conj_t  conjx, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy  \
+             conj_t conjx, \
+             dim_t  n, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+       const ctype* beta, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -95,11 +95,11 @@ INSERT_GENTDEF( axpbyv )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       conj_t  conjx, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy  \
+             conj_t conjx, \
+             dim_t  n, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -113,12 +113,12 @@ INSERT_GENTDEF( scal2v )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  rho  \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  n, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             ctype* rho  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -131,14 +131,14 @@ INSERT_GENTDEF( dotv )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  beta, \
-       ctype*  rho  \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  n, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+       const ctype* beta, \
+             ctype* rho  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -151,8 +151,8 @@ INSERT_GENTDEF( dotxv )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       dim_t   n, \
-       ctype*  x, inc_t incx  \
+       dim_t  n, \
+       ctype* x, inc_t incx  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -165,10 +165,10 @@ INSERT_GENTDEF( invertv )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       conj_t  conjalpha, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx  \
+             conj_t conjalpha, \
+             dim_t  n, \
+       const ctype* alpha, \
+             ctype* x, inc_t incx  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -182,9 +182,9 @@ INSERT_GENTDEF( setv )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy  \
+       dim_t  n, \
+       ctype* x, inc_t incx, \
+       ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -197,11 +197,11 @@ INSERT_GENTDEF( swapv )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       conj_t  conjx, \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy  \
+             conj_t conjx, \
+             dim_t  n, \
+       const ctype* x, inc_t incx, \
+       const ctype* beta, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
diff --git a/frame/1/bli_l1v_ker_prot.h b/frame/1/bli_l1v_ker_prot.h
index 1a1eec3f3..b912ba7e0 100644
--- a/frame/1/bli_l1v_ker_prot.h
+++ b/frame/1/bli_l1v_ker_prot.h
@@ -45,7 +45,7 @@ void PASTEMAC(ch,opname) \
         dim_t            n, \
         ctype*  restrict x, inc_t incx, \
         ctype*  restrict y, inc_t incy, \
-        cntx_t* restrict cntx  \
+        cntx_t*          cntx  \
       );
 
 
@@ -56,7 +56,7 @@ void PASTEMAC(ch,opname) \
        dim_t            n, \
        ctype*  restrict x, inc_t incx, \
        dim_t*  restrict index, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ); \
 
 
@@ -70,7 +70,7 @@ void PASTEMAC(ch,opname) \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict beta, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ); \
 
 
@@ -83,7 +83,7 @@ void PASTEMAC(ch,opname) \
        ctype*  restrict alpha, \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ); \
 
 
@@ -95,7 +95,7 @@ void PASTEMAC(ch,opname) \
         dim_t            n, \
         ctype*  restrict x, inc_t incx, \
         ctype*  restrict y, inc_t incy, \
-        cntx_t* restrict cntx  \
+        cntx_t*          cntx  \
       );
 
 
@@ -109,7 +109,7 @@ void PASTEMAC(ch,opname) \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict y, inc_t incy, \
        ctype*  restrict rho, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ); \
 
 
@@ -125,7 +125,7 @@ void PASTEMAC(ch,opname) \
        ctype*  restrict y, inc_t incy, \
        ctype*  restrict beta, \
        ctype*  restrict rho, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ); \
 
 
@@ -135,7 +135,7 @@ void PASTEMAC(ch,opname) \
      ( \
        dim_t            n, \
        ctype*  restrict x, inc_t incx, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ); \
 
 
@@ -147,7 +147,7 @@ void PASTEMAC(ch,opname) \
        dim_t            n, \
        ctype*  restrict alpha, \
        ctype*  restrict x, inc_t incx, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ); \
 
 
@@ -160,7 +160,7 @@ void PASTEMAC(ch,opname) \
        ctype*  restrict alpha, \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ); \
 
 
@@ -172,7 +172,7 @@ void PASTEMAC(ch,opname) \
        dim_t            n, \
        ctype*  restrict alpha, \
        ctype*  restrict x, inc_t incx, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ); \
 
 
@@ -184,7 +184,7 @@ void PASTEMAC(ch,opname) \
         dim_t            n, \
         ctype*  restrict x, inc_t incx, \
         ctype*  restrict y, inc_t incy, \
-        cntx_t* restrict cntx  \
+        cntx_t*          cntx  \
       );
 
 
@@ -195,7 +195,7 @@ void PASTEMAC(ch,opname) \
        dim_t            n, \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ); \
 
 
@@ -208,6 +208,6 @@ void PASTEMAC(ch,opname) \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict beta, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ); \
 
diff --git a/frame/1/bli_l1v_oapi.c b/frame/1/bli_l1v_oapi.c
index 201af2e09..4ea241693 100644
--- a/frame/1/bli_l1v_oapi.c
+++ b/frame/1/bli_l1v_oapi.c
@@ -45,8 +45,8 @@
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t*  x, \
+       const obj_t*  y  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -64,7 +64,7 @@ void PASTEMAC(opname,EX_SUF) \
 	inc_t     inc_y     = bli_obj_vector_inc( y ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x, y ); \
+		PASTEMAC(opname,_check)( x, y ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -92,8 +92,8 @@ GENFRONT( subv )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  index  \
+       const obj_t*  x, \
+       const obj_t*  index  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -110,7 +110,7 @@ void PASTEMAC(opname,EX_SUF) \
 	void*     buf_index = bli_obj_buffer_at_off( index ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x, index ); \
+		PASTEMAC(opname,_check)( x, index ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -119,11 +119,11 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	f \
 	( \
-	   n, \
-	   buf_x, incx, \
-	   buf_index, \
-	   cntx, \
-	   rntm  \
+	  n, \
+	  buf_x, incx, \
+	  buf_index, \
+	  cntx, \
+	  rntm  \
 	); \
 }
 
@@ -135,10 +135,10 @@ GENFRONT( amaxv )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t*  alpha, \
+       const obj_t*  x, \
+       const obj_t*  beta, \
+       const obj_t*  y  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -162,7 +162,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     beta_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, x, beta, y ); \
+		PASTEMAC(opname,_check)( alpha, x, beta, y ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -180,14 +180,14 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	f \
 	( \
-	   conjx, \
-	   n, \
-	   buf_alpha, \
-	   buf_x, inc_x, \
-	   buf_beta, \
-	   buf_y, inc_y, \
-	   cntx, \
-	   rntm  \
+	  conjx, \
+	  n, \
+	  buf_alpha, \
+	  buf_x, inc_x, \
+	  buf_beta, \
+	  buf_y, inc_y, \
+	  cntx, \
+	  rntm  \
 	); \
 }
 
@@ -199,9 +199,9 @@ GENFRONT( axpbyv )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t*  alpha, \
+       const obj_t*  x, \
+       const obj_t*  y  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -223,7 +223,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     alpha_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, x, y ); \
+		PASTEMAC(opname,_check)( alpha, x, y ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -238,13 +238,13 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	f \
 	( \
-	   conjx, \
-	   n, \
-	   buf_alpha, \
-	   buf_x, inc_x, \
-	   buf_y, inc_y, \
-	   cntx, \
-	   rntm  \
+	  conjx, \
+	  n, \
+	  buf_alpha, \
+	  buf_x, inc_x, \
+	  buf_y, inc_y, \
+	  cntx, \
+	  rntm  \
 	); \
 }
 
@@ -257,9 +257,9 @@ GENFRONT( scal2v )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  rho  \
+       const obj_t*  x, \
+       const obj_t*  y, \
+       const obj_t*  rho  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -279,7 +279,7 @@ void PASTEMAC(opname,EX_SUF) \
 	void*     buf_rho   = bli_obj_buffer_at_off( rho ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x, y, rho ); \
+		PASTEMAC(opname,_check)( x, y, rho ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -288,14 +288,14 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	f \
 	( \
-	   conjx, \
-	   conjy, \
-	   n, \
-	   buf_x, inc_x, \
-	   buf_y, inc_y, \
-	   buf_rho, \
-	   cntx, \
-	   rntm  \
+	  conjx, \
+	  conjy, \
+	  n, \
+	  buf_x, inc_x, \
+	  buf_y, inc_y, \
+	  buf_rho, \
+	  cntx, \
+	  rntm  \
 	); \
 }
 
@@ -307,11 +307,11 @@ GENFRONT( dotv )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  beta, \
-       obj_t*  rho  \
+       const obj_t*  alpha, \
+       const obj_t*  x, \
+       const obj_t*  y, \
+       const obj_t*  beta, \
+       const obj_t*  rho  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -337,7 +337,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     beta_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, x, y, beta, rho ); \
+		PASTEMAC(opname,_check)( alpha, x, y, beta, rho ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -355,16 +355,16 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	f \
 	( \
-	   conjx, \
-	   conjy, \
-	   n, \
-	   buf_alpha, \
-	   buf_x, inc_x, \
-	   buf_y, inc_y, \
-	   buf_beta, \
-	   buf_rho, \
-	   cntx, \
-	   rntm  \
+	  conjx, \
+	  conjy, \
+	  n, \
+	  buf_alpha, \
+	  buf_x, inc_x, \
+	  buf_y, inc_y, \
+	  buf_beta, \
+	  buf_rho, \
+	  cntx, \
+	  rntm  \
 	); \
 }
 
@@ -376,7 +376,7 @@ GENFRONT( dotxv )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x  \
+       const obj_t*  x  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -391,7 +391,7 @@ void PASTEMAC(opname,EX_SUF) \
 	inc_t     inc_x     = bli_obj_vector_inc( x ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x ); \
+		PASTEMAC(opname,_check)( x ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -400,10 +400,10 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	f \
 	( \
-	   n, \
-	   buf_x, inc_x, \
-	   cntx, \
-	   rntm  \
+	  n, \
+	  buf_x, inc_x, \
+	  cntx, \
+	  rntm  \
 	); \
 }
 
@@ -415,8 +415,8 @@ GENFRONT( invertv )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x  \
+       const obj_t*  alpha, \
+       const obj_t*  x  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -436,7 +436,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     alpha_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, x ); \
+		PASTEMAC(opname,_check)( alpha, x ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -451,12 +451,12 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	f \
 	( \
-	   BLIS_NO_CONJUGATE, /* internal conjugation applied during copy-cast. */ \
-	   n, \
-	   buf_alpha, \
-	   buf_x, inc_x, \
-	   cntx, \
-	   rntm  \
+	  BLIS_NO_CONJUGATE, /* internal conjugation applied during copy-cast. */ \
+	  n, \
+	  buf_alpha, \
+	  buf_x, inc_x, \
+	  cntx, \
+	  rntm  \
 	); \
 }
 
@@ -469,8 +469,8 @@ GENFRONT( setv )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t*  x, \
+       const obj_t*  y  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -487,7 +487,7 @@ void PASTEMAC(opname,EX_SUF) \
 	inc_t     inc_y     = bli_obj_vector_inc( y ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x, y ); \
+		PASTEMAC(opname,_check)( x, y ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -496,11 +496,11 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	f \
 	( \
-	   n, \
-	   buf_x, inc_x, \
-	   buf_y, inc_y, \
-	   cntx, \
-	   rntm  \
+	  n, \
+	  buf_x, inc_x, \
+	  buf_y, inc_y, \
+	  cntx, \
+	  rntm  \
 	); \
 }
 
@@ -512,9 +512,9 @@ GENFRONT( swapv )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t*  x, \
+       const obj_t*  beta, \
+       const obj_t*  y  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -536,7 +536,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     beta_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x, beta, y ); \
+		PASTEMAC(opname,_check)( x, beta, y ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -551,13 +551,13 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	f \
 	( \
-	   conjx, \
-	   n, \
-	   buf_x, inc_x, \
-	   buf_beta, \
-	   buf_y, inc_y, \
-	   cntx, \
-	   rntm  \
+	  conjx, \
+	  n, \
+	  buf_x, inc_x, \
+	  buf_beta, \
+	  buf_y, inc_y, \
+	  cntx, \
+	  rntm  \
 	); \
 }
 
diff --git a/frame/1/bli_l1v_oapi.h b/frame/1/bli_l1v_oapi.h
index 41aecdc4d..957747a2a 100644
--- a/frame/1/bli_l1v_oapi.h
+++ b/frame/1/bli_l1v_oapi.h
@@ -42,10 +42,10 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
-     ); 
+     );
 
 GENTPROT( addv )
 GENTPROT( copyv )
@@ -57,8 +57,8 @@ GENTPROT( subv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  index  \
+       const obj_t* x, \
+       const obj_t* index  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -70,10 +70,10 @@ GENTPROT( amaxv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -85,11 +85,11 @@ GENTPROT( axpbyv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
-     ); 
+     );
 
 GENTPROT( axpyv )
 GENTPROT( scal2v )
@@ -100,11 +100,11 @@ GENTPROT( scal2v )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  rho  \
+       const obj_t* x, \
+       const obj_t* y, \
+       const obj_t* rho  \
        BLIS_OAPI_EX_PARAMS  \
-     ); 
+     );
 
 GENTPROT( dotv )
 
@@ -114,13 +114,13 @@ GENTPROT( dotv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  beta, \
-       obj_t*  rho  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* y, \
+       const obj_t* beta, \
+       const obj_t* rho  \
        BLIS_OAPI_EX_PARAMS  \
-     ); 
+     );
 
 GENTPROT( dotxv )
 
@@ -130,9 +130,9 @@ GENTPROT( dotxv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x  \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
-     ); 
+     );
 
 GENTPROT( invertv )
 
@@ -142,10 +142,10 @@ GENTPROT( invertv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x  \
+       const obj_t* alpha, \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
-     ); 
+     );
 
 GENTPROT( scalv )
 GENTPROT( setv )
@@ -156,10 +156,10 @@ GENTPROT( setv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
-     ); 
+     );
 
 GENTPROT( swapv )
 
@@ -169,9 +169,9 @@ GENTPROT( swapv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c
index 1d12b42eb..01e3356d5 100644
--- a/frame/1/bli_l1v_tapi.c
+++ b/frame/1/bli_l1v_tapi.c
@@ -45,10 +45,10 @@
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjx, \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy  \
+             conj_t conjx, \
+             dim_t  n, \
+       const ctype* x, inc_t incx, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -67,9 +67,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	( \
 	  conjx, \
 	  n, \
-	  x, incx, \
-	  y, incy, \
-	  cntx  \
+	  ( ctype* )x, incx, \
+	            y, incy, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -83,9 +83,9 @@ INSERT_GENTFUNC_BASIC( subv,  BLIS_SUBV_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       dim_t*  index  \
+             dim_t  n, \
+       const ctype* x, inc_t incx, \
+             dim_t* index  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -103,9 +103,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	f \
 	( \
 	  n, \
-	  x, incx, \
+	  ( ctype* )x, incx, \
 	  index, \
-	  cntx  \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -117,12 +117,12 @@ INSERT_GENTFUNC_BASIC( amaxv, BLIS_AMAXV_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjx, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy  \
+             conj_t conjx, \
+             dim_t  n, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+       const ctype* beta, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -141,11 +141,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	( \
 	  conjx, \
 	  n, \
-	  alpha, \
-	  x, incx, \
-	  beta, \
-	  y, incy, \
-	  cntx  \
+	  ( ctype* )alpha, \
+	  ( ctype* )x, incx, \
+	  ( ctype* )beta, \
+	            y, incy, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -157,11 +157,11 @@ INSERT_GENTFUNC_BASIC( axpbyv, BLIS_AXPBYV_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjx, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy  \
+             conj_t conjx, \
+             dim_t  n, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -181,10 +181,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	( \
 	  conjx, \
 	  n, \
-	  alpha, \
-	  x, incx, \
-	  y, incy, \
-	  cntx  \
+	  ( ctype* )alpha, \
+	  ( ctype* )x, incx, \
+	            y, incy, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -197,12 +197,12 @@ INSERT_GENTFUNC_BASIC( scal2v, BLIS_SCAL2V_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  rho  \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  n, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             ctype* rho  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -222,10 +222,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  conjx, \
 	  conjy, \
 	  n, \
-	  x, incx, \
-	  y, incy, \
-	  rho, \
-	  cntx  \
+	  ( ctype* )x, incx, \
+	  ( ctype* )y, incy, \
+	            rho, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -237,14 +237,14 @@ INSERT_GENTFUNC_BASIC( dotv, BLIS_DOTV_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  beta, \
-       ctype*  rho  \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  n, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+       const ctype* beta, \
+             ctype* rho  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -264,12 +264,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  conjx, \
 	  conjy, \
 	  n, \
-	  alpha, \
-	  x, incx, \
-	  y, incy, \
-	  beta, \
-	  rho, \
-	  cntx  \
+	  ( ctype* )alpha, \
+	  ( ctype* )x, incx, \
+	  ( ctype* )y, incy, \
+	  ( ctype* )beta, \
+	            rho, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -281,8 +281,8 @@ INSERT_GENTFUNC_BASIC( dotxv, BLIS_DOTXV_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       dim_t   n, \
-       ctype*  x, inc_t incx  \
+       dim_t  n, \
+       ctype* x, inc_t incx  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -301,7 +301,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	( \
 	  n, \
 	  x, incx, \
-	  cntx  \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -313,10 +313,10 @@ INSERT_GENTFUNC_BASIC( invertv, BLIS_INVERTV_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjalpha, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx  \
+             conj_t conjalpha, \
+             dim_t  n, \
+       const ctype* alpha, \
+             ctype* x, inc_t incx  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -335,9 +335,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	( \
 	  conjalpha, \
 	  n, \
-	  alpha, \
-	  x, incx, \
-	  cntx  \
+	  ( ctype* )alpha, \
+	            x, incx, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -350,9 +350,9 @@ INSERT_GENTFUNC_BASIC( setv,  BLIS_SETV_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy  \
+       dim_t  n, \
+       ctype* x, inc_t incx, \
+       ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -372,7 +372,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  n, \
 	  x, incx, \
 	  y, incy, \
-	  cntx  \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -383,11 +383,11 @@ INSERT_GENTFUNC_BASIC( swapv, BLIS_SWAPV_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjx, \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy  \
+             conj_t conjx, \
+             dim_t  n, \
+       const ctype* x, inc_t incx, \
+       const ctype* beta, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -406,10 +406,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	( \
 	  conjx, \
 	  n, \
-	  x, incx, \
-	  beta, \
-	  y, incy, \
-	  cntx  \
+	  ( ctype* )x, incx, \
+	  ( ctype* )beta, \
+	            y, incy, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
diff --git a/frame/1/bli_l1v_tapi.h b/frame/1/bli_l1v_tapi.h
index 5cb3295ef..c1965cb3c 100644
--- a/frame/1/bli_l1v_tapi.h
+++ b/frame/1/bli_l1v_tapi.h
@@ -42,10 +42,10 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
       ( \
-        conj_t  conjx, \
-        dim_t   n, \
-        ctype*  x, inc_t incx, \
-        ctype*  y, inc_t incy  \
+              conj_t conjx, \
+              dim_t  n, \
+        const ctype* x, inc_t incx, \
+              ctype* y, inc_t incy  \
         BLIS_TAPI_EX_PARAMS  \
       );
 
@@ -59,9 +59,9 @@ INSERT_GENTPROT_BASIC0( subv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       dim_t*  index  \
+             dim_t  n, \
+       const ctype* x, inc_t incx, \
+             dim_t* index  \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
@@ -73,12 +73,12 @@ INSERT_GENTPROT_BASIC0( amaxv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjx, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy  \
+             conj_t conjx, \
+             dim_t  n, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+       const ctype* beta, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
@@ -90,11 +90,11 @@ INSERT_GENTPROT_BASIC0( axpbyv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjx, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy  \
+             conj_t conjx, \
+             dim_t  n, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
@@ -107,12 +107,12 @@ INSERT_GENTPROT_BASIC0( scal2v )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  rho  \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  n, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             ctype* rho  \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
@@ -124,14 +124,14 @@ INSERT_GENTPROT_BASIC0( dotv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  beta, \
-       ctype*  rho  \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  n, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+       const ctype* beta, \
+             ctype* rho  \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
@@ -143,8 +143,8 @@ INSERT_GENTPROT_BASIC0( dotxv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       dim_t   n, \
-       ctype*  x, inc_t incx  \
+       dim_t  n, \
+       ctype* x, inc_t incx  \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
@@ -156,10 +156,10 @@ INSERT_GENTPROT_BASIC0( invertv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjalpha, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx  \
+             conj_t conjalpha, \
+             dim_t  n, \
+       const ctype* alpha, \
+             ctype* x, inc_t incx  \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
@@ -172,9 +172,9 @@ INSERT_GENTPROT_BASIC0( setv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy  \
+       dim_t  n, \
+       ctype* x, inc_t incx, \
+       ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
@@ -186,11 +186,11 @@ INSERT_GENTPROT_BASIC0( swapv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjx, \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy  \
+             conj_t conjx, \
+             dim_t  n, \
+       const ctype* x, inc_t incx, \
+       const ctype* beta, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
diff --git a/frame/1d/bli_l1d_check.c b/frame/1d/bli_l1d_check.c
index 908a410ad..fcc62a757 100644
--- a/frame/1d/bli_l1d_check.c
+++ b/frame/1d/bli_l1d_check.c
@@ -43,8 +43,8 @@
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t*  x, \
+       const obj_t*  y  \
      ) \
 { \
 	bli_l1d_xy_check( x, y ); \
@@ -60,9 +60,9 @@ GENFRONT( subd )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t*  alpha, \
+       const obj_t*  x, \
+       const obj_t*  y  \
      ) \
 { \
 	bli_l1d_axy_check( alpha, x, y ); \
@@ -77,7 +77,7 @@ GENFRONT( scal2d )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x  \
+       const obj_t*  x  \
      ) \
 { \
 	bli_l1d_x_check( x ); \
@@ -91,8 +91,8 @@ GENFRONT( invertd )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x  \
+       const obj_t*  alpha, \
+       const obj_t*  x  \
      ) \
 { \
 	bli_l1d_ax_check( alpha, x ); \
@@ -109,9 +109,9 @@ GENFRONT( shiftd )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t*  x, \
+       const obj_t*  beta, \
+       const obj_t*  y  \
      ) \
 { \
 	bli_l1d_axy_check( beta, x, y ); \
@@ -124,8 +124,8 @@ GENFRONT( xpbyd )
 
 void bli_l1d_xy_check
      (
-       obj_t*  x,
-       obj_t*  y 
+       const obj_t*  x,
+       const obj_t*  y
      )
 {
 	err_t e_val;
@@ -165,9 +165,9 @@ void bli_l1d_xy_check
 
 void bli_l1d_axy_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y 
+       const obj_t*  alpha,
+       const obj_t*  x,
+       const obj_t*  y
      )
 {
 	err_t e_val;
@@ -216,7 +216,7 @@ void bli_l1d_axy_check
 
 void bli_l1d_x_check
      (
-       obj_t*  x 
+       const obj_t*  x
      )
 {
 	err_t e_val;
@@ -239,8 +239,8 @@ void bli_l1d_x_check
 
 void bli_l1d_ax_check
      (
-       obj_t*  alpha,
-       obj_t*  x 
+       const obj_t*  alpha,
+       const obj_t*  x
      )
 {
 	err_t e_val;
diff --git a/frame/1d/bli_l1d_check.h b/frame/1d/bli_l1d_check.h
index 6d000d314..1ef57e236 100644
--- a/frame/1d/bli_l1d_check.h
+++ b/frame/1d/bli_l1d_check.h
@@ -42,8 +42,8 @@
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t*  x, \
+       const obj_t*  y  \
     );
 
 GENTPROT( addd )
@@ -56,9 +56,9 @@ GENTPROT( subd )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t*  alpha, \
+       const obj_t*  x, \
+       const obj_t*  y  \
     );
 
 GENTPROT( axpyd )
@@ -70,7 +70,7 @@ GENTPROT( scal2d )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x  \
+       const obj_t*  x  \
     );
 
 GENTPROT( invertd )
@@ -81,8 +81,8 @@ GENTPROT( invertd )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x  \
+       const obj_t*  alpha, \
+       const obj_t*  x  \
     );
 
 GENTPROT( scald )
@@ -96,9 +96,9 @@ GENTPROT( shiftd )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t*  x, \
+       const obj_t*  beta, \
+       const obj_t*  y  \
     );
 
 GENTPROT( xpbyd )
@@ -108,25 +108,25 @@ GENTPROT( xpbyd )
 
 void bli_l1d_xy_check
      (
-       obj_t*  x,
-       obj_t*  y 
+       const obj_t*  x,
+       const obj_t*  y
      );
 
 void bli_l1d_axy_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y 
+       const obj_t*  alpha,
+       const obj_t*  x,
+       const obj_t*  y
      );
 
 void bli_l1d_x_check
      (
-       obj_t*  x 
+       const obj_t*  x
      );
 
 void bli_l1d_ax_check
      (
-       obj_t*  alpha,
-       obj_t*  x 
+       const obj_t*  alpha,
+       const obj_t*  x
      );
 
diff --git a/frame/1d/bli_l1d_ft.h b/frame/1d/bli_l1d_ft.h
index 53e296616..3de317527 100644
--- a/frame/1d/bli_l1d_ft.h
+++ b/frame/1d/bli_l1d_ft.h
@@ -44,13 +44,13 @@
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -65,14 +65,14 @@ INSERT_GENTDEF( subd )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -86,10 +86,10 @@ INSERT_GENTDEF( scal2d )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       doff_t  diagoffx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x  \
+       doff_t diagoffx, \
+       dim_t  m, \
+       dim_t  n, \
+       ctype* x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -102,12 +102,12 @@ INSERT_GENTDEF( invertd )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       conj_t  conjalpha, \
-       doff_t  diagoffx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x  \
+             conj_t conjalpha, \
+             doff_t diagoffx, \
+             dim_t  m, \
+             dim_t  n, \
+       const ctype* alpha, \
+             ctype* x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -121,11 +121,11 @@ INSERT_GENTDEF( setd )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       doff_t   diagoffx, \
-       dim_t    m, \
-       dim_t    n, \
-       ctype_r* alpha, \
-       ctype*   x, inc_t rs_x, inc_t cs_x  \
+             doff_t   diagoffx, \
+             dim_t    m, \
+             dim_t    n, \
+       const ctype_r* alpha, \
+             ctype*   x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -138,11 +138,11 @@ INSERT_GENTDEFR( setid )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       doff_t  diagoffx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x  \
+             doff_t diagoffx, \
+             dim_t  m, \
+             dim_t  n, \
+       const ctype* alpha, \
+             ctype* x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -155,14 +155,14 @@ INSERT_GENTDEF( shiftd )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  beta, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+       const ctype*  beta, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
diff --git a/frame/1d/bli_l1d_oapi.c b/frame/1d/bli_l1d_oapi.c
index 15e68cf50..7027e7780 100644
--- a/frame/1d/bli_l1d_oapi.c
+++ b/frame/1d/bli_l1d_oapi.c
@@ -45,8 +45,8 @@
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -69,7 +69,7 @@ void PASTEMAC(opname,EX_SUF) \
 	inc_t     cs_y      = bli_obj_col_stride( y ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x, y ); \
+		PASTEMAC(opname,_check)( x, y ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -100,9 +100,9 @@ GENFRONT( subd )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -129,7 +129,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     alpha_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, x, y ); \
+		PASTEMAC(opname,_check)( alpha, x, y ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -166,7 +166,7 @@ GENFRONT( scal2d )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x  \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -184,7 +184,7 @@ void PASTEMAC(opname,EX_SUF) \
 	inc_t     cs_x      = bli_obj_col_stride( x ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x ); \
+		PASTEMAC(opname,_check)( x ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -210,8 +210,8 @@ GENFRONT( invertd )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x  \
+       const obj_t* alpha, \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -234,7 +234,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     alpha_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, x ); \
+		PASTEMAC(opname,_check)( alpha, x ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -269,8 +269,8 @@ GENFRONT( setd )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x  \
+       const obj_t* alpha, \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -290,7 +290,7 @@ void PASTEMAC(opname,EX_SUF) \
 	void*     buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, x ); \
+		PASTEMAC(opname,_check)( alpha, x ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -317,8 +317,8 @@ GENFRONT( setid )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x  \
+       const obj_t* alpha, \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -340,7 +340,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     alpha_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, x ); \
+		PASTEMAC(opname,_check)( alpha, x ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -373,9 +373,9 @@ GENFRONT( shiftd )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -402,7 +402,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     beta_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x, beta, y ); \
+		PASTEMAC(opname,_check)( x, beta, y ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
diff --git a/frame/1d/bli_l1d_oapi.h b/frame/1d/bli_l1d_oapi.h
index 47129b771..66f9d698c 100644
--- a/frame/1d/bli_l1d_oapi.h
+++ b/frame/1d/bli_l1d_oapi.h
@@ -42,10 +42,10 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
-     ); 
+     );
 
 GENTPROT( addd )
 GENTPROT( copyd )
@@ -57,11 +57,11 @@ GENTPROT( subd )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
-     ); 
+     );
 
 GENTPROT( axpyd )
 GENTPROT( scal2d )
@@ -72,9 +72,9 @@ GENTPROT( scal2d )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x  \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
-     ); 
+     );
 
 GENTPROT( invertd )
 
@@ -84,10 +84,10 @@ GENTPROT( invertd )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x  \
+       const obj_t* alpha, \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
-     ); 
+     );
 
 GENTPROT( scald )
 GENTPROT( setd )
@@ -100,9 +100,9 @@ GENTPROT( shiftd )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c
index cfaf5150f..60916cd56 100644
--- a/frame/1d/bli_l1d_tapi.c
+++ b/frame/1d/bli_l1d_tapi.c
@@ -45,13 +45,13 @@
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -61,12 +61,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	const num_t dt = PASTEMAC(ch,type); \
 \
-	ctype*      x1; \
-	ctype*      y1; \
-	conj_t      conjx; \
-	dim_t       n_elem; \
-	dim_t       offx, offy; \
-	inc_t       incx, incy; \
+	const ctype* x1; \
+	ctype*       y1; \
+	conj_t       conjx; \
+	dim_t        n_elem; \
+	dim_t        offx, offy; \
+	inc_t        incx, incy; \
 \
 	if ( bli_zero_dim2( m, n ) ) return; \
 \
@@ -108,9 +108,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	( \
 	  conjx, \
 	  n_elem, \
-	  x1, incx, \
-	  y1, incy, \
-	  cntx  \
+	  ( ctype* )x1, incx, \
+	            y1, incy, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -124,14 +124,14 @@ INSERT_GENTFUNC_BASIC2( subd,  subv,  BLIS_SUBV_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -141,12 +141,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	const num_t dt = PASTEMAC(ch,type); \
 \
-	ctype*      x1; \
-	ctype*      y1; \
-	conj_t      conjx; \
-	dim_t       n_elem; \
-	dim_t       offx, offy; \
-	inc_t       incx, incy; \
+	const ctype* x1; \
+	ctype*       y1; \
+	conj_t       conjx; \
+	dim_t        n_elem; \
+	dim_t        offx, offy; \
+	inc_t        incx, incy; \
 \
 	if ( bli_zero_dim2( m, n ) ) return; \
 \
@@ -188,10 +188,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	( \
 	  conjx, \
 	  n_elem, \
-	  alpha, \
-	  x1, incx, \
-	  y1, incy, \
-	  cntx  \
+	  ( ctype* )alpha, \
+	  ( ctype* )x1, incx, \
+	            y1, incy, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -204,10 +204,10 @@ INSERT_GENTFUNC_BASIC2( scal2d, scal2v, BLIS_SCAL2V_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x  \
+       doff_t diagoffx, \
+       dim_t  m, \
+       dim_t  n, \
+       ctype* x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -248,7 +248,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	( \
 	  n_elem, \
 	  x1, incx, \
-	  cntx  \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -260,12 +260,12 @@ INSERT_GENTFUNC_BASIC2( invertd, invertv, BLIS_INVERTV_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjalpha, \
-       doff_t  diagoffx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x  \
+             conj_t conjalpha, \
+             doff_t diagoffx, \
+             dim_t  m, \
+             dim_t  n, \
+       const ctype* alpha, \
+             ctype* x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -306,9 +306,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	( \
 	  conjalpha, \
 	  n_elem, \
-	  alpha, \
-	  x1, incx, \
-	  cntx  \
+	  ( ctype* )alpha, \
+	            x1, incx, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -321,11 +321,11 @@ INSERT_GENTFUNC_BASIC2( setd,  setv,  BLIS_SETV_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t   diagoffx, \
-       dim_t    m, \
-       dim_t    n, \
-       ctype_r* alpha, \
-       ctype*   x, inc_t rs_x, inc_t cs_x  \
+             doff_t   diagoffx, \
+             dim_t    m, \
+             dim_t    n, \
+       const ctype_r* alpha, \
+             ctype*   x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -383,9 +383,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	( \
 	  BLIS_NO_CONJUGATE, \
 	  n_elem, \
-	  alpha, \
-	  x1, incx, \
-	  cntx  \
+	  ( ctype_r* )alpha, \
+	              x1, incx, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -397,11 +397,11 @@ INSERT_GENTFUNCR_BASIC2( setid, setv, BLIS_SETV_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x  \
+             doff_t diagoffx, \
+             dim_t  m, \
+             dim_t  n, \
+       const ctype* alpha, \
+             ctype* x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -442,9 +442,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	( \
 	  BLIS_NO_CONJUGATE, \
 	  n_elem, \
-	  alpha, 0, \
-	  x1, incx, \
-	  cntx  \
+	  ( ctype* )alpha, 0, \
+	            x1, incx, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -456,14 +456,14 @@ INSERT_GENTFUNC_BASIC2( shiftd, addv, BLIS_ADDV_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  beta, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+       const ctype*  beta, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -473,12 +473,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	const num_t dt = PASTEMAC(ch,type); \
 \
-	ctype*      x1; \
-	ctype*      y1; \
-	conj_t      conjx; \
-	dim_t       n_elem; \
-	dim_t       offx, offy; \
-	inc_t       incx, incy; \
+	const ctype* x1; \
+	ctype*       y1; \
+	conj_t       conjx; \
+	dim_t        n_elem; \
+	dim_t        offx, offy; \
+	inc_t        incx, incy; \
 \
 	if ( bli_zero_dim2( m, n ) ) return; \
 \
@@ -520,10 +520,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	( \
 	  conjx, \
 	  n_elem, \
-	  x1, incx, \
-	  beta, \
-	  y1, incy, \
-	  cntx  \
+	  ( ctype* )x1, incx, \
+	  ( ctype* )beta, \
+	            y1, incy, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
diff --git a/frame/1d/bli_l1d_tapi.h b/frame/1d/bli_l1d_tapi.h
index 35d093e86..831b3d390 100644
--- a/frame/1d/bli_l1d_tapi.h
+++ b/frame/1d/bli_l1d_tapi.h
@@ -42,13 +42,13 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -62,14 +62,14 @@ INSERT_GENTPROT_BASIC0( subd )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -82,10 +82,10 @@ INSERT_GENTPROT_BASIC0( scal2d )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x  \
+       doff_t diagoffx, \
+       dim_t  m, \
+       dim_t  n, \
+       ctype* x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -97,12 +97,12 @@ INSERT_GENTPROT_BASIC0( invertd )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjalpha, \
-       doff_t  diagoffx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x  \
+             conj_t conjalpha, \
+             doff_t diagoffx, \
+             dim_t  m, \
+             dim_t  n, \
+       const ctype* alpha, \
+             ctype* x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -115,11 +115,11 @@ INSERT_GENTPROT_BASIC0( setd )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t   diagoffx, \
-       dim_t    m, \
-       dim_t    n, \
-       ctype_r* alpha, \
-       ctype*   x, inc_t rs_x, inc_t cs_x  \
+             doff_t   diagoffx, \
+             dim_t    m, \
+             dim_t    n, \
+       const ctype_r* alpha, \
+             ctype*   x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -131,11 +131,11 @@ INSERT_GENTPROTR_BASIC0( setid )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x  \
+             doff_t diagoffx, \
+             dim_t  m, \
+             dim_t  n, \
+       const ctype* alpha, \
+             ctype* x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -147,14 +147,14 @@ INSERT_GENTPROT_BASIC0( shiftd )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  beta, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+       const ctype*  beta, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
diff --git a/frame/1f/bli_l1f_check.c b/frame/1f/bli_l1f_check.c
index c880237c1..e05cb7750 100644
--- a/frame/1f/bli_l1f_check.c
+++ b/frame/1f/bli_l1f_check.c
@@ -40,11 +40,11 @@
 
 void bli_axpy2v_check
      (
-       obj_t*  alphax,
-       obj_t*  alphay,
-       obj_t*  x,
-       obj_t*  y,
-       obj_t*  z 
+       const obj_t* alphax,
+       const obj_t* alphay,
+       const obj_t* x,
+       const obj_t* y,
+       const obj_t* z
      )
 {
 	err_t e_val;
@@ -118,10 +118,10 @@ void bli_axpy2v_check
 
 void bli_axpyf_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  x,
-       obj_t*  y 
+       const obj_t* alpha,
+       const obj_t* a,
+       const obj_t* x,
+       const obj_t* y
      )
 {
 	err_t e_val;
@@ -186,12 +186,12 @@ void bli_axpyf_check
 
 void bli_dotaxpyv_check
      (
-       obj_t*  alpha,
-       obj_t*  xt,
-       obj_t*  x,
-       obj_t*  y,
-       obj_t*  rho,
-       obj_t*  z 
+       const obj_t* alpha,
+       const obj_t* xt,
+       const obj_t* x,
+       const obj_t* y,
+       const obj_t* rho,
+       const obj_t* z
      )
 {
 	err_t e_val;
@@ -288,14 +288,14 @@ void bli_dotaxpyv_check
 
 void bli_dotxaxpyf_check
      (
-       obj_t*  alpha,
-       obj_t*  at,
-       obj_t*  a,
-       obj_t*  w,
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y,
-       obj_t*  z 
+       const obj_t* alpha,
+       const obj_t* at,
+       const obj_t* a,
+       const obj_t* w,
+       const obj_t* x,
+       const obj_t* beta,
+       const obj_t* y,
+       const obj_t* z
      )
 {
 	err_t e_val;
@@ -425,11 +425,11 @@ void bli_dotxaxpyf_check
 
 void bli_dotxf_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y 
+       const obj_t* alpha,
+       const obj_t* a,
+       const obj_t* x,
+       const obj_t* beta,
+       const obj_t* y
      )
 {
 	err_t e_val;
diff --git a/frame/1f/bli_l1f_check.h b/frame/1f/bli_l1f_check.h
index d630f3205..9cd53107a 100644
--- a/frame/1f/bli_l1f_check.h
+++ b/frame/1f/bli_l1f_check.h
@@ -42,11 +42,11 @@
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alphax, \
-       obj_t*  alphay, \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  z  \
+       const obj_t* alphax, \
+       const obj_t* alphay, \
+       const obj_t* x, \
+       const obj_t* y, \
+       const obj_t* z  \
     );
 
 GENTPROT( axpy2v )
@@ -57,10 +57,10 @@ GENTPROT( axpy2v )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* x, \
+       const obj_t* y  \
     );
 
 GENTPROT( axpyf )
@@ -71,12 +71,12 @@ GENTPROT( axpyf )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  xt, \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  rho, \
-       obj_t*  z  \
+       const obj_t* alpha, \
+       const obj_t* xt, \
+       const obj_t* x, \
+       const obj_t* y, \
+       const obj_t* rho, \
+       const obj_t* z  \
     );
 
 GENTPROT( dotaxpyv )
@@ -87,14 +87,14 @@ GENTPROT( dotaxpyv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  at, \
-       obj_t*  a, \
-       obj_t*  w, \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y, \
-       obj_t*  z  \
+       const obj_t* alpha, \
+       const obj_t* at, \
+       const obj_t* a, \
+       const obj_t* w, \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y, \
+       const obj_t* z  \
     );
 
 GENTPROT( dotxaxpyf )
@@ -105,11 +105,11 @@ GENTPROT( dotxaxpyf )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
     );
 
 GENTPROT( dotxf )
diff --git a/frame/1f/bli_l1f_ft.h b/frame/1f/bli_l1f_ft.h
index 1c7bfd9b6..8e143bf54 100644
--- a/frame/1f/bli_l1f_ft.h
+++ b/frame/1f/bli_l1f_ft.h
@@ -44,14 +44,14 @@
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   n, \
-       ctype*  alpha1, \
-       ctype*  alpha2, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  z, inc_t incz  \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  n, \
+       const ctype* alpha1, \
+       const ctype* alpha2, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             ctype* z, inc_t incz  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -64,14 +64,14 @@ INSERT_GENTDEF( axpy2v )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       conj_t  conja, \
-       conj_t  conjx, \
-       dim_t   m, \
-       dim_t   b_n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t inca, inc_t lda, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy  \
+             conj_t conja, \
+             conj_t conjx, \
+             dim_t  m, \
+             dim_t  b_n, \
+       const ctype* alpha, \
+       const ctype* a, inc_t inca, inc_t lda, \
+       const ctype* x, inc_t incx, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -84,15 +84,15 @@ INSERT_GENTDEF( axpyf )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       conj_t  conjxt, \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   m, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  rho, \
-       ctype*  z, inc_t incz  \
+             conj_t conjxt, \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  m, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             ctype* rho, \
+             ctype* z, inc_t incz  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -105,15 +105,15 @@ INSERT_GENTDEF( dotaxpyv )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       conj_t  conjat, \
-       conj_t  conjx, \
-       dim_t   m, \
-       dim_t   b_n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t inca, inc_t lda, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy  \
+             conj_t conjat, \
+             conj_t conjx, \
+             dim_t  m, \
+             dim_t  b_n, \
+       const ctype* alpha, \
+       const ctype* a, inc_t inca, inc_t lda, \
+       const ctype* x, inc_t incx, \
+       const ctype* beta, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -126,19 +126,19 @@ INSERT_GENTDEF( dotxf )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       conj_t  conjat, \
-       conj_t  conja, \
-       conj_t  conjw, \
-       conj_t  conjx, \
-       dim_t   m, \
-       dim_t   b_n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t inca, inc_t lda, \
-       ctype*  w, inc_t incw, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy, \
-       ctype*  z, inc_t incz  \
+             conj_t conjat, \
+             conj_t conja, \
+             conj_t conjw, \
+             conj_t conjx, \
+             dim_t  m, \
+             dim_t  b_n, \
+       const ctype* alpha, \
+       const ctype* a, inc_t inca, inc_t lda, \
+       const ctype* w, inc_t incw, \
+       const ctype* x, inc_t incx, \
+       const ctype* beta, \
+             ctype* y, inc_t incy, \
+             ctype* z, inc_t incz  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
diff --git a/frame/1f/bli_l1f_ker_prot.h b/frame/1f/bli_l1f_ker_prot.h
index 18eea4568..4393faf10 100644
--- a/frame/1f/bli_l1f_ker_prot.h
+++ b/frame/1f/bli_l1f_ker_prot.h
@@ -49,7 +49,7 @@ void PASTEMAC(ch,opname) \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict y, inc_t incy, \
        ctype*  restrict z, inc_t incz, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      );
 
 
@@ -65,7 +65,7 @@ void PASTEMAC(ch,opname) \
        ctype*  restrict a, inc_t inca, inc_t lda, \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      );
 
 
@@ -82,7 +82,7 @@ void PASTEMAC(ch,opname) \
        ctype*  restrict y, inc_t incy, \
        ctype*  restrict rho, \
        ctype*  restrict z, inc_t incz, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      );
 
 
@@ -103,7 +103,7 @@ void PASTEMAC(ch,opname) \
        ctype*  restrict beta, \
        ctype*  restrict y, inc_t incy, \
        ctype*  restrict z, inc_t incz, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      );
 
 
@@ -120,6 +120,6 @@ void PASTEMAC(ch,opname) \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict beta, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      );
 
diff --git a/frame/1f/bli_l1f_oapi.c b/frame/1f/bli_l1f_oapi.c
index db8fdfb68..f1e65a252 100644
--- a/frame/1f/bli_l1f_oapi.c
+++ b/frame/1f/bli_l1f_oapi.c
@@ -45,11 +45,11 @@
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alphax, \
-       obj_t*  alphay, \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  z  \
+       const obj_t* alphax, \
+       const obj_t* alphay, \
+       const obj_t* x, \
+       const obj_t* y, \
+       const obj_t* z  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -76,7 +76,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     alphay_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alphax, alphay, x, y, z ); \
+		PASTEMAC(opname,_check)( alphax, alphay, x, y, z ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -115,10 +115,10 @@ GENFRONT( axpy2v )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* x, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -145,7 +145,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     alpha_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, a, x, y ); \
+		PASTEMAC(opname,_check)( alpha, a, x, y ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -184,12 +184,12 @@ GENFRONT( axpyf )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  xt, \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  rho, \
-       obj_t*  z  \
+       const obj_t* alpha, \
+       const obj_t* xt, \
+       const obj_t* x, \
+       const obj_t* y, \
+       const obj_t* rho, \
+       const obj_t* z  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -216,7 +216,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     alpha_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, xt, x, y, rho, z ); \
+		PASTEMAC(opname,_check)( alpha, xt, x, y, rho, z ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -253,14 +253,14 @@ GENFRONT( dotaxpyv )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  at, \
-       obj_t*  a, \
-       obj_t*  w, \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y, \
-       obj_t*  z  \
+       const obj_t* alpha, \
+       const obj_t* at, \
+       const obj_t* a, \
+       const obj_t* w, \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y, \
+       const obj_t* z  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -295,7 +295,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     beta_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, at, a, w, x, beta, y, z ); \
+		PASTEMAC(opname,_check)( alpha, at, a, w, x, beta, y, z ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -342,11 +342,11 @@ GENFRONT( dotxaxpyf )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -375,7 +375,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     beta_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \
+		PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
diff --git a/frame/1f/bli_l1f_oapi.h b/frame/1f/bli_l1f_oapi.h
index 0348c4871..d0d53a6df 100644
--- a/frame/1f/bli_l1f_oapi.h
+++ b/frame/1f/bli_l1f_oapi.h
@@ -42,11 +42,11 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alphax, \
-       obj_t*  alphay, \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  z  \
+       const obj_t* alphax, \
+       const obj_t* alphay, \
+       const obj_t* x, \
+       const obj_t* y, \
+       const obj_t* z  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -58,10 +58,10 @@ GENTPROT( axpy2v )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* x, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -73,12 +73,12 @@ GENTPROT( axpyf )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  xt, \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  rho, \
-       obj_t*  z  \
+       const obj_t* alpha, \
+       const obj_t* xt, \
+       const obj_t* x, \
+       const obj_t* y, \
+       const obj_t* rho, \
+       const obj_t* z  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -90,14 +90,14 @@ GENTPROT( dotaxpyv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  at, \
-       obj_t*  a, \
-       obj_t*  w, \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y, \
-       obj_t*  z  \
+       const obj_t* alpha, \
+       const obj_t* at, \
+       const obj_t* a, \
+       const obj_t* w, \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y, \
+       const obj_t* z  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -109,11 +109,11 @@ GENTPROT( dotxaxpyf )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
diff --git a/frame/1f/bli_l1f_tapi.c b/frame/1f/bli_l1f_tapi.c
index a54379299..04d100cb3 100644
--- a/frame/1f/bli_l1f_tapi.c
+++ b/frame/1f/bli_l1f_tapi.c
@@ -45,14 +45,14 @@
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   n, \
-       ctype*  alphax, \
-       ctype*  alphay, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  z, inc_t incz  \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  n, \
+       const ctype* alphax, \
+       const ctype* alphay, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             ctype* z, inc_t incz  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -72,12 +72,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  conjx, \
 	  conjy, \
 	  n, \
-	  alphax, \
-	  alphay, \
-	  x, incx, \
-	  y, incy, \
-	  z, incz, \
-	  cntx  \
+	  ( ctype* )alphax, \
+	  ( ctype* )alphay, \
+	  ( ctype* )x, incx, \
+	  ( ctype* )y, incy, \
+	            z, incz, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -89,14 +89,14 @@ INSERT_GENTFUNC_BASIC( axpy2v, BLIS_AXPY2V_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conja, \
-       conj_t  conjx, \
-       dim_t   m, \
-       dim_t   b_n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t inca, inc_t lda, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy  \
+             conj_t conja, \
+             conj_t conjx, \
+             dim_t  m, \
+             dim_t  b_n, \
+       const ctype* alpha, \
+       const ctype* a, inc_t inca, inc_t lda, \
+       const ctype* x, inc_t incx, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -117,11 +117,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  conjx, \
 	  m, \
 	  b_n, \
-	  alpha, \
-	  a, inca, lda, \
-	  x, incx, \
-	  y, incy, \
-	  cntx  \
+	  ( ctype* )alpha, \
+	  ( ctype* )a, inca, lda, \
+	  ( ctype* )x, incx, \
+	            y, incy, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -133,15 +133,15 @@ INSERT_GENTFUNC_BASIC( axpyf, BLIS_AXPYF_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjxt, \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  rho, \
-       ctype*  z, inc_t incz  \
+             conj_t conjxt, \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  n, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             ctype* rho, \
+             ctype* z, inc_t incz  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -162,12 +162,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  conjx, \
 	  conjy, \
 	  n, \
-	  alpha, \
-	  x, incx, \
-	  y, incy, \
-	  rho, \
-	  z, incz, \
-	  cntx  \
+	  ( ctype* )alpha, \
+	  ( ctype* )x, incx, \
+	  ( ctype* )y, incy, \
+	            rho, \
+	            z, incz, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -179,19 +179,19 @@ INSERT_GENTFUNC_BASIC( dotaxpyv, BLIS_DOTAXPYV_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjat, \
-       conj_t  conja, \
-       conj_t  conjw, \
-       conj_t  conjx, \
-       dim_t   m, \
-       dim_t   b_n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t inca, inc_t lda, \
-       ctype*  w, inc_t incw, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy, \
-       ctype*  z, inc_t incz  \
+             conj_t conjat, \
+             conj_t conja, \
+             conj_t conjw, \
+             conj_t conjx, \
+             dim_t  m, \
+             dim_t  b_n, \
+       const ctype* alpha, \
+       const ctype* a, inc_t inca, inc_t lda, \
+       const ctype* w, inc_t incw, \
+       const ctype* x, inc_t incx, \
+       const ctype* beta, \
+             ctype* y, inc_t incy, \
+             ctype* z, inc_t incz  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -214,14 +214,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  conjx, \
 	  m, \
 	  b_n, \
-	  alpha, \
-	  a, inca, lda, \
-	  w, incw, \
-	  x, incx, \
-	  beta, \
-	  y, incy, \
-	  z, incz, \
-	  cntx  \
+	  ( ctype* )alpha, \
+	  ( ctype* )a, inca, lda, \
+	  ( ctype* )w, incw, \
+	  ( ctype* )x, incx, \
+	  ( ctype* )beta, \
+	            y, incy, \
+	            z, incz, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -233,15 +233,15 @@ INSERT_GENTFUNC_BASIC( dotxaxpyf, BLIS_DOTXAXPYF_KER )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjat, \
-       conj_t  conjx, \
-       dim_t   m, \
-       dim_t   b_n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t inca, inc_t lda, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy  \
+             conj_t conjat, \
+             conj_t conjx, \
+             dim_t  m, \
+             dim_t  b_n, \
+       const ctype* alpha, \
+       const ctype* a, inc_t inca, inc_t lda, \
+       const ctype* x, inc_t incx, \
+       const ctype* beta, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -262,12 +262,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  conjx, \
 	  m, \
 	  b_n, \
-	  alpha, \
-	  a, inca, lda, \
-	  x, incx, \
-	  beta, \
-	  y, incy, \
-	  cntx  \
+	  ( ctype* )alpha, \
+	  ( ctype* )a, inca, lda, \
+	  ( ctype* )x, incx, \
+	  ( ctype* )beta, \
+	            y, incy, \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
diff --git a/frame/1f/bli_l1f_tapi.h b/frame/1f/bli_l1f_tapi.h
index 2138b989d..2ea54df4c 100644
--- a/frame/1f/bli_l1f_tapi.h
+++ b/frame/1f/bli_l1f_tapi.h
@@ -42,14 +42,14 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   n, \
-       ctype*  alphax, \
-       ctype*  alphay, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  z, inc_t incz  \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  n, \
+       const ctype* alphax, \
+       const ctype* alphay, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             ctype* z, inc_t incz  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -61,14 +61,14 @@ INSERT_GENTPROT_BASIC0( axpy2v )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conja, \
-       conj_t  conjx, \
-       dim_t   m, \
-       dim_t   b_n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t inca, inc_t lda, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy  \
+             conj_t conja, \
+             conj_t conjx, \
+             dim_t  m, \
+             dim_t  b_n, \
+       const ctype* alpha, \
+       const ctype* a, inc_t inca, inc_t lda, \
+       const ctype* x, inc_t incx, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -80,15 +80,15 @@ INSERT_GENTPROT_BASIC0( axpyf )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjxt, \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  rho, \
-       ctype*  z, inc_t incz  \
+             conj_t conjxt, \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  n, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             ctype* rho, \
+             ctype* z, inc_t incz  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -100,19 +100,19 @@ INSERT_GENTPROT_BASIC0( dotaxpyv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjat, \
-       conj_t  conja, \
-       conj_t  conjw, \
-       conj_t  conjx, \
-       dim_t   m, \
-       dim_t   b_n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t inca, inc_t lda, \
-       ctype*  w, inc_t incw, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy, \
-       ctype*  z, inc_t incz  \
+             conj_t conjat, \
+             conj_t conja, \
+             conj_t conjw, \
+             conj_t conjx, \
+             dim_t  m, \
+             dim_t  b_n, \
+       const ctype* alpha, \
+       const ctype* a, inc_t inca, inc_t lda, \
+       const ctype* w, inc_t incw, \
+       const ctype* x, inc_t incx, \
+       const ctype* beta, \
+             ctype* y, inc_t incy, \
+             ctype* z, inc_t incz  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -124,15 +124,15 @@ INSERT_GENTPROT_BASIC0( dotxaxpyf )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjat, \
-       conj_t  conjx, \
-       dim_t   m, \
-       dim_t   b_n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t inca, inc_t lda, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy  \
+             conj_t conjat, \
+             conj_t conjx, \
+             dim_t  m, \
+             dim_t  b_n, \
+       const ctype* alpha, \
+       const ctype* a, inc_t inca, inc_t lda, \
+       const ctype* x, inc_t incx, \
+       const ctype* beta, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
diff --git a/frame/1m/bli_l1m_check.c b/frame/1m/bli_l1m_check.c
index 8914e43b1..f5d4bf1b4 100644
--- a/frame/1m/bli_l1m_check.c
+++ b/frame/1m/bli_l1m_check.c
@@ -43,8 +43,8 @@
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* y  \
      ) \
 { \
 	bli_l1m_xy_check( x, y ); \
@@ -60,9 +60,9 @@ GENFRONT( subm )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* y  \
      ) \
 { \
 	bli_l1m_axy_check( alpha, x, y ); \
@@ -77,8 +77,8 @@ GENFRONT( scal2m )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x  \
+       const obj_t* alpha, \
+       const obj_t* x  \
      ) \
 { \
 	bli_l1m_ax_check( alpha, x ); \
@@ -93,9 +93,9 @@ GENFRONT( setm )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
      ) \
 { \
 	bli_l1m_axy_check( beta, x, y ); \
@@ -108,8 +108,8 @@ GENFRONT( xpbym )
 
 void bli_l1m_xy_check
      (
-       obj_t*  x,
-       obj_t*  y 
+       const obj_t* x,
+       const obj_t* y
      )
 {
 	err_t e_val;
@@ -149,9 +149,9 @@ void bli_l1m_xy_check
 
 void bli_l1m_axy_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y 
+       const obj_t* alpha,
+       const obj_t* x,
+       const obj_t* y
      )
 {
 	err_t e_val;
@@ -200,8 +200,8 @@ void bli_l1m_axy_check
 
 void bli_l1m_ax_check
      (
-       obj_t*  alpha,
-       obj_t*  x 
+       const obj_t* alpha,
+       const obj_t* x
      )
 {
 	err_t e_val;
diff --git a/frame/1m/bli_l1m_check.h b/frame/1m/bli_l1m_check.h
index 030c0e219..6089dfa17 100644
--- a/frame/1m/bli_l1m_check.h
+++ b/frame/1m/bli_l1m_check.h
@@ -42,8 +42,8 @@
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* y  \
     );
 
 GENPROT( addm )
@@ -56,9 +56,9 @@ GENPROT( subm )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* y  \
     );
 
 GENPROT( axpym )
@@ -70,8 +70,8 @@ GENPROT( scal2m )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x  \
+       const obj_t* alpha, \
+       const obj_t* x  \
     );
 
 GENPROT( scalm )
@@ -83,9 +83,9 @@ GENPROT( setm )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
     );
 
 GENPROT( xpbym )
@@ -95,20 +95,20 @@ GENPROT( xpbym )
 
 void bli_l1m_xy_check
      (
-       obj_t*  x,
-       obj_t*  y 
+       const obj_t*  x,
+       const obj_t*  y
      );
 
 void bli_l1m_axy_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y 
+       const obj_t*  alpha,
+       const obj_t*  x,
+       const obj_t*  y
      );
 
 void bli_l1m_ax_check
      (
-       obj_t*  alpha,
-       obj_t*  x 
+       const obj_t*  alpha,
+       const obj_t*  x
      );
 
diff --git a/frame/1m/bli_l1m_ft.h b/frame/1m/bli_l1m_ft.h
index af6c384e5..36d06b2fe 100644
--- a/frame/1m/bli_l1m_ft.h
+++ b/frame/1m/bli_l1m_ft.h
@@ -44,14 +44,14 @@
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             uplo_t  uplox, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -66,15 +66,15 @@ INSERT_GENTDEF( copym )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             uplo_t  uplox, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -87,15 +87,15 @@ INSERT_GENTDEF( axpym )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             uplo_t  uplox, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -108,14 +108,14 @@ INSERT_GENTDEF( scal2m )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       conj_t  conjalpha, \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x  \
+             conj_t conjalpha, \
+             doff_t diagoffx, \
+             diag_t diagx, \
+             uplo_t uplox, \
+             dim_t  m, \
+             dim_t  n, \
+       const ctype* alpha, \
+             ctype* x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -129,15 +129,15 @@ INSERT_GENTDEF( setm )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  beta, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             uplo_t  uplox, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+       const ctype*  beta, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
diff --git a/frame/1m/bli_l1m_ft_ker.h b/frame/1m/bli_l1m_ft_ker.h
index 41d80e217..f25c3c943 100644
--- a/frame/1m/bli_l1m_ft_ker.h
+++ b/frame/1m/bli_l1m_ft_ker.h
@@ -90,7 +90,7 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
        ctype*  restrict kappa, \
        ctype*  restrict a, inc_t inca, inc_t lda, \
        ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      );
 
 INSERT_GENTDEF( packm_cxk )
@@ -109,7 +109,7 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
        ctype*  restrict kappa, \
        ctype*  restrict p,             inc_t ldp, \
        ctype*  restrict a, inc_t inca, inc_t lda, \
-       cntx_t* restrict cntx \
+       cntx_t*          cntx \
      );
 
 INSERT_GENTDEF( unpackm_cxk )
@@ -132,7 +132,7 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
        ctype*  restrict kappa, \
        ctype*  restrict a, inc_t inca, inc_t lda, \
        ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx \
+       cntx_t*          cntx \
      );
 
 INSERT_GENTDEF( packm_cxc_diag )
diff --git a/frame/1m/bli_l1m_ker_prot.h b/frame/1m/bli_l1m_ker_prot.h
index 80284ea22..8430614d2 100644
--- a/frame/1m/bli_l1m_ker_prot.h
+++ b/frame/1m/bli_l1m_ker_prot.h
@@ -51,7 +51,7 @@ void PASTEMAC(ch,varname) \
        ctype*  restrict kappa, \
        ctype*  restrict a, inc_t inca, inc_t lda, \
        ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      );
 
 
@@ -68,7 +68,7 @@ void PASTEMAC(ch,varname) \
        ctype*  restrict kappa, \
        ctype*  restrict p,             inc_t ldp, \
        ctype*  restrict a, inc_t inca, inc_t lda, \
-       cntx_t* restrict cntx \
+       cntx_t*          cntx \
      );
 
 
@@ -89,6 +89,6 @@ void PASTEMAC(ch,varname) \
        ctype*  restrict kappa, \
        ctype*  restrict a, inc_t inca, inc_t lda, \
        ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx \
+       cntx_t*          cntx \
      );
 
diff --git a/frame/1m/bli_l1m_oapi.c b/frame/1m/bli_l1m_oapi.c
index 840b058d4..7520afce7 100644
--- a/frame/1m/bli_l1m_oapi.c
+++ b/frame/1m/bli_l1m_oapi.c
@@ -45,8 +45,8 @@
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -70,7 +70,7 @@ void PASTEMAC(opname,EX_SUF) \
 	inc_t     cs_y      = bli_obj_col_stride( y ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x, y ); \
+		PASTEMAC(opname,_check)( x, y ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -102,9 +102,9 @@ GENFRONT( subm )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -132,7 +132,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     alpha_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, x, y ); \
+		PASTEMAC(opname,_check)( alpha, x, y ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -170,8 +170,8 @@ GENFRONT( scal2m )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x  \
+       const obj_t* alpha, \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -197,7 +197,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     x_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, x ); \
+		PASTEMAC(opname,_check)( alpha, x ); \
 \
 	/* Alias x to x_local so we can apply alpha if it is non-unit. */ \
 	bli_obj_alias_to( x, &x_local ); \
@@ -245,8 +245,8 @@ GENFRONT( scalm )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x  \
+       const obj_t* alpha, \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -271,7 +271,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     alpha_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, x ); \
+		PASTEMAC(opname,_check)( alpha, x ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -307,9 +307,9 @@ GENFRONT( setm )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -340,7 +340,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     beta_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x, beta, y ); \
+		PASTEMAC(opname,_check)( x, beta, y ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -377,9 +377,9 @@ GENFRONT( xpbym )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
diff --git a/frame/1m/bli_l1m_oapi.h b/frame/1m/bli_l1m_oapi.h
index a6a94cf9f..9510f1aee 100644
--- a/frame/1m/bli_l1m_oapi.h
+++ b/frame/1m/bli_l1m_oapi.h
@@ -42,8 +42,8 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -57,9 +57,9 @@ GENPROT( subm )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -72,8 +72,8 @@ GENPROT( scal2m )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x  \
+       const obj_t* alpha, \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -86,9 +86,9 @@ GENPROT( setm )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
diff --git a/frame/1m/bli_l1m_oft_var.h b/frame/1m/bli_l1m_oft_var.h
index 0b60d4e2f..325ed0ecf 100644
--- a/frame/1m/bli_l1m_oft_var.h
+++ b/frame/1m/bli_l1m_oft_var.h
@@ -45,12 +45,12 @@
 \
 typedef void (*PASTECH(opname,_var_oft)) \
 ( \
-  obj_t*  a, \
-  obj_t*  p, \
-  cntx_t* cntx, \
-  rntm_t* rntm, \
-  cntl_t* cntl, \
-  thrinfo_t* thread  \
+  const obj_t*  a, \
+        obj_t*  p, \
+  const cntx_t* cntx, \
+        rntm_t* rntm, \
+        cntl_t* cntl, \
+  const thrinfo_t* thread  \
 );
 
 GENTDEF( packm )
@@ -61,11 +61,11 @@ GENTDEF( packm )
 \
 typedef void (*PASTECH(opname,_var_oft)) \
 ( \
-  obj_t*  p, \
-  obj_t*  a, \
-  cntx_t* cntx, \
-  cntl_t* cntl, \
-  thrinfo_t* thread  \
+  const obj_t*  p, \
+  const obj_t*  a, \
+  const cntx_t* cntx, \
+  const cntl_t* cntl, \
+  const thrinfo_t* thread  \
 );
 
 GENTDEF( unpackm )
diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c
index 2b3c4bb4a..6b802b9fe 100644
--- a/frame/1m/bli_l1m_tapi.c
+++ b/frame/1m/bli_l1m_tapi.c
@@ -45,14 +45,14 @@
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             uplo_t  uplox, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -75,9 +75,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  transx, \
 	  m, \
 	  n, \
-	  x, rs_x, cs_x, \
-	  y, rs_y, cs_y, \
-	  cntx, \
+	  ( ctype* )x, rs_x, cs_x, \
+	            y, rs_y, cs_y, \
+	  ( cntx_t* )cntx, \
 	  rntm  \
 	); \
 \
@@ -110,14 +110,14 @@ INSERT_GENTFUNC_BASIC( subm, subd )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             uplo_t  uplox, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -140,9 +140,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  transx, \
 	  m, \
 	  n, \
-	  x, rs_x, cs_x, \
-	  y, rs_y, cs_y, \
-	  cntx, \
+	  ( ctype* )x, rs_x, cs_x, \
+	            y, rs_y, cs_y, \
+	  ( cntx_t* )cntx, \
 	  rntm  \
 	); \
 \
@@ -179,15 +179,15 @@ INSERT_GENTFUNC_BASIC0( copym )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             uplo_t  uplox, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -213,10 +213,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  transx, \
 	  m, \
 	  n, \
-	  alpha, \
-	  x, rs_x, cs_x, \
-	  y, rs_y, cs_y, \
-	  cntx, \
+	  ( ctype* )alpha, \
+	  ( ctype* )x, rs_x, cs_x, \
+	            y, rs_y, cs_y, \
+	  ( cntx_t* )cntx, \
 	  rntm  \
 	); \
 \
@@ -249,15 +249,15 @@ INSERT_GENTFUNC_BASIC0( axpym )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             uplo_t  uplox, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -302,10 +302,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  transx, \
 	  m, \
 	  n, \
-	  alpha, \
-	  x, rs_x, cs_x, \
-	  y, rs_y, cs_y, \
-	  cntx, \
+	  ( ctype* )alpha, \
+	  ( ctype* )x, rs_x, cs_x, \
+	            y, rs_y, cs_y, \
+	  ( cntx_t* )cntx, \
 	  rntm  \
 	); \
 \
@@ -341,14 +341,14 @@ INSERT_GENTFUNC_BASIC0( scal2m )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjalpha, \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x  \
+             conj_t conjalpha, \
+             doff_t diagoffx, \
+             diag_t diagx, \
+             uplo_t uplox, \
+             dim_t  m, \
+             dim_t  n, \
+       const ctype* alpha, \
+             ctype* x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -371,9 +371,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  uplox, \
 	  m, \
 	  n, \
-	  alpha, \
-	  x, rs_x, cs_x, \
-	  cntx, \
+	  ( ctype* )alpha, \
+	            x, rs_x, cs_x, \
+	  ( cntx_t* )cntx, \
 	  rntm  \
 	); \
 }
@@ -387,15 +387,15 @@ INSERT_GENTFUNC_BASIC0( setm )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  beta, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             uplo_t  uplox, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+       const ctype*  beta, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -419,9 +419,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 		  transx, \
 		  m, \
 		  n, \
-		  x, rs_x, cs_x, \
-		  y, rs_y, cs_y, \
-		  cntx, \
+		  ( ctype* )x, rs_x, cs_x, \
+		            y, rs_y, cs_y, \
+		  ( cntx_t* )cntx, \
 		  rntm  \
 		); \
 \
@@ -438,10 +438,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  transx, \
 	  m, \
 	  n, \
-	  x, rs_x, cs_x, \
-	  beta, \
-	  y, rs_y, cs_y, \
-	  cntx, \
+	  ( ctype* )x, rs_x, cs_x, \
+	  ( ctype* )beta, \
+	            y, rs_y, cs_y, \
+	  ( cntx_t* )cntx, \
 	  rntm  \
 	); \
 \
@@ -474,15 +474,15 @@ INSERT_GENTFUNC_BASIC0( xpbym )
 \
 void PASTEMAC3(chx,chy,opname,EX_SUF) \
      ( \
-       doff_t   diagoffx, \
-       diag_t   diagx, \
-       uplo_t   uplox, \
-       trans_t  transx, \
-       dim_t    m, \
-       dim_t    n, \
-       ctype_x* x, inc_t rs_x, inc_t cs_x, \
-       ctype_y* beta, \
-       ctype_y* y, inc_t rs_y, inc_t cs_y  \
+             doff_t   diagoffx, \
+             diag_t   diagx, \
+             uplo_t   uplox, \
+             trans_t  transx, \
+             dim_t    m, \
+             dim_t    n, \
+       const ctype_x* x, inc_t rs_x, inc_t cs_x, \
+       const ctype_y* beta, \
+             ctype_y* y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -520,10 +520,10 @@ void PASTEMAC3(chx,chy,opname,EX_SUF) \
 	  transx, \
 	  m, \
 	  n, \
-	  x, rs_x, cs_x, \
-	  beta, \
-	  y, rs_y, cs_y, \
-	  cntx, \
+	  ( ctype_x* )x, rs_x, cs_x, \
+	  ( ctype_y* )beta, \
+	              y, rs_y, cs_y, \
+	  ( cntx_t* )cntx, \
 	  rntm  \
 	); \
 }
diff --git a/frame/1m/bli_l1m_tapi.h b/frame/1m/bli_l1m_tapi.h
index 03a1196ed..68646a71f 100644
--- a/frame/1m/bli_l1m_tapi.h
+++ b/frame/1m/bli_l1m_tapi.h
@@ -42,14 +42,14 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             uplo_t  uplox, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -63,15 +63,15 @@ INSERT_GENTPROT_BASIC0( subm )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             uplo_t  uplox, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -84,14 +84,14 @@ INSERT_GENTPROT_BASIC0( scal2m )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjalpha, \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t rs_x, inc_t cs_x  \
+             conj_t conjalpha, \
+             doff_t diagoffx, \
+             diag_t diagx, \
+             uplo_t uplox, \
+             dim_t  m, \
+             dim_t  n, \
+       const ctype* alpha, \
+             ctype* x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -104,15 +104,15 @@ INSERT_GENTPROT_BASIC0( setm )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  beta, \
-       ctype*  y, inc_t rs_y, inc_t cs_y  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             uplo_t  uplox, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+       const ctype*  beta, \
+             ctype*  y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -124,15 +124,15 @@ INSERT_GENTPROT_BASIC0( xpbym )
 \
 BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
      ( \
-       doff_t   diagoffx, \
-       diag_t   diagx, \
-       uplo_t   uplox, \
-       trans_t  transx, \
-       dim_t    m, \
-       dim_t    n, \
-       ctype_x* x, inc_t rs_x, inc_t cs_x, \
-       ctype_y* beta, \
-       ctype_y* y, inc_t rs_y, inc_t cs_y  \
+             doff_t   diagoffx, \
+             diag_t   diagx, \
+             uplo_t   uplox, \
+             trans_t  transx, \
+             dim_t    m, \
+             dim_t    n, \
+       const ctype_x* x, inc_t rs_x, inc_t cs_x, \
+       const ctype_y* beta, \
+             ctype_y* y, inc_t rs_y, inc_t cs_y  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
diff --git a/frame/1m/packm/bli_packm_alloc.c b/frame/1m/packm/bli_packm_alloc.c
index b12a93ddc..22ed31ecc 100644
--- a/frame/1m/packm/bli_packm_alloc.c
+++ b/frame/1m/packm/bli_packm_alloc.c
@@ -37,10 +37,10 @@
 
 void* bli_packm_alloc
      (
-       siz_t      size_needed,
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
+             siz_t      size_needed,
+             rntm_t*    rntm,
+             cntl_t*    cntl,
+       const thrinfo_t* thread
      )
 {
 	// Query the pack buffer type from the control tree node.
@@ -58,11 +58,11 @@ void* bli_packm_alloc
 
 void* bli_packm_alloc_ex
      (
-       siz_t      size_needed,
-       packbuf_t  pack_buf_type,
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
+             siz_t      size_needed,
+             packbuf_t  pack_buf_type,
+             rntm_t*    rntm,
+             cntl_t*    cntl,
+       const thrinfo_t* thread
      )
 {
 	// Query the address of the mem_t entry within the control tree node.
diff --git a/frame/1m/packm/bli_packm_alloc.h b/frame/1m/packm/bli_packm_alloc.h
index 5a5cf126b..aec2e1af5 100644
--- a/frame/1m/packm/bli_packm_alloc.h
+++ b/frame/1m/packm/bli_packm_alloc.h
@@ -34,18 +34,18 @@
 
 BLIS_EXPORT_BLIS void* bli_packm_alloc
      (
-       siz_t      size_needed,
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
+             siz_t      size_needed,
+             rntm_t*    rntm,
+             cntl_t*    cntl,
+       const thrinfo_t* thread
      );
 
 BLIS_EXPORT_BLIS void* bli_packm_alloc_ex
      (
-       siz_t      size_needed,
-       packbuf_t  pack_buf_type,
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
+             siz_t      size_needed,
+             packbuf_t  pack_buf_type,
+             rntm_t*    rntm,
+             cntl_t*    cntl,
+       const thrinfo_t* thread
      );
 
diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c
index e13391151..601f2c05c 100644
--- a/frame/1m/packm/bli_packm_blk_var1.c
+++ b/frame/1m/packm/bli_packm_blk_var1.c
@@ -54,12 +54,12 @@ static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md);
 
 void bli_packm_blk_var1
      (
-       obj_t*   c,
-       obj_t*   p,
-       cntx_t*  cntx,
-       rntm_t*  rntm,
-       cntl_t*  cntl,
-       thrinfo_t* thread
+       const obj_t*   c,
+             obj_t*   p,
+       const cntx_t*  cntx,
+             rntm_t*  rntm,
+             cntl_t*  cntl,
+       const thrinfo_t* thread
      )
 {
 	// Extract various fields from the control tree.
@@ -271,7 +271,7 @@ void bli_packm_blk_var1
 				                c_use, incc, ldc,
 				                p_use,       ldp,
 				                       is_p_use,
-				                cntx,
+				                ( cntx_t* )cntx,
 				                params );
 			}
 
@@ -303,7 +303,7 @@ void bli_packm_blk_var1
 				                kappa_cast,
 				                c_begin, incc, ldc,
 				                p_begin,       ldp, is_p,
-				                cntx,
+				                ( cntx_t* )cntx,
 				                params );
 			}
 		}
diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h
index 9cda5828b..5797e3b94 100644
--- a/frame/1m/packm/bli_packm_blk_var1.h
+++ b/frame/1m/packm/bli_packm_blk_var1.h
@@ -39,8 +39,8 @@
 
 typedef struct
 {
-    //                   Type of C          Type of P
-    packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
+	//                   Type of C          Type of P
+	packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
 } packm_blk_var1_params_t;
 
 //
@@ -49,11 +49,11 @@ typedef struct
 
 BLIS_EXPORT_BLIS void bli_packm_blk_var1
      (
-       obj_t*   c,
-       obj_t*   p,
-       cntx_t*  cntx,
-       rntm_t*  rntm,
-       cntl_t*  cntl,
-       thrinfo_t* t
+       const obj_t*   c,
+             obj_t*   p,
+       const cntx_t*  cntx,
+             rntm_t*  rntm,
+             cntl_t*  cntl,
+       const thrinfo_t* t
      );
 
diff --git a/frame/1m/packm/bli_packm_check.c b/frame/1m/packm/bli_packm_check.c
index e662a85df..15bd032ca 100644
--- a/frame/1m/packm/bli_packm_check.c
+++ b/frame/1m/packm/bli_packm_check.c
@@ -37,9 +37,9 @@
 
 void bli_packm_init_check
      (
-       obj_t*  a,
-       obj_t*  p,
-       cntx_t* cntx
+       const obj_t*  a,
+       const obj_t*  p,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
@@ -59,9 +59,9 @@ void bli_packm_init_check
 
 void bli_packm_int_check
      (
-       obj_t*  a,
-       obj_t*  p,
-       cntx_t* cntx
+       const obj_t*  a,
+       const obj_t*  p,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
diff --git a/frame/1m/packm/bli_packm_check.h b/frame/1m/packm/bli_packm_check.h
index be375fcf7..da9399b31 100644
--- a/frame/1m/packm/bli_packm_check.h
+++ b/frame/1m/packm/bli_packm_check.h
@@ -34,15 +34,15 @@
 
 void bli_packm_init_check
      (
-       obj_t*  a,
-       obj_t*  p,
-       cntx_t* cntx
+       const obj_t*  a,
+       const obj_t*  p,
+       const cntx_t* cntx
      );
 
 void bli_packm_int_check
      (
-       obj_t*  a,
-       obj_t*  p,
-       cntx_t* cntx
+       const obj_t*  a,
+       const obj_t*  p,
+       const cntx_t* cntx
      );
 
diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h
index 14bfe1ce8..be0fc8fde 100644
--- a/frame/1m/packm/bli_packm_cntl.h
+++ b/frame/1m/packm/bli_packm_cntl.h
@@ -35,48 +35,48 @@
 
 struct packm_params_s
 {
-	uint64_t      size; // size field must be present and come first.
-	bszid_t       bmid_m;
-	bszid_t       bmid_n;
-	bool          does_invert_diag;
-	bool          rev_iter_if_upper;
-	bool          rev_iter_if_lower;
-	pack_t        pack_schema;
-	packbuf_t     pack_buf_type;
+	uint64_t  size; // size field must be present and come first.
+	bszid_t   bmid_m;
+	bszid_t   bmid_n;
+	bool      does_invert_diag;
+	bool      rev_iter_if_upper;
+	bool      rev_iter_if_lower;
+	pack_t    pack_schema;
+	packbuf_t pack_buf_type;
 };
 typedef struct packm_params_s packm_params_t;
 
-BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl )
+BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( const cntl_t* cntl )
 {
 	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m;
 }
 
-BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl )
+BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( const cntl_t* cntl )
 {
 	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n;
 }
 
-BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl )
+BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( const cntl_t* cntl )
 {
 	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag;
 }
 
-BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl )
+BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( const cntl_t* cntl )
 {
 	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper;
 }
 
-BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl )
+BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( const cntl_t* cntl )
 {
 	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower;
 }
 
-BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl )
+BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( const cntl_t* cntl )
 {
 	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema;
 }
 
-BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl )
+BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( const cntl_t* cntl )
 {
 	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type;
 }
diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c
index 5a7d716fe..67e02ac0e 100644
--- a/frame/1m/packm/bli_packm_init.c
+++ b/frame/1m/packm/bli_packm_init.c
@@ -37,12 +37,12 @@
 
 bool bli_packm_init
      (
-       obj_t*  c,
-       obj_t*  p,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  c,
+             obj_t*  p,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+       const thrinfo_t* thread
      )
 {
 	bli_init_once();
diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h
index 152c6f15c..6f9b47273 100644
--- a/frame/1m/packm/bli_packm_init.h
+++ b/frame/1m/packm/bli_packm_init.h
@@ -34,11 +34,11 @@
 
 BLIS_EXPORT_BLIS bool bli_packm_init
      (
-       obj_t*  a,
-       obj_t*  p,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+             obj_t*  p,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+       const thrinfo_t* thread
      );
 
diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c
index c9a2bb9db..f76607508 100644
--- a/frame/1m/packm/bli_packm_int.c
+++ b/frame/1m/packm/bli_packm_int.c
@@ -36,12 +36,12 @@
 
 void bli_packm_int
      (
-       obj_t*  a,
-       obj_t*  p,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+             obj_t*  p,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+       const thrinfo_t* thread
      )
 {
 	bli_init_once();
diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h
index 16a5c2c34..a4cf17d59 100644
--- a/frame/1m/packm/bli_packm_int.h
+++ b/frame/1m/packm/bli_packm_int.h
@@ -34,10 +34,10 @@
 
 void bli_packm_int
      (
-       obj_t*  a,
-       obj_t*  p,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+             obj_t*  p,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+       const thrinfo_t* thread
      );
diff --git a/frame/1m/packm/bli_packm_part.c b/frame/1m/packm/bli_packm_part.c
index 2fff4b7c8..feaaaeea8 100644
--- a/frame/1m/packm/bli_packm_part.c
+++ b/frame/1m/packm/bli_packm_part.c
@@ -38,11 +38,11 @@
 // -- Matrix partitioning ------------------------------------------------------
 
 
-void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
-                                  dim_t     i,
-                                  dim_t     b,
-                                  obj_t*    obj,
-                                  obj_t*    sub_obj )
+void bli_packm_acquire_mpart_t2b( subpart_t    requested_part,
+                                  dim_t        i,
+                                  dim_t        b,
+                                  const obj_t* obj,
+                                        obj_t* sub_obj )
 {
 	dim_t m, n;
 
@@ -110,11 +110,11 @@ void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
 
 
-void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
-                                  dim_t     j,
-                                  dim_t     b,
-                                  obj_t*    obj,
-                                  obj_t*    sub_obj )
+void bli_packm_acquire_mpart_l2r( subpart_t    requested_part,
+                                  dim_t        j,
+                                  dim_t        b,
+                                  const obj_t* obj,
+                                        obj_t* sub_obj )
 {
 	dim_t m, n;
 
@@ -186,18 +186,18 @@ void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
 
 
-void bli_packm_acquire_mpart_tl2br( subpart_t requested_part,
-                                    dim_t     ij,
-                                    dim_t     b,
-                                    obj_t*    obj,
-                                    obj_t*    sub_obj )
+void bli_packm_acquire_mpart_tl2br( subpart_t    requested_part,
+                                    dim_t        ij,
+                                    dim_t        b,
+                                    const obj_t* obj,
+                                          obj_t* sub_obj )
 {
 	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
 }
 
 
-dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p )
+dim_t bli_packm_offset_to_panel_for( dim_t offmn, const obj_t* p )
 {
 	dim_t panel_off;
 
diff --git a/frame/1m/packm/bli_packm_part.h b/frame/1m/packm/bli_packm_part.h
index 5930d312e..39ee69a2c 100644
--- a/frame/1m/packm/bli_packm_part.h
+++ b/frame/1m/packm/bli_packm_part.h
@@ -34,23 +34,23 @@
 
 // -- Matrix partitioning ------------------------------------------------------
 
-void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
-                                  dim_t     i,
-                                  dim_t     b,
-                                  obj_t*    obj,
-                                  obj_t*    sub_obj );
-
-void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
-                                  dim_t     j,
-                                  dim_t     b,
-                                  obj_t*    obj,
-                                  obj_t*    sub_obj );
-
-void bli_packm_acquire_mpart_tl2br( subpart_t requested_part,
-                                    dim_t     ij,
-                                    dim_t     b,
-                                    obj_t*    obj,
-                                    obj_t*    sub_obj );
-
-dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p );
+void bli_packm_acquire_mpart_t2b( subpart_t    requested_part,
+                                  dim_t        i,
+                                  dim_t        b,
+                                  const obj_t* obj,
+                                        obj_t* sub_obj );
+
+void bli_packm_acquire_mpart_l2r( subpart_t    requested_part,
+                                  dim_t        j,
+                                  dim_t        b,
+                                  const obj_t* obj,
+                                        obj_t* sub_obj );
+
+void bli_packm_acquire_mpart_tl2br( subpart_t    requested_part,
+                                    dim_t        ij,
+                                    dim_t        b,
+                                    const obj_t* obj,
+                                          obj_t* sub_obj );
+
+dim_t bli_packm_offset_to_panel_for( dim_t offmn, const obj_t* p );
 
diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c
index dbdaf4738..3f0d48dbf 100644
--- a/frame/1m/packm/bli_packm_struc_cxk.c
+++ b/frame/1m/packm/bli_packm_struc_cxk.c
@@ -39,23 +39,23 @@
 \
 void PASTEMAC(ch,varname) \
      ( \
-       struc_t         strucc, \
-       diag_t          diagc, \
-       uplo_t          uploc, \
-       conj_t          conjc, \
-       pack_t          schema, \
-       bool            invdiag, \
-       dim_t           panel_dim, \
-       dim_t           panel_len, \
-       dim_t           panel_dim_max, \
-       dim_t           panel_len_max, \
-       dim_t           panel_dim_off, \
-       dim_t           panel_len_off, \
-       ctype* restrict kappa, \
-       ctype* restrict c, inc_t incc, inc_t ldc, \
-       ctype* restrict p,             inc_t ldp, \
-                          inc_t is_p, \
-       cntx_t*         cntx  \
+       struc_t strucc, \
+       diag_t  diagc, \
+       uplo_t  uploc, \
+       conj_t  conjc, \
+       pack_t  schema, \
+       bool    invdiag, \
+       dim_t   panel_dim, \
+       dim_t   panel_len, \
+       dim_t   panel_dim_max, \
+       dim_t   panel_len_max, \
+       dim_t   panel_dim_off, \
+       dim_t   panel_len_off, \
+       ctype*  kappa, \
+       ctype*  c, inc_t incc, inc_t ldc, \
+       ctype*  p,             inc_t ldp, \
+                  inc_t is_p, \
+       cntx_t* cntx  \
      ) \
 { \
 	num_t   dt            = PASTEMAC(ch,type); \
diff --git a/frame/1m/packm/bli_packm_struc_cxk.h b/frame/1m/packm/bli_packm_struc_cxk.h
index 973a02612..f0293330b 100644
--- a/frame/1m/packm/bli_packm_struc_cxk.h
+++ b/frame/1m/packm/bli_packm_struc_cxk.h
@@ -37,26 +37,24 @@
 \
 void PASTEMAC(ch,varname) \
      ( \
-       struc_t         strucc, \
-       diag_t          diagc, \
-       uplo_t          uploc, \
-       conj_t          conjc, \
-       pack_t          schema, \
-       bool            invdiag, \
-       dim_t           panel_dim, \
-       dim_t           panel_len, \
-       dim_t           panel_dim_max, \
-       dim_t           panel_len_max, \
-       dim_t           panel_dim_off, \
-       dim_t           panel_len_off, \
-       ctype* restrict kappa, \
-       ctype* restrict c, inc_t incc, inc_t ldc, \
-       ctype* restrict p,             inc_t ldp, \
-                          inc_t is_p, \
-       cntx_t*         cntx  \
+       struc_t strucc, \
+       diag_t  diagc, \
+       uplo_t  uploc, \
+       conj_t  conjc, \
+       pack_t  schema, \
+       bool    invdiag, \
+       dim_t   panel_dim, \
+       dim_t   panel_len, \
+       dim_t   panel_dim_max, \
+       dim_t   panel_len_max, \
+       dim_t   panel_dim_off, \
+       dim_t   panel_len_off, \
+       ctype*  kappa, \
+       ctype*  c, inc_t incc, inc_t ldc, \
+       ctype*  p,             inc_t ldp, \
+                  inc_t is_p, \
+       cntx_t* cntx  \
      );
 
 INSERT_GENTPROT_BASIC0( packm_struc_cxk )
-INSERT_GENTPROT_BASIC0( packm_herm_cxk )
-INSERT_GENTPROT_BASIC0( packm_tri_cxk )
 
diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c
index b6165f516..f9f7f511c 100644
--- a/frame/1m/unpackm/bli_unpackm_blk_var1.c
+++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c
@@ -58,11 +58,11 @@ static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var1);
 
 void bli_unpackm_blk_var1
      (
-       obj_t*  p,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  p,
+       const obj_t*  c,
+       const cntx_t* cntx,
+       const cntl_t* cntl,
+       const thrinfo_t* thread
      )
 {
 	num_t     dt_cp     = bli_obj_dt( c );
@@ -108,19 +108,22 @@ void bli_unpackm_blk_var1
 	f = ftypes[dt_cp];
 
 	// Invoke the function.
-	f( strucc,
-	   diagoffc,
-	   diagc,
-	   uploc,
-	   transc,
-	   m_c,
-	   n_c,
-	   m_panel,
-	   n_panel,
-	   buf_p, rs_p, cs_p,
-	          pd_p, ps_p,
-	   buf_c, rs_c, cs_c,
-	   cntx );
+	f
+	(
+	  strucc,
+	  diagoffc,
+	  diagc,
+	  uploc,
+	  transc,
+	  m_c,
+	  n_c,
+	  m_panel,
+	  n_panel,
+	  buf_p, rs_p, cs_p,
+	         pd_p, ps_p,
+	  buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx
+	);
 }
 
 
@@ -144,29 +147,28 @@ void PASTEMAC(ch,varname) \
        cntx_t* cntx  \
      ) \
 { \
-	ctype* restrict one       = PASTEMAC(ch,1); \
-	ctype* restrict c_cast    = c; \
-	ctype* restrict p_cast    = p; \
-	ctype* restrict c_begin; \
-	ctype* restrict p_begin; \
-\
-	dim_t           iter_dim; \
-	dim_t           num_iter; \
-	dim_t           it, ic, ip; \
-	dim_t           ic0, ip0; \
-	doff_t          ic_inc, ip_inc; \
-	doff_t          diagoffc_i; \
-	doff_t          diagoffc_inc; \
-	dim_t           panel_len; \
-	dim_t           panel_dim_i; \
-	dim_t           panel_dim_max; \
-	inc_t           vs_c; \
-	inc_t           incc, ldc; \
-	inc_t           ldp; \
-	dim_t*          m_panel_full; \
-	dim_t*          n_panel_full; \
-	pack_t          schema; \
+	ctype* one    = PASTEMAC(ch,1); \
+	ctype* c_cast = c; \
+	ctype* p_cast = p; \
+	ctype* c_begin; \
+	ctype* p_begin; \
 \
+	dim_t  iter_dim; \
+	dim_t  num_iter; \
+	dim_t  it, ic, ip; \
+	dim_t  ic0, ip0; \
+	doff_t ic_inc, ip_inc; \
+	doff_t diagoffc_i; \
+	doff_t diagoffc_inc; \
+	dim_t  panel_len; \
+	dim_t  panel_dim_i; \
+	dim_t  panel_dim_max; \
+	inc_t  vs_c; \
+	inc_t  incc, ldc; \
+	inc_t  ldp; \
+	dim_t* m_panel_full; \
+	dim_t* n_panel_full; \
+	pack_t schema; \
 \
 	/* If c needs a transposition, induce it so that we can more simply
 	   express the remaining parameters and code. */ \
@@ -274,7 +276,7 @@ void PASTEMAC(ch,varname) \
 			  one, \
 			  p_begin,       ldp, \
 			  c_begin, incc, ldc, \
-			  cntx  \
+			  ( cntx_t* )cntx  \
 			); \
 		} \
 \
diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.h b/frame/1m/unpackm/bli_unpackm_blk_var1.h
index abd044549..4a92dc1b7 100644
--- a/frame/1m/unpackm/bli_unpackm_blk_var1.h
+++ b/frame/1m/unpackm/bli_unpackm_blk_var1.h
@@ -34,11 +34,11 @@
 
 void bli_unpackm_blk_var1
      (
-       obj_t*  p,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  p,
+       const obj_t*  c,
+       const cntx_t* cntx,
+       const cntl_t* cntl,
+       const thrinfo_t* thread
      );
 
 
diff --git a/frame/1m/unpackm/bli_unpackm_check.c b/frame/1m/unpackm/bli_unpackm_check.c
index 5bce60ed3..786edd4c8 100644
--- a/frame/1m/unpackm/bli_unpackm_check.c
+++ b/frame/1m/unpackm/bli_unpackm_check.c
@@ -36,9 +36,9 @@
 
 void bli_unpackm_int_check
      (
-       obj_t*  p,
-       obj_t*  a,
-       cntx_t* cntx
+       const obj_t*  p,
+       const obj_t*  a,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
diff --git a/frame/1m/unpackm/bli_unpackm_check.h b/frame/1m/unpackm/bli_unpackm_check.h
index d2a976dd8..697010fa7 100644
--- a/frame/1m/unpackm/bli_unpackm_check.h
+++ b/frame/1m/unpackm/bli_unpackm_check.h
@@ -34,8 +34,8 @@
 
 void bli_unpackm_int_check
      (
-       obj_t*  p,
-       obj_t*  a,
-       cntx_t* cntx
+       const obj_t*  p,
+       const obj_t*  a,
+       const cntx_t* cntx
      );
 
diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c
index 550a8fb87..f6b09d8ae 100644
--- a/frame/1m/unpackm/bli_unpackm_int.c
+++ b/frame/1m/unpackm/bli_unpackm_int.c
@@ -36,11 +36,11 @@
 
 void bli_unpackm_int
      (
-       obj_t*  p,
-       obj_t*  a,
-       cntx_t* cntx,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  p,
+       const obj_t*  a,
+       const cntx_t* cntx,
+       const cntl_t* cntl,
+       const thrinfo_t* thread
      )
 {
 	bli_init_once();
@@ -60,19 +60,19 @@ void bli_unpackm_int
 	f = bli_cntl_unpackm_params_var_func( cntl );
 
 	// Invoke the variant.
-    if ( bli_thread_am_ochief( thread ) )
+	if ( bli_thread_am_ochief( thread ) )
 	{
-        f
+		f
 		(
 		  p,
-          a,
+		  a,
 		  cntx,
-          cntl,
+		  cntl,
 		  thread
 		);
-    }
+	}
 
 	// Barrier so that unpacking is done before computation.
-    bli_thread_barrier( thread );
+	bli_thread_barrier( thread );
 }
 
diff --git a/frame/1m/unpackm/bli_unpackm_int.h b/frame/1m/unpackm/bli_unpackm_int.h
index cb66d0975..8258ea367 100644
--- a/frame/1m/unpackm/bli_unpackm_int.h
+++ b/frame/1m/unpackm/bli_unpackm_int.h
@@ -34,10 +34,10 @@
 
 void bli_unpackm_int
      (
-       obj_t*  p,
-       obj_t*  a,
-       cntx_t* cntx,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  p,
+       const obj_t*  a,
+       const cntx_t* cntx,
+       const cntl_t* cntl,
+       const thrinfo_t* thread
      );
 
diff --git a/frame/2/bli_l2_check.c b/frame/2/bli_l2_check.c
index fac91fec4..a2772e1c4 100644
--- a/frame/2/bli_l2_check.c
+++ b/frame/2/bli_l2_check.c
@@ -36,11 +36,11 @@
 
 void bli_gemv_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y 
+       const obj_t* alpha,
+       const obj_t* a,
+       const obj_t* x,
+       const obj_t* beta,
+       const obj_t* y
      )
 {
 	err_t e_val;
@@ -66,11 +66,11 @@ void bli_gemv_check
 
 void bli_hemv_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y 
+       const obj_t* alpha,
+       const obj_t* a,
+       const obj_t* x,
+       const obj_t* beta,
+       const obj_t* y
      )
 {
 	err_t e_val;
@@ -101,11 +101,11 @@ void bli_hemv_check
 
 void bli_symv_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y 
+       const obj_t* alpha,
+       const obj_t* a,
+       const obj_t* x,
+       const obj_t* beta,
+       const obj_t* y
      )
 {
 	err_t e_val;
@@ -136,9 +136,9 @@ void bli_symv_check
 
 void bli_trmv_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  x 
+       const obj_t* alpha,
+       const obj_t* a,
+       const obj_t* x
      )
 {
 	err_t e_val;
@@ -166,9 +166,9 @@ void bli_trmv_check
 
 void bli_trsv_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  x 
+       const obj_t* alpha,
+       const obj_t* a,
+       const obj_t* x
      )
 {
 	err_t e_val;
@@ -196,10 +196,10 @@ void bli_trsv_check
 
 void bli_ger_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y,
-       obj_t*  a 
+       const obj_t* alpha,
+       const obj_t* x,
+       const obj_t* y,
+       const obj_t* a
      )
 {
 	err_t e_val;
@@ -225,9 +225,9 @@ void bli_ger_check
 
 void bli_her_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  a 
+       const obj_t* alpha,
+       const obj_t* x,
+       const obj_t* a
      )
 {
 	err_t e_val;
@@ -255,10 +255,10 @@ void bli_her_check
 
 void bli_her2_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y,
-       obj_t*  a 
+       const obj_t* alpha,
+       const obj_t* x,
+       const obj_t* y,
+       const obj_t* a
      )
 {
 	err_t e_val;
@@ -289,9 +289,9 @@ void bli_her2_check
 
 void bli_syr_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  a 
+       const obj_t* alpha,
+       const obj_t* x,
+       const obj_t* a
      )
 {
 	err_t e_val;
@@ -319,10 +319,10 @@ void bli_syr_check
 
 void bli_syr2_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y,
-       obj_t*  a 
+       const obj_t* alpha,
+       const obj_t* x,
+       const obj_t* y,
+       const obj_t* a
      )
 {
 	err_t e_val;
@@ -355,11 +355,11 @@ void bli_syr2_check
 
 void bli_xxmv_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y 
+       const obj_t* alpha,
+       const obj_t* a,
+       const obj_t* x,
+       const obj_t* beta,
+       const obj_t* y
      )
 {
 	err_t e_val;
@@ -424,10 +424,10 @@ void bli_xxmv_check
 
 void bli_xxr_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y,
-       obj_t*  a 
+       const obj_t* alpha,
+       const obj_t* x,
+       const obj_t* y,
+       const obj_t* a
      )
 {
 	err_t e_val;
diff --git a/frame/2/bli_l2_check.h b/frame/2/bli_l2_check.h
index af9388753..b698e9d59 100644
--- a/frame/2/bli_l2_check.h
+++ b/frame/2/bli_l2_check.h
@@ -42,11 +42,11 @@
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
     );
 
 GENPROT( gemv )
@@ -59,10 +59,10 @@ GENPROT( symv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  a  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* y, \
+       const obj_t* a  \
     );
 
 GENPROT( ger )
@@ -75,9 +75,9 @@ GENPROT( syr2 )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  a  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* a  \
     );
 
 GENPROT( her )
@@ -89,9 +89,9 @@ GENPROT( syr )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  x  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* x  \
     );
 
 GENPROT( trmv )
@@ -102,17 +102,17 @@ GENPROT( trsv )
 
 void bli_xxmv_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y 
+       const obj_t* alpha,
+       const obj_t* a,
+       const obj_t* x,
+       const obj_t* beta,
+       const obj_t* y
      );
 
 void bli_xxr_check
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y,
-       obj_t*  a 
+       const obj_t* alpha,
+       const obj_t* x,
+       const obj_t* y,
+       const obj_t* a
      );
diff --git a/frame/2/bli_l2_ft.h b/frame/2/bli_l2_ft.h
index 73aa4dd48..8c48e2bed 100644
--- a/frame/2/bli_l2_ft.h
+++ b/frame/2/bli_l2_ft.h
@@ -44,15 +44,15 @@
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       trans_t transa, \
-       conj_t  conjx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy  \
+             trans_t transa, \
+             conj_t  conjx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  beta, \
+             ctype*  y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -65,14 +65,14 @@ INSERT_GENTDEF( gemv )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  a, inc_t rs_a, inc_t cs_a  \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  m, \
+             dim_t  n, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             ctype* a, inc_t rs_a, inc_t cs_a  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -85,15 +85,15 @@ INSERT_GENTDEF( ger )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       uplo_t  uploa, \
-       conj_t  conja, \
-       conj_t  conjx, \
-       dim_t   m, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy  \
+             uplo_t uploa, \
+             conj_t conja, \
+             conj_t conjx, \
+             dim_t  m, \
+       const ctype* alpha, \
+       const ctype* a, inc_t rs_a, inc_t cs_a, \
+       const ctype* x, inc_t incx, \
+       const ctype* beta, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -107,12 +107,12 @@ INSERT_GENTDEF( symv )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       uplo_t   uploa, \
-       conj_t   conjx, \
-       dim_t    m, \
-       ctype_r* alpha, \
-       ctype*   x, inc_t incx, \
-       ctype*   a, inc_t rs_a, inc_t cs_a  \
+             uplo_t   uploa, \
+             conj_t   conjx, \
+             dim_t    m, \
+       const ctype_r* alpha, \
+       const ctype*   x, inc_t incx, \
+             ctype*   a, inc_t rs_a, inc_t cs_a  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -125,12 +125,12 @@ INSERT_GENTDEFR( her )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       uplo_t   uploa, \
-       conj_t   conjx, \
-       dim_t    m, \
-       ctype*   alpha, \
-       ctype*   x, inc_t incx, \
-       ctype*   a, inc_t rs_a, inc_t cs_a  \
+             uplo_t uploa, \
+             conj_t conjx, \
+             dim_t  m, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+             ctype* a, inc_t rs_a, inc_t cs_a  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -143,14 +143,14 @@ INSERT_GENTDEF( syr )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       uplo_t  uploa, \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   m, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  a, inc_t rs_a, inc_t cs_a  \
+             uplo_t uploa, \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  m, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             ctype* a, inc_t rs_a, inc_t cs_a  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -164,13 +164,13 @@ INSERT_GENTDEF( syr2 )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       uplo_t  uploa, \
-       trans_t transa, \
-       diag_t  diaga, \
-       dim_t   m, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  x, inc_t incx  \
+             uplo_t  uploa, \
+             trans_t transa, \
+             diag_t  diaga, \
+             dim_t   m, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+             ctype*  x, inc_t incx  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
diff --git a/frame/2/bli_l2_oapi.c b/frame/2/bli_l2_oapi.c
index cc32fb61e..2eac6394c 100644
--- a/frame/2/bli_l2_oapi.c
+++ b/frame/2/bli_l2_oapi.c
@@ -45,11 +45,11 @@
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -78,7 +78,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     beta_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \
+		PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -96,17 +96,17 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	f \
 	( \
-	   transa, \
-	   conjx, \
-	   m, \
-	   n, \
-	   buf_alpha, \
-	   buf_a, rs_a, cs_a, \
-	   buf_x, incx, \
-	   buf_beta, \
-	   buf_y, incy, \
-	   cntx, \
-	   rntm  \
+	  transa, \
+	  conjx, \
+	  m, \
+	  n, \
+	  buf_alpha, \
+	  buf_a, rs_a, cs_a, \
+	  buf_x, incx, \
+	  buf_beta, \
+	  buf_y, incy, \
+	  cntx, \
+	  rntm  \
 	); \
 }
 
@@ -118,10 +118,10 @@ GENFRONT( gemv )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  a  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* y, \
+       const obj_t* a  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -148,7 +148,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     alpha_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, x, y, a ); \
+		PASTEMAC(opname,_check)( alpha, x, y, a ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -163,16 +163,16 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	f \
 	( \
-	   conjx, \
-	   conjy, \
-	   m, \
-	   n, \
-	   buf_alpha, \
-	   buf_x, incx, \
-	   buf_y, incy, \
-	   buf_a, rs_a, cs_a, \
-	   cntx, \
-	   rntm  \
+	  conjx, \
+	  conjy, \
+	  m, \
+	  n, \
+	  buf_alpha, \
+	  buf_x, incx, \
+	  buf_y, incy, \
+	  buf_a, rs_a, cs_a, \
+	  cntx, \
+	  rntm  \
 	); \
 }
 
@@ -184,11 +184,11 @@ GENFRONT( ger )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -217,7 +217,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     beta_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \
+		PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -235,17 +235,17 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	f \
 	( \
-	   uploa, \
-	   conja, \
-	   conjx, \
-	   m, \
-	   buf_alpha, \
-	   buf_a, rs_a, cs_a, \
-	   buf_x, incx, \
-	   buf_beta, \
-	   buf_y, incy, \
-	   cntx, \
-	   rntm  \
+	  uploa, \
+	  conja, \
+	  conjx, \
+	  m, \
+	  buf_alpha, \
+	  buf_a, rs_a, cs_a, \
+	  buf_x, incx, \
+	  buf_beta, \
+	  buf_y, incy, \
+	  cntx, \
+	  rntm  \
 	); \
 }
 
@@ -258,9 +258,9 @@ GENFRONT( symv )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  a  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* a  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -270,21 +270,21 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	num_t     dt        = bli_obj_dt( a ); \
 \
-    uplo_t    uploa     = bli_obj_uplo( a ); \
-    conj_t    conjx     = bli_obj_conj_status( x ); \
+	uplo_t    uploa     = bli_obj_uplo( a ); \
+	conj_t    conjx     = bli_obj_conj_status( x ); \
 	dim_t     m         = bli_obj_length( a ); \
-    void*     buf_x     = bli_obj_buffer_at_off( x ); \
-    inc_t     incx      = bli_obj_vector_inc( x ); \
-    void*     buf_a     = bli_obj_buffer_at_off( a ); \
-    inc_t     rs_a      = bli_obj_row_stride( a ); \
-    inc_t     cs_a      = bli_obj_col_stride( a ); \
+	void*     buf_x     = bli_obj_buffer_at_off( x ); \
+	inc_t     incx      = bli_obj_vector_inc( x ); \
+	void*     buf_a     = bli_obj_buffer_at_off( a ); \
+	inc_t     rs_a      = bli_obj_row_stride( a ); \
+	inc_t     cs_a      = bli_obj_col_stride( a ); \
 \
 	void*     buf_alpha; \
 \
 	obj_t     alpha_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, x, a ); \
+		PASTEMAC(opname,_check)( alpha, x, a ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -299,14 +299,14 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	f \
 	( \
-	   uploa, \
-	   conjx, \
-	   m, \
-	   buf_alpha, \
-	   buf_x, incx, \
-	   buf_a, rs_a, cs_a, \
-	   cntx, \
-	   rntm  \
+	  uploa, \
+	  conjx, \
+	  m, \
+	  buf_alpha, \
+	  buf_x, incx, \
+	  buf_a, rs_a, cs_a, \
+	  cntx, \
+	  rntm  \
 	); \
 }
 
@@ -319,10 +319,10 @@ GENFRONT( syr )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  a  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* y, \
+       const obj_t* a  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -349,7 +349,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     alpha_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, x, y, a ); \
+		PASTEMAC(opname,_check)( alpha, x, y, a ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -364,16 +364,16 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	f \
 	( \
-	   uploa, \
-	   conjx, \
-	   conjy, \
-	   m, \
-	   buf_alpha, \
-	   buf_x, incx, \
-	   buf_y, incy, \
-	   buf_a, rs_a, cs_a, \
-	   cntx, \
-	   rntm  \
+	  uploa, \
+	  conjx, \
+	  conjy, \
+	  m, \
+	  buf_alpha, \
+	  buf_x, incx, \
+	  buf_y, incy, \
+	  buf_a, rs_a, cs_a, \
+	  cntx, \
+	  rntm  \
 	); \
 }
 
@@ -386,9 +386,9 @@ GENFRONT( syr2 )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  x  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -413,7 +413,7 @@ void PASTEMAC(opname,EX_SUF) \
 	obj_t     alpha_local; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( alpha, a, x ); \
+		PASTEMAC(opname,_check)( alpha, a, x ); \
 \
 	/* Create local copy-casts of scalars (and apply internal conjugation
 	   as needed). */ \
@@ -428,15 +428,15 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	f \
 	( \
-	   uploa, \
-	   transa, \
-	   diaga, \
-	   m, \
-	   buf_alpha, \
-	   buf_a, rs_a, cs_a, \
-	   buf_x, incx, \
-	   cntx, \
-	   rntm  \
+	  uploa, \
+	  transa, \
+	  diaga, \
+	  m, \
+	  buf_alpha, \
+	  buf_a, rs_a, cs_a, \
+	  buf_x, incx, \
+	  cntx, \
+	  rntm  \
 	); \
 }
 
diff --git a/frame/2/bli_l2_oapi.h b/frame/2/bli_l2_oapi.h
index 6b6a1d77e..391de06d5 100644
--- a/frame/2/bli_l2_oapi.h
+++ b/frame/2/bli_l2_oapi.h
@@ -42,11 +42,11 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  x, \
-       obj_t*  beta, \
-       obj_t*  y  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* x, \
+       const obj_t* beta, \
+       const obj_t* y  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -60,10 +60,10 @@ GENPROT( symv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  y, \
-       obj_t*  a  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* y, \
+       const obj_t* a  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -77,9 +77,9 @@ GENPROT( syr2 )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  x, \
-       obj_t*  a  \
+       const obj_t* alpha, \
+       const obj_t* x, \
+       const obj_t* a  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -92,9 +92,9 @@ GENPROT( syr )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  x  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
diff --git a/frame/2/bli_l2_tapi.c b/frame/2/bli_l2_tapi.c
index f6eb6c7d9..4bef7c81a 100644
--- a/frame/2/bli_l2_tapi.c
+++ b/frame/2/bli_l2_tapi.c
@@ -45,15 +45,15 @@
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       trans_t transa, \
-       conj_t  conjx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy  \
+             trans_t transa, \
+             conj_t  conjx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  beta, \
+             ctype*  y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -111,12 +111,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  conjx, \
 	  m, \
 	  n, \
-	  alpha, \
-	  a, rs_a, cs_a, \
-	  x, incx, \
-	  beta, \
-	  y, incy, \
-	  cntx \
+	  ( ctype* )alpha, \
+	  ( ctype* )a, rs_a, cs_a, \
+	  ( ctype* )x, incx, \
+	  ( ctype* )beta, \
+	            y, incy, \
+	  ( cntx_t* )cntx \
 	); \
 }
 
@@ -128,14 +128,14 @@ INSERT_GENTFUNC_BASIC3( gemv, gemv, gemv_unf_var1, gemv_unf_var2 )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  a, inc_t rs_a, inc_t cs_a  \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  m, \
+             dim_t  n, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             ctype* a, inc_t rs_a, inc_t cs_a  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -164,11 +164,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  conjy, \
 	  m, \
 	  n, \
-	  alpha, \
-	  x, incx, \
-	  y, incy, \
-	  a, rs_a, cs_a, \
-	  cntx \
+	  ( ctype* )alpha, \
+	  ( ctype* )x, incx, \
+	  ( ctype* )y, incy, \
+	            a, rs_a, cs_a, \
+	  ( cntx_t* )cntx \
 	); \
 }
 
@@ -180,15 +180,15 @@ INSERT_GENTFUNC_BASIC3( ger, ger, ger_unb_var1, ger_unb_var2 )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       uplo_t  uploa, \
-       conj_t  conja, \
-       conj_t  conjx, \
-       dim_t   m, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy  \
+             uplo_t uploa, \
+             conj_t conja, \
+             conj_t conjx, \
+             dim_t  m, \
+       const ctype* alpha, \
+       const ctype* a, inc_t rs_a, inc_t cs_a, \
+       const ctype* x, inc_t incx, \
+       const ctype* beta, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -239,12 +239,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  conjx, \
 	  conjh, /* used by variants to distinguish hemv from symv */ \
 	  m, \
-	  alpha, \
-	  a, rs_a, cs_a, \
-	  x, incx, \
-	  beta, \
-	  y, incy, \
-	  cntx \
+	  ( ctype* )alpha, \
+	  ( ctype* )a, rs_a, cs_a, \
+	  ( ctype* )x, incx, \
+	  ( ctype* )beta, \
+	            y, incy, \
+	  ( cntx_t* )cntx \
 	); \
 }
 
@@ -257,12 +257,12 @@ INSERT_GENTFUNC_BASIC4( symv, hemv, BLIS_NO_CONJUGATE, hemv_unf_var1, hemv_unf_v
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       uplo_t   uploa, \
-       conj_t   conjx, \
-       dim_t    m, \
-       ctype_r* alpha, \
-       ctype*   x, inc_t incx, \
-       ctype*   a, inc_t rs_a, inc_t cs_a  \
+             uplo_t   uploa, \
+             conj_t   conjx, \
+             dim_t    m, \
+       const ctype_r* alpha, \
+       const ctype*   x, inc_t incx, \
+             ctype*   a, inc_t rs_a, inc_t cs_a  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -306,10 +306,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  conjx, \
 	  conjh, /* used by variants to distinguish her from syr */ \
 	  m, \
-	  &alpha_local, \
-	  x, incx, \
-	  a, rs_a, cs_a, \
-	  cntx \
+	  ( ctype* )&alpha_local, \
+	  ( ctype* )x, incx, \
+	            a, rs_a, cs_a, \
+	  ( cntx_t* )cntx \
 	); \
 }
 
@@ -321,12 +321,12 @@ INSERT_GENTFUNCR_BASIC4( her, her, BLIS_CONJUGATE, her_unb_var1, her_unb_var2 )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       uplo_t   uploa, \
-       conj_t   conjx, \
-       dim_t    m, \
-       ctype*   alpha, \
-       ctype*   x, inc_t incx, \
-       ctype*   a, inc_t rs_a, inc_t cs_a  \
+             uplo_t uploa, \
+             conj_t conjx, \
+             dim_t  m, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+             ctype* a, inc_t rs_a, inc_t cs_a  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -363,10 +363,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  conjx, \
 	  conjh, /* used by variants to distinguish her2 from syr2 */ \
 	  m, \
-	  alpha, \
-	  x, incx, \
-	  a, rs_a, cs_a, \
-	  cntx \
+	  ( ctype* )alpha, \
+	  ( ctype* )x, incx, \
+	            a, rs_a, cs_a, \
+	  ( cntx_t* )cntx \
 	); \
 }
 
@@ -378,14 +378,14 @@ INSERT_GENTFUNC_BASIC4( syr, her, BLIS_NO_CONJUGATE, her_unb_var1, her_unb_var2
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       uplo_t  uploa, \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   m, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  a, inc_t rs_a, inc_t cs_a  \
+             uplo_t uploa, \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  m, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             ctype* a, inc_t rs_a, inc_t cs_a  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -423,11 +423,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  conjy, \
 	  conjh, \
 	  m, \
-	  alpha, \
-	  x, incx, \
-	  y, incy, \
-	  a, rs_a, cs_a, \
-	  cntx \
+	  ( ctype* )alpha, \
+	  ( ctype* )x, incx, \
+	  ( ctype* )y, incy, \
+	            a, rs_a, cs_a, \
+	  ( cntx_t* )cntx \
 	); \
 }
 
@@ -440,13 +440,13 @@ INSERT_GENTFUNC_BASIC4( syr2, her2, BLIS_NO_CONJUGATE, her2_unf_var1, her2_unf_v
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       uplo_t  uploa, \
-       trans_t transa, \
-       diag_t  diaga, \
-       dim_t   m, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  x, inc_t incx  \
+             uplo_t  uploa, \
+             trans_t transa, \
+             diag_t  diaga, \
+             dim_t   m, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+             ctype*  x, inc_t incx  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -498,10 +498,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  transa, \
 	  diaga, \
 	  m, \
-	  alpha, \
-	  a, rs_a, cs_a, \
-	  x, incx, \
-	  cntx \
+	  ( ctype* )alpha, \
+	  ( ctype* )a, rs_a, cs_a, \
+	            x, incx, \
+	  ( cntx_t* )cntx \
 	); \
 }
 
diff --git a/frame/2/bli_l2_tapi.h b/frame/2/bli_l2_tapi.h
index 4b45236e2..edd9607b6 100644
--- a/frame/2/bli_l2_tapi.h
+++ b/frame/2/bli_l2_tapi.h
@@ -42,15 +42,15 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       trans_t transa, \
-       conj_t  conjx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy  \
+             trans_t transa, \
+             conj_t  conjx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  beta, \
+             ctype*  y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -62,14 +62,14 @@ INSERT_GENTPROT_BASIC0( gemv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  a, inc_t rs_a, inc_t cs_a  \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  m, \
+             dim_t  n, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             ctype* a, inc_t rs_a, inc_t cs_a  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -81,15 +81,15 @@ INSERT_GENTPROT_BASIC0( ger )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       uplo_t  uploa, \
-       conj_t  conja, \
-       conj_t  conjx, \
-       dim_t   m, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  x, inc_t incx, \
-       ctype*  beta, \
-       ctype*  y, inc_t incy  \
+             uplo_t uploa, \
+             conj_t conja, \
+             conj_t conjx, \
+             dim_t  m, \
+       const ctype* alpha, \
+       const ctype* a, inc_t rs_a, inc_t cs_a, \
+       const ctype* x, inc_t incx, \
+       const ctype* beta, \
+             ctype* y, inc_t incy  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -102,12 +102,12 @@ INSERT_GENTPROT_BASIC0( symv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       uplo_t   uploa, \
-       conj_t   conjx, \
-       dim_t    m, \
-       ctype_r* alpha, \
-       ctype*   x, inc_t incx, \
-       ctype*   a, inc_t rs_a, inc_t cs_a  \
+             uplo_t   uploa, \
+             conj_t   conjx, \
+             dim_t    m, \
+       const ctype_r* alpha, \
+       const ctype*   x, inc_t incx, \
+             ctype*   a, inc_t rs_a, inc_t cs_a  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -119,12 +119,12 @@ INSERT_GENTPROTR_BASIC0( her )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       uplo_t   uploa, \
-       conj_t   conjx, \
-       dim_t    m, \
-       ctype*   alpha, \
-       ctype*   x, inc_t incx, \
-       ctype*   a, inc_t rs_a, inc_t cs_a  \
+             uplo_t uploa, \
+             conj_t conjx, \
+             dim_t  m, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+             ctype* a, inc_t rs_a, inc_t cs_a  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -136,14 +136,14 @@ INSERT_GENTPROT_BASIC0( syr )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       uplo_t  uploa, \
-       conj_t  conjx, \
-       conj_t  conjy, \
-       dim_t   m, \
-       ctype*  alpha, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       ctype*  a, inc_t rs_a, inc_t cs_a  \
+             uplo_t uploa, \
+             conj_t conjx, \
+             conj_t conjy, \
+             dim_t  m, \
+       const ctype* alpha, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             ctype* a, inc_t rs_a, inc_t cs_a  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -156,13 +156,13 @@ INSERT_GENTPROT_BASIC0( syr2 )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       uplo_t  uploa, \
-       trans_t transa, \
-       diag_t  diaga, \
-       dim_t   m, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  x, inc_t incx  \
+             uplo_t  uploa, \
+             trans_t transa, \
+             diag_t  diaga, \
+             dim_t   m, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+             ctype*  x, inc_t incx  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
diff --git a/frame/2/gemv/bli_gemv_var_oapi.c.prev b/frame/2/gemv/other/bli_gemv_var_oapi.c.prev
similarity index 100%
rename from frame/2/gemv/bli_gemv_var_oapi.c.prev
rename to frame/2/gemv/other/bli_gemv_var_oapi.c.prev
diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c
index 1986b3b0f..78482b5f6 100644
--- a/frame/3/bli_l3_blocksize.c
+++ b/frame/3/bli_l3_blocksize.c
@@ -37,14 +37,14 @@
 
 dim_t bli_l3_determine_kc
       (
-        dir_t   direct,
-        dim_t   i,
-        dim_t   dim,
-        obj_t*  a,
-        obj_t*  b,
-        bszid_t bszid,
-        cntx_t* cntx,
-        cntl_t* cntl
+              dir_t   direct,
+              dim_t   i,
+              dim_t   dim,
+        const obj_t*  a,
+        const obj_t*  b,
+              bszid_t bszid,
+        const cntx_t* cntx,
+        const cntl_t* cntl
       )
 {
 	opid_t family = bli_cntl_family( cntl );
@@ -75,13 +75,13 @@ dim_t bli_l3_determine_kc
 \
 dim_t PASTEMAC0(opname) \
       ( \
-        dir_t   direct, \
-        dim_t   i, \
-        dim_t   dim, \
-        obj_t*  a, \
-        obj_t*  b, \
-        bszid_t bszid, \
-        cntx_t* cntx  \
+              dir_t   direct, \
+              dim_t   i, \
+              dim_t   dim, \
+        const obj_t*  a, \
+        const obj_t*  b, \
+              bszid_t bszid, \
+        const cntx_t* cntx  \
       ) \
 { \
 	if ( direct == BLIS_FWD ) \
@@ -102,20 +102,14 @@ GENFRONT( trsm_determine_kc, trsm )
 \
 dim_t PASTEMAC0(opname) \
       ( \
-        dim_t   i, \
-        dim_t   dim, \
-        obj_t*  a, \
-        obj_t*  b, \
-        bszid_t bszid, \
-        cntx_t* cntx  \
+              dim_t   i, \
+              dim_t   dim, \
+        const obj_t*  a, \
+        const obj_t*  b, \
+              bszid_t bszid, \
+        const cntx_t* cntx  \
       ) \
 { \
-	num_t    dt; \
-	blksz_t* bsize; \
-	dim_t    mnr; \
-	dim_t    b_alg, b_max; \
-	dim_t    b_use; \
- \
 	/* bli_*_determine_kc_f():
 
 	   We assume that this function is being called from an algorithm that
@@ -130,15 +124,16 @@ dim_t PASTEMAC0(opname) \
 \
 	/* Extract the execution datatype and use it to query the corresponding
 	   blocksize and blocksize maximum values from the blksz_t object. */ \
-	dt    = bli_obj_exec_dt( a ); \
-	bsize = bli_cntx_get_blksz( bszid, cntx ); \
-	b_alg = bli_blksz_get_def( dt, bsize ); \
-	b_max = bli_blksz_get_max( dt, bsize ); \
+	const num_t    dt    = bli_obj_exec_dt( a ); \
+	const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \
+	      dim_t    b_alg = bli_blksz_get_def( dt, bsize ); \
+	      dim_t    b_max = bli_blksz_get_max( dt, bsize ); \
 \
 	/* Nudge the default and maximum kc blocksizes up to the nearest
 	   multiple of MR if A is Hermitian or symmetric, or NR if B is
 	   Hermitian or symmetric. If neither case applies, then we leave
 	   the blocksizes unchanged. */ \
+	dim_t    mnr; \
 	if      ( bli_obj_root_is_herm_or_symm( a ) ) \
 	{ \
 		mnr   = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
@@ -154,9 +149,7 @@ dim_t PASTEMAC0(opname) \
 \
 	/* Call the bli_determine_blocksize_[fb]_sub() helper routine defined
 	   in bli_blksz.c */ \
-	b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \
-\
-	return b_use; \
+	return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \
 }
 
 GENFRONT( gemm_determine_kc_f, f )
@@ -169,19 +162,14 @@ GENFRONT( gemm_determine_kc_b, b )
 \
 dim_t PASTEMAC0(opname) \
       ( \
-        dim_t   i, \
-        dim_t   dim, \
-        obj_t*  a, \
-        obj_t*  b, \
-        bszid_t bszid, \
-        cntx_t* cntx  \
+              dim_t   i, \
+              dim_t   dim, \
+        const obj_t*  a, \
+        const obj_t*  b, \
+              bszid_t bszid, \
+        const cntx_t* cntx  \
       ) \
 { \
-	num_t    dt; \
-	blksz_t* bsize; \
-	dim_t    b_alg, b_max; \
-	dim_t    b_use; \
- \
 	/* bli_*_determine_kc_f():
 
 	   We assume that this function is being called from an algorithm that
@@ -196,19 +184,17 @@ dim_t PASTEMAC0(opname) \
 \
 	/* Extract the execution datatype and use it to query the corresponding
 	   blocksize and blocksize maximum values from the blksz_t object. */ \
-	dt    = bli_obj_exec_dt( a ); \
-	bsize = bli_cntx_get_blksz( bszid, cntx ); \
-	b_alg = bli_blksz_get_def( dt, bsize ); \
-	b_max = bli_blksz_get_max( dt, bsize ); \
+	const num_t    dt    = bli_obj_exec_dt( a ); \
+	const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \
+	const dim_t    b_alg = bli_blksz_get_def( dt, bsize ); \
+	const dim_t    b_max = bli_blksz_get_max( dt, bsize ); \
 \
 	/* Notice that for gemmt, we do not need to perform any special handling
 	   for the default and maximum kc blocksizes vis-a-vis MR or NR. */ \
 \
 	/* Call the bli_determine_blocksize_[fb]_sub() helper routine defined
 	   in bli_blksz.c */ \
-	b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \
-\
-	return b_use; \
+	return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \
 }
 
 GENFRONT( gemmt_determine_kc_f, f )
@@ -221,20 +207,14 @@ GENFRONT( gemmt_determine_kc_b, b )
 \
 dim_t PASTEMAC0(opname) \
       ( \
-        dim_t   i, \
-        dim_t   dim, \
-        obj_t*  a, \
-        obj_t*  b, \
-        bszid_t bszid, \
-        cntx_t* cntx  \
+              dim_t   i, \
+              dim_t   dim, \
+        const obj_t*  a, \
+        const obj_t*  b, \
+              bszid_t bszid, \
+        const cntx_t* cntx  \
       ) \
 { \
-	num_t    dt; \
-	blksz_t* bsize; \
-	dim_t    mnr; \
-	dim_t    b_alg, b_max; \
-	dim_t    b_use; \
- \
 	/* bli_*_determine_kc_f():
 
 	   We assume that this function is being called from an algorithm that
@@ -249,14 +229,15 @@ dim_t PASTEMAC0(opname) \
 \
 	/* Extract the execution datatype and use it to query the corresponding
 	   blocksize and blocksize maximum values from the blksz_t object. */ \
-	dt    = bli_obj_exec_dt( a ); \
-	bsize = bli_cntx_get_blksz( bszid, cntx ); \
-	b_alg = bli_blksz_get_def( dt, bsize ); \
-	b_max = bli_blksz_get_max( dt, bsize ); \
+	const num_t    dt    = bli_obj_exec_dt( a ); \
+	const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \
+	      dim_t    b_alg = bli_blksz_get_def( dt, bsize ); \
+	      dim_t    b_max = bli_blksz_get_max( dt, bsize ); \
 \
 	/* Nudge the default and maximum kc blocksizes up to the nearest
 	   multiple of MR if the triangular matrix is on the left, or NR
 	   if the triangular matrix is one the right. */ \
+	dim_t mnr; \
 	if ( bli_obj_root_is_triangular( a ) ) \
 		mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
 	else \
@@ -267,9 +248,7 @@ dim_t PASTEMAC0(opname) \
 \
 	/* Call the bli_determine_blocksize_[fb]_sub() helper routine defined
 	   in bli_blksz.c */ \
-	b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \
-\
-	return b_use; \
+	return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \
 }
 
 GENFRONT( trmm_determine_kc_f, f )
@@ -282,20 +261,14 @@ GENFRONT( trmm_determine_kc_b, b )
 \
 dim_t PASTEMAC0(opname) \
       ( \
-        dim_t   i, \
-        dim_t   dim, \
-        obj_t*  a, \
-        obj_t*  b, \
-        bszid_t bszid, \
-        cntx_t* cntx  \
+              dim_t   i, \
+              dim_t   dim, \
+        const obj_t*  a, \
+        const obj_t*  b, \
+              bszid_t bszid, \
+        const cntx_t* cntx  \
       ) \
 { \
-	num_t    dt; \
-	blksz_t* bsize; \
-	dim_t    mnr; \
-	dim_t    b_alg, b_max; \
-	dim_t    b_use; \
- \
 	/* bli_*_determine_kc_f():
 
 	   We assume that this function is being called from an algorithm that
@@ -310,25 +283,23 @@ dim_t PASTEMAC0(opname) \
 \
 	/* Extract the execution datatype and use it to query the corresponding
 	   blocksize and blocksize maximum values from the blksz_t object. */ \
-	dt    = bli_obj_exec_dt( a ); \
-	bsize = bli_cntx_get_blksz( bszid, cntx ); \
-	b_alg = bli_blksz_get_def( dt, bsize ); \
-	b_max = bli_blksz_get_max( dt, bsize ); \
+	const num_t    dt    = bli_obj_exec_dt( a ); \
+	const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \
+	      dim_t    b_alg = bli_blksz_get_def( dt, bsize ); \
+	      dim_t    b_max = bli_blksz_get_max( dt, bsize ); \
 \
 	/* Nudge the default and maximum kc blocksizes up to the nearest
 	   multiple of MR. We always use MR (rather than sometimes using NR)
 	   because even when the triangle is on the right, packing of that
 	   matrix uses MR, since only left-side trsm micro-kernels are
 	   supported. */ \
-	mnr   = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	b_alg = bli_align_dim_to_mult( b_alg, mnr ); \
-	b_max = bli_align_dim_to_mult( b_max, mnr ); \
+	const dim_t mnr   = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
+	            b_alg = bli_align_dim_to_mult( b_alg, mnr ); \
+	            b_max = bli_align_dim_to_mult( b_max, mnr ); \
 \
 	/* Call the bli_determine_blocksize_[fb]_sub() helper routine defined
 	   in bli_blksz.c */ \
-	b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \
-\
-	return b_use; \
+	return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \
 }
 
 GENFRONT( trsm_determine_kc_f, f )
diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h
index 3ea3c5aa0..1ec889e03 100644
--- a/frame/3/bli_l3_blocksize.h
+++ b/frame/3/bli_l3_blocksize.h
@@ -34,14 +34,14 @@
 
 dim_t bli_l3_determine_kc
       (
-        dir_t   direct,
-        dim_t   i,
-        dim_t   dim,
-        obj_t*  a,
-        obj_t*  b,
-        bszid_t bszid,
-        cntx_t* cntx,
-        cntl_t* cntl
+              dir_t   direct,
+              dim_t   i,
+              dim_t   dim,
+        const obj_t*  a,
+        const obj_t*  b,
+              bszid_t bszid,
+        const cntx_t* cntx,
+        const cntl_t* cntl
       );
 
 
@@ -50,13 +50,13 @@ dim_t bli_l3_determine_kc
 \
 dim_t PASTEMAC0(opname) \
       ( \
-         dir_t   direct, \
-         dim_t   i, \
-         dim_t   dim, \
-         obj_t*  a, \
-         obj_t*  b, \
-         bszid_t bszid, \
-         cntx_t* cntx  \
+               dir_t   direct, \
+               dim_t   i, \
+               dim_t   dim, \
+         const obj_t*  a, \
+         const obj_t*  b, \
+               bszid_t bszid, \
+         const cntx_t* cntx  \
       );
 
 GENPROT( gemm_determine_kc )
@@ -70,12 +70,12 @@ GENPROT( trsm_determine_kc )
 \
 dim_t PASTEMAC0(opname) \
       ( \
-         dim_t   i, \
-         dim_t   dim, \
-         obj_t*  a, \
-         obj_t*  b, \
-         bszid_t bszid, \
-         cntx_t* cntx  \
+               dim_t   i, \
+               dim_t   dim, \
+         const obj_t*  a, \
+         const obj_t*  b, \
+               bszid_t bszid, \
+         const cntx_t* cntx  \
       );
 
 GENPROT( gemm_determine_kc_f )
diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c
index 3e7882bc3..3b4d88746 100644
--- a/frame/3/bli_l3_check.c
+++ b/frame/3/bli_l3_check.c
@@ -36,12 +36,12 @@
 
 void bli_gemm_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	//err_t e_val;
@@ -65,12 +65,12 @@ void bli_gemm_check
 
 void bli_gemmt_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
@@ -87,13 +87,13 @@ void bli_gemmt_check
 
 void bli_hemm_check
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
@@ -110,11 +110,11 @@ void bli_hemm_check
 
 void bli_herk_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
@@ -143,12 +143,12 @@ void bli_herk_check
 
 void bli_her2k_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
@@ -175,13 +175,13 @@ void bli_her2k_check
 
 void bli_symm_check
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
@@ -198,11 +198,11 @@ void bli_symm_check
 
 void bli_syrk_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
@@ -223,12 +223,12 @@ void bli_syrk_check
 
 void bli_syr2k_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
@@ -250,13 +250,13 @@ void bli_syr2k_check
 
 void bli_trmm3_check
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
@@ -273,11 +273,11 @@ void bli_trmm3_check
 
 void bli_trmm_check
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       cntx_t* cntx
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
@@ -294,11 +294,11 @@ void bli_trmm_check
 
 void bli_trsm_check
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       cntx_t* cntx
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
@@ -317,12 +317,12 @@ void bli_trsm_check
 
 void bli_gemm_basic_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
@@ -367,12 +367,12 @@ void bli_gemm_basic_check
 
 void bli_gemmt_basic_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
@@ -389,13 +389,13 @@ void bli_gemmt_basic_check
 
 void bli_hemm_basic_check
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
@@ -433,12 +433,12 @@ void bli_hemm_basic_check
 
 void bli_herk_basic_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  ah,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  ah,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
@@ -476,14 +476,14 @@ void bli_herk_basic_check
 
 void bli_her2k_basic_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  bh,
-       obj_t*  b,
-       obj_t*  ah,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  bh,
+       const obj_t*  b,
+       const obj_t*  ah,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
@@ -537,12 +537,12 @@ void bli_her2k_basic_check
 
 void bli_l3_basic_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	err_t e_val;
diff --git a/frame/3/bli_l3_check.h b/frame/3/bli_l3_check.h
index c600d60b9..8551b6b61 100644
--- a/frame/3/bli_l3_check.h
+++ b/frame/3/bli_l3_check.h
@@ -42,12 +42,12 @@
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  beta, \
-       obj_t*  c, \
-       cntx_t* cntx  \
+       const obj_t*  alpha, \
+       const obj_t*  a, \
+       const obj_t*  b, \
+       const obj_t*  beta, \
+       const obj_t*  c, \
+       const cntx_t* cntx  \
     );
 
 GENPROT( gemm )
@@ -61,13 +61,13 @@ GENPROT( syr2k )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       side_t  side, \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  beta, \
-       obj_t*  c, \
-       cntx_t* cntx  \
+             side_t  side, \
+       const obj_t*  alpha, \
+       const obj_t*  a, \
+       const obj_t*  b, \
+       const obj_t*  beta, \
+       const obj_t*  c, \
+       const cntx_t* cntx  \
     );
 
 GENPROT( hemm )
@@ -80,11 +80,11 @@ GENPROT( trmm3 )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  beta, \
-       obj_t*  c, \
-       cntx_t* cntx  \
+       const obj_t*  alpha, \
+       const obj_t*  a, \
+       const obj_t*  beta, \
+       const obj_t*  c, \
+       const cntx_t* cntx  \
     );
 
 GENPROT( herk )
@@ -96,11 +96,11 @@ GENPROT( syrk )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       side_t  side, \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       cntx_t* cntx  \
+             side_t  side, \
+       const obj_t*  alpha, \
+       const obj_t*  a, \
+       const obj_t*  b, \
+       const cntx_t* cntx  \
     );
 
 GENPROT( trmm )
@@ -111,63 +111,63 @@ GENPROT( trsm )
 
 void bli_gemm_basic_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      );
 
 void bli_gemmt_basic_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      );
 
 void bli_hemm_basic_check
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      );
 
 void bli_herk_basic_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  ah,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  ah,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      );
 
 void bli_her2k_basic_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  bh,
-       obj_t*  b,
-       obj_t*  ah,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  bh,
+       const obj_t*  b,
+       const obj_t*  ah,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      );
 
 void bli_l3_basic_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      );
diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c
index 83ff8e5af..d7fd9649e 100644
--- a/frame/3/bli_l3_cntl.c
+++ b/frame/3/bli_l3_cntl.c
@@ -38,15 +38,15 @@
 
 void bli_l3_cntl_create_if
      (
-       opid_t   family,
-       pack_t   schema_a,
-       pack_t   schema_b,
-       obj_t*   a,
-       obj_t*   b,
-       obj_t*   c,
-       rntm_t*  rntm,
-       cntl_t*  cntl_orig,
-       cntl_t** cntl_use
+             opid_t   family,
+             pack_t   schema_a,
+             pack_t   schema_b,
+       const obj_t*   a,
+       const obj_t*   b,
+       const obj_t*   c,
+             rntm_t*  rntm,
+             cntl_t*  cntl_orig,
+             cntl_t** cntl_use
      )
 {
 	// If the control tree pointer is NULL, we construct a default
diff --git a/frame/3/bli_l3_cntl.h b/frame/3/bli_l3_cntl.h
index c308c8a96..eb4321ecd 100644
--- a/frame/3/bli_l3_cntl.h
+++ b/frame/3/bli_l3_cntl.h
@@ -40,15 +40,15 @@
 
 void bli_l3_cntl_create_if
      (
-       opid_t   family,
-       pack_t   schema_a,
-       pack_t   schema_b,
-       obj_t*   a,
-       obj_t*   b,
-       obj_t*   c,
-       rntm_t*  rntm,
-       cntl_t*  cntl_orig,
-       cntl_t** cntl_use
+             opid_t   family,
+             pack_t   schema_a,
+             pack_t   schema_b,
+       const obj_t*   a,
+       const obj_t*   b,
+       const obj_t*   c,
+             rntm_t*  rntm,
+             cntl_t*  cntl_orig,
+             cntl_t** cntl_use
      );
 
 void bli_l3_cntl_free
diff --git a/frame/3/bli_l3_direct.c b/frame/3/bli_l3_direct.c
index 0d0a71921..bbc4af7a0 100644
--- a/frame/3/bli_l3_direct.c
+++ b/frame/3/bli_l3_direct.c
@@ -36,10 +36,10 @@
 
 dir_t bli_l3_direct
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntl_t* cntl
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntl_t* cntl
      )
 {
 	// Query the operation family.
@@ -58,9 +58,9 @@ dir_t bli_l3_direct
 
 dir_t bli_gemm_direct
      (
-       obj_t* a,
-       obj_t* b,
-       obj_t* c
+       const obj_t* a,
+       const obj_t* b,
+       const obj_t* c
      )
 {
 	// For gemm, movement may be forwards (or backwards).
@@ -70,9 +70,9 @@ dir_t bli_gemm_direct
 
 dir_t bli_gemmt_direct
      (
-       obj_t* a,
-       obj_t* b,
-       obj_t* c
+       const obj_t* a,
+       const obj_t* b,
+       const obj_t* c
      )
 {
 	// For gemmt, movement may be forwards (or backwards).
@@ -82,9 +82,9 @@ dir_t bli_gemmt_direct
 
 dir_t bli_trmm_direct
      (
-       obj_t* a,
-       obj_t* b,
-       obj_t* c
+       const obj_t* a,
+       const obj_t* b,
+       const obj_t* c
      )
 {
 	dir_t direct;
@@ -111,9 +111,9 @@ dir_t bli_trmm_direct
 
 dir_t bli_trsm_direct
      (
-       obj_t* a,
-       obj_t* b,
-       obj_t* c
+       const obj_t* a,
+       const obj_t* b,
+       const obj_t* c
      )
 {
 	dir_t direct;
diff --git a/frame/3/bli_l3_direct.h b/frame/3/bli_l3_direct.h
index 39798407a..8f624098e 100644
--- a/frame/3/bli_l3_direct.h
+++ b/frame/3/bli_l3_direct.h
@@ -34,10 +34,10 @@
 
 dir_t bli_l3_direct
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntl_t* cntl
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntl_t* cntl
      );
 
 // -----------------------------------------------------------------------------
@@ -47,9 +47,9 @@ dir_t bli_l3_direct
 \
 dir_t PASTEMAC0(opname) \
       ( \
-         obj_t*  a, \
-         obj_t*  b, \
-         obj_t*  c  \
+         const obj_t* a, \
+         const obj_t* b, \
+         const obj_t* c  \
       );
 
 GENPROT( gemm_direct )
diff --git a/frame/3/bli_l3_ft_ukr.h b/frame/3/bli_l3_ft_ukr.h
index 28065c208..e7952409f 100644
--- a/frame/3/bli_l3_ft_ukr.h
+++ b/frame/3/bli_l3_ft_ukr.h
@@ -55,8 +55,8 @@ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
        ctype*     restrict b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      );
 
 INSERT_GENTDEF( gemm )
@@ -78,8 +78,8 @@ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
        ctype*     restrict bx1, \
        ctype*     restrict b11, \
        ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      );
 
 INSERT_GENTDEF( gemmtrsm )
@@ -95,8 +95,8 @@ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
        ctype*     restrict a, \
        ctype*     restrict b, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      );
 
 INSERT_GENTDEF( trsm )
diff --git a/frame/3/bli_l3_ind_ukr.h b/frame/3/bli_l3_ind_ukr.h
index 6f24e71fc..243ff818d 100644
--- a/frame/3/bli_l3_ind_ukr.h
+++ b/frame/3/bli_l3_ind_ukr.h
@@ -51,8 +51,8 @@ void PASTEMAC(ch,opname) \
        ctype*     restrict b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      );
 
 INSERT_GENTPROT_BASIC0( gemm1m_ukr_name )
@@ -72,8 +72,8 @@ void PASTEMAC(ch,opname) \
        ctype*     restrict bx1, \
        ctype*     restrict b11, \
        ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      );
 
 INSERT_GENTPROT_BASIC0( gemmtrsm1m_l_ukr_name )
@@ -88,8 +88,8 @@ void PASTEMAC(ch,opname) \
        ctype*     restrict a, \
        ctype*     restrict b, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      );
 
 INSERT_GENTPROT_BASIC0( trsm1m_l_ukr_name )
diff --git a/frame/3/bli_l3_int.c b/frame/3/bli_l3_int.c
index d4b974030..b786236ab 100644
--- a/frame/3/bli_l3_int.c
+++ b/frame/3/bli_l3_int.c
@@ -36,15 +36,15 @@
 
 void bli_l3_int
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
 	obj_t a_local;
diff --git a/frame/3/bli_l3_int.h b/frame/3/bli_l3_int.h
index d76b0ac3e..65485206d 100644
--- a/frame/3/bli_l3_int.h
+++ b/frame/3/bli_l3_int.h
@@ -34,14 +34,14 @@
 
 void bli_l3_int
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      );
 
diff --git a/frame/3/bli_l3_oapi.c b/frame/3/bli_l3_oapi.c
index 1df8e8012..0365a198c 100644
--- a/frame/3/bli_l3_oapi.c
+++ b/frame/3/bli_l3_oapi.c
@@ -43,11 +43,11 @@
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  beta, \
-       obj_t*  c  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* b, \
+       const obj_t* beta, \
+       const obj_t* c  \
      ) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
@@ -66,12 +66,12 @@ GENFRONT( syr2k )
 \
 void PASTEMAC0(opname) \
      ( \
-       side_t  side, \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  beta, \
-       obj_t*  c  \
+             side_t side, \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* b, \
+       const obj_t* beta, \
+       const obj_t* c  \
      ) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
@@ -89,10 +89,10 @@ GENFRONT( trmm3 )
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  beta, \
-       obj_t*  c  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* beta, \
+       const obj_t* c  \
      ) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
@@ -109,10 +109,10 @@ GENFRONT( syrk )
 \
 void PASTEMAC0(opname) \
      ( \
-       side_t  side, \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b  \
+             side_t side, \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* b  \
      ) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
diff --git a/frame/3/bli_l3_oapi.h b/frame/3/bli_l3_oapi.h
index e00f238ad..7161a3bf3 100644
--- a/frame/3/bli_l3_oapi.h
+++ b/frame/3/bli_l3_oapi.h
@@ -43,11 +43,11 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  beta, \
-       obj_t*  c  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* b, \
+       const obj_t* beta, \
+       const obj_t* c  \
      );
 
 GENPROT( gemm )
@@ -61,12 +61,12 @@ GENPROT( syr2k )
 \
 BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
      ( \
-       side_t  side, \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  beta, \
-       obj_t*  c  \
+             side_t side, \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* b, \
+       const obj_t* beta, \
+       const obj_t* c  \
      );
 
 GENPROT( hemm )
@@ -79,10 +79,10 @@ GENPROT( trmm3 )
 \
 BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  beta, \
-       obj_t*  c  \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* beta, \
+       const obj_t* c  \
      );
 
 GENPROT( herk )
@@ -94,10 +94,10 @@ GENPROT( syrk )
 \
 BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
      ( \
-       side_t  side, \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b  \
+             side_t side, \
+       const obj_t* alpha, \
+       const obj_t* a, \
+       const obj_t* b  \
      );
 
 GENPROT( trmm )
diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c
index cd0df7017..e4c815fe3 100644
--- a/frame/3/bli_l3_oapi_ex.c
+++ b/frame/3/bli_l3_oapi_ex.c
@@ -44,13 +44,13 @@
 
 void PASTEMAC(gemm,BLIS_OAPI_EX_SUF)
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -117,13 +117,13 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF)
 
 void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -166,13 +166,13 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)
 
 void PASTEMAC(her2k,BLIS_OAPI_EX_SUF)
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -212,13 +212,13 @@ void PASTEMAC(her2k,BLIS_OAPI_EX_SUF)
 
 void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF)
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -244,14 +244,14 @@ void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF)
 
 void PASTEMAC(hemm,BLIS_OAPI_EX_SUF)
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -294,14 +294,14 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF)
 
 void PASTEMAC(symm,BLIS_OAPI_EX_SUF)
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -344,14 +344,14 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF)
 
 void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF)
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -394,12 +394,12 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF)
 
 void PASTEMAC(herk,BLIS_OAPI_EX_SUF)
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -428,12 +428,12 @@ void PASTEMAC(herk,BLIS_OAPI_EX_SUF)
 
 void PASTEMAC(syrk,BLIS_OAPI_EX_SUF)
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -453,12 +453,12 @@ void PASTEMAC(syrk,BLIS_OAPI_EX_SUF)
 
 void PASTEMAC(trmm,BLIS_OAPI_EX_SUF)
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       cntx_t* cntx,
-       rntm_t* rntm
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -500,12 +500,12 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF)
 
 void PASTEMAC(trsm,BLIS_OAPI_EX_SUF)
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       cntx_t* cntx,
-       rntm_t* rntm
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	bli_init_once();
diff --git a/frame/3/bli_l3_oapi_ex.h b/frame/3/bli_l3_oapi_ex.h
index 946a7aa17..58091704b 100644
--- a/frame/3/bli_l3_oapi_ex.h
+++ b/frame/3/bli_l3_oapi_ex.h
@@ -43,13 +43,13 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  beta, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+       const obj_t*  alpha, \
+       const obj_t*  a, \
+       const obj_t*  b, \
+       const obj_t*  beta, \
+       const obj_t*  c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      );
 
 GENPROT( gemm )
@@ -63,14 +63,14 @@ GENPROT( syr2k )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
      ( \
-       side_t  side, \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  beta, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+             side_t  side, \
+       const obj_t*  alpha, \
+       const obj_t*  a, \
+       const obj_t*  b, \
+       const obj_t*  beta, \
+       const obj_t*  c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      );
 
 GENPROT( hemm )
@@ -83,12 +83,12 @@ GENPROT( trmm3 )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  beta, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+       const obj_t*  alpha, \
+       const obj_t*  a, \
+       const obj_t*  beta, \
+       const obj_t*  c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      );
 
 GENPROT( herk )
@@ -100,12 +100,12 @@ GENPROT( syrk )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
      ( \
-       side_t  side, \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+             side_t  side, \
+       const obj_t*  alpha, \
+       const obj_t*  a, \
+       const obj_t*  b, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      );
 
 GENPROT( trmm )
diff --git a/frame/3/bli_l3_oft.h b/frame/3/bli_l3_oft.h
index e7c8dcca3..997ade58e 100644
--- a/frame/3/bli_l3_oft.h
+++ b/frame/3/bli_l3_oft.h
@@ -48,13 +48,13 @@
 \
 typedef void (*PASTECH(opname,_oft)) \
 ( \
-  obj_t*  alpha, \
-  obj_t*  a, \
-  obj_t*  b, \
-  obj_t*  beta, \
-  obj_t*  c, \
-  cntx_t* cntx, \
-  rntm_t* rntm  \
+  const obj_t*  alpha, \
+  const obj_t*  a, \
+  const obj_t*  b, \
+  const obj_t*  beta, \
+  const obj_t*  c, \
+  const cntx_t* cntx, \
+        rntm_t* rntm  \
 );
 
 GENTDEF( gemm )
@@ -70,14 +70,14 @@ GENTDEF( syr2k )
 \
 typedef void (*PASTECH(opname,_oft)) \
 ( \
-  side_t  side, \
-  obj_t*  alpha, \
-  obj_t*  a, \
-  obj_t*  b, \
-  obj_t*  beta, \
-  obj_t*  c, \
-  cntx_t* cntx, \
-  rntm_t* rntm  \
+        side_t  side, \
+  const obj_t*  alpha, \
+  const obj_t*  a, \
+  const obj_t*  b, \
+  const obj_t*  beta, \
+  const obj_t*  c, \
+  const cntx_t* cntx, \
+        rntm_t* rntm  \
 );
 
 GENTDEF( hemm )
@@ -92,12 +92,12 @@ GENTDEF( trmm3 )
 \
 typedef void (*PASTECH(opname,_oft)) \
 ( \
-  obj_t*  alpha, \
-  obj_t*  a, \
-  obj_t*  beta, \
-  obj_t*  c, \
-  cntx_t* cntx, \
-  rntm_t* rntm  \
+  const obj_t*  alpha, \
+  const obj_t*  a, \
+  const obj_t*  beta, \
+  const obj_t*  c, \
+  const cntx_t* cntx, \
+        rntm_t* rntm  \
 );
 
 GENTDEF( herk )
@@ -111,12 +111,12 @@ GENTDEF( syrk )
 \
 typedef void (*PASTECH(opname,_oft)) \
 ( \
-  side_t  side, \
-  obj_t*  alpha, \
-  obj_t*  a, \
-  obj_t*  b, \
-  cntx_t* cntx, \
-  rntm_t* rntm  \
+        side_t  side, \
+  const obj_t*  alpha, \
+  const obj_t*  a, \
+  const obj_t*  b, \
+  const cntx_t* cntx, \
+        rntm_t* rntm  \
 );
 
 GENTDEF( trmm )
diff --git a/frame/3/bli_l3_oft_var.h b/frame/3/bli_l3_oft_var.h
index ea10d8090..ee529b115 100644
--- a/frame/3/bli_l3_oft_var.h
+++ b/frame/3/bli_l3_oft_var.h
@@ -45,13 +45,13 @@
 \
 typedef void (*PASTECH(opname,_var_oft)) \
 ( \
-  obj_t*  a, \
-  obj_t*  b, \
-  obj_t*  c, \
-  cntx_t* cntx, \
-  rntm_t* rntm, \
-  cntl_t* cntl, \
-  thrinfo_t* thread  \
+  const obj_t*  a, \
+  const obj_t*  b, \
+  const obj_t*  c, \
+  const cntx_t* cntx, \
+        rntm_t* rntm, \
+        cntl_t* cntl, \
+        thrinfo_t* thread  \
 );
 
 GENTDEF( l3 )
diff --git a/frame/3/bli_l3_packab.c b/frame/3/bli_l3_packab.c
index d91181942..6f18169b2 100644
--- a/frame/3/bli_l3_packab.c
+++ b/frame/3/bli_l3_packab.c
@@ -36,13 +36,13 @@
 
 void bli_l3_packa
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
 	obj_t a_local, a_pack;
@@ -84,13 +84,13 @@ void bli_l3_packa
 
 void bli_l3_packb
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
 	obj_t bt_local, bt_pack;
diff --git a/frame/3/bli_l3_packab.h b/frame/3/bli_l3_packab.h
index 380ca7212..f03b7f62c 100644
--- a/frame/3/bli_l3_packab.h
+++ b/frame/3/bli_l3_packab.h
@@ -34,23 +34,23 @@
 
 void bli_l3_packa
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      );
 
 void bli_l3_packb
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      );
 
diff --git a/frame/3/bli_l3_prune.c b/frame/3/bli_l3_prune.c
index 6ca8244cb..6531b74a8 100644
--- a/frame/3/bli_l3_prune.c
+++ b/frame/3/bli_l3_prune.c
@@ -34,174 +34,106 @@
 
 #include "blis.h"
 
-/*
+
 void bli_l3_prune_unref_mparts_m
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntl_t* cntl
+             obj_t*  a,
+       const obj_t*  b,
+             obj_t*  c,
+       const cntl_t* cntl
      )
 {
-	// Query the operation family.
+	/* Query the operation family. */
 	opid_t family = bli_cntl_family( cntl );
 
-	if      ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm.
-	else if ( family == BLIS_GEMMT ) bli_gemmt_prune_unref_mparts_m( a, b, c );
-	else if ( family == BLIS_TRMM ) bli_trmm_prune_unref_mparts_m( a, b, c );
-	else if ( family == BLIS_TRSM ) bli_trsm_prune_unref_mparts_m( a, b, c );
-}
-*/
-
-#undef  GENFRONT
-#define GENFRONT( dim ) \
-\
-void PASTEMAC(l3_prune_unref_mparts_,dim) \
-     ( \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  c, \
-       cntl_t* cntl  \
-     ) \
-{ \
-	/* Query the operation family. */ \
-	opid_t family = bli_cntl_family( cntl ); \
-\
-	if      ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. */ \
-	else if ( family == BLIS_GEMMT ) PASTEMAC(gemmt_prune_unref_mparts_,dim)( a, b, c ); \
-	else if ( family == BLIS_TRMM ) PASTEMAC(trmm_prune_unref_mparts_,dim)( a, b, c ); \
-	else if ( family == BLIS_TRSM ) PASTEMAC(trsm_prune_unref_mparts_,dim)( a, b, c ); \
+	if      ( family == BLIS_GEMM )
+	{
+		/* No pruning is necessary for gemm. */
+		return;
+	}
+	else if ( family == BLIS_GEMMT )
+	{
+		/* Prune any unreferenced part from the subpartition of C (that would
+		   be encountered from partitioning in the m dimension) and adjust the
+		   subpartition of A accordingly. */
+		bli_prune_unref_mparts( c, BLIS_M, a, BLIS_M );
+	}
+	else if ( family == BLIS_TRMM ||
+	          family == BLIS_TRSM )
+	{
+		/* Prune any unreferenced part from the subpartition of A (that would
+		   be encountered from partitioning in the m dimension) and adjust the
+		   subpartition of C accordingly. */
+		bli_prune_unref_mparts( a, BLIS_M, c, BLIS_M );
+	}
 }
 
-GENFRONT( m )
-GENFRONT( n )
-GENFRONT( k )
-
-// -----------------------------------------------------------------------------
-
-#undef  GENFRONT
-#define GENFRONT( opname ) \
-\
-void PASTEMAC(opname,_prune_unref_mparts_m) \
-     ( \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  c  \
-     ) \
-{ \
-	/* No pruning is necessary for gemm. */ \
-} \
-void PASTEMAC(opname,_prune_unref_mparts_n) \
-     ( \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  c  \
-     ) \
-{ \
-	/* No pruning is necessary for gemm. */ \
-} \
-void PASTEMAC(opname,_prune_unref_mparts_k) \
-     ( \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  c  \
-     ) \
-{ \
-	/* No pruning is necessary for gemm. */ \
-}
-
-GENFRONT( gemm )
-
-// -----------------------------------------------------------------------------
+void bli_l3_prune_unref_mparts_n
+     (
+       const obj_t*  a,
+             obj_t*  b,
+             obj_t*  c,
+       const cntl_t* cntl
+     )
+{
+	/* Query the operation family. */
+	opid_t family = bli_cntl_family( cntl );
 
-#undef  GENFRONT
-#define GENFRONT( opname ) \
-\
-void PASTEMAC(opname,_prune_unref_mparts_m) \
-     ( \
-       obj_t*  a, \
-       obj_t*  ah, \
-       obj_t*  c  \
-     ) \
-{ \
-	/* Prune any unreferenced part from the subpartition of C (that would
-	   be encountered from partitioning in the m dimension) and adjust the
-	   subpartition of A accordingly. */ \
-	bli_prune_unref_mparts( c, BLIS_M, a, BLIS_M ); \
-} \
-void PASTEMAC(opname,_prune_unref_mparts_n) \
-     ( \
-       obj_t*  a, \
-       obj_t*  ah, \
-       obj_t*  c  \
-     ) \
-{ \
-	/* Prune any unreferenced part from the subpartition of C (that would
-	   be encountered from partitioning in the n dimension) and adjust the
-	   subpartition of Ah accordingly. */ \
-	bli_prune_unref_mparts( c, BLIS_N, ah, BLIS_N ); \
-} \
-void PASTEMAC(opname,_prune_unref_mparts_k) \
-     ( \
-       obj_t*  a, \
-       obj_t*  ah, \
-       obj_t*  c  \
-     ) \
-{ \
-	/* As long as A and Ah are general in structure, no pruning should be
-	   for the k dimension. */ \
+	if      ( family == BLIS_GEMM )
+	{
+		/* No pruning is necessary for gemm. */
+		return;
+	}
+	else if ( family == BLIS_GEMMT )
+	{
+		/* Prune any unreferenced part from the subpartition of C (that would
+		   be encountered from partitioning in the n dimension) and adjust the
+		   subpartition of B accordingly. */
+		bli_prune_unref_mparts( c, BLIS_N, b, BLIS_N );
+	}
+	else if ( family == BLIS_TRMM ||
+	          family == BLIS_TRSM )
+	{
+		/* Prune any unreferenced part from the subpartition of B (that would
+		   be encountered from partitioning in the n dimension) and adjust the
+		   subpartition of C accordingly. */
+		bli_prune_unref_mparts( b, BLIS_N, c, BLIS_N );
+	}
 }
 
-GENFRONT( gemmt )
-
-// -----------------------------------------------------------------------------
+void bli_l3_prune_unref_mparts_k
+     (
+             obj_t*  a,
+             obj_t*  b,
+       const obj_t*  c,
+       const cntl_t* cntl
+     )
+{
+	/* Query the operation family. */
+	opid_t family = bli_cntl_family( cntl );
 
-#undef  GENFRONT
-#define GENFRONT( opname ) \
-\
-void PASTEMAC(opname,_prune_unref_mparts_m) \
-     ( \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  c  \
-     ) \
-{ \
-	/* Prune any unreferenced part from the subpartition of A (that would
-	   be encountered from partitioning in the m dimension) and adjust the
-	   subpartition of C accordingly. */ \
-	bli_prune_unref_mparts( a, BLIS_M, c, BLIS_M ); \
-} \
-void PASTEMAC(opname,_prune_unref_mparts_n) \
-     ( \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  c  \
-     ) \
-{ \
-	/* Prune any unreferenced part from the subpartition of B (that would
-	   be encountered from partitioning in the n dimension) and adjust the
-	   subpartition of C accordingly. */ \
-	bli_prune_unref_mparts( b, BLIS_N, c, BLIS_N ); \
-} \
-void PASTEMAC(opname,_prune_unref_mparts_k) \
-     ( \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  c  \
-     ) \
-{ \
-	/* Prune any unreferenced part from the subpartition of A (that would
-	   be encountered from partitioning in the k dimension) and adjust the
-	   subpartition of B accordingly. */ \
-	bli_prune_unref_mparts( a, BLIS_N, b, BLIS_M ); \
-\
-	/* Prune any unreferenced part from the subpartition of B (that would
-	   be encountered from partitioning in the k dimension) and adjust the
-	   subpartition of A accordingly. */ \
-	bli_prune_unref_mparts( b, BLIS_M, a, BLIS_N ); \
+	if      ( family == BLIS_GEMM )
+	{
+		/* No pruning is necessary for gemm. */
+		return;
+	}
+	else if ( family == BLIS_GEMMT )
+	{
+		/* No pruning is necessary for gemmt. */
+		return;
+	}
+	else if ( family == BLIS_TRMM ||
+	          family == BLIS_TRSM )
+	{
+		/* Prune any unreferenced part from the subpartition of A (that would
+		   be encountered from partitioning in the k dimension) and adjust the
+		   subpartition of B accordingly. */
+		bli_prune_unref_mparts( a, BLIS_N, b, BLIS_M );
+
+		/* Prune any unreferenced part from the subpartition of B (that would
+		   be encountered from partitioning in the k dimension) and adjust the
+		   subpartition of A accordingly. */
+		bli_prune_unref_mparts( b, BLIS_M, a, BLIS_N );
+	}
 }
 
-GENFRONT( trmm )
-GENFRONT( trsm )
-
-
diff --git a/frame/3/bli_l3_prune.h b/frame/3/bli_l3_prune.h
index ad8f07dc4..84c0cbbcd 100644
--- a/frame/3/bli_l3_prune.h
+++ b/frame/3/bli_l3_prune.h
@@ -33,46 +33,27 @@
 */
 
 
-#undef  GENPROT
-#define GENPROT( dim ) \
-\
-void PASTEMAC(l3_prune_unref_mparts_,dim) \
-     ( \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  c, \
-       cntl_t* cntl  \
+void bli_l3_prune_unref_mparts_m
+     (
+             obj_t*  a,
+       const obj_t*  b,
+             obj_t*  c,
+       const cntl_t* cntl
      );
 
-GENPROT( m )
-GENPROT( n )
-GENPROT( k )
-
-// -----------------------------------------------------------------------------
-
-#undef  GENPROT
-#define GENPROT( opname, dim ) \
-\
-void PASTEMAC2(opname,_prune_unref_mparts_,dim) \
-     ( \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  c  \
+void bli_l3_prune_unref_mparts_n
+     (
+       const obj_t*  a,
+             obj_t*  b,
+             obj_t*  c,
+       const cntl_t* cntl
      );
 
-GENPROT( gemm, m )
-GENPROT( gemm, n )
-GENPROT( gemm, k )
-
-GENPROT( gemmt, m )
-GENPROT( gemmt, n )
-GENPROT( gemmt, k )
-
-GENPROT( trmm, m )
-GENPROT( trmm, n )
-GENPROT( trmm, k )
-
-GENPROT( trsm, m )
-GENPROT( trsm, n )
-GENPROT( trsm, k )
+void bli_l3_prune_unref_mparts_k
+     (
+             obj_t*  a,
+             obj_t*  b,
+       const obj_t*  c,
+       const cntl_t* cntl
+     );
 
diff --git a/frame/3/bli_l3_schema.c b/frame/3/bli_l3_schema.c
index 1d4608799..1de381f37 100644
--- a/frame/3/bli_l3_schema.c
+++ b/frame/3/bli_l3_schema.c
@@ -36,10 +36,10 @@
 
 void bli_l3_set_schemas
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx
+             obj_t*  a,
+             obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	// Begin with pack schemas for native execution.
diff --git a/frame/3/bli_l3_schema.h b/frame/3/bli_l3_schema.h
index c6a12ce52..a909bf598 100644
--- a/frame/3/bli_l3_schema.h
+++ b/frame/3/bli_l3_schema.h
@@ -34,8 +34,8 @@
 
 void bli_l3_set_schemas
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx
+             obj_t*  a,
+             obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx
      );
diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c
index 7e37e1f22..eedbd9ec5 100644
--- a/frame/3/bli_l3_sup.c
+++ b/frame/3/bli_l3_sup.c
@@ -36,13 +36,13 @@
 
 err_t bli_gemmsup
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	// Return early if small matrix handling is disabled at configure-time.
@@ -134,13 +134,13 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n",
 
 err_t bli_gemmtsup
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	// Return early if small matrix handling is disabled at configure-time.
diff --git a/frame/3/bli_l3_sup.h b/frame/3/bli_l3_sup.h
index fe6d0483e..33b3f8ca7 100644
--- a/frame/3/bli_l3_sup.h
+++ b/frame/3/bli_l3_sup.h
@@ -34,23 +34,23 @@
 
 err_t bli_gemmsup
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      );
 
 err_t bli_gemmtsup
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      );
 
diff --git a/frame/3/bli_l3_sup_ft_ker.h b/frame/3/bli_l3_sup_ft_ker.h
index 5bb2218f3..dbeafb404 100644
--- a/frame/3/bli_l3_sup_ft_ker.h
+++ b/frame/3/bli_l3_sup_ft_ker.h
@@ -57,8 +57,8 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
        ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      );
 
 INSERT_GENTDEF( gemmsup )
diff --git a/frame/3/bli_l3_sup_int.c b/frame/3/bli_l3_sup_int.c
index 3da3954fa..3ff13bdb5 100644
--- a/frame/3/bli_l3_sup_int.c
+++ b/frame/3/bli_l3_sup_int.c
@@ -36,14 +36,14 @@
 
 err_t bli_gemmsup_int
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             thrinfo_t* thread
      )
 {
 #if 0
@@ -240,14 +240,14 @@ err_t bli_gemmsup_int
 
 err_t bli_gemmtsup_int
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             thrinfo_t* thread
      )
 {
 	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
diff --git a/frame/3/bli_l3_sup_int.h b/frame/3/bli_l3_sup_int.h
index c6cb88056..195e3ca40 100644
--- a/frame/3/bli_l3_sup_int.h
+++ b/frame/3/bli_l3_sup_int.h
@@ -34,24 +34,24 @@
 
 err_t bli_gemmsup_int
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             thrinfo_t* thread
      );
 
 err_t bli_gemmtsup_int
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             thrinfo_t* thread
      );
diff --git a/frame/3/bli_l3_sup_ker_prot.h b/frame/3/bli_l3_sup_ker_prot.h
index 899a47d3f..30cad5257 100644
--- a/frame/3/bli_l3_sup_ker_prot.h
+++ b/frame/3/bli_l3_sup_ker_prot.h
@@ -50,7 +50,7 @@ void PASTEMAC(ch,opname) \
        ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      );
 
diff --git a/frame/3/bli_l3_sup_oft.h b/frame/3/bli_l3_sup_oft.h
index 98a06cf57..ba60035b7 100644
--- a/frame/3/bli_l3_sup_oft.h
+++ b/frame/3/bli_l3_sup_oft.h
@@ -47,13 +47,13 @@
 \
 typedef err_t (*PASTECH(opname,_oft)) \
 ( \
-  obj_t*  alpha, \
-  obj_t*  a, \
-  obj_t*  b, \
-  obj_t*  beta, \
-  obj_t*  c, \
-  cntx_t* cntx, \
-  rntm_t* rntm  \
+  const obj_t*  alpha, \
+  const obj_t*  a, \
+  const obj_t*  b, \
+  const obj_t*  beta, \
+  const obj_t*  c, \
+  const cntx_t* cntx, \
+        rntm_t* rntm  \
 );
 
 GENTDEF( gemmsup )
diff --git a/frame/3/bli_l3_sup_packm_a.c b/frame/3/bli_l3_sup_packm_a.c
index 56726c5f8..6b73050fd 100644
--- a/frame/3/bli_l3_sup_packm_a.c
+++ b/frame/3/bli_l3_sup_packm_a.c
@@ -40,15 +40,15 @@
 \
 void PASTEMAC(ch,opname) \
      ( \
-       bool             will_pack, \
-       packbuf_t        pack_buf_type, \
-       dim_t            m, \
-       dim_t            k, \
-       dim_t            mr, \
-       cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
+             bool       will_pack, \
+             packbuf_t  pack_buf_type, \
+             dim_t      m, \
+             dim_t      k, \
+             dim_t      mr, \
+       const cntx_t*    cntx, \
+             rntm_t*    rntm, \
+             mem_t*     mem, \
+       const thrinfo_t* thread  \
      ) \
 { \
 	/* Inspect whether we are going to be packing matrix A. */ \
@@ -174,10 +174,10 @@ INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_a )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       bool             did_pack, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
+             bool       did_pack, \
+             rntm_t*    rntm, \
+             mem_t*     mem, \
+       const thrinfo_t* thread  \
      ) \
 { \
 	/* Inspect whether we previously packed matrix A. */ \
@@ -212,20 +212,20 @@ INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_a )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       bool             will_pack, \
-       stor3_t          stor_id, \
-       pack_t* restrict schema, \
-       dim_t            m, \
-       dim_t            k, \
-       dim_t            mr, \
-       dim_t*  restrict m_max, \
-       dim_t*  restrict k_max, \
-       ctype*           x, inc_t           rs_x, inc_t           cs_x, \
-       ctype**          p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
-                           dim_t* restrict pd_p, inc_t* restrict ps_p, \
-       cntx_t* restrict cntx, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
+       bool    will_pack, \
+       stor3_t stor_id, \
+       pack_t* schema, \
+       dim_t   m, \
+       dim_t   k, \
+       dim_t   mr, \
+       dim_t*  m_max, \
+       dim_t*  k_max, \
+       ctype*  a, inc_t  rs_a, inc_t  cs_a, \
+       ctype** p, inc_t* rs_p, inc_t* cs_p, \
+                  dim_t* pd_p, inc_t* ps_p, \
+       cntx_t* cntx, \
+       mem_t*  mem, \
+       thrinfo_t* thread  \
      ) \
 { \
 	/* Inspect whether we are going to be packing matrix A. */ \
@@ -238,11 +238,11 @@ void PASTEMAC(ch,opname) \
 		   source matrix A directly). */ \
 		{ \
 			/* Use the strides of the source matrix as the final values. */ \
-			*rs_p = rs_x; \
-			*cs_p = cs_x; \
+			*rs_p = rs_a; \
+			*cs_p = cs_a; \
 \
 			*pd_p = mr; \
-			*ps_p = mr * rs_x; \
+			*ps_p = mr * rs_a; \
 \
 			/* Set the schema to "not packed" to indicate that packing will be
 			   skipped. */ \
@@ -251,7 +251,7 @@ void PASTEMAC(ch,opname) \
 \
 		/* Since we won't be packing, simply update the buffer address provided
 		   by the caller to point to source matrix. */ \
-		*p = x; \
+		*p = a; \
 	} \
 	else /* if ( will_pack == TRUE ) */ \
 	{ \
@@ -311,23 +311,23 @@ INSERT_GENTFUNC_BASIC0( packm_sup_init_a )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       bool             will_pack, \
-       packbuf_t        pack_buf_type, \
-       stor3_t          stor_id, \
-       trans_t          transc, \
-       dim_t            m_alloc, \
-       dim_t            k_alloc, \
-       dim_t            m, \
-       dim_t            k, \
-       dim_t            mr, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t           rs_a, inc_t           cs_a, \
-       ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
-                                                 inc_t* restrict ps_p, \
-       cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
+       bool      will_pack, \
+       packbuf_t pack_buf_type, \
+       stor3_t   stor_id, \
+       trans_t   transc, \
+       dim_t     m_alloc, \
+       dim_t     k_alloc, \
+       dim_t     m, \
+       dim_t     k, \
+       dim_t     mr, \
+       ctype*    kappa, \
+       ctype*    a, inc_t  rs_a, inc_t  cs_a, \
+       ctype**   p, inc_t* rs_p, inc_t* cs_p, \
+                                 inc_t* ps_p, \
+       cntx_t*   cntx, \
+       rntm_t*   rntm, \
+       mem_t*    mem, \
+       thrinfo_t* thread  \
      ) \
 { \
 	pack_t schema; \
diff --git a/frame/3/bli_l3_sup_packm_a.h b/frame/3/bli_l3_sup_packm_a.h
index 95c9582e7..0aaa302c8 100644
--- a/frame/3/bli_l3_sup_packm_a.h
+++ b/frame/3/bli_l3_sup_packm_a.h
@@ -38,15 +38,15 @@
 \
 void PASTEMAC(ch,opname) \
      ( \
-       bool             will_pack, \
-       packbuf_t        pack_buf_type, \
-       dim_t            m, \
-       dim_t            k, \
-       dim_t            mr, \
-       cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
+             bool       will_pack, \
+             packbuf_t  pack_buf_type, \
+             dim_t      m, \
+             dim_t      k, \
+             dim_t      mr, \
+       const cntx_t*    cntx, \
+             rntm_t*    rntm, \
+             mem_t*     mem, \
+       const thrinfo_t* thread  \
      ); \
 
 INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a )
@@ -57,10 +57,10 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       bool             did_pack, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
+             bool       did_pack, \
+             rntm_t*    rntm, \
+             mem_t*     mem, \
+       const thrinfo_t* thread  \
      ); \
 
 INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a )
@@ -71,20 +71,20 @@ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       bool             will_pack, \
-       stor3_t          stor_id, \
-       pack_t* restrict schema, \
-       dim_t            m, \
-       dim_t            k, \
-       dim_t            mr, \
-       dim_t*  restrict m_max, \
-       dim_t*  restrict k_max, \
-       ctype*           a, inc_t           rs_a, inc_t           cs_a, \
-       ctype**          p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
-                           dim_t* restrict pd_p, inc_t* restrict ps_p, \
-       cntx_t* restrict cntx, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
+       bool    will_pack, \
+       stor3_t stor_id, \
+       pack_t* schema, \
+       dim_t   m, \
+       dim_t   k, \
+       dim_t   mr, \
+       dim_t*  m_max, \
+       dim_t*  k_max, \
+       ctype*  a, inc_t  rs_a, inc_t  cs_a, \
+       ctype** p, inc_t* rs_p, inc_t* cs_p, \
+                  dim_t* pd_p, inc_t* ps_p, \
+       cntx_t* cntx, \
+       mem_t*  mem, \
+       thrinfo_t* thread  \
      ); \
 
 INSERT_GENTPROT_BASIC0( packm_sup_init_a )
@@ -95,23 +95,23 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_a )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       bool             will_pack, \
-       packbuf_t        pack_buf_type, \
-       stor3_t          stor_id, \
-       trans_t          transc, \
-       dim_t            m_alloc, \
-       dim_t            k_alloc, \
-       dim_t            m, \
-       dim_t            k, \
-       dim_t            mr, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t           rs_a, inc_t           cs_a, \
-       ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
-                                                 inc_t* restrict ps_p, \
-       cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
+       bool      will_pack, \
+       packbuf_t pack_buf_type, \
+       stor3_t   stor_id, \
+       trans_t   transc, \
+       dim_t     m_alloc, \
+       dim_t     k_alloc, \
+       dim_t     m, \
+       dim_t     k, \
+       dim_t     mr, \
+       ctype*    kappa, \
+       ctype*    a, inc_t  rs_a, inc_t  cs_a, \
+       ctype**   p, inc_t* rs_p, inc_t* cs_p, \
+                                 inc_t* ps_p, \
+       cntx_t*   cntx, \
+       rntm_t*   rntm, \
+       mem_t*    mem, \
+       thrinfo_t* thread  \
      ); \
 
 INSERT_GENTPROT_BASIC0( packm_sup_a )
diff --git a/frame/3/bli_l3_sup_packm_b.c b/frame/3/bli_l3_sup_packm_b.c
index 32c14afe3..7a2030ccf 100644
--- a/frame/3/bli_l3_sup_packm_b.c
+++ b/frame/3/bli_l3_sup_packm_b.c
@@ -40,15 +40,15 @@
 \
 void PASTEMAC(ch,opname) \
      ( \
-       bool             will_pack, \
-       packbuf_t        pack_buf_type, \
-       dim_t            k, \
-       dim_t            n, \
-       dim_t            nr, \
-       cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
+             bool       will_pack, \
+             packbuf_t  pack_buf_type, \
+             dim_t      k, \
+             dim_t      n, \
+             dim_t      nr, \
+       const cntx_t*    cntx, \
+             rntm_t*    rntm, \
+             mem_t*     mem, \
+       const thrinfo_t* thread  \
      ) \
 { \
 	/* Inspect whether we are going to be packing matrix B. */ \
@@ -174,10 +174,10 @@ INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_b )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       bool             did_pack, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
+             bool       did_pack, \
+             rntm_t*    rntm, \
+             mem_t*     mem, \
+       const thrinfo_t* thread  \
      ) \
 { \
 	/* Inspect whether we previously packed matrix A. */ \
@@ -212,20 +212,20 @@ INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_b )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       bool             will_pack, \
-       stor3_t          stor_id, \
-       pack_t* restrict schema, \
-       dim_t            k, \
-       dim_t            n, \
-       dim_t            nr, \
-       dim_t*  restrict k_max, \
-       dim_t*  restrict n_max, \
-       ctype*           x, inc_t           rs_x, inc_t           cs_x, \
-       ctype**          p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
-                           dim_t* restrict pd_p, inc_t* restrict ps_p, \
-       cntx_t* restrict cntx, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
+       bool    will_pack, \
+       stor3_t stor_id, \
+       pack_t* schema, \
+       dim_t   k, \
+       dim_t   n, \
+       dim_t   nr, \
+       dim_t*  k_max, \
+       dim_t*  n_max, \
+       ctype*  b, inc_t  rs_b, inc_t  cs_b, \
+       ctype** p, inc_t* rs_p, inc_t* cs_p, \
+                  dim_t* pd_p, inc_t* ps_p, \
+       cntx_t* cntx, \
+       mem_t*  mem, \
+       thrinfo_t* thread  \
      ) \
 { \
 	/* Inspect whether we are going to be packing matrix B. */ \
@@ -238,11 +238,11 @@ void PASTEMAC(ch,opname) \
 		   source matrix B directly). */ \
 		{ \
 			/* Use the strides of the source matrix as the final values. */ \
-			*rs_p = rs_x; \
-			*cs_p = cs_x; \
+			*rs_p = rs_b; \
+			*cs_p = cs_b; \
 \
 			*pd_p = nr; \
-			*ps_p = nr * cs_x; \
+			*ps_p = nr * cs_b; \
 \
 			/* Set the schema to "not packed" to indicate that packing will be
 			   skipped. */ \
@@ -251,7 +251,7 @@ void PASTEMAC(ch,opname) \
 \
 		/* Since we won't be packing, simply update the buffer address provided
 		   by the caller to point to source matrix. */ \
-		*p = x; \
+		*p = b; \
 	} \
 	else /* if ( will_pack == TRUE ) */ \
 	{ \
@@ -311,23 +311,23 @@ INSERT_GENTFUNC_BASIC0( packm_sup_init_b )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       bool             will_pack, \
-       packbuf_t        pack_buf_type, \
-       stor3_t          stor_id, \
-       trans_t          transc, \
-       dim_t            k_alloc, \
-       dim_t            n_alloc, \
-       dim_t            k, \
-       dim_t            n, \
-       dim_t            nr, \
-       ctype*  restrict kappa, \
-       ctype*  restrict b, inc_t           rs_b, inc_t           cs_b, \
-       ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
-                                                 inc_t* restrict ps_p, \
-       cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
+       bool      will_pack, \
+       packbuf_t pack_buf_type, \
+       stor3_t   stor_id, \
+       trans_t   transc, \
+       dim_t     k_alloc, \
+       dim_t     n_alloc, \
+       dim_t     k, \
+       dim_t     n, \
+       dim_t     nr, \
+       ctype*    kappa, \
+       ctype*    b, inc_t  rs_b, inc_t  cs_b, \
+       ctype**   p, inc_t* rs_p, inc_t* cs_p, \
+                                 inc_t* ps_p, \
+       cntx_t*   cntx, \
+       rntm_t*   rntm, \
+       mem_t*    mem, \
+       thrinfo_t* thread  \
      ) \
 { \
 	pack_t schema; \
diff --git a/frame/3/bli_l3_sup_packm_b.h b/frame/3/bli_l3_sup_packm_b.h
index 2965727d5..bd18e5887 100644
--- a/frame/3/bli_l3_sup_packm_b.h
+++ b/frame/3/bli_l3_sup_packm_b.h
@@ -38,15 +38,15 @@
 \
 void PASTEMAC(ch,opname) \
      ( \
-       bool             will_pack, \
-       packbuf_t        pack_buf_type, \
-       dim_t            k, \
-       dim_t            n, \
-       dim_t            nr, \
-       cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
+             bool       will_pack, \
+             packbuf_t  pack_buf_type, \
+             dim_t      k, \
+             dim_t      n, \
+             dim_t      nr, \
+       const cntx_t*    cntx, \
+             rntm_t*    rntm, \
+             mem_t*     mem, \
+       const thrinfo_t* thread  \
      ); \
 
 INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b )
@@ -57,10 +57,10 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       bool             did_pack, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
+             bool       did_pack, \
+             rntm_t*    rntm, \
+             mem_t*     mem, \
+       const thrinfo_t* thread  \
      ); \
 
 INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b )
@@ -71,20 +71,20 @@ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       bool             will_pack, \
-       stor3_t          stor_id, \
-       pack_t* restrict schema, \
-       dim_t            k, \
-       dim_t            n, \
-       dim_t            nr, \
-       dim_t*  restrict k_max, \
-       dim_t*  restrict n_max, \
-       ctype*           b, inc_t           rs_b, inc_t           cs_b, \
-       ctype**          p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
-                           dim_t* restrict pd_p, inc_t* restrict ps_p, \
-       cntx_t* restrict cntx, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
+       bool    will_pack, \
+       stor3_t stor_id, \
+       pack_t* schema, \
+       dim_t   k, \
+       dim_t   n, \
+       dim_t   nr, \
+       dim_t*  k_max, \
+       dim_t*  n_max, \
+       ctype*  b, inc_t  rs_b, inc_t  cs_b, \
+       ctype** p, inc_t* rs_p, inc_t* cs_p, \
+                  dim_t* pd_p, inc_t* ps_p, \
+       cntx_t* cntx, \
+       mem_t*  mem, \
+       thrinfo_t* thread  \
      ); \
 
 INSERT_GENTPROT_BASIC0( packm_sup_init_b )
@@ -95,23 +95,23 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_b )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       bool             will_pack, \
-       packbuf_t        pack_buf_type, \
-       stor3_t          stor_id, \
-       trans_t          transc, \
-       dim_t            k_alloc, \
-       dim_t            n_alloc, \
-       dim_t            k, \
-       dim_t            n, \
-       dim_t            nr, \
-       ctype*  restrict kappa, \
-       ctype*  restrict b, inc_t           rs_b, inc_t           cs_b, \
-       ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
-                                                 inc_t* restrict ps_p, \
-       cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
+       bool      will_pack, \
+       packbuf_t pack_buf_type, \
+       stor3_t   stor_id, \
+       trans_t   transc, \
+       dim_t     k_alloc, \
+       dim_t     n_alloc, \
+       dim_t     k, \
+       dim_t     n, \
+       dim_t     nr, \
+       ctype*    kappa, \
+       ctype*    b, inc_t  rs_b, inc_t  cs_b, \
+       ctype**   p, inc_t* rs_p, inc_t* cs_p, \
+                                 inc_t* ps_p, \
+       cntx_t*   cntx, \
+       rntm_t*   rntm, \
+       mem_t*    mem, \
+       thrinfo_t* thread  \
      ); \
 
 INSERT_GENTPROT_BASIC0( packm_sup_b )
diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c
index 519dc5ccd..54ecab8ff 100644
--- a/frame/3/bli_l3_sup_packm_var.c
+++ b/frame/3/bli_l3_sup_packm_var.c
@@ -44,39 +44,39 @@
 \
 void PASTEMAC(ch,varname) \
      ( \
-       trans_t          transc, \
-       pack_t           schema, \
-       dim_t            m, \
-       dim_t            n, \
-       dim_t            m_max, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict c, inc_t rs_c, inc_t cs_c, \
-       ctype*  restrict p, inc_t rs_p, inc_t cs_p, \
-                           dim_t pd_p, inc_t ps_p, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
+       trans_t transc, \
+       pack_t  schema, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   m_max, \
+       dim_t   n_max, \
+       ctype*  kappa, \
+       ctype*  c, inc_t rs_c, inc_t cs_c, \
+       ctype*  p, inc_t rs_p, inc_t cs_p, \
+                  dim_t pd_p, inc_t ps_p, \
+       cntx_t* cntx, \
+       thrinfo_t* thread  \
      ) \
 { \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict p_cast     = p; \
-\
-	dim_t           iter_dim; \
-	dim_t           n_iter; \
-	dim_t           it, ic; \
-	dim_t           ic0; \
-	doff_t          ic_inc; \
-	dim_t           panel_len_full; \
-	dim_t           panel_len_i; \
-	dim_t           panel_len_max; \
-	dim_t           panel_len_max_i; \
-	dim_t           panel_dim_i; \
-	dim_t           panel_dim_max; \
-	inc_t           vs_c; \
-	inc_t           ldc; \
-	inc_t           ldp, p_inc; \
-	conj_t          conjc; \
+	ctype* kappa_cast = kappa; \
+	ctype* c_cast     = c; \
+	ctype* p_cast     = p; \
+\
+	dim_t  iter_dim; \
+	dim_t  n_iter; \
+	dim_t  it, ic; \
+	dim_t  ic0; \
+	doff_t ic_inc; \
+	dim_t  panel_len_full; \
+	dim_t  panel_len_i; \
+	dim_t  panel_len_max; \
+	dim_t  panel_len_max_i; \
+	dim_t  panel_dim_i; \
+	dim_t  panel_dim_max; \
+	inc_t  vs_c; \
+	inc_t  ldc; \
+	inc_t  ldp, p_inc; \
+	conj_t conjc; \
 \
 \
 	/* Extract the conjugation bit from the transposition argument. */ \
@@ -141,7 +141,7 @@ void PASTEMAC(ch,varname) \
 		ic_inc = panel_dim_max; \
 	} \
 \
-	ctype* restrict p_begin = p_cast; \
+	ctype* p_begin = p_cast; \
 \
 	/* Query the number of threads and thread ids from the current thread's
 	   packm thrinfo_t node. */ \
@@ -166,10 +166,10 @@ void PASTEMAC(ch,varname) \
 	{ \
 		panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
 \
-		ctype* restrict c_begin = c_cast   + (ic  )*vs_c; \
+		ctype* c_begin = c_cast   + (ic  )*vs_c; \
 \
-		ctype* restrict c_use = c_begin; \
-		ctype* restrict p_use = p_begin; \
+		ctype* c_use = c_begin; \
+		ctype* p_use = p_begin; \
 \
 		{ \
 			panel_len_i     = panel_len_full; \
@@ -317,28 +317,28 @@ bli_thread_barrier( thread ); \
 \
 void PASTEMAC(ch,varname) \
      ( \
-       trans_t          transc, \
-       pack_t           schema, \
-       dim_t            m, \
-       dim_t            n, \
-       ctype*  restrict kappa, \
-       ctype*  restrict c, inc_t rs_c, inc_t cs_c, \
-       ctype*  restrict p, inc_t rs_p, inc_t cs_p, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
+       trans_t transc, \
+       pack_t  schema, \
+       dim_t   m, \
+       dim_t   n, \
+       ctype*  kappa, \
+       ctype*  c, inc_t rs_c, inc_t cs_c, \
+       ctype*  p, inc_t rs_p, inc_t cs_p, \
+       cntx_t* cntx, \
+       thrinfo_t* thread  \
      ) \
 { \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict p_cast     = p; \
+	ctype* kappa_cast = kappa; \
+	ctype* c_cast     = c; \
+	ctype* p_cast     = p; \
 \
-	dim_t           iter_dim; \
-	dim_t           n_iter; \
-	dim_t           it; \
-	dim_t           vector_len; \
-	inc_t           incc, ldc; \
-	inc_t           incp, ldp; \
-	conj_t          conjc; \
+	dim_t  iter_dim; \
+	dim_t  n_iter; \
+	dim_t  it; \
+	dim_t  vector_len; \
+	inc_t  incc, ldc; \
+	inc_t  incp, ldp; \
+	conj_t conjc; \
 \
 \
 	/* Extract the conjugation bit from the transposition argument. */ \
@@ -384,7 +384,7 @@ void PASTEMAC(ch,varname) \
 	n_iter = iter_dim; \
 \
 \
-	ctype* restrict p_begin = p_cast; \
+	ctype* p_begin = p_cast; \
 \
 	/* Query the number of threads and thread ids from the current thread's
 	   packm thrinfo_t node. */ \
@@ -406,10 +406,10 @@ void PASTEMAC(ch,varname) \
 	/* Iterate over every logical micropanel in the source matrix. */ \
 	for ( it = 0; it < n_iter; it += 1 ) \
 	{ \
-		ctype* restrict c_begin = c_cast + (it  )*ldc; \
+		ctype* c_begin = c_cast + (it  )*ldc; \
 \
-		ctype* restrict c_use = c_begin; \
-		ctype* restrict p_use = p_begin; \
+		ctype* c_use = c_begin; \
+		ctype* p_use = p_begin; \
 \
 		{ \
 			/* The definition of bli_packm_my_iter() will depend on whether slab
diff --git a/frame/3/bli_l3_sup_packm_var.h b/frame/3/bli_l3_sup_packm_var.h
index 5ccdd3b76..9c62c9c68 100644
--- a/frame/3/bli_l3_sup_packm_var.h
+++ b/frame/3/bli_l3_sup_packm_var.h
@@ -42,18 +42,18 @@
 \
 void PASTEMAC(ch,varname) \
      ( \
-       trans_t          transc, \
-       pack_t           schema, \
-       dim_t            m, \
-       dim_t            n, \
-       dim_t            m_max, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict c, inc_t rs_c, inc_t cs_c, \
-       ctype*  restrict p, inc_t rs_p, inc_t cs_p, \
-                           dim_t pd_p, inc_t ps_p, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
+       trans_t transc, \
+       pack_t  schema, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   m_max, \
+       dim_t   n_max, \
+       ctype*  kappa, \
+       ctype*  c, inc_t rs_c, inc_t cs_c, \
+       ctype*  p, inc_t rs_p, inc_t cs_p, \
+                  dim_t pd_p, inc_t ps_p, \
+       cntx_t* cntx, \
+       thrinfo_t* thread  \
      );
 
 INSERT_GENTPROT_BASIC0( packm_sup_var1 )
@@ -63,15 +63,15 @@ INSERT_GENTPROT_BASIC0( packm_sup_var1 )
 \
 void PASTEMAC(ch,varname) \
      ( \
-       trans_t          transc, \
-       pack_t           schema, \
-       dim_t            m, \
-       dim_t            n, \
-       ctype*  restrict kappa, \
-       ctype*  restrict c, inc_t rs_c, inc_t cs_c, \
-       ctype*  restrict p, inc_t rs_p, inc_t cs_p, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
+       trans_t transc, \
+       pack_t  schema, \
+       dim_t   m, \
+       dim_t   n, \
+       ctype*  kappa, \
+       ctype*  c, inc_t rs_c, inc_t cs_c, \
+       ctype*  p, inc_t rs_p, inc_t cs_p, \
+       cntx_t* cntx, \
+       thrinfo_t* thread  \
      );
 
 INSERT_GENTPROT_BASIC0( packm_sup_var2 )
diff --git a/frame/3/bli_l3_sup_ref.c b/frame/3/bli_l3_sup_ref.c
index f03ec1b18..8eb7a6d4b 100644
--- a/frame/3/bli_l3_sup_ref.c
+++ b/frame/3/bli_l3_sup_ref.c
@@ -36,13 +36,13 @@
 
 err_t bli_gemmsup_ref
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	// This function implements the default gemmsup handler. If you are a
@@ -124,13 +124,13 @@ err_t bli_gemmsup_ref
 
 err_t bli_gemmtsup_ref
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	// This function implements the default gemmtsup handler. If you are a
diff --git a/frame/3/bli_l3_sup_ref.h b/frame/3/bli_l3_sup_ref.h
index bce4e1729..4d4811db3 100644
--- a/frame/3/bli_l3_sup_ref.h
+++ b/frame/3/bli_l3_sup_ref.h
@@ -34,23 +34,23 @@
 
 err_t bli_gemmsup_ref
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      );
 
 err_t bli_gemmtsup_ref
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      );
 
diff --git a/frame/3/bli_l3_sup_var12.c b/frame/3/bli_l3_sup_var12.c
index 106ad86e4..d65482243 100644
--- a/frame/3/bli_l3_sup_var12.c
+++ b/frame/3/bli_l3_sup_var12.c
@@ -38,19 +38,19 @@
 
 typedef void (*FUNCPTR_T)
      (
-       conj_t           conja,
-       conj_t           conjb,
-       dim_t            m,
-       dim_t            n,
-       dim_t            k,
-       void*   restrict alpha,
-       void*   restrict a, inc_t rs_a, inc_t cs_a,
-       void*   restrict b, inc_t rs_b, inc_t cs_b,
-       void*   restrict beta,
-       void*   restrict c, inc_t rs_c, inc_t cs_c,
-       stor3_t          eff_id,
-       cntx_t* restrict cntx,
-       rntm_t* restrict rntm
+       conj_t  conja,
+       conj_t  conjb,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       void*   alpha,
+       void*   a, inc_t rs_a, inc_t cs_a,
+       void*   b, inc_t rs_b, inc_t cs_b,
+       void*   beta,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       stor3_t eff_id,
+       cntx_t* cntx,
+       rntm_t* rntm
      );
 
 #if 0
@@ -95,20 +95,20 @@ void bli_gemmsup_ref_var2
 
 	const dim_t    k         = bli_obj_width( &at );
 
-	void* restrict buf_a     = bli_obj_buffer_at_off( &at );
+	void* buf_a     = bli_obj_buffer_at_off( &at );
 	const inc_t    rs_a      = bli_obj_row_stride( &at );
 	const inc_t    cs_a      = bli_obj_col_stride( &at );
 
-	void* restrict buf_b     = bli_obj_buffer_at_off( &bt );
+	void* buf_b     = bli_obj_buffer_at_off( &bt );
 	const inc_t    rs_b      = bli_obj_row_stride( &bt );
 	const inc_t    cs_b      = bli_obj_col_stride( &bt );
 
-	void* restrict buf_c     = bli_obj_buffer_at_off( c );
+	void* buf_c     = bli_obj_buffer_at_off( c );
 	const inc_t    rs_c      = bli_obj_row_stride( c );
 	const inc_t    cs_c      = bli_obj_col_stride( c );
 
-	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
-	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
+	void* buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
+	void* buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
 
 #else
 
@@ -121,11 +121,11 @@ void bli_gemmsup_ref_var2
 	const dim_t    n         = bli_obj_width( c );
 	      dim_t    k;
 
-	void* restrict buf_a = bli_obj_buffer_at_off( a );
+	void* buf_a = bli_obj_buffer_at_off( a );
 	      inc_t    rs_a;
 	      inc_t    cs_a;
 
-	void* restrict buf_b = bli_obj_buffer_at_off( b );
+	void* buf_b = bli_obj_buffer_at_off( b );
 	      inc_t    rs_b;
 	      inc_t    cs_b;
 
@@ -157,12 +157,12 @@ void bli_gemmsup_ref_var2
 		cs_b  = bli_obj_row_stride( b );
 	}
 
-	void* restrict buf_c     = bli_obj_buffer_at_off( c );
+	void* buf_c     = bli_obj_buffer_at_off( c );
 	const inc_t    rs_c      = bli_obj_row_stride( c );
 	const inc_t    cs_c      = bli_obj_col_stride( c );
 
-	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
-	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
+	void* buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
+	void* buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
 
 #endif
 
@@ -200,14 +200,14 @@ void PASTEMAC(ch,varname) \
        dim_t            m, \
        dim_t            n, \
        dim_t            k, \
-       void*   restrict alpha, \
-       void*   restrict a, inc_t rs_a, inc_t cs_a, \
-       void*   restrict b, inc_t rs_b, inc_t cs_b, \
-       void*   restrict beta, \
-       void*   restrict c, inc_t rs_c, inc_t cs_c, \
+       void*   alpha, \
+       void*   a, inc_t rs_a, inc_t cs_a, \
+       void*   b, inc_t rs_b, inc_t cs_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
        stor3_t          eff_id, \
-       cntx_t* restrict cntx, \
-       rntm_t* restrict rntm  \
+       cntx_t* cntx, \
+       rntm_t* rntm  \
      ) \
 { \
 	/* If any dimension is zero, return immediately. */ \
@@ -266,13 +266,13 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,gemmsup_ker_ft) \
                gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
 \
-	ctype* restrict a_00       = a; \
-	ctype* restrict b_00       = b; \
-	ctype* restrict c_00       = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
+	ctype* a_00       = a; \
+	ctype* b_00       = b; \
+	ctype* c_00       = c; \
+	ctype* alpha_cast = alpha; \
+	ctype* beta_cast  = beta; \
 \
-	ctype* restrict one        = PASTEMAC(ch,1); \
+	ctype* one        = PASTEMAC(ch,1); \
 \
 	auxinfo_t       aux; \
 \
@@ -305,8 +305,8 @@ void PASTEMAC(ch,varname) \
 	{ \
 		const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
 \
-		ctype* restrict b_jc = b_00 + jj * jcstep_b; \
-		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
+		ctype* b_jc = b_00 + jj * jcstep_b; \
+		ctype* c_jc = c_00 + jj * jcstep_c; \
 \
 		const dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
 		const dim_t jr_left =   nc_cur % NR; \
@@ -316,19 +316,19 @@ void PASTEMAC(ch,varname) \
 		{ \
 			const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
 \
-			ctype* restrict a_pc = a_00 + pp * pcstep_a; \
-			ctype* restrict b_pc = b_jc + pp * pcstep_b; \
+			ctype* a_pc = a_00 + pp * pcstep_a; \
+			ctype* b_pc = b_jc + pp * pcstep_b; \
 \
 			/* Only apply beta to the first iteration of the pc loop. */ \
-			ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
+			ctype* beta_use = ( pp == 0 ? beta_cast : one ); \
 \
 			/* Loop over the m dimension (MC rows at a time). */ \
 			for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
 			{ \
 				const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
 \
-				ctype* restrict a_ic = a_pc + ii * icstep_a; \
-				ctype* restrict c_ic = c_jc + ii * icstep_c; \
+				ctype* a_ic = a_pc + ii * icstep_a; \
+				ctype* c_ic = c_jc + ii * icstep_c; \
 \
 				const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
 				const dim_t ir_left =   mc_cur % MR; \
@@ -338,11 +338,11 @@ void PASTEMAC(ch,varname) \
 				{ \
 					const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
 \
-					ctype* restrict b_jr = b_pc + j * jrstep_b; \
-					ctype* restrict c_jr = c_ic + j * jrstep_c; \
+					ctype* b_jr = b_pc + j * jrstep_b; \
+					ctype* c_jr = c_ic + j * jrstep_c; \
 \
 /*
-					ctype* restrict b2 = b_jr; \
+					ctype* b2 = b_jr; \
 */ \
 \
 					/* Loop over the m dimension (MR rows at a time). */ \
@@ -350,13 +350,13 @@ void PASTEMAC(ch,varname) \
 					{ \
 						const dim_t mr_cur = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
 \
-						ctype* restrict a_ir = a_ic + i * irstep_a; \
-						ctype* restrict c_ir = c_jr + i * irstep_c; \
+						ctype* a_ir = a_ic + i * irstep_a; \
+						ctype* c_ir = c_jr + i * irstep_c; \
 \
 						/* Save addresses of next panels of A and B to the auxinfo_t
 						   object. */ \
 /*
-						ctype* restrict a2 = bli_gemm_get_next_a_upanel( a_ir, irstep_a, ir_inc ); \
+						ctype* a2 = bli_gemm_get_next_a_upanel( a_ir, irstep_a, ir_inc ); \
 						if ( bli_is_last_iter( i, ir_iter, 0, 1 ) ) \
 						{ \
 							a2 = a_00; \
@@ -442,20 +442,20 @@ void bli_gemmsup_ref_var1
 
 	const dim_t    k         = bli_obj_width( &at );
 
-	void* restrict buf_a     = bli_obj_buffer_at_off( &at );
+	void* buf_a     = bli_obj_buffer_at_off( &at );
 	const inc_t    rs_a      = bli_obj_row_stride( &at );
 	const inc_t    cs_a      = bli_obj_col_stride( &at );
 
-	void* restrict buf_b     = bli_obj_buffer_at_off( &bt );
+	void* buf_b     = bli_obj_buffer_at_off( &bt );
 	const inc_t    rs_b      = bli_obj_row_stride( &bt );
 	const inc_t    cs_b      = bli_obj_col_stride( &bt );
 
-	void* restrict buf_c     = bli_obj_buffer_at_off( c );
+	void* buf_c     = bli_obj_buffer_at_off( c );
 	const inc_t    rs_c      = bli_obj_row_stride( c );
 	const inc_t    cs_c      = bli_obj_col_stride( c );
 
-	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
-	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
+	void* buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
+	void* buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
 
 #else
 
@@ -468,11 +468,11 @@ void bli_gemmsup_ref_var1
 	const dim_t    n         = bli_obj_width( c );
 	      dim_t    k;
 
-	void* restrict buf_a = bli_obj_buffer_at_off( a );
+	void* buf_a = bli_obj_buffer_at_off( a );
 	      inc_t    rs_a;
 	      inc_t    cs_a;
 
-	void* restrict buf_b = bli_obj_buffer_at_off( b );
+	void* buf_b = bli_obj_buffer_at_off( b );
 	      inc_t    rs_b;
 	      inc_t    cs_b;
 
@@ -504,12 +504,12 @@ void bli_gemmsup_ref_var1
 		cs_b  = bli_obj_row_stride( b );
 	}
 
-	void* restrict buf_c     = bli_obj_buffer_at_off( c );
+	void* buf_c     = bli_obj_buffer_at_off( c );
 	const inc_t    rs_c      = bli_obj_row_stride( c );
 	const inc_t    cs_c      = bli_obj_col_stride( c );
 
-	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
-	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
+	void* buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
+	void* buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
 
 #endif
 
@@ -547,14 +547,14 @@ void PASTEMAC(ch,varname) \
        dim_t            m, \
        dim_t            n, \
        dim_t            k, \
-       void*   restrict alpha, \
-       void*   restrict a, inc_t rs_a, inc_t cs_a, \
-       void*   restrict b, inc_t rs_b, inc_t cs_b, \
-       void*   restrict beta, \
-       void*   restrict c, inc_t rs_c, inc_t cs_c, \
+       void*   alpha, \
+       void*   a, inc_t rs_a, inc_t cs_a, \
+       void*   b, inc_t rs_b, inc_t cs_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
        stor3_t          eff_id, \
-       cntx_t* restrict cntx, \
-       rntm_t* restrict rntm  \
+       cntx_t* cntx, \
+       rntm_t* rntm  \
      ) \
 { \
 	/* If any dimension is zero, return immediately. */ \
@@ -617,13 +617,13 @@ void PASTEMAC(ch,varname) \
 	PASTECH(ch,gemmsup_ker_ft) \
                gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
 \
-	ctype* restrict a_00       = a; \
-	ctype* restrict b_00       = b; \
-	ctype* restrict c_00       = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
+	ctype* a_00       = a; \
+	ctype* b_00       = b; \
+	ctype* c_00       = c; \
+	ctype* alpha_cast = alpha; \
+	ctype* beta_cast  = beta; \
 \
-	ctype* restrict one        = PASTEMAC(ch,1); \
+	ctype* one        = PASTEMAC(ch,1); \
 \
 	auxinfo_t       aux; \
 \
@@ -656,8 +656,8 @@ void PASTEMAC(ch,varname) \
 	{ \
 		const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
 \
-		ctype* restrict a_jc = a_00 + jj * jcstep_a; \
-		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
+		ctype* a_jc = a_00 + jj * jcstep_a; \
+		ctype* c_jc = c_00 + jj * jcstep_c; \
 \
 		const dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
 		const dim_t jr_left =   nc_cur % MR; \
@@ -667,19 +667,19 @@ void PASTEMAC(ch,varname) \
 		{ \
 			const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
 \
-			ctype* restrict a_pc = a_jc + pp * pcstep_a; \
-			ctype* restrict b_pc = b_00 + pp * pcstep_b; \
+			ctype* a_pc = a_jc + pp * pcstep_a; \
+			ctype* b_pc = b_00 + pp * pcstep_b; \
 \
 			/* Only apply beta to the first iteration of the pc loop. */ \
-			ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
+			ctype* beta_use = ( pp == 0 ? beta_cast : one ); \
 \
 			/* Loop over the n dimension (MC rows at a time). */ \
 			for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
 			{ \
 				const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
 \
-				ctype* restrict b_ic = b_pc + ii * icstep_b; \
-				ctype* restrict c_ic = c_jc + ii * icstep_c; \
+				ctype* b_ic = b_pc + ii * icstep_b; \
+				ctype* c_ic = c_jc + ii * icstep_c; \
 \
 				const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \
 				const dim_t ir_left =   mc_cur % NR; \
@@ -689,16 +689,16 @@ void PASTEMAC(ch,varname) \
 				{ \
 					const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
 \
-					ctype* restrict a_jr = a_pc + j * jrstep_a; \
-					ctype* restrict c_jr = c_ic + j * jrstep_c; \
+					ctype* a_jr = a_pc + j * jrstep_a; \
+					ctype* c_jr = c_ic + j * jrstep_c; \
 \
 					/* Loop over the n dimension (MR rows at a time). */ \
 					for ( dim_t i = 0; i < ir_iter; i += ir_inc ) \
 					{ \
 						const dim_t mr_cur = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
 \
-						ctype* restrict b_ir = b_ic + i * irstep_b; \
-						ctype* restrict c_ir = c_jr + i * irstep_c; \
+						ctype* b_ir = b_ic + i * irstep_b; \
+						ctype* c_ir = c_jr + i * irstep_c; \
 \
 						/* Invoke the gemmsup micro-kernel. */ \
 						gemmsup_ker \
diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c
index acc4c3071..a5d66783f 100644
--- a/frame/3/bli_l3_sup_var1n2m.c
+++ b/frame/3/bli_l3_sup_var1n2m.c
@@ -38,22 +38,22 @@
 
 typedef void (*FUNCPTR_T)
      (
-       bool             packa,
-       bool             packb,
-       conj_t           conja,
-       conj_t           conjb,
-       dim_t            m,
-       dim_t            n,
-       dim_t            k,
-       void*   restrict alpha,
-       void*   restrict a, inc_t rs_a, inc_t cs_a,
-       void*   restrict b, inc_t rs_b, inc_t cs_b,
-       void*   restrict beta,
-       void*   restrict c, inc_t rs_c, inc_t cs_c,
-       stor3_t          eff_id,
-       cntx_t* restrict cntx,
-       rntm_t* restrict rntm,
-       thrinfo_t* restrict thread
+       bool       packa,
+       bool       packb,
+       conj_t     conja,
+       conj_t     conjb,
+       dim_t      m,
+       dim_t      n,
+       dim_t      k,
+       void*      alpha,
+       void*      a, inc_t rs_a, inc_t cs_a,
+       void*      b, inc_t rs_b, inc_t cs_b,
+       void*      beta,
+       void*      c, inc_t rs_c, inc_t cs_c,
+       stor3_t    eff_id,
+       cntx_t*    cntx,
+       rntm_t*    rntm,
+       thrinfo_t* thread
      );
 
 //
@@ -64,16 +64,16 @@ static FUNCPTR_T GENARRAY(ftypes_var1n,gemmsup_ref_var1n);
 
 void bli_gemmsup_ref_var1n
      (
-       trans_t trans,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       stor3_t eff_id,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
+             trans_t trans,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+             stor3_t eff_id,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             thrinfo_t* thread
      )
 {
 #if 0
@@ -98,41 +98,41 @@ void bli_gemmsup_ref_var1n
 
 	const dim_t    k         = bli_obj_width( &at );
 
-	void* restrict buf_a     = bli_obj_buffer_at_off( &at );
+	void* buf_a     = bli_obj_buffer_at_off( &at );
 	const inc_t    rs_a      = bli_obj_row_stride( &at );
 	const inc_t    cs_a      = bli_obj_col_stride( &at );
 
-	void* restrict buf_b     = bli_obj_buffer_at_off( &bt );
+	void* buf_b     = bli_obj_buffer_at_off( &bt );
 	const inc_t    rs_b      = bli_obj_row_stride( &bt );
 	const inc_t    cs_b      = bli_obj_col_stride( &bt );
 
-	void* restrict buf_c     = bli_obj_buffer_at_off( c );
+	void* buf_c     = bli_obj_buffer_at_off( c );
 	const inc_t    rs_c      = bli_obj_row_stride( c );
 	const inc_t    cs_c      = bli_obj_col_stride( c );
 
-	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
-	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
+	void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
+	void* buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
 
 #else
-	const num_t    dt        = bli_obj_dt( c );
+	const num_t  dt    = bli_obj_dt( c );
 
-	const bool     packa     = bli_rntm_pack_a( rntm );
-	const bool     packb     = bli_rntm_pack_b( rntm );
+	const bool   packa = bli_rntm_pack_a( rntm );
+	const bool   packb = bli_rntm_pack_b( rntm );
 
-	const conj_t   conja     = bli_obj_conj_status( a );
-	const conj_t   conjb     = bli_obj_conj_status( b );
+	const conj_t conja = bli_obj_conj_status( a );
+	const conj_t conjb = bli_obj_conj_status( b );
 
-	const dim_t    m         = bli_obj_length( c );
-	const dim_t    n         = bli_obj_width( c );
-	      dim_t    k;
+	const dim_t  m     = bli_obj_length( c );
+	const dim_t  n     = bli_obj_width( c );
+	      dim_t  k;
 
-	void* restrict buf_a = bli_obj_buffer_at_off( a );
-	      inc_t    rs_a;
-	      inc_t    cs_a;
+	const void*  buf_a = bli_obj_buffer_at_off( a );
+	      inc_t  rs_a;
+	      inc_t  cs_a;
 
-	void* restrict buf_b = bli_obj_buffer_at_off( b );
-	      inc_t    rs_b;
-	      inc_t    cs_b;
+	const void*  buf_b = bli_obj_buffer_at_off( b );
+	      inc_t  rs_b;
+	      inc_t  cs_b;
 
 	if ( bli_obj_has_notrans( a ) )
 	{
@@ -162,12 +162,12 @@ void bli_gemmsup_ref_var1n
 		cs_b  = bli_obj_row_stride( b );
 	}
 
-	void* restrict buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t    rs_c      = bli_obj_row_stride( c );
-	const inc_t    cs_c      = bli_obj_col_stride( c );
+	      void* buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t rs_c      = bli_obj_row_stride( c );
+	const inc_t cs_c      = bli_obj_col_stride( c );
 
-	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
-	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
+	const void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
+	const void* buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
 
 #endif
 
@@ -193,13 +193,13 @@ void bli_gemmsup_ref_var1n
 		  m,
 		  n,
 		  k,
-		  buf_alpha,
-		  buf_a, rs_a, cs_a,
-		  buf_b, rs_b, cs_b,
-		  buf_beta,
-		  buf_c, rs_c, cs_c,
+		  ( void* )buf_alpha,
+		  ( void* )buf_a, rs_a, cs_a,
+		  ( void* )buf_b, rs_b, cs_b,
+		  ( void* )buf_beta,
+		           buf_c, rs_c, cs_c,
 		  eff_id,
-		  cntx,
+		  ( cntx_t* )cntx,
 		  rntm,
 		  thread
 		);
@@ -216,13 +216,13 @@ void bli_gemmsup_ref_var1n
 		  n,                 // swap the m and n dimensions.
 		  m,
 		  k,
-		  buf_alpha,
-		  buf_b, cs_b, rs_b, // swap the positions of A and B.
-		  buf_a, cs_a, rs_a, // swap the strides of A and B.
-		  buf_beta,
-		  buf_c, cs_c, rs_c, // swap the strides of C.
+		  ( void* )buf_alpha,
+		  ( void* )buf_b, cs_b, rs_b, // swap the positions of A and B.
+		  ( void* )buf_a, cs_a, rs_a, // swap the strides of A and B.
+		  ( void* )buf_beta,
+		           buf_c, cs_c, rs_c, // swap the strides of C.
 		  bli_stor3_trans( eff_id ), // transpose the stor3_t id.
-		  cntx,
+          ( cntx_t* )cntx,
 		  rntm,
 		  thread
 		);
@@ -235,22 +235,22 @@ void bli_gemmsup_ref_var1n
 \
 void PASTEMAC(ch,varname) \
      ( \
-       bool             packa, \
-       bool             packb, \
-       conj_t           conja, \
-       conj_t           conjb, \
-       dim_t            m, \
-       dim_t            n, \
-       dim_t            k, \
-       void*   restrict alpha, \
-       void*   restrict a, inc_t rs_a, inc_t cs_a, \
-       void*   restrict b, inc_t rs_b, inc_t cs_b, \
-       void*   restrict beta, \
-       void*   restrict c, inc_t rs_c, inc_t cs_c, \
-       stor3_t          stor_id, \
-       cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       thrinfo_t* restrict thread  \
+       bool       packa, \
+       bool       packb, \
+       conj_t     conja, \
+       conj_t     conjb, \
+       dim_t      m, \
+       dim_t      n, \
+       dim_t      k, \
+       void*      alpha, \
+       void*      a, inc_t rs_a, inc_t cs_a, \
+       void*      b, inc_t rs_b, inc_t cs_b, \
+       void*      beta, \
+       void*      c, inc_t rs_c, inc_t cs_c, \
+       stor3_t    stor_id, \
+       cntx_t*    cntx, \
+       rntm_t*    rntm, \
+       thrinfo_t* thread  \
      ) \
 { \
 	const num_t dt = PASTEMAC(ch,type); \
@@ -365,20 +365,20 @@ void PASTEMAC(ch,varname) \
 	/* Query the context for the sup microkernel address and cast it to its
 	   function pointer type. */ \
 	PASTECH(ch,gemmsup_ker_ft) \
-               gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
+	    gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
 \
-	ctype* restrict a_00       = a; \
-	ctype* restrict b_00       = b; \
-	ctype* restrict c_00       = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
+	ctype* a_00       = a; \
+	ctype* b_00       = b; \
+	ctype* c_00       = c; \
+	ctype* alpha_cast = alpha; \
+	ctype* beta_cast  = beta; \
 \
 	/* Make local copies of beta and one scalars to prevent any unnecessary
 	   sharing of cache lines between the cores' caches. */ \
 	ctype           beta_local = *beta_cast; \
 	ctype           one_local  = *PASTEMAC(ch,1); \
 \
-	auxinfo_t       aux; \
+	auxinfo_t aux; \
 \
 	/* Parse and interpret the contents of the rntm_t object to properly
 	   set the ways of parallelism for each loop. */ \
@@ -408,12 +408,12 @@ void PASTEMAC(ch,varname) \
 	   That is, this panel-block algorithm partitions an NC x KC submatrix
 	   of A to be packed in the 4th loop, and a KC x MC submatrix of B
 	   to be packed in the 3rd loop. */ \
-	/*                           5thloop  4thloop         packa  3rdloop         packb  2ndloop  1stloop  ukrloop */ \
+	/*                                  5thloop  4thloop         packa  3rdloop         packb  2ndloop  1stloop  ukrloop */ \
 	bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC,               BLIS_MC,               BLIS_NR, BLIS_MR, BLIS_KR }; \
 	bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC,               BLIS_NR, BLIS_MR, BLIS_KR }; \
 	bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC,               BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
 	bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
-	bszid_t* restrict bszids; \
+	bszid_t* bszids; \
 \
 	/* Set the bszids pointer to the correct bszids array above based on which
 	   matrices (if any) are being packed. */ \
@@ -425,16 +425,16 @@ void PASTEMAC(ch,varname) \
 	/* Determine whether we are using more than one thread. */ \
 	const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \
 \
-	thrinfo_t* restrict thread_jc = NULL; \
-	thrinfo_t* restrict thread_pc = NULL; \
-	thrinfo_t* restrict thread_pa = NULL; \
-	thrinfo_t* restrict thread_ic = NULL; \
-	thrinfo_t* restrict thread_pb = NULL; \
-	thrinfo_t* restrict thread_jr = NULL; \
+	thrinfo_t* thread_jc = NULL; \
+	thrinfo_t* thread_pc = NULL; \
+	thrinfo_t* thread_pa = NULL; \
+	thrinfo_t* thread_ic = NULL; \
+	thrinfo_t* thread_pb = NULL; \
+	thrinfo_t* thread_jr = NULL; \
 \
 	/* Grow the thrinfo_t tree. */ \
-	bszid_t*   restrict bszids_jc = bszids; \
-	                    thread_jc = thread; \
+	bszid_t*   bszids_jc = bszids; \
+	               thread_jc = thread; \
 	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
 \
 	/* Compute the JC loop thread range for the current thread. */ \
@@ -453,12 +453,12 @@ void PASTEMAC(ch,varname) \
 		/* Calculate the thread's current JC block dimension. */ \
 		const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
 \
-		ctype* restrict a_jc = a_00 + jj * jcstep_a; \
-		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
+		ctype* a_jc = a_00 + jj * jcstep_a; \
+		ctype* c_jc = c_00 + jj * jcstep_c; \
 \
 		/* Grow the thrinfo_t tree. */ \
-		bszid_t*   restrict bszids_pc = &bszids_jc[1]; \
-		                    thread_pc = bli_thrinfo_sub_node( thread_jc ); \
+		bszid_t*   bszids_pc = &bszids_jc[1]; \
+		               thread_pc = bli_thrinfo_sub_node( thread_jc ); \
 		bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
 \
 		/* Compute the PC loop thread range for the current thread. */ \
@@ -476,14 +476,14 @@ void PASTEMAC(ch,varname) \
 			/* Calculate the thread's current PC block dimension. */ \
 			const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
 \
-			ctype* restrict a_pc = a_jc + pp * pcstep_a; \
-			ctype* restrict b_pc = b_00 + pp * pcstep_b; \
+			ctype* a_pc = a_jc + pp * pcstep_a; \
+			ctype* b_pc = b_00 + pp * pcstep_b; \
 \
 			/* Only apply beta to the first iteration of the pc loop. */ \
-			ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
+			ctype* beta_use = ( pp == 0 ? &beta_local : &one_local ); \
 \
 			ctype* a_use; \
-			inc_t  rs_a_use, cs_a_use, ps_a_use; \
+			      inc_t  rs_a_use, cs_a_use, ps_a_use; \
 \
 			/* Set the bszid_t array and thrinfo_t pointer based on whether
 			   we will be packing A. If we won't be packing A, we alias to
@@ -493,7 +493,7 @@ void PASTEMAC(ch,varname) \
 			   previous call to bli_thrinfo_grow(), since bszid values of
 			   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
 			   bszid that is a normal bszid_t value). */ \
-			bszid_t*   restrict bszids_pa; \
+			bszid_t*   bszids_pa; \
 			if ( packa ) { bszids_pa = &bszids_pc[1]; \
 			               thread_pa = bli_thrinfo_sub_node( thread_pc ); } \
 			else         { bszids_pa = &bszids_pc[0]; \
@@ -526,7 +526,7 @@ void PASTEMAC(ch,varname) \
 \
 			/* Alias a_use so that it's clear this is our current block of
 			   matrix A. */ \
-			ctype* restrict a_pc_use = a_use; \
+			ctype* a_pc_use = a_use; \
 \
 			/* We don't need to embed the panel stride of A within the auxinfo_t
 			   object because this variant iterates through A in the jr loop,
@@ -535,8 +535,8 @@ void PASTEMAC(ch,varname) \
 			/*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \
 \
 			/* Grow the thrinfo_t tree. */ \
-			bszid_t*   restrict bszids_ic = &bszids_pa[1]; \
-			                    thread_ic = bli_thrinfo_sub_node( thread_pa ); \
+			bszid_t*   bszids_ic = &bszids_pa[1]; \
+			               thread_ic = bli_thrinfo_sub_node( thread_pa ); \
 			bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
 \
 			/* Compute the IC loop thread range for the current thread. */ \
@@ -555,11 +555,11 @@ void PASTEMAC(ch,varname) \
 				/* Calculate the thread's current IC block dimension. */ \
 				const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
 \
-				ctype* restrict b_ic = b_pc + ii * icstep_b; \
-				ctype* restrict c_ic = c_jc + ii * icstep_c; \
+				ctype* b_ic = b_pc + ii * icstep_b; \
+				ctype* c_ic = c_jc + ii * icstep_c; \
 \
 				ctype* b_use; \
-				inc_t  rs_b_use, cs_b_use, ps_b_use; \
+				      inc_t  rs_b_use, cs_b_use, ps_b_use; \
 \
 				/* Set the bszid_t array and thrinfo_t pointer based on whether
 				   we will be packing A. If we won't be packing A, we alias to
@@ -569,7 +569,7 @@ void PASTEMAC(ch,varname) \
 				   previous call to bli_thrinfo_grow(), since bszid values of
 				   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
 				   bszid that is a normal bszid_t value). */ \
-				bszid_t*   restrict bszids_pb; \
+				bszid_t*   bszids_pb; \
 				if ( packb ) { bszids_pb = &bszids_ic[1]; \
 							   thread_pb = bli_thrinfo_sub_node( thread_ic ); } \
 				else         { bszids_pb = &bszids_ic[0]; \
@@ -602,7 +602,7 @@ void PASTEMAC(ch,varname) \
 \
 				/* Alias b_use so that it's clear this is our current block of
 				   matrix B. */ \
-				ctype* restrict b_ic_use = b_use; \
+				ctype* b_ic_use = b_use; \
 \
 				/* Embed the panel stride of B within the auxinfo_t object. The
 				   millikernel will query and use this to iterate through
@@ -610,8 +610,8 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_ps_b( ps_b_use, &aux ); \
 \
 				/* Grow the thrinfo_t tree. */ \
-				bszid_t*   restrict bszids_jr = &bszids_pb[1]; \
-				                    thread_jr = bli_thrinfo_sub_node( thread_pb ); \
+				bszid_t*   bszids_jr = &bszids_pb[1]; \
+				               thread_jr = bli_thrinfo_sub_node( thread_pb ); \
 				bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
 \
 				/* Compute number of primary and leftover components of the JR loop. */ \
@@ -640,10 +640,10 @@ void PASTEMAC(ch,varname) \
 					const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); \
 \
 					/*
-					ctype* restrict a_jr = a_pc + j * jrstep_a; \
+					ctype* a_jr = a_pc + j * jrstep_a; \
 					*/ \
-					ctype* restrict a_jr = a_pc_use + j * ps_a_use; \
-					ctype* restrict c_jr = c_ic     + j * jrstep_c; \
+					ctype* a_jr = a_pc_use + j * ps_a_use; \
+					ctype* c_jr = c_ic     + j * jrstep_c; \
 \
 					/*
 					const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \
@@ -664,7 +664,7 @@ void PASTEMAC(ch,varname) \
 						  a_jr,     rs_a_use, cs_a_use, \
 						  b_ic_use, rs_b_use, cs_b_use, \
 						  beta_use, \
-						  c_jr,     rs_c,     cs_c, \
+						            c_jr,     rs_c,     cs_c, \
 						  &aux, \
 						  cntx  \
 						); \
@@ -712,16 +712,16 @@ static FUNCPTR_T GENARRAY(ftypes_var2m,gemmsup_ref_var2m);
 
 void bli_gemmsup_ref_var2m
      (
-       trans_t trans,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       stor3_t eff_id,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
+             trans_t    trans,
+       const obj_t*     alpha,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     beta,
+       const obj_t*     c,
+             stor3_t    eff_id,
+       const cntx_t*    cntx,
+             rntm_t*    rntm,
+             thrinfo_t* thread
      )
 {
 #if 0
@@ -746,41 +746,41 @@ void bli_gemmsup_ref_var2m
 
 	const dim_t    k         = bli_obj_width( &at );
 
-	void* restrict buf_a     = bli_obj_buffer_at_off( &at );
+	void* buf_a     = bli_obj_buffer_at_off( &at );
 	const inc_t    rs_a      = bli_obj_row_stride( &at );
 	const inc_t    cs_a      = bli_obj_col_stride( &at );
 
-	void* restrict buf_b     = bli_obj_buffer_at_off( &bt );
+	void* buf_b     = bli_obj_buffer_at_off( &bt );
 	const inc_t    rs_b      = bli_obj_row_stride( &bt );
 	const inc_t    cs_b      = bli_obj_col_stride( &bt );
 
-	void* restrict buf_c     = bli_obj_buffer_at_off( c );
+	void* buf_c     = bli_obj_buffer_at_off( c );
 	const inc_t    rs_c      = bli_obj_row_stride( c );
 	const inc_t    cs_c      = bli_obj_col_stride( c );
 
-	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
-	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
+	void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
+	void* buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
 
 #else
-	const num_t    dt        = bli_obj_dt( c );
+	const num_t  dt    = bli_obj_dt( c );
 
-	const bool     packa     = bli_rntm_pack_a( rntm );
-	const bool     packb     = bli_rntm_pack_b( rntm );
+	const bool   packa = bli_rntm_pack_a( rntm );
+	const bool   packb = bli_rntm_pack_b( rntm );
 
-	const conj_t   conja     = bli_obj_conj_status( a );
-	const conj_t   conjb     = bli_obj_conj_status( b );
+	const conj_t conja = bli_obj_conj_status( a );
+	const conj_t conjb = bli_obj_conj_status( b );
 
-	const dim_t    m         = bli_obj_length( c );
-	const dim_t    n         = bli_obj_width( c );
-	      dim_t    k;
+	const dim_t  m     = bli_obj_length( c );
+	const dim_t  n     = bli_obj_width( c );
+	      dim_t  k;
 
-	void* restrict buf_a = bli_obj_buffer_at_off( a );
-	      inc_t    rs_a;
-	      inc_t    cs_a;
+	const void*  buf_a = bli_obj_buffer_at_off( a );
+	      inc_t  rs_a;
+	      inc_t  cs_a;
 
-	void* restrict buf_b = bli_obj_buffer_at_off( b );
-	      inc_t    rs_b;
-	      inc_t    cs_b;
+	const void*  buf_b = bli_obj_buffer_at_off( b );
+	      inc_t  rs_b;
+	      inc_t  cs_b;
 
 	if ( bli_obj_has_notrans( a ) )
 	{
@@ -810,12 +810,12 @@ void bli_gemmsup_ref_var2m
 		cs_b  = bli_obj_row_stride( b );
 	}
 
-	void* restrict buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t    rs_c      = bli_obj_row_stride( c );
-	const inc_t    cs_c      = bli_obj_col_stride( c );
+	      void* buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t rs_c      = bli_obj_row_stride( c );
+	const inc_t cs_c      = bli_obj_col_stride( c );
 
-	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
-	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
+	const void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
+	const void* buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
 
 #endif
 
@@ -841,13 +841,13 @@ void bli_gemmsup_ref_var2m
 		  m,
 		  n,
 		  k,
-		  buf_alpha,
-		  buf_a, rs_a, cs_a,
-		  buf_b, rs_b, cs_b,
-		  buf_beta,
-		  buf_c, rs_c, cs_c,
+		  ( void* )buf_alpha,
+		  ( void* )buf_a, rs_a, cs_a,
+		  ( void* )buf_b, rs_b, cs_b,
+		  ( void* )buf_beta,
+		           buf_c, rs_c, cs_c,
 		  eff_id,
-		  cntx,
+		  ( cntx_t* )cntx,
 		  rntm,
 		  thread
 		);
@@ -864,13 +864,13 @@ void bli_gemmsup_ref_var2m
 		  n,                 // swap the m and n dimensions.
 		  m,
 		  k,
-		  buf_alpha,
-		  buf_b, cs_b, rs_b, // swap the positions of A and B.
-		  buf_a, cs_a, rs_a, // swap the strides of A and B.
-		  buf_beta,
-		  buf_c, cs_c, rs_c, // swap the strides of C.
+		  ( void* )buf_alpha,
+		  ( void* )buf_b, cs_b, rs_b, // swap the positions of A and B.
+		  ( void* )buf_a, cs_a, rs_a, // swap the strides of A and B.
+		  ( void* )buf_beta,
+		           buf_c, cs_c, rs_c, // swap the strides of C.
 		  bli_stor3_trans( eff_id ), // transpose the stor3_t id.
-		  cntx,
+		  ( cntx_t* )cntx,
 		  rntm,
 		  thread
 		);
@@ -883,22 +883,22 @@ void bli_gemmsup_ref_var2m
 \
 void PASTEMAC(ch,varname) \
      ( \
-       bool             packa, \
-       bool             packb, \
-       conj_t           conja, \
-       conj_t           conjb, \
-       dim_t            m, \
-       dim_t            n, \
-       dim_t            k, \
-       void*   restrict alpha, \
-       void*   restrict a, inc_t rs_a, inc_t cs_a, \
-       void*   restrict b, inc_t rs_b, inc_t cs_b, \
-       void*   restrict beta, \
-       void*   restrict c, inc_t rs_c, inc_t cs_c, \
-       stor3_t          stor_id, \
-       cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       thrinfo_t* restrict thread  \
+       bool       packa, \
+       bool       packb, \
+       conj_t     conja, \
+       conj_t     conjb, \
+       dim_t      m, \
+       dim_t      n, \
+       dim_t      k, \
+       void*      alpha, \
+       void*      a, inc_t rs_a, inc_t cs_a, \
+       void*      b, inc_t rs_b, inc_t cs_b, \
+       void*      beta, \
+       void*      c, inc_t rs_c, inc_t cs_c, \
+       stor3_t    stor_id, \
+       cntx_t*    cntx, \
+       rntm_t*    rntm, \
+       thrinfo_t* thread  \
      ) \
 { \
 	const num_t dt = PASTEMAC(ch,type); \
@@ -998,13 +998,13 @@ void PASTEMAC(ch,varname) \
 	/* Query the context for the sup microkernel address and cast it to its
 	   function pointer type. */ \
 	PASTECH(ch,gemmsup_ker_ft) \
-               gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
+        gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
 \
-	ctype* restrict a_00       = a; \
-	ctype* restrict b_00       = b; \
-	ctype* restrict c_00       = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
+	ctype* a_00       = a; \
+	ctype* b_00       = b; \
+	ctype* c_00       = c; \
+	ctype* alpha_cast = alpha; \
+	ctype* beta_cast  = beta; \
 \
 	/* Make local copies of beta and one scalars to prevent any unnecessary
 	   sharing of cache lines between the cores' caches. */ \
@@ -1035,7 +1035,7 @@ void PASTEMAC(ch,varname) \
 	bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC,               BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
 	bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC,               BLIS_NR, BLIS_MR, BLIS_KR }; \
 	bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
-	bszid_t* restrict bszids; \
+	bszid_t* bszids; \
 \
 	/* Set the bszids pointer to the correct bszids array above based on which
 	   matrices (if any) are being packed. */ \
@@ -1047,16 +1047,16 @@ void PASTEMAC(ch,varname) \
 	/* Determine whether we are using more than one thread. */ \
 	const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \
 \
-	thrinfo_t* restrict thread_jc = NULL; \
-	thrinfo_t* restrict thread_pc = NULL; \
-	thrinfo_t* restrict thread_pb = NULL; \
-	thrinfo_t* restrict thread_ic = NULL; \
-	thrinfo_t* restrict thread_pa = NULL; \
-	thrinfo_t* restrict thread_jr = NULL; \
+	thrinfo_t* thread_jc = NULL; \
+	thrinfo_t* thread_pc = NULL; \
+	thrinfo_t* thread_pb = NULL; \
+	thrinfo_t* thread_ic = NULL; \
+	thrinfo_t* thread_pa = NULL; \
+	thrinfo_t* thread_jr = NULL; \
 \
 	/* Grow the thrinfo_t tree. */ \
-	bszid_t*   restrict bszids_jc = bszids; \
-	                    thread_jc = thread; \
+	bszid_t*   bszids_jc = bszids; \
+	               thread_jc = thread; \
 	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
 \
 	/* Compute the JC loop thread range for the current thread. */ \
@@ -1075,12 +1075,12 @@ void PASTEMAC(ch,varname) \
 		/* Calculate the thread's current JC block dimension. */ \
 		const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
 \
-		ctype* restrict b_jc = b_00 + jj * jcstep_b; \
-		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
+		ctype* b_jc = b_00 + jj * jcstep_b; \
+		ctype* c_jc = c_00 + jj * jcstep_c; \
 \
 		/* Grow the thrinfo_t tree. */ \
-		bszid_t*   restrict bszids_pc = &bszids_jc[1]; \
-		                    thread_pc = bli_thrinfo_sub_node( thread_jc ); \
+		bszid_t*   bszids_pc = &bszids_jc[1]; \
+		               thread_pc = bli_thrinfo_sub_node( thread_jc ); \
 		bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
 \
 		/* Compute the PC loop thread range for the current thread. */ \
@@ -1098,11 +1098,11 @@ void PASTEMAC(ch,varname) \
 			/* Calculate the thread's current PC block dimension. */ \
 			const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
 \
-			ctype* restrict a_pc = a_00 + pp * pcstep_a; \
-			ctype* restrict b_pc = b_jc + pp * pcstep_b; \
+			ctype* a_pc = a_00 + pp * pcstep_a; \
+			ctype* b_pc = b_jc + pp * pcstep_b; \
 \
 			/* Only apply beta to the first iteration of the pc loop. */ \
-			ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
+			ctype* beta_use = ( pp == 0 ? &beta_local : &one_local ); \
 \
 			ctype* b_use; \
 			inc_t  rs_b_use, cs_b_use, ps_b_use; \
@@ -1115,7 +1115,7 @@ void PASTEMAC(ch,varname) \
 			   previous call to bli_thrinfo_grow(), since bszid values of
 			   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
 			   bszid that is a normal bszid_t value). */ \
-			bszid_t*   restrict bszids_pb; \
+			bszid_t*   bszids_pb; \
 			if ( packb ) { bszids_pb = &bszids_pc[1]; \
 			               thread_pb = bli_thrinfo_sub_node( thread_pc ); } \
 			else         { bszids_pb = &bszids_pc[0]; \
@@ -1146,7 +1146,7 @@ void PASTEMAC(ch,varname) \
 \
 			/* Alias b_use so that it's clear this is our current block of
 			   matrix B. */ \
-			ctype* restrict b_pc_use = b_use; \
+			ctype* b_pc_use = b_use; \
 \
 			/* We don't need to embed the panel stride of B within the auxinfo_t
 			   object because this variant iterates through B in the jr loop,
@@ -1155,8 +1155,8 @@ void PASTEMAC(ch,varname) \
 			/*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \
 \
 			/* Grow the thrinfo_t tree. */ \
-			bszid_t*   restrict bszids_ic = &bszids_pb[1]; \
-			                    thread_ic = bli_thrinfo_sub_node( thread_pb ); \
+			bszid_t*   bszids_ic = &bszids_pb[1]; \
+			               thread_ic = bli_thrinfo_sub_node( thread_pb ); \
 			bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
 \
 			/* Compute the IC loop thread range for the current thread. */ \
@@ -1175,8 +1175,8 @@ void PASTEMAC(ch,varname) \
 				/* Calculate the thread's current IC block dimension. */ \
 				const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
 \
-				ctype* restrict a_ic = a_pc + ii * icstep_a; \
-				ctype* restrict c_ic = c_jc + ii * icstep_c; \
+				ctype* a_ic = a_pc + ii * icstep_a; \
+				ctype* c_ic = c_jc + ii * icstep_c; \
 \
 				ctype* a_use; \
 				inc_t  rs_a_use, cs_a_use, ps_a_use; \
@@ -1189,7 +1189,7 @@ void PASTEMAC(ch,varname) \
 				   previous call to bli_thrinfo_grow(), since bszid values of
 				   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
 				   bszid that is a normal bszid_t value). */ \
-				bszid_t*   restrict bszids_pa; \
+				bszid_t*   bszids_pa; \
 				if ( packa ) { bszids_pa = &bszids_ic[1]; \
 							   thread_pa = bli_thrinfo_sub_node( thread_ic ); } \
 				else         { bszids_pa = &bszids_ic[0]; \
@@ -1220,7 +1220,7 @@ void PASTEMAC(ch,varname) \
 \
 				/* Alias a_use so that it's clear this is our current block of
 				   matrix A. */ \
-				ctype* restrict a_ic_use = a_use; \
+				ctype* a_ic_use = a_use; \
 \
 				/* Embed the panel stride of A within the auxinfo_t object. The
 				   millikernel will query and use this to iterate through
@@ -1228,8 +1228,8 @@ void PASTEMAC(ch,varname) \
 				bli_auxinfo_set_ps_a( ps_a_use, &aux ); \
 \
 				/* Grow the thrinfo_t tree. */ \
-				bszid_t*   restrict bszids_jr = &bszids_pa[1]; \
-				                    thread_jr = bli_thrinfo_sub_node( thread_pa ); \
+				bszid_t*   bszids_jr = &bszids_pa[1]; \
+				               thread_jr = bli_thrinfo_sub_node( thread_pa ); \
 				bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
 \
 				/* Compute number of primary and leftover components of the JR loop. */ \
@@ -1258,10 +1258,10 @@ void PASTEMAC(ch,varname) \
 					const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
 \
 					/*
-					ctype* restrict b_jr = b_pc_use + j * jrstep_b; \
+					ctype* b_jr = b_pc_use + j * jrstep_b; \
 					*/ \
-					ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
-					ctype* restrict c_jr = c_ic     + j * jrstep_c; \
+					ctype* b_jr = b_pc_use + j * ps_b_use; \
+					ctype* c_jr = c_ic     + j * jrstep_c; \
 \
 					/*
 					const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
@@ -1282,7 +1282,7 @@ void PASTEMAC(ch,varname) \
 						  a_ic_use, rs_a_use, cs_a_use, \
 						  b_jr,     rs_b_use, cs_b_use, \
 						  beta_use, \
-						  c_jr,     rs_c,     cs_c, \
+						            c_jr,     rs_c,     cs_c, \
 						  &aux, \
 						  cntx  \
 						); \
diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h
index ead9925e6..df9a747ab 100644
--- a/frame/3/bli_l3_sup_vars.h
+++ b/frame/3/bli_l3_sup_vars.h
@@ -42,16 +42,16 @@
 \
 void PASTEMAC0(opname) \
      ( \
-       trans_t trans, \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  beta, \
-       obj_t*  c, \
-       stor3_t eff_id, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
+             trans_t    trans, \
+       const obj_t*     alpha, \
+       const obj_t*     a, \
+       const obj_t*     b, \
+       const obj_t*     beta, \
+       const obj_t*     c, \
+             stor3_t    eff_id, \
+       const cntx_t*    cntx, \
+             rntm_t*    rntm, \
+             thrinfo_t* thread  \
      );
 
 GENPROT( gemmsup_ref_var1 )
@@ -70,20 +70,20 @@ GENPROT( gemmsup_ref_var2m )
 \
 void PASTEMAC(ch,varname) \
      ( \
-       conj_t           conja, \
-       conj_t           conjb, \
-       dim_t            m, \
-       dim_t            n, \
-       dim_t            k, \
-       void*   restrict alpha, \
-       void*   restrict a, inc_t rs_a, inc_t cs_a, \
-       void*   restrict b, inc_t rs_b, inc_t cs_b, \
-       void*   restrict beta, \
-       void*   restrict c, inc_t rs_c, inc_t cs_c, \
-       stor3_t          eff_id, \
-       cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       thrinfo_t* restrict thread  \
+       conj_t     conja, \
+       conj_t     conjb, \
+       dim_t      m, \
+       dim_t      n, \
+       dim_t      k, \
+       void*      alpha, \
+       void*      a, inc_t rs_a, inc_t cs_a, \
+       void*      b, inc_t rs_b, inc_t cs_b, \
+       void*      beta, \
+       void*      c, inc_t rs_c, inc_t cs_c, \
+       stor3_t    eff_id, \
+       cntx_t*    cntx, \
+       rntm_t*    rntm, \
+       thrinfo_t* thread  \
      );
 
 INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 )
@@ -94,22 +94,22 @@ INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 )
 \
 void PASTEMAC(ch,varname) \
      ( \
-       bool             packa, \
-       bool             packb, \
-       conj_t           conja, \
-       conj_t           conjb, \
-       dim_t            m, \
-       dim_t            n, \
-       dim_t            k, \
-       void*   restrict alpha, \
-       void*   restrict a, inc_t rs_a, inc_t cs_a, \
-       void*   restrict b, inc_t rs_b, inc_t cs_b, \
-       void*   restrict beta, \
-       void*   restrict c, inc_t rs_c, inc_t cs_c, \
-       stor3_t          eff_id, \
-       cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       thrinfo_t* restrict thread  \
+       bool       packa, \
+       bool       packb, \
+       conj_t     conja, \
+       conj_t     conjb, \
+       dim_t      m, \
+       dim_t      n, \
+       dim_t      k, \
+       void*      alpha, \
+       void*      a, inc_t rs_a, inc_t cs_a, \
+       void*      b, inc_t rs_b, inc_t cs_b, \
+       void*      beta, \
+       void*      c, inc_t rs_c, inc_t cs_c, \
+       stor3_t    eff_id, \
+       cntx_t*    cntx, \
+       rntm_t*    rntm, \
+       thrinfo_t* thread  \
      );
 
 INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n )
@@ -119,12 +119,12 @@ INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m )
 
 BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases
      (
-       num_t    dt,
-       trans_t* trans,
-       bool     packa,
-       bool     packb,
-       stor3_t* eff_id,
-       cntx_t*  cntx
+             num_t    dt,
+             trans_t* trans,
+             bool     packa,
+             bool     packb,
+             stor3_t* eff_id,
+       const cntx_t*  cntx
      )
 {
 	const bool row_pref = bli_cntx_ukr_prefers_rows_dt( dt, bli_stor3_ukr( *eff_id ), cntx );
diff --git a/frame/3/bli_l3_tapi.c b/frame/3/bli_l3_tapi.c
index afec5b677..8f256a11a 100644
--- a/frame/3/bli_l3_tapi.c
+++ b/frame/3/bli_l3_tapi.c
@@ -43,16 +43,16 @@
 \
 void PASTEMAC(ch,opname) \
      ( \
-       trans_t transa, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c  \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   n, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c  \
      ) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
@@ -80,16 +80,16 @@ INSERT_GENTFUNC_BASIC0( gemm )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       uplo_t  uploc, \
-       trans_t transa, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   k, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c  \
+             uplo_t  uploc, \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c  \
      ) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
@@ -118,17 +118,17 @@ INSERT_GENTFUNC_BASIC0( gemmt )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       side_t  side, \
-       uplo_t  uploa, \
-       conj_t  conja, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c  \
+             side_t  side, \
+             uplo_t  uploa, \
+             conj_t  conja, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c  \
      ) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
@@ -159,14 +159,14 @@ INSERT_GENTFUNC_BASIC( symm, BLIS_SYMMETRIC )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       uplo_t   uploc, \
-       trans_t  transa, \
-       dim_t    m, \
-       dim_t    k, \
-       ctype_r* alpha, \
-       ctype*   a, inc_t rs_a, inc_t cs_a, \
-       ctype_r* beta, \
-       ctype*   c, inc_t rs_c, inc_t cs_c  \
+             uplo_t   uploc, \
+             trans_t  transa, \
+             dim_t    m, \
+             dim_t    k, \
+       const ctype_r* alpha, \
+       const ctype*   a, inc_t rs_a, inc_t cs_a, \
+       const ctype_r* beta, \
+             ctype*   c, inc_t rs_c, inc_t cs_c  \
      ) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
@@ -193,16 +193,16 @@ INSERT_GENTFUNCR_BASIC0( herk )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       uplo_t   uploc, \
-       trans_t  transa, \
-       trans_t  transb, \
-       dim_t    m, \
-       dim_t    k, \
-       ctype*   alpha, \
-       ctype*   a, inc_t rs_a, inc_t cs_a, \
-       ctype*   b, inc_t rs_b, inc_t cs_b, \
-       ctype_r* beta, \
-       ctype*   c, inc_t rs_c, inc_t cs_c  \
+             uplo_t   uploc, \
+             trans_t  transa, \
+             trans_t  transb, \
+             dim_t    m, \
+             dim_t    k, \
+       const ctype*   alpha, \
+       const ctype*   a, inc_t rs_a, inc_t cs_a, \
+       const ctype*   b, inc_t rs_b, inc_t cs_b, \
+       const ctype_r* beta, \
+             ctype*   c, inc_t rs_c, inc_t cs_c  \
      ) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
@@ -231,14 +231,14 @@ INSERT_GENTFUNCR_BASIC0( her2k )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       uplo_t  uploc, \
-       trans_t transa, \
-       dim_t   m, \
-       dim_t   k, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c  \
+             uplo_t  uploc, \
+             trans_t transa, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c  \
      ) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
@@ -265,16 +265,16 @@ INSERT_GENTFUNC_BASIC0( syrk )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       uplo_t  uploc, \
-       trans_t transa, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   k, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c  \
+             uplo_t  uploc, \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c  \
      ) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
@@ -303,18 +303,18 @@ INSERT_GENTFUNC_BASIC0( syr2k )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       side_t  side, \
-       uplo_t  uploa, \
-       trans_t transa, \
-       diag_t  diaga, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c  \
+             side_t  side, \
+             uplo_t  uploa, \
+             trans_t transa, \
+             diag_t  diaga, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c  \
      ) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
@@ -345,15 +345,15 @@ INSERT_GENTFUNC_BASIC0( trmm3 )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       side_t  side, \
-       uplo_t  uploa, \
-       trans_t transa, \
-       diag_t  diaga, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b  \
+             side_t  side, \
+             uplo_t  uploa, \
+             trans_t transa, \
+             diag_t  diaga, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+             ctype*  b, inc_t rs_b, inc_t cs_b  \
      ) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
diff --git a/frame/3/bli_l3_tapi.h b/frame/3/bli_l3_tapi.h
index 4b3504001..9b7a9b077 100644
--- a/frame/3/bli_l3_tapi.h
+++ b/frame/3/bli_l3_tapi.h
@@ -43,16 +43,16 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       trans_t transa, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c  \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   n, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c  \
      );
 
 INSERT_GENTPROT_BASIC0( gemm )
@@ -62,17 +62,17 @@ INSERT_GENTPROT_BASIC0( gemm )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       side_t  side, \
-       uplo_t  uploa, \
-       conj_t  conja, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c  \
+             side_t  side, \
+             uplo_t  uploa, \
+             conj_t  conja, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c  \
      );
 
 INSERT_GENTPROT_BASIC0( hemm )
@@ -84,14 +84,14 @@ INSERT_GENTPROT_BASIC0( symm )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       uplo_t   uploc, \
-       trans_t  transa, \
-       dim_t    m, \
-       dim_t    k, \
-       ctype_r* alpha, \
-       ctype*   a, inc_t rs_a, inc_t cs_a, \
-       ctype_r* beta, \
-       ctype*   c, inc_t rs_c, inc_t cs_c  \
+             uplo_t   uploc, \
+             trans_t  transa, \
+             dim_t    m, \
+             dim_t    k, \
+       const ctype_r* alpha, \
+       const ctype*   a, inc_t rs_a, inc_t cs_a, \
+       const ctype_r* beta, \
+             ctype*   c, inc_t rs_c, inc_t cs_c  \
      );
 
 INSERT_GENTPROTR_BASIC0( herk )
@@ -102,16 +102,16 @@ INSERT_GENTPROTR_BASIC0( herk )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       uplo_t   uploc, \
-       trans_t  transa, \
-       trans_t  transb, \
-       dim_t    m, \
-       dim_t    k, \
-       ctype*   alpha, \
-       ctype*   a, inc_t rs_a, inc_t cs_a, \
-       ctype*   b, inc_t rs_b, inc_t cs_b, \
-       ctype_r* beta, \
-       ctype*   c, inc_t rs_c, inc_t cs_c  \
+             uplo_t   uploc, \
+             trans_t  transa, \
+             trans_t  transb, \
+             dim_t    m, \
+             dim_t    k, \
+       const ctype*   alpha, \
+       const ctype*   a, inc_t rs_a, inc_t cs_a, \
+       const ctype*   b, inc_t rs_b, inc_t cs_b, \
+       const ctype_r* beta, \
+             ctype*   c, inc_t rs_c, inc_t cs_c  \
      );
 
 INSERT_GENTPROTR_BASIC0( her2k )
@@ -122,14 +122,14 @@ INSERT_GENTPROTR_BASIC0( her2k )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       uplo_t  uploc, \
-       trans_t transa, \
-       dim_t   m, \
-       dim_t   k, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c  \
+             uplo_t  uploc, \
+             trans_t transa, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c  \
      );
 
 INSERT_GENTPROT_BASIC0( syrk )
@@ -140,16 +140,16 @@ INSERT_GENTPROT_BASIC0( syrk )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       uplo_t  uploc, \
-       trans_t transa, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   k, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c  \
+             uplo_t  uploc, \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c  \
      );
 
 INSERT_GENTPROT_BASIC0( gemmt )
@@ -161,18 +161,18 @@ INSERT_GENTPROT_BASIC0( syr2k )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       side_t  side, \
-       uplo_t  uploa, \
-       trans_t transa, \
-       diag_t  diaga, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c  \
+             side_t  side, \
+             uplo_t  uploa, \
+             trans_t transa, \
+             diag_t  diaga, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c  \
      );
 
 INSERT_GENTPROT_BASIC0( trmm3 )
@@ -183,15 +183,15 @@ INSERT_GENTPROT_BASIC0( trmm3 )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       side_t  side, \
-       uplo_t  uploa, \
-       trans_t transa, \
-       diag_t  diaga, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b  \
+             side_t  side, \
+             uplo_t  uploa, \
+             trans_t transa, \
+             diag_t  diaga, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+             ctype*  b, inc_t rs_b, inc_t cs_b  \
      );
 
 INSERT_GENTPROT_BASIC0( trmm )
diff --git a/frame/3/bli_l3_tapi_ex.c b/frame/3/bli_l3_tapi_ex.c
index f6a52fb5e..c934ba949 100644
--- a/frame/3/bli_l3_tapi_ex.c
+++ b/frame/3/bli_l3_tapi_ex.c
@@ -44,18 +44,18 @@
 \
 void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
-       trans_t transa, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   n, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -74,12 +74,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 	bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
 	bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \
 \
-	bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
-	bli_obj_init_finish_1x1( dt, beta,  &betao  ); \
+	bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \
+	bli_obj_init_finish_1x1( dt, ( void* )beta,  &betao  ); \
 \
-	bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
-	bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
-	bli_obj_init_finish( dt, m,   n,   c, rs_c, cs_c, &co ); \
+	bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \
+	bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \
+	bli_obj_init_finish( dt, m,   n,            c, rs_c, cs_c, &co ); \
 \
 	bli_obj_set_conjtrans( transa, &ao ); \
 	bli_obj_set_conjtrans( transb, &bo ); \
@@ -103,19 +103,19 @@ INSERT_GENTFUNC_BASIC0( gemm )
 \
 void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
-       side_t  side, \
-       uplo_t  uploa, \
-       conj_t  conja, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+             side_t  side, \
+             uplo_t  uploa, \
+             conj_t  conja, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -134,12 +134,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 	bli_set_dim_with_side(   side,   m, n, &mn_a ); \
 	bli_set_dims_with_trans( transb, m, n, &m_b, &n_b ); \
 \
-	bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
-	bli_obj_init_finish_1x1( dt, beta,  &betao  ); \
+	bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \
+	bli_obj_init_finish_1x1( dt, ( void* )beta,  &betao  ); \
 \
-	bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
-	bli_obj_init_finish( dt, m_b,  n_b,  b, rs_b, cs_b, &bo ); \
-	bli_obj_init_finish( dt, m,    n,    c, rs_c, cs_c, &co ); \
+	bli_obj_init_finish( dt, mn_a, mn_a, ( void* )a, rs_a, cs_a, &ao ); \
+	bli_obj_init_finish( dt, m_b,  n_b,  ( void* )b, rs_b, cs_b, &bo ); \
+	bli_obj_init_finish( dt, m,    n,             c, rs_c, cs_c, &co ); \
 \
 	bli_obj_set_uplo( uploa, &ao ); \
 	bli_obj_set_conj( conja, &ao ); \
@@ -169,16 +169,16 @@ INSERT_GENTFUNC_BASIC( symm, BLIS_SYMMETRIC )
 \
 void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
-       uplo_t   uploc, \
-       trans_t  transa, \
-       dim_t    m, \
-       dim_t    k, \
-       ctype_r* alpha, \
-       ctype*   a, inc_t rs_a, inc_t cs_a, \
-       ctype_r* beta, \
-       ctype*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t*  cntx, \
-       rntm_t*  rntm  \
+             uplo_t   uploc, \
+             trans_t  transa, \
+             dim_t    m, \
+             dim_t    k, \
+       const ctype_r* alpha, \
+       const ctype*   a, inc_t rs_a, inc_t cs_a, \
+       const ctype_r* beta, \
+             ctype*   c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t*  cntx, \
+             rntm_t*  rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -195,11 +195,11 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 \
 	bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
 \
-	bli_obj_init_finish_1x1( dt_r, alpha, &alphao ); \
-	bli_obj_init_finish_1x1( dt_r, beta,  &betao  ); \
+	bli_obj_init_finish_1x1( dt_r, ( void* )alpha, &alphao ); \
+	bli_obj_init_finish_1x1( dt_r, ( void* )beta,  &betao  ); \
 \
-	bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
-	bli_obj_init_finish( dt, m,   m,   c, rs_c, cs_c, &co ); \
+	bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \
+	bli_obj_init_finish( dt, m,   m,            c, rs_c, cs_c, &co ); \
 \
 	bli_obj_set_uplo( uploc, &co ); \
 	bli_obj_set_conjtrans( transa, &ao ); \
@@ -225,18 +225,18 @@ INSERT_GENTFUNCR_BASIC0( herk )
 \
 void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
-       uplo_t   uploc, \
-       trans_t  transa, \
-       trans_t  transb, \
-       dim_t    m, \
-       dim_t    k, \
-       ctype*   alpha, \
-       ctype*   a, inc_t rs_a, inc_t cs_a, \
-       ctype*   b, inc_t rs_b, inc_t cs_b, \
-       ctype_r* beta, \
-       ctype*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t*  cntx, \
-       rntm_t*  rntm  \
+             uplo_t   uploc, \
+             trans_t  transa, \
+             trans_t  transb, \
+             dim_t    m, \
+             dim_t    k, \
+       const ctype*   alpha, \
+       const ctype*   a, inc_t rs_a, inc_t cs_a, \
+       const ctype*   b, inc_t rs_b, inc_t cs_b, \
+       const ctype_r* beta, \
+             ctype*   c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t*  cntx, \
+             rntm_t*  rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -256,12 +256,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 	bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
 	bli_set_dims_with_trans( transb, m, k, &m_b, &n_b ); \
 \
-	bli_obj_init_finish_1x1( dt,   alpha, &alphao ); \
-	bli_obj_init_finish_1x1( dt_r, beta,  &betao  ); \
+	bli_obj_init_finish_1x1( dt,   ( void* )alpha, &alphao ); \
+	bli_obj_init_finish_1x1( dt_r, ( void* )beta,  &betao  ); \
 \
-	bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
-	bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
-	bli_obj_init_finish( dt, m,   m,   c, rs_c, cs_c, &co ); \
+	bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \
+	bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \
+	bli_obj_init_finish( dt, m,   m,            c, rs_c, cs_c, &co ); \
 \
 	bli_obj_set_uplo( uploc, &co ); \
 	bli_obj_set_conjtrans( transa, &ao ); \
@@ -289,16 +289,16 @@ INSERT_GENTFUNCR_BASIC0( her2k )
 \
 void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
-       uplo_t  uploc, \
-       trans_t transa, \
-       dim_t   m, \
-       dim_t   k, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+             uplo_t  uploc, \
+             trans_t transa, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -314,11 +314,11 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 \
 	bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
 \
-	bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
-	bli_obj_init_finish_1x1( dt, beta,  &betao  ); \
+	bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \
+	bli_obj_init_finish_1x1( dt, ( void* )beta,  &betao  ); \
 \
-	bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
-	bli_obj_init_finish( dt, m,   m,   c, rs_c, cs_c, &co ); \
+	bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \
+	bli_obj_init_finish( dt, m,   m,            c, rs_c, cs_c, &co ); \
 \
 	bli_obj_set_uplo( uploc, &co ); \
 	bli_obj_set_conjtrans( transa, &ao ); \
@@ -344,18 +344,18 @@ INSERT_GENTFUNC_BASIC0( syrk )
 \
 void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
-       uplo_t  uploc, \
-       trans_t transa, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   k, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+             uplo_t  uploc, \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -374,12 +374,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 	bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
 	bli_set_dims_with_trans( transb, m, k, &m_b, &n_b ); \
 \
-	bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
-	bli_obj_init_finish_1x1( dt, beta,  &betao  ); \
+	bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \
+	bli_obj_init_finish_1x1( dt, ( void* )beta,  &betao  ); \
 \
-	bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
-	bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
-	bli_obj_init_finish( dt, m,   m,   c, rs_c, cs_c, &co ); \
+	bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \
+	bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \
+	bli_obj_init_finish( dt, m,   m,            c, rs_c, cs_c, &co ); \
 \
 	bli_obj_set_uplo( uploc, &co ); \
 	bli_obj_set_conjtrans( transa, &ao ); \
@@ -407,18 +407,18 @@ INSERT_GENTFUNC_BASIC0( syr2k )
 \
 void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
-       uplo_t  uploc, \
-       trans_t transa, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   k, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+             uplo_t  uploc, \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -437,12 +437,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 	bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
 	bli_set_dims_with_trans( transb, k, m, &m_b, &n_b ); \
 \
-	bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
-	bli_obj_init_finish_1x1( dt, beta,  &betao  ); \
+	bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \
+	bli_obj_init_finish_1x1( dt, ( void* )beta,  &betao  ); \
 \
-	bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
-	bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
-	bli_obj_init_finish( dt, m,   m,   c, rs_c, cs_c, &co ); \
+	bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \
+	bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \
+	bli_obj_init_finish( dt, m,   m,            c, rs_c, cs_c, &co ); \
 \
 	bli_obj_set_uplo( uploc, &co ); \
 	bli_obj_set_conjtrans( transa, &ao ); \
@@ -468,20 +468,20 @@ INSERT_GENTFUNC_BASIC0( gemmt )
 \
 void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
-       side_t  side, \
-       uplo_t  uploa, \
-       trans_t transa, \
-       diag_t  diaga, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+             side_t  side, \
+             uplo_t  uploa, \
+             trans_t transa, \
+             diag_t  diaga, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -500,12 +500,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 	bli_set_dim_with_side(   side,   m, n, &mn_a ); \
 	bli_set_dims_with_trans( transb, m, n, &m_b, &n_b ); \
 \
-	bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
-	bli_obj_init_finish_1x1( dt, beta,  &betao  ); \
+	bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \
+	bli_obj_init_finish_1x1( dt, ( void* )beta,  &betao  ); \
 \
-	bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
-	bli_obj_init_finish( dt, m_b,  n_b,  b, rs_b, cs_b, &bo ); \
-	bli_obj_init_finish( dt, m,    n,    c, rs_c, cs_c, &co ); \
+	bli_obj_init_finish( dt, mn_a, mn_a, ( void* )a, rs_a, cs_a, &ao ); \
+	bli_obj_init_finish( dt, m_b,  n_b,  ( void* )b, rs_b, cs_b, &bo ); \
+	bli_obj_init_finish( dt, m,    n,             c, rs_c, cs_c, &co ); \
 \
 	bli_obj_set_uplo( uploa, &ao ); \
 	bli_obj_set_diag( diaga, &ao ); \
@@ -535,17 +535,17 @@ INSERT_GENTFUNC_BASIC0( trmm3 )
 \
 void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
-       side_t  side, \
-       uplo_t  uploa, \
-       trans_t transa, \
-       diag_t  diaga, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+             side_t  side, \
+             uplo_t  uploa, \
+             trans_t transa, \
+             diag_t  diaga, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+             ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -560,10 +560,10 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 \
 	bli_set_dim_with_side( side, m, n, &mn_a ); \
 \
-	bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
+	bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \
 \
-	bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
-	bli_obj_init_finish( dt, m,    n,    b, rs_b, cs_b, &bo ); \
+	bli_obj_init_finish( dt, mn_a, mn_a, ( void* )a, rs_a, cs_a, &ao ); \
+	bli_obj_init_finish( dt, m,    n,             b, rs_b, cs_b, &bo ); \
 \
 	bli_obj_set_uplo( uploa, &ao ); \
 	bli_obj_set_diag( diaga, &ao ); \
diff --git a/frame/3/bli_l3_tapi_ex.h b/frame/3/bli_l3_tapi_ex.h
index 1ab0a8ff1..eb142af05 100644
--- a/frame/3/bli_l3_tapi_ex.h
+++ b/frame/3/bli_l3_tapi_ex.h
@@ -43,18 +43,18 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
      ( \
-       trans_t transa, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   n, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      );
 
 INSERT_GENTPROT_BASIC0( gemm )
@@ -64,19 +64,19 @@ INSERT_GENTPROT_BASIC0( gemm )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
      ( \
-       side_t  side, \
-       uplo_t  uploa, \
-       conj_t  conja, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+             side_t  side, \
+             uplo_t  uploa, \
+             conj_t  conja, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      );
 
 INSERT_GENTPROT_BASIC0( hemm )
@@ -88,16 +88,16 @@ INSERT_GENTPROT_BASIC0( symm )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
      ( \
-       uplo_t   uploc, \
-       trans_t  transa, \
-       dim_t    m, \
-       dim_t    k, \
-       ctype_r* alpha, \
-       ctype*   a, inc_t rs_a, inc_t cs_a, \
-       ctype_r* beta, \
-       ctype*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t*  cntx, \
-       rntm_t*  rntm  \
+             uplo_t   uploc, \
+             trans_t  transa, \
+             dim_t    m, \
+             dim_t    k, \
+       const ctype_r* alpha, \
+       const ctype*   a, inc_t rs_a, inc_t cs_a, \
+       const ctype_r* beta, \
+             ctype*   c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t*  cntx, \
+             rntm_t*  rntm  \
      );
 
 INSERT_GENTPROTR_BASIC0( herk )
@@ -108,18 +108,18 @@ INSERT_GENTPROTR_BASIC0( herk )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
      ( \
-       uplo_t   uploc, \
-       trans_t  transa, \
-       trans_t  transb, \
-       dim_t    m, \
-       dim_t    k, \
-       ctype*   alpha, \
-       ctype*   a, inc_t rs_a, inc_t cs_a, \
-       ctype*   b, inc_t rs_b, inc_t cs_b, \
-       ctype_r* beta, \
-       ctype*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t*  cntx, \
-       rntm_t*  rntm  \
+             uplo_t   uploc, \
+             trans_t  transa, \
+             trans_t  transb, \
+             dim_t    m, \
+             dim_t    k, \
+       const ctype*   alpha, \
+       const ctype*   a, inc_t rs_a, inc_t cs_a, \
+       const ctype*   b, inc_t rs_b, inc_t cs_b, \
+       const ctype_r* beta, \
+             ctype*   c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t*  cntx, \
+             rntm_t*  rntm  \
      );
 
 INSERT_GENTPROTR_BASIC0( her2k )
@@ -130,16 +130,16 @@ INSERT_GENTPROTR_BASIC0( her2k )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
      ( \
-       uplo_t  uploc, \
-       trans_t transa, \
-       dim_t   m, \
-       dim_t   k, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+             uplo_t  uploc, \
+             trans_t transa, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      );
 
 INSERT_GENTPROT_BASIC0( syrk )
@@ -150,18 +150,18 @@ INSERT_GENTPROT_BASIC0( syrk )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
      ( \
-       uplo_t  uploc, \
-       trans_t transa, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   k, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+             uplo_t  uploc, \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      );
 
 INSERT_GENTPROT_BASIC0( gemmt )
@@ -173,20 +173,20 @@ INSERT_GENTPROT_BASIC0( syr2k )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
      ( \
-       side_t  side, \
-       uplo_t  uploa, \
-       trans_t transa, \
-       diag_t  diaga, \
-       trans_t transb, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       ctype*  beta, \
-       ctype*  c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+             side_t  side, \
+             uplo_t  uploa, \
+             trans_t transa, \
+             diag_t  diaga, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      );
 
 INSERT_GENTPROT_BASIC0( trmm3 )
@@ -197,17 +197,17 @@ INSERT_GENTPROT_BASIC0( trmm3 )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
      ( \
-       side_t  side, \
-       uplo_t  uploa, \
-       trans_t transa, \
-       diag_t  diaga, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  alpha, \
-       ctype*  a, inc_t rs_a, inc_t cs_a, \
-       ctype*  b, inc_t rs_b, inc_t cs_b, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+             side_t  side, \
+             uplo_t  uploa, \
+             trans_t transa, \
+             diag_t  diaga, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+             ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const cntx_t* cntx, \
+             rntm_t* rntm  \
      );
 
 INSERT_GENTPROT_BASIC0( trmm )
diff --git a/frame/3/bli_l3_ukr_prot.h b/frame/3/bli_l3_ukr_prot.h
index 677afc020..44a59bd4c 100644
--- a/frame/3/bli_l3_ukr_prot.h
+++ b/frame/3/bli_l3_ukr_prot.h
@@ -50,8 +50,8 @@ void PASTEMAC(ch,opname) \
        ctype_in*  restrict b, \
        ctype_out* restrict beta, \
        ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      );
 
 
@@ -68,8 +68,8 @@ void PASTEMAC(ch,opname) \
        ctype*     restrict bx1, \
        ctype*     restrict b11, \
        ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      );
 
 
@@ -80,7 +80,7 @@ void PASTEMAC(ch,opname) \
        ctype*     restrict a, \
        ctype*     restrict b, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      );
 
diff --git a/frame/3/bli_l3_ukr_tapi.c b/frame/3/bli_l3_ukr_tapi.c
index 56eaf3f4c..c2e8ed5d5 100644
--- a/frame/3/bli_l3_ukr_tapi.c
+++ b/frame/3/bli_l3_ukr_tapi.c
@@ -47,8 +47,8 @@ void PASTEMAC(ch,opname) \
        ctype*     restrict b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
 	bli_init_once(); \
@@ -92,8 +92,8 @@ void PASTEMAC(ch,opname) \
        ctype*     restrict bx1, \
        ctype*     restrict b11, \
        ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
 	bli_init_once(); \
@@ -133,8 +133,8 @@ void PASTEMAC(ch,opname) \
        ctype*     restrict a, \
        ctype*     restrict b, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
 	bli_init_once(); \
diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c
index de077e5ad..485779a90 100644
--- a/frame/3/gemm/bli_gemm_blk_var1.c
+++ b/frame/3/gemm/bli_gemm_blk_var1.c
@@ -37,44 +37,47 @@
 
 void bli_gemm_blk_var1
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	obj_t a1, c1;
-	dim_t my_start, my_end;
-	dim_t b_alg;
+	obj_t ap, cp;
+	bli_obj_alias_to( a, &ap );
+	bli_obj_alias_to( c, &cp );
 
 	// Determine the direction in which to partition (forwards or backwards).
-	dir_t direct = bli_l3_direct( a, b, c, cntl );
+	const dir_t direct = bli_l3_direct( &ap, b, &cp, cntl );
 
 	// Prune any zero region that exists along the partitioning dimension.
-	bli_l3_prune_unref_mparts_m( a, b, c, cntl );
+	bli_l3_prune_unref_mparts_m( &ap, b, &cp, cntl );
 
 	// Determine the current thread's subpartition range.
+	dim_t my_start, my_end;
 	bli_thread_range_mdim
 	(
-	  direct, thread, a, b, c, cntl, cntx,
+	  direct, thread, &ap, b, &cp, cntl, cntx,
 	  &my_start, &my_end
 	);
 
 	// Partition along the m dimension.
+	dim_t b_alg;
 	for ( dim_t i = my_start; i < my_end; i += b_alg )
 	{
 		// Determine the current algorithmic blocksize.
-		b_alg = bli_determine_blocksize( direct, i, my_end, a,
+		b_alg = bli_determine_blocksize( direct, i, my_end, &ap,
 		                                 bli_cntl_bszid( cntl ), cntx );
 
 		// Acquire partitions for A1 and C1.
+		obj_t a1, c1;
 		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
-		                        i, b_alg, a, &a1 );
+		                        i, b_alg, &ap, &a1 );
 		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
-		                        i, b_alg, c, &c1 );
+		                        i, b_alg, &cp, &c1 );
 
 		// Perform gemm subproblem.
 		bli_l3_int
diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c
index 53943e47c..254a31064 100644
--- a/frame/3/gemm/bli_gemm_blk_var2.c
+++ b/frame/3/gemm/bli_gemm_blk_var2.c
@@ -37,44 +37,47 @@
 
 void bli_gemm_blk_var2
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	obj_t b1, c1;
-	dim_t my_start, my_end;
-	dim_t b_alg;
+	obj_t bp, cp;
+	bli_obj_alias_to( b, &bp );
+	bli_obj_alias_to( c, &cp );
 
 	// Determine the direction in which to partition (forwards or backwards).
-	dir_t direct = bli_l3_direct( a, b, c, cntl );
+	dir_t direct = bli_l3_direct( a, &bp, &cp, cntl );
 
 	// Prune any zero region that exists along the partitioning dimension.
-	bli_l3_prune_unref_mparts_n( a, b, c, cntl );
+	bli_l3_prune_unref_mparts_n( a, &bp, &cp, cntl );
 
 	// Determine the current thread's subpartition range.
+	dim_t my_start, my_end;
 	bli_thread_range_ndim
 	(
-	  direct, thread, a, b, c, cntl, cntx,
+	  direct, thread, a, &bp, &cp, cntl, cntx,
 	  &my_start, &my_end
 	);
 
 	// Partition along the n dimension.
+	dim_t b_alg;
 	for ( dim_t i = my_start; i < my_end; i += b_alg )
 	{
 		// Determine the current algorithmic blocksize.
-		b_alg = bli_determine_blocksize( direct, i, my_end, b,
+		b_alg = bli_determine_blocksize( direct, i, my_end, &bp,
 		                                 bli_cntl_bszid( cntl ), cntx );
 
 		// Acquire partitions for B1 and C1.
+		obj_t b1, c1;
 		bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
-		                        i, b_alg, b, &b1 );
+		                        i, b_alg, &bp, &b1 );
 		bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
-		                        i, b_alg, c, &c1 );
+		                        i, b_alg, &cp, &c1 );
 
 		// Perform gemm subproblem.
 		bli_l3_int
diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c
index 28029777d..1bbec1d95 100644
--- a/frame/3/gemm/bli_gemm_blk_var3.c
+++ b/frame/3/gemm/bli_gemm_blk_var3.c
@@ -36,39 +36,43 @@
 
 void bli_gemm_blk_var3
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	obj_t a1, b1;
-	dim_t b_alg;
+	obj_t ap, bp, cs;
+	bli_obj_alias_to( a, &ap );
+	bli_obj_alias_to( b, &bp );
+	bli_obj_alias_to( c, &cs );
 
 	// Determine the direction in which to partition (forwards or backwards).
-	dir_t direct = bli_l3_direct( a, b, c, cntl );
+	dir_t direct = bli_l3_direct( &ap, &bp, &cs, cntl );
 
 	// Prune any zero region that exists along the partitioning dimension.
-	bli_l3_prune_unref_mparts_k( a, b, c, cntl );
+	bli_l3_prune_unref_mparts_k( &ap, &bp, &cs, cntl );
 
 	// Query dimension in partitioning direction.
-	dim_t k_trans = bli_obj_width_after_trans( a );
+	dim_t k_trans = bli_obj_width_after_trans( &ap );
 
 	// Partition along the k dimension.
+	dim_t b_alg;
 	for ( dim_t i = 0; i < k_trans; i += b_alg )
 	{
 		// Determine the current algorithmic blocksize.
-		b_alg = bli_l3_determine_kc( direct, i, k_trans, a, b,
+		b_alg = bli_l3_determine_kc( direct, i, k_trans, &ap, &bp,
 		                             bli_cntl_bszid( cntl ), cntx, cntl );
 
 		// Acquire partitions for A1 and B1.
+		obj_t a1, b1;
 		bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
-		                        i, b_alg, a, &a1 );
+		                        i, b_alg, &ap, &a1 );
 		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
-		                        i, b_alg, b, &b1 );
+		                        i, b_alg, &bp, &b1 );
 
 		// Perform gemm subproblem.
 		bli_l3_int
@@ -77,7 +81,7 @@ void bli_gemm_blk_var3
 		  &a1,
 		  &b1,
 		  &BLIS_ONE,
-		  c,
+		  &cs,
 		  cntx,
 		  rntm,
 		  bli_cntl_sub_node( cntl ),
@@ -107,7 +111,7 @@ void bli_gemm_blk_var3
 		// Thus, for neither trmm nor trmm3 should we reset the scalar on C
 		// after the first iteration.
 		if ( bli_cntl_family( cntl ) != BLIS_TRMM )
-		if ( i == 0 ) bli_obj_scalar_reset( c );
+		if ( i == 0 ) bli_obj_scalar_reset( &cs );
 	}
 }
 
diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c
index cd8827bd9..1ae904abf 100644
--- a/frame/3/gemm/bli_gemm_front.c
+++ b/frame/3/gemm/bli_gemm_front.c
@@ -37,14 +37,14 @@
 
 void bli_gemm_front
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      )
 {
 	bli_init_once();
@@ -163,8 +163,8 @@ void bli_gemm_front
 	  rntm
 	);
 
-	obj_t* cp    = &c_local;
-	obj_t* betap = beta;
+	      obj_t* cp    = &c_local;
+	const obj_t* betap = beta;
 
 #ifdef BLIS_ENABLE_GEMM_MD
 #ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
@@ -269,7 +269,7 @@ void bli_gemm_front
 	// If we created a temporary matrix conformal to C for whatever reason,
 	// we copy/accumulate the result back to C and then release the object.
 	if ( use_ct )
-    {
+	{
 		obj_t beta_local;
 
 		bli_obj_scalar_detach( &c_local, &beta_local );
diff --git a/frame/3/gemm/bli_gemm_front.h b/frame/3/gemm/bli_gemm_front.h
index 2728ce8f7..744f88d1b 100644
--- a/frame/3/gemm/bli_gemm_front.h
+++ b/frame/3/gemm/bli_gemm_front.h
@@ -34,26 +34,26 @@
 
 void bli_gemm_front
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      );
 
 #ifdef BLIS_ENABLE_SMALL_MATRIX
 err_t bli_gemm_small
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             cntl_t* cntl
      );
 #endif
 
diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c
index 874a12439..814b47c0c 100644
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -77,38 +77,38 @@ static xpbys_mxn_vft GENARRAY2_ALL(xbpys_mxn, xbpys_mxn_fn);
 
 void bli_gemm_ker_var2
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-	num_t     dt_c      = bli_obj_dt( c );
+	      num_t  dt_exec   = bli_obj_exec_dt( c );
+	      num_t  dt_c      = bli_obj_dt( c );
 
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
+	      pack_t schema_a  = bli_obj_pack_schema( a );
+	      pack_t schema_b  = bli_obj_pack_schema( b );
 
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
 
-	char*     a_cast    = bli_obj_buffer_at_off( a );
-	inc_t     is_a      = bli_obj_imag_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
+	const char*  a_cast    = bli_obj_buffer_at_off( a );
+	      inc_t  is_a      = bli_obj_imag_stride( a );
+	      dim_t  pd_a      = bli_obj_panel_dim( a );
+	      inc_t  ps_a      = bli_obj_panel_stride( a );
 
-	char*     b_cast    = bli_obj_buffer_at_off( b );
-	inc_t     is_b      = bli_obj_imag_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
+	const char*  b_cast    = bli_obj_buffer_at_off( b );
+	      inc_t  is_b      = bli_obj_imag_stride( b );
+	      dim_t  pd_b      = bli_obj_panel_dim( b );
+	      inc_t  ps_b      = bli_obj_panel_stride( b );
 
-	char*     c_cast    = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
+	      char*  c_cast    = bli_obj_buffer_at_off( c );
+	      inc_t  rs_c      = bli_obj_row_stride( c );
+	      inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// If any dimension is zero, return immediately.
 	if ( bli_zero_dim3( m, n, k ) ) return;
@@ -129,8 +129,8 @@ void bli_gemm_ker_var2
 	// that casts the scalars of A and B to dt_exec via scalar_a and scalar_b,
 	// and we know that the internal scalar in C is already of the type dt_c
 	// due to the casting in the implementation of bli_obj_scalar_attach().
-	char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b );
-	char* beta_cast  = bli_obj_internal_scalar_buffer( c );
+	const char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b );
+	const char* beta_cast  = bli_obj_internal_scalar_buffer( c );
 
 	// If 1m is being employed on a column- or row-stored matrix with a
 	// real-valued beta, we can use the real domain macro-kernel, which
@@ -174,14 +174,12 @@ void bli_gemm_ker_var2
 	}
 #endif
 
-	siz_t        dt_size   = bli_dt_size( dt_exec );
-	siz_t        dt_c_size = bli_dt_size( dt_c );
+	const siz_t dt_size   = bli_dt_size( dt_exec );
+	const siz_t dt_c_size = bli_dt_size( dt_c );
 
 	// Alias some constants to simpler names.
-	const dim_t  MR        = pd_a;
-	const dim_t  NR        = pd_b;
-	//const dim_t PACKMR     = cs_a;
-	//const dim_t PACKNR     = rs_b;
+	const dim_t MR = pd_a;
+	const dim_t NR = pd_b;
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
@@ -191,7 +189,7 @@ void bli_gemm_ker_var2
 	// field of the params struct. If that function pointer is non-NULL, use it
 	// as our microkernel instead of the default microkernel queried from the
 	// cntx above.
-	gemm_ker_params_t* params = bli_obj_ker_params( c );
+	const gemm_ker_params_t* params = bli_obj_ker_params( c );
 	gemm_ukr_vft user_ukr = params ? params->ukr : NULL;
 	if ( user_ukr ) gemm_ukr = user_ukr;
 
@@ -204,7 +202,7 @@ void bli_gemm_ker_var2
 	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx );
 	const inc_t rs_ct       = ( col_pref ? 1 : NR );
 	const inc_t cs_ct       = ( col_pref ? MR : 1 );
-	char*       zero        = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO );
+	const char* zero        = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO );
 
 	//
 	// Assumptions/assertions:
@@ -277,24 +275,24 @@ void bli_gemm_ker_var2
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
 	{
-		char* b1 = b_cast + j * cstep_b;
-		char* c1 = c_cast + j * cstep_c;
+		const char* b1 = b_cast + j * cstep_b;
+		      char* c1 = c_cast + j * cstep_c;
 
-		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
 
 		// Initialize our next panel of B to be the current panel of B.
-		char* b2 = b1;
+		const char* b2 = b1;
 
 		// Loop over the m dimension (MR rows at a time).
 		for ( dim_t i = ir_start; i < ir_end; i += ir_inc )
 		{
-			char* a1  = a_cast + i * rstep_a;
-			char* c11 = c1     + i * rstep_c;
+			const char* a1  = a_cast + i * rstep_a;
+			      char* c11 = c1     + i * rstep_c;
 
-			dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+			const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
 
 			// Compute the addresses of the next panels of A and B.
-			char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc );
+			const char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc );
 			if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) )
 			{
 				a2 = a_cast;
@@ -320,13 +318,13 @@ void bli_gemm_ker_var2
 				  m_cur,
 				  n_cur,
 				  k,
-				  alpha_cast,
-				  a1,
-				  b1,
-				  beta_cast,
-				  c11, rs_c, cs_c,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )beta_cast,
+				           c11, rs_c, cs_c,
 				  &aux,
-				  cntx
+				  ( cntx_t* )cntx
 				);
 			}
 			else
@@ -337,13 +335,13 @@ void bli_gemm_ker_var2
 				  MR,
 				  NR,
 				  k,
-				  alpha_cast,
-				  a1,
-				  b1,
-				  zero,
-				  &ct, rs_ct, cs_ct,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )zero,
+				           &ct, rs_ct, cs_ct,
 				  &aux,
-				  cntx
+				  ( cntx_t* )cntx
 				);
 
 				// Accumulate to C with type-casting.
@@ -351,7 +349,7 @@ void bli_gemm_ker_var2
 				(
 				    m_cur, n_cur,
 				    &ct, rs_ct, cs_ct,
-				    beta_cast,
+				    ( void* )beta_cast,
 				    c11, rs_c, cs_c
 				);
 			}
diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c
index 6202cfffd..a283c1235 100644
--- a/frame/3/gemm/bli_gemm_md.c
+++ b/frame/3/gemm/bli_gemm_md.c
@@ -39,12 +39,12 @@
 
 void bli_gemm_md
      (
-       obj_t*   a,
-       obj_t*   b,
-       obj_t*   beta,
-       obj_t*   c,
-       cntx_t*  cntx_local,
-       cntx_t** cntx
+             obj_t*   a,
+             obj_t*   b,
+       const obj_t*   beta,
+             obj_t*   c,
+             cntx_t*  cntx_local,
+       const cntx_t** cntx
      )
 {
 	mddm_t doms;
@@ -148,12 +148,12 @@ void bli_gemm_md
 //                 cab
 mddm_t bli_gemm_md_ccr
      (
-       obj_t*   a,
-       obj_t*   b,
-       obj_t*   beta,
-       obj_t*   c,
-       cntx_t*  cntx_local,
-       cntx_t** cntx
+             obj_t*   a,
+             obj_t*   b,
+       const obj_t*   beta,
+             obj_t*   c,
+             cntx_t*  cntx_local,
+       const cntx_t** cntx
      )
 {
 	mddm_t doms;
@@ -201,48 +201,51 @@ mddm_t bli_gemm_md_ccr
 
 	// Copy the real domain blocksizes into the slots of their complex
 	// counterparts.
-	blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx );
-	blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx );
-	blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx );
-	blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx );
-	blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx );
+	blksz_t blksz_mr = *bli_cntx_get_blksz( BLIS_MR, cntx_local );
+	blksz_t blksz_nr = *bli_cntx_get_blksz( BLIS_NR, cntx_local );
+	blksz_t blksz_mc = *bli_cntx_get_blksz( BLIS_MC, cntx_local );
+	blksz_t blksz_nc = *bli_cntx_get_blksz( BLIS_NC, cntx_local );
+	blksz_t blksz_kc = *bli_cntx_get_blksz( BLIS_KC, cntx_local );
 
-	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mr, BLIS_SCOMPLEX, blksz_mr );
-	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr );
+	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_mr, BLIS_SCOMPLEX, &blksz_mr );
+	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mr, BLIS_DCOMPLEX, &blksz_mr );
 
-	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nr, BLIS_SCOMPLEX, blksz_nr );
-	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr );
+	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_nr, BLIS_SCOMPLEX, &blksz_nr );
+	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nr, BLIS_DCOMPLEX, &blksz_nr );
 
-	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mc, BLIS_SCOMPLEX, blksz_mc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc );
+	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_mc, BLIS_SCOMPLEX, &blksz_mc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mc, BLIS_DCOMPLEX, &blksz_mc );
 
-	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nc, BLIS_SCOMPLEX, blksz_nc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc );
+	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_nc, BLIS_SCOMPLEX, &blksz_nc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nc, BLIS_DCOMPLEX, &blksz_nc );
 
-	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_kc, BLIS_SCOMPLEX, blksz_kc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc );
+	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_kc, BLIS_SCOMPLEX, &blksz_kc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_kc, BLIS_DCOMPLEX, &blksz_kc );
 
 	// Halve both the real and complex MR's (which are both real MR's).
-	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_mr );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_mr );
-	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mr );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    &blksz_mr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   &blksz_mr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_mr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_mr );
 
 	// Halve both the real and complex MC's (which are both real MC's).
-	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_mc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_mc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    &blksz_mc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   &blksz_mc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_mc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_mc );
 
-	// Use the default pack schemas in the objects.
+    bli_cntx_set_blksz( BLIS_MR, &blksz_mr, BLIS_MR, cntx_local );
+    bli_cntx_set_blksz( BLIS_NR, &blksz_nr, BLIS_NR, cntx_local );
+    bli_cntx_set_blksz( BLIS_MC, &blksz_mc, BLIS_MR, cntx_local );
+    bli_cntx_set_blksz( BLIS_NC, &blksz_nc, BLIS_NR, cntx_local );
+    bli_cntx_set_blksz( BLIS_KC, &blksz_kc, BLIS_KC, cntx_local );
 
-	// static func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx )
-	func_t* l3_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, *cntx );
+	// Use the default pack schemas in the objects.
 
 	// Rather than check which complex datatype dt_comp refers to, we set
 	// the mixed-domain virtual microkernel for both types.
-	bli_func_set_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, l3_vir_ukrs );
-	bli_func_set_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, l3_vir_ukrs );
+    bli_cntx_set_ukr_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, BLIS_GEMM_VIR_UKR, cntx_local );
+    bli_cntx_set_ukr_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, BLIS_GEMM_VIR_UKR, cntx_local );
 
 	// Return the computation and execution domains.
 	return doms;
@@ -253,12 +256,12 @@ mddm_t bli_gemm_md_ccr
 //                 cab
 mddm_t bli_gemm_md_crc
      (
-       obj_t*   a,
-       obj_t*   b,
-       obj_t*   beta,
-       obj_t*   c,
-       cntx_t*  cntx_local,
-       cntx_t** cntx
+             obj_t*   a,
+             obj_t*   b,
+       const obj_t*   beta,
+             obj_t*   c,
+             cntx_t*  cntx_local,
+       const cntx_t** cntx
      )
 {
 	mddm_t doms;
@@ -306,48 +309,51 @@ mddm_t bli_gemm_md_crc
 
 	// Copy the real domain blocksizes into the slots of their complex
 	// counterparts.
-	blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx );
-	blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx );
-	blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx );
-	blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx );
-	blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx );
+	blksz_t blksz_mr = *bli_cntx_get_blksz( BLIS_MR, cntx_local );
+	blksz_t blksz_nr = *bli_cntx_get_blksz( BLIS_NR, cntx_local );
+	blksz_t blksz_mc = *bli_cntx_get_blksz( BLIS_MC, cntx_local );
+	blksz_t blksz_nc = *bli_cntx_get_blksz( BLIS_NC, cntx_local );
+	blksz_t blksz_kc = *bli_cntx_get_blksz( BLIS_KC, cntx_local );
 
-	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mr, BLIS_SCOMPLEX, blksz_mr );
-	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr );
+	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_mr, BLIS_SCOMPLEX, &blksz_mr );
+	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mr, BLIS_DCOMPLEX, &blksz_mr );
 
-	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nr, BLIS_SCOMPLEX, blksz_nr );
-	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr );
+	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_nr, BLIS_SCOMPLEX, &blksz_nr );
+	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nr, BLIS_DCOMPLEX, &blksz_nr );
 
-	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mc, BLIS_SCOMPLEX, blksz_mc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc );
+	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_mc, BLIS_SCOMPLEX, &blksz_mc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mc, BLIS_DCOMPLEX, &blksz_mc );
 
-	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nc, BLIS_SCOMPLEX, blksz_nc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc );
+	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_nc, BLIS_SCOMPLEX, &blksz_nc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nc, BLIS_DCOMPLEX, &blksz_nc );
 
-	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_kc, BLIS_SCOMPLEX, blksz_kc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc );
+	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_kc, BLIS_SCOMPLEX, &blksz_kc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_kc, BLIS_DCOMPLEX, &blksz_kc );
 
 	// Halve both the real and complex NR's (which are both real NR's).
-	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_nr );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_nr );
-	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_nr );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_nr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    &blksz_nr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   &blksz_nr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_nr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_nr );
 
 	// Halve both the real and complex NC's (which are both real NC's).
-	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_nc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_nc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_nc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_nc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    &blksz_nc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   &blksz_nc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_nc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_nc );
 
-	// Use the default pack schemas in the objects.
+    bli_cntx_set_blksz( BLIS_MR, &blksz_mr, BLIS_MR, cntx_local );
+    bli_cntx_set_blksz( BLIS_NR, &blksz_nr, BLIS_NR, cntx_local );
+    bli_cntx_set_blksz( BLIS_MC, &blksz_mc, BLIS_MR, cntx_local );
+    bli_cntx_set_blksz( BLIS_NC, &blksz_nc, BLIS_NR, cntx_local );
+    bli_cntx_set_blksz( BLIS_KC, &blksz_kc, BLIS_KC, cntx_local );
 
-	// static func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx )
-	func_t* l3_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, *cntx );
+	// Use the default pack schemas in the objects.
 
 	// Rather than check which complex datatype dt_comp refers to, we set
 	// the mixed-domain virtual microkernel for both types.
-	bli_func_set_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, l3_vir_ukrs );
-	bli_func_set_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, l3_vir_ukrs );
+    bli_cntx_set_ukr_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, BLIS_GEMM_VIR_UKR, cntx_local );
+    bli_cntx_set_ukr_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, BLIS_GEMM_VIR_UKR, cntx_local );
 
 	// Return the computation and execution domains.
 	return doms;
@@ -358,12 +364,12 @@ mddm_t bli_gemm_md_crc
 //                 cab
 mddm_t bli_gemm_md_rcc
      (
-       obj_t*   a,
-       obj_t*   b,
-       obj_t*   beta,
-       obj_t*   c,
-       cntx_t*  cntx_local,
-       cntx_t** cntx
+             obj_t*   a,
+             obj_t*   b,
+       const obj_t*   beta,
+             obj_t*   c,
+             cntx_t*  cntx_local,
+       const cntx_t** cntx
      )
 {
 	mddm_t doms;
@@ -384,32 +390,38 @@ mddm_t bli_gemm_md_rcc
 
 	// Copy the real domain blocksizes into the slots of their complex
 	// counterparts.
-	blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx );
-	blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx );
-	blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx );
-	blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx );
-	blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx );
+	blksz_t blksz_mr = *bli_cntx_get_blksz( BLIS_MR, cntx_local );
+	blksz_t blksz_nr = *bli_cntx_get_blksz( BLIS_NR, cntx_local );
+	blksz_t blksz_mc = *bli_cntx_get_blksz( BLIS_MC, cntx_local );
+	blksz_t blksz_nc = *bli_cntx_get_blksz( BLIS_NC, cntx_local );
+	blksz_t blksz_kc = *bli_cntx_get_blksz( BLIS_KC, cntx_local );
 
-	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mr, BLIS_SCOMPLEX, blksz_mr );
-	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr );
+	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_mr, BLIS_SCOMPLEX, &blksz_mr );
+	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mr, BLIS_DCOMPLEX, &blksz_mr );
 
-	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nr, BLIS_SCOMPLEX, blksz_nr );
-	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr );
+	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_nr, BLIS_SCOMPLEX, &blksz_nr );
+	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nr, BLIS_DCOMPLEX, &blksz_nr );
 
-	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mc, BLIS_SCOMPLEX, blksz_mc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc );
+	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_mc, BLIS_SCOMPLEX, &blksz_mc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mc, BLIS_DCOMPLEX, &blksz_mc );
 
-	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nc, BLIS_SCOMPLEX, blksz_nc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc );
+	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_nc, BLIS_SCOMPLEX, &blksz_nc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nc, BLIS_DCOMPLEX, &blksz_nc );
 
-	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_kc, BLIS_SCOMPLEX, blksz_kc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc );
+	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_kc, BLIS_SCOMPLEX, &blksz_kc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_kc, BLIS_DCOMPLEX, &blksz_kc );
 
 	// Halve both the real and complex KC's (which are both real KC's).
-	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_kc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_kc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_kc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_kc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    &blksz_kc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   &blksz_kc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_kc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_kc );
+
+    bli_cntx_set_blksz( BLIS_MR, &blksz_mr, BLIS_MR, cntx_local );
+    bli_cntx_set_blksz( BLIS_NR, &blksz_nr, BLIS_NR, cntx_local );
+    bli_cntx_set_blksz( BLIS_MC, &blksz_mc, BLIS_MR, cntx_local );
+    bli_cntx_set_blksz( BLIS_NC, &blksz_nc, BLIS_NR, cntx_local );
+    bli_cntx_set_blksz( BLIS_KC, &blksz_kc, BLIS_KC, cntx_local );
 
 	// Use the 1r pack schema for both A and B with the conjugation
 	// of A or B toggled (to produce ar * br - ai * bi).
@@ -427,14 +439,14 @@ mddm_t bli_gemm_md_rcc
 	// the target datatype. (The packm_blk_var1_md() function has "built-in"
 	// support for packing to 1r (and 1e) schemas, whereas the
 	// packm_blk_var1() function relies on packm kernels for packing to 1r.
-	const num_t dt_complex = bli_obj_dt( a );
-	cntx_t* cntx_1m = bli_gks_query_ind_cntx( BLIS_1M, dt_complex );
+	const num_t   dt_complex = bli_obj_dt( a );
+	const cntx_t* cntx_1m    = bli_gks_query_ind_cntx( BLIS_1M, dt_complex );
 
-	func_t* cntx_funcs    = bli_cntx_ukrs_buf( *cntx );
-	func_t* cntx_1m_funcs = bli_cntx_ukrs_buf( cntx_1m );
+	const func_t* packm_1m_mr = bli_cntx_get_ukrs( BLIS_PACKM_MRXK_KER, cntx_1m );
+	const func_t* packm_1m_nr = bli_cntx_get_ukrs( BLIS_PACKM_NRXK_KER, cntx_1m );
 
-	cntx_funcs[ BLIS_PACKM_MRXK_KER ] = cntx_1m_funcs[ BLIS_PACKM_MRXK_KER ];
-	cntx_funcs[ BLIS_PACKM_NRXK_KER ] = cntx_1m_funcs[ BLIS_PACKM_NRXK_KER ];
+    bli_cntx_set_ukr( BLIS_PACKM_MRXK_KER, packm_1m_mr, cntx_local );
+    bli_cntx_set_ukr( BLIS_PACKM_NRXK_KER, packm_1m_nr, cntx_local );
 
 	// Return the computation and execution domains.
 	return doms;
@@ -445,12 +457,12 @@ mddm_t bli_gemm_md_rcc
 //                 cab
 mddm_t bli_gemm_md_crr
      (
-       obj_t*   a,
-       obj_t*   b,
-       obj_t*   beta,
-       obj_t*   c,
-       cntx_t*  cntx_local,
-       cntx_t** cntx
+             obj_t*   a,
+             obj_t*   b,
+       const obj_t*   beta,
+             obj_t*   c,
+             cntx_t*  cntx_local,
+       const cntx_t** cntx
      )
 {
 	mddm_t doms;
@@ -502,12 +514,12 @@ mddm_t bli_gemm_md_crr
 //                 cab
 mddm_t bli_gemm_md_rcr
      (
-       obj_t*   a,
-       obj_t*   b,
-       obj_t*   beta,
-       obj_t*   c,
-       cntx_t*  cntx_local,
-       cntx_t** cntx
+             obj_t*   a,
+             obj_t*   b,
+       const obj_t*   beta,
+             obj_t*   c,
+             cntx_t*  cntx_local,
+       const cntx_t** cntx
      )
 {
 	mddm_t doms;
@@ -540,12 +552,12 @@ mddm_t bli_gemm_md_rcr
 //                 cab
 mddm_t bli_gemm_md_rrc
      (
-       obj_t*   a,
-       obj_t*   b,
-       obj_t*   beta,
-       obj_t*   c,
-       cntx_t*  cntx_local,
-       cntx_t** cntx
+             obj_t*   a,
+             obj_t*   b,
+       const obj_t*   beta,
+             obj_t*   c,
+             cntx_t*  cntx_local,
+       const cntx_t** cntx
      )
 {
 	mddm_t doms;
@@ -578,12 +590,12 @@ mddm_t bli_gemm_md_rrc
 //                 cab
 mddm_t bli_gemm_md_rrr
      (
-       obj_t*   a,
-       obj_t*   b,
-       obj_t*   beta,
-       obj_t*   c,
-       cntx_t*  cntx_local,
-       cntx_t** cntx
+             obj_t*   a,
+             obj_t*   b,
+       const obj_t*   beta,
+             obj_t*   c,
+             cntx_t*  cntx_local,
+       const cntx_t** cntx
      )
 {
 	mddm_t doms;
@@ -608,12 +620,12 @@ mddm_t bli_gemm_md_rrr
 //                 cab
 mddm_t bli_gemm_md_ccc
      (
-       obj_t*   a,
-       obj_t*   b,
-       obj_t*   beta,
-       obj_t*   c,
-       cntx_t*  cntx_local,
-       cntx_t** cntx
+             obj_t*   a,
+             obj_t*   b,
+       const obj_t*   beta,
+             obj_t*   c,
+             cntx_t*  cntx_local,
+       const cntx_t** cntx
      )
 {
 	mddm_t doms;
diff --git a/frame/3/gemm/bli_gemm_md.h b/frame/3/gemm/bli_gemm_md.h
index 751e271ea..d71d97987 100644
--- a/frame/3/gemm/bli_gemm_md.h
+++ b/frame/3/gemm/bli_gemm_md.h
@@ -43,51 +43,51 @@ typedef struct mddm_s
 
 void bli_gemm_md
      (
-       obj_t*   a,
-       obj_t*   b,
-       obj_t*   beta,
-       obj_t*   c,
-       cntx_t*  cntx_local,
-       cntx_t** cntx
+             obj_t*   a,
+             obj_t*   b,
+       const obj_t*   beta,
+             obj_t*   c,
+             cntx_t*  cntx_local,
+       const cntx_t** cntx
      );
-mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
-mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
-mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
-mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
-mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
-mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
-mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
-mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
+mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx );
+mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx );
+mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx );
+mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx );
+mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx );
+mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx );
+mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx );
+mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx );
 
 // -----------------------------------------------------------------------------
 
 void bli_gemm_md_front
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      );
 
 void bli_gemm_md_zgemm
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      );
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c )
+BLIS_INLINE bool bli_gemm_md_is_crr( const obj_t* a, const obj_t* b, const obj_t* c )
 {
 	bool r_val = FALSE;
 
@@ -107,7 +107,7 @@ BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c )
 	return r_val;
 }
 
-BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c )
+BLIS_INLINE bool bli_gemm_md_is_ccr( const obj_t* a, const obj_t* b, const obj_t* c )
 {
 	bool r_val = FALSE;
 
@@ -127,7 +127,7 @@ BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c )
 	return r_val;
 }
 
-BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c )
+BLIS_INLINE bool bli_gemm_md_is_crc( const obj_t* a, const obj_t* b, const obj_t* c )
 {
 	bool r_val = FALSE;
 
@@ -151,17 +151,17 @@ BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c )
 
 BLIS_INLINE void bli_gemm_md_ker_var2_recast
      (
-       num_t* dt_comp,
-       num_t  dt_a,
-       num_t  dt_b,
-       num_t* dt_c,
-       dim_t* m,
-       dim_t* n,
-       dim_t* k,
-       inc_t* pd_a, inc_t* ps_a,
-       inc_t* pd_b, inc_t* ps_b,
-       obj_t* c,
-       inc_t* rs_c, inc_t* cs_c
+             num_t* dt_comp,
+             num_t  dt_a,
+             num_t  dt_b,
+             num_t* dt_c,
+             dim_t* m,
+             dim_t* n,
+             dim_t* k,
+             inc_t* pd_a, inc_t* ps_a,
+             inc_t* pd_b, inc_t* ps_b,
+       const obj_t* c,
+             inc_t* rs_c, inc_t* cs_c
      )
 {
 	if      ( bli_is_real( *dt_c )    &&
diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.c b/frame/3/gemm/bli_gemm_md_c2r_ref.c
index a4797ad4f..086a3b1df 100644
--- a/frame/3/gemm/bli_gemm_md_c2r_ref.c
+++ b/frame/3/gemm/bli_gemm_md_c2r_ref.c
@@ -49,8 +49,8 @@ void PASTEMAC2(ch,opname,suf) \
        ctype*     restrict b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
 	const num_t       dt        = PASTEMAC(ch,type); \
diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h
index 888181bad..d3109e600 100644
--- a/frame/3/gemm/bli_gemm_var.h
+++ b/frame/3/gemm/bli_gemm_var.h
@@ -53,13 +53,13 @@ typedef struct
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       cntl_t* cntl, \
-       thrinfo_t* thread  \
+       const obj_t*  a, \
+       const obj_t*  b, \
+       const obj_t*  c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm, \
+             cntl_t* cntl, \
+             thrinfo_t* thread  \
      );
 
 GENPROT( gemm_blk_var1 )
diff --git a/frame/3/gemm/ind/bli_gemm_ind_opt.h b/frame/3/gemm/ind/bli_gemm_ind_opt.h
index 52ea81a5e..789d5895c 100644
--- a/frame/3/gemm/ind/bli_gemm_ind_opt.h
+++ b/frame/3/gemm/ind/bli_gemm_ind_opt.h
@@ -34,16 +34,16 @@
 
 BLIS_INLINE void bli_gemm_ind_recast_1m_params
      (
-       num_t* dt_exec,
-       num_t* dt_c,
-       pack_t schema_a,
-       obj_t* c,
-       dim_t* m,
-       dim_t* n,
-       dim_t* k,
-       inc_t* pd_a, inc_t* ps_a,
-       inc_t* pd_b, inc_t* ps_b,
-       inc_t* rs_c, inc_t* cs_c
+             num_t* dt_exec,
+             num_t* dt_c,
+             pack_t schema_a,
+       const obj_t* c,
+             dim_t* m,
+             dim_t* n,
+             dim_t* k,
+             inc_t* pd_a, inc_t* ps_a,
+             inc_t* pd_b, inc_t* ps_b,
+             inc_t* rs_c, inc_t* cs_c
      )
 {
 	obj_t beta;
diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c
index d53838470..e291b5f27 100644
--- a/frame/3/gemmt/bli_gemmt_front.c
+++ b/frame/3/gemmt/bli_gemmt_front.c
@@ -37,14 +37,14 @@
 
 void bli_gemmt_front
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      )
 {
 	bli_init_once();
diff --git a/frame/3/gemmt/bli_gemmt_front.h b/frame/3/gemmt/bli_gemmt_front.h
index c5967f8b8..0f2a9ada2 100644
--- a/frame/3/gemmt/bli_gemmt_front.h
+++ b/frame/3/gemmt/bli_gemmt_front.h
@@ -35,12 +35,12 @@
 
 void bli_gemmt_front
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      );
diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
index 3aedc6e9a..aed0359ec 100644
--- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
@@ -62,81 +62,74 @@ static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2);
 
 void bli_gemmt_l_ker_var2
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffc  = bli_obj_diag_offset( c );
+	const num_t  dt_exec   = bli_obj_exec_dt( c );
 
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
+	const doff_t diagoffc  = bli_obj_diag_offset( c );
 
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	inc_t     is_a      = bli_obj_imag_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
+	const dim_t  m         = bli_obj_length( c );
+	const dim_t  n         = bli_obj_width( c );
+	const dim_t  k         = bli_obj_width( a );
 
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	inc_t     is_b      = bli_obj_imag_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const inc_t  is_a      = bli_obj_imag_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const inc_t  is_b      = bli_obj_imag_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
 
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// Detach and multiply the scalars attached to A and B.
+	obj_t  scalar_a, scalar_b;
 	bli_obj_scalar_detach( a, &scalar_a );
 	bli_obj_scalar_detach( b, &scalar_b );
 	bli_mulsc( &scalar_a, &scalar_b );
 
 	// Grab the addresses of the internal scalar buffers for the scalar
 	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
 	// Index into the type combination array to extract the correct
 	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffc,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, is_a,
-	          pd_a, ps_a,
-	   buf_b, rs_b, is_b,
-	          pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
+	ftypes[dt_exec]
+	(
+	  diagoffc,
+	  schema_a,
+	  schema_b,
+	  m,
+	  n,
+	  k,
+	  ( void* )buf_alpha,
+	  ( void* )buf_a, cs_a, is_a,
+	                  pd_a, ps_a,
+	  ( void* )buf_b, rs_b, is_b,
+	                  pd_b, ps_b,
+	  ( void* )buf_beta,
+	           buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx,
+	  rntm,
+	  thread
+	);
 }
 
 
diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
index b3a9fe8a1..87d77ee55 100644
--- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
@@ -62,81 +62,74 @@ static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2);
 
 void bli_gemmt_u_ker_var2
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffc  = bli_obj_diag_offset( c );
+	const num_t  dt_exec   = bli_obj_exec_dt( c );
 
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
+	const doff_t diagoffc  = bli_obj_diag_offset( c );
 
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	inc_t     is_a      = bli_obj_imag_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
+	const dim_t  m         = bli_obj_length( c );
+	const dim_t  n         = bli_obj_width( c );
+	const dim_t  k         = bli_obj_width( a );
 
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	inc_t     is_b      = bli_obj_imag_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const inc_t  is_a      = bli_obj_imag_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const inc_t  is_b      = bli_obj_imag_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
 
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// Detach and multiply the scalars attached to A and B.
+	obj_t  scalar_a, scalar_b;
 	bli_obj_scalar_detach( a, &scalar_a );
 	bli_obj_scalar_detach( b, &scalar_b );
 	bli_mulsc( &scalar_a, &scalar_b );
 
 	// Grab the addresses of the internal scalar buffers for the scalar
 	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
 	// Index into the type combination array to extract the correct
 	// function pointer.
-    f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffc,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, is_a,
-	          pd_a, ps_a,
-	   buf_b, rs_b, is_b,
-	          pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
+	ftypes[dt_exec]
+	(
+	  diagoffc,
+	  schema_a,
+	  schema_b,
+	  m,
+	  n,
+	  k,
+	  ( void* )buf_alpha,
+	  ( void* )buf_a, cs_a, is_a,
+	                  pd_a, ps_a,
+	  ( void* )buf_b, rs_b, is_b,
+	                  pd_b, ps_b,
+	  ( void* )buf_beta,
+	           buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx,
+	  rntm,
+	  thread
+	);
 }
 
 
diff --git a/frame/3/gemmt/bli_gemmt_var.h b/frame/3/gemmt/bli_gemmt_var.h
index 60c68c9f5..98d8f5563 100644
--- a/frame/3/gemmt/bli_gemmt_var.h
+++ b/frame/3/gemmt/bli_gemmt_var.h
@@ -43,13 +43,13 @@
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  a, \
-       obj_t*  ah, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       cntl_t* cntl, \
-       thrinfo_t* thread  \
+       const obj_t*  a, \
+       const obj_t*  ah, \
+       const obj_t*  c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm, \
+             cntl_t* cntl, \
+             thrinfo_t* thread  \
      );
 
 GENPROT( gemmt_x_ker_var2 )
diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c
index 3a1d681c3..76fe106b0 100644
--- a/frame/3/gemmt/bli_gemmt_x_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c
@@ -42,13 +42,13 @@ static l3_var_oft vars[2] =
 
 void bli_gemmt_x_ker_var2
      (
-       obj_t*  a,
-       obj_t*  ah,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  ah,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
 	dim_t      uplo;
diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c
index 15460125d..c39703503 100644
--- a/frame/3/hemm/bli_hemm_front.c
+++ b/frame/3/hemm/bli_hemm_front.c
@@ -36,15 +36,15 @@
 
 void bli_hemm_front
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      )
 {
 	bli_init_once();
diff --git a/frame/3/hemm/bli_hemm_front.h b/frame/3/hemm/bli_hemm_front.h
index 308b6378b..63eb91cd3 100644
--- a/frame/3/hemm/bli_hemm_front.h
+++ b/frame/3/hemm/bli_hemm_front.h
@@ -34,13 +34,13 @@
 
 void bli_hemm_front
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      );
diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c
index 8108b607f..c9aada989 100644
--- a/frame/3/symm/bli_symm_front.c
+++ b/frame/3/symm/bli_symm_front.c
@@ -36,15 +36,15 @@
 
 void bli_symm_front
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      )
 {
 	bli_init_once();
diff --git a/frame/3/symm/bli_symm_front.h b/frame/3/symm/bli_symm_front.h
index 909997f6c..417cb9acb 100644
--- a/frame/3/symm/bli_symm_front.h
+++ b/frame/3/symm/bli_symm_front.h
@@ -34,13 +34,13 @@
 
 void bli_symm_front
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      );
diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c
index d973b6eb6..edd4ce1ef 100644
--- a/frame/3/trmm/bli_trmm_front.c
+++ b/frame/3/trmm/bli_trmm_front.c
@@ -37,13 +37,13 @@
 
 void bli_trmm_front
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      )
 {
 	bli_init_once();
diff --git a/frame/3/trmm/bli_trmm_front.h b/frame/3/trmm/bli_trmm_front.h
index 3e136f9dc..cfefdd39b 100644
--- a/frame/3/trmm/bli_trmm_front.h
+++ b/frame/3/trmm/bli_trmm_front.h
@@ -34,11 +34,11 @@
 
 void bli_trmm_front
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      );
diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c
index 646287f93..f5476b2ca 100644
--- a/frame/3/trmm/bli_trmm_ll_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c
@@ -60,77 +60,70 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
 
 void bli_trmm_ll_ker_var2
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffa  = bli_obj_diag_offset( a );
+	const num_t     dt_exec   = bli_obj_exec_dt( c );
 
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
+	const doff_t    diagoffa  = bli_obj_diag_offset( a );
 
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
+	const pack_t    schema_a  = bli_obj_pack_schema( a );
+	const pack_t    schema_b  = bli_obj_pack_schema( b );
 
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
+	const dim_t     m         = bli_obj_length( c );
+	const dim_t     n         = bli_obj_width( c );
+	const dim_t     k         = bli_obj_width( a );
 
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*     buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t     cs_a      = bli_obj_col_stride( a );
+	const dim_t     pd_a      = bli_obj_panel_dim( a );
+	const inc_t     ps_a      = bli_obj_panel_stride( a );
 
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*     buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t     rs_b      = bli_obj_row_stride( b );
+	const dim_t     pd_b      = bli_obj_panel_dim( b );
+	const inc_t     ps_b      = bli_obj_panel_stride( b );
 
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
+	      void*     buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t     rs_c      = bli_obj_row_stride( c );
+	const inc_t     cs_c      = bli_obj_col_stride( c );
 
 	// Detach and multiply the scalars attached to A and B.
+	obj_t scalar_a, scalar_b;
 	bli_obj_scalar_detach( a, &scalar_a );
 	bli_obj_scalar_detach( b, &scalar_b );
 	bli_mulsc( &scalar_a, &scalar_b );
 
 	// Grab the addresses of the internal scalar buffers for the scalar
 	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
 	// Index into the type combination array to extract the correct
 	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffa,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, pd_a, ps_a,
-	   buf_b, rs_b, pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
+	ftypes[dt_exec]
+	(
+	  diagoffa,
+	  schema_a,
+	  schema_b,
+	  m,
+	  n,
+	  k,
+	  ( void* )buf_alpha,
+	  ( void* )buf_a, cs_a, pd_a, ps_a,
+	  ( void* )buf_b, rs_b, pd_b, ps_b,
+	  ( void* )buf_beta,
+	           buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx,
+	  rntm,
+	  thread
+	);
 }
 
 
diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c
index 9ef2a475d..df5b2dac5 100644
--- a/frame/3/trmm/bli_trmm_lu_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c
@@ -60,77 +60,70 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
 
 void bli_trmm_lu_ker_var2
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffa  = bli_obj_diag_offset( a );
+	const num_t     dt_exec   = bli_obj_exec_dt( c );
 
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
+	const doff_t    diagoffa  = bli_obj_diag_offset( a );
 
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
+	const pack_t    schema_a  = bli_obj_pack_schema( a );
+	const pack_t    schema_b  = bli_obj_pack_schema( b );
 
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
+	const dim_t     m         = bli_obj_length( c );
+	const dim_t     n         = bli_obj_width( c );
+	const dim_t     k         = bli_obj_width( a );
 
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*     buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t     cs_a      = bli_obj_col_stride( a );
+	const dim_t     pd_a      = bli_obj_panel_dim( a );
+	const inc_t     ps_a      = bli_obj_panel_stride( a );
 
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*     buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t     rs_b      = bli_obj_row_stride( b );
+	const dim_t     pd_b      = bli_obj_panel_dim( b );
+	const inc_t     ps_b      = bli_obj_panel_stride( b );
 
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
+	      void*     buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t     rs_c      = bli_obj_row_stride( c );
+	const inc_t     cs_c      = bli_obj_col_stride( c );
 
 	// Detach and multiply the scalars attached to A and B.
+	obj_t scalar_a, scalar_b;
 	bli_obj_scalar_detach( a, &scalar_a );
 	bli_obj_scalar_detach( b, &scalar_b );
 	bli_mulsc( &scalar_a, &scalar_b );
 
 	// Grab the addresses of the internal scalar buffers for the scalar
 	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
 	// Index into the type combination array to extract the correct
 	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffa,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, pd_a, ps_a,
-	   buf_b, rs_b, pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
+	ftypes[dt_exec]
+	(
+	  diagoffa,
+	  schema_a,
+	  schema_b,
+	  m,
+	  n,
+	  k,
+	  ( void* )buf_alpha,
+	  ( void* )buf_a, cs_a, pd_a, ps_a,
+	  ( void* )buf_b, rs_b, pd_b, ps_b,
+	  ( void* )buf_beta,
+	           buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx,
+	  rntm,
+	  thread
+	);
 }
 
 
diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c
index f6b20af2e..89f86aa3a 100644
--- a/frame/3/trmm/bli_trmm_rl_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c
@@ -60,77 +60,70 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
 
 void bli_trmm_rl_ker_var2
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffb  = bli_obj_diag_offset( b );
+	const num_t     dt_exec   = bli_obj_exec_dt( c );
 
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
+	const doff_t    diagoffb  = bli_obj_diag_offset( b );
 
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
+	const pack_t    schema_a  = bli_obj_pack_schema( a );
+	const pack_t    schema_b  = bli_obj_pack_schema( b );
 
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
+	const dim_t     m         = bli_obj_length( c );
+	const dim_t     n         = bli_obj_width( c );
+	const dim_t     k         = bli_obj_width( a );
 
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*     buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t     cs_a      = bli_obj_col_stride( a );
+	const dim_t     pd_a      = bli_obj_panel_dim( a );
+	const inc_t     ps_a      = bli_obj_panel_stride( a );
 
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*     buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t     rs_b      = bli_obj_row_stride( b );
+	const dim_t     pd_b      = bli_obj_panel_dim( b );
+	const inc_t     ps_b      = bli_obj_panel_stride( b );
 
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
+	      void*     buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t     rs_c      = bli_obj_row_stride( c );
+	const inc_t     cs_c      = bli_obj_col_stride( c );
 
 	// Detach and multiply the scalars attached to A and B.
+	obj_t scalar_a, scalar_b;
 	bli_obj_scalar_detach( a, &scalar_a );
 	bli_obj_scalar_detach( b, &scalar_b );
 	bli_mulsc( &scalar_a, &scalar_b );
 
 	// Grab the addresses of the internal scalar buffers for the scalar
 	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
 	// Index into the type combination array to extract the correct
 	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffb,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, pd_a, ps_a,
-	   buf_b, rs_b, pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
+	ftypes[dt_exec]
+	(
+	  diagoffb,
+	  schema_a,
+	  schema_b,
+	  m,
+	  n,
+	  k,
+	  ( void* )buf_alpha,
+	  ( void* )buf_a, cs_a, pd_a, ps_a,
+	  ( void* )buf_b, rs_b, pd_b, ps_b,
+	  ( void* )buf_beta,
+	           buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx,
+	  rntm,
+	  thread
+	);
 }
 
 
diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c
index f71fb3c4d..4ed38e761 100644
--- a/frame/3/trmm/bli_trmm_ru_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c
@@ -60,77 +60,70 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
 
 void bli_trmm_ru_ker_var2
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffb  = bli_obj_diag_offset( b );
+	const num_t     dt_exec   = bli_obj_exec_dt( c );
 
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
+	const doff_t    diagoffb  = bli_obj_diag_offset( b );
 
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
+	const pack_t    schema_a  = bli_obj_pack_schema( a );
+	const pack_t    schema_b  = bli_obj_pack_schema( b );
 
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
+	const dim_t     m         = bli_obj_length( c );
+	const dim_t     n         = bli_obj_width( c );
+	const dim_t     k         = bli_obj_width( a );
 
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*     buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t     cs_a      = bli_obj_col_stride( a );
+	const dim_t     pd_a      = bli_obj_panel_dim( a );
+	const inc_t     ps_a      = bli_obj_panel_stride( a );
 
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*     buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t     rs_b      = bli_obj_row_stride( b );
+	const dim_t     pd_b      = bli_obj_panel_dim( b );
+	const inc_t     ps_b      = bli_obj_panel_stride( b );
 
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
+	      void*     buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t     rs_c      = bli_obj_row_stride( c );
+	const inc_t     cs_c      = bli_obj_col_stride( c );
 
 	// Detach and multiply the scalars attached to A and B.
+	obj_t scalar_a, scalar_b;
 	bli_obj_scalar_detach( a, &scalar_a );
 	bli_obj_scalar_detach( b, &scalar_b );
 	bli_mulsc( &scalar_a, &scalar_b );
 
 	// Grab the addresses of the internal scalar buffers for the scalar
 	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
 	// Index into the type combination array to extract the correct
 	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffb,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, pd_a, ps_a,
-	   buf_b, rs_b, pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
+	ftypes[dt_exec]
+	(
+	  diagoffb,
+	  schema_a,
+	  schema_b,
+	  m,
+	  n,
+	  k,
+	  ( void* )buf_alpha,
+	  ( void* )buf_a, cs_a, pd_a, ps_a,
+	  ( void* )buf_b, rs_b, pd_b, ps_b,
+	  ( void* )buf_beta,
+	           buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx,
+	  rntm,
+	  thread
+	);
 }
 
 
diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h
index 262b0490f..2f0642ca8 100644
--- a/frame/3/trmm/bli_trmm_var.h
+++ b/frame/3/trmm/bli_trmm_var.h
@@ -43,13 +43,13 @@
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       cntl_t* cntl, \
-       thrinfo_t* thread  \
+       const obj_t*  a, \
+       const obj_t*  b, \
+       const obj_t*  c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm, \
+             cntl_t* cntl, \
+             thrinfo_t* thread  \
      );
 
 //GENPROT( trmm_blk_var1 )
diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c
index 898cfe242..d42bc88c2 100644
--- a/frame/3/trmm/bli_trmm_xx_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c
@@ -43,13 +43,13 @@ static l3_var_oft vars[2][2] =
 
 void bli_trmm_xx_ker_var2
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
 	dim_t      side;
diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c
index 9cd04963b..9681eb640 100644
--- a/frame/3/trmm3/bli_trmm3_front.c
+++ b/frame/3/trmm3/bli_trmm3_front.c
@@ -36,15 +36,15 @@
 
 void bli_trmm3_front
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      )
 {
 	bli_init_once();
diff --git a/frame/3/trmm3/bli_trmm3_front.h b/frame/3/trmm3/bli_trmm3_front.h
index 296b9354b..b5dde34cd 100644
--- a/frame/3/trmm3/bli_trmm3_front.h
+++ b/frame/3/trmm3/bli_trmm3_front.h
@@ -34,13 +34,13 @@
 
 void bli_trmm3_front
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      );
diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c
index 30bf6921c..79ac65c48 100644
--- a/frame/3/trsm/bli_trsm_blk_var1.c
+++ b/frame/3/trsm/bli_trsm_blk_var1.c
@@ -39,34 +39,35 @@
 
 void bli_trsm_blk_var1
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	dim_t my_start, my_end;
-	dim_t b_alg;
+	obj_t ap, cp;
+	bli_obj_alias_to( a, &ap );
+	bli_obj_alias_to( c, &cp );
 
 	// Determine the direction in which to partition (forwards or backwards).
-	dir_t direct = bli_l3_direct( a, b, c, cntl );
+	dir_t direct = bli_l3_direct( &ap, b, &cp, cntl );
 
 	// Prune any zero region that exists along the partitioning dimension.
-	bli_l3_prune_unref_mparts_m( a, b, c, cntl );
+	bli_l3_prune_unref_mparts_m( &ap, b, &cp, cntl );
 
 	// Isolate the diagonal block A11 and its corresponding row panel C1.
-	const dim_t kc = bli_obj_width_after_trans( a );
+	const dim_t kc = bli_obj_width_after_trans( &ap );
 	obj_t a11, c1;
 	bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
-	                        0, kc, a, &a11 );
+	                        0, kc, &ap, &a11 );
 	bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
-	                        0, kc, c, &c1 );
+	                        0, kc, &cp, &c1 );
 
 	// All threads iterate over the entire diagonal block A11.
-	my_start = 0; my_end = kc;
+	dim_t my_start = 0, my_end = kc;
 
 #ifdef PRINT
 	printf( "bli_trsm_blk_var1(): a11 is %d x %d at offsets (%3d, %3d)\n",
@@ -76,14 +77,14 @@ void bli_trsm_blk_var1
 #endif
 
 	// Partition along the m dimension for the trsm subproblem.
+	dim_t b_alg;
 	for ( dim_t i = my_start; i < my_end; i += b_alg )
 	{
-		obj_t a11_1, c1_1;
-
 		b_alg = bli_determine_blocksize( direct, i, my_end, &a11,
 		                                 bli_cntl_bszid( cntl ), cntx );
 
 		// Acquire partitions for A1 and C1.
+		obj_t a11_1, c1_1;
 		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
 		                        i, b_alg, &a11, &a11_1 );
 		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
@@ -124,9 +125,9 @@ void bli_trsm_blk_var1
 	// on whether we are moving forwards or backwards, respectively).
 	obj_t ax1, cx1;
 	bli_acquire_mpart_mdim( direct, BLIS_SUBPART1A,
-	                        0, kc, a, &ax1 );
+	                        0, kc, &ap, &ax1 );
 	bli_acquire_mpart_mdim( direct, BLIS_SUBPART1A,
-	                        0, kc, c, &cx1 );
+	                        0, kc, &cp, &cx1 );
 
 #ifdef PRINT
 	printf( "bli_trsm_blk_var1(): ax1 is %d x %d at offsets (%3d, %3d)\n",
@@ -139,7 +140,7 @@ void bli_trsm_blk_var1
 	bli_thread_range_mdim
 	(
 	  direct, thread, &ax1, b, &cx1, cntl, cntx,
-      &my_start, &my_end
+	  &my_start, &my_end
 	);
 
 #ifdef PRINT
@@ -149,13 +150,12 @@ void bli_trsm_blk_var1
 	// Partition along the m dimension for the gemm subproblem.
 	for ( dim_t i = my_start; i < my_end; i += b_alg )
 	{
-		obj_t a11, c1;
-
 		// Determine the current algorithmic blocksize.
 		b_alg = bli_determine_blocksize( direct, i, my_end, &ax1,
 		                                 bli_cntl_bszid( cntl ), cntx );
 
 		// Acquire partitions for A1 and C1.
+		obj_t a11, c1;
 		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
 		                        i, b_alg, &ax1, &a11 );
 		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c
index 5691c964a..88db57e51 100644
--- a/frame/3/trsm/bli_trsm_blk_var2.c
+++ b/frame/3/trsm/bli_trsm_blk_var2.c
@@ -37,44 +37,47 @@
 
 void bli_trsm_blk_var2
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	obj_t b1, c1;
-	dim_t my_start, my_end;
-	dim_t b_alg;
+	obj_t bp, cp;
+	bli_obj_alias_to( b, &bp );
+	bli_obj_alias_to( c, &cp );
 
 	// Determine the direction in which to partition (forwards or backwards).
-	dir_t direct = bli_l3_direct( a, b, c, cntl );
+	dir_t direct = bli_l3_direct( a, &bp, &cp, cntl );
 
 	// Prune any zero region that exists along the partitioning dimension.
-	bli_l3_prune_unref_mparts_n( a, b, c, cntl );
+	bli_l3_prune_unref_mparts_n( a, &bp, &cp, cntl );
 
 	// Determine the current thread's subpartition range.
+	dim_t my_start, my_end;
 	bli_thread_range_ndim
 	(
-	  direct, thread, a, b, c, cntl, cntx,
+	  direct, thread, a, &bp, &cp, cntl, cntx,
 	  &my_start, &my_end
 	);
 
 	// Partition along the n dimension.
+	dim_t b_alg;
 	for ( dim_t i = my_start; i < my_end; i += b_alg )
 	{
 		// Determine the current algorithmic blocksize.
-		b_alg = bli_determine_blocksize( direct, i, my_end, b,
+		b_alg = bli_determine_blocksize( direct, i, my_end, &bp,
 		                                 bli_cntl_bszid( cntl ), cntx );
 
 		// Acquire partitions for B1 and C1.
+		obj_t b1, c1;
 		bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
-		                        i, b_alg, b, &b1 );
+		                        i, b_alg, &bp, &b1 );
 		bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
-		                        i, b_alg, c, &c1 );
+		                        i, b_alg, &cp, &c1 );
 
 		// Perform trsm subproblem.
 		bli_l3_int
diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c
index 43fc25f16..2ff3db6f1 100644
--- a/frame/3/trsm/bli_trsm_blk_var3.c
+++ b/frame/3/trsm/bli_trsm_blk_var3.c
@@ -36,39 +36,43 @@
 
 void bli_trsm_blk_var3
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	obj_t a1, b1;
-	dim_t b_alg;
+	obj_t ap, bp, cs;
+	bli_obj_alias_to( a, &ap );
+	bli_obj_alias_to( b, &bp );
+	bli_obj_alias_to( c, &cs );
 
 	// Determine the direction in which to partition (forwards or backwards).
-	dir_t direct = bli_l3_direct( a, b, c, cntl );
+	dir_t direct = bli_l3_direct( &ap, &bp, &cs, cntl );
 
 	// Prune any zero region that exists along the partitioning dimension.
-	bli_l3_prune_unref_mparts_k( a, b, c, cntl );
+	bli_l3_prune_unref_mparts_k( &ap, &bp, &cs, cntl );
 
 	// Query dimension in partitioning direction.
-	dim_t k_trans = bli_obj_width_after_trans( a );
+	dim_t k_trans = bli_obj_width_after_trans( &ap );
 
 	// Partition along the k dimension.
+	dim_t b_alg;
 	for ( dim_t i = 0; i < k_trans; i += b_alg )
 	{
 		// Determine the current algorithmic blocksize.
-		b_alg = bli_trsm_determine_kc( direct, i, k_trans, a, b,
+		b_alg = bli_trsm_determine_kc( direct, i, k_trans, &ap, &bp,
 		                               bli_cntl_bszid( cntl ), cntx );
 
 		// Acquire partitions for A1 and B1.
+		obj_t a1, b1;
 		bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
-		                        i, b_alg, a, &a1 );
+		                        i, b_alg, &ap, &a1 );
 		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
-		                        i, b_alg, b, &b1 );
+		                        i, b_alg, &bp, &b1 );
 
 		// Perform trsm subproblem.
 		bli_l3_int
@@ -77,7 +81,7 @@ void bli_trsm_blk_var3
 		  &a1,
 		  &b1,
 		  &BLIS_ONE,
-		  c,
+		  &cs,
 		  cntx,
 		  rntm,
 		  bli_cntl_sub_node( cntl ),
@@ -92,8 +96,9 @@ void bli_trsm_blk_var3
 		// that they are only used in the first iteration.
 		if ( i == 0 )
 		{
-			bli_obj_scalar_reset( a ); bli_obj_scalar_reset( b );
-			bli_obj_scalar_reset( c );
+			bli_obj_scalar_reset( &ap );
+			bli_obj_scalar_reset( &bp );
+			bli_obj_scalar_reset( &cs );
 		}
 	}
 }
diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c
index 7f3d17aef..b94a129d9 100644
--- a/frame/3/trsm/bli_trsm_front.c
+++ b/frame/3/trsm/bli_trsm_front.c
@@ -37,13 +37,13 @@
 
 void bli_trsm_front
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      )
 {
 	bli_init_once();
diff --git a/frame/3/trsm/bli_trsm_front.h b/frame/3/trsm/bli_trsm_front.h
index 379935536..b31e88b04 100644
--- a/frame/3/trsm/bli_trsm_front.h
+++ b/frame/3/trsm/bli_trsm_front.h
@@ -35,13 +35,13 @@
 
 void bli_trsm_front
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      );
 
 #ifdef BLIS_ENABLE_SMALL_MATRIX
diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c
index 7b1133c2a..075b40336 100644
--- a/frame/3/trsm/bli_trsm_ll_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c
@@ -60,44 +60,39 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);
 
 void bli_trsm_ll_ker_var2
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	num_t     dt_exec   = bli_obj_exec_dt( c );
+	const num_t     dt_exec   = bli_obj_exec_dt( c );
 
-	doff_t    diagoffa  = bli_obj_diag_offset( a );
+	const doff_t    diagoffa  = bli_obj_diag_offset( a );
 
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
+	const pack_t    schema_a  = bli_obj_pack_schema( a );
+	const pack_t    schema_b  = bli_obj_pack_schema( b );
 
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
+	const dim_t     m         = bli_obj_length( c );
+	const dim_t     n         = bli_obj_width( c );
+	const dim_t     k         = bli_obj_width( a );
 
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
+	const void*     buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t     cs_a      = bli_obj_col_stride( a );
+	const dim_t     pd_a      = bli_obj_panel_dim( a );
+	const inc_t     ps_a      = bli_obj_panel_stride( a );
 
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*     buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t     rs_b      = bli_obj_row_stride( b );
+	const dim_t     pd_b      = bli_obj_panel_dim( b );
+	const inc_t     ps_b      = bli_obj_panel_stride( b );
 
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	void*     buf_alpha1;
-	void*     buf_alpha2;
-
-	FUNCPTR_T f;
+	      void*     buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t     rs_c      = bli_obj_row_stride( c );
+	const inc_t     cs_c      = bli_obj_col_stride( c );
 
 	// Grab the address of the internal scalar buffer for the scalar
 	// attached to B (the non-triangular matrix). This will be the alpha
@@ -105,7 +100,7 @@ void bli_trsm_ll_ker_var2
 	// be applied to the packed copy of B prior to it being updated by
 	// the trsm subproblem). This scalar may be unit, if for example it
 	// was applied during packing.
-	buf_alpha1 = bli_obj_internal_scalar_buffer( b );
+	const void* buf_alpha1 = bli_obj_internal_scalar_buffer( b );
 
 	// Grab the address of the internal scalar buffer for the scalar
 	// attached to C. This will be the "beta" scalar used in the gemm-only
@@ -113,27 +108,27 @@ void bli_trsm_ll_ker_var2
 	// the diagonal. We need this separate scalar because it's possible
 	// that the alpha attached to B was reset, if it was applied during
 	// packing.
-	buf_alpha2 = bli_obj_internal_scalar_buffer( c );
+	const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c );
 
 	// Index into the type combination array to extract the correct
 	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffa,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha1,
-	   buf_a, cs_a, pd_a, ps_a,
-	   buf_b, rs_b, pd_b, ps_b,
-	   buf_alpha2,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
+	ftypes[dt_exec]
+	(
+	  diagoffa,
+	  schema_a,
+	  schema_b,
+	  m,
+	  n,
+	  k,
+	  ( void* )buf_alpha1,
+	  ( void* )buf_a, cs_a, pd_a, ps_a,
+	  ( void* )buf_b, rs_b, pd_b, ps_b,
+	  ( void* )buf_alpha2,
+	           buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx,
+	  rntm,
+	  thread
+	);
 }
 
 
diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c
index 2059d1c9f..799fdd101 100644
--- a/frame/3/trsm/bli_trsm_lu_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c
@@ -60,44 +60,39 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);
 
 void bli_trsm_lu_ker_var2
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	num_t     dt_exec   = bli_obj_exec_dt( c );
+	const num_t     dt_exec   = bli_obj_exec_dt( c );
 
-	doff_t    diagoffa  = bli_obj_diag_offset( a );
+	const doff_t    diagoffa  = bli_obj_diag_offset( a );
 
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
+	const pack_t    schema_a  = bli_obj_pack_schema( a );
+	const pack_t    schema_b  = bli_obj_pack_schema( b );
 
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
+	const dim_t     m         = bli_obj_length( c );
+	const dim_t     n         = bli_obj_width( c );
+	const dim_t     k         = bli_obj_width( a );
 
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
+	const void*     buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t     cs_a      = bli_obj_col_stride( a );
+	const dim_t     pd_a      = bli_obj_panel_dim( a );
+	const inc_t     ps_a      = bli_obj_panel_stride( a );
 
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*     buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t     rs_b      = bli_obj_row_stride( b );
+	const dim_t     pd_b      = bli_obj_panel_dim( b );
+	const inc_t     ps_b      = bli_obj_panel_stride( b );
 
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	void*     buf_alpha1;
-	void*     buf_alpha2;
-
-	FUNCPTR_T f;
+	      void*     buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t     rs_c      = bli_obj_row_stride( c );
+	const inc_t     cs_c      = bli_obj_col_stride( c );
 
 	// Grab the address of the internal scalar buffer for the scalar
 	// attached to B (the non-triangular matrix). This will be the alpha
@@ -105,7 +100,7 @@ void bli_trsm_lu_ker_var2
 	// be applied to the packed copy of B prior to it being updated by
 	// the trsm subproblem). This scalar may be unit, if for example it
 	// was applied during packing.
-	buf_alpha1 = bli_obj_internal_scalar_buffer( b );
+	const void* buf_alpha1 = bli_obj_internal_scalar_buffer( b );
 
 	// Grab the address of the internal scalar buffer for the scalar
 	// attached to C. This will be the "beta" scalar used in the gemm-only
@@ -113,27 +108,27 @@ void bli_trsm_lu_ker_var2
 	// the diagonal. We need this separate scalar because it's possible
 	// that the alpha attached to B was reset, if it was applied during
 	// packing.
-	buf_alpha2 = bli_obj_internal_scalar_buffer( c );
+	const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c );
 
 	// Index into the type combination array to extract the correct
 	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffa,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha1,
-	   buf_a, cs_a, pd_a, ps_a,
-	   buf_b, rs_b, pd_b, ps_b,
-	   buf_alpha2,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
+	ftypes[dt_exec]
+	(
+	  diagoffa,
+	  schema_a,
+	  schema_b,
+	  m,
+	  n,
+	  k,
+	  ( void* )buf_alpha1,
+	  ( void* )buf_a, cs_a, pd_a, ps_a,
+	  ( void* )buf_b, rs_b, pd_b, ps_b,
+	  ( void* )buf_alpha2,
+	           buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx,
+	  rntm,
+	  thread
+	);
 }
 
 
diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c
index cace3622a..721203df7 100644
--- a/frame/3/trsm/bli_trsm_rl_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c
@@ -60,44 +60,39 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2);
 
 void bli_trsm_rl_ker_var2
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	num_t     dt_exec   = bli_obj_exec_dt( c );
+	const num_t     dt_exec   = bli_obj_exec_dt( c );
 
-	doff_t    diagoffb  = bli_obj_diag_offset( b );
+	const doff_t    diagoffb  = bli_obj_diag_offset( b );
 
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
+	const pack_t    schema_a  = bli_obj_pack_schema( a );
+	const pack_t    schema_b  = bli_obj_pack_schema( b );
 
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
+	const dim_t     m         = bli_obj_length( c );
+	const dim_t     n         = bli_obj_width( c );
+	const dim_t     k         = bli_obj_width( a );
 
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
+	const void*     buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t     cs_a      = bli_obj_col_stride( a );
+	const dim_t     pd_a      = bli_obj_panel_dim( a );
+	const inc_t     ps_a      = bli_obj_panel_stride( a );
 
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*     buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t     rs_b      = bli_obj_row_stride( b );
+	const dim_t     pd_b      = bli_obj_panel_dim( b );
+	const inc_t     ps_b      = bli_obj_panel_stride( b );
 
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	void*     buf_alpha1;
-	void*     buf_alpha2;
-
-	FUNCPTR_T f;
+	      void*     buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t     rs_c      = bli_obj_row_stride( c );
+	const inc_t     cs_c      = bli_obj_col_stride( c );
 
 	// Grab the address of the internal scalar buffer for the scalar
 	// attached to A (the non-triangular matrix). This will be the alpha
@@ -105,7 +100,7 @@ void bli_trsm_rl_ker_var2
 	// be applied to the packed copy of A prior to it being updated by
 	// the trsm subproblem). This scalar may be unit, if for example it
 	// was applied during packing.
-	buf_alpha1 = bli_obj_internal_scalar_buffer( a );
+	const void* buf_alpha1 = bli_obj_internal_scalar_buffer( a );
 
 	// Grab the address of the internal scalar buffer for the scalar
 	// attached to C. This will be the "beta" scalar used in the gemm-only
@@ -113,27 +108,27 @@ void bli_trsm_rl_ker_var2
 	// the diagonal. We need this separate scalar because it's possible
 	// that the alpha attached to B was reset, if it was applied during
 	// packing.
-	buf_alpha2 = bli_obj_internal_scalar_buffer( c );
+	const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c );
 
 	// Index into the type combination array to extract the correct
 	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffb,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha1,
-	   buf_a, cs_a, pd_a, ps_a,
-	   buf_b, rs_b, pd_b, ps_b,
-	   buf_alpha2,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
+	ftypes[dt_exec]
+	(
+	  diagoffb,
+	  schema_a,
+	  schema_b,
+	  m,
+	  n,
+	  k,
+	  ( void* )buf_alpha1,
+	  ( void* )buf_a, cs_a, pd_a, ps_a,
+	  ( void* )buf_b, rs_b, pd_b, ps_b,
+	  ( void* )buf_alpha2,
+	           buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx,
+	  rntm,
+	  thread
+	);
 }
 
 
diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c
index 4b0c7f083..447fbf8cd 100644
--- a/frame/3/trsm/bli_trsm_ru_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c
@@ -60,44 +60,39 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2);
 
 void bli_trsm_ru_ker_var2
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
-	num_t     dt_exec   = bli_obj_exec_dt( c );
+	const num_t     dt_exec   = bli_obj_exec_dt( c );
 
-	doff_t    diagoffb  = bli_obj_diag_offset( b );
+	const doff_t    diagoffb  = bli_obj_diag_offset( b );
 
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
+	const pack_t    schema_a  = bli_obj_pack_schema( a );
+	const pack_t    schema_b  = bli_obj_pack_schema( b );
 
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
+	const dim_t     m         = bli_obj_length( c );
+	const dim_t     n         = bli_obj_width( c );
+	const dim_t     k         = bli_obj_width( a );
 
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
+	const void*     buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t     cs_a      = bli_obj_col_stride( a );
+	const dim_t     pd_a      = bli_obj_panel_dim( a );
+	const inc_t     ps_a      = bli_obj_panel_stride( a );
 
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*     buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t     rs_b      = bli_obj_row_stride( b );
+	const dim_t     pd_b      = bli_obj_panel_dim( b );
+	const inc_t     ps_b      = bli_obj_panel_stride( b );
 
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	void*     buf_alpha1;
-	void*     buf_alpha2;
-
-	FUNCPTR_T f;
+	      void*     buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t     rs_c      = bli_obj_row_stride( c );
+	const inc_t     cs_c      = bli_obj_col_stride( c );
 
 	// Grab the address of the internal scalar buffer for the scalar
 	// attached to A (the non-triangular matrix). This will be the alpha
@@ -105,7 +100,7 @@ void bli_trsm_ru_ker_var2
 	// be applied to the packed copy of A prior to it being updated by
 	// the trsm subproblem). This scalar may be unit, if for example it
 	// was applied during packing.
-	buf_alpha1 = bli_obj_internal_scalar_buffer( a );
+	const void* buf_alpha1 = bli_obj_internal_scalar_buffer( a );
 
 	// Grab the address of the internal scalar buffer for the scalar
 	// attached to C. This will be the "beta" scalar used in the gemm-only
@@ -113,27 +108,27 @@ void bli_trsm_ru_ker_var2
 	// the diagonal. We need this separate scalar because it's possible
 	// that the alpha attached to B was reset, if it was applied during
 	// packing.
-	buf_alpha2 = bli_obj_internal_scalar_buffer( c );
+	const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c );
 
 	// Index into the type combination array to extract the correct
 	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffb,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha1,
-	   buf_a, cs_a, pd_a, ps_a,
-	   buf_b, rs_b, pd_b, ps_b,
-	   buf_alpha2,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
+	ftypes[dt_exec]
+	(
+	  diagoffb,
+	  schema_a,
+	  schema_b,
+	  m,
+	  n,
+	  k,
+	  ( void* )buf_alpha1,
+	  ( void* )buf_a, cs_a, pd_a, ps_a,
+	  ( void* )buf_b, rs_b, pd_b, ps_b,
+	  ( void* )buf_alpha2,
+	           buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx,
+	  rntm,
+	  thread
+	);
 }
 
 
diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h
index 8322a8b5b..7e747b4a8 100644
--- a/frame/3/trsm/bli_trsm_var.h
+++ b/frame/3/trsm/bli_trsm_var.h
@@ -43,13 +43,13 @@
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       cntl_t* cntl, \
-       thrinfo_t* thread  \
+       const obj_t*  a, \
+       const obj_t*  b, \
+       const obj_t*  c, \
+       const cntx_t* cntx, \
+             rntm_t* rntm, \
+             cntl_t* cntl, \
+             thrinfo_t* thread  \
      );
 
 GENPROT( trsm_blk_var1 )
diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c
index c30a5828a..a0a59c0a8 100644
--- a/frame/3/trsm/bli_trsm_xx_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c
@@ -43,13 +43,13 @@ static l3_var_oft vars[2][2] =
 
 void bli_trsm_xx_ker_var2
      (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
 	dim_t      side;
diff --git a/frame/base/bli_apool.c b/frame/base/bli_apool.c
index e2d812351..a42c7103e 100644
--- a/frame/base/bli_apool.c
+++ b/frame/base/bli_apool.c
@@ -36,7 +36,7 @@
 
 void bli_apool_init
      (
-       apool_t* restrict apool
+       apool_t* apool
      )
 {
 	err_t r_val;
@@ -47,7 +47,7 @@ void bli_apool_init
 	// library initialization.
 
 	// Query the mutex from the apool_t.
-	//bli_pthread_mutex_t* restrict mutex = bli_apool_mutex( apool );
+	//bli_pthread_mutex_t* mutex = bli_apool_mutex( apool );
 
 	// Initialize the mutex.
 	//*mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
@@ -76,7 +76,7 @@ void bli_apool_init
 	const siz_t align_size = 64;
 
 	// Query the underlying pool_t from the apool_t.
-	pool_t* restrict pool = bli_apool_pool( apool );
+	pool_t* pool = bli_apool_pool( apool );
 
 	// Set the default array_t length of the apool_t.
 	bli_apool_set_def_array_len( num_elem, apool );
@@ -92,7 +92,7 @@ void bli_apool_init
 	#endif
 
 	// Allocate the block_ptrs array.
-	array_t** restrict block_ptrs
+	array_t** block_ptrs
 	=
 	bli_malloc_intl( block_ptrs_len * sizeof( array_t* ), &r_val );
 
@@ -139,8 +139,8 @@ void bli_apool_init
 
 void bli_apool_alloc_block
      (
-       siz_t              num_elem,
-       array_t** restrict array_p
+       siz_t     num_elem,
+       array_t** array_p
      )
 {
 	err_t r_val;
@@ -156,9 +156,7 @@ void bli_apool_alloc_block
 	// Allocate the array_t via the bli_fmalloc_align() wrapper, which performs
 	// alignment logic and opaquely saves the original pointer so that it can
 	// be recovered when it's time to free the block.
-	array_t* restrict array
-	=
-	bli_malloc_intl( block_size, &r_val );
+	array_t* array = bli_malloc_intl( block_size, &r_val );
 
 	// Initialize an array_t struct within the newly allocated memory region.
 	bli_array_init( num_elem, sizeof( pool_t* ), array );
@@ -169,16 +167,16 @@ void bli_apool_alloc_block
 
 void bli_apool_free_block
      (
-       array_t* restrict array
+       array_t* array
      )
 {
-	const siz_t       num_elem = bli_array_num_elem( array );
-	pool_t** restrict buf      = bli_array_buf( array );
+	const siz_t    num_elem = bli_array_num_elem( array );
+	      pool_t** buf      = bli_array_buf( array );
 
 	// Step through the array and finalize each pool_t.
 	for ( dim_t i = 0; i < num_elem; ++i )
 	{
-		pool_t* restrict pool = buf[ i ];
+		pool_t* pool = buf[ i ];
 
 		#ifdef BLIS_ENABLE_MEM_TRACING
 		printf( "bli_apool_free_block(): freeing pool_t %d within array_t.\n",
@@ -218,25 +216,25 @@ void bli_apool_free_block
 
 void bli_apool_finalize
      (
-       apool_t* restrict apool
+       apool_t* apool
      )
 {
 	// NOTE: Since the apool_t's mutex is now initialized statically, we no
 	// longer need to explicitly destroy it.
 
 	// Query the mutex from the apool_t.
-	//bli_pthread_mutex_t* restrict mutex = bli_apool_mutex( apool );
+	//bli_pthread_mutex_t* mutex = bli_apool_mutex( apool );
 
 	// Destroy the mutex.
 	//bli_pthread_mutex_destroy( mutex );
 
 	// Query the underlying pool_t and mutex from the apool_t.
-	pool_t* restrict pool = bli_apool_pool( apool );
+	pool_t* pool = bli_apool_pool( apool );
 
 	// ----------------------------------------------------------------
 
 	// Query the block_ptrs array.
-	array_t** restrict block_ptrs = bli_pool_block_ptrs( pool );
+	array_t** block_ptrs = bli_pool_block_ptrs( pool );
 
 	// Query the total number of blocks currently allocated.
 	siz_t num_blocks = bli_pool_num_blocks( pool );
@@ -270,8 +268,8 @@ void bli_apool_finalize
 
 array_t* bli_apool_checkout_array
      (
-       siz_t             n_threads,
-       apool_t* restrict apool
+       siz_t    n_threads,
+       apool_t* apool
      )
 {
 	// Acquire the apool_t's mutex.
@@ -298,10 +296,10 @@ array_t* bli_apool_checkout_array
 	// At this point, at least one array_t is guaranteed to be available.
 
 	// Query the underlying pool_t from the apool_t.
-	pool_t* restrict pool = bli_apool_pool( apool );
+	pool_t* pool = bli_apool_pool( apool );
 
 	// Query the block_ptrs array.
-	array_t** restrict block_ptrs = bli_pool_block_ptrs( pool );
+	array_t** block_ptrs = bli_pool_block_ptrs( pool );
 
 	// Query the top_index of the pool.
 	const siz_t top_index = bli_pool_top_index( pool );
@@ -313,7 +311,7 @@ array_t* bli_apool_checkout_array
 	#endif
 
 	// Select the array_t* at top_index to return to the caller.
-	array_t* restrict array = block_ptrs[ top_index ];
+	array_t* array = block_ptrs[ top_index ];
 
 	// Increment the pool's top_index.
 	bli_pool_set_top_index( top_index + 1, pool );
@@ -333,15 +331,15 @@ array_t* bli_apool_checkout_array
 
 void bli_apool_checkin_array
      (
-       array_t* restrict array,
-       apool_t* restrict apool
+       array_t* array,
+       apool_t* apool
      )
 {
 	// Acquire the apool_t's mutex.
 	bli_apool_lock( apool );
 
 	// Query the underlying pool_t from the apool_t.
-	pool_t* restrict pool = bli_apool_pool( apool );
+	pool_t* pool = bli_apool_pool( apool );
 
 	// ----------------------------------------------------------------------------
 
@@ -351,7 +349,7 @@ void bli_apool_checkin_array
 	// change.
 
 	// Query the block_ptrs array.
-	array_t** restrict block_ptrs = bli_pool_block_ptrs( pool );
+	array_t** block_ptrs = bli_pool_block_ptrs( pool );
 
 	// Query the top_index of the pool.
 	const siz_t top_index = bli_pool_top_index( pool );
@@ -376,8 +374,8 @@ void bli_apool_checkin_array
 
 pool_t* bli_apool_array_elem
      (
-       siz_t             index,
-       array_t* restrict array
+       siz_t    index,
+       array_t* array
      )
 {
 	err_t r_val;
@@ -391,8 +389,8 @@ pool_t* bli_apool_array_elem
 	// stores in the array_t are pool_t*, that means that the function is
 	// actually returning the address of a pool_t*, or pool_t**, hence the
 	// dereferencing below.
-	pool_t** restrict pool_p = bli_array_elem( index, array );
-	pool_t*           pool   = *pool_p;
+	pool_t** pool_p = bli_array_elem( index, array );
+	pool_t*  pool   = *pool_p;
 
 	// If the element is NULL, then it means a pool_t has not yet been created
 	// and allocated for the given index (thread id).
@@ -463,8 +461,8 @@ pool_t* bli_apool_array_elem
 
 void bli_apool_grow
      (
-       siz_t             num_blocks_add,
-       apool_t* restrict apool
+       siz_t    num_blocks_add,
+       apool_t* apool
      )
 {
 	err_t r_val;
@@ -473,7 +471,7 @@ void bli_apool_grow
 	if ( num_blocks_add == 0 ) return;
 
 	// Query the underlying pool_t from the apool_t.
-	pool_t* restrict pool = bli_apool_pool( apool );
+	pool_t* pool = bli_apool_pool( apool );
 
 	// Query the default initial array length from the apool_t.
 	const siz_t num_elem = bli_apool_def_array_len( apool );
@@ -499,7 +497,7 @@ void bli_apool_grow
 		const siz_t block_ptrs_len_new = 2 * block_ptrs_len_cur;
 
 		// Query the current block_ptrs array.
-		array_t** restrict block_ptrs_cur = bli_pool_block_ptrs( pool );
+		array_t** block_ptrs_cur = bli_pool_block_ptrs( pool );
 
 		#ifdef BLIS_ENABLE_MEM_TRACING
 		printf( "bli_apool_grow(): growing block_ptrs_len (%d -> %d): ",
@@ -507,7 +505,7 @@ void bli_apool_grow
 		#endif
 
 		// Allocate a new block_ptrs array.
-		array_t** restrict block_ptrs_new
+		array_t** block_ptrs_new
 		=
 		bli_malloc_intl( block_ptrs_len_new * sizeof( array_t* ), &r_val );
 
@@ -541,7 +539,7 @@ void bli_apool_grow
 	// blocks.
 
 	// Query the current block_ptrs array (which was maybe just resized).
-	array_t** restrict block_ptrs = bli_pool_block_ptrs( pool );
+	array_t** block_ptrs = bli_pool_block_ptrs( pool );
 
 	#ifdef BLIS_ENABLE_MEM_TRACING
 	printf( "bli_apool_grow(): growing apool_t (%d -> %d).\n",
diff --git a/frame/base/bli_apool.h b/frame/base/bli_apool.h
index e6e91958a..d06f79207 100644
--- a/frame/base/bli_apool.h
+++ b/frame/base/bli_apool.h
@@ -61,16 +61,14 @@ BLIS_INLINE  bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool )
 	return &(apool->mutex);
 }
 
-BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool )
+BLIS_INLINE siz_t bli_apool_def_array_len( const apool_t* pool )
 {
 	return pool->def_array_len;
 }
 
-BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool )
+BLIS_INLINE bool bli_apool_is_exhausted( const apool_t* apool )
 {
-	pool_t* restrict pool = bli_apool_pool( apool );
-
-	return bli_pool_is_exhausted( pool );
+	return bli_pool_is_exhausted( &apool->pool );
 }
 
 // apool action
@@ -96,44 +94,44 @@ BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool
 
 void bli_apool_init
      (
-       apool_t* restrict apool
+       apool_t* apool
      );
 void bli_apool_finalize
      (
-       apool_t* restrict apool
+       apool_t* apool
      );
 
 array_t* bli_apool_checkout_array
      (
-       siz_t             n_threads,
-       apool_t* restrict apool
+       siz_t    n_threads,
+       apool_t* apool
      );
 void bli_apool_checkin_array
      (
-       array_t* restrict array,
-       apool_t* restrict apool
+       array_t* array,
+       apool_t* apool
      );
 
 pool_t* bli_apool_array_elem
      (
-       siz_t             index,
-       array_t* restrict array
+       siz_t    index,
+       array_t* array
      );
 
 void bli_apool_grow
      (
-       siz_t             num_blocks_add,
-       apool_t* restrict apool
+       siz_t    num_blocks_add,
+       apool_t* apool
      );
 
 void bli_apool_alloc_block
      (
-       siz_t              num_elem,
-       array_t** restrict array_p
+       siz_t     num_elem,
+       array_t** array_p
      );
 void bli_apool_free_block
      (
-       array_t* restrict array
+       array_t* array
      );
 
 
diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c
index 54aa64d42..48b50a774 100644
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -121,7 +121,7 @@ void bli_arch_set_id( void )
 		// initialized. Query the address of an internal context data structure
 		// corresponding to req_id. This pointer will be NULL if the associated
 		// subconfig is not available.
-		cntx_t** req_cntx = bli_gks_lookup_id( req_id );
+		const cntx_t* const * req_cntx = bli_gks_lookup_id( req_id );
 
 		// This function checks the context pointer and aborts with a useful
 		// error message if the pointer is found to be NULL.
@@ -253,7 +253,7 @@ void bli_arch_set_id( void )
 // enumeration that is typedef'ed in bli_type_defs.h. That is, the
 // index order of each string should correspond to the implied/assigned
 // enum value given to the corresponding BLIS_ARCH_ value.
-static char* config_name[ BLIS_NUM_ARCHS ] =
+static const char* config_name[ BLIS_NUM_ARCHS ] =
 {
     "skx",
     "knl",
@@ -283,11 +283,11 @@ static char* config_name[ BLIS_NUM_ARCHS ] =
     "power9",
     "power7",
     "bgq",
-    
+
     "generic"
 };
 
-char* bli_arch_string( arch_t id )
+const char* bli_arch_string( arch_t id )
 {
 	return config_name[ id ];
 }
@@ -306,9 +306,9 @@ bool bli_arch_get_logging( void )
 	return arch_dolog;
 }
 
-void bli_arch_log( char* fmt, ... )
+void bli_arch_log( const char* fmt, ... )
 {
-	char prefix[] = "libblis: ";
+	const char prefix[] = "libblis: ";
 	int  n_chars  = strlen( prefix ) + strlen( fmt ) + 1;
 
 	if ( bli_arch_get_logging() && fmt )
diff --git a/frame/base/bli_arch.h b/frame/base/bli_arch.h
index 0cd55dace..08af7ae79 100644
--- a/frame/base/bli_arch.h
+++ b/frame/base/bli_arch.h
@@ -40,11 +40,11 @@ BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void );
 void bli_arch_set_id_once( void );
 void bli_arch_set_id( void );
 
-BLIS_EXPORT_BLIS char*  bli_arch_string( arch_t id );
+BLIS_EXPORT_BLIS const char*  bli_arch_string( arch_t id );
 
 void bli_arch_set_logging( bool dolog );
 bool bli_arch_get_logging( void );
-void bli_arch_log( char*, ... );
+void bli_arch_log( const char*, ... );
 
 #endif
 
diff --git a/frame/base/bli_array.c b/frame/base/bli_array.c
index 3844cd52f..ea47a0024 100644
--- a/frame/base/bli_array.c
+++ b/frame/base/bli_array.c
@@ -38,9 +38,9 @@
 
 void bli_array_init
      (
-       const siz_t       num_elem,
-       const siz_t       elem_size,
-       array_t* restrict array
+       siz_t    num_elem,
+       siz_t    elem_size,
+       array_t* array
      )
 {
 	err_t r_val;
@@ -54,7 +54,7 @@ void bli_array_init
 	const size_t array_size = num_elem * elem_size;
 
 	// Allocate the array buffer.
-	void* restrict buf = bli_malloc_intl( array_size, &r_val );
+	void* buf = bli_malloc_intl( array_size, &r_val );
 
 	// Initialize the array elements to zero. THIS IS IMPORANT because
 	// consumer threads will use the NULL-ness of the array elements to
@@ -70,8 +70,8 @@ void bli_array_init
 
 void bli_array_resize
      (
-       const siz_t       num_elem_new,
-       array_t* restrict array
+       siz_t    num_elem_new,
+       array_t* array
      )
 {
 	err_t r_val;
@@ -94,7 +94,7 @@ void bli_array_resize
 	const size_t array_size_new  = num_elem_new  * elem_size;
 
 	// Query the previous array buffer.
-	void* restrict buf_prev = bli_array_buf( array );
+	void* buf_prev = bli_array_buf( array );
 
 	#ifdef BLIS_ENABLE_MEM_TRACING
 	printf( "bli_array_resize(): allocating array [%d * %d]: ",
@@ -102,7 +102,7 @@ void bli_array_resize
 	#endif
 
 	// Allocate a new array buffer.
-	char* restrict buf_new = bli_malloc_intl( array_size_new, &r_val );
+	char* buf_new = bli_malloc_intl( array_size_new, &r_val );
 
 	// Copy the previous array contents to the new array.
 	memcpy( buf_new, buf_prev, array_size_prev );
@@ -129,7 +129,7 @@ void bli_array_resize
 
 void bli_array_finalize
      (
-       array_t* restrict array
+       array_t* array
      )
 {
 	#ifdef BLIS_ENABLE_MEM_TRACING
@@ -138,7 +138,7 @@ void bli_array_finalize
 	#endif
 
 	// Query the buffer from the array.
-	void* restrict buf = bli_array_buf( array );
+	void* buf = bli_array_buf( array );
 
 	// Free the buffer.
 	bli_free_intl( buf );
@@ -146,8 +146,8 @@ void bli_array_finalize
 
 void* bli_array_elem
      (
-       const siz_t       index,
-       array_t* restrict array
+             siz_t    index,
+       const array_t* array
      )
 {
 	// Query the number of elements in the array.
@@ -161,7 +161,7 @@ void* bli_array_elem
 
 	// Query the buffer from the array, but store it as a char* so we can use
 	// it to easily perform byte pointer arithmetic.
-	char* restrict buf = bli_array_buf( array );
+	char* buf = bli_array_buf( array );
 
 	// Advance the pointer by (index * elem_size) bytes.
 	buf += index * elem_size;
@@ -172,17 +172,19 @@ void* bli_array_elem
 
 void bli_array_set_elem
      (
-       void*    restrict elem,
-       const siz_t       index,
-       array_t* restrict array
+       void*    elem,
+       siz_t    index,
+       array_t* array
      )
 {
 	// Query the size of each element in the array.
 	const siz_t elem_size = bli_array_elem_size( array );
 
 	// Query the buffer from the array as a char*.
-	char* restrict buf = bli_array_buf( array );
+	char* buf = bli_array_buf( array );
 
+// memcpy() is the only safe way to copy data of unknown type
+#if 0
 	if ( elem_size == sizeof( void* ) )
 	{
 		#ifdef BLIS_ENABLE_MEM_TRACING
@@ -193,16 +195,19 @@ void bli_array_set_elem
 
 		// Special case: Handle elem_size = sizeof( void* ) without calling
 		// memcpy().
-		void** restrict buf_vvp  = ( void** )buf;
-		void** restrict elem_vvp = ( void** )elem;
+		void** buf_vvp  = ( void** )buf;
+		void** elem_vvp = ( void** )elem;
 
 		buf_vvp[ index ] = *elem_vvp;
 	}
 	else
 	{
+#endif
 		// General case: Copy the elem_size bytes from elem to buf at the
 		// element index specified by index.
 		memcpy( &buf[ index * elem_size ], elem, ( size_t )elem_size );
+#if 0
 	}
+#endif
 }
 
diff --git a/frame/base/bli_array.h b/frame/base/bli_array.h
index 4cb00496b..c1e6ce038 100644
--- a/frame/base/bli_array.h
+++ b/frame/base/bli_array.h
@@ -51,17 +51,17 @@ typedef struct
 
 // Array entry query
 
-BLIS_INLINE void* bli_array_buf( array_t* array )
+BLIS_INLINE void* bli_array_buf( const array_t* array )
 {
 	return array->buf;
 }
 
-BLIS_INLINE siz_t bli_array_num_elem( array_t* array )
+BLIS_INLINE siz_t bli_array_num_elem( const array_t* array )
 {
 	return array->num_elem;
 }
 
-BLIS_INLINE siz_t bli_array_elem_size( array_t* array )
+BLIS_INLINE siz_t bli_array_elem_size( const array_t* array )
 {
 	return array->elem_size;
 }
@@ -87,30 +87,30 @@ BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \
 
 void bli_array_init
      (
-       const siz_t       num_elem,
-       const siz_t       elem_size,
-       array_t* restrict array
+       siz_t    num_elem,
+       siz_t    elem_size,
+       array_t* array
      );
 void bli_array_resize
      (
-       const siz_t       num_elem_new,
-       array_t* restrict array
+       siz_t    num_elem_new,
+       array_t* array
      );
 void bli_array_finalize
      (
-       array_t* restrict array
+       array_t* array
      );
 
 void* bli_array_elem
      (
-       const siz_t       index,
-       array_t* restrict array
+             siz_t    index,
+       const array_t* array
      );
 void bli_array_set_elem
      (
-       void*    restrict elem,
-       const siz_t       index,
-       array_t* restrict array
+       void*    elem,
+       siz_t    index,
+       array_t* array
      );
 
 #endif
diff --git a/frame/base/bli_auxinfo.h b/frame/base/bli_auxinfo.h
index d8c6cbb13..166480b30 100644
--- a/frame/base/bli_auxinfo.h
+++ b/frame/base/bli_auxinfo.h
@@ -38,49 +38,49 @@
 
 // auxinfo_t field query
 
-BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai )
+BLIS_INLINE pack_t bli_auxinfo_schema_a( const auxinfo_t* ai )
 {
 	return ai->schema_a;
 }
-BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai )
+BLIS_INLINE pack_t bli_auxinfo_schema_b( const auxinfo_t* ai )
 {
 	return ai->schema_b;
 }
 
-BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai )
+BLIS_INLINE const void* bli_auxinfo_next_a( const auxinfo_t* ai )
 {
 	return ai->a_next;
 }
-BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai )
+BLIS_INLINE const void* bli_auxinfo_next_b( const auxinfo_t* ai )
 {
 	return ai->b_next;
 }
 
-BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai )
+BLIS_INLINE inc_t bli_auxinfo_is_a( const auxinfo_t* ai )
 {
 	return ai->is_a;
 }
-BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai )
+BLIS_INLINE inc_t bli_auxinfo_is_b( const auxinfo_t* ai )
 {
 	return ai->is_b;
 }
 
-BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai )
+BLIS_INLINE inc_t bli_auxinfo_ps_a( const auxinfo_t* ai )
 {
 	return ai->ps_a;
 }
-BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai )
+BLIS_INLINE inc_t bli_auxinfo_ps_b( const auxinfo_t* ai )
 {
 	return ai->ps_b;
 }
 
-BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai )
+BLIS_INLINE void_fp bli_auxinfo_ukr( const auxinfo_t* ai )
 {
-    return ai->ukr;
+	return ai->ukr;
 }
-BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai )
+BLIS_INLINE const void* bli_auxinfo_params( const auxinfo_t* ai )
 {
-    return ai->params;
+	return ai->params;
 }
 
 
@@ -95,15 +95,15 @@ BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai )
 	ai->schema_b = schema;
 }
 
-BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai )
+BLIS_INLINE void bli_auxinfo_set_next_a( const void* p, auxinfo_t* ai )
 {
 	ai->a_next = p;
 }
-BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai )
+BLIS_INLINE void bli_auxinfo_set_next_b( const void* p, auxinfo_t* ai )
 {
 	ai->b_next = p;
 }
-BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai )
+BLIS_INLINE void bli_auxinfo_set_next_ab( const void* ap, const void* bp, auxinfo_t* ai )
 {
 	ai->a_next = ap;
 	ai->b_next = bp;
@@ -129,11 +129,11 @@ BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai )
 
 BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai )
 {
-    ai->ukr = ukr;
+	ai->ukr = ukr;
 }
-BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai )
+BLIS_INLINE void bli_auxinfo_set_params( const void* params, auxinfo_t* ai )
 {
-    ai->params = params;
+	ai->params = params;
 }
 
 #endif
diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c
index 524653d74..38b4b7956 100644
--- a/frame/base/bli_blksz.c
+++ b/frame/base/bli_blksz.c
@@ -235,12 +235,12 @@ void bli_blksz_reduce_max_to
 
 dim_t bli_determine_blocksize
      (
-       dir_t   direct,
-       dim_t   i,
-       dim_t   dim,
-       obj_t*  obj,
-       bszid_t bszid,
-       cntx_t* cntx
+             dir_t   direct,
+             dim_t   i,
+             dim_t   dim,
+       const obj_t*  obj,
+             bszid_t bszid,
+       const cntx_t* cntx
      )
 {
 	if ( direct == BLIS_FWD )
@@ -251,17 +251,17 @@ dim_t bli_determine_blocksize
 
 dim_t bli_determine_blocksize_f
      (
-       dim_t   i,
-       dim_t   dim,
-       obj_t*  obj,
-       bszid_t bszid,
-       cntx_t* cntx
+             dim_t   i,
+             dim_t   dim,
+       const obj_t*  obj,
+             bszid_t bszid,
+       const cntx_t* cntx
      )
 {
-	num_t    dt;
-	blksz_t* bsize;
-	dim_t    b_alg, b_max;
-	dim_t    b_use;
+	num_t          dt;
+	const blksz_t* bsize;
+	dim_t          b_alg, b_max;
+	dim_t          b_use;
 
 	// Extract the execution datatype and use it to query the corresponding
 	// blocksize and blocksize maximum values from the blksz_t object.
@@ -277,17 +277,17 @@ dim_t bli_determine_blocksize_f
 
 dim_t bli_determine_blocksize_b
      (
-       dim_t   i,
-       dim_t   dim,
-       obj_t*  obj,
-       bszid_t bszid,
-       cntx_t* cntx
+             dim_t   i,
+             dim_t   dim,
+       const obj_t*  obj,
+             bszid_t bszid,
+       const cntx_t* cntx
      )
 {
-	num_t    dt;
-	blksz_t* bsize;
-	dim_t    b_alg, b_max;
-	dim_t    b_use;
+	num_t          dt;
+	const blksz_t* bsize;
+	dim_t          b_alg, b_max;
+	dim_t          b_use;
 
 	// Extract the execution datatype and use it to query the corresponding
 	// blocksize and blocksize maximum values from the blksz_t object.
diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h
index 2e0fefeae..d91c0542d 100644
--- a/frame/base/bli_blksz.h
+++ b/frame/base/bli_blksz.h
@@ -36,8 +36,8 @@
 
 BLIS_INLINE dim_t bli_blksz_get_def
      (
-       num_t    dt,
-       blksz_t* b
+             num_t    dt,
+       const blksz_t* b
      )
 {
 	return b->v[ dt ];
@@ -45,8 +45,8 @@ BLIS_INLINE dim_t bli_blksz_get_def
 
 BLIS_INLINE dim_t bli_blksz_get_max
      (
-       num_t    dt,
-       blksz_t* b
+             num_t    dt,
+       const blksz_t* b
      )
 {
 	return b->e[ dt ];
@@ -77,8 +77,8 @@ BLIS_INLINE void bli_blksz_set_max
 
 BLIS_INLINE void bli_blksz_copy
      (
-       blksz_t* b_src,
-       blksz_t* b_dst
+       const blksz_t* b_src,
+             blksz_t* b_dst
      )
 {
 	*b_dst = *b_src;
@@ -86,8 +86,8 @@ BLIS_INLINE void bli_blksz_copy
 
 BLIS_INLINE void bli_blksz_copy_if_pos
      (
-       blksz_t* b_src,
-       blksz_t* b_dst
+       const blksz_t* b_src,
+             blksz_t* b_dst
      )
 {
 	// Copy the blocksize values over to b_dst one-by-one so that
@@ -116,8 +116,8 @@ BLIS_INLINE void bli_blksz_copy_if_pos
 
 BLIS_INLINE void bli_blksz_copy_def_dt
      (
-       num_t dt_src, blksz_t* b_src,
-       num_t dt_dst, blksz_t* b_dst
+       num_t dt_src, const blksz_t* b_src,
+       num_t dt_dst,       blksz_t* b_dst
      )
 {
 	const dim_t val = bli_blksz_get_def( dt_src, b_src );
@@ -127,8 +127,8 @@ BLIS_INLINE void bli_blksz_copy_def_dt
 
 BLIS_INLINE void bli_blksz_copy_max_dt
      (
-       num_t dt_src, blksz_t* b_src,
-       num_t dt_dst, blksz_t* b_dst
+       num_t dt_src, const blksz_t* b_src,
+       num_t dt_dst,       blksz_t* b_dst
      )
 {
 	const dim_t val = bli_blksz_get_max( dt_src, b_src );
@@ -138,8 +138,8 @@ BLIS_INLINE void bli_blksz_copy_max_dt
 
 BLIS_INLINE void bli_blksz_copy_dt
      (
-       num_t dt_src, blksz_t* b_src,
-       num_t dt_dst, blksz_t* b_dst
+       num_t dt_src, const blksz_t* b_src,
+       num_t dt_dst,       blksz_t* b_dst
      )
 {
 	bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst );
@@ -252,30 +252,30 @@ void bli_blksz_reduce_max_to
 
 dim_t bli_determine_blocksize
      (
-       dir_t   direct,
-       dim_t   i,
-       dim_t   dim,
-       obj_t*  obj,
-       bszid_t bszid,
-       cntx_t* cntx
+             dir_t   direct,
+             dim_t   i,
+             dim_t   dim,
+       const obj_t*  obj,
+             bszid_t bszid,
+       const cntx_t* cntx
      );
 
 dim_t bli_determine_blocksize_f
      (
-       dim_t   i,
-       dim_t   dim,
-       obj_t*  obj,
-       bszid_t bszid,
-       cntx_t* cntx
+             dim_t   i,
+             dim_t   dim,
+       const obj_t*  obj,
+             bszid_t bszid,
+       const cntx_t* cntx
      );
 
 dim_t bli_determine_blocksize_b
      (
-       dim_t   i,
-       dim_t   dim,
-       obj_t*  obj,
-       bszid_t bszid,
-       cntx_t* cntx
+             dim_t   i,
+             dim_t   dim,
+       const obj_t*  obj,
+             bszid_t bszid,
+       const cntx_t* cntx
      );
 
 dim_t bli_determine_blocksize_f_sub
diff --git a/frame/base/bli_check.c b/frame/base/bli_check.c
index e76314036..16c418b49 100644
--- a/frame/base/bli_check.c
+++ b/frame/base/bli_check.c
@@ -37,7 +37,7 @@
 
 // -- General stuff ------------------------------------------------------------
 
-err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line )
+err_t bli_check_error_code_helper( gint_t code, const char* file, guint_t line )
 {
 	if ( code == BLIS_SUCCESS ) return code;
 
@@ -68,7 +68,7 @@ err_t bli_check_valid_error_level( errlev_t level )
 	return e_val;
 }
 
-err_t bli_check_null_pointer( void* ptr )
+err_t bli_check_null_pointer( const void* ptr )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -128,7 +128,7 @@ err_t bli_check_valid_diag( diag_t diag )
 	return e_val;
 }
 
-err_t bli_check_nonunit_diag( obj_t* a )
+err_t bli_check_nonunit_diag( const obj_t* a )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -155,7 +155,7 @@ err_t bli_check_valid_datatype( num_t dt )
 	return e_val;
 }
 
-err_t bli_check_object_valid_datatype( obj_t* a )
+err_t bli_check_object_valid_datatype( const obj_t* a )
 {
 	err_t e_val;
 	num_t dt;
@@ -176,7 +176,7 @@ err_t bli_check_noninteger_datatype( num_t dt )
 	return e_val;
 }
 
-err_t bli_check_noninteger_object( obj_t* a )
+err_t bli_check_noninteger_object( const obj_t* a )
 {
 	err_t e_val;
 	num_t dt;
@@ -197,7 +197,7 @@ err_t bli_check_nonconstant_datatype( num_t dt )
 	return e_val;
 }
 
-err_t bli_check_nonconstant_object( obj_t* a )
+err_t bli_check_nonconstant_object( const obj_t* a )
 {
 	err_t e_val;
 	num_t dt;
@@ -221,7 +221,7 @@ err_t bli_check_floating_datatype( num_t dt )
 	return e_val;
 }
 
-err_t bli_check_floating_object( obj_t* a )
+err_t bli_check_floating_object( const obj_t* a )
 {
 	err_t e_val;
 	num_t dt;
@@ -243,7 +243,7 @@ err_t bli_check_real_datatype( num_t dt )
 	return e_val;
 }
 
-err_t bli_check_real_object( obj_t* a )
+err_t bli_check_real_object( const obj_t* a )
 {
 	err_t e_val;
 	num_t dt;
@@ -264,7 +264,7 @@ err_t bli_check_integer_datatype( num_t dt )
 	return e_val;
 }
 
-err_t bli_check_integer_object( obj_t* a )
+err_t bli_check_integer_object( const obj_t* a )
 {
 	err_t e_val;
 	num_t dt;
@@ -287,7 +287,7 @@ err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b )
 	return e_val;
 }
 
-err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b )
+err_t bli_check_consistent_object_datatypes( const obj_t* a, const obj_t* b )
 {
 	err_t e_val;
 	num_t dt_a;
@@ -315,7 +315,7 @@ err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r )
 	return e_val;
 }
 
-err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r )
+err_t bli_check_object_real_proj_of( const obj_t* c, const obj_t* r )
 {
 	err_t e_val;
 	num_t dt_c;
@@ -329,7 +329,7 @@ err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r )
 	return e_val;
 }
 
-err_t bli_check_real_valued_object( obj_t* a )
+err_t bli_check_real_valued_object( const obj_t* a )
 {
 	err_t  e_val = BLIS_SUCCESS;
 	double a_real;
@@ -363,7 +363,7 @@ err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b )
 	return e_val;
 }
 
-err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b )
+err_t bli_check_consistent_object_precisions( const obj_t* a, const obj_t* b )
 {
 	err_t e_val;
 	num_t dt_a;
@@ -379,7 +379,7 @@ err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b )
 
 // -- Dimension-related checks -------------------------------------------------
 
-err_t bli_check_conformal_dims( obj_t* a, obj_t* b )
+err_t bli_check_conformal_dims( const obj_t* a, const obj_t* b )
 {
 	err_t e_val = BLIS_SUCCESS;
 	dim_t m_a, n_a;
@@ -396,7 +396,7 @@ err_t bli_check_conformal_dims( obj_t* a, obj_t* b )
 	return e_val;
 }
 
-err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c )
+err_t bli_check_level3_dims( const obj_t* a, const obj_t* b, const obj_t* c )
 {
 	err_t e_val = BLIS_SUCCESS;
 	dim_t m_c, n_c;
@@ -420,7 +420,7 @@ err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c )
 	return e_val;
 }
 
-err_t bli_check_scalar_object( obj_t* a )
+err_t bli_check_scalar_object( const obj_t* a )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -435,7 +435,7 @@ err_t bli_check_scalar_object( obj_t* a )
 	return e_val;
 }
 
-err_t bli_check_vector_object( obj_t* a )
+err_t bli_check_vector_object( const obj_t* a )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -449,7 +449,7 @@ err_t bli_check_vector_object( obj_t* a )
 	return e_val;
 }
 
-err_t bli_check_matrix_object( obj_t* a )
+err_t bli_check_matrix_object( const obj_t* a )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -460,7 +460,7 @@ err_t bli_check_matrix_object( obj_t* a )
 	return e_val;
 }
 
-err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y )
+err_t bli_check_equal_vector_lengths( const obj_t* x, const obj_t* y )
 {
 	err_t e_val = BLIS_SUCCESS;
 	dim_t dim_x;
@@ -475,7 +475,7 @@ err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y )
 	return e_val;
 }
 
-err_t bli_check_square_object( obj_t* a )
+err_t bli_check_square_object( const obj_t* a )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -485,7 +485,7 @@ err_t bli_check_square_object( obj_t* a )
 	return e_val;
 }
 
-err_t bli_check_object_length_equals( obj_t* a, dim_t m )
+err_t bli_check_object_length_equals( const obj_t* a, dim_t m )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -495,7 +495,7 @@ err_t bli_check_object_length_equals( obj_t* a, dim_t m )
 	return e_val;
 }
 
-err_t bli_check_object_width_equals( obj_t* a, dim_t n )
+err_t bli_check_object_width_equals( const obj_t* a, dim_t n )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -505,7 +505,7 @@ err_t bli_check_object_width_equals( obj_t* a, dim_t n )
 	return e_val;
 }
 
-err_t bli_check_vector_dim_equals( obj_t* a, dim_t n )
+err_t bli_check_vector_dim_equals( const obj_t* a, dim_t n )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -515,7 +515,7 @@ err_t bli_check_vector_dim_equals( obj_t* a, dim_t n )
 	return e_val;
 }
 
-err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset )
+err_t bli_check_object_diag_offset_equals( const obj_t* a, doff_t offset )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -612,7 +612,7 @@ err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is )
 
 // -- Structure-related checks -------------------------------------------------
 
-err_t bli_check_general_object( obj_t* a )
+err_t bli_check_general_object( const obj_t* a )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -622,7 +622,7 @@ err_t bli_check_general_object( obj_t* a )
 	return e_val;
 }
 
-err_t bli_check_hermitian_object( obj_t* a )
+err_t bli_check_hermitian_object( const obj_t* a )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -632,7 +632,7 @@ err_t bli_check_hermitian_object( obj_t* a )
 	return e_val;
 }
 
-err_t bli_check_symmetric_object( obj_t* a )
+err_t bli_check_symmetric_object( const obj_t* a )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -642,7 +642,7 @@ err_t bli_check_symmetric_object( obj_t* a )
 	return e_val;
 }
 
-err_t bli_check_triangular_object( obj_t* a )
+err_t bli_check_triangular_object( const obj_t* a )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -652,7 +652,7 @@ err_t bli_check_triangular_object( obj_t* a )
 	return e_val;
 }
 
-err_t bli_check_object_struc( obj_t* a, struc_t struc )
+err_t bli_check_object_struc( const obj_t* a, struc_t struc )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -666,7 +666,7 @@ err_t bli_check_object_struc( obj_t* a, struc_t struc )
 
 // -- Storage-related checks ---------------------------------------------------
 
-err_t bli_check_upper_or_lower_object( obj_t* a )
+err_t bli_check_upper_or_lower_object( const obj_t* a )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -731,7 +731,7 @@ err_t bli_check_valid_3x3_subpart( subpart_t part )
 
 // -- Control tree-related checks ----------------------------------------------
 
-err_t bli_check_valid_cntl( void* cntl )
+err_t bli_check_valid_cntl( const void* cntl )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -743,7 +743,7 @@ err_t bli_check_valid_cntl( void* cntl )
 
 // -- Packing-related checks ---------------------------------------------------
 
-err_t bli_check_packm_schema_on_unpack( obj_t* a )
+err_t bli_check_packm_schema_on_unpack( const obj_t* a )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -756,7 +756,7 @@ err_t bli_check_packm_schema_on_unpack( obj_t* a )
 	return e_val;
 }
 
-err_t bli_check_packv_schema_on_unpack( obj_t* a )
+err_t bli_check_packv_schema_on_unpack( const obj_t* a )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -768,7 +768,7 @@ err_t bli_check_packv_schema_on_unpack( obj_t* a )
 
 // -- Buffer-related checks ----------------------------------------------------
 
-err_t bli_check_object_buffer( obj_t* a )
+err_t bli_check_object_buffer( const obj_t* a )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -783,7 +783,7 @@ err_t bli_check_object_buffer( obj_t* a )
 
 // -- Memory checks ------------------------------------------------------------
 
-err_t bli_check_valid_malloc_buf( void* ptr )
+err_t bli_check_valid_malloc_buf( const void* ptr )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -809,7 +809,7 @@ err_t bli_check_valid_packbuf( packbuf_t buf_type )
 	return e_val;
 }
 
-err_t bli_check_if_exhausted_pool( pool_t* pool )
+err_t bli_check_if_exhausted_pool( const pool_t* pool )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -819,7 +819,7 @@ err_t bli_check_if_exhausted_pool( pool_t* pool )
 	return e_val;
 }
 
-err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx )
+err_t bli_check_sufficient_stack_buf_size( const cntx_t* cntx )
 {
 	err_t e_val = BLIS_SUCCESS;
 	num_t dt;
@@ -873,7 +873,7 @@ err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size )
 
 // -- Object-related errors ----------------------------------------------------
 
-err_t bli_check_object_alias_of( obj_t* a, obj_t* b )
+err_t bli_check_object_alias_of( const obj_t* a, const obj_t* b )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -895,7 +895,7 @@ err_t bli_check_valid_arch_id( arch_t id )
 	return e_val;
 }
 
-err_t bli_check_initialized_gks_cntx( cntx_t** cntx )
+err_t bli_check_initialized_gks_cntx( const cntx_t* const * cntx )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -907,7 +907,7 @@ err_t bli_check_initialized_gks_cntx( cntx_t** cntx )
 
 // -- Architecture-related errors ----------------------------------------------
 
-err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr )
+err_t bli_check_valid_mc_mod_mult( const blksz_t* mc, const blksz_t* mr )
 {
 	num_t dt;
 
@@ -924,7 +924,7 @@ err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr )
 	return BLIS_SUCCESS;
 }
 
-err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr )
+err_t bli_check_valid_nc_mod_mult( const blksz_t* nc, const blksz_t* nr )
 {
 	num_t dt;
 
@@ -941,7 +941,7 @@ err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr )
 	return BLIS_SUCCESS;
 }
 
-err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr )
+err_t bli_check_valid_kc_mod_mult( const blksz_t* kc, const blksz_t* kr )
 {
 	num_t dt;
 
diff --git a/frame/base/bli_check.h b/frame/base/bli_check.h
index 276d27689..f1e2201a7 100644
--- a/frame/base/bli_check.h
+++ b/frame/base/bli_check.h
@@ -34,85 +34,85 @@
 */
 
 
-BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line );
+BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, const char* file, guint_t line );
 
 err_t bli_check_valid_error_level( errlev_t level );
 
-err_t bli_check_null_pointer( void* ptr );
+err_t bli_check_null_pointer( const void* ptr );
 
 err_t bli_check_valid_side( side_t side );
 err_t bli_check_valid_uplo( uplo_t uplo );
 err_t bli_check_valid_trans( trans_t trans );
 err_t bli_check_valid_diag( diag_t diag );
-err_t bli_check_nonunit_diag( obj_t* a );
+err_t bli_check_nonunit_diag( const obj_t* a );
 
 err_t bli_check_valid_datatype( num_t dt );
-err_t bli_check_object_valid_datatype( obj_t* a );
+err_t bli_check_object_valid_datatype( const obj_t* a );
 err_t bli_check_noninteger_datatype( num_t dt );
-err_t bli_check_noninteger_object( obj_t* a );
+err_t bli_check_noninteger_object( const obj_t* a );
 err_t bli_check_nonconstant_datatype( num_t dt );
-err_t bli_check_nonconstant_object( obj_t* a );
+err_t bli_check_nonconstant_object( const obj_t* a );
 err_t bli_check_floating_datatype( num_t dt );
-err_t bli_check_floating_object( obj_t* a );
+err_t bli_check_floating_object( const obj_t* a );
 err_t bli_check_real_datatype( num_t dt );
-err_t bli_check_real_object( obj_t* a );
+err_t bli_check_real_object( const obj_t* a );
 err_t bli_check_integer_datatype( num_t dt );
-err_t bli_check_integer_object( obj_t* a );
+err_t bli_check_integer_object( const obj_t* a );
 err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b );
-err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b );
+err_t bli_check_consistent_object_datatypes( const obj_t* a, const obj_t* b );
 err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r );
-err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r );
-err_t bli_check_real_valued_object( obj_t* a );
+err_t bli_check_object_real_proj_of( const obj_t* c, const obj_t* r );
+err_t bli_check_real_valued_object( const obj_t* a );
 err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b );
-err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b );
-
-err_t bli_check_conformal_dims( obj_t* a, obj_t* b );
-err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c );
-err_t bli_check_scalar_object( obj_t* a );
-err_t bli_check_vector_object( obj_t* a );
-err_t bli_check_matrix_object( obj_t* a );
-err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y );
-err_t bli_check_square_object( obj_t* a );
-err_t bli_check_object_length_equals( obj_t* a, dim_t m );
-err_t bli_check_object_width_equals( obj_t* a, dim_t n );
-err_t bli_check_vector_dim_equals( obj_t* a, dim_t n );
-err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset );
+err_t bli_check_consistent_object_precisions( const obj_t* a, const obj_t* b );
+
+err_t bli_check_conformal_dims( const obj_t* a, const obj_t* b );
+err_t bli_check_level3_dims( const obj_t* a, const obj_t* b, const obj_t* c );
+err_t bli_check_scalar_object( const obj_t* a );
+err_t bli_check_vector_object( const obj_t* a );
+err_t bli_check_matrix_object( const obj_t* a );
+err_t bli_check_equal_vector_lengths( const obj_t* x, const obj_t* y );
+err_t bli_check_square_object( const obj_t* a );
+err_t bli_check_object_length_equals( const obj_t* a, dim_t m );
+err_t bli_check_object_width_equals( const obj_t* a, dim_t n );
+err_t bli_check_vector_dim_equals( const obj_t* a, dim_t n );
+err_t bli_check_object_diag_offset_equals( const obj_t* a, doff_t offset );
 
 err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is );
 
-err_t bli_check_general_object( obj_t* a );
-err_t bli_check_hermitian_object( obj_t* a );
-err_t bli_check_symmetric_object( obj_t* a );
-err_t bli_check_triangular_object( obj_t* a );
-err_t bli_check_object_struc( obj_t* a, struc_t struc );
+err_t bli_check_general_object( const obj_t* a );
+err_t bli_check_hermitian_object( const obj_t* a );
+err_t bli_check_symmetric_object( const obj_t* a );
+err_t bli_check_triangular_object( const obj_t* a );
+err_t bli_check_object_struc( const obj_t* a, struc_t struc );
 
-err_t bli_check_upper_or_lower_object( obj_t* a );
+err_t bli_check_upper_or_lower_object( const obj_t* a );
 
 err_t bli_check_valid_3x1_subpart( subpart_t part );
 err_t bli_check_valid_1x3_subpart( subpart_t part );
 err_t bli_check_valid_3x3_subpart( subpart_t part );
 
-err_t bli_check_valid_cntl( void* cntl );
+err_t bli_check_valid_cntl( const void* cntl );
 
-err_t bli_check_packm_schema_on_unpack( obj_t* a );
-err_t bli_check_packv_schema_on_unpack( obj_t* a );
+err_t bli_check_packm_schema_on_unpack( const obj_t* a );
+err_t bli_check_packv_schema_on_unpack( const obj_t* a );
 
-err_t bli_check_object_buffer( obj_t* a );
+err_t bli_check_object_buffer( const obj_t* a );
 
-err_t bli_check_valid_malloc_buf( void* ptr );
+err_t bli_check_valid_malloc_buf( const void* ptr );
 
 err_t bli_check_valid_packbuf( packbuf_t buf_type );
-err_t bli_check_if_exhausted_pool( pool_t* pool );
-err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx );
+err_t bli_check_if_exhausted_pool( const pool_t* pool );
+err_t bli_check_sufficient_stack_buf_size( const cntx_t* cntx );
 err_t bli_check_alignment_is_power_of_two( size_t align_size );
 err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size );
 
-err_t bli_check_object_alias_of( obj_t* a, obj_t* b );
+err_t bli_check_object_alias_of( const obj_t* a, const obj_t* b );
 
 err_t bli_check_valid_arch_id( arch_t id );
-err_t bli_check_initialized_gks_cntx( cntx_t** cntx );
+err_t bli_check_initialized_gks_cntx( const cntx_t* const * cntx );
 
-err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr );
-err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr );
-err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr );
+err_t bli_check_valid_mc_mod_mult( const blksz_t* mc, const blksz_t* mr );
+err_t bli_check_valid_nc_mod_mult( const blksz_t* nc, const blksz_t* nr );
+err_t bli_check_valid_kc_mod_mult( const blksz_t* kc, const blksz_t* kr );
 
diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c
index f8846198f..b22ddbee0 100644
--- a/frame/base/bli_cntl.c
+++ b/frame/base/bli_cntl.c
@@ -349,8 +349,8 @@ void bli_cntl_mark_family
 
 dim_t bli_cntl_calc_num_threads_in
      (
-       rntm_t* rntm,
-       cntl_t* cntl
+       const rntm_t* rntm,
+       const cntl_t* cntl
      )
 {
 	dim_t n_threads_in = 1;
diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h
index 67dd02f0c..406a350ee 100644
--- a/frame/base/bli_cntl.h
+++ b/frame/base/bli_cntl.h
@@ -119,45 +119,45 @@ BLIS_EXPORT_BLIS void bli_cntl_mark_family
 
 dim_t bli_cntl_calc_num_threads_in
      (
-       rntm_t* rntm,
-       cntl_t* cntl
+       const rntm_t* rntm,
+       const cntl_t* cntl
      );
 
 // -----------------------------------------------------------------------------
 
 // cntl_t query (fields only)
 
-BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl )
+BLIS_INLINE opid_t bli_cntl_family( const cntl_t* cntl )
 {
 	return cntl->family;
 }
 
-BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl )
+BLIS_INLINE bszid_t bli_cntl_bszid( const cntl_t* cntl )
 {
 	return cntl->bszid;
 }
 
-BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl )
+BLIS_INLINE void_fp bli_cntl_var_func( const cntl_t* cntl )
 {
 	return cntl->var_func;
 }
 
-BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl )
+BLIS_INLINE cntl_t* bli_cntl_sub_prenode( const cntl_t* cntl )
 {
 	return cntl->sub_prenode;
 }
 
-BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl )
+BLIS_INLINE cntl_t* bli_cntl_sub_node( const cntl_t* cntl )
 {
 	return cntl->sub_node;
 }
 
-BLIS_INLINE void* bli_cntl_params( cntl_t* cntl )
+BLIS_INLINE void* bli_cntl_params( const cntl_t* cntl )
 {
 	return cntl->params;
 }
 
-BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl )
+BLIS_INLINE uint64_t bli_cntl_params_size( const cntl_t* cntl )
 {
 	// The first 64 bytes is always the size of the params structure.
 	return *( ( uint64_t* )(cntl->params) );
@@ -170,19 +170,19 @@ BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl )
 
 // cntl_t query (complex)
 
-BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl )
+BLIS_INLINE bool bli_cntl_is_null( const cntl_t* cntl )
 {
 	return ( bool )
 	       ( cntl == NULL );
 }
 
-BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl )
+BLIS_INLINE bool bli_cntl_is_leaf( const cntl_t* cntl )
 {
 	return ( bool )
 	       ( bli_cntl_sub_node( cntl ) == NULL );
 }
 
-BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl )
+BLIS_INLINE bool bli_cntl_does_part( const cntl_t* cntl )
 {
 	return ( bool )
 	       ( bli_cntl_bszid( cntl ) != BLIS_NO_PART );
diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c
index 218325d5a..70057060f 100644
--- a/frame/base/bli_cntx.c
+++ b/frame/base/bli_cntx.c
@@ -70,8 +70,8 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... )
 	// Query the context for the addresses of:
 	// - the blocksize object array
 	// - the blocksize multiple array
-	blksz_t* cntx_blkszs = bli_cntx_blkszs_buf( cntx );
-	bszid_t* cntx_bmults = bli_cntx_bmults_buf( cntx );
+	blksz_t* cntx_blkszs = cntx->blkszs;
+	bszid_t* cntx_bmults = cntx->bmults;
 
 	// Initialize variable argument environment.
 	va_list args;
@@ -165,7 +165,7 @@ void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* cntx, ... )
 		// Query the context for the blksz_t object assoicated with the
 		// current blocksize id, and also query the object corresponding
 		// to the blocksize multiple.
-		blksz_t* cntx_blksz = bli_cntx_get_blksz( bs_id, cntx );
+		blksz_t* cntx_blksz = ( blksz_t* )bli_cntx_get_blksz( bs_id, cntx );
 
 		// Copy the real domain value of the blksz_t object into the
 		// corresponding complex domain slot of the same object.
@@ -218,7 +218,7 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... )
 	*/
 
 	// Query the context for the address of the ukernel func_t array
-	func_t*  cntx_ukrs = bli_cntx_ukrs_buf( cntx );
+	func_t*  cntx_ukrs = cntx->ukrs;
 
 	// Initialize variable argument environment.
 	va_list   args;
@@ -262,7 +262,7 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... )
 			case BLIS_GEMMTRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_U_VIR_UKR ]; break;
 			case BLIS_TRSM_L_UKR:     ukrs = &cntx_ukrs[ BLIS_TRSM_L_VIR_UKR ]; break;
 			case BLIS_TRSM_U_UKR:     ukrs = &cntx_ukrs[ BLIS_TRSM_U_VIR_UKR ]; break;
-		    default:                  ukrs = NULL; break;
+			default:                  ukrs = NULL; break;
 		};
 
 		if ( ukrs )
@@ -297,7 +297,7 @@ void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... )
 	*/
 
 	// Query the context for the address of the ukernel preference mbool_t array
-	mbool_t* cntx_ukr_prefs = bli_cntx_ukr_prefs_buf( cntx );
+	mbool_t* cntx_ukr_prefs = cntx->ukr_prefs;
 
 	// Initialize variable argument environment.
 	va_list   args;
@@ -355,7 +355,7 @@ void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... )
 	*/
 
 	// Query the context for the address of the l3 sup handlers array.
-	void_fp* cntx_l3_sup_handlers = bli_cntx_l3_sup_handlers_buf( cntx );
+	void_fp* cntx_l3_sup_handlers = cntx->l3_sup_handlers;
 
 	// Initialize variable argument environment.
 	va_list   args;
@@ -386,7 +386,7 @@ void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... )
 
 // -----------------------------------------------------------------------------
 
-void bli_cntx_print( cntx_t* cntx )
+void bli_cntx_print( const cntx_t* cntx )
 {
 	dim_t i;
 
@@ -410,7 +410,7 @@ void bli_cntx_print( cntx_t* cntx )
 
 	for ( i = 0; i < BLIS_NUM_UKRS; ++i )
 	{
-		func_t* ukr = bli_cntx_get_ukrs( i, cntx );
+		const func_t* ukr = bli_cntx_get_ukrs( i, cntx );
 
 		printf( "ukr %2lu:  %16p %16p %16p %16p\n",
 		        ( unsigned long )i,
@@ -423,7 +423,7 @@ void bli_cntx_print( cntx_t* cntx )
 
 	for ( i = 0; i < BLIS_NUM_UKR_PREFS; ++i )
 	{
-		mbool_t* ukr_pref = bli_cntx_get_ukr_prefs( i, cntx );
+		const mbool_t* ukr_pref = bli_cntx_get_ukr_prefs( i, cntx );
 
 		printf( "ukr pref %2lu:  %d %d %d %d\n",
 		        ( unsigned long )i,
diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h
index 412430e9b..827b19cfd 100644
--- a/frame/base/bli_cntx.h
+++ b/frame/base/bli_cntx.h
@@ -62,27 +62,7 @@ typedef struct cntx_s
 // -- cntx_t query (fields only) -----------------------------------------------
 //
 
-BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx )
-{
-	return cntx->blkszs;
-}
-BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx )
-{
-	return cntx->bmults;
-}
-BLIS_INLINE func_t* bli_cntx_ukrs_buf( cntx_t* cntx )
-{
-	return cntx->ukrs;
-}
-BLIS_INLINE mbool_t* bli_cntx_ukr_prefs_buf( cntx_t* cntx )
-{
-	return cntx->ukr_prefs;
-}
-BLIS_INLINE void_fp* bli_cntx_l3_sup_handlers_buf( cntx_t* cntx )
-{
-	return cntx->l3_sup_handlers;
-}
-BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx )
+BLIS_INLINE ind_t bli_cntx_method( const cntx_t* cntx )
 {
 	return cntx->method;
 }
@@ -104,75 +84,66 @@ BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx )
 // -- cntx_t query (complex) ---------------------------------------------------
 //
 
-BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx )
+BLIS_INLINE const blksz_t* bli_cntx_get_blksz( bszid_t bs_id, const cntx_t* cntx )
 {
-	blksz_t* blkszs = bli_cntx_blkszs_buf( cntx );
-	blksz_t* blksz  = &blkszs[ bs_id ];
-
 	// Return the address of the blksz_t identified by bs_id.
-	return blksz;
+	return &cntx->blkszs[ bs_id ];
 }
 
-BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
+BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx )
 {
-	blksz_t* blksz  = bli_cntx_get_blksz( bs_id, cntx );
-	dim_t    bs_dt  = bli_blksz_get_def( dt, blksz );
+	const blksz_t* blksz  = bli_cntx_get_blksz( bs_id, cntx );
+	dim_t          bs_dt  = bli_blksz_get_def( dt, blksz );
 
 	// Return the main (default) blocksize value for the datatype given.
 	return bs_dt;
 }
 
-BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
+BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx )
 {
-	blksz_t* blksz  = bli_cntx_get_blksz( bs_id, cntx );
-	dim_t    bs_dt  = bli_blksz_get_max( dt, blksz );
+	const blksz_t* blksz  = bli_cntx_get_blksz( bs_id, cntx );
+	dim_t          bs_dt  = bli_blksz_get_max( dt, blksz );
 
 	// Return the auxiliary (maximum) blocksize value for the datatype given.
 	return bs_dt;
 }
 
-BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx )
+BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, const cntx_t* cntx )
 {
-	bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx );
-	bszid_t           bm_id  = bmults[ bs_id ];
-
-	return bm_id;
+	return cntx->bmults[ bs_id ];
 }
 
-BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx )
+BLIS_INLINE const blksz_t* bli_cntx_get_bmult( bszid_t bs_id, const cntx_t* cntx )
 {
-	bszid_t           bm_id  = bli_cntx_get_bmult_id( bs_id, cntx );
-	blksz_t* restrict bmult  = bli_cntx_get_blksz( bm_id, cntx );
+	bszid_t        bm_id  = bli_cntx_get_bmult_id( bs_id, cntx );
+	const blksz_t* bmult  = bli_cntx_get_blksz( bm_id, cntx );
 
 	return bmult;
 }
 
-BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
+BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx )
 {
-	blksz_t* bmult  = bli_cntx_get_bmult( bs_id, cntx );
-	dim_t    bm_dt  = bli_blksz_get_def( dt, bmult );
+	const blksz_t* bmult  = bli_cntx_get_bmult( bs_id, cntx );
+	dim_t          bm_dt  = bli_blksz_get_def( dt, bmult );
 
 	return bm_dt;
 }
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx )
+BLIS_INLINE const func_t* bli_cntx_get_ukrs( ukr_t ukr_id, const cntx_t* cntx )
 {
-	func_t* funcs = bli_cntx_ukrs_buf( cntx );
-	func_t* func  = &funcs[ ukr_id ];
-
-	return func;
+	return &cntx->ukrs[ ukr_id ];
 }
 
-BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx )
+BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx )
 {
-	func_t* func = bli_cntx_get_ukrs( ukr_id, cntx );
+	const func_t* func = bli_cntx_get_ukrs( ukr_id, cntx );
 
 	return bli_func_get_dt( dt, func );
 }
 
-BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx )
+BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx )
 {
 	switch ( ukr_id )
 	{
@@ -189,24 +160,21 @@ BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t*
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE mbool_t* bli_cntx_get_ukr_prefs( ukr_pref_t ukr_id, cntx_t* cntx )
+BLIS_INLINE const mbool_t* bli_cntx_get_ukr_prefs( ukr_pref_t pref_id, const cntx_t* cntx )
 {
-	mbool_t* mbools = bli_cntx_ukr_prefs_buf( cntx );
-	mbool_t* mbool  = &mbools[ ukr_id ];
-
-	return mbool;
+	return &cntx->ukr_prefs[ pref_id ];
 }
 
-BLIS_INLINE bool bli_cntx_get_ukr_prefs_dt( num_t dt, ukr_pref_t ukr_id, cntx_t* cntx )
+BLIS_INLINE bool bli_cntx_get_ukr_prefs_dt( num_t dt, ukr_pref_t ukr_id, const cntx_t* cntx )
 {
-	mbool_t* mbool = bli_cntx_get_ukr_prefs( ukr_id, cntx );
+	const mbool_t* mbool = bli_cntx_get_ukr_prefs( ukr_id, cntx );
 
 	return ( bool )bli_mbool_get_dt( dt, mbool );
 }
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx )
+BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, const cntx_t* cntx )
 {
 	if ( m < bli_cntx_get_blksz_def_dt( dt, BLIS_MT, cntx ) ) return TRUE;
 	if ( n < bli_cntx_get_blksz_def_dt( dt, BLIS_NT, cntx ) ) return TRUE;
@@ -217,17 +185,14 @@ BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE void_fp bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx )
+BLIS_INLINE void_fp bli_cntx_get_l3_sup_handler( opid_t op, const cntx_t* cntx )
 {
-	void_fp* funcs = bli_cntx_l3_sup_handlers_buf( cntx );
-	void_fp  func  = funcs[ op ];
-
-	return func;
+	return cntx->l3_sup_handlers[ op ];
 }
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE bool bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx )
+BLIS_INLINE bool bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx )
 {
 	// This initial value will get overwritten during the switch statement below.
 	ukr_pref_t ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF;
@@ -275,12 +240,12 @@ BLIS_INLINE bool bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, cntx_t* c
 	return bli_cntx_get_ukr_prefs_dt( dt, ukr_pref_id, cntx );
 }
 
-BLIS_INLINE bool bli_cntx_ukr_prefers_cols_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx )
+BLIS_INLINE bool bli_cntx_ukr_prefers_cols_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx )
 {
 	return ! bli_cntx_ukr_prefers_rows_dt( dt, ukr_id, cntx );
 }
 
-BLIS_INLINE bool bli_cntx_prefers_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* cntx )
+BLIS_INLINE bool bli_cntx_prefers_storage_of( const obj_t* obj, ukr_t ukr_id, const cntx_t* cntx )
 {
 	const bool ukr_prefers_rows
 		= bli_cntx_ukr_prefers_rows_dt( bli_obj_dt( obj ), ukr_id, cntx );
@@ -291,7 +256,7 @@ BLIS_INLINE bool bli_cntx_prefers_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t*
 	return FALSE;
 }
 
-BLIS_INLINE bool bli_cntx_dislikes_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* cntx )
+BLIS_INLINE bool bli_cntx_dislikes_storage_of( const obj_t* obj, ukr_t ukr_id, const cntx_t* cntx )
 {
 	return ! bli_cntx_prefers_storage_of( obj, ukr_id, cntx );
 }
@@ -307,58 +272,43 @@ BLIS_INLINE bool bli_cntx_dislikes_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t*
 
 BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx )
 {
-	blksz_t* blkszs = bli_cntx_blkszs_buf( cntx );
-	bszid_t* bmults = bli_cntx_bmults_buf( cntx );
-
-	blkszs[ bs_id ] = *blksz;
-	bmults[ bs_id ] = mult_id;
+	cntx->blkszs[ bs_id ] = *blksz;
+	cntx->bmults[ bs_id ] = mult_id;
 }
 
 BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx )
 {
-	blksz_t* blkszs = bli_cntx_blkszs_buf( cntx );
-	blksz_t* blksz  = &blkszs[ bs_id ];
-
-	bli_blksz_set_def( bs, dt, blksz );
+	bli_blksz_set_def( bs, dt, &cntx->blkszs[ bs_id ] );
 }
 
 BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx )
 {
-	blksz_t* blkszs = bli_cntx_blkszs_buf( cntx );
-	blksz_t* blksz  = &blkszs[ bs_id ];
-
-	bli_blksz_set_max( bs, dt, blksz );
+	bli_blksz_set_max( bs, dt, &cntx->blkszs[ bs_id ]);
 }
 
-BLIS_INLINE void bli_cntx_set_ukr( ukr_t ukr_id, func_t* func, cntx_t* cntx )
+BLIS_INLINE void bli_cntx_set_ukr( ukr_t ukr_id, const func_t* func, cntx_t* cntx )
 {
-	func_t* funcs = bli_cntx_ukrs_buf( cntx );
-
-	funcs[ ukr_id ] = *func;
+	cntx->ukrs[ ukr_id ] = *func;
 }
 
 BLIS_INLINE void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, ukr_t ker_id, cntx_t* cntx )
 {
-	func_t* func = bli_cntx_get_ukrs( ker_id, cntx );
-
-	bli_func_set_dt( fp, dt, func );
+	bli_func_set_dt( fp, dt, &cntx->ukrs[ ker_id ] );
 }
 
 BLIS_INLINE void bli_cntx_set_ukr_pref( ukr_pref_t ukr_id, mbool_t* prefs, cntx_t* cntx )
 {
-	mbool_t* mbools = bli_cntx_ukr_prefs_buf( cntx );
-
-	mbools[ ukr_id ] = *prefs;
+	cntx->ukr_prefs[ ukr_id ] = *prefs;
 }
 
-BLIS_INLINE void_fp bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
+BLIS_INLINE void_fp bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, const cntx_t* cntx )
 {
 	ukr_t ukr_id = bli_stor3_ukr( stor_id );
 
 	return bli_cntx_get_ukr_dt( dt, ukr_id, cntx );
 }
 
-BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
+BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx )
 {
 	switch ( bs_id )
 	{
@@ -374,7 +324,7 @@ BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cnt
 	return bli_cntx_get_blksz_def_dt( dt, bs_id, cntx );
 }
 
-BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
+BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx )
 {
 	switch ( bs_id )
 	{
@@ -403,7 +353,9 @@ BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* c
 BLIS_EXPORT_BLIS void bli_cntx_set_ukrs( cntx_t* cntx, ... );
 BLIS_EXPORT_BLIS void bli_cntx_set_ukr_prefs( cntx_t* cntx, ... );
 
-BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx );
+BLIS_EXPORT_BLIS void bli_cntx_print( const cntx_t* cntx );
+
+BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... );
 
 BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... );
 
diff --git a/frame/base/bli_const.c b/frame/base/bli_const.c
index f20bc8447..210d6ae77 100644
--- a/frame/base/bli_const.c
+++ b/frame/base/bli_const.c
@@ -44,11 +44,11 @@ static constdata_t bli_mtwo_buffer = bli_obj_init_constdata( -2.0 );
 
 // Statically initialize global scalar constants, attaching the addresses
 // of the corresponding structs above.
-obj_t BLIS_TWO       = bli_obj_init_const( &bli_two_buffer );
-obj_t BLIS_ONE       = bli_obj_init_const( &bli_one_buffer );
-obj_t BLIS_ZERO      = bli_obj_init_const( &bli_zero_buffer );
-obj_t BLIS_MINUS_ONE = bli_obj_init_const( &bli_mone_buffer );
-obj_t BLIS_MINUS_TWO = bli_obj_init_const( &bli_mtwo_buffer );
+const obj_t BLIS_TWO       = bli_obj_init_const( &bli_two_buffer );
+const obj_t BLIS_ONE       = bli_obj_init_const( &bli_one_buffer );
+const obj_t BLIS_ZERO      = bli_obj_init_const( &bli_zero_buffer );
+const obj_t BLIS_MINUS_ONE = bli_obj_init_const( &bli_mone_buffer );
+const obj_t BLIS_MINUS_TWO = bli_obj_init_const( &bli_mtwo_buffer );
 
 #if 0
 obj_t BLIS_TWO = {};
diff --git a/frame/base/bli_env.c b/frame/base/bli_env.c
index 92aba6970..0972f1771 100644
--- a/frame/base/bli_env.c
+++ b/frame/base/bli_env.c
@@ -67,8 +67,8 @@
 
 gint_t bli_env_get_var( const char* env, gint_t fallback )
 {
-	gint_t r_val;
-	char*  str;
+	gint_t      r_val;
+	const char* str;
 
 	// Query the environment variable and store the result in str.
 	str = getenv( env );
diff --git a/frame/base/bli_error.c b/frame/base/bli_error.c
index 37add3b67..f4933d962 100644
--- a/frame/base/bli_error.c
+++ b/frame/base/bli_error.c
@@ -36,7 +36,7 @@
 #include "blis.h"
 
 // Internal array to hold error strings.
-static char *bli_error_string[-BLIS_ERROR_CODE_MAX] =
+static const char *bli_error_string[-BLIS_ERROR_CODE_MAX] =
 {
 	[-BLIS_INVALID_ERROR_CHECKING_LEVEL]         = "Invalid error checking level.",
 	[-BLIS_UNDEFINED_ERROR_CODE]                 = "Undefined error code.",
@@ -116,7 +116,7 @@ static char *bli_error_string[-BLIS_ERROR_CODE_MAX] =
 
 // -----------------------------------------------------------------------------
 
-void bli_print_msg( char* str, char* file, guint_t line )
+void bli_print_msg( const char* str, const char* file, guint_t line )
 {
 	fprintf( stderr, "\n" );
 	fprintf( stderr, "libblis: %s (line %lu):\n", file, ( long unsigned int )line );
@@ -156,7 +156,7 @@ bool bli_error_checking_is_enabled( void )
 	return bli_error_checking_level() != BLIS_NO_ERROR_CHECKING;
 }
 
-char* bli_error_string_for_code( gint_t code )
+const char* bli_error_string_for_code( gint_t code )
 {
 	return bli_error_string[-code];
 }
diff --git a/frame/base/bli_error.h b/frame/base/bli_error.h
index e6e6f35dd..f3037e2c2 100644
--- a/frame/base/bli_error.h
+++ b/frame/base/bli_error.h
@@ -39,8 +39,8 @@ BLIS_EXPORT_BLIS void     bli_error_checking_level_set( errlev_t new_level );
 
 BLIS_EXPORT_BLIS bool     bli_error_checking_is_enabled( void );
 
-void                      bli_print_msg( char* str, char* file, guint_t line );
+void                      bli_print_msg( const char* str, const char* file, guint_t line );
 BLIS_EXPORT_BLIS void     bli_abort( void );
 
-char*                     bli_error_string_for_code( gint_t code );
+const char*               bli_error_string_for_code( gint_t code );
 
diff --git a/frame/base/bli_func.c b/frame/base/bli_func.c
index 477710ff0..7cb7aac6d 100644
--- a/frame/base/bli_func.c
+++ b/frame/base/bli_func.c
@@ -93,13 +93,13 @@ void bli_func_free( func_t* f )
 
 // -----------------------------------------------------------------------------
 
-bool bli_func_is_null_dt( num_t   dt,
-                          func_t* f )
+bool bli_func_is_null_dt(       num_t   dt,
+                          const func_t* f )
 {
 	return ( bli_func_get_dt( dt, f ) == NULL );
 }
 
-bool bli_func_is_null( func_t* f )
+bool bli_func_is_null( const func_t* f )
 {
 	bool  r_val = TRUE;
 	num_t dt;
diff --git a/frame/base/bli_func.h b/frame/base/bli_func.h
index 7bdd1ab10..cf89df389 100644
--- a/frame/base/bli_func.h
+++ b/frame/base/bli_func.h
@@ -38,11 +38,11 @@
 
 BLIS_INLINE void_fp bli_func_get_dt
      (
-       num_t   dt,
-       func_t* func
+             num_t   dt,
+       const func_t* func
      )
 {
-    return func->ptr[ dt ];
+	return func->ptr[ dt ];
 }
 
 // func_t modification
@@ -54,13 +54,13 @@ BLIS_INLINE void bli_func_set_dt
        func_t* func
      )
 {
-    func->ptr[ dt ] = fp;
+	func->ptr[ dt ] = fp;
 }
 
 BLIS_INLINE void bli_func_copy_dt
      (
-       num_t dt_src, func_t* func_src,
-       num_t dt_dst, func_t* func_dst
+       num_t dt_src, const func_t* func_src,
+       num_t dt_dst,       func_t* func_dst
      )
 {
 	void_fp fp = bli_func_get_dt( dt_src, func_src );
@@ -96,7 +96,7 @@ void bli_func_free( func_t* f );
 
 // -----------------------------------------------------------------------------
 
-bool bli_func_is_null_dt( num_t   dt,
-                          func_t* f );
-bool bli_func_is_null( func_t* f );
+bool bli_func_is_null_dt(       num_t   dt,
+                          const func_t* f );
+bool bli_func_is_null( const func_t* f );
 
diff --git a/frame/base/bli_getopt.c b/frame/base/bli_getopt.c
index 184439db5..e1d90d323 100644
--- a/frame/base/bli_getopt.c
+++ b/frame/base/bli_getopt.c
@@ -45,12 +45,12 @@ void bli_getopt_init_state( int opterr, getopt_t* state )
 	state->optopt = 0;
 }
 
-int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state )
+int bli_getopt( int argc, const char* const * argv, const char* optstring, getopt_t* state )
 {
-	static char* nextchar = NULL;
+	static const char* nextchar = NULL;
 
-	char*        elem_str;
-	char*        optstr_char;
+	const char* elem_str;
+	const char* optstr_char;
 
 	// If argv contains no more arguments to process, return.
 	if ( state->optind == argc ) return -1;
diff --git a/frame/base/bli_getopt.h b/frame/base/bli_getopt.h
index 1b5a7a002..bb0e4f2cf 100644
--- a/frame/base/bli_getopt.h
+++ b/frame/base/bli_getopt.h
@@ -34,13 +34,13 @@
 
 typedef struct getopt_s
 {
-	char* optarg;
-	int   optind;
-	int   opterr;
-	int   optopt;
+	const char* optarg;
+	      int   optind;
+	      int   opterr;
+	      int   optopt;
 } getopt_t;
 
 BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state );
 
-BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state );
+BLIS_EXPORT_BLIS int bli_getopt( int argc, const char* const * argv, const char* optstring, getopt_t* state );
 
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index 1372a055a..4a7ccbbc3 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -226,7 +226,7 @@ void bli_gks_finalize( void )
 		// Iterate over the architectures in the gks array.
 		for ( id = 0; id < BLIS_NUM_ARCHS; ++id )
 		{
-			cntx_t** restrict gks_id = gks[ id ];
+			cntx_t** gks_id = gks[ id ];
 
 			// Only consider context arrays for architectures that were allocated
 			// in the first place.
@@ -236,7 +236,7 @@ void bli_gks_finalize( void )
 				// referenced by cntx_pp.
 				for ( ind = 0; ind < BLIS_NUM_IND_METHODS; ++ind )
 				{
-					cntx_t* restrict gks_id_ind = gks_id[ ind ];
+					cntx_t* gks_id_ind = gks_id[ ind ];
 
 					// If the current context was allocated, free it.
 					if ( gks_id_ind != NULL )
@@ -282,7 +282,7 @@ void bli_gks_init_index( void )
 
 // -----------------------------------------------------------------------------
 
-cntx_t* bli_gks_lookup_nat_cntx
+const cntx_t* bli_gks_lookup_nat_cntx
      (
        arch_t id
      )
@@ -295,7 +295,7 @@ cntx_t* bli_gks_lookup_nat_cntx
 
 // -----------------------------------------------------------------------------
 
-cntx_t* bli_gks_lookup_ind_cntx
+const cntx_t* bli_gks_lookup_ind_cntx
      (
        arch_t id,
        ind_t  ind
@@ -316,8 +316,8 @@ cntx_t* bli_gks_lookup_ind_cntx
 
 	// Index into the array of context pointers for the given architecture id,
 	// and then index into the subarray for the given induced method.
-	cntx_t** restrict gks_id     = gks[ id ];
-	cntx_t*  restrict gks_id_ind = gks_id[ ind ];
+	cntx_t** gks_id     = gks[ id ];
+	cntx_t*  gks_id_ind = gks_id[ ind ];
 
 	// Return the context pointer at gks_id_ind.
 	return gks_id_ind;
@@ -325,7 +325,7 @@ cntx_t* bli_gks_lookup_ind_cntx
 
 // -----------------------------------------------------------------------------
 
-cntx_t** bli_gks_lookup_id
+const cntx_t* const * bli_gks_lookup_id
      (
        arch_t id
      )
@@ -336,10 +336,10 @@ cntx_t** bli_gks_lookup_id
 	// initialized.
 
 	// Index into the array of context pointers for the given architecture id.
-	cntx_t** restrict gks_id = gks[ id ];
+	cntx_t** gks_id = gks[ id ];
 
 	// Return the context pointer at gks_id_ind.
-	return gks_id;
+	return ( const cntx_t* const * )gks_id;
 }
 
 // -----------------------------------------------------------------------------
@@ -405,7 +405,7 @@ void bli_gks_register_cntx
 	gks[ id ] = bli_calloc_intl( sizeof( cntx_t* ) * BLIS_NUM_IND_METHODS, &r_val );
 
 	// Alias the allocated array for readability.
-	cntx_t** restrict gks_id = gks[ id ];
+	cntx_t** gks_id = gks[ id ];
 
 	#ifdef BLIS_ENABLE_MEM_TRACING
 	printf( "bli_gks_register_cntx(): " );
@@ -417,7 +417,7 @@ void bli_gks_register_cntx
 	gks_id[ BLIS_NAT ] = bli_calloc_intl( sizeof( cntx_t ), &r_val );
 
 	// Alias the allocated context address for readability.
-	cntx_t* restrict gks_id_nat = gks_id[ BLIS_NAT ];
+	cntx_t* gks_id_nat = gks_id[ BLIS_NAT ];
 
 	// Call the context initialization function on the element of the newly
 	// allocated array corresponding to native execution.
@@ -440,12 +440,12 @@ void bli_gks_register_cntx
 	// kernel is called.
 	err_t e_val;
 
-	blksz_t* restrict mc = bli_cntx_get_blksz( BLIS_MC, gks_id_nat );
-	blksz_t* restrict nc = bli_cntx_get_blksz( BLIS_NC, gks_id_nat );
-	blksz_t* restrict kc = bli_cntx_get_blksz( BLIS_KC, gks_id_nat );
-	blksz_t* restrict mr = bli_cntx_get_blksz( BLIS_MR, gks_id_nat );
-	blksz_t* restrict nr = bli_cntx_get_blksz( BLIS_NR, gks_id_nat );
-	blksz_t* restrict kr = bli_cntx_get_blksz( BLIS_KR, gks_id_nat );
+	const blksz_t* mc = bli_cntx_get_blksz( BLIS_MC, gks_id_nat );
+	const blksz_t* nc = bli_cntx_get_blksz( BLIS_NC, gks_id_nat );
+	const blksz_t* kc = bli_cntx_get_blksz( BLIS_KC, gks_id_nat );
+	const blksz_t* mr = bli_cntx_get_blksz( BLIS_MR, gks_id_nat );
+	const blksz_t* nr = bli_cntx_get_blksz( BLIS_NR, gks_id_nat );
+	const blksz_t* kr = bli_cntx_get_blksz( BLIS_KR, gks_id_nat );
 
 	e_val = bli_check_valid_mc_mod_mult( mc, mr ); bli_check_error_code( e_val );
 	e_val = bli_check_valid_nc_mod_mult( nc, nr ); bli_check_error_code( e_val );
@@ -463,12 +463,12 @@ void bli_gks_register_cntx
 
 // -----------------------------------------------------------------------------
 
-cntx_t* bli_gks_query_cntx( void )
+const cntx_t* bli_gks_query_cntx( void )
 {
 	return bli_gks_query_nat_cntx();
 }
 
-cntx_t* bli_gks_query_nat_cntx( void )
+const cntx_t* bli_gks_query_nat_cntx( void )
 {
 	bli_init_once();
 
@@ -480,14 +480,14 @@ cntx_t* bli_gks_query_nat_cntx( void )
 	arch_t id = bli_arch_query_id();
 
 	// Use the architecture id to look up a pointer to its context.
-	cntx_t* cntx = bli_gks_lookup_nat_cntx( id );
+	const cntx_t* cntx = bli_gks_lookup_nat_cntx( id );
 
 	return cntx;
 }
 
 // -----------------------------------------------------------------------------
 
-cntx_t* bli_gks_query_cntx_noinit( void )
+const cntx_t* bli_gks_query_cntx_noinit( void )
 {
 	// This function is identical to bli_gks_query_cntx(), except that it
 	// does not call bli_init_once().
@@ -496,7 +496,7 @@ cntx_t* bli_gks_query_cntx_noinit( void )
 	arch_t id = bli_arch_query_id();
 
 	// Use the architecture id to look up a pointer to its context.
-	cntx_t* cntx = bli_gks_lookup_nat_cntx( id );
+	const cntx_t* cntx = bli_gks_lookup_nat_cntx( id );
 
 	return cntx;
 }
@@ -507,7 +507,7 @@ cntx_t* bli_gks_query_cntx_noinit( void )
 // with a new entry corresponding to a context for an ind_t value.
 static bli_pthread_mutex_t gks_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
 
-cntx_t* bli_gks_query_ind_cntx
+const cntx_t* bli_gks_query_ind_cntx
      (
        ind_t ind,
        num_t dt
@@ -547,8 +547,8 @@ cntx_t* bli_gks_query_ind_cntx
 
 	// Query the gks for the array of context pointers corresponding to the
 	// given architecture id.
-	cntx_t** restrict gks_id     = gks[ id ];
-	cntx_t*  restrict gks_id_nat = gks_id[ BLIS_NAT ];
+	cntx_t** gks_id     = gks[ id ];
+	cntx_t*  gks_id_nat = gks_id[ BLIS_NAT ];
 
 	// If for some reason the native context was requested, we can return
 	// its address early.
@@ -634,9 +634,9 @@ void bli_gks_init_ref_cntx
 
 bool bli_gks_cntx_l3_nat_ukr_is_ref
      (
-       num_t   dt,
-       ukr_t   ukr_id,
-       cntx_t* cntx
+             num_t   dt,
+             ukr_t   ukr_id,
+       const cntx_t* cntx
      )
 {
 	cntx_t ref_cntx;
@@ -658,7 +658,7 @@ bool bli_gks_cntx_l3_nat_ukr_is_ref
 // -- level-3 micro-kernel implementation strings ------------------------------
 //
 
-static char* bli_gks_l3_ukr_impl_str[BLIS_NUM_UKR_IMPL_TYPES] =
+static const char* bli_gks_l3_ukr_impl_str[BLIS_NUM_UKR_IMPL_TYPES] =
 {
 	"refrnce",
 	"virtual",
@@ -668,15 +668,15 @@ static char* bli_gks_l3_ukr_impl_str[BLIS_NUM_UKR_IMPL_TYPES] =
 
 // -----------------------------------------------------------------------------
 
-char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt )
+const char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt )
 {
 	kimpl_t ki;
 
 	// Query the context for the current induced method and datatype, and
 	// then query the ukernel function pointer for the given datatype from
 	// that context.
-	cntx_t* cntx  = bli_gks_query_ind_cntx( method, dt );
-	void_fp fp    = bli_cntx_get_ukr_dt( dt, ukr, cntx );
+	const cntx_t* cntx = bli_gks_query_ind_cntx( method, dt );
+	void_fp fp         = bli_cntx_get_ukr_dt( dt, ukr, cntx );
 
 	// Check whether the ukernel function pointer is NULL for the given
 	// datatype. If it is NULL, return the string for not applicable.
@@ -742,7 +742,7 @@ kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt )
 		}
 
 		// Query the native context from the gks.
-		cntx_t* nat_cntx = bli_gks_lookup_nat_cntx( id );
+		const cntx_t* nat_cntx = bli_gks_lookup_nat_cntx( id );
 
 		if ( bli_gks_cntx_l3_nat_ukr_is_ref( dt, ukr, nat_cntx ) )
 			return BLIS_REFERENCE_UKERNEL;
diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h
index b8e4c4fe0..30e3b2e39 100644
--- a/frame/base/bli_gks.h
+++ b/frame/base/bli_gks.h
@@ -35,31 +35,31 @@
 #ifndef BLIS_GKS_H
 #define BLIS_GKS_H
 
-void    bli_gks_init( void );
-void    bli_gks_finalize( void );
+void                           bli_gks_init( void );
+void                           bli_gks_finalize( void );
 
-void    bli_gks_init_index( void );
+void                           bli_gks_init_index( void );
 
-cntx_t* bli_gks_lookup_nat_cntx( arch_t id );
-cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind );
-cntx_t** bli_gks_lookup_id( arch_t id );
-void    bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp );
+const cntx_t*                  bli_gks_lookup_nat_cntx( arch_t id );
+const cntx_t*                  bli_gks_lookup_ind_cntx( arch_t id, ind_t ind );
+const cntx_t* const *          bli_gks_lookup_id( arch_t id );
+void                           bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp );
 
-BLIS_EXPORT_BLIS cntx_t* bli_gks_query_cntx( void );
-BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void );
+BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_cntx( void );
+BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_nat_cntx( void );
 
-cntx_t* bli_gks_query_cntx_noinit( void );
+const cntx_t*                  bli_gks_query_cntx_noinit( void );
 
-BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt );
+BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt );
 
-BLIS_EXPORT_BLIS void    bli_gks_init_ref_cntx( cntx_t* cntx );
+BLIS_EXPORT_BLIS void          bli_gks_init_ref_cntx( cntx_t* cntx );
 
-bool    bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, ukr_t ukr_id, cntx_t* cntx );
+bool                           bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, ukr_t ukr_id, const cntx_t* cntx );
 
-BLIS_EXPORT_BLIS char*   bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt );
-BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt );
+BLIS_EXPORT_BLIS const char*   bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt );
+BLIS_EXPORT_BLIS kimpl_t       bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt );
 
-//char*   bli_gks_l3_ukr_avail_impl_string( ukr_t ukr, num_t dt );
+//char*                          bli_gks_l3_ukr_avail_impl_string( ukr_t ukr, num_t dt );
 
 #endif
 
diff --git a/frame/base/bli_ind.c b/frame/base/bli_ind.c
index a359e89a3..fbe740465 100644
--- a/frame/base/bli_ind.c
+++ b/frame/base/bli_ind.c
@@ -34,7 +34,7 @@
 
 #include "blis.h"
 
-static char* bli_ind_impl_str[BLIS_NUM_IND_METHODS] =
+static const char* bli_ind_impl_str[BLIS_NUM_IND_METHODS] =
 {
 /* 1m   */ "1m",
 /* nat  */ "native",
@@ -46,7 +46,7 @@ void bli_ind_init( void )
 {
 	// NOTE: Instead of calling bli_gks_query_cntx(), we call
 	// bli_gks_query_cntx_noinit() to avoid the call to bli_init_once().
-	cntx_t* cntx     = bli_gks_query_cntx_noinit();
+	const cntx_t* cntx = bli_gks_query_cntx_noinit();
 
 	// For each precision, enable the default induced method (1m) if both of
 	// the following conditions are met:
@@ -151,8 +151,8 @@ bool bli_ind_oper_is_impl( opid_t oper, ind_t method )
 		// All other operations should be reported as not implemented,
 		// unless the requested check was for BLIS_NAT, in which case
 		// all operations are implemented.
-	    if ( method == BLIS_NAT ) is_impl = TRUE;
-	    else                      is_impl = FALSE;
+		if ( method == BLIS_NAT ) is_impl = TRUE;
+		else                      is_impl = FALSE;
 	}
 
 	return is_impl;
@@ -176,7 +176,7 @@ ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt )
 	return method;
 }
 
-char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt )
+const char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt )
 {
 	ind_t method = bli_ind_oper_find_avail( oper, dt );
 
@@ -185,7 +185,7 @@ char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt )
 
 // -----------------------------------------------------------------------------
 
-char* bli_ind_get_impl_string( ind_t method )
+const char* bli_ind_get_impl_string( ind_t method )
 {
 	return bli_ind_impl_str[ method ];
 }
diff --git a/frame/base/bli_ind.h b/frame/base/bli_ind.h
index 85cad648e..e162c5809 100644
--- a/frame/base/bli_ind.h
+++ b/frame/base/bli_ind.h
@@ -38,25 +38,25 @@
 // level-3 induced method management
 #include "bli_l3_ind.h"
 
-void   bli_ind_init( void );
-void   bli_ind_finalize( void );
+void                         bli_ind_init( void );
+void                         bli_ind_finalize( void );
 
-BLIS_EXPORT_BLIS void    bli_ind_enable( ind_t method );
-BLIS_EXPORT_BLIS void    bli_ind_disable( ind_t method );
-BLIS_EXPORT_BLIS void    bli_ind_disable_all( void );
+BLIS_EXPORT_BLIS void        bli_ind_enable( ind_t method );
+BLIS_EXPORT_BLIS void        bli_ind_disable( ind_t method );
+BLIS_EXPORT_BLIS void        bli_ind_disable_all( void );
 
-BLIS_EXPORT_BLIS void    bli_ind_enable_dt( ind_t method, num_t dt );
-BLIS_EXPORT_BLIS void    bli_ind_disable_dt( ind_t method, num_t dt );
-BLIS_EXPORT_BLIS void    bli_ind_disable_all_dt( num_t dt );
+BLIS_EXPORT_BLIS void        bli_ind_enable_dt( ind_t method, num_t dt );
+BLIS_EXPORT_BLIS void        bli_ind_disable_dt( ind_t method, num_t dt );
+BLIS_EXPORT_BLIS void        bli_ind_disable_all_dt( num_t dt );
 
-BLIS_EXPORT_BLIS void    bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt );
+BLIS_EXPORT_BLIS void        bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt );
 
-BLIS_EXPORT_BLIS bool    bli_ind_oper_is_impl( opid_t oper, ind_t method );
-BLIS_EXPORT_BLIS ind_t   bli_ind_oper_find_avail( opid_t oper, num_t dt );
-BLIS_EXPORT_BLIS char*   bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt );
+BLIS_EXPORT_BLIS bool        bli_ind_oper_is_impl( opid_t oper, ind_t method );
+BLIS_EXPORT_BLIS ind_t       bli_ind_oper_find_avail( opid_t oper, num_t dt );
+BLIS_EXPORT_BLIS const char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt );
 
-char*  bli_ind_get_impl_string( ind_t method );
-num_t  bli_ind_map_cdt_to_index( num_t dt );
+const char*                  bli_ind_get_impl_string( ind_t method );
+num_t                        bli_ind_map_cdt_to_index( num_t dt );
 
 
 #endif
diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c
index bfa5ca9a3..72b54ca20 100644
--- a/frame/base/bli_info.c
+++ b/frame/base/bli_info.c
@@ -40,12 +40,11 @@
 
 // This string gets defined via -D on the command line when BLIS is compiled.
 // This string is (or rather, should be) only used here.
-static char* bli_version_str       = BLIS_VERSION_STRING;
-static char* bli_int_type_size_str = STRINGIFY_INT( BLIS_INT_TYPE_SIZE );
-
-char* bli_info_get_version_str( void )                { return bli_version_str; }
-char* bli_info_get_int_type_size_str( void )          { return bli_int_type_size_str; }
+static const char* bli_version_str       = BLIS_VERSION_STRING;
+static const char* bli_int_type_size_str = STRINGIFY_INT( BLIS_INT_TYPE_SIZE );
 
+const char* bli_info_get_version_str( void )          { return bli_version_str; }
+const char* bli_info_get_int_type_size_str( void )    { return bli_int_type_size_str; }
 
 
 // -- General configuration-related --------------------------------------------
@@ -158,36 +157,34 @@ gint_t bli_info_get_enable_sandbox( void )
 }
 
 
-
 // -- Kernel implementation-related --------------------------------------------
 
 
 // -- Level-3 kernel definitions --
 
-char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt )
+const char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt )
 { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_GEMM_UKR,       method, dt ); }
-char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt )
+const char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt )
 { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_GEMMTRSM_L_UKR, method, dt ); }
-char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt )
+const char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt )
 { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_GEMMTRSM_U_UKR, method, dt ); }
-char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt )
+const char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt )
 { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_TRSM_L_UKR,     method, dt ); }
-char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt )
+const char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt )
 { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_TRSM_U_UKR,     method, dt ); }
 
 
-
 // -- BLIS implementation query (level-3) --------------------------------------
 
-char* bli_info_get_gemm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_GEMM,  dt ); }
-char* bli_info_get_gemmt_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
-char* bli_info_get_hemm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_HEMM,  dt ); }
-char* bli_info_get_herk_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
-char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
-char* bli_info_get_symm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_SYMM,  dt ); }
-char* bli_info_get_syrk_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
-char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
-char* bli_info_get_trmm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM,  dt ); }
-char* bli_info_get_trmm3_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM3, dt ); }
-char* bli_info_get_trsm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_TRSM,  dt ); }
+const char* bli_info_get_gemm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_GEMM,  dt ); }
+const char* bli_info_get_gemmt_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
+const char* bli_info_get_hemm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_HEMM,  dt ); }
+const char* bli_info_get_herk_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
+const char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
+const char* bli_info_get_symm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_SYMM,  dt ); }
+const char* bli_info_get_syrk_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
+const char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
+const char* bli_info_get_trmm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM,  dt ); }
+const char* bli_info_get_trmm3_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM3, dt ); }
+const char* bli_info_get_trsm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_TRSM,  dt ); }
 
diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h
index 99c7d000d..250504c23 100644
--- a/frame/base/bli_info.h
+++ b/frame/base/bli_info.h
@@ -36,8 +36,8 @@
 
 // -- General library information ----------------------------------------------
 
-BLIS_EXPORT_BLIS char* bli_info_get_version_str( void );
-BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void );
+BLIS_EXPORT_BLIS const char* bli_info_get_version_str( void );
+BLIS_EXPORT_BLIS const char* bli_info_get_int_type_size_str( void );
 
 
 // -- General configuration-related --------------------------------------------
@@ -81,24 +81,24 @@ BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void );
 
 // -- Level-3 kernel definitions --
 
-BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt );
-BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt );
-BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt );
-BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt );
-BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt );
+BLIS_EXPORT_BLIS const char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt );
+BLIS_EXPORT_BLIS const char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt );
+BLIS_EXPORT_BLIS const char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt );
+BLIS_EXPORT_BLIS const char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt );
+BLIS_EXPORT_BLIS const char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt );
 
 
 // -- BLIS implementation query (level-3) --------------------------------------
 
-BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt );
-BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt );
-BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt );
-BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt );
-BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt );
-BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt );
-BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt );
-BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt );
-BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt );
-BLIS_EXPORT_BLIS char* bli_info_get_trmm3_impl_string( num_t dt );
-BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt );
+BLIS_EXPORT_BLIS const char* bli_info_get_gemm_impl_string( num_t dt );
+BLIS_EXPORT_BLIS const char* bli_info_get_gemmt_impl_string( num_t dt );
+BLIS_EXPORT_BLIS const char* bli_info_get_hemm_impl_string( num_t dt );
+BLIS_EXPORT_BLIS const char* bli_info_get_herk_impl_string( num_t dt );
+BLIS_EXPORT_BLIS const char* bli_info_get_her2k_impl_string( num_t dt );
+BLIS_EXPORT_BLIS const char* bli_info_get_symm_impl_string( num_t dt );
+BLIS_EXPORT_BLIS const char* bli_info_get_syrk_impl_string( num_t dt );
+BLIS_EXPORT_BLIS const char* bli_info_get_syr2k_impl_string( num_t dt );
+BLIS_EXPORT_BLIS const char* bli_info_get_trmm_impl_string( num_t dt );
+BLIS_EXPORT_BLIS const char* bli_info_get_trmm3_impl_string( num_t dt );
+BLIS_EXPORT_BLIS const char* bli_info_get_trsm_impl_string( num_t dt );
 
diff --git a/frame/base/bli_mbool.h b/frame/base/bli_mbool.h
index 6a989590b..d00424273 100644
--- a/frame/base/bli_mbool.h
+++ b/frame/base/bli_mbool.h
@@ -36,7 +36,7 @@
 
 // mbool_t query
 
-BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb )
+BLIS_INLINE bool bli_mbool_get_dt( num_t dt, const mbool_t* mb )
 {
 	return ( bool )( mb->v[ dt ] );
 }
diff --git a/frame/base/bli_mem.h b/frame/base/bli_mem.h
index d61e97021..c25511486 100644
--- a/frame/base/bli_mem.h
+++ b/frame/base/bli_mem.h
@@ -66,33 +66,33 @@ BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem )
 	return &(mem->pblk);
 }
 
-BLIS_INLINE void* bli_mem_buffer( mem_t* mem )
+BLIS_INLINE void* bli_mem_buffer( const mem_t* mem )
 {
-	return bli_pblk_buf( bli_mem_pblk( mem ) );
+	return bli_pblk_buf( bli_mem_pblk( ( mem_t* )mem ) );
 }
 
-BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem )
+BLIS_INLINE packbuf_t bli_mem_buf_type( const mem_t* mem )
 {
 	return mem->buf_type;
 }
 
-BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem )
+BLIS_INLINE pool_t* bli_mem_pool( const mem_t* mem )
 {
 	return mem->pool;
 }
 
-BLIS_INLINE siz_t bli_mem_size( mem_t* mem )
+BLIS_INLINE siz_t bli_mem_size( const mem_t* mem )
 {
 	return mem->size;
 }
 
-BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem )
+BLIS_INLINE bool bli_mem_is_alloc( const mem_t* mem )
 {
 	return ( bool )
 	       ( bli_mem_buffer( mem ) != NULL );
 }
 
-BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem )
+BLIS_INLINE bool bli_mem_is_unalloc( const mem_t* mem )
 {
 	return ( bool )
 	       ( bli_mem_buffer( mem ) == NULL );
@@ -160,4 +160,4 @@ BLIS_INLINE void bli_mem_clear( mem_t* mem )
 }
 
 
-#endif 
+#endif
diff --git a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c
index ca3c46f99..7b62ded5c 100644
--- a/frame/base/bli_memsys.c
+++ b/frame/base/bli_memsys.c
@@ -44,7 +44,7 @@ void bli_memsys_init( void )
 	// contexts for induced methods.
 	// NOTE: Instead of calling bli_gks_query_cntx(), we call
 	// bli_gks_query_cntx_noinit() to avoid the call to bli_init_once().
-	cntx_t* cntx_p = bli_gks_query_cntx_noinit();
+	const cntx_t* cntx_p = bli_gks_query_cntx_noinit();
 
 	// Initialize the packing block allocator and its data structures.
 	bli_pba_init( cntx_p );
diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c
index 23fbb4cd1..043bd1088 100644
--- a/frame/base/bli_obj.c
+++ b/frame/base/bli_obj.c
@@ -82,16 +82,13 @@ void bli_obj_create_without_buffer
        obj_t* obj
      )
 {
-	siz_t  elem_size;
-	void*  s;
-
 	bli_init_once();
 
 	if ( bli_error_checking_is_enabled() )
 		bli_obj_create_without_buffer_check( dt, m, n, obj );
 
 	// Query the size of one element of the object's pre-set datatype.
-	elem_size = bli_dt_size( dt );
+	siz_t elem_size = bli_dt_size( dt );
 
 	// Set any default properties that are appropriate.
 	bli_obj_set_defaults( obj );
@@ -125,7 +122,7 @@ void bli_obj_create_without_buffer
 
 	// Set the internal scalar to 1.0.
 	bli_obj_set_scalar_dt( dt, obj );
-	s = bli_obj_internal_scalar_buffer( obj );
+	void* s = bli_obj_internal_scalar_buffer( obj );
 
 	// Always writing the imaginary component is needed in mixed-domain
 	// scenarios. Failing to do this can lead to reading uninitialized
@@ -147,21 +144,17 @@ void bli_obj_alloc_buffer
        obj_t* obj
      )
 {
-	dim_t  n_elem = 0;
-	dim_t  m, n;
-	siz_t  elem_size;
-	siz_t  buffer_size;
-	void*  p;
-	err_t  r_val;
+	dim_t n_elem = 0;
+	err_t r_val;
 
 	bli_init_once();
 
 	// Query the dimensions of the object we are allocating.
-	m = bli_obj_length( obj );
-	n = bli_obj_width( obj );
+	dim_t m = bli_obj_length( obj );
+	dim_t n = bli_obj_width( obj );
 
 	// Query the size of one element.
-	elem_size = bli_obj_elem_size( obj );
+	siz_t elem_size = bli_obj_elem_size( obj );
 
 	// Adjust the strides, if needed, before doing anything else
 	// (particularly, before doing any error checking).
@@ -198,10 +191,10 @@ void bli_obj_alloc_buffer
 
 	// Compute the size of the total buffer to be allocated, which includes
 	// padding if the leading dimension was increased for alignment purposes.
-	buffer_size = ( siz_t )n_elem * elem_size;
+	siz_t buffer_size = ( siz_t )n_elem * elem_size;
 
 	// Allocate the buffer.
-	p = bli_malloc_user( buffer_size, &r_val );
+	void* p = bli_malloc_user( buffer_size, &r_val );
 
 	// Set individual fields.
 	bli_obj_set_buffer( p, obj );
@@ -264,8 +257,8 @@ void bli_obj_create_1x1_with_attached_buffer
 
 void bli_obj_create_conf_to
      (
-       obj_t* s,
-       obj_t* d
+       const obj_t* s,
+             obj_t* d
      )
 {
 	const num_t dt = bli_obj_dt( s );
@@ -552,7 +545,7 @@ static char* dt_names[ BLIS_NUM_FP_TYPES+1 ] =
 	"int"
 };
 
-char* bli_dt_string
+const char* bli_dt_string
      (
        num_t dt
      )
@@ -600,15 +593,13 @@ dim_t bli_align_dim_to_size
 
 dim_t bli_align_ptr_to_size
      (
-       void*  p,
-       size_t align_size
+       const void*  p,
+             size_t align_size
      )
 {
-	dim_t dim;
-
-	dim = ( ( ( uintptr_t )p + align_size - 1 ) /
-	        align_size
-	      ) * align_size;
+	dim_t dim = ( ( ( uintptr_t )p + align_size - 1 ) /
+	              align_size
+	            ) * align_size;
 
 	return dim;
 }
@@ -634,13 +625,13 @@ num_t bli_dt_union( num_t dt1, num_t dt2 )
 
 void bli_obj_print
      (
-       char*  label,
-       obj_t* obj
+       const char*  label,
+       const obj_t* obj
      )
 {
 	bli_init_once();
 
-	FILE*  file     = stdout;
+	FILE* file = stdout;
 
 	if ( bli_error_checking_is_enabled() )
 		bli_obj_print_check( label, obj );
diff --git a/frame/base/bli_obj.h b/frame/base/bli_obj.h
index 4436d2cd8..a446c09c8 100644
--- a/frame/base/bli_obj.h
+++ b/frame/base/bli_obj.h
@@ -95,8 +95,8 @@ BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer
 
 BLIS_EXPORT_BLIS void bli_obj_create_conf_to
      (
-       obj_t* s,
-       obj_t* d
+       const obj_t* s,
+             obj_t* d
      );
 
 BLIS_EXPORT_BLIS void bli_obj_free
@@ -119,7 +119,7 @@ BLIS_EXPORT_BLIS siz_t bli_dt_size
        num_t dt
      );
 
-BLIS_EXPORT_BLIS char* bli_dt_string
+BLIS_EXPORT_BLIS const char* bli_dt_string
      (
        num_t dt
      );
@@ -139,13 +139,13 @@ BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size
 
 BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size
      (
-       void*  p,
-       size_t align_size
+       const void*  p,
+             size_t align_size
      );
 
 BLIS_EXPORT_BLIS void bli_obj_print
      (
-       char*  label,
-       obj_t* obj
+       const char*  label,
+       const obj_t* obj
      );
 
diff --git a/frame/base/bli_obj_scalar.c b/frame/base/bli_obj_scalar.c
index e28d4fda9..2ef9751f6 100644
--- a/frame/base/bli_obj_scalar.c
+++ b/frame/base/bli_obj_scalar.c
@@ -41,15 +41,13 @@ void bli_obj_scalar_init_detached
        obj_t* beta
      )
 {
-	void* p;
-
 	// Initialize beta without a buffer and then attach its internal buffer.
 	// NOTE: This initializes both the storage datatype and scalar datatype
 	// bitfields within beta to dt.
 	bli_obj_create_without_buffer( dt, 1, 1, beta );
 
 	// Query the address of the object's internal scalar buffer.
-	p = bli_obj_internal_scalar_buffer( beta );
+	void* p = bli_obj_internal_scalar_buffer( beta );
 
 	// Update the object.
 	bli_obj_set_buffer( p, beta );
@@ -59,10 +57,10 @@ void bli_obj_scalar_init_detached
 
 void bli_obj_scalar_init_detached_copy_of
      (
-       num_t  dt,
-       conj_t conj,
-       obj_t* alpha,
-       obj_t* beta
+             num_t  dt,
+             conj_t conj,
+       const obj_t* alpha,
+             obj_t* beta
      )
 {
 	obj_t alpha_local;
@@ -81,8 +79,8 @@ void bli_obj_scalar_init_detached_copy_of
 
 void bli_obj_scalar_detach
      (
-       obj_t* a,
-       obj_t* alpha
+       const obj_t* a,
+             obj_t* alpha
      )
 {
 	// Use the scalar datatype of A as the storage datatype of the detached
@@ -103,9 +101,9 @@ void bli_obj_scalar_detach
 
 void bli_obj_scalar_attach
      (
-       conj_t conj,
-       obj_t* alpha,
-       obj_t* a
+             conj_t conj,
+       const obj_t* alpha,
+             obj_t* a
      )
 {
 	obj_t alpha_cast;
@@ -165,8 +163,8 @@ void bli_obj_scalar_cast_to
 
 void bli_obj_scalar_apply_scalar
      (
-       obj_t* alpha,
-       obj_t* a
+       const obj_t* alpha,
+             obj_t* a
      )
 {
 	obj_t alpha_cast;
@@ -193,9 +191,9 @@ void bli_obj_scalar_reset
        obj_t* a
      )
 {
-	num_t dt       = bli_obj_scalar_dt( a );
-	void* scalar_a = bli_obj_internal_scalar_buffer( a );
-	void* one      = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	num_t       dt       = bli_obj_scalar_dt( a );
+	void*       scalar_a = bli_obj_internal_scalar_buffer( a );
+	const void* one      = bli_obj_buffer_for_const( dt, &BLIS_ONE );
 
 	if      ( bli_is_float( dt )    ) *(( float*    )scalar_a) = *(( float*    )one);
 	else if ( bli_is_double( dt )   ) *(( double*   )scalar_a) = *(( double*   )one);
@@ -211,9 +209,9 @@ bool bli_obj_scalar_has_nonzero_imag
        obj_t* a
      )
 {
-	bool   r_val     = FALSE;
-	num_t  dt        = bli_obj_scalar_dt( a );
-	void*  scalar_a  = bli_obj_internal_scalar_buffer( a );
+	bool  r_val    = FALSE;
+	num_t dt       = bli_obj_scalar_dt( a );
+	void* scalar_a = bli_obj_internal_scalar_buffer( a );
 
 	// FGVZ: Reimplement by using bli_obj_imag_part() and then
 	// bli_obj_equals( &BLIS_ZERO, ... ).
@@ -236,16 +234,15 @@ bool bli_obj_scalar_has_nonzero_imag
 
 bool bli_obj_scalar_equals
      (
-       obj_t* a,
-       obj_t* beta
+       const obj_t* a,
+       const obj_t* beta
      )
 {
 	obj_t scalar_a;
-	bool  r_val;
 
 	bli_obj_scalar_detach( a, &scalar_a );
 
-	r_val = bli_obj_equals( &scalar_a, beta );
+	bool r_val = bli_obj_equals( &scalar_a, beta );
 
 	return r_val;
 }
diff --git a/frame/base/bli_obj_scalar.h b/frame/base/bli_obj_scalar.h
index 86b699659..23bf573c6 100644
--- a/frame/base/bli_obj_scalar.h
+++ b/frame/base/bli_obj_scalar.h
@@ -40,23 +40,23 @@ BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached
 
 BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of
      (
-       num_t  dt,
-       conj_t conj,
-       obj_t* alpha,
-       obj_t* beta
+             num_t  dt,
+             conj_t conj,
+       const obj_t* alpha,
+             obj_t* beta
      );
 
 BLIS_EXPORT_BLIS void bli_obj_scalar_detach
      (
-       obj_t* a,
-       obj_t* alpha
+       const obj_t* a,
+             obj_t* alpha
      );
 
 BLIS_EXPORT_BLIS void bli_obj_scalar_attach
      (
-       conj_t conj,
-       obj_t* alpha,
-       obj_t* a
+             conj_t conj,
+       const obj_t* alpha,
+             obj_t* a
      );
 
 BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to
@@ -67,8 +67,8 @@ BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to
 
 BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar
      (
-       obj_t* alpha,
-       obj_t* a
+       const obj_t* alpha,
+             obj_t* a
      );
 
 BLIS_EXPORT_BLIS void bli_obj_scalar_reset
@@ -83,7 +83,7 @@ BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag
 
 BLIS_EXPORT_BLIS bool bli_obj_scalar_equals
      (
-       obj_t* a,
-       obj_t* beta
+       const obj_t* a,
+       const obj_t* beta
      );
 
diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c
index 95587e4a7..f3a2deeb4 100644
--- a/frame/base/bli_part.c
+++ b/frame/base/bli_part.c
@@ -40,12 +40,12 @@
 
 void bli_acquire_mpart
      (
-       dim_t     i,
-       dim_t     j,
-       dim_t     bm,
-       dim_t     bn,
-       obj_t*    parent,
-       obj_t*    child
+             dim_t  i,
+             dim_t  j,
+             dim_t  bm,
+             dim_t  bn,
+       const obj_t* parent,
+             obj_t* child
      )
 {
 	// Query the dimensions of the parent object.
@@ -83,11 +83,11 @@ void bli_acquire_mpart
 
 void bli_acquire_mpart_t2b
      (
-       subpart_t req_part,
-       dim_t     i,
-       dim_t     b,
-       obj_t*    obj,
-       obj_t*    sub_obj
+             subpart_t req_part,
+             dim_t     i,
+             dim_t     b,
+       const obj_t*    obj,
+             obj_t*    sub_obj
      )
 {
 	bli_acquire_mpart_mdim( BLIS_FWD, req_part, i, b, obj, sub_obj );
@@ -96,11 +96,11 @@ void bli_acquire_mpart_t2b
 
 void bli_acquire_mpart_b2t
      (
-       subpart_t req_part,
-       dim_t     i,
-       dim_t     b,
-       obj_t*    obj,
-       obj_t*    sub_obj
+             subpart_t req_part,
+             dim_t     i,
+             dim_t     b,
+       const obj_t*    obj,
+             obj_t*    sub_obj
      )
 {
 	bli_acquire_mpart_mdim( BLIS_BWD, req_part, i, b, obj, sub_obj );
@@ -109,12 +109,12 @@ void bli_acquire_mpart_b2t
 
 void bli_acquire_mpart_mdim
      (
-       dir_t     direct,
-       subpart_t req_part,
-       dim_t     i,
-       dim_t     b,
-       obj_t*    obj,
-       obj_t*    sub_obj
+             dir_t     direct,
+             subpart_t req_part,
+             dim_t     i,
+             dim_t     b,
+       const obj_t*    obj,
+             obj_t*    sub_obj
      )
 {
 	dim_t  m;
@@ -123,7 +123,6 @@ void bli_acquire_mpart_mdim
 	dim_t  n_part   = 0;
 	inc_t  offm_inc = 0;
 	inc_t  offn_inc = 0;
-	doff_t diag_off_inc;
 
 
 	// Call a special function for partitioning packed objects. (By only
@@ -235,7 +234,7 @@ void bli_acquire_mpart_mdim
 
 
 	// Compute the diagonal offset based on the m and n offsets.
-	diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;
+	doff_t diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;
 
 
 	// Begin by copying the info, elem size, buffer, row stride, and column
@@ -307,24 +306,24 @@ void bli_acquire_mpart_mdim
 
 void bli_acquire_mpart_l2r
      (
-       subpart_t req_part,
-       dim_t     i,
-       dim_t     b,
-       obj_t*    obj,
-       obj_t*    sub_obj
+             subpart_t req_part,
+             dim_t     j,
+             dim_t     b,
+       const obj_t*    obj,
+             obj_t*    sub_obj
      )
 {
-	bli_acquire_mpart_ndim( BLIS_FWD, req_part, i, b, obj, sub_obj );
+	bli_acquire_mpart_ndim( BLIS_FWD, req_part, j, b, obj, sub_obj );
 }
 
 
 void bli_acquire_mpart_r2l
      (
-       subpart_t req_part,
-       dim_t     j,
-       dim_t     b,
-       obj_t*    obj,
-       obj_t*    sub_obj
+             subpart_t req_part,
+             dim_t     j,
+             dim_t     b,
+       const obj_t*    obj,
+             obj_t*    sub_obj
      )
 {
 	bli_acquire_mpart_ndim( BLIS_BWD, req_part, j, b, obj, sub_obj );
@@ -333,12 +332,12 @@ void bli_acquire_mpart_r2l
 
 void bli_acquire_mpart_ndim
      (
-       dir_t     direct,
-       subpart_t req_part,
-       dim_t     j,
-       dim_t     b,
-       obj_t*    obj,
-       obj_t*    sub_obj
+             dir_t     direct,
+             subpart_t req_part,
+             dim_t     j,
+             dim_t     b,
+       const obj_t*    obj,
+             obj_t*    sub_obj
      )
 {
 	dim_t  m;
@@ -347,7 +346,6 @@ void bli_acquire_mpart_ndim
 	dim_t  n_part   = 0;
 	inc_t  offm_inc = 0;
 	inc_t  offn_inc = 0;
-	doff_t diag_off_inc;
 
 
 	// Call a special function for partitioning packed objects. (By only
@@ -459,7 +457,7 @@ void bli_acquire_mpart_ndim
 
 
 	// Compute the diagonal offset based on the m and n offsets.
-	diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;
+	doff_t diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;
 
 
 	// Begin by copying the info, elem size, buffer, row stride, and column
@@ -530,11 +528,11 @@ void bli_acquire_mpart_ndim
 
 void bli_acquire_mpart_tl2br
      (
-       subpart_t req_part,
-       dim_t     i,
-       dim_t     b,
-       obj_t*    obj,
-       obj_t*    sub_obj
+             subpart_t req_part,
+             dim_t     i,
+             dim_t     b,
+       const obj_t*    obj,
+             obj_t*    sub_obj
      )
 {
 	bli_acquire_mpart_mndim( BLIS_FWD, req_part, i, b, obj, sub_obj );
@@ -543,11 +541,11 @@ void bli_acquire_mpart_tl2br
 
 void bli_acquire_mpart_br2tl
      (
-       subpart_t req_part,
-       dim_t     j,
-       dim_t     b,
-       obj_t*    obj,
-       obj_t*    sub_obj
+             subpart_t req_part,
+             dim_t     j,
+             dim_t     b,
+       const obj_t*    obj,
+             obj_t*    sub_obj
      )
 {
 	bli_acquire_mpart_mndim( BLIS_BWD, req_part, j, b, obj, sub_obj );
@@ -556,12 +554,12 @@ void bli_acquire_mpart_br2tl
 
 void bli_acquire_mpart_mndim
      (
-       dir_t     direct,
-       subpart_t req_part,
-       dim_t     ij,
-       dim_t     b,
-       obj_t*    obj,
-       obj_t*    sub_obj
+             dir_t     direct,
+             subpart_t req_part,
+             dim_t     ij,
+             dim_t     b,
+       const obj_t*    obj,
+             obj_t*    sub_obj
      )
 {
 	dim_t  m;
@@ -571,7 +569,6 @@ void bli_acquire_mpart_mndim
 	dim_t  n_part   = 0;
 	inc_t  offm_inc = 0;
 	inc_t  offn_inc = 0;
-	doff_t diag_off_inc;
 
 
 	// Call a special function for partitioning packed objects. (By only
@@ -712,7 +709,7 @@ void bli_acquire_mpart_mndim
 
 
 	// Compute the diagonal offset based on the m and n offsets.
-	diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;
+	doff_t diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;
 
 
 	// Begin by copying the info, elem size, buffer, row stride, and column
@@ -798,11 +795,11 @@ void bli_acquire_mpart_mndim
 
 void bli_acquire_vpart_f2b
      (
-       subpart_t req_part,
-       dim_t     i,
-       dim_t     b,
-       obj_t*    obj,
-       obj_t*    sub_obj
+             subpart_t req_part,
+             dim_t     i,
+             dim_t     b,
+       const obj_t*    obj,
+             obj_t*    sub_obj
      )
 {
 	if ( bli_obj_is_col_vector( obj ) )
@@ -814,11 +811,11 @@ void bli_acquire_vpart_f2b
 
 void bli_acquire_vpart_b2f
      (
-       subpart_t req_part,
-       dim_t     i,
-       dim_t     b,
-       obj_t*    obj,
-       obj_t*    sub_obj
+             subpart_t req_part,
+             dim_t     i,
+             dim_t     b,
+       const obj_t*    obj,
+             obj_t*    sub_obj
      )
 {
 	if ( bli_obj_is_col_vector( obj ) )
@@ -833,10 +830,10 @@ void bli_acquire_vpart_b2f
 
 void bli_acquire_mij
      (
-       dim_t     i,
-       dim_t     j,
-       obj_t*    obj,
-       obj_t*    sub_obj
+             dim_t  i,
+             dim_t  j,
+       const obj_t* obj,
+             obj_t* sub_obj
      )
 {
 	obj_t tmp_obj;
@@ -848,9 +845,9 @@ void bli_acquire_mij
 
 void bli_acquire_vi
      (
-       dim_t     i,
-       obj_t*    obj,
-       obj_t*    sub_obj
+             dim_t  i,
+       const obj_t* obj,
+             obj_t* sub_obj
      )
 {
 	if ( bli_obj_is_col_vector( obj ) )
diff --git a/frame/base/bli_part.h b/frame/base/bli_part.h
index 5e56a9fec..6d3e00ced 100644
--- a/frame/base/bli_part.h
+++ b/frame/base/bli_part.h
@@ -38,12 +38,12 @@
 
 BLIS_EXPORT_BLIS void bli_acquire_mpart
      (
-       dim_t     i,
-       dim_t     j,
-       dim_t     m,
-       dim_t     n,
-       obj_t*    obj,
-       obj_t*    sub_obj
+             dim_t  i,
+             dim_t  j,
+             dim_t  m,
+             dim_t  n,
+       const obj_t* obj,
+             obj_t* sub_obj
      );
 
 #undef  GENPROT
@@ -51,11 +51,11 @@ BLIS_EXPORT_BLIS void bli_acquire_mpart
 \
 BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
      ( \
-       subpart_t req_part, \
-       dim_t     i, \
-       dim_t     b, \
-       obj_t*    obj, \
-       obj_t*    sub_obj \
+             subpart_t req_part, \
+             dim_t     i, \
+             dim_t     b, \
+       const obj_t*    obj, \
+             obj_t*    sub_obj \
      );
 
 GENPROT( acquire_mpart_t2b )
@@ -71,12 +71,12 @@ GENPROT( acquire_mpart_br2tl )
 \
 BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
      ( \
-       dir_t     direct, \
-       subpart_t req_part, \
-       dim_t     i, \
-       dim_t     b, \
-       obj_t*    obj, \
-       obj_t*    sub_obj \
+             dir_t     direct, \
+             subpart_t req_part, \
+             dim_t     i, \
+             dim_t     b, \
+       const obj_t*    obj, \
+             obj_t*    sub_obj \
      );
 
 GENPROT( acquire_mpart_mdim )
@@ -91,11 +91,11 @@ GENPROT( acquire_mpart_mndim )
 \
 BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
      ( \
-       subpart_t req_part, \
-       dim_t     i, \
-       dim_t     b, \
-       obj_t*    obj, \
-       obj_t*    sub_obj \
+             subpart_t req_part, \
+             dim_t     i, \
+             dim_t     b, \
+       const obj_t*    obj, \
+             obj_t*    sub_obj \
      );
 
 GENPROT( acquire_vpart_f2b )
@@ -105,16 +105,16 @@ GENPROT( acquire_vpart_b2f )
 
 BLIS_EXPORT_BLIS void bli_acquire_mij
      (
-       dim_t     i,
-       dim_t     j,
-       obj_t*    obj,
-       obj_t*    sub_obj
+             dim_t  i,
+             dim_t  j,
+       const obj_t* obj,
+             obj_t* sub_obj
      );
 
 BLIS_EXPORT_BLIS void bli_acquire_vi
      (
-       dim_t     i,
-       obj_t*    obj,
-       obj_t*    sub_obj
+             dim_t  i,
+       const obj_t* obj,
+             obj_t* sub_obj
      );
 
diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c
index f8835e5de..68dffd728 100644
--- a/frame/base/bli_pba.c
+++ b/frame/base/bli_pba.c
@@ -48,10 +48,10 @@ pba_t* bli_pba_query( void )
 
 void bli_pba_init
      (
-       cntx_t* restrict cntx
+       const cntx_t* cntx
      )
 {
-	pba_t* restrict pba = bli_pba_query();
+	pba_t* pba = bli_pba_query();
 
 	const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE_GEN;
 	malloc_ft   malloc_fp  = BLIS_MALLOC_POOL;
@@ -77,7 +77,7 @@ void bli_pba_finalize
        void
      )
 {
-	pba_t* restrict pba = bli_pba_query();
+	pba_t* pba = bli_pba_query();
 
 #ifdef BLIS_ENABLE_PBA_POOLS
 	bli_pba_finalize_pools( pba );
@@ -201,15 +201,11 @@ void bli_pba_release
        mem_t*  mem
      )
 {
-	packbuf_t buf_type;
-	pool_t*   pool;
-	pblk_t*   pblk;
-
 	// Query the memory broker from the runtime.
 	pba_t* pba = bli_rntm_pba( rntm );
 
 	// Extract the buffer type so we know what kind of memory was allocated.
-	buf_type = bli_mem_buf_type( mem );
+	packbuf_t buf_type = bli_mem_buf_type( mem );
 
 #ifndef BLIS_ENABLE_PBA_POOLS
 	#ifdef BLIS_ENABLE_MEM_TRACING
@@ -231,10 +227,10 @@ void bli_pba_release
 	{
 		// Extract the address of the pool from which the memory was
 		// allocated.
-		pool = bli_mem_pool( mem );
+		pool_t* pool = bli_mem_pool( mem );
 
 		// Extract the address of the pblk_t struct within the mem_t struct.
-		pblk = bli_mem_pblk( mem );
+		pblk_t* pblk = bli_mem_pblk( mem );
 
 		// Acquire the mutex associated with the pba object.
 		bli_pba_lock( pba );
@@ -284,8 +280,8 @@ void bli_pba_acquire_v
 
 siz_t bli_pba_pool_size
      (
-       pba_t*    pba,
-       packbuf_t buf_type
+       const pba_t*    pba,
+             packbuf_t buf_type
      )
 {
 	siz_t r_val;
@@ -304,7 +300,7 @@ siz_t bli_pba_pool_size
 		// Acquire the pointer to the pool corresponding to the buf_type
 		// provided.
 		pool_index = bli_packbuf_index( buf_type );
-		pool       = bli_pba_pool( pool_index, pba );
+		pool       = bli_pba_pool( pool_index, ( pba_t* )pba );
 
 		// Compute the pool "size" as the product of the block size
 		// and the number of blocks in the pool.
@@ -319,8 +315,8 @@ siz_t bli_pba_pool_size
 
 void bli_pba_init_pools
      (
-       cntx_t* cntx,
-       pba_t*  pba
+       const cntx_t* cntx,
+             pba_t*  pba
      )
 {
 	// Map each of the packbuf_t values to an index starting at zero.
@@ -402,10 +398,10 @@ void bli_pba_finalize_pools
 
 void bli_pba_compute_pool_block_sizes
      (
-       siz_t*  bs_a,
-       siz_t*  bs_b,
-       siz_t*  bs_c,
-       cntx_t* cntx
+             siz_t*  bs_a,
+             siz_t*  bs_b,
+             siz_t*  bs_c,
+       const cntx_t* cntx
      )
 {
 	const ind_t im = bli_cntx_method( cntx );
@@ -414,12 +410,10 @@ void bli_pba_compute_pool_block_sizes
 	siz_t bs_cand_b = 0;
 	siz_t bs_cand_c = 0;
 
-	num_t dt;
-
 	// Compute pool block sizes for each datatype and find the maximum
 	// size for each pool. This is done so that new pools do not need
 	// to be allocated if the user switches datatypes.
-	for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
+	for ( num_t dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
 	{
 		siz_t bs_dt_a;
 		siz_t bs_dt_b;
@@ -449,71 +443,43 @@ void bli_pba_compute_pool_block_sizes
 
 void bli_pba_compute_pool_block_sizes_dt
      (
-       num_t   dt,
-       siz_t*  bs_a,
-       siz_t*  bs_b,
-       siz_t*  bs_c,
-       cntx_t* cntx
+             num_t   dt,
+             siz_t*  bs_a,
+             siz_t*  bs_b,
+             siz_t*  bs_c,
+       const cntx_t* cntx
      )
 {
-	siz_t    size_dt = bli_dt_size( dt );
-
-	blksz_t* mr;
-	blksz_t* nr;
-
-	blksz_t* mc;
-	blksz_t* kc;
-	blksz_t* nc;
-
-	dim_t    mr_dt;
-	dim_t    nr_dt;
-	dim_t    max_mnr_dt;
-
-	dim_t    mc_max_dt;
-	dim_t    kc_max_dt;
-	dim_t    nc_max_dt;
-
-	dim_t    packmr_dt;
-	dim_t    packnr_dt;
-	dim_t    max_packmnr_dt;
-
-	dim_t    scale_num_dt;
-	dim_t    scale_den_dt;
-
-	dim_t    pool_mc_dt, left_mc_dt;
-	dim_t    pool_nc_dt, left_nc_dt;
-	dim_t    pool_kc_dt;
-
 	//
 	// Find the larger of the two register blocksizes.
 	//
 
 	// Query the mr and nr blksz_t objects for the given method of
 	// execution.
-	mr = bli_cntx_get_blksz( BLIS_MR, cntx );
-	nr = bli_cntx_get_blksz( BLIS_NR, cntx );
+	const blksz_t* mr = bli_cntx_get_blksz( BLIS_MR, cntx );
+	const blksz_t* nr = bli_cntx_get_blksz( BLIS_NR, cntx );
 
 	// Extract the mr and nr values specific to the current datatype.
-	mr_dt = bli_blksz_get_def( dt, mr );
-	nr_dt = bli_blksz_get_def( dt, nr );
+	dim_t mr_dt = bli_blksz_get_def( dt, mr );
+	dim_t nr_dt = bli_blksz_get_def( dt, nr );
 
 	// Find the maximum of mr and nr.
-	max_mnr_dt = bli_max( mr_dt, nr_dt );
+	dim_t max_mnr_dt = bli_max( mr_dt, nr_dt );
 
 	//
 	// Define local maximum cache blocksizes.
 	//
 
 	// Query the mc, kc, and nc blksz_t objects for native execution.
-	mc = bli_cntx_get_blksz( BLIS_MC, cntx );
-	kc = bli_cntx_get_blksz( BLIS_KC, cntx );
-	nc = bli_cntx_get_blksz( BLIS_NC, cntx );
+	const blksz_t* mc = bli_cntx_get_blksz( BLIS_MC, cntx );
+	const blksz_t* kc = bli_cntx_get_blksz( BLIS_KC, cntx );
+	const blksz_t* nc = bli_cntx_get_blksz( BLIS_NC, cntx );
 
 	// Extract the maximum mc, kc, and nc values specific to the current
 	// datatype.
-	mc_max_dt = bli_blksz_get_max( dt, mc );
-	kc_max_dt = bli_blksz_get_max( dt, kc );
-	nc_max_dt = bli_blksz_get_max( dt, nc );
+	dim_t mc_max_dt = bli_blksz_get_max( dt, mc );
+	dim_t kc_max_dt = bli_blksz_get_max( dt, kc );
+	dim_t nc_max_dt = bli_blksz_get_max( dt, nc );
 
 	// Add max(mr,nr) to kc to make room for the nudging of kc at
 	// runtime to be a multiple of mr or nr for triangular operations
@@ -545,8 +511,11 @@ void bli_pba_compute_pool_block_sizes_dt
 	// So, if packmr * nr >= packnr * mr, then we will use packmr and mr as
 	// our scaling factors. Otherwise, we'll use packnr and nr.
 
-	packmr_dt = bli_blksz_get_max( dt, mr );
-	packnr_dt = bli_blksz_get_max( dt, nr );
+	dim_t packmr_dt = bli_blksz_get_max( dt, mr );
+	dim_t packnr_dt = bli_blksz_get_max( dt, nr );
+
+	dim_t scale_num_dt;
+	dim_t scale_den_dt;
 
 	if ( packmr_dt * nr_dt >=
 	     packnr_dt * mr_dt ) { scale_num_dt = packmr_dt;
@@ -558,13 +527,13 @@ void bli_pba_compute_pool_block_sizes_dt
 	// Compute pool block dimensions.
 	//
 
-	pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt;
-	left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt;
+	dim_t pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt;
+	dim_t left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt;
 
-	pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt;
-	left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt;
+	dim_t pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt;
+	dim_t left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt;
 
-	pool_kc_dt = ( kc_max_dt );
+	dim_t pool_kc_dt = ( kc_max_dt );
 
 	if ( left_mc_dt > 0 ) pool_mc_dt += 1;
 	if ( left_nc_dt > 0 ) pool_nc_dt += 1;
@@ -573,10 +542,12 @@ void bli_pba_compute_pool_block_sizes_dt
 	// Compute pool block sizes
 	//
 
+	siz_t size_dt = bli_dt_size( dt );
+
 	// We add an extra micro-panel of space to the block sizes for A and B
 	// just to be sure any pre-loading performed by the micro-kernel does
 	// not cause a segmentation fault.
-	max_packmnr_dt = bli_max( packmr_dt, packnr_dt );
+	dim_t max_packmnr_dt = bli_max( packmr_dt, packnr_dt );
 
 	*bs_a = ( pool_mc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt;
 	*bs_b = ( pool_nc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt;
diff --git a/frame/base/bli_pba.h b/frame/base/bli_pba.h
index 6431607ec..dfda53090 100644
--- a/frame/base/bli_pba.h
+++ b/frame/base/bli_pba.h
@@ -34,8 +34,8 @@
 
 */
 
-#ifndef BLIS_MEMBRK_H
-#define BLIS_MEMBRK_H
+#ifndef BLIS_PBA_H
+#define BLIS_PBA_H
 
 // Packing block allocator (formerly memory broker)
 
@@ -73,17 +73,17 @@ BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba )
 	return &(pba->pools[ pool_index ]);
 }
 
-BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba )
+BLIS_INLINE siz_t bli_pba_align_size( const pba_t* pba )
 {
 	return pba->align_size;
 }
 
-BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba )
+BLIS_INLINE malloc_ft bli_pba_malloc_fp( const pba_t* pba )
 {
 	return pba->malloc_fp;
 }
 
-BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba )
+BLIS_INLINE free_ft bli_pba_free_fp( const pba_t* pba )
 {
 	return pba->free_fp;
 }
@@ -123,7 +123,7 @@ BLIS_EXPORT_BLIS pba_t* bli_pba_query( void );
 
 void bli_pba_init
      (
-       cntx_t*   cntx
+       const cntx_t* cntx
      );
 void bli_pba_finalize
      (
@@ -156,16 +156,16 @@ BLIS_INLINE void bli_pba_rntm_set_pba
 
 siz_t bli_pba_pool_size
      (
-       pba_t*    pba,
-       packbuf_t buf_type
+       const pba_t*    pba,
+             packbuf_t buf_type
      );
 
 // ----------------------------------------------------------------------------
 
 void bli_pba_init_pools
      (
-       cntx_t* cntx,
-       pba_t*  pba
+       const cntx_t* cntx,
+             pba_t*  pba
      );
 void bli_pba_finalize_pools
      (
@@ -174,18 +174,18 @@ void bli_pba_finalize_pools
 
 void bli_pba_compute_pool_block_sizes
      (
-       siz_t*  bs_a,
-       siz_t*  bs_b,
-       siz_t*  bs_c,
-       cntx_t* cntx
+             siz_t*  bs_a,
+             siz_t*  bs_b,
+             siz_t*  bs_c,
+       const cntx_t* cntx
      );
 void bli_pba_compute_pool_block_sizes_dt
      (
-       num_t   dt,
-       siz_t*  bs_a,
-       siz_t*  bs_b,
-       siz_t*  bs_c,
-       cntx_t* cntx
+             num_t   dt,
+             siz_t*  bs_a,
+             siz_t*  bs_b,
+             siz_t*  bs_c,
+       const cntx_t* cntx
      );
 
 #endif
diff --git a/frame/base/bli_pool.c b/frame/base/bli_pool.c
index 112ab68e8..684b0ef73 100644
--- a/frame/base/bli_pool.c
+++ b/frame/base/bli_pool.c
@@ -39,14 +39,14 @@
 
 void bli_pool_init
      (
-       siz_t            num_blocks,
-       siz_t            block_ptrs_len,
-       siz_t            block_size,
-       siz_t            align_size,
-       siz_t            offset_size,
-       malloc_ft        malloc_fp,
-       free_ft          free_fp,
-       pool_t* restrict pool
+       siz_t     num_blocks,
+       siz_t     block_ptrs_len,
+       siz_t     block_size,
+       siz_t     align_size,
+       siz_t     offset_size,
+       malloc_ft malloc_fp,
+       free_ft   free_fp,
+       pool_t*   pool
      )
 {
 	err_t r_val;
@@ -67,7 +67,7 @@ void bli_pool_init
 	// Allocate the block_ptrs array.
 	// FGVZ: Do we want to call malloc_fp() for internal data structures as
 	// well as pool blocks? If so, don't forget to s/bli_free_intl/free_fp/g.
-	pblk_t* restrict block_ptrs
+	pblk_t* block_ptrs
 	=
 	bli_malloc_intl( block_ptrs_len * sizeof( pblk_t ), &r_val );
 
@@ -115,7 +115,7 @@ void bli_pool_init
 
 void bli_pool_finalize
      (
-       pool_t* restrict pool
+       pool_t* pool
      )
 {
 	// NOTE: This implementation assumes that either:
@@ -124,7 +124,7 @@ void bli_pool_finalize
 	//   is bli_pool_reinit().
 
 	// Query the block_ptrs array.
-	pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool );
+	pblk_t* block_ptrs = bli_pool_block_ptrs( pool );
 
 	// Query the total number of blocks currently allocated.
 	const siz_t num_blocks = bli_pool_num_blocks( pool );
@@ -196,12 +196,12 @@ void bli_pool_finalize
 
 void bli_pool_reinit
      (
-       siz_t            num_blocks_new,
-       siz_t            block_ptrs_len_new,
-       siz_t            block_size_new,
-       siz_t            align_size_new,
-       siz_t            offset_size_new,
-       pool_t* restrict pool
+       siz_t   num_blocks_new,
+       siz_t   block_ptrs_len_new,
+       siz_t   block_size_new,
+       siz_t   align_size_new,
+       siz_t   offset_size_new,
+       pool_t* pool
      )
 {
 	// Preserve the pointers to malloc() and free() provided when the pool
@@ -234,9 +234,9 @@ void bli_pool_reinit
 
 void bli_pool_checkout_block
      (
-       siz_t            req_size,
-       pblk_t* restrict block,
-       pool_t* restrict pool
+       siz_t   req_size,
+       pblk_t* block,
+       pool_t* pool
      )
 {
 	// If the requested block size is smaller than what the pool was
@@ -282,7 +282,7 @@ void bli_pool_checkout_block
 	// At this point, at least one block is guaranteed to be available.
 
 	// Query the block_ptrs array.
-	pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool );
+	pblk_t* block_ptrs = bli_pool_block_ptrs( pool );
 
 	// Query the top_index of the pool.
 	const siz_t top_index = bli_pool_top_index( pool );
@@ -309,8 +309,8 @@ void bli_pool_checkout_block
 
 void bli_pool_checkin_block
      (
-       pblk_t* restrict block,
-       pool_t* restrict pool
+       pblk_t* block,
+       pool_t* pool
      )
 {
 	// If the pblk_t being checked in was allocated with a different block
@@ -330,7 +330,7 @@ void bli_pool_checkin_block
 	}
 
 	// Query the block_ptrs array.
-	pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool );
+	pblk_t* block_ptrs = bli_pool_block_ptrs( pool );
 
 	// Query the top_index of the pool.
 	const siz_t top_index = bli_pool_top_index( pool );
@@ -353,8 +353,8 @@ void bli_pool_checkin_block
 
 void bli_pool_grow
      (
-       siz_t            num_blocks_add,
-       pool_t* restrict pool
+       siz_t   num_blocks_add,
+       pool_t* pool
      )
 {
 	err_t r_val;
@@ -394,12 +394,12 @@ void bli_pool_grow
 		#endif
 
 		// Query the current block_ptrs array.
-		pblk_t* restrict block_ptrs_cur = bli_pool_block_ptrs( pool );
+		pblk_t* block_ptrs_cur = bli_pool_block_ptrs( pool );
 
 		// Allocate a new block_ptrs array.
 		// FGVZ: Do we want to call malloc_fp() for internal data structures as
 		// well as pool blocks? If so, don't forget to s/bli_free_intl/free_fp/g.
-		pblk_t* restrict block_ptrs_new
+		pblk_t* block_ptrs_new
 		=
 		bli_malloc_intl( block_ptrs_len_new * sizeof( pblk_t ), &r_val );
 
@@ -433,7 +433,7 @@ void bli_pool_grow
 	// blocks.
 
 	// Query the current block_ptrs array (which was mabye just resized).
-	pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool );
+	pblk_t* block_ptrs = bli_pool_block_ptrs( pool );
 
 	// Query the block size and alignment size of the pool.
 	const siz_t block_size  = bli_pool_block_size( pool );
@@ -470,8 +470,8 @@ void bli_pool_grow
 
 void bli_pool_shrink
      (
-       siz_t            num_blocks_sub,
-       pool_t* restrict pool
+       siz_t   num_blocks_sub,
+       pool_t* pool
      )
 {
 	// If the requested decrease is zero, return early.
@@ -493,7 +493,7 @@ void bli_pool_shrink
 	num_blocks_sub = bli_min( num_blocks_sub, num_blocks_avail );
 
 	// Query the block_ptrs array.
-	pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool );
+	pblk_t* block_ptrs = bli_pool_block_ptrs( pool );
 
 	// Compute the new total number of blocks.
 	const siz_t num_blocks_new = num_blocks - num_blocks_sub;
@@ -520,11 +520,11 @@ void bli_pool_shrink
 
 void bli_pool_alloc_block
      (
-       siz_t            block_size,
-       siz_t            align_size,
-       siz_t            offset_size,
-       malloc_ft        malloc_fp,
-       pblk_t* restrict block
+       siz_t     block_size,
+       siz_t     align_size,
+       siz_t     offset_size,
+       malloc_ft malloc_fp,
+       pblk_t*   block
      )
 {
 	err_t r_val;
@@ -540,7 +540,7 @@ void bli_pool_alloc_block
 	// be recovered when it's time to free the block. Note that we have to
 	// add offset_size to the number of bytes requested since we will skip
 	// that many bytes at the beginning of the allocated memory.
-	void* restrict buf
+	void* buf
 	=
 	bli_fmalloc_align( malloc_fp, block_size + offset_size, align_size, &r_val );
 
@@ -579,7 +579,7 @@ void bli_pool_free_block
      (
        siz_t            offset_size,
        free_ft          free_fp,
-       pblk_t* restrict block
+       pblk_t* block
      )
 {
 	#ifdef BLIS_ENABLE_MEM_TRACING
@@ -590,7 +590,7 @@ void bli_pool_free_block
 
 	// Extract the pblk_t buffer, which is the aligned address returned from
 	// bli_fmalloc_align() when the block was allocated.
-	void* restrict buf = bli_pblk_buf( block );
+	void* buf = bli_pblk_buf( block );
 
 	// Undo the pointer advancement by offset_size bytes performed previously
 	// by bli_pool_alloc_block().
@@ -604,7 +604,7 @@ void bli_pool_free_block
 
 void bli_pool_print
      (
-       pool_t* restrict pool
+       const pool_t* pool
      )
 {
 	pblk_t* block_ptrs     = bli_pool_block_ptrs( pool );
@@ -633,7 +633,7 @@ void bli_pool_print
 
 void bli_pblk_print
      (
-       pblk_t* restrict pblk
+       const pblk_t* pblk
      )
 {
 	void* buf = bli_pblk_buf( pblk );
diff --git a/frame/base/bli_pool.h b/frame/base/bli_pool.h
index b4bb23fec..0b16ae8ee 100644
--- a/frame/base/bli_pool.h
+++ b/frame/base/bli_pool.h
@@ -70,12 +70,12 @@ typedef struct
 
 // Pool block query
 
-BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk )
+BLIS_INLINE void* bli_pblk_buf( const pblk_t* pblk )
 {
 	return pblk->buf;
 }
 
-BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk )
+BLIS_INLINE siz_t bli_pblk_block_size( const pblk_t* pblk )
 {
 	return pblk->block_size;
 }
@@ -115,52 +115,52 @@ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk )
 
 // Pool entry query
 
-BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool )
+BLIS_INLINE void* bli_pool_block_ptrs( const pool_t* pool )
 {
 	return pool->block_ptrs;
 }
 
-BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool )
+BLIS_INLINE siz_t bli_pool_block_ptrs_len( const pool_t* pool )
 {
 	return pool->block_ptrs_len;
 }
 
-BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool )
+BLIS_INLINE siz_t bli_pool_num_blocks( const pool_t* pool )
 {
 	return pool->num_blocks;
 }
 
-BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool )
+BLIS_INLINE siz_t bli_pool_block_size( const pool_t* pool )
 {
 	return pool->block_size;
 }
 
-BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool )
+BLIS_INLINE siz_t bli_pool_align_size( const pool_t* pool )
 {
 	return pool->align_size;
 }
 
-BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool )
+BLIS_INLINE siz_t bli_pool_offset_size( const pool_t* pool )
 {
 	return pool->offset_size;
 }
 
-BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool )
+BLIS_INLINE malloc_ft bli_pool_malloc_fp( const pool_t* pool )
 {
 	return pool->malloc_fp;
 }
 
-BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool )
+BLIS_INLINE free_ft bli_pool_free_fp( const pool_t* pool )
 {
 	return pool->free_fp;
 }
 
-BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool )
+BLIS_INLINE siz_t bli_pool_top_index( const pool_t* pool )
 {
 	return pool->top_index;
 }
 
-BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool )
+BLIS_INLINE bool bli_pool_is_exhausted( const pool_t* pool )
 {
 	return ( bool )
 	       ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) );
@@ -217,74 +217,74 @@ BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \
 
 void bli_pool_init
      (
-       siz_t            num_blocks,
-       siz_t            block_ptrs_len,
-       siz_t            block_size,
-       siz_t            align_size,
-       siz_t            offset_size,
-       malloc_ft        malloc_fp,
-       free_ft          free_fp,
-       pool_t* restrict pool
+       siz_t     num_blocks,
+       siz_t     block_ptrs_len,
+       siz_t     block_size,
+       siz_t     align_size,
+       siz_t     offset_size,
+       malloc_ft malloc_fp,
+       free_ft   free_fp,
+       pool_t*   pool
      );
 void bli_pool_finalize
      (
-       pool_t* restrict pool
+       pool_t* pool
      );
 void bli_pool_reinit
      (
-       siz_t            num_blocks_new,
-       siz_t            block_ptrs_len_new,
-       siz_t            block_size_new,
-       siz_t            align_size_new,
-       siz_t            offset_size_new,
-       pool_t* restrict pool
+       siz_t   num_blocks_new,
+       siz_t   block_ptrs_len_new,
+       siz_t   block_size_new,
+       siz_t   align_size_new,
+       siz_t   offset_size_new,
+       pool_t* pool
      );
 
 void bli_pool_checkout_block
      (
-       siz_t            req_size,
-       pblk_t* restrict block,
-       pool_t* restrict pool
+       siz_t   req_size,
+       pblk_t* block,
+       pool_t* pool
      );
 void bli_pool_checkin_block
      (
-       pblk_t* restrict block,
-       pool_t* restrict pool
+       pblk_t* block,
+       pool_t* pool
      );
 
 void bli_pool_grow
      (
-       siz_t            num_blocks_add,
-       pool_t* restrict pool
+       siz_t   num_blocks_add,
+       pool_t* pool
      );
 void bli_pool_shrink
      (
-       siz_t            num_blocks_sub,
-       pool_t* restrict pool
+       siz_t   num_blocks_sub,
+       pool_t* pool
      );
 
 void bli_pool_alloc_block
      (
-       siz_t            block_size,
-       siz_t            align_size,
-       siz_t            offset_size,
-       malloc_ft        malloc_fp,
-       pblk_t* restrict block
+       siz_t     block_size,
+       siz_t     align_size,
+       siz_t     offset_size,
+       malloc_ft malloc_fp,
+       pblk_t*   block
      );
 void bli_pool_free_block
      (
-       siz_t            offset_size,
-       free_ft          free_fp,
-       pblk_t* restrict block
+       siz_t   offset_size,
+       free_ft free_fp,
+       pblk_t* block
      );
 
 void bli_pool_print
      (
-       pool_t* restrict pool
+       const pool_t* pool
      );
 void bli_pblk_print
      (
-       pblk_t* restrict pblk
+       const pblk_t* pblk
      );
 
 #endif
diff --git a/frame/base/bli_query.c b/frame/base/bli_query.c
index c62a30ccc..140fc2f97 100644
--- a/frame/base/bli_query.c
+++ b/frame/base/bli_query.c
@@ -34,7 +34,7 @@
 
 #include "blis.h"
 
-bool bli_obj_equals( obj_t* a, obj_t* b )
+bool bli_obj_equals( const obj_t* a, const obj_t* b )
 {
 #if 0
 	bool  r_val = FALSE;
@@ -95,7 +95,7 @@ bool bli_obj_equals( obj_t* a, obj_t* b )
 #endif
 }
 
-bool bli_obj_imag_equals( obj_t* a, obj_t* b )
+bool bli_obj_imag_equals( const obj_t* a, const obj_t* b )
 {
 #if 0
 	bool  r_val = FALSE;
@@ -165,7 +165,7 @@ bool bli_obj_imag_equals( obj_t* a, obj_t* b )
 	return r_val;
 }
 
-bool bli_obj_imag_is_zero( obj_t* a )
+bool bli_obj_imag_is_zero( const obj_t* a )
 {
 	bool r_val = TRUE;
 
diff --git a/frame/base/bli_query.h b/frame/base/bli_query.h
index 65246050b..d2decf928 100644
--- a/frame/base/bli_query.h
+++ b/frame/base/bli_query.h
@@ -32,8 +32,8 @@
 
 */
 
-BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b );
+BLIS_EXPORT_BLIS bool bli_obj_equals( const obj_t* a, const obj_t* b );
 
-BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b );
+BLIS_EXPORT_BLIS bool bli_obj_imag_equals( const obj_t* a, const obj_t* b );
 
-BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a );
+BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( const obj_t* a );
diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c
index a6ded35b3..2c13c74a2 100644
--- a/frame/base/bli_rntm.c
+++ b/frame/base/bli_rntm.c
@@ -410,7 +410,7 @@ void bli_rntm_set_ways_from_rntm_sup
 
 void bli_rntm_print
      (
-       rntm_t* rntm
+       const rntm_t* rntm
      )
 {
 	dim_t af = bli_rntm_auto_factor( rntm );
@@ -433,8 +433,8 @@ void bli_rntm_print
 
 dim_t bli_rntm_calc_num_threads_in
      (
-       bszid_t* restrict bszid_cur,
-       rntm_t*  restrict rntm
+       const bszid_t* bszid_cur,
+       const rntm_t*  rntm
      )
 {
 	/*                                     // bp algorithm:
diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h
index 249a69805..2a39f8894 100644
--- a/frame/base/bli_rntm.h
+++ b/frame/base/bli_rntm.h
@@ -61,56 +61,56 @@ typedef struct rntm_s
 // -- rntm_t query (public API) ------------------------------------------------
 //
 
-BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm )
+BLIS_INLINE bool bli_rntm_auto_factor( const rntm_t* rntm )
 {
 	return rntm->auto_factor;
 }
 
-BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm )
+BLIS_INLINE dim_t bli_rntm_num_threads( const rntm_t* rntm )
 {
 	return rntm->num_threads;
 }
 
-BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm )
+BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, const rntm_t* rntm )
 {
 	return rntm->thrloop[ bszid ];
 }
 
-BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm )
+BLIS_INLINE dim_t bli_rntm_jc_ways( const rntm_t* rntm )
 {
 	return bli_rntm_ways_for( BLIS_NC, rntm );
 }
-BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm )
+BLIS_INLINE dim_t bli_rntm_pc_ways( const rntm_t* rntm )
 {
 	return bli_rntm_ways_for( BLIS_KC, rntm );
 }
-BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm )
+BLIS_INLINE dim_t bli_rntm_ic_ways( const rntm_t* rntm )
 {
 	return bli_rntm_ways_for( BLIS_MC, rntm );
 }
-BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm )
+BLIS_INLINE dim_t bli_rntm_jr_ways( const rntm_t* rntm )
 {
 	return bli_rntm_ways_for( BLIS_NR, rntm );
 }
-BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm )
+BLIS_INLINE dim_t bli_rntm_ir_ways( const rntm_t* rntm )
 {
 	return bli_rntm_ways_for( BLIS_MR, rntm );
 }
-BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm )
+BLIS_INLINE dim_t bli_rntm_pr_ways( const rntm_t* rntm )
 {
 	return bli_rntm_ways_for( BLIS_KR, rntm );
 }
 
-BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm )
+BLIS_INLINE bool bli_rntm_pack_a( const rntm_t* rntm )
 {
 	return ( bool )( rntm->pack_a );
 }
-BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm )
+BLIS_INLINE bool bli_rntm_pack_b( const rntm_t* rntm )
 {
 	return ( bool )( rntm->pack_b );
 }
 
-BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm )
+BLIS_INLINE bool bli_rntm_l3_sup( const rntm_t* rntm )
 {
 	return rntm->l3_sup;
 }
@@ -119,12 +119,12 @@ BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm )
 // -- rntm_t query (internal use only) -----------------------------------------
 //
 
-BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm )
+BLIS_INLINE pool_t* bli_rntm_sba_pool( const rntm_t* rntm )
 {
 	return rntm->sba_pool;
 }
 
-BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm )
+BLIS_INLINE pba_t* bli_rntm_pba( const rntm_t* rntm )
 {
 	return rntm->pba;
 }
@@ -334,7 +334,7 @@ BLIS_INLINE void bli_rntm_init( rntm_t* rntm )
 
 BLIS_INLINE dim_t bli_rntm_calc_num_threads
      (
-       rntm_t*  restrict rntm
+       const rntm_t* rntm
      )
 {
 	dim_t n_threads;
@@ -382,13 +382,13 @@ void bli_rntm_set_ways_from_rntm_sup
 
 void bli_rntm_print
      (
-       rntm_t* rntm
+       const rntm_t* rntm
      );
 
 dim_t bli_rntm_calc_num_threads_in
      (
-       bszid_t* restrict bszid_cur,
-       rntm_t*  restrict rntm
+       const bszid_t* bszid_cur,
+       const rntm_t*  rntm
      );
 
 #endif
diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c
index 5b6ff6a0f..776622bb4 100644
--- a/frame/base/bli_sba.c
+++ b/frame/base/bli_sba.c
@@ -57,8 +57,8 @@ void bli_sba_finalize( void )
 
 void* bli_sba_acquire
      (
-       rntm_t* restrict rntm,
-       siz_t            req_size
+       rntm_t* rntm,
+       siz_t   req_size
      )
 {
 	void* block;
@@ -74,7 +74,7 @@ void* bli_sba_acquire
 		pblk_t pblk;
 
 		// Query the small block pool from the rntm.
-		pool_t* restrict pool = bli_rntm_sba_pool( rntm );
+		pool_t* pool = bli_rntm_sba_pool( rntm );
 
 		// We don't expect NULL sba_pool pointers in the normal course of BLIS
 		// operation. However, there are rare instances where it is convenient
@@ -122,8 +122,8 @@ void* bli_sba_acquire
 
 void bli_sba_release
      (
-       rntm_t* restrict rntm,
-       void*   restrict block
+       rntm_t* rntm,
+       void*   block
      )
 {
 #ifdef BLIS_ENABLE_SBA_POOLS
@@ -136,7 +136,7 @@ void bli_sba_release
 		pblk_t pblk;
 
 		// Query the small block pool from the rntm.
-		pool_t* restrict pool = bli_rntm_sba_pool( rntm );
+		pool_t* pool = bli_rntm_sba_pool( rntm );
 
 		if ( pool == NULL )
 		{
@@ -182,7 +182,7 @@ array_t* bli_sba_checkout_array
 
 void bli_sba_checkin_array
      (
-       array_t* restrict array
+       array_t* array
      )
 {
 	#ifndef BLIS_ENABLE_SBA_POOLS
@@ -194,9 +194,9 @@ void bli_sba_checkin_array
 
 void bli_sba_rntm_set_pool
      (
-       siz_t             index,
-       array_t* restrict array,
-       rntm_t*  restrict rntm
+       siz_t    index,
+       array_t* array,
+       rntm_t*  rntm
      )
 {
 	#ifndef BLIS_ENABLE_SBA_POOLS
@@ -205,7 +205,7 @@ void bli_sba_rntm_set_pool
 	#endif
 
 	// Query the pool_t* in the array_t corresponding to index.
-	pool_t* restrict pool = bli_apool_array_elem( index, array );
+	pool_t* pool = bli_apool_array_elem( index, array );
 
 	// Embed the pool_t* into the rntm_t.
 	bli_rntm_set_sba_pool( pool, rntm );
diff --git a/frame/base/bli_sba.h b/frame/base/bli_sba.h
index f5e36d759..4fc3aaaee 100644
--- a/frame/base/bli_sba.h
+++ b/frame/base/bli_sba.h
@@ -44,30 +44,30 @@ void bli_sba_finalize( void );
 
 array_t* bli_sba_checkout_array
      (
-       const siz_t n_threads
+       siz_t n_threads
      );
 
 void bli_sba_checkin_array
      (
-       array_t* restrict array
+       array_t* array
      );
 
 void bli_sba_rntm_set_pool
      (
-       siz_t             index,
-       array_t* restrict array,
-       rntm_t*  restrict rntm
+       siz_t    index,
+       array_t* array,
+       rntm_t*  rntm
      );
 
 void* bli_sba_acquire
      (
-       rntm_t* restrict rntm,
-       siz_t            req_size
+       rntm_t* rntm,
+       siz_t   req_size
      );
 void bli_sba_release
      (
-       rntm_t* restrict rntm,
-       void*   restrict block
+       rntm_t* rntm,
+       void*   block
      );
 
 
diff --git a/frame/base/bli_setgetijm.c b/frame/base/bli_setgetijm.c
index 78ff58a29..d056a2e44 100644
--- a/frame/base/bli_setgetijm.c
+++ b/frame/base/bli_setgetijm.c
@@ -36,21 +36,21 @@
 
 typedef void (*setijm_fp)
      (
-       double         ar,
-       double         ai,
-       dim_t          i,
-       dim_t          j,
-       void* restrict b, inc_t rs, inc_t cs
+       double ar,
+       double ai,
+       dim_t  i,
+       dim_t  j,
+       void*  b, inc_t rs, inc_t cs
      );
 static setijm_fp GENARRAY(ftypes_setijm,setijm);
 
 err_t bli_setijm
      (
-       double  ar,
-       double  ai,
-       dim_t   i,
-       dim_t   j,
-       obj_t*  b
+             double ar,
+             double ai,
+             dim_t  i,
+             dim_t  j,
+       const obj_t* b
      )
 {
 	dim_t m  = bli_obj_length( b );
@@ -90,16 +90,16 @@ err_t bli_setijm
 \
 void PASTEMAC(ch,opname) \
      ( \
-       double         ar, \
-       double         ai, \
-       dim_t          i, \
-       dim_t          j, \
-       void* restrict b, inc_t rs, inc_t cs  \
+       double ar, \
+       double ai, \
+       dim_t  i, \
+       dim_t  j, \
+       void*  b, inc_t rs, inc_t cs  \
      ) \
 { \
-	ctype* restrict b_cast = ( ctype* )b; \
+	ctype* b_cast = ( ctype* )b; \
 \
-	ctype* restrict b_ij = b_cast + (i  )*rs + (j  )*cs; \
+	ctype* b_ij = b_cast + (i  )*rs + (j  )*cs; \
 \
 	PASTEMAC2(z,ch,sets)( ar, ai, *b_ij ); \
 }
@@ -110,21 +110,21 @@ INSERT_GENTFUNC_BASIC0( setijm )
 
 typedef void (*getijm_fp)
      (
-       dim_t          i,
-       dim_t          j,
-       void* restrict b, inc_t rs, inc_t cs,
-       double*        ar,
-       double*        ai
+             dim_t   i,
+             dim_t   j,
+       const void*   b, inc_t rs, inc_t cs,
+             double* ar,
+             double* ai
      );
 static getijm_fp GENARRAY(ftypes_getijm,getijm);
 
 err_t bli_getijm
       (
-        dim_t   i,
-        dim_t   j,
-        obj_t*  b,
-        double* ar,
-        double* ai
+              dim_t   i,
+              dim_t   j,
+        const obj_t*  b,
+              double* ar,
+              double* ai
       )
 {
 	dim_t m  = bli_obj_length( b );
@@ -164,16 +164,16 @@ err_t bli_getijm
 \
 void PASTEMAC(ch,opname) \
      ( \
-       dim_t          i, \
-       dim_t          j, \
-       void* restrict b, inc_t rs, inc_t cs, \
-       double*        ar, \
-       double*        ai  \
+             dim_t   i, \
+             dim_t   j, \
+       const void*   b, inc_t rs, inc_t cs, \
+             double* ar, \
+             double* ai  \
      ) \
 { \
-	ctype* restrict b_cast = ( ctype* )b; \
+	const ctype* b_cast = ( const ctype* )b; \
 \
-	ctype* restrict b_ij = b_cast + (i  )*rs + (j  )*cs; \
+	const ctype* b_ij = b_cast + (i  )*rs + (j  )*cs; \
 \
 	PASTEMAC2(ch,z,gets)( *b_ij, *ar, *ai ); \
 }
diff --git a/frame/base/bli_setgetijm.h b/frame/base/bli_setgetijm.h
index 55ce0ee11..a2db16d11 100644
--- a/frame/base/bli_setgetijm.h
+++ b/frame/base/bli_setgetijm.h
@@ -34,11 +34,11 @@
 
 BLIS_EXPORT_BLIS err_t bli_setijm
      (
-       double  ar,
-       double  ai,
-       dim_t   i,
-       dim_t   j,
-       obj_t*  b
+             double ar,
+             double ai,
+             dim_t  i,
+             dim_t  j,
+       const obj_t* b
      );
 
 #undef  GENTPROT
@@ -46,11 +46,11 @@ BLIS_EXPORT_BLIS err_t bli_setijm
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       double         ar, \
-       double         ai, \
-       dim_t          i, \
-       dim_t          j, \
-       void* restrict b, inc_t rs, inc_t cs  \
+       double ar, \
+       double ai, \
+       dim_t  i, \
+       dim_t  j, \
+       void*  b, inc_t rs, inc_t cs  \
      );
 
 INSERT_GENTPROT_BASIC0( setijm )
@@ -59,11 +59,11 @@ INSERT_GENTPROT_BASIC0( setijm )
 
 BLIS_EXPORT_BLIS err_t bli_getijm
       (
-        dim_t   i,
-        dim_t   j,
-        obj_t*  b,
-        double* ar,
-        double* ai
+              dim_t   i,
+              dim_t   j,
+        const obj_t*  b,
+              double* ar,
+              double* ai
       );
 
 #undef  GENTPROT
@@ -71,11 +71,11 @@ BLIS_EXPORT_BLIS err_t bli_getijm
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       dim_t          i, \
-       dim_t          j, \
-       void* restrict b, inc_t rs, inc_t cs, \
-       double*        ar, \
-       double*        ai  \
+             dim_t   i, \
+             dim_t   j, \
+       const void*   b, inc_t rs, inc_t cs, \
+             double* ar, \
+             double* ai  \
      );
 
 INSERT_GENTPROT_BASIC0( getijm )
diff --git a/frame/base/bli_setgetijv.c b/frame/base/bli_setgetijv.c
index 610f6f271..6cee789c7 100644
--- a/frame/base/bli_setgetijv.c
+++ b/frame/base/bli_setgetijv.c
@@ -36,19 +36,19 @@
 
 typedef void (*setijv_fp)
      (
-       double         ar,
-       double         ai,
-       dim_t          i,
-       void* restrict x, inc_t incx
+       double ar,
+       double ai,
+       dim_t  i,
+       void*  x, inc_t incx
      );
 static setijv_fp GENARRAY(ftypes_setijv,setijv);
 
 err_t bli_setijv
      (
-       double  ar,
-       double  ai,
-       dim_t   i,
-       obj_t*  x
+             double ar,
+             double ai,
+             dim_t  i,
+       const obj_t* x
      )
 {
 	dim_t n    = bli_obj_vector_dim( x );
@@ -84,10 +84,10 @@ err_t bli_setijv
 \
 void PASTEMAC(ch,opname) \
      ( \
-       double         ar, \
-       double         ai, \
-       dim_t          i, \
-       void* restrict x, inc_t incx  \
+       double ar, \
+       double ai, \
+       dim_t  i, \
+       void*  x, inc_t incx  \
      ) \
 { \
 	ctype* restrict x_cast = ( ctype* )x; \
@@ -103,19 +103,19 @@ INSERT_GENTFUNC_BASIC0( setijv )
 
 typedef void (*getijv_fp)
      (
-       dim_t          i,
-       void* restrict x, inc_t incx,
-       double*        ar,
-       double*        ai
+             dim_t   i,
+       const void*   x, inc_t incx,
+             double* ar,
+             double* ai
      );
 static getijv_fp GENARRAY(ftypes_getijv,getijv);
 
 err_t bli_getijv
       (
-        dim_t   i,
-        obj_t*  x,
-        double* ar,
-        double* ai
+              dim_t   i,
+        const obj_t*  x,
+              double* ar,
+              double* ai
       )
 {
 	dim_t n    = bli_obj_vector_dim( x );
@@ -151,15 +151,15 @@ err_t bli_getijv
 \
 void PASTEMAC(ch,opname) \
      ( \
-       dim_t          i, \
-       void* restrict x, inc_t incx, \
-       double*        ar, \
-       double*        ai  \
+             dim_t   i, \
+       const void*   x, inc_t incx, \
+             double* ar, \
+             double* ai  \
      ) \
 { \
-	ctype* restrict x_cast = ( ctype* )x; \
+	const ctype* restrict x_cast = ( const ctype* )x; \
 \
-	ctype* restrict x_i = x_cast + (i  )*incx; \
+	const ctype* restrict x_i = x_cast + (i  )*incx; \
 \
 	PASTEMAC2(ch,z,gets)( *x_i, *ar, *ai ); \
 }
diff --git a/frame/base/bli_setgetijv.h b/frame/base/bli_setgetijv.h
index 703fe41aa..a9badce4d 100644
--- a/frame/base/bli_setgetijv.h
+++ b/frame/base/bli_setgetijv.h
@@ -34,10 +34,10 @@
 
 BLIS_EXPORT_BLIS err_t bli_setijv
      (
-       double  ar,
-       double  ai,
-       dim_t   i,
-       obj_t*  x
+             double  ar,
+             double  ai,
+             dim_t   i,
+       const obj_t*  x
      );
 
 #undef  GENTPROT
@@ -45,10 +45,10 @@ BLIS_EXPORT_BLIS err_t bli_setijv
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       double         ar, \
-       double         ai, \
-       dim_t          i, \
-       void* restrict x, inc_t incx  \
+       double ar, \
+       double ai, \
+       dim_t  i, \
+       void*  x, inc_t incx  \
      );
 
 INSERT_GENTPROT_BASIC0( setijv )
@@ -57,10 +57,10 @@ INSERT_GENTPROT_BASIC0( setijv )
 
 BLIS_EXPORT_BLIS err_t bli_getijv
       (
-        dim_t   i,
-        obj_t*  x,
-        double* ar,
-        double* ai
+              dim_t   i,
+        const obj_t*  x,
+              double* ar,
+              double* ai
       );
 
 #undef  GENTPROT
@@ -68,10 +68,10 @@ BLIS_EXPORT_BLIS err_t bli_getijv
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       dim_t          i, \
-       void* restrict b, inc_t incx, \
-       double*        ar, \
-       double*        ai  \
+             dim_t   i, \
+       const void*   x, inc_t incx, \
+             double* ar, \
+             double* ai  \
      );
 
 INSERT_GENTPROT_BASIC0( getijv )
diff --git a/frame/base/bli_setri.c b/frame/base/bli_setri.c
index 7220571c0..15e698b2b 100644
--- a/frame/base/bli_setri.c
+++ b/frame/base/bli_setri.c
@@ -38,8 +38,8 @@
 
 void bli_setrm
      (
-       obj_t* alpha,
-       obj_t* b
+       const obj_t* alpha,
+       const obj_t* b
      )
 {
 	obj_t alpha_real;
@@ -67,8 +67,8 @@ void bli_setrm
 
 void bli_setrv
      (
-       obj_t* alpha,
-       obj_t* x
+       const obj_t* alpha,
+       const obj_t* x
      )
 {
 	obj_t alpha_real;
@@ -98,8 +98,8 @@ void bli_setrv
 
 void bli_setim
      (
-       obj_t* alpha,
-       obj_t* b
+       const obj_t* alpha,
+       const obj_t* b
      )
 {
 	obj_t alpha_real;
@@ -130,8 +130,8 @@ void bli_setim
 
 void bli_setiv
      (
-       obj_t* alpha,
-       obj_t* x
+       const obj_t* alpha,
+       const obj_t* x
      )
 {
 	obj_t alpha_real;
diff --git a/frame/base/bli_setri.h b/frame/base/bli_setri.h
index dd6ce9f3f..ff5a09681 100644
--- a/frame/base/bli_setri.h
+++ b/frame/base/bli_setri.h
@@ -36,27 +36,27 @@
 
 BLIS_EXPORT_BLIS void bli_setrm
      (
-       obj_t* alpha,
-       obj_t* b
+       const obj_t* alpha,
+       const obj_t* b
      );
 
 BLIS_EXPORT_BLIS void bli_setrv
      (
-       obj_t* alpha,
-       obj_t* x
+       const obj_t* alpha,
+       const obj_t* x
      );
 
 // -- seti ---------------------------------------------------------------------
 
 BLIS_EXPORT_BLIS void bli_setim
      (
-       obj_t* alpha,
-       obj_t* b
+       const obj_t* alpha,
+       const obj_t* b
      );
 
 BLIS_EXPORT_BLIS void bli_setiv
      (
-       obj_t* alpha,
-       obj_t* x
+       const obj_t* alpha,
+       const obj_t* x
      );
 
diff --git a/frame/base/cast/bli_castm.c b/frame/base/cast/bli_castm.c
index 64db75d24..57dd48bbc 100644
--- a/frame/base/cast/bli_castm.c
+++ b/frame/base/cast/bli_castm.c
@@ -41,11 +41,11 @@
 
 typedef void (*FUNCPTR_T)
      (
-       trans_t        transa,
-       dim_t          m,
-       dim_t          n,
-       void* restrict a, inc_t rs_a, inc_t cs_a,
-       void* restrict b, inc_t rs_b, inc_t cs_b
+             trans_t transa,
+             dim_t   m,
+             dim_t   n,
+       const void*   a, inc_t rs_a, inc_t cs_a,
+             void*   b, inc_t rs_b, inc_t cs_b
      );
 
 static FUNCPTR_T GENARRAY2_ALL(ftypes,castm);
@@ -56,27 +56,25 @@ static FUNCPTR_T GENARRAY2_ALL(ftypes,castm);
 
 void bli_castm
      (
-       obj_t* a,
-       obj_t* b
+       const obj_t* a,
+       const obj_t* b
      )
 {
-	num_t     dt_a     = bli_obj_dt( a );
-	num_t     dt_b     = bli_obj_dt( b );
+	const num_t   dt_a   = bli_obj_dt( a );
+	const num_t   dt_b   = bli_obj_dt( b );
 
-	trans_t   transa   = bli_obj_conjtrans_status( a );
+	const trans_t transa = bli_obj_conjtrans_status( a );
 
-	dim_t     m        = bli_obj_length( b );
-	dim_t     n        = bli_obj_width( b );
+	const dim_t   m      = bli_obj_length( b );
+	const dim_t   n      = bli_obj_width( b );
 
-	void*     buf_a    = bli_obj_buffer_at_off( a );
-	inc_t     rs_a     = bli_obj_row_stride( a );
-	inc_t     cs_a     = bli_obj_col_stride( a );
+	const void*   buf_a  = bli_obj_buffer_at_off( a );
+	const inc_t   rs_a   = bli_obj_row_stride( a );
+	const inc_t   cs_a   = bli_obj_col_stride( a );
 
-	void*     buf_b    = bli_obj_buffer_at_off( b );
-	inc_t     rs_b     = bli_obj_row_stride( b );
-	inc_t     cs_b     = bli_obj_col_stride( b );
-
-	FUNCPTR_T f;
+	      void*   buf_b  = bli_obj_buffer_at_off( b );
+	const inc_t   rs_b   = bli_obj_row_stride( b );
+	const inc_t   cs_b   = bli_obj_col_stride( b );
 
 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
@@ -93,10 +91,7 @@ void bli_castm
 
 	// Index into the type combination array to extract the correct
 	// function pointer.
-	f = ftypes[dt_a][dt_b];
-
-	// Invoke the void pointer-based function.
-	f
+	ftypes[dt_a][dt_b]
 	(
 	  transa,
 	  m,
@@ -117,21 +112,21 @@ void bli_castm
 \
 void PASTEMAC2(cha,chb,opname) \
      ( \
-       trans_t        transa, \
-       dim_t          m, \
-       dim_t          n, \
-       void* restrict a, inc_t rs_a, inc_t cs_a, \
-       void* restrict b, inc_t rs_b, inc_t cs_b  \
+             trans_t transa, \
+             dim_t   m, \
+             dim_t   n, \
+       const void*   a, inc_t rs_a, inc_t cs_a, \
+             void*   b, inc_t rs_b, inc_t cs_b  \
      ) \
 { \
-	ctype_a* restrict a_cast = a; \
-	ctype_b* restrict b_cast = b; \
-	conj_t            conja; \
-	dim_t             n_iter; \
-	dim_t             n_elem; \
-	inc_t             lda, inca; \
-	inc_t             ldb, incb; \
-	dim_t             j, i; \
+	const ctype_a* restrict a_cast = a; \
+	      ctype_b* restrict b_cast = b; \
+	      conj_t            conja; \
+	      dim_t             n_iter; \
+	      dim_t             n_elem; \
+	      inc_t             lda, inca; \
+	      inc_t             ldb, incb; \
+	      dim_t             j, i; \
 \
 	/* Set various loop parameters. */ \
 	bli_set_dims_incs_2m \
@@ -150,8 +145,8 @@ void PASTEMAC2(cha,chb,opname) \
 		{ \
 			for ( j = 0; j < n_iter; ++j ) \
 			{ \
-				ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
-				ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
+				const ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
+				      ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
@@ -163,8 +158,8 @@ void PASTEMAC2(cha,chb,opname) \
 		{ \
 			for ( j = 0; j < n_iter; ++j ) \
 			{ \
-				ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
-				ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
+				const ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
+				      ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
@@ -182,8 +177,8 @@ void PASTEMAC2(cha,chb,opname) \
 		{ \
 			for ( j = 0; j < n_iter; ++j ) \
 			{ \
-				ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
-				ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
+				const ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
+				      ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
@@ -195,8 +190,8 @@ void PASTEMAC2(cha,chb,opname) \
 		{ \
 			for ( j = 0; j < n_iter; ++j ) \
 			{ \
-				ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
-				ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
+				const ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
+				      ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
@@ -221,8 +216,8 @@ INSERT_GENTFUNC2_MIXDP0( castm )
 
 void bli_castm_check
      (
-       obj_t* a,
-       obj_t* b
+       const obj_t* a,
+       const obj_t* b
      )
 {
 	err_t e_val;
diff --git a/frame/base/cast/bli_castm.h b/frame/base/cast/bli_castm.h
index e9e1dee21..c06d1241a 100644
--- a/frame/base/cast/bli_castm.h
+++ b/frame/base/cast/bli_castm.h
@@ -38,8 +38,8 @@
 
 BLIS_EXPORT_BLIS void bli_castm
      (
-       obj_t* a,
-       obj_t* b
+       const obj_t* a,
+       const obj_t* b
      );
 
 //
@@ -51,11 +51,11 @@ BLIS_EXPORT_BLIS void bli_castm
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \
      ( \
-       trans_t transa, \
-       dim_t   m, \
-       dim_t   n, \
-       void*   a, inc_t rs_a, inc_t cs_a, \
-       void*   b, inc_t rs_b, inc_t cs_b  \
+             trans_t transa, \
+             dim_t   m, \
+             dim_t   n, \
+       const void*   a, inc_t rs_a, inc_t cs_a, \
+             void*   b, inc_t rs_b, inc_t cs_b  \
      );
 
 INSERT_GENTPROT2_BASIC0( castm )
@@ -67,7 +67,7 @@ INSERT_GENTPROT2_MIXDP0( castm )
 
 void bli_castm_check
      (
-       obj_t* a,
-       obj_t* b
+       const obj_t* a,
+       const obj_t* b
      );
 
diff --git a/frame/base/cast/bli_castnzm.c b/frame/base/cast/bli_castnzm.c
index a50bdfc15..071233169 100644
--- a/frame/base/cast/bli_castnzm.c
+++ b/frame/base/cast/bli_castnzm.c
@@ -41,11 +41,11 @@
 
 typedef void (*FUNCPTR_T)
      (
-       trans_t        transa,
-       dim_t          m,
-       dim_t          n,
-       void* restrict a, inc_t rs_a, inc_t cs_a,
-       void* restrict b, inc_t rs_b, inc_t cs_b
+             trans_t transa,
+             dim_t   m,
+             dim_t   n,
+       const void*   a, inc_t rs_a, inc_t cs_a,
+             void*   b, inc_t rs_b, inc_t cs_b
      );
 
 static FUNCPTR_T GENARRAY2_ALL(ftypes,castnzm);
@@ -56,27 +56,25 @@ static FUNCPTR_T GENARRAY2_ALL(ftypes,castnzm);
 
 void bli_castnzm
      (
-       obj_t* a,
-       obj_t* b
+       const obj_t* a,
+       const obj_t* b
      )
 {
-	num_t     dt_a     = bli_obj_dt( a );
-	num_t     dt_b     = bli_obj_dt( b );
+	const num_t   dt_a   = bli_obj_dt( a );
+	const num_t   dt_b   = bli_obj_dt( b );
 
-	trans_t   transa   = bli_obj_conjtrans_status( a );
+	const trans_t transa = bli_obj_conjtrans_status( a );
 
-	dim_t     m        = bli_obj_length( b );
-	dim_t     n        = bli_obj_width( b );
+	const dim_t   m      = bli_obj_length( b );
+	const dim_t   n      = bli_obj_width( b );
 
-	void*     buf_a    = bli_obj_buffer_at_off( a );
-	inc_t     rs_a     = bli_obj_row_stride( a );
-	inc_t     cs_a     = bli_obj_col_stride( a );
+	const void*   buf_a  = bli_obj_buffer_at_off( a );
+	const inc_t   rs_a   = bli_obj_row_stride( a );
+	const inc_t   cs_a   = bli_obj_col_stride( a );
 
-	void*     buf_b    = bli_obj_buffer_at_off( b );
-	inc_t     rs_b     = bli_obj_row_stride( b );
-	inc_t     cs_b     = bli_obj_col_stride( b );
-
-	FUNCPTR_T f;
+	      void*   buf_b  = bli_obj_buffer_at_off( b );
+	const inc_t   rs_b   = bli_obj_row_stride( b );
+	const inc_t   cs_b   = bli_obj_col_stride( b );
 
 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
@@ -93,10 +91,7 @@ void bli_castnzm
 
 	// Index into the type combination array to extract the correct
 	// function pointer.
-	f = ftypes[dt_a][dt_b];
-
-	// Invoke the void pointer-based function.
-	f
+	ftypes[dt_a][dt_b]
 	(
 	  transa,
 	  m,
@@ -117,21 +112,21 @@ void bli_castnzm
 \
 void PASTEMAC2(cha,chb,opname) \
      ( \
-       trans_t        transa, \
-       dim_t          m, \
-       dim_t          n, \
-       void* restrict a, inc_t rs_a, inc_t cs_a, \
-       void* restrict b, inc_t rs_b, inc_t cs_b  \
+             trans_t transa, \
+             dim_t   m, \
+             dim_t   n, \
+       const void*   a, inc_t rs_a, inc_t cs_a, \
+             void*   b, inc_t rs_b, inc_t cs_b  \
      ) \
 { \
-	ctype_a* restrict a_cast = a; \
-	ctype_b* restrict b_cast = b; \
-	conj_t            conja; \
-	dim_t             n_iter; \
-	dim_t             n_elem; \
-	inc_t             lda, inca; \
-	inc_t             ldb, incb; \
-	dim_t             j, i; \
+	const ctype_a* restrict a_cast = a; \
+	      ctype_b* restrict b_cast = b; \
+	      conj_t            conja; \
+	      dim_t             n_iter; \
+	      dim_t             n_elem; \
+	      inc_t             lda, inca; \
+	      inc_t             ldb, incb; \
+	      dim_t             j, i; \
 \
 	/* Set various loop parameters. */ \
 	bli_set_dims_incs_2m \
@@ -150,8 +145,8 @@ void PASTEMAC2(cha,chb,opname) \
 		{ \
 			for ( j = 0; j < n_iter; ++j ) \
 			{ \
-				ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
-				ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
+				const ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
+				      ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
@@ -163,8 +158,8 @@ void PASTEMAC2(cha,chb,opname) \
 		{ \
 			for ( j = 0; j < n_iter; ++j ) \
 			{ \
-				ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
-				ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
+				const ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
+				      ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
@@ -182,8 +177,8 @@ void PASTEMAC2(cha,chb,opname) \
 		{ \
 			for ( j = 0; j < n_iter; ++j ) \
 			{ \
-				ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
-				ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
+				const ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
+				      ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
@@ -195,8 +190,8 @@ void PASTEMAC2(cha,chb,opname) \
 		{ \
 			for ( j = 0; j < n_iter; ++j ) \
 			{ \
-				ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
-				ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
+				const ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
+				      ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
@@ -221,8 +216,8 @@ INSERT_GENTFUNC2_MIXDP0( castnzm )
 
 void bli_castnzm_check
      (
-       obj_t* a,
-       obj_t* b
+       const obj_t* a,
+       const obj_t* b
      )
 {
 	err_t e_val;
diff --git a/frame/base/cast/bli_castnzm.h b/frame/base/cast/bli_castnzm.h
index 42cfef8c0..03860fe40 100644
--- a/frame/base/cast/bli_castnzm.h
+++ b/frame/base/cast/bli_castnzm.h
@@ -38,8 +38,8 @@
 
 BLIS_EXPORT_BLIS void bli_castnzm
      (
-       obj_t* a,
-       obj_t* b
+       const obj_t* a,
+       const obj_t* b
      );
 
 //
@@ -51,11 +51,11 @@ BLIS_EXPORT_BLIS void bli_castnzm
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \
      ( \
-       trans_t transa, \
-       dim_t   m, \
-       dim_t   n, \
-       void*   a, inc_t rs_a, inc_t cs_a, \
-       void*   b, inc_t rs_b, inc_t cs_b  \
+             trans_t transa, \
+             dim_t   m, \
+             dim_t   n, \
+       const void*   a, inc_t rs_a, inc_t cs_a, \
+             void*   b, inc_t rs_b, inc_t cs_b  \
      );
 
 INSERT_GENTPROT2_BASIC0( castnzm )
@@ -67,7 +67,7 @@ INSERT_GENTPROT2_MIXDP0( castnzm )
 
 void bli_castnzm_check
      (
-       obj_t* a,
-       obj_t* b
+       const obj_t* a,
+       const obj_t* b
      );
 
diff --git a/frame/base/cast/bli_castv.c b/frame/base/cast/bli_castv.c
index 213c960d8..c46a2798c 100644
--- a/frame/base/cast/bli_castv.c
+++ b/frame/base/cast/bli_castv.c
@@ -41,10 +41,10 @@
 
 typedef void (*FUNCPTR_T)
      (
-       conj_t         conjx,
-       dim_t          n,
-       void* restrict x, inc_t inc_x,
-       void* restrict y, inc_t inc_y
+             conj_t conjx,
+             dim_t  n,
+       const void*  x, inc_t inc_x,
+             void*  y, inc_t inc_y
      );
 
 static FUNCPTR_T GENARRAY2_ALL(ftypes,castv);
@@ -55,24 +55,22 @@ static FUNCPTR_T GENARRAY2_ALL(ftypes,castv);
 
 void bli_castv
      (
-       obj_t* x,
-       obj_t* y
+       const obj_t* x,
+       const obj_t* y
      )
 {
-	num_t     dt_x     = bli_obj_dt( x );
-	num_t     dt_y     = bli_obj_dt( y );
+	const num_t  dt_x  = bli_obj_dt( x );
+	const num_t  dt_y  = bli_obj_dt( y );
 
-	conj_t    conjx    = bli_obj_conj_status( x );
+	const conj_t conjx = bli_obj_conj_status( x );
 
-	dim_t     n        = bli_obj_vector_dim( x );
+	const dim_t  n     = bli_obj_vector_dim( x );
 
-	void*     buf_x    = bli_obj_buffer_at_off( x );
-	inc_t     inc_x    = bli_obj_vector_inc( x );
+	const void*  buf_x = bli_obj_buffer_at_off( x );
+	const inc_t  inc_x = bli_obj_vector_inc( x );
 
-	void*     buf_y    = bli_obj_buffer_at_off( y );
-	inc_t     inc_y    = bli_obj_vector_inc( y );
-
-	FUNCPTR_T f;
+	      void*  buf_y = bli_obj_buffer_at_off( y );
+	const inc_t  inc_y = bli_obj_vector_inc( y );
 
 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
@@ -89,10 +87,7 @@ void bli_castv
 
 	// Index into the type combination array to extract the correct
 	// function pointer.
-	f = ftypes[dt_x][dt_y];
-
-	// Invoke the void pointer-based function.
-	f
+	ftypes[dt_x][dt_y]
 	(
 	  conjx,
 	  n,
@@ -112,15 +107,15 @@ void bli_castv
 \
 void PASTEMAC2(chx,chy,opname) \
      ( \
-       conj_t         conjx, \
-       dim_t          n, \
-       void* restrict x, inc_t incx, \
-       void* restrict y, inc_t incy  \
+             conj_t conjx, \
+             dim_t  n, \
+       const void*  x, inc_t incx, \
+             void*  y, inc_t incy  \
      ) \
 { \
-	ctype_x* restrict x1 = x; \
-	ctype_y* restrict y1 = y; \
-	dim_t             i; \
+	const ctype_x* restrict x1 = x; \
+	      ctype_y* restrict y1 = y; \
+	      dim_t             i; \
 \
 	if ( bli_is_conj( conjx ) ) \
 	{ \
@@ -175,8 +170,8 @@ INSERT_GENTFUNC2_MIXDP0( castv )
 
 void bli_castv_check
      (
-       obj_t* x,
-       obj_t* y
+       const obj_t* x,
+       const obj_t* y
      )
 {
 	err_t e_val;
diff --git a/frame/base/cast/bli_castv.h b/frame/base/cast/bli_castv.h
index 9a8261514..85d87d911 100644
--- a/frame/base/cast/bli_castv.h
+++ b/frame/base/cast/bli_castv.h
@@ -38,8 +38,8 @@
 
 BLIS_EXPORT_BLIS void bli_castv
      (
-       obj_t* x,
-       obj_t* y
+       const obj_t* x,
+       const obj_t* y
      );
 
 //
@@ -51,10 +51,10 @@ BLIS_EXPORT_BLIS void bli_castv
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \
      ( \
-       conj_t  conjx, \
-       dim_t   n, \
-       void*   x, inc_t incx, \
-       void*   y, inc_t incy  \
+             conj_t conjx, \
+             dim_t  n, \
+       const void*  x, inc_t incx, \
+             void*  y, inc_t incy  \
      );
 
 INSERT_GENTPROT2_BASIC0( castv )
@@ -66,7 +66,7 @@ INSERT_GENTPROT2_MIXDP0( castv )
 
 void bli_castv_check
      (
-       obj_t* x,
-       obj_t* y
+       const obj_t* x,
+       const obj_t* y
      );
 
diff --git a/frame/base/check/bli_obj_check.c b/frame/base/check/bli_obj_check.c
index a971fa19a..cbacdd0fc 100644
--- a/frame/base/check/bli_obj_check.c
+++ b/frame/base/check/bli_obj_check.c
@@ -34,12 +34,12 @@
 
 #include "blis.h"
 
-void bli_obj_create_check( num_t  dt,
-                           dim_t  m,
-                           dim_t  n,
-                           inc_t  rs,
-                           inc_t  cs,
-                           obj_t* obj )
+void bli_obj_create_check(       num_t  dt,
+                                 dim_t  m,
+                                 dim_t  n,
+                                 inc_t  rs,
+                                 inc_t  cs,
+                           const obj_t* obj )
 {
 	err_t e_val;
 
@@ -53,10 +53,10 @@ void bli_obj_create_check( num_t  dt,
 	bli_check_error_code( e_val );
 }
 
-void bli_obj_create_without_buffer_check( num_t  dt,
-                                          dim_t  m,
-                                          dim_t  n,
-                                          obj_t* obj )
+void bli_obj_create_without_buffer_check(       num_t  dt,
+                                                dim_t  m,
+                                                dim_t  n,
+                                          const obj_t* obj )
 {
 	err_t e_val;
 
@@ -67,10 +67,10 @@ void bli_obj_create_without_buffer_check( num_t  dt,
 	bli_check_error_code( e_val );
 }
 
-void bli_obj_alloc_buffer_check( inc_t  rs,
-                                 inc_t  cs,
-                                 inc_t  is,
-                                 obj_t* obj )
+void bli_obj_alloc_buffer_check(       inc_t  rs,
+                                       inc_t  cs,
+                                       inc_t  is,
+                                 const obj_t* obj )
 {
 	err_t e_val;
 
@@ -83,11 +83,11 @@ void bli_obj_alloc_buffer_check( inc_t  rs,
 	bli_check_error_code( e_val );
 }
 
-void bli_obj_attach_buffer_check( void*  p,
-                                  inc_t  rs,
-                                  inc_t  cs,
-                                  inc_t  is,
-                                  obj_t* obj )
+void bli_obj_attach_buffer_check( const void*  p,
+                                        inc_t  rs,
+                                        inc_t  cs,
+                                        inc_t  is,
+                                  const obj_t* obj )
 {
 	err_t e_val;
 
@@ -109,8 +109,7 @@ void bli_obj_attach_buffer_check( void*  p,
 	bli_check_error_code( e_val );
 }
 
-void bli_obj_create_scalar_check( num_t  dt,
-                                  obj_t* obj )
+void bli_obj_create_scalar_check( num_t dt, const obj_t* obj )
 {
 	err_t e_val;
 
@@ -121,7 +120,7 @@ void bli_obj_create_scalar_check( num_t  dt,
 	bli_check_error_code( e_val );
 }
 
-void bli_obj_free_check( obj_t* obj )
+void bli_obj_free_check( const obj_t* obj )
 {
 	//err_t e_val;
 
@@ -131,7 +130,7 @@ void bli_obj_free_check( obj_t* obj )
 	//bli_check_error_code( e_val );
 }
 
-void bli_obj_create_const_check( double value, obj_t* obj )
+void bli_obj_create_const_check( double value, const obj_t* obj )
 {
 	err_t e_val;
 
@@ -185,7 +184,7 @@ void bli_dt_union_check( num_t dt1, num_t dt2 )
 	bli_check_error_code( e_val );
 }
 
-void bli_obj_print_check( char* label, obj_t* obj )
+void bli_obj_print_check( const char* label, const obj_t* obj )
 {
 	err_t e_val;
 
diff --git a/frame/base/check/bli_obj_check.h b/frame/base/check/bli_obj_check.h
index 201842844..8572f0cfb 100644
--- a/frame/base/check/bli_obj_check.h
+++ b/frame/base/check/bli_obj_check.h
@@ -32,37 +32,36 @@
 
 */
 
-void bli_obj_create_check( num_t  dt,
-                           dim_t  m,
-                           dim_t  n,
-                           inc_t  rs,
-                           inc_t  cs,
-                           obj_t* obj );
-
-void bli_obj_create_without_buffer_check( num_t  dt,
-                                          dim_t  m,
-                                          dim_t  n,
-                                          obj_t* obj );
-
-void bli_obj_alloc_buffer_check( inc_t  rs,
+void bli_obj_create_check(       num_t  dt,
+                                 dim_t  m,
+                                 dim_t  n,
+                                 inc_t  rs,
                                  inc_t  cs,
-                                 inc_t  is,
-                                 obj_t* obj );
+                           const obj_t* obj );
 
-void bli_obj_attach_buffer_check( void*  p,
-                                  inc_t  rs,
-                                  inc_t  cs,
-                                  inc_t  is,
-                                  obj_t* obj );
+void bli_obj_create_without_buffer_check(       num_t  dt,
+                                                dim_t  m,
+                                                dim_t  n,
+                                          const obj_t* obj );
 
-void bli_obj_create_scalar_check( num_t  dt,
-                                  obj_t* obj );
+void bli_obj_alloc_buffer_check(       inc_t  rs,
+                                       inc_t  cs,
+                                       inc_t  is,
+                                 const obj_t* obj );
 
-void bli_obj_free_check( obj_t* obj );
+void bli_obj_attach_buffer_check( const void*  p,
+                                        inc_t  rs,
+                                        inc_t  cs,
+                                        inc_t  is,
+                                  const obj_t* obj );
 
-void bli_obj_create_const_check( double value, obj_t* obj );
+void bli_obj_create_scalar_check( num_t dt, const obj_t* obj );
 
-void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b );
+void bli_obj_free_check( const obj_t* obj );
+
+void bli_obj_create_const_check( double value, const obj_t* obj );
+
+void bli_obj_create_const_copy_of_check( const obj_t* a, const obj_t* b );
 
 void bli_dt_size_check( num_t dt );
 
@@ -70,5 +69,5 @@ void bli_dt_string_check( num_t dt );
 
 void bli_dt_union_check( num_t dt1, num_t dt2 );
 
-void bli_obj_print_check( char* label, obj_t* obj );
+void bli_obj_print_check( const char* label, const obj_t* obj );
 
diff --git a/frame/base/check/bli_part_check.c b/frame/base/check/bli_part_check.c
index 6d9aa37b9..d13a8c22f 100644
--- a/frame/base/check/bli_part_check.c
+++ b/frame/base/check/bli_part_check.c
@@ -34,11 +34,11 @@
 
 #include "blis.h"
 
-void bli_acquire_mpart_t2b_check( subpart_t  requested_part,
-                                      dim_t  i,
-                                      dim_t  b,
-                                      obj_t* obj,
-                                      obj_t* sub_obj )
+void bli_acquire_mpart_t2b_check(       subpart_t requested_part,
+                                        dim_t     i,
+                                        dim_t     b,
+                                  const obj_t*    obj,
+                                  const obj_t*    sub_obj )
 {
 	err_t e_val;
 
@@ -52,11 +52,11 @@ void bli_acquire_mpart_t2b_check( subpart_t  requested_part,
 	bli_check_error_code( e_val );
 }
 
-void bli_acquire_mpart_l2r_check( subpart_t  requested_part,
-                                      dim_t  j,
-                                      dim_t  b,
-                                      obj_t* obj,
-                                      obj_t* sub_obj )
+void bli_acquire_mpart_l2r_check(       subpart_t requested_part,
+                                        dim_t     j,
+                                        dim_t     b,
+                                  const obj_t*    obj,
+                                  const obj_t*    sub_obj )
 {
 	err_t e_val;
 
@@ -70,11 +70,11 @@ void bli_acquire_mpart_l2r_check( subpart_t  requested_part,
 	bli_check_error_code( e_val );
 }
 
-void bli_acquire_mpart_tl2br_check( subpart_t  requested_part,
-                                        dim_t  ij,
-                                        dim_t  b,
-                                        obj_t* obj,
-                                        obj_t* sub_obj )
+void bli_acquire_mpart_tl2br_check(       subpart_t requested_part,
+                                          dim_t     ij,
+                                          dim_t     b,
+                                    const obj_t*    obj,
+                                    const obj_t*    sub_obj )
 {
 	err_t e_val;
 
diff --git a/frame/base/check/bli_part_check.h b/frame/base/check/bli_part_check.h
index 2905af0e4..810c5a3a7 100644
--- a/frame/base/check/bli_part_check.h
+++ b/frame/base/check/bli_part_check.h
@@ -32,21 +32,21 @@
 
 */
 
-void bli_acquire_mpart_t2b_check( subpart_t  requested_part,
-                                      dim_t  i,
-                                      dim_t  b,
-                                      obj_t* obj,
-                                      obj_t* sub_obj );
-
-void bli_acquire_mpart_l2r_check( subpart_t  requested_part,
-                                      dim_t  j,
-                                      dim_t  b,
-                                      obj_t* obj,
-                                      obj_t* sub_obj );
-
-void bli_acquire_mpart_tl2br_check( subpart_t  requested_part,
-                                        dim_t  ij,
-                                        dim_t  b,
-                                        obj_t* obj,
-                                        obj_t* sub_obj );
+void bli_acquire_mpart_t2b_check(       subpart_t requested_part,
+                                        dim_t     i,
+                                        dim_t     b,
+                                  const obj_t*    obj,
+                                  const obj_t*    sub_obj );
+
+void bli_acquire_mpart_l2r_check(       subpart_t requested_part,
+                                        dim_t     j,
+                                        dim_t     b,
+                                  const obj_t*    obj,
+                                  const obj_t*    sub_obj );
+
+void bli_acquire_mpart_tl2br_check(       subpart_t requested_part,
+                                          dim_t     ij,
+                                          dim_t     b,
+                                    const obj_t*    obj,
+                                    const obj_t*    sub_obj );
 
diff --git a/frame/base/proj/bli_projm.c b/frame/base/proj/bli_projm.c
index 949bc2cc9..c79897083 100644
--- a/frame/base/proj/bli_projm.c
+++ b/frame/base/proj/bli_projm.c
@@ -36,8 +36,8 @@
 
 void bli_projm
      (
-       obj_t* a,
-       obj_t* b
+       const obj_t* a,
+       const obj_t* b
      )
 {
 	// Check parameters.
@@ -88,8 +88,8 @@ void bli_projm
 
 void bli_projm_check
      (
-       obj_t* a,
-       obj_t* b
+       const obj_t* a,
+       const obj_t* b
      )
 {
 	err_t e_val;
diff --git a/frame/base/proj/bli_projm.h b/frame/base/proj/bli_projm.h
index e95f7f2f5..924924f9b 100644
--- a/frame/base/proj/bli_projm.h
+++ b/frame/base/proj/bli_projm.h
@@ -34,13 +34,13 @@
 
 BLIS_EXPORT_BLIS void bli_projm
      (
-       obj_t* a,
-       obj_t* b
+       const obj_t* a,
+       const obj_t* b
      );
 
 void bli_projm_check
      (
-       obj_t* a,
-       obj_t* b
+       const obj_t* a,
+       const obj_t* b
      );
 
diff --git a/frame/base/proj/bli_projv.c b/frame/base/proj/bli_projv.c
index 9a6587e5b..588ac39c3 100644
--- a/frame/base/proj/bli_projv.c
+++ b/frame/base/proj/bli_projv.c
@@ -36,8 +36,8 @@
 
 void bli_projv
      (
-       obj_t* x,
-       obj_t* y
+       const obj_t* x,
+       const obj_t* y
      )
 {
 	// Check parameters.
@@ -88,8 +88,8 @@ void bli_projv
 
 void bli_projv_check
      (
-       obj_t* x,
-       obj_t* y
+       const obj_t* x,
+       const obj_t* y
      )
 {
 	err_t e_val;
diff --git a/frame/base/proj/bli_projv.h b/frame/base/proj/bli_projv.h
index b738b2f97..abdf35522 100644
--- a/frame/base/proj/bli_projv.h
+++ b/frame/base/proj/bli_projv.h
@@ -34,13 +34,13 @@
 
 BLIS_EXPORT_BLIS void bli_projv
      (
-       obj_t* x,
-       obj_t* y
+       const obj_t* x,
+       const obj_t* y
      );
 
 void bli_projv_check
      (
-       obj_t* x,
-       obj_t* y
+       const obj_t* x,
+       const obj_t* y
      );
 
diff --git a/frame/compat/extra/bla_gemm3m.c b/frame/compat/extra/bla_gemm3m.c
index 4533375f0..31f677db6 100644
--- a/frame/compat/extra/bla_gemm3m.c
+++ b/frame/compat/extra/bla_gemm3m.c
@@ -103,7 +103,7 @@ void PASTEF77(ch,blasname) \
 	   abbreviated version of bli_gemm_ex() so that we can bypass
 	   consideration of sup, which doesn't make sense in this context. */ \
 	{ \
-		cntx_t* cntx = bli_gks_query_ind_cntx( BLIS_1M, dt ); \
+		cntx_t* cntx = ( cntx_t* )bli_gks_query_ind_cntx( BLIS_1M, dt ); \
 \
 		rntm_t  rntm_l; \
 		rntm_t* rntm = &rntm_l; \
@@ -222,7 +222,7 @@ void PASTEF77(ch,blasname) \
 	   abbreviated version of bli_gemm_ex() so that we can bypass
 	   consideration of sup, which doesn't make sense in this context. */ \
 	{ \
-		cntx_t* cntx = bli_gks_query_ind_cntx( BLIS_1M, dt ); \
+		cntx_t* cntx = ( cntx_t* )bli_gks_query_ind_cntx( BLIS_1M, dt ); \
 \
 		rntm_t  rntm_l; \
 		rntm_t* rntm = &rntm_l; \
diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h
index 9773e5e69..42ad9c72b 100644
--- a/frame/include/bli_extern_defs.h
+++ b/frame/include/bli_extern_defs.h
@@ -35,13 +35,13 @@
 #ifndef BLIS_EXTERN_DEFS_H
 #define BLIS_EXTERN_DEFS_H
 
-BLIS_EXPORT_BLIS extern obj_t BLIS_TWO;
-BLIS_EXPORT_BLIS extern obj_t BLIS_ONE;
+BLIS_EXPORT_BLIS extern const obj_t BLIS_TWO;
+BLIS_EXPORT_BLIS extern const obj_t BLIS_ONE;
 //BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF;
-BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO;
+BLIS_EXPORT_BLIS extern const obj_t BLIS_ZERO;
 //BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF;
-BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE;
-BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO;
+BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_ONE;
+BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_TWO;
 
 BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM;
 BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED;
diff --git a/frame/include/bli_oapi_ba.h b/frame/include/bli_oapi_ba.h
index dc17507d1..d80263597 100644
--- a/frame/include/bli_oapi_ba.h
+++ b/frame/include/bli_oapi_ba.h
@@ -54,6 +54,6 @@
 // to NULL. The "( void )" statements are to prevent unused variable
 // warnings by the compiler.
 #undef  BLIS_OAPI_EX_DECLS
-#define BLIS_OAPI_EX_DECLS   cntx_t* cntx = NULL; ( void )cntx; \
-                             rntm_t* rntm = NULL; ( void )rntm;
+#define BLIS_OAPI_EX_DECLS   const cntx_t* cntx = NULL; ( void )cntx; \
+                                   rntm_t* rntm = NULL; ( void )rntm;
 
diff --git a/frame/include/bli_oapi_ex.h b/frame/include/bli_oapi_ex.h
index 0eb5eb2a1..7252fd7ff 100644
--- a/frame/include/bli_oapi_ex.h
+++ b/frame/include/bli_oapi_ex.h
@@ -48,7 +48,7 @@
 // Define the macro to add expert arguments to function signatures
 // and prototypes.
 #undef  BLIS_OAPI_EX_PARAMS
-#define BLIS_OAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm
+#define BLIS_OAPI_EX_PARAMS   , const cntx_t* cntx, rntm_t* rntm
 
 // Define the macro to omit the expert variable declaration block, since
 // it is not needed when expert parameters are passed in through the API.
diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h
index fe174202c..9adaef211 100644
--- a/frame/include/bli_obj_macro_defs.h
+++ b/frame/include/bli_obj_macro_defs.h
@@ -42,363 +42,363 @@
 
 // Info query
 
-BLIS_INLINE num_t bli_obj_dt( obj_t* obj )
+BLIS_INLINE num_t bli_obj_dt( const obj_t* obj )
 {
 	return ( num_t )
 	       ( obj->info & BLIS_DATATYPE_BITS );
 }
 
-BLIS_INLINE bool bli_obj_is_float( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_float( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE );
 }
 
-BLIS_INLINE bool bli_obj_is_double( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_double( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE );
 }
 
-BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_scomplex( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE );
 }
 
-BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_dcomplex( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE );
 }
 
-BLIS_INLINE bool bli_obj_is_int( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_int( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE );
 }
 
-BLIS_INLINE bool bli_obj_is_const( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_const( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE );
 }
 
-BLIS_INLINE dom_t bli_obj_domain( obj_t* obj )
+BLIS_INLINE dom_t bli_obj_domain( const obj_t* obj )
 {
 	return ( dom_t )
 	       ( obj->info & BLIS_DOMAIN_BIT );
 }
 
-BLIS_INLINE prec_t bli_obj_prec( obj_t* obj )
+BLIS_INLINE prec_t bli_obj_prec( const obj_t* obj )
 {
 	return ( prec_t )
 	       ( obj->info & BLIS_PRECISION_BIT );
 }
 
-BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_single_prec( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC );
 }
 
-BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_double_prec( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_prec( obj ) == BLIS_BITVAL_DOUBLE_PREC );
 }
 
-BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj )
+BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( const obj_t* obj )
 {
 	return ( num_t )
 	       ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC );
 }
 
-BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj )
+BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( const obj_t* obj )
 {
 	return ( num_t )
 	       ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC );
 }
 
-BLIS_INLINE bool bli_obj_is_real( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_real( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL &&
 	         !bli_obj_is_const( obj ) );
 }
 
-BLIS_INLINE bool bli_obj_is_complex( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_complex( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX &&
 	         !bli_obj_is_const( obj ) );
 }
 
-BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj )
+BLIS_INLINE num_t bli_obj_dt_proj_to_real( const obj_t* obj )
 {
 	return ( num_t )
 	       ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX );
 }
 
-BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj )
+BLIS_INLINE num_t bli_obj_dt_proj_to_complex( const obj_t* obj )
 {
 	return ( num_t )
 	       ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX );
 }
 
-BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj )
+BLIS_INLINE num_t bli_obj_target_dt( const obj_t* obj )
 {
 	return ( num_t )
 	       ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT );
 }
 
-BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj )
+BLIS_INLINE dom_t bli_obj_target_domain( const obj_t* obj )
 {
 	return ( dom_t )
 	       ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT );
 }
 
-BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj )
+BLIS_INLINE prec_t bli_obj_target_prec( const obj_t* obj )
 {
 	return ( prec_t )
 	       ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT );
 }
 
-BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj )
+BLIS_INLINE num_t bli_obj_exec_dt( const obj_t* obj )
 {
 	return ( num_t )
 	       ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT );
 }
 
-BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj )
+BLIS_INLINE dom_t bli_obj_exec_domain( const obj_t* obj )
 {
 	return ( dom_t )
 	       ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT );
 }
 
-BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj )
+BLIS_INLINE prec_t bli_obj_exec_prec( const obj_t* obj )
 {
 	return ( prec_t )
 	       ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT );
 }
 
-BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj )
+BLIS_INLINE num_t bli_obj_comp_dt( const obj_t* obj )
 {
 	return ( num_t )
 	       ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT );
 }
 
-BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj )
+BLIS_INLINE dom_t bli_obj_comp_domain( const obj_t* obj )
 {
 	return ( dom_t )
 	       ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT );
 }
 
-BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj )
+BLIS_INLINE prec_t bli_obj_comp_prec( const obj_t* obj )
 {
 	return ( prec_t )
 	       ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT );
 }
 
 // NOTE: This function queries info2.
-BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj )
+BLIS_INLINE num_t bli_obj_scalar_dt( const obj_t* obj )
 {
 	return ( num_t )
 	       ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT );
 }
 
 // NOTE: This function queries info2.
-BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj )
+BLIS_INLINE dom_t bli_obj_scalar_domain( const obj_t* obj )
 {
 	return ( dom_t )
 	       ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT );
 }
 
 // NOTE: This function queries info2.
-BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj )
+BLIS_INLINE prec_t bli_obj_scalar_prec( const obj_t* obj )
 {
 	return ( prec_t )
 	       ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT );
 }
 
-BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj )
+BLIS_INLINE trans_t bli_obj_conjtrans_status( const obj_t* obj )
 {
 	return ( trans_t )
 	       ( obj->info & BLIS_CONJTRANS_BITS );
 }
 
-BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj )
+BLIS_INLINE trans_t bli_obj_onlytrans_status( const obj_t* obj )
 {
 	return ( trans_t )
 	       ( obj->info & BLIS_TRANS_BIT );
 }
 
-BLIS_INLINE bool bli_obj_has_trans( obj_t* obj )
+BLIS_INLINE bool bli_obj_has_trans( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS );
 }
 
-BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj )
+BLIS_INLINE bool bli_obj_has_notrans( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS );
 }
 
-BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj )
+BLIS_INLINE conj_t bli_obj_conj_status( const obj_t* obj )
 {
 	return ( conj_t )
 	       ( obj->info & BLIS_CONJ_BIT );
 }
 
-BLIS_INLINE bool bli_obj_has_conj( obj_t* obj )
+BLIS_INLINE bool bli_obj_has_conj( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ );
 }
 
-BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj )
+BLIS_INLINE bool bli_obj_has_noconj( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ );
 }
 
-BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj )
+BLIS_INLINE uplo_t bli_obj_uplo( const obj_t* obj )
 {
 	return ( uplo_t )
 	       ( obj->info & BLIS_UPLO_BITS );
 }
 
-BLIS_INLINE bool bli_obj_is_upper( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_upper( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER );
 }
 
-BLIS_INLINE bool bli_obj_is_lower( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_lower( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER );
 }
 
-BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_upper_or_lower( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_is_upper( obj ) ||
 	         bli_obj_is_lower( obj ) );
 }
 
-BLIS_INLINE bool bli_obj_is_dense( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_dense( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_uplo( obj ) == BLIS_BITVAL_DENSE );
 }
 
-BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_zeros( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS );
 }
 
-BLIS_INLINE diag_t bli_obj_diag( obj_t* obj )
+BLIS_INLINE diag_t bli_obj_diag( const obj_t* obj )
 {
 	return ( diag_t )
 	       ( obj->info & BLIS_UNIT_DIAG_BIT );
 }
 
-BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj )
+BLIS_INLINE bool bli_obj_has_nonunit_diag( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG );
 }
 
-BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj )
+BLIS_INLINE bool bli_obj_has_unit_diag( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG );
 }
 
-BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj )
+BLIS_INLINE bool bli_obj_has_inverted_diag( const obj_t* obj )
 {
 	return ( bool )
 	       ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG );
 }
 
-BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( const obj_t* obj )
 {
 	return ( bool )
 	       ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER );
 }
 
-BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( const obj_t* obj )
 {
 	return ( bool )
 	       ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER );
 }
 
-BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj )
+BLIS_INLINE pack_t bli_obj_pack_schema( const obj_t* obj )
 {
 	return ( pack_t )
 	       ( obj->info & BLIS_PACK_SCHEMA_BITS );
 }
 
-BLIS_INLINE bool bli_obj_is_packed( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_packed( const obj_t* obj )
 {
 	return ( bool )
 	       ( obj->info & BLIS_PACK_BIT );
 }
 
-BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_row_packed( const obj_t* obj )
 {
 	return ( bool )
 	       ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
-                                                   BLIS_BITVAL_PACKED_ROWS    ) );
+	                                               BLIS_BITVAL_PACKED_ROWS    ) );
 }
 
-BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_col_packed( const obj_t* obj )
 {
 	return ( bool )
 	       ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
-                                                   BLIS_BITVAL_PACKED_COLUMNS ) );
+	                                               BLIS_BITVAL_PACKED_COLUMNS ) );
 }
 
-BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_panel_packed( const obj_t* obj )
 {
 	return ( bool )
 	       ( obj->info & BLIS_PACK_PANEL_BIT );
 }
 
-BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj )
+BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( const obj_t* obj )
 {
 	return ( packbuf_t )
 	       ( obj->info & BLIS_PACK_BUFFER_BITS );
 }
 
-BLIS_INLINE struc_t bli_obj_struc( obj_t* obj )
+BLIS_INLINE struc_t bli_obj_struc( const obj_t* obj )
 {
 	return ( struc_t )
 	       ( obj->info & BLIS_STRUC_BITS );
 }
 
-BLIS_INLINE bool bli_obj_is_general( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_general( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL );
 }
 
-BLIS_INLINE bool bli_obj_is_hermitian( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_hermitian( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN );
 }
 
-BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_symmetric( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC );
 }
 
-BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_triangular( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR );
@@ -599,49 +599,49 @@ BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj )
 
 // Root matrix query
 
-BLIS_INLINE obj_t* bli_obj_root( obj_t* obj )
+BLIS_INLINE obj_t* bli_obj_root( const obj_t* obj )
 {
 	return ( obj_t* )( obj->root );
 }
 
-BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj )
+BLIS_INLINE bool bli_obj_root_is_general( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_is_general( bli_obj_root( obj ) ) );
 }
 
-BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj )
+BLIS_INLINE bool bli_obj_root_is_hermitian( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_is_hermitian( bli_obj_root( obj ) ) );
 }
 
-BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj )
+BLIS_INLINE bool bli_obj_root_is_symmetric( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_is_symmetric( bli_obj_root( obj ) ) );
 }
 
-BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj )
+BLIS_INLINE bool bli_obj_root_is_triangular( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_is_triangular( bli_obj_root( obj ) ) );
 }
 
-BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj )
+BLIS_INLINE bool bli_obj_root_is_herm_or_symm( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ||
 	         bli_obj_is_symmetric( bli_obj_root( obj ) ) );
 }
 
-BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj )
+BLIS_INLINE bool bli_obj_root_is_upper( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_is_upper( bli_obj_root( obj ) ) );
 }
 
-BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj )
+BLIS_INLINE bool bli_obj_root_is_lower( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_is_lower( bli_obj_root( obj ) ) );
@@ -656,13 +656,13 @@ BLIS_INLINE void bli_obj_set_as_root( obj_t* obj )
 
 // Diagonal offset query
 
-BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj )
+BLIS_INLINE doff_t bli_obj_diag_offset( const obj_t* obj )
 {
 	return ( doff_t )
 	       ( obj->diag_off );
 }
 
-BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj )
+BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( const obj_t* obj )
 {
 	return ( doff_t )
 	       ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj )
@@ -688,46 +688,46 @@ BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj )
 
 // Dimension query
 
-BLIS_INLINE dim_t bli_obj_length( obj_t* obj )
+BLIS_INLINE dim_t bli_obj_length( const obj_t* obj )
 {
 	return ( obj->dim[ BLIS_M ] );
 }
 
-BLIS_INLINE dim_t bli_obj_width( obj_t* obj )
+BLIS_INLINE dim_t bli_obj_width( const obj_t* obj )
 {
 	return ( obj->dim[ BLIS_N ] );
 }
 
-BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj )
+BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, const obj_t* obj )
 {
 	return ( obj->dim[ mdim ] );
 }
 
-BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj )
+BLIS_INLINE dim_t bli_obj_min_dim( const obj_t* obj )
 {
 	return bli_min( bli_obj_length( obj ),
 	                bli_obj_width( obj ) );
 }
 
-BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj )
+BLIS_INLINE dim_t bli_obj_max_dim( const obj_t* obj )
 {
 	return bli_max( bli_obj_length( obj ),
 	                bli_obj_width( obj ) );
 }
 
-BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj )
+BLIS_INLINE dim_t bli_obj_length_after_trans( const obj_t* obj )
 {
 	return ( bli_obj_has_trans( obj ) ? bli_obj_width( obj )
 	                                  : bli_obj_length( obj ) );
 }
 
-BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj )
+BLIS_INLINE dim_t bli_obj_width_after_trans( const obj_t* obj )
 {
 	return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj )
 	                                  : bli_obj_width( obj ) );
 }
 
-BLIS_INLINE bool bli_obj_is_1x1( obj_t* x )
+BLIS_INLINE bool bli_obj_is_1x1( const obj_t* x )
 {
 	return ( bool )
 	       ( bli_obj_length( x ) == 1 &&
@@ -736,34 +736,34 @@ BLIS_INLINE bool bli_obj_is_1x1( obj_t* x )
 
 // Stride/increment query
 
-BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj )
+BLIS_INLINE inc_t bli_obj_row_stride( const obj_t* obj )
 {
 	return ( obj->rs );
 }
 
-BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj )
+BLIS_INLINE inc_t bli_obj_col_stride( const obj_t* obj )
 {
 	return ( obj->cs );
 }
 
-BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj )
+BLIS_INLINE inc_t bli_obj_imag_stride( const obj_t* obj )
 {
 	return ( obj->is );
 }
 
-BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj )
+BLIS_INLINE inc_t bli_obj_row_stride_mag( const obj_t* obj )
 {
 	return ( inc_t )
 	       ( bli_abs( obj->rs ) );
 }
 
-BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj )
+BLIS_INLINE inc_t bli_obj_col_stride_mag( const obj_t* obj )
 {
 	return ( inc_t )
 	       ( bli_abs( obj->cs ) );
 }
 
-BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj )
+BLIS_INLINE inc_t bli_obj_imag_stride_mag( const obj_t* obj )
 {
 	return ( inc_t )
 	       ( bli_abs( obj->is ) );
@@ -773,7 +773,7 @@ BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj )
 // of the smallest submatrices of an object that could still encompass
 // the stored data above (if obj is upper) or below (if obj is lower)
 // the diagonal.
-BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj )
+BLIS_INLINE dim_t bli_obj_length_stored( const obj_t* obj )
 {
 	return ( dim_t )
 	       ( bli_obj_is_upper( obj )
@@ -784,7 +784,7 @@ BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj )
 	       );
 }
 
-BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj )
+BLIS_INLINE dim_t bli_obj_width_stored( const obj_t* obj )
 {
 	return ( dim_t )
 	       ( bli_obj_is_lower( obj )
@@ -795,25 +795,25 @@ BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj )
 	       );
 }
 
-BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj )
+BLIS_INLINE dim_t bli_obj_length_stored_after_trans( const obj_t* obj )
 {
 	return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj )
 	                                  : bli_obj_length_stored( obj ) );
 }
 
-BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj )
+BLIS_INLINE dim_t bli_obj_width_stored_after_trans( const obj_t* obj )
 {
 	return ( bli_obj_has_trans( obj ) ? bli_obj_length_stored( obj )
 	                                  : bli_obj_width_stored( obj ) );
 }
 
-BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x )
+BLIS_INLINE dim_t bli_obj_vector_dim( const obj_t* x )
 {
 	return ( bli_obj_length( x ) == 1 ? bli_obj_width( x )
 	                                  : bli_obj_length( x ) );
 }
 
-BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x )
+BLIS_INLINE inc_t bli_obj_vector_inc( const obj_t* x )
 {
 	return ( bli_obj_is_1x1( x ) ? 1 :
 	         ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x )
@@ -821,26 +821,26 @@ BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x )
 	       );
 }
 
-BLIS_INLINE bool bli_obj_is_vector( obj_t* x )
+BLIS_INLINE bool bli_obj_is_vector( const obj_t* x )
 {
 	return ( bool )
 	       ( bli_obj_length( x ) == 1 ||
 	         bli_obj_width(  x ) == 1 );
 }
 
-BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x )
+BLIS_INLINE bool bli_obj_is_row_vector( const obj_t* x )
 {
 	return ( bool )
 	       ( bli_obj_length( x ) == 1 );
 }
 
-BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x )
+BLIS_INLINE bool bli_obj_is_col_vector( const obj_t* x )
 {
 	return ( bool )
 	       ( bli_obj_width( x ) == 1 );
 }
 
-BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x )
+BLIS_INLINE bool bli_obj_has_zero_dim( const obj_t* x )
 {
 	return ( bool )
 	       ( bli_obj_length( x ) == 0 ||
@@ -894,32 +894,32 @@ BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, o
 // "obj" macros are used on packed matrices.
 //
 
-BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_row_stored( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_col_stride_mag( obj ) == 1 );
 }
 
-BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_col_stored( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_row_stride_mag( obj ) == 1 );
 }
 
-BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_gen_stored( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_row_stride_mag( obj ) != 1 &&
 	         bli_obj_col_stride_mag( obj ) != 1 );
 }
 
-BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_row_tilted( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) );
 }
 
-BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_col_tilted( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) );
@@ -950,17 +950,17 @@ BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj )
 
 // Offset query
 
-BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj )
+BLIS_INLINE dim_t bli_obj_row_off( const obj_t* obj )
 {
 	return ( obj->off[ BLIS_M ] );
 }
 
-BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj )
+BLIS_INLINE dim_t bli_obj_col_off( const obj_t* obj )
 {
 	return ( obj->off[ BLIS_N ] );
 }
 
-BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj )
+BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, const obj_t* obj )
 {
 	return ( obj->off[ mdim ] );
 }
@@ -991,33 +991,33 @@ BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj )
 
 // Diagonal offset predicates
 
-BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_strictly_above_diag( const obj_t* obj )
 {
 	return ( bool )
 	       ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) );
 }
 
-BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_strictly_below_diag( const obj_t* obj )
 {
 	return ( bool )
 	       ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) );
 }
 
-BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_outside_diag( const obj_t* obj )
 {
 	return ( bool )
 	       ( bli_obj_is_strictly_above_diag( obj ) ||
 	         bli_obj_is_strictly_below_diag( obj ) );
 }
 
-BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj )
+BLIS_INLINE bool bli_obj_intersects_diag( const obj_t* obj )
 {
 	return ( bool )
 	       ( !bli_obj_is_strictly_above_diag( obj ) &&
 	         !bli_obj_is_strictly_below_diag( obj ) );
 }
 
-BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj )
+BLIS_INLINE bool bli_obj_is_unstored_subpart( const obj_t* obj )
 {
 	return ( bool )
 	       ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) ||
@@ -1026,7 +1026,7 @@ BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj )
 
 // Buffer address query
 
-BLIS_INLINE void* bli_obj_buffer( obj_t* obj )
+BLIS_INLINE void* bli_obj_buffer( const obj_t* obj )
 {
 	return ( void* )
 	       ( obj->buffer );
@@ -1041,7 +1041,7 @@ BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj )
 
 // Bufferless scalar field query
 
-BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj )
+BLIS_INLINE void* bli_obj_internal_scalar_buffer( const obj_t* obj )
 {
 	return ( void* )
 	       ( &( obj->scalar ) );
@@ -1049,14 +1049,14 @@ BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj )
 
 // Bufferless scalar field modification
 
-BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b )
+BLIS_INLINE void bli_obj_copy_internal_scalar( const obj_t* a, obj_t* b )
 {
 	b->scalar = a->scalar;
 }
 
 // Element size query
 
-BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj )
+BLIS_INLINE siz_t bli_obj_elem_size( const obj_t* obj )
 {
 	return ( siz_t )
 	       ( obj->elem_size );
@@ -1071,12 +1071,12 @@ BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj )
 
 // Packed matrix info query
 
-BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj )
+BLIS_INLINE dim_t bli_obj_padded_length( const obj_t* obj )
 {
 	return ( obj->m_padded );
 }
 
-BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj )
+BLIS_INLINE dim_t bli_obj_padded_width( const obj_t* obj )
 {
 	return ( obj->n_padded );
 }
@@ -1101,22 +1101,22 @@ BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj )
 
 // Packed panel info query
 
-BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj )
+BLIS_INLINE dim_t bli_obj_panel_length( const obj_t* obj )
 {
 	return ( obj->m_panel );
 }
 
-BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj )
+BLIS_INLINE dim_t bli_obj_panel_width( const obj_t* obj )
 {
 	return ( obj->n_panel );
 }
 
-BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj )
+BLIS_INLINE inc_t bli_obj_panel_dim( const obj_t* obj )
 {
 	return ( obj->pd );
 }
 
-BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj )
+BLIS_INLINE inc_t bli_obj_panel_stride( const obj_t* obj )
 {
 	return ( obj->ps );
 }
@@ -1151,7 +1151,7 @@ BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj )
 
 // stor3_t-related
 
-BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b )
+BLIS_INLINE stor3_t bli_obj_stor3_from_strides( const obj_t* c, const obj_t* a, const obj_t* b )
 {
 	const inc_t rs_c = bli_obj_row_stride( c );
 	const inc_t cs_c = bli_obj_col_stride( c );
@@ -1191,22 +1191,22 @@ BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b )
 
 // Function pointer query
 
-BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj )
+BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( const obj_t* obj )
 {
 	return obj->pack_fn;
 }
 
-BLIS_INLINE void* bli_obj_pack_params( obj_t* obj )
+BLIS_INLINE void* bli_obj_pack_params( const obj_t* obj )
 {
 	return obj->pack_params;
 }
 
-BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj )
+BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( const obj_t* obj )
 {
 	return obj->ker_fn;
 }
 
-BLIS_INLINE void* bli_obj_ker_params( obj_t* obj )
+BLIS_INLINE void* bli_obj_ker_params( const obj_t* obj )
 {
 	return obj->ker_params;
 }
@@ -1261,7 +1261,7 @@ BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t
 	bli_obj_set_buffer( p, obj );
 
 	bli_obj_set_scalar_dt( dt, obj );
-	void* restrict s = bli_obj_internal_scalar_buffer( obj );
+	void* s = bli_obj_internal_scalar_buffer( obj );
 
 	if      ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F;
 	                                          (( scomplex* )s)->imag = 0.0F; }
@@ -1315,7 +1315,7 @@ BLIS_INLINE void bli_obj_set_defaults( obj_t* obj )
 
 // Acquire buffer at object's submatrix offset (offset-aware buffer query).
 
-BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj )
+BLIS_INLINE void* bli_obj_buffer_at_off( const obj_t* obj )
 {
 	return ( void* )
 	       (
@@ -1330,7 +1330,7 @@ BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj )
 
 // Acquire buffer from BLIS_CONSTANT object.
 
-BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj )
+BLIS_INLINE const void* bli_obj_buffer_for_const( num_t dt, const obj_t* obj )
 {
 	void* p;
 
@@ -1345,7 +1345,7 @@ BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj )
 
 // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects.
 
-BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj )
+BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, const obj_t* obj )
 {
 	return ( void* )
 	       ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj )
@@ -1360,21 +1360,21 @@ BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj )
 
 BLIS_INLINE void bli_obj_reset_origin( obj_t* obj )
 {
-    bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj );
-    bli_obj_set_offs( 0, 0, obj );
+	bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj );
+	bli_obj_set_offs( 0, 0, obj );
 	bli_obj_set_as_root( obj );
 }
 
 // Make a full alias (shallow copy).
 
-BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b )
+BLIS_INLINE void bli_obj_alias_to( const obj_t* a, obj_t* b )
 {
 	bli_obj_init_full_shallow_copy_of( a, b );
 }
 
 // Check if two objects are aliases of one another.
 
-BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b )
+BLIS_INLINE bool bli_obj_is_alias_of( const obj_t* a, const obj_t* b )
 {
 	return ( bool )
 	       ( bli_obj_buffer( a ) == bli_obj_buffer( b ) );
@@ -1384,7 +1384,7 @@ BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b )
 // Create an alias with a trans value applied.
 // (Note: trans may include a conj component.)
 
-BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b )
+BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, const obj_t* a, obj_t* b )
 {
 	bli_obj_alias_to( a, b );
 	bli_obj_apply_trans( trans, b );
@@ -1392,7 +1392,7 @@ BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b )
 
 // Create an alias with a conj value applied.
 
-BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b )
+BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, const obj_t* a, obj_t* b )
 {
 	bli_obj_alias_to( a, b );
 	bli_obj_apply_conj( conja, b );
@@ -1400,7 +1400,7 @@ BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b )
 
 // Alias only the real part.
 
-BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r )
+BLIS_INLINE void bli_obj_real_part( const obj_t* c, obj_t* r )
 {
 	bli_obj_alias_to( c, r );
 
@@ -1433,7 +1433,7 @@ BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r )
 
 // Alias only the imaginary part.
 
-BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i )
+BLIS_INLINE void bli_obj_imag_part( const obj_t* c, obj_t* i )
 {
 	if ( bli_obj_is_complex( c ) )
 	{
@@ -1472,7 +1472,7 @@ BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i )
 // chosen buffer (possibly using an auxiliary datatype if the object is
 // BLIS_CONSTANT).
 
-BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf )
+BLIS_INLINE void bli_obj_scalar_set_dt_buffer( const obj_t* obj, num_t dt_aux, num_t* dt, void** buf )
 {
 	if ( bli_obj_is_const( obj ) )
 	{
diff --git a/frame/include/bli_tapi_ba.h b/frame/include/bli_tapi_ba.h
index 0177985d9..6a7e195ab 100644
--- a/frame/include/bli_tapi_ba.h
+++ b/frame/include/bli_tapi_ba.h
@@ -54,6 +54,6 @@
 // to NULL. The "( void )" statements are to prevent unused variable
 // warnings by the compiler.
 #undef  BLIS_TAPI_EX_DECLS
-#define BLIS_TAPI_EX_DECLS   cntx_t* cntx = NULL; ( void )cntx; \
-                             rntm_t* rntm = NULL; ( void )rntm;
+#define BLIS_TAPI_EX_DECLS   const cntx_t* cntx = NULL; ( void )cntx; \
+                                   rntm_t* rntm = NULL; ( void )rntm;
 
diff --git a/frame/include/bli_tapi_ex.h b/frame/include/bli_tapi_ex.h
index c999b0ae9..f12be24b8 100644
--- a/frame/include/bli_tapi_ex.h
+++ b/frame/include/bli_tapi_ex.h
@@ -48,7 +48,7 @@
 // Define the macro to add expert arguments to function signatures
 // and prototypes.
 #undef  BLIS_TAPI_EX_PARAMS
-#define BLIS_TAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm
+#define BLIS_TAPI_EX_PARAMS   , const cntx_t* cntx, rntm_t* rntm
 
 // Define the macro to omit the expert variable declaration block, since
 // it is not needed when expert parameters are passed in through the API.
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index 4e64f3711..e957fc6b2 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -1111,26 +1111,26 @@ typedef struct
 
 	// Pointers to the micro-panels of A and B which will be used by the
 	// next call to the micro-kernel.
-	void*  a_next;
-	void*  b_next;
+	const void* a_next;
+	const void* b_next;
 
 	// The imaginary strides of A and B.
-	inc_t  is_a;
-	inc_t  is_b;
+	inc_t is_a;
+	inc_t is_b;
 
 	// The panel strides of A and B.
 	// NOTE: These are only used in situations where iteration over the
 	// micropanels takes place in part within the kernel code (e.g. sup
 	// millikernels).
-	inc_t  ps_a;
-	inc_t  ps_b;
+	inc_t ps_a;
+	inc_t ps_b;
 
 	// The type to convert to on output.
 	//num_t  dt_on_output;
 
 	// (Virtual) microkernel address and additional parameters.
 	void_fp ukr;
-	void*   params;
+	const void* params;
 
 } auxinfo_t;
 
@@ -1162,23 +1162,23 @@ struct thrinfo_s;
 
 typedef void (*obj_pack_fn_t)
     (
-      struct obj_s*     a,
-      struct obj_s*     ap,
-      struct cntx_s*    cntx,
-      struct rntm_s*    rntm,
-      struct cntl_s*    cntl,
-      struct thrinfo_s* thread
+      const struct obj_s*     a,
+            struct obj_s*     ap,
+      const struct cntx_s*    cntx,
+            struct rntm_s*    rntm,
+            struct cntl_s*    cntl,
+      const struct thrinfo_s* thread
     );
 
 typedef void (*obj_ker_fn_t)
     (
-      struct obj_s*     a,
-      struct obj_s*     b,
-      struct obj_s*     c,
-      struct cntx_s*    cntx,
-      struct rntm_s*    rntm,
-      struct cntl_s*    cntl,
-      struct thrinfo_s* thread
+      const struct obj_s*     a,
+      const struct obj_s*     b,
+      const struct obj_s*     c,
+      const struct cntx_s*    cntx,
+            struct rntm_s*    rntm,
+            struct cntl_s*    cntl,
+      const struct thrinfo_s* thread
     );
 
 typedef struct obj_s
@@ -1297,7 +1297,7 @@ typedef struct obj_s
 // Define these macros here since they must be updated if contents of
 // obj_t changes.
 
-BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
+BLIS_INLINE void bli_obj_init_full_shallow_copy_of( const obj_t* a, obj_t* b )
 {
 	b->root        = a->root;
 
@@ -1332,7 +1332,7 @@ BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
 	b->ker_params  = a->ker_params;
 }
 
-BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
+BLIS_INLINE void bli_obj_init_subpart_from( const obj_t* a, obj_t* b )
 {
 	b->root        = a->root;
 
diff --git a/frame/thread/bli_l3_decor.h b/frame/thread/bli_l3_decor.h
index 0b09189a6..e2208aae6 100644
--- a/frame/thread/bli_l3_decor.h
+++ b/frame/thread/bli_l3_decor.h
@@ -41,30 +41,30 @@
 // Level-3 internal function type.
 typedef void (*l3int_t)
      (
-       obj_t*     alpha,
-       obj_t*     a,
-       obj_t*     b,
-       obj_t*     beta,
-       obj_t*     c,
-       cntx_t*    cntx,
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
+       const obj_t*     alpha,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     beta,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+             rntm_t*    rntm,
+             cntl_t*    cntl,
+             thrinfo_t* thread
      );
 
 // Level-3 thread decorator prototype.
 void bli_l3_thread_decorator
      (
-       l3int_t func,
-       opid_t  family,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
+             l3int_t func,
+             opid_t  family,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      );
 
 // Include definitions specific to the method of multithreading for the
diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c
index 5b40d0614..2c71c7532 100644
--- a/frame/thread/bli_l3_decor_openmp.c
+++ b/frame/thread/bli_l3_decor_openmp.c
@@ -46,29 +46,18 @@ void* bli_l3_thread_entry( void* data_void ) { return NULL; }
 
 void bli_l3_thread_decorator
      (
-       l3int_t    func,
-       opid_t     family,
-       obj_t*     alpha,
-       obj_t*     a,
-       obj_t*     b,
-       obj_t*     beta,
-       obj_t*     c,
-       cntx_t*    cntx,
-       rntm_t*    rntm,
-       cntl_t*    cntl
+             l3int_t func,
+             opid_t  family,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      )
 {
-	// This is part of a hack to support mixed domain in bli_gemm_front().
-	// Sometimes we need to specify a non-standard schema for A and B, and
-	// we decided to transmit them via the schema field in the obj_t's
-	// rather than pass them in as function parameters. Once the values
-	// have been read, we immediately reset them back to their expected
-	// values for unpacked objects.
-	pack_t schema_a = bli_obj_pack_schema( a );
-	pack_t schema_b = bli_obj_pack_schema( b );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
-
 	// Query the total number of threads from the rntm_t object.
 	const dim_t n_threads = bli_rntm_num_threads( rntm );
 
@@ -83,7 +72,7 @@ void bli_l3_thread_decorator
 	// with an internal lock to ensure only one application thread accesses
 	// the sba at a time. bli_sba_checkout_array() will also automatically
 	// resize the array_t, if necessary.
-	array_t* restrict array = bli_sba_checkout_array( n_threads );
+	array_t* array = bli_sba_checkout_array( n_threads );
 
 	// Access the pool_t* for thread 0 and embed it into the rntm. We do
 	// this up-front only so that we have the rntm_t.sba_pool field
@@ -96,7 +85,7 @@ void bli_l3_thread_decorator
 	bli_pba_rntm_set_pba( rntm );
 
 	// Allocate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
+	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
 
 
 	_Pragma( "omp parallel num_threads(n_threads)" )
@@ -104,8 +93,8 @@ void bli_l3_thread_decorator
 		// Create a thread-local copy of the master thread's rntm_t. This is
 		// necessary since we want each thread to be able to track its own
 		// small block pool_t as it executes down the function stack.
-		rntm_t           rntm_l = *rntm;
-		rntm_t* restrict rntm_p = &rntm_l;
+		rntm_t  rntm_l = *rntm;
+		rntm_t* rntm_p = &rntm_l;
 
 		// Query the thread's id from OpenMP.
 		const dim_t tid = omp_get_thread_num();
@@ -119,7 +108,6 @@ void bli_l3_thread_decorator
 		// be allocated/initialized.
 		bli_sba_rntm_set_pool( tid, array, rntm_p );
 
-
 		obj_t      a_t, b_t, c_t;
 		cntl_t*    cntl_use;
 		thrinfo_t* thread;
@@ -133,6 +121,17 @@ void bli_l3_thread_decorator
 		bli_obj_alias_to( b, &b_t );
 		bli_obj_alias_to( c, &c_t );
 
+		// This is part of a hack to support mixed domain in bli_gemm_front().
+		// Sometimes we need to specify a non-standard schema for A and B, and
+		// we decided to transmit them via the schema field in the obj_t's
+		// rather than pass them in as function parameters. Once the values
+		// have been read, we immediately reset them back to their expected
+		// values for unpacked objects.
+		pack_t schema_a = bli_obj_pack_schema( &a_t );
+		pack_t schema_b = bli_obj_pack_schema( &b_t );
+		bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t );
+		bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t );
+
 		// Create a default control tree for the operation, if needed.
 		bli_l3_cntl_create_if( family, schema_a, schema_b,
 		                       &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
diff --git a/frame/thread/bli_l3_decor_openmp.h b/frame/thread/bli_l3_decor_openmp.h
index 80dbe5374..6ff7f16a9 100644
--- a/frame/thread/bli_l3_decor_openmp.h
+++ b/frame/thread/bli_l3_decor_openmp.h
@@ -43,7 +43,7 @@ void bli_l3_thread_decorator_thread_check
      (
        dim_t      n_threads,
        dim_t      tid,
-	   thrcomm_t* gl_comm,
+       thrcomm_t* gl_comm,
        rntm_t*    rntm
      );
 
diff --git a/frame/thread/bli_l3_decor_pthreads.c b/frame/thread/bli_l3_decor_pthreads.c
index 89b6ea118..80247dfb1 100644
--- a/frame/thread/bli_l3_decor_pthreads.c
+++ b/frame/thread/bli_l3_decor_pthreads.c
@@ -40,49 +40,45 @@
 // A data structure to assist in passing operands to additional threads.
 typedef struct thread_data
 {
-	l3int_t    func;
-	opid_t     family;
-	pack_t     schema_a;
-	pack_t     schema_b;
-	obj_t*     alpha;
-	obj_t*     a;
-	obj_t*     b;
-	obj_t*     beta;
-	obj_t*     c;
-	cntx_t*    cntx;
-	rntm_t*    rntm;
-	cntl_t*    cntl;
-	dim_t      tid;
-	thrcomm_t* gl_comm;
-	array_t*   array;
+	      l3int_t    func;
+	      opid_t     family;
+	const obj_t*     alpha;
+	const obj_t*     a;
+	const obj_t*     b;
+	const obj_t*     beta;
+	const obj_t*     c;
+	const cntx_t*    cntx;
+	      rntm_t*    rntm;
+	      cntl_t*    cntl;
+	      dim_t      tid;
+	      thrcomm_t* gl_comm;
+	      array_t*   array;
 } thread_data_t;
 
 // Entry point for additional threads
 void* bli_l3_thread_entry( void* data_void )
 {
-	thread_data_t* data     = data_void;
-
-	l3int_t        func     = data->func;
-	opid_t         family   = data->family;
-	pack_t         schema_a = data->schema_a;
-	pack_t         schema_b = data->schema_b;
-	obj_t*         alpha    = data->alpha;
-	obj_t*         a        = data->a;
-	obj_t*         b        = data->b;
-	obj_t*         beta     = data->beta;
-	obj_t*         c        = data->c;
-	cntx_t*        cntx     = data->cntx;
-	rntm_t*        rntm     = data->rntm;
-	cntl_t*        cntl     = data->cntl;
-	dim_t          tid      = data->tid;
-	array_t*       array    = data->array;
-	thrcomm_t*     gl_comm  = data->gl_comm;
+	const thread_data_t* data     = data_void;
+
+	const l3int_t        func     = data->func;
+	const opid_t         family   = data->family;
+	const obj_t*         alpha    = data->alpha;
+	const obj_t*         a        = data->a;
+	const obj_t*         b        = data->b;
+	const obj_t*         beta     = data->beta;
+	const obj_t*         c        = data->c;
+	const cntx_t*        cntx     = data->cntx;
+	      rntm_t*        rntm     = data->rntm;
+	      cntl_t*        cntl     = data->cntl;
+	const dim_t          tid      = data->tid;
+	      array_t*       array    = data->array;
+	      thrcomm_t*     gl_comm  = data->gl_comm;
 
 	// Create a thread-local copy of the master thread's rntm_t. This is
 	// necessary since we want each thread to be able to track its own
 	// small block pool_t as it executes down the function stack.
-	rntm_t           rntm_l = *rntm;
-	rntm_t* restrict rntm_p = &rntm_l;
+	rntm_t  rntm_l = *rntm;
+	rntm_t* rntm_p = &rntm_l;
 
 	// Use the thread id to access the appropriate pool_t* within the
 	// array_t, and use it to set the sba_pool field within the rntm_t.
@@ -90,9 +86,9 @@ void* bli_l3_thread_entry( void* data_void )
 	// be allocated/initialized.
 	bli_sba_rntm_set_pool( tid, array, rntm_p );
 
-	obj_t          a_t, b_t, c_t;
-	cntl_t*        cntl_use;
-	thrinfo_t*     thread;
+	obj_t      a_t, b_t, c_t;
+	cntl_t*    cntl_use;
+	thrinfo_t* thread;
 
 	// Alias thread-local copies of A, B, and C. These will be the objects
 	// we pass down the algorithmic function stack. Making thread-local
@@ -103,6 +99,17 @@ void* bli_l3_thread_entry( void* data_void )
 	bli_obj_alias_to( b, &b_t );
 	bli_obj_alias_to( c, &c_t );
 
+	// This is part of a hack to support mixed domain in bli_gemm_front().
+	// Sometimes we need to specify a non-standard schema for A and B, and
+	// we decided to transmit them via the schema field in the obj_t's
+	// rather than pass them in as function parameters. Once the values
+	// have been read, we immediately reset them back to their expected
+	// values for unpacked objects.
+	pack_t schema_a = bli_obj_pack_schema( &a_t );
+	pack_t schema_b = bli_obj_pack_schema( &b_t );
+	bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t );
+	bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t );
+
 	// Create a default control tree for the operation, if needed.
 	bli_l3_cntl_create_if( family, schema_a, schema_b,
 	                       &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
@@ -134,31 +141,20 @@ void* bli_l3_thread_entry( void* data_void )
 
 void bli_l3_thread_decorator
      (
-       l3int_t    func,
-       opid_t     family,
-       obj_t*     alpha,
-       obj_t*     a,
-       obj_t*     b,
-       obj_t*     beta,
-       obj_t*     c,
-       cntx_t*    cntx,
-       rntm_t*    rntm,
-       cntl_t*    cntl
+             l3int_t func,
+             opid_t  family,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      )
 {
 	err_t r_val;
 
-	// This is part of a hack to support mixed domain in bli_gemm_front().
-	// Sometimes we need to specify a non-standard schema for A and B, and
-	// we decided to transmit them via the schema field in the obj_t's
-	// rather than pass them in as function parameters. Once the values
-	// have been read, we immediately reset them back to their expected
-	// values for unpacked objects.
-	pack_t schema_a = bli_obj_pack_schema( a );
-	pack_t schema_b = bli_obj_pack_schema( b );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
-
 	// Query the total number of threads from the context.
 	const dim_t n_threads = bli_rntm_num_threads( rntm );
 
@@ -168,7 +164,7 @@ void bli_l3_thread_decorator
 	// with an internal lock to ensure only one application thread accesses
 	// the sba at a time. bli_sba_checkout_array() will also automatically
 	// resize the array_t, if necessary.
-	array_t* restrict array = bli_sba_checkout_array( n_threads );
+	array_t* array = bli_sba_checkout_array( n_threads );
 
 	// Access the pool_t* for thread 0 and embed it into the rntm. We do
 	// this up-front only so that we have the rntm_t.sba_pool field
@@ -181,7 +177,7 @@ void bli_l3_thread_decorator
 	bli_pba_rntm_set_pba( rntm );
 
 	// Allocate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
+	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
 
 	// Allocate an array of pthread objects and auxiliary data structs to pass
 	// to the thread entry functions.
@@ -203,8 +199,6 @@ void bli_l3_thread_decorator
 		// Set up thread data for additional threads (beyond thread 0).
 		datas[tid].func     = func;
 		datas[tid].family   = family;
-		datas[tid].schema_a = schema_a;
-		datas[tid].schema_b = schema_b;
 		datas[tid].alpha    = alpha;
 		datas[tid].a        = a;
 		datas[tid].b        = b;
diff --git a/frame/thread/bli_l3_decor_single.c b/frame/thread/bli_l3_decor_single.c
index 51474f0ee..c2c43b370 100644
--- a/frame/thread/bli_l3_decor_single.c
+++ b/frame/thread/bli_l3_decor_single.c
@@ -39,28 +39,32 @@
 
 void bli_l3_thread_decorator
      (
-       l3int_t    func,
-       opid_t     family,
-       obj_t*     alpha,
-       obj_t*     a,
-       obj_t*     b,
-       obj_t*     beta,
-       obj_t*     c,
-       cntx_t*    cntx,
-       rntm_t*    rntm,
-       cntl_t*    cntl
+             l3int_t func,
+             opid_t  family,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl
      )
 {
+	obj_t a_t, b_t;
+	bli_obj_alias_to( a, &a_t );
+	bli_obj_alias_to( b, &b_t );
+
 	// This is part of a hack to support mixed domain in bli_gemm_front().
 	// Sometimes we need to specify a non-standard schema for A and B, and
 	// we decided to transmit them via the schema field in the obj_t's
 	// rather than pass them in as function parameters. Once the values
 	// have been read, we immediately reset them back to their expected
 	// values for unpacked objects.
-	pack_t schema_a = bli_obj_pack_schema( a );
-	pack_t schema_b = bli_obj_pack_schema( b );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
+	pack_t schema_a = bli_obj_pack_schema( &a_t );
+	pack_t schema_b = bli_obj_pack_schema( &b_t );
+	bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t );
+	bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t );
 
 	// For sequential execution, we use only one thread.
 	const dim_t n_threads = 1;
@@ -71,7 +75,7 @@ void bli_l3_thread_decorator
 	// with an internal lock to ensure only one application thread accesses
 	// the sba at a time. bli_sba_checkout_array() will also automatically
 	// resize the array_t, if necessary.
-	array_t* restrict array = bli_sba_checkout_array( n_threads );
+	array_t* array = bli_sba_checkout_array( n_threads );
 
 	// Access the pool_t* for thread 0 and embed it into the rntm. We do
 	// this up-front only so that we can create the global comm below.
@@ -81,13 +85,13 @@ void bli_l3_thread_decorator
 	bli_pba_rntm_set_pba( rntm );
 
 	// Allcoate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
+	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
 
 
 	{
 		// NOTE: We don't need to create another copy of the rntm_t since
 		// it was already copied in one of the high-level oapi functions.
-		rntm_t* restrict rntm_p = rntm;
+		rntm_t* rntm_p = rntm;
 
 		cntl_t*    cntl_use;
 		thrinfo_t* thread;
@@ -111,7 +115,7 @@ void bli_l3_thread_decorator
 
 		// Create a default control tree for the operation, if needed.
 		bli_l3_cntl_create_if( family, schema_a, schema_b,
-		                       a, b, c, rntm_p, cntl, &cntl_use );
+		                       &a_t, &b_t, c, rntm_p, cntl, &cntl_use );
 
 		// Create the root node of the thread's thrinfo_t structure.
 		bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
@@ -119,8 +123,8 @@ void bli_l3_thread_decorator
 		func
 		(
 		  alpha,
-		  a,
-		  b,
+		  &a_t,
+		  &b_t,
 		  beta,
 		  c,
 		  cntx,
diff --git a/frame/thread/bli_l3_sup_decor.h b/frame/thread/bli_l3_sup_decor.h
index a001e5b74..6e0401151 100644
--- a/frame/thread/bli_l3_sup_decor.h
+++ b/frame/thread/bli_l3_sup_decor.h
@@ -41,28 +41,28 @@
 // Level-3 sup internal function type.
 typedef err_t (*l3supint_t)
      (
-       obj_t*     alpha,
-       obj_t*     a,
-       obj_t*     b,
-       obj_t*     beta,
-       obj_t*     c,
-       cntx_t*    cntx,
-       rntm_t*    rntm,
-       thrinfo_t* thread
+       const obj_t*     alpha,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     beta,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+             rntm_t*    rntm,
+             thrinfo_t* thread
      );
 
 // Level-3 sup thread decorator prototype.
 err_t bli_l3_sup_thread_decorator
      (
-       l3supint_t func,
-       opid_t     family,
-       obj_t*     alpha,
-       obj_t*     a,
-       obj_t*     b,
-       obj_t*     beta,
-       obj_t*     c,
-       cntx_t*    cntx,
-       rntm_t*    rntm
+             l3supint_t func,
+             opid_t     family,
+       const obj_t*     alpha,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     beta,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+             rntm_t*    rntm
      );
 
 // Include definitions specific to the method of multithreading for the
diff --git a/frame/thread/bli_l3_sup_decor_openmp.c b/frame/thread/bli_l3_sup_decor_openmp.c
index 1db9514fd..ff6bc667d 100644
--- a/frame/thread/bli_l3_sup_decor_openmp.c
+++ b/frame/thread/bli_l3_sup_decor_openmp.c
@@ -46,15 +46,15 @@ void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; }
 
 err_t bli_l3_sup_thread_decorator
      (
-       l3supint_t func,
-       opid_t     family,
-       obj_t*     alpha,
-       obj_t*     a,
-       obj_t*     b,
-       obj_t*     beta,
-       obj_t*     c,
-       cntx_t*    cntx,
-       rntm_t*    rntm
+             l3supint_t func,
+             opid_t     family,
+       const obj_t*     alpha,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     beta,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+             rntm_t*    rntm
      )
 {
 	// Query the total number of threads from the rntm_t object.
@@ -66,7 +66,7 @@ err_t bli_l3_sup_thread_decorator
 	// with an internal lock to ensure only one application thread accesses
 	// the sba at a time. bli_sba_checkout_array() will also automatically
 	// resize the array_t, if necessary.
-	array_t* restrict array = bli_sba_checkout_array( n_threads );
+	array_t* array = bli_sba_checkout_array( n_threads );
 
 	// Access the pool_t* for thread 0 and embed it into the rntm. We do
 	// this up-front only so that we have the rntm_t.sba_pool field
@@ -79,7 +79,7 @@ err_t bli_l3_sup_thread_decorator
 	bli_pba_rntm_set_pba( rntm );
 
 	// Allcoate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
+	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
 
 
 	_Pragma( "omp parallel num_threads(n_threads)" )
@@ -87,8 +87,8 @@ err_t bli_l3_sup_thread_decorator
 		// Create a thread-local copy of the master thread's rntm_t. This is
 		// necessary since we want each thread to be able to track its own
 		// small block pool_t as it executes down the function stack.
-		rntm_t           rntm_l = *rntm;
-		rntm_t* restrict rntm_p = &rntm_l;
+		rntm_t  rntm_l = *rntm;
+		rntm_t* rntm_p = &rntm_l;
 
 		// Query the thread's id from OpenMP.
 		const dim_t tid = omp_get_thread_num();
diff --git a/frame/thread/bli_l3_sup_decor_pthreads.c b/frame/thread/bli_l3_sup_decor_pthreads.c
index dade71a03..375a85730 100644
--- a/frame/thread/bli_l3_sup_decor_pthreads.c
+++ b/frame/thread/bli_l3_sup_decor_pthreads.c
@@ -40,18 +40,18 @@
 // A data structure to assist in passing operands to additional threads.
 typedef struct thread_data
 {
-	l3supint_t func;
-	opid_t     family;
-	obj_t*     alpha;
-	obj_t*     a;
-	obj_t*     b;
-	obj_t*     beta;
-	obj_t*     c;
-	cntx_t*    cntx;
-	rntm_t*    rntm;
-	dim_t      tid;
-	thrcomm_t* gl_comm;
-	array_t*   array;
+	      l3supint_t func;
+	      opid_t     family;
+	const obj_t*     alpha;
+	const obj_t*     a;
+	const obj_t*     b;
+	const obj_t*     beta;
+	const obj_t*     c;
+	const cntx_t*    cntx;
+	      rntm_t*    rntm;
+	      dim_t      tid;
+	      thrcomm_t* gl_comm;
+	      array_t*   array;
 } thread_data_t;
 
 // Entry point for additional threads
@@ -59,26 +59,26 @@ void* bli_l3_sup_thread_entry( void* data_void )
 {
 	thread_data_t* data     = data_void;
 
-	l3supint_t     func     = data->func;
-	opid_t         family   = data->family;
-	obj_t*         alpha    = data->alpha;
-	obj_t*         a        = data->a;
-	obj_t*         b        = data->b;
-	obj_t*         beta     = data->beta;
-	obj_t*         c        = data->c;
-	cntx_t*        cntx     = data->cntx;
-	rntm_t*        rntm     = data->rntm;
-	dim_t          tid      = data->tid;
-	array_t*       array    = data->array;
-	thrcomm_t*     gl_comm  = data->gl_comm;
+	      l3supint_t     func     = data->func;
+	      opid_t         family   = data->family;
+	const obj_t*         alpha    = data->alpha;
+	const obj_t*         a        = data->a;
+	const obj_t*         b        = data->b;
+	const obj_t*         beta     = data->beta;
+	const obj_t*         c        = data->c;
+	const cntx_t*        cntx     = data->cntx;
+	      rntm_t*        rntm     = data->rntm;
+	      dim_t          tid      = data->tid;
+	      array_t*       array    = data->array;
+	      thrcomm_t*     gl_comm  = data->gl_comm;
 
 	( void )family;
 
 	// Create a thread-local copy of the master thread's rntm_t. This is
 	// necessary since we want each thread to be able to track its own
 	// small block pool_t as it executes down the function stack.
-	rntm_t           rntm_l = *rntm;
-	rntm_t* restrict rntm_p = &rntm_l;
+	rntm_t  rntm_l = *rntm;
+	rntm_t* rntm_p = &rntm_l;
 
 	// Use the thread id to access the appropriate pool_t* within the
 	// array_t, and use it to set the sba_pool field within the rntm_t.
@@ -111,15 +111,15 @@ void* bli_l3_sup_thread_entry( void* data_void )
 
 err_t bli_l3_sup_thread_decorator
      (
-       l3supint_t func,
-       opid_t     family,
-       obj_t*     alpha,
-       obj_t*     a,
-       obj_t*     b,
-       obj_t*     beta,
-       obj_t*     c,
-       cntx_t*    cntx,
-       rntm_t*    rntm
+             l3supint_t func,
+             opid_t     family,
+       const obj_t*     alpha,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     beta,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+             rntm_t*    rntm
      )
 {
 	err_t r_val;
@@ -133,7 +133,7 @@ err_t bli_l3_sup_thread_decorator
 	// with an internal lock to ensure only one application thread accesses
 	// the sba at a time. bli_sba_checkout_array() will also automatically
 	// resize the array_t, if necessary.
-	array_t* restrict array = bli_sba_checkout_array( n_threads );
+	array_t* array = bli_sba_checkout_array( n_threads );
 
 	// Access the pool_t* for thread 0 and embed it into the rntm. We do
 	// this up-front only so that we have the rntm_t.sba_pool field
@@ -146,7 +146,7 @@ err_t bli_l3_sup_thread_decorator
 	bli_pba_rntm_set_pba( rntm );
 
 	// Allocate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
+	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
 
 	// Allocate an array of pthread objects and auxiliary data structs to pass
 	// to the thread entry functions.
diff --git a/frame/thread/bli_l3_sup_decor_single.c b/frame/thread/bli_l3_sup_decor_single.c
index a87af4103..df767ad29 100644
--- a/frame/thread/bli_l3_sup_decor_single.c
+++ b/frame/thread/bli_l3_sup_decor_single.c
@@ -41,17 +41,15 @@
 
 err_t bli_l3_sup_thread_decorator
      (
-       l3supint_t func,
-       opid_t     family,
-       //pack_t     schema_a,
-       //pack_t     schema_b,
-       obj_t*     alpha,
-       obj_t*     a,
-       obj_t*     b,
-       obj_t*     beta,
-       obj_t*     c,
-       cntx_t*    cntx,
-       rntm_t*    rntm
+             l3supint_t func,
+             opid_t     family,
+       const obj_t*     alpha,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     beta,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+             rntm_t*    rntm
      )
 {
 	// For sequential execution, we use only one thread.
@@ -63,7 +61,7 @@ err_t bli_l3_sup_thread_decorator
 	// with an internal lock to ensure only one application thread accesses
 	// the sba at a time. bli_sba_checkout_array() will also automatically
 	// resize the array_t, if necessary.
-	array_t* restrict array = bli_sba_checkout_array( n_threads );
+	array_t* array = bli_sba_checkout_array( n_threads );
 
 	// Access the pool_t* for thread 0 and embed it into the rntm.
 	bli_sba_rntm_set_pool( 0, array, rntm );
@@ -73,14 +71,14 @@ err_t bli_l3_sup_thread_decorator
 
 #ifndef SKIP_THRINFO_TREE
 	// Allcoate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
+	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
 #endif
 
 
 	{
 		// NOTE: We don't need to create another copy of the rntm_t since
 		// it was already copied in one of the high-level oapi functions.
-		rntm_t* restrict rntm_p = rntm;
+		rntm_t* rntm_p = rntm;
 
 		// There is only one thread id (for the thief thread).
 		const dim_t tid = 0;
@@ -138,7 +136,6 @@ err_t bli_l3_sup_thread_decorator
 	bli_sba_checkin_array( array );
 
 	return BLIS_SUCCESS;
-
 }
 
 #endif
diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c
index 6dc4f9141..7d647a314 100644
--- a/frame/thread/bli_thread.c
+++ b/frame/thread/bli_thread.c
@@ -67,12 +67,12 @@ void bli_thread_finalize( void )
 
 void bli_thread_range_sub
      (
-       thrinfo_t* thread,
-       dim_t      n,
-       dim_t      bf,
-       bool       handle_edge_low,
-       dim_t*     start,
-       dim_t*     end
+       const thrinfo_t* thread,
+             dim_t      n,
+             dim_t      bf,
+             bool       handle_edge_low,
+             dim_t*     start,
+             dim_t*     end
      )
 {
 	dim_t      n_way      = bli_thread_n_way( thread );
@@ -211,11 +211,11 @@ void bli_thread_range_sub
 
 siz_t bli_thread_range_l2r
      (
-       thrinfo_t* thr,
-       obj_t*     a,
-       blksz_t*   bmult,
-       dim_t*     start,
-       dim_t*     end
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const blksz_t*   bmult,
+             dim_t*     start,
+             dim_t*     end
      )
 {
 	num_t dt = bli_obj_dt( a );
@@ -231,11 +231,11 @@ siz_t bli_thread_range_l2r
 
 siz_t bli_thread_range_r2l
      (
-       thrinfo_t* thr,
-       obj_t*     a,
-       blksz_t*   bmult,
-       dim_t*     start,
-       dim_t*     end
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const blksz_t*   bmult,
+             dim_t*     start,
+             dim_t*     end
      )
 {
 	num_t dt = bli_obj_dt( a );
@@ -251,11 +251,11 @@ siz_t bli_thread_range_r2l
 
 siz_t bli_thread_range_t2b
      (
-       thrinfo_t* thr,
-       obj_t*     a,
-       blksz_t*   bmult,
-       dim_t*     start,
-       dim_t*     end
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const blksz_t*   bmult,
+             dim_t*     start,
+             dim_t*     end
      )
 {
 	num_t dt = bli_obj_dt( a );
@@ -271,11 +271,11 @@ siz_t bli_thread_range_t2b
 
 siz_t bli_thread_range_b2t
      (
-       thrinfo_t* thr,
-       obj_t*     a,
-       blksz_t*   bmult,
-       dim_t*     start,
-       dim_t*     end
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const blksz_t*   bmult,
+             dim_t*     start,
+             dim_t*     end
      )
 {
 	num_t dt = bli_obj_dt( a );
@@ -504,15 +504,15 @@ siz_t bli_find_area_trap_l
 
 siz_t bli_thread_range_weighted_sub
      (
-       thrinfo_t* restrict thread,
-       doff_t              diagoff,
-       uplo_t              uplo,
-       dim_t               m,
-       dim_t               n,
-       dim_t               bf,
-       bool                handle_edge_low,
-       dim_t*     restrict j_start_thr,
-       dim_t*     restrict j_end_thr
+       const thrinfo_t* thread,
+             doff_t     diagoff,
+             uplo_t     uplo,
+             dim_t      m,
+             dim_t      n,
+             dim_t      bf,
+             bool       handle_edge_low,
+             dim_t*     j_start_thr,
+             dim_t*     j_end_thr
      )
 {
 	dim_t      n_way   = bli_thread_n_way( thread );
@@ -641,15 +641,15 @@ siz_t bli_thread_range_weighted_sub
 
 siz_t bli_thread_range_mdim
      (
-       dir_t      direct,
-       thrinfo_t* thr,
-       obj_t*     a,
-       obj_t*     b,
-       obj_t*     c,
-       cntl_t*    cntl,
-       cntx_t*    cntx,
-       dim_t*     start,
-       dim_t*     end
+             dir_t      direct,
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntl_t*    cntl,
+       const cntx_t*    cntx,
+             dim_t*     start,
+             dim_t*     end
      )
 {
 	bszid_t  bszid  = bli_cntl_bszid( cntl );
@@ -665,8 +665,8 @@ siz_t bli_thread_range_mdim
 		else                                   bszid = BLIS_NR;
 	}
 
-	blksz_t* bmult  = bli_cntx_get_bmult( bszid, cntx );
-	obj_t*   x;
+	const blksz_t* bmult  = bli_cntx_get_bmult( bszid, cntx );
+	const obj_t*   x;
 	bool     use_weighted;
 
 	// Use the operation family to choose the one of the two matrices
@@ -700,15 +700,15 @@ siz_t bli_thread_range_mdim
 
 siz_t bli_thread_range_ndim
      (
-       dir_t      direct,
-       thrinfo_t* thr,
-       obj_t*     a,
-       obj_t*     b,
-       obj_t*     c,
-       cntl_t*    cntl,
-       cntx_t*    cntx,
-       dim_t*     start,
-       dim_t*     end
+             dir_t      direct,
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntl_t*    cntl,
+       const cntx_t*    cntx,
+             dim_t*     start,
+             dim_t*     end
      )
 {
 	bszid_t  bszid  = bli_cntl_bszid( cntl );
@@ -724,8 +724,8 @@ siz_t bli_thread_range_ndim
 		else                                   bszid = BLIS_NR;
 	}
 
-	blksz_t* bmult  = bli_cntx_get_bmult( bszid, cntx );
-	obj_t*   x;
+	const blksz_t* bmult  = bli_cntx_get_bmult( bszid, cntx );
+	const obj_t*   x;
 	bool     use_weighted;
 
 	// Use the operation family to choose the one of the two matrices
@@ -759,11 +759,11 @@ siz_t bli_thread_range_ndim
 
 siz_t bli_thread_range_weighted_l2r
      (
-       thrinfo_t* thr,
-       obj_t*     a,
-       blksz_t*   bmult,
-       dim_t*     start,
-       dim_t*     end
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const blksz_t*   bmult,
+             dim_t*     start,
+             dim_t*     end
      )
 {
 	siz_t area;
@@ -809,11 +809,11 @@ siz_t bli_thread_range_weighted_l2r
 
 siz_t bli_thread_range_weighted_r2l
      (
-       thrinfo_t* thr,
-       obj_t*     a,
-       blksz_t*   bmult,
-       dim_t*     start,
-       dim_t*     end
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const blksz_t*   bmult,
+             dim_t*     start,
+             dim_t*     end
      )
 {
 	siz_t area;
@@ -861,11 +861,11 @@ siz_t bli_thread_range_weighted_r2l
 
 siz_t bli_thread_range_weighted_t2b
      (
-       thrinfo_t* thr,
-       obj_t*     a,
-       blksz_t*   bmult,
-       dim_t*     start,
-       dim_t*     end
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const blksz_t*   bmult,
+             dim_t*     start,
+             dim_t*     end
      )
 {
 	siz_t area;
@@ -913,11 +913,11 @@ siz_t bli_thread_range_weighted_t2b
 
 siz_t bli_thread_range_weighted_b2t
      (
-       thrinfo_t* thr,
-       obj_t*     a,
-       blksz_t*   bmult,
-       dim_t*     start,
-       dim_t*     end
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const blksz_t*   bmult,
+             dim_t*     start,
+             dim_t*     end
      )
 {
 	siz_t area;
@@ -1295,31 +1295,31 @@ void bli_thread_partition_2x2_orig
 	dim_t tn1; // = *nt1;
 	dim_t tn2; // = *nt2;
 
-    // Partition a number of threads into two factors nt1 and nt2 such that
-    // nt1/nt2 ~= work1/work2. There is a fast heuristic algorithm and a
-    // slower optimal algorithm (which minimizes |nt1*work2 - nt2*work1|).
+	// Partition a number of threads into two factors nt1 and nt2 such that
+	// nt1/nt2 ~= work1/work2. There is a fast heuristic algorithm and a
+	// slower optimal algorithm (which minimizes |nt1*work2 - nt2*work1|).
 
-    // Return early small prime numbers of threads.
-    if ( n_thread < 4 )
-    {
-        tn1 = ( work1 >= work2 ? n_thread : 1 );
-        tn2 = ( work1 <  work2 ? n_thread : 1 );
+	// Return early for small numbers of threads (fewer than four).
+	if ( n_thread < 4 )
+	{
+		tn1 = ( work1 >= work2 ? n_thread : 1 );
+		tn2 = ( work1 <  work2 ? n_thread : 1 );
 
 		return;
-    }
+	}
 
-    tn1 = 1;
-    tn2 = 1;
+	tn1 = 1;
+	tn2 = 1;
 
-    // Both algorithms need the prime factorization of n_thread.
-    bli_prime_factors_t factors;
-    bli_prime_factorization( n_thread, &factors );
+	// Both algorithms need the prime factorization of n_thread.
+	bli_prime_factors_t factors;
+	bli_prime_factorization( n_thread, &factors );
 
 #if 1
 
-    // Fast algorithm: assign prime factors in increasing order to whichever
-    // partition has more work to do. The work is divided by the number of
-    // threads assigned at each iteration. This algorithm is sub-optimal in
+	// Fast algorithm: assign prime factors in increasing order to whichever
+	// partition has more work to do. The work is divided by the number of
+	// threads assigned at each iteration. This algorithm is sub-optimal in
 	// some cases. We attempt to mitigate the cases that involve at least one
 	// factor of 2. For example, in the partitioning of 12 with equal work
 	// this algorithm tentatively finds 6x2. This factorization involves a
@@ -1330,22 +1330,22 @@ void bli_thread_partition_2x2_orig
 
 	//printf( "w1 w2 = %d %d (initial)\n", (int)work1, (int)work2 );
 
-    dim_t f;
-    while ( ( f = bli_next_prime_factor( &factors ) ) > 1 )
-    {
+	dim_t f;
+	while ( ( f = bli_next_prime_factor( &factors ) ) > 1 )
+	{
 		//printf( "w1 w2 = %4d %4d nt1 nt2 = %d %d ... f = %d\n", (int)work1, (int)work2, (int)tn1, (int)tn2, (int)f );
 
-        if ( work1 > work2 )
-        {
-            work1 /= f;
-            tn1 *= f;
-        }
-        else
-        {
-            work2 /= f;
-            tn2 *= f;
-        }
-    }
+		if ( work1 > work2 )
+		{
+			work1 /= f;
+			tn1 *= f;
+		}
+		else
+		{
+			work2 /= f;
+			tn2 *= f;
+		}
+	}
 
 	//printf( "w1 w2 = %4d %4d nt1 nt2 = %d %d\n", (int)work1, (int)work2, (int)tn1, (int)tn2 );
 
@@ -1391,78 +1391,78 @@ void bli_thread_partition_2x2_orig
 
 #else
 
-    // Slow algorithm: exhaustively constructs all factor pairs of n_thread and
-    // chooses the best one.
-
-    // Eight prime factors handles n_thread up to 223092870.
-    dim_t fact[8];
-    dim_t mult[8];
-
-    // There is always at least one prime factor, so use if for initialization.
-    dim_t nfact = 1;
-    fact[0] = bli_next_prime_factor( &factors );
-    mult[0] = 1;
-
-    // Collect the remaining prime factors, accounting for multiplicity of
-    // repeated factors.
-    dim_t f;
-    while ( ( f = bli_next_prime_factor( &factors ) ) > 1 )
-    {
-        if ( f == fact[nfact-1] )
-        {
-            mult[nfact-1]++;
-        }
-        else
-        {
-            nfact++;
-            fact[nfact-1] = f;
-            mult[nfact-1] = 1;
-        }
-    }
-
-    // Now loop over all factor pairs. A single factor pair is denoted by how
-    // many of each prime factor are included in the first factor (ntaken).
-    dim_t ntake[8] = {0};
-    dim_t min_diff = INT_MAX;
-
-    // Loop over how many prime factors to assign to the first factor in the
-    // pair, for each prime factor. The total number of iterations is
-    // \Prod_{i=0}^{nfact-1} mult[i].
-    bool   done = FALSE;
-    while ( !done )
-    {
-        dim_t x = 1;
-        dim_t y = 1;
-
-        // Form the factors by integer exponentiation and accumulation.
-        for  (dim_t i = 0 ; i < nfact ; i++ )
-        {
-            x *= bli_ipow( fact[i], ntake[i] );
-            y *= bli_ipow( fact[i], mult[i]-ntake[i] );
-        }
-
-        // Check if this factor pair is optimal by checking
-        // |nt1*work2 - nt2*work1|.
-        dim_t diff = llabs( x*work2 - y*work1 );
-        if ( diff < min_diff )
-        {
-            min_diff = diff;
-            tn1 = x;
-            tn2 = y;
-        }
-
-        // Go to the next factor pair by doing an "odometer loop".
-        for ( dim_t i = 0 ; i < nfact ; i++ )
-        {
-            if ( ++ntake[i] > mult[i] )
-            {
-                ntake[i] = 0;
-                if ( i == nfact-1 ) done = TRUE;
-                else continue;
-            }
-            break;
-        }
-    }
+	// Slow algorithm: exhaustively constructs all factor pairs of n_thread and
+	// chooses the best one.
+
+	// Eight prime factors handles n_thread up to 223092870.
+	dim_t fact[8];
+	dim_t mult[8];
+
+	// There is always at least one prime factor, so use it for initialization.
+	dim_t nfact = 1;
+	fact[0] = bli_next_prime_factor( &factors );
+	mult[0] = 1;
+
+	// Collect the remaining prime factors, accounting for multiplicity of
+	// repeated factors.
+	dim_t f;
+	while ( ( f = bli_next_prime_factor( &factors ) ) > 1 )
+	{
+		if ( f == fact[nfact-1] )
+		{
+			mult[nfact-1]++;
+		}
+		else
+		{
+			nfact++;
+			fact[nfact-1] = f;
+			mult[nfact-1] = 1;
+		}
+	}
+
+	// Now loop over all factor pairs. A single factor pair is denoted by how
+	// many of each prime factor are included in the first factor (ntake).
+	dim_t ntake[8] = {0};
+	dim_t min_diff = INT_MAX;
+
+	// Loop over how many prime factors to assign to the first factor in the
+	// pair, for each prime factor. The total number of iterations is
+	// \Prod_{i=0}^{nfact-1} (mult[i]+1).
+	bool   done = FALSE;
+	while ( !done )
+	{
+		dim_t x = 1;
+		dim_t y = 1;
+
+		// Form the factors by integer exponentiation and accumulation.
+		for  (dim_t i = 0 ; i < nfact ; i++ )
+		{
+			x *= bli_ipow( fact[i], ntake[i] );
+			y *= bli_ipow( fact[i], mult[i]-ntake[i] );
+		}
+
+		// Check if this factor pair is optimal by checking
+		// |nt1*work2 - nt2*work1|.
+		dim_t diff = llabs( x*work2 - y*work1 );
+		if ( diff < min_diff )
+		{
+			min_diff = diff;
+			tn1 = x;
+			tn2 = y;
+		}
+
+		// Go to the next factor pair by doing an "odometer loop".
+		for ( dim_t i = 0 ; i < nfact ; i++ )
+		{
+			if ( ++ntake[i] > mult[i] )
+			{
+				ntake[i] = 0;
+				if ( i == nfact-1 ) done = TRUE;
+				else continue;
+			}
+			break;
+		}
+	}
 
 #endif
 
diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h
index d4880c4c8..5e9c650b5 100644
--- a/frame/thread/bli_thread.h
+++ b/frame/thread/bli_thread.h
@@ -66,12 +66,12 @@ void bli_thread_finalize( void );
 BLIS_EXPORT_BLIS
 void bli_thread_range_sub
      (
-       thrinfo_t* thread,
-       dim_t      n,
-       dim_t      bf,
-       bool       handle_edge_low,
-       dim_t*     start,
-       dim_t*     end
+       const thrinfo_t* thread,
+             dim_t      n,
+             dim_t      bf,
+             bool       handle_edge_low,
+             dim_t*     start,
+             dim_t*     end
      );
 
 #undef  GENPROT
@@ -79,15 +79,15 @@ void bli_thread_range_sub
 \
 siz_t PASTEMAC0( opname ) \
      ( \
-       dir_t      direct, \
-       thrinfo_t* thr, \
-       obj_t*     a, \
-       obj_t*     b, \
-       obj_t*     c, \
-       cntl_t*    cntl, \
-       cntx_t*    cntx, \
-       dim_t*     start, \
-       dim_t*     end  \
+             dir_t      direct, \
+       const thrinfo_t* thr, \
+       const obj_t*     a, \
+       const obj_t*     b, \
+       const obj_t*     c, \
+       const cntl_t*    cntl, \
+       const cntx_t*    cntx, \
+             dim_t*     start, \
+             dim_t*     end  \
      );
 
 GENPROT( thread_range_mdim )
@@ -98,11 +98,11 @@ GENPROT( thread_range_ndim )
 \
 siz_t PASTEMAC0( opname ) \
      ( \
-       thrinfo_t* thr, \
-       obj_t*     a, \
-       blksz_t*   bmult, \
-       dim_t*     start, \
-       dim_t*     end  \
+       const thrinfo_t* thr, \
+       const obj_t*     a, \
+       const blksz_t*   bmult, \
+             dim_t*     start, \
+             dim_t*     end  \
      );
 
 GENPROT( thread_range_l2r )
@@ -136,15 +136,15 @@ siz_t bli_find_area_trap_l
      );
 siz_t bli_thread_range_weighted_sub
      (
-       thrinfo_t* restrict thread,
-       doff_t              diagoff,
-       uplo_t              uplo,
-       dim_t               m,
-       dim_t               n,
-       dim_t               bf,
-       bool                handle_edge_low,
-       dim_t*     restrict j_start_thr,
-       dim_t*     restrict j_end_thr
+       const thrinfo_t* thread,
+             doff_t     diagoff,
+             uplo_t     uplo,
+             dim_t      m,
+             dim_t      n,
+             dim_t      bf,
+             bool       handle_edge_low,
+             dim_t*     j_start_thr,
+             dim_t*     j_end_thr
      );
 
 // -----------------------------------------------------------------------------
@@ -157,9 +157,9 @@ typedef struct
     dim_t f;
 } bli_prime_factors_t;
 
-void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors);
+void bli_prime_factorization( dim_t n, bli_prime_factors_t* factors );
 
-dim_t bli_next_prime_factor(bli_prime_factors_t* factors);
+dim_t bli_next_prime_factor( bli_prime_factors_t* factors );
 bool  bli_is_prime( dim_t n );
 
 void bli_thread_partition_2x2
@@ -211,13 +211,13 @@ void  bli_thread_init_rntm_from_env( rntm_t* rntm );
 
 BLIS_INLINE void bli_thread_range_jrir_rr
      (
-       thrinfo_t* thread,
-       dim_t      n,
-       dim_t      bf,
-       bool       handle_edge_low,
-       dim_t*     start,
-       dim_t*     end,
-       dim_t*     inc
+       const thrinfo_t* thread,
+             dim_t      n,
+             dim_t      bf,
+             bool       handle_edge_low,
+             dim_t*     start,
+             dim_t*     end,
+             dim_t*     inc
      )
 {
 	// Use interleaved partitioning of jr/ir loops.
@@ -228,13 +228,13 @@ BLIS_INLINE void bli_thread_range_jrir_rr
 
 BLIS_INLINE void bli_thread_range_jrir_sl
      (
-       thrinfo_t* thread,
-       dim_t      n,
-       dim_t      bf,
-       bool       handle_edge_low,
-       dim_t*     start,
-       dim_t*     end,
-       dim_t*     inc
+       const thrinfo_t* thread,
+             dim_t      n,
+             dim_t      bf,
+             bool       handle_edge_low,
+             dim_t*     start,
+             dim_t*     end,
+             dim_t*     inc
      )
 {
 	// Use contiguous slab partitioning of jr/ir loops.
@@ -244,13 +244,13 @@ BLIS_INLINE void bli_thread_range_jrir_sl
 
 BLIS_INLINE void bli_thread_range_jrir
      (
-       thrinfo_t* thread,
-       dim_t      n,
-       dim_t      bf,
-       bool       handle_edge_low,
-       dim_t*     start,
-       dim_t*     end,
-       dim_t*     inc
+       const thrinfo_t* thread,
+             dim_t      n,
+             dim_t      bf,
+             bool       handle_edge_low,
+             dim_t*     start,
+             dim_t*     end,
+             dim_t*     inc
      )
 {
 	// Define a general-purpose version of bli_thread_range_jrir() whose
diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c
index 0282be170..bbe711400 100644
--- a/frame/thread/bli_thrinfo.c
+++ b/frame/thread/bli_thrinfo.c
@@ -41,7 +41,7 @@ thrinfo_t* bli_thrinfo_create
        thrcomm_t* ocomm,
        dim_t      ocomm_id,
        dim_t      n_way,
-       dim_t      work_id, 
+       dim_t      work_id,
        bool       free_comm,
        bszid_t    bszid,
        thrinfo_t* sub_node
@@ -51,13 +51,13 @@ thrinfo_t* bli_thrinfo_create
 	printf( "bli_thrinfo_create(): " );
 	#endif
 
-    thrinfo_t* thread = bli_sba_acquire( rntm, sizeof( thrinfo_t ) );
+	thrinfo_t* thread = bli_sba_acquire( rntm, sizeof( thrinfo_t ) );
 
-    bli_thrinfo_init
+	bli_thrinfo_init
 	(
 	  thread,
 	  ocomm, ocomm_id,
-	  n_way, work_id, 
+	  n_way, work_id,
 	  free_comm,
 	  bszid,
 	  sub_node
@@ -72,7 +72,7 @@ void bli_thrinfo_init
        thrcomm_t* ocomm,
        dim_t      ocomm_id,
        dim_t      n_way,
-       dim_t      work_id, 
+       dim_t      work_id,
        bool       free_comm,
        bszid_t    bszid,
        thrinfo_t* sub_node
diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h
index 8e5a6da3b..6b9809684 100644
--- a/frame/thread/bli_thrinfo.h
+++ b/frame/thread/bli_thrinfo.h
@@ -75,54 +75,54 @@ typedef struct thrinfo_s thrinfo_t;
 
 // thrinfo_t query (field only)
 
-BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t )
+BLIS_INLINE dim_t bli_thread_num_threads( const thrinfo_t* t )
 {
 	return (t->ocomm)->n_threads;
 }
 
-BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t )
+BLIS_INLINE dim_t bli_thread_ocomm_id( const thrinfo_t* t )
 {
 	return t->ocomm_id;
 }
 
-BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t )
+BLIS_INLINE dim_t bli_thread_n_way( const thrinfo_t* t )
 {
 	return t->n_way;
 }
 
-BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t )
+BLIS_INLINE dim_t bli_thread_work_id( const thrinfo_t* t )
 {
 	return t->work_id;
 }
 
-BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t )
+BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( const thrinfo_t* t )
 {
 	return t->ocomm;
 }
 
-BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t )
+BLIS_INLINE bool bli_thrinfo_needs_free_comm( const thrinfo_t* t )
 {
 	return t->free_comm;
 }
 
-BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t )
+BLIS_INLINE dim_t bli_thread_bszid( const thrinfo_t* t )
 {
 	return t->bszid;
 }
 
-BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t )
+BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( const thrinfo_t* t )
 {
 	return t->sub_node;
 }
 
-BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t )
+BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( const thrinfo_t* t )
 {
 	return t->sub_prenode;
 }
 
 // thrinfo_t query (complex)
 
-BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t )
+BLIS_INLINE bool bli_thread_am_ochief( const thrinfo_t* t )
 {
 	return t->ocomm_id == 0;
 }
@@ -171,12 +171,12 @@ BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t*
 
 // other thrinfo_t-related functions
 
-BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p )
+BLIS_INLINE void* bli_thread_broadcast( const thrinfo_t* t, void* p )
 {
 	return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm );
 }
 
-BLIS_INLINE void bli_thread_barrier( thrinfo_t* t )
+BLIS_INLINE void bli_thread_barrier( const thrinfo_t* t )
 {
 	bli_thrcomm_barrier( t->ocomm_id, t->ocomm );
 }
@@ -192,7 +192,7 @@ thrinfo_t* bli_thrinfo_create
        thrcomm_t* ocomm,
        dim_t      ocomm_id,
        dim_t      n_way,
-       dim_t      work_id, 
+       dim_t      work_id,
        bool       free_comm,
        bszid_t    bszid,
        thrinfo_t* sub_node
@@ -204,7 +204,7 @@ void bli_thrinfo_init
        thrcomm_t* ocomm,
        dim_t      ocomm_id,
        dim_t      n_way,
-       dim_t      work_id, 
+       dim_t      work_id,
        bool       free_comm,
        bszid_t    bszid,
        thrinfo_t* sub_node
diff --git a/frame/thread/bli_thrinfo_sup.c b/frame/thread/bli_thrinfo_sup.c
index 881990f78..966247fd0 100644
--- a/frame/thread/bli_thrinfo_sup.c
+++ b/frame/thread/bli_thrinfo_sup.c
@@ -37,9 +37,9 @@
 
 void bli_thrinfo_sup_grow
      (
-       rntm_t*    rntm,
-       bszid_t*   bszid_par,
-       thrinfo_t* thread
+             rntm_t*    rntm,
+       const bszid_t*   bszid_par,
+             thrinfo_t* thread
      )
 {
 	if ( thread == &BLIS_GEMM_SINGLE_THREADED ||
@@ -75,10 +75,10 @@ void bli_thrinfo_sup_grow
 
 thrinfo_t* bli_thrinfo_sup_rgrow
      (
-       rntm_t*    rntm,
-       bszid_t*   bszid_par,
-       bszid_t*   bszid_cur,
-       thrinfo_t* thread_par
+             rntm_t*    rntm,
+       const bszid_t*   bszid_par,
+       const bszid_t*   bszid_cur,
+             thrinfo_t* thread_par
      )
 {
 	thrinfo_t* thread_cur;
@@ -139,10 +139,10 @@ thrinfo_t* bli_thrinfo_sup_rgrow
 
 thrinfo_t* bli_thrinfo_sup_create_for_cntl
      (
-       rntm_t*    rntm,
-       bszid_t*   bszid_par,
-       bszid_t*   bszid_chl,
-       thrinfo_t* thread_par
+             rntm_t*    rntm,
+       const bszid_t*   bszid_par,
+       const bszid_t*   bszid_chl,
+             thrinfo_t* thread_par
      )
 {
 	// If we are running with a single thread, all of the code can be reduced
@@ -151,14 +151,14 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
 	{
 		thrinfo_t* thread_chl = bli_thrinfo_create
 		(
-		  rntm,                        // rntm
-		  &BLIS_SINGLE_COMM,           // ocomm
-		  0,                           // ocomm_id
-		  1,                           // n_way
-		  0,                           // work_id
-		  FALSE,                       // free_comm
-		  BLIS_NO_PART,                // bszid
-		  NULL                         // sub_node
+		  rntm,               // rntm
+		  &BLIS_SINGLE_COMM,  // ocomm
+		  0,                  // ocomm_id
+		  1,                  // n_way
+		  0,                  // work_id
+		  FALSE,              // free_comm
+		  BLIS_NO_PART,       // bszid
+		  NULL                // sub_node
 		);
 
 		return thread_chl;
diff --git a/frame/thread/bli_thrinfo_sup.h b/frame/thread/bli_thrinfo_sup.h
index 0be035cf8..1afcd3337 100644
--- a/frame/thread/bli_thrinfo_sup.h
+++ b/frame/thread/bli_thrinfo_sup.h
@@ -42,25 +42,25 @@
 
 void bli_thrinfo_sup_grow
      (
-       rntm_t*    rntm,
-       bszid_t*   bszid_par,
-       thrinfo_t* thread
+             rntm_t*    rntm,
+       const bszid_t*   bszid_par,
+             thrinfo_t* thread
      );
 
 thrinfo_t* bli_thrinfo_sup_rgrow
      (
-       rntm_t*    rntm,
-       bszid_t*   bszid_par,
-       bszid_t*   bszid_cur,
-       thrinfo_t* thread_par
+             rntm_t*    rntm,
+       const bszid_t*   bszid_par,
+       const bszid_t*   bszid_cur,
+             thrinfo_t* thread_par
      );
 
 thrinfo_t* bli_thrinfo_sup_create_for_cntl
      (
-       rntm_t*    rntm,
-       bszid_t*   bszid_par,
-       bszid_t*   bszid_chl,
-       thrinfo_t* thread_par
+             rntm_t*    rntm,
+       const bszid_t*   bszid_par,
+       const bszid_t*   bszid_chl,
+             thrinfo_t* thread_par
      );
 
 #endif
diff --git a/frame/util/bli_util_check.c b/frame/util/bli_util_check.c
index 3693ea39c..a96f6f5e9 100644
--- a/frame/util/bli_util_check.c
+++ b/frame/util/bli_util_check.c
@@ -43,8 +43,8 @@
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  asum  \
+       const obj_t* x, \
+       const obj_t* asum  \
      ) \
 { \
 	bli_utilv_xa_check( x, asum ); \
@@ -58,7 +58,7 @@ GENFRONT( asumv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x  \
+       const obj_t* x  \
      ) \
 { \
 	bli_utilm_mkhst_check( x ); \
@@ -74,8 +74,8 @@ GENFRONT( mktrim )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  norm  \
+       const obj_t* x, \
+       const obj_t* norm  \
      ) \
 { \
 	bli_utilv_norm_check( x, norm ); \
@@ -91,8 +91,8 @@ GENFRONT( normiv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  norm  \
+       const obj_t* x, \
+       const obj_t* norm  \
      ) \
 { \
 	bli_utilm_norm_check( x, norm ); \
@@ -108,7 +108,7 @@ GENFRONT( normim )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x  \
+       const obj_t* x  \
      ) \
 { \
 	bli_utilm_rand_check( x ); \
@@ -125,9 +125,9 @@ GENFRONT( randnm )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  scale, \
-       obj_t*  sumsq  \
+       const obj_t* x, \
+       const obj_t* scale, \
+       const obj_t* sumsq  \
      ) \
 { \
 	bli_utilv_sumsqv_check( x, scale, sumsq ); \
@@ -142,9 +142,9 @@ GENFRONT( sumsqv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  chi, \
-       obj_t*  psi, \
-       bool*   is_eq  \
+       const obj_t* chi, \
+       const obj_t* psi, \
+       const bool*  is_eq  \
      ) \
 { \
 	bli_l0_xxbsc_check( chi, psi, is_eq ); \
@@ -158,9 +158,9 @@ GENFRONT( eqsc )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  y, \
-       bool*   is_eq  \
+       const obj_t* x, \
+       const obj_t* y, \
+       const bool*  is_eq  \
      ) \
 { \
 	bli_l1v_xy_check( x, y ); \
@@ -174,9 +174,9 @@ GENFRONT( eqv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  y, \
-       bool*   is_eq  \
+       const obj_t* x, \
+       const obj_t* y, \
+       const bool*  is_eq  \
      ) \
 { \
 	bli_l1m_xy_check( x, y ); \
@@ -190,11 +190,11 @@ GENFRONT( eqm )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       FILE*  file, \
-       char*  s1, \
-       obj_t* x, \
-       char*  format, \
-       char*  s2  \
+       const FILE*  file, \
+       const char*  s1, \
+       const obj_t* x, \
+       const char*  format, \
+       const char*  s2  \
      ) \
 { \
 	bli_utilm_fprint_check( file, s1, x, format, s2 ); \
@@ -207,8 +207,8 @@ GENFRONT( fprintm )
 
 void bli_utilv_xa_check
      (
-       obj_t*  x,
-       obj_t*  asum
+       const obj_t* x,
+       const obj_t* asum
      )
 {
 	err_t e_val;
@@ -240,7 +240,7 @@ void bli_utilv_xa_check
 
 void bli_utilm_mkhst_check
      (
-       obj_t*  a
+       const obj_t* a
      )
 {
 	err_t e_val;
@@ -277,8 +277,8 @@ void bli_utilm_mkhst_check
 
 void bli_utilv_norm_check
      (
-       obj_t*  x,
-       obj_t*  norm
+       const obj_t* x,
+       const obj_t* norm
      )
 {
 	err_t e_val;
@@ -317,8 +317,8 @@ void bli_utilv_norm_check
 
 void bli_utilm_norm_check
      (
-       obj_t*  x,
-       obj_t*  norm
+       const obj_t* x,
+       const obj_t* norm
      )
 {
 	err_t e_val;
@@ -356,35 +356,35 @@ void bli_utilm_norm_check
 
 void bli_utilm_fprint_check
      (
-       FILE*  file,
-       char*  s1,
-       obj_t* x,
-       char*  format,
-       char*  s2
+       const FILE*  file,
+       const char*  s1,
+       const obj_t* x,
+       const char*  format,
+       const char*  s2
      )
 {
 	err_t e_val;
 
 	// Check argument pointers.
-	
+
 	e_val = bli_check_null_pointer( file );
 	bli_check_error_code( e_val );
 
 	e_val = bli_check_null_pointer( s1 );
 	bli_check_error_code( e_val );
 
-	e_val = bli_check_null_pointer( s2 ); 
+	e_val = bli_check_null_pointer( s2 );
 	bli_check_error_code( e_val );
 
 	// Check object buffers (for non-NULLness).
 
-	e_val = bli_check_object_buffer( x ); 
+	e_val = bli_check_object_buffer( x );
 	bli_check_error_code( e_val );
 }
 
 void bli_utilm_rand_check
      (
-       obj_t* x
+       const obj_t* x
      )
 {
 	err_t e_val;
@@ -405,9 +405,9 @@ void bli_utilm_rand_check
 
 void bli_utilv_sumsqv_check
      (
-       obj_t*  x,
-       obj_t*  scale,
-       obj_t*  sumsq
+       const obj_t* x,
+       const obj_t* scale,
+       const obj_t* sumsq
      )
 {
 	err_t e_val;
@@ -430,15 +430,15 @@ void bli_utilv_sumsqv_check
 
 	e_val = bli_check_scalar_object( scale );
 	bli_check_error_code( e_val );
-	
+
 	e_val = bli_check_scalar_object( sumsq );
 	bli_check_error_code( e_val );
 
 	// Check object buffers (for non-NULLness).
-	
+
 	e_val = bli_check_object_buffer( x );
 	bli_check_error_code( e_val );
-	
+
 	e_val = bli_check_object_buffer( scale );
 	bli_check_error_code( e_val );
 
diff --git a/frame/util/bli_util_check.h b/frame/util/bli_util_check.h
index 866a2cd89..c3f4fd1aa 100644
--- a/frame/util/bli_util_check.h
+++ b/frame/util/bli_util_check.h
@@ -42,8 +42,8 @@
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  asum  \
+       const obj_t* x, \
+       const obj_t* asum  \
      );
 
 GENPROT( asumv )
@@ -54,7 +54,7 @@ GENPROT( asumv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x  \
+       const obj_t* x  \
      );
 
 GENPROT( mkherm )
@@ -67,8 +67,8 @@ GENPROT( mktrim )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  norm  \
+       const obj_t* x, \
+       const obj_t* norm  \
      );
 
 GENPROT( norm1v )
@@ -81,8 +81,8 @@ GENPROT( normiv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  norm  \
+       const obj_t* x, \
+       const obj_t* norm  \
      );
 
 GENPROT( norm1m )
@@ -95,7 +95,7 @@ GENPROT( normim )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x  \
+       const obj_t* x  \
      );
 
 GENPROT( randv )
@@ -109,9 +109,9 @@ GENPROT( randnm )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  scale, \
-       obj_t*  sumsq  \
+       const obj_t* x, \
+       const obj_t* scale, \
+       const obj_t* sumsq  \
      );
 
 GENPROT( sumsqv )
@@ -123,9 +123,9 @@ GENPROT( sumsqv )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  chi, \
-       obj_t*  psi, \
-       bool*   is_eq  \
+       const obj_t* chi, \
+       const obj_t* psi, \
+       const bool*  is_eq  \
      );
 
 GENTPROT( eqsc )
@@ -136,9 +136,9 @@ GENTPROT( eqsc )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       obj_t*  x, \
-       obj_t*  y, \
-       bool*   is_eq  \
+       const obj_t* x, \
+       const obj_t* y, \
+       const bool*  is_eq  \
     );
 
 GENPROT( eqv )
@@ -150,11 +150,11 @@ GENPROT( eqm )
 \
 void PASTEMAC(opname,_check) \
      ( \
-       FILE*  file, \
-       char*  s1, \
-       obj_t* x, \
-       char*  format, \
-       char*  s2  \
+       const FILE*  file, \
+       const char*  s1, \
+       const obj_t* x, \
+       const char*  format, \
+       const char*  s2  \
      );
 
 GENPROT( fprintv )
@@ -164,51 +164,51 @@ GENPROT( fprintm )
 
 void bli_utilv_xi_check
      (
-       obj_t*  x,
-       obj_t*  index
+       const obj_t* x,
+       const obj_t* index
      );
 
 void bli_utilv_xa_check
      (
-       obj_t*  x,
-       obj_t*  asum
+       const obj_t* x,
+       const obj_t* asum
      );
 
 void bli_utilm_mkhst_check
      (
-       obj_t*  a
+       const obj_t* a
      );
 
 void bli_utilv_norm_check
      (
-       obj_t*  x,
-       obj_t*  norm
+       const obj_t* x,
+       const obj_t* norm
      );
 
 void bli_utilm_norm_check
      (
-       obj_t*  x,
-       obj_t*  norm
+       const obj_t* x,
+       const obj_t* norm
      );
 
 void bli_utilm_fprint_check
      (
-       FILE*  file,
-       char*  s1,
-       obj_t* x,
-       char*  format,
-       char*  s2
+       const FILE*  file,
+       const char*  s1,
+       const obj_t* x,
+       const char*  format,
+       const char*  s2
      );
 
 void bli_utilm_rand_check
      (
-       obj_t* x
+       const obj_t* x
      );
 
 void bli_utilv_sumsqv_check
      (
-       obj_t*  x,
-       obj_t*  scale,
-       obj_t*  sumsq
+       const obj_t* x,
+       const obj_t* scale,
+       const obj_t* sumsq
      );
 
diff --git a/frame/util/bli_util_ft.h b/frame/util/bli_util_ft.h
index 673f4782a..ccdd7ae66 100644
--- a/frame/util/bli_util_ft.h
+++ b/frame/util/bli_util_ft.h
@@ -44,9 +44,9 @@
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       dim_t    n, \
-       ctype*   x, inc_t incx, \
-       ctype_r* asum  \
+             dim_t    n, \
+       const ctype*   x, inc_t incx, \
+             ctype_r* asum  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -76,9 +76,9 @@ INSERT_GENTDEF( mktrim )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       dim_t    n, \
-       ctype*   x, inc_t incx, \
-       ctype_r* norm  \
+             dim_t    n, \
+       const ctype*   x, inc_t incx, \
+             ctype_r* norm  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -93,13 +93,13 @@ INSERT_GENTDEFR( normiv )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       doff_t   diagoffx, \
-       diag_t   diagx, \
-       uplo_t   uplox, \
-       dim_t    m, \
-       dim_t    n, \
-       ctype*   x, inc_t rs_x, inc_t cs_x, \
-       ctype_r* norm  \
+             doff_t   diagoffx, \
+             diag_t   diagx, \
+             uplo_t   uplox, \
+             dim_t    m, \
+             dim_t    n, \
+       const ctype*   x, inc_t rs_x, inc_t cs_x, \
+             ctype_r* norm  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -114,12 +114,12 @@ INSERT_GENTDEFR( normim )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       FILE*  file, \
-       char*  s1, \
-       dim_t  n, \
-       ctype* x, inc_t incx, \
-       char*  format, \
-       char*  s2  \
+             FILE*  file, \
+       const char*  s1, \
+             dim_t  n, \
+       const ctype* x, inc_t incx, \
+       const char*  format, \
+       const char*  s2  \
      );
 
 INSERT_GENTDEF( fprintv )
@@ -131,13 +131,13 @@ INSERT_GENTDEF( fprintv )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       FILE*  file, \
-       char*  s1, \
-       dim_t  m, \
-       dim_t  n, \
-       ctype* x, inc_t rs_x, inc_t cs_x, \
-       char*  format, \
-       char*  s2  \
+             FILE*  file, \
+       const char*  s1, \
+             dim_t  m, \
+             dim_t  n, \
+       const ctype* x, inc_t rs_x, inc_t cs_x, \
+       const char*  format, \
+       const char*  s2  \
      );
 
 INSERT_GENTDEF( fprintm )
@@ -182,10 +182,10 @@ INSERT_GENTDEF( randnm )
 \
 typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
      ( \
-       dim_t    n, \
-       ctype*   x, inc_t incx, \
-       ctype_r* scale, \
-       ctype_r* sumsq  \
+             dim_t    n, \
+       const ctype*   x, inc_t incx, \
+             ctype_r* scale, \
+             ctype_r* sumsq  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -204,10 +204,10 @@ INSERT_GENTDEFR( sumsqv )
 \
 typedef void (*PASTECH2(ch,opname,tsuf)) \
      ( \
-       conj_t  conjchi, \
-       ctype*  chi, \
-       ctype*  psi, \
-       bool*   is_eq  \
+             conj_t conjchi, \
+       const ctype* chi, \
+       const ctype* psi, \
+             bool*  is_eq  \
      );
 
 INSERT_GENTDEF( eqsc )
@@ -219,11 +219,11 @@ INSERT_GENTDEF( eqsc )
 \
 typedef void (*PASTECH2(ch,opname,tsuf)) \
      ( \
-       conj_t  conjx, \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       bool*   is_eq  \
+             conj_t conjx, \
+             dim_t  n, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             bool*  is_eq  \
      );
 
 INSERT_GENTDEF( eqv )
@@ -235,15 +235,15 @@ INSERT_GENTDEF( eqv )
 \
 typedef void (*PASTECH2(ch,opname,tsuf)) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y, \
-       bool*   is_eq  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             uplo_t  uplox, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+       const ctype*  y, inc_t rs_y, inc_t cs_y, \
+             bool*   is_eq  \
      );
 
 INSERT_GENTDEF( eqm )
diff --git a/frame/util/bli_util_oapi.c b/frame/util/bli_util_oapi.c
index afd221a58..d4e5617ee 100644
--- a/frame/util/bli_util_oapi.c
+++ b/frame/util/bli_util_oapi.c
@@ -45,8 +45,8 @@
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  asum  \
+       const obj_t* x, \
+       const obj_t* asum  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -54,16 +54,16 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	BLIS_OAPI_EX_DECLS \
 \
-	num_t     dt        = bli_obj_dt( x ); \
+	num_t dt    = bli_obj_dt( x ); \
 \
-	dim_t     n         = bli_obj_vector_dim( x ); \
-	void*     buf_x     = bli_obj_buffer_at_off( x ); \
-	inc_t     incx      = bli_obj_vector_inc( x ); \
+	dim_t n     = bli_obj_vector_dim( x ); \
+	void* buf_x = bli_obj_buffer_at_off( x ); \
+	inc_t incx  = bli_obj_vector_inc( x ); \
 \
-	void*     buf_asum  = bli_obj_buffer_at_off( asum ); \
+	void* buf_asum = bli_obj_buffer_at_off( asum ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x, asum ); \
+		PASTEMAC(opname,_check)( x, asum ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -88,7 +88,7 @@ GENFRONT( asumv )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  a  \
+       const obj_t* a  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -96,16 +96,16 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	BLIS_OAPI_EX_DECLS \
 \
-	num_t     dt        = bli_obj_dt( a ); \
+	num_t  dt    = bli_obj_dt( a ); \
 \
-	uplo_t    uploa     = bli_obj_uplo( a ); \
-	dim_t     m         = bli_obj_length( a ); \
-	void*     buf_a     = bli_obj_buffer_at_off( a ); \
-	inc_t     rs_a      = bli_obj_row_stride( a ); \
-	inc_t     cs_a      = bli_obj_col_stride( a ); \
+	uplo_t uploa = bli_obj_uplo( a ); \
+	dim_t  m     = bli_obj_length( a ); \
+	void*  buf_a = bli_obj_buffer_at_off( a ); \
+	inc_t  rs_a  = bli_obj_row_stride( a ); \
+	inc_t  cs_a  = bli_obj_col_stride( a ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( a ); \
+		PASTEMAC(opname,_check)( a ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -132,8 +132,8 @@ GENFRONT( mktrim )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  norm  \
+       const obj_t* x, \
+       const obj_t* norm  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -141,15 +141,15 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	BLIS_OAPI_EX_DECLS \
 \
-	num_t     dt        = bli_obj_dt( x ); \
+	num_t dt       = bli_obj_dt( x ); \
 \
-	dim_t     n         = bli_obj_vector_dim( x ); \
-	void*     buf_x     = bli_obj_buffer_at_off( x ); \
-	inc_t     incx      = bli_obj_vector_inc( x ); \
-	void*     buf_norm  = bli_obj_buffer_at_off( norm ); \
+	dim_t n        = bli_obj_vector_dim( x ); \
+	void* buf_x    = bli_obj_buffer_at_off( x ); \
+	inc_t incx     = bli_obj_vector_inc( x ); \
+	void* buf_norm = bli_obj_buffer_at_off( norm ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x, norm ); \
+		PASTEMAC(opname,_check)( x, norm ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -176,8 +176,8 @@ GENFRONT( normiv )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  norm  \
+       const obj_t* x, \
+       const obj_t* norm  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -185,20 +185,20 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	BLIS_OAPI_EX_DECLS \
 \
-	num_t     dt        = bli_obj_dt( x ); \
+	num_t  dt       = bli_obj_dt( x ); \
 \
-	doff_t    diagoffx  = bli_obj_diag_offset( x ); \
-	diag_t    diagx     = bli_obj_diag( x ); \
-	uplo_t    uplox     = bli_obj_uplo( x ); \
-	dim_t     m         = bli_obj_length( x ); \
-	dim_t     n         = bli_obj_width( x ); \
-	void*     buf_x     = bli_obj_buffer_at_off( x ); \
-	inc_t     rs_x      = bli_obj_row_stride( x ); \
-	inc_t     cs_x      = bli_obj_col_stride( x ); \
-	void*     buf_norm  = bli_obj_buffer_at_off( norm ); \
+	doff_t diagoffx = bli_obj_diag_offset( x ); \
+	diag_t diagx    = bli_obj_diag( x ); \
+	uplo_t uplox    = bli_obj_uplo( x ); \
+	dim_t  m        = bli_obj_length( x ); \
+	dim_t  n        = bli_obj_width( x ); \
+	void*  buf_x    = bli_obj_buffer_at_off( x ); \
+	inc_t  rs_x     = bli_obj_row_stride( x ); \
+	inc_t  cs_x     = bli_obj_col_stride( x ); \
+	void*  buf_norm = bli_obj_buffer_at_off( norm ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x, norm ); \
+		PASTEMAC(opname,_check)( x, norm ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -229,7 +229,7 @@ GENFRONT( normim )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x  \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -237,14 +237,14 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	BLIS_OAPI_EX_DECLS \
 \
-	num_t     dt        = bli_obj_dt( x ); \
+	num_t dt    = bli_obj_dt( x ); \
 \
-	dim_t     n         = bli_obj_vector_dim( x ); \
-	void*     buf_x     = bli_obj_buffer_at_off( x ); \
-	inc_t     incx      = bli_obj_vector_inc( x ); \
+	dim_t n     = bli_obj_vector_dim( x ); \
+	void* buf_x = bli_obj_buffer_at_off( x ); \
+	inc_t incx  = bli_obj_vector_inc( x ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x ); \
+		PASTEMAC(opname,_check)( x ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -269,7 +269,7 @@ GENFRONT( randnv )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x  \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -277,18 +277,18 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	BLIS_OAPI_EX_DECLS \
 \
-	num_t     dt        = bli_obj_dt( x ); \
+	num_t  dt       = bli_obj_dt( x ); \
 \
-	doff_t    diagoffx  = bli_obj_diag_offset( x ); \
-	uplo_t    uplox     = bli_obj_uplo( x ); \
-	dim_t     m         = bli_obj_length( x ); \
-	dim_t     n         = bli_obj_width( x ); \
-	void*     buf_x     = bli_obj_buffer_at_off( x ); \
-	inc_t     rs_x      = bli_obj_row_stride( x ); \
-	inc_t     cs_x      = bli_obj_col_stride( x ); \
+	doff_t diagoffx = bli_obj_diag_offset( x ); \
+	uplo_t uplox    = bli_obj_uplo( x ); \
+	dim_t  m        = bli_obj_length( x ); \
+	dim_t  n        = bli_obj_width( x ); \
+	void*  buf_x    = bli_obj_buffer_at_off( x ); \
+	inc_t  rs_x     = bli_obj_row_stride( x ); \
+	inc_t  cs_x     = bli_obj_col_stride( x ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x ); \
+		PASTEMAC(opname,_check)( x ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -316,9 +316,9 @@ GENFRONT( randnm )
 \
 void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  scale, \
-       obj_t*  sumsq  \
+       const obj_t* x, \
+       const obj_t* scale, \
+       const obj_t* sumsq  \
        BLIS_OAPI_EX_PARAMS  \
      ) \
 { \
@@ -326,16 +326,16 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	BLIS_OAPI_EX_DECLS \
 \
-	num_t     dt        = bli_obj_dt( x ); \
+	num_t dt        = bli_obj_dt( x ); \
 \
-	dim_t     n         = bli_obj_vector_dim( x ); \
-	void*     buf_x     = bli_obj_buffer_at_off( x ); \
-	inc_t     incx      = bli_obj_vector_inc( x ); \
-	void*     buf_scale = bli_obj_buffer_at_off( scale ); \
-	void*     buf_sumsq = bli_obj_buffer_at_off( sumsq ); \
+	dim_t n         = bli_obj_vector_dim( x ); \
+	void* buf_x     = bli_obj_buffer_at_off( x ); \
+	inc_t incx      = bli_obj_vector_inc( x ); \
+	void* buf_scale = bli_obj_buffer_at_off( scale ); \
+	void* buf_sumsq = bli_obj_buffer_at_off( sumsq ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x, scale, sumsq ); \
+		PASTEMAC(opname,_check)( x, scale, sumsq ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -366,19 +366,19 @@ GENFRONT( sumsqv )
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  chi, \
-       obj_t*  psi, \
-       bool*   is_eq  \
+       const obj_t* chi, \
+       const obj_t* psi, \
+             bool*  is_eq  \
      ) \
 { \
 	bli_init_once(); \
 \
-	num_t     dt_chi    = bli_obj_dt( chi ); \
-	num_t     dt_psi    = bli_obj_dt( psi ); \
-	num_t     dt; \
+	num_t dt_chi = bli_obj_dt( chi ); \
+	num_t dt_psi = bli_obj_dt( psi ); \
+	num_t dt; \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( chi, psi, is_eq ); \
+		PASTEMAC(opname,_check)( chi, psi, is_eq ); \
 \
 	/* Decide which datatype will be used to query the buffer from the
 	   constant object (if there is one). */ \
@@ -427,29 +427,29 @@ GENFRONT( eqsc )
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  x, \
-       obj_t*  y, \
-       bool*   is_eq  \
+       const obj_t* x, \
+       const obj_t* y, \
+             bool*  is_eq  \
      ) \
 { \
 	bli_init_once(); \
 \
-	num_t     dt        = bli_obj_dt( x ); \
+	num_t dt    = bli_obj_dt( x ); \
 \
-	dim_t     n         = bli_obj_vector_dim( x ); \
-	void*     buf_x     = bli_obj_buffer_at_off( x ); \
-	inc_t     inc_x     = bli_obj_vector_inc( x ); \
-	void*     buf_y     = bli_obj_buffer_at_off( y ); \
-	inc_t     inc_y     = bli_obj_vector_inc( y ); \
+	dim_t n     = bli_obj_vector_dim( x ); \
+	void* buf_x = bli_obj_buffer_at_off( x ); \
+	inc_t inc_x = bli_obj_vector_inc( x ); \
+	void* buf_y = bli_obj_buffer_at_off( y ); \
+	inc_t inc_y = bli_obj_vector_inc( y ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x, y, is_eq ); \
+		PASTEMAC(opname,_check)( x, y, is_eq ); \
 \
 	/* Query the conj status of each object and use the two to come up with a
 	   single "net" conj_t value. */ \
-	conj_t conjx   = bli_obj_conj_status( x ); \
-	conj_t conjy   = bli_obj_conj_status( y ); \
-	conj_t conj    = bli_apply_conj( conjx, conjy ); \
+	conj_t conjx = bli_obj_conj_status( x ); \
+	conj_t conjy = bli_obj_conj_status( y ); \
+	conj_t conj  = bli_apply_conj( conjx, conjy ); \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
@@ -474,29 +474,29 @@ GENFRONT( eqv )
 \
 void PASTEMAC0(opname) \
      ( \
-       obj_t*  x, \
-       obj_t*  y, \
-       bool*   is_eq  \
+       const obj_t* x, \
+       const obj_t* y, \
+             bool*  is_eq  \
      ) \
 { \
 	bli_init_once(); \
 \
-	num_t     dt        = bli_obj_dt( x ); \
+	num_t  dt       = bli_obj_dt( x ); \
 \
-	doff_t    diagoffx  = bli_obj_diag_offset( x ); \
-	diag_t    diagx     = bli_obj_diag( x ); \
-	uplo_t    uplox     = bli_obj_uplo( x ); \
-	dim_t     m         = bli_obj_length( y ); \
-	dim_t     n         = bli_obj_width( y ); \
-	void*     buf_x     = bli_obj_buffer_at_off( x ); \
-	inc_t     rs_x      = bli_obj_row_stride( x ); \
-	inc_t     cs_x      = bli_obj_col_stride( x ); \
-	void*     buf_y     = bli_obj_buffer_at_off( y ); \
-	inc_t     rs_y      = bli_obj_row_stride( y ); \
-	inc_t     cs_y      = bli_obj_col_stride( y ); \
+	doff_t diagoffx = bli_obj_diag_offset( x ); \
+	diag_t diagx    = bli_obj_diag( x ); \
+	uplo_t uplox    = bli_obj_uplo( x ); \
+	dim_t  m        = bli_obj_length( y ); \
+	dim_t  n        = bli_obj_width( y ); \
+	void*  buf_x    = bli_obj_buffer_at_off( x ); \
+	inc_t  rs_x     = bli_obj_row_stride( x ); \
+	inc_t  cs_x     = bli_obj_col_stride( x ); \
+	void*  buf_y    = bli_obj_buffer_at_off( y ); \
+	inc_t  rs_y     = bli_obj_row_stride( y ); \
+	inc_t  cs_y     = bli_obj_col_stride( y ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( x, y, is_eq ); \
+		PASTEMAC(opname,_check)( x, y, is_eq ); \
 \
 	/* Query the combined trans and conj status of each object and use the two
 	   to come up with a single "net" trans_t value. */ \
@@ -531,23 +531,23 @@ GENFRONT( eqm )
 \
 void PASTEMAC0(opname) \
      ( \
-       FILE*   file, \
-       char*   s1, \
-       obj_t*  x, \
-       char*   format, \
-       char*   s2  \
+             FILE*  file, \
+       const char*  s1, \
+       const obj_t* x, \
+       const char*  format, \
+       const char*  s2  \
      ) \
 { \
 	bli_init_once(); \
 \
-	num_t     dt        = bli_obj_dt( x ); \
+	num_t dt    = bli_obj_dt( x ); \
 \
-	dim_t     n         = bli_obj_vector_dim( x ); \
-	void*     buf_x     = bli_obj_buffer_at_off( x ); \
-	inc_t     incx      = bli_obj_vector_inc( x ); \
+	dim_t n     = bli_obj_vector_dim( x ); \
+	void* buf_x = bli_obj_buffer_at_off( x ); \
+	inc_t incx  = bli_obj_vector_inc( x ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( file, s1, x, format, s2 ); \
+		PASTEMAC(opname,_check)( file, s1, x, format, s2 ); \
 \
 	/* Handle constants up front. */ \
 	if ( dt == BLIS_CONSTANT ) \
@@ -579,34 +579,34 @@ GENFRONT( fprintv )
 \
 void PASTEMAC0(opname) \
      ( \
-       FILE*   file, \
-       char*   s1, \
-       obj_t*  x, \
-       char*   format, \
-       char*   s2  \
+             FILE*  file, \
+       const char*  s1, \
+       const obj_t* x, \
+       const char*  format, \
+       const char*  s2  \
      ) \
 { \
 	bli_init_once(); \
 \
-	num_t     dt        = bli_obj_dt( x ); \
+	num_t dt    = bli_obj_dt( x ); \
 \
-	dim_t     m         = bli_obj_length( x ); \
-	dim_t     n         = bli_obj_width( x ); \
-	void*     buf_x     = bli_obj_buffer_at_off( x ); \
-	inc_t     rs_x      = bli_obj_row_stride( x ); \
-	inc_t     cs_x      = bli_obj_col_stride( x ); \
+	dim_t m     = bli_obj_length( x ); \
+	dim_t n     = bli_obj_width( x ); \
+	void* buf_x = bli_obj_buffer_at_off( x ); \
+	inc_t rs_x  = bli_obj_row_stride( x ); \
+	inc_t cs_x  = bli_obj_col_stride( x ); \
 \
 	if ( bli_error_checking_is_enabled() ) \
-	    PASTEMAC(opname,_check)( file, s1, x, format, s2 ); \
+		PASTEMAC(opname,_check)( file, s1, x, format, s2 ); \
 \
 	/* Handle constants up front. */ \
 	if ( dt == BLIS_CONSTANT ) \
 	{ \
-		float*    sp = bli_obj_buffer_for_const( BLIS_FLOAT,    x ); \
-		double*   dp = bli_obj_buffer_for_const( BLIS_DOUBLE,   x ); \
-		scomplex* cp = bli_obj_buffer_for_const( BLIS_SCOMPLEX, x ); \
-		dcomplex* zp = bli_obj_buffer_for_const( BLIS_DCOMPLEX, x ); \
-		gint_t*   ip = bli_obj_buffer_for_const( BLIS_INT,      x ); \
+		const float*    sp = bli_obj_buffer_for_const( BLIS_FLOAT,    x ); \
+		const double*   dp = bli_obj_buffer_for_const( BLIS_DOUBLE,   x ); \
+		const scomplex* cp = bli_obj_buffer_for_const( BLIS_SCOMPLEX, x ); \
+		const dcomplex* zp = bli_obj_buffer_for_const( BLIS_DCOMPLEX, x ); \
+		const gint_t*   ip = bli_obj_buffer_for_const( BLIS_INT,      x ); \
 \
 		fprintf( file, "%s\n", s1 ); \
 		fprintf( file, " float:     %9.2e\n",         bli_sreal( *sp ) ); \
@@ -645,10 +645,10 @@ GENFRONT( fprintm )
 \
 void PASTEMAC0(opname) \
      ( \
-       char*   s1, \
-       obj_t*  x, \
-       char*   format, \
-       char*   s2  \
+       const char*  s1, \
+       const obj_t* x, \
+       const char*  format, \
+       const char*  s2  \
      ) \
 { \
 	bli_init_once(); \
diff --git a/frame/util/bli_util_oapi.h b/frame/util/bli_util_oapi.h
index 92ce6c95f..ab48f841a 100644
--- a/frame/util/bli_util_oapi.h
+++ b/frame/util/bli_util_oapi.h
@@ -42,8 +42,8 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  asum  \
+       const obj_t* x, \
+       const obj_t* asum  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -55,7 +55,7 @@ GENPROT( asumv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  a  \
+       const obj_t* a  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -69,8 +69,8 @@ GENPROT( mktrim )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  norm  \
+       const obj_t* x, \
+       const obj_t* norm  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -84,8 +84,8 @@ GENPROT( normiv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  norm  \
+       const obj_t* x, \
+       const obj_t* norm  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -99,7 +99,7 @@ GENPROT( normim )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x  \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -112,7 +112,7 @@ GENPROT( randnv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x  \
+       const obj_t* x  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -125,9 +125,9 @@ GENPROT( randnm )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
      ( \
-       obj_t*  x, \
-       obj_t*  scale, \
-       obj_t*  sumsq  \
+       const obj_t* x, \
+       const obj_t* scale, \
+       const obj_t* sumsq  \
        BLIS_OAPI_EX_PARAMS  \
      );
 
@@ -139,42 +139,15 @@ GENPROT( sumsqv )
 
 #ifdef BLIS_OAPI_BASIC
 
-/*
-#undef  GENPROT
-#define GENPROT( opname ) \
-\
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
-     ( \
-       obj_t*  chi, \
-       obj_t*  psi, \
-       bool*   is_eq  \
-     );
-
-GENPROT( eqsc )
-
-
-#undef  GENPROT
-#define GENPROT( opname ) \
-\
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
-     ( \
-       obj_t*  x, \
-       obj_t*  y, \
-       bool*   is_eq  \
-     );
-
-GENPROT( eqv )
-*/
-
 
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
 BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
      ( \
-       obj_t*  x, \
-       obj_t*  y, \
-       bool*   is_eq  \
+       const obj_t* x, \
+       const obj_t* y, \
+             bool*  is_eq  \
      );
 
 GENPROT( eqsc )
@@ -187,11 +160,11 @@ GENPROT( eqm )
 \
 BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
      ( \
-       FILE*   file, \
-       char*   s1, \
-       obj_t*  x, \
-       char*   format, \
-       char*   s2  \
+             FILE*  file, \
+       const char*  s1, \
+       const obj_t* x, \
+       const char*  format, \
+       const char*  s2  \
      );
 
 GENPROT( fprintv )
@@ -203,10 +176,10 @@ GENPROT( fprintm )
 \
 BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
      ( \
-       char*   s1, \
-       obj_t*  x, \
-       char*   format, \
-       char*   s2  \
+       const char*  s1, \
+       const obj_t* x, \
+       const char*  format, \
+       const char*  s2  \
      );
 
 GENPROT( printv )
diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c
index ca0b3c279..abc9c9089 100644
--- a/frame/util/bli_util_tapi.c
+++ b/frame/util/bli_util_tapi.c
@@ -45,9 +45,9 @@
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       dim_t    n, \
-       ctype*   x, inc_t incx, \
-       ctype_r* asum  \
+             dim_t    n, \
+       const ctype*   x, inc_t incx, \
+             ctype_r* asum  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -71,9 +71,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	PASTEMAC2(ch,opname,_unb_var1) \
 	( \
 	  n, \
-	  x, incx, \
-	  asum, \
-	  cntx, \
+	  ( ctype* )x, incx, \
+	            asum, \
+	  ( cntx_t* )cntx, \
 	  rntm  \
 	); \
 }
@@ -86,9 +86,9 @@ INSERT_GENTFUNCR_BASIC0( asumv )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       uplo_t  uploa, \
-       dim_t   m, \
-       ctype*  a, inc_t rs_a, inc_t cs_a  \
+       uplo_t uploa, \
+       dim_t  m, \
+       ctype* a, inc_t rs_a, inc_t cs_a  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -109,7 +109,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  uploa, \
 	  m, \
 	  a, rs_a, cs_a, \
-	  cntx, \
+	  ( cntx_t* )cntx, \
 	  rntm  \
 	); \
 }
@@ -124,9 +124,9 @@ INSERT_GENTFUNC_BASIC0( mktrim )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       dim_t    n, \
-       ctype*   x, inc_t incx, \
-       ctype_r* norm  \
+             dim_t    n, \
+       const ctype*   x, inc_t incx, \
+             ctype_r* norm  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -150,9 +150,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	PASTEMAC2(ch,opname,_unb_var1) \
 	( \
 	  n, \
-	  x, incx, \
-	  norm, \
-	  cntx, \
+	  ( ctype* )x, incx, \
+	            norm, \
+	  ( cntx_t* )cntx, \
 	  rntm  \
 	); \
 }
@@ -167,13 +167,13 @@ INSERT_GENTFUNCR_BASIC0( normiv )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t   diagoffx, \
-       diag_t   diagx, \
-       uplo_t   uplox, \
-       dim_t    m, \
-       dim_t    n, \
-       ctype*   x, inc_t rs_x, inc_t cs_x, \
-       ctype_r* norm  \
+             doff_t   diagoffx, \
+             diag_t   diagx, \
+             uplo_t   uplox, \
+             dim_t    m, \
+             dim_t    n, \
+       const ctype*   x, inc_t rs_x, inc_t cs_x, \
+             ctype_r* norm  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -201,9 +201,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  uplox, \
 	  m, \
 	  n, \
-	  x, rs_x, cs_x, \
-	  norm, \
-	  cntx, \
+	  ( ctype* )x, rs_x, cs_x, \
+	            norm, \
+	  ( cntx_t* )cntx, \
 	  rntm  \
 	); \
 }
@@ -218,8 +218,8 @@ INSERT_GENTFUNCR_BASIC0( normim )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       dim_t    n, \
-       ctype*   x, inc_t incx  \
+       dim_t  n, \
+       ctype* x, inc_t incx  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -247,7 +247,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 		( \
 		  n, \
 		  x, incx, \
-		  cntx, \
+		  ( cntx_t* )cntx, \
 		  rntm  \
 		); \
 \
@@ -274,11 +274,11 @@ INSERT_GENTFUNCR_BASIC0( randnv )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       uplo_t  uplox, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x  \
+       doff_t diagoffx, \
+       uplo_t uplox, \
+       dim_t  m, \
+       dim_t  n, \
+       ctype* x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -309,7 +309,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 		  m, \
 		  n, \
 		  x, rs_x, cs_x, \
-		  cntx, \
+		  ( cntx_t* )cntx, \
 		  rntm  \
 		); \
 \
@@ -340,10 +340,10 @@ INSERT_GENTFUNCR_BASIC0( randnm )
 \
 void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       dim_t    n, \
-       ctype*   x, inc_t incx, \
-       ctype_r* scale, \
-       ctype_r* sumsq  \
+             dim_t    n, \
+       const ctype*   x, inc_t incx, \
+             ctype_r* scale, \
+             ctype_r* sumsq  \
        BLIS_TAPI_EX_PARAMS  \
      ) \
 { \
@@ -362,10 +362,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	PASTEMAC2(ch,opname,_unb_var1) \
 	( \
 	  n, \
-	  x, incx, \
-	  scale, \
-	  sumsq, \
-	  cntx, \
+	  ( ctype* )x, incx, \
+	            scale, \
+	            sumsq, \
+	  ( cntx_t* )cntx, \
 	  rntm  \
 	); \
 }
@@ -383,10 +383,10 @@ INSERT_GENTFUNCR_BASIC0( sumsqv )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t  conjchi, \
-       ctype*  chi, \
-       ctype*  psi, \
-       bool*   is_eq  \
+             conj_t conjchi, \
+       const ctype* chi, \
+       const ctype* psi, \
+             bool*  is_eq  \
      ) \
 { \
 	bli_init_once(); \
@@ -406,11 +406,11 @@ INSERT_GENTFUNC_BASIC0( eqsc )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t  conjx, \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy, \
-       bool*   is_eq  \
+             conj_t conjx, \
+             dim_t  n, \
+       const ctype* x, inc_t incx, \
+       const ctype* y, inc_t incy, \
+             bool*  is_eq  \
      ) \
 { \
 	bli_init_once(); \
@@ -425,8 +425,8 @@ void PASTEMAC(ch,opname) \
 	( \
 	  conjx, \
 	  n, \
-	  x, incx, \
-	  y, incy  \
+	  ( ctype* )x, incx, \
+	  ( ctype* )y, incy  \
 	); \
 }
 
@@ -438,15 +438,15 @@ INSERT_GENTFUNC_BASIC0( eqv )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y, \
-       bool*   is_eq  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             uplo_t  uplox, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+       const ctype*  y, inc_t rs_y, inc_t cs_y, \
+             bool*   is_eq  \
      ) \
 { \
 	bli_init_once(); \
@@ -467,8 +467,8 @@ void PASTEMAC(ch,opname) \
 	  transx, \
 	  m, \
 	  n, \
-	  x, rs_x, cs_x, \
-	  y, rs_y, cs_y  \
+	  ( ctype* )x, rs_x, cs_x, \
+	  ( ctype* )y, rs_y, cs_y  \
 	); \
 }
 
@@ -480,11 +480,11 @@ INSERT_GENTFUNC_BASIC0( eqm )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       char*  s1, \
-       dim_t  n, \
-       void*  x, inc_t incx, \
-       char*  format, \
-       char*  s2  \
+       const char* s1, \
+             dim_t n, \
+       const void* x, inc_t incx, \
+       const char* format, \
+       const char* s2  \
      ) \
 { \
 	bli_init_once(); \
@@ -508,12 +508,12 @@ INSERT_GENTFUNC_BASIC_I( printv, fprintv )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       char*  s1, \
-       dim_t  m, \
-       dim_t  n, \
-       void*  x, inc_t rs_x, inc_t cs_x, \
-       char*  format, \
-       char*  s2  \
+       const char* s1, \
+             dim_t m, \
+             dim_t n, \
+       const void* x, inc_t rs_x, inc_t cs_x, \
+       const char* format, \
+       const char* s2  \
      ) \
 { \
 	bli_init_once(); \
diff --git a/frame/util/bli_util_tapi.h b/frame/util/bli_util_tapi.h
index 43fbbdb06..29c67df23 100644
--- a/frame/util/bli_util_tapi.h
+++ b/frame/util/bli_util_tapi.h
@@ -42,9 +42,9 @@
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       dim_t    n, \
-       ctype*   x, inc_t incx, \
-       ctype_r* asum  \
+             dim_t    n, \
+       const ctype*   x, inc_t incx, \
+             ctype_r* asum  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -56,9 +56,9 @@ INSERT_GENTPROTR_BASIC0( asumv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       uplo_t  uploa, \
-       dim_t   m, \
-       ctype*  a, inc_t rs_a, inc_t cs_a  \
+       uplo_t uploa, \
+       dim_t  m, \
+       ctype* a, inc_t rs_a, inc_t cs_a  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -72,9 +72,9 @@ INSERT_GENTPROT_BASIC0( mktrim )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       dim_t    n, \
-       ctype*   x, inc_t incx, \
-       ctype_r* norm  \
+             dim_t    n, \
+       const ctype*   x, inc_t incx, \
+             ctype_r* norm  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -88,13 +88,13 @@ INSERT_GENTPROTR_BASIC0( normiv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t   diagoffx, \
-       diag_t   diagx, \
-       uplo_t   uplox, \
-       dim_t    m, \
-       dim_t    n, \
-       ctype*   x, inc_t rs_x, inc_t cs_x, \
-       ctype_r* norm  \
+             doff_t   diagoffx, \
+             diag_t   diagx, \
+             uplo_t   uplox, \
+             dim_t    m, \
+             dim_t    n, \
+       const ctype*   x, inc_t rs_x, inc_t cs_x, \
+             ctype_r* norm  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -108,8 +108,8 @@ INSERT_GENTPROTR_BASIC0( normim )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       dim_t    n, \
-       ctype*   x, inc_t incx  \
+       dim_t  n, \
+       ctype* x, inc_t incx  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -122,11 +122,11 @@ INSERT_GENTPROT_BASIC0( randnv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       doff_t  diagoffx, \
-       uplo_t  uplox, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x  \
+       doff_t diagoffx, \
+       uplo_t uplox, \
+       dim_t  m, \
+       dim_t  n, \
+       ctype* x, inc_t rs_x, inc_t cs_x  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -139,10 +139,10 @@ INSERT_GENTPROT_BASIC0( randnm )
 \
 BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
      ( \
-       dim_t    n, \
-       ctype*   x, inc_t incx, \
-       ctype_r* scale, \
-       ctype_r* sumsq  \
+             dim_t    n, \
+       const ctype*   x, inc_t incx, \
+             ctype_r* scale, \
+             ctype_r* sumsq  \
        BLIS_TAPI_EX_PARAMS  \
      );
 
@@ -159,10 +159,10 @@ INSERT_GENTPROTR_BASIC0( sumsqv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       conj_t  conjchi, \
-       ctype*  chi, \
-       ctype*  psi, \
-       bool*   is_eq  \
+             conj_t conjchi, \
+       const ctype* chi, \
+       const ctype* psi, \
+             bool*  is_eq  \
      );
 
 INSERT_GENTPROT_BASIC0( eqsc )
@@ -173,11 +173,11 @@ INSERT_GENTPROT_BASIC0( eqsc )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
       ( \
-        conj_t  conjx, \
-        dim_t   n, \
-        ctype*  x, inc_t incx, \
-        ctype*  y, inc_t incy, \
-        bool*   is_eq  \
+              conj_t conjx, \
+              dim_t  n, \
+        const ctype* x, inc_t incx, \
+        const ctype* y, inc_t incy, \
+              bool*  is_eq  \
       );
 
 INSERT_GENTPROT_BASIC0( eqv )
@@ -188,15 +188,15 @@ INSERT_GENTPROT_BASIC0( eqv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       doff_t  diagoffx, \
-       diag_t  diagx, \
-       uplo_t  uplox, \
-       trans_t transx, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  x, inc_t rs_x, inc_t cs_x, \
-       ctype*  y, inc_t rs_y, inc_t cs_y, \
-       bool*   is_eq  \
+             doff_t  diagoffx, \
+             diag_t  diagx, \
+             uplo_t  uplox, \
+             trans_t transx, \
+             dim_t   m, \
+             dim_t   n, \
+       const ctype*  x, inc_t rs_x, inc_t cs_x, \
+       const ctype*  y, inc_t rs_y, inc_t cs_y, \
+             bool*   is_eq  \
      );
 
 INSERT_GENTPROT_BASIC0( eqm )
@@ -207,11 +207,11 @@ INSERT_GENTPROT_BASIC0( eqm )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       char*  s1, \
-       dim_t  n, \
-       void*  x, inc_t incx, \
-       char*  format, \
-       char*  s2  \
+       const char* s1, \
+             dim_t n, \
+       const void* x, inc_t incx, \
+       const char* format, \
+       const char* s2  \
      );
 
 INSERT_GENTPROT_BASIC0_I( printv )
@@ -222,12 +222,12 @@ INSERT_GENTPROT_BASIC0_I( printv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       char*  s1, \
-       dim_t  m, \
-       dim_t  n, \
-       void*  x, inc_t rs_x, inc_t cs_x, \
-       char*  format, \
-       char*  s2  \
+       const char* s1, \
+             dim_t m, \
+             dim_t n, \
+       const void* x, inc_t rs_x, inc_t cs_x, \
+       const char* format, \
+       const char* s2  \
      );
 
 INSERT_GENTPROT_BASIC0_I( printm )
diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c
index af550681a..2b65c8460 100644
--- a/frame/util/bli_util_unb_var1.c
+++ b/frame/util/bli_util_unb_var1.c
@@ -52,11 +52,11 @@ void PASTEMAC(ch,varname) \
        rntm_t*  rntm  \
      ) \
 { \
-	ctype*   chi1; \
-	ctype_r  chi1_r; \
-	ctype_r  chi1_i; \
-	ctype_r  absum; \
-	dim_t    i; \
+	ctype*  chi1; \
+	ctype_r chi1_r; \
+	ctype_r chi1_i; \
+	ctype_r absum; \
+	dim_t   i; \
 \
 	/* Initialize the absolute sum accumulator to zero. */ \
 	PASTEMAC(chr,set0s)( absum ); \
@@ -239,10 +239,10 @@ void PASTEMAC(ch,varname) \
        rntm_t*  rntm  \
      ) \
 { \
-	ctype*   chi1; \
-	ctype_r  abs_chi1; \
-	ctype_r  absum; \
-	dim_t    i; \
+	ctype*  chi1; \
+	ctype_r abs_chi1; \
+	ctype_r absum; \
+	dim_t   i; \
 \
 	/* Initialize the absolute sum accumulator to zero. */ \
 	PASTEMAC(chr,set0s)( absum ); \
@@ -455,10 +455,10 @@ void PASTEMAC(ch,varname) \
        rntm_t*  rntm  \
      ) \
 { \
-	ctype*   chi1; \
-	ctype_r  abs_chi1; \
-	ctype_r  abs_chi1_max; \
-	dim_t    i; \
+	ctype*  chi1; \
+	ctype_r abs_chi1; \
+	ctype_r abs_chi1_max; \
+	dim_t   i; \
 \
 	/* Initialize the maximum absolute value to zero. */ \
 	PASTEMAC(chr,set0s)( abs_chi1_max ); \
@@ -505,19 +505,19 @@ void PASTEMAC(ch,varname) \
        rntm_t*  rntm  \
      ) \
 { \
-	ctype*   one       = PASTEMAC(ch,1); \
-	ctype*   x0; \
-	ctype*   chi1; \
-	ctype*   x2; \
-	ctype_r  absum_max; \
-	ctype_r  absum_j; \
-	ctype_r  abval_chi1; \
-	uplo_t   uplox_eff; \
-	dim_t    n_iter; \
-	dim_t    n_elem, n_elem_max; \
-	inc_t    ldx, incx; \
-	dim_t    j, i; \
-	dim_t    ij0, n_shift; \
+	ctype*  one = PASTEMAC(ch,1); \
+	ctype*  x0; \
+	ctype*  chi1; \
+	ctype*  x2; \
+	ctype_r absum_max; \
+	ctype_r absum_j; \
+	ctype_r abval_chi1; \
+	uplo_t  uplox_eff; \
+	dim_t   n_iter; \
+	dim_t   n_elem, n_elem_max; \
+	inc_t   ldx, incx; \
+	dim_t   j, i; \
+	dim_t   ij0, n_shift; \
 \
 	/* Initialize the maximum absolute column sum to zero. */ \
 	PASTEMAC(chr,set0s)( absum_max ); \
@@ -904,20 +904,20 @@ void PASTEMAC(ch,varname) \
        rntm_t* rntm  \
      ) \
 { \
-	ctype*  one = PASTEMAC(ch,1); \
-	ctype*  x0; \
-	ctype*  x1; \
-	ctype*  x2; \
-	ctype*  chi1; \
-	ctype   beta; \
-	ctype   omega; \
-	double  max_m_n; \
-	uplo_t  uplox_eff; \
-	dim_t   n_iter; \
-	dim_t   n_elem, n_elem_max; \
-	inc_t   ldx, incx; \
-	dim_t   j, i; \
-	dim_t   ij0, n_shift; \
+	ctype* one = PASTEMAC(ch,1); \
+	ctype* x0; \
+	ctype* x1; \
+	ctype* x2; \
+	ctype* chi1; \
+	ctype  beta; \
+	ctype  omega; \
+	double max_m_n; \
+	uplo_t uplox_eff; \
+	dim_t  n_iter; \
+	dim_t  n_elem, n_elem_max; \
+	inc_t  ldx, incx; \
+	dim_t  j, i; \
+	dim_t  ij0, n_shift; \
 \
 	/* Set various loop parameters. Here, we pretend that diagx is equal to
 	   BLIS_NONUNIT_DIAG because we handle the unit diagonal case manually. */ \
@@ -1059,16 +1059,16 @@ void PASTEMAC(ch,varname) \
        rntm_t*  rntm  \
      ) \
 { \
-	const ctype_r zero_r = *PASTEMAC(chr,0); \
-	const ctype_r one_r  = *PASTEMAC(chr,1); \
+	ctype_r zero_r = *PASTEMAC(chr,0); \
+	ctype_r one_r  = *PASTEMAC(chr,1); \
 \
-	ctype*        chi1; \
-	ctype_r       chi1_r; \
-	ctype_r       chi1_i; \
-	ctype_r       scale_r; \
-	ctype_r       sumsq_r; \
-	ctype_r       abs_chi1_r; \
-	dim_t         i; \
+	ctype*  chi1; \
+	ctype_r chi1_r; \
+	ctype_r chi1_i; \
+	ctype_r scale_r; \
+	ctype_r sumsq_r; \
+	ctype_r abs_chi1_r; \
+	dim_t   i; \
 \
 	/* NOTE: This function attempts to mimic the algorithm for computing
 	   the Frobenius norm in netlib LAPACK's ?lassq(). */ \
@@ -1143,10 +1143,10 @@ INSERT_GENTFUNCR_BASIC0( sumsqv_unb_var1 )
 \
 bool PASTEMAC(ch,opname) \
      ( \
-       conj_t  conjx, \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy  \
+       conj_t conjx, \
+       dim_t  n, \
+       ctype* x, inc_t incx, \
+       ctype* y, inc_t incy  \
      ) \
 { \
 	for ( dim_t i = 0; i < n; ++i ) \
@@ -1298,25 +1298,23 @@ INSERT_GENTFUNC_BASIC0( eqm_unb_var1 )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       FILE*  file, \
-       char*  s1, \
-       dim_t  n, \
-       ctype* x, inc_t incx, \
-       char*  format, \
-       char*  s2  \
+             FILE*  file, \
+       const char*  s1, \
+             dim_t  n, \
+       const ctype* x, inc_t incx, \
+       const char*  format, \
+       const char*  s2  \
      ) \
 { \
-	dim_t  i; \
-	ctype* chi1; \
-	char   default_spec[32] = PASTEMAC(ch,formatspec)(); \
+	const char default_spec[32] = PASTEMAC(ch,formatspec)(); \
 \
 	if ( format == NULL ) format = default_spec; \
 \
-	chi1 = x; \
+	const ctype*chi1 = x; \
 \
 	fprintf( file, "%s\n", s1 ); \
 \
-	for ( i = 0; i < n; ++i ) \
+	for ( dim_t i = 0; i < n; ++i ) \
 	{ \
 		PASTEMAC(ch,fprints)( file, format, *chi1 ); \
 		fprintf( file, "\n" ); \
@@ -1335,28 +1333,26 @@ INSERT_GENTFUNC_BASIC0_I( fprintv )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       FILE*  file, \
-       char*  s1, \
-       dim_t  m, \
-       dim_t  n, \
-       ctype* x, inc_t rs_x, inc_t cs_x, \
-       char*  format, \
-       char*  s2  \
+             FILE*  file, \
+       const char*  s1, \
+             dim_t  m, \
+             dim_t  n, \
+       const ctype* x, inc_t rs_x, inc_t cs_x, \
+       const char*  format, \
+       const char*  s2  \
      ) \
 { \
-	dim_t  i, j; \
-	ctype* chi1; \
-	char   default_spec[32] = PASTEMAC(ch,formatspec)(); \
+	const char default_spec[32] = PASTEMAC(ch,formatspec)(); \
 \
 	if ( format == NULL ) format = default_spec; \
 \
 	fprintf( file, "%s\n", s1 ); \
 \
-	for ( i = 0; i < m; ++i ) \
+	for ( dim_t i = 0; i < m; ++i ) \
 	{ \
-		for ( j = 0; j < n; ++j ) \
+		for ( dim_t j = 0; j < n; ++j ) \
 		{ \
-			chi1 = (( ctype* ) x) + i*rs_x + j*cs_x; \
+			const ctype* chi1 = (( ctype* ) x) + i*rs_x + j*cs_x; \
 \
 			PASTEMAC(ch,fprints)( file, format, *chi1 ); \
 			fprintf( file, " " ); \
diff --git a/frame/util/bli_util_unb_var1.h b/frame/util/bli_util_unb_var1.h
index f87848856..435efa4ac 100644
--- a/frame/util/bli_util_unb_var1.h
+++ b/frame/util/bli_util_unb_var1.h
@@ -162,10 +162,10 @@ INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 )
 \
 bool PASTEMAC(ch,varname) \
      ( \
-       conj_t  conjx, \
-       dim_t   n, \
-       ctype*  x, inc_t incx, \
-       ctype*  y, inc_t incy  \
+       conj_t conjx, \
+       dim_t  n, \
+       ctype* x, inc_t incx, \
+       ctype* y, inc_t incy  \
      );
 
 INSERT_GENTPROT_BASIC0( eqv_unb_var1 )
@@ -194,12 +194,12 @@ INSERT_GENTPROT_BASIC0( eqm_unb_var1 )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       FILE*  file, \
-       char*  s1, \
-       dim_t  n, \
-       ctype* x, inc_t incx, \
-       char*  format, \
-       char*  s2  \
+             FILE*  file, \
+       const char*  s1, \
+             dim_t  n, \
+       const ctype* x, inc_t incx, \
+       const char*  format, \
+       const char*  s2  \
      );
 
 INSERT_GENTPROT_BASIC0_I( fprintv )
@@ -210,13 +210,13 @@ INSERT_GENTPROT_BASIC0_I( fprintv )
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
-       FILE*  file, \
-       char*  s1, \
-       dim_t  m, \
-       dim_t  n, \
-       ctype* x, inc_t rs_x, inc_t cs_x, \
-       char*  format, \
-       char*  s2  \
+             FILE*  file, \
+       const char*  s1, \
+             dim_t  m, \
+             dim_t  n, \
+       const ctype* x, inc_t rs_x, inc_t cs_x, \
+       const char*  format, \
+       const char*  s2  \
      );
 
 INSERT_GENTPROT_BASIC0_I( fprintm )
diff --git a/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c b/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c
index 7171347bf..8f1122b45 100644
--- a/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c
@@ -52,7 +52,7 @@ void bli_dpackm_armsve256_int_8xk
        double* restrict kappa,
        double* restrict a, inc_t inca_, inc_t lda_,
        double* restrict p,              inc_t ldp_,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
     const int64_t cdim  = cdim_;
diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
index a086b3a76..5866ed26f 100644
--- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
@@ -51,7 +51,7 @@ void bli_dpackm_armsve512_asm_10xk
        double* restrict kappa,
        double* restrict a, inc_t inca_, inc_t lda_,
        double* restrict p,              inc_t ldp_,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
     const int64_t cdim  = cdim_;
diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
index aeb323c0c..88ccb4b8e 100644
--- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
@@ -50,7 +50,7 @@ void bli_dpackm_armsve512_asm_16xk
        double* restrict kappa,
        double* restrict a, inc_t inca_, inc_t lda_,
        double* restrict p,              inc_t ldp_,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
     const int64_t cdim  = cdim_;
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
index 098d5d4b5..9bc7fd949 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
@@ -53,8 +53,8 @@ void bli_cgemm_armsve_asm_2vx10_unindexed
        scomplex*  restrict b,
        scomplex*  restrict beta,
        scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   void* a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
index 0ee470f24..1c9d68dec 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
@@ -53,8 +53,8 @@ void bli_dgemm_armsve_asm_2vx10_unindexed
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   void* a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
index d03af5923..05005f8c3 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
@@ -53,8 +53,8 @@ void bli_sgemm_armsve_asm_2vx10_unindexed
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   void* a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
index 8636a527b..210d40f0b 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
@@ -53,8 +53,8 @@ void bli_zgemm_armsve_asm_2vx10_unindexed
        dcomplex*  restrict b,
        dcomplex*  restrict beta,
        dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   void* a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c
index c248285c3..4dec190e0 100644
--- a/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c
+++ b/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c
@@ -43,7 +43,7 @@ void bli_sgemm_armv7a_ker_4x4
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, uint32_t rs_c, uint32_t cs_c,
-       auxinfo_t* restrict data
+       auxinfo_t*          data
      );
 
 void bli_sgemm_armv7a_asm_4x4
@@ -56,8 +56,8 @@ void bli_sgemm_armv7a_asm_4x4
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
@@ -78,7 +78,7 @@ void bli_dgemm_armv7a_ker_4x4
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, uint32_t rs_c, uint32_t cs_c,
-       auxinfo_t* restrict data
+       auxinfo_t*          data
      );
 
 void bli_dgemm_armv7a_asm_4x4
@@ -91,8 +91,8 @@ void bli_dgemm_armv7a_asm_4x4
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
@@ -113,7 +113,7 @@ void bli_cgemm_armv7a_ker_2x2
        scomplex*  restrict b,
        scomplex*  restrict beta,
        scomplex*  restrict c, uint32_t rs_c, uint32_t cs_c,
-       auxinfo_t* restrict data
+       auxinfo_t*          data
      );
 
 void bli_cgemm_armv7a_asm_2x2
@@ -126,8 +126,8 @@ void bli_cgemm_armv7a_asm_2x2
        scomplex*  restrict b,
        scomplex*  restrict beta,
        scomplex*  restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
@@ -148,7 +148,7 @@ void bli_zgemm_armv7a_ker_2x2
        dcomplex*  restrict b,
        dcomplex*  restrict beta,
        dcomplex*  restrict c, uint32_t rs_c, uint32_t cs_c,
-       auxinfo_t* restrict data
+       auxinfo_t*          data
      );
 
 void bli_zgemm_armv7a_asm_2x2
@@ -161,8 +161,8 @@ void bli_zgemm_armv7a_asm_2x2
        dcomplex*  restrict b,
        dcomplex*  restrict beta,
        dcomplex*  restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
diff --git a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
index 06f36a346..b1e9481a3 100644
--- a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
+++ b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
@@ -45,8 +45,8 @@ void bli_sgemm_armv7a_int_4x4
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
@@ -251,8 +251,8 @@ void bli_dgemm_armv7a_int_4x4
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c
index 301b8ad79..3eefd9ddc 100644
--- a/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c
+++ b/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c
@@ -57,7 +57,7 @@ void bli_dpackm_armv8a_int_6xk
        double*    restrict kappa,
        double*    restrict a, inc_t inca0, inc_t lda0,
        double*    restrict p,              inc_t ldp0,
-       cntx_t*    restrict cntx
+       cntx_t*             cntx
      )
 {
   // This is the panel dimension assumed by the packm kernel.
@@ -296,7 +296,7 @@ void bli_dpackm_armv8a_int_6xk
       (
         m_edge,
         n_edge,
-        p_edge, 1, ldp 
+        p_edge, 1, ldp
       );
     }
   }
@@ -316,7 +316,7 @@ void bli_dpackm_armv8a_int_6xk
     (
       m_edge,
       n_edge,
-      p_edge, 1, ldp 
+      p_edge, 1, ldp
     );
   }
 }
diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c
index 321fa5403..51b064a24 100644
--- a/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c
+++ b/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c
@@ -57,7 +57,7 @@ void bli_dpackm_armv8a_int_8xk
        double*    restrict kappa,
        double*    restrict a, inc_t inca0, inc_t lda0,
        double*    restrict p,              inc_t ldp0,
-       cntx_t*    restrict cntx
+       cntx_t*             cntx
      )
 {
   // This is the panel dimension assumed by the packm kernel.
@@ -326,7 +326,7 @@ void bli_dpackm_armv8a_int_8xk
       (
         m_edge,
         n_edge,
-        p_edge, 1, ldp 
+        p_edge, 1, ldp
       );
     }
   }
@@ -346,7 +346,7 @@ void bli_dpackm_armv8a_int_8xk
     (
       m_edge,
       n_edge,
-      p_edge, 1, ldp 
+      p_edge, 1, ldp
     );
   }
 }
diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c
index 371877247..f915215e1 100644
--- a/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c
+++ b/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c
@@ -57,7 +57,7 @@ void bli_spackm_armv8a_int_12xk
        float*     restrict kappa,
        float*     restrict a, inc_t inca0, inc_t lda0,
        float*     restrict p,              inc_t ldp0,
-       cntx_t*    restrict cntx
+       cntx_t*             cntx
      )
 {
   // This is the panel dimension assumed by the packm kernel.
@@ -410,7 +410,7 @@ void bli_spackm_armv8a_int_12xk
       (
         m_edge,
         n_edge,
-        p_edge, 1, ldp 
+        p_edge, 1, ldp
       );
     }
   }
@@ -428,7 +428,7 @@ void bli_spackm_armv8a_int_12xk
     (
       m_edge,
       n_edge,
-      p_edge, 1, ldp 
+      p_edge, 1, ldp
     );
   }
 }
diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c
index 3d363c2d8..b508b2a0e 100644
--- a/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c
+++ b/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c
@@ -57,7 +57,7 @@ void bli_spackm_armv8a_int_8xk
        float*     restrict kappa,
        float*     restrict a, inc_t inca0, inc_t lda0,
        float*     restrict p,              inc_t ldp0,
-       cntx_t*    restrict cntx
+       cntx_t*             cntx
      )
 {
   // This is the panel dimension assumed by the packm kernel.
@@ -348,7 +348,7 @@ void bli_spackm_armv8a_int_8xk
       (
         m_edge,
         n_edge,
-        p_edge, 1, ldp 
+        p_edge, 1, ldp
       );
     }
   }
@@ -366,7 +366,7 @@ void bli_spackm_armv8a_int_8xk
     (
       m_edge,
       n_edge,
-      p_edge, 1, ldp 
+      p_edge, 1, ldp
     );
   }
 }
diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
index 4d9a88817..94f0090bc 100644
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
@@ -61,8 +61,8 @@ void bli_sgemm_armv8a_asm_8x12
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	void* a_next = bli_auxinfo_next_a( data );
@@ -740,8 +740,8 @@ void bli_dgemm_armv8a_asm_6x8
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	void* a_next = bli_auxinfo_next_a( data );
@@ -1462,8 +1462,8 @@ void bli_cgemm_armv8a_opt_4x4
        scomplex*  restrict b,
        scomplex*  restrict beta,
        scomplex*  restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 }
@@ -1478,8 +1478,8 @@ void bli_zgemm_armv8a_opt_4x4
        dcomplex*  restrict b,
        dcomplex*  restrict beta,
        dcomplex*  restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 }
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c b/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c
index c87ff1feb..44e0ac419 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c
@@ -57,8 +57,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
 	/* NOTE: This microkernel can actually handle arbitrarily large
@@ -262,8 +262,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
 	/* NOTE: This microkernel can actually handle arbitrarily large
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c
index 630459db7..cade3ee05 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c
@@ -121,8 +121,8 @@ void bli_dgemmsup_rd_armv8a_asm_6x8m
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   if ( n0 != 8 )
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c
index e13dd668e..06c9ac32c 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c
@@ -114,8 +114,8 @@ void bli_dgemmsup_rd_armv8a_asm_6x8n
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   if ( m0 != 6 )
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c
index 16001a73c..312eb4454 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c
@@ -98,7 +98,7 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
 /*
  * 4x8 dgemmsup kernel with extending 1st dimension.
  *
- * Recommanded usage case: 
+ * Recommanded usage case:
  * o 16 < (L1 cache latency) * (Num. FPU) < 25.
  * o L1 cache has a bandwidth not too low (true in most cases).
  * o (FMLA latency) * (Num. FPU) < 32 cycles (true in almost all cases).
@@ -115,8 +115,8 @@ void bli_dgemmsup_rv_armv8a_asm_4x8m
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   // Fixme: This uker has no dispatching for unalighed sizes.
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c
index 43913cd38..bc7402a5f 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c
@@ -98,7 +98,7 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
 /*
  * 4x8 dgemmsup kernel with extending 2nd dimension.
  *
- * Recommanded usage case: 
+ * Recommanded usage case:
  * o 16 < (L1 cache latency) * (Num. FPU) < 25.
  * o L1 cache has a bandwidth not too low (true in most cases).
  * o (FMLA latency) * (Num. FPU) < 32 cycles (true in almost all cases).
@@ -115,8 +115,8 @@ void bli_dgemmsup_rv_armv8a_asm_4x8n
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   // Fixme: This uker has no dispatching for unalighed sizes.
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c
index 3100112d3..8ff5ec173 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c
@@ -140,8 +140,8 @@ void bli_dgemmsup_rv_armv8a_asm_6x8m
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   if ( n0 != 8 )
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c
index fb9357c11..9bdf4b3b8 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c
@@ -140,8 +140,8 @@ void bli_dgemmsup_rv_armv8a_asm_6x8n
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   if ( m0 != 6 )
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c
index 5b0e9b062..4d374df98 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c
@@ -111,8 +111,8 @@ void bli_dgemmsup_rv_armv8a_asm_8x4m
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   // Fixme: This uker has no dispatching for unalighed sizes.
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c
index 84c7c4a7d..aa53de55c 100644
--- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c
+++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c
@@ -94,8 +94,8 @@ void bli_dgemmsup_rd_armv8a_asm_3x4
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   assert( m0 == 3 );
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c
index abbb6fb4d..b10546764 100644
--- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c
+++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c
@@ -118,8 +118,8 @@ void bli_dgemmsup_rd_armv8a_asm_6x3
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   assert( m0 == 6 );
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c
index 43880063e..5438fdfc2 100644
--- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c
+++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c
@@ -69,8 +69,8 @@ void bli_dgemmsup_rd_armv8a_int_2x8
        double*    restrict b, inc_t rs_b, inc_t cs_b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   assert( m0 <= 2 );
@@ -114,10 +114,10 @@ void bli_dgemmsup_rd_armv8a_int_2x8
   PRAGMA_UNROLL
   for ( ; k_mker > 0; --k_mker )
   {
-    // if ( m0 > 0 ) 
+    // if ( m0 > 0 )
                   va_0 = vld1q_f64( a_loc + rs_a * 0 );
     if ( m0 > 1 ) va_1 = vld1q_f64( a_loc + rs_a * 1 );
-    // if ( n0 > 0 ) 
+    // if ( n0 > 0 )
                   vb_0 = vld1q_f64( b_loc + cs_b * 0 );
     if ( n0 > 1 ) vb_1 = vld1q_f64( b_loc + cs_b * 1 );
     if ( n0 > 2 ) vb_2 = vld1q_f64( b_loc + cs_b * 2 );
@@ -174,10 +174,10 @@ void bli_dgemmsup_rd_armv8a_int_2x8
   PRAGMA_NOUNROLL
   for ( ; k_left > 0; --k_left )
   {
-    // if ( m0 > 0 ) 
+    // if ( m0 > 0 )
                   va_0 = vld1q_lane_f64( a_loc + rs_a * 0, va_0, 0 );
     if ( m0 > 1 ) va_1 = vld1q_lane_f64( a_loc + rs_a * 1, va_1, 0 );
-    // if ( n0 > 0 ) 
+    // if ( n0 > 0 )
                   vb_0 = vld1q_lane_f64( b_loc + cs_b * 0, vb_0, 0 );
     if ( n0 > 1 ) vb_1 = vld1q_lane_f64( b_loc + cs_b * 1, vb_1, 0 );
     if ( n0 > 2 ) vb_2 = vld1q_lane_f64( b_loc + cs_b * 2, vb_2, 0 );
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c
index 73e5f20fb..89817d6d5 100644
--- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c
+++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c
@@ -69,8 +69,8 @@ void bli_dgemmsup_rd_armv8a_int_3x4
        double*    restrict b, inc_t rs_b, inc_t cs_b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   // if ( m0 == 3 && n0 == 4 )
diff --git a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c
index 16af42ade..931f3ed66 100644
--- a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c
+++ b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c
@@ -69,8 +69,8 @@ void bli_dgemmsup_rv_armv8a_int_3x8mn
        double*    restrict b0, inc_t rs_b, inc_t cs_b,
        double*    restrict beta,
        double*    restrict c0, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   // Unlike the rd case, this rv case does not impose restriction upon
diff --git a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c
index 8bbd87f1f..f850b0fa6 100644
--- a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c
+++ b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c
@@ -69,8 +69,8 @@ void bli_dgemmsup_rv_armv8a_int_6x4mn
        double*    restrict b0, inc_t rs_b, inc_t cs_b,
        double*    restrict beta,
        double*    restrict c0, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
   // Unlike the rd case, this rv case does not impose restriction upon
@@ -123,7 +123,7 @@ void bli_dgemmsup_rv_armv8a_int_6x4mn
       for ( ; k > 0; --k )
       {
         // A columns.
-        // if ( m0 > 0 ) 
+        // if ( m0 > 0 )
                       va_0 = vld1q_lane_f64( a_loc + rs_a * 0, va_0, 0 );
         if ( m0 > 1 ) va_0 = vld1q_lane_f64( a_loc + rs_a * 1, va_0, 1 );
         if ( m0 > 2 ) va_1 = vld1q_lane_f64( a_loc + rs_a * 2, va_1, 0 );
diff --git a/kernels/bgq/1/bli_axpyv_bgq_int.c b/kernels/bgq/1/bli_axpyv_bgq_int.c
index 0c4a8cbd3..1d233f5c1 100644
--- a/kernels/bgq/1/bli_axpyv_bgq_int.c
+++ b/kernels/bgq/1/bli_axpyv_bgq_int.c
@@ -34,14 +34,14 @@
 
 #include "blis.h"
 
-void bli_daxpyv_bgq_int 
-     ( 
+void bli_daxpyv_bgq_int
+     (
        conj_t           conjx,
        dim_t            n,
        double* restrict alpha,
        double* restrict x, inc_t incx,
        double* restrict y, inc_t incy,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	if ( bli_zero_dim1( n ) ) return;
@@ -70,7 +70,7 @@ void bli_daxpyv_bgq_int
         xv = vec_lda( 0 * sizeof(double), &x[i*4] );
         yv = vec_lda( 0 * sizeof(double), &y[i*4] );
         zv = vec_madd( alphav, xv, yv );
-        vec_sta( zv, 0 * sizeof(double), &y[i*4] );   
+        vec_sta( zv, 0 * sizeof(double), &y[i*4] );
 	}
     for ( dim_t i = 0; i < n_left; i++ )
     {
diff --git a/kernels/bgq/1/bli_dotv_bgq_int.c b/kernels/bgq/1/bli_dotv_bgq_int.c
index 73e53c23a..eb6805a4c 100644
--- a/kernels/bgq/1/bli_dotv_bgq_int.c
+++ b/kernels/bgq/1/bli_dotv_bgq_int.c
@@ -42,7 +42,7 @@ void bli_ddotv_bgq_int
        double* restrict x, inc_t incx,
        double* restrict y, inc_t incy,
        double* restrict rho,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	bool   use_ref = FALSE;
@@ -91,7 +91,7 @@ void bli_ddotv_bgq_int
     {
         rhos += x[4*n_run + i] * y[4*n_run + i];
     }
-	
+
     *rho = rhos;
 }
 
diff --git a/kernels/bgq/1f/bli_axpyf_bgq_int.c b/kernels/bgq/1f/bli_axpyf_bgq_int.c
index 4e296e0a2..cf0fe633c 100644
--- a/kernels/bgq/1f/bli_axpyf_bgq_int.c
+++ b/kernels/bgq/1f/bli_axpyf_bgq_int.c
@@ -45,7 +45,7 @@ void bli_daxpyf_bgq_int
        double* restrict a, inc_t inca, inc_t lda,
        double* restrict x, inc_t incx,
        double* restrict y, inc_t incy,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t fusefac = 8;
@@ -60,7 +60,7 @@ void bli_daxpyf_bgq_int
 		use_ref = TRUE;
 	// Call the reference implementation if needed.
 	if ( use_ref == TRUE )
-	{   
+	{
 //        printf("%d\t%d\t%d\t%d\t%d\t%d\n", fusefac, inca, incx, incy, bli_is_unaligned_to( ( siz_t )a, 32 ), bli_is_unaligned_to( ( siz_t )y, 32));
 //        printf("DEFAULTING TO REFERENCE IMPLEMENTATION\n");
 		BLIS_DAXPYF_KERNEL_REF( conja, conjx, m, b_n, alpha, a, inca, lda, x, incx, y, incy, cntx );
@@ -134,7 +134,7 @@ void bli_daxpyf_bgq_int
 
         vec_sta( yv, 0 * sizeof(double), &y0[i*4]);
 	}
-    
+
     for ( dim_t i = 0; i < m_left; ++i )
     {
         y0[4*m_run + i] += chi0 * a0[4*m_run + i]
diff --git a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c
index 15e3e072f..2adbc4c36 100644
--- a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c
+++ b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c
@@ -64,8 +64,8 @@ void bli_dgemm_bgq_int_8x8
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
     GEMM_UKR_SETUP_CT_ANY( d, 8, 8, false );
@@ -228,8 +228,8 @@ void bli_zgemm_bgq_int_4x4
        dcomplex*  restrict b,
        dcomplex*  restrict beta,
        dcomplex*  restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
     GEMM_UKR_SETUP_CT_ANY( z, 4, 4, false );
diff --git a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
index 3a75d61d7..bef7232dd 100644
--- a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
+++ b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
@@ -98,8 +98,8 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
@@ -590,8 +590,8 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
@@ -810,8 +810,8 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
        scomplex*  restrict b,
        scomplex*  restrict beta,
        scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -1334,8 +1334,8 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
        dcomplex*  restrict b,
        dcomplex*  restrict beta,
        dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c
index 843335ad5..e5d077409 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c
@@ -51,7 +51,7 @@ void bli_cpackm_haswell_asm_3xk
        scomplex*  restrict kappa,
        scomplex*  restrict a, inc_t inca0, inc_t lda0,
        scomplex*  restrict p,              inc_t ldp0,
-       cntx_t*    restrict cntx
+       cntx_t*             cntx
      )
 {
 #if 0
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c
index 862a33b86..fa8fabe9d 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c
@@ -51,7 +51,7 @@ void bli_cpackm_haswell_asm_8xk
        scomplex*  restrict kappa,
        scomplex*  restrict a, inc_t inca0, inc_t lda0,
        scomplex*  restrict p,              inc_t ldp0,
-       cntx_t*    restrict cntx
+       cntx_t*             cntx
      )
 {
 #if 0
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c
index b64f26591..47fc5b98d 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c
@@ -51,7 +51,7 @@ void bli_dpackm_haswell_asm_6xk
        double*    restrict kappa,
        double*    restrict a, inc_t inca0, inc_t lda0,
        double*    restrict p,              inc_t ldp0,
-       cntx_t*    restrict cntx
+       cntx_t*             cntx
      )
 {
 #if 0
@@ -107,7 +107,7 @@ void bli_dpackm_haswell_asm_6xk
 	if ( cdim0 == mnr && !gs && unitk )
 	{
 		begin_asm()
-		
+
 		mov(var(a), rax)                   // load address of a.
 
 		mov(var(inca), r8)                 // load inca
@@ -121,13 +121,13 @@ void bli_dpackm_haswell_asm_6xk
 
 		mov(var(one), rdx)                 // load address of 1.0 constant
 		vmovsd(mem(rdx), xmm1)             // load 1.0
-		
+
 		mov(var(kappa), rcx)               // load address of kappa
 		vmovsd(mem(rcx), xmm0)             // load kappa
-		
+
 
 										   // now branch on kappa == 1.0
-		
+
 		vucomisd(xmm0, xmm1)               // set ZF if kappa == 1.0
 		je(.DKAPPAUNIT)                    // if ZF = 1, jump to beta == 0 case
 
@@ -137,7 +137,7 @@ void bli_dpackm_haswell_asm_6xk
 
 		cmp(imm(8), r8)                    // set ZF if (8*inca) == 8.
 		jz(.DCOLNONU)                      // jump to column storage case
-		
+
 		// -- kappa non-unit, row storage on A -------------------------------------
 
 		label(.DROWNONU)
@@ -150,7 +150,7 @@ void bli_dpackm_haswell_asm_6xk
 		label(.DCOLNONU)
 
 		jmp(.DDONE)                        // jump to end.
-		
+
 
 
@@ -161,7 +161,7 @@ void bli_dpackm_haswell_asm_6xk
 
 
 		// -- kappa unit, row storage on A -----------------------------------------
-		
+
 		label(.DROWUNIT)
 
 		lea(mem(r8,  r8,  2), r12)         // r12 = 3*inca
@@ -255,7 +255,7 @@ void bli_dpackm_haswell_asm_6xk
 		// -- kappa unit, column storage on A --------------------------------------
 
 		label(.DCOLUNIT)
-		
+
 		lea(mem(r10, r10, 2), r13)         // r13 = 3*lda
 
 		mov(var(k_iter), rsi)              // i = k_iter;
@@ -319,8 +319,8 @@ void bli_dpackm_haswell_asm_6xk
 
 
 		label(.DDONE)
-		
-		
+
+
 
 		end_asm(
 		: // output operands (none)
@@ -374,7 +374,7 @@ void bli_dpackm_haswell_asm_6xk
 			(
 			  m_edge,
 			  n_edge,
-			  p_edge, 1, ldp 
+			  p_edge, 1, ldp
 			);
 		}
 	}
@@ -394,7 +394,7 @@ void bli_dpackm_haswell_asm_6xk
 		(
 		  m_edge,
 		  n_edge,
-		  p_edge, 1, ldp 
+		  p_edge, 1, ldp
 		);
 	}
 }
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c
index 9deb564ce..9f07e37a4 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c
@@ -51,7 +51,7 @@ void bli_dpackm_haswell_asm_8xk
        double*    restrict kappa,
        double*    restrict a, inc_t inca0, inc_t lda0,
        double*    restrict p,              inc_t ldp0,
-       cntx_t*    restrict cntx
+       cntx_t*             cntx
      )
 {
 #if 0
@@ -107,7 +107,7 @@ void bli_dpackm_haswell_asm_8xk
 	if ( cdim0 == mnr && !gs && unitk )
 	{
 		begin_asm()
-		
+
 		mov(var(a), rax)                   // load address of a.
 
 		mov(var(inca), r8)                 // load inca
@@ -121,13 +121,13 @@ void bli_dpackm_haswell_asm_8xk
 
 		mov(var(one), rdx)                 // load address of 1.0 constant
 		vmovsd(mem(rdx), xmm1)             // load 1.0
-		
+
 		mov(var(kappa), rcx)               // load address of kappa
 		vmovsd(mem(rcx), xmm0)             // load kappa
-		
+
 
 										   // now branch on kappa == 1.0
-		
+
 		vucomisd(xmm0, xmm1)               // set ZF if kappa == 1.0
 		je(.DKAPPAUNIT)                    // if ZF = 1, jump to beta == 0 case
 
@@ -137,7 +137,7 @@ void bli_dpackm_haswell_asm_8xk
 
 		cmp(imm(8), r8)                    // set ZF if (8*inca) == 8.
 		jz(.DCOLNONU)                      // jump to column storage case
-		
+
 		// -- kappa non-unit, row storage on A -------------------------------------
 
 		label(.DROWNONU)
@@ -150,7 +150,7 @@ void bli_dpackm_haswell_asm_8xk
 		label(.DCOLNONU)
 
 		jmp(.DDONE)                        // jump to end.
-		
+
 
 
@@ -161,7 +161,7 @@ void bli_dpackm_haswell_asm_8xk
 
 
 		// -- kappa unit, row storage on A -----------------------------------------
-		
+
 		label(.DROWUNIT)
 
 		lea(mem(r8,  r8,  2), r12)         // r12 = 3*inca
@@ -265,7 +265,7 @@ void bli_dpackm_haswell_asm_8xk
 		// -- kappa unit, column storage on A --------------------------------------
 
 		label(.DCOLUNIT)
-		
+
 		lea(mem(r10, r10, 2), r13)         // r13 = 3*lda
 
 		mov(var(k_iter), rsi)              // i = k_iter;
@@ -329,8 +329,8 @@ void bli_dpackm_haswell_asm_8xk
 
 
 		label(.DDONE)
-		
-		
+
+
 
 		end_asm(
 		: // output operands (none)
@@ -384,7 +384,7 @@ void bli_dpackm_haswell_asm_8xk
 			(
 			  m_edge,
 			  n_edge,
-			  p_edge, 1, ldp 
+			  p_edge, 1, ldp
 			);
 		}
 	}
@@ -402,7 +402,7 @@ void bli_dpackm_haswell_asm_8xk
 		(
 		  m_edge,
 		  n_edge,
-		  p_edge, 1, ldp 
+		  p_edge, 1, ldp
 		);
 	}
 }
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c
index 40ac22bc5..27b2c71ee 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c
@@ -51,7 +51,7 @@ void bli_spackm_haswell_asm_16xk
        float*     restrict kappa,
        float*     restrict a, inc_t inca0, inc_t lda0,
        float*     restrict p,              inc_t ldp0,
-       cntx_t*    restrict cntx
+       cntx_t*             cntx
      )
 {
 #if 0
@@ -100,14 +100,14 @@ void bli_spackm_haswell_asm_16xk
 	// NOTE: If/when this kernel ever supports scaling by kappa within the
 	// assembly region, this constraint should be lifted.
 	const bool     unitk  = bli_seq1( *kappa );
-	
+
 
 	// -------------------------------------------------------------------------
 
 	if ( cdim0 == mnr && !gs && unitk )
 	{
 		begin_asm()
-		
+
 		mov(var(a), rax)                   // load address of a.
 
 		mov(var(inca), r8)                 // load inca
@@ -121,13 +121,13 @@ void bli_spackm_haswell_asm_16xk
 
 		mov(var(one), rdx)                 // load address of 1.0 constant
 		vmovss(mem(rdx), xmm1)             // load 1.0
-		
+
 		mov(var(kappa), rcx)               // load address of kappa
 		vmovss(mem(rcx), xmm0)             // load kappa
-		
+
 
 										   // now branch on kappa == 1.0
-		
+
 		vucomiss(xmm0, xmm1)               // set ZF if kappa == 1.0
 		je(.SKAPPAUNIT)                    // if ZF = 1, jump to beta == 0 case
 
@@ -137,7 +137,7 @@ void bli_spackm_haswell_asm_16xk
 
 		cmp(imm(4), r8)                    // set ZF if (4*inca) == 4.
 		jz(.SCOLNONU)                      // jump to column storage case
-		
+
 		// -- kappa non-unit, row storage on A -------------------------------------
 
 		label(.SROWNONU)
@@ -150,7 +150,7 @@ void bli_spackm_haswell_asm_16xk
 		label(.SCOLNONU)
 
 		jmp(.SDONE)                        // jump to end.
-		
+
 
 
@@ -161,7 +161,7 @@ void bli_spackm_haswell_asm_16xk
 
 
 		// -- kappa unit, row storage on A -----------------------------------------
-		
+
 		label(.SROWUNIT)
 
 		lea(mem(r8,  r8,  2), r13)         // r13 = 3*inca
@@ -402,7 +402,7 @@ void bli_spackm_haswell_asm_16xk
 		// -- kappa unit, column storage on A --------------------------------------
 
 		label(.SCOLUNIT)
-		
+
 		lea(mem(r10, r10, 2), r13)         // r13 = 3*lda
 		lea(mem(r13, r10, 2), r15)         // r15 = 5*lda
 		lea(mem(r13, r10, 4), rdx)         // rdx = 7*lda
@@ -488,8 +488,8 @@ void bli_spackm_haswell_asm_16xk
 
 
 		label(.SDONE)
-		
-		
+
+
 
 		end_asm(
 		: // output operands (none)
@@ -543,7 +543,7 @@ void bli_spackm_haswell_asm_16xk
 			(
 			  m_edge,
 			  n_edge,
-			  p_edge, 1, ldp 
+			  p_edge, 1, ldp
 			);
 		}
 	}
@@ -561,7 +561,7 @@ void bli_spackm_haswell_asm_16xk
 		(
 		  m_edge,
 		  n_edge,
-		  p_edge, 1, ldp 
+		  p_edge, 1, ldp
 		);
 	}
 }
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c
index 3a134bed8..a073eca62 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c
@@ -51,7 +51,7 @@ void bli_spackm_haswell_asm_6xk
        float*     restrict kappa,
        float*     restrict a, inc_t inca0, inc_t lda0,
        float*     restrict p,              inc_t ldp0,
-       cntx_t*    restrict cntx
+       cntx_t*             cntx
      )
 {
 #if 0
@@ -100,14 +100,14 @@ void bli_spackm_haswell_asm_6xk
 	// NOTE: If/when this kernel ever supports scaling by kappa within the
 	// assembly region, this constraint should be lifted.
 	const bool     unitk  = bli_seq1( *kappa );
-	
+
 
 	// -------------------------------------------------------------------------
 
 	if ( cdim0 == mnr && !gs && unitk )
 	{
 		begin_asm()
-		
+
 		mov(var(a), rax)                   // load address of a.
 
 		mov(var(inca), r8)                 // load inca
@@ -121,13 +121,13 @@ void bli_spackm_haswell_asm_6xk
 
 		mov(var(one), rdx)                 // load address of 1.0 constant
 		vmovss(mem(rdx), xmm1)             // load 1.0
-		
+
 		mov(var(kappa), rcx)               // load address of kappa
 		vmovss(mem(rcx), xmm0)             // load kappa
-		
+
 
 										   // now branch on kappa == 1.0
-		
+
 		vucomiss(xmm0, xmm1)               // set ZF if kappa == 1.0
 		je(.SKAPPAUNIT)                    // if ZF = 1, jump to beta == 0 case
 
@@ -137,7 +137,7 @@ void bli_spackm_haswell_asm_6xk
 
 		cmp(imm(4), r8)                    // set ZF if (4*inca) == 4.
 		jz(.SCOLNONU)                      // jump to column storage case
-		
+
 		// -- kappa non-unit, row storage on A -------------------------------------
 
 		label(.SROWNONU)
@@ -150,7 +150,7 @@ void bli_spackm_haswell_asm_6xk
 		label(.SCOLNONU)
 
 		jmp(.SDONE)                        // jump to end.
-		
+
 
 
@@ -161,7 +161,7 @@ void bli_spackm_haswell_asm_6xk
 
 
 		// -- kappa unit, row storage on A -----------------------------------------
-		
+
 		label(.SROWUNIT)
 
 		lea(mem(r8,  r8,  2), r13)         // r13 = 3*inca
@@ -274,7 +274,7 @@ void bli_spackm_haswell_asm_6xk
 		// -- kappa unit, column storage on A --------------------------------------
 
 		label(.SCOLUNIT)
-		
+
 		lea(mem(r10, r10, 2), r13)         // r13 = 3*lda
 		lea(mem(r13, r10, 2), r15)         // r15 = 5*lda
 		lea(mem(r13, r10, 4), rdx)         // rdx = 7*lda
@@ -361,8 +361,8 @@ void bli_spackm_haswell_asm_6xk
 
 
 		label(.SDONE)
-		
-		
+
+
 
 		end_asm(
 		: // output operands (none)
@@ -416,7 +416,7 @@ void bli_spackm_haswell_asm_6xk
 			(
 			  m_edge,
 			  n_edge,
-			  p_edge, 1, ldp 
+			  p_edge, 1, ldp
 			);
 		}
 	}
@@ -434,7 +434,7 @@ void bli_spackm_haswell_asm_6xk
 		(
 		  m_edge,
 		  n_edge,
-		  p_edge, 1, ldp 
+		  p_edge, 1, ldp
 		);
 	}
 }
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c
index 1a714abe2..5e65565d5 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c
@@ -51,7 +51,7 @@ void bli_zpackm_haswell_asm_3xk
        dcomplex*  restrict kappa,
        dcomplex*  restrict a, inc_t inca0, inc_t lda0,
        dcomplex*  restrict p,              inc_t ldp0,
-       cntx_t*    restrict cntx
+       cntx_t*             cntx
      )
 {
 #if 0
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c
index 4e11872af..d118081cc 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c
@@ -51,7 +51,7 @@ void bli_zpackm_haswell_asm_4xk
        dcomplex*  restrict kappa,
        dcomplex*  restrict a, inc_t inca0, inc_t lda0,
        dcomplex*  restrict p,              inc_t ldp0,
-       cntx_t*    restrict cntx
+       cntx_t*             cntx
      )
 {
 #if 0
diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
index 70ea4ccd7..b7be1c674 100644
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
@@ -87,8 +87,8 @@ void bli_sgemm_haswell_asm_6x16
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -767,8 +767,8 @@ void bli_dgemm_haswell_asm_6x8
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -1326,8 +1326,8 @@ void bli_cgemm_haswell_asm_3x8
        scomplex*  restrict b,
        scomplex*  restrict beta,
        scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -1719,8 +1719,8 @@ void bli_zgemm_haswell_asm_3x4
        dcomplex*  restrict b,
        dcomplex*  restrict beta,
        dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
index dd9526d56..261054499 100644
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
@@ -86,8 +86,8 @@ void bli_sgemm_haswell_asm_16x6
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -470,8 +470,8 @@ void bli_dgemm_haswell_asm_8x6
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -840,8 +840,8 @@ void bli_cgemm_haswell_asm_8x3
        scomplex*  restrict b,
        scomplex*  restrict beta,
        scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -1231,8 +1231,8 @@ void bli_zgemm_haswell_asm_4x3
        dcomplex*  restrict b,
        dcomplex*  restrict beta,
        dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
index d0d0ff211..915fbf08f 100644
--- a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
@@ -67,8 +67,8 @@ void bli_sgemmtrsm_l_haswell_asm_6x16
        float*     restrict b01,
        float*     restrict b11,
        float*     restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -858,8 +858,8 @@ void bli_dgemmtrsm_l_haswell_asm_6x8
        double*    restrict b01,
        double*    restrict b11,
        double*    restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
index 68a8c069b..63c42785c 100644
--- a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
@@ -67,8 +67,8 @@ void bli_sgemmtrsm_u_haswell_asm_6x16
        float*     restrict b01,
        float*     restrict b11,
        float*     restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -863,8 +863,8 @@ void bli_dgemmtrsm_u_haswell_asm_6x8
        double*    restrict b01,
        double*    restrict b11,
        double*    restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
index 1820277d5..637e5917b 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
@@ -78,8 +78,8 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	uint64_t n_left = n0 % 8;
@@ -166,7 +166,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -184,7 +184,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	//mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -259,18 +259,18 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 #endif
 	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 
-	
 
-	
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -305,7 +305,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovupd(mem(rax       ), ymm0)
@@ -336,7 +336,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
@@ -398,27 +398,27 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
+
 #if 1
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
@@ -429,7 +429,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 	vmovupd(mem(rax, r8, 1), ymm1)
 	vmovupd(mem(rax, r8, 2), ymm2)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
+
 	vmovupd(mem(rbx        ), ymm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -451,21 +451,21 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -473,12 +473,12 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rax       ), xmm0)
 	vmovsd(mem(rax, r8, 1), xmm1)
 	vmovsd(mem(rax, r8, 2), xmm2)
 	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
+
 	vmovsd(mem(rbx        ), xmm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -500,22 +500,22 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.DPOSTACCUM)
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
+
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddpd( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm0 )
@@ -555,7 +555,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
 
@@ -563,73 +563,73 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
 
 	lea(mem(r12, rdi, 2), r12)         //
@@ -653,7 +653,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -735,8 +735,8 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -766,7 +766,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -784,7 +784,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -838,19 +838,19 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
 #endif
 	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
-	
 
-	
-	
+
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -885,7 +885,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovupd(mem(rax       ), ymm0)
@@ -916,7 +916,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
@@ -978,25 +978,25 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
 
 #if 1
@@ -1004,12 +1004,12 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
 	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
 #endif
-	
+
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
 	vmovupd(mem(rax, r8, 2), ymm2)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
+
 	vmovupd(mem(rbx        ), ymm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -1031,21 +1031,21 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1053,12 +1053,12 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rax       ), xmm0)
 	vmovsd(mem(rax, r8, 1), xmm1)
 	vmovsd(mem(rax, r8, 2), xmm2)
 	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
+
 	vmovsd(mem(rbx        ), xmm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -1080,22 +1080,22 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.DPOSTACCUM)
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
+
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddpd( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm0 )
@@ -1134,7 +1134,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
 
@@ -1142,73 +1142,73 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
 
 	lea(mem(r12, rdi, 2), r12)         //
@@ -1225,7 +1225,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1307,8 +1307,8 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1336,9 +1336,9 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1356,7 +1356,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
 
 	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -1412,19 +1412,19 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
 	prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c
 	prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c
 #endif
-	
 
-	
-	
+
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1462,7 +1462,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovupd(mem(rbx        ), ymm0)
@@ -1496,7 +1496,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
@@ -1564,25 +1564,25 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
 
 #if 0
@@ -1590,7 +1590,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
 	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
 #endif
-	
+
 	vmovupd(mem(rbx        ), ymm0)
 	vmovupd(mem(rbx, r11, 1), ymm1)
 	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
@@ -1620,21 +1620,21 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1642,7 +1642,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rbx        ), xmm0)
 	vmovsd(mem(rbx, r11, 1), xmm1)
 	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
@@ -1672,12 +1672,12 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
@@ -1690,7 +1690,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
 	                                   // ymm10 ymm11
 	                                   // ymm12 ymm13
 	                                   // ymm14 ymm15
-	
+
 	vhaddpd( ymm5, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
@@ -1723,7 +1723,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
 	                                   // xmm14[0:1] = sum(ymm14) sum(ymm15)
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
 
@@ -1731,96 +1731,96 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
 	vmulpd(xmm0, xmm6,  xmm6)
 	vmulpd(xmm0, xmm8,  xmm8)
 	vmulpd(xmm0, xmm10, xmm10)
 	vmulpd(xmm0, xmm12, xmm12)
 	vmulpd(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), xmm3, xmm4)
 	vmovupd(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), xmm3, xmm6)
 	vmovupd(xmm6, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), xmm3, xmm8)
 	vmovupd(xmm8, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), xmm3, xmm10)
 	vmovupd(xmm10, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), xmm3, xmm12)
 	vmovupd(xmm12, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), xmm3, xmm14)
 	vmovupd(xmm14, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(xmm6, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(xmm8, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(xmm10, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(xmm12, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(xmm14, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
+
 
 
@@ -1838,7 +1838,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
index e720e7da1..d9dad5fea 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
@@ -78,8 +78,8 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	uint64_t m_left = m0 % 6;
@@ -223,7 +223,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), rdx)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -241,7 +241,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	//mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -317,18 +317,18 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	//lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
 
-	
 
-	
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -362,7 +362,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 1
@@ -398,7 +398,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
 	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
@@ -465,32 +465,32 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
+
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
 	vmovupd(mem(rax, r8, 2), ymm2)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
+
 	vmovupd(mem(rbx        ), ymm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -512,21 +512,21 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -534,12 +534,12 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rax       ), xmm0)
 	vmovsd(mem(rax, r8, 1), xmm1)
 	vmovsd(mem(rax, r8, 2), xmm2)
 	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
+
 	vmovsd(mem(rbx        ), xmm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -561,22 +561,22 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.DPOSTACCUM)
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
+
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddpd( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm0 )
@@ -615,7 +615,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
 
@@ -623,73 +623,73 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
 
 	add(imm(4*8), r12)                 // c_jj = r12 += 4*cs_c
@@ -711,7 +711,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -804,8 +804,8 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -835,7 +835,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rdx)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -853,7 +853,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -909,18 +909,18 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	//lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
 
-	
 
-	
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -954,7 +954,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 1
@@ -990,7 +990,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
 	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
@@ -1057,32 +1057,32 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
+
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
 	vmovupd(mem(rax, r8, 2), ymm2)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
+
 	vmovupd(mem(rbx        ), ymm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -1104,21 +1104,21 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1126,12 +1126,12 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rax       ), xmm0)
 	vmovsd(mem(rax, r8, 1), xmm1)
 	vmovsd(mem(rax, r8, 2), xmm2)
 	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
+
 	vmovsd(mem(rbx        ), xmm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -1153,22 +1153,22 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.DPOSTACCUM)
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
+
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddpd( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm0 )
@@ -1206,7 +1206,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	                                   // ymm6[2] = sum(ymm12); ymm6[3] = sum(ymm15)
 
 
-	
+
 
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
@@ -1215,73 +1215,73 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
 
 	add(imm(4*8), r12)                 // c_jj = r12 += 4*cs_c
@@ -1297,7 +1297,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1391,8 +1391,8 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1422,7 +1422,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rdx)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1440,7 +1440,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -1491,18 +1491,18 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 	//lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
 
-	
 
-	
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -1531,7 +1531,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 	vfmadd231pd(ymm0, ymm3, ymm13)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 1
@@ -1562,7 +1562,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
 	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
@@ -1619,31 +1619,31 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 	vfmadd231pd(ymm0, ymm3, ymm13)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
+
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
+
 	vmovupd(mem(rbx        ), ymm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -1661,21 +1661,21 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 	vfmadd231pd(ymm0, ymm3, ymm13)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1683,11 +1683,11 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rax       ), xmm0)
 	vmovsd(mem(rax, r8, 1), xmm1)
 	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
+
 	vmovsd(mem(rbx        ), xmm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -1705,21 +1705,21 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 	vfmadd231pd(ymm0, ymm3, ymm13)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.DPOSTACCUM)
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
+
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
-	
+
 	vhaddpd( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm0 )
@@ -1746,7 +1746,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
 
@@ -1754,65 +1754,65 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm5, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
 
 	add(imm(4*8), r12)                 // c_jj = r12 += 4*cs_c
@@ -1828,7 +1828,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1921,8 +1921,8 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1952,7 +1952,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rdx)                   // load address of a.
 	//mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1970,7 +1970,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -2016,18 +2016,18 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 	//lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
 
-	
 
-	
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -2051,7 +2051,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
 	vfmadd231pd(ymm0, ymm3, ymm13)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 1
@@ -2077,7 +2077,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
 	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
@@ -2124,30 +2124,30 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
 	vfmadd231pd(ymm0, ymm3, ymm13)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
+
 	vmovupd(mem(rax       ), ymm0)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
+
 	vmovupd(mem(rbx        ), ymm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 
@@ -2161,21 +2161,21 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
 	vfmadd231pd(ymm0, ymm3, ymm13)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -2183,10 +2183,10 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rax       ), xmm0)
 	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
+
 	vmovsd(mem(rbx        ), xmm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 
@@ -2200,20 +2200,20 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
 	vfmadd231pd(ymm0, ymm3, ymm13)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.DPOSTACCUM)
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	
+
+	                                   // ymm4  ymm7  ymm10 ymm13
+
 	vhaddpd( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm0 )
@@ -2228,7 +2228,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
 
@@ -2236,57 +2236,57 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
 
 	add(imm(4*8), r12)                 // c_jj = r12 += 4*cs_c
@@ -2302,7 +2302,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c
index f764bc613..fcf448423 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c
@@ -78,8 +78,8 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	uint64_t n_left = n0 % 16;
@@ -190,7 +190,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -208,7 +208,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	//mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -283,18 +283,18 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 #endif
 	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -329,7 +329,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rax       ), ymm0)
@@ -360,7 +360,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
@@ -422,27 +422,27 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 1
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
@@ -453,7 +453,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 	vmovups(mem(rax, r8, 1), ymm1)
 	vmovups(mem(rax, r8, 2), ymm2)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -475,21 +475,21 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -497,12 +497,12 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	vmovss(mem(rax, r8, 1), xmm1)
 	vmovss(mem(rax, r8, 2), xmm2)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -524,22 +524,22 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -555,7 +555,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 	vshufps(imm(0x44), xmm2, xmm0, xmm4)
 	                                   // xmm4[0] = sum(ymm4);  xmm4[1] = sum(ymm7)
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
-	
+
 	vhaddps( ymm8, ymm5, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -571,7 +571,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 	vshufps(imm(0x44), xmm2, xmm0, xmm5)
 	                                   // xmm5[0] = sum(ymm5);  xmm5[1] = sum(ymm8)
 	                                   // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14)
-	
+
 	vhaddps( ymm9, ymm6, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -590,7 +590,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -598,73 +598,73 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(xmm0, xmm6, xmm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	lea(mem(r12, rdi, 2), r12)         //
@@ -688,7 +688,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -770,8 +770,8 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -801,7 +801,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -819,7 +819,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	//mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -894,18 +894,18 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 #endif
 	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -940,7 +940,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rax       ), ymm0)
@@ -971,7 +971,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
@@ -1033,27 +1033,27 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 1
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
@@ -1064,7 +1064,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 	vmovups(mem(rax, r8, 1), ymm1)
 	vmovups(mem(rax, r8, 2), ymm2)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -1086,21 +1086,21 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1108,12 +1108,12 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	vmovss(mem(rax, r8, 1), xmm1)
 	vmovss(mem(rax, r8, 2), xmm2)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -1135,22 +1135,22 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1166,7 +1166,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 	vshufps(imm(0x44), xmm2, xmm0, xmm4)
 	                                   // xmm4[0] = sum(ymm4);  xmm4[1] = sum(ymm7)
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
-	
+
 	vhaddps( ymm8, ymm5, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1182,7 +1182,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 	vshufps(imm(0x44), xmm2, xmm0, xmm5)
 	                                   // xmm5[0] = sum(ymm5);  xmm5[1] = sum(ymm8)
 	                                   // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14)
-	
+
 	vhaddps( ymm9, ymm6, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1201,7 +1201,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -1209,73 +1209,73 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(xmm0, xmm6, xmm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	lea(mem(r12, rdi, 2), r12)         //
@@ -1299,7 +1299,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1383,8 +1383,8 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1414,7 +1414,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1432,7 +1432,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	//mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -1507,18 +1507,18 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 #endif
 	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -1553,7 +1553,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rax       ), ymm0)
@@ -1584,7 +1584,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
@@ -1646,27 +1646,27 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 1
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
@@ -1677,7 +1677,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 	vmovups(mem(rax, r8, 1), ymm1)
 	vmovups(mem(rax, r8, 2), ymm2)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -1699,21 +1699,21 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1721,12 +1721,12 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	vmovss(mem(rax, r8, 1), xmm1)
 	vmovss(mem(rax, r8, 2), xmm2)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -1748,22 +1748,22 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1779,7 +1779,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 	vshufps(imm(0x44), xmm2, xmm0, xmm4)
 	                                   // xmm4[0] = sum(ymm4);  xmm4[1] = sum(ymm7)
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
-	
+
 	vhaddps( ymm8, ymm5, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1795,7 +1795,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 	vshufps(imm(0x44), xmm2, xmm0, xmm5)
 	                                   // xmm5[0] = sum(ymm5);  xmm5[1] = sum(ymm8)
 	                                   // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14)
-	
+
 	vhaddps( ymm9, ymm6, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1814,7 +1814,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -1822,73 +1822,73 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(xmm0, xmm6, xmm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	lea(mem(r12, rdi, 2), r12)         //
@@ -1912,7 +1912,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1994,8 +1994,8 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2025,7 +2025,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -2043,7 +2043,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -2098,18 +2098,18 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 #endif
 	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -2144,7 +2144,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rax       ), ymm0)
@@ -2175,7 +2175,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
@@ -2237,27 +2237,27 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 1
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
@@ -2268,7 +2268,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 	vmovups(mem(rax, r8, 1), ymm1)
 	vmovups(mem(rax, r8, 2), ymm2)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -2290,21 +2290,21 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -2312,12 +2312,12 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	vmovss(mem(rax, r8, 1), xmm1)
 	vmovss(mem(rax, r8, 2), xmm2)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -2339,22 +2339,22 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -2370,7 +2370,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 	vshufps(imm(0x44), xmm2, xmm0, xmm4)
 	                                   // xmm4[0] = sum(ymm4);  xmm4[1] = sum(ymm7)
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
-	
+
 	vhaddps( ymm8, ymm5, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -2386,7 +2386,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 	vshufps(imm(0x44), xmm2, xmm0, xmm5)
 	                                   // xmm5[0] = sum(ymm5);  xmm5[1] = sum(ymm8)
 	                                   // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14)
-	
+
 	vhaddps( ymm9, ymm6, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -2405,7 +2405,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -2413,73 +2413,73 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(xmm0, xmm6, xmm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	lea(mem(r12, rdi, 2), r12)         //
@@ -2496,7 +2496,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -2579,8 +2579,8 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2610,7 +2610,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -2628,7 +2628,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
 
 	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -2685,18 +2685,18 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
 	prefetch(0, mem(r10, rdi, 2, 1*4)) // prefetch c + 5*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -2734,7 +2734,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rbx        ), ymm0)
@@ -2768,7 +2768,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
@@ -2836,27 +2836,27 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
@@ -2892,21 +2892,21 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -2944,12 +2944,12 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
@@ -2962,7 +2962,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
 	                                   // ymm10 ymm11
 	                                   // ymm12 ymm13
 	                                   // ymm14 ymm15
-	
+
 	vhaddps( ymm5, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -3007,7 +3007,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
                                        // xmm14[0:1] = sum(ymm14) sum(ymm15)
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -3015,103 +3015,103 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4,  xmm4)          // scale by alpha
 	vmulps(xmm0, xmm6,  xmm6)
 	vmulps(xmm0, xmm8,  xmm8)
 	vmulps(xmm0, xmm10, xmm10)
 	vmulps(xmm0, xmm12, xmm12)
 	vmulps(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm6)
 	vmovsd(xmm6, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm8)
 	vmovsd(xmm8, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm10)
 	vmovsd(xmm10, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm12)
 	vmovsd(xmm12, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm14)
 	vmovsd(xmm14, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovsd(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(xmm6, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(xmm8, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(xmm10, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(xmm12, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(xmm14, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	lea(mem(r12, rdi, 4), r12)         //
@@ -3128,7 +3128,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
index 1fe862a8d..33b2df4b4 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
@@ -78,8 +78,8 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	uint64_t m_left = m0 % 6;
@@ -223,7 +223,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rdx)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -240,7 +240,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 	lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
+
 
 	//mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -317,18 +317,18 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 	//lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -362,7 +362,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 1
@@ -398,7 +398,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
 	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
@@ -465,32 +465,32 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 	vmovups(mem(rax       ), ymm0)
 	vmovups(mem(rax, r8, 1), ymm1)
 	vmovups(mem(rax, r8, 2), ymm2)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -512,21 +512,21 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -534,12 +534,12 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	vmovss(mem(rax, r8, 1), xmm1)
 	vmovss(mem(rax, r8, 2), xmm2)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -561,22 +561,22 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -592,7 +592,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 	vshufps(imm(0x44), xmm2, xmm0, xmm4)
 	                                   // xmm4[0] = sum(ymm4);  xmm4[1] = sum(ymm7)
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
-	
+
 	vhaddps( ymm8, ymm5, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -608,7 +608,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 	vshufps(imm(0x44), xmm2, xmm0, xmm5)
 	                                   // xmm5[0] = sum(ymm5);  xmm5[1] = sum(ymm8)
 	                                   // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14)
-	
+
 	vhaddps( ymm9, ymm6, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -627,7 +627,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -635,73 +635,73 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(xmm0, xmm6, xmm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
@@ -723,7 +723,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -816,8 +816,8 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -847,7 +847,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rdx)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -864,7 +864,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 	lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -919,18 +919,18 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 	//lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -964,7 +964,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 1
@@ -1000,7 +1000,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
 	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
@@ -1067,32 +1067,32 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 	vmovups(mem(rax       ), ymm0)
 	vmovups(mem(rax, r8, 1), ymm1)
 	vmovups(mem(rax, r8, 2), ymm2)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -1114,21 +1114,21 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1136,12 +1136,12 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	vmovss(mem(rax, r8, 1), xmm1)
 	vmovss(mem(rax, r8, 2), xmm2)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -1163,22 +1163,22 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1194,7 +1194,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 	vshufps(imm(0x44), xmm2, xmm0, xmm4)
 	                                   // xmm4[0] = sum(ymm4);  xmm4[1] = sum(ymm7)
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
-	
+
 	vhaddps( ymm8, ymm5, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1210,7 +1210,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 	vshufps(imm(0x44), xmm2, xmm0, xmm5)
 	                                   // xmm5[0] = sum(ymm5);  xmm5[1] = sum(ymm8)
 	                                   // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14)
-	
+
 	vhaddps( ymm9, ymm6, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1229,7 +1229,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -1237,73 +1237,73 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(xmm0, xmm6, xmm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
@@ -1319,7 +1319,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1413,8 +1413,8 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1444,7 +1444,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rdx)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1461,7 +1461,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 	lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -1511,18 +1511,18 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 	//lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -1551,7 +1551,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 1
@@ -1582,7 +1582,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
 	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
@@ -1639,31 +1639,31 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 	vmovups(mem(rax       ), ymm0)
 	vmovups(mem(rax, r8, 1), ymm1)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -1681,21 +1681,21 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1703,11 +1703,11 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	vmovss(mem(rax, r8, 1), xmm1)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -1725,21 +1725,21 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1755,7 +1755,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 	vshufps(imm(0x44), xmm2, xmm0, xmm4)
 	                                   // xmm4[0] = sum(ymm4);  xmm4[1] = sum(ymm7)
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
-	
+
 	vhaddps( ymm8, ymm5, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1774,7 +1774,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -1782,65 +1782,65 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
@@ -1856,7 +1856,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1949,8 +1949,8 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1980,7 +1980,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rdx)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1997,7 +1997,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 	lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -2042,18 +2042,18 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 	//lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -2077,7 +2077,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 1
@@ -2103,7 +2103,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
 	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
@@ -2150,30 +2150,30 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 	vmovups(mem(rax       ), ymm0)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 
@@ -2187,21 +2187,21 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -2209,11 +2209,11 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	vmovss(mem(rax, r8, 1), xmm1)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 
@@ -2227,20 +2227,20 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 	add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	
+	                                   // ymm4  ymm7  ymm10 ymm13
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -2259,7 +2259,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -2267,57 +2267,57 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
@@ -2333,7 +2333,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
index 1637e9766..4e6b75572 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
@@ -40,20 +40,20 @@
 
 /*
    rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
+	 --------        ------        --------
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+	 --------        ------            :
 
    rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
+	 --------        | | | |       --------
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+	 --------        | | | |           :
 
    Assumptions:
    - B is row-stored;
@@ -69,12 +69,12 @@
    cost of the in-register transpose).
 
    crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+	 | | | | | | | |       ------            :
 */
 
 // Prototype reference microkernels.
@@ -93,8 +93,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	uint64_t n_left = n0 % 8;
@@ -225,15 +225,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
 	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -336,19 +336,19 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	lea(mem(rdx, r8,  2), rdx)         // from next upanel of a.
 	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -356,7 +356,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 #else
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -367,14 +367,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -383,7 +383,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -402,14 +402,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -417,8 +417,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -426,7 +426,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 #else
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -437,14 +437,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -452,7 +452,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -473,14 +473,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -488,50 +488,50 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -539,23 +539,23 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
@@ -568,24 +568,24 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	vmulpd(ymm0, ymm13, ymm13)
 	vmulpd(ymm0, ymm14, ymm14)
 	vmulpd(ymm0, ymm15, ymm15)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -594,60 +594,60 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7)
 	vmovupd(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9)
 	vmovupd(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
 	vmovupd(ymm10, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11)
 	vmovupd(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
 	vmovupd(ymm12, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13)
 	vmovupd(ymm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14)
 	vmovupd(ymm14, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm15)
 	vmovupd(ymm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -734,51 +734,51 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm6, mem(rcx, 0*32))
 	vmovupd(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm8, mem(rcx, 0*32))
 	vmovupd(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm10, mem(rcx, 0*32))
 	vmovupd(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm12, mem(rcx, 0*32))
 	vmovupd(ymm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm14, mem(rcx, 0*32))
 	vmovupd(ymm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -843,9 +843,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 
 	//lea(mem(rdx, rsi, 4), rdx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
 
 
@@ -866,8 +866,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 
 
 	label(.DRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -995,8 +995,8 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1026,15 +1026,15 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
 	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -1135,19 +1135,19 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 	lea(mem(rdx, r8,  2), rdx)         // from next upanel of a.
 	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1155,7 +1155,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 #else
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1166,14 +1166,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1182,7 +1182,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1201,14 +1201,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1216,8 +1216,8 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -1225,7 +1225,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 #else
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1236,14 +1236,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1251,7 +1251,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1272,14 +1272,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1287,27 +1287,27 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
+
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
@@ -1316,21 +1316,21 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1338,23 +1338,23 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(xmm0, xmm5, xmm5)
 	vmulpd(ymm0, ymm6, ymm6)
@@ -1367,24 +1367,24 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 	vmulpd(xmm0, xmm13, xmm13)
 	vmulpd(ymm0, ymm14, ymm14)
 	vmulpd(xmm0, xmm15, xmm15)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -1393,60 +1393,60 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5)
 	vmovupd(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7)
 	vmovupd(xmm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9)
 	vmovupd(xmm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
 	vmovupd(ymm10, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11)
 	vmovupd(xmm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
 	vmovupd(ymm12, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm13)
 	vmovupd(xmm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14)
 	vmovupd(ymm14, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm15)
 	vmovupd(xmm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -1521,51 +1521,51 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm6, mem(rcx, 0*32))
 	vmovupd(xmm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm8, mem(rcx, 0*32))
 	vmovupd(xmm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm10, mem(rcx, 0*32))
 	vmovupd(xmm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm12, mem(rcx, 0*32))
 	vmovupd(xmm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm14, mem(rcx, 0*32))
 	vmovupd(xmm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -1622,9 +1622,9 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 
 	//lea(mem(rdx, rsi, 4), rdx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
 
 
@@ -1645,8 +1645,8 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 
 
 	label(.DRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1774,8 +1774,8 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1805,9 +1805,9 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1822,7 +1822,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1907,17 +1907,17 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 #endif
 
 
-	
-	
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1925,7 +1925,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 #else
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1933,19 +1933,19 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1961,18 +1961,18 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm0, ymm3, ymm14)
-	
+
 
 	// ---------------------------------- iteration 2
 
@@ -1981,7 +1981,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 #else
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1989,18 +1989,18 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm0, ymm3, ymm14)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2018,38 +2018,38 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm0, ymm3, ymm14)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
+
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
@@ -2062,58 +2062,58 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm0, ymm3, ymm14)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm6, ymm6)
 	vmulpd(ymm0, ymm8, ymm8)
 	vmulpd(ymm0, ymm10, ymm10)
 	vmulpd(ymm0, ymm12, ymm12)
 	vmulpd(ymm0, ymm14, ymm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2122,42 +2122,42 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
 	vmovupd(ymm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
 	vmovupd(ymm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14)
 	vmovupd(ymm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2205,45 +2205,45 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm8, mem(rcx, 0*32))
 	add(rdi, rcx)
 
-	
+
 	vmovupd(ymm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm12, mem(rcx, 0*32))
 	add(rdi, rcx)
 
-	
+
 	vmovupd(ymm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
 
-	
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2278,15 +2278,15 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 	vmovupd(xmm4, mem(rdx, rax, 1))
 
 	//lea(mem(rdx, rsi, 4), rdx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
 
 
-	
+
 	lea(mem(r12, rdi, 4), r12)         //
 	lea(mem(r12, rdi, 2), r12)         // c_ii = r12 += 6*rs_c
 
@@ -2302,8 +2302,8 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 
 
 	label(.DRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2431,8 +2431,8 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2462,9 +2462,9 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -2479,7 +2479,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -2558,19 +2558,19 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 	lea(mem(rdx, r8,  2), rdx)         // from next upanel of a.
 	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -2578,7 +2578,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 #else
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -2586,19 +2586,19 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm12)
 	vfmadd231pd(xmm0, xmm3, xmm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2614,18 +2614,18 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm12)
 	vfmadd231pd(xmm0, xmm3, xmm14)
-	
+
 
 	// ---------------------------------- iteration 2
 
@@ -2634,7 +2634,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 #else
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -2642,18 +2642,18 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm12)
 	vfmadd231pd(xmm0, xmm3, xmm14)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2671,43 +2671,43 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm12)
 	vfmadd231pd(xmm0, xmm3, xmm14)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -2715,58 +2715,58 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm12)
 	vfmadd231pd(xmm0, xmm3, xmm14)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulpd(xmm0, xmm6, xmm6)
 	vmulpd(xmm0, xmm8, xmm8)
 	vmulpd(xmm0, xmm10, xmm10)
 	vmulpd(xmm0, xmm12, xmm12)
 	vmulpd(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2775,42 +2775,42 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4)
 	vmovupd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6)
 	vmovupd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8)
 	vmovupd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10)
 	vmovupd(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm12)
 	vmovupd(xmm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm14)
 	vmovupd(xmm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2846,40 +2846,40 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(xmm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(xmm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
@@ -2890,7 +2890,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 
 
 	label(.DCOLSTORBZ)
-	
+
 	                                   // begin I/O on columns 0-3
 	vunpcklpd(xmm6, xmm4, xmm0)
 	vunpckhpd(xmm6, xmm4, xmm1)
@@ -2911,10 +2911,10 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 	vmovupd(xmm1, mem(rdx, rsi, 1))
 
 	//lea(mem(rdx, rsi, 4), rdx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
 
 
@@ -2936,7 +2936,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
index 5ecef06e8..2533a7825 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
@@ -40,20 +40,20 @@
 
 /*
    rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
+	 --------        ------        --------
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+	 --------        ------            :
 
    rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
+	 --------        | | | |       --------
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+	 --------        | | | |           :
 
    Assumptions:
    - B is row-stored;
@@ -69,12 +69,12 @@
    cost of the in-register transpose).
 
    crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+	 | | | | | | | |       ------            :
 */
 
 // Prototype reference microkernels.
@@ -93,8 +93,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	uint64_t m_left = m0 % 6;
@@ -154,14 +154,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 		}
 #endif
 
-		dgemmsup_ker_ft ker_fps[6] = 
+		dgemmsup_ker_ft ker_fps[6] =
 		{
 		  NULL,
 		  bli_dgemmsup_rv_haswell_asm_1x8n,
 		  bli_dgemmsup_rv_haswell_asm_2x8n,
 		  bli_dgemmsup_rv_haswell_asm_3x8n,
 		  bli_dgemmsup_rv_haswell_asm_4x8n,
-		  bli_dgemmsup_rv_haswell_asm_5x8n 
+		  bli_dgemmsup_rv_haswell_asm_5x8n
 		};
 
 		dgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
@@ -203,15 +203,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
 	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -313,19 +313,19 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 	lea(mem(rdx, r8,  8), rdx)         // from next upanel of b.
 	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -344,14 +344,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -360,7 +360,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -379,14 +379,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -394,8 +394,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -414,14 +414,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -429,7 +429,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -450,14 +450,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -465,25 +465,25 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
@@ -494,21 +494,21 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -516,23 +516,23 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
@@ -545,24 +545,24 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 	vmulpd(ymm0, ymm13, ymm13)
 	vmulpd(ymm0, ymm14, ymm14)
 	vmulpd(ymm0, ymm15, ymm15)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -571,60 +571,60 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7)
 	vmovupd(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9)
 	vmovupd(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
 	vmovupd(ymm10, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11)
 	vmovupd(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
 	vmovupd(ymm12, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13)
 	vmovupd(ymm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14)
 	vmovupd(ymm14, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm15)
 	vmovupd(ymm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -711,51 +711,51 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm6, mem(rcx, 0*32))
 	vmovupd(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm8, mem(rcx, 0*32))
 	vmovupd(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm10, mem(rcx, 0*32))
 	vmovupd(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm12, mem(rcx, 0*32))
 	vmovupd(ymm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm14, mem(rcx, 0*32))
 	vmovupd(ymm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -820,9 +820,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 
 	//lea(mem(rdx, rsi, 4), rdx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
 
 
@@ -841,8 +841,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 
 
 	label(.DRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -959,8 +959,8 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -990,15 +990,15 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
 	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -1097,19 +1097,19 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 	lea(mem(rdx, r8,  8), rdx)         // from next upanel of b.
 	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1117,7 +1117,7 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 #else
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1128,20 +1128,20 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1160,20 +1160,20 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -1181,7 +1181,7 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 #else
 	prefetch(0, mem(rdx, r10, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1192,19 +1192,19 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1225,37 +1225,37 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
@@ -1266,42 +1266,42 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
@@ -1312,24 +1312,24 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 	vmulpd(ymm0, ymm11, ymm11)
 	vmulpd(ymm0, ymm12, ymm12)
 	vmulpd(ymm0, ymm13, ymm13)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -1338,52 +1338,52 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7)
 	vmovupd(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9)
 	vmovupd(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
 	vmovupd(ymm10, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11)
 	vmovupd(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
 	vmovupd(ymm12, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13)
 	vmovupd(ymm13, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -1468,46 +1468,46 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm6, mem(rcx, 0*32))
 	vmovupd(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm8, mem(rcx, 0*32))
 	vmovupd(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm10, mem(rcx, 0*32))
 	vmovupd(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm12, mem(rcx, 0*32))
 	vmovupd(ymm13, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -1568,9 +1568,9 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 
 	//lea(mem(rdx, rsi, 4), rdx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
 
 
@@ -1589,8 +1589,8 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 
 
 	label(.DRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1707,8 +1707,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1740,13 +1740,13 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
 	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -1842,19 +1842,19 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 	lea(mem(rdx, r8,  8), rdx)         // from next upanel of b.
 	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1862,7 +1862,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 #else
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1873,7 +1873,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1882,7 +1882,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1901,7 +1901,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1909,8 +1909,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -1918,7 +1918,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 #else
 	prefetch(0, mem(rdx, r10, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1929,7 +1929,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1937,7 +1937,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1958,7 +1958,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1966,25 +1966,25 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
@@ -1995,14 +1995,14 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -2010,23 +2010,23 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
@@ -2035,24 +2035,24 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 	vmulpd(ymm0, ymm9, ymm9)
 	vmulpd(ymm0, ymm10, ymm10)
 	vmulpd(ymm0, ymm11, ymm11)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2061,44 +2061,44 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7)
 	vmovupd(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9)
 	vmovupd(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
 	vmovupd(ymm10, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11)
 	vmovupd(ymm11, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2153,21 +2153,21 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
@@ -2175,16 +2175,16 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 	vmovupd(ymm6, mem(rcx, 0*32))
 	vmovupd(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm8, mem(rcx, 0*32))
 	vmovupd(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm10, mem(rcx, 0*32))
 	vmovupd(ymm11, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2225,9 +2225,9 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 
 	//lea(mem(rcx, rsi, 4), rcx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
 
 
@@ -2246,8 +2246,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 
 
 	label(.DRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2355,8 +2355,8 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2388,13 +2388,13 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
 	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
+
 	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -2493,19 +2493,19 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 	lea(mem(rdx, r8,  8), rdx)         // from next upanel of b.
 	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -2513,7 +2513,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 #else
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -2524,13 +2524,13 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2549,13 +2549,13 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -2563,7 +2563,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 #else
 	prefetch(0, mem(rdx, r10, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -2574,12 +2574,12 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2600,30 +2600,30 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
@@ -2634,59 +2634,59 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
 	vmulpd(ymm0, ymm7, ymm7)
 	vmulpd(ymm0, ymm8, ymm8)
 	vmulpd(ymm0, ymm9, ymm9)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2695,36 +2695,36 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7)
 	vmovupd(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9)
 	vmovupd(ymm9, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2811,21 +2811,21 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
@@ -2833,12 +2833,12 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 	vmovupd(ymm6, mem(rcx, 0*32))
 	vmovupd(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm8, mem(rcx, 0*32))
 	vmovupd(ymm9, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2903,9 +2903,9 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 
 	//lea(mem(rdx, rsi, 4), rdx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
 
 
@@ -2924,8 +2924,8 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 
 
 	label(.DRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -3033,8 +3033,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -3066,13 +3066,13 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
 	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
+
 	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -3162,19 +3162,19 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 	lea(mem(rdx, r8,  8), rdx)         // from next upanel of b.
 	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -3182,7 +3182,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 #else
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -3195,7 +3195,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -3215,8 +3215,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -3224,7 +3224,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 #else
 	prefetch(0, mem(rdx, r10, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -3236,7 +3236,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -3258,25 +3258,25 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
@@ -3287,7 +3287,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -3295,45 +3295,45 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
 	vmulpd(ymm0, ymm7, ymm7)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -3342,28 +3342,28 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7)
 	vmovupd(ymm7, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -3406,21 +3406,21 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
@@ -3428,8 +3428,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 	vmovupd(ymm6, mem(rcx, 0*32))
 	vmovupd(ymm7, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -3462,9 +3462,9 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 
 	//lea(mem(rcx, rsi, 4), rcx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
 
 
@@ -3483,8 +3483,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 
 
 	label(.DRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -3592,8 +3592,8 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -3625,13 +3625,13 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
 	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
+
 	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -3718,19 +3718,19 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
 	lea(mem(rdx, r8,  8), rdx)         // from next upanel of b.
 	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -3738,7 +3738,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
 #else
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -3748,7 +3748,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 1
@@ -3765,8 +3765,8 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
@@ -3774,7 +3774,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
 #else
 	prefetch(0, mem(rdx, r10, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -3783,7 +3783,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -3802,25 +3802,25 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
@@ -3831,48 +3831,48 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
 	vmovupd(mem(rbx,  0*32), ymm0)
 	vmovupd(mem(rbx,  1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -3881,20 +3881,20 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -3937,26 +3937,26 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(ymm5, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -3985,9 +3985,9 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
 
 	//lea(mem(rcx, rsi, 4), rcx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
 
 
@@ -4006,8 +4006,8 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
 
 
 	label(.DRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
index 426e5157e..aacfd8d1f 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
@@ -40,20 +40,20 @@
 
 /*
    rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
+	 --------        ------        --------
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+	 --------        ------            :
 
    rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
+	 --------        | | | |       --------
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+	 --------        | | | |           :
 
    Assumptions:
    - B is row-stored;
@@ -69,12 +69,12 @@
    cost of the in-register transpose).
 
    crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+	 | | | | | | | |       ------            :
 */
 
 // Prototype reference microkernels.
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	uint64_t n_left = n0 % 16;
@@ -249,15 +249,15 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
 	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -369,19 +369,19 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 	lea(mem(rdx, r8,  2), rdx)         // from next upanel of a.
 	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -389,7 +389,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 #else
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -400,14 +400,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -416,7 +416,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -435,14 +435,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -450,8 +450,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -459,7 +459,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 #else
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -470,14 +470,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -485,7 +485,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -506,14 +506,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -521,50 +521,50 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -572,23 +572,23 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm5, ymm5)
 	vmulps(ymm0, ymm6, ymm6)
@@ -601,26 +601,26 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 	vmulps(ymm0, ymm13, ymm13)
 	vmulps(ymm0, ymm14, ymm14)
 	vmulps(ymm0, ymm15, ymm15)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -629,60 +629,60 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5)
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
 	vmovups(ymm6, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7)
 	vmovups(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
 	vmovups(ymm8, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9)
 	vmovups(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
 	vmovups(ymm10, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11)
 	vmovups(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12)
 	vmovups(ymm12, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13)
 	vmovups(ymm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14)
 	vmovups(ymm14, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm15)
 	vmovups(ymm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -828,51 +828,51 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(ymm6, mem(rcx, 0*32))
 	vmovups(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm8, mem(rcx, 0*32))
 	vmovups(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm10, mem(rcx, 0*32))
 	vmovups(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm12, mem(rcx, 0*32))
 	vmovups(ymm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm14, mem(rcx, 0*32))
 	vmovups(ymm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -977,9 +977,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
 
 
@@ -1000,8 +1000,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 
 
 	label(.SRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1129,8 +1129,8 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1160,15 +1160,15 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
 	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -1275,19 +1275,19 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 	lea(mem(rdx, r8,  2), rdx)         // from next upanel of a.
 	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1295,7 +1295,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 #else
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1306,14 +1306,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1322,7 +1322,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1341,14 +1341,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1356,8 +1356,8 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -1365,7 +1365,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 #else
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1376,14 +1376,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1391,7 +1391,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1412,14 +1412,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1427,50 +1427,50 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1478,23 +1478,23 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(ymm0, ymm6, ymm6)
@@ -1507,26 +1507,26 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 	vmulps(xmm0, xmm13, xmm13)
 	vmulps(ymm0, ymm14, ymm14)
 	vmulps(xmm0, xmm15, xmm15)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -1535,60 +1535,60 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
 	vmovups(ymm6, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7)
 	vmovups(xmm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
 	vmovups(ymm8, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9)
 	vmovups(xmm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
 	vmovups(ymm10, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm11)
 	vmovups(xmm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12)
 	vmovups(ymm12, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm13)
 	vmovups(xmm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14)
 	vmovups(ymm14, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm15)
 	vmovups(xmm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1711,51 +1711,51 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(ymm6, mem(rcx, 0*32))
 	vmovups(xmm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm8, mem(rcx, 0*32))
 	vmovups(xmm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm10, mem(rcx, 0*32))
 	vmovups(xmm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm12, mem(rcx, 0*32))
 	vmovups(xmm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm14, mem(rcx, 0*32))
 	vmovups(xmm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1844,9 +1844,9 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
 
 
@@ -1867,8 +1867,8 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 
 
 	label(.SRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1996,8 +1996,8 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2027,15 +2027,15 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
 	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -2131,19 +2131,19 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 	lea(mem(rdx, r8,  2), rdx)         // from next upanel of a.
 	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -2151,7 +2151,7 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 #else
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -2159,19 +2159,19 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2187,19 +2187,19 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -2207,7 +2207,7 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 #else
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -2215,18 +2215,18 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2244,104 +2244,104 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm6, ymm6)
 	vmulps(ymm0, ymm8, ymm8)
 	vmulps(ymm0, ymm10, ymm10)
 	vmulps(ymm0, ymm12, ymm12)
 	vmulps(ymm0, ymm14, ymm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2350,42 +2350,42 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
 	vmovups(ymm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
 	vmovups(ymm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
 	vmovups(ymm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12)
 	vmovups(ymm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14)
 	vmovups(ymm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2463,45 +2463,45 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(ymm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2557,9 +2557,9 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
 
 
@@ -2580,8 +2580,8 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 
 
 	label(.SRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2709,8 +2709,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2740,15 +2740,15 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
 	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -2842,19 +2842,19 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 	lea(mem(rdx, r8,  2), rdx)         // from next upanel of a.
 	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -2862,7 +2862,7 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 #else
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
@@ -2872,19 +2872,19 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2902,19 +2902,19 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -2922,7 +2922,7 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 #else
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
@@ -2932,18 +2932,18 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2963,43 +2963,43 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
@@ -3009,60 +3009,60 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm6, ymm6)
 	vmulps(ymm0, ymm8, ymm8)
 	vmulps(ymm0, ymm10, ymm10)
 	vmulps(ymm0, ymm12, ymm12)
 	vmulps(ymm0, ymm14, ymm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set xmm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -3071,12 +3071,12 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm4, xmm5)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx, 0*4))
@@ -3086,8 +3086,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 	vmovsd(xmm5, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm6, xmm7)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx, 0*4))
@@ -3097,8 +3097,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 	vmovsd(xmm7, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm8, xmm9)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8)
 	vmovups(xmm8, mem(rcx, 0*4))
@@ -3108,8 +3108,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 	vmovsd(xmm9, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm10, xmm11)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm10)
 	vmovups(xmm10, mem(rcx, 0*4))
@@ -3119,8 +3119,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 	vmovsd(xmm11, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm12, xmm13)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm12)
 	vmovups(xmm12, mem(rcx, 0*4))
@@ -3130,8 +3130,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 	vmovsd(xmm13, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm14, xmm15)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm14)
 	vmovups(xmm14, mem(rcx, 0*4))
@@ -3141,8 +3141,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 	vmovsd(xmm15, mem(rcx, 4*4))
 
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -3207,57 +3207,57 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm4, xmm5)
 	vmovups(xmm4, mem(rcx, 0*4))
 	vmovsd(xmm5, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
+
 
 	vextractf128(imm(0x1), ymm6, xmm7)
 	vmovups(xmm6, mem(rcx, 0*4))
 	vmovsd(xmm7, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm8, xmm9)
 	vmovups(xmm8, mem(rcx, 0*4))
 	vmovsd(xmm9, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm10, xmm11)
 	vmovups(xmm10, mem(rcx, 0*4))
 	vmovsd(xmm11, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm12, xmm13)
 	vmovups(xmm12, mem(rcx, 0*4))
 	vmovsd(xmm13, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm14, xmm15)
 	vmovups(xmm14, mem(rcx, 0*4))
 	vmovsd(xmm15, mem(rcx, 4*4))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -3305,7 +3305,7 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
+
 	label(.SDONE)
 
 
@@ -3326,8 +3326,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 
 
 	label(.SRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -3455,8 +3455,8 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -3486,15 +3486,15 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
 	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -3585,19 +3585,19 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 	lea(mem(rdx, r8,  2), rdx)         // from next upanel of a.
 	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -3605,7 +3605,7 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 #else
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -3613,19 +3613,19 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -3641,19 +3641,19 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -3661,7 +3661,7 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 #else
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -3669,18 +3669,18 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -3698,104 +3698,104 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), xmm2)
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
 	vmulps(xmm0, xmm8, xmm8)
 	vmulps(xmm0, xmm10, xmm10)
 	vmulps(xmm0, xmm12, xmm12)
 	vmulps(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	//lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -3804,42 +3804,42 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
 	vmovups(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10)
 	vmovups(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12)
 	vmovups(xmm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14)
 	vmovups(xmm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -3893,45 +3893,45 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(xmm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(xmm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -3972,9 +3972,9 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
 
 
@@ -3995,8 +3995,8 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 
 
 	label(.SRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -4124,8 +4124,8 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -4155,15 +4155,15 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
 	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -4252,19 +4252,19 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 	lea(mem(rdx, r8,  2), rdx)         // from next upanel of a.
 	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -4272,7 +4272,7 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 #else
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -4280,19 +4280,19 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -4308,19 +4308,19 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -4328,7 +4328,7 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 #else
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -4336,18 +4336,18 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -4365,104 +4365,104 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), xmm2)
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
 	vmulps(xmm0, xmm8, xmm8)
 	vmulps(xmm0, xmm10, xmm10)
 	vmulps(xmm0, xmm12, xmm12)
 	vmulps(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	//lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -4471,42 +4471,42 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
 	vmovsd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
 	vmovsd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10)
 	vmovsd(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12)
 	vmovsd(xmm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14)
 	vmovsd(xmm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -4541,45 +4541,45 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovsd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovsd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovsd(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovsd(xmm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovsd(xmm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -4606,9 +4606,9 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
 
 
@@ -4629,8 +4629,8 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 
 
 	label(.SRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c
index 7463707cc..da768ebf1 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c
@@ -40,20 +40,20 @@
 
 /*
    rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
+	 --------        ------        --------
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+	 --------        ------            :
 
    rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
+	 --------        | | | |       --------
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+	 --------        | | | |           :
 
    Assumptions:
    - B is row-stored;
@@ -69,12 +69,12 @@
    cost of the in-register transpose).
 
    crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+	 | | | | | | | |       ------            :
 */
 
 // Prototype reference microkernels.
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	uint64_t m_left = m0 % 6;
@@ -154,14 +154,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 		}
 #endif
 
-		sgemmsup_ker_ft ker_fps[6] = 
+		sgemmsup_ker_ft ker_fps[6] =
 		{
 		  NULL,
 		  bli_sgemmsup_rv_haswell_asm_1x16n,
 		  bli_sgemmsup_rv_haswell_asm_2x16n,
 		  bli_sgemmsup_rv_haswell_asm_3x16n,
 		  bli_sgemmsup_rv_haswell_asm_4x16n,
-		  bli_sgemmsup_rv_haswell_asm_5x16n 
+		  bli_sgemmsup_rv_haswell_asm_5x16n
 		};
 
 		sgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
@@ -203,15 +203,15 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
 	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -322,19 +322,19 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 	lea(mem(rdx, r8,  8), rdx)         // from next upanel of b.
 	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -353,14 +353,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -369,7 +369,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -388,14 +388,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -403,8 +403,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -423,14 +423,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -438,7 +438,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -459,14 +459,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -474,25 +474,25 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
@@ -503,21 +503,21 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -525,23 +525,23 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm5, ymm5)
 	vmulps(ymm0, ymm6, ymm6)
@@ -554,26 +554,26 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 	vmulps(ymm0, ymm13, ymm13)
 	vmulps(ymm0, ymm14, ymm14)
 	vmulps(ymm0, ymm15, ymm15)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -582,60 +582,60 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5)
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
 	vmovups(ymm6, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7)
 	vmovups(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
 	vmovups(ymm8, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9)
 	vmovups(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
 	vmovups(ymm10, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11)
 	vmovups(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12)
 	vmovups(ymm12, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13)
 	vmovups(ymm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14)
 	vmovups(ymm14, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm15)
 	vmovups(ymm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -781,51 +781,51 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(ymm6, mem(rcx, 0*32))
 	vmovups(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm8, mem(rcx, 0*32))
 	vmovups(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm10, mem(rcx, 0*32))
 	vmovups(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm12, mem(rcx, 0*32))
 	vmovups(ymm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm14, mem(rcx, 0*32))
 	vmovups(ymm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -930,9 +930,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
 
 
@@ -952,8 +952,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 
 
 	label(.SRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1094,8 +1094,8 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1125,15 +1125,15 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
 	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -1241,19 +1241,19 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
 	lea(mem(rdx, r8,  8), rdx)         // from next upanel of b.
 	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1272,20 +1272,20 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1304,20 +1304,20 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -1336,19 +1336,19 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1369,37 +1369,37 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
@@ -1410,42 +1410,42 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm5, ymm5)
 	vmulps(ymm0, ymm6, ymm6)
@@ -1456,26 +1456,26 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
 	vmulps(ymm0, ymm11, ymm11)
 	vmulps(ymm0, ymm12, ymm12)
 	vmulps(ymm0, ymm13, ymm13)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -1484,52 +1484,52 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5)
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
 	vmovups(ymm6, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7)
 	vmovups(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
 	vmovups(ymm8, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9)
 	vmovups(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
 	vmovups(ymm10, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11)
 	vmovups(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12)
 	vmovups(ymm12, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13)
 	vmovups(ymm13, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1701,46 +1701,46 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(ymm6, mem(rcx, 0*32))
 	vmovups(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm8, mem(rcx, 0*32))
 	vmovups(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm10, mem(rcx, 0*32))
 	vmovups(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm12, mem(rcx, 0*32))
 	vmovups(ymm13, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1862,9 +1862,9 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
 
 
@@ -1884,8 +1884,8 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
 
 
 	label(.SRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2026,8 +2026,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2057,15 +2057,15 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
 	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -2170,19 +2170,19 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 	lea(mem(rdx, r8,  8), rdx)         // from next upanel of b.
 	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -2201,7 +2201,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -2210,7 +2210,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2229,7 +2229,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -2237,8 +2237,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -2257,7 +2257,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -2265,7 +2265,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2286,7 +2286,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -2294,25 +2294,25 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
@@ -2323,14 +2323,14 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -2338,23 +2338,23 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm5, ymm5)
 	vmulps(ymm0, ymm6, ymm6)
@@ -2363,26 +2363,26 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 	vmulps(ymm0, ymm9, ymm9)
 	vmulps(ymm0, ymm10, ymm10)
 	vmulps(ymm0, ymm11, ymm11)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2391,44 +2391,44 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5)
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
 	vmovups(ymm6, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7)
 	vmovups(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
 	vmovups(ymm8, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9)
 	vmovups(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
 	vmovups(ymm10, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11)
 	vmovups(ymm11, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2516,41 +2516,41 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(ymm6, mem(rcx, 0*32))
 	vmovups(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm8, mem(rcx, 0*32))
 	vmovups(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm10, mem(rcx, 0*32))
 	vmovups(ymm11, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2620,9 +2620,9 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
 
 
@@ -2642,8 +2642,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 
 
 	label(.SRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2784,8 +2784,8 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2815,15 +2815,15 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
 	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)
-	
+
 	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -2931,19 +2931,19 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
 	lea(mem(rdx, r8,  8), rdx)         // from next upanel of b.
 	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -2962,13 +2962,13 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2987,13 +2987,13 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -3012,12 +3012,12 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -3038,30 +3038,30 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
@@ -3072,61 +3072,61 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm5, ymm5)
 	vmulps(ymm0, ymm6, ymm6)
 	vmulps(ymm0, ymm7, ymm7)
 	vmulps(ymm0, ymm8, ymm8)
 	vmulps(ymm0, ymm9, ymm9)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -3135,36 +3135,36 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5)
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
 	vmovups(ymm6, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7)
 	vmovups(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
 	vmovups(ymm8, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9)
 	vmovups(ymm9, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -3316,36 +3316,36 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(ymm6, mem(rcx, 0*32))
 	vmovups(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm8, mem(rcx, 0*32))
 	vmovups(ymm9, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -3439,9 +3439,9 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
 
 	//lea(mem(rcx, rsi, 4), rcx)
 
-	
-	
-	
+
+
+
 	label(.SDONE)
 
 
@@ -3461,8 +3461,8 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
 
 
 	label(.SRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -3603,8 +3603,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -3634,15 +3634,15 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
 	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)
-	
+
 	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -3741,19 +3741,19 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 	lea(mem(rdx, r8,  8), rdx)         // from next upanel of b.
 	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -3774,7 +3774,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -3794,8 +3794,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -3815,7 +3815,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -3837,25 +3837,25 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
@@ -3866,7 +3866,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -3874,47 +3874,47 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm5, ymm5)
 	vmulps(ymm0, ymm6, ymm6)
 	vmulps(ymm0, ymm7, ymm7)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -3923,28 +3923,28 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5)
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
 	vmovups(ymm6, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7)
 	vmovups(ymm7, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -4012,21 +4012,21 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
@@ -4034,8 +4034,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 	vmovups(ymm6, mem(rcx, 0*32))
 	vmovups(ymm7, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -4077,9 +4077,9 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
 
 
@@ -4099,8 +4099,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 
 
 	label(.SRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -4241,8 +4241,8 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -4272,15 +4272,15 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
 	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)
-	
+
 	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -4376,19 +4376,19 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
 	lea(mem(rdx, r8,  8), rdx)         // from next upanel of b.
 	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -4406,7 +4406,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -4423,8 +4423,8 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 0
@@ -4441,7 +4441,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -4460,25 +4460,25 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 1
@@ -4489,50 +4489,50 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -4541,20 +4541,20 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5)
 	vmovups(ymm5, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -4648,26 +4648,26 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(ymm5, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -4728,9 +4728,9 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
 
 
@@ -4750,8 +4750,8 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
 
 
 	label(.SRETURN)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c
index 69d543a99..67b3ec8bf 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c
@@ -40,20 +40,20 @@
 
 /*
    rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
+	 --------        ------        --------
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+	 --------        ------            :
 
    rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
+	 --------        | | | |       --------
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+	 --------        | | | |           :
 
    Assumptions:
    - B is row-stored;
@@ -69,12 +69,12 @@
    cost of the in-register transpose).
 
    crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+	 | | | | | | | |       ------            :
 */
 
 // Prototype reference microkernels.
@@ -104,8 +104,8 @@ void PASTEMAC(ch,opname) \
        ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx \
+       auxinfo_t*          data, \
+       cntx_t*             cntx \
      ) \
 { \
 	for ( dim_t i = 0; i < mdim; ++i ) \
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
index 457ef9f22..929f9ea47 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
@@ -72,8 +72,8 @@ void bli_dgemmsup_rd_haswell_asm_6x1
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -586,8 +586,8 @@ void bli_dgemmsup_rd_haswell_asm_3x1
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -995,8 +995,8 @@ void bli_dgemmsup_rd_haswell_asm_2x1
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1369,8 +1369,8 @@ void bli_dgemmsup_rd_haswell_asm_1x1
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c
index af498eb0e..397d932e4 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c
@@ -72,8 +72,8 @@ void bli_dgemmsup_rd_haswell_asm_6x2
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -99,9 +99,9 @@ void bli_dgemmsup_rd_haswell_asm_6x2
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -119,7 +119,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2
 
 	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), rcx)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -169,19 +169,19 @@ void bli_dgemmsup_rd_haswell_asm_6x2
 	prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c
 	prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c
 #endif
-	
 
-	
-	
+
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -219,7 +219,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovupd(mem(rbx        ), ymm0)
@@ -253,7 +253,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -321,27 +321,27 @@ void bli_dgemmsup_rd_haswell_asm_6x2
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -377,21 +377,21 @@ void bli_dgemmsup_rd_haswell_asm_6x2
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -399,7 +399,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rbx        ), xmm0)
 	vmovsd(mem(rbx, r11, 1), xmm1)
 	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
@@ -429,12 +429,12 @@ void bli_dgemmsup_rd_haswell_asm_6x2
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
@@ -447,7 +447,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2
 	                                   // ymm10 ymm11
 	                                   // ymm12 ymm13
 	                                   // ymm14 ymm15
-	
+
 	vhaddpd( ymm5, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm4 )
@@ -480,7 +480,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2
 	                                   // xmm14[0:1] = sum(ymm14) sum(ymm15)
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
 
@@ -488,103 +488,103 @@ void bli_dgemmsup_rd_haswell_asm_6x2
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
 	vmulpd(xmm0, xmm6,  xmm6)
 	vmulpd(xmm0, xmm8,  xmm8)
 	vmulpd(xmm0, xmm10, xmm10)
 	vmulpd(xmm0, xmm12, xmm12)
 	vmulpd(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), xmm3, xmm4)
 	vmovupd(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), xmm3, xmm6)
 	vmovupd(xmm6, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), xmm3, xmm8)
 	vmovupd(xmm8, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), xmm3, xmm10)
 	vmovupd(xmm10, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), xmm3, xmm12)
 	vmovupd(xmm12, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), xmm3, xmm14)
 	vmovupd(xmm14, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(xmm6, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(xmm8, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(xmm10, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(xmm12, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(xmm14, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 
 	label(.DDONE)
-	
+
 
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -628,8 +628,8 @@ void bli_dgemmsup_rd_haswell_asm_3x2
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -655,9 +655,9 @@ void bli_dgemmsup_rd_haswell_asm_3x2
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -675,7 +675,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2
 
 	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), rcx)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -716,19 +716,19 @@ void bli_dgemmsup_rd_haswell_asm_3x2
 	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
 	prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
 #endif
-	
 
-	
-	
+
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -754,7 +754,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2
 	vfmadd231pd(ymm0, ymm3, ymm8)
 	vfmadd231pd(ymm1, ymm3, ymm9)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovupd(mem(rbx        ), ymm0)
@@ -776,7 +776,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -820,27 +820,27 @@ void bli_dgemmsup_rd_haswell_asm_3x2
 	vfmadd231pd(ymm0, ymm3, ymm8)
 	vfmadd231pd(ymm1, ymm3, ymm9)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -864,21 +864,21 @@ void bli_dgemmsup_rd_haswell_asm_3x2
 	vfmadd231pd(ymm0, ymm3, ymm8)
 	vfmadd231pd(ymm1, ymm3, ymm9)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -886,7 +886,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rbx        ), xmm0)
 	vmovsd(mem(rbx, r11, 1), xmm1)
 	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
@@ -904,12 +904,12 @@ void bli_dgemmsup_rd_haswell_asm_3x2
 	vfmadd231pd(ymm0, ymm3, ymm8)
 	vfmadd231pd(ymm1, ymm3, ymm9)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
@@ -919,7 +919,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2
 	                                   // ymm4  ymm5
 	                                   // ymm6  ymm7
 	                                   // ymm8  ymm9
-	
+
 	vhaddpd( ymm5, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm4 )
@@ -937,7 +937,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2
 	                                   // xmm8[0:1]  = sum(ymm8)  sum(ymm9)
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
 
@@ -945,79 +945,79 @@ void bli_dgemmsup_rd_haswell_asm_3x2
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
 	vmulpd(xmm0, xmm6,  xmm6)
 	vmulpd(xmm0, xmm8,  xmm8)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), xmm3, xmm4)
 	vmovupd(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), xmm3, xmm6)
 	vmovupd(xmm6, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), xmm3, xmm8)
 	vmovupd(xmm8, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(xmm6, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(xmm8, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 
 	label(.DDONE)
-	
+
 
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1061,8 +1061,8 @@ void bli_dgemmsup_rd_haswell_asm_2x2
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1088,9 +1088,9 @@ void bli_dgemmsup_rd_haswell_asm_2x2
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1108,7 +1108,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2
 
 	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), rcx)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -1146,19 +1146,19 @@ void bli_dgemmsup_rd_haswell_asm_2x2
 	prefetch(0, mem(rcx,         1*8)) // prefetch c + 0*rs_c
 	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
 #endif
-	
 
-	
-	
+
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1180,7 +1180,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovupd(mem(rbx        ), ymm0)
@@ -1198,7 +1198,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1234,27 +1234,27 @@ void bli_dgemmsup_rd_haswell_asm_2x2
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1274,21 +1274,21 @@ void bli_dgemmsup_rd_haswell_asm_2x2
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1296,7 +1296,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rbx        ), xmm0)
 	vmovsd(mem(rbx, r11, 1), xmm1)
 	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
@@ -1310,12 +1310,12 @@ void bli_dgemmsup_rd_haswell_asm_2x2
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
@@ -1324,7 +1324,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2
 
 	                                   // ymm4  ymm5
 	                                   // ymm6  ymm7
-	
+
 	vhaddpd( ymm5, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm4 )
@@ -1337,7 +1337,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2
 	                                   // xmm6[0:1]  = sum(ymm6)  sum(ymm7)
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
 
@@ -1345,71 +1345,71 @@ void bli_dgemmsup_rd_haswell_asm_2x2
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
 	vmulpd(xmm0, xmm6,  xmm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), xmm3, xmm4)
 	vmovupd(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), xmm3, xmm6)
 	vmovupd(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 
 	label(.DDONE)
-	
+
 
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1453,8 +1453,8 @@ void bli_dgemmsup_rd_haswell_asm_1x2
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1480,9 +1480,9 @@ void bli_dgemmsup_rd_haswell_asm_1x2
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1500,7 +1500,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2
 
 	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), rcx)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -1535,19 +1535,19 @@ void bli_dgemmsup_rd_haswell_asm_1x2
 	//lea(mem(r10, rdi, 1), r10)         // rdx = c + 3*rs_c;
 	prefetch(0, mem(rcx,         1*8)) // prefetch c + 0*rs_c
 #endif
-	
 
-	
-	
+
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1565,7 +1565,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovupd(mem(rbx        ), ymm0)
@@ -1579,7 +1579,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1607,27 +1607,27 @@ void bli_dgemmsup_rd_haswell_asm_1x2
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1643,21 +1643,21 @@ void bli_dgemmsup_rd_haswell_asm_1x2
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1665,7 +1665,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rbx        ), xmm0)
 	vmovsd(mem(rbx, r11, 1), xmm1)
 	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
@@ -1675,12 +1675,12 @@ void bli_dgemmsup_rd_haswell_asm_1x2
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
@@ -1688,7 +1688,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2
 	label(.DPOSTACCUM)
 
 	                                   // ymm4  ymm5
-	
+
 	vhaddpd( ymm5, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm4 )
@@ -1696,7 +1696,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2
 	                                   // xmm4[0:1]  = sum(ymm4)  sum(ymm5)
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
 
@@ -1704,63 +1704,63 @@ void bli_dgemmsup_rd_haswell_asm_1x2
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), xmm3, xmm4)
 	vmovupd(xmm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(xmm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 
 	label(.DDONE)
-	
+
 
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
index 516bfced5..75e84650c 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
@@ -72,8 +72,8 @@ void bli_dgemmsup_rd_haswell_asm_6x4
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -605,8 +605,8 @@ void bli_dgemmsup_rd_haswell_asm_2x4
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1055,8 +1055,8 @@ void bli_dgemmsup_rd_haswell_asm_1x4
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c
index 571444bed..b2e3d83af 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c
@@ -72,8 +72,8 @@ void bli_dgemmsup_rd_haswell_asm_6x8
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	uint64_t n_left = n0 % 8;
@@ -163,7 +163,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -181,7 +181,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	//mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -256,18 +256,18 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 #endif
 	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 
-	
 
-	
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -302,7 +302,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovupd(mem(rax       ), ymm0)
@@ -333,7 +333,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -395,27 +395,27 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -426,7 +426,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 	vmovupd(mem(rax, r8, 1), ymm1)
 	vmovupd(mem(rax, r8, 2), ymm2)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
+
 	vmovupd(mem(rbx        ), ymm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -448,21 +448,21 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -470,12 +470,12 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rax       ), xmm0)
 	vmovsd(mem(rax, r8, 1), xmm1)
 	vmovsd(mem(rax, r8, 2), xmm2)
 	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
+
 	vmovsd(mem(rbx        ), xmm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -497,23 +497,23 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.DPOSTACCUM)
 
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
+
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddpd( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm0 )
@@ -553,7 +553,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 	                                   // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15)
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
 
@@ -561,73 +561,73 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
 
 	lea(mem(r12, rdi, 2), r12)         //
@@ -651,7 +651,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -735,8 +735,8 @@ void bli_dgemmsup_rd_haswell_asm_2x8
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -761,7 +761,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -779,7 +779,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -833,18 +833,18 @@ void bli_dgemmsup_rd_haswell_asm_2x8
 	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
 #endif
 
-	
-	
-	
+
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -874,7 +874,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8
 	vfmadd231pd(ymm0, ymm3, ymm13)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovupd(mem(rax       ), ymm0)
@@ -900,7 +900,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -952,27 +952,27 @@ void bli_dgemmsup_rd_haswell_asm_2x8
 	vfmadd231pd(ymm0, ymm3, ymm13)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -982,7 +982,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
+
 	vmovupd(mem(rbx        ), ymm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -1000,21 +1000,21 @@ void bli_dgemmsup_rd_haswell_asm_2x8
 	vfmadd231pd(ymm0, ymm3, ymm13)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1022,11 +1022,11 @@ void bli_dgemmsup_rd_haswell_asm_2x8
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rax       ), xmm0)
 	vmovsd(mem(rax, r8, 1), xmm1)
 	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
+
 	vmovsd(mem(rbx        ), xmm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -1044,22 +1044,22 @@ void bli_dgemmsup_rd_haswell_asm_2x8
 	vfmadd231pd(ymm0, ymm3, ymm13)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.DPOSTACCUM)
 
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
+
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
-	
+
 	vhaddpd( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm0 )
@@ -1090,70 +1090,70 @@ void bli_dgemmsup_rd_haswell_asm_2x8
 
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm5, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
 
 	add(imm(4), r15)                   // jj += 4;
@@ -1165,7 +1165,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1209,8 +1209,8 @@ void bli_dgemmsup_rd_haswell_asm_1x8
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1235,7 +1235,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	//mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1253,7 +1253,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	//mov(var(rs_c), rdi)                // load rs_c
@@ -1302,18 +1302,18 @@ void bli_dgemmsup_rd_haswell_asm_1x8
 	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
 #endif
 
-	
-	
-	
+
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1338,7 +1338,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8
 	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
 	vfmadd231pd(ymm0, ymm3, ymm13)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovupd(mem(rax       ), ymm0)
@@ -1359,7 +1359,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1401,27 +1401,27 @@ void bli_dgemmsup_rd_haswell_asm_1x8
 	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
 	vfmadd231pd(ymm0, ymm3, ymm13)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1430,7 +1430,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8
 
 	vmovupd(mem(rax       ), ymm0)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
+
 	vmovupd(mem(rbx        ), ymm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 
@@ -1444,21 +1444,21 @@ void bli_dgemmsup_rd_haswell_asm_1x8
 	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
 	vfmadd231pd(ymm0, ymm3, ymm13)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1466,10 +1466,10 @@ void bli_dgemmsup_rd_haswell_asm_1x8
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rax       ), xmm0)
 	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
+
 	vmovsd(mem(rbx        ), xmm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 
@@ -1483,12 +1483,12 @@ void bli_dgemmsup_rd_haswell_asm_1x8
 	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
 	vfmadd231pd(ymm0, ymm3, ymm13)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
@@ -1496,9 +1496,9 @@ void bli_dgemmsup_rd_haswell_asm_1x8
 	label(.DPOSTACCUM)
 
 
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	
+
+	                                   // ymm4  ymm7  ymm10 ymm13
+
 	vhaddpd( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm0 )
@@ -1513,7 +1513,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -1521,57 +1521,57 @@ void bli_dgemmsup_rd_haswell_asm_1x8
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
 
 	add(imm(4), r15)                   // jj += 4;
@@ -1583,7 +1583,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c
index eb1118196..5843d5e40 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c
@@ -40,20 +40,20 @@
 
 /*
    rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
+	 --------        ------        --------
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+	 --------        ------            :
 
    rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
+	 --------        | | | |       --------
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+	 --------        | | | |           :
 
    Assumptions:
    - B is row-stored;
@@ -69,12 +69,12 @@
    cost of the in-register transpose).
 
    crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+	 | | | | | | | |       ------            :
 */
 
 // Prototype reference microkernels.
@@ -93,8 +93,8 @@ void bli_dgemmsup_rv_haswell_asm_6x2
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -115,9 +115,9 @@ void bli_dgemmsup_rv_haswell_asm_6x2
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -132,7 +132,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -168,31 +168,31 @@ void bli_dgemmsup_rv_haswell_asm_6x2
 	prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
-	
-	
+
+
 #if 1
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
 
-	
-	
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -200,19 +200,19 @@ void bli_dgemmsup_rv_haswell_asm_6x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm12)
 	vfmadd231pd(xmm0, xmm3, xmm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -226,25 +226,25 @@ void bli_dgemmsup_rv_haswell_asm_6x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm12)
 	vfmadd231pd(xmm0, xmm3, xmm14)
-	
+
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -252,18 +252,18 @@ void bli_dgemmsup_rv_haswell_asm_6x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm12)
 	vfmadd231pd(xmm0, xmm3, xmm14)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -278,43 +278,43 @@ void bli_dgemmsup_rv_haswell_asm_6x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm12)
 	vfmadd231pd(xmm0, xmm3, xmm14)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -322,57 +322,57 @@ void bli_dgemmsup_rv_haswell_asm_6x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm12)
 	vfmadd231pd(xmm0, xmm3, xmm14)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulpd(xmm0, xmm6, xmm6)
 	vmulpd(xmm0, xmm8, xmm8)
 	vmulpd(xmm0, xmm10, xmm10)
 	vmulpd(xmm0, xmm12, xmm12)
 	vmulpd(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -381,42 +381,42 @@ void bli_dgemmsup_rv_haswell_asm_6x2
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4)
 	vmovupd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6)
 	vmovupd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8)
 	vmovupd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10)
 	vmovupd(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm12)
 	vmovupd(xmm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm14)
 	vmovupd(xmm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -452,40 +452,40 @@ void bli_dgemmsup_rv_haswell_asm_6x2
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(xmm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(xmm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
@@ -517,13 +517,13 @@ void bli_dgemmsup_rv_haswell_asm_6x2
 	vmovupd(xmm1, mem(rdx, rsi, 1))
 
 	//lea(mem(rdx, rsi, 4), rdx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -566,8 +566,8 @@ void bli_dgemmsup_rv_haswell_asm_5x2
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -588,9 +588,9 @@ void bli_dgemmsup_rv_haswell_asm_5x2
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -605,7 +605,7 @@ void bli_dgemmsup_rv_haswell_asm_5x2
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -646,21 +646,21 @@ void bli_dgemmsup_rv_haswell_asm_5x2
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
-	
+
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
@@ -672,17 +672,17 @@ void bli_dgemmsup_rv_haswell_asm_5x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm12)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -696,23 +696,23 @@ void bli_dgemmsup_rv_haswell_asm_5x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm12)
-	
+
 
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -720,16 +720,16 @@ void bli_dgemmsup_rv_haswell_asm_5x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm12)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -744,41 +744,41 @@ void bli_dgemmsup_rv_haswell_asm_5x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm12)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -786,54 +786,54 @@ void bli_dgemmsup_rv_haswell_asm_5x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm12)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulpd(xmm0, xmm6, xmm6)
 	vmulpd(xmm0, xmm8, xmm8)
 	vmulpd(xmm0, xmm10, xmm10)
 	vmulpd(xmm0, xmm12, xmm12)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	//lea(mem(rsi, rsi, 2), rax)         // r13 = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -842,37 +842,37 @@ void bli_dgemmsup_rv_haswell_asm_5x2
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4)
 	vmovupd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6)
 	vmovupd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8)
 	vmovupd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10)
 	vmovupd(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm12)
 	vmovupd(xmm12, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -907,37 +907,37 @@ void bli_dgemmsup_rv_haswell_asm_5x2
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(xmm12, mem(rcx, 0*32))
 	//add(rdi, rcx)
 
@@ -947,7 +947,7 @@ void bli_dgemmsup_rv_haswell_asm_5x2
 
 
 	label(.DCOLSTORBZ)
-	
+
 	                                   // begin I/O on columns 0-1
 	vunpcklpd(xmm6, xmm4, xmm0)
 	vunpckhpd(xmm6, xmm4, xmm1)
@@ -967,13 +967,13 @@ void bli_dgemmsup_rv_haswell_asm_5x2
 	vmovhpd(xmm0, mem(rdx, rsi, 1))
 
 	//lea(mem(rdx, rsi, 4), rdx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1016,8 +1016,8 @@ void bli_dgemmsup_rv_haswell_asm_4x2
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1038,9 +1038,9 @@ void bli_dgemmsup_rv_haswell_asm_4x2
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1055,7 +1055,7 @@ void bli_dgemmsup_rv_haswell_asm_4x2
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1089,31 +1089,31 @@ void bli_dgemmsup_rv_haswell_asm_4x2
 	prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
-	
+
 
 #if 1
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
-	
-	
-	
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1121,14 +1121,14 @@ void bli_dgemmsup_rv_haswell_asm_4x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1142,20 +1142,20 @@ void bli_dgemmsup_rv_haswell_asm_4x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1163,13 +1163,13 @@ void bli_dgemmsup_rv_haswell_asm_4x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1184,89 +1184,89 @@ void bli_dgemmsup_rv_haswell_asm_4x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm8)
 	vfmadd231pd(xmm0, xmm3, xmm10)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulpd(xmm0, xmm6, xmm6)
 	vmulpd(xmm0, xmm8, xmm8)
 	vmulpd(xmm0, xmm10, xmm10)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), r14)         // load address of c +  4*rs_c;
 
 	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -1275,32 +1275,32 @@ void bli_dgemmsup_rv_haswell_asm_4x2
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4)
 	vmovupd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6)
 	vmovupd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8)
 	vmovupd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10)
 	vmovupd(xmm10, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -1326,32 +1326,32 @@ void bli_dgemmsup_rv_haswell_asm_4x2
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(xmm10, mem(rcx, 0*32))
 	//add(rdi, rcx)
@@ -1362,7 +1362,7 @@ void bli_dgemmsup_rv_haswell_asm_4x2
 
 
 	label(.DCOLSTORBZ)
-	
+
 	                                   // begin I/O on columns 0-1
 	vunpcklpd(xmm6, xmm4, xmm0)
 	vunpckhpd(xmm6, xmm4, xmm1)
@@ -1375,13 +1375,13 @@ void bli_dgemmsup_rv_haswell_asm_4x2
 	vmovupd(ymm6, mem(rcx, rsi, 1))
 
 	//lea(mem(rcx, rsi, 4), rcx)
-	
 
-	
-	
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1424,8 +1424,8 @@ void bli_dgemmsup_rv_haswell_asm_3x2
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1446,9 +1446,9 @@ void bli_dgemmsup_rv_haswell_asm_3x2
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1463,7 +1463,7 @@ void bli_dgemmsup_rv_haswell_asm_3x2
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1496,31 +1496,31 @@ void bli_dgemmsup_rv_haswell_asm_3x2
 	prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
-	
-	
+
+
 #if 1
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
 
-	
-	
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1528,12 +1528,12 @@ void bli_dgemmsup_rv_haswell_asm_3x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm8)
-	
-	
+
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1547,18 +1547,18 @@ void bli_dgemmsup_rv_haswell_asm_3x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm8)
-	
+
 
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1566,11 +1566,11 @@ void bli_dgemmsup_rv_haswell_asm_3x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm8)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1585,36 +1585,36 @@ void bli_dgemmsup_rv_haswell_asm_3x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm8)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1622,61 +1622,61 @@ void bli_dgemmsup_rv_haswell_asm_3x2
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm8)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulpd(xmm0, xmm6, xmm6)
 	vmulpd(xmm0, xmm8, xmm8)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4)
 	vmovupd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
@@ -1690,10 +1690,10 @@ void bli_dgemmsup_rv_haswell_asm_3x2
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8)
 	vmovupd(xmm8, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
-	
+
 
 
 	label(.DCOLSTORED)
@@ -1722,26 +1722,26 @@ void bli_dgemmsup_rv_haswell_asm_3x2
 	vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)
 	vmovsd(xmm12, mem(rdx        ))
 	vmovsd(xmm13, mem(rdx, rsi, 1))
-	
+
 	//lea(mem(rdx, rsi, 4), rdx)
 
 
 	jmp(.DDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
 
@@ -1752,8 +1752,8 @@ void bli_dgemmsup_rv_haswell_asm_3x2
 
 	vmovupd(xmm8, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -1781,12 +1781,12 @@ void bli_dgemmsup_rv_haswell_asm_3x2
 
 	//lea(mem(rdx, rsi, 4), rdx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1829,8 +1829,8 @@ void bli_dgemmsup_rv_haswell_asm_2x2
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1851,9 +1851,9 @@ void bli_dgemmsup_rv_haswell_asm_2x2
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1868,7 +1868,7 @@ void bli_dgemmsup_rv_haswell_asm_2x2
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1900,41 +1900,41 @@ void bli_dgemmsup_rv_haswell_asm_2x2
 	prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
-	
-	
+
+
 #if 1
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
 
-	
-	
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1943,29 +1943,29 @@ void bli_dgemmsup_rv_haswell_asm_2x2
 
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1975,82 +1975,82 @@ void bli_dgemmsup_rv_haswell_asm_2x2
 
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm4)
 	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulpd(xmm0, xmm6, xmm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), r14)         // load address of c +  4*rs_c;
 
 	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2059,22 +2059,22 @@ void bli_dgemmsup_rv_haswell_asm_2x2
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4)
 	vmovupd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6)
 	vmovupd(xmm6, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2094,34 +2094,34 @@ void bli_dgemmsup_rv_haswell_asm_2x2
 
 
 	jmp(.DDONE)                        // jump to end.
-	
 
-	
-	
+
+
+
 	label(.DBETAZERO)
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 	vmovupd(xmm6, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
+
 
 	jmp(.DDONE)                        // jump to end.
 
 
 	label(.DCOLSTORBZ)
-	
+
 
 	vunpcklpd(xmm6, xmm4, xmm0)
 	vunpckhpd(xmm6, xmm4, xmm1)
@@ -2130,13 +2130,13 @@ void bli_dgemmsup_rv_haswell_asm_2x2
 	vmovupd(xmm1, mem(rcx, rsi, 1))
 
 	//lea(mem(rcx, rsi, 4), rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2179,8 +2179,8 @@ void bli_dgemmsup_rv_haswell_asm_1x2
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2201,9 +2201,9 @@ void bli_dgemmsup_rv_haswell_asm_1x2
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -2218,7 +2218,7 @@ void bli_dgemmsup_rv_haswell_asm_1x2
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -2249,31 +2249,31 @@ void bli_dgemmsup_rv_haswell_asm_1x2
 	prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
-	
-	
+
+
 #if 1
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
 
-	
-	
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -2281,7 +2281,7 @@ void bli_dgemmsup_rv_haswell_asm_1x2
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm4)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2294,21 +2294,21 @@ void bli_dgemmsup_rv_haswell_asm_1x2
 	vbroadcastsd(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm4)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
 	vbroadcastsd(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm4)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2322,92 +2322,92 @@ void bli_dgemmsup_rv_haswell_asm_1x2
 	vbroadcastsd(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm4)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
 	vbroadcastsd(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(xmm0, xmm2, xmm4)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), r14)         // load address of c +  4*rs_c;
 
 	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4)
 	vmovupd(xmm4, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2422,48 +2422,48 @@ void bli_dgemmsup_rv_haswell_asm_1x2
 
 	vmovlpd(xmm0, mem(rcx        ))
 	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	
+
 	//lea(mem(rcx, rsi, 4), rcx)
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(xmm4, mem(rcx, 0*32))
 	//add(rdi, rcx)
 
 
 	jmp(.DDONE)                        // jump to end.
-	
+
 
 
 	label(.DCOLSTORBZ)
-	
+
 	                                   // begin I/O on columns 0-1
 	vmovlpd(xmm4, mem(rcx        ))
 	vmovhpd(xmm4, mem(rcx, rsi, 1))
 
 	//lea(mem(rcx, rsi, 4), rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c
index bdcf833e3..6fb5eaf8a 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c
@@ -40,20 +40,20 @@
 
 /*
    rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
+	 --------        ------        --------
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+	 --------        ------            :
 
    rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
+	 --------        | | | |       --------
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+	 --------        | | | |           :
 
    Assumptions:
    - B is row-stored;
@@ -69,12 +69,12 @@
    cost of the in-register transpose).
 
    crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+	 | | | | | | | |       ------            :
 */
 
 // Prototype reference microkernels.
@@ -93,8 +93,8 @@ void bli_dgemmsup_rv_haswell_asm_6x4
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -115,9 +115,9 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -132,7 +132,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -170,31 +170,31 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	prefetch(0, mem(rcx, rbp, 1, 5*8)) // prefetch c + 3*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
-	
-	
+
+
 #if 1
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
 
-	
-	
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -202,19 +202,19 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -228,25 +228,25 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm0, ymm3, ymm14)
-	
+
 
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -254,18 +254,18 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm0, ymm3, ymm14)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -280,43 +280,43 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm0, ymm3, ymm14)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -324,57 +324,57 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm0, ymm3, ymm14)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm6, ymm6)
 	vmulpd(ymm0, ymm8, ymm8)
 	vmulpd(ymm0, ymm10, ymm10)
 	vmulpd(ymm0, ymm12, ymm12)
 	vmulpd(ymm0, ymm14, ymm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -383,42 +383,42 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
 	vmovupd(ymm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
 	vmovupd(ymm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14)
 	vmovupd(ymm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -466,45 +466,45 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
 
-	
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -539,13 +539,13 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	vmovupd(xmm4, mem(rdx, rax, 1))
 
 	//lea(mem(rdx, rsi, 4), rdx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -588,8 +588,8 @@ void bli_dgemmsup_rv_haswell_asm_5x4
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -610,9 +610,9 @@ void bli_dgemmsup_rv_haswell_asm_5x4
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -627,7 +627,7 @@ void bli_dgemmsup_rv_haswell_asm_5x4
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -670,19 +670,19 @@ void bli_dgemmsup_rv_haswell_asm_5x4
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -696,17 +696,17 @@ void bli_dgemmsup_rv_haswell_asm_5x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -720,16 +720,16 @@ void bli_dgemmsup_rv_haswell_asm_5x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
-	
+
 
 	// ---------------------------------- iteration 2
 
@@ -744,16 +744,16 @@ void bli_dgemmsup_rv_haswell_asm_5x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -768,41 +768,41 @@ void bli_dgemmsup_rv_haswell_asm_5x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -810,54 +810,54 @@ void bli_dgemmsup_rv_haswell_asm_5x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm6, ymm6)
 	vmulpd(ymm0, ymm8, ymm8)
 	vmulpd(ymm0, ymm10, ymm10)
 	vmulpd(ymm0, ymm12, ymm12)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -866,37 +866,37 @@ void bli_dgemmsup_rv_haswell_asm_5x4
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
 	vmovupd(ymm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
 	vmovupd(ymm12, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -943,41 +943,41 @@ void bli_dgemmsup_rv_haswell_asm_5x4
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm12, mem(rcx, 0*32))
 	//add(rdi, rcx)
 
-	
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -1010,13 +1010,13 @@ void bli_dgemmsup_rv_haswell_asm_5x4
 	vmovhpd(xmm1, mem(rdx, rax, 1))
 
 	//lea(mem(rdx, rsi, 4), rdx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1059,8 +1059,8 @@ void bli_dgemmsup_rv_haswell_asm_4x4
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1081,9 +1081,9 @@ void bli_dgemmsup_rv_haswell_asm_4x4
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1098,7 +1098,7 @@ void bli_dgemmsup_rv_haswell_asm_4x4
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1134,8 +1134,8 @@ void bli_dgemmsup_rv_haswell_asm_4x4
 	prefetch(0, mem(rcx, rbp, 1, 3*8)) // prefetch c + 3*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
-	
-		
+
+
 #if 1
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
@@ -1143,22 +1143,22 @@ void bli_dgemmsup_rv_haswell_asm_4x4
 
 
-	
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1166,14 +1166,14 @@ void bli_dgemmsup_rv_haswell_asm_4x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-	
+
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1182,39 +1182,39 @@ void bli_dgemmsup_rv_haswell_asm_4x4
 
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1224,128 +1224,128 @@ void bli_dgemmsup_rv_haswell_asm_4x4
 
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm6, ymm6)
 	vmulpd(ymm0, ymm8, ymm8)
 	vmulpd(ymm0, ymm10, ymm10)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), r14)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
 
 
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
 	vmovupd(ymm10, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -1377,33 +1377,33 @@ void bli_dgemmsup_rv_haswell_asm_4x4
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm8, mem(rcx, 0*32))
 	add(rdi, rcx)
 
-	
+
 	vmovupd(ymm10, mem(rcx, 0*32))
 	//add(rdi, rcx)
 
@@ -1413,7 +1413,7 @@ void bli_dgemmsup_rv_haswell_asm_4x4
 
 
 	label(.DCOLSTORBZ)
-	
+
 	                                   // begin I/O on columns 0-3
 	vunpcklpd(ymm6, ymm4, ymm0)
 	vunpckhpd(ymm6, ymm4, ymm1)
@@ -1431,12 +1431,12 @@ void bli_dgemmsup_rv_haswell_asm_4x4
 
 	//lea(mem(rcx, rsi, 4), rcx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1479,8 +1479,8 @@ void bli_dgemmsup_rv_haswell_asm_3x4
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1501,9 +1501,9 @@ void bli_dgemmsup_rv_haswell_asm_3x4
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1518,7 +1518,7 @@ void bli_dgemmsup_rv_haswell_asm_3x4
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1553,31 +1553,31 @@ void bli_dgemmsup_rv_haswell_asm_3x4
 	prefetch(0, mem(rcx, rbp, 1, 2*8)) // prefetch c + 3*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
-	
-	
+
+
 #if 1
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
 
-	
-	
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1585,12 +1585,12 @@ void bli_dgemmsup_rv_haswell_asm_3x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
-	
-	
+
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1604,18 +1604,18 @@ void bli_dgemmsup_rv_haswell_asm_3x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
-	
+
 
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1623,11 +1623,11 @@ void bli_dgemmsup_rv_haswell_asm_3x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1642,36 +1642,36 @@ void bli_dgemmsup_rv_haswell_asm_3x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1679,61 +1679,61 @@ void bli_dgemmsup_rv_haswell_asm_3x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm6, ymm6)
 	vmulpd(ymm0, ymm8, ymm8)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
@@ -1747,10 +1747,10 @@ void bli_dgemmsup_rv_haswell_asm_3x4
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
-	
+
 
 
 	label(.DCOLSTORED)
@@ -1791,26 +1791,26 @@ void bli_dgemmsup_rv_haswell_asm_3x4
 	vmovsd(xmm13, mem(rdx, rsi, 1))
 	vmovsd(xmm14, mem(rdx, rsi, 2))
 	vmovsd(xmm15, mem(rdx, rax, 1))
-	
+
 	//lea(mem(rdx, rsi, 4), rdx)
 
 
 	jmp(.DDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
 
@@ -1821,8 +1821,8 @@ void bli_dgemmsup_rv_haswell_asm_3x4
 
 	vmovupd(ymm8, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -1858,12 +1858,12 @@ void bli_dgemmsup_rv_haswell_asm_3x4
 
 	//lea(mem(rdx, rsi, 4), rdx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1906,8 +1906,8 @@ void bli_dgemmsup_rv_haswell_asm_2x4
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1928,9 +1928,9 @@ void bli_dgemmsup_rv_haswell_asm_2x4
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1945,7 +1945,7 @@ void bli_dgemmsup_rv_haswell_asm_2x4
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1979,31 +1979,31 @@ void bli_dgemmsup_rv_haswell_asm_2x4
 	prefetch(0, mem(rcx, rbp, 1, 1*8)) // prefetch c + 3*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
-	
-	
+
+
 #if 1
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -2012,8 +2012,8 @@ void bli_dgemmsup_rv_haswell_asm_2x4
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	
+
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2028,10 +2028,10 @@ void bli_dgemmsup_rv_haswell_asm_2x4
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
@@ -2044,7 +2044,7 @@ void bli_dgemmsup_rv_haswell_asm_2x4
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2060,32 +2060,32 @@ void bli_dgemmsup_rv_haswell_asm_2x4
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -2094,42 +2094,42 @@ void bli_dgemmsup_rv_haswell_asm_2x4
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), r14)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2138,22 +2138,22 @@ void bli_dgemmsup_rv_haswell_asm_2x4
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2179,24 +2179,24 @@ void bli_dgemmsup_rv_haswell_asm_2x4
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm6, mem(rcx, 0*32))
 	//add(rdi, rcx)
@@ -2207,7 +2207,7 @@ void bli_dgemmsup_rv_haswell_asm_2x4
 
 
 	label(.DCOLSTORBZ)
-	
+
 	                                   // begin I/O on columns 0-3
 	vunpcklpd(ymm6, ymm4, ymm0)
 	vunpckhpd(ymm6, ymm4, ymm1)
@@ -2220,13 +2220,13 @@ void bli_dgemmsup_rv_haswell_asm_2x4
 	vmovupd(xmm4, mem(rcx, rax, 1))
 
 	//lea(mem(rcx, rsi, 4), rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2269,8 +2269,8 @@ void bli_dgemmsup_rv_haswell_asm_1x4
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2291,9 +2291,9 @@ void bli_dgemmsup_rv_haswell_asm_1x4
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -2308,7 +2308,7 @@ void bli_dgemmsup_rv_haswell_asm_1x4
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -2341,27 +2341,27 @@ void bli_dgemmsup_rv_haswell_asm_1x4
 	prefetch(0, mem(rcx, rbp, 1, 0*8)) // prefetch c + 3*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
-	
-	
+
+
 #if 1
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
-	
-	
-	
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
-	
+
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
@@ -2372,8 +2372,8 @@ void bli_dgemmsup_rv_haswell_asm_1x4
 	vbroadcastsd(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
-	
-	
+
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2386,21 +2386,21 @@ void bli_dgemmsup_rv_haswell_asm_1x4
 	vbroadcastsd(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
-	
+
 
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
 	vbroadcastsd(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2414,27 +2414,27 @@ void bli_dgemmsup_rv_haswell_asm_1x4
 	vbroadcastsd(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
+
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
@@ -2446,41 +2446,41 @@ void bli_dgemmsup_rv_haswell_asm_1x4
 	vbroadcastsd(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), r14)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2488,17 +2488,17 @@ void bli_dgemmsup_rv_haswell_asm_1x4
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2520,15 +2520,15 @@ void bli_dgemmsup_rv_haswell_asm_1x4
 	vmovhpd(xmm0, mem(rcx, rsi, 1))
 	vmovlpd(xmm1, mem(rcx, rsi, 2))
 	vmovhpd(xmm1, mem(rcx, rax, 1))
-	
+
 	//lea(mem(rcx, rsi, 4), rcx)
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
 
 
@@ -2536,10 +2536,10 @@ void bli_dgemmsup_rv_haswell_asm_1x4
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	//add(rdi, rcx)
 
@@ -2549,7 +2549,7 @@ void bli_dgemmsup_rv_haswell_asm_1x4
 
 
 	label(.DCOLSTORBZ)
-	
+
 	                                   // begin I/O on columns 0-3
 	vmovupd(ymm4, ymm0)
 
@@ -2560,14 +2560,14 @@ void bli_dgemmsup_rv_haswell_asm_1x4
 	vmovhpd(xmm1, mem(rcx, rax, 1))
 
 	//lea(mem(rcx, rsi, 4), rcx)
-	
-	
-	
-	
-	
+
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c
index 9da1e7b83..2b7222a34 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c
@@ -40,20 +40,20 @@
 
 /*
    rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
+	 --------        ------        --------
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+	 --------        ------            :
 
    rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
+	 --------        | | | |       --------
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+	 --------        | | | |           :
 
    Assumptions:
    - B is row-stored;
@@ -69,12 +69,12 @@
    cost of the in-register transpose).
 
    crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+	 | | | | | | | |       ------            :
 */
 
 // Prototype reference microkernels.
@@ -93,8 +93,8 @@ void bli_dgemmsup_rv_haswell_asm_6x6
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -115,15 +115,15 @@ void bli_dgemmsup_rv_haswell_asm_6x6
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
 	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -180,18 +180,18 @@ void bli_dgemmsup_rv_haswell_asm_6x6
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
-	
-	
-	
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -208,14 +208,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -224,7 +224,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -241,14 +241,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -256,8 +256,8 @@ void bli_dgemmsup_rv_haswell_asm_6x6
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
@@ -274,14 +274,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -289,7 +289,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -307,14 +307,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -322,50 +322,50 @@ void bli_dgemmsup_rv_haswell_asm_6x6
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	vmovupd(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -373,22 +373,22 @@ void bli_dgemmsup_rv_haswell_asm_6x6
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(xmm0, xmm5, xmm5)
 	vmulpd(ymm0, ymm6, ymm6)
@@ -401,24 +401,24 @@ void bli_dgemmsup_rv_haswell_asm_6x6
 	vmulpd(xmm0, xmm13, xmm13)
 	vmulpd(ymm0, ymm14, ymm14)
 	vmulpd(xmm0, xmm15, xmm15)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -427,60 +427,60 @@ void bli_dgemmsup_rv_haswell_asm_6x6
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5)
 	vmovupd(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7)
 	vmovupd(xmm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9)
 	vmovupd(xmm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
 	vmovupd(ymm10, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11)
 	vmovupd(xmm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
 	vmovupd(ymm12, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm13)
 	vmovupd(xmm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14)
 	vmovupd(ymm14, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm15)
 	vmovupd(xmm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -555,51 +555,51 @@ void bli_dgemmsup_rv_haswell_asm_6x6
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm6, mem(rcx, 0*32))
 	vmovupd(xmm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm8, mem(rcx, 0*32))
 	vmovupd(xmm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm10, mem(rcx, 0*32))
 	vmovupd(xmm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm12, mem(rcx, 0*32))
 	vmovupd(xmm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm14, mem(rcx, 0*32))
 	vmovupd(xmm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -656,12 +656,12 @@ void bli_dgemmsup_rv_haswell_asm_6x6
 
 	//lea(mem(rdx, rsi, 4), rdx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -704,8 +704,8 @@ void bli_dgemmsup_rv_haswell_asm_5x6
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -726,15 +726,15 @@ void bli_dgemmsup_rv_haswell_asm_5x6
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
 	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -790,18 +790,18 @@ void bli_dgemmsup_rv_haswell_asm_5x6
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
-	
-	
-	
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -818,20 +818,20 @@ void bli_dgemmsup_rv_haswell_asm_5x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -848,20 +848,20 @@ void bli_dgemmsup_rv_haswell_asm_5x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
@@ -878,19 +878,19 @@ void bli_dgemmsup_rv_haswell_asm_5x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -908,82 +908,82 @@ void bli_dgemmsup_rv_haswell_asm_5x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	vmovupd(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(xmm0, xmm5, xmm5)
 	vmulpd(ymm0, ymm6, ymm6)
@@ -994,24 +994,24 @@ void bli_dgemmsup_rv_haswell_asm_5x6
 	vmulpd(xmm0, xmm11, xmm11)
 	vmulpd(ymm0, ymm12, ymm12)
 	vmulpd(xmm0, xmm13, xmm13)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -1020,52 +1020,52 @@ void bli_dgemmsup_rv_haswell_asm_5x6
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5)
 	vmovupd(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7)
 	vmovupd(xmm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9)
 	vmovupd(xmm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
 	vmovupd(ymm10, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11)
 	vmovupd(xmm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
 	vmovupd(ymm12, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm13)
 	vmovupd(xmm13, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -1138,46 +1138,46 @@ void bli_dgemmsup_rv_haswell_asm_5x6
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm6, mem(rcx, 0*32))
 	vmovupd(xmm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm8, mem(rcx, 0*32))
 	vmovupd(xmm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm10, mem(rcx, 0*32))
 	vmovupd(xmm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm12, mem(rcx, 0*32))
 	vmovupd(xmm13, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -1231,12 +1231,12 @@ void bli_dgemmsup_rv_haswell_asm_5x6
 
 	//lea(mem(rdx, rsi, 4), rdx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1279,8 +1279,8 @@ void bli_dgemmsup_rv_haswell_asm_4x6
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1301,15 +1301,15 @@ void bli_dgemmsup_rv_haswell_asm_4x6
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
 	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -1365,17 +1365,17 @@ void bli_dgemmsup_rv_haswell_asm_4x6
 #endif
 
 
-	
-	
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -1392,7 +1392,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1401,7 +1401,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1418,7 +1418,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1426,8 +1426,8 @@ void bli_dgemmsup_rv_haswell_asm_4x6
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
@@ -1444,7 +1444,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1452,7 +1452,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1470,7 +1470,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1478,43 +1478,43 @@ void bli_dgemmsup_rv_haswell_asm_4x6
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	vmovupd(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1522,22 +1522,22 @@ void bli_dgemmsup_rv_haswell_asm_4x6
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(xmm0, xmm5, xmm5)
 	vmulpd(ymm0, ymm6, ymm6)
@@ -1546,24 +1546,24 @@ void bli_dgemmsup_rv_haswell_asm_4x6
 	vmulpd(xmm0, xmm9, xmm9)
 	vmulpd(ymm0, ymm10, ymm10)
 	vmulpd(xmm0, xmm11, xmm11)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -1572,44 +1572,44 @@ void bli_dgemmsup_rv_haswell_asm_4x6
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5)
 	vmovupd(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7)
 	vmovupd(xmm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9)
 	vmovupd(xmm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
 	vmovupd(ymm10, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11)
 	vmovupd(xmm11, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -1658,41 +1658,41 @@ void bli_dgemmsup_rv_haswell_asm_4x6
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm6, mem(rcx, 0*32))
 	vmovupd(xmm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm8, mem(rcx, 0*32))
 	vmovupd(xmm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm10, mem(rcx, 0*32))
 	vmovupd(xmm11, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -1729,9 +1729,9 @@ void bli_dgemmsup_rv_haswell_asm_4x6
 
 	//lea(mem(rcx, rsi, 4), rcx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
 
 
@@ -1777,8 +1777,8 @@ void bli_dgemmsup_rv_haswell_asm_3x6
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1799,9 +1799,9 @@ void bli_dgemmsup_rv_haswell_asm_3x6
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1816,7 +1816,7 @@ void bli_dgemmsup_rv_haswell_asm_3x6
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1854,31 +1854,31 @@ void bli_dgemmsup_rv_haswell_asm_3x6
 	prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 5*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
-	
-	
+
+
 #if 1
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
 
-	
-	
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	vmovupd(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1889,13 +1889,13 @@ void bli_dgemmsup_rv_haswell_asm_3x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-	
+
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1912,15 +1912,15 @@ void bli_dgemmsup_rv_haswell_asm_3x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
-	
+
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
@@ -1935,12 +1935,12 @@ void bli_dgemmsup_rv_haswell_asm_3x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1958,37 +1958,37 @@ void bli_dgemmsup_rv_haswell_asm_3x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	vmovupd(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1999,65 +1999,65 @@ void bli_dgemmsup_rv_haswell_asm_3x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(xmm0, xmm5, xmm5)
 	vmulpd(ymm0, ymm6, ymm6)
 	vmulpd(xmm0, xmm7, xmm7)
 	vmulpd(ymm0, ymm8, ymm8)
 	vmulpd(xmm0, xmm9, xmm9)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
@@ -2080,10 +2080,10 @@ void bli_dgemmsup_rv_haswell_asm_3x6
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9)
 	vmovupd(xmm9, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
-	
+
 
 
 	label(.DCOLSTORED)
@@ -2124,7 +2124,7 @@ void bli_dgemmsup_rv_haswell_asm_3x6
 	vmovsd(xmm13, mem(rdx, rsi, 1))
 	vmovsd(xmm14, mem(rdx, rsi, 2))
 	vmovsd(xmm15, mem(rdx, rax, 1))
-	
+
 	lea(mem(rdx, rsi, 4), rdx)
 
 	                                   // begin I/O on columns 4-5
@@ -2155,26 +2155,26 @@ void bli_dgemmsup_rv_haswell_asm_3x6
 	vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)
 	vmovsd(xmm12, mem(rdx        ))
 	vmovsd(xmm13, mem(rdx, rsi, 1))
-	
+
 	//lea(mem(rdx, rsi, 4), rdx)
 
 
 	jmp(.DDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
@@ -2186,8 +2186,8 @@ void bli_dgemmsup_rv_haswell_asm_3x6
 	vmovupd(ymm8, mem(rcx, 0*32))
 	vmovupd(xmm9, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2244,12 +2244,12 @@ void bli_dgemmsup_rv_haswell_asm_3x6
 
 	//lea(mem(rdx, rsi, 4), rdx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2292,8 +2292,8 @@ void bli_dgemmsup_rv_haswell_asm_2x6
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2314,15 +2314,15 @@ void bli_dgemmsup_rv_haswell_asm_2x6
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
 	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
+
 	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -2376,17 +2376,17 @@ void bli_dgemmsup_rv_haswell_asm_2x6
 #endif
 
 
-	
-	
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -2405,7 +2405,7 @@ void bli_dgemmsup_rv_haswell_asm_2x6
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2423,8 +2423,8 @@ void bli_dgemmsup_rv_haswell_asm_2x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
@@ -2442,7 +2442,7 @@ void bli_dgemmsup_rv_haswell_asm_2x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2461,36 +2461,36 @@ void bli_dgemmsup_rv_haswell_asm_2x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	vmovupd(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -2498,44 +2498,44 @@ void bli_dgemmsup_rv_haswell_asm_2x6
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(xmm0, xmm5, xmm5)
 	vmulpd(ymm0, ymm6, ymm6)
 	vmulpd(xmm0, xmm7, xmm7)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2544,28 +2544,28 @@ void bli_dgemmsup_rv_haswell_asm_2x6
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5)
 	vmovupd(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7)
 	vmovupd(xmm7, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2614,31 +2614,31 @@ void bli_dgemmsup_rv_haswell_asm_2x6
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm6, mem(rcx, 0*32))
 	vmovupd(xmm7, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2675,9 +2675,9 @@ void bli_dgemmsup_rv_haswell_asm_2x6
 
 	//lea(mem(rcx, rsi, 4), rcx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
 
 
@@ -2723,8 +2723,8 @@ void bli_dgemmsup_rv_haswell_asm_1x6
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2745,15 +2745,15 @@ void bli_dgemmsup_rv_haswell_asm_1x6
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
 	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
+
 	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -2806,17 +2806,17 @@ void bli_dgemmsup_rv_haswell_asm_1x6
 #endif
 
 
-	
-	
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -2832,7 +2832,7 @@ void bli_dgemmsup_rv_haswell_asm_1x6
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2847,8 +2847,8 @@ void bli_dgemmsup_rv_haswell_asm_1x6
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
@@ -2863,7 +2863,7 @@ void bli_dgemmsup_rv_haswell_asm_1x6
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2879,76 +2879,76 @@ void bli_dgemmsup_rv_haswell_asm_1x6
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	vmovupd(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(xmm0, xmm5, xmm5)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2957,20 +2957,20 @@ void bli_dgemmsup_rv_haswell_asm_1x6
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5)
 	vmovupd(xmm5, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -3007,26 +3007,26 @@ void bli_dgemmsup_rv_haswell_asm_1x6
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(xmm5, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -3052,9 +3052,9 @@ void bli_dgemmsup_rv_haswell_asm_1x6
 
 	//lea(mem(rcx, rsi, 4), rcx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
 
 
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c
index a6c8f0e43..b3a7c17ca 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c
@@ -40,20 +40,20 @@
 
 /*
    rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
+	 --------        ------        --------
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+	 --------        ------            :
 
    rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
+	 --------        | | | |       --------
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+	 --------        | | | |           :
 
    Assumptions:
    - B is row-stored;
@@ -69,12 +69,12 @@
    cost of the in-register transpose).
 
    crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+	 | | | | | | | |       ------            :
 */
 
 // Prototype reference microkernels.
@@ -108,8 +108,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 #if 0
@@ -178,7 +178,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 						// Advance C and A pointers by the mrs and nrs we just
 						// used, and decrement m_left.
 						cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-					} 
+					}
 				}
 
 				// Advance C and B pointers by the mrs and nrs we just used, and
@@ -208,9 +208,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -225,7 +225,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -275,25 +275,25 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	vmovupd(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -304,14 +304,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -320,7 +320,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -337,14 +337,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -352,14 +352,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	vmovupd(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -370,14 +370,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -385,7 +385,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -403,14 +403,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -418,50 +418,50 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	vmovupd(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -469,22 +469,22 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 	vfmadd231pd(ymm1, ymm2, ymm13)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
@@ -497,24 +497,24 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 	vmulpd(ymm0, ymm13, ymm13)
 	vmulpd(ymm0, ymm14, ymm14)
 	vmulpd(ymm0, ymm15, ymm15)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -523,60 +523,60 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7)
 	vmovupd(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9)
 	vmovupd(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
 	vmovupd(ymm10, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11)
 	vmovupd(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
 	vmovupd(ymm12, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13)
 	vmovupd(ymm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14)
 	vmovupd(ymm14, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm15)
 	vmovupd(ymm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -663,51 +663,51 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm6, mem(rcx, 0*32))
 	vmovupd(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm8, mem(rcx, 0*32))
 	vmovupd(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm10, mem(rcx, 0*32))
 	vmovupd(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm12, mem(rcx, 0*32))
 	vmovupd(ymm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm14, mem(rcx, 0*32))
 	vmovupd(ymm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -772,12 +772,12 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 
 	//lea(mem(rdx, rsi, 4), rdx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -820,8 +820,8 @@ void bli_dgemmsup_rv_haswell_asm_5x8
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -842,15 +842,15 @@ void bli_dgemmsup_rv_haswell_asm_5x8
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
 	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
 	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
+
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
 
@@ -909,18 +909,18 @@ void bli_dgemmsup_rv_haswell_asm_5x8
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
-	
-	
-	
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -937,20 +937,20 @@ void bli_dgemmsup_rv_haswell_asm_5x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -967,26 +967,26 @@ void bli_dgemmsup_rv_haswell_asm_5x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	vmovupd(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -997,19 +997,19 @@ void bli_dgemmsup_rv_haswell_asm_5x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1027,37 +1027,37 @@ void bli_dgemmsup_rv_haswell_asm_5x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
@@ -1068,41 +1068,41 @@ void bli_dgemmsup_rv_haswell_asm_5x8
 	vmovupd(mem(rbx, 0*32), ymm0)
 	vmovupd(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
@@ -1113,24 +1113,24 @@ void bli_dgemmsup_rv_haswell_asm_5x8
 	vmulpd(ymm0, ymm11, ymm11)
 	vmulpd(ymm0, ymm12, ymm12)
 	vmulpd(ymm0, ymm13, ymm13)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -1139,52 +1139,52 @@ void bli_dgemmsup_rv_haswell_asm_5x8
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7)
 	vmovupd(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9)
 	vmovupd(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
 	vmovupd(ymm10, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11)
 	vmovupd(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
 	vmovupd(ymm12, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13)
 	vmovupd(ymm13, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -1269,46 +1269,46 @@ void bli_dgemmsup_rv_haswell_asm_5x8
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm6, mem(rcx, 0*32))
 	vmovupd(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm8, mem(rcx, 0*32))
 	vmovupd(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm10, mem(rcx, 0*32))
 	vmovupd(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm12, mem(rcx, 0*32))
 	vmovupd(ymm13, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -1367,9 +1367,9 @@ void bli_dgemmsup_rv_haswell_asm_5x8
 
 	//lea(mem(rdx, rsi, 4), rdx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
 
 
@@ -1415,8 +1415,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1437,9 +1437,9 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1454,7 +1454,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1496,31 +1496,31 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 7*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
-	
+
 
 #if 1
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
-	
-	
-	
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	vmovupd(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1531,7 +1531,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1539,8 +1539,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	
+
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1557,7 +1557,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1565,10 +1565,10 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
@@ -1583,7 +1583,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1591,7 +1591,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1609,7 +1609,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1617,27 +1617,27 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
+
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
@@ -1653,7 +1653,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1661,22 +1661,22 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	vfmadd231pd(ymm1, ymm2, ymm9)
 	vfmadd231pd(ymm0, ymm3, ymm10)
 	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
@@ -1685,38 +1685,38 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	vmulpd(ymm0, ymm9, ymm9)
 	vmulpd(ymm0, ymm10, ymm10)
 	vmulpd(ymm0, ymm11, ymm11)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
@@ -1747,10 +1747,10 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11)
 	vmovupd(ymm11, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
-	
+
 
 
 	label(.DCOLSTORED)
@@ -1805,19 +1805,19 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	jmp(.DDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
@@ -1833,8 +1833,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 	vmovupd(ymm10, mem(rcx, 0*32))
 	vmovupd(ymm11, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -1875,12 +1875,12 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 
 	//lea(mem(rcx, rsi, 4), rcx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1923,8 +1923,8 @@ void bli_dgemmsup_rv_haswell_asm_3x8
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1945,9 +1945,9 @@ void bli_dgemmsup_rv_haswell_asm_3x8
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1962,7 +1962,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -2003,27 +2003,27 @@ void bli_dgemmsup_rv_haswell_asm_3x8
 	prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 7*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
-	
-	
+
+
 #if 1
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
 
-	
-	
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
-	
+
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
@@ -2038,13 +2038,13 @@ void bli_dgemmsup_rv_haswell_asm_3x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-	
+
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2061,15 +2061,15 @@ void bli_dgemmsup_rv_haswell_asm_3x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
-	
+
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
@@ -2084,12 +2084,12 @@ void bli_dgemmsup_rv_haswell_asm_3x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2107,32 +2107,32 @@ void bli_dgemmsup_rv_haswell_asm_3x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
+
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
@@ -2148,65 +2148,65 @@ void bli_dgemmsup_rv_haswell_asm_3x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
 	vmulpd(ymm0, ymm7, ymm7)
 	vmulpd(ymm0, ymm8, ymm8)
 	vmulpd(ymm0, ymm9, ymm9)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
@@ -2229,10 +2229,10 @@ void bli_dgemmsup_rv_haswell_asm_3x8
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9)
 	vmovupd(ymm9, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
-	
+
 
 
 	label(.DCOLSTORED)
@@ -2273,7 +2273,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8
 	vmovsd(xmm13, mem(rdx, rsi, 1))
 	vmovsd(xmm14, mem(rdx, rsi, 2))
 	vmovsd(xmm15, mem(rdx, rax, 1))
-	
+
 	lea(mem(rdx, rsi, 4), rdx)
 
 	                                   // begin I/O on columns 4-7
@@ -2312,26 +2312,26 @@ void bli_dgemmsup_rv_haswell_asm_3x8
 	vmovsd(xmm13, mem(rdx, rsi, 1))
 	vmovsd(xmm14, mem(rdx, rsi, 2))
 	vmovsd(xmm15, mem(rdx, rax, 1))
-	
+
 	//lea(mem(rdx, rsi, 4), rdx)
 
 
 	jmp(.DDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.DBETAZERO)
-	
+
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
@@ -2343,8 +2343,8 @@ void bli_dgemmsup_rv_haswell_asm_3x8
 	vmovupd(ymm8, mem(rcx, 0*32))
 	vmovupd(ymm9, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2409,12 +2409,12 @@ void bli_dgemmsup_rv_haswell_asm_3x8
 
 	//lea(mem(rdx, rsi, 4), rdx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2457,8 +2457,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2479,9 +2479,9 @@ void bli_dgemmsup_rv_haswell_asm_2x8
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -2496,7 +2496,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -2536,27 +2536,27 @@ void bli_dgemmsup_rv_haswell_asm_2x8
 	prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 7*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
-	
-	
+
+
 #if 1
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
 
-	
-	
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
-	
+
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
@@ -2572,8 +2572,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	
+
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2591,10 +2591,10 @@ void bli_dgemmsup_rv_haswell_asm_2x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
@@ -2610,7 +2610,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2629,27 +2629,27 @@ void bli_dgemmsup_rv_haswell_asm_2x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
+
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
@@ -2666,44 +2666,44 @@ void bli_dgemmsup_rv_haswell_asm_2x8
 	vfmadd231pd(ymm1, ymm2, ymm5)
 	vfmadd231pd(ymm0, ymm3, ymm6)
 	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
 	vmulpd(ymm0, ymm7, ymm7)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2712,12 +2712,12 @@ void bli_dgemmsup_rv_haswell_asm_2x8
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
@@ -2732,8 +2732,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7)
 	vmovupd(ymm7, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2778,19 +2778,19 @@ void bli_dgemmsup_rv_haswell_asm_2x8
 	jmp(.DDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.DBETAZERO)
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
@@ -2798,8 +2798,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8
 	vmovupd(ymm6, mem(rcx, 0*32))
 	vmovupd(ymm7, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -2832,12 +2832,12 @@ void bli_dgemmsup_rv_haswell_asm_2x8
 
 	//lea(mem(rcx, rsi, 4), rcx)
 
-	
-	
-	
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2880,8 +2880,8 @@ void bli_dgemmsup_rv_haswell_asm_1x8
        double*    restrict b, inc_t rs_b0, inc_t cs_b0,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2902,9 +2902,9 @@ void bli_dgemmsup_rv_haswell_asm_1x8
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -2919,7 +2919,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -2958,27 +2958,27 @@ void bli_dgemmsup_rv_haswell_asm_1x8
 	prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 7*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
-	
+
 
 #if 1
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
-	
-	
-	
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
-	
+
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
@@ -2991,8 +2991,8 @@ void bli_dgemmsup_rv_haswell_asm_1x8
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-	
+
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -3002,15 +3002,15 @@ void bli_dgemmsup_rv_haswell_asm_1x8
 	vmovupd(mem(rbx, 0*32), ymm0)
 	vmovupd(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
-	
+
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
@@ -3023,7 +3023,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -3039,27 +3039,27 @@ void bli_dgemmsup_rv_haswell_asm_1x8
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
+
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
@@ -3068,18 +3068,18 @@ void bli_dgemmsup_rv_haswell_asm_1x8
 	vmovupd(mem(rbx, 0*32), ymm0)
 	vmovupd(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastsd(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
@@ -3088,27 +3088,27 @@ void bli_dgemmsup_rv_haswell_asm_1x8
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -3116,20 +3116,20 @@ void bli_dgemmsup_rv_haswell_asm_1x8
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 
 	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -3150,7 +3150,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8
 	vmovhpd(xmm0, mem(rcx, rsi, 1))
 	vmovlpd(xmm1, mem(rcx, rsi, 2))
 	vmovhpd(xmm1, mem(rcx, rax, 1))
-	
+
 	lea(mem(rcx, rsi, 4), rcx)
 
 	                                   // begin I/O on columns 4-7
@@ -3173,26 +3173,26 @@ void bli_dgemmsup_rv_haswell_asm_1x8
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	vmovupd(ymm5, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -3220,14 +3220,14 @@ void bli_dgemmsup_rv_haswell_asm_1x8
 	vmovhpd(xmm1, mem(rcx, rax, 1))
 
 	//lea(mem(rcx, rsi, 4), rcx)
-	
 
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c
index dad5458b9..98b557fae 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c
@@ -40,20 +40,20 @@
 
 /*
    rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
+	 --------        ------        --------
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+	 --------        ------            :
 
    rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
+	 --------        | | | |       --------
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+	 --------        | | | |           :
 
    Assumptions:
    - B is row-stored;
@@ -69,12 +69,12 @@
    cost of the in-register transpose).
 
    crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+	 | | | | | | | |       ------            :
 */
 
 // Prototype reference microkernels.
@@ -103,8 +103,8 @@ void PASTEMAC(ch,opname) \
        ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx \
+       auxinfo_t*          data, \
+       cntx_t*             cntx \
      ) \
 { \
 	for ( dim_t i = 0; i < mdim; ++i ) \
@@ -175,8 +175,8 @@ void PASTEMAC(ch,opname) \
        ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx \
+       auxinfo_t*          data, \
+       cntx_t*             cntx \
      ) \
 { \
 	for ( dim_t i = 0; i < m; ++i ) \
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c
index 1eb8d926c..c17b0b275 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c
@@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x1
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -101,7 +101,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -119,7 +119,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1
 
 	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), rcx)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -164,18 +164,18 @@ void bli_sgemmsup_rd_haswell_asm_6x1
 	prefetch(0, mem(r10, rdi, 2, 0*4)) // prefetch c + 5*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -206,7 +206,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rbx        ), ymm0)
@@ -233,7 +233,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -287,27 +287,27 @@ void bli_sgemmsup_rd_haswell_asm_6x1
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm14)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -336,21 +336,21 @@ void bli_sgemmsup_rd_haswell_asm_6x1
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -358,7 +358,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rbx        ), xmm0)
 	add(imm(1*4), rbx)                 // b += 8*rs_b = 8*4;
 
@@ -381,12 +381,12 @@ void bli_sgemmsup_rd_haswell_asm_6x1
 	add(imm(1*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
@@ -399,7 +399,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1
 	                                   // ymm10
 	                                   // ymm12
 	                                   // ymm14
-	
+
 	vhaddps( ymm5, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -443,8 +443,8 @@ void bli_sgemmsup_rd_haswell_asm_6x1
 	                                   // xmm12[0] = sum(ymm12)
 	                                   // xmm14[0] = sum(ymm14)
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -452,109 +452,109 @@ void bli_sgemmsup_rd_haswell_asm_6x1
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
 	vmulps(xmm0, xmm8, xmm8)
 	vmulps(xmm0, xmm10, xmm10)
 	vmulps(xmm0, xmm12, xmm12)
 	vmulps(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vmovss(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm4)
 	vmovss(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovss(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm6)
 	vmovss(xmm6, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovss(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm8)
 	vmovss(xmm8, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovss(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm10)
 	vmovss(xmm10, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovss(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm12)
 	vmovss(xmm12, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovss(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm14)
 	vmovss(xmm14, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovss(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovss(xmm6, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovss(xmm8, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovss(xmm10, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovss(xmm12, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovss(xmm14, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
-	
+
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -598,8 +598,8 @@ void bli_sgemmsup_rd_haswell_asm_3x1
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -627,7 +627,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -645,7 +645,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1
 
 	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), rcx)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -684,18 +684,18 @@ void bli_sgemmsup_rd_haswell_asm_3x1
 	prefetch(0, mem(rcx, rdi, 2, 0*4)) // prefetch c + 2*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -717,7 +717,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm8)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rbx        ), ymm0)
@@ -735,7 +735,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -771,27 +771,27 @@ void bli_sgemmsup_rd_haswell_asm_3x1
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm8)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -811,21 +811,21 @@ void bli_sgemmsup_rd_haswell_asm_3x1
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm8)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -833,7 +833,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rbx        ), xmm0)
 	add(imm(1*4), rbx)                 // b += 8*rs_b = 8*4;
 
@@ -847,12 +847,12 @@ void bli_sgemmsup_rd_haswell_asm_3x1
 	add(imm(1*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm8)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
@@ -862,7 +862,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1
 	                                   // ymm4
 	                                   // ymm6
 	                                   // ymm8
-	
+
 	vhaddps( ymm5, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -885,8 +885,8 @@ void bli_sgemmsup_rd_haswell_asm_3x1
 	                                   // xmm6[0]  = sum(ymm6)
 	                                   // xmm8[0]  = sum(ymm8)
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -894,82 +894,82 @@ void bli_sgemmsup_rd_haswell_asm_3x1
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
 	vmulps(xmm0, xmm8, xmm8)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vmovss(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm4)
 	vmovss(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovss(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm6)
 	vmovss(xmm6, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovss(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm8)
 	vmovss(xmm8, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovss(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovss(xmm6, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovss(xmm8, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
-	
+
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1013,8 +1013,8 @@ void bli_sgemmsup_rd_haswell_asm_2x1
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1042,7 +1042,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1060,7 +1060,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1
 
 	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), rcx)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -1097,18 +1097,18 @@ void bli_sgemmsup_rd_haswell_asm_2x1
 	prefetch(0, mem(rcx, rdi, 1, 0*4)) // prefetch c + 1*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1127,7 +1127,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm6)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rbx        ), ymm0)
@@ -1142,7 +1142,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1172,27 +1172,27 @@ void bli_sgemmsup_rd_haswell_asm_2x1
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm6)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1209,21 +1209,21 @@ void bli_sgemmsup_rd_haswell_asm_2x1
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm6)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1231,7 +1231,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rbx        ), xmm0)
 	add(imm(1*4), rbx)                 // b += 8*rs_b = 8*4;
 
@@ -1242,12 +1242,12 @@ void bli_sgemmsup_rd_haswell_asm_2x1
 	add(imm(1*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm6)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
@@ -1256,7 +1256,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1
 
 	                                   // ymm4
 	                                   // ymm6
-	
+
 	vhaddps( ymm5, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1272,8 +1272,8 @@ void bli_sgemmsup_rd_haswell_asm_2x1
 	                                   // xmm4[0]  = sum(ymm4)
 	                                   // xmm6[0]  = sum(ymm6)
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -1281,73 +1281,73 @@ void bli_sgemmsup_rd_haswell_asm_2x1
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vmovss(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm4)
 	vmovss(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovss(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm6)
 	vmovss(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovss(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovss(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
-	
+
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1391,8 +1391,8 @@ void bli_sgemmsup_rd_haswell_asm_1x1
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1420,7 +1420,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1438,7 +1438,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1
 
 	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), rcx)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -1473,18 +1473,18 @@ void bli_sgemmsup_rd_haswell_asm_1x1
 	prefetch(0, mem(rcx,         0*4)) // prefetch c + 0*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1500,7 +1500,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm4)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rbx        ), ymm0)
@@ -1512,7 +1512,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1536,27 +1536,27 @@ void bli_sgemmsup_rd_haswell_asm_1x1
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm4)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1570,21 +1570,21 @@ void bli_sgemmsup_rd_haswell_asm_1x1
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm4)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1592,7 +1592,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rbx        ), xmm0)
 	add(imm(1*4), rbx)                 // b += 8*rs_b = 8*4;
 
@@ -1600,12 +1600,12 @@ void bli_sgemmsup_rd_haswell_asm_1x1
 	add(imm(1*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm4)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
@@ -1614,7 +1614,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1
 
 	                                   // ymm4
 	                                   // ymm6
-	
+
 	vhaddps( ymm5, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1624,8 +1624,8 @@ void bli_sgemmsup_rd_haswell_asm_1x1
 	                                   // xmm4[0]  = sum(ymm4)
 	                                   // xmm6[0]  = sum(ymm6)
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -1633,65 +1633,65 @@ void bli_sgemmsup_rd_haswell_asm_1x1
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vmovss(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm4)
 	vmovss(xmm4, mem(rcx))
 	add(rdi, rcx)
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovss(xmm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
-	
+
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c
index 1d3d88309..5fb91e634 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c
@@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x12
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -101,7 +101,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -119,7 +119,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	//mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -194,18 +194,18 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 #endif
 	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -240,7 +240,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rax       ), ymm0)
@@ -271,7 +271,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -333,27 +333,27 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -364,7 +364,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 	vmovups(mem(rax, r8, 1), ymm1)
 	vmovups(mem(rax, r8, 2), ymm2)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -386,21 +386,21 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -408,12 +408,12 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	vmovss(mem(rax, r8, 1), xmm1)
 	vmovss(mem(rax, r8, 2), xmm2)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -435,22 +435,22 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -466,7 +466,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 	vshufps(imm(0x44), xmm2, xmm0, xmm4)
 	                                   // xmm4[0] = sum(ymm4);  xmm4[1] = sum(ymm7)
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
-	
+
 	vhaddps( ymm8, ymm5, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -482,7 +482,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 	vshufps(imm(0x44), xmm2, xmm0, xmm5)
 	                                   // xmm5[0] = sum(ymm5);  xmm5[1] = sum(ymm8)
 	                                   // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14)
-	
+
 	vhaddps( ymm9, ymm6, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -500,8 +500,8 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 	                                   // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15)
 
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -509,73 +509,73 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(xmm0, xmm6, xmm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	lea(mem(r12, rdi, 2), r12)         //
@@ -599,7 +599,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -644,8 +644,8 @@ void bli_sgemmsup_rd_haswell_asm_2x12
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -670,7 +670,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -688,7 +688,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -742,18 +742,18 @@ void bli_sgemmsup_rd_haswell_asm_2x12
 	prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -782,7 +782,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rax       ), ymm0)
@@ -808,7 +808,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -859,27 +859,27 @@ void bli_sgemmsup_rd_haswell_asm_2x12
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -888,7 +888,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12
 	vmovups(mem(rax       ), ymm0)
 	vmovups(mem(rax, r8, 1), ymm1)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -906,21 +906,21 @@ void bli_sgemmsup_rd_haswell_asm_2x12
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -928,11 +928,11 @@ void bli_sgemmsup_rd_haswell_asm_2x12
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	vmovss(mem(rax, r8, 1), xmm1)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -950,21 +950,21 @@ void bli_sgemmsup_rd_haswell_asm_2x12
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -980,7 +980,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12
 	vshufps(imm(0x44), xmm2, xmm0, xmm4)
 	                                   // xmm4[0] = sum(ymm4);  xmm4[1] = sum(ymm7)
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
-	
+
 	vhaddps( ymm8, ymm5, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -998,8 +998,8 @@ void bli_sgemmsup_rd_haswell_asm_2x12
 	                                   // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14)
 
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -1007,65 +1007,65 @@ void bli_sgemmsup_rd_haswell_asm_2x12
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	add(imm(4), r15)                   // jj += 4;
@@ -1077,7 +1077,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1121,8 +1121,8 @@ void bli_sgemmsup_rd_haswell_asm_1x12
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1147,7 +1147,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1165,7 +1165,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -1214,18 +1214,18 @@ void bli_sgemmsup_rd_haswell_asm_1x12
 	prefetch(0, mem(rcx,         1*4)) // prefetch c + 0*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1248,7 +1248,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12
 	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rax       ), ymm0)
@@ -1269,7 +1269,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 #endif
@@ -1309,34 +1309,34 @@ void bli_sgemmsup_rd_haswell_asm_1x12
 	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 #endif
 
 	vmovups(mem(rax       ), ymm0)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 
@@ -1350,21 +1350,21 @@ void bli_sgemmsup_rd_haswell_asm_1x12
 	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1372,10 +1372,10 @@ void bli_sgemmsup_rd_haswell_asm_1x12
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 
@@ -1389,21 +1389,21 @@ void bli_sgemmsup_rd_haswell_asm_1x12
 	add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1421,8 +1421,8 @@ void bli_sgemmsup_rd_haswell_asm_1x12
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
 
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -1430,57 +1430,57 @@ void bli_sgemmsup_rd_haswell_asm_1x12
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	add(imm(4), r15)                   // jj += 4;
@@ -1492,7 +1492,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c
index bbb75a6fc..1398c3da7 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c
@@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x16
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	uint64_t n_left = n0 % 16;
@@ -176,7 +176,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -194,7 +194,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	//mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -269,18 +269,18 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 #endif
 	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -315,7 +315,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rax       ), ymm0)
@@ -346,7 +346,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -408,27 +408,27 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -439,7 +439,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 	vmovups(mem(rax, r8, 1), ymm1)
 	vmovups(mem(rax, r8, 2), ymm2)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -461,21 +461,21 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -483,12 +483,12 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	vmovss(mem(rax, r8, 1), xmm1)
 	vmovss(mem(rax, r8, 2), xmm2)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -510,22 +510,22 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -541,7 +541,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 	vshufps(imm(0x44), xmm2, xmm0, xmm4)
 	                                   // xmm4[0] = sum(ymm4);  xmm4[1] = sum(ymm7)
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
-	
+
 	vhaddps( ymm8, ymm5, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -557,7 +557,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 	vshufps(imm(0x44), xmm2, xmm0, xmm5)
 	                                   // xmm5[0] = sum(ymm5);  xmm5[1] = sum(ymm8)
 	                                   // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14)
-	
+
 	vhaddps( ymm9, ymm6, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -575,8 +575,8 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 	                                   // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15)
 
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -584,73 +584,73 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(xmm0, xmm6, xmm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	lea(mem(r12, rdi, 2), r12)         //
@@ -674,7 +674,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -758,8 +758,8 @@ void bli_sgemmsup_rd_haswell_asm_2x16
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -784,7 +784,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -802,7 +802,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -856,18 +856,18 @@ void bli_sgemmsup_rd_haswell_asm_2x16
 	prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -897,7 +897,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rax       ), ymm0)
@@ -923,7 +923,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -975,27 +975,27 @@ void bli_sgemmsup_rd_haswell_asm_2x16
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1005,7 +1005,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16
 	vmovups(mem(rax       ), ymm0)
 	vmovups(mem(rax, r8, 1), ymm1)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -1023,21 +1023,21 @@ void bli_sgemmsup_rd_haswell_asm_2x16
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1045,11 +1045,11 @@ void bli_sgemmsup_rd_haswell_asm_2x16
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	vmovss(mem(rax, r8, 1), xmm1)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -1067,21 +1067,21 @@ void bli_sgemmsup_rd_haswell_asm_2x16
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1097,7 +1097,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16
 	vshufps(imm(0x44), xmm2, xmm0, xmm4)
 	                                   // xmm4[0] = sum(ymm4);  xmm4[1] = sum(ymm7)
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
-	
+
 	vhaddps( ymm8, ymm5, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1115,8 +1115,8 @@ void bli_sgemmsup_rd_haswell_asm_2x16
 	                                   // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14)
 
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -1124,65 +1124,65 @@ void bli_sgemmsup_rd_haswell_asm_2x16
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	add(imm(4), r15)                   // jj += 4;
@@ -1194,7 +1194,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1238,8 +1238,8 @@ void bli_sgemmsup_rd_haswell_asm_1x16
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1264,7 +1264,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	//mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1282,7 +1282,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	//mov(var(rs_c), rdi)                // load rs_c
@@ -1331,18 +1331,18 @@ void bli_sgemmsup_rd_haswell_asm_1x16
 	prefetch(0, mem(rcx,         1*4)) // prefetch c + 0*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1367,7 +1367,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16
 	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rax       ), ymm0)
@@ -1388,7 +1388,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1430,27 +1430,27 @@ void bli_sgemmsup_rd_haswell_asm_1x16
 	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1459,7 +1459,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16
 
 	vmovups(mem(rax       ), ymm0)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 
@@ -1473,21 +1473,21 @@ void bli_sgemmsup_rd_haswell_asm_1x16
 	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1495,10 +1495,10 @@ void bli_sgemmsup_rd_haswell_asm_1x16
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 
@@ -1512,20 +1512,20 @@ void bli_sgemmsup_rd_haswell_asm_1x16
 	add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	
+	                                   // ymm4  ymm7  ymm10 ymm13
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1543,8 +1543,8 @@ void bli_sgemmsup_rd_haswell_asm_1x16
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
 
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -1552,57 +1552,57 @@ void bli_sgemmsup_rd_haswell_asm_1x16
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	add(imm(4), r15)                   // jj += 4;
@@ -1614,7 +1614,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c
index 1e3240350..75c687267 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c
@@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x2
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -101,7 +101,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -119,7 +119,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2
 
 	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), rcx)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -170,18 +170,18 @@ void bli_sgemmsup_rd_haswell_asm_6x2
 	prefetch(0, mem(r10, rdi, 2, 1*4)) // prefetch c + 5*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -219,7 +219,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rbx        ), ymm0)
@@ -253,7 +253,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -321,27 +321,27 @@ void bli_sgemmsup_rd_haswell_asm_6x2
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -377,21 +377,21 @@ void bli_sgemmsup_rd_haswell_asm_6x2
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -399,7 +399,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rbx        ), xmm0)
 	vmovss(mem(rbx, r11, 1), xmm1)
 	add(imm(1*4), rbx)                 // b += 8*rs_b = 8*4;
@@ -429,12 +429,12 @@ void bli_sgemmsup_rd_haswell_asm_6x2
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
@@ -447,7 +447,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2
 	                                   // ymm10 ymm11
 	                                   // ymm12 ymm13
 	                                   // ymm14 ymm15
-	
+
 	vhaddps( ymm5, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -491,8 +491,8 @@ void bli_sgemmsup_rd_haswell_asm_6x2
 	                                   // xmm12[0:1] = sum(ymm12) sum(ymm13)
 	                                   // xmm14[0:1] = sum(ymm14) sum(ymm15)
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -500,109 +500,109 @@ void bli_sgemmsup_rd_haswell_asm_6x2
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
 	vmulps(xmm0, xmm8, xmm8)
 	vmulps(xmm0, xmm10, xmm10)
 	vmulps(xmm0, xmm12, xmm12)
 	vmulps(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm6)
 	vmovsd(xmm6, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm8)
 	vmovsd(xmm8, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm10)
 	vmovsd(xmm10, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm12)
 	vmovsd(xmm12, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm14)
 	vmovsd(xmm14, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovsd(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(xmm6, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(xmm8, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(xmm10, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(xmm12, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(xmm14, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
-	
+
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -646,8 +646,8 @@ void bli_sgemmsup_rd_haswell_asm_3x2
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -675,7 +675,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -693,7 +693,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2
 
 	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), rcx)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -735,18 +735,18 @@ void bli_sgemmsup_rd_haswell_asm_3x2
 	prefetch(0, mem(rcx, rdi, 2, 1*4)) // prefetch c + 2*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -772,7 +772,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2
 	vfmadd231ps(ymm0, ymm3, ymm8)
 	vfmadd231ps(ymm1, ymm3, ymm9)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rbx        ), ymm0)
@@ -794,7 +794,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -837,27 +837,27 @@ void bli_sgemmsup_rd_haswell_asm_3x2
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm8)
 	vfmadd231ps(ymm1, ymm3, ymm9)
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -881,21 +881,21 @@ void bli_sgemmsup_rd_haswell_asm_3x2
 	vfmadd231ps(ymm0, ymm3, ymm8)
 	vfmadd231ps(ymm1, ymm3, ymm9)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -903,7 +903,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rbx        ), xmm0)
 	vmovss(mem(rbx, r11, 1), xmm1)
 	add(imm(1*4), rbx)                 // b += 8*rs_b = 8*4;
@@ -921,12 +921,12 @@ void bli_sgemmsup_rd_haswell_asm_3x2
 	vfmadd231ps(ymm0, ymm3, ymm8)
 	vfmadd231ps(ymm1, ymm3, ymm9)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
@@ -936,7 +936,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2
 	                                   // ymm4  ymm5
 	                                   // ymm6  ymm7
 	                                   // ymm8  ymm9
-	
+
 	vhaddps( ymm5, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -959,8 +959,8 @@ void bli_sgemmsup_rd_haswell_asm_3x2
 	                                   // xmm6[0:1]  = sum(ymm6)  sum(ymm7)
 	                                   // xmm8[0:1]  = sum(ymm8)  sum(ymm9)
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -968,83 +968,83 @@ void bli_sgemmsup_rd_haswell_asm_3x2
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
 	vmulps(xmm0, xmm8, xmm8)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm6)
 	vmovsd(xmm6, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm8)
 	vmovsd(xmm8, mem(rcx))
 	add(rdi, rcx)
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovsd(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(xmm6, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(xmm8, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
-	
+
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1088,8 +1088,8 @@ void bli_sgemmsup_rd_haswell_asm_2x2
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1117,7 +1117,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1135,7 +1135,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2
 
 	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), rcx)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -1174,18 +1174,18 @@ void bli_sgemmsup_rd_haswell_asm_2x2
 	prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1207,7 +1207,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rbx        ), ymm0)
@@ -1225,7 +1225,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1260,27 +1260,27 @@ void bli_sgemmsup_rd_haswell_asm_2x2
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1300,21 +1300,21 @@ void bli_sgemmsup_rd_haswell_asm_2x2
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1322,7 +1322,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rbx        ), xmm0)
 	vmovss(mem(rbx, r11, 1), xmm1)
 	add(imm(1*4), rbx)                 // b += 8*rs_b = 8*4;
@@ -1336,12 +1336,12 @@ void bli_sgemmsup_rd_haswell_asm_2x2
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
@@ -1350,7 +1350,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2
 
 	                                   // ymm4  ymm5
 	                                   // ymm6  ymm7
-	
+
 	vhaddps( ymm5, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1366,8 +1366,8 @@ void bli_sgemmsup_rd_haswell_asm_2x2
 	                                   // xmm4[0:1]  = sum(ymm4)  sum(ymm5)
 	                                   // xmm6[0:1]  = sum(ymm6)  sum(ymm7)
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -1375,73 +1375,73 @@ void bli_sgemmsup_rd_haswell_asm_2x2
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm6)
 	vmovsd(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovsd(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovsd(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
-	
+
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1485,8 +1485,8 @@ void bli_sgemmsup_rd_haswell_asm_1x2
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1514,7 +1514,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1532,7 +1532,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2
 
 	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), rcx)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -1568,18 +1568,18 @@ void bli_sgemmsup_rd_haswell_asm_1x2
 	prefetch(0, mem(rcx,         1*4)) // prefetch c + 0*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1597,7 +1597,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rbx        ), ymm0)
@@ -1611,7 +1611,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1638,27 +1638,27 @@ void bli_sgemmsup_rd_haswell_asm_1x2
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -1674,21 +1674,21 @@ void bli_sgemmsup_rd_haswell_asm_1x2
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1696,7 +1696,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rbx        ), xmm0)
 	vmovss(mem(rbx, r11, 1), xmm1)
 	add(imm(1*4), rbx)                 // b += 8*rs_b = 8*4;
@@ -1706,12 +1706,12 @@ void bli_sgemmsup_rd_haswell_asm_1x2
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
@@ -1719,7 +1719,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2
 	label(.SPOSTACCUM)
 
 	                                   // ymm4  ymm5
-	
+
 	vhaddps( ymm5, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1728,8 +1728,8 @@ void bli_sgemmsup_rd_haswell_asm_1x2
 
 	                                   // xmm4[0:1]  = sum(ymm4)  sum(ymm5)
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -1737,64 +1737,64 @@ void bli_sgemmsup_rd_haswell_asm_1x2
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vmovsd(mem(rcx), xmm0)
 	vfmadd231ps(xmm0, xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovsd(xmm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
-	
+
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c
index 9d4e9d51d..80be4e932 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c
@@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x4
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -101,7 +101,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -119,7 +119,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -173,18 +173,18 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 #endif
 	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -219,7 +219,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rax       ), ymm0)
@@ -250,7 +250,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -312,27 +312,27 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -343,7 +343,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 	vmovups(mem(rax, r8, 1), ymm1)
 	vmovups(mem(rax, r8, 2), ymm2)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -365,21 +365,21 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -387,12 +387,12 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	vmovss(mem(rax, r8, 1), xmm1)
 	vmovss(mem(rax, r8, 2), xmm2)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -414,22 +414,22 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -445,7 +445,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 	vshufps(imm(0x44), xmm2, xmm0, xmm4)
 	                                   // xmm4[0] = sum(ymm4);  xmm4[1] = sum(ymm7)
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
-	
+
 	vhaddps( ymm8, ymm5, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -461,7 +461,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 	vshufps(imm(0x44), xmm2, xmm0, xmm5)
 	                                   // xmm5[0] = sum(ymm5);  xmm5[1] = sum(ymm8)
 	                                   // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14)
-	
+
 	vhaddps( ymm9, ymm6, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -479,8 +479,8 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 	                                   // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15)
 
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -488,73 +488,73 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(xmm0, xmm6, xmm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	lea(mem(r12, rdi, 2), r12)         //
@@ -571,7 +571,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -616,8 +616,8 @@ void bli_sgemmsup_rd_haswell_asm_2x4
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -642,7 +642,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -660,7 +660,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), rcx)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -701,18 +701,18 @@ void bli_sgemmsup_rd_haswell_asm_2x4
 	prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -741,7 +741,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rax       ), ymm0)
@@ -767,7 +767,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -818,27 +818,27 @@ void bli_sgemmsup_rd_haswell_asm_2x4
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -847,7 +847,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4
 	vmovups(mem(rax       ), ymm0)
 	vmovups(mem(rax, r8, 1), ymm1)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -865,21 +865,21 @@ void bli_sgemmsup_rd_haswell_asm_2x4
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -887,11 +887,11 @@ void bli_sgemmsup_rd_haswell_asm_2x4
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	vmovss(mem(rax, r8, 1), xmm1)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -909,21 +909,21 @@ void bli_sgemmsup_rd_haswell_asm_2x4
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -939,7 +939,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4
 	vshufps(imm(0x44), xmm2, xmm0, xmm4)
 	                                   // xmm4[0] = sum(ymm4);  xmm4[1] = sum(ymm7)
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
-	
+
 	vhaddps( ymm8, ymm5, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -957,8 +957,8 @@ void bli_sgemmsup_rd_haswell_asm_2x4
 	                                   // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14)
 
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -966,70 +966,70 @@ void bli_sgemmsup_rd_haswell_asm_2x4
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1073,8 +1073,8 @@ void bli_sgemmsup_rd_haswell_asm_1x4
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1099,7 +1099,7 @@ void bli_sgemmsup_rd_haswell_asm_1x4
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1117,7 +1117,7 @@ void bli_sgemmsup_rd_haswell_asm_1x4
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), rcx)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -1153,18 +1153,18 @@ void bli_sgemmsup_rd_haswell_asm_1x4
 	prefetch(0, mem(rcx,         1*4)) // prefetch c + 0*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1187,7 +1187,7 @@ void bli_sgemmsup_rd_haswell_asm_1x4
 	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rax       ), ymm0)
@@ -1208,7 +1208,7 @@ void bli_sgemmsup_rd_haswell_asm_1x4
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 #endif
@@ -1248,34 +1248,34 @@ void bli_sgemmsup_rd_haswell_asm_1x4
 	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 #endif
 
 	vmovups(mem(rax       ), ymm0)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 
@@ -1289,21 +1289,21 @@ void bli_sgemmsup_rd_haswell_asm_1x4
 	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1311,10 +1311,10 @@ void bli_sgemmsup_rd_haswell_asm_1x4
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 
@@ -1328,20 +1328,20 @@ void bli_sgemmsup_rd_haswell_asm_1x4
 	add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	
+	                                   // ymm4  ymm7  ymm10 ymm13
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1359,8 +1359,8 @@ void bli_sgemmsup_rd_haswell_asm_1x4
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
 
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -1368,62 +1368,62 @@ void bli_sgemmsup_rd_haswell_asm_1x4
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c
index 788912ecf..3a82e9b3e 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c
@@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x8
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -101,7 +101,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	//mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -119,7 +119,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	//mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -194,18 +194,18 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 #endif
 	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -240,7 +240,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rax       ), ymm0)
@@ -271,7 +271,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -333,27 +333,27 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -364,7 +364,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 	vmovups(mem(rax, r8, 1), ymm1)
 	vmovups(mem(rax, r8, 2), ymm2)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -386,21 +386,21 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -408,12 +408,12 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	vmovss(mem(rax, r8, 1), xmm1)
 	vmovss(mem(rax, r8, 2), xmm2)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -435,22 +435,22 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 	vfmadd231ps(ymm1, ymm3, ymm14)
 	vfmadd231ps(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -466,7 +466,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 	vshufps(imm(0x44), xmm2, xmm0, xmm4)
 	                                   // xmm4[0] = sum(ymm4);  xmm4[1] = sum(ymm7)
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
-	
+
 	vhaddps( ymm8, ymm5, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -482,7 +482,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 	vshufps(imm(0x44), xmm2, xmm0, xmm5)
 	                                   // xmm5[0] = sum(ymm5);  xmm5[1] = sum(ymm8)
 	                                   // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14)
-	
+
 	vhaddps( ymm9, ymm6, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -500,8 +500,8 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 	                                   // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15)
 
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -509,73 +509,73 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(xmm0, xmm6, xmm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	lea(mem(r12, rdi, 2), r12)         //
@@ -599,7 +599,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -644,8 +644,8 @@ void bli_sgemmsup_rd_haswell_asm_2x8
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -670,7 +670,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -688,7 +688,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -742,18 +742,18 @@ void bli_sgemmsup_rd_haswell_asm_2x8
 	prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -782,7 +782,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rax       ), ymm0)
@@ -808,7 +808,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -859,27 +859,27 @@ void bli_sgemmsup_rd_haswell_asm_2x8
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
@@ -888,7 +888,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8
 	vmovups(mem(rax       ), ymm0)
 	vmovups(mem(rax, r8, 1), ymm1)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -906,21 +906,21 @@ void bli_sgemmsup_rd_haswell_asm_2x8
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -928,11 +928,11 @@ void bli_sgemmsup_rd_haswell_asm_2x8
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	vmovss(mem(rax, r8, 1), xmm1)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 	vfmadd231ps(ymm1, ymm3, ymm5)
@@ -950,21 +950,21 @@ void bli_sgemmsup_rd_haswell_asm_2x8
 	vfmadd231ps(ymm0, ymm3, ymm13)
 	vfmadd231ps(ymm1, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -980,7 +980,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8
 	vshufps(imm(0x44), xmm2, xmm0, xmm4)
 	                                   // xmm4[0] = sum(ymm4);  xmm4[1] = sum(ymm7)
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
-	
+
 	vhaddps( ymm8, ymm5, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -998,8 +998,8 @@ void bli_sgemmsup_rd_haswell_asm_2x8
 	                                   // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14)
 
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -1007,65 +1007,65 @@ void bli_sgemmsup_rd_haswell_asm_2x8
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231ps(mem(rcx), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	add(imm(4), r15)                   // jj += 4;
@@ -1077,7 +1077,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1121,8 +1121,8 @@ void bli_sgemmsup_rd_haswell_asm_1x8
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1147,7 +1147,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), r14)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1165,7 +1165,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -1214,18 +1214,18 @@ void bli_sgemmsup_rd_haswell_asm_1x8
 	prefetch(0, mem(rcx,         1*4)) // prefetch c + 0*rs_c
 #endif
 
-	
 
-	
+
+
 	mov(var(k_iter32), rsi)            // i = k_iter32;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
 	                                   // contains the k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER32)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 0
@@ -1248,7 +1248,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8
 	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 	// ---------------------------------- iteration 1
 
 	vmovups(mem(rax       ), ymm0)
@@ -1269,7 +1269,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 #endif
@@ -1309,34 +1309,34 @@ void bli_sgemmsup_rd_haswell_asm_1x8
 	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER32)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKITER8)
-	
+
 	mov(var(k_iter8), rsi)             // i = k_iter8;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter8 loop.
-	
-	
+
+
 	label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-	
+
 #if 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
 #endif
 
 	vmovups(mem(rax       ), ymm0)
 	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
-	
+
 	vmovups(mem(rbx        ), ymm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 
@@ -1350,21 +1350,21 @@ void bli_sgemmsup_rd_haswell_asm_1x8
 	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER8)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.SCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1372,10 +1372,10 @@ void bli_sgemmsup_rd_haswell_asm_1x8
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovss(mem(rax       ), xmm0)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
-	
+
 	vmovss(mem(rbx        ), xmm3)
 	vfmadd231ps(ymm0, ymm3, ymm4)
 
@@ -1389,21 +1389,21 @@ void bli_sgemmsup_rd_haswell_asm_1x8
 	add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
 	vfmadd231ps(ymm0, ymm3, ymm13)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 	label(.SPOSTACCUM)
 
-	                                   // ymm4  ymm7  ymm10 ymm13  
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
-	
+
 	vhaddps( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vhaddps( xmm1, xmm0, xmm0 )
@@ -1421,8 +1421,8 @@ void bli_sgemmsup_rd_haswell_asm_1x8
 	                                   // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13)
 
 
-	
-	
+
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
 
@@ -1430,57 +1430,57 @@ void bli_sgemmsup_rd_haswell_asm_1x8
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
 
-	
+
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
 
 	add(imm(4), r15)                   // jj += 4;
@@ -1492,7 +1492,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8
 
 	label(.SRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c
index 1bea78ee7..65d8664da 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c
@@ -40,20 +40,20 @@
 
 /*
    rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
+	 --------        ------        --------
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+	 --------        ------            :
 
    rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
+	 --------        | | | |       --------
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+	 --------        | | | |           :
 
    Assumptions:
    - B is row-stored;
@@ -69,12 +69,12 @@
    cost of the in-register transpose).
 
    crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+	 | | | | | | | |       ------            :
 */
 
 // Prototype reference microkernels.
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x12
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -115,9 +115,9 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -132,7 +132,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -186,25 +186,25 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -215,14 +215,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -231,7 +231,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -248,14 +248,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -263,14 +263,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -281,14 +281,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -296,7 +296,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -314,14 +314,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -329,50 +329,50 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -380,22 +380,22 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(ymm0, ymm6, ymm6)
@@ -408,26 +408,26 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 	vmulps(xmm0, xmm13, xmm13)
 	vmulps(ymm0, ymm14, ymm14)
 	vmulps(xmm0, xmm15, xmm15)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -436,60 +436,60 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
 	vmovups(ymm6, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7)
 	vmovups(xmm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
 	vmovups(ymm8, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9)
 	vmovups(xmm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
 	vmovups(ymm10, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm11)
 	vmovups(xmm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12)
 	vmovups(ymm12, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm13)
 	vmovups(xmm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14)
 	vmovups(ymm14, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm15)
 	vmovups(xmm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -611,51 +611,51 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(ymm6, mem(rcx, 0*32))
 	vmovups(xmm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm8, mem(rcx, 0*32))
 	vmovups(xmm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm10, mem(rcx, 0*32))
 	vmovups(xmm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm12, mem(rcx, 0*32))
 	vmovups(xmm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm14, mem(rcx, 0*32))
 	vmovups(xmm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -743,12 +743,12 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -791,8 +791,8 @@ void bli_sgemmsup_rv_haswell_asm_5x12
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -813,9 +813,9 @@ void bli_sgemmsup_rv_haswell_asm_5x12
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -830,7 +830,7 @@ void bli_sgemmsup_rv_haswell_asm_5x12
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -883,25 +883,25 @@ void bli_sgemmsup_rv_haswell_asm_5x12
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -912,20 +912,20 @@ void bli_sgemmsup_rv_haswell_asm_5x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -942,26 +942,26 @@ void bli_sgemmsup_rv_haswell_asm_5x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -972,19 +972,19 @@ void bli_sgemmsup_rv_haswell_asm_5x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1002,82 +1002,82 @@ void bli_sgemmsup_rv_haswell_asm_5x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(ymm0, ymm6, ymm6)
@@ -1088,26 +1088,26 @@ void bli_sgemmsup_rv_haswell_asm_5x12
 	vmulps(xmm0, xmm11, xmm11)
 	vmulps(ymm0, ymm12, ymm12)
 	vmulps(xmm0, xmm13, xmm13)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -1116,52 +1116,52 @@ void bli_sgemmsup_rv_haswell_asm_5x12
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
 	vmovups(ymm6, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7)
 	vmovups(xmm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
 	vmovups(ymm8, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9)
 	vmovups(xmm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
 	vmovups(ymm10, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm11)
 	vmovups(xmm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12)
 	vmovups(ymm12, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm13)
 	vmovups(xmm13, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1302,46 +1302,46 @@ void bli_sgemmsup_rv_haswell_asm_5x12
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(ymm6, mem(rcx, 0*32))
 	vmovups(xmm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm8, mem(rcx, 0*32))
 	vmovups(xmm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm10, mem(rcx, 0*32))
 	vmovups(xmm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm12, mem(rcx, 0*32))
 	vmovups(xmm13, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1442,12 +1442,12 @@ void bli_sgemmsup_rv_haswell_asm_5x12
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1490,8 +1490,8 @@ void bli_sgemmsup_rv_haswell_asm_4x12
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1512,9 +1512,9 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1529,7 +1529,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1581,25 +1581,25 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
+
+
+
 
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1610,7 +1610,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1619,7 +1619,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1636,7 +1636,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1644,14 +1644,14 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1662,7 +1662,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1670,7 +1670,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1688,7 +1688,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1696,32 +1696,32 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1732,7 +1732,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1740,22 +1740,22 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(ymm0, ymm6, ymm6)
@@ -1764,40 +1764,40 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	vmulps(xmm0, xmm9, xmm9)
 	vmulps(ymm0, ymm10, ymm10)
 	vmulps(xmm0, xmm11, xmm11)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
@@ -1828,10 +1828,10 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm11)
 	vmovups(xmm11, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
-	
+
 
 
 	label(.SCOLSTORED)
@@ -1907,19 +1907,19 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	jmp(.SDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
@@ -1938,8 +1938,8 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	vmovups(ymm10, mem(rcx, 0*32))
 	vmovups(xmm11, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1999,12 +1999,12 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 
 	//lea(mem(rcx, rsi, 4), rcx)
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2047,8 +2047,8 @@ void bli_sgemmsup_rv_haswell_asm_3x12
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2069,9 +2069,9 @@ void bli_sgemmsup_rv_haswell_asm_3x12
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -2086,7 +2086,7 @@ void bli_sgemmsup_rv_haswell_asm_3x12
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -2137,25 +2137,25 @@ void bli_sgemmsup_rv_haswell_asm_3x12
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
+
+
+
 
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -2166,13 +2166,13 @@ void bli_sgemmsup_rv_haswell_asm_3x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2189,19 +2189,19 @@ void bli_sgemmsup_rv_haswell_asm_3x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -2212,12 +2212,12 @@ void bli_sgemmsup_rv_haswell_asm_3x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2235,37 +2235,37 @@ void bli_sgemmsup_rv_haswell_asm_3x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -2276,67 +2276,67 @@ void bli_sgemmsup_rv_haswell_asm_3x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(ymm0, ymm6, ymm6)
 	vmulps(xmm0, xmm7, xmm7)
 	vmulps(ymm0, ymm8, ymm8)
 	vmulps(xmm0, xmm9, xmm9)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
@@ -2359,10 +2359,10 @@ void bli_sgemmsup_rv_haswell_asm_3x12
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9)
 	vmovups(xmm9, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
-	
+
 
 
 	label(.SCOLSTORED)
@@ -2483,19 +2483,19 @@ void bli_sgemmsup_rv_haswell_asm_3x12
 	jmp(.SDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
@@ -2509,8 +2509,8 @@ void bli_sgemmsup_rv_haswell_asm_3x12
 	vmovups(ymm8, mem(rcx, 0*32))
 	vmovups(xmm9, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2587,12 +2587,12 @@ void bli_sgemmsup_rv_haswell_asm_3x12
 
 	//lea(mem(rcx, rsi, 4), rcx)
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2635,8 +2635,8 @@ void bli_sgemmsup_rv_haswell_asm_2x12
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2657,9 +2657,9 @@ void bli_sgemmsup_rv_haswell_asm_2x12
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -2674,7 +2674,7 @@ void bli_sgemmsup_rv_haswell_asm_2x12
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -2724,25 +2724,25 @@ void bli_sgemmsup_rv_haswell_asm_2x12
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
+
+
+
 
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -2755,7 +2755,7 @@ void bli_sgemmsup_rv_haswell_asm_2x12
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2773,14 +2773,14 @@ void bli_sgemmsup_rv_haswell_asm_2x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -2792,7 +2792,7 @@ void bli_sgemmsup_rv_haswell_asm_2x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2811,32 +2811,32 @@ void bli_sgemmsup_rv_haswell_asm_2x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -2848,60 +2848,60 @@ void bli_sgemmsup_rv_haswell_asm_2x12
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
 	vmulps(ymm0, ymm6, ymm6)
 	vmulps(xmm0, xmm7, xmm7)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
@@ -2916,10 +2916,10 @@ void bli_sgemmsup_rv_haswell_asm_2x12
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7)
 	vmovups(xmm7, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
-	
+
 
 
 	label(.SCOLSTORED)
@@ -2975,19 +2975,19 @@ void bli_sgemmsup_rv_haswell_asm_2x12
 	jmp(.SDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(xmm5, mem(rcx, 1*32))
 	add(rdi, rcx)
@@ -2996,8 +2996,8 @@ void bli_sgemmsup_rv_haswell_asm_2x12
 	vmovups(ymm6, mem(rcx, 0*32))
 	vmovups(xmm7, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -3033,12 +3033,12 @@ void bli_sgemmsup_rv_haswell_asm_2x12
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -3081,8 +3081,8 @@ void bli_sgemmsup_rv_haswell_asm_1x12
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -3103,9 +3103,9 @@ void bli_sgemmsup_rv_haswell_asm_1x12
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -3120,7 +3120,7 @@ void bli_sgemmsup_rv_haswell_asm_1x12
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -3169,25 +3169,25 @@ void bli_sgemmsup_rv_haswell_asm_1x12
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
+
+
+
 
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -3197,7 +3197,7 @@ void bli_sgemmsup_rv_haswell_asm_1x12
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -3212,14 +3212,14 @@ void bli_sgemmsup_rv_haswell_asm_1x12
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -3228,7 +3228,7 @@ void bli_sgemmsup_rv_haswell_asm_1x12
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -3244,32 +3244,32 @@ void bli_sgemmsup_rv_haswell_asm_1x12
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), xmm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -3278,68 +3278,68 @@ void bli_sgemmsup_rv_haswell_asm_1x12
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(xmm0, xmm5, xmm5)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5)
 	vmovups(xmm5, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
-	
+
 
 
 	label(.SCOLSTORED)
@@ -3414,24 +3414,24 @@ void bli_sgemmsup_rv_haswell_asm_1x12
 	jmp(.SDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(xmm5, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -3480,12 +3480,12 @@ void bli_sgemmsup_rv_haswell_asm_1x12
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c
index 6a08cecd4..26eec0c09 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c
@@ -40,20 +40,20 @@
 
 /*
    rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
+	 --------        ------        --------
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+	 --------        ------            :
 
    rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
+	 --------        | | | |       --------
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+	 --------        | | | |           :
 
    Assumptions:
    - B is row-stored;
@@ -69,12 +69,12 @@
    cost of the in-register transpose).
 
    crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+	 | | | | | | | |       ------            :
 */
 
 // Prototype reference microkernels.
@@ -108,8 +108,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 #if 0
@@ -178,7 +178,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 						// Advance C and A pointers by the mrs and nrs we just
 						// used, and decrement m_left.
 						cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-					} 
+					}
 				}
 
 				// Advance C and B pointers by the mrs and nrs we just used, and
@@ -208,9 +208,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -225,7 +225,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -286,25 +286,25 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -315,14 +315,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -331,7 +331,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -348,14 +348,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -363,14 +363,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -381,14 +381,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -396,7 +396,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -414,14 +414,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -429,50 +429,50 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -480,22 +480,22 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 	vfmadd231ps(ymm1, ymm2, ymm13)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 	vfmadd231ps(ymm1, ymm3, ymm15)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm5, ymm5)
 	vmulps(ymm0, ymm6, ymm6)
@@ -508,26 +508,26 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 	vmulps(ymm0, ymm13, ymm13)
 	vmulps(ymm0, ymm14, ymm14)
 	vmulps(ymm0, ymm15, ymm15)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -536,60 +536,60 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5)
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
 	vmovups(ymm6, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7)
 	vmovups(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
 	vmovups(ymm8, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9)
 	vmovups(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
 	vmovups(ymm10, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11)
 	vmovups(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12)
 	vmovups(ymm12, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13)
 	vmovups(ymm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14)
 	vmovups(ymm14, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm15)
 	vmovups(ymm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -735,51 +735,51 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(ymm6, mem(rcx, 0*32))
 	vmovups(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm8, mem(rcx, 0*32))
 	vmovups(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm10, mem(rcx, 0*32))
 	vmovups(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm12, mem(rcx, 0*32))
 	vmovups(ymm13, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm14, mem(rcx, 0*32))
 	vmovups(ymm15, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -884,12 +884,12 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -932,8 +932,8 @@ void bli_sgemmsup_rv_haswell_asm_5x16
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -954,9 +954,9 @@ void bli_sgemmsup_rv_haswell_asm_5x16
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -971,7 +971,7 @@ void bli_sgemmsup_rv_haswell_asm_5x16
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1029,25 +1029,25 @@ void bli_sgemmsup_rv_haswell_asm_5x16
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1058,20 +1058,20 @@ void bli_sgemmsup_rv_haswell_asm_5x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1088,26 +1088,26 @@ void bli_sgemmsup_rv_haswell_asm_5x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1118,19 +1118,19 @@ void bli_sgemmsup_rv_haswell_asm_5x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1148,82 +1148,82 @@ void bli_sgemmsup_rv_haswell_asm_5x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm1, ymm2, ymm13)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm5, ymm5)
 	vmulps(ymm0, ymm6, ymm6)
@@ -1234,26 +1234,26 @@ void bli_sgemmsup_rv_haswell_asm_5x16
 	vmulps(ymm0, ymm11, ymm11)
 	vmulps(ymm0, ymm12, ymm12)
 	vmulps(ymm0, ymm13, ymm13)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -1262,52 +1262,52 @@ void bli_sgemmsup_rv_haswell_asm_5x16
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5)
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
 	vmovups(ymm6, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7)
 	vmovups(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
 	vmovups(ymm8, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9)
 	vmovups(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
 	vmovups(ymm10, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11)
 	vmovups(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12)
 	vmovups(ymm12, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13)
 	vmovups(ymm13, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1479,46 +1479,46 @@ void bli_sgemmsup_rv_haswell_asm_5x16
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(ymm6, mem(rcx, 0*32))
 	vmovups(ymm7, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm8, mem(rcx, 0*32))
 	vmovups(ymm9, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm10, mem(rcx, 0*32))
 	vmovups(ymm11, mem(rcx, 1*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm12, mem(rcx, 0*32))
 	vmovups(ymm13, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1640,12 +1640,12 @@ void bli_sgemmsup_rv_haswell_asm_5x16
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1688,8 +1688,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1710,9 +1710,9 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1727,7 +1727,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1784,25 +1784,25 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
+
+
+
 
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1813,7 +1813,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1822,7 +1822,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1839,7 +1839,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1847,14 +1847,14 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1865,7 +1865,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1873,7 +1873,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1891,7 +1891,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1899,32 +1899,32 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -1935,7 +1935,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
@@ -1943,22 +1943,22 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	vfmadd231ps(ymm1, ymm2, ymm9)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 	vfmadd231ps(ymm1, ymm3, ymm11)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm5, ymm5)
 	vmulps(ymm0, ymm6, ymm6)
@@ -1967,40 +1967,40 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	vmulps(ymm0, ymm9, ymm9)
 	vmulps(ymm0, ymm10, ymm10)
 	vmulps(ymm0, ymm11, ymm11)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
@@ -2031,10 +2031,10 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11)
 	vmovups(ymm11, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
-	
+
 
 
 	label(.SCOLSTORED)
@@ -2122,19 +2122,19 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	jmp(.SDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
@@ -2153,8 +2153,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	vmovups(ymm10, mem(rcx, 0*32))
 	vmovups(ymm11, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2225,12 +2225,12 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 
 	//lea(mem(rcx, rsi, 4), rcx)
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2273,8 +2273,8 @@ void bli_sgemmsup_rv_haswell_asm_3x16
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2295,9 +2295,9 @@ void bli_sgemmsup_rv_haswell_asm_3x16
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -2312,7 +2312,7 @@ void bli_sgemmsup_rv_haswell_asm_3x16
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -2368,25 +2368,25 @@ void bli_sgemmsup_rv_haswell_asm_3x16
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
+
+
+
 
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -2397,13 +2397,13 @@ void bli_sgemmsup_rv_haswell_asm_3x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2420,19 +2420,19 @@ void bli_sgemmsup_rv_haswell_asm_3x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -2443,12 +2443,12 @@ void bli_sgemmsup_rv_haswell_asm_3x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2466,37 +2466,37 @@ void bli_sgemmsup_rv_haswell_asm_3x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -2507,67 +2507,67 @@ void bli_sgemmsup_rv_haswell_asm_3x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm1, ymm2, ymm9)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm5, ymm5)
 	vmulps(ymm0, ymm6, ymm6)
 	vmulps(ymm0, ymm7, ymm7)
 	vmulps(ymm0, ymm8, ymm8)
 	vmulps(ymm0, ymm9, ymm9)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
@@ -2590,10 +2590,10 @@ void bli_sgemmsup_rv_haswell_asm_3x16
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9)
 	vmovups(ymm9, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
-	
+
 
 
 	label(.SCOLSTORED)
@@ -2745,19 +2745,19 @@ void bli_sgemmsup_rv_haswell_asm_3x16
 	jmp(.SDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
@@ -2771,8 +2771,8 @@ void bli_sgemmsup_rv_haswell_asm_3x16
 	vmovups(ymm8, mem(rcx, 0*32))
 	vmovups(ymm9, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2866,12 +2866,12 @@ void bli_sgemmsup_rv_haswell_asm_3x16
 
 	//lea(mem(rcx, rsi, 4), rcx)
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2914,8 +2914,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2936,9 +2936,9 @@ void bli_sgemmsup_rv_haswell_asm_2x16
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -2953,7 +2953,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -3008,25 +3008,25 @@ void bli_sgemmsup_rv_haswell_asm_2x16
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
+
+
+
 
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -3039,7 +3039,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -3057,14 +3057,14 @@ void bli_sgemmsup_rv_haswell_asm_2x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -3076,7 +3076,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -3095,32 +3095,32 @@ void bli_sgemmsup_rv_haswell_asm_2x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -3132,60 +3132,60 @@ void bli_sgemmsup_rv_haswell_asm_2x16
 	vfmadd231ps(ymm1, ymm2, ymm5)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 	vfmadd231ps(ymm1, ymm3, ymm7)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm5, ymm5)
 	vmulps(ymm0, ymm6, ymm6)
 	vmulps(ymm0, ymm7, ymm7)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
@@ -3200,10 +3200,10 @@ void bli_sgemmsup_rv_haswell_asm_2x16
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7)
 	vmovups(ymm7, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
-	
+
 
 
 	label(.SCOLSTORED)
@@ -3271,19 +3271,19 @@ void bli_sgemmsup_rv_haswell_asm_2x16
 	jmp(.SDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(ymm5, mem(rcx, 1*32))
 	add(rdi, rcx)
@@ -3292,8 +3292,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16
 	vmovups(ymm6, mem(rcx, 0*32))
 	vmovups(ymm7, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -3335,12 +3335,12 @@ void bli_sgemmsup_rv_haswell_asm_2x16
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -3383,8 +3383,8 @@ void bli_sgemmsup_rv_haswell_asm_1x16
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -3405,9 +3405,9 @@ void bli_sgemmsup_rv_haswell_asm_1x16
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -3422,7 +3422,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -3476,25 +3476,25 @@ void bli_sgemmsup_rv_haswell_asm_1x16
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
+
+
+
 
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -3504,7 +3504,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -3519,14 +3519,14 @@ void bli_sgemmsup_rv_haswell_asm_1x16
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -3535,7 +3535,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -3551,32 +3551,32 @@ void bli_sgemmsup_rv_haswell_asm_1x16
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	vmovups(mem(rbx, 1*32), ymm1)
 	add(r10, rbx)                      // b += rs_b;
@@ -3585,68 +3585,68 @@ void bli_sgemmsup_rv_haswell_asm_1x16
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm1, ymm2, ymm5)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 
 	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5)
 	vmovups(ymm5, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
-	
+
 
 
 	label(.SCOLSTORED)
@@ -3740,24 +3740,24 @@ void bli_sgemmsup_rv_haswell_asm_1x16
 	jmp(.SDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	vmovups(ymm5, mem(rcx, 1*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -3817,12 +3817,12 @@ void bli_sgemmsup_rv_haswell_asm_1x16
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c
index 6090f8b0b..53a70d15f 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c
@@ -40,20 +40,20 @@
 
 /*
    rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
+	 --------        ------        --------
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+	 --------        ------            :
 
    rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
+	 --------        | | | |       --------
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+	 --------        | | | |           :
 
    Assumptions:
    - B is row-stored;
@@ -69,12 +69,12 @@
    cost of the in-register transpose).
 
    crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+	 | | | | | | | |       ------            :
 */
 
 // Prototype reference microkernels.
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x2
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -115,9 +115,9 @@ void bli_sgemmsup_rv_haswell_asm_6x2
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -132,7 +132,7 @@ void bli_sgemmsup_rv_haswell_asm_6x2
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -174,25 +174,25 @@ void bli_sgemmsup_rv_haswell_asm_6x2
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -200,19 +200,19 @@ void bli_sgemmsup_rv_haswell_asm_6x2
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -226,25 +226,25 @@ void bli_sgemmsup_rv_haswell_asm_6x2
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -252,18 +252,18 @@ void bli_sgemmsup_rv_haswell_asm_6x2
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -278,103 +278,103 @@ void bli_sgemmsup_rv_haswell_asm_6x2
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), xmm2)
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
 	vmulps(xmm0, xmm8, xmm8)
 	vmulps(xmm0, xmm10, xmm10)
 	vmulps(xmm0, xmm12, xmm12)
 	vmulps(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	//lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -383,42 +383,42 @@ void bli_sgemmsup_rv_haswell_asm_6x2
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
 	vmovsd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
 	vmovsd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10)
 	vmovsd(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12)
 	vmovsd(xmm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14)
 	vmovsd(xmm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -452,45 +452,45 @@ void bli_sgemmsup_rv_haswell_asm_6x2
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovsd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovsd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovsd(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovsd(xmm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovsd(xmm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -516,12 +516,12 @@ void bli_sgemmsup_rv_haswell_asm_6x2
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -564,8 +564,8 @@ void bli_sgemmsup_rv_haswell_asm_5x2
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -586,9 +586,9 @@ void bli_sgemmsup_rv_haswell_asm_5x2
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -603,7 +603,7 @@ void bli_sgemmsup_rv_haswell_asm_5x2
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -644,25 +644,25 @@ void bli_sgemmsup_rv_haswell_asm_5x2
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -670,17 +670,17 @@ void bli_sgemmsup_rv_haswell_asm_5x2
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -694,23 +694,23 @@ void bli_sgemmsup_rv_haswell_asm_5x2
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -718,16 +718,16 @@ void bli_sgemmsup_rv_haswell_asm_5x2
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -742,98 +742,98 @@ void bli_sgemmsup_rv_haswell_asm_5x2
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), xmm2)
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
 	vmulps(xmm0, xmm8, xmm8)
 	vmulps(xmm0, xmm10, xmm10)
 	vmulps(xmm0, xmm12, xmm12)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	//lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -842,37 +842,37 @@ void bli_sgemmsup_rv_haswell_asm_5x2
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
 	vmovsd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
 	vmovsd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10)
 	vmovsd(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12)
 	vmovsd(xmm12, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -910,41 +910,41 @@ void bli_sgemmsup_rv_haswell_asm_5x2
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovsd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovsd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovsd(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovsd(xmm12, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -973,12 +973,12 @@ void bli_sgemmsup_rv_haswell_asm_5x2
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1021,8 +1021,8 @@ void bli_sgemmsup_rv_haswell_asm_4x2
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1043,9 +1043,9 @@ void bli_sgemmsup_rv_haswell_asm_4x2
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1060,7 +1060,7 @@ void bli_sgemmsup_rv_haswell_asm_4x2
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1100,25 +1100,25 @@ void bli_sgemmsup_rv_haswell_asm_4x2
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1126,14 +1126,14 @@ void bli_sgemmsup_rv_haswell_asm_4x2
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1147,20 +1147,20 @@ void bli_sgemmsup_rv_haswell_asm_4x2
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1168,13 +1168,13 @@ void bli_sgemmsup_rv_haswell_asm_4x2
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1189,91 +1189,91 @@ void bli_sgemmsup_rv_haswell_asm_4x2
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), xmm2)
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
 	vmulps(xmm0, xmm8, xmm8)
 	vmulps(xmm0, xmm10, xmm10)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	//lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -1282,32 +1282,32 @@ void bli_sgemmsup_rv_haswell_asm_4x2
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
 	vmovsd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
 	vmovsd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10)
 	vmovsd(xmm10, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1331,37 +1331,37 @@ void bli_sgemmsup_rv_haswell_asm_4x2
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovsd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovsd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovsd(xmm10, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1380,12 +1380,12 @@ void bli_sgemmsup_rv_haswell_asm_4x2
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1428,8 +1428,8 @@ void bli_sgemmsup_rv_haswell_asm_3x2
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1450,9 +1450,9 @@ void bli_sgemmsup_rv_haswell_asm_3x2
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1467,7 +1467,7 @@ void bli_sgemmsup_rv_haswell_asm_3x2
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1506,25 +1506,25 @@ void bli_sgemmsup_rv_haswell_asm_3x2
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1532,12 +1532,12 @@ void bli_sgemmsup_rv_haswell_asm_3x2
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1551,18 +1551,18 @@ void bli_sgemmsup_rv_haswell_asm_3x2
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1570,11 +1570,11 @@ void bli_sgemmsup_rv_haswell_asm_3x2
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1589,86 +1589,86 @@ void bli_sgemmsup_rv_haswell_asm_3x2
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), xmm2)
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
 	vmulps(xmm0, xmm8, xmm8)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	//lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -1677,27 +1677,27 @@ void bli_sgemmsup_rv_haswell_asm_3x2
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
 	vmovsd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
 	vmovsd(xmm8, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1730,33 +1730,33 @@ void bli_sgemmsup_rv_haswell_asm_3x2
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovsd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovsd(xmm8, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1780,12 +1780,12 @@ void bli_sgemmsup_rv_haswell_asm_3x2
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1828,8 +1828,8 @@ void bli_sgemmsup_rv_haswell_asm_2x2
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1850,9 +1850,9 @@ void bli_sgemmsup_rv_haswell_asm_2x2
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1867,7 +1867,7 @@ void bli_sgemmsup_rv_haswell_asm_2x2
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1905,25 +1905,25 @@ void bli_sgemmsup_rv_haswell_asm_2x2
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1933,7 +1933,7 @@ void bli_sgemmsup_rv_haswell_asm_2x2
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1948,14 +1948,14 @@ void bli_sgemmsup_rv_haswell_asm_2x2
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1964,7 +1964,7 @@ void bli_sgemmsup_rv_haswell_asm_2x2
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1980,78 +1980,78 @@ void bli_sgemmsup_rv_haswell_asm_2x2
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), xmm2)
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 2), rdx)         // load address of c +  4*rs_c;
 
 	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	//lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2060,22 +2060,22 @@ void bli_sgemmsup_rv_haswell_asm_2x2
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
 	vmovsd(xmm6, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2094,29 +2094,29 @@ void bli_sgemmsup_rv_haswell_asm_2x2
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovsd(xmm6, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2130,12 +2130,12 @@ void bli_sgemmsup_rv_haswell_asm_2x2
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2178,8 +2178,8 @@ void bli_sgemmsup_rv_haswell_asm_1x2
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2200,9 +2200,9 @@ void bli_sgemmsup_rv_haswell_asm_1x2
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -2217,7 +2217,7 @@ void bli_sgemmsup_rv_haswell_asm_1x2
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -2254,25 +2254,25 @@ void bli_sgemmsup_rv_haswell_asm_1x2
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -2280,7 +2280,7 @@ void bli_sgemmsup_rv_haswell_asm_1x2
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2293,21 +2293,21 @@ void bli_sgemmsup_rv_haswell_asm_1x2
 	vbroadcastss(mem(rax        ), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
 	vbroadcastss(mem(rax        ), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2321,75 +2321,75 @@ void bli_sgemmsup_rv_haswell_asm_1x2
 	vbroadcastss(mem(rax        ), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovsd(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 2), rdx)         // load address of c +  4*rs_c;
 
 	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	//lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2398,17 +2398,17 @@ void bli_sgemmsup_rv_haswell_asm_1x2
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2431,25 +2431,25 @@ void bli_sgemmsup_rv_haswell_asm_1x2
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovsd(xmm4, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2466,12 +2466,12 @@ void bli_sgemmsup_rv_haswell_asm_1x2
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c
index 512fd6052..2d6165710 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c
@@ -40,20 +40,20 @@
 
 /*
    rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
+	 --------        ------        --------
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+	 --------        ------            :
 
    rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
+	 --------        | | | |       --------
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+	 --------        | | | |           :
 
    Assumptions:
    - B is row-stored;
@@ -69,12 +69,12 @@
    cost of the in-register transpose).
 
    crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+	 | | | | | | | |       ------            :
 */
 
 // Prototype reference microkernels.
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x4
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -115,9 +115,9 @@ void bli_sgemmsup_rv_haswell_asm_6x4
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -132,7 +132,7 @@ void bli_sgemmsup_rv_haswell_asm_6x4
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -176,25 +176,25 @@ void bli_sgemmsup_rv_haswell_asm_6x4
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -202,19 +202,19 @@ void bli_sgemmsup_rv_haswell_asm_6x4
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -228,25 +228,25 @@ void bli_sgemmsup_rv_haswell_asm_6x4
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -254,18 +254,18 @@ void bli_sgemmsup_rv_haswell_asm_6x4
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -280,103 +280,103 @@ void bli_sgemmsup_rv_haswell_asm_6x4
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), xmm2)
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	vbroadcastss(mem(rax, r15, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 	vfmadd231ps(xmm0, xmm3, xmm14)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
 	vmulps(xmm0, xmm8, xmm8)
 	vmulps(xmm0, xmm10, xmm10)
 	vmulps(xmm0, xmm12, xmm12)
 	vmulps(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -385,42 +385,42 @@ void bli_sgemmsup_rv_haswell_asm_6x4
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
 	vmovups(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10)
 	vmovups(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12)
 	vmovups(xmm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14)
 	vmovups(xmm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -474,45 +474,45 @@ void bli_sgemmsup_rv_haswell_asm_6x4
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(xmm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(xmm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -552,12 +552,12 @@ void bli_sgemmsup_rv_haswell_asm_6x4
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -600,8 +600,8 @@ void bli_sgemmsup_rv_haswell_asm_5x4
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -622,9 +622,9 @@ void bli_sgemmsup_rv_haswell_asm_5x4
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -639,7 +639,7 @@ void bli_sgemmsup_rv_haswell_asm_5x4
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -682,25 +682,25 @@ void bli_sgemmsup_rv_haswell_asm_5x4
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -708,17 +708,17 @@ void bli_sgemmsup_rv_haswell_asm_5x4
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -732,23 +732,23 @@ void bli_sgemmsup_rv_haswell_asm_5x4
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -756,16 +756,16 @@ void bli_sgemmsup_rv_haswell_asm_5x4
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -780,98 +780,98 @@ void bli_sgemmsup_rv_haswell_asm_5x4
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), xmm2)
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm12)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
 	vmulps(xmm0, xmm8, xmm8)
 	vmulps(xmm0, xmm10, xmm10)
 	vmulps(xmm0, xmm12, xmm12)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -880,37 +880,37 @@ void bli_sgemmsup_rv_haswell_asm_5x4
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
 	vmovups(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10)
 	vmovups(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12)
 	vmovups(xmm12, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -970,41 +970,41 @@ void bli_sgemmsup_rv_haswell_asm_5x4
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(xmm12, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1049,12 +1049,12 @@ void bli_sgemmsup_rv_haswell_asm_5x4
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1097,8 +1097,8 @@ void bli_sgemmsup_rv_haswell_asm_4x4
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1119,9 +1119,9 @@ void bli_sgemmsup_rv_haswell_asm_4x4
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1136,7 +1136,7 @@ void bli_sgemmsup_rv_haswell_asm_4x4
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1178,25 +1178,25 @@ void bli_sgemmsup_rv_haswell_asm_4x4
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1204,14 +1204,14 @@ void bli_sgemmsup_rv_haswell_asm_4x4
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1225,20 +1225,20 @@ void bli_sgemmsup_rv_haswell_asm_4x4
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1246,13 +1246,13 @@ void bli_sgemmsup_rv_haswell_asm_4x4
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1267,91 +1267,91 @@ void bli_sgemmsup_rv_haswell_asm_4x4
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), xmm2)
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	vbroadcastss(mem(rax, r13, 1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
 	vfmadd231ps(xmm0, xmm3, xmm10)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
 	vmulps(xmm0, xmm8, xmm8)
 	vmulps(xmm0, xmm10, xmm10)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -1360,32 +1360,32 @@ void bli_sgemmsup_rv_haswell_asm_4x4
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
 	vmovups(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10)
 	vmovups(xmm10, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1422,37 +1422,37 @@ void bli_sgemmsup_rv_haswell_asm_4x4
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(xmm10, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1482,12 +1482,12 @@ void bli_sgemmsup_rv_haswell_asm_4x4
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1530,8 +1530,8 @@ void bli_sgemmsup_rv_haswell_asm_3x4
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1552,9 +1552,9 @@ void bli_sgemmsup_rv_haswell_asm_3x4
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1569,7 +1569,7 @@ void bli_sgemmsup_rv_haswell_asm_3x4
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1610,25 +1610,25 @@ void bli_sgemmsup_rv_haswell_asm_3x4
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1636,12 +1636,12 @@ void bli_sgemmsup_rv_haswell_asm_3x4
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1655,18 +1655,18 @@ void bli_sgemmsup_rv_haswell_asm_3x4
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1674,11 +1674,11 @@ void bli_sgemmsup_rv_haswell_asm_3x4
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1693,86 +1693,86 @@ void bli_sgemmsup_rv_haswell_asm_3x4
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), xmm2)
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm8)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
 	vmulps(xmm0, xmm8, xmm8)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -1781,27 +1781,27 @@ void bli_sgemmsup_rv_haswell_asm_3x4
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
 	vmovups(xmm8, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1851,33 +1851,33 @@ void bli_sgemmsup_rv_haswell_asm_3x4
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(xmm8, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1911,12 +1911,12 @@ void bli_sgemmsup_rv_haswell_asm_3x4
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1959,8 +1959,8 @@ void bli_sgemmsup_rv_haswell_asm_2x4
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1981,9 +1981,9 @@ void bli_sgemmsup_rv_haswell_asm_2x4
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1998,7 +1998,7 @@ void bli_sgemmsup_rv_haswell_asm_2x4
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -2038,25 +2038,25 @@ void bli_sgemmsup_rv_haswell_asm_2x4
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -2066,7 +2066,7 @@ void bli_sgemmsup_rv_haswell_asm_2x4
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2081,14 +2081,14 @@ void bli_sgemmsup_rv_haswell_asm_2x4
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -2097,7 +2097,7 @@ void bli_sgemmsup_rv_haswell_asm_2x4
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2113,78 +2113,78 @@ void bli_sgemmsup_rv_haswell_asm_2x4
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), xmm2)
 	vbroadcastss(mem(rax, r8,  1), xmm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
 	vfmadd231ps(xmm0, xmm3, xmm6)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 	vmulps(xmm0, xmm6, xmm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 2), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2193,22 +2193,22 @@ void bli_sgemmsup_rv_haswell_asm_2x4
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2235,29 +2235,29 @@ void bli_sgemmsup_rv_haswell_asm_2x4
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(xmm6, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2276,12 +2276,12 @@ void bli_sgemmsup_rv_haswell_asm_2x4
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2324,8 +2324,8 @@ void bli_sgemmsup_rv_haswell_asm_1x4
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2346,9 +2346,9 @@ void bli_sgemmsup_rv_haswell_asm_1x4
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -2363,7 +2363,7 @@ void bli_sgemmsup_rv_haswell_asm_1x4
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -2402,25 +2402,25 @@ void bli_sgemmsup_rv_haswell_asm_1x4
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -2428,7 +2428,7 @@ void bli_sgemmsup_rv_haswell_asm_1x4
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2441,21 +2441,21 @@ void bli_sgemmsup_rv_haswell_asm_1x4
 	vbroadcastss(mem(rax        ), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
 
 	vbroadcastss(mem(rax        ), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2469,75 +2469,75 @@ void bli_sgemmsup_rv_haswell_asm_1x4
 	vbroadcastss(mem(rax        ), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), xmm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), xmm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(xmm0, xmm2, xmm4)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-	
+
 	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 2), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2546,17 +2546,17 @@ void bli_sgemmsup_rv_haswell_asm_1x4
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2589,25 +2589,25 @@ void bli_sgemmsup_rv_haswell_asm_1x4
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(xmm4, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2630,12 +2630,12 @@ void bli_sgemmsup_rv_haswell_asm_1x4
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c
index ac4e1ee0b..f2cb1df42 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c
@@ -40,20 +40,20 @@
 
 /*
    rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
+	 --------        ------        --------
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+	 --------        ------            :
 
    rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
+	 --------        | | | |       --------
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+	 --------        | | | |           :
 
    Assumptions:
    - B is row-stored;
@@ -69,12 +69,12 @@
    cost of the in-register transpose).
 
    crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+	 | | | | | | | |       ------            :
 */
 
 // Prototype reference microkernels.
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -115,9 +115,9 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -132,7 +132,7 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -179,25 +179,25 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
@@ -207,19 +207,19 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -235,25 +235,25 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
@@ -263,18 +263,18 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -291,105 +291,105 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm6, ymm6)
 	vmulps(ymm0, ymm8, ymm8)
 	vmulps(ymm0, ymm10, ymm10)
 	vmulps(ymm0, ymm12, ymm12)
 	vmulps(ymm0, ymm14, ymm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -398,12 +398,12 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm4, xmm5)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx, 0*4))
@@ -413,8 +413,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 	vmovsd(xmm5, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm6, xmm7)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx, 0*4))
@@ -424,8 +424,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 	vmovsd(xmm7, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm8, xmm9)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8)
 	vmovups(xmm8, mem(rcx, 0*4))
@@ -435,8 +435,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 	vmovsd(xmm9, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm10, xmm11)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm10)
 	vmovups(xmm10, mem(rcx, 0*4))
@@ -446,8 +446,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 	vmovsd(xmm11, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm12, xmm13)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm12)
 	vmovups(xmm12, mem(rcx, 0*4))
@@ -457,8 +457,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 	vmovsd(xmm13, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm14, xmm15)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm14)
 	vmovups(xmm14, mem(rcx, 0*4))
@@ -468,8 +468,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 	vmovsd(xmm15, mem(rcx, 4*4))
 
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -534,57 +534,57 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm4, xmm5)
 	vmovups(xmm4, mem(rcx, 0*4))
 	vmovsd(xmm5, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
+
 
 	vextractf128(imm(0x1), ymm6, xmm7)
 	vmovups(xmm6, mem(rcx, 0*4))
 	vmovsd(xmm7, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm8, xmm9)
 	vmovups(xmm8, mem(rcx, 0*4))
 	vmovsd(xmm9, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm10, xmm11)
 	vmovups(xmm10, mem(rcx, 0*4))
 	vmovsd(xmm11, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm12, xmm13)
 	vmovups(xmm12, mem(rcx, 0*4))
 	vmovsd(xmm13, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm14, xmm15)
 	vmovups(xmm14, mem(rcx, 0*4))
 	vmovsd(xmm15, mem(rcx, 4*4))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -632,10 +632,10 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -678,8 +678,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -700,9 +700,9 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -717,7 +717,7 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -763,25 +763,25 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
@@ -791,17 +791,17 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -817,23 +817,23 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
@@ -843,16 +843,16 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -869,100 +869,100 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm6, ymm6)
 	vmulps(ymm0, ymm8, ymm8)
 	vmulps(ymm0, ymm10, ymm10)
 	vmulps(ymm0, ymm12, ymm12)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -971,12 +971,12 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm4, xmm5)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx, 0*4))
@@ -986,8 +986,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 	vmovsd(xmm5, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm6, xmm7)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx, 0*4))
@@ -997,8 +997,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 	vmovsd(xmm7, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm8, xmm9)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8)
 	vmovups(xmm8, mem(rcx, 0*4))
@@ -1008,8 +1008,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 	vmovsd(xmm9, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm10, xmm11)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm10)
 	vmovups(xmm10, mem(rcx, 0*4))
@@ -1019,8 +1019,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 	vmovsd(xmm11, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm12, xmm13)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm12)
 	vmovups(xmm12, mem(rcx, 0*4))
@@ -1030,8 +1030,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 	vmovsd(xmm13, mem(rcx, 4*4))
 
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1106,51 +1106,51 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm4, xmm5)
 	vmovups(xmm4, mem(rcx, 0*4))
 	vmovsd(xmm5, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
+
 
 	vextractf128(imm(0x1), ymm6, xmm7)
 	vmovups(xmm6, mem(rcx, 0*4))
 	vmovsd(xmm7, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm8, xmm9)
 	vmovups(xmm8, mem(rcx, 0*4))
 	vmovsd(xmm9, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm10, xmm11)
 	vmovups(xmm10, mem(rcx, 0*4))
 	vmovsd(xmm11, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm12, xmm13)
 	vmovups(xmm12, mem(rcx, 0*4))
 	vmovsd(xmm13, mem(rcx, 4*4))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1206,10 +1206,10 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1252,8 +1252,8 @@ void bli_sgemmsup_rv_haswell_asm_4x6
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1274,9 +1274,9 @@ void bli_sgemmsup_rv_haswell_asm_4x6
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1291,7 +1291,7 @@ void bli_sgemmsup_rv_haswell_asm_4x6
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1336,25 +1336,25 @@ void bli_sgemmsup_rv_haswell_asm_4x6
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
@@ -1364,14 +1364,14 @@ void bli_sgemmsup_rv_haswell_asm_4x6
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1387,20 +1387,20 @@ void bli_sgemmsup_rv_haswell_asm_4x6
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
@@ -1410,13 +1410,13 @@ void bli_sgemmsup_rv_haswell_asm_4x6
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1433,93 +1433,93 @@ void bli_sgemmsup_rv_haswell_asm_4x6
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm6, ymm6)
 	vmulps(ymm0, ymm8, ymm8)
 	vmulps(ymm0, ymm10, ymm10)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -1528,12 +1528,12 @@ void bli_sgemmsup_rv_haswell_asm_4x6
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm4, xmm5)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx, 0*4))
@@ -1543,8 +1543,8 @@ void bli_sgemmsup_rv_haswell_asm_4x6
 	vmovsd(xmm5, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm6, xmm7)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx, 0*4))
@@ -1554,8 +1554,8 @@ void bli_sgemmsup_rv_haswell_asm_4x6
 	vmovsd(xmm7, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm8, xmm9)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8)
 	vmovups(xmm8, mem(rcx, 0*4))
@@ -1565,9 +1565,9 @@ void bli_sgemmsup_rv_haswell_asm_4x6
 	vmovsd(xmm9, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
-	vextractf128(imm(0x1), ymm10, xmm11)
+
+
+	vextractf128(imm(0x1), ymm10, xmm11)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm10)
 	vmovups(xmm10, mem(rcx, 0*4))
 
@@ -1576,8 +1576,8 @@ void bli_sgemmsup_rv_haswell_asm_4x6
 	vmovsd(xmm11, mem(rcx, 4*4))
 
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1620,45 +1620,45 @@ void bli_sgemmsup_rv_haswell_asm_4x6
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm4, xmm5)
 	vmovups(xmm4, mem(rcx, 0*4))
 	vmovsd(xmm5, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
+
 
 	vextractf128(imm(0x1), ymm6, xmm7)
 	vmovups(xmm6, mem(rcx, 0*4))
 	vmovsd(xmm7, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm8, xmm9)
 	vmovups(xmm8, mem(rcx, 0*4))
 	vmovsd(xmm9, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm10, xmm11)
 	vmovups(xmm10, mem(rcx, 0*4))
 	vmovsd(xmm11, mem(rcx, 4*4))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1693,10 +1693,10 @@ void bli_sgemmsup_rv_haswell_asm_4x6
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1739,8 +1739,8 @@ void bli_sgemmsup_rv_haswell_asm_3x6
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1761,9 +1761,9 @@ void bli_sgemmsup_rv_haswell_asm_3x6
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1778,7 +1778,7 @@ void bli_sgemmsup_rv_haswell_asm_3x6
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1822,25 +1822,25 @@ void bli_sgemmsup_rv_haswell_asm_3x6
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
@@ -1850,12 +1850,12 @@ void bli_sgemmsup_rv_haswell_asm_3x6
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1871,18 +1871,18 @@ void bli_sgemmsup_rv_haswell_asm_3x6
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
@@ -1892,11 +1892,11 @@ void bli_sgemmsup_rv_haswell_asm_3x6
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1913,88 +1913,88 @@ void bli_sgemmsup_rv_haswell_asm_3x6
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm6, ymm6)
 	vmulps(ymm0, ymm8, ymm8)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2003,12 +2003,12 @@ void bli_sgemmsup_rv_haswell_asm_3x6
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm4, xmm5)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx, 0*4))
@@ -2018,8 +2018,8 @@ void bli_sgemmsup_rv_haswell_asm_3x6
 	vmovsd(xmm5, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm6, xmm7)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx, 0*4))
@@ -2029,8 +2029,8 @@ void bli_sgemmsup_rv_haswell_asm_3x6
 	vmovsd(xmm7, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm8, xmm9)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8)
 	vmovups(xmm8, mem(rcx, 0*4))
@@ -2040,8 +2040,8 @@ void bli_sgemmsup_rv_haswell_asm_3x6
 	vmovsd(xmm9, mem(rcx, 4*4))
 
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2106,39 +2106,39 @@ void bli_sgemmsup_rv_haswell_asm_3x6
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm4, xmm5)
 	vmovups(xmm4, mem(rcx, 0*4))
 	vmovsd(xmm5, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
+
 
 	vextractf128(imm(0x1), ymm6, xmm7)
 	vmovups(xmm6, mem(rcx, 0*4))
 	vmovsd(xmm7, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm8, xmm9)
 	vmovups(xmm8, mem(rcx, 0*4))
 	vmovsd(xmm9, mem(rcx, 4*4))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2181,10 +2181,10 @@ void bli_sgemmsup_rv_haswell_asm_3x6
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2227,8 +2227,8 @@ void bli_sgemmsup_rv_haswell_asm_2x6
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2249,9 +2249,9 @@ void bli_sgemmsup_rv_haswell_asm_2x6
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -2266,7 +2266,7 @@ void bli_sgemmsup_rv_haswell_asm_2x6
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -2309,25 +2309,25 @@ void bli_sgemmsup_rv_haswell_asm_2x6
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
@@ -2339,7 +2339,7 @@ void bli_sgemmsup_rv_haswell_asm_2x6
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2356,14 +2356,14 @@ void bli_sgemmsup_rv_haswell_asm_2x6
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
@@ -2374,7 +2374,7 @@ void bli_sgemmsup_rv_haswell_asm_2x6
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2392,80 +2392,80 @@ void bli_sgemmsup_rv_haswell_asm_2x6
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2474,12 +2474,12 @@ void bli_sgemmsup_rv_haswell_asm_2x6
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm4, xmm5)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx, 0*4))
@@ -2489,8 +2489,8 @@ void bli_sgemmsup_rv_haswell_asm_2x6
 	vmovsd(xmm5, mem(rcx, 4*4))
 
 	add(rdi, rcx)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm6, xmm7)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6)
 	vmovups(xmm6, mem(rcx, 0*4))
@@ -2500,8 +2500,8 @@ void bli_sgemmsup_rv_haswell_asm_2x6
 	vmovsd(xmm7, mem(rcx, 4*4))
 
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2533,33 +2533,33 @@ void bli_sgemmsup_rv_haswell_asm_2x6
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm4, xmm5)
 	vmovups(xmm4, mem(rcx, 0*4))
 	vmovsd(xmm5, mem(rcx, 4*4))
 	add(rdi, rcx)
-	
+
 
 	vextractf128(imm(0x1), ymm6, xmm7)
 	vmovups(xmm6, mem(rcx, 0*4))
 	vmovsd(xmm7, mem(rcx, 4*4))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2581,10 +2581,10 @@ void bli_sgemmsup_rv_haswell_asm_2x6
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2627,8 +2627,8 @@ void bli_sgemmsup_rv_haswell_asm_1x6
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2649,9 +2649,9 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -2666,7 +2666,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -2708,25 +2708,25 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
@@ -2736,7 +2736,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2751,14 +2751,14 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	vbroadcastss(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
@@ -2767,7 +2767,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	vbroadcastss(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2783,77 +2783,77 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	vbroadcastss(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*4), xmm0)
 	vmovsd(mem(rbx, 4*4), xmm1)
 	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -2862,12 +2862,12 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm4, xmm5)
 	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4)
 	vmovups(xmm4, mem(rcx, 0*4))
@@ -2877,8 +2877,8 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	vmovsd(xmm5, mem(rcx, 4*4))
 
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2920,27 +2920,27 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vextractf128(imm(0x1), ymm4, xmm5)
 	vmovups(xmm4, mem(rcx, 0*4))
 	vmovsd(xmm5, mem(rcx, 4*4))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2968,12 +2968,12 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -3018,8 +3018,8 @@ void bli_sgemmsup_rv_haswell_asm_1x6
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -3040,9 +3040,9 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -3057,7 +3057,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -3101,25 +3101,25 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
+
+
+
 
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -3127,7 +3127,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -3140,21 +3140,21 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	vbroadcastss(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
 	vbroadcastss(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -3168,96 +3168,96 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	vbroadcastss(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
 	vbroadcastss(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
-	
+
 
 
 	label(.SCOLSTORED)
@@ -3308,23 +3308,23 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	jmp(.SDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -3357,12 +3357,12 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c
index 2b1a221ad..603ba7554 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c
@@ -40,20 +40,20 @@
 
 /*
    rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
+	 --------        ------        --------
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+	 --------        ------            :
 
    rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
+	 --------        | | | |       --------
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+	 --------        | | | |           :
 
    Assumptions:
    - B is row-stored;
@@ -69,12 +69,12 @@
    cost of the in-register transpose).
 
    crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+	 | | | | | | | |       ------            :
 */
 
 // Prototype reference microkernels.
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x8
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -115,9 +115,9 @@ void bli_sgemmsup_rv_haswell_asm_6x8
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -132,7 +132,7 @@ void bli_sgemmsup_rv_haswell_asm_6x8
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -181,25 +181,25 @@ void bli_sgemmsup_rv_haswell_asm_6x8
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -207,19 +207,19 @@ void bli_sgemmsup_rv_haswell_asm_6x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -233,25 +233,25 @@ void bli_sgemmsup_rv_haswell_asm_6x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -259,18 +259,18 @@ void bli_sgemmsup_rv_haswell_asm_6x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -285,103 +285,103 @@ void bli_sgemmsup_rv_haswell_asm_6x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	vbroadcastss(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 	vfmadd231ps(ymm0, ymm3, ymm14)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm6, ymm6)
 	vmulps(ymm0, ymm8, ymm8)
 	vmulps(ymm0, ymm10, ymm10)
 	vmulps(ymm0, ymm12, ymm12)
 	vmulps(ymm0, ymm14, ymm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -390,42 +390,42 @@ void bli_sgemmsup_rv_haswell_asm_6x8
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
 	vmovups(ymm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
 	vmovups(ymm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
 	vmovups(ymm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12)
 	vmovups(ymm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14)
 	vmovups(ymm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -502,45 +502,45 @@ void bli_sgemmsup_rv_haswell_asm_6x8
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(ymm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -596,12 +596,12 @@ void bli_sgemmsup_rv_haswell_asm_6x8
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -644,8 +644,8 @@ void bli_sgemmsup_rv_haswell_asm_5x8
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -666,9 +666,9 @@ void bli_sgemmsup_rv_haswell_asm_5x8
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -683,7 +683,7 @@ void bli_sgemmsup_rv_haswell_asm_5x8
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -731,25 +731,25 @@ void bli_sgemmsup_rv_haswell_asm_5x8
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
-	
+
+
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -757,17 +757,17 @@ void bli_sgemmsup_rv_haswell_asm_5x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -781,23 +781,23 @@ void bli_sgemmsup_rv_haswell_asm_5x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -805,16 +805,16 @@ void bli_sgemmsup_rv_haswell_asm_5x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -829,98 +829,98 @@ void bli_sgemmsup_rv_haswell_asm_5x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	
+
 	vbroadcastss(mem(rax        ), ymm2)
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastss(mem(rax, r8,  4), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm12)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
-	
-	
+
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm6, ymm6)
 	vmulps(ymm0, ymm8, ymm8)
 	vmulps(ymm0, ymm10, ymm10)
 	vmulps(ymm0, ymm12, ymm12)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -929,37 +929,37 @@ void bli_sgemmsup_rv_haswell_asm_5x8
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
 	vmovups(ymm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
 	vmovups(ymm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
 	vmovups(ymm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12)
 	vmovups(ymm12, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1049,41 +1049,41 @@ void bli_sgemmsup_rv_haswell_asm_5x8
 
 
 	jmp(.SDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovups(ymm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovups(ymm12, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1147,12 +1147,12 @@ void bli_sgemmsup_rv_haswell_asm_5x8
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1195,8 +1195,8 @@ void bli_sgemmsup_rv_haswell_asm_4x8
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1217,9 +1217,9 @@ void bli_sgemmsup_rv_haswell_asm_4x8
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1234,7 +1234,7 @@ void bli_sgemmsup_rv_haswell_asm_4x8
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1281,25 +1281,25 @@ void bli_sgemmsup_rv_haswell_asm_4x8
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
+
+
+
 
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1307,14 +1307,14 @@ void bli_sgemmsup_rv_haswell_asm_4x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1328,20 +1328,20 @@ void bli_sgemmsup_rv_haswell_asm_4x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1349,13 +1349,13 @@ void bli_sgemmsup_rv_haswell_asm_4x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1370,38 +1370,38 @@ void bli_sgemmsup_rv_haswell_asm_4x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1409,66 +1409,66 @@ void bli_sgemmsup_rv_haswell_asm_4x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	vbroadcastss(mem(rax, r13, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 	vfmadd231ps(ymm0, ymm3, ymm10)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm6, ymm6)
 	vmulps(ymm0, ymm8, ymm8)
 	vmulps(ymm0, ymm10, ymm10)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
@@ -1487,10 +1487,10 @@ void bli_sgemmsup_rv_haswell_asm_4x8
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
 	vmovups(ymm10, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
-	
+
 
 
 	label(.SCOLSTORED)
@@ -1538,19 +1538,19 @@ void bli_sgemmsup_rv_haswell_asm_4x8
 	jmp(.SDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
 
@@ -1565,8 +1565,8 @@ void bli_sgemmsup_rv_haswell_asm_4x8
 
 	vmovups(ymm10, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -1604,12 +1604,12 @@ void bli_sgemmsup_rv_haswell_asm_4x8
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -1652,8 +1652,8 @@ void bli_sgemmsup_rv_haswell_asm_3x8
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1674,9 +1674,9 @@ void bli_sgemmsup_rv_haswell_asm_3x8
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -1691,7 +1691,7 @@ void bli_sgemmsup_rv_haswell_asm_3x8
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -1737,25 +1737,25 @@ void bli_sgemmsup_rv_haswell_asm_3x8
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
+
+
+
 
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1763,12 +1763,12 @@ void bli_sgemmsup_rv_haswell_asm_3x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -1782,18 +1782,18 @@ void bli_sgemmsup_rv_haswell_asm_3x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1801,11 +1801,11 @@ void bli_sgemmsup_rv_haswell_asm_3x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -1820,36 +1820,36 @@ void bli_sgemmsup_rv_haswell_asm_3x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -1857,63 +1857,63 @@ void bli_sgemmsup_rv_haswell_asm_3x8
 	vbroadcastss(mem(rax, r8,  1), ymm3)
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastss(mem(rax, r8,  2), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm8)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm6, ymm6)
 	vmulps(ymm0, ymm8, ymm8)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
@@ -1927,10 +1927,10 @@ void bli_sgemmsup_rv_haswell_asm_3x8
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
 	vmovups(ymm8, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
-	
+
 
 
 	label(.SCOLSTORED)
@@ -2010,19 +2010,19 @@ void bli_sgemmsup_rv_haswell_asm_3x8
 	jmp(.SDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
 
@@ -2033,8 +2033,8 @@ void bli_sgemmsup_rv_haswell_asm_3x8
 
 	vmovups(ymm8, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2084,12 +2084,12 @@ void bli_sgemmsup_rv_haswell_asm_3x8
 
 	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2132,8 +2132,8 @@ void bli_sgemmsup_rv_haswell_asm_2x8
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2154,9 +2154,9 @@ void bli_sgemmsup_rv_haswell_asm_2x8
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -2171,7 +2171,7 @@ void bli_sgemmsup_rv_haswell_asm_2x8
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -2216,25 +2216,25 @@ void bli_sgemmsup_rv_haswell_asm_2x8
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
+
+
+
 
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -2244,7 +2244,7 @@ void bli_sgemmsup_rv_haswell_asm_2x8
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2259,14 +2259,14 @@ void bli_sgemmsup_rv_haswell_asm_2x8
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -2275,7 +2275,7 @@ void bli_sgemmsup_rv_haswell_asm_2x8
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2291,32 +2291,32 @@ void bli_sgemmsup_rv_haswell_asm_2x8
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -2325,58 +2325,58 @@ void bli_sgemmsup_rv_haswell_asm_2x8
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 	vfmadd231ps(ymm0, ymm3, ymm6)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulps(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
@@ -2385,10 +2385,10 @@ void bli_sgemmsup_rv_haswell_asm_2x8
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
 	vmovups(ymm6, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
-	
+
 
 
 	label(.SCOLSTORED)
@@ -2426,27 +2426,27 @@ void bli_sgemmsup_rv_haswell_asm_2x8
 	jmp(.SDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
 	vmovups(ymm6, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2470,12 +2470,12 @@ void bli_sgemmsup_rv_haswell_asm_2x8
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -2518,8 +2518,8 @@ void bli_sgemmsup_rv_haswell_asm_1x8
        float*     restrict b, inc_t rs_b0, inc_t cs_b0,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2540,9 +2540,9 @@ void bli_sgemmsup_rv_haswell_asm_1x8
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -2557,7 +2557,7 @@ void bli_sgemmsup_rv_haswell_asm_1x8
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
 	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -2601,25 +2601,25 @@ void bli_sgemmsup_rv_haswell_asm_1x8
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
-	
-	
-	
+
+
+
 
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.SLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -2627,7 +2627,7 @@ void bli_sgemmsup_rv_haswell_asm_1x8
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -2640,21 +2640,21 @@ void bli_sgemmsup_rv_haswell_asm_1x8
 	vbroadcastss(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
-	
-	
+
+
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 4*8))
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
 	vbroadcastss(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -2668,96 +2668,96 @@ void bli_sgemmsup_rv_haswell_asm_1x8
 	vbroadcastss(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.SCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.SLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovups(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
 	vbroadcastss(mem(rax        ), ymm2)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231ps(ymm0, ymm2, ymm4)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.SPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	//lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
 	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
 	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
+
 
 
 	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
 	jz(.SCOLSTORED)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORED)
-	
-	
+
+
 	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
 	vmovups(ymm4, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
-	
+
 
 
 	label(.SCOLSTORED)
@@ -2808,23 +2808,23 @@ void bli_sgemmsup_rv_haswell_asm_1x8
 	jmp(.SDONE)                        // jump to end.
 
 
-	
-	
+
+
 	label(.SBETAZERO)
-	
+
 
 	cmp(imm(4), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.SCOLSTORBZ)                    // jump to column storage case
 
 
-	
+
 	label(.SROWSTORBZ)
-	
-	
+
+
 	vmovups(ymm4, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.SDONE)                        // jump to end.
 
 
@@ -2857,12 +2857,12 @@ void bli_sgemmsup_rv_haswell_asm_1x8
 
 	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c
 
-	
-	
-	
+
+
+
 	label(.SDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
diff --git a/kernels/knc/3/bli_dgemm_knc_asm_30x8.c b/kernels/knc/3/bli_dgemm_knc_asm_30x8.c
index f20e43f7c..a53b763da 100644
--- a/kernels/knc/3/bli_dgemm_knc_asm_30x8.c
+++ b/kernels/knc/3/bli_dgemm_knc_asm_30x8.c
@@ -264,8 +264,8 @@ void bli_dgemm_knc_asm_30x8
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
     double * a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/knc/3/bli_sgemm_knc_asm_30x16.c b/kernels/knc/3/bli_sgemm_knc_asm_30x16.c
index 18a8e5e2e..7374abfe0 100644
--- a/kernels/knc/3/bli_sgemm_knc_asm_30x16.c
+++ b/kernels/knc/3/bli_sgemm_knc_asm_30x16.c
@@ -264,8 +264,8 @@ void bli_sgemm_knc_asm_30x16
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
     float * a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c
index 91fe1989f..2464ecf0a 100644
--- a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c
+++ b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c
@@ -116,7 +116,7 @@ void bli_dpackm_knl_asm_8xk
        double* restrict kappa_,
        double* restrict a_, inc_t inca_, inc_t lda_,
        double* restrict p_,              inc_t ldp_,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
     const int32_t* offsetPtr = &offsets[0];
@@ -367,7 +367,7 @@ void bli_dpackm_knl_asm_24xk
        double* restrict kappa_,
        double* restrict a_, inc_t inca_, inc_t lda_,
        double* restrict p_,              inc_t ldp_,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
     const int32_t* offsetPtr = &offsets[0];
diff --git a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c
index 8c4bdfe6b..4326a00dd 100644
--- a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c
+++ b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c
@@ -118,7 +118,7 @@ void bli_spackm_knl_asm_16xk
        float*  restrict kappa_,
        float*  restrict a_, inc_t inca_, inc_t lda_,
        float*  restrict p_,              inc_t ldp_,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
     const int32_t* offsetPtr = &offsets[0];
@@ -385,7 +385,7 @@ void bli_spackm_knl_asm_24xk
        float*  restrict kappa_,
        float*  restrict a_, inc_t inca_, inc_t lda_,
        float*  restrict p_,              inc_t ldp_,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
     const int32_t* offsetPtr = &offsets[0];
diff --git a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
index a7f860ae0..11a480997 100644
--- a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
+++ b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
@@ -193,8 +193,8 @@ void bli_dgemm_knl_asm_24x8
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c_, inc_t cs_c_,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
     (void)data;
diff --git a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
index 64feba09f..cbef0cb82 100644
--- a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
+++ b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
@@ -190,8 +190,8 @@ void bli_sgemm_knl_asm_24x16
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c_, inc_t cs_c_,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
     (void)data;
diff --git a/kernels/penryn/1/bli_axpyv_penryn_int.c b/kernels/penryn/1/bli_axpyv_penryn_int.c
index 2dd7c7324..c329912b4 100644
--- a/kernels/penryn/1/bli_axpyv_penryn_int.c
+++ b/kernels/penryn/1/bli_axpyv_penryn_int.c
@@ -50,7 +50,7 @@ void bli_daxpyv_penryn_int
        double* restrict alpha,
        double* restrict x, inc_t incx,
        double* restrict y, inc_t incy,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	double*  restrict alpha_cast = alpha;
diff --git a/kernels/penryn/1/bli_dotv_penryn_int.c b/kernels/penryn/1/bli_dotv_penryn_int.c
index 2e88a577a..6d63a9cf0 100644
--- a/kernels/penryn/1/bli_dotv_penryn_int.c
+++ b/kernels/penryn/1/bli_dotv_penryn_int.c
@@ -51,7 +51,7 @@ void bli_ddotv_penryn_int
        double* restrict x, inc_t incx,
        double* restrict y, inc_t incy,
        double* restrict rho,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	double*  restrict x_cast   = x;
diff --git a/kernels/penryn/1f/bli_axpy2v_penryn_int.c b/kernels/penryn/1f/bli_axpy2v_penryn_int.c
index c809ebb41..350a0af5f 100644
--- a/kernels/penryn/1f/bli_axpy2v_penryn_int.c
+++ b/kernels/penryn/1f/bli_axpy2v_penryn_int.c
@@ -53,7 +53,7 @@ void bli_daxpy2v_penryn_int
        double* restrict x, inc_t incx,
        double* restrict y, inc_t incy,
        double* restrict z, inc_t incz,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	double*  restrict alpha_cast  = alpha;
diff --git a/kernels/penryn/1f/bli_axpyf_penryn_int.c b/kernels/penryn/1f/bli_axpyf_penryn_int.c
index ce4c4f786..f52c05d67 100644
--- a/kernels/penryn/1f/bli_axpyf_penryn_int.c
+++ b/kernels/penryn/1f/bli_axpyf_penryn_int.c
@@ -53,7 +53,7 @@ void bli_daxpyf_penryn_int
        double* restrict a, inc_t inca, inc_t lda,
        double* restrict x, inc_t incx,
        double* restrict y, inc_t incy,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	double*  restrict alpha_cast = alpha;
diff --git a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c
index 6b9dab773..244e3f11c 100644
--- a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c
+++ b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c
@@ -54,7 +54,7 @@ void bli_ddotaxpyv_penryn_int
        double* restrict y, inc_t incy,
        double* restrict rho,
        double* restrict z, inc_t incz,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	double*  restrict alpha_cast = alpha;
diff --git a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c
index fe102d427..3ff80319a 100644
--- a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c
+++ b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c
@@ -58,7 +58,7 @@ void bli_ddotxaxpyf_penryn_int
        double* restrict beta,
        double* restrict y, inc_t incy,
        double* restrict z, inc_t incz,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	double*  restrict alpha_cast = alpha;
diff --git a/kernels/penryn/1f/bli_dotxf_penryn_int.c b/kernels/penryn/1f/bli_dotxf_penryn_int.c
index ac9887d59..e8775bd0c 100644
--- a/kernels/penryn/1f/bli_dotxf_penryn_int.c
+++ b/kernels/penryn/1f/bli_dotxf_penryn_int.c
@@ -54,7 +54,7 @@ void bli_ddotxf_penryn_int
        double* restrict x, inc_t incx,
        double* restrict beta,
        double* restrict y, inc_t incy,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	double*  restrict alpha_cast = alpha;
diff --git a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c
index a3e39c3ac..8a3ec077f 100644
--- a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c
@@ -47,8 +47,8 @@ void bli_sgemm_penryn_asm_8x4
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -522,8 +522,8 @@ void bli_dgemm_penryn_asm_4x4
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	void*   a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c
index 7bef618fa..aa8dcf858 100644
--- a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c
@@ -47,8 +47,8 @@ void bli_sgemmtrsm_l_penryn_asm_8x4
        float*     restrict b01,
        float*     restrict b11,
        float*     restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 }
@@ -65,8 +65,8 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
        double*    restrict b01,
        double*    restrict b11,
        double*    restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	void*   b_next  = bli_auxinfo_next_b( data );
@@ -81,30 +81,30 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 	GEMMTRSM_UKR_SETUP_CT( d, 4, 4, false );
 
 	begin_asm()
-		
+
 		mov(var(a10), rax) // load address of a10.
 		mov(var(b01), rbx) // load address of b01.
 		//mov(var(b_next), r9) // load address of b_next.
-		
+
 		sub(imm(0-8*16), rax) // increment pointers to allow byte
 		sub(imm(0-8*16), rbx) // offsets in the unrolled iterations.
-		
+
 		movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements
 		movaps(mem(rax, -7*16), xmm1) // of a and b.
 		movaps(mem(rbx, -8*16), xmm2)
-		
+
 		//mov(var(c11), rcx) // load address of c11
 		//mov(var(rs_c), rdi) // load cs_c
 		//lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
 		//lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*cs_c;
-		
+
 		//prefetch(2, mem(r9, 0*8)) // prefetch b_next
-		
+
 		xorpd(xmm3, xmm3)
 		xorpd(xmm4, xmm4)
 		xorpd(xmm5, xmm5)
 		xorpd(xmm6, xmm6)
-		
+
 		//prefetch(2, mem(rcx, 3*8)) // prefetch c + 0*cs_c
 		xorpd(xmm8, xmm8)
 		movaps(xmm8, xmm9)
@@ -117,20 +117,20 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		//prefetch(2, mem(rdx, rdi, 1, 3*8)) // prefetch c + 3*cs_c
 		movaps(xmm8, xmm14)
 		movaps(xmm8, xmm15)
-		
-		
-		
+
+
+
 		mov(var(k_iter), rsi) // i = k_iter;
 		test(rsi, rsi) // check i via logical AND.
 		je(.CONSIDERKLEFT) // if i == 0, jump to code that
 		 // contains the k_left loop.
-		
-		
+
+
 		label(.LOOPKITER) // MAIN LOOP
-		
+
 		//prefetch(0, mem(rax, 1264))
 		prefetch(0, mem(rax, (4*35+1)*8))
-		
+
 		addpd(xmm3, xmm11) // iteration 0
 		movaps(mem(rbx, -7*16), xmm3)
 		addpd(xmm4, xmm15)
@@ -138,13 +138,13 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm2, xmm7)
 		mulpd(xmm0, xmm2)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
 		movaps(xmm7, xmm6)
 		mulpd(xmm0, xmm7)
 		mulpd(xmm1, xmm6)
-		
+
 		addpd(xmm2, xmm9)
 		movaps(mem(rbx, -6*16), xmm2)
 		addpd(xmm4, xmm13)
@@ -152,7 +152,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm3, xmm5)
 		mulpd(xmm0, xmm3)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm7, xmm8)
 		addpd(xmm6, xmm12)
 		movaps(xmm5, xmm6)
@@ -160,8 +160,8 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		movaps(mem(rax, -6*16), xmm0)
 		mulpd(xmm1, xmm6)
 		movaps(mem(rax, -5*16), xmm1)
-		
-		
+
+
 		addpd(xmm3, xmm11) // iteration 1
 		movaps(mem(rbx, -5*16), xmm3)
 		addpd(xmm4, xmm15)
@@ -169,13 +169,13 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm2, xmm7)
 		mulpd(xmm0, xmm2)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
 		movaps(xmm7, xmm6)
 		mulpd(xmm0, xmm7)
 		mulpd(xmm1, xmm6)
-		
+
 		addpd(xmm2, xmm9)
 		movaps(mem(rbx, -4*16), xmm2)
 		addpd(xmm4, xmm13)
@@ -183,7 +183,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm3, xmm5)
 		mulpd(xmm0, xmm3)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm7, xmm8)
 		addpd(xmm6, xmm12)
 		movaps(xmm5, xmm6)
@@ -191,10 +191,10 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		movaps(mem(rax, -4*16), xmm0)
 		mulpd(xmm1, xmm6)
 		movaps(mem(rax, -3*16), xmm1)
-		
+
 		//prefetch(0, mem(rax, 1328))
 		prefetch(0, mem(rax, (4*37+1)*8))
-		
+
 		addpd(xmm3, xmm11) // iteration 2
 		movaps(mem(rbx, -3*16), xmm3)
 		addpd(xmm4, xmm15)
@@ -202,13 +202,13 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm2, xmm7)
 		mulpd(xmm0, xmm2)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
 		movaps(xmm7, xmm6)
 		mulpd(xmm0, xmm7)
 		mulpd(xmm1, xmm6)
-		
+
 		addpd(xmm2, xmm9)
 		movaps(mem(rbx, -2*16), xmm2)
 		addpd(xmm4, xmm13)
@@ -216,7 +216,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm3, xmm5)
 		mulpd(xmm0, xmm3)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm7, xmm8)
 		addpd(xmm6, xmm12)
 		movaps(xmm5, xmm6)
@@ -224,8 +224,8 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		movaps(mem(rax, -2*16), xmm0)
 		mulpd(xmm1, xmm6)
 		movaps(mem(rax, -1*16), xmm1)
-		
-		
+
+
 		addpd(xmm3, xmm11) // iteration 3
 		movaps(mem(rbx, -1*16), xmm3)
 		addpd(xmm4, xmm15)
@@ -233,17 +233,17 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm2, xmm7)
 		mulpd(xmm0, xmm2)
 		mulpd(xmm1, xmm4)
-		
+
 		sub(imm(0-4*4*8), rax) // a += 4*4 (unroll x mr)
-		
+
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
 		movaps(xmm7, xmm6)
 		mulpd(xmm0, xmm7)
 		mulpd(xmm1, xmm6)
-		
+
 		//sub(imm(-4*4*8), r9) // b_next += 4*4 (unroll x nr)
-		
+
 		addpd(xmm2, xmm9)
 		movaps(mem(rbx, 0*16), xmm2)
 		addpd(xmm4, xmm13)
@@ -251,9 +251,9 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm3, xmm5)
 		mulpd(xmm0, xmm3)
 		mulpd(xmm1, xmm4)
-		
+
 		sub(imm(0-4*4*8), rbx) // b += 4*4 (unroll x nr)
-		
+
 		addpd(xmm7, xmm8)
 		addpd(xmm6, xmm12)
 		movaps(xmm5, xmm6)
@@ -261,26 +261,26 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		movaps(mem(rax, -8*16), xmm0)
 		mulpd(xmm1, xmm6)
 		movaps(mem(rax, -7*16), xmm1)
-		
+
 		//prefetch(2, mem(r9, 0*8)) // prefetch b_next[0]
 		//prefetch(2, mem(r9, 8*8)) // prefetch b_next[8]
-		
-		
+
+
 		dec(rsi) // i -= 1;
 		jne(.LOOPKITER) // iterate again if i != 0.
-		
-		
-		
+
+
+
 		label(.CONSIDERKLEFT)
-		
+
 		mov(var(k_left), rsi) // i = k_left;
 		test(rsi, rsi) // check i via logical AND.
 		je(.POSTACCUM) // if i == 0, we're done; jump to end.
 		 // else, we prepare to enter k_left loop.
-		
-		
+
+
 		label(.LOOPKLEFT) // EDGE LOOP
-		
+
 		addpd(xmm3, xmm11) // iteration 0
 		movaps(mem(rbx, -7*16), xmm3)
 		addpd(xmm4, xmm15)
@@ -288,13 +288,13 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm2, xmm7)
 		mulpd(xmm0, xmm2)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
 		movaps(xmm7, xmm6)
 		mulpd(xmm0, xmm7)
 		mulpd(xmm1, xmm6)
-		
+
 		addpd(xmm2, xmm9)
 		movaps(mem(rbx, -6*16), xmm2)
 		addpd(xmm4, xmm13)
@@ -302,7 +302,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm3, xmm5)
 		mulpd(xmm0, xmm3)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm7, xmm8)
 		addpd(xmm6, xmm12)
 		movaps(xmm5, xmm6)
@@ -310,28 +310,28 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		movaps(mem(rax, -6*16), xmm0)
 		mulpd(xmm1, xmm6)
 		movaps(mem(rax, -5*16), xmm1)
-		
-		
+
+
 		sub(imm(0-4*1*8), rax) // a += 4 (1 x mr)
 		sub(imm(0-4*1*8), rbx) // b += 4 (1 x nr)
-		
-		
+
+
 		dec(rsi) // i -= 1;
 		jne(.LOOPKLEFT) // iterate again if i != 0.
-		
-		
-		
+
+
+
 		label(.POSTACCUM)
-		
+
 		addpd(xmm3, xmm11)
 		addpd(xmm4, xmm15)
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
-		
-		
-		
+
+
+
 		mov(var(b11), rbx) // load address of b11.
-		
+
 		 // xmm8:   xmm9:   xmm10:  xmm11:
 		 // ( ab01  ( ab00  ( ab03  ( ab02
 		 //   ab10 )  ab11 )  ab12 )  ab13 )
@@ -343,31 +343,31 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		movaps(xmm8, xmm1)
 		unpcklpd(xmm8, xmm0)
 		unpckhpd(xmm9, xmm1)
-		
+
 		movaps(xmm11, xmm4)
 		movaps(xmm10, xmm5)
 		unpcklpd(xmm10, xmm4)
 		unpckhpd(xmm11, xmm5)
-		
+
 		movaps(xmm13, xmm2)
 		movaps(xmm12, xmm3)
 		unpcklpd(xmm12, xmm2)
 		unpckhpd(xmm13, xmm3)
-		
+
 		movaps(xmm15, xmm6)
 		movaps(xmm14, xmm7)
 		unpcklpd(xmm14, xmm6)
 		unpckhpd(xmm15, xmm7)
-		
+
 		 // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 )
 		 // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 )
 		 // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
 		 // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
-		
+
 		mov(var(alpha), rax) // load address of alpha
 		movddup(mem(rax), xmm15) // load alpha and duplicate
-		
-		movaps(mem(rbx, 0*16), xmm8) 
+
+		movaps(mem(rbx, 0*16), xmm8)
 		movaps(mem(rbx, 1*16), xmm12)
 		mulpd(xmm15, xmm8) // xmm8  = alpha * ( beta00 beta01 )
 		mulpd(xmm15, xmm12) // xmm12 = alpha * ( beta02 beta03 )
@@ -382,13 +382,13 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		movaps(mem(rbx, 6*16), xmm11)
 		mulpd(xmm15, xmm11) // xmm11 = alpha * ( beta30 beta31 )
 		mulpd(mem(rbx, 7*16), xmm15) // xmm15 = alpha * ( beta32 beta33 )
-		
+
 		 // (Now scaled by alpha:)
 		 // xmm8:  ( beta00 beta01 ) xmm12: ( beta02 beta03 )
 		 // xmm9:  ( beta10 beta11 ) xmm13: ( beta12 beta13 )
 		 // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 )
 		 // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 )
-		
+
 		subpd(xmm0, xmm8) // xmm8  -= xmm0
 		subpd(xmm1, xmm9) // xmm9  -= xmm1
 		subpd(xmm2, xmm10) // xmm10 -= xmm2
@@ -397,28 +397,28 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		subpd(xmm5, xmm13) // xmm13 -= xmm5
 		subpd(xmm6, xmm14) // xmm14 -= xmm6
 		subpd(xmm7, xmm15) // xmm15 -= xmm7
-		
-		
-		
+
+
+
 		label(.TRSM)
-		
-		
+
+
 		mov(var(a11), rax) // load address of a11
 		mov(var(c11), rcx) // load address of c11
-		
+
 		mov(var(rs_c), rsi) // load rs_c
 		mov(var(cs_c), rdi) // load cs_c
 		sal(imm(3), rsi) // rs_c *= sizeof( double )
 		sal(imm(3), rdi) // cs_c *= sizeof( double )
-		
+
 		lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c
-		
-		
-		
+
+
+
 		 // iteration 0
-		
+
 		movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00)
-		
+
 #ifdef BLIS_ENABLE_TRSM_PREINVERSION
 		mulpd(xmm0, xmm8)  // xmm8  *= (1/alpha00);
 		mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00);
@@ -426,7 +426,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		divpd(xmm0, xmm8)  // xmm8  /= alpha00;
 		divpd(xmm0, xmm12) // xmm12 /= alpha00;
 #endif
-		
+
 		movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8
 		movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12
 		movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0]
@@ -435,14 +435,14 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1]
 		add(rsi, rcx) // c11   += rs_c
 		add(rsi, rdx) // c11_2 += rs_c
-		
-		
-		
+
+
+
 		 // iteration 1
-		
+
 		movddup(mem(1+0*4)*8(rax), xmm0) // load xmm0 = alpha10
 		movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11)
-		
+
 		movaps(xmm0, xmm4) // xmm4 = xmm0
 		mulpd(xmm8, xmm0) // xmm0 = alpha10 * ( beta00 beta01 )
 		mulpd(xmm12, xmm4) // xmm4 = alpha10 * ( beta02 beta03 )
@@ -455,7 +455,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		divpd(xmm1, xmm9)  // xmm9  /= alpha11;
 		divpd(xmm1, xmm13) // xmm13 /= alpha11;
 #endif
-		
+
 		movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9
 		movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13
 		movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0]
@@ -464,15 +464,15 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1]
 		add(rsi, rcx) // c11   += rs_c
 		add(rsi, rdx) // c11_2 += rs_c
-		
-		
-		
+
+
+
 		 // iteration 2
-		
+
 		movddup(mem(2+0*4)*8(rax), xmm0) // load xmm0 = alpha20
 		movddup(mem(2+1*4)*8(rax), xmm1) // load xmm1 = alpha21
 		movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22)
-		
+
 		movaps(xmm0, xmm4) // xmm4 = xmm0
 		movaps(xmm1, xmm5) // xmm5 = xmm1
 		mulpd(xmm8, xmm0) // xmm0 = alpha20 * ( beta00 beta01 )
@@ -490,7 +490,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		divpd(xmm2, xmm10) // xmm10 /= alpha22;
 		divpd(xmm2, xmm14) // xmm14 /= alpha22;
 #endif
-		
+
 		movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10
 		movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14
 		movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0]
@@ -499,16 +499,16 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1]
 		add(rsi, rcx) // c11   += rs_c
 		add(rsi, rdx) // c11_2 += rs_c
-		
-		
-		
+
+
+
 		 // iteration 3
-		
+
 		movddup(mem(3+0*4)*8(rax), xmm0) // load xmm0 = alpha30
 		movddup(mem(3+1*4)*8(rax), xmm1) // load xmm1 = alpha31
 		movddup(mem(3+2*4)*8(rax), xmm2) // load xmm2 = alpha32
 		movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33)
-		
+
 		movaps(xmm0, xmm4) // xmm4 = xmm0
 		movaps(xmm1, xmm5) // xmm5 = xmm1
 		movaps(xmm2, xmm6) // xmm6 = xmm2
@@ -531,16 +531,16 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
 		divpd(xmm3, xmm11) // xmm11 /= alpha33;
 		divpd(xmm3, xmm15) // xmm15 /= alpha33;
 #endif
-		
+
 		movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11
 		movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15
 		movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0]
 		movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1]
 		movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0]
 		movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1]
-		
-		
-		
+
+
+
 
     end_asm(
 		: // output operands (none)
diff --git a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c
index add12ea24..2efc037cc 100644
--- a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c
@@ -47,8 +47,8 @@ void bli_sgemmtrsm_u_penryn_asm_8x4
        float*     restrict b21,
        float*     restrict b11,
        float*     restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 }
@@ -65,8 +65,8 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
        double*    restrict b21,
        double*    restrict b11,
        double*    restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	void*   b_next  = bli_auxinfo_next_b( data );
@@ -81,23 +81,23 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 	GEMMTRSM_UKR_SETUP_CT( d, 4, 4, false );
 
 	begin_asm()
-		
+
 		mov(var(a12), rax) // load address of a12.
 		mov(var(b21), rbx) // load address of b21.
 		//mov(var(b_next), r9) // load address of b_next.
-		
+
 		add(imm(8*16), rax) // increment pointers to allow byte
 		add(imm(8*16), rbx) // offsets in the unrolled iterations.
-		
+
 		movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements
 		movaps(mem(rax, -7*16), xmm1) // of a and b.
 		movaps(mem(rbx, -8*16), xmm2)
-		
+
 		xorpd(xmm3, xmm3)
 		xorpd(xmm4, xmm4)
 		xorpd(xmm5, xmm5)
 		xorpd(xmm6, xmm6)
-		
+
 		xorpd(xmm8, xmm8)
 		movaps(xmm8, xmm9)
 		movaps(xmm8, xmm10)
@@ -106,19 +106,19 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		movaps(xmm8, xmm13)
 		movaps(xmm8, xmm14)
 		movaps(xmm8, xmm15)
-		
-		
-		
+
+
+
 		mov(var(k_iter), rsi) // i = k_iter;
 		test(rsi, rsi) // check i via logical AND.
 		je(.CONSIDERKLEFT) // if i == 0, jump to code that
 		 // contains the k_left loop.
-		
-		
+
+
 		label(.LOOPKITER) // MAIN LOOP
-		
+
 		prefetch(0, mem(rax, 1264))
-		
+
 		addpd(xmm3, xmm11) // iteration 0
 		movaps(mem(rbx, -7*16), xmm3)
 		addpd(xmm4, xmm15)
@@ -126,13 +126,13 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm2, xmm7)
 		mulpd(xmm0, xmm2)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
 		movaps(xmm7, xmm6)
 		mulpd(xmm0, xmm7)
 		mulpd(xmm1, xmm6)
-		
+
 		addpd(xmm2, xmm9)
 		movaps(mem(rbx, -6*16), xmm2)
 		addpd(xmm4, xmm13)
@@ -140,7 +140,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm3, xmm5)
 		mulpd(xmm0, xmm3)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm7, xmm8)
 		addpd(xmm6, xmm12)
 		movaps(xmm5, xmm6)
@@ -148,8 +148,8 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		movaps(mem(rax, -6*16), xmm0)
 		mulpd(xmm1, xmm6)
 		movaps(mem(rax, -5*16), xmm1)
-		
-		
+
+
 		addpd(xmm3, xmm11) // iteration 1
 		movaps(mem(rbx, -5*16), xmm3)
 		addpd(xmm4, xmm15)
@@ -157,13 +157,13 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm2, xmm7)
 		mulpd(xmm0, xmm2)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
 		movaps(xmm7, xmm6)
 		mulpd(xmm0, xmm7)
 		mulpd(xmm1, xmm6)
-		
+
 		addpd(xmm2, xmm9)
 		movaps(mem(rbx, -4*16), xmm2)
 		addpd(xmm4, xmm13)
@@ -171,7 +171,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm3, xmm5)
 		mulpd(xmm0, xmm3)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm7, xmm8)
 		addpd(xmm6, xmm12)
 		movaps(xmm5, xmm6)
@@ -179,9 +179,9 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		movaps(mem(rax, -4*16), xmm0)
 		mulpd(xmm1, xmm6)
 		movaps(mem(rax, -3*16), xmm1)
-		
+
 		prefetch(0, mem(rax, 1328))
-		
+
 		addpd(xmm3, xmm11) // iteration 2
 		movaps(mem(rbx, -3*16), xmm3)
 		addpd(xmm4, xmm15)
@@ -189,13 +189,13 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm2, xmm7)
 		mulpd(xmm0, xmm2)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
 		movaps(xmm7, xmm6)
 		mulpd(xmm0, xmm7)
 		mulpd(xmm1, xmm6)
-		
+
 		addpd(xmm2, xmm9)
 		movaps(mem(rbx, -2*16), xmm2)
 		addpd(xmm4, xmm13)
@@ -203,7 +203,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm3, xmm5)
 		mulpd(xmm0, xmm3)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm7, xmm8)
 		addpd(xmm6, xmm12)
 		movaps(xmm5, xmm6)
@@ -211,8 +211,8 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		movaps(mem(rax, -2*16), xmm0)
 		mulpd(xmm1, xmm6)
 		movaps(mem(rax, -1*16), xmm1)
-		
-		
+
+
 		addpd(xmm3, xmm11) // iteration 3
 		movaps(mem(rbx, -1*16), xmm3)
 		addpd(xmm4, xmm15)
@@ -220,15 +220,15 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm2, xmm7)
 		mulpd(xmm0, xmm2)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
 		movaps(xmm7, xmm6)
 		mulpd(xmm0, xmm7)
 		mulpd(xmm1, xmm6)
-		
+
 		add(imm(4*4*8), rax) // a += 4*4 (unroll x mr)
-		
+
 		addpd(xmm2, xmm9)
 		movaps(mem(rbx, 0*16), xmm2)
 		addpd(xmm4, xmm13)
@@ -236,9 +236,9 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm3, xmm5)
 		mulpd(xmm0, xmm3)
 		mulpd(xmm1, xmm4)
-		
+
 		add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr)
-		
+
 		addpd(xmm7, xmm8)
 		addpd(xmm6, xmm12)
 		movaps(xmm5, xmm6)
@@ -246,24 +246,24 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		movaps(mem(rax, -8*16), xmm0)
 		mulpd(xmm1, xmm6)
 		movaps(mem(rax, -7*16), xmm1)
-		
-		
-		
+
+
+
 		dec(rsi) // i -= 1;
 		jne(.LOOPKITER) // iterate again if i != 0.
-		
-		
-		
+
+
+
 		label(.CONSIDERKLEFT)
-		
+
 		mov(var(k_left), rsi) // i = k_left;
 		test(rsi, rsi) // check i via logical AND.
 		je(.POSTACCUM) // if i == 0, we're done; jump to end.
 		 // else, we prepare to enter k_left loop.
-		
-		
+
+
 		label(.LOOPKLEFT) // EDGE LOOP
-		
+
 		addpd(xmm3, xmm11) // iteration 0
 		movaps(mem(rbx, -7*16), xmm3)
 		addpd(xmm4, xmm15)
@@ -271,13 +271,13 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm2, xmm7)
 		mulpd(xmm0, xmm2)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
 		movaps(xmm7, xmm6)
 		mulpd(xmm0, xmm7)
 		mulpd(xmm1, xmm6)
-		
+
 		addpd(xmm2, xmm9)
 		movaps(mem(rbx, -6*16), xmm2)
 		addpd(xmm4, xmm13)
@@ -285,7 +285,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		pshufd(imm(0x4e), xmm3, xmm5)
 		mulpd(xmm0, xmm3)
 		mulpd(xmm1, xmm4)
-		
+
 		addpd(xmm7, xmm8)
 		addpd(xmm6, xmm12)
 		movaps(xmm5, xmm6)
@@ -293,28 +293,28 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		movaps(mem(rax, -6*16), xmm0)
 		mulpd(xmm1, xmm6)
 		movaps(mem(rax, -5*16), xmm1)
-		
-		
+
+
 		add(imm(4*1*8), rax) // a += 4 (1 x mr)
 		add(imm(4*1*8), rbx) // b += 4 (1 x nr)
-		
-		
+
+
 		dec(rsi) // i -= 1;
 		jne(.LOOPKLEFT) // iterate again if i != 0.
-		
-		
-		
+
+
+
 		label(.POSTACCUM)
-		
+
 		addpd(xmm3, xmm11)
 		addpd(xmm4, xmm15)
 		addpd(xmm5, xmm10)
 		addpd(xmm6, xmm14)
-		
-		
-		
+
+
+
 		mov(var(b11), rbx) // load address of b11.
-		
+
 		 // xmm8:   xmm9:   xmm10:  xmm11:
 		 // ( ab01  ( ab00  ( ab03  ( ab02
 		 //   ab10 )  ab11 )  ab12 )  ab13 )
@@ -326,30 +326,30 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		movaps(xmm8, xmm1)
 		unpcklpd(xmm8, xmm0)
 		unpckhpd(xmm9, xmm1)
-		
+
 		movaps(xmm11, xmm4)
 		movaps(xmm10, xmm5)
 		unpcklpd(xmm10, xmm4)
 		unpckhpd(xmm11, xmm5)
-		
+
 		movaps(xmm13, xmm2)
 		movaps(xmm12, xmm3)
 		unpcklpd(xmm12, xmm2)
 		unpckhpd(xmm13, xmm3)
-		
+
 		movaps(xmm15, xmm6)
 		movaps(xmm14, xmm7)
 		unpcklpd(xmm14, xmm6)
 		unpckhpd(xmm15, xmm7)
-		
+
 		 // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 )
 		 // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 )
 		 // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
 		 // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
-		
+
 		mov(var(alpha), rax) // load address of alpha
 		movddup(mem(rax), xmm15) // load alpha and duplicate
-		
+
 		movaps(mem(rbx, 0*16), xmm8)
 		movaps(mem(rbx, 1*16), xmm12)
 		mulpd(xmm15, xmm8) // xmm8  = alpha * ( beta00 beta01 )
@@ -365,13 +365,13 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		movaps(mem(rbx, 6*16), xmm11)
 		mulpd(xmm15, xmm11) // xmm11 = alpha * ( beta30 beta31 )
 		mulpd(mem(rbx, 7*16), xmm15) // xmm15 = alpha * ( beta32 beta33 )
-		
+
 		 // (Now scaled by alpha:)
 		 // xmm8:  ( beta00 beta01 ) xmm12: ( beta02 beta03 )
 		 // xmm9:  ( beta10 beta11 ) xmm13: ( beta12 beta13 )
 		 // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 )
 		 // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 )
-		
+
 		subpd(xmm0, xmm8) // xmm8  -= xmm0
 		subpd(xmm1, xmm9) // xmm9  -= xmm1
 		subpd(xmm2, xmm10) // xmm10 -= xmm2
@@ -380,31 +380,31 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		subpd(xmm5, xmm13) // xmm13 -= xmm5
 		subpd(xmm6, xmm14) // xmm14 -= xmm6
 		subpd(xmm7, xmm15) // xmm15 -= xmm7
-		
-		
-		
+
+
+
 		label(.TRSM)
-		
-		
+
+
 		mov(var(a11), rax) // load address of a11
 		mov(var(c11), rcx) // load address of c11
-		
+
 		mov(var(rs_c), rsi) // load rs_c
 		mov(var(cs_c), rdi) // load cs_c
 		sal(imm(3), rsi) // rs_c *= sizeof( double )
 		sal(imm(3), rdi) // cs_c *= sizeof( double )
-		
+
 		add(rsi, rcx) // c11 += (4-1)*rs_c
 		add(rsi, rcx)
 		add(rsi, rcx)
 		lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c;
-		
-		
-		
+
+
+
 		 // iteration 0
-		
+
 		movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33)
-		
+
 #ifdef BLIS_ENABLE_TRSM_PREINVERSION
 		mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33);
 		mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33);
@@ -412,7 +412,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		divpd(xmm3, xmm11) // xmm11 /= alpha33;
 		divpd(xmm3, xmm15) // xmm15 /= alpha33;
 #endif
-		
+
 		movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11
 		movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15
 		movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0]
@@ -421,14 +421,14 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1]
 		sub(rsi, rcx) // c11   -= rs_c
 		sub(rsi, rdx) // c11_2 -= rs_c
-		
-		
-		
+
+
+
 		 // iteration 1
-		
+
 		movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22)
 		movddup(mem(2+3*4)*8(rax), xmm3) // load xmm3 = alpha23
-		
+
 		movaps(xmm3, xmm7) // xmm7 = xmm3
 		mulpd(xmm11, xmm3) // xmm3 = alpha23 * ( beta30 beta31 )
 		mulpd(xmm15, xmm7) // xmm7 = alpha23 * ( beta32 beta33 )
@@ -441,7 +441,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		divpd(xmm2, xmm10) // xmm10 /= alpha22;
 		divpd(xmm2, xmm14) // xmm14 /= alpha22;
 #endif
-		
+
 		movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10
 		movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14
 		movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0]
@@ -450,15 +450,15 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1]
 		sub(rsi, rcx) // c11   -= rs_c
 		sub(rsi, rdx) // c11_2 -= rs_c
-		
-		
-		
+
+
+
 		 // iteration 2
-		
+
 		movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11)
 		movddup(mem(1+2*4)*8(rax), xmm2) // load xmm2 = alpha12
 		movddup(mem(1+3*4)*8(rax), xmm3) // load xmm3 = alpha13
-		
+
 		movaps(xmm2, xmm6) // xmm6 = xmm2
 		movaps(xmm3, xmm7) // xmm7 = xmm3
 		mulpd(xmm10, xmm2) // xmm2 = alpha12 * ( beta20 beta21 )
@@ -476,7 +476,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		divpd(xmm1, xmm9)  // xmm9  /= alpha11;
 		divpd(xmm1, xmm13) // xmm13 /= alpha11;
 #endif
-		
+
 		movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9
 		movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13
 		movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0]
@@ -485,16 +485,16 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1]
 		sub(rsi, rcx) // c11   -= rs_c
 		sub(rsi, rdx) // c11_2 -= rs_c
-		
-		
-		
+
+
+
 		 // iteration 3
-		
+
 		movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00)
 		movddup(mem(0+1*4)*8(rax), xmm1) // load xmm1 = alpha01
 		movddup(mem(0+2*4)*8(rax), xmm2) // load xmm2 = alpha02
 		movddup(mem(0+3*4)*8(rax), xmm3) // load xmm3 = alpha03
-		
+
 		movaps(xmm1, xmm5) // xmm5 = xmm1
 		movaps(xmm2, xmm6) // xmm6 = xmm2
 		movaps(xmm3, xmm7) // xmm7 = xmm3
@@ -517,16 +517,16 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
 		divpd(xmm0, xmm8)  // xmm8  /= alpha00;
 		divpd(xmm0, xmm12) // xmm12 /= alpha00;
 #endif
-		
+
 		movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8
 		movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12
 		movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0]
 		movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1]
 		movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0]
 		movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1]
-		
-		
-		
+
+
+
     end_asm(
 		: // output operands (none)
 		: // input operands
diff --git a/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c
index 21c0b2f10..69341320e 100644
--- a/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c
@@ -43,8 +43,8 @@ void bli_strsm_l_penryn_asm_8x4
        float*     restrict a11,
        float*     restrict b11,
        float*     restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 }
@@ -55,8 +55,8 @@ void bli_dtrsm_l_penryn_asm_4x4
        double*    restrict a11,
        double*    restrict b11,
        double*    restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
@@ -65,9 +65,9 @@ void bli_dtrsm_l_penryn_asm_4x4
 	uint64_t cs_c   = cs_c0;
 
 	begin_asm()
-		
+
 		mov(var(b11), rbx) // load address of b11.
-		
+
 		movaps(mem(rbx, 0*16), xmm8) // xmm8  = ( beta00 beta01 )
 		movaps(mem(rbx, 1*16), xmm12) // xmm9  = ( beta02 beta03 )
 		movaps(mem(rbx, 2*16), xmm9) // xmm10 = ( beta10 beta11 )
@@ -76,28 +76,28 @@ void bli_dtrsm_l_penryn_asm_4x4
 		movaps(mem(rbx, 5*16), xmm14) // xmm13 = ( beta22 beta23 )
 		movaps(mem(rbx, 6*16), xmm11) // xmm14 = ( beta30 beta31 )
 		movaps(mem(rbx, 7*16), xmm15) // xmm15 = ( beta32 beta33 )
-		
-		
-		
+
+
+
 		mov(var(a11), rax) // load address of a11
 		mov(var(c11), rcx) // load address of c11
-		
+
 		mov(var(rs_c), rsi) // load rs_c
 		mov(var(cs_c), rdi) // load cs_c
 		sal(imm(3), rsi) // rs_c *= sizeof( double )
 		sal(imm(3), rdi) // cs_c *= sizeof( double )
-		
+
 		lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c
-		
-		
-		
+
+
+
 		 // iteration 0
-		
+
 		movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00)
-		
+
 		mulpd(xmm0, xmm8) // xmm8  *= (1/alpha00);
 		mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00);
-		
+
 		movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8
 		movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12
 		movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0]
@@ -106,14 +106,14 @@ void bli_dtrsm_l_penryn_asm_4x4
 		movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1]
 		add(rsi, rcx) // c11   += rs_c
 		add(rsi, rdx) // c11_2 += rs_c
-		
-		
-		
+
+
+
 		 // iteration 1
-		
+
 		movddup(mem(1+0*4)*8(rax), xmm0) // load xmm0 = alpha10
 		movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11)
-		
+
 		movaps(xmm0, xmm4) // xmm4 = xmm0
 		mulpd(xmm8, xmm0) // xmm0 = alpha10 * ( beta00 beta01 )
 		mulpd(xmm12, xmm4) // xmm4 = alpha10 * ( beta02 beta03 )
@@ -121,7 +121,7 @@ void bli_dtrsm_l_penryn_asm_4x4
 		subpd(xmm4, xmm13) // xmm13 -= xmm4
 		mulpd(xmm1, xmm9) // xmm9  *= (1/alpha11);
 		mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11);
-		
+
 		movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9
 		movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13
 		movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0]
@@ -130,15 +130,15 @@ void bli_dtrsm_l_penryn_asm_4x4
 		movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1]
 		add(rsi, rcx) // c11   += rs_c
 		add(rsi, rdx) // c11_2 += rs_c
-		
-		
-		
+
+
+
 		 // iteration 2
-		
+
 		movddup(mem(2+0*4)*8(rax), xmm0) // load xmm0 = alpha20
 		movddup(mem(2+1*4)*8(rax), xmm1) // load xmm1 = alpha21
 		movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22)
-		
+
 		movaps(xmm0, xmm4) // xmm4 = xmm0
 		movaps(xmm1, xmm5) // xmm5 = xmm1
 		mulpd(xmm8, xmm0) // xmm0 = alpha20 * ( beta00 beta01 )
@@ -151,7 +151,7 @@ void bli_dtrsm_l_penryn_asm_4x4
 		subpd(xmm4, xmm14) // xmm14 -= xmm4
 		mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22);
 		mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22);
-		
+
 		movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10
 		movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14
 		movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0]
@@ -160,16 +160,16 @@ void bli_dtrsm_l_penryn_asm_4x4
 		movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1]
 		add(rsi, rcx) // c11   += rs_c
 		add(rsi, rdx) // c11_2 += rs_c
-		
-		
-		
+
+
+
 		 // iteration 3
-		
+
 		movddup(mem(3+0*4)*8(rax), xmm0) // load xmm0 = alpha30
 		movddup(mem(3+1*4)*8(rax), xmm1) // load xmm1 = alpha31
 		movddup(mem(3+2*4)*8(rax), xmm2) // load xmm2 = alpha32
 		movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33)
-		
+
 		movaps(xmm0, xmm4) // xmm4 = xmm0
 		movaps(xmm1, xmm5) // xmm5 = xmm1
 		movaps(xmm2, xmm6) // xmm6 = xmm2
@@ -187,16 +187,16 @@ void bli_dtrsm_l_penryn_asm_4x4
 		subpd(xmm4, xmm15) // xmm15 -= xmm4
 		mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33);
 		mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33);
-		
+
 		movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11
 		movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15
 		movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0]
 		movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1]
 		movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0]
 		movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1]
-		
-		
-		
+
+
+
 
     end_asm(
 		: // output operands (none)
diff --git a/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c
index 23855a460..0befb4e4e 100644
--- a/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c
@@ -43,8 +43,8 @@ void bli_strsm_u_penryn_asm_8x4
        float*     restrict a11,
        float*     restrict b11,
        float*     restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 }
@@ -55,8 +55,8 @@ void bli_dtrsm_u_penryn_asm_4x4
        double*    restrict a11,
        double*    restrict b11,
        double*    restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
@@ -65,9 +65,9 @@ void bli_dtrsm_u_penryn_asm_4x4
 	uint64_t cs_c   = cs_c0;
 
 	begin_asm()
-		
+
 		mov(var(b11), rbx) // load address of b11.
-		
+
 		movaps(mem(rbx, 0*16), xmm8) // xmm8  = ( beta00 beta01 )
 		movaps(mem(rbx, 1*16), xmm12) // xmm9  = ( beta02 beta03 )
 		movaps(mem(rbx, 2*16), xmm9) // xmm10 = ( beta10 beta11 )
@@ -76,31 +76,31 @@ void bli_dtrsm_u_penryn_asm_4x4
 		movaps(mem(rbx, 5*16), xmm14) // xmm13 = ( beta22 beta23 )
 		movaps(mem(rbx, 6*16), xmm11) // xmm14 = ( beta30 beta31 )
 		movaps(mem(rbx, 7*16), xmm15) // xmm15 = ( beta32 beta33 )
-		
-		
-		
+
+
+
 		mov(var(a11), rax) // load address of a11
 		mov(var(c11), rcx) // load address of c11
-		
+
 		mov(var(rs_c), rsi) // load rs_c
 		mov(var(cs_c), rdi) // load cs_c
 		sal(imm(3), rsi) // rs_c *= sizeof( double )
 		sal(imm(3), rdi) // cs_c *= sizeof( double )
-		
+
 		add(rsi, rcx) // c11 += (4-1)*rs_c
 		add(rsi, rcx)
 		add(rsi, rcx)
 		lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c;
-		
-		
-		
+
+
+
 		 // iteration 0
-		
+
 		movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33)
-		
+
 		mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33);
 		mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33);
-		
+
 		movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11
 		movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15
 		movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0]
@@ -109,14 +109,14 @@ void bli_dtrsm_u_penryn_asm_4x4
 		movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1]
 		sub(rsi, rcx) // c11   -= rs_c
 		sub(rsi, rdx) // c11_2 -= rs_c
-		
-		
-		
+
+
+
 		 // iteration 1
-		
+
 		movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22)
 		movddup(mem(2+3*4)*8(rax), xmm3) // load xmm3 = alpha23
-		
+
 		movaps(xmm3, xmm7) // xmm7 = xmm3
 		mulpd(xmm11, xmm3) // xmm3 = alpha23 * ( beta30 beta31 )
 		mulpd(xmm15, xmm7) // xmm7 = alpha23 * ( beta32 beta33 )
@@ -124,7 +124,7 @@ void bli_dtrsm_u_penryn_asm_4x4
 		subpd(xmm7, xmm14) // xmm14 -= xmm7
 		mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22);
 		mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22);
-		
+
 		movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10
 		movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14
 		movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0]
@@ -133,15 +133,15 @@ void bli_dtrsm_u_penryn_asm_4x4
 		movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1]
 		sub(rsi, rcx) // c11   -= rs_c
 		sub(rsi, rdx) // c11_2 -= rs_c
-		
-		
-		
+
+
+
 		 // iteration 2
-		
+
 		movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11)
 		movddup(mem(1+2*4)*8(rax), xmm2) // load xmm2 = alpha12
 		movddup(mem(1+3*4)*8(rax), xmm3) // load xmm3 = alpha13
-		
+
 		movaps(xmm2, xmm6) // xmm6 = xmm2
 		movaps(xmm3, xmm7) // xmm7 = xmm3
 		mulpd(xmm10, xmm2) // xmm2 = alpha12 * ( beta20 beta21 )
@@ -154,7 +154,7 @@ void bli_dtrsm_u_penryn_asm_4x4
 		subpd(xmm6, xmm13) // xmm13 -= xmm6
 		mulpd(xmm1, xmm9) // xmm9  *= (1/alpha11);
 		mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11);
-		
+
 		movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9
 		movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13
 		movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0]
@@ -163,16 +163,16 @@ void bli_dtrsm_u_penryn_asm_4x4
 		movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1]
 		sub(rsi, rcx) // c11   -= rs_c
 		sub(rsi, rdx) // c11_2 -= rs_c
-		
-		
-		
+
+
+
 		 // iteration 3
-		
+
 		movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00)
 		movddup(mem(0+1*4)*8(rax), xmm1) // load xmm1 = alpha01
 		movddup(mem(0+2*4)*8(rax), xmm2) // load xmm2 = alpha02
 		movddup(mem(0+3*4)*8(rax), xmm3) // load xmm3 = alpha03
-		
+
 		movaps(xmm1, xmm5) // xmm5 = xmm1
 		movaps(xmm2, xmm6) // xmm6 = xmm2
 		movaps(xmm3, xmm7) // xmm7 = xmm3
@@ -190,16 +190,16 @@ void bli_dtrsm_u_penryn_asm_4x4
 		subpd(xmm5, xmm12) // xmm12 -= xmm5
 		mulpd(xmm0, xmm8) // xmm8  *= (1/alpha00);
 		mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00);
-		
+
 		movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8
 		movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12
 		movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0]
 		movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1]
 		movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0]
 		movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1]
-		
-		
-		
+
+
+
 
     end_asm(
 		: // output operands (none)
diff --git a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
index e65ce7178..95ce7edeb 100644
--- a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
+++ b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
@@ -50,8 +50,8 @@ void bli_sgemm_piledriver_asm_16x3
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	void*   a_next = bli_auxinfo_next_a( data );
@@ -531,8 +531,8 @@ void bli_dgemm_piledriver_asm_8x3
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	void*   a_next = bli_auxinfo_next_a( data );
@@ -987,8 +987,8 @@ void bli_cgemm_piledriver_asm_4x2
        scomplex*  restrict b,
        scomplex*  restrict beta,
        scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	void*   a_next = bli_auxinfo_next_a( data );
@@ -1397,8 +1397,8 @@ void bli_zgemm_piledriver_asm_2x2
        dcomplex*  restrict b,
        dcomplex*  restrict beta,
        dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	void*   a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/power10/3/bli_dgemm_power10_mma.c b/kernels/power10/3/bli_dgemm_power10_mma.c
index 84e7d16d3..abf66f58f 100644
--- a/kernels/power10/3/bli_dgemm_power10_mma.c
+++ b/kernels/power10/3/bli_dgemm_power10_mma.c
@@ -70,8 +70,8 @@ void bli_dgemm_power10_mma_8x8
         double*    restrict b,
         double*    restrict beta,
         double*    restrict c, inc_t rs_c0, inc_t cs_c,
-        auxinfo_t* restrict data,
-        cntx_t*    restrict cntx
+        auxinfo_t*          data,
+        cntx_t*             cntx
     )
 {
 
diff --git a/kernels/power10/3/bli_i16gemm_power10_mma.c b/kernels/power10/3/bli_i16gemm_power10_mma.c
index c7f81dc7d..d0c9390f5 100644
--- a/kernels/power10/3/bli_i16gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i16gemm_power10_mma.c
@@ -63,8 +63,8 @@ void bli_i16gemm_power10_mma_8x16
         short*     restrict b,
         int32_t*       restrict beta,
         int32_t*       restrict c, inc_t rs_c0, inc_t cs_c0,
-        auxinfo_t* restrict data,
-        cntx_t*    restrict cntx
+        auxinfo_t*          data,
+        cntx_t*             cntx
     )
 {
 
diff --git a/kernels/power10/3/bli_i16sgemm_power10_mma.c b/kernels/power10/3/bli_i16sgemm_power10_mma.c
index 9e8d99c13..7d84e68e2 100644
--- a/kernels/power10/3/bli_i16sgemm_power10_mma.c
+++ b/kernels/power10/3/bli_i16sgemm_power10_mma.c
@@ -63,8 +63,8 @@ void bli_i16sgemm_power10_mma_8x16
         short*     restrict b,
         int32_t*       restrict beta,
         int32_t*       restrict c, inc_t rs_c0, inc_t cs_c0,
-        auxinfo_t* restrict data,
-        cntx_t*    restrict cntx
+        auxinfo_t*          data,
+        cntx_t*             cntx
     )
 {
 
diff --git a/kernels/power10/3/bli_i4gemm_power10_mma.c b/kernels/power10/3/bli_i4gemm_power10_mma.c
index 7527f271f..6c78a9f00 100644
--- a/kernels/power10/3/bli_i4gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i4gemm_power10_mma.c
@@ -63,8 +63,8 @@ void bli_i4gemm_power10_mma_8x16
         nibbles*   restrict b,
         int32_t*       restrict beta,
         int32_t*       restrict c, inc_t rs_c0, inc_t cs_c0,
-        auxinfo_t* restrict data,
-        cntx_t*    restrict cntx
+        auxinfo_t*          data,
+        cntx_t*             cntx
     )
 {
 
diff --git a/kernels/power10/3/bli_i8gemm_power10_mma.c b/kernels/power10/3/bli_i8gemm_power10_mma.c
index 037a28595..8a0b158a5 100644
--- a/kernels/power10/3/bli_i8gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i8gemm_power10_mma.c
@@ -63,8 +63,8 @@ void bli_i8gemm_power10_mma_8x16
         int8_t*    restrict b,
         int32_t*       restrict beta,
         int32_t*       restrict c, inc_t rs_c0, inc_t cs_c0,
-        auxinfo_t* restrict data,
-        cntx_t*    restrict cntx
+        auxinfo_t*          data,
+        cntx_t*             cntx
     )
 {
     uint64_t k_iter = (k-1) / 4;
diff --git a/kernels/power10/3/bli_sbgemm_power10_mma.c b/kernels/power10/3/bli_sbgemm_power10_mma.c
index b37a0c7ce..c16710f45 100644
--- a/kernels/power10/3/bli_sbgemm_power10_mma.c
+++ b/kernels/power10/3/bli_sbgemm_power10_mma.c
@@ -64,8 +64,8 @@ void bli_sbgemm_power10_mma_8x16
         bfloat16*  restrict b,
         float*     restrict beta,
         float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-        auxinfo_t* restrict data,
-        cntx_t*    restrict cntx
+        auxinfo_t*          data,
+        cntx_t*             cntx
     )
 {
 
diff --git a/kernels/power10/3/bli_sgemm_power10_mma.c b/kernels/power10/3/bli_sgemm_power10_mma.c
index 42bbaa916..15895e654 100644
--- a/kernels/power10/3/bli_sgemm_power10_mma.c
+++ b/kernels/power10/3/bli_sgemm_power10_mma.c
@@ -63,8 +63,8 @@ void bli_sgemm_power10_mma_8x16
         float*     restrict b,
         float*     restrict beta,
         float*     restrict c, inc_t rs_c0, inc_t cs_c,
-        auxinfo_t* restrict data,
-        cntx_t*    restrict cntx
+        auxinfo_t*          data,
+        cntx_t*             cntx
     )
 {
     // Typecast local copies of integers in case dim_t and inc_t are a
diff --git a/kernels/power10/3/bli_shgemm_power10_mma.c b/kernels/power10/3/bli_shgemm_power10_mma.c
index 0e80735df..dc62b5d60 100644
--- a/kernels/power10/3/bli_shgemm_power10_mma.c
+++ b/kernels/power10/3/bli_shgemm_power10_mma.c
@@ -64,8 +64,8 @@ void bli_shgemm_power10_mma_8x16
         float16*  restrict b,
         float*     restrict beta,
         float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-        auxinfo_t* restrict data,
-        cntx_t*    restrict cntx
+        auxinfo_t*          data,
+        cntx_t*             cntx
     )
 {
 
diff --git a/kernels/power7/3/bli_gemm_power7_int_8x4.c b/kernels/power7/3/bli_gemm_power7_int_8x4.c
index b9ce85f72..8ca0c891e 100644
--- a/kernels/power7/3/bli_gemm_power7_int_8x4.c
+++ b/kernels/power7/3/bli_gemm_power7_int_8x4.c
@@ -58,8 +58,8 @@ void bli_sgemm_power7_int_8x4
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 #if 1 || defined(UTEST)
@@ -100,8 +100,8 @@ void bli_dgemm_power7_int_8x4
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
     if ( cs_c == 1 )
@@ -457,8 +457,8 @@ void bli_cgemm_power7_int_8x4
        scomplex*  restrict b,
        scomplex*  restrict beta,
        scomplex*  restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 #if 1 || defined(UTEST)
@@ -510,8 +510,8 @@ void bli_zgemm_power7_int_8x4
        scomplex*  restrict b,
        scomplex*  restrict beta,
        scomplex*  restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 #if 1 || defined(UTEST)
diff --git a/kernels/power7/3/test/bli_gemm_power7_int_8x4.h b/kernels/power7/3/test/bli_gemm_power7_int_8x4.h
index 50984a67d..a8082b38e 100644
--- a/kernels/power7/3/test/bli_gemm_power7_int_8x4.h
+++ b/kernels/power7/3/test/bli_gemm_power7_int_8x4.h
@@ -51,8 +51,8 @@ void bli_sgemm_opt_8x4
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      );
 
 void bli_dgemm_opt_8x4
@@ -65,8 +65,8 @@ void bli_dgemm_opt_8x4
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      );
 
 void bli_cgemm_opt_8x4
@@ -79,8 +79,8 @@ void bli_cgemm_opt_8x4
        scomplex*  restrict b,
        scomplex*  restrict beta,
        scomplex*  restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      );
 
 void bli_zgemm_opt_8x4
@@ -93,8 +93,8 @@ void bli_zgemm_opt_8x4
        dcomplex*  restrict b,
        dcomplex*  restrict beta,
        dcomplex*  restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      );
 
 #endif
diff --git a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c
index 3e5f0d416..70af2b17e 100644
--- a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c
+++ b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c
@@ -45,8 +45,8 @@ void bli_dgemm_power9_asm_12x6
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c
index 7890ad347..051af62e7 100644
--- a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c
+++ b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c
@@ -50,8 +50,8 @@ void bli_sgemm_sandybridge_asm_8x8
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -542,8 +542,8 @@ void bli_dgemm_sandybridge_asm_8x4
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -1004,8 +1004,8 @@ void bli_cgemm_sandybridge_asm_8x4
        scomplex*  restrict b,
        scomplex*  restrict beta,
        scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -1707,8 +1707,8 @@ void bli_zgemm_sandybridge_asm_4x4
        dcomplex*  restrict b,
        dcomplex*  restrict beta,
        dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c
index 6bf991082..cb1cdc7c2 100644
--- a/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c
+++ b/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c
@@ -48,8 +48,8 @@ void bli_sgemm_sandybridge_int_8x8
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 }
@@ -65,8 +65,8 @@ void bli_dgemm_sandybridge_int_8x4
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 
@@ -503,8 +503,8 @@ void bli_cgemm_sandybridge_int_8x4
        scomplex*  restrict b,
        scomplex*  restrict beta,
        scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 }
@@ -523,8 +523,8 @@ void bli_zgemm_sandybridge_int_4x4
        dcomplex*  restrict b,
        dcomplex*  restrict beta,
        dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       auxinfo_t*          data,
+       cntx_t*             cntx
      )
 {
 }
diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
index 9943a170b..2579ac4b5 100644
--- a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
+++ b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
@@ -298,7 +298,7 @@ void bli_dgemm_skx_asm_16x12_l2
        double* restrict beta,
        double* restrict c, inc_t rs_c_, inc_t cs_c_,
        auxinfo_t*       data,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
     (void)data;
diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c
index e3bc52041..babb89a1d 100644
--- a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c
+++ b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c
@@ -164,7 +164,7 @@ void bli_dgemm_skx_asm_16x14
        double* restrict beta,
        double* restrict c, inc_t rs_c_, inc_t cs_c_,
        auxinfo_t*       data,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
     (void)data;
diff --git a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
index 8808449b6..99b850d1d 100644
--- a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
+++ b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
@@ -328,7 +328,7 @@ void bli_sgemm_skx_asm_32x12_l2
        float* restrict beta,
        float* restrict c, inc_t rs_c_, inc_t cs_c_,
        auxinfo_t*       data,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
     (void)data;
diff --git a/kernels/zen/1/bli_amaxv_zen_int.c b/kernels/zen/1/bli_amaxv_zen_int.c
index 4ece5af29..d1263a6c1 100644
--- a/kernels/zen/1/bli_amaxv_zen_int.c
+++ b/kernels/zen/1/bli_amaxv_zen_int.c
@@ -104,7 +104,7 @@ void bli_samaxv_zen_int
        dim_t            n,
        float*  restrict x, inc_t incx,
        dim_t*  restrict i_max,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	float*  minus_one = PASTEMAC(s,m1);
@@ -202,7 +202,7 @@ void bli_samaxv_zen_int
 		max_vec_hi.v    = _mm256_extractf128_ps( max_vec.v, 1 );
 		maxInx_vec_lo.v = _mm256_extractf128_ps( maxInx_vec.v, 0 );
 		maxInx_vec_hi.v = _mm256_extractf128_ps( maxInx_vec.v, 1 );
-		
+
 		mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
 
 		max_vec_lo.v    = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
@@ -210,7 +210,7 @@ void bli_samaxv_zen_int
 
 		max_vec_hi.v    = _mm_permute_ps( max_vec_lo.v, 14 );
 		maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 14 );
-		
+
 		mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
 
 		max_vec_lo.v    = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
@@ -218,7 +218,7 @@ void bli_samaxv_zen_int
 
 		max_vec_hi.v    = _mm_permute_ps( max_vec_lo.v, 1 );
 		maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 1 );
-		
+
 		mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
 
 		max_vec_lo.v    = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
@@ -269,7 +269,7 @@ void bli_damaxv_zen_int
        dim_t            n,
        double* restrict x, inc_t incx,
        dim_t*  restrict i_max,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	double* minus_one = PASTEMAC(d,m1);
@@ -367,15 +367,15 @@ void bli_damaxv_zen_int
 		max_vec_hi.v    = _mm256_extractf128_pd( max_vec.v, 1 );
 		maxInx_vec_lo.v = _mm256_extractf128_pd( maxInx_vec.v, 0 );
 		maxInx_vec_hi.v = _mm256_extractf128_pd( maxInx_vec.v, 1 );
-		
+
 		mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
 
 		max_vec_lo.v    = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
 		maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
-		
+
 		max_vec_hi.v    = _mm_permute_pd( max_vec_lo.v, 1 );
 		maxInx_vec_hi.v = _mm_permute_pd( maxInx_vec_lo.v, 1 );
-		
+
 		mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
 
 		max_vec_lo.v    = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
diff --git a/kernels/zen/1/bli_axpyv_zen_int.c b/kernels/zen/1/bli_axpyv_zen_int.c
index 686580b29..b842c59ed 100644
--- a/kernels/zen/1/bli_axpyv_zen_int.c
+++ b/kernels/zen/1/bli_axpyv_zen_int.c
@@ -62,7 +62,7 @@ void bli_saxpyv_zen_int
        float*  restrict alpha,
        float*  restrict x, inc_t incx,
        float*  restrict y, inc_t incy,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t      n_elem_per_reg = 8;
@@ -166,7 +166,7 @@ void bli_daxpyv_zen_int
        double* restrict alpha,
        double* restrict x, inc_t incx,
        double* restrict y, inc_t incy,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t       n_elem_per_reg = 4;
diff --git a/kernels/zen/1/bli_axpyv_zen_int10.c b/kernels/zen/1/bli_axpyv_zen_int10.c
index 873b7da53..6ad6d30cf 100644
--- a/kernels/zen/1/bli_axpyv_zen_int10.c
+++ b/kernels/zen/1/bli_axpyv_zen_int10.c
@@ -62,7 +62,7 @@ void bli_saxpyv_zen_int10
        float*  restrict alpha,
        float*  restrict x, inc_t incx,
        float*  restrict y, inc_t incy,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t      n_elem_per_reg = 8;
@@ -268,7 +268,7 @@ void bli_daxpyv_zen_int10
        double* restrict alpha,
        double* restrict x, inc_t incx,
        double* restrict y, inc_t incy,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t      n_elem_per_reg = 4;
diff --git a/kernels/zen/1/bli_copyv_zen_int.c b/kernels/zen/1/bli_copyv_zen_int.c
index 5fd2b1576..6307b5341 100644
--- a/kernels/zen/1/bli_copyv_zen_int.c
+++ b/kernels/zen/1/bli_copyv_zen_int.c
@@ -1,330 +1,330 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-	- Redistributions of source code must retain the above copyright
-	  notice, this list of conditions and the following disclaimer.
-	- Redistributions in binary form must reproduce the above copyright
-	  notice, this list of conditions and the following disclaimer in the
-	  documentation and/or other materials provided with the distribution.
-	- Neither the name(s) of the copyright holder(s) nor the names of its
-	  contributors may be used to endorse or promote products derived
-	  from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "immintrin.h"
-#include "blis.h"
-
-// -----------------------------------------------------------------------------
-
-void bli_scopyv_zen_int
-     (
-       conj_t           conjx,
-       dim_t            n,
-       float*  restrict x, inc_t incx,
-       float*  restrict y, inc_t incy,
-       cntx_t* restrict cntx
-     )
-{
-	const dim_t num_elem_per_reg = 8;
-	dim_t       i = 0;
-	__m256      xv[16];
-
-	// If the vector dimension is zero return early.
-	if ( bli_zero_dim1( n ) ) return;
-
-	if ( incx == 1 && incy == 1 )
-	{
-#if 0
-	PRAGMA_SIMD
-	for (i = 0; i < n; i++)
-	{
-		y[i] = x[i];
-	}
-#endif
-#if 0
-	memcpy(y, x, n << 2);
-#endif
-#if 1
-
-		// For loop with n & ~0x7F => n & 0xFFFFFF80 masks the lower bits and results in multiples of 128
-		// for example if n = 255
-		// n & ~0x7F results in 128: copy from 0 to 128 happens in first loop
-		// n & ~0x3F results in 192: copy from 128 to 192 happens in second loop
-		// n & ~0x1F results in 224: copy from 128 to 192 happens in third loop and so on.
-		for ( i = 0; i < (n & (~0x7F)); i += 128 )
-		{
-			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
-			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
-			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
-			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
-			xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
-			xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
-			xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
-			xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
-			xv[8] = _mm256_loadu_ps(x + num_elem_per_reg * 8);
-			xv[9] = _mm256_loadu_ps(x + num_elem_per_reg * 9);
-			xv[10] = _mm256_loadu_ps(x + num_elem_per_reg * 10);
-			xv[11] = _mm256_loadu_ps(x + num_elem_per_reg * 11);
-			xv[12] = _mm256_loadu_ps(x + num_elem_per_reg * 12);
-			xv[13] = _mm256_loadu_ps(x + num_elem_per_reg * 13);
-			xv[14] = _mm256_loadu_ps(x + num_elem_per_reg * 14);
-			xv[15] = _mm256_loadu_ps(x + num_elem_per_reg * 15);
-
-			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 8, xv[8]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 9, xv[9]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 10, xv[10]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 11, xv[11]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 12, xv[12]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 13, xv[13]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 14, xv[14]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 15, xv[15]);
-
-			y += 128;
-			x += 128;
-		}
-		for ( ; i < (n & (~0x3F)); i += 64 )
-		{
-			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
-			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
-			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
-			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
-			xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
-			xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
-			xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
-			xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
-
-			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
-
-			y += 64;
-			x += 64;
-		}
-		for ( ; i < (n & (~0x1F)); i += 32 )
-		{
-			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
-			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
-			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
-			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
-
-			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
-
-			y += 32;
-			x += 32;
-		}
-		for ( ; i < (n & (~0x0F)); i += 16 )
-		{
-			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
-			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
-
-			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
-
-			y += 16;
-			x += 16;
-		}
-		for ( ; i < (n & (~0x07)); i += 8 )
-		{
-			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
-			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
-			y += 8;
-			x += 8;
-		}
-		for ( ; i < n; ++i )
-		{
-			*y++ = *x++;
-		}
-#endif
-	}
-	else
-	{
-		for ( dim_t i = 0; i < n; ++i )
-		{
-			*y = *x;
-			x += incx;
-			y += incy;
-		}
-	}
-}
-
-// -----------------------------------------------------------------------------
-
-void bli_dcopyv_zen_int
-     (
-       conj_t           conjx,
-       dim_t            n,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       cntx_t* restrict cntx
-     )
-{
-	const dim_t num_elem_per_reg = 4;
-	dim_t       i = 0;
-	__m256d     xv[16];
-
-	// If the vector dimension is zero return early.
-	if ( bli_zero_dim1( n ) ) return;
-
-	if ( incx == 1 && incy == 1 )
-	{
-#if 0
-	PRAGMA_SIMD
-	for (i = 0; i < n; ++i)
-	{
-		y[i] = x[i];
-	}
-#endif
-#if 0
-	memcpy(y, x, n << 3);
-#endif
-#if 1
-		// n & (~0x3F) = n & 0xFFFFFFC0 -> this masks the numbers less than 64,
-		// the copy operation will be done for the multiples of 64
-		for ( i = 0; i < (n & (~0x3F)); i += 64 )
-		{
-			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
-			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
-			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
-			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
-			xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
-			xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
-			xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
-			xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
-			xv[8] = _mm256_loadu_pd(x + num_elem_per_reg * 8);
-			xv[9] = _mm256_loadu_pd(x + num_elem_per_reg * 9);
-			xv[10] = _mm256_loadu_pd(x + num_elem_per_reg * 10);
-			xv[11] = _mm256_loadu_pd(x + num_elem_per_reg * 11);
-			xv[12] = _mm256_loadu_pd(x + num_elem_per_reg * 12);
-			xv[13] = _mm256_loadu_pd(x + num_elem_per_reg * 13);
-			xv[14] = _mm256_loadu_pd(x + num_elem_per_reg * 14);
-			xv[15] = _mm256_loadu_pd(x + num_elem_per_reg * 15);
-			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 8, xv[8]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 9, xv[9]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 10, xv[10]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 11, xv[11]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 12, xv[12]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 13, xv[13]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 14, xv[14]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 15, xv[15]);
-			y += num_elem_per_reg * 16;
-			x += num_elem_per_reg * 16;
-		}
-		for ( ; i < (n & (~0x1F)); i += 32 )
-		{
-			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
-			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
-			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
-			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
-			xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
-			xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
-			xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
-			xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
-
-			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
-
-			y += num_elem_per_reg * 8;
-			x += num_elem_per_reg * 8;
-		}
-		for ( ; i < (n & (~0xF)); i += 16 )
-		{
-			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
-			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
-			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
-			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
-
-			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
-
-			y += num_elem_per_reg * 4;
-			x += num_elem_per_reg * 4;
-		}
-		for ( ; i < (n & (~0x07)); i += 8 )
-		{
-			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
-			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
-
-			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
-
-			y += num_elem_per_reg * 2;
-			x += num_elem_per_reg * 2;
-		}
-		for ( ; i < (n & (~0x03)); i += 4 )
-		{
-			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
-			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
-			y += num_elem_per_reg;
-			x += num_elem_per_reg;
-		}
-		for ( ; i < n; ++i )
-		{
-			*y++ = *x++;
-		}
-#endif
-	}
-	else
-	{
-		for ( i = 0; i < n; ++i )
-		{
-			*y = *x;
-
-			x += incx;
-			y += incy;
-		}
-	}
-}
-
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "immintrin.h"
+#include "blis.h"
+
+// -----------------------------------------------------------------------------
+
+void bli_scopyv_zen_int
+     (
+       conj_t           conjx,
+       dim_t            n,
+       float*  restrict x, inc_t incx,
+       float*  restrict y, inc_t incy,
+       cntx_t*          cntx
+     )
+{
+	const dim_t num_elem_per_reg = 8;
+	dim_t       i = 0;
+	__m256      xv[16];
+
+	// If the vector dimension is zero return early.
+	if ( bli_zero_dim1( n ) ) return;
+
+	if ( incx == 1 && incy == 1 )
+	{
+#if 0
+	PRAGMA_SIMD
+	for (i = 0; i < n; i++)
+	{
+		y[i] = x[i];
+	}
+#endif
+#if 0
+	memcpy(y, x, n << 2);
+#endif
+#if 1
+
+		// For loop with n & ~0x7F => n & 0xFFFFFF80 masks the lower bits and results in multiples of 128
+		// for example if n = 255
+		// n & ~0x7F results in 128: copy from 0 to 128 happens in first loop
+		// n & ~0x3F results in 192: copy from 128 to 192 happens in second loop
+		// n & ~0x1F results in 224: copy from 192 to 224 happens in third loop and so on.
+		for ( i = 0; i < (n & (~0x7F)); i += 128 )
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
+			xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
+			xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
+			xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
+			xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
+			xv[8] = _mm256_loadu_ps(x + num_elem_per_reg * 8);
+			xv[9] = _mm256_loadu_ps(x + num_elem_per_reg * 9);
+			xv[10] = _mm256_loadu_ps(x + num_elem_per_reg * 10);
+			xv[11] = _mm256_loadu_ps(x + num_elem_per_reg * 11);
+			xv[12] = _mm256_loadu_ps(x + num_elem_per_reg * 12);
+			xv[13] = _mm256_loadu_ps(x + num_elem_per_reg * 13);
+			xv[14] = _mm256_loadu_ps(x + num_elem_per_reg * 14);
+			xv[15] = _mm256_loadu_ps(x + num_elem_per_reg * 15);
+
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 8, xv[8]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 9, xv[9]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 10, xv[10]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 11, xv[11]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 12, xv[12]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 13, xv[13]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 14, xv[14]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 15, xv[15]);
+
+			y += 128;
+			x += 128;
+		}
+		for ( ; i < (n & (~0x3F)); i += 64 )
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
+			xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
+			xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
+			xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
+			xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
+
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
+
+			y += 64;
+			x += 64;
+		}
+		for ( ; i < (n & (~0x1F)); i += 32 )
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
+
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
+
+			y += 32;
+			x += 32;
+		}
+		for ( ; i < (n & (~0x0F)); i += 16 )
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
+
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
+
+			y += 16;
+			x += 16;
+		}
+		for ( ; i < (n & (~0x07)); i += 8 )
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			y += 8;
+			x += 8;
+		}
+		for ( ; i < n; ++i )
+		{
+			*y++ = *x++;
+		}
+#endif
+	}
+	else
+	{
+		for ( dim_t i = 0; i < n; ++i )
+		{
+			*y = *x;
+			x += incx;
+			y += incy;
+		}
+	}
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_dcopyv_zen_int
+     (
+       conj_t           conjx,
+       dim_t            n,
+       double* restrict x, inc_t incx,
+       double* restrict y, inc_t incy,
+       cntx_t*          cntx
+     )
+{
+	const dim_t num_elem_per_reg = 4;
+	dim_t       i = 0;
+	__m256d     xv[16];
+
+	// If the vector dimension is zero return early.
+	if ( bli_zero_dim1( n ) ) return;
+
+	if ( incx == 1 && incy == 1 )
+	{
+#if 0
+	PRAGMA_SIMD
+	for (i = 0; i < n; ++i)
+	{
+		y[i] = x[i];
+	}
+#endif
+#if 0
+	memcpy(y, x, n << 3);
+#endif
+#if 1
+		// n & (~0x3F) = n & 0xFFFFFFC0 -> this clears the low six bits of n, so
+		// this loop copies only the leading whole blocks of 64 elements
+		for ( i = 0; i < (n & (~0x3F)); i += 64 )
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
+			xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
+			xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
+			xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
+			xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
+			xv[8] = _mm256_loadu_pd(x + num_elem_per_reg * 8);
+			xv[9] = _mm256_loadu_pd(x + num_elem_per_reg * 9);
+			xv[10] = _mm256_loadu_pd(x + num_elem_per_reg * 10);
+			xv[11] = _mm256_loadu_pd(x + num_elem_per_reg * 11);
+			xv[12] = _mm256_loadu_pd(x + num_elem_per_reg * 12);
+			xv[13] = _mm256_loadu_pd(x + num_elem_per_reg * 13);
+			xv[14] = _mm256_loadu_pd(x + num_elem_per_reg * 14);
+			xv[15] = _mm256_loadu_pd(x + num_elem_per_reg * 15);
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 8, xv[8]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 9, xv[9]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 10, xv[10]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 11, xv[11]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 12, xv[12]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 13, xv[13]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 14, xv[14]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 15, xv[15]);
+			y += num_elem_per_reg * 16;
+			x += num_elem_per_reg * 16;
+		}
+		for ( ; i < (n & (~0x1F)); i += 32 )
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
+			xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
+			xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
+			xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
+			xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
+
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
+
+			y += num_elem_per_reg * 8;
+			x += num_elem_per_reg * 8;
+		}
+		for ( ; i < (n & (~0xF)); i += 16 )
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
+
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
+
+			y += num_elem_per_reg * 4;
+			x += num_elem_per_reg * 4;
+		}
+		for ( ; i < (n & (~0x07)); i += 8 )
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
+
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
+
+			y += num_elem_per_reg * 2;
+			x += num_elem_per_reg * 2;
+		}
+		for ( ; i < (n & (~0x03)); i += 4 )
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			y += num_elem_per_reg;
+			x += num_elem_per_reg;
+		}
+		for ( ; i < n; ++i )
+		{
+			*y++ = *x++;
+		}
+#endif
+	}
+	else
+	{
+		for ( i = 0; i < n; ++i )
+		{
+			*y = *x;
+
+			x += incx;
+			y += incy;
+		}
+	}
+}
+
diff --git a/kernels/zen/1/bli_dotv_zen_int.c b/kernels/zen/1/bli_dotv_zen_int.c
index 01022d353..03c448f85 100644
--- a/kernels/zen/1/bli_dotv_zen_int.c
+++ b/kernels/zen/1/bli_dotv_zen_int.c
@@ -62,7 +62,7 @@ void bli_sdotv_zen_int
        float*  restrict x, inc_t incx,
        float*  restrict y, inc_t incy,
        float*  restrict rho,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t      n_elem_per_reg = 8;
@@ -184,7 +184,7 @@ void bli_ddotv_zen_int
        double* restrict x, inc_t incx,
        double* restrict y, inc_t incy,
        double* restrict rho,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t      n_elem_per_reg = 4;
diff --git a/kernels/zen/1/bli_dotv_zen_int10.c b/kernels/zen/1/bli_dotv_zen_int10.c
index 8c445849b..f3fe5ea71 100644
--- a/kernels/zen/1/bli_dotv_zen_int10.c
+++ b/kernels/zen/1/bli_dotv_zen_int10.c
@@ -63,7 +63,7 @@ void bli_sdotv_zen_int10
        float*  restrict x, inc_t incx,
        float*  restrict y, inc_t incy,
        float*  restrict rho,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t      n_elem_per_reg = 8;
@@ -254,7 +254,7 @@ void bli_ddotv_zen_int10
        double* restrict x, inc_t incx,
        double* restrict y, inc_t incy,
        double* restrict rho,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t      n_elem_per_reg = 4;
diff --git a/kernels/zen/1/bli_dotxv_zen_int.c b/kernels/zen/1/bli_dotxv_zen_int.c
index 99ea51710..48a9878a7 100644
--- a/kernels/zen/1/bli_dotxv_zen_int.c
+++ b/kernels/zen/1/bli_dotxv_zen_int.c
@@ -64,7 +64,7 @@ void bli_sdotxv_zen_int
        float*  restrict y, inc_t incy,
        float*  restrict beta,
        float*  restrict rho,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t      n_elem_per_reg = 8;
@@ -192,7 +192,7 @@ void bli_ddotxv_zen_int
        double* restrict y, inc_t incy,
        double* restrict beta,
        double* restrict rho,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t      n_elem_per_reg = 4;
@@ -264,7 +264,7 @@ void bli_ddotxv_zen_int
 
 		x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
 		y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
-		
+
 		// Compute the element-wise product of the x and y vectors,
 		// storing in the corresponding rho vectors.
 		rho0v.v = _mm256_fmadd_pd( x0v.v, y0v.v, rho0v.v );
diff --git a/kernels/zen/1/bli_scalv_zen_int.c b/kernels/zen/1/bli_scalv_zen_int.c
index fb17dd4b3..f92cb0c6c 100644
--- a/kernels/zen/1/bli_scalv_zen_int.c
+++ b/kernels/zen/1/bli_scalv_zen_int.c
@@ -61,7 +61,7 @@ void bli_sscalv_zen_int
        dim_t            n,
        float*  restrict alpha,
        float*  restrict x, inc_t incx,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t      n_elem_per_reg = 8;
@@ -160,7 +160,7 @@ void bli_dscalv_zen_int
        dim_t            n,
        double* restrict alpha,
        double* restrict x, inc_t incx,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t       n_elem_per_reg = 4;
diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c
index 9f31b7200..7487880b8 100644
--- a/kernels/zen/1/bli_scalv_zen_int10.c
+++ b/kernels/zen/1/bli_scalv_zen_int10.c
@@ -61,7 +61,7 @@ void bli_sscalv_zen_int10
        dim_t            n,
        float*  restrict alpha,
        float*  restrict x, inc_t incx,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t      n_elem_per_reg = 8;
@@ -82,7 +82,7 @@ void bli_sscalv_zen_int10
 	{
 		float* zero = bli_s0;
 
-		if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+		if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
 
 		ssetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
 
@@ -255,7 +255,7 @@ void bli_dscalv_zen_int10
        dim_t            n,
        double* restrict alpha,
        double* restrict x, inc_t incx,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t      n_elem_per_reg = 4;
@@ -276,7 +276,7 @@ void bli_dscalv_zen_int10
 	{
 		double* zero = bli_d0;
 
-		if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+		if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
 
 		dsetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
 
diff --git a/kernels/zen/1/bli_setv_zen_int.c b/kernels/zen/1/bli_setv_zen_int.c
index 16e02c94d..0fbc24cfd 100644
--- a/kernels/zen/1/bli_setv_zen_int.c
+++ b/kernels/zen/1/bli_setv_zen_int.c
@@ -43,7 +43,7 @@ void bli_ssetv_zen_int
        dim_t            n,
        float*  restrict alpha,
        float*  restrict x, inc_t incx,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t num_elem_per_reg = 8;
@@ -138,7 +138,7 @@ void  bli_dsetv_zen_int
        dim_t            n,
        double* restrict alpha,
        double* restrict x, inc_t incx,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t num_elem_per_reg = 4;
diff --git a/kernels/zen/1/bli_swapv_zen_int8.c b/kernels/zen/1/bli_swapv_zen_int8.c
index aa7a6e339..824fd0fb8 100644
--- a/kernels/zen/1/bli_swapv_zen_int8.c
+++ b/kernels/zen/1/bli_swapv_zen_int8.c
@@ -59,7 +59,7 @@ void bli_sswapv_zen_int8
        dim_t            n,
        float*  restrict x, inc_t incx,
        float*  restrict y, inc_t incy,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 
@@ -205,7 +205,7 @@ void bli_dswapv_zen_int8
        dim_t            n,
        double* restrict x, inc_t incx,
        double* restrict y, inc_t incy,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t      n_elem_per_reg = 4;
diff --git a/kernels/zen/1f/bli_axpyf_zen_int_4.c b/kernels/zen/1f/bli_axpyf_zen_int_4.c
index 0ec5f44f5..ddebc5ee0 100644
--- a/kernels/zen/1f/bli_axpyf_zen_int_4.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_4.c
@@ -79,7 +79,7 @@ void bli_caxpyf_zen_int_4
     // operation as a loop over axpyv.
     if ( b_n != fuse_fac )
     {
-	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+	if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
 
         caxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx );
 
diff --git a/kernels/zen/1f/bli_axpyf_zen_int_5.c b/kernels/zen/1f/bli_axpyf_zen_int_5.c
index 1566f9809..9c8a370e1 100644
--- a/kernels/zen/1f/bli_axpyf_zen_int_5.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_5.c
@@ -108,7 +108,7 @@ void bli_saxpyf_zen_int_5
     // operation as a loop over axpyv.
     if ( b_n != fuse_fac )
     {
-        if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+        if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
 
         saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
 
@@ -360,7 +360,7 @@ void bli_daxpyf_zen_int_5
     // operation as a loop over axpyv.
     if ( b_n != fuse_fac )
     {
-        if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+        if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
 
         daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
 
@@ -899,7 +899,7 @@ void bli_daxpyf_zen_int_16x4
     // operation as a loop over axpyv.
     if ( b_n != fuse_fac )
     {
-        if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+        if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
 
         daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
 
diff --git a/kernels/zen/1f/bli_axpyf_zen_int_8.c b/kernels/zen/1f/bli_axpyf_zen_int_8.c
index 15fdf4651..24e6ee5e2 100644
--- a/kernels/zen/1f/bli_axpyf_zen_int_8.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_8.c
@@ -64,7 +64,7 @@ void bli_saxpyf_zen_int_8
        float*  restrict a, inc_t inca, inc_t lda,
        float*  restrict x, inc_t incx,
        float*  restrict y, inc_t incy,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t      fuse_fac       = 8;
@@ -273,7 +273,7 @@ void bli_daxpyf_zen_int_8
        double* restrict a, inc_t inca, inc_t lda,
        double* restrict x, inc_t incx,
        double* restrict y, inc_t incy,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t      fuse_fac       = 8;
diff --git a/kernels/zen/1f/bli_dotxf_zen_int_8.c b/kernels/zen/1f/bli_dotxf_zen_int_8.c
index 1f4a671b6..50ca92561 100644
--- a/kernels/zen/1f/bli_dotxf_zen_int_8.c
+++ b/kernels/zen/1f/bli_dotxf_zen_int_8.c
@@ -65,7 +65,7 @@ void bli_sdotxf_zen_int_8
        float*  restrict x, inc_t incx,
        float*  restrict beta,
        float*  restrict y, inc_t incy,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t fuse_fac       = 8;
@@ -455,7 +455,7 @@ void bli_ddotxf_zen_int_8
        double* restrict x, inc_t incx,
        double* restrict beta,
        double* restrict y, inc_t incy,
-       cntx_t* restrict cntx
+       cntx_t*          cntx
      )
 {
 	const dim_t      fuse_fac       = 8;
diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c
index 8d10406a0..076953725 100644
--- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c
+++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c
@@ -123,8 +123,8 @@ void bli_cgemmsup_rv_zen_asm_3x8m
        scomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        scomplex*    restrict beta,
        scomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
      )
 {
 	uint64_t n_left = n0 % 8;
@@ -495,7 +495,7 @@ void bli_cgemmsup_rv_zen_asm_3x8m
 	vmulps(ymm1, ymm3, ymm3)
 	vaddsubps(ymm3, ymm13, ymm13)
 
-	/* (�r + �i)x C + ((ar + ai) x AB) */
+	/* (beta_r + beta_i)x C + ((ar + ai) x AB) */
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rbx), ymm1)       // load beta_r and duplicate
 	vbroadcastss(mem(rbx, 4), ymm2)    // load beta_i and duplicate
@@ -583,7 +583,7 @@ void bli_cgemmsup_rv_zen_asm_3x8m
 
 	CGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddps(ymm9, ymm0, ymm9)
-	add(rdi, rcx) 
+	add(rdi, rcx)
 
 	CGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddps(ymm13, ymm0, ymm13)
@@ -609,18 +609,18 @@ void bli_cgemmsup_rv_zen_asm_3x8m
 	vmovups(xmm2, mem(rcx))				// store (gamma03-13)
 	vmovhpd(xmm12, mem(rcx, 16))	// store (gamma33)
 	lea(mem(rcx, rsi, 1), rcx)
-	
+
 	/******************Transpose bottom tile 4x3***************************/
 	vunpcklpd(ymm9, ymm5, ymm0)        //a8a9b8b9     a12a13b12b13 //gamma04-14 gamma06-16
 	vunpckhpd(ymm9, ymm5, ymm2)        //a10a11b10b11 a14a15b14b15 //gamma05-15 gamma07-17
-	
+
 	vmovups(xmm0, mem(rcx))				// store (gamma04-14)
 	vmovlpd(xmm13, mem(rcx, 16))	// store (gamma24)
 	lea(mem(rcx, rsi, 1), rcx)
 	vmovups(xmm2, mem(rcx))				// store (gamma05-15)
 	vmovhpd(xmm13, mem(rcx, 16))	// store (gamma25)
 	lea(mem(rcx, rsi, 1), rcx)
-	
+
 	vextractf128(imm(0x1), ymm0, xmm0)
 	vextractf128(imm(0x1), ymm2, xmm2)
 	vextractf128(imm(0x1), ymm13, xmm13)
@@ -658,8 +658,8 @@ void bli_cgemmsup_rv_zen_asm_3x8m
 	mov(var(cs_c), rsi)        // load cs_c
 	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(dt)
 
-	vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a4b4b5 
-	vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7 
+	vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a5b4b5
+	vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7
 
 	/******************Transpose top tile 4x3***************************/
 	vmovups(xmm0, mem(rcx))
@@ -680,8 +680,8 @@ void bli_cgemmsup_rv_zen_asm_3x8m
 	lea(mem(rcx, rsi, 1), rcx)
 
 	/******************Transpose bottom tile 4x3***************************/
-	vunpcklpd(ymm9, ymm5, ymm0)  //a8a9b8b9     a12a13b12b13 
-	vunpckhpd(ymm9, ymm5, ymm2)  //a10a11b10b11 a14a15b14b15 
+	vunpcklpd(ymm9, ymm5, ymm0)  //a8a9b8b9     a12a13b12b13
+	vunpckhpd(ymm9, ymm5, ymm2)  //a10a11b10b11 a14a15b14b15
 
 	vmovups(xmm0, mem(rcx))
 	vmovlpd(xmm13, mem(rcx, 16))
@@ -788,8 +788,8 @@ void bli_cgemmsup_rv_zen_asm_3x4m
        scomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        scomplex*    restrict beta,
        scomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1060,7 +1060,7 @@ void bli_cgemmsup_rv_zen_asm_3x4m
 	vmulps(ymm1, ymm3, ymm3)
 	vaddsubps(ymm3, ymm12, ymm12)
 
-	/* (�r + �i)x C + ((ar + ai) x AB) */
+	/* (beta_r + beta_i)x C + ((ar + ai) x AB) */
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rbx), ymm1)       // load beta_r and duplicate
 	vbroadcastss(mem(rbx, 4), ymm2)    // load beta_i and duplicate
@@ -1117,7 +1117,7 @@ void bli_cgemmsup_rv_zen_asm_3x4m
 	CGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddps(ymm4, ymm0, ymm4)
 	add(rdi, rcx)
-	
+
 	CGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddps(ymm8, ymm0, ymm8)
 	add(rdi, rcx)
@@ -1136,7 +1136,7 @@ void bli_cgemmsup_rv_zen_asm_3x4m
 	vmovups(xmm2, mem(rcx))				// store (gamma01-11)
 	vmovhpd(xmm12, mem(rcx, 16))	// store (gamma21)
 	lea(mem(rcx, rsi, 1), rcx)
-	
+
 	vextractf128(imm(0x1), ymm0, xmm0)
 	vextractf128(imm(0x1), ymm2, xmm2)
 	vextractf128(imm(0x1), ymm12, xmm12)
@@ -1172,8 +1172,8 @@ void bli_cgemmsup_rv_zen_asm_3x4m
 	mov(var(cs_c), rsi)        // load cs_c
 	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(dt)
 
-	vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a4b4b5 
-	vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7 
+	vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a5b4b5
+	vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7
 
 	vmovups(xmm0, mem(rcx))
 	vmovlpd(xmm12, mem(rcx, 16))
@@ -1277,8 +1277,8 @@ void bli_cgemmsup_rv_zen_asm_3x2m
        scomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        scomplex*    restrict beta,
        scomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1543,7 +1543,7 @@ void bli_cgemmsup_rv_zen_asm_3x2m
 	vmulps(xmm1, xmm3, xmm3)
 	vaddsubps(xmm3, xmm12, xmm12)
 
-	/* (�r + �i)x C + ((ar + ai) x AB) */
+	/* (beta_r + beta_i)x C + ((ar + ai) x AB) */
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rbx), xmm1)       // load beta_r and duplicate
 	vbroadcastss(mem(rbx, 4), xmm2)    // load beta_i and duplicate
@@ -1627,7 +1627,7 @@ void bli_cgemmsup_rv_zen_asm_3x2m
 	CGEMM_INPUT_SCALE_CS_BETA_NZ_128
 	vaddps(xmm4, xmm0, xmm4)
 	add(rdi, rcx)
-	
+
 	CGEMM_INPUT_SCALE_CS_BETA_NZ_128
 	vaddps(xmm8, xmm0, xmm8)
 	add(rdi, rcx)
@@ -1753,4 +1753,3 @@ void bli_cgemmsup_rv_zen_asm_3x2m
 	}
 }
 
- 
\ No newline at end of file
diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c
index 6c68707e1..62491dfb4 100644
--- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c
+++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c
@@ -80,14 +80,14 @@ void bli_cgemmsup_rv_zen_asm_3x8n
        scomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        scomplex*    restrict beta,
        scomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
      )
 {
 	uint64_t m_left = m0 % 3;
 	if ( m_left )
 	{
-		cgemmsup_ker_ft ker_fps[3] = 
+		cgemmsup_ker_ft ker_fps[3] =
 		{
 			NULL,
 			bli_cgemmsup_rv_zen_asm_1x8n,
@@ -120,7 +120,7 @@ void bli_cgemmsup_rv_zen_asm_3x8n
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
-	
+
 	if ( n_iter == 0 ) goto consider_edge_cases;
 
 	// -------------------------------------------------------------------------
@@ -150,7 +150,7 @@ void bli_cgemmsup_rv_zen_asm_3x8n
 		ymm13 = _mm256_setzero_ps();
 		ymm14 = _mm256_setzero_ps();
 		ymm15 = _mm256_setzero_ps();
-		
+
 		dim_t ta_inc_row = rs_a;
 		dim_t tb_inc_row = rs_b;
 		dim_t tc_inc_row = rs_c;
@@ -170,7 +170,7 @@ void bli_cgemmsup_rv_zen_asm_3x8n
 			// This loop is processing MR x K
 			ymm0 = _mm256_loadu_ps((float const *)(tB + tb_inc_row * k_iter));
 			ymm1 = _mm256_loadu_ps((float const *)(tB + tb_inc_row * k_iter +  4));
-			
+
 			//broadcasted matrix B elements are multiplied
 			//with matrix A columns.
 			ymm2 = _mm256_broadcast_ss((float const *)(tA));
@@ -534,8 +534,8 @@ void bli_cgemmsup_rv_zen_asm_2x8n
        scomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        scomplex*    restrict beta,
        scomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -600,7 +600,7 @@ void bli_cgemmsup_rv_zen_asm_2x8n
 			// This loop is processing MR x K
 			ymm0 = _mm256_loadu_ps((float const *)(tB + tb_inc_row * k_iter));
 			ymm1 = _mm256_loadu_ps((float const *)(tB + tb_inc_row * k_iter +  4));
-			
+
 			//broadcasted matrix B elements are multiplied
 			//with matrix A columns.
 			ymm2 = _mm256_broadcast_ss((float const *)(tA));
@@ -882,8 +882,8 @@ void bli_cgemmsup_rv_zen_asm_1x8n
        scomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        scomplex*    restrict beta,
        scomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
      )
 {
 
@@ -1151,8 +1151,8 @@ void bli_cgemmsup_rv_zen_asm_3x4
        scomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        scomplex*    restrict beta,
        scomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
      )
 {
 
@@ -1184,7 +1184,7 @@ void bli_cgemmsup_rv_zen_asm_3x4
 	ymm10 = _mm256_setzero_ps();
 	ymm12 = _mm256_setzero_ps();
 	ymm14 = _mm256_setzero_ps();
-	
+
 	dim_t ta_inc_row = rs_a;
 	dim_t tb_inc_row = rs_b;
 	dim_t tc_inc_row = rs_c;
@@ -1386,8 +1386,8 @@ void bli_cgemmsup_rv_zen_asm_3x2
        scomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        scomplex*    restrict beta,
        scomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
      )
 {
 
@@ -1408,7 +1408,7 @@ void bli_cgemmsup_rv_zen_asm_3x2
 	scomplex *tB = b;
 	scomplex *tC = c;
 	// clear scratch registers.
-	__m128 xmm0, xmm1, xmm2, xmm3; 
+	__m128 xmm0, xmm1, xmm2, xmm3;
 	__m128 xmm4 = _mm_setzero_ps();
 	__m128 xmm6 = _mm_setzero_ps();
 	__m128 xmm8 = _mm_setzero_ps();
diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c
index 1638eaba0..b9ed3c9f9 100644
--- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c
+++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c
@@ -82,8 +82,8 @@ void bli_zgemmsup_rv_zen_asm_2x4
        dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        dcomplex*    restrict beta,
        dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -357,7 +357,7 @@ void bli_zgemmsup_rv_zen_asm_2x4
 	vmulpd(ymm1, ymm3, ymm3)
 	vaddsubpd(ymm3, ymm9, ymm9)
 
-	/* (�r + �i)x C + ((ar + ai) x AB) */
+	/* (beta_r + beta_i)x C + ((ar + ai) x AB) */
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
 	vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
@@ -413,7 +413,7 @@ void bli_zgemmsup_rv_zen_asm_2x4
 	mov(var(cs_c), rsi)        // load cs_c
 	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
 	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real+imag) dt)
-	
+
 	lea(mem(rsi, rsi, 2), r13)           // r13 = 3*rs_a
 
 	ZGEMM_INPUT_SCALE_CS_BETA_NZ
@@ -423,16 +423,16 @@ void bli_zgemmsup_rv_zen_asm_2x4
 	ZGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddpd(ymm8, ymm0, ymm8)
 	add(rdi, rcx)
-	
+
 	lea(mem(r12, rsi, 2), rcx)
-	
+
 	ZGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddpd(ymm5, ymm0, ymm5)
 	add(rdi, rcx)
 
 	ZGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddpd(ymm9, ymm0, ymm9)
-	add(rdi, rcx) 
+	add(rdi, rcx)
 
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 
@@ -454,12 +454,12 @@ void bli_zgemmsup_rv_zen_asm_2x4
 	vmovups(xmm8, mem(rcx, 16))
 
 	add(rsi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	vmovups(xmm9, mem(rcx, 16))
-	
+
 	add(rsi, rcx)
-	
+
 	vextractf128(imm(0x1), ymm5, xmm5)
 	vextractf128(imm(0x1), ymm9, xmm9)
 	vmovups(xmm5, mem(rcx))
@@ -501,12 +501,12 @@ void bli_zgemmsup_rv_zen_asm_2x4
 	vmovups(xmm8, mem(rcx, 16))
 
 	add(rsi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	vmovups(xmm9, mem(rcx, 16))
-	
+
 	add(rsi, rcx)
-	
+
 	vextractf128(imm(0x1), ymm5, xmm5)
 	vextractf128(imm(0x1), ymm9, xmm9)
 	vmovups(xmm5, mem(rcx))
@@ -558,8 +558,8 @@ void bli_zgemmsup_rv_zen_asm_1x4
        dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        dcomplex*    restrict beta,
        dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
      )
 {
 
@@ -781,7 +781,7 @@ void bli_zgemmsup_rv_zen_asm_1x4
 	vmulpd(ymm1, ymm3, ymm3)
 	vaddsubpd(ymm3, ymm5, ymm5)
 
-	/* (�r + �i)x C + ((ar + ai) x AB) */
+	/* (beta_r + beta_i)x C + ((ar + ai) x AB) */
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
 	vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
@@ -828,14 +828,14 @@ void bli_zgemmsup_rv_zen_asm_1x4
 	mov(var(cs_c), rsi)        // load cs_c
 	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
 	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real+imag) dt)
-	
+
 	lea(mem(rsi, rsi, 2), r13)           // r13 = 3*rs_a
 
 	ZGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddpd(ymm4, ymm0, ymm4)
 
 	lea(mem(r12, rsi, 2), rcx)
-	
+
 	ZGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddpd(ymm5, ymm0, ymm5)
 
@@ -854,7 +854,7 @@ void bli_zgemmsup_rv_zen_asm_1x4
 	vmovups(xmm4, mem(rcx))
 
 	add(rsi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 
 	add(rsi, rcx)
@@ -943,8 +943,8 @@ void bli_zgemmsup_rv_zen_asm_2x2
        dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        dcomplex*    restrict beta,
        dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
 
      )
 {
@@ -1178,7 +1178,7 @@ void bli_zgemmsup_rv_zen_asm_2x2
 	vmulpd(ymm1, ymm3, ymm3)
 	vaddsubpd(ymm3, ymm8, ymm8)
 
-	/* (�r + �i)x C + ((ar + ai) x AB) */
+	/* (beta_r + beta_i)x C + ((ar + ai) x AB) */
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
 	vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
@@ -1226,7 +1226,7 @@ void bli_zgemmsup_rv_zen_asm_2x2
 	mov(var(cs_c), rsi)        // load cs_c
 	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
 	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real+imag) dt)
-	
+
 	lea(mem(rsi, rsi, 2), r13)           // r13 = 3*rs_a
 
 	ZGEMM_INPUT_SCALE_CS_BETA_NZ
@@ -1330,8 +1330,8 @@ void bli_zgemmsup_rv_zen_asm_1x2
        dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        dcomplex*    restrict beta,
        dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
 
      )
 {
@@ -1529,7 +1529,7 @@ void bli_zgemmsup_rv_zen_asm_1x2
 	vmulpd(ymm1, ymm3, ymm3)
 	vaddsubpd(ymm3, ymm4, ymm4)
 
-	/* (�r + �i)x C + ((ar + ai) x AB) */
+	/* (beta_r + beta_i)x C + ((ar + ai) x AB) */
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
 	vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
@@ -1571,7 +1571,7 @@ void bli_zgemmsup_rv_zen_asm_1x2
 	mov(var(cs_c), rsi)        // load cs_c
 	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
 	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real+imag) dt)
-	
+
 	lea(mem(rsi, rsi, 2), r13)           // r13 = 3*rs_a
 
 	ZGEMM_INPUT_SCALE_CS_BETA_NZ
@@ -1602,7 +1602,7 @@ void bli_zgemmsup_rv_zen_asm_1x2
 
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 
 	jmp(.SDONE)                        // jump to end.
 
diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c
index 05e05dfec..1dd37a395 100644
--- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c
+++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c
@@ -97,7 +97,7 @@
 
    crr:
 	 | | | | | | | |       ------        --------
-	 | | | | | | | |  +=   ------ 
+	 | | | | | | | |  +=   ------
 	 --------
 	 | | | | | | | |       ------        --------
 	 | | | | | | | |       ------            :
@@ -114,8 +114,8 @@ void bli_zgemmsup_rv_zen_asm_3x4m
        dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        dcomplex*    restrict beta,
        dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
      )
 {
 	uint64_t n_left = n0 % 4;
@@ -477,7 +477,7 @@ void bli_zgemmsup_rv_zen_asm_3x4m
 	vmulpd(ymm1, ymm3, ymm3)
 	vaddsubpd(ymm3, ymm13, ymm13)
 
-	/* (�r + �i)x C + ((ar + ai) x AB) */
+	/* (beta_r + beta_i)x C + ((ar + ai) x AB) */
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
 	vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
@@ -563,7 +563,7 @@ void bli_zgemmsup_rv_zen_asm_3x4m
 
 	ZGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddpd(ymm9, ymm0, ymm9)
-	add(rdi, rcx) 
+	add(rdi, rcx)
 
 	ZGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddpd(ymm13, ymm0, ymm13)
@@ -591,13 +591,13 @@ void bli_zgemmsup_rv_zen_asm_3x4m
 	vmovups(xmm12, mem(rcx,32))
 
 	add(rsi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	vmovups(xmm9, mem(rcx, 16))
 	vmovups(xmm13,mem(rcx,32))
-	
+
 	add(rsi, rcx)
-	
+
 	vextractf128(imm(0x1), ymm5, xmm5)
 	vextractf128(imm(0x1), ymm9, xmm9)
 	vextractf128(imm(0x1), ymm13, xmm13)
@@ -649,13 +649,13 @@ void bli_zgemmsup_rv_zen_asm_3x4m
 	vmovups(xmm12, mem(rcx,32))
 
 	add(rsi, rcx)
-	
+
 	vmovups(xmm5, mem(rcx))
 	vmovups(xmm9, mem(rcx, 16))
 	vmovups(xmm13,mem(rcx,32))
-	
+
 	add(rsi, rcx)
-	
+
 	vextractf128(imm(0x1), ymm5, xmm5)
 	vextractf128(imm(0x1), ymm9, xmm9)
 	vextractf128(imm(0x1), ymm13, xmm13)
@@ -750,8 +750,8 @@ void bli_zgemmsup_rv_zen_asm_3x2m
        dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        dcomplex*    restrict beta,
        dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
 
      )
 {
@@ -1025,7 +1025,7 @@ void bli_zgemmsup_rv_zen_asm_3x2m
 	vmulpd(ymm1, ymm3, ymm3)
 	vaddsubpd(ymm3, ymm12, ymm12)
 
-	/* (�r + �i)x C + ((ar + ai) x AB) */
+	/* (beta_r + beta_i)x C + ((ar + ai) x AB) */
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
 	vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
@@ -1079,7 +1079,7 @@ void bli_zgemmsup_rv_zen_asm_3x2m
 	mov(var(cs_c), rsi)        // load cs_c
 	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
 	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real+imag) dt)
-	
+
 	lea(mem(rsi, rsi, 2), r13)           // r13 = 3*rs_a
 
 	ZGEMM_INPUT_SCALE_CS_BETA_NZ
@@ -1089,7 +1089,7 @@ void bli_zgemmsup_rv_zen_asm_3x2m
 	ZGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddpd(ymm8, ymm0, ymm8)
 	add(rdi, rcx)
-	
+
 	ZGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddpd(ymm12, ymm0, ymm12)
 
@@ -1126,10 +1126,10 @@ void bli_zgemmsup_rv_zen_asm_3x2m
 
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm8, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm12, mem(rcx))
 
 	jmp(.SDONE)                        // jump to end.
diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c
index 872d04868..58d08ecbd 100644
--- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c
+++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c
@@ -79,14 +79,14 @@ void bli_zgemmsup_rv_zen_asm_3x4n
        dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        dcomplex*    restrict beta,
        dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
      )
 {
 	uint64_t m_left = m0 % 3;
 	if ( m_left )
 	{
-		zgemmsup_ker_ft ker_fps[3] = 
+		zgemmsup_ker_ft ker_fps[3] =
 		{
 			NULL,
 			bli_zgemmsup_rv_zen_asm_1x4n,
@@ -150,7 +150,7 @@ void bli_zgemmsup_rv_zen_asm_3x4n
 		ymm13 = _mm256_setzero_pd();
 		ymm14 = _mm256_setzero_pd();
 		ymm15 = _mm256_setzero_pd();
-		
+
 		dim_t ta_inc_row = rs_a;
 		dim_t tb_inc_row = rs_b;
 		dim_t tc_inc_row = rs_c;
@@ -170,7 +170,7 @@ void bli_zgemmsup_rv_zen_asm_3x4n
 			// This loop is processing MR x K
 			ymm0 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter));
 			ymm1 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter +  2));
-			
+
 			//broadcasted matrix B elements are multiplied
 			//with matrix A columns.
 			ymm2 = _mm256_broadcast_sd((double const *)(tA));
@@ -472,8 +472,8 @@ void bli_zgemmsup_rv_zen_asm_2x4n
        dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        dcomplex*    restrict beta,
        dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
      )
 {
 
@@ -515,7 +515,7 @@ void bli_zgemmsup_rv_zen_asm_2x4n
 		ymm9 = _mm256_setzero_pd();
 		ymm10 = _mm256_setzero_pd();
 		ymm11 = _mm256_setzero_pd();
-		
+
 		dim_t ta_inc_row = rs_a;
 		dim_t tb_inc_row = rs_b;
 		dim_t tc_inc_row = rs_c;
@@ -535,7 +535,7 @@ void bli_zgemmsup_rv_zen_asm_2x4n
 			// This loop is processing MR x K
 			ymm0 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter));
 			ymm1 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter +  2));
-			
+
 			//broadcasted matrix B elements are multiplied
 			//with matrix A columns.
 			ymm2 = _mm256_broadcast_sd((double const *)(tA));
@@ -772,8 +772,8 @@ void bli_zgemmsup_rv_zen_asm_1x4n
        dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        dcomplex*    restrict beta,
        dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -832,7 +832,7 @@ void bli_zgemmsup_rv_zen_asm_1x4n
 			// This loop is processing MR x K
 			ymm0 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter));
 			ymm1 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter +  2));
-			
+
 			//broadcasted matrix B elements are multiplied
 			//with matrix A columns.
 			ymm2 = _mm256_broadcast_sd((double const *)(tA));
@@ -999,8 +999,8 @@ void bli_zgemmsup_rv_zen_asm_3x2
        dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
        dcomplex*    restrict beta,
        dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
+       auxinfo_t*            data,
+       cntx_t*               cntx
      )
 {
 	uint64_t k_iter = 0;
@@ -1046,7 +1046,7 @@ void bli_zgemmsup_rv_zen_asm_3x2
 		// multiplies it with the A matrix.
 		// This loop is processing MR x K
 		ymm0 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter));
-		
+
 		//broadcasted matrix B elements are multiplied
 		//with matrix A columns.
 		ymm2 = _mm256_broadcast_sd((double const *)(tA));
diff --git a/kernels/zen2/1f/old/bli_axpyf_zen_int_5.c b/kernels/zen2/1f/old/bli_axpyf_zen_int_5.c
new file mode 100644
index 000000000..8a60bce46
--- /dev/null
+++ b/kernels/zen2/1f/old/bli_axpyf_zen_int_5.c
@@ -0,0 +1,599 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "immintrin.h"
+#include "blis.h"
+
+/* Union exposing one 256-bit AVX register both as a whole vector (v) and as
+   its 8 single-precision lanes (f[0..7]). */
+typedef union
+{
+    __m256  v;
+    float   f[8] __attribute__((aligned(64)));
+} v8sf_t;
+
+/* Union exposing one 256-bit AVX register both as a whole vector (v) and as
+   its 4 double-precision lanes (d[0..3]). */
+typedef union
+{
+    __m256d v;
+    double  d[4] __attribute__((aligned(64)));
+} v4df_t;
+
+// saxpyf: y := y + alpha * A * x, with A an m x b_n float matrix whose column j starts at a + j*lda.
+void bli_saxpyf_zen_int_5
+     (
+       conj_t           conja,
+       conj_t           conjx,
+       dim_t            m,
+       dim_t            b_n,
+       float* restrict alpha,
+       float* restrict a, inc_t inca, inc_t lda,
+       float* restrict x, inc_t incx,
+       float* restrict y, inc_t incy,
+       cntx_t*          cntx
+     )
+{
+    const dim_t      fuse_fac       = 5;
+
+    const dim_t      n_elem_per_reg = 8;
+    const dim_t      n_iter_unroll  = 2;
+
+    dim_t            i;
+
+    float* restrict a0;
+    float* restrict a1;
+    float* restrict a2;
+    float* restrict a3;
+    float* restrict a4;
+
+    float* restrict y0;
+
+    v8sf_t           chi0v, chi1v, chi2v, chi3v;
+    v8sf_t           chi4v;
+
+    v8sf_t           a00v, a01v, a02v, a03v;
+    v8sf_t           a04v;
+
+    v8sf_t           a10v, a11v, a12v, a13v;
+    v8sf_t           a14v;
+
+    v8sf_t           y0v, y1v;
+
+    float           chi0, chi1, chi2, chi3;
+    float           chi4;
+
+    // If either dimension is zero, or if alpha is zero, return early.
+    if ( bli_zero_dim2( m, b_n ) || bli_seq0( *alpha ) ) return;
+
+    // If b_n is not equal to the fusing factor, then perform the entire
+    // operation as a loop over axpyv.
+    if ( b_n != fuse_fac )
+    {
+#ifdef BLIS_CONFIG_ZEN2
+        for ( i = 0; i < b_n; ++i )
+        {
+            float* a1   = a + (0  )*inca + (i  )*lda;
+            float* chi1 = x + (i  )*incx;
+            float* y1   = y + (0  )*incy;
+            float  alpha_chi1;
+
+            bli_scopycjs( conjx, *chi1, alpha_chi1 );
+            bli_sscals( *alpha, alpha_chi1 );
+
+            bli_saxpyv_zen_int10
+            (
+              conja,
+              m,
+              &alpha_chi1,
+              a1, inca,
+              y1, incy,
+              cntx
+            );
+        }
+
+#else
+        saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
+
+        for ( i = 0; i < b_n; ++i )
+        {
+            float* a1   = a + (0  )*inca + (i  )*lda;
+            float* chi1 = x + (i  )*incx;
+            float* y1   = y + (0  )*incy;
+            float  alpha_chi1;
+
+            bli_scopycjs( conjx, *chi1, alpha_chi1 );
+            bli_sscals( *alpha, alpha_chi1 );
+
+            f
+            (
+              conja,
+              m,
+              &alpha_chi1,
+              a1, inca,
+              y1, incy,
+              cntx
+            );
+        }
+
+#endif
+        return;
+    }
+
+    // At this point, we know that b_n is exactly equal to the fusing factor.
+    // Note: conja and conjx are not referenced anywhere on this fused path.
+    a0   = a + 0*lda;
+    a1   = a + 1*lda;
+    a2   = a + 2*lda;
+    a3   = a + 3*lda;
+    a4   = a + 4*lda;
+    y0   = y;
+
+    chi0 = *( x + 0*incx );
+    chi1 = *( x + 1*incx );
+    chi2 = *( x + 2*incx );
+    chi3 = *( x + 3*incx );
+    chi4 = *( x + 4*incx );
+
+
+    // Scale each chi scalar by alpha.
+    bli_sscals( *alpha, chi0 );
+    bli_sscals( *alpha, chi1 );
+    bli_sscals( *alpha, chi2 );
+    bli_sscals( *alpha, chi3 );
+    bli_sscals( *alpha, chi4 );
+
+    // Broadcast the (alpha*chi?) scalars to all elements of vector registers.
+    chi0v.v = _mm256_broadcast_ss( &chi0 );
+    chi1v.v = _mm256_broadcast_ss( &chi1 );
+    chi2v.v = _mm256_broadcast_ss( &chi2 );
+    chi3v.v = _mm256_broadcast_ss( &chi3 );
+    chi4v.v = _mm256_broadcast_ss( &chi4 );
+
+    // Unit-stride case: update y 16 elements at a time, then 8 at a time,
+    // using vector FMAs; any remaining elements are handled by scalar code.
+    if ( inca == 1 && incy == 1 )
+    {
+        for ( i = 0; (i + 15) < m; i += 16 )
+        {
+            // Load 16 elements of y and of each of the five columns of a.
+            y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_ps( a0 + 1*n_elem_per_reg );
+
+            a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_ps( a1 + 1*n_elem_per_reg );
+
+            a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg );
+            a12v.v = _mm256_loadu_ps( a2 + 1*n_elem_per_reg );
+
+            a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg );
+            a13v.v = _mm256_loadu_ps( a3 + 1*n_elem_per_reg );
+
+            a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg );
+            a14v.v = _mm256_loadu_ps( a4 + 1*n_elem_per_reg );
+
+            // perform : y += (alpha*chi_j) * a_j, for columns j = 0..4 in order;
+            y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v );
+            y1v.v = _mm256_fmadd_ps( a10v.v, chi0v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v );
+            y1v.v = _mm256_fmadd_ps( a11v.v, chi1v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v );
+            y1v.v = _mm256_fmadd_ps( a12v.v, chi2v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v );
+            y1v.v = _mm256_fmadd_ps( a13v.v, chi3v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v );
+            y1v.v = _mm256_fmadd_ps( a14v.v, chi4v.v, y1v.v );
+
+
+            // Store the output.
+            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), y1v.v );
+
+            y0 += n_iter_unroll * n_elem_per_reg;
+            a0 += n_iter_unroll * n_elem_per_reg;
+            a1 += n_iter_unroll * n_elem_per_reg;
+            a2 += n_iter_unroll * n_elem_per_reg;
+            a3 += n_iter_unroll * n_elem_per_reg;
+            a4 += n_iter_unroll * n_elem_per_reg;
+        }
+
+        for( ; (i + 7) < m; i += 8 )
+        {
+            // Load 8 elements of y and of each of the five columns of a.
+            y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
+            a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg );
+            a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg );
+            a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg );
+            a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg );
+
+
+            // perform : y += (alpha*chi_j) * a_j, for columns j = 0..4 in order;
+            y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v );
+            y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v );
+            y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v );
+            y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v );
+            y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v );
+
+            // Store the output.
+            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
+
+            y0 += n_elem_per_reg;
+            a0 += n_elem_per_reg;
+            a1 += n_elem_per_reg;
+            a2 += n_elem_per_reg;
+            a3 += n_elem_per_reg;
+            a4 += n_elem_per_reg;
+        }
+
+        // If there are leftover iterations, perform them with scalar code.
+        for ( ; (i + 0) < m ; ++i )
+        {
+            double       y0c = *y0; // NOTE(review): double accumulator in a float kernel - confirm intended.
+
+            const float a0c = *a0;
+            const float a1c = *a1;
+            const float a2c = *a2;
+            const float a3c = *a3;
+            const float a4c = *a4;
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+            y0c += chi2 * a2c;
+            y0c += chi3 * a3c;
+            y0c += chi4 * a4c;
+
+            *y0 = y0c;
+
+            a0 += 1;
+            a1 += 1;
+            a2 += 1;
+            a3 += 1;
+            a4 += 1;
+            y0 += 1;
+        }
+    }
+    else // General-stride case: scalar code over all m elements.
+    {
+        for ( i = 0; (i + 0) < m ; ++i )
+        {
+            double       y0c = *y0; // NOTE(review): double accumulator in a float kernel - confirm intended.
+
+            const float a0c = *a0;
+            const float a1c = *a1;
+            const float a2c = *a2;
+            const float a3c = *a3;
+            const float a4c = *a4;
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+            y0c += chi2 * a2c;
+            y0c += chi3 * a3c;
+            y0c += chi4 * a4c;
+
+            *y0 = y0c;
+
+            a0 += inca;
+            a1 += inca;
+            a2 += inca;
+            a3 += inca;
+            a4 += inca;
+            y0 += incy;
+        }
+
+    }
+}
+
+
+// -----------------------------------------------------------------------------
+// daxpyf: y := y + alpha * A * x, with A an m x b_n double matrix whose column j starts at a + j*lda.
+void bli_daxpyf_zen_int_5
+     (
+       conj_t           conja,
+       conj_t           conjx,
+       dim_t            m,
+       dim_t            b_n,
+       double* restrict alpha,
+       double* restrict a, inc_t inca, inc_t lda,
+       double* restrict x, inc_t incx,
+       double* restrict y, inc_t incy,
+       cntx_t*          cntx
+     )
+{
+    const dim_t      fuse_fac       = 5;
+
+    const dim_t      n_elem_per_reg = 4;
+    const dim_t      n_iter_unroll  = 2;
+
+    dim_t            i;
+
+    double* restrict a0;
+    double* restrict a1;
+    double* restrict a2;
+    double* restrict a3;
+    double* restrict a4;
+
+    double* restrict y0;
+
+    v4df_t           chi0v, chi1v, chi2v, chi3v;
+    v4df_t           chi4v;
+
+    v4df_t           a00v, a01v, a02v, a03v;
+    v4df_t           a04v;
+
+    v4df_t           a10v, a11v, a12v, a13v;
+    v4df_t           a14v;
+
+    v4df_t           y0v, y1v;
+
+    double           chi0, chi1, chi2, chi3;
+    double           chi4;
+
+    // If either dimension is zero, or if alpha is zero, return early.
+    if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return;
+
+    // If b_n is not equal to the fusing factor, then perform the entire
+    // operation as a loop over axpyv.
+    if ( b_n != fuse_fac )
+    {
+#ifdef BLIS_CONFIG_ZEN2
+        for ( i = 0; i < b_n; ++i )
+        {
+            double* a1   = a + (0  )*inca + (i  )*lda;
+            double* chi1 = x + (i  )*incx;
+            double* y1   = y + (0  )*incy;
+            double  alpha_chi1;
+
+            bli_dcopycjs( conjx, *chi1, alpha_chi1 );
+            bli_dscals( *alpha, alpha_chi1 );
+
+            bli_daxpyv_zen_int10
+            (
+              conja,
+              m,
+              &alpha_chi1,
+              a1, inca,
+              y1, incy,
+              cntx
+            );
+        }
+
+#else
+        daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+
+        for ( i = 0; i < b_n; ++i )
+        {
+            double* a1   = a + (0  )*inca + (i  )*lda;
+            double* chi1 = x + (i  )*incx;
+            double* y1   = y + (0  )*incy;
+            double  alpha_chi1;
+
+            bli_dcopycjs( conjx, *chi1, alpha_chi1 );
+            bli_dscals( *alpha, alpha_chi1 );
+
+            f
+            (
+              conja,
+              m,
+              &alpha_chi1,
+              a1, inca,
+              y1, incy,
+              cntx
+            );
+        }
+
+#endif
+        return;
+    }
+
+    // At this point, we know that b_n is exactly equal to the fusing factor.
+    // Note: conja and conjx are not referenced anywhere on this fused path.
+    a0   = a + 0*lda;
+    a1   = a + 1*lda;
+    a2   = a + 2*lda;
+    a3   = a + 3*lda;
+    a4   = a + 4*lda;
+    y0   = y;
+
+    chi0 = *( x + 0*incx );
+    chi1 = *( x + 1*incx );
+    chi2 = *( x + 2*incx );
+    chi3 = *( x + 3*incx );
+    chi4 = *( x + 4*incx );
+
+
+    // Scale each chi scalar by alpha.
+    bli_dscals( *alpha, chi0 );
+    bli_dscals( *alpha, chi1 );
+    bli_dscals( *alpha, chi2 );
+    bli_dscals( *alpha, chi3 );
+    bli_dscals( *alpha, chi4 );
+
+    // Broadcast the (alpha*chi?) scalars to all elements of vector registers.
+    chi0v.v = _mm256_broadcast_sd( &chi0 );
+    chi1v.v = _mm256_broadcast_sd( &chi1 );
+    chi2v.v = _mm256_broadcast_sd( &chi2 );
+    chi3v.v = _mm256_broadcast_sd( &chi3 );
+    chi4v.v = _mm256_broadcast_sd( &chi4 );
+
+    // Unit-stride case: update y 8 elements at a time, then 4 at a time,
+    // using vector FMAs; any remaining elements are handled by scalar code.
+    if ( inca == 1 && incy == 1 )
+    {
+        for ( i = 0; (i + 7) < m; i += 8 )
+        {
+            // Load 8 elements of y and of each of the five columns of a.
+            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
+
+            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
+
+            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
+            a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg );
+
+            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
+            a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg );
+
+            a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
+            a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg );
+
+            // perform : y += (alpha*chi_j) * a_j, for columns j = 0..4 in order;
+            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a14v.v, chi4v.v, y1v.v );
+
+
+            // Store the output.
+            _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), y1v.v );
+
+            y0 += n_iter_unroll * n_elem_per_reg;
+            a0 += n_iter_unroll * n_elem_per_reg;
+            a1 += n_iter_unroll * n_elem_per_reg;
+            a2 += n_iter_unroll * n_elem_per_reg;
+            a3 += n_iter_unroll * n_elem_per_reg;
+            a4 += n_iter_unroll * n_elem_per_reg;
+        }
+
+        for( ; (i + 3) < m; i += 4 )
+        {
+            // Load 4 elements of y and of each of the five columns of a.
+            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
+            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
+            a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
+
+
+            // perform : y += (alpha*chi_j) * a_j, for columns j = 0..4 in order;
+            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
+            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
+            y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
+            y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
+            y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
+
+            // Store the output.
+            _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
+
+            y0 += n_elem_per_reg;
+            a0 += n_elem_per_reg;
+            a1 += n_elem_per_reg;
+            a2 += n_elem_per_reg;
+            a3 += n_elem_per_reg;
+            a4 += n_elem_per_reg;
+        }
+
+        // If there are leftover iterations, perform them with scalar code.
+        for ( ; (i + 0) < m ; ++i )
+        {
+            double       y0c = *y0;
+
+            const double a0c = *a0;
+            const double a1c = *a1;
+            const double a2c = *a2;
+            const double a3c = *a3;
+            const double a4c = *a4;
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+            y0c += chi2 * a2c;
+            y0c += chi3 * a3c;
+            y0c += chi4 * a4c;
+
+            *y0 = y0c;
+
+            a0 += 1;
+            a1 += 1;
+            a2 += 1;
+            a3 += 1;
+            a4 += 1;
+            y0 += 1;
+        }
+    }
+    else // General-stride case: scalar code over all m elements.
+    {
+        for ( i = 0; (i + 0) < m ; ++i )
+        {
+            double       y0c = *y0;
+
+            const double a0c = *a0;
+            const double a1c = *a1;
+            const double a2c = *a2;
+            const double a3c = *a3;
+            const double a4c = *a4;
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+            y0c += chi2 * a2c;
+            y0c += chi3 * a3c;
+            y0c += chi4 * a4c;
+
+            *y0 = y0c;
+
+            a0 += inca;
+            a1 += inca;
+            a2 += inca;
+            a3 += inca;
+            a4 += inca;
+            y0 += incy;
+        }
+
+    }
+}
+
diff --git a/ref_kernels/1/bli_addv_ref.c b/ref_kernels/1/bli_addv_ref.c
index 6724cdfd1..bb637d7e6 100644
--- a/ref_kernels/1/bli_addv_ref.c
+++ b/ref_kernels/1/bli_addv_ref.c
@@ -43,7 +43,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        dim_t            n, \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_amaxv_ref.c b/ref_kernels/1/bli_amaxv_ref.c
index 169180f3b..cdfae9568 100644
--- a/ref_kernels/1/bli_amaxv_ref.c
+++ b/ref_kernels/1/bli_amaxv_ref.c
@@ -46,7 +46,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        dim_t            n, \
        ctype*  restrict x, inc_t incx, \
        dim_t*  restrict i_max, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	ctype_r* minus_one = PASTEMAC(chr,m1); \
diff --git a/ref_kernels/1/bli_axpbyv_ref.c b/ref_kernels/1/bli_axpbyv_ref.c
index 2da4bc928..fb48070a5 100644
--- a/ref_kernels/1/bli_axpbyv_ref.c
+++ b/ref_kernels/1/bli_axpbyv_ref.c
@@ -45,7 +45,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict beta, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_axpyv_ref.c b/ref_kernels/1/bli_axpyv_ref.c
index 30076ddaf..295fcf24c 100644
--- a/ref_kernels/1/bli_axpyv_ref.c
+++ b/ref_kernels/1/bli_axpyv_ref.c
@@ -45,7 +45,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict alpha, \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
@@ -135,7 +135,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict alpha, \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_copyv_ref.c b/ref_kernels/1/bli_copyv_ref.c
index 9cf005aae..1202aa896 100644
--- a/ref_kernels/1/bli_copyv_ref.c
+++ b/ref_kernels/1/bli_copyv_ref.c
@@ -43,7 +43,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        dim_t            n, \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_dotv_ref.c b/ref_kernels/1/bli_dotv_ref.c
index f2cfae78b..d17c71dd3 100644
--- a/ref_kernels/1/bli_dotv_ref.c
+++ b/ref_kernels/1/bli_dotv_ref.c
@@ -45,7 +45,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict y, inc_t incy, \
        ctype*  restrict rho, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	ctype dotxy; \
diff --git a/ref_kernels/1/bli_dotxv_ref.c b/ref_kernels/1/bli_dotxv_ref.c
index e2283bcc6..caea62176 100644
--- a/ref_kernels/1/bli_dotxv_ref.c
+++ b/ref_kernels/1/bli_dotxv_ref.c
@@ -47,7 +47,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict y, inc_t incy, \
        ctype*  restrict beta, \
        ctype*  restrict rho, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	ctype dotxy; \
diff --git a/ref_kernels/1/bli_invertv_ref.c b/ref_kernels/1/bli_invertv_ref.c
index 07c52d82d..914663c82 100644
--- a/ref_kernels/1/bli_invertv_ref.c
+++ b/ref_kernels/1/bli_invertv_ref.c
@@ -41,7 +41,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
      ( \
        dim_t            n, \
        ctype*  restrict x, inc_t incx, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_scal2v_ref.c b/ref_kernels/1/bli_scal2v_ref.c
index ba0595990..f4785c228 100644
--- a/ref_kernels/1/bli_scal2v_ref.c
+++ b/ref_kernels/1/bli_scal2v_ref.c
@@ -44,7 +44,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict alpha, \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_scalv_ref.c b/ref_kernels/1/bli_scalv_ref.c
index 3e6be7492..6ca9a88a5 100644
--- a/ref_kernels/1/bli_scalv_ref.c
+++ b/ref_kernels/1/bli_scalv_ref.c
@@ -43,7 +43,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        dim_t            n, \
        ctype*  restrict alpha, \
        ctype*  restrict x, inc_t incx, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_setv_ref.c b/ref_kernels/1/bli_setv_ref.c
index 862ff177d..be6e76cbb 100644
--- a/ref_kernels/1/bli_setv_ref.c
+++ b/ref_kernels/1/bli_setv_ref.c
@@ -43,7 +43,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        dim_t            n, \
        ctype*  restrict alpha, \
        ctype*  restrict x, inc_t incx, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_subv_ref.c b/ref_kernels/1/bli_subv_ref.c
index 6b512909f..ce1ec2079 100644
--- a/ref_kernels/1/bli_subv_ref.c
+++ b/ref_kernels/1/bli_subv_ref.c
@@ -43,7 +43,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        dim_t            n, \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_swapv_ref.c b/ref_kernels/1/bli_swapv_ref.c
index 6f8d54f66..73a90c87b 100644
--- a/ref_kernels/1/bli_swapv_ref.c
+++ b/ref_kernels/1/bli_swapv_ref.c
@@ -42,7 +42,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        dim_t            n, \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_xpbyv_ref.c b/ref_kernels/1/bli_xpbyv_ref.c
index 28286a5f8..0a6844bf1 100644
--- a/ref_kernels/1/bli_xpbyv_ref.c
+++ b/ref_kernels/1/bli_xpbyv_ref.c
@@ -44,7 +44,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict beta, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1f/bli_axpy2v_ref.c b/ref_kernels/1f/bli_axpy2v_ref.c
index 6439ff8b0..0563322ae 100644
--- a/ref_kernels/1f/bli_axpy2v_ref.c
+++ b/ref_kernels/1f/bli_axpy2v_ref.c
@@ -48,7 +48,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict y, inc_t incy, \
        ctype*  restrict z, inc_t incz, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1f/bli_axpyf_ref.c b/ref_kernels/1f/bli_axpyf_ref.c
index 5799a03a6..873cee563 100644
--- a/ref_kernels/1f/bli_axpyf_ref.c
+++ b/ref_kernels/1f/bli_axpyf_ref.c
@@ -48,7 +48,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict a, inc_t inca, inc_t lda, \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( m ) ) return; \
diff --git a/ref_kernels/1f/bli_dotaxpyv_ref.c b/ref_kernels/1f/bli_dotaxpyv_ref.c
index 42936c650..b83b927c9 100644
--- a/ref_kernels/1f/bli_dotaxpyv_ref.c
+++ b/ref_kernels/1f/bli_dotaxpyv_ref.c
@@ -49,7 +49,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict y, inc_t incy, \
        ctype*  restrict rho, \
        ctype*  restrict z, inc_t incz, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( m ) ) return; \
diff --git a/ref_kernels/1f/bli_dotxaxpyf_ref.c b/ref_kernels/1f/bli_dotxaxpyf_ref.c
index 990133621..249b9a6de 100644
--- a/ref_kernels/1f/bli_dotxaxpyf_ref.c
+++ b/ref_kernels/1f/bli_dotxaxpyf_ref.c
@@ -53,7 +53,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict beta, \
        ctype*  restrict y, inc_t incy, \
        ctype*  restrict z, inc_t incz, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	/* A is m x n.                   */ \
diff --git a/ref_kernels/1f/bli_dotxf_ref.c b/ref_kernels/1f/bli_dotxf_ref.c
index 86781fd58..2d2da1318 100644
--- a/ref_kernels/1f/bli_dotxf_ref.c
+++ b/ref_kernels/1f/bli_dotxf_ref.c
@@ -49,7 +49,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict x, inc_t incx, \
        ctype*  restrict beta, \
        ctype*  restrict y, inc_t incy, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	if ( inca == 1 && incx == 1 && incy == 1 && b_n == ff ) \
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
index 5cee5535b..e2008d255 100644
--- a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
+++ b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
@@ -122,7 +122,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict kappa, \
        ctype*  restrict a, inc_t inca, inc_t lda, \
        ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx \
+       cntx_t*          cntx \
      ) \
 { \
 	const num_t dt_r      = PASTEMAC(chr,type); \
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ref.c
index 80ffcbc14..d12ff59ab 100644
--- a/ref_kernels/1m/bli_packm_cxc_diag_ref.c
+++ b/ref_kernels/1m/bli_packm_cxc_diag_ref.c
@@ -66,7 +66,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict kappa, \
        ctype*  restrict a, inc_t inca, inc_t lda, \
        ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx \
+       cntx_t*          cntx \
      ) \
 { \
 	const num_t dt        = PASTEMAC(ch,type); \
diff --git a/ref_kernels/1m/bli_packm_cxk_1er_ref.c b/ref_kernels/1m/bli_packm_cxk_1er_ref.c
index 56d8379be..f3dd3d78f 100644
--- a/ref_kernels/1m/bli_packm_cxk_1er_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_1er_ref.c
@@ -87,7 +87,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict kappa, \
        ctype*  restrict a, inc_t inca, inc_t lda, \
        ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx  \
+       cntx_t*          cntx  \
      ) \
 { \
 	const dim_t dfac = PASTECH2(bb0, _, chr); \
@@ -99,15 +99,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		const dim_t mnr      = PASTECH2(mnr0, _, chr) == -1 ? -1 : PASTECH2(mnr0, _, chr) / 2; \
 		const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ) / 2; \
 \
-		const inc_t       inca2      = 2 * inca; \
-		const inc_t       lda2       = 2 * lda; \
-		const inc_t       ldp2       = 2 * ldp; \
+		const inc_t       inca2   = 2 * inca; \
+		const inc_t       lda2    = 2 * lda; \
+		const inc_t       ldp2    = 2 * ldp; \
 \
-		ctype_r           kappa_r    = ( ( ctype_r* )kappa )[0]; \
-		ctype_r           kappa_i    = ( ( ctype_r* )kappa )[1]; \
-		ctype_r* restrict alpha1     = ( ctype_r* )a; \
-		ctype_r* restrict pi1_ri     = ( ctype_r* )p; \
-		ctype_r* restrict pi1_ir     = ( ctype_r* )p + ldp; \
+		ctype_r           kappa_r = ( ( ctype_r* )kappa )[0]; \
+		ctype_r           kappa_i = ( ( ctype_r* )kappa )[1]; \
+		ctype_r* restrict alpha1  = ( ctype_r* )a; \
+		ctype_r* restrict pi1_ri  = ( ctype_r* )p; \
+		ctype_r* restrict pi1_ir  = ( ctype_r* )p + ldp; \
 \
 		if ( cdim == mnr && mnr != -1 ) \
 		{ \
@@ -140,15 +140,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		const dim_t mnr      = PASTECH2(mnr0, _, chr); \
 		const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ); \
 \
-		const inc_t       inca2      = 2 * inca; \
-		const inc_t       lda2       = 2 * lda; \
-		const inc_t       ldp2       = 2 * ldp; \
+		const inc_t       inca2   = 2 * inca; \
+		const inc_t       lda2    = 2 * lda; \
+		const inc_t       ldp2    = 2 * ldp; \
 \
-		ctype_r           kappa_r    = ( ( ctype_r* )kappa )[0]; \
-		ctype_r           kappa_i    = ( ( ctype_r* )kappa )[1]; \
-		ctype_r* restrict alpha1     = ( ctype_r* )a; \
-		ctype_r* restrict pi1_r      = ( ctype_r* )p; \
-		ctype_r* restrict pi1_i      = ( ctype_r* )p + ldp; \
+		ctype_r           kappa_r = ( ( ctype_r* )kappa )[0]; \
+		ctype_r           kappa_i = ( ( ctype_r* )kappa )[1]; \
+		ctype_r* restrict alpha1  = ( ctype_r* )a; \
+		ctype_r* restrict pi1_r   = ( ctype_r* )p; \
+		ctype_r* restrict pi1_i   = ( ctype_r* )p + ldp; \
 \
 		if ( cdim == mnr && mnr != -1 ) \
 		{ \
diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c
index eefdb464b..efbbc95e4 100644
--- a/ref_kernels/1m/bli_packm_cxk_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_ref.c
@@ -63,7 +63,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict kappa, \
        ctype*  restrict a, inc_t inca, inc_t lda, \
        ctype*  restrict p,             inc_t ldp, \
-       cntx_t* restrict cntx \
+       cntx_t*          cntx \
      ) \
 { \
 	const dim_t     mnr        = PASTECH2(mnr0, _, ch); \
diff --git a/ref_kernels/1m/bli_unpackm_cxk_ref.c b/ref_kernels/1m/bli_unpackm_cxk_ref.c
index 73d98e268..172e93bdf 100644
--- a/ref_kernels/1m/bli_unpackm_cxk_ref.c
+++ b/ref_kernels/1m/bli_unpackm_cxk_ref.c
@@ -61,7 +61,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*  restrict kappa, \
        ctype*  restrict p,             inc_t ldp, \
        ctype*  restrict a, inc_t inca, inc_t lda, \
-       cntx_t* restrict cntx \
+       cntx_t*          cntx \
      ) \
 { \
 	const dim_t     mnr        = PASTECH2(mnr0, _, ch); \
diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c
index f284acb98..26eda0c65 100644
--- a/ref_kernels/3/bli_gemm_ref.c
+++ b/ref_kernels/3/bli_gemm_ref.c
@@ -50,31 +50,31 @@ static void PASTEMAC3(ch,opname,arch,suf) \
        ctype*     restrict b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
-	const num_t     dt     = PASTEMAC(ch,type); \
+	const num_t dt     = PASTEMAC(ch,type); \
 \
-	const inc_t     packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
-	const inc_t     packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
+	const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
+	const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
 \
-	const inc_t     rs_a   = bli_cntx_get_blksz_def_dt( dt, BLIS_BBM, cntx ); \
-	const inc_t     cs_a   = packmr; \
+	const inc_t rs_a   = bli_cntx_get_blksz_def_dt( dt, BLIS_BBM, cntx ); \
+	const inc_t cs_a   = packmr; \
 \
-	const inc_t     rs_b   = packnr; \
-	const inc_t     cs_b   = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \
+	const inc_t rs_b   = packnr; \
+	const inc_t cs_b   = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \
 \
-	ctype           ab[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const inc_t     rs_ab  = 1; \
-	const inc_t     cs_ab  = m; \
+	ctype       ab[ BLIS_STACK_BUF_MAX_SIZE \
+	                / sizeof( ctype ) ] \
+	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const inc_t rs_ab  = 1; \
+	const inc_t cs_ab  = m; \
 \
-	dim_t           l, j, i; \
+	dim_t       l, j, i; \
 \
-	ctype           ai; \
-	ctype           bj; \
+	ctype       ai; \
+	ctype       bj; \
 \
 \
 	/* Initialize the accumulator elements in ab to zero. */ \
@@ -118,18 +118,24 @@ static void PASTEMAC3(ch,opname,arch,suf) \
 	   scale by beta and then add the scaled redult in ab. */ \
 	if ( PASTEMAC(ch,eq0)( *beta ) ) \
 	{ \
-		PASTEMAC(ch,copys_mxn)( m, \
-		                        n, \
-		                        ab, rs_ab, cs_ab, \
-		                        c,  rs_c,  cs_c ); \
+		PASTEMAC(ch,copys_mxn) \
+		( \
+		  m, \
+		  n, \
+		  ab, rs_ab, cs_ab, \
+		  c,  rs_c,  cs_c \
+		); \
 	} \
 	else \
 	{ \
-		PASTEMAC(ch,xpbys_mxn)( m, \
-		                        n, \
-		                        ab, rs_ab, cs_ab, \
-		                        beta, \
-		                        c,  rs_c,  cs_c ); \
+		PASTEMAC(ch,xpbys_mxn) \
+		( \
+		  m, \
+		  n, \
+		  ab, rs_ab, cs_ab, \
+		  beta, \
+		  c,  rs_c,  cs_c \
+		); \
 	} \
 }
 
@@ -153,13 +159,13 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*     restrict b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
 \
-	const dim_t     mr = PASTECH(BLIS_MR_,ch); \
-	const dim_t     nr = PASTECH(BLIS_NR_,ch); \
+	const dim_t mr = PASTECH(BLIS_MR_,ch); \
+	const dim_t nr = PASTECH(BLIS_NR_,ch); \
 \
 	if ( mr == -1 || nr == -1 ) \
 	{ \
diff --git a/ref_kernels/3/bli_gemmsup_ref.c b/ref_kernels/3/bli_gemmsup_ref.c
index 0c3773c1c..9cadb3bd6 100644
--- a/ref_kernels/3/bli_gemmsup_ref.c
+++ b/ref_kernels/3/bli_gemmsup_ref.c
@@ -53,12 +53,12 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
 	/* NOTE: This microkernel can actually handle arbitrarily large
-       values of m, n, and k. */ \
+	   values of m, n, and k. */ \
 \
 	if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
 	{ \
@@ -258,12 +258,12 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
 	/* NOTE: This microkernel can actually handle arbitrarily large
-       values of m, n, and k. */ \
+	   values of m, n, and k. */ \
 \
 	if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
 	{ \
@@ -478,17 +478,17 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
-	const dim_t     mn     = m * n; \
+	ctype       ab[ BLIS_STACK_BUF_MAX_SIZE \
+	                / sizeof( ctype ) ] \
+	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 \
-	ctype           ab[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const inc_t     rs_ab  = n; \
-	const inc_t     cs_ab  = 1; \
+	const dim_t mn    = m * n; \
+	const inc_t rs_ab = n; \
+	const inc_t cs_ab = 1; \
 \
 \
 	/* Assumptions: m <= mr, n <= nr so that the temporary array ab is
@@ -602,17 +602,17 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
-	const dim_t     mn     = m * n; \
+	ctype       ab[ BLIS_STACK_BUF_MAX_SIZE \
+	                / sizeof( ctype ) ] \
+	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 \
-	ctype           ab[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const inc_t     rs_ab  = 1; \
-	const inc_t     cs_ab  = m; \
+	const dim_t mn    = m * n; \
+	const inc_t rs_ab = 1; \
+	const inc_t cs_ab = m; \
 \
 \
 	/* Assumptions: m <= mr, n <= nr so that the temporary array ab is
@@ -725,17 +725,17 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
-	const dim_t     mn     = m * n; \
+	ctype       ab[ BLIS_STACK_BUF_MAX_SIZE \
+	                / sizeof( ctype ) ] \
+	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 \
-	ctype           ab[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const inc_t     rs_ab  = 1; \
-	const inc_t     cs_ab  = m; \
+	const dim_t mn    = m * n; \
+	const inc_t rs_ab = 1; \
+	const inc_t cs_ab = m; \
 \
 \
 	/* Assumptions: m <= mr, n <= nr so that the temporary array ab is
diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c
index 046aa5617..0a11aa052 100644
--- a/ref_kernels/3/bli_gemmtrsm_ref.c
+++ b/ref_kernels/3/bli_gemmtrsm_ref.c
@@ -51,30 +51,30 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*     restrict bx1, \
        ctype*     restrict b11, \
        ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
-	const num_t     dt     = PASTEMAC(ch,type); \
+	const num_t dt     = PASTEMAC(ch,type); \
 \
-	const dim_t     mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	const dim_t     nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
+	const dim_t mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
+	const dim_t nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
 \
-	const inc_t     packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
+	const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
 \
-	const inc_t     rs_b   = packnr; \
-	const inc_t     cs_b   = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \
+	const inc_t rs_b   = packnr; \
+	const inc_t cs_b   = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \
 /*
 printf( "bli_gemmtrsm_ref(): cs_b = %d\n", (int)cs_b ); \
 printf( "bli_gemmtrsm_ref(): k nr = %d %d\n", (int)k, (int)nr ); \
 */ \
 \
-	ctype*          minus_one = PASTEMAC(ch,m1); \
+	ctype*      minus_one = PASTEMAC(ch,m1); \
 \
 	PASTECH(ch,gemm_ukr_ft) \
-	              gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	            gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	PASTECH(ch,trsm_ukr_ft) \
-	              trsm_ukr = bli_cntx_get_ukr_dt( dt, trsmkerid, cntx ); \
+	            trsm_ukr = bli_cntx_get_ukr_dt( dt, trsmkerid, cntx ); \
 \
 /*
 PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b01", k, nr, \
@@ -94,15 +94,15 @@ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \
 	   circumstances where we would want the gemmtrsm_? operations to have
 	   and exercise their own IO preferences -- I'd have to think about it --
 	   but this doesn't seem to be one of them. */ \
-	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : nr ); \
-	const inc_t     cs_ct       = ( col_pref ? mr : 1 ); \
+	const bool      col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
+	const inc_t     rs_ct    = ( col_pref ? 1 : nr ); \
+	const inc_t     cs_ct    = ( col_pref ? mr : 1 ); \
 \
-	const bool      use_ct      = ( m < mr || n < nr ); \
+	const bool      use_ct   = ( m < mr || n < nr ); \
 \
-	ctype* restrict c11_use     = c11; \
-	inc_t           rs_c_use    = rs_c; \
-	inc_t           cs_c_use    = cs_c; \
+	ctype* restrict c11_use  = c11; \
+	inc_t           rs_c_use = rs_c; \
+	inc_t           cs_c_use = cs_c; \
 \
 	if ( use_ct ) \
 	{ \
diff --git a/ref_kernels/3/bli_trsm_ref.c b/ref_kernels/3/bli_trsm_ref.c
index 8234a84cc..f115e2a60 100644
--- a/ref_kernels/3/bli_trsm_ref.c
+++ b/ref_kernels/3/bli_trsm_ref.c
@@ -45,8 +45,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*     restrict a, \
        ctype*     restrict b, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
 	const num_t     dt     = PASTEMAC(ch,type); \
@@ -74,10 +74,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		i        = iter; \
 		n_behind = i; \
 \
-		ctype* restrict alpha11  = a + (i  )*rs_a + (i  )*cs_a; \
-		ctype* restrict a10t     = a + (i  )*rs_a + (0  )*cs_a; \
-		ctype* restrict B0       = b + (0  )*rs_b + (0  )*cs_b; \
-		ctype* restrict b1       = b + (i  )*rs_b + (0  )*cs_b; \
+		ctype* restrict alpha11 = a + (i  )*rs_a + (i  )*cs_a; \
+		ctype* restrict a10t    = a + (i  )*rs_a + (0  )*cs_a; \
+		ctype* restrict B0      = b + (0  )*rs_b + (0  )*cs_b; \
+		ctype* restrict b1      = b + (i  )*rs_b + (0  )*cs_b; \
 \
 		/* b1 = b1 - a10t * B0; */ \
 		/* b1 = b1 / alpha11; */ \
@@ -132,8 +132,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*     restrict a, \
        ctype*     restrict b, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
 	const num_t     dt     = PASTEMAC(ch,type); \
@@ -158,10 +158,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		dim_t i        = m - iter - 1; \
 		dim_t n_behind = iter; \
 \
-		ctype* restrict alpha11  = a + (i  )*rs_a + (i  )*cs_a; \
-		ctype* restrict a12t     = a + (i  )*rs_a + (i+1)*cs_a; \
-		ctype* restrict b1       = b + (i  )*rs_b + (0  )*cs_b; \
-		ctype* restrict B2       = b + (i+1)*rs_b + (0  )*cs_b; \
+		ctype* restrict alpha11 = a + (i  )*rs_a + (i  )*cs_a; \
+		ctype* restrict a12t    = a + (i  )*rs_a + (i+1)*cs_a; \
+		ctype* restrict b1      = b + (i  )*rs_b + (0  )*cs_b; \
+		ctype* restrict B2      = b + (i+1)*rs_b + (0  )*cs_b; \
 \
 		/* b1 = b1 - a12t * B2; */ \
 		/* b1 = b1 / alpha11; */ \
diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c
index 69c546cd4..e094db54b 100644
--- a/ref_kernels/bli_cntx_ref.c
+++ b/ref_kernels/bli_cntx_ref.c
@@ -306,7 +306,7 @@ void GENBARNAME(cntx_init)
 
 	// -- Set level-3 virtual micro-kernels ------------------------------------
 
-	funcs = bli_cntx_ukrs_buf( cntx );
+	funcs = cntx->ukrs;
 
 	// NOTE: We set the virtual micro-kernel slots to contain the addresses
 	// of the native micro-kernels. In general, the ukernels in the virtual
@@ -322,7 +322,7 @@ void GENBARNAME(cntx_init)
 
 	// -- Set level-3 native micro-kernels and preferences ---------------------
 
-	mbools = bli_cntx_ukr_prefs_buf( cntx );
+	mbools = cntx->ukr_prefs;
 
 	gen_func_init( &funcs[ BLIS_GEMM_UKR ],       gemm_ukr_name       );
 	gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name );
@@ -416,7 +416,7 @@ void GENBARNAME(cntx_init)
 
 	// -- Set level-3 small/unpacked handlers ----------------------------------
 
-	vfuncs = bli_cntx_l3_sup_handlers_buf( cntx );
+	vfuncs = cntx->l3_sup_handlers;
 
 	// Initialize all of the function pointers to NULL;
 	for ( i = 0; i < BLIS_NUM_LEVEL3_OPS; ++i ) vfuncs[ i ] = NULL;
@@ -452,7 +452,7 @@ void GENBAINAME(cntx_init)
 
 	// -- Set induced method level-3 virtual micro-kernels ---------------------
 
-	funcs = bli_cntx_ukrs_buf( cntx );
+	funcs = cntx->ukrs;
 
 	if ( method == BLIS_1M )
 	{
@@ -483,8 +483,8 @@ void GENBAINAME(cntx_init)
 	// beta has a zero imaginary component and C is either row- or column-stored).
 	if ( method == BLIS_1M )
 	{
-		func_t* gemm_nat_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_UKR, cntx );
-		func_t* gemm_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, cntx );
+		func_t* gemm_nat_ukrs = ( func_t* )bli_cntx_get_ukrs( BLIS_GEMM_UKR, cntx );
+		func_t* gemm_vir_ukrs = ( func_t* )bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, cntx );
 
 		bli_func_copy_dt( BLIS_FLOAT,  gemm_nat_ukrs, BLIS_FLOAT,  gemm_vir_ukrs );
 		bli_func_copy_dt( BLIS_DOUBLE, gemm_nat_ukrs, BLIS_DOUBLE, gemm_vir_ukrs );
diff --git a/ref_kernels/ind/bli_gemm1m_ref.c b/ref_kernels/ind/bli_gemm1m_ref.c
index 2f0808389..317cf2604 100644
--- a/ref_kernels/ind/bli_gemm1m_ref.c
+++ b/ref_kernels/ind/bli_gemm1m_ref.c
@@ -47,8 +47,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*     restrict b, \
        ctype*     restrict beta, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
 	const num_t       dt        = PASTEMAC(ch,type); \
diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
index 6cfb83cae..1688b688d 100644
--- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c
+++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
@@ -48,8 +48,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*     restrict bx1, \
        ctype*     restrict b11, \
        ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
 	const num_t       dt          = PASTEMAC(ch,type); \
diff --git a/ref_kernels/ind/bli_trsm1m_ref.c b/ref_kernels/ind/bli_trsm1m_ref.c
index 5eda20f20..37551b399 100644
--- a/ref_kernels/ind/bli_trsm1m_ref.c
+++ b/ref_kernels/ind/bli_trsm1m_ref.c
@@ -43,8 +43,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*     restrict a, \
        ctype*     restrict b, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
 	const num_t       dt     = PASTEMAC(ch,type); \
@@ -92,13 +92,13 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			i         = iter; \
 			n_behind  = i; \
 \
-			ctype_r* restrict alpha11_r  = a_r  + (i  )*rs_a2 + (i  )*cs_a2; \
-			ctype_r* restrict alpha11_i  = a_i  + (i  )*rs_a2 + (i  )*cs_a2; \
-			ctype_r* restrict a10t_r     = a_r  + (i  )*rs_a2 + (0  )*cs_a2; \
-			ctype_r* restrict a10t_i     = a_i  + (i  )*rs_a2 + (0  )*cs_a2; \
-			ctype_r* restrict b1_ri      = b_ri + (i  )*rs_b2 + (0  )*cs_b2; \
-			ctype_r* restrict b1_ir      = b_ir + (i  )*rs_b2 + (0  )*cs_b2; \
-			ctype_r* restrict B0_ri      = b_ri + (0  )*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict alpha11_r = a_r  + (i  )*rs_a2 + (i  )*cs_a2; \
+			ctype_r* restrict alpha11_i = a_i  + (i  )*rs_a2 + (i  )*cs_a2; \
+			ctype_r* restrict a10t_r    = a_r  + (i  )*rs_a2 + (0  )*cs_a2; \
+			ctype_r* restrict a10t_i    = a_i  + (i  )*rs_a2 + (0  )*cs_a2; \
+			ctype_r* restrict b1_ri     = b_ri + (i  )*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict b1_ir     = b_ir + (i  )*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict B0_ri     = b_ri + (0  )*rs_b2 + (0  )*cs_b2; \
 \
 			/* b1 = b1 - a10t * B0; */ \
 			/* b1 = b1 / alpha11; */ \
@@ -261,8 +261,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
        ctype*     restrict a, \
        ctype*     restrict b, \
        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx  \
+       auxinfo_t*          data, \
+       cntx_t*             cntx  \
      ) \
 { \
 	const num_t       dt     = PASTEMAC(ch,type); \
diff --git a/testsuite/src/test_axpy2v.c b/testsuite/src/test_axpy2v.c
index eeebf15e7..3019d472b 100644
--- a/testsuite/src/test_axpy2v.c
+++ b/testsuite/src/test_axpy2v.c
@@ -176,7 +176,7 @@ void libblis_test_axpy2v_experiment
 
 
 	// Query a context.
-	cntx = bli_gks_query_cntx();
+	cntx = ( cntx_t* )bli_gks_query_cntx();
 
 	// Use the datatype of the first char in the datatype combination string.
 	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
@@ -220,7 +220,7 @@ void libblis_test_axpy2v_experiment
 	bli_obj_set_conj( conjx, &x );
 	bli_obj_set_conj( conjy, &y );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copyv( &z_save, &z );
diff --git a/testsuite/src/test_axpyf.c b/testsuite/src/test_axpyf.c
index 7a85b2212..42ab73018 100644
--- a/testsuite/src/test_axpyf.c
+++ b/testsuite/src/test_axpyf.c
@@ -174,7 +174,7 @@ void libblis_test_axpyf_experiment
 
 
 	// Query a context.
-	cntx = bli_gks_query_cntx();
+	cntx = ( cntx_t* )bli_gks_query_cntx();
 
 	// Use the datatype of the first char in the datatype combination string.
 	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
@@ -223,7 +223,7 @@ void libblis_test_axpyf_experiment
 	bli_obj_set_conj( conja, &a );
 	bli_obj_set_conj( conjx, &x );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copyv( &y_save, &y );
diff --git a/testsuite/src/test_dotaxpyv.c b/testsuite/src/test_dotaxpyv.c
index 391c119bb..8e09e3ee1 100644
--- a/testsuite/src/test_dotaxpyv.c
+++ b/testsuite/src/test_dotaxpyv.c
@@ -179,7 +179,7 @@ void libblis_test_dotaxpyv_experiment
 
 
 	// Query a context.
-	cntx = bli_gks_query_cntx();
+	cntx = ( cntx_t* )bli_gks_query_cntx();
 
 	// Use the datatype of the first char in the datatype combination string.
 	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
@@ -222,7 +222,7 @@ void libblis_test_dotaxpyv_experiment
 	bli_obj_alias_to( &x, &xt );
 
 	// Determine whether to make a copy of x with or without conjugation.
-	// 
+	//
 	//  conjx conjy  ~conjx^conjy   y is initialized as
 	//  n     n      c              y = conj(x)
 	//  n     c      n              y = x
@@ -239,7 +239,7 @@ void libblis_test_dotaxpyv_experiment
 	bli_obj_set_conj( conjx,  &x );
 	bli_obj_set_conj( conjy,  &y );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copysc( &BLIS_MINUS_ONE, &rho );
diff --git a/testsuite/src/test_dotxaxpyf.c b/testsuite/src/test_dotxaxpyf.c
index a2c3ef3e9..ec519de51 100644
--- a/testsuite/src/test_dotxaxpyf.c
+++ b/testsuite/src/test_dotxaxpyf.c
@@ -184,7 +184,7 @@ void libblis_test_dotxaxpyf_experiment
 
 
 	// Query a context.
-	cntx = bli_gks_query_cntx();
+	cntx = ( cntx_t* )bli_gks_query_cntx();
 
 	// Use the datatype of the first char in the datatype combination string.
 	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
@@ -251,7 +251,7 @@ void libblis_test_dotxaxpyf_experiment
 	bli_obj_set_conj( conjw, &w );
 	bli_obj_set_conj( conjx, &x );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copyv( &y_save, &y );
diff --git a/testsuite/src/test_dotxf.c b/testsuite/src/test_dotxf.c
index 8a1eca4eb..83f4b44eb 100644
--- a/testsuite/src/test_dotxf.c
+++ b/testsuite/src/test_dotxf.c
@@ -176,7 +176,7 @@ void libblis_test_dotxf_experiment
 
 
 	// Query a context.
-	cntx = bli_gks_query_cntx();
+	cntx = ( cntx_t* )bli_gks_query_cntx();
 
 	// Use the datatype of the first char in the datatype combination string.
 	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
@@ -228,7 +228,7 @@ void libblis_test_dotxf_experiment
 	bli_obj_set_conj( conjat, &a );
 	bli_obj_set_conj( conjx, &x );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copyv( &y_save, &y );
diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c
index d37005b28..69ee4339d 100644
--- a/testsuite/src/test_gemm_ukr.c
+++ b/testsuite/src/test_gemm_ukr.c
@@ -181,7 +181,7 @@ void libblis_test_gemm_ukr_experiment
 
 
 	// Query a context.
-	cntx = bli_gks_query_cntx();
+	cntx = ( cntx_t* )bli_gks_query_cntx();
 
 	// Use the datatype of the first char in the datatype combination string.
 	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c
index 48fcb78db..44ba51587 100644
--- a/testsuite/src/test_gemmtrsm_ukr.c
+++ b/testsuite/src/test_gemmtrsm_ukr.c
@@ -207,7 +207,7 @@ void libblis_test_gemmtrsm_ukr_experiment
 
 
 	// Query a context.
-	cntx = bli_gks_query_cntx();
+	cntx = ( cntx_t* )bli_gks_query_cntx();
 
 	// Use the datatype of the first char in the datatype combination string.
 	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index edab9796d..f267ae158 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -977,7 +977,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	libblis_test_fprintf_c( os, "\n" );
 
 	// Query a native context.
-	cntx = bli_gks_query_nat_cntx();
+	cntx = ( cntx_t* )bli_gks_query_nat_cntx();
 
 	libblis_test_fprintf_c( os, "level-3 blocksizes             s       d       c       z \n" );
 	libblis_test_fprintf_c( os, "  mc                     %7d %7d %7d %7d\n",
@@ -1081,8 +1081,8 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	libblis_test_fprintf_c( os, "\n" );
 
 	// Query a native context.
-	cntx_c = bli_gks_query_ind_cntx( im, BLIS_SCOMPLEX );
-	cntx_z = bli_gks_query_ind_cntx( im, BLIS_DCOMPLEX );
+	cntx_c = ( cntx_t* )bli_gks_query_ind_cntx( im, BLIS_SCOMPLEX );
+	cntx_z = ( cntx_t* )bli_gks_query_ind_cntx( im, BLIS_DCOMPLEX );
 
 	libblis_test_fprintf_c( os, "level-3 blocksizes                             c       z \n" );
 	libblis_test_fprintf_c( os, "  mc                                     %7d %7d\n",
@@ -2178,7 +2178,7 @@ void libblis_test_op_driver
 				// Query the implementation string associated with the
 				// current operation and datatype. If the operation is
 				// not level-3, we will always get back the native string.
-				ind_str = bli_ind_oper_get_avail_impl_string( op->opid, datatype );
+				ind_str = ( char* )bli_ind_oper_get_avail_impl_string( op->opid, datatype );
 
 				// Loop over the requested parameter combinations.
 				for ( pci = 0; pci < n_param_combos; ++pci )
@@ -3051,7 +3051,7 @@ void libblis_test_parse_command_line( int argc, char** argv )
 	bli_getopt_init_state( 0, &state );
 
 	// Process all option arguments until we get a -1, which means we're done.
-	while( (opt = bli_getopt( argc, argv, "g:o:", &state )) != -1 )
+	while( (opt = bli_getopt( argc, ( const char** )argv, "g:o:", &state )) != -1 )
 	{
 		// Explicitly typecast opt, which is an int, to a char. (Failing to
 		// typecast resulted in at least one user-reported problem whereby
diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c
index 9568dfee7..5f4988e1c 100644
--- a/testsuite/src/test_trsm_ukr.c
+++ b/testsuite/src/test_trsm_ukr.c
@@ -186,7 +186,7 @@ void libblis_test_trsm_ukr_experiment
 
 
 	// Query a context.
-	cntx = bli_gks_query_cntx();
+	cntx = ( cntx_t* )bli_gks_query_cntx();
 
 	// Use the datatype of the first char in the datatype combination string.
 	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );

From 6431c9e13b86e4442b6aacba18a0ace12288c955 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 14 Apr 2022 13:01:24 -0500
Subject: [PATCH 052/230] Added missing 'const' to zen bli_gemm_small.c.

Details:
- Added missing 'const' qualifiers to signatures of functions defined in
  kernels/zen/3/bli_gemm_small.c. This fixes compile-time errors when
  targeting 'zen3' subconfig (which apparently is enabling AMD's
  gemm_small code path by default). Thanks to Devin Matthews for
  reporting this error.
---
 kernels/zen/3/bli_gemm_small.c | 126 ++++++++++++++++-----------------
 1 file changed, 63 insertions(+), 63 deletions(-)

diff --git a/kernels/zen/3/bli_gemm_small.c b/kernels/zen/3/bli_gemm_small.c
index b04ffea58..890c5bc2d 100644
--- a/kernels/zen/3/bli_gemm_small.c
+++ b/kernels/zen/3/bli_gemm_small.c
@@ -55,46 +55,46 @@
 #define AT_MR 4 // The kernel dimension of the A transpose GEMM kernel.(AT_MR * NR).
 static err_t bli_sgemm_small
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             cntl_t* cntl
      );
 
 static err_t bli_dgemm_small
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             cntl_t* cntl
      );
 
 static err_t bli_sgemm_small_atbn
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             cntl_t* cntl
      );
 
 static err_t bli_dgemm_small_atbn
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             cntl_t* cntl
      );
 /*
 * The bli_gemm_small function will use the
@@ -103,13 +103,13 @@ static err_t bli_dgemm_small_atbn
 */
 err_t bli_gemm_small
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             cntl_t* cntl
      )
 {
 	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
@@ -168,13 +168,13 @@ err_t bli_gemm_small
 
 static err_t bli_sgemm_small
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             cntl_t* cntl
      )
 {
 	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
@@ -1719,13 +1719,13 @@ static err_t bli_sgemm_small
 
 static err_t bli_dgemm_small
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             cntl_t* cntl
      )
 {
 
@@ -3327,13 +3327,13 @@ static err_t bli_dgemm_small
 
 static err_t bli_sgemm_small_atbn
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             cntl_t* cntl
      )
 {
 	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO);
@@ -3804,13 +3804,13 @@ static err_t bli_sgemm_small_atbn
 
 static err_t bli_dgemm_small_atbn
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             cntl_t* cntl
      )
 {
 	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO);

From 1c733402a95ab08b20f3332c2397fd52a2627cf6 Mon Sep 17 00:00:00 2001
From: Jed Brown <jed@jedbrown.org>
Date: Thu, 28 Apr 2022 11:58:44 -0600
Subject: [PATCH 053/230] Fix version check for znver3, which needs gcc >= 10.3
 (#628)

Apple's clang-12 lacks znver3 support, unlike upstream clang-12.
---
 build/config.mk.in       |  2 +-
 config/zen3/make_defs.mk |  6 +++++-
 configure                | 16 ++++++++--------
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/build/config.mk.in b/build/config.mk.in
index 56d6211c2..1b3468642 100644
--- a/build/config.mk.in
+++ b/build/config.mk.in
@@ -94,7 +94,7 @@ CC                := @CC@
 GCC_OT_4_9_0      := @gcc_older_than_4_9_0@
 GCC_OT_6_1_0      := @gcc_older_than_6_1_0@
 GCC_OT_9_1_0      := @gcc_older_than_9_1_0@
-GCC_OT_10_1_0     := @gcc_older_than_10_1_0@
+GCC_OT_10_3_0     := @gcc_older_than_10_3_0@
 CLANG_OT_9_0_0    := @clang_older_than_9_0_0@
 CLANG_OT_12_0_0   := @clang_older_than_12_0_0@
 AOCC_OT_2_0_0     := @aocc_older_than_2_0_0@
diff --git a/config/zen3/make_defs.mk b/config/zen3/make_defs.mk
index 5c68855db..cfeca4f5d 100644
--- a/config/zen3/make_defs.mk
+++ b/config/zen3/make_defs.mk
@@ -71,7 +71,7 @@ ifeq ($(CC_VENDOR),gcc)
   ifeq ($(GCC_OT_9_1_0),yes)  # gcc versions older than 9.1.
     CVECFLAGS_VER  := -march=znver1 -mno-avx256-split-unaligned-store
   else
-  ifeq ($(GCC_OT_10_1_0),yes) # gcc versions 9.1 or newer, but older than 10.1.
+  ifeq ($(GCC_OT_10_3_0),yes) # gcc versions 9.1 or newer, but older than 10.3.
     CVECFLAGS_VER  := -march=znver2
   else                        # gcc versions 10.1 or newer.
     CVECFLAGS_VER  := -march=znver3
@@ -84,10 +84,14 @@ ifeq ($(CC_VENDOR),clang)
   else
   ifeq ($(CLANG_OT_12_0_0),yes) # clang versions 9.0 or newer, but older than 12.0.
     CVECFLAGS_VER  := -march=znver2
+  else
+  ifeq ($(OS_NAME),Darwin)      # clang version 12.0 on OSX lacks znver3 support
+    CVECFLAGS_VER  := -march=znver2
   else                          # clang versions 12.0 or newer.
     CVECFLAGS_VER  := -march=znver3
   endif
   endif
+  endif
 else
 ifeq ($(CC_VENDOR),aocc)
   ifeq ($(AOCC_OT_2_0_0),yes)   # aocc versions older than 2.0.
diff --git a/configure b/configure
index f64aac705..7e825f1dc 100755
--- a/configure
+++ b/configure
@@ -1791,8 +1791,8 @@ check_compiler_version_ranges()
 	#   [5] https://gcc.gnu.org/onlinedocs/gcc-8.3.0/gcc/x86-Options.html#x86-Options
 	#   [6] https://gcc.gnu.org/onlinedocs/gcc-9.4.0/gcc/x86-Options.html#x86-Options
 	#
-	# range: gcc < 10.1 (ie: 9.4 or older)
-	# variable: gcc_older_than_10_1_0
+	# range: gcc < 10.3 (ie: 10.2 or older)
+	# variable: gcc_older_than_10_3_0
 	# comments:
 	#   These older versions of gcc do not explicitly support the Zen3
 	#   microarchitecture; the newest microarchitectural value understood by
@@ -1806,7 +1806,7 @@ check_compiler_version_ranges()
 	gcc_older_than_4_9_0='no'
 	gcc_older_than_6_1_0='no'
 	gcc_older_than_9_1_0='no'
-	gcc_older_than_10_1_0='no'
+	gcc_older_than_10_3_0='no'
 
 	clang_older_than_9_0_0='no'
 	clang_older_than_12_0_0='no'
@@ -1839,10 +1839,10 @@ check_compiler_version_ranges()
 			gcc_older_than_9_1_0='yes'
 		fi
 
-		# Check for gcc < 10.1.0 (ie: 9.4 or older).
-		if [ ${cc_major} -lt 10 ]; then
-			echo "${script_name}: note: found ${cc} version older than 10.1."
-			gcc_older_than_10_1_0='yes'
+		# Check for gcc < 10.3.0 (ie: 10.2 or older).
+		if [[ ( ${cc_major} -lt 10 ) || ( ${cc_major} -eq 10 && ${cc_minor} -lt 3 ) ]]; then
+			echo "${script_name}: note: found ${cc} version older than 10.3."
+			gcc_older_than_10_3_0='yes'
 		fi
 	fi
 
@@ -3835,7 +3835,7 @@ main()
 		| sed -e "s/@gcc_older_than_4_9_0@/${gcc_older_than_4_9_0}/g" \
 		| sed -e "s/@gcc_older_than_6_1_0@/${gcc_older_than_6_1_0}/g" \
 		| sed -e "s/@gcc_older_than_9_1_0@/${gcc_older_than_9_1_0}/g" \
-		| sed -e "s/@gcc_older_than_10_1_0@/${gcc_older_than_10_1_0}/g" \
+		| sed -e "s/@gcc_older_than_10_3_0@/${gcc_older_than_10_3_0}/g" \
 		| sed -e "s/@clang_older_than_9_0_0@/${clang_older_than_9_0_0}/g" \
 		| sed -e "s/@clang_older_than_12_0_0@/${clang_older_than_12_0_0}/g" \
 		| sed -e "s/@aocc_older_than_2_0_0@/${aocc_older_than_2_0_0}/g" \

From 64a9b061f6032e2b59613aecdbe7bb52161605c1 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 10 May 2022 14:54:22 -0500
Subject: [PATCH 054/230] Fixed misspelling of 'xpbys' in gemm macrokernel.

Details:
- Fixed a functionally harmless typo in bli_gemm_ker_var2.c where a few
  instances of the substring "xpbys" were misspelled as "xbpys". The
  misspellings were harmless because they were consistent, and because
  they referenced only local symbols.
---
 frame/3/gemm/bli_gemm_ker_var2.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c
index 814b47c0c..199e72cb6 100644
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -69,10 +69,10 @@ void PASTEMAC2(chx,chy,op) \
 	); \
 }
 
-INSERT_GENTFUNC2_BASIC0(xbpys_mxn_fn);
-INSERT_GENTFUNC2_MIXDP0(xbpys_mxn_fn);
+INSERT_GENTFUNC2_BASIC0(xpbys_mxn_fn);
+INSERT_GENTFUNC2_MIXDP0(xpbys_mxn_fn);
 
-static xpbys_mxn_vft GENARRAY2_ALL(xbpys_mxn, xbpys_mxn_fn);
+static xpbys_mxn_vft GENARRAY2_ALL(xpbys_mxn, xpbys_mxn_fn);
 
 
 void bli_gemm_ker_var2
@@ -345,7 +345,7 @@ void bli_gemm_ker_var2
 				);
 
 				// Accumulate to C with type-casting.
-				xbpys_mxn[ dt_exec ][ dt_c ]
+				xpbys_mxn[ dt_exec ][ dt_c ]
 				(
 				    m_cur, n_cur,
 				    &ct, rs_ct, cs_ct,

From 4603324eb090dfceaad3693a70b2d60544036aa8 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 19 May 2022 14:07:03 -0500
Subject: [PATCH 055/230] Init/finalize via bli_pthread_switch_t API (#634).

Details:
- Defined and implemented a new pthread-like abstract datatype and API
  in bli_pthread.c. The new type, bli_pthread_switch_t, is similar to
  bli_pthread_once_t in some respects. The idea is that like a switch in
  your home that controls a light or ceiling fan, it can either be on or
  off. The switch starts in the off state. Moving from one state to the
  other (on to off; off to on) causes some action (i.e., a startup or
  shutdown function) to be executed. Trying to move from one state to
  the same state (on to on; off to off) is safe in that it results in
  no action. Unlike bli_pthread_once(), the API for bli_pthread_switch_t
  contains both _on() and _off() interfaces. Also, unlike the _once()
  function, the _on() and _off() functions return error codes so that
  the 'int' error code returned from the startup or shutdown functions
  may be passed back to the caller. Thanks to Devin Matthews for his
  input and feedback on this feature.
- Replaced the previous implementation of bli_init_once() and
  bli_finalize_once() -- both of which used bli_pthread_once() -- with
  ones that rely upon bli_pthread_switch_on() and _switch_off(),
  respectively. This also required updating the return types of
  _init_apis() and _finalize_apis() to match the function pointer type
  required by bli_pthread_switch_on()/_switch_off().
- Comment updates.
---
 frame/base/bli_init.c      |  32 +++--------
 frame/base/bli_init.h      |   6 +-
 frame/thread/bli_pthread.c | 114 +++++++++++++++++++++++++++++++++++++
 frame/thread/bli_pthread.h |  25 ++++++++
 4 files changed, 149 insertions(+), 28 deletions(-)

diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c
index e616ac2d7..f1baa2c21 100644
--- a/frame/base/bli_init.c
+++ b/frame/base/bli_init.c
@@ -64,25 +64,21 @@ void bli_finalize_auto( void )
 
 // -----------------------------------------------------------------------------
 
-// A pthread_once_t variable is a pthread structure used in pthread_once().
-// pthread_once() is guaranteed to execute exactly once among all threads that
-// pass in this control object (until/unless the variable is reset).
-static bli_pthread_once_t once_init     = BLIS_PTHREAD_ONCE_INIT;
-static bli_pthread_once_t once_finalize = BLIS_PTHREAD_ONCE_INIT;
+static bli_pthread_switch_t lib_state = BLIS_PTHREAD_SWITCH_INIT;
 
 void bli_init_once( void )
 {
-	bli_pthread_once( &once_init, bli_init_apis );
+	bli_pthread_switch_on( &lib_state, bli_init_apis );
 }
 
 void bli_finalize_once( void )
 {
-	bli_pthread_once( &once_finalize, bli_finalize_apis );
+	bli_pthread_switch_off( &lib_state, bli_finalize_apis );
 }
 
 // -----------------------------------------------------------------------------
 
-void bli_init_apis( void )
+int bli_init_apis( void )
 {
 	// Initialize various sub-APIs.
 	bli_gks_init();
@@ -91,17 +87,10 @@ void bli_init_apis( void )
 	bli_pack_init();
 	bli_memsys_init();
 
-	// Reset the control variable that will allow finalization.
-	// NOTE: We must initialize a fresh pthread_once_t object and THEN copy the
-	// contents to the static control variable because some implementations of
-	// pthreads define pthread_once_t as a struct and BLIS_PTHREAD_ONCE_INIT as
-	// a struct initializer expression (i.e. { ... }), which cannot be used in
-	// post-declaration struct assignment in strict C99.
-	const bli_pthread_once_t once_new = BLIS_PTHREAD_ONCE_INIT;
-	once_finalize = once_new;
+	return 0;
 }
 
-void bli_finalize_apis( void )
+int bli_finalize_apis( void )
 {
 	// Finalize various sub-APIs.
 	bli_memsys_finalize();
@@ -110,13 +99,6 @@ void bli_finalize_apis( void )
 	bli_ind_finalize();
 	bli_gks_finalize();
 
-	// Reset the control variable that will allow (re-)initialization.
-	// NOTE: We must initialize a fresh pthread_once_t object and THEN copy the
-	// contents to the static control variable because some implementations of
-	// pthreads define pthread_once_t as a struct and BLIS_PTHREAD_ONCE_INIT as
-	// a struct initializer expression (i.e. { ... }), which cannot be used in
-	// post-declaration struct assignment in strict C99.
-	const bli_pthread_once_t once_new = BLIS_PTHREAD_ONCE_INIT;
-	once_init = once_new;
+	return 0;
 }
 
diff --git a/frame/base/bli_init.h b/frame/base/bli_init.h
index f174ac0f9..d1bea0cb3 100644
--- a/frame/base/bli_init.h
+++ b/frame/base/bli_init.h
@@ -38,9 +38,9 @@ BLIS_EXPORT_BLIS void bli_finalize( void );
 void bli_init_auto( void );
 void bli_finalize_auto( void );
 
-void bli_init_apis( void );
-void bli_finalize_apis( void );
-
 void bli_init_once( void );
 void bli_finalize_once( void );
 
+int  bli_init_apis( void );
+int  bli_finalize_apis( void );
+
diff --git a/frame/thread/bli_pthread.c b/frame/thread/bli_pthread.c
index a09935661..804ace46d 100644
--- a/frame/thread/bli_pthread.c
+++ b/frame/thread/bli_pthread.c
@@ -693,3 +693,117 @@ int bli_pthread_barrier_wait
 
 #endif
 
+// -- Non-standard extensions --------------------------------------------------
+
+// -- pthread_switch --
+
+//
+// Note that bli_pthread_switch_t has the following properties:
+//
+// 1. Access to a switch is protected by a mutex specific to that switch, and
+//    therefore state changes are thread-safe.
+//
+// 2. An initialized switch always starts in the "off" state.
+//
+// 3. Calling _switch_on() when the switch is already "on" results in an early
+//    return (no action); similar for _switch_off() when it is already "off".
+//
+// 4. The _switch_on() and _switch_off() functions each return an error code
+//    that is equal to the return value of their user-supplied functions,
+//    provided the function in question was actually called rather than being
+//    skipped. When a function call is skipped (as in (3) above), the return
+//    value from _switch_on() and/or _switch_off() is 0 (success).
+//
+//    Note that the user-supplied functions must abide by the convention that a
+//    return value of 0 indicates success and all other values indicate failure
+//    (of some kind). The switch and the user-supplied function must agree on
+//    how "success" is conveyed because the switch must know whether to toggle
+//    its state after inspecting the return value of the user-supplied function.
+//
+
+int bli_pthread_switch_on
+     (
+       bli_pthread_switch_t* sw,
+       int                 (*init)(void)
+     )
+{
+	// NOTE: This function assumes that init() will return 0 on success;
+	// otherwise, it will return some other integer. If the function
+	// partially succeeds (in such a way that it must be called again in
+	// order to complete), it should treat that outcome as failure and
+	// return a non-zero value.
+
+	// Initialize the return value with the error code for success.
+	int r_val = 0;
+
+	// Proceed only if the switch is currently off; otherwise, we return with
+	// an error code of 0.
+	if ( sw->status == 0 )
+	{
+		// Wait for and acquire the switch's lock.
+		bli_pthread_mutex_lock( &sw->mutex );
+
+		// Check the status of the switch once more now that we've acquired the
+		// lock. Proceed with calling the init() function only if the switch
+		// is still off; otherwise, release the lock with an error code of 0.
+		if ( sw->status == 0 )
+		{
+			// Call the init() function and catch its return value in r_val.
+			r_val = init();
+
+			// If the init() function succeeded, turn the switch on;
+			// otherwise, leave the switch off.
+			if ( r_val == 0 )
+				sw->status = 1;
+		}
+
+		// Release the switch's lock.
+		bli_pthread_mutex_unlock( &sw->mutex );
+	}
+
+	return r_val;
+}
+
+int bli_pthread_switch_off
+     (
+       bli_pthread_switch_t* sw,
+       int                 (*deinit)(void)
+     )
+{
+	// NOTE: This function assumes that deinit() will return 0 on success;
+	// otherwise, it will return some other integer. If the function
+	// partially succeeds (in such a way that it must be called again in
+	// order to complete), it should treat that outcome as failure and
+	// return a non-zero value.
+
+	// Initialize the return value with the error code for success.
+	int r_val = 0;
+
+	// Proceed only if the switch is currently on; otherwise, we return with
+	// an error code of 0.
+	if ( sw->status == 1 )
+	{
+		// Wait for and acquire the switch's lock.
+		bli_pthread_mutex_lock( &sw->mutex );
+
+		// Check the status of the switch once more now that we've acquired the
+		// lock. Proceed with calling the deinit() function only if the switch
+		// is still on; otherwise, release the lock with an error code of 0.
+		if ( sw->status == 1 )
+		{
+			// Call the deinit() function and catch its return value in r_val.
+			r_val = deinit();
+
+			// If the deinit() function succeeded, turn the switch off;
+			// otherwise, leave the switch on.
+			if ( r_val == 0 )
+				sw->status = 0;
+		}
+
+		// Release the switch's lock.
+		bli_pthread_mutex_unlock( &sw->mutex );
+	}
+
+	return r_val;
+}
+
diff --git a/frame/thread/bli_pthread.h b/frame/thread/bli_pthread.h
index be786aa39..dcf0db212 100644
--- a/frame/thread/bli_pthread.h
+++ b/frame/thread/bli_pthread.h
@@ -270,4 +270,29 @@ BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
        bli_pthread_barrier_t* barrier
      );
 
+// -- Non-standard extensions --------------------------------------------------
+
+// -- pthread_switch --
+
+typedef struct
+{
+    int                 status;
+    bli_pthread_mutex_t mutex;
+} bli_pthread_switch_t;
+
+#define BLIS_PTHREAD_SWITCH_INIT { .status = 0, \
+                                   .mutex  = BLIS_PTHREAD_MUTEX_INITIALIZER }
+
+int bli_pthread_switch_on
+     (
+       bli_pthread_switch_t* sw,
+       int                 (*init)(void)
+     );
+
+int bli_pthread_switch_off
+     (
+       bli_pthread_switch_t* sw,
+       int                 (*deinit)(void)
+     );
+
 #endif // BLIS_PTHREAD_H

From 56772892450cc92b3fbd6a9d0460153a43fc47ab Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 1 Jun 2022 10:49:33 -0500
Subject: [PATCH 056/230] Added SMU citation to README.md intro.

Details:
- Added a citation to SMU and the Matthews Research Group to the general
  attribution of maintainership and development in the Introduction of
  the README.md file. Thanks to Robert van de Geijn and Devin Matthews
  for suggesting this change.
---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3803acdca..7996cb676 100644
--- a/README.md
+++ b/README.md
@@ -72,7 +72,9 @@ The BLIS framework is primarily developed and maintained by individuals in the
 [Science of High-Performance Computing](http://shpc.ices.utexas.edu/)
 (SHPC) group in the
 [Oden Institute for Computational Engineering and Sciences](https://www.oden.utexas.edu/)
-at [The University of Texas at Austin](https://www.utexas.edu/).
+at [The University of Texas at Austin](https://www.utexas.edu/)
+and in the [Matthews Research Group](https://matthewsresearchgroup.webstarts.com/)
+at [Southern Methodist University](https://www.smu.edu/).
 Please visit the [SHPC](http://shpc.ices.utexas.edu/) website for more
 information about our research group, such as a list of
 [people](http://shpc.ices.utexas.edu/people.html)

From d93df023348144e091f7b3e3053995648f348aa7 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 15 Jun 2022 14:09:49 -0500
Subject: [PATCH 057/230] Removed unused dt arg in bli_gks_query_ind_cntx().

Details:
- Removed the num_t datatype argument from bli_gks_query_ind_cntx().
  This argument stopped being needed by the function in commit e9da642.
  Its only use in bli_gks_query_ind_cntx() was to be passed through to
  the context initialization function for the chosen induced method,
  but even then, commit log notes from e9da642 indicate that I could not
  recall why the datatype argument was ever needed by the context init
  function to begin with.
- Updated all invocations of bli_gks_query_ind_cntx() to omit the dt
  argument. Most of these invocations resided in various standalone test
  drivers (and the testsuite).
---
 frame/3/bli_l3_oapi_ex.c           | 14 +++++++-------
 frame/3/gemm/bli_gemm_md.c         |  3 +--
 frame/base/bli_gks.c               |  5 ++---
 frame/base/bli_gks.h               |  2 +-
 frame/compat/extra/bla_gemm3m.c    |  4 ++--
 test/1m4m/test_gemm.c              |  2 +-
 test/3/test_gemm.c                 |  2 +-
 test/3/test_hemm.c                 |  2 +-
 test/3/test_herk.c                 |  2 +-
 test/3/test_trmm.c                 |  2 +-
 test/3/test_trsm.c                 |  2 +-
 test/studies/skx/test_gemm.c       |  2 +-
 test/studies/skx/test_hemm.c       |  2 +-
 test/studies/skx/test_syrk.c       |  2 +-
 test/studies/skx/test_trmm.c       |  2 +-
 test/studies/thunderx2/test_gemm.c |  2 +-
 test/studies/thunderx2/test_hemm.c |  2 +-
 test/studies/thunderx2/test_syrk.c |  2 +-
 test/studies/thunderx2/test_trmm.c |  2 +-
 testsuite/src/test_libblis.c       |  8 +++++---
 20 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c
index e4c815fe3..20b0294eb 100644
--- a/frame/3/bli_l3_oapi_ex.c
+++ b/frame/3/bli_l3_oapi_ex.c
@@ -102,7 +102,7 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF)
 
 	// If necessary, obtain a valid context from the gks using the induced
 	// method id determined above.
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im );
 
 	// Check the operands.
 	if ( bli_error_checking_is_enabled() )
@@ -153,7 +153,7 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)
 
 	// If necessary, obtain a valid context from the gks using the induced
 	// method id determined above.
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im );
 
 	// Check the operands.
 	if ( bli_error_checking_is_enabled() )
@@ -281,7 +281,7 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF)
 
 	// If necessary, obtain a valid context from the gks using the induced
 	// method id determined above.
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im );
 
 	// Check the operands.
 	if ( bli_error_checking_is_enabled() )
@@ -331,7 +331,7 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF)
 
 	// If necessary, obtain a valid context from the gks using the induced
 	// method id determined above.
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im );
 
 	// Check the operands.
 	if ( bli_error_checking_is_enabled() )
@@ -381,7 +381,7 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF)
 
 	// If necessary, obtain a valid context from the gks using the induced
 	// method id determined above.
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im );
 
 	// Check the operands.
 	if ( bli_error_checking_is_enabled() )
@@ -487,7 +487,7 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF)
 
 	// If necessary, obtain a valid context from the gks using the induced
 	// method id determined above.
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im );
 
 	// Check the operands.
 	if ( bli_error_checking_is_enabled() )
@@ -534,7 +534,7 @@ void PASTEMAC(trsm,BLIS_OAPI_EX_SUF)
 
 	// If necessary, obtain a valid context from the gks using the induced
 	// method id determined above.
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im );
 
 	// Check the operands.
 	if ( bli_error_checking_is_enabled() )
diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c
index a283c1235..1e23d058e 100644
--- a/frame/3/gemm/bli_gemm_md.c
+++ b/frame/3/gemm/bli_gemm_md.c
@@ -439,8 +439,7 @@ mddm_t bli_gemm_md_rcc
 	// the target datatype. (The packm_blk_var1_md() function has "built-in"
 	// support for packing to 1r (and 1e) schemas, whereas the
 	// packm_blk_var1() function relies on packm kernels for packing to 1r.
-	const num_t   dt_complex = bli_obj_dt( a );
-	const cntx_t* cntx_1m    = bli_gks_query_ind_cntx( BLIS_1M, dt_complex );
+	const cntx_t* cntx_1m     = bli_gks_query_ind_cntx( BLIS_1M );
 
 	const func_t* packm_1m_mr = bli_cntx_get_ukrs( BLIS_PACKM_MRXK_KER, cntx_1m );
 	const func_t* packm_1m_nr = bli_cntx_get_ukrs( BLIS_PACKM_NRXK_KER, cntx_1m );
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index 4a7ccbbc3..ff80f85ed 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -509,8 +509,7 @@ static bli_pthread_mutex_t gks_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
 
 const cntx_t* bli_gks_query_ind_cntx
      (
-       ind_t ind,
-       num_t dt
+       ind_t ind
      )
 {
 	bli_init_once();
@@ -675,7 +674,7 @@ const char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt )
 	// Query the context for the current induced method and datatype, and
 	// then query the ukernel function pointer for the given datatype from
 	// that context.
-	const cntx_t* cntx = bli_gks_query_ind_cntx( method, dt );
+	const cntx_t* cntx = bli_gks_query_ind_cntx( method );
 	void_fp fp         = bli_cntx_get_ukr_dt( dt, ukr, cntx );
 
 	// Check whether the ukernel function pointer is NULL for the given
diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h
index 30e3b2e39..3a93fd59e 100644
--- a/frame/base/bli_gks.h
+++ b/frame/base/bli_gks.h
@@ -50,7 +50,7 @@ BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_nat_cntx( void );
 
 const cntx_t*                  bli_gks_query_cntx_noinit( void );
 
-BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt );
+BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_ind_cntx( ind_t ind );
 
 BLIS_EXPORT_BLIS void          bli_gks_init_ref_cntx( cntx_t* cntx );
 
diff --git a/frame/compat/extra/bla_gemm3m.c b/frame/compat/extra/bla_gemm3m.c
index 31f677db6..258ac5bbb 100644
--- a/frame/compat/extra/bla_gemm3m.c
+++ b/frame/compat/extra/bla_gemm3m.c
@@ -103,7 +103,7 @@ void PASTEF77(ch,blasname) \
 	   abbreviated version of bli_gemm_ex() so that we can bypass
 	   consideration of sup, which doesn't make sense in this context. */ \
 	{ \
-		cntx_t* cntx = ( cntx_t* )bli_gks_query_ind_cntx( BLIS_1M, dt ); \
+		cntx_t* cntx = ( cntx_t* )bli_gks_query_ind_cntx( BLIS_1M ); \
 \
 		rntm_t  rntm_l; \
 		rntm_t* rntm = &rntm_l; \
@@ -222,7 +222,7 @@ void PASTEF77(ch,blasname) \
 	   abbreviated version of bli_gemm_ex() so that we can bypass
 	   consideration of sup, which doesn't make sense in this context. */ \
 	{ \
-		cntx_t* cntx = ( cntx_t* )bli_gks_query_ind_cntx( BLIS_1M, dt ); \
+		cntx_t* cntx = ( cntx_t* )bli_gks_query_ind_cntx( BLIS_1M ); \
 \
 		rntm_t  rntm_l; \
 		rntm_t* rntm = &rntm_l; \
diff --git a/test/1m4m/test_gemm.c b/test/1m4m/test_gemm.c
index f9a855125..87bdceb11 100644
--- a/test/1m4m/test_gemm.c
+++ b/test/1m4m/test_gemm.c
@@ -109,7 +109,7 @@ int main( int argc, char** argv )
 	ind_t ind_mod = ind;
 
 	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
+	cntx = bli_gks_query_ind_cntx( ind_mod );
 
 	// Set k to the kc blocksize for the current datatype.
 	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
diff --git a/test/3/test_gemm.c b/test/3/test_gemm.c
index 745dae07c..96992f4a1 100644
--- a/test/3/test_gemm.c
+++ b/test/3/test_gemm.c
@@ -109,7 +109,7 @@ int main( int argc, char** argv )
 	ind_t ind_mod = ind;
 
 	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
+	cntx = bli_gks_query_ind_cntx( ind_mod );
 
 	// Set k to the kc blocksize for the current datatype.
 	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
diff --git a/test/3/test_hemm.c b/test/3/test_hemm.c
index 8df46f0f0..537378d43 100644
--- a/test/3/test_hemm.c
+++ b/test/3/test_hemm.c
@@ -87,7 +87,7 @@ int main( int argc, char** argv )
 	ind_t ind_mod = ind;
 
 	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
+	cntx = bli_gks_query_ind_cntx( ind_mod );
 
 	// Set k to the kc blocksize for the current datatype.
 	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
diff --git a/test/3/test_herk.c b/test/3/test_herk.c
index 65dcb9f6c..6dbaf1936 100644
--- a/test/3/test_herk.c
+++ b/test/3/test_herk.c
@@ -89,7 +89,7 @@ int main( int argc, char** argv )
 	ind_t ind_mod = ind;
 
 	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
+	cntx = bli_gks_query_ind_cntx( ind_mod );
 
 	// Set k to the kc blocksize for the current datatype.
 	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
diff --git a/test/3/test_trmm.c b/test/3/test_trmm.c
index 425630a2a..4e58b95fa 100644
--- a/test/3/test_trmm.c
+++ b/test/3/test_trmm.c
@@ -92,7 +92,7 @@ int main( int argc, char** argv )
 	ind_t ind_mod = ind;
 
 	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
+	cntx = bli_gks_query_ind_cntx( ind_mod );
 
 	// Set k to the kc blocksize for the current datatype.
 	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
diff --git a/test/3/test_trsm.c b/test/3/test_trsm.c
index 678be4330..4897d4627 100644
--- a/test/3/test_trsm.c
+++ b/test/3/test_trsm.c
@@ -92,7 +92,7 @@ int main( int argc, char** argv )
 	ind_t ind_mod = ind;
 
 	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
+	cntx = bli_gks_query_ind_cntx( ind_mod );
 
 	// Set k to the kc blocksize for the current datatype.
 	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
diff --git a/test/studies/skx/test_gemm.c b/test/studies/skx/test_gemm.c
index 64311753c..53a227c2b 100644
--- a/test/studies/skx/test_gemm.c
+++ b/test/studies/skx/test_gemm.c
@@ -94,7 +94,7 @@ int main( int argc, char** argv )
 	if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
 
 	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
+	cntx = bli_gks_query_ind_cntx( ind_mod );
 
 	// Set k to the kc blocksize for the current datatype.
 	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
diff --git a/test/studies/skx/test_hemm.c b/test/studies/skx/test_hemm.c
index 4ed9b2b67..1b0b1a609 100644
--- a/test/studies/skx/test_hemm.c
+++ b/test/studies/skx/test_hemm.c
@@ -93,7 +93,7 @@ int main( int argc, char** argv )
 	if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
 
 	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
+	cntx = bli_gks_query_ind_cntx( ind_mod );
 
 	// Set k to the kc blocksize for the current datatype.
 	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
diff --git a/test/studies/skx/test_syrk.c b/test/studies/skx/test_syrk.c
index 5e1c43159..37b8c54da 100644
--- a/test/studies/skx/test_syrk.c
+++ b/test/studies/skx/test_syrk.c
@@ -92,7 +92,7 @@ int main( int argc, char** argv )
 	if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
 
 	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
+	cntx = bli_gks_query_ind_cntx( ind_mod );
 
 	// Set k to the kc blocksize for the current datatype.
 	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
diff --git a/test/studies/skx/test_trmm.c b/test/studies/skx/test_trmm.c
index 1c7db7956..235e1e224 100644
--- a/test/studies/skx/test_trmm.c
+++ b/test/studies/skx/test_trmm.c
@@ -94,7 +94,7 @@ int main( int argc, char** argv )
 	if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
 
 	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
+	cntx = bli_gks_query_ind_cntx( ind_mod );
 
 	// Set k to the kc blocksize for the current datatype.
 	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
diff --git a/test/studies/thunderx2/test_gemm.c b/test/studies/thunderx2/test_gemm.c
index f212c570b..7f1880558 100644
--- a/test/studies/thunderx2/test_gemm.c
+++ b/test/studies/thunderx2/test_gemm.c
@@ -93,7 +93,7 @@ int main( int argc, char** argv )
 	if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
 
 	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
+	cntx = bli_gks_query_ind_cntx( ind_mod );
 
 	// Set k to the kc blocksize for the current datatype.
 	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
diff --git a/test/studies/thunderx2/test_hemm.c b/test/studies/thunderx2/test_hemm.c
index 5bf0373b4..11e0bea68 100644
--- a/test/studies/thunderx2/test_hemm.c
+++ b/test/studies/thunderx2/test_hemm.c
@@ -93,7 +93,7 @@ int main( int argc, char** argv )
 	if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
 
 	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
+	cntx = bli_gks_query_ind_cntx( ind_mod );
 
 	// Set k to the kc blocksize for the current datatype.
 	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
diff --git a/test/studies/thunderx2/test_syrk.c b/test/studies/thunderx2/test_syrk.c
index 4b240e25a..5b9a9957b 100644
--- a/test/studies/thunderx2/test_syrk.c
+++ b/test/studies/thunderx2/test_syrk.c
@@ -92,7 +92,7 @@ int main( int argc, char** argv )
 	if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
 
 	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
+	cntx = bli_gks_query_ind_cntx( ind_mod );
 
 	// Set k to the kc blocksize for the current datatype.
 	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
diff --git a/test/studies/thunderx2/test_trmm.c b/test/studies/thunderx2/test_trmm.c
index 0fb153444..4851ac222 100644
--- a/test/studies/thunderx2/test_trmm.c
+++ b/test/studies/thunderx2/test_trmm.c
@@ -94,7 +94,7 @@ int main( int argc, char** argv )
 	if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
 
 	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
+	cntx = bli_gks_query_ind_cntx( ind_mod );
 
 	// Set k to the kc blocksize for the current datatype.
 	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index f267ae158..da729b3a9 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -1080,9 +1080,11 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	                        bli_ind_oper_get_avail_impl_string( BLIS_GEMM, BLIS_DCOMPLEX ) );
 	libblis_test_fprintf_c( os, "\n" );
 
-	// Query a native context.
-	cntx_c = ( cntx_t* )bli_gks_query_ind_cntx( im, BLIS_SCOMPLEX );
-	cntx_z = ( cntx_t* )bli_gks_query_ind_cntx( im, BLIS_DCOMPLEX );
+	// Query a native context. NOTE: Now that we've removed the dt argument from
+	// bli_gks_query_ind_cntx(), we can consolidate cntx_c and cntx_z; there is
+	// no need to query two contexts since they are the same.
+	cntx_c = ( cntx_t* )bli_gks_query_ind_cntx( im );
+	cntx_z = ( cntx_t* )bli_gks_query_ind_cntx( im );
 
 	libblis_test_fprintf_c( os, "level-3 blocksizes                             c       z \n" );
 	libblis_test_fprintf_c( os, "  mc                                     %7d %7d\n",

From d429b6bfced21a63bf711224ac402f93f0080b52 Mon Sep 17 00:00:00 2001
From: Isuru Fernando <isuruf@gmail.com>
Date: Tue, 28 Jun 2022 15:34:10 -0500
Subject: [PATCH 058/230] Support clang targeting MinGW (#639)

* Support clang targeting MinGW

* Fix pthread linking
---
 build/config.mk.in |  1 +
 common.mk          | 16 ++++++++++++----
 configure          |  7 ++++++-
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/build/config.mk.in b/build/config.mk.in
index 1b3468642..7ef8c6bd0 100644
--- a/build/config.mk.in
+++ b/build/config.mk.in
@@ -73,6 +73,7 @@ OS_NAME           := @os_name@
 
 # Check for whether the operating system is Windows.
 IS_WIN            := @is_win@
+IS_MSVC           := @is_msvc@
 
 # The directory path to the top level of the source distribution. When
 # building in-tree, this path is ".". When building out-of-tree, this path
diff --git a/common.mk b/common.mk
index a93f8ab24..6661f84c5 100644
--- a/common.mk
+++ b/common.mk
@@ -436,7 +436,7 @@ LIBBLIS            := libblis
 ifeq ($(OS_NAME),Darwin)
 SHLIB_EXT          := dylib
 else ifeq ($(IS_WIN),yes)
-ifeq ($(CC_VENDOR),gcc)
+ifeq ($(IS_MSVC),no)
 SHLIB_EXT          := dll.a
 else
 SHLIB_EXT          := lib
@@ -524,7 +524,7 @@ GIT_LOG    := $(GIT) log --decorate
 # manually override whatever they need.
 
 # Define the external libraries we may potentially need at link-time.
-ifeq ($(IS_WIN),yes)
+ifeq ($(IS_MSVC),yes)
 LIBM       :=
 else
 LIBM       := -lm
@@ -566,7 +566,7 @@ else
 SOFLAGS    := -shared
 ifeq ($(IS_WIN),yes)
 # Windows shared library link flags.
-ifeq ($(CC_VENDOR),clang)
+ifeq ($(IS_MSVC),yes)
 SOFLAGS    += -Wl,-implib:$(BASE_LIB_PATH)/$(LIBBLIS).lib
 else
 SOFLAGS    += -Wl,--out-implib,$(BASE_LIB_PATH)/$(LIBBLIS).dll.a
@@ -687,7 +687,7 @@ $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CWARNFLAGS,$(c))))
 # --- Position-independent code flags (shared libraries only) ---
 
 # Emit position-independent code for dynamic linking.
-ifeq ($(IS_WIN),yes)
+ifeq ($(IS_MSVC),yes)
 # Note: Don't use any fPIC flags for Windows builds since all code is position-
 # independent.
 CPICFLAGS :=
@@ -739,6 +739,14 @@ endif
 # Determine default export behavior / visibility of symbols for clang.
 ifeq ($(CC_VENDOR),clang)
 ifeq ($(IS_WIN),yes)
+ifeq ($(IS_MSVC),no)
+# This is a clang build targeting the MinGW-w64 environment.
+ifeq ($(EXPORT_SHARED),all)
+BUILD_SYMFLAGS := -Wl,--export-all-symbols, -Wl,--enable-auto-import
+else # ifeq ($(EXPORT_SHARED),all)
+BUILD_SYMFLAGS := -Wl,--exclude-all-symbols
+endif
+endif # ifeq ($(IS_MSVC),no)
 ifeq ($(EXPORT_SHARED),all)
 # NOTE: clang on Windows does not appear to support exporting all symbols
 # by default, and therefore we ignore the value of EXPORT_SHARED.
diff --git a/configure b/configure
index 7e825f1dc..5ff877317 100755
--- a/configure
+++ b/configure
@@ -1170,7 +1170,7 @@ auto_detect()
 	# Set the linker flags. We typically need pthreads (or BLIS's homerolled
 	# equiavlent) because it is needed for parts of bli_arch.c unrelated to
 	# bli_arch_string(), which is called by the main() function in ${main_c}.
-	if [[ "$is_win" == "no" || "$cc_vendor" != "clang" ]]; then
+	if [[ "$is_msvc" == "no" ]]; then
 		ldflags="${LIBPTHREAD--lpthread}"
 	fi
 
@@ -2828,6 +2828,10 @@ main()
 	if ${found_cc} -dM -E - < /dev/null 2> /dev/null | grep -q _WIN32; then
 		is_win=yes
 	fi
+	is_msvc=no
+	if ${found_cc} -dM -E - < /dev/null 2> /dev/null | grep -q _MSC_VER; then
+		is_msvc=yes
+	fi
 
 
 	# -- Check the compiler version --------------------------------------------
@@ -3830,6 +3834,7 @@ main()
 		| sed -e "s/@kconfig_map@/${kconfig_map}/g" \
 		| sed -e "s/@os_name@/${os_name_esc}/g" \
 		| sed -e "s/@is_win@/${is_win}/g" \
+		| sed -e "s/@is_msvc@/${is_msvc}/g" \
 		| sed -e "s/@dist_path@/${dist_path_esc}/g" \
 		| sed -e "s/@CC_VENDOR@/${cc_vendor}/g" \
 		| sed -e "s/@gcc_older_than_4_9_0@/${gcc_older_than_4_9_0}/g" \

From 667f201b7871da68622027d02bd6b7da3262f8e8 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 7 Jul 2022 16:44:21 -0500
Subject: [PATCH 059/230] Fixed type bug in bli_cntx_set_ukr_prefs().

Details:
- Fixed a bug in bli_cntx_set_ukr_prefs() which erroneously typecast the
  num_t value read from va_arg() down to a bool before being stored
  within the cntx_t. This bug was introduced on April 6th 2022, in
  ae10d94. This caused the ukernel preferences for double real and
  double complex to go unchanged while the preferences for single real
  and single complex were corrupted by the former datatypes'
  preference values. The bug manifested as degraded performance for
  subconfigurations that registered column-preferential ukernels. The
  reason is that the erroneous preferences trigger unnecessary
  transpositions in the operation, which forces the gemm ukernel to
  compute on matrices that are not stored according to its preference.
  Thanks to Devin Matthews, Jeff Diamond, and Leick Robinson for their
  extensive efforts and assistance in tracking down this issue.
- Augmented the informational header that is output by the testsuite to
  include ukernel preferences for gemm, gemmtrsm_[lu], and trsm_[lu].
- CREDITS file update.
---
 CREDITS                      |  1 +
 frame/base/bli_cntx.c        |  2 +-
 testsuite/src/test_libblis.c | 29 +++++++++++++++++++++++++++++
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/CREDITS b/CREDITS
index b701598cf..43c7b3ed5 100644
--- a/CREDITS
+++ b/CREDITS
@@ -84,6 +84,7 @@ but many others have contributed code and feedback, including
   Michael Rader            @mrader1248
   Pradeep Rao              @pradeeptrgit       (AMD)
   Aleksei Rechinskii
+  Leick Robinson           @LeickR             (Oracle)
   Karl Rupp                @karlrupp
   Martin Schatz                                (The University of Texas at Austin)
   Nico Schlömer            @nschloe
diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c
index 70057060f..8c6cafc13 100644
--- a/frame/base/bli_cntx.c
+++ b/frame/base/bli_cntx.c
@@ -316,7 +316,7 @@ void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... )
 		// - the datatype of the kernel, and
 		// - the kernel function pointer
 		const ukr_pref_t ukr_pref_id = ( ukr_pref_t )ukr_pref_id0;
-		const bool       ukr_pref_dt = ( num_t      )va_arg( args, num_t   );
+		const num_t      ukr_pref_dt = ( num_t      )va_arg( args, num_t );
 		const bool       ukr_pref    = ( bool       )va_arg( args, int );
 
 		// Index into the func_t and mbool_t for the current kernel id
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index da729b3a9..eaa0a9cef 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -962,6 +962,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	                        bli_info_get_trsm_impl_string( BLIS_SCOMPLEX ),
 	                        bli_info_get_trsm_impl_string( BLIS_DCOMPLEX ) );
 	libblis_test_fprintf_c( os, "\n" );
+	libblis_test_fprintf_c( os, "\n" );
 
 	//bli_ind_disable_all();
 
@@ -1062,6 +1063,34 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	                        bli_info_get_trsm_u_ukr_impl_string( BLIS_NAT, BLIS_DCOMPLEX ) );
 	libblis_test_fprintf_c( os, "\n" );
 	libblis_test_fprintf_c( os, "\n" );
+	libblis_test_fprintf_c( os, "micro-kernel prefers rows?     s       d       c       z\n" );
+	libblis_test_fprintf_c( os, "  gemm                   %7d %7d %7d %7d\n",
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_FLOAT,    BLIS_GEMM_UKR, cntx ),
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DOUBLE,   BLIS_GEMM_UKR, cntx ),
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_SCOMPLEX, BLIS_GEMM_UKR, cntx ),
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DCOMPLEX, BLIS_GEMM_UKR, cntx ) );
+	libblis_test_fprintf_c( os, "  gemmtrsm_l             %7d %7d %7d %7d\n",
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_FLOAT,    BLIS_GEMMTRSM_L_UKR, cntx ),
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DOUBLE,   BLIS_GEMMTRSM_L_UKR, cntx ),
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_SCOMPLEX, BLIS_GEMMTRSM_L_UKR, cntx ),
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DCOMPLEX, BLIS_GEMMTRSM_L_UKR, cntx ) );
+	libblis_test_fprintf_c( os, "  gemmtrsm_u             %7d %7d %7d %7d\n",
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_FLOAT,    BLIS_GEMMTRSM_U_UKR, cntx ),
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DOUBLE,   BLIS_GEMMTRSM_U_UKR, cntx ),
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_SCOMPLEX, BLIS_GEMMTRSM_U_UKR, cntx ),
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DCOMPLEX, BLIS_GEMMTRSM_U_UKR, cntx ) );
+	libblis_test_fprintf_c( os, "  trsm_l                 %7d %7d %7d %7d\n",
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_FLOAT,    BLIS_TRSM_L_UKR, cntx ),
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DOUBLE,   BLIS_TRSM_L_UKR, cntx ),
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_SCOMPLEX, BLIS_TRSM_L_UKR, cntx ),
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DCOMPLEX, BLIS_TRSM_L_UKR, cntx ) );
+	libblis_test_fprintf_c( os, "  trsm_u                 %7d %7d %7d %7d\n",
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_FLOAT,    BLIS_TRSM_U_UKR, cntx ),
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DOUBLE,   BLIS_TRSM_U_UKR, cntx ),
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_SCOMPLEX, BLIS_TRSM_U_UKR, cntx ),
+	                        ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DCOMPLEX, BLIS_TRSM_U_UKR, cntx ) );
+	libblis_test_fprintf_c( os, "\n" );
+	libblis_test_fprintf_c( os, "\n" );
 
 	libblis_test_fprintf_c( os, "--- BLIS induced implementation info ---\n" );
 	libblis_test_fprintf_c( os, "\n" );

From 7cba7ce3dd1533fcc4ca96ac902bdf218686139a Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 8 Jul 2022 11:15:18 -0500
Subject: [PATCH 060/230] Minor cleanups, comment updates to bli_gks.c.

Details:
- Removed a redundant registration of 'a64fx' subconfig in
  bli_gks_init().
- Reordered registration of 'armsve', 'a64fx', and 'firestorm'
  subconfigs. Thanks to Jeff Diamond for his input on this reordering.
- Comment updates to bli_gks.c and arch_t enum in bli_type_defs.h.
---
 frame/base/bli_gks.c          | 49 ++++++++++++++++++++---------------
 frame/include/bli_type_defs.h |  6 ++++-
 2 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index ff80f85ed..094810d9d 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -64,7 +64,8 @@ void bli_gks_init( void )
 		// Register a context for each architecture that was #define'd in
 		// bli_config.h.
 
-		// Intel architectures
+		// -- Intel architectures ----------------------------------------------
+
 #ifdef BLIS_CONFIG_SKX
 		bli_gks_register_cntx( BLIS_ARCH_SKX,         bli_cntx_init_skx,
 		                                              bli_cntx_init_skx_ref,
@@ -96,7 +97,8 @@ void bli_gks_init( void )
 		                                              bli_cntx_init_penryn_ind );
 #endif
 
-		// AMD architectures
+		// -- AMD architectures ------------------------------------------------
+
 #ifdef BLIS_CONFIG_ZEN3
 		bli_gks_register_cntx( BLIS_ARCH_ZEN3,        bli_cntx_init_zen3,
 		                                              bli_cntx_init_zen3_ref,
@@ -133,12 +135,28 @@ void bli_gks_init( void )
 		                                              bli_cntx_init_bulldozer_ind );
 #endif
 
-		// ARM architectures
+		// -- ARM architectures ------------------------------------------------
+
+		// -- ARM-SVE --
+#ifdef BLIS_CONFIG_ARMSVE
+		bli_gks_register_cntx( BLIS_ARCH_ARMSVE,      bli_cntx_init_armsve,
+		                                              bli_cntx_init_armsve_ref,
+		                                              bli_cntx_init_armsve_ind );
+#endif
 #ifdef BLIS_CONFIG_A64FX
-		bli_gks_register_cntx( BLIS_ARCH_A64FX,   bli_cntx_init_a64fx,
+		bli_gks_register_cntx( BLIS_ARCH_A64FX,       bli_cntx_init_a64fx,
 		                                              bli_cntx_init_a64fx_ref,
 		                                              bli_cntx_init_a64fx_ind );
 #endif
+
+		// -- ARM-NEON (4 pipes x 128-bit vectors) --
+#ifdef BLIS_CONFIG_FIRESTORM
+		bli_gks_register_cntx( BLIS_ARCH_FIRESTORM,   bli_cntx_init_firestorm,
+		                                              bli_cntx_init_firestorm_ref,
+		                                              bli_cntx_init_firestorm_ind );
+#endif
+
+		// -- ARM (2 pipes x 128-bit vectors) --
 #ifdef BLIS_CONFIG_THUNDERX2
 		bli_gks_register_cntx( BLIS_ARCH_THUNDERX2,   bli_cntx_init_thunderx2,
 		                                              bli_cntx_init_thunderx2_ref,
@@ -154,21 +172,8 @@ void bli_gks_init( void )
 		                                              bli_cntx_init_cortexa53_ref,
 		                                              bli_cntx_init_cortexa53_ind );
 #endif
-#ifdef BLIS_CONFIG_ARMSVE
-		bli_gks_register_cntx( BLIS_ARCH_ARMSVE,      bli_cntx_init_armsve,
-		                                              bli_cntx_init_armsve_ref,
-		                                              bli_cntx_init_armsve_ind );
-#endif
-#ifdef BLIS_CONFIG_A64FX
-		bli_gks_register_cntx( BLIS_ARCH_A64FX,       bli_cntx_init_a64fx,
-		                                              bli_cntx_init_a64fx_ref,
-		                                              bli_cntx_init_a64fx_ind );
-#endif
-#ifdef BLIS_CONFIG_FIRESTORM
-		bli_gks_register_cntx( BLIS_ARCH_FIRESTORM,   bli_cntx_init_firestorm,
-		                                              bli_cntx_init_firestorm_ref,
-		                                              bli_cntx_init_firestorm_ind );
-#endif
+
+		// -- ARM (older 32-bit microarchitectures) --
 #ifdef BLIS_CONFIG_CORTEXA15
 		bli_gks_register_cntx( BLIS_ARCH_CORTEXA15,   bli_cntx_init_cortexa15,
 		                                              bli_cntx_init_cortexa15_ref,
@@ -180,7 +185,8 @@ void bli_gks_init( void )
 		                                              bli_cntx_init_cortexa9_ind );
 #endif
 
-		// IBM architectures
+		// -- IBM architectures ------------------------------------------------
+
 #ifdef BLIS_CONFIG_POWER10
 		bli_gks_register_cntx( BLIS_ARCH_POWER10,     bli_cntx_init_power10,
 		                                              bli_cntx_init_power10_ref,
@@ -202,7 +208,8 @@ void bli_gks_init( void )
 		                                              bli_cntx_init_bgq_ind );
 #endif
 
-		// Generic architectures
+		// -- Generic architectures --------------------------------------------
+
 #ifdef BLIS_CONFIG_GENERIC
 		bli_gks_register_cntx( BLIS_ARCH_GENERIC,     bli_cntx_init_generic,
 		                                              bli_cntx_init_generic_ref,
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index e957fc6b2..08c7ddc4a 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -928,10 +928,14 @@ typedef enum
 	BLIS_ARCH_PILEDRIVER,
 	BLIS_ARCH_BULLDOZER,
 
-	// ARM
+	// ARM-SVE
 	BLIS_ARCH_ARMSVE,
 	BLIS_ARCH_A64FX,
+
+	// ARM-NEON (4 pipes x 128-bit vectors)
 	BLIS_ARCH_FIRESTORM,
+
+	// ARM (2 pipes x 128-bit vectors)
 	BLIS_ARCH_THUNDERX2,
 	BLIS_ARCH_CORTEXA57,
 	BLIS_ARCH_CORTEXA53,

From ffde54cc5c334aca8eff4d6072ba49496bf3104c Mon Sep 17 00:00:00 2001
From: jdiamondGitHub <jeff_diamond@fastmail.com>
Date: Mon, 11 Jul 2022 16:47:30 -0500
Subject: [PATCH 061/230] Minor changes to .gitignore and LICENSE files. (#642)

Details:
- Macs create .DS_Store files in every directory visited. Updated
  .gitignore file so these files won't be reported as untracked by
  'git status'.
- Added Oracle Corporation to the LICENSE file.
- Updated UT copyright on behalf of SHPC.
---
 .gitignore | 3 +++
 LICENSE    | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index a24fe2b0e..6d51f6f51 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,3 +53,6 @@ out.*
 GPATH
 GRTAGS
 GTAGS
+
+# Mac .DS_Store files
+.DS_Store
diff --git a/LICENSE b/LICENSE
index b9cde54b8..8168814a9 100644
--- a/LICENSE
+++ b/LICENSE
@@ -6,6 +6,7 @@ while other portions are copyrighted by
 
   Hewlett Packard Enterprise Development LP
   Advanced Micro Devices, Inc.
+  Oracle Corporation
 
 with some overlap. Please see file-level license headers for file-specific
 copyright info. All parties provide their portions of the code under the
@@ -13,9 +14,10 @@ copyright info. All parties provide their portions of the code under the
 
 ---
 
-Copyright (C) 2018, The University of Texas at Austin
+Copyright (C) 2012 - 2022, The University of Texas at Austin
 Copyright (C) 2016, Hewlett Packard Enterprise Development LP
 Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+Copyright (C) 2022, Oracle Corporation
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are

From 98d467891b74021ace7f248cb0856bec734e39b6 Mon Sep 17 00:00:00 2001
From: bartoldeman <bartoldeman@users.noreply.github.com>
Date: Mon, 11 Jul 2022 19:40:53 -0400
Subject: [PATCH 062/230] Change complex_return='intel' for ifx. (#637)

Details:
- When checking the version string of the Fortran compiler for the
  purposes of determining a default return convention for complex
  domain values, grep for "IFORT" instead of "ifort" since that string
  is common to both the 'ifx' and 'ifort' binaries provided by Intel:

    $ ifx --version
    ifx (IFORT) 2022.1.0 20220316
    Copyright (C) 1985-2022 Intel Corporation. All rights reserved.

    $ ifort --version
    ifort (IFORT) 2021.6.0 20220226
    Copyright (C) 1985-2022 Intel Corporation. All rights reserved.
---
 configure | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index 5ff877317..a6018edab 100755
--- a/configure
+++ b/configure
@@ -3688,9 +3688,9 @@ main()
 			# Query the compiler "vendor" (ie: the compiler's simple name).
 			# The last part ({ read first rest ; echo $first ; }) is a workaround
 			# to OS X's egrep only returning the first match.
-			fc_vendor=$(echo "${vendor_string}" | egrep -o 'ifort|GNU' | { read first rest ; echo $first ; })
+			fc_vendor=$(echo "${vendor_string}" | egrep -o 'IFORT|GNU' | { read first rest ; echo $first ; })
 
-			if [ "x${fc_vendor}" = "xifort" ]; then
+			if [ "x${fc_vendor}" = "xIFORT" ]; then
 				complex_return='intel'
 			elif [ "x${fc_vendor}" = "xGNU" ]; then
 				complex_return='gnu'

From 9b1beec60be31c6ea20b85806d61551497b699e4 Mon Sep 17 00:00:00 2001
From: bartoldeman <bartoldeman@users.noreply.github.com>
Date: Mon, 11 Jul 2022 20:15:12 -0400
Subject: [PATCH 063/230] Use BLIS_ENABLE_COMPLEX_RETURN_INTEL in blastest
 files (#636)

Details:
- Fixed a crash that occurs when either cblat1 or zblat1 are linked
  with a build of BLIS that was compiled with '--complex-return=intel'.
  This fix involved inserting preprocessor macro guards based on
  BLIS_ENABLE_COMPLEX_RETURN_INTEL into blastest/src/cblat1.c and
  blastest/src/zblat1.c to correctly handle situations where BLIS is
  compiled with Intel/f2c-style calling conventions for complex numbers.
- Updated blastest/src/fortran/run-f2c.sh so that future executions
  will insert the aforementioned cpp macro conditional where
  appropriate.
---
 blastest/src/cblat1.c           | 32 ++++++++++++++++++++++++++++----
 blastest/src/fortran/run-f2c.sh | 20 +++++++++++---------
 blastest/src/zblat1.c           | 32 ++++++++++++++++++++++++++++----
 3 files changed, 67 insertions(+), 17 deletions(-)

diff --git a/blastest/src/cblat1.c b/blastest/src/cblat1.c
index daccb2f6c..606511662 100644
--- a/blastest/src/cblat1.c
+++ b/blastest/src/cblat1.c
@@ -475,11 +475,23 @@ static real c_b52 = 0.f;
     integer mx, my;
     complex cdot[1];
     integer lenx, leny;
-    extern /* Complex */ complex cdotc_(integer *, complex *, integer 
+    extern /* Complex */
+#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL
+ void cdotc_(complex *,
+#else
+complex cdotc_(
+#endif
+ integer *, complex *, integer 
 	    *, complex *, integer *);
     extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, 
 	    complex *, integer *);
-    extern /* Complex */ complex cdotu_(integer *, complex *, integer 
+    extern /* Complex */
+#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL
+ void cdotu_(complex *,
+#else
+complex cdotu_(
+#endif
+ integer *, complex *, integer 
 	    *, complex *, integer *);
     extern /* Subroutine */ int cswap_(integer *, complex *, integer *, 
 	    complex *, integer *), ctest_(integer *, complex *, complex *, 
@@ -526,14 +538,26 @@ static real c_b52 = 0.f;
 	    }
 	    if (combla_1.icase == 1) {
 /*              .. CDOTC .. */
-		q__1 = cdotc_(&combla_1.n, cx, &combla_1.incx, cy, &
+
+#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL
+		cdotc_(&q__1,
+#else
+		q__1 = cdotc_(
+#endif
+		 &combla_1.n, cx, &combla_1.incx, cy, &
 			combla_1.incy);
 		cdot[0].r = q__1.r, cdot[0].i = q__1.i;
 		ctest_(&c__1, cdot, &ct6[kn + (ki << 2) - 5], &csize1[kn - 1],
 			 sfac);
 	    } else if (combla_1.icase == 2) {
 /*              .. CDOTU .. */
-		q__1 = cdotu_(&combla_1.n, cx, &combla_1.incx, cy, &
+
+#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL
+		cdotu_(&q__1,
+#else
+		q__1 = cdotu_(
+#endif
+		 &combla_1.n, cx, &combla_1.incx, cy, &
 			combla_1.incy);
 		cdot[0].r = q__1.r, cdot[0].i = q__1.i;
 		ctest_(&c__1, cdot, &ct7[kn + (ki << 2) - 5], &csize1[kn - 1],
diff --git a/blastest/src/fortran/run-f2c.sh b/blastest/src/fortran/run-f2c.sh
index fdad4fd34..f0df2f5b8 100755
--- a/blastest/src/fortran/run-f2c.sh
+++ b/blastest/src/fortran/run-f2c.sh
@@ -50,13 +50,15 @@ recursive-sed.sh -c "s/-4.f };/-4.f }};/g" -p "s*1.c"
 
 # Convert from brain-dead f2c complex calling conventions to normal
 # return-based conventions.
-recursive-sed.sh -c "s/void cdotc_(complex \*, /complex cdotc_(/g" -p "c*1.c"
-recursive-sed.sh -c "s/void cdotu_(complex \*, /complex cdotu_(/g" -p "c*1.c"
-recursive-sed.sh -c "s/cdotc_(&q__1, /q__1 = cdotc_(/g" -p "c*1.c"
-recursive-sed.sh -c "s/cdotu_(&q__1, /q__1 = cdotu_(/g" -p "c*1.c"
-
-recursive-sed.sh -c "s/void zdotc_(doublecomplex \*, /doublecomplex zdotc_(/g" -p "z*1.c"
-recursive-sed.sh -c "s/void zdotu_(doublecomplex \*, /doublecomplex zdotu_(/g" -p "z*1.c"
-recursive-sed.sh -c "s/zdotc_(\&z__1, /z__1 = zdotc_(/g" -p "z*1.c"
-recursive-sed.sh -c "s/zdotu_(\&z__1, /z__1 = zdotu_(/g" -p "z*1.c"
+subst1='\n#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL\n&\n#else\n'
+subst2='\n#endif\n'
+recursive-sed.sh -c "s/ void cdotc_(complex \*,/${subst1}complex cdotc_(${subst2}/g" -p "c*1.c"
+recursive-sed.sh -c "s/ void cdotu_(complex \*,/${subst1}complex cdotu_(${subst2}/g" -p "c*1.c"
+recursive-sed.sh -c "s/\(.*\)cdotc_(&q__1,/${subst1}\1q__1 = cdotc_(${subst2}\1/g" -p "c*1.c"
+recursive-sed.sh -c "s/\(.*\)cdotu_(&q__1,/${subst1}\1q__1 = cdotu_(${subst2}\1/g" -p "c*1.c"
+
+recursive-sed.sh -c "s/ void zdotc_(doublecomplex \*,/${subst1}doublecomplex zdotc_(${subst2}/g" -p "z*1.c"
+recursive-sed.sh -c "s/ void zdotu_(doublecomplex \*,/${subst1}doublecomplex zdotu_(${subst2}/g" -p "z*1.c"
+recursive-sed.sh -c "s/\(.*\)zdotc_(\&z__1,/${subst1}\1z__1 = zdotc_(${subst2}\1/g" -p "z*1.c"
+recursive-sed.sh -c "s/\(.*\)zdotu_(\&z__1,/${subst1}\1z__1 = zdotu_(${subst2}\1/g" -p "z*1.c"
 
diff --git a/blastest/src/zblat1.c b/blastest/src/zblat1.c
index c34a57262..b620910be 100644
--- a/blastest/src/zblat1.c
+++ b/blastest/src/zblat1.c
@@ -459,12 +459,24 @@ static doublereal c_b52 = 0.;
     integer lenx, leny;
     extern /* Subroutine */ int ctest_(integer *, doublecomplex *, 
 	    doublecomplex *, doublecomplex *, doublereal *);
-    extern /* Double Complex */ doublecomplex zdotc_(integer *, 
+    extern /* Double Complex */
+#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL
+ void zdotc_(doublecomplex *,
+#else
+doublecomplex zdotc_(
+#endif
+ integer *, 
 	    doublecomplex *, integer *, doublecomplex *, integer *);
     integer ksize;
     extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, 
 	    doublecomplex *, integer *);
-    extern /* Double Complex */ doublecomplex zdotu_(integer *, 
+    extern /* Double Complex */
+#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL
+ void zdotu_(doublecomplex *,
+#else
+doublecomplex zdotu_(
+#endif
+ integer *, 
 	    doublecomplex *, integer *, doublecomplex *, integer *);
     extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, 
 	    doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, 
@@ -508,14 +520,26 @@ static doublereal c_b52 = 0.;
 	    }
 	    if (combla_1.icase == 1) {
 /*              .. ZDOTC .. */
-		z__1 = zdotc_(&combla_1.n, cx, &combla_1.incx, cy, &
+
+#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL
+		zdotc_(&z__1,
+#else
+		z__1 = zdotc_(
+#endif
+		 &combla_1.n, cx, &combla_1.incx, cy, &
 			combla_1.incy);
 		cdot[0].r = z__1.r, cdot[0].i = z__1.i;
 		ctest_(&c__1, cdot, &ct6[kn + (ki << 2) - 5], &csize1[kn - 1],
 			 sfac);
 	    } else if (combla_1.icase == 2) {
 /*              .. ZDOTU .. */
-		z__1 = zdotu_(&combla_1.n, cx, &combla_1.incx, cy, &
+
+#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL
+		zdotu_(&z__1,
+#else
+		z__1 = zdotu_(
+#endif
+		 &combla_1.n, cx, &combla_1.incx, cy, &
 			combla_1.incy);
 		cdot[0].r = z__1.r, cdot[0].i = z__1.i;
 		ctest_(&c__1, cdot, &ct7[kn + (ki << 2) - 5], &csize1[kn - 1],

From cc260fd7068f0fe449d818435aa11adb14c17fed Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 13 Jul 2022 16:16:01 -0500
Subject: [PATCH 064/230] Allow uniform max problem sizes in test/3/runme.sh.

Details:
- Tweaked test/3/runme.sh so that the test driver binaries for single-
  threaded (st), single-socket (1s), and dual-socket (2s) execution can
  be built using identical problem size ranges. Previously, this was not
  possible because runme.sh used the maximum problem size, which was
  embedded into the binary filename, to tell the three classes of
  binaries apart from one another. Now, runme.sh uses the binary suffix
  ("st", "1s", or "2s") to tell them apart. This required only a few
  changes to the logic, but it also required a change in format to the
  threading config strings themselves (replacing the max problem size
  with "st", "1s", or "2s"). Thanks to Jeff Diamond for inspiring this
  improvement.
- Comment updates.
---
 test/3/runme.sh | 90 +++++++++++++++++++++++++++++--------------------
 1 file changed, 54 insertions(+), 36 deletions(-)

diff --git a/test/3/runme.sh b/test/3/runme.sh
index 56c192809..cf84bd121 100755
--- a/test/3/runme.sh
+++ b/test/3/runme.sh
@@ -5,12 +5,12 @@ exec_root="test"
 out_root="output"
 delay=0.1
 
-#sys="blis"
+sys="blis"
 #sys="stampede2"
 #sys="lonestar5"
 #sys="ul252"
 #sys="ul264"
-sys="ul2128"
+#sys="ul2128"
 
 # Bind threads to processors.
 #export OMP_PROC_BIND=true
@@ -22,9 +22,9 @@ if [ ${sys} = "blis" ]; then
 	export GOMP_CPU_AFFINITY="0-3"
 
 	numactl=""
-	threads="jc1ic1jr1_2400
-	         jc2ic3jr2_6000
-	         jc4ic3jr2_8000"
+	threads="jc1ic1jr1_st
+	         jc2ic1jr1_1s
+	         jc2ic2jr1_2s"
 
 elif [ ${sys} = "stampede2" ]; then
 
@@ -32,9 +32,9 @@ elif [ ${sys} = "stampede2" ]; then
 	exit 1
 
 	numactl=""
-	threads="jc1ic1jr1_2400
-	         jc4ic6jr1_6000
-	         jc4ic12jr1_8000"
+	threads="jc1ic1jr1_st
+	         jc4ic6jr1_1s
+	         jc4ic12jr1_2s"
 
 elif [ ${sys} = "lonestar5" ]; then
 
@@ -44,9 +44,9 @@ elif [ ${sys} = "lonestar5" ]; then
 	#export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64"
 
 	numactl=""
-	threads="jc1ic1jr1_2400
-	         jc2ic3jr2_6000
-	         jc4ic3jr2_8000"
+	threads="jc1ic1jr1_st
+	         jc2ic3jr2_1s
+	         jc4ic3jr2_2s"
 
 elif [ ${sys} = "ul252" ]; then
 
@@ -54,9 +54,9 @@ elif [ ${sys} = "ul252" ]; then
 	export GOMP_CPU_AFFINITY="0-51"
 
 	numactl=""
-	threads="jc1ic1jr1_2400
-	         jc2ic13jr1_6000
-	         jc4ic13jr1_8000"
+	threads="jc1ic1jr1_st
+	         jc2ic13jr1_1s
+	         jc4ic13jr1_2s"
 
 elif [ ${sys} = "ul264" ]; then
 
@@ -64,9 +64,9 @@ elif [ ${sys} = "ul264" ]; then
 	export GOMP_CPU_AFFINITY="0-63"
 
 	numactl="numactl --interleave=all"
-	threads="jc1ic1jr1_2400
-	         jc1ic8jr4_6000
-	         jc2ic8jr4_8000"
+	threads="jc1ic1jr1_st
+	         jc1ic8jr4_1s
+	         jc2ic8jr4_2s"
 
 elif [ ${sys} = "ul2128" ]; then
 
@@ -74,14 +74,14 @@ elif [ ${sys} = "ul2128" ]; then
 	export GOMP_CPU_AFFINITY="0-127"
 
 	numactl="numactl --interleave=all"
-	threads="jc1ic1jr1_2400
-	         jc4ic4jr4_6000
-	         jc8ic4jr4_8000"
-	#threads="jc4ic4jr4_6000
-	#         jc8ic4jr4_8000"
-	#threads="jc1ic1jr1_2400"
-	#threads="jc4ic4jr4_6000"
-	#threads="jc8ic4jr4_8000"
+	threads="jc1ic1jr1_st
+	         jc4ic4jr4_1s
+	         jc8ic4jr4_2s"
+	#threads="jc4ic4jr4_1s
+	#         jc8ic4jr4_2s"
+	#threads="jc1ic1jr1_st"
+	#threads="jc4ic4jr4_1s"
+	#threads="jc8ic4jr4_2s"
 fi
 
 # Datatypes to test.
@@ -93,12 +93,12 @@ test_ops="gemm hemm herk trmm trsm"
 #test_ops="herk"
 
 # Implementations to test.
-#impls="blis"
+impls="blis"
 #impls="openblas"
 #impls="vendor"
 #impls="other"
 #impls="eigen"
-impls="all"
+#impls="all"
 
 if [ "${impls}" = "blis" ]; then
 
@@ -129,7 +129,7 @@ fi
 GOMP_CPU_AFFINITYsave=${GOMP_CPU_AFFINITY}
 
 
-# First perform real test cases.
+# Iterate over the threading configs.
 for th in ${threads}; do
 
 	# Start with one way of parallelism in each loop. We will now begin
@@ -139,7 +139,8 @@ for th in ${threads}; do
 
 	# Strip everything before and after the underscore so that what remains
 	# is the problem size and threading parameter string, respectively.
-	psize=${th##*_}; thinfo=${th%%_*}
+	#psize=${th##*_}; thinfo=${th%%_*}
+	tsuf=${th##*_}; thinfo=${th%%_*}
 
 	# Identify each threading parameter and insert a space before it.
 	thsep=$(echo -e ${thinfo} | sed -e "s/\([jip][cr]\)/ \1/g" )
@@ -166,13 +167,32 @@ for th in ${threads}; do
 
 	done
 
-	echo "Switching to: jc${jc_nt} pc${pc_nt} ic${ic_nt} jr${jr_nt} ir${ir_nt} (nt = ${nt}) p_max${psize}"
+	# Find a binary using the test driver prefix and the threading suffix.
+	# Then strip everything before and after the max problem size that's
+	# encoded into the name of the binary.
+	binname=$(ls -1 ${exec_root}_*_${tsuf}.x | head -n1)
+	temp1=${binname#${exec_root}_*_}
+	psize=${temp1%%_*}
+
+	# Sanity check: If 'ls' couldn't find any binaries, then the user
+	# probably didn't build them. Inform the user and proceed to the next
+	# threading config.
+	if [ "${binname}" = "" ]; then
 
+		echo "Could not find binaries corresponding to '${tsuf}' threading config. Skipping."
+		continue
+	fi
+
+	# Let the user know what threading config we are working on.
+	echo "Switching to: jc${jc_nt} pc${pc_nt} ic${ic_nt} jr${jr_nt} ir${ir_nt} (nt = ${nt}) p_max${psize}"
 
+	# Iterate over the datatypes.
 	for dt in ${test_dts}; do
 
+		# Iterate over the implementations.
 		for im in ${test_impls}; do
 
+			# Iterate over the operations.
 			for op in ${test_ops}; do
 
 				# Eigen does not support multithreading for hemm, herk, trmm,
@@ -185,14 +205,12 @@ for th in ${threads}; do
 				fi
 
 				# Find the threading suffix by probing the executable.
-				binname=$(ls ${exec_root}_${dt}${op}_${psize}_${im}_*.x)
-				suf_ext=${binname##*_}
-				suf=${suf_ext%%.*}
+				binname=$(ls ${exec_root}_${dt}${op}_*_${im}_${tsuf}.x)
 
 				#echo "found file: ${binname} with suffix ${suf}"
 
 				# Set the number of threads according to th.
-				if [ "${suf}" = "1s" ] || [ "${suf}" = "2s" ]; then
+				if [ "${tsuf}" = "1s" ] || [ "${tsuf}" = "2s" ]; then
 
 					# Set the threading parameters based on the implementation
 					# that we are preparing to run.
@@ -237,10 +255,10 @@ for th in ${threads}; do
 				fi
 
 				# Construct the name of the test executable.
-				exec_name="${exec_root}_${dt}${op}_${psize}_${im}_${suf}.x"
+				exec_name="${exec_root}_${dt}${op}_${psize}_${im}_${tsuf}.x"
 
 				# Construct the name of the output file.
-				out_file="${out_root}_${suf}_${dt}${op}_${im}.m"
+				out_file="${out_root}_${tsuf}_${dt}${op}_${im}.m"
 
 				#echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}"
 				echo "Running ${numactl} ./${exec_name} > ${out_file}"

From 17b0caa2b2bff439feb6d2b39cfa16e7591882b0 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 14 Jul 2022 17:55:34 -0500
Subject: [PATCH 065/230] Fixed out-of-bounds read in haswell gemmsup kernels.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Details:
- Fixed memory access bugs in the bli_sgemmsup_rv_haswell_asm_Mx2()
  kernels, where M = {1,2,3,4,5,6}. The bugs were caused by loading four
  single-precision elements of C, via instructions such as:

	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)

  in situations where only two elements are guaranteed to exist. (These
  bugs may not have manifested in earlier tests due to the leading
  dimension alignment that BLIS employs by default.) The issue was fixed
  by replacing lines like the one above with:

	vmovsd(mem(rcx), xmm0)
	vfmadd231ps(xmm0, xmm3, xmm4)

  Thus, we use vmovsd to explicitly load only two elements of C into
  registers, and then operate on those values using register addressing.
  Thanks to Daniël de Kok for reporting these bugs in #635, and to
  Bhaskar Nallani for proposing the fix.
- CREDITS file update.
---
 CREDITS                                       |  1 +
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c   | 63 ++++++++++++-------
 2 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/CREDITS b/CREDITS
index 43c7b3ed5..bb2b3798f 100644
--- a/CREDITS
+++ b/CREDITS
@@ -23,6 +23,7 @@ but many others have contributed code and feedback, including
   Dilyn Corner             @dilyn-corner
   Mat Cross                @matcross           (NAG)
                            @decandia50
+  Daniël de Kok            @danieldk           (Explosion)
   Kay Dewhurst             @jkd2016            (Max Planck Institute, Halle, Germany)
   Jeff Diamond                                 (Oracle)
   Johannes Dieterich       @iotamudelta
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c
index 53a70d15f..efb336395 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c
@@ -389,32 +389,38 @@ void bli_sgemmsup_rv_haswell_asm_6x2
 	label(.SROWSTORED)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm6)
 	vmovsd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm8)
 	vmovsd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm10)
 	vmovsd(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm12)
 	vmovsd(xmm12, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm14)
 	vmovsd(xmm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
 
@@ -848,27 +854,32 @@ void bli_sgemmsup_rv_haswell_asm_5x2
 	label(.SROWSTORED)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm6)
 	vmovsd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm8)
 	vmovsd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm10)
 	vmovsd(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm12)
 	vmovsd(xmm12, mem(rcx, 0*32))
 	//add(rdi, rcx)
 
@@ -1288,22 +1299,26 @@ void bli_sgemmsup_rv_haswell_asm_4x2
 	label(.SROWSTORED)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm6)
 	vmovsd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm8)
 	vmovsd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm10)
 	vmovsd(xmm10, mem(rcx, 0*32))
 	//add(rdi, rcx)
 
@@ -1683,17 +1698,20 @@ void bli_sgemmsup_rv_haswell_asm_3x2
 	label(.SROWSTORED)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm6)
 	vmovsd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm8)
 	vmovsd(xmm8, mem(rcx, 0*32))
 	//add(rdi, rcx)
 
@@ -2066,12 +2084,14 @@ void bli_sgemmsup_rv_haswell_asm_2x2
 	label(.SROWSTORED)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm6)
 	vmovsd(xmm6, mem(rcx, 0*32))
 	//add(rdi, rcx)
 
@@ -2404,7 +2424,8 @@ void bli_sgemmsup_rv_haswell_asm_1x2
 	label(.SROWSTORED)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
+	vmovsd(mem(rcx), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx, 0*32))
 	//add(rdi, rcx)
 

From af3a41e02534befdae026377592ce437bab83023 Mon Sep 17 00:00:00 2001
From: Alexander Grund <Flamefire@users.noreply.github.com>
Date: Thu, 21 Jul 2022 18:05:48 +0200
Subject: [PATCH 066/230] Add autodetection for POWER7, POWER9 & POWER10 (#647)

Read from `/proc/cpuinfo` as done for ARM.
Fixes #501
---
 frame/base/bli_cpuid.c | 27 +++++++++++++++++++++++----
 frame/base/bli_cpuid.h |  6 +++++-
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c
index ff0f386e6..527db1f5d 100644
--- a/frame/base/bli_cpuid.c
+++ b/frame/base/bli_cpuid.c
@@ -485,7 +485,7 @@ bool bli_cpuid_is_bulldozer
 	return TRUE;
 }
 
-#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)
+#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_ARCH_PPC)
 
 arch_t bli_cpuid_query_id( void )
 {
@@ -530,9 +530,14 @@ arch_t bli_cpuid_query_id( void )
 			return BLIS_ARCH_GENERIC;
 		}
 	}
-	else if ( vendor == VENDOR_UNKNOWN )
+	else if ( vendor == VENDOR_IBM )
 	{
-		return BLIS_ARCH_GENERIC;
+		if ( model == MODEL_POWER7)
+			return BLIS_ARCH_POWER7;
+		else if ( model == MODEL_POWER9)
+			return BLIS_ARCH_POWER9;
+		else if ( model == MODEL_POWER10)
+			return BLIS_ARCH_POWER10;
 	}
 
 	return BLIS_ARCH_GENERIC;
@@ -1203,7 +1208,7 @@ uint32_t bli_cpuid_query
 	return VENDOR_ARM;
 }
 
-#elif defined(__arm__) || defined(_M_ARM)
+#elif defined(__arm__) || defined(_M_ARM) || defined(_ARCH_PPC)
 
 /* 
    I can't easily find documentation to do this as for aarch64, though
@@ -1240,6 +1245,20 @@ uint32_t bli_cpuid_query
 	char  feat_str[ TEMP_BUFFER_SIZE ];
 	char* r_val;
 
+#ifdef _ARCH_PPC
+	r_val = find_string_in( "cpu", proc_str, TEMP_BUFFER_SIZE, pci_str );
+	if ( r_val == NULL ) return VENDOR_IBM;
+
+	if ( strstr( proc_str, "POWER7" ) != NULL )
+		*model = MODEL_POWER7;
+	else if ( strstr( proc_str, "POWER9" ) != NULL )
+		*model = MODEL_POWER9;
+	else if ( strstr( proc_str, "POWER10" ) != NULL )
+		*model = MODEL_POWER10;
+
+	return VENDOR_IBM;
+#endif
+
 	//printf( "bli_cpuid_query(): beginning search\n" );
 
 	// Search /proc/cpuinfo for the 'Processor' entry.
diff --git a/frame/base/bli_cpuid.h b/frame/base/bli_cpuid.h
index 3fea78e5a..c10f36a1c 100644
--- a/frame/base/bli_cpuid.h
+++ b/frame/base/bli_cpuid.h
@@ -161,19 +161,23 @@ enum
 	FEATURE_AVX512VL = 0x4000
 };
 
-#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)
+#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_ARCH_PPC)
 
 char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath );
 
 enum
 {
 	VENDOR_ARM = 0,
+	VENDOR_IBM,
 	VENDOR_UNKNOWN
 };
 enum
 {
 	MODEL_ARMV7 = 0,
 	MODEL_ARMV8,
+	MODEL_POWER7,
+	MODEL_POWER9,
+	MODEL_POWER10,
 	MODEL_UNKNOWN
 };
 enum

From 6826c1cdfba855513786d9e3d606681316453398 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Mon, 25 Jul 2022 18:21:05 -0500
Subject: [PATCH 067/230] Add `#line` directives to flattened `blis.h`. (#643)

Details:
- Modified flatten-headers.py so that #line directives are inserted into
  the flattened blis.h file. This facilitates easier debugging when
  something is amiss in the flattened blis.h because the compiler will
  be able to refer to the line number within the original constituent
  header file (which is where the fix would go) rather than the line
  number within the flattened header (which is not as helpful).
---
 build/flatten-headers.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/build/flatten-headers.py b/build/flatten-headers.py
index 563725a7e..40fc2a450 100755
--- a/build/flatten-headers.py
+++ b/build/flatten-headers.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 #
-#  BLIS    
+#  BLIS
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
 #
@@ -215,9 +215,19 @@ def flatten_header( inputfile, header_dirpaths, cursp ):
 	# Open the input file to process.
 	ifile = open( inputfile, "r" )
 
+	# A counter to track the line number being parsed within the current file.
+	# This counter, when selectively encoded into the flattened header via #line
+	# directives, facilitates easier debugging. (When the compiler finds an
+	# issue, it will be able to refer to the line number within the constituent
+	# header file rather than the flattened one.)
+	lineno = 0
+
 	# Iterate over the lines in the file.
 	while True:
 
+		# Increment the line number.
+		lineno += 1
+
 		# Read a line in the file.
 		line = ifile.readline()
 
@@ -268,12 +278,14 @@ def flatten_header( inputfile, header_dirpaths, cursp ):
 
 				# Mark the beginning of the header being inserted.
 				ostring += "%s%s%c" % ( beginstr, header, '\n' )
+				ostring += "#line %d \"%s\"%c\n" % ( 1, header_path, '\n' )
 
 				# Recurse on the header, accumulating the string.
 				ostring += flatten_header( header_path, header_dirpaths, cursp + "  " )
 
 				# Mark the end of the header being inserted.
 				ostring += "%s%s%c" % ( endstr, header, '\n' )
+				ostring += "#line %d \"%s\"%c\n" % ( lineno+1, inputfile, '\n' )
 
 				echov2( "%sheader file '%s' fully processed." \
 				        % ( cursp, header_path ) )
@@ -300,7 +312,7 @@ def flatten_header( inputfile, header_dirpaths, cursp ):
 		# endif
 
 	# endwhile
-	
+
 	# Close the input file.
 	ifile.close()
 
@@ -330,7 +342,6 @@ def find_header_dirs( dirpath ):
 	#endfor
 
 	return header_dirpaths
-	
 
 # ------------------------------------------------------------------------------
 

From 4dde947e2ec9e139c162801320c94e6a01a39708 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 26 Jul 2022 17:29:32 -0500
Subject: [PATCH 068/230] Fixed out-of-bounds bug in sup s6x16m haswell kernel.

Details:
- Fixed another out-of-bounds read access bug in the haswell sup
  assembly kernels. This bug is similar to the one fixed in 17b0caa
  and affects bli_sgemmsup_rv_haswell_asm_6x2m(). Thanks to Madeesh
  Kannan for reporting this bug (and a suitable fix) in #635.
- CREDITS file update.
---
 CREDITS                                        |  1 +
 .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c  | 18 ++++++++++++------
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/CREDITS b/CREDITS
index bb2b3798f..aa1591334 100644
--- a/CREDITS
+++ b/CREDITS
@@ -47,6 +47,7 @@ but many others have contributed code and feedback, including
   Matthew Honnibal         @honnibal
   Stefan Husmann           @stefanhusmann
   Francisco Igual          @figual             (Universidad Complutense de Madrid)
+  Madeesh Kannan           @shadeMe
   Tony Kelman              @tkelman
   Lee Killough             @leekillough        (Cray)
   Mike Kistler             @mkistler           (IBM, Austin Research Laboratory)
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
index aacfd8d1f..b5424f09a 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
@@ -4477,32 +4477,38 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 	label(.SROWSTORED)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
+	vmovsd(mem(rcx, 0*32), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm4)
 	vmovsd(xmm4, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
+	vmovsd(mem(rcx, 0*32), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm6)
 	vmovsd(xmm6, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
+	vmovsd(mem(rcx, 0*32), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm8)
 	vmovsd(xmm8, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10)
+	vmovsd(mem(rcx, 0*32), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm10)
 	vmovsd(xmm10, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12)
+	vmovsd(mem(rcx, 0*32), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm12)
 	vmovsd(xmm12, mem(rcx, 0*32))
 	add(rdi, rcx)
 
 
-	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14)
+	vmovsd(mem(rcx, 0*32), xmm0)
+	vfmadd231ps(xmm0, xmm3, xmm14)
 	vmovsd(xmm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
 

From 56de31b00fa0f1ba866321817cd1e5d83000ff11 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Wed, 27 Jul 2022 13:54:17 -0500
Subject: [PATCH 069/230] Disable modification of KC in the gemmsup kernels.
 (#648)

Modifying KC led to a ~50% performance reduction for certain gemm operations (but not others?). See #644 for an example.
---
 frame/3/bli_l3_sup_var1n2m.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c
index a5d66783f..61c85d6e9 100644
--- a/frame/3/bli_l3_sup_var1n2m.c
+++ b/frame/3/bli_l3_sup_var1n2m.c
@@ -295,6 +295,9 @@ void PASTEMAC(ch,varname) \
 	const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
 	const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
 \
+	/* Disable modification of KC since it seems to negatively impact certain operations (#644). */ \
+	dim_t KC = KC0; \
+	/* \
 	dim_t KC; \
 	if      ( packa && packb ) \
 	{ \
@@ -320,7 +323,7 @@ void PASTEMAC(ch,varname) \
 		          stor_id == BLIS_CCR    ) KC = (( KC0 / 4 ) / 4 ) * 4; \
 		else                               KC = KC0; \
 	} \
-	else /* if ( !packa && !packb ) */ \
+	else *//* if ( !packa && !packb ) *//* \
 	{ \
 		if      ( FALSE                  ) KC = KC0; \
 		else if ( stor_id == BLIS_RRC || \
@@ -330,7 +333,7 @@ void PASTEMAC(ch,varname) \
 		else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
 		else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
 		else                               KC = (( KC0 / 5 ) / 4 ) * 4; \
-	} \
+	}*/ \
 \
 	/* Nudge NC up to a multiple of MR and MC up to a multiple of NR.
 	   NOTE: This is unique to variant 1 (ie: not performed in variant 2)
@@ -932,6 +935,9 @@ void PASTEMAC(ch,varname) \
 	const dim_t MC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
 	const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
 \
+	/* Disable modification of KC since it seems to negatively impact certain operations (#644). */ \
+	dim_t KC = KC0; \
+	/* \
 	dim_t KC; \
 	if      ( packa && packb ) \
 	{ \
@@ -957,7 +963,7 @@ void PASTEMAC(ch,varname) \
 		          stor_id == BLIS_CCR    ) KC = (( KC0 / 4 ) / 4 ) * 4; \
 		else                               KC = KC0; \
 	} \
-	else /* if ( !packa && !packb ) */ \
+	else *//* if ( !packa && !packb ) *//* \
 	{ \
 		if      ( stor_id == BLIS_RRR || \
 				  stor_id == BLIS_CCC    ) KC = KC0; \
@@ -968,7 +974,7 @@ void PASTEMAC(ch,varname) \
 		else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
 		else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
 		else                               KC = (( KC0 / 5 ) / 4 ) * 4; \
-	} \
+	}*/ \
 \
 	/* Query the maximum blocksize for NR, which implies a maximum blocksize
 	   extension for the final iteration. */ \

From 5b298935de7f20462bfad1893ed34ecd691cec5a Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 27 Jul 2022 19:14:15 -0500
Subject: [PATCH 070/230] Removed buggy cruft from power10 subconfig.

Details:
- Removed #defines for BLIS_BBN_s and BLIS_BBN_d from
  bli_kernel_defs_power10.h. These were inadvertently set in ae10d949
  because the power10 subconfig was registering bb packm ukernels, but
  only for 6xk (power10 uses s8x16 and d8x8 ukernels) and only because
  the original author (probably) copy-pasted from power9 when getting
  started. That 6xk packm registration was effectively "dead code"
  prior to ae10d949, but was then mistaken as not-dead code during the
  ae10d949 refactor. These improper bb factors may have been causing
  bugs in power10 builds. Thanks to Nicholai Tukanov for helping remind
  me what the power10 subconfig was supposed to look like.
- Removed extraneous microkernel preference registrations from power10
  subconfig. Preferences for single and double complex gemm were being
  registered despite there being no complex gemm ukernels registered to
  go with them. Similarly, there were trsm preferences registered
  without any trsm ukernels registered (and BLIS doesn't actually use a
  preference for the trsm ukernel anyway). These extraneous
  registrations were almost surely not hurting anything, even if they
  were quite misleading.
---
 config/power10/bli_cntx_init_power10.c   | 10 ----------
 config/power10/bli_kernel_defs_power10.h |  2 --
 2 files changed, 12 deletions(-)

diff --git a/config/power10/bli_cntx_init_power10.c b/config/power10/bli_cntx_init_power10.c
index 12d9f51c6..f662d5791 100644
--- a/config/power10/bli_cntx_init_power10.c
+++ b/config/power10/bli_cntx_init_power10.c
@@ -63,16 +63,6 @@ void bli_cntx_init_power10( cntx_t* cntx )
 	  // level-3
 	  BLIS_GEMM_UKR_ROW_PREF,   BLIS_FLOAT,    TRUE,
 	  BLIS_GEMM_UKR_ROW_PREF,   BLIS_DOUBLE,   TRUE,
-	  BLIS_GEMM_UKR_ROW_PREF,   BLIS_SCOMPLEX, FALSE,
-	  BLIS_GEMM_UKR_ROW_PREF,   BLIS_DCOMPLEX, FALSE,
-	  BLIS_TRSM_L_UKR_ROW_PREF, BLIS_FLOAT,    FALSE,
-	  BLIS_TRSM_U_UKR_ROW_PREF, BLIS_FLOAT,    FALSE,
-	  BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DOUBLE,   FALSE,
-	  BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DOUBLE,   FALSE,
-	  BLIS_TRSM_L_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
-	  BLIS_TRSM_U_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
-	  BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
-	  BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
 
 	  BLIS_VA_END
 	);
diff --git a/config/power10/bli_kernel_defs_power10.h b/config/power10/bli_kernel_defs_power10.h
index 4e32f1173..9b47a77c0 100644
--- a/config/power10/bli_kernel_defs_power10.h
+++ b/config/power10/bli_kernel_defs_power10.h
@@ -44,8 +44,6 @@
 #define BLIS_NR_s   16
 #define BLIS_NR_d   8
 
-#define BLIS_BBN_s   4
-#define BLIS_BBN_d   2
 
 //#endif
 

From a48e29d799091a833213efeafaf2d342ebdafde9 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 28 Jul 2022 10:11:07 -0500
Subject: [PATCH 071/230] CREDITS file update.

Details:
- Thanks to Kihiro Bando for assisting with issue #644.
---
 CREDITS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CREDITS b/CREDITS
index aa1591334..49361c801 100644
--- a/CREDITS
+++ b/CREDITS
@@ -16,6 +16,7 @@ but many others have contributed code and feedback, including
   Alex Arslan              @ararslan
   Vernon Austel                                (IBM, T.J. Watson Research Center)
   Satish Balay             @balay              (Argonne National Laboratory)
+  Kihiro Bando             @bandokihiro
   Matthew Brett            @matthew-brett      (University of Birmingham)
   Jérémie du Boisberranger @jeremiedbb
   Jed Brown                @jedbrown           (Argonne National Laboratory)

From bbaf29abd942de47a3a99a80a67d12bab41b27db Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 4 Aug 2022 17:51:37 -0500
Subject: [PATCH 072/230] Very minor variable updates to common.mk.

Details:
- Fixed a harmless bug that would have allowed C++ headers into the list
  of header suffixes specifically reserved for C99 headers. In practice,
  this would have had no substantive effect on anything since the core
  BLIS framework does not use C++ headers.
---
 common.mk | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/common.mk b/common.mk
index 6661f84c5..33713e9f5 100644
--- a/common.mk
+++ b/common.mk
@@ -342,7 +342,8 @@ SANDBOX_CXX_SUFS   := cc cpp cxx
 SANDBOX_SRC_SUFS   := $(SANDBOX_C99_SUFS) $(SANDBOX_CXX_SUFS)
 
 # Header suffixes.
-FRAME_HDR_SUFS     := h
+FRAME_H99_SUFS     := h
+FRAME_HDR_SUFS     := $(FRAME_H99_SUFS)
 
 ADDON_H99_SUFS     := h
 ADDON_HXX_SUFS     := hh hpp hxx
@@ -357,8 +358,8 @@ ALL_HDR_SUFS       := $(sort $(FRAME_HDR_SUFS) \
                              $(ADDON_HDR_SUFS) \
                              $(SANDBOX_HDR_SUFS) )
 
-ALL_H99_SUFS       := $(sort $(FRAME_HDR_SUFS) \
-                             $(ADDON_HDR_SUFS) \
+ALL_H99_SUFS       := $(sort $(FRAME_H99_SUFS) \
+                             $(ADDON_H99_SUFS) \
                              $(SANDBOX_H99_SUFS) )
 
 # The names of scripts that check output from the BLAS test drivers and

From 775148bcdbb1014b4881a76306f35f5d0fedecbe Mon Sep 17 00:00:00 2001
From: jdiamondGitHub <jeff_diamond@fastmail.com>
Date: Fri, 5 Aug 2022 12:01:24 -0500
Subject: [PATCH 073/230] Updated ARMv8a kernels to fix 2 prefetching issues.
 (#649)

Details:
- The ARMv8a dgemm/sgemm microkernels had 2 prefetching issues that
  impacted performance on modern ARM platforms. The most significant
  issue was that only a single prefetch per C tile column was issued.
  When a column of C was not cache aligned, the second cache line would
  not be prefetched at all, forcing the kernel to wait for an entire
  load to update elements of C. This happened with roughly 50% of the
  C prefetches. The fix was to have two prefetches per column, spaced
  64 bytes (1 cache line) apart.
- A secondary performance issue was that all the C prefetch instructions
  were issued sequentially at the beginning of the kernel call. This
  caused a noticeable performance slowdown. Interleaving the prefetch
  calls every 2-3 instructions in the prologue code solved the issue.
---
 kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 356 ++++++++++++--------
 1 file changed, 211 insertions(+), 145 deletions(-)

diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
index 94f0090bc..12c670a9f 100644
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
@@ -1,4 +1,4 @@
-    /*
+/*
 
    BLIS
    An object-based framework for developing high-performance BLAS-like
@@ -30,12 +30,20 @@
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-
 */
 
 #include "blis.h"
 #include "armv8a_asm_utils.h"
 
+// #define DISPLAY_DEBUG_INFO
+
+// Added prefetch fix for non-cacheline aligned C columns
+// (with the prefetches interleaved with other instructions)
+// to both sgemm and dgemm versions.
+
+// Added sgemm prefetch fix for non-cacheline aligned C columns
+// (with the prefetches interleaved with other instructions)
+
 /*
    o 4x4 Single precision micro-kernel fully functional.
    o Runnable on ARMv8, compiled with aarch64 GCC.
@@ -50,7 +58,13 @@
  * Tested on Juno Board. Around 15.9 GFLOPS, 2 x A57 cores @ 1.1 GHz.
  * Tested on Juno board. Around  3.1 GFLOPS, 1 x A53 core  @ 850 MHz.
  * Tested on Juno board. Around 12   GFLOPS, 4 x A53 cores @ 850 MHz.
+
+ * UPDATE JULY 2021 - Leick Robinson
+ * Both Microkernels changed to fix two prefetching performance bugs
+ * Tested on 2s Altra.   Around 6,900 GFLOPS, 160 x N2 cores @ 3.0 GHz
+ * Tested on 1s Altra Max. Arnd 5,800 GFLOPS. 128 x N2 cores @ 3.0 GHz
 */
+
 void bli_sgemm_armv8a_asm_8x12
      (
        dim_t               m,
@@ -86,73 +100,111 @@ void bli_sgemm_armv8a_asm_8x12
 	" ldr x1,%[baddr]                            \n\t" // Load address of B.
 	" ldr x2,%[caddr]                            \n\t" // Load address of C.
 	"                                            \n\t"
-	" ldr x5,%[k_iter]                           \n\t" // Number of unrolled iterations (k_iter).
-	" ldr x6,%[k_left]                           \n\t" // Number of remaining iterations (k_left).
-	"                                            \n\t"
 	" ldr x10,%[cs_c]                            \n\t" // Load cs_c.
 	" lsl x10,x10,#2                             \n\t" // cs_c * sizeof(float) -- AUX.
 	"                                            \n\t"
+	" ldr x5,%[k_iter]                           \n\t" // Number of unrolled iterations (k_iter).
+	" ldr x6,%[k_left]                           \n\t" // Number of remaining iterations (k_left).
+	" add x16,x2,x10                             \n\t" // Load address Column 1 of C
+	"                                            \n\t"
 	// " ldr x14,%[rs_c]                            \n\t" // Load rs_c.
 	// " lsl x14,x14,#2                             \n\t" // rs_c * sizeof(float).
 	"                                            \n\t"
-	" add x16,x2,x10                             \n\t" //Load address Column 1 of C
-	" add x17,x16,x10                            \n\t" //Load address Column 2 of C
-	" add x19,x17,x10                            \n\t" //Load address Column 3 of C
-	" add x20,x19,x10                            \n\t" //Load address Column 4 of C
-	" add x21,x20,x10                            \n\t" //Load address Column 5 of C
-	" add x22,x21,x10                            \n\t" //Load address Column 6 of C
-	" add x23,x22,x10                            \n\t" //Load address Column 7 of C
-	" add x24,x23,x10                            \n\t" //Load address Column 8 of C
-	" add x25,x24,x10                            \n\t" //Load address Column 9 of C
-	" add x26,x25,x10                            \n\t" //Load address Column 10 of C
-	" add x27,x26,x10                            \n\t" //Load address Column 11 of C
-	"                                            \n\t"
-	" prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
-	" prfm pldl1keep,[x16]                       \n\t" // Prefetch c.
-	" prfm pldl1keep,[x17]                       \n\t" // Prefetch c.
-	" prfm pldl1keep,[x19]                       \n\t" // Prefetch c.
-	" prfm pldl1keep,[x20]                       \n\t" // Prefetch c.
-	" prfm pldl1keep,[x21]                       \n\t" // Prefetch c.
-	" prfm pldl1keep,[x22]                       \n\t" // Prefetch c.
-	" prfm pldl1keep,[x23]                       \n\t" // Prefetch c.
-	" prfm pldl1keep,[x24]                       \n\t" // Prefetch c.
-	" prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
-	" prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
-	" prfm pldl1keep,[x27]                       \n\t" // Prefetch c.
 	"                                            \n\t"
 	" dup  v8.4s, wzr                            \n\t" // Vector for accummulating column 0
 	" prfm    PLDL1KEEP, [x1, #192]              \n\t"
+	" prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
+	" add x17,x16,x10                            \n\t" // Load address Column 2 of C
+
 	" dup  v9.4s, wzr                            \n\t" // Vector for accummulating column 0
 	" prfm    PLDL1KEEP, [x1, #256]              \n\t"
+	"                                            \n\t" // Since columns of C can cross a cache
+	                                                   // line boundary, we also need to prefetch
+	                                                   // the "ends."
+	" prfm pldl1keep,[x2, #16]                   \n\t" // Prefetch c.
+	" add x19,x17,x10                            \n\t" // Load address Column 3 of C
+
 	" dup  v10.4s, wzr                           \n\t" // Vector for accummulating column 1
 	" prfm    PLDL1KEEP, [x1, #320]              \n\t"
+	" prfm pldl1keep,[x16]                       \n\t" // Prefetch c.
+	" add x20,x19,x10                            \n\t" // Load address Column 4 of C
+
 	" dup  v11.4s, wzr                           \n\t" // Vector for accummulating column 1
+	" prfm pldl1keep,[x16]                       \n\t" // Prefetch c.
+	" prfm pldl1keep,[x16, #16]                  \n\t" // Prefetch c.
+
 	" dup  v12.4s, wzr                           \n\t" // Vector for accummulating column 2
+	" prfm pldl1keep,[x17]                       \n\t" // Prefetch c.
+	" add x21,x20,x10                            \n\t" // Load address Column 5 of C
+
 	" dup  v13.4s, wzr                           \n\t" // Vector for accummulating column 2
+	" prfm pldl1keep,[x17, #16]                  \n\t" // Prefetch c.
 	"                                            \n\t"
 	" dup  v14.4s, wzr                           \n\t" // Vector for accummulating column 3
 	" prfm    PLDL1KEEP, [x0, #128]              \n\t"
+	" prfm pldl1keep,[x19]                       \n\t" // Prefetch c.
+	" add x22,x21,x10                            \n\t" // Load address Column 6 of C
+
 	" dup  v15.4s, wzr                           \n\t" // Vector for accummulating column 3
 	" prfm    PLDL1KEEP, [x0, #192]              \n\t"
+	" prfm pldl1keep,[x19, #16]                  \n\t" // Prefetch c.
+
 	" dup  v16.4s, wzr                           \n\t" // Vector for accummulating column 4
+	" prfm pldl1keep,[x20]                       \n\t" // Prefetch c.
+	" add x23,x22,x10                            \n\t" // Load address Column 7 of C
+
 	" dup  v17.4s, wzr                           \n\t" // Vector for accummulating column 4
+	" prfm pldl1keep,[x20, #16]                  \n\t" // Prefetch c.
+
 	" dup  v18.4s, wzr                           \n\t" // Vector for accummulating column 5
+	" prfm pldl1keep,[x21]                       \n\t" // Prefetch c.
+	" add x24,x23,x10                            \n\t" // Load address Column 8 of C
+
 	" dup  v19.4s, wzr                           \n\t" // Vector for accummulating column 5
+	" prfm pldl1keep,[x21, #16]                  \n\t" // Prefetch c.
+
 	"                                            \n\t"
 	" dup  v20.4s, wzr                           \n\t" // Vector for accummulating column 6
+	" prfm pldl1keep,[x22]                       \n\t" // Prefetch c.
+	" add x25,x24,x10                            \n\t" // Load address Column 9 of C
+
 	" dup  v21.4s, wzr                           \n\t" // Vector for accummulating column 6
+	" prfm pldl1keep,[x22, #16]                  \n\t" // Prefetch c.
+
 	" dup  v22.4s, wzr                           \n\t" // Vector for accummulating column 7
+	" prfm pldl1keep,[x23]                       \n\t" // Prefetch c.
+	" add x26,x25,x10                            \n\t" // Load address Column 10 of C
+
 	" dup  v23.4s, wzr                           \n\t" // Vector for accummulating column 7
+	" prfm pldl1keep,[x23, #16]                  \n\t" // Prefetch c.
+
 	" dup  v24.4s, wzr                           \n\t" // Vector for accummulating column 8
+	" prfm pldl1keep,[x24]                       \n\t" // Prefetch c.
+	" add x27,x26,x10                            \n\t" // Load address Column 11 of C
+
 	" dup  v25.4s, wzr                           \n\t" // Vector for accummulating column 8
+	" prfm pldl1keep,[x24, #16]                  \n\t" // Prefetch c.
 	"                                            \n\t"
 	" dup  v26.4s, wzr                           \n\t" // Vector for accummulating column 9
+	" prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
+
 	" dup  v27.4s, wzr                           \n\t" // Vector for accummulating column 9
+	" prfm pldl1keep,[x25, #16]                  \n\t" // Prefetch c.
+
 	" dup  v28.4s, wzr                           \n\t" // Vector for accummulating column 10
+	" prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
+
 	" dup  v29.4s, wzr                           \n\t" // Vector for accummulating column 10
+	" prfm pldl1keep,[x26, #16]                  \n\t" // Prefetch c.
+
 	" dup  v30.4s, wzr                           \n\t" // Vector for accummulating column 11
+	" prfm pldl1keep,[x27]                       \n\t" // Prefetch c.
+
 	" dup  v31.4s, wzr                           \n\t" // Vector for accummulating column 11
+	" prfm pldl1keep,[x27, #16]                  \n\t" // Prefetch c.
+	"                                            \n\t"
 	"                                            \n\t"
+
 	" cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
 	BEQ(SCONSIDERKLEFT)
 	"                                            \n\t"
@@ -163,10 +215,10 @@ void bli_sgemm_armv8a_asm_8x12
 	" ldr q3, [x1, #16]                          \n\t"
 	" ldr q4, [x1, #32]                          \n\t"
 	"                                            \n\t"
-	" add x0, x0, #32                            \n\t" //update address of A
-	" add x1, x1, #48                            \n\t" //update address of B
+	" add x0, x0, #32                            \n\t" // Update address of A
+	" add x1, x1, #48                            \n\t" // Update address of B
 	"                                            \n\t"
-	" cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one.
+	" cmp x5,1                                   \n\t" // If there's only one k_iter, jump to it
 	BEQ(SLASTITER)                                     // (as loop is do-while-like).
 	"                                            \n\t"
 	LABEL(SLOOPKITER)                                  // Body of the k_iter loop.
@@ -206,7 +258,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
 	" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
 	" ldr q4, [x1, #32]                          \n\t"
-	"                                            \n\t" //End It 1
+	"                                            \n\t"                  // End It 1
 	"                                            \n\t"
 	" ldr q0, [x0, #32]                          \n\t"
 	" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
@@ -242,7 +294,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v29.4s,v6.4s,v4.s[2]                  \n\t" // Accummulate.
 	" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
 	" ldr q4, [x1, #80]                          \n\t"
-	"                                            \n\t" //End It 2
+	"                                            \n\t"                  // End It 2
 	"                                            \n\t"
 	" ldr q5, [x0, #64]                          \n\t"
 	" fmla v8.4s,v0.4s,v2.s[0]                   \n\t" // Accummulate.
@@ -276,7 +328,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
 	" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
 	" ldr q4, [x1, #128]                         \n\t"
-	"                                            \n\t" //End It 3
+	"                                            \n\t"                  // End It 3
 	"                                            \n\t"
 	" ldr q0, [x0, #96]                          \n\t"
 	" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
@@ -312,7 +364,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" ldr q4, [x1, #176]                         \n\t"
 	" add x1, x1, #192                           \n\t"
 	" add x0, x0, #128                           \n\t"
-	"                                            \n\t" //End It 4
+	"                                            \n\t"                  // End It 4
 	" sub x5,x5,1                                \n\t" // i-=1.
 	" cmp x5,1                                   \n\t" // Iterate again if we are not in k_iter == 1.
 	BNE(SLOOPKITER)
@@ -352,7 +404,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
 	" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
 	" ldr q4, [x1, #32]                          \n\t"
-	"                                            \n\t" //End It 1
+	"                                            \n\t"                  // End It 1
 	"                                            \n\t"
 	" ldr q0, [x0, #32]                          \n\t"
 	" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
@@ -386,7 +438,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v29.4s,v6.4s,v4.s[2]                  \n\t" // Accummulate.
 	" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
 	" ldr q4, [x1, #80]                          \n\t"
-	"                                            \n\t" //End It 2
+	"                                            \n\t"                  // End It 2
 	"                                            \n\t"
 	" ldr q5, [x0, #64]                          \n\t"
 	" fmla v8.4s,v0.4s,v2.s[0]                   \n\t" // Accummulate.
@@ -420,7 +472,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
 	" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
 	" ldr q4, [x1, #128]                         \n\t"
-	"                                            \n\t" //End It 3
+	"                                            \n\t"                  // End It 3
 	"                                            \n\t"
 	" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
 	" fmla v9.4s,v6.4s,v2.s[0]                   \n\t" // Accummulate.
@@ -451,7 +503,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
 	" add x1, x1, #144                           \n\t"
 	" add x0, x0, #96                            \n\t"
-	"                                            \n\t" //End It 4
+	"                                            \n\t"                  // End It 4
 	"                                            \n\t"
 	LABEL(SCONSIDERKLEFT)
 	" cmp x6,0                                   \n\t" // If k_left == 0, we are done.
@@ -521,11 +573,11 @@ void bli_sgemm_armv8a_asm_8x12
 	" fcmp s7,#0.0                               \n\t"
 	BEQ(SBETAZEROCOLSTOREDS1)                          // Taking care of the beta==0 case.
 	"                                            \n\t"
-	" ldr q0, [x2]                               \n\t" //Load column 0 of C
+	" ldr q0, [x2]                               \n\t" // Load column 0 of C
 	" ldr q1, [x2, #16]                          \n\t"
-	" ldr q2, [x16]                              \n\t" //Load column 1 of C
+	" ldr q2, [x16]                              \n\t" // Load column 1 of C
 	" ldr q3, [x16, #16]                         \n\t"
-	" ldr q4, [x17]                              \n\t" //Load column 2 of C
+	" ldr q4, [x17]                              \n\t" // Load column 2 of C
 	" ldr q5, [x17, #16]                         \n\t"
 	"                                            \n\t"
 	" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
@@ -544,11 +596,11 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v4.4s,v12.4s,v6.s[0]                  \n\t" // Scale by alpha
 	" fmla v5.4s,v13.4s,v6.s[0]                  \n\t" // Scale by alpha
 	"                                            \n\t"
-	" str q0, [x2]                               \n\t" //Store column 0 of C
+	" str q0, [x2]                               \n\t" // Store column 0 of C
 	" str q1, [x2, #16]                          \n\t"
-	" str q2, [x16]                              \n\t" //Store column 1 of C
+	" str q2, [x16]                              \n\t" // Store column 1 of C
 	" str q3, [x16, #16]                         \n\t"
-	" str q4, [x17]                              \n\t" //Store column 2 of C
+	" str q4, [x17]                              \n\t" // Store column 2 of C
 	" str q5, [x17, #16]                         \n\t"
 	"                                            \n\t"
 	" dup  v8.4s, wzr                            \n\t"
@@ -561,11 +613,11 @@ void bli_sgemm_armv8a_asm_8x12
 	" fcmp s7,#0.0                               \n\t"
 	BEQ(SBETAZEROCOLSTOREDS2)                          // Taking care of the beta==0 case.
 	"                                            \n\t"
-	" ldr q8, [x19]                              \n\t" //Load column 3 of C
+	" ldr q8, [x19]                              \n\t" // Load column 3 of C
 	" ldr q9, [x19, #16]                         \n\t"
-	" ldr q10, [x20]                             \n\t" //Load column 4 of C
+	" ldr q10, [x20]                             \n\t" // Load column 4 of C
 	" ldr q11, [x20, #16]                        \n\t"
-	" ldr q12, [x21]                             \n\t" //Load column 5 of C
+	" ldr q12, [x21]                             \n\t" // Load column 5 of C
 	" ldr q13, [x21, #16]                        \n\t"
 	"                                            \n\t"
 	" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
@@ -584,11 +636,11 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v12.4s,v18.4s,v6.s[0]                 \n\t" // Scale by alpha
 	" fmla v13.4s,v19.4s,v6.s[0]                 \n\t" // Scale by alpha
 	"                                            \n\t"
-	" str q8, [x19]                              \n\t" //Store column 3 of C
+	" str q8, [x19]                              \n\t" // Store column 3 of C
 	" str q9, [x19, #16]                         \n\t"
-	" str q10, [x20]                             \n\t" //Store column 4 of C
+	" str q10, [x20]                             \n\t" // Store column 4 of C
 	" str q11, [x20, #16]                        \n\t"
-	" str q12, [x21]                             \n\t" //Store column 5 of C
+	" str q12, [x21]                             \n\t" // Store column 5 of C
 	" str q13, [x21, #16]                        \n\t"
 	"                                            \n\t"
 	" dup  v0.4s, wzr                            \n\t"
@@ -601,11 +653,11 @@ void bli_sgemm_armv8a_asm_8x12
 	" fcmp s7,#0.0                               \n\t"
 	BEQ(SBETAZEROCOLSTOREDS3)                          // Taking care of the beta==0 case.
 	"                                            \n\t"
-	" ldr q0, [x22]                              \n\t" //Load column 6 of C
+	" ldr q0, [x22]                              \n\t" // Load column 6 of C
 	" ldr q1, [x22, #16]                         \n\t"
-	" ldr q2, [x23]                              \n\t" //Load column 7 of C
+	" ldr q2, [x23]                              \n\t" // Load column 7 of C
 	" ldr q3, [x23, #16]                         \n\t"
-	" ldr q4, [x24]                              \n\t" //Load column 8 of C
+	" ldr q4, [x24]                              \n\t" // Load column 8 of C
 	" ldr q5, [x24, #16]                         \n\t"
 	"                                            \n\t"
 	" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
@@ -624,11 +676,11 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v4.4s,v24.4s,v6.s[0]                  \n\t" // Scale by alpha
 	" fmla v5.4s,v25.4s,v6.s[0]                  \n\t" // Scale by alpha
 	"                                            \n\t"
-	" str q0, [x22]                              \n\t" //Store column 6 of C
+	" str q0, [x22]                              \n\t" // Store column 6 of C
 	" str q1, [x22, #16]                         \n\t"
-	" str q2, [x23]                              \n\t" //Store column 7 of C
+	" str q2, [x23]                              \n\t" // Store column 7 of C
 	" str q3, [x23, #16]                         \n\t"
-	" str q4, [x24]                              \n\t" //Store column 8 of C
+	" str q4, [x24]                              \n\t" // Store column 8 of C
 	" str q5, [x24, #16]                         \n\t"
 	"                                            \n\t"
 	" dup  v8.4s, wzr                            \n\t"
@@ -641,11 +693,11 @@ void bli_sgemm_armv8a_asm_8x12
 	" fcmp s7,#0.0                               \n\t"
 	BEQ(SBETAZEROCOLSTOREDS4)                          // Taking care of the beta==0 case.
 	"                                            \n\t"
-	" ldr q8, [x25]                              \n\t" //Load column 9 of C
+	" ldr q8, [x25]                              \n\t" // Load column 9 of C
 	" ldr q9, [x25, #16]                         \n\t"
-	" ldr q10, [x26]                             \n\t" //Load column 10 of C
+	" ldr q10, [x26]                             \n\t" // Load column 10 of C
 	" ldr q11, [x26, #16]                        \n\t"
-	" ldr q12, [x27]                             \n\t" //Load column 11 of C
+	" ldr q12, [x27]                             \n\t" // Load column 11 of C
 	" ldr q13, [x27, #16]                        \n\t"
 	"                                            \n\t"
 	" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
@@ -667,11 +719,11 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v12.4s,v30.4s,v6.s[0]                 \n\t" // Scale by alpha
 	" fmla v13.4s,v31.4s,v6.s[0]                 \n\t" // Scale by alpha
 	"                                            \n\t"
-	" str q8, [x25]                              \n\t" //Store column 9 of C
+	" str q8, [x25]                              \n\t" // Store column 9 of C
 	" str q9, [x25, #16]                         \n\t"
-	" str q10, [x26]                             \n\t" //Store column 10 of C
+	" str q10, [x26]                             \n\t" // Store column 10 of C
 	" str q11, [x26, #16]                        \n\t"
-	" str q12, [x27]                             \n\t" //Store column 11 of C
+	" str q12, [x27]                             \n\t" // Store column 11 of C
 	" str q13, [x27, #16]                        \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
@@ -729,7 +781,13 @@ void bli_sgemm_armv8a_asm_8x12
  * Tested on Juno Board. Around 7.6 GFLOPS, 2 x A57 cores @ 1.1 GHz.
  * Tested on Juno board. Around 1.5 GFLOPS, 1 x A53 core  @ 850 MHz.
  * Tested on Juno board. Around 5.5 GFLOPS, 4 x A53 cores @ 850 MHz.
-*/
+
+ * UPDATE JULY 2021 - Leick Robinson
+ * Both Microkernels changed to fix two prefetching performance bugs
+ * Tested on 2s Altra. Around 3,200 GFLOPS, 160 x N2 cores @ 3.0 GHz
+ * Tested on 1s Altra. Around 1,700 GFLOPS,  80 x N2 cores @ 3.0 GHz
+ * Tested on 1s Altra Max,  ~ 2,600 GFLOPS, 128 x N2 cores @ 3.0 GHz
+ */
 void bli_dgemm_armv8a_asm_6x8
      (
        dim_t               m,
@@ -744,6 +802,21 @@ void bli_dgemm_armv8a_asm_6x8
        cntx_t*             cntx
      )
 {
+#ifdef DISPLAY_DEBUG_INFO
+
+	static bool bFirstTime = true;
+
+	if ( bFirstTime )
+	{
+		printf( "In bli_dgemm_armv8a_asm_6x8: rs_c0=%d, cs_c0=%d \n",
+		        (int) rs_c0, (int) cs_c0 );
+		fflush( stdout );
+		bFirstTime = false;
+	}
+
+#endif
+
+
 	void* a_next = bli_auxinfo_next_a( data );
 	void* b_next = bli_auxinfo_next_b( data );
 
@@ -763,31 +836,17 @@ void bli_dgemm_armv8a_asm_6x8
 	" ldr x1,%[baddr]                            \n\t" // Load address of B
 	" ldr x2,%[caddr]                            \n\t" // Load address of C
 	"                                            \n\t"
-	" ldr x5,%[k_iter]                           \n\t" // Init guard (k_iter)
-	" ldr x6,%[k_left]                           \n\t" // Init guard (k_iter)
-	"                                            \n\t"
 	" ldr x10,%[cs_c]                            \n\t" // Load cs_c
 	" lsl x10,x10,#3                             \n\t" // cs_c * sizeof(double)
 	"                                            \n\t"
+	" ldr x5,%[k_iter]                           \n\t" // Init guard (k_iter)
+	" ldr x6,%[k_left]                           \n\t" // Init guard (k_left)
+	" add x20,x2,x10                             \n\t" // Load address Column 1 of C
+	"                                            \n\t"
 	// " ldr x14,%[rs_c]                            \n\t" // Load rs_c.
 	// " lsl x14,x14,#3                             \n\t" // rs_c * sizeof(double).
 	"                                            \n\t"
-	" add x20,x2,x10                             \n\t" //Load address Column 1 of C
-	" add x21,x20,x10                            \n\t" //Load address Column 2 of C
-	" add x22,x21,x10                            \n\t" //Load address Column 3 of C
-	" add x23,x22,x10                            \n\t" //Load address Column 4 of C
-	" add x24,x23,x10                            \n\t" //Load address Column 5 of C
-	" add x25,x24,x10                            \n\t" //Load address Column 6 of C
-	" add x26,x25,x10                            \n\t" //Load address Column 7 of C
 	"                                            \n\t"
-	" prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
-	" prfm pldl1keep,[x20]                       \n\t" // Prefetch c.
-	" prfm pldl1keep,[x21]                       \n\t" // Prefetch c.
-	" prfm pldl1keep,[x22]                       \n\t" // Prefetch c.
-	" prfm pldl1keep,[x23]                       \n\t" // Prefetch c.
-	" prfm pldl1keep,[x24]                       \n\t" // Prefetch c.
-	" prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
-	" prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
 	"                                            \n\t"
 	" dup  v8.2d, xzr                            \n\t" // Vector for accummulating column 0
 	" prfm    PLDL1KEEP, [x1, #256]              \n\t"
@@ -798,33 +857,75 @@ void bli_dgemm_armv8a_asm_6x8
 	" dup  v11.2d, xzr                           \n\t" // Vector for accummulating column 1
 	" prfm    PLDL1KEEP, [x1, #448]              \n\t"
 	" dup  v12.2d, xzr                           \n\t" // Vector for accummulating column 1
+	" prfm    PLDL1KEEP, [x0, #192]              \n\t"
+	" add x21,x20,x10                            \n\t" // Load address Column 2 of C
+
 	" dup  v13.2d, xzr                           \n\t" // Vector for accummulating column 1
+	" prfm    PLDL1KEEP, [x0, #256]              \n\t"
 	"                                            \n\t"
 	" dup  v14.2d, xzr                           \n\t" // Vector for accummulating column 2
-	" prfm    PLDL1KEEP, [x0, #192]              \n\t"
+	" prfm    PLDL1KEEP, [x0, #320]              \n\t"
+	" add x22,x21,x10                            \n\t" // Load address Column 3 of C
+
 	" dup  v15.2d, xzr                           \n\t" // Vector for accummulating column 2
-	" prfm    PLDL1KEEP, [x0, #256]              \n\t"
+	" prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
 	" dup  v16.2d, xzr                           \n\t" // Vector for accummulating column 2
-	" prfm    PLDL1KEEP, [x0, #320]              \n\t"
+	"                                            \n\t" // Since columns of C can cross a cache
+	                                                   // line boundary, we also need to prefetch
+	                                                   // the "ends."
+	" prfm pldl1keep,[x2, #32]                   \n\t" // Prefetch c.
+	" add x23,x22,x10                            \n\t" // Load address Column 4 of C
+
 	" dup  v17.2d, xzr                           \n\t" // Vector for accummulating column 3
+	" prfm pldl1keep,[x20]                       \n\t" // Prefetch c.
+
 	" dup  v18.2d, xzr                           \n\t" // Vector for accummulating column 3
+	" prfm pldl1keep,[x20, #32]                  \n\t" // Prefetch c.
+	" add x24,x23,x10                            \n\t" // Load address Column 5 of C
+
 	" dup  v19.2d, xzr                           \n\t" // Vector for accummulating column 3
+	" prfm pldl1keep,[x21]                       \n\t" // Prefetch c.
 	"                                            \n\t"
+
 	" dup  v20.2d, xzr                           \n\t" // Vector for accummulating column 4
+	" prfm pldl1keep,[x21, #32]                  \n\t" // Prefetch c.
+	" add x25,x24,x10                            \n\t" // Load address Column 6 of C
+
 	" dup  v21.2d, xzr                           \n\t" // Vector for accummulating column 4
+	" prfm pldl1keep,[x22]                       \n\t" // Prefetch c.
+
 	" dup  v22.2d, xzr                           \n\t" // Vector for accummulating column 4
+	" prfm pldl1keep,[x22, #32]                  \n\t" // Prefetch c.
+	" add x26,x25,x10                            \n\t" // Load address Column 7 of C
+
 	" dup  v23.2d, xzr                           \n\t" // Vector for accummulating column 5
+	" prfm pldl1keep,[x23]                       \n\t" // Prefetch c.
+
 	" dup  v24.2d, xzr                           \n\t" // Vector for accummulating column 5
+	" prfm pldl1keep,[x23, #32]                  \n\t" // Prefetch c.
+
 	" dup  v25.2d, xzr                           \n\t" // Vector for accummulating column 5
+	" prfm pldl1keep,[x24]                       \n\t" // Prefetch c.
 	"                                            \n\t"
 	" dup  v26.2d, xzr                           \n\t" // Vector for accummulating column 6
+	" prfm pldl1keep,[x24, #32]                  \n\t" // Prefetch c.
+
 	" dup  v27.2d, xzr                           \n\t" // Vector for accummulating column 6
+	" prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
+
 	" dup  v28.2d, xzr                           \n\t" // Vector for accummulating column 6
+	" prfm pldl1keep,[x25, #32]                  \n\t" // Prefetch c.
+
 	" dup  v29.2d, xzr                           \n\t" // Vector for accummulating column 7
+	" prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
+
 	" dup  v30.2d, xzr                           \n\t" // Vector for accummulating column 7
+	" prfm pldl1keep,[x26, #32]                  \n\t" // Prefetch c.
+
 	" dup  v31.2d, xzr                           \n\t" // Vector for accummulating column 7
 	"                                            \n\t"
 	"                                            \n\t"
+
 	" cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
 	BEQ(DCONSIDERKLEFT)
 	"                                            \n\t"
@@ -837,10 +938,10 @@ void bli_dgemm_armv8a_asm_6x8
 	" ldr q5, [x1, #32]                          \n\t"
 	" ldr q6, [x1, #48]                          \n\t"
 	"                                            \n\t"
-	" add x0, x0, #48                            \n\t" //update address of A
-	" add x1, x1, #64                            \n\t" //update address of B
+	" add x0, x0, #48                            \n\t" // Update address of A
+	" add x1, x1, #64                            \n\t" // Update address of B
 	"                                            \n\t"
-	" cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one.
+	" cmp x5,1                                   \n\t" // If there's only one k_iter, jump to it
 	BEQ(DLASTITER)                                     // (as loop is do-while-like).
 	"                                            \n\t"
 	LABEL(DLOOP)                                       // Body
@@ -930,7 +1031,7 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
 	" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
 	" ldr q6, [x1, #112]                         \n\t"
-	"                                            \n\t"                  //End it 2
+	"                                            \n\t"                  // End it 2
 	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
 	" prfm    PLDL1KEEP, [x0, #464]              \n\t"
 	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
@@ -1011,7 +1112,7 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
 	" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
 	" ldr q6, [x1, #240]                         \n\t"
-	"                                            \n\t"                  //End it 4
+	"                                            \n\t"                  // End it 4
 	" add x0, x0, #192                           \n\t"
 	" add x1, x1, #256                           \n\t"
 	"                                            \n\t"
@@ -1100,7 +1201,7 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
 	" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
 	" ldr q6, [x1, #112]                         \n\t"
-	"                                            \n\t"                  //End it 2
+	"                                            \n\t"                  // End it 2
 	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
 	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
 	" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
@@ -1174,7 +1275,7 @@ void bli_dgemm_armv8a_asm_6x8
 	"                                            \n\t"
 	" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
 	" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
-	"                                            \n\t"                  //End it 4
+	"                                            \n\t"                  // End it 4
 	" add x0, x0, #144                           \n\t"
 	"                                            \n\t"
 	LABEL(DCONSIDERKLEFT)
@@ -1253,11 +1354,11 @@ void bli_dgemm_armv8a_asm_6x8
 	" fcmp d7,#0.0                               \n\t"
 	BEQ(DBETAZEROCOLSTOREDS1)                          // Taking care of the beta==0 case.
 	"                                            \n\t"
-	" ldr q0, [x2]                               \n\t" //Load column 0 of C
+	" ldr q0, [x2]                               \n\t" // Load column 0 of C
 	" ldr q1, [x2, #16]                          \n\t"
 	" ldr q2, [x2, #32]                          \n\t"
 	"                                            \n\t"
-	" ldr q3, [x20]                              \n\t" //Load column 1 of C
+	" ldr q3, [x20]                              \n\t" // Load column 1 of C
 	" ldr q4, [x20, #16]                         \n\t"
 	" ldr q5, [x20, #32]                         \n\t"
 	"                                            \n\t"
@@ -1277,11 +1378,11 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v4.2d,v12.2d,v6.d[0]                  \n\t" // Scale by alpha
 	" fmla v5.2d,v13.2d,v6.d[0]                  \n\t" // Scale by alpha
 	"                                            \n\t"
-	" str q0, [x2]                               \n\t" //Store column 0 of C
+	" str q0, [x2]                               \n\t" // Store column 0 of C
 	" str q1, [x2, #16]                          \n\t"
 	" str q2, [x2, #32]                          \n\t"
 	"                                            \n\t"
-	" str q3, [x20]                              \n\t" //Store column 1 of C
+	" str q3, [x20]                              \n\t" // Store column 1 of C
 	" str q4, [x20, #16]                         \n\t"
 	" str q5, [x20, #32]                         \n\t"
 	"                                            \n\t"
@@ -1295,11 +1396,11 @@ void bli_dgemm_armv8a_asm_6x8
 	" fcmp d7,#0.0                               \n\t"
 	BEQ(DBETAZEROCOLSTOREDS2)                          // Taking care of the beta==0 case.
 	"                                            \n\t"
-	" ldr q8, [x21]                              \n\t" //Load column 2 of C
+	" ldr q8, [x21]                              \n\t" // Load column 2 of C
 	" ldr q9, [x21, #16]                         \n\t"
 	" ldr q10, [x21, #32]                        \n\t"
 	"                                            \n\t"
-	" ldr q11, [x22]                             \n\t" //Load column 3 of C
+	" ldr q11, [x22]                             \n\t" // Load column 3 of C
 	" ldr q12, [x22, #16]                        \n\t"
 	" ldr q13, [x22, #32]                        \n\t"
 	"                                            \n\t"
@@ -1319,11 +1420,11 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v12.2d,v18.2d,v6.d[0]                 \n\t" // Scale by alpha
 	" fmla v13.2d,v19.2d,v6.d[0]                 \n\t" // Scale by alpha
 	"                                            \n\t"
-	" str q8, [x21]                              \n\t" //Store column 2 of C
+	" str q8, [x21]                              \n\t" // Store column 2 of C
 	" str q9, [x21, #16]                         \n\t"
 	" str q10, [x21, #32]                        \n\t"
 	"                                            \n\t"
-	" str q11, [x22]                             \n\t" //Store column 3 of C
+	" str q11, [x22]                             \n\t" // Store column 3 of C
 	" str q12, [x22, #16]                        \n\t"
 	" str q13, [x22, #32]                        \n\t"
 	"                                            \n\t"
@@ -1337,11 +1438,11 @@ void bli_dgemm_armv8a_asm_6x8
 	" fcmp d7,#0.0                               \n\t"
 	BEQ(DBETAZEROCOLSTOREDS3)                          // Taking care of the beta==0 case.
 	"                                            \n\t"
-	" ldr q0, [x23]                              \n\t" //Load column 4 of C
+	" ldr q0, [x23]                              \n\t" // Load column 4 of C
 	" ldr q1, [x23, #16]                         \n\t"
 	" ldr q2, [x23, #32]                         \n\t"
 	"                                            \n\t"
-	" ldr q3, [x24]                              \n\t" //Load column 5 of C
+	" ldr q3, [x24]                              \n\t" // Load column 5 of C
 	" ldr q4, [x24, #16]                         \n\t"
 	" ldr q5, [x24, #32]                         \n\t"
 	"                                            \n\t"
@@ -1361,11 +1462,11 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v4.2d,v24.2d,v6.d[0]                  \n\t" // Scale by alpha
 	" fmla v5.2d,v25.2d,v6.d[0]                  \n\t" // Scale by alpha
 	"                                            \n\t"
-	" str q0, [x23]                              \n\t" //Store column 4 of C
+	" str q0, [x23]                              \n\t" // Store column 4 of C
 	" str q1, [x23, #16]                         \n\t"
 	" str q2, [x23, #32]                         \n\t"
 	"                                            \n\t"
-	" str q3, [x24]                              \n\t" //Store column 5 of C
+	" str q3, [x24]                              \n\t" // Store column 5 of C
 	" str q4, [x24, #16]                         \n\t"
 	" str q5, [x24, #32]                         \n\t"
 	"                                            \n\t"
@@ -1379,11 +1480,11 @@ void bli_dgemm_armv8a_asm_6x8
 	" fcmp d7,#0.0                               \n\t"
 	BEQ(DBETAZEROCOLSTOREDS4)                          // Taking care of the beta==0 case.
 	"                                            \n\t"
-	" ldr q8, [x25]                              \n\t" //Load column 6 of C
+	" ldr q8, [x25]                              \n\t" // Load column 6 of C
 	" ldr q9, [x25, #16]                         \n\t"
 	" ldr q10, [x25, #32]                        \n\t"
 	"                                            \n\t"
-	" ldr q11, [x26]                             \n\t" //Load column 7 of C
+	" ldr q11, [x26]                             \n\t" // Load column 7 of C
 	" ldr q12, [x26, #16]                        \n\t"
 	" ldr q13, [x26, #32]                        \n\t"
 	"                                            \n\t"
@@ -1406,11 +1507,11 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v12.2d,v30.2d,v6.d[0]                 \n\t" // Scale by alpha
 	" fmla v13.2d,v31.2d,v6.d[0]                 \n\t" // Scale by alpha
 	"                                            \n\t"
-	" str q8, [x25]                              \n\t" //Store column 6 of C
+	" str q8, [x25]                              \n\t" // Store column 6 of C
 	" str q9, [x25, #16]                         \n\t"
 	" str q10, [x25, #32]                        \n\t"
 	"                                            \n\t"
-	" str q11, [x26]                             \n\t" //Store column 7 of C
+	" str q11, [x26]                             \n\t" // Store column 7 of C
 	" str q12, [x26, #16]                        \n\t"
 	" str q13, [x26, #32]                        \n\t"
 	"                                            \n\t"
@@ -1450,39 +1551,4 @@ void bli_dgemm_armv8a_asm_6x8
 	GEMM_UKR_FLUSH_CT( d );
 }
 
-
-#if 0
-void bli_cgemm_armv8a_opt_4x4
-     (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       scomplex*  restrict alpha,
-       scomplex*  restrict a,
-       scomplex*  restrict b,
-       scomplex*  restrict beta,
-       scomplex*  restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
-     )
-{
-}
-
-void bli_zgemm_armv8a_opt_4x4
-     (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       dcomplex*  restrict alpha,
-       dcomplex*  restrict a,
-       dcomplex*  restrict b,
-       dcomplex*  restrict beta,
-       dcomplex*  restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
-     )
-{
-}
-
-#endif
-
+// June 2022, removed unused stubs for ancient 4x4 kernels

From 9e5594ad5fc41df8ef2825a025d7844ac2275c27 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 11 Aug 2022 14:36:38 -0500
Subject: [PATCH 074/230] Temporarily disabled #line directives from 6826c1c.

Details:
- Commented out the inclusion of #line preprocessor directives in the
  flattened header output provided by build/flatten-headers.py. This
  output was added recently in 6826c1c, but was later found to have
  thrown off the line numbering referenced by compiler warnings and
  errors (possibly due to license comment blocks, which are stripped
  from source headers as they are inlined into the monolithic header).
---
 build/flatten-headers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/build/flatten-headers.py b/build/flatten-headers.py
index 40fc2a450..ecd4635d1 100755
--- a/build/flatten-headers.py
+++ b/build/flatten-headers.py
@@ -278,14 +278,14 @@ def flatten_header( inputfile, header_dirpaths, cursp ):
 
 				# Mark the beginning of the header being inserted.
 				ostring += "%s%s%c" % ( beginstr, header, '\n' )
-				ostring += "#line %d \"%s\"%c\n" % ( 1, header_path, '\n' )
+				#ostring += "#line %d \"%s\"%c\n" % ( 1, header_path, '\n' )
 
 				# Recurse on the header, accumulating the string.
 				ostring += flatten_header( header_path, header_dirpaths, cursp + "  " )
 
 				# Mark the end of the header being inserted.
 				ostring += "%s%s%c" % ( endstr, header, '\n' )
-				ostring += "#line %d \"%s\"%c\n" % ( lineno+1, inputfile, '\n' )
+				#ostring += "#line %d \"%s\"%c\n" % ( lineno+1, inputfile, '\n' )
 
 				echov2( "%sheader file '%s' fully processed." \
 				        % ( cursp, header_path ) )

From dfa54139664a42d29774e140ec9e5597af869a76 Mon Sep 17 00:00:00 2001
From: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
Date: Tue, 30 Aug 2022 08:07:50 +0800
Subject: [PATCH 075/230] Arm64 dgemmsup with extended MR&NR (#655)

Details:
- Since NEON has a large number of registers but each register is short,
  this commit extends both MR and NR.
- The approach is to represent the C microtile in registers optionally
  in columns, so for sizes like 6x7m, the 'crr' kernel is the default
  with 'rrr' supported through an in-register transpose.
- A few asm kernels are crafted for 'rv' to complete this extended size
  support.
- For 'rd' I'm still relying heavily on C99 intrinsic kernels with
  branching so the performance might not be optimal. (Sorry for that.)
- So far, these changes only affect the 'firestorm' subconfig.
- This commit also contains row-preferential s12x8 and d6x8 gemm
  ukernels. These microkernels are templatized versions of the existing
  s8x12 and d6x8 ukernels defined in bli_gemm_armv8a_asm_d6x8.c.
---
 config/firestorm/bli_cntx_init_firestorm.c    |  32 +-
 kernels/armv8a/3/armv8a_asm_utils.h           |  40 ++
 kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c  | 605 ++++++++++++++++++
 .../sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c     |   0
 kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c | 450 -------------
 .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c   | 190 ++++--
 .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c   | 268 ++++----
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c   |   3 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c   | 482 ++++++++++++++
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c   | 475 ++++++++++++++
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c   | 477 ++++++++++++++
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c   | 513 +++++++++++++++
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c   | 126 ++--
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c   |  64 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c   |   1 -
 kernels/armv8a/bli_kernels_armv8a.h           |   6 +
 16 files changed, 3020 insertions(+), 712 deletions(-)
 create mode 100644 kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c
 rename kernels/armv8a/3/{ => old}/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c (100%)
 delete mode 100644 kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c
 create mode 100644 kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c
 create mode 100644 kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c
 create mode 100644 kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c
 create mode 100644 kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c

diff --git a/config/firestorm/bli_cntx_init_firestorm.c b/config/firestorm/bli_cntx_init_firestorm.c
index 8e4d0088d..bfc7f24b9 100644
--- a/config/firestorm/bli_cntx_init_firestorm.c
+++ b/config/firestorm/bli_cntx_init_firestorm.c
@@ -49,14 +49,14 @@ void bli_cntx_init_firestorm( cntx_t* cntx )
 	  cntx,
 
 	  // level-3
-	  BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armv8a_asm_8x12,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,
+	  BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armv8a_asm_12x8r,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_8x6r,
 
 	  // packm
-	  BLIS_PACKM_MRXK_KER, BLIS_FLOAT,  bli_spackm_armv8a_int_8xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_FLOAT,  bli_spackm_armv8a_int_12xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_FLOAT,  bli_spackm_armv8a_int_12xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_FLOAT,  bli_spackm_armv8a_int_8xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk,
 
 	  // gemmsup
 	  BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m,
@@ -77,8 +77,8 @@ void bli_cntx_init_firestorm( cntx_t* cntx )
 	  cntx,
 
 	  // level-3
-	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,  FALSE,
-	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,  TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
 
 	  // gemmsup
 	  BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
@@ -95,11 +95,11 @@ void bli_cntx_init_firestorm( cntx_t* cntx )
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
 	//                                           s      d      c      z
-	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     8,     6,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    12,     8,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   120,   252,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   640,  3072,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  3072,  8192,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    12,     8,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     8,     6,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   480,   256,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],  4096,  3072,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  9600,  8184,    -1,    -1 );
 
 	// Initialize sup thresholds with architecture-appropriate values.
 	//                                          s     d     c     z
@@ -110,8 +110,10 @@ void bli_cntx_init_firestorm( cntx_t* cntx )
 	// Initialize level-3 sup blocksize objects with architecture-specific
 	// values.
 	//                                               s      d      c      z
-	bli_blksz_init_easy( &blkszs[ BLIS_MR_SUP ],    -1,     6,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ],    -1,     8,    -1,    -1 );
+	bli_blksz_init     ( &blkszs[ BLIS_MR_SUP ],    -1,     6,    -1,    -1,
+	                                                -1,     9,    -1,    -1 );
+	bli_blksz_init     ( &blkszs[ BLIS_NR_SUP ],    -1,     8,    -1,    -1,
+	                                                -1,    13,    -1,    -1 );
 	bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ],    -1,   240,    -1,    -1 );
 	bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ],    -1,  1024,    -1,    -1 );
 	bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ],    -1,  3072,    -1,    -1 );
diff --git a/kernels/armv8a/3/armv8a_asm_utils.h b/kernels/armv8a/3/armv8a_asm_utils.h
index 0c405dfd2..061cea66d 100644
--- a/kernels/armv8a/3/armv8a_asm_utils.h
+++ b/kernels/armv8a/3/armv8a_asm_utils.h
@@ -61,6 +61,18 @@
   CLEAR4V(V4,V5,V6,V7)
 
 // Scale vectors.
+#define SSCALE1V(V,A,IDX) \
+" fmul  v"#V".4s, v"#V".4s, v"#A".s["#IDX"] \n\t"
+#define SSCALE2V(V0,V1,A,IDX) \
+  SSCALE1V(V0,A,IDX) \
+  SSCALE1V(V1,A,IDX)
+#define SSCALE4V(V0,V1,V2,V3,A,IDX) \
+  SSCALE2V(V0,V1,A,IDX) \
+  SSCALE2V(V2,V3,A,IDX)
+#define SSCALE8V(V0,V1,V2,V3,V4,V5,V6,V7,A,IDX) \
+  SSCALE4V(V0,V1,V2,V3,A,IDX) \
+  SSCALE4V(V4,V5,V6,V7,A,IDX)
+
 #define DSCALE1V(V,A,IDX) \
 " fmul  v"#V".2d, v"#V".2d, v"#A".d["#IDX"] \n\t"
 #define DSCALE2V(V0,V1,A,IDX) \
@@ -74,6 +86,18 @@
   DSCALE4V(V4,V5,V6,V7,A,IDX)
 
 // Scale-accumulate.
+#define SSCALEA1V(D,S,A,IDX) \
+" fmla  v"#D".4s, v"#S".4s, v"#A".s["#IDX"] \n\t"
+#define SSCALEA2V(D0,D1,S0,S1,A,IDX) \
+  SSCALEA1V(D0,S0,A,IDX) \
+  SSCALEA1V(D1,S1,A,IDX)
+#define SSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \
+  SSCALEA2V(D0,D1,S0,S1,A,IDX) \
+  SSCALEA2V(D2,D3,S2,S3,A,IDX)
+#define SSCALEA8V(D0,D1,D2,D3,D4,D5,D6,D7,S0,S1,S2,S3,S4,S5,S6,S7,A,IDX) \
+  SSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \
+  SSCALEA4V(D4,D5,D6,D7,S4,S5,S6,S7,A,IDX)
+
 #define DSCALEA1V(D,S,A,IDX) \
 " fmla  v"#D".2d, v"#S".2d, v"#A".d["#IDX"] \n\t"
 #define DSCALEA2V(D0,D1,S0,S1,A,IDX) \
@@ -95,8 +119,16 @@
 #define DLOAD4V(V0,V1,V2,V3,ADDR,SHIFT) \
   DLOAD2V(V0,V1,ADDR,SHIFT) \
   DLOAD2V(V2,V3,ADDR,SHIFT+32)
+#define SLOAD1V DLOAD1V
+#define SLOAD2V DLOAD2V
+#define SLOAD4V DLOAD4V
 
 // Generic: load one line.
+#define SLOAD1V_GATHER_ELMFWD(V,ADDR,INC) \
+" ld1   {v"#V".s}[0], ["#ADDR"], "#INC" \n\t" \
+" ld1   {v"#V".s}[1], ["#ADDR"], "#INC" \n\t" \
+" ld1   {v"#V".s}[2], ["#ADDR"], "#INC" \n\t" \
+" ld1   {v"#V".s}[3], ["#ADDR"], "#INC" \n\t"
 #define DLOAD1V_GATHER_ELMFWD(V,ADDR,INC) \
 " ld1   {v"#V".d}[0], ["#ADDR"], "#INC" \n\t" \
 " ld1   {v"#V".d}[1], ["#ADDR"], "#INC" \n\t"
@@ -110,8 +142,16 @@
 #define DSTORE4V(V0,V1,V2,V3,ADDR,SHIFT) \
   DSTORE2V(V0,V1,ADDR,SHIFT) \
   DSTORE2V(V2,V3,ADDR,SHIFT+32)
+#define SSTORE1V DSTORE1V
+#define SSTORE2V DSTORE2V
+#define SSTORE4V DSTORE4V
 
 // Generic: store one line.
+#define SSTORE1V_SCATTER_ELMFWD(V,ADDR,INC) \
+" st1   {v"#V".s}[0], ["#ADDR"], "#INC" \n\t" \
+" st1   {v"#V".s}[1], ["#ADDR"], "#INC" \n\t" \
+" st1   {v"#V".s}[2], ["#ADDR"], "#INC" \n\t" \
+" st1   {v"#V".s}[3], ["#ADDR"], "#INC" \n\t"
 #define DSTORE1V_SCATTER_ELMFWD(V,ADDR,INC) \
 " st1   {v"#V".d}[0], ["#ADDR"], "#INC" \n\t" \
 " st1   {v"#V".d}[1], ["#ADDR"], "#INC" \n\t"
diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c
new file mode 100644
index 000000000..b0df23fb0
--- /dev/null
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c
@@ -0,0 +1,605 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2021, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+#include "blis.h"
+
+// Label locality & misc.
+#include "armv8a_asm_utils.h"
+
+// Nanokernel operations.
+#include "armv8a_asm_d2x2.h"
+
+/* Order of row-major SGEMM_12x8's execution in 4x4 blocks:
+ *
+ * +---+ +---+ 
+ * | 0 | | 1 | 
+ * +---+ +---+ 
+ * +---+ +---+ 
+ * | 2 | | 3 | 
+ * +---+ +---+ 
+ * +---+ +---+ 
+ * | 4 | | 5 | 
+ * +---+ +---+ 
+ */
+#define SGEMM_12X8_MKER_LOOP_PLAIN(C00,C01,C10,C11,C20,C21,C30,C31,C40,C41,C50,C51,C60,C61,C70,C71,C80,C81,C90,C91,CA0,CA1,CB0,CB1,A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \
+  SGEMM_4X4_NANOKERNEL(C00,C10,C20,C30,B0,A0) \
+  SGEMM_4X4_NANOKERNEL(C01,C11,C21,C31,B1,A0) \
+  DGEMM_LOAD1V_ ##LOADNEXT (A0,AADDR,ASHIFT) /* Contiguous load is the same across S/D. */ \
+  SGEMM_4X4_NANOKERNEL(C40,C50,C60,C70,B0,A1) \
+  SGEMM_4X4_NANOKERNEL(C41,C51,C61,C71,B1,A1) \
+  DGEMM_LOAD1V_ ##LOADNEXT (A1,AADDR,ASHIFT+16) \
+  SGEMM_4X4_NANOKERNEL(C80,C90,CA0,CB0,B0,A2) \
+  DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \
+  SGEMM_4X4_NANOKERNEL(C81,C91,CA1,CB1,B1,A2)
+
+// For contiguous storage of C, SLOAD is the same as DLOAD.
+#define SLOADC_2V_R_FWD(C0,C1,CADDR,CSHIFT,RSC) \
+  DLOAD2V(C0,C1,CADDR,CSHIFT) \
+" add  "#CADDR", "#CADDR", "#RSC" \n\t"
+#define SSTOREC_2V_R_FWD(C0,C1,CADDR,CSHIFT,RSC) \
+  DSTORE2V(C0,C1,CADDR,CSHIFT) \
+" add  "#CADDR", "#CADDR", "#RSC" \n\t"
+
+/* Order of row-major DGEMM_8x6's execution in 2x2 blocks:
+ *
+ * +---+ +---+ +---+
+ * | 0 | | 2 | | 4 |
+ * +---+ +---+ +---+
+ * +---+ +---+ +---+
+ * | 1 | | 3 | | 5 |
+ * +---+ +---+ +---+
+ * +---+ +---+ +---+
+ * | 6 | | 8 | | 10|
+ * +---+ +---+ +---+
+ * +---+ +---+ +---+
+ * | 7 | | 9 | | 11|
+ * +---+ +---+ +---+
+ *
+ */
+#define DGEMM_8X6_MKER_LOOP_PLAIN(C00,C01,C02,C10,C11,C12,C20,C21,C22,C30,C31,C32,C40,C41,C42,C50,C51,C52,C60,C61,C62,C70,C71,C72,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \
+  DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \
+  DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \
+  DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \
+  DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \
+  DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \
+  DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \
+  DGEMM_LOAD2V_ ##LOADNEXT (A0,A1,AADDR,ASHIFT) \
+  DGEMM_2X2_NANOKERNEL(C40,C50,B0,A2) \
+  DGEMM_2X2_NANOKERNEL(C60,C70,B0,A3) \
+  DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \
+  DGEMM_2X2_NANOKERNEL(C41,C51,B1,A2) \
+  DGEMM_2X2_NANOKERNEL(C61,C71,B1,A3) \
+  DGEMM_LOAD1V_ ##LOADNEXT (B1,BADDR,BSHIFT+16) \
+  DGEMM_2X2_NANOKERNEL(C42,C52,B2,A2) \
+  DGEMM_2X2_NANOKERNEL(C62,C72,B2,A3)
+
+// Interleaving load or not.
+#define DGEMM_LOAD1V_noload(V1,ADDR,IMM)
+#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \
+  DLOAD1V(V1,ADDR,IMM)
+
+#define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM)
+#define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \
+  DGEMM_LOAD1V_load(V1,ADDR,IMM) \
+  DGEMM_LOAD1V_load(V2,ADDR,IMM+16)
+
+// For contiguous storage of C.
+#define DLOADC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \
+  DLOAD2V(C0,C1,CADDR,CSHIFT) \
+  DLOAD1V(C2,CADDR,CSHIFT+32) \
+" add  "#CADDR", "#CADDR", "#RSC" \n\t"
+#define DSTOREC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \
+  DSTORE2V(C0,C1,CADDR,CSHIFT) \
+  DSTORE1V(C2,CADDR,CSHIFT+32) \
+" add  "#CADDR", "#CADDR", "#RSC" \n\t"
+
+// Prefetch C.
+#define PRFMC_FWD(CADDR,RSC,LASTB) \
+" prfm PLDL1KEEP, ["#CADDR"]           \n\t" \
+" prfm PLDL1KEEP, ["#CADDR", "#LASTB"] \n\t" \
+" add  "#CADDR", "#CADDR", "#RSC"      \n\t"
+
+void bli_sgemm_armv8a_asm_12x8r
+     (
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
+       float*     restrict alpha,
+       float*     restrict a,
+       float*     restrict b,
+       float*     restrict beta,
+       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t*          data,
+       cntx_t*             cntx
+     )
+{
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
+
+  // Typecast local copies of integers in case dim_t and inc_t are a
+  // different size than is expected by load instructions.
+  uint64_t k_mker = k / 4;
+  uint64_t k_left = k % 4;
+  uint64_t rs_c   = rs_c0;
+  uint64_t cs_c   = cs_c0;
+
+  GEMM_UKR_SETUP_CT( s, 12, 8, true );
+
+  __asm__ volatile
+  (
+" ldr             x0, %[a]                        \n\t"
+" ldr             x1, %[b]                        \n\t"
+" mov             x2, #12                         \n\t" // Column-skip of A.
+" mov             x3, #8                          \n\t" // Row-skip of B.
+"                                                 \n\t"
+" ldr             x5, %[c]                        \n\t"
+" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C. (column-skip == 1)
+"                                                 \n\t"
+"                                                 \n\t" // Multiply some address skips by sizeof(float).
+" lsl             x2, x2, #2                      \n\t" // cs_a
+" lsl             x3, x3, #2                      \n\t" // rs_b
+" lsl             x6, x6, #2                      \n\t" // rs_c
+"                                                 \n\t"
+" cmp             %w[ct], wzr                     \n\t"
+" mov             x9, x5                          \n\t"
+BNE(SEND_PRFMC_FH)
+PRFMC_FWD(x9,x6,32) // Prefetch C 01/12.
+PRFMC_FWD(x9,x6,32) // Prefetch C 02/12.
+PRFMC_FWD(x9,x6,32) // Prefetch C 03/12.
+PRFMC_FWD(x9,x6,32) // Prefetch C 04/12.
+PRFMC_FWD(x9,x6,32) // Prefetch C 05/12.
+PRFMC_FWD(x9,x6,32) // Prefetch C 06/12.
+LABEL(SEND_PRFMC_FH)
+"                                                 \n\t"
+" ldr             x4, %[k_mker]                   \n\t" // Number of loops.
+" ldr             x8, %[k_left]                   \n\t"
+"                                                 \n\t"
+// Storage scheme:
+//  V[ 0:23] <- C
+//  V[24:27] <- A
+//  V[28:31] <- B
+// Under this scheme, the following is defined:
+#define SGEMM_12X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \
+  SGEMM_12X8_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT)
+// Load from memory.
+LABEL(SLOAD_ABC)
+"                                                 \n\t" // No-microkernel early return is a must
+" cmp             x4, #0                          \n\t" //  to avoid out-of-boundary read.
+BEQ(SCLEAR_CCOLS)
+"                                                 \n\t"
+" ldr             q24, [x0, #16*0]                \n\t" // Load A.
+" ldr             q25, [x0, #16*1]                \n\t"
+" ldr             q26, [x0, #16*2]                \n\t"
+" add             x0, x0, x2                      \n\t"
+" ldr             q27, [x0, #16*0]                \n\t"
+"                                                 \n\t"
+" cmp             %w[ct], wzr                     \n\t"
+BNE(SEND_PRFMC_LH)
+PRFMC_FWD(x9,x6,32) // Prefetch C 07/12.
+PRFMC_FWD(x9,x6,32) // Prefetch C 08/12.
+PRFMC_FWD(x9,x6,32) // Prefetch C 09/12.
+PRFMC_FWD(x9,x6,32) // Prefetch C 10/12.
+PRFMC_FWD(x9,x6,32) // Prefetch C 11/12.
+PRFMC_FWD(x9,x6,32) // Prefetch C 12/12.
+LABEL(SEND_PRFMC_LH)
+" cmp             x4, #0                          \n\t" // Reset branching flag.
+"                                                 \n\t"
+" ldr             q28, [x1, #16*0]                \n\t" // Load B.
+" ldr             q29, [x1, #16*1]                \n\t"
+" add             x1, x1, x3                      \n\t"
+" ldr             q30, [x1, #16*0]                \n\t"
+" ldr             q31, [x1, #16*1]                \n\t"
+" add             x1, x1, x3                      \n\t"
+LABEL(SCLEAR_CCOLS)
+CLEAR8V(0,1,2,3,4,5,6,7)
+CLEAR8V(8,9,10,11,12,13,14,15)
+CLEAR8V(16,17,18,19,20,21,22,23)
+// No-microkernel early return, once again.
+BEQ(SK_LEFT_LOOP)
+//
+// Microkernel is defined here as:
+#define SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1) \
+  SGEMM_12X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,x0,16,x1,0,load) \
+ "add             x0, x0, x2                      \n\t" \
+ "ldr             q"#A2", [x0, #16*0]             \n\t" \
+ "ldr             q"#B1", [x1, #16*1]             \n\t" \
+ "add             x1, x1, x3                      \n\t"
+// Start microkernel loop.
+LABEL(SK_MKER_LOOP)
+SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,28,29)
+SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(27,24,25,30,31)
+"                                                 \n\t" // Decrease counter before final replica.
+" subs            x4, x4, #1                      \n\t" // Branch early to avoid reading excess mem.
+BEQ(SFIN_MKER_LOOP)
+SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(26,27,24,28,29)
+SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(25,26,27,30,31)
+BRANCH(SK_MKER_LOOP)
+//
+// Final microkernel loop.
+LABEL(SFIN_MKER_LOOP)
+SGEMM_12X8_MKER_LOOP_PLAIN_LOC(26,27,24,28,29,xzr,-1,xzr,-1,noload)
+" ldr             q26, [x0, #16*1]                \n\t"
+" ldr             q27, [x0, #16*2]                \n\t"
+" add             x0, x0, x2                      \n\t"
+SGEMM_12X8_MKER_LOOP_PLAIN_LOC(25,26,27,30,31,xzr,-1,xzr,-1,noload)
+//
+// Loops left behind microkernels.
+LABEL(SK_LEFT_LOOP)
+" cmp             x8, #0                          \n\t" // End of exec.
+BEQ(SWRITE_MEM_PREP)
+" ldr             q24, [x0, #16*0]                \n\t" // Load A col.
+" ldr             q25, [x0, #16*1]                \n\t"
+" ldr             q26, [x0, #16*2]                \n\t"
+" add             x0, x0, x2                      \n\t"
+" ldr             q28, [x1, #16*0]                \n\t" // Load B row.
+" ldr             q29, [x1, #16*1]                \n\t"
+" add             x1, x1, x3                      \n\t"
+" sub             x8, x8, #1                      \n\t"
+SGEMM_12X8_MKER_LOOP_PLAIN_LOC(24,25,26,28,29,xzr,-1,xzr,-1,noload)
+BRANCH(SK_LEFT_LOOP)
+//
+// Scale and write to memory.
+LABEL(SWRITE_MEM_PREP)
+" ldr             x4, %[alpha]                    \n\t" // Load alpha & beta (address).
+" ldr             x8, %[beta]                     \n\t"
+" ld1r            {v24.4s}, [x4]                  \n\t" // Load alpha & beta.
+" ld1r            {v25.4s}, [x8]                  \n\t"
+"                                                 \n\t"
+LABEL(SPREFETCH_ABNEXT)
+" ldr             x0, %[a_next]                   \n\t"
+" ldr             x1, %[b_next]                   \n\t"
+" prfm            PLDL1STRM, [x0, 64*0]           \n\t" // Do not know cache line size,
+" prfm            PLDL1STRM, [x0, 64*1]           \n\t" //  issue some number of prfm instructions
+" prfm            PLDL1STRM, [x0, 64*2]           \n\t" //  to try to activate hardware prefetcher.
+" prfm            PLDL1STRM, [x1, 64*0]           \n\t"
+" prfm            PLDL1STRM, [x1, 64*1]           \n\t"
+" prfm            PLDL1STRM, [x1, 64*3]           \n\t"
+"                                                 \n\t"
+" fmov            d26, #1.0                       \n\t"
+" fcvt            s26, d26                        \n\t"
+" fcmp            s24, s26                        \n\t"
+BEQ(SUNIT_ALPHA)
+SSCALE8V(0,1,2,3,4,5,6,7,24,0)
+SSCALE8V(8,9,10,11,12,13,14,15,24,0)
+SSCALE8V(16,17,18,19,20,21,22,23,24,0)
+LABEL(SUNIT_ALPHA)
+"                                                 \n\t"
+" mov             x9, x5                          \n\t" // C address for loading.
+"                                                 \n\t" // C address for storing is x5 itself.
+//
+// Contiguous C-storage.
+LABEL(SWRITE_MEM_R)
+" fcmp            s25, #0.0                       \n\t" // Sets conditional flag whether *beta == 0.
+"                                                 \n\t" // This conditional flag will be used
+"                                                 \n\t" //  multiple times for skipping load.
+// Row 0 & 1 & 2:
+BEQ(SZERO_BETA_R_0_1_2)
+SLOADC_2V_R_FWD(26,27,x9,0,x6)
+SLOADC_2V_R_FWD(28,29,x9,0,x6)
+SLOADC_2V_R_FWD(30,31,x9,0,x6)
+SSCALEA2V(0,1,26,27,25,0)
+SSCALEA2V(2,3,28,29,25,0)
+SSCALEA2V(4,5,30,31,25,0)
+LABEL(SZERO_BETA_R_0_1_2)
+SSTOREC_2V_R_FWD(0,1,x5,0,x6)
+SSTOREC_2V_R_FWD(2,3,x5,0,x6)
+SSTOREC_2V_R_FWD(4,5,x5,0,x6)
+// Row 3 & 4 & 5 & 6 & 7 & 8:
+BEQ(SZERO_BETA_R_3_4_5_6_7_8)
+SLOADC_2V_R_FWD(26,27,x9,0,x6)
+SLOADC_2V_R_FWD(28,29,x9,0,x6)
+SLOADC_2V_R_FWD(30,31,x9,0,x6)
+SLOADC_2V_R_FWD(0,1,x9,0,x6)
+SLOADC_2V_R_FWD(2,3,x9,0,x6)
+SLOADC_2V_R_FWD(4,5,x9,0,x6)
+SSCALEA4V(6,7,8,9,26,27,28,29,25,0)
+SSCALEA4V(10,11,12,13,30,31,0,1,25,0)
+SSCALEA4V(14,15,16,17,2,3,4,5,25,0)
+LABEL(SZERO_BETA_R_3_4_5_6_7_8)
+SSTOREC_2V_R_FWD(6,7,x5,0,x6)
+SSTOREC_2V_R_FWD(8,9,x5,0,x6)
+SSTOREC_2V_R_FWD(10,11,x5,0,x6)
+SSTOREC_2V_R_FWD(12,13,x5,0,x6)
+SSTOREC_2V_R_FWD(14,15,x5,0,x6)
+SSTOREC_2V_R_FWD(16,17,x5,0,x6)
+// Row 9 & 10 & 11
+BEQ(SZERO_BETA_R_9_10_11)
+SLOADC_2V_R_FWD(26,27,x9,0,x6)
+SLOADC_2V_R_FWD(28,29,x9,0,x6)
+SLOADC_2V_R_FWD(30,31,x9,0,x6)
+SSCALEA2V(18,19,26,27,25,0)
+SSCALEA2V(20,21,28,29,25,0)
+SSCALEA2V(22,23,30,31,25,0)
+LABEL(SZERO_BETA_R_9_10_11)
+SSTOREC_2V_R_FWD(18,19,x5,0,x6)
+SSTOREC_2V_R_FWD(20,21,x5,0,x6)
+SSTOREC_2V_R_FWD(22,23,x5,0,x6)
+// Done.
+LABEL(SEND_WRITE_MEM)
+:
+: [a]      "m" (a),
+  [b]      "m" (b),
+  [c]      "m" (c),
+  [rs_c]   "m" (rs_c),
+  [k_mker] "m" (k_mker),
+  [k_left] "m" (k_left),
+  [alpha]  "m" (alpha),
+  [beta]   "m" (beta),
+  [a_next] "m" (a_next),
+  [b_next] "m" (b_next),
+  [ct]     "r" (_use_ct) // Defined by macro.
+: "x0","x1","x2","x3","x4","x5","x6","x7","x8","x9",
+  "v0","v1","v2","v3","v4","v5","v6","v7",
+  "v8","v9","v10","v11","v12","v13","v14","v15",
+  "v16","v17","v18","v19",
+  "v20","v21","v22","v23",
+  "v24","v25","v26","v27",
+  "v28","v29","v30","v31"
+  );
+
+  GEMM_UKR_FLUSH_CT( s );
+}
+
+/*
+ * Differences from the col-major 6x8 in HW modeling:
+ * * Stream HW prefetcher is assumed s.t. PRFM instructions for packed A&B are omitted.
+ */
+void bli_dgemm_armv8a_asm_8x6r
+     (
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
+       double*    restrict alpha,
+       double*    restrict a,
+       double*    restrict b,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t*          data,
+       cntx_t*             cntx
+     )
+{
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
+
+  // Typecast local copies of integers in case dim_t and inc_t are a
+  // different size than is expected by load instructions.
+  uint64_t k_mker = k / 4;
+  uint64_t k_left = k % 4;
+  uint64_t rs_c   = rs_c0;
+  uint64_t cs_c   = cs_c0;
+
+  GEMM_UKR_SETUP_CT( d, 8, 6, true );
+
+  __asm__ volatile
+  (
+" ldr             x0, %[a]                        \n\t"
+" ldr             x1, %[b]                        \n\t"
+" mov             x2, #8                          \n\t" // Column-skip of A.
+" mov             x3, #6                          \n\t" // Row-skip of B.
+"                                                 \n\t"
+" ldr             x5, %[c]                        \n\t"
+" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C. (column-skip == 1)
+"                                                 \n\t"
+"                                                 \n\t" // Multiply some address skips by sizeof(double).
+" lsl             x2, x2, #3                      \n\t" // cs_a
+" lsl             x3, x3, #3                      \n\t" // rs_b
+" lsl             x6, x6, #3                      \n\t" // rs_c
+"                                                 \n\t"
+" cmp             %w[ct], wzr                     \n\t"
+" mov             x9, x5                          \n\t"
+BNE(DEND_PRFMC)
+PRFMC_FWD(x9,x6,40) // Prefetch C 1/8.
+PRFMC_FWD(x9,x6,40) // Prefetch C 2/8.
+PRFMC_FWD(x9,x6,40) // Prefetch C 3/8.
+PRFMC_FWD(x9,x6,40) // Prefetch C 4/8.
+PRFMC_FWD(x9,x6,40) // Prefetch C 5/8.
+PRFMC_FWD(x9,x6,40) // Prefetch C 6/8.
+PRFMC_FWD(x9,x6,40) // Prefetch C 7/8.
+PRFMC_FWD(x9,x6,40) // Prefetch C 8/8.
+LABEL(DEND_PRFMC)
+"                                                 \n\t"
+" ldr             x4, %[k_mker]                   \n\t" // Number of loops.
+" ldr             x8, %[k_left]                   \n\t"
+"                                                 \n\t"
+// Storage scheme:
+//  V[ 0:23] <- C
+//  V[24:27] <- A
+//  V[28:31] <- B
+// Under this scheme, the following is defined:
+#define DGEMM_8X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \
+  DGEMM_8X6_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT)
+// Load from memory.
+LABEL(DLOAD_ABC)
+"                                                 \n\t" // No-microkernel early return is a must
+" cmp             x4, #0                          \n\t" //  to avoid out-of-boundary read.
+BEQ(DCLEAR_CCOLS)
+"                                                 \n\t"
+" ldr             q24, [x0, #16*0]                \n\t" // Load A.
+" ldr             q25, [x0, #16*1]                \n\t"
+" ldr             q26, [x0, #16*2]                \n\t"
+" ldr             q27, [x0, #16*3]                \n\t"
+" add             x0, x0, x2                      \n\t"
+"                                                 \n\t"
+" ldr             q28, [x1, #16*0]                \n\t" // Load B.
+" ldr             q29, [x1, #16*1]                \n\t"
+" ldr             q30, [x1, #16*2]                \n\t"
+" add             x1, x1, x3                      \n\t"
+" ldr             q31, [x1, #16*0]                \n\t"
+LABEL(DCLEAR_CCOLS)
+CLEAR8V(0,1,2,3,4,5,6,7)
+CLEAR8V(8,9,10,11,12,13,14,15)
+CLEAR8V(16,17,18,19,20,21,22,23)
+// No-microkernel early return, once again.
+BEQ(DK_LEFT_LOOP)
+//
+// Microkernel is defined here as:
+#define DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,A3,B0,B1,B2) \
+  DGEMM_8X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,B2,x0,0,x1,16,load) \
+ "add             x1, x1, x3                      \n\t" \
+ "ldr             q"#B2", [x1, #16*0]             \n\t" \
+ "ldr             q"#A2", [x0, #16*2]             \n\t" \
+ "ldr             q"#A3", [x0, #16*3]             \n\t" \
+ "add             x0, x0, x2                      \n\t"
+// Start microkernel loop.
+LABEL(DK_MKER_LOOP)
+DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,28,29,30)
+DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,31,28,29)
+"                                                 \n\t" // Decrease counter before final replica.
+" subs            x4, x4, #1                      \n\t" // Branch early to avoid reading excess mem.
+BEQ(DFIN_MKER_LOOP)
+DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,30,31,28)
+DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,29,30,31)
+BRANCH(DK_MKER_LOOP)
+//
+// Final microkernel loop.
+LABEL(DFIN_MKER_LOOP)
+DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,30,31,28,x0,0,x1,16,load)
+" add             x1, x1, x3                      \n\t"
+" ldr             q26, [x0, #16*2]                \n\t"
+" ldr             q27, [x0, #16*3]                \n\t"
+" add             x0, x0, x2                      \n\t"
+DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,29,30,31,xzr,-1,xzr,-1,noload)
+//
+// Leftover k iterations that the microkernel loop did not cover.
+LABEL(DK_LEFT_LOOP)
+" cmp             x8, #0                          \n\t" // End of exec.
+BEQ(DWRITE_MEM_PREP)
+" ldr             q24, [x0, #16*0]                \n\t" // Load A col.
+" ldr             q25, [x0, #16*1]                \n\t"
+" ldr             q26, [x0, #16*2]                \n\t"
+" ldr             q27, [x0, #16*3]                \n\t"
+" add             x0, x0, x2                      \n\t"
+" ldr             q28, [x1, #16*0]                \n\t" // Load B row.
+" ldr             q29, [x1, #16*1]                \n\t"
+" ldr             q30, [x1, #16*2]                \n\t"
+" add             x1, x1, x3                      \n\t"
+" sub             x8, x8, #1                      \n\t"
+DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,28,29,30,xzr,-1,xzr,-1,noload)
+BRANCH(DK_LEFT_LOOP)
+//
+// Scale and write to memory.
+LABEL(DWRITE_MEM_PREP)
+" ldr             x4, %[alpha]                    \n\t" // Load alpha & beta (address).
+" ldr             x8, %[beta]                     \n\t"
+" ld1r            {v24.2d}, [x4]                  \n\t" // Load alpha & beta.
+" ld1r            {v25.2d}, [x8]                  \n\t"
+"                                                 \n\t"
+LABEL(DPREFETCH_ABNEXT)
+" ldr             x0, %[a_next]                   \n\t"
+" ldr             x1, %[b_next]                   \n\t"
+" prfm            PLDL1STRM, [x0, 64*0]           \n\t" // Do not know cache line size,
+" prfm            PLDL1STRM, [x0, 64*1]           \n\t" //  issue some number of prfm instructions
+" prfm            PLDL1STRM, [x0, 64*2]           \n\t" //  to try to activate hardware prefetcher.
+" prfm            PLDL1STRM, [x1, 64*0]           \n\t"
+" prfm            PLDL1STRM, [x1, 64*1]           \n\t"
+" prfm            PLDL1STRM, [x1, 64*3]           \n\t"
+"                                                 \n\t"
+" fmov            d26, #1.0                       \n\t"
+" fcmp            d24, d26                        \n\t"
+BEQ(DUNIT_ALPHA)
+DSCALE8V(0,1,2,3,4,5,6,7,24,0)
+DSCALE8V(8,9,10,11,12,13,14,15,24,0)
+DSCALE8V(16,17,18,19,20,21,22,23,24,0)
+LABEL(DUNIT_ALPHA)
+"                                                 \n\t"
+" mov             x9, x5                          \n\t" // C address for loading.
+"                                                 \n\t" // C address for storing is x5 itself.
+//
+// Contiguous C-storage.
+LABEL(DWRITE_MEM_R)
+" fcmp            d25, #0.0                       \n\t" // Sets conditional flag whether *beta == 0.
+"                                                 \n\t" // This conditional flag will be used
+"                                                 \n\t" //  multiple times for skipping load.
+// Row 0 & 1:
+BEQ(DZERO_BETA_R_0_1)
+DLOADC_3V_R_FWD(26,27,28,x9,0,x6)
+DLOADC_3V_R_FWD(29,30,31,x9,0,x6)
+DSCALEA2V(0,1,26,27,25,0)
+DSCALEA2V(2,3,28,29,25,0)
+DSCALEA2V(4,5,30,31,25,0)
+LABEL(DZERO_BETA_R_0_1)
+DSTOREC_3V_R_FWD(0,1,2,x5,0,x6)
+DSTOREC_3V_R_FWD(3,4,5,x5,0,x6)
+// Row 2 & 3 & 4 & 5:
+BEQ(DZERO_BETA_R_2_3_4_5)
+DLOADC_3V_R_FWD(26,27,28,x9,0,x6)
+DLOADC_3V_R_FWD(29,30,31,x9,0,x6)
+DLOADC_3V_R_FWD(0,1,2,x9,0,x6)
+DLOADC_3V_R_FWD(3,4,5,x9,0,x6)
+DSCALEA4V(6,7,8,9,26,27,28,29,25,0)
+DSCALEA4V(10,11,12,13,30,31,0,1,25,0)
+DSCALEA4V(14,15,16,17,2,3,4,5,25,0)
+LABEL(DZERO_BETA_R_2_3_4_5)
+DSTOREC_3V_R_FWD(6,7,8,x5,0,x6)
+DSTOREC_3V_R_FWD(9,10,11,x5,0,x6)
+DSTOREC_3V_R_FWD(12,13,14,x5,0,x6)
+DSTOREC_3V_R_FWD(15,16,17,x5,0,x6)
+// Row 6 & 7
+BEQ(DZERO_BETA_R_6_7)
+DLOADC_3V_R_FWD(26,27,28,x9,0,x6)
+DLOADC_3V_R_FWD(29,30,31,x9,0,x6)
+DSCALEA2V(18,19,26,27,25,0)
+DSCALEA2V(20,21,28,29,25,0)
+DSCALEA2V(22,23,30,31,25,0)
+LABEL(DZERO_BETA_R_6_7)
+DSTOREC_3V_R_FWD(18,19,20,x5,0,x6)
+DSTOREC_3V_R_FWD(21,22,23,x5,0,x6)
+// Done.
+LABEL(DEND_WRITE_MEM)
+:
+: [a]      "m" (a),
+  [b]      "m" (b),
+  [c]      "m" (c),
+  [rs_c]   "m" (rs_c),
+  [k_mker] "m" (k_mker),
+  [k_left] "m" (k_left),
+  [alpha]  "m" (alpha),
+  [beta]   "m" (beta),
+  [a_next] "m" (a_next),
+  [b_next] "m" (b_next),
+  [ct]     "r" (_use_ct) // Defined by macro.
+: "x0","x1","x2","x3","x4","x5","x6","x7","x8","x9",
+  "v0","v1","v2","v3","v4","v5","v6","v7",
+  "v8","v9","v10","v11","v12","v13","v14","v15",
+  "v16","v17","v18","v19",
+  "v20","v21","v22","v23",
+  "v24","v25","v26","v27",
+  "v28","v29","v30","v31"
+  );
+
+  GEMM_UKR_FLUSH_CT( d );
+}
+
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c b/kernels/armv8a/3/old/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c
similarity index 100%
rename from kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c
rename to kernels/armv8a/3/old/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c b/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c
deleted file mode 100644
index 44e0ac419..000000000
--- a/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c
+++ /dev/null
@@ -1,450 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-// Separate instantiation for Armv8-A reference kernels.
-// Temporary workaround. Will be removed after upstream has switched to a better way
-//  of exposing gemmsup interface.
-
-//
-// -- Row storage case ---------------------------------------------------------
-//
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t              conja, \
-       conj_t              conjb, \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
-       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
-     ) \
-{ \
-	/* NOTE: This microkernel can actually handle arbitrarily large
-       values of m, n, and k. */ \
-\
-	if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
-	{ \
-		/* Traverse c by rows. */ \
-		for ( dim_t i = 0; i < m; ++i ) \
-		{ \
-			ctype* restrict ci = &c[ i*rs_c ]; \
-			ctype* restrict ai = &a[ i*rs_a ]; \
-\
-			for ( dim_t j = 0; j < n; ++j ) \
-			{ \
-				ctype* restrict cij = &ci[ j*cs_c ]; \
-				ctype* restrict bj  = &b [ j*cs_b ]; \
-				ctype           ab; \
-\
-				PASTEMAC(ch,set0s)( ab ); \
-\
-				/* Perform a dot product to update the (i,j) element of c. */ \
-				for ( dim_t l = 0; l < k; ++l ) \
-				{ \
-					ctype* restrict aij = &ai[ l*cs_a ]; \
-					ctype* restrict bij = &bj[ l*rs_b ]; \
-\
-					PASTEMAC(ch,dots)( *aij, *bij, ab ); \
-				} \
-\
-				/* If beta is one, add ab into c. If beta is zero, overwrite c
-				   with the result in ab. Otherwise, scale by beta and accumulate
-				   ab to c. */ \
-				if ( PASTEMAC(ch,eq1)( *beta ) ) \
-				{ \
-					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
-				} \
-				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
-				{ \
-					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
-				} \
-				else \
-				{ \
-					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
-				} \
-			} \
-		} \
-	} \
-	else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
-	{ \
-		/* Traverse c by rows. */ \
-		for ( dim_t i = 0; i < m; ++i ) \
-		{ \
-			ctype* restrict ci = &c[ i*rs_c ]; \
-			ctype* restrict ai = &a[ i*rs_a ]; \
-\
-			for ( dim_t j = 0; j < n; ++j ) \
-			{ \
-				ctype* restrict cij = &ci[ j*cs_c ]; \
-				ctype* restrict bj  = &b [ j*cs_b ]; \
-				ctype           ab; \
-\
-				PASTEMAC(ch,set0s)( ab ); \
-\
-				/* Perform a dot product to update the (i,j) element of c. */ \
-				for ( dim_t l = 0; l < k; ++l ) \
-				{ \
-					ctype* restrict aij = &ai[ l*cs_a ]; \
-					ctype* restrict bij = &bj[ l*rs_b ]; \
-\
-					PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
-				} \
-\
-				/* If beta is one, add ab into c. If beta is zero, overwrite c
-				   with the result in ab. Otherwise, scale by beta and accumulate
-				   ab to c. */ \
-				if ( PASTEMAC(ch,eq1)( *beta ) ) \
-				{ \
-					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
-				} \
-				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
-				{ \
-					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
-				} \
-				else \
-				{ \
-					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
-				} \
-			} \
-		} \
-	} \
-	else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
-	{ \
-		/* Traverse c by rows. */ \
-		for ( dim_t i = 0; i < m; ++i ) \
-		{ \
-			ctype* restrict ci = &c[ i*rs_c ]; \
-			ctype* restrict ai = &a[ i*rs_a ]; \
-\
-			for ( dim_t j = 0; j < n; ++j ) \
-			{ \
-				ctype* restrict cij = &ci[ j*cs_c ]; \
-				ctype* restrict bj  = &b [ j*cs_b ]; \
-				ctype           ab; \
-\
-				PASTEMAC(ch,set0s)( ab ); \
-\
-				/* Perform a dot product to update the (i,j) element of c. */ \
-				for ( dim_t l = 0; l < k; ++l ) \
-				{ \
-					ctype* restrict aij = &ai[ l*cs_a ]; \
-					ctype* restrict bij = &bj[ l*rs_b ]; \
-\
-					PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
-				} \
-\
-				/* If beta is one, add ab into c. If beta is zero, overwrite c
-				   with the result in ab. Otherwise, scale by beta and accumulate
-				   ab to c. */ \
-				if ( PASTEMAC(ch,eq1)( *beta ) ) \
-				{ \
-					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
-				} \
-				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
-				{ \
-					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
-				} \
-				else \
-				{ \
-					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
-	{ \
-		/* Traverse c by rows. */ \
-		for ( dim_t i = 0; i < m; ++i ) \
-		{ \
-			ctype* restrict ci = &c[ i*rs_c ]; \
-			ctype* restrict ai = &a[ i*rs_a ]; \
-\
-			for ( dim_t j = 0; j < n; ++j ) \
-			{ \
-				ctype* restrict cij = &ci[ j*cs_c ]; \
-				ctype* restrict bj  = &b [ j*cs_b ]; \
-				ctype           ab; \
-\
-				PASTEMAC(ch,set0s)( ab ); \
-\
-				/* Perform a dot product to update the (i,j) element of c. */ \
-				for ( dim_t l = 0; l < k; ++l ) \
-				{ \
-					ctype* restrict aij = &ai[ l*cs_a ]; \
-					ctype* restrict bij = &bj[ l*rs_b ]; \
-\
-					PASTEMAC(ch,dots)( *aij, *bij, ab ); \
-				} \
-\
-				/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
-				PASTEMAC(ch,conjs)( ab ); \
-\
-				/* If beta is one, add ab into c. If beta is zero, overwrite c
-				   with the result in ab. Otherwise, scale by beta and accumulate
-				   ab to c. */ \
-				if ( PASTEMAC(ch,eq1)( *beta ) ) \
-				{ \
-					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
-				} \
-				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
-				{ \
-					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
-				} \
-				else \
-				{ \
-					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC2( gemmsup_r, _armv8a, _ref2 )
-
-//
-// -- Column storage case ------------------------------------------------------
-//
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t              conja, \
-       conj_t              conjb, \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
-       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
-     ) \
-{ \
-	/* NOTE: This microkernel can actually handle arbitrarily large
-       values of m, n, and k. */ \
-\
-	if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
-	{ \
-		/* Traverse c by columns. */ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			ctype* restrict cj = &c[ j*cs_c ]; \
-			ctype* restrict bj = &b[ j*cs_b ]; \
-\
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				ctype* restrict cij = &cj[ i*rs_c ]; \
-				ctype* restrict ai  = &a [ i*rs_a ]; \
-				ctype           ab; \
-\
-				PASTEMAC(ch,set0s)( ab ); \
-\
-				/* Perform a dot product to update the (i,j) element of c. */ \
-				for ( dim_t l = 0; l < k; ++l ) \
-				{ \
-					ctype* restrict aij = &ai[ l*cs_a ]; \
-					ctype* restrict bij = &bj[ l*rs_b ]; \
-\
-					PASTEMAC(ch,dots)( *aij, *bij, ab ); \
-				} \
-\
-				/* If beta is one, add ab into c. If beta is zero, overwrite c
-				   with the result in ab. Otherwise, scale by beta and accumulate
-				   ab to c. */ \
-				if ( PASTEMAC(ch,eq1)( *beta ) ) \
-				{ \
-					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
-				} \
-				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
-				{ \
-					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
-				} \
-				else \
-				{ \
-					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
-				} \
-			} \
-		} \
-	} \
-	else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
-	{ \
-		/* Traverse c by columns. */ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			ctype* restrict cj = &c[ j*cs_c ]; \
-			ctype* restrict bj = &b[ j*cs_b ]; \
-\
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				ctype* restrict cij = &cj[ i*rs_c ]; \
-				ctype* restrict ai  = &a [ i*rs_a ]; \
-				ctype           ab; \
-\
-				PASTEMAC(ch,set0s)( ab ); \
-\
-				/* Perform a dot product to update the (i,j) element of c. */ \
-				for ( dim_t l = 0; l < k; ++l ) \
-				{ \
-					ctype* restrict aij = &ai[ l*cs_a ]; \
-					ctype* restrict bij = &bj[ l*rs_b ]; \
-\
-					PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
-				} \
-\
-				/* If beta is one, add ab into c. If beta is zero, overwrite c
-				   with the result in ab. Otherwise, scale by beta and accumulate
-				   ab to c. */ \
-				if ( PASTEMAC(ch,eq1)( *beta ) ) \
-				{ \
-					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
-				} \
-				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
-				{ \
-					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
-				} \
-				else \
-				{ \
-					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
-				} \
-			} \
-		} \
-	} \
-	else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
-	{ \
-		/* Traverse c by columns. */ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			ctype* restrict cj = &c[ j*cs_c ]; \
-			ctype* restrict bj = &b[ j*cs_b ]; \
-\
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				ctype* restrict cij = &cj[ i*rs_c ]; \
-				ctype* restrict ai  = &a [ i*rs_a ]; \
-				ctype           ab; \
-\
-				PASTEMAC(ch,set0s)( ab ); \
-\
-				/* Perform a dot product to update the (i,j) element of c. */ \
-				for ( dim_t l = 0; l < k; ++l ) \
-				{ \
-					ctype* restrict aij = &ai[ l*cs_a ]; \
-					ctype* restrict bij = &bj[ l*rs_b ]; \
-\
-					PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
-				} \
-\
-				/* If beta is one, add ab into c. If beta is zero, overwrite c
-				   with the result in ab. Otherwise, scale by beta and accumulate
-				   ab to c. */ \
-				if ( PASTEMAC(ch,eq1)( *beta ) ) \
-				{ \
-					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
-				} \
-				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
-				{ \
-					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
-				} \
-				else \
-				{ \
-					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
-	{ \
-		/* Traverse c by columns. */ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			ctype* restrict cj = &c[ j*cs_c ]; \
-			ctype* restrict bj = &b[ j*cs_b ]; \
-\
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				ctype* restrict cij = &cj[ i*rs_c ]; \
-				ctype* restrict ai  = &a [ i*rs_a ]; \
-				ctype           ab; \
-\
-				PASTEMAC(ch,set0s)( ab ); \
-\
-				/* Perform a dot product to update the (i,j) element of c. */ \
-				for ( dim_t l = 0; l < k; ++l ) \
-				{ \
-					ctype* restrict aij = &ai[ l*cs_a ]; \
-					ctype* restrict bij = &bj[ l*rs_b ]; \
-\
-					PASTEMAC(ch,dots)( *aij, *bij, ab ); \
-				} \
-\
-				/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
-				PASTEMAC(ch,conjs)( ab ); \
-\
-				/* If beta is one, add ab into c. If beta is zero, overwrite c
-				   with the result in ab. Otherwise, scale by beta and accumulate
-				   ab to c. */ \
-				if ( PASTEMAC(ch,eq1)( *beta ) ) \
-				{ \
-					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
-				} \
-				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
-				{ \
-					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
-				} \
-				else \
-				{ \
-					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC2( gemmsup_c, _armv8a, _ref2 )
-
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c
index cade3ee05..847bfe8da 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c
@@ -37,7 +37,6 @@
 #include "blis.h"
 #include "assert.h"
 
-GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
 
 // Label locality & misc.
 #include "../armv8a_asm_utils.h"
@@ -109,6 +108,83 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
 " prfm PLDL1KEEP, ["#CADDR"]         \n\t" \
 " add  "#CADDR", "#CADDR", "#DLONGC" \n\t"
 
+// Row loop over a 4-column panel: steps a and c by 3*rs per iteration while b stays fixed.
+BLIS_INLINE
+void bli_dgemmsup_rd_armv8a_inline_3x4m
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t*          data,
+       cntx_t*             cntx
+     )
+{
+  assert( n0 == 4 ); // n0 is not used below; both kernel calls pass n = 4 explicitly.
+
+  for ( ; m0 >= 3; m0 -= 3 ) // Full 3-row steps use the asm_3x4 kernel.
+  {
+    bli_dgemmsup_rd_armv8a_asm_3x4
+    (
+      conja, conjb, 3, 4, k0,
+      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, c, rs_c0, cs_c0, data, cntx
+    );
+    a += 3 * rs_a0; // Move a and c forward by three row strides.
+    c += 3 * rs_c0;
+  }
+
+  if ( m0 > 0 ) // At most 2 rows can remain; they go to int_3x4 in a single call.
+  {
+    bli_dgemmsup_rd_armv8a_int_3x4
+    (
+      conja, conjb, m0, 4, k0,
+      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, c, rs_c0, cs_c0, data, cntx
+    );
+  }
+}
+// Row loop over an n0-column panel: every 3-row step, including the short last one, uses int_3x4.
+BLIS_INLINE
+void bli_dgemmsup_rd_armv8a_inline_3xcm
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t*          data,
+       cntx_t*             cntx
+     )
+{
+  for ( ; m0 > 0; m0 -= 3 )
+  {
+    dim_t m_loc = ( m0 < 3 ) ? m0 : 3; // Rows in this step: 3, or the remainder on the last pass.
+
+    bli_dgemmsup_rd_armv8a_int_3x4
+    (
+      conja, conjb, m_loc, n0, k0,
+      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, c, rs_c0, cs_c0, data, cntx
+    );
+
+    a += 3 * rs_a0; // Move a and c forward by three row strides; b is reused for every step.
+    c += 3 * rs_c0;
+  }
+}
+
+
 void bli_dgemmsup_rd_armv8a_asm_6x8m
      (
        conj_t              conja,
@@ -127,58 +203,74 @@ void bli_dgemmsup_rd_armv8a_asm_6x8m
 {
   if ( n0 != 8 )
   {
-    if ( n0 < 8 )
+    assert( n0 <= 13 );
+
+    // Manual separation: cover the n0 columns with up to three kernels whose widths (nr1..nr3) sum to n0.
+    dgemmsup_ker_ft ker_fp1 = NULL;
+    dgemmsup_ker_ft ker_fp2 = NULL;
+    dgemmsup_ker_ft ker_fp3 = NULL;
+    dim_t           nr1, nr2, nr3;
+
+    switch ( n0 )
     {
-      for ( ; n0 >= 4; n0 -= 4 )
-      {
-        dim_t m = m0;
-        double *a_loc = a;
-        double *c_loc = c;
-
-        for ( ; m >= 3; m -= 3 )
-        {
-          bli_dgemmsup_rd_armv8a_asm_3x4
-          (
-            conja, conjb, 3, 4, k0,
-            alpha, a_loc, rs_a0, cs_a0, b, rs_b0, cs_b0,
-            beta, c_loc, rs_c0, cs_c0, data, cntx
-          );
-          a_loc += 3 * rs_a0;
-          c_loc += 3 * rs_c0;
-        }
-
-        if ( m > 0 )
-        {
-          bli_dgemmsup_rd_armv8a_int_3x4
-          (
-            conja, conjb, m, 4, k0,
-            alpha, a_loc, rs_a0, cs_a0, b, rs_b0, cs_b0,
-            beta, c_loc, rs_c0, cs_c0, data, cntx
-          );
-        }
-        b += 4 * cs_b0;
-        c += 4 * cs_c0;
-      }
-
-      for ( ; m0 > 0; m0 -= 3 )
-      {
-        dim_t m_loc = ( m0 < 3 ) ? m0 : 3;
-
-        bli_dgemmsup_rd_armv8a_int_3x4
-        (
-          conja, conjb, m_loc, n0, k0,
-          alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-          beta, c, rs_c0, cs_c0, data, cntx
-        );
-
-        a += 3 * rs_a0;
-        c += 3 * rs_c0;
-      }
+      case 13:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function.
+        ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 3;
+        ker_fp3 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr3 = 2; break;
+      case 12:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function.
+        ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3x4m; nr2 = 4; break;
+      case 11:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function.
+        ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 3; break;
+      case 10:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function.
+        ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 2; break;
+      case 9:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function.
+        ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 1; break;
+      case 7:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x4m; nr1 = 4;
+        ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 3; break;
+      case 6:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x4m; nr1 = 4;
+        ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 2; break;
+      case 5:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr1 = 3;
+        ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 2; break;
+      case 4:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr1 = 4; break;
+      default:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr1 = n0; break;
     }
-    else
+
+    ker_fp1
+    (
+      conja, conjb, m0, nr1, k0,
+      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, c, rs_c0, cs_c0, data, cntx
+    );
+    b += nr1 * cs_b0;
+    c += nr1 * cs_c0;
+    if ( ker_fp2 )
     {
-      assert( FALSE );
+      ker_fp2
+      (
+        conja, conjb, m0, nr2, k0,
+        alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+        beta, c, rs_c0, cs_c0, data, cntx
+      );
+      b += nr2 * cs_b0;
+      c += nr2 * cs_c0;
     }
+    if ( ker_fp3 )
+      ker_fp3
+      (
+        conja, conjb, m0, nr3, k0,
+        alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+        beta, c, rs_c0, cs_c0, data, cntx
+      );
+
     return;
   }
 
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c
index 06c9ac32c..c4fb7cac6 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c
@@ -37,7 +37,6 @@
 #include "blis.h"
 #include "assert.h"
 
-GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
 
 // Label locality & misc.
 #include "../armv8a_asm_utils.h"
@@ -102,6 +101,122 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
 " prfm PLDL1KEEP, ["#CADDR"]         \n\t" \
 " add  "#CADDR", "#CADDR", "#DLONGC" \n\t"
 
+// Column loop for exactly four rows: steps b and c by 8*cs per iteration, two int_2x8 calls each.
+BLIS_INLINE
+void bli_dgemmsup_rd_armv8a_inline_4x8n
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t*          data,
+       cntx_t*             cntx
+     )
+{
+  assert( m0 == 4 ); // m0 is not used below; the two calls per step each pass m = 2.
+
+  for ( ; n0 > 0; n0 -= 8 )
+  {
+    // Call the 2x8 kernel twice on the same column step: once at a/c, once offset by two row strides.
+    dim_t n_loc = ( n0 < 8 ) ? n0 : 8; // Columns in this step: 8, or the remainder on the last pass.
+    bli_dgemmsup_rd_armv8a_int_2x8
+    (
+      conja, conjb, 2, n_loc, k0,
+      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, c, rs_c0, cs_c0, data, cntx
+    );
+    bli_dgemmsup_rd_armv8a_int_2x8
+    (
+      conja, conjb, 2, n_loc, k0,
+      alpha, a + 2 * rs_a0, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, c + 2 * rs_c0, rs_c0, cs_c0, data, cntx
+    );
+    b += 8 * cs_b0; // Move b and c forward by eight column strides.
+    c += 8 * cs_c0;
+  }
+}
+// Column loop for exactly three rows: asm_3x4 for each full 4-column step, int_3x4 for the rest.
+BLIS_INLINE
+void bli_dgemmsup_rd_armv8a_inline_3x8n
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t*          data,
+       cntx_t*             cntx
+     )
+{
+  assert( m0 == 3 ); // m0 is not used below; both kernel calls pass m = 3 explicitly.
+
+  for ( ; n0 >= 4; n0 -= 4 )
+  {
+    bli_dgemmsup_rd_armv8a_asm_3x4
+    (
+      conja, conjb, 3, 4, k0,
+      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, c, rs_c0, cs_c0, data, cntx
+    );
+    b += 4 * cs_b0; // Move b and c forward by four column strides.
+    c += 4 * cs_c0;
+  }
+  if ( n0 > 0 ) // At most 3 columns can remain; they go to int_3x4 in a single call.
+  {
+    bli_dgemmsup_rd_armv8a_int_3x4
+    (
+      conja, conjb, 3, n0, k0,
+      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, c, rs_c0, cs_c0, data, cntx
+    );
+  }
+}
+// Column loop for m0 <= 2 rows: each step of up to 8 columns is one int_2x8 call.
+BLIS_INLINE
+void bli_dgemmsup_rd_armv8a_inline_rx8n
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t*          data,
+       cntx_t*             cntx
+     )
+{
+  assert( m0 <= 2 ); // m0 is forwarded unchanged to int_2x8 on every step.
+
+  for ( ; n0 > 0; n0 -= 8 )
+  {
+    dim_t n_loc = ( n0 < 8 ) ? n0 : 8; // Columns in this step: 8, or the remainder on the last pass.
+    bli_dgemmsup_rd_armv8a_int_2x8
+    (
+      conja, conjb, m0, n_loc, k0,
+      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, c, rs_c0, cs_c0, data, cntx
+    );
+    b += 8 * cs_b0; // Move b and c forward by eight column strides.
+    c += 8 * cs_c0;
+  }
+}
+
+
 void bli_dgemmsup_rd_armv8a_asm_6x8n
      (
        conj_t              conja,
@@ -120,116 +235,51 @@ void bli_dgemmsup_rd_armv8a_asm_6x8n
 {
   if ( m0 != 6 )
   {
-    if ( m0 < 6 )
-    {
-      if ( m0 == 5 )
-      {
-        // 3xk calls.
-        dim_t n = n0;
-        double *b_loc = b;
-        double *c_loc = c;
-        for ( ; n >= 4; n -= 4 )
-        {
-          bli_dgemmsup_rd_armv8a_asm_3x4
-          (
-            conja, conjb, 3, 4, k0,
-            alpha, a, rs_a0, cs_a0, b_loc, rs_b0, cs_b0,
-            beta, c_loc, rs_c0, cs_c0, data, cntx
-          );
-          b_loc += 4 * cs_b0;
-          c_loc += 4 * cs_c0;
-        }
-        if ( n > 0 )
-        {
-          bli_dgemmsup_rd_armv8a_int_3x4
-          (
-            conja, conjb, 3, n, k0,
-            alpha, a, rs_a0, cs_a0, b_loc, rs_b0, cs_b0,
-            beta, c_loc, rs_c0, cs_c0, data, cntx
-          );
-        }
-        a += 3 * rs_a0;
-        c += 3 * rs_c0;
-
-        // 2xk calls.
-        for ( ; n0 > 0; n0 -= 8 )
-        {
-          dim_t n_loc = ( n0 < 8 ) ? n0 : 8;
-          bli_dgemmsup_rd_armv8a_int_2x8
-          (
-            conja, conjb, 2, n_loc, k0,
-            alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-            beta, c, rs_c0, cs_c0, data, cntx
-          );
-          b += 8 * cs_b0;
-          c += 8 * cs_c0;
-        }
-        return;
-      }
-      else if ( m0 == 4 )
-      {
-        for ( ; n0 > 0; n0 -= 8 )
-        {
-          dim_t n_loc = ( n0 < 8 ) ? n0 : 8;
-          bli_dgemmsup_rd_armv8a_int_2x8
-          (
-            conja, conjb, 2, n_loc, k0,
-            alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-            beta, c, rs_c0, cs_c0, data, cntx
-          );
-          bli_dgemmsup_rd_armv8a_int_2x8
-          (
-            conja, conjb, 2, n_loc, k0,
-            alpha, a + 2 * rs_a0, rs_a0, cs_a0, b, rs_b0, cs_b0,
-            beta, c + 2 * rs_c0, rs_c0, cs_c0, data, cntx
-          );
-          b += 8 * cs_b0;
-          c += 8 * cs_c0;
-        }
-      }
-      else if ( m0 == 3 )
-      {
-        for ( ; n0 >= 4; n0 -= 4 )
-        {
-          bli_dgemmsup_rd_armv8a_asm_3x4
-          (
-            conja, conjb, 3, 4, k0,
-            alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-            beta, c, rs_c0, cs_c0, data, cntx
-          );
-          b += 4 * cs_b0;
-          c += 4 * cs_c0;
-        }
-        if ( n0 > 0 )
-        {
-          bli_dgemmsup_rd_armv8a_int_3x4
-          (
-            conja, conjb, 3, n0, k0,
-            alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-            beta, c, rs_c0, cs_c0, data, cntx
-          );
-        }
-      }
-      else // m0 == 2 or 1.
-      {
-        for ( ; n0 > 0; n0 -= 8 )
-        {
-          dim_t n_loc = ( n0 < 8 ) ? n0 : 8;
-          bli_dgemmsup_rd_armv8a_int_2x8
-          (
-            conja, conjb, m0, n_loc, k0,
-            alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-            beta, c, rs_c0, cs_c0, data, cntx
-          );
-          b += 8 * cs_b0;
-          c += 8 * cs_c0;
-        }
-      }
-    }
-    else
+    assert( m0 <= 9 );
+
+    // Manual separation: cover the m0 rows with up to two kernels whose heights (mr1, mr2) sum to m0.
+    dgemmsup_ker_ft ker_fp1 = NULL;
+    dgemmsup_ker_ft ker_fp2 = NULL;
+    dim_t           mr1, mr2;
+
+    switch ( m0 )
     {
-      assert( FALSE );
+      case 9:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8n; mr1 = 6; // This function.
+        ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3x8n; mr2 = 3; break;
+      case 8:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8n; mr1 = 6; // This function.
+        ker_fp2 = bli_dgemmsup_rd_armv8a_inline_rx8n; mr2 = 2; break;
+      case 7:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x8n; mr1 = 3;
+        ker_fp2 = bli_dgemmsup_rd_armv8a_inline_4x8n; mr2 = 4; break;
+      case 5:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x8n; mr1 = 3;
+        ker_fp2 = bli_dgemmsup_rd_armv8a_inline_rx8n; mr2 = 2; break;
+      case 4:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_inline_4x8n; mr1 = 4; break;
+      case 3:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x8n; mr1 = 3; break;
+      default:
+        ker_fp1 = bli_dgemmsup_rd_armv8a_inline_rx8n; mr1 = m0; break;
     }
+
+    ker_fp1
+    (
+      conja, conjb, mr1, n0, k0,
+      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, c, rs_c0, cs_c0, data, cntx
+    );
+    a += mr1 * rs_a0;
+    c += mr1 * rs_c0;
+    if ( ker_fp2 )
+      ker_fp2
+      (
+        conja, conjb, mr2, n0, k0,
+        alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+        beta, c, rs_c0, cs_c0, data, cntx
+      );
+
     return;
   }
 
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c
index bc7402a5f..b7d1a7d0f 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c
@@ -36,7 +36,6 @@
 #include "blis.h"
 #include "assert.h"
 
-GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
 
 // Label locality & misc.
 #include "../armv8a_asm_utils.h"
@@ -76,6 +75,7 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
 " prfm PLDL1KEEP, ["#CADDR"]      \n\t" \
 " add  "#CADDR", "#CADDR", "#DLONGC" \n\t"
 
+// For row-storage of C.
 #define DLOADC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \
   DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \
 " add  "#CADDR", "#CADDR", "#RSC" \n\t"
@@ -83,6 +83,7 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
   DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \
 " add  "#CADDR", "#CADDR", "#RSC" \n\t"
 
+// For column-storage of C.
 #define DLOADC_4V_C_FWD(C00,C10,C01,C11,CADDR,CSHIFT,CSC) \
   DLOAD2V(C00,C10,CADDR,CSHIFT) \
 " add  "#CADDR", "#CADDR", "#CSC" \n\t" \
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c
new file mode 100644
index 000000000..eaddfd076
--- /dev/null
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c
@@ -0,0 +1,482 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2021, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+#include "blis.h"
+#include "assert.h"
+
+
+// Label locality & misc.
+#include "../armv8a_asm_utils.h"
+
+// Nanokernel operations.
+#include "../armv8a_asm_d2x2.h"
+
+/* Order of row-major DGEMM_5x8's execution in 2x2 blocks:
+ *
+ * +---+ +---+ +---+ +---+
+ * | 0 | | 1 | | 6 | | 7 |
+ * +---+ +---+ +---+ +---+
+ * +---+ +---+ +---+ +---+
+ * | 2 | | 3 | | 8 | | 9 |
+ * +---+ +---+ +---+ +---+
+ * ----- ----- ----- -----
+ * 4     5     10    11
+ */
+#define DGEMM_5X8_MKER_LOOP_PLAIN(C00,C01,C02,C03,C10,C11,C12,C13,C20,C21,C22,C23,C30,C31,C32,C33,C40,C41,C42,C43,A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
+  DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) /* Block 0 of the diagram above. */ \
+  DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) /* Block 1. */ \
+  DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) /* Block 2. */ \
+  DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) /* Block 3. */ \
+" fmla  v"#C40".2d, v"#B0".2d, v"#A2".d[0] \n\t" /* Block 4: fifth row, uses only lane 0 of A2. */ \
+" fmla  v"#C41".2d, v"#B1".2d, v"#A2".d[0] \n\t" /* Block 5. */ \
+  DGEMM_LOAD2V_ ##LOADNEXT (B0,B1,BADDR,BSHIFT) /* B0/B1 are not read again below: optional reload. */ \
+  DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) /* Block 6. */ \
+  DGEMM_2X2_NANOKERNEL(C03,C13,B3,A0) /* Block 7: last use of A0. */ \
+  DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) /* Optional reload of A0. */ \
+  DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) /* Block 8. */ \
+  DGEMM_2X2_NANOKERNEL(C23,C33,B3,A1) /* Block 9: last use of A1. */ \
+  DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) /* Optional reload of A1; A2, B2, B3 are NOT reloaded here. */ \
+" fmla  v"#C42".2d, v"#B2".2d, v"#A2".d[0] \n\t" /* Block 10. */ \
+" fmla  v"#C43".2d, v"#B3".2d, v"#A2".d[0] \n\t" // Block 11.
+
+// Interleaving load or not: the LOADNEXT token (load/noload) is pasted onto these names.
+#define DGEMM_LOAD1V_noload(V1,ADDR,IMM) /* noload: expands to nothing. */
+#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \
+" ldr  q"#V1", ["#ADDR", #"#IMM"] \n\t" // One 128-bit vector from ADDR + IMM.
+
+#define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM) /* noload: expands to nothing. */
+#define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \
+  DGEMM_LOAD1V_load(V1,ADDR,IMM) \
+  DGEMM_LOAD1V_load(V2,ADDR,IMM+16) // Second vector is the next 16 bytes.
+
+#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST) /* noload: expands to nothing. */
+#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \
+" ld1  {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" /* Gather: one double per lane, */ \
+" ld1  {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t" // post-incrementing ADDR by ST after each load.
+
+// Prefetch C in the long direction: hint the line at CADDR, then step CADDR by DLONGC.
+#define DPRFMC_FWD(CADDR,DLONGC) \
+" prfm PLDL1KEEP, ["#CADDR"]      \n\t" \
+" add  "#CADDR", "#CADDR", "#DLONGC" \n\t"
+
+// For row-storage of C: move 4 vectors at CADDR+CSHIFT (DLOAD4V/DSTORE4V are defined elsewhere), then step CADDR by RSC.
+#define DLOADC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \
+  DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \
+" add  "#CADDR", "#CADDR", "#RSC" \n\t"
+#define DSTOREC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \
+  DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \
+" add  "#CADDR", "#CADDR", "#RSC" \n\t"
+
+// For column-storage of C: load/store 2+1/2 vectors (2 full vectors plus lane CIDX of C2).
+#define DLOADC_2PHV_C_FWD(C0,C1,C2,CIDX,CADDR,CSHIFT,CSC,CTMP) \
+" add  "#CTMP", "#CADDR", "#CSHIFT"+32 \n\t" /* CTMP: address 32 bytes past the two full vectors. */ \
+  DLOAD2V(C0,C1,CADDR,CSHIFT) \
+" ld1  {v"#C2".d}["#CIDX"], ["#CTMP"] \n\t" /* Single double into lane CIDX; the other lane is untouched. */ \
+" add  "#CADDR", "#CADDR", "#CSC" \n\t" // Step CADDR by CSC.
+#define DSTOREC_2PHV_C_FWD(C0,C1,C2,CIDX,CADDR,CSHIFT,CSC,CTMP) \
+" add  "#CTMP", "#CADDR", "#CSHIFT"+32 \n\t" /* CTMP is clobbered as scratch. */ \
+  DSTORE2V(C0,C1,CADDR,CSHIFT) \
+" st1  {v"#C2".d}["#CIDX"], ["#CTMP"] \n\t" /* Single double from lane CIDX. */ \
+" add  "#CADDR", "#CADDR", "#CSC" \n\t" // Step CADDR by CSC.
+
+#define DSCALE5V(V0,V1,V2,V3,V4,A,IDX) /* 5-vector form built from the 4- and 1-vector helpers. */ \
+  DSCALE4V(V0,V1,V2,V3,A,IDX) \
+  DSCALE1V(V4,A,IDX) // Same A and IDX as the first four vectors.
+#define DSCALEA5V(D0,D1,D2,D3,D4,S0,S1,S2,S3,S4,A,IDX) /* Pairs Dn with Sn; presumably Dn += Sn * A[IDX] - confirm in armv8a_asm_utils.h. */ \
+  DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \
+  DSCALEA1V(D4,S4,A,IDX) // Fifth destination/source pair.
+
+
+void bli_dgemmsup_rv_armv8a_asm_5x8n // C(5 x n0) := beta * C + alpha * A * B, in 8-column panels.
+     (
+       conj_t              conja, // Not read by the assembly; only forwarded to the edge kernel.
+       conj_t              conjb, // Not read by the assembly; only forwarded to the edge kernel.
+       dim_t               m0, // Must be 5 (asserted).
+       dim_t               n0, // n0 / 8 panels run in assembly; n0 % 8 columns go to the edge kernel.
+       dim_t               k0, // Inner dimension; the assembly unrolls it by 6.
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0, // cs_b0 must be 1 (asserted).
+       double*    restrict beta, // When *beta == 0, C is overwritten without being read.
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0, // cs_c0 == 1 selects the row-storage write path.
+       auxinfo_t*          data, // Supplies ps_b and, outside Clang, next_a / next_b.
+       cntx_t*             cntx // Only forwarded to the edge kernel.
+     )
+{
+  assert( m0 == 5 );
+
+  // LLVM has very bad routing ability for inline asm.
+  // Limit number of registers in case of Clang compilation.
+#ifndef __clang__
+  void*    a_next = bli_auxinfo_next_a( data );
+  void*    b_next = bli_auxinfo_next_b( data );
+#endif
+  uint64_t ps_b   = bli_auxinfo_ps_b( data );
+
+  // Typecast local copies of integers in case dim_t and inc_t are a
+  // different size than is expected by load instructions.
+  uint64_t k_mker = k0 / 6;
+  uint64_t k_left = k0 % 6;
+
+  int64_t  n_iter = n0 / 8;
+  int64_t  n_left = n0 % 8;
+
+  uint64_t rs_a   = rs_a0;
+  uint64_t cs_a   = cs_a0;
+  uint64_t rs_b   = rs_b0;
+  uint64_t rs_c   = rs_c0;
+  uint64_t cs_c   = cs_c0;
+  // uint64_t cs_b   = cs_b0;
+  assert( cs_b0 == 1 );
+
+  if ( n_iter == 0 ) goto consider_edge_cases;
+
+  __asm__ volatile
+  (
+" ldr             x10, %[b]                       \n\t"
+" ldr             x13, %[c]                       \n\t"
+" ldr             x12, %[n_iter]                  \n\t"
+" ldr             x11, %[ps_b]                    \n\t" // Panel-skip of B.
+" ldr             x3, %[rs_b]                     \n\t" // Row-skip of B.
+" ldr             x9, %[rs_a]                     \n\t" // Row-skip of A.
+" ldr             x2, %[cs_a]                     \n\t" // Column-skip of A.
+"                                                 \n\t"
+" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
+" ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
+"                                                 \n\t"
+"                                                 \n\t" // Multiply some address skips by sizeof(double).
+" lsl             x11, x11, #3                    \n\t" // ps_b
+" lsl             x9, x9, #3                      \n\t" // rs_a
+" lsl             x2, x2, #3                      \n\t" // cs_a
+" lsl             x3, x3, #3                      \n\t" // rs_b
+" lsl             x6, x6, #3                      \n\t" // rs_c
+" lsl             x7, x7, #3                      \n\t" // cs_c
+"                                                 \n\t"
+" mov             x1, x13                         \n\t" // C base (x5 is not loaded until the loop below).
+" cmp             x7, #8                          \n\t" // cs_c == 1 (8 bytes): C is stored in rows.
+BNE(C_PREFETCH_COLS)
+DPRFMC_FWD(x1,x6)
+DPRFMC_FWD(x1,x6)
+DPRFMC_FWD(x1,x6)
+DPRFMC_FWD(x1,x6)
+DPRFMC_FWD(x1,x6)
+BRANCH(C_PREFETCH_END)
+LABEL(C_PREFETCH_COLS)
+// This prefetch will not cover further mker parts. Skip.
+//
+// DPRFMC_FWD(x1,x7)
+// DPRFMC_FWD(x1,x7)
+// DPRFMC_FWD(x1,x7)
+// DPRFMC_FWD(x1,x7)
+// DPRFMC_FWD(x1,x7)
+// DPRFMC_FWD(x1,x7)
+// DPRFMC_FWD(x1,x7)
+// DPRFMC_FWD(x1,x7)
+LABEL(C_PREFETCH_END)
+//
+// Millikernel.
+LABEL(MILLIKER_MLOOP)
+"                                                 \n\t"
+" mov             x1, x10                         \n\t" // Parameters to be reloaded
+" mov             x5, x13                         \n\t" //  within each millikernel loop.
+" ldr             x0, %[a]                        \n\t"
+" ldr             x4, %[k_mker]                   \n\t"
+" ldr             x8, %[k_left]                   \n\t"
+"                                                 \n\t"
+// Storage scheme:
+//  V[ 0:19] <- C
+//  V[20:25] <- A
+//  V[26:31] <- B
+// Under this scheme, the following is defined:
+#define DGEMM_5X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
+  DGEMM_5X8_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT)
+// Load from memory.
+LABEL(LOAD_ABC)
+"                                                 \n\t" // No-microkernel early return is a must
+" cmp             x4, #0                          \n\t" //  to avoid out-of-boundary read.
+BEQ(CLEAR_CCOLS)
+"                                                 \n\t"
+" ldr             q26, [x1, #16*0]                \n\t" // Load B first.
+" ldr             q27, [x1, #16*1]                \n\t"
+" ldr             q28, [x1, #16*2]                \n\t"
+" ldr             q29, [x1, #16*3]                \n\t"
+" add             x1, x1, x3                      \n\t"
+" ldr             q30, [x1, #16*0]                \n\t"
+" ldr             q31, [x1, #16*1]                \n\t"
+"                                                 \n\t"
+" mov             x14, x0                         \n\t" // Load A.
+" ld1             {v20.d}[0], [x14], x9           \n\t" // We want A to be kept in L1.
+" ld1             {v20.d}[1], [x14], x9           \n\t"
+" ld1             {v21.d}[0], [x14], x9           \n\t"
+" ld1             {v21.d}[1], [x14], x9           \n\t"
+" ld1             {v22.d}[0], [x14], x9           \n\t"
+" add             x0, x0, x2                      \n\t"
+" mov             x14, x0                         \n\t"
+" ld1             {v23.d}[0], [x14], x9           \n\t"
+" ld1             {v23.d}[1], [x14], x9           \n\t"
+" ld1             {v24.d}[0], [x14], x9           \n\t"
+" ld1             {v24.d}[1], [x14], x9           \n\t"
+" ld1             {v25.d}[0], [x14], x9           \n\t"
+" add             x0, x0, x2                      \n\t"
+" mov             x14, x0                         \n\t"
+LABEL(CLEAR_CCOLS)
+CLEAR8V(0,1,2,3,4,5,6,7)
+CLEAR8V(8,9,10,11,12,13,14,15)
+CLEAR4V(16,17,18,19)
+// No-microkernel early return, once again.
+BEQ(K_LEFT_LOOP)
+//
+// Microkernel is defined here as:
+#define DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,B3) \
+  DGEMM_5X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,x14,x9,x1,16*2,load) \
+ "add             x1, x1, x3                      \n\t" \
+ "ldr             q"#B2", [x1, #16*0]             \n\t" /* Next B line. */ \
+ "ldr             q"#B3", [x1, #16*1]             \n\t" \
+ "ld1             {v"#A2".d}[0], [x14], x9        \n\t" /* Finish A line. */ \
+ "add             x0, x0, x2                      \n\t" \
+ "mov             x14, x0                         \n\t"
+// Start microkernel loop.
+LABEL(K_MKER_LOOP)
+DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(20,21,22,26,27,28,29)
+DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(23,24,25,30,31,26,27)
+DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(20,21,22,28,29,30,31)
+DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(23,24,25,26,27,28,29)
+"                                                 \n\t" // Decrease counter before final replica.
+" subs            x4, x4, #1                      \n\t" // Branch early to avoid reading excess mem.
+BEQ(FIN_MKER_LOOP)
+DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(20,21,22,30,31,26,27)
+DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(23,24,25,28,29,30,31)
+BRANCH(K_MKER_LOOP)
+//
+// Final microkernel loop.
+LABEL(FIN_MKER_LOOP)
+DGEMM_5X8_MKER_LOOP_PLAIN_LOC(20,21,22,30,31,26,27,xzr,-1,xzr,-1,noload)
+" ldr             q30, [x1, #16*2]                \n\t"
+" ldr             q31, [x1, #16*3]                \n\t"
+" add             x1, x1, x3                      \n\t"
+DGEMM_5X8_MKER_LOOP_PLAIN_LOC(23,24,25,28,29,30,31,xzr,-1,xzr,-1,noload)
+//
+// Loops left behind microkernels.
+LABEL(K_LEFT_LOOP)
+" cmp             x8, #0                          \n\t" // End of exec.
+BEQ(WRITE_MEM_PREP)
+" ldr             q26, [x1, #16*0]                \n\t" // Load B row.
+" ldr             q27, [x1, #16*1]                \n\t"
+" ldr             q28, [x1, #16*2]                \n\t"
+" ldr             q29, [x1, #16*3]                \n\t"
+" add             x1, x1, x3                      \n\t"
+" mov             x14, x0                         \n\t"
+" ld1             {v20.d}[0], [x14], x9           \n\t" // Load A col.
+" ld1             {v20.d}[1], [x14], x9           \n\t"
+" ld1             {v21.d}[0], [x14], x9           \n\t"
+" ld1             {v21.d}[1], [x14], x9           \n\t"
+" ld1             {v22.d}[0], [x14], x9           \n\t"
+" add             x0, x0, x2                      \n\t"
+" sub             x8, x8, #1                      \n\t"
+DGEMM_5X8_MKER_LOOP_PLAIN_LOC(20,21,22,26,27,28,29,xzr,-1,xzr,-1,noload)
+BRANCH(K_LEFT_LOOP)
+//
+// Scale and write to memory.
+LABEL(WRITE_MEM_PREP)
+" ldr             x4, %[alpha]                    \n\t" // Load alpha & beta (address).
+" ldr             x8, %[beta]                     \n\t"
+" ld1r            {v30.2d}, [x4]                  \n\t" // Load alpha & beta.
+" ld1r            {v31.2d}, [x8]                  \n\t"
+" fmov            d20, #1.0                       \n\t"
+" fcmp            d30, d20                        \n\t"
+BEQ(UNIT_ALPHA_R)
+DSCALE8V(0,1,2,3,4,5,6,7,30,0)
+DSCALE8V(8,9,10,11,12,13,14,15,30,0)
+DSCALE4V(16,17,18,19,30,0)
+LABEL(UNIT_ALPHA_R)
+"                                                 \n\t"
+" mov             x1, x5                          \n\t" // C address for loading.
+"                                                 \n\t" // C address for storing is x5 itself.
+" cmp             x7, #8                          \n\t" // Check for column-storage.
+BNE(WRITE_MEM_C)
+//
+// C storage in rows.
+LABEL(WRITE_MEM_R)
+" fcmp            d31, #0.0                       \n\t"
+BEQ(ZERO_BETA_R_1_2)
+DLOADC_4V_R_FWD(20,21,22,23,x1,0,x6)
+DLOADC_4V_R_FWD(24,25,26,27,x1,0,x6)
+DSCALEA4V(0,1,2,3,20,21,22,23,31,0)
+DSCALEA4V(4,5,6,7,24,25,26,27,31,0)
+LABEL(ZERO_BETA_R_1_2)
+DSTOREC_4V_R_FWD(0,1,2,3,x5,0,x6)
+DSTOREC_4V_R_FWD(4,5,6,7,x5,0,x6)
+BEQ(ZERO_BETA_R_3_4_5)
+DLOADC_4V_R_FWD(20,21,22,23,x1,0,x6)
+DLOADC_4V_R_FWD(24,25,26,27,x1,0,x6)
+DLOADC_4V_R_FWD(0,1,2,3,x1,0,x6)
+DSCALEA8V(8,9,10,11,12,13,14,15,20,21,22,23,24,25,26,27,31,0)
+DSCALEA4V(16,17,18,19,0,1,2,3,31,0)
+LABEL(ZERO_BETA_R_3_4_5)
+#ifndef __clang__
+" cmp   x12, #1                       \n\t" // Only the last panel prefetches the next A / B.
+BNE(PRFM_END_R)
+" prfm  PLDL1KEEP, [%[a_next], #16*0] \n\t"
+" prfm  PLDL1KEEP, [%[a_next], #16*1] \n\t"
+" prfm  PLDL1STRM, [%[b_next], #16*0] \n\t"
+" prfm  PLDL1STRM, [%[b_next], #16*1] \n\t"
+LABEL(PRFM_END_R)
+#endif
+DSTOREC_4V_R_FWD(8,9,10,11,x5,0,x6)
+DSTOREC_4V_R_FWD(12,13,14,15,x5,0,x6)
+DSTOREC_4V_R_FWD(16,17,18,19,x5,0,x6)
+BRANCH(END_WRITE_MEM)
+//
+// C storage in columns.
+LABEL(WRITE_MEM_C)
+// In-register transpose,
+//  do transposition in row-order.
+" trn1            v20.2d, v0.2d, v4.2d            \n\t" // Row 0-1.
+" trn2            v21.2d, v0.2d, v4.2d            \n\t"
+" trn1            v22.2d, v1.2d, v5.2d            \n\t"
+" trn2            v23.2d, v1.2d, v5.2d            \n\t"
+" trn1            v24.2d, v2.2d, v6.2d            \n\t"
+" trn2            v25.2d, v2.2d, v6.2d            \n\t"
+" trn1            v26.2d, v3.2d, v7.2d            \n\t"
+" trn2            v27.2d, v3.2d, v7.2d            \n\t"
+"                                                 \n\t"
+" trn1            v0.2d, v8.2d, v12.2d            \n\t" // Row 2-3.
+" trn2            v1.2d, v8.2d, v12.2d            \n\t"
+" trn1            v2.2d, v9.2d, v13.2d            \n\t"
+" trn2            v3.2d, v9.2d, v13.2d            \n\t"
+" trn1            v4.2d, v10.2d, v14.2d           \n\t"
+" trn2            v5.2d, v10.2d, v14.2d           \n\t"
+" trn1            v6.2d, v11.2d, v15.2d           \n\t"
+" trn2            v7.2d, v11.2d, v15.2d           \n\t"
+"                                                 \n\t"
+" fcmp            d31, #0.0                       \n\t"
+BEQ(ZERO_BETA_C_1_2_3_4)
+DLOADC_2PHV_C_FWD(8,9,10,0,x1,0,x7,x8)
+DLOADC_2PHV_C_FWD(11,12,10,1,x1,0,x7,x8)
+DLOADC_2PHV_C_FWD(13,14,15,0,x1,0,x7,x8)
+DLOADC_2PHV_C_FWD(28,29,15,1,x1,0,x7,x8)
+DSCALEA5V(20,0,21,1,16,8,9,11,12,10,31,0)
+DSCALEA5V(22,2,23,3,17,13,14,28,29,15,31,0)
+LABEL(ZERO_BETA_C_1_2_3_4)
+DSTOREC_2PHV_C_FWD(20,0,16,0,x5,0,x7,x8)
+DSTOREC_2PHV_C_FWD(21,1,16,1,x5,0,x7,x8)
+DSTOREC_2PHV_C_FWD(22,2,17,0,x5,0,x7,x8)
+DSTOREC_2PHV_C_FWD(23,3,17,1,x5,0,x7,x8)
+BEQ(ZERO_BETA_C_5_6_7_8)
+DLOADC_2PHV_C_FWD(8,9,10,0,x1,0,x7,x8)
+DLOADC_2PHV_C_FWD(11,12,10,1,x1,0,x7,x8)
+DLOADC_2PHV_C_FWD(13,14,15,0,x1,0,x7,x8)
+DLOADC_2PHV_C_FWD(28,29,15,1,x1,0,x7,x8)
+DSCALEA5V(24,4,25,5,18,8,9,11,12,10,31,0)
+DSCALEA5V(26,6,27,7,19,13,14,28,29,15,31,0)
+LABEL(ZERO_BETA_C_5_6_7_8)
+#ifndef __clang__
+" cmp   x12, #1                       \n\t" // Only the last panel prefetches the next A / B.
+BNE(PRFM_END_C)
+" prfm  PLDL1KEEP, [%[a_next], #16*0] \n\t"
+" prfm  PLDL1KEEP, [%[a_next], #16*1] \n\t"
+" prfm  PLDL1STRM, [%[b_next], #16*0] \n\t"
+" prfm  PLDL1STRM, [%[b_next], #16*1] \n\t"
+LABEL(PRFM_END_C)
+#endif
+DSTOREC_2PHV_C_FWD(24,4,18,0,x5,0,x7,x8)
+DSTOREC_2PHV_C_FWD(25,5,18,1,x5,0,x7,x8)
+DSTOREC_2PHV_C_FWD(26,6,19,0,x5,0,x7,x8)
+DSTOREC_2PHV_C_FWD(27,7,19,1,x5,0,x7,x8)
+//
+// End of this microkernel.
+LABEL(END_WRITE_MEM)
+"                                                 \n\t"
+" subs            x12, x12, #1                    \n\t"
+BEQ(END_EXEC)
+"                                                 \n\t"
+" mov             x8, #8                          \n\t"
+" madd            x13, x7, x8, x13                \n\t" // Forward C's base address to the next logic panel.
+" add             x10, x10, x11                   \n\t" // Forward B's base address to the next logic panel.
+BRANCH(MILLIKER_MLOOP)
+//
+// End of execution.
+LABEL(END_EXEC)
+:
+: [a]      "m" (a),
+  [b]      "m" (b),
+  [c]      "m" (c),
+  [rs_a]   "m" (rs_a),
+  [cs_a]   "m" (cs_a),
+  [ps_b]   "m" (ps_b),
+  [rs_b]   "m" (rs_b),
+  [rs_c]   "m" (rs_c),
+  [cs_c]   "m" (cs_c),
+  // In Clang, even "m"-passed parameter takes 1 register.
+  // Have to disable prefetching to pass compilation.
+#ifndef __clang__
+  [a_next] "r" (a_next),
+  [b_next] "r" (b_next),
+#endif
+  [n_iter] "m" (n_iter),
+  [k_mker] "m" (k_mker),
+  [k_left] "m" (k_left),
+  [alpha]  "m" (alpha),
+  [beta]   "m" (beta)
+: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+  "x8", "x9", "x10","x11","x12","x13","x14",
+  "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+  "v8", "v9", "v10","v11","v12","v13","v14","v15",
+  "v16","v17","v18","v19","v20","v21","v22","v23",
+  "v24","v25","v26","v27","v28","v29","v30","v31"
+  );
+
+consider_edge_cases:
+  // Forward address.
+  b = b + n_iter * ps_b;
+  c = c + n_iter * 8 * cs_c;
+  if ( n_left )
+  {
+    // Set panel stride to unpacked mode.
+    // Only 1 millikernel w.r.t. 6x8 is executed.
+    auxinfo_t data_d6x4mn = *data;
+    bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn );
+    //
+    bli_dgemmsup_rv_armv8a_int_6x4mn
+    (
+      conja, conjb, 5, n_left, k0,
+      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
+    );
+  }
+
+}
+
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c
new file mode 100644
index 000000000..91d6ca596
--- /dev/null
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c
@@ -0,0 +1,475 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2021, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+#include "blis.h"
+#include "assert.h"
+
+
+// Label locality & misc.
+#include "../armv8a_asm_utils.h"
+
+// Nanokernel operations.
+#include "../armv8a_asm_d2x2.h"
+
+/* Odd-NR dgemmsup_rv_*m kernels are special in that,
+ * despite the row-major name, C is laid out in COLUMNS in the register space.
+ *
+ * Block order:
+ *
+ * +---+ +---+
+ * | 0 | | 3 | |6
+ * +---+ +---+ |
+ * +---+ +---+
+ * | 1 | | 4 | |7
+ * +---+ +---+ |
+ * +---+ +---+
+ * | 2 | | 5 | |8
+ * +---+ +---+ |
+ *
+ */
+#define DGEMM_C6X5_MKER_LOOP_PLAIN(C00,C01,C02,C03,C04,C10,C11,C12,C13,C14,C20,C21,C22,C23,C24,A0,A1,A2,B0,B1,B2,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
+  DGEMM_2X2_NANOKERNEL(C00,C01,A0,B0) /* Block 0 of the diagram above. */ \
+  DGEMM_2X2_NANOKERNEL(C10,C11,A1,B0) /* Block 1. */ \
+  DGEMM_2X2_NANOKERNEL(C20,C21,A2,B0) /* Block 2: last use of B0. */ \
+  DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) /* Optional reload of B0. */ \
+  DGEMM_2X2_NANOKERNEL(C02,C03,A0,B1) /* Block 3. */ \
+  DGEMM_2X2_NANOKERNEL(C12,C13,A1,B1) /* Block 4. */ \
+  DGEMM_2X2_NANOKERNEL(C22,C23,A2,B1) /* Block 5: last use of B1. */ \
+  DGEMM_LOAD1V_ ##LOADNEXT (B1,BADDR,BSHIFT+16) /* Optional reload of B1, 16 bytes after B0. */ \
+" fmla  v"#C04".2d, v"#A0".2d, v"#B2".d["#BIDX"] \n\t" /* Block 6: fifth column, lane BIDX of B2. */ \
+  DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) /* Optional reload of A0. */ \
+" fmla  v"#C14".2d, v"#A1".2d, v"#B2".d["#BIDX"] \n\t" /* Block 7. */ \
+  DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) /* Optional reload of A1; A2 and B2 are NOT reloaded here. */ \
+" fmla  v"#C24".2d, v"#A2".2d, v"#B2".d["#BIDX"] \n\t" // Block 8.
+
+// Interleaving load or not: the LOADNEXT token (load/noload) is pasted onto these names.
+#define DGEMM_LOAD1V_noload(V1,ADDR,IMM) /* noload: expands to nothing. */
+#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \
+" ldr  q"#V1", ["#ADDR", #"#IMM"] \n\t" // One 128-bit vector from ADDR + IMM.
+
+#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST) /* noload: expands to nothing. */
+#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \
+" ld1  {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" /* Gather: one double per lane, */ \
+" ld1  {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t" // post-incrementing ADDR by ST after each load.
+
+// Prefetch C in the long direction: hint the line at CADDR, then step CADDR by DLONGC.
+#define DPRFMC_FWD(CADDR,DLONGC) \
+" prfm PLDL1KEEP, ["#CADDR"]      \n\t" \
+" add  "#CADDR", "#CADDR", "#DLONGC" \n\t"
+
+// For column-storage of C: move 3 vectors (two at CSHIFT, one at CSHIFT+32), then step CADDR by CSC.
+#define DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \
+  DLOAD2V(C0,C1,CADDR,CSHIFT) \
+  DLOAD1V(C2,CADDR,CSHIFT+32) \
+" add  "#CADDR", "#CADDR", "#CSC" \n\t"
+#define DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \
+  DSTORE2V(C0,C1,CADDR,CSHIFT) \
+  DSTORE1V(C2,CADDR,CSHIFT+32) \
+" add  "#CADDR", "#CADDR", "#CSC" \n\t"
+
+// For row-storage of C: load/store 2+1/2 vectors (2 full vectors plus lane CIDX of C2).
+#define DLOADC_2PHV_R_FWD(C0,C1,C2,CIDX,CADDR,CSHIFT,RSC,CTMP) \
+" add  "#CTMP", "#CADDR", "#CSHIFT"+32 \n\t" /* CTMP: address 32 bytes past the two full vectors. */ \
+  DLOAD2V(C0,C1,CADDR,CSHIFT) \
+" ld1  {v"#C2".d}["#CIDX"], ["#CTMP"] \n\t" /* Single double into lane CIDX; the other lane is untouched. */ \
+" add  "#CADDR", "#CADDR", "#RSC" \n\t" // Step CADDR by RSC.
+#define DSTOREC_2PHV_R_FWD(C0,C1,C2,CIDX,CADDR,CSHIFT,RSC,CTMP) \
+" add  "#CTMP", "#CADDR", "#CSHIFT"+32 \n\t" /* CTMP is clobbered as scratch. */ \
+  DSTORE2V(C0,C1,CADDR,CSHIFT) \
+" st1  {v"#C2".d}["#CIDX"], ["#CTMP"] \n\t" /* Single double from lane CIDX. */ \
+" add  "#CADDR", "#CADDR", "#RSC" \n\t" // Step CADDR by RSC.
+
+#define DSCALE5V(V0,V1,V2,V3,V4,A,IDX) /* 5-vector form built from the 4- and 1-vector helpers. */ \
+  DSCALE4V(V0,V1,V2,V3,A,IDX) \
+  DSCALE1V(V4,A,IDX) // Same A and IDX as the first four vectors.
+#define DSCALEA5V(D0,D1,D2,D3,D4,S0,S1,S2,S3,S4,A,IDX) /* Pairs Dn with Sn; presumably Dn += Sn * A[IDX] - confirm in armv8a_asm_utils.h. */ \
+  DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \
+  DSCALEA1V(D4,S4,A,IDX) // Fifth destination/source pair.
+
+
+void bli_dgemmsup_rv_armv8a_asm_6x5m
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t*          data,
+       cntx_t*             cntx
+     )
+{
+  assert( n0 == 5 );
+
+  // LLVM has very bad routing ability for inline asm.
+  // Limit number of registers in case of Clang compilation.
+#ifndef __clang__
+  void*    a_next = bli_auxinfo_next_a( data );
+  void*    b_next = bli_auxinfo_next_b( data );
+#endif
+  uint64_t ps_a   = bli_auxinfo_ps_a( data );
+
+  // Typecast local copies of integers in case dim_t and inc_t are a
+  // different size than is expected by load instructions.
+  uint64_t k_mker = k0 / 6;
+  uint64_t k_left = k0 % 6;
+
+  int64_t  m_iter = m0 / 6;
+  int64_t  m_left = m0 % 6;
+
+  uint64_t rs_a   = rs_a0;
+  uint64_t cs_a   = cs_a0;
+  uint64_t rs_b   = rs_b0;
+  uint64_t rs_c   = rs_c0;
+  uint64_t cs_c   = cs_c0;
+  // uint64_t cs_b   = cs_b0;
+  assert( cs_b0 == 1 );
+
+  if ( m_iter == 0 ) goto consider_edge_cases;
+
+  __asm__ volatile
+  (
+" ldr             x10, %[a]                       \n\t"
+" ldr             x13, %[c]                       \n\t"
+" ldr             x12, %[m_iter]                  \n\t"
+" ldr             x11, %[ps_a]                    \n\t" // Panel-skip of A.
+" ldr             x9, %[rs_a]                     \n\t" // Row-skip of A.
+" ldr             x2, %[cs_a]                     \n\t" // Column-skip of A.
+" ldr             x3, %[rs_b]                     \n\t" // Row-skip of B.
+"                                                 \n\t"
+" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
+" ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
+"                                                 \n\t"
+"                                                 \n\t" // Multiply some address skips by sizeof(double).
+" lsl             x11, x11, #3                    \n\t" // ps_a
+" lsl             x9, x9, #3                      \n\t" // rs_a
+" lsl             x2, x2, #3                      \n\t" // cs_a
+" lsl             x3, x3, #3                      \n\t" // rs_b
+" lsl             x6, x6, #3                      \n\t" // rs_c
+" lsl             x7, x7, #3                      \n\t" // cs_c
+"                                                 \n\t"
+" mov             x1, x5                          \n\t"
+" cmp             x7, #8                          \n\t" // Prefetch column-strided C.
+BNE(C_PREFETCH_COLS)
+// This prefetch will not cover further mker parts. Skip.
+//
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+BRANCH(C_PREFETCH_END)
+LABEL(C_PREFETCH_COLS)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+LABEL(C_PREFETCH_END)
+//
+// Millikernel.
+LABEL(MILLIKER_MLOOP)
+"                                                 \n\t"
+" mov             x0, x10                         \n\t" // Parameters to be reloaded
+" mov             x5, x13                         \n\t" //  within each millikernel loop.
+" ldr             x1, %[b]                        \n\t"
+" ldr             x4, %[k_mker]                   \n\t"
+" ldr             x8, %[k_left]                   \n\t"
+"                                                 \n\t"
+// Storage scheme:
+//  V[ 0:14] <- C
+//  V[15:23] <- A
+//  V[24:29] <- B
+// Under this scheme, the following is defined:
+#define DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
+  DGEMM_C6X5_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,A0,A1,A2,B0,B1,B2,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT)
+// Load from memory.
+LABEL(LOAD_ABC)
+"                                                 \n\t" // No-microkernel early return is a must
+" cmp             x4, #0                          \n\t" //  to avoid out-of-boundary read.
+BEQ(CLEAR_CCOLS)
+"                                                 \n\t"
+" mov             x14, x0                         \n\t" // Load A.
+" ld1             {v15.d}[0], [x14], x9           \n\t"
+" ld1             {v15.d}[1], [x14], x9           \n\t"
+" ld1             {v16.d}[0], [x14], x9           \n\t"
+" ld1             {v16.d}[1], [x14], x9           \n\t"
+" ld1             {v17.d}[0], [x14], x9           \n\t"
+" ld1             {v17.d}[1], [x14], x9           \n\t"
+" add             x0, x0, x2                      \n\t"
+" mov             x14, x0                         \n\t"
+" ld1             {v18.d}[0], [x14], x9           \n\t"
+" ld1             {v18.d}[1], [x14], x9           \n\t"
+" ld1             {v19.d}[0], [x14], x9           \n\t"
+" ld1             {v19.d}[1], [x14], x9           \n\t"
+" ld1             {v20.d}[0], [x14], x9           \n\t"
+" ld1             {v20.d}[1], [x14], x9           \n\t"
+" add             x0, x0, x2                      \n\t"
+" mov             x14, x0                         \n\t"
+" ld1             {v21.d}[0], [x14], x9           \n\t"
+" ld1             {v21.d}[1], [x14], x9           \n\t"
+" ld1             {v22.d}[0], [x14], x9           \n\t"
+" ld1             {v22.d}[1], [x14], x9           \n\t"
+" ld1             {v23.d}[0], [x14], x9           \n\t"
+" ld1             {v23.d}[1], [x14], x9           \n\t"
+" add             x0, x0, x2                      \n\t"
+" mov             x14, x0                         \n\t"
+"                                                 \n\t"
+" ldr             q24, [x1, #16*0]                \n\t" // Load B.
+" ldr             q25, [x1, #16*1]                \n\t"
+" ldr             d26, [x1, #16*2]                \n\t" // Scalar loads into idx 0.
+" add             x1, x1, x3                      \n\t"
+" ldr             q27, [x1, #16*0]                \n\t"
+" ldr             q28, [x1, #16*1]                \n\t"
+" ldr             d29, [x1, #16*2]                \n\t"
+" add             x1, x1, x3                      \n\t"
+LABEL(CLEAR_CCOLS)
+CLEAR4V(0,1,2,3)
+CLEAR1V(4)
+CLEAR4V(5,6,7,8)
+CLEAR1V(9)
+CLEAR4V(10,11,12,13)
+CLEAR1V(14)
+// No-microkernel early return, once again.
+BEQ(K_LEFT_LOOP)
+//
+// Microkernel is defined here as:
+#define DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,BIDX) \
+  DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,BIDX,x14,x9,x1,0,load) \
+ "ld1             {v"#A2".d}[0], [x14], x9        \n\t" \
+ "ld1             {v"#A2".d}[1], [x14], x9        \n\t" \
+ "add             x0, x0, x2                      \n\t" \
+ "mov             x14, x0                         \n\t" \
+ /* Due to this loading, BIDX can only be 0 here. */ \
+ "ldr             d"#B2", [x1, #16*2]             \n\t" \
+ "add             x1, x1, x3                      \n\t"
+// Start microkernel loop.
+LABEL(K_MKER_LOOP)
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(15,16,17,24,25,26,0)
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,27,28,29,0)
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,24,25,26,0)
+"                                                 \n\t" // Decrease counter before final replica.
+" subs            x4, x4, #1                      \n\t" // Branch early to avoid reading excess mem.
+BEQ(FIN_MKER_LOOP)
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(15,16,17,27,28,29,0)
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,24,25,26,0)
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,27,28,29,0)
+BRANCH(K_MKER_LOOP)
+//
+// Final microkernel loop.
+LABEL(FIN_MKER_LOOP)
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(15,16,17,27,28,29,0,xzr,-1,xzr,-1,noload)
+" ldr             q27, [x1, #16*0]                \n\t"
+" ldr             q28, [x1, #16*1]                \n\t"
+" ldr             d29, [x1, #16*2]                \n\t"
+" add             x1, x1, x3                      \n\t"
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(18,19,20,24,25,26,0,xzr,-1,xzr,-1,noload)
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(21,22,23,27,28,29,0,xzr,-1,xzr,-1,noload)
+//
+// Loops left behind microkernels.
+LABEL(K_LEFT_LOOP)
+" cmp             x8, #0                          \n\t" // End of exec.
+BEQ(WRITE_MEM_PREP)
+" mov             x14, x0                         \n\t" // Load A col.
+" ld1             {v15.d}[0], [x14], x9           \n\t"
+" ld1             {v15.d}[1], [x14], x9           \n\t"
+" ld1             {v16.d}[0], [x14], x9           \n\t"
+" ld1             {v16.d}[1], [x14], x9           \n\t"
+" ld1             {v17.d}[0], [x14], x9           \n\t"
+" ld1             {v17.d}[1], [x14], x9           \n\t"
+" add             x0, x0, x2                      \n\t"
+" ldr             q24, [x1, #16*0]                \n\t" // Load B row.
+" ldr             q25, [x1, #16*1]                \n\t"
+" ldr             d26, [x1, #16*2]                \n\t"
+" add             x1, x1, x3                      \n\t"
+" sub             x8, x8, #1                      \n\t"
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(15,16,17,24,25,26,0,xzr,-1,xzr,-1,noload)
+BRANCH(K_LEFT_LOOP)
+//
+// Scale and write to memory.
+LABEL(WRITE_MEM_PREP)
+" ldr             x4, %[alpha]                    \n\t" // Load alpha & beta (address).
+" ldr             x8, %[beta]                     \n\t"
+" ld1r            {v30.2d}, [x4]                  \n\t" // Load alpha & beta.
+" ld1r            {v31.2d}, [x8]                  \n\t"
+" fmov            d26, #1.0                       \n\t"
+" fcmp            d30, d26                        \n\t"
+BEQ(UNIT_ALPHA)
+DSCALE5V(0,1,2,3,4,30,0)
+DSCALE5V(5,6,7,8,9,30,0)
+DSCALE5V(10,11,12,13,14,30,0)
+LABEL(UNIT_ALPHA)
+"                                                 \n\t"
+" mov             x1, x5                          \n\t" // C address for loading.
+"                                                 \n\t" // C address for storing is x5 itself.
+" cmp             x7, #8                          \n\t" // Check for column-storage.
+BNE(WRITE_MEM_C)
+// Unlike other RV kernels, here row-storage of C requires
+//   in-register transpose.
+" trn1            v15.2d, v0.2d, v1.2d            \n\t"
+" trn2            v16.2d, v0.2d, v1.2d            \n\t"
+" trn1            v17.2d, v2.2d, v3.2d            \n\t"
+" trn2            v18.2d, v2.2d, v3.2d            \n\t"
+"                                                 \n\t"
+" trn1            v19.2d, v5.2d, v6.2d            \n\t"
+" trn2            v20.2d, v5.2d, v6.2d            \n\t"
+" trn1            v21.2d, v7.2d, v8.2d            \n\t"
+" trn2            v22.2d, v7.2d, v8.2d            \n\t"
+"                                                 \n\t"
+" trn1            v23.2d, v10.2d, v11.2d          \n\t"
+" trn2            v24.2d, v10.2d, v11.2d          \n\t"
+" trn1            v25.2d, v12.2d, v13.2d          \n\t"
+" trn2            v26.2d, v12.2d, v13.2d          \n\t"
+"                                                 \n\t"
+" fcmp            d31, #0.0                       \n\t"
+BEQ(ZERO_BETA_R)
+DLOADC_2PHV_R_FWD(0,1,28,0,x1,0,x6,x8)
+DLOADC_2PHV_R_FWD(2,3,28,1,x1,0,x6,x8)
+DLOADC_2PHV_R_FWD(5,6,29,0,x1,0,x6,x8)
+DLOADC_2PHV_R_FWD(7,8,29,1,x1,0,x6,x8)
+DLOADC_2PHV_R_FWD(10,11,30,0,x1,0,x6,x8)
+DLOADC_2PHV_R_FWD(12,13,30,1,x1,0,x6,x8)
+DSCALEA5V(15,17,16,18,4,0,1,2,3,28,31,0)
+DSCALEA5V(19,21,20,22,9,5,6,7,8,29,31,0)
+DSCALEA5V(23,25,24,26,14,10,11,12,13,30,31,0)
+LABEL(ZERO_BETA_R)
+#ifndef __clang__
+" cmp   x12, #1                       \n\t"
+BRANCH(PRFM_END_R)
+" prfm  PLDL1KEEP, [%[a_next], #16*0] \n\t"
+" prfm  PLDL1KEEP, [%[a_next], #16*1] \n\t"
+" prfm  PLDL1STRM, [%[b_next], #16*0] \n\t"
+" prfm  PLDL1STRM, [%[b_next], #16*1] \n\t"
+LABEL(PRFM_END_R)
+#endif
+DSTOREC_2PHV_R_FWD(15,17,4,0,x5,0,x6,x8)
+DSTOREC_2PHV_R_FWD(16,18,4,1,x5,0,x6,x8)
+DSTOREC_2PHV_R_FWD(19,21,9,0,x5,0,x6,x8)
+DSTOREC_2PHV_R_FWD(20,22,9,1,x5,0,x6,x8)
+DSTOREC_2PHV_R_FWD(23,25,14,0,x5,0,x6,x8)
+DSTOREC_2PHV_R_FWD(24,26,14,1,x5,0,x6,x8)
+BRANCH(END_WRITE_MEM)
+//
+// C storage in columns.
+LABEL(WRITE_MEM_C)
+" fcmp            d31, #0.0                       \n\t"
+BEQ(ZERO_BETA_C)
+DLOADC_3V_C_FWD(15,20,25,x1,0,x7)
+DLOADC_3V_C_FWD(16,21,26,x1,0,x7)
+DLOADC_3V_C_FWD(17,22,27,x1,0,x7)
+DLOADC_3V_C_FWD(18,23,28,x1,0,x7)
+DLOADC_3V_C_FWD(19,24,29,x1,0,x7)
+DSCALEA5V(0,1,2,3,4,15,16,17,18,19,31,0)
+DSCALEA5V(5,6,7,8,9,20,21,22,23,24,31,0)
+DSCALEA5V(10,11,12,13,14,25,26,27,28,29,31,0)
+LABEL(ZERO_BETA_C)
+#ifndef __clang__
+" cmp   x12, #1                       \n\t"
+BRANCH(PRFM_END_C)
+" prfm  PLDL1KEEP, [%[a_next], #16*0] \n\t"
+" prfm  PLDL1KEEP, [%[a_next], #16*1] \n\t"
+" prfm  PLDL1STRM, [%[b_next], #16*0] \n\t"
+" prfm  PLDL1STRM, [%[b_next], #16*1] \n\t"
+LABEL(PRFM_END_C)
+#endif
+DSTOREC_3V_C_FWD(0,5,10,x5,0,x7)
+DSTOREC_3V_C_FWD(1,6,11,x5,0,x7)
+DSTOREC_3V_C_FWD(2,7,12,x5,0,x7)
+DSTOREC_3V_C_FWD(3,8,13,x5,0,x7)
+DSTOREC_3V_C_FWD(4,9,14,x5,0,x7)
+//
+// End of this microkernel.
+LABEL(END_WRITE_MEM)
+"                                                 \n\t"
+" subs            x12, x12, #1                    \n\t"
+BEQ(END_EXEC)
+"                                                 \n\t"
+" mov             x8, #6                          \n\t"
+" madd            x13, x6, x8, x13                \n\t" // Forward C's base address to the next logic panel.
+" add             x10, x10, x11                   \n\t" // Forward A's base address to the next logic panel.
+BRANCH(MILLIKER_MLOOP)
+//
+// End of execution.
+LABEL(END_EXEC)
+:
+: [a]      "m" (a),
+  [b]      "m" (b),
+  [c]      "m" (c),
+  [rs_a]   "m" (rs_a),
+  [cs_a]   "m" (cs_a),
+  [ps_a]   "m" (ps_a),
+  [rs_b]   "m" (rs_b),
+  [rs_c]   "m" (rs_c),
+  [cs_c]   "m" (cs_c),
+  // In Clang, even "m"-passed parameter takes 1 register.
+  // Have to disable prefetching to pass compilation.
+#ifndef __clang__
+  [a_next] "r" (a_next),
+  [b_next] "r" (b_next),
+#endif
+  [m_iter] "m" (m_iter),
+  [k_mker] "m" (k_mker),
+  [k_left] "m" (k_left),
+  [alpha]  "m" (alpha),
+  [beta]   "m" (beta)
+: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+  "x8", "x9", "x10","x11","x12","x13","x14",
+  "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+  "v8", "v9", "v10","v11","v12","v13","v14","v15",
+  "v16","v17","v18","v19","v20","v21","v22","v23",
+  "v24","v25","v26","v27","v28","v29","v30","v31"
+  );
+
+consider_edge_cases:
+  // Forward address.
+  a = a + m_iter * ps_a;
+  c = c + m_iter * 6 * rs_c;
+  auxinfo_t data_d6x4mn = *data;
+  bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn );
+  bli_dgemmsup_rv_armv8a_int_6x4mn
+  (
+    conja, conjb, m_left, 5, k0,
+      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
+  );
+
+}
+
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c
new file mode 100644
index 000000000..4273030dd
--- /dev/null
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c
@@ -0,0 +1,477 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2021, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+#include "blis.h"
+#include "assert.h"
+
+
+// Label locality & misc.
+#include "../armv8a_asm_utils.h"
+
+// Nanokernel operations.
+#include "../armv8a_asm_d2x2.h"
+
+/* Order of row-major DGEMM_6x6's execution in 2x2 blocks:
+ *
+ * +---+ +---+ +---+
+ * | 0 | | 1 | | 2 |
+ * +---+ +---+ +---+
+ * +---+ +---+ +---+
+ * | 3 | | 4 | | 5 |
+ * +---+ +---+ +---+
+ * +---+ +---+ +---+
+ * | 6 | | 7 | | 8 |
+ * +---+ +---+ +---+
+ * Block i is the i-th DGEMM_2X2_NANOKERNEL call below; next-operand loads are interleaved between blocks.
+ */
+#define DGEMM_6X6_MKER_LOOP_PLAIN(C00,C01,C02,C10,C11,C12,C20,C21,C22,C30,C31,C32,C40,C41,C42,C50,C51,C52,A0,A1,A2,B0,B1,B2,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
+  DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \
+  DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \
+  DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \
+  DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) \
+  DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \
+  DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \
+  DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \
+  DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) \
+  DGEMM_2X2_NANOKERNEL(C40,C50,B0,A2) \
+  DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \
+  DGEMM_2X2_NANOKERNEL(C41,C51,B1,A2) \
+  DGEMM_2X2_NANOKERNEL(C42,C52,B2,A2)
+
+// LOADNEXT is token-pasted onto DGEMM_LOAD1V_: `noload` expands to nothing, `load` emits one q-register ldr from ADDR+IMM.
+#define DGEMM_LOAD1V_noload(V1,ADDR,IMM)
+#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \
+" ldr  q"#V1", ["#ADDR", #"#IMM"] \n\t"
+// Strided variant (_G_): `load` fills both 64-bit lanes of V1 with post-indexed ld1, stepping ADDR by ST after each lane.
+#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST)
+#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \
+" ld1  {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" \
+" ld1  {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t"
+
+// Prefetch C in the long direction: prfm at CADDR, then advance CADDR by DLONGC.
+#define DPRFMC_FWD(CADDR,DLONGC) \
+" prfm PLDL1KEEP, ["#CADDR"]      \n\t" \
+" add  "#CADDR", "#CADDR", "#DLONGC" \n\t"
+
+// For row-storage of C: plain aliases of the column macros below, with RSC taking the place of CSC as the step.
+#define DLOADC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \
+  DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,RSC)
+#define DSTOREC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \
+  DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,RSC)
+
+// For column-storage of C: move C0,C1 at CADDR+CSHIFT and C2 at +32 (DLOAD*/DSTORE* come from outside this file), then step CADDR by CSC.
+#define DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \
+  DLOAD2V(C0,C1,CADDR,CSHIFT) \
+  DLOAD1V(C2,CADDR,CSHIFT+32) \
+" add  "#CADDR", "#CADDR", "#CSC" \n\t"
+#define DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \
+  DSTORE2V(C0,C1,CADDR,CSHIFT) \
+  DSTORE1V(C2,CADDR,CSHIFT+32) \
+" add  "#CADDR", "#CADDR", "#CSC" \n\t"
+// Six-vector scale / scale-accumulate helpers, composed from the 4V and 2V forms (defined outside this file).
+#define DSCALE6V(V0,V1,V2,V3,V4,V5,A,IDX) \
+  DSCALE4V(V0,V1,V2,V3,A,IDX) \
+  DSCALE2V(V4,V5,A,IDX)
+#define DSCALEA6V(D0,D1,D2,D3,D4,D5,S0,S1,S2,S3,S4,S5,A,IDX) \
+  DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \
+  DSCALEA2V(D4,D5,S4,S5,A,IDX)
+
+
+void bli_dgemmsup_rv_armv8a_asm_6x6m // Double-precision sup kernel for an m0 x 6 block of C, processed in 6-row panels.
+     (
+       conj_t              conja,   // Only forwarded to the edge-case kernel.
+       conj_t              conjb,   // Only forwarded to the edge-case kernel.
+       dim_t               m0,      // Rows of C: m0/6 panels run in the asm loop, m0%6 rows go to the edge-case kernel.
+       dim_t               n0,      // Must be 6 (asserted).
+       dim_t               k0,      // Inner dimension, split as 8*k_mker + k_left.
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0, // cs_b0 must be 1 (asserted).
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t*          data,    // Supplies ps_a and, outside Clang, the next-A/next-B prefetch addresses.
+       cntx_t*             cntx     // Only forwarded to the edge-case kernel.
+     )
+{
+  assert( n0 == 6 );
+
+  // LLVM has very bad routing ability for inline asm.
+  // Limit number of registers in case of Clang compilation.
+#ifndef __clang__
+  void*    a_next = bli_auxinfo_next_a( data );
+  void*    b_next = bli_auxinfo_next_b( data );
+#endif
+  uint64_t ps_a   = bli_auxinfo_ps_a( data );
+
+  // Typecast local copies of integers in case dim_t and inc_t are a
+  // different size than is expected by load instructions.
+  uint64_t k_mker = k0 / 8;
+  uint64_t k_left = k0 % 8;
+
+  int64_t  m_iter = m0 / 6;
+  int64_t  m_left = m0 % 6;
+
+  uint64_t rs_a   = rs_a0;
+  uint64_t cs_a   = cs_a0;
+  uint64_t rs_b   = rs_b0;
+  uint64_t rs_c   = rs_c0;
+  uint64_t cs_c   = cs_c0;
+  // uint64_t cs_b   = cs_b0;
+  assert( cs_b0 == 1 );
+
+  if ( m_iter == 0 ) goto consider_edge_cases;
+
+  __asm__ volatile
+  (
+" ldr             x10, %[a]                       \n\t"
+" ldr             x13, %[c]                       \n\t"
+" ldr             x12, %[m_iter]                  \n\t"
+" ldr             x11, %[ps_a]                    \n\t" // Panel-skip of A.
+" ldr             x9, %[rs_a]                     \n\t" // Row-skip of A.
+" ldr             x2, %[cs_a]                     \n\t" // Column-skip of A.
+" ldr             x3, %[rs_b]                     \n\t" // Row-skip of B.
+"                                                 \n\t"
+" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
+" ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
+"                                                 \n\t"
+"                                                 \n\t" // Multiply some address skips by sizeof(double).
+" lsl             x11, x11, #3                    \n\t" // ps_a
+" lsl             x9, x9, #3                      \n\t" // rs_a
+" lsl             x2, x2, #3                      \n\t" // cs_a
+" lsl             x3, x3, #3                      \n\t" // rs_b
+" lsl             x6, x6, #3                      \n\t" // rs_c
+" lsl             x7, x7, #3                      \n\t" // cs_c
+"                                                 \n\t"
+" mov             x1, x13                         \n\t" // C base for prefetching; x5 is not written until MILLIKER_MLOOP.
+" cmp             x7, #8                          \n\t" // Prefetch column-strided C.
+BEQ(C_PREFETCH_COLS)
+// This prefetch will not cover further mker parts. Skip.
+//
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+BRANCH(C_PREFETCH_END)
+LABEL(C_PREFETCH_COLS)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+LABEL(C_PREFETCH_END)
+//
+// Millikernel.
+LABEL(MILLIKER_MLOOP)
+"                                                 \n\t"
+" mov             x0, x10                         \n\t" // Parameters to be reloaded
+" mov             x5, x13                         \n\t" //  within each millikernel loop.
+" ldr             x1, %[b]                        \n\t"
+" ldr             x4, %[k_mker]                   \n\t"
+" ldr             x8, %[k_left]                   \n\t"
+"                                                 \n\t"
+// Storage scheme:
+//  V[ 0:17] <- C
+//  V[18:23] <- A
+//  V[24:31] <- B
+// Under this scheme, the following is defined:
+#define DGEMM_6X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
+  DGEMM_6X6_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,A0,A1,A2,B0,B1,B2,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT)
+// Load from memory.
+LABEL(LOAD_ABC)
+"                                                 \n\t" // No-microkernel early return is a must
+" cmp             x4, #0                          \n\t" //  to avoid out-of-boundary read.
+BEQ(CLEAR_CCOLS)
+"                                                 \n\t"
+" mov             x14, x0                         \n\t" // Load A.
+" ld1             {v18.d}[0], [x14], x9           \n\t"
+" ld1             {v18.d}[1], [x14], x9           \n\t"
+" ld1             {v19.d}[0], [x14], x9           \n\t"
+" ld1             {v19.d}[1], [x14], x9           \n\t"
+" ld1             {v20.d}[0], [x14], x9           \n\t"
+" ld1             {v20.d}[1], [x14], x9           \n\t"
+" add             x0, x0, x2                      \n\t"
+" mov             x14, x0                         \n\t"
+" ld1             {v21.d}[0], [x14], x9           \n\t"
+" ld1             {v21.d}[1], [x14], x9           \n\t"
+" ld1             {v22.d}[0], [x14], x9           \n\t"
+" ld1             {v22.d}[1], [x14], x9           \n\t"
+" ld1             {v23.d}[0], [x14], x9           \n\t"
+" ld1             {v23.d}[1], [x14], x9           \n\t"
+" add             x0, x0, x2                      \n\t"
+" mov             x14, x0                         \n\t"
+"                                                 \n\t"
+" ldr             q24, [x1, #16*0]                \n\t" // Load B.
+" ldr             q25, [x1, #16*1]                \n\t"
+" ldr             q26, [x1, #16*2]                \n\t"
+" add             x1, x1, x3                      \n\t"
+" ldr             q27, [x1, #16*0]                \n\t"
+" ldr             q28, [x1, #16*1]                \n\t"
+" ldr             q29, [x1, #16*2]                \n\t"
+" add             x1, x1, x3                      \n\t"
+" ldr             q30, [x1, #16*0]                \n\t" // Third B row: only its first two vectors fit in v30/v31;
+" ldr             q31, [x1, #16*1]                \n\t" //  the remaining one is loaded inside the microkernel.
+LABEL(CLEAR_CCOLS)
+CLEAR4V(0,1,2,3)
+CLEAR2V(4,5)
+CLEAR4V(6,7,8,9)
+CLEAR2V(10,11)
+CLEAR4V(12,13,14,15)
+CLEAR2V(16,17)
+// No-microkernel early return, once again.
+BEQ(K_LEFT_LOOP) // Reuses the flags of `cmp x4, #0` above; assumes CLEAR*V leave the flags untouched.
+//
+// Microkernel is defined here as:
+#define DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2) \
+  DGEMM_6X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,x14,x9,x1,16*2,load) \
+ "ld1             {v"#A2".d}[0], [x14], x9        \n\t" \
+ "ld1             {v"#A2".d}[1], [x14], x9        \n\t" \
+ "add             x0, x0, x2                      \n\t" \
+ "mov             x14, x0                         \n\t" \
+ "add             x1, x1, x3                      \n\t" \
+ "ldr             q"#B1", [x1, #16*0]             \n\t" \
+ "ldr             q"#B2", [x1, #16*1]             \n\t"
+// Start microkernel loop.
+LABEL(K_MKER_LOOP)
+DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,24,25,26)
+DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,27,28,29)
+DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,30,31,24)
+DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,25,26,27)
+DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,28,29,30)
+"                                                 \n\t" // Decrease counter before final replica.
+" subs            x4, x4, #1                      \n\t" // Branch early to avoid reading excess mem.
+BEQ(FIN_MKER_LOOP)
+DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,31,24,25)
+DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,26,27,28)
+DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,29,30,31)
+BRANCH(K_MKER_LOOP)
+//
+// Final microkernel loop.
+LABEL(FIN_MKER_LOOP)
+DGEMM_6X6_MKER_LOOP_PLAIN_LOC(21,22,23,31,24,25,x14,x9,x1,16*2,load)
+" ld1             {v23.d}[0], [x14], x9           \n\t"
+" ld1             {v23.d}[1], [x14], x9           \n\t"
+" add             x0, x0, x2                      \n\t"
+" mov             x14, x0                         \n\t"
+" add             x1, x1, x3                      \n\t"
+DGEMM_6X6_MKER_LOOP_PLAIN_LOC(18,19,20,26,27,28,xzr,-1,xzr,-1,noload)
+DGEMM_6X6_MKER_LOOP_PLAIN_LOC(21,22,23,29,30,31,xzr,-1,xzr,-1,noload)
+//
+// Loops left behind microkernels.
+LABEL(K_LEFT_LOOP)
+" cmp             x8, #0                          \n\t" // End of exec.
+BEQ(WRITE_MEM_PREP)
+" mov             x14, x0                         \n\t"
+" ld1             {v18.d}[0], [x14], x9           \n\t" // Load A col.
+" ld1             {v18.d}[1], [x14], x9           \n\t"
+" ld1             {v19.d}[0], [x14], x9           \n\t"
+" ld1             {v19.d}[1], [x14], x9           \n\t"
+" ld1             {v20.d}[0], [x14], x9           \n\t"
+" ld1             {v20.d}[1], [x14], x9           \n\t"
+" add             x0, x0, x2                      \n\t"
+" ldr             q24, [x1, #16*0]                \n\t" // Load B row.
+" ldr             q25, [x1, #16*1]                \n\t"
+" ldr             q26, [x1, #16*2]                \n\t"
+" add             x1, x1, x3                      \n\t"
+" sub             x8, x8, #1                      \n\t"
+DGEMM_6X6_MKER_LOOP_PLAIN_LOC(18,19,20,24,25,26,xzr,-1,xzr,-1,noload)
+BRANCH(K_LEFT_LOOP)
+//
+// Scale and write to memory.
+LABEL(WRITE_MEM_PREP)
+" ldr             x4, %[alpha]                    \n\t" // Load alpha & beta (address).
+" ldr             x8, %[beta]                     \n\t"
+" ld1r            {v30.2d}, [x4]                  \n\t" // Load alpha & beta.
+" ld1r            {v31.2d}, [x8]                  \n\t"
+" fmov            d26, #1.0                       \n\t"
+" fcmp            d30, d26                        \n\t" // Skip the alpha scaling when alpha == 1.0.
+BEQ(UNIT_ALPHA)
+DSCALE6V(0,1,2,3,4,5,30,0)
+DSCALE6V(6,7,8,9,10,11,30,0)
+DSCALE6V(12,13,14,15,16,17,30,0)
+LABEL(UNIT_ALPHA)
+"                                                 \n\t"
+" mov             x1, x5                          \n\t" // C address for loading.
+"                                                 \n\t" // C address for storing is x5 itself.
+" cmp             x7, #8                          \n\t" // cs_c == 1 (8 bytes): row-stored C falls through; otherwise column path.
+BNE(WRITE_MEM_C)
+//
+// C storage in rows.
+" fcmp            d31, #0.0                       \n\t" // beta == 0: C is not read at all.
+BEQ(ZERO_BETA_R_1_2)
+DLOADC_3V_R_FWD(18,19,20,x1,0,x6)
+DLOADC_3V_R_FWD(21,22,23,x1,0,x6)
+DSCALEA6V(0,1,2,3,4,5,18,19,20,21,22,23,31,0)
+LABEL(ZERO_BETA_R_1_2)
+DSTOREC_3V_R_FWD(0,1,2,x5,0,x6)
+DSTOREC_3V_R_FWD(3,4,5,x5,0,x6)
+BEQ(ZERO_BETA_R_3_4_5_6) // Still tests the fcmp above; assumes the load/scale/store macros leave the flags untouched.
+DLOADC_3V_R_FWD(18,19,20,x1,0,x6)
+DLOADC_3V_R_FWD(21,22,23,x1,0,x6)
+DLOADC_3V_R_FWD(0,1,2,x1,0,x6)
+DLOADC_3V_R_FWD(3,4,5,x1,0,x6)
+DSCALEA6V(6,7,8,9,10,11,18,19,20,21,22,23,31,0)
+DSCALEA6V(12,13,14,15,16,17,0,1,2,3,4,5,31,0)
+LABEL(ZERO_BETA_R_3_4_5_6)
+#ifndef __clang__
+" cmp   x12, #1                       \n\t"
+BRANCH(PRFM_END_R) // NOTE(review): BRANCH looks unconditional, so the prfm lines below never run; BNE may have been intended - confirm.
+" prfm  PLDL1KEEP, [%[a_next], #16*0] \n\t"
+" prfm  PLDL1KEEP, [%[a_next], #16*1] \n\t"
+" prfm  PLDL1STRM, [%[b_next], #16*0] \n\t"
+" prfm  PLDL1STRM, [%[b_next], #16*1] \n\t"
+LABEL(PRFM_END_R)
+#endif
+DSTOREC_3V_R_FWD(6,7,8,x5,0,x6)
+DSTOREC_3V_R_FWD(9,10,11,x5,0,x6)
+DSTOREC_3V_R_FWD(12,13,14,x5,0,x6)
+DSTOREC_3V_R_FWD(15,16,17,x5,0,x6)
+BRANCH(END_WRITE_MEM)
+//
+// C storage in columns.
+LABEL(WRITE_MEM_C)
+// In-register transpose,
+//  do transposition in row-order.
+" trn1            v18.2d, v0.2d, v3.2d            \n\t" // Row 0-1.
+" trn2            v19.2d, v0.2d, v3.2d            \n\t"
+" trn1            v20.2d, v1.2d, v4.2d            \n\t"
+" trn2            v21.2d, v1.2d, v4.2d            \n\t"
+" trn1            v22.2d, v2.2d, v5.2d            \n\t"
+" trn2            v23.2d, v2.2d, v5.2d            \n\t"
+"                                                 \n\t"
+" trn1            v24.2d, v6.2d, v9.2d            \n\t" // Row 2-3.
+" trn2            v25.2d, v6.2d, v9.2d            \n\t"
+" trn1            v26.2d, v7.2d, v10.2d           \n\t"
+" trn2            v27.2d, v7.2d, v10.2d           \n\t"
+" trn1            v28.2d, v8.2d, v11.2d           \n\t"
+" trn2            v29.2d, v8.2d, v11.2d           \n\t"
+"                                                 \n\t"
+" trn1            v0.2d, v12.2d, v15.2d           \n\t" // Row 4-5.
+" trn2            v1.2d, v12.2d, v15.2d           \n\t"
+" trn1            v2.2d, v13.2d, v16.2d           \n\t"
+" trn2            v3.2d, v13.2d, v16.2d           \n\t"
+" trn1            v4.2d, v14.2d, v17.2d           \n\t"
+" trn2            v5.2d, v14.2d, v17.2d           \n\t"
+"                                                 \n\t"
+" fcmp            d31, #0.0                       \n\t" // beta == 0: C is not read at all.
+BEQ(ZERO_BETA_C_1_2)
+DLOADC_3V_C_FWD(6,7,8,x1,0,x7)
+DLOADC_3V_C_FWD(9,10,11,x1,0,x7)
+DSCALEA6V(18,24,0,19,25,1,6,7,8,9,10,11,31,0)
+LABEL(ZERO_BETA_C_1_2)
+DSTOREC_3V_C_FWD(18,24,0,x5,0,x7)
+DSTOREC_3V_C_FWD(19,25,1,x5,0,x7)
+BEQ(ZERO_BETA_C_3_4_5_6) // Still tests the fcmp above; assumes the load/scale/store macros leave the flags untouched.
+DLOADC_3V_C_FWD(6,7,8,x1,0,x7)
+DLOADC_3V_C_FWD(9,10,11,x1,0,x7)
+DLOADC_3V_C_FWD(12,13,14,x1,0,x7)
+DLOADC_3V_C_FWD(15,16,17,x1,0,x7)
+DSCALEA6V(20,26,2,21,27,3,6,7,8,9,10,11,31,0)
+DSCALEA6V(22,28,4,23,29,5,12,13,14,15,16,17,31,0)
+LABEL(ZERO_BETA_C_3_4_5_6)
+#ifndef __clang__
+" cmp   x12, #1                       \n\t"
+BRANCH(PRFM_END_C) // NOTE(review): BRANCH looks unconditional, so the prfm lines below never run; BNE may have been intended - confirm.
+" prfm  PLDL1KEEP, [%[a_next], #16*0] \n\t"
+" prfm  PLDL1KEEP, [%[a_next], #16*1] \n\t"
+" prfm  PLDL1STRM, [%[b_next], #16*0] \n\t"
+" prfm  PLDL1STRM, [%[b_next], #16*1] \n\t"
+LABEL(PRFM_END_C)
+#endif
+DSTOREC_3V_C_FWD(20,26,2,x5,0,x7)
+DSTOREC_3V_C_FWD(21,27,3,x5,0,x7)
+DSTOREC_3V_C_FWD(22,28,4,x5,0,x7)
+DSTOREC_3V_C_FWD(23,29,5,x5,0,x7)
+//
+// End of this microkernel.
+LABEL(END_WRITE_MEM)
+"                                                 \n\t"
+" subs            x12, x12, #1                    \n\t"
+BEQ(END_EXEC)
+"                                                 \n\t"
+" mov             x8, #6                          \n\t"
+" madd            x13, x6, x8, x13                \n\t" // Forward C's base address to the next logic panel.
+" add             x10, x10, x11                   \n\t" // Forward A's base address to the next logic panel.
+BRANCH(MILLIKER_MLOOP)
+//
+// End of execution.
+LABEL(END_EXEC)
+:
+: [a]      "m" (a),
+  [b]      "m" (b),
+  [c]      "m" (c),
+  [rs_a]   "m" (rs_a),
+  [cs_a]   "m" (cs_a),
+  [ps_a]   "m" (ps_a),
+  [rs_b]   "m" (rs_b),
+  [rs_c]   "m" (rs_c),
+  [cs_c]   "m" (cs_c),
+  // In Clang, even "m"-passed parameter takes 1 register.
+  // Have to disable prefetching to pass compilation.
+#ifndef __clang__
+  [a_next] "r" (a_next),
+  [b_next] "r" (b_next),
+#endif
+  [m_iter] "m" (m_iter),
+  [k_mker] "m" (k_mker),
+  [k_left] "m" (k_left),
+  [alpha]  "m" (alpha),
+  [beta]   "m" (beta)
+: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+  "x8", "x9", "x10","x11","x12","x13","x14",
+  "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+  "v8", "v9", "v10","v11","v12","v13","v14","v15",
+  "v16","v17","v18","v19","v20","v21","v22","v23",
+  "v24","v25","v26","v27","v28","v29","v30","v31","cc","memory" // asm sets NZCV and accesses A/B/C through pointers.
+  );
+
+consider_edge_cases:
+  // Forward address.
+  a = a + m_iter * ps_a;      // Skip the panels of A already consumed by the asm loop.
+  c = c + m_iter * 6 * rs_c;  // Skip the 6-row blocks of C already written.
+  auxinfo_t data_d6x4mn = *data;
+  bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn );
+  bli_dgemmsup_rv_armv8a_int_6x4mn
+  (
+    conja, conjb, m_left, 6, k0,
+      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
+  );
+
+}
+
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c
new file mode 100644
index 000000000..afdd13e28
--- /dev/null
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c
@@ -0,0 +1,513 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2021, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+#include "blis.h"
+#include "assert.h"
+
+
+// Label locality & misc.
+#include "../armv8a_asm_utils.h"
+
+// Nanokernel operations.
+#include "../armv8a_asm_d2x2.h"
+
+/* Odd-NR dgemmsup_rv_*m kernels are special in that,
+ * despite the row-major name, C is laid out in COLUMNS in the register space.
+ *
+ * Block order (the order in which the macro below issues its updates; 9-11 are the single fmla's that use lane BIDX of B3):
+ *
+ * +---+ +---+ +---+
+ * | 0 | | 3 | | 6 | |9
+ * +---+ +---+ +---+ |
+ * +---+ +---+ +---+
+ * | 1 | | 4 | | 7 | |10
+ * +---+ +---+ +---+ |
+ * +---+ +---+ +---+
+ * | 2 | | 5 | | 8 | |11
+ * +---+ +---+ +---+ |
+ *
+ */
+#define DGEMM_C6X7_MKER_LOOP_PLAIN(C00,C01,C02,C03,C04,C05,C06,C10,C11,C12,C13,C14,C15,C16,C20,C21,C22,C23,C24,C25,C26,A0,A1,A2,B0,B1,B2,B3,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
+  DGEMM_2X2_NANOKERNEL(C00,C01,A0,B0) /* Blocks 0-2: A0/A1/A2 against B0. */ \
+  DGEMM_2X2_NANOKERNEL(C10,C11,A1,B0) \
+  DGEMM_2X2_NANOKERNEL(C20,C21,A2,B0) \
+  DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) /* B0 has no further use in this step: reload it if LOADNEXT is "load". */ \
+  DGEMM_2X2_NANOKERNEL(C02,C03,A0,B1) /* Blocks 3-5: A0/A1/A2 against B1. */ \
+  DGEMM_2X2_NANOKERNEL(C12,C13,A1,B1) \
+  DGEMM_2X2_NANOKERNEL(C22,C23,A2,B1) \
+  DGEMM_LOAD1V_ ##LOADNEXT (B1,BADDR,BSHIFT+16) /* Same for B1, 16 bytes further on. */ \
+  DGEMM_2X2_NANOKERNEL(C04,C05,A0,B2) /* Blocks 6-8: A0/A1/A2 against B2. */ \
+  DGEMM_2X2_NANOKERNEL(C14,C15,A1,B2) \
+  DGEMM_2X2_NANOKERNEL(C24,C25,A2,B2) \
+  DGEMM_LOAD1V_ ##LOADNEXT (B2,BADDR,BSHIFT+32) /* Same for B2, 32 bytes further on. */ \
+" fmla  v"#C06".2d, v"#A0".2d, v"#B3".d["#BIDX"] \n\t" /* Block 9: last use of A0 in this step. */ \
+  DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) /* Optionally gather the next two A elements into A0. */ \
+" fmla  v"#C16".2d, v"#A1".2d, v"#B3".d["#BIDX"] \n\t" /* Block 10: last use of A1 in this step. */ \
+  DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) /* A2 and B3 are NOT reloaded here; the caller must do that. */ \
+" fmla  v"#C26".2d, v"#A2".2d, v"#B3".d["#BIDX"] \n\t"
+
+// Interleaved-load selectors: callers paste "load" or "noload" onto the macro name; the noload variants expand to nothing.
+#define DGEMM_LOAD1V_noload(V1,ADDR,IMM) // No-op variant.
+#define DGEMM_LOAD1V_load(V1,ADDR,IMM) /* Load one full q register from ADDR + IMM. */ \
+" ldr  q"#V1", ["#ADDR", #"#IMM"] \n\t"
+
+// #define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM)
+// #define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \
+//   DGEMM_LOAD1V_load(V1,ADDR,IMM) \
+//   DGEMM_LOAD1V_load(V2,ADDR,IMM+16)
+
+#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST) // No-op variant.
+#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) /* Gather: fill lanes 0 and 1 of V1 from ADDR, post-incrementing ADDR by ST after each element. */ \
+" ld1  {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" \
+" ld1  {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t"
+
+// Prefetch C in the long direction: prefetch at CADDR (PLDL1KEEP), then advance CADDR by DLONGC.
+#define DPRFMC_FWD(CADDR,DLONGC) \
+" prfm PLDL1KEEP, ["#CADDR"]      \n\t" \
+" add  "#CADDR", "#CADDR", "#DLONGC" \n\t"
+
+// For column-storage of C: load/store vectors C0-C2 at CADDR+CSHIFT (2V + 1V helpers), then advance CADDR by CSC.
+#define DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \
+  DLOAD2V(C0,C1,CADDR,CSHIFT) \
+  DLOAD1V(C2,CADDR,CSHIFT+32) \
+" add  "#CADDR", "#CADDR", "#CSC" \n\t"
+#define DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \
+  DSTORE2V(C0,C1,CADDR,CSHIFT) \
+  DSTORE1V(C2,CADDR,CSHIFT+32) \
+" add  "#CADDR", "#CADDR", "#CSC" \n\t"
+
+// For row-storage of C: load/store 3+1/2 vectors; the half is lane CIDX of C3 at CADDR+CSHIFT+48 (address formed in CTMP), then CADDR advances by RSC.
+#define DLOADC_3PHV_R_FWD(C0,C1,C2,C3,CIDX,CADDR,CSHIFT,RSC,CTMP) \
+" add  "#CTMP", "#CADDR", "#CSHIFT"+48 \n\t" \
+  DLOAD2V(C0,C1,CADDR,CSHIFT) \
+  DLOAD1V(C2,CADDR,CSHIFT+32) \
+" ld1  {v"#C3".d}["#CIDX"], ["#CTMP"] \n\t" \
+" add  "#CADDR", "#CADDR", "#RSC" \n\t"
+#define DSTOREC_3PHV_R_FWD(C0,C1,C2,C3,CIDX,CADDR,CSHIFT,RSC,CTMP) \
+" add  "#CTMP", "#CADDR", "#CSHIFT"+48 \n\t" \
+  DSTORE2V(C0,C1,CADDR,CSHIFT) \
+  DSTORE1V(C2,CADDR,CSHIFT+32) \
+" st1  {v"#C3".d}["#CIDX"], ["#CTMP"] \n\t" \
+" add  "#CADDR", "#CADDR", "#RSC" \n\t"
+
+#define DSCALE7V(V0,V1,V2,V3,V4,V5,V6,A,IDX) /* DSCALE over 7 vectors, composed as 4 + 2 + 1 (DSCALE*V come from the included asm headers). */ \
+  DSCALE4V(V0,V1,V2,V3,A,IDX) \
+  DSCALE2V(V4,V5,A,IDX) \
+  DSCALE1V(V6,A,IDX)
+#define DSCALEA7V(D0,D1,D2,D3,D4,D5,D6,S0,S1,S2,S3,S4,S5,S6,A,IDX) /* DSCALEA over 7 dest/source pairs, composed as 4 + 2 + 1. */ \
+  DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \
+  DSCALEA2V(D4,D5,S4,S5,A,IDX) \
+  DSCALEA1V(D6,S6,A,IDX)
+#define DSCALEA3V(D0,D1,D2,S0,S1,S2,A,IDX) /* DSCALEA over 3 dest/source pairs, composed as 2 + 1. */ \
+  DSCALEA2V(D0,D1,S0,S1,A,IDX) \
+  DSCALEA1V(D2,S2,A,IDX)
+
+
+void bli_dgemmsup_rv_armv8a_asm_6x7m // dgemmsup kernel for n0 == 7: asm loop over m in 6-row panels; the m0 % 6 rows go to int_6x4mn.
+     (
+       conj_t              conja, // Not used by the asm body; only forwarded to the edge-case kernel.
+       conj_t              conjb, // Not used by the asm body; only forwarded to the edge-case kernel.
+       dim_t               m0,
+       dim_t               n0, // Must be 7 (asserted below).
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0, // cs_b0 must be 1 (asserted below).
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t*          data, // Supplies ps_a and (non-Clang builds) the a_next/b_next prefetch hints.
+       cntx_t*             cntx
+     )
+{
+  assert( n0 == 7 );
+
+  // LLVM has very bad routing ability for inline asm.
+  // Limit number of registers in case of Clang compilation.
+#ifndef __clang__
+  void*    a_next = bli_auxinfo_next_a( data );
+  void*    b_next = bli_auxinfo_next_b( data );
+#endif
+  uint64_t ps_a   = bli_auxinfo_ps_a( data );
+
+  // Typecast local copies of integers in case dim_t and inc_t are a
+  // different size than is expected by load instructions.
+  uint64_t k_mker = k0 / 7; // One pass of the microkernel loop consumes 7 k-iterations.
+  uint64_t k_left = k0 % 7; // Remaining k-iterations, handled one at a time.
+
+  int64_t  m_iter = m0 / 6; // Number of full 6-row panels done in asm.
+  int64_t  m_left = m0 % 6; // Remainder rows, delegated after the asm.
+
+  uint64_t rs_a   = rs_a0;
+  uint64_t cs_a   = cs_a0;
+  uint64_t rs_b   = rs_b0;
+  uint64_t rs_c   = rs_c0;
+  uint64_t cs_c   = cs_c0;
+  // uint64_t cs_b   = cs_b0;
+  assert( cs_b0 == 1 ); // B rows are read with contiguous ldr's.
+
+  if ( m_iter == 0 ) goto consider_edge_cases;
+
+  __asm__ volatile
+  (
+" ldr             x10, %[a]                       \n\t" // A base of the current panel.
+" ldr             x13, %[c]                       \n\t" // C base of the current panel.
+" ldr             x12, %[m_iter]                  \n\t" // Remaining m-iterations.
+" ldr             x11, %[ps_a]                    \n\t" // Panel-skip of A.
+" ldr             x9, %[rs_a]                     \n\t" // Row-skip of A.
+" ldr             x2, %[cs_a]                     \n\t" // Column-skip of A.
+" ldr             x3, %[rs_b]                     \n\t" // Row-skip of B.
+"                                                 \n\t"
+" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
+" ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
+"                                                 \n\t"
+"                                                 \n\t" // Multiply some address skips by sizeof(double).
+" lsl             x11, x11, #3                    \n\t" // ps_a
+" lsl             x9, x9, #3                      \n\t" // rs_a
+" lsl             x2, x2, #3                      \n\t" // cs_a
+" lsl             x3, x3, #3                      \n\t" // rs_b
+" lsl             x6, x6, #3                      \n\t" // rs_c
+" lsl             x7, x7, #3                      \n\t" // cs_c
+"                                                 \n\t"
+" mov             x1, x13                         \n\t" // C base for prefetching (x5 is not loaded until MILLIKER_MLOOP).
+" cmp             x7, #8                          \n\t" // Prefetch column-strided C.
+BNE(C_PREFETCH_COLS)
+// This prefetch will not cover further mker parts. Skip.
+//
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+BRANCH(C_PREFETCH_END)
+LABEL(C_PREFETCH_COLS)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+LABEL(C_PREFETCH_END)
+//
+// Millikernel.
+LABEL(MILLIKER_MLOOP)
+"                                                 \n\t"
+" mov             x0, x10                         \n\t" // Parameters to be reloaded
+" mov             x5, x13                         \n\t" //  within each millikernel loop.
+" ldr             x1, %[b]                        \n\t"
+" ldr             x4, %[k_mker]                   \n\t"
+" ldr             x8, %[k_left]                   \n\t"
+"                                                 \n\t"
+// Storage scheme:
+//  V[ 0:20] <- C
+//  V[21:27] <- A
+//  V[28:31] <- B
+// Under this scheme, the following is defined:
+#define DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
+  DGEMM_C6X7_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,A0,A1,A2,B0,B1,B2,B3,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT)
+// Load from memory.
+LABEL(LOAD_ABC)
+"                                                 \n\t" // No-microkernel early return is a must
+" cmp             x4, #0                          \n\t" //  to avoid out-of-boundary read.
+BEQ(CLEAR_CCOLS)
+"                                                 \n\t"
+" mov             x14, x0                         \n\t" // Load A.
+" ld1             {v21.d}[0], [x14], x9           \n\t"
+" ld1             {v21.d}[1], [x14], x9           \n\t"
+" ld1             {v22.d}[0], [x14], x9           \n\t"
+" ld1             {v22.d}[1], [x14], x9           \n\t"
+" ld1             {v23.d}[0], [x14], x9           \n\t"
+" ld1             {v23.d}[1], [x14], x9           \n\t"
+" add             x0, x0, x2                      \n\t" // Next column of A.
+" mov             x14, x0                         \n\t"
+" ld1             {v24.d}[0], [x14], x9           \n\t"
+" ld1             {v24.d}[1], [x14], x9           \n\t"
+" ld1             {v25.d}[0], [x14], x9           \n\t"
+" ld1             {v25.d}[1], [x14], x9           \n\t"
+" ld1             {v26.d}[0], [x14], x9           \n\t"
+" ld1             {v26.d}[1], [x14], x9           \n\t"
+" add             x0, x0, x2                      \n\t" // Third column: only its first two rows are preloaded.
+" mov             x14, x0                         \n\t"
+" ld1             {v27.d}[0], [x14], x9           \n\t"
+" ld1             {v27.d}[1], [x14], x9           \n\t"
+"                                                 \n\t"
+" ldr             q28, [x1, #16*0]                \n\t" // Load B.
+" ldr             q29, [x1, #16*1]                \n\t"
+" ldr             q30, [x1, #16*2]                \n\t"
+" ldr             d31, [x1, #16*3]                \n\t" // Scalar loads into idx 0.
+" add             x1, x1, x3                      \n\t"
+"                                                 \n\t"
+LABEL(CLEAR_CCOLS)
+CLEAR4V(0,1,2,3)
+CLEAR2V(4,5)
+CLEAR1V(6)
+CLEAR4V(7,8,9,10)
+CLEAR2V(11,12)
+CLEAR1V(13)
+CLEAR4V(14,15,16,17)
+CLEAR2V(18,19)
+CLEAR1V(20)
+// No-microkernel early return, once again.
+BEQ(K_LEFT_LOOP)
+//
+// Microkernel is defined here as:
+#define DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,B3,BIDX) \
+  DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,BIDX,x14,x9,x1,0,load) \
+ "add             x0, x0, x2                      \n\t" \
+ "mov             x14, x0                         \n\t" \
+ "ld1             {v"#A2".d}[0], [x14], x9        \n\t" \
+ "ld1             {v"#A2".d}[1], [x14], x9        \n\t" \
+ /* Due to this loading, BIDX can only be 0 here. */ \
+ "ldr             d"#B3", [x1, #16*3]             \n\t" \
+ "add             x1, x1, x3                      \n\t"
+// Start microkernel loop.
+LABEL(K_MKER_LOOP)
+DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,28,29,30,31,0)
+DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,28,29,30,31,0)
+DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(27,21,22,28,29,30,31,0)
+DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(23,24,25,28,29,30,31,0)
+"                                                 \n\t" // Decrease counter before final replica.
+" subs            x4, x4, #1                      \n\t" // Branch early to avoid reading excess mem.
+BEQ(FIN_MKER_LOOP)
+DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(26,27,21,28,29,30,31,0)
+DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(22,23,24,28,29,30,31,0)
+DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(25,26,27,28,29,30,31,0)
+BRANCH(K_MKER_LOOP)
+//
+// Final microkernel loop.
+LABEL(FIN_MKER_LOOP)
+DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(26,27,21,28,29,30,31,0,x14,x9,x1,0,load)
+" add             x0, x0, x2                      \n\t"
+" mov             x14, x0                         \n\t"
+" ldr             d31, [x1, #16*3]                \n\t"
+" add             x1, x1, x3                      \n\t"
+DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(22,23,24,28,29,30,31,0,xzr,-1,xzr,-1,noload)
+" ldr             q28, [x1, #16*0]                \n\t"
+" ldr             q29, [x1, #16*1]                \n\t"
+" ldr             q30, [x1, #16*2]                \n\t"
+" ldr             d31, [x1, #16*3]                \n\t"
+" add             x1, x1, x3                      \n\t"
+DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(25,26,27,28,29,30,31,0,xzr,-1,xzr,-1,noload)
+//
+// Loops left behind microkernels.
+LABEL(K_LEFT_LOOP)
+" cmp             x8, #0                          \n\t" // End of exec.
+BEQ(WRITE_MEM_PREP)
+" mov             x14, x0                         \n\t" // Load A col.
+" ld1             {v21.d}[0], [x14], x9           \n\t"
+" ld1             {v21.d}[1], [x14], x9           \n\t"
+" ld1             {v22.d}[0], [x14], x9           \n\t"
+" ld1             {v22.d}[1], [x14], x9           \n\t"
+" ld1             {v23.d}[0], [x14], x9           \n\t"
+" ld1             {v23.d}[1], [x14], x9           \n\t"
+" add             x0, x0, x2                      \n\t"
+" ldr             q28, [x1, #16*0]                \n\t" // Load B row.
+" ldr             q29, [x1, #16*1]                \n\t"
+" ldr             q30, [x1, #16*2]                \n\t"
+" ldr             d31, [x1, #16*3]                \n\t"
+" add             x1, x1, x3                      \n\t"
+" sub             x8, x8, #1                      \n\t"
+DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(21,22,23,28,29,30,31,0,xzr,-1,xzr,-1,noload)
+BRANCH(K_LEFT_LOOP)
+//
+// Scale and write to memory.
+LABEL(WRITE_MEM_PREP)
+" ldr             x4, %[alpha]                    \n\t" // Load alpha & beta (address).
+" ldr             x8, %[beta]                     \n\t"
+" ld1r            {v30.2d}, [x4]                  \n\t" // Load alpha & beta.
+" ld1r            {v31.2d}, [x8]                  \n\t"
+" fmov            d26, #1.0                       \n\t"
+" fcmp            d30, d26                        \n\t" // Skip the scaling when alpha == 1.
+BEQ(UNIT_ALPHA)
+DSCALE7V(0,1,2,3,4,5,6,30,0)
+DSCALE7V(7,8,9,10,11,12,13,30,0)
+DSCALE7V(14,15,16,17,18,19,20,30,0)
+LABEL(UNIT_ALPHA)
+"                                                 \n\t"
+" mov             x1, x5                          \n\t" // C address for loading.
+"                                                 \n\t" // C address for storing is x5 itself.
+" cmp             x7, #8                          \n\t" // Check for column-storage.
+BNE(WRITE_MEM_C)
+// Unlike other RV kernels, here row-storage of C requires
+//   in-register transpose.
+" trn1            v21.2d, v0.2d, v1.2d            \n\t"
+" trn2            v22.2d, v0.2d, v1.2d            \n\t"
+" trn1            v23.2d, v2.2d, v3.2d            \n\t"
+" trn2            v24.2d, v2.2d, v3.2d            \n\t"
+" trn1            v25.2d, v4.2d, v5.2d            \n\t"
+" trn2            v26.2d, v4.2d, v5.2d            \n\t"
+"                                                 \n\t"
+" trn1            v0.2d, v7.2d, v8.2d             \n\t"
+" trn2            v1.2d, v7.2d, v8.2d             \n\t"
+" trn1            v2.2d, v9.2d, v10.2d            \n\t"
+" trn2            v3.2d, v9.2d, v10.2d            \n\t"
+" trn1            v4.2d, v11.2d, v12.2d           \n\t"
+" trn2            v5.2d, v11.2d, v12.2d           \n\t"
+"                                                 \n\t"
+" trn1            v7.2d, v14.2d, v15.2d           \n\t"
+" trn2            v8.2d, v14.2d, v15.2d           \n\t"
+" trn1            v9.2d, v16.2d, v17.2d           \n\t"
+" trn2            v10.2d, v16.2d, v17.2d          \n\t"
+" trn1            v11.2d, v18.2d, v19.2d          \n\t"
+" trn2            v12.2d, v18.2d, v19.2d          \n\t"
+"                                                 \n\t"
+" fcmp            d31, #0.0                       \n\t" // beta == 0: C is not read at all.
+BEQ(ZERO_BETA_R_1_2)
+DLOADC_3PHV_R_FWD(14,15,16,30,0,x1,0,x6,x8)
+DLOADC_3PHV_R_FWD(17,18,19,30,1,x1,0,x6,x8)
+DSCALEA7V(21,23,25,22,24,26,6,14,15,16,17,18,19,30,31,0)
+LABEL(ZERO_BETA_R_1_2)
+DSTOREC_3PHV_R_FWD(21,23,25,6,0,x5,0,x6,x8)
+DSTOREC_3PHV_R_FWD(22,24,26,6,1,x5,0,x6,x8)
+BEQ(ZERO_BETA_R_3_4_5_6) // Relies on the flags of the fcmp against beta above still being live.
+DLOADC_3PHV_R_FWD(14,15,16,30,0,x1,0,x6,x8)
+DLOADC_3PHV_R_FWD(17,18,19,30,1,x1,0,x6,x8)
+DLOADC_3PHV_R_FWD(21,22,23,28,0,x1,0,x6,x8)
+DLOADC_3PHV_R_FWD(24,25,26,28,1,x1,0,x6,x8)
+DSCALEA7V(0,2,4,1,3,5,13,14,15,16,17,18,19,30,31,0)
+DSCALEA7V(7,9,11,8,10,12,20,21,22,23,24,25,26,28,31,0)
+LABEL(ZERO_BETA_R_3_4_5_6)
+#ifndef __clang__
+" cmp   x12, #1                       \n\t" // Last m-iteration? (x12 is decremented at END_WRITE_MEM.)
+BNE(PRFM_END_R) // Prefetch the next A/B only on the last iteration; was an unconditional branch that made the prfm's unreachable.
+" prfm  PLDL1KEEP, [%[a_next], #16*0] \n\t"
+" prfm  PLDL1KEEP, [%[a_next], #16*1] \n\t"
+" prfm  PLDL1STRM, [%[b_next], #16*0] \n\t"
+" prfm  PLDL1STRM, [%[b_next], #16*1] \n\t"
+LABEL(PRFM_END_R)
+#endif
+DSTOREC_3PHV_R_FWD(0,2,4,13,0,x5,0,x6,x8)
+DSTOREC_3PHV_R_FWD(1,3,5,13,1,x5,0,x6,x8)
+DSTOREC_3PHV_R_FWD(7,9,11,20,0,x5,0,x6,x8)
+DSTOREC_3PHV_R_FWD(8,10,12,20,1,x5,0,x6,x8)
+BRANCH(END_WRITE_MEM)
+//
+// C storage in columns.
+LABEL(WRITE_MEM_C)
+" fcmp            d31, #0.0                       \n\t" // beta == 0: C is not read at all.
+BEQ(ZERO_BETA_C_1_2)
+DLOADC_3V_C_FWD(21,22,23,x1,0,x7)
+DLOADC_3V_C_FWD(24,25,26,x1,0,x7)
+DSCALEA3V(0,7,14,21,22,23,31,0)
+DSCALEA3V(1,8,15,24,25,26,31,0)
+LABEL(ZERO_BETA_C_1_2)
+DSTOREC_3V_C_FWD(0,7,14,x5,0,x7)
+DSTOREC_3V_C_FWD(1,8,15,x5,0,x7)
+BEQ(ZERO_BETA_C_3_4_5_6_7) // Relies on the flags of the fcmp against beta above still being live.
+DLOADC_3V_C_FWD(21,22,23,x1,0,x7)
+DLOADC_3V_C_FWD(24,25,26,x1,0,x7)
+DLOADC_3V_C_FWD(27,28,29,x1,0,x7)
+DLOADC_3V_C_FWD(0,7,14,x1,0,x7)
+DLOADC_3V_C_FWD(1,8,15,x1,0,x7)
+DSCALEA3V(2,9,16,21,22,23,31,0)
+DSCALEA3V(3,10,17,24,25,26,31,0)
+DSCALEA3V(4,11,18,27,28,29,31,0)
+DSCALEA3V(5,12,19,0,7,14,31,0)
+DSCALEA3V(6,13,20,1,8,15,31,0)
+LABEL(ZERO_BETA_C_3_4_5_6_7)
+#ifndef __clang__
+" cmp   x12, #1                       \n\t" // Last m-iteration? (x12 is decremented at END_WRITE_MEM.)
+BNE(PRFM_END_C) // Prefetch the next A/B only on the last iteration; was an unconditional branch that made the prfm's unreachable.
+" prfm  PLDL1KEEP, [%[a_next], #16*0] \n\t"
+" prfm  PLDL1KEEP, [%[a_next], #16*1] \n\t"
+" prfm  PLDL1STRM, [%[b_next], #16*0] \n\t"
+" prfm  PLDL1STRM, [%[b_next], #16*1] \n\t"
+LABEL(PRFM_END_C)
+#endif
+DSTOREC_3V_C_FWD(2,9,16,x5,0,x7)
+DSTOREC_3V_C_FWD(3,10,17,x5,0,x7)
+DSTOREC_3V_C_FWD(4,11,18,x5,0,x7)
+DSTOREC_3V_C_FWD(5,12,19,x5,0,x7)
+DSTOREC_3V_C_FWD(6,13,20,x5,0,x7)
+//
+// End of this microkernel.
+LABEL(END_WRITE_MEM)
+"                                                 \n\t"
+" subs            x12, x12, #1                    \n\t"
+BEQ(END_EXEC)
+"                                                 \n\t"
+" mov             x8, #6                          \n\t"
+" madd            x13, x6, x8, x13                \n\t" // Forward C's base address to the next logic panel.
+" add             x10, x10, x11                   \n\t" // Forward A's base address to the next logic panel.
+BRANCH(MILLIKER_MLOOP)
+//
+// End of execution.
+LABEL(END_EXEC)
+:
+: [a]      "m" (a),
+  [b]      "m" (b),
+  [c]      "m" (c),
+  [rs_a]   "m" (rs_a),
+  [cs_a]   "m" (cs_a),
+  [ps_a]   "m" (ps_a),
+  [rs_b]   "m" (rs_b),
+  [rs_c]   "m" (rs_c),
+  [cs_c]   "m" (cs_c),
+  // In Clang, even "m"-passed parameter takes 1 register.
+  // Have to disable prefetching to pass compilation.
+#ifndef __clang__
+  [a_next] "r" (a_next),
+  [b_next] "r" (b_next),
+#endif
+  [m_iter] "m" (m_iter),
+  [k_mker] "m" (k_mker),
+  [k_left] "m" (k_left),
+  [alpha]  "m" (alpha),
+  [beta]   "m" (beta)
+: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+  "x8", "x9", "x10","x11","x12","x13","x14",
+  "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+  "v8", "v9", "v10","v11","v12","v13","v14","v15",
+  "v16","v17","v18","v19","v20","v21","v22","v23",
+  "v24","v25","v26","v27","v28","v29","v30","v31"
+  );
+
+consider_edge_cases:
+  // Forward address past the m_iter panels already handled above.
+  a = a + m_iter * ps_a;
+  c = c + m_iter * 6 * rs_c;
+  auxinfo_t data_d6x4mn = *data; // Work on a copy so the caller's auxinfo is left untouched.
+  bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn );
+  bli_dgemmsup_rv_armv8a_int_6x4mn
+  (
+    conja, conjb, m_left, 7, k0,
+      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
+  );
+
+}
+
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c
index 8ff5ec173..b912480fa 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c
@@ -37,7 +37,6 @@
 #include "blis.h"
 #include "assert.h"
 
-GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
 
 // Label locality & misc.
 #include "../armv8a_asm_utils.h"
@@ -146,47 +145,70 @@ void bli_dgemmsup_rv_armv8a_asm_6x8m
 {
   if ( n0 != 8 )
   {
-    if ( n0 < 8 )
+    assert( n0 <= 13 );
+
+    // Manual separation.
+    dgemmsup_ker_ft ker_fp1 = NULL;
+    dgemmsup_ker_ft ker_fp2 = NULL;
+    dim_t           nr1, nr2;
+
+    if ( n0 == 13 )
     {
-      for ( ; n0 >= 4; n0 -= 4 )
-      {
-	dgemmsup_ker_ft ukr_fp;
-	auxinfo_t data_d8xkm = *data;
-	if ( bli_auxinfo_ps_a( data ) == 6 * rs_a0 )
-	{
-	  // Use 8x4 Asm kernel for the unpacked case.
-	  bli_auxinfo_set_ps_a( 8 * rs_a0, &data_d8xkm );
-	  ukr_fp = bli_dgemmsup_rv_armv8a_asm_8x4m;
-	}
-	else
-	{
-	  // Cannot change dimension for m when A is packed.
-	  ukr_fp = bli_dgemmsup_rv_armv8a_int_6x4mn;
-	}
-
-	ukr_fp
-	(
-	  conja, conjb, m0, 4, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, &data_d8xkm, cntx
-	);
-	b += 4 * cs_b0;
-	c += 4 * cs_c0;
-      }
-      if ( n0 > 0 )
-      {
-	bli_dgemmsup_rv_armv8a_int_6x4mn
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-      }
+      ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x7m; nr1 = 7;
+      ker_fp2 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr2 = 6;
     }
-    else
+    if ( n0 == 12 )
     {
-      assert( FALSE );
+      ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr1 = 6;
+      ker_fp2 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr2 = 6;
     }
+    if ( n0 == 11 )
+    {
+      ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr1 = 6;
+      ker_fp2 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr2 = 5;
+    }
+    if ( n0 == 10 )
+    {
+      ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr1 = 5;
+      ker_fp2 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr2 = 5;
+    }
+    if ( n0 == 9 )
+    {
+      ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr1 = 5;
+      ker_fp2 = bli_dgemmsup_rv_armv8a_int_6x4mn; nr2 = 4;
+    }
+    if ( n0 == 7 )
+    {
+      ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x7m; nr1 = 7;
+    }
+    if ( n0 == 6 )
+    {
+      ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr1 = 6;
+    }
+    if ( n0 == 5 )
+    {
+      ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr1 = 5;
+    }
+    if ( n0 <= 4 )
+    {
+      ker_fp1 = bli_dgemmsup_rv_armv8a_int_6x4mn; nr1 = n0;
+    }
+
+    ker_fp1
+    (
+      conja, conjb, m0, nr1, k0,
+      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, c, rs_c0, cs_c0, data, cntx
+    );
+    b += nr1 * cs_b0;
+    c += nr1 * cs_c0;
+    if ( ker_fp2 )
+      ker_fp2
+      (
+	conja, conjb, m0, nr2, k0,
+	alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+	beta, c, rs_c0, cs_c0, data, cntx
+      );
     return;
   }
 
@@ -534,7 +556,6 @@ LABEL(END_EXEC)
   // Forward address.
   a = a + m_iter * ps_a;
   c = c + m_iter * 6 * rs_c;
-#if 1
   auxinfo_t data_d6x4mn = *data;
   bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn );
   bli_dgemmsup_rv_armv8a_int_6x4mn
@@ -543,33 +564,6 @@ LABEL(END_EXEC)
       alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
       beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
   );
-#else
-  if ( m_left >= 4 )
-  {
-    // Calls 4x8m with only 1 outermost loop.
-    // As only 1 outermost loop is called,
-    //  ps_a needs not being set here.
-    //
-    bli_dgemmsup_rv_armv8a_asm_4x8m
-    (
-      conja, conjb, 4, 8, k0,
-      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-      beta, c, rs_c0, cs_c0, data, cntx
-    );
-    m_left -= 4;
-    a = a + 4 * rs_a;
-    c = c + 4 * rs_c;
-  }
-  if ( m_left )
-  {
-    bli_dgemmsup_r_armv8a_ref2
-    (
-      conja, conjb, m_left, 8, k0,
-      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-      beta, c, rs_c0, cs_c0, data, cntx
-    );
-  }
-#endif
 
 }
 
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c
index 9bdf4b3b8..910e07dbb 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c
@@ -37,7 +37,6 @@
 #include "blis.h"
 #include "assert.h"
 
-GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
 
 // Label locality & misc.
 #include "../armv8a_asm_utils.h"
@@ -146,33 +145,56 @@ void bli_dgemmsup_rv_armv8a_asm_6x8n
 {
   if ( m0 != 6 )
   {
-    // 5 = 4 + 1;
-    // 4;
-    //
-    while ( m0 >= 4 )
+    assert( m0 <= 9 );
+
+    // Manual separation.
+    dgemmsup_ker_ft ker_fp1 = NULL;
+    dgemmsup_ker_ft ker_fp2 = NULL;
+    dim_t           mr1, mr2;
+    
+    if ( m0 == 9 )
     {
-      bli_dgemmsup_rv_armv8a_asm_4x8n
-      (
-        conja, conjb, 4, n0, k0,
-	alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	beta, c, rs_c0, cs_c0, data, cntx
-      );
-      m0 -= 4;
-      a += 4 * rs_a0;
-      c += 4 * rs_c0;
+      ker_fp1 = bli_dgemmsup_rv_armv8a_asm_5x8n; mr1 = 5;
+      ker_fp2 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr2 = 4;
     }
-
-    // 3, 2, 1;
-    //
-    if ( m0 > 0 )
+    if ( m0 == 8 )
+    {
+      ker_fp1 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr1 = 4;
+      ker_fp2 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr2 = 4;
+    }
+    if ( m0 == 7 )
+    {
+      ker_fp1 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr1 = 4;
+      ker_fp2 = bli_dgemmsup_rv_armv8a_int_3x8mn; mr2 = 3;
+    }
+    if ( m0 == 5 )
+    {
+      ker_fp1 = bli_dgemmsup_rv_armv8a_asm_5x8n; mr1 = 5;
+    }
+    if ( m0 == 4 )
     {
-      bli_dgemmsup_rv_armv8a_int_3x8mn
+      ker_fp1 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr1 = 4;
+    }
+    if ( m0 < 4 )
+    {
+      ker_fp1 = bli_dgemmsup_rv_armv8a_int_3x8mn; mr1 = m0;
+    }
+
+    ker_fp1
+    (
+      conja, conjb, mr1, n0, k0,
+      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, c, rs_c0, cs_c0, data, cntx
+    );
+    a += mr1 * rs_a0;
+    c += mr1 * rs_c0;
+    if ( ker_fp2 )
+      ker_fp2
       (
-	conja, conjb, m0, n0, k0,
+	conja, conjb, mr2, n0, k0,
 	alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
 	beta, c, rs_c0, cs_c0, data, cntx
       );
-    }
     return;
   }
 
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c
index 4d374df98..d3af5781c 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c
@@ -36,7 +36,6 @@
 #include "blis.h"
 #include "assert.h"
 
-GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )
 
 // Label locality & misc.
 #include "../armv8a_asm_utils.h"
diff --git a/kernels/armv8a/bli_kernels_armv8a.h b/kernels/armv8a/bli_kernels_armv8a.h
index b7ab75541..64a3f2fb5 100644
--- a/kernels/armv8a/bli_kernels_armv8a.h
+++ b/kernels/armv8a/bli_kernels_armv8a.h
@@ -39,6 +39,8 @@ PACKM_KER_PROT( double,   d, packm_armv8a_int_8xk )
 
 GEMM_UKR_PROT( float,    s, gemm_armv8a_asm_8x12 )
 GEMM_UKR_PROT( double,   d, gemm_armv8a_asm_6x8 )
+GEMM_UKR_PROT( float,    s, gemm_armv8a_asm_12x8r )
+GEMM_UKR_PROT( double,   d, gemm_armv8a_asm_8x6r )
 // GEMM_UKR_PROT( double,   d, gemm_armv8a_asm_6x8r )
 // GEMM_UKR_PROT( double,   d, gemm_armv8a_asm_8x4 )
 // GEMM_UKR_PROT( double,   d, gemm_armv8a_asm_4x4 )
@@ -47,6 +49,10 @@ GEMMSUP_KER_PROT( double,   d, gemmsup_rd_armv8a_asm_6x8n )
 GEMMSUP_KER_PROT( double,   d, gemmsup_rd_armv8a_asm_6x8m )
 GEMMSUP_KER_PROT( double,   d, gemmsup_rv_armv8a_asm_6x8n )
 GEMMSUP_KER_PROT( double,   d, gemmsup_rv_armv8a_asm_6x8m )
+GEMMSUP_KER_PROT( double,   d, gemmsup_rv_armv8a_asm_6x7m )
+GEMMSUP_KER_PROT( double,   d, gemmsup_rv_armv8a_asm_6x6m )
+GEMMSUP_KER_PROT( double,   d, gemmsup_rv_armv8a_asm_6x5m )
+GEMMSUP_KER_PROT( double,   d, gemmsup_rv_armv8a_asm_5x8n )
 GEMMSUP_KER_PROT( double,   d, gemmsup_rv_armv8a_asm_4x8n )
 GEMMSUP_KER_PROT( double,   d, gemmsup_rv_armv8a_asm_4x8m )
 GEMMSUP_KER_PROT( double,   d, gemmsup_rv_armv8a_asm_8x4m )

From a87eae2b11408b556e562f1b04e673c6cd1612bc Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 6 Sep 2022 18:04:09 -0500
Subject: [PATCH 076/230] Added '-q' quiet mode option to testsuite. (#657)

Details:
- Added support for a '-q' command line option to the testsuite. This
  option suppresses most informational output that would normally
  clutter up the screen. By default, verbose mode (the previous
  status quo) will be operative, and so quiet mode must be requested.
---
 testsuite/src/test_libblis.c | 58 ++++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 15 deletions(-)

diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index eaa0a9cef..442fae0e0 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -42,6 +42,8 @@ char libblis_test_binary_name[ MAX_BINARY_NAME_LENGTH + 1 ];
 char libblis_test_parameters_filename[ MAX_FILENAME_LENGTH + 1 ];
 char libblis_test_operations_filename[ MAX_FILENAME_LENGTH + 1 ];
 
+bool libblis_test_quiet_mode = FALSE;
+
 char libblis_test_pass_string[ MAX_PASS_STRING_LENGTH + 1 ];
 char libblis_test_warn_string[ MAX_PASS_STRING_LENGTH + 1 ];
 char libblis_test_fail_string[ MAX_PASS_STRING_LENGTH + 1 ];
@@ -720,6 +722,9 @@ void libblis_test_read_op_info( test_ops_t*  ops,
 
 void libblis_test_output_section_overrides( FILE* os, test_ops_t* ops )
 {
+	// Skip informational output if BLIS is running in quiet mode.
+	if ( libblis_test_quiet_mode ) return;
+
 	libblis_test_fprintf_c( os, "\n" );
 	libblis_test_fprintf_c( os, "--- Section overrides ---\n" );
 	libblis_test_fprintf_c( os, "\n" );
@@ -746,6 +751,17 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	cntx_t* cntx_c;
 	cntx_t* cntx_z;
 
+#ifndef BLIS_ENABLE_GEMM_MD
+	// Notify the user if mixed domain or mixed precision was requested.
+	if ( params->mixed_domain || params->mixed_precision )
+	{
+		libblis_test_printf_error( "mixed domain and/or mixed precision testing requested, but building against BLIS without mixed datatype support.\n" );
+	}
+#endif
+
+	// Skip informational output if BLIS is running in quiet mode.
+	if ( libblis_test_quiet_mode ) return;
+
 	// If bli_info_get_int_type_size() returns 32 or 64, the size is forced.
 	// Otherwise, the size is chosen automatically. We query the result of
 	// that automatic choice via sizeof(gint_t).
@@ -1241,14 +1257,6 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	libblis_test_fprintf_c( os, "\n" );
 	libblis_test_fprintf( os, "\n" );
 
-#ifndef BLIS_ENABLE_GEMM_MD
-	// Notify the user if mixed domain or mixed precision was requested.
-	if ( params->mixed_domain || params->mixed_precision )
-	{
-		libblis_test_printf_error( "mixed domain and/or mixed precision testing requested, but building against BLIS without mixed datatype support.\n" );
-	}
-#endif
-
 	// If mixed domain or mixed precision was requested, we disable all
 	// induced methods except 1m and native execution.
 	if ( params->mixed_domain || params->mixed_precision )
@@ -1267,6 +1275,12 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 
 void libblis_test_output_op_struct( FILE* os, test_op_t* op, char* op_str )
 {
+	// Skip informational output if BLIS is running in quiet mode.
+	if ( libblis_test_quiet_mode ) return;
+
+	libblis_test_fprintf_c( os, "--- %s ---\n", op_str );
+	libblis_test_fprintf_c( os, "\n" );
+
 	dimset_t dimset = op->dimset;
 
 	if      ( dimset == BLIS_TEST_DIMS_MNK )
@@ -2086,8 +2100,6 @@ void libblis_test_op_driver
 	if ( tdata->id == 0 )
 	{
 		// Output a heading and the contents of the op struct.
-		libblis_test_fprintf_c( stdout, "--- %s ---\n", op_str );
-		libblis_test_fprintf_c( stdout, "\n" );
 		libblis_test_output_op_struct( stdout, op, op_str );
 
 		// Also output to a matlab file if requested (and successfully opened).
@@ -2099,8 +2111,6 @@ void libblis_test_op_driver
 			// stdout (at the end of libblis_test_read_parameter_file()).
 			libblis_test_output_params_struct( output_stream, params );
 
-			libblis_test_fprintf_c( output_stream, "--- %s ---\n", op_str );
-			libblis_test_fprintf_c( output_stream, "\n" );
 			libblis_test_output_op_struct( output_stream, op, op_str );
 		}
 	}
@@ -3082,7 +3092,7 @@ void libblis_test_parse_command_line( int argc, char** argv )
 	bli_getopt_init_state( 0, &state );
 
 	// Process all option arguments until we get a -1, which means we're done.
-	while( (opt = bli_getopt( argc, ( const char** )argv, "g:o:", &state )) != -1 )
+	while( (opt = bli_getopt( argc, ( const char** )argv, "g:o:q", &state )) != -1 )
 	{
 		// Explicitly typecast opt, which is an int, to a char. (Failing to
 		// typecast resulted in at least one user-reported problem whereby
@@ -3092,19 +3102,21 @@ void libblis_test_parse_command_line( int argc, char** argv )
 		switch( opt_ch )
 		{
 			case 'g':
-			libblis_test_printf_infoc( "detected -g option; using \"%s\" for parameters filename.\n", state.optarg );
 			strncpy( libblis_test_parameters_filename,
 			         state.optarg, MAX_FILENAME_LENGTH );
 			gave_option_g = TRUE;
 			break;
 
 			case 'o':
-			libblis_test_printf_infoc( "detected -o option; using \"%s\" for operations filename.\n", state.optarg );
 			strncpy( libblis_test_operations_filename,
 			         state.optarg, MAX_FILENAME_LENGTH );
 			gave_option_o = TRUE;
 			break;
 
+			case 'q':
+			libblis_test_quiet_mode = TRUE;
+			break;
+
 			case '?':
 			libblis_test_printf_error( "unexpected option '%c' given or missing option argument\n", state.optopt );
 			break;
@@ -3116,21 +3128,37 @@ void libblis_test_parse_command_line( int argc, char** argv )
 
 	if ( gave_option_g == FALSE )
 	{
+		// Skip informational output if BLIS is running in quiet mode.
+		if ( !libblis_test_quiet_mode )
 		libblis_test_printf_infoc( "no -g option given; defaulting to \"%s\" for parameters filename.\n", PARAMETERS_FILENAME );
 
 		// Copy default parameters filename into its global string.
 		strncpy( libblis_test_parameters_filename,
 		         PARAMETERS_FILENAME, MAX_FILENAME_LENGTH );
 	}
+	else
+	{
+		// Skip informational output if BLIS is running in quiet mode.
+		if ( !libblis_test_quiet_mode )
+		libblis_test_printf_infoc( "detected -g option; using \"%s\" for parameters filename.\n", state.optarg );
+	}
 
 	if ( gave_option_o == FALSE )
 	{
+		// Skip informational output if BLIS is running in quiet mode.
+		if ( !libblis_test_quiet_mode )
 		libblis_test_printf_infoc( "no -o option given; defaulting to \"%s\" for operations filename.\n", OPERATIONS_FILENAME );
 
 		// Copy default operations filename into its global string.
 		strncpy( libblis_test_operations_filename,
 		         OPERATIONS_FILENAME, MAX_FILENAME_LENGTH );
 	}
+	else
+	{
+		// Skip informational output if BLIS is running in quiet mode.
+		if ( !libblis_test_quiet_mode )
+		libblis_test_printf_infoc( "detected -o option; using \"%s\" for operations filename.\n", state.optarg );
+	}
 
 	// If there are still arguments remaining after getopt() processing is
 	// complete, print an error.

From 4afe0cfdab0e069e027f97920ea604249e34df47 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 8 Sep 2022 18:33:20 -0500
Subject: [PATCH 077/230] Defined invscalv, invscalm, invscald operations.
 (#661)

Details:
- Defined the invert-scale (invscal) operation, which computes
  x := ( 1.0 / conj?(alpha) ) * x, on vectors (level-1v), matrices
  (level-1m), and diagonals (level-1d).
- Added test modules for invscalv and invscalm to the testsuite.
- Updated BLISObjectAPI.md and BLISTypedAPI.md API documentation to
  reflect the new operations. Also updated KernelsHowTo.md accordingly.
- Renamed 'beta' to 'alpha' in scalv and scalm testsuite modules (and
  input.operations files) so that the parameter name matches the
  parameter used in the documentation.
---
 docs/BLISObjectAPI.md            |  55 +++++-
 docs/BLISTypedAPI.md             |  60 +++++-
 docs/KernelsHowTo.md             |  28 ++-
 frame/1/bli_l1v_check.c          |   1 +
 frame/1/bli_l1v_check.h          |   1 +
 frame/1/bli_l1v_fpa.c            |   1 +
 frame/1/bli_l1v_fpa.h            |   1 +
 frame/1/bli_l1v_ft.h             |   3 +-
 frame/1/bli_l1v_ft_ker.h         |   3 +-
 frame/1/bli_l1v_ker.h            |   6 +
 frame/1/bli_l1v_ker_prot.h       |  12 ++
 frame/1/bli_l1v_oapi.c           |   1 +
 frame/1/bli_l1v_oapi.h           |   1 +
 frame/1/bli_l1v_tapi.c           |   1 +
 frame/1/bli_l1v_tapi.h           |   1 +
 frame/1d/bli_l1d_check.c         |   1 +
 frame/1d/bli_l1d_check.h         |   1 +
 frame/1d/bli_l1d_fpa.c           |   1 +
 frame/1d/bli_l1d_fpa.h           |   1 +
 frame/1d/bli_l1d_ft.h            |   3 +-
 frame/1d/bli_l1d_oapi.c          |   1 +
 frame/1d/bli_l1d_oapi.h          |   1 +
 frame/1d/bli_l1d_tapi.c          |   1 +
 frame/1d/bli_l1d_tapi.h          |   1 +
 frame/1m/bli_l1m_check.c         |   1 +
 frame/1m/bli_l1m_check.h         |   1 +
 frame/1m/bli_l1m_fpa.c           |   1 +
 frame/1m/bli_l1m_fpa.h           |   1 +
 frame/1m/bli_l1m_ft.h            |   3 +-
 frame/1m/bli_l1m_oapi.c          |   1 +
 frame/1m/bli_l1m_oapi.h          |   1 +
 frame/1m/bli_l1m_tapi.c          |   1 +
 frame/1m/bli_l1m_tapi.h          |   1 +
 frame/1m/bli_l1m_unb_var1.c      |   1 +
 frame/1m/bli_l1m_unb_var1.h      |   1 +
 frame/include/bli_type_defs.h    |   1 +
 ref_kernels/1/bli_invscalv_ref.c |  81 +++++++++
 ref_kernels/bli_cntx_ref.c       |  31 ++--
 testsuite/input.operations       |  12 +-
 testsuite/input.operations.fast  |  12 +-
 testsuite/input.operations.mixed |  12 +-
 testsuite/input.operations.salt  |  12 +-
 testsuite/src/test_invscalm.c    | 301 +++++++++++++++++++++++++++++++
 testsuite/src/test_invscalm.h    |  42 +++++
 testsuite/src/test_invscalv.c    | 297 ++++++++++++++++++++++++++++++
 testsuite/src/test_invscalv.h    |  42 +++++
 testsuite/src/test_libblis.c     |  14 +-
 testsuite/src/test_libblis.h     |   4 +
 testsuite/src/test_scalm.c       |  50 ++---
 testsuite/src/test_scalv.c       |  48 ++---
 50 files changed, 1070 insertions(+), 88 deletions(-)
 create mode 100644 ref_kernels/1/bli_invscalv_ref.c
 create mode 100644 testsuite/src/test_invscalm.c
 create mode 100644 testsuite/src/test_invscalm.h
 create mode 100644 testsuite/src/test_invscalv.c
 create mode 100644 testsuite/src/test_invscalv.h

diff --git a/docs/BLISObjectAPI.md b/docs/BLISObjectAPI.md
index 5e8ed3d8f..51f5753a0 100644
--- a/docs/BLISObjectAPI.md
+++ b/docs/BLISObjectAPI.md
@@ -41,11 +41,11 @@
 This index provides a quick way to jump directly to the description for each operation discussed later in the [Computational function reference](BLISObjectAPI.md#computational-function-reference) section:
 
   * **[Level-1v](BLISObjectAPI.md#level-1v-operations)**: Operations on vectors:
-    * [addv](BLISObjectAPI.md#addv), [amaxv](BLISObjectAPI.md#amaxv), [axpyv](BLISObjectAPI.md#axpyv), [axpbyv](BLISObjectAPI.md#axpbyv), [copyv](BLISObjectAPI.md#copyv), [dotv](BLISObjectAPI.md#dotv), [dotxv](BLISObjectAPI.md#dotxv), [invertv](BLISObjectAPI.md#invertv), [scal2v](BLISObjectAPI.md#scal2v), [scalv](BLISObjectAPI.md#scalv), [setv](BLISObjectAPI.md#setv), [setrv](BLISObjectAPI.md#setrv), [setiv](BLISObjectAPI.md#setiv), [subv](BLISObjectAPI.md#subv), [swapv](BLISObjectAPI.md#swapv), [xpbyv](BLISObjectAPI.md#xpbyv)
+    * [addv](BLISObjectAPI.md#addv), [amaxv](BLISObjectAPI.md#amaxv), [axpyv](BLISObjectAPI.md#axpyv), [axpbyv](BLISObjectAPI.md#axpbyv), [copyv](BLISObjectAPI.md#copyv), [dotv](BLISObjectAPI.md#dotv), [dotxv](BLISObjectAPI.md#dotxv), [invertv](BLISObjectAPI.md#invertv), [invscalv](BLISObjectAPI.md#invscalv), [scalv](BLISObjectAPI.md#scalv), [scal2v](BLISObjectAPI.md#scal2v), [setv](BLISObjectAPI.md#setv), [setrv](BLISObjectAPI.md#setrv), [setiv](BLISObjectAPI.md#setiv), [subv](BLISObjectAPI.md#subv), [swapv](BLISObjectAPI.md#swapv), [xpbyv](BLISObjectAPI.md#xpbyv)
   * **[Level-1d](BLISObjectAPI.md#level-1d-operations)**: Element-wise operations on matrix diagonals:
-    * [addd](BLISObjectAPI.md#addd), [axpyd](BLISObjectAPI.md#axpyd), [copyd](BLISObjectAPI.md#copyd), [invertd](BLISObjectAPI.md#invertd), [scald](BLISObjectAPI.md#scald), [scal2d](BLISObjectAPI.md#scal2d), [setd](BLISObjectAPI.md#setd), [setid](BLISObjectAPI.md#setid), [shiftd](BLISObjectAPI.md#shiftd), [subd](BLISObjectAPI.md#subd), [xpbyd](BLISObjectAPI.md#xpbyd)
+    * [addd](BLISObjectAPI.md#addd), [axpyd](BLISObjectAPI.md#axpyd), [copyd](BLISObjectAPI.md#copyd), [invertd](BLISObjectAPI.md#invertd), [invscald](BLISObjectAPI.md#invscald), [scald](BLISObjectAPI.md#scald), [scal2d](BLISObjectAPI.md#scal2d), [setd](BLISObjectAPI.md#setd), [setid](BLISObjectAPI.md#setid), [shiftd](BLISObjectAPI.md#shiftd), [subd](BLISObjectAPI.md#subd), [xpbyd](BLISObjectAPI.md#xpbyd)
   * **[Level-1m](BLISObjectAPI.md#level-1m-operations)**: Element-wise operations on matrices:
-    * [addm](BLISObjectAPI.md#addm), [axpym](BLISObjectAPI.md#axpym), [copym](BLISObjectAPI.md#copym), [scalm](BLISObjectAPI.md#scalm), [scal2m](BLISObjectAPI.md#scal2m), [setm](BLISObjectAPI.md#setm), [setrm](BLISObjectAPI.md#setrm), [setim](BLISObjectAPI.md#setim), [subm](BLISObjectAPI.md#subm)
+    * [addm](BLISObjectAPI.md#addm), [axpym](BLISObjectAPI.md#axpym), [copym](BLISObjectAPI.md#copym), [invscalm](BLISObjectAPI.md#invscalm), [scalm](BLISObjectAPI.md#scalm), [scal2m](BLISObjectAPI.md#scal2m), [setm](BLISObjectAPI.md#setm), [setrm](BLISObjectAPI.md#setrm), [setim](BLISObjectAPI.md#setim), [subm](BLISObjectAPI.md#subm)
   * **[Level-1f](BLISObjectAPI.md#level-1f-operations)**: Fused operations on multiple vectors:
     * [axpy2v](BLISObjectAPI.md#axpy2v), [dotaxpyv](BLISObjectAPI.md#dotaxpyv), [axpyf](BLISObjectAPI.md#axpyf), [dotxf](BLISObjectAPI.md#dotxf), [dotxaxpyf](BLISObjectAPI.md#dotxaxpyf)
   * **[Level-2](BLISObjectAPI.md#level-2-operations)**: Operations with one matrix and (at least) one vector operand:
@@ -845,6 +845,24 @@ Invert all elements of an _n_-length vector `x`.
 
 ---
 
+#### invscalv
+```c
+void bli_invscalv
+     (
+       obj_t*  alpha,
+       obj_t*  x
+     );
+```
+Perform
+```
+  x := ( 1.0 / conj?(alpha) ) * x
+```
+where `x` is a vector of length _n_, and `alpha` is a scalar.
+
+Observed object properties: `conj?(alpha)`.
+
+---
+
 #### scalv
 ```c
 void bli_scalv
@@ -1049,6 +1067,19 @@ Observed object properties: `diagoff(A)`.
 
 ---
 
+#### invscald
+```c
+void bli_invscald
+     (
+       obj_t*  alpha,
+       obj_t*  a
+     );
+```
+
+Observed object properties: `conj?(alpha)`, `diagoff(A)`.
+
+---
+
 #### scald
 ```c
 void bli_scald
@@ -1213,6 +1244,24 @@ Observed object properties: `diagoff(A)`, `diag(A)`, `uplo(A)`, `trans?(A)`.
 
 ---
 
+#### invscalm
+```c
+void bli_invscalm
+     (
+       obj_t*  alpha,
+       obj_t*  a
+     );
+```
+Perform
+```
+  A := ( 1.0 / conj?(alpha) ) * A
+```
+where `A` is an _m x n_ matrix stored as a dense matrix, or lower- or upper-triangular/trapezoidal matrix with arbitrary diagonal offset. If `uplo(A)` indicates lower or upper storage, only that part of matrix `A` will be updated.
+
+Observed object properties: `conj?(alpha)`, `diagoff(A)`, `uplo(A)`.
+
+---
+
 #### scalm
 ```c
 void bli_scalm
diff --git a/docs/BLISTypedAPI.md b/docs/BLISTypedAPI.md
index 76d7ef8f6..497776a15 100644
--- a/docs/BLISTypedAPI.md
+++ b/docs/BLISTypedAPI.md
@@ -36,11 +36,11 @@
 This index provides a quick way to jump directly to the description for each operation discussed later in the [Computational function reference](BLISTypedAPI.md#computational-function-reference) section:
 
   * **[Level-1v](BLISTypedAPI.md#level-1v-operations)**: Operations on vectors:
-    * [addv](BLISTypedAPI.md#addv), [amaxv](BLISTypedAPI.md#amaxv), [axpyv](BLISTypedAPI.md#axpyv), [axpbyv](BLISTypedAPI.md#axpbyv), [copyv](BLISTypedAPI.md#copyv), [dotv](BLISTypedAPI.md#dotv), [dotxv](BLISTypedAPI.md#dotxv), [invertv](BLISTypedAPI.md#invertv), [scal2v](BLISTypedAPI.md#scal2v), [scalv](BLISTypedAPI.md#scalv), [setv](BLISTypedAPI.md#setv), [subv](BLISTypedAPI.md#subv), [swapv](BLISTypedAPI.md#swapv), [xpbyv](BLISTypedAPI.md#xpbyv)
+    * [addv](BLISTypedAPI.md#addv), [amaxv](BLISTypedAPI.md#amaxv), [axpyv](BLISTypedAPI.md#axpyv), [axpbyv](BLISTypedAPI.md#axpbyv), [copyv](BLISTypedAPI.md#copyv), [dotv](BLISTypedAPI.md#dotv), [dotxv](BLISTypedAPI.md#dotxv), [invertv](BLISTypedAPI.md#invertv), [invscalv](BLISTypedAPI.md#invscalv), [scalv](BLISTypedAPI.md#scalv), [scal2v](BLISTypedAPI.md#scal2v), [setv](BLISTypedAPI.md#setv), [subv](BLISTypedAPI.md#subv), [swapv](BLISTypedAPI.md#swapv), [xpbyv](BLISTypedAPI.md#xpbyv)
   * **[Level-1d](BLISTypedAPI.md#level-1d-operations)**: Element-wise operations on matrix diagonals:
-    * [addd](BLISTypedAPI.md#addd), [axpyd](BLISTypedAPI.md#axpyd), [copyd](BLISTypedAPI.md#copyd), [invertd](BLISTypedAPI.md#invertd), [scald](BLISTypedAPI.md#scald), [scal2d](BLISTypedAPI.md#scal2d), [setd](BLISTypedAPI.md#setd), [setid](BLISTypedAPI.md#setid), [shiftd](BLISTypedAPI.md#shiftd), [subd](BLISTypedAPI.md#subd), [xpbyd](BLISTypedAPI.md#xpbyd)
+    * [addd](BLISTypedAPI.md#addd), [axpyd](BLISTypedAPI.md#axpyd), [copyd](BLISTypedAPI.md#copyd), [invertd](BLISTypedAPI.md#invertd), [invscald](BLISTypedAPI.md#invscald), [scald](BLISTypedAPI.md#scald), [scal2d](BLISTypedAPI.md#scal2d), [setd](BLISTypedAPI.md#setd), [setid](BLISTypedAPI.md#setid), [shiftd](BLISTypedAPI.md#shiftd), [subd](BLISTypedAPI.md#subd), [xpbyd](BLISTypedAPI.md#xpbyd)
   * **[Level-1m](BLISTypedAPI.md#level-1m-operations)**: Element-wise operations on matrices:
-    * [addm](BLISTypedAPI.md#addm), [axpym](BLISTypedAPI.md#axpym), [copym](BLISTypedAPI.md#copym), [scalm](BLISTypedAPI.md#scalm), [scal2m](BLISTypedAPI.md#scal2m), [setm](BLISTypedAPI.md#setm), [subm](BLISTypedAPI.md#subm)
+    * [addm](BLISTypedAPI.md#addm), [axpym](BLISTypedAPI.md#axpym), [copym](BLISTypedAPI.md#copym), [invscalm](BLISTypedAPI.md#invscalm), [scalm](BLISTypedAPI.md#scalm), [scal2m](BLISTypedAPI.md#scal2m), [setm](BLISTypedAPI.md#setm), [subm](BLISTypedAPI.md#subm)
   * **[Level-1f](BLISTypedAPI.md#level-1f-operations)**: Fused operations on multiple vectors:
     * [axpy2v](BLISTypedAPI.md#axpy2v), [dotaxpyv](BLISTypedAPI.md#dotaxpyv), [axpyf](BLISTypedAPI.md#axpyf), [dotxf](BLISTypedAPI.md#dotxf), [dotxaxpyf](BLISTypedAPI.md#dotxaxpyf)
   * **[Level-2](BLISTypedAPI.md#level-2-operations)**: Operations with one matrix and (at least) one vector operand:
@@ -369,6 +369,24 @@ Invert all elements of an _n_-length vector `x`.
 
 ---
 
+#### invscalv
+```c
+void bli_?invscalv
+     (
+       conj_t  conjalpha,
+       dim_t   n,
+       ctype*  alpha,
+       ctype*  x, inc_t incx
+     );
+```
+Perform
+```
+  x := ( 1.0 / conjalpha(alpha) ) * x
+```
+where `x` is a vector of length _n_, and `alpha` is a scalar.
+
+---
+
 #### scalv
 ```c
 void bli_?scalv
@@ -548,6 +566,21 @@ void bli_?invertd
 
 ---
 
+#### invscald
+```c
+void bli_?invscald
+     (
+       conj_t  conjalpha,
+       doff_t  diagoffa,
+       dim_t   m,
+       dim_t   n,
+       ctype*  alpha,
+       ctype*  a, inc_t rsa, inc_t csa
+     );
+```
+
+---
+
 #### scald
 ```c
 void bli_?scald
@@ -737,6 +770,27 @@ where `B` is an _m x n_ matrix, `A` is stored as a dense matrix, or lower- or up
 
 ---
 
+#### invscalm
+```c
+void bli_?invscalm
+     (
+       conj_t  conjalpha,
+       doff_t  diagoffa,
+       uplo_t  uploa,
+       dim_t   m,
+       dim_t   n,
+       ctype*  alpha,
+       ctype*  a, inc_t rsa, inc_t csa
+     );
+```
+Perform
+```
+  A := ( 1.0 / conjalpha(alpha) ) * A
+```
+where `A` is an _m x n_ matrix stored as a dense matrix, or lower- or upper-triangular/trapezoidal matrix, as specified by `uploa`, with the diagonal offset of `A` specified by `diagoffa`. If `uploa` indicates lower or upper storage, only that part of matrix `A` will be updated.
+
+---
+
 #### scalm
 ```c
 void bli_?scalm
diff --git a/docs/KernelsHowTo.md b/docs/KernelsHowTo.md
index 6e84db8e7..30a4dc736 100644
--- a/docs/KernelsHowTo.md
+++ b/docs/KernelsHowTo.md
@@ -22,11 +22,11 @@ One of the primary features of BLIS is that it provides a large set of dense lin
 
 Presently, BLIS supports several groups of operations:
   * **[Level-1v](BLISTypedAPI.md#level-1v-operations)**: Operations on vectors:
-    * [addv](BLISTypedAPI.md#addv), [amaxv](BLISTypedAPI.md#amaxv), [axpyv](BLISTypedAPI.md#axpyv), [copyv](BLISTypedAPI.md#copyv), [dotv](BLISTypedAPI.md#dotv), [dotxv](BLISTypedAPI.md#dotxv), [invertv](BLISTypedAPI.md#invertv), [scal2v](BLISTypedAPI.md#scal2v), [scalv](BLISTypedAPI.md#scalv), [setv](BLISTypedAPI.md#setv), [subv](BLISTypedAPI.md#subv), [swapv](BLISTypedAPI.md#swapv)
+    * [addv](BLISTypedAPI.md#addv), [amaxv](BLISTypedAPI.md#amaxv), [axpyv](BLISTypedAPI.md#axpyv), [copyv](BLISTypedAPI.md#copyv), [dotv](BLISTypedAPI.md#dotv), [dotxv](BLISTypedAPI.md#dotxv), [invertv](BLISTypedAPI.md#invertv), [invscalv](BLISTypedAPI.md#invscalv), [scalv](BLISTypedAPI.md#scalv), [scal2v](BLISTypedAPI.md#scal2v), [setv](BLISTypedAPI.md#setv), [subv](BLISTypedAPI.md#subv), [swapv](BLISTypedAPI.md#swapv)
   * **[Level-1d](BLISTypedAPI.md#level-1d-operations)**: Element-wise operations on matrix diagonals:
-    * [addd](BLISTypedAPI.md#addd), [axpyd](BLISTypedAPI.md#axpyd), [copyd](BLISTypedAPI.md#copyd), [invertd](BLISTypedAPI.md#invertd), [scald](BLISTypedAPI.md#scald), [scal2d](BLISTypedAPI.md#scal2d), [setd](BLISTypedAPI.md#setd), [setid](BLISTypedAPI.md#setid), [subd](BLISTypedAPI.md#subd)
+    * [addd](BLISTypedAPI.md#addd), [axpyd](BLISTypedAPI.md#axpyd), [copyd](BLISTypedAPI.md#copyd), [invertd](BLISTypedAPI.md#invertd), [invscald](BLISTypedAPI.md#invscald), [scald](BLISTypedAPI.md#scald), [scal2d](BLISTypedAPI.md#scal2d), [setd](BLISTypedAPI.md#setd), [setid](BLISTypedAPI.md#setid), [subd](BLISTypedAPI.md#subd)
   * **[Level-1m](BLISTypedAPI.md#level-1m-operations)**: Element-wise operations on matrices:
-    * [addm](BLISTypedAPI.md#addm), [axpym](BLISTypedAPI.md#axpym), [copym](BLISTypedAPI.md#copym), [scalm](BLISTypedAPI.md#scalm), [scal2m](BLISTypedAPI.md#scal2m), [setm](BLISTypedAPI.md#setm), [subm](BLISTypedAPI.md#subm)
+    * [addm](BLISTypedAPI.md#addm), [axpym](BLISTypedAPI.md#axpym), [copym](BLISTypedAPI.md#copym), [invscalm](BLISTypedAPI.md#invscalm), [scalm](BLISTypedAPI.md#scalm), [scal2m](BLISTypedAPI.md#scal2m), [setm](BLISTypedAPI.md#setm), [subm](BLISTypedAPI.md#subm)
   * **[Level-1f](BLISTypedAPI.md#level-1f-operations)**: Fused operations on multiple vectors:
     * [axpy2v](BLISTypedAPI.md#axpy2v), [dotaxpyv](BLISTypedAPI.md#dotaxpyv), [axpyf](BLISTypedAPI.md#axpyf), [dotxf](BLISTypedAPI.md#dotxf), [dotxaxpyf](BLISTypedAPI.md#dotxaxpyf)
   * **[Level-2](BLISTypedAPI.md#level-2-operations)**: Operations with one matrix and (at least) one vector operand:
@@ -81,6 +81,7 @@ BLIS supports the following 14 level-1v kernels. These kernels are used primaril
   * **dotv**: Performs a [dot product](BLISTypedAPI.md#dotv) where the output scalar is overwritten.
   * **dotxv**: Performs an [extended dot product](BLISTypedAPI.md#dotxv) operation where the dot product is first scaled and then accumulated into a scaled output scalar.
   * **invertv**: Performs an [element-wise vector inversion](BLISTypedAPI.md#invertv) operation.
+  * **invscalv**: Performs an [in-place (destructive) vector inverse-scaling](BLISTypedAPI.md#invscalv) operation.
   * **scalv**: Performs an [in-place (destructive) vector scaling](BLISTypedAPI.md#scalv) operation.
   * **scal2v**: Performs an [out-of-place (non-destructive) vector scaling](BLISTypedAPI.md#scal2v) operation.
   * **setv**: Performs a [vector broadcast](BLISTypedAPI.md#setv) operation.
@@ -184,6 +185,7 @@ datatype characters.
 | copyv            | `BLIS_COPYV_KER`      | `?copyv_ft`           |
 | dotxv            | `BLIS_DOTXV_KER`      | `?dotxv_ft`           |
 | invertv          | `BLIS_INVERTV_KER`    | `?invertv_ft`         |
+| invscalv         | `BLIS_INVSCALV_KER`   | `?invscalv_ft`        |
 | scalv            | `BLIS_SCALV_KER`      | `?scalv_ft`           |
 | scal2v           | `BLIS_SCAL2V_KER`     | `?scal2v_ft`          |
 | setv             | `BLIS_SETV_KER`       | `?setv_ft`            |
@@ -220,6 +222,7 @@ This section seeks to provide developers with a complete reference for each of t
     * [dotv](KernelsHowTo.md#dotv-kernel)
     * [dotxv](KernelsHowTo.md#dotxv-kernel)
     * [invertv](KernelsHowTo.md#invertv-kernel)
+    * [invscalv](KernelsHowTo.md#invscalv-kernel)
     * [scalv](KernelsHowTo.md#scalv-kernel)
     * [scal2v](KernelsHowTo.md#scal2v-kernel)
     * [setv](KernelsHowTo.md#setv-kernel)
@@ -929,6 +932,25 @@ This kernel inverts all elements of an _n_-length vector `x`.
 
 ---
 
+#### invscalv kernel
+```c
+void bli_?invscalv_<suffix>
+     (
+       conj_t           conjalpha,
+       dim_t            n,
+       ctype*  restrict alpha,
+       ctype*  restrict x, inc_t incx,
+       cntx_t* restrict cntx
+     )
+```
+This kernel performs the following operation:
+```
+  x := ( 1.0 / conjalpha(alpha) ) * x
+```
+where `x` is a vector of length _n_ stored with stride `incx` and `alpha` is a scalar.
+
+---
+
 #### scalv kernel
 ```c
 void bli_?scalv_<suffix>
diff --git a/frame/1/bli_l1v_check.c b/frame/1/bli_l1v_check.c
index 8ab470bf4..f2c4622d5 100644
--- a/frame/1/bli_l1v_check.c
+++ b/frame/1/bli_l1v_check.c
@@ -165,6 +165,7 @@ void PASTEMAC(opname,_check) \
 	bli_l1v_ax_check( alpha, x ); \
 }
 
+GENFRONT( invscalv )
 GENFRONT( scalv )
 GENFRONT( setv )
 
diff --git a/frame/1/bli_l1v_check.h b/frame/1/bli_l1v_check.h
index 110b25d55..cfd6d9e6e 100644
--- a/frame/1/bli_l1v_check.h
+++ b/frame/1/bli_l1v_check.h
@@ -140,6 +140,7 @@ void PASTEMAC(opname,_check) \
        const obj_t* x  \
      );
 
+GENTPROT( invscalv )
 GENTPROT( scalv )
 GENTPROT( setv )
 
diff --git a/frame/1/bli_l1v_fpa.c b/frame/1/bli_l1v_fpa.c
index 311f0b2b9..a88aba93d 100644
--- a/frame/1/bli_l1v_fpa.c
+++ b/frame/1/bli_l1v_fpa.c
@@ -60,6 +60,7 @@ GENFRONT( scal2v )
 GENFRONT( dotv )
 GENFRONT( dotxv )
 GENFRONT( invertv )
+GENFRONT( invscalv )
 GENFRONT( scalv )
 GENFRONT( setv )
 GENFRONT( swapv )
diff --git a/frame/1/bli_l1v_fpa.h b/frame/1/bli_l1v_fpa.h
index c05a4ff7b..52d477d30 100644
--- a/frame/1/bli_l1v_fpa.h
+++ b/frame/1/bli_l1v_fpa.h
@@ -52,6 +52,7 @@ GENPROT( scal2v )
 GENPROT( dotv )
 GENPROT( dotxv )
 GENPROT( invertv )
+GENPROT( invscalv )
 GENPROT( scalv )
 GENPROT( setv )
 GENPROT( swapv )
diff --git a/frame/1/bli_l1v_ft.h b/frame/1/bli_l1v_ft.h
index 57f9d223a..244b926ca 100644
--- a/frame/1/bli_l1v_ft.h
+++ b/frame/1/bli_l1v_ft.h
@@ -158,7 +158,7 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
 
 INSERT_GENTDEF( invertv )
 
-// scalv, setv
+// invscalv, scalv, setv
 
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
@@ -172,6 +172,7 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
+INSERT_GENTDEF( invscalv )
 INSERT_GENTDEF( scalv )
 INSERT_GENTDEF( setv )
 
diff --git a/frame/1/bli_l1v_ft_ker.h b/frame/1/bli_l1v_ft_ker.h
index fd3f14c1c..ade2c98eb 100644
--- a/frame/1/bli_l1v_ft_ker.h
+++ b/frame/1/bli_l1v_ft_ker.h
@@ -161,7 +161,7 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
 
 INSERT_GENTDEF( invertv )
 
-// scalv, setv
+// invscalv, scalv, setv
 
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
@@ -175,6 +175,7 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
        cntx_t*         cntx  \
      );
 
+INSERT_GENTDEF( invscalv )
 INSERT_GENTDEF( scalv )
 INSERT_GENTDEF( setv )
 
diff --git a/frame/1/bli_l1v_ker.h b/frame/1/bli_l1v_ker.h
index e91813a07..4ebbffa82 100644
--- a/frame/1/bli_l1v_ker.h
+++ b/frame/1/bli_l1v_ker.h
@@ -90,6 +90,12 @@ INSERT_GENTPROT_BASIC0( dotxv_ker_name )
 INSERT_GENTPROT_BASIC0( invertv_ker_name )
 
 
+#undef  GENTPROT
+#define GENTPROT INVSCALV_KER_PROT
+
+INSERT_GENTPROT_BASIC0( invscalv_ker_name )
+
+
 #undef  GENTPROT
 #define GENTPROT SCALV_KER_PROT
 
diff --git a/frame/1/bli_l1v_ker_prot.h b/frame/1/bli_l1v_ker_prot.h
index b912ba7e0..965626392 100644
--- a/frame/1/bli_l1v_ker_prot.h
+++ b/frame/1/bli_l1v_ker_prot.h
@@ -139,6 +139,18 @@ void PASTEMAC(ch,opname) \
      ); \
 
 
+#define INVSCALV_KER_PROT( ctype, ch, opname ) \
+\
+void PASTEMAC(ch,opname) \
+     ( \
+       conj_t           conjalpha, \
+       dim_t            n, \
+       ctype*  restrict alpha, \
+       ctype*  restrict x, inc_t incx, \
+       cntx_t*          cntx  \
+     ); \
+
+
 #define SCALV_KER_PROT( ctype, ch, opname ) \
 \
 void PASTEMAC(ch,opname) \
diff --git a/frame/1/bli_l1v_oapi.c b/frame/1/bli_l1v_oapi.c
index 4ea241693..ae12250e7 100644
--- a/frame/1/bli_l1v_oapi.c
+++ b/frame/1/bli_l1v_oapi.c
@@ -460,6 +460,7 @@ void PASTEMAC(opname,EX_SUF) \
 	); \
 }
 
+GENFRONT( invscalv )
 GENFRONT( scalv )
 GENFRONT( setv )
 
diff --git a/frame/1/bli_l1v_oapi.h b/frame/1/bli_l1v_oapi.h
index 957747a2a..b503cf9f4 100644
--- a/frame/1/bli_l1v_oapi.h
+++ b/frame/1/bli_l1v_oapi.h
@@ -147,6 +147,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
        BLIS_OAPI_EX_PARAMS  \
      );
 
+GENTPROT( invscalv )
 GENTPROT( scalv )
 GENTPROT( setv )
 
diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c
index 01e3356d5..b22ba365f 100644
--- a/frame/1/bli_l1v_tapi.c
+++ b/frame/1/bli_l1v_tapi.c
@@ -341,6 +341,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
+INSERT_GENTFUNC_BASIC( invscalv, BLIS_INVSCALV_KER )
 INSERT_GENTFUNC_BASIC( scalv, BLIS_SCALV_KER )
 INSERT_GENTFUNC_BASIC( setv,  BLIS_SETV_KER )
 
diff --git a/frame/1/bli_l1v_tapi.h b/frame/1/bli_l1v_tapi.h
index c1965cb3c..8eaf2b185 100644
--- a/frame/1/bli_l1v_tapi.h
+++ b/frame/1/bli_l1v_tapi.h
@@ -163,6 +163,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
+INSERT_GENTPROT_BASIC0( invscalv )
 INSERT_GENTPROT_BASIC0( scalv )
 INSERT_GENTPROT_BASIC0( setv )
 
diff --git a/frame/1d/bli_l1d_check.c b/frame/1d/bli_l1d_check.c
index fcc62a757..776ab8aee 100644
--- a/frame/1d/bli_l1d_check.c
+++ b/frame/1d/bli_l1d_check.c
@@ -98,6 +98,7 @@ void PASTEMAC(opname,_check) \
 	bli_l1d_ax_check( alpha, x ); \
 }
 
+GENFRONT( invscald )
 GENFRONT( scald )
 GENFRONT( setd )
 GENFRONT( setid )
diff --git a/frame/1d/bli_l1d_check.h b/frame/1d/bli_l1d_check.h
index 1ef57e236..56286f9ee 100644
--- a/frame/1d/bli_l1d_check.h
+++ b/frame/1d/bli_l1d_check.h
@@ -85,6 +85,7 @@ void PASTEMAC(opname,_check) \
        const obj_t*  x  \
     );
 
+GENTPROT( invscald )
 GENTPROT( scald )
 GENTPROT( setd )
 GENTPROT( setid )
diff --git a/frame/1d/bli_l1d_fpa.c b/frame/1d/bli_l1d_fpa.c
index ec4c222ab..371f9289b 100644
--- a/frame/1d/bli_l1d_fpa.c
+++ b/frame/1d/bli_l1d_fpa.c
@@ -56,6 +56,7 @@ GENFRONT( subd )
 GENFRONT( axpyd )
 GENFRONT( scal2d )
 GENFRONT( invertd )
+GENFRONT( invscald )
 GENFRONT( scald )
 GENFRONT( setd )
 GENFRONT( setid )
diff --git a/frame/1d/bli_l1d_fpa.h b/frame/1d/bli_l1d_fpa.h
index 4516912de..11fb36192 100644
--- a/frame/1d/bli_l1d_fpa.h
+++ b/frame/1d/bli_l1d_fpa.h
@@ -48,6 +48,7 @@ GENPROT( subd )
 GENPROT( axpyd )
 GENPROT( scal2d )
 GENPROT( invertd )
+GENPROT( invscald )
 GENPROT( scald )
 GENPROT( setd )
 GENPROT( setid )
diff --git a/frame/1d/bli_l1d_ft.h b/frame/1d/bli_l1d_ft.h
index 3de317527..b14e17b6a 100644
--- a/frame/1d/bli_l1d_ft.h
+++ b/frame/1d/bli_l1d_ft.h
@@ -95,7 +95,7 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
 
 INSERT_GENTDEF( invertd )
 
-// scald, setd
+// invscald, scald, setd
 
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
@@ -111,6 +111,7 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
+INSERT_GENTDEF( invscald )
 INSERT_GENTDEF( scald )
 INSERT_GENTDEF( setd )
 
diff --git a/frame/1d/bli_l1d_oapi.c b/frame/1d/bli_l1d_oapi.c
index 7027e7780..8dfd9cad0 100644
--- a/frame/1d/bli_l1d_oapi.c
+++ b/frame/1d/bli_l1d_oapi.c
@@ -260,6 +260,7 @@ void PASTEMAC(opname,EX_SUF) \
 	); \
 }
 
+GENFRONT( invscald )
 GENFRONT( scald )
 GENFRONT( setd )
 
diff --git a/frame/1d/bli_l1d_oapi.h b/frame/1d/bli_l1d_oapi.h
index 66f9d698c..81171f3b8 100644
--- a/frame/1d/bli_l1d_oapi.h
+++ b/frame/1d/bli_l1d_oapi.h
@@ -89,6 +89,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
        BLIS_OAPI_EX_PARAMS  \
      );
 
+GENTPROT( invscald )
 GENTPROT( scald )
 GENTPROT( setd )
 GENTPROT( setid )
diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c
index 60916cd56..907afb703 100644
--- a/frame/1d/bli_l1d_tapi.c
+++ b/frame/1d/bli_l1d_tapi.c
@@ -312,6 +312,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
+INSERT_GENTFUNC_BASIC2( invscald, invscalv, BLIS_INVSCALV_KER )
 INSERT_GENTFUNC_BASIC2( scald, scalv, BLIS_SCALV_KER )
 INSERT_GENTFUNC_BASIC2( setd,  setv,  BLIS_SETV_KER )
 
diff --git a/frame/1d/bli_l1d_tapi.h b/frame/1d/bli_l1d_tapi.h
index 831b3d390..8fe882f0c 100644
--- a/frame/1d/bli_l1d_tapi.h
+++ b/frame/1d/bli_l1d_tapi.h
@@ -106,6 +106,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
+INSERT_GENTPROT_BASIC0( invscald )
 INSERT_GENTPROT_BASIC0( scald )
 INSERT_GENTPROT_BASIC0( setd )
 
diff --git a/frame/1m/bli_l1m_check.c b/frame/1m/bli_l1m_check.c
index f5d4bf1b4..92f192838 100644
--- a/frame/1m/bli_l1m_check.c
+++ b/frame/1m/bli_l1m_check.c
@@ -84,6 +84,7 @@ void PASTEMAC(opname,_check) \
 	bli_l1m_ax_check( alpha, x ); \
 }
 
+GENFRONT( invscalm )
 GENFRONT( scalm )
 GENFRONT( setm )
 
diff --git a/frame/1m/bli_l1m_check.h b/frame/1m/bli_l1m_check.h
index 6089dfa17..d767f104c 100644
--- a/frame/1m/bli_l1m_check.h
+++ b/frame/1m/bli_l1m_check.h
@@ -74,6 +74,7 @@ void PASTEMAC(opname,_check) \
        const obj_t* x  \
     );
 
+GENPROT( invscalm )
 GENPROT( scalm )
 GENPROT( setm )
 
diff --git a/frame/1m/bli_l1m_fpa.c b/frame/1m/bli_l1m_fpa.c
index c3d13fb51..7299dd7c8 100644
--- a/frame/1m/bli_l1m_fpa.c
+++ b/frame/1m/bli_l1m_fpa.c
@@ -55,6 +55,7 @@ GENFRONT( copym )
 GENFRONT( subm )
 GENFRONT( axpym )
 GENFRONT( scal2m )
+GENFRONT( invscalm )
 GENFRONT( scalm )
 GENFRONT( setm )
 GENFRONT( xpbym )
diff --git a/frame/1m/bli_l1m_fpa.h b/frame/1m/bli_l1m_fpa.h
index 84ef8b77f..9de988559 100644
--- a/frame/1m/bli_l1m_fpa.h
+++ b/frame/1m/bli_l1m_fpa.h
@@ -47,6 +47,7 @@ GENPROT( copym )
 GENPROT( subm )
 GENPROT( axpym )
 GENPROT( scal2m )
+GENPROT( invscalm )
 GENPROT( scalm )
 GENPROT( setm )
 GENPROT( xpbym )
diff --git a/frame/1m/bli_l1m_ft.h b/frame/1m/bli_l1m_ft.h
index 36d06b2fe..0851470dd 100644
--- a/frame/1m/bli_l1m_ft.h
+++ b/frame/1m/bli_l1m_ft.h
@@ -101,7 +101,7 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
 
 INSERT_GENTDEF( scal2m )
 
-// scalm, setm
+// invscalm, scalm, setm
 
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
@@ -119,6 +119,7 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
+INSERT_GENTDEF( invscalm )
 INSERT_GENTDEF( scalm )
 INSERT_GENTDEF( setm )
 
diff --git a/frame/1m/bli_l1m_oapi.c b/frame/1m/bli_l1m_oapi.c
index 7520afce7..775d69018 100644
--- a/frame/1m/bli_l1m_oapi.c
+++ b/frame/1m/bli_l1m_oapi.c
@@ -237,6 +237,7 @@ void PASTEMAC(opname,EX_SUF) \
 	); \
 }
 
+GENFRONT( invscalm )
 GENFRONT( scalm )
 
 
diff --git a/frame/1m/bli_l1m_oapi.h b/frame/1m/bli_l1m_oapi.h
index 9510f1aee..6873e9903 100644
--- a/frame/1m/bli_l1m_oapi.h
+++ b/frame/1m/bli_l1m_oapi.h
@@ -77,6 +77,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
        BLIS_OAPI_EX_PARAMS  \
      );
 
+GENPROT( invscalm )
 GENPROT( scalm )
 GENPROT( setm )
 
diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c
index 6b802b9fe..0a641cf9e 100644
--- a/frame/1m/bli_l1m_tapi.c
+++ b/frame/1m/bli_l1m_tapi.c
@@ -378,6 +378,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
+INSERT_GENTFUNC_BASIC0( invscalm )
 INSERT_GENTFUNC_BASIC0( scalm )
 INSERT_GENTFUNC_BASIC0( setm )
 
diff --git a/frame/1m/bli_l1m_tapi.h b/frame/1m/bli_l1m_tapi.h
index 68646a71f..531fae075 100644
--- a/frame/1m/bli_l1m_tapi.h
+++ b/frame/1m/bli_l1m_tapi.h
@@ -95,6 +95,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
+INSERT_GENTPROT_BASIC0( invscalm )
 INSERT_GENTPROT_BASIC0( scalm )
 INSERT_GENTPROT_BASIC0( setm )
 
diff --git a/frame/1m/bli_l1m_unb_var1.c b/frame/1m/bli_l1m_unb_var1.c
index c979f082a..1bcd9b9ca 100644
--- a/frame/1m/bli_l1m_unb_var1.c
+++ b/frame/1m/bli_l1m_unb_var1.c
@@ -376,6 +376,7 @@ void PASTEMAC(ch,opname) \
 	} \
 }
 
+INSERT_GENTFUNC_BASIC2( invscalm_unb_var1, invscalv, BLIS_INVSCALV_KER )
 INSERT_GENTFUNC_BASIC2( scalm_unb_var1, scalv, BLIS_SCALV_KER )
 INSERT_GENTFUNC_BASIC2( setm_unb_var1,  setv,  BLIS_SETV_KER )
 
diff --git a/frame/1m/bli_l1m_unb_var1.h b/frame/1m/bli_l1m_unb_var1.h
index 0364d4b7c..fe01989e3 100644
--- a/frame/1m/bli_l1m_unb_var1.h
+++ b/frame/1m/bli_l1m_unb_var1.h
@@ -98,6 +98,7 @@ void PASTEMAC2(ch,opname,_unb_var1) \
        rntm_t* rntm  \
      );
 
+INSERT_GENTPROT_BASIC0( invscalm )
 INSERT_GENTPROT_BASIC0( scalm )
 INSERT_GENTPROT_BASIC0( setm )
 
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index 08c7ddc4a..b5c3ec255 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -635,6 +635,7 @@ typedef enum
 	BLIS_DOTV_KER,
 	BLIS_DOTXV_KER,
 	BLIS_INVERTV_KER,
+	BLIS_INVSCALV_KER,
 	BLIS_SCALV_KER,
 	BLIS_SCAL2V_KER,
 	BLIS_SETV_KER,
diff --git a/ref_kernels/1/bli_invscalv_ref.c b/ref_kernels/1/bli_invscalv_ref.c
new file mode 100644
index 000000000..a2263ee58
--- /dev/null
+++ b/ref_kernels/1/bli_invscalv_ref.c
@@ -0,0 +1,81 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname, arch, suf ) \
+/* Reference kernel: applies invscals( alpha_conj, . ) to n elements of x. */ \
+void PASTEMAC3(ch,opname,arch,suf) \
+     ( \
+       conj_t           conjalpha, \
+       dim_t            n, \
+       ctype*  restrict alpha, \
+       ctype*  restrict x, inc_t incx, \
+       cntx_t*          cntx  \
+     ) \
+{ \
+	if ( bli_zero_dim1( n ) ) return; \
+\
+	/* If alpha is one, x would be left unchanged, so return without touching it. */ \
+	if ( PASTEMAC(ch,eq1)( *alpha ) ) return; \
+\
+	/* If alpha is zero, inv(alpha) is undefined; return early and leave x unmodified. */ \
+	if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \
+\
+	ctype alpha_conj; \
+	/* Form alpha_conj from *alpha once, outside the loops (presumably conjugated when conjalpha requests it). */ \
+	PASTEMAC(ch,copycjs)( conjalpha, *alpha, alpha_conj ); \
+	/* Unit stride: index x directly; this is the only loop marked PRAGMA_SIMD. */ \
+	if ( incx == 1 ) \
+	{ \
+		PRAGMA_SIMD \
+		for ( dim_t i = 0; i < n; ++i ) \
+		{ \
+			PASTEMAC(ch,invscals)( alpha_conj, x[i] ); \
+		} \
+	} \
+	else \
+	{ \
+		for ( dim_t i = 0; i < n; ++i ) \
+		{ \
+			PASTEMAC(ch,invscals)( alpha_conj, *x ); \
+			/* General stride: step the pointer to the next element. */ \
+			x += incx; \
+		} \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC2( invscalv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+
diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c
index e094db54b..11c3091e9 100644
--- a/ref_kernels/bli_cntx_ref.c
+++ b/ref_kernels/bli_cntx_ref.c
@@ -173,6 +173,8 @@
 #define dotxv_ker_name     GENARNAME(dotxv)
 #undef  invertv_ker_name
 #define invertv_ker_name   GENARNAME(invertv)
+#undef  invscalv_ker_name
+#define invscalv_ker_name  GENARNAME(invscalv)
 #undef  scalv_ker_name
 #define scalv_ker_name     GENARNAME(scalv)
 #undef  scal2v_ker_name
@@ -380,20 +382,21 @@ void GENBARNAME(cntx_init)
 
 	// -- Set level-1v kernels -------------------------------------------------
 
-	gen_func_init( &funcs[ BLIS_ADDV_KER ],    addv_ker_name    );
-	gen_func_init( &funcs[ BLIS_AMAXV_KER ],   amaxv_ker_name   );
-	gen_func_init( &funcs[ BLIS_AXPBYV_KER ],  axpbyv_ker_name  );
-	gen_func_init( &funcs[ BLIS_AXPYV_KER ],   axpyv_ker_name   );
-	gen_func_init( &funcs[ BLIS_COPYV_KER ],   copyv_ker_name   );
-	gen_func_init( &funcs[ BLIS_DOTV_KER ],    dotv_ker_name    );
-	gen_func_init( &funcs[ BLIS_DOTXV_KER ],   dotxv_ker_name   );
-	gen_func_init( &funcs[ BLIS_INVERTV_KER ], invertv_ker_name );
-	gen_func_init( &funcs[ BLIS_SCALV_KER ],   scalv_ker_name   );
-	gen_func_init( &funcs[ BLIS_SCAL2V_KER ],  scal2v_ker_name  );
-	gen_func_init( &funcs[ BLIS_SETV_KER ],    setv_ker_name    );
-	gen_func_init( &funcs[ BLIS_SUBV_KER ],    subv_ker_name    );
-	gen_func_init( &funcs[ BLIS_SWAPV_KER ],   swapv_ker_name   );
-	gen_func_init( &funcs[ BLIS_XPBYV_KER ],   xpbyv_ker_name   );
+	gen_func_init( &funcs[ BLIS_ADDV_KER ],     addv_ker_name     );
+	gen_func_init( &funcs[ BLIS_AMAXV_KER ],    amaxv_ker_name    );
+	gen_func_init( &funcs[ BLIS_AXPBYV_KER ],   axpbyv_ker_name   );
+	gen_func_init( &funcs[ BLIS_AXPYV_KER ],    axpyv_ker_name    );
+	gen_func_init( &funcs[ BLIS_COPYV_KER ],    copyv_ker_name    );
+	gen_func_init( &funcs[ BLIS_DOTV_KER ],     dotv_ker_name     );
+	gen_func_init( &funcs[ BLIS_DOTXV_KER ],    dotxv_ker_name    );
+	gen_func_init( &funcs[ BLIS_INVERTV_KER ],  invertv_ker_name  );
+	gen_func_init( &funcs[ BLIS_INVSCALV_KER ], invscalv_ker_name );
+	gen_func_init( &funcs[ BLIS_SCALV_KER ],    scalv_ker_name    );
+	gen_func_init( &funcs[ BLIS_SCAL2V_KER ],   scal2v_ker_name   );
+	gen_func_init( &funcs[ BLIS_SETV_KER ],     setv_ker_name     );
+	gen_func_init( &funcs[ BLIS_SUBV_KER ],     subv_ker_name     );
+	gen_func_init( &funcs[ BLIS_SWAPV_KER ],    swapv_ker_name    );
+	gen_func_init( &funcs[ BLIS_XPBYV_KER ],    xpbyv_ker_name    );
 
 
 	// -- Set level-1m (packm/unpackm) kernels ---------------------------------
diff --git a/testsuite/input.operations b/testsuite/input.operations
index eebe8b605..e6c39e631 100644
--- a/testsuite/input.operations
+++ b/testsuite/input.operations
@@ -138,9 +138,13 @@
 1        # normfv
 -1       #   dimensions: m
 
+1        # invscalv
+-1       #   dimensions: m
+?        #   parameters: conjalpha
+
 1        # scalv
 -1       #   dimensions: m
-?        #   parameters: conjbeta
+?        #   parameters: conjalpha
 
 1        # scal2v
 -1       #   dimensions: m
@@ -175,9 +179,13 @@
 1        # normfm
 -1 -2    #   dimensions: m n
 
+1        # invscalm
+-1 -2    #   dimensions: m n
+?        #   parameters: conjalpha
+
 1        # scalm
 -1 -2    #   dimensions: m n
-?        #   parameters: conjbeta
+?        #   parameters: conjalpha
 
 1        # scal2m
 -1 -2    #   dimensions: m n
diff --git a/testsuite/input.operations.fast b/testsuite/input.operations.fast
index b733c672d..ecd526aaa 100644
--- a/testsuite/input.operations.fast
+++ b/testsuite/input.operations.fast
@@ -138,9 +138,13 @@
 1        # normfv
 -1       #   dimensions: m
 
+1        # invscalv
+-1       #   dimensions: m
+?        #   parameters: conjalpha
+
 1        # scalv
 -1       #   dimensions: m
-?        #   parameters: conjbeta
+?        #   parameters: conjalpha
 
 1        # scal2v
 -1       #   dimensions: m
@@ -175,9 +179,13 @@
 1        # normfm
 -1 -2    #   dimensions: m n
 
+1        # invscalm
+-1 -2    #   dimensions: m n
+?        #   parameters: conjalpha
+
 1        # scalm
 -1 -2    #   dimensions: m n
-?        #   parameters: conjbeta
+?        #   parameters: conjalpha
 
 1        # scal2m
 -1 -2    #   dimensions: m n
diff --git a/testsuite/input.operations.mixed b/testsuite/input.operations.mixed
index 6292ea8ab..eb851b786 100644
--- a/testsuite/input.operations.mixed
+++ b/testsuite/input.operations.mixed
@@ -138,9 +138,13 @@
 1        # normfv
 -1       #   dimensions: m
 
+1        # invscalv
+-1       #   dimensions: m
+?        #   parameters: conjalpha
+
 1        # scalv
 -1       #   dimensions: m
-?        #   parameters: conjbeta
+?        #   parameters: conjalpha
 
 1        # scal2v
 -1       #   dimensions: m
@@ -175,9 +179,13 @@
 1        # normfm
 -1 -2    #   dimensions: m n
 
+1        # invscalm
+-1 -2    #   dimensions: m n
+?        #   parameters: conjalpha
+
 1        # scalm
 -1 -2    #   dimensions: m n
-?        #   parameters: conjbeta
+?        #   parameters: conjalpha
 
 1        # scal2m
 -1 -2    #   dimensions: m n
diff --git a/testsuite/input.operations.salt b/testsuite/input.operations.salt
index b733c672d..ecd526aaa 100644
--- a/testsuite/input.operations.salt
+++ b/testsuite/input.operations.salt
@@ -138,9 +138,13 @@
 1        # normfv
 -1       #   dimensions: m
 
+1        # invscalv
+-1       #   dimensions: m
+?        #   parameters: conjalpha
+
 1        # scalv
 -1       #   dimensions: m
-?        #   parameters: conjbeta
+?        #   parameters: conjalpha
 
 1        # scal2v
 -1       #   dimensions: m
@@ -175,9 +179,13 @@
 1        # normfm
 -1 -2    #   dimensions: m n
 
+1        # invscalm
+-1 -2    #   dimensions: m n
+?        #   parameters: conjalpha
+
 1        # scalm
 -1 -2    #   dimensions: m n
-?        #   parameters: conjbeta
+?        #   parameters: conjalpha
 
 1        # scal2m
 -1 -2    #   dimensions: m n
diff --git a/testsuite/src/test_invscalm.c b/testsuite/src/test_invscalm.c
new file mode 100644
index 000000000..9ad730631
--- /dev/null
+++ b/testsuite/src/test_invscalm.c
@@ -0,0 +1,301 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "test_libblis.h"
+
+
+// Static variables passed to libblis_test_op_driver() by libblis_test_invscalm().
+static char*     op_str                    = "invscalm";
+static char*     o_types                   = "m";  // y
+static char*     p_types                   = "c";  // conjalpha
+static thresh_t  thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 },   // warn, pass for s
+                                               { 1e-04, 1e-05 },   // warn, pass for c
+                                               { 1e-13, 1e-14 },   // warn, pass for d
+                                               { 1e-13, 1e-14 } }; // warn, pass for z
+
+// Local prototypes; the definitions follow later in this file.
+void libblis_test_invscalm_deps
+     (
+       thread_data_t* tdata,
+       test_params_t* params,
+       test_op_t*     op
+     );
+
+void libblis_test_invscalm_experiment
+     (
+       test_params_t* params,
+       test_op_t*     op,
+       iface_t        iface,
+       char*          dc_str,
+       char*          pc_str,
+       char*          sc_str,
+       unsigned int   p_cur,
+       double*        perf,
+       double*        resid
+     );
+
+void libblis_test_invscalm_impl
+     (
+       iface_t   iface,
+       obj_t*    alpha,
+       obj_t*    y
+     );
+
+void libblis_test_invscalm_check
+     (
+       test_params_t* params,
+       obj_t*         alpha,
+       obj_t*         y,
+       obj_t*         y_orig,  // same name as in the definition
+       double*        resid
+     );
+
+// Runs the randm, normfm, and copym tests first. NOTE(review): the check
+// below also calls bli_scalm and bli_subm -- confirm this dependency list.
+void libblis_test_invscalm_deps
+     (
+       thread_data_t* tdata,
+       test_params_t* params,
+       test_op_t*     op
+     )
+{
+	libblis_test_randm( tdata, params, &(op->ops->randm) );
+	libblis_test_normfm( tdata, params, &(op->ops->normfm) );
+	libblis_test_copym( tdata, params, &(op->ops->copym) );
+}
+
+// Entry point of the invscalm test: returns early if the test is already done
+// or disabled, otherwise runs the dependencies and then the test driver.
+void libblis_test_invscalm
+     (
+       thread_data_t* tdata,
+       test_params_t* params,
+       test_op_t*     op
+     )
+{
+
+	// Return early if this test has already been done.
+	if ( libblis_test_op_is_done( op ) ) return;
+
+	// Return early if operation is disabled.
+	if ( libblis_test_op_is_disabled( op ) ||
+	     libblis_test_l1m_is_disabled( op ) ) return;
+
+	// Call dependencies first (the TRUE condition makes this unconditional).
+	if ( TRUE ) libblis_test_invscalm_deps( tdata, params, op );
+
+	// Execute the test driver for BLIS_TEST_SEQ_FRONT_END. The guard on
+	// op->front_seq == ENABLE is commented out, so this block always runs.
+	{
+		libblis_test_op_driver( tdata,
+		                        params,
+		                        op,
+		                        BLIS_TEST_SEQ_FRONT_END,
+		                        op_str,
+		                        p_types,
+		                        o_types,
+		                        thresh,
+		                        libblis_test_invscalm_experiment );
+	}
+}
+
+// Runs one invscalm experiment as described by dc_str, pc_str, sc_str, and
+// p_cur, writing the measured performance to *perf and the residual to *resid.
+void libblis_test_invscalm_experiment
+     (
+       test_params_t* params,
+       test_op_t*     op,
+       iface_t        iface,
+       char*          dc_str,
+       char*          pc_str,
+       char*          sc_str,
+       unsigned int   p_cur,
+       double*        perf,
+       double*        resid
+     )
+{
+	unsigned int n_repeats = params->n_repeats;
+	unsigned int i;
+
+	double       time_min  = DBL_MAX;
+	double       time;
+
+	num_t        datatype;
+
+	dim_t        m, n;
+
+	conj_t       conjalpha;
+
+	obj_t        alpha, y;
+	obj_t        y_save;
+
+
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
+	// Map the dimension specifier to actual dimensions.
+	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
+	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
+
+	// Map parameter characters to BLIS constants.
+	bli_param_map_char_to_blis_conj( pc_str[0], &conjalpha );
+
+	// Create test scalars.
+	bli_obj_scalar_init_detached( datatype, &alpha );
+
+	// Create test operands (vectors and/or matrices).
+	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	                          sc_str[0], m, n, &y );
+	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	                          sc_str[0], m, n, &y_save );
+
+	// Set alpha to -2 for real datatypes and to -2i for complex datatypes, so
+	// that the complex cases use a scalar with a non-zero imaginary component.
+	if ( bli_obj_is_real( &y ) )
+		bli_setsc( -2.0,  0.0, &alpha );
+	else
+		bli_setsc(  0.0, -2.0, &alpha );
+
+	// Randomize and save y.
+	libblis_test_mobj_randomize( params, FALSE, &y );
+	bli_copym( &y, &y_save );
+
+	// Apply the parameters.
+	bli_obj_set_conj( conjalpha, &alpha );
+
+	// Repeat the experiment n_repeats times and record results.
+	for ( i = 0; i < n_repeats; ++i )
+	{
+		bli_copym( &y_save, &y );
+
+		time = bli_clock();
+
+		libblis_test_invscalm_impl( iface, &alpha, &y );
+
+		time_min = bli_clock_min_diff( time_min, time );
+	}
+
+	// Estimate the performance of the best experiment repeat.
+	*perf = ( 1.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+	if ( bli_obj_is_complex( &y ) ) *perf *= 6.0;
+
+	// Perform checks.
+	libblis_test_invscalm_check( params, &alpha, &y, &y_save, resid );
+
+	// Zero out performance and residual if output matrix is empty.
+	libblis_test_check_empty_problem( &y, perf, resid );
+
+	// Free the test objects.
+	bli_obj_free( &y );
+	bli_obj_free( &y_save );
+}
+
+// Calls the invscalm implementation selected by iface. Only
+// BLIS_TEST_SEQ_FRONT_END is handled; any other value is reported as an error.
+void libblis_test_invscalm_impl
+     (
+       iface_t   iface,
+       obj_t*    alpha,
+       obj_t*    y
+     )
+{
+	switch ( iface )
+	{
+		case BLIS_TEST_SEQ_FRONT_END:
+		bli_invscalm( alpha, y );
+		break;
+
+		default:
+		libblis_test_printf_error( "Invalid interface type.\n" );
+	}
+}
+
+// Computes the invscalm residual: scales a copy of y by alpha, subtracts
+// y_orig, and writes the normfm of the result to *resid. params is unused.
+void libblis_test_invscalm_check
+     (
+       test_params_t* params,
+       obj_t*         alpha,
+       obj_t*         y,
+       obj_t*         y_orig,
+       double*        resid
+     )
+{
+	num_t  dt      = bli_obj_dt( y );
+	num_t  dt_real = bli_obj_dt_proj_to_real( y );
+
+	dim_t  m       = bli_obj_length( y );
+	dim_t  n       = bli_obj_width( y );
+
+	obj_t  norm_y_r;
+
+	obj_t  y2;
+
+	double junk; // receives the second output of bli_getsc(); not used
+
+	//
+	// Pre-conditions:
+	// - y_orig is randomized.
+	// Note:
+	// - alpha should have a non-zero imaginary component in the complex
+	//   cases in order to more fully exercise the implementation.
+	//
+	// Under these conditions, we assume that the implementation for
+	//
+	//   y := ( 1.0 / conjalpha(alpha) ) * y_orig
+	//
+	// is functioning correctly if
+	//
+	//   normfm( y_orig - conjalpha(alpha) * y )
+	//
+	// is negligible.
+	//
+
+	bli_obj_create( dt, m, n, 0, 0, &y2 );
+	bli_copym( y, &y2 );
+
+	bli_obj_scalar_init_detached( dt_real, &norm_y_r );
+
+	bli_scalm( alpha, &y2 );
+	bli_subm( y_orig, &y2 );
+
+	bli_normfm( &y2, &norm_y_r );
+
+	bli_getsc( &norm_y_r, resid, &junk );
+
+	bli_obj_free( &y2 );
+}
+
diff --git a/testsuite/src/test_invscalm.h b/testsuite/src/test_invscalm.h
new file mode 100644
index 000000000..698f9b377
--- /dev/null
+++ b/testsuite/src/test_invscalm.h
@@ -0,0 +1,42 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+// Entry point of the invscalm test (defined in test_invscalm.c).
+void libblis_test_invscalm
+     (
+       thread_data_t* tdata,
+       test_params_t* params,
+       test_op_t*     op
+     );
+
diff --git a/testsuite/src/test_invscalv.c b/testsuite/src/test_invscalv.c
new file mode 100644
index 000000000..47d46b4c2
--- /dev/null
+++ b/testsuite/src/test_invscalv.c
@@ -0,0 +1,297 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "test_libblis.h"
+
+
+// Static variables passed to libblis_test_op_driver() by libblis_test_invscalv().
+static char*     op_str                    = "invscalv";
+static char*     o_types                   = "v";  // y
+static char*     p_types                   = "c";  // conjalpha
+static thresh_t  thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 },   // warn, pass for s
+                                               { 1e-04, 1e-05 },   // warn, pass for c
+                                               { 1e-13, 1e-14 },   // warn, pass for d
+                                               { 1e-13, 1e-14 } }; // warn, pass for z
+
+// Local prototypes; the definitions follow later in this file.
+void libblis_test_invscalv_deps
+     (
+       thread_data_t* tdata,
+       test_params_t* params,
+       test_op_t*     op
+     );
+
+void libblis_test_invscalv_experiment
+     (
+       test_params_t* params,
+       test_op_t*     op,
+       iface_t        iface,
+       char*          dc_str,
+       char*          pc_str,
+       char*          sc_str,
+       unsigned int   p_cur,
+       double*        perf,
+       double*        resid
+     );
+
+void libblis_test_invscalv_impl
+     (
+       iface_t   iface,
+       obj_t*    alpha,
+       obj_t*    y
+     );
+
+void libblis_test_invscalv_check
+     (
+       test_params_t* params,
+       obj_t*         alpha,
+       obj_t*         y,
+       obj_t*         y_orig,
+       double*        resid
+     );
+
+// Runs the randv, normfv, addv, and copyv tests first. NOTE(review): the
+// check below calls bli_scalv/bli_subv, not bli_addv -- confirm this list.
+void libblis_test_invscalv_deps
+     (
+       thread_data_t* tdata,
+       test_params_t* params,
+       test_op_t*     op
+     )
+{
+	libblis_test_randv( tdata, params, &(op->ops->randv) );
+	libblis_test_normfv( tdata, params, &(op->ops->normfv) );
+	libblis_test_addv( tdata, params, &(op->ops->addv) );
+	libblis_test_copyv( tdata, params, &(op->ops->copyv) );
+}
+
+// Entry point of the invscalv test: returns early if the test is already done
+// or disabled, otherwise runs the dependencies and then the test driver.
+void libblis_test_invscalv
+     (
+       thread_data_t* tdata,
+       test_params_t* params,
+       test_op_t*     op
+     )
+{
+
+	// Return early if this test has already been done.
+	if ( libblis_test_op_is_done( op ) ) return;
+
+	// Return early if operation is disabled.
+	if ( libblis_test_op_is_disabled( op ) ||
+	     libblis_test_l1v_is_disabled( op ) ) return;
+
+	// Call dependencies first (the TRUE condition makes this unconditional).
+	if ( TRUE ) libblis_test_invscalv_deps( tdata, params, op );
+
+	// Execute the test driver for BLIS_TEST_SEQ_FRONT_END. The guard on
+	// op->front_seq == ENABLE is commented out, so this block always runs.
+	{
+		libblis_test_op_driver( tdata,
+		                        params,
+		                        op,
+		                        BLIS_TEST_SEQ_FRONT_END,
+		                        op_str,
+		                        p_types,
+		                        o_types,
+		                        thresh,
+		                        libblis_test_invscalv_experiment );
+	}
+}
+
+// Runs one invscalv experiment as described by dc_str, pc_str, sc_str, and
+// p_cur, writing the measured performance to *perf and the residual to *resid.
+void libblis_test_invscalv_experiment
+     (
+       test_params_t* params,
+       test_op_t*     op,
+       iface_t        iface,
+       char*          dc_str,
+       char*          pc_str,
+       char*          sc_str,
+       unsigned int   p_cur,
+       double*        perf,
+       double*        resid
+     )
+{
+	unsigned int n_repeats = params->n_repeats;
+	unsigned int i;
+
+	double       time_min  = DBL_MAX;
+	double       time;
+
+	num_t        datatype;
+
+	dim_t        m;
+
+	conj_t       conjalpha;
+
+	obj_t        alpha, y;
+	obj_t        y_save;
+
+
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
+	// Map the dimension specifier to an actual dimension.
+	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
+
+	// Map parameter characters to BLIS constants.
+	bli_param_map_char_to_blis_conj( pc_str[0], &conjalpha );
+
+	// Create test scalars.
+	bli_obj_scalar_init_detached( datatype, &alpha );
+
+	// Create test operands (vectors and/or matrices).
+	libblis_test_vobj_create( params, datatype, sc_str[0], m, &y );
+	libblis_test_vobj_create( params, datatype, sc_str[0], m, &y_save );
+
+	// Set alpha to -2 for real datatypes and to -2i for complex datatypes.
+	if ( bli_obj_is_real( &y ) )
+		bli_setsc( -2.0,  0.0, &alpha );
+	else
+		bli_setsc(  0.0, -2.0, &alpha );
+
+	// Randomize and save y.
+	libblis_test_vobj_randomize( params, FALSE, &y );
+	bli_copyv( &y, &y_save );
+
+	// Apply the parameters.
+	bli_obj_set_conj( conjalpha, &alpha );
+
+	// Repeat the experiment n_repeats times and record results.
+	for ( i = 0; i < n_repeats; ++i )
+	{
+		bli_copyv( &y_save, &y );
+
+		time = bli_clock();
+
+		libblis_test_invscalv_impl( iface, &alpha, &y );
+
+		time_min = bli_clock_min_diff( time_min, time );
+	}
+
+	// Estimate the performance of the best experiment repeat.
+	*perf = ( 1.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
+	if ( bli_obj_is_complex( &y ) ) *perf *= 6.0;
+
+	// Perform checks.
+	libblis_test_invscalv_check( params, &alpha, &y, &y_save, resid );
+
+	// Zero out performance and residual if output vector is empty.
+	libblis_test_check_empty_problem( &y, perf, resid );
+
+	// Free the test objects.
+	bli_obj_free( &y );
+	bli_obj_free( &y_save );
+}
+
+// Calls the invscalv implementation selected by iface. Only
+// BLIS_TEST_SEQ_FRONT_END is handled; any other value is reported as an error.
+void libblis_test_invscalv_impl
+     (
+       iface_t   iface,
+       obj_t*    alpha,
+       obj_t*    y
+     )
+{
+	switch ( iface )
+	{
+		case BLIS_TEST_SEQ_FRONT_END:
+		bli_invscalv( alpha, y );
+		break;
+
+		default:
+		libblis_test_printf_error( "Invalid interface type.\n" );
+	}
+}
+
+// Computes the invscalv residual: scales a copy of y by alpha, subtracts
+// y_orig, and writes the normfv of the result to *resid. params is unused.
+void libblis_test_invscalv_check
+     (
+       test_params_t* params,
+       obj_t*         alpha,
+       obj_t*         y,
+       obj_t*         y_orig,
+       double*        resid
+     )
+{
+	num_t  dt      = bli_obj_dt( y );
+	num_t  dt_real = bli_obj_dt_proj_to_real( y );
+
+	dim_t  m       = bli_obj_vector_dim( y );
+
+	obj_t  norm_y_r;
+
+	obj_t  y2;
+
+	double junk; // receives the second output of bli_getsc(); not used
+
+	//
+	// Pre-conditions:
+	// - y_orig is randomized.
+	// Note:
+	// - alpha should have a non-zero imaginary component in the complex
+	//   cases in order to more fully exercise the implementation.
+	//
+	// Under these conditions, we assume that the implementation for
+	//
+	//   y := ( 1.0 / conjalpha(alpha) ) * y_orig
+	//
+	// is functioning correctly if
+	//
+	//   normfv( y_orig - conjalpha(alpha) * y )
+	//
+	// is negligible.
+	//
+
+	bli_obj_create( dt, m, 1, 0, 0, &y2 );
+	bli_copyv( y, &y2 );
+
+	bli_obj_scalar_init_detached( dt_real, &norm_y_r );
+
+	bli_scalv( alpha, &y2 );
+	bli_subv( y_orig, &y2 );
+
+	bli_normfv( &y2, &norm_y_r );
+
+	bli_getsc( &norm_y_r, resid, &junk );
+
+	bli_obj_free( &y2 );
+}
+
diff --git a/testsuite/src/test_invscalv.h b/testsuite/src/test_invscalv.h
new file mode 100644
index 000000000..297be4836
--- /dev/null
+++ b/testsuite/src/test_invscalv.h
@@ -0,0 +1,42 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+void libblis_test_invscalv
+     (
+       thread_data_t* tdata,
+       test_params_t* params,
+       test_op_t*     op
+     );
+
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index 442fae0e0..3ce92e377 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -255,6 +255,7 @@ void libblis_test_level1v_ops( thread_data_t* tdata, test_params_t* params, test
 	libblis_test_dotv( tdata, params, &(ops->dotv) );
 	libblis_test_dotxv( tdata, params, &(ops->dotxv) );
 	libblis_test_normfv( tdata, params, &(ops->normfv) );
+	libblis_test_invscalv( tdata, params, &(ops->invscalv) );
 	libblis_test_scalv( tdata, params, &(ops->scalv) );
 	libblis_test_scal2v( tdata, params, &(ops->scal2v) );
 	libblis_test_setv( tdata, params, &(ops->setv) );
@@ -270,6 +271,7 @@ void libblis_test_level1m_ops( thread_data_t* tdata, test_params_t* params, test
 	libblis_test_axpym( tdata, params, &(ops->axpym) );
 	libblis_test_copym( tdata, params, &(ops->copym) );
 	libblis_test_normfm( tdata, params, &(ops->normfm) );
+	libblis_test_invscalm( tdata, params, &(ops->invscalm) );
 	libblis_test_scalm( tdata, params, &(ops->scalm) );
 	libblis_test_scal2m( tdata, params, &(ops->scal2m) );
 	libblis_test_setm( tdata, params, &(ops->setm) );
@@ -370,6 +372,7 @@ void libblis_test_read_ops_file( char* input_filename, test_ops_t* ops )
 	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M,   2, &(ops->dotv) );
 	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M,   2, &(ops->dotxv) );
 	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M,   0, &(ops->normfv) );
+	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M,   1, &(ops->invscalv) );
 	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M,   1, &(ops->scalv) );
 	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M,   1, &(ops->scal2v) );
 	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M,   0, &(ops->setv) );
@@ -381,6 +384,7 @@ void libblis_test_read_ops_file( char* input_filename, test_ops_t* ops )
 	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN,  1, &(ops->axpym) );
 	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN,  1, &(ops->copym) );
 	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN,  0, &(ops->normfm) );
+	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN,  1, &(ops->invscalm) );
 	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN,  1, &(ops->scalm) );
 	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN,  1, &(ops->scal2m) );
 	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN,  0, &(ops->setm) );
@@ -2705,8 +2709,9 @@ void libblis_test_vobj_randomize( test_params_t* params, bool normalize, obj_t*
 		bli_normfv( x, &kappa_r );
 		libblis_test_ceil_pow2( &kappa_r );
 		bli_copysc( &kappa_r, &kappa );
-		bli_invertsc( &kappa );
-		bli_scalv( &kappa, x );
+		//bli_invertsc( &kappa );
+		//bli_scalv( &kappa, x );
+		bli_invscalv( &kappa, x );
 	}
 }
 
@@ -2744,8 +2749,9 @@ void libblis_test_mobj_randomize( test_params_t* params, bool normalize, obj_t*
 		bli_norm1m( a, &kappa_r );
 		libblis_test_ceil_pow2( &kappa_r );
 		bli_copysc( &kappa_r, &kappa );
-		bli_invertsc( &kappa );
-		bli_scalm( &kappa, a );
+		//bli_invertsc( &kappa );
+		//bli_scalm( &kappa, a );
+		bli_invscalm( &kappa, a );
 	}
 }
 
diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h
index cdb3c6dac..9e38964ee 100644
--- a/testsuite/src/test_libblis.h
+++ b/testsuite/src/test_libblis.h
@@ -230,6 +230,7 @@ typedef struct test_ops_s
 	test_op_t dotv;
 	test_op_t dotxv;
 	test_op_t normfv;
+	test_op_t invscalv;
 	test_op_t scalv;
 	test_op_t scal2v;
 	test_op_t setv;
@@ -241,6 +242,7 @@ typedef struct test_ops_s
 	test_op_t axpym;
 	test_op_t copym;
 	test_op_t normfm;
+	test_op_t invscalm;
 	test_op_t scalm;
 	test_op_t scal2m;
 	test_op_t setm;
@@ -504,6 +506,7 @@ char libblis_test_proj_dtchar_to_precchar( char dt_char );
 #include "test_dotv.h"
 #include "test_dotxv.h"
 #include "test_normfv.h"
+#include "test_invscalv.h"
 #include "test_scalv.h"
 #include "test_scal2v.h"
 #include "test_setv.h"
@@ -515,6 +518,7 @@ char libblis_test_proj_dtchar_to_precchar( char dt_char );
 #include "test_axpym.h"
 #include "test_copym.h"
 #include "test_normfm.h"
+#include "test_invscalm.h"
 #include "test_scalm.h"
 #include "test_scal2m.h"
 #include "test_setm.h"
diff --git a/testsuite/src/test_scalm.c b/testsuite/src/test_scalm.c
index 6219c71df..bd4565ccd 100644
--- a/testsuite/src/test_scalm.c
+++ b/testsuite/src/test_scalm.c
@@ -40,7 +40,7 @@
 // Static variables.
 static char*     op_str                    = "scalm";
 static char*     o_types                   = "m";  // x
-static char*     p_types                   = "c";  // conjbeta
+static char*     p_types                   = "c";  // conjalpha
 static thresh_t  thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 },   // warn, pass for s
                                                { 1e-04, 1e-05 },   // warn, pass for c
                                                { 1e-13, 1e-14 },   // warn, pass for d
@@ -70,14 +70,14 @@ void libblis_test_scalm_experiment
 void libblis_test_scalm_impl
      (
        iface_t   iface,
-       obj_t*    beta,
+       obj_t*    alpha,
        obj_t*    y
      );
 
 void libblis_test_scalm_check
      (
        test_params_t* params,
-       obj_t*         beta,
+       obj_t*         alpha,
        obj_t*         y,
        obj_t*         y_save,
        double*        resid
@@ -157,9 +157,9 @@ void libblis_test_scalm_experiment
 
 	dim_t        m, n;
 
-	conj_t       conjbeta;
+	conj_t       conjalpha;
 
-	obj_t        beta, y;
+	obj_t        alpha, y;
 	obj_t        y_save;
 
 
@@ -171,10 +171,10 @@ void libblis_test_scalm_experiment
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
 
 	// Map parameter characters to BLIS constants.
-	bli_param_map_char_to_blis_conj( pc_str[0], &conjbeta );
+	bli_param_map_char_to_blis_conj( pc_str[0], &conjalpha );
 
 	// Create test scalars.
-	bli_obj_scalar_init_detached( datatype, &beta );
+	bli_obj_scalar_init_detached( datatype, &alpha );
 
 	// Create test operands (vectors and/or matrices).
 	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
@@ -182,19 +182,19 @@ void libblis_test_scalm_experiment
 	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m, n, &y_save );
 
-	// Set beta to 0 + i.
-	//bli_setsc( 0.0, 1.0, &beta );
+	// Set alpha to 0 + i.
+	//bli_setsc( 0.0, 1.0, &alpha );
 	if ( bli_obj_is_real( &y ) )
-		bli_setsc( -2.0,  0.0, &beta );
+		bli_setsc( -2.0,  0.0, &alpha );
 	else
-		bli_setsc(  0.0, -2.0, &beta );
+		bli_setsc(  0.0, -2.0, &alpha );
 
 	// Randomize and save y.
 	libblis_test_mobj_randomize( params, FALSE, &y );
 	bli_copym( &y, &y_save );
 
 	// Apply the parameters.
-	bli_obj_set_conj( conjbeta, &beta );
+	bli_obj_set_conj( conjalpha, &alpha );
 
 	// Repeat the experiment n_repeats times and record results. 
 	for ( i = 0; i < n_repeats; ++i )
@@ -203,7 +203,7 @@ void libblis_test_scalm_experiment
 
 		time = bli_clock();
 
-		libblis_test_scalm_impl( iface, &beta, &y );
+		libblis_test_scalm_impl( iface, &alpha, &y );
 
 		time_min = bli_clock_min_diff( time_min, time );
 	}
@@ -213,7 +213,7 @@ void libblis_test_scalm_experiment
 	if ( bli_obj_is_complex( &y ) ) *perf *= 6.0;
 
 	// Perform checks.
-	libblis_test_scalm_check( params, &beta, &y, &y_save, resid );
+	libblis_test_scalm_check( params, &alpha, &y, &y_save, resid );
 
 	// Zero out performance and residual if output matrix is empty.
 	libblis_test_check_empty_problem( &y, perf, resid );
@@ -228,14 +228,14 @@ void libblis_test_scalm_experiment
 void libblis_test_scalm_impl
      (
        iface_t   iface,
-       obj_t*    beta,
+       obj_t*    alpha,
        obj_t*    y
      )
 {
 	switch ( iface )
 	{
 		case BLIS_TEST_SEQ_FRONT_END:
-		bli_scalm( beta, y );
+		bli_scalm( alpha, y );
 		break;
 
 		default:
@@ -248,7 +248,7 @@ void libblis_test_scalm_impl
 void libblis_test_scalm_check
      (
        test_params_t* params,
-       obj_t*         beta,
+       obj_t*         alpha,
        obj_t*         y,
        obj_t*         y_orig,
        double*        resid
@@ -261,7 +261,7 @@ void libblis_test_scalm_check
 	dim_t  n       = bli_obj_width( y );
 
 	obj_t  norm_y_r;
-	obj_t  nbeta;
+	obj_t  nalpha;
 
 	obj_t  y2;
 
@@ -271,16 +271,16 @@ void libblis_test_scalm_check
 	// Pre-conditions:
 	// - y_orig is randomized.
 	// Note:
-	// - beta should have a non-zero imaginary component in the complex
+	// - alpha should have a non-zero imaginary component in the complex
 	//   cases in order to more fully exercise the implementation.
 	//
 	// Under these conditions, we assume that the implementation for
 	//
-	//   y := conjbeta(beta) * y_orig
+	//   y := conjalpha(alpha) * y_orig
 	//
 	// is functioning correctly if
 	//
-	//   normfm( y + -conjbeta(beta) * y_orig )
+	//   normfm( y + -conjalpha(alpha) * y_orig )
 	//
 	// is negligible.
 	//
@@ -288,13 +288,13 @@ void libblis_test_scalm_check
 	bli_obj_create( dt, m, n, 0, 0, &y2 );
 	bli_copym( y_orig, &y2 );
 
-	bli_obj_scalar_init_detached( dt,      &nbeta );
+	bli_obj_scalar_init_detached( dt,      &nalpha );
 	bli_obj_scalar_init_detached( dt_real, &norm_y_r );
 
-	bli_copysc( beta, &nbeta );
-	bli_mulsc( &BLIS_MINUS_ONE, &nbeta );
+	bli_copysc( alpha, &nalpha );
+	bli_mulsc( &BLIS_MINUS_ONE, &nalpha );
 
-	bli_scalm( &nbeta, &y2 );
+	bli_scalm( &nalpha, &y2 );
 	bli_addm( &y2, y );
 	
 	bli_normfm( y, &norm_y_r );
diff --git a/testsuite/src/test_scalv.c b/testsuite/src/test_scalv.c
index 142b5e410..7b409103b 100644
--- a/testsuite/src/test_scalv.c
+++ b/testsuite/src/test_scalv.c
@@ -40,7 +40,7 @@
 // Static variables.
 static char*     op_str                    = "scalv";
 static char*     o_types                   = "v";  // y
-static char*     p_types                   = "c";  // conjbeta
+static char*     p_types                   = "c";  // conjalpha
 static thresh_t  thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 },   // warn, pass for s
                                                { 1e-04, 1e-05 },   // warn, pass for c
                                                { 1e-13, 1e-14 },   // warn, pass for d
@@ -70,14 +70,14 @@ void libblis_test_scalv_experiment
 void libblis_test_scalv_impl
      (
        iface_t   iface,
-       obj_t*    beta,
+       obj_t*    alpha,
        obj_t*    y
      );
 
 void libblis_test_scalv_check
      (
        test_params_t* params,
-       obj_t*         beta,
+       obj_t*         alpha,
        obj_t*         y,
        obj_t*         y_orig,
        double*        resid
@@ -158,9 +158,9 @@ void libblis_test_scalv_experiment
 
 	dim_t        m;
 
-	conj_t       conjbeta;
+	conj_t       conjalpha;
 
-	obj_t        beta, y;
+	obj_t        alpha, y;
 	obj_t        y_save;
 
 
@@ -171,27 +171,27 @@ void libblis_test_scalv_experiment
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
 	// Map parameter characters to BLIS constants.
-	bli_param_map_char_to_blis_conj( pc_str[0], &conjbeta );
+	bli_param_map_char_to_blis_conj( pc_str[0], &conjalpha );
 
 	// Create test scalars.
-	bli_obj_scalar_init_detached( datatype, &beta );
+	bli_obj_scalar_init_detached( datatype, &alpha );
 
 	// Create test operands (vectors and/or matrices).
 	libblis_test_vobj_create( params, datatype, sc_str[0], m, &y );
 	libblis_test_vobj_create( params, datatype, sc_str[0], m, &y_save );
 
-	// Set beta.
+	// Set alpha.
 	if ( bli_obj_is_real( &y ) )
-		bli_setsc( -2.0,  0.0, &beta );
+		bli_setsc( -2.0,  0.0, &alpha );
 	else
-		bli_setsc(  0.0, -2.0, &beta );
+		bli_setsc(  0.0, -2.0, &alpha );
 
 	// Randomize and save y.
 	libblis_test_vobj_randomize( params, FALSE, &y );
 	bli_copyv( &y, &y_save );
 
 	// Apply the parameters.
-	bli_obj_set_conj( conjbeta, &beta );
+	bli_obj_set_conj( conjalpha, &alpha );
 
 	// Repeat the experiment n_repeats times and record results. 
 	for ( i = 0; i < n_repeats; ++i )
@@ -200,7 +200,7 @@ void libblis_test_scalv_experiment
 
 		time = bli_clock();
 
-		libblis_test_scalv_impl( iface, &beta, &y );
+		libblis_test_scalv_impl( iface, &alpha, &y );
 
 		time_min = bli_clock_min_diff( time_min, time );
 	}
@@ -210,7 +210,7 @@ void libblis_test_scalv_experiment
 	if ( bli_obj_is_complex( &y ) ) *perf *= 6.0;
 
 	// Perform checks.
-	libblis_test_scalv_check( params, &beta, &y, &y_save, resid );
+	libblis_test_scalv_check( params, &alpha, &y, &y_save, resid );
 
 	// Zero out performance and residual if output vector is empty.
 	libblis_test_check_empty_problem( &y, perf, resid );
@@ -225,14 +225,14 @@ void libblis_test_scalv_experiment
 void libblis_test_scalv_impl
      (
        iface_t   iface,
-       obj_t*    beta,
+       obj_t*    alpha,
        obj_t*    y
      )
 {
 	switch ( iface )
 	{
 		case BLIS_TEST_SEQ_FRONT_END:
-		bli_scalv( beta, y );
+		bli_scalv( alpha, y );
 		break;
 
 		default:
@@ -245,7 +245,7 @@ void libblis_test_scalv_impl
 void libblis_test_scalv_check
      (
        test_params_t* params,
-       obj_t*         beta,
+       obj_t*         alpha,
        obj_t*         y,
        obj_t*         y_orig,
        double*        resid
@@ -257,7 +257,7 @@ void libblis_test_scalv_check
 	dim_t  m       = bli_obj_vector_dim( y );
 
 	obj_t  norm_y_r;
-	obj_t  nbeta;
+	obj_t  nalpha;
 
 	obj_t  y2;
 
@@ -267,16 +267,16 @@ void libblis_test_scalv_check
 	// Pre-conditions:
 	// - y_orig is randomized.
 	// Note:
-	// - beta should have a non-zero imaginary component in the complex
+	// - alpha should have a non-zero imaginary component in the complex
 	//   cases in order to more fully exercise the implementation.
 	//
 	// Under these conditions, we assume that the implementation for
 	//
-	//   y := conjbeta(beta) * y_orig
+	//   y := conjalpha(alpha) * y_orig
 	//
 	// is functioning correctly if
 	//
-	//   normfv( y + -conjbeta(beta) * y_orig )
+	//   normfv( y + -conjalpha(alpha) * y_orig )
 	//
 	// is negligible.
 	//
@@ -284,13 +284,13 @@ void libblis_test_scalv_check
 	bli_obj_create( dt, m, 1, 0, 0, &y2 );
     bli_copyv( y_orig, &y2 );
 
-	bli_obj_scalar_init_detached( dt,      &nbeta );
+	bli_obj_scalar_init_detached( dt,      &nalpha );
 	bli_obj_scalar_init_detached( dt_real, &norm_y_r );
 
-	bli_copysc( beta, &nbeta );
-	bli_mulsc( &BLIS_MINUS_ONE, &nbeta );
+	bli_copysc( alpha, &nalpha );
+	bli_mulsc( &BLIS_MINUS_ONE, &nalpha );
 
-	bli_scalv( &nbeta, &y2 );
+	bli_scalv( &nalpha, &y2 );
     bli_addv( &y2, y );
 
     bli_normfv( y, &norm_y_r );

From 6e5431e8494b06bd80efcab3abf0a6456d6c0381 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Sat, 10 Sep 2022 15:16:58 -0500
Subject: [PATCH 078/230] Fix line number issue in flattened blis.h. (#660)

Details:
- Updated the top-level Makefile so that it invokes flatten-headers.py
  without the -c option, which was requesting that comments be stripped
  (since comment stripping is disabled by default).
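- Updated flatten-headers.py to accept a new option (-l) to enable
  insertion of #line directives into the output file. This new option
  is disabled by default within the script itself; the top-level
  Makefile now passes -l explicitly when generating the flattened
  headers.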
- Also added logic to flatten-headers.py that outputs a warning if both
  comment stripping and line numbers are requested since the comment
  stripping will cause the line numbers to become inaccurate.
---
 Makefile                 |  8 ++++----
 build/flatten-headers.py | 16 +++++++++++++---
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/Makefile b/Makefile
index 5605dd8fc..e790e8752 100644
--- a/Makefile
+++ b/Makefile
@@ -492,10 +492,10 @@ flat-header: check-env $(BLIS_H_FLAT)
 
 $(BLIS_H_FLAT): $(ALL_H99_FILES)
 ifeq ($(ENABLE_VERBOSE),yes)
-	$(FLATTEN_H) -c -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)"
+	$(FLATTEN_H) -l -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)"
 else
 	@echo -n "Generating monolithic blis.h"
-	@$(FLATTEN_H) -c -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)"
+	@$(FLATTEN_H) -l -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)"
 	@echo "Generated $@"
 endif
 
@@ -505,10 +505,10 @@ flat-cblas-header: check-env $(CBLAS_H_FLAT)
 
 $(CBLAS_H_FLAT): $(FRAME_H99_FILES)
 ifeq ($(ENABLE_VERBOSE),yes)
-	$(FLATTEN_H) -c -v1 $(CBLAS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)"
+	$(FLATTEN_H) -l -v1 $(CBLAS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)"
 else
 	@echo -n "Generating monolithic cblas.h"
-	@$(FLATTEN_H) -c -v1 $(CBLAS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)"
+	@$(FLATTEN_H) -l -v1 $(CBLAS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)"
 	@echo "Generated $@"
 endif
 
diff --git a/build/flatten-headers.py b/build/flatten-headers.py
index ecd4635d1..2d5b74c7a 100755
--- a/build/flatten-headers.py
+++ b/build/flatten-headers.py
@@ -278,14 +278,16 @@ def flatten_header( inputfile, header_dirpaths, cursp ):
 
 				# Mark the beginning of the header being inserted.
 				ostring += "%s%s%c" % ( beginstr, header, '\n' )
-				#ostring += "#line %d \"%s\"%c\n" % ( 1, header_path, '\n' )
+				if line_numbers:
+					ostring += "#line %d \"%s\"%c\n" % ( 1, header_path, '\n' )
 
 				# Recurse on the header, accumulating the string.
 				ostring += flatten_header( header_path, header_dirpaths, cursp + "  " )
 
 				# Mark the end of the header being inserted.
 				ostring += "%s%s%c" % ( endstr, header, '\n' )
-				#ostring += "#line %d \"%s\"%c\n" % ( lineno+1, inputfile, '\n' )
+				if line_numbers:
+					ostring += "#line %d \"%s\"%c\n" % ( lineno+1, inputfile, '\n' )
 
 				echov2( "%sheader file '%s' fully processed." \
 				        % ( cursp, header_path ) )
@@ -350,6 +352,7 @@ def find_header_dirs( dirpath ):
 output_name    = None
 strip_comments = None
 recursive_flag = None
+line_numbers   = None
 verbose_flag   = None
 regex          = None
 root_inputfile = None
@@ -360,6 +363,7 @@ def main():
 	global output_name
 	global strip_comments
 	global recursive_flag
+	global line_numbers
 	global verbose_flag
 	global regex
 	global root_inputfile
@@ -371,13 +375,14 @@ def main():
 
 	strip_comments = False
 	recursive_flag = False
+	line_numbers   = False
 	verbose_flag   = "1"
 
 	nestsp         = "  "
 
 	# Process our command line options.
 	try:
-		opts, args = getopt.getopt( sys.argv[1:], "o:rchv:" )
+		opts, args = getopt.getopt( sys.argv[1:], "o:rclhv:" )
 
 	except getopt.GetoptError as err:
 		# print help information and exit:
@@ -390,6 +395,8 @@ def main():
 			output_name = optarg
 		elif opt == "-r":
 			recursive_flag = True
+		elif opt == "-l":
+			line_numbers = True
 		elif opt == "-c":
 			strip_comments = True
 		elif opt == "-v":
@@ -401,6 +408,9 @@ def main():
 			print_usage()
 			sys.exit()
 
+	if line_numbers and strip_comments:
+		my_print( "WARNING: stripping comments will result in inaccurate line numbers" )
+
 	# Make sure that the verboseness level is valid.
 	if ( verbose_flag != "0" and
 	     verbose_flag != "1" and

From cb74202db39dc8cb81fdd06f8a445f8837e27853 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 13 Sep 2022 11:46:24 -0500
Subject: [PATCH 079/230] Fixed incorrect sizeof(type) in edge case macros.
 (#662)

Details:
- In bli_edge_case_macro_defs.h, the GEMM_UKR_SETUP_CT_PRE() and
  GEMMTRSM_UKR_SETUP_CT_PRE() macros previously declared their temporary
  ct microtiles as:

    PASTEMAC(ch,ctype)
          _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \
               __attribute__((aligned(alignment))); \

  The problem here is that sizeof( PASTEMAC(ch,type) ) evaluates to
  things like sizeof( BLIS_DOUBLE ), not sizeof( double ), and since
  BLIS_DOUBLE is an enum, it is typically an int, which means the
  sizeof() expression is evaluating to the wrong value. This was likely
  a benign bug, though, since BLIS does not support any computational
  datatypes that are smaller than sizeof( int ), which means the ct
  array would be *over*-allocated rather than underallocated. Thanks
  to @moon-chilled for identifying and reporting this bug in #624.
- CREDITS file update.
---
 CREDITS                                  | 1 +
 frame/include/bli_edge_case_macro_defs.h | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/CREDITS b/CREDITS
index 49361c801..152de0a4b 100644
--- a/CREDITS
+++ b/CREDITS
@@ -68,6 +68,7 @@ but many others have contributed code and feedback, including
   Devin Matthews           @devinamatthews     (The University of Texas at Austin)
   Stefanos Mavros          @smavros
   Mithun Mohan             @MithunMohanKadavil (AMD)
+                           @moon-chilled
   Ilknur Mustafazade       @Runkli
                            @nagsingh
   Bhaskar Nallani          @BhaskarNallani     (AMD)
diff --git a/frame/include/bli_edge_case_macro_defs.h b/frame/include/bli_edge_case_macro_defs.h
index 70d97d5d1..6fc4e46c8 100644
--- a/frame/include/bli_edge_case_macro_defs.h
+++ b/frame/include/bli_edge_case_macro_defs.h
@@ -47,7 +47,7 @@
 	PASTEMAC(ch,ctype)* restrict _c      = c; \
 	const inc_t                  _rs_c   = rs_c; \
 	const inc_t                  _cs_c   = cs_c; \
-	PASTEMAC(ch,ctype)           _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \
+	PASTEMAC(ch,ctype)           _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,ctype) ) ] \
 	                                  __attribute__((aligned(alignment))); \
 	const inc_t                  _rs_ct  = row_major ? nr :  1; \
 	const inc_t                  _cs_ct  = row_major ?  1 : mr;
@@ -137,7 +137,7 @@
 	PASTEMAC(ch,ctype)* restrict _c      = c11; \
 	const inc_t                  _rs_c   = rs_c; \
 	const inc_t                  _cs_c   = cs_c; \
-	PASTEMAC(ch,ctype)           _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \
+	PASTEMAC(ch,ctype)           _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,ctype) ) ] \
 	                                  __attribute__((aligned(alignment))); \
 	const inc_t                  _rs_ct  = row_major ? nr :  1; \
 	const inc_t                  _cs_ct  = row_major ?  1 : mr;

From fd885cf98f4fe1d3bc46468e567776c37c670fcc Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 13 Sep 2022 11:50:23 -0500
Subject: [PATCH 080/230] Use kernel CFLAGS for 'kernels' subdirs in addons.
 (#658)

Details:
- Updated Makefile and common.mk so that the targeted configuration's
  kernel CFLAGS are applied to source files that are found in a
  'kernels' subdirectory within an enabled addon. For now, this
  behavior only applies when the 'kernels' directory is at the top
  level of the addon directory structure. For example, if there is an
  addon named 'foobar', the source code must be located in
  addon/foobar/kernels/ in order for it to be compiled with the target
  configurations's kernel CFLAGS. Any other source code within
  addon/foobar/ will be compiled with general-purpose CFLAGS (the same
  ones that were used on all addon code prior to this commit). Thanks
  to AMD (esp. Mithun Mohan) for suggesting this change and catching an
  intermediate bug in the PR.
- Comment/whitespace updates.
---
 Makefile  | 43 +++++++++++++++++++++++++++++++++++++++++--
 common.mk | 32 +++++++++++++++++++++-----------
 2 files changed, 62 insertions(+), 13 deletions(-)

diff --git a/Makefile b/Makefile
index e790e8752..f5396f79b 100644
--- a/Makefile
+++ b/Makefile
@@ -213,8 +213,19 @@ MK_REFKERN_OBJS     := $(foreach arch, $(CONFIG_LIST), \
 MK_FRAME_OBJS       := $(call gen-obj-paths-from-src,$(FRAME_SRC_SUFS),$(MK_FRAME_SRC),$(FRAME_PATH),$(BASE_OBJ_FRAME_PATH))
 
 # Generate object file paths for the addon source code. If one or more addons
-# were not enabled a configure-time, this variable will we empty.
-MK_ADDON_OBJS       := $(call gen-obj-paths-from-src,$(ADDON_SRC_SUFS),$(MK_ADDON_SRC),$(ADDON_PATH),$(BASE_OBJ_ADDON_PATH))
+# were not enabled a configure-time, these variable will we empty.
+# NOTE: We separate the source and objects into kernel and non-kernel lists.
+MK_ADDON_KERS_SRC   := $(foreach addon, $(ADDON_LIST), \
+                           $(filter $(ADDON_PATH)/$(addon)/$(KERNELS_DIR)/%, \
+                                    $(MK_ADDON_SRC)) \
+                        )
+MK_ADDON_OTHER_SRC  := $(foreach addon, $(ADDON_LIST), \
+                           $(filter-out $(ADDON_PATH)/$(addon)/$(KERNELS_DIR)/%, \
+                                        $(MK_ADDON_SRC)) \
+                        )
+MK_ADDON_KERS_OBJS  := $(call gen-obj-paths-from-src,$(ADDON_SRC_SUFS),$(MK_ADDON_KERS_SRC),$(ADDON_PATH),$(BASE_OBJ_ADDON_PATH))
+MK_ADDON_OTHER_OBJS := $(call gen-obj-paths-from-src,$(ADDON_SRC_SUFS),$(MK_ADDON_OTHER_SRC),$(ADDON_PATH),$(BASE_OBJ_ADDON_PATH))
+MK_ADDON_OBJS       := $(MK_ADDON_KERS_OBJS) $(MK_ADDON_OTHER_OBJS)
 
 # Generate object file paths for the sandbox source code. If a sandbox was not
 # enabled a configure-time, this variable will we empty.
@@ -580,6 +591,7 @@ endef
 
 # first argument: a configuration name from the union of config_list and
 # config_name, used to look up the CFLAGS to use during compilation.
+# second argument: the C99 addon file suffix being considered.
 define make-c99-addon-rule
 $(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(BLIS_H_FLAT) $(ADDON_H99_FILES) $(MAKE_DEFS_MK_PATHS)
 ifeq ($(ENABLE_VERBOSE),yes)
@@ -590,6 +602,23 @@ else
 endif
 endef
 
+# first argument: a configuration name from the union of config_list and
+# config_name, used to look up the CFLAGS to use during compilation.
+# second argument: the C99 addon file suffix being considered.
+# third argument: the name of the addon being considered.
+define make-c99-addon-kers-rule
+$(BASE_OBJ_ADDON_PATH)/$(3)/$(KERNELS_DIR)/%.o: $(ADDON_PATH)/$(3)/$(KERNELS_DIR)/%.$(2) $(BLIS_H_FLAT) $(ADDON_H99_FILES) $(MAKE_DEFS_MK_PATHS)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CC) $(call get-addon-kernel-c99flags-for,$(1)) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-addon-kernel-text-for,$(1))
+	@$(CC) $(call get-addon-kernel-c99flags-for,$(1)) -c $$< -o $$@
+endif
+endef
+
+# first argument: a configuration name from the union of config_list and
+# config_name, used to look up the CFLAGS to use during compilation.
+# second argument: the C++ addon file suffix being considered.
 define make-cxx-addon-rule
 $(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(BLIS_H_FLAT) $(ADDON_HXX_FILES) $(MAKE_DEFS_MK_PATHS)
 ifeq ($(ENABLE_VERBOSE),yes)
@@ -602,6 +631,7 @@ endef
 
 # first argument: a configuration name from the union of config_list and
 # config_name, used to look up the CFLAGS to use during compilation.
+# second argument: the C99 sandbox file suffix being considered.
 define make-c99-sandbox-rule
 $(BASE_OBJ_SANDBOX_PATH)/%.o: $(SANDBOX_PATH)/%.$(2) $(BLIS_H_FLAT) $(SANDBOX_H99_FILES) $(MAKE_DEFS_MK_PATHS)
 ifeq ($(ENABLE_VERBOSE),yes)
@@ -612,6 +642,9 @@ else
 endif
 endef
 
+# first argument: a configuration name from the union of config_list and
+# config_name, used to look up the CFLAGS to use during compilation.
+# second argument: the C++ sandbox file suffix being considered.
 define make-cxx-sandbox-rule
 $(BASE_OBJ_SANDBOX_PATH)/%.o: $(SANDBOX_PATH)/%.$(2) $(BLIS_H_FLAT) $(SANDBOX_HXX_FILES) $(MAKE_DEFS_MK_PATHS)
 ifeq ($(ENABLE_VERBOSE),yes)
@@ -657,6 +690,12 @@ $(foreach kset, $(KERNEL_LIST), $(eval $(call make-kernels-rule,$(kset),$(call g
 $(foreach suf, $(ADDON_C99_SUFS), \
 $(foreach conf, $(CONFIG_NAME), $(eval $(call make-c99-addon-rule,$(conf),$(suf)))))
 
+# Instantiate the build rule for C addon/kernels files. Use the CFLAGS for the
+# configuration family.
+$(foreach addon, $(ADDON_LIST), \
+$(foreach suf, $(ADDON_C99_SUFS), \
+$(foreach conf, $(CONFIG_NAME), $(eval $(call make-c99-addon-kers-rule,$(conf),$(suf),$(addon))))))
+
 # Instantiate the build rule for C++ addon files. Use the CFLAGS for the
 # configuration family.
 $(foreach suf, $(ADDON_CXX_SUFS), \
diff --git a/common.mk b/common.mk
index 33713e9f5..b49089419 100644
--- a/common.mk
+++ b/common.mk
@@ -154,7 +154,7 @@ get-kernel-cflags-for    = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
                                    $(BUILD_SYMFLAGS) \
                             )
 
-# When compiling sandboxes, we use flags similar to those of general framework
+# When compiling addons, we use flags similar to those of general framework
 # source. This ensures that the same code can be linked and run across various
 # sub-configurations.
 get-addon-c99flags-for   = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
@@ -169,6 +169,15 @@ get-addon-cxxflags-for   = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
                                    $(BUILD_CPPFLAGS) \
                                    $(BUILD_SYMFLAGS) \
                             )
+# When compiling addon kernels, we use flags similar to those of kernels
+# flags, except we also include the addon header paths.
+get-addon-kernel-c99flags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
+                                        $(call load-var-for,CKVECFLAGS,$(1)) \
+                                        $(call get-noopt-cflags-for,$(1)) \
+                                        $(CADDONINCFLAGS) \
+                                        $(BUILD_CPPFLAGS) \
+                                        $(BUILD_SYMFLAGS) \
+                                 )
 
 # When compiling sandboxes, we use flags similar to those of general framework
 # source. This ensures that the same code can be linked and run across various
@@ -203,16 +212,17 @@ get-user-cflags-for      = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
 
 # Define functions that return messages appropriate for each non-verbose line
 # of compilation output.
-get-noopt-text          = "(CFLAGS for no optimization)"
-get-refinit-text-for    = "('$(1)' CFLAGS for ref. kernel init)"
-get-refkern-text-for    = "('$(1)' CFLAGS for ref. kernels)"
-get-config-text-for     = "('$(1)' CFLAGS for config code)"
-get-frame-text-for      = "('$(1)' CFLAGS for framework code)"
-get-kernel-text-for     = "('$(1)' CFLAGS for kernels)"
-get-addon-c99text-for   = "('$(1)' CFLAGS for addons)"
-get-addon-cxxtext-for   = "('$(1)' CXXFLAGS for addons)"
-get-sandbox-c99text-for = "('$(1)' CFLAGS for sandboxes)"
-get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)"
+get-noopt-text            = "(CFLAGS for no optimization)"
+get-refinit-text-for      = "('$(1)' CFLAGS for ref. kernel init)"
+get-refkern-text-for      = "('$(1)' CFLAGS for ref. kernels)"
+get-config-text-for       = "('$(1)' CFLAGS for config code)"
+get-frame-text-for        = "('$(1)' CFLAGS for framework code)"
+get-kernel-text-for       = "('$(1)' CFLAGS for kernels)"
+get-addon-c99text-for     = "('$(1)' CFLAGS for addons)"
+get-addon-cxxtext-for     = "('$(1)' CXXFLAGS for addons)"
+get-addon-kernel-text-for = "('$(1)' CFLAGS for addon kernels)"
+get-sandbox-c99text-for   = "('$(1)' CFLAGS for sandboxes)"
+get-sandbox-cxxtext-for   = "('$(1)' CXXFLAGS for sandboxes)"
 
 
From 05a811e898b371a76581abd4afa416980cce7db9 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 13 Sep 2022 19:24:05 -0500
Subject: [PATCH 081/230] Initialize rntm_t nt/ways fields with 1 (not -1).
 (#663)

Details:
- Changed the way that rntm_t structs are initialized, mainly so that
  the global rntm_t that is set via environment variables at runtime
  may be queried by the application prior to any computation taking
  place. (Strictly speaking, the application may already query these
  fields, but they do not always contain valid values and often contain
  -1 when they are unset.) These changes also served to clarify how
  these parameters are treated, and homogenized the implementations of
  bli_rntm_set_ways_from_rntm(), bli_rntm_set_ways_from_rntm_sup(), and
  bli_thread_init_rntm_from_env(). Special thanks to Jeff Diamond,
  Leick Robinson, and Devin Matthews for pointing out that the previous
  behavior was needlessly confusing and could be improved.
- The aforementioned modifications also included subtle changes as to
  what counts as "setting" a loop's ways of parallelism for the purposes
  of deciding whether to use the ways or the total number of threads.
  Previously, setting any loop's ways, even to 1, counted in favor of
  using the ways. Now, only values greater than 1 will count as
  "setting", and all other values will silently be mapped to 1, with
  those parameters treated as if they were untouched all along.
- Updated bli_rntm.h and bli_thread.c so that any attempt to set the
  PC_NT variable (or pc_nt field of a rntm_t) will either ignore the
  request or reassert the value as 1.
- Updated bli_rntm_set_ways() so that rather than clear the
  num_threads field, it is set to the product of all of the per-loop
  ways of parallelism.
- Removed code from test_libblis.c that handled the possibility of unset
  environment variables when printing out their values.
- Removed bli_rntm_equals() inline function from bli_rntm.h, which has
  long been disabled.
- Updates to docs/Multithreading.md related to the aforementioned
  changes.
- Comment updates.
---
 docs/Multithreading.md       |  59 +++++----
 frame/base/bli_rntm.c        | 249 ++++++++++++++++++-----------------
 frame/base/bli_rntm.h        |  38 ++----
 frame/thread/bli_thread.c    |  97 ++++++++++----
 testsuite/src/test_libblis.c |  21 +--
 5 files changed, 246 insertions(+), 218 deletions(-)

diff --git a/docs/Multithreading.md b/docs/Multithreading.md
index 48fbc8ca1..8e636f06a 100644
--- a/docs/Multithreading.md
+++ b/docs/Multithreading.md
@@ -47,6 +47,7 @@ $ ./configure --enable-threading=pthreads auto
 ```
 You can also use the shorthand option for `--enable-threading`, which is `-t`:
 ```
+$ ./configure -t openmp auto
 $ ./configure -t pthreads auto
 ```
 For more complete and up-to-date information on the `--enable-threading` option, simply run `configure` with the `--help` (or `-h`) option:
@@ -102,19 +103,19 @@ There are three broad methods of specifying multithreading in BLIS:
 * [Globally at runtime](Multithreading.md#globally-at-runtime)
 * [Locally at runtime](Multithreading.md#locally-at-runtime) (that is, on a per-call, thread-safe basis)
 
-Within these three broad methods there are two specific ways of expressing a request for parallelism. First, the user may express a single number--the total number of threads, or ways of parallelism, to use within a single operation such as `gemm`. We call this the "automatic" way. Alternatively, the user may express the number of ways of parallelism to obtain within *each loop* of the level-3 operation. We call this the "manual" way. The latter way is actually what BLIS eventually needs before it can perform its multithreading; the former is viable only because we have a heuristic of determining a reasonable instance of the latter when given the former.
-This pattern--automatic or manual--holds regardless of which of the three methods is used.
+Within each of these three broad methods there are two specific ways of expressing a request for parallelism. First, the user may express a single number--the total number of threads, or ways of parallelism, to use within a single operation such as `gemm`. We call this the "automatic" way. Alternatively, the user may express the number of ways of parallelism to obtain within *each loop* of the level-3 operation. We call this the "manual" way. The latter way is actually what BLIS eventually needs before it can perform its multithreading; the former is viable only because we have a heuristic of determining a reasonable instance of the latter when given the former.
+This choice--automatic or manual--must be made regardless of which of the three methods is used.
 
 Regardless of which method is employed, and which specific way within each method, after setting the number of threads, the application may call the desired level-3 operation (via either the [typed API](docs/BLISTypedAPI.md) or the [object API](docs/BLISObjectAPI.md)) and the operation will execute in a multithreaded manner. (When calling BLIS via the BLAS API, only the first two (global) methods are available.)
 
 **Note**: Please be aware of what happens if you try to specify both the automatic and manual ways, as it could otherwise confuse new users. Here are the important points:
- * Regardless of which broad method is used, **if multithreading is specified via both the automatic and manual ways, the values set via the manual way will always take precedence.**
- * Specifying parallelism for even *one* loop counts as specifying the manual way (in which case the ways of parallelism for the remaining loops will be assumed to be 1). And in the case of the environment variable method, setting the ways of parallelism for a loop to 1 counts as specifying parallelism! If you want to switch from using the manual way to automatic way, you must not only set (`export`) the `BLIS_NUM_THREADS` variable, but you must also `unset` all of the `BLIS_*_NT` variables.
- * If you have specified multithreading via *both* the automatic and manual ways, BLIS will **not** complain if the values are inconsistent with one another. (For example, you may request 12 total threads be used while also specifying 2 and 4 ways of parallelism within the JC and IC loops, respectively, for a total of 8 ways.) Furthermore, you will be able to query these inconsistent values via the runtime API both before and after multithreading executes.
+ * Regardless of which of the three methods is used, **if multithreading is specified via both the automatic and manual ways, the values set via the manual way will always take precedence.**
+ * Specifying parallelism for even *one* loop counts as specifying the manual way (in which case the ways of parallelism for the remaining loops will be assumed to be 1). (Note: Setting the ways of parallelism for a loop to any value less than or equal to 1 does *not* count as specifying parallelism for that loop; in these cases, the default of 1 will silently be used instead.) If you want to switch from using the manual way to the automatic way, you must not only set (`export`) the `BLIS_NUM_THREADS` variable, but you must also either `unset` all of the `BLIS_*_NT` variables or make sure they are all set to 1.
+ * If you have specified multithreading via *both* the automatic and manual ways, BLIS will **not** complain if the values are inconsistent with one another. (For example, you may request 12 total threads be used while also specifying 2 and 4 ways of parallelism within the JC and IC loops, respectively, for a total of 8 ways. 12 is obviously not equal to 8, and in this case the 8-thread specification will prevail.) Furthermore, you will be able to query these inconsistent values via the runtime API both before and after multithreading executes.
  * If multithreading is disabled, you **may still** specify multithreading values via either the manual or automatic ways. However, BLIS will silently ignore **all** of these values. A BLIS library that is built with multithreading disabled at configure-time will always run sequentially (from the perspective of a single application thread).
 
 Furthermore:
-* For small numbers of threads, the number requested will be honored faithfully. However, if you request a larger number of threads that happens to also be prime, BLIS will reduce the number by one in order to allow more more efficient thread factorizations. This behavior can be overridden by configuring BLIS with the `BLIS_ENABLE_AUTO_PRIME_NUM_THREADS` macro defined in the `bli_family_*.h` file of the relevant subconfiguration. Similarly, the threshold beyond which BLIS will reduce primes by one can be set via `BLIS_NT_MAX_PRIME`. (This latter value is ignored if the former macro is defined.)
+* For small numbers of threads, the number requested will be honored faithfully. However, if you request a larger number of threads that happens to also be prime, BLIS will (by default) reduce the number by one in order to allow more efficient thread factorizations. This behavior (in which `BLIS_DISABLE_AUTO_PRIME_NUM_THREADS` is set by default) can be overridden by configuring BLIS with the `BLIS_ENABLE_AUTO_PRIME_NUM_THREADS` macro defined in the `bli_family_*.h` file of the relevant target configuration. This `BLIS_ENABLE_*` macro will allow BLIS to use any prime number of threads. Note that the threshold beyond which BLIS will reduce primes by one (assuming `BLIS_DISABLE_AUTO_PRIME_NUM_THREADS` is set) can be set via `BLIS_NT_MAX_PRIME`. This value is ignored if `BLIS_ENABLE_AUTO_PRIME_NUM_THREADS` is defined.
 
 ## Globally via environment variables
 
@@ -126,7 +127,7 @@ Regardless of whether you end up using the automatic or manual way of expressing
 
 ### Environment variables: the automatic way
 
-The automatic way of specifying parallelism entails simply setting the total number of threads you wish BLIS to employ in its parallelization. This total number of threads is captured by the `BLIS_NUM_THREADS` environment variable. You can set this variable prior to executing your BLIS-linked executable:
+The automatic way of specifying parallelism entails setting the total number of threads you wish BLIS to employ in its parallelization. This total number of threads is captured by the `BLIS_NUM_THREADS` environment variable. You can set this variable prior to executing your BLIS-linked executable:
 ```
 $ export GOMP_CPU_AFFINITY="..."  # optional step when using GNU libgomp.
 $ export BLIS_NUM_THREADS=16
@@ -134,7 +135,7 @@ $ ./my_blis_program
 ```
 This causes BLIS to automatically determine a reasonable threading strategy based on what is known about the operation and problem size. If `BLIS_NUM_THREADS` is not set, BLIS will attempt to query the value of `OMP_NUM_THREADS`. If neither variable is set, the default number of threads is 1.
 
-**Note**: We *highly* discourage use of the `OMP_NUM_THREADS` environment variable and may remove support for it in the future. If you wish to set parallelism globally via environment variables, please use `BLIS_NUM_THREADS`.
+**Note**: We *highly* discourage use of the `OMP_NUM_THREADS` environment variable to specify multithreading within BLIS and may remove support for it in the future. If you wish to set parallelism globally via environment variables, please use `BLIS_NUM_THREADS`.
 
 ### Environment variables: the manual way
 
@@ -142,15 +143,15 @@ The manual way of specifying parallelism involves communicating which loops with
 
 The below chart describes the five loops used in BLIS's matrix multiplication operations.
 
-| Loop around microkernel  | Environment variable | Direction | Notes          |
-|:-------------------------|:---------------------|:----------|:---------------|
-| 5th loop                 | `BLIS_JC_NT`         | `n`       |                |
-| 4th loop                 | _N/A_                | `k`       | Not enabled    |
-| 3rd loop                 | `BLIS_IC_NT`         | `m`       |                |
-| 2nd loop                 | `BLIS_JR_NT`         | `n`       | Typically <= 4 |
-| 1st loop                 | `BLIS_IR_NT`         | `m`       | Typically 1    |
+| Loop around microkernel  | Environment variable | Direction | Notes                 |
+|:-------------------------|:---------------------|:----------|:----------------------|
+| 5th loop ("JC loop")     | `BLIS_JC_NT`         | `n`       |                       |
+| 4th loop ("PC loop")     | _N/A_                | `k`       | Unavailable; always 1 |
+| 3rd loop ("IC loop")     | `BLIS_IC_NT`         | `m`       |                       |
+| 2nd loop ("JR loop")     | `BLIS_JR_NT`         | `n`       | Typically <= 8        |
+| 1st loop ("IR loop")     | `BLIS_IR_NT`         | `m`       | Typically 1           |
 
-**Note**: Parallelization of the 4th loop is not currently enabled because each iteration of the loop updates the same part of the output matrix C. Thus, to safely parallelize it requires either a reduction or mutex locks when updating C.
+**Note**: Parallelization of the 4th loop is not currently available because each iteration of the loop updates the same part of the output matrix C. Thus, to safely parallelize it requires either a reduction or mutex locks when updating C.
 
 Parallelization in BLIS is hierarchical. So if we parallelize multiple loops, the total number of threads will be the product of the amount of parallelism for each loop. Thus the total number of threads used is the product of all the values:
 `BLIS_JC_NT * BLIS_IC_NT * BLIS_JR_NT * BLIS_IR_NT`.
@@ -169,6 +170,8 @@ Next, which combinations of loops to parallelize depends on which caches are sha
 
 If you still wish to set the parallelization scheme globally, but you want to do so at runtime, BLIS provides a thread-safe API for specifying multithreading. Think of these functions as a way to modify the same internal data structure into which the environment variables are read. (Recall that the environment variables are only read once, when BLIS is initialized).
 
+**Note**: If you set parallelization globally via environment variables and *then* your application *also* uses the global runtime API to set the ways of parallelism, the global runtime API will prevail.
+
 **Note**: Regardless of which way ([automatic](Multithreading.md#globally-at-runtime-the-automatic-way) or [manual](Multithreading.md#globally-at-runtime-the-manual-way)) the global runtime API is used to specify multithreading, that specification will affect operation of BLIS through **both** the BLAS compatibility layer as well as the native ([typed](docs/BLISTypedAPI.md) and [object](docs/BLISObjectAPI.md)) APIs that are unique to BLIS.
 
 ### Globally at runtime: the automatic way
@@ -181,7 +184,7 @@ This function takes one integer--the total number of threads for BLIS to utilize
 ```c
 bli_thread_set_num_threads( 4 );
 ```
-we are requesting that the global number of threads be set to 4. You may also query the global number of threads at any time via
+we are requesting that the total number of threads (ways of parallelism) be set to 4. You may also query the number of threads at any time via
 ```c
 dim_t bli_thread_get_num_threads( void );
 ```
@@ -201,7 +204,7 @@ So, for example, if we call
 ```c
 bli_thread_set_ways( 2, 1, 4, 1, 1 );
 ```
-we are requesting two ways of parallelism in the `JC` loop and 4 ways of parallelism in the `IC` loop.
+we are requesting 2 ways of parallelism in the `JC` loop and 4 ways of parallelism in the `IC` loop.
 Unlike environment variables, which only allow the user to set the parallelization strategy prior to running the executable, `bli_thread_set_ways()` may be called any time during the normal course of the BLIS-linked application's execution.
 
 ## Locally at runtime
@@ -210,15 +213,17 @@ In addition to the global methods based on environment variables and runtime fun
 
 As with environment variables and the global runtime API, there are two ways to specify parallelism: the automatic way and the manual way. Both ways involve allocating a BLIS-specific object, initializing the object and encoding the desired parallelization, and then passing a pointer to the object into one of the expert interfaces of either the [typed](docs/BLISTypedAPI.md) or [object](docs/BLISObjectAPI) APIs. We provide examples of utilizing this threading object below.
 
+**Note**: If you set parallelization globally via environment variables and/or globally via the runtime API, and *then* specify parallelization locally on a per-call basis, the values specified locally will prevail.
+
 **Note**: Neither way ([automatic](Multithreading.md#locally-at-runtime-the-automatic-way) nor [manual](Multithreading.md#locally-at-runtime-the-manual-way)) of specifying multithreading via the local runtime API can be used via the BLAS interfaces. The local runtime API may *only* be used via the native ([typed](docs/BLISTypedAPI.md) and [object](docs/BLISObjectAPI.md)) APIs, which are unique to BLIS. (Furthermore, the expert interfaces of each API must be used. This is demonstrated later on in this section.)
 
 ### Initializing a rntm_t
 
-Before specifying the parallelism (automatically or manually), you must first allocate a special BLIS object called a `rntm_t` (runtime). The object is quite small (about 64 bytes), and so we recommend allocating it statically on the function stack:
+Before specifying the parallelism (automatically or manually), you must first allocate a special BLIS object called a `rntm_t` (runtime). The object is quite small (about 128 bytes), and so we recommend allocating it statically on the function stack:
 ```c
 rntm_t rntm;
 ```
-We **strongly recommend** initializing the `rntm_t`. This can be done in either of two ways.
+You **must** initialize the `rntm_t`. This can be done in either of two ways.
 If you want to initialize it as part of the declaration, you may do so via the default `BLIS_RNTM_INITIALIZER` macro:
 ```c
 rntm_t rntm = BLIS_RNTM_INITIALIZER;
@@ -229,7 +234,7 @@ bli_rntm_init( &rntm );
 ```
 As of this writing, BLIS treats a default-initialized `rntm_t` as a request for single-threaded execution.
 
-**Note**: If you choose to **not** initialize the `rntm_t` object, you **must** set its parallelism via either the automatic way or the manual way, described below. Passing a completely uninitialized `rntm_t` to a level-3 operation **will almost surely result in undefined behavior!**
+**Note**: If you choose to **not** initialize the `rntm_t` object and then pass it into a level-3 operation, **you will almost surely observe undefined behavior!** Please don't do this!
 
 ### Locally at runtime: the automatic way
 
@@ -241,7 +246,7 @@ As with `bli_thread_set_num_threads()` [discussed previously](Multithreading.md#
 ```c
 bli_rntm_set_num_threads( 6, &rntm );
 ```
-the `rntm_t` object will be encoded to use a total of 6 threads. 
+the `rntm_t` object will be encoded to use a total of 6 threads.
 
 ### Locally at runtime: the manual way
 
@@ -250,7 +255,7 @@ Once your `rntm_t` is initialized, you may manually encode the ways of paralleli
 void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm );
 ```
 As with `bli_thread_set_ways()` [discussed previously](Multithreading.md#globally-at-runtime-the-manual-way), this function takes one integer for each loop in the level-3 operations. It also takes the address of the `rntm_t` to modify.
-(**Note**: even though the function takes a `pc` argument, it will be ignored until parallelism is supported in the `KC` loop.)
+(**Note**: even though the function takes a `pc` argument, it will be ignored--and assumed to be 1--until parallelism is supported in the `KC` loop.)
 So, for example, if we call
 ```c
 bli_rntm_set_ways( 1, 1, 2, 3, 1, &rntm );
@@ -259,13 +264,13 @@ we are requesting two ways of parallelism in the `IC` loop and three ways of par
 
 ### Locally at runtime: using the expert interfaces
 
-Regardless of whether you specified parallelism into your `rntm_t` object via the automatic or manual method, eventually you must use the data structure when calling a BLIS operation.
+Regardless of whether you specified parallelism into your `rntm_t` object via the automatic or manual method, eventually you must use the data structure when calling a BLIS operation in order for it to have any effect.
 
-Let's assume you wish to call `gemm`. To so do, simply use the expert interface, which takes two additional arguments: a `cntx_t` (context) and a `rntm_t`. For the context, you may simply pass in `NULL` and BLIS will select a default context (which is exactly what happens when you call the basic/non-expert interfaces). Here is an example of such a call:
+Let's assume you wish to call `gemm`. To do so, use the expert interface, which takes two additional arguments: a `cntx_t` (context) and a `rntm_t`. For the context, you may simply pass in `NULL` and BLIS will select a default context internally (which is exactly what happens for both the `cntx_t*` and `rntm_t*` parameters when you call the basic/non-expert interfaces). Here is an example of such a call:
 ```c
 bli_gemm_ex( &alpha, &a, &b, &beta, &c, NULL, &rntm );
 ```
-This will cause `gemm` to execute and parallelize in the manner encoded by `rntm`.
+This will cause `gemm` to execute and parallelize in the manner encoded by `rntm` (and it will do so using a default `cntx_t*`).
 
 To summarize, using a `rntm_t` involves three steps:
 ```c
@@ -293,7 +298,7 @@ Also, you may pass in `NULL` for the `rntm_t*` parameter of an expert interface.
 
    There are currently no good *and* easy solutions to this problem. Eventually, though, we plan to add support for two microkernels per datatype per configuration--one for use with matrices C that are row-stored, and one for those that are column-stored. This will obviate the logic within BLIS that sometimes induces the operation transposition, and the problem will go away.
    
-* **Thread affinity when BLIS and MKL are used together.** Some users have reported that when running a program that links both BLIS (configured with OpenMP) and MKL, **and** when OpenMP thread affinity has been specified (e.g. via `OMP_PROC_BIND` and `OMP_PLACES`), that very poor performance is observed. This may be due to incorrect thread masking in this case, causing all threads to run on one physical core. The exact circumstances leading to this behavior have not been identified, but unsetting the OpenMP thread affinity variables appears to be a solution.
+* **Thread affinity when BLIS and MKL are used together.** Some users have reported that when running a program that links both BLIS (configured with OpenMP) and MKL, **and** when OpenMP thread affinity has been specified (e.g. via `OMP_PROC_BIND` and `OMP_PLACES`), that very poor performance is observed. This may be due to incorrect thread masking, causing all threads to run on one physical core. The exact circumstances leading to this behavior have not been identified, but unsetting the OpenMP thread affinity variables appears to be a solution.
 
 # Conclusion
 
diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c
index 2c13c74a2..aae0ac043 100644
--- a/frame/base/bli_rntm.c
+++ b/frame/base/bli_rntm.c
@@ -161,70 +161,72 @@ void bli_rntm_set_ways_from_rntm
        rntm_t* rntm
      )
 {
-	dim_t nt = bli_rntm_num_threads( rntm );
+	// NOTE: While much of the multithreading cpp case of this function may seem
+	// redundant with bli_thread_init_rntm_from_env(), we need them both. The
+	// bli_thread_init_rntm_from_env() function is only called to initialize the
+	// global rntm_t. There, the consistency logic serves to make sure that sane
+	// values will be returned if the application (in the time between library
+	// initialization and when computation begins) subsequently queries the
+	// number of threads or ways via the runtime API. This function also needs
+	// the same consistency logic, but for a different reason: this function
+	// guarantees that the rntm_t has sane values in the event that the
+	// application passed in a custom rntm_t via an expert interface.
 
-	dim_t jc = bli_rntm_jc_ways( rntm );
-	dim_t pc = bli_rntm_pc_ways( rntm );
-	dim_t ic = bli_rntm_ic_ways( rntm );
-	dim_t jr = bli_rntm_jr_ways( rntm );
-	dim_t ir = bli_rntm_ir_ways( rntm );
 
 	bool  auto_factor = FALSE;
+	dim_t nt;
+	dim_t jc, pc, ic, jr, ir;
 
 #ifdef BLIS_ENABLE_MULTITHREADING
 
-	bool  nt_set   = FALSE;
-	bool  ways_set = FALSE;
-
-	// If the rntm was fed in as a copy of the global runtime via
-	// bli_rntm_init_from_global(), we know that either:
-	// - the num_threads field is -1 and all of the ways are -1;
-	// - the num_threads field is -1 and all of the ways are set;
-	// - the num_threads field is set and all of the ways are -1.
-	// However, we can't be sure that a user-provided rntm_t isn't
-	// initialized uncleanly. So here we have to enforce some rules
-	// to get the rntm_t into a predictable state.
-
-	// First, we establish whether or not the number of threads is set.
-	if ( nt > 0 ) nt_set = TRUE;
-
-	// Take this opportunity to set the auto_factor field.
-	if ( nt_set ) auto_factor = TRUE;
-
-	// Next, we establish whether or not any of the ways of parallelism
-	// for each loop were set. If any of the ways are set (positive), we
-	// then we assume the user wanted to use those positive values and
-	// default the non-positive values to 1.
-	if ( jc > 0 || pc > 0 || ic > 0 || jr > 0 || ir > 0 )
-	{
-		ways_set = TRUE;
-
-		if ( jc < 1 ) jc = 1;
-		if ( pc < 1 ) pc = 1;
-		if ( ic < 1 ) ic = 1;
-		if ( jr < 1 ) jr = 1;
-		if ( ir < 1 ) ir = 1;
-	}
+	nt = bli_rntm_num_threads( rntm );
+	jc = bli_rntm_jc_ways( rntm );
+	pc = bli_rntm_pc_ways( rntm );
+	ic = bli_rntm_ic_ways( rntm );
+	jr = bli_rntm_jr_ways( rntm );
+	ir = bli_rntm_ir_ways( rntm );
+
+	bool nt_set   = FALSE;
+	bool ways_set = FALSE;
+
+	// Some users are mischievous/dumb. Make sure they don't cause trouble.
+	if ( nt < 1 ) nt = 1;
+	if ( jc < 1 ) jc = 1;
+	if ( pc < 1 ) pc = 1;
+	if ( ic < 1 ) ic = 1;
+	if ( jr < 1 ) jr = 1;
+	if ( ir < 1 ) ir = 1;
+
+	// First, we establish whether or not the number of threads or ways of
+	// parallelism were set to meaningful values.
+	if ( nt > 1 ) nt_set   = TRUE;
+	if ( jc > 1 ) ways_set = TRUE;
+	if ( pc > 1 ) ways_set = TRUE; pc = 1; // Disable pc_nt values.
+	if ( ic > 1 ) ways_set = TRUE;
+	if ( jr > 1 ) ways_set = TRUE;
+	if ( ir > 1 ) ways_set = TRUE;
 
 	// Now we use the values of nt_set and ways_set to determine how to
 	// interpret the original values we found in the rntm_t object.
 
 	if ( ways_set == TRUE )
 	{
-		// If the ways were set, then we use the values that were given
-		// and interpreted above (we set any non-positive value to 1).
-		// The only thing left to do is calculate the correct number of
-		// threads.
+		// If the per-loop ways of parallelism were set, then we use the values
+		// that were given and interpreted above. The only thing left to do is
+		// calculate the correct number of threads. Notice that if the user also
+		// happened to set the total number of threads that value is discarded
+		// in favor of the implied value from the per-loop ways of parallelism.
 
 		nt = jc * pc * ic * jr * ir;
+		auto_factor = FALSE;
 	}
 	else if ( ways_set == FALSE && nt_set == TRUE )
 	{
-		// If the ways were not set but the number of thread was set, then
-		// we attempt to automatically generate a thread factorization that
+		// If the ways were not set but the number of thread was set, then we
+		// will attempt to automatically generate a thread factorization that
 		// will work given the problem size.
 
-#ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
+		#ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
 		// If use of prime numbers is disallowed for automatic thread
 		// factorizations, we first check if the number of threads requested
 		// is prime. If it is prime, and it exceeds a minimum threshold, then
@@ -232,11 +234,11 @@ void bli_rntm_set_ways_from_rntm
 		// prime. This will allow for automatic thread factorizations to span
 		// two dimensions (loops), which tends to be more efficient.
 		if ( bli_is_prime( nt ) && BLIS_NT_MAX_PRIME < nt ) nt -= 1;
-#endif
+		#endif
 
-		pc = 1;
-
-		//printf( "m n = %d %d  BLIS_THREAD_RATIO_M _N = %d %d\n", (int)m, (int)n, (int)BLIS_THREAD_RATIO_M, (int)BLIS_THREAD_RATIO_N );
+		//printf( "m n = %d %d  BLIS_THREAD_RATIO_M _N = %d %d\n",
+		//         (int)m, (int)n, (int)BLIS_THREAD_RATIO_M,
+		//                         (int)BLIS_THREAD_RATIO_N );
 
 		bli_thread_partition_2x2( nt, m*BLIS_THREAD_RATIO_M,
 		                              n*BLIS_THREAD_RATIO_N, &ic, &jc );
@@ -252,27 +254,34 @@ void bli_rntm_set_ways_from_rntm
 		{
 			if ( jc % jr == 0 ) { jc /= jr; break; }
 		}
+
+		// Force the number of ways of parallelism in the pc loop to 1
+		// just in case the caller set it to something greater than 1.
+		pc = 1;
+
+		// Make note that auto-factorization was performed.
+		auto_factor = TRUE;
 	}
 	else // if ( ways_set == FALSE && nt_set == FALSE )
 	{
-		// If neither the ways nor the number of threads were set, then
-		// the rntm was not meaningfully changed since initialization,
-		// and thus we'll default to single-threaded execution.
-
-		nt = 1;
-		jc = pc = ic = jr = ir = 1;
+		// If neither the ways nor the number of threads were set, then the
+		// rntm_t was not meaningfully changed since initialization. This means
+		// the fields are all 1, which will lead to the default behavior of
+		// single-threaded execution.
+		//nt = jc = pc = ic = jr = ir = 1;
+		//auto_factor = FALSE;
 	}
 
 #else
 
-	// When multithreading is disabled, always set the rntm_t ways
-	// values to 1.
+	// When multithreading is disabled, always set the per-loop ways of
+	// parallelism to 1.
 	nt = 1;
 	jc = pc = ic = jr = ir = 1;
 
 #endif
 
-	// Save the results back in the runtime object.
+	// Save the results back in the rntm_t object.
 	bli_rntm_set_auto_factor_only( auto_factor, rntm );
 	bli_rntm_set_num_threads_only( nt, rntm );
 	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
@@ -286,70 +295,60 @@ void bli_rntm_set_ways_from_rntm_sup
        rntm_t* rntm
      )
 {
-	dim_t nt = bli_rntm_num_threads( rntm );
-
-	dim_t jc = bli_rntm_jc_ways( rntm );
-	dim_t pc = bli_rntm_pc_ways( rntm );
-	dim_t ic = bli_rntm_ic_ways( rntm );
-	dim_t jr = bli_rntm_jr_ways( rntm );
-	dim_t ir = bli_rntm_ir_ways( rntm );
-
 	bool  auto_factor = FALSE;
+	dim_t nt;
+	dim_t jc, pc, ic, jr, ir;
 
 #ifdef BLIS_ENABLE_MULTITHREADING
 
-	bool  nt_set   = FALSE;
-	bool  ways_set = FALSE;
-
-	// If the rntm was fed in as a copy of the global runtime via
-	// bli_rntm_init_from_global(), we know that either:
-	// - the num_threads field is -1 and all of the ways are -1;
-	// - the num_threads field is -1 and all of the ways are set;
-	// - the num_threads field is set and all of the ways are -1.
-	// However, we can't be sure that a user-provided rntm_t isn't
-	// initialized uncleanly. So here we have to enforce some rules
-	// to get the rntm_t into a predictable state.
-
-	// First, we establish whether or not the number of threads is set.
-	if ( nt > 0 ) nt_set = TRUE;
-
-	// Take this opportunity to set the auto_factor field.
-	if ( nt_set ) auto_factor = TRUE;
-
-	// Next, we establish whether or not any of the ways of parallelism
-	// for each loop were set. If any of the ways are set (positive), we
-	// then we assume the user wanted to use those positive values and
-	// default the non-positive values to 1.
-	if ( jc > 0 || pc > 0 || ic > 0 || jr > 0 || ir > 0 )
-	{
-		ways_set = TRUE;
-
-		if ( jc < 1 ) jc = 1;
-		if ( pc < 1 ) pc = 1;
-		if ( ic < 1 ) ic = 1;
-		if ( jr < 1 ) jr = 1;
-		if ( ir < 1 ) ir = 1;
-	}
+	nt = bli_rntm_num_threads( rntm );
+	jc = bli_rntm_jc_ways( rntm );
+	pc = bli_rntm_pc_ways( rntm );
+	ic = bli_rntm_ic_ways( rntm );
+	jr = bli_rntm_jr_ways( rntm );
+	ir = bli_rntm_ir_ways( rntm );
+
+	bool nt_set   = FALSE;
+	bool ways_set = FALSE;
+
+	// Some users are mischievous/dumb. Make sure they don't cause trouble.
+	if ( nt < 1 ) nt = 1;
+	if ( jc < 1 ) jc = 1;
+	if ( pc < 1 ) pc = 1;
+	if ( ic < 1 ) ic = 1;
+	if ( jr < 1 ) jr = 1;
+	if ( ir < 1 ) ir = 1;
+
+	// First, we establish whether or not the number of threads or ways of
+	// parallelism were set to meaningful values.
+	if ( nt > 1 ) nt_set   = TRUE;
+	if ( jc > 1 ) ways_set = TRUE;
+	if ( pc > 1 ) ways_set = TRUE; pc = 1; // Disable pc_nt values.
+	if ( ic > 1 ) ways_set = TRUE;
+	if ( jr > 1 ) ways_set = TRUE;
+	if ( ir > 1 ) ways_set = TRUE;
 
 	// Now we use the values of nt_set and ways_set to determine how to
 	// interpret the original values we found in the rntm_t object.
 
 	if ( ways_set == TRUE )
 	{
-		// If the ways were set, then we use the values that were given
-		// and interpreted above (we set any non-positive value to 1).
-		// The only thing left to do is calculate the correct number of
-		// threads.
+		// If the per-loop ways of parallelism were set, then we use the values
+		// that were given and interpreted above. The only thing left to do is
+		// calculate the correct number of threads. Notice that if the user also
+		// happened to set the total number of threads that value is discarded
+		// in favor of the implied value from the per-loop ways of parallelism.
 
 		nt = jc * pc * ic * jr * ir;
+		auto_factor = FALSE;
 	}
 	else if ( ways_set == FALSE && nt_set == TRUE )
 	{
-		// If the ways were not set but the number of thread was set, then
-		// we attempt to automatically generate a thread factorization that
-		// will work given the problem size.
+		// If the ways were not set but the number of thread was set, then we
+		// will attempt to automatically generate a thread factorization that
+		// work given the problem size.
 
-#ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
+		#ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
 		// If use of prime numbers is disallowed for automatic thread
 		// factorizations, we first check if the number of threads requested
 		// is prime. If it is prime, and it exceeds a minimum threshold, then
@@ -357,17 +356,17 @@ void bli_rntm_set_ways_from_rntm_sup
 		// prime. This will allow for automatic thread factorizations to span
 		// two dimensions (loops), which tends to be more efficient.
 		if ( bli_is_prime( nt ) && BLIS_NT_MAX_PRIME < nt ) nt -= 1;
-#endif
-
-		pc = 1;
+		#endif
 
 		//bli_thread_partition_2x2( nt, m*BLIS_THREAD_SUP_RATIO_M,
 		//                              n*BLIS_THREAD_SUP_RATIO_N, &ic, &jc );
 		bli_thread_partition_2x2( nt, m,
 		                              n, &ic, &jc );
 
-//printf( "bli_rntm_set_ways_from_rntm_sup(): jc = %d  ic = %d\n", (int)jc, (int)ic );
-#if 0
+		//printf( "bli_rntm_set_ways_from_rntm_sup(): jc = %d  ic = %d\n",
+		//        (int)jc, (int)ic );
+
+		#if 0
 		for ( ir = BLIS_THREAD_SUP_MAX_IR ; ir > 1 ; ir-- )
 		{
 			if ( ic % ir == 0 ) { ic /= ir; break; }
@@ -377,32 +376,38 @@ void bli_rntm_set_ways_from_rntm_sup
 		{
 			if ( jc % jr == 0 ) { jc /= jr; break; }
 		}
-#else
+		#else
 		ir = 1;
 		jr = 1;
+		#endif
 
-#endif
+		// Force the number of ways of parallelism in the pc loop to 1 just in
+		// case the caller set it to something greater than 1.
+		pc = 1;
+
+		// Make note that auto-factorization was performed.
+		auto_factor = TRUE;
 	}
 	else // if ( ways_set == FALSE && nt_set == FALSE )
 	{
-		// If neither the ways nor the number of threads were set, then
-		// the rntm was not meaningfully changed since initialization,
-		// and thus we'll default to single-threaded execution.
-
-		nt = 1;
-		jc = pc = ic = jr = ir = 1;
+		// If neither the ways nor the number of threads were set, then the
+		// rntm_t was not meaningfully changed since initialization. This means
+		// the fields are all 1, which will lead to the default behavior of
+		// single-threaded execution.
+		//nt = jc = pc = ic = jr = ir = 1;
+		//auto_factor = FALSE;
 	}
 
 #else
 
-	// When multithreading is disabled, always set the rntm_t ways
-	// values to 1.
+	// When multithreading is disabled, always set the per-loop ways of
+	// parallelism to 1.
 	nt = 1;
 	jc = pc = ic = jr = ir = 1;
 
 #endif
 
-	// Save the results back in the runtime object.
+	// Save the results back in the rntm_t object.
 	bli_rntm_set_auto_factor_only( auto_factor, rntm );
 	bli_rntm_set_num_threads_only( nt, rntm );
 	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h
index 2a39f8894..8b6538484 100644
--- a/frame/base/bli_rntm.h
+++ b/frame/base/bli_rntm.h
@@ -46,7 +46,7 @@ typedef struct rntm_s
 	bool      auto_factor;
 
 	dim_t     num_threads;
-	dim_t*    thrloop;
+	dim_t     thrloop[ BLIS_NUM_LOOPS ];
 	bool      pack_a;
 	bool      pack_b;
 	bool      l3_sup;
@@ -129,22 +129,6 @@ BLIS_INLINE pba_t* bli_rntm_pba( const rntm_t* rntm )
 	return rntm->pba;
 }
 
-#if 0
-BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 )
-{
-	const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 );
-	const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 );
-	const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 );
-	const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 );
-	const bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 );
-	const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 );
-	const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 );
-
-	if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE;
-	else                                          return FALSE;
-}
-#endif
-
 //
 // -- rntm_t modification (internal use only) ----------------------------------
 //
@@ -170,7 +154,7 @@ BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm )
 }
 BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm )
 {
-	bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm );
+	bli_rntm_set_ways_for_only( BLIS_KC, 1, rntm );
 }
 BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm )
 {
@@ -193,7 +177,7 @@ BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr,
 {
 	// Record the number of ways of parallelism per loop.
 	bli_rntm_set_jc_ways_only( jc, rntm );
-	bli_rntm_set_pc_ways_only( pc, rntm );
+	bli_rntm_set_pc_ways_only(  1, rntm );
 	bli_rntm_set_ic_ways_only( ic, rntm );
 	bli_rntm_set_jr_ways_only( jr, rntm );
 	bli_rntm_set_ir_ways_only( ir, rntm );
@@ -212,11 +196,11 @@ BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm )
 
 BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm )
 {
-	bli_rntm_set_num_threads_only( -1, rntm );
+	bli_rntm_set_num_threads_only( 1, rntm );
 }
 BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm )
 {
-	bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm );
+	bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );
 }
 BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm )
 {
@@ -244,14 +228,16 @@ BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_
 {
 	// Record the number of ways of parallelism per loop.
 	bli_rntm_set_jc_ways_only( jc, rntm );
-	bli_rntm_set_pc_ways_only( pc, rntm );
+	bli_rntm_set_pc_ways_only(  1, rntm );
 	bli_rntm_set_ic_ways_only( ic, rntm );
 	bli_rntm_set_jr_ways_only( jr, rntm );
 	bli_rntm_set_ir_ways_only( ir, rntm );
 	bli_rntm_set_pr_ways_only(  1, rntm );
 
-	// Set the num_threads field to a default state.
-	bli_rntm_clear_num_threads_only( rntm );
+	// Set the num_threads field to the product of all the ways. The only
+	// benefit of doing this, though, is that the user can query the total
+	// number of threads from the rntm_t after calling this function.
+	bli_rntm_set_num_threads_only( jc * 1 * ic * jr * ir, rntm );
 }
 
 BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm )
@@ -307,8 +293,8 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
 #define BLIS_RNTM_INITIALIZER \
         { \
           .auto_factor = TRUE, \
-          .num_threads = -1, \
-          .thrloop     = { -1, -1, -1, -1, -1, -1 }, \
+          .num_threads = 1, \
+          .thrloop     = { 1, 1, 1, 1, 1, 1 }, \
           .pack_a      = FALSE, \
           .pack_b      = FALSE, \
           .l3_sup      = TRUE, \
diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c
index 7d647a314..0e5afa3f8 100644
--- a/frame/thread/bli_thread.c
+++ b/frame/thread/bli_thread.c
@@ -1564,7 +1564,7 @@ void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir )
 	// Acquire the mutex protecting global_rntm.
 	bli_pthread_mutex_lock( &global_rntm_mutex );
 
-	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, &global_rntm );
+	bli_rntm_set_ways_only( jc, 1, ic, jr, ir, &global_rntm );
 
 	// Release the mutex protecting global_rntm.
 	bli_pthread_mutex_unlock( &global_rntm_mutex );
@@ -1595,6 +1595,17 @@ void bli_thread_init_rntm_from_env
 	// function is only called from bli_thread_init(), which is only called
 	// by bli_init_once().
 
+	// NOTE: While much of the multithreading cpp case of this function may seem
+	// redundant with bli_rntm_set_ways_from_rntm(), we need them both. This
+	// function is only called to initialize the global rntm_t. Here, the
+	// consistency logic serves to make sure that sane values will be returned
+	// if the application (in the time between library initialization and when
+	// computation begins) subsequently queries the number of threads or ways
+	// via the runtime API. The bli_rntm_set_ways_from_rntm() function also
+	// needs the same consistency logic, but for a different reason: that
+	// function guarantees that the rntm_t has sane values in the event that the
+	// application passed in a custom rntm_t via an expert interface.
+
 	bool  auto_factor = FALSE;
 	dim_t nt;
 	dim_t jc, pc, ic, jr, ir;
@@ -1608,44 +1619,72 @@ void bli_thread_init_rntm_from_env
 	if ( nt == -1 )
 		nt = bli_env_get_var( "OMP_NUM_THREADS", -1 );
 
-	// Read the environment variables for the number of threads (ways
-	// of parallelism) for each individual loop.
+	// Read the environment variables for the number of threads (ways of
+	// parallelism) for each individual loop.
 	jc = bli_env_get_var( "BLIS_JC_NT", -1 );
-	pc = bli_env_get_var( "BLIS_PC_NT", -1 );
+	pc = bli_env_get_var( "BLIS_PC_NT", -1 ); pc = 1; // Disable PC_NT values.
 	ic = bli_env_get_var( "BLIS_IC_NT", -1 );
 	jr = bli_env_get_var( "BLIS_JR_NT", -1 );
 	ir = bli_env_get_var( "BLIS_IR_NT", -1 );
 
-	// If any BLIS_*_NT environment variable was set, then we ignore the
-	// value of BLIS_NUM_THREADS or OMP_NUM_THREADS and use the
-	// BLIS_*_NT values instead (with unset variables being treated as if
-	// they contained 1).
-	if ( jc != -1 || pc != -1 || ic != -1 || jr != -1 || ir != -1 )
+	bool nt_set   = FALSE;
+	bool ways_set = FALSE;
+
+	// Some users are mischievous/dumb. Make sure they don't cause trouble.
+	if ( nt < 1 ) nt = 1;
+	if ( jc < 1 ) jc = 1;
+	if ( pc < 1 ) pc = 1;
+	if ( ic < 1 ) ic = 1;
+	if ( jr < 1 ) jr = 1;
+	if ( ir < 1 ) ir = 1;
+
+	// First, we establish whether or not the number of threads or ways of
+	// parallelism were set to meaningful values.
+	if ( nt > 1 ) nt_set   = TRUE;
+	if ( jc > 1 ) ways_set = TRUE;
+	if ( pc > 1 ) ways_set = TRUE;
+	if ( ic > 1 ) ways_set = TRUE;
+	if ( jr > 1 ) ways_set = TRUE;
+	if ( ir > 1 ) ways_set = TRUE;
+
+	// Now we use the values of nt_set and ways_set to determine how to
+	// interpret the original values we found in the rntm_t object.
+
+	if ( ways_set == TRUE )
 	{
-		if ( jc == -1 ) jc = 1;
-		if ( pc == -1 ) pc = 1;
-		if ( ic == -1 ) ic = 1;
-		if ( jr == -1 ) jr = 1;
-		if ( ir == -1 ) ir = 1;
-
-		// Unset the value for nt.
-		nt = -1;
+		// If the per-loop ways of parallelism were set, then we use the values
+		// that were given and interpreted above. The only thing left to do is
+		// calculate the correct number of threads. Notice that if the user also
+		// happened to set BLIS_NUM_THREADS, that value is discarded in favor of
+		// the implied value from the per-loop ways of parallelism.
+
+		nt = jc * pc * ic * jr * ir;
+		auto_factor = FALSE;
+	}
+	else if ( ways_set == FALSE && nt_set == TRUE )
+	{
+		// If the ways were not set but the number of thread was set, then we
+		// will attempt to automatically generate a thread factorization that
+		// will work given the problem size. This auto-factorization will
+		// occur later, in bli_rntm_set_ways_from_rntm(), once we know the
+		// problem size.
+
+		// Make note that auto-factorization will be performed.
+		auto_factor = TRUE;
+	}
+	else // if ( ways_set == FALSE && nt_set == FALSE )
+	{
+		// If neither the ways nor the number of threads were set, then we
+		// allow the default values to stand.
+		//nt = jc = pc = ic = jr = ir = 1;
+		//auto_factor = FALSE;
 	}
-
-	// By this time, one of the following conditions holds:
-	// - nt is -1 and the ways for each loop are -1.
-	// - nt is -1 and the ways for each loop are all set.
-	// - nt is set and the ways for each loop are -1.
-
-	// If nt is set (ie: not -1), then we know we will perform an automatic
-	// thread factorization (later, in bli_rntm.c).
-	if ( nt != -1 ) auto_factor = TRUE;
 
 #else
 
-	// When multithreading is disabled, always set the rntm_t ways
-	// values to 1.
-	nt = -1;
+	// When multithreading is disabled, always set the per-loop ways of
+	// parallelism to 1.
+	nt = 1;
 	jc = pc = ic = jr = ir = 1;
 
 #endif
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index 3ce92e377..3bfde8788 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -795,8 +795,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	char ir_nt_str[16];
 
 	// Query the number of ways of parallelism per loop (and overall) and
-	// convert these values into strings, with "unset" being used if the
-	// value returned was -1 (indicating the environment variable was unset).
+	// convert these values into strings.
 	dim_t nt    = bli_thread_get_num_threads();
 	dim_t jc_nt = bli_thread_get_jc_nt();
 	dim_t pc_nt = bli_thread_get_pc_nt();
@@ -804,18 +803,12 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	dim_t jr_nt = bli_thread_get_jr_nt();
 	dim_t ir_nt = bli_thread_get_ir_nt();
 
-	if (    nt == -1 ) sprintf(    nt_str, "unset" );
-	else               sprintf(    nt_str, "%d", ( int )   nt );
-	if ( jc_nt == -1 ) sprintf( jc_nt_str, "unset" );
-	else               sprintf( jc_nt_str, "%d", ( int )jc_nt );
-	if ( pc_nt == -1 ) sprintf( pc_nt_str, "unset" );
-	else               sprintf( pc_nt_str, "%d", ( int )pc_nt );
-	if ( ic_nt == -1 ) sprintf( ic_nt_str, "unset" );
-	else               sprintf( ic_nt_str, "%d", ( int )ic_nt );
-	if ( jr_nt == -1 ) sprintf( jr_nt_str, "unset" );
-	else               sprintf( jr_nt_str, "%d", ( int )jr_nt );
-	if ( ir_nt == -1 ) sprintf( ir_nt_str, "unset" );
-	else               sprintf( ir_nt_str, "%d", ( int )ir_nt );
+	sprintf(    nt_str, "%d", ( int )   nt );
+	sprintf( jc_nt_str, "%d", ( int )jc_nt );
+	sprintf( pc_nt_str, "%d", ( int )pc_nt );
+	sprintf( ic_nt_str, "%d", ( int )ic_nt );
+	sprintf( jr_nt_str, "%d", ( int )jr_nt );
+	sprintf( ir_nt_str, "%d", ( int )ir_nt );
 
 	// Set up rntm_t objects for each of the four families:
 	// gemm, herk, trmm, trsm.

From 63177dca48cb7d066576d884da4a7a599ececebf Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 15 Sep 2022 11:21:26 -0500
Subject: [PATCH 082/230] Fixed gemmlike sandbox bug introduced in 7c07b47.

Details:
- Fixed a bug in the 'gemmlike' sandbox that was introduced in 7c07b47.
  This bug was the result of the fact that the gemmlike implementation
  uses bli_thrinfo_sup_grow() to grow its thrinfo_t tree, but the
  aforementioned commit added an optimization that kicks in when the
  rntm_t .pack_a and .pack_b fields are both FALSE. Those fields were
  originally added only for sup execution; for the large code path, they
  are intended to be ignored. But the default initial state of a rntm_t
  has those fields set to FALSE, which was inadvertently activating the
  optimization (which targeted single-threaded cases only) and would
  cause multithreaded use cases of 'gemmlike' to segfault. The fix took
  the form of setting the .pack_a and .pack_b fields to TRUE in
  bls_gemm_ex().
- Added minimal 'const' and 'const'-casting to 'gemmlike' so that gcc
  stays quiet.
---
 sandbox/gemmlike/bli_gemm_ex.c | 23 ++++++++++++++---------
 sandbox/gemmlike/bls_gemm.c    | 29 +++++++++++++++++++++--------
 2 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/sandbox/gemmlike/bli_gemm_ex.c b/sandbox/gemmlike/bli_gemm_ex.c
index 96dae1a3a..fe220e603 100644
--- a/sandbox/gemmlike/bli_gemm_ex.c
+++ b/sandbox/gemmlike/bli_gemm_ex.c
@@ -46,13 +46,13 @@
 
 void bli_gemm_ex
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm 
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -65,7 +65,11 @@ void bli_gemm_ex
 	// directly.
 	if ( 1 )
 	{
-		bls_gemm_ex( alpha, a, b, beta, c, cntx, rntm );
+		bls_gemm_ex
+		(
+		  ( obj_t* )alpha, ( obj_t* )a, ( obj_t* )b, ( obj_t* )beta, ( obj_t* )c,
+		  ( cntx_t* )cntx, ( rntm_t* )rntm
+		);
 		return;
 	}
 
@@ -85,7 +89,8 @@ void bli_gemm_ex
 	// Invoke the operation's front end.
 	bli_gemm_front
 	(
-	  alpha, a, b, beta, c, cntx, rntm, NULL
+	  ( obj_t* )alpha, ( obj_t* )a, ( obj_t* )b, ( obj_t* )beta, ( obj_t* )c,
+	  ( cntx_t* )cntx, ( rntm_t* )rntm, NULL
 	);
 }
 
diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c
index ec5d8d5b1..d960928a4 100644
--- a/sandbox/gemmlike/bls_gemm.c
+++ b/sandbox/gemmlike/bls_gemm.c
@@ -78,14 +78,27 @@ void bls_gemm_ex
 	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
 	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
 
+	// Set the .pack_a and .pack_b fields to TRUE. This is only needed because
+	// this sandbox uses bli_thrinfo_sup_grow(), which calls
+	// bli_thrinfo_sup_create_for_cntl(), which employs an optimization if
+	// both fields are FALSE (as is often the case with sup). However, this
+	// sandbox implements the "large" code path, and so both A and B must
+	// always be packed. Setting the fields to TRUE will avoid the optimization
+	// while this sandbox implementation executes (and it also reinforces the
+	// fact that we *are* indeed packing A and B, albeit not in the sup context
+	// originally envisioned for the .pack_a and .pack_b fields).
+	bli_rntm_set_pack_a( TRUE, rntm );
+	bli_rntm_set_pack_b( TRUE, rntm );
+
 	// Obtain a valid (native) context from the gks if necessary.
 	// NOTE: This must be done before calling the _check() function, since
 	// that function assumes the context pointer is valid.
-	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+	if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
 
 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
-		bls_gemm_check( alpha, a, b, beta, c, cntx );
+		bls_gemm_check( ( obj_t* )alpha, ( obj_t* )a, ( obj_t* )b,
+		                ( obj_t* )beta,  ( obj_t* )c, ( cntx_t* )cntx );
 
 	// -- bli_gemm_front() -----------------------------------------------------
 
@@ -163,12 +176,12 @@ void bls_gemm_ex
 	(
 	  bls_gemm_int,
 	  BLIS_GEMM, // operation family id
-	  alpha,
-	  &a_local,
-	  &b_local,
-	  beta,
-	  &c_local,
-	  cntx,
+	  ( obj_t* )alpha,
+	  ( obj_t* )&a_local,
+	  ( obj_t* )&b_local,
+	  ( obj_t* )beta,
+	  ( obj_t* )&c_local,
+	  ( cntx_t* )cntx,
 	  rntm
 	);
 }

From e86076bf4461d1a78186fb21ba8320cfb430f62c Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 15 Sep 2022 14:22:59 -0500
Subject: [PATCH 083/230] Test the 'gemmlike' sandbox via AppVeyor. (#664)

Details:
- Added a fifth test to our .appveyor.yml that enables the 'gemmlike'
  sandbox with OpenMP enabled (via clang, the 'auto' configuration
  target, and building to a static library). Thanks to Jeff Diamond
  for pointing out that this test would be useful.
---
 .appveyor.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.appveyor.yml b/.appveyor.yml
index f4f56fa15..cafad4817 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -23,6 +23,12 @@ environment:
       CC: clang
       THREADING: openmp
 
+    - LIB_TYPE: static
+      CONFIG: auto
+      CC: clang
+      THREADING: openmp
+      SANDBOX: yes
+
 install:
 - set "PATH=C:\msys64\mingw64\bin;C:\msys64\bin;%PATH%"
 - if [%CC%]==[clang] set "PATH=C:\Program Files\LLVM\bin;%PATH%"
@@ -34,6 +40,7 @@ build_script:
 - if [%LIB_TYPE%]==[shared] set "CONFIGURE_OPTS=%CONFIGURE_OPTS% --enable-shared --disable-static"
 - if [%LIB_TYPE%]==[static] set "CONFIGURE_OPTS=%CONFIGURE_OPTS% --disable-shared --enable-static"
 - if not [%CBLAS%]==[no] set "CONFIGURE_OPTS=%CONFIGURE_OPTS% --enable-cblas"
+- if [%SANDBOX%]==[yes] set "CONFIGURE_OPTS=%CONFIGURE_OPTS% -s gemmlike"
 - set RANLIB=echo
 - set LIBPTHREAD=
 - set "PATH=%PATH%;C:\blis\lib"

From fb91337eff1ee2098f315a83888f6667b3a56f86 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 15 Sep 2022 19:08:10 -0500
Subject: [PATCH 084/230] Fixed a harmless pc_nt bug in 05a811e.

Details:
- Added missing curly braces around some statements in bli_rntm.c, one
  of which needed them in order for the relevant code to be executed in
  the intended way. The consequence of 05a811e omitting those braces was
  that a statement (pc_nt = 1;) was executed more often than it needed
  to be.
- Also adjusted the analogous code in bli_thread.c to match that of
  bli_rntm.c.
---
 frame/base/bli_rntm.c     | 24 ++++++++++++------------
 frame/thread/bli_thread.c | 14 +++++++-------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c
index aae0ac043..1411ffaa3 100644
--- a/frame/base/bli_rntm.c
+++ b/frame/base/bli_rntm.c
@@ -199,12 +199,12 @@ void bli_rntm_set_ways_from_rntm
 
 	// First, we establish whether or not the number of threads or ways of
 	// parallelism were set to meaningful values.
-	if ( nt > 1 ) nt_set   = TRUE;
-	if ( jc > 1 ) ways_set = TRUE;
-	if ( pc > 1 ) ways_set = TRUE; pc = 1; // Disable pc_nt values.
-	if ( ic > 1 ) ways_set = TRUE;
-	if ( jr > 1 ) ways_set = TRUE;
-	if ( ir > 1 ) ways_set = TRUE;
+	if ( nt > 1 ) { nt_set   = TRUE; }
+	if ( jc > 1 ) { ways_set = TRUE; }
+	if ( pc > 1 ) { ways_set = TRUE; pc = 1; } // Disable pc_nt values.
+	if ( ic > 1 ) { ways_set = TRUE; }
+	if ( jr > 1 ) { ways_set = TRUE; }
+	if ( ir > 1 ) { ways_set = TRUE; }
 
 	// Now we use the values of nt_set and ways_set to determine how to
 	// interpret the original values we found in the rntm_t object.
@@ -321,12 +321,12 @@ void bli_rntm_set_ways_from_rntm_sup
 
 	// First, we establish whether or not the number of threads or ways of
 	// parallelism were set to meaningful values.
-	if ( nt > 1 ) nt_set   = TRUE;
-	if ( jc > 1 ) ways_set = TRUE;
-	if ( pc > 1 ) ways_set = TRUE; pc = 1; // Disable pc_nt values.
-	if ( ic > 1 ) ways_set = TRUE;
-	if ( jr > 1 ) ways_set = TRUE;
-	if ( ir > 1 ) ways_set = TRUE;
+	if ( nt > 1 ) { nt_set   = TRUE; }
+	if ( jc > 1 ) { ways_set = TRUE; }
+	if ( pc > 1 ) { ways_set = TRUE; pc = 1; } // Disable pc_nt values.
+	if ( ic > 1 ) { ways_set = TRUE; }
+	if ( jr > 1 ) { ways_set = TRUE; }
+	if ( ir > 1 ) { ways_set = TRUE; }
 
 	// Now we use the values of nt_set and ways_set to determine how to
 	// interpret the original values we found in the rntm_t object.
diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c
index 0e5afa3f8..9bad6a456 100644
--- a/frame/thread/bli_thread.c
+++ b/frame/thread/bli_thread.c
@@ -1622,7 +1622,7 @@ void bli_thread_init_rntm_from_env
 	// Read the environment variables for the number of threads (ways of
 	// parallelism) for each individual loop.
 	jc = bli_env_get_var( "BLIS_JC_NT", -1 );
-	pc = bli_env_get_var( "BLIS_PC_NT", -1 ); pc = 1; // Disable PC_NT values.
+	pc = bli_env_get_var( "BLIS_PC_NT", -1 );
 	ic = bli_env_get_var( "BLIS_IC_NT", -1 );
 	jr = bli_env_get_var( "BLIS_JR_NT", -1 );
 	ir = bli_env_get_var( "BLIS_IR_NT", -1 );
@@ -1640,12 +1640,12 @@ void bli_thread_init_rntm_from_env
 
 	// First, we establish whether or not the number of threads or ways of
 	// parallelism were set to meaningful values.
-	if ( nt > 1 ) nt_set   = TRUE;
-	if ( jc > 1 ) ways_set = TRUE;
-	if ( pc > 1 ) ways_set = TRUE;
-	if ( ic > 1 ) ways_set = TRUE;
-	if ( jr > 1 ) ways_set = TRUE;
-	if ( ir > 1 ) ways_set = TRUE;
+	if ( nt > 1 ) { nt_set   = TRUE; }
+	if ( jc > 1 ) { ways_set = TRUE; }
+	if ( pc > 1 ) { ways_set = TRUE; pc = 1; } // Disable pc_nt values.
+	if ( ic > 1 ) { ways_set = TRUE; }
+	if ( jr > 1 ) { ways_set = TRUE; }
+	if ( ir > 1 ) { ways_set = TRUE; }
 
 	// Now we use the values of nt_set and ways_set to determine how to
 	// interpret the original values we found in the rntm_t object.

From 89df7b8fa3a3e47ab2fc10ac4d65d0b9fde16942 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Sun, 18 Sep 2022 18:46:57 -0500
Subject: [PATCH 085/230] De-templatized _sup_var1n2m.c; unified
 _sup_packm_a/b(). (#659)

Details:
- Re-expressed the two variants in frame/3/bli_l3_sup_var1n2m.c as a
  single function each that performs char* pointer arithmetic rather
  than four datatype-specific functions. Did the same for the functions
  in bli_l3_sup_packm_a.c and _sup_packm_b.c, and then unified the two
  into a single set of functions for packing either A or B, which now
  resides in bli_l3_sup_packm.c.
- Pre-grow the cntl_t tree in both bli_l3_sup_var1n2m.c variants rather
  than grow them incrementally.
- Relocated empty-matrix and scale-by-beta early return handling from
  bli_gemm_front() and bli_gemmt_front() to their _ex() counterparts.
- Comment, whitespace updates.
---
 frame/3/bli_l3.h                |    3 +-
 frame/3/bli_l3_oapi_ex.c        |   26 +
 frame/3/bli_l3_sup_packm.c      |  428 +++++++
 frame/3/bli_l3_sup_packm.h      |   95 ++
 frame/3/bli_l3_sup_packm_a.c    |  430 -------
 frame/3/bli_l3_sup_packm_a.h    |  118 --
 frame/3/bli_l3_sup_packm_b.c    |  430 -------
 frame/3/bli_l3_sup_packm_b.h    |  118 --
 frame/3/bli_l3_sup_packm_var.c  |   38 +-
 frame/3/bli_l3_sup_packm_var.h  |   38 +-
 frame/3/bli_l3_sup_var1n2m.c    | 1895 ++++++++++++-------------------
 frame/3/bli_l3_sup_vars.h       |   26 -
 frame/3/gemm/bli_gemm_front.c   |   16 -
 frame/3/gemmt/bli_gemmt_front.c |   16 -
 14 files changed, 1340 insertions(+), 2337 deletions(-)
 create mode 100644 frame/3/bli_l3_sup_packm.c
 create mode 100644 frame/3/bli_l3_sup_packm.h
 delete mode 100644 frame/3/bli_l3_sup_packm_a.c
 delete mode 100644 frame/3/bli_l3_sup_packm_a.h
 delete mode 100644 frame/3/bli_l3_sup_packm_b.c
 delete mode 100644 frame/3/bli_l3_sup_packm_b.h

diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h
index 4dc1a9d54..9d39fc47d 100644
--- a/frame/3/bli_l3.h
+++ b/frame/3/bli_l3.h
@@ -71,8 +71,7 @@
 #include "bli_l3_sup_ref.h"
 #include "bli_l3_sup_int.h"
 #include "bli_l3_sup_vars.h"
-#include "bli_l3_sup_packm_a.h"
-#include "bli_l3_sup_packm_b.h"
+#include "bli_l3_sup_packm.h"
 #include "bli_l3_sup_packm_var.h"
 
 // Prototype microkernel wrapper APIs.
diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c
index 20b0294eb..16e5f15de 100644
--- a/frame/3/bli_l3_oapi_ex.c
+++ b/frame/3/bli_l3_oapi_ex.c
@@ -55,6 +55,19 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF)
 {
 	bli_init_once();
 
+	// If C has a zero dimension, return early.
+	if ( bli_obj_has_zero_dim( c ) ) return;
+
+	// If alpha is zero, or if A or B has a zero dimension, scale C by beta
+	// and return early.
+	if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
+	     bli_obj_has_zero_dim( a ) ||
+	     bli_obj_has_zero_dim( b ) )
+	{
+		bli_scalm( beta, c );
+		return;
+	}
+
 	// If the rntm is non-NULL, it may indicate that we should forgo sup
 	// handling altogether.
 	bool enable_sup = TRUE;
@@ -128,6 +141,19 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)
 {
 	bli_init_once();
 
+	// If C has a zero dimension, return early.
+	if ( bli_obj_has_zero_dim( c ) ) return;
+
+	// If alpha is zero, or if A or B has a zero dimension, scale C by beta
+	// and return early.
+	if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
+	     bli_obj_has_zero_dim( a ) ||
+	     bli_obj_has_zero_dim( b ) )
+	{
+		bli_scalm( beta, c );
+		return;
+	}
+
 	// Initialize a local runtime with global settings if necessary. Note
 	// that in the case that a runtime is passed in, we make a local copy.
 	rntm_t rntm_l;
diff --git a/frame/3/bli_l3_sup_packm.c b/frame/3/bli_l3_sup_packm.c
new file mode 100644
index 000000000..b7a7ee02b
--- /dev/null
+++ b/frame/3/bli_l3_sup_packm.c
@@ -0,0 +1,428 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_packm_sup_init_mem
+     (
+       bool       will_pack,
+       packbuf_t  pack_buf_type,
+       num_t      dt,
+       dim_t      m,
+       dim_t      k,
+       dim_t      mr,
+       rntm_t*    rntm,
+       mem_t*     mem,
+       thrinfo_t* thread
+     )
+{
+	// Inspect whether we are going to be packing the matrix.
+	if ( will_pack == FALSE )
+	{
+	}
+	else // if ( will_pack == TRUE )
+	{
+		// NOTE: This "rounding up" of the last upanel is actually optional
+		// for the rrc/crc cases, but absolutely necessary for the other cases
+		// since we NEED that last micropanel to have the same ldim (cs_p) as
+		// the other micropanels. Why? So that millikernels can use the same
+		// upanel ldim for all iterations of the ir loop.
+		const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr;
+		const dim_t k_pack = k;
+
+		// Barrier to make sure all threads are caught up and ready to begin
+		// the packm stage.
+		bli_thread_barrier( thread );
+
+		// Compute the size of the memory block needed.
+		siz_t size_needed = bli_dt_size( dt ) * m_pack * k_pack;
+
+		// Check the mem_t entry provided by the caller. If it is unallocated,
+		// then we need to acquire a block from the pba.
+		if ( bli_mem_is_unalloc( mem ) )
+		{
+			if ( bli_thread_am_ochief( thread ) )
+			{
+				// Acquire directly to the chief thread's mem_t that was
+				// passed in. It needs to be that mem_t struct, and not a
+				// local (temporary) mem_t, since there is no barrier until
+				// after packing is finished, which could allow a race
+				// condition whereby the chief thread exits the current
+				// function before the other threads have a chance to copy
+				// from it. (A barrier would fix that race condition, but
+				// then again, I prefer to keep barriers to a minimum.)
+				bli_pba_acquire_m
+				(
+				  rntm,
+				  size_needed,
+				  pack_buf_type,
+				  mem
+				);
+			}
+
+			// Broadcast the address of the chief thread's passed-in mem_t
+			// to all threads.
+			mem_t* mem_p = bli_thread_broadcast( thread, mem );
+
+			// Non-chief threads: Copy the contents of the chief thread's
+			// passed-in mem_t to the passed-in mem_t for this thread. (The
+			// chief thread already has the mem_t, so it does not need to
+			// perform any copy.)
+			if ( !bli_thread_am_ochief( thread ) )
+			{
+				*mem = *mem_p;
+			}
+		}
+		else // if ( bli_mem_is_alloc( mem ) )
+		{
+			// If the mem_t entry provided by the caller does NOT contain a NULL
+			// buffer, then a block has already been acquired from the pba and
+			// cached by the caller.
+
+			// As a sanity check, we should make sure that the mem_t object isn't
+			// associated with a block that is too small compared to the size of
+			// the packed matrix buffer that is needed, according to the value
+			// computed above.
+			siz_t mem_size = bli_mem_size( mem );
+
+			if ( mem_size < size_needed )
+			{
+				if ( bli_thread_am_ochief( thread ) )
+				{
+					// The chief thread releases the existing block associated
+					// with the mem_t, and then re-acquires a new block, saving
+					// the associated mem_t to its passed-in mem_t. (See comment
+					// above for why the acquisition needs to be directly to
+					// the chief thread's passed-in mem_t and not a local
+					// (temporary) mem_t.)
+					bli_pba_release
+					(
+					  rntm,
+					  mem
+					);
+					bli_pba_acquire_m
+					(
+					  rntm,
+					  size_needed,
+					  pack_buf_type,
+					  mem
+					);
+				}
+
+				// Broadcast the address of the chief thread's passed-in mem_t
+				// to all threads.
+				mem_t* mem_p = bli_thread_broadcast( thread, mem );
+
+				// Non-chief threads: Copy the contents of the chief thread's
+				// passed-in mem_t to the passed-in mem_t for this thread. (The
+				// chief thread already has the mem_t, so it does not need to
+				// perform any copy.)
+				if ( !bli_thread_am_ochief( thread ) )
+				{
+					*mem = *mem_p;
+				}
+			}
+			else
+			{
+				// If the mem_t entry is already allocated and sufficiently large,
+				// then we use it as-is. No action is needed.
+			}
+		}
+	}
+}
+
+void bli_packm_sup_finalize_mem
+     (
+       bool       did_pack,
+       rntm_t*    rntm,
+       mem_t*     mem,
+       thrinfo_t* thread
+     )
+{
+	// Inspect whether we previously packed the matrix.
+	if ( did_pack == FALSE )
+	{
+		// If we didn't pack the matrix, there's nothing to be done.
+	}
+	else // if ( did_pack == TRUE )
+	{
+		if ( thread != NULL )
+		if ( bli_thread_am_ochief( thread ) )
+		{
+			// Check the mem_t entry provided by the caller. Only proceed if it
+			// is allocated, which it should be.
+			if ( bli_mem_is_alloc( mem ) )
+			{
+				bli_pba_release
+				(
+				  rntm,
+				  mem
+				);
+			}
+		}
+	}
+}
+
+void bli_packm_sup_init
+     (
+             bool    will_pack,
+             stor3_t stor_id,
+             pack_t* schema,
+             dim_t   m,
+             dim_t   k,
+             dim_t   mr,
+             dim_t*  m_max,
+             dim_t*  k_max,
+       const void*   x, inc_t  rs_x, inc_t  cs_x,
+             void**  p, inc_t* rs_p, inc_t* cs_p,
+                        dim_t* pd_p, inc_t* ps_p,
+             mem_t*  mem
+     )
+{
+	// Inspect whether we are going to be packing the matrix.
+	if ( will_pack == FALSE )
+	{
+		*m_max = m;
+		*k_max = k;
+
+		// Set the parameters for use with no packing of x (i.e., using the
+		// source matrix x directly).
+		{
+			// Use the strides of the source matrix as the final values.
+			*rs_p = rs_x;
+			*cs_p = cs_x;
+
+			*pd_p = mr;
+			*ps_p = mr * rs_x;
+
+			// Set the schema to "not packed" to indicate that packing will be
+			// skipped.
+			*schema = BLIS_NOT_PACKED;
+		}
+
+		// Since we won't be packing, simply update the buffer address provided
+		// by the caller to point to source matrix.
+		*p = ( void* )x;
+	}
+	else // if ( will_pack == TRUE )
+	{
+		// NOTE: This "rounding up" of the last upanel is actually optional
+		// for the rrc/crc cases, but absolutely necessary for the other cases
+		// since we NEED that last micropanel to have the same ldim (cs_p) as
+		// the other micropanels. Why? So that millikernels can use the same
+		// upanel ldim for all iterations of the ir loop.
+		*m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr;
+		*k_max = k;
+
+		// Determine the dimensions and strides for the packed matrix A.
+		if ( stor_id == BLIS_RRC ||
+			 stor_id == BLIS_CRC )
+		{
+			// stor3_t id values _RRC and _CRC: pack A to plain row storage.
+			*rs_p = k;
+			*cs_p = 1;
+
+			*pd_p = mr;
+			*ps_p = mr * k;
+
+			// Set the schema to "row packed" to indicate packing to plain
+			// row storage.
+			*schema = BLIS_PACKED_ROWS;
+		}
+		else
+		{
+			// All other stor3_t ids: pack A to column-stored row-panels.
+			*rs_p = 1;
+			*cs_p = mr;
+
+			*pd_p = mr;
+			*ps_p = mr * k;
+
+			// Set the schema to "packed row panels" to indicate packing to
+			// conventional column-stored row panels.
+			*schema = BLIS_PACKED_ROW_PANELS;
+		}
+
+		// Set the buffer address provided by the caller to point to the
+		// memory associated with the mem_t entry acquired from the pba.
+		*p = bli_mem_buffer( mem );
+	}
+}
+
+typedef void (*packm_sup_var1_fp)
+     (
+       trans_t    transc,
+       pack_t     schema,
+       dim_t      m,
+       dim_t      n,
+       dim_t      m_max,
+       dim_t      n_max,
+       void*      kappa,
+       void*      c, inc_t rs_c, inc_t cs_c,
+       void*      p, inc_t rs_p, inc_t cs_p,
+                           dim_t pd_p, inc_t ps_p,
+       cntx_t*    cntx,
+       thrinfo_t* thread
+     );
+
+typedef void (*packm_sup_var2_fp)
+     (
+       trans_t    transc,
+       pack_t     schema,
+       dim_t      m,
+       dim_t      n,
+       void*      kappa,
+       void*      c, inc_t rs_c, inc_t cs_c,
+       void*      p, inc_t rs_p, inc_t cs_p,
+       cntx_t*    cntx,
+       thrinfo_t* thread
+     );
+
+static packm_sup_var1_fp GENARRAY(packm_sup_var1,packm_sup_var1);
+static packm_sup_var2_fp GENARRAY(packm_sup_var2,packm_sup_var2);
+
+//
+// Define BLAS-like interfaces to the variant chooser.
+//
+
+void bli_packm_sup
+     (
+             bool       will_pack,
+             packbuf_t  pack_buf_type,
+             stor3_t    stor_id,
+             trans_t    transc,
+             num_t      dt,
+             dim_t      m_alloc,
+             dim_t      k_alloc,
+             dim_t      m,
+             dim_t      k,
+             dim_t      mr,
+       const void*      kappa,
+       const void*      a, inc_t  rs_a, inc_t  cs_a,
+             void**     p, inc_t* rs_p, inc_t* cs_p,
+                           inc_t* ps_p,
+       const cntx_t*    cntx,
+             rntm_t*    rntm,
+             mem_t*     mem,
+             thrinfo_t* thread
+     )
+{
+	pack_t schema;
+	dim_t  m_max;
+	dim_t  k_max;
+	dim_t  pd_p;
+
+	// Prepare the packing destination buffer. If packing is not requested,
+	// this function will reduce to a no-op.
+	bli_packm_sup_init_mem
+	(
+	  will_pack,
+	  pack_buf_type,
+	  dt, m_alloc, k_alloc, mr,
+      rntm,
+      mem,
+	  thread
+	);
+
+	// Determine the packing buffer and related parameters for matrix A. If A
+	// will not be packed, then *p will be set to point to a and the *rs_p and
+	// *cs_p strides will be set accordingly.
+	bli_packm_sup_init
+	(
+	  will_pack,
+	  stor_id,
+	  &schema,
+	  m, k, mr,
+	  &m_max, &k_max,
+	  a, rs_a,  cs_a,
+	  p, rs_p,  cs_p,
+	     &pd_p, ps_p,
+      mem
+	);
+
+	// Inspect whether we are going to be packing matrix A.
+	if ( will_pack == FALSE )
+	{
+		// If we aren't going to pack matrix A, then there's nothing to do.
+
+		// printf( "blis_ packm_sup_a: not packing A.\n" );
+	}
+	else // if ( will_pack == TRUE )
+	{
+		if ( schema == BLIS_PACKED_ROWS )
+		{
+			// printf( "blis_ packm_sup_a: packing A to rows.\n" );
+
+			// For plain packing by rows, use var2.
+			packm_sup_var2[ dt ]
+			(
+			  transc,
+			  schema,
+			  m,
+			  k,
+			  ( void* )kappa,
+			  ( void* )a,  rs_a,  cs_a,
+			          *p, *rs_p, *cs_p,
+			  ( cntx_t* )cntx,
+			  thread
+			);
+		}
+		else // if ( schema == BLIS_PACKED_ROW_PANELS )
+		{
+			// printf( "blis_ packm_sup_a: packing A to row panels.\n" );
+
+			// For packing to column-stored row panels, use var1.
+			packm_sup_var1[ dt ]
+			(
+			  transc,
+			  schema,
+			  m,
+			  k,
+			  m_max,
+			  k_max,
+			  ( void* )kappa,
+			  ( void* )a,  rs_a,  cs_a,
+			          *p, *rs_p, *cs_p,
+			               pd_p, *ps_p,
+			  ( cntx_t* )cntx,
+			  thread
+			);
+		}
+
+		// Barrier so that packing is done before computation.
+		bli_thread_barrier( thread );
+	}
+}
+
diff --git a/frame/3/bli_l3_sup_packm.h b/frame/3/bli_l3_sup_packm.h
new file mode 100644
index 000000000..a84d4e45c
--- /dev/null
+++ b/frame/3/bli_l3_sup_packm.h
@@ -0,0 +1,95 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+void bli_packm_sup_init_mem
+     (
+       bool       will_pack,
+       packbuf_t  pack_buf_type,
+       num_t      dt,
+       dim_t      m,
+       dim_t      k,
+       dim_t      mr,
+       rntm_t*    rntm,
+       mem_t*     mem,
+       thrinfo_t* thread
+     );
+
+void bli_packm_sup_finalize_mem
+     (
+       bool       did_pack,
+       rntm_t*    rntm,
+       mem_t*     mem,
+       thrinfo_t* thread
+     );
+
+void bli_packm_sup_init
+     (
+             bool       will_pack,
+             stor3_t    stor_id,
+             pack_t*    schema,
+             dim_t      m,
+             dim_t      k,
+             dim_t      mr,
+             dim_t*     m_max,
+             dim_t*     k_max,
+       const void*      x, inc_t  rs_x, inc_t  cs_x,
+             void**     p, inc_t* rs_p, inc_t* cs_p,
+                           dim_t* pd_p, inc_t* ps_p,
+             mem_t*     mem
+     );
+
+void bli_packm_sup
+     (
+             bool       will_pack,
+             packbuf_t  pack_buf_type,
+             stor3_t    stor_id,
+             trans_t    transc,
+             num_t      dt,
+             dim_t      m_alloc,
+             dim_t      k_alloc,
+             dim_t      m,
+             dim_t      k,
+             dim_t      mr,
+       const void*      kappa,
+       const void*      a, inc_t  rs_a, inc_t  cs_a,
+             void**     p, inc_t* rs_p, inc_t* cs_p,
+                           inc_t* ps_p,
+       const cntx_t*    cntx,
+             rntm_t*    rntm,
+             mem_t*     mem,
+             thrinfo_t* thread
+     );
+
diff --git a/frame/3/bli_l3_sup_packm_a.c b/frame/3/bli_l3_sup_packm_a.c
deleted file mode 100644
index 6b73050fd..000000000
--- a/frame/3/bli_l3_sup_packm_a.c
+++ /dev/null
@@ -1,430 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             bool       will_pack, \
-             packbuf_t  pack_buf_type, \
-             dim_t      m, \
-             dim_t      k, \
-             dim_t      mr, \
-       const cntx_t*    cntx, \
-             rntm_t*    rntm, \
-             mem_t*     mem, \
-       const thrinfo_t* thread  \
-     ) \
-{ \
-	/* Inspect whether we are going to be packing matrix A. */ \
-	if ( will_pack == FALSE ) \
-	{ \
-	} \
-	else /* if ( will_pack == TRUE ) */ \
-	{ \
-		/* NOTE: This "rounding up" of the last upanel is actually optional
-		   for the rrc/crc cases, but absolutely necessary for the other cases
-		   since we NEED that last micropanel to have the same ldim (cs_p) as
-		   the other micropanels. Why? So that millikernels can use the same
-		   upanel ldim for all iterations of the ir loop. */ \
-		const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
-		const dim_t k_pack = k; \
-\
-		/* Barrier to make sure all threads are caught up and ready to begin
-		   the packm stage. */ \
-		bli_thread_barrier( thread ); \
-\
-		/* Compute the size of the memory block eneded. */ \
-		siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
-\
-		/* Check the mem_t entry provided by the caller. If it is unallocated,
-		   then we need to acquire a block from the memory broker. */ \
-		if ( bli_mem_is_unalloc( mem ) ) \
-		{ \
-			if ( bli_thread_am_ochief( thread ) ) \
-			{ \
-				/* Acquire directly to the chief thread's mem_t that was
-				   passed in. It needs to be that mem_t struct, and not a
-				   local (temporary) mem_t, since there is no barrier until
-				   after packing is finished, which could allow a race
-				   condition whereby the chief thread exits the current
-				   function before the other threads have a chance to copy
-				   from it. (A barrier would fix that race condition, but
-				   then again, I prefer to keep barriers to a minimum.) */ \
-				bli_pba_acquire_m \
-				( \
-				  rntm, \
-				  size_needed, \
-				  pack_buf_type, \
-				  mem  \
-				); \
-			} \
-\
-			/* Broadcast the address of the chief thread's passed-in mem_t
-			   to all threads. */ \
-			mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
-\
-			/* Non-chief threads: Copy the contents of the chief thread's
-			   passed-in mem_t to the passed-in mem_t for this thread. (The
-			   chief thread already has the mem_t, so it does not need to
-			   perform any copy.) */ \
-			if ( !bli_thread_am_ochief( thread ) ) \
-			{ \
-				*mem = *mem_p; \
-			} \
-		} \
-		else /* if ( bli_mem_is_alloc( mem ) ) */ \
-		{ \
-			/* If the mem_t entry provided by the caller does NOT contain a NULL
-			   buffer, then a block has already been acquired from the memory
-			   broker and cached by the caller. */ \
-\
-			/* As a sanity check, we should make sure that the mem_t object isn't
-			   associated with a block that is too small compared to the size of
-			   the packed matrix buffer that is needed, according to the value
-			   computed above. */ \
-			siz_t mem_size = bli_mem_size( mem ); \
-\
-			if ( mem_size < size_needed ) \
-			{ \
-				if ( bli_thread_am_ochief( thread ) ) \
-				{ \
-					/* The chief thread releases the existing block associated
-					   with the mem_t, and then re-acquires a new block, saving
-					   the associated mem_t to its passed-in mem_t. (See coment
-					   above for why the acquisition needs to be directly to
-					   the chief thread's passed-in mem_t and not a local
-					   (temporary) mem_t. */ \
-					bli_pba_release \
-					( \
-					  rntm, \
-					  mem \
-					); \
-					bli_pba_acquire_m \
-					( \
-					  rntm, \
-					  size_needed, \
-					  pack_buf_type, \
-					  mem \
-					); \
-				} \
-\
-				/* Broadcast the address of the chief thread's passed-in mem_t
-				   to all threads. */ \
-				mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
-\
-				/* Non-chief threads: Copy the contents of the chief thread's
-				   passed-in mem_t to the passed-in mem_t for this thread. (The
-				   chief thread already has the mem_t, so it does not need to
-				   perform any copy.) */ \
-				if ( !bli_thread_am_ochief( thread ) ) \
-				{ \
-					*mem = *mem_p; \
-				} \
-			} \
-			else \
-			{ \
-				/* If the mem_t entry is already allocated and sufficiently large,
-				   then we use it as-is. No action is needed. */ \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_a )
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             bool       did_pack, \
-             rntm_t*    rntm, \
-             mem_t*     mem, \
-       const thrinfo_t* thread  \
-     ) \
-{ \
-	/* Inspect whether we previously packed matrix A. */ \
-	if ( did_pack == FALSE ) \
-	{ \
-		/* If we didn't pack matrix A, there's nothing to be done. */ \
-	} \
-	else /* if ( did_pack == TRUE ) */ \
-	{ \
-		if ( thread != NULL ) \
-		if ( bli_thread_am_ochief( thread ) ) \
-		{ \
-			/* Check the mem_t entry provided by the caller. Only proceed if it
-			   is allocated, which it should be. */ \
-			if ( bli_mem_is_alloc( mem ) ) \
-			{ \
-				bli_pba_release \
-				( \
-				  rntm, \
-				  mem \
-				); \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_a )
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-       bool    will_pack, \
-       stor3_t stor_id, \
-       pack_t* schema, \
-       dim_t   m, \
-       dim_t   k, \
-       dim_t   mr, \
-       dim_t*  m_max, \
-       dim_t*  k_max, \
-       ctype*  a, inc_t  rs_a, inc_t  cs_a, \
-       ctype** p, inc_t* rs_p, inc_t* cs_p, \
-                  dim_t* pd_p, inc_t* ps_p, \
-       cntx_t* cntx, \
-       mem_t*  mem, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	/* Inspect whether we are going to be packing matrix A. */ \
-	if ( will_pack == FALSE ) \
-	{ \
-		*m_max = m; \
-		*k_max = k; \
-\
-		/* Set the parameters for use with no packing of A (ie: using the
-		   source matrix A directly). */ \
-		{ \
-			/* Use the strides of the source matrix as the final values. */ \
-			*rs_p = rs_a; \
-			*cs_p = cs_a; \
-\
-			*pd_p = mr; \
-			*ps_p = mr * rs_a; \
-\
-			/* Set the schema to "not packed" to indicate that packing will be
-			   skipped. */ \
-			*schema = BLIS_NOT_PACKED; \
-		} \
-\
-		/* Since we won't be packing, simply update the buffer address provided
-		   by the caller to point to source matrix. */ \
-		*p = a; \
-	} \
-	else /* if ( will_pack == TRUE ) */ \
-	{ \
-		/* NOTE: This is "rounding up" of the last upanel is actually optional
-		   for the rrc/crc cases, but absolutely necessary for the other cases
-		   since we NEED that last micropanel to have the same ldim (cs_p) as
-		   the other micropanels. Why? So that millikernels can use the same
-		   upanel ldim for all iterations of the ir loop. */ \
-		*m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
-		*k_max = k; \
-\
-		/* Determine the dimensions and strides for the packed matrix A. */ \
-		if ( stor_id == BLIS_RRC || \
-			 stor_id == BLIS_CRC ) \
-		{ \
-			/* stor3_t id values _RRC and _CRC: pack A to plain row storage. */ \
-			*rs_p = k; \
-			*cs_p = 1; \
-\
-			*pd_p = mr; \
-			*ps_p = mr * k; \
-\
-			/* Set the schema to "row packed" to indicate packing to plain
-			   row storage. */ \
-			*schema = BLIS_PACKED_ROWS; \
-		} \
-		else \
-		{ \
-			/* All other stor3_t ids: pack A to column-stored row-panels. */ \
-			*rs_p = 1; \
-			*cs_p = mr; \
-\
-			*pd_p = mr; \
-			*ps_p = mr * k; \
-\
-			/* Set the schema to "packed row panels" to indicate packing to
-			   conventional column-stored row panels. */ \
-			*schema = BLIS_PACKED_ROW_PANELS; \
-		} \
-\
-		/* Set the buffer address provided by the caller to point to the
-		   memory associated with the mem_t entry acquired from the memory
-		   broker. */ \
-		*p = bli_mem_buffer( mem ); \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( packm_sup_init_a )
-
-
-//
-// Define BLAS-like interfaces to the variant chooser.
-//
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-       bool      will_pack, \
-       packbuf_t pack_buf_type, \
-       stor3_t   stor_id, \
-       trans_t   transc, \
-       dim_t     m_alloc, \
-       dim_t     k_alloc, \
-       dim_t     m, \
-       dim_t     k, \
-       dim_t     mr, \
-       ctype*    kappa, \
-       ctype*    a, inc_t  rs_a, inc_t  cs_a, \
-       ctype**   p, inc_t* rs_p, inc_t* cs_p, \
-                                 inc_t* ps_p, \
-       cntx_t*   cntx, \
-       rntm_t*   rntm, \
-       mem_t*    mem, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	pack_t schema; \
-	dim_t  m_max; \
-	dim_t  k_max; \
-	dim_t  pd_p; \
-\
-	/* Prepare the packing destination buffer. If packing is not requested,
-	   this function will reduce to a no-op. */ \
-	PASTEMAC(ch,packm_sup_init_mem_a) \
-	( \
-	  will_pack, \
-	  pack_buf_type, \
-	  m_alloc, k_alloc, mr, \
-	  cntx, \
-	  rntm, \
-	  mem, \
-	  thread  \
-	); \
-\
-	/* Determine the packing buffer and related parameters for matrix A. If A
-	   will not be packed, then a_use will be set to point to a and the _a_use
-	   strides will be set accordingly. */ \
-	PASTEMAC(ch,packm_sup_init_a) \
-	( \
-	  will_pack, \
-	  stor_id, \
-	  &schema, \
-	  m, k, mr, \
-	  &m_max, &k_max, \
-	  a, rs_a,  cs_a, \
-	  p, rs_p,  cs_p, \
-	     &pd_p, ps_p, \
-	  cntx, \
-	  mem, \
-	  thread  \
-	); \
-\
-	/* Inspect whether we are going to be packing matrix A. */ \
-	if ( will_pack == FALSE ) \
-	{ \
-		/* If we aren't going to pack matrix A, then there's nothing to do. */ \
-\
-		/*
-		printf( "blis_ packm_sup_a: not packing A.\n" ); \
-		*/ \
-	} \
-	else /* if ( will_pack == TRUE ) */ \
-	{ \
-		if ( schema == BLIS_PACKED_ROWS ) \
-		{ \
-			/*
-			printf( "blis_ packm_sup_a: packing A to rows.\n" ); \
-			*/ \
-\
-			/* For plain packing by rows, use var2. */ \
-			PASTEMAC(ch,packm_sup_var2) \
-			( \
-			  transc, \
-			  schema, \
-			  m, \
-			  k, \
-			  kappa, \
-			  a,  rs_a,  cs_a, \
-			  *p, *rs_p, *cs_p, \
-			  cntx, \
-			  thread  \
-			); \
-		} \
-		else /* if ( schema == BLIS_PACKED_ROW_PANELS ) */ \
-		{ \
-			/*
-			printf( "blis_ packm_sup_a: packing A to row panels.\n" ); \
-			*/ \
-\
-			/* For packing to column-stored row panels, use var1. */ \
-			PASTEMAC(ch,packm_sup_var1) \
-			( \
-			  transc, \
-			  schema, \
-			  m, \
-			  k, \
-			  m_max, \
-			  k_max, \
-			  kappa, \
-			  a,  rs_a,  cs_a, \
-			  *p, *rs_p, *cs_p, \
-			      pd_p,  *ps_p, \
-			  cntx, \
-			  thread  \
-			); \
-		} \
-\
-		/* Barrier so that packing is done before computation. */ \
-		bli_thread_barrier( thread ); \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( packm_sup_a )
-
diff --git a/frame/3/bli_l3_sup_packm_a.h b/frame/3/bli_l3_sup_packm_a.h
deleted file mode 100644
index 0aaa302c8..000000000
--- a/frame/3/bli_l3_sup_packm_a.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             bool       will_pack, \
-             packbuf_t  pack_buf_type, \
-             dim_t      m, \
-             dim_t      k, \
-             dim_t      mr, \
-       const cntx_t*    cntx, \
-             rntm_t*    rntm, \
-             mem_t*     mem, \
-       const thrinfo_t* thread  \
-     ); \
-
-INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a )
-
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             bool       did_pack, \
-             rntm_t*    rntm, \
-             mem_t*     mem, \
-       const thrinfo_t* thread  \
-     ); \
-
-INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a )
-
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-       bool    will_pack, \
-       stor3_t stor_id, \
-       pack_t* schema, \
-       dim_t   m, \
-       dim_t   k, \
-       dim_t   mr, \
-       dim_t*  m_max, \
-       dim_t*  k_max, \
-       ctype*  a, inc_t  rs_a, inc_t  cs_a, \
-       ctype** p, inc_t* rs_p, inc_t* cs_p, \
-                  dim_t* pd_p, inc_t* ps_p, \
-       cntx_t* cntx, \
-       mem_t*  mem, \
-       thrinfo_t* thread  \
-     ); \
-
-INSERT_GENTPROT_BASIC0( packm_sup_init_a )
-
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-       bool      will_pack, \
-       packbuf_t pack_buf_type, \
-       stor3_t   stor_id, \
-       trans_t   transc, \
-       dim_t     m_alloc, \
-       dim_t     k_alloc, \
-       dim_t     m, \
-       dim_t     k, \
-       dim_t     mr, \
-       ctype*    kappa, \
-       ctype*    a, inc_t  rs_a, inc_t  cs_a, \
-       ctype**   p, inc_t* rs_p, inc_t* cs_p, \
-                                 inc_t* ps_p, \
-       cntx_t*   cntx, \
-       rntm_t*   rntm, \
-       mem_t*    mem, \
-       thrinfo_t* thread  \
-     ); \
-
-INSERT_GENTPROT_BASIC0( packm_sup_a )
-
diff --git a/frame/3/bli_l3_sup_packm_b.c b/frame/3/bli_l3_sup_packm_b.c
deleted file mode 100644
index 7a2030ccf..000000000
--- a/frame/3/bli_l3_sup_packm_b.c
+++ /dev/null
@@ -1,430 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             bool       will_pack, \
-             packbuf_t  pack_buf_type, \
-             dim_t      k, \
-             dim_t      n, \
-             dim_t      nr, \
-       const cntx_t*    cntx, \
-             rntm_t*    rntm, \
-             mem_t*     mem, \
-       const thrinfo_t* thread  \
-     ) \
-{ \
-	/* Inspect whether we are going to be packing matrix B. */ \
-	if ( will_pack == FALSE ) \
-	{ \
-	} \
-	else /* if ( will_pack == TRUE ) */ \
-	{ \
-		/* NOTE: This "rounding up" of the last upanel is actually optional
-		   for the rrc/crc cases, but absolutely necessary for the other cases
-		   since we NEED that last micropanel to have the same ldim (cs_p) as
-		   the other micropanels. Why? So that millikernels can use the same
-		   upanel ldim for all iterations of the ir loop. */ \
-		const dim_t k_pack = k; \
-		const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \
-\
-		/* Barrier to make sure all threads are caught up and ready to begin
-		   the packm stage. */ \
-		bli_thread_barrier( thread ); \
-\
-		/* Compute the size of the memory block eneded. */ \
-		siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
-\
-		/* Check the mem_t entry provided by the caller. If it is unallocated,
-		   then we need to acquire a block from the memory broker. */ \
-		if ( bli_mem_is_unalloc( mem ) ) \
-		{ \
-			if ( bli_thread_am_ochief( thread ) ) \
-			{ \
-				/* Acquire directly to the chief thread's mem_t that was
-				   passed in. It needs to be that mem_t struct, and not a
-				   local (temporary) mem_t, since there is no barrier until
-				   after packing is finished, which could allow a race
-				   condition whereby the chief thread exits the current
-				   function before the other threads have a chance to copy
-				   from it. (A barrier would fix that race condition, but
-				   then again, I prefer to keep barriers to a minimum.) */ \
-				bli_pba_acquire_m \
-				( \
-				  rntm, \
-				  size_needed, \
-				  pack_buf_type, \
-				  mem  \
-				); \
-			} \
-\
-			/* Broadcast the address of the chief thread's passed-in mem_t
-			   to all threads. */ \
-			mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
-\
-			/* Non-chief threads: Copy the contents of the chief thread's
-			   passed-in mem_t to the passed-in mem_t for this thread. (The
-			   chief thread already has the mem_t, so it does not need to
-			   perform any copy.) */ \
-			if ( !bli_thread_am_ochief( thread ) ) \
-			{ \
-				*mem = *mem_p; \
-			} \
-		} \
-		else /* if ( bli_mem_is_alloc( mem ) ) */ \
-		{ \
-			/* If the mem_t entry provided by the caller does NOT contain a NULL
-			   buffer, then a block has already been acquired from the memory
-			   broker and cached by the caller. */ \
-\
-			/* As a sanity check, we should make sure that the mem_t object isn't
-			   associated with a block that is too small compared to the size of
-			   the packed matrix buffer that is needed, according to the value
-			   computed above. */ \
-			siz_t mem_size = bli_mem_size( mem ); \
-\
-			if ( mem_size < size_needed ) \
-			{ \
-				if ( bli_thread_am_ochief( thread ) ) \
-				{ \
-					/* The chief thread releases the existing block associated
-					   with the mem_t, and then re-acquires a new block, saving
-					   the associated mem_t to its passed-in mem_t. (See coment
-					   above for why the acquisition needs to be directly to
-					   the chief thread's passed-in mem_t and not a local
-					   (temporary) mem_t. */ \
-					bli_pba_release \
-					( \
-					  rntm, \
-					  mem \
-					); \
-					bli_pba_acquire_m \
-					( \
-					  rntm, \
-					  size_needed, \
-					  pack_buf_type, \
-					  mem \
-					); \
-				} \
-\
-				/* Broadcast the address of the chief thread's passed-in mem_t
-				   to all threads. */ \
-				mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
-\
-				/* Non-chief threads: Copy the contents of the chief thread's
-				   passed-in mem_t to the passed-in mem_t for this thread. (The
-				   chief thread already has the mem_t, so it does not need to
-				   perform any copy.) */ \
-				if ( !bli_thread_am_ochief( thread ) ) \
-				{ \
-					*mem = *mem_p; \
-				} \
-			} \
-			else \
-			{ \
-				/* If the mem_t entry is already allocated and sufficiently large,
-				   then we use it as-is. No action is needed. */ \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_b )
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             bool       did_pack, \
-             rntm_t*    rntm, \
-             mem_t*     mem, \
-       const thrinfo_t* thread  \
-     ) \
-{ \
-	/* Inspect whether we previously packed matrix A. */ \
-	if ( did_pack == FALSE ) \
-	{ \
-		/* If we didn't pack matrix A, there's nothing to be done. */ \
-	} \
-	else /* if ( did_pack == TRUE ) */ \
-	{ \
-		if ( thread != NULL ) \
-		if ( bli_thread_am_ochief( thread ) ) \
-		{ \
-			/* Check the mem_t entry provided by the caller. Only proceed if it
-			   is allocated, which it should be. */ \
-			if ( bli_mem_is_alloc( mem ) ) \
-			{ \
-				bli_pba_release \
-				( \
-				  rntm, \
-				  mem \
-				); \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_b )
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-       bool    will_pack, \
-       stor3_t stor_id, \
-       pack_t* schema, \
-       dim_t   k, \
-       dim_t   n, \
-       dim_t   nr, \
-       dim_t*  k_max, \
-       dim_t*  n_max, \
-       ctype*  b, inc_t  rs_b, inc_t  cs_b, \
-       ctype** p, inc_t* rs_p, inc_t* cs_p, \
-                  dim_t* pd_p, inc_t* ps_p, \
-       cntx_t* cntx, \
-       mem_t*  mem, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	/* Inspect whether we are going to be packing matrix B. */ \
-	if ( will_pack == FALSE ) \
-	{ \
-		*k_max = k; \
-		*n_max = n; \
-\
-		/* Set the parameters for use with no packing of B (ie: using the
-		   source matrix B directly). */ \
-		{ \
-			/* Use the strides of the source matrix as the final values. */ \
-			*rs_p = rs_b; \
-			*cs_p = cs_b; \
-\
-			*pd_p = nr; \
-			*ps_p = nr * cs_b; \
-\
-			/* Set the schema to "not packed" to indicate that packing will be
-			   skipped. */ \
-			*schema = BLIS_NOT_PACKED; \
-		} \
-\
-		/* Since we won't be packing, simply update the buffer address provided
-		   by the caller to point to source matrix. */ \
-		*p = b; \
-	} \
-	else /* if ( will_pack == TRUE ) */ \
-	{ \
-		/* NOTE: This is "rounding up" of the last upanel is actually optional
-		   for the rrc/crc cases, but absolutely necessary for the other cases
-		   since we NEED that last micropanel to have the same ldim (cs_p) as
-		   the other micropanels. Why? So that millikernels can use the same
-		   upanel ldim for all iterations of the ir loop. */ \
-		*k_max = k; \
-		*n_max = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \
-\
-		/* Determine the dimensions and strides for the packed matrix B. */ \
-		if ( stor_id == BLIS_RRC || \
-			 stor_id == BLIS_CRC ) \
-		{ \
-			/* stor3_t id values _RRC and _CRC: pack B to plain row storage. */ \
-			*rs_p = 1; \
-			*cs_p = k; \
-\
-			*pd_p = nr; \
-			*ps_p = k * nr; \
-\
-			/* Set the schema to "column packed" to indicate packing to plain
-			   column storage. */ \
-			*schema = BLIS_PACKED_COLUMNS; \
-		} \
-		else \
-		{ \
-			/* All other stor3_t ids: pack B to row-stored column-panels. */ \
-			*rs_p = nr; \
-			*cs_p = 1; \
-\
-			*pd_p = nr; \
-			*ps_p = k * nr; \
-\
-			/* Set the schema to "packed column panels" to indicate packing to
-			   conventional row-stored column panels. */ \
-			*schema = BLIS_PACKED_COL_PANELS; \
-		} \
-\
-		/* Set the buffer address provided by the caller to point to the
-		   memory associated with the mem_t entry acquired from the memory
-		   broker. */ \
-		*p = bli_mem_buffer( mem ); \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( packm_sup_init_b )
-
-
-//
-// Define BLAS-like interfaces to the variant chooser.
-//
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-       bool      will_pack, \
-       packbuf_t pack_buf_type, \
-       stor3_t   stor_id, \
-       trans_t   transc, \
-       dim_t     k_alloc, \
-       dim_t     n_alloc, \
-       dim_t     k, \
-       dim_t     n, \
-       dim_t     nr, \
-       ctype*    kappa, \
-       ctype*    b, inc_t  rs_b, inc_t  cs_b, \
-       ctype**   p, inc_t* rs_p, inc_t* cs_p, \
-                                 inc_t* ps_p, \
-       cntx_t*   cntx, \
-       rntm_t*   rntm, \
-       mem_t*    mem, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	pack_t schema; \
-	dim_t  k_max; \
-	dim_t  n_max; \
-	dim_t  pd_p; \
-\
-	/* Prepare the packing destination buffer. If packing is not requested,
-	   this function will reduce to a no-op. */ \
-	PASTEMAC(ch,packm_sup_init_mem_b) \
-	( \
-	  will_pack, \
-	  pack_buf_type, \
-	  k_alloc, n_alloc, nr, \
-	  cntx, \
-	  rntm, \
-	  mem, \
-	  thread  \
-	); \
-\
-	/* Determine the packing buffer and related parameters for matrix B. If B
-	   will not be packed, then b_use will be set to point to b and the _b_use
-	   strides will be set accordingly. */ \
-	PASTEMAC(ch,packm_sup_init_b) \
-	( \
-	  will_pack, \
-	  stor_id, \
-	  &schema, \
-	  k, n, nr, \
-	  &k_max, &n_max, \
-	  b, rs_b,  cs_b, \
-	  p, rs_p,  cs_p, \
-	     &pd_p, ps_p, \
-	  cntx, \
-	  mem, \
-	  thread  \
-	); \
-\
-	/* Inspect whether we are going to be packing matrix B. */ \
-	if ( will_pack == FALSE ) \
-	{ \
-		/* If we aren't going to pack matrix B, then there's nothing to do. */ \
-\
-		/*
-		printf( "blis_ packm_sup_b: not packing B.\n" ); \
-		*/ \
-	} \
-	else /* if ( will_pack == TRUE ) */ \
-	{ \
-		if ( schema == BLIS_PACKED_COLUMNS ) \
-		{ \
-			/*
-			printf( "blis_ packm_sup_b: packing B to columns.\n" ); \
-			*/ \
-\
-			/* For plain packing by columns, use var2. */ \
-			PASTEMAC(ch,packm_sup_var2) \
-			( \
-			  transc, \
-			  schema, \
-			  k, \
-			  n, \
-			  kappa, \
-			  b,  rs_b,  cs_b, \
-			  *p, *rs_p, *cs_p, \
-			  cntx, \
-			  thread  \
-			); \
-		} \
-		else /* if ( schema == BLIS_PACKED_COL_PANELS ) */ \
-		{ \
-			/*
-			printf( "blis_ packm_sup_b: packing B to col panels.\n" ); \
-			*/ \
-\
-			/* For packing to row-stored column panels, use var1. */ \
-			PASTEMAC(ch,packm_sup_var1) \
-			( \
-			  transc, \
-			  schema, \
-			  k, \
-			  n, \
-			  k_max, \
-			  n_max, \
-			  kappa, \
-			  b,  rs_b,  cs_b, \
-			  *p, *rs_p, *cs_p, \
-			      pd_p,  *ps_p, \
-			  cntx, \
-			  thread  \
-			); \
-		} \
-\
-		/* Barrier so that packing is done before computation. */ \
-		bli_thread_barrier( thread ); \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( packm_sup_b )
-
diff --git a/frame/3/bli_l3_sup_packm_b.h b/frame/3/bli_l3_sup_packm_b.h
deleted file mode 100644
index bd18e5887..000000000
--- a/frame/3/bli_l3_sup_packm_b.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             bool       will_pack, \
-             packbuf_t  pack_buf_type, \
-             dim_t      k, \
-             dim_t      n, \
-             dim_t      nr, \
-       const cntx_t*    cntx, \
-             rntm_t*    rntm, \
-             mem_t*     mem, \
-       const thrinfo_t* thread  \
-     ); \
-
-INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b )
-
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             bool       did_pack, \
-             rntm_t*    rntm, \
-             mem_t*     mem, \
-       const thrinfo_t* thread  \
-     ); \
-
-INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b )
-
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-       bool    will_pack, \
-       stor3_t stor_id, \
-       pack_t* schema, \
-       dim_t   k, \
-       dim_t   n, \
-       dim_t   nr, \
-       dim_t*  k_max, \
-       dim_t*  n_max, \
-       ctype*  b, inc_t  rs_b, inc_t  cs_b, \
-       ctype** p, inc_t* rs_p, inc_t* cs_p, \
-                  dim_t* pd_p, inc_t* ps_p, \
-       cntx_t* cntx, \
-       mem_t*  mem, \
-       thrinfo_t* thread  \
-     ); \
-
-INSERT_GENTPROT_BASIC0( packm_sup_init_b )
-
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-       bool      will_pack, \
-       packbuf_t pack_buf_type, \
-       stor3_t   stor_id, \
-       trans_t   transc, \
-       dim_t     k_alloc, \
-       dim_t     n_alloc, \
-       dim_t     k, \
-       dim_t     n, \
-       dim_t     nr, \
-       ctype*    kappa, \
-       ctype*    b, inc_t  rs_b, inc_t  cs_b, \
-       ctype**   p, inc_t* rs_p, inc_t* cs_p, \
-                                 inc_t* ps_p, \
-       cntx_t*   cntx, \
-       rntm_t*   rntm, \
-       mem_t*    mem, \
-       thrinfo_t* thread  \
-     ); \
-
-INSERT_GENTPROT_BASIC0( packm_sup_b )
-
diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c
index 54ecab8ff..357251002 100644
--- a/frame/3/bli_l3_sup_packm_var.c
+++ b/frame/3/bli_l3_sup_packm_var.c
@@ -44,17 +44,17 @@
 \
 void PASTEMAC(ch,varname) \
      ( \
-       trans_t transc, \
-       pack_t  schema, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   m_max, \
-       dim_t   n_max, \
-       ctype*  kappa, \
-       ctype*  c, inc_t rs_c, inc_t cs_c, \
-       ctype*  p, inc_t rs_p, inc_t cs_p, \
-                  dim_t pd_p, inc_t ps_p, \
-       cntx_t* cntx, \
+       trans_t    transc, \
+       pack_t     schema, \
+       dim_t      m, \
+       dim_t      n, \
+       dim_t      m_max, \
+       dim_t      n_max, \
+       void*      kappa, \
+       void*      c, inc_t rs_c, inc_t cs_c, \
+       void*      p, inc_t rs_p, inc_t cs_p, \
+                     dim_t pd_p, inc_t ps_p, \
+       cntx_t*    cntx, \
        thrinfo_t* thread  \
      ) \
 { \
@@ -317,14 +317,14 @@ bli_thread_barrier( thread ); \
 \
 void PASTEMAC(ch,varname) \
      ( \
-       trans_t transc, \
-       pack_t  schema, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  kappa, \
-       ctype*  c, inc_t rs_c, inc_t cs_c, \
-       ctype*  p, inc_t rs_p, inc_t cs_p, \
-       cntx_t* cntx, \
+       trans_t    transc, \
+       pack_t     schema, \
+       dim_t      m, \
+       dim_t      n, \
+       void*      kappa, \
+       void*      c, inc_t rs_c, inc_t cs_c, \
+       void*      p, inc_t rs_p, inc_t cs_p, \
+       cntx_t*    cntx, \
        thrinfo_t* thread  \
      ) \
 { \
diff --git a/frame/3/bli_l3_sup_packm_var.h b/frame/3/bli_l3_sup_packm_var.h
index 9c62c9c68..17cf9a482 100644
--- a/frame/3/bli_l3_sup_packm_var.h
+++ b/frame/3/bli_l3_sup_packm_var.h
@@ -42,17 +42,17 @@
 \
 void PASTEMAC(ch,varname) \
      ( \
-       trans_t transc, \
-       pack_t  schema, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   m_max, \
-       dim_t   n_max, \
-       ctype*  kappa, \
-       ctype*  c, inc_t rs_c, inc_t cs_c, \
-       ctype*  p, inc_t rs_p, inc_t cs_p, \
-                  dim_t pd_p, inc_t ps_p, \
-       cntx_t* cntx, \
+       trans_t    transc, \
+       pack_t     schema, \
+       dim_t      m, \
+       dim_t      n, \
+       dim_t      m_max, \
+       dim_t      n_max, \
+       void*      kappa, \
+       void*      c, inc_t rs_c, inc_t cs_c, \
+       void*      p, inc_t rs_p, inc_t cs_p, \
+                     dim_t pd_p, inc_t ps_p, \
+       cntx_t*    cntx, \
        thrinfo_t* thread  \
      );
 
@@ -63,14 +63,14 @@ INSERT_GENTPROT_BASIC0( packm_sup_var1 )
 \
 void PASTEMAC(ch,varname) \
      ( \
-       trans_t transc, \
-       pack_t  schema, \
-       dim_t   m, \
-       dim_t   n, \
-       ctype*  kappa, \
-       ctype*  c, inc_t rs_c, inc_t cs_c, \
-       ctype*  p, inc_t rs_p, inc_t cs_p, \
-       cntx_t* cntx, \
+       trans_t    transc, \
+       pack_t     schema, \
+       dim_t      m, \
+       dim_t      n, \
+       void*      kappa, \
+       void*      c, inc_t rs_c, inc_t cs_c, \
+       void*      p, inc_t rs_p, inc_t cs_p, \
+       cntx_t*    cntx, \
        thrinfo_t* thread  \
      );
 
diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c
index 61c85d6e9..76f1a96b7 100644
--- a/frame/3/bli_l3_sup_var1n2m.c
+++ b/frame/3/bli_l3_sup_var1n2m.c
@@ -34,34 +34,10 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T gemmsup_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       bool       packa,
-       bool       packb,
-       conj_t     conja,
-       conj_t     conjb,
-       dim_t      m,
-       dim_t      n,
-       dim_t      k,
-       void*      alpha,
-       void*      a, inc_t rs_a, inc_t cs_a,
-       void*      b, inc_t rs_b, inc_t cs_b,
-       void*      beta,
-       void*      c, inc_t rs_c, inc_t cs_c,
-       stor3_t    eff_id,
-       cntx_t*    cntx,
-       rntm_t*    rntm,
-       thrinfo_t* thread
-     );
-
 //
 // -- var1n --------------------------------------------------------------------
 //
 
-static FUNCPTR_T GENARRAY(ftypes_var1n,gemmsup_ref_var1n);
-
 void bli_gemmsup_ref_var1n
      (
              trans_t trans,
@@ -70,67 +46,31 @@ void bli_gemmsup_ref_var1n
        const obj_t*  b,
        const obj_t*  beta,
        const obj_t*  c,
-             stor3_t eff_id,
+             stor3_t stor_id,
        const cntx_t* cntx,
              rntm_t* rntm,
              thrinfo_t* thread
      )
 {
-#if 0
-	obj_t at, bt;
-
-	bli_obj_alias_to( a, &at );
-	bli_obj_alias_to( b, &bt );
-
-	// Induce transpositions on A and/or B if either object is marked for
-	// transposition. We can induce "fast" transpositions since they objects
-	// are guaranteed to not have structure or be packed.
-	if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
-	if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }
-
-	const num_t    dt        = bli_obj_dt( c );
-
-	const conj_t   conja     = bli_obj_conj_status( a );
-	const conj_t   conjb     = bli_obj_conj_status( b );
-
-	const dim_t    m         = bli_obj_length( c );
-	const dim_t    n         = bli_obj_width( c );
-
-	const dim_t    k         = bli_obj_width( &at );
-
-	void* buf_a     = bli_obj_buffer_at_off( &at );
-	const inc_t    rs_a      = bli_obj_row_stride( &at );
-	const inc_t    cs_a      = bli_obj_col_stride( &at );
-
-	void* buf_b     = bli_obj_buffer_at_off( &bt );
-	const inc_t    rs_b      = bli_obj_row_stride( &bt );
-	const inc_t    cs_b      = bli_obj_col_stride( &bt );
-
-	void* buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t    rs_c      = bli_obj_row_stride( c );
-	const inc_t    cs_c      = bli_obj_col_stride( c );
+	const num_t  dt      = bli_obj_dt( c );
 
-	void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
-	void* buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
+	const dim_t  dt_size = bli_dt_size( dt );
 
-#else
-	const num_t  dt    = bli_obj_dt( c );
+	      bool   packa   = bli_rntm_pack_a( rntm );
+	      bool   packb   = bli_rntm_pack_b( rntm );
 
-	const bool   packa = bli_rntm_pack_a( rntm );
-	const bool   packb = bli_rntm_pack_b( rntm );
+	      conj_t conja   = bli_obj_conj_status( a );
+	      conj_t conjb   = bli_obj_conj_status( b );
 
-	const conj_t conja = bli_obj_conj_status( a );
-	const conj_t conjb = bli_obj_conj_status( b );
-
-	const dim_t  m     = bli_obj_length( c );
-	const dim_t  n     = bli_obj_width( c );
+	      dim_t  m       = bli_obj_length( c );
+	      dim_t  n       = bli_obj_width( c );
 	      dim_t  k;
 
-	const void*  buf_a = bli_obj_buffer_at_off( a );
+	const void*  buf_a   = bli_obj_buffer_at_off( a );
 	      inc_t  rs_a;
 	      inc_t  cs_a;
 
-	const void*  buf_b = bli_obj_buffer_at_off( b );
+	const void*  buf_b   = bli_obj_buffer_at_off( b );
 	      inc_t  rs_b;
 	      inc_t  cs_b;
 
@@ -163,556 +103,407 @@ void bli_gemmsup_ref_var1n
 	}
 
 	      void* buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t rs_c      = bli_obj_row_stride( c );
-	const inc_t cs_c      = bli_obj_col_stride( c );
+	      inc_t rs_c      = bli_obj_row_stride( c );
+	      inc_t cs_c      = bli_obj_col_stride( c );
 
 	const void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
 	const void* buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
 
-#endif
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	FUNCPTR_T f = ftypes_var1n[dt];
-
 #if 1
 	// Optimize some storage/packing cases by transforming them into others.
-	// These optimizations are expressed by changing trans and/or eff_id.
-	bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx );
+	// These optimizations are expressed by changing trans and/or stor_id.
+	bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &stor_id, cntx );
 #endif
 
-	if ( bli_is_notrans( trans ) )
+	// Note: This code explicitly performs the swaps that could be done
+	// implicitly in other BLIS contexts where a type-specific helper function
+	// was being called.
+    if ( bli_is_trans( trans ) )
+    {
+              bool   packtmp = packa; packa = packb; packb = packtmp;
+              conj_t conjtmp = conja; conja = conjb; conjb = conjtmp;
+              dim_t  len_tmp =     m;     m =     n;     n = len_tmp;
+        const void*  buf_tmp = buf_a; buf_a = buf_b; buf_b = buf_tmp;
+              inc_t  str_tmp =  rs_a;  rs_a =  cs_b;  cs_b = str_tmp;
+                     str_tmp =  cs_a;  cs_a =  rs_b;  rs_b = str_tmp;
+                     str_tmp =  rs_c;  rs_c =  cs_c;  cs_c = str_tmp;
+
+        stor_id = bli_stor3_trans( stor_id );
+    }
+
+	// This transposition of the stor3_t id value is inherent to variant 1.
+	// The reason: we assume that variant 2 is the "main" variant. The
+	// consequence of this is that we assume that the millikernels that
+	// iterate over m are registered to the "primary" kernel group associated
+	// with the kernel IO preference; similarly, mkernels that iterate over
+	// n are assumed to be registered to the "non-primary" group associated
+	// with the ("non-primary") anti-preference. (Note that this pattern holds
+	// regardless of whether the mkernel set has a row or column preference.)
+	// See bli_l3_sup_int.c for a higher-level view of how this choice is made.
+	stor_id = bli_stor3_trans( stor_id );
+
+	// Query the context for various blocksizes.
+	const dim_t NR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx );
+	const dim_t MR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx );
+	const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx );
+	const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx );
+	const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx );
+
+	// Disable modification of KC since it seems to negatively impact certain
+	// operations (#644).
+	dim_t KC = KC0;
+
+	/*
+	if      ( packa && packb )
 	{
-		// Invoke the function.
-		f
-		(
-		  packa,
-		  packb,
-		  conja,
-		  conjb,
-		  m,
-		  n,
-		  k,
-		  ( void* )buf_alpha,
-		  ( void* )buf_a, rs_a, cs_a,
-		  ( void* )buf_b, rs_b, cs_b,
-		  ( void* )buf_beta,
-		           buf_c, rs_c, cs_c,
-		  eff_id,
-		  ( cntx_t* )cntx,
-		  rntm,
-		  thread
-		);
+		KC = KC0;
 	}
-	else
+	else if ( packb )
 	{
-		// Invoke the function (transposing the operation).
-		f
-		(
-		  packb,
-		  packa,
-		  conjb,             // swap the conj values.
-		  conja,
-		  n,                 // swap the m and n dimensions.
-		  m,
-		  k,
-		  ( void* )buf_alpha,
-		  ( void* )buf_b, cs_b, rs_b, // swap the positions of A and B.
-		  ( void* )buf_a, cs_a, rs_a, // swap the strides of A and B.
-		  ( void* )buf_beta,
-		           buf_c, cs_c, rs_c, // swap the strides of C.
-		  bli_stor3_trans( eff_id ), // transpose the stor3_t id.
-          ( cntx_t* )cntx,
-		  rntm,
-		  thread
-		);
+		if      ( stor_id == BLIS_RRR ||
+				  stor_id == BLIS_CCC    ) KC = KC0;
+		else if ( stor_id == BLIS_RRC ||
+				  stor_id == BLIS_CRC    ) KC = KC0;
+		else if ( stor_id == BLIS_RCR ||
+		          stor_id == BLIS_CCR    ) KC = (( KC0 / 4 ) / 4 ) * 4;
+		else                               KC = KC0;
+	}
+	else if ( packa )
+	{
+		if      ( stor_id == BLIS_RRR ||
+				  stor_id == BLIS_CCC    ) KC = (( KC0 / 2 ) / 2 ) * 2;
+		else if ( stor_id == BLIS_RRC ||
+				  stor_id == BLIS_CRC    ) KC = KC0;
+		else if ( stor_id == BLIS_RCR ||
+		          stor_id == BLIS_CCR    ) KC = (( KC0 / 4 ) / 4 ) * 4;
+		else                               KC = KC0;
+	}
+	else // if ( !packa && !packb )
+	{
+		if      ( FALSE                  ) KC = KC0;
+		else if ( stor_id == BLIS_RRC ||
+				  stor_id == BLIS_CRC    ) KC = KC0;
+		else if ( m <=   MR && n <=   NR ) KC = KC0;
+		else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2;
+		else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4;
+		else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4;
+		else                               KC = (( KC0 / 5 ) / 4 ) * 4;
+	}
+	*/
+
+	// Nudge NC up to a multiple of MR and MC up to a multiple of NR.
+	// NOTE: This is unique to variant 1 (i.e., not performed in variant 2)
+	// because MC % MR == 0 and NC % NR == 0 are already enforced at runtime.
+	const dim_t NC  = bli_align_dim_to_mult( NC0, MR );
+	const dim_t MC  = bli_align_dim_to_mult( MC0, NR );
+
+	// Query the maximum blocksize for MR, which implies a maximum blocksize
+	// extension for the final iteration.
+	const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx );
+	const dim_t MRE = MRM - MR;
+
+	// Compute partitioning step values for each matrix of each loop.
+	const inc_t jcstep_c = rs_c * dt_size;
+	const inc_t jcstep_a = rs_a * dt_size;
+
+	const inc_t pcstep_a = cs_a * dt_size;
+	const inc_t pcstep_b = rs_b * dt_size;
+
+	const inc_t icstep_c = cs_c * dt_size;
+	const inc_t icstep_b = cs_b * dt_size;
+
+	const inc_t jrstep_c = rs_c * MR * dt_size;
+
+	//const inc_t jrstep_a = rs_a * MR;
+
+	//const inc_t irstep_c = cs_c * NR;
+	//const inc_t irstep_b = cs_b * NR;
+
+	// Query the context for the sup microkernel address and cast it to its
+	// function pointer type.
+	gemmsup_ker_vft gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx );
+
+	const char* a_00       = buf_a;
+	const char* b_00       = buf_b;
+	      char* c_00       = buf_c;
+	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+
+	auxinfo_t aux;
+
+	mem_t mem_a = BLIS_MEM_INITIALIZER;
+	mem_t mem_b = BLIS_MEM_INITIALIZER;
+\
+	// Define an array of bszid_t ids, which will act as our substitute for
+	// the cntl_t tree.
+	// NOTE: These bszid_t values, and their order, match that of the bp
+	// algorithm (variant 2) because they are not used to query actual
+	// blocksizes but rather query the ways of parallelism for the various
+	// loops. For example, the 2nd loop in variant 1 partitions in the m
+	// dimension (in increments of MR), but parallelizes that m dimension
+	// with BLIS_JR_NT.
+	// Note that this panel-block algorithm partitions an NC x KC submatrix
+	// of A to be packed in the 4th loop, and a KC x MC submatrix of B to be
+	// packed in the 3rd loop.
+	//                    5thloop  4thloop         packa  3rdloop         packb  2ndloop  1stloop  ukrloop
+	bszid_t bszids[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR };
+\
+	// Determine whether we are using more than one thread.
+	const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 );
+\
+	thrinfo_t* thread_jc = NULL;
+	thrinfo_t* thread_pc = NULL;
+	thrinfo_t* thread_pa = NULL;
+	thrinfo_t* thread_ic = NULL;
+	thrinfo_t* thread_pb = NULL;
+	thrinfo_t* thread_jr = NULL;
+\
+	// Pre-grow the thrinfo_t tree.
+	bszid_t* bszids_jc = bszids;
+	         thread_jc = thread;
+	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc );
+\
+	bszid_t* bszids_pc = &bszids_jc[1];
+	         thread_pc = bli_thrinfo_sub_node( thread_jc );
+	bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc );
+
+	bszid_t* bszids_pa = &bszids_pc[1];
+	         thread_pa = bli_thrinfo_sub_node( thread_pc );
+
+	bszid_t* bszids_ic = &bszids_pa[1];
+	         thread_ic = bli_thrinfo_sub_node( thread_pa );
+	bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic );
+
+	bszid_t* bszids_pb = &bszids_ic[1];
+	         thread_pb = bli_thrinfo_sub_node( thread_ic );
+
+	bszid_t* bszids_jr = &bszids_pb[1];
+	         thread_jr = bli_thrinfo_sub_node( thread_pb );
+	bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr );
+
+	// Compute the JC loop thread range for the current thread.
+	dim_t jc_start, jc_end;
+	bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end );
+	const dim_t m_local = jc_end - jc_start;
+
+	// Compute number of primary and leftover components of the JC loop.
+	//const dim_t jc_iter = ( m_local + NC - 1 ) / NC;
+	const dim_t jc_left =   m_local % NC;
+
+	// Loop over the m dimension (NC rows/columns at a time).
+	//for ( dim_t jj = 0; jj < jc_iter; jj += 1 )
+	for ( dim_t jj = jc_start; jj < jc_end; jj += NC )
+	{
+		// Calculate the thread's current JC block dimension.
+		const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left );
+
+		const char* a_jc = a_00 + jj * jcstep_a;
+		      char* c_jc = c_00 + jj * jcstep_c;
+
+		// Compute the PC loop thread range for the current thread.
+		const dim_t pc_start = 0, pc_end = k;
+		const dim_t k_local = k;
+
+		// Compute number of primary and leftover components of the PC loop.
+		//const dim_t pc_iter = ( k_local + KC - 1 ) / KC;
+		const dim_t pc_left =   k_local % KC;
+
+		// Loop over the k dimension (KC rows/columns at a time).
+		//for ( dim_t pp = 0; pp < pc_iter; pp += 1 )
+		for ( dim_t pp = pc_start; pp < pc_end; pp += KC )
+		{
+			// Calculate the thread's current PC block dimension.
+			const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left );
+
+			const char* a_pc = a_jc + pp * pcstep_a;
+			const char* b_pc = b_00 + pp * pcstep_b;
+
+			// Only apply beta to the first iteration of the pc loop.
+			const void* beta_use = ( pp == 0 ? buf_beta : one );
+
+		          char* a_use;
+			      inc_t rs_a_use, cs_a_use, ps_a_use;
+
+			// Determine the packing buffer and related parameters for matrix
+			// A. (If A will not be packed, then a_use will be set to point to
+			// a and the _a_use strides will be set accordingly.) Then call
+			// the packm sup variant chooser, which will call the appropriate
+			// implementation based on the schema deduced from the stor_id.
+			// NOTE: packing matrix A in this panel-block algorithm corresponds
+			// to packing matrix B in the block-panel algorithm.
+			bli_packm_sup
+			(
+			  packa,
+			  BLIS_BUFFER_FOR_B_PANEL, // This algorithm packs matrix A to
+			  stor_id,                 // a "panel of B".
+			  BLIS_NO_TRANSPOSE,
+			  dt,
+			  NC,     KC,       // This "panel of B" is (at most) NC x KC.
+			  nc_cur, kc_cur, MR,
+			  one,
+			  a_pc,   rs_a,      cs_a,
+			  ( void** )&a_use, &rs_a_use, &cs_a_use,
+			                    &ps_a_use,
+			  cntx,
+			  rntm,
+			  &mem_a,
+			  thread_pa
+			);
+
+			// Alias a_use so that it's clear this is our current block of
+			// matrix A.
+			const char* a_pc_use = a_use;
+
+			// We don't need to embed the panel stride of A within the auxinfo_t
+			// object because this variant iterates through A in the jr loop,
+			// which occurs here, within the macrokernel, not within the
+			// millikernel.
+			//bli_auxinfo_set_ps_a( ps_a_use, &aux );
+
+			// Compute the IC loop thread range for the current thread.
+			dim_t ic_start, ic_end;
+			bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end );
+			const dim_t n_local = ic_end - ic_start;
+
+			// Compute number of primary and leftover components of the IC loop.
+			//const dim_t ic_iter = ( n_local + MC - 1 ) / MC;
+			const dim_t ic_left =   n_local % MC;
+
+			// Loop over the n dimension (MC rows at a time).
+			//for ( dim_t ii = 0; ii < ic_iter; ii += 1 )
+			for ( dim_t ii = ic_start; ii < ic_end; ii += MC )
+			{
+				// Calculate the thread's current IC block dimension.
+				const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left );
+
+				const char* b_ic = b_pc + ii * icstep_b;
+				      char* c_ic = c_jc + ii * icstep_c;
+
+				      char* b_use;
+				      inc_t rs_b_use, cs_b_use, ps_b_use;
+
+				// Determine the packing buffer and related parameters for matrix
+				// B. (If B will not be packed, then b_use will be set to point to
+				// b and the _b_use strides will be set accordingly.) Then call
+				// the packm sup variant chooser, which will call the appropriate
+				// implementation based on the schema deduced from the stor_id.
+				// NOTE: packing matrix B in this panel-block algorithm corresponds
+				// to packing matrix A in the block-panel algorithm.
+				bli_packm_sup
+				(
+				  packb,
+				  BLIS_BUFFER_FOR_A_BLOCK, // This algorithm packs matrix B to
+				  stor_id,                 // a "block of A".
+				  BLIS_NO_TRANSPOSE,
+				  dt,
+				  MC,     KC,       // This "block of A" is (at most) KC x MC.
+				  mc_cur, kc_cur, NR,
+				  one,
+				  b_ic,   cs_b,      rs_b,
+				  ( void** )&b_use, &cs_b_use, &rs_b_use,
+				                    &ps_b_use,
+				  cntx,
+				  rntm,
+				  &mem_b,
+				  thread_pb
+				);
+
+				// Alias b_use so that it's clear this is our current block of
+				// matrix B.
+				const char* b_ic_use = b_use;
+
+				// Embed the panel stride of B within the auxinfo_t object. The
+				// millikernel will query and use this to iterate through
+				// micropanels of B.
+				bli_auxinfo_set_ps_b( ps_b_use, &aux );
+
+				// Compute number of primary and leftover components of the JR loop.
+				dim_t jr_iter = ( nc_cur + MR - 1 ) / MR;
+				dim_t jr_left =   nc_cur % MR;
+
+				// An optimization: allow the last jr iteration to contain up to MRE
+				// rows of C and A. (If MRE > MR, the mkernel has agreed to handle
+				// these cases.) Note that this prevents us from declaring jr_iter and
+				// jr_left as const. NOTE: We forgo this optimization when packing A
+				// since packing an extended edge case is not yet supported.
+				if ( !packa && !is_mt )
+				if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE )
+				{
+					jr_iter--; jr_left += MR;
+				}
+
+				// Compute the JR loop thread range for the current thread.
+				dim_t jr_start, jr_end;
+				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end );
+
+				// Loop over the m dimension (MR rows at a time).
+				//for ( dim_t j = 0; j < jr_iter; j += 1 )
+				for ( dim_t j = jr_start; j < jr_end; j += 1 )
+				{
+					const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left );
+
+					//ctype* a_jr = a_pc + j * jrstep_a;
+					const char* a_jr = a_pc_use + j * ps_a_use * dt_size;
+					      char* c_jr = c_ic     + j * jrstep_c;
+
+					//const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR;
+					//const dim_t ir_left =   mc_cur % NR;
+
+					// The millikernel loops over the n dimension (NR columns at a time).
+					{
+						// Invoke the gemmsup millikernel.
+						gemmsup_ker
+						(
+						  conja,
+						  conjb,
+						  nr_cur, // Notice: nr_cur <= MR.
+						  mc_cur, // Recall: mc_cur partitions the n dimension!
+						  kc_cur,
+						  ( void* )buf_alpha,
+						  ( void* )a_jr,     rs_a_use, cs_a_use,
+						  ( void* )b_ic_use, rs_b_use, cs_b_use,
+						  ( void* )beta_use,
+						  ( void* )c_jr,     rs_c,     cs_c,
+						  &aux,
+						  ( cntx_t* )cntx
+						);
+					}
+				}
+			}
+
+			// NOTE: This barrier is only needed if we are packing A (since
+			// that matrix is packed within the pc loop of this variant).
+			if ( packa ) bli_thread_barrier( thread_pa );
+		}
 	}
-}
 
+	// Release any memory that was acquired for packing matrices A and B.
+	bli_packm_sup_finalize_mem
+	(
+	  packa,
+	  rntm,
+	  &mem_a,
+	  thread_pa
+	);
+	bli_packm_sup_finalize_mem
+	(
+	  packb,
+	  rntm,
+	  &mem_b,
+	  thread_pb
+	);
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       bool       packa, \
-       bool       packb, \
-       conj_t     conja, \
-       conj_t     conjb, \
-       dim_t      m, \
-       dim_t      n, \
-       dim_t      k, \
-       void*      alpha, \
-       void*      a, inc_t rs_a, inc_t cs_a, \
-       void*      b, inc_t rs_b, inc_t cs_b, \
-       void*      beta, \
-       void*      c, inc_t rs_c, inc_t cs_c, \
-       stor3_t    stor_id, \
-       cntx_t*    cntx, \
-       rntm_t*    rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t dt = PASTEMAC(ch,type); \
-\
-	/* If m or n is zero, return immediately. */ \
-	if ( bli_zero_dim2( m, n ) ) return; \
-\
-	/* If k < 1 or alpha is zero, scale by beta and return. */ \
-	if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
-	{ \
-		if ( bli_thread_am_ochief( thread ) ) \
-		{ \
-			PASTEMAC(ch,scalm) \
-			( \
-			  BLIS_NO_CONJUGATE, \
-			  0, \
-			  BLIS_NONUNIT_DIAG, \
-			  BLIS_DENSE, \
-			  m, n, \
-			  beta, \
-			  c, rs_c, cs_c \
-			); \
-		} \
-		return; \
-	} \
-\
-	/* This transposition of the stor3_t id value is inherent to variant 1.
-	   The reason: we assume that variant 2 is the "main" variant. The
-	   consequence of this is that we assume that the millikernels that
-	   iterate over m are registered to the "primary" kernel group associated
-	   with the kernel IO preference; similarly, mkernels that iterate over
-	   n are assumed to be registered to the "non-primary" group associated
-	   with the ("non-primary") anti-preference. Note that this pattern holds
-	   regardless of whether the mkernel set has a row or column preference.)
-	   See bli_l3_sup_int.c for a higher-level view of how this choice is made. */ \
-	stor_id = bli_stor3_trans( stor_id ); \
-\
-	/* Query the context for various blocksizes. */ \
-	const dim_t NR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
-	const dim_t MR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
-	const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
-	const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
-\
-	/* Disable modification of KC since it seems to negatively impact certain operations (#644). */ \
-	dim_t KC = KC0; \
-	/* \
-	dim_t KC; \
-	if      ( packa && packb ) \
-	{ \
-		KC = KC0; \
-	} \
-	else if ( packb ) \
-	{ \
-		if      ( stor_id == BLIS_RRR || \
-				  stor_id == BLIS_CCC    ) KC = KC0; \
-		else if ( stor_id == BLIS_RRC || \
-				  stor_id == BLIS_CRC    ) KC = KC0; \
-		else if ( stor_id == BLIS_RCR || \
-		          stor_id == BLIS_CCR    ) KC = (( KC0 / 4 ) / 4 ) * 4; \
-		else                               KC = KC0; \
-	} \
-	else if ( packa ) \
-	{ \
-		if      ( stor_id == BLIS_RRR || \
-				  stor_id == BLIS_CCC    ) KC = (( KC0 / 2 ) / 2 ) * 2; \
-		else if ( stor_id == BLIS_RRC || \
-				  stor_id == BLIS_CRC    ) KC = KC0; \
-		else if ( stor_id == BLIS_RCR || \
-		          stor_id == BLIS_CCR    ) KC = (( KC0 / 4 ) / 4 ) * 4; \
-		else                               KC = KC0; \
-	} \
-	else *//* if ( !packa && !packb ) *//* \
-	{ \
-		if      ( FALSE                  ) KC = KC0; \
-		else if ( stor_id == BLIS_RRC || \
-				  stor_id == BLIS_CRC    ) KC = KC0; \
-		else if ( m <=   MR && n <=   NR ) KC = KC0; \
-		else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
-		else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
-		else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
-		else                               KC = (( KC0 / 5 ) / 4 ) * 4; \
-	}*/ \
-\
-	/* Nudge NC up to a multiple of MR and MC up to a multiple of NR.
-	   NOTE: This is unique to variant 1 (ie: not performed in variant 2)
-	   because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. */ \
-	const dim_t NC  = bli_align_dim_to_mult( NC0, MR ); \
-	const dim_t MC  = bli_align_dim_to_mult( MC0, NR ); \
-\
-	/* Query the maximum blocksize for MR, which implies a maximum blocksize
-	   extension for the final iteration. */ \
-	const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \
-	const dim_t MRE = MRM - MR; \
-\
-	/* Compute partitioning step values for each matrix of each loop. */ \
-	const inc_t jcstep_c = rs_c; \
-	const inc_t jcstep_a = rs_a; \
-\
-	const inc_t pcstep_a = cs_a; \
-	const inc_t pcstep_b = rs_b; \
-\
-	const inc_t icstep_c = cs_c; \
-	const inc_t icstep_b = cs_b; \
-\
-	const inc_t jrstep_c = rs_c * MR; \
-\
-	/*
-	const inc_t jrstep_a = rs_a * MR; \
-\
-	const inc_t irstep_c = cs_c * NR; \
-	const inc_t irstep_b = cs_b * NR; \
-	*/ \
-\
-	/* Query the context for the sup microkernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemmsup_ker_ft) \
-	    gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
-\
-	ctype* a_00       = a; \
-	ctype* b_00       = b; \
-	ctype* c_00       = c; \
-	ctype* alpha_cast = alpha; \
-	ctype* beta_cast  = beta; \
-\
-	/* Make local copies of beta and one scalars to prevent any unnecessary
-	   sharing of cache lines between the cores' caches. */ \
-	ctype           beta_local = *beta_cast; \
-	ctype           one_local  = *PASTEMAC(ch,1); \
-\
-	auxinfo_t aux; \
-\
-	/* Parse and interpret the contents of the rntm_t object to properly
-	   set the ways of parallelism for each loop. */ \
-	/*bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );*/ \
-\
-	/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
-	   needed for the matrix we will be packing (if any), but we do it
-	   unconditionally to be safe. An alternative way of initializing the
-	   mem_t entries is:
-
-	     bli_mem_clear( &mem_a ); \
-	     bli_mem_clear( &mem_b ); \
-	*/ \
-	mem_t mem_a = BLIS_MEM_INITIALIZER; \
-	mem_t mem_b = BLIS_MEM_INITIALIZER; \
-\
-	/* Define an array of bszid_t ids, which will act as our substitute for
-	   the cntl_t tree.
-	   NOTE: These bszid_t values, and their order, match that of the bp
-	   algorithm (variant 2) because they are not used to query actual
-	   blocksizes but rather query the ways of parallelism for the various
-	   loops. For example, the 2nd loop in variant 1 partitions in the m
-	   dimension (in increments of MR), but parallelizes that m dimension
-	   with BLIS_JR_NT. The only difference is that the _packa and _packb
-	   arrays have been adjusted for the semantic difference in order in
-	   which packa and packb nodes are encountered in the thrinfo tree.
-	   That is, this panel-block algorithm partitions an NC x KC submatrix
-	   of A to be packed in the 4th loop, and a KC x MC submatrix of B
-	   to be packed in the 3rd loop. */ \
-	/*                                  5thloop  4thloop         packa  3rdloop         packb  2ndloop  1stloop  ukrloop */ \
-	bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC,               BLIS_MC,               BLIS_NR, BLIS_MR, BLIS_KR }; \
-	bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC,               BLIS_NR, BLIS_MR, BLIS_KR }; \
-	bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC,               BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
-	bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
-	bszid_t* bszids; \
-\
-	/* Set the bszids pointer to the correct bszids array above based on which
-	   matrices (if any) are being packed. */ \
-	if ( packa ) { if ( packb ) bszids = bszids_packab; \
-	               else         bszids = bszids_packa; } \
-	else         { if ( packb ) bszids = bszids_packb; \
-	               else         bszids = bszids_nopack; } \
-\
-	/* Determine whether we are using more than one thread. */ \
-	const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \
-\
-	thrinfo_t* thread_jc = NULL; \
-	thrinfo_t* thread_pc = NULL; \
-	thrinfo_t* thread_pa = NULL; \
-	thrinfo_t* thread_ic = NULL; \
-	thrinfo_t* thread_pb = NULL; \
-	thrinfo_t* thread_jr = NULL; \
-\
-	/* Grow the thrinfo_t tree. */ \
-	bszid_t*   bszids_jc = bszids; \
-	               thread_jc = thread; \
-	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
-\
-	/* Compute the JC loop thread range for the current thread. */ \
-	dim_t jc_start, jc_end; \
-	bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end ); \
-	const dim_t m_local = jc_end - jc_start; \
-\
-	/* Compute number of primary and leftover components of the JC loop. */ \
-	/*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \
-	const dim_t jc_left =   m_local % NC; \
-\
-	/* Loop over the m dimension (NC rows/columns at a time). */ \
-	/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
-	for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
-	{ \
-		/* Calculate the thread's current JC block dimension. */ \
-		const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
-\
-		ctype* a_jc = a_00 + jj * jcstep_a; \
-		ctype* c_jc = c_00 + jj * jcstep_c; \
-\
-		/* Grow the thrinfo_t tree. */ \
-		bszid_t*   bszids_pc = &bszids_jc[1]; \
-		               thread_pc = bli_thrinfo_sub_node( thread_jc ); \
-		bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
-\
-		/* Compute the PC loop thread range for the current thread. */ \
-		const dim_t pc_start = 0, pc_end = k; \
-		const dim_t k_local = k; \
-\
-		/* Compute number of primary and leftover components of the PC loop. */ \
-		/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
-		const dim_t pc_left =   k_local % KC; \
-\
-		/* Loop over the k dimension (KC rows/columns at a time). */ \
-		/*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \
-		for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
-		{ \
-			/* Calculate the thread's current PC block dimension. */ \
-			const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
-\
-			ctype* a_pc = a_jc + pp * pcstep_a; \
-			ctype* b_pc = b_00 + pp * pcstep_b; \
-\
-			/* Only apply beta to the first iteration of the pc loop. */ \
-			ctype* beta_use = ( pp == 0 ? &beta_local : &one_local ); \
-\
-			ctype* a_use; \
-			      inc_t  rs_a_use, cs_a_use, ps_a_use; \
-\
-			/* Set the bszid_t array and thrinfo_t pointer based on whether
-			   we will be packing A. If we won't be packing A, we alias to
-			   the _pc variables so that code further down can unconditionally
-			   reference the _pa variables. Note that *if* we will be packing
-			   A, the thrinfo_t node will have already been created by a
-			   previous call to bli_thrinfo_grow(), since bszid values of
-			   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
-			   bszid that is a normal bszid_t value). */ \
-			bszid_t*   bszids_pa; \
-			if ( packa ) { bszids_pa = &bszids_pc[1]; \
-			               thread_pa = bli_thrinfo_sub_node( thread_pc ); } \
-			else         { bszids_pa = &bszids_pc[0]; \
-			               thread_pa = thread_pc; } \
-\
-			/* Determine the packing buffer and related parameters for matrix
-			   A. (If A will not be packed, then a_use will be set to point to
-			   a and the _a_use strides will be set accordingly.) Then call
-			   the packm sup variant chooser, which will call the appropriate
-			   implementation based on the schema deduced from the stor_id.
-			   NOTE: packing matrix A in this panel-block algorithm corresponds
-			   to packing matrix B in the block-panel algorithm. */ \
-			PASTEMAC(ch,packm_sup_a) \
-			( \
-			  packa, \
-			  BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to */ \
-			  stor_id,                 /* a "panel of B".                  */ \
-			  BLIS_NO_TRANSPOSE, \
-			  NC,     KC,       /* This "panel of B" is (at most) NC x KC. */ \
-			  nc_cur, kc_cur, MR, \
-			  &one_local, \
-			  a_pc,   rs_a,      cs_a, \
-			  &a_use, &rs_a_use, &cs_a_use, \
-			                     &ps_a_use, \
-			  cntx, \
-			  rntm, \
-			  &mem_a, \
-			  thread_pa  \
-			); \
-\
-			/* Alias a_use so that it's clear this is our current block of
-			   matrix A. */ \
-			ctype* a_pc_use = a_use; \
-\
-			/* We don't need to embed the panel stride of A within the auxinfo_t
-			   object because this variant iterates through A in the jr loop,
-			   which occurs here, within the macrokernel, not within the
-			   millikernel. */ \
-			/*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \
-\
-			/* Grow the thrinfo_t tree. */ \
-			bszid_t*   bszids_ic = &bszids_pa[1]; \
-			               thread_ic = bli_thrinfo_sub_node( thread_pa ); \
-			bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
-\
-			/* Compute the IC loop thread range for the current thread. */ \
-			dim_t ic_start, ic_end; \
-			bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end ); \
-			const dim_t n_local = ic_end - ic_start; \
-\
-			/* Compute number of primary and leftover components of the IC loop. */ \
-			/*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \
-			const dim_t ic_left =   n_local % MC; \
-\
-			/* Loop over the n dimension (MC rows at a time). */ \
-			/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
-			for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
-			{ \
-				/* Calculate the thread's current IC block dimension. */ \
-				const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
-\
-				ctype* b_ic = b_pc + ii * icstep_b; \
-				ctype* c_ic = c_jc + ii * icstep_c; \
-\
-				ctype* b_use; \
-				      inc_t  rs_b_use, cs_b_use, ps_b_use; \
-\
-				/* Set the bszid_t array and thrinfo_t pointer based on whether
-				   we will be packing A. If we won't be packing A, we alias to
-				   the _pc variables so that code further down can unconditionally
-				   reference the _pa variables. Note that *if* we will be packing
-				   A, the thrinfo_t node will have already been created by a
-				   previous call to bli_thrinfo_grow(), since bszid values of
-				   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
-				   bszid that is a normal bszid_t value). */ \
-				bszid_t*   bszids_pb; \
-				if ( packb ) { bszids_pb = &bszids_ic[1]; \
-							   thread_pb = bli_thrinfo_sub_node( thread_ic ); } \
-				else         { bszids_pb = &bszids_ic[0]; \
-							   thread_pb = thread_ic; } \
-\
-				/* Determine the packing buffer and related parameters for matrix
-				   B. (If B will not be packed, then b_use will be set to point to
-				   b and the _b_use strides will be set accordingly.) Then call
-				   the packm sup variant chooser, which will call the appropriate
-				   implementation based on the schema deduced from the stor_id.
-				   NOTE: packing matrix B in this panel-block algorithm corresponds
-				   to packing matrix A in the block-panel algorithm. */ \
-				PASTEMAC(ch,packm_sup_b) \
-				( \
-				  packb, \
-				  BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to */ \
-				  stor_id,                 /* a "block of A".                  */ \
-				  BLIS_NO_TRANSPOSE, \
-				  KC,     MC,       /* This "block of A" is (at most) KC x MC. */ \
-				  kc_cur, mc_cur, NR, \
-				  &one_local, \
-				  b_ic,   rs_b,      cs_b, \
-				  &b_use, &rs_b_use, &cs_b_use, \
-				                     &ps_b_use, \
-				  cntx, \
-				  rntm, \
-				  &mem_b, \
-				  thread_pb  \
-				); \
-\
-				/* Alias b_use so that it's clear this is our current block of
-				   matrix B. */ \
-				ctype* b_ic_use = b_use; \
-\
-				/* Embed the panel stride of B within the auxinfo_t object. The
-				   millikernel will query and use this to iterate through
-				   micropanels of B. */ \
-				bli_auxinfo_set_ps_b( ps_b_use, &aux ); \
-\
-				/* Grow the thrinfo_t tree. */ \
-				bszid_t*   bszids_jr = &bszids_pb[1]; \
-				               thread_jr = bli_thrinfo_sub_node( thread_pb ); \
-				bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
-\
-				/* Compute number of primary and leftover components of the JR loop. */ \
-				dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
-				dim_t jr_left =   nc_cur % MR; \
-\
-				/* An optimization: allow the last jr iteration to contain up to MRE
-				   rows of C and A. (If MRE > MR, the mkernel has agreed to handle
-				   these cases.) Note that this prevents us from declaring jr_iter and
-				   jr_left as const. NOTE: We forgo this optimization when packing A
-				   since packing an extended edge case is not yet supported. */ \
-				if ( !packa && !is_mt ) \
-				if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \
-				{ \
-					jr_iter--; jr_left += MR; \
-				} \
-\
-				/* Compute the JR loop thread range for the current thread. */ \
-				dim_t jr_start, jr_end; \
-				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
-\
-				/* Loop over the m dimension (NR columns at a time). */ \
-				/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
-				for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
-				{ \
-					const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); \
-\
-					/*
-					ctype* a_jr = a_pc + j * jrstep_a; \
-					*/ \
-					ctype* a_jr = a_pc_use + j * ps_a_use; \
-					ctype* c_jr = c_ic     + j * jrstep_c; \
-\
-					/*
-					const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \
-					const dim_t ir_left =   mc_cur % NR; \
-					*/ \
-\
-					/* Loop over the n dimension (MR rows at a time). */ \
-					{ \
-						/* Invoke the gemmsup millikernel. */ \
-						gemmsup_ker \
-						( \
-						  conja, \
-						  conjb, \
-						  nr_cur, /* Notice: nr_cur <= MR. */ \
-						  mc_cur, /* Recall: mc_cur partitions the n dimension! */ \
-						  kc_cur, \
-						  alpha_cast, \
-						  a_jr,     rs_a_use, cs_a_use, \
-						  b_ic_use, rs_b_use, cs_b_use, \
-						  beta_use, \
-						            c_jr,     rs_c,     cs_c, \
-						  &aux, \
-						  cntx  \
-						); \
-					} \
-				} \
-			} \
-\
-			/* NOTE: This barrier is only needed if we are packing A (since
-			   that matrix is packed within the pc loop of this variant). */ \
-			if ( packa ) bli_thread_barrier( thread_pa ); \
-		} \
-	} \
-\
-	/* Release any memory that was acquired for packing matrices A and B. */ \
-	PASTEMAC(ch,packm_sup_finalize_mem_a) \
-	( \
-	  packa, \
-	  rntm, \
-	  &mem_a, \
-	  thread_pa  \
-	); \
-	PASTEMAC(ch,packm_sup_finalize_mem_b) \
-	( \
-	  packb, \
-	  rntm, \
-	  &mem_b, \
-	  thread_pb  \
-	); \
-\
 /*
-PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
-*/ \
+PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" );
+PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" );
+PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" );
+*/
 }
 
-INSERT_GENTFUNC_BASIC0( gemmsup_ref_var1n )
-
 
 //
 // -- var2m --------------------------------------------------------------------
 //
 
-static FUNCPTR_T GENARRAY(ftypes_var2m,gemmsup_ref_var2m);
-
 void bli_gemmsup_ref_var2m
      (
              trans_t    trans,
@@ -721,67 +512,30 @@ void bli_gemmsup_ref_var2m
        const obj_t*     b,
        const obj_t*     beta,
        const obj_t*     c,
-             stor3_t    eff_id,
+             stor3_t    stor_id,
        const cntx_t*    cntx,
              rntm_t*    rntm,
              thrinfo_t* thread
      )
 {
-#if 0
-	obj_t at, bt;
-
-	bli_obj_alias_to( a, &at );
-	bli_obj_alias_to( b, &bt );
-
-	// Induce transpositions on A and/or B if either object is marked for
-	// transposition. We can induce "fast" transpositions since they objects
-	// are guaranteed to not have structure or be packed.
-	if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
-	if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }
-
-	const num_t    dt        = bli_obj_dt( c );
-
-	const conj_t   conja     = bli_obj_conj_status( a );
-	const conj_t   conjb     = bli_obj_conj_status( b );
-
-	const dim_t    m         = bli_obj_length( c );
-	const dim_t    n         = bli_obj_width( c );
-
-	const dim_t    k         = bli_obj_width( &at );
-
-	void* buf_a     = bli_obj_buffer_at_off( &at );
-	const inc_t    rs_a      = bli_obj_row_stride( &at );
-	const inc_t    cs_a      = bli_obj_col_stride( &at );
-
-	void* buf_b     = bli_obj_buffer_at_off( &bt );
-	const inc_t    rs_b      = bli_obj_row_stride( &bt );
-	const inc_t    cs_b      = bli_obj_col_stride( &bt );
-
-	void* buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t    rs_c      = bli_obj_row_stride( c );
-	const inc_t    cs_c      = bli_obj_col_stride( c );
-
-	void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
-	void* buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
-
-#else
-	const num_t  dt    = bli_obj_dt( c );
+	const num_t  dt      = bli_obj_dt( c );
+	const dim_t  dt_size = bli_dt_size( dt );
 
-	const bool   packa = bli_rntm_pack_a( rntm );
-	const bool   packb = bli_rntm_pack_b( rntm );
+	      bool   packa   = bli_rntm_pack_a( rntm );
+	      bool   packb   = bli_rntm_pack_b( rntm );
 
-	const conj_t conja = bli_obj_conj_status( a );
-	const conj_t conjb = bli_obj_conj_status( b );
+	      conj_t conja   = bli_obj_conj_status( a );
+	      conj_t conjb   = bli_obj_conj_status( b );
 
-	const dim_t  m     = bli_obj_length( c );
-	const dim_t  n     = bli_obj_width( c );
+	      dim_t  m       = bli_obj_length( c );
+	      dim_t  n       = bli_obj_width( c );
 	      dim_t  k;
 
-	const void*  buf_a = bli_obj_buffer_at_off( a );
+	const void*  buf_a   = bli_obj_buffer_at_off( a );
 	      inc_t  rs_a;
 	      inc_t  cs_a;
 
-	const void*  buf_b = bli_obj_buffer_at_off( b );
+	const void*  buf_b   = bli_obj_buffer_at_off( b );
 	      inc_t  rs_b;
 	      inc_t  cs_b;
 
@@ -814,516 +568,371 @@ void bli_gemmsup_ref_var2m
 	}
 
 	      void* buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t rs_c      = bli_obj_row_stride( c );
-	const inc_t cs_c      = bli_obj_col_stride( c );
+	      inc_t rs_c      = bli_obj_row_stride( c );
+	      inc_t cs_c      = bli_obj_col_stride( c );
 
 	const void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
 	const void* buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
 
-#endif
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	FUNCPTR_T f = ftypes_var2m[dt];
-
 #if 1
 	// Optimize some storage/packing cases by transforming them into others.
-	// These optimizations are expressed by changing trans and/or eff_id.
-	bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx );
+	// These optimizations are expressed by changing trans and/or stor_id.
+	bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &stor_id, cntx );
 #endif
 
-	if ( bli_is_notrans( trans ) )
+	// Note: This code explicitly performs the swaps that could be done
+	// implicitly in other BLIS contexts where a type-specific helper function
+	// was being called.
+	if ( bli_is_trans( trans ) )
 	{
-		// Invoke the function.
-		f
-		(
-		  packa,
-		  packb,
-		  conja,
-		  conjb,
-		  m,
-		  n,
-		  k,
-		  ( void* )buf_alpha,
-		  ( void* )buf_a, rs_a, cs_a,
-		  ( void* )buf_b, rs_b, cs_b,
-		  ( void* )buf_beta,
-		           buf_c, rs_c, cs_c,
-		  eff_id,
-		  ( cntx_t* )cntx,
-		  rntm,
-		  thread
-		);
+		      bool   packtmp = packa; packa = packb; packb = packtmp;
+		      conj_t conjtmp = conja; conja = conjb; conjb = conjtmp;
+		      dim_t  len_tmp =     m;     m =     n;     n = len_tmp;
+		const void*  buf_tmp = buf_a; buf_a = buf_b; buf_b = buf_tmp;
+		      inc_t  str_tmp =  rs_a;  rs_a =  cs_b;  cs_b = str_tmp;
+		             str_tmp =  cs_a;  cs_a =  rs_b;  rs_b = str_tmp;
+		             str_tmp =  rs_c;  rs_c =  cs_c;  cs_c = str_tmp;
+
+		stor_id = bli_stor3_trans( stor_id );
 	}
-	else
+
+	// Query the context for various blocksizes.
+	const dim_t NR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx );
+	const dim_t MR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx );
+	const dim_t NC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx );
+	const dim_t MC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx );
+	const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx );
+
+	// Disable modification of KC since it seems to negatively impact certain
+	// operations (#644).
+	dim_t KC = KC0;
+
+	/*
+	if      ( packa && packb )
 	{
-		// Invoke the function (transposing the operation).
-		f
-		(
-		  packb,             // swap the pack values.
-		  packa,
-		  conjb,             // swap the conj values.
-		  conja,
-		  n,                 // swap the m and n dimensions.
-		  m,
-		  k,
-		  ( void* )buf_alpha,
-		  ( void* )buf_b, cs_b, rs_b, // swap the positions of A and B.
-		  ( void* )buf_a, cs_a, rs_a, // swap the strides of A and B.
-		  ( void* )buf_beta,
-		           buf_c, cs_c, rs_c, // swap the strides of C.
-		  bli_stor3_trans( eff_id ), // transpose the stor3_t id.
-		  ( cntx_t* )cntx,
-		  rntm,
-		  thread
-		);
+		KC = KC0;
 	}
-}
+	else if ( packb )
+	{
+		if      ( stor_id == BLIS_RRR ||
+				  stor_id == BLIS_CCC    ) KC = KC0;
+		else if ( stor_id == BLIS_RRC ||
+				  stor_id == BLIS_CRC    ) KC = KC0;
+		else if ( stor_id == BLIS_RCR ||
+		          stor_id == BLIS_CCR    ) KC = (( KC0 / 4 ) / 4 ) * 4;
+		else                               KC = KC0;
+	}
+	else if ( packa )
+	{
+		if      ( stor_id == BLIS_RRR ||
+				  stor_id == BLIS_CCC    ) KC = (( KC0 / 2 ) / 2 ) * 2;
+		else if ( stor_id == BLIS_RRC ||
+				  stor_id == BLIS_CRC    ) KC = KC0;
+		else if ( stor_id == BLIS_RCR ||
+		          stor_id == BLIS_CCR    ) KC = (( KC0 / 4 ) / 4 ) * 4;
+		else                               KC = KC0;
+	}
+	else // if ( !packa && !packb )
+	{
+		if      ( stor_id == BLIS_RRR ||
+				  stor_id == BLIS_CCC    ) KC = KC0;
+		else if ( stor_id == BLIS_RRC ||
+				  stor_id == BLIS_CRC    ) KC = KC0;
+		else if ( m <=   MR && n <=   NR ) KC = KC0;
+		else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2;
+		else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4;
+		else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4;
+		else                               KC = (( KC0 / 5 ) / 4 ) * 4;
+	}
+	*/
 
+	// Query the maximum blocksize for NR, which implies a maximum blocksize
+	// extension for the final iteration.
+	const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx );
+	const dim_t NRE = NRM - NR;
+
+	// Compute partitioning step values for each matrix of each loop.
+	const inc_t jcstep_c = cs_c * dt_size;
+	const inc_t jcstep_b = cs_b * dt_size;
+
+	const inc_t pcstep_a = cs_a * dt_size;
+	const inc_t pcstep_b = rs_b * dt_size;
+
+	const inc_t icstep_c = rs_c * dt_size;
+	const inc_t icstep_a = rs_a * dt_size;
+
+	const inc_t jrstep_c = cs_c * NR * dt_size;
+
+	//const inc_t jrstep_b = cs_b * NR;
+	//( void )jrstep_b;
+
+	//const inc_t irstep_c = rs_c * MR;
+	//const inc_t irstep_a = rs_a * MR;
+
+	// Query the context for the sup microkernel address and cast it to its
+	// function pointer type.
+	gemmsup_ker_vft gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx );
+
+	const char* a_00       = buf_a;
+	const char* b_00       = buf_b;
+	      char* c_00       = buf_c;
+	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+
+	auxinfo_t       aux;
+
+	mem_t mem_a = BLIS_MEM_INITIALIZER;
+	mem_t mem_b = BLIS_MEM_INITIALIZER;
+
+	// Define an array of bszid_t ids, which will act as our substitute for
+	// the cntl_t tree.
+	//                    5thloop  4thloop         packb  3rdloop         packa  2ndloop  1stloop  ukrloop
+	bszid_t bszids[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR };
+
+	// Determine whether we are using more than one thread.
+	const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 );
+
+	thrinfo_t* thread_jc = NULL;
+	thrinfo_t* thread_pc = NULL;
+	thrinfo_t* thread_pb = NULL;
+	thrinfo_t* thread_ic = NULL;
+	thrinfo_t* thread_pa = NULL;
+	thrinfo_t* thread_jr = NULL;
+
+	// Pre-grow the thrinfo_t tree.
+	bszid_t* bszids_jc = bszids;
+	         thread_jc = thread;
+	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc );
+
+	bszid_t* bszids_pc = &bszids_jc[1];
+	         thread_pc = bli_thrinfo_sub_node( thread_jc );
+	bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc );
+
+	bszid_t* bszids_pb = &bszids_pc[1];
+	         thread_pb = bli_thrinfo_sub_node( thread_pc );
+
+	bszid_t* bszids_ic = &bszids_pb[1];
+	         thread_ic = bli_thrinfo_sub_node( thread_pb );
+	bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic );
+
+	bszid_t* bszids_pa = &bszids_ic[1];
+	         thread_pa = bli_thrinfo_sub_node( thread_ic );
+
+	bszid_t* bszids_jr = &bszids_pa[1];
+	         thread_jr = bli_thrinfo_sub_node( thread_pa );
+	bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr );
+
+	// Compute the JC loop thread range for the current thread.
+	dim_t jc_start, jc_end;
+	bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end );
+	const dim_t n_local = jc_end - jc_start;
+
+	// Compute number of primary and leftover components of the JC loop.
+	//const dim_t jc_iter = ( n_local + NC - 1 ) / NC;
+	const dim_t jc_left =   n_local % NC;
+
+	// Loop over the n dimension (NC rows/columns at a time).
+	//for ( dim_t jj = 0; jj < jc_iter; jj += 1 )
+	for ( dim_t jj = jc_start; jj < jc_end; jj += NC )
+	{
+		// Calculate the thread's current JC block dimension.
+		const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left );
+
+		const char* b_jc = b_00 + jj * jcstep_b;
+		      char* c_jc = c_00 + jj * jcstep_c;
+
+		// Compute the PC loop thread range for the current thread.
+		const dim_t pc_start = 0, pc_end = k;
+		const dim_t k_local = k;
+
+		// Compute number of primary and leftover components of the PC loop.
+		//const dim_t pc_iter = ( k_local + KC - 1 ) / KC;
+		const dim_t pc_left =   k_local % KC;
+
+		// Loop over the k dimension (KC rows/columns at a time).
+		//for ( dim_t pp = 0; pp < pc_iter; pp += 1 )
+		for ( dim_t pp = pc_start; pp < pc_end; pp += KC )
+		{
+			// Calculate the thread's current PC block dimension.
+			const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left );
+
+			const char* a_pc = a_00 + pp * pcstep_a;
+			const char* b_pc = b_jc + pp * pcstep_b;
+
+			// Only apply beta to the first iteration of the pc loop.
+			const void* beta_use = ( pp == 0 ? buf_beta : one );
+
+			      char* b_use;
+			      inc_t rs_b_use, cs_b_use, ps_b_use;
+
+			// Determine the packing buffer and related parameters for matrix
+			// B. (If B will not be packed, then a_use will be set to point to
+			// b and the _b_use strides will be set accordingly.) Then call
+			// the packm sup variant chooser, which will call the appropriate
+			// implementation based on the schema deduced from the stor_id.
+			bli_packm_sup
+			(
+			  packb,
+			  BLIS_BUFFER_FOR_B_PANEL, // This algorithm packs matrix B to
+			  stor_id,                 // a "panel of B."
+			  BLIS_NO_TRANSPOSE,
+			  dt,
+			  NC,     KC,       // This "panel of B" is (at most) KC x NC.
+			  nc_cur, kc_cur, NR,
+			  one,
+			  b_pc,   cs_b,      rs_b,
+			  ( void** )&b_use, &cs_b_use, &rs_b_use,
+			                    &ps_b_use,
+			  cntx,
+			  rntm,
+			  &mem_b,
+			  thread_pb
+			);
+
+			// Alias b_use so that it's clear this is our current block of
+			// matrix B.
+			char* b_pc_use = b_use;
+
+			// We don't need to embed the panel stride of B within the auxinfo_t
+			// object because this variant iterates through B in the jr loop,
+			// which occurs here, within the macrokernel, not within the
+			// millikernel.
+			//bli_auxinfo_set_ps_b( ps_b_use, &aux );
+
+			// Compute the IC loop thread range for the current thread.
+			dim_t ic_start, ic_end;
+			bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end );
+			const dim_t m_local = ic_end - ic_start;
+
+			// Compute number of primary and leftover components of the IC loop.
+			//const dim_t ic_iter = ( m_local + MC - 1 ) / MC;
+			const dim_t ic_left =   m_local % MC;
+
+			// Loop over the m dimension (MC rows at a time).
+			//for ( dim_t ii = 0; ii < ic_iter; ii += 1 )
+			for ( dim_t ii = ic_start; ii < ic_end; ii += MC )
+			{
+				// Calculate the thread's current IC block dimension.
+				const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left );
+
+				const char* a_ic = a_pc + ii * icstep_a;
+				      char* c_ic = c_jc + ii * icstep_c;
+
+				      char* a_use;
+				      inc_t rs_a_use, cs_a_use, ps_a_use;
+
+				// Determine the packing buffer and related parameters for matrix
+				// A. (If A will not be packed, then a_use will be set to point to
+				// a and the _a_use strides will be set accordingly.) Then call
+				// the packm sup variant chooser, which will call the appropriate
+				// implementation based on the schema deduced from the stor_id.
+				bli_packm_sup
+				(
+				  packa,
+				  BLIS_BUFFER_FOR_A_BLOCK, // This algorithm packs matrix A to
+				  stor_id,                 // a "block of A."
+				  BLIS_NO_TRANSPOSE,
+				  dt,
+				  MC,     KC,       // This "block of A" is (at most) MC x KC.
+				  mc_cur, kc_cur, MR,
+				  one,
+				  a_ic,   rs_a,      cs_a,
+				  ( void** )&a_use, &rs_a_use, &cs_a_use,
+				                    &ps_a_use,
+				  cntx,
+				  rntm,
+				  &mem_a,
+				  thread_pa
+				);
+
+				// Alias a_use so that it's clear this is our current block of
+				// matrix A.
+				char* a_ic_use = a_use;
+
+				// Embed the panel stride of A within the auxinfo_t object. The
+				// millikernel will query and use this to iterate through
+				// micropanels of A (if needed).
+				bli_auxinfo_set_ps_a( ps_a_use, &aux );
+
+				// Compute number of primary and leftover components of the JR loop.
+				dim_t jr_iter = ( nc_cur + NR - 1 ) / NR;
+				dim_t jr_left =   nc_cur % NR;
+
+				// An optimization: allow the last jr iteration to contain up to NRE
+				// columns of C and B. (If NRE > NR, the mkernel has agreed to handle
+				// these cases.) Note that this prevents us from declaring jr_iter and
+				// jr_left as const. NOTE: We forgo this optimization when packing B
+				// since packing an extended edge case is not yet supported.
+				if ( !packb && !is_mt )
+				if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE )
+				{
+					jr_iter--; jr_left += NR;
+				}
+
+				// Compute the JR loop thread range for the current thread.
+				dim_t jr_start, jr_end;
+				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end );
+
+				// Loop over the n dimension (NR columns at a time).
+				//for ( dim_t j = 0; j < jr_iter; j += 1 )
+				for ( dim_t j = jr_start; j < jr_end; j += 1 )
+				{
+					const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left );
+
+					//ctype* b_jr = b_pc_use + j * jrstep_b;
+					const char* b_jr = b_pc_use + j * ps_b_use * dt_size;
+					      char* c_jr = c_ic     + j * jrstep_c;
+
+					//const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR;
+					//const dim_t ir_left =   mc_cur % MR;
+
+					// Loop over the m dimension (MR rows at a time).
+					{
+						// Invoke the gemmsup millikernel.
+						gemmsup_ker
+						(
+						  conja,
+						  conjb,
+						  mc_cur,
+						  nr_cur,
+						  kc_cur,
+						  ( void* )buf_alpha,
+						  ( void* )a_ic_use, rs_a_use, cs_a_use,
+						  ( void* )b_jr,     rs_b_use, cs_b_use,
+						  ( void* )beta_use,
+						  ( void* )c_jr,     rs_c,     cs_c,
+						  &aux,
+						  ( cntx_t* )cntx
+						);
+					}
+				}
+			}
+
+			// NOTE: This barrier is only needed if we are packing B (since
+			// that matrix is packed within the pc loop of this variant).
+			if ( packb ) bli_thread_barrier( thread_pb );
+		}
+	}
+
+	// Release any memory that was acquired for packing matrices A and B.
+	bli_packm_sup_finalize_mem
+	(
+	  packa,
+	  rntm,
+	  &mem_a,
+	  thread_pa
+	);
+	bli_packm_sup_finalize_mem
+	(
+	  packb,
+	  rntm,
+	  &mem_b,
+	  thread_pb
+	);
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       bool       packa, \
-       bool       packb, \
-       conj_t     conja, \
-       conj_t     conjb, \
-       dim_t      m, \
-       dim_t      n, \
-       dim_t      k, \
-       void*      alpha, \
-       void*      a, inc_t rs_a, inc_t cs_a, \
-       void*      b, inc_t rs_b, inc_t cs_b, \
-       void*      beta, \
-       void*      c, inc_t rs_c, inc_t cs_c, \
-       stor3_t    stor_id, \
-       cntx_t*    cntx, \
-       rntm_t*    rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t dt = PASTEMAC(ch,type); \
-\
-	/* If m or n is zero, return immediately. */ \
-	if ( bli_zero_dim2( m, n ) ) return; \
-\
-	/* If k < 1 or alpha is zero, scale by beta and return. */ \
-	if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
-	{ \
-		if ( bli_thread_am_ochief( thread ) ) \
-		{ \
-			PASTEMAC(ch,scalm) \
-			( \
-			  BLIS_NO_CONJUGATE, \
-			  0, \
-			  BLIS_NONUNIT_DIAG, \
-			  BLIS_DENSE, \
-			  m, n, \
-			  beta, \
-			  c, rs_c, cs_c \
-			); \
-		} \
-		return; \
-	} \
-\
-	/* Query the context for various blocksizes. */ \
-	const dim_t NR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
-	const dim_t MR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	const dim_t NC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
-	const dim_t MC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
-	const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
-\
-	/* Disable modification of KC since it seems to negatively impact certain operations (#644). */ \
-	dim_t KC = KC0; \
-	/* \
-	dim_t KC; \
-	if      ( packa && packb ) \
-	{ \
-		KC = KC0; \
-	} \
-	else if ( packb ) \
-	{ \
-		if      ( stor_id == BLIS_RRR || \
-				  stor_id == BLIS_CCC    ) KC = KC0; \
-		else if ( stor_id == BLIS_RRC || \
-				  stor_id == BLIS_CRC    ) KC = KC0; \
-		else if ( stor_id == BLIS_RCR || \
-		          stor_id == BLIS_CCR    ) KC = (( KC0 / 4 ) / 4 ) * 4; \
-		else                               KC = KC0; \
-	} \
-	else if ( packa ) \
-	{ \
-		if      ( stor_id == BLIS_RRR || \
-				  stor_id == BLIS_CCC    ) KC = (( KC0 / 2 ) / 2 ) * 2; \
-		else if ( stor_id == BLIS_RRC || \
-				  stor_id == BLIS_CRC    ) KC = KC0; \
-		else if ( stor_id == BLIS_RCR || \
-		          stor_id == BLIS_CCR    ) KC = (( KC0 / 4 ) / 4 ) * 4; \
-		else                               KC = KC0; \
-	} \
-	else *//* if ( !packa && !packb ) *//* \
-	{ \
-		if      ( stor_id == BLIS_RRR || \
-				  stor_id == BLIS_CCC    ) KC = KC0; \
-		else if ( stor_id == BLIS_RRC || \
-				  stor_id == BLIS_CRC    ) KC = KC0; \
-		else if ( m <=   MR && n <=   NR ) KC = KC0; \
-		else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
-		else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
-		else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
-		else                               KC = (( KC0 / 5 ) / 4 ) * 4; \
-	}*/ \
-\
-	/* Query the maximum blocksize for NR, which implies a maximum blocksize
-	   extension for the final iteration. */ \
-	const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \
-	const dim_t NRE = NRM - NR; \
-\
-	/* Compute partitioning step values for each matrix of each loop. */ \
-	const inc_t jcstep_c = cs_c; \
-	const inc_t jcstep_b = cs_b; \
-\
-	const inc_t pcstep_a = cs_a; \
-	const inc_t pcstep_b = rs_b; \
-\
-	const inc_t icstep_c = rs_c; \
-	const inc_t icstep_a = rs_a; \
-\
-	const inc_t jrstep_c = cs_c * NR; \
-\
-	/*
-	const inc_t jrstep_b = cs_b * NR; \
-	( void )jrstep_b; \
-\
-	const inc_t irstep_c = rs_c * MR; \
-	const inc_t irstep_a = rs_a * MR; \
-	*/ \
-\
-	/* Query the context for the sup microkernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemmsup_ker_ft) \
-        gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
-\
-	ctype* a_00       = a; \
-	ctype* b_00       = b; \
-	ctype* c_00       = c; \
-	ctype* alpha_cast = alpha; \
-	ctype* beta_cast  = beta; \
-\
-	/* Make local copies of beta and one scalars to prevent any unnecessary
-	   sharing of cache lines between the cores' caches. */ \
-	ctype           beta_local = *beta_cast; \
-	ctype           one_local  = *PASTEMAC(ch,1); \
-\
-	auxinfo_t       aux; \
-\
-	/* Parse and interpret the contents of the rntm_t object to properly
-	   set the ways of parallelism for each loop. */ \
-	/*bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );*/ \
-\
-	/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
-	   needed for the matrix we will be packing (if any), but we do it
-	   unconditionally to be safe. An alternative way of initializing the
-	   mem_t entries is:
-
-	     bli_mem_clear( &mem_a ); \
-	     bli_mem_clear( &mem_b ); \
-	*/ \
-	mem_t mem_a = BLIS_MEM_INITIALIZER; \
-	mem_t mem_b = BLIS_MEM_INITIALIZER; \
-\
-	/* Define an array of bszid_t ids, which will act as our substitute for
-	   the cntl_t tree. */ \
-	/*                           5thloop  4thloop         packb  3rdloop         packa  2ndloop  1stloop  ukrloop */ \
-	bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC,               BLIS_MC,               BLIS_NR, BLIS_MR, BLIS_KR }; \
-	bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC,               BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
-	bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC,               BLIS_NR, BLIS_MR, BLIS_KR }; \
-	bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
-	bszid_t* bszids; \
-\
-	/* Set the bszids pointer to the correct bszids array above based on which
-	   matrices (if any) are being packed. */ \
-	if ( packa ) { if ( packb ) bszids = bszids_packab; \
-	               else         bszids = bszids_packa; } \
-	else         { if ( packb ) bszids = bszids_packb; \
-	               else         bszids = bszids_nopack; } \
-\
-	/* Determine whether we are using more than one thread. */ \
-	const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \
-\
-	thrinfo_t* thread_jc = NULL; \
-	thrinfo_t* thread_pc = NULL; \
-	thrinfo_t* thread_pb = NULL; \
-	thrinfo_t* thread_ic = NULL; \
-	thrinfo_t* thread_pa = NULL; \
-	thrinfo_t* thread_jr = NULL; \
-\
-	/* Grow the thrinfo_t tree. */ \
-	bszid_t*   bszids_jc = bszids; \
-	               thread_jc = thread; \
-	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
-\
-	/* Compute the JC loop thread range for the current thread. */ \
-	dim_t jc_start, jc_end; \
-	bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
-	const dim_t n_local = jc_end - jc_start; \
-\
-	/* Compute number of primary and leftover components of the JC loop. */ \
-	/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
-	const dim_t jc_left =   n_local % NC; \
-\
-	/* Loop over the n dimension (NC rows/columns at a time). */ \
-	/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
-	for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
-	{ \
-		/* Calculate the thread's current JC block dimension. */ \
-		const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
-\
-		ctype* b_jc = b_00 + jj * jcstep_b; \
-		ctype* c_jc = c_00 + jj * jcstep_c; \
-\
-		/* Grow the thrinfo_t tree. */ \
-		bszid_t*   bszids_pc = &bszids_jc[1]; \
-		               thread_pc = bli_thrinfo_sub_node( thread_jc ); \
-		bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
-\
-		/* Compute the PC loop thread range for the current thread. */ \
-		const dim_t pc_start = 0, pc_end = k; \
-		const dim_t k_local = k; \
-\
-		/* Compute number of primary and leftover components of the PC loop. */ \
-		/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
-		const dim_t pc_left =   k_local % KC; \
-\
-		/* Loop over the k dimension (KC rows/columns at a time). */ \
-		/*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \
-		for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
-		{ \
-			/* Calculate the thread's current PC block dimension. */ \
-			const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
-\
-			ctype* a_pc = a_00 + pp * pcstep_a; \
-			ctype* b_pc = b_jc + pp * pcstep_b; \
-\
-			/* Only apply beta to the first iteration of the pc loop. */ \
-			ctype* beta_use = ( pp == 0 ? &beta_local : &one_local ); \
-\
-			ctype* b_use; \
-			inc_t  rs_b_use, cs_b_use, ps_b_use; \
-\
-			/* Set the bszid_t array and thrinfo_t pointer based on whether
-			   we will be packing B. If we won't be packing B, we alias to
-			   the _pc variables so that code further down can unconditionally
-			   reference the _pb variables. Note that *if* we will be packing
-			   B, the thrinfo_t node will have already been created by a
-			   previous call to bli_thrinfo_grow(), since bszid values of
-			   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
-			   bszid that is a normal bszid_t value). */ \
-			bszid_t*   bszids_pb; \
-			if ( packb ) { bszids_pb = &bszids_pc[1]; \
-			               thread_pb = bli_thrinfo_sub_node( thread_pc ); } \
-			else         { bszids_pb = &bszids_pc[0]; \
-			               thread_pb = thread_pc; } \
-\
-			/* Determine the packing buffer and related parameters for matrix
-			   B. (If B will not be packed, then a_use will be set to point to
-			   b and the _b_use strides will be set accordingly.) Then call
-			   the packm sup variant chooser, which will call the appropriate
-			   implementation based on the schema deduced from the stor_id. */ \
-			PASTEMAC(ch,packm_sup_b) \
-			( \
-			  packb, \
-			  BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to */ \
-			  stor_id,                 /* a "panel of B."                  */ \
-			  BLIS_NO_TRANSPOSE, \
-			  KC,     NC,       /* This "panel of B" is (at most) KC x NC. */ \
-			  kc_cur, nc_cur, NR, \
-			  &one_local, \
-			  b_pc,   rs_b,      cs_b, \
-			  &b_use, &rs_b_use, &cs_b_use, \
-			                     &ps_b_use, \
-			  cntx, \
-			  rntm, \
-			  &mem_b, \
-			  thread_pb  \
-			); \
-\
-			/* Alias b_use so that it's clear this is our current block of
-			   matrix B. */ \
-			ctype* b_pc_use = b_use; \
-\
-			/* We don't need to embed the panel stride of B within the auxinfo_t
-			   object because this variant iterates through B in the jr loop,
-			   which occurs here, within the macrokernel, not within the
-			   millikernel. */ \
-			/*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \
-\
-			/* Grow the thrinfo_t tree. */ \
-			bszid_t*   bszids_ic = &bszids_pb[1]; \
-			               thread_ic = bli_thrinfo_sub_node( thread_pb ); \
-			bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
-\
-			/* Compute the IC loop thread range for the current thread. */ \
-			dim_t ic_start, ic_end; \
-			bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
-			const dim_t m_local = ic_end - ic_start; \
-\
-			/* Compute number of primary and leftover components of the IC loop. */ \
-			/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
-			const dim_t ic_left =   m_local % MC; \
-\
-			/* Loop over the m dimension (MC rows at a time). */ \
-			/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
-			for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
-			{ \
-				/* Calculate the thread's current IC block dimension. */ \
-				const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
-\
-				ctype* a_ic = a_pc + ii * icstep_a; \
-				ctype* c_ic = c_jc + ii * icstep_c; \
-\
-				ctype* a_use; \
-				inc_t  rs_a_use, cs_a_use, ps_a_use; \
-\
-				/* Set the bszid_t array and thrinfo_t pointer based on whether
-				   we will be packing B. If we won't be packing A, we alias to
-				   the _ic variables so that code further down can unconditionally
-				   reference the _pa variables. Note that *if* we will be packing
-				   A, the thrinfo_t node will have already been created by a
-				   previous call to bli_thrinfo_grow(), since bszid values of
-				   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
-				   bszid that is a normal bszid_t value). */ \
-				bszid_t*   bszids_pa; \
-				if ( packa ) { bszids_pa = &bszids_ic[1]; \
-							   thread_pa = bli_thrinfo_sub_node( thread_ic ); } \
-				else         { bszids_pa = &bszids_ic[0]; \
-							   thread_pa = thread_ic; } \
-\
-				/* Determine the packing buffer and related parameters for matrix
-				   A. (If A will not be packed, then a_use will be set to point to
-				   a and the _a_use strides will be set accordingly.) Then call
-				   the packm sup variant chooser, which will call the appropriate
-				   implementation based on the schema deduced from the stor_id. */ \
-				PASTEMAC(ch,packm_sup_a) \
-				( \
-				  packa, \
-				  BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \
-				  stor_id,                 /* a "block of A."                  */ \
-				  BLIS_NO_TRANSPOSE, \
-				  MC,     KC,       /* This "block of A" is (at most) MC x KC. */ \
-				  mc_cur, kc_cur, MR, \
-				  &one_local, \
-				  a_ic,   rs_a,      cs_a, \
-				  &a_use, &rs_a_use, &cs_a_use, \
-				                     &ps_a_use, \
-				  cntx, \
-				  rntm, \
-				  &mem_a, \
-				  thread_pa  \
-				); \
-\
-				/* Alias a_use so that it's clear this is our current block of
-				   matrix A. */ \
-				ctype* a_ic_use = a_use; \
-\
-				/* Embed the panel stride of A within the auxinfo_t object. The
-				   millikernel will query and use this to iterate through
-				   micropanels of A (if needed). */ \
-				bli_auxinfo_set_ps_a( ps_a_use, &aux ); \
-\
-				/* Grow the thrinfo_t tree. */ \
-				bszid_t*   bszids_jr = &bszids_pa[1]; \
-				               thread_jr = bli_thrinfo_sub_node( thread_pa ); \
-				bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
-\
-				/* Compute number of primary and leftover components of the JR loop. */ \
-				dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
-				dim_t jr_left =   nc_cur % NR; \
-\
-				/* An optimization: allow the last jr iteration to contain up to NRE
-				   columns of C and B. (If NRE > NR, the mkernel has agreed to handle
-				   these cases.) Note that this prevents us from declaring jr_iter and
-				   jr_left as const. NOTE: We forgo this optimization when packing B
-				   since packing an extended edge case is not yet supported. */ \
-				if ( !packb && !is_mt ) \
-				if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
-				{ \
-					jr_iter--; jr_left += NR; \
-				} \
-\
-				/* Compute the JR loop thread range for the current thread. */ \
-				dim_t jr_start, jr_end; \
-				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
-\
-				/* Loop over the n dimension (NR columns at a time). */ \
-				/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
-				for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
-				{ \
-					const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
-\
-					/*
-					ctype* b_jr = b_pc_use + j * jrstep_b; \
-					*/ \
-					ctype* b_jr = b_pc_use + j * ps_b_use; \
-					ctype* c_jr = c_ic     + j * jrstep_c; \
-\
-					/*
-					const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
-					const dim_t ir_left =   mc_cur % MR; \
-					*/ \
-\
-					/* Loop over the m dimension (MR rows at a time). */ \
-					{ \
-						/* Invoke the gemmsup millikernel. */ \
-						gemmsup_ker \
-						( \
-						  conja, \
-						  conjb, \
-						  mc_cur, \
-						  nr_cur, \
-						  kc_cur, \
-						  alpha_cast, \
-						  a_ic_use, rs_a_use, cs_a_use, \
-						  b_jr,     rs_b_use, cs_b_use, \
-						  beta_use, \
-						            c_jr,     rs_c,     cs_c, \
-						  &aux, \
-						  cntx  \
-						); \
-					} \
-				} \
-			} \
-\
-			/* NOTE: This barrier is only needed if we are packing B (since
-			   that matrix is packed within the pc loop of this variant). */ \
-			if ( packb ) bli_thread_barrier( thread_pb ); \
-		} \
-	} \
-\
-	/* Release any memory that was acquired for packing matrices A and B. */ \
-	PASTEMAC(ch,packm_sup_finalize_mem_a) \
-	( \
-	  packa, \
-	  rntm, \
-	  &mem_a, \
-	  thread_pa  \
-	); \
-	PASTEMAC(ch,packm_sup_finalize_mem_b) \
-	( \
-	  packb, \
-	  rntm, \
-	  &mem_b, \
-	  thread_pb  \
-	); \
-\
 /*
-PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
-*/ \
+PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" );
+PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" );
+PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" );
+*/
 }
 
-INSERT_GENTFUNC_BASIC0( gemmsup_ref_var2m )
-
diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h
index df9a747ab..be6b17f39 100644
--- a/frame/3/bli_l3_sup_vars.h
+++ b/frame/3/bli_l3_sup_vars.h
@@ -89,32 +89,6 @@ void PASTEMAC(ch,varname) \
 INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 )
 INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 )
 
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       bool       packa, \
-       bool       packb, \
-       conj_t     conja, \
-       conj_t     conjb, \
-       dim_t      m, \
-       dim_t      n, \
-       dim_t      k, \
-       void*      alpha, \
-       void*      a, inc_t rs_a, inc_t cs_a, \
-       void*      b, inc_t rs_b, inc_t cs_b, \
-       void*      beta, \
-       void*      c, inc_t rs_c, inc_t cs_c, \
-       stor3_t    eff_id, \
-       cntx_t*    cntx, \
-       rntm_t*    rntm, \
-       thrinfo_t* thread  \
-     );
-
-INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n )
-INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m )
-
 // -----------------------------------------------------------------------------
 
 BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases
diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c
index 1ae904abf..5f992bd67 100644
--- a/frame/3/gemm/bli_gemm_front.c
+++ b/frame/3/gemm/bli_gemm_front.c
@@ -53,22 +53,6 @@ void bli_gemm_front
 	obj_t   b_local;
 	obj_t   c_local;
 
-	// If C has a zero dimension, return early.
-	if ( bli_obj_has_zero_dim( c ) )
-	{
-		return;
-	}
-
-	// If alpha is zero, or if A or B has a zero dimension, scale C by beta
-	// and return early.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
-	     bli_obj_has_zero_dim( a ) ||
-	     bli_obj_has_zero_dim( b ) )
-	{
-		bli_scalm( beta, c );
-		return;
-	}
-
 #if 0
 #ifdef BLIS_ENABLE_SMALL_MATRIX
 	// Only handle small problems separately for homogeneous datatypes.
diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c
index e291b5f27..49b32c976 100644
--- a/frame/3/gemmt/bli_gemmt_front.c
+++ b/frame/3/gemmt/bli_gemmt_front.c
@@ -53,22 +53,6 @@ void bli_gemmt_front
 	obj_t   b_local;
 	obj_t   c_local;
 
-	// If C has a zero dimension, return early.
-	if ( bli_obj_has_zero_dim( c ) )
-	{
-		return;
-	}
-
-	// If alpha is zero, or if A or B has a zero dimension, scale C by beta
-	// and return early.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
-	     bli_obj_has_zero_dim( a ) ||
-	     bli_obj_has_zero_dim( b ) )
-	{
-		bli_scalm( beta, c );
-		return;
-	}
-
 	// Alias A, B, and C in case we need to apply transformations.
 	bli_obj_alias_to( a, &a_local );
 	bli_obj_alias_to( b, &b_local );

From a1a5a9b4cbef9208da494c45a2f933a8e82559ac Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 21 Sep 2022 18:31:01 -0500
Subject: [PATCH 086/230] Implemented support for fat multithreading. (#665)

Details:
- Allow the user to configure BLIS in such a way that multiple threading
  implementations get compiled into the library, with one of those
  implementations chosen at runtime. For now, there are only three
  implementations available: OpenMP, pthreads, and single. (Here,
  'single' merely refers to single-threaded mode.) The configure script
  now allows the user to give the -t option with a comma-separated list
  of values, such as '-t openmp,pthreads'. The first value in the list
  will always be the default at library initialization time, and
  'single' is always silently appended to the end of the list. The user
  can specify which implementation should execute in one of three ways:
  by setting the BLIS_THREAD_IMPL environment variable prior to launch;
  by calling the bli_thread_set_thread_impl() global runtime API; or by
  encoding their choice into a rntm_t that is passed into one of the
  expert interfaces. Any of these three choices overrides the
  initialization-time default (i.e., the first value listed to the -t
  configure option). Requesting an implementation that was not compiled
  into the library will result in an error message followed by
  bli_abort().
- Relocated the 'auto' logic for the -t option from the top-level
  Makefile to the configure script. (Currently, this logic is pretty
  dumb, choosing 'openmp' for gcc and icc, and 'pthreads' for clang.)
- Defined a new 'timpl_t' enum in bli_type_defs.h, with three valid
  values: BLIS_SINGLE, BLIS_OPENMP, BLIS_POSIX.
- Reorganized the thrcomm_t struct into a single definition with two
  preprocessor blocks, one each for additional fields needed by OpenMP
  and pthreads.
- Added timpl_t argument to bli_thrcomm_bcast(), bli_thrcomm_barrier(),
  bli_thrcomm_init(), and bli_thrcomm_cleanup(), which these functions
  need since they are now wrappers that choose the implementation-
  specific function corresponding to the currently enabled threading
  implementation.
- Added rntm_t* to bli_thread_broadcast(), bli_thread_barrier() so that
  those functions can pass the timpl_t value into bli_thrcomm_bcast()
  and bli_thrcomm_barrier(), respectively.
- Defined bli_env_get_str() in bli_env.c to allow the querying of
  BLIS_THREAD_IMPL (which, unlike BLIS_NUM_THREADS and friends, is
  expected to be a string).
- Defined bli_thread_get_thread_impl(), bli_thread_set_thread_impl() to
  get and set the current threading implementation at runtime.
- Defined bli_rntm_thread_impl() and bli_rntm_set_thread_impl() to query
  and set the threading implementation within a rntm_t. Also choose
  BLIS_SINGLE as the default value when initializing rntm_t structs.
- Added bli_info_get_*() functions to query whether OpenMP or pthreads
  would be chosen as the default at init-time. Note that this only
  tests whether OpenMP or pthreads is the first implementation in the
  list passed to the threading configure option (-t) and is *not* the
  same as querying which implementation is currently selected, since
  that can be influenced by BLIS_THREAD_IMPL and/or
  bli_thread_set_thread_impl().
- Changed l3int_t to l3int_ft.
- Updated docs/Multithreading.md to document the new behavior.
- Updated sandbox/gemmlike and addon/gemmd to work with the new fat
  threading feature. This included a few bugfixes to bring the codes up
  to date, as necessary.
- Comment, whitespace updates.
---
 Makefile                                      |  34 +--
 addon/gemmd/attic/bli_gemm_ex.c               |  17 +-
 addon/gemmd/bao_gemmd.c                       |  16 +-
 addon/gemmd/bao_gemmd_bp_var1.c               |   2 +-
 addon/gemmd/bao_l3_packm_a.c                  |   8 +-
 addon/gemmd/bao_l3_packm_b.c                  |   8 +-
 addon/gemmd/thread/bao_l3_decor.c             | 150 ++++++++++++
 addon/gemmd/thread/bao_l3_decor.h             |  58 +++--
 addon/gemmd/thread/bao_l3_decor_openmp.c      |  15 +-
 addon/gemmd/thread/bao_l3_decor_openmp.h      |  17 +-
 addon/gemmd/thread/bao_l3_decor_pthreads.c    |  15 +-
 addon/gemmd/thread/bao_l3_decor_pthreads.h    |  17 +-
 addon/gemmd/thread/bao_l3_decor_single.c      |   8 +-
 addon/gemmd/thread/bao_l3_decor_single.h      |  25 +-
 build/bli_config.h.in                         |   6 +
 build/config.mk.in                            |   2 +-
 common.mk                                     |  44 ++--
 configure                                     | 221 +++++++++++++++---
 docs/Multithreading.md                        |  81 ++++++-
 frame/1m/packm/bli_packm_alloc.c              |   4 +-
 frame/1m/packm/bli_packm_int.c                |   4 +-
 frame/1m/unpackm/bli_unpackm_int.c            |   3 +-
 frame/1m/unpackm/bli_unpackm_int.h            |   1 +
 frame/3/bli_l3_int.c                          |   4 +-
 frame/3/bli_l3_sup_packm.c                    |   8 +-
 frame/3/bli_l3_sup_packm_var.c                |   8 +-
 frame/3/bli_l3_sup_var1n2m.c                  |  14 +-
 frame/3/gemm/bli_gemm_blk_var3.c              |   2 +-
 frame/3/trsm/bli_trsm_blk_var1.c              |   2 +-
 frame/3/trsm/bli_trsm_blk_var3.c              |   2 +-
 frame/base/bli_env.c                          |   8 +
 frame/base/bli_env.h                          |   1 +
 frame/base/bli_info.c                         |  16 ++
 frame/base/bli_info.h                         |   2 +
 frame/base/bli_rntm.c                         |  22 +-
 frame/base/bli_rntm.h                         |  29 ++-
 frame/include/bli_config_macro_defs.h         |  11 +-
 frame/include/bli_type_defs.h                 |  16 ++
 frame/thread/bli_l3_decor.c                   | 176 ++++++++++++++
 frame/thread/bli_l3_decor.h                   |  44 ++--
 frame/thread/bli_l3_decor_openmp.c            |  39 ++--
 frame/thread/bli_l3_decor_openmp.h            |  14 ++
 frame/thread/bli_l3_decor_pthreads.c          |  44 ++--
 frame/thread/bli_l3_decor_pthreads.h          |  14 ++
 frame/thread/bli_l3_decor_single.c            |  45 ++--
 frame/thread/bli_l3_decor_single.h            |  17 +-
 frame/thread/bli_l3_sup_decor.c               | 137 +++++++++++
 frame/thread/bli_l3_sup_decor.h               |  39 +++-
 frame/thread/bli_l3_sup_decor_openmp.c        |  25 +-
 frame/thread/bli_l3_sup_decor_openmp.h        |  13 ++
 frame/thread/bli_l3_sup_decor_pthreads.c      |  53 +++--
 frame/thread/bli_l3_sup_decor_pthreads.h      |  13 ++
 frame/thread/bli_l3_sup_decor_single.c        |  24 +-
 frame/thread/bli_l3_sup_decor_single.h        |  16 +-
 frame/thread/bli_thrcomm.c                    | 179 +++++++++++++-
 frame/thread/bli_thrcomm.h                    |  90 ++++++-
 frame/thread/bli_thrcomm_openmp.c             |  76 +++---
 frame/thread/bli_thrcomm_openmp.h             |  49 +---
 frame/thread/bli_thrcomm_pthreads.c           |  43 +---
 frame/thread/bli_thrcomm_pthreads.h           |  33 +--
 frame/thread/bli_thrcomm_single.c             |  37 +--
 frame/thread/bli_thrcomm_single.h             |  44 +---
 frame/thread/bli_thread.c                     | 113 +++++++--
 frame/thread/bli_thread.h                     |  24 +-
 frame/thread/bli_thrinfo.c                    |  12 +-
 frame/thread/bli_thrinfo.h                    |  16 +-
 frame/thread/bli_thrinfo_sup.c                |   6 +-
 sandbox/gemmlike/bls_gemm.c                   |  24 +-
 sandbox/gemmlike/bls_gemm.h                   |  24 +-
 sandbox/gemmlike/bls_gemm_bp_var1.c           |   2 +-
 sandbox/gemmlike/bls_gemm_check.c             |  12 +-
 sandbox/gemmlike/bls_gemm_check.h             |  12 +-
 sandbox/gemmlike/bls_l3_packm_a.c             |   8 +-
 sandbox/gemmlike/bls_l3_packm_b.c             |   8 +-
 sandbox/gemmlike/thread/bls_l3_decor.c        | 148 ++++++++++++
 sandbox/gemmlike/thread/bls_l3_decor.h        |  43 ++--
 sandbox/gemmlike/thread/bls_l3_decor_openmp.c |  15 +-
 sandbox/gemmlike/thread/bls_l3_decor_openmp.h |  13 ++
 .../gemmlike/thread/bls_l3_decor_pthreads.c   |  55 +++--
 .../gemmlike/thread/bls_l3_decor_pthreads.h   |  13 ++
 sandbox/gemmlike/thread/bls_l3_decor_single.c |  34 ++-
 sandbox/gemmlike/thread/bls_l3_decor_single.h |  18 +-
 testsuite/src/test_libblis.c                  |  35 ++-
 83 files changed, 2083 insertions(+), 717 deletions(-)
 create mode 100644 addon/gemmd/thread/bao_l3_decor.c
 create mode 100644 frame/thread/bli_l3_decor.c
 create mode 100644 frame/thread/bli_l3_sup_decor.c
 create mode 100644 sandbox/gemmlike/thread/bls_l3_decor.c

diff --git a/Makefile b/Makefile
index f5396f79b..5c4a32b59 100644
--- a/Makefile
+++ b/Makefile
@@ -1149,24 +1149,24 @@ endif # ifeq ($(IS_WIN),no)
 # --- Query current configuration ---
 
 showconfig: check-env
-	@echo "configuration family:    $(CONFIG_NAME)"
-	@echo "sub-configurations:      $(CONFIG_LIST)"
-	@echo "requisite kernels sets:  $(KERNEL_LIST)"
-	@echo "kernel-to-config map:    $(KCONFIG_MAP)"
+	@echo "configuration family:       $(CONFIG_NAME)"
+	@echo "sub-configurations:         $(CONFIG_LIST)"
+	@echo "requisite kernels sets:     $(KERNEL_LIST)"
+	@echo "kernel-to-config map:       $(KCONFIG_MAP)"
 	@echo "-------------------------"
-	@echo "BLIS version string:     $(VERSION)"
-	@echo ".so major version:       $(SO_MAJOR)"
-	@echo ".so minor.build vers:    $(SO_MINORB)"
-	@echo "install libdir:          $(INSTALL_LIBDIR)"
-	@echo "install includedir:      $(INSTALL_INCDIR)"
-	@echo "install sharedir:        $(INSTALL_SHAREDIR)"
-	@echo "debugging status:        $(DEBUG_TYPE)"
-	@echo "multithreading status:   $(THREADING_MODEL)"
-	@echo "enable BLAS API?         $(MK_ENABLE_BLAS)"
-	@echo "enable CBLAS API?        $(MK_ENABLE_CBLAS)"
-	@echo "build static library?    $(MK_ENABLE_STATIC)"
-	@echo "build shared library?    $(MK_ENABLE_SHARED)"
-	@echo "ARG_MAX hack enabled?    $(ARG_MAX_HACK)"
+	@echo "BLIS version string:        $(VERSION)"
+	@echo ".so major version:          $(SO_MAJOR)"
+	@echo ".so minor.build vers:       $(SO_MINORB)"
+	@echo "install libdir:             $(INSTALL_LIBDIR)"
+	@echo "install includedir:         $(INSTALL_INCDIR)"
+	@echo "install sharedir:           $(INSTALL_SHAREDIR)"
+	@echo "debugging status:           $(DEBUG_TYPE)"
+	@echo "enabled threading model(s): $(THREADING_MODEL)"
+	@echo "enable BLAS API?            $(MK_ENABLE_BLAS)"
+	@echo "enable CBLAS API?           $(MK_ENABLE_CBLAS)"
+	@echo "build static library?       $(MK_ENABLE_STATIC)"
+	@echo "build shared library?       $(MK_ENABLE_SHARED)"
+	@echo "ARG_MAX hack enabled?       $(ARG_MAX_HACK)"
 
 
 # --- Clean rules ---
diff --git a/addon/gemmd/attic/bli_gemm_ex.c b/addon/gemmd/attic/bli_gemm_ex.c
index 0f40d1cb3..8b7d11d81 100644
--- a/addon/gemmd/attic/bli_gemm_ex.c
+++ b/addon/gemmd/attic/bli_gemm_ex.c
@@ -36,13 +36,13 @@
 
 void bli_gemm_ex
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -82,7 +82,8 @@ void bli_gemm_ex
 	// Invoke the operation's front end.
 	bli_gemm_front
 	(
-	  alpha, a, b, beta, c, cntx, rntm, NULL
+	  ( obj_t* )alpha, ( obj_t* )a, ( obj_t* )b, ( obj_t* )beta, ( obj_t* )c,
+	  ( cntx_t* )cntx, ( rntm_t* )rntm, NULL
 	);
 }
 
diff --git a/addon/gemmd/bao_gemmd.c b/addon/gemmd/bao_gemmd.c
index 01185a9d7..8379ff6d4 100644
--- a/addon/gemmd/bao_gemmd.c
+++ b/addon/gemmd/bao_gemmd.c
@@ -81,16 +81,28 @@ void bao_gemmd_ex
 	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
 	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
 
+	// Set the .pack_a and .pack_b fields to TRUE. This is only needed because
+	// this addon uses bli_thrinfo_sup_grow(), which calls
+	// bli_thrinfo_sup_create_for_cntl(), which employs an optimization if
+	// both fields are FALSE (as is often the case with sup). However, this
+	// addon implements the "large" code path, and so both A and B must
+	// always be packed. Setting the fields to TRUE will avoid the optimization
+	// while this addon implementation executes (and it also reinforces the
+	// fact that we *are* indeed packing A and B, albeit not in the sup context
+	// originally envisioned for the .pack_a and .pack_b fields).
+	bli_rntm_set_pack_a( TRUE, rntm );
+	bli_rntm_set_pack_b( TRUE, rntm );
+
 	// Obtain a valid (native) context from the gks if necessary.
 	// NOTE: This must be done before calling the _check() function, since
 	// that function assumes the context pointer is valid.
-	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+	if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
 
 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
 		bao_gemmd_check( alpha, a, d, b, beta, c, cntx );
 
-	// -- bli_gemmd_front() ----------------------------------------------------
+	// -- bao_gemmd_front() ----------------------------------------------------
 
 	obj_t a_local;
 	obj_t b_local;
diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/gemmd/bao_gemmd_bp_var1.c
index 689471367..e3f47982c 100644
--- a/addon/gemmd/bao_gemmd_bp_var1.c
+++ b/addon/gemmd/bao_gemmd_bp_var1.c
@@ -458,7 +458,7 @@ void PASTECH2(bao_,ch,varname) \
 			/* This barrier is needed to prevent threads from starting to pack
 			   the next row panel of B before the current row panel is fully
 			   computed upon. */ \
-			bli_thread_barrier( thread_pb ); \
+			bli_thread_barrier( rntm, thread_pb ); \
 		} \
 	} \
 \
diff --git a/addon/gemmd/bao_l3_packm_a.c b/addon/gemmd/bao_l3_packm_a.c
index 49bb34664..1d6502884 100644
--- a/addon/gemmd/bao_l3_packm_a.c
+++ b/addon/gemmd/bao_l3_packm_a.c
@@ -61,7 +61,7 @@ void PASTECH2(bao_,ch,opname) \
 \
 	/* Barrier to make sure all threads are caught up and ready to begin the
 	   packm stage. */ \
-	bli_thread_barrier( thread ); \
+	bli_thread_barrier( rntm, thread ); \
 \
 	/* Compute the size of the memory block eneded. */ \
 	siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
@@ -90,7 +90,7 @@ void PASTECH2(bao_,ch,opname) \
 \
 		/* Broadcast the address of the chief thread's passed-in mem_t to all
 		   threads. */ \
-		mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
+		mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
 \
 		/* Non-chief threads: Copy the contents of the chief thread's
 		   passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -139,7 +139,7 @@ void PASTECH2(bao_,ch,opname) \
 \
 			/* Broadcast the address of the chief thread's passed-in mem_t
 			   to all threads. */ \
-			mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
+			mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
 \
 			/* Non-chief threads: Copy the contents of the chief thread's
 			   passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -319,7 +319,7 @@ void PASTECH2(bao_,ch,opname) \
 	); \
 \
 	/* Barrier so that packing is done before computation. */ \
-	bli_thread_barrier( thread ); \
+	bli_thread_barrier( rntm, thread ); \
 }
 
 //INSERT_GENTFUNC_BASIC0( packm_a )
diff --git a/addon/gemmd/bao_l3_packm_b.c b/addon/gemmd/bao_l3_packm_b.c
index c41b062b6..8d020007c 100644
--- a/addon/gemmd/bao_l3_packm_b.c
+++ b/addon/gemmd/bao_l3_packm_b.c
@@ -61,7 +61,7 @@ void PASTECH2(bao_,ch,opname) \
 \
 	/* Barrier to make sure all threads are caught up and ready to begin the
 	   packm stage. */ \
-	bli_thread_barrier( thread ); \
+	bli_thread_barrier( rntm, thread ); \
 \
 	/* Compute the size of the memory block eneded. */ \
 	siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
@@ -90,7 +90,7 @@ void PASTECH2(bao_,ch,opname) \
 \
 		/* Broadcast the address of the chief thread's passed-in mem_t to all
 		   threads. */ \
-		mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
+		mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
 \
 		/* Non-chief threads: Copy the contents of the chief thread's
 		   passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -139,7 +139,7 @@ void PASTECH2(bao_,ch,opname) \
 \
 			/* Broadcast the address of the chief thread's passed-in mem_t
 			   to all threads. */ \
-			mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
+			mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
 \
 			/* Non-chief threads: Copy the contents of the chief thread's
 			   passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -319,7 +319,7 @@ void PASTECH2(bao_,ch,opname) \
 	); \
 \
 	/* Barrier so that packing is done before computation. */ \
-	bli_thread_barrier( thread ); \
+	bli_thread_barrier( rntm, thread ); \
 }
 
 //INSERT_GENTFUNC_BASIC0( packm_b )
diff --git a/addon/gemmd/thread/bao_l3_decor.c b/addon/gemmd/thread/bao_l3_decor.c
new file mode 100644
index 000000000..ff510b6f3
--- /dev/null
+++ b/addon/gemmd/thread/bao_l3_decor.c
@@ -0,0 +1,150 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+// Initialize a function pointer array containing function addresses for
+// each of the threading-specific level-3 thread decorators.
+
+static l3ao_decor_ft l3ao_decor_fpa[ BLIS_NUM_THREAD_IMPLS ] =
+{
+	[BLIS_SINGLE] = bao_l3_thread_decorator_single,
+	[BLIS_OPENMP] =
+#if   defined(BLIS_ENABLE_OPENMP)
+	                bao_l3_thread_decorator_openmp,
+#elif defined(BLIS_ENABLE_PTHREADS)
+	                NULL,
+#else
+	                NULL,
+#endif
+	[BLIS_POSIX]  =
+#if   defined(BLIS_ENABLE_PTHREADS)
+	                bao_l3_thread_decorator_pthreads,
+#elif defined(BLIS_ENABLE_OPENMP)
+	                NULL,
+#else
+	                NULL,
+#endif
+};
+
+// Define a dispatcher that chooses a threading-specific function from the
+// above function pointer array.
+
+void bao_l3_thread_decorator
+     (
+       l3aoint_ft func,
+       opid_t   family,
+       obj_t*   alpha,
+       obj_t*   a,
+       obj_t*   d,
+       obj_t*   b,
+       obj_t*   beta,
+       obj_t*   c,
+       cntx_t*  cntx,
+       rntm_t*  rntm
+     )
+{
+	rntm_t rntm_l;
+
+	// Query the threading implementation and the number of threads requested.
+	timpl_t ti = bli_rntm_thread_impl( rntm );
+	dim_t   nt = bli_rntm_num_threads( rntm );
+
+	if ( bli_error_checking_is_enabled() )
+		bao_l3_thread_decorator_check( rntm );
+
+	if ( 1 < nt && ti == BLIS_SINGLE )
+	{
+		// Here, we resolve conflicting information. The caller requested
+		// a sequential threading implementation, but also requested more
+		// than one thread. Here, we choose to favor the requested threading
+		// implementation over the number of threads, and so reset all
+		// parallelism parameters to 1.
+		rntm_l = *rntm;
+		nt = 1;
+		bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l );
+		bli_rntm_set_num_threads_only( 1, &rntm_l );
+		rntm = &rntm_l;
+	}
+
+	// Use the timpl_t value to index into the corresponding function address
+	// from the function pointer array.
+	const l3ao_decor_ft fp = l3ao_decor_fpa[ ti ];
+
+	// Call the threading-specific decorator function.
+	fp
+	(
+	  func,
+	  family,
+	  alpha,
+	  a,
+	  d,
+	  b,
+	  beta,
+	  c,
+	  cntx,
+	  rntm
+	);
+}
+
+void bao_l3_thread_decorator_check
+     (
+       rntm_t* rntm
+     )
+{
+	//err_t e_val;
+
+	//e_val = bli_check_valid_thread_impl( bli_rntm_thread_impl( rntm ) );
+	//bli_check_error_code( e_val );
+
+	const timpl_t ti = bli_rntm_thread_impl( rntm );
+
+	if (
+#ifndef BLIS_ENABLE_OPENMP
+	    ti == BLIS_OPENMP ||
+#endif
+#ifndef BLIS_ENABLE_PTHREADS
+	    ti == BLIS_POSIX ||
+#endif
+	    FALSE
+	   )
+	{
+		fprintf( stderr, "\n" );
+		fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) );
+		fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) );
+		fprintf( stderr, "libblis: %s: line %d\n", __FILE__, ( int )__LINE__ );
+		bli_abort();
+	}
+}
+
diff --git a/addon/gemmd/thread/bao_l3_decor.h b/addon/gemmd/thread/bao_l3_decor.h
index b4fd2b9b7..4c087bdb6 100644
--- a/addon/gemmd/thread/bao_l3_decor.h
+++ b/addon/gemmd/thread/bao_l3_decor.h
@@ -4,7 +4,8 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2021, The University of Texas at Austin
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,18 +33,13 @@
 
 */
 
-#ifndef BLIS_SBX_L3_DECOR_H
-#define BLIS_SBX_L3_DECOR_H
-
-// -- sup definitions ----------------------------------------------------------
-
-// Level-3 sup internal function type.
-typedef void (*l3sbxint_t)
+// Level-3 internal function type.
+typedef void (*l3aoint_ft)
      (
        obj_t*     alpha,
        obj_t*     a,
-       obj_t*     d,
        obj_t*     b,
+       obj_t*     d,
        obj_t*     beta,
        obj_t*     c,
        cntx_t*    cntx,
@@ -51,19 +47,39 @@ typedef void (*l3sbxint_t)
        thrinfo_t* thread
      );
 
-// Level-3 sup thread decorator prototype.
+// Level-3 thread decorator function type.
+typedef void (*l3ao_decor_ft)
+     (
+       l3aoint_ft func,
+       opid_t   family,
+       obj_t*   alpha,
+       obj_t*   a,
+       obj_t*   b,
+       obj_t*   d,
+       obj_t*   beta,
+       obj_t*   c,
+       cntx_t*  cntx,
+       rntm_t*  rntm
+     );
+
+// Level-3 thread decorator prototype.
 void bao_l3_thread_decorator
      (
-       l3sbxint_t func,
-       opid_t     family,
-       obj_t*     alpha,
-       obj_t*     a,
-       obj_t*     d,
-       obj_t*     b,
-       obj_t*     beta,
-       obj_t*     c,
-       cntx_t*    cntx,
-       rntm_t*    rntm
+       l3aoint_ft func,
+       opid_t   family,
+       obj_t*   alpha,
+       obj_t*   a,
+       obj_t*   b,
+       obj_t*   d,
+       obj_t*   beta,
+       obj_t*   c,
+       cntx_t*  cntx,
+       rntm_t*  rntm
+     );
+
+void bao_l3_thread_decorator_check
+     (
+       rntm_t* rntm
      );
 
 // Include definitions specific to the method of multithreading.
@@ -71,5 +87,3 @@ void bao_l3_thread_decorator
 #include "bao_l3_decor_openmp.h"
 #include "bao_l3_decor_pthreads.h"
 
-#endif
-
diff --git a/addon/gemmd/thread/bao_l3_decor_openmp.c b/addon/gemmd/thread/bao_l3_decor_openmp.c
index 1aca8de27..7deee95ed 100644
--- a/addon/gemmd/thread/bao_l3_decor_openmp.c
+++ b/addon/gemmd/thread/bao_l3_decor_openmp.c
@@ -36,16 +36,11 @@
 
 #ifdef BLIS_ENABLE_OPENMP
 
-// Define a dummy thread entry function, which is needed in the pthreads
-// version, so that when building Windows DLLs (with OpenMP enabled or with
-// no multithreading) we don't risk having an unresolved symbol.
-void* bao_l3_thread_entry( void* data_void ) { return NULL; }
-
 //#define PRINT_THRINFO
 
-void bao_l3_thread_decorator
+void bao_l3_thread_decorator_openmp
      (
-       l3sbxint_t func,
+       l3aoint_ft func,
        opid_t     family,
        obj_t*     alpha,
        obj_t*     a,
@@ -66,7 +61,7 @@ void bao_l3_thread_decorator
 	// with an internal lock to ensure only one application thread accesses
 	// the sba at a time. bli_sba_checkout_array() will also automatically
 	// resize the array_t, if necessary.
-	array_t* restrict array = bli_sba_checkout_array( n_threads );
+	array_t* array = bli_sba_checkout_array( n_threads );
 
 	// Access the pool_t* for thread 0 and embed it into the rntm. We do
 	// this up-front only so that we have the rntm_t.sba_pool field
@@ -79,7 +74,7 @@ void bao_l3_thread_decorator
 	bli_pba_rntm_set_pba( rntm );
 
 	// Allcoate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
+	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
 
 
 	_Pragma( "omp parallel num_threads(n_threads)" )
@@ -94,8 +89,6 @@ void bao_l3_thread_decorator
 		const dim_t tid = omp_get_thread_num();
 
 		// Check for a somewhat obscure OpenMP thread-mistmatch issue.
-		// NOTE: This calls the same function used for the conventional/large
-		// code path.
 		bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
 
 		// Use the thread id to access the appropriate pool_t* within the
diff --git a/addon/gemmd/thread/bao_l3_decor_openmp.h b/addon/gemmd/thread/bao_l3_decor_openmp.h
index 9c956d7c3..4ed3e7efc 100644
--- a/addon/gemmd/thread/bao_l3_decor_openmp.h
+++ b/addon/gemmd/thread/bao_l3_decor_openmp.h
@@ -32,13 +32,22 @@
 
 */
 
-#ifndef BLIS_SBX_L3_DECOR_OPENMP_H
-#define BLIS_SBX_L3_DECOR_OPENMP_H
-
 // Definitions specific to situations when OpenMP multithreading is enabled.
 #ifdef BLIS_ENABLE_OPENMP
 
-#endif
+void bao_l3_thread_decorator_openmp
+     (
+       l3aoint_ft func,
+       opid_t     family,
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     d,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm
+     );
 
 #endif
 
diff --git a/addon/gemmd/thread/bao_l3_decor_pthreads.c b/addon/gemmd/thread/bao_l3_decor_pthreads.c
index 587b8400f..dfbfbaa61 100644
--- a/addon/gemmd/thread/bao_l3_decor_pthreads.c
+++ b/addon/gemmd/thread/bao_l3_decor_pthreads.c
@@ -39,7 +39,7 @@
 // A data structure to assist in passing operands to additional threads.
 typedef struct thread_data
 {
-	l3sbxint_t func;
+	l3aoint_ft func;
 	opid_t     family;
 	obj_t*     alpha;
 	obj_t*     a;
@@ -59,7 +59,7 @@ void* bao_l3_thread_entry( void* data_void )
 {
 	thread_data_t* data     = data_void;
 
-	l3sbxint_t     func     = data->func;
+	l3aoint_ft     func     = data->func;
 	opid_t         family   = data->family;
 	obj_t*         alpha    = data->alpha;
 	obj_t*         a        = data->a;
@@ -111,9 +111,9 @@ void* bao_l3_thread_entry( void* data_void )
 	return NULL;
 }
 
-void bao_l3_thread_decorator
+void bao_l3_thread_decorator_pthreads
      (
-       l3sbxint_t func,
+       l3aoint_ft func,
        opid_t     family,
        obj_t*     alpha,
        obj_t*     a,
@@ -216,5 +216,12 @@ void bao_l3_thread_decorator
 	bli_free_intl( datas );
 }
 
+#else
+
+// Define a dummy function bao_l3_thread_entry(), which is needed for
+// consistent dynamic linking behavior when building shared objects in Linux
+// or OSX, or Windows DLLs; otherwise, we risk having an unresolved symbol.
+void* bao_l3_thread_entry( void* data_void ) { return NULL; }
+
 #endif
 
diff --git a/addon/gemmd/thread/bao_l3_decor_pthreads.h b/addon/gemmd/thread/bao_l3_decor_pthreads.h
index 69adec45e..1c0b58900 100644
--- a/addon/gemmd/thread/bao_l3_decor_pthreads.h
+++ b/addon/gemmd/thread/bao_l3_decor_pthreads.h
@@ -32,16 +32,25 @@
 
 */
 
-#ifndef BLIS_SBX_L3_DECOR_PTHREADS_H
-#define BLIS_SBX_L3_DECOR_PTHREADS_H
-
 // Definitions specific to situations when POSIX multithreading is enabled.
 #ifdef BLIS_ENABLE_PTHREADS
 
 // Thread entry point prototype.
 void* bao_l3_thread_entry( void* data_void );
 
-#endif
+void bao_l3_thread_decorator_pthreads
+     (
+       l3aoint_ft func,
+       opid_t     family,
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     d,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm
+     );
 
 #endif
 
diff --git a/addon/gemmd/thread/bao_l3_decor_single.c b/addon/gemmd/thread/bao_l3_decor_single.c
index d60891d65..362c1e68c 100644
--- a/addon/gemmd/thread/bao_l3_decor_single.c
+++ b/addon/gemmd/thread/bao_l3_decor_single.c
@@ -34,13 +34,11 @@
 
 #include "blis.h"
 
-#ifndef BLIS_ENABLE_MULTITHREADING
-
 #define SKIP_THRINFO_TREE
 
-void bao_l3_thread_decorator
+void bao_l3_thread_decorator_single
      (
-       l3sbxint_t func,
+       l3aoint_ft func,
        opid_t     family,
        //pack_t     schema_a,
        //pack_t     schema_b,
@@ -139,5 +137,3 @@ void bao_l3_thread_decorator
 	bli_sba_checkin_array( array );
 }
 
-#endif
-
diff --git a/addon/gemmd/thread/bao_l3_decor_single.h b/addon/gemmd/thread/bao_l3_decor_single.h
index 211a43a89..813bb6d75 100644
--- a/addon/gemmd/thread/bao_l3_decor_single.h
+++ b/addon/gemmd/thread/bao_l3_decor_single.h
@@ -32,13 +32,18 @@
 
 */
 
-#ifndef BLIS_SBX_L3_DECOR_SINGLE_H
-#define BLIS_SBX_L3_DECOR_SINGLE_H
-
-// Definitions specific to situations when multithreading is disabled.
-#ifndef BLIS_ENABLE_MULTITHREADING
-
-#endif
-
-#endif
-
+void bao_l3_thread_decorator_single
+     (
+       l3aoint_ft func,
+       opid_t     family,
+       //pack_t     schema_a,
+       //pack_t     schema_b,
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     d,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm
+     );
diff --git a/build/bli_config.h.in b/build/bli_config.h.in
index fa6bbbe12..5208a90f8 100644
--- a/build/bli_config.h.in
+++ b/build/bli_config.h.in
@@ -53,10 +53,16 @@
 
 #if @enable_openmp@
 #define BLIS_ENABLE_OPENMP
+#if @enable_openmp_as_def@
+#define BLIS_ENABLE_OPENMP_AS_DEFAULT
+#endif
 #endif
 
 #if @enable_pthreads@
 #define BLIS_ENABLE_PTHREADS
+#if @enable_pthreads_as_def@
+#define BLIS_ENABLE_PTHREADS_AS_DEFAULT
+#endif
 #endif
 
 #if @enable_jrir_slab@
diff --git a/build/config.mk.in b/build/config.mk.in
index 7ef8c6bd0..849a7ccfa 100644
--- a/build/config.mk.in
+++ b/build/config.mk.in
@@ -127,7 +127,7 @@ DEBUG_TYPE        := @debug_type@
 # Whether operating system support was requested via --enable-system.
 ENABLE_SYSTEM     := @enable_system@
 
-# The requested threading model.
+# The requested threading model(s).
 THREADING_MODEL   := @threading_model@
 
 # Whether the compiler supports "#pragma omp simd" via the -fopenmp-simd option.
diff --git a/common.mk b/common.mk
index b49089419..00b9f8ad3 100644
--- a/common.mk
+++ b/common.mk
@@ -802,44 +802,46 @@ $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CPPROCFLAGS,$(c)))
 # since that option forces --enable-threading=none, and thus -pthread never gets
 # added to begin with.
 
+CTHREADFLAGS :=
+
 ifeq ($(CC_VENDOR),gcc)
-ifeq ($(THREADING_MODEL),auto)
-THREADING_MODEL := openmp
-endif
-ifeq ($(THREADING_MODEL),openmp)
-CTHREADFLAGS := -fopenmp
+#ifneq ($(findstring auto,$(THREADING_MODEL)),)
+#THREADING_MODEL := openmp
+#endif
+ifneq ($(findstring openmp,$(THREADING_MODEL)),)
+CTHREADFLAGS += -fopenmp
 LDFLAGS      += -fopenmp
 endif
-ifeq ($(THREADING_MODEL),pthreads)
-CTHREADFLAGS := -pthread
+ifneq ($(findstring pthreads,$(THREADING_MODEL)),)
+CTHREADFLAGS += -pthread
 LDFLAGS      += $(LIBPTHREAD)
 endif
 endif
 
 ifeq ($(CC_VENDOR),icc)
-ifeq ($(THREADING_MODEL),auto)
-THREADING_MODEL := openmp
-endif
-ifeq ($(THREADING_MODEL),openmp)
-CTHREADFLAGS := -fopenmp
+#ifneq ($(findstring auto,$(THREADING_MODEL)),)
+#THREADING_MODEL := openmp
+#endif
+ifneq ($(findstring openmp,$(THREADING_MODEL)),)
+CTHREADFLAGS += -fopenmp
 LDFLAGS      += -fopenmp
 endif
-ifeq ($(THREADING_MODEL),pthreads)
-CTHREADFLAGS := -pthread
+ifneq ($(findstring pthreads,$(THREADING_MODEL)),)
+CTHREADFLAGS += -pthread
 LDFLAGS      += $(LIBPTHREAD)
 endif
 endif
 
 ifeq ($(CC_VENDOR),clang)
-ifeq ($(THREADING_MODEL),auto)
-THREADING_MODEL := pthreads
-endif
-ifeq ($(THREADING_MODEL),openmp)
-CTHREADFLAGS := -fopenmp
+#ifneq ($(findstring auto,$(THREADING_MODEL)),)
+#THREADING_MODEL := pthreads
+#endif
+ifneq ($(findstring openmp,$(THREADING_MODEL)),)
+CTHREADFLAGS += -fopenmp
 LDFLAGS      += -fopenmp
 endif
-ifeq ($(THREADING_MODEL),pthreads)
-CTHREADFLAGS := -pthread
+ifneq ($(findstring pthreads,$(THREADING_MODEL)),)
+CTHREADFLAGS += -pthread
 LDFLAGS      += $(LIBPTHREAD)
 endif
 endif
diff --git a/configure b/configure
index a6018edab..858ce55de 100755
--- a/configure
+++ b/configure
@@ -169,10 +169,23 @@ print_usage()
 	echo " "
 	echo "   -t MODEL, --enable-threading[=MODEL], --disable-threading"
 	echo " "
-	echo "                 Enable threading in the library, using threading model"
-	echo "                 MODEL={openmp,pthreads,no}. If MODEL=no or "
-	echo "                 --disable-threading is specified, threading will be"
-	echo "                 disabled. The default is 'no'."
+	echo "                 Enable threading in the library, using threading model(s)"
+	echo "                 MODEL={single,openmp,pthreads,auto}. If multiple values"
+	echo "                 are specified within MODEL, they will all be compiled into"
+	echo "                 BLIS, and the choice of which to use will be determined at"
+	echo "                 runtime. If the user does not express a preference (by"
+	echo "                 setting the BLIS_THREAD_IMPL environment variable to"
+	echo "                 'single', 'openmp', or 'pthreads'; by calling the global"
+	echo "                 runtime API bli_thread_set_thread_impl(); or by encoding a"
+	echo "                 choice on a per-call basis within a rntm_t passed into the"
+	echo "                 expert API), then the first model listed in MODEL will be"
+	echo "                 used by default. Note that 'single' is silently appended"
+	echo "                 to whatever the user specifies in MODEL, meaning that"
+	echo "                 single-threaded functionality will always be available,"
+	echo "                 even if it is not requested and even if it is not enabled"
+	echo "                 by default. Even --disable-threading is actually shorthand"
+	echo "                 for --enable-threading=single (which is the default when"
+	echo "                 the option is not specified)."
 	echo " "
 	echo "   --enable-system, --disable-system"
 	echo " "
@@ -2606,7 +2619,7 @@ main()
 							threading_model=${OPTARG#*=}
 							;;
 						disable-threading)
-							threading_model='off'
+							threading_model='single'
 							;;
 						thread-part-jrir=*)
 							thread_part_jrir=${OPTARG#*=}
@@ -3420,36 +3433,182 @@ main()
 	fi
 
 	# Check the threading model flag and standardize its value, if needed.
-	# NOTE: 'omp' is deprecated but still supported; 'openmp' is preferred.
+	# Note that single-threaded mode will always be enabled, but not necessarily
+	# by default.
+	enable_single='yes'
 	enable_openmp='no'
-	enable_openmp_01=0
 	enable_pthreads='no'
+	enable_single_01=1
+	enable_openmp_01=0
 	enable_pthreads_01=0
-	if [ "x${threading_model}" = "xauto" ]; then
+	parsed_tm=''
+	first_tm=''
+	enable_single_as_def_01=0
+	enable_openmp_as_def_01=0
+	enable_pthreads_as_def_01=0
+
+	# Convert whatever reasonable separator the user may have used into a space.
+	threading_model_list=$(echo "${threading_model}" | sed -e "s/[,+]/ /g")
+
+	# Search for all recognized values and standardize them to one of four
+	# strings: 'single', 'openmp', 'pthreads', 'auto'. Notice that we keep
+	# the strings in the same order as they originally appeared.
+	for word in ${threading_model_list}; do
+
+		if [ "x${word}" = "xsingle" ] ||
+		   [ "x${word}" = "xnone"   ] ||
+		   [ "x${word}" = "xoff"    ] ||
+		   [ "x${word}" = "xno"     ]; then
+
+			parsed_tm="${parsed_tm} single"
+
+		elif [ "x${word}" = "xopenmp" ] ||
+			 [ "x${word}" = "xomp"    ]; then
+
+			parsed_tm="${parsed_tm} openmp"
+
+		elif [ "x${word}" = "xpthreads" ] ||
+			 [ "x${word}" = "xpthread"  ] ||
+			 [ "x${word}" = "xposix"    ]; then
+
+			parsed_tm="${parsed_tm} pthreads"
+
+		elif [ "x${word}" = "xauto" ]; then
+
+			parsed_tm="${parsed_tm} auto"
+
+		else
+
+			echo "${script_name}: *** Unsupported threading model: ${word}."
+			exit 1
+		fi
+	done
+
+	# Always enable single-threaded behavior. If the user explicitly
+	# requested 'single' as well as other modes, the first occurrence will
+	# be kept when duplicates are removed, which will preserve the order
+	# for purposes of determining which mode will be the default (absent
+	# any explicit choice at runtime).
+	parsed_tm="${parsed_tm} single"
+
+	# Remove duplicates, if they exist.
+	parsed_tm=$(rm_duplicate_words_simple "${parsed_tm}")
+
+	#echo "parsed_tm0: _${parsed_tm}_"
+
+	# If parsed_tm contains 'auto', substitute in the automatic choice
+	# based on which compiler family is being used.
+	if [ "$(is_in_list "auto" "${parsed_tm}")" = "true" ]; then
+
+		# If 'auto' was found in the threading model string, we replace it
+		# with a concrete model ('openmp' or 'pthreads') selected below
+		# according to the compiler vendor. Any other models that were
+		# listed alongside 'auto' remain in parsed_tm.
 		echo "${script_name}: determining the threading model automatically."
-	elif [ "x${threading_model}" = "xopenmp" ] ||
-	     [ "x${threading_model}" = "xomp" ]; then
-		echo "${script_name}: using OpenMP for threading."
-		enable_openmp='yes'
-		enable_openmp_01=1
-		threading_model="openmp" # Standardize the value.
-	elif [ "x${threading_model}" = "xpthreads" ] ||
-	     [ "x${threading_model}" = "xpthread" ] ||
-	     [ "x${threading_model}" = "xposix" ]; then
-		echo "${script_name}: using POSIX threads for threading."
-		enable_pthreads='yes'
-		enable_pthreads_01=1
-		threading_model="pthreads" # Standardize the value.
-	elif [ "x${threading_model}" = "xoff" ] ||
-	     [ "x${threading_model}" = "xno" ] ||
-	     [ "x${threading_model}" = "xnone" ]; then
-		echo "${script_name}: threading is disabled."
-		threading_model="off"
-	else
-		echo "${script_name}: *** Unsupported threading model: ${threading_model}."
-		exit 1
+
+		# Use OpenMP for gcc and icc, but pthreads for clang.
+		if   [ "${cc_vendor}" = "gcc" ]; then
+
+			selected_tm="openmp"
+			echo "${script_name}:   automatically selected OpenMP."
+
+		elif [ "${cc_vendor}" = "icc" ]; then
+
+			selected_tm="openmp"
+			echo "${script_name}:   automatically selected OpenMP."
+
+		elif [ "${cc_vendor}" = "clang" ]; then
+
+			selected_tm="pthreads"
+			echo "${script_name}:   automatically selected pthreads."
+		fi
+
+		# Substitute the selected threading model for 'auto' in parsed_tm.
+		parsed_tm=$(substitute_words "auto" "${selected_tm}" "${parsed_tm}")
 	fi
 
+	#echo "parsed_tm1: _${parsed_tm}_"
+
+	# Remove any extra whitespace.
+	parsed_tm=$(canonicalize_ws "${parsed_tm}")
+
+	#echo "parsed_tm2: _${parsed_tm}_"
+
+	# Find the first word. This will be the default threading model.
+	first_tm=${parsed_tm%% *}
+
+	#echo "first_tm0:  _${first_tm}_"
+
+	# Now that we've standardized the list, removed duplicates, and handled
+	# the possibility of 'auto' being among the listed threading models, we can
+	# proceed to formally processing each threading model to enable. Since
+	# 'auto' has been converted to 'openmp' or 'pthreads', we only need to
+	# handle the remaining three options (openmp, pthreads, and single) going
+	# forward.
+	for word in ${parsed_tm}; do
+
+		if [ "x${word}" = "xsingle" ]; then
+
+			echo "${script_name}: enabling support for single-threading."
+			enable_single='yes'
+			enable_single_01=1
+
+		elif [ "x${word}" = "xopenmp" ]; then
+
+			echo "${script_name}: enabling support for threading via OpenMP."
+			enable_openmp='yes'
+			enable_openmp_01=1
+
+		elif [ "x${word}" = "xpthreads" ]; then
+
+			echo "${script_name}: enabling support for threading via pthreads."
+			enable_pthreads='yes'
+			enable_pthreads_01=1
+		fi
+	done
+
+	# Define boolean variables that can easily be interpreted with #ifdef
+	# directives.
+	if [ "x${first_tm}" = "xsingle" ]; then
+
+		enable_single_as_def_01=1
+		enable_openmp_as_def_01=0
+		enable_pthreads_as_def_01=0
+
+	elif [ "x${first_tm}" = "xopenmp" ]; then
+
+		enable_single_as_def_01=0
+		enable_openmp_as_def_01=1
+		enable_pthreads_as_def_01=0
+
+	elif [ "x${first_tm}" = "xpthreads" ]; then
+
+		enable_single_as_def_01=0
+		enable_openmp_as_def_01=0
+		enable_pthreads_as_def_01=1
+	fi
+
+	# If either OpenMP or pthreads was enabled, given that single-threaded mode is
+	# also always enabled, remind the user which one will serve as the default
+	# (that is, absent any explicit choice at runtime).
+	if [ "x${enable_openmp}"   = "xyes" ] ||
+	   [ "x${enable_pthreads}" = "xyes" ]; then
+
+		if   [ "x${first_tm}"   = "xsingle" ]; then
+			echo "${script_name}: threading will default to single-threaded."
+		elif [ "x${first_tm}"   = "xopenmp" ]; then
+			echo "${script_name}: threading will default to OpenMP."
+		elif [ "x${first_tm}"   = "xpthreads" ]; then
+			echo "${script_name}: threading will default to pthreads."
+		fi
+	fi
+
+	# Copy the final parsed threading model list back to the original variable.
+	threading_model="${parsed_tm}"
+
+	#echo "parsed_tm: _${parsed_tm}_"
+	#echo "first_tm:  _${first_tm}_"
+
 	# Check the method of assigning micropanels to threads in the JR and IR
 	# loops.
 	enable_jrir_slab_01=0
@@ -3461,7 +3620,7 @@ main()
 		echo "${script_name}: requesting round-robin threading in jr and ir loops."
 		enable_jrir_rr_01=1
 	else
-		echo "${script_name}: *** Unsupported method of thread partitioning in jr and ir loops: ${threading_model}."
+		echo "${script_name}: *** Unsupported method of thread partitioning in jr and ir loops: ${thread_part_jrir}."
 		exit 1
 	fi
 
@@ -3891,7 +4050,9 @@ main()
 		| perl -pe "s/\@kernel_list_defines\@/${kernel_list_defines}/g" \
 		| sed   -e "s/@enable_system@/${enable_system_01}/g" \
 		| sed   -e "s/@enable_openmp@/${enable_openmp_01}/g" \
+		| sed   -e "s/@enable_openmp_as_def@/${enable_openmp_as_def_01}/g" \
 		| sed   -e "s/@enable_pthreads@/${enable_pthreads_01}/g" \
+		| sed   -e "s/@enable_pthreads_as_def@/${enable_pthreads_as_def_01}/g" \
 		| sed   -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \
 		| sed   -e "s/@enable_jrir_rr@/${enable_jrir_rr_01}/g" \
 		| sed   -e "s/@enable_pba_pools@/${enable_pba_pools_01}/g" \
diff --git a/docs/Multithreading.md b/docs/Multithreading.md
index 8e636f06a..933296f79 100644
--- a/docs/Multithreading.md
+++ b/docs/Multithreading.md
@@ -1,5 +1,5 @@
 # Contents
-
+
 * **[Contents](Multithreading.md#contents)**
 * **[Introduction](Multithreading.md#introduction)**
 * **[Enabling multithreading](Multithreading.md#enabling-multithreading)**
@@ -9,13 +9,16 @@
   * [Globally via environment variables](Multithreading.md#globally-via-environment-variables)
     * [The automatic way](Multithreading.md#environment-variables-the-automatic-way)
     * [The manual way](Multithreading.md#environment-variables-the-manual-way)
+    * [Overriding the default threading implementation](Multithreading.md#environment-variables-overriding-the-default-threading-implementation)
   * [Globally at runtime](Multithreading.md#globally-at-runtime)
     * [The automatic way](Multithreading.md#globally-at-runtime-the-automatic-way)
     * [The manual way](Multithreading.md#globally-at-runtime-the-manual-way)
+    * [Overriding the default threading implementation](Multithreading.md#globally-at-runtime-overriding-the-default-threading-implementation)
   * [Locally at runtime](Multithreading.md#locally-at-runtime)
     * [Initializing a rntm_t](Multithreading.md#initializing-a-rntm-t)
     * [The automatic way](Multithreading.md#locally-at-runtime-the-automatic-way)
     * [The manual way](Multithreading.md#locally-at-runtime-the-manual-way)
+    * [Overriding the default threading implementation](Multithreading.md#locally-at-runtime-overriding-the-default-threading-implementation)
     * [Using the expert interface](Multithreading.md#locally-at-runtime-using-the-expert-interface)
 * **[Known issues](Multithreading.md#known-issues)**
 * **[Conclusion](Multithreading.md#conclusion)**
@@ -35,13 +38,13 @@ To summarize: In order to observe multithreaded parallelism within a BLIS operat
 
 BLIS disables multithreading by default. In order to allow multithreaded parallelism from BLIS, you must first enable multithreading explicitly at configure-time.
 
-As of this writing, BLIS optionally supports multithreading via either OpenMP or POSIX threads.
+As of this writing, BLIS optionally supports multithreading via OpenMP or POSIX threads (or both).
 
 To enable multithreading via OpenMP, you must provide the `--enable-threading` option to the `configure` script:
 ```
 $ ./configure --enable-threading=openmp auto
 ```
-In this example, we target the `auto` configuration, which is like asking `configure` to choose the most appropriate configuration based on some detection heuristic (e.g. `cpuid` on x86_64). Similarly, to enable multithreading via POSIX threads (pthreads), specify the threading model as `pthreads` instead of `openmp`:
+In this example, we target the `auto` configuration, which is like asking `configure` to choose the most appropriate configuration based on some detection heuristic (e.g. `cpuid` on x86_64 hardware). Similarly, to enable multithreading via POSIX threads (pthreads), specify the threading model as `pthreads` instead of `openmp`:
 ```
 $ ./configure --enable-threading=pthreads auto
 ```
@@ -50,7 +53,12 @@ You can also use the shorthand option for `--enable-threading`, which is `-t`:
 $ ./configure -t openmp auto
 $ ./configure -t pthreads auto
 ```
-For more complete and up-to-date information on the `--enable-threading` option, simply run `configure` with the `--help` (or `-h`) option:
+You may even combine multiple threading implementations into the same library build. We call this "fat threading." When more than one option is given, the first option acts as the default. Note that no matter what arguments you specify for the `-t` option, the single-threaded implementation will always be available.
+```
+$ ./configure -t openmp,pthreads auto
+```
+In the above example, OpenMP will serve as the default threading implementation since it is listed first. This default can be overridden at runtime, though, which is discussed later on.
+For more complete and up-to-date information on the `--enable-threading` option, run `configure` with the `--help` (or `-h`) option:
 ```
 $ ./configure --help
 ```
@@ -129,11 +137,15 @@ Regardless of whether you end up using the automatic or manual way of expressing
 
 The automatic way of specifying parallelism entails setting the total number of threads you wish BLIS to employ in its parallelization. This total number of threads is captured by the `BLIS_NUM_THREADS` environment variable. You can set this variable prior to executing your BLIS-linked executable:
 ```
-$ export GOMP_CPU_AFFINITY="..."  # optional step when using GNU libgomp.
+$ export GOMP_CPU_AFFINITY="0-15"  # optional step when using GNU libgomp.
 $ export BLIS_NUM_THREADS=16
 $ ./my_blis_program
 ```
-This causes BLIS to automatically determine a reasonable threading strategy based on what is known about the operation and problem size. If `BLIS_NUM_THREADS` is not set, BLIS will attempt to query the value of `OMP_NUM_THREADS`. If neither variable is set, the default number of threads is 1.
+If you don't want or need your environment variable assignments to persist after `my_blis_program` completes, you can instead set the variables only for the duration of the program as follows:
+```
+$ GOMP_CPU_AFFINITY="0-15" BLIS_NUM_THREADS=16 ./my_blis_program
+```
+Either of these approaches causes BLIS to automatically determine a reasonable threading strategy based on what is known about the operation and problem size. If `BLIS_NUM_THREADS` is not set, BLIS will attempt to query the value of `BLIS_NT` (a shorthand alternative to `BLIS_NUM_THREADS`). If neither variable is defined, then BLIS will attempt to read `OMP_NUM_THREADS`. If none of these variables is set, the default number of threads is 1.
 
 **Note**: We *highly* discourage use of the `OMP_NUM_THREADS` environment variable to specify multithreading within BLIS and may remove support for it in the future. If you wish to set parallelism globally via environment variables, please use `BLIS_NUM_THREADS`.
 
@@ -166,6 +178,23 @@ Next, which combinations of loops to parallelize depends on which caches are sha
 
 ![The primary algorithm for level-3 operations in BLIS](http://www.cs.utexas.edu/users/field/mm_algorithm_color.png)
 
+### Environment variables: overriding the default threading implementation
+
+Just as you may specify the number of threads for BLIS to use by setting environment variables prior to running your BLIS-linked application, you may also specify your preferred threading implementation. Suppose that you configured BLIS as follows:
+```
+$ ./configure -t openmp,pthreads auto
+```
+This will result in both OpenMP and pthreads implementations being compiled and included within the BLIS library, with OpenMP serving as the default (since it was listed first to the `-t` option). You can link your program against this BLIS library and force the use of pthreads (instead of OpenMP) via environment variables as follows:
+```
+$ BLIS_THREAD_IMPL=pthreads BLIS_NUM_THREADS=8 ./my_blis_program
+```
+You can even disable multithreading altogether by forcing the use of the single-threaded code path:
+```
+$ BLIS_THREAD_IMPL=single ./my_blis_program
+```
+Note that if `BLIS_THREAD_IMPL` is assigned to `single`, any other threading-related variables that may be set, such as `BLIS_NUM_THREADS` or any of the `BLIS_*_NT` variables, are ignored.
+If `BLIS_THREAD_IMPL` is not set, BLIS will attempt to query its shorthand alternative, `BLIS_TI`. If neither value is set, the configure-time default (in the example shown above, OpenMP) will prevail.
+
 ## Globally at runtime
 
 If you still wish to set the parallelization scheme globally, but you want to do so at runtime, BLIS provides a thread-safe API for specifying multithreading. Think of these functions as a way to modify the same internal data structure into which the environment variables are read. (Recall that the environment variables are only read once, when BLIS is initialized).
@@ -207,6 +236,26 @@ bli_thread_set_ways( 2, 1, 4, 1, 1 );
 we are requesting 2 ways of parallelism in the `JC` loop and 4 ways of parallelism in the `IC` loop.
 Unlike environment variables, which only allow the user to set the parallelization strategy prior to running the executable, `bli_thread_set_ways()` may be called any time during the normal course of the BLIS-linked application's execution.
 
+### Globally at runtime: overriding the default threading implementation
+
+Let's assume that you configured BLIS as follows:
+```
+$ ./configure -t openmp,pthreads auto
+```
+This will result in both OpenMP and pthreads implementations being compiled and included within the BLIS library, with OpenMP serving as the default (since it was listed first to the `-t` option). You can link your program against this BLIS library and force the use of pthreads (instead of OpenMP) globally at runtime via the following API:
+```c
+void bli_thread_set_thread_impl( timpl_t ti );
+```
+The function takes a `timpl_t`, which is an enumerated type that has three valid values corresponding to the three possible threading implementations: `BLIS_OPENMP`, `BLIS_POSIX`, and `BLIS_SINGLE`. Forcing use of pthreads is as simple as calling:
+```c
+bli_thread_set_thread_impl( BLIS_POSIX );
+```
+You can even disable multithreading altogether by forcing the use of the single-threaded code path:
+```c
+bli_thread_set_thread_impl( BLIS_SINGLE );
+```
+Note that if `BLIS_SINGLE` is specified, any other threading-related parameters previously set, such as via `bli_thread_set_num_threads()` or `bli_thread_set_ways()`, are ignored.
+
 ## Locally at runtime
 
 In addition to the global methods based on environment variables and runtime function calls, BLIS also offers a local, *per-call* method of requesting parallelism at runtime. This method has the benefit of being thread-safe and flexible; your application can spawn two threads at the application level, with each thread requesting different degrees of parallelism from their respective calls to level-3 BLIS operations.
@@ -262,6 +311,26 @@ bli_rntm_set_ways( 1, 1, 2, 3, 1, &rntm );
 ```
 we are requesting two ways of parallelism in the `IC` loop and three ways of parallelism in the `JR` loop.
 
+### Locally at runtime: overriding the default threading implementation
+
+Let's assume that you configured BLIS as follows:
+```
+$ ./configure -t openmp,pthreads auto
+```
+This will result in both OpenMP and pthreads implementations being compiled and included within the BLIS library, with OpenMP serving as the default (since it was listed first to the `-t` option). You can link your program against this BLIS library and force the use of pthreads (instead of OpenMP) at runtime, on a per-call basis, by encoding your choice within your `rntm_t`:
+```c
+void bli_rntm_set_thread_impl( timpl_t ti, rntm_t* rntm );
+```
+The function takes a `timpl_t`, which is an enumerated type that has three valid values corresponding to the three possible threading implementations: `BLIS_OPENMP`, `BLIS_POSIX`, and `BLIS_SINGLE`. Forcing use of pthreads is as simple as calling:
+```c
+bli_rntm_set_thread_impl( BLIS_POSIX, &rntm );
+```
+You can even disable multithreading altogether by forcing the use of the single-threaded code path:
+```c
+bli_rntm_set_thread_impl( BLIS_SINGLE, &rntm );
+```
+Note that if `BLIS_SINGLE` is specified, any other threading-related parameters previously set within the `rntm_t`, such as via `bli_rntm_set_num_threads()` or `bli_rntm_set_ways()`, are ignored.
+
 ### Locally at runtime: using the expert interfaces
 
 Regardless of whether you specified parallelism into your `rntm_t` object via the automatic or manual method, eventually you must use the data structure when calling a BLIS operation in order for it to have any effect.
diff --git a/frame/1m/packm/bli_packm_alloc.c b/frame/1m/packm/bli_packm_alloc.c
index 22ed31ecc..07f54de78 100644
--- a/frame/1m/packm/bli_packm_alloc.c
+++ b/frame/1m/packm/bli_packm_alloc.c
@@ -103,7 +103,7 @@ void* bli_packm_alloc_ex
 
 		// Broadcast the address of the chief thread's local mem_t entry to
 		// all threads.
-		local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
+		local_mem_p = bli_thread_broadcast( rntm, thread, &local_mem_s );
 
 		// Save the chief thread's local mem_t entry to the mem_t field in
 		// this thread's control tree node.
@@ -111,7 +111,7 @@ void* bli_packm_alloc_ex
 
 		// Barrier so that the master thread doesn't return from the function
 		// before we are done reading.
-		bli_thread_barrier( thread );
+		bli_thread_barrier( rntm, thread );
 	}
 
 	return bli_mem_buffer( cntl_mem_p );
diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c
index f76607508..ae788e671 100644
--- a/frame/1m/packm/bli_packm_int.c
+++ b/frame/1m/packm/bli_packm_int.c
@@ -51,7 +51,7 @@ void bli_packm_int
 
 	// Barrier so that we know threads are done with previous computation
 	// with the same packing buffer before starting to pack.
-	bli_thread_barrier( thread );
+	bli_thread_barrier( rntm, thread );
 
 	// Invoke the variant with kappa_use.
 	f
@@ -65,6 +65,6 @@ void bli_packm_int
 	);
 
 	// Barrier so that packing is done before computation.
-	bli_thread_barrier( thread );
+	bli_thread_barrier( rntm, thread );
 }
 
diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c
index f6b09d8ae..3b542b061 100644
--- a/frame/1m/unpackm/bli_unpackm_int.c
+++ b/frame/1m/unpackm/bli_unpackm_int.c
@@ -39,6 +39,7 @@ void bli_unpackm_int
        const obj_t*  p,
        const obj_t*  a,
        const cntx_t* cntx,
+       const rntm_t* rntm,
        const cntl_t* cntl,
        const thrinfo_t* thread
      )
@@ -73,6 +74,6 @@ void bli_unpackm_int
 	}
 
 	// Barrier so that unpacking is done before computation.
-	bli_thread_barrier( thread );
+	bli_thread_barrier( rntm, thread );
 }
 
diff --git a/frame/1m/unpackm/bli_unpackm_int.h b/frame/1m/unpackm/bli_unpackm_int.h
index 8258ea367..fc2c3e66d 100644
--- a/frame/1m/unpackm/bli_unpackm_int.h
+++ b/frame/1m/unpackm/bli_unpackm_int.h
@@ -37,6 +37,7 @@ void bli_unpackm_int
        const obj_t*  p,
        const obj_t*  a,
        const cntx_t* cntx,
+       const rntm_t* rntm,
        const cntl_t* cntl,
        const thrinfo_t* thread
      );
diff --git a/frame/3/bli_l3_int.c b/frame/3/bli_l3_int.c
index b786236ab..b9d389839 100644
--- a/frame/3/bli_l3_int.c
+++ b/frame/3/bli_l3_int.c
@@ -70,7 +70,7 @@ void bli_l3_int
 	{
 		if ( bli_thread_am_ochief( thread ) )
 			bli_scalm( beta, c );
-		bli_thread_barrier( thread );
+		bli_thread_barrier( rntm, thread );
 		return;
 	}
 
@@ -84,7 +84,7 @@ void bli_l3_int
 
 		if ( bli_thread_am_ochief( thread ) )
 			bli_scalm( beta, c );
-		bli_thread_barrier( thread );
+		bli_thread_barrier( rntm, thread );
 		return;
 	}
 
diff --git a/frame/3/bli_l3_sup_packm.c b/frame/3/bli_l3_sup_packm.c
index b7a7ee02b..5ed7700dc 100644
--- a/frame/3/bli_l3_sup_packm.c
+++ b/frame/3/bli_l3_sup_packm.c
@@ -64,7 +64,7 @@ void bli_packm_sup_init_mem
 
 		// Barrier to make sure all threads are caught up and ready to begin
 		// the packm stage.
-		bli_thread_barrier( thread );
+		bli_thread_barrier( rntm, thread );
 
 		// Compute the size of the memory block eneded.
 		siz_t size_needed = bli_dt_size( dt ) * m_pack * k_pack;
@@ -94,7 +94,7 @@ void bli_packm_sup_init_mem
 
 			// Broadcast the address of the chief thread's passed-in mem_t
 			// to all threads.
-			mem_t* mem_p = bli_thread_broadcast( thread, mem );
+			mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem );
 
 			// Non-chief threads: Copy the contents of the chief thread's
 			// passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -143,7 +143,7 @@ void bli_packm_sup_init_mem
 
 				// Broadcast the address of the chief thread's passed-in mem_t
 				// to all threads.
-				mem_t* mem_p = bli_thread_broadcast( thread, mem );
+				mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem );
 
 				// Non-chief threads: Copy the contents of the chief thread's
 				// passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -422,7 +422,7 @@ void bli_packm_sup
 		}
 
 		// Barrier so that packing is done before computation.
-		bli_thread_barrier( thread );
+		bli_thread_barrier( rntm, thread );
 	}
 }
 
diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c
index 357251002..71357cec4 100644
--- a/frame/3/bli_l3_sup_packm_var.c
+++ b/frame/3/bli_l3_sup_packm_var.c
@@ -244,7 +244,7 @@ if ( col_stored ) { \
 	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
 	fflush( stdout ); \
 	} \
-bli_thread_barrier( thread ); \
+bli_thread_barrier( rntm, thread ); \
 	if ( bli_thread_work_id( thread ) == 1 ) \
 	{ \
 	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
@@ -255,7 +255,7 @@ bli_thread_barrier( thread ); \
 	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
 	fflush( stdout ); \
 	} \
-bli_thread_barrier( thread ); \
+bli_thread_barrier( rntm, thread ); \
 } \
 else { \
 	if ( bli_thread_work_id( thread ) == 0 ) \
@@ -268,7 +268,7 @@ else { \
 	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
 	fflush( stdout ); \
 	} \
-bli_thread_barrier( thread ); \
+bli_thread_barrier( rntm, thread ); \
 	if ( bli_thread_work_id( thread ) == 1 ) \
 	{ \
 	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
@@ -279,7 +279,7 @@ bli_thread_barrier( thread ); \
 	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
 	fflush( stdout ); \
 	} \
-bli_thread_barrier( thread ); \
+bli_thread_barrier( rntm, thread ); \
 } \
 */
 /*
diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c
index 76f1a96b7..e4858621a 100644
--- a/frame/3/bli_l3_sup_var1n2m.c
+++ b/frame/3/bli_l3_sup_var1n2m.c
@@ -232,7 +232,7 @@ void bli_gemmsup_ref_var1n
 
 	mem_t mem_a = BLIS_MEM_INITIALIZER;
 	mem_t mem_b = BLIS_MEM_INITIALIZER;
-\
+
 	// Define an array of bszid_t ids, which will act as our substitute for
 	// the cntl_t tree.
 	// NOTE: These bszid_t values, and their order, match that of the bp
@@ -246,22 +246,22 @@ void bli_gemmsup_ref_var1n
 	// packed in the 3rd loop.
 	//                    5thloop  4thloop         packa  3rdloop         packb  2ndloop  1stloop  ukrloop
 	bszid_t bszids[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR };
-\
+
 	// Determine whether we are using more than one thread.
 	const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 );
-\
+
 	thrinfo_t* thread_jc = NULL;
 	thrinfo_t* thread_pc = NULL;
 	thrinfo_t* thread_pa = NULL;
 	thrinfo_t* thread_ic = NULL;
 	thrinfo_t* thread_pb = NULL;
 	thrinfo_t* thread_jr = NULL;
-\
+
 	// Pre-grow the thrinfo_t tree.
 	bszid_t* bszids_jc = bszids;
 	         thread_jc = thread;
 	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc );
-\
+
 	bszid_t* bszids_pc = &bszids_jc[1];
 	         thread_pc = bli_thrinfo_sub_node( thread_jc );
 	bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc );
@@ -472,7 +472,7 @@ void bli_gemmsup_ref_var1n
 
 			// NOTE: This barrier is only needed if we are packing A (since
 			// that matrix is packed within the pc loop of this variant).
-			if ( packa ) bli_thread_barrier( thread_pa );
+			if ( packa ) bli_thread_barrier( rntm, thread_pa );
 		}
 	}
 
@@ -909,7 +909,7 @@ void bli_gemmsup_ref_var2m
 
 			// NOTE: This barrier is only needed if we are packing B (since
 			// that matrix is packed within the pc loop of this variant).
-			if ( packb ) bli_thread_barrier( thread_pb );
+			if ( packb ) bli_thread_barrier( rntm, thread_pb );
 		}
 	}
 
diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c
index 1bbec1d95..cb20b7f36 100644
--- a/frame/3/gemm/bli_gemm_blk_var3.c
+++ b/frame/3/gemm/bli_gemm_blk_var3.c
@@ -88,7 +88,7 @@ void bli_gemm_blk_var3
 		  bli_thrinfo_sub_node( thread )
 		);
 
-		bli_thread_barrier( bli_thrinfo_sub_node( thread ) );
+		bli_thread_barrier( rntm, bli_thrinfo_sub_node( thread ) );
 
 		// This variant executes multiple rank-k updates. Therefore, if the
 		// internal beta scalar on matrix C is non-zero, we must use it
diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c
index 79ac65c48..413b12818 100644
--- a/frame/3/trsm/bli_trsm_blk_var1.c
+++ b/frame/3/trsm/bli_trsm_blk_var1.c
@@ -118,7 +118,7 @@ void bli_trsm_blk_var1
 	// We must execute a barrier here because the upcoming rank-k update
 	// requires the packed matrix B to be fully updated by the trsm
 	// subproblem.
-	bli_thread_barrier( thread );
+	bli_thread_barrier( rntm, thread );
 
 	// Isolate the remaining part of the column panel matrix A, which we do by
 	// acquiring the subpartition ahead of A11 (that is, A21 or A01, depending
diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c
index 2ff3db6f1..229259a95 100644
--- a/frame/3/trsm/bli_trsm_blk_var3.c
+++ b/frame/3/trsm/bli_trsm_blk_var3.c
@@ -89,7 +89,7 @@ void bli_trsm_blk_var3
 		);
 
 		//bli_thread_ibarrier( thread );
-		bli_thread_barrier( bli_thrinfo_sub_node( thread ) );
+		bli_thread_barrier( rntm, bli_thrinfo_sub_node( thread ) );
 
 		// This variant executes multiple rank-k updates. Therefore, if the
 		// internal alpha scalars on A/B and C are non-zero, we must ensure
diff --git a/frame/base/bli_env.c b/frame/base/bli_env.c
index 0972f1771..fab6af89e 100644
--- a/frame/base/bli_env.c
+++ b/frame/base/bli_env.c
@@ -89,6 +89,14 @@ gint_t bli_env_get_var( const char* env, gint_t fallback )
 	return r_val;
 }
 
+char* bli_env_get_str( const char* env )
+{
+	// Return getenv()'s result unmodified: the value string, or NULL if unset.
+	return getenv( env );
+}
+
+
+
 #if 0
 #ifdef _MSC_VER
 #define strerror_r(errno,buf,len) strerror_s(buf,len,errno)
diff --git a/frame/base/bli_env.h b/frame/base/bli_env.h
index de86fadff..207fbf9d4 100644
--- a/frame/base/bli_env.h
+++ b/frame/base/bli_env.h
@@ -38,6 +38,7 @@
 #define BLIS_ENV_H
 
 gint_t bli_env_get_var( const char* env, gint_t fallback );
+char*  bli_env_get_str( const char* env );
 //void  bli_env_set_var( const char* env, dim_t value );
 
 #endif
diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c
index 72b54ca20..9d6e181d3 100644
--- a/frame/base/bli_info.c
+++ b/frame/base/bli_info.c
@@ -123,6 +123,22 @@ gint_t bli_info_get_enable_pthreads( void )
 	return 0;
 #endif
 }
+gint_t bli_info_get_enable_openmp_as_default( void )
+{
+#ifdef BLIS_ENABLE_OPENMP_AS_DEFAULT
+	return 1;
+#else
+	return 0;
+#endif
+}
+gint_t bli_info_get_enable_pthreads_as_default( void )
+{
+#ifdef BLIS_ENABLE_PTHREADS_AS_DEFAULT
+	return 1;
+#else
+	return 0;
+#endif
+}
 gint_t bli_info_get_thread_part_jrir_slab( void )
 {
 #ifdef BLIS_ENABLE_JRIR_SLAB
diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h
index 250504c23..b3514f434 100644
--- a/frame/base/bli_info.h
+++ b/frame/base/bli_info.h
@@ -70,6 +70,8 @@ BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void );
+BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp_as_default( void );
+BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads_as_default( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void );
diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c
index 1411ffaa3..895976679 100644
--- a/frame/base/bli_rntm.c
+++ b/frame/base/bli_rntm.c
@@ -285,6 +285,11 @@ void bli_rntm_set_ways_from_rntm
 	bli_rntm_set_auto_factor_only( auto_factor, rntm );
 	bli_rntm_set_num_threads_only( nt, rntm );
 	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
+
+	// NOTE: The caller should have already set the timpl_t field of the rntm_t,
+	// either in the course of it being initialized via BLIS_RNTM_INITIALIZER
+	// or bli_rntm_init(), or by the user (subsequently) setting the value
+	// directly via bli_rntm_set_thread_impl().
 }
 
 void bli_rntm_set_ways_from_rntm_sup
@@ -418,16 +423,19 @@ void bli_rntm_print
        const rntm_t* rntm
      )
 {
-	dim_t af = bli_rntm_auto_factor( rntm );
+	timpl_t ti = bli_rntm_thread_impl( rntm );
+
+	dim_t   af = bli_rntm_auto_factor( rntm );
 
-	dim_t nt = bli_rntm_num_threads( rntm );
+	dim_t   nt = bli_rntm_num_threads( rntm );
 
-	dim_t jc = bli_rntm_jc_ways( rntm );
-	dim_t pc = bli_rntm_pc_ways( rntm );
-	dim_t ic = bli_rntm_ic_ways( rntm );
-	dim_t jr = bli_rntm_jr_ways( rntm );
-	dim_t ir = bli_rntm_ir_ways( rntm );
+	dim_t   jc = bli_rntm_jc_ways( rntm );
+	dim_t   pc = bli_rntm_pc_ways( rntm );
+	dim_t   ic = bli_rntm_ic_ways( rntm );
+	dim_t   jr = bli_rntm_jr_ways( rntm );
+	dim_t   ir = bli_rntm_ir_ways( rntm );
 
+	printf( "thread impl: %d\n", (int)ti );
 	printf( "rntm contents    nt  jc  pc  ic  jr  ir\n" );
 	printf( "autofac? %1d | %4d%4d%4d%4d%4d%4d\n", (int)af,
 	                                               (int)nt, (int)jc, (int)pc,
diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h
index 8b6538484..426b74d60 100644
--- a/frame/base/bli_rntm.h
+++ b/frame/base/bli_rntm.h
@@ -43,6 +43,8 @@
 /*
 typedef struct rntm_s
 {
+	timpl_t   thread_impl;
+
 	bool      auto_factor;
 
 	dim_t     num_threads;
@@ -61,6 +63,11 @@ typedef struct rntm_s
 // -- rntm_t query (public API) ------------------------------------------------
 //
 
+BLIS_INLINE timpl_t bli_rntm_thread_impl( const rntm_t* rntm )
+{
+	return rntm->thread_impl;
+}
+
 BLIS_INLINE bool bli_rntm_auto_factor( const rntm_t* rntm )
 {
 	return rntm->auto_factor;
@@ -133,6 +140,11 @@ BLIS_INLINE pba_t* bli_rntm_pba( const rntm_t* rntm )
 // -- rntm_t modification (internal use only) ----------------------------------
 //
 
+BLIS_INLINE void bli_rntm_set_thread_impl_only( timpl_t thread_impl, rntm_t* rntm )
+{
+	rntm->thread_impl = thread_impl;
+}
+
 BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm )
 {
 	rntm->auto_factor = auto_factor;
@@ -215,6 +227,12 @@ BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm )
 // -- rntm_t modification (public API) -----------------------------------------
 //
 
+BLIS_INLINE void bli_rntm_set_thread_impl( timpl_t thread_impl, rntm_t* rntm )
+{
+	// Set the threading implementation to use.
+	bli_rntm_set_thread_impl_only( thread_impl, rntm );
+}
+
 BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm )
 {
 	// Record the total number of threads to use.
@@ -292,6 +310,7 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
 
 #define BLIS_RNTM_INITIALIZER \
         { \
+          .thread_impl = BLIS_SINGLE, \
           .auto_factor = TRUE, \
           .num_threads = 1, \
           .thrloop     = { 1, 1, 1, 1, 1, 1 }, \
@@ -304,6 +323,8 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
 
 BLIS_INLINE void bli_rntm_init( rntm_t* rntm )
 {
+	bli_rntm_set_thread_impl_only( BLIS_SINGLE, rntm );
+
 	bli_rntm_set_auto_factor_only( TRUE, rntm );
 
 	bli_rntm_clear_num_threads_only( rntm );
@@ -316,7 +337,9 @@ BLIS_INLINE void bli_rntm_init( rntm_t* rntm )
 	bli_rntm_clear_pba( rntm );
 }
 
+//
 // -- rntm_t total thread calculation ------------------------------------------
+//
 
 BLIS_INLINE dim_t bli_rntm_calc_num_threads
      (
@@ -334,9 +357,9 @@ BLIS_INLINE dim_t bli_rntm_calc_num_threads
 	return n_threads;
 }
 
-// -----------------------------------------------------------------------------
-
-// Function prototypes
+//
+// -- Function prototypes ------------------------------------------------------
+//
 
 BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm );
 
diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h
index 0c75fb639..542973b18 100644
--- a/frame/include/bli_config_macro_defs.h
+++ b/frame/include/bli_config_macro_defs.h
@@ -83,17 +83,10 @@
   // Default behavior is disabled.
 #endif
 
-// Perform a sanity check to make sure the user doesn't try to enable
-// both OpenMP and pthreads.
-#if defined ( BLIS_ENABLE_OPENMP ) && \
-    defined ( BLIS_ENABLE_PTHREADS )
-  #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined."
-#endif
-
 // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP
 // or pthreads are enabled. This macro is useful in situations when
-// we want to detect use of either OpenMP or pthreads (as opposed
-// to neither being used).
+// we want to detect use of either OpenMP or pthreads, or both (as
+// opposed to neither being used).
 #if defined ( BLIS_ENABLE_OPENMP ) || \
     defined ( BLIS_ENABLE_PTHREADS )
   #define BLIS_ENABLE_MULTITHREADING
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index b5c3ec255..d37e62f8a 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -622,6 +622,20 @@ typedef enum
 #define bli_nat  BLIS_NAT
 
 
+// -- Threading implementation type --
+
+typedef enum
+{
+	BLIS_SINGLE = 0,
+	BLIS_OPENMP,
+	BLIS_POSIX,
+
+	// BLIS_NUM_THREAD_IMPLS must be last!
+	BLIS_NUM_THREAD_IMPLS
+
+} timpl_t;
+
+
 // -- Kernel ID types --
 
 typedef enum
@@ -1434,6 +1448,8 @@ typedef struct cntx_s
 typedef struct rntm_s
 {
 	// "External" fields: these may be queried by the end-user.
+	timpl_t   thread_impl;
+
 	bool      auto_factor;
 
 	dim_t     num_threads;
diff --git a/frame/thread/bli_l3_decor.c b/frame/thread/bli_l3_decor.c
new file mode 100644
index 000000000..33fb834be
--- /dev/null
+++ b/frame/thread/bli_l3_decor.c
@@ -0,0 +1,176 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+// Function pointer array of the threading-specific level-3 thread decorators,
+// indexed by timpl_t. An entry is NULL when its implementation is not enabled.
+
+static l3_decor_ft l3_decor_fpa[ BLIS_NUM_THREAD_IMPLS ] =
+{
+	[BLIS_SINGLE] = bli_l3_thread_decorator_single,
+	[BLIS_OPENMP] =
+#if   defined(BLIS_ENABLE_OPENMP)
+	                bli_l3_thread_decorator_openmp,
+#elif defined(BLIS_ENABLE_PTHREADS)
+	                NULL,
+#else
+	                NULL,
+#endif
+	[BLIS_POSIX]  =
+#if   defined(BLIS_ENABLE_PTHREADS)
+	                bli_l3_thread_decorator_pthreads,
+#elif defined(BLIS_ENABLE_OPENMP)
+	                NULL,
+#else
+	                NULL,
+#endif
+};
+
+// Define a dispatcher that chooses a threading-specific function from the
+// above function pointer array.
+
+void bli_l3_thread_decorator
+     (
+             l3int_ft func,
+             opid_t   family,
+       const obj_t*   alpha,
+       const obj_t*   a,
+       const obj_t*   b,
+       const obj_t*   beta,
+       const obj_t*   c,
+       const cntx_t*  cntx,
+             rntm_t*  rntm,
+             cntl_t*  cntl
+     )
+{
+	rntm_t rntm_l;
+
+	// Query the threading implementation and the number of threads requested.
+	timpl_t ti = bli_rntm_thread_impl( rntm );
+	dim_t   nt = bli_rntm_num_threads( rntm );
+
+#if 0
+	printf( "(pre-opt) application requested rntm.thread_impl = %s\n",
+	        ( ti == BLIS_SINGLE ? "single" :
+	        ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
+#endif
+
+	if ( bli_error_checking_is_enabled() )
+		bli_l3_thread_decorator_check( rntm );
+
+#ifdef BLIS_ENABLE_NT1_VIA_SINGLE
+	if ( nt == 1 )
+	{
+		// An optimization. If the caller requests only one thread, force
+		// the sequential level-3 thread decorator even if that means
+		// overriding the caller's preferred threading implementation (as
+		// communicated via the rntm_t).
+		rntm_l = *rntm;
+		ti = BLIS_SINGLE;
+		bli_rntm_set_thread_impl( BLIS_SINGLE, &rntm_l );
+		rntm = &rntm_l;
+	}
+#endif
+
+	if ( 1 < nt && ti == BLIS_SINGLE )
+	{
+		// Here, we resolve conflicting information. The caller requested
+		// a sequential threading implementation, but also requested more
+		// than one thread. Here, we choose to favor the requested threading
+		// implementation over the number of threads, and so reset all
+		// parallelism parameters to 1.
+		rntm_l = *rntm;
+		nt = 1;
+		bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l );
+		bli_rntm_set_num_threads_only( 1, &rntm_l );
+		rntm = &rntm_l;
+	}
+
+#if 0
+	printf( "(post-opt) moving forward with rntm.thread_impl  = %s\n",
+	        ( ti == BLIS_SINGLE ? "single" :
+	        ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
+#endif
+
+	// Look up the decorator for ti. Entries for implementations that were not
+	// compiled in are NULL; only the check above guards against calling one.
+	const l3_decor_ft fp = l3_decor_fpa[ ti ];
+
+	// Call the threading-specific decorator function.
+	fp
+	(
+	  func,
+	  family,
+	  alpha,
+	  a,
+	  b,
+	  beta,
+	  c,
+	  cntx,
+	  rntm,
+	  cntl
+	);
+}
+
+void bli_l3_thread_decorator_check
+     (
+       rntm_t* rntm
+     )
+{
+	//err_t e_val;
+
+	//e_val = bli_check_valid_thread_impl( bli_rntm_thread_impl( rntm ) );
+	//bli_check_error_code( e_val );
+
+	const timpl_t ti = bli_rntm_thread_impl( rntm );
+
+	if (
+#ifndef BLIS_ENABLE_OPENMP
+	    ti == BLIS_OPENMP ||
+#endif
+#ifndef BLIS_ENABLE_PTHREADS
+	    ti == BLIS_POSIX ||
+#endif
+	    FALSE
+	   )
+	{
+		fprintf( stderr, "\n" );
+		fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) );
+		fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) );
+		fprintf( stderr, "libblis: %s: line %d\n", __FILE__, ( int )__LINE__ );
+		bli_abort();
+	}
+}
+
diff --git a/frame/thread/bli_l3_decor.h b/frame/thread/bli_l3_decor.h
index e2208aae6..087eda874 100644
--- a/frame/thread/bli_l3_decor.h
+++ b/frame/thread/bli_l3_decor.h
@@ -36,10 +36,8 @@
 #ifndef BLIS_L3_DECOR_H
 #define BLIS_L3_DECOR_H
 
-// -- conventional definitions -------------------------------------------------
-
 // Level-3 internal function type.
-typedef void (*l3int_t)
+typedef void (*l3int_ft)
      (
        const obj_t*     alpha,
        const obj_t*     a,
@@ -52,19 +50,39 @@ typedef void (*l3int_t)
              thrinfo_t* thread
      );
 
+// Level-3 thread decorator function type.
+typedef void (*l3_decor_ft)
+     (
+             l3int_ft func,
+             opid_t   family,
+       const obj_t*   alpha,
+       const obj_t*   a,
+       const obj_t*   b,
+       const obj_t*   beta,
+       const obj_t*   c,
+       const cntx_t*  cntx,
+             rntm_t*  rntm,
+             cntl_t*  cntl
+     );
+
 // Level-3 thread decorator prototype.
 void bli_l3_thread_decorator
      (
-             l3int_t func,
-             opid_t  family,
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             l3int_ft func,
+             opid_t   family,
+       const obj_t*   alpha,
+       const obj_t*   a,
+       const obj_t*   b,
+       const obj_t*   beta,
+       const obj_t*   c,
+       const cntx_t*  cntx,
+             rntm_t*  rntm,
+             cntl_t*  cntl
+     );
+
+void bli_l3_thread_decorator_check
+     (
+       rntm_t* rntm
      );
 
 // Include definitions specific to the method of multithreading for the
diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c
index 2c71c7532..890c174cf 100644
--- a/frame/thread/bli_l3_decor_openmp.c
+++ b/frame/thread/bli_l3_decor_openmp.c
@@ -37,30 +37,33 @@
 
 #ifdef BLIS_ENABLE_OPENMP
 
-// Define a dummy function bli_l3_thread_entry(), which is needed in the
-// pthreads version, so that when building Windows DLLs (with OpenMP enabled
-// or no multithreading) we don't risk having an unresolved symbol.
-void* bli_l3_thread_entry( void* data_void ) { return NULL; }
-
 //#define PRINT_THRINFO
+//#define PRINT_IMPL
 
-void bli_l3_thread_decorator
+void bli_l3_thread_decorator_openmp
      (
-             l3int_t func,
-             opid_t  family,
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             l3int_ft func,
+             opid_t   family,
+       const obj_t*   alpha,
+       const obj_t*   a,
+       const obj_t*   b,
+       const obj_t*   beta,
+       const obj_t*   c,
+       const cntx_t*  cntx,
+             rntm_t*  rntm,
+             cntl_t*  cntl
      )
 {
 	// Query the total number of threads from the rntm_t object.
 	const dim_t n_threads = bli_rntm_num_threads( rntm );
 
+#ifdef PRINT_IMPL
+	const timpl_t ti = bli_rntm_thread_impl( rntm );
+	printf( "l3_decor_openmp: l3 decor with rntm.thread_impl  = %s\n",
+	        ( ti == BLIS_SINGLE ? "single" :
+	        ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
+#endif
+
 	#ifdef PRINT_THRINFO
 	err_t r_val;
 	thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ), &r_val );
@@ -233,8 +236,10 @@ void bli_l3_thread_decorator_thread_check
 				bli_abort();
 			}
 
+			const timpl_t ti = bli_rntm_thread_impl( rntm );
+
 			//n_threads = 1; // not needed since it has no effect?
-			bli_thrcomm_init( 1, gl_comm );
+			bli_thrcomm_init( ti, 1, gl_comm );
 			bli_rntm_set_num_threads_only( 1, rntm );
 			bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );
 		//}
diff --git a/frame/thread/bli_l3_decor_openmp.h b/frame/thread/bli_l3_decor_openmp.h
index 6ff7f16a9..95e1582e5 100644
--- a/frame/thread/bli_l3_decor_openmp.h
+++ b/frame/thread/bli_l3_decor_openmp.h
@@ -39,6 +39,20 @@
 // Definitions specific to situations when OpenMP multithreading is enabled.
 #ifdef BLIS_ENABLE_OPENMP
 
+void bli_l3_thread_decorator_openmp
+     (
+             l3int_ft func,
+             opid_t   family,
+       const obj_t*   alpha,
+       const obj_t*   a,
+       const obj_t*   b,
+       const obj_t*   beta,
+       const obj_t*   c,
+       const cntx_t*  cntx,
+             rntm_t*  rntm,
+             cntl_t*  cntl
+     );
+
 void bli_l3_thread_decorator_thread_check
      (
        dim_t      n_threads,
diff --git a/frame/thread/bli_l3_decor_pthreads.c b/frame/thread/bli_l3_decor_pthreads.c
index 80247dfb1..d31414d3b 100644
--- a/frame/thread/bli_l3_decor_pthreads.c
+++ b/frame/thread/bli_l3_decor_pthreads.c
@@ -40,7 +40,7 @@
 // A data structure to assist in passing operands to additional threads.
 typedef struct thread_data
 {
-	      l3int_t    func;
+	      l3int_ft   func;
 	      opid_t     family;
 	const obj_t*     alpha;
 	const obj_t*     a;
@@ -60,7 +60,7 @@ void* bli_l3_thread_entry( void* data_void )
 {
 	const thread_data_t* data     = data_void;
 
-	const l3int_t        func     = data->func;
+	const l3int_ft       func     = data->func;
 	const opid_t         family   = data->family;
 	const obj_t*         alpha    = data->alpha;
 	const obj_t*         a        = data->a;
@@ -139,25 +139,34 @@ void* bli_l3_thread_entry( void* data_void )
 	return NULL;
 }
 
-void bli_l3_thread_decorator
+//#define PRINT_IMPL
+
+void bli_l3_thread_decorator_pthreads
      (
-             l3int_t func,
-             opid_t  family,
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             l3int_ft func,
+             opid_t   family,
+       const obj_t*   alpha,
+       const obj_t*   a,
+       const obj_t*   b,
+       const obj_t*   beta,
+       const obj_t*   c,
+       const cntx_t*  cntx,
+             rntm_t*  rntm,
+             cntl_t*  cntl
      )
 {
 	err_t r_val;
 
-	// Query the total number of threads from the context.
+	// Query the total number of threads from the rntm_t object.
 	const dim_t n_threads = bli_rntm_num_threads( rntm );
 
+#ifdef PRINT_IMPL
+	const timpl_t ti = bli_rntm_thread_impl( rntm );
+	printf( "l3_decor_pthrea: l3 decor with rntm.thread_impl  = %s\n",
+	        ( ti == BLIS_SINGLE ? "single" :
+	        ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
+#endif
+
 	// NOTE: The sba was initialized in bli_init().
 
 	// Check out an array_t from the small block allocator. This is done
@@ -244,5 +253,12 @@ void bli_l3_thread_decorator
 	bli_free_intl( datas );
 }
 
+#else
+
+// Define a dummy function bli_l3_thread_entry(), which is needed for
+// consistent dynamic linking behavior when building shared objects in Linux
+// or OSX, or Windows DLLs; otherwise, we risk having an unresolved symbol.
+void* bli_l3_thread_entry( void* data_void ) { return NULL; }
+
 #endif
 
diff --git a/frame/thread/bli_l3_decor_pthreads.h b/frame/thread/bli_l3_decor_pthreads.h
index 772e05ca7..edf36cf6e 100644
--- a/frame/thread/bli_l3_decor_pthreads.h
+++ b/frame/thread/bli_l3_decor_pthreads.h
@@ -41,6 +41,20 @@
 // Thread entry point prototype.
 void* bli_l3_thread_entry( void* data_void );
 
+void bli_l3_thread_decorator_pthreads
+     (
+             l3int_ft func,
+             opid_t   family,
+       const obj_t*   alpha,
+       const obj_t*   a,
+       const obj_t*   b,
+       const obj_t*   beta,
+       const obj_t*   c,
+       const cntx_t*  cntx,
+             rntm_t*  rntm,
+             cntl_t*  cntl
+     );
+
 #endif
 
 #endif
diff --git a/frame/thread/bli_l3_decor_single.c b/frame/thread/bli_l3_decor_single.c
index c2c43b370..6f0f8603b 100644
--- a/frame/thread/bli_l3_decor_single.c
+++ b/frame/thread/bli_l3_decor_single.c
@@ -35,22 +35,32 @@
 
 #include "blis.h"
 
-#ifndef BLIS_ENABLE_MULTITHREADING
+//#define PRINT_IMPL
 
-void bli_l3_thread_decorator
+void bli_l3_thread_decorator_single
      (
-             l3int_t func,
-             opid_t  family,
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             l3int_ft func,
+             opid_t   family,
+       const obj_t*   alpha,
+       const obj_t*   a,
+       const obj_t*   b,
+       const obj_t*   beta,
+       const obj_t*   c,
+       const cntx_t*  cntx,
+             rntm_t*  rntm,
+             cntl_t*  cntl
      )
 {
+	// For sequential execution, we use only one thread.
+	const dim_t n_threads = 1;
+
+#ifdef PRINT_IMPL
+	const timpl_t ti = bli_rntm_thread_impl( rntm );
+	printf( "l3_decor_single: l3 decor with rntm.thread_impl  = %s\n",
+	        ( ti == BLIS_SINGLE ? "single" :
+	        ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
+#endif
+
 	obj_t a_t, b_t;
 	bli_obj_alias_to( a, &a_t );
 	bli_obj_alias_to( b, &b_t );
@@ -66,9 +76,6 @@ void bli_l3_thread_decorator
 	bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t );
 	bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t );
 
-	// For sequential execution, we use only one thread.
-	const dim_t n_threads = 1;
-
 	// NOTE: The sba was initialized in bli_init().
 
 	// Check out an array_t from the small block allocator. This is done
@@ -86,6 +93,12 @@ void bli_l3_thread_decorator
 
 	// Allcoate a global communicator for the root thrinfo_t structures.
 	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
+#if 0
+	timpl_t ti2 = bli_rntm_thread_impl( rntm );
+	printf( "l3_decor_single: created thrcomm_t.ti            = %s\n",
+	        ( ti2 == BLIS_SINGLE ? "single" :
+	        ( ti2 == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
+#endif
 
 
 	{
@@ -150,5 +163,3 @@ void bli_l3_thread_decorator
 	bli_sba_checkin_array( array );
 }
 
-#endif
-
diff --git a/frame/thread/bli_l3_decor_single.h b/frame/thread/bli_l3_decor_single.h
index 481763a90..c118ad7be 100644
--- a/frame/thread/bli_l3_decor_single.h
+++ b/frame/thread/bli_l3_decor_single.h
@@ -35,10 +35,19 @@
 #ifndef BLIS_L3_DECOR_SINGLE_H
 #define BLIS_L3_DECOR_SINGLE_H
 
-// Definitions specific to situations when multithreading is disabled.
-#ifndef BLIS_ENABLE_MULTITHREADING
-
-#endif
+void bli_l3_thread_decorator_single
+     (
+             l3int_ft func,
+             opid_t   family,
+       const obj_t*   alpha,
+       const obj_t*   a,
+       const obj_t*   b,
+       const obj_t*   beta,
+       const obj_t*   c,
+       const cntx_t*  cntx,
+             rntm_t*  rntm,
+             cntl_t*  cntl
+     );
 
 #endif
 
diff --git a/frame/thread/bli_l3_sup_decor.c b/frame/thread/bli_l3_sup_decor.c
new file mode 100644
index 000000000..53c7b41be
--- /dev/null
+++ b/frame/thread/bli_l3_sup_decor.c
@@ -0,0 +1,137 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+// Function pointer array of the threading-specific level-3 sup decorators,
+// indexed by timpl_t. An entry is NULL when its implementation is not enabled.
+
+static l3_sup_decor_ft l3_sup_decor_fpa[ BLIS_NUM_THREAD_IMPLS ] =
+{
+	[BLIS_SINGLE] = bli_l3_sup_thread_decorator_single,
+	[BLIS_OPENMP] =
+#if   defined(BLIS_ENABLE_OPENMP)
+	                bli_l3_sup_thread_decorator_openmp,
+#elif defined(BLIS_ENABLE_PTHREADS)
+	                NULL,
+#else
+	                NULL,
+#endif
+	[BLIS_POSIX]  =
+#if   defined(BLIS_ENABLE_PTHREADS)
+	                bli_l3_sup_thread_decorator_pthreads,
+#elif defined(BLIS_ENABLE_OPENMP)
+	                NULL,
+#else
+	                NULL,
+#endif
+};
+
+// Define a dispatcher that chooses a threading-specific function from the
+// above function pointer array.
+
+err_t bli_l3_sup_thread_decorator
+     (
+             l3supint_ft func,
+             opid_t      family,
+       const obj_t*      alpha,
+       const obj_t*      a,
+       const obj_t*      b,
+       const obj_t*      beta,
+       const obj_t*      c,
+       const cntx_t*     cntx,
+             rntm_t*     rntm
+     )
+{
+	rntm_t rntm_l;
+
+	// Query the threading implementation and the number of threads requested.
+	timpl_t ti = bli_rntm_thread_impl( rntm );
+	dim_t   nt = bli_rntm_num_threads( rntm );
+
+#ifdef BLIS_ENABLE_NT1_VIA_SINGLE
+	if ( nt == 1 )
+	{
+		// An optimization. If the caller requests only one thread, force
+		// the sequential level-3 thread decorator even if that means
+		// overriding the caller's preferred threading implementation (as
+		// communicated via the rntm_t).
+		rntm_l = *rntm;
+		ti = BLIS_SINGLE;
+		bli_rntm_set_thread_impl( BLIS_SINGLE, &rntm_l );
+		rntm = &rntm_l;
+	}
+#endif
+
+	if ( 1 < nt && ti == BLIS_SINGLE )
+	{
+		// Here, we resolve conflicting information. The caller requested
+		// a sequential threading implementation, but also requested more
+		// than one thread. Here, we choose to favor the requested threading
+		// implementation over the number of threads, and so reset all
+		// parallelism parameters to 1.
+		rntm_l = *rntm;
+		nt = 1;
+		bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l );
+		bli_rntm_set_num_threads_only( 1, &rntm_l );
+		rntm = &rntm_l;
+	}
+
+	// Look up the decorator for ti. Entries for implementations that were not
+	// compiled in are NULL, and nothing in this function checks for that.
+	const l3_sup_decor_ft fp = l3_sup_decor_fpa[ ti ];
+
+	// Call the threading-specific decorator function.
+	return fp
+	(
+	  func,
+	  family,
+	  alpha,
+	  a,
+	  b,
+	  beta,
+	  c,
+	  cntx,
+	  rntm
+	);
+}
+
+void bli_l3_sup_thread_decorator_check
+     (
+       rntm_t* rntm
+     )
+{
+	// The sup path reuses the conventional level-3 decorator's rntm_t check.
+	bli_l3_thread_decorator_check( rntm );
+}
diff --git a/frame/thread/bli_l3_sup_decor.h b/frame/thread/bli_l3_sup_decor.h
index 6e0401151..a271920b4 100644
--- a/frame/thread/bli_l3_sup_decor.h
+++ b/frame/thread/bli_l3_sup_decor.h
@@ -39,7 +39,7 @@
 // -- sup definitions ----------------------------------------------------------
 
 // Level-3 sup internal function type.
-typedef err_t (*l3supint_t)
+typedef err_t (*l3supint_ft)
      (
        const obj_t*     alpha,
        const obj_t*     a,
@@ -51,18 +51,37 @@ typedef err_t (*l3supint_t)
              thrinfo_t* thread
      );
 
+// Level-3 sup thread decorator function type.
+typedef err_t (*l3_sup_decor_ft)
+     (
+             l3supint_ft func,
+             opid_t      family,
+       const obj_t*      alpha,
+       const obj_t*      a,
+       const obj_t*      b,
+       const obj_t*      beta,
+       const obj_t*      c,
+       const cntx_t*     cntx,
+             rntm_t*     rntm
+     );
+
 // Level-3 sup thread decorator prototype.
 err_t bli_l3_sup_thread_decorator
      (
-             l3supint_t func,
-             opid_t     family,
-       const obj_t*     alpha,
-       const obj_t*     a,
-       const obj_t*     b,
-       const obj_t*     beta,
-       const obj_t*     c,
-       const cntx_t*    cntx,
-             rntm_t*    rntm
+             l3supint_ft func,
+             opid_t      family,
+       const obj_t*      alpha,
+       const obj_t*      a,
+       const obj_t*      b,
+       const obj_t*      beta,
+       const obj_t*      c,
+       const cntx_t*     cntx,
+             rntm_t*     rntm
+     );
+
+void bli_l3_sup_thread_decorator_check
+     (
+       rntm_t* rntm
      );
 
 // Include definitions specific to the method of multithreading for the
diff --git a/frame/thread/bli_l3_sup_decor_openmp.c b/frame/thread/bli_l3_sup_decor_openmp.c
index ff6bc667d..7d06ad622 100644
--- a/frame/thread/bli_l3_sup_decor_openmp.c
+++ b/frame/thread/bli_l3_sup_decor_openmp.c
@@ -37,24 +37,19 @@
 
 #ifdef BLIS_ENABLE_OPENMP
 
-// Define a dummy function bli_l3_sup_thread_entry(), which is needed in the
-// pthreads version, so that when building Windows DLLs (with OpenMP enabled
-// or no multithreading) we don't risk having an unresolved symbol.
-void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; }
-
 //#define PRINT_THRINFO
 
-err_t bli_l3_sup_thread_decorator
+err_t bli_l3_sup_thread_decorator_openmp
      (
-             l3supint_t func,
-             opid_t     family,
-       const obj_t*     alpha,
-       const obj_t*     a,
-       const obj_t*     b,
-       const obj_t*     beta,
-       const obj_t*     c,
-       const cntx_t*    cntx,
-             rntm_t*    rntm
+             l3supint_ft func,
+             opid_t      family,
+       const obj_t*      alpha,
+       const obj_t*      a,
+       const obj_t*      b,
+       const obj_t*      beta,
+       const obj_t*      c,
+       const cntx_t*     cntx,
+             rntm_t*     rntm
      )
 {
 	// Query the total number of threads from the rntm_t object.
diff --git a/frame/thread/bli_l3_sup_decor_openmp.h b/frame/thread/bli_l3_sup_decor_openmp.h
index 1d1097a82..4c5059d00 100644
--- a/frame/thread/bli_l3_sup_decor_openmp.h
+++ b/frame/thread/bli_l3_sup_decor_openmp.h
@@ -38,6 +38,19 @@
 // Definitions specific to situations when OpenMP multithreading is enabled.
 #ifdef BLIS_ENABLE_OPENMP
 
+err_t bli_l3_sup_thread_decorator_openmp
+     (
+             l3supint_ft func,
+             opid_t      family,
+       const obj_t*      alpha,
+       const obj_t*      a,
+       const obj_t*      b,
+       const obj_t*      beta,
+       const obj_t*      c,
+       const cntx_t*     cntx,
+             rntm_t*     rntm
+     );
+
 #endif
 
 #endif
diff --git a/frame/thread/bli_l3_sup_decor_pthreads.c b/frame/thread/bli_l3_sup_decor_pthreads.c
index 375a85730..7be5cf8fb 100644
--- a/frame/thread/bli_l3_sup_decor_pthreads.c
+++ b/frame/thread/bli_l3_sup_decor_pthreads.c
@@ -40,18 +40,18 @@
 // A data structure to assist in passing operands to additional threads.
 typedef struct thread_data
 {
-	      l3supint_t func;
-	      opid_t     family;
-	const obj_t*     alpha;
-	const obj_t*     a;
-	const obj_t*     b;
-	const obj_t*     beta;
-	const obj_t*     c;
-	const cntx_t*    cntx;
-	      rntm_t*    rntm;
-	      dim_t      tid;
-	      thrcomm_t* gl_comm;
-	      array_t*   array;
+	      l3supint_ft func;
+	      opid_t      family;
+	const obj_t*      alpha;
+	const obj_t*      a;
+	const obj_t*      b;
+	const obj_t*      beta;
+	const obj_t*      c;
+	const cntx_t*     cntx;
+	      rntm_t*     rntm;
+	      dim_t       tid;
+	      thrcomm_t*  gl_comm;
+	      array_t*    array;
 } thread_data_t;
 
 // Entry point for additional threads
@@ -59,7 +59,7 @@ void* bli_l3_sup_thread_entry( void* data_void )
 {
 	thread_data_t* data     = data_void;
 
-	      l3supint_t     func     = data->func;
+	      l3supint_ft    func     = data->func;
 	      opid_t         family   = data->family;
 	const obj_t*         alpha    = data->alpha;
 	const obj_t*         a        = data->a;
@@ -109,17 +109,17 @@ void* bli_l3_sup_thread_entry( void* data_void )
 	return NULL;
 }
 
-err_t bli_l3_sup_thread_decorator
+err_t bli_l3_sup_thread_decorator_pthreads
      (
-             l3supint_t func,
-             opid_t     family,
-       const obj_t*     alpha,
-       const obj_t*     a,
-       const obj_t*     b,
-       const obj_t*     beta,
-       const obj_t*     c,
-       const cntx_t*    cntx,
-             rntm_t*    rntm
+             l3supint_ft func,
+             opid_t      family,
+       const obj_t*      alpha,
+       const obj_t*      a,
+       const obj_t*      b,
+       const obj_t*      beta,
+       const obj_t*      c,
+       const cntx_t*     cntx,
+             rntm_t*     rntm
      )
 {
 	err_t r_val;
@@ -214,5 +214,12 @@ err_t bli_l3_sup_thread_decorator
 	return BLIS_SUCCESS;
 }
 
+#else
+
+// Define a dummy function bli_l3_sup_thread_entry(), which is needed for
+// consistent dynamic linking behavior when building shared objects in Linux
+// or OSX, or Windows DLLs; otherwise, we risk having an unresolved symbol.
+void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; }
+
 #endif
 
diff --git a/frame/thread/bli_l3_sup_decor_pthreads.h b/frame/thread/bli_l3_sup_decor_pthreads.h
index 1362b4035..310ea4e8b 100644
--- a/frame/thread/bli_l3_sup_decor_pthreads.h
+++ b/frame/thread/bli_l3_sup_decor_pthreads.h
@@ -41,6 +41,19 @@
 // Thread entry point prototype.
 void* bli_l3_sup_thread_entry( void* data_void );
 
+err_t bli_l3_sup_thread_decorator_pthreads
+     (
+             l3supint_ft func,
+             opid_t      family,
+       const obj_t*      alpha,
+       const obj_t*      a,
+       const obj_t*      b,
+       const obj_t*      beta,
+       const obj_t*      c,
+       const cntx_t*     cntx,
+             rntm_t*     rntm
+     );
+
 #endif
 
 #endif
diff --git a/frame/thread/bli_l3_sup_decor_single.c b/frame/thread/bli_l3_sup_decor_single.c
index df767ad29..a419154e7 100644
--- a/frame/thread/bli_l3_sup_decor_single.c
+++ b/frame/thread/bli_l3_sup_decor_single.c
@@ -35,21 +35,19 @@
 
 #include "blis.h"
 
-#ifndef BLIS_ENABLE_MULTITHREADING
-
 #define SKIP_THRINFO_TREE
 
-err_t bli_l3_sup_thread_decorator
+err_t bli_l3_sup_thread_decorator_single
      (
-             l3supint_t func,
-             opid_t     family,
-       const obj_t*     alpha,
-       const obj_t*     a,
-       const obj_t*     b,
-       const obj_t*     beta,
-       const obj_t*     c,
-       const cntx_t*    cntx,
-             rntm_t*    rntm
+             l3supint_ft func,
+             opid_t      family,
+       const obj_t*      alpha,
+       const obj_t*      a,
+       const obj_t*      b,
+       const obj_t*      beta,
+       const obj_t*      c,
+       const cntx_t*     cntx,
+             rntm_t*     rntm
      )
 {
 	// For sequential execution, we use only one thread.
@@ -138,5 +136,3 @@ err_t bli_l3_sup_thread_decorator
 	return BLIS_SUCCESS;
 }
 
-#endif
-
diff --git a/frame/thread/bli_l3_sup_decor_single.h b/frame/thread/bli_l3_sup_decor_single.h
index 418c3814c..8ca279baf 100644
--- a/frame/thread/bli_l3_sup_decor_single.h
+++ b/frame/thread/bli_l3_sup_decor_single.h
@@ -35,10 +35,18 @@
 #ifndef BLIS_L3_SUP_DECOR_SINGLE_H
 #define BLIS_L3_SUP_DECOR_SINGLE_H
 
-// Definitions specific to situations when multithreading is disabled.
-#ifndef BLIS_ENABLE_MULTITHREADING
-
-#endif
+err_t bli_l3_sup_thread_decorator_single
+     (
+             l3supint_ft func,
+             opid_t      family,
+       const obj_t*      alpha,
+       const obj_t*      a,
+       const obj_t*      b,
+       const obj_t*      beta,
+       const obj_t*      c,
+       const cntx_t*     cntx,
+             rntm_t*     rntm
+     );
 
 #endif
 
diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c
index ef46a7ad4..6cd4325df 100644
--- a/frame/thread/bli_thrcomm.c
+++ b/frame/thread/bli_thrcomm.c
@@ -35,8 +35,183 @@
 
 #include "blis.h"
 
+// -- Method-agnostic functions ------------------------------------------------
+
+thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads )
+{
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_thrcomm_create(): " );
+	#endif
+	// Acquire storage for a new thrcomm_t via bli_sba_acquire().
+	thrcomm_t* comm = bli_sba_acquire( rntm, sizeof(thrcomm_t) );
+	// Query the threading implementation recorded in the rntm_t.
+	const timpl_t ti = bli_rntm_thread_impl( rntm );
+	// Initialize the communicator for n_threads threads using that method.
+	bli_thrcomm_init( ti, n_threads, comm );
+	// Hand the initialized communicator back to the caller.
+	return comm;
+}
+
+void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm )
+{
+	if ( comm == NULL ) return; // Nothing to clean up or release.
+	// Query the threading implementation recorded in the rntm_t.
+	const timpl_t ti = bli_rntm_thread_impl( rntm );
+	// Run the implementation-specific cleanup before releasing the storage.
+	bli_thrcomm_cleanup( ti, comm );
+
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_thrcomm_free(): " );
+	#endif
+	// Release the thrcomm_t storage via bli_sba_release().
+	bli_sba_release( rntm, comm );
+}
+
+// -- Method-specific functions ------------------------------------------------
+
+// Initialize a function pointer array for each family of threading-specific
+// functions (init, cleanup, and barrier).
+
+static thrcomm_init_ft init_fpa[ BLIS_NUM_THREAD_IMPLS ] =
+{   // Indexed by timpl_t. A NULL entry makes bli_thrcomm_init() abort.
+	[BLIS_SINGLE] = bli_thrcomm_init_single,
+	[BLIS_OPENMP] =
+#if   defined(BLIS_ENABLE_OPENMP)
+	                bli_thrcomm_init_openmp,
+#elif defined(BLIS_ENABLE_PTHREADS)
+	                NULL, // OpenMP was not enabled at compile time.
+#else
+	                NULL, // Neither OpenMP nor pthreads was enabled.
+#endif
+	[BLIS_POSIX]  =
+#if   defined(BLIS_ENABLE_PTHREADS)
+	                bli_thrcomm_init_pthreads,
+#elif defined(BLIS_ENABLE_OPENMP)
+	                NULL, // pthreads was not enabled at compile time.
+#else
+	                NULL, // Neither OpenMP nor pthreads was enabled.
+#endif
+};
+static thrcomm_cleanup_ft cleanup_fpa[ BLIS_NUM_THREAD_IMPLS ] =
+{   // Indexed by timpl_t. A NULL entry makes bli_thrcomm_cleanup() abort.
+	[BLIS_SINGLE] = bli_thrcomm_cleanup_single,
+	[BLIS_OPENMP] =
+#if   defined(BLIS_ENABLE_OPENMP)
+	                bli_thrcomm_cleanup_openmp,
+#elif defined(BLIS_ENABLE_PTHREADS)
+	                NULL, // OpenMP was not enabled at compile time.
+#else
+	                NULL, // Neither OpenMP nor pthreads was enabled.
+#endif
+	[BLIS_POSIX]  =
+#if   defined(BLIS_ENABLE_PTHREADS)
+	                bli_thrcomm_cleanup_pthreads,
+#elif defined(BLIS_ENABLE_OPENMP)
+	                NULL, // pthreads was not enabled at compile time.
+#else
+	                NULL, // Neither OpenMP nor pthreads was enabled.
+#endif
+};
+static thrcomm_barrier_ft barrier_fpa[ BLIS_NUM_THREAD_IMPLS ] =
+{   // Indexed by timpl_t. No entry is NULL; disabled methods use a fallback.
+	[BLIS_SINGLE] = bli_thrcomm_barrier_single,
+	[BLIS_OPENMP] =
+#if   defined(BLIS_ENABLE_OPENMP)
+	                bli_thrcomm_barrier_openmp,
+#elif defined(BLIS_ENABLE_PTHREADS)
+	                bli_thrcomm_barrier_pthreads, // Fallback: OpenMP not enabled.
+#else
+	                bli_thrcomm_barrier_single, // Fallback: neither is enabled.
+#endif
+	[BLIS_POSIX]  =
+#if   defined(BLIS_ENABLE_PTHREADS)
+	                bli_thrcomm_barrier_pthreads,
+#elif defined(BLIS_ENABLE_OPENMP)
+	                bli_thrcomm_barrier_openmp, // Fallback: pthreads not enabled.
+#else
+	                bli_thrcomm_barrier_single, // Fallback: neither is enabled.
+#endif
+};
+
+// Define dispatchers that choose a threading-specific function from each
+// of the above function pointer arrays.
+
+void bli_thrcomm_init( timpl_t ti, dim_t nt, thrcomm_t* comm )
+{
+	const thrcomm_init_ft fp = init_fpa[ ti ];
+	// A NULL entry means no init function is available for ti in this build.
+	if ( fp == NULL ) bli_abort();
+	// NOTE(review): comm is dereferenced below with no NULL check -- confirm.
+	// Call the threading-specific init function.
+	fp( nt, comm );
+
+	// Embed the type of threading implementation within the thrcomm_t struct.
+	// This can be used later to make sure the application doesn't use a
+	// thrcomm_t initialized with threading type A with the API for threading
+	// type B. Note that we wait until after the init function has returned
+	// in case that function zeros out the entire struct before setting the
+	// fields.
+	comm->ti = ti;
+}
+
+void bli_thrcomm_cleanup( timpl_t ti, thrcomm_t* comm )
+{
+	// Look up the cleanup function for ti; NULL means none exists in this build.
+	const thrcomm_cleanup_ft fp = cleanup_fpa[ ti ];
+	if ( fp == NULL ) bli_abort();
+
+	// If comm is BLIS_SINGLE_COMM, we return early since there is no cleanup,
+	// especially if it is being used with a threading implementation that
+	// would normally want to free its thrcomm_t resources.
+	if ( comm == &BLIS_SINGLE_COMM ) return;
+
+	// Sanity check. Make sure the threading implementation we were asked to use
+	// is the same as the implementation that initialized the thrcomm_t object.
+	if ( ti != comm->ti )
+	{
+		// Report on stderr; buffered stdout may be lost when we abort.
+		fprintf( stderr, "bli_thrcomm_cleanup(): thrcomm_t.ti = %s, but request via rntm_t.ti = %s\n",
+		         ( comm->ti == BLIS_SINGLE ? "single" :
+		         ( comm->ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ),
+		         ( ti       == BLIS_SINGLE ? "single" :
+		         ( ti       == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
+		bli_abort();
+	}
+	// Call the threading-specific cleanup function.
+	fp( comm );
+}
+
+void bli_thrcomm_barrier( timpl_t ti, dim_t tid, thrcomm_t* comm )
+{
+	// Look up the barrier function for ti.
+	const thrcomm_barrier_ft fp = barrier_fpa[ ti ];
+	if ( fp == NULL ) bli_abort();
+
+	// Sanity check. Make sure the threading implementation we were asked to use
+	// is the same as the implementation that initialized the thrcomm_t object.
+	// We skip this check if comm is BLIS_SINGLE_COMM, since it is initialized
+	// with BLIS_SINGLE and so the requested ti will often differ from it. (We
+	// do not return early in that case; the barrier function is still called
+	// below.)
+	if ( ti != comm->ti && comm != &BLIS_SINGLE_COMM )
+	{
+		// Report on stderr; buffered stdout may be lost when we abort.
+		fprintf( stderr, "bli_thrcomm_barrier(): thrcomm_t.ti = %s, but request via rntm_t.ti = %s\n",
+		         ( comm->ti == BLIS_SINGLE ? "single" :
+		         ( comm->ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ),
+		         ( ti       == BLIS_SINGLE ? "single" :
+		         ( ti       == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
+		bli_abort();
+	}
+	// Call the threading-specific barrier function.
+	fp( tid, comm );
+}
+
+// -- Other functions ----------------------------------------------------------
+
 void* bli_thrcomm_bcast
      (
+       timpl_t    ti,
        dim_t      id,
        void*      to_send,
        thrcomm_t* comm
@@ -46,9 +221,9 @@ void* bli_thrcomm_bcast
 
 	if ( id == 0 ) comm->sent_object = to_send;
 
-	bli_thrcomm_barrier( id, comm );
+	bli_thrcomm_barrier( ti, id, comm );
 	void* object = comm->sent_object;
-	bli_thrcomm_barrier( id, comm );
+	bli_thrcomm_barrier( ti, id, comm );
 
 	return object;
 }
diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h
index d0ffb1346..4532fd00d 100644
--- a/frame/thread/bli_thrcomm.h
+++ b/frame/thread/bli_thrcomm.h
@@ -36,12 +36,82 @@
 #ifndef BLIS_THRCOMM_H
 #define BLIS_THRCOMM_H
 
+// Define barrier_t, which is specific to the tree barrier in the OpenMP
+// implementation. This needs to be done first since it is (potentially)
+// used within the definition of thrcomm_t below.
+
+#ifdef BLIS_ENABLE_OPENMP
+#ifdef BLIS_TREE_BARRIER
+struct barrier_s
+{
+	int               arity;
+	int               count;
+	struct barrier_s* dad;
+	volatile int      signal;
+};
+typedef struct barrier_s barrier_t;
+#endif
+#endif
+
+// Define the thrcomm_t structure, which will be common to all threading
+// implementations.
+
+typedef struct thrcomm_s
+{
+	// -- Fields common to all threading implementations --
+
+	void*       sent_object;
+	dim_t       n_threads;
+	timpl_t     ti;
+
+	// NOTE: barrier_sense was originally a gint_t-based bool_t, but upon
+	// redefining bool_t as bool we discovered that some gcc __atomic built-ins
+	// don't allow the use of bool for the variables being operated upon.
+	// (Specifically, this was observed of __atomic_fetch_xor(), but it likely
+	// applies to all other related built-ins.) Thus, we get around this by
+	// redefining barrier_sense as a gint_t.
+	//volatile gint_t  barrier_sense;
+	gint_t barrier_sense;
+	dim_t  barrier_threads_arrived;
+
+	// -- Fields specific to OpenMP --
+
+	#ifdef BLIS_ENABLE_OPENMP
+	#ifdef BLIS_TREE_BARRIER
+	// This field is only needed if the tree barrier implementation is being
+	// compiled. The non-tree barrier code does not use it.
+	barrier_t** barriers;
+	#endif
+	#endif
+
+	// -- Fields specific to pthreads --
+
+	#ifdef BLIS_ENABLE_PTHREADS
+	#ifdef BLIS_USE_PTHREAD_BARRIER
+	// This field is only needed if the pthread_barrier_t implementation is
+	// being compiled. The non-pthread_barrier_t code does not use it.
+	bli_pthread_barrier_t barrier;
+	#endif
+	#endif
+
+} thrcomm_t;
+
+
+
+
+
 // Include definitions (mostly thrcomm_t) specific to the method of
 // multithreading.
 #include "bli_thrcomm_single.h"
 #include "bli_thrcomm_openmp.h"
 #include "bli_thrcomm_pthreads.h"
 
+// Define a function pointer type for each of the functions that are
+// "overloaded" by each method of multithreading.
+typedef void (*thrcomm_init_ft)( dim_t nt, thrcomm_t* comm );
+typedef void (*thrcomm_cleanup_ft)( thrcomm_t* comm );
+typedef void (*thrcomm_barrier_ft)( dim_t tid, thrcomm_t* comm );
+
 
 // thrcomm_t query (field only)
 
@@ -51,16 +121,22 @@ BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm )
 }
 
 
-// Thread communicator prototypes.
+// Threading method-agnostic function prototypes.
 thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads );
 void       bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm );
-void       bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm );
-void       bli_thrcomm_cleanup( thrcomm_t* comm );
-
-BLIS_EXPORT_BLIS void  bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm );
-BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm );
 
-void       bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm );
+// Threading method-specific function prototypes.
+// NOTE: These are the prototypes to the dispatcher functions and thus they
+// require the timpl_t as an argument. The threading-specific functions can
+// (and do) omit the timpl_t from their function signatures since their
+// threading implementation is intrinsically known.
+void                   bli_thrcomm_init( timpl_t ti, dim_t n_threads, thrcomm_t* comm );
+void                   bli_thrcomm_cleanup( timpl_t ti, thrcomm_t* comm );
+BLIS_EXPORT_BLIS void  bli_thrcomm_barrier( timpl_t ti, dim_t thread_id, thrcomm_t* comm );
+
+// Other function prototypes.
+BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( timpl_t ti, dim_t inside_id, void* to_send, thrcomm_t* comm );
+void                   bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm );
 
 #endif
 
diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c
index 9bb35ea31..a42dabe18 100644
--- a/frame/thread/bli_thrcomm_openmp.c
+++ b/frame/thread/bli_thrcomm_openmp.c
@@ -37,35 +37,13 @@
 
 #ifdef BLIS_ENABLE_OPENMP
 
-thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads )
-{
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_thrcomm_create(): " );
-	#endif
-
-	thrcomm_t* comm = bli_sba_acquire( rntm, sizeof(thrcomm_t) );
-
-	bli_thrcomm_init( n_threads, comm );
-
-	return comm;
-}
-
-void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm )
-{
-	if ( comm == NULL ) return;
-
-	bli_thrcomm_cleanup( comm );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_thrcomm_free(): " );
-	#endif
-
-	bli_sba_release( rntm, comm );
-}
-
 #ifndef BLIS_TREE_BARRIER
 
-void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm )
+// Define the non-tree barrier implementations of the init, cleanup, and
+// barrier functions. These are the default unless the tree barrier
+// versions are requested at compile-time.
+
+void bli_thrcomm_init_openmp( dim_t n_threads, thrcomm_t* comm )
 {
 	if ( comm == NULL ) return;
 	comm->sent_object = NULL;
@@ -75,14 +53,15 @@ void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm )
 }
 
 
-void bli_thrcomm_cleanup( thrcomm_t* comm )
+void bli_thrcomm_cleanup_openmp( thrcomm_t* comm )
 {
-	if ( comm == NULL ) return;
+	//if ( comm == NULL ) return;
+	return;
 }
 
 //'Normal' barrier for openmp
 //barrier routine taken from art of multicore programming
-void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
+void bli_thrcomm_barrier_openmp( dim_t t_id, thrcomm_t* comm )
 {
 #if 0
 	if ( comm == NULL || comm->n_threads == 1 )
@@ -109,7 +88,10 @@ void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
 
 #else
 
-void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm )
+// Define the tree barrier implementations of the init, cleanup, and
+// barrier functions.
+
+void bli_thrcomm_init_openmp( dim_t n_threads, thrcomm_t* comm )
 {
 	err_t r_val;
 
@@ -120,6 +102,23 @@ void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm )
 	bli_thrcomm_tree_barrier_create( n_threads, BLIS_TREE_BARRIER_ARITY, comm->barriers, 0 );
 }
 
+void bli_thrcomm_cleanup_openmp( thrcomm_t* comm )
+{
+	if ( comm == NULL ) return; // Nothing to clean up.
+	for ( dim_t i = 0; i < comm->n_threads; i++ ) // One barriers[] entry per thread.
+	{
+	   bli_thrcomm_tree_barrier_free( comm->barriers[i] );
+	}
+	bli_free_intl( comm->barriers ); // Finally, free the barriers array itself.
+}
+
+void bli_thrcomm_barrier_openmp( dim_t t_id, thrcomm_t* comm )
+{
+	bli_thrcomm_tree_barrier( comm->barriers[t_id] ); // Use the entry for thread t_id.
+}
+
+// -- Helper functions ---------------------------------------------------------
+
 //Tree barrier used for Intel Xeon Phi
 barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index )
 {
@@ -164,16 +163,6 @@ barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_
 	return me;
 }
 
-void bli_thrcomm_cleanup( thrcomm_t* comm )
-{
-	if ( comm == NULL ) return;
-	for ( dim_t i = 0; i < comm->n_threads; i++ )
-	{
-	   bli_thrcomm_tree_barrier_free( comm->barriers[i] );
-	}
-	bli_free_intl( comm->barriers );
-}
-
 void bli_thrcomm_tree_barrier_free( barrier_t* barrier )
 {
 	if ( barrier == NULL )
@@ -187,11 +176,6 @@ void bli_thrcomm_tree_barrier_free( barrier_t* barrier )
 	return;
 }
 
-void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
-{
-	bli_thrcomm_tree_barrier( comm->barriers[t_id] );
-}
-
 void bli_thrcomm_tree_barrier( barrier_t* barack )
 {
 	int my_signal = barack->signal;
diff --git a/frame/thread/bli_thrcomm_openmp.h b/frame/thread/bli_thrcomm_openmp.h
index 3abfd0a41..8c33d0c2f 100644
--- a/frame/thread/bli_thrcomm_openmp.h
+++ b/frame/thread/bli_thrcomm_openmp.h
@@ -36,53 +36,22 @@
 #ifndef BLIS_THRCOMM_OPENMP_H
 #define BLIS_THRCOMM_OPENMP_H
 
-// Define thrcomm_t for situations when OpenMP multithreading is enabled.
+// Define these prototypes for situations when OpenMP multithreading is
+// enabled.
 #ifdef BLIS_ENABLE_OPENMP
 
 #include <omp.h>
 
-// Define thrcomm_t for tree barriers and non-tree barriers.
-#ifdef BLIS_TREE_BARRIER
-struct barrier_s
-{   
-	int               arity;
-	int               count;
-	struct barrier_s* dad;
-	volatile int      signal;
-};  
-typedef struct barrier_s barrier_t;
-
-struct thrcomm_s
-{   
-	void*       sent_object;
-	dim_t       n_threads;
-	barrier_t** barriers;
-}; 
-#else
-struct thrcomm_s
-{
-	void*  sent_object;
-	dim_t  n_threads;
-
-	// NOTE: barrier_sense was originally a gint_t-based bool_t, but upon
-	// redefining bool_t as bool we discovered that some gcc __atomic built-ins
-	// don't allow the use of bool for the variables being operated upon.
-	// (Specifically, this was observed of __atomic_fetch_xor(), but it likely
-	// applies to all other related built-ins.) Thus, we get around this by
-	// redefining barrier_sense as a gint_t.
-	//volatile gint_t  barrier_sense;
-	gint_t barrier_sense;
-	dim_t  barrier_threads_arrived;
-};
-#endif
-
-typedef struct thrcomm_s thrcomm_t;
+// OpenMP-specific function prototypes.
+void bli_thrcomm_init_openmp( dim_t nt, thrcomm_t* comm );
+void bli_thrcomm_cleanup_openmp( thrcomm_t* comm );
+void bli_thrcomm_barrier_openmp( dim_t tid, thrcomm_t* comm );
 
-// Prototypes specific to tree barriers.
+// Prototypes specific to the OpenMP tree barrier implementation.
 #ifdef BLIS_TREE_BARRIER
 barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index );
-void        bli_thrcomm_tree_barrier_free( barrier_t* barrier );
-void        bli_thrcomm_tree_barrier( barrier_t* barack );
+void       bli_thrcomm_tree_barrier_free( barrier_t* barrier );
+void       bli_thrcomm_tree_barrier( barrier_t* barack );
 #endif
 
 #endif
diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c
index d0896f94d..39b15d590 100644
--- a/frame/thread/bli_thrcomm_pthreads.c
+++ b/frame/thread/bli_thrcomm_pthreads.c
@@ -37,35 +37,12 @@
 
 #ifdef BLIS_ENABLE_PTHREADS
 
-thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads )
-{
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_thrcomm_create(): " );
-	#endif
-
-	thrcomm_t* comm = bli_sba_acquire( rntm, sizeof(thrcomm_t) );
-
-	bli_thrcomm_init( n_threads, comm );
-
-	return comm;
-}
-
-void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm )
-{
-	if ( comm == NULL ) return;
-
-	bli_thrcomm_cleanup( comm );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_thrcomm_free(): " );
-	#endif
-
-	bli_sba_release( rntm, comm );
-}
-
 #ifdef BLIS_USE_PTHREAD_BARRIER
 
-void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm )
+// Define the pthread_barrier_t implementations of the init, cleanup, and
+// barrier functions.
+
+void bli_thrcomm_init_pthreads( dim_t n_threads, thrcomm_t* comm )
 {
 	if ( comm == NULL ) return;
 	comm->sent_object = NULL;
@@ -73,7 +50,7 @@ void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm )
 	bli_pthread_barrier_init( &comm->barrier, NULL, n_threads );
 }
 
-void bli_thrcomm_cleanup( thrcomm_t* comm )
+void bli_thrcomm_cleanup_pthreads( thrcomm_t* comm )
 {
 	if ( comm == NULL ) return;
 	bli_pthread_barrier_destroy( &comm->barrier );
@@ -86,7 +63,11 @@ void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
 
 #else
 
-void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm )
+// Define the non-pthread_barrier_t implementations of the init, cleanup,
+// and barrier functions. These are the default unless the pthread_barrier_t
+// versions are requested at compile-time.
+
+void bli_thrcomm_init_pthreads( dim_t n_threads, thrcomm_t* comm )
 {
 	if ( comm == NULL ) return;
 	comm->sent_object = NULL;
@@ -95,11 +76,11 @@ void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm )
 	comm->barrier_threads_arrived = 0;
 }
 
-void bli_thrcomm_cleanup( thrcomm_t* comm )
+void bli_thrcomm_cleanup_pthreads( thrcomm_t* comm )
 {
 }
 
-void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
+void bli_thrcomm_barrier_pthreads( dim_t t_id, thrcomm_t* comm )
 {
 #if 0
 	if ( comm == NULL || comm->n_threads == 1 ) return;
diff --git a/frame/thread/bli_thrcomm_pthreads.h b/frame/thread/bli_thrcomm_pthreads.h
index 2c2e88551..9a2447b99 100644
--- a/frame/thread/bli_thrcomm_pthreads.h
+++ b/frame/thread/bli_thrcomm_pthreads.h
@@ -35,36 +35,13 @@
 #ifndef BLIS_THRCOMM_PTHREADS_H
 #define BLIS_THRCOMM_PTHREADS_H
 
-// Define thrcomm_t for situations when POSIX multithreading is enabled.
+// Define these prototypes for situations when POSIX multithreading is enabled.
 #ifdef BLIS_ENABLE_PTHREADS 
 
-#ifdef BLIS_USE_PTHREAD_BARRIER
-struct thrcomm_s
-{
-	void*                 sent_object;
-	dim_t                 n_threads;
-
-	bli_pthread_barrier_t barrier;
-};
-#else
-struct thrcomm_s
-{
-	void*  sent_object;
-	dim_t  n_threads;
-
-	// NOTE: barrier_sense was originally a gint_t-based bool_t, but upon
-	// redefining bool_t as bool we discovered that some gcc __atomic built-ins
-	// don't allow the use of bool for the variables being operated upon.
-	// (Specifically, this was observed of __atomic_fetch_xor(), but it likely
-	// applies to all other related built-ins.) Thus, we get around this by
-	// redefining barrier_sense as a gint_t.
-	//volatile gint_t  barrier_sense;
-	gint_t barrier_sense;
-	dim_t  barrier_threads_arrived;
-};
-#endif
-
-typedef struct thrcomm_s thrcomm_t;
+// pthreads-specific function prototypes.
+void bli_thrcomm_init_pthreads( dim_t nt, thrcomm_t* comm );
+void bli_thrcomm_cleanup_pthreads( thrcomm_t* comm );
+void bli_thrcomm_barrier_pthreads( dim_t tid, thrcomm_t* comm );
 
 #endif
 
diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c
index cedb3c5b6..cb12e37f3 100644
--- a/frame/thread/bli_thrcomm_single.c
+++ b/frame/thread/bli_thrcomm_single.c
@@ -35,36 +35,7 @@
 
 #include "blis.h"
 
-#ifndef BLIS_ENABLE_MULTITHREADING
-
-//Constructors and destructors for constructors
-thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads )
-{
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_thrcomm_create(): " );
-	#endif
-
-	thrcomm_t* comm = bli_sba_acquire( rntm, sizeof( thrcomm_t ) );
-
-	bli_thrcomm_init( n_threads, comm );
-
-	return comm;
-}
-
-void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm )
-{
-	if ( comm == NULL ) return;
-
-	bli_thrcomm_cleanup( comm );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_thrcomm_free(): " );
-	#endif
-
-	bli_sba_release( rntm, comm );
-}
-
-void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm )
+void bli_thrcomm_init_single( dim_t n_threads, thrcomm_t* comm )
 {
 	if ( comm == NULL ) return;
 
@@ -74,15 +45,13 @@ void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm )
 	comm->barrier_threads_arrived = 0;
 }
 
-void bli_thrcomm_cleanup( thrcomm_t* comm )
+void bli_thrcomm_cleanup_single( thrcomm_t* comm )
 {
 	if ( comm == NULL ) return;
 }
 
-void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
+void bli_thrcomm_barrier_single( dim_t t_id, thrcomm_t* comm )
 {
 	return;
 }
 
-#endif
-
diff --git a/frame/thread/bli_thrcomm_single.h b/frame/thread/bli_thrcomm_single.h
index c10727df2..fffb3fb75 100644
--- a/frame/thread/bli_thrcomm_single.h
+++ b/frame/thread/bli_thrcomm_single.h
@@ -35,45 +35,13 @@
 #ifndef BLIS_THRCOMM_SINGLE_H
 #define BLIS_THRCOMM_SINGLE_H
 
-// Define thrcomm_t for situations when multithreading is disabled.
-#ifndef BLIS_ENABLE_MULTITHREADING 
+// Always define these prototypes since disabling multithreading is always
+// an option.
 
-//thread communicators may be implementation dependent
-#ifdef BLIS_TREE_BARRIER
-struct barrier_s
-{   
-	int               arity;
-	int               count;
-	struct barrier_s* dad;
-	int               signal;
-};  
-typedef struct barrier_s barrier_t;
-
-struct thrcomm_s
-{   
-	void*       sent_object;
-	dim_t       n_threads;
-	barrier_t** barriers;
-}; 
-#else
-struct thrcomm_s
-{
-	void*   sent_object;
-	dim_t   n_threads;
- 
-	// NOTE: barrier_sense was originally a gint_t-based bool_t, but upon
-	// redefining bool_t as bool we discovered that some gcc __atomic built-ins
-	// don't allow the use of bool for the variables being operated upon.
-	// (Specifically, this was observed of __atomic_fetch_xor(), but it likely
-	// applies to all other related built-ins.) Thus, we get around this by
-	// redefining barrier_sense as a gint_t.
-	gint_t  barrier_sense;
-	dim_t   barrier_threads_arrived;
-};
-#endif
-typedef struct thrcomm_s thrcomm_t;
-
-#endif
+// Sequential-specific function prototypes.
+void bli_thrcomm_init_single( dim_t nt, thrcomm_t* comm );
+void bli_thrcomm_cleanup_single( thrcomm_t* comm );
+void bli_thrcomm_barrier_single( dim_t tid, thrcomm_t* comm );
 
 #endif
 
diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c
index 9bad6a456..9369b373b 100644
--- a/frame/thread/bli_thread.c
+++ b/frame/thread/bli_thread.c
@@ -50,7 +50,7 @@ extern bli_pthread_mutex_t global_rntm_mutex;
 
 void bli_thread_init( void )
 {
-	bli_thrcomm_init( 1, &BLIS_SINGLE_COMM );
+	bli_thrcomm_init( BLIS_SINGLE, 1, &BLIS_SINGLE_COMM );
 	bli_packm_thrinfo_init_single( &BLIS_PACKM_SINGLE_THREADED );
 	bli_l3_thrinfo_init_single( &BLIS_GEMM_SINGLE_THREADED );
 
@@ -1554,6 +1554,14 @@ dim_t bli_thread_get_num_threads( void )
 	return bli_rntm_num_threads( &global_rntm );
 }
 
+timpl_t bli_thread_get_thread_impl( void )
+{
+	// We must ensure that global_rntm has been initialized.
+	bli_init_once();
+	// Return the threading implementation currently stored in global_rntm.
+	return bli_rntm_thread_impl( &global_rntm );
+}
+
 // ----------------------------------------------------------------------------
 
 void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir )
@@ -1584,8 +1592,24 @@ void bli_thread_set_num_threads( dim_t n_threads )
 	bli_pthread_mutex_unlock( &global_rntm_mutex );
 }
 
+void bli_thread_set_thread_impl( timpl_t ti )
+{
+	// We must ensure that global_rntm has been initialized.
+	bli_init_once();
+
+	// Acquire the mutex protecting global_rntm.
+	bli_pthread_mutex_lock( &global_rntm_mutex );
+
+	bli_rntm_set_thread_impl_only( ti, &global_rntm );
+
+	// Release the mutex protecting global_rntm.
+	bli_pthread_mutex_unlock( &global_rntm_mutex );
+}
+
 // ----------------------------------------------------------------------------
 
+//#define PRINT_IMPL
+
 void bli_thread_init_rntm_from_env
      (
        rntm_t* rntm
@@ -1606,18 +1630,69 @@ void bli_thread_init_rntm_from_env
 	// function guarantees that the rntm_t has sane values in the event that the
 	// application passed in a custom rntm_t via an expert interface.
 
-	bool  auto_factor = FALSE;
-	dim_t nt;
-	dim_t jc, pc, ic, jr, ir;
+	bool    auto_factor = FALSE;
+	dim_t   nt;
+	dim_t   jc, pc, ic, jr, ir;
+	timpl_t ti;
 
 #ifdef BLIS_ENABLE_MULTITHREADING
 
+	// Try to read BLIS_THREAD_IMPL.
+	char* ti_env = bli_env_get_str( "BLIS_THREAD_IMPL" );
+
+	// If BLIS_THREAD_IMPL was not set, try to read BLIS_TI.
+	if ( ti_env == NULL ) ti_env = bli_env_get_str( "BLIS_TI" );
+
+	if ( ti_env != NULL )
+	{
+		// If BLIS_THREAD_IMPL was set, parse the value. If the value was
+		// anything other than "openmp" or "pthreads" (or reasonable
+		// variations thereof), interpret it as a request for single-threaded
+		// execution.
+		if      ( !strncmp( ti_env, "openmp",   6 ) ) ti = BLIS_OPENMP;
+		else if ( !strncmp( ti_env, "omp",      3 ) ) ti = BLIS_OPENMP;
+		else if ( !strncmp( ti_env, "pthreads", 8 ) ) ti = BLIS_POSIX;
+		else if ( !strncmp( ti_env, "pthread",  7 ) ) ti = BLIS_POSIX;
+		else if ( !strncmp( ti_env, "posix",    5 ) ) ti = BLIS_POSIX;
+		else                                          ti = BLIS_SINGLE;
+
+		#ifdef PRINT_IMPL
+		if      ( ti == BLIS_OPENMP )
+			printf( "detected BLIS_THREAD_IMPL=openmp.\n" );
+		else if ( ti == BLIS_POSIX )
+			printf( "detected BLIS_THREAD_IMPL=pthreads.\n" );
+		else
+			printf( "detected BLIS_THREAD_IMPL=single.\n" );
+		#endif
+	}
+	else
+	{
+		// If BLIS_THREAD_IMPL was unset, default to the implementation that
+		// was determined at configure-time.
+		#ifdef BLIS_ENABLE_OPENMP_AS_DEFAULT
+		ti = BLIS_OPENMP;
+		#endif
+		#ifdef BLIS_ENABLE_PTHREADS_AS_DEFAULT
+		ti = BLIS_POSIX;
+		#endif
+
+		#ifdef PRINT_IMPL
+		printf( "BLIS_THREAD_IMPL unset.\n" );
+		if      ( ti == BLIS_OPENMP )
+			printf( "defaulting to BLIS_THREAD_IMPL=openmp.\n" );
+		else if ( ti == BLIS_POSIX )
+			printf( "defaulting to BLIS_THREAD_IMPL=pthreads.\n" );
+		#endif
+	}
+
 	// Try to read BLIS_NUM_THREADS first.
 	nt = bli_env_get_var( "BLIS_NUM_THREADS", -1 );
 
-	// If BLIS_NUM_THREADS was not set, try to read OMP_NUM_THREADS.
-	if ( nt == -1 )
-		nt = bli_env_get_var( "OMP_NUM_THREADS", -1 );
+	// If BLIS_NUM_THREADS was not set, try to read BLIS_NT.
+	if ( nt == -1 ) nt = bli_env_get_var( "BLIS_NT", -1 );
+
+	// If neither BLIS_NUM_THREADS nor BLIS_NT were set, try OMP_NUM_THREADS.
+	if ( nt == -1 ) nt = bli_env_get_var( "OMP_NUM_THREADS", -1 );
 
 	// Read the environment variables for the number of threads (ways of
 	// parallelism) for each individual loop.
@@ -1650,13 +1725,15 @@ void bli_thread_init_rntm_from_env
 	// Now we use the values of nt_set and ways_set to determine how to
 	// interpret the original values we found in the rntm_t object.
 
-	if ( ways_set == TRUE )
+	if ( ( ways_set == TRUE && nt_set == TRUE  ) ||
+	     ( ways_set == TRUE && nt_set == FALSE ) )
 	{
 		// If the per-loop ways of parallelism were set, then we use the values
 		// that were given and interpreted above. The only thing left to do is
-		// calculate the correct number of threads. Notice that if the user also
-		// happened to set BLIS_NUM_THREADS, that value is discarded in favor of
-		// the implied value from the per-loop ways of parallelism.
+		// calculate the correct number of threads. Notice that whatever value
+		// may have been assigned to BLIS_NUM_THREADS will be ignored, and the
+		// total number of threads will be taken to be the number implied from
+		// the per-loop ways of parallelism.
 
 		nt = jc * pc * ic * jr * ir;
 		auto_factor = FALSE;
@@ -1682,21 +1759,27 @@ void bli_thread_init_rntm_from_env
 
 #else
 
+	// Note that we don't even bother checking BLIS_THREAD_IMPL if neither
+	// OpenMP nor pthreads was enabled at compile time.
+	ti = BLIS_SINGLE;
+
 	// When multithreading is disabled, always set the per-loop ways of
 	// parallelism to 1.
-	nt = 1;
 	jc = pc = ic = jr = ir = 1;
+	nt = 1;
+	auto_factor = FALSE;
 
 #endif
 
 	// Save the results back in the runtime object.
-	bli_rntm_set_auto_factor_only( auto_factor, rntm );
+	bli_rntm_set_thread_impl_only( ti, rntm );
 	bli_rntm_set_num_threads_only( nt, rntm );
 	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
+	bli_rntm_set_auto_factor_only( auto_factor, rntm );
 
-#if 0
+	#if 0
 	printf( "bli_thread_init_rntm_from_env()\n" );
 	bli_rntm_print( rntm );
-#endif
+	#endif
 }
 
diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h
index 5e9c650b5..509072e57 100644
--- a/frame/thread/bli_thread.h
+++ b/frame/thread/bli_thread.h
@@ -195,17 +195,19 @@ dim_t bli_ipow( dim_t base, dim_t power );
 
 // -----------------------------------------------------------------------------
 
-BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void );
-BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void );
-BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void );
-BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void );
-BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void );
-BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void );
-
-BLIS_EXPORT_BLIS void  bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir );
-BLIS_EXPORT_BLIS void  bli_thread_set_num_threads( dim_t value );
-
-void  bli_thread_init_rntm_from_env( rntm_t* rntm );
+BLIS_EXPORT_BLIS dim_t   bli_thread_get_jc_nt( void );
+BLIS_EXPORT_BLIS dim_t   bli_thread_get_pc_nt( void );
+BLIS_EXPORT_BLIS dim_t   bli_thread_get_ic_nt( void );
+BLIS_EXPORT_BLIS dim_t   bli_thread_get_jr_nt( void );
+BLIS_EXPORT_BLIS dim_t   bli_thread_get_ir_nt( void );
+BLIS_EXPORT_BLIS dim_t   bli_thread_get_num_threads( void );
+BLIS_EXPORT_BLIS timpl_t bli_thread_get_thread_impl( void );
+
+BLIS_EXPORT_BLIS void    bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir );
+BLIS_EXPORT_BLIS void    bli_thread_set_num_threads( dim_t value );
+BLIS_EXPORT_BLIS void    bli_thread_set_thread_impl( timpl_t ti );
+
+void                     bli_thread_init_rntm_from_env( rntm_t* rntm );
 
 // -----------------------------------------------------------------------------
 
diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c
index bbe711400..3730ab946 100644
--- a/frame/thread/bli_thrinfo.c
+++ b/frame/thread/bli_thrinfo.c
@@ -360,7 +360,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
 
 	// Broadcast the temporary array to all threads in the parent's
 	// communicator.
-	new_comms = bli_thread_broadcast( thread_par, new_comms );
+	new_comms = bli_thread_broadcast( rntm, thread_par, new_comms );
 
 	// Chiefs in the child communicator allocate the communicator
 	// object and store it in the array element corresponding to the
@@ -368,7 +368,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
 	if ( child_comm_id == 0 )
 		new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );
 
-	bli_thread_barrier( thread_par );
+	bli_thread_barrier( rntm, thread_par );
 
 	// All threads create a new thrinfo_t node using the communicator
 	// that was created by their chief, as identified by parent_work_id.
@@ -384,7 +384,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
 	  NULL                         // sub_node
 	);
 
-	bli_thread_barrier( thread_par );
+	bli_thread_barrier( rntm, thread_par );
 
 	// The parent's chief thread frees the temporary array of thrcomm_t
 	// pointers.
@@ -497,7 +497,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
 	const dim_t child_comm_id = parent_comm_id % child_nt_in;
 	const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );
 
-	bli_thread_barrier( thread_par );
+	bli_thread_barrier( rntm, thread_par );
 
 	// NOTE: Recall that parent_comm_id == child_comm_id, so checking for the
 	// parent's chief-ness is equivalent to checking for chief-ness in the new
@@ -508,7 +508,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
 
 	// Broadcast the new thrcomm_t address to the other threads in the
 	// parent's group.
-	new_comm = bli_thread_broadcast( thread_par, new_comm );
+	new_comm = bli_thread_broadcast( rntm, thread_par, new_comm );
 
 	// All threads create a new thrinfo_t node using the communicator
 	// that was created by their chief, as identified by parent_work_id.
@@ -524,7 +524,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
 	  NULL           // sub_node
 	);
 
-	bli_thread_barrier( thread_par );
+	bli_thread_barrier( rntm, thread_par );
 
 	return thread_chl;
 }
diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h
index 6b9809684..9d234bc91 100644
--- a/frame/thread/bli_thrinfo.h
+++ b/frame/thread/bli_thrinfo.h
@@ -171,14 +171,22 @@ BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t*
 
 // other thrinfo_t-related functions
 
-BLIS_INLINE void* bli_thread_broadcast( const thrinfo_t* t, void* p )
+BLIS_INLINE void* bli_thread_broadcast( const rntm_t* rntm, const thrinfo_t* t, void* p )
 {
-	return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm );
+	// We can't use any bli_rntm_*() APIs here because they haven't been
+	// defined yet. So we have to manually access the timpl_t field (le ugh).
+	//const timpl_t ti = bli_rntm_thread_impl( rntm );
+
+	return bli_thrcomm_bcast( rntm->thread_impl, t->ocomm_id, p, t->ocomm );
 }
 
-BLIS_INLINE void bli_thread_barrier( const thrinfo_t* t )
+BLIS_INLINE void bli_thread_barrier( const rntm_t* rntm, const thrinfo_t* t )
 {
-	bli_thrcomm_barrier( t->ocomm_id, t->ocomm );
+	// We can't use any bli_rntm_*() APIs here because they haven't been
+	// defined yet. So we have to manually access the timpl_t field (le ugh).
+	//const timpl_t ti = bli_rntm_thread_impl( rntm );
+
+	bli_thrcomm_barrier( rntm->thread_impl, t->ocomm_id, t->ocomm );
 }
 
 
diff --git a/frame/thread/bli_thrinfo_sup.c b/frame/thread/bli_thrinfo_sup.c
index 966247fd0..26a40e00f 100644
--- a/frame/thread/bli_thrinfo_sup.c
+++ b/frame/thread/bli_thrinfo_sup.c
@@ -250,7 +250,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
 
 		// Broadcast the temporary array to all threads in the parent's
 		// communicator.
-		new_comms = bli_thread_broadcast( thread_par, new_comms );
+		new_comms = bli_thread_broadcast( rntm, thread_par, new_comms );
 
 		// Chiefs in the child communicator allocate the communicator
 		// object and store it in the array element corresponding to the
@@ -258,7 +258,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
 		if ( child_comm_id == 0 )
 			new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );
 
-		bli_thread_barrier( thread_par );
+		bli_thread_barrier( rntm, thread_par );
 
 		// All threads create a new thrinfo_t node using the communicator
 		// that was created by their chief, as identified by parent_work_id.
@@ -274,7 +274,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
 		  NULL                         // sub_node
 		);
 
-		bli_thread_barrier( thread_par );
+		bli_thread_barrier( rntm, thread_par );
 
 		// The parent's chief thread frees the temporary array of thrcomm_t
 		// pointers.
diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c
index d960928a4..1e567a114 100644
--- a/sandbox/gemmlike/bls_gemm.c
+++ b/sandbox/gemmlike/bls_gemm.c
@@ -40,11 +40,11 @@
 
 void bls_gemm
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c
      )
 {
 	bls_gemm_ex
@@ -61,13 +61,13 @@ void bls_gemm
 
 void bls_gemm_ex
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      )
 {
 	bli_init_once();
diff --git a/sandbox/gemmlike/bls_gemm.h b/sandbox/gemmlike/bls_gemm.h
index b296ac1c0..d01c6647e 100644
--- a/sandbox/gemmlike/bls_gemm.h
+++ b/sandbox/gemmlike/bls_gemm.h
@@ -38,22 +38,22 @@
 
 void bls_gemm
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c
      );
 
 void bls_gemm_ex
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm
      );
 
 //
diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c
index 1e3e5ea03..c8fd50083 100644
--- a/sandbox/gemmlike/bls_gemm_bp_var1.c
+++ b/sandbox/gemmlike/bls_gemm_bp_var1.c
@@ -446,7 +446,7 @@ void PASTECH2(bls_,ch,varname) \
 			/* This barrier is needed to prevent threads from starting to pack
 			   the next row panel of B before the current row panel is fully
 			   computed upon. */ \
-			bli_thread_barrier( thread_pb ); \
+			bli_thread_barrier( rntm, thread_pb ); \
 		} \
 	} \
 \
diff --git a/sandbox/gemmlike/bls_gemm_check.c b/sandbox/gemmlike/bls_gemm_check.c
index 369017338..9cfcf8063 100644
--- a/sandbox/gemmlike/bls_gemm_check.c
+++ b/sandbox/gemmlike/bls_gemm_check.c
@@ -36,12 +36,12 @@
 
 void bls_gemm_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
      )
 {
 	//bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
diff --git a/sandbox/gemmlike/bls_gemm_check.h b/sandbox/gemmlike/bls_gemm_check.h
index 8b9706991..bd96c4cff 100644
--- a/sandbox/gemmlike/bls_gemm_check.h
+++ b/sandbox/gemmlike/bls_gemm_check.h
@@ -39,11 +39,11 @@
 
 void bls_gemm_check
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
     );
 
diff --git a/sandbox/gemmlike/bls_l3_packm_a.c b/sandbox/gemmlike/bls_l3_packm_a.c
index 0dcc531fd..9e1f67fc5 100644
--- a/sandbox/gemmlike/bls_l3_packm_a.c
+++ b/sandbox/gemmlike/bls_l3_packm_a.c
@@ -61,7 +61,7 @@ void PASTECH2(bls_,ch,opname) \
 \
 	/* Barrier to make sure all threads are caught up and ready to begin the
 	   packm stage. */ \
-	bli_thread_barrier( thread ); \
+	bli_thread_barrier( rntm, thread ); \
 \
 	/* Compute the size of the memory block eneded. */ \
 	siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
@@ -90,7 +90,7 @@ void PASTECH2(bls_,ch,opname) \
 \
 		/* Broadcast the address of the chief thread's passed-in mem_t to all
 		   threads. */ \
-		mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
+		mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
 \
 		/* Non-chief threads: Copy the contents of the chief thread's
 		   passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -139,7 +139,7 @@ void PASTECH2(bls_,ch,opname) \
 \
 			/* Broadcast the address of the chief thread's passed-in mem_t
 			   to all threads. */ \
-			mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
+			mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
 \
 			/* Non-chief threads: Copy the contents of the chief thread's
 			   passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -317,7 +317,7 @@ void PASTECH2(bls_,ch,opname) \
 	); \
 \
 	/* Barrier so that packing is done before computation. */ \
-	bli_thread_barrier( thread ); \
+	bli_thread_barrier( rntm, thread ); \
 }
 
 //INSERT_GENTFUNC_BASIC0( packm_a )
diff --git a/sandbox/gemmlike/bls_l3_packm_b.c b/sandbox/gemmlike/bls_l3_packm_b.c
index 9d563109a..cb8275fae 100644
--- a/sandbox/gemmlike/bls_l3_packm_b.c
+++ b/sandbox/gemmlike/bls_l3_packm_b.c
@@ -61,7 +61,7 @@ void PASTECH2(bls_,ch,opname) \
 \
 	/* Barrier to make sure all threads are caught up and ready to begin the
 	   packm stage. */ \
-	bli_thread_barrier( thread ); \
+	bli_thread_barrier( rntm, thread ); \
 \
 	/* Compute the size of the memory block eneded. */ \
 	siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
@@ -90,7 +90,7 @@ void PASTECH2(bls_,ch,opname) \
 \
 		/* Broadcast the address of the chief thread's passed-in mem_t to all
 		   threads. */ \
-		mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
+		mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
 \
 		/* Non-chief threads: Copy the contents of the chief thread's
 		   passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -139,7 +139,7 @@ void PASTECH2(bls_,ch,opname) \
 \
 			/* Broadcast the address of the chief thread's passed-in mem_t
 			   to all threads. */ \
-			mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
+			mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
 \
 			/* Non-chief threads: Copy the contents of the chief thread's
 			   passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -317,7 +317,7 @@ void PASTECH2(bls_,ch,opname) \
 	); \
 \
 	/* Barrier so that packing is done before computation. */ \
-	bli_thread_barrier( thread ); \
+	bli_thread_barrier( rntm, thread ); \
 }
 
 //INSERT_GENTFUNC_BASIC0( packm_b )
diff --git a/sandbox/gemmlike/thread/bls_l3_decor.c b/sandbox/gemmlike/thread/bls_l3_decor.c
new file mode 100644
index 000000000..7fa799f14
--- /dev/null
+++ b/sandbox/gemmlike/thread/bls_l3_decor.c
@@ -0,0 +1,148 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+// Initialize a function pointer array containing function addresses for
+// each of the threading-specific level-3 thread decorators.
+
+static l3sbx_decor_ft l3_decor_fpa[ BLIS_NUM_THREAD_IMPLS ] =
+{
+	[BLIS_SINGLE] = bls_l3_thread_decorator_single,
+	[BLIS_OPENMP] =
+#if   defined(BLIS_ENABLE_OPENMP)
+	                bls_l3_thread_decorator_openmp,
+#elif defined(BLIS_ENABLE_PTHREADS)
+	                NULL,
+#else
+	                NULL,
+#endif
+	[BLIS_POSIX]  =
+#if   defined(BLIS_ENABLE_PTHREADS)
+	                bls_l3_thread_decorator_pthreads,
+#elif defined(BLIS_ENABLE_OPENMP)
+	                NULL,
+#else
+	                NULL,
+#endif
+};
+
+// Define a dispatcher that chooses a threading-specific function from the
+// above function pointer array.
+
+void bls_l3_thread_decorator
+     (
+       l3sbxint_ft func,
+       opid_t      family,
+       obj_t*      alpha,
+       obj_t*      a,
+       obj_t*      b,
+       obj_t*      beta,
+       obj_t*      c,
+       cntx_t*     cntx,
+       rntm_t*     rntm
+     )
+{
+	rntm_t rntm_l;
+
+	// Query the threading implementation and the number of threads requested.
+	timpl_t ti = bli_rntm_thread_impl( rntm );
+	dim_t   nt = bli_rntm_num_threads( rntm );
+
+	if ( bli_error_checking_is_enabled() )
+		bls_l3_thread_decorator_check( rntm );
+
+	if ( 1 < nt && ti == BLIS_SINGLE )
+	{
+		// Here, we resolve conflicting information. The caller requested
+		// a sequential threading implementation, but also requested more
+		// than one thread. Here, we choose to favor the requested threading
+		// implementation over the number of threads, and so reset all
+		// parallelism parameters to 1.
+		rntm_l = *rntm;
+		nt = 1;
+		bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l );
+		bli_rntm_set_num_threads_only( 1, &rntm_l );
+		rntm = &rntm_l;
+	}
+
+	// Use the timpl_t value to index into the corresponding function address
+	// from the function pointer array.
+	const l3sbx_decor_ft fp = l3_decor_fpa[ ti ];
+
+	// Call the threading-specific decorator function.
+	fp
+	(
+	  func,
+	  family,
+	  alpha,
+	  a,
+	  b,
+	  beta,
+	  c,
+	  cntx,
+	  rntm
+	);
+}
+
+void bls_l3_thread_decorator_check
+     (
+       rntm_t* rntm
+     )
+{
+	//err_t e_val;
+
+	//e_val = bli_check_valid_thread_impl( bli_rntm_thread_impl( rntm ) );
+	//bli_check_error_code( e_val );
+
+	const timpl_t ti = bli_rntm_thread_impl( rntm );
+
+	if (
+#ifndef BLIS_ENABLE_OPENMP
+	    ti == BLIS_OPENMP ||
+#endif
+#ifndef BLIS_ENABLE_PTHREADS
+	    ti == BLIS_POSIX ||
+#endif
+	    FALSE
+	   )
+	{
+		fprintf( stderr, "\n" );
+		fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) );
+		fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) );
+		fprintf( stderr, "libblis: %s: line %d\n", __FILE__, ( int )__LINE__ );
+		bli_abort();
+	}
+}
+
diff --git a/sandbox/gemmlike/thread/bls_l3_decor.h b/sandbox/gemmlike/thread/bls_l3_decor.h
index bb8a95bb4..58b076270 100644
--- a/sandbox/gemmlike/thread/bls_l3_decor.h
+++ b/sandbox/gemmlike/thread/bls_l3_decor.h
@@ -35,10 +35,8 @@
 #ifndef BLIS_SBX_L3_DECOR_H
 #define BLIS_SBX_L3_DECOR_H
 
-// -- sup definitions ----------------------------------------------------------
-
 // Level-3 sup internal function type.
-typedef void (*l3sbxint_t)
+typedef void (*l3sbxint_ft)
      (
        obj_t*     alpha,
        obj_t*     a,
@@ -50,18 +48,37 @@ typedef void (*l3sbxint_t)
        thrinfo_t* thread
      );
 
-// Level-3 sup thread decorator prototype.
+// Level-3 thread decorator function type.
+typedef void (*l3sbx_decor_ft)
+     (
+       l3sbxint_ft func,
+       opid_t      family,
+       obj_t*      alpha,
+       obj_t*      a,
+       obj_t*      b,
+       obj_t*      beta,
+       obj_t*      c,
+       cntx_t*     cntx,
+       rntm_t*     rntm
+     );
+
+// Level-3 thread decorator prototype.
 void bls_l3_thread_decorator
      (
-       l3sbxint_t func,
-       opid_t     family,
-       obj_t*     alpha,
-       obj_t*     a,
-       obj_t*     b,
-       obj_t*     beta,
-       obj_t*     c,
-       cntx_t*    cntx,
-       rntm_t*    rntm
+       l3sbxint_ft func,
+       opid_t      family,
+       obj_t*      alpha,
+       obj_t*      a,
+       obj_t*      b,
+       obj_t*      beta,
+       obj_t*      c,
+       cntx_t*     cntx,
+       rntm_t*     rntm
+     );
+
+void bls_l3_thread_decorator_check
+     (
+       rntm_t* rntm
      );
 
 // Include definitions specific to the method of multithreading.
diff --git a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c b/sandbox/gemmlike/thread/bls_l3_decor_openmp.c
index bf0d4d8bc..9c29ef27e 100644
--- a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c
+++ b/sandbox/gemmlike/thread/bls_l3_decor_openmp.c
@@ -36,16 +36,11 @@
 
 #ifdef BLIS_ENABLE_OPENMP
 
-// Define a dummy thread entry function, which is needed in the pthreads
-// version, so that when building Windows DLLs (with OpenMP enabled or with
-// no multithreading) we don't risk having an unresolved symbol.
-void* bls_l3_thread_entry( void* data_void ) { return NULL; }
-
 //#define PRINT_THRINFO
 
-void bls_l3_thread_decorator
+void bls_l3_thread_decorator_openmp
      (
-       l3sbxint_t func,
+       l3sbxint_ft func,
        opid_t     family,
        obj_t*     alpha,
        obj_t*     a,
@@ -65,7 +60,7 @@ void bls_l3_thread_decorator
 	// with an internal lock to ensure only one application thread accesses
 	// the sba at a time. bli_sba_checkout_array() will also automatically
 	// resize the array_t, if necessary.
-	array_t* restrict array = bli_sba_checkout_array( n_threads );
+	array_t* array = bli_sba_checkout_array( n_threads );
 
 	// Access the pool_t* for thread 0 and embed it into the rntm. We do
 	// this up-front only so that we have the rntm_t.sba_pool field
@@ -78,7 +73,7 @@ void bls_l3_thread_decorator
 	bli_pba_rntm_set_pba( rntm );
 
 	// Allcoate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
+	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
 
 
 	_Pragma( "omp parallel num_threads(n_threads)" )
@@ -93,8 +88,6 @@ void bls_l3_thread_decorator
 		const dim_t tid = omp_get_thread_num();
 
 		// Check for a somewhat obscure OpenMP thread-mistmatch issue.
-		// NOTE: This calls the same function used for the conventional/large
-		// code path.
 		bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
 
 		// Use the thread id to access the appropriate pool_t* within the
diff --git a/sandbox/gemmlike/thread/bls_l3_decor_openmp.h b/sandbox/gemmlike/thread/bls_l3_decor_openmp.h
index 9c956d7c3..8198a1ba1 100644
--- a/sandbox/gemmlike/thread/bls_l3_decor_openmp.h
+++ b/sandbox/gemmlike/thread/bls_l3_decor_openmp.h
@@ -38,6 +38,19 @@
 // Definitions specific to situations when OpenMP multithreading is enabled.
 #ifdef BLIS_ENABLE_OPENMP
 
+void bls_l3_thread_decorator_openmp
+     (
+       l3sbxint_ft func,
+       opid_t      family,
+       obj_t*      alpha,
+       obj_t*      a,
+       obj_t*      b,
+       obj_t*      beta,
+       obj_t*      c,
+       cntx_t*     cntx,
+       rntm_t*     rntm
+     );
+
 #endif
 
 #endif
diff --git a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c
index ff723a4ce..95d0e968e 100644
--- a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c
+++ b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c
@@ -39,18 +39,18 @@
 // A data structure to assist in passing operands to additional threads.
 typedef struct thread_data
 {
-	l3sbxint_t func;
-	opid_t     family;
-	obj_t*     alpha;
-	obj_t*     a;
-	obj_t*     b;
-	obj_t*     beta;
-	obj_t*     c;
-	cntx_t*    cntx;
-	rntm_t*    rntm;
-	dim_t      tid;
-	thrcomm_t* gl_comm;
-	array_t*   array;
+	l3sbxint_ft func;
+	opid_t      family;
+	obj_t*      alpha;
+	obj_t*      a;
+	obj_t*      b;
+	obj_t*      beta;
+	obj_t*      c;
+	cntx_t*     cntx;
+	rntm_t*     rntm;
+	dim_t       tid;
+	thrcomm_t*  gl_comm;
+	array_t*    array;
 } thread_data_t;
 
 // Entry point function for additional threads.
@@ -58,7 +58,7 @@ void* bls_l3_thread_entry( void* data_void )
 {
 	thread_data_t* data     = data_void;
 
-	l3sbxint_t     func     = data->func;
+	l3sbxint_ft    func     = data->func;
 	opid_t         family   = data->family;
 	obj_t*         alpha    = data->alpha;
 	obj_t*         a        = data->a;
@@ -108,17 +108,17 @@ void* bls_l3_thread_entry( void* data_void )
 	return NULL;
 }
 
-void bls_l3_thread_decorator
+void bls_l3_thread_decorator_pthreads
      (
-       l3sbxint_t func,
-       opid_t     family,
-       obj_t*     alpha,
-       obj_t*     a,
-       obj_t*     b,
-       obj_t*     beta,
-       obj_t*     c,
-       cntx_t*    cntx,
-       rntm_t*    rntm
+       l3sbxint_ft func,
+       opid_t      family,
+       obj_t*      alpha,
+       obj_t*      a,
+       obj_t*      b,
+       obj_t*      beta,
+       obj_t*      c,
+       cntx_t*     cntx,
+       rntm_t*     rntm
      )
 {
 	err_t r_val;
@@ -145,7 +145,7 @@ void bls_l3_thread_decorator
 	bli_pba_rntm_set_pba( rntm );
 
 	// Allocate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
+	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
 
 	// Allocate an array of pthread objects and auxiliary data structs to pass
 	// to the thread entry functions.
@@ -211,5 +211,12 @@ void bls_l3_thread_decorator
 	bli_free_intl( datas );
 }
 
+#else
+
+// Define a dummy bls_l3_thread_entry() (the sandbox's own entry point name) so
+// that shared objects (Linux/OSX) and Windows DLLs link consistently when
+// pthreads is disabled; otherwise, we risk having an unresolved symbol.
+void* bls_l3_thread_entry( void* data_void ) { return NULL; }
+
 #endif
 
diff --git a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h
index ef5c3bad4..162086bb0 100644
--- a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h
+++ b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h
@@ -41,6 +41,19 @@
 // Thread entry point prototype.
 void* bls_l3_thread_entry( void* data_void );
 
+void bls_l3_thread_decorator_pthreads
+     (
+       l3sbxint_ft func,
+       opid_t      family,
+       obj_t*      alpha,
+       obj_t*      a,
+       obj_t*      b,
+       obj_t*      beta,
+       obj_t*      c,
+       cntx_t*     cntx,
+       rntm_t*     rntm
+     );
+
 #endif
 
 #endif
diff --git a/sandbox/gemmlike/thread/bls_l3_decor_single.c b/sandbox/gemmlike/thread/bls_l3_decor_single.c
index 8bb04817f..b5f5a6669 100644
--- a/sandbox/gemmlike/thread/bls_l3_decor_single.c
+++ b/sandbox/gemmlike/thread/bls_l3_decor_single.c
@@ -34,23 +34,21 @@
 
 #include "blis.h"
 
-#ifndef BLIS_ENABLE_MULTITHREADING
-
 #define SKIP_THRINFO_TREE
 
-void bls_l3_thread_decorator
+void bls_l3_thread_decorator_single
      (
-       l3sbxint_t func,
-       opid_t     family,
-       //pack_t     schema_a,
-       //pack_t     schema_b,
-       obj_t*     alpha,
-       obj_t*     a,
-       obj_t*     b,
-       obj_t*     beta,
-       obj_t*     c,
-       cntx_t*    cntx,
-       rntm_t*    rntm
+       l3sbxint_ft func,
+       opid_t      family,
+       //pack_t      schema_a,
+       //pack_t      schema_b,
+       obj_t*      alpha,
+       obj_t*      a,
+       obj_t*      b,
+       obj_t*      beta,
+       obj_t*      c,
+       cntx_t*     cntx,
+       rntm_t*     rntm
      )
 {
 	// For sequential execution, we use only one thread.
@@ -62,7 +60,7 @@ void bls_l3_thread_decorator
 	// with an internal lock to ensure only one application thread accesses
 	// the sba at a time. bli_sba_checkout_array() will also automatically
 	// resize the array_t, if necessary.
-	array_t* restrict array = bli_sba_checkout_array( n_threads );
+	array_t* array = bli_sba_checkout_array( n_threads );
 
 	// Access the pool_t* for thread 0 and embed it into the rntm.
 	bli_sba_rntm_set_pool( 0, array, rntm );
@@ -72,14 +70,14 @@ void bls_l3_thread_decorator
 
 #ifndef SKIP_THRINFO_TREE
 	// Allcoate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
+	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
 #endif
 
 
 	{
 		// NOTE: We don't need to create another copy of the rntm_t since
 		// it was already copied in one of the high-level oapi functions.
-		rntm_t* restrict rntm_p = rntm;
+		rntm_t* rntm_p = rntm;
 
 		// There is only one thread id (for the thief thread).
 		const dim_t tid = 0;
@@ -137,5 +135,3 @@ void bls_l3_thread_decorator
 	bli_sba_checkin_array( array );
 }
 
-#endif
-
diff --git a/sandbox/gemmlike/thread/bls_l3_decor_single.h b/sandbox/gemmlike/thread/bls_l3_decor_single.h
index 211a43a89..82dfbc993 100644
--- a/sandbox/gemmlike/thread/bls_l3_decor_single.h
+++ b/sandbox/gemmlike/thread/bls_l3_decor_single.h
@@ -35,10 +35,20 @@
 #ifndef BLIS_SBX_L3_DECOR_SINGLE_H
 #define BLIS_SBX_L3_DECOR_SINGLE_H
 
-// Definitions specific to situations when multithreading is disabled.
-#ifndef BLIS_ENABLE_MULTITHREADING
-
-#endif
+void bls_l3_thread_decorator_single
+     (
+       l3sbxint_ft func,
+       opid_t      family,
+       //pack_t      schema_a,
+       //pack_t      schema_b,
+       obj_t*      alpha,
+       obj_t*      a,
+       obj_t*      b,
+       obj_t*      beta,
+       obj_t*      c,
+       cntx_t*     cntx,
+       rntm_t*     rntm
+     );
 
 #endif
 
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index 3bfde8788..a355385a3 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -775,13 +775,34 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	else
 		int_type_size = sizeof(gint_t) * 8;
 
-	char impl_str[16];
+	char impl_str[32];
+	char def_impl_set_str[32];
+	char def_impl_unset_str[32];
 	char jrir_str[16];
 
-	// Describe the threading implementation.
-	if      ( bli_info_get_enable_openmp()   ) sprintf( impl_str, "openmp" );
-	else if ( bli_info_get_enable_pthreads() ) sprintf( impl_str, "pthreads" );
-	else    /* threading disabled */           sprintf( impl_str, "disabled" );
+	const bool    has_openmp      = bli_info_get_enable_openmp();
+	const bool    has_pthreads    = bli_info_get_enable_pthreads();
+	const bool    openmp_is_def   = bli_info_get_enable_openmp_as_default();
+	const bool    pthreads_is_def = bli_info_get_enable_pthreads_as_default();
+	const timpl_t ti              = bli_thread_get_thread_impl();
+
+	// List the available threading implementation(s).
+	if      ( has_openmp && has_pthreads   ) sprintf( impl_str, "openmp,pthreads,single" );
+	else if ( has_openmp                   ) sprintf( impl_str, "openmp,single" );
+	else if (               has_pthreads   ) sprintf( impl_str, "pthreads,single" );
+	else                                     sprintf( impl_str, "single only" );
+
+	// Describe the default threading implementation that would be active if
+	// or when BLIS_THREAD_IMPL is unset.
+	if      ( openmp_is_def   ) sprintf( def_impl_unset_str, "openmp" );
+	else if ( pthreads_is_def ) sprintf( def_impl_unset_str, "pthreads" );
+	else                        sprintf( def_impl_unset_str, "single" );
+
+	// Describe the default threading implementation as the testsuite was
+	// currently run.
+	if      ( ti == BLIS_OPENMP ) sprintf( def_impl_set_str, "openmp" );
+	else if ( ti == BLIS_POSIX  ) sprintf( def_impl_set_str, "pthreads" );
+	else                          sprintf( def_impl_set_str, "single" );
 
 	// Describe the status of jrir thread partitioning.
 	if   ( bli_info_get_thread_part_jrir_slab() ) sprintf( jrir_str, "slab" );
@@ -878,7 +899,9 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	libblis_test_fprintf_c( os, "\n" );
 	libblis_test_fprintf_c( os, "--- BLIS parallelization info ---\n" );
 	libblis_test_fprintf_c( os, "\n" );
-	libblis_test_fprintf_c( os, "multithreading                 %s\n", impl_str );
+	libblis_test_fprintf_c( os, "multithreading modes           %s\n", impl_str );
+	libblis_test_fprintf_c( os, "  default mode                 %s\n", def_impl_unset_str );
+	libblis_test_fprintf_c( os, "  current mode                 %s\n", def_impl_set_str );
 	libblis_test_fprintf_c( os, "\n" );
 	libblis_test_fprintf_c( os, "thread auto-factorization        \n" );
 	libblis_test_fprintf_c( os, "  m dim thread ratio           %d\n", ( int )BLIS_THREAD_RATIO_M );

From 036a4f9d822df25a76a653e70be76fb02284d3d3 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 22 Sep 2022 18:36:50 -0500
Subject: [PATCH 087/230] Refactored some rntm_t management code. (#666)

Details:
- Separated the "sanitizing" code from the auto-factorization code
  in bli_rntm_set_ways_from_rntm() and _rntm_set_ways_from_rntm_sup().
  The sanitizing code now resides in bli_rntm_sanitize() while the
  factorization code resides in bli_rntm_factorize() and
  bli_rntm_factorize_sup(). (There are two different functions because
  the conventional and sup factorization codes are currently somewhat
  different.) Also note that the factorization code now relies on the
  .auto_factor field to have already been set, either during
  rntm_t initialization or when the rntm_t was previously updated and
  sanitized. So rather than locally determining whether to auto-
  factorize, those functions just read the .auto_factor field and
  proceed accordingly.
- Refactored and removed most code from bli_thread_init_rntm_from_env().
  This function now reads the environment variables needed to set nt,
  jc, pc, ic, jr, and ir; sets them into the global rntm_t; and then
  calls bli_rntm_sanitize() in order to make sure that the contents are
  in a "good" state. Thanks to Devin Matthews for suggesting this
  refactoring.
- Redefined bli_rntm_set_num_threads() and bli_rntm_set_ways() such that
  if multithreading is disabled at compile time (that is, if the cpp
  macro BLIS_ENABLE_MULTITHREADING is undefined), they ignore the
  caller's request and instead clear the nt and ways fields.
- Redefined bli_thread_set_num_threads() and bli_thread_set_ways() such
  that if multithreading is disabled at compile time (that is, if the
  cpp macro BLIS_ENABLE_MULTITHREADING is undefined), they ignore the
  caller's request and do nothing.
- Redefined bli_rntm_set_num_threads() and bli_rntm_set_ways() as true
  functions rather than static inline functions.
- In bli_rntm.c, statically initialize the global_rntm global variable
  via the BLIS_RNTM_INITIALIZER macro.
- In bli_rntm.h, defined bli_rntm_clear_auto_factor(), which sets the
  .auto_factor field of the rntm_t to FALSE.
- Reorganized order of some inline function definitions in bli_rntm.h.
- Changed the default value given to the .auto_factor field by the
  BLIS_RNTM_INITIALIZER macro from TRUE to FALSE.
- Call bli_rntm_clear_auto_factor() instead of
  bli_rntm_set_auto_factor_only() in bli_rntm_init().
- Comment/whitespace updates.
---
 frame/3/bli_l3_sup_ref.c  |   4 +-
 frame/base/bli_rntm.c     | 453 ++++++++++++++++++++------------------
 frame/base/bli_rntm.h     |  90 ++++----
 frame/thread/bli_thread.c | 178 ++++++---------
 frame/thread/bli_thread.h |   1 +
 5 files changed, 364 insertions(+), 362 deletions(-)

diff --git a/frame/3/bli_l3_sup_ref.c b/frame/3/bli_l3_sup_ref.c
index 8eb7a6d4b..76314aba7 100644
--- a/frame/3/bli_l3_sup_ref.c
+++ b/frame/3/bli_l3_sup_ref.c
@@ -89,7 +89,7 @@ err_t bli_gemmsup_ref
 
 	// Parse and interpret the contents of the rntm_t object to properly
 	// set the ways of parallelism for each loop.
-	bli_rntm_set_ways_from_rntm_sup
+	bli_rntm_factorize_sup
 	(
 	  bli_obj_length( c ),
 	  bli_obj_width( c ),
@@ -163,7 +163,7 @@ err_t bli_gemmtsup_ref
 
 	// Parse and interpret the contents of the rntm_t object to properly
 	// set the ways of parallelism for each loop.
-	bli_rntm_set_ways_from_rntm_sup
+	bli_rntm_factorize_sup
 	(
 	  bli_obj_length( c ),
 	  bli_obj_width( c ),
diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c
index 895976679..786998f23 100644
--- a/frame/base/bli_rntm.c
+++ b/frame/base/bli_rntm.c
@@ -36,12 +36,12 @@
 
 // The global rntm_t structure, which holds the global thread settings
 // along with a few other key parameters.
-rntm_t global_rntm;
+rntm_t global_rntm = BLIS_RNTM_INITIALIZER;
 
 // A mutex to allow synchronous access to global_rntm.
 bli_pthread_mutex_t global_rntm_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
 
-// ----------------------------------------------------------------------------
+// -----------------------------------------------------------------------------
 
 void bli_rntm_init_from_global( rntm_t* rntm )
 {
@@ -59,6 +59,76 @@ void bli_rntm_init_from_global( rntm_t* rntm )
 
 // -----------------------------------------------------------------------------
 
+void bli_rntm_set_num_threads
+     (
+       dim_t   nt,
+       rntm_t* rntm
+     )
+{
+#ifdef BLIS_ENABLE_MULTITHREADING
+
+	// Record the total number of threads to use.
+	bli_rntm_set_num_threads_only( nt, rntm );
+
+	// Set the individual ways of parallelism to default states. This
+	// must be done before sanitization so that the .num_threads field
+	// will prevail over any previous ways that may have been set.
+	bli_rntm_clear_ways_only( rntm );
+
+	// Ensure that the rntm_t is in a consistent state.
+	bli_rntm_sanitize( rntm );
+
+#else
+
+	// When multithreading is disabled at compile time, ignore the user's
+	// request. And just to be safe, reassert the default rntm_t values.
+	bli_rntm_clear_num_threads_only( rntm );
+	bli_rntm_clear_ways_only( rntm );
+
+#endif
+}
+
+void bli_rntm_set_ways
+     (
+       dim_t   jc,
+       dim_t   pc,
+       dim_t   ic,
+       dim_t   jr,
+       dim_t   ir,
+       rntm_t* rntm
+     )
+{
+#ifdef BLIS_ENABLE_MULTITHREADING
+
+	// Record the number of ways of parallelism per loop.
+	bli_rntm_set_jc_ways_only( jc, rntm );
+	bli_rntm_set_pc_ways_only(  1, rntm ); // Disable pc_nt values.
+	bli_rntm_set_ic_ways_only( ic, rntm );
+	bli_rntm_set_jr_ways_only( jr, rntm );
+	bli_rntm_set_ir_ways_only( ir, rntm );
+	bli_rntm_set_pr_ways_only(  1, rntm );
+
+	// Set the total number of threads to its default state. This isn't
+	// strictly necessary, but is done in case the priority of nt vs.
+	// ways ever changes. (Currently, the ways always prevail over the
+	// number of threads, if both are set.)
+	bli_rntm_clear_num_threads_only( rntm );
+
+	// Ensure that the rntm_t is in a consistent state.
+	bli_rntm_sanitize( rntm );
+
+#else
+
+	// When multithreading is disabled at compile time, ignore the user's
+	// request. And just to be safe, reassert the default rntm_t values.
+	bli_rntm_clear_num_threads_only( rntm );
+	bli_rntm_clear_ways_only( rntm );
+
+#endif
+}
+
+// -----------------------------------------------------------------------------
+
 void bli_rntm_set_ways_for_op
      (
        opid_t  l3_op,
@@ -71,7 +141,7 @@ void bli_rntm_set_ways_for_op
 {
 	// Set the number of ways for each loop, if needed, depending on what
 	// kind of information is already stored in the rntm_t object.
-	bli_rntm_set_ways_from_rntm( m, n, k, rntm );
+	bli_rntm_factorize( m, n, k, rntm );
 
 #if 0
 printf( "bli_rntm_set_ways_for_op()\n" );
@@ -153,146 +223,112 @@ bli_rntm_print( rntm );
 	}
 }
 
-void bli_rntm_set_ways_from_rntm
+void bli_rntm_sanitize
      (
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
        rntm_t* rntm
      )
 {
-	// NOTE: While much of the multithreading cpp case of this function may seem
-	// redundant with bli_thread_init_rntm_from_env(), we need them both. The
-	// bli_thread_init_rntm_from_env() function is only called to initialize the
-	// global rntm_t. There, the consistency logic serves to make sure that sane
-	// values will be returned if the application (in the time between library
-	// initialization and when computation begins) subsequently queries the
-	// number of threads or ways via the runtime API. This function also needs
-	// the same consistency logic, but for a different reason: this function
-	// guarantees that the rntm_t has sane values in the event that the
-	// application passed in a custom rntm_t via an expert interface.
-
-
-	bool  auto_factor = FALSE;
-	dim_t nt;
-	dim_t jc, pc, ic, jr, ir;
-
 #ifdef BLIS_ENABLE_MULTITHREADING
 
-	nt = bli_rntm_num_threads( rntm );
-	jc = bli_rntm_jc_ways( rntm );
-	pc = bli_rntm_pc_ways( rntm );
-	ic = bli_rntm_ic_ways( rntm );
-	jr = bli_rntm_jr_ways( rntm );
-	ir = bli_rntm_ir_ways( rntm );
+	timpl_t ti = bli_rntm_thread_impl( rntm );
+	dim_t   nt = bli_rntm_num_threads( rntm );
+	dim_t   jc = bli_rntm_jc_ways( rntm );
+	dim_t   pc = bli_rntm_pc_ways( rntm );
+	dim_t   ic = bli_rntm_ic_ways( rntm );
+	dim_t   jr = bli_rntm_jr_ways( rntm );
+	dim_t   ir = bli_rntm_ir_ways( rntm );
+
+	bool auto_factor = FALSE;
 
 	bool nt_set   = FALSE;
 	bool ways_set = FALSE;
 
-	// Some users are mischievous/dumb. Make sure they don't cause trouble.
-	if ( nt < 1 ) nt = 1;
-	if ( jc < 1 ) jc = 1;
-	if ( pc < 1 ) pc = 1;
-	if ( ic < 1 ) ic = 1;
-	if ( jr < 1 ) jr = 1;
-	if ( ir < 1 ) ir = 1;
-
-	// First, we establish whether or not the number of threads or ways of
-	// parallelism were set to meaningful values.
-	if ( nt > 1 ) { nt_set   = TRUE; }
-	if ( jc > 1 ) { ways_set = TRUE; }
-	if ( pc > 1 ) { ways_set = TRUE; pc = 1; } // Disable pc_nt values.
-	if ( ic > 1 ) { ways_set = TRUE; }
-	if ( jr > 1 ) { ways_set = TRUE; }
-	if ( ir > 1 ) { ways_set = TRUE; }
-
-	// Now we use the values of nt_set and ways_set to determine how to
-	// interpret the original values we found in the rntm_t object.
-
-	if ( ways_set == TRUE )
+	if ( ti == BLIS_SINGLE )
 	{
-		// If the per-loop ways of parallelism were set, then we use the values
-		// that were given and interpreted above. The only thing left to do is
-		// calculate the correct number of threads. Notice that if the user also
-		// happened to set the total number of threads that value is discarded
-		// in favor of the implied value from the per-loop ways of parallelism.
+		// If the threading implementation was set to BLIS_SINGLE, we ignore
+		// everything else.
 
-		nt = jc * pc * ic * jr * ir;
+		nt = 1;
+		jc = pc = ic = jr = ir = 1;
 		auto_factor = FALSE;
 	}
-	else if ( ways_set == FALSE && nt_set == TRUE )
+	else // if ( ti != BLIS_SINGLE )
 	{
-		// If the ways were not set but the number of thread was set, then we
-		// will attempt to automatically generate a thread factorization that
-		// will work given the problem size.
-
-		#ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
-		// If use of prime numbers is disallowed for automatic thread
-		// factorizations, we first check if the number of threads requested
-		// is prime. If it is prime, and it exceeds a minimum threshold, then
-		// we reduce the number of threads by one so that the number is not
-		// prime. This will allow for automatic thread factorizations to span
-		// two dimensions (loops), which tends to be more efficient.
-		if ( bli_is_prime( nt ) && BLIS_NT_MAX_PRIME < nt ) nt -= 1;
-		#endif
-
-		//printf( "m n = %d %d  BLIS_THREAD_RATIO_M _N = %d %d\n",
-		//         (int)m, (int)n, (int)BLIS_THREAD_RATIO_M,
-		//                         (int)BLIS_THREAD_RATIO_N );
-
-		bli_thread_partition_2x2( nt, m*BLIS_THREAD_RATIO_M,
-		                              n*BLIS_THREAD_RATIO_N, &ic, &jc );
-
-		//printf( "jc ic = %d %d\n", (int)jc, (int)ic );
-
-		for ( ir = BLIS_THREAD_MAX_IR ; ir > 1 ; ir-- )
+		// If the threading implementation was set to one of the true
+		// multithreading implementations (e.g. BLIS_OPENMP, BLIS_POSIX),
+		// we proceed to interpret and process the rntm_t's fields.
+
+		// Some users are mischievous/dumb. Make sure they don't cause trouble.
+		if ( nt < 1 ) nt = 1;
+		if ( jc < 1 ) jc = 1;
+		if ( pc < 1 ) pc = 1;
+		if ( ic < 1 ) ic = 1;
+		if ( jr < 1 ) jr = 1;
+		if ( ir < 1 ) ir = 1;
+
+		// Now establish whether or not the number of threads or ways of
+		// parallelism were set to meaningful values.
+		if ( nt > 1 ) { nt_set   = TRUE; }
+		if ( jc > 1 ) { ways_set = TRUE; }
+		if ( pc > 1 ) { ways_set = TRUE; pc = 1; } // Disable pc_nt values.
+		if ( ic > 1 ) { ways_set = TRUE; }
+		if ( jr > 1 ) { ways_set = TRUE; }
+		if ( ir > 1 ) { ways_set = TRUE; }
+
+		// Next, we use the values of nt_set and ways_set to determine how to
+		// interpret the original values we found in the rntm_t object.
+
+		if ( ways_set == TRUE )
 		{
-			if ( ic % ir == 0 ) { ic /= ir; break; }
+			// If the per-loop ways of parallelism were set, then we use the values
+			// that were given and interpreted above. Since the per-loop ways are
+			// known, we can calculate the total number of threads. Notice that if
+			// the user also happened to set the total number of threads, that value
+			// is discarded in favor of the implied value from the per-loop ways of
+			// parallelism.
+
+			nt = jc * pc * ic * jr * ir;
+			auto_factor = FALSE;
 		}
+		else if ( ways_set == FALSE && nt_set == TRUE )
+		{
+			// If the ways were not set but the number of thread was set, then we
+			// will attempt to automatically generate a thread factorization that
+			// will work given the problem size. This happens later, in
+			// bli_rntm_factorize().
 
-		for ( jr = BLIS_THREAD_MAX_JR ; jr > 1 ; jr-- )
+			auto_factor = TRUE;
+		}
+		else // if ( ways_set == FALSE && nt_set == FALSE )
 		{
-			if ( jc % jr == 0 ) { jc /= jr; break; }
+			// If neither the ways nor the number of threads were set, then the
+			// rntm_t was not meaningfully changed since initialization. This means
+			// the ways are already 1, which will lead to the default behavior of
+			// single-threaded execution.
 		}
-
-		// Force the number of ways of parallelism in the pc loop to 1
-		// just in case the caller set it to something greater than 1.
-		pc = 1;
-
-		// Make note that auto-factorization was performed.
-		auto_factor = TRUE;
-	}
-	else // if ( ways_set == FALSE && nt_set == FALSE )
-	{
-		// If neither the ways nor the number of threads were set, then the
-		// rntm_t was not meaningfully changed since initialization. This means
-		// the fields are all 1, which will lead to the default behavior of
-		// single-threaded execution.
-		//nt = jc = pc = ic = jr = ir = 1;
-		//auto_factor = FALSE;
 	}
 
+	// Save the results back in the rntm_t object.
+	// Note: We don't need to set the .thread_impl field of the rntm_t because
+	// it was not changed in the sanitization process.
+	//bli_rntm_set_thread_impl_only( ti, rntm );
+	bli_rntm_set_num_threads_only( nt, rntm );
+	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
+	bli_rntm_set_auto_factor_only( auto_factor, rntm );
+
 #else
 
 	// When multithreading is disabled, always set the per-loop ways of
 	// parallelism to 1.
-	nt = 1;
-	jc = pc = ic = jr = ir = 1;
+	bli_rntm_set_thread_impl_only( BLIS_SINGLE, rntm );
+	bli_rntm_set_num_threads_only( 1, rntm );
+	bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );
+	bli_rntm_set_auto_factor_only( FALSE, rntm );
 
 #endif
-
-	// Save the results back in the rntm_t object.
-	bli_rntm_set_auto_factor_only( auto_factor, rntm );
-	bli_rntm_set_num_threads_only( nt, rntm );
-	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
-
-	// NOTE: The caller should have already set the timpl_t field of the rntm_t,
-	// either in the course of it being initialized via BLIS_RNTM_INITIALIZER
-	// or bli_rntm_init(), or by the user (subsequently) setting the value
-	// directly via bli_rntm_set_thread_impl().
 }
 
-void bli_rntm_set_ways_from_rntm_sup
+void bli_rntm_factorize
      (
        dim_t   m,
        dim_t   n,
@@ -300,122 +336,115 @@ void bli_rntm_set_ways_from_rntm_sup
        rntm_t* rntm
      )
 {
-	bool  auto_factor = FALSE;
-	dim_t nt;
-	dim_t jc, pc, ic, jr, ir;
-
 #ifdef BLIS_ENABLE_MULTITHREADING
 
-	nt = bli_rntm_num_threads( rntm );
-	jc = bli_rntm_jc_ways( rntm );
-	pc = bli_rntm_pc_ways( rntm );
-	ic = bli_rntm_ic_ways( rntm );
-	jr = bli_rntm_jr_ways( rntm );
-	ir = bli_rntm_ir_ways( rntm );
+	// The .auto_factor field would have been set either at initialization or
+	// when the rntm_t was sanitized after being updated by the user.
+	if ( bli_rntm_auto_factor( rntm ) )
+	{
+		dim_t nt = bli_rntm_num_threads( rntm );
+		dim_t jc = bli_rntm_jc_ways( rntm );
+		dim_t pc = bli_rntm_pc_ways( rntm );
+		dim_t ic = bli_rntm_ic_ways( rntm );
+		dim_t jr = bli_rntm_jr_ways( rntm );
+		dim_t ir = bli_rntm_ir_ways( rntm );
 
-	bool nt_set   = FALSE;
-	bool ways_set = FALSE;
+		if ( 0 < m && 0 < n && 0 <= k )
+		{
+			#ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
+			// If use of prime numbers is disallowed for automatic thread
+			// factorizations, we first check if the number of threads requested
+			// is prime. If it is prime, and it exceeds a minimum threshold, then
+			// we reduce the number of threads by one so that the number is not
+			// prime. This will allow for automatic thread factorizations to span
+			// two dimensions (loops), which tends to be more efficient.
+			if ( bli_is_prime( nt ) && BLIS_NT_MAX_PRIME < nt ) nt -= 1;
+			#endif
+
+			//printf( "m n = %d %d  BLIS_THREAD_RATIO_M _N = %d %d\n",
+			//         (int)m, (int)n, (int)BLIS_THREAD_RATIO_M,
+			//                         (int)BLIS_THREAD_RATIO_N );
+
+			bli_thread_partition_2x2( nt, m*BLIS_THREAD_RATIO_M,
+			                              n*BLIS_THREAD_RATIO_N, &ic, &jc );
+
+			//printf( "jc ic = %d %d\n", (int)jc, (int)ic );
+
+			for ( ir = BLIS_THREAD_MAX_IR ; ir > 1 ; ir-- )
+			{
+				if ( ic % ir == 0 ) { ic /= ir; break; }
+			}
 
-	// Some users are mischievous/dumb. Make sure they don't cause trouble.
-	if ( nt < 1 ) nt = 1;
-	if ( jc < 1 ) jc = 1;
-	if ( pc < 1 ) pc = 1;
-	if ( ic < 1 ) ic = 1;
-	if ( jr < 1 ) jr = 1;
-	if ( ir < 1 ) ir = 1;
-
-	// First, we establish whether or not the number of threads or ways of
-	// parallelism were set to meaningful values.
-	if ( nt > 1 ) { nt_set   = TRUE; }
-	if ( jc > 1 ) { ways_set = TRUE; }
-	if ( pc > 1 ) { ways_set = TRUE; pc = 1; } // Disable pc_nt values.
-	if ( ic > 1 ) { ways_set = TRUE; }
-	if ( jr > 1 ) { ways_set = TRUE; }
-	if ( ir > 1 ) { ways_set = TRUE; }
-
-	// Now we use the values of nt_set and ways_set to determine how to
-	// interpret the original values we found in the rntm_t object.
-
-	if ( ways_set == TRUE )
-	{
-		// If the per-loop ways of parallelism were set, then we use the values
-		// that were given and interpreted above. The only thing left to do is
-		// calculate the correct number of threads. Notice that if the user also
-		// happened to set the total number of threads that value is discarded
-		// in favor of the implied value from the per-loop ways of parallelism.
+			for ( jr = BLIS_THREAD_MAX_JR ; jr > 1 ; jr-- )
+			{
+				if ( jc % jr == 0 ) { jc /= jr; break; }
+			}
+		}
 
-		nt = jc * pc * ic * jr * ir;
-		auto_factor = FALSE;
+		// Save the results back in the rntm_t object.
+		bli_rntm_set_num_threads_only( nt, rntm );
+		bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
 	}
-	else if ( ways_set == FALSE && nt_set == TRUE )
+
+#else
+
+	// When multithreading is disabled at compile time, the rntm can keep its
+	// default initialization values since using one thread requires no
+	// factorization.
+
+#endif
+}
+
+void bli_rntm_factorize_sup
+     (
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       rntm_t* rntm
+     )
+{
+#ifdef BLIS_ENABLE_MULTITHREADING
+
+	// The .auto_factor field would have been set either at initialization or
+	// when the rntm_t was sanitized after being updated by the user.
+	if ( bli_rntm_auto_factor( rntm ) )
 	{
-		// If the ways were not set but the number of thread was set, then we
-		// will attempt to automatically generate a thread factorization that
-		// work given the problem size.
-
-		#ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
-		// If use of prime numbers is disallowed for automatic thread
-		// factorizations, we first check if the number of threads requested
-		// is prime. If it is prime, and it exceeds a minimum threshold, then
-		// we reduce the number of threads by one so that the number is not
-		// prime. This will allow for automatic thread factorizations to span
-		// two dimensions (loops), which tends to be more efficient.
-		if ( bli_is_prime( nt ) && BLIS_NT_MAX_PRIME < nt ) nt -= 1;
-		#endif
-
-		//bli_thread_partition_2x2( nt, m*BLIS_THREAD_SUP_RATIO_M,
-		//                              n*BLIS_THREAD_SUP_RATIO_N, &ic, &jc );
-		bli_thread_partition_2x2( nt, m,
-		                              n, &ic, &jc );
-
-		//printf( "bli_rntm_set_ways_from_rntm_sup(): jc = %d  ic = %d\n",
-		//        (int)jc, (int)ic );
-
-		#if 0
-		for ( ir = BLIS_THREAD_SUP_MAX_IR ; ir > 1 ; ir-- )
-		{
-			if ( ic % ir == 0 ) { ic /= ir; break; }
-		}
+		dim_t nt = bli_rntm_num_threads( rntm );
+		dim_t jc = bli_rntm_jc_ways( rntm );
+		dim_t pc = bli_rntm_pc_ways( rntm );
+		dim_t ic = bli_rntm_ic_ways( rntm );
+		dim_t jr = bli_rntm_jr_ways( rntm );
+		dim_t ir = bli_rntm_ir_ways( rntm );
 
-		for ( jr = BLIS_THREAD_SUP_MAX_JR ; jr > 1 ; jr-- )
+		if ( 0 < m && 0 < n && 0 <= k )
 		{
-			if ( jc % jr == 0 ) { jc /= jr; break; }
+			#ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
+			// If use of prime numbers is disallowed for automatic thread
+			// factorizations, we first check if the number of threads requested
+			// is prime. If it is prime, and it exceeds a minimum threshold, then
+			// we reduce the number of threads by one so that the number is not
+			// prime. This will allow for automatic thread factorizations to span
+			// two dimensions (loops), which tends to be more efficient.
+			if ( bli_is_prime( nt ) && BLIS_NT_MAX_PRIME < nt ) nt -= 1;
+			#endif
+
+			bli_thread_partition_2x2( nt, m,
+										  n, &ic, &jc );
+			ir = 1; jr = 1;
 		}
-		#else
-		ir = 1;
-		jr = 1;
-		#endif
 
-		// Force the number of ways of parallelism in the pc loop to 1 just in
-		// case the caller set it to something greater than 1.
-		pc = 1;
-
-		// Make note that auto-factorization was performed.
-		auto_factor = TRUE;
-	}
-	else // if ( ways_set == FALSE && nt_set == FALSE )
-	{
-		// If neither the ways nor the number of threads were set, then the
-		// rntm_t was not meaningfully changed since initialization. This means
-		// the fields are all 1, which will lead to the default behavior of
-		// single-threaded execution.
-		//nt = jc = pc = ic = jr = ir = 1;
-		//auto_factor = FALSE;
+		// Save the results back in the rntm_t object.
+		bli_rntm_set_num_threads_only( nt, rntm );
+		bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
 	}
 
 #else
 
-	// When multithreading is disabled, always set the per-loop ways of
-	// parallelism to 1.
-	nt = 1;
-	jc = pc = ic = jr = ir = 1;
+	// When multithreading is disabled at compile time, the rntm can keep its
+	// default initialization values since using one thread requires no
+	// factorization.
 
 #endif
-
-	// Save the results back in the rntm_t object.
-	bli_rntm_set_auto_factor_only( auto_factor, rntm );
-	bli_rntm_set_num_threads_only( nt, rntm );
-	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
 }
 
 void bli_rntm_print
diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h
index 426b74d60..f6756c589 100644
--- a/frame/base/bli_rntm.h
+++ b/frame/base/bli_rntm.h
@@ -45,10 +45,10 @@ typedef struct rntm_s
 {
 	timpl_t   thread_impl;
 
-	bool      auto_factor;
-
 	dim_t     num_threads;
 	dim_t     thrloop[ BLIS_NUM_LOOPS ];
+
+	bool      auto_factor;
 	bool      pack_a;
 	bool      pack_b;
 	bool      l3_sup;
@@ -214,14 +214,6 @@ BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm )
 {
 	bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );
 }
-BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm )
-{
-	bli_rntm_set_sba_pool( NULL, rntm );
-}
-BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm )
-{
-	bli_rntm_set_pba( NULL, rntm );
-}
 
 //
 // -- rntm_t modification (public API) -----------------------------------------
@@ -233,31 +225,6 @@ BLIS_INLINE void bli_rntm_set_thread_impl( timpl_t thread_impl, rntm_t* rntm )
 	bli_rntm_set_thread_impl_only( thread_impl, rntm );
 }
 
-BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm )
-{
-	// Record the total number of threads to use.
-	bli_rntm_set_num_threads_only( nt, rntm );
-
-	// Set the individual ways of parallelism to default states.
-	bli_rntm_clear_ways_only( rntm );
-}
-
-BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm )
-{
-	// Record the number of ways of parallelism per loop.
-	bli_rntm_set_jc_ways_only( jc, rntm );
-	bli_rntm_set_pc_ways_only(  1, rntm );
-	bli_rntm_set_ic_ways_only( ic, rntm );
-	bli_rntm_set_jr_ways_only( jr, rntm );
-	bli_rntm_set_ir_ways_only( ir, rntm );
-	bli_rntm_set_pr_ways_only(  1, rntm );
-
-	// Set the num_threads field to the product of all the ways. The only
-	// benefit of doing this, though, is that the user can query the total
-	// number of threads from the rntm_t after calling this function.
-	bli_rntm_set_num_threads_only( jc * 1 * ic * jr * ir, rntm );
-}
-
 BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm )
 {
 	// Set the bool indicating whether matrix A should be packed.
@@ -287,6 +254,15 @@ BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm )
 // -- rntm_t modification (internal use only) ----------------------------------
 //
 
+BLIS_INLINE void bli_rntm_clear_thread_impl( rntm_t* rntm )
+{
+	bli_rntm_set_thread_impl_only( BLIS_SINGLE, rntm );
+}
+
+BLIS_INLINE void bli_rntm_clear_auto_factor( rntm_t* rntm )
+{
+	bli_rntm_set_auto_factor_only( FALSE, rntm );
+}
 BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm )
 {
 	bli_rntm_set_pack_a( FALSE, rntm );
@@ -300,6 +276,15 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
 	bli_rntm_set_l3_sup( TRUE, rntm );
 }
 
+BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm )
+{
+	bli_rntm_set_sba_pool( NULL, rntm );
+}
+BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm )
+{
+	bli_rntm_set_pba( NULL, rntm );
+}
+
 //
 // -- rntm_t initialization ----------------------------------------------------
 //
@@ -310,10 +295,10 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
 
 #define BLIS_RNTM_INITIALIZER \
         { \
-          .thread_impl = SINGLE, \
-          .auto_factor = TRUE, \
+          .thread_impl = BLIS_SINGLE, \
           .num_threads = 1, \
           .thrloop     = { 1, 1, 1, 1, 1, 1 }, \
+          .auto_factor = FALSE, \
           .pack_a      = FALSE, \
           .pack_b      = FALSE, \
           .l3_sup      = TRUE, \
@@ -323,12 +308,12 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
 
 BLIS_INLINE void bli_rntm_init( rntm_t* rntm )
 {
-	bli_rntm_set_thread_impl_only( BLIS_SINGLE, rntm );
-
-	bli_rntm_set_auto_factor_only( TRUE, rntm );
+	bli_rntm_clear_thread_impl( rntm );
 
 	bli_rntm_clear_num_threads_only( rntm );
 	bli_rntm_clear_ways_only( rntm );
+
+	bli_rntm_clear_auto_factor( rntm );
 	bli_rntm_clear_pack_a( rntm );
 	bli_rntm_clear_pack_b( rntm );
 	bli_rntm_clear_l3_sup( rntm );
@@ -363,6 +348,22 @@ BLIS_INLINE dim_t bli_rntm_calc_num_threads
 
 BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm );
 
+BLIS_EXPORT_BLIS void bli_rntm_set_num_threads
+     (
+       dim_t   nt,
+       rntm_t* rntm
+     );
+
+BLIS_EXPORT_BLIS void bli_rntm_set_ways
+     (
+       dim_t   jc,
+       dim_t   pc,
+       dim_t   ic,
+       dim_t   jr,
+       dim_t   ir,
+       rntm_t* rntm
+     );
+
 BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op
      (
        opid_t  l3_op,
@@ -373,7 +374,12 @@ BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op
        rntm_t* rntm
      );
 
-void bli_rntm_set_ways_from_rntm
+void bli_rntm_sanitize
+     (
+       rntm_t* rntm
+     );
+
+void bli_rntm_factorize
      (
        dim_t   m,
        dim_t   n,
@@ -381,7 +387,7 @@ void bli_rntm_set_ways_from_rntm
        rntm_t* rntm
      );
 
-void bli_rntm_set_ways_from_rntm_sup
+void bli_rntm_factorize_sup
      (
        dim_t   m,
        dim_t   n,
diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c
index 9369b373b..eefc20fdd 100644
--- a/frame/thread/bli_thread.c
+++ b/frame/thread/bli_thread.c
@@ -1562,6 +1562,18 @@ timpl_t bli_thread_get_thread_impl( void )
 	return bli_rntm_thread_impl( &global_rntm );
 }
 
+static const char* bli_timpl_string[BLIS_NUM_THREAD_IMPLS] =
+{
+	[BLIS_SINGLE] = "single",
+	[BLIS_OPENMP] = "openmp",
+	[BLIS_POSIX]  = "pthreads",
+};
+
+const char* bli_thread_get_thread_impl_str( timpl_t ti )
+{
+	return bli_timpl_string[ti];
+}
+
 // ----------------------------------------------------------------------------
 
 void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir )
@@ -1569,13 +1581,25 @@ void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir )
 	// We must ensure that global_rntm has been initialized.
 	bli_init_once();
 
+#ifdef BLIS_ENABLE_MULTITHREADING
+
 	// Acquire the mutex protecting global_rntm.
 	bli_pthread_mutex_lock( &global_rntm_mutex );
 
 	bli_rntm_set_ways_only( jc, 1, ic, jr, ir, &global_rntm );
 
+	// Ensure that the rntm_t is in a consistent state.
+	bli_rntm_sanitize( &global_rntm );
+
 	// Release the mutex protecting global_rntm.
 	bli_pthread_mutex_unlock( &global_rntm_mutex );
+
+#else
+
+	// When multithreading is disabled at compile time, ignore the user's
+	// request.
+
+#endif
 }
 
 void bli_thread_set_num_threads( dim_t n_threads )
@@ -1583,13 +1607,25 @@ void bli_thread_set_num_threads( dim_t n_threads )
 	// We must ensure that global_rntm has been initialized.
 	bli_init_once();
 
+#ifdef BLIS_ENABLE_MULTITHREADING
+
 	// Acquire the mutex protecting global_rntm.
 	bli_pthread_mutex_lock( &global_rntm_mutex );
 
 	bli_rntm_set_num_threads_only( n_threads, &global_rntm );
 
+	// Ensure that the rntm_t is in a consistent state.
+	bli_rntm_sanitize( &global_rntm );
+
 	// Release the mutex protecting global_rntm.
 	bli_pthread_mutex_unlock( &global_rntm_mutex );
+
+#else
+
+	// When multithreading is disabled at compile time, ignore the user's
+	// request.
+
+#endif
 }
 
 void bli_thread_set_thread_impl( timpl_t ti )
@@ -1619,24 +1655,10 @@ void bli_thread_init_rntm_from_env
 	// function is only called from bli_thread_init(), which is only called
 	// by bli_init_once().
 
-	// NOTE: While much of the multithreading cpp case of this function may seem
-	// redundant with bli_rntm_set_ways_from_rntm(), we need them both. This
-	// function is only called to initialize the global rntm_t. Here, the
-	// consistency logic serves to make sure that sane values will be returned
-	// if the application (in the time between library initialization and when
-	// computation begins) subsequently queries the number of threads or ways
-	// via the runtime API. The bli_rntm_set_ways_from_rntm() function also
-	// needs the same consistency logic, but for a different reason: that
-	// function guarantees that the rntm_t has sane values in the event that the
-	// application passed in a custom rntm_t via an expert interface.
-
-	bool    auto_factor = FALSE;
-	dim_t   nt;
-	dim_t   jc, pc, ic, jr, ir;
-	timpl_t ti;
-
 #ifdef BLIS_ENABLE_MULTITHREADING
 
+	timpl_t ti = BLIS_SINGLE;
+
 	// Try to read BLIS_THREAD_IMPL.
 	char* ti_env = bli_env_get_str( "BLIS_THREAD_IMPL" );
 
@@ -1657,18 +1679,16 @@ void bli_thread_init_rntm_from_env
 		else                                          ti = BLIS_SINGLE;
 
 		#ifdef PRINT_IMPL
-		if      ( ti == BLIS_OPENMP )
-			printf( "detected BLIS_THREAD_IMPL=openmp.\n" );
-		else if ( ti == BLIS_POSIX )
-			printf( "detected BLIS_THREAD_IMPL=pthreads.\n" );
-		else
-			printf( "detected BLIS_THREAD_IMPL=single.\n" );
+		printf( "detected BLIS_THREAD_IMPL=%s.\n",
+		        bli_thread_get_thread_impl_str( ti ) );
 		#endif
 	}
 	else
 	{
 		// If BLIS_THREAD_IMPL was unset, default to the implementation that
 		// was determined at configure-time.
+		ti = BLIS_SINGLE;
+
 		#ifdef BLIS_ENABLE_OPENMP_AS_DEFAULT
 		ti = BLIS_OPENMP;
 		#endif
@@ -1677,16 +1697,15 @@ void bli_thread_init_rntm_from_env
 		#endif
 
 		#ifdef PRINT_IMPL
-		printf( "BLIS_THREAD_IMPL unset.\n" );
-		if      ( ti == BLIS_OPENMP )
-			printf( "defaulting to BLIS_THREAD_IMPL=openmp.\n" );
-		else if ( ti == BLIS_POSIX )
-			printf( "defaulting to BLIS_THREAD_IMPL=pthreads.\n" );
+		printf( "BLIS_THREAD_IMPL unset; defaulting to BLIS_THREAD_IMPL=%s.\n",
+		        bli_thread_get_thread_impl_str( ti ) );
 		#endif
 	}
 
+	// ------------------------------------------------------------------------
+
 	// Try to read BLIS_NUM_THREADS first.
-	nt = bli_env_get_var( "BLIS_NUM_THREADS", -1 );
+	dim_t nt = bli_env_get_var( "BLIS_NUM_THREADS", -1 );
 
 	// If BLIS_NUM_THREADS was not set, try to read BLIS_NT.
 	if ( nt == -1 ) nt = bli_env_get_var( "BLIS_NT", -1 );
@@ -1694,92 +1713,39 @@ void bli_thread_init_rntm_from_env
 	// If neither BLIS_NUM_THREADS nor BLIS_NT were set, try OMP_NUM_THREADS.
 	if ( nt == -1 ) nt = bli_env_get_var( "OMP_NUM_THREADS", -1 );
 
+	// ------------------------------------------------------------------------
+
 	// Read the environment variables for the number of threads (ways of
 	// parallelism) for each individual loop.
-	jc = bli_env_get_var( "BLIS_JC_NT", -1 );
-	pc = bli_env_get_var( "BLIS_PC_NT", -1 );
-	ic = bli_env_get_var( "BLIS_IC_NT", -1 );
-	jr = bli_env_get_var( "BLIS_JR_NT", -1 );
-	ir = bli_env_get_var( "BLIS_IR_NT", -1 );
-
-	bool nt_set   = FALSE;
-	bool ways_set = FALSE;
-
-	// Some users are mischievous/dumb. Make sure they don't cause trouble.
-	if ( nt < 1 ) nt = 1;
-	if ( jc < 1 ) jc = 1;
-	if ( pc < 1 ) pc = 1;
-	if ( ic < 1 ) ic = 1;
-	if ( jr < 1 ) jr = 1;
-	if ( ir < 1 ) ir = 1;
-
-	// First, we establish whether or not the number of threads or ways of
-	// parallelism were set to meaningful values.
-	if ( nt > 1 ) { nt_set   = TRUE; }
-	if ( jc > 1 ) { ways_set = TRUE; }
-	if ( pc > 1 ) { ways_set = TRUE; pc = 1; } // Disable pc_nt values.
-	if ( ic > 1 ) { ways_set = TRUE; }
-	if ( jr > 1 ) { ways_set = TRUE; }
-	if ( ir > 1 ) { ways_set = TRUE; }
-
-	// Now we use the values of nt_set and ways_set to determine how to
-	// interpret the original values we found in the rntm_t object.
-
-	if ( ( ways_set == TRUE && nt_set == TRUE  ) ||
-	     ( ways_set == TRUE && nt_set == FALSE ) )
-	{
-		// If the per-loop ways of parallelism were set, then we use the values
-		// that were given and interpreted above. The only thing left to do is
-		// calculate the correct number of threads. Notice that whatever value
-		// may have been asigned to BLIS_NUM_THREADS will be ignored, and the
-		// total number of threads will be taken to be the number implied from
-		// the per-loop ways of parallelism.
-
-		nt = jc * pc * ic * jr * ir;
-		auto_factor = FALSE;
-	}
-	else if ( ways_set == FALSE && nt_set == TRUE )
-	{
-		// If the ways were not set but the number of thread was set, then we
-		// will attempt to automatically generate a thread factorization that
-		// will work given the problem size. This auto-factorization will
-		// occur later, in bli_rntm_set_ways_from_rntm(), once we know the
-		// problem size.
-
-		// Make note that auto-factorization will be performed.
-		auto_factor = TRUE;
-	}
-	else // if ( ways_set == FALSE && nt_set == FALSE )
-	{
-		// If neither the ways nor the number of threads were set, then we
-		// allow the default values to stand.
-		//nt = jc = pc = ic = jr = ir = 1;
-		//auto_factor = FALSE;
-	}
-
-#else
+	dim_t jc = bli_env_get_var( "BLIS_JC_NT", -1 );
+	dim_t pc = bli_env_get_var( "BLIS_PC_NT", -1 );
+	dim_t ic = bli_env_get_var( "BLIS_IC_NT", -1 );
+	dim_t jr = bli_env_get_var( "BLIS_JR_NT", -1 );
+	dim_t ir = bli_env_get_var( "BLIS_IR_NT", -1 );
 
-	// Note that we don't even bother checking BLIS_THREAD_IMPL if neither
-	// OpenMP nor pthreads was enabled at compile time.
-	ti = BLIS_SINGLE;
-
-	// When multithreading is disabled, always set the per-loop ways of
-	// parallelism to 1.
-	jc = pc = ic = jr = ir = 1;
-	nt = 1;
-	auto_factor = FALSE;
-
-#endif
+	// ------------------------------------------------------------------------
 
 	// Save the results back in the runtime object.
 	bli_rntm_set_thread_impl_only( ti, rntm );
 	bli_rntm_set_num_threads_only( nt, rntm );
 	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
-	bli_rntm_set_auto_factor_only( auto_factor, rntm );
 
-	#if 0
-	printf( "bli_thread_init_rntm_from_env()\n" );
-	bli_rntm_print( rntm );
-	#endif
+	// ------------------------------------------------------------------------
+
+	// This function, bli_thread_init_rntm_from_env(), is only called when BLIS
+	// is initialized, and so we need to go one step further and process the
+	// rntm's contents into a standard form to ensure, for example, that none of
+	// the ways of parallelism are negative or zero (in case the user queries
+	// them later).
+	bli_rntm_sanitize( rntm );
+
+#else
+
+	// When multithreading is disabled, the global rntm can keep the values it
+	// was assigned at (static) initialization time.
+
+#endif
+
+	//printf( "bli_thread_init_rntm_from_env()\n" ); bli_rntm_print( rntm );
 }
 
diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h
index 509072e57..88bdccda5 100644
--- a/frame/thread/bli_thread.h
+++ b/frame/thread/bli_thread.h
@@ -202,6 +202,7 @@ BLIS_EXPORT_BLIS dim_t   bli_thread_get_jr_nt( void );
 BLIS_EXPORT_BLIS dim_t   bli_thread_get_ir_nt( void );
 BLIS_EXPORT_BLIS dim_t   bli_thread_get_num_threads( void );
 BLIS_EXPORT_BLIS timpl_t bli_thread_get_thread_impl( void );
+BLIS_EXPORT_BLIS const char* bli_thread_get_thread_impl_str( timpl_t ti );
 
 BLIS_EXPORT_BLIS void    bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir );
 BLIS_EXPORT_BLIS void    bli_thread_set_num_threads( dim_t value );

From ee81efc7887374c974a78bfb3e0865776b2f97a8 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 22 Sep 2022 19:15:07 -0500
Subject: [PATCH 088/230] Parameterized test/3 drivers via command line args.
 (#667)

Details:
- Rewrote the drivers in test/3, the Makefile, and the runme.sh script
  so that most of the important parameters, including parameter combo,
  datatype, storage combo, induced method, problem size range, dimension
  bindings, number of repeats, and alpha/beta values can be passed in
  via command line arguments. (Previously, most of these parameters were
  hard-coded into the driver source, except a few that were hard-coded
  into the Makefile.) If no argument is given for any particular option,
  it will be assigned a sane default. Either way, the values employed at
  runtime will be printed to stdout before the performance data in a
  section that is commented out with '%' characters (which is used by
  matlab and octave for comments), unless the -q option is given, in
  which case the driver will proceed quietly and output only performance
  data. Each driver also provides extensive help via the -h option, with
  the help text tailored for the operation in question (e.g. gemm, hemm,
  herk, etc.). In this help text, the driver reminds the user which
  implementation it was linked to (e.g. blis, openblas, vendor, eigen).
  Thanks to Jeff Diamond for suggesting this CLI-based reimagining of
  the test/3 drivers.
- In the test/3 drivers: converted cpp macro string constants, as well
  as two string literals (for the opname and pc_str) used in each test
  driver, to global (or static) const char* strings, and replaced the
  use of strncpy() for storing the results of the command line argument
  parsing with pointer copies from the corresponding strings in argv.
  This works because the argv array is guaranteed by the C99 standard
  to persist throughout the life of the program. This new approach uses
  less storage and executes faster. Thanks to Minh Quan Ho for
  recommending this change.
- Renamed the IMP_STR cpp macro that gets defined on the command line,
  via the test/3/Makefile, to IMPL_STR.
- Updated runme.sh to set the problem size ranges for single-threaded
  and multithreaded execution independently from one another, as well as
  on a per-system basis.
- Added a 'quiet' variable to runme.sh that can easily toggle quiet mode
  for the test drivers' output.
- Very minor typecast fix in call to bli_getopt() in bli_utils.c.
- In bli_getopt(), changed the nextchar variable from being a local
  static variable to a field of the getopt_t state struct. (Not sure why
  it was ever declared static to begin with.)
- Other minor changes to bli_getopt() to accommodate the rewritten test
  drivers' command line parsing needs.
---
 frame/base/bli_getopt.c |  54 ++--
 frame/base/bli_getopt.h |   1 +
 test/3/Makefile         | 302 ++++++------------
 test/3/old/runme.sh     | 277 ++++++++++++++++
 test/3/runme.sh         | 199 +++++++-----
 test/3/test_gemm.c      | 335 +++++++++++---------
 test/3/test_hemm.c      | 194 +++++++-----
 test/3/test_herk.c      | 192 ++++++-----
 test/3/test_trmm.c      | 202 ++++++------
 test/3/test_trsm.c      | 203 ++++++------
 test/3/test_utils.c     | 684 ++++++++++++++++++++++++++++++++++++++++
 test/3/test_utils.h     | 142 +++++++++
 12 files changed, 1993 insertions(+), 792 deletions(-)
 create mode 100755 test/3/old/runme.sh
 create mode 100644 test/3/test_utils.c
 create mode 100644 test/3/test_utils.h

diff --git a/frame/base/bli_getopt.c b/frame/base/bli_getopt.c
index e1d90d323..bf74eb1d7 100644
--- a/frame/base/bli_getopt.c
+++ b/frame/base/bli_getopt.c
@@ -37,18 +37,19 @@
 
 static const char OPT_MARKER = '-';
 
+//bool bli_char_is_in_str( char ch, const char* str );
+
 void bli_getopt_init_state( int opterr, getopt_t* state )
 {
-	state->optarg = NULL;
-	state->optind = 1;
-	state->opterr = opterr;
-	state->optopt = 0;
+	state->nextchar = NULL;
+	state->optarg   = NULL;
+	state->optind   = 1;
+	state->opterr   = opterr;
+	state->optopt   = 0;
 }
 
 int bli_getopt( int argc, const char* const * argv, const char* optstring, getopt_t* state )
 {
-	static const char* nextchar = NULL;
-
 	const char* elem_str;
 	const char* optstr_char;
 
@@ -60,7 +61,7 @@ int bli_getopt( int argc, const char* const * argv, const char* optstring, getop
 	// an element of argv with more than one option character, in which
 	// case we need to pick up where we left off (which is the address
 	// contained in nextchar).
-	if ( nextchar == NULL )
+	if ( state->nextchar == NULL )
 	{
 		elem_str = argv[ state->optind ];
 
@@ -87,10 +88,10 @@ int bli_getopt( int argc, const char* const * argv, const char* optstring, getop
 		// character.
 
 		// Use the nextchar pointer as our element string.
-		elem_str = nextchar;
+		elem_str = state->nextchar;
 
 		// Reset nextchar to NULL.
-		nextchar = NULL;
+		state->nextchar = NULL;
 	}
 
 	// Find the first occurrence of elem_str[0] in optstring.
@@ -130,17 +131,24 @@ int bli_getopt( int argc, const char* const * argv, const char* optstring, getop
 				state->optind += 1;
 				return '?';
 			}
-			// If there are still more elements in argv yet to process AND
-			// the next one is an option, then the argument was omitted.
+			// If there are still more elements in argv yet to process AND the
+			// next one is an option marker, then the argument was omitted
+			// (unless the option marker is actually part of the argument,
+			// such as with negative numbers, e.g. -1, which is very likely
+			// if the char *after* the option marker is missing from optstring).
 			else if ( argv[ state->optind + 1 ][0] == OPT_MARKER )
 			{
-				if ( state->opterr == 1 ) fprintf( stderr, "bli_getopt(): **error**: option character '%c' is missing an argument (next element of argv is option '%c')\n", elem_str[0], argv[ state->optind + 1 ][1] );
-
-				state->optopt = *optstr_char;
-				state->optind += 1;
-				return '?';
+				// If the char after the option marker is present in optstring,
+				// then the first option argument is missing.
+				if ( strchr( optstring, argv[ state->optind + 1 ][1] ) != NULL )
+				{
+					if ( state->opterr == 1 ) fprintf( stderr, "bli_getopt(): **error**: option character '%c' is missing an argument (next element of argv is option '%c')\n", elem_str[0], argv[ state->optind + 1 ][1] );
+
+					state->optopt = *optstr_char;
+					state->optind += 1;
+					return '?';
+				}
 			}
-
 			// If no error was deteced above, we can safely assign optarg
 			// to be the next element in argv and increment optind by two.
 			state->optarg = argv[ state->optind + 1 ];
@@ -166,7 +174,7 @@ int bli_getopt( int argc, const char* const * argv, const char* optstring, getop
 	{
 		if ( strchr( optstring, elem_str[1] ) != NULL )
 		{
-			nextchar = &elem_str[1];
+			state->nextchar = &elem_str[1];
 			return *optstr_char;
 		}
 	}
@@ -176,3 +184,13 @@ int bli_getopt( int argc, const char* const * argv, const char* optstring, getop
 	return *optstr_char;
 }
 
+#if 0
+bool bli_char_is_in_str( char ch, const char* str )
+{
+	int chi = ( int )ch;
+
+	if ( strchr( str, chi ) == NULL ) return FALSE;
+
+	return TRUE;
+}
+#endif
diff --git a/frame/base/bli_getopt.h b/frame/base/bli_getopt.h
index bb0e4f2cf..1e0f7b250 100644
--- a/frame/base/bli_getopt.h
+++ b/frame/base/bli_getopt.h
@@ -34,6 +34,7 @@
 
 typedef struct getopt_s
 {
+	const char* nextchar;
 	const char* optarg;
 	      int   optind;
 	      int   opterr;
diff --git a/test/3/Makefile b/test/3/Makefile
index 568b7ffb0..e7cb7235a 100644
--- a/test/3/Makefile
+++ b/test/3/Makefile
@@ -126,25 +126,6 @@ VENDOR_LIB     := $(MKL_LIB)
 VENDORP_LIB    := $(MKLP_LIB)
 
 
-#
-# --- Problem size definitions -------------------------------------------------
-#
-
-# Single core (single-threaded)
-PS_BEGIN := 48
-PS_MAX   := 2400
-PS_INC   := 48
-
-# Single-socket (multithreaded)
-P1_BEGIN := 120
-P1_MAX   := 6000
-P1_INC   := 120
-
-# Dual-socket (multithreaded)
-P2_BEGIN := 160
-P2_MAX   := 8000
-P2_INC   := 160
-
 
 #
 # --- General build definitions ------------------------------------------------
@@ -182,30 +163,19 @@ CXXFLAGS_MT    := -march=native $(CXXFLAGS)
 
 
 # Which library?
-BLI_DEF  := -DBLIS
-BLA_DEF  := -DBLAS
-EIG_DEF  := -DEIGEN
-
-# Complex implementation type
-D1M      := -DIND=BLIS_1M
-DNAT     := -DIND=BLIS_NAT
-
-# Implementation string
-#STR_1M   := -DSTR=\"1m\"
-STR_NAT  := -DSTR=\"asm_blis\"
-STR_OBL  := -DSTR=\"openblas\"
-STR_EIG  := -DSTR=\"eigen\"
-STR_VEN  := -DSTR=\"vendor\"
-
-# Single or multithreaded string
-STR_ST   := -DTHR_STR=\"st\"
-STR_1S   := -DTHR_STR=\"1s\"
-STR_2S   := -DTHR_STR=\"2s\"
+DEF_BLI  := -DBLIS
+DEF_BLA  := -DBLAS
+DEF_EIG  := -DEIGEN
 
-# Problem size specification
-PDEF_ST  := -DP_BEGIN=$(PS_BEGIN)  -DP_INC=$(PS_INC)  -DP_MAX=$(PS_MAX)
-PDEF_1S  := -DP_BEGIN=$(P1_BEGIN) -DP_INC=$(P1_INC) -DP_MAX=$(P1_MAX)
-PDEF_2S  := -DP_BEGIN=$(P2_BEGIN) -DP_INC=$(P2_INC) -DP_MAX=$(P2_MAX)
+# Implementation string.
+STR_BLI  := -DIMPL_STR=\"blis\"
+STR_OBL  := -DIMPL_STR=\"openblas\"
+STR_EIG  := -DIMPL_STR=\"eigen\"
+STR_VEN  := -DIMPL_STR=\"vendor\"
+
+# Single or multithreaded string.
+STR_ST   := -DTHR_STR=\"st\"
+STR_MT   := -DTHR_STR=\"mt\"
 
 
@@ -213,188 +183,132 @@ PDEF_2S  := -DP_BEGIN=$(P2_BEGIN) -DP_INC=$(P2_INC) -DP_MAX=$(P2_MAX)
 # --- Targets/rules ------------------------------------------------------------
 #
 
-all:        all-st all-1s all-2s
-blis:       blis-st blis-1s blis-2s
-openblas:   openblas-st openblas-1s openblas-2s
-eigen:      eigen-st eigen-1s eigen-2s
-vendor:     vendor-st vendor-1s vendor-2s
-mkl:        vendor
-armpl:      vendor
+all:      all-st
 
-all-st:     blis-st openblas-st mkl-st eigen-st
-all-1s:     blis-1s openblas-1s mkl-1s eigen-1s
-all-2s:     blis-2s openblas-2s mkl-2s eigen-2s
+all-st:   blis-st openblas-st mkl-st eigen-st
+all-mt:   blis-mt openblas-mt mkl-mt eigen-mt
 
-blis-st:    blis-nat-st
-blis-1s:    blis-nat-1s
-blis-2s:    blis-nat-2s
-
-#blis-ind:   blis-ind-st blis-ind-mt
-blis-nat:   blis-nat-st blis-nat-1s blis-nat-2s
+blis:     blis-st
+openblas: openblas-st
+eigen:    eigen-st
+vendor:   vendor-st
+mkl:      mkl-st
 
 # Define the datatypes, operations, and implementations.
-DTS    := s d c z
 OPS    := gemm hemm herk trmm trsm
-BIMPLS := asm_blis openblas vendor
+BIMPLS := blis openblas vendor
 EIMPLS := eigen
 
-# Define functions to construct object filenames from the datatypes and
-# operations given an implementation. We define one function for single-
-# threaded, single-socket, and dual-socket filenames.
-get-st-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(PS_MAX)_$(1)_st.o))
-get-1s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P1_MAX)_$(1)_1s.o))
-get-2s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P2_MAX)_$(1)_2s.o))
-
-# Construct object and binary names for single-threaded, single-socket, and
-# dual-socket files for BLIS, OpenBLAS, and a vendor library (e.g. MKL).
-BLIS_NAT_ST_OBJS := $(call get-st-objs,asm_blis)
-BLIS_NAT_ST_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_ST_OBJS))
-BLIS_NAT_1S_OBJS := $(call get-1s-objs,asm_blis)
-BLIS_NAT_1S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_1S_OBJS))
-BLIS_NAT_2S_OBJS := $(call get-2s-objs,asm_blis)
-BLIS_NAT_2S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_2S_OBJS))
+# Define a function to construct object filenames from the operations
+# given an implementation.
+get-st-objs = $(foreach op,$(OPS),test_$(op)_$(1)_st.o)
+get-mt-objs = $(foreach op,$(OPS),test_$(op)_$(1)_mt.o)
+
+# Construct object and binary names for single-threaded and multithreaded
+# files for BLIS, OpenBLAS, Eigen, and a vendor library (e.g. MKL).
+BLIS_ST_OBJS     := $(call get-st-objs,blis)
+BLIS_ST_BINS     := $(patsubst %.o,%.x,$(BLIS_ST_OBJS))
+
+BLIS_MT_OBJS     := $(call get-mt-objs,blis)
+BLIS_MT_BINS     := $(patsubst %.o,%.x,$(BLIS_MT_OBJS))
 
 OPENBLAS_ST_OBJS := $(call get-st-objs,openblas)
 OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS))
-OPENBLAS_1S_OBJS := $(call get-1s-objs,openblas)
-OPENBLAS_1S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_1S_OBJS))
-OPENBLAS_2S_OBJS := $(call get-2s-objs,openblas)
-OPENBLAS_2S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_2S_OBJS))
+
+OPENBLAS_MT_OBJS := $(call get-mt-objs,openblas)
+OPENBLAS_MT_BINS := $(patsubst %.o,%.x,$(OPENBLAS_MT_OBJS))
 
 EIGEN_ST_OBJS    := $(call get-st-objs,eigen)
 EIGEN_ST_BINS    := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS))
-EIGEN_1S_OBJS    := $(call get-1s-objs,eigen)
-EIGEN_1S_BINS    := $(patsubst %.o,%.x,$(EIGEN_1S_OBJS))
-EIGEN_2S_OBJS    := $(call get-2s-objs,eigen)
-EIGEN_2S_BINS    := $(patsubst %.o,%.x,$(EIGEN_2S_OBJS))
+
+EIGEN_MT_OBJS    := $(call get-mt-objs,eigen)
+EIGEN_MT_BINS    := $(patsubst %.o,%.x,$(EIGEN_MT_OBJS))
 
 VENDOR_ST_OBJS   := $(call get-st-objs,vendor)
 VENDOR_ST_BINS   := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS))
-VENDOR_1S_OBJS   := $(call get-1s-objs,vendor)
-VENDOR_1S_BINS   := $(patsubst %.o,%.x,$(VENDOR_1S_OBJS))
-VENDOR_2S_OBJS   := $(call get-2s-objs,vendor)
-VENDOR_2S_BINS   := $(patsubst %.o,%.x,$(VENDOR_2S_OBJS))
-
-# Define some targets associated with the above object/binary files.
-blis-nat-st: check-env $(BLIS_NAT_ST_BINS)
-blis-nat-1s: check-env $(BLIS_NAT_1S_BINS)
-blis-nat-2s: check-env $(BLIS_NAT_2S_BINS)
-
-openblas-st: check-env $(OPENBLAS_ST_BINS)
-openblas-1s: check-env $(OPENBLAS_1S_BINS)
-openblas-2s: check-env $(OPENBLAS_2S_BINS)
 
-eigen-st: check-env $(EIGEN_ST_BINS)
-eigen-1s: check-env $(EIGEN_1S_BINS)
-eigen-2s: check-env $(EIGEN_2S_BINS)
+VENDOR_MT_OBJS   := $(call get-mt-objs,vendor)
+VENDOR_MT_BINS   := $(patsubst %.o,%.x,$(VENDOR_MT_OBJS))
 
-vendor-st: check-env $(VENDOR_ST_BINS)
-vendor-1s: check-env $(VENDOR_1S_BINS)
-vendor-2s: check-env $(VENDOR_2S_BINS)
+# List other miscellaneous object files
+UTIL_OBJS        := test_utils.o
+UTIL_HDRS        := test_utils.h
 
-mkl-st: vendor-st
-mkl-1s: vendor-1s
-mkl-2s: vendor-2s
-
-armpl-st: vendor-st
-armpl-1s: vendor-1s
-armpl-2s: vendor-2s
+# Define some targets associated with the above object/binary files.
+blis-st:     check-env $(BLIS_ST_BINS)
+blis-mt:     check-env $(BLIS_MT_BINS)
+openblas-st: check-env $(OPENBLAS_ST_BINS)
+openblas-mt: check-env $(OPENBLAS_MT_BINS)
+eigen-st:    check-env $(EIGEN_ST_BINS)
+eigen-mt:    check-env $(EIGEN_MT_BINS)
+vendor-st:   check-env $(VENDOR_ST_BINS)
+vendor-mt:   check-env $(VENDOR_MT_BINS)
+mkl-st:      vendor-st
+mkl-mt:      vendor-mt
+armpl-st:    vendor-st
+armpl-mt:    vendor-mt
 
 # Mark the object files as intermediate so that make will remove them
 # automatically after building the binaries on which they depend.
-.INTERMEDIATE: $(BLIS_NAT_ST_OBJS) $(BLIS_NAT_1S_OBJS) $(BLIS_NAT_2S_OBJS)
-.INTERMEDIATE: $(OPENBLAS_ST_OBJS) $(OPENBLAS_1S_OBJS) $(OPENBLAS_2S_OBJS)
-.INTERMEDIATE: $(EIGEN_ST_OBJS)    $(EIGEN_1S_OBJS)    $(EIGEN_2S_OBJS)
-.INTERMEDIATE: $(VENDOR_ST_OBJS)   $(VENDOR_1S_OBJS)   $(VENDOR_2S_OBJS)
+.INTERMEDIATE: $(BLIS_ST_OBJS)     $(BLIS_MT_OBJS)
+.INTERMEDIATE: $(OPENBLAS_ST_OBJS) $(OPENBLAS_MT_OBJS)
+.INTERMEDIATE: $(EIGEN_ST_OBJS)    $(EIGEN_MT_OBJS)
+.INTERMEDIATE: $(VENDOR_ST_OBJS)   $(VENDOR_MT_OBJS)
+.INTERMEDIATE: $(UTIL_OBJS)
 
 
 # -- Object file rules --
 
-#$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c
-#	$(CC) $(CFLAGS) -c $< -o $@
-
-# A function to return the datatype cpp macro def from the datatype
-# character.
-get-dt-cpp = $(strip \
-             $(if $(findstring s,$(1)),-DDT=BLIS_FLOAT    -DIS_FLOAT,\
-             $(if $(findstring d,$(1)),-DDT=BLIS_DOUBLE   -DIS_DOUBLE,\
-             $(if $(findstring c,$(1)),-DDT=BLIS_SCOMPLEX -DIS_SCOMPLEX,\
-                                       -DDT=BLIS_DCOMPLEX -DIS_DCOMPLEX))))
-
 # A function to return other cpp macros that help the test driver
 # identify the implementation.
-#get-bl-cpp = $(strip \
-#             $(if $(findstring     blis,$(1)),$(STR_NAT) $(BLI_DEF),\
-#             $(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\
-#             $(if $(findstring    eigen,$(1)),$(STR_EIG) $(EIG_DEF),\
-#                                              $(STR_VEN) $(BLA_DEF)))))
-
 get-bl-cpp = $(strip \
-             $(if $(findstring     blis,$(1)),$(STR_NAT) $(BLI_DEF),\
-             $(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\
+             $(if $(findstring     blis,$(1)),$(STR_BLI) $(DEF_BLI),\
+             $(if $(findstring openblas,$(1)),$(STR_OBL) $(DEF_BLA),\
              $(if $(and $(findstring eigen,$(1)),\
                         $(findstring  gemm,$(2))),\
-                                              $(STR_EIG) $(EIG_DEF),\
+                                              $(STR_EIG) $(DEF_EIG),\
              $(if       $(findstring eigen,$(1)),\
-                                              $(STR_EIG) $(BLA_DEF),\
-                                              $(STR_VEN) $(BLA_DEF))))))
+                                              $(STR_EIG) $(DEF_BLA),\
+                                              $(STR_VEN) $(DEF_BLA))))))
 
+# Rules for miscellaneous files.
+test_utils.o: test_utils.c test_utils.h
+	$(CC) $(CFLAGS) -c $< -o $@
 
 # Rules for BLIS and BLAS libraries.
 define make-st-rule
-test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(op).c Makefile
-	$(CC) $(CFLAGS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@
-endef
-
-define make-1s-rule
-test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(op).c Makefile
-	$(CC) $(CFLAGS) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@
+test_$(1)_$(2)_st.o: test_$(op).c Makefile
+	$(CC) $(CFLAGS) $(call get-bl-cpp,$(2),$(1)) $(STR_ST) -c $$< -o $$@
 endef
 
-define make-2s-rule
-test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(op).c Makefile
-	$(CC) $(CFLAGS) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@
+define make-mt-rule
+test_$(1)_$(2)_mt.o: test_$(op).c Makefile
+	$(CC) $(CFLAGS) $(call get-bl-cpp,$(2),$(1)) $(STR_MT) -c $$< -o $$@
 endef
 
-$(foreach dt,$(DTS), \
 $(foreach op,$(OPS), \
-$(foreach im,$(BIMPLS),$(eval $(call make-st-rule,$(dt),$(op),$(im))))))
+$(foreach im,$(BIMPLS),$(eval $(call make-st-rule,$(op),$(im)))))
 
-$(foreach dt,$(DTS), \
 $(foreach op,$(OPS), \
-$(foreach im,$(BIMPLS),$(eval $(call make-1s-rule,$(dt),$(op),$(im))))))
-
-$(foreach dt,$(DTS), \
-$(foreach op,$(OPS), \
-$(foreach im,$(BIMPLS),$(eval $(call make-2s-rule,$(dt),$(op),$(im))))))
+$(foreach im,$(BIMPLS),$(eval $(call make-mt-rule,$(op),$(im)))))
 
 # Rules for Eigen.
+# NOTE: Eigen determines single- vs. multithreadedness at compile time.
 define make-eigst-rule
-test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(op).c Makefile
-	$(CXX) $(CXXFLAGS_ST) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@
-endef
-
-define make-eig1s-rule
-test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(op).c Makefile
-	$(CXX) $(CXXFLAGS_MT) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@
+test_$(1)_$(2)_st.o: test_$(op).c Makefile
+	$(CXX) $(CXXFLAGS_ST) $(call get-bl-cpp,$(2),$(1)) $(STR_ST) -c $$< -o $$@
 endef
 
-define make-eig2s-rule
-test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(op).c Makefile
-	$(CXX) $(CXXFLAGS_MT) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@
+define make-eigmt-rule
+test_$(1)_$(2)_mt.o: test_$(op).c Makefile
+	$(CXX) $(CXXFLAGS_MT) $(call get-bl-cpp,$(2),$(1)) $(STR_MT) -c $$< -o $$@
 endef
 
-$(foreach dt,$(DTS), \
-$(foreach op,$(OPS), \
-$(foreach im,$(EIMPLS),$(eval $(call make-eigst-rule,$(dt),$(op),$(im))))))
-
-$(foreach dt,$(DTS), \
 $(foreach op,$(OPS), \
-$(foreach im,$(EIMPLS),$(eval $(call make-eig1s-rule,$(dt),$(op),$(im))))))
+$(foreach im,$(EIMPLS),$(eval $(call make-eigst-rule,$(op),$(im)))))
 
-$(foreach dt,$(DTS), \
 $(foreach op,$(OPS), \
-$(foreach im,$(EIMPLS),$(eval $(call make-eig2s-rule,$(dt),$(op),$(im))))))
+$(foreach im,$(EIMPLS),$(eval $(call make-eigmt-rule,$(op),$(im)))))
 
 
 # -- Executable file rules --
@@ -404,44 +318,36 @@ $(foreach im,$(EIMPLS),$(eval $(call make-eig2s-rule,$(dt),$(op),$(im))))))
 # compatibility layer. This prevents BLIS from inadvertently getting called
 # for the BLAS routines we are trying to test with.
 
-test_%_$(PS_MAX)_asm_blis_st.x: test_%_$(PS_MAX)_asm_blis_st.o $(LIBBLIS_LINK)
-	$(CC) $(strip $<                    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
-
-test_%_$(P1_MAX)_asm_blis_1s.x: test_%_$(P1_MAX)_asm_blis_1s.o $(LIBBLIS_LINK)
-	$(CC) $(strip $<                    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
-
-test_%_$(P2_MAX)_asm_blis_2s.x: test_%_$(P2_MAX)_asm_blis_2s.o $(LIBBLIS_LINK)
-	$(CC) $(strip $<                    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
-
+# Combine the miscellaneous objects with libblis for conciseness (since all
+# driver binaries depend on these objects).
+COMMON_OBJS := $(UTIL_OBJS) $(LIBBLIS_LINK)
 
-test_%_$(PS_MAX)_openblas_st.x: test_%_$(PS_MAX)_openblas_st.o $(LIBBLIS_LINK)
-	$(CC) $(strip $<   $(OPENBLAS_LIB)  $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+test_%_blis_st.x:     test_%_blis_st.o     $(COMMON_OBJS)
+	$(CC) $(strip $<                    $(COMMON_OBJS) $(LDFLAGS) -o $@)
 
-test_%_$(P1_MAX)_openblas_1s.x: test_%_$(P1_MAX)_openblas_1s.o $(LIBBLIS_LINK)
-	$(CC) $(strip $<   $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+test_%_blis_mt.x:     test_%_blis_mt.o     $(COMMON_OBJS)
+	$(CC) $(strip $<                    $(COMMON_OBJS) $(LDFLAGS) -o $@)
 
-test_%_$(P2_MAX)_openblas_2s.x: test_%_$(P2_MAX)_openblas_2s.o $(LIBBLIS_LINK)
-	$(CC) $(strip $<   $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
 
+test_%_openblas_st.x: test_%_openblas_st.o $(COMMON_OBJS)
+	$(CC) $(strip $<   $(OPENBLAS_LIB)  $(COMMON_OBJS) $(LDFLAGS) -o $@)
 
-test_%_$(PS_MAX)_eigen_st.x:    test_%_$(PS_MAX)_eigen_st.o    $(LIBBLIS_LINK)
-	$(CXX) $(strip $<  $(EIGEN_LIB)     $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+test_%_openblas_mt.x: test_%_openblas_mt.o $(COMMON_OBJS)
+	$(CC) $(strip $<   $(OPENBLASP_LIB) $(COMMON_OBJS) $(LDFLAGS) -o $@)
 
-test_%_$(P1_MAX)_eigen_1s.x:    test_%_$(P1_MAX)_eigen_1s.o    $(LIBBLIS_LINK)
-	$(CXX) $(strip $<  $(EIGENP_LIB)    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
 
-test_%_$(P2_MAX)_eigen_2s.x:    test_%_$(P2_MAX)_eigen_2s.o    $(LIBBLIS_LINK)
-	$(CXX) $(strip $<  $(EIGENP_LIB)    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+test_%_eigen_st.x:    test_%_eigen_st.o    $(COMMON_OBJS)
+	$(CXX) $(strip $<  $(EIGEN_LIB)     $(COMMON_OBJS) $(LDFLAGS) -o $@)
 
+test_%_eigen_mt.x:    test_%_eigen_mt.o    $(COMMON_OBJS)
+	$(CXX) $(strip $<  $(EIGENP_LIB)    $(COMMON_OBJS) $(LDFLAGS) -o $@)
 
-test_%_$(PS_MAX)_vendor_st.x:   test_%_$(PS_MAX)_vendor_st.o   $(LIBBLIS_LINK)
-	$(CC) $(strip $<   $(VENDOR_LIB)    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
 
-test_%_$(P1_MAX)_vendor_1s.x:   test_%_$(P1_MAX)_vendor_1s.o   $(LIBBLIS_LINK)
-	$(CC) $(strip $<   $(VENDORP_LIB)   $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+test_%_vendor_st.x:   test_%_vendor_st.o   $(COMMON_OBJS)
+	$(CC) $(strip $<   $(VENDOR_LIB)    $(COMMON_OBJS) $(LDFLAGS) -o $@)
 
-test_%_$(P2_MAX)_vendor_2s.x:   test_%_$(P2_MAX)_vendor_2s.o   $(LIBBLIS_LINK)
-	$(CC) $(strip $<   $(VENDORP_LIB)   $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+test_%_vendor_mt.x:   test_%_vendor_mt.o   $(COMMON_OBJS)
+	$(CC) $(strip $<   $(VENDORP_LIB)   $(COMMON_OBJS) $(LDFLAGS) -o $@)
 
 
 # -- Environment check rules --
diff --git a/test/3/old/runme.sh b/test/3/old/runme.sh
new file mode 100755
index 000000000..cf84bd121
--- /dev/null
+++ b/test/3/old/runme.sh
@@ -0,0 +1,277 @@
+#!/bin/bash
+
+# File prefixes.
+exec_root="test"
+out_root="output"
+delay=0.1
+
+sys="blis"
+#sys="stampede2"
+#sys="lonestar5"
+#sys="ul252"
+#sys="ul264"
+#sys="ul2128"
+
+# Bind threads to processors.
+#export OMP_PROC_BIND=true
+#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23"
+#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103"
+
+if [ ${sys} = "blis" ]; then
+
+	export GOMP_CPU_AFFINITY="0-3"
+
+	numactl=""
+	threads="jc1ic1jr1_st
+	         jc2ic1jr1_1s
+	         jc2ic2jr1_2s"
+
+elif [ ${sys} = "stampede2" ]; then
+
+	echo "Need to set GOMP_CPU_AFFINITY."
+	exit 1
+
+	numactl=""
+	threads="jc1ic1jr1_st
+	         jc4ic6jr1_1s
+	         jc4ic12jr1_2s"
+
+elif [ ${sys} = "lonestar5" ]; then
+
+	export GOMP_CPU_AFFINITY="0-23"
+
+	# A hack to use libiomp5 with gcc.
+	#export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64"
+
+	numactl=""
+	threads="jc1ic1jr1_st
+	         jc2ic3jr2_1s
+	         jc4ic3jr2_2s"
+
+elif [ ${sys} = "ul252" ]; then
+
+	export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
+	export GOMP_CPU_AFFINITY="0-51"
+
+	numactl=""
+	threads="jc1ic1jr1_st
+	         jc2ic13jr1_1s
+	         jc4ic13jr1_2s"
+
+elif [ ${sys} = "ul264" ]; then
+
+	export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
+	export GOMP_CPU_AFFINITY="0-63"
+
+	numactl="numactl --interleave=all"
+	threads="jc1ic1jr1_st
+	         jc1ic8jr4_1s
+	         jc2ic8jr4_2s"
+
+elif [ ${sys} = "ul2128" ]; then
+
+	export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
+	export GOMP_CPU_AFFINITY="0-127"
+
+	numactl="numactl --interleave=all"
+	threads="jc1ic1jr1_st
+	         jc4ic4jr4_1s
+	         jc8ic4jr4_2s"
+	#threads="jc4ic4jr4_1s
+	#         jc8ic4jr4_2s"
+	#threads="jc1ic1jr1_st"
+	#threads="jc4ic4jr4_1s"
+	#threads="jc8ic4jr4_2s"
+fi
+
+# Datatypes to test.
+test_dts="d s z c"
+#test_dts="s"
+
+# Operations to test.
+test_ops="gemm hemm herk trmm trsm"
+#test_ops="herk"
+
+# Implementations to test.
+impls="blis"
+#impls="openblas"
+#impls="vendor"
+#impls="other"
+#impls="eigen"
+#impls="all"
+
+if [ "${impls}" = "blis" ]; then
+
+	test_impls="asm_blis"
+
+elif [ "${impls}" = "openblas" ]; then
+
+	test_impls="openblas"
+
+elif [ "${impls}" = "vendor" ]; then
+
+	test_impls="vendor"
+
+elif [ "${impls}" = "eigen" ]; then
+
+	test_impls="eigen"
+
+elif [ "${impls}" = "other" ]; then
+
+	test_impls="openblas vendor eigen"
+else
+
+	test_impls="openblas asm_blis vendor eigen"
+fi
+
+# Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can
+# restore the value.
+GOMP_CPU_AFFINITYsave=${GOMP_CPU_AFFINITY}
+
+
+# Iterate over the threading configs.
+for th in ${threads}; do
+
+	# Start with one way of parallelism in each loop. We will now begin
+	# parsing the 'th' variable to update one or more of these threading
+	# parameters.
+	jc_nt=1; pc_nt=1; ic_nt=1; jr_nt=1; ir_nt=1
+
+	# Strip everything before and after the underscore so that what remains
+	# is the problem size and threading parameter string, respectively.
+	#psize=${th##*_}; thinfo=${th%%_*}
+	tsuf=${th##*_}; thinfo=${th%%_*}
+
+	# Identify each threading parameter and insert a space before it.
+	thsep=$(echo -e ${thinfo} | sed -e "s/\([jip][cr]\)/ \1/g" )
+
+	nt=1
+
+	for loopnum in ${thsep}; do
+
+		# Given the current string, which identifies a loop and the
+		# number of ways of parallelism for that loop, strip out
+		# the ways and loop separately to identify each.
+		loop=$(echo -e ${loopnum} | sed -e "s/[0-9]//g" )
+		num=$(echo -e ${loopnum} | sed -e "s/[a-z]//g" )
+
+		# Construct a string that we can evaluate to set the number
+		# of ways of parallelism for the current loop.
+		loop_nt_eq_num="${loop}_nt=${num}"
+
+		# Update the total number of threads.
+		nt=$(expr ${nt} \* ${num})
+
+		# Evaluate the string to assign the ways to the variable.
+		eval ${loop_nt_eq_num}
+
+	done
+
+	# Find a binary using the test driver prefix and the threading suffix.
+	# Then strip everything before and after the max problem size that's
+	# encoded into the name of the binary.
+	binname=$(ls -1 ${exec_root}_*_${tsuf}.x | head -n1)
+	temp1=${binname#${exec_root}_*_}
+	psize=${temp1%%_*}
+
+	# Sanity check: If 'ls' couldn't find any binaries, then the user
+	# probably didn't build them. Inform the user and proceed to the next
+	# threading config.
+	if [ "${binname}" = "" ]; then
+
+		echo "Could not find binaries corresponding to '${tsuf}' threading config. Skipping."
+		continue
+	fi
+
+	# Let the user know what threading config we are working on.
+	echo "Switching to: jc${jc_nt} pc${pc_nt} ic${ic_nt} jr${jr_nt} ir${ir_nt} (nt = ${nt}) p_max${psize}"
+
+	# Iterate over the datatypes.
+	for dt in ${test_dts}; do
+
+		# Iterate over the implementations.
+		for im in ${test_impls}; do
+
+			# Iterate over the operations.
+			for op in ${test_ops}; do
+
+				# Eigen does not support multithreading for hemm, herk, trmm,
+				# or trsm. So if we're getting ready to execute an Eigen driver
+				# for one of these operations and nt > 1, we skip this test.
+				if [ "${im}"  = "eigen" ] && \
+				   [ "${op}" != "gemm"  ] && \
+				   [ "${nt}" != "1"     ]; then
+					continue;
+				fi
+
+				# Find the threading suffix by probing the executable.
+				binname=$(ls ${exec_root}_${dt}${op}_*_${im}_${tsuf}.x)
+
+				#echo "found file: ${binname} with suffix ${suf}"
+
+				# Set the number of threads according to th.
+				if [ "${tsuf}" = "1s" ] || [ "${tsuf}" = "2s" ]; then
+
+					# Set the threading parameters based on the implementation
+					# that we are preparing to run.
+					if   [ "${im}" = "asm_blis" ]; then
+						unset  OMP_NUM_THREADS
+						export BLIS_JC_NT=${jc_nt}
+						export BLIS_PC_NT=${pc_nt}
+						export BLIS_IC_NT=${ic_nt}
+						export BLIS_JR_NT=${jr_nt}
+						export BLIS_IR_NT=${ir_nt}
+					elif [ "${im}" = "openblas" ]; then
+						unset  OMP_NUM_THREADS
+						export OPENBLAS_NUM_THREADS=${nt}
+					elif [ "${im}" = "eigen" ]; then
+						export OMP_NUM_THREADS=${nt}
+					elif [ "${im}" = "vendor" ]; then
+						unset  OMP_NUM_THREADS
+						export MKL_NUM_THREADS=${nt}
+					fi
+					export nt_use=${nt}
+
+					# Multithreaded OpenBLAS seems to have a problem running
+					# properly if GOMP_CPU_AFFINITY is set. So we temporarily
+					# unset it here if we are about to execute OpenBLAS, but
+					# otherwise restore it.
+					if [ ${im} = "openblas" ]; then
+						unset GOMP_CPU_AFFINITY
+					else
+						export GOMP_CPU_AFFINITY="${GOMP_CPU_AFFINITYsave}"
+					fi
+				else
+
+					export BLIS_JC_NT=1
+					export BLIS_PC_NT=1
+					export BLIS_IC_NT=1
+					export BLIS_JR_NT=1
+					export BLIS_IR_NT=1
+					export OMP_NUM_THREADS=1
+					export OPENBLAS_NUM_THREADS=1
+					export MKL_NUM_THREADS=1
+					export nt_use=1
+				fi
+
+				# Construct the name of the test executable.
+				exec_name="${exec_root}_${dt}${op}_${psize}_${im}_${tsuf}.x"
+
+				# Construct the name of the output file.
+				out_file="${out_root}_${tsuf}_${dt}${op}_${im}.m"
+
+				#echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}"
+				echo "Running ${numactl} ./${exec_name} > ${out_file}"
+
+				# Run executable with or without numactl, depending on how
+				# the numactl variable was set.
+				${numactl} ./${exec_name} > ${out_file}
+
+				# Bedtime!
+				sleep ${delay}
+
+			done
+		done
+	done
+done
+
diff --git a/test/3/runme.sh b/test/3/runme.sh
index cf84bd121..fefcbe5ee 100755
--- a/test/3/runme.sh
+++ b/test/3/runme.sh
@@ -5,6 +5,18 @@ exec_root="test"
 out_root="output"
 delay=0.1
 
+# Bind threads to processors.
+#export OMP_PROC_BIND=true
+#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23"
+#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103"
+
+# ------------------
+
+# Problem size range for single- and multithreaded execution. Set psr_st and
+# psr_mt on a per-system basis below to override these default values.
+psr_st="100 1000 100"
+psr_mt="200 2000 200"
+
 sys="blis"
 #sys="stampede2"
 #sys="lonestar5"
@@ -12,19 +24,15 @@ sys="blis"
 #sys="ul264"
 #sys="ul2128"
 
-# Bind threads to processors.
-#export OMP_PROC_BIND=true
-#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23"
-#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103"
-
 if [ ${sys} = "blis" ]; then
 
 	export GOMP_CPU_AFFINITY="0-3"
 
 	numactl=""
 	threads="jc1ic1jr1_st
-	         jc2ic1jr1_1s
-	         jc2ic2jr1_2s"
+	         jc2ic2jr1_mt"
+	#psr_st="40 1000 40"
+	#psr_mt="40 4000 40"
 
 elif [ ${sys} = "stampede2" ]; then
 
@@ -33,8 +41,9 @@ elif [ ${sys} = "stampede2" ]; then
 
 	numactl=""
 	threads="jc1ic1jr1_st
-	         jc4ic6jr1_1s
-	         jc4ic12jr1_2s"
+	         jc4ic12jr1_mt"
+	#psr_st="40 1000 40"
+	#psr_mt="40 4000 40"
 
 elif [ ${sys} = "lonestar5" ]; then
 
@@ -45,8 +54,9 @@ elif [ ${sys} = "lonestar5" ]; then
 
 	numactl=""
 	threads="jc1ic1jr1_st
-	         jc2ic3jr2_1s
-	         jc4ic3jr2_2s"
+	         jc4ic3jr2_mt"
+	#psr_st="40 1000 40"
+	#psr_mt="40 4000 40"
 
 elif [ ${sys} = "ul252" ]; then
 
@@ -55,8 +65,9 @@ elif [ ${sys} = "ul252" ]; then
 
 	numactl=""
 	threads="jc1ic1jr1_st
-	         jc2ic13jr1_1s
-	         jc4ic13jr1_2s"
+	         jc4ic13jr1_mt"
+	#psr_st="40 1000 40"
+	#psr_mt="40 4000 40"
 
 elif [ ${sys} = "ul264" ]; then
 
@@ -65,8 +76,9 @@ elif [ ${sys} = "ul264" ]; then
 
 	numactl="numactl --interleave=all"
 	threads="jc1ic1jr1_st
-	         jc1ic8jr4_1s
-	         jc2ic8jr4_2s"
+	         jc2ic8jr4_mt"
+	#psr_st="40 1000 40"
+	#psr_mt="40 4000 40"
 
 elif [ ${sys} = "ul2128" ]; then
 
@@ -75,54 +87,42 @@ elif [ ${sys} = "ul2128" ]; then
 
 	numactl="numactl --interleave=all"
 	threads="jc1ic1jr1_st
-	         jc4ic4jr4_1s
-	         jc8ic4jr4_2s"
-	#threads="jc4ic4jr4_1s
-	#         jc8ic4jr4_2s"
-	#threads="jc1ic1jr1_st"
-	#threads="jc4ic4jr4_1s"
-	#threads="jc8ic4jr4_2s"
+	         jc8ic4jr4_mt"
+
+	#psr_st="40 1000 40"
+	#psr_mt="40 4000 40"
 fi
 
 # Datatypes to test.
-test_dts="d s z c"
-#test_dts="s"
+test_dts="s d c z"
+test_dts="d"
 
 # Operations to test.
-test_ops="gemm hemm herk trmm trsm"
+test_ops="gemm_nn hemm_ll herk_ln trmm_llnn trsm_runn"
 #test_ops="herk"
 
 # Implementations to test.
-impls="blis"
-#impls="openblas"
-#impls="vendor"
-#impls="other"
-#impls="eigen"
-#impls="all"
-
-if [ "${impls}" = "blis" ]; then
-
-	test_impls="asm_blis"
-
-elif [ "${impls}" = "openblas" ]; then
-
-	test_impls="openblas"
-
-elif [ "${impls}" = "vendor" ]; then
-
-	test_impls="vendor"
+test_impls="blis"
+#test_impls="openblas"
+#test_impls="vendor"
+#test_impls="eigen"
+#test_impls="all"
+
+if [ "${test_impls}" = "all" ]; then # "all" is shorthand for every implementation
+	test_impls="openblas blis vendor eigen"
+fi
 
-elif [ "${impls}" = "eigen" ]; then
+# Number of repeats per problem size.
+nrepeats=3
 
-	test_impls="eigen"
+# The induced method to use ('native' or '1m').
+ind="native"
 
-elif [ "${impls}" = "other" ]; then
+# Quiet mode?
+#quiet="yes"
 
-	test_impls="openblas vendor eigen"
-else
-
-	test_impls="openblas asm_blis vendor eigen"
-fi
+# For testing purposes.
+#dryrun="yes"
 
 # Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can
 # restore the value.
@@ -132,35 +132,41 @@ GOMP_CPU_AFFINITYsave=${GOMP_CPU_AFFINITY}
 # Iterate over the threading configs.
 for th in ${threads}; do
 
+	#threads="jc1ic1jr1_st
+	#         jc8ic4jr4_mt"
+
 	# Start with one way of parallelism in each loop. We will now begin
 	# parsing the 'th' variable to update one or more of these threading
 	# parameters.
 	jc_nt=1; pc_nt=1; ic_nt=1; jr_nt=1; ir_nt=1
 
-	# Strip everything before and after the underscore so that what remains
-	# is the problem size and threading parameter string, respectively.
-	#psize=${th##*_}; thinfo=${th%%_*}
-	tsuf=${th##*_}; thinfo=${th%%_*}
+	# Strip everything before the underscore so that what remains is the
+	# threading suffix.
+	tsuf=${th##*_};
+
+	# Strip everything after the underscore so that what remains is the
+	# parallelism (threading) info.
+	thinfo=${th%%_*}
 
 	# Identify each threading parameter and insert a space before it.
-	thsep=$(echo -e ${thinfo} | sed -e "s/\([jip][cr]\)/ \1/g" )
+	thinfo_sep=$(echo -e ${thinfo} | sed -e "s/\([jip][cr]\)/ \1/g" )
 
 	nt=1
 
-	for loopnum in ${thsep}; do
+	for loopnum in ${thinfo_sep}; do
 
-		# Given the current string, which identifies a loop and the
-		# number of ways of parallelism for that loop, strip out
-		# the ways and loop separately to identify each.
+		# Given the current string, which identifies a loop and the number of
+		# ways of parallelism to be obtained from that loop, strip out the ways
+		# and loop separately to identify each.
 		loop=$(echo -e ${loopnum} | sed -e "s/[0-9]//g" )
-		num=$(echo -e ${loopnum} | sed -e "s/[a-z]//g" )
+		nways=$(echo -e ${loopnum} | sed -e "s/[a-z]//g" )
 
-		# Construct a string that we can evaluate to set the number
-		# of ways of parallelism for the current loop.
-		loop_nt_eq_num="${loop}_nt=${num}"
+		# Construct a string that we can evaluate to set the number of ways of
+		# parallelism for the current loop (e.g. jc_nt, ic_nt, jr_nt).
+		loop_nt_eq_num="${loop}_nt=${nways}"
 
 		# Update the total number of threads.
-		nt=$(expr ${nt} \* ${num})
+		nt=$(expr ${nt} \* ${nways})
 
 		# Evaluate the string to assign the ways to the variable.
 		eval ${loop_nt_eq_num}
@@ -171,8 +177,6 @@ for th in ${threads}; do
 	# Then strip everything before and after the max problem size that's
 	# encoded into the name of the binary.
 	binname=$(ls -1 ${exec_root}_*_${tsuf}.x | head -n1)
-	temp1=${binname#${exec_root}_*_}
-	psize=${temp1%%_*}
 
 	# Sanity check: If 'ls' couldn't find any binaries, then the user
 	# probably didn't build them. Inform the user and proceed to the next
@@ -184,7 +188,7 @@ for th in ${threads}; do
 	fi
 
 	# Let the user know what threading config we are working on.
-	echo "Switching to: jc${jc_nt} pc${pc_nt} ic${ic_nt} jr${jr_nt} ir${ir_nt} (nt = ${nt}) p_max${psize}"
+	echo "Switching to: jc${jc_nt} pc${pc_nt} ic${ic_nt} jr${jr_nt} ir${ir_nt} (nt = ${nt})"
 
 	# Iterate over the datatypes.
 	for dt in ${test_dts}; do
@@ -195,26 +199,29 @@ for th in ${threads}; do
 			# Iterate over the operations.
 			for op in ${test_ops}; do
 
+				# Strip everything before the underscore so that what remains is
+				# the operation parameter string.
+				oppars=${op##*_};
+
+				# Strip everything after the underscore so that what remains is
+				# the operation name (sans parameter encoding).
+				opname=${op%%_*}
+
 				# Eigen does not support multithreading for hemm, herk, trmm,
 				# or trsm. So if we're getting ready to execute an Eigen driver
 				# for one of these operations and nt > 1, we skip this test.
-				if [ "${im}"  = "eigen" ] && \
-				   [ "${op}" != "gemm"  ] && \
-				   [ "${nt}" != "1"     ]; then
+				if [ "${im}"      = "eigen" ] && \
+				   [ "${opname}" != "gemm"  ] && \
+				   [ "${nt}"     != "1"     ]; then
 					continue;
 				fi
 
-				# Find the threading suffix by probing the executable.
-				binname=$(ls ${exec_root}_${dt}${op}_*_${im}_${tsuf}.x)
-
-				#echo "found file: ${binname} with suffix ${suf}"
-
 				# Set the number of threads according to th.
-				if [ "${tsuf}" = "1s" ] || [ "${tsuf}" = "2s" ]; then
+				if [ "${tsuf}" = "mt" ]; then
 
 					# Set the threading parameters based on the implementation
 					# that we are preparing to run.
-					if   [ "${im}" = "asm_blis" ]; then
+					if   [ "${im}" = "blis" ]; then
 						unset  OMP_NUM_THREADS
 						export BLIS_JC_NT=${jc_nt}
 						export BLIS_PC_NT=${pc_nt}
@@ -241,8 +248,14 @@ for th in ${threads}; do
 					else
 						export GOMP_CPU_AFFINITY="${GOMP_CPU_AFFINITYsave}"
 					fi
+
+					# Choose the mt problem size range.
+					psr="${psr_mt}"
+
 				else
 
+					# Set all environment variables to 1 to ensure single-
+					# threaded execution.
 					export BLIS_JC_NT=1
 					export BLIS_PC_NT=1
 					export BLIS_IC_NT=1
@@ -252,20 +265,38 @@ for th in ${threads}; do
 					export OPENBLAS_NUM_THREADS=1
 					export MKL_NUM_THREADS=1
 					export nt_use=1
+
+					# Choose the st problem size range.
+					psr="${psr_st}"
+				fi
+
+				if [ "${quiet}" = "yes" ]; then
+					qv="-q" # quiet
+				else
+					qv="-v" # verbose (the default)
 				fi
 
 				# Construct the name of the test executable.
-				exec_name="${exec_root}_${dt}${op}_${psize}_${im}_${tsuf}.x"
+				exec_name="${exec_root}_${opname}_${im}_${tsuf}.x"
 
 				# Construct the name of the output file.
-				out_file="${out_root}_${tsuf}_${dt}${op}_${im}.m"
-
-				#echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}"
-				echo "Running ${numactl} ./${exec_name} > ${out_file}"
+				out_file="${out_root}_${tsuf}_${dt}${opname}_${oppars}_${im}.m"
+
+				# Use printf for its formatting capabilities.
+				printf 'Running %s %-21s %s %-7s %s %s %s %s > %s\n' \
+				       "${numactl}" "./${exec_name}" "-d ${dt}" \
+				                                     "-c ${oppars}" \
+				                                     "-i ${ind}" \
+				                                     "-p \"${psr}\"" \
+				                                     "-r ${nrepeats}" \
+				                                     "${qv}" \
+				                                     "${out_file}"
 
 				# Run executable with or without numactl, depending on how
 				# the numactl variable was set.
-				${numactl} ./${exec_name} > ${out_file}
+				if [ "${dryrun}" != "yes" ]; then
+					${numactl} ./${exec_name} -d ${dt} -c ${oppars} -i ${ind} -p "${psr}" -r ${nrepeats} ${qv} > ${out_file}
+				fi
 
 				# Bedtime!
 				sleep ${delay}
diff --git a/test/3/test_gemm.c b/test/3/test_gemm.c
index 96992f4a1..20bcca46c 100644
--- a/test/3/test_gemm.c
+++ b/test/3/test_gemm.c
@@ -36,18 +36,20 @@
 #ifdef EIGEN
   #define BLIS_DISABLE_BLAS_DEFS
   #include "blis.h"
+  #include "test_utils.h"
   #include <Eigen/Core>
   #include <Eigen/src/misc/blas.h>
   using namespace Eigen;
 #else
   #include "blis.h"
+  #include "test_utils.h"
 #endif
 
-#define COL_STORAGE
-//#define ROW_STORAGE
-
 //#define PRINT
 
+static const char* LOCAL_OPNAME_STR = "gemm";
+static const char* LOCAL_PC_STR     = "nn";
+
 int main( int argc, char** argv )
 {
 	obj_t    a, b, c;
@@ -70,65 +72,43 @@ int main( int argc, char** argv )
 	double   dtime_save;
 	double   gflops;
 
-	//bli_init();
-
-	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
-
-	n_repeats = 3;
-
-	dt      = DT;
-
-	ind     = IND;
-
-#if 1
-	p_begin = P_BEGIN;
-	p_max   = P_MAX;
-	p_inc   = P_INC;
-
-	m_input = -1;
-	n_input = -1;
-	k_input = -1;
-#else
-	p_begin = 40;
-	p_max   = 1000;
-	p_inc   = 40;
-
-	m_input = -1;
-	n_input = -1;
-	k_input = -1;
-#endif
-
+	params_t params;
 
 	// Supress compiler warnings about unused variable 'ind'.
 	( void )ind;
 
-#if 0
 
-	cntx_t* cntx;
+	//bli_init();
+
+	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+	// Parse the command line options into strings, integers, enums,
+	// and doubles, as appropriate.
+	parse_cl_params( argc, argv, init_def_params, &params );
 
-	ind_t ind_mod = ind;
+	dt        = params.dt;
 
-	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod );
+	ind       = params.im;
 
-	// Set k to the kc blocksize for the current datatype.
-	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
+	p_begin   = params.sta;
+	p_max     = params.end;
+	p_inc     = params.inc;
 
-#elif 1
+	m_input   = params.m;
+	n_input   = params.n;
+	k_input   = params.k;
 
-	//k_input = 256;
+	n_repeats = params.nr;
 
-#endif
 
-	// Choose the char corresponding to the requested datatype.
-	if      ( bli_is_float( dt ) )    dt_ch = 's';
-	else if ( bli_is_double( dt ) )   dt_ch = 'd';
-	else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
-	else                              dt_ch = 'z';
+	// Map the datatype to its corresponding char.
+	bli_param_map_blis_to_char_dt( dt, &dt_ch );
 
-	transa = BLIS_NO_TRANSPOSE;
-	transb = BLIS_NO_TRANSPOSE;
+	// Map the parameter chars to their corresponding BLIS enum type values.
+	bli_param_map_char_to_blis_trans( params.pc_str[0], &transa );
+	bli_param_map_char_to_blis_trans( params.pc_str[1], &transb );
 
+	// Map the BLIS enum type values to their corresponding BLAS chars.
 	bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
 	bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
 
@@ -136,8 +116,8 @@ int main( int argc, char** argv )
 	// matlab allocates space for the entire array once up-front.
 	for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
 
-	printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
-	printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
+	printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, IMPL_STR );
+	printf( "( %4lu, 1:4 ) = [ %5lu %5lu %5lu %8.2f ];\n",
 	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0,
@@ -158,17 +138,20 @@ int main( int argc, char** argv )
 		bli_obj_create( dt, 1, 1, 0, 0, &alpha );
 		bli_obj_create( dt, 1, 1, 0, 0, &beta );
 
-	#ifdef COL_STORAGE
-		bli_obj_create( dt, m, k, 0, 0, &a );
-		bli_obj_create( dt, k, n, 0, 0, &b );
-		bli_obj_create( dt, m, n, 0, 0, &c );
-		bli_obj_create( dt, m, n, 0, 0, &c_save );
-	#else
-		bli_obj_create( dt, m, k, k, 1, &a );
-		bli_obj_create( dt, k, n, n, 1, &b );
-		bli_obj_create( dt, m, n, n, 1, &c );
-		bli_obj_create( dt, m, n, n, 1, &c_save );
-	#endif
+		// Choose the storage of each matrix based on the corresponding
+		// char in the params_t struct. Note that the expected order of
+		// storage specifiers in sc_str is CAB (not ABC).
+		if ( params.sc_str[1] == 'c' ) bli_obj_create( dt, m, k, 0, 0, &a );
+		else                           bli_obj_create( dt, m, k, k, 1, &a );
+
+		if ( params.sc_str[2] == 'c' ) bli_obj_create( dt, k, n, 0, 0, &b );
+		else                           bli_obj_create( dt, k, n, n, 1, &b );
+
+		if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &c );
+		else                           bli_obj_create( dt, m, n, n, 1, &c );
+
+		if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &c_save );
+		else                           bli_obj_create( dt, m, n, n, 1, &c_save );
 
 		bli_randm( &a );
 		bli_randm( &b );
@@ -177,12 +160,18 @@ int main( int argc, char** argv )
 		bli_obj_set_conjtrans( transa, &a );
 		bli_obj_set_conjtrans( transb, &b );
 
-		bli_setsc(  (2.0/1.0), 0.0, &alpha );
-		bli_setsc(  (1.0/1.0), 0.0, &beta );
+		//bli_setsc(  (2.0/1.0), 0.0, &alpha );
+		//bli_setsc(  (1.0/1.0), 0.0, &beta );
+		bli_setsc( params.alpha, 0.0, &alpha );
+		bli_setsc( params.beta,  0.0, &beta );
+
+		//bli_printm( "alpha:", &alpha, "%7.4e", "" );
+		//bli_printm( "beta: ", &beta,  "%7.4e", "" );
 
 		bli_copym( &c, &c_save );
 
-#if 0 //def BLIS
+#ifdef BLIS
+		// Switch to the induced method specified by ind.
 		bli_ind_disable_all_dt( dt );
 		bli_ind_enable_dt( ind, dt );
 #endif
@@ -196,58 +185,66 @@ int main( int argc, char** argv )
 		void* bp = bli_obj_buffer_at_off( &b );
 		void* cp = bli_obj_buffer_at_off( &c );
 
-	#ifdef COL_STORAGE
-		const int os_a = bli_obj_col_stride( &a );
-		const int os_b = bli_obj_col_stride( &b );
-		const int os_c = bli_obj_col_stride( &c );
-	#else
-		const int os_a = bli_obj_row_stride( &a );
-		const int os_b = bli_obj_row_stride( &b );
-		const int os_c = bli_obj_row_stride( &c );
-	#endif
+		int os_a, os_b, os_c;
+
+		if ( params.sc_str[0] == 'c' )
+		{
+			os_a = bli_obj_col_stride( &a );
+			os_b = bli_obj_col_stride( &b );
+			os_c = bli_obj_col_stride( &c );
+		}
+		else
+		{
+			os_a = bli_obj_row_stride( &a );
+			os_b = bli_obj_row_stride( &b );
+			os_c = bli_obj_row_stride( &c );
+		}
 
 		Stride<Dynamic,1> stride_a( os_a, 1 );
 		Stride<Dynamic,1> stride_b( os_b, 1 );
 		Stride<Dynamic,1> stride_c( os_c, 1 );
 
-	#ifdef COL_STORAGE
-		#if defined(IS_FLOAT)
-		typedef Matrix<float,                Dynamic, Dynamic, ColMajor> MatrixXf_;
-		#elif defined (IS_DOUBLE)
-		typedef Matrix<double,               Dynamic, Dynamic, ColMajor> MatrixXd_;
-		#elif defined (IS_SCOMPLEX)
-		typedef Matrix<std::complex<float>,  Dynamic, Dynamic, ColMajor> MatrixXcf_;
-		#elif defined (IS_DCOMPLEX)
-		typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXcd_;
-		#endif
-	#else
-		#if defined(IS_FLOAT)
-		typedef Matrix<float,                Dynamic, Dynamic, RowMajor> MatrixXf_;
-		#elif defined (IS_DOUBLE)
-		typedef Matrix<double,               Dynamic, Dynamic, RowMajor> MatrixXd_;
-		#elif defined (IS_SCOMPLEX)
-		typedef Matrix<std::complex<float>,  Dynamic, Dynamic, RowMajor> MatrixXcf_;
-		#elif defined (IS_DCOMPLEX)
-		typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXcd_;
-		#endif
-	#endif
-	#if defined(IS_FLOAT)
-		Map<MatrixXf_,  0, Stride<Dynamic,1> > A( ( float*  )ap, m, k, stride_a );
-		Map<MatrixXf_,  0, Stride<Dynamic,1> > B( ( float*  )bp, k, n, stride_b );
-		Map<MatrixXf_,  0, Stride<Dynamic,1> > C( ( float*  )cp, m, n, stride_c );
-	#elif defined (IS_DOUBLE)
-		Map<MatrixXd_,  0, Stride<Dynamic,1> > A( ( double* )ap, m, k, stride_a );
-		Map<MatrixXd_,  0, Stride<Dynamic,1> > B( ( double* )bp, k, n, stride_b );
-		Map<MatrixXd_,  0, Stride<Dynamic,1> > C( ( double* )cp, m, n, stride_c );
-	#elif defined (IS_SCOMPLEX)
-		Map<MatrixXcf_, 0, Stride<Dynamic,1> > A( ( std::complex<float>*  )ap, m, k, stride_a );
-		Map<MatrixXcf_, 0, Stride<Dynamic,1> > B( ( std::complex<float>*  )bp, k, n, stride_b );
-		Map<MatrixXcf_, 0, Stride<Dynamic,1> > C( ( std::complex<float>*  )cp, m, n, stride_c );
-	#elif defined (IS_DCOMPLEX)
-		Map<MatrixXcd_, 0, Stride<Dynamic,1> > A( ( std::complex<double>* )ap, m, k, stride_a );
-		Map<MatrixXcd_, 0, Stride<Dynamic,1> > B( ( std::complex<double>* )bp, k, n, stride_b );
-		Map<MatrixXcd_, 0, Stride<Dynamic,1> > C( ( std::complex<double>* )cp, m, n, stride_c );
-	#endif
+		typedef Matrix<float,                Dynamic, Dynamic, ColMajor> MatrixXs_c;
+		typedef Matrix<double,               Dynamic, Dynamic, ColMajor> MatrixXd_c;
+		typedef Matrix<std::complex<float>,  Dynamic, Dynamic, ColMajor> MatrixXc_c;
+		typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXz_c;
+
+		typedef Matrix<float,                Dynamic, Dynamic, RowMajor> MatrixXs_r;
+		typedef Matrix<double,               Dynamic, Dynamic, RowMajor> MatrixXd_r;
+		typedef Matrix<std::complex<float>,  Dynamic, Dynamic, RowMajor> MatrixXc_r;
+		typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXz_r;
+
+		Map<MatrixXs_c, 0, Stride<Dynamic,1> > As_c(               ( float*  )ap, m, k, stride_a );
+		Map<MatrixXs_c, 0, Stride<Dynamic,1> > Bs_c(               ( float*  )bp, k, n, stride_b );
+		Map<MatrixXs_c, 0, Stride<Dynamic,1> > Cs_c(               ( float*  )cp, m, n, stride_c );
+
+		Map<MatrixXd_c, 0, Stride<Dynamic,1> > Ad_c(               ( double* )ap, m, k, stride_a );
+		Map<MatrixXd_c, 0, Stride<Dynamic,1> > Bd_c(               ( double* )bp, k, n, stride_b );
+		Map<MatrixXd_c, 0, Stride<Dynamic,1> > Cd_c(               ( double* )cp, m, n, stride_c );
+
+		Map<MatrixXc_c, 0, Stride<Dynamic,1> > Ac_c( ( std::complex<float>*  )ap, m, k, stride_a );
+		Map<MatrixXc_c, 0, Stride<Dynamic,1> > Bc_c( ( std::complex<float>*  )bp, k, n, stride_b );
+		Map<MatrixXc_c, 0, Stride<Dynamic,1> > Cc_c( ( std::complex<float>*  )cp, m, n, stride_c );
+
+		Map<MatrixXz_c, 0, Stride<Dynamic,1> > Az_c( ( std::complex<double>* )ap, m, k, stride_a );
+		Map<MatrixXz_c, 0, Stride<Dynamic,1> > Bz_c( ( std::complex<double>* )bp, k, n, stride_b );
+		Map<MatrixXz_c, 0, Stride<Dynamic,1> > Cz_c( ( std::complex<double>* )cp, m, n, stride_c );
+
+		Map<MatrixXs_r, 0, Stride<Dynamic,1> > As_r(               ( float*  )ap, m, k, stride_a );
+		Map<MatrixXs_r, 0, Stride<Dynamic,1> > Bs_r(               ( float*  )bp, k, n, stride_b );
+		Map<MatrixXs_r, 0, Stride<Dynamic,1> > Cs_r(               ( float*  )cp, m, n, stride_c );
+
+		Map<MatrixXd_r, 0, Stride<Dynamic,1> > Ad_r(               ( double* )ap, m, k, stride_a );
+		Map<MatrixXd_r, 0, Stride<Dynamic,1> > Bd_r(               ( double* )bp, k, n, stride_b );
+		Map<MatrixXd_r, 0, Stride<Dynamic,1> > Cd_r(               ( double* )cp, m, n, stride_c );
+
+		Map<MatrixXc_r, 0, Stride<Dynamic,1> > Ac_r( ( std::complex<float>*  )ap, m, k, stride_a );
+		Map<MatrixXc_r, 0, Stride<Dynamic,1> > Bc_r( ( std::complex<float>*  )bp, k, n, stride_b );
+		Map<MatrixXc_r, 0, Stride<Dynamic,1> > Cc_r( ( std::complex<float>*  )cp, m, n, stride_c );
+
+		Map<MatrixXz_r, 0, Stride<Dynamic,1> > Az_r( ( std::complex<double>* )ap, m, k, stride_a );
+		Map<MatrixXz_r, 0, Stride<Dynamic,1> > Bz_r( ( std::complex<double>* )bp, k, n, stride_b );
+		Map<MatrixXz_r, 0, Stride<Dynamic,1> > Cz_r( ( std::complex<double>* )cp, m, n, stride_c );
 #endif
 
 		dtime_save = DBL_MAX;
@@ -274,7 +271,22 @@ int main( int argc, char** argv )
 
 #elif defined(EIGEN)
 
-			C.noalias() += alpha_r * A * B;
+			//C.noalias() += alpha_r * A * B;
+
+			if ( params.sc_str[0] == 'c' )
+			{
+				if      ( params.dt_str[0] == 's' ) Cs_c.noalias() += alpha_r * As_c * Bs_c;
+				else if ( params.dt_str[0] == 'd' ) Cd_c.noalias() += alpha_r * Ad_c * Bd_c;
+				else if ( params.dt_str[0] == 'c' ) Cc_c.noalias() += alpha_r * Ac_c * Bc_c;
+				else if ( params.dt_str[0] == 'z' ) Cz_c.noalias() += alpha_r * Az_c * Bz_c;
+			}
+			else // if ( params.sc_str[0] == 'r' )
+			{
+				if      ( params.dt_str[0] == 's' ) Cs_r.noalias() += alpha_r * As_r * Bs_r;
+				else if ( params.dt_str[0] == 'd' ) Cd_r.noalias() += alpha_r * Ad_r * Bd_r;
+				else if ( params.dt_str[0] == 'c' ) Cc_r.noalias() += alpha_r * Ac_r * Bc_r;
+				else if ( params.dt_str[0] == 'z' ) Cz_r.noalias() += alpha_r * Az_r * Bz_r;
+			}
 
 #else // if defined(BLAS)
 
@@ -293,15 +305,15 @@ int main( int argc, char** argv )
 				float*    cp     = ( float* )bli_obj_buffer( &c );
 
 				sgemm_( &f77_transa,
-						&f77_transb,
-						&mm,
-						&nn,
-						&kk,
-						alphap,
-						ap, &lda,
-						bp, &ldb,
-						betap,
-						cp, &ldc );
+				        &f77_transb,
+				        &mm,
+				        &nn,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        bp, &ldb,
+				        betap,
+				        cp, &ldc );
 			}
 			else if ( bli_is_double( dt ) )
 			{
@@ -318,15 +330,15 @@ int main( int argc, char** argv )
 				double*   cp     = ( double* )bli_obj_buffer( &c );
 
 				dgemm_( &f77_transa,
-						&f77_transb,
-						&mm,
-						&nn,
-						&kk,
-						alphap,
-						ap, &lda,
-						bp, &ldb,
-						betap,
-						cp, &ldc );
+				        &f77_transb,
+				        &mm,
+				        &nn,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        bp, &ldb,
+				        betap,
+				        cp, &ldc );
 			}
 			else if ( bli_is_scomplex( dt ) )
 			{
@@ -343,15 +355,15 @@ int main( int argc, char** argv )
 				scomplex* cp     = ( scomplex* )bli_obj_buffer( &c );
 
 				cgemm_( &f77_transa,
-						&f77_transb,
-						&mm,
-						&nn,
-						&kk,
-						alphap,
-						ap, &lda,
-						bp, &ldb,
-						betap,
-						cp, &ldc );
+				        &f77_transb,
+				        &mm,
+				        &nn,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        bp, &ldb,
+				        betap,
+				        cp, &ldc );
 			}
 			else if ( bli_is_dcomplex( dt ) )
 			{
@@ -368,15 +380,15 @@ int main( int argc, char** argv )
 				dcomplex* cp     = ( dcomplex* )bli_obj_buffer( &c );
 
 				zgemm_( &f77_transa,
-						&f77_transb,
-						&mm,
-						&nn,
-						&kk,
-						alphap,
-						ap, &lda,
-						bp, &ldb,
-						betap,
-						cp, &ldc );
+				        &f77_transb,
+				        &mm,
+				        &nn,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        bp, &ldb,
+				        betap,
+				        cp, &ldc );
 			}
 #endif
 
@@ -392,12 +404,13 @@ int main( int argc, char** argv )
 
 		if ( bli_is_complex( dt ) ) gflops *= 4.0;
 
-		printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
-		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
+		printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, IMPL_STR );
+		printf( "( %4lu, 1:4 ) = [ %5lu %5lu %5lu %8.2f ];\n",
 		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )k,
 		        ( unsigned long )n, gflops );
+		fflush( stdout );
 
 		bli_obj_free( &alpha );
 		bli_obj_free( &beta );
@@ -413,3 +426,25 @@ int main( int argc, char** argv )
 	return 0;
 }
 
+void init_def_params( params_t* params )
+{
+	params->opname    = LOCAL_OPNAME_STR;
+	params->impl      = IMPL_STR;
+
+	params->pc_str    = LOCAL_PC_STR;
+	params->dt_str    = GLOB_DEF_DT_STR;
+	params->sc_str    = GLOB_DEF_SC_STR;
+
+	params->im_str    = GLOB_DEF_IM_STR;
+
+	params->ps_str    = GLOB_DEF_PS_STR;
+	params->m_str     = GLOB_DEF_M_STR;
+	params->n_str     = GLOB_DEF_N_STR;
+	params->k_str     = GLOB_DEF_K_STR;
+
+	params->nr_str    = GLOB_DEF_NR_STR;
+
+	params->alpha_str = GLOB_DEF_ALPHA_STR;
+	params->beta_str  = GLOB_DEF_BETA_STR;
+}
+
diff --git a/test/3/test_hemm.c b/test/3/test_hemm.c
index 537378d43..d04d8cab2 100644
--- a/test/3/test_hemm.c
+++ b/test/3/test_hemm.c
@@ -34,9 +34,13 @@
 
 #include <unistd.h>
 #include "blis.h"
+#include "test_utils.h"
 
 //#define PRINT
 
+static const char* LOCAL_OPNAME_STR = "hemm";
+static const char* LOCAL_PC_STR     = "ll";
+
 int main( int argc, char** argv )
 {
 	obj_t    a, b, c;
@@ -59,54 +63,42 @@ int main( int argc, char** argv )
 	double   dtime_save;
 	double   gflops;
 
-	//bli_init();
-
-	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
-
-	n_repeats = 3;
-
-	dt      = DT;
-
-	ind     = IND;
-
-	p_begin = P_BEGIN;
-	p_max   = P_MAX;
-	p_inc   = P_INC;
-
-	m_input = -1;
-	n_input = -1;
-
+	params_t params;
 
 	// Supress compiler warnings about unused variable 'ind'.
 	( void )ind;
 
-#if 0
 
-	cntx_t* cntx;
+	//bli_init();
 
-	ind_t ind_mod = ind;
+	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
 
-	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod );
+	// Parse the command line options into strings, integers, enums,
+	// and doubles, as appropriate.
+	parse_cl_params( argc, argv, init_def_params, &params );
 
-	// Set k to the kc blocksize for the current datatype.
-	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
+	dt        = params.dt;
 
-#elif 1
+	ind       = params.im;
 
-	//k_input = 256;
+	p_begin   = params.sta;
+	p_max     = params.end;
+	p_inc     = params.inc;
 
-#endif
+	m_input   = params.m;
+	n_input   = params.n;
 
-	// Choose the char corresponding to the requested datatype.
-	if      ( bli_is_float( dt ) )    dt_ch = 's';
-	else if ( bli_is_double( dt ) )   dt_ch = 'd';
-	else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
-	else                              dt_ch = 'z';
+	n_repeats = params.nr;
 
-	side  = BLIS_LEFT;
-	uploa = BLIS_LOWER;
 
+	// Map the datatype to its corresponding char.
+	bli_param_map_blis_to_char_dt( dt, &dt_ch );
+
+	// Map the parameter chars to their corresponding BLIS enum type values.
+	bli_param_map_char_to_blis_side( params.pc_str[0], &side );
+	bli_param_map_char_to_blis_uplo( params.pc_str[1], &uploa );
+
+	// Map the BLIS enum type values to their corresponding BLAS chars.
 	bli_param_map_blis_to_netlib_side( side, &f77_side );
 	bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa );
 
@@ -114,8 +106,8 @@ int main( int argc, char** argv )
 	// matlab allocates space for the entire array once up-front.
 	for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
 
-	printf( "data_%s_%chemm_%s", THR_STR, dt_ch, STR );
-	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
+	printf( "data_%s_%chemm_%s", THR_STR, dt_ch, IMPL_STR );
+	printf( "( %4lu, 1:3 ) = [ %5lu %5lu %8.2f ];\n",
 	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
@@ -133,13 +125,28 @@ int main( int argc, char** argv )
 		bli_obj_create( dt, 1, 1, 0, 0, &alpha );
 		bli_obj_create( dt, 1, 1, 0, 0, &beta );
 
-		if ( bli_is_left( side ) )
-			bli_obj_create( dt, m, m, 0, 0, &a );
-		else
-			bli_obj_create( dt, n, n, 0, 0, &a );
-		bli_obj_create( dt, m, n, 0, 0, &b );
-		bli_obj_create( dt, m, n, 0, 0, &c );
-		bli_obj_create( dt, m, n, 0, 0, &c_save );
+		// Choose the storage of each matrix based on the corresponding
+		// char in the params_t struct. Note that the expected order of
+		// storage specifiers in sc_str is CAB (not ABC).
+		if ( params.sc_str[1] == 'c' )
+		{
+			if ( bli_is_left( side ) ) bli_obj_create( dt, m, m, 0, 0, &a );
+			else                       bli_obj_create( dt, n, n, 0, 0, &a );
+		}
+		else // if ( params.sc_str[1] == 'r' )
+		{
+			if ( bli_is_left( side ) ) bli_obj_create( dt, m, m, m, 1, &a );
+			else                       bli_obj_create( dt, n, n, n, 1, &a );
+		}
+
+		if ( params.sc_str[2] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &b );
+		else                           bli_obj_create( dt, m, n, n, 1, &b );
+
+		if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &c );
+		else                           bli_obj_create( dt, m, n, n, 1, &c );
+
+		if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &c_save );
+		else                           bli_obj_create( dt, m, n, n, 1, &c_save );
 
 		bli_randm( &a );
 		bli_randm( &b );
@@ -153,12 +160,15 @@ int main( int argc, char** argv )
 		bli_mkherm( &a );
 		bli_mktrim( &a );
 
-		bli_setsc(  (2.0/1.0), 0.0, &alpha );
-		bli_setsc(  (1.0/1.0), 0.0, &beta );
+		//bli_setsc(  (2.0/1.0), 0.0, &alpha );
+		//bli_setsc(  (1.0/1.0), 0.0, &beta );
+		bli_setsc( params.alpha, 0.0, &alpha );
+		bli_setsc( params.beta,  0.0, &beta );
 
 		bli_copym( &c, &c_save );
-	
-#if 0 //def BLIS
+
+#ifdef BLIS
+		// Switch to the induced method specified by ind.
 		bli_ind_disable_all_dt( dt );
 		bli_ind_enable_dt( ind, dt );
 #endif
@@ -202,14 +212,14 @@ int main( int argc, char** argv )
 				float*    cp     = ( float* )bli_obj_buffer( &c );
 
 				ssymm_( &f77_side,
-						&f77_uploa,
-						&mm,
-						&nn,
-						alphap,
-						ap, &lda,
-						bp, &ldb,
-						betap,
-						cp, &ldc );
+				        &f77_uploa,
+				        &mm,
+				        &nn,
+				        alphap,
+				        ap, &lda,
+				        bp, &ldb,
+				        betap,
+				        cp, &ldc );
 			}
 			else if ( bli_is_double( dt ) )
 			{
@@ -225,14 +235,14 @@ int main( int argc, char** argv )
 				double*   cp     = ( double* )bli_obj_buffer( &c );
 
 				dsymm_( &f77_side,
-						&f77_uploa,
-						&mm,
-						&nn,
-						alphap,
-						ap, &lda,
-						bp, &ldb,
-						betap,
-						cp, &ldc );
+				        &f77_uploa,
+				        &mm,
+				        &nn,
+				        alphap,
+				        ap, &lda,
+				        bp, &ldb,
+				        betap,
+				        cp, &ldc );
 			}
 			else if ( bli_is_scomplex( dt ) )
 			{
@@ -256,14 +266,14 @@ int main( int argc, char** argv )
 #endif
 
 				chemm_( &f77_side,
-						&f77_uploa,
-						&mm,
-						&nn,
-						alphap,
-						ap, &lda,
-						bp, &ldb,
-						betap,
-						cp, &ldc );
+				        &f77_uploa,
+				        &mm,
+				        &nn,
+				        alphap,
+				        ap, &lda,
+				        bp, &ldb,
+				        betap,
+				        cp, &ldc );
 			}
 			else if ( bli_is_dcomplex( dt ) )
 			{
@@ -287,14 +297,14 @@ int main( int argc, char** argv )
 #endif
 
 				zhemm_( &f77_side,
-						&f77_uploa,
-						&mm,
-						&nn,
-						alphap,
-						ap, &lda,
-						bp, &ldb,
-						betap,
-						cp, &ldc );
+				        &f77_uploa,
+				        &mm,
+				        &nn,
+				        alphap,
+				        ap, &lda,
+				        bp, &ldb,
+				        betap,
+				        cp, &ldc );
 			}
 #endif
 
@@ -313,11 +323,12 @@ int main( int argc, char** argv )
 
 		if ( bli_is_complex( dt ) ) gflops *= 4.0;
 
-		printf( "data_%s_%chemm_%s", THR_STR, dt_ch, STR );
-		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
+		printf( "data_%s_%chemm_%s", THR_STR, dt_ch, IMPL_STR );
+		printf( "( %4lu, 1:3 ) = [ %5lu %5lu %8.2f ];\n",
 		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )n, gflops );
+		fflush( stdout );
 
 		bli_obj_free( &alpha );
 		bli_obj_free( &beta );
@@ -333,3 +344,24 @@ int main( int argc, char** argv )
 	return 0;
 }
 
+void init_def_params( params_t* params )
+{
+	params->opname    = LOCAL_OPNAME_STR;
+	params->impl      = IMPL_STR;
+
+	params->pc_str    = LOCAL_PC_STR;
+	params->dt_str    = GLOB_DEF_DT_STR;
+	params->sc_str    = GLOB_DEF_SC_STR;
+
+	params->im_str    = GLOB_DEF_IM_STR;
+
+	params->ps_str    = GLOB_DEF_PS_STR;
+	params->m_str     = GLOB_DEF_M_STR;
+	params->n_str     = GLOB_DEF_N_STR;
+
+	params->nr_str    = GLOB_DEF_NR_STR;
+
+	params->alpha_str = GLOB_DEF_ALPHA_STR;
+	params->beta_str  = GLOB_DEF_BETA_STR;
+}
+
diff --git a/test/3/test_herk.c b/test/3/test_herk.c
index 6dbaf1936..a713b6766 100644
--- a/test/3/test_herk.c
+++ b/test/3/test_herk.c
@@ -35,9 +35,13 @@
 
 #include <unistd.h>
 #include "blis.h"
+#include "test_utils.h"
 
 //#define PRINT
 
+static const char* LOCAL_OPNAME_STR = "herk";
+static const char* LOCAL_PC_STR     = "ln";
+
 int main( int argc, char** argv )
 {
 	obj_t    a, c;
@@ -60,55 +64,43 @@ int main( int argc, char** argv )
 	double   dtime_save;
 	double   gflops;
 
-	//bli_init();
-
-	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
-
-	n_repeats = 3;
-
-	dt      = DT;
-	dt_real = bli_dt_proj_to_real( DT );
-
-	ind     = IND;
-
-	p_begin = P_BEGIN;
-	p_max   = P_MAX;
-	p_inc   = P_INC;
-
-	m_input = -1;
-	k_input = -1;
-
+	params_t params;
 
 	// Supress compiler warnings about unused variable 'ind'.
 	( void )ind;
 
-#if 0
 
-	cntx_t* cntx;
+	//bli_init();
+
+	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+	// Parse the command line options into strings, integers, enums,
+	// and doubles, as appropriate.
+	parse_cl_params( argc, argv, init_def_params, &params );
 
-	ind_t ind_mod = ind;
+	dt        = params.dt;
+	dt_real   = bli_dt_proj_to_real( dt );
 
-	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod );
+	ind       = params.im;
 
-	// Set k to the kc blocksize for the current datatype.
-	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
+	p_begin   = params.sta;
+	p_max     = params.end;
+	p_inc     = params.inc;
 
-#elif 1
+	m_input   = params.m;
+	k_input   = params.k;
 
-	//k_input = 256;
+	n_repeats = params.nr;
 
-#endif
 
-	// Choose the char corresponding to the requested datatype.
-	if      ( bli_is_float( dt ) )    dt_ch = 's';
-	else if ( bli_is_double( dt ) )   dt_ch = 'd';
-	else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
-	else                              dt_ch = 'z';
+	// Map the datatype to its corresponding char.
+	bli_param_map_blis_to_char_dt( dt, &dt_ch );
 
-	uploc  = BLIS_LOWER;
-	transa = BLIS_NO_TRANSPOSE;
+	// Map the parameter chars to their corresponding BLIS enum type values.
+	bli_param_map_char_to_blis_uplo( params.pc_str[0], &uploc );
+	bli_param_map_char_to_blis_trans( params.pc_str[1], &transa );
 
+	// Map the BLIS enum type values to their corresponding BLAS chars.
 	bli_param_map_blis_to_netlib_uplo( uploc, &f77_uploc );
 	bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
 
@@ -116,8 +108,8 @@ int main( int argc, char** argv )
 	// matlab allocates space for the entire array once up-front.
 	for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
 
-	printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR );
-	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
+	printf( "data_%s_%cherk_%s", THR_STR, dt_ch, IMPL_STR );
+	printf( "( %4lu, 1:3 ) = [ %5lu %5lu %8.2f ];\n",
 	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
@@ -135,15 +127,25 @@ int main( int argc, char** argv )
 		bli_obj_create( dt_real, 1, 1, 0, 0, &alpha );
 		bli_obj_create( dt,      1, 1, 0, 0, &beta );
 
-		if ( bli_does_trans( transa ) )
-			bli_obj_create( dt, k, m, 0, 0, &a );
-        else
-			bli_obj_create( dt, m, k, 0, 0, &a );
-		bli_obj_create( dt, m, m, 0, 0, &c );
-		//bli_obj_create( dt, m, k, 2, 2*m, &a );
-		//bli_obj_create( dt, k, n, 2, 2*k, &b );
-		//bli_obj_create( dt, m, n, 2, 2*m, &c );
-		bli_obj_create( dt, m, m, 0, 0, &c_save );
+		// Choose the storage of each matrix based on the corresponding
+		// char in the params_t struct. Note that the expected order of
+		// storage specifiers in sc_str is CA (not AC).
+		if ( params.sc_str[1] == 'c' )
+		{
+			if ( bli_does_trans( transa ) ) bli_obj_create( dt, k, m, 0, 0, &a );
+			else                            bli_obj_create( dt, m, k, 0, 0, &a );
+		}
+		else // if ( params.sc_str[1] == 'r' )
+		{
+			if ( bli_does_trans( transa ) ) bli_obj_create( dt, k, m, m, 1, &a );
+			else                            bli_obj_create( dt, m, k, k, 1, &a );
+		}
+
+		if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, m, 0, 0, &c );
+		else                           bli_obj_create( dt, m, m, m, 1, &c );
+
+		if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, m, 0, 0, &c_save );
+		else                           bli_obj_create( dt, m, m, m, 1, &c_save );
 
 		bli_randm( &a );
 		bli_randm( &c );
@@ -151,14 +153,22 @@ int main( int argc, char** argv )
 		bli_obj_set_struc( BLIS_HERMITIAN, &c );
 		bli_obj_set_uplo( uploc, &c );
 
+		// Make C densely Hermitian, and zero the unstored triangle to
+		// ensure the implementation reads only from the stored region.
+		bli_mkherm( &c );
+		bli_mktrim( &c );
+
 		bli_obj_set_conjtrans( transa, &a );
 
-		bli_setsc(  (2.0/1.0), 0.0, &alpha );
-		bli_setsc(  (1.0/1.0), 0.0, &beta );
+		//bli_setsc(  (2.0/1.0), 0.0, &alpha );
+		//bli_setsc(  (1.0/1.0), 0.0, &beta );
+		bli_setsc( params.alpha, 0.0, &alpha );
+		bli_setsc( params.beta,  0.0, &beta );
 
 		bli_copym( &c, &c_save );
-	
-#if 0 //def BLIS
+
+#ifdef BLIS
+		// Switch to the induced method specified by ind.
 		bli_ind_disable_all_dt( dt );
 		bli_ind_enable_dt( ind, dt );
 #endif
@@ -197,13 +207,13 @@ int main( int argc, char** argv )
 				float*    cp     = ( float* )bli_obj_buffer( &c );
 
 				ssyrk_( &f77_uploc,
-						&f77_transa,
-						&mm,
-						&kk,
-						alphap,
-						ap, &lda,
-						betap,
-						cp, &ldc );
+				        &f77_transa,
+				        &mm,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        betap,
+				        cp, &ldc );
 			}
 			else if ( bli_is_double( dt ) )
 			{
@@ -217,13 +227,13 @@ int main( int argc, char** argv )
 				double*   cp     = ( double* )bli_obj_buffer( &c );
 
 				dsyrk_( &f77_uploc,
-						&f77_transa,
-						&mm,
-						&kk,
-						alphap,
-						ap, &lda,
-						betap,
-						cp, &ldc );
+				        &f77_transa,
+				        &mm,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        betap,
+				        cp, &ldc );
 			}
 			else if ( bli_is_scomplex( dt ) )
 			{
@@ -244,13 +254,13 @@ int main( int argc, char** argv )
 #endif
 
 				cherk_( &f77_uploc,
-						&f77_transa,
-						&mm,
-						&kk,
-						alphap,
-						ap, &lda,
-						betap,
-						cp, &ldc );
+				        &f77_transa,
+				        &mm,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        betap,
+				        cp, &ldc );
 			}
 			else if ( bli_is_dcomplex( dt ) )
 			{
@@ -271,13 +281,13 @@ int main( int argc, char** argv )
 #endif
 
 				zherk_( &f77_uploc,
-						&f77_transa,
-						&mm,
-						&kk,
-						alphap,
-						ap, &lda,
-						betap,
-						cp, &ldc );
+				        &f77_transa,
+				        &mm,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        betap,
+				        cp, &ldc );
 			}
 #endif
 
@@ -293,11 +303,12 @@ int main( int argc, char** argv )
 
 		if ( bli_is_complex( dt ) ) gflops *= 4.0;
 
-		printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR );
-		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
+		printf( "data_%s_%cherk_%s", THR_STR, dt_ch, IMPL_STR );
+		printf( "( %4lu, 1:3 ) = [ %5lu %5lu %8.2f ];\n",
 		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )k, gflops );
+		fflush( stdout );
 
 		bli_obj_free( &alpha );
 		bli_obj_free( &beta );
@@ -312,3 +323,24 @@ int main( int argc, char** argv )
 	return 0;
 }
 
+void init_def_params( params_t* params )
+{
+	params->opname    = LOCAL_OPNAME_STR;
+	params->impl      = IMPL_STR;
+
+	params->pc_str    = LOCAL_PC_STR;
+	params->dt_str    = GLOB_DEF_DT_STR;
+	params->sc_str    = GLOB_DEF_SC_STR;
+
+	params->im_str    = GLOB_DEF_IM_STR;
+
+	params->ps_str    = GLOB_DEF_PS_STR;
+	params->m_str     = GLOB_DEF_M_STR;
+	params->k_str     = GLOB_DEF_K_STR;
+
+	params->nr_str    = GLOB_DEF_NR_STR;
+
+	params->alpha_str = GLOB_DEF_ALPHA_STR;
+	params->beta_str  = GLOB_DEF_BETA_STR;
+}
+
diff --git a/test/3/test_trmm.c b/test/3/test_trmm.c
index 4e58b95fa..2ecbb19b1 100644
--- a/test/3/test_trmm.c
+++ b/test/3/test_trmm.c
@@ -35,9 +35,13 @@
 
 #include <unistd.h>
 #include "blis.h"
+#include "test_utils.h"
 
 //#define PRINT
 
+static const char* LOCAL_OPNAME_STR = "trmm";
+static const char* LOCAL_PC_STR     = "llnn";
+
 int main( int argc, char** argv )
 {
 	obj_t    a, c;
@@ -64,64 +68,44 @@ int main( int argc, char** argv )
 	double   dtime_save;
 	double   gflops;
 
-	//bli_init();
-
-	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
-
-	n_repeats = 3;
-
-	dt      = DT;
-
-	ind     = IND;
-
-	p_begin = P_BEGIN;
-	p_max   = P_MAX;
-	p_inc   = P_INC;
-
-	m_input = -1;
-	n_input = -1;
-
+	params_t params;
 
 	// Supress compiler warnings about unused variable 'ind'.
 	( void )ind;
 
-#if 0
 
-	cntx_t* cntx;
+	//bli_init();
 
-	ind_t ind_mod = ind;
+	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
 
-	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod );
+	// Parse the command line options into strings, integers, enums,
+	// and doubles, as appropriate.
+	parse_cl_params( argc, argv, init_def_params, &params );
 
-	// Set k to the kc blocksize for the current datatype.
-	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
+	dt        = params.dt;
 
-#elif 1
+	ind       = params.im;
 
-	//k_input = 256;
+	p_begin   = params.sta;
+	p_max     = params.end;
+	p_inc     = params.inc;
 
-#endif
+	m_input   = params.m;
+	n_input   = params.n;
 
-	// Choose the char corresponding to the requested datatype.
-	if      ( bli_is_float( dt ) )    dt_ch = 's';
-	else if ( bli_is_double( dt ) )   dt_ch = 'd';
-	else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
-	else                              dt_ch = 'z';
+	n_repeats = params.nr;
 
-#if 0
-	side   = BLIS_LEFT;
-#else
-	side   = BLIS_RIGHT;
-#endif
-#if 0
-	uploa  = BLIS_LOWER;
-#else
-	uploa  = BLIS_UPPER;
-#endif
-	transa = BLIS_NO_TRANSPOSE;
-	diaga  = BLIS_NONUNIT_DIAG;
 
+	// Map the datatype to its corresponding char.
+	bli_param_map_blis_to_char_dt( dt, &dt_ch );
+
+	// Map the parameter chars to their corresponding BLIS enum type values.
+	bli_param_map_char_to_blis_side( params.pc_str[0], &side );
+	bli_param_map_char_to_blis_uplo( params.pc_str[1], &uploa );
+	bli_param_map_char_to_blis_trans( params.pc_str[2], &transa );
+	bli_param_map_char_to_blis_diag( params.pc_str[3], &diaga );
+
+	// Map the BLIS enum type values to their corresponding BLAS chars.
 	bli_param_map_blis_to_netlib_side( side, &f77_side );
 	bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa );
 	bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
@@ -131,8 +115,8 @@ int main( int argc, char** argv )
 	// matlab allocates space for the entire array once up-front.
 	for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
 
-	printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR );
-	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
+	printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, IMPL_STR );
+	printf( "( %4lu, 1:3 ) = [ %5lu %5lu %8.2f ];\n",
 	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
@@ -149,12 +133,26 @@ int main( int argc, char** argv )
 
 		bli_obj_create( dt, 1, 1, 0, 0, &alpha );
 
-		if ( bli_is_left( side ) )
-			bli_obj_create( dt, m, m, 0, 0, &a );
-        else
-			bli_obj_create( dt, n, n, 0, 0, &a );
-		bli_obj_create( dt, m, n, 0, 0, &c );
-		bli_obj_create( dt, m, n, 0, 0, &c_save );
+		// Choose the storage of each matrix based on the corresponding
+		// char in the params_t struct. Note that the expected order of
+		// storage specifiers in sc_str is CA (not AC). Also note that
+		// C plays the role of matrix B.
+		if ( params.sc_str[1] == 'c' )
+		{
+			if ( bli_is_left( side ) ) bli_obj_create( dt, m, m, 0, 0, &a );
+			else                       bli_obj_create( dt, n, n, 0, 0, &a );
+		}
+		else // if ( params.sc_str[1] == 'r' )
+		{
+			if ( bli_is_left( side ) ) bli_obj_create( dt, m, m, m, 1, &a );
+			else                       bli_obj_create( dt, n, n, n, 1, &a );
+		}
+
+		if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &c );
+		else                           bli_obj_create( dt, m, n, n, 1, &c );
+
+		if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &c_save );
+		else                           bli_obj_create( dt, m, n, n, 1, &c_save );
 
 		bli_randm( &a );
 		bli_randm( &c );
@@ -164,14 +162,16 @@ int main( int argc, char** argv )
 		bli_obj_set_conjtrans( transa, &a );
 		bli_obj_set_diag( diaga, &a );
 
-		bli_randm( &a );
+		// Zero the unstored triangle.
 		bli_mktrim( &a );
 
-		bli_setsc(  (2.0/1.0), 0.0, &alpha );
+		//bli_setsc(  (2.0/1.0), 0.0, &alpha );
+		bli_setsc( params.alpha, 0.0, &alpha );
 
 		bli_copym( &c, &c_save );
-	
-#if 0 //def BLIS
+
+#ifdef BLIS
+		// Switch to the induced method specified by ind.
 		bli_ind_disable_all_dt( dt );
 		bli_ind_enable_dt( ind, dt );
 #endif
@@ -209,14 +209,14 @@ int main( int argc, char** argv )
 				float*    cp     = ( float* )bli_obj_buffer( &c );
 
 				strmm_( &f77_side,
-						&f77_uploa,
-						&f77_transa,
-						&f77_diaga,
-						&mm,
-						&kk,
-						alphap,
-						ap, &lda,
-						cp, &ldc );
+				        &f77_uploa,
+				        &f77_transa,
+				        &f77_diaga,
+				        &mm,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        cp, &ldc );
 			}
 			else if ( bli_is_double( dt ) )
 			{
@@ -229,14 +229,14 @@ int main( int argc, char** argv )
 				double*   cp     = ( double* )bli_obj_buffer( &c );
 
 				dtrmm_( &f77_side,
-						&f77_uploa,
-						&f77_transa,
-						&f77_diaga,
-						&mm,
-						&kk,
-						alphap,
-						ap, &lda,
-						cp, &ldc );
+				        &f77_uploa,
+				        &f77_transa,
+				        &f77_diaga,
+				        &mm,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        cp, &ldc );
 			}
 			else if ( bli_is_scomplex( dt ) )
 			{
@@ -255,14 +255,14 @@ int main( int argc, char** argv )
 #endif
 
 				ctrmm_( &f77_side,
-						&f77_uploa,
-						&f77_transa,
-						&f77_diaga,
-						&mm,
-						&kk,
-						alphap,
-						ap, &lda,
-						cp, &ldc );
+				        &f77_uploa,
+				        &f77_transa,
+				        &f77_diaga,
+				        &mm,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        cp, &ldc );
 			}
 			else if ( bli_is_dcomplex( dt ) )
 			{
@@ -281,14 +281,14 @@ int main( int argc, char** argv )
 #endif
 
 				ztrmm_( &f77_side,
-						&f77_uploa,
-						&f77_transa,
-						&f77_diaga,
-						&mm,
-						&kk,
-						alphap,
-						ap, &lda,
-						cp, &ldc );
+				        &f77_uploa,
+				        &f77_transa,
+				        &f77_diaga,
+				        &mm,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        cp, &ldc );
 			}
 #endif
 
@@ -307,11 +307,12 @@ int main( int argc, char** argv )
 
 		if ( bli_is_complex( dt ) ) gflops *= 4.0;
 
-		printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR );
-		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
+		printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, IMPL_STR );
+		printf( "( %4lu, 1:3 ) = [ %5lu %5lu %8.2f ];\n",
 		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )n, gflops );
+		fflush( stdout );
 
 		bli_obj_free( &alpha );
 
@@ -325,3 +326,24 @@ int main( int argc, char** argv )
 	return 0;
 }
 
+void init_def_params( params_t* params )
+{
+	params->opname    = LOCAL_OPNAME_STR;
+	params->impl      = IMPL_STR;
+
+	params->pc_str    = LOCAL_PC_STR;
+	params->dt_str    = GLOB_DEF_DT_STR;
+	params->sc_str    = GLOB_DEF_SC_STR;
+
+	params->im_str    = GLOB_DEF_IM_STR;
+
+	params->ps_str    = GLOB_DEF_PS_STR;
+	params->m_str     = GLOB_DEF_M_STR;
+	params->n_str     = GLOB_DEF_N_STR;
+
+	params->nr_str    = GLOB_DEF_NR_STR;
+
+	params->alpha_str = GLOB_DEF_ALPHA_STR;
+	params->beta_str  = GLOB_DEF_BETA_STR;
+}
+
diff --git a/test/3/test_trsm.c b/test/3/test_trsm.c
index 4897d4627..4b92f6128 100644
--- a/test/3/test_trsm.c
+++ b/test/3/test_trsm.c
@@ -35,9 +35,13 @@
 
 #include <unistd.h>
 #include "blis.h"
+#include "test_utils.h"
 
 //#define PRINT
 
+static const char* LOCAL_OPNAME_STR = "trsm";
+static const char* LOCAL_PC_STR     = "llnn";
+
 int main( int argc, char** argv )
 {
 	obj_t    a, c;
@@ -64,64 +68,44 @@ int main( int argc, char** argv )
 	double   dtime_save;
 	double   gflops;
 
-	//bli_init();
-
-	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
-
-	n_repeats = 3;
-
-	dt      = DT;
-
-	ind     = IND;
-
-	p_begin = P_BEGIN;
-	p_max   = P_MAX;
-	p_inc   = P_INC;
-
-	m_input = -1;
-	n_input = -1;
-
+	params_t params;
 
 	// Supress compiler warnings about unused variable 'ind'.
 	( void )ind;
 
-#if 0
 
-	cntx_t* cntx;
+	//bli_init();
 
-	ind_t ind_mod = ind;
+	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
 
-	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod );
+	// Parse the command line options into strings, integers, enums,
+	// and doubles, as appropriate.
+	parse_cl_params( argc, argv, init_def_params, &params );
 
-	// Set k to the kc blocksize for the current datatype.
-	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
+	dt        = params.dt;
 
-#elif 1
+	ind       = params.im;
 
-	//k_input = 256;
+	p_begin   = params.sta;
+	p_max     = params.end;
+	p_inc     = params.inc;
 
-#endif
+	m_input   = params.m;
+	n_input   = params.n;
 
-	// Choose the char corresponding to the requested datatype.
-	if      ( bli_is_float( dt ) )    dt_ch = 's';
-	else if ( bli_is_double( dt ) )   dt_ch = 'd';
-	else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
-	else                              dt_ch = 'z';
+	n_repeats = params.nr;
 
-#if 0
-	side   = BLIS_LEFT;
-#else
-	side   = BLIS_RIGHT;
-#endif
-#if 0
-	uploa  = BLIS_LOWER;
-#else
-	uploa  = BLIS_UPPER;
-#endif
-	transa = BLIS_NO_TRANSPOSE;
-	diaga  = BLIS_NONUNIT_DIAG;
 
+	// Map the datatype to its corresponding char.
+	bli_param_map_blis_to_char_dt( dt, &dt_ch );
+
+	// Map the parameter chars to their corresponding BLIS enum type values.
+	bli_param_map_char_to_blis_side( params.pc_str[0], &side );
+	bli_param_map_char_to_blis_uplo( params.pc_str[1], &uploa );
+	bli_param_map_char_to_blis_trans( params.pc_str[2], &transa );
+	bli_param_map_char_to_blis_diag( params.pc_str[3], &diaga );
+
+	// Map the BLIS enum type values to their corresponding BLAS chars.
 	bli_param_map_blis_to_netlib_side( side, &f77_side );
 	bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa );
 	bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
@@ -131,8 +115,8 @@ int main( int argc, char** argv )
 	// matlab allocates space for the entire array once up-front.
 	for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
 
-	printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR );
-	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
+	printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, IMPL_STR );
+	printf( "( %4lu, 1:3 ) = [ %5lu %5lu %8.2f ];\n",
 	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
@@ -149,13 +133,26 @@ int main( int argc, char** argv )
 
 		bli_obj_create( dt, 1, 1, 0, 0, &alpha );
 
-		if ( bli_is_left( side ) )
-			bli_obj_create( dt, m, m, 0, 0, &a );
-        else
-			bli_obj_create( dt, n, n, 0, 0, &a );
-		bli_obj_create( dt, m, n, 0, 0, &c );
-		//bli_obj_create( dt, m, n, n, 1, &c );
-		bli_obj_create( dt, m, n, 0, 0, &c_save );
+		// Choose the storage of each matrix based on the corresponding
+		// char in the params_t struct. Note that the expected order of
+		// storage specifiers in sc_str is CA (not AC). Also note that
+		// C plays the role of matrix B.
+		if ( params.sc_str[1] == 'c' )
+		{
+			if ( bli_is_left( side ) ) bli_obj_create( dt, m, m, 0, 0, &a );
+			else                       bli_obj_create( dt, n, n, 0, 0, &a );
+		}
+		else // if ( params.sc_str[1] == 'r' )
+		{
+			if ( bli_is_left( side ) ) bli_obj_create( dt, m, m, m, 1, &a );
+			else                       bli_obj_create( dt, n, n, n, 1, &a );
+		}
+
+		if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &c );
+		else                           bli_obj_create( dt, m, n, n, 1, &c );
+
+		if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &c_save );
+		else                           bli_obj_create( dt, m, n, n, 1, &c_save );
 
 		bli_randm( &a );
 		bli_randm( &c );
@@ -165,17 +162,19 @@ int main( int argc, char** argv )
 		bli_obj_set_conjtrans( transa, &a );
 		bli_obj_set_diag( diaga, &a );
 
-		bli_randm( &a );
+		// Zero the unstored triangle.
 		bli_mktrim( &a );
 
 		// Load the diagonal of A to make it more likely to be invertible.
 		bli_shiftd( &BLIS_TWO, &a );
 
-		bli_setsc(  (2.0/1.0), 0.0, &alpha );
+		//bli_setsc(  (2.0/1.0), 0.0, &alpha );
+		bli_setsc( params.alpha, 0.0, &alpha );
 
 		bli_copym( &c, &c_save );
-	
-#if 0 //def BLIS
+
+#ifdef BLIS
+		// Switch to the induced method specified by ind.
 		bli_ind_disable_all_dt( dt );
 		bli_ind_enable_dt( ind, dt );
 #endif
@@ -213,14 +212,14 @@ int main( int argc, char** argv )
 				float*    cp     = ( float* )bli_obj_buffer( &c );
 
 				strsm_( &f77_side,
-						&f77_uploa,
-						&f77_transa,
-						&f77_diaga,
-						&mm,
-						&kk,
-						alphap,
-						ap, &lda,
-						cp, &ldc );
+				        &f77_uploa,
+				        &f77_transa,
+				        &f77_diaga,
+				        &mm,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        cp, &ldc );
 			}
 			else if ( bli_is_double( dt ) )
 			{
@@ -233,14 +232,14 @@ int main( int argc, char** argv )
 				double*   cp     = ( double* )bli_obj_buffer( &c );
 
 				dtrsm_( &f77_side,
-						&f77_uploa,
-						&f77_transa,
-						&f77_diaga,
-						&mm,
-						&kk,
-						alphap,
-						ap, &lda,
-						cp, &ldc );
+				        &f77_uploa,
+				        &f77_transa,
+				        &f77_diaga,
+				        &mm,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        cp, &ldc );
 			}
 			else if ( bli_is_scomplex( dt ) )
 			{
@@ -259,14 +258,14 @@ int main( int argc, char** argv )
 #endif
 
 				ctrsm_( &f77_side,
-						&f77_uploa,
-						&f77_transa,
-						&f77_diaga,
-						&mm,
-						&kk,
-						alphap,
-						ap, &lda,
-						cp, &ldc );
+				        &f77_uploa,
+				        &f77_transa,
+				        &f77_diaga,
+				        &mm,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        cp, &ldc );
 			}
 			else if ( bli_is_dcomplex( dt ) )
 			{
@@ -285,14 +284,14 @@ int main( int argc, char** argv )
 #endif
 
 				ztrsm_( &f77_side,
-						&f77_uploa,
-						&f77_transa,
-						&f77_diaga,
-						&mm,
-						&kk,
-						alphap,
-						ap, &lda,
-						cp, &ldc );
+				        &f77_uploa,
+				        &f77_transa,
+				        &f77_diaga,
+				        &mm,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        cp, &ldc );
 			}
 #endif
 
@@ -311,11 +310,12 @@ int main( int argc, char** argv )
 
 		if ( bli_is_complex( dt ) ) gflops *= 4.0;
 
-		printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR );
-		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
+		printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, IMPL_STR );
+		printf( "( %4lu, 1:3 ) = [ %5lu %5lu %8.2f ];\n",
 		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )n, gflops );
+		fflush( stdout );
 
 		bli_obj_free( &alpha );
 
@@ -329,3 +329,24 @@ int main( int argc, char** argv )
 	return 0;
 }
 
+// Install the trsm driver's defaults; command line options override them.
+void init_def_params( params_t* params )
+{
+	params->opname    = LOCAL_OPNAME_STR;
+	params->impl      = IMPL_STR;
+	params->verbose   = FALSE;
+
+	params->pc_str    = LOCAL_PC_STR;
+	params->dt_str    = GLOB_DEF_DT_STR;
+	params->sc_str    = GLOB_DEF_SC_STR;
+	params->im_str    = GLOB_DEF_IM_STR;
+
+	params->ps_str    = GLOB_DEF_PS_STR;
+	params->m_str     = GLOB_DEF_M_STR;
+	params->n_str     = GLOB_DEF_N_STR;
+	params->k_str     = GLOB_DEF_K_STR; // unused by trsm, but still parsed
+	params->nr_str    = GLOB_DEF_NR_STR;
+	params->alpha_str = GLOB_DEF_ALPHA_STR;
+	params->beta_str  = GLOB_DEF_BETA_STR;
+}
+
diff --git a/test/3/test_utils.c b/test/3/test_utils.c
new file mode 100644
index 000000000..8e441d055
--- /dev/null
+++ b/test/3/test_utils.c
@@ -0,0 +1,684 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "test_utils.h"
+
+// Global string constants: default option strings used by the drivers.
+const char* GLOB_DEF_DT_STR    = "d";          // -d: double-precision real
+const char* GLOB_DEF_SC_STR    = "ccc";        // -s: every operand col-stored
+const char* GLOB_DEF_IM_STR    = "native";     // -i: native execution
+
+const char* GLOB_DEF_PS_STR    = "50 1000 50"; // -p: 'lo hi in' size sweep
+const char* GLOB_DEF_M_STR     = "-1";         // -m: bind m to problem size
+const char* GLOB_DEF_N_STR     = "-1";         // -n: bind n to problem size
+const char* GLOB_DEF_K_STR     = "-1";         // -k (presumably): bind k to p
+
+const char* GLOB_DEF_NR_STR    = "3";          // -r: repetitions per size
+
+const char* GLOB_DEF_ALPHA_STR = "1.0";        // -a: alpha scalar
+const char* GLOB_DEF_BETA_STR  = "1.0";        // -b: beta scalar
+
+
+// Parse argv into params: fp() sets defaults, options override them, and
+// proc_params() converts the strings. Exits on -h or on any bad argument.
+void parse_cl_params( int argc, char** argv, init_fp fp, params_t* params )
+{
+	bool     gave_option_c = FALSE;
+	bool     gave_option_d = FALSE;
+	bool     gave_option_s = FALSE;
+
+	bool     gave_option_i = FALSE;
+
+	bool     gave_option_p = FALSE;
+	bool     gave_option_m = FALSE;
+	bool     gave_option_n = FALSE;
+	bool     gave_option_k = FALSE;
+
+	bool     gave_option_r = FALSE;
+
+	bool     gave_option_a = FALSE;
+	bool     gave_option_b = FALSE;
+
+	int      opt;
+	char     opt_ch;
+
+	getopt_t state;
+
+	// Seed fields that a driver's init function may leave unset so that
+	// proc_params() and the verbose check never read indeterminate values.
+	params->n_str   = GLOB_DEF_N_STR;
+	params->k_str   = GLOB_DEF_K_STR;
+	params->verbose = FALSE;
+
+	// Initialize the params_t struct with the caller-supplied function.
+	fp( params );
+
+	// Copy the binary name pointer so we can use it later.
+	params->bin = argv[0];
+
+	// Alias the binary name for conciseness.
+	const char* bin = params->bin;
+
+	// Initialize the state for running bli_getopt(). Here, 0 is the
+	// initial value for opterr, which suppresses error messages.
+	bli_getopt_init_state( 0, &state );
+
+	// Process all option arguments until we get a -1, which means we're done.
+	while( (opt = bli_getopt( argc, ( const char* const * )argv, "c:d:s:i:p:m:n:k:r:a:b:qvh", &state )) != -1 )
+	{
+		// Explicitly typecast opt, which is an int, to a char. (Failing to
+		// typecast resulted in at least one user-reported problem whereby
+		// opt was being filled with garbage.)
+		opt_ch = ( char )opt;
+
+		switch( opt_ch )
+		{
+			case 'c':
+			params->pc_str = state.optarg;
+			gave_option_c = TRUE;
+			break;
+
+			case 'd':
+			params->dt_str = state.optarg;
+			gave_option_d = TRUE;
+			break;
+
+			case 's':
+			params->sc_str = state.optarg;
+			gave_option_s = TRUE;
+			break;
+
+			case 'i':
+			params->im_str = state.optarg;
+			gave_option_i = TRUE;
+			break;
+
+			case 'p':
+			params->ps_str = state.optarg;
+			gave_option_p = TRUE;
+			break;
+
+			case 'm':
+			params->m_str = state.optarg;
+			gave_option_m = TRUE;
+			break;
+
+			case 'n':
+			params->n_str = state.optarg;
+			gave_option_n = TRUE;
+			break;
+
+			case 'k':
+			params->k_str = state.optarg;
+			gave_option_k = TRUE;
+			break;
+
+			case 'r':
+			params->nr_str = state.optarg;
+			gave_option_r = TRUE;
+			break;
+
+			case 'a':
+			params->alpha_str = state.optarg;
+			gave_option_a = TRUE;
+			break;
+
+			case 'b':
+			params->beta_str = state.optarg;
+			gave_option_b = TRUE;
+			break;
+
+			case 'q':
+			params->verbose = FALSE;
+			break;
+
+			case 'v':
+			params->verbose = TRUE;
+			break;
+
+			case 'h':
+			{
+				bool has_trans = FALSE;
+				bool has_side  = FALSE;
+				bool has_uplo  = FALSE;
+				bool has_unit  = FALSE;
+
+				if ( is_gemm( params ) ||
+					 is_herk( params ) ||
+					 is_trmm( params ) ||
+					 is_trsm( params ) ) has_trans = TRUE;
+
+				if ( is_hemm( params ) ||
+					 is_trmm( params ) ||
+					 is_trsm( params ) ) has_side = TRUE;
+
+				if ( is_hemm( params ) ||
+					 is_herk( params ) ||
+					 is_trmm( params ) ||
+					 is_trsm( params ) ) has_uplo = TRUE;
+
+				if ( is_trmm( params ) ||
+					 is_trsm( params ) ) has_unit = TRUE;
+
+				printf( "\n" );
+				printf( "  %s performance driver\n", params->opname );
+				printf( "  -----------------------\n" );
+				printf( "  (part of the BLIS framework)\n" );
+				printf( "\n" );
+				printf( "  Measure performance of the '%s' implementation of the '%s' operation:\n", params->impl, params->opname );
+				printf( "\n" );
+				if ( is_gemm( params ) )
+				{
+				printf( "      C := beta * C + alpha * trans(A) * trans(B)\n" );
+				printf( "\n" );
+				printf( "  where C is an m x n matrix, trans(A) is an m x k matrix, and\n" );
+				printf( "  trans(B) is a k x n matrix.\n" );
+				}
+				else if ( is_hemm( params ) )
+				{
+				printf( "      C := beta * C + alpha * uplo(A) * B     (side = left)\n" );
+				printf( "      C := beta * C + alpha * B * uplo(A)     (side = right)\n" );
+				printf( "\n" );
+				printf( "  where C and B are m x n matrices and A is a Hermitian matrix stored\n" );
+				printf( "  in the lower or upper triangle, as specified by uplo(A). When side =\n" );
+				printf( "  left, A is m x m, and when side = right, A is n x n.\n" );
+				}
+				else if ( is_herk( params ) )
+				{
+				printf( "      uplo(C) := beta * uplo(C) + alpha * trans(A) * trans(A)^H\n" );
+				printf( "\n" );
+				printf( "  where C is an m x m Hermitian matrix stored in the lower or upper\n" );
+				printf( "  triangle, as specified by uplo(C), and trans(A) is an m x k matrix.\n" );
+				}
+				else if ( is_trmm( params ) )
+				{
+				printf( "      B := alpha * trans(uplo(A)) * B      (side = left)\n" );
+				printf( "      B := alpha * B * trans(uplo(A))      (side = right)\n" );
+				printf( "\n" );
+				printf( "  where B is an m x n matrix and A is a triangular matrix stored in\n" );
+				printf( "  the lower or upper triangle, as specified by uplo(A), with unit/non-unit\n" );
+				printf( "  diagonal specified by diag(A). When side = left, A is m x m, and when\n" );
+				printf( "  side = right, A is n x n.\n" );
+				}
+				else if ( is_trsm( params ) )
+				{
+				printf( "      B := alpha * trans(uplo(A))^{-1} * B     (side = left)\n" );
+				printf( "      B := alpha * B * trans(uplo(A))^{-1}     (side = right)\n" );
+				printf( "\n" );
+				printf( "  where B is an m x n matrix and A is a triangular matrix stored in\n" );
+				printf( "  the lower or upper triangle, as specified by uplo(A), with unit/non-unit\n" );
+				printf( "  diagonal specified by diag(A). When side = left, A is m x m, and when\n" );
+				printf( "  side = right, A is n x n. Note that while ^{-1} indicates inversion,\n" );
+				printf( "  trsm does not explicitly invert A, but rather solves for an m x n\n" );
+				printf( "  solution matrix X, which then overwrites the original contents of B.\n" );
+				}
+				printf( "\n" );
+				printf( "  Performance measurements are taken for a range of problem sizes with a fixed\n" );
+				printf( "  set of parameters, and results are printed to stdout in a matlab/octave-\n" );
+				printf( "  friendly format.\n" );
+				printf( "\n" );
+				printf( "  Usage:\n" );
+				printf( "\n" );
+				printf( "    %s [options]\n", bin );
+				printf( "\n" );
+				printf( "  The following computational options are supported:\n" );
+				printf( "\n" );
+				printf( "    -c pc\n" );
+				printf( "            Use the operation-specific parameter combination specified by\n" );
+				printf( "            the 'pc' string. The following tables list expected parameters\n" );
+				printf( "            for the '%s' operation and the valid values for each parameter.\n", params->opname );
+				printf( "\n" );
+				printf( "               Operation   List (order) of parameters          Example\n" );
+				printf( "               -------------------------------------------------------\n" );
+				if ( is_gemm( params ) )
+				{
+				printf( "               gemm        trans(A) trans(B)                   -c tn\n" );
+				}
+				else if ( is_hemm( params ) )
+				{
+				printf( "               hemm/symm   side(A) uplo(A)                     -c rl\n" );
+				}
+				else if ( is_herk( params ) )
+				{
+				printf( "               herk/syrk   uplo(C) trans(A)                    -c ln\n" );
+				}
+				else if ( is_trmm( params ) )
+				{
+				printf( "               trmm        side(A) uplo(A) trans(A) unit(A)    -c lutn\n" );
+				}
+				else if ( is_trsm( params ) )
+				{
+				printf( "               trsm        side(A) uplo(A) trans(A) unit(A)    -c rlnn\n" );
+				}
+				printf( "\n" );
+				printf( "                           Valid\n" );
+				printf( "               Param       chars    Interpretation\n" );
+				printf( "               ---------------------------------------\n" );
+				if ( has_trans )
+				{
+				printf( "               trans       n        No transpose\n" );
+				printf( "                           t        Transpose only\n" );
+				printf( "                           c        Conjugate only*\n" );
+				printf( "                           h        Hermitian transpose\n" );
+				printf( "\n" );
+				}
+				if ( has_side )
+				{
+				printf( "               side        l        Left\n" );
+				printf( "                           r        Right\n" );
+				printf( "\n" );
+				}
+				if ( has_uplo )
+				{
+				printf( "               uplo        l        Lower-stored\n" );
+				printf( "                           u        Upper-stored\n" );
+				printf( "\n" );
+				}
+				if ( has_unit )
+				{
+				printf( "               unit        u        Unit diagonal\n" );
+				printf( "                           n        Non-unit diagonal\n" );
+				printf( "\n" );
+				}
+				if ( has_trans )
+				{
+				printf( "               *This option is supported by BLIS but not by classic BLAS.\n" );
+				}
+				printf( "\n" );
+				printf( "    -d dt\n" );
+				printf( "            Allocate matrix elements using the datatype character specified\n" );
+				printf( "            by dt, and also perform computation in that same datatype. Valid\n" );
+				printf( "            char values for dt are:\n" );
+				printf( "\n" );
+				printf( "               Valid\n" );
+				printf( "               chars     Interpretation\n" );
+				printf( "               -----------------------------------------\n" );
+				printf( "                s        single-precision real domain\n" );
+				printf( "                d        double-precision real domain\n" );
+				printf( "                c        single-precision complex domain\n" );
+				printf( "                z        double-precision complex domain\n" );
+				printf( "\n" );
+				printf( "    -s sc\n" );
+				printf( "            Use the characters in sc to determine the storage formats\n" );
+				printf( "            of each operand matrix used in the performance measurements.\n" );
+				printf( "            Valid chars are 'r' (row storage) and 'c' (column storage).\n" );
+				printf( "            The characters encode the storage format for each operand\n" );
+				printf( "            used by %s, with the mapping of chars to operand interpreted\n", params->opname );
+				printf( "            in the following order:\n" );
+				printf( "\n" );
+				printf( "                            Order of\n" );
+				printf( "                            operand      \n" );
+				printf( "               Operation    mapping      Example     Interpretation\n" );
+				printf( "               ----------------------------------------------------------\n" );
+				if ( is_gemm( params ) )
+				{
+				printf( "               gemm         C A B        -s crr      C is col-stored;\n" );
+				printf( "                                                     A and B are row-stored.\n" );
+				}
+				else if ( is_hemm( params ) )
+				{
+				printf( "               hemm/symm    C A B        -s rcc      C is row-stored;\n" );
+				printf( "                                                     A and B are col-stored.\n" );
+				}
+				else if ( is_herk( params ) )
+				{
+				printf( "               herk/syrk    C A          -s rc       C is row-stored;\n" );
+				printf( "                                                     A is col-stored.\n" );
+				}
+				else if ( is_trmm( params ) )
+				{
+				printf( "               trmm         B A          -s cr       B is col-stored;\n" );
+				printf( "                                                     A is row-stored.\n" );
+				}
+				else if ( is_trsm( params ) )
+				{
+				printf( "               trsm         B A          -s cc       B and A are col-stored.\n" );
+				}
+				printf( "\n" );
+				printf( "    -i im\n" );
+				printf( "            Use native execution if im is 'native' (or 'nat'). Otherwise,\n" );
+				printf( "            if im is '1m', use the 1m method to induce complex computation\n" );
+				printf( "            using the equivalent real-domain microkernels.\n" );
+				printf( "\n" );
+				printf( "    -p 'lo hi in'\n" );
+				printf( "            Perform a sweep of measurements of problem sizes ranging from \n" );
+				printf( "            'lo' to 'hi' in increments of 'in'. Note that measurements will\n" );
+				printf( "            be taken in descending order, starting from 'hi', and so 'lo'\n" );
+				printf( "            will act as a floor and may not be measured (see 2nd example).\n" );
+				printf( "\n" );
+				printf( "               Example             Interpretation\n" );
+				printf( "               -------------------------------------------------------\n" );
+				printf( "               -p '40 400 40'      Measure performance from 40 to 400\n" );
+				printf( "                                   (inclusive) in increments of 40.\n" );
+				printf( "               -p '40 400 80'      Measure performance for problem sizes\n" );
+				printf( "                                   {80,160,240,320,400}.\n" );
+				printf( "\n" );
+				printf( "            Note that unlike the other option arguments, quotes are required\n" );
+				printf( "            around the 'lo hi in' string in order to facilitate parsing.\n" );
+				printf( "\n" );
+				printf( "    -m M\n" );
+				if ( is_gemm( params ) || is_hemm( params ) || is_trmm( params ) || is_trsm( params ) )
+				printf( "    -n N\n" );
+				if ( is_gemm( params ) || is_herk( params ) )
+				printf( "    -k K\n" );
+				if ( is_gemm( params ) )
+				{
+				printf( "            Bind the m, n, or k dimensions to M, N, or K, respectively.\n" );
+				printf( "            Binding of matrix dimensions takes place as follows:\n" );
+				}
+				else if ( is_herk( params ) )
+				{
+				printf( "            Bind the m or k dimensions to M or K, respectively. Binding\n" );
+				printf( "            of matrix dimensions takes place as follows:\n" );
+				}
+				else if ( is_hemm( params ) || is_trmm( params ) || is_trsm( params ) )
+				{
+				printf( "            Bind the m or n dimensions to M or N, respectively. Binding\n" );
+				printf( "            of matrix dimensions takes place as follows:\n" );
+				}
+				printf( "\n" );
+				printf( "               if 0 <  X: Bind the x dimension to X and hold it constant.\n" );
+				printf( "               if X = -1: Bind the x dimension to p.\n" );
+				printf( "               if X < -1: Bind the x dimension to p / abs(X).\n" );
+				printf( "\n" );
+				printf( "            where p is the current problem size. Note: X = 0 is undefined.\n" );
+				printf( "\n" );
+				printf( "               Examples             Interpretation\n" );
+				printf( "               ---------------------------------------------------------\n" );
+				if ( is_gemm( params ) )
+				{
+				printf( "               -m -1 -n -1 -k -1    Bind m, n, and k to the problem size\n" );
+				printf( "                                    to keep all matrices square.\n" );
+				printf( "               -m -1 -n -1 -k 100   Bind m and n to the problem size, but\n" );
+				printf( "                                    hold k = 100 constant.\n" );
+				}
+				else if ( is_hemm( params ) )
+				{
+				printf( "               -m -1 -n -1          Bind m and n to the problem size to\n" );
+				printf( "                                    keep all matrices square.\n" );
+				printf( "               -m -1 -n 500         Bind m to the problem size, but hold\n" );
+				printf( "                                    n = 500 constant.\n" );
+				}
+				else if ( is_herk( params ) )
+				{
+				printf( "               -m -1 -k -1          Bind m and k to the problem size to\n" );
+				printf( "                                    keep both matrices square.\n" );
+				printf( "               -m -1 -k 200         Bind m to the problem size, but hold\n" );
+				printf( "                                    k = 200 constant.\n" );
+				}
+				else if ( is_trmm( params ) || is_trsm( params ) )
+				{
+				printf( "               -m -1 -n -1          Bind m and n to the problem size to\n" );
+				printf( "                                    keep both matrices square.\n" );
+				printf( "               -m -1 -n 300         Bind m to the problem size, but hold\n" );
+				printf( "                                    n = 300 constant.\n" );
+				}
+				printf( "\n" );
+				printf( "    -r num\n" );
+				printf( "            When measuring performance for a given problem size, perform num\n" );
+				printf( "            repetitions and report performance using the best timing.\n" );
+				printf( "\n" );
+				if ( is_gemm( params ) || is_hemm( params ) || is_herk( params ) )
+				{
+				printf( "    -a alpha\n" );
+				printf( "    -b beta\n" );
+				printf( "            Specify the value to use for the alpha and/or beta scalars.\n" );
+				}
+				else // if ( is_trmm( params ) || is_trsm( params ) )
+				{
+				printf( "    -a alpha\n" );
+				printf( "            Specify the value to use for the alpha scalar.\n" );
+				}
+				printf( "\n" );
+				printf( "  If any of the computational options is not specified, its default value will\n" );
+				printf( "  be used. (Please use the -v option to see how the driver is interpreting each\n" );
+				printf( "  option.)\n" );
+				printf( "\n" );
+				printf( "  The following IO options are also supported:\n" );
+				printf( "\n" );
+				printf( "    -q\n" );
+				printf( "    -v\n" );
+				printf( "            Enable quiet or verbose output. (By default, output is quiet.)\n" );
+				printf( "            The verbose option is useful if you are unsure whether your options\n" );
+				printf( "            are being interpreted as you intended.\n" );
+				printf( "\n" );
+				printf( "    -h\n" );
+				printf( "            Display this help and exit.\n" );
+				printf( "\n" );
+				printf( "\n" );
+
+				exit(0);
+				break;
+			}
+
+			case '?':
+			printf( "%s: unexpected option '%c' given or missing option argument\n", bin, state.optopt );
+			exit(1);
+			break;
+
+			default:
+			printf( "%s: unexpected option character returned from getopt: %c\n", bin, opt_ch );
+			exit(1);
+		}
+	}
+
+	// Process the command line options from strings to integers/enums/doubles,
+	// as appropriate.
+	proc_params( params );
+
+	// Inform the user of the values that were chosen (or defaulted to).
+	if ( params->verbose )
+	{
+		const char* def_str = " (default)";
+		const char* nul_str = " ";
+
+		printf( "%%\n" );
+		printf( "%% operation:              %s\n", params->opname );
+		printf( "%% parameter combination:  %s%s\n", params->pc_str, ( gave_option_c ? nul_str : def_str ) );
+		printf( "%% datatype:               %s%s\n", params->dt_str, ( gave_option_d ? nul_str : def_str ) );
+		printf( "%% storage combination:    %s%s\n", params->sc_str, ( gave_option_s ? nul_str : def_str ) );
+		printf( "%% induced method:         %s%s\n", params->im_str, ( gave_option_i ? nul_str : def_str ) );
+		printf( "%% problem size range:     %s%s\n", params->ps_str, ( gave_option_p ? nul_str : def_str ) );
+		printf( "%% m dim specifier:        %s%s\n", params->m_str, ( gave_option_m ? nul_str : def_str ) );
+		if ( is_gemm( params ) || is_hemm( params ) || is_trmm( params ) || is_trsm( params )  )
+		printf( "%% n dim specifier:        %s%s\n", params->n_str, ( gave_option_n ? nul_str : def_str ) );
+		if ( is_gemm( params ) || is_herk( params ) )
+		printf( "%% k dim specifier:        %s%s\n", params->k_str, ( gave_option_k ? nul_str : def_str ) );
+		printf( "%% number of repeats:      %s%s\n", params->nr_str, ( gave_option_r ? nul_str : def_str ) );
+		printf( "%% alpha scalar:           %s%s\n", params->alpha_str, ( gave_option_a ? nul_str : def_str ) );
+		if ( is_gemm( params ) || is_hemm( params ) || is_herk( params ) )
+		printf( "%% beta scalar:            %s%s\n", params->beta_str, ( gave_option_b ? nul_str : def_str ) );
+		printf( "%% ---\n" );
+		printf( "%% implementation:         %s\n", params->impl );
+		if ( params->nt == -1 )
+		printf( "%% number of threads:      %s\n", "unset (defaults to 1)" );
+		else
+		printf( "%% number of threads:      %ld\n", params->nt );
+		printf( "%% thread affinity:        %s\n", ( params->af_str == NULL ? "unset" : params->af_str ) );
+		printf( "%%\n" );
+	}
+
+	// If there are still arguments remaining after getopt() processing is
+	// complete, print an error.
+	if ( state.optind < argc )
+	{
+		printf( "%s: encountered unexpected non-option argument: %s\n", bin, argv[ state.optind ] );
+		exit(1);
+	}
+}
+
+// -----------------------------------------------------------------------------
+
+// Convert the option strings held in params into the values the drivers
+// consume:
+//   - nt, af_str:      thread count and affinity string from the environment
+//   - dt, im:          datatype and induced method
+//   - sta, end, inc:   problem size range
+//   - m, n, k, nr:     dimension specifiers and number of repeats
+//   - alpha, beta:     scalars
+// The remaining strings (bin, opname, impl, pc_str, sc_str) are used as-is.
+// (Called by parse_cl_params() once all options have been read.)
+// Prints a message and exits with status 1 if the induced method or the
+// problem size range is invalid.
+void proc_params( params_t* params )
+{
+	dim_t nt;
+
+	// Binary name doesn't need any conversion.
+
+	// Operation name doesn't need any conversion.
+
+	// Implementation name doesn't need any conversion.
+
+	// Query the multithreading strings and convert them to integers. A
+	// value of -1 is stored as-is; parse_cl_params() reports it as unset.
+	if ( strncmp( params->impl, "blis", MAX_STRING_SIZE ) == 0 )
+	{
+		nt = bli_thread_get_num_threads();
+	}
+	else if ( strncmp( params->impl, "mkl", MAX_STRING_SIZE ) == 0 )
+	{
+		nt = bli_env_get_var( "OMP_NUM_THREADS", -1 );
+
+		if ( nt == -1 ) nt = bli_env_get_var( "MKL_NUM_THREADS", -1 );
+	}
+	else if ( strncmp( params->impl, "openblas", MAX_STRING_SIZE ) == 0 )
+	{
+		nt = bli_env_get_var( "OMP_NUM_THREADS", -1 );
+
+		if ( nt == -1 ) nt = bli_env_get_var( "OPENBLAS_NUM_THREADS", -1 );
+	}
+	else
+	{
+		nt = bli_env_get_var( "OMP_NUM_THREADS", -1 );
+	}
+
+	// Store nt to the params_t struct.
+	params->nt = ( long int )nt;
+
+	// Store the affinity string pointer to the params_t struct.
+	// (parse_cl_params() treats a NULL af_str as unset.)
+	params->af_str = bli_env_get_str( "GOMP_CPU_AFFINITY" );
+
+	// Parameter combinations, datatype, and operand storage combination,
+	// need no conversion.
+
+	// Convert the datatype to a num_t. Only the first character of dt_str
+	// is examined.
+	bli_param_map_char_to_blis_dt( params->dt_str[0], &params->dt );
+
+	// Parse the induced method to the corresponding ind_t. Matching on the
+	// "nat" prefix accepts both spellings advertised by the -h text
+	// ('native' and 'nat'); a 6-character compare would reject 'nat'.
+	if      ( strncmp( params->im_str, "nat", 3 ) == 0 )
+	{
+		params->im = BLIS_NAT;
+	}
+	else if ( strncmp( params->im_str, "1m",  2 ) == 0 )
+	{
+		params->im = BLIS_1M;
+	}
+	else
+	{
+		printf( "%s: invalid induced method '%s'.\n", params->bin, params->im_str );
+		exit(1);
+	}
+
+	// Convert the problem size range to integers. All three fields must
+	// parse, since a partial match would leave sta/end/inc unset, and the
+	// increment must be positive (presumably a non-positive increment would
+	// keep the drivers' size sweep from ever terminating).
+	if ( sscanf( params->ps_str, "%ld %ld %ld", &(params->sta),
+	                                            &(params->end),
+	                                            &(params->inc) ) != 3 ||
+	     params->inc <= 0 )
+	{
+		printf( "%s: invalid problem size range '%s'.\n", params->bin, params->ps_str );
+		exit(1);
+	}
+
+	// Convert the dimension specifier strings to integers. These are not
+	// validated: a string that fails to parse leaves its field unchanged.
+	sscanf( params->m_str, "%ld", &(params->m) );
+	sscanf( params->n_str, "%ld", &(params->n) );
+	sscanf( params->k_str, "%ld", &(params->k) );
+
+	// Convert the number of repeats to an integer.
+	sscanf( params->nr_str, "%ld", &(params->nr) );
+
+	// Convert the alpha and beta strings to doubles. strtod() yields 0.0
+	// when no conversion can be performed.
+	params->alpha = strtod( params->alpha_str, NULL );
+	params->beta  = strtod( params->beta_str,  NULL );
+}
+
+// -----------------------------------------------------------------------------
+
+bool is_match( const char* str1, const char* str2 )
+{
+	// TRUE when the strings agree over at most MAX_STRING_SIZE characters.
+	return ( strncmp( str1, str2, MAX_STRING_SIZE ) == 0 ? TRUE : FALSE );
+}
+
+bool is_gemm( params_t* params )
+{
+	// TRUE when the operation name stored in params matches "gemm".
+	return is_match( params->opname, "gemm" );
+}
+
+bool is_hemm( params_t* params )
+{
+	// TRUE when the operation name stored in params matches "hemm".
+	return is_match( params->opname, "hemm" );
+}
+
+bool is_herk( params_t* params )
+{
+	// TRUE when the operation name stored in params matches "herk".
+	return is_match( params->opname, "herk" );
+}
+
+bool is_trmm( params_t* params )
+{
+	// TRUE when the operation name stored in params matches "trmm".
+	return is_match( params->opname, "trmm" );
+}
+
+bool is_trsm( params_t* params )
+{
+	// TRUE when the operation name stored in params matches "trsm".
+	return is_match( params->opname, "trsm" );
+}
+
diff --git a/test/3/test_utils.h b/test/3/test_utils.h
new file mode 100644
index 000000000..088f9ce97
--- /dev/null
+++ b/test/3/test_utils.h
@@ -0,0 +1,142 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifndef TEST_UTILS_H
+#define TEST_UTILS_H
+
+// Allow C++ users to include this header file in their source code. However,
+// we make the extern "C" conditional on whether we're using a C++ compiler,
+// since regular C compilers don't understand the extern "C" construct.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// String arrays allocated using this constant will always add 1 to
+// the value defined below, and so the total allocated will still be
+// a nice power of two.
+#define MAX_STRING_SIZE    31
+
+
+extern const char* GLOB_DEF_DT_STR;
+extern const char* GLOB_DEF_SC_STR;
+extern const char* GLOB_DEF_IM_STR;
+
+extern const char* GLOB_DEF_PS_STR;
+extern const char* GLOB_DEF_M_STR;
+extern const char* GLOB_DEF_N_STR;
+extern const char* GLOB_DEF_K_STR;
+
+extern const char* GLOB_DEF_NR_STR;
+
+extern const char* GLOB_DEF_ALPHA_STR;
+extern const char* GLOB_DEF_BETA_STR;
+
+
+typedef struct params_s
+{
+	// Binary name.
+	const char* bin;
+
+	// Operation name.
+	const char* opname;
+
+	// Implementation name.
+	const char* impl;
+
+	// Multithreading parameters: number of threads and affinity string.
+	const char* nt_str;
+	long int    nt;
+	const char* af_str;
+
+	// Parameter combinations, datatype, operand storage combination,
+	// and induced method.
+	const char* pc_str;
+	const char* dt_str;
+	const char* sc_str;
+	num_t dt;
+
+	const char* im_str;
+	ind_t im;
+
+	// Problem size range and dimension specifiers.
+	const char* ps_str;
+	const char* m_str;
+	const char* n_str;
+	const char* k_str;
+	long int sta;
+	long int end;
+	long int inc;
+	long int m;
+	long int n;
+	long int k;
+
+	// Number of repeats.
+	const char* nr_str;
+	long int nr;
+
+	// Value of alpha and beta.
+	const char* alpha_str;
+	const char* beta_str;
+	double alpha;
+	double beta;
+
+	// A flag controlling whether to print informational messages.
+	bool verbose;
+
+} params_t;
+
+typedef void (*init_fp)( params_t* params );
+
+// -----------------------------------------------------------------------------
+
+void init_def_params( params_t* params );
+void parse_cl_params( int argc, char** argv, init_fp fp, params_t* params );
+void proc_params( params_t* params );
+
+// -----------------------------------------------------------------------------
+
+bool is_match( const char* str1, const char* str2 );
+bool is_gemm( params_t* params );
+bool is_hemm( params_t* params );
+bool is_herk( params_t* params );
+bool is_trmm( params_t* params );
+bool is_trsm( params_t* params );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

From b861c71b50c6d48cb07282f44aa9dddffc1f1b3f Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Fri, 23 Sep 2022 13:22:27 -0500
Subject: [PATCH 089/230] Add consistent NaN/Inf handling in sumsqv. (#668)

Details:
- Changed sumsqv implementation as follows:
  - If there is a NaN (either real or imaginary), then return a sum of
    NaN and unit scale.
  - Else, if there is an Inf (either real or imaginary), then return a
    sum of +Inf and unit scale.
  - Otherwise behave as normal.
---
 frame/util/bli_util_unb_var1.c | 56 ++++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 10 deletions(-)

diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c
index 2b65c8460..3c501d107 100644
--- a/frame/util/bli_util_unb_var1.c
+++ b/frame/util/bli_util_unb_var1.c
@@ -1068,6 +1068,7 @@ void PASTEMAC(ch,varname) \
 	ctype_r scale_r; \
 	ctype_r sumsq_r; \
 	ctype_r abs_chi1_r; \
+	ctype_r abs_chi1_i; \
 	dim_t   i; \
 \
 	/* NOTE: This function attempts to mimic the algorithm for computing
@@ -1085,10 +1086,47 @@ void PASTEMAC(ch,varname) \
 		PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
 \
 		abs_chi1_r = bli_fabs( chi1_r ); \
+		abs_chi1_i = bli_fabs( chi1_i ); \
+\
+		if ( bli_isnan( abs_chi1_r ) ) \
+		{ \
+			sumsq_r = abs_chi1_r; \
+			scale_r = one_r; \
+		} \
+\
+		if ( bli_isnan( abs_chi1_i ) ) \
+		{ \
+			sumsq_r = abs_chi1_i; \
+			scale_r = one_r; \
+		} \
+\
+		if ( bli_isnan( sumsq_r ) ) \
+		{ \
+			chi1 += incx; \
+			continue; \
+		} \
+\
+		if ( bli_isinf( abs_chi1_r ) ) \
+		{ \
+			sumsq_r = abs_chi1_r; \
+			scale_r = one_r; \
+		} \
+\
+		if ( bli_isinf( abs_chi1_i ) ) \
+		{ \
+			sumsq_r = abs_chi1_i; \
+			scale_r = one_r; \
+		} \
+\
+		if ( bli_isinf( sumsq_r ) ) \
+		{ \
+			chi1 += incx; \
+			continue; \
+		} \
 \
 		/* Accumulate real component into sumsq, adjusting scale if
 		   needed. */ \
-		if ( abs_chi1_r > zero_r || bli_isnan( abs_chi1_r) ) \
+		if ( abs_chi1_r > zero_r ) \
 		{ \
 			if ( scale_r < abs_chi1_r ) \
 			{ \
@@ -1104,25 +1142,23 @@ void PASTEMAC(ch,varname) \
 				                    ( abs_chi1_r / scale_r );  \
 			} \
 		} \
-\
-		abs_chi1_r = bli_fabs( chi1_i ); \
 \
 		/* Accumulate imaginary component into sumsq, adjusting scale if
 		   needed. */ \
-		if ( abs_chi1_r > zero_r || bli_isnan( abs_chi1_r) ) \
+		if ( abs_chi1_i > zero_r ) \
 		{ \
-			if ( scale_r < abs_chi1_r ) \
+			if ( scale_r < abs_chi1_i ) \
 			{ \
 				sumsq_r = one_r + \
-				          sumsq_r * ( scale_r / abs_chi1_r ) * \
-				                    ( scale_r / abs_chi1_r );  \
+				          sumsq_r * ( scale_r / abs_chi1_i ) * \
+				                    ( scale_r / abs_chi1_i );  \
 \
-				PASTEMAC(chr,copys)( abs_chi1_r, scale_r ); \
+				PASTEMAC(chr,copys)( abs_chi1_i, scale_r ); \
 			} \
 			else \
 			{ \
-				sumsq_r = sumsq_r + ( abs_chi1_r / scale_r ) * \
-				                    ( abs_chi1_r / scale_r );  \
+				sumsq_r = sumsq_r + ( abs_chi1_i / scale_r ) * \
+				                    ( abs_chi1_i / scale_r );  \
 			} \
 		} \
 \

From 42d0e66318b186d25eeb215b40ce26115401ed8b Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Thu, 29 Sep 2022 17:38:02 -0500
Subject: [PATCH 090/230] Add AddressSanitizer (-fsanitize=address) option.
 (#669)

Details:
- Added support for AddressSanitizer (ASan), a compiler-integrated
  memory error detector. The option (disabled by default) enables
  compiling and linking with the -fsanitize=address flag supported by
  clang, gcc, and probably others. This flag is employed during
  compilation of all BLIS source files *except* for optimized kernels,
  which are exempted because ASan usually requires an extra register,
  which violates the constraints for many gemm microkernels.
- Minor whitespace, comment, ordering, and configure help text updates.
---
 Makefile           |  1 +
 build/config.mk.in |  3 +++
 common.mk          | 34 ++++++++++++++++++++------
 configure          | 61 ++++++++++++++++++++++++++++++++++------------
 4 files changed, 76 insertions(+), 23 deletions(-)

diff --git a/Makefile b/Makefile
index 5c4a32b59..04cdca421 100644
--- a/Makefile
+++ b/Makefile
@@ -1161,6 +1161,7 @@ showconfig: check-env
 	@echo "install includedir:         $(INSTALL_INCDIR)"
 	@echo "install sharedir:           $(INSTALL_SHAREDIR)"
 	@echo "debugging status:           $(DEBUG_TYPE)"
+	@echo "enable AddressSanitizer?    $(MK_ENABLE_ASAN)"
 	@echo "enabled threading model(s): $(THREADING_MODEL)"
 	@echo "enable BLAS API?            $(MK_ENABLE_BLAS)"
 	@echo "enable CBLAS API?           $(MK_ENABLE_CBLAS)"
diff --git a/build/config.mk.in b/build/config.mk.in
index 849a7ccfa..efb123366 100644
--- a/build/config.mk.in
+++ b/build/config.mk.in
@@ -124,6 +124,9 @@ LDFLAGS_PRESET    := @ldflags_preset@
 # The level of debugging info to generate.
 DEBUG_TYPE        := @debug_type@
 
+# Whether to compile and link the AddressSanitizer library.
+MK_ENABLE_ASAN    := @enable_asan@
+
 # Whether operating system support was requested via --enable-system.
 ENABLE_SYSTEM     := @enable_system@
 
diff --git a/common.mk b/common.mk
index 00b9f8ad3..e69b97782 100644
--- a/common.mk
+++ b/common.mk
@@ -118,6 +118,7 @@ get-noopt-cxxflags-for   = $(strip $(CFLAGS_PRESET) \
 get-refinit-cflags-for   = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
                                    $(call get-noopt-cflags-for,$(1)) \
                                    -DBLIS_CNAME=$(1) \
+                                   $(BUILD_ASANFLAGS) \
                                    $(BUILD_CPPFLAGS) \
                                    $(BUILD_SYMFLAGS) \
                                    -DBLIS_IN_REF_KERNEL=1 \
@@ -129,6 +130,7 @@ get-refkern-cflags-for   = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
                                    $(call get-noopt-cflags-for,$(1)) \
                                    $(COMPSIMDFLAGS) \
                                    -DBLIS_CNAME=$(1) \
+                                   $(BUILD_ASANFLAGS) \
                                    $(BUILD_CPPFLAGS) \
                                    $(BUILD_SYMFLAGS) \
                                    -DBLIS_IN_REF_KERNEL=1 \
@@ -137,12 +139,14 @@ get-refkern-cflags-for   = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
 
 get-config-cflags-for    = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
                                    $(call get-noopt-cflags-for,$(1)) \
+                                   $(BUILD_ASANFLAGS) \
                                    $(BUILD_CPPFLAGS) \
                                    $(BUILD_SYMFLAGS) \
                             )
 
 get-frame-cflags-for     = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
                                    $(call get-noopt-cflags-for,$(1)) \
+                                   $(BUILD_ASANFLAGS) \
                                    $(BUILD_CPPFLAGS) \
                                    $(BUILD_SYMFLAGS) \
                             )
@@ -201,11 +205,14 @@ get-sandbox-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
 # Define a separate function that will return appropriate flags for use by
 # applications that want to use the same basic flags as those used when BLIS
 # was compiled. (NOTE: This is the same as the $(get-frame-cflags-for ...)
-# function, except that it omits two variables that contain flags exclusively
-# for use when BLIS is being compiled/built: BUILD_CPPFLAGS, which contains a
-# cpp macro that confirms that BLIS is being built; and BUILD_SYMFLAGS, which
-# contains symbol export flags that are only needed when a shared library is
-# being compiled/linked.)
+# function, except that it omits a few variables that contain flags exclusively
+# for use when BLIS is being compiled/built:
+# - BUILD_CPPFLAGS, which contains a cpp macro that confirms that BLIS
+#   is being built;
+# - BUILD_SYMFLAGS, which contains symbol export flags that are only
+#   needed when a shared library is being compiled/linked; and
+# - BUILD_ASANFLAGS, which contains a flag that causes the compiler to
+#   insert instrumentation for memory error detection.)
 get-user-cflags-for      = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
                                    $(call get-noopt-cflags-for,$(1)) \
                             )
@@ -563,6 +570,11 @@ ifeq ($(DEBUG_TYPE),sde)
 LDFLAGS    := $(filter-out $(LIBMEMKIND),$(LDFLAGS))
 endif
 
+# If AddressSanitizer is enabled, add the compiler flag to LDFLAGS.
+ifeq ($(MK_ENABLE_ASAN),yes)
+LDFLAGS    += -fsanitize=address
+endif
+
 # Specify the shared library's 'soname' field.
 # NOTE: The flag for creating shared objects is different for Linux and OS X.
 ifeq ($(OS_NAME),Darwin)
@@ -796,11 +808,19 @@ $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CXXLANGFLAGS,$(c))
 CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
 $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CPPROCFLAGS,$(c))))
 
+# --- AddressSanitizer flags ---
+
+ifeq ($(MK_ENABLE_ASAN),yes)
+BUILD_ASANFLAGS := -fsanitize=address
+else
+BUILD_ASANFLAGS :=
+endif
+
 # --- Threading flags ---
 
 # NOTE: We don't have to explicitly omit -pthread when --disable-system is given
-# since that option forces --enable-threading=none, and thus -pthread never gets
-# added to begin with.
+# since that option forces --enable-threading=single, and thus -pthread never
+# gets added to begin with.
 
 CTHREADFLAGS :=
 
diff --git a/configure b/configure
index 858ce55de..a53f25380 100755
--- a/configure
+++ b/configure
@@ -224,12 +224,22 @@ print_usage()
 	echo " "
 	echo "   --enable-mem-tracing, --disable-mem-tracing"
 	echo " "
-	echo "                 Enable (disable by default) output to stdout that traces"
+	echo "                 Enable (disabled by default) output to stdout that traces"
 	echo "                 the allocation and freeing of memory, including the names"
 	echo "                 of the functions that triggered the allocation/freeing."
 	echo "                 Enabling this option WILL NEGATIVELY IMPACT PERFORMANCE."
 	echo "                 Please use only for informational/debugging purposes."
 	echo " "
+	echo "   --enable-asan, --disable-asan"
+	echo " "
+	echo "                 Enable (disabled by default) compiling and linking BLIS"
+	echo "                 framework code with the AddressSanitizer (ASan) library."
+	echo "                 Optimized kernels are NOT compiled with ASan support due"
+	echo "                 to limitations of register assignment in inline assembly."
+	echo "                 WARNING: ENABLING THIS OPTION WILL NEGATIVELY IMPACT"
+	echo "                 PERFORMANCE. Please use only for informational/debugging"
+	echo "                 purposes."
+	echo " "
 	echo "   -i SIZE, --int-size=SIZE"
 	echo " "
 	echo "                 Set the size (in bits) of internal BLIS integers and"
@@ -2451,6 +2461,9 @@ main()
 	debug_type=''
 	debug_flag=''
 
+	# A flag indicating whether AddressSanitizer should be used.
+	enable_asan='no'
+
 	# The system flag.
 	enable_system='yes'
 
@@ -2576,6 +2589,12 @@ main()
 						disable-debug)
 							debug_flag=0
 							;;
+						enable-asan)
+							enable_asan='yes'
+							;;
+						disable-asan)
+							enable_asan='no'
+							;;
 						enable-verbose-make)
 							enable_verbose='yes'
 							;;
@@ -3357,6 +3376,20 @@ main()
 		echo "${script_name}: no preset LDFLAGS detected."
 	fi
 
+	# Check if the verbose make flag was specified.
+	if [ "x${enable_verbose}" = "xyes" ]; then
+		echo "${script_name}: enabling verbose make output. (disable with 'make V=0'.)"
+	else
+		echo "${script_name}: disabling verbose make output. (enable with 'make V=1'.)"
+	fi
+
+	# Check if the ARG_MAX hack was requested.
+	if [ "x${enable_arg_max_hack}" = "xyes" ]; then
+		echo "${script_name}: enabling ARG_MAX hack."
+	else
+		echo "${script_name}: disabling ARG_MAX hack."
+	fi
+
 	# Check if the debug flag was specified.
 	if [ -n "${debug_flag}" ]; then
 		if [ "x${debug_type}" = "xopt" ]; then
@@ -3373,29 +3406,24 @@ main()
 		echo "${script_name}: debug symbols disabled."
 	fi
 
-	# Check if the verbose make flag was specified.
-	if [ "x${enable_verbose}" = "xyes" ]; then
-		echo "${script_name}: enabling verbose make output. (disable with 'make V=0'.)"
+	# Check if the AddressSanitizer flag was specified.
+	if [ "x${enable_asan}" = "xyes" ]; then
+		echo "${script_name}: enabling AddressSanitizer support (except for optimized kernels)."
 	else
-		echo "${script_name}: disabling verbose make output. (enable with 'make V=1'.)"
+        enable_asan='no'
+		echo "${script_name}: AddressSanitizer support disabled."
 	fi
 
-	# Check if the ARG_MAX hack was requested.
-	if [ "x${enable_arg_max_hack}" = "xyes" ]; then
-		echo "${script_name}: enabling ARG_MAX hack."
-	else
-		echo "${script_name}: disabling ARG_MAX hack."
-	fi
-
-	enable_shared_01=1
 	# Check if the static lib flag was specified.
 	if   [ "x${enable_static}" = "xyes" -a "x${enable_shared}" = "xyes" ]; then
 		echo "${script_name}: building BLIS as both static and shared libraries."
+		enable_shared_01=1
+	elif [ "x${enable_static}" = "xno"  -a "x${enable_shared}" = "xyes" ]; then
+		echo "${script_name}: building BLIS as a shared library (static library disabled)."
+		enable_shared_01=1
 	elif [ "x${enable_static}" = "xyes" -a "x${enable_shared}" = "xno"  ]; then
 		echo "${script_name}: building BLIS as a static library (shared library disabled)."
 		enable_shared_01=0
-	elif [ "x${enable_static}" = "xno"  -a "x${enable_shared}" = "xyes" ]; then
-		echo "${script_name}: building BLIS as a shared library (static library disabled)."
 	else
 		echo "${script_name}: Both static and shared libraries were disabled."
 		echo "${script_name}: *** Please enable one (or both) to continue."
@@ -3917,7 +3945,7 @@ main()
 	# Create a #define for the configuration family (config_name).
 	uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]')
 	config_name_define="#define BLIS_FAMILY_${uconf}\n"
-	
+
 	# Create a list of #defines, one for each configuration in config_list.
 	config_list_defines=""
 	for conf in ${config_list}; do
@@ -4012,6 +4040,7 @@ main()
 		| sed -e "s/@libpthread@/${libpthread_esc}/g" \
 		| sed -e "s/@cflags_preset@/${cflags_preset_esc}/g" \
 		| sed -e "s/@ldflags_preset@/${ldflags_preset_esc}/g" \
+		| sed -e "s/@enable_asan@/${enable_asan}/g" \
 		| sed -e "s/@debug_type@/${debug_type}/g" \
 		| sed -e "s/@enable_system@/${enable_system}/g" \
 		| sed -e "s/@threading_model@/${threading_model}/g" \

From 63470b49e3b9b15e00a8f666e86ccd70c6005fe9 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Thu, 29 Sep 2022 18:52:08 -0500
Subject: [PATCH 091/230] Fix some bugs in bli_pool.c (#670)

Details:
- Add a check for premature pool exhaustion when checking in blocks via
  bli_pool_checkin_block(). This detects "double-free" and other bad
  conditions that don't necessarily result in a segfault.
- Make sure to copy all block pointers when growing the pool size.
  Previously, checked-out block pointers (which are guaranteed to be set
  to NULL) were not being copied, leading to the presence of
  uninitialized data.
---
 frame/base/bli_pool.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/frame/base/bli_pool.c b/frame/base/bli_pool.c
index 684b0ef73..6449a9774 100644
--- a/frame/base/bli_pool.c
+++ b/frame/base/bli_pool.c
@@ -335,6 +335,10 @@ void bli_pool_checkin_block
 	// Query the top_index of the pool.
 	const siz_t top_index = bli_pool_top_index( pool );
 
+	// Check for double-free and other conditions which may prematurely
+	// exhaust the memory pool.
+	if ( top_index == 0 ) bli_abort();
+
 	#ifdef BLIS_ENABLE_MEM_TRACING
 	printf( "bli_pool_checkin_block(): checking in block %d of size %d "
 	        "(align %d, offset %d).\n",
@@ -407,10 +411,11 @@ void bli_pool_grow
 		const siz_t top_index = bli_pool_top_index( pool );
 
 		// Copy the contents of the old block_ptrs array to the new/resized
-		// array. Notice that we can begin with top_index since all entries
-		// from 0 to top_index-1 have been (and are currently) checked out
-		// to threads.
-		for ( dim_t i = top_index; i < num_blocks_cur; ++i )
+		// array. Notice that we copy the entire array, including elements
+		// corresponding to blocks that have been checked out. Those elements
+		// were set to NULL upon checkout, and so it's important to copy them
+		// into the new block_ptrs array.
+		for ( dim_t i = 0; i < num_blocks_cur; ++i )
 		{
 			block_ptrs_new[i] = block_ptrs_cur[i];
 		}

From 76a23bd8c33e161221891935a489df9a9fb9c8c0 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Mon, 3 Oct 2022 15:55:07 -0500
Subject: [PATCH 092/230] Reinstate sanity check in bli_pool_finalize. (#671)

Details:
- Added a reinit argument to bli_pool_finalize(). This bool will signal
  whether or not the function is being called from bli_pool_reinit(). If
  it is not being called from _reinit(), we can safely check to confirm
  that .top_index == 0 (i.e., all blocks have been checked in). But if
  it *is* being called from _reinit(), then that check will be skipped
  since one of the predicted use cases for bli_pool_reinit() anticipates
  that some blocks are (probably) checked out when the pool_t is
  reinitialized.
- Updated existing invocations of bli_pool_finalize() to pass in either
  FALSE (from bli_apool_free_block() or bli_pba_finalize_pools()) or
  TRUE (from bli_pool_reinit()) for the new reinit argument.
---
 frame/base/bli_apool.c |  2 +-
 frame/base/bli_pba.c   |  6 +++---
 frame/base/bli_pool.c  | 22 +++++++++-------------
 frame/base/bli_pool.h  |  3 ++-
 4 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/frame/base/bli_apool.c b/frame/base/bli_apool.c
index a42c7103e..693e91bf9 100644
--- a/frame/base/bli_apool.c
+++ b/frame/base/bli_apool.c
@@ -188,7 +188,7 @@ void bli_apool_free_block
 		if ( pool != NULL )
 		{
 			// Finalize the pool.
-			bli_pool_finalize( pool );
+			bli_pool_finalize( pool, FALSE );
 
 			#ifdef BLIS_ENABLE_MEM_TRACING
 			printf( "bli_apool_free_block(): pool_t %d: ", ( int )i );
diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c
index 68dffd728..cabaf4ff6 100644
--- a/frame/base/bli_pba.c
+++ b/frame/base/bli_pba.c
@@ -389,9 +389,9 @@ void bli_pba_finalize_pools
 	pool_t* pool_c  = bli_pba_pool( index_c, pba );
 
 	// Finalize the memory pools for A, B, and C.
-	bli_pool_finalize( pool_a );
-	bli_pool_finalize( pool_b );
-	bli_pool_finalize( pool_c );
+	bli_pool_finalize( pool_a, FALSE );
+	bli_pool_finalize( pool_b, FALSE );
+	bli_pool_finalize( pool_c, FALSE );
 }
 
 // -----------------------------------------------------------------------------
diff --git a/frame/base/bli_pool.c b/frame/base/bli_pool.c
index 6449a9774..891f770ae 100644
--- a/frame/base/bli_pool.c
+++ b/frame/base/bli_pool.c
@@ -115,7 +115,8 @@ void bli_pool_init
 
 void bli_pool_finalize
      (
-       pool_t* pool
+       pool_t* pool,
+       bool    reinit
      )
 {
 	// NOTE: This implementation assumes that either:
@@ -129,24 +130,22 @@ void bli_pool_finalize
 	// Query the total number of blocks currently allocated.
 	const siz_t num_blocks = bli_pool_num_blocks( pool );
 
-	// NOTE: This sanity check has been disabled because bli_pool_reinit()
-	// is currently implemented in terms of bli_pool_finalize() followed by
-	// bli_pool_init(). If that _reinit() takes place when some blocks are
-	// checked out, then we would expect top_index != 0, and therefore this
-	// check is not universally appropriate.
-#if 0
 	// Query the top_index of the pool.
 	const siz_t top_index = bli_pool_top_index( pool );
 
 	// Sanity check: The top_index should be zero.
-	if ( top_index != 0 )
+	// NOTE: This sanity check is disabled when called from bli_pool_reinit()
+	// because _reinit() is currently implemented in terms of
+	// bli_pool_finalize() followed by bli_pool_init(). If that _reinit()
+	// takes place when some blocks are checked out, then we would expect
+	// top_index != 0, and therefore this check is not universally appropriate.
+	if ( top_index != 0 && !reinit )
 	{
 		printf( "bli_pool_finalize(): final top_index == %d (expected 0); block_size: %d.\n",
 		        ( int )top_index, ( int )bli_pool_block_size( pool ) );
 		printf( "bli_pool_finalize(): Implication: not all blocks were checked back in!\n" );
 		bli_abort();
 	}
-#endif
 
 	// Query the free() function pointer for the pool.
 	free_ft free_fp = bli_pool_free_fp( pool );
@@ -215,7 +214,7 @@ void bli_pool_reinit
 	// those blocks back into the pool. (This condition can be detected
 	// since the block size is encoded into each pblk, which is copied
 	// upon checkout.)
-	bli_pool_finalize( pool );
+	bli_pool_finalize( pool, TRUE );
 
 	// Reinitialize the pool with the new parameters, in particular,
 	// the new block size.
@@ -407,9 +406,6 @@ void bli_pool_grow
 		=
 		bli_malloc_intl( block_ptrs_len_new * sizeof( pblk_t ), &r_val );
 
-		// Query the top_index of the pool.
-		const siz_t top_index = bli_pool_top_index( pool );
-
 		// Copy the contents of the old block_ptrs array to the new/resized
 		// array. Notice that we copy the entire array, including elements
 		// corresponding to blocks that have been checked out. Those elements
diff --git a/frame/base/bli_pool.h b/frame/base/bli_pool.h
index 0b16ae8ee..6f199f7a4 100644
--- a/frame/base/bli_pool.h
+++ b/frame/base/bli_pool.h
@@ -228,7 +228,8 @@ void bli_pool_init
      );
 void bli_pool_finalize
      (
-       pool_t* pool
+       pool_t* pool,
+       bool    reinit
      );
 void bli_pool_reinit
      (

From 9453e0f163503f64a290256b4be53d8882224863 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 3 Oct 2022 19:46:20 -0500
Subject: [PATCH 093/230] CREDITS file update.

Details:
- This attribution was intended to go in PR #647.
---
 CREDITS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CREDITS b/CREDITS
index 152de0a4b..55c974f1b 100644
--- a/CREDITS
+++ b/CREDITS
@@ -36,6 +36,7 @@ but many others have contributed code and feedback, including
   Roman Gareev             @gareevroman
   Richard Goldschmidt      @SuperFluffy
   Chris Goodyer
+  Alexander Grund          @Flamefire
   John Gunnels             @jagunnels          (IBM, T.J. Watson Research Center)
   Ali Emre Gülcü           @Lephar
   Jeff Hammond             @jeffhammond        (Intel)

From 23f5b8df3e802a27bacd92571184ec57bbdfa646 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 17 Oct 2022 20:21:21 -0500
Subject: [PATCH 094/230] Shuffled checked properties in bli_l3_check.c. (#676)

Details:
- Added certain checks for matrix structure to the level-3 operations'
  _check() functions, and slightly reorganized existing checks.
---
 frame/3/bli_l3_check.c | 179 ++++++++++++++++++++++++++++-------------
 1 file changed, 122 insertions(+), 57 deletions(-)

diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c
index 3b4d88746..9ac0a7fbb 100644
--- a/frame/3/bli_l3_check.c
+++ b/frame/3/bli_l3_check.c
@@ -44,7 +44,7 @@ void bli_gemm_check
        const cntx_t* cntx
      )
 {
-	//err_t e_val;
+	err_t e_val;
 
 	// Check basic properties of the operation.
 
@@ -52,15 +52,14 @@ void bli_gemm_check
 
 	// Check object structure.
 
-	// NOTE: Can't perform these checks as long as bli_gemm_check() is called
-	// from bli_l3_int(), which is in the execution path for structured
-	// level-3 operations such as hemm.
+	e_val = bli_check_general_object( a );
+	bli_check_error_code( e_val );
 
-	//e_val = bli_check_general_object( a );
-	//bli_check_error_code( e_val );
+	e_val = bli_check_general_object( b );
+	bli_check_error_code( e_val );
 
-	//e_val = bli_check_general_object( b );
-	//bli_check_error_code( e_val );
+	e_val = bli_check_general_object( c );
+	bli_check_error_code( e_val );
 }
 
 void bli_gemmt_check
@@ -83,6 +82,14 @@ void bli_gemmt_check
 
 	e_val = bli_check_square_object( c );
 	bli_check_error_code( e_val );
+
+	// Check object structure.
+
+	e_val = bli_check_general_object( a );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_general_object( b );
+	bli_check_error_code( e_val );
 }
 
 void bli_hemm_check
@@ -102,10 +109,21 @@ void bli_hemm_check
 
 	bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx );
 
+	// Check matrix squareness.
+
+	e_val = bli_check_square_object( a );
+	bli_check_error_code( e_val );
+
 	// Check object structure.
 
 	e_val = bli_check_hermitian_object( a );
 	bli_check_error_code( e_val );
+
+	e_val = bli_check_general_object( b );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_general_object( c );
+	bli_check_error_code( e_val );
 }
 
 void bli_herk_check
@@ -127,18 +145,26 @@ void bli_herk_check
 
 	bli_herk_basic_check( alpha, a, &ah, beta, c, cntx );
 
-	// Check for real-valued alpha and beta.
-
-	e_val = bli_check_real_valued_object( alpha );
-	bli_check_error_code( e_val );
+	// Check matrix squareness.
 
-	e_val = bli_check_real_valued_object( beta );
+	e_val = bli_check_square_object( c );
 	bli_check_error_code( e_val );
 
 	// Check matrix structure.
 
 	e_val = bli_check_hermitian_object( c );
 	bli_check_error_code( e_val );
+
+	e_val = bli_check_general_object( a );
+	bli_check_error_code( e_val );
+
+	// Check for real-valued alpha and beta.
+
+	e_val = bli_check_real_valued_object( alpha );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_real_valued_object( beta );
+	bli_check_error_code( e_val );
 }
 
 void bli_her2k_check
@@ -162,15 +188,26 @@ void bli_her2k_check
 
 	bli_her2k_basic_check( alpha, a, &bh, b, &ah, beta, c, cntx );
 
-	// Check for real-valued beta.
+	// Check matrix squareness.
 
-	e_val = bli_check_real_valued_object( beta );
+	e_val = bli_check_square_object( c );
 	bli_check_error_code( e_val );
 
 	// Check matrix structure.
 
 	e_val = bli_check_hermitian_object( c );
 	bli_check_error_code( e_val );
+
+	e_val = bli_check_general_object( a );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_general_object( b );
+	bli_check_error_code( e_val );
+
+	// Check for real-valued beta.
+
+	e_val = bli_check_real_valued_object( beta );
+	bli_check_error_code( e_val );
 }
 
 void bli_symm_check
@@ -190,10 +227,21 @@ void bli_symm_check
 
 	bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx );
 
+	// Check matrix squareness.
+
+	e_val = bli_check_square_object( a );
+	bli_check_error_code( e_val );
+
 	// Check object structure.
 
 	e_val = bli_check_symmetric_object( a );
 	bli_check_error_code( e_val );
+
+	e_val = bli_check_general_object( b );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_general_object( c );
+	bli_check_error_code( e_val );
 }
 
 void bli_syrk_check
@@ -215,10 +263,18 @@ void bli_syrk_check
 
 	bli_herk_basic_check( alpha, a, &at, beta, c, cntx );
 
+	// Check matrix squareness.
+
+	e_val = bli_check_square_object( c );
+	bli_check_error_code( e_val );
+
 	// Check matrix structure.
 
 	e_val = bli_check_symmetric_object( c );
 	bli_check_error_code( e_val );
+
+	e_val = bli_check_general_object( a );
+	bli_check_error_code( e_val );
 }
 
 void bli_syr2k_check
@@ -242,10 +298,21 @@ void bli_syr2k_check
 
 	bli_her2k_basic_check( alpha, a, &bt, b, &at, beta, c, cntx );
 
+	// Check matrix squareness.
+
+	e_val = bli_check_square_object( c );
+	bli_check_error_code( e_val );
+
 	// Check matrix structure.
 
 	e_val = bli_check_symmetric_object( c );
 	bli_check_error_code( e_val );
+
+	e_val = bli_check_general_object( a );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_general_object( b );
+	bli_check_error_code( e_val );
 }
 
 void bli_trmm3_check
@@ -261,14 +328,25 @@ void bli_trmm3_check
 {
 	err_t e_val;
 
-	// Perform checks common to hemm/symm/trmm/trsm.
+	// Check basic properties of the operation.
 
 	bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx );
 
+	// Check matrix squareness.
+
+	e_val = bli_check_square_object( a );
+	bli_check_error_code( e_val );
+
 	// Check object structure.
 
 	e_val = bli_check_triangular_object( a );
 	bli_check_error_code( e_val );
+
+	e_val = bli_check_general_object( b );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_general_object( c );
+	bli_check_error_code( e_val );
 }
 
 void bli_trmm_check
@@ -282,14 +360,22 @@ void bli_trmm_check
 {
 	err_t e_val;
 
-	// Perform checks common to hemm/symm/trmm/trsm.
+	// Check basic properties of the operation.
 
 	bli_hemm_basic_check( side, alpha, a, b, &BLIS_ZERO, b, cntx );
 
+	// Check matrix squareness.
+
+	e_val = bli_check_square_object( a );
+	bli_check_error_code( e_val );
+
 	// Check object structure.
 
 	e_val = bli_check_triangular_object( a );
 	bli_check_error_code( e_val );
+
+	e_val = bli_check_general_object( b );
+	bli_check_error_code( e_val );
 }
 
 void bli_trsm_check
@@ -307,10 +393,18 @@ void bli_trsm_check
 
 	bli_hemm_basic_check( side, alpha, a, b, &BLIS_ZERO, b, cntx );
 
+	// Check matrix squareness.
+
+	e_val = bli_check_square_object( a );
+	bli_check_error_code( e_val );
+
 	// Check object structure.
 
 	e_val = bli_check_triangular_object( a );
 	bli_check_error_code( e_val );
+
+	e_val = bli_check_general_object( b );
+	bli_check_error_code( e_val );
 }
 
 // -----------------------------------------------------------------------------
@@ -385,6 +479,14 @@ void bli_gemmt_basic_check
 
 	e_val = bli_check_level3_dims( a, b, c );
 	bli_check_error_code( e_val );
+
+	// Check for consistent datatypes.
+
+	e_val = bli_check_consistent_object_datatypes( c, a );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_consistent_object_datatypes( c, b );
+	bli_check_error_code( e_val );
 }
 
 void bli_hemm_basic_check
@@ -417,11 +519,6 @@ void bli_hemm_basic_check
 		bli_check_error_code( e_val );
 	}
 
-	// Check matrix squareness.
-
-	e_val = bli_check_square_object( a );
-	bli_check_error_code( e_val );
-
 	// Check for consistent datatypes.
 
 	e_val = bli_check_consistent_object_datatypes( c, a );
@@ -452,19 +549,6 @@ void bli_herk_basic_check
 	e_val = bli_check_level3_dims( a, ah, c );
 	bli_check_error_code( e_val );
 
-	// Check matrix squareness.
-
-	e_val = bli_check_square_object( c );
-	bli_check_error_code( e_val );
-
-	// Check matrix structure.
-
-	e_val = bli_check_general_object( a );
-	bli_check_error_code( e_val );
-
-	e_val = bli_check_general_object( ah );
-	bli_check_error_code( e_val );
-
 	// Check for consistent datatypes.
 
 	e_val = bli_check_consistent_object_datatypes( c, a );
@@ -501,25 +585,6 @@ void bli_her2k_basic_check
 	e_val = bli_check_level3_dims( b, ah, c );
 	bli_check_error_code( e_val );
 
-	// Check matrix squareness.
-
-	e_val = bli_check_square_object( c );
-	bli_check_error_code( e_val );
-
-	// Check matrix structure.
-
-	e_val = bli_check_general_object( a );
-	bli_check_error_code( e_val );
-
-	e_val = bli_check_general_object( bh );
-	bli_check_error_code( e_val );
-
-	e_val = bli_check_general_object( b );
-	bli_check_error_code( e_val );
-
-	e_val = bli_check_general_object( ah );
-	bli_check_error_code( e_val );
-
 	// Check for consistent datatypes.
 
 	e_val = bli_check_consistent_object_datatypes( c, a );
@@ -586,13 +651,13 @@ void bli_l3_basic_check
 	e_val = bli_check_object_buffer( alpha );
 	bli_check_error_code( e_val );
 
-	e_val = bli_check_object_buffer( a );
+	e_val = bli_check_object_buffer( beta );
 	bli_check_error_code( e_val );
 
-	e_val = bli_check_object_buffer( b );
+	e_val = bli_check_object_buffer( a );
 	bli_check_error_code( e_val );
 
-	e_val = bli_check_object_buffer( beta );
+	e_val = bli_check_object_buffer( b );
 	bli_check_error_code( e_val );
 
 	e_val = bli_check_object_buffer( c );

From 88105dbecf0f9dfbfa30215743346e8bd6afb971 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 21 Oct 2022 15:16:12 -0500
Subject: [PATCH 095/230] Added Discord documentation (#677)

Details:
- Added a docs/Discord.md markdown document that walks the reader
  through creating a Discord account, obtaining the invite link, and
  using the link to join the BLIS Discord server.
- Updated README.md to reference the new Discord.md document in multiple
  places, including via the official Discord logo (used with explicit
  permission from representatives at Discord Inc.).
---
 README.md               |  31 ++++++++---
 docs/Discord.md         | 115 ++++++++++++++++++++++++++++++++++++++++
 docs/images/discord.svg |  23 ++++++++
 3 files changed, 163 insertions(+), 6 deletions(-)
 create mode 100644 docs/Discord.md
 create mode 100644 docs/images/discord.svg

diff --git a/README.md b/README.md
index 7996cb676..012861366 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,8 @@
 [![Build Status](https://api.travis-ci.com/flame/blis.svg?branch=master)](https://app.travis-ci.com/github/flame/blis)
 [![Build Status](https://ci.appveyor.com/api/projects/status/github/flame/blis?branch=master&svg=true)](https://ci.appveyor.com/project/shpc/blis/branch/master)
 
+[<img alt="Discord logo" title="Join us on Discord!" height="32px" src="docs/images/discord.svg" />](docs/Discord.md)
+
 Contents
 --------
 
@@ -97,6 +99,17 @@ all of which are available for free via the [edX platform](http://www.edx.org/).
 What's New
 ----------
 
+ * **Join us on Discord!** In 2021, we soft-launched our [Discord](https://discord.com/)
+server by privately inviting current and former collaborators, attendees of our BLIS
+Retreat, as well as other participants within the BLIS ecosystem. We've been thrilled by
+the results thus far, and are happy to announce that our new community is now open to
+the broader public! If you'd like to hang out with other BLIS users and developers,
+ask a question, discuss future features, or just say hello, please feel free to join us!
+We've put together a [step-by-step guide](docs/Discord.md) for creating an account and
+joining our cozy enclave. We even have a monthly "BLIS happy hour" event where people
+can casually come together for a video chat, Q&A, brainstorm session, or whatever it
+happens to unfold into!
+
  * **Addons feature now available!** Have you ever wanted to quickly extend BLIS's
 operation support or define new custom BLIS APIs for your application, but were
 unsure of how to add your source code to BLIS? Do you want to isolate your custom
@@ -417,6 +430,9 @@ If/when you have time, we *strongly* encourage you to read the detailed
 walkthrough of the build system found in our [Build System](docs/BuildSystem.md)
 guide.
 
+If you are still having trouble, you are welcome to [join us on Discord](docs/Discord.md)
+for further information and/or assistance.
+
 Example Code
 ------------
 
@@ -500,6 +516,10 @@ empirically measured performance of `gemm` on select hardware architectures
 within BLIS and other BLAS libraries when performing matrix problems where one
 or two dimensions is exceedingly small.
 
+ * **[Discord](docs/Discord.md).** This document describes how to: create an
+account on Discord (if you don't already have one); obtain a private invite
+link; and use that invite link to join our BLIS server on Discord.
+
  * **[Release Notes](docs/ReleaseNotes.md).** This document tracks a summary of
 changes included with each new version of BLIS, along with contributor credits
 for key features.
@@ -610,16 +630,15 @@ has Linux, OSX and Windows binary packages for x86_64.
 Discussion
 ----------
 
-You can keep in touch with developers and other users of the project by joining
-one of the following mailing lists:
+Most of the active discussions are now happening on our [Discord](https://discord.com/)
+server. Users and developers alike are welcome! Please see the
+[BLIS Discord guide](docs/Discord.md) for a walkthrough of how to join us.
+
+You can also still stay in touch by using either of the following mailing lists:
 
  * [blis-devel](https://groups.google.com/group/blis-devel): Please join and
 post to this mailing list if you are a BLIS developer, or if you are trying
 to use BLIS beyond simply linking to it as a BLAS library.
-**Note:** Most of the interesting discussions happen here; don't be afraid to
-join! If you would like to submit a bug report, or discuss a possible bug,
-please consider opening a [new issue](https://github.com/flame/blis/issues) on
-github.
 
  * [blis-discuss](https://groups.google.com/group/blis-discuss): Please join and
 post to this mailing list if you have general questions or feedback regarding
diff --git a/docs/Discord.md b/docs/Discord.md
new file mode 100644
index 000000000..b4403f7bc
--- /dev/null
+++ b/docs/Discord.md
@@ -0,0 +1,115 @@
+*NOTE: The [BLIS](https://github.com/flame/blis) project is not affiliated with [Discord Inc.](https://discord.com/company) in any way, and we use the Discord logo with their permission.*
+
+
+## Contents
+
+* **[Welcome](Discord.md#welcome)**
+* **[Introduction to Discord](Discord.md#introduction-to-discord)**
+* **[Creating an account](Discord.md#creating-an-account)**
+* **[Obtaining the invite link](Discord.md#obtaining-the-invite-link)**
+* **[Joining the BLIS server](Discord.md#joining-the-blis-server)**
+* **[Additional resources](Discord.md#additional-resources)**
+
+## Welcome
+
+In 2021, we soft-launched our Discord server by privately inviting current and former collaborators, attendees of our BLIS Retreat, as well as other participants within the BLIS ecosystem. We've been thrilled by the results thus far, and are happy to announce that our new community is now open to the broader public!
+
+If you'd like to hang out with other BLIS users and developers, ask a question, discuss future features, or just say hello, please feel free to join us! Joining our server is also a great way to get announcements for new versions, workshop events, video chat parties, and other infrequent updates.
+
+**If you already use Discord** and want to skip straight to the invite link, you can find it [here](#obtaining-the-invite-link). Just be sure to manually remove the dashes (`-`) and equal signs (`=`) before using it!
+
+## Introduction to Discord
+
+The remaining sections of this file walk the reader through basic instructions for joining the BLIS community on [Discord](https://discord.com).
+
+Discord is free to use for everyone. You can optionally pay for premium features via their [Nitro](https://discord.com/nitro) subscription, but Nitro is not necessary for most casual users.
+
+Discord offers several kinds of clients. Users may use Discord via:
+
+- the official Android and iOS apps on mobile devices
+- a [web browser](https://discord.com/login)
+- the standalone desktop application, available from their [Download](https://discord.com/download) page.
+
+You can even stay logged in on multiple devices! Each one will automatically sync itself to newly sent/received messages.
+
+In this document, we'll walk you through each step necessary to join the BLIS Discord community. First, we'll talk about how to [create a Discord account](#creating-an-account) (if you don't already have one). Then, we'll explain how to [obtain the invite link](#obtaining-the-invite-link). And finally, we'll tell you how to use that invite link to [join the BLIS Discord server](#joining-the-blis-server).
+
+
+## Creating an account
+
+If you don't already have a Discord account, you'll need to first create one.
+
+As of this writing, you may follow these steps to create your account:
+
+*NOTE: We recommend executing these steps using a desktop web browser. Once you've created your account and joined the BLIS server, you can proceed to use your client(s) of choice (mobile app, desktop app, or web browser).*
+
+1. Go to [https://discord.com](https://discord.com) and click on "Login" at the top-right.
+2. At the bottom of the dialog, click the "Register" link.
+3. Enter the prompted information, such as username and email, then click "Continue".
+4. Perform the Captcha verification.
+5. This should take you into the web browser version of Discord. You will be asked if you want to create your own server. Close the dialog without making any selection.
+6. At this point, you need to verify your email address. Check your email account for a message from Discord. Click the link in the email. This should bring up a dialog confirming your email has been verified. You may now close the web browser tab.
+
+Congratulations! You're now a member of Discord and ready to join individual communities, or "servers."
+
+
+## Obtaining the invite link
+
+Since we do not have access to an official Captcha-like service to confirm that you are not a software bot, we have instead obfuscated our invite link in a way that should be easy for a human to unmangle.
+
+Here's an example invite link (for reference purposes only): `https://discord.gg/abC2jUVeip`
+
+Notice that the link consists of `https://discord.gg/` followed by a 10-character string consisting of lower- and upper-case letters, and (typically) one numerical digit.
+
+**The BLIS Discord invite link is: https://discord.gg/e-Zx=p-z9=p-Ks=x**
+
+*Note that you **must** remove the dashes (`-`) and equal signs (`=`) before using the link!*
+
+Once you decipher the invite link, copy it to your clipboard so it's ready to use in the appropriate step within the next section, [Joining the BLIS server](#joining-the-blis-server).
+
+
+## Joining the BLIS server
+
+Once you have the invite link copied to your clipboard, follow these steps in order to join the BLIS server:
+
+*NOTE: We recommend executing these steps using a desktop web browser. Once you've joined the BLIS server, you can proceed to use your client(s) of choice (mobile app, desktop app, or web browser).*
+
+1. Log in to the [Discord website](https://discord.com).
+2. Once logged in, on the left-hand side of the UI, click on the button with the "+" symbol. This will bring up a dialog asking if you want to create a server.
+3. At the bottom of the dialog, there will be a section asking, "Have an invite already?" Click the button below it labeled "Join a Server".
+4. Paste the invite link into the prompt and click "Join Server".
+5. This should bring up a dialog stating that you've been invited to join the BLIS server. Click on "Accept Invite". This will trigger a new dialog informing you that your account has been updated with the invitation.
+
+That's it! Now that you've joined our server, please consider introducing yourself in `#general`! We love hearing about how application developers and end-users are using BLIS.
+
+If you had any difficulty joining or with the invite link, please reach out to [field@cs.utexas.edu](mailto:field@cs.utexas.edu).
+
+
+## Additional resources
+
+Are you new to Discord? Not sure how to work this newfangled technology? Don't worry; once you learn the basics, you'll feel much more at home!
+
+While a tutorial on Discord is beyond the scope of this document, there are countless articles and YouTube videos that introduce newcomers to Discord's UI. Here are a few articles on the basics:
+
+- **Tom's Guide.** [Discord: Everything You Need to Know](https://www.tomsguide.com/us/what-is-discord,review-5203.html)
+- **WIRED.** [How to Use Discord: A Beginner's Guide](https://www.wired.com/story/how-to-use-discord/)
+- **Discord Support.** [Beginner's Guide to Discord](https://support.discord.com/hc/en-us/articles/360045138571-Beginner-s-Guide-to-Discord)
+
+And some YouTube videos:
+
+- **Tech Audit TV.** [How to Use Discord in 2022: The Ultimate Beginner Walkthrough](https://www.youtube.com/watch?v=nPmdafMo1b8)
+- **Howfinity.** [How to Use Discord - Beginner's Guide](https://www.youtube.com/watch?v=rnYGrq95ezA)
+
+Some things we recommend setting up shortly after you create your account:
+
+- Take note of your username's "tag" or disambiguator. This is a randomly-assigned four-digit number that gets implicitly appended to the end of your username (e.g. `bsmith#1234`), which helps people tell you apart from other users who have the same username.
+- Not happy with your username? You can change it!
+- Review your privacy settings, and consider using two-factor authentication.
+- Personalize your account with a custom profile image.
+- Consider switching to the "dark" theme (if you prefer dark modes on other websites or on mobile devices).
+- Tweak other appearance settings such as the font size or UI compactness.
+- Set up your notifications.
+
+There are many other settings in Discord! Feel free to explore all of them by clicking the gear icon in the bottom-left area of your screen, just to the right of your username.
+
+We hope you found this short guide useful, and we hope to see you on Discord! Thanks for your interest in BLIS and our community! :)
diff --git a/docs/images/discord.svg b/docs/images/discord.svg
new file mode 100644
index 000000000..1f483fe8f
--- /dev/null
+++ b/docs/images/discord.svg
@@ -0,0 +1,23 @@
+<svg width="292" height="80" viewBox="0 0 292 80" fill="none" xmlns="http://www.w3.org/2000/svg">
+<g clip-path="url(#clip0)">
+<g clip-path="url(#clip1)">
+<path d="M61.7958 16.494C57.0736 14.2846 52.0244 12.6789 46.7456 11.7646C46.0973 12.9367 45.3399 14.5132 44.8177 15.7673C39.2062 14.9234 33.6463 14.9234 28.138 15.7673C27.6159 14.5132 26.8413 12.9367 26.1872 11.7646C20.9027 12.6789 15.8477 14.2905 11.1255 16.5057C1.60078 30.8988 -0.981215 44.9344 0.309785 58.7707C6.62708 63.4883 12.7493 66.3541 18.7682 68.2294C20.2543 66.1841 21.5797 64.0099 22.7215 61.7185C20.5469 60.8922 18.4641 59.8725 16.4961 58.6887C17.0182 58.3019 17.5289 57.8975 18.0223 57.4814C30.0257 63.0957 43.0677 63.0957 54.9277 57.4814C55.4269 57.8975 55.9375 58.3019 56.4539 58.6887C54.4801 59.8783 52.3916 60.898 50.217 61.7244C51.3588 64.0099 52.6785 66.19 54.1703 68.2352C60.195 66.3599 66.3229 63.4942 72.6402 58.7707C74.155 42.7309 70.0525 28.8242 61.7958 16.494ZM24.3568 50.2615C20.7535 50.2615 17.7985 46.8976 17.7985 42.8012C17.7985 38.7048 20.6904 35.3351 24.3568 35.3351C28.0233 35.3351 30.9782 38.6989 30.9151 42.8012C30.9208 46.8976 28.0233 50.2615 24.3568 50.2615ZM48.5932 50.2615C44.9899 50.2615 42.0349 46.8976 42.0349 42.8012C42.0349 38.7048 44.9267 35.3351 48.5932 35.3351C52.2596 35.3351 55.2146 38.6989 55.1515 42.8012C55.1515 46.8976 52.2596 50.2615 48.5932 50.2615Z" fill="#5865F2"/>
+<path d="M98.0293 26.1707H113.693C117.469 26.1707 120.659 26.7743 123.276 27.9757C125.886 29.177 127.843 30.8531 129.14 32.998C130.436 35.1429 131.09 37.5984 131.09 40.3645C131.09 43.072 130.413 45.5275 129.059 47.7251C127.705 49.9286 125.645 51.6692 122.874 52.9526C120.103 54.236 116.671 54.8806 112.569 54.8806H98.0293V26.1707ZM112.408 47.5845C114.95 47.5845 116.907 46.934 118.272 45.6388C119.638 44.3378 120.321 42.568 120.321 40.3235C120.321 38.243 119.712 36.5845 118.496 35.3421C117.28 34.0997 115.438 33.4727 112.976 33.4727H108.076V47.5845H112.408Z" fill="#5865F2"/>
+<path d="M154.541 54.8456C152.372 54.2713 150.415 53.4391 148.677 52.3432V45.5335C149.991 46.5707 151.752 47.4264 153.961 48.1003C156.17 48.7684 158.305 49.1024 160.37 49.1024C161.334 49.1024 162.063 48.9735 162.556 48.7156C163.05 48.4578 163.297 48.1472 163.297 47.7897C163.297 47.3795 163.165 47.0396 162.895 46.7641C162.625 46.4887 162.103 46.2601 161.329 46.0667L156.509 44.9591C153.749 44.3028 151.792 43.3944 150.628 42.2282C149.463 41.0678 148.883 39.5441 148.883 37.6571C148.883 36.0689 149.388 34.6918 150.41 33.5138C151.425 32.3359 152.871 31.4275 154.747 30.7887C156.624 30.1441 158.815 29.8218 161.334 29.8218C163.583 29.8218 165.643 30.0679 167.52 30.5602C169.396 31.0525 170.945 31.6795 172.179 32.4472V38.8878C170.916 38.1201 169.47 37.5165 167.818 37.0593C166.171 36.6081 164.479 36.3854 162.734 36.3854C160.215 36.3854 158.959 36.8249 158.959 37.6981C158.959 38.1084 159.154 38.4131 159.544 38.6182C159.934 38.8233 160.651 39.0343 161.69 39.257L165.706 39.9954C168.329 40.4584 170.285 41.273 171.57 42.4333C172.856 43.5937 173.498 45.3108 173.498 47.5846C173.498 50.0752 172.437 52.0502 170.308 53.5153C168.179 54.9804 165.161 55.7129 161.248 55.7129C158.947 55.7071 156.71 55.4199 154.541 54.8456Z" fill="#5865F2"/>
+<path d="M182.978 53.9839C180.678 52.8352 178.939 51.2764 177.78 49.3073C176.621 47.3382 176.036 45.123 176.036 42.6616C176.036 40.2003 176.638 37.9968 177.843 36.057C179.048 34.1172 180.815 32.5935 183.145 31.4859C185.474 30.3783 188.257 29.8274 191.499 29.8274C195.515 29.8274 198.849 30.6889 201.5 32.4118V39.919C200.565 39.2626 199.474 38.7293 198.229 38.3191C196.984 37.9089 195.653 37.7037 194.23 37.7037C191.74 37.7037 189.795 38.1667 188.389 39.0985C186.983 40.0303 186.278 41.2434 186.278 42.7495C186.278 44.2263 186.96 45.4336 188.326 46.383C189.692 47.3265 191.671 47.8012 194.27 47.8012C195.607 47.8012 196.927 47.6019 198.229 47.2093C199.526 46.8108 200.645 46.3244 201.58 45.75V53.011C198.637 54.816 195.223 55.7185 191.338 55.7185C188.068 55.7068 185.279 55.1325 182.978 53.9839Z" fill="#5865F2"/>
+<path d="M211.518 53.9841C209.2 52.8355 207.433 51.2649 206.216 49.2665C205 47.2681 204.386 45.0412 204.386 42.5798C204.386 40.1185 204.994 37.9208 206.216 35.9928C207.438 34.0647 209.194 32.5527 211.501 31.4568C213.801 30.3609 216.55 29.8159 219.734 29.8159C222.919 29.8159 225.667 30.3609 227.968 31.4568C230.269 32.5527 232.025 34.053 233.23 35.9693C234.435 37.8857 235.037 40.0833 235.037 42.574C235.037 45.0353 234.435 47.2623 233.23 49.2606C232.025 51.259 230.263 52.8296 227.945 53.9782C225.627 55.1269 222.89 55.7012 219.729 55.7012C216.567 55.7012 213.83 55.1327 211.518 53.9841ZM223.722 46.7055C224.698 45.7093 225.191 44.3907 225.191 42.7498C225.191 41.1089 224.703 39.802 223.722 38.835C222.747 37.8622 221.415 37.3758 219.729 37.3758C218.013 37.3758 216.67 37.8622 215.689 38.835C214.714 39.8079 214.226 41.1089 214.226 42.7498C214.226 44.3907 214.714 45.7093 215.689 46.7055C216.665 47.7018 218.013 48.2058 219.729 48.2058C221.415 48.1999 222.747 47.7018 223.722 46.7055Z" fill="#5865F2"/>
+<path d="M259.17 31.3395V40.2004C258.149 39.5147 256.829 39.1748 255.194 39.1748C253.053 39.1748 251.401 39.8371 250.253 41.1615C249.1 42.486 248.526 44.5488 248.526 47.3383V54.8865H238.686V30.8883H248.326V38.5185C248.859 35.7289 249.726 33.672 250.919 32.3416C252.107 31.0172 253.644 30.355 255.515 30.355C256.932 30.355 258.149 30.6832 259.17 31.3395Z" fill="#5865F2"/>
+<path d="M291.864 25.3503V54.8866H282.023V49.5127C281.191 51.5345 279.929 53.0758 278.231 54.1306C276.532 55.1797 274.432 55.7071 271.942 55.7071C269.716 55.7071 267.777 55.1562 266.118 54.0486C264.46 52.941 263.181 51.4232 262.28 49.4951C261.385 47.567 260.931 45.387 260.931 42.9491C260.903 40.435 261.379 38.1787 262.36 36.1803C263.336 34.1819 264.718 32.6231 266.497 31.5037C268.276 30.3844 270.307 29.8218 272.585 29.8218C277.273 29.8218 280.417 31.9022 282.023 36.0572V25.3503H291.864ZM280.555 46.5415C281.559 45.5452 282.058 44.2501 282.058 42.6678C282.058 41.1382 281.57 39.8899 280.595 38.9347C279.619 37.9795 278.282 37.4989 276.601 37.4989C274.943 37.4989 273.618 37.9853 272.625 38.9581C271.632 39.931 271.139 41.1909 271.139 42.7498C271.139 44.3087 271.632 45.5804 272.625 46.5649C273.618 47.5494 274.926 48.0417 276.561 48.0417C278.219 48.0359 279.55 47.5377 280.555 46.5415Z" fill="#5865F2"/>
+<path d="M139.382 33.4432C142.091 33.4432 144.288 31.4281 144.288 28.9424C144.288 26.4567 142.091 24.4417 139.382 24.4417C136.672 24.4417 134.476 26.4567 134.476 28.9424C134.476 31.4281 136.672 33.4432 139.382 33.4432Z" fill="#5865F2"/>
+<path d="M134.472 36.5435C137.478 37.8679 141.208 37.9265 144.283 36.5435V55.0154H134.472V36.5435Z" fill="#5865F2"/>
+</g>
+</g>
+<defs>
+<clipPath id="clip0">
+<rect width="292" height="56.4706" fill="white" transform="translate(0 11.7646)"/>
+</clipPath>
+<clipPath id="clip1">
+<rect width="292" height="56.4706" fill="white" transform="translate(0 11.7646)"/>
+</clipPath>
+</defs>
+</svg>

From 2dd692b710b6a9889f7ebdd7934a2108be5c5530 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Wed, 26 Oct 2022 18:10:26 -0500
Subject: [PATCH 096/230] Fix auto-detection of firestorm (Apple M1).

---
 frame/base/bli_cpuid.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c
index 527db1f5d..d967cc05d 100644
--- a/frame/base/bli_cpuid.c
+++ b/frame/base/bli_cpuid.c
@@ -781,7 +781,7 @@ uint32_t bli_cpuid_query
 		if ( bli_cpuid_has_features( ecx, FEATURE_MASK_AVX   ) ) *features |= FEATURE_AVX;
 		if ( bli_cpuid_has_features( ecx, FEATURE_MASK_FMA3  ) ) *features |= FEATURE_FMA3;
 
-		// Check whether the hardware supports xsave/xrestor/xsetbv/xgetbv AND 
+		// Check whether the hardware supports xsave/xrestor/xsetbv/xgetbv AND
 		// support for these is enabled by the OS. If so, then we proceed with
 		// checking that various register-state saving features are available.
 		if ( bli_cpuid_has_features( ecx, FEATURE_MASK_XGETBV ) )
@@ -813,7 +813,7 @@ uint32_t bli_cpuid_query
 
 			// The OS can manage the state of 512-bit zmm (AVX-512) registers
 			// only if the xcr[7:5] bits are set. If they are not set, then
-			// clear all feature bits related to AVX-512. 
+			// clear all feature bits related to AVX-512.
 			if ( !bli_cpuid_has_features( eax, XGETBV_MASK_XMM |
 				                               XGETBV_MASK_YMM |
 				                               XGETBV_MASK_ZMM ) )
@@ -829,7 +829,7 @@ uint32_t bli_cpuid_query
 
 			// The OS can manage the state of 256-bit ymm (AVX) registers
 			// only if the xcr[2] bit is set. If it is not set, then
-			// clear all feature bits related to AVX. 
+			// clear all feature bits related to AVX.
 			if ( !bli_cpuid_has_features( eax, XGETBV_MASK_XMM |
 				                               XGETBV_MASK_YMM ) )
 			{
@@ -842,7 +842,7 @@ uint32_t bli_cpuid_query
 			// The OS can manage the state of 128-bit xmm (SSE) registers
 			// only if the xcr[1] bit is set. If it is not set, then
 			// clear all feature bits related to SSE (which means the
-			// entire bitfield is clear). 
+			// entire bitfield is clear).
 			if ( !bli_cpuid_has_features( eax, XGETBV_MASK_XMM ) )
 			{
 				*features = 0;
@@ -1025,6 +1025,7 @@ static uint32_t get_coretype
 {
 	int implementer = 0x00, part = 0x000;
 	*features = FEATURE_NEON;
+    bool has_sve = FALSE;
 
 #ifdef __linux__
 	if ( getauxval( AT_HWCAP ) & HWCAP_CPUID )
@@ -1033,7 +1034,7 @@ static uint32_t get_coretype
 		// /sys/devices/system/cpu/cpu0/regs/identification/midr_el1
 		// and split out in /proc/cpuinfo (with a tab before the colon):
 		// CPU part	: 0x0a1
-		
+
 		uint64_t midr_el1;
 		__asm("mrs %0, MIDR_EL1" : "=r" (midr_el1));
 		/*
@@ -1047,8 +1048,8 @@ static uint32_t get_coretype
 		implementer = (midr_el1 >> 24) & 0xFF;
 		part        = (midr_el1 >> 4)  & 0xFFF;
 	}
-	
-	bool has_sve = getauxval( AT_HWCAP ) & HWCAP_SVE;
+
+	has_sve = getauxval( AT_HWCAP ) & HWCAP_SVE;
 	if (has_sve)
 		*features |= FEATURE_SVE;
 #endif //__linux__
@@ -1097,7 +1098,7 @@ static uint32_t get_coretype
 	// CAVIUM_CPU_PART_THUNDERX2 0x0AF
 	// CAVIUM_CPU_PART_THUNDERX3 0x0B8  // taken from OpenBLAS
 	//
-	// BRCM_CPU_PART_BRAHMA_B53 0x100 
+	// BRCM_CPU_PART_BRAHMA_B53 0x100
 	// BRCM_CPU_PART_VULCAN 0x516
 	//
 	// QCOM_CPU_PART_FALKOR_V1 0x800
@@ -1210,7 +1211,7 @@ uint32_t bli_cpuid_query
 
 #elif defined(__arm__) || defined(_M_ARM) || defined(_ARCH_PPC)
 
-/* 
+/*
    I can't easily find documentation to do this as for aarch64, though
    it presumably could be unearthed from Linux code.  However, on
    Linux 5.2 (and Androids's 3.4), /proc/cpuinfo has this sort of

From c803b03e52a7a6997a8d304a8cfa9acf7c1c555b Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Wed, 26 Oct 2022 18:20:00 -0500
Subject: [PATCH 097/230] Add check to disable armsve on Apple M1.

---
 configure | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/configure b/configure
index a53f25380..37399fbde 100755
--- a/configure
+++ b/configure
@@ -1335,6 +1335,17 @@ blacklistbu_add()
 	fi
 }
 
+blacklistos_add()
+{
+	# Check whether we've already blacklisted the given sub-config so
+	# we don't output redundant messages.
+	if [ $(is_in_list "$1" "${config_blist}") == "false" ]; then
+
+		echowarn "The operating system does not support building '$1'; adding to blacklist."
+		config_blist="${config_blist} $1"
+	fi
+}
+
 blacklist_init()
 {
 	config_blist=""
@@ -1989,6 +2000,13 @@ check_assembler()
 	fi
 }
 
+check_os()
+{
+	if [[ "$(uname -s)" == "Darwin" && "$(uname -m)" == "arm64" ]]; then
+		blacklistos_add "armsve"
+	fi
+}
+
 try_assemble()
 {
 	local cc cflags asm_src asm_base asm_bin rval
@@ -2886,6 +2904,9 @@ main()
 	get_binutils_version
 	check_assembler
 
+    # Check if there is any incompatibility due to the operating system.
+	check_os
+
 	# Remove duplicates and whitespace from the blacklist.
 	blacklist_cleanup
 

From aeb5f0cc19665456e990a7ffccdb09da2e3f504b Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Thu, 27 Oct 2022 12:39:11 -0500
Subject: [PATCH 098/230] Omnibus PR - Oct 2022 (#678)

Details:
- This is an "omnibus" commit, consisting of multiple medium-sized
  commits that affect non-trivial aspects of BLIS. The major highlights:
  - Relocated the pba, sba pool (from the rntm_t), and mem_t (from the
    cntl_t) to the thrinfo_t object. This allows the rntm_t to be
    effectively const (although it is sometimes copied internally and
    modified to reflect different ways of parallelism). Moving the mem_t
    sets the stage for sharing a global control tree amongst all
    threads.
  - De-templatized the macrokernels for gemmt, trmm, and trsm to match
    the macrokernel for gemm, which has been de-templatized since
    54fa28b.
  - Reimplemented bli_l3_determine_kc() by separating out the logic for
    adjusting KC based on MR/NR for triangular A and/or B into a new
    function, bli_l3_adjust_kc(). For now, this function is still called
    from bli_l3_determine_kc(), but in the future we plan to have it
    called once when constructing the control tree.
  - Refactored the level-3 thread decorator into two parts:
    - One part deals only with launching threads, each one calling a
      generic thread entry function. This code resides in frame/thread
      and constitutes the definition of bli_thread_launch(). Note that
      it is specific to the threading implementation (OpenMP, pthreads,
      single, etc.).
    - The other part deals with passing the matrix operands and related
      information into bli_thread_launch(). This is the "l3 decorator"
      and now resides in frame/3. It is agnostic to the threading
      implementation.
  - Modified the "level" of the thread control tree passed in at each
    operation. Previously, each operation (e.g. bli_gemm_blk_var1()) was
    passed in a communicator representing the active thread teams which
    would share the available work. Now, the *parent* thread comm is
    passed in. The operation then grabs the child comm and uses it to
    partition the work. The difference is in bli_trsm_blk_var1(), where
    there are now two child nodes for this single operation (i.e. the
    thread control tree is split one level above where the control tree
    is). The sub-prenode is used for the trsm subproblem while the
    normal sub-node is used for the gemm part. Importantly, the parent
    comm is used for the barrier between them.
- Removed cntl_t* arguments from bli_*_front() functions. These will be
  added back in the future when the control tree's creation is moved so
  that it happens much sooner (provided that bli_*_front() have not been
  absorbed into their respective bli_*_ex() functions).
- Renamed various bli_thread_*() query functions to bli_thrinfo_*(),
  for consistency. This includes _num_threads(), _thread_id(), _n_way(),
  _work_id(), _sba_pool(), _pba(), _mem(), _barrier(), _broadcast(), and
  _am_chief().
- Removed extraneous barrier from _blk_var3() of gemm and trsm.
- Fixed a typo in bli_type_defs.h where BLIS_BLAS_INT_TYPE_SIZE was
  misspelled.
---
 addon/gemmd/attic/bao_gemmd_bp_var2.c         |   10 +-
 addon/gemmd/bao_gemmd_bp_var1.c               |   10 +-
 addon/gemmd/bao_l3_packm_a.c                  |   10 +-
 addon/gemmd/bao_l3_packm_b.c                  |   10 +-
 addon/gemmd/bao_l3_packm_var1.c               |    4 +-
 addon/gemmd/bao_l3_packm_var2.c               |    4 +-
 build/libblis-symbols.def                     | 1484 ++---------------
 frame/1m/bli_l1m_oft_var.h                    |    5 +-
 frame/1m/bli_l1m_tapi.c                       |   24 +-
 frame/1m/bli_l1m_unb_var1.c                   |   15 +-
 frame/1m/bli_l1m_unb_var1.h                   |   15 +-
 frame/1m/packm/bli_packm.h                    |    1 +
 frame/1m/packm/bli_packm_alloc.c              |   50 +-
 frame/1m/packm/bli_packm_alloc.h              |   13 +-
 frame/1m/packm/bli_packm_blk_var1.c           |   11 +-
 frame/1m/packm/bli_packm_blk_var1.h           |   11 +-
 frame/1m/packm/bli_packm_cntl.c               |    6 +-
 frame/1m/packm/bli_packm_cntl.h               |    2 +-
 frame/1m/packm/bli_packm_init.c               |    7 +-
 frame/1m/packm/bli_packm_init.h               |    5 +-
 frame/1m/packm/bli_packm_int.c                |   11 +-
 frame/1m/packm/bli_packm_int.h                |    5 +-
 frame/1m/packm/bli_packm_struc_cxk_md.c       |    2 -
 frame/1m/packm/bli_packm_thrinfo.c            |   75 -
 frame/1m/packm/bli_packm_thrinfo.h            |   40 -
 frame/1m/unpackm/bli_unpackm_cntl.c           |   10 +-
 frame/1m/unpackm/bli_unpackm_cntl.h           |    8 +-
 frame/1m/unpackm/bli_unpackm_int.c            |    4 +-
 frame/3/bli_l3.h                              |    4 +
 frame/3/bli_l3_blocksize.c                    |  297 +---
 frame/3/bli_l3_blocksize.h                    |   57 +-
 frame/3/bli_l3_cntl.c                         |   25 +-
 frame/3/bli_l3_cntl.h                         |   11 +-
 frame/3/bli_l3_decor.c                        |  298 ++++
 frame/{thread => 3}/bli_l3_decor.h            |   33 +-
 frame/3/bli_l3_int.c                          |   15 +-
 frame/3/bli_l3_int.h                          |    3 +-
 frame/3/bli_l3_oapi_ex.c                      |   64 +-
 frame/3/bli_l3_oapi_ex.h                      |    8 +-
 frame/3/bli_l3_oft.h                          |    8 +-
 frame/3/bli_l3_oft_var.h                      |    3 +-
 frame/3/bli_l3_packab.c                       |   10 +-
 frame/3/bli_l3_packab.h                       |    6 +-
 frame/3/bli_l3_sup.c                          |   16 +-
 frame/3/bli_l3_sup.h                          |    4 +-
 .../bli_l3_decor.c => 3/bli_l3_sup_decor.c}   |  161 +-
 frame/{thread => 3}/bli_l3_sup_decor.h        |   29 +-
 frame/3/bli_l3_sup_int.c                      |   40 +-
 frame/3/bli_l3_sup_int.h                      |    4 +-
 frame/3/bli_l3_sup_oft.h                      |    2 +-
 frame/3/bli_l3_sup_packm.c                    |   68 +-
 frame/3/bli_l3_sup_packm.h                    |    8 +-
 frame/3/bli_l3_sup_packm_var.c                |   32 +-
 frame/3/bli_l3_sup_var1n2m.c                  |  145 +-
 frame/3/bli_l3_sup_vars.h                     |    2 +-
 frame/3/bli_l3_tapi_ex.c                      |   18 +-
 frame/3/bli_l3_tapi_ex.h                      |   16 +-
 frame/3/bli_l3_thrinfo.c                      |  389 +++--
 frame/3/bli_l3_thrinfo.h                      |   63 +-
 frame/3/gemm/bli_gemm_blk_var1.c              |   17 +-
 frame/3/gemm/bli_gemm_blk_var2.c              |   17 +-
 frame/3/gemm/bli_gemm_blk_var3.c              |   20 +-
 frame/3/gemm/bli_gemm_cntl.c                  |  158 +-
 frame/3/gemm/bli_gemm_cntl.h                  |   18 +-
 frame/3/gemm/bli_gemm_front.c                 |    6 +-
 frame/3/gemm/bli_gemm_front.h                 |    5 +-
 frame/3/gemm/bli_gemm_ker_var2.c              |   14 +-
 frame/3/gemm/bli_gemm_var.h                   |   11 +-
 frame/3/gemm/{ => other}/bli_gemm_ker_var1.c  |    0
 frame/3/gemm/other/bli_gemm_ker_var2.c        |    8 +-
 frame/3/gemm/other/bli_gemm_ker_var2rr.c      |    8 +-
 frame/3/gemm/other/bli_gemm_ker_var2sl.c      |    8 +-
 frame/3/gemmt/bli_gemmt_front.c               |    6 +-
 frame/3/gemmt/bli_gemmt_front.h               |    3 +-
 frame/3/gemmt/bli_gemmt_l_ker_var2.c          |  757 ++++-----
 frame/3/gemmt/bli_gemmt_u_ker_var2.c          |  763 ++++-----
 frame/3/gemmt/bli_gemmt_var.h                 |    4 +-
 frame/3/gemmt/bli_gemmt_x_ker_var2.c          |    9 +-
 frame/3/gemmt/other/bli_gemmt_l_ker_var2.c    |    8 +-
 frame/3/gemmt/other/bli_gemmt_u_ker_var2.c    |    8 +-
 frame/3/hemm/bli_hemm_front.c                 |    6 +-
 frame/3/hemm/bli_hemm_front.h                 |    3 +-
 frame/3/symm/bli_symm_front.c                 |    6 +-
 frame/3/symm/bli_symm_front.h                 |    3 +-
 frame/3/trmm/bli_trmm_front.c                 |    6 +-
 frame/3/trmm/bli_trmm_front.h                 |    3 +-
 frame/3/trmm/bli_trmm_ll_ker_var2.c           |  551 +++---
 frame/3/trmm/bli_trmm_lu_ker_var2.c           |  566 +++----
 frame/3/trmm/bli_trmm_rl_ker_var2.c           |  676 +++-----
 frame/3/trmm/bli_trmm_ru_ker_var2.c           |  714 ++++----
 frame/3/trmm/bli_trmm_var.h                   |    4 +-
 frame/3/trmm/bli_trmm_xx_ker_var2.c           |   11 +-
 frame/3/trmm/other/bli_trmm_ll_ker_var2.c     |    4 +-
 frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c   |    8 +-
 frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c   |    8 +-
 frame/3/trmm/other/bli_trmm_lu_ker_var2.c     |    4 +-
 frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c   |    8 +-
 frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c   |    8 +-
 frame/3/trmm/other/bli_trmm_rl_ker_var2.c     |    4 +-
 frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c   |    8 +-
 frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c   |    8 +-
 frame/3/trmm/other/bli_trmm_ru_ker_var2.c     |    4 +-
 frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c   |    8 +-
 frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c   |    8 +-
 frame/3/trmm3/bli_trmm3_front.c               |    6 +-
 frame/3/trmm3/bli_trmm3_front.h               |    3 +-
 frame/3/trsm/bli_trsm_blk_var1.c              |   28 +-
 frame/3/trsm/bli_trsm_blk_var2.c              |   17 +-
 frame/3/trsm/bli_trsm_blk_var3.c              |   25 +-
 frame/3/trsm/bli_trsm_cntl.c                  |  101 +-
 frame/3/trsm/bli_trsm_cntl.h                  |   13 +-
 frame/3/trsm/bli_trsm_front.c                 |    6 +-
 frame/3/trsm/bli_trsm_front.h                 |    3 +-
 frame/3/trsm/bli_trsm_ll_ker_var2.c           |  625 +++----
 frame/3/trsm/bli_trsm_lu_ker_var2.c           |  658 +++-----
 frame/3/trsm/bli_trsm_rl_ker_var2.c           |  576 +++----
 frame/3/trsm/bli_trsm_ru_ker_var2.c           |  641 +++----
 frame/3/trsm/bli_trsm_var.h                   |   44 +-
 frame/3/trsm/bli_trsm_xx_ker_var2.c           |   11 +-
 frame/3/trsm/other/bli_trsm_ll_ker_var2.c     |    4 +-
 frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c   |    4 +-
 frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c   |    4 +-
 frame/3/trsm/other/bli_trsm_lu_ker_var2.c     |    4 +-
 frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c   |    4 +-
 frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c   |    4 +-
 frame/3/trsm/other/bli_trsm_rl_ker_var2.c     |    4 +-
 frame/3/trsm/other/bli_trsm_ru_ker_var2.c     |    4 +-
 frame/base/bli_cntl.c                         |  140 +-
 frame/base/bli_cntl.h                         |   43 +-
 frame/base/bli_cntx.h                         |    2 -
 frame/base/bli_mem.h                          |   15 +-
 frame/base/bli_pba.c                          |   29 +-
 frame/base/bli_pba.h                          |   16 +-
 frame/base/bli_rntm.h                         |   44 +-
 frame/base/bli_sba.c                          |  144 +-
 frame/base/bli_sba.h                          |   13 +-
 frame/compat/extra/bla_gemm3m.c               |    3 +-
 frame/include/bli_extern_defs.h               |    2 -
 frame/include/bli_oapi_ex.h                   |    2 +-
 frame/include/bli_tapi_ex.h                   |    2 +-
 frame/include/bli_type_defs.h                 |   28 +-
 frame/include/level0/1e/bli_copy1es.h         |   28 +-
 frame/include/level0/1e/bli_copyj1es.h        |   28 +-
 frame/include/level0/1e/bli_scal21es.h        |   86 +-
 frame/include/level0/1e/bli_scal2j1es.h       |   86 +-
 frame/include/level0/ri/bli_copyris.h         |   18 +-
 frame/include/level0/ri/bli_scal2jris.h       |    4 +
 frame/include/level0/ri/bli_scal2ris.h        |    4 +
 frame/thread/bli_l3_decor_openmp.c            |  253 ---
 frame/thread/bli_l3_decor_pthreads.c          |  264 ---
 frame/thread/bli_l3_decor_pthreads.h          |   61 -
 frame/thread/bli_l3_decor_single.c            |  165 --
 frame/thread/bli_l3_sup_decor_openmp.c        |  136 --
 frame/thread/bli_l3_sup_decor_pthreads.c      |  225 ---
 frame/thread/bli_l3_sup_decor_pthreads.h      |   60 -
 frame/thread/bli_l3_sup_decor_single.c        |  138 --
 frame/thread/bli_thrcomm.c                    |   76 +-
 frame/thread/bli_thrcomm.h                    |   15 +-
 frame/thread/bli_thread.c                     |   55 +-
 frame/thread/bli_thread.h                     |   37 +-
 ..._l3_decor_openmp.h => bli_thread_openmp.c} |   46 +-
 ...sup_decor_openmp.h => bli_thread_openmp.h} |   18 +-
 frame/thread/bli_thread_pthreads.c            |  128 ++
 ...3_decor_single.h => bli_thread_pthreads.h} |   24 +-
 .../thread/bli_thread_single.c                |   29 +-
 ...sup_decor_single.h => bli_thread_single.h} |   18 +-
 frame/thread/bli_thrinfo.c                    |  629 ++-----
 frame/thread/bli_thrinfo.h                    |  176 +-
 frame/thread/bli_thrinfo_sup.c                |  290 ----
 frame/thread/bli_thrinfo_sup.h                |   66 -
 frame/util/bli_util_tapi.c                    |   14 +-
 sandbox/gemmlike/attic/bls_gemm_bp_var2.c     |   10 +-
 sandbox/gemmlike/bli_gemm_ex.c                |    4 +-
 sandbox/gemmlike/bls_gemm.c                   |   14 +-
 sandbox/gemmlike/bls_gemm.h                   |    2 +-
 sandbox/gemmlike/bls_gemm_bp_var1.c           |  102 +-
 .../gemmlike/bls_l3_decor.c                   |  156 +-
 sandbox/gemmlike/{thread => }/bls_l3_decor.h  |   25 -
 sandbox/gemmlike/bls_l3_packm_a.c             |   67 +-
 sandbox/gemmlike/bls_l3_packm_a.h             |   21 -
 sandbox/gemmlike/bls_l3_packm_b.c             |   67 +-
 sandbox/gemmlike/bls_l3_packm_b.h             |   21 -
 sandbox/gemmlike/bls_l3_packm_var.h           |    2 +-
 sandbox/gemmlike/bls_l3_packm_var1.c          |   11 +-
 sandbox/gemmlike/bls_l3_packm_var2.c          |   11 +-
 sandbox/gemmlike/bls_l3_packm_var3.c          |   11 +-
 sandbox/gemmlike/thread/bls_l3_decor.c        |  148 --
 sandbox/gemmlike/thread/bls_l3_decor_openmp.c |  131 --
 sandbox/gemmlike/thread/bls_l3_decor_openmp.h |   57 -
 .../gemmlike/thread/bls_l3_decor_pthreads.c   |  222 ---
 .../gemmlike/thread/bls_l3_decor_pthreads.h   |   60 -
 sandbox/gemmlike/thread/bls_l3_decor_single.c |  137 --
 sandbox/old/ref99/old/packm/blx_l3_packm.c    |   10 +-
 .../old/ref99/old/vars/blx_gemm_blk_var3.c    |    4 +-
 .../old/ref99/old/vars/blx_gemm_ker_var2.c    |    8 +-
 .../old/vars/other/blx_gemm_ker_var2rr.c      |    8 +-
 .../old/vars/other/blx_gemm_ker_var2sl.c      |    8 +-
 sandbox/power10/bli_gemm_ex.c                 |    4 +-
 test/syrk_diagonal/syrk_diagonal_example2.c   |    4 +-
 test/syrk_diagonal/syrk_diagonal_example2.cxx |    4 +-
 test/tensor_contraction/tcontract_example.cxx |   16 +-
 testsuite/src/test_gemm_ukr.c                 |   18 +-
 testsuite/src/test_gemmtrsm_ukr.c             |   28 +-
 testsuite/src/test_libblis.c                  |   20 +-
 testsuite/src/test_libblis.h                  |    2 +-
 testsuite/src/test_trsm_ukr.c                 |   18 +-
 206 files changed, 5013 insertions(+), 11035 deletions(-)
 delete mode 100644 frame/1m/packm/bli_packm_thrinfo.c
 create mode 100644 frame/3/bli_l3_decor.c
 rename frame/{thread => 3}/bli_l3_decor.h (78%)
 rename frame/{thread/bli_l3_decor.c => 3/bli_l3_sup_decor.c} (59%)
 rename frame/{thread => 3}/bli_l3_sup_decor.h (77%)
 rename frame/3/gemm/{ => other}/bli_gemm_ker_var1.c (100%)
 delete mode 100644 frame/thread/bli_l3_decor_openmp.c
 delete mode 100644 frame/thread/bli_l3_decor_pthreads.c
 delete mode 100644 frame/thread/bli_l3_decor_pthreads.h
 delete mode 100644 frame/thread/bli_l3_decor_single.c
 delete mode 100644 frame/thread/bli_l3_sup_decor_openmp.c
 delete mode 100644 frame/thread/bli_l3_sup_decor_pthreads.c
 delete mode 100644 frame/thread/bli_l3_sup_decor_pthreads.h
 delete mode 100644 frame/thread/bli_l3_sup_decor_single.c
 rename frame/thread/{bli_l3_decor_openmp.h => bli_thread_openmp.c} (69%)
 rename frame/thread/{bli_l3_sup_decor_openmp.h => bli_thread_openmp.h} (82%)
 create mode 100644 frame/thread/bli_thread_pthreads.c
 rename frame/thread/{bli_l3_decor_single.h => bli_thread_pthreads.h} (82%)
 rename sandbox/gemmlike/thread/bls_l3_decor_single.h => frame/thread/bli_thread_single.c (77%)
 rename frame/thread/{bli_l3_sup_decor_single.h => bli_thread_single.h} (81%)
 delete mode 100644 frame/thread/bli_thrinfo_sup.c
 delete mode 100644 frame/thread/bli_thrinfo_sup.h
 rename frame/thread/bli_l3_sup_decor.c => sandbox/gemmlike/bls_l3_decor.c (50%)
 rename sandbox/gemmlike/{thread => }/bls_l3_decor.h (79%)
 delete mode 100644 sandbox/gemmlike/thread/bls_l3_decor.c
 delete mode 100644 sandbox/gemmlike/thread/bls_l3_decor_openmp.c
 delete mode 100644 sandbox/gemmlike/thread/bls_l3_decor_openmp.h
 delete mode 100644 sandbox/gemmlike/thread/bls_l3_decor_pthreads.c
 delete mode 100644 sandbox/gemmlike/thread/bls_l3_decor_pthreads.h
 delete mode 100644 sandbox/gemmlike/thread/bls_l3_decor_single.c

diff --git a/addon/gemmd/attic/bao_gemmd_bp_var2.c b/addon/gemmd/attic/bao_gemmd_bp_var2.c
index 9139e89b1..dbccedc35 100644
--- a/addon/gemmd/attic/bao_gemmd_bp_var2.c
+++ b/addon/gemmd/attic/bao_gemmd_bp_var2.c
@@ -386,8 +386,8 @@ void PASTECH2(bao_,ch,varname) \
 				/* Query the number of threads and thread ids for the JR loop.
 				   NOTE: These values are only needed when computing the next
 				   micropanel of B. */ \
-				const dim_t jr_nt  = bli_thread_n_way( thread_jr ); \
-				const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
+				const dim_t jr_nt  = bli_thrinfo_n_way( thread_jr ); \
+				const dim_t jr_tid = bli_thrinfo_work_id( thread_jr ); \
 \
 				/* Compute number of primary and leftover components of the JR loop. */ \
 				dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
@@ -416,8 +416,8 @@ void PASTECH2(bao_,ch,varname) \
 					/* Query the number of threads and thread ids for the IR loop.
 					   NOTE: These values are only needed when computing the next
 					   micropanel of A. */ \
-					const dim_t ir_nt  = bli_thread_n_way( thread_ir ); \
-					const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
+					const dim_t ir_nt  = bli_thrinfo_n_way( thread_ir ); \
+					const dim_t ir_tid = bli_thrinfo_work_id( thread_ir ); \
 \
 					/* Compute number of primary and leftover components of the IR loop. */ \
 					dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
@@ -476,7 +476,7 @@ void PASTECH2(bao_,ch,varname) \
 			/* This barrier is needed to prevent threads from starting to pack
 			   the next row panel of B before the current row panel is fully
 			   computed upon. */ \
-			bli_thread_barrier( thread_pb ); \
+			bli_thrinfo_barrier( thread_pb ); \
 		} \
 	} \
 \
diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/gemmd/bao_gemmd_bp_var1.c
index e3f47982c..b475218e9 100644
--- a/addon/gemmd/bao_gemmd_bp_var1.c
+++ b/addon/gemmd/bao_gemmd_bp_var1.c
@@ -370,8 +370,8 @@ void PASTECH2(bao_,ch,varname) \
 				/* Query the number of threads and thread ids for the JR loop.
 				   NOTE: These values are only needed when computing the next
 				   micropanel of B. */ \
-				const dim_t jr_nt  = bli_thread_n_way( thread_jr ); \
-				const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
+				const dim_t jr_nt  = bli_thrinfo_n_way( thread_jr ); \
+				const dim_t jr_tid = bli_thrinfo_work_id( thread_jr ); \
 \
 				/* Compute number of primary and leftover components of the JR loop. */ \
 				dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
@@ -400,8 +400,8 @@ void PASTECH2(bao_,ch,varname) \
 					/* Query the number of threads and thread ids for the IR loop.
 					   NOTE: These values are only needed when computing the next
 					   micropanel of A. */ \
-					const dim_t ir_nt  = bli_thread_n_way( thread_ir ); \
-					const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
+					const dim_t ir_nt  = bli_thrinfo_n_way( thread_ir ); \
+					const dim_t ir_tid = bli_thrinfo_work_id( thread_ir ); \
 \
 					/* Compute number of primary and leftover components of the IR loop. */ \
 					dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
@@ -458,7 +458,7 @@ void PASTECH2(bao_,ch,varname) \
 			/* This barrier is needed to prevent threads from starting to pack
 			   the next row panel of B before the current row panel is fully
 			   computed upon. */ \
-			bli_thread_barrier( rntm, thread_pb ); \
+			bli_thrinfo_barrier( thread_pb ); \
 		} \
 	} \
 \
diff --git a/addon/gemmd/bao_l3_packm_a.c b/addon/gemmd/bao_l3_packm_a.c
index 1d6502884..b33fd9089 100644
--- a/addon/gemmd/bao_l3_packm_a.c
+++ b/addon/gemmd/bao_l3_packm_a.c
@@ -61,7 +61,7 @@ void PASTECH2(bao_,ch,opname) \
 \
 	/* Barrier to make sure all threads are caught up and ready to begin the
 	   packm stage. */ \
-	bli_thread_barrier( rntm, thread ); \
+	bli_thrinfo_barrier( thread ); \
 \
 	/* Compute the size of the memory block eneded. */ \
 	siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
@@ -90,7 +90,7 @@ void PASTECH2(bao_,ch,opname) \
 \
 		/* Broadcast the address of the chief thread's passed-in mem_t to all
 		   threads. */ \
-		mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
+		mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \
 \
 		/* Non-chief threads: Copy the contents of the chief thread's
 		   passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -139,7 +139,7 @@ void PASTECH2(bao_,ch,opname) \
 \
 			/* Broadcast the address of the chief thread's passed-in mem_t
 			   to all threads. */ \
-			mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
+			mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \
 \
 			/* Non-chief threads: Copy the contents of the chief thread's
 			   passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -313,13 +313,13 @@ void PASTECH2(bao_,ch,opname) \
 	  d,  incd, \
 	  a,  rs_a,  cs_a, \
 	  *p, *rs_p, *cs_p, \
-		  pd_p,  *ps_p, \
+	       pd_p, *ps_p, \
 	  cntx, \
 	  thread  \
 	); \
 \
 	/* Barrier so that packing is done before computation. */ \
-	bli_thread_barrier( rntm, thread ); \
+	bli_thrinfo_barrier( thread ); \
 }
 
 //INSERT_GENTFUNC_BASIC0( packm_a )
diff --git a/addon/gemmd/bao_l3_packm_b.c b/addon/gemmd/bao_l3_packm_b.c
index 8d020007c..76860c8ee 100644
--- a/addon/gemmd/bao_l3_packm_b.c
+++ b/addon/gemmd/bao_l3_packm_b.c
@@ -61,7 +61,7 @@ void PASTECH2(bao_,ch,opname) \
 \
 	/* Barrier to make sure all threads are caught up and ready to begin the
 	   packm stage. */ \
-	bli_thread_barrier( rntm, thread ); \
+	bli_thrinfo_barrier( thread ); \
 \
 	/* Compute the size of the memory block eneded. */ \
 	siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
@@ -90,7 +90,7 @@ void PASTECH2(bao_,ch,opname) \
 \
 		/* Broadcast the address of the chief thread's passed-in mem_t to all
 		   threads. */ \
-		mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
+		mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \
 \
 		/* Non-chief threads: Copy the contents of the chief thread's
 		   passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -139,7 +139,7 @@ void PASTECH2(bao_,ch,opname) \
 \
 			/* Broadcast the address of the chief thread's passed-in mem_t
 			   to all threads. */ \
-			mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
+			mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \
 \
 			/* Non-chief threads: Copy the contents of the chief thread's
 			   passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -313,13 +313,13 @@ void PASTECH2(bao_,ch,opname) \
 	  d,  incd, \
 	  b,  rs_b,  cs_b, \
 	  *p, *rs_p, *cs_p, \
-		  pd_p,  *ps_p, \
+	       pd_p, *ps_p, \
 	  cntx, \
 	  thread  \
 	); \
 \
 	/* Barrier so that packing is done before computation. */ \
-	bli_thread_barrier( rntm, thread ); \
+	bli_thrinfo_barrier( thread ); \
 }
 
 //INSERT_GENTFUNC_BASIC0( packm_b )
diff --git a/addon/gemmd/bao_l3_packm_var1.c b/addon/gemmd/bao_l3_packm_var1.c
index 24c0a2cc1..d002dc6bf 100644
--- a/addon/gemmd/bao_l3_packm_var1.c
+++ b/addon/gemmd/bao_l3_packm_var1.c
@@ -127,8 +127,8 @@ void PASTECH2(bao_,ch,varname) \
 \
 	/* Query the number of threads and thread ids from the current thread's
 	   packm thrinfo_t node. */ \
-	const dim_t nt  = bli_thread_n_way( thread ); \
-	const dim_t tid = bli_thread_work_id( thread ); \
+	const dim_t nt  = bli_thrinfo_n_way( thread ); \
+	const dim_t tid = bli_thrinfo_work_id( thread ); \
 \
 	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
 	( void )nt; \
diff --git a/addon/gemmd/bao_l3_packm_var2.c b/addon/gemmd/bao_l3_packm_var2.c
index 830e499b3..49e9d1941 100644
--- a/addon/gemmd/bao_l3_packm_var2.c
+++ b/addon/gemmd/bao_l3_packm_var2.c
@@ -127,8 +127,8 @@ void PASTECH2(bao_,ch,varname) \
 \
 	/* Query the number of threads and thread ids from the current thread's
 	   packm thrinfo_t node. */ \
-	const dim_t nt  = bli_thread_n_way( thread ); \
-	const dim_t tid = bli_thread_work_id( thread ); \
+	const dim_t nt  = bli_thrinfo_n_way( thread ); \
+	const dim_t tid = bli_thrinfo_work_id( thread ); \
 \
 	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
 	( void )nt; \
diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def
index 8d29d73b2..db20ffbca 100644
--- a/build/libblis-symbols.def
+++ b/build/libblis-symbols.def
@@ -1,122 +1,69 @@
 EXPORTS
 bli_abort
 bli_absqsc
-bli_absqsc_check
-bli_absqsc_qfp
 bli_acquire_mij
 bli_acquire_mpart
 bli_acquire_mpart_b2t
 bli_acquire_mpart_br2tl
 bli_acquire_mpart_l2r
-bli_acquire_mpart_l2r_check
 bli_acquire_mpart_mdim
 bli_acquire_mpart_mndim
 bli_acquire_mpart_ndim
 bli_acquire_mpart_r2l
 bli_acquire_mpart_t2b
-bli_acquire_mpart_t2b_check
 bli_acquire_mpart_tl2br
-bli_acquire_mpart_tl2br_check
 bli_acquire_vi
 bli_acquire_vpart_b2f
 bli_acquire_vpart_f2b
 bli_addd
-bli_addd_check
 bli_addd_ex
-bli_addd_ex_qfp
 bli_addm
-bli_addm_check
 bli_addm_ex
-bli_addm_ex_qfp
 bli_addsc
-bli_addsc_check
-bli_addsc_qfp
 bli_addv
-bli_addv_check
 bli_addv_ex
-bli_addv_ex_qfp
-bli_adjust_strides
 bli_align_dim_to_mult
 bli_align_dim_to_size
 bli_align_ptr_to_size
 bli_amaxv
-bli_amaxv_check
 bli_amaxv_ex
-bli_amaxv_ex_qfp
-bli_apool_alloc_block
-bli_apool_array_elem
-bli_apool_checkin_array
-bli_apool_checkout_array
-bli_apool_finalize
-bli_apool_free_block
-bli_apool_grow
-bli_apool_init
 bli_arch_query_id
-bli_arch_set_id
-bli_arch_set_id_once
 bli_arch_string
-bli_array_elem
-bli_array_finalize
-bli_array_init
-bli_array_resize
-bli_array_set_elem
 bli_asumv
-bli_asumv_check
 bli_asumv_ex
-bli_asumv_ex_qfp
 bli_axpbyv
-bli_axpbyv_check
 bli_axpbyv_ex
-bli_axpbyv_ex_qfp
 bli_axpy2v
-bli_axpy2v_check
 bli_axpy2v_ex
-bli_axpy2v_ex_qfp
 bli_axpyd
-bli_axpyd_check
 bli_axpyd_ex
-bli_axpyd_ex_qfp
 bli_axpyf
-bli_axpyf_check
 bli_axpyf_ex
-bli_axpyf_ex_qfp
 bli_axpym
-bli_axpym_check
 bli_axpym_ex
-bli_axpym_ex_qfp
 bli_axpyv
-bli_axpyv_check
 bli_axpyv_ex
-bli_axpyv_ex_qfp
 bli_blksz_create
 bli_blksz_create_ed
 bli_blksz_free
 bli_blksz_init
 bli_blksz_init_easy
 bli_blksz_init_ed
-bli_blksz_reduce_def_to
-bli_blksz_reduce_max_to
 bli_cabsqsc
 bli_caddd
 bli_caddd_ex
 bli_caddm
 bli_caddm_ex
-bli_caddm_unb_var1
 bli_caddsc
 bli_caddv
 bli_caddv_ex
-bli_calloc_intl
 bli_camaxv
 bli_camaxv_ex
 bli_castm
-bli_castm_check
 bli_castnzm
-bli_castnzm_check
 bli_castv
-bli_castv_check
 bli_casumv
 bli_casumv_ex
-bli_casumv_unb_var1
 bli_caxpbyv
 bli_caxpbyv_ex
 bli_caxpy2v
@@ -127,33 +74,24 @@ bli_caxpyf
 bli_caxpyf_ex
 bli_caxpym
 bli_caxpym_ex
-bli_caxpym_unb_var1
 bli_caxpyv
 bli_caxpyv_ex
 bli_cccastm
 bli_cccastnzm
 bli_cccastv
 bli_cccopysc
-bli_ccgemm_ker_var2_md
 bli_ccopyd
 bli_ccopyd_ex
 bli_ccopym
 bli_ccopym_ex
-bli_ccopym_unb_var1
 bli_ccopyv
 bli_ccopyv_ex
-bli_ccpackm_blk_var1_md
-bli_ccpackm_cxk_1e_md
-bli_ccpackm_cxk_1r_md
-bli_ccpackm_struc_cxk_md
 bli_ccxpbym_md
 bli_ccxpbym_md_ex
-bli_ccxpbym_md_unb_var1
 bli_cdcastm
 bli_cdcastnzm
 bli_cdcastv
 bli_cdcopysc
-bli_cdgemm_ker_var2_md
 bli_cdivsc
 bli_cdotaxpyv
 bli_cdotaxpyv_ex
@@ -165,288 +103,111 @@ bli_cdotxf
 bli_cdotxf_ex
 bli_cdotxv
 bli_cdotxv_ex
-bli_cdpackm_blk_var1_md
-bli_cdpackm_cxk_1e_md
-bli_cdpackm_cxk_1r_md
-bli_cdpackm_struc_cxk_md
 bli_cdxpbym_md
 bli_cdxpbym_md_ex
-bli_cdxpbym_md_unb_var1
+bli_ceqm
+bli_ceqsc
+bli_ceqv
 bli_cfprintm
 bli_cfprintv
 bli_cgemm
-bli_cgemm1m
-bli_cgemm3m1
-bli_cgemm3mh
-bli_cgemm4m1
-bli_cgemm4mb
-bli_cgemm4mb_ker_var2
-bli_cgemm4mh
 bli_cgemm_ex
-bli_cgemm_ker_var2
-bli_cgemm_md_c2r_ref
-bli_cgemmtrsm_l_ukernel
-bli_cgemmtrsm_u_ukernel
-bli_cgemm_ukernel
+bli_cgemmt
+bli_cgemmt_ex
 bli_cgemv
 bli_cgemv_ex
-bli_cgemv_unb_var1
-bli_cgemv_unb_var2
-bli_cgemv_unf_var1
-bli_cgemv_unf_var2
 bli_cger
 bli_cger_ex
-bli_cger_unb_var1
-bli_cger_unb_var2
 bli_cgetijm
+bli_cgetijv
 bli_cgetsc
-bli_check_alignment_is_mult_of_ptr_size
-bli_check_alignment_is_power_of_two
-bli_check_conformal_dims
-bli_check_consistent_datatypes
-bli_check_consistent_object_datatypes
-bli_check_consistent_object_precisions
-bli_check_consistent_precisions
-bli_check_datatype_real_proj_of
-bli_check_equal_vector_lengths
 bli_check_error_code_helper
-bli_check_floating_datatype
-bli_check_floating_object
-bli_check_general_object
-bli_check_hermitian_object
-bli_check_if_exhausted_pool
-bli_check_integer_datatype
-bli_check_integer_object
-bli_check_level3_dims
-bli_check_matrix_object
-bli_check_matrix_strides
-bli_check_nonconstant_datatype
-bli_check_nonconstant_object
-bli_check_noninteger_datatype
-bli_check_noninteger_object
-bli_check_nonunit_diag
-bli_check_null_pointer
-bli_check_object_alias_of
-bli_check_object_buffer
-bli_check_object_diag_offset_equals
-bli_check_object_length_equals
-bli_check_object_real_proj_of
-bli_check_object_struc
-bli_check_object_valid_datatype
-bli_check_object_width_equals
-bli_check_packm_schema_on_unpack
-bli_check_packv_schema_on_unpack
-bli_check_real_datatype
-bli_check_real_object
-bli_check_real_valued_object
-bli_check_scalar_object
-bli_check_square_object
-bli_check_sufficient_stack_buf_size
-bli_check_symmetric_object
-bli_check_triangular_object
-bli_check_upper_or_lower_object
-bli_check_valid_1x3_subpart
-bli_check_valid_3x1_subpart
-bli_check_valid_3x3_subpart
-bli_check_valid_arch_id
-bli_check_valid_cntl
-bli_check_valid_datatype
-bli_check_valid_diag
-bli_check_valid_error_level
-bli_check_valid_kc_mod_mult
-bli_check_valid_malloc_buf
-bli_check_valid_mc_mod_mult
-bli_check_valid_nc_mod_mult
-bli_check_valid_packbuf
-bli_check_valid_side
-bli_check_valid_trans
-bli_check_valid_uplo
-bli_check_vector_dim_equals
-bli_check_vector_object
 bli_chemm
-bli_chemm1m
-bli_chemm3m1
-bli_chemm3mh
-bli_chemm4m1
-bli_chemm4mh
 bli_chemm_ex
 bli_chemv
 bli_chemv_ex
-bli_chemv_unb_var1
-bli_chemv_unb_var2
-bli_chemv_unb_var3
-bli_chemv_unb_var4
-bli_chemv_unf_var1
-bli_chemv_unf_var1a
-bli_chemv_unf_var3
-bli_chemv_unf_var3a
 bli_cher
 bli_cher2
 bli_cher2_ex
 bli_cher2k
-bli_cher2k1m
-bli_cher2k3m1
-bli_cher2k3mh
-bli_cher2k4m1
-bli_cher2k4mh
 bli_cher2k_ex
-bli_cher2_unb_var1
-bli_cher2_unb_var2
-bli_cher2_unb_var3
-bli_cher2_unb_var4
-bli_cher2_unf_var1
-bli_cher2_unf_var4
 bli_cher_ex
 bli_cherk
-bli_cherk1m
-bli_cherk3m1
-bli_cherk3mh
-bli_cherk4m1
-bli_cherk4mh
 bli_cherk_ex
-bli_cherk_l_ker_var2
-bli_cherk_u_ker_var2
-bli_cher_unb_var1
-bli_cher_unb_var2
 bli_cinvertd
 bli_cinvertd_ex
 bli_cinvertsc
 bli_cinvertv
 bli_cinvertv_ex
+bli_cinvscald
+bli_cinvscald_ex
+bli_cinvscalm
+bli_cinvscalm_ex
+bli_cinvscalv
+bli_cinvscalv_ex
 bli_clock
-bli_clock_helper
 bli_clock_min_diff
 bli_cmachval
 bli_cmkherm
 bli_cmkherm_ex
-bli_cmkherm_unb_var1
 bli_cmksymm
 bli_cmksymm_ex
-bli_cmksymm_unb_var1
 bli_cmktrim
 bli_cmktrim_ex
-bli_cmktrim_unb_var1
 bli_cmulsc
 bli_cnorm1m
 bli_cnorm1m_ex
-bli_cnorm1m_unb_var1
 bli_cnorm1v
 bli_cnorm1v_ex
-bli_cnorm1v_unb_var1
 bli_cnormfm
 bli_cnormfm_ex
-bli_cnormfm_unb_var1
 bli_cnormfsc
 bli_cnormfv
 bli_cnormfv_ex
-bli_cnormfv_unb_var1
 bli_cnormim
 bli_cnormim_ex
-bli_cnormim_unb_var1
 bli_cnormiv
 bli_cnormiv_ex
-bli_cnormiv_unb_var1
-bli_cntl_calc_num_threads_in
 bli_cntl_clear_node
 bli_cntl_copy
 bli_cntl_create_node
 bli_cntl_free
 bli_cntl_free_node
-bli_cntl_free_wo_thrinfo
-bli_cntl_free_w_thrinfo
 bli_cntl_mark_family
-bli_cntx_1m_stage
-bli_cntx_3m1_stage
-bli_cntx_3mh_stage
-bli_cntx_4m1_stage
-bli_cntx_4mb_stage
-bli_cntx_4mh_stage
 bli_cntx_clear
-bli_cntx_ind_stage
-bli_cntx_nat_stage
 bli_cntx_print
 bli_cntx_set_blkszs
 bli_cntx_set_ind_blkszs
-bli_cntx_set_l1f_kers
-bli_cntx_set_l1v_kers
-bli_cntx_set_l3_nat_ukrs
-bli_cntx_set_packm_kers
+bli_cntx_set_l3_sup_handlers
+bli_cntx_set_ukr_prefs
+bli_cntx_set_ukrs
 bli_copyd
-bli_copyd_check
 bli_copyd_ex
-bli_copyd_ex_qfp
 bli_copym
-bli_copym_check
 bli_copym_ex
-bli_copym_ex_qfp
 bli_copysc
-bli_copysc_check
 bli_copyv
-bli_copyv_check
 bli_copyv_ex
-bli_copyv_ex_qfp
-bli_cpackm_blk_var1
-bli_cpackm_cxk
-bli_cpackm_cxk_1er
-bli_cpackm_cxk_3mis
-bli_cpackm_cxk_4mi
-bli_cpackm_cxk_rih
-bli_cpackm_herm_cxk
-bli_cpackm_herm_cxk_1er
-bli_cpackm_herm_cxk_3mis
-bli_cpackm_herm_cxk_4mi
-bli_cpackm_herm_cxk_rih
-bli_cpackm_struc_cxk
-bli_cpackm_struc_cxk_1er
-bli_cpackm_struc_cxk_3mis
-bli_cpackm_struc_cxk_4mi
-bli_cpackm_struc_cxk_rih
-bli_cpackm_tri_cxk
-bli_cpackm_tri_cxk_1er
-bli_cpackm_tri_cxk_3mis
-bli_cpackm_tri_cxk_4mi
-bli_cpackm_tri_cxk_rih
-bli_cpackm_unb_var1
 bli_cprintm
-bli_cprintm_ex
 bli_cprintv
-bli_cprintv_ex
-bli_cpuid_is_bulldozer
-bli_cpuid_is_excavator
-bli_cpuid_is_haswell
-bli_cpuid_is_knl
-bli_cpuid_is_penryn
-bli_cpuid_is_piledriver
-bli_cpuid_is_sandybridge
-bli_cpuid_is_skx
-bli_cpuid_is_steamroller
-bli_cpuid_is_zen
-bli_cpuid_query
-bli_cpuid_query_id
 bli_crandm
 bli_crandm_ex
-bli_crandm_unb_var1
 bli_crandnm
 bli_crandnm_ex
-bli_crandnm_unb_var1
 bli_crandnv
 bli_crandnv_ex
-bli_crandnv_unb_var1
 bli_crandv
 bli_crandv_ex
-bli_crandv_unb_var1
 bli_cscal2d
 bli_cscal2d_ex
 bli_cscal2m
 bli_cscal2m_ex
-bli_cscal2m_unb_var1
 bli_cscal2v
 bli_cscal2v_ex
 bli_cscald
 bli_cscald_ex
 bli_cscalm
 bli_cscalm_ex
-bli_cscalm_unb_var1
 bli_cscalv
 bli_cscalv_ex
 bli_cscastm
@@ -458,42 +219,29 @@ bli_csetd_ex
 bli_csetid
 bli_csetid_ex
 bli_csetijm
+bli_csetijv
 bli_csetm
 bli_csetm_ex
-bli_csetm_unb_var1
 bli_csetsc
 bli_csetv
 bli_csetv_ex
-bli_csgemm_ker_var2_md
 bli_cshiftd
 bli_cshiftd_ex
-bli_cspackm_blk_var1_md
-bli_cspackm_cxk_1e_md
-bli_cspackm_cxk_1r_md
-bli_cspackm_struc_cxk_md
 bli_csqrtsc
 bli_csubd
 bli_csubd_ex
 bli_csubm
 bli_csubm_ex
-bli_csubm_unb_var1
 bli_csubsc
 bli_csubv
 bli_csubv_ex
 bli_csumsqv
 bli_csumsqv_ex
-bli_csumsqv_unb_var1
 bli_cswapv
 bli_cswapv_ex
 bli_csxpbym_md
 bli_csxpbym_md_ex
-bli_csxpbym_md_unb_var1
 bli_csymm
-bli_csymm1m
-bli_csymm3m1
-bli_csymm3mh
-bli_csymm4m1
-bli_csymm4mh
 bli_csymm_ex
 bli_csymv
 bli_csymv_ex
@@ -501,89 +249,39 @@ bli_csyr
 bli_csyr2
 bli_csyr2_ex
 bli_csyr2k
-bli_csyr2k1m
-bli_csyr2k3m1
-bli_csyr2k3mh
-bli_csyr2k4m1
-bli_csyr2k4mh
 bli_csyr2k_ex
 bli_csyr_ex
 bli_csyrk
-bli_csyrk1m
-bli_csyrk3m1
-bli_csyrk3mh
-bli_csyrk4m1
-bli_csyrk4mh
 bli_csyrk_ex
 bli_ctrmm
-bli_ctrmm1m
 bli_ctrmm3
-bli_ctrmm31m
-bli_ctrmm33m1
-bli_ctrmm33mh
-bli_ctrmm34m1
-bli_ctrmm34mh
 bli_ctrmm3_ex
-bli_ctrmm3m1
-bli_ctrmm4m1
 bli_ctrmm_ex
-bli_ctrmm_ll_ker_var2
-bli_ctrmm_lu_ker_var2
-bli_ctrmm_rl_ker_var2
-bli_ctrmm_ru_ker_var2
 bli_ctrmv
 bli_ctrmv_ex
-bli_ctrmv_unb_var1
-bli_ctrmv_unb_var2
-bli_ctrmv_unf_var1
-bli_ctrmv_unf_var2
 bli_ctrsm
-bli_ctrsm1m
-bli_ctrsm3m1
-bli_ctrsm4m1
 bli_ctrsm_ex
-bli_ctrsm_ll_ker_var2
-bli_ctrsm_l_ukernel
-bli_ctrsm_lu_ker_var2
-bli_ctrsm_rl_ker_var2
-bli_ctrsm_ru_ker_var2
-bli_ctrsm_u_ukernel
 bli_ctrsv
 bli_ctrsv_ex
-bli_ctrsv_unb_var1
-bli_ctrsv_unb_var2
-bli_ctrsv_unf_var1
-bli_ctrsv_unf_var2
-bli_cunpackm_blk_var1
-bli_cunpackm_cxk
-bli_cunpackm_unb_var1
 bli_cunzipsc
 bli_cxpbyd
 bli_cxpbyd_ex
 bli_cxpbym
 bli_cxpbym_ex
-bli_cxpbym_unb_var1
 bli_cxpbyv
 bli_cxpbyv_ex
 bli_czcastm
 bli_czcastnzm
 bli_czcastv
 bli_czcopysc
-bli_czgemm_ker_var2_md
 bli_czipsc
-bli_czpackm_blk_var1_md
-bli_czpackm_cxk_1e_md
-bli_czpackm_cxk_1r_md
-bli_czpackm_struc_cxk_md
 bli_czxpbym_md
 bli_czxpbym_md_ex
-bli_czxpbym_md_unb_var1
 bli_dabsqsc
 bli_daddd
 bli_daddd_ex
 bli_daddm
 bli_daddm_ex
-bli_daddm_unb_var1
 bli_daddsc
 bli_daddv
 bli_daddv_ex
@@ -591,7 +289,6 @@ bli_damaxv
 bli_damaxv_ex
 bli_dasumv
 bli_dasumv_ex
-bli_dasumv_unb_var1
 bli_daxpbyv
 bli_daxpbyv_ex
 bli_daxpy2v
@@ -602,33 +299,24 @@ bli_daxpyf
 bli_daxpyf_ex
 bli_daxpym
 bli_daxpym_ex
-bli_daxpym_unb_var1
 bli_daxpyv
 bli_daxpyv_ex
 bli_dccastm
 bli_dccastnzm
 bli_dccastv
 bli_dccopysc
-bli_dcgemm_ker_var2_md
 bli_dcopyd
 bli_dcopyd_ex
 bli_dcopym
 bli_dcopym_ex
-bli_dcopym_unb_var1
 bli_dcopyv
 bli_dcopyv_ex
-bli_dcpackm_blk_var1_md
-bli_dcpackm_cxk_1e_md
-bli_dcpackm_cxk_1r_md
-bli_dcpackm_struc_cxk_md
 bli_dcxpbym_md
 bli_dcxpbym_md_ex
-bli_dcxpbym_md_unb_var1
 bli_ddcastm
 bli_ddcastnzm
 bli_ddcastv
 bli_ddcopysc
-bli_ddgemm_ker_var2_md
 bli_ddivsc
 bli_ddotaxpyv
 bli_ddotaxpyv_ex
@@ -640,183 +328,99 @@ bli_ddotxf
 bli_ddotxf_ex
 bli_ddotxv
 bli_ddotxv_ex
-bli_ddpackm_blk_var1_md
-bli_ddpackm_cxk_1e_md
-bli_ddpackm_cxk_1r_md
-bli_ddpackm_struc_cxk_md
 bli_ddxpbym_md
 bli_ddxpbym_md_ex
-bli_ddxpbym_md_unb_var1
-bli_determine_blocksize
-bli_determine_blocksize_b
-bli_determine_blocksize_b_sub
-bli_determine_blocksize_f
-bli_determine_blocksize_f_sub
+bli_deqm
+bli_deqsc
+bli_deqv
 bli_dfprintm
 bli_dfprintv
 bli_dgemm
-bli_dgemm1m
-bli_dgemm3m1
-bli_dgemm3mh
-bli_dgemm4m1
-bli_dgemm4mb
-bli_dgemm4mb_ker_var2
-bli_dgemm4mh
 bli_dgemm_ex
-bli_dgemm_ker_var2
-bli_dgemmtrsm_l_ukernel
-bli_dgemmtrsm_u_ukernel
-bli_dgemm_ukernel
+bli_dgemmt
+bli_dgemmt_ex
 bli_dgemv
 bli_dgemv_ex
-bli_dgemv_unb_var1
-bli_dgemv_unb_var2
-bli_dgemv_unf_var1
-bli_dgemv_unf_var2
 bli_dger
 bli_dger_ex
-bli_dger_unb_var1
-bli_dger_unb_var2
 bli_dgetijm
+bli_dgetijv
 bli_dgetsc
 bli_dhemm
-bli_dhemm1m
-bli_dhemm3m1
-bli_dhemm3mh
-bli_dhemm4m1
-bli_dhemm4mh
 bli_dhemm_ex
 bli_dhemv
 bli_dhemv_ex
-bli_dhemv_unb_var1
-bli_dhemv_unb_var2
-bli_dhemv_unb_var3
-bli_dhemv_unb_var4
-bli_dhemv_unf_var1
-bli_dhemv_unf_var1a
-bli_dhemv_unf_var3
-bli_dhemv_unf_var3a
 bli_dher
 bli_dher2
 bli_dher2_ex
 bli_dher2k
-bli_dher2k1m
-bli_dher2k3m1
-bli_dher2k3mh
-bli_dher2k4m1
-bli_dher2k4mh
 bli_dher2k_ex
-bli_dher2_unb_var1
-bli_dher2_unb_var2
-bli_dher2_unb_var3
-bli_dher2_unb_var4
-bli_dher2_unf_var1
-bli_dher2_unf_var4
 bli_dher_ex
 bli_dherk
-bli_dherk1m
-bli_dherk3m1
-bli_dherk3mh
-bli_dherk4m1
-bli_dherk4mh
 bli_dherk_ex
-bli_dherk_l_ker_var2
-bli_dherk_u_ker_var2
-bli_dher_unb_var1
-bli_dher_unb_var2
 bli_dinvertd
 bli_dinvertd_ex
 bli_dinvertsc
 bli_dinvertv
 bli_dinvertv_ex
+bli_dinvscald
+bli_dinvscald_ex
+bli_dinvscalm
+bli_dinvscalm_ex
+bli_dinvscalv
+bli_dinvscalv_ex
 bli_divsc
-bli_divsc_check
-bli_divsc_qfp
-bli_dlamch
 bli_dmachval
 bli_dmkherm
 bli_dmkherm_ex
-bli_dmkherm_unb_var1
 bli_dmksymm
 bli_dmksymm_ex
-bli_dmksymm_unb_var1
 bli_dmktrim
 bli_dmktrim_ex
-bli_dmktrim_unb_var1
 bli_dmulsc
 bli_dnorm1m
 bli_dnorm1m_ex
-bli_dnorm1m_unb_var1
 bli_dnorm1v
 bli_dnorm1v_ex
-bli_dnorm1v_unb_var1
 bli_dnormfm
 bli_dnormfm_ex
-bli_dnormfm_unb_var1
 bli_dnormfsc
 bli_dnormfv
 bli_dnormfv_ex
-bli_dnormfv_unb_var1
 bli_dnormim
 bli_dnormim_ex
-bli_dnormim_unb_var1
 bli_dnormiv
 bli_dnormiv_ex
-bli_dnormiv_unb_var1
 bli_dotaxpyv
-bli_dotaxpyv_check
 bli_dotaxpyv_ex
-bli_dotaxpyv_ex_qfp
 bli_dotv
-bli_dotv_check
 bli_dotv_ex
-bli_dotv_ex_qfp
 bli_dotxaxpyf
-bli_dotxaxpyf_check
 bli_dotxaxpyf_ex
-bli_dotxaxpyf_ex_qfp
 bli_dotxf
-bli_dotxf_check
 bli_dotxf_ex
-bli_dotxf_ex_qfp
 bli_dotxv
-bli_dotxv_check
 bli_dotxv_ex
-bli_dotxv_ex_qfp
-bli_dpackm_blk_var1
-bli_dpackm_cxk
-bli_dpackm_herm_cxk
-bli_dpackm_struc_cxk
-bli_dpackm_tri_cxk
-bli_dpackm_unb_var1
 bli_dprintm
-bli_dprintm_ex
 bli_dprintv
-bli_dprintv_ex
 bli_drandm
 bli_drandm_ex
-bli_drandm_unb_var1
 bli_drandnm
 bli_drandnm_ex
-bli_drandnm_unb_var1
 bli_drandnv
 bli_drandnv_ex
-bli_drandnv_unb_var1
 bli_drandv
 bli_drandv_ex
-bli_drandv_unb_var1
 bli_dscal2d
 bli_dscal2d_ex
 bli_dscal2m
 bli_dscal2m_ex
-bli_dscal2m_unb_var1
 bli_dscal2v
 bli_dscal2v_ex
 bli_dscald
 bli_dscald_ex
 bli_dscalm
 bli_dscalm_ex
-bli_dscalm_unb_var1
 bli_dscalv
 bli_dscalv_ex
 bli_dscastm
@@ -828,42 +432,29 @@ bli_dsetd_ex
 bli_dsetid
 bli_dsetid_ex
 bli_dsetijm
+bli_dsetijv
 bli_dsetm
 bli_dsetm_ex
-bli_dsetm_unb_var1
 bli_dsetsc
 bli_dsetv
 bli_dsetv_ex
-bli_dsgemm_ker_var2_md
 bli_dshiftd
 bli_dshiftd_ex
-bli_dspackm_blk_var1_md
-bli_dspackm_cxk_1e_md
-bli_dspackm_cxk_1r_md
-bli_dspackm_struc_cxk_md
 bli_dsqrtsc
 bli_dsubd
 bli_dsubd_ex
 bli_dsubm
 bli_dsubm_ex
-bli_dsubm_unb_var1
 bli_dsubsc
 bli_dsubv
 bli_dsubv_ex
 bli_dsumsqv
 bli_dsumsqv_ex
-bli_dsumsqv_unb_var1
 bli_dswapv
 bli_dswapv_ex
 bli_dsxpbym_md
 bli_dsxpbym_md_ex
-bli_dsxpbym_md_unb_var1
 bli_dsymm
-bli_dsymm1m
-bli_dsymm3m1
-bli_dsymm3mh
-bli_dsymm4m1
-bli_dsymm4mh
 bli_dsymm_ex
 bli_dsymv
 bli_dsymv_ex
@@ -871,301 +462,79 @@ bli_dsyr
 bli_dsyr2
 bli_dsyr2_ex
 bli_dsyr2k
-bli_dsyr2k1m
-bli_dsyr2k3m1
-bli_dsyr2k3mh
-bli_dsyr2k4m1
-bli_dsyr2k4mh
 bli_dsyr2k_ex
 bli_dsyr_ex
 bli_dsyrk
-bli_dsyrk1m
-bli_dsyrk3m1
-bli_dsyrk3mh
-bli_dsyrk4m1
-bli_dsyrk4mh
 bli_dsyrk_ex
 bli_dtrmm
-bli_dtrmm1m
 bli_dtrmm3
-bli_dtrmm31m
-bli_dtrmm33m1
-bli_dtrmm33mh
-bli_dtrmm34m1
-bli_dtrmm34mh
 bli_dtrmm3_ex
-bli_dtrmm3m1
-bli_dtrmm4m1
 bli_dtrmm_ex
-bli_dtrmm_ll_ker_var2
-bli_dtrmm_lu_ker_var2
-bli_dtrmm_rl_ker_var2
-bli_dtrmm_ru_ker_var2
 bli_dtrmv
 bli_dtrmv_ex
-bli_dtrmv_unb_var1
-bli_dtrmv_unb_var2
-bli_dtrmv_unf_var1
-bli_dtrmv_unf_var2
 bli_dtrsm
-bli_dtrsm1m
-bli_dtrsm3m1
-bli_dtrsm4m1
 bli_dtrsm_ex
-bli_dtrsm_ll_ker_var2
-bli_dtrsm_l_ukernel
-bli_dtrsm_lu_ker_var2
-bli_dtrsm_rl_ker_var2
-bli_dtrsm_ru_ker_var2
-bli_dtrsm_u_ukernel
 bli_dtrsv
 bli_dtrsv_ex
-bli_dtrsv_unb_var1
-bli_dtrsv_unb_var2
-bli_dtrsv_unf_var1
-bli_dtrsv_unf_var2
 bli_dt_size
-bli_dt_size_check
 bli_dt_string
-bli_dt_string_check
-bli_dt_union_check
-bli_dunpackm_blk_var1
-bli_dunpackm_cxk
-bli_dunpackm_unb_var1
 bli_dunzipsc
 bli_dxpbyd
 bli_dxpbyd_ex
 bli_dxpbym
 bli_dxpbym_ex
-bli_dxpbym_unb_var1
 bli_dxpbyv
 bli_dxpbyv_ex
 bli_dzcastm
 bli_dzcastnzm
 bli_dzcastv
 bli_dzcopysc
-bli_dzgemm_ker_var2_md
 bli_dzipsc
-bli_dzpackm_blk_var1_md
-bli_dzpackm_cxk_1e_md
-bli_dzpackm_cxk_1r_md
-bli_dzpackm_struc_cxk_md
 bli_dzxpbym_md
 bli_dzxpbym_md_ex
-bli_dzxpbym_md_unb_var1
+bli_eqm
+bli_eqsc
+bli_eqv
 bli_error_checking_is_enabled
 bli_error_checking_level
 bli_error_checking_level_set
-bli_error_string_for_code
-bli_ffree_align
-bli_ffree_noalign
 bli_finalize
-bli_finalize_apis
-bli_finalize_auto
-bli_finalize_once
-bli_find_area_trap_l
-bli_fmalloc_align
-bli_fmalloc_align_check
-bli_fmalloc_noalign
-bli_fmalloc_post_check
 bli_fprintm
-bli_fprintm_check
-bli_fprintm_ex
-bli_fprintm_qfp
 bli_fprintv
-bli_fprintv_check
-bli_fprintv_ex
-bli_fprintv_qfp
-bli_free_intl
 bli_free_user
-bli_func_create
-bli_func_free
-bli_func_init
-bli_func_init_null
-bli_func_is_null
-bli_func_is_null_dt
-bli_gcd
 bli_gemm
-bli_gemm1m
-bli_gemm3m1
-bli_gemm3mh
-bli_gemm4m1
-bli_gemm4mb
-bli_gemm4mb_ker_var2
-bli_gemm4mh
-bli_gemm_basic_check
-bli_gemm_blk_var1
-bli_gemm_blk_var2
-bli_gemm_blk_var3
-bli_gemmbp_cntl_create
-bli_gemm_check
-bli_gemm_cntl_create
-bli_gemm_cntl_create_node
-bli_gemm_cntl_free
-bli_gemm_determine_kc
-bli_gemm_determine_kc_b
-bli_gemm_determine_kc_f
-bli_gemm_direct
 bli_gemm_ex
-bli_gemm_front
-bli_gemmind
-bli_gemmind_get_avail
-bli_gemm_int
-bli_gemm_ker_var2
-bli_gemm_ker_var2_md
-bli_gemm_md
-bli_gemm_md_ccc
-bli_gemm_md_ccr
-bli_gemm_md_crc
-bli_gemm_md_crr
-bli_gemm_md_rcc
-bli_gemm_md_rcr
-bli_gemm_md_rrc
-bli_gemm_md_rrr
-bli_gemmnat
-bli_gemm_packa
-bli_gemm_packb
-bli_gemm_prune_unref_mparts_k
-bli_gemm_prune_unref_mparts_m
-bli_gemm_prune_unref_mparts_n
-bli_gemmtrsm_l_ukernel_qfp
+bli_gemmt
+bli_gemmt_ex
 bli_gemmtrsm_ukernel
-bli_gemmtrsm_u_ukernel_qfp
 bli_gemm_ukernel
-bli_gemm_ukernel_qfp
 bli_gemv
-bli_gemv_check
 bli_gemv_ex
-bli_gemv_ex_qfp
-bli_gemv_unb_var1
-bli_gemv_unb_var1_qfp
-bli_gemv_unb_var2
-bli_gemv_unb_var2_qfp
-bli_gemv_unf_var1
-bli_gemv_unf_var1_qfp
-bli_gemv_unf_var2
-bli_gemv_unf_var2_qfp
 bli_ger
-bli_ger_check
 bli_ger_ex
-bli_ger_ex_qfp
-bli_ger_unb_var1
-bli_ger_unb_var1_qfp
-bli_ger_unb_var2
-bli_ger_unb_var2_qfp
 bli_getijm
+bli_getijv
 bli_getopt
 bli_getopt_init_state
 bli_getsc
-bli_getsc_check
-bli_getsc_qfp
-bli_gks_cntx_l3_nat_ukr_is_ref
-bli_gks_finalize
-bli_gks_init
-bli_gks_init_index
 bli_gks_init_ref_cntx
 bli_gks_l3_ukr_impl_string
 bli_gks_l3_ukr_impl_type
-bli_gks_lookup_ind_cntx
-bli_gks_lookup_nat_cntx
 bli_gks_query_cntx
-bli_gks_query_cntx_noinit
 bli_gks_query_ind_cntx
 bli_gks_query_nat_cntx
-bli_gks_register_cntx
 bli_hemm
-bli_hemm1m
-bli_hemm3m1
-bli_hemm3mh
-bli_hemm4m1
-bli_hemm4mh
-bli_hemm_basic_check
-bli_hemm_check
 bli_hemm_ex
-bli_hemm_front
-bli_hemmind
-bli_hemmind_get_avail
-bli_hemmnat
 bli_hemv
-bli_hemv_check
 bli_hemv_ex
-bli_hemv_ex_qfp
-bli_hemv_unb_var1
-bli_hemv_unb_var1_qfp
-bli_hemv_unb_var2
-bli_hemv_unb_var2_qfp
-bli_hemv_unb_var3
-bli_hemv_unb_var3_qfp
-bli_hemv_unb_var4
-bli_hemv_unb_var4_qfp
-bli_hemv_unf_var1
-bli_hemv_unf_var1a
-bli_hemv_unf_var1a_qfp
-bli_hemv_unf_var1_qfp
-bli_hemv_unf_var3
-bli_hemv_unf_var3a
-bli_hemv_unf_var3a_qfp
-bli_hemv_unf_var3_qfp
 bli_her
 bli_her2
-bli_her2_check
 bli_her2_ex
-bli_her2_ex_qfp
 bli_her2k
-bli_her2k1m
-bli_her2k3m1
-bli_her2k3mh
-bli_her2k4m1
-bli_her2k4mh
-bli_her2k_basic_check
-bli_her2k_check
 bli_her2k_ex
-bli_her2k_front
-bli_her2kind
-bli_her2kind_get_avail
-bli_her2knat
-bli_her2_unb_var1
-bli_her2_unb_var1_qfp
-bli_her2_unb_var2
-bli_her2_unb_var2_qfp
-bli_her2_unb_var3
-bli_her2_unb_var3_qfp
-bli_her2_unb_var4
-bli_her2_unb_var4_qfp
-bli_her2_unf_var1
-bli_her2_unf_var1_qfp
-bli_her2_unf_var4
-bli_her2_unf_var4_qfp
-bli_her_check
 bli_her_ex
-bli_her_ex_qfp
 bli_herk
-bli_herk1m
-bli_herk3m1
-bli_herk3mh
-bli_herk4m1
-bli_herk4mh
-bli_herk_basic_check
-bli_herk_check
-bli_herk_determine_kc
-bli_herk_determine_kc_b
-bli_herk_determine_kc_f
-bli_herk_direct
 bli_herk_ex
-bli_herk_front
-bli_herkind
-bli_herkind_get_avail
-bli_herk_l_ker_var2
-bli_herknat
-bli_herk_prune_unref_mparts_k
-bli_herk_prune_unref_mparts_m
-bli_herk_prune_unref_mparts_n
-bli_herk_u_ker_var2
-bli_herk_x_ker_var2
-bli_her_unb_var1
-bli_her_unb_var1_qfp
-bli_her_unb_var2
-bli_her_unb_var2_qfp
 bli_ifprintm
 bli_ifprintv
 bli_igetsc
@@ -1175,13 +544,8 @@ bli_ind_disable_all_dt
 bli_ind_disable_dt
 bli_ind_enable
 bli_ind_enable_dt
-bli_ind_finalize
-bli_ind_get_impl_string
-bli_ind_init
-bli_ind_map_cdt_to_index
 bli_ind_oper_enable_only
 bli_ind_oper_find_avail
-bli_ind_oper_get_avail
 bli_ind_oper_get_avail_impl_string
 bli_ind_oper_is_impl
 bli_info_get_blas_int_type_size
@@ -1189,13 +553,15 @@ bli_info_get_enable_blas
 bli_info_get_enable_cblas
 bli_info_get_enable_memkind
 bli_info_get_enable_openmp
+bli_info_get_enable_openmp_as_default
 bli_info_get_enable_pba_pools
 bli_info_get_enable_pthreads
+bli_info_get_enable_pthreads_as_default
 bli_info_get_enable_sandbox
 bli_info_get_enable_sba_pools
-bli_info_get_enable_stay_auto_init
 bli_info_get_enable_threading
 bli_info_get_gemm_impl_string
+bli_info_get_gemmt_impl_string
 bli_info_get_gemmtrsm_l_ukr_impl_string
 bli_info_get_gemmtrsm_u_ukr_impl_string
 bli_info_get_gemm_ukr_impl_string
@@ -1209,7 +575,14 @@ bli_info_get_int_type_size_str
 bli_info_get_max_type_size
 bli_info_get_num_fp_types
 bli_info_get_page_size
-bli_info_get_pool_addr_align_size
+bli_info_get_pool_addr_align_size_a
+bli_info_get_pool_addr_align_size_b
+bli_info_get_pool_addr_align_size_c
+bli_info_get_pool_addr_align_size_gen
+bli_info_get_pool_addr_offset_size_a
+bli_info_get_pool_addr_offset_size_b
+bli_info_get_pool_addr_offset_size_c
+bli_info_get_pool_addr_offset_size_gen
 bli_info_get_simd_align_size
 bli_info_get_simd_num_registers
 bli_info_get_simd_size
@@ -1227,152 +600,57 @@ bli_info_get_trsm_l_ukr_impl_string
 bli_info_get_trsm_u_ukr_impl_string
 bli_info_get_version_str
 bli_init
-bli_init_apis
-bli_init_auto
-bli_init_once
 bli_invertd
-bli_invertd_check
 bli_invertd_ex
-bli_invertd_ex_qfp
 bli_invertsc
-bli_invertsc_check
-bli_invertsc_qfp
 bli_invertv
-bli_invertv_check
 bli_invertv_ex
-bli_invertv_ex_qfp
-bli_ipow
+bli_invscald
+bli_invscald_ex
+bli_invscalm
+bli_invscalm_ex
+bli_invscalv
+bli_invscalv_ex
 bli_iprintm
-bli_iprintm_ex
 bli_iprintv
-bli_iprintv_ex
 bli_isetsc
-bli_l0_xsc_check
-bli_l0_xx2sc_check
-bli_l0_xxsc_check
-bli_l1d_ax_check
-bli_l1d_axy_check
-bli_l1d_x_check
-bli_l1d_xy_check
-bli_l1m_ax_check
-bli_l1m_axy_check
-bli_l1m_xy_check
-bli_l1v_axby_check
-bli_l1v_ax_check
-bli_l1v_axy_check
-bli_l1v_dot_check
-bli_l1v_xby_check
-bli_l1v_x_check
-bli_l1v_xi_check
-bli_l1v_xy_check
-bli_l3_basic_check
-bli_l3_cntl_create_if
 bli_l3_cntl_free
-bli_l3_determine_kc
-bli_l3_direct
-bli_l3_ind_oper_enable_only
-bli_l3_ind_oper_find_avail
-bli_l3_ind_oper_get_enable
-bli_l3_ind_oper_get_func
-bli_l3_ind_oper_set_enable
-bli_l3_ind_oper_set_enable_all
-bli_l3_ind_set_enable_dt
-bli_l3_packm
-bli_l3_prune_unref_mparts_k
-bli_l3_prune_unref_mparts_m
-bli_l3_prune_unref_mparts_n
-bli_l3_thread_decorator
-bli_l3_thread_entry
-bli_l3_thrinfo_create_root
-bli_l3_thrinfo_free
-bli_l3_thrinfo_free_paths
-bli_l3_thrinfo_init_single
-bli_l3_thrinfo_print_gemm_paths
-bli_l3_thrinfo_print_trsm_paths
-bli_lcm
-bli_lsame
+bli_l3_thrinfo_create
 bli_machval
-bli_malloc_intl
 bli_malloc_user
-bli_mbool_create
-bli_mbool_free
-bli_mbool_init
-bli_pba_acquire_m
-bli_pba_compute_pool_block_sizes
-bli_pba_compute_pool_block_sizes_dt
-bli_pba_finalize
-bli_pba_finalize_pools
-bli_pba_init
-bli_pba_init_pools
-bli_pba_pool_size
-bli_pba_query
-bli_pba_release
-bli_memsys_finalize
-bli_memsys_init
 bli_mkherm
-bli_mkherm_check
 bli_mkherm_ex
-bli_mkherm_ex_qfp
 bli_mksymm
-bli_mksymm_check
 bli_mksymm_ex
-bli_mksymm_ex_qfp
 bli_mktrim
-bli_mktrim_check
 bli_mktrim_ex
-bli_mktrim_ex_qfp
 bli_mulsc
-bli_mulsc_check
-bli_mulsc_qfp
-bli_next_prime_factor
 bli_norm1m
-bli_norm1m_check
 bli_norm1m_ex
-bli_norm1m_ex_qfp
 bli_norm1v
-bli_norm1v_check
 bli_norm1v_ex
-bli_norm1v_ex_qfp
 bli_normfm
-bli_normfm_check
 bli_normfm_ex
-bli_normfm_ex_qfp
 bli_normfsc
-bli_normfsc_check
-bli_normfsc_qfp
 bli_normfv
-bli_normfv_check
 bli_normfv_ex
-bli_normfv_ex_qfp
 bli_normim
-bli_normim_check
 bli_normim_ex
-bli_normim_ex_qfp
 bli_normiv
-bli_normiv_check
 bli_normiv_ex
-bli_normiv_ex_qfp
 bli_obj_alloc_buffer
-bli_obj_alloc_buffer_check
 bli_obj_attach_buffer
-bli_obj_attach_buffer_check
 bli_obj_create
 bli_obj_create_1x1
 bli_obj_create_1x1_with_attached_buffer
-bli_obj_create_check
 bli_obj_create_conf_to
-bli_obj_create_const_check
-bli_obj_create_scalar_check
 bli_obj_create_with_attached_buffer
 bli_obj_create_without_buffer
-bli_obj_create_without_buffer_check
 bli_obj_equals
 bli_obj_free
-bli_obj_free_check
 bli_obj_imag_equals
 bli_obj_imag_is_zero
 bli_obj_print
-bli_obj_print_check
 bli_obj_scalar_apply_scalar
 bli_obj_scalar_attach
 bli_obj_scalar_cast_to
@@ -1382,21 +660,16 @@ bli_obj_scalar_has_nonzero_imag
 bli_obj_scalar_init_detached
 bli_obj_scalar_init_detached_copy_of
 bli_obj_scalar_reset
-bli_packm_acquire_mpart_l2r
-bli_packm_acquire_mpart_t2b
-bli_packm_acquire_mpart_tl2br
+bli_pack_get_pack_a
+bli_pack_get_pack_b
+bli_packm_alloc
+bli_packm_alloc_ex
 bli_packm_blk_var1
-bli_packm_blk_var1_md
 bli_packm_cntl_create_node
 bli_packm_init
-bli_packm_init_check
-bli_packm_init_pack
-bli_packm_int
-bli_packm_int_check
-bli_packm_offset_to_panel_for
-bli_packm_thrinfo_init
-bli_packm_thrinfo_init_single
-bli_packm_unb_var1
+bli_packm_scalar
+bli_pack_set_pack_a
+bli_pack_set_pack_b
 bli_param_map_blis_to_char_conj
 bli_param_map_blis_to_char_diag
 bli_param_map_blis_to_char_dt
@@ -1414,33 +687,11 @@ bli_param_map_char_to_blis_dt
 bli_param_map_char_to_blis_side
 bli_param_map_char_to_blis_trans
 bli_param_map_char_to_blis_uplo
-bli_param_map_netlib_to_blis_diag
-bli_param_map_netlib_to_blis_side
-bli_param_map_netlib_to_blis_trans
-bli_param_map_netlib_to_blis_uplo
-bli_partition_2x2
-bli_pblk_print
-bli_pool_alloc_block
-bli_pool_checkin_block
-bli_pool_checkout_block
-bli_pool_finalize
-bli_pool_free_block
-bli_pool_grow
-bli_pool_init
-bli_pool_print
-bli_pool_reinit
-bli_pool_shrink
-bli_prime_factorization
+bli_pba_query
 bli_printm
-bli_printm_ex
-bli_print_msg
 bli_printv
-bli_printv_ex
 bli_projm
-bli_projm_check
 bli_projv
-bli_projv_check
-bli_prune_unref_mparts
 bli_pthread_barrier_destroy
 bli_pthread_barrier_init
 bli_pthread_barrier_wait
@@ -1457,30 +708,22 @@ bli_pthread_mutex_trylock
 bli_pthread_mutex_unlock
 bli_pthread_once
 bli_randm
-bli_randm_check
 bli_randm_ex
-bli_randm_ex_qfp
 bli_randnm
-bli_randnm_check
 bli_randnm_ex
-bli_randnm_ex_qfp
 bli_randnv
-bli_randnv_check
 bli_randnv_ex
-bli_randnv_ex_qfp
 bli_randv
-bli_randv_check
 bli_randv_ex
-bli_randv_ex_qfp
-bli_rntm_print
+bli_rntm_init_from_global
+bli_rntm_set_num_threads
+bli_rntm_set_ways
 bli_rntm_set_ways_for_op
-bli_rntm_set_ways_from_rntm
 bli_sabsqsc
 bli_saddd
 bli_saddd_ex
 bli_saddm
 bli_saddm_ex
-bli_saddm_unb_var1
 bli_saddsc
 bli_saddv
 bli_saddv_ex
@@ -1488,7 +731,6 @@ bli_samaxv
 bli_samaxv_ex
 bli_sasumv
 bli_sasumv_ex
-bli_sasumv_unb_var1
 bli_saxpbyv
 bli_saxpbyv_ex
 bli_saxpy2v
@@ -1499,65 +741,36 @@ bli_saxpyf
 bli_saxpyf_ex
 bli_saxpym
 bli_saxpym_ex
-bli_saxpym_unb_var1
 bli_saxpyv
 bli_saxpyv_ex
-bli_sba_acquire
-bli_sba_checkin_array
-bli_sba_checkout_array
-bli_sba_finalize
-bli_sba_init
-bli_sba_query
-bli_sba_release
-bli_sba_rntm_set_pool
 bli_scal2d
-bli_scal2d_check
 bli_scal2d_ex
-bli_scal2d_ex_qfp
 bli_scal2m
-bli_scal2m_check
 bli_scal2m_ex
-bli_scal2m_ex_qfp
 bli_scal2v
-bli_scal2v_check
 bli_scal2v_ex
-bli_scal2v_ex_qfp
 bli_scald
-bli_scald_check
 bli_scald_ex
-bli_scald_ex_qfp
 bli_scalm
-bli_scalm_check
 bli_scalm_ex
-bli_scalm_ex_qfp
 bli_scalv
-bli_scalv_check
 bli_scalv_ex
-bli_scalv_ex_qfp
 bli_sccastm
 bli_sccastnzm
 bli_sccastv
 bli_sccopysc
-bli_scgemm_ker_var2_md
 bli_scopyd
 bli_scopyd_ex
 bli_scopym
 bli_scopym_ex
-bli_scopym_unb_var1
 bli_scopyv
 bli_scopyv_ex
-bli_scpackm_blk_var1_md
-bli_scpackm_cxk_1e_md
-bli_scpackm_cxk_1r_md
-bli_scpackm_struc_cxk_md
 bli_scxpbym_md
 bli_scxpbym_md_ex
-bli_scxpbym_md_unb_var1
 bli_sdcastm
 bli_sdcastnzm
 bli_sdcastv
 bli_sdcopysc
-bli_sdgemm_ker_var2_md
 bli_sdivsc
 bli_sdotaxpyv
 bli_sdotaxpyv_ex
@@ -1569,187 +782,107 @@ bli_sdotxf
 bli_sdotxf_ex
 bli_sdotxv
 bli_sdotxv_ex
-bli_sdpackm_blk_var1_md
-bli_sdpackm_cxk_1e_md
-bli_sdpackm_cxk_1r_md
-bli_sdpackm_struc_cxk_md
 bli_sdxpbym_md
 bli_sdxpbym_md_ex
-bli_sdxpbym_md_unb_var1
+bli_seqm
+bli_seqsc
+bli_seqv
 bli_setd
-bli_setd_check
 bli_setd_ex
-bli_setd_ex_qfp
 bli_setid
-bli_setid_check
 bli_setid_ex
-bli_setid_ex_qfp
 bli_setijm
+bli_setijv
 bli_setim
 bli_setiv
 bli_setm
-bli_setm_check
 bli_setm_ex
-bli_setm_ex_qfp
 bli_setrm
 bli_setrv
 bli_setsc
-bli_setsc_check
-bli_setsc_qfp
 bli_setv
-bli_setv_check
 bli_setv_ex
-bli_setv_ex_qfp
 bli_sfprintm
 bli_sfprintv
 bli_sgemm
-bli_sgemm1m
-bli_sgemm3m1
-bli_sgemm3mh
-bli_sgemm4m1
-bli_sgemm4mb
-bli_sgemm4mb_ker_var2
-bli_sgemm4mh
 bli_sgemm_ex
-bli_sgemm_ker_var2
-bli_sgemmtrsm_l_ukernel
-bli_sgemmtrsm_u_ukernel
-bli_sgemm_ukernel
+bli_sgemmt
+bli_sgemmt_ex
 bli_sgemv
 bli_sgemv_ex
-bli_sgemv_unb_var1
-bli_sgemv_unb_var2
-bli_sgemv_unf_var1
-bli_sgemv_unf_var2
 bli_sger
 bli_sger_ex
-bli_sger_unb_var1
-bli_sger_unb_var2
 bli_sgetijm
+bli_sgetijv
 bli_sgetsc
 bli_shemm
-bli_shemm1m
-bli_shemm3m1
-bli_shemm3mh
-bli_shemm4m1
-bli_shemm4mh
 bli_shemm_ex
 bli_shemv
 bli_shemv_ex
-bli_shemv_unb_var1
-bli_shemv_unb_var2
-bli_shemv_unb_var3
-bli_shemv_unb_var4
-bli_shemv_unf_var1
-bli_shemv_unf_var1a
-bli_shemv_unf_var3
-bli_shemv_unf_var3a
 bli_sher
 bli_sher2
 bli_sher2_ex
 bli_sher2k
-bli_sher2k1m
-bli_sher2k3m1
-bli_sher2k3mh
-bli_sher2k4m1
-bli_sher2k4mh
 bli_sher2k_ex
-bli_sher2_unb_var1
-bli_sher2_unb_var2
-bli_sher2_unb_var3
-bli_sher2_unb_var4
-bli_sher2_unf_var1
-bli_sher2_unf_var4
 bli_sher_ex
 bli_sherk
-bli_sherk1m
-bli_sherk3m1
-bli_sherk3mh
-bli_sherk4m1
-bli_sherk4mh
 bli_sherk_ex
-bli_sherk_l_ker_var2
-bli_sherk_u_ker_var2
-bli_sher_unb_var1
-bli_sher_unb_var2
 bli_shiftd
-bli_shiftd_check
 bli_shiftd_ex
-bli_shiftd_ex_qfp
 bli_sinvertd
 bli_sinvertd_ex
 bli_sinvertsc
 bli_sinvertv
 bli_sinvertv_ex
-bli_slamch
+bli_sinvscald
+bli_sinvscald_ex
+bli_sinvscalm
+bli_sinvscalm_ex
+bli_sinvscalv
+bli_sinvscalv_ex
 bli_sleep
 bli_smachval
 bli_smkherm
 bli_smkherm_ex
-bli_smkherm_unb_var1
 bli_smksymm
 bli_smksymm_ex
-bli_smksymm_unb_var1
 bli_smktrim
 bli_smktrim_ex
-bli_smktrim_unb_var1
 bli_smulsc
 bli_snorm1m
 bli_snorm1m_ex
-bli_snorm1m_unb_var1
 bli_snorm1v
 bli_snorm1v_ex
-bli_snorm1v_unb_var1
 bli_snormfm
 bli_snormfm_ex
-bli_snormfm_unb_var1
 bli_snormfsc
 bli_snormfv
 bli_snormfv_ex
-bli_snormfv_unb_var1
 bli_snormim
 bli_snormim_ex
-bli_snormim_unb_var1
 bli_snormiv
 bli_snormiv_ex
-bli_snormiv_unb_var1
-bli_spackm_blk_var1
-bli_spackm_cxk
-bli_spackm_herm_cxk
-bli_spackm_struc_cxk
-bli_spackm_tri_cxk
-bli_spackm_unb_var1
 bli_sprintm
-bli_sprintm_ex
 bli_sprintv
-bli_sprintv_ex
 bli_sqrtsc
-bli_sqrtsc_check
-bli_sqrtsc_qfp
 bli_srandm
 bli_srandm_ex
-bli_srandm_unb_var1
 bli_srandnm
 bli_srandnm_ex
-bli_srandnm_unb_var1
 bli_srandnv
 bli_srandnv_ex
-bli_srandnv_unb_var1
 bli_srandv
 bli_srandv_ex
-bli_srandv_unb_var1
 bli_sscal2d
 bli_sscal2d_ex
 bli_sscal2m
 bli_sscal2m_ex
-bli_sscal2m_unb_var1
 bli_sscal2v
 bli_sscal2v_ex
 bli_sscald
 bli_sscald_ex
 bli_sscalm
 bli_sscalm_ex
-bli_sscalm_unb_var1
 bli_sscalv
 bli_sscalv_ex
 bli_sscastm
@@ -1761,42 +894,29 @@ bli_ssetd_ex
 bli_ssetid
 bli_ssetid_ex
 bli_ssetijm
+bli_ssetijv
 bli_ssetm
 bli_ssetm_ex
-bli_ssetm_unb_var1
 bli_ssetsc
 bli_ssetv
 bli_ssetv_ex
-bli_ssgemm_ker_var2_md
 bli_sshiftd
 bli_sshiftd_ex
-bli_sspackm_blk_var1_md
-bli_sspackm_cxk_1e_md
-bli_sspackm_cxk_1r_md
-bli_sspackm_struc_cxk_md
 bli_ssqrtsc
 bli_ssubd
 bli_ssubd_ex
 bli_ssubm
 bli_ssubm_ex
-bli_ssubm_unb_var1
 bli_ssubsc
 bli_ssubv
 bli_ssubv_ex
 bli_ssumsqv
 bli_ssumsqv_ex
-bli_ssumsqv_unb_var1
 bli_sswapv
 bli_sswapv_ex
 bli_ssxpbym_md
 bli_ssxpbym_md_ex
-bli_ssxpbym_md_unb_var1
 bli_ssymm
-bli_ssymm1m
-bli_ssymm3m1
-bli_ssymm3mh
-bli_ssymm4m1
-bli_ssymm4mh
 bli_ssymm_ex
 bli_ssymv
 bli_ssymv_ex
@@ -1804,330 +924,99 @@ bli_ssyr
 bli_ssyr2
 bli_ssyr2_ex
 bli_ssyr2k
-bli_ssyr2k1m
-bli_ssyr2k3m1
-bli_ssyr2k3mh
-bli_ssyr2k4m1
-bli_ssyr2k4mh
 bli_ssyr2k_ex
 bli_ssyr_ex
 bli_ssyrk
-bli_ssyrk1m
-bli_ssyrk3m1
-bli_ssyrk3mh
-bli_ssyrk4m1
-bli_ssyrk4mh
 bli_ssyrk_ex
-bli_string_mkupper
 bli_strmm
-bli_strmm1m
 bli_strmm3
-bli_strmm31m
-bli_strmm33m1
-bli_strmm33mh
-bli_strmm34m1
-bli_strmm34mh
 bli_strmm3_ex
-bli_strmm3m1
-bli_strmm4m1
 bli_strmm_ex
-bli_strmm_ll_ker_var2
-bli_strmm_lu_ker_var2
-bli_strmm_rl_ker_var2
-bli_strmm_ru_ker_var2
 bli_strmv
 bli_strmv_ex
-bli_strmv_unb_var1
-bli_strmv_unb_var2
-bli_strmv_unf_var1
-bli_strmv_unf_var2
 bli_strsm
-bli_strsm1m
-bli_strsm3m1
-bli_strsm4m1
 bli_strsm_ex
-bli_strsm_ll_ker_var2
-bli_strsm_l_ukernel
-bli_strsm_lu_ker_var2
-bli_strsm_rl_ker_var2
-bli_strsm_ru_ker_var2
-bli_strsm_u_ukernel
 bli_strsv
 bli_strsv_ex
-bli_strsv_unb_var1
-bli_strsv_unb_var2
-bli_strsv_unf_var1
-bli_strsv_unf_var2
 bli_subd
-bli_subd_check
 bli_subd_ex
-bli_subd_ex_qfp
 bli_subm
-bli_subm_check
 bli_subm_ex
-bli_subm_ex_qfp
 bli_subsc
-bli_subsc_check
-bli_subsc_qfp
 bli_subv
-bli_subv_check
 bli_subv_ex
-bli_subv_ex_qfp
 bli_sumsqv
-bli_sumsqv_check
 bli_sumsqv_ex
-bli_sumsqv_ex_qfp
-bli_sunpackm_blk_var1
-bli_sunpackm_cxk
-bli_sunpackm_unb_var1
 bli_sunzipsc
 bli_swapv
-bli_swapv_check
 bli_swapv_ex
-bli_swapv_ex_qfp
 bli_sxpbyd
 bli_sxpbyd_ex
 bli_sxpbym
 bli_sxpbym_ex
-bli_sxpbym_unb_var1
 bli_sxpbyv
 bli_sxpbyv_ex
 bli_symm
-bli_symm1m
-bli_symm3m1
-bli_symm3mh
-bli_symm4m1
-bli_symm4mh
-bli_symm_check
 bli_symm_ex
-bli_symm_front
-bli_symmind
-bli_symmind_get_avail
-bli_symmnat
 bli_symv
-bli_symv_check
 bli_symv_ex
-bli_symv_ex_qfp
 bli_syr
 bli_syr2
-bli_syr2_check
 bli_syr2_ex
-bli_syr2_ex_qfp
 bli_syr2k
-bli_syr2k1m
-bli_syr2k3m1
-bli_syr2k3mh
-bli_syr2k4m1
-bli_syr2k4mh
-bli_syr2k_check
 bli_syr2k_ex
-bli_syr2k_front
-bli_syr2kind
-bli_syr2kind_get_avail
-bli_syr2knat
-bli_syr_check
 bli_syr_ex
-bli_syr_ex_qfp
 bli_syrk
-bli_syrk1m
-bli_syrk3m1
-bli_syrk3mh
-bli_syrk4m1
-bli_syrk4mh
-bli_syrk_check
 bli_syrk_ex
-bli_syrk_front
-bli_syrkind
-bli_syrkind_get_avail
-bli_syrknat
 bli_szcastm
 bli_szcastnzm
 bli_szcastv
 bli_szcopysc
-bli_szgemm_ker_var2_md
 bli_szipsc
-bli_szpackm_blk_var1_md
-bli_szpackm_cxk_1e_md
-bli_szpackm_cxk_1r_md
-bli_szpackm_struc_cxk_md
 bli_szxpbym_md
 bli_szxpbym_md_ex
-bli_szxpbym_md_unb_var1
 bli_thrcomm_barrier
-bli_thrcomm_barrier_atomic
 bli_thrcomm_bcast
-bli_thrcomm_cleanup
-bli_thrcomm_create
-bli_thrcomm_free
-bli_thrcomm_init
-bli_thread_finalize
-bli_thread_get_env
 bli_thread_get_ic_nt
 bli_thread_get_ir_nt
 bli_thread_get_jc_nt
 bli_thread_get_jr_nt
 bli_thread_get_num_threads
 bli_thread_get_pc_nt
-bli_thread_init
-bli_thread_init_rntm
-bli_thread_init_rntm_from_env
-bli_thread_range_b2t
-bli_thread_range_l2r
-bli_thread_range_mdim
-bli_thread_range_ndim
-bli_thread_range_r2l
+bli_thread_get_thread_impl
+bli_thread_get_thread_impl_str
 bli_thread_range_sub
-bli_thread_range_t2b
-bli_thread_range_weighted_b2t
-bli_thread_range_weighted_l2r
-bli_thread_range_weighted_r2l
-bli_thread_range_weighted_sub
-bli_thread_range_weighted_t2b
-bli_thread_range_width_l
 bli_thread_set_num_threads
 bli_thread_set_num_threads_
+bli_thread_set_thread_impl
 bli_thread_set_ways
 bli_thread_set_ways_
-bli_thrinfo_create
-bli_thrinfo_create_for_cntl
-bli_thrinfo_create_for_cntl_prenode
 bli_thrinfo_free
-bli_thrinfo_grow
-bli_thrinfo_init
-bli_thrinfo_init_single
-bli_thrinfo_rgrow
-bli_thrinfo_rgrow_prenode
 bli_trmm
-bli_trmm1m
 bli_trmm3
-bli_trmm31m
-bli_trmm33m1
-bli_trmm33mh
-bli_trmm34m1
-bli_trmm34mh
 bli_trmm3_ex
-bli_trmm3_front
-bli_trmm3ind
-bli_trmm3ind_get_avail
-bli_trmm3m1
-bli_trmm3nat
-bli_trmm4m1
-bli_trmm_check
-bli_trmm_determine_kc
-bli_trmm_determine_kc_b
-bli_trmm_determine_kc_f
-bli_trmm_direct
 bli_trmm_ex
-bli_trmm_front
-bli_trmmind
-bli_trmmind_get_avail
-bli_trmm_ll_ker_var2
-bli_trmm_lu_ker_var2
-bli_trmmnat
-bli_trmm_prune_unref_mparts_k
-bli_trmm_prune_unref_mparts_m
-bli_trmm_prune_unref_mparts_n
-bli_trmm_rl_ker_var2
-bli_trmm_ru_ker_var2
-bli_trmm_xx_ker_var2
 bli_trmv
-bli_trmv_check
 bli_trmv_ex
-bli_trmv_ex_qfp
-bli_trmv_unb_var1
-bli_trmv_unb_var1_qfp
-bli_trmv_unb_var2
-bli_trmv_unb_var2_qfp
-bli_trmv_unf_var1
-bli_trmv_unf_var1_qfp
-bli_trmv_unf_var2
-bli_trmv_unf_var2_qfp
 bli_trsm
-bli_trsm1m
-bli_trsm3m1
-bli_trsm4m1
-bli_trsm_blk_var1
-bli_trsm_blk_var2
-bli_trsm_blk_var3
-bli_trsm_check
-bli_trsm_cntl_create
-bli_trsm_cntl_create_node
-bli_trsm_cntl_free
-bli_trsm_determine_kc
-bli_trsm_determine_kc_b
-bli_trsm_determine_kc_f
-bli_trsm_direct
 bli_trsm_ex
-bli_trsm_front
-bli_trsmind
-bli_trsmind_get_avail
-bli_trsm_int
-bli_trsm_l_cntl_create
-bli_trsm_ll_ker_var2
-bli_trsm_l_ukernel_qfp
-bli_trsm_lu_ker_var2
-bli_trsmnat
-bli_trsm_packa
-bli_trsm_packb
-bli_trsm_prune_unref_mparts_k
-bli_trsm_prune_unref_mparts_m
-bli_trsm_prune_unref_mparts_n
-bli_trsm_r_cntl_create
-bli_trsm_rl_ker_var2
-bli_trsm_ru_ker_var2
 bli_trsm_ukernel
-bli_trsm_u_ukernel_qfp
-bli_trsm_xx_ker_var2
 bli_trsv
-bli_trsv_check
 bli_trsv_ex
-bli_trsv_ex_qfp
-bli_trsv_unb_var1
-bli_trsv_unb_var1_qfp
-bli_trsv_unb_var2
-bli_trsv_unb_var2_qfp
-bli_trsv_unf_var1
-bli_trsv_unf_var1_qfp
-bli_trsv_unf_var2
-bli_trsv_unf_var2_qfp
-bli_unpackm_blk_var1
-bli_unpackm_cntl_create_node
-bli_unpackm_int
-bli_unpackm_int_check
-bli_unpackm_unb_var1
 bli_unzipsc
-bli_unzipsc_check
-bli_unzipsc_qfp
-bli_utilm_fprint_check
-bli_utilm_mkhst_check
-bli_utilm_norm_check
-bli_utilm_rand_check
-bli_utilv_norm_check
-bli_utilv_sumsqv_check
-bli_utilv_xa_check
 bli_xpbyd
-bli_xpbyd_check
 bli_xpbyd_ex
-bli_xpbyd_ex_qfp
 bli_xpbym
-bli_xpbym_check
 bli_xpbym_ex
-bli_xpbym_ex_qfp
 bli_xpbym_md
 bli_xpbym_md_ex
-bli_xpbym_md_ex_qfp2
 bli_xpbyv
-bli_xpbyv_check
 bli_xpbyv_ex
-bli_xpbyv_ex_qfp
-bli_xxmv_check
-bli_xxr_check
 bli_zabsqsc
 bli_zaddd
 bli_zaddd_ex
 bli_zaddm
 bli_zaddm_ex
-bli_zaddm_unb_var1
 bli_zaddsc
 bli_zaddv
 bli_zaddv_ex
@@ -2135,7 +1024,6 @@ bli_zamaxv
 bli_zamaxv_ex
 bli_zasumv
 bli_zasumv_ex
-bli_zasumv_unb_var1
 bli_zaxpbyv
 bli_zaxpbyv_ex
 bli_zaxpy2v
@@ -2146,33 +1034,24 @@ bli_zaxpyf
 bli_zaxpyf_ex
 bli_zaxpym
 bli_zaxpym_ex
-bli_zaxpym_unb_var1
 bli_zaxpyv
 bli_zaxpyv_ex
 bli_zccastm
 bli_zccastnzm
 bli_zccastv
 bli_zccopysc
-bli_zcgemm_ker_var2_md
 bli_zcopyd
 bli_zcopyd_ex
 bli_zcopym
 bli_zcopym_ex
-bli_zcopym_unb_var1
 bli_zcopyv
 bli_zcopyv_ex
-bli_zcpackm_blk_var1_md
-bli_zcpackm_cxk_1e_md
-bli_zcpackm_cxk_1r_md
-bli_zcpackm_struc_cxk_md
 bli_zcxpbym_md
 bli_zcxpbym_md_ex
-bli_zcxpbym_md_unb_var1
 bli_zdcastm
 bli_zdcastnzm
 bli_zdcastv
 bli_zdcopysc
-bli_zdgemm_ker_var2_md
 bli_zdivsc
 bli_zdotaxpyv
 bli_zdotaxpyv_ex
@@ -2184,174 +1063,89 @@ bli_zdotxf
 bli_zdotxf_ex
 bli_zdotxv
 bli_zdotxv_ex
-bli_zdpackm_blk_var1_md
-bli_zdpackm_cxk_1e_md
-bli_zdpackm_cxk_1r_md
-bli_zdpackm_struc_cxk_md
 bli_zdxpbym_md
 bli_zdxpbym_md_ex
-bli_zdxpbym_md_unb_var1
+bli_zeqm
+bli_zeqsc
+bli_zeqv
 bli_zfprintm
 bli_zfprintv
 bli_zgemm
-bli_zgemm1m
-bli_zgemm3m1
-bli_zgemm3mh
-bli_zgemm4m1
-bli_zgemm4mb
-bli_zgemm4mb_ker_var2
-bli_zgemm4mh
 bli_zgemm_ex
-bli_zgemm_ker_var2
-bli_zgemm_md_c2r_ref
-bli_zgemmtrsm_l_ukernel
-bli_zgemmtrsm_u_ukernel
-bli_zgemm_ukernel
+bli_zgemmt
+bli_zgemmt_ex
 bli_zgemv
 bli_zgemv_ex
-bli_zgemv_unb_var1
-bli_zgemv_unb_var2
-bli_zgemv_unf_var1
-bli_zgemv_unf_var2
 bli_zger
 bli_zger_ex
-bli_zger_unb_var1
-bli_zger_unb_var2
 bli_zgetijm
+bli_zgetijv
 bli_zgetsc
 bli_zhemm
-bli_zhemm1m
-bli_zhemm3m1
-bli_zhemm3mh
-bli_zhemm4m1
-bli_zhemm4mh
 bli_zhemm_ex
 bli_zhemv
 bli_zhemv_ex
-bli_zhemv_unb_var1
-bli_zhemv_unb_var2
-bli_zhemv_unb_var3
-bli_zhemv_unb_var4
-bli_zhemv_unf_var1
-bli_zhemv_unf_var1a
-bli_zhemv_unf_var3
-bli_zhemv_unf_var3a
 bli_zher
 bli_zher2
 bli_zher2_ex
 bli_zher2k
-bli_zher2k1m
-bli_zher2k3m1
-bli_zher2k3mh
-bli_zher2k4m1
-bli_zher2k4mh
 bli_zher2k_ex
-bli_zher2_unb_var1
-bli_zher2_unb_var2
-bli_zher2_unb_var3
-bli_zher2_unb_var4
-bli_zher2_unf_var1
-bli_zher2_unf_var4
 bli_zher_ex
 bli_zherk
-bli_zherk1m
-bli_zherk3m1
-bli_zherk3mh
-bli_zherk4m1
-bli_zherk4mh
 bli_zherk_ex
-bli_zherk_l_ker_var2
-bli_zherk_u_ker_var2
-bli_zher_unb_var1
-bli_zher_unb_var2
 bli_zinvertd
 bli_zinvertd_ex
 bli_zinvertsc
 bli_zinvertv
 bli_zinvertv_ex
+bli_zinvscald
+bli_zinvscald_ex
+bli_zinvscalm
+bli_zinvscalm_ex
+bli_zinvscalv
+bli_zinvscalv_ex
 bli_zipsc
-bli_zipsc_check
-bli_zipsc_qfp
 bli_zmachval
 bli_zmkherm
 bli_zmkherm_ex
-bli_zmkherm_unb_var1
 bli_zmksymm
 bli_zmksymm_ex
-bli_zmksymm_unb_var1
 bli_zmktrim
 bli_zmktrim_ex
-bli_zmktrim_unb_var1
 bli_zmulsc
 bli_znorm1m
 bli_znorm1m_ex
-bli_znorm1m_unb_var1
 bli_znorm1v
 bli_znorm1v_ex
-bli_znorm1v_unb_var1
 bli_znormfm
 bli_znormfm_ex
-bli_znormfm_unb_var1
 bli_znormfsc
 bli_znormfv
 bli_znormfv_ex
-bli_znormfv_unb_var1
 bli_znormim
 bli_znormim_ex
-bli_znormim_unb_var1
 bli_znormiv
 bli_znormiv_ex
-bli_znormiv_unb_var1
-bli_zpackm_blk_var1
-bli_zpackm_cxk
-bli_zpackm_cxk_1er
-bli_zpackm_cxk_3mis
-bli_zpackm_cxk_4mi
-bli_zpackm_cxk_rih
-bli_zpackm_herm_cxk
-bli_zpackm_herm_cxk_1er
-bli_zpackm_herm_cxk_3mis
-bli_zpackm_herm_cxk_4mi
-bli_zpackm_herm_cxk_rih
-bli_zpackm_struc_cxk
-bli_zpackm_struc_cxk_1er
-bli_zpackm_struc_cxk_3mis
-bli_zpackm_struc_cxk_4mi
-bli_zpackm_struc_cxk_rih
-bli_zpackm_tri_cxk
-bli_zpackm_tri_cxk_1er
-bli_zpackm_tri_cxk_3mis
-bli_zpackm_tri_cxk_4mi
-bli_zpackm_tri_cxk_rih
-bli_zpackm_unb_var1
 bli_zprintm
-bli_zprintm_ex
 bli_zprintv
-bli_zprintv_ex
 bli_zrandm
 bli_zrandm_ex
-bli_zrandm_unb_var1
 bli_zrandnm
 bli_zrandnm_ex
-bli_zrandnm_unb_var1
 bli_zrandnv
 bli_zrandnv_ex
-bli_zrandnv_unb_var1
 bli_zrandv
 bli_zrandv_ex
-bli_zrandv_unb_var1
 bli_zscal2d
 bli_zscal2d_ex
 bli_zscal2m
 bli_zscal2m_ex
-bli_zscal2m_unb_var1
 bli_zscal2v
 bli_zscal2v_ex
 bli_zscald
 bli_zscald_ex
 bli_zscalm
 bli_zscalm_ex
-bli_zscalm_unb_var1
 bli_zscalv
 bli_zscalv_ex
 bli_zscastm
@@ -2363,42 +1157,29 @@ bli_zsetd_ex
 bli_zsetid
 bli_zsetid_ex
 bli_zsetijm
+bli_zsetijv
 bli_zsetm
 bli_zsetm_ex
-bli_zsetm_unb_var1
 bli_zsetsc
 bli_zsetv
 bli_zsetv_ex
-bli_zsgemm_ker_var2_md
 bli_zshiftd
 bli_zshiftd_ex
-bli_zspackm_blk_var1_md
-bli_zspackm_cxk_1e_md
-bli_zspackm_cxk_1r_md
-bli_zspackm_struc_cxk_md
 bli_zsqrtsc
 bli_zsubd
 bli_zsubd_ex
 bli_zsubm
 bli_zsubm_ex
-bli_zsubm_unb_var1
 bli_zsubsc
 bli_zsubv
 bli_zsubv_ex
 bli_zsumsqv
 bli_zsumsqv_ex
-bli_zsumsqv_unb_var1
 bli_zswapv
 bli_zswapv_ex
 bli_zsxpbym_md
 bli_zsxpbym_md_ex
-bli_zsxpbym_md_unb_var1
 bli_zsymm
-bli_zsymm1m
-bli_zsymm3m1
-bli_zsymm3mh
-bli_zsymm4m1
-bli_zsymm4mh
 bli_zsymm_ex
 bli_zsymv
 bli_zsymv_ex
@@ -2406,85 +1187,37 @@ bli_zsyr
 bli_zsyr2
 bli_zsyr2_ex
 bli_zsyr2k
-bli_zsyr2k1m
-bli_zsyr2k3m1
-bli_zsyr2k3mh
-bli_zsyr2k4m1
-bli_zsyr2k4mh
 bli_zsyr2k_ex
 bli_zsyr_ex
 bli_zsyrk
-bli_zsyrk1m
-bli_zsyrk3m1
-bli_zsyrk3mh
-bli_zsyrk4m1
-bli_zsyrk4mh
 bli_zsyrk_ex
 bli_ztrmm
-bli_ztrmm1m
 bli_ztrmm3
-bli_ztrmm31m
-bli_ztrmm33m1
-bli_ztrmm33mh
-bli_ztrmm34m1
-bli_ztrmm34mh
 bli_ztrmm3_ex
-bli_ztrmm3m1
-bli_ztrmm4m1
 bli_ztrmm_ex
-bli_ztrmm_ll_ker_var2
-bli_ztrmm_lu_ker_var2
-bli_ztrmm_rl_ker_var2
-bli_ztrmm_ru_ker_var2
 bli_ztrmv
 bli_ztrmv_ex
-bli_ztrmv_unb_var1
-bli_ztrmv_unb_var2
-bli_ztrmv_unf_var1
-bli_ztrmv_unf_var2
 bli_ztrsm
-bli_ztrsm1m
-bli_ztrsm3m1
-bli_ztrsm4m1
 bli_ztrsm_ex
-bli_ztrsm_ll_ker_var2
-bli_ztrsm_l_ukernel
-bli_ztrsm_lu_ker_var2
-bli_ztrsm_rl_ker_var2
-bli_ztrsm_ru_ker_var2
-bli_ztrsm_u_ukernel
 bli_ztrsv
 bli_ztrsv_ex
-bli_ztrsv_unb_var1
-bli_ztrsv_unb_var2
-bli_ztrsv_unf_var1
-bli_ztrsv_unf_var2
-bli_zunpackm_blk_var1
-bli_zunpackm_cxk
-bli_zunpackm_unb_var1
 bli_zunzipsc
 bli_zxpbyd
 bli_zxpbyd_ex
 bli_zxpbym
 bli_zxpbym_ex
-bli_zxpbym_unb_var1
 bli_zxpbyv
 bli_zxpbyv_ex
 bli_zzcastm
 bli_zzcastnzm
 bli_zzcastv
 bli_zzcopysc
-bli_zzgemm_ker_var2_md
 bli_zzipsc
-bli_zzpackm_blk_var1_md
-bli_zzpackm_cxk_1e_md
-bli_zzpackm_cxk_1r_md
-bli_zzpackm_struc_cxk_md
 bli_zzxpbym_md
 bli_zzxpbym_md_ex
-bli_zzxpbym_md_unb_var1
 sasum_
 sasumsub_
+saxpby_
 saxpy_
 scabs1_
 scasum_
@@ -2498,6 +1231,8 @@ sdsdot_
 sdsdotsub_
 sgbmv_
 sgemm_
+sgemm_batch_
+sgemmt_
 sgemv_
 sger_
 snrm2_
@@ -2528,6 +1263,7 @@ strsm_
 strsv_
 dasum_
 dasumsub_
+daxpby_
 daxpy_
 dcabs1_
 dcopy_
@@ -2535,6 +1271,8 @@ ddot_
 ddotsub_
 dgbmv_
 dgemm_
+dgemm_batch_
+dgemmt_
 dgemv_
 dger_
 dnrm2_
@@ -2569,6 +1307,7 @@ dzasum_
 dzasumsub_
 dznrm2_
 dznrm2sub_
+caxpby_
 caxpy_
 ccopy_
 cdotc_
@@ -2577,6 +1316,9 @@ cdotu_
 cdotusub_
 cgbmv_
 cgemm_
+cgemm3m_
+cgemm_batch_
+cgemmt_
 cgemv_
 cgerc_
 cgeru_
@@ -2606,6 +1348,7 @@ ctrmm_
 ctrmv_
 ctrsm_
 ctrsv_
+zaxpby_
 zaxpy_
 zcopy_
 zdotc_
@@ -2616,6 +1359,9 @@ zdrot_
 zdscal_
 zgbmv_
 zgemm_
+zgemm3m_
+zgemm_batch_
+zgemmt_
 zgemv_
 zgerc_
 zgeru_
@@ -2651,12 +1397,16 @@ isamax_
 isamaxsub_
 izamax_
 izamaxsub_
+cblas_caxpby
 cblas_caxpy
 cblas_ccopy
 cblas_cdotc_sub
 cblas_cdotu_sub
 cblas_cgbmv
 cblas_cgemm
+cblas_cgemm3m
+cblas_cgemm_batch
+cblas_cgemmt
 cblas_cgemv
 cblas_cgerc
 cblas_cgeru
@@ -2685,11 +1435,14 @@ cblas_ctrmv
 cblas_ctrsm
 cblas_ctrsv
 cblas_dasum
+cblas_daxpby
 cblas_daxpy
 cblas_dcopy
 cblas_ddot
 cblas_dgbmv
 cblas_dgemm
+cblas_dgemm_batch
+cblas_dgemmt
 cblas_dgemv
 cblas_dger
 cblas_dnrm2
@@ -2725,6 +1478,7 @@ cblas_idamax
 cblas_isamax
 cblas_izamax
 cblas_sasum
+cblas_saxpby
 cblas_saxpy
 cblas_scasum
 cblas_scnrm2
@@ -2733,6 +1487,8 @@ cblas_sdot
 cblas_sdsdot
 cblas_sgbmv
 cblas_sgemm
+cblas_sgemm_batch
+cblas_sgemmt
 cblas_sgemv
 cblas_sger
 cblas_snrm2
@@ -2761,6 +1517,7 @@ cblas_strmv
 cblas_strsm
 cblas_strsv
 cblas_xerbla
+cblas_zaxpby
 cblas_zaxpy
 cblas_zcopy
 cblas_zdotc_sub
@@ -2768,6 +1525,9 @@ cblas_zdotu_sub
 cblas_zdscal
 cblas_zgbmv
 cblas_zgemm
+cblas_zgemm3m
+cblas_zgemm_batch
+cblas_zgemmt
 cblas_zgemv
 cblas_zgerc
 cblas_zgeru
diff --git a/frame/1m/bli_l1m_oft_var.h b/frame/1m/bli_l1m_oft_var.h
index 325ed0ecf..4888cbdaa 100644
--- a/frame/1m/bli_l1m_oft_var.h
+++ b/frame/1m/bli_l1m_oft_var.h
@@ -48,9 +48,8 @@ typedef void (*PASTECH(opname,_var_oft)) \
   const obj_t*  a, \
         obj_t*  p, \
   const cntx_t* cntx, \
-        rntm_t* rntm, \
-        cntl_t* cntl, \
-  const thrinfo_t* thread  \
+  const cntl_t* cntl, \
+        thrinfo_t* thread  \
 );
 
 GENTDEF( packm )
diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c
index 0a641cf9e..487116329 100644
--- a/frame/1m/bli_l1m_tapi.c
+++ b/frame/1m/bli_l1m_tapi.c
@@ -77,8 +77,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  n, \
 	  ( ctype* )x, rs_x, cs_x, \
 	            y, rs_y, cs_y, \
-	  ( cntx_t* )cntx, \
-	  rntm  \
+	  ( cntx_t* )cntx  \
 	); \
 \
 	/* When the diagonal of an upper- or lower-stored matrix is unit,
@@ -142,8 +141,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  n, \
 	  ( ctype* )x, rs_x, cs_x, \
 	            y, rs_y, cs_y, \
-	  ( cntx_t* )cntx, \
-	  rntm  \
+	  ( cntx_t* )cntx  \
 	); \
 \
 	/* When the diagonal of an upper- or lower-stored matrix is unit,
@@ -216,8 +214,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  ( ctype* )alpha, \
 	  ( ctype* )x, rs_x, cs_x, \
 	            y, rs_y, cs_y, \
-	  ( cntx_t* )cntx, \
-	  rntm  \
+	  ( cntx_t* )cntx  \
 	); \
 \
 	/* When the diagonal of an upper- or lower-stored matrix is unit,
@@ -305,8 +302,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  ( ctype* )alpha, \
 	  ( ctype* )x, rs_x, cs_x, \
 	            y, rs_y, cs_y, \
-	  ( cntx_t* )cntx, \
-	  rntm  \
+	  ( cntx_t* )cntx  \
 	); \
 \
 	/* When the diagonal of an upper- or lower-stored matrix is unit,
@@ -373,8 +369,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  n, \
 	  ( ctype* )alpha, \
 	            x, rs_x, cs_x, \
-	  ( cntx_t* )cntx, \
-	  rntm  \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
@@ -422,8 +417,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 		  n, \
 		  ( ctype* )x, rs_x, cs_x, \
 		            y, rs_y, cs_y, \
-		  ( cntx_t* )cntx, \
-		  rntm  \
+		  ( cntx_t* )cntx  \
 		); \
 \
 		return; \
@@ -442,8 +436,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  ( ctype* )x, rs_x, cs_x, \
 	  ( ctype* )beta, \
 	            y, rs_y, cs_y, \
-	  ( cntx_t* )cntx, \
-	  rntm  \
+	  ( cntx_t* )cntx  \
 	); \
 \
 	/* When the diagonal of an upper- or lower-stored matrix is unit,
@@ -524,8 +517,7 @@ void PASTEMAC3(chx,chy,opname,EX_SUF) \
 	  ( ctype_x* )x, rs_x, cs_x, \
 	  ( ctype_y* )beta, \
 	              y, rs_y, cs_y, \
-	  ( cntx_t* )cntx, \
-	  rntm  \
+	  ( cntx_t* )cntx  \
 	); \
 }
 
diff --git a/frame/1m/bli_l1m_unb_var1.c b/frame/1m/bli_l1m_unb_var1.c
index 1bcd9b9ca..9d051c169 100644
--- a/frame/1m/bli_l1m_unb_var1.c
+++ b/frame/1m/bli_l1m_unb_var1.c
@@ -51,8 +51,7 @@ void PASTEMAC(ch,opname) \
        dim_t   n, \
        ctype*  x, inc_t rs_x, inc_t cs_x, \
        ctype*  y, inc_t rs_y, inc_t cs_y, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+       cntx_t* cntx \
      ) \
 { \
 	const num_t dt = PASTEMAC(ch,type); \
@@ -168,8 +167,7 @@ void PASTEMAC(ch,opname) \
        ctype*  alpha, \
        ctype*  x, inc_t rs_x, inc_t cs_x, \
        ctype*  y, inc_t rs_y, inc_t cs_y, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+       cntx_t* cntx \
      ) \
 { \
 	const num_t dt = PASTEMAC(ch,type); \
@@ -286,8 +284,7 @@ void PASTEMAC(ch,opname) \
        dim_t   n, \
        ctype*  alpha, \
        ctype*  x, inc_t rs_x, inc_t cs_x, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+       cntx_t* cntx \
      ) \
 { \
 	const num_t dt = PASTEMAC(ch,type); \
@@ -395,8 +392,7 @@ void PASTEMAC(ch,opname) \
        ctype*  x, inc_t rs_x, inc_t cs_x, \
        ctype*  beta, \
        ctype*  y, inc_t rs_y, inc_t cs_y, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+       cntx_t* cntx \
      ) \
 { \
 	const num_t dt = PASTEMAC(ch,type); \
@@ -513,8 +509,7 @@ void PASTEMAC2(chx,chy,opname) \
        ctype_x* x, inc_t rs_x, inc_t cs_x, \
        ctype_y* beta, \
        ctype_y* y, inc_t rs_y, inc_t cs_y, \
-       cntx_t*  cntx, \
-       rntm_t*  rntm  \
+       cntx_t*  cntx \
      ) \
 { \
 	uplo_t uplox_eff; \
diff --git a/frame/1m/bli_l1m_unb_var1.h b/frame/1m/bli_l1m_unb_var1.h
index fe01989e3..06aed2fe1 100644
--- a/frame/1m/bli_l1m_unb_var1.h
+++ b/frame/1m/bli_l1m_unb_var1.h
@@ -50,8 +50,7 @@ void PASTEMAC2(ch,opname,_unb_var1) \
        dim_t   n, \
        ctype*  x, inc_t rs_x, inc_t cs_x, \
        ctype*  y, inc_t rs_y, inc_t cs_y, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+       cntx_t* cntx \
      );
 
 INSERT_GENTPROT_BASIC0( addm )
@@ -73,8 +72,7 @@ void PASTEMAC2(ch,opname,_unb_var1) \
        ctype*  alpha, \
        ctype*  x, inc_t rs_x, inc_t cs_x, \
        ctype*  y, inc_t rs_y, inc_t cs_y, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+       cntx_t* cntx \
      );
 
 INSERT_GENTPROT_BASIC0( axpym )
@@ -94,8 +92,7 @@ void PASTEMAC2(ch,opname,_unb_var1) \
        dim_t   n, \
        ctype*  alpha, \
        ctype*  x, inc_t rs_x, inc_t cs_x, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+       cntx_t* cntx \
      );
 
 INSERT_GENTPROT_BASIC0( invscalm )
@@ -117,8 +114,7 @@ void PASTEMAC2(ch,opname,_unb_var1) \
        ctype*  x, inc_t rs_x, inc_t cs_x, \
        ctype*  beta, \
        ctype*  y, inc_t rs_y, inc_t cs_y, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
+       cntx_t* cntx \
      );
 
 INSERT_GENTPROT_BASIC0( xpbym )
@@ -138,8 +134,7 @@ void PASTEMAC3(chx,chy,opname,_unb_var1) \
        ctype_x* x, inc_t rs_x, inc_t cs_x, \
        ctype_y* beta, \
        ctype_y* y, inc_t rs_y, inc_t cs_y, \
-       cntx_t*  cntx, \
-       rntm_t*  rntm  \
+       cntx_t*  cntx \
      );
 
 INSERT_GENTPROT2_BASIC0( xpbym_md )
diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h
index 7d73bf903..80878fba0 100644
--- a/frame/1m/packm/bli_packm.h
+++ b/frame/1m/packm/bli_packm.h
@@ -39,6 +39,7 @@
 #include "bli_packm_init.h"
 #include "bli_packm_int.h"
 #include "bli_packm_scalar.h"
+#include "bli_packm_thrinfo.h"
 
 #include "bli_packm_part.h"
 
diff --git a/frame/1m/packm/bli_packm_alloc.c b/frame/1m/packm/bli_packm_alloc.c
index 07f54de78..18cc6f627 100644
--- a/frame/1m/packm/bli_packm_alloc.c
+++ b/frame/1m/packm/bli_packm_alloc.c
@@ -38,9 +38,8 @@
 void* bli_packm_alloc
      (
              siz_t      size_needed,
-             rntm_t*    rntm,
-             cntl_t*    cntl,
-       const thrinfo_t* thread
+       const cntl_t*    cntl,
+             thrinfo_t* thread
      )
 {
 	// Query the pack buffer type from the control tree node.
@@ -50,51 +49,48 @@ void* bli_packm_alloc
 	(
 	  size_needed,
 	  pack_buf_type,
-	  rntm,
-	  cntl,
 	  thread
 	);
 }
 
 void* bli_packm_alloc_ex
      (
-             siz_t      size_needed,
-             packbuf_t  pack_buf_type,
-             rntm_t*    rntm,
-             cntl_t*    cntl,
-       const thrinfo_t* thread
+       siz_t      size_needed,
+       packbuf_t  pack_buf_type,
+       thrinfo_t* thread
      )
 {
-	// Query the address of the mem_t entry within the control tree node.
-	mem_t* cntl_mem_p = bli_cntl_pack_mem( cntl );
+	// Query the address of the mem_t entry within the thrinfo tree node.
+	mem_t* mem_p = bli_thrinfo_mem( thread );
+	pba_t* pba   = bli_thrinfo_pba( thread );
 
 	mem_t* local_mem_p;
 	mem_t  local_mem_s;
 
-	siz_t cntl_mem_size = 0;
+	siz_t  mem_size = 0;
 
-	if ( bli_mem_is_alloc( cntl_mem_p ) )
-		cntl_mem_size = bli_mem_size( cntl_mem_p );
+	if ( bli_mem_is_alloc( mem_p ) )
+		mem_size = bli_mem_size( mem_p );
 
-	if ( cntl_mem_size < size_needed )
+	if ( mem_size < size_needed )
 	{
-		if ( bli_thread_am_ochief( thread ) )
+		if ( bli_thrinfo_am_chief( thread ) )
 		{
 			// The chief thread releases the existing block associated with
-			// the mem_t entry in the control tree, and then re-acquires a
+			// the mem_t entry in the thrinfo tree, and then re-acquires a
 			// new block, saving the associated mem_t entry to local_mem_s.
-			if ( bli_mem_is_alloc( cntl_mem_p ) )
+			if ( bli_mem_is_alloc( mem_p ) )
 			{
 				bli_pba_release
 				(
-				  rntm,
-				  cntl_mem_p
+				  pba,
+				  mem_p
 				);
 			}
 
 			bli_pba_acquire_m
 			(
-			  rntm,
+			  pba,
 			  size_needed,
 			  pack_buf_type,
 			  &local_mem_s
@@ -103,17 +99,17 @@ void* bli_packm_alloc_ex
 
 		// Broadcast the address of the chief thread's local mem_t entry to
 		// all threads.
-		local_mem_p = bli_thread_broadcast( rntm, thread, &local_mem_s );
+		local_mem_p = bli_thrinfo_broadcast( thread, &local_mem_s );
 
 		// Save the chief thread's local mem_t entry to the mem_t field in
-		// this thread's control tree node.
-		*cntl_mem_p = *local_mem_p;
+		// this thread's thrinfo tree node.
+		*mem_p = *local_mem_p;
 
 		// Barrier so that the master thread doesn't return from the function
 		// before we are done reading.
-		bli_thread_barrier( rntm, thread );
+		bli_thrinfo_barrier( thread );
 	}
 
-	return bli_mem_buffer( cntl_mem_p );
+	return bli_mem_buffer( mem_p );
 }
 
diff --git a/frame/1m/packm/bli_packm_alloc.h b/frame/1m/packm/bli_packm_alloc.h
index aec2e1af5..a24e7a500 100644
--- a/frame/1m/packm/bli_packm_alloc.h
+++ b/frame/1m/packm/bli_packm_alloc.h
@@ -35,17 +35,14 @@
 BLIS_EXPORT_BLIS void* bli_packm_alloc
      (
              siz_t      size_needed,
-             rntm_t*    rntm,
-             cntl_t*    cntl,
-       const thrinfo_t* thread
+       const cntl_t*    cntl,
+             thrinfo_t* thread
      );
 
 BLIS_EXPORT_BLIS void* bli_packm_alloc_ex
      (
-             siz_t      size_needed,
-             packbuf_t  pack_buf_type,
-             rntm_t*    rntm,
-             cntl_t*    cntl,
-       const thrinfo_t* thread
+       siz_t      size_needed,
+       packbuf_t  pack_buf_type,
+       thrinfo_t* thread
      );
 
diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c
index 601f2c05c..da49126a5 100644
--- a/frame/1m/packm/bli_packm_blk_var1.c
+++ b/frame/1m/packm/bli_packm_blk_var1.c
@@ -57,9 +57,8 @@ void bli_packm_blk_var1
        const obj_t*   c,
              obj_t*   p,
        const cntx_t*  cntx,
-             rntm_t*  rntm,
-             cntl_t*  cntl,
-       const thrinfo_t* thread
+       const cntl_t*  cntl,
+             thrinfo_t* thread
      )
 {
 	// Extract various fields from the control tree.
@@ -71,7 +70,7 @@ void bli_packm_blk_var1
 	// Every thread initializes p and determines the size of memory
 	// block needed (which gets embedded into the otherwise "blank" mem_t
 	// entry in the control tree node). Return early if no packing is required.
-	if ( !bli_packm_init( c, p, cntx, rntm, cntl, thread ) )
+	if ( !bli_packm_init( c, p, cntx, cntl, thread ) )
 		return;
 
 	// Check parameters.
@@ -161,8 +160,8 @@ void bli_packm_blk_var1
 
 	// Query the number of threads and thread ids from the current thread's
 	// packm thrinfo_t node.
-	const dim_t nt  = bli_thread_n_way( thread );
-	const dim_t tid = bli_thread_work_id( thread );
+	const dim_t nt  = bli_thrinfo_num_threads( thread );
+	const dim_t tid = bli_thrinfo_thread_id( thread );
 
 	// Determine the thread range and increment using the current thread's
 	// packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h
index 5797e3b94..870988fec 100644
--- a/frame/1m/packm/bli_packm_blk_var1.h
+++ b/frame/1m/packm/bli_packm_blk_var1.h
@@ -49,11 +49,10 @@ typedef struct
 
 BLIS_EXPORT_BLIS void bli_packm_blk_var1
      (
-       const obj_t*   c,
-             obj_t*   p,
-       const cntx_t*  cntx,
-             rntm_t*  rntm,
-             cntl_t*  cntl,
-       const thrinfo_t* t
+       const obj_t*     c,
+             obj_t*     p,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread
      );
 
diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c
index e99ed9cf3..7f7401045 100644
--- a/frame/1m/packm/bli_packm_cntl.c
+++ b/frame/1m/packm/bli_packm_cntl.c
@@ -37,7 +37,7 @@
 
 BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
      (
-       rntm_t*   rntm,
+       pool_t*   pool,
        void_fp   var_func,
        bszid_t   bmid_m,
        bszid_t   bmid_n,
@@ -57,7 +57,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
 	#endif
 
 	// Allocate a packm_params_t struct.
-	params = bli_sba_acquire( rntm, sizeof( packm_params_t ) );
+	params = bli_sba_acquire( pool, sizeof( packm_params_t ) );
 
 	// Initialize the packm_params_t struct.
 	params->size              = sizeof( packm_params_t );
@@ -79,7 +79,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
 	// sync with the cntl_t tree.
 	cntl = bli_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  BLIS_NOID,
 	  BLIS_NO_PART,
 	  var_func,
diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h
index be0fc8fde..8a43f711d 100644
--- a/frame/1m/packm/bli_packm_cntl.h
+++ b/frame/1m/packm/bli_packm_cntl.h
@@ -85,7 +85,7 @@ BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( const cntl_t* cntl )
 
 cntl_t* bli_packm_cntl_create_node
      (
-       rntm_t*   rntm,
+       pool_t*   pool,
        void_fp   var_func,
        bszid_t   bmid_m,
        bszid_t   bmid_n,
diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c
index 67e02ac0e..d4480f2c1 100644
--- a/frame/1m/packm/bli_packm_init.c
+++ b/frame/1m/packm/bli_packm_init.c
@@ -40,9 +40,8 @@ bool bli_packm_init
        const obj_t*  c,
              obj_t*  p,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-       const thrinfo_t* thread
+       const cntl_t* cntl,
+             thrinfo_t* thread
      )
 {
 	bli_init_once();
@@ -179,7 +178,7 @@ bool bli_packm_init
 	// Update the buffer address in p to point to the buffer associated
 	// with the mem_t entry acquired from the memory broker (now cached in
 	// the control tree node).
-	void* buffer = bli_packm_alloc( size_p, rntm, cntl, thread );
+	void* buffer = bli_packm_alloc( size_p, cntl, thread );
 	bli_obj_set_buffer( buffer, p );
 
 	return true;
diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h
index 6f9b47273..b34bd5379 100644
--- a/frame/1m/packm/bli_packm_init.h
+++ b/frame/1m/packm/bli_packm_init.h
@@ -37,8 +37,7 @@ BLIS_EXPORT_BLIS bool bli_packm_init
        const obj_t*  a,
              obj_t*  p,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-       const thrinfo_t* thread
+       const cntl_t* cntl,
+             thrinfo_t* thread
      );
 
diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c
index ae788e671..fa4fcb47a 100644
--- a/frame/1m/packm/bli_packm_int.c
+++ b/frame/1m/packm/bli_packm_int.c
@@ -39,9 +39,8 @@ void bli_packm_int
        const obj_t*  a,
              obj_t*  p,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-       const thrinfo_t* thread
+       const cntl_t* cntl,
+             thrinfo_t* thread_par
      )
 {
 	bli_init_once();
@@ -51,7 +50,8 @@ void bli_packm_int
 
 	// Barrier so that we know threads are done with previous computation
 	// with the same packing buffer before starting to pack.
-	bli_thread_barrier( rntm, thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	bli_thrinfo_barrier( thread );
 
 	// Invoke the variant with kappa_use.
 	f
@@ -59,12 +59,11 @@ void bli_packm_int
 	  a,
 	  p,
 	  cntx,
-	  rntm,
 	  cntl,
 	  thread
 	);
 
 	// Barrier so that packing is done before computation.
-	bli_thread_barrier( rntm, thread );
+	bli_thrinfo_barrier( thread );
 }
 
diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h
index a4cf17d59..b7720cd3e 100644
--- a/frame/1m/packm/bli_packm_int.h
+++ b/frame/1m/packm/bli_packm_int.h
@@ -37,7 +37,6 @@ void bli_packm_int
        const obj_t*  a,
              obj_t*  p,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-       const thrinfo_t* thread
+       const cntl_t* cntl,
+             thrinfo_t* thread
      );
diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.c b/frame/1m/packm/bli_packm_struc_cxk_md.c
index 650b6178c..b83a0271f 100644
--- a/frame/1m/packm/bli_packm_struc_cxk_md.c
+++ b/frame/1m/packm/bli_packm_struc_cxk_md.c
@@ -306,8 +306,6 @@ void PASTEMAC2(cha,chp,opname) \
 	PASTEMAC(cha,ctyper)* restrict alpha1_i = ( PASTEMAC(cha,ctyper)* )a + 1; \
 	PASTEMAC(chp,ctyper)* restrict pi1_r    = ( PASTEMAC(chp,ctyper)* )p; \
 	PASTEMAC(chp,ctyper)* restrict pi1_i    = ( PASTEMAC(chp,ctyper)* )p + ldp; \
-\
-	( void )kappa_i; \
 \
 	if ( PASTEMAC(chp,eq1)( *kappa ) ) \
 	{ \
diff --git a/frame/1m/packm/bli_packm_thrinfo.c b/frame/1m/packm/bli_packm_thrinfo.c
deleted file mode 100644
index 4b57971ef..000000000
--- a/frame/1m/packm/bli_packm_thrinfo.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_packm_thrinfo_init
-     (
-       thrinfo_t* thread,
-       thrcomm_t* ocomm,
-       dim_t      ocomm_id,
-       dim_t      n_way,
-       dim_t      work_id,
-       bszid_t    bszid,
-       thrinfo_t* sub_node
-     )
-{
-	bli_thrinfo_init
-	(
-	  thread,
-	  ocomm, ocomm_id,
-	  n_way, work_id,
-	  FALSE,
-	  BLIS_NO_PART,
-	  sub_node
-	);
-}
-
-void bli_packm_thrinfo_init_single
-     (
-       thrinfo_t* thread
-     )
-{
-	bli_packm_thrinfo_init
-	(
-	  thread,
-	  &BLIS_SINGLE_COMM, 0,
-	  1,
-	  0,
-	  BLIS_NO_PART,
-	  NULL
-	);
-}
-
diff --git a/frame/1m/packm/bli_packm_thrinfo.h b/frame/1m/packm/bli_packm_thrinfo.h
index 85b61931c..1ac7f88df 100644
--- a/frame/1m/packm/bli_packm_thrinfo.h
+++ b/frame/1m/packm/bli_packm_thrinfo.h
@@ -5,7 +5,6 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -64,42 +63,3 @@
 
 #endif
 
-
-//
-// thrinfo_t APIs specific to packm.
-//
-
-#if 0
-thrinfo_t* bli_packm_thrinfo_create
-     (
-       thrcomm_t* ocomm,
-       dim_t      ocomm_id,
-       dim_t      n_way,
-       dim_t      work_id,
-       thrinfo_t* sub_node
-     );
-#endif
-
-void bli_packm_thrinfo_init
-     (
-       thrinfo_t* thread,
-       thrcomm_t* ocomm,
-       dim_t      ocomm_id,
-       dim_t      n_way,
-       dim_t      work_id,
-       bszid_t    bszid,
-       thrinfo_t* sub_node
-     );
-
-void bli_packm_thrinfo_init_single
-     (
-       thrinfo_t* thread
-     );
-
-#if 0
-void bli_packm_thrinfo_free
-     (
-       thrinfo_t* thread
-     );
-#endif
-
diff --git a/frame/1m/unpackm/bli_unpackm_cntl.c b/frame/1m/unpackm/bli_unpackm_cntl.c
index 95d0545be..e33e3b151 100644
--- a/frame/1m/unpackm/bli_unpackm_cntl.c
+++ b/frame/1m/unpackm/bli_unpackm_cntl.c
@@ -37,10 +37,10 @@
 
 cntl_t* bli_unpackm_cntl_create_node
      (
-       rntm_t*   rntm,
-       void_fp   var_func,
-       void_fp   unpackm_var_func,
-       cntl_t*   sub_node
+       pool_t* pool,
+       void_fp var_func,
+       void_fp unpackm_var_func,
+       cntl_t* sub_node
      )
 {
 	cntl_t*           cntl;
@@ -64,7 +64,7 @@ cntl_t* bli_unpackm_cntl_create_node
 	// sync with the cntl_t tree.
 	cntl = bli_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  BLIS_NOID,
 	  BLIS_NO_PART,
 	  var_func,
diff --git a/frame/1m/unpackm/bli_unpackm_cntl.h b/frame/1m/unpackm/bli_unpackm_cntl.h
index 5c41d9465..075800d0a 100644
--- a/frame/1m/unpackm/bli_unpackm_cntl.h
+++ b/frame/1m/unpackm/bli_unpackm_cntl.h
@@ -48,9 +48,9 @@ typedef struct unpackm_params_s unpackm_params_t;
 
 cntl_t* bli_unpackm_cntl_create_node
      (
-       rntm_t*   rntm,
-       void_fp   var_func,
-       void_fp   unpackm_var_func,
-       cntl_t*   sub_node
+       pool_t* pool,
+       void_fp var_func,
+       void_fp unpackm_var_func,
+       cntl_t* sub_node
      );
 
diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c
index 3b542b061..2ced9a1a2 100644
--- a/frame/1m/unpackm/bli_unpackm_int.c
+++ b/frame/1m/unpackm/bli_unpackm_int.c
@@ -61,7 +61,7 @@ void bli_unpackm_int
 	f = bli_cntl_unpackm_params_var_func( cntl );
 
 	// Invoke the variant.
-	if ( bli_thread_am_ochief( thread ) )
+	if ( bli_thrinfo_am_chief( thread ) )
 	{
 		f
 		(
@@ -74,6 +74,6 @@ void bli_unpackm_int
 	}
 
 	// Barrier so that unpacking is done before computation.
-	bli_thread_barrier( rntm, thread );
+	bli_thrinfo_barrier( thread );
 }
 
diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h
index 9d39fc47d..a55091539 100644
--- a/frame/3/bli_l3.h
+++ b/frame/3/bli_l3.h
@@ -33,6 +33,10 @@
 
 */
 
+#include "bli_l3_thrinfo.h"
+#include "bli_l3_decor.h"
+#include "bli_l3_sup_decor.h"
+
 #include "bli_l3_cntl.h"
 #include "bli_l3_check.h"
 #include "bli_l3_int.h"
diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c
index 78482b5f6..586aeb6ea 100644
--- a/frame/3/bli_l3_blocksize.c
+++ b/frame/3/bli_l3_blocksize.c
@@ -35,6 +35,40 @@
 #include "blis.h"
 
 
+void bli_l3_adjust_kc
+      (
+        const obj_t*  a,
+        const obj_t*  b,
+              dim_t*  b_alg,
+              dim_t*  b_max,
+        const cntx_t* cntx,
+        const cntl_t* cntl
+      )
+{
+	const opid_t family = bli_cntl_family( cntl );
+	const num_t  dt     = bli_obj_exec_dt( a );
+	      dim_t  mnr    = 1;
+
+	// Nudge the default and maximum kc blocksizes up to the nearest
+	// multiple of MR if A is Hermitian, symmetric, or triangular or
+	// NR if B is Hermitian, symmetric, or triangular. If neither case
+	// applies, then we leave the blocksizes unchanged. For trsm we
+	// always use MR (rather than sometimes using NR) because even
+	// when the triangle is on the right, packing of that matrix uses
+	// MR, since only left-side trsm micro-kernels are supported.
+	if ( !bli_obj_root_is_general( a ) || family == BLIS_TRSM )
+	{
+		mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
+	}
+	else if ( !bli_obj_root_is_general( b ) )
+	{
+		mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
+	}
+
+	*b_alg = bli_align_dim_to_mult( *b_alg, mnr );
+	*b_max = bli_align_dim_to_mult( *b_max, mnr );
+}
+
 dim_t bli_l3_determine_kc
       (
               dir_t   direct,
@@ -47,261 +81,16 @@ dim_t bli_l3_determine_kc
         const cntl_t* cntl
       )
 {
-	opid_t family = bli_cntl_family( cntl );
-
-	if      ( family == BLIS_GEMM )
-		return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx );
-	else if ( family == BLIS_GEMMT )
-		return bli_gemmt_determine_kc( direct, i, dim, a, b, bszid, cntx );
-	else if ( family == BLIS_TRMM )
-		return bli_trmm_determine_kc( direct, i, dim, a, b, bszid, cntx );
-	else if ( family == BLIS_TRSM )
-		return bli_trsm_determine_kc( direct, i, dim, a, b, bszid, cntx );
-
-	// This should never execute.
-	return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx );
-}
-
-// -----------------------------------------------------------------------------
-
-//
-// NOTE: We call a gemm/hemm/symm, trmm, or trsm-specific blocksize
-// function to determine the kc blocksize so that we can implement the
-// "nudging" of kc to be a multiple of mr or nr, as needed.
-//
-
-#undef  GENFRONT
-#define GENFRONT( opname, l3op ) \
-\
-dim_t PASTEMAC0(opname) \
-      ( \
-              dir_t   direct, \
-              dim_t   i, \
-              dim_t   dim, \
-        const obj_t*  a, \
-        const obj_t*  b, \
-              bszid_t bszid, \
-        const cntx_t* cntx  \
-      ) \
-{ \
-	if ( direct == BLIS_FWD ) \
-		return PASTEMAC(l3op,_determine_kc_f)( i, dim, a, b, bszid, cntx ); \
-	else \
-		return PASTEMAC(l3op,_determine_kc_b)( i, dim, a, b, bszid, cntx ); \
-}
-
-GENFRONT( gemm_determine_kc, gemm )
-GENFRONT( gemmt_determine_kc, gemmt )
-GENFRONT( trmm_determine_kc, trmm )
-GENFRONT( trsm_determine_kc, trsm )
-
-// -----------------------------------------------------------------------------
-
-#undef  GENFRONT
-#define GENFRONT( opname, chdir ) \
-\
-dim_t PASTEMAC0(opname) \
-      ( \
-              dim_t   i, \
-              dim_t   dim, \
-        const obj_t*  a, \
-        const obj_t*  b, \
-              bszid_t bszid, \
-        const cntx_t* cntx  \
-      ) \
-{ \
-	/* bli_*_determine_kc_f():
+	const num_t    dt    = bli_obj_exec_dt( a );
+	const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx );
+	      dim_t    b_alg = bli_blksz_get_def( dt, bsize );
+	      dim_t    b_max = bli_blksz_get_max( dt, bsize );
 
-	   We assume that this function is being called from an algorithm that
-	   is moving "forward" (ie: top to bottom, left to right, top-left
-	   to bottom-right). */ \
-\
-	/* bli_*_determine_kc_b():
+	bli_l3_adjust_kc( a, b, &b_alg, &b_max, cntx, cntl );
 
-	   We assume that this function is being called from an algorithm that
-	   is moving "backward" (ie: bottom to top, right to left, bottom-right
-	   to top-left). */ \
-\
-	/* Extract the execution datatype and use it to query the corresponding
-	   blocksize and blocksize maximum values from the blksz_t object. */ \
-	const num_t    dt    = bli_obj_exec_dt( a ); \
-	const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \
-	      dim_t    b_alg = bli_blksz_get_def( dt, bsize ); \
-	      dim_t    b_max = bli_blksz_get_max( dt, bsize ); \
-\
-	/* Nudge the default and maximum kc blocksizes up to the nearest
-	   multiple of MR if A is Hermitian or symmetric, or NR if B is
-	   Hermitian or symmetric. If neither case applies, then we leave
-	   the blocksizes unchanged. */ \
-	dim_t    mnr; \
-	if      ( bli_obj_root_is_herm_or_symm( a ) ) \
-	{ \
-		mnr   = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-		b_alg = bli_align_dim_to_mult( b_alg, mnr ); \
-		b_max = bli_align_dim_to_mult( b_max, mnr ); \
-	} \
-	else if ( bli_obj_root_is_herm_or_symm( b ) ) \
-	{ \
-		mnr   = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
-		b_alg = bli_align_dim_to_mult( b_alg, mnr ); \
-		b_max = bli_align_dim_to_mult( b_max, mnr ); \
-	} \
-\
-	/* Call the bli_determine_blocksize_[fb]_sub() helper routine defined
-	   in bli_blksz.c */ \
-	return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \
+	if ( direct == BLIS_FWD )
+		return bli_determine_blocksize_f_sub( i, dim, b_alg, b_max );
+	else
+		return bli_determine_blocksize_b_sub( i, dim, b_alg, b_max );
 }
 
-GENFRONT( gemm_determine_kc_f, f )
-GENFRONT( gemm_determine_kc_b, b )
-
-// -----------------------------------------------------------------------------
-
-#undef  GENFRONT
-#define GENFRONT( opname, chdir ) \
-\
-dim_t PASTEMAC0(opname) \
-      ( \
-              dim_t   i, \
-              dim_t   dim, \
-        const obj_t*  a, \
-        const obj_t*  b, \
-              bszid_t bszid, \
-        const cntx_t* cntx  \
-      ) \
-{ \
-	/* bli_*_determine_kc_f():
-
-	   We assume that this function is being called from an algorithm that
-	   is moving "forward" (ie: top to bottom, left to right, top-left
-	   to bottom-right). */ \
-\
-	/* bli_*_determine_kc_b():
-
-	   We assume that this function is being called from an algorithm that
-	   is moving "backward" (ie: bottom to top, right to left, bottom-right
-	   to top-left). */ \
-\
-	/* Extract the execution datatype and use it to query the corresponding
-	   blocksize and blocksize maximum values from the blksz_t object. */ \
-	const num_t    dt    = bli_obj_exec_dt( a ); \
-	const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \
-	const dim_t    b_alg = bli_blksz_get_def( dt, bsize ); \
-	const dim_t    b_max = bli_blksz_get_max( dt, bsize ); \
-\
-	/* Notice that for gemmt, we do not need to perform any special handling
-	   for the default and maximum kc blocksizes vis-a-vis MR or NR. */ \
-\
-	/* Call the bli_determine_blocksize_[fb]_sub() helper routine defined
-	   in bli_blksz.c */ \
-	return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \
-}
-
-GENFRONT( gemmt_determine_kc_f, f )
-GENFRONT( gemmt_determine_kc_b, b )
-
-// -----------------------------------------------------------------------------
-
-#undef  GENFRONT
-#define GENFRONT( opname, chdir ) \
-\
-dim_t PASTEMAC0(opname) \
-      ( \
-              dim_t   i, \
-              dim_t   dim, \
-        const obj_t*  a, \
-        const obj_t*  b, \
-              bszid_t bszid, \
-        const cntx_t* cntx  \
-      ) \
-{ \
-	/* bli_*_determine_kc_f():
-
-	   We assume that this function is being called from an algorithm that
-	   is moving "forward" (ie: top to bottom, left to right, top-left
-	   to bottom-right). */ \
-\
-	/* bli_*_determine_kc_b():
-
-	   We assume that this function is being called from an algorithm that
-	   is moving "backward" (ie: bottom to top, right to left, bottom-right
-	   to top-left). */ \
-\
-	/* Extract the execution datatype and use it to query the corresponding
-	   blocksize and blocksize maximum values from the blksz_t object. */ \
-	const num_t    dt    = bli_obj_exec_dt( a ); \
-	const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \
-	      dim_t    b_alg = bli_blksz_get_def( dt, bsize ); \
-	      dim_t    b_max = bli_blksz_get_max( dt, bsize ); \
-\
-	/* Nudge the default and maximum kc blocksizes up to the nearest
-	   multiple of MR if the triangular matrix is on the left, or NR
-	   if the triangular matrix is one the right. */ \
-	dim_t mnr; \
-	if ( bli_obj_root_is_triangular( a ) ) \
-		mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	else \
-		mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
-\
-	b_alg = bli_align_dim_to_mult( b_alg, mnr ); \
-	b_max = bli_align_dim_to_mult( b_max, mnr ); \
-\
-	/* Call the bli_determine_blocksize_[fb]_sub() helper routine defined
-	   in bli_blksz.c */ \
-	return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \
-}
-
-GENFRONT( trmm_determine_kc_f, f )
-GENFRONT( trmm_determine_kc_b, b )
-
-// -----------------------------------------------------------------------------
-
-#undef  GENFRONT
-#define GENFRONT( opname, chdir ) \
-\
-dim_t PASTEMAC0(opname) \
-      ( \
-              dim_t   i, \
-              dim_t   dim, \
-        const obj_t*  a, \
-        const obj_t*  b, \
-              bszid_t bszid, \
-        const cntx_t* cntx  \
-      ) \
-{ \
-	/* bli_*_determine_kc_f():
-
-	   We assume that this function is being called from an algorithm that
-	   is moving "forward" (ie: top to bottom, left to right, top-left
-	   to bottom-right). */ \
-\
-	/* bli_*_determine_kc_b():
-
-	   We assume that this function is being called from an algorithm that
-	   is moving "backward" (ie: bottom to top, right to left, bottom-right
-	   to top-left). */ \
-\
-	/* Extract the execution datatype and use it to query the corresponding
-	   blocksize and blocksize maximum values from the blksz_t object. */ \
-	const num_t    dt    = bli_obj_exec_dt( a ); \
-	const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \
-	      dim_t    b_alg = bli_blksz_get_def( dt, bsize ); \
-	      dim_t    b_max = bli_blksz_get_max( dt, bsize ); \
-\
-	/* Nudge the default and maximum kc blocksizes up to the nearest
-	   multiple of MR. We always use MR (rather than sometimes using NR)
-	   because even when the triangle is on the right, packing of that
-	   matrix uses MR, since only left-side trsm micro-kernels are
-	   supported. */ \
-	const dim_t mnr   = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	            b_alg = bli_align_dim_to_mult( b_alg, mnr ); \
-	            b_max = bli_align_dim_to_mult( b_max, mnr ); \
-\
-	/* Call the bli_determine_blocksize_[fb]_sub() helper routine defined
-	   in bli_blksz.c */ \
-	return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \
-}
-
-GENFRONT( trsm_determine_kc_f, f )
-GENFRONT( trsm_determine_kc_b, b )
-
diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h
index 1ec889e03..843d5f241 100644
--- a/frame/3/bli_l3_blocksize.h
+++ b/frame/3/bli_l3_blocksize.h
@@ -32,6 +32,16 @@
 
 */
 
+void bli_l3_adjust_kc
+      (
+        const obj_t*  a,
+        const obj_t*  b,
+              dim_t*  b_alg,
+              dim_t*  b_max,
+        const cntx_t* cntx,
+        const cntl_t* cntl
+      );
+
 dim_t bli_l3_determine_kc
       (
               dir_t   direct,
@@ -43,50 +53,3 @@ dim_t bli_l3_determine_kc
         const cntx_t* cntx,
         const cntl_t* cntl
       );
-
-
-#undef  GENPROT
-#define GENPROT( opname ) \
-\
-dim_t PASTEMAC0(opname) \
-      ( \
-               dir_t   direct, \
-               dim_t   i, \
-               dim_t   dim, \
-         const obj_t*  a, \
-         const obj_t*  b, \
-               bszid_t bszid, \
-         const cntx_t* cntx  \
-      );
-
-GENPROT( gemm_determine_kc )
-GENPROT( gemmt_determine_kc )
-GENPROT( trmm_determine_kc )
-GENPROT( trsm_determine_kc )
-
-
-#undef  GENPROT
-#define GENPROT( opname ) \
-\
-dim_t PASTEMAC0(opname) \
-      ( \
-               dim_t   i, \
-               dim_t   dim, \
-         const obj_t*  a, \
-         const obj_t*  b, \
-               bszid_t bszid, \
-         const cntx_t* cntx  \
-      );
-
-GENPROT( gemm_determine_kc_f )
-GENPROT( gemm_determine_kc_b )
-
-GENPROT( gemmt_determine_kc_f )
-GENPROT( gemmt_determine_kc_b )
-
-GENPROT( trmm_determine_kc_f )
-GENPROT( trmm_determine_kc_b )
-
-GENPROT( trsm_determine_kc_f )
-GENPROT( trsm_determine_kc_b )
-
diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c
index d7fd9649e..27d140143 100644
--- a/frame/3/bli_l3_cntl.c
+++ b/frame/3/bli_l3_cntl.c
@@ -44,8 +44,8 @@ void bli_l3_cntl_create_if
        const obj_t*   a,
        const obj_t*   b,
        const obj_t*   c,
-             rntm_t*  rntm,
-             cntl_t*  cntl_orig,
+             pool_t*  pool,
+       const cntl_t*  cntl_orig,
              cntl_t** cntl_use
      )
 {
@@ -59,7 +59,7 @@ void bli_l3_cntl_create_if
 		{
 			*cntl_use = bli_gemm_cntl_create
 			(
-			  rntm,
+			  pool,
 			  family,
 			  schema_a,
 			  schema_b,
@@ -70,12 +70,18 @@ void bli_l3_cntl_create_if
 		{
 			side_t side;
 
+			// NOTE: We no longer ever use right-sided trsm, and therefore this
+			// function will only ever get called with side = BLIS_LEFT, which
+			// means that in the future, we can remove the a, b, and c operands
+			// from the function signature. (This assumes that the call to
+			// bli_obj_ker_fn( c ) is replaced in some future reorganization
+			// that moves the .ker_fn argument from obj_t to, say, the rntm_t.)
 			if ( bli_obj_is_triangular( a ) ) side = BLIS_LEFT;
 			else                              side = BLIS_RIGHT;
 
 			*cntl_use = bli_trsm_cntl_create
 			(
-			  rntm,
+			  pool,
 			  side,
 			  schema_a,
 			  schema_b,
@@ -88,7 +94,7 @@ void bli_l3_cntl_create_if
 		// If the user provided a control tree, create a copy and use it
 		// instead (so that threads can use its local tree as a place to
 		// cache things like pack mem_t entries).
-		*cntl_use = bli_cntl_copy( rntm, cntl_orig );
+		*cntl_use = bli_cntl_copy( pool, cntl_orig );
 
 		// Recursively set the family fields of the newly copied control tree
 		// nodes.
@@ -98,9 +104,8 @@ void bli_l3_cntl_create_if
 
 void bli_l3_cntl_free
      (
-       rntm_t*    rntm,
-       cntl_t*    cntl_use,
-       thrinfo_t* thread
+       pool_t* pool,
+       cntl_t* cntl_use
      )
 {
 	// NOTE: We don't actually need to call separate _cntl_free() functions
@@ -114,11 +119,11 @@ void bli_l3_cntl_free
 	     family == BLIS_GEMMT ||
 	     family == BLIS_TRMM )
 	{
-		bli_gemm_cntl_free( rntm, cntl_use, thread );
+		bli_gemm_cntl_free( pool, cntl_use );
 	}
 	else // if ( family == BLIS_TRSM )
 	{
-		bli_trsm_cntl_free( rntm, cntl_use, thread );
+		bli_trsm_cntl_free( pool, cntl_use );
 	}
 }
 
diff --git a/frame/3/bli_l3_cntl.h b/frame/3/bli_l3_cntl.h
index eb4321ecd..68e837663 100644
--- a/frame/3/bli_l3_cntl.h
+++ b/frame/3/bli_l3_cntl.h
@@ -46,15 +46,14 @@ void bli_l3_cntl_create_if
        const obj_t*   a,
        const obj_t*   b,
        const obj_t*   c,
-             rntm_t*  rntm,
-             cntl_t*  cntl_orig,
+             pool_t*  pool,
+       const cntl_t*  cntl_orig,
              cntl_t** cntl_use
      );
 
-void bli_l3_cntl_free
+BLIS_EXPORT_BLIS void bli_l3_cntl_free
      (
-       rntm_t*    rntm,
-       cntl_t*    cntl_use,
-       thrinfo_t* thread
+       pool_t* pool,
+       cntl_t* cntl_use
      );
 
diff --git a/frame/3/bli_l3_decor.c b/frame/3/bli_l3_decor.c
new file mode 100644
index 000000000..e482d37a1
--- /dev/null
+++ b/frame/3/bli_l3_decor.c
@@ -0,0 +1,298 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+struct l3_decor_params_s // Arguments bundled for each launched level-3 thread.
+{
+	      l3int_ft func;   // function each thread invokes
+	      opid_t   family; // operation family used to build the control tree
+	const obj_t*   alpha;  // forwarded unchanged to func
+	const obj_t*   a;      // aliased per thread before use
+	const obj_t*   b;      // aliased per thread before use
+	const obj_t*   beta;   // forwarded unchanged to func
+	const obj_t*   c;      // aliased per thread before use
+	const cntx_t*  cntx;   // forwarded unchanged to func
+	      rntm_t*  rntm;   // the decorator's local rntm_t copy
+	      array_t* array;  // array_t checked out from the small block allocator
+};
+typedef struct l3_decor_params_s l3_decor_params_t;
+
+// Thread entry point for conventional level-3 operations.
+//   gl_comm   - communicator handed to the thread check and to thrinfo creation.
+//   tid       - id of the calling thread; also selects its pool within the array_t.
+//   data_void - points to the l3_decor_params_t set up by bli_l3_thread_decorator().
+static void bli_l3_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, const void* data_void )
+{
+	const l3_decor_params_t* params = data_void;
+
+	// Check the thread team first, before any per-thread state is built.
+	bli_l3_thread_decorator_thread_check( gl_comm, params->rntm );
+
+	// Work on thread-local aliases of A, B, and C rather than on the caller's
+	// objects. These aliases are what gets passed down the algorithmic
+	// function stack, so a thread may change an object's properties without
+	// affecting the objects seen by other threads.
+	obj_t a_loc, b_loc, c_loc;
+	bli_obj_alias_to( params->a, &a_loc );
+	bli_obj_alias_to( params->b, &b_loc );
+	bli_obj_alias_to( params->c, &c_loc );
+
+	// Part of a hack supporting mixed domain in bli_gemm_front(): a
+	// non-standard pack schema for A and B is sometimes transmitted through
+	// the schema field of the obj_t's instead of through function
+	// parameters. Read both values, then immediately reset the fields to the
+	// value expected for unpacked objects.
+	const pack_t schema_a = bli_obj_pack_schema( &a_loc );
+	const pack_t schema_b = bli_obj_pack_schema( &b_loc );
+	bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_loc );
+	bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_loc );
+
+	// Create a default control tree for the operation, if needed, drawing on
+	// the pool that tid selects from the array_t.
+	pool_t* sba_pool = bli_apool_array_elem( tid, params->array );
+	cntl_t* cntl_use;
+	bli_l3_cntl_create_if
+	(
+	  params->family, schema_a, schema_b,
+	  &a_loc, &b_loc, &c_loc,
+	  sba_pool, NULL, &cntl_use
+	);
+
+	// Create the root node of this thread's thrinfo_t structure. The root
+	// node is the *parent* of the node that corresponds to the first control
+	// tree node.
+	thrinfo_t* thread = bli_l3_thrinfo_create
+	(
+	  tid, gl_comm, params->array, params->rntm, cntl_use
+	);
+
+	// Run the level-3 function on the thread-local aliases.
+	params->func
+	(
+	  params->alpha,
+	  &a_loc,
+	  &b_loc,
+	  params->beta,
+	  &c_loc,
+	  params->cntx,
+	  cntl_use,
+	  thread
+	);
+
+	// Tear down in this order: the thread's local control tree first,
+	// then the thread's thrinfo_t structure.
+	bli_l3_cntl_free( sba_pool, cntl_use );
+	bli_thrinfo_free( thread );
+}
+
+// Launches func on nt threads through bli_thread_launch(). The caller's rntm
+// is only read; every adjustment made below is applied to the copy rntm_l.
+void bli_l3_thread_decorator
+     (
+             l3int_ft func,
+             opid_t   family,
+       const obj_t*   alpha,
+       const obj_t*   a,
+       const obj_t*   b,
+       const obj_t*   beta,
+       const obj_t*   c,
+       const cntx_t*  cntx,
+       const rntm_t*  rntm
+     )
+{
+	rntm_t rntm_l = *rntm;
+
+	// Query the threading implementation and the number of threads requested.
+	timpl_t ti = bli_rntm_thread_impl( &rntm_l );
+	dim_t   nt = bli_rntm_num_threads( &rntm_l );
+
+#if 0
+	printf( "(pre-opt) application requested rntm.thread_impl = %s\n",
+	        ( ti == BLIS_SINGLE ? "single" :
+	        ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
+#endif
+
+	if ( bli_error_checking_is_enabled() )
+		bli_l3_thread_decorator_check( &rntm_l );
+
+#ifdef BLIS_ENABLE_NT1_VIA_SINGLE
+	if ( nt == 1 )
+	{
+		// An optimization. If the caller requests only one thread, force
+		// the sequential level-3 thread decorator even if that means
+		// overriding the caller's preferred threading implementation (as
+		// communicated via the rntm_t).
+		ti = BLIS_SINGLE;
+		bli_rntm_set_thread_impl( BLIS_SINGLE, &rntm_l );
+	}
+#endif
+
+	if ( 1 < nt && ti == BLIS_SINGLE )
+	{
+		// Here, we resolve conflicting information. The caller requested
+		// a sequential threading implementation, but also requested more
+		// than one thread. Here, we choose to favor the requested threading
+		// implementation over the number of threads, and so reset all
+		// parallelism parameters to 1.
+		nt = 1;
+		bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l );
+		bli_rntm_set_num_threads_only( 1, &rntm_l );
+	}
+
+#if 0
+	printf( "(post-opt) moving forward with rntm.thread_impl  = %s\n",
+	        ( ti == BLIS_SINGLE ? "single" :
+	        ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
+#endif
+
+	// Check out an array_t from the small block allocator. This is done
+	// with an internal lock to ensure only one application thread accesses
+	// the sba at a time. bli_sba_checkout_array() will also automatically
+	// resize the array_t, if necessary.
+	array_t* array = bli_sba_checkout_array( nt );
+
+	l3_decor_params_t params;
+	params.func     = func;
+	params.family   = family;
+	params.alpha    = alpha;
+	params.a        = a;
+	params.b        = b;
+	params.beta     = beta;
+	params.c        = c;
+	params.cntx     = cntx;
+	params.rntm     = &rntm_l;
+	params.array    = array;
+
+	// Launch the threads using the threading implementation specified by ti,
+	// and use bli_l3_thread_decorator_entry() as their entry points. The
+	// params struct will be passed along to each thread.
+	bli_thread_launch( ti, nt, bli_l3_thread_decorator_entry, &params );
+
+	// Check the array_t back into the small block allocator. Similar to the
+	// check-out, this is done using a lock embedded within the sba to ensure
+	// mutual exclusion.
+	bli_sba_checkin_array( array );
+}
+
+void bli_l3_thread_decorator_check
+     (
+       const rntm_t* rntm
+     )
+{
+	// Abort with a diagnostic if the rntm_t requests a threading
+	// implementation that this build of BLIS was not configured with.
+	const timpl_t ti = bli_rntm_thread_impl( rntm );
+	int unavailable = 0;
+
+#ifndef BLIS_ENABLE_OPENMP
+	if ( ti == BLIS_OPENMP ) unavailable = 1;
+#endif
+#ifndef BLIS_ENABLE_PTHREADS
+	if ( ti == BLIS_POSIX ) unavailable = 1;
+#endif
+
+	if ( unavailable )
+	{
+		// Only BLIS_OPENMP and BLIS_POSIX can reach this point.
+		const char* ti_str = ( ti == BLIS_OPENMP ? "openmp" : "pthreads" );
+
+		fprintf( stderr, "\n" );
+		fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", ti_str );
+		fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", ti_str );
+		fprintf( stderr, "libblis: %s: line %d\n", __FILE__, ( int )__LINE__ );
+		bli_abort();
+	}
+}
+
+// Reconcile the OpenMP team that actually executes this call with the thread
+// count recorded in gl_comm. Compiles to an empty function unless
+// BLIS_ENABLE_OPENMP is defined, and returns at once for non-OpenMP comms.
+void bli_l3_thread_decorator_thread_check
+     (
+       thrcomm_t* gl_comm,
+       rntm_t*    rntm
+     )
+{
+#ifdef BLIS_ENABLE_OPENMP
+
+	if ( bli_thrcomm_thread_impl( gl_comm ) != BLIS_OPENMP)
+		return;
+
+	dim_t n_threads_real = omp_get_num_threads();
+	dim_t n_threads      = bli_thrcomm_num_threads( gl_comm );
+	dim_t tid            = omp_get_thread_num();
+
+	// The OpenMP team executing this region may differ in size from the
+	// number of threads requested of BLIS. This can happen when, e.g.:
+	// - the application invokes BLIS from inside its own OpenMP parallel
+	//   region,
+	// - BLIS is configured for multithreading via OpenMP,
+	// - OMP_NUM_THREADS = t > 1,
+	// - the number of threads requested of BLIS (by any method) is p <= t,
+	// - OpenMP nesting is disabled.
+	// The application then spawns t threads, each of which calls (for
+	// example) gemm, and each gemm tries to spawn p threads via OpenMP.
+	// Because nesting is disabled and t >= p threads already exist, the
+	// OpenMP implementation spawns no additional threads for any gemm, so
+	// each call runs with a team of one.
+	if ( n_threads_real != n_threads )
+	{
+		// If the number of threads active in the current region is not
+		// equal to the number requested of BLIS, we then only continue
+		// if the number of threads in the current region is 1. If, for
+		// example, BLIS requested 4 threads but only got 3, then we
+		// abort().
+		if ( n_threads_real != 1 )
+		{
+			bli_print_msg( "A different number of threads was "
+			               "created than was requested.",
+			               __FILE__, __LINE__ );
+			bli_abort();
+		}
+
+		if ( tid == 0 ) // thread 0 resets the comm and the rntm_t to one thread
+		{
+			bli_thrcomm_init( BLIS_OPENMP, 1, gl_comm );
+			bli_rntm_set_num_threads_only( 1, rntm );
+			bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );
+		}
+
+		// Synchronize all threads and continue.
+		_Pragma( "omp barrier" )
+	}
+
+#endif
+}
+
diff --git a/frame/thread/bli_l3_decor.h b/frame/3/bli_l3_decor.h
similarity index 78%
rename from frame/thread/bli_l3_decor.h
rename to frame/3/bli_l3_decor.h
index 087eda874..e00b8ed49 100644
--- a/frame/thread/bli_l3_decor.h
+++ b/frame/3/bli_l3_decor.h
@@ -45,26 +45,10 @@ typedef void (*l3int_ft)
        const obj_t*     beta,
        const obj_t*     c,
        const cntx_t*    cntx,
-             rntm_t*    rntm,
-             cntl_t*    cntl,
+       const cntl_t*    cntl,
              thrinfo_t* thread
      );
 
-// Level-3 thread decorator function type.
-typedef void (*l3_decor_ft)
-     (
-             l3int_ft func,
-             opid_t   family,
-       const obj_t*   alpha,
-       const obj_t*   a,
-       const obj_t*   b,
-       const obj_t*   beta,
-       const obj_t*   c,
-       const cntx_t*  cntx,
-             rntm_t*  rntm,
-             cntl_t*  cntl
-     );
-
 // Level-3 thread decorator prototype.
 void bli_l3_thread_decorator
      (
@@ -76,20 +60,19 @@ void bli_l3_thread_decorator
        const obj_t*   beta,
        const obj_t*   c,
        const cntx_t*  cntx,
-             rntm_t*  rntm,
-             cntl_t*  cntl
+       const rntm_t*  rntm
      );
 
 void bli_l3_thread_decorator_check
      (
-       rntm_t* rntm
+       const rntm_t* rntm
      );
 
-// Include definitions specific to the method of multithreading for the
-// conventional code path.
-#include "bli_l3_decor_single.h"
-#include "bli_l3_decor_openmp.h"
-#include "bli_l3_decor_pthreads.h"
+void bli_l3_thread_decorator_thread_check
+     (
+       thrcomm_t* gl_comm,
+       rntm_t*    rntm
+     );
 
 #endif
 
diff --git a/frame/3/bli_l3_int.c b/frame/3/bli_l3_int.c
index b9d389839..70e6be3a9 100644
--- a/frame/3/bli_l3_int.c
+++ b/frame/3/bli_l3_int.c
@@ -42,8 +42,7 @@ void bli_l3_int
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
+       const cntl_t* cntl,
              thrinfo_t* thread
      )
 {
@@ -68,9 +67,9 @@ void bli_l3_int
 	if ( bli_obj_has_zero_dim( a ) ||
 	     bli_obj_has_zero_dim( b ) )
 	{
-		if ( bli_thread_am_ochief( thread ) )
+		if ( bli_thrinfo_am_chief( thread ) )
 			bli_scalm( beta, c );
-		bli_thread_barrier( rntm, thread );
+		bli_thrinfo_barrier( thread );
 		return;
 	}
 
@@ -82,9 +81,9 @@ void bli_l3_int
 		// This should never execute.
 		bli_abort();
 
-		if ( bli_thread_am_ochief( thread ) )
+		if ( bli_thrinfo_am_chief( thread ) )
 			bli_scalm( beta, c );
-		bli_thread_barrier( rntm, thread );
+		bli_thrinfo_barrier( thread );
 		return;
 	}
 
@@ -130,9 +129,6 @@ void bli_l3_int
 	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
 		bli_obj_scalar_apply_scalar( beta, &c_local );
 
-	// Create the next node in the thrinfo_t structure.
-	bli_thrinfo_grow( rntm, cntl, thread );
-
 	// Extract the function pointer from the current control tree node.
 	l3_var_oft f = bli_cntl_var_func( cntl );
 
@@ -143,7 +139,6 @@ void bli_l3_int
 	  &b_local,
 	  &c_local,
 	  cntx,
-	  rntm,
 	  cntl,
 	  thread
 	);
diff --git a/frame/3/bli_l3_int.h b/frame/3/bli_l3_int.h
index 65485206d..8364d91e4 100644
--- a/frame/3/bli_l3_int.h
+++ b/frame/3/bli_l3_int.h
@@ -40,8 +40,7 @@ void bli_l3_int
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
+       const cntl_t* cntl,
              thrinfo_t* thread
      );
 
diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c
index 16e5f15de..76234525d 100644
--- a/frame/3/bli_l3_oapi_ex.c
+++ b/frame/3/bli_l3_oapi_ex.c
@@ -50,7 +50,7 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF)
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -90,8 +90,8 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF)
 	// Initialize a local runtime with global settings if necessary. Note
 	// that in the case that a runtime is passed in, we make a local copy.
 	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
+	else                { rntm_l = *rntm;                       }
 
 	// Default to using native execution.
 	num_t dt = bli_obj_dt( c );
@@ -122,7 +122,7 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF)
 		bli_gemm_check( alpha, a, b, beta, c, cntx );
 
 	// Invoke the operation's front-end and request the default control tree.
-	bli_gemm_front( alpha, a, b, beta, c, cntx, rntm, NULL );
+	bli_gemm_front( alpha, a, b, beta, c, cntx, &rntm_l );
 }
 
 #endif
@@ -136,7 +136,7 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -157,8 +157,8 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)
 	// Initialize a local runtime with global settings if necessary. Note
 	// that in the case that a runtime is passed in, we make a local copy.
 	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
+	else                { rntm_l = *rntm;                       }
 
 	// Default to using native execution.
 	num_t dt = bli_obj_dt( c );
@@ -186,7 +186,7 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)
 		bli_gemmt_check( alpha, a, b, beta, c, cntx );
 
 	// Invoke the operation's front-end and request the default control tree.
-	bli_gemmt_front( alpha, a, b, beta, c, cntx, rntm, NULL );
+	bli_gemmt_front( alpha, a, b, beta, c, cntx, &rntm_l );
 }
 
 
@@ -198,7 +198,7 @@ void PASTEMAC(her2k,BLIS_OAPI_EX_SUF)
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -244,7 +244,7 @@ void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF)
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -277,7 +277,7 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF)
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -285,8 +285,8 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF)
 	// Initialize a local runtime with global settings if necessary. Note
 	// that in the case that a runtime is passed in, we make a local copy.
 	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
+	else                { rntm_l = *rntm;                       }
 
 	// Default to using native execution.
 	num_t dt = bli_obj_dt( c );
@@ -314,7 +314,7 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF)
 		bli_hemm_check( side, alpha, a, b, beta, c, cntx );
 
 	// Invoke the operation's front-end and request the default control tree.
-	bli_hemm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL );
+	bli_hemm_front( side, alpha, a, b, beta, c, cntx, &rntm_l );
 }
 
 
@@ -327,7 +327,7 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF)
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -335,8 +335,8 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF)
 	// Initialize a local runtime with global settings if necessary. Note
 	// that in the case that a runtime is passed in, we make a local copy.
 	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
+	else                { rntm_l = *rntm;                       }
 
 	// Default to using native execution.
 	num_t dt = bli_obj_dt( c );
@@ -364,7 +364,7 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF)
 		bli_symm_check( side, alpha, a, b, beta, c, cntx );
 
 	// Invoke the operation's front-end and request the default control tree.
-	bli_symm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL );
+	bli_symm_front( side, alpha, a, b, beta, c, cntx, &rntm_l );
 }
 
 
@@ -377,7 +377,7 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF)
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -385,8 +385,8 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF)
 	// Initialize a local runtime with global settings if necessary. Note
 	// that in the case that a runtime is passed in, we make a local copy.
 	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
+	else                { rntm_l = *rntm;                       }
 
 	// Default to using native execution.
 	num_t dt = bli_obj_dt( c );
@@ -414,7 +414,7 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF)
 		bli_trmm3_check( side, alpha, a, b, beta, c, cntx );
 
 	// Invoke the operation's front-end and request the default control tree.
-	bli_trmm3_front( side, alpha, a, b, beta, c, cntx, rntm, NULL );
+	bli_trmm3_front( side, alpha, a, b, beta, c, cntx, &rntm_l );
 }
 
 
@@ -425,7 +425,7 @@ void PASTEMAC(herk,BLIS_OAPI_EX_SUF)
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -459,7 +459,7 @@ void PASTEMAC(syrk,BLIS_OAPI_EX_SUF)
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -484,7 +484,7 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF)
        const obj_t*  a,
        const obj_t*  b,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -492,8 +492,8 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF)
 	// Initialize a local runtime with global settings if necessary. Note
 	// that in the case that a runtime is passed in, we make a local copy.
 	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
+	else                { rntm_l = *rntm;                       }
 
 	// Default to using native execution.
 	num_t dt = bli_obj_dt( b );
@@ -520,7 +520,7 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF)
 		bli_trmm_check( side, alpha, a, b, cntx );
 
 	// Invoke the operation's front-end and request the default control tree.
-	bli_trmm_front( side, alpha, a, b, cntx, rntm, NULL );
+	bli_trmm_front( side, alpha, a, b, cntx, &rntm_l );
 }
 
 
@@ -531,7 +531,7 @@ void PASTEMAC(trsm,BLIS_OAPI_EX_SUF)
        const obj_t*  a,
        const obj_t*  b,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -539,8 +539,8 @@ void PASTEMAC(trsm,BLIS_OAPI_EX_SUF)
 	// Initialize a local runtime with global settings if necessary. Note
 	// that in the case that a runtime is passed in, we make a local copy.
 	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
+	else                { rntm_l = *rntm;                       }
 
 	// Default to using native execution.
 	num_t dt = bli_obj_dt( b );
@@ -567,5 +567,5 @@ void PASTEMAC(trsm,BLIS_OAPI_EX_SUF)
 		bli_trsm_check( side, alpha, a, b, cntx );
 
 	// Invoke the operation's front-end and request the default control tree.
-	bli_trsm_front( side, alpha, a, b, cntx, rntm, NULL );
+	bli_trsm_front( side, alpha, a, b, cntx, &rntm_l );
 }
diff --git a/frame/3/bli_l3_oapi_ex.h b/frame/3/bli_l3_oapi_ex.h
index 58091704b..dd7624d92 100644
--- a/frame/3/bli_l3_oapi_ex.h
+++ b/frame/3/bli_l3_oapi_ex.h
@@ -49,7 +49,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
        const obj_t*  beta, \
        const obj_t*  c, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      );
 
 GENPROT( gemm )
@@ -70,7 +70,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
        const obj_t*  beta, \
        const obj_t*  c, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      );
 
 GENPROT( hemm )
@@ -88,7 +88,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
        const obj_t*  beta, \
        const obj_t*  c, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      );
 
 GENPROT( herk )
@@ -105,7 +105,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
        const obj_t*  a, \
        const obj_t*  b, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      );
 
 GENPROT( trmm )
diff --git a/frame/3/bli_l3_oft.h b/frame/3/bli_l3_oft.h
index 997ade58e..67fa2c75d 100644
--- a/frame/3/bli_l3_oft.h
+++ b/frame/3/bli_l3_oft.h
@@ -54,7 +54,7 @@ typedef void (*PASTECH(opname,_oft)) \
   const obj_t*  beta, \
   const obj_t*  c, \
   const cntx_t* cntx, \
-        rntm_t* rntm  \
+  const rntm_t* rntm  \
 );
 
 GENTDEF( gemm )
@@ -77,7 +77,7 @@ typedef void (*PASTECH(opname,_oft)) \
   const obj_t*  beta, \
   const obj_t*  c, \
   const cntx_t* cntx, \
-        rntm_t* rntm  \
+  const rntm_t* rntm  \
 );
 
 GENTDEF( hemm )
@@ -97,7 +97,7 @@ typedef void (*PASTECH(opname,_oft)) \
   const obj_t*  beta, \
   const obj_t*  c, \
   const cntx_t* cntx, \
-        rntm_t* rntm  \
+  const rntm_t* rntm  \
 );
 
 GENTDEF( herk )
@@ -116,7 +116,7 @@ typedef void (*PASTECH(opname,_oft)) \
   const obj_t*  a, \
   const obj_t*  b, \
   const cntx_t* cntx, \
-        rntm_t* rntm  \
+  const rntm_t* rntm  \
 );
 
 GENTDEF( trmm )
diff --git a/frame/3/bli_l3_oft_var.h b/frame/3/bli_l3_oft_var.h
index ee529b115..b295b5812 100644
--- a/frame/3/bli_l3_oft_var.h
+++ b/frame/3/bli_l3_oft_var.h
@@ -49,8 +49,7 @@ typedef void (*PASTECH(opname,_var_oft)) \
   const obj_t*  b, \
   const obj_t*  c, \
   const cntx_t* cntx, \
-        rntm_t* rntm, \
-        cntl_t* cntl, \
+  const cntl_t* cntl, \
         thrinfo_t* thread  \
 );
 
diff --git a/frame/3/bli_l3_packab.c b/frame/3/bli_l3_packab.c
index 6f18169b2..65776d49f 100644
--- a/frame/3/bli_l3_packab.c
+++ b/frame/3/bli_l3_packab.c
@@ -40,8 +40,7 @@ void bli_l3_packa
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
+       const cntl_t* cntl,
              thrinfo_t* thread
      )
 {
@@ -60,7 +59,6 @@ void bli_l3_packa
 	  &a_local,
 	  &a_pack,
 	  cntx,
-	  rntm,
 	  cntl,
 	  thread
 	);
@@ -74,7 +72,6 @@ void bli_l3_packa
 	  &BLIS_ONE,
 	  c,
 	  cntx,
-	  rntm,
 	  bli_cntl_sub_node( cntl ),
 	  bli_thrinfo_sub_node( thread )
 	);
@@ -88,8 +85,7 @@ void bli_l3_packb
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
+       const cntl_t* cntl,
              thrinfo_t* thread
      )
 {
@@ -112,7 +108,6 @@ void bli_l3_packb
 	  &bt_local,
 	  &bt_pack,
 	  cntx,
-	  rntm,
 	  cntl,
 	  thread
 	);
@@ -129,7 +124,6 @@ void bli_l3_packb
 	  &BLIS_ONE,
 	  c,
 	  cntx,
-	  rntm,
 	  bli_cntl_sub_node( cntl ),
 	  bli_thrinfo_sub_node( thread )
 	);
diff --git a/frame/3/bli_l3_packab.h b/frame/3/bli_l3_packab.h
index f03b7f62c..e58a08e4b 100644
--- a/frame/3/bli_l3_packab.h
+++ b/frame/3/bli_l3_packab.h
@@ -38,8 +38,7 @@ void bli_l3_packa
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
+       const cntl_t* cntl,
              thrinfo_t* thread
      );
 
@@ -49,8 +48,7 @@ void bli_l3_packb
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
+       const cntl_t* cntl,
              thrinfo_t* thread
      );
 
diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c
index eedbd9ec5..57513ab5b 100644
--- a/frame/3/bli_l3_sup.c
+++ b/frame/3/bli_l3_sup.c
@@ -42,7 +42,7 @@ err_t bli_gemmsup
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      )
 {
 	// Return early if small matrix handling is disabled at configure-time.
@@ -89,8 +89,8 @@ err_t bli_gemmsup
 	// Initialize a local runtime with global settings if necessary. Note
 	// that in the case that a runtime is passed in, we make a local copy.
 	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
+	else                { rntm_l = *rntm;                       }
 
 #if 0
 const num_t dt = bli_obj_dt( c );
@@ -127,7 +127,7 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n",
 	  beta,
 	  c,
 	  cntx,
-	  rntm
+	  &rntm_l
 	);
 }
 
@@ -140,7 +140,7 @@ err_t bli_gemmtsup
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      )
 {
 	// Return early if small matrix handling is disabled at configure-time.
@@ -174,8 +174,8 @@ err_t bli_gemmtsup
 	// Initialize a local runtime with global settings if necessary. Note
 	// that in the case that a runtime is passed in, we make a local copy.
 	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
+	else                { rntm_l = *rntm;                       }
 
 	// We've now ruled out the possibility that the sup thresholds are
 	// unsatisfied.
@@ -196,7 +196,7 @@ err_t bli_gemmtsup
 	  beta,
 	  c,
 	  cntx,
-	  rntm
+	  &rntm_l
 	);
 }
 
diff --git a/frame/3/bli_l3_sup.h b/frame/3/bli_l3_sup.h
index 33b3f8ca7..77ff02d91 100644
--- a/frame/3/bli_l3_sup.h
+++ b/frame/3/bli_l3_sup.h
@@ -40,7 +40,7 @@ err_t bli_gemmsup
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      );
 
 err_t bli_gemmtsup
@@ -51,6 +51,6 @@ err_t bli_gemmtsup
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      );
 
diff --git a/frame/thread/bli_l3_decor.c b/frame/3/bli_l3_sup_decor.c
similarity index 59%
rename from frame/thread/bli_l3_decor.c
rename to frame/3/bli_l3_sup_decor.c
index 33fb834be..5f415ac50 100644
--- a/frame/thread/bli_l3_decor.c
+++ b/frame/3/bli_l3_sup_decor.c
@@ -34,36 +34,63 @@
 
 #include "blis.h"
 
-// Initialize a function pointer array containing function addresses for
-// each of the threading-specific level-3 thread decorators.
-
-static l3_decor_ft l3_decor_fpa[ BLIS_NUM_THREAD_IMPLS ] =
+struct l3_sup_decor_params_s
 {
-	[BLIS_SINGLE] = bli_l3_thread_decorator_single,
-	[BLIS_OPENMP] =
-#if   defined(BLIS_ENABLE_OPENMP)
-	                bli_l3_thread_decorator_openmp,
-#elif defined(BLIS_ENABLE_PTHREADS)
-	                NULL,
-#else
-	                NULL,
-#endif
-	[BLIS_POSIX]  =
-#if   defined(BLIS_ENABLE_PTHREADS)
-	                bli_l3_thread_decorator_pthreads,
-#elif defined(BLIS_ENABLE_OPENMP)
-	                NULL,
-#else
-	                NULL,
-#endif
+	      l3supint_ft func;
+	      opid_t      family;
+	const obj_t*      alpha;
+	const obj_t*      a;
+	const obj_t*      b;
+	const obj_t*      beta;
+	const obj_t*      c;
+	const cntx_t*     cntx;
+	      rntm_t*     rntm;
+	      array_t*    array;
 };
+typedef struct l3_sup_decor_params_s l3_sup_decor_params_t;
+
+static void bli_l3_sup_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, const void* data_void )
+{
+	const l3_sup_decor_params_t* data    = data_void;
+
+	const l3supint_ft            func    = data->func;
+	const opid_t                 family  = data->family;
+	const obj_t*                 alpha   = data->alpha;
+	const obj_t*                 a       = data->a;
+	const obj_t*                 b       = data->b;
+	const obj_t*                 beta    = data->beta;
+	const obj_t*                 c       = data->c;
+	const cntx_t*                cntx    = data->cntx;
+	      rntm_t*                rntm    = data->rntm;
+	      array_t*               array   = data->array;
+
+	( void )family;
+
+	bli_l3_thread_decorator_thread_check( gl_comm, rntm );
+
+	// Create the root node of the thread's thrinfo_t structure.
+	pool_t*    pool   = bli_apool_array_elem( tid, array );
+	thrinfo_t* thread = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, rntm );
+
+	func
+	(
+	  alpha,
+	  a,
+	  b,
+	  beta,
+	  c,
+	  cntx,
+	  rntm,
+	  thread
+	);
 
-// Define a dispatcher that chooses a threading-specific function from the
-// above function pointer array.
+	// Free the current thread's thrinfo_t structure.
+	bli_thrinfo_free( thread );
+}
 
-void bli_l3_thread_decorator
+err_t bli_l3_sup_thread_decorator
      (
-             l3int_ft func,
+             l3supint_ft func,
              opid_t   family,
        const obj_t*   alpha,
        const obj_t*   a,
@@ -71,15 +98,14 @@ void bli_l3_thread_decorator
        const obj_t*   beta,
        const obj_t*   c,
        const cntx_t*  cntx,
-             rntm_t*  rntm,
-             cntl_t*  cntl
+       const rntm_t*  rntm
      )
 {
-	rntm_t rntm_l;
+	rntm_t rntm_l = *rntm;
 
 	// Query the threading implementation and the number of threads requested.
-	timpl_t ti = bli_rntm_thread_impl( rntm );
-	dim_t   nt = bli_rntm_num_threads( rntm );
+	timpl_t ti = bli_rntm_thread_impl( &rntm_l );
+	dim_t   nt = bli_rntm_num_threads( &rntm_l );
 
 #if 0
 	printf( "(pre-opt) application requested rntm.thread_impl = %s\n",
@@ -88,7 +114,7 @@ void bli_l3_thread_decorator
 #endif
 
 	if ( bli_error_checking_is_enabled() )
-		bli_l3_thread_decorator_check( rntm );
+		bli_l3_thread_decorator_check( &rntm_l );
 
 #ifdef BLIS_ENABLE_NT1_VIA_SINGLE
 	if ( nt == 1 )
@@ -111,11 +137,9 @@ void bli_l3_thread_decorator
 		// than one thread. Here, we choose to favor the requested threading
 		// implementation over the number of threads, and so reset all
 		// parallelism parameters to 1.
-		rntm_l = *rntm;
 		nt = 1;
 		bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l );
 		bli_rntm_set_num_threads_only( 1, &rntm_l );
-		rntm = &rntm_l;
 	}
 
 #if 0
@@ -124,53 +148,28 @@ void bli_l3_thread_decorator
 	        ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
 #endif
 
-	// Use the timpl_t value to index into the corresponding function address
-	// from the function pointer array.
-	const l3_decor_ft fp = l3_decor_fpa[ ti ];
-
-	// Call the threading-specific decorator function.
-	fp
-	(
-	  func,
-	  family,
-	  alpha,
-	  a,
-	  b,
-	  beta,
-	  c,
-	  cntx,
-	  rntm,
-	  cntl
-	);
-}
-
-void bli_l3_thread_decorator_check
-     (
-       rntm_t* rntm
-     )
-{
-	//err_t e_val;
-
-	//e_val = bli_check_valid_thread_impl( bli_rntm_thread_impl( rntm ) );
-	//bli_check_error_code( e_val );
-
-	const timpl_t ti = bli_rntm_thread_impl( rntm );
-
-	if (
-#ifndef BLIS_ENABLE_OPENMP
-	    ti == BLIS_OPENMP ||
-#endif
-#ifndef BLIS_ENABLE_PTHREADS
-	    ti == BLIS_POSIX ||
-#endif
-	    FALSE
-	   )
-	{
-		fprintf( stderr, "\n" );
-		fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) );
-		fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) );
-		fprintf( stderr, "libblis: %s: line %d\n", __FILE__, ( int )__LINE__ );
-		bli_abort();
-	}
+	// Check out an array_t from the small block allocator. This is done
+	// with an internal lock to ensure only one application thread accesses
+	// the sba at a time. bli_sba_checkout_array() will also automatically
+	// resize the array_t, if necessary.
+	array_t* array = bli_sba_checkout_array( nt );
+
+	l3_sup_decor_params_t params;
+	params.func   = func;
+	params.family = family;
+	params.alpha  = alpha;
+	params.a      = a;
+	params.b      = b;
+	params.beta   = beta;
+	params.c      = c;
+	params.cntx   = cntx;
+	params.rntm   = &rntm_l;
+	params.array  = array;
+
+	bli_thread_launch( ti, nt, bli_l3_sup_thread_decorator_entry, &params );
+
+	bli_sba_checkin_array( array );
+
+	return BLIS_SUCCESS;
 }
 
diff --git a/frame/thread/bli_l3_sup_decor.h b/frame/3/bli_l3_sup_decor.h
similarity index 77%
rename from frame/thread/bli_l3_sup_decor.h
rename to frame/3/bli_l3_sup_decor.h
index a271920b4..c8f31a10f 100644
--- a/frame/thread/bli_l3_sup_decor.h
+++ b/frame/3/bli_l3_sup_decor.h
@@ -47,24 +47,10 @@ typedef err_t (*l3supint_ft)
        const obj_t*     beta,
        const obj_t*     c,
        const cntx_t*    cntx,
-             rntm_t*    rntm,
+       const rntm_t*    rntm,
              thrinfo_t* thread
      );
 
-// Level-3 sup thread decorator function type.
-typedef err_t (*l3_sup_decor_ft)
-     (
-             l3supint_ft func,
-             opid_t      family,
-       const obj_t*      alpha,
-       const obj_t*      a,
-       const obj_t*      b,
-       const obj_t*      beta,
-       const obj_t*      c,
-       const cntx_t*     cntx,
-             rntm_t*     rntm
-     );
-
 // Level-3 sup thread decorator prototype.
 err_t bli_l3_sup_thread_decorator
      (
@@ -76,19 +62,8 @@ err_t bli_l3_sup_thread_decorator
        const obj_t*      beta,
        const obj_t*      c,
        const cntx_t*     cntx,
-             rntm_t*     rntm
+       const rntm_t*     rntm
      );
 
-void bli_l3_sup_thread_decorator_check
-     (
-       rntm_t* rntm
-     );
-
-// Include definitions specific to the method of multithreading for the
-// sup code path.
-#include "bli_l3_sup_decor_single.h"
-#include "bli_l3_sup_decor_openmp.h"
-#include "bli_l3_sup_decor_pthreads.h"
-
 #endif
 
diff --git a/frame/3/bli_l3_sup_int.c b/frame/3/bli_l3_sup_int.c
index 3ff13bdb5..ffba1d661 100644
--- a/frame/3/bli_l3_sup_int.c
+++ b/frame/3/bli_l3_sup_int.c
@@ -42,7 +42,7 @@ err_t bli_gemmsup_int
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
+       const rntm_t* rntm,
              thrinfo_t* thread
      )
 {
@@ -137,15 +137,16 @@ err_t bli_gemmsup_int
 			// Update the ways of parallelism for the jc and ic loops, and then
 			// update the current thread's root thrinfo_t node according to the
 			// new ways of parallelism value for the jc loop.
-			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
-			bli_l3_sup_thrinfo_update_root( rntm, thread );
+			rntm_t rntm_l = *rntm;
+			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, &rntm_l );
+			bli_l3_sup_thrinfo_update( &rntm_l, &thread );
 		}
 
 
 		if ( use_bp )
 		{
 			#ifdef TRACEVAR
-			if ( bli_thread_am_ochief( thread ) )
+			if ( bli_thrinfo_am_chief( thread ) )
 			printf( "bli_l3_sup_int(): var2m primary\n" );
 			#endif
 			// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
@@ -156,7 +157,7 @@ err_t bli_gemmsup_int
 		else // use_pb
 		{
 			#ifdef TRACEVAR
-			if ( bli_thread_am_ochief( thread ) )
+			if ( bli_thrinfo_am_chief( thread ) )
 			printf( "bli_l3_sup_int(): var1n primary\n" );
 			#endif
 			// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
@@ -202,15 +203,16 @@ err_t bli_gemmsup_int
 			// Update the ways of parallelism for the jc and ic loops, and then
 			// update the current thread's root thrinfo_t node according to the
 			// new ways of parallelism value for the jc loop.
-			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
-			bli_l3_sup_thrinfo_update_root( rntm, thread );
+			rntm_t rntm_l = *rntm;
+			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, &rntm_l );
+			bli_l3_sup_thrinfo_update( &rntm_l, &thread );
 		}
 
 
 		if ( use_bp )
 		{
 			#ifdef TRACEVAR
-			if ( bli_thread_am_ochief( thread ) )
+			if ( bli_thrinfo_am_chief( thread ) )
 			printf( "bli_l3_sup_int(): var2m non-primary\n" );
 			#endif
 			// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
@@ -221,7 +223,7 @@ err_t bli_gemmsup_int
 		else // use_pb
 		{
 			#ifdef TRACEVAR
-			if ( bli_thread_am_ochief( thread ) )
+			if ( bli_thrinfo_am_chief( thread ) )
 			printf( "bli_l3_sup_int(): var1n non-primary\n" );
 			#endif
 			// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
@@ -246,7 +248,7 @@ err_t bli_gemmtsup_int
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
+       const rntm_t* rntm,
              thrinfo_t* thread
      )
 {
@@ -311,15 +313,16 @@ err_t bli_gemmtsup_int
 			// Update the ways of parallelism for the jc and ic loops, and then
 			// update the current thread's root thrinfo_t node according to the
 			// new ways of parallelism value for the jc loop.
-			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
-			bli_l3_sup_thrinfo_update_root( rntm, thread );
+			rntm_t rntm_l = *rntm;
+			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, &rntm_l );
+			bli_l3_sup_thrinfo_update( &rntm_l, &thread );
 		}
 
 
 		if ( use_bp )
 		{
 			#ifdef TRACEVAR
-			if ( bli_thread_am_ochief( thread ) )
+			if ( bli_thrinfo_am_chief( thread ) )
 			printf( "bli_l3_sup_int(): var2m primary\n" );
 			#endif
 			// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
@@ -332,7 +335,7 @@ err_t bli_gemmtsup_int
 		else // use_pb
 		{
 			#ifdef TRACEVAR
-			if ( bli_thread_am_ochief( thread ) )
+			if ( bli_thrinfo_am_chief( thread ) )
 			printf( "bli_l3_sup_int(): var1n primary\n" );
 			#endif
 			// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
@@ -380,15 +383,16 @@ err_t bli_gemmtsup_int
 			// Update the ways of parallelism for the jc and ic loops, and then
 			// update the current thread's root thrinfo_t node according to the
 			// new ways of parallelism value for the jc loop.
-			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
-			bli_l3_sup_thrinfo_update_root( rntm, thread );
+			rntm_t rntm_l = *rntm;
+			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, &rntm_l );
+			bli_l3_sup_thrinfo_update( &rntm_l, &thread );
 		}
 
 
 		if ( use_bp )
 		{
 			#ifdef TRACEVAR
-			if ( bli_thread_am_ochief( thread ) )
+			if ( bli_thrinfo_am_chief( thread ) )
 			printf( "bli_l3_sup_int(): var2m non-primary\n" );
 			#endif
 			// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
@@ -401,7 +405,7 @@ err_t bli_gemmtsup_int
 		else // use_pb
 		{
 			#ifdef TRACEVAR
-			if ( bli_thread_am_ochief( thread ) )
+			if ( bli_thrinfo_am_chief( thread ) )
 			printf( "bli_l3_sup_int(): var1n non-primary\n" );
 			#endif
 			// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
diff --git a/frame/3/bli_l3_sup_int.h b/frame/3/bli_l3_sup_int.h
index 195e3ca40..e76f21360 100644
--- a/frame/3/bli_l3_sup_int.h
+++ b/frame/3/bli_l3_sup_int.h
@@ -40,7 +40,7 @@ err_t bli_gemmsup_int
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
+       const rntm_t* rntm,
              thrinfo_t* thread
      );
 
@@ -52,6 +52,6 @@ err_t bli_gemmtsup_int
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
+       const rntm_t* rntm,
              thrinfo_t* thread
      );
diff --git a/frame/3/bli_l3_sup_oft.h b/frame/3/bli_l3_sup_oft.h
index ba60035b7..c36197201 100644
--- a/frame/3/bli_l3_sup_oft.h
+++ b/frame/3/bli_l3_sup_oft.h
@@ -53,7 +53,7 @@ typedef err_t (*PASTECH(opname,_oft)) \
   const obj_t*  beta, \
   const obj_t*  c, \
   const cntx_t* cntx, \
-        rntm_t* rntm  \
+  const rntm_t* rntm  \
 );
 
 GENTDEF( gemmsup )
diff --git a/frame/3/bli_l3_sup_packm.c b/frame/3/bli_l3_sup_packm.c
index 5ed7700dc..797335aeb 100644
--- a/frame/3/bli_l3_sup_packm.c
+++ b/frame/3/bli_l3_sup_packm.c
@@ -43,8 +43,6 @@ void bli_packm_sup_init_mem
        dim_t      m,
        dim_t      k,
        dim_t      mr,
-       rntm_t*    rntm,
-       mem_t*     mem,
        thrinfo_t* thread
      )
 {
@@ -54,6 +52,9 @@ void bli_packm_sup_init_mem
 	}
 	else // if ( will_pack == TRUE )
 	{
+		mem_t* mem = bli_thrinfo_mem( thread );
+		pba_t* pba = bli_thrinfo_pba( thread );
+
 		// NOTE: This "rounding up" of the last upanel is actually optional
 		// for the rrc/crc cases, but absolutely necessary for the other cases
 		// since we NEED that last micropanel to have the same ldim (cs_p) as
@@ -64,7 +65,7 @@ void bli_packm_sup_init_mem
 
 		// Barrier to make sure all threads are caught up and ready to begin
 		// the packm stage.
-		bli_thread_barrier( rntm, thread );
+		bli_thrinfo_barrier( thread );
 
 		// Compute the size of the memory block eneded.
 		siz_t size_needed = bli_dt_size( dt ) * m_pack * k_pack;
@@ -73,7 +74,7 @@ void bli_packm_sup_init_mem
 		// then we need to acquire a block from the pba.
 		if ( bli_mem_is_unalloc( mem ) )
 		{
-			if ( bli_thread_am_ochief( thread ) )
+			if ( bli_thrinfo_am_chief( thread ) )
 			{
 				// Acquire directly to the chief thread's mem_t that was
 				// passed in. It needs to be that mem_t struct, and not a
@@ -85,7 +86,7 @@ void bli_packm_sup_init_mem
 				// then again, I prefer to keep barriers to a minimum.)
 				bli_pba_acquire_m
 				(
-				  rntm,
+				  pba,
 				  size_needed,
 				  pack_buf_type,
 				  mem
@@ -94,13 +95,13 @@ void bli_packm_sup_init_mem
 
 			// Broadcast the address of the chief thread's passed-in mem_t
 			// to all threads.
-			mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem );
+			mem_t* mem_p = bli_thrinfo_broadcast( thread, mem );
 
 			// Non-chief threads: Copy the contents of the chief thread's
 			// passed-in mem_t to the passed-in mem_t for this thread. (The
 			// chief thread already has the mem_t, so it does not need to
 			// perform any copy.)
-			if ( !bli_thread_am_ochief( thread ) )
+			if ( !bli_thrinfo_am_chief( thread ) )
 			{
 				*mem = *mem_p;
 			}
@@ -119,7 +120,7 @@ void bli_packm_sup_init_mem
 
 			if ( mem_size < size_needed )
 			{
-				if ( bli_thread_am_ochief( thread ) )
+				if ( bli_thrinfo_am_chief( thread ) )
 				{
 					// The chief thread releases the existing block associated
 					// with the mem_t, and then re-acquires a new block, saving
@@ -129,12 +130,12 @@ void bli_packm_sup_init_mem
 					// (temporary) mem_t.
 					bli_pba_release
 					(
-					  rntm,
+					  pba,
 					  mem
 					);
 					bli_pba_acquire_m
 					(
-					  rntm,
+					  pba,
 					  size_needed,
 					  pack_buf_type,
 					  mem
@@ -143,13 +144,13 @@ void bli_packm_sup_init_mem
 
 				// Broadcast the address of the chief thread's passed-in mem_t
 				// to all threads.
-				mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem );
+				mem_t* mem_p = bli_thrinfo_broadcast( thread, mem );
 
 				// Non-chief threads: Copy the contents of the chief thread's
 				// passed-in mem_t to the passed-in mem_t for this thread. (The
 				// chief thread already has the mem_t, so it does not need to
 				// perform any copy.)
-				if ( !bli_thread_am_ochief( thread ) )
+				if ( !bli_thrinfo_am_chief( thread ) )
 				{
 					*mem = *mem_p;
 				}
@@ -166,8 +167,6 @@ void bli_packm_sup_init_mem
 void bli_packm_sup_finalize_mem
      (
        bool       did_pack,
-       rntm_t*    rntm,
-       mem_t*     mem,
        thrinfo_t* thread
      )
 {
@@ -178,8 +177,11 @@ void bli_packm_sup_finalize_mem
 	}
 	else // if ( did_pack == TRUE )
 	{
+		mem_t* mem = bli_thrinfo_mem( thread );
+		pba_t* pba = bli_thrinfo_pba( thread );
+
 		if ( thread != NULL )
-		if ( bli_thread_am_ochief( thread ) )
+		if ( bli_thrinfo_am_chief( thread ) )
 		{
 			// Check the mem_t entry provided by the caller. Only proceed if it
 			// is allocated, which it should be.
@@ -187,7 +189,7 @@ void bli_packm_sup_finalize_mem
 			{
 				bli_pba_release
 				(
-				  rntm,
+				  pba,
 				  mem
 				);
 			}
@@ -197,18 +199,18 @@ void bli_packm_sup_finalize_mem
 
 void bli_packm_sup_init
      (
-             bool    will_pack,
-             stor3_t stor_id,
-             pack_t* schema,
-             dim_t   m,
-             dim_t   k,
-             dim_t   mr,
-             dim_t*  m_max,
-             dim_t*  k_max,
-       const void*   x, inc_t  rs_x, inc_t  cs_x,
-             void**  p, inc_t* rs_p, inc_t* cs_p,
-                        dim_t* pd_p, inc_t* ps_p,
-             mem_t*  mem
+             bool       will_pack,
+             stor3_t    stor_id,
+             pack_t*    schema,
+             dim_t      m,
+             dim_t      k,
+             dim_t      mr,
+             dim_t*     m_max,
+             dim_t*     k_max,
+       const void*      x, inc_t  rs_x, inc_t  cs_x,
+             void**     p, inc_t* rs_p, inc_t* cs_p,
+                           dim_t* pd_p, inc_t* ps_p,
+             thrinfo_t* thread
      )
 {
 	// Inspect whether we are going to be packing matrix A.
@@ -277,7 +279,7 @@ void bli_packm_sup_init
 
 		// Set the buffer address provided by the caller to point to the
 		// memory associated with the mem_t entry acquired from the pba.
-		*p = bli_mem_buffer( mem );
+		*p = bli_mem_buffer( bli_thrinfo_mem( thread ) );
 	}
 }
 
@@ -334,8 +336,6 @@ void bli_packm_sup
              void**     p, inc_t* rs_p, inc_t* cs_p,
                            inc_t* ps_p,
        const cntx_t*    cntx,
-             rntm_t*    rntm,
-             mem_t*     mem,
              thrinfo_t* thread
      )
 {
@@ -351,8 +351,6 @@ void bli_packm_sup
 	  will_pack,
 	  pack_buf_type,
 	  dt, m_alloc, k_alloc, mr,
-      rntm,
-      mem,
 	  thread
 	);
 
@@ -369,7 +367,7 @@ void bli_packm_sup
 	  a, rs_a,  cs_a,
 	  p, rs_p,  cs_p,
 	     &pd_p, ps_p,
-      mem
+	  thread
 	);
 
 	// Inspect whether we are going to be packing matrix A.
@@ -422,7 +420,7 @@ void bli_packm_sup
 		}
 
 		// Barrier so that packing is done before computation.
-		bli_thread_barrier( rntm, thread );
+		bli_thrinfo_barrier( thread );
 	}
 }
 
diff --git a/frame/3/bli_l3_sup_packm.h b/frame/3/bli_l3_sup_packm.h
index a84d4e45c..032ba0afe 100644
--- a/frame/3/bli_l3_sup_packm.h
+++ b/frame/3/bli_l3_sup_packm.h
@@ -42,16 +42,12 @@ void bli_packm_sup_init_mem
        dim_t      m,
        dim_t      k,
        dim_t      mr,
-       rntm_t*    rntm,
-       mem_t*     mem,
        thrinfo_t* thread
      );
 
 void bli_packm_sup_finalize_mem
      (
        bool       did_pack,
-       rntm_t*    rntm,
-       mem_t*     mem,
        thrinfo_t* thread
      );
 
@@ -68,7 +64,7 @@ void bli_packm_sup_init
        const void*      x, inc_t  rs_x, inc_t  cs_x,
              void**     p, inc_t* rs_p, inc_t* cs_p,
                            dim_t* pd_p, inc_t* ps_p,
-             mem_t*     mem
+             thrinfo_t* thread
      );
 
 void bli_packm_sup
@@ -88,8 +84,6 @@ void bli_packm_sup
              void**     p, inc_t* rs_p, inc_t* cs_p,
                            inc_t* ps_p,
        const cntx_t*    cntx,
-             rntm_t*    rntm,
-             mem_t*     mem,
              thrinfo_t* thread
      );
 
diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c
index 71357cec4..e47f65aea 100644
--- a/frame/3/bli_l3_sup_packm_var.c
+++ b/frame/3/bli_l3_sup_packm_var.c
@@ -145,8 +145,8 @@ void PASTEMAC(ch,varname) \
 \
 	/* Query the number of threads and thread ids from the current thread's
 	   packm thrinfo_t node. */ \
-	const dim_t nt  = bli_thread_n_way( thread ); \
-	const dim_t tid = bli_thread_work_id( thread ); \
+	const dim_t nt  = bli_thrinfo_n_way( thread ); \
+	const dim_t tid = bli_thrinfo_work_id( thread ); \
 \
 	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
 	( void )nt; \
@@ -234,9 +234,9 @@ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel
 \
 /*
 if ( col_stored ) { \
-	if ( bli_thread_work_id( thread ) == 0 ) \
+	if ( bli_thrinfo_work_id( thread ) == 0 ) \
 	{ \
-	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
+	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thrinfo_work_id( thread ), c_use, p_use ); \
 	fflush( stdout ); \
 	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
 	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
@@ -244,10 +244,10 @@ if ( col_stored ) { \
 	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
 	fflush( stdout ); \
 	} \
-bli_thread_barrier( rntm, thread ); \
-	if ( bli_thread_work_id( thread ) == 1 ) \
+bli_thrinfo_barrier( thread ); \
+	if ( bli_thrinfo_work_id( thread ) == 1 ) \
 	{ \
-	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
+	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thrinfo_work_id( thread ), c_use, p_use ); \
 	fflush( stdout ); \
 	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
 	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
@@ -255,12 +255,12 @@ bli_thread_barrier( rntm, thread ); \
 	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
 	fflush( stdout ); \
 	} \
-bli_thread_barrier( rntm, thread ); \
+bli_thrinfo_barrier( thread ); \
 } \
 else { \
-	if ( bli_thread_work_id( thread ) == 0 ) \
+	if ( bli_thrinfo_work_id( thread ) == 0 ) \
 	{ \
-	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
+	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thrinfo_work_id( thread ), c_use, p_use ); \
 	fflush( stdout ); \
 	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
 	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
@@ -268,10 +268,10 @@ else { \
 	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
 	fflush( stdout ); \
 	} \
-bli_thread_barrier( rntm, thread ); \
-	if ( bli_thread_work_id( thread ) == 1 ) \
+bli_thrinfo_barrier( thread ); \
+	if ( bli_thrinfo_work_id( thread ) == 1 ) \
 	{ \
-	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
+	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thrinfo_work_id( thread ), c_use, p_use ); \
 	fflush( stdout ); \
 	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
 	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
@@ -279,7 +279,7 @@ bli_thread_barrier( rntm, thread ); \
 	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
 	fflush( stdout ); \
 	} \
-bli_thread_barrier( rntm, thread ); \
+bli_thrinfo_barrier( thread ); \
 } \
 */
 /*
@@ -388,8 +388,8 @@ void PASTEMAC(ch,varname) \
 \
 	/* Query the number of threads and thread ids from the current thread's
 	   packm thrinfo_t node. */ \
-	const dim_t nt  = bli_thread_n_way( thread ); \
-	const dim_t tid = bli_thread_work_id( thread ); \
+	const dim_t nt  = bli_thrinfo_n_way( thread ); \
+	const dim_t tid = bli_thrinfo_work_id( thread ); \
 \
 	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
 	( void )nt; \
diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c
index e4858621a..5d7ea345c 100644
--- a/frame/3/bli_l3_sup_var1n2m.c
+++ b/frame/3/bli_l3_sup_var1n2m.c
@@ -48,7 +48,7 @@ void bli_gemmsup_ref_var1n
        const obj_t*  c,
              stor3_t stor_id,
        const cntx_t* cntx,
-             rntm_t* rntm,
+       const rntm_t* rntm,
              thrinfo_t* thread
      )
 {
@@ -118,18 +118,18 @@ void bli_gemmsup_ref_var1n
 	// Note: This code explicitly performs the swaps that could be done
 	// implicitly in other BLIS contexts where a type-specific helper function
 	// was being called.
-    if ( bli_is_trans( trans ) )
-    {
-              bool   packtmp = packa; packa = packb; packb = packtmp;
-              conj_t conjtmp = conja; conja = conjb; conjb = conjtmp;
-              dim_t  len_tmp =     m;     m =     n;     n = len_tmp;
-        const void*  buf_tmp = buf_a; buf_a = buf_b; buf_b = buf_tmp;
-              inc_t  str_tmp =  rs_a;  rs_a =  cs_b;  cs_b = str_tmp;
-                     str_tmp =  cs_a;  cs_a =  rs_b;  rs_b = str_tmp;
-                     str_tmp =  rs_c;  rs_c =  cs_c;  cs_c = str_tmp;
-
-        stor_id = bli_stor3_trans( stor_id );
-    }
+	if ( bli_is_trans( trans ) )
+	{
+		      bool   packtmp = packa; packa = packb; packb = packtmp;
+		      conj_t conjtmp = conja; conja = conjb; conjb = conjtmp;
+		      dim_t  len_tmp =     m;     m =     n;     n = len_tmp;
+		const void*  buf_tmp = buf_a; buf_a = buf_b; buf_b = buf_tmp;
+		      inc_t  str_tmp =  rs_a;  rs_a =  cs_b;  cs_b = str_tmp;
+		             str_tmp =  cs_a;  cs_a =  rs_b;  rs_b = str_tmp;
+		             str_tmp =  rs_c;  rs_c =  cs_c;  cs_c = str_tmp;
+
+		stor_id = bli_stor3_trans( stor_id );
+	}
 
 	// This transposition of the stor3_t id value is inherent to variant 1.
 	// The reason: we assume that variant 2 is the "main" variant. The
@@ -230,55 +230,15 @@ void bli_gemmsup_ref_var1n
 
 	auxinfo_t aux;
 
-	mem_t mem_a = BLIS_MEM_INITIALIZER;
-	mem_t mem_b = BLIS_MEM_INITIALIZER;
-
-	// Define an array of bszid_t ids, which will act as our substitute for
-	// the cntl_t tree.
-	// NOTE: These bszid_t values, and their order, match that of the bp
-	// algorithm (variant 2) because they are not used to query actual
-	// blocksizes but rather query the ways of parallelism for the various
-	// loops. For example, the 2nd loop in variant 1 partitions in the m
-	// dimension (in increments of MR), but parallelizes that m dimension
-	// with BLIS_JR_NT.
-	// Note that this panel-block algorithm partitions an NC x KC submatrix
-	// of A to be packed in the 4th loop, and a KC x MC submatrix of B to be
-	// packed in the 3rd loop.
-	//                    5thloop  4thloop         packa  3rdloop         packb  2ndloop  1stloop  ukrloop
-	bszid_t bszids[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR };
-
 	// Determine whether we are using more than one thread.
 	const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 );
 
-	thrinfo_t* thread_jc = NULL;
-	thrinfo_t* thread_pc = NULL;
-	thrinfo_t* thread_pa = NULL;
-	thrinfo_t* thread_ic = NULL;
-	thrinfo_t* thread_pb = NULL;
-	thrinfo_t* thread_jr = NULL;
-
-	// Pre-grow the thrinfo_t tree.
-	bszid_t* bszids_jc = bszids;
-	         thread_jc = thread;
-	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc );
-
-	bszid_t* bszids_pc = &bszids_jc[1];
-	         thread_pc = bli_thrinfo_sub_node( thread_jc );
-	bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc );
-
-	bszid_t* bszids_pa = &bszids_pc[1];
-	         thread_pa = bli_thrinfo_sub_node( thread_pc );
-
-	bszid_t* bszids_ic = &bszids_pa[1];
-	         thread_ic = bli_thrinfo_sub_node( thread_pa );
-	bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic );
-
-	bszid_t* bszids_pb = &bszids_ic[1];
-	         thread_pb = bli_thrinfo_sub_node( thread_ic );
-
-	bszid_t* bszids_jr = &bszids_pb[1];
-	         thread_jr = bli_thrinfo_sub_node( thread_pb );
-	bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr );
+	thrinfo_t* thread_jc = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread_pc = bli_thrinfo_sub_node( thread_jc );
+	thrinfo_t* thread_pa = bli_thrinfo_sub_node( thread_pc );
+	thrinfo_t* thread_ic = bli_thrinfo_sub_node( thread_pa );
+	thrinfo_t* thread_pb = bli_thrinfo_sub_node( thread_ic );
+	thrinfo_t* thread_jr = bli_thrinfo_sub_node( thread_pb );
 
 	// Compute the JC loop thread range for the current thread.
 	dim_t jc_start, jc_end;
@@ -320,7 +280,7 @@ void bli_gemmsup_ref_var1n
 			// Only apply beta to the first iteration of the pc loop.
 			const void* beta_use = ( pp == 0 ? buf_beta : one );
 
-		          char* a_use;
+			      char* a_use;
 			      inc_t rs_a_use, cs_a_use, ps_a_use;
 
 			// Determine the packing buffer and related parameters for matrix
@@ -344,8 +304,6 @@ void bli_gemmsup_ref_var1n
 			  ( void** )&a_use, &rs_a_use, &cs_a_use,
 			                    &ps_a_use,
 			  cntx,
-			  rntm,
-			  &mem_a,
 			  thread_pa
 			);
 
@@ -402,8 +360,6 @@ void bli_gemmsup_ref_var1n
 				  ( void** )&b_use, &cs_b_use, &rs_b_use,
 				                    &ps_b_use,
 				  cntx,
-				  rntm,
-				  &mem_b,
 				  thread_pb
 				);
 
@@ -472,7 +428,7 @@ void bli_gemmsup_ref_var1n
 
 			// NOTE: This barrier is only needed if we are packing A (since
 			// that matrix is packed within the pc loop of this variant).
-			if ( packa ) bli_thread_barrier( rntm, thread_pa );
+			if ( packa ) bli_thrinfo_barrier( thread_pa );
 		}
 	}
 
@@ -480,15 +436,11 @@ void bli_gemmsup_ref_var1n
 	bli_packm_sup_finalize_mem
 	(
 	  packa,
-	  rntm,
-	  &mem_a,
 	  thread_pa
 	);
 	bli_packm_sup_finalize_mem
 	(
 	  packb,
-	  rntm,
-	  &mem_b,
 	  thread_pb
 	);
 
@@ -514,7 +466,7 @@ void bli_gemmsup_ref_var2m
        const obj_t*     c,
              stor3_t    stor_id,
        const cntx_t*    cntx,
-             rntm_t*    rntm,
+       const rntm_t*    rntm,
              thrinfo_t* thread
      )
 {
@@ -680,46 +632,15 @@ void bli_gemmsup_ref_var2m
 
 	auxinfo_t       aux;
 
-	mem_t mem_a = BLIS_MEM_INITIALIZER;
-	mem_t mem_b = BLIS_MEM_INITIALIZER;
-
-	// Define an array of bszid_t ids, which will act as our substitute for
-	// the cntl_t tree.
-	//                    5thloop  4thloop         packb  3rdloop         packa  2ndloop  1stloop  ukrloop
-	bszid_t bszids[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR };
-
 	// Determine whether we are using more than one thread.
 	const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 );
 
-	thrinfo_t* thread_jc = NULL;
-	thrinfo_t* thread_pc = NULL;
-	thrinfo_t* thread_pb = NULL;
-	thrinfo_t* thread_ic = NULL;
-	thrinfo_t* thread_pa = NULL;
-	thrinfo_t* thread_jr = NULL;
-
-	// Pre-grow the thrinfo_t tree.
-	bszid_t* bszids_jc = bszids;
-	         thread_jc = thread;
-	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc );
-
-	bszid_t* bszids_pc = &bszids_jc[1];
-	         thread_pc = bli_thrinfo_sub_node( thread_jc );
-	bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc );
-
-	bszid_t* bszids_pb = &bszids_pc[1];
-	         thread_pb = bli_thrinfo_sub_node( thread_pc );
-
-	bszid_t* bszids_ic = &bszids_pb[1];
-	         thread_ic = bli_thrinfo_sub_node( thread_pb );
-	bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic );
-
-	bszid_t* bszids_pa = &bszids_ic[1];
-	         thread_pa = bli_thrinfo_sub_node( thread_ic );
-
-	bszid_t* bszids_jr = &bszids_pa[1];
-	         thread_jr = bli_thrinfo_sub_node( thread_pa );
-	bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr );
+	thrinfo_t* thread_jc = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread_pc = bli_thrinfo_sub_node( thread_jc );
+	thrinfo_t* thread_pb = bli_thrinfo_sub_node( thread_pc );
+	thrinfo_t* thread_ic = bli_thrinfo_sub_node( thread_pb );
+	thrinfo_t* thread_pa = bli_thrinfo_sub_node( thread_ic );
+	thrinfo_t* thread_jr = bli_thrinfo_sub_node( thread_pa );
 
 	// Compute the JC loop thread range for the current thread.
 	dim_t jc_start, jc_end;
@@ -783,8 +704,6 @@ void bli_gemmsup_ref_var2m
 			  ( void** )&b_use, &cs_b_use, &rs_b_use,
 			                    &ps_b_use,
 			  cntx,
-			  rntm,
-			  &mem_b,
 			  thread_pb
 			);
 
@@ -839,8 +758,6 @@ void bli_gemmsup_ref_var2m
 				  ( void** )&a_use, &rs_a_use, &cs_a_use,
 				                    &ps_a_use,
 				  cntx,
-				  rntm,
-				  &mem_a,
 				  thread_pa
 				);
 
@@ -909,7 +826,7 @@ void bli_gemmsup_ref_var2m
 
 			// NOTE: This barrier is only needed if we are packing B (since
 			// that matrix is packed within the pc loop of this variant).
-			if ( packb ) bli_thread_barrier( rntm, thread_pb );
+			if ( packb ) bli_thrinfo_barrier( thread_pb );
 		}
 	}
 
@@ -917,15 +834,11 @@ void bli_gemmsup_ref_var2m
 	bli_packm_sup_finalize_mem
 	(
 	  packa,
-	  rntm,
-	  &mem_a,
 	  thread_pa
 	);
 	bli_packm_sup_finalize_mem
 	(
 	  packb,
-	  rntm,
-	  &mem_b,
 	  thread_pb
 	);
 
diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h
index be6b17f39..8bbb73ca9 100644
--- a/frame/3/bli_l3_sup_vars.h
+++ b/frame/3/bli_l3_sup_vars.h
@@ -50,7 +50,7 @@ void PASTEMAC0(opname) \
        const obj_t*     c, \
              stor3_t    eff_id, \
        const cntx_t*    cntx, \
-             rntm_t*    rntm, \
+       const rntm_t*    rntm, \
              thrinfo_t* thread  \
      );
 
diff --git a/frame/3/bli_l3_tapi_ex.c b/frame/3/bli_l3_tapi_ex.c
index c934ba949..130237ee4 100644
--- a/frame/3/bli_l3_tapi_ex.c
+++ b/frame/3/bli_l3_tapi_ex.c
@@ -55,7 +55,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
        const ctype*  beta, \
              ctype*  c, inc_t rs_c, inc_t cs_c, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -115,7 +115,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
        const ctype*  beta, \
              ctype*  c, inc_t rs_c, inc_t cs_c, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -178,7 +178,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
        const ctype_r* beta, \
              ctype*   c, inc_t rs_c, inc_t cs_c, \
        const cntx_t*  cntx, \
-             rntm_t*  rntm  \
+       const rntm_t*  rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -236,7 +236,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
        const ctype_r* beta, \
              ctype*   c, inc_t rs_c, inc_t cs_c, \
        const cntx_t*  cntx, \
-             rntm_t*  rntm  \
+       const rntm_t*  rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -298,7 +298,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
        const ctype*  beta, \
              ctype*  c, inc_t rs_c, inc_t cs_c, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -355,7 +355,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
        const ctype*  beta, \
              ctype*  c, inc_t rs_c, inc_t cs_c, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -418,7 +418,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
        const ctype*  beta, \
              ctype*  c, inc_t rs_c, inc_t cs_c, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -481,7 +481,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
        const ctype*  beta, \
              ctype*  c, inc_t rs_c, inc_t cs_c, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      ) \
 { \
 	bli_init_once(); \
@@ -545,7 +545,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
        const ctype*  a, inc_t rs_a, inc_t cs_a, \
              ctype*  b, inc_t rs_b, inc_t cs_b, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      ) \
 { \
 	bli_init_once(); \
diff --git a/frame/3/bli_l3_tapi_ex.h b/frame/3/bli_l3_tapi_ex.h
index eb142af05..d8610dee8 100644
--- a/frame/3/bli_l3_tapi_ex.h
+++ b/frame/3/bli_l3_tapi_ex.h
@@ -54,7 +54,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
        const ctype*  beta, \
              ctype*  c, inc_t rs_c, inc_t cs_c, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      );
 
 INSERT_GENTPROT_BASIC0( gemm )
@@ -76,7 +76,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
        const ctype*  beta, \
              ctype*  c, inc_t rs_c, inc_t cs_c, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      );
 
 INSERT_GENTPROT_BASIC0( hemm )
@@ -97,7 +97,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
        const ctype_r* beta, \
              ctype*   c, inc_t rs_c, inc_t cs_c, \
        const cntx_t*  cntx, \
-             rntm_t*  rntm  \
+       const rntm_t*  rntm  \
      );
 
 INSERT_GENTPROTR_BASIC0( herk )
@@ -119,7 +119,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
        const ctype_r* beta, \
              ctype*   c, inc_t rs_c, inc_t cs_c, \
        const cntx_t*  cntx, \
-             rntm_t*  rntm  \
+       const rntm_t*  rntm  \
      );
 
 INSERT_GENTPROTR_BASIC0( her2k )
@@ -139,7 +139,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
        const ctype*  beta, \
              ctype*  c, inc_t rs_c, inc_t cs_c, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      );
 
 INSERT_GENTPROT_BASIC0( syrk )
@@ -161,7 +161,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
        const ctype*  beta, \
              ctype*  c, inc_t rs_c, inc_t cs_c, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      );
 
 INSERT_GENTPROT_BASIC0( gemmt )
@@ -186,7 +186,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
        const ctype*  beta, \
              ctype*  c, inc_t rs_c, inc_t cs_c, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      );
 
 INSERT_GENTPROT_BASIC0( trmm3 )
@@ -207,7 +207,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
        const ctype*  a, inc_t rs_a, inc_t cs_a, \
              ctype*  b, inc_t rs_b, inc_t cs_b, \
        const cntx_t* cntx, \
-             rntm_t* rntm  \
+       const rntm_t* rntm  \
      );
 
 INSERT_GENTPROT_BASIC0( trmm )
diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c
index f866cfd4c..402497153 100644
--- a/frame/3/bli_l3_thrinfo.c
+++ b/frame/3/bli_l3_thrinfo.c
@@ -34,139 +34,135 @@
 */
 
 #include "blis.h"
-#include "assert.h"
 
-void bli_l3_thrinfo_init_single
+thrinfo_t* bli_l3_thrinfo_create
      (
-       thrinfo_t* thread
+             dim_t       id,
+             thrcomm_t*  gl_comm,
+             array_t*    array,
+       const rntm_t*     rntm,
+       const cntl_t*     cntl
      )
 {
-	bli_thrinfo_init_single( thread );
-}
+	pool_t* pool = NULL;
+	if ( array != NULL )
+		pool = bli_apool_array_elem( id, array );
 
-void bli_l3_thrinfo_free
-     (
-       rntm_t*    rntm,
-       thrinfo_t* thread
-     )
-{
-	bli_thrinfo_free( rntm, thread );
-}
+	// Create the root thrinfo_t node.
+	thrinfo_t* root = bli_thrinfo_create_root
+	(
+	  gl_comm,
+	  id,
+	  pool,
+	  bli_pba_query()
+	);
 
-void bli_l3_sup_thrinfo_free
-     (
-       rntm_t*    rntm,
-       thrinfo_t* thread
-     )
-{
-	bli_thrinfo_free( rntm, thread );
-}
+	bli_l3_thrinfo_grow( root, rntm, cntl );
 
-// -----------------------------------------------------------------------------
+	return root;
+}
 
-void bli_l3_thrinfo_create_root
+void bli_l3_thrinfo_grow
      (
-       dim_t       id,
-       thrcomm_t*  gl_comm,
-       rntm_t*     rntm,
-       cntl_t*     cntl,
-       thrinfo_t** thread
+             thrinfo_t*  thread_par,
+       const rntm_t*     rntm,
+       const cntl_t*     cntl
      )
 {
-	// Query the global communicator for the total number of threads to use.
-	dim_t   n_threads  = bli_thrcomm_num_threads( gl_comm );
-
-	// Use the thread id passed in as the global communicator id.
-	dim_t   gl_comm_id = id;
+	const cntl_t* sub_prenode = bli_cntl_sub_prenode( cntl );
+	const cntl_t* sub_node    = bli_cntl_sub_node( cntl );
+	const bszid_t bszid       = bli_cntl_bszid( cntl );
+	const dim_t   n_way       = bli_rntm_ways_for( bszid, rntm );
 
-	// Use the blocksize id of the current (root) control tree node to
-	// query the top-most ways of parallelism to obtain.
-	bszid_t bszid      = bli_cntl_bszid( cntl );
-	dim_t   xx_way     = bli_rntm_ways_for( bszid, rntm );
+	thrinfo_t* thread_cur = bli_thrinfo_split( n_way, thread_par );
+	bli_thrinfo_set_sub_node( thread_cur, thread_par );
 
-	// Determine the work id for this thrinfo_t node.
-	dim_t   work_id    = gl_comm_id / ( n_threads / xx_way );
+	if ( sub_prenode != NULL )
+	{
+		// A pre-node is only used in the IC loop of trsm. In this case,
+		// we cannot actually thread in the m dimension due to data dependencies
+		// and so all parallelism must be moved down to the JR loop.
+		rntm_t rntm_l = *rntm;
+		const dim_t ic_nway = bli_rntm_ic_ways( &rntm_l );
+		const dim_t jr_nway = bli_rntm_jr_ways( &rntm_l );
+		bli_rntm_set_ic_ways_only(               1, &rntm_l );
+		bli_rntm_set_jr_ways_only( ic_nway*jr_nway, &rntm_l );
+
+		// Use thread_pre instead of thread_cur since we *don't* want to
+		// do any parallelism at this level.
+		thrinfo_t* thread_pre = bli_thrinfo_split( 1, thread_par );
+		bli_thrinfo_set_sub_prenode( thread_pre, thread_par );
+		bli_l3_thrinfo_grow( thread_pre, &rntm_l, sub_prenode );
+	}
 
-	// Create the root thrinfo_t node.
-	*thread = bli_thrinfo_create
-	(
-	  rntm,
-	  gl_comm,
-	  gl_comm_id,
-	  xx_way,
-	  work_id,
-	  TRUE,
-	  bszid,
-	  NULL
-	);
+	if ( sub_node != NULL )
+	{
+		bli_l3_thrinfo_grow( thread_cur, rntm, sub_node );
+	}
 }
 
 // -----------------------------------------------------------------------------
 
-void bli_l3_sup_thrinfo_create_root
+thrinfo_t* bli_l3_sup_thrinfo_create
      (
-       dim_t       id,
-       thrcomm_t*  gl_comm,
-       rntm_t*     rntm,
-       thrinfo_t** thread
+             dim_t      id,
+             thrcomm_t* gl_comm,
+             pool_t*    pool,
+       const rntm_t*    rntm
      )
 {
-	// Query the global communicator for the total number of threads to use.
-	dim_t   n_threads  = bli_thrcomm_num_threads( gl_comm );
-
-	// Use the thread id passed in as the global communicator id.
-	dim_t   gl_comm_id = id;
-
-	// Use the BLIS_NC blocksize id to query the top-most ways of parallelism
-	// to obtain. Note that hard-coding BLIS_NC like this is a little bit of a
-	// hack, but it works fine since both of the sup algorithms (bp and pb) use
-	// the cache blocksizes down to the 3rd loop. (See the definitions of
-	// bli_rntm_calc_num_threads_bp() and bli_rntm_calc_num_threads_pb() for
-	// a concise enumeration of these bszid_t ids.)
-	const bszid_t bszid  = BLIS_NC;
-	dim_t         xx_way = bli_rntm_ways_for( BLIS_NC, rntm );
-
-	// Determine the work id for this thrinfo_t node.
-	dim_t   work_id    = gl_comm_id / ( n_threads / xx_way );
-
 	// Create the root thrinfo_t node.
-	*thread = bli_thrinfo_create
+	thrinfo_t* root = bli_thrinfo_create_root
 	(
-	  rntm,
 	  gl_comm,
-	  gl_comm_id,
-	  xx_way,
-	  work_id,
-	  TRUE,
-	  bszid,
-	  NULL
+	  id,
+	  pool,
+	  bli_pba_query()
 	);
-}
 
-// -----------------------------------------------------------------------------
+	const dim_t n_way_jc = bli_rntm_ways_for( BLIS_NC, rntm );
+	const dim_t n_way_pc = bli_rntm_ways_for( BLIS_KC, rntm );
+	const dim_t n_way_ic = bli_rntm_ways_for( BLIS_MC, rntm );
+	const dim_t n_way_jr = bli_rntm_ways_for( BLIS_NR, rntm );
+	const dim_t n_way_ir = bli_rntm_ways_for( BLIS_MR, rntm );
+
+	thrinfo_t* thread_jc = bli_thrinfo_split( n_way_jc,      root );
+	thrinfo_t* thread_pc = bli_thrinfo_split( n_way_pc, thread_jc );
+	thrinfo_t* thread_pb = bli_thrinfo_split(        1, thread_pc );
+	thrinfo_t* thread_ic = bli_thrinfo_split( n_way_ic, thread_pb );
+	thrinfo_t* thread_pa = bli_thrinfo_split(        1, thread_ic );
+	thrinfo_t* thread_jr = bli_thrinfo_split( n_way_jr, thread_pa );
+	thrinfo_t* thread_ir = bli_thrinfo_split( n_way_ir, thread_jr );
+
+	bli_thrinfo_set_sub_node( thread_jc,      root );
+	bli_thrinfo_set_sub_node( thread_pc, thread_jc );
+	bli_thrinfo_set_sub_node( thread_pb, thread_pc );
+	bli_thrinfo_set_sub_node( thread_ic, thread_pb );
+	bli_thrinfo_set_sub_node( thread_pa, thread_ic );
+	bli_thrinfo_set_sub_node( thread_jr, thread_pa );
+	bli_thrinfo_set_sub_node( thread_ir, thread_jr );
+
+	return root;
+}
 
-void bli_l3_sup_thrinfo_update_root
+void bli_l3_sup_thrinfo_update
      (
-       rntm_t*    rntm,
-       thrinfo_t* thread
+       const rntm_t*     rntm,
+             thrinfo_t** root
      )
 {
-	// Query the current root for the total number of threads to use.
-	const dim_t n_threads  = bli_thread_num_threads( thread );
-
-	// Query the current root for the (global) comm id.
-	const dim_t gl_comm_id = bli_thread_ocomm_id( thread );
-
-	// Query the rntm_t for the updated number of ways of parallelism.
-	const dim_t xx_way     = bli_rntm_ways_for( BLIS_NC, rntm );
-
-	// Recompute the work id for this thrinfo_t node using the updated
-	// number of ways of parallelism.
-	dim_t       work_id    = gl_comm_id / ( n_threads / xx_way );
-
-	// Save the updated ways of parallelism and work id to the thrinfo_t node.
-	bli_thrinfo_set_n_way( xx_way, thread );
-	bli_thrinfo_set_work_id( work_id, thread );
+	thrcomm_t* gl_comm = bli_thrinfo_comm( *root );
+	dim_t      tid     = bli_thrinfo_thread_id( *root );
+	pool_t*    pool    = bli_thrinfo_sba_pool( *root );
+	dim_t      nt      = bli_thrinfo_num_threads( *root );
+
+	// Return early in single-threaded execution
+	// since the thread control tree may not have been
+	// allocated normally
+	if ( nt == 1 ) return;
+
+	bli_thrinfo_free( *root );
+	*root = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, rntm );
 }
 
 // -----------------------------------------------------------------------------
@@ -178,7 +174,7 @@ void bli_l3_thrinfo_print_gemm_paths
 {
 	// In order to query the number of threads, we query the only thread we
 	// know exists: thread 0.
-	dim_t n_threads = bli_thread_num_threads( threads[0] );
+	dim_t n_threads = bli_thrinfo_num_threads( threads[0] );
 
 	// For the purposes of printing the "header" information that is common
 	// to the various instances of a thrinfo_t (ie: across all threads), we
@@ -211,44 +207,44 @@ void bli_l3_thrinfo_print_gemm_paths
 
 	if ( !jc_info ) goto print_header;
 
-	jc_way  = bli_thread_n_way( jc_info );
-	jc_nt   = bli_thread_num_threads( jc_info );
+	jc_way  = bli_thrinfo_n_way( jc_info );
+	jc_nt   = bli_thrinfo_num_threads( jc_info );
 	pc_info = bli_thrinfo_sub_node( jc_info );
 
 	if ( !pc_info ) goto print_header;
 
-	pc_way  = bli_thread_n_way( pc_info );
-	pc_nt   = bli_thread_num_threads( pc_info );
+	pc_way  = bli_thrinfo_n_way( pc_info );
+	pc_nt   = bli_thrinfo_num_threads( pc_info );
 	pb_info = bli_thrinfo_sub_node( pc_info );
 
 	if ( !pb_info ) goto print_header;
 
-	pb_way  = bli_thread_n_way( pb_info );
-	pb_nt   = bli_thread_num_threads( pb_info );
+	pb_way  = bli_thrinfo_n_way( pb_info );
+	pb_nt   = bli_thrinfo_num_threads( pb_info );
 	ic_info = bli_thrinfo_sub_node( pb_info );
 
 	if ( !ic_info ) goto print_header;
 
-	ic_way  = bli_thread_n_way( ic_info );
-	ic_nt   = bli_thread_num_threads( ic_info );
+	ic_way  = bli_thrinfo_n_way( ic_info );
+	ic_nt   = bli_thrinfo_num_threads( ic_info );
 	pa_info = bli_thrinfo_sub_node( ic_info );
 
 	if ( !pa_info ) goto print_header;
 
-	pa_way  = bli_thread_n_way( pa_info );
-	pa_nt   = bli_thread_num_threads( pa_info );
+	pa_way  = bli_thrinfo_n_way( pa_info );
+	pa_nt   = bli_thrinfo_num_threads( pa_info );
 	jr_info = bli_thrinfo_sub_node( pa_info );
 
 	if ( !jr_info ) goto print_header;
 
-	jr_way  = bli_thread_n_way( jr_info );
-	jr_nt   = bli_thread_num_threads( jr_info );
+	jr_way  = bli_thrinfo_n_way( jr_info );
+	jr_nt   = bli_thrinfo_num_threads( jr_info );
 	ir_info = bli_thrinfo_sub_node( jr_info );
 
 	if ( !ir_info ) goto print_header;
 
-	ir_way  = bli_thread_n_way( ir_info );
-	ir_nt   = bli_thread_num_threads( ir_info );
+	ir_way  = bli_thrinfo_n_way( ir_info );
+	ir_nt   = bli_thrinfo_num_threads( ir_info );
 
 	print_header:
 
@@ -262,7 +258,7 @@ void bli_l3_thrinfo_print_gemm_paths
 	( unsigned long )jr_nt,
 	( unsigned long )ir_nt );
 	printf( "xx_way:   %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n",
-    ( unsigned long )jc_way,
+	( unsigned long )jc_way,
 	( unsigned long )pc_way,
 	( unsigned long )pb_way,
 	( unsigned long )ic_way,
@@ -283,44 +279,44 @@ void bli_l3_thrinfo_print_gemm_paths
 
 		if ( !jc_info ) goto print_thrinfo;
 
-		jc_comm_id = bli_thread_ocomm_id( jc_info );
-		jc_work_id = bli_thread_work_id( jc_info );
+		jc_comm_id = bli_thrinfo_thread_id( jc_info );
+		jc_work_id = bli_thrinfo_work_id( jc_info );
 		pc_info    = bli_thrinfo_sub_node( jc_info );
 
 		if ( !pc_info ) goto print_thrinfo;
 
-		pc_comm_id = bli_thread_ocomm_id( pc_info );
-		pc_work_id = bli_thread_work_id( pc_info );
+		pc_comm_id = bli_thrinfo_thread_id( pc_info );
+		pc_work_id = bli_thrinfo_work_id( pc_info );
 		pb_info    = bli_thrinfo_sub_node( pc_info );
 
 		if ( !pb_info ) goto print_thrinfo;
 
-		pb_comm_id = bli_thread_ocomm_id( pb_info );
-		pb_work_id = bli_thread_work_id( pb_info );
+		pb_comm_id = bli_thrinfo_thread_id( pb_info );
+		pb_work_id = bli_thrinfo_work_id( pb_info );
 		ic_info    = bli_thrinfo_sub_node( pb_info );
 
 		if ( !ic_info ) goto print_thrinfo;
 
-		ic_comm_id = bli_thread_ocomm_id( ic_info );
-		ic_work_id = bli_thread_work_id( ic_info );
+		ic_comm_id = bli_thrinfo_thread_id( ic_info );
+		ic_work_id = bli_thrinfo_work_id( ic_info );
 		pa_info    = bli_thrinfo_sub_node( ic_info );
 
 		if ( !pa_info ) goto print_thrinfo;
 
-		pa_comm_id = bli_thread_ocomm_id( pa_info );
-		pa_work_id = bli_thread_work_id( pa_info );
+		pa_comm_id = bli_thrinfo_thread_id( pa_info );
+		pa_work_id = bli_thrinfo_work_id( pa_info );
 		jr_info    = bli_thrinfo_sub_node( pa_info );
 
 		if ( !jr_info ) goto print_thrinfo;
 
-		jr_comm_id = bli_thread_ocomm_id( jr_info );
-		jr_work_id = bli_thread_work_id( jr_info );
+		jr_comm_id = bli_thrinfo_thread_id( jr_info );
+		jr_work_id = bli_thrinfo_work_id( jr_info );
 		ir_info    = bli_thrinfo_sub_node( jr_info );
 
 		if ( !ir_info ) goto print_thrinfo;
 
-		ir_comm_id = bli_thread_ocomm_id( ir_info );
-		ir_work_id = bli_thread_work_id( ir_info );
+		ir_comm_id = bli_thrinfo_thread_id( ir_info );
+		ir_work_id = bli_thrinfo_work_id( ir_info );
 
 		print_thrinfo:
 
@@ -356,7 +352,7 @@ void bli_l3_thrinfo_print_trsm_paths
 {
 	// In order to query the number of threads, we query the only thread we
 	// know exists: thread 0.
-	dim_t n_threads = bli_thread_num_threads( threads[0] );
+	dim_t n_threads = bli_thrinfo_num_threads( threads[0] );
 
 	// For the purposes of printing the "header" information that is common
 	// to the various instances of a thrinfo_t (ie: across all threads), we
@@ -391,26 +387,26 @@ void bli_l3_thrinfo_print_trsm_paths
 
 	if ( !jc_info ) goto print_header;
 
-	jc_way   = bli_thread_n_way( jc_info );
-	jc_nt    = bli_thread_num_threads( jc_info );
+	jc_way   = bli_thrinfo_n_way( jc_info );
+	jc_nt    = bli_thrinfo_num_threads( jc_info );
 	pc_info  = bli_thrinfo_sub_node( jc_info );
 
 	if ( !pc_info ) goto print_header;
 
-	pc_way   = bli_thread_n_way( pc_info );
-	pc_nt    = bli_thread_num_threads( pc_info );
+	pc_way   = bli_thrinfo_n_way( pc_info );
+	pc_nt    = bli_thrinfo_num_threads( pc_info );
 	pb_info  = bli_thrinfo_sub_node( pc_info );
 
 	if ( !pb_info ) goto print_header;
 
-	pb_way   = bli_thread_n_way( pb_info );
-	pb_nt    = bli_thread_num_threads( pb_info );
+	pb_way   = bli_thrinfo_n_way( pb_info );
+	pb_nt    = bli_thrinfo_num_threads( pb_info );
 	ic_info  = bli_thrinfo_sub_node( pb_info );
 
 	if ( !ic_info ) goto print_header;
 
-	ic_way   = bli_thread_n_way( ic_info );
-	ic_nt    = bli_thread_num_threads( ic_info );
+	ic_way   = bli_thrinfo_n_way( ic_info );
+	ic_nt    = bli_thrinfo_num_threads( ic_info );
 	pa_info  = bli_thrinfo_sub_node( ic_info );
 	pa_info0 = bli_thrinfo_sub_prenode( ic_info );
 
@@ -418,39 +414,39 @@ void bli_l3_thrinfo_print_trsm_paths
 
 	if ( !pa_info0 ) goto check_header_node;
 
-	pa_way0  = bli_thread_n_way( pa_info0 );
-	pa_nt0   = bli_thread_num_threads( pa_info0 );
+	pa_way0  = bli_thrinfo_n_way( pa_info0 );
+	pa_nt0   = bli_thrinfo_num_threads( pa_info0 );
 	jr_info0 = bli_thrinfo_sub_node( pa_info0 );
 
 	if ( !jr_info0 ) goto check_header_node;
 
-	jr_way0  = bli_thread_n_way( jr_info0 );
-	jr_nt0   = bli_thread_num_threads( jr_info0 );
+	jr_way0  = bli_thrinfo_n_way( jr_info0 );
+	jr_nt0   = bli_thrinfo_num_threads( jr_info0 );
 	ir_info0 = bli_thrinfo_sub_node( jr_info0 );
 
 	if ( !ir_info0 ) goto check_header_node;
 
-	ir_way0  = bli_thread_n_way( ir_info0 );
-	ir_nt0   = bli_thread_num_threads( ir_info0 );
+	ir_way0  = bli_thrinfo_n_way( ir_info0 );
+	ir_nt0   = bli_thrinfo_num_threads( ir_info0 );
 
 	check_header_node:
 
 	if ( !pa_info ) goto print_header;
 
-	pa_way  = bli_thread_n_way( pa_info );
-	pa_nt   = bli_thread_num_threads( pa_info );
+	pa_way  = bli_thrinfo_n_way( pa_info );
+	pa_nt   = bli_thrinfo_num_threads( pa_info );
 	jr_info = bli_thrinfo_sub_node( pa_info );
 
 	if ( !jr_info ) goto print_header;
 
-	jr_way  = bli_thread_n_way( jr_info );
-	jr_nt   = bli_thread_num_threads( jr_info );
+	jr_way  = bli_thrinfo_n_way( jr_info );
+	jr_nt   = bli_thrinfo_num_threads( jr_info );
 	ir_info = bli_thrinfo_sub_node( jr_info );
 
 	if ( !ir_info ) goto print_header;
 
-	ir_way  = bli_thread_n_way( ir_info );
-	ir_nt   = bli_thread_num_threads( ir_info );
+	ir_way  = bli_thrinfo_n_way( ir_info );
+	ir_nt   = bli_thrinfo_num_threads( ir_info );
 
 	print_header:
 
@@ -493,26 +489,26 @@ void bli_l3_thrinfo_print_trsm_paths
 
 		if ( !jc_info ) goto print_thrinfo;
 
-		jc_comm_id = bli_thread_ocomm_id( jc_info );
-		jc_work_id = bli_thread_work_id( jc_info );
+		jc_comm_id = bli_thrinfo_thread_id( jc_info );
+		jc_work_id = bli_thrinfo_work_id( jc_info );
 		pc_info    = bli_thrinfo_sub_node( jc_info );
 
 		if ( !pc_info ) goto print_thrinfo;
 
-		pc_comm_id = bli_thread_ocomm_id( pc_info );
-		pc_work_id = bli_thread_work_id( pc_info );
+		pc_comm_id = bli_thrinfo_thread_id( pc_info );
+		pc_work_id = bli_thrinfo_work_id( pc_info );
 		pb_info    = bli_thrinfo_sub_node( pc_info );
 
 		if ( !pb_info ) goto print_thrinfo;
 
-		pb_comm_id = bli_thread_ocomm_id( pb_info );
-		pb_work_id = bli_thread_work_id( pb_info );
+		pb_comm_id = bli_thrinfo_thread_id( pb_info );
+		pb_work_id = bli_thrinfo_work_id( pb_info );
 		ic_info    = bli_thrinfo_sub_node( pb_info );
 
 		if ( !ic_info ) goto print_thrinfo;
 
-		ic_comm_id = bli_thread_ocomm_id( ic_info );
-		ic_work_id = bli_thread_work_id( ic_info );
+		ic_comm_id = bli_thrinfo_thread_id( ic_info );
+		ic_work_id = bli_thrinfo_work_id( ic_info );
 		pa_info    = bli_thrinfo_sub_node( ic_info );
 		pa_info0   = bli_thrinfo_sub_prenode( ic_info );
 
@@ -520,39 +516,39 @@ void bli_l3_thrinfo_print_trsm_paths
 
 		if ( !pa_info0 ) goto check_thrinfo_node;
 
-		pa_comm_id0 = bli_thread_ocomm_id( pa_info0 );
-		pa_work_id0 = bli_thread_work_id( pa_info0 );
+		pa_comm_id0 = bli_thrinfo_thread_id( pa_info0 );
+		pa_work_id0 = bli_thrinfo_work_id( pa_info0 );
 		jr_info0    = bli_thrinfo_sub_node( pa_info0 );
 
 		if ( !jr_info0 ) goto check_thrinfo_node;
 
-		jr_comm_id0 = bli_thread_ocomm_id( jr_info0 );
-		jr_work_id0 = bli_thread_work_id( jr_info0 );
+		jr_comm_id0 = bli_thrinfo_thread_id( jr_info0 );
+		jr_work_id0 = bli_thrinfo_work_id( jr_info0 );
 		ir_info0    = bli_thrinfo_sub_node( jr_info0 );
 
 		if ( !ir_info0 ) goto check_thrinfo_node;
 
-		ir_comm_id0 = bli_thread_ocomm_id( ir_info0 );
-		ir_work_id0 = bli_thread_work_id( ir_info0 );
+		ir_comm_id0 = bli_thrinfo_thread_id( ir_info0 );
+		ir_work_id0 = bli_thrinfo_work_id( ir_info0 );
 
 		check_thrinfo_node:
 
 		if ( !pa_info ) goto print_thrinfo;
 
-		pa_comm_id = bli_thread_ocomm_id( pa_info );
-		pa_work_id = bli_thread_work_id( pa_info );
+		pa_comm_id = bli_thrinfo_thread_id( pa_info );
+		pa_work_id = bli_thrinfo_work_id( pa_info );
 		jr_info    = bli_thrinfo_sub_node( pa_info );
 
 		if ( !jr_info ) goto print_thrinfo;
 
-		jr_comm_id = bli_thread_ocomm_id( jr_info );
-		jr_work_id = bli_thread_work_id( jr_info );
+		jr_comm_id = bli_thrinfo_thread_id( jr_info );
+		jr_work_id = bli_thrinfo_work_id( jr_info );
 		ir_info    = bli_thrinfo_sub_node( jr_info );
 
 		if ( !ir_info ) goto print_thrinfo;
 
-		ir_comm_id = bli_thread_ocomm_id( ir_info );
-		ir_work_id = bli_thread_work_id( ir_info );
+		ir_comm_id = bli_thrinfo_thread_id( ir_info );
+		ir_work_id = bli_thrinfo_work_id( ir_info );
 
 		print_thrinfo:
 #else
@@ -584,8 +580,8 @@ void bli_l3_thrinfo_print_trsm_paths
 		}
 		else
 		{
-			jc_comm_id = bli_thread_ocomm_id( jc_info );
-			jc_work_id = bli_thread_work_id( jc_info );
+			jc_comm_id = bli_thrinfo_thread_id( jc_info );
+			jc_work_id = bli_thrinfo_work_id( jc_info );
 			pc_info = bli_thrinfo_sub_node( jc_info );
 
 			if ( !pc_info )
@@ -595,8 +591,8 @@ void bli_l3_thrinfo_print_trsm_paths
 			}
 			else
 			{
-				pc_comm_id = bli_thread_ocomm_id( pc_info );
-				pc_work_id = bli_thread_work_id( pc_info );
+				pc_comm_id = bli_thrinfo_thread_id( pc_info );
+				pc_work_id = bli_thrinfo_work_id( pc_info );
 				pb_info = bli_thrinfo_sub_node( pc_info );
 
 				if ( !pb_info )
@@ -606,8 +602,8 @@ void bli_l3_thrinfo_print_trsm_paths
 				}
 				else
 				{
-					pb_comm_id = bli_thread_ocomm_id( pb_info );
-					pb_work_id = bli_thread_work_id( pb_info );
+					pb_comm_id = bli_thrinfo_thread_id( pb_info );
+					pb_work_id = bli_thrinfo_work_id( pb_info );
 					ic_info = bli_thrinfo_sub_node( pb_info );
 
 					if ( !ic_info )
@@ -617,8 +613,8 @@ void bli_l3_thrinfo_print_trsm_paths
 					}
 					else
 					{
-						ic_comm_id = bli_thread_ocomm_id( ic_info );
-						ic_work_id = bli_thread_work_id( ic_info );
+						ic_comm_id = bli_thrinfo_thread_id( ic_info );
+						ic_work_id = bli_thrinfo_work_id( ic_info );
 						pa_info0 = bli_thrinfo_sub_prenode( ic_info );
 						pa_info = bli_thrinfo_sub_node( ic_info );
 
@@ -630,8 +626,8 @@ void bli_l3_thrinfo_print_trsm_paths
 						}
 						else
 						{
-							pa_comm_id0 = bli_thread_ocomm_id( pa_info0 );
-							pa_work_id0 = bli_thread_work_id( pa_info0 );
+							pa_comm_id0 = bli_thrinfo_thread_id( pa_info0 );
+							pa_work_id0 = bli_thrinfo_work_id( pa_info0 );
 							jr_info0 = bli_thrinfo_sub_node( pa_info0 );
 
 							if ( !jr_info0 )
@@ -641,8 +637,8 @@ void bli_l3_thrinfo_print_trsm_paths
 							}
 							else
 							{
-								jr_comm_id0 = bli_thread_ocomm_id( jr_info0 );
-								jr_work_id0 = bli_thread_work_id( jr_info0 );
+								jr_comm_id0 = bli_thrinfo_thread_id( jr_info0 );
+								jr_work_id0 = bli_thrinfo_work_id( jr_info0 );
 								ir_info0 = bli_thrinfo_sub_node( jr_info0 );
 
 								if ( !ir_info0 )
@@ -652,8 +648,8 @@ void bli_l3_thrinfo_print_trsm_paths
 								}
 								else
 								{
-									ir_comm_id0 = bli_thread_ocomm_id( ir_info0 );
-									ir_work_id0 = bli_thread_work_id( ir_info0 );
+									ir_comm_id0 = bli_thrinfo_thread_id( ir_info0 );
+									ir_work_id0 = bli_thrinfo_work_id( ir_info0 );
 								}
 							}
 						}
@@ -666,8 +662,8 @@ void bli_l3_thrinfo_print_trsm_paths
 						}
 						else
 						{
-							pa_comm_id = bli_thread_ocomm_id( pa_info );
-							pa_work_id = bli_thread_work_id( pa_info );
+							pa_comm_id = bli_thrinfo_thread_id( pa_info );
+							pa_work_id = bli_thrinfo_work_id( pa_info );
 							jr_info = bli_thrinfo_sub_node( pa_info );
 
 							if ( !jr_info )
@@ -677,8 +673,8 @@ void bli_l3_thrinfo_print_trsm_paths
 							}
 							else
 							{
-								jr_comm_id = bli_thread_ocomm_id( jr_info );
-								jr_work_id = bli_thread_work_id( jr_info );
+								jr_comm_id = bli_thrinfo_thread_id( jr_info );
+								jr_work_id = bli_thrinfo_work_id( jr_info );
 								ir_info = bli_thrinfo_sub_node( jr_info );
 
 								if ( !ir_info )
@@ -688,8 +684,8 @@ void bli_l3_thrinfo_print_trsm_paths
 								}
 								else
 								{
-									ir_comm_id = bli_thread_ocomm_id( ir_info );
-									ir_work_id = bli_thread_work_id( ir_info );
+									ir_comm_id = bli_thrinfo_thread_id( ir_info );
+									ir_work_id = bli_thrinfo_work_id( ir_info );
 								}
 							}
 						}
@@ -724,15 +720,14 @@ void bli_l3_thrinfo_print_trsm_paths
 
 void bli_l3_thrinfo_free_paths
      (
-       rntm_t*     rntm,
        thrinfo_t** threads
      )
 {
-	dim_t n_threads = bli_thread_num_threads( threads[0] );
+	dim_t n_threads = bli_thrinfo_num_threads( threads[0] );
 	dim_t i;
 
 	for ( i = 0; i < n_threads; ++i )
-		bli_l3_thrinfo_free( rntm, threads[i] );
+		bli_thrinfo_free( threads[i] );
 
 	bli_free_intl( threads );
 }
diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h
index 37a3909fd..b1290df50 100644
--- a/frame/3/bli_l3_thrinfo.h
+++ b/frame/3/bli_l3_thrinfo.h
@@ -68,60 +68,36 @@
 \
 	( index % thread->n_way == thread->work_id % thread->n_way )
 
-//
-// thrinfo_t APIs specific to level-3 operations.
-//
-
-void bli_l3_thrinfo_init
-     (
-       thrinfo_t* thread,
-       thrcomm_t* ocomm,
-       dim_t      ocomm_id,
-       dim_t      n_way,
-       dim_t      work_id,
-       thrinfo_t* sub_node
-     );
-
-void bli_l3_thrinfo_init_single
-     (
-       thrinfo_t* thread
-     );
-
-void bli_l3_thrinfo_free
-     (
-       rntm_t*    rntm,
-       thrinfo_t* thread
-     );
+// -----------------------------------------------------------------------------
 
-void bli_l3_sup_thrinfo_free
+BLIS_EXPORT_BLIS thrinfo_t* bli_l3_thrinfo_create
      (
-       rntm_t*    rntm,
-       thrinfo_t* thread
+             dim_t       id,
+             thrcomm_t*  gl_comm,
+             array_t*    array,
+       const rntm_t*     rntm,
+       const cntl_t*     cntl
      );
 
-// -----------------------------------------------------------------------------
-
-void bli_l3_thrinfo_create_root
+void bli_l3_thrinfo_grow
      (
-       dim_t       id,
-       thrcomm_t*  gl_comm,
-       rntm_t*     rntm,
-       cntl_t*     cntl,
-       thrinfo_t** thread
+             thrinfo_t*  thread_par,
+       const rntm_t*     rntm,
+       const cntl_t*     cntl
      );
 
-void bli_l3_sup_thrinfo_create_root
+thrinfo_t* bli_l3_sup_thrinfo_create
      (
-       dim_t       id,
-       thrcomm_t*  gl_comm,
-       rntm_t*     rntm,
-       thrinfo_t** thread
+             dim_t      id,
+             thrcomm_t* gl_comm,
+             pool_t*    pool,
+       const rntm_t*    rntm
      );
 
-void bli_l3_sup_thrinfo_update_root
+void bli_l3_sup_thrinfo_update
      (
-       rntm_t*    rntm,
-       thrinfo_t* thread
+       const rntm_t*     rntm,
+             thrinfo_t** root
      );
 
 void bli_l3_thrinfo_print_gemm_paths
@@ -138,7 +114,6 @@ void bli_l3_thrinfo_print_trsm_paths
 
 void bli_l3_thrinfo_free_paths
      (
-       rntm_t*     rntm,
        thrinfo_t** threads
      );
 
diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c
index 485779a90..f841e5eb2 100644
--- a/frame/3/gemm/bli_gemm_blk_var1.c
+++ b/frame/3/gemm/bli_gemm_blk_var1.c
@@ -37,13 +37,12 @@
 
 void bli_gemm_blk_var1
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
      )
 {
 	obj_t ap, cp;
@@ -58,6 +57,7 @@ void bli_gemm_blk_var1
 
 	// Determine the current thread's subpartition range.
 	dim_t my_start, my_end;
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
 	bli_thread_range_mdim
 	(
 	  direct, thread, &ap, b, &cp, cntl, cntx,
@@ -88,9 +88,8 @@ void bli_gemm_blk_var1
 		  &BLIS_ONE,
 		  &c1,
 		  cntx,
-		  rntm,
 		  bli_cntl_sub_node( cntl ),
-		  bli_thrinfo_sub_node( thread )
+		  thread
 		);
 	}
 }
diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c
index 254a31064..ceadce7d7 100644
--- a/frame/3/gemm/bli_gemm_blk_var2.c
+++ b/frame/3/gemm/bli_gemm_blk_var2.c
@@ -37,13 +37,12 @@
 
 void bli_gemm_blk_var2
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
      )
 {
 	obj_t bp, cp;
@@ -58,6 +57,7 @@ void bli_gemm_blk_var2
 
 	// Determine the current thread's subpartition range.
 	dim_t my_start, my_end;
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
 	bli_thread_range_ndim
 	(
 	  direct, thread, a, &bp, &cp, cntl, cntx,
@@ -88,9 +88,8 @@ void bli_gemm_blk_var2
 		  &BLIS_ONE,
 		  &c1,
 		  cntx,
-		  rntm,
 		  bli_cntl_sub_node( cntl ),
-		  bli_thrinfo_sub_node( thread )
+		  thread
 		);
 	}
 }
diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c
index cb20b7f36..d683cfc88 100644
--- a/frame/3/gemm/bli_gemm_blk_var3.c
+++ b/frame/3/gemm/bli_gemm_blk_var3.c
@@ -36,13 +36,12 @@
 
 void bli_gemm_blk_var3
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
      )
 {
 	obj_t ap, bp, cs;
@@ -50,6 +49,8 @@ void bli_gemm_blk_var3
 	bli_obj_alias_to( b, &bp );
 	bli_obj_alias_to( c, &cs );
 
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+
 	// Determine the direction in which to partition (forwards or backwards).
 	dir_t direct = bli_l3_direct( &ap, &bp, &cs, cntl );
 
@@ -83,13 +84,10 @@ void bli_gemm_blk_var3
 		  &BLIS_ONE,
 		  &cs,
 		  cntx,
-		  rntm,
 		  bli_cntl_sub_node( cntl ),
-		  bli_thrinfo_sub_node( thread )
+		  thread
 		);
 
-		bli_thread_barrier( rntm, bli_thrinfo_sub_node( thread ) );
-
 		// This variant executes multiple rank-k updates. Therefore, if the
 		// internal beta scalar on matrix C is non-zero, we must use it
 		// only for the first iteration (and then BLIS_ONE for all others).
diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c
index 052c812a3..bd8d97d13 100644
--- a/frame/3/gemm/bli_gemm_cntl.c
+++ b/frame/3/gemm/bli_gemm_cntl.c
@@ -37,21 +37,21 @@
 
 cntl_t* bli_gemm_cntl_create
      (
-       rntm_t* rntm,
+       pool_t* pool,
        opid_t  family,
        pack_t  schema_a,
        pack_t  schema_b,
        void_fp ker
      )
 {
-	return bli_gemmbp_cntl_create( rntm, family, schema_a, schema_b, ker );
+	return bli_gemmbp_cntl_create( pool, family, schema_a, schema_b, ker );
 }
 
 // -----------------------------------------------------------------------------
 
 cntl_t* bli_gemmbp_cntl_create
      (
-       rntm_t* rntm,
+       pool_t* pool,
        opid_t  family,
        pack_t  schema_a,
        pack_t  schema_b,
@@ -73,18 +73,18 @@ cntl_t* bli_gemmbp_cntl_create
 	// Create two nodes for the macro-kernel.
 	cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node
 	(
-	  rntm,    // the thread's runtime structure
-	  family,  // the operation family
-	  BLIS_MR, // needed for bli_thrinfo_rgrow()
-	  NULL,    // variant function pointer not used
-	  NULL     // no sub-node; this is the leaf of the tree.
+	  pool,         // the thread's sba pool
+	  family,       // the operation family
+	  BLIS_MR,
+	  NULL,         // variant function pointer not used
+	  NULL          // no sub-node; this is the leaf of the tree.
 	);
 
 	cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_create_node
 	(
-	  rntm,    // the thread's runtime structure
+	  pool,         // the thread's sba pool
 	  family,
-	  BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
+	  BLIS_NR,
 	  macro_kernel_fp,
 	  gemm_cntl_bu_ke
 	);
@@ -92,14 +92,14 @@ cntl_t* bli_gemmbp_cntl_create
 	// Create a node for packing matrix A.
 	cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
 	(
-	  rntm,
-	  bli_l3_packa,  // pack the left-hand operand
+	  pool,
+	  bli_l3_packa, // pack the left-hand operand
 	  BLIS_MR,
 	  BLIS_KR,
-	  FALSE,   // do NOT invert diagonal
-	  FALSE,   // reverse iteration if upper?
-	  FALSE,   // reverse iteration if lower?
-	  schema_a, // normally BLIS_PACKED_ROW_PANELS
+	  FALSE,        // do NOT invert diagonal
+	  FALSE,        // reverse iteration if upper?
+	  FALSE,        // reverse iteration if lower?
+	  schema_a,     // normally BLIS_PACKED_ROW_PANELS
 	  BLIS_BUFFER_FOR_A_BLOCK,
 	  gemm_cntl_bp_bu
 	);
@@ -107,7 +107,7 @@ cntl_t* bli_gemmbp_cntl_create
 	// Create a node for partitioning the m dimension by MC.
 	cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  family,
 	  BLIS_MC,
 	  bli_gemm_blk_var1,
@@ -117,14 +117,14 @@ cntl_t* bli_gemmbp_cntl_create
 	// Create a node for packing matrix B.
 	cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node
 	(
-	  rntm,
-	  bli_l3_packb,  // pack the right-hand operand
+	  pool,
+	  bli_l3_packb, // pack the right-hand operand
 	  BLIS_NR,
 	  BLIS_KR,
-	  FALSE,   // do NOT invert diagonal
-	  FALSE,   // reverse iteration if upper?
-	  FALSE,   // reverse iteration if lower?
-	  schema_b, // normally BLIS_PACKED_COL_PANELS
+	  FALSE,        // do NOT invert diagonal
+	  FALSE,        // reverse iteration if upper?
+	  FALSE,        // reverse iteration if lower?
+	  schema_b,     // normally BLIS_PACKED_COL_PANELS
 	  BLIS_BUFFER_FOR_B_PANEL,
 	  gemm_cntl_op_bp
 	);
@@ -132,7 +132,7 @@ cntl_t* bli_gemmbp_cntl_create
 	// Create a node for partitioning the k dimension by KC.
 	cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  family,
 	  BLIS_KC,
 	  bli_gemm_blk_var3,
@@ -142,7 +142,7 @@ cntl_t* bli_gemmbp_cntl_create
 	// Create a node for partitioning the n dimension by NC.
 	cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  family,
 	  BLIS_NC,
 	  bli_gemm_blk_var2,
@@ -154,124 +154,26 @@ cntl_t* bli_gemmbp_cntl_create
 
 // -----------------------------------------------------------------------------
 
-// This control tree creation function is disabled because it is no longer used.
-// (It was originally created in the run up to publishing the 1m journal article,
-// but was disabled to reduce complexity.)
-#if 0
-cntl_t* bli_gemmpb_cntl_create
-     (
-       opid_t family
-     )
-{
-	void_fp macro_kernel_p = bli_gemm_ker_var1;
-
-	// Change the macro-kernel if the operation family is gemmt or trmm.
-	//if      ( family == BLIS_GEMMT ) macro_kernel_p = bli_gemmt_x_ker_var2;
-	//else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2;
-
-	// Create two nodes for the macro-kernel.
-	cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_create_node
-	(
-	  family,  // the operation family
-	  BLIS_MR, // needed for bli_thrinfo_rgrow()
-	  NULL,    // variant function pointer not used
-	  NULL     // no sub-node; this is the leaf of the tree.
-	);
-
-	cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_create_node
-	(
-	  family,
-	  BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
-	  macro_kernel_p,
-	  gemm_cntl_ub_ke
-	);
-
-	// Create a node for packing matrix A (which is really the right-hand
-	// operand "B").
-	cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node
-	(
-	  bli_gemm_packb,  // pack the right-hand operand
-	  bli_packm_blk_var1,
-	  BLIS_MR,
-	  BLIS_KR,
-	  FALSE,   // do NOT invert diagonal
-	  FALSE,   // reverse iteration if upper?
-	  FALSE,   // reverse iteration if lower?
-	  BLIS_PACKED_COL_PANELS,
-	  BLIS_BUFFER_FOR_A_BLOCK,
-	  gemm_cntl_pb_ub
-	);
-
-	// Create a node for partitioning the n dimension by MC.
-	cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_create_node
-	(
-	  family,
-	  BLIS_MC,
-	  bli_gemm_blk_var2,
-	  gemm_cntl_packb
-	);
-
-	// Create a node for packing matrix B (which is really the left-hand
-	// operand "A").
-	cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
-	(
-	  bli_gemm_packa,  // pack the left-hand operand
-	  bli_packm_blk_var1,
-	  BLIS_NR,
-	  BLIS_KR,
-	  FALSE,   // do NOT invert diagonal
-	  FALSE,   // reverse iteration if upper?
-	  FALSE,   // reverse iteration if lower?
-	  BLIS_PACKED_ROW_PANELS,
-	  BLIS_BUFFER_FOR_B_PANEL,
-	  gemm_cntl_op_pb
-	);
-
-	// Create a node for partitioning the k dimension by KC.
-	cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node
-	(
-	  family,
-	  BLIS_KC,
-	  bli_gemm_blk_var3,
-	  gemm_cntl_packa
-	);
-
-	// Create a node for partitioning the m dimension by NC.
-	cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node
-	(
-	  family,
-	  BLIS_NC,
-	  bli_gemm_blk_var1,
-	  gemm_cntl_mm_op
-	);
-
-	return gemm_cntl_vl_mm;
-}
-#endif
-
-// -----------------------------------------------------------------------------
-
 void bli_gemm_cntl_free
      (
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
+       pool_t* pool,
+       cntl_t* cntl
      )
 {
-	bli_cntl_free( rntm, cntl, thread );
+	bli_cntl_free( pool, cntl );
 }
 
 // -----------------------------------------------------------------------------
 
 cntl_t* bli_gemm_cntl_create_node
      (
-       rntm_t* rntm,
+       pool_t* pool,
        opid_t  family,
        bszid_t bszid,
        void_fp var_func,
        cntl_t* sub_node
      )
 {
-	return bli_cntl_create_node( rntm, family, bszid, var_func, NULL, sub_node );
+	return bli_cntl_create_node( pool, family, bszid, var_func, NULL, sub_node );
 }
 
diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h
index 5fa213ac4..48e0652ca 100644
--- a/frame/3/gemm/bli_gemm_cntl.h
+++ b/frame/3/gemm/bli_gemm_cntl.h
@@ -35,7 +35,7 @@
 
 cntl_t* bli_gemm_cntl_create
      (
-       rntm_t* rntm,
+       pool_t* pool,
        opid_t  family,
        pack_t  schema_a,
        pack_t  schema_b,
@@ -46,34 +46,26 @@ cntl_t* bli_gemm_cntl_create
 
 cntl_t* bli_gemmbp_cntl_create
      (
-       rntm_t* rntm,
+       pool_t* pool,
        opid_t  family,
        pack_t  schema_a,
        pack_t  schema_b,
        void_fp ker
      );
 
-#if 0
-cntl_t* bli_gemmpb_cntl_create
-     (
-       opid_t family,
-     );
-#endif
-
 // -----------------------------------------------------------------------------
 
 void bli_gemm_cntl_free
      (
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
+       pool_t* pool,
+       cntl_t* cntl
      );
 
 // -----------------------------------------------------------------------------
 
 cntl_t* bli_gemm_cntl_create_node
      (
-       rntm_t* rntm,
+       pool_t* pool,
        opid_t  family,
        bszid_t bszid,
        void_fp var_func,
diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c
index 5f992bd67..fe0dc61a8 100644
--- a/frame/3/gemm/bli_gemm_front.c
+++ b/frame/3/gemm/bli_gemm_front.c
@@ -43,8 +43,7 @@ void bli_gemm_front
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -244,8 +243,7 @@ void bli_gemm_front
 	  betap,
 	  cp,
 	  cntx,
-	  rntm,
-	  cntl
+	  rntm
 	);
 
 #ifdef BLIS_ENABLE_GEMM_MD
diff --git a/frame/3/gemm/bli_gemm_front.h b/frame/3/gemm/bli_gemm_front.h
index 744f88d1b..9465c37d9 100644
--- a/frame/3/gemm/bli_gemm_front.h
+++ b/frame/3/gemm/bli_gemm_front.h
@@ -40,8 +40,7 @@ void bli_gemm_front
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             rntm_t* rntm
      );
 
 #ifdef BLIS_ENABLE_SMALL_MATRIX
@@ -53,7 +52,7 @@ err_t bli_gemm_small
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             cntl_t* cntl
+       const cntl_t* cntl
      );
 #endif
 
diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c
index 199e72cb6..51dceced2 100644
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -81,9 +81,8 @@ void bli_gemm_ker_var2
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const cntl_t* cntl,
+             thrinfo_t* thread_par
      )
 {
 	      num_t  dt_exec   = bli_obj_exec_dt( c );
@@ -254,13 +253,14 @@ void bli_gemm_ker_var2
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel. Here we query the thrinfo_t node for the
 	// 1st (ir) loop around the microkernel.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
 	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
 
 	// Query the number of threads and thread ids for each loop.
-	dim_t jr_nt  = bli_thread_n_way( thread );
-	dim_t jr_tid = bli_thread_work_id( thread );
-	dim_t ir_nt  = bli_thread_n_way( caucus );
-	dim_t ir_tid = bli_thread_work_id( caucus );
+	dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	dim_t jr_tid = bli_thrinfo_work_id( thread );
+	dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	dim_t ir_tid = bli_thrinfo_work_id( caucus );
 
 	dim_t jr_start, jr_end;
 	dim_t ir_start, ir_end;
diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h
index d3109e600..24f7ecfb9 100644
--- a/frame/3/gemm/bli_gemm_var.h
+++ b/frame/3/gemm/bli_gemm_var.h
@@ -53,12 +53,11 @@ typedef struct
 \
 void PASTEMAC0(opname) \
      ( \
-       const obj_t*  a, \
-       const obj_t*  b, \
-       const obj_t*  c, \
-       const cntx_t* cntx, \
-             rntm_t* rntm, \
-             cntl_t* cntl, \
+       const obj_t*     a, \
+       const obj_t*     b, \
+       const obj_t*     c, \
+       const cntx_t*    cntx, \
+       const cntl_t*    cntl, \
              thrinfo_t* thread  \
      );
 
diff --git a/frame/3/gemm/bli_gemm_ker_var1.c b/frame/3/gemm/other/bli_gemm_ker_var1.c
similarity index 100%
rename from frame/3/gemm/bli_gemm_ker_var1.c
rename to frame/3/gemm/other/bli_gemm_ker_var1.c
diff --git a/frame/3/gemm/other/bli_gemm_ker_var2.c b/frame/3/gemm/other/bli_gemm_ker_var2.c
index c5cf935b8..8b26b2263 100644
--- a/frame/3/gemm/other/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/other/bli_gemm_ker_var2.c
@@ -270,10 +270,10 @@ void PASTEMAC(ch,varname) \
 	bli_auxinfo_set_is_b( is_b, &aux ); \
 \
 	thrinfo_t* caucus    = bli_thrinfo_sub_node( thread ); \
-	dim_t jr_num_threads = bli_thread_n_way( thread ); \
-	dim_t jr_thread_id   = bli_thread_work_id( thread ); \
-	dim_t ir_num_threads = bli_thread_n_way( caucus ); \
-	dim_t ir_thread_id   = bli_thread_work_id( caucus ); \
+	dim_t jr_num_threads = bli_thrinfo_n_way( thread ); \
+	dim_t jr_thread_id   = bli_thrinfo_work_id( thread ); \
+	dim_t ir_num_threads = bli_thrinfo_n_way( caucus ); \
+	dim_t ir_thread_id   = bli_thrinfo_work_id( caucus ); \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
diff --git a/frame/3/gemm/other/bli_gemm_ker_var2rr.c b/frame/3/gemm/other/bli_gemm_ker_var2rr.c
index 946e3048c..c374e178b 100644
--- a/frame/3/gemm/other/bli_gemm_ker_var2rr.c
+++ b/frame/3/gemm/other/bli_gemm_ker_var2rr.c
@@ -276,10 +276,10 @@ void PASTEMAC(ch,varname) \
 	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
 \
 	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
+	dim_t ir_nt  = bli_thrinfo_n_way( caucus ); \
+	dim_t ir_tid = bli_thrinfo_work_id( caucus ); \
 \
 	dim_t jr_start, jr_end; \
 	dim_t ir_start, ir_end; \
diff --git a/frame/3/gemm/other/bli_gemm_ker_var2sl.c b/frame/3/gemm/other/bli_gemm_ker_var2sl.c
index f5159bbb9..f61911c53 100644
--- a/frame/3/gemm/other/bli_gemm_ker_var2sl.c
+++ b/frame/3/gemm/other/bli_gemm_ker_var2sl.c
@@ -276,10 +276,10 @@ void PASTEMAC(ch,varname) \
 	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
 \
 	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
+	dim_t ir_nt  = bli_thrinfo_n_way( caucus ); \
+	dim_t ir_tid = bli_thrinfo_work_id( caucus ); \
 \
 	dim_t jr_start, jr_end; \
 	dim_t ir_start, ir_end; \
diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c
index 49b32c976..d75738a94 100644
--- a/frame/3/gemmt/bli_gemmt_front.c
+++ b/frame/3/gemmt/bli_gemmt_front.c
@@ -43,8 +43,7 @@ void bli_gemmt_front
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -106,8 +105,7 @@ void bli_gemmt_front
 	  beta,
 	  &c_local,
 	  cntx,
-	  rntm,
-	  cntl
+	  rntm
 	);
 }
 
diff --git a/frame/3/gemmt/bli_gemmt_front.h b/frame/3/gemmt/bli_gemmt_front.h
index 0f2a9ada2..4a7cd7abe 100644
--- a/frame/3/gemmt/bli_gemmt_front.h
+++ b/frame/3/gemmt/bli_gemmt_front.h
@@ -41,6 +41,5 @@ void bli_gemmt_front
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             rntm_t* rntm
      );
diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
index aed0359ec..4a3a48304 100644
--- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
@@ -35,30 +35,46 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T gemmt_fp
+typedef void (*xpbys_mxn_l_vft)
+    (
+      doff_t diagoff,
+      dim_t  m,
+      dim_t  n,
+      void*  x, inc_t rs_x, inc_t cs_x,
+      void*  b,
+      void*  y, inc_t rs_y, inc_t cs_y
+    );
 
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffc,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, inc_t is_a,
-                  dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, inc_t is_b,
-                  dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2);
+#undef GENTFUNC
+#define GENTFUNC(ctype,ch,op) \
+\
+void PASTEMAC(ch,op) \
+    ( \
+      doff_t diagoff, \
+      dim_t  m, \
+      dim_t  n, \
+      void*  x, inc_t rs_x, inc_t cs_x, \
+      void*  b, \
+      void*  y, inc_t rs_y, inc_t cs_y \
+    ) \
+{ \
+	ctype* restrict x_cast = x; \
+	ctype* restrict b_cast = b; \
+	ctype* restrict y_cast = y; \
+\
+	PASTEMAC3(ch,ch,ch,xpbys_mxn_l) \
+	( \
+	  diagoff, \
+	  m, n, \
+	  x_cast, rs_x, cs_x, \
+	  b_cast, \
+	  y_cast, rs_y,  cs_y \
+	); \
+}
+
+INSERT_GENTFUNC_BASIC0(xpbys_mxn_l_fn);
 
+static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn);
 
 void bli_gemmt_l_ker_var2
      (
@@ -66,30 +82,28 @@ void bli_gemmt_l_ker_var2
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const cntl_t* cntl,
+             thrinfo_t* thread_par
      )
 {
-	const num_t  dt_exec   = bli_obj_exec_dt( c );
+	const num_t  dt        = bli_obj_exec_dt( c );
+	const dim_t  dt_size   = bli_dt_size( dt );
 
-	const doff_t diagoffc  = bli_obj_diag_offset( c );
+	      doff_t diagoffc  = bli_obj_diag_offset( c );
 
 	const pack_t schema_a  = bli_obj_pack_schema( a );
 	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	const dim_t  m         = bli_obj_length( c );
-	const dim_t  n         = bli_obj_width( c );
-	const dim_t  k         = bli_obj_width( a );
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
 
 	const void*  buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t  cs_a      = bli_obj_col_stride( a );
 	const inc_t  is_a      = bli_obj_imag_stride( a );
 	const dim_t  pd_a      = bli_obj_panel_dim( a );
 	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
 	const void*  buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t  rs_b      = bli_obj_row_stride( b );
 	const inc_t  is_b      = bli_obj_imag_stride( b );
 	const dim_t  pd_b      = bli_obj_panel_dim( b );
 	const inc_t  ps_b      = bli_obj_panel_stride( b );
@@ -109,97 +123,32 @@ void bli_gemmt_l_ker_var2
 	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
 	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	ftypes[dt_exec]
-	(
-	  diagoffc,
-	  schema_a,
-	  schema_b,
-	  m,
-	  n,
-	  k,
-	  ( void* )buf_alpha,
-	  ( void* )buf_a, cs_a, is_a,
-	                  pd_a, ps_a,
-	  ( void* )buf_b, rs_b, is_b,
-	                  pd_b, ps_b,
-	  ( void* )buf_beta,
-	           buf_c, rs_c, cs_c,
-	  ( cntx_t* )cntx,
-	  rntm,
-	  thread
-	);
-}
+	// Alias some constants to simpler names.
+	const dim_t MR = pd_a;
+	const dim_t NR = pd_b;
 
+	// Query the context for the micro-kernel address and cast it to its
+	// function pointer type.
+	gemm_ukr_vft    gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	xpbys_mxn_l_vft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt ];
+
+	// Temporary C buffer for edge cases. Note that the strides of this
+	// temporary buffer are set so that they match the storage of the
+	// original C matrix. For example, if C is column-stored, ct will be
+	// column-stored as well.
+	      char  ct[ BLIS_STACK_BUF_MAX_SIZE ]
+	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
+	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx );
+	const inc_t rs_ct       = ( col_pref ? 1 : NR );
+	const inc_t cs_ct       = ( col_pref ? MR : 1 );
+
+	const void* zero       = bli_obj_buffer_for_const( dt, &BLIS_ZERO );
+	const char* a_cast     = buf_a;
+	const char* b_cast     = buf_b;
+	      char* c_cast     = buf_c;
+	const char* alpha_cast = buf_alpha;
+	const char* beta_cast  = buf_beta;
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffc, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	/*const dim_t     PACKMR     = cs_a;*/ \
-	/*const dim_t     PACKNR     = rs_b;*/ \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-\
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffc_ij; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           i, j, ip; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	auxinfo_t       aux; \
-\
 	/*
 	   Assumptions/assertions:
 	     rs_a == 1
@@ -212,296 +161,280 @@ void PASTEMAC(ch,varname) \
 	     ps_b == stride to next micro-panel of B
 	     rs_c == (no assumptions)
 	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of C is entirely above the diagonal,
-	   it is not stored. So we do nothing. */ \
-	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
-\
-	/* If there is a zero region above where the diagonal of C intersects
-	   the left edge of the panel, adjust the pointer to C and A and treat
-	   this case as if the diagonal offset were zero. */ \
-	if ( diagoffc < 0 ) \
-	{ \
-		ip       = -diagoffc / MR; \
-		i        = ip * MR; \
-		m        = m - i; \
-		diagoffc = -diagoffc % MR; \
-		c_cast   = c_cast + (i  )*rs_c; \
-		a_cast   = a_cast + (ip )*ps_a; \
-	} \
-\
-	/* If there is a zero region to the right of where the diagonal
-	   of C intersects the bottom of the panel, shrink it to prevent
-	   "no-op" iterations from executing. */ \
-	if ( diagoffc + m < n ) \
-	{ \
-		n = diagoffc + m; \
-	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( is_a, &aux ); \
-	bli_auxinfo_set_is_b( is_b, &aux ); \
-\
-	/* Save the desired output datatype (indicating no typecasting). */ \
-	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Note that we partition the 2nd loop into two regions: the rectangular
-	   part of C, and the triangular portion. */ \
-	dim_t n_iter_rct; \
-	dim_t n_iter_tri; \
-\
-	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \
-	{ \
-		/* If the entire panel of C does not intersect the diagonal, there is
-		   no triangular region, and therefore we can skip the second set of
-		   loops. */ \
-		n_iter_rct = n_iter; \
-		n_iter_tri = 0; \
-	} \
-	else \
-	{ \
-		/* If the panel of C does intersect the diagonal, compute the number of
-		   iterations in the rectangular region by dividing NR into the diagonal
-		   offset. Any remainder from this integer division is discarded, which
-		   is what we want. That is, we want the rectangular region to contain
-		   as many columns of whole microtiles as possible without including any
-		   microtiles that intersect the diagonal. The number of iterations in
-		   the triangular (or trapezoidal) region is computed as the remaining
-		   number of iterations in the n dimension. */ \
-		n_iter_rct = diagoffc / NR; \
-		n_iter_tri = n_iter - n_iter_rct; \
-	} \
-\
-	/* Determine the thread range and increment for the 2nd and 1st loops for
-	   the initial rectangular region of C (if it exists).
-	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
-	   slab or round-robin partitioning was requested at configure-time. */ \
-	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* No need to compute the diagonal offset for the rectangular
-			   region. */ \
-			/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly below the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly above the diagonal, we do nothing and
-			   continue. */ \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  beta_cast, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-			} \
-		} \
-	} \
-\
-	/* If there is no triangular region, then we're done. */ \
-	if ( n_iter_tri == 0 ) return; \
-\
-	/* Use round-robin assignment of micropanels to threads in the 2nd loop
-	   and the default (slab or rr) partitioning in the 1st loop for the
-	   remaining triangular region of C. */ \
-	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-\
-	/* Advance the start and end iteration offsets for the triangular region
-	   by the number of iterations used for the rectangular region. */ \
-	jr_start += n_iter_rct; \
-	jr_end   += n_iter_rct; \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* Compute the diagonal offset for the submatrix at (i,j). */ \
-			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly below the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly above the diagonal, we do nothing and
-			   continue. */ \
-			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  MR, \
-				  NR, \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  zero, \
-				  ct, rs_ct, cs_ct, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				/* Scale C and add the result to only the stored part. */ \
-				PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
-				                          m_cur, n_cur, \
-				                          ct,  rs_ct, cs_ct, \
-				                          beta_cast, \
-				                          c11, rs_c,  cs_c ); \
-			} \
-			else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  beta_cast, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-			} \
-		} \
-	} \
-}
+	*/
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If the current panel of C is entirely above the diagonal,
+	// it is not stored. So we do nothing.
+	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return;
+
+	// If there is a zero region above where the diagonal of C intersects
+	// the left edge of the panel, adjust the pointer to C and A and treat
+	// this case as if the diagonal offset were zero.
+	if ( diagoffc < 0 )
+	{
+		dim_t ip       = -diagoffc / MR;
+		dim_t i        = ip * MR;
+		      m        = m - i;
+		      diagoffc = -diagoffc % MR;
+		      c_cast   = c_cast + (i  )*rs_c*dt_size;
+		      a_cast   = a_cast + (ip )*ps_a*dt_size;
+	}
+
+	// If there is a zero region to the right of where the diagonal
+	// of C intersects the bottom of the panel, shrink it to prevent
+	// "no-op" iterations from executing.
+	if ( diagoffc + m < n )
+	{
+		n = diagoffc + m;
+	}
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	dim_t n_iter = n / NR;
+	dim_t n_left = n % NR;
+
+	dim_t m_iter = m / MR;
+	dim_t m_left = m % MR;
+
+	if ( n_left ) ++n_iter;
+	if ( m_left ) ++m_iter;
+
+	// Determine some increments used to step through A, B, and C.
+	inc_t rstep_a = ps_a * dt_size;
+
+	inc_t cstep_b = ps_b * dt_size;
+
+	inc_t rstep_c = rs_c * MR * dt_size;
+	inc_t cstep_c = cs_c * NR * dt_size;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	auxinfo_t aux;
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	// Save the imaginary stride of A and B to the auxinfo_t object.
+	bli_auxinfo_set_is_a( is_a, &aux );
+	bli_auxinfo_set_is_b( is_b, &aux );
+
+	// Save the desired output datatype (indicating no typecasting).
+	//bli_auxinfo_set_dt_on_output( dt, &aux );
+
+	// The 'thread_par' argument points to the parent of the thrinfo_t node
+	// for the 2nd (jr) loop around the microkernel. Here we query the nodes
+	// for the 2nd (jr) and 1st (ir) loops around the microkernel.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+
+	// Query the number of threads and thread ids for each loop.
+	dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	dim_t jr_tid = bli_thrinfo_work_id( thread );
+	dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	dim_t ir_tid = bli_thrinfo_work_id( caucus );
+
+	dim_t jr_start, jr_end;
+	dim_t ir_start, ir_end;
+	dim_t jr_inc,   ir_inc;
 
-INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 )
+	// Note that we partition the 2nd loop into two regions: the rectangular
+	// part of C, and the triangular portion.
+	dim_t n_iter_rct;
+	dim_t n_iter_tri;
+
+	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) )
+	{
+		// If the entire panel of C does not intersect the diagonal, there is
+		// no triangular region, and therefore we can skip the second set of
+		// loops.
+		n_iter_rct = n_iter;
+		n_iter_tri = 0;
+	}
+	else
+	{
+		// If the panel of C does intersect the diagonal, compute the number of
+		// iterations in the rectangular region by dividing NR into the diagonal
+		// offset. Any remainder from this integer division is discarded, which
+		// is what we want. That is, we want the rectangular region to contain
+		// as many columns of whole microtiles as possible without including any
+		// microtiles that intersect the diagonal. The number of iterations in
+		// the triangular (or trapezoidal) region is computed as the remaining
+		// number of iterations in the n dimension.
+		n_iter_rct = diagoffc / NR;
+		n_iter_tri = n_iter - n_iter_rct;
+	}
+
+	// Determine the thread range and increment for the 2nd and 1st loops for
+	// the initial rectangular region of C (if it exists).
+	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// slab or round-robin partitioning was requested at configure-time.
+	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc );
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
+	{
+		const char* b1 = b_cast + j * cstep_b;
+		      char* c1 = c_cast + j * cstep_c;
+
+		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		// Interior loop over the m dimension (MR rows at a time).
+		for ( dim_t i = ir_start; i < ir_end; i += ir_inc )
+		{
+			const char* a1  = a_cast + i * rstep_a;
+			      char* c11 = c1     + i * rstep_c;
+
+			// No need to compute the diagonal offset for the rectangular
+			// region.
+			//diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;
+
+			dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+			// Compute the addresses of the next panels of A and B.
+			const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc );
+			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) )
+			{
+				a2 = a_cast;
+				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc );
+				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+					b2 = b_cast;
+			}
+
+			// Save addresses of next panels of A and B to the auxinfo_t
+			// object.
+			bli_auxinfo_set_next_a( a2, &aux );
+			bli_auxinfo_set_next_b( b2, &aux );
+
+			// If the diagonal intersects the current MR x NR submatrix, we
+			// compute it in the temporary buffer and then add in the elements
+			// on or below the diagonal.
+			// Otherwise, if the submatrix is strictly below the diagonal,
+			// we compute and store as we normally would.
+			// And if we're strictly above the diagonal, we do nothing and
+			// continue.
+			{
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+			}
+		}
+	}
+
+	// If there is no triangular region, then we're done.
+	if ( n_iter_tri == 0 ) return;
+
+	// Use round-robin assignment of micropanels to threads in the 2nd loop
+	// and the default (slab or rr) partitioning in the 1st loop for the
+	// remaining triangular region of C.
+	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+
+	// Advance the start and end iteration offsets for the triangular region
+	// by the number of iterations used for the rectangular region.
+	jr_start += n_iter_rct;
+	jr_end   += n_iter_rct;
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
+	{
+		const char* b1 = b_cast + j * cstep_b;
+		      char* c1 = c_cast + j * cstep_c;
+
+		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		// Interior loop over the m dimension (MR rows at a time).
+		for ( dim_t i = ir_start; i < ir_end; i += ir_inc )
+		{
+			const char* a1  = a_cast + i * rstep_a;
+			      char* c11 = c1     + i * rstep_c;
+
+			// Compute the diagonal offset for the submatrix at (i,j).
+			doff_t diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;
+
+			dim_t  m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+			// Compute the addresses of the next panels of A and B.
+			const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc );
+			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) )
+			{
+				a2 = a_cast;
+				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc );
+				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) )
+					b2 = b_cast;
+			}
+
+			// Save addresses of next panels of A and B to the auxinfo_t
+			// object.
+			bli_auxinfo_set_next_a( a2, &aux );
+			bli_auxinfo_set_next_b( b2, &aux );
+
+			// If the diagonal intersects the current MR x NR submatrix, we
+			// compute it in the temporary buffer and then add in the elements
+			// on or below the diagonal.
+			// Otherwise, if the submatrix is strictly below the diagonal,
+			// we compute and store as we normally would.
+			// And if we're strictly above the diagonal, we do nothing and
+			// continue.
+			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) )
+			{
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  MR,
+				  NR,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )zero,
+				  ct, rs_ct, cs_ct,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				// Scale C and add the result to only the stored part.
+				xpbys_mxn_l_ukr( diagoffc_ij,
+				                 m_cur, n_cur,
+				                 ct,  rs_ct, cs_ct,
+				                 ( void* )beta_cast,
+				                 c11, rs_c,  cs_c );
+			}
+			else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) )
+			{
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+			}
+		}
+	}
+}
 
diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
index 87d77ee55..5b4e1ccd9 100644
--- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
@@ -35,30 +35,46 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T gemmt_fp
+typedef void (*xpbys_mxn_u_vft)
+    (
+      doff_t diagoff,
+      dim_t  m,
+      dim_t  n,
+      void*  x, inc_t rs_x, inc_t cs_x,
+      void*  b,
+      void*  y, inc_t rs_y, inc_t cs_y
+    );
 
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffc,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, inc_t is_a,
-                  dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, inc_t is_b,
-                  dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2);
+#undef GENTFUNC
+#define GENTFUNC(ctype,ch,op) \
+\
+void PASTEMAC(ch,op) \
+    ( \
+      doff_t diagoff, \
+      dim_t  m, \
+      dim_t  n, \
+      void*  x, inc_t rs_x, inc_t cs_x, \
+      void*  b, \
+      void*  y, inc_t rs_y, inc_t cs_y \
+    ) \
+{ \
+	ctype* restrict x_cast = x; \
+	ctype* restrict b_cast = b; \
+	ctype* restrict y_cast = y; \
+\
+	PASTEMAC3(ch,ch,ch,xpbys_mxn_u) \
+	( \
+	  diagoff, \
+	  m, n, \
+	  x_cast, rs_x, cs_x, \
+	  b_cast, \
+	  y_cast, rs_y,  cs_y \
+	); \
+}
+
+INSERT_GENTFUNC_BASIC0(xpbys_mxn_u_fn);
 
+static xpbys_mxn_u_vft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn);
 
 void bli_gemmt_u_ker_var2
      (
@@ -66,30 +82,28 @@ void bli_gemmt_u_ker_var2
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const cntl_t* cntl,
+             thrinfo_t* thread_par
      )
 {
-	const num_t  dt_exec   = bli_obj_exec_dt( c );
+	const num_t  dt        = bli_obj_exec_dt( c );
+	const dim_t  dt_size   = bli_dt_size( dt );
 
-	const doff_t diagoffc  = bli_obj_diag_offset( c );
+	      doff_t diagoffc  = bli_obj_diag_offset( c );
 
 	const pack_t schema_a  = bli_obj_pack_schema( a );
 	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	const dim_t  m         = bli_obj_length( c );
-	const dim_t  n         = bli_obj_width( c );
-	const dim_t  k         = bli_obj_width( a );
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
 
 	const void*  buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t  cs_a      = bli_obj_col_stride( a );
 	const inc_t  is_a      = bli_obj_imag_stride( a );
 	const dim_t  pd_a      = bli_obj_panel_dim( a );
 	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
 	const void*  buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t  rs_b      = bli_obj_row_stride( b );
 	const inc_t  is_b      = bli_obj_imag_stride( b );
 	const dim_t  pd_b      = bli_obj_panel_dim( b );
 	const inc_t  ps_b      = bli_obj_panel_stride( b );
@@ -109,97 +123,32 @@ void bli_gemmt_u_ker_var2
 	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
 	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	ftypes[dt_exec]
-	(
-	  diagoffc,
-	  schema_a,
-	  schema_b,
-	  m,
-	  n,
-	  k,
-	  ( void* )buf_alpha,
-	  ( void* )buf_a, cs_a, is_a,
-	                  pd_a, ps_a,
-	  ( void* )buf_b, rs_b, is_b,
-	                  pd_b, ps_b,
-	  ( void* )buf_beta,
-	           buf_c, rs_c, cs_c,
-	  ( cntx_t* )cntx,
-	  rntm,
-	  thread
-	);
-}
+	// Alias some constants to simpler names.
+	const dim_t MR = pd_a;
+	const dim_t NR = pd_b;
 
+	// Query the context for the micro-kernel address and cast it to its
+	// function pointer type.
+	gemm_ukr_vft    gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	xpbys_mxn_u_vft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt ];
+
+	// Temporary C buffer for edge cases. Note that the strides of this
+	// temporary buffer are set so that they match the storage of the
+	// original C matrix. For example, if C is column-stored, ct will be
+	// column-stored as well.
+	      char  ct[ BLIS_STACK_BUF_MAX_SIZE ]
+	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
+	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx );
+	const inc_t rs_ct       = ( col_pref ? 1 : NR );
+	const inc_t cs_ct       = ( col_pref ? MR : 1 );
+
+	const void* zero       = bli_obj_buffer_for_const( dt, &BLIS_ZERO );
+	const char* a_cast     = buf_a;
+	const char* b_cast     = buf_b;
+	      char* c_cast     = buf_c;
+	const char* alpha_cast = buf_alpha;
+	const char* beta_cast  = buf_beta;
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffc, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	/*const dim_t     PACKMR     = cs_a;*/ \
-	/*const dim_t     PACKNR     = rs_b;*/ \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-\
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffc_ij; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           i, j, jp; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	auxinfo_t       aux; \
-\
 	/*
 	   Assumptions/assertions:
 	     rs_a == 1
@@ -212,299 +161,283 @@ void PASTEMAC(ch,varname) \
 	     ps_b == stride to next micro-panel of B
 	     rs_c == (no assumptions)
 	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of C is entirely below the diagonal,
-	   it is not stored. So we do nothing. */ \
-	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
-\
-	/* If there is a zero region to the left of where the diagonal of C
-	   intersects the top edge of the panel, adjust the pointer to C and B
-	   and treat this case as if the diagonal offset were zero.
-	   NOTE: It's possible that after this pruning that the diagonal offset
-	   is still positive (though it is guaranteed to be less than NR). */ \
-	if ( diagoffc > 0 ) \
-	{ \
-		jp       = diagoffc / NR; \
-		j        = jp * NR; \
-		n        = n - j; \
-		diagoffc = diagoffc % NR; \
-		c_cast   = c_cast + (j  )*cs_c; \
-		b_cast   = b_cast + (jp )*ps_b; \
-	} \
-\
-	/* If there is a zero region below where the diagonal of C intersects
-	   the right edge of the panel, shrink it to prevent "no-op" iterations
-	   from executing. */ \
-	if ( -diagoffc + n < m ) \
-	{ \
-		m = -diagoffc + n; \
-	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( is_a, &aux ); \
-	bli_auxinfo_set_is_b( is_b, &aux ); \
-\
-	/* Save the desired output datatype (indicating no typecasting). */ \
-	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Note that we partition the 2nd loop into two regions: the triangular
-	   part of C, and the rectangular portion. */ \
-	dim_t n_iter_tri; \
-	dim_t n_iter_rct; \
-\
-	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \
-	{ \
-		/* If the entire panel of C does not intersect the diagonal, there is
-		   no triangular region, and therefore we can skip the first set of
-		   loops. */ \
-		n_iter_tri = 0; \
-		n_iter_rct = n_iter; \
-	} \
-	else \
-	{ \
-		/* If the panel of C does intersect the diagonal, compute the number of
-		   iterations in the triangular (or trapezoidal) region by dividing NR
-		   into the number of rows in C. A non-zero remainder means we need to
-		   add one additional iteration. That is, we want the triangular region
-		   to contain as few columns of whole microtiles as possible while still
-		   including all microtiles that intersect the diagonal. The number of
-		   iterations in the rectangular region is computed as the remaining
-		   number of iterations in the n dimension. */ \
-		n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \
-		n_iter_rct = n_iter - n_iter_tri; \
-	} \
-\
-	/* Use round-robin assignment of micropanels to threads in the 2nd loop
-	   and the default (slab or rr) partitioning in the 1st loop for the
-	   initial triangular region of C (if it exists). */ \
-	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir   ( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* Compute the diagonal offset for the submatrix at (i,j). */ \
-			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly above the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly below the diagonal, we do nothing and
-			   continue. */ \
-			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  MR, \
-				  NR, \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  zero, \
-				  ct, rs_ct, cs_ct, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				/* Scale C and add the result to only the stored part. */ \
-				PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
-				                          m_cur, n_cur, \
-				                          ct,  rs_ct, cs_ct, \
-				                          beta_cast, \
-				                          c11, rs_c,  cs_c ); \
-			} \
-			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  beta_cast, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-			} \
-		} \
-	} \
-\
-	/* If there is no rectangular region, then we're done. */ \
-	if ( n_iter_rct == 0 ) return; \
-\
-	/* Determine the thread range and increment for the 2nd loop of the
-	   remaining rectangular region of C (and also use default partitioning
-	   for the 1st loop).
-	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
-	   slab or round-robin partitioning was requested at configure-time. */ \
-	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-\
-	/* Advance the start and end iteration offsets for the rectangular region
-	   by the number of iterations used for the triangular region. */ \
-	jr_start += n_iter_tri; \
-	jr_end   += n_iter_tri; \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* No need to compute the diagonal offset for the rectangular
-			   region. */ \
-			/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly above the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly below the diagonal, we do nothing and
-			   continue. */ \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  beta_cast, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-			} \
-		} \
-	} \
-}
+	*/
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If the current panel of C is entirely below the diagonal,
+	// it is not stored. So we do nothing.
+	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return;
+
+	// If there is a zero region to the left of where the diagonal of C
+	// intersects the top edge of the panel, adjust the pointer to C and B
+	// and treat this case as if the diagonal offset were zero.
+	// NOTE: It's possible that after this pruning that the diagonal offset
+	// is still positive (though it is guaranteed to be less than NR).
+	if ( diagoffc > 0 )
+	{
+		dim_t jp       = diagoffc / NR;
+		dim_t j        = jp * NR;
+		      n        = n - j;
+		      diagoffc = diagoffc % NR;
+		      c_cast   = c_cast + (j  )*cs_c*dt_size;
+		      b_cast   = b_cast + (jp )*ps_b*dt_size;
+	}
+
+	// If there is a zero region below where the diagonal of C intersects
+	// the right edge of the panel, shrink it to prevent "no-op" iterations
+	// from executing.
+	if ( -diagoffc + n < m )
+	{
+		m = -diagoffc + n;
+	}
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	dim_t n_iter = n / NR;
+	dim_t n_left = n % NR;
+
+	dim_t m_iter = m / MR;
+	dim_t m_left = m % MR;
+
+	if ( n_left ) ++n_iter;
+	if ( m_left ) ++m_iter;
+
+	// Determine some increments used to step through A, B, and C.
+	inc_t rstep_a = ps_a * dt_size;
+
+	inc_t cstep_b = ps_b * dt_size;
+
+	inc_t rstep_c = rs_c * MR * dt_size;
+	inc_t cstep_c = cs_c * NR * dt_size;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	auxinfo_t aux;
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	// Save the imaginary stride of A and B to the auxinfo_t object.
+	bli_auxinfo_set_is_a( is_a, &aux );
+	bli_auxinfo_set_is_b( is_b, &aux );
+
+	// Save the desired output datatype (indicating no typecasting).
+	//bli_auxinfo_set_dt_on_output( dt, &aux );*/
+
+	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	// loop around the microkernel. Here we query the thrinfo_t node for the
+	// 1st (ir) loop around the microkernel.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+
+	// Query the number of threads and thread ids for each loop.
+	dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	dim_t jr_tid = bli_thrinfo_work_id( thread );
+	dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	dim_t ir_tid = bli_thrinfo_work_id( caucus );
+
+	dim_t jr_start, jr_end;
+	dim_t ir_start, ir_end;
+	dim_t jr_inc,   ir_inc;
 
-INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 )
+	// Note that we partition the 2nd loop into two regions: the triangular
+	// part of C, and the rectangular portion.
+	dim_t n_iter_tri;
+	dim_t n_iter_rct;
+
+	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) )
+	{
+		// If the entire panel of C does not intersect the diagonal, there is
+		// no triangular region, and therefore we can skip the first set of
+		// loops.
+		n_iter_tri = 0;
+		n_iter_rct = n_iter;
+	}
+	else
+	{
+		// If the panel of C does intersect the diagonal, compute the number of
+		// iterations in the triangular (or trapezoidal) region by dividing NR
+		// into the number of rows in C. A non-zero remainder means we need to
+		// add one additional iteration. That is, we want the triangular region
+		// to contain as few columns of whole microtiles as possible while still
+		// including all microtiles that intersect the diagonal. The number of
+		// iterations in the rectangular region is computed as the remaining
+		// number of iterations in the n dimension.
+		n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 );
+		n_iter_rct = n_iter - n_iter_tri;
+	}
+
+	// Use round-robin assignment of micropanels to threads in the 2nd loop
+	// and the default (slab or rr) partitioning in the 1st loop for the
+	// initial triangular region of C (if it exists).
+	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_jrir   ( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc );
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
+	{
+		const char* b1 = b_cast + j * cstep_b;
+		      char* c1 = c_cast + j * cstep_c;
+
+		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		// Interior loop over the m dimension (MR rows at a time).
+		for ( dim_t i = ir_start; i < ir_end; i += ir_inc )
+		{
+			const char* a1  = a_cast + i * rstep_a;
+			      char* c11 = c1     + i * rstep_c;
+
+			// Compute the diagonal offset for the submatrix at (i,j).
+			doff_t diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;
+
+			dim_t  m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+			// Compute the addresses of the next panels of A and B.
+			const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc );
+			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) )
+			{
+				a2 = a_cast;
+				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc );
+				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) )
+					b2 = b_cast;
+			}
+
+			// Save addresses of next panels of A and B to the auxinfo_t
+			// object.
+			bli_auxinfo_set_next_a( a2, &aux );
+			bli_auxinfo_set_next_b( b2, &aux );
+
+			// If the diagonal intersects the current MR x NR submatrix, we
+			// compute it the temporary buffer and then add in the elements
+			// on or below the diagonal.
+			// Otherwise, if the submatrix is strictly above the diagonal,
+			// we compute and store as we normally would.
+			// And if we're strictly below the diagonal, we do nothing and
+			// continue.
+			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) )
+			{
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  MR,
+				  NR,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )zero,
+				  ct, rs_ct, cs_ct,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				// Scale C and add the result to only the stored part.
+				xpbys_mxn_u_ukr( diagoffc_ij,
+				                 m_cur, n_cur,
+				                 ct,  rs_ct, cs_ct,
+				                 ( void* )beta_cast,
+				                 c11, rs_c,  cs_c );
+			}
+			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) )
+			{
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+			}
+		}
+	}
+
+	// If there is no rectangular region, then we're done.
+	if ( n_iter_rct == 0 ) return;
+
+	// Determine the thread range and increment for the 2nd loop of the
+	// remaining rectangular region of C (and also use default partitioning
+	// for the 1st loop).
+	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// slab or round-robin partitioning was requested at configure-time.
+	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+
+	// Advance the start and end iteration offsets for the rectangular region
+	// by the number of iterations used for the triangular region.
+	jr_start += n_iter_tri;
+	jr_end   += n_iter_tri;
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
+	{
+		const char* b1 = b_cast + j * cstep_b;
+		      char* c1 = c_cast + j * cstep_c;
+
+		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		// Interior loop over the m dimension (MR rows at a time).
+		for ( dim_t i = ir_start; i < ir_end; i += ir_inc )
+		{
+			const char* a1  = a_cast + i * rstep_a;
+			      char* c11 = c1     + i * rstep_c;
+
+			// No need to compute the diagonal offset for the rectangular
+			// region.
+			//diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/
+
+			dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+			// Compute the addresses of the next panels of A and B.
+			const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc );
+			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) )
+			{
+				a2 = a_cast;
+				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc );
+				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+					b2 = b_cast;
+			}
+
+			// Save addresses of next panels of A and B to the auxinfo_t
+			// object.
+			bli_auxinfo_set_next_a( a2, &aux );
+			bli_auxinfo_set_next_b( b2, &aux );
+
+			// If the diagonal intersects the current MR x NR submatrix, we
+			// compute it the temporary buffer and then add in the elements
+			// on or below the diagonal.
+			// Otherwise, if the submatrix is strictly above the diagonal,
+			// we compute and store as we normally would.
+			// And if we're strictly below the diagonal, we do nothing and
+			// continue.
+			{
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+			}
+		}
+	}
+}
 
diff --git a/frame/3/gemmt/bli_gemmt_var.h b/frame/3/gemmt/bli_gemmt_var.h
index 98d8f5563..eb6e16018 100644
--- a/frame/3/gemmt/bli_gemmt_var.h
+++ b/frame/3/gemmt/bli_gemmt_var.h
@@ -47,8 +47,7 @@ void PASTEMAC0(opname) \
        const obj_t*  ah, \
        const obj_t*  c, \
        const cntx_t* cntx, \
-             rntm_t* rntm, \
-             cntl_t* cntl, \
+       const cntl_t* cntl, \
              thrinfo_t* thread  \
      );
 
@@ -81,7 +80,6 @@ void PASTEMAC(ch,varname) \
        void*   beta, \
        void*   c, inc_t rs_c, inc_t cs_c, \
        cntx_t* cntx, \
-       rntm_t* rntm, \
        thrinfo_t* thread  \
      );
 
diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c
index 76fe106b0..207e1c938 100644
--- a/frame/3/gemmt/bli_gemmt_x_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c
@@ -46,20 +46,18 @@ void bli_gemmt_x_ker_var2
        const obj_t*  ah,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
+       const cntl_t* cntl,
              thrinfo_t* thread
      )
 {
-	dim_t      uplo;
-	l3_var_oft f;
+	dim_t uplo;
 
 	// Set a bool based on the uplo field of C's root object.
 	if ( bli_obj_root_is_lower( c ) ) uplo = 0;
 	else                              uplo = 1;
 
 	// Index into the variant array to extract the correct function pointer.
-	f = vars[uplo];
+	l3_var_oft f = vars[uplo];
 
 	// Call the macrokernel.
 	f
@@ -68,7 +66,6 @@ void bli_gemmt_x_ker_var2
 	  ah,
 	  c,
 	  cntx,
-	  rntm,
 	  cntl,
 	  thread
 	);
diff --git a/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c
index ece351ef7..64df59e88 100644
--- a/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c
+++ b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c
@@ -284,10 +284,10 @@ void PASTEMAC(ch,varname) \
 	c1 = c_cast; \
 \
 	thrinfo_t* caucus    = bli_thrinfo_sub_node( thread ); \
-	dim_t jr_num_threads = bli_thread_n_way( thread ); \
-	dim_t jr_thread_id   = bli_thread_work_id( thread ); \
-	dim_t ir_num_threads = bli_thread_n_way( caucus ); \
-	dim_t ir_thread_id   = bli_thread_work_id( caucus ); \
+	dim_t jr_num_threads = bli_thrinfo_n_way( thread ); \
+	dim_t jr_thread_id   = bli_thrinfo_work_id( thread ); \
+	dim_t ir_num_threads = bli_thrinfo_n_way( caucus ); \
+	dim_t ir_thread_id   = bli_thrinfo_work_id( caucus ); \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
diff --git a/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c
index f00e769b5..d5acec3b8 100644
--- a/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c
+++ b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c
@@ -284,10 +284,10 @@ void PASTEMAC(ch,varname) \
 	c1 = c_cast; \
 \
 	thrinfo_t* caucus    = bli_thrinfo_sub_node( thread ); \
-	dim_t jr_num_threads = bli_thread_n_way( thread ); \
-	dim_t jr_thread_id   = bli_thread_work_id( thread ); \
-	dim_t ir_num_threads = bli_thread_n_way( caucus ); \
-	dim_t ir_thread_id   = bli_thread_work_id( caucus ); \
+	dim_t jr_num_threads = bli_thrinfo_n_way( thread ); \
+	dim_t jr_thread_id   = bli_thrinfo_work_id( thread ); \
+	dim_t ir_num_threads = bli_thrinfo_n_way( caucus ); \
+	dim_t ir_thread_id   = bli_thrinfo_work_id( caucus ); \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c
index c39703503..a281ddade 100644
--- a/frame/3/hemm/bli_hemm_front.c
+++ b/frame/3/hemm/bli_hemm_front.c
@@ -43,8 +43,7 @@ void bli_hemm_front
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -161,8 +160,7 @@ void bli_hemm_front
 	  beta,
 	  &c_local,
 	  cntx,
-	  rntm,
-	  cntl
+	  rntm
 	);
 }
 
diff --git a/frame/3/hemm/bli_hemm_front.h b/frame/3/hemm/bli_hemm_front.h
index 63eb91cd3..2ccd8e0c8 100644
--- a/frame/3/hemm/bli_hemm_front.h
+++ b/frame/3/hemm/bli_hemm_front.h
@@ -41,6 +41,5 @@ void bli_hemm_front
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             rntm_t* rntm
      );
diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c
index c9aada989..1ee5e0a7f 100644
--- a/frame/3/symm/bli_symm_front.c
+++ b/frame/3/symm/bli_symm_front.c
@@ -43,8 +43,7 @@ void bli_symm_front
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -160,8 +159,7 @@ void bli_symm_front
 	  beta,
 	  &c_local,
 	  cntx,
-	  rntm,
-	  cntl
+	  rntm
 	);
 }
 
diff --git a/frame/3/symm/bli_symm_front.h b/frame/3/symm/bli_symm_front.h
index 417cb9acb..585ec1025 100644
--- a/frame/3/symm/bli_symm_front.h
+++ b/frame/3/symm/bli_symm_front.h
@@ -41,6 +41,5 @@ void bli_symm_front
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             rntm_t* rntm
      );
diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c
index edd4ce1ef..d351e78e1 100644
--- a/frame/3/trmm/bli_trmm_front.c
+++ b/frame/3/trmm/bli_trmm_front.c
@@ -42,8 +42,7 @@ void bli_trmm_front
        const obj_t*  a,
        const obj_t*  b,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -179,8 +178,7 @@ void bli_trmm_front
 	  &BLIS_ZERO,
 	  &c_local,
 	  cntx,
-	  rntm,
-	  cntl
+	  rntm
 	);
 }
 
diff --git a/frame/3/trmm/bli_trmm_front.h b/frame/3/trmm/bli_trmm_front.h
index cfefdd39b..f13d4c34b 100644
--- a/frame/3/trmm/bli_trmm_front.h
+++ b/frame/3/trmm/bli_trmm_front.h
@@ -39,6 +39,5 @@ void bli_trmm_front
        const obj_t*  a,
        const obj_t*  b,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             rntm_t* rntm
      );
diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c
index f5476b2ca..3bc4e3c6b 100644
--- a/frame/3/trmm/bli_trmm_ll_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c
@@ -35,50 +35,27 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T gemm_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffa,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
-
-
 void bli_trmm_ll_ker_var2
      (
        const obj_t*  a,
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const cntl_t* cntl,
+             thrinfo_t* thread_par
      )
 {
-	const num_t     dt_exec   = bli_obj_exec_dt( c );
+	const num_t     dt        = bli_obj_exec_dt( c );
+	const dim_t     dt_size   = bli_dt_size( dt );
 
-	const doff_t    diagoffa  = bli_obj_diag_offset( a );
+	      doff_t    diagoffa  = bli_obj_diag_offset( a );
 
 	const pack_t    schema_a  = bli_obj_pack_schema( a );
 	const pack_t    schema_b  = bli_obj_pack_schema( b );
 
-	const dim_t     m         = bli_obj_length( c );
-	const dim_t     n         = bli_obj_width( c );
-	const dim_t     k         = bli_obj_width( a );
+	      dim_t     m         = bli_obj_length( c );
+	      dim_t     n         = bli_obj_width( c );
+	      dim_t     k         = bli_obj_width( a );
 
 	const void*     buf_a     = bli_obj_buffer_at_off( a );
 	const inc_t     cs_a      = bli_obj_col_stride( a );
@@ -105,89 +82,23 @@ void bli_trmm_ll_ker_var2
 	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
 	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	ftypes[dt_exec]
-	(
-	  diagoffa,
-	  schema_a,
-	  schema_b,
-	  m,
-	  n,
-	  k,
-	  ( void* )buf_alpha,
-	  ( void* )buf_a, cs_a, pd_a, ps_a,
-	  ( void* )buf_b, rs_b, pd_b, ps_b,
-	  ( void* )buf_beta,
-	           buf_c, rs_c, cs_c,
-	  ( cntx_t* )cntx,
-	  rntm,
-	  thread
-	);
-}
+	// Alias some constants to simpler names.
+	const dim_t     MR         = pd_a;
+	const dim_t     NR         = pd_b;
+	const dim_t     PACKMR     = cs_a;
+	const dim_t     PACKNR     = rs_b;
 
+	// Query the context for the micro-kernel address and cast it to its
+	// function pointer type.
+	gemm_ukr_vft gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+
+	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const char* a_cast     = buf_a;
+	const char* b_cast     = buf_b;
+	      char* c_cast     = buf_c;
+	const char* alpha_cast = buf_alpha;
+	const char* beta_cast  = buf_beta;
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffa, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	const dim_t     PACKMR     = cs_a; \
-	const dim_t     PACKNR     = rs_b; \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	ctype* restrict one        = PASTEMAC(ch,1); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffa_i; \
-	dim_t           k_full; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           k_a1011; \
-	dim_t           off_a1011; \
-	dim_t           i, j; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	inc_t           istep_a; \
-	inc_t           istep_b; \
-	inc_t           ps_a_cur; \
-	inc_t           is_a_cur; \
-	auxinfo_t       aux; \
-\
 	/*
 	   Assumptions/assertions:
 	     rs_a == 1
@@ -200,227 +111,201 @@ void PASTEMAC(ch,varname) \
 	     ps_b == stride to next micro-panel of B
 	     rs_c == (no assumptions)
 	     cs_c == (no assumptions)
-	*/ \
-\
-	/* Safety trap: Certain indexing within this macro-kernel does not
-	   work as intended if both MR and NR are odd. */ \
-	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
-	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current block of A is entirely above the diagonal,
-	   it is implicitly zero. So we do nothing. */ \
-	if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
-\
-	/* Compute k_full. For all trmm, k_full is simply k. This is
-	   needed because some parameter combinations of trmm reduce k
-	   to advance past zero regions in the triangular matrix, and
-	   when computing the imaginary stride of B (the non-triangular
-	   matrix), which is used by 4m1/3m1 implementations, we need
-	   this unreduced value of k. */ \
-	k_full = k; \
-\
-	/* If there is a zero region above where the diagonal of A intersects the
-	   left edge of the block, adjust the pointer to C and treat this case as
-	   if the diagonal offset were zero. This skips over the region that was
-	   not packed. (Note we assume the diagonal offset is a multiple of MR;
-	   this assumption will hold as long as the cache blocksizes are each a
-	   multiple of MR and NR.) */ \
-	if ( diagoffa < 0 ) \
-	{ \
-		i        = -diagoffa; \
-		m        = m - i; \
-		diagoffa = 0; \
-		c_cast   = c_cast + (i  )*rs_c; \
-	} \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	istep_a = PACKMR * k; \
-	istep_b = PACKNR * k_full; \
-\
-	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
-	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_b( istep_b, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	/*dim_t ir_nt  = bli_thread_n_way( ir_thread ); \
-	dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
-\
-	dim_t jr_start, jr_end; \
-	/*dim_t ir_start, ir_end;*/ \
-	dim_t jr_inc; \
-\
-	/* Determine the thread range and increment for the 2nd loop.
-	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
-	   slab or round-robin partitioning was requested at configure-time. \
-	   NOTE: Parallelism in the 1st loop is disabled for now. */ \
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		a1  = a_cast; \
-		c11 = c1; \
-\
-		/* Loop over the m dimension (MR rows at a time). */ \
-		for ( i = 0; i < m_iter; ++i ) \
-		{ \
-			diagoffa_i = diagoffa + ( doff_t )i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* If the current panel of A intersects the diagonal, scale C
-			   by beta. If it is strictly below the diagonal, scale by one.
-			   This allows the current macro-kernel to work for both trmm
-			   and trmm3. */ \
-			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
-			{ \
-				ctype* restrict b1_i; \
-				ctype* restrict a2; \
-\
-				/* Determine the offset to and length of the panel that was
-				   packed so we can index into the corresponding location in
-				   b1. */ \
-				off_a1011 = 0; \
-				k_a1011   = bli_min( diagoffa_i + MR, k ); \
-\
-				/* Compute the panel stride for the current diagonal-
-				   intersecting micro-panel. */ \
-				is_a_cur  = k_a1011 * PACKMR; \
-				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
-				ps_a_cur  = is_a_cur; \
-\
-				/* NOTE: ir loop parallelism disabled for now. */ \
-				/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
-\
-				b1_i = b1 + off_a1011 * PACKNR; \
-\
-				/* Compute the addresses of the next panels of A and B. */ \
-				a2 = a1; \
-				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
-				{ \
-					a2 = a_cast; \
-					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
-						b2 = b_cast; \
-				} \
-\
-				/* Save addresses of next panels of A and B to the auxinfo_t
-				   object. */ \
-				bli_auxinfo_set_next_a( a2, &aux ); \
-				bli_auxinfo_set_next_b( b2, &aux ); \
-\
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k_a1011, \
-				  alpha_cast, \
-				  a1, \
-				  b1_i, \
-				  beta_cast, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-				/*}*/ \
-\
-				a1 += ps_a_cur; \
-			} \
-			else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
-			{ \
-				/* NOTE: ir loop parallelism disabled for now. */ \
-				/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
-\
-				ctype* restrict a2; \
-\
-				/* Compute the addresses of the next panels of A and B. */ \
-				a2 = a1; \
-				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
-				{ \
-					a2 = a_cast; \
-					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
-						b2 = b_cast; \
-				} \
-\
-				/* Save addresses of next panels of A and B to the auxinfo_t
-				   object. */ \
-				bli_auxinfo_set_next_a( a2, &aux ); \
-				bli_auxinfo_set_next_b( b2, &aux ); \
-\
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  one, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-				/*}*/ \
-\
-				a1 += rstep_a; \
-			} \
-\
-			c11 += rstep_c; \
-		} \
-	} \
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
+	*/
+
+	// Safety trap: Certain indexing within this macro-kernel does not
+	// work as intended if both MR and NR are odd.
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) ||
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort();
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If the current block of A is entirely above the diagonal,
+	// it is implicitly zero. So we do nothing.
+	if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return;
+
+	// If there is a zero region above where the diagonal of A intersects the
+	// left edge of the block, adjust the pointer to C and treat this case as
+	// if the diagonal offset were zero. This skips over the region that was
+	// not packed. (Note we assume the diagonal offset is a multiple of MR;
+	// this assumption will hold as long as the cache blocksizes are each a
+	// multiple of MR and NR.)
+	if ( diagoffa < 0 )
+	{
+		m        += diagoffa;
+		c_cast   -= diagoffa * rs_c * dt_size;
+		diagoffa  = 0;
+	}
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	dim_t n_iter = n / NR;
+	dim_t n_left = n % NR;
+
+	dim_t m_iter = m / MR;
+	dim_t m_left = m % MR;
+
+	if ( n_left ) ++n_iter;
+	if ( m_left ) ++m_iter;
+
+	// Determine some increments used to step through A, B, and C.
+	inc_t rstep_a = ps_a * dt_size;
+
+	inc_t cstep_b = ps_b * dt_size;
+
+	inc_t rstep_c = rs_c * MR * dt_size;
+	inc_t cstep_c = cs_c * NR * dt_size;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	auxinfo_t aux;
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	// The 'thread_par' argument is the parent of the thrinfo_t node for the
+	// 2nd (jr) loop around the microkernel, which is queried below as 'thread'.
+	// The node for the 1st (ir) loop is not queried (ir parallelism disabled):
+	//thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );
+
+	// Query the number of threads and thread ids for each loop.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	dim_t jr_tid = bli_thrinfo_work_id( thread );
+	//dim_t ir_nt  = bli_thrinfo_n_way( ir_thread );
+	//dim_t ir_tid = bli_thrinfo_work_id( ir_thread );
+
+	dim_t jr_start, jr_end;
+	//dim_t ir_start, ir_end;
+	dim_t jr_inc;
+
+	// Determine the thread range and increment for the 2nd loop.
+	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// slab or round-robin partitioning was requested at configure-time.
+	// NOTE: Parallelism in the 1st loop is disabled for now.
+	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	//bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
+	{
+		const char* b1 = b_cast + j * cstep_b;
+		      char* c1 = c_cast + j * cstep_c;
+
+		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		const char* a1  = a_cast;
+		      char* c11 = c1;
+
+		// Loop over the m dimension (MR rows at a time).
+		for ( dim_t i = 0; i < m_iter; ++i )
+		{
+			doff_t diagoffa_i = diagoffa + ( doff_t )i*MR;
+
+			dim_t  m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+			// If the current panel of A intersects the diagonal, scale C
+			// by beta. If it is strictly below the diagonal, scale by one.
+			// This allows the current macro-kernel to work for both trmm
+			// and trmm3.
+			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) )
+			{
+				// Determine the offset to and length of the panel that was
+				// packed so we can index into the corresponding location in
+				// b1.
+				dim_t off_a1011 = 0;
+				dim_t k_a1011   = bli_min( diagoffa_i + MR, k );
+
+				// Compute the panel stride for the current diagonal-
+				// intersecting micro-panel.
+				inc_t ps_a_cur  = k_a1011 * PACKMR;
+				      ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
+				      ps_a_cur *= dt_size;
+
+				// NOTE: ir loop parallelism disabled for now.
+				//if ( bli_trmm_my_iter( i, ir_thread ) ) {
+
+				const char* b1_i = b1 + off_a1011 * PACKNR * dt_size;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = a1;
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
+				{
+					a2 = a_cast;
+					b2 = b1;
+					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k_a1011,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1_i,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+				//}
+
+				a1 += ps_a_cur;
+			}
+			else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) )
+			{
+				// NOTE: ir loop parallelism disabled for now.
+				//if ( bli_trmm_my_iter( i, ir_thread ) ) {
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = a1;
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
+				{
+					a2 = a_cast;
+					b2 = b1;
+					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )one,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+				//}
+
+				a1 += rstep_a;
+			}
+
+			c11 += rstep_c;
+		}
+	}
 }
 
-INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 )
+//PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );
+//PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );
 
diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c
index df5b2dac5..265e21a66 100644
--- a/frame/3/trmm/bli_trmm_lu_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c
@@ -35,50 +35,27 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T gemm_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffa,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
-
-
 void bli_trmm_lu_ker_var2
      (
        const obj_t*  a,
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const cntl_t* cntl,
+             thrinfo_t* thread_par
      )
 {
-	const num_t     dt_exec   = bli_obj_exec_dt( c );
+	const num_t     dt        = bli_obj_exec_dt( c );
+	const dim_t     dt_size   = bli_dt_size( dt );
 
-	const doff_t    diagoffa  = bli_obj_diag_offset( a );
+	      doff_t    diagoffa  = bli_obj_diag_offset( a );
 
 	const pack_t    schema_a  = bli_obj_pack_schema( a );
 	const pack_t    schema_b  = bli_obj_pack_schema( b );
 
-	const dim_t     m         = bli_obj_length( c );
-	const dim_t     n         = bli_obj_width( c );
-	const dim_t     k         = bli_obj_width( a );
+	      dim_t     m         = bli_obj_length( c );
+	      dim_t     n         = bli_obj_width( c );
+	      dim_t     k         = bli_obj_width( a );
 
 	const void*     buf_a     = bli_obj_buffer_at_off( a );
 	const inc_t     cs_a      = bli_obj_col_stride( a );
@@ -105,89 +82,23 @@ void bli_trmm_lu_ker_var2
 	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
 	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	ftypes[dt_exec]
-	(
-	  diagoffa,
-	  schema_a,
-	  schema_b,
-	  m,
-	  n,
-	  k,
-	  ( void* )buf_alpha,
-	  ( void* )buf_a, cs_a, pd_a, ps_a,
-	  ( void* )buf_b, rs_b, pd_b, ps_b,
-	  ( void* )buf_beta,
-	           buf_c, rs_c, cs_c,
-	  ( cntx_t* )cntx,
-	  rntm,
-	  thread
-	);
-}
+	// Alias some constants to simpler names.
+	const dim_t     MR         = pd_a;
+	const dim_t     NR         = pd_b;
+	const dim_t     PACKMR     = cs_a;
+	const dim_t     PACKNR     = rs_b;
+
+	// Query the context for the micro-kernel address and cast it to its
+	// function pointer type.
+	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
 
+	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const char* a_cast     = buf_a;
+	const char* b_cast     = buf_b;
+	      char* c_cast     = buf_c;
+	const char* alpha_cast = buf_alpha;
+	const char* beta_cast  = buf_beta;
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffa, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	const dim_t     PACKMR     = cs_a; \
-	const dim_t     PACKNR     = rs_b; \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	ctype* restrict one        = PASTEMAC(ch,1); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffa_i; \
-	dim_t           k_full; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           k_a1112; \
-	dim_t           off_a1112; \
-	dim_t           i, j; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	inc_t           istep_a; \
-	inc_t           istep_b; \
-	inc_t           ps_a_cur; \
-	inc_t           is_a_cur; \
-	auxinfo_t       aux; \
-\
 	/*
 	   Assumptions/assertions:
 	     rs_a == 1
@@ -200,235 +111,208 @@ void PASTEMAC(ch,varname) \
 	     ps_b == stride to next micro-panel of B
 	     rs_c == (no assumptions)
 	     cs_c == (no assumptions)
-	*/ \
-\
-	/* Safety trap: Certain indexing within this macro-kernel does not
-	   work as intended if both MR and NR are odd. */ \
-	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
-	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current block of A is entirely below the diagonal,
-	   it is implicitly zero. So we do nothing. */ \
-	if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
-\
-	/* Compute k_full. For all trmm, k_full is simply k. This is
-	   needed because some parameter combinations of trmm reduce k
-	   to advance past zero regions in the triangular matrix, and
-	   when computing the imaginary stride of B (the non-triangular
-	   matrix), which is used by 4m1/3m1 implementations, we need
-	   this unreduced value of k. */ \
-	k_full = k; \
-\
-	/* If there is a zero region to the left of where the diagonal of A
-	   intersects the top edge of the block, adjust the pointer to B and
-	   treat this case as if the diagonal offset were zero. Note that we
-	   don't need to adjust the pointer to A since packm would have simply
-	   skipped over the region that was not stored. */ \
-	if ( diagoffa > 0 ) \
-	{ \
-		i        = diagoffa; \
-		k        = k - i; \
-		diagoffa = 0; \
-		b_cast   = b_cast + i * PACKNR; \
-	} \
-\
-	/* If there is a zero region below where the diagonal of A intersects the
-	   right side of the block, shrink it to prevent "no-op" iterations from
-	   executing. */ \
-	if ( -diagoffa + k < m ) \
-	{ \
-		m = -diagoffa + k; \
-	} \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	istep_a = PACKMR * k; \
-	istep_b = PACKNR * k_full; \
-\
-	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
-	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_b( istep_b, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	/*dim_t ir_nt  = bli_thread_n_way( ir_thread ); \
-	dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
-\
-	dim_t jr_start, jr_end; \
-	/*dim_t ir_start, ir_end;*/ \
-	dim_t jr_inc; \
-\
-	/* Determine the thread range and increment for the 2nd loop.
-	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
-	   slab or round-robin partitioning was requested at configure-time. \
-	   NOTE: Parallelism in the 1st loop is disabled for now. */ \
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		a1  = a_cast; \
-		c11 = c1; \
-\
-		/* Loop over the m dimension (MR rows at a time). */ \
-		for ( i = 0; i < m_iter; ++i ) \
-		{ \
-			diagoffa_i = diagoffa + ( doff_t )i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* If the current panel of A intersects the diagonal, scale C
-			   by beta. If it is strictly above the diagonal, scale by one.
-			   This allows the current macro-kernel to work for both trmm
-			   and trmm3. */ \
-			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
-			{ \
-				ctype* restrict b1_i; \
-				ctype* restrict a2; \
-\
-				/* Determine the offset to and length of the panel that was
-				   packed so we can index into the corresponding location in
-				   b1. */ \
-				off_a1112 = diagoffa_i; \
-				k_a1112   = k - off_a1112; \
-\
-				/* Compute the panel stride for the current diagonal-
-				   intersecting micro-panel. */ \
-				is_a_cur  = k_a1112 * PACKMR; \
-				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
-				ps_a_cur  = is_a_cur; \
-\
-				/* NOTE: ir loop parallelism disabled for now. */ \
-				/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
-\
-				b1_i = b1 + off_a1112 * PACKNR; \
-\
-				/* Compute the addresses of the next panels of A and B. */ \
-				a2 = a1; \
-				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
-				{ \
-					a2 = a_cast; \
-					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
-						b2 = b_cast; \
-				} \
-\
-				/* Save addresses of next panels of A and B to the auxinfo_t
-				   object. */ \
-				bli_auxinfo_set_next_a( a2, &aux ); \
-				bli_auxinfo_set_next_b( b2, &aux ); \
-\
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k_a1112, \
-				  alpha_cast, \
-				  a1, \
-				  b1_i, \
-				  beta_cast, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-				/*}*/ \
-\
-				a1 += ps_a_cur; \
-			} \
-			else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
-			{ \
-				/* NOTE: ir loop parallelism disabled for now. */ \
-				/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
-\
-				ctype* restrict a2; \
-\
-				/* Compute the addresses of the next panels of A and B. */ \
-				a2 = a1; \
-				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
-				{ \
-					a2 = a_cast; \
-					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
-						b2 = b_cast; \
-				} \
-\
-				/* Save addresses of next panels of A and B to the auxinfo_t
-				   object. */ \
-				bli_auxinfo_set_next_a( a2, &aux ); \
-				bli_auxinfo_set_next_b( b2, &aux ); \
-\
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  one, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-				/*}*/ \
-\
-				a1 += rstep_a; \
-			} \
-\
-			c11 += rstep_c; \
-		} \
-	} \
-\
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
+	*/
+
+	// Safety trap: Certain indexing within this macro-kernel does not
+	// work as intended if both MR and NR are odd.
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) ||
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort();
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If the current block of A is entirely below the diagonal,
+	// it is implicitly zero. So we do nothing.
+	if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return;
+
+	// If there is a zero region to the left of where the diagonal of A
+	// intersects the top edge of the block, adjust the pointer to B and
+	// treat this case as if the diagonal offset were zero. Note that we
+	// don't need to adjust the pointer to A since packm would have simply
+	// skipped over the region that was not stored.
+	if ( diagoffa > 0 )
+	{
+		k        -= diagoffa;
+		b_cast   += diagoffa * PACKNR * dt_size;
+		diagoffa  = 0;
+	}
+
+	// If there is a zero region below where the diagonal of A intersects the
+	// right side of the block, shrink it to prevent "no-op" iterations from
+	// executing.
+	if ( -diagoffa + k < m )
+	{
+		m = -diagoffa + k;
+	}
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	dim_t n_iter = n / NR;
+	dim_t n_left = n % NR;
+
+	dim_t m_iter = m / MR;
+	dim_t m_left = m % MR;
+
+	if ( n_left ) ++n_iter;
+	if ( m_left ) ++m_iter;
+
+	// Determine some increments used to step through A, B, and C.
+	inc_t rstep_a = ps_a * dt_size;
+
+	inc_t cstep_b = ps_b * dt_size;
+
+	inc_t rstep_c = rs_c * MR * dt_size;
+	inc_t cstep_c = cs_c * NR * dt_size;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	auxinfo_t aux;
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	// The 'thread_par' argument is the parent of the thrinfo_t node for the
+	// 2nd (jr) loop around the microkernel, which is queried below as 'thread'.
+	// The node for the 1st (ir) loop is not queried (ir parallelism disabled):
+	//thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );
+
+	// Query the number of threads and thread ids for each loop.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	dim_t jr_tid = bli_thrinfo_work_id( thread );
+	//dim_t ir_nt  = bli_thrinfo_n_way( ir_thread );
+	//dim_t ir_tid = bli_thrinfo_work_id( ir_thread );
+
+	dim_t jr_start, jr_end;
+	//dim_t ir_start, ir_end;
+	dim_t jr_inc;
+
+	// Determine the thread range and increment for the 2nd loop.
+	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// slab or round-robin partitioning was requested at configure-time.
+	// NOTE: Parallelism in the 1st loop is disabled for now.
+	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	//bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
+	{
+		const char* b1 = b_cast + j * cstep_b;
+		      char* c1 = c_cast + j * cstep_c;
+
+		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		const char* a1  = a_cast;
+		      char* c11 = c1;
+
+		// Loop over the m dimension (MR rows at a time).
+		for ( dim_t i = 0; i < m_iter; ++i )
+		{
+			doff_t diagoffa_i = diagoffa + ( doff_t )i*MR;
+
+			dim_t  m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+			// If the current panel of A intersects the diagonal, scale C
+			// by beta. If it is strictly above the diagonal, scale by one.
+			// This allows the current macro-kernel to work for both trmm
+			// and trmm3.
+			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) )
+			{
+				// Determine the offset to and length of the panel that was
+				// packed so we can index into the corresponding location in
+				// b1.
+				dim_t off_a1112 = diagoffa_i;
+				dim_t k_a1112   = k - off_a1112;
+
+				// Compute the panel stride for the current diagonal-
+				// intersecting micro-panel.
+				inc_t ps_a_cur  = k_a1112 * PACKMR;
+				      ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
+				      ps_a_cur *= dt_size;
+
+				// NOTE: ir loop parallelism disabled for now.
+				//if ( bli_trmm_my_iter( i, ir_thread ) ) {
+
+				const char* b1_i = b1 + off_a1112 * PACKNR * dt_size;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = a1;
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
+				{
+					a2 = a_cast;
+					b2 = b1;
+					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k_a1112,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1_i,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+				//}
+
+				a1 += ps_a_cur;
+			}
+			else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) )
+			{
+				// NOTE: ir loop parallelism disabled for now.
+				//if ( bli_trmm_my_iter( i, ir_thread ) ) {
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = a1;
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
+				{
+					a2 = a_cast;
+					b2 = b1;
+					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )one,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+				//}
+
+				a1 += rstep_a;
+			}
+
+			c11 += rstep_c;
+		}
+	}
 }
 
-INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 )
+//PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );
+//PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );
 
diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c
index 89f86aa3a..785f2cf5f 100644
--- a/frame/3/trmm/bli_trmm_rl_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c
@@ -35,50 +35,27 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T gemm_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffb,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
-
-
 void bli_trmm_rl_ker_var2
      (
        const obj_t*  a,
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const cntl_t* cntl,
+             thrinfo_t* thread_par
      )
 {
-	const num_t     dt_exec   = bli_obj_exec_dt( c );
+	const num_t     dt        = bli_obj_exec_dt( c );
+	const dim_t     dt_size   = bli_dt_size( dt );
 
-	const doff_t    diagoffb  = bli_obj_diag_offset( b );
+	      doff_t    diagoffb  = bli_obj_diag_offset( b );
 
 	const pack_t    schema_a  = bli_obj_pack_schema( a );
 	const pack_t    schema_b  = bli_obj_pack_schema( b );
 
-	const dim_t     m         = bli_obj_length( c );
-	const dim_t     n         = bli_obj_width( c );
-	const dim_t     k         = bli_obj_width( a );
+	      dim_t     m         = bli_obj_length( c );
+	      dim_t     n         = bli_obj_width( c );
+	      dim_t     k         = bli_obj_width( a );
 
 	const void*     buf_a     = bli_obj_buffer_at_off( a );
 	const inc_t     cs_a      = bli_obj_col_stride( a );
@@ -105,89 +82,23 @@ void bli_trmm_rl_ker_var2
 	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
 	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	ftypes[dt_exec]
-	(
-	  diagoffb,
-	  schema_a,
-	  schema_b,
-	  m,
-	  n,
-	  k,
-	  ( void* )buf_alpha,
-	  ( void* )buf_a, cs_a, pd_a, ps_a,
-	  ( void* )buf_b, rs_b, pd_b, ps_b,
-	  ( void* )buf_beta,
-	           buf_c, rs_c, cs_c,
-	  ( cntx_t* )cntx,
-	  rntm,
-	  thread
-	);
-}
+	// Alias some constants to simpler names.
+	const dim_t     MR         = pd_a;
+	const dim_t     NR         = pd_b;
+	const dim_t     PACKMR     = cs_a;
+	const dim_t     PACKNR     = rs_b;
+
+	// Query the context for the micro-kernel address and cast it to its
+	// function pointer type.
+	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
 
+	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const char* a_cast     = buf_a;
+	const char* b_cast     = buf_b;
+	      char* c_cast     = buf_c;
+	const char* alpha_cast = buf_alpha;
+	const char* beta_cast  = buf_beta;
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffb, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	const dim_t     PACKMR     = cs_a; \
-	const dim_t     PACKNR     = rs_b; \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	ctype* restrict one        = PASTEMAC(ch,1); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffb_j; \
-	dim_t           k_full; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           k_b1121; \
-	dim_t           off_b1121; \
-	dim_t           i, j; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	inc_t           istep_a; \
-	inc_t           istep_b; \
-	inc_t           ps_b_cur; \
-	inc_t           is_b_cur; \
-	auxinfo_t       aux; \
-\
 	/*
 	   Assumptions/assertions:
 	     rs_a == 1
@@ -200,292 +111,261 @@ void PASTEMAC(ch,varname) \
 	     ps_b == stride to next micro-panel of B
 	     rs_c == (no assumptions)
 	     cs_c == (no assumptions)
-	*/ \
-\
-	/* Safety trap: Certain indexing within this macro-kernel does not
-	   work as intended if both MR and NR are odd. */ \
-	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
-	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of B is entirely above the diagonal,
-	   it is implicitly zero. So we do nothing. */ \
-	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
-\
-	/* Compute k_full. For all trmm, k_full is simply k. This is
-	   needed because some parameter combinations of trmm reduce k
-	   to advance past zero regions in the triangular matrix, and
-	   when computing the imaginary stride of A (the non-triangular
-	   matrix), which is used by 4m1/3m1 implementations, we need
-	   this unreduced value of k. */ \
-	k_full = k; \
-\
-	/* If there is a zero region above where the diagonal of B intersects
-	   the left edge of the panel, adjust the pointer to A and treat this
-	   case as if the diagonal offset were zero. Note that we don't need to
-	   adjust the pointer to B since packm would have simply skipped over
-	   the region that was not stored. */ \
-	if ( diagoffb < 0 ) \
-	{ \
-		j        = -diagoffb; \
-		k        = k - j; \
-		diagoffb = 0; \
-		a_cast   = a_cast + j * PACKMR; \
-	} \
-\
-	/* If there is a zero region to the right of where the diagonal
-	   of B intersects the bottom of the panel, shrink it to prevent
-	   "no-op" iterations from executing. */ \
-	if ( diagoffb + k < n ) \
-	{ \
-		n = diagoffb + k; \
-	} \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	istep_a = PACKMR * k_full; \
-	istep_b = PACKNR * k; \
-\
-	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
-	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( istep_a, &aux ); \
-\
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Note that we partition the 2nd loop into two regions: the rectangular
-	   part of B, and the triangular portion. */ \
-	dim_t n_iter_rct; \
-	dim_t n_iter_tri; \
-\
-	if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \
-	{ \
-		/* If the entire panel of B does not intersect the diagonal, there is
-		   no triangular region, and therefore we can skip the second set of
-		   loops. */ \
-		n_iter_rct = n_iter; \
-		n_iter_tri = 0; \
-	} \
-	else \
-	{ \
-		/* If the panel of B does intersect the diagonal, compute the number of
-		   iterations in the rectangular region by dividing NR into the diagonal
-		   offset. (There should never be any remainder in this division.) The
-		   number of iterations in the triangular (or trapezoidal) region is
-		   computed as the remaining number of iterations in the n dimension. */ \
-		n_iter_rct = diagoffb / NR; \
-		n_iter_tri = n_iter - n_iter_rct; \
-	} \
-\
-	/* Determine the thread range and increment for the 2nd and 1st loops for
-	   the initial rectangular region of B (if it exists).
-	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
-	   slab or round-robin partitioning was requested at configure-time. \
-	   NOTE: Parallelism in the 1st loop is disabled for now. */ \
-	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		{ \
-			/* Loop over the m dimension (MR rows at a time). */ \
-			for ( i = ir_start; i < ir_end; i += ir_inc ) \
-			{ \
-				ctype* restrict a2; \
-\
-				a1  = a_cast + i * rstep_a; \
-				c11 = c1     + i * rstep_c; \
-\
-				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-				/* Compute the addresses of the next panels of A and B. */ \
-				a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-				if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
-				{ \
-					a2 = a_cast; \
-					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
-						b2 = b_cast; \
-				} \
-\
-				/* Save addresses of next panels of A and B to the auxinfo_t
-				   object. */ \
-				bli_auxinfo_set_next_a( a2, &aux ); \
-				bli_auxinfo_set_next_b( b2, &aux ); \
-\
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  one, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-			} \
-		} \
-	} \
-\
-	/* If there is no triangular region, then we're done. */ \
-	if ( n_iter_tri == 0 ) return; \
-\
-	/* Use round-robin assignment of micropanels to threads in the 2nd and
-	   1st loops for the remaining triangular region of B (if it exists).
-	   NOTE: We don't need to call bli_thread_range_jrir_rr() here since we
-	   employ a hack that calls for each thread to execute every iteration
-	   of the jr and ir loops but skip all but the pointer increment for
-	   iterations that are not assigned to it. */ \
-\
-	/* Advance the starting b1 and c1 pointers to the positions corresponding
-	   to the start of the triangular region of B. */ \
-	jr_start = n_iter_rct; \
-	b1 = b_cast + jr_start * cstep_b; \
-	c1 = c_cast + jr_start * cstep_c; \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < n_iter; ++j ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		diagoffb_j = diagoffb - ( doff_t )j*NR; \
-\
-		/* Determine the offset to the beginning of the panel that
-		   was packed so we can index into the corresponding location
-		   in A. Then compute the length of that panel. */ \
-		off_b1121 = bli_max( -diagoffb_j, 0 ); \
-		k_b1121   = k - off_b1121; \
-\
-		a1  = a_cast; \
-		c11 = c1; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* If the current panel of B intersects the diagonal, scale C
-		   by beta. If it is strictly below the diagonal, scale by one.
-		   This allows the current macro-kernel to work for both trmm
-		   and trmm3. */ \
-		{ \
-			/* Compute the panel stride for the current diagonal-
-			   intersecting micro-panel. */ \
-			is_b_cur  = k_b1121 * PACKNR; \
-			is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
-			ps_b_cur  = is_b_cur; \
-\
-			if ( bli_trmm_my_iter_rr( j, thread ) ) { \
-\
-			/* Loop over the m dimension (MR rows at a time). */ \
-			for ( i = 0; i < m_iter; ++i ) \
-			{ \
-				if ( bli_trmm_my_iter_rr( i, caucus ) ) { \
-\
-				ctype* restrict a1_i; \
-				ctype* restrict a2; \
-\
-				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-				a1_i = a1 + off_b1121 * PACKMR; \
-\
-				/* Compute the addresses of the next panels of A and B. */ \
-				a2 = a1; \
-				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
-				{ \
-					a2 = a_cast; \
-					b2 = b1; \
-					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
-						b2 = b_cast; \
-				} \
-\
-				/* Save addresses of next panels of A and B to the auxinfo_t
-				   object. */ \
-				bli_auxinfo_set_next_a( a2, &aux ); \
-				bli_auxinfo_set_next_b( b2, &aux ); \
-\
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k_b1121, \
-				  alpha_cast, \
-				  a1_i, \
-				  b1, \
-				  beta_cast, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-				} \
-\
-				a1  += rstep_a; \
-				c11 += rstep_c; \
-			} \
-			} \
-\
-			b1 += ps_b_cur; \
-		} \
-\
-		c1 += cstep_c; \
-	} \
-\
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
+	*/
+
+	// Safety trap: Certain indexing within this macro-kernel does not
+	// work as intended if both MR and NR are odd.
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) ||
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort();
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If the current panel of B is entirely above the diagonal,
+	// it is implicitly zero. So we do nothing.
+	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return;
+
+	// If there is a zero region above where the diagonal of B intersects
+	// the left edge of the panel, adjust the pointer to A and treat this
+	// case as if the diagonal offset were zero. Note that we don't need to
+	// adjust the pointer to B since packm would have simply skipped over
+	// the region that was not stored.
+	if ( diagoffb < 0 )
+	{
+		k        += diagoffb;
+		a_cast   -= diagoffb * PACKMR * dt_size;
+		diagoffb  = 0;
+	}
+
+	// If there is a zero region to the right of where the diagonal
+	// of B intersects the bottom of the panel, shrink it to prevent
+	// "no-op" iterations from executing.
+	if ( diagoffb + k < n )
+	{
+		n = diagoffb + k;
+	}
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	dim_t n_iter = n / NR;
+	dim_t n_left = n % NR;
+
+	dim_t m_iter = m / MR;
+	dim_t m_left = m % MR;
+
+	if ( n_left ) ++n_iter;
+	if ( m_left ) ++m_iter;
+
+	// Determine some increments used to step through A, B, and C.
+	inc_t rstep_a = ps_a * dt_size;
+
+	inc_t cstep_b = ps_b * dt_size;
+
+	inc_t rstep_c = rs_c * MR * dt_size;
+	inc_t cstep_c = cs_c * NR * dt_size;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	auxinfo_t aux;
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+
+	dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	dim_t jr_tid = bli_thrinfo_work_id( thread );
+	dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	dim_t ir_tid = bli_thrinfo_work_id( caucus );
+
+	dim_t jr_start, jr_end;
+	dim_t ir_start, ir_end;
+	dim_t jr_inc,   ir_inc;
+
+	// Note that we partition the 2nd loop into two regions: the rectangular
+	// part of B, and the triangular portion.
+	dim_t n_iter_rct;
+	dim_t n_iter_tri;
+
+	if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) )
+	{
+		// If the entire panel of B does not intersect the diagonal, there is
+		// no triangular region, and therefore we can skip the second set of
+		// loops.
+		n_iter_rct = n_iter;
+		n_iter_tri = 0;
+	}
+	else
+	{
+		// If the panel of B does intersect the diagonal, compute the number of
+		// iterations in the rectangular region by dividing NR into the diagonal
+		// offset. (There should never be any remainder in this division.) The
+		// number of iterations in the triangular (or trapezoidal) region is
+		// computed as the remaining number of iterations in the n dimension.
+		n_iter_rct = diagoffb / NR;
+		n_iter_tri = n_iter - n_iter_rct;
+	}
+
+	// Determine the thread range and increment for the 2nd and 1st loops for
+	// the initial rectangular region of B (if it exists).
+	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// slab or round-robin partitioning was requested at configure-time.
+	// NOTE: Parallelism in the 1st loop is disabled for now.
+	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc );
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
+	{
+		const char* b1 = b_cast + j * cstep_b;
+		      char* c1 = c_cast + j * cstep_c;
+
+		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		{
+			// Loop over the m dimension (MR rows at a time).
+			for ( dim_t i = ir_start; i < ir_end; i += ir_inc )
+			{
+				const char* a1  = a_cast + i * rstep_a;
+				      char* c11 = c1     + i * rstep_c;
+
+				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc );
+				if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) )
+				{
+					a2 = a_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc );
+					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )one,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+			}
+		}
+	}
+
+	// If there is no triangular region, then we're done.
+	if ( n_iter_tri == 0 ) return;
+
+	// Use round-robin assignment of micropanels to threads in the 2nd and
+	// 1st loops for the remaining triangular region of B (if it exists).
+	// NOTE: We don't need to call bli_thread_range_jrir_rr() here since we
+	// employ a hack that calls for each thread to execute every iteration
+	// of the jr and ir loops but skip all but the pointer increment for
+	// iterations that are not assigned to it.
+
+	// Advance the starting b1 and c1 pointers to the positions corresponding
+	// to the start of the triangular region of B.
+	jr_start = n_iter_rct;
+	const char* b1 = b_cast + jr_start * cstep_b;
+	      char* c1 = c_cast + jr_start * cstep_c;
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = jr_start; j < n_iter; ++j )
+	{
+		doff_t diagoffb_j = diagoffb - ( doff_t )j*NR;
+
+		// Determine the offset to the beginning of the panel that
+		// was packed so we can index into the corresponding location
+		// in A. Then compute the length of that panel.
+		dim_t off_b1121 = bli_max( -diagoffb_j, 0 );
+		dim_t k_b1121   = k - off_b1121;
+
+		const char* a1  = a_cast;
+		      char* c11 = c1;
+
+		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		// If the current panel of B intersects the diagonal, scale C
+		// by beta. If it is strictly below the diagonal, scale by one.
+		// This allows the current macro-kernel to work for both trmm
+		// and trmm3.
+		{
+			// Compute the panel stride for the current diagonal-
+			// intersecting micro-panel.
+			inc_t ps_b_cur  = k_b1121 * PACKNR;
+			      ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
+			      ps_b_cur *= dt_size;
+
+			if ( bli_trmm_my_iter_rr( j, thread ) ) {
+
+			// Loop over the m dimension (MR rows at a time).
+			for ( dim_t i = 0; i < m_iter; ++i )
+			{
+				if ( bli_trmm_my_iter_rr( i, caucus ) ) {
+
+				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+				const char* a1_i = a1 + off_b1121 * PACKMR * dt_size;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = a1;
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
+				{
+					a2 = a_cast;
+					b2 = b1;
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k_b1121,
+				  ( void* )alpha_cast,
+				  ( void* )a1_i,
+				  ( void* )b1,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+				}
+
+				a1  += rstep_a;
+				c11 += rstep_c;
+			}
+			}
+
+			b1 += ps_b_cur;
+		}
+
+		c1 += cstep_c;
+	}
 }
 
-INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 )
+//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1_i, 1, MR, "%4.1f", "" );
+//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1, NR, 1, "%4.1f", "" );
 
diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c
index 4ed38e761..ca27caef1 100644
--- a/frame/3/trmm/bli_trmm_ru_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c
@@ -35,50 +35,27 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T gemm_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffb,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
-
-
 void bli_trmm_ru_ker_var2
      (
        const obj_t*  a,
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const cntl_t* cntl,
+             thrinfo_t* thread_par
      )
 {
-	const num_t     dt_exec   = bli_obj_exec_dt( c );
+	const num_t     dt        = bli_obj_exec_dt( c );
+	const dim_t     dt_size   = bli_dt_size( dt );
 
-	const doff_t    diagoffb  = bli_obj_diag_offset( b );
+	      doff_t    diagoffb  = bli_obj_diag_offset( b );
 
 	const pack_t    schema_a  = bli_obj_pack_schema( a );
 	const pack_t    schema_b  = bli_obj_pack_schema( b );
 
-	const dim_t     m         = bli_obj_length( c );
-	const dim_t     n         = bli_obj_width( c );
-	const dim_t     k         = bli_obj_width( a );
+	      dim_t     m         = bli_obj_length( c );
+	      dim_t     n         = bli_obj_width( c );
+	      dim_t     k         = bli_obj_width( a );
 
 	const void*     buf_a     = bli_obj_buffer_at_off( a );
 	const inc_t     cs_a      = bli_obj_col_stride( a );
@@ -105,89 +82,23 @@ void bli_trmm_ru_ker_var2
 	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
 	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	ftypes[dt_exec]
-	(
-	  diagoffb,
-	  schema_a,
-	  schema_b,
-	  m,
-	  n,
-	  k,
-	  ( void* )buf_alpha,
-	  ( void* )buf_a, cs_a, pd_a, ps_a,
-	  ( void* )buf_b, rs_b, pd_b, ps_b,
-	  ( void* )buf_beta,
-	           buf_c, rs_c, cs_c,
-	  ( cntx_t* )cntx,
-	  rntm,
-	  thread
-	);
-}
+	// Alias some constants to simpler names.
+	const dim_t     MR         = pd_a;
+	const dim_t     NR         = pd_b;
+	const dim_t     PACKMR     = cs_a;
+	const dim_t     PACKNR     = rs_b;
+
+	// Query the context for the micro-kernel address and cast it to its
+	// function pointer type.
+	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
 
+	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const char* a_cast     = buf_a;
+	const char* b_cast     = buf_b;
+	      char* c_cast     = buf_c;
+	const char* alpha_cast = buf_alpha;
+	const char* beta_cast  = buf_beta;
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffb, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	const dim_t     PACKMR     = cs_a; \
-	const dim_t     PACKNR     = rs_b; \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	ctype* restrict one        = PASTEMAC(ch,1); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffb_j; \
-	dim_t           k_full; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           k_b0111; \
-	dim_t           off_b0111; \
-	dim_t           i, j, jb0; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	inc_t           istep_a; \
-	inc_t           istep_b; \
-	inc_t           ps_b_cur; \
-	inc_t           is_b_cur; \
-	auxinfo_t       aux; \
-\
 	/*
 	   Assumptions/assertions:
 	     rs_a == 1
@@ -200,312 +111,279 @@ void PASTEMAC(ch,varname) \
 	     ps_b == stride to next micro-panel of B
 	     rs_c == (no assumptions)
 	     cs_c == (no assumptions)
-	*/ \
-\
-	/* Safety trap: Certain indexing within this macro-kernel does not
-	   work as intended if both MR and NR are odd. */ \
-	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
-	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of B is entirely below its diagonal,
-	   it is implicitly zero. So we do nothing. */ \
-	if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
-\
-	/* Compute k_full. For all trmm, k_full is simply k. This is
-	   needed because some parameter combinations of trmm reduce k
-	   to advance past zero regions in the triangular matrix, and
-	   when computing the imaginary stride of A (the non-triangular
-	   matrix), which is used by 4m1/3m1 implementations, we need
-	   this unreduced value of k. */ \
-	k_full = k; \
-\
-	/* If there is a zero region to the left of where the diagonal of B
-	   intersects the top edge of the panel, adjust the pointer to C and
-	   treat this case as if the diagonal offset were zero. This skips over
-	   the region that was not packed. (Note we assume the diagonal offset
-	   is a multiple of MR; this assumption will hold as long as the cache
-	   blocksizes are each a multiple of MR and NR.) */ \
-	if ( diagoffb > 0 ) \
-	{ \
-		j        = diagoffb; \
-		n        = n - j; \
-		diagoffb = 0; \
-		c_cast   = c_cast + (j  )*cs_c; \
-	} \
-\
-	/* If there is a zero region below where the diagonal of B intersects the
-	   right side of the block, shrink it to prevent "no-op" iterations from
-	   executing. */ \
-	if ( -diagoffb + n < k ) \
-	{ \
-		k = -diagoffb + n; \
-	} \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	istep_a = PACKMR * k_full; \
-	istep_b = PACKNR * k; \
-\
-	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
-	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( istep_a, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Note that we partition the 2nd loop into two regions: the triangular
-	   part of C, and the rectangular portion. */ \
-	dim_t n_iter_tri; \
-	dim_t n_iter_rct; \
-\
-	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \
-	{ \
-		/* If the entire panel of B does not intersect the diagonal, there is
-		   no triangular region, and therefore we can skip the first set of
-		   loops. */ \
-		n_iter_tri = 0; \
-		n_iter_rct = n_iter; \
-	} \
-	else \
-	{ \
-		/* If the panel of B does intersect the diagonal, compute the number of
-		   iterations in the triangular (or trapezoidal) region by dividing NR
-		   into the number of rows in B. (There should never be any remainder
-		   in this division.) The number of iterations in the rectangular region
-		   is computed as the remaining number of iterations in the n dimension. */ \
-		n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \
-		n_iter_rct = n_iter - n_iter_tri; \
-	} \
-\
-	/* Use round-robin assignment of micropanels to threads in the 2nd and
-	   1st loops for the initial triangular region of B (if it exists).
-	   NOTE: We don't need to call bli_thread_range_jrir_rr() here since we
-	   employ a hack that calls for each thread to execute every iteration
-	   of the jr and ir loops but skip all but the pointer increment for
-	   iterations that are not assigned to it. */ \
-\
-	b1 = b_cast; \
-	c1 = c_cast; \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = 0; j < n_iter_tri; ++j ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		diagoffb_j = diagoffb - ( doff_t )j*NR; \
-\
-		/* Determine the offset to and length of the panel that was packed
-		   so we can index into the corresponding location in A. */ \
-		off_b0111 = 0; \
-		k_b0111   = bli_min( k, -diagoffb_j + NR ); \
-\
-		a1  = a_cast; \
-		c11 = c1; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* If the current panel of B intersects the diagonal, scale C
-		   by beta. If it is strictly below the diagonal, scale by one.
-		   This allows the current macro-kernel to work for both trmm
-		   and trmm3. */ \
-		{ \
-			/* Compute the panel stride for the current diagonal-
-			   intersecting micro-panel. */ \
-			is_b_cur  = k_b0111 * PACKNR; \
-			is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
-			ps_b_cur  = is_b_cur; \
-\
-			if ( bli_trmm_my_iter_rr( j, thread ) ) { \
-\
-			/* Loop over the m dimension (MR rows at a time). */ \
-			for ( i = 0; i < m_iter; ++i ) \
-			{ \
-				if ( bli_trmm_my_iter_rr( i, caucus ) ) { \
-\
-				ctype* restrict a1_i; \
-				ctype* restrict a2; \
-\
-				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-				a1_i = a1 + off_b0111 * PACKMR; \
-\
-				/* Compute the addresses of the next panels of A and B. */ \
-				a2 = a1; \
-				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
-				{ \
-					a2 = a_cast; \
-					b2 = b1; \
-					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
-						b2 = b_cast; \
-				} \
-\
-				/* Save addresses of next panels of A and B to the auxinfo_t
-				   object. */ \
-				bli_auxinfo_set_next_a( a2, &aux ); \
-				bli_auxinfo_set_next_b( b2, &aux ); \
-\
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k_b0111, \
-				  alpha_cast, \
-				  a1_i, \
-				  b1, \
-				  beta_cast, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-				} \
-\
-				a1  += rstep_a; \
-				c11 += rstep_c; \
-			} \
-			} \
-\
-			b1 += ps_b_cur; \
-		} \
-\
-		c1 += cstep_c; \
-	} \
-\
-	/* If there is no rectangular region, then we're done. */ \
-	if ( n_iter_rct == 0 ) return; \
-\
-	/* Determine the thread range and increment for the 2nd and 1st loops for
-	   the remaining rectangular region of B.
-	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
-	   slab or round-robin partitioning was requested at configure-time. \
-	   NOTE: Parallelism in the 1st loop is disabled for now. */ \
-	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Advance the start and end iteration offsets for the rectangular region
-	   by the number of iterations used for the triangular region. */ \
-	jr_start += n_iter_tri; \
-	jr_end   += n_iter_tri; \
-	jb0       = n_iter_tri; \
-\
-	/* Save the resulting value of b1 from the previous loop since it represents
-	   the starting point for the rectangular region. */ \
-	b_cast = b1; \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		/* NOTE: We must index through b_cast differently since it contains
-		   the starting address of the rectangular region (which is already
-		   n_iter_tri logical iterations through B). */ \
-		b1 = b_cast + (j-jb0) * cstep_b; \
-		c1 = c_cast +  j      * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* If the current panel of B intersects the diagonal, scale C
-		   by beta. If it is strictly below the diagonal, scale by one.
-		   This allows the current macro-kernel to work for both trmm
-		   and trmm3. */ \
-		{ \
-			/* Loop over the m dimension (MR rows at a time). */ \
-			for ( i = ir_start; i < ir_end; i += ir_inc ) \
-			{ \
-				ctype* restrict a2; \
-\
-				a1  = a_cast + i * rstep_a; \
-				c11 = c1     + i * rstep_c; \
-\
-				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-				/* Compute the addresses of the next panels of A and B. */ \
-				a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-				if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
-				{ \
-					a2 = a_cast; \
-					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
-						b2 = b_cast; \
-				} \
-\
-				/* Save addresses of next panels of A and B to the auxinfo_t
-				   object. */ \
-				bli_auxinfo_set_next_a( a2, &aux ); \
-				bli_auxinfo_set_next_b( b2, &aux ); \
-\
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  one, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-			} \
-		} \
-	} \
-\
-\
-\
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
+	*/
+
+	// Safety trap: Certain indexing within this macro-kernel does not
+	// work as intended if both MR and NR are odd.
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) ||
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort();
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If the current panel of B is entirely below its diagonal,
+	// it is implicitly zero. So we do nothing.
+	if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return;
+
+	// If there is a zero region to the left of where the diagonal of B
+	// intersects the top edge of the panel, adjust the pointer to C and
+	// treat this case as if the diagonal offset were zero. This skips over
+	// the region that was not packed. (Note we assume the diagonal offset
+	// is a multiple of MR; this assumption will hold as long as the cache
+	// blocksizes are each a multiple of MR and NR.)
+	if ( diagoffb > 0 )
+	{
+		n        -= diagoffb;
+		c_cast   += diagoffb * cs_c * dt_size;
+		diagoffb  = 0;
+	}
+
+	// If there is a zero region below where the diagonal of B intersects the
+	// right side of the block, shrink it to prevent "no-op" iterations from
+	// executing.
+	if ( -diagoffb + n < k )
+	{
+		k = -diagoffb + n;
+	}
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	dim_t n_iter = n / NR;
+	dim_t n_left = n % NR;
+
+	dim_t m_iter = m / MR;
+	dim_t m_left = m % MR;
+
+	if ( n_left ) ++n_iter;
+	if ( m_left ) ++m_iter;
+
+	// Determine some increments used to step through A, B, and C.
+	inc_t rstep_a = ps_a * dt_size;
+
+	inc_t cstep_b = ps_b * dt_size;
+
+	inc_t rstep_c = rs_c * MR * dt_size;
+	inc_t cstep_c = cs_c * NR * dt_size;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	auxinfo_t aux;
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	// loop around the microkernel. Here we query the thrinfo_t node for the
+	// 1st (ir) loop around the microkernel.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+
+	// Query the number of threads and thread ids for each loop.
+	dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	dim_t jr_tid = bli_thrinfo_work_id( thread );
+	dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	dim_t ir_tid = bli_thrinfo_work_id( caucus );
+
+	dim_t jr_start, jr_end;
+	dim_t ir_start, ir_end;
+	dim_t jr_inc,   ir_inc;
+
+	// Note that we partition the 2nd loop into two regions: the triangular
+	// part of C, and the rectangular portion.
+	dim_t n_iter_tri;
+	dim_t n_iter_rct;
+
+	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) )
+	{
+		// If the entire panel of B does not intersect the diagonal, there is
+		// no triangular region, and therefore we can skip the first set of
+		// loops.
+		n_iter_tri = 0;
+		n_iter_rct = n_iter;
+	}
+	else
+	{
+		// If the panel of B does intersect the diagonal, compute the number of
+		// iterations in the triangular (or trapezoidal) region by dividing NR
+		// into the number of rows in B. (There should never be any remainder
+		// in this division.) The number of iterations in the rectangular region
+		// is computed as the remaining number of iterations in the n dimension.
+		n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 );
+		n_iter_rct = n_iter - n_iter_tri;
+	}
+
+	// Use round-robin assignment of micropanels to threads in the 2nd and
+	// 1st loops for the initial triangular region of B (if it exists).
+	// NOTE: We don't need to call bli_thread_range_jrir_rr() here since we
+	// employ a hack that calls for each thread to execute every iteration
+	// of the jr and ir loops but skip all but the pointer increment for
+	// iterations that are not assigned to it.
+
+	const char* b1 = b_cast;
+	      char* c1 = c_cast;
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = 0; j < n_iter_tri; ++j )
+	{
+		doff_t diagoffb_j = diagoffb - ( doff_t )j*NR;
+
+		// Determine the offset to and length of the panel that was packed
+		// so we can index into the corresponding location in A.
+		dim_t off_b0111 = 0;
+		dim_t k_b0111   = bli_min( k, -diagoffb_j + NR );
+
+		const char* a1  = a_cast;
+		      char* c11 = c1;
+
+		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		// If the current panel of B intersects the diagonal, scale C
+		// by beta. If it is strictly below the diagonal, scale by one.
+		// This allows the current macro-kernel to work for both trmm
+		// and trmm3.
+		{
+			// Compute the panel stride for the current diagonal-
+			// intersecting micro-panel.
+			inc_t ps_b_cur  = k_b0111 * PACKNR;
+			      ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
+			      ps_b_cur *= dt_size;
+
+			if ( bli_trmm_my_iter_rr( j, thread ) ) {
+
+			// Loop over the m dimension (MR rows at a time).
+			for ( dim_t i = 0; i < m_iter; ++i )
+			{
+				if ( bli_trmm_my_iter_rr( i, caucus ) ) {
+
+				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+				const char* a1_i = a1 + off_b0111 * PACKMR * dt_size;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = a1;
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
+				{
+					a2 = a_cast;
+					b2 = b1;
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k_b0111,
+				  ( void* )alpha_cast,
+				  ( void* )a1_i,
+				  ( void* )b1,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+				}
+
+				a1  += rstep_a;
+				c11 += rstep_c;
+			}
+			}
+
+			b1 += ps_b_cur;
+		}
+
+		c1 += cstep_c;
+	}
+
+	// If there is no rectangular region, then we're done.
+	if ( n_iter_rct == 0 ) return;
+
+	// Determine the thread range and increment for the 2nd and 1st loops for
+	// the remaining rectangular region of B.
+	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// slab or round-robin partitioning was requested at configure-time.
+	// NOTE: Parallelism in the 1st loop is disabled for now.
+	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc );
+
+	// Advance the start and end iteration offsets for the rectangular region
+	// by the number of iterations used for the triangular region.
+	      jr_start += n_iter_tri;
+	      jr_end   += n_iter_tri;
+	dim_t jb0       = n_iter_tri;
+
+	// Save the resulting value of b1 from the previous loop since it represents
+	// the starting point for the rectangular region.
+	b_cast = b1;
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
+	{
+		// NOTE: We must index through b_cast differently since it contains
+		// the starting address of the rectangular region (which is already
+		// n_iter_tri logical iterations through B).
+		b1 = b_cast + (j-jb0) * cstep_b;
+		c1 = c_cast +  j      * cstep_c;
+
+		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		// If the current panel of B intersects the diagonal, scale C
+		// by beta. If it is strictly below the diagonal, scale by one.
+		// This allows the current macro-kernel to work for both trmm
+		// and trmm3.
+		{
+			// Loop over the m dimension (MR rows at a time).
+			for ( dim_t i = ir_start; i < ir_end; i += ir_inc )
+			{
+				const char* a1  = a_cast + i * rstep_a;
+				      char* c11 = c1     + i * rstep_c;
+
+				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc );
+				if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) )
+				{
+					a2 = a_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc );
+					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )one,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+			}
+		}
+	}
 }
 
-INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 )
+//PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );
+//PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );
 
diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h
index 2f0642ca8..f8c3d7ee2 100644
--- a/frame/3/trmm/bli_trmm_var.h
+++ b/frame/3/trmm/bli_trmm_var.h
@@ -47,8 +47,7 @@ void PASTEMAC0(opname) \
        const obj_t*  b, \
        const obj_t*  c, \
        const cntx_t* cntx, \
-             rntm_t* rntm, \
-             cntl_t* cntl, \
+       const cntl_t* cntl, \
              thrinfo_t* thread  \
      );
 
@@ -87,7 +86,6 @@ void PASTEMAC(ch,varname) \
        void*   beta, \
        void*   c, inc_t rs_c, inc_t cs_c, \
        cntx_t* cntx, \
-       rntm_t* rntm, \
        thrinfo_t* thread  \
      );
 
diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c
index d42bc88c2..60030bf4a 100644
--- a/frame/3/trmm/bli_trmm_xx_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c
@@ -47,14 +47,12 @@ void bli_trmm_xx_ker_var2
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
+       const cntl_t* cntl,
              thrinfo_t* thread
      )
 {
-	dim_t      side;
-	dim_t      uplo;
-	l3_var_oft f;
+	dim_t side;
+	dim_t uplo;
 
 	// Set two bools: one based on the implied side parameter (the structure
 	// of the root object) and one based on the uplo field of the triangular
@@ -73,7 +71,7 @@ void bli_trmm_xx_ker_var2
 	}
 
 	// Index into the variant array to extract the correct function pointer.
-	f = vars[side][uplo];
+	l3_var_oft f = vars[side][uplo];
 
 	// Call the macrokernel.
 	f
@@ -82,7 +80,6 @@ void bli_trmm_xx_ker_var2
 	  b,
 	  c,
 	  cntx,
-	  rntm,
 	  cntl,
 	  thread
 	);
diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c
index 706e14d43..ccf6e2160 100644
--- a/frame/3/trmm/other/bli_trmm_ll_ker_var2.c
+++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c
@@ -322,8 +322,8 @@ void PASTEMAC(ch,varname) \
 	c1 = c_cast; \
 \
 	thrinfo_t* ir_thread      = bli_thrinfo_sub_node( jr_thread ); \
-	dim_t jr_num_threads      = bli_thread_n_way( jr_thread ); \
-	dim_t jr_thread_id        = bli_thread_work_id( jr_thread ); \
+	dim_t jr_num_threads      = bli_thrinfo_n_way( jr_thread ); \
+	dim_t jr_thread_id        = bli_thrinfo_work_id( jr_thread ); \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = 0; j < n_iter; ++j ) \
diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c
index 699892635..c3c11e62f 100644
--- a/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c
+++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c
@@ -327,10 +327,10 @@ void PASTEMAC(ch,varname) \
 	/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
 \
 	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	/*dim_t ir_nt  = bli_thread_n_way( ir_thread ); \
-	dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
+	/*dim_t ir_nt  = bli_thrinfo_n_way( ir_thread ); \
+	dim_t ir_tid = bli_thrinfo_work_id( ir_thread );*/ \
 \
 	dim_t jr_start, jr_end; \
 	/*dim_t ir_start, ir_end;*/ \
diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c
index eb5577593..f69b38d7f 100644
--- a/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c
+++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c
@@ -327,10 +327,10 @@ void PASTEMAC(ch,varname) \
 	/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
 \
 	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	/*dim_t ir_nt  = bli_thread_n_way( ir_thread ); \
-	dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
+	/*dim_t ir_nt  = bli_thrinfo_n_way( ir_thread ); \
+	dim_t ir_tid = bli_thrinfo_work_id( ir_thread );*/ \
 \
 	dim_t jr_start, jr_end; \
 	/*dim_t ir_start, ir_end;*/ \
diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c
index 738711f58..7aaf2606f 100644
--- a/frame/3/trmm/other/bli_trmm_lu_ker_var2.c
+++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c
@@ -329,8 +329,8 @@ void PASTEMAC(ch,varname) \
 	c1 = c_cast; \
 \
 	thrinfo_t* ir_thread      = bli_thrinfo_sub_node( jr_thread ); \
-	dim_t jr_num_threads      = bli_thread_n_way( jr_thread ); \
-	dim_t jr_thread_id        = bli_thread_work_id( jr_thread ); \
+	dim_t jr_num_threads      = bli_thrinfo_n_way( jr_thread ); \
+	dim_t jr_thread_id        = bli_thrinfo_work_id( jr_thread ); \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = 0; j < n_iter; ++j ) \
diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c
index df53b2011..e3d75d474 100644
--- a/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c
+++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c
@@ -334,10 +334,10 @@ void PASTEMAC(ch,varname) \
 	/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
 \
 	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	/*dim_t ir_nt  = bli_thread_n_way( ir_thread ); \
-	dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
+	/*dim_t ir_nt  = bli_thrinfo_n_way( ir_thread ); \
+	dim_t ir_tid = bli_thrinfo_work_id( ir_thread );*/ \
 \
 	dim_t jr_start, jr_end; \
 	/*dim_t ir_start, ir_end;*/ \
diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c
index fbcd4f9aa..700c54a0c 100644
--- a/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c
+++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c
@@ -334,10 +334,10 @@ void PASTEMAC(ch,varname) \
 	/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
 \
 	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	/*dim_t ir_nt  = bli_thread_n_way( ir_thread ); \
-	dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
+	/*dim_t ir_nt  = bli_thrinfo_n_way( ir_thread ); \
+	dim_t ir_tid = bli_thrinfo_work_id( ir_thread );*/ \
 \
 	dim_t jr_start, jr_end; \
 	/*dim_t ir_start, ir_end;*/ \
diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c
index 7775d9217..4499dd6ae 100644
--- a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c
+++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c
@@ -329,8 +329,8 @@ void PASTEMAC(ch,varname) \
 	c1 = c_cast; \
 \
 	thrinfo_t* ir_thread      = bli_thrinfo_sub_node( jr_thread ); \
-	dim_t jr_num_threads      = bli_thread_n_way( jr_thread ); \
-	dim_t jr_thread_id        = bli_thread_work_id( jr_thread ); \
+	dim_t jr_num_threads      = bli_thrinfo_n_way( jr_thread ); \
+	dim_t jr_thread_id        = bli_thrinfo_work_id( jr_thread ); \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = 0; j < n_iter; ++j ) \
diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c
index c1354a962..a35e6adf1 100644
--- a/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c
+++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c
@@ -330,10 +330,10 @@ void PASTEMAC(ch,varname) \
 \
 	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
 \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
+	dim_t ir_nt  = bli_thrinfo_n_way( caucus ); \
+	dim_t ir_tid = bli_thrinfo_work_id( caucus ); \
 \
 	dim_t jr_start, jr_end; \
 	dim_t ir_start, ir_end; \
diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c
index 7cf8eeef0..438835156 100644
--- a/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c
+++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c
@@ -330,10 +330,10 @@ void PASTEMAC(ch,varname) \
 \
 	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
 \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
+	dim_t ir_nt  = bli_thrinfo_n_way( caucus ); \
+	dim_t ir_tid = bli_thrinfo_work_id( caucus ); \
 \
 	dim_t jr_start, jr_end; \
 	dim_t ir_start, ir_end; \
diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c
index 1d0f31708..275d6ca47 100644
--- a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c
+++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c
@@ -330,8 +330,8 @@ void PASTEMAC(ch,varname) \
 	c1 = c_cast; \
 \
 	thrinfo_t* ir_thread      = bli_thrinfo_sub_node( jr_thread ); \
-	dim_t jr_num_threads      = bli_thread_n_way( jr_thread ); \
-	dim_t jr_thread_id        = bli_thread_work_id( jr_thread ); \
+	dim_t jr_num_threads      = bli_thrinfo_n_way( jr_thread ); \
+	dim_t jr_thread_id        = bli_thrinfo_work_id( jr_thread ); \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = 0; j < n_iter; ++j ) \
diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c
index d8ae4f8bb..704b38833 100644
--- a/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c
+++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c
@@ -335,10 +335,10 @@ void PASTEMAC(ch,varname) \
 	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
 \
 	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
+	dim_t ir_nt  = bli_thrinfo_n_way( caucus ); \
+	dim_t ir_tid = bli_thrinfo_work_id( caucus ); \
 \
 	dim_t jr_start, jr_end; \
 	dim_t ir_start, ir_end; \
diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c
index c05a082d4..eab41f665 100644
--- a/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c
+++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c
@@ -335,10 +335,10 @@ void PASTEMAC(ch,varname) \
 	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
 \
 	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
+	dim_t ir_nt  = bli_thrinfo_n_way( caucus ); \
+	dim_t ir_tid = bli_thrinfo_work_id( caucus ); \
 \
 	dim_t jr_start, jr_end; \
 	dim_t ir_start, ir_end; \
diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c
index 9681eb640..88478713f 100644
--- a/frame/3/trmm3/bli_trmm3_front.c
+++ b/frame/3/trmm3/bli_trmm3_front.c
@@ -43,8 +43,7 @@ void bli_trmm3_front
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -171,8 +170,7 @@ void bli_trmm3_front
 	  beta,
 	  &c_local,
 	  cntx,
-	  rntm,
-	  cntl
+	  rntm
 	);
 }
 
diff --git a/frame/3/trmm3/bli_trmm3_front.h b/frame/3/trmm3/bli_trmm3_front.h
index b5dde34cd..dcaa4d0ee 100644
--- a/frame/3/trmm3/bli_trmm3_front.h
+++ b/frame/3/trmm3/bli_trmm3_front.h
@@ -41,6 +41,5 @@ void bli_trmm3_front
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             rntm_t* rntm
      );
diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c
index 413b12818..cfd1b4d7d 100644
--- a/frame/3/trsm/bli_trsm_blk_var1.c
+++ b/frame/3/trsm/bli_trsm_blk_var1.c
@@ -39,13 +39,12 @@
 
 void bli_trsm_blk_var1
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
      )
 {
 	obj_t ap, cp;
@@ -67,7 +66,13 @@ void bli_trsm_blk_var1
 	                        0, kc, &cp, &c1 );
 
 	// All threads iterate over the entire diagonal block A11.
+	thrinfo_t* thread_pre = bli_thrinfo_sub_prenode( thread_par );
 	dim_t my_start = 0, my_end = kc;
+	//bli_thread_range_mdim
+	//(
+	//  direct, thread_pre, &a11, b, &c1, cntl, cntx,
+	//  &my_start, &my_end
+	//);
 
 #ifdef PRINT
 	printf( "bli_trsm_blk_var1(): a11 is %d x %d at offsets (%3d, %3d)\n",
@@ -105,9 +110,8 @@ void bli_trsm_blk_var1
 		  &BLIS_ONE,
 		  &c1_1,
 		  cntx,
-		  rntm,
 		  bli_cntl_sub_prenode( cntl ),
-		  bli_thrinfo_sub_prenode( thread )
+		  thread_pre
 		);
 	}
 
@@ -118,7 +122,7 @@ void bli_trsm_blk_var1
 	// We must execute a barrier here because the upcoming rank-k update
 	// requires the packed matrix B to be fully updated by the trsm
 	// subproblem.
-	bli_thread_barrier( rntm, thread );
+	bli_thrinfo_barrier( thread_par );
 
 	// Isolate the remaining part of the column panel matrix A, which we do by
 	// acquiring the subpartition ahead of A11 (that is, A21 or A01, depending
@@ -137,6 +141,7 @@ void bli_trsm_blk_var1
 
 	// Determine the current thread's subpartition range for the gemm
 	// subproblem over Ax1.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
 	bli_thread_range_mdim
 	(
 	  direct, thread, &ax1, b, &cx1, cntl, cntx,
@@ -177,9 +182,8 @@ void bli_trsm_blk_var1
 		  &BLIS_ONE,
 		  &c1,
 		  cntx,
-		  rntm,
 		  bli_cntl_sub_node( cntl ),
-		  bli_thrinfo_sub_node( thread )
+		  thread
 		);
 	}
 #ifdef PRINT
diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c
index 88db57e51..e86eb988a 100644
--- a/frame/3/trsm/bli_trsm_blk_var2.c
+++ b/frame/3/trsm/bli_trsm_blk_var2.c
@@ -37,13 +37,12 @@
 
 void bli_trsm_blk_var2
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
      )
 {
 	obj_t bp, cp;
@@ -58,6 +57,7 @@ void bli_trsm_blk_var2
 
 	// Determine the current thread's subpartition range.
 	dim_t my_start, my_end;
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
 	bli_thread_range_ndim
 	(
 	  direct, thread, a, &bp, &cp, cntl, cntx,
@@ -88,9 +88,8 @@ void bli_trsm_blk_var2
 		  &BLIS_ONE,
 		  &c1,
 		  cntx,
-		  rntm,
 		  bli_cntl_sub_node( cntl ),
-		  bli_thrinfo_sub_node( thread )
+		  thread
 		);
 	}
 }
diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c
index 229259a95..77a3b77d1 100644
--- a/frame/3/trsm/bli_trsm_blk_var3.c
+++ b/frame/3/trsm/bli_trsm_blk_var3.c
@@ -36,13 +36,12 @@
 
 void bli_trsm_blk_var3
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
      )
 {
 	obj_t ap, bp, cs;
@@ -50,6 +49,8 @@ void bli_trsm_blk_var3
 	bli_obj_alias_to( b, &bp );
 	bli_obj_alias_to( c, &cs );
 
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+
 	// Determine the direction in which to partition (forwards or backwards).
 	dir_t direct = bli_l3_direct( &ap, &bp, &cs, cntl );
 
@@ -64,8 +65,8 @@ void bli_trsm_blk_var3
 	for ( dim_t i = 0; i < k_trans; i += b_alg )
 	{
 		// Determine the current algorithmic blocksize.
-		b_alg = bli_trsm_determine_kc( direct, i, k_trans, &ap, &bp,
-		                               bli_cntl_bszid( cntl ), cntx );
+		b_alg = bli_l3_determine_kc( direct, i, k_trans, &ap, &bp,
+		                             bli_cntl_bszid( cntl ), cntx, cntl );
 
 		// Acquire partitions for A1 and B1.
 		obj_t a1, b1;
@@ -83,14 +84,10 @@ void bli_trsm_blk_var3
 		  &BLIS_ONE,
 		  &cs,
 		  cntx,
-		  rntm,
 		  bli_cntl_sub_node( cntl ),
-		  bli_thrinfo_sub_node( thread )
+		  thread
 		);
 
-		//bli_thread_ibarrier( thread );
-		bli_thread_barrier( rntm, bli_thrinfo_sub_node( thread ) );
-
 		// This variant executes multiple rank-k updates. Therefore, if the
 		// internal alpha scalars on A/B and C are non-zero, we must ensure
 		// that they are only used in the first iteration.
diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c
index 0a3be87f7..d036e94c7 100644
--- a/frame/3/trsm/bli_trsm_cntl.c
+++ b/frame/3/trsm/bli_trsm_cntl.c
@@ -37,7 +37,7 @@
 
 cntl_t* bli_trsm_cntl_create
      (
-       rntm_t* rntm,
+       pool_t* pool,
        side_t  side,
        pack_t  schema_a,
        pack_t  schema_b,
@@ -45,14 +45,14 @@ cntl_t* bli_trsm_cntl_create
      )
 {
 	if ( bli_is_left( side ) )
-		return bli_trsm_l_cntl_create( rntm, schema_a, schema_b, ker );
+		return bli_trsm_l_cntl_create( pool, schema_a, schema_b, ker );
 	else
-		return bli_trsm_r_cntl_create( rntm, schema_a, schema_b, ker );
+		return bli_trsm_r_cntl_create( pool, schema_a, schema_b, ker );
 }
 
 cntl_t* bli_trsm_l_cntl_create
      (
-       rntm_t* rntm,
+       pool_t* pool,
        pack_t  schema_a,
        pack_t  schema_b,
        void_fp ker
@@ -73,18 +73,18 @@ cntl_t* bli_trsm_l_cntl_create
 
 	cntl_t* gemm_cntl_bu_ke = bli_trsm_cntl_create_node
 	(
-	  rntm,    // the thread's runtime structure
-	  family,  // the operation family
-	  BLIS_MR, // needed for bli_thrinfo_rgrow()
-	  NULL,    // variant function pointer not used
-	  NULL     // no sub-node; this is the leaf of the tree.
+	  pool,         // the thread's sba pool
+	  family,       // the operation family
+	  BLIS_MR,
+	  NULL,         // variant function pointer not used
+	  NULL          // no sub-node; this is the leaf of the tree.
 	);
 
 	cntl_t* gemm_cntl_bp_bu = bli_trsm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  family,
-	  BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
+	  BLIS_NR,
 	  macro_kernel_p,
 	  gemm_cntl_bu_ke
 	);
@@ -92,14 +92,14 @@ cntl_t* bli_trsm_l_cntl_create
 	// Create a node for packing matrix A.
 	cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  bli_l3_packa, // trsm operation's packm function for A.
 	  BLIS_MR,
 	  BLIS_MR,
-	  FALSE,   // do NOT invert diagonal
-	  TRUE,    // reverse iteration if upper?
-	  FALSE,   // reverse iteration if lower?
-	  schema_a, // normally BLIS_PACKED_ROW_PANELS
+	  FALSE,        // do NOT invert diagonal
+	  TRUE,         // reverse iteration if upper?
+	  FALSE,        // reverse iteration if lower?
+	  schema_a,     // normally BLIS_PACKED_ROW_PANELS
 	  BLIS_BUFFER_FOR_A_BLOCK,
 	  gemm_cntl_bp_bu
 	);
@@ -110,18 +110,18 @@ cntl_t* bli_trsm_l_cntl_create
 
 	cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node
 	(
-	  rntm,    // the thread's runtime structure
-	  family,  // the operation family
-	  BLIS_MR, // needed for bli_thrinfo_rgrow()
-	  NULL,    // variant function pointer not used
-	  NULL     // no sub-node; this is the leaf of the tree.
+	  pool,         // the thread's sba pool
+	  family,       // the operation family
+	  BLIS_MR,
+	  NULL,         // variant function pointer not used
+	  NULL          // no sub-node; this is the leaf of the tree.
 	);
 
 	cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  family,
-	  BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
+	  BLIS_NR,
 	  macro_kernel_p,
 	  trsm_cntl_bu_ke
 	);
@@ -129,18 +129,18 @@ cntl_t* bli_trsm_l_cntl_create
 	// Create a node for packing matrix A.
 	cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  bli_l3_packa, // trsm operation's packm function for A.
 	  BLIS_MR,
 	  BLIS_MR,
 #ifdef BLIS_ENABLE_TRSM_PREINVERSION
-	  TRUE,    // invert diagonal
+	  TRUE,         // invert diagonal
 #else
-	  FALSE,   // do NOT invert diagonal
+	  FALSE,        // do NOT invert diagonal
 #endif
-	  TRUE,    // reverse iteration if upper?
-	  FALSE,   // reverse iteration if lower?
-	  schema_a, // normally BLIS_PACKED_ROW_PANELS
+	  TRUE,         // reverse iteration if upper?
+	  FALSE,        // reverse iteration if lower?
+	  schema_a,     // normally BLIS_PACKED_ROW_PANELS
 	  BLIS_BUFFER_FOR_A_BLOCK,
 	  trsm_cntl_bp_bu
 	);
@@ -151,7 +151,7 @@ cntl_t* bli_trsm_l_cntl_create
 	// NOTE: We attach the gemm sub-tree as the main branch.
 	cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  family,
 	  BLIS_MC,
 	  bli_trsm_blk_var1,
@@ -166,14 +166,14 @@ cntl_t* bli_trsm_l_cntl_create
 	// Create a node for packing matrix B.
 	cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  bli_l3_packb,
 	  BLIS_NR,
 	  BLIS_MR,
-	  FALSE,   // do NOT invert diagonal
-	  FALSE,   // reverse iteration if upper?
-	  FALSE,   // reverse iteration if lower?
-	  schema_b, // normally BLIS_PACKED_COL_PANELS
+	  FALSE,        // do NOT invert diagonal
+	  FALSE,        // reverse iteration if upper?
+	  FALSE,        // reverse iteration if lower?
+	  schema_b,     // normally BLIS_PACKED_COL_PANELS
 	  BLIS_BUFFER_FOR_B_PANEL,
 	  trsm_cntl_op_bp
 	);
@@ -181,7 +181,7 @@ cntl_t* bli_trsm_l_cntl_create
 	// Create a node for partitioning the k dimension by KC.
 	cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  family,
 	  BLIS_KC,
 	  bli_trsm_blk_var3,
@@ -191,7 +191,7 @@ cntl_t* bli_trsm_l_cntl_create
 	// Create a node for partitioning the n dimension by NC.
 	cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  family,
 	  BLIS_NC,
 	  bli_trsm_blk_var2,
@@ -203,7 +203,7 @@ cntl_t* bli_trsm_l_cntl_create
 
 cntl_t* bli_trsm_r_cntl_create
      (
-       rntm_t* rntm,
+       pool_t* pool,
        pack_t  schema_a,
        pack_t  schema_b,
        void_fp ker
@@ -220,7 +220,7 @@ cntl_t* bli_trsm_r_cntl_create
 	// Create two nodes for the macro-kernel.
 	cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  family,
 	  BLIS_MR, // needed for bli_thrinfo_rgrow()
 	  NULL,    // variant function pointer not used
@@ -229,7 +229,7 @@ cntl_t* bli_trsm_r_cntl_create
 
 	cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  family,
 	  BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
 	  macro_kernel_p,
@@ -239,7 +239,7 @@ cntl_t* bli_trsm_r_cntl_create
 	// Create a node for packing matrix A.
 	cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  bli_l3_packa,
 	  BLIS_NR,
 	  BLIS_MR,
@@ -254,7 +254,7 @@ cntl_t* bli_trsm_r_cntl_create
 	// Create a node for partitioning the m dimension by MC.
 	cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  family,
 	  BLIS_MC,
 	  bli_trsm_blk_var1,
@@ -264,7 +264,7 @@ cntl_t* bli_trsm_r_cntl_create
 	// Create a node for packing matrix B.
 	cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  bli_l3_packb,
 	  BLIS_MR,
 	  BLIS_MR,
@@ -279,7 +279,7 @@ cntl_t* bli_trsm_r_cntl_create
 	// Create a node for partitioning the k dimension by KC.
 	cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  family,
 	  BLIS_KC,
 	  bli_trsm_blk_var3,
@@ -289,7 +289,7 @@ cntl_t* bli_trsm_r_cntl_create
 	// Create a node for partitioning the n dimension by NC.
 	cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node
 	(
-	  rntm,
+	  pool,
 	  family,
 	  BLIS_NC,
 	  bli_trsm_blk_var2,
@@ -301,25 +301,24 @@ cntl_t* bli_trsm_r_cntl_create
 
 void bli_trsm_cntl_free
      (
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
+       pool_t* pool,
+       cntl_t* cntl
      )
 {
-	bli_cntl_free( rntm, cntl, thread );
+	bli_cntl_free( pool, cntl );
 }
 
 // -----------------------------------------------------------------------------
 
 cntl_t* bli_trsm_cntl_create_node
      (
-       rntm_t* rntm,
+       pool_t* pool,
        opid_t  family,
        bszid_t bszid,
        void_fp var_func,
        cntl_t* sub_node
      )
 {
-	return bli_cntl_create_node( rntm, family, bszid, var_func, NULL, sub_node );
+	return bli_cntl_create_node( pool, family, bszid, var_func, NULL, sub_node );
 }
 
diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h
index 86f4a29b2..a23120ff8 100644
--- a/frame/3/trsm/bli_trsm_cntl.h
+++ b/frame/3/trsm/bli_trsm_cntl.h
@@ -35,7 +35,7 @@
 
 cntl_t* bli_trsm_cntl_create
      (
-       rntm_t* rntm,
+       pool_t* pool,
        side_t  side,
        pack_t  schema_a,
        pack_t  schema_b,
@@ -44,7 +44,7 @@ cntl_t* bli_trsm_cntl_create
 
 cntl_t* bli_trsm_l_cntl_create
      (
-       rntm_t* rntm,
+       pool_t* pool,
        pack_t  schema_a,
        pack_t  schema_b,
        void_fp ker
@@ -52,7 +52,7 @@ cntl_t* bli_trsm_l_cntl_create
 
 cntl_t* bli_trsm_r_cntl_create
      (
-       rntm_t* rntm,
+       pool_t* pool,
        pack_t  schema_a,
        pack_t  schema_b,
        void_fp ker
@@ -60,16 +60,15 @@ cntl_t* bli_trsm_r_cntl_create
 
 void bli_trsm_cntl_free
      (
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
+       pool_t* pool,
+       cntl_t* cntl
      );
 
 // -----------------------------------------------------------------------------
 
 cntl_t* bli_trsm_cntl_create_node
      (
-       rntm_t* rntm,
+       pool_t* pool,
        opid_t  family,
        bszid_t bszid,
        void_fp var_func,
diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c
index b94a129d9..4672366e5 100644
--- a/frame/3/trsm/bli_trsm_front.c
+++ b/frame/3/trsm/bli_trsm_front.c
@@ -42,8 +42,7 @@ void bli_trsm_front
        const obj_t*  a,
        const obj_t*  b,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -153,8 +152,7 @@ void bli_trsm_front
 	  alpha,
 	  &c_local,
 	  cntx,
-	  rntm,
-	  cntl
+	  rntm
 	);
 }
 
diff --git a/frame/3/trsm/bli_trsm_front.h b/frame/3/trsm/bli_trsm_front.h
index b31e88b04..dacfd19e9 100644
--- a/frame/3/trsm/bli_trsm_front.h
+++ b/frame/3/trsm/bli_trsm_front.h
@@ -40,8 +40,7 @@ void bli_trsm_front
        const obj_t*  a,
        const obj_t*  b,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
+             rntm_t* rntm
      );
 
 #ifdef BLIS_ENABLE_SMALL_MATRIX
diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c
index 075b40336..e2128f100 100644
--- a/frame/3/trsm/bli_trsm_ll_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c
@@ -35,50 +35,27 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T gemm_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffa,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha1,
-       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
-       void*   alpha2,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);
-
-
 void bli_trsm_ll_ker_var2
      (
        const obj_t*  a,
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const cntl_t* cntl,
+             thrinfo_t* thread_par
      )
 {
-	const num_t     dt_exec   = bli_obj_exec_dt( c );
+	const num_t     dt        = bli_obj_exec_dt( c );
+	const dim_t     dt_size   = bli_dt_size( dt );
 
-	const doff_t    diagoffa  = bli_obj_diag_offset( a );
+	      doff_t    diagoffa  = bli_obj_diag_offset( a );
 
 	const pack_t    schema_a  = bli_obj_pack_schema( a );
 	const pack_t    schema_b  = bli_obj_pack_schema( b );
 
-	const dim_t     m         = bli_obj_length( c );
-	const dim_t     n         = bli_obj_width( c );
-	const dim_t     k         = bli_obj_width( a );
+	      dim_t     m         = bli_obj_length( c );
+	      dim_t     n         = bli_obj_width( c );
+	      dim_t     k         = bli_obj_width( a );
 
 	const void*     buf_a     = bli_obj_buffer_at_off( a );
 	const inc_t     cs_a      = bli_obj_col_stride( a );
@@ -110,105 +87,23 @@ void bli_trsm_ll_ker_var2
 	// packing.
 	const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c );
 
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	ftypes[dt_exec]
-	(
-	  diagoffa,
-	  schema_a,
-	  schema_b,
-	  m,
-	  n,
-	  k,
-	  ( void* )buf_alpha1,
-	  ( void* )buf_a, cs_a, pd_a, ps_a,
-	  ( void* )buf_b, rs_b, pd_b, ps_b,
-	  ( void* )buf_alpha2,
-	           buf_c, rs_c, cs_c,
-	  ( cntx_t* )cntx,
-	  rntm,
-	  thread
-	);
-}
+	// Alias some constants to simpler names.
+	const dim_t MR     = pd_a;
+	const dim_t NR     = pd_b;
+	const dim_t PACKMR = cs_a;
+	const dim_t PACKNR = rs_b;
 
+	// Cast the micro-kernel address to its function pointer type.
+	gemmtrsm_ukr_vft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx );
+	gemm_ukr_vft     gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+
+	const void* minus_one   = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE );
+	const char* a_cast      = buf_a;
+	const char* b_cast      = buf_b;
+	      char* c_cast      = buf_c;
+	const char* alpha1_cast = buf_alpha1;
+	const char* alpha2_cast = buf_alpha2;
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffa, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha1, \
-       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
-       void*   alpha2, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt          = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR          = pd_a; \
-	const dim_t     NR          = pd_b; \
-	const dim_t     PACKMR      = cs_a; \
-	const dim_t     PACKNR      = rs_b; \
-\
-	/* Cast the micro-kernel address to its function pointer type. */ \
-	PASTECH(ch,gemmtrsm_ukr_ft) \
-	               gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
-	PASTECH(ch,gemm_ukr_ft) \
-	                   gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-/*
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-*/ \
-\
-	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
-	ctype* restrict a_cast      = a; \
-	ctype* restrict b_cast      = b; \
-	ctype* restrict c_cast      = c; \
-	ctype* restrict alpha1_cast = alpha1; \
-	ctype* restrict alpha2_cast = alpha2; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffa_i; \
-	dim_t           k_full; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           k_a1011; \
-	dim_t           k_a10; \
-	dim_t           off_a10; \
-	dim_t           off_a11; \
-	dim_t           i, j; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	inc_t           istep_a; \
-	inc_t           istep_b; \
-	inc_t           ps_a_cur; \
-	inc_t           is_a_cur; \
-	auxinfo_t       aux; \
-\
 	/*
 	   Assumptions/assertions:
 	     rs_a == 1
@@ -221,262 +116,224 @@ void PASTEMAC(ch,varname) \
 	     ps_b == stride to next micro-panel of B
 	     rs_c == (no assumptions)
 	     cs_c == (no assumptions)
-	*/ \
-\
-	/* Safety trap: Certain indexing within this macro-kernel does not
-	   work as intended if both MR and NR are odd. */ \
-	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
-	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
-	   So we do nothing. */ \
-	if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
-\
-	/* Compute k_full as k inflated up to a multiple of MR. This is
-	   needed because some parameter combinations of trsm reduce k
-	   to advance past zero regions in the triangular matrix, and
-	   when computing the imaginary stride of B (the non-triangular
-	   matrix), which is used by 4m1/3m1 implementations, we need
-	   this unreduced value of k. */ \
-	k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
-\
-	/* If there is a zero region above where the diagonal of A intersects the
-	   left edge of the block, adjust the pointer to C and treat this case as
-	   if the diagonal offset were zero. This skips over the region that was
-	   not packed. (Note we assume the diagonal offset is a multiple of MR;
-	   this assumption will hold as long as the cache blocksizes are each a
-	   multiple of MR and NR.) */ \
-	if ( diagoffa < 0 ) \
-	{ \
-		i        = -diagoffa; \
-		m        = m - i; \
-		diagoffa = 0; \
-		c_cast   = c_cast + (i  )*rs_c; \
-	} \
-\
-	/* Check the k dimension, which needs to be a multiple of MR. If k
-	   isn't a multiple of MR, we adjust it higher to satisfy the micro-
-	   kernel, which is expecting to perform an MR x MR triangular solve.
-	   This adjustment of k is consistent with what happened when A was
-	   packed: all of its bottom/right edges were zero-padded, and
-	   furthermore, the panel that stores the bottom-right corner of the
-	   matrix has its diagonal extended into the zero-padded region (as
-	   identity). This allows the trsm of that bottom-right panel to
-	   proceed without producing any infs or NaNs that would infect the
-	   "good" values of the corresponding block of B. */ \
-	if ( k % MR != 0 ) k += MR - ( k % MR ); \
-\
-	/* NOTE: We don't need to check that m is a multiple of PACKMR since we
-	   know that the underlying buffer was already allocated to have an m
-	   dimension that is a multiple of PACKMR, with the region between the
-	   last row and the next multiple of MR zero-padded accordingly. */ \
-\
-	/* Compute number of primary and leftover components of the m and n
-       dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	istep_a = PACKMR * k; \
-	istep_b = PACKNR * k_full; \
-\
-	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
-	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_b( istep_b, &aux ); \
-\
-	/* We don't bother querying the thrinfo_t node for the 1st loop because
-	   we can't parallelize that loop in trsm due to the inter-iteration
-	   dependencies that exist. */ \
-	/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t jr_inc; \
-\
-	/* Determine the thread range and increment for the 2nd loop.
-	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
-	   slab or round-robin partitioning was requested at configure-time.
-	   NOTE: Parallelism in the 1st loop is unattainable due to the
-	   inter-iteration dependencies present in trsm. */ \
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		a1  = a_cast; \
-		c11 = c1 + (0  )*rstep_c; \
-\
-		/* Loop over the m dimension (MR rows at a time). */ \
-		for ( i = 0; i < m_iter; ++i ) \
-		{ \
-			diagoffa_i = diagoffa + ( doff_t )i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* If the current panel of A intersects the diagonal, use a
-			   special micro-kernel that performs a fused gemm and trsm.
-			   If the current panel of A resides below the diagonal, use a
-			   a regular gemm micro-kernel. Otherwise, if it is above the
-			   diagonal, it was not packed (because it is implicitly zero)
-			   and so we do nothing. */ \
-			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
-			{ \
-				ctype* restrict a10; \
-				ctype* restrict a11; \
-				ctype* restrict b01; \
-				ctype* restrict b11; \
-				ctype* restrict a2; \
-\
-				/* Compute various offsets into and lengths of parts of A. */ \
-				off_a10 = 0; \
-				k_a1011 = diagoffa_i + MR; \
-				k_a10   = k_a1011 - MR; \
-				off_a11 = k_a10; \
-\
-				/* Compute the panel stride for the current diagonal-
-				   intersecting micro-panel. */ \
-				is_a_cur  = k_a1011 * PACKMR; \
-				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
-				ps_a_cur  = is_a_cur; \
-\
-				/* Compute the addresses of the panel A10 and the triangular
-				   block A11. */ \
-				a10 = a1; \
-				a11 = a1 + k_a10 * PACKMR; \
-				/*a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, 1 );*/ \
-\
-				/* Compute the addresses of the panel B01 and the block
-				   B11. */ \
-				b01 = b1 + off_a10 * PACKNR; \
-				b11 = b1 + off_a11 * PACKNR; \
-\
-				/* Compute the addresses of the next panels of A and B. */ \
-				a2 = a1 + ps_a_cur; \
-				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
-				{ \
-					a2 = a_cast; \
-					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
-						b2 = b_cast; \
-				} \
-\
-				/* Save addresses of next panels of A and B to the auxinfo_t
-				   object. */ \
-				bli_auxinfo_set_next_a( a2, &aux ); \
-				bli_auxinfo_set_next_b( b2, &aux ); \
-\
-				gemmtrsm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k_a10, \
-				  alpha1_cast, \
-				  a10, \
-				  a11, \
-				  b01, \
-				  b11, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				a1 += ps_a_cur; \
-			} \
-			else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
-			{ \
-				ctype* restrict a2; \
-\
-				/* Compute the addresses of the next panels of A and B. */ \
-				a2 = a1 + rstep_a; \
-				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
-				{ \
-					a2 = a_cast; \
-					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
-						b2 = b_cast; \
-				} \
-\
-				/* Save addresses of next panels of A and B to the auxinfo_t
-				   object. */ \
-				bli_auxinfo_set_next_a( a2, &aux ); \
-				bli_auxinfo_set_next_b( b2, &aux ); \
-\
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k, \
-				  minus_one, \
-				  a1, \
-				  b1, \
-				  alpha2_cast, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				a1 += rstep_a; \
-			} \
-\
-			c11 += rstep_c; \
-		} \
-	} \
-\
-/*
-PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \
-                     ( double* )a11, 1, PACKMR, "%4.1f", "" ); \
-*/ \
-\
-/*
-PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" );  \
-PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" );  \
-*/ \
-\
-/*
-PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
-*/ \
+	*/
+
+	// Safety trap: Certain indexing within this macro-kernel does not
+	// work as intended if both MR and NR are odd.
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) ||
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort();
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If matrix A is above the diagonal, it is implicitly zero.
+	// So we do nothing.
+	if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return;
+
+	// Check the k dimension, which needs to be a multiple of MR. If k
+	// isn't a multiple of MR, we adjust it higher to satisfy the micro-
+	// kernel, which is expecting to perform an MR x MR triangular solve.
+	// This is consistent with how A was packed: its bottom/right edges
+	// were zero-padded and its diagonal was extended into the padded
+	// region (as identity), so the solve produces no infs or NaNs.
+	if ( k % MR != 0 ) k += MR - ( k % MR );
+
+	// If there is a zero region above where the diagonal of A intersects the
+	// left edge of the block, adjust the pointer to C and treat this case as
+	// if the diagonal offset were zero. This skips over the region that was
+	// not packed. (Note we assume the diagonal offset is a multiple of MR;
+	// this assumption will hold as long as the cache blocksizes are each a
+	// multiple of MR and NR.)
+	if ( diagoffa < 0 )
+	{
+		m        += diagoffa;
+		c_cast   -= diagoffa * rs_c * dt_size;
+		diagoffa  = 0;
+	}
+
+	// NOTE: We don't need to check that m is a multiple of PACKMR since we
+	// know that the underlying buffer was already allocated to have an m
+	// dimension that is a multiple of PACKMR, with the region between the
+	// last row and the next multiple of MR zero-padded accordingly.
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	dim_t n_iter = n / NR;
+	dim_t n_left = n % NR;
+
+	dim_t m_iter = m / MR;
+	dim_t m_left = m % MR;
+
+	if ( n_left ) ++n_iter;
+	if ( m_left ) ++m_iter;
+
+	// Determine some increments used to step through A, B, and C.
+	inc_t rstep_a = ps_a * dt_size;
+
+	inc_t cstep_b = ps_b * dt_size;
+
+	inc_t rstep_c = rs_c * MR * dt_size;
+	inc_t cstep_c = cs_c * NR * dt_size;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	auxinfo_t aux;
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	// We don't bother querying the thrinfo_t node for the 1st loop because
+	// we can't parallelize that loop in trsm due to the inter-iteration
+	// dependencies that exist.
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+
+	// Query the number of threads and thread ids for each loop.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	dim_t jr_tid = bli_thrinfo_work_id( thread );
+
+	dim_t jr_start, jr_end;
+	dim_t jr_inc;
+
+	// Determine the thread range and increment for the 2nd loop.
+	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// slab or round-robin partitioning was requested at configure-time.
+	// NOTE: Parallelism in the 1st loop is unattainable due to the
+	// inter-iteration dependencies present in trsm.
+	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
+	{
+		const char* b1 = b_cast + j * cstep_b;
+		      char* c1 = c_cast + j * cstep_c;
+
+		      dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2  = b1;
+
+		const char* a1  = a_cast;
+		      char* c11 = c1 + (0  )*rstep_c;
+
+		// Loop over the m dimension (MR rows at a time).
+		for ( dim_t i = 0; i < m_iter; ++i )
+		{
+			doff_t diagoffa_i = diagoffa + ( doff_t )i*MR;
+
+			dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+			// If the current panel of A intersects the diagonal, use a
+			// special micro-kernel that performs a fused gemm and trsm.
+			// If the current panel of A resides below the diagonal, use a
+			// regular gemm micro-kernel. Otherwise, if it is above the
+			// diagonal, it was not packed (because it is implicitly zero)
+			// and so we do nothing.
+			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) )
+			{
+				// Compute various offsets into and lengths of parts of A.
+				dim_t off_a10 = 0;
+				dim_t k_a1011 = diagoffa_i + MR;
+				dim_t k_a10   = k_a1011 - MR;
+				dim_t off_a11 = k_a10;
+
+				// Compute the panel stride for the current diagonal-
+				// intersecting micro-panel.
+				inc_t ps_a_cur  = k_a1011 * PACKMR;
+				      ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
+				      ps_a_cur *= dt_size;
+
+				// Compute the addresses of the panel A10 and the triangular
+				// block A11.
+				const char* a10 = a1;
+				const char* a11 = a1 + k_a10 * PACKMR * dt_size;
+				//a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, 1 );
+
+				// Compute the addresses of the panel B01 and the block
+				// B11.
+				const char* b01 = b1 + off_a10 * PACKNR * dt_size;
+				const char* b11 = b1 + off_a11 * PACKNR * dt_size;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = a1 + ps_a_cur;
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
+				{
+					a2 = a_cast;
+					b2 = b1;
+					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
+				gemmtrsm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k_a10,
+				  ( void* )alpha1_cast,
+				  ( void* )a10,
+				  ( void* )a11,
+				  ( void* )b01,
+				  ( void* )b11,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				a1 += ps_a_cur;
+			}
+			else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) )
+			{
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = a1 + rstep_a;
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
+				{
+					a2 = a_cast;
+					b2 = b1;
+					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )minus_one,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )alpha2_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				a1 += rstep_a;
+			}
+
+			c11 += rstep_c;
+		}
+	}
 }
 
-INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 )
+/*
+PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR,
+                     ( double* )a11, 1, PACKMR, "%4.1f", "" );
+
+PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" );
+PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" );
+PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" );
+PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" );
+
+PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k_full, a1, 1, MR, "%5.2f", "" );
+PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k_full, NR, bp, NR, 1, "%5.2f", "" );
+*/
 
diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c
index 799fdd101..314ee3070 100644
--- a/frame/3/trsm/bli_trsm_lu_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c
@@ -35,50 +35,27 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T gemm_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffa,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha1,
-       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
-       void*   alpha2,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);
-
-
 void bli_trsm_lu_ker_var2
      (
        const obj_t*  a,
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const cntl_t* cntl,
+             thrinfo_t* thread_par
      )
 {
-	const num_t     dt_exec   = bli_obj_exec_dt( c );
+	const num_t     dt        = bli_obj_exec_dt( c );
+	const dim_t     dt_size   = bli_dt_size( dt );
 
-	const doff_t    diagoffa  = bli_obj_diag_offset( a );
+	      doff_t    diagoffa  = bli_obj_diag_offset( a );
 
 	const pack_t    schema_a  = bli_obj_pack_schema( a );
 	const pack_t    schema_b  = bli_obj_pack_schema( b );
 
-	const dim_t     m         = bli_obj_length( c );
-	const dim_t     n         = bli_obj_width( c );
-	const dim_t     k         = bli_obj_width( a );
+	      dim_t     m         = bli_obj_length( c );
+	      dim_t     n         = bli_obj_width( c );
+	      dim_t     k         = bli_obj_width( a );
 
 	const void*     buf_a     = bli_obj_buffer_at_off( a );
 	const inc_t     cs_a      = bli_obj_col_stride( a );
@@ -110,106 +87,23 @@ void bli_trsm_lu_ker_var2
 	// packing.
 	const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c );
 
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	ftypes[dt_exec]
-	(
-	  diagoffa,
-	  schema_a,
-	  schema_b,
-	  m,
-	  n,
-	  k,
-	  ( void* )buf_alpha1,
-	  ( void* )buf_a, cs_a, pd_a, ps_a,
-	  ( void* )buf_b, rs_b, pd_b, ps_b,
-	  ( void* )buf_alpha2,
-	           buf_c, rs_c, cs_c,
-	  ( cntx_t* )cntx,
-	  rntm,
-	  thread
-	);
-}
+	// Alias some constants to simpler names.
+	const dim_t     MR          = pd_a;
+	const dim_t     NR          = pd_b;
+	const dim_t     PACKMR      = cs_a;
+	const dim_t     PACKNR      = rs_b;
 
+	// Cast the micro-kernel address to its function pointer type.
+	gemmtrsm_ukr_vft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx );
+	gemm_ukr_vft     gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+
+	const void* minus_one   = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE );
+	const char* a_cast      = buf_a;
+	const char* b_cast      = buf_b;
+	      char* c_cast      = buf_c;
+	const char* alpha1_cast = buf_alpha1;
+	const char* alpha2_cast = buf_alpha2;
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffa, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha1, \
-       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
-       void*   alpha2, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt          = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR          = pd_a; \
-	const dim_t     NR          = pd_b; \
-	const dim_t     PACKMR      = cs_a; \
-	const dim_t     PACKNR      = rs_b; \
-\
-	/* Cast the micro-kernel address to its function pointer type. */ \
-	PASTECH(ch,gemmtrsm_ukr_ft) \
-	               gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
-	PASTECH(ch,gemm_ukr_ft) \
-	                   gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-/*
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-*/ \
-\
-	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
-	ctype* restrict a_cast      = a; \
-	ctype* restrict b_cast      = b; \
-	ctype* restrict c_cast      = c; \
-	ctype* restrict alpha1_cast = alpha1; \
-	ctype* restrict alpha2_cast = alpha2; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffa_i; \
-	dim_t           k_full; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           k_a1112; \
-	dim_t           k_a11; \
-	dim_t           k_a12; \
-	dim_t           off_a11; \
-	dim_t           off_a12; \
-	dim_t           i, j, ib; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	inc_t           istep_a; \
-	inc_t           istep_b; \
-	inc_t           ps_a_cur; \
-	inc_t           is_a_cur; \
-	auxinfo_t       aux; \
-\
 	/*
 	   Assumptions/assertions:
 	     rs_a == 1
@@ -222,275 +116,243 @@ void PASTEMAC(ch,varname) \
 	     ps_b == stride to next micro-panel of B
 	     rs_c == (no assumptions)
 	     cs_c == (no assumptions)
-	*/ \
-\
-	/* Safety trap: Certain indexing within this macro-kernel does not
-	   work as intended if both MR and NR are odd. */ \
-	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
-	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
-	   So we do nothing. */ \
-	if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
-\
-	/* Compute k_full as k inflated up to a multiple of MR. This is
-	   needed because some parameter combinations of trsm reduce k
-	   to advance past zero regions in the triangular matrix, and
-	   when computing the imaginary stride of B (the non-triangular
-	   matrix), which is used by 4m1/3m1 implementations, we need
-	   this unreduced value of k. */ \
-	k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
-\
-	/* If there is a zero region to the left of where the diagonal of A
-	   intersects the top edge of the block, adjust the pointer to B and
-	   treat this case as if the diagonal offset were zero. Note that we
-	   don't need to adjust the pointer to A since packm would have simply
-	   skipped over the region that was not stored. */ \
-	if ( diagoffa > 0 ) \
-	{ \
-		i        = diagoffa; \
-		k        = k - i; \
-		diagoffa = 0; \
-		b_cast   = b_cast + i * PACKNR; \
-	} \
-\
-	/* If there is a zero region below where the diagonal of A intersects the
-	   right side of the block, shrink it to prevent "no-op" iterations from
-	   executing. */ \
-	if ( -diagoffa + k < m ) \
-	{ \
-		m = -diagoffa + k; \
-	} \
-\
-	/* Check the k dimension, which needs to be a multiple of MR. If k
-	   isn't a multiple of MR, we adjust it higher to satisfy the micro-
-	   kernel, which is expecting to perform an MR x MR triangular solve.
-	   This adjustment of k is consistent with what happened when A was
-	   packed: all of its bottom/right edges were zero-padded, and
-	   furthermore, the panel that stores the bottom-right corner of the
-	   matrix has its diagonal extended into the zero-padded region (as
-	   identity). This allows the trsm of that bottom-right panel to
-	   proceed without producing any infs or NaNs that would infect the
-	   "good" values of the corresponding block of B. */ \
-	if ( k % MR != 0 ) k += MR - ( k % MR ); \
-\
-	/* NOTE: We don't need to check that m is a multiple of PACKMR since we
-	   know that the underlying buffer was already allocated to have an m
-	   dimension that is a multiple of PACKMR, with the region between the
-	   last row and the next multiple of MR zero-padded accordingly. */ \
-\
-	/* Compute number of primary and leftover components of the m and n
-       dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	istep_a = PACKMR * k; \
-	istep_b = PACKNR * k_full; \
-\
-	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
-	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_b( istep_b, &aux ); \
-\
-	/* We don't bother querying the thrinfo_t node for the 1st loop because
-	   we can't parallelize that loop in trsm due to the inter-iteration
-	   dependencies that exist. */ \
-	/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t jr_inc; \
-\
-	/* Determine the thread range and increment for the 2nd loop.
-	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
-	   slab or round-robin partitioning was requested at configure-time.
-	   NOTE: Parallelism in the 1st loop is unattainable due to the
-	   inter-iteration dependencies present in trsm. */ \
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		a1  = a_cast; \
-		c11 = c1 + (m_iter-1)*rstep_c; \
-\
-		/* Loop over the m dimension (MR rows at a time). */ \
-		for ( ib = 0; ib < m_iter; ++ib ) \
-		{ \
-			i          = m_iter - 1 - ib; \
-			diagoffa_i = diagoffa + ( doff_t )i*MR; \
-\
-			m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* If the current panel of A intersects the diagonal, use a
-			   special micro-kernel that performs a fused gemm and trsm.
-			   If the current panel of A resides above the diagonal, use a
-			   a regular gemm micro-kernel. Otherwise, if it is below the
-			   diagonal, it was not packed (because it is implicitly zero)
-			   and so we do nothing. */ \
-			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
-			{ \
-				ctype* restrict a11; \
-				ctype* restrict a12; \
-				ctype* restrict b11; \
-				ctype* restrict b21; \
-				ctype* restrict a2; \
-\
-				/* Compute various offsets into and lengths of parts of A. */ \
-				off_a11 = diagoffa_i; \
-				k_a1112 = k - off_a11;; \
-				k_a11   = MR; \
-				k_a12   = k_a1112 - MR; \
-				off_a12 = off_a11 + k_a11; \
-\
-				/* Compute the panel stride for the current diagonal-
-				   intersecting micro-panel. */ \
-				is_a_cur  = k_a1112 * PACKMR; \
-				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
-				ps_a_cur  = is_a_cur; \
-\
-				/* Compute the addresses of the triangular block A11 and the
-				   panel A12. */ \
-				a11 = a1; \
-				a12 = a1 + k_a11 * PACKMR; \
-				/*a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, 1 );*/ \
-\
-				/* Compute the addresses of the panel B01 and the block
-				   B11. */ \
-				b11 = b1 + off_a11 * PACKNR; \
-				b21 = b1 + off_a12 * PACKNR; \
-\
-				/* Compute the addresses of the next panels of A and B. */ \
-				a2 = a1 + ps_a_cur; \
-				if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
-				{ \
-					a2 = a_cast; \
-					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
-						b2 = b_cast; \
-				} \
-\
-				/* Save addresses of next panels of A and B to the auxinfo_t
-				   object. */ \
-				bli_auxinfo_set_next_a( a2, &aux ); \
-				bli_auxinfo_set_next_b( b2, &aux ); \
-\
-				gemmtrsm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k_a12, \
-				  alpha1_cast, \
-				  a12, \
-				  a11, \
-				  b21, \
-				  b11, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				a1 += ps_a_cur; \
-			} \
-			else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
-			{ \
-				ctype* restrict a2; \
-\
-				/* Compute the addresses of the next panels of A and B. */ \
-				a2 = a1 + rstep_a; \
-				if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
-				{ \
-					a2 = a_cast; \
-					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
-						b2 = b_cast; \
-				} \
-\
-				/* Save addresses of next panels of A and B to the auxinfo_t
-				   object. */ \
-				bli_auxinfo_set_next_a( a2, &aux ); \
-				bli_auxinfo_set_next_b( b2, &aux ); \
-\
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k, \
-				  minus_one, \
-				  a1, \
-				  b1, \
-				  alpha2_cast, \
-				  c11, rs_c, cs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				a1 += rstep_a; \
-			} \
-\
-			c11 -= rstep_c; \
-		} \
-	} \
-\
-/*
-PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \
-printf( "m_iter     = %lu\n", m_iter ); \
-printf( "m_cur      = %lu\n", m_cur ); \
-printf( "k          = %lu\n", k ); \
-printf( "diagoffa_i = %lu\n", diagoffa_i ); \
-printf( "off_a1112  = %lu\n", off_a1112 ); \
-printf( "k_a1112    = %lu\n", k_a1112 ); \
-printf( "k_a12      = %lu\n", k_a12 ); \
-printf( "k_a11      = %lu\n", k_a11 ); \
-printf( "rs_c,cs_c  = %lu %lu\n", rs_c, cs_c ); \
-printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
-*/ \
-\
-/*
-PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
-*/ \
+	*/
+
+	// Safety trap: Certain indexing within this macro-kernel does not
+	// work as intended if both MR and NR are odd.
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) ||
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort();
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If matrix A is below the diagonal, it is implicitly zero.
+	// So we do nothing.
+	if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return;
+
+	// If there is a zero region to the left of where the diagonal of A
+	// intersects the top edge of the block, adjust the pointer to B and
+	// treat this case as if the diagonal offset were zero. Note that we
+	// don't need to adjust the pointer to A since packm would have simply
+	// skipped over the region that was not stored.
+	if ( diagoffa > 0 )
+	{
+		k        -= diagoffa;
+		b_cast   += diagoffa * PACKNR * dt_size;
+		diagoffa  = 0;
+	}
+
+	// If there is a zero region below where the diagonal of A intersects the
+	// right side of the block, shrink it to prevent "no-op" iterations from
+	// executing.
+	if ( -diagoffa + k < m )
+	{
+		m = -diagoffa + k;
+	}
+
+	// Check the k dimension, which needs to be a multiple of MR. If k
+	// isn't a multiple of MR, we adjust it higher to satisfy the micro-
+	// kernel, which is expecting to perform an MR x MR triangular solve.
+	// This adjustment of k is consistent with what happened when A was
+	// packed: all of its bottom/right edges were zero-padded, and
+	// furthermore, the panel that stores the bottom-right corner of the
+	// matrix has its diagonal extended into the zero-padded region (as
+	// identity). This allows the trsm of that bottom-right panel to
+	// proceed without producing any infs or NaNs that would infect the
+	// "good" values of the corresponding block of B.
+	if ( k % MR != 0 ) k += MR - ( k % MR );
+
+	// NOTE: We don't need to check that m is a multiple of PACKMR since we
+	// know that the underlying buffer was already allocated to have an m
+	// dimension that is a multiple of PACKMR, with the region between the
+	// last row and the next multiple of MR zero-padded accordingly.
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	dim_t n_iter = n / NR;
+	dim_t n_left = n % NR;
+
+	dim_t m_iter = m / MR;
+	dim_t m_left = m % MR;
+
+	if ( n_left ) ++n_iter;
+	if ( m_left ) ++m_iter;
+
+	// Determine some increments used to step through A, B, and C.
+	inc_t rstep_a = ps_a * dt_size;
+
+	inc_t cstep_b = ps_b * dt_size;
+
+	inc_t rstep_c = rs_c * MR * dt_size;
+	inc_t cstep_c = cs_c * NR * dt_size;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	auxinfo_t aux;
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	// We don't bother querying the thrinfo_t node for the 1st loop because
+	// we can't parallelize that loop in trsm due to the inter-iteration
+	// dependencies that exist.
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+
+	// Query the number of threads and thread ids for each loop.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	dim_t jr_tid = bli_thrinfo_work_id( thread );
+
+	dim_t jr_start, jr_end;
+	dim_t jr_inc;
+
+	// Determine the thread range and increment for the 2nd loop.
+	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// slab or round-robin partitioning was requested at configure-time.
+	// NOTE: Parallelism in the 1st loop is unattainable due to the
+	// inter-iteration dependencies present in trsm.
+	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
+	{
+		const char* b1 = b_cast + j * cstep_b;
+		      char* c1 = c_cast + j * cstep_c;
+
+		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		const char* a1  = a_cast;
+		      char* c11 = c1 + (m_iter-1)*rstep_c;
+
+		// Loop over the m dimension (MR rows at a time).
+		for ( dim_t ib = 0; ib < m_iter; ++ib )
+		{
+			dim_t  i          = m_iter - 1 - ib;
+			doff_t diagoffa_i = diagoffa + ( doff_t )i*MR;
+
+			dim_t  m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left );
+
+			// If the current panel of A intersects the diagonal, use a
+			// special micro-kernel that performs a fused gemm and trsm.
+			// If the current panel of A resides above the diagonal, use a
+			// regular gemm micro-kernel. Otherwise, if it is below the
+			// diagonal, it was not packed (because it is implicitly zero)
+			// and so we do nothing.
+			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) )
+			{
+				// Compute various offsets into and lengths of parts of A.
+				dim_t off_a11 = diagoffa_i;
+				dim_t k_a1112 = k - off_a11;;
+				dim_t k_a11   = MR;
+				dim_t k_a12   = k_a1112 - MR;
+				dim_t off_a12 = off_a11 + k_a11;
+
+				// Compute the panel stride for the current diagonal-
+				// intersecting micro-panel.
+				inc_t ps_a_cur  = k_a1112 * PACKMR;
+				      ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
+				      ps_a_cur *= dt_size;
+
+				// Compute the addresses of the triangular block A11 and the
+				// panel A12.
+				const char* a11 = a1;
+				const char* a12 = a1 + k_a11 * PACKMR * dt_size;
+				//a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, 1 );
+
+				// Compute the addresses of the block B11 and the panel
+				// B21.
+				const char* b11 = b1 + off_a11 * PACKNR * dt_size;
+				const char* b21 = b1 + off_a12 * PACKNR * dt_size;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = a1 + ps_a_cur;
+				if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) )
+				{
+					a2 = a_cast;
+					b2 = b1;
+					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
+				gemmtrsm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k_a12,
+				  ( void* )alpha1_cast,
+				  ( void* )a12,
+				  ( void* )a11,
+				  ( void* )b21,
+				  ( void* )b11,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				a1 += ps_a_cur;
+			}
+			else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) )
+			{
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = a1 + rstep_a;
+				if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) )
+				{
+					a2 = a_cast;
+					b2 = b1;
+					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )minus_one,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )alpha2_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				a1 += rstep_a;
+			}
+
+			c11 -= rstep_c;
+		}
+	}
 }
 
-INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 )
+/*
+PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" );
+PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" );
+printf( "m_iter     = %lu\n", m_iter );
+printf( "m_cur      = %lu\n", m_cur );
+printf( "k          = %lu\n", k );
+printf( "diagoffa_i = %lu\n", diagoffa_i );
+printf( "off_a1112  = %lu\n", off_a1112 );
+printf( "k_a1112    = %lu\n", k_a1112 );
+printf( "k_a12      = %lu\n", k_a12 );
+printf( "k_a11      = %lu\n", k_a11 );
+printf( "rs_c,cs_c  = %lu %lu\n", rs_c, cs_c );
+printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct );
+
+PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" );
+PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" );
+PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" );
+*/
 
diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c
index 721203df7..42e72840e 100644
--- a/frame/3/trsm/bli_trsm_rl_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c
@@ -35,50 +35,27 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T gemm_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffb,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha1,
-       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
-       void*   alpha2,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2);
-
-
 void bli_trsm_rl_ker_var2
      (
        const obj_t*  a,
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const cntl_t* cntl,
+             thrinfo_t* thread_par
      )
 {
-	const num_t     dt_exec   = bli_obj_exec_dt( c );
+	const num_t     dt        = bli_obj_exec_dt( c );
+	const dim_t     dt_size   = bli_dt_size ( dt );
 
-	const doff_t    diagoffb  = bli_obj_diag_offset( b );
+	      doff_t    diagoffb  = bli_obj_diag_offset( b );
 
 	const pack_t    schema_a  = bli_obj_pack_schema( a );
 	const pack_t    schema_b  = bli_obj_pack_schema( b );
 
-	const dim_t     m         = bli_obj_length( c );
-	const dim_t     n         = bli_obj_width( c );
-	const dim_t     k         = bli_obj_width( a );
+	      dim_t     m         = bli_obj_length( c );
+	      dim_t     n         = bli_obj_width( c );
+	      dim_t     k         = bli_obj_width( a );
 
 	const void*     buf_a     = bli_obj_buffer_at_off( a );
 	const inc_t     cs_a      = bli_obj_col_stride( a );
@@ -110,111 +87,28 @@ void bli_trsm_rl_ker_var2
 	// packing.
 	const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c );
 
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	ftypes[dt_exec]
-	(
-	  diagoffb,
-	  schema_a,
-	  schema_b,
-	  m,
-	  n,
-	  k,
-	  ( void* )buf_alpha1,
-	  ( void* )buf_a, cs_a, pd_a, ps_a,
-	  ( void* )buf_b, rs_b, pd_b, ps_b,
-	  ( void* )buf_alpha2,
-	           buf_c, rs_c, cs_c,
-	  ( cntx_t* )cntx,
-	  rntm,
-	  thread
-	);
-}
+	// Alias some constants to simpler names.
+	const dim_t     MR          = pd_a;
+	const dim_t     NR          = pd_b;
+	const dim_t     PACKMR      = cs_a;
+	const dim_t     PACKNR      = rs_b;
 
+	// Query the context for the gemmtrsm and gemm micro-kernel function pointers.
+	// NOTE: We use the upper-triangular gemmtrsm ukernel because, while
+	// the current macro-kernel targets the "rl" case (right-side/lower-
+	// triangular), it becomes upper-triangular after the kernel operation
+	// is transposed so that all kernel instances are of the "left"
+	// variety (since those are the only trsm ukernels that exist).
+	gemmtrsm_ukr_vft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx );
+	gemm_ukr_vft     gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+
+	const void* minus_one   = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE );
+	const char* a_cast      = buf_a;
+	const char* b_cast      = buf_b;
+	      char* c_cast      = buf_c;
+	const char* alpha1_cast = buf_alpha1;
+	const char* alpha2_cast = buf_alpha2;
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffb, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha1, \
-       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
-       void*   alpha2, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt          = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR          = pd_a; \
-	const dim_t     NR          = pd_b; \
-	const dim_t     PACKMR      = cs_a; \
-	const dim_t     PACKNR      = rs_b; \
-\
-	/* Cast the micro-kernel address to its function pointer type. */ \
-	/* NOTE: We use the upper-triangular gemmtrsm ukernel because, while
-	   the current macro-kernel targets the "rl" case (right-side/lower-
-	   triangular), it becomes upper-triangular after the kernel operation
-	   is transposed so that all kernel instances are of the "left"
-	   variety (since those are the only trsm ukernels that exist). */ \
-	PASTECH(ch,gemmtrsm_ukr_ft) \
-	               gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
-	PASTECH(ch,gemm_ukr_ft) \
-	                   gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-/*
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-*/ \
-\
-	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
-	ctype* restrict a_cast      = a; \
-	ctype* restrict b_cast      = b; \
-	ctype* restrict c_cast      = c; \
-	ctype* restrict alpha1_cast = alpha1; \
-	ctype* restrict alpha2_cast = alpha2; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffb_j; \
-	dim_t           k_full; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           k_b1121; \
-	dim_t           k_b11; \
-	dim_t           k_b21; \
-	dim_t           off_b11; \
-	dim_t           off_b21; \
-	dim_t           i, j, jb; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	inc_t           istep_a; \
-	inc_t           istep_b; \
-	inc_t           ps_b_cur; \
-	inc_t           is_b_cur; \
-	auxinfo_t       aux; \
-\
 	/*
 	   Assumptions/assertions:
 	     rs_a == 1
@@ -235,41 +129,32 @@ void PASTEMAC(ch,varname) \
 	  transposing the operation, then A needs to be packed with NR and B
 	  needs to be packed with MR (remember: B is the triangular matrix in
 	  the right-hand side parameter case).
-	*/ \
-\
+	*/
+
 	/* Safety trap: Certain indexing within this macro-kernel does not
-	   work as intended if both MR and NR are odd. */ \
-	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
-	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
+	   work as intended if PACKMR and NR, or PACKNR and MR, are both odd. */
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) ||
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort();
+
+	/* If any dimension is zero, return immediately. */
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
 	/* Safeguard: If the current panel of B is entirely above its diagonal,
-	   it is implicitly zero. So we do nothing. */ \
-	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
-\
-	/* Compute k_full as k inflated up to a multiple of NR. This is
-	   needed because some parameter combinations of trsm reduce k
-	   to advance past zero regions in the triangular matrix, and
-	   when computing the imaginary stride of B (the non-triangular
-	   matrix), which is used by 4m1/3m1 implementations, we need
-	   this unreduced value of k. */ \
-	k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
-\
+	   it is implicitly zero. So we do nothing. */
+	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return;
+
 	/* If there is a zero region above where the diagonal of B intersects
 	   the left edge of the panel, adjust the pointer to A and treat this
 	   case as if the diagonal offset were zero. Note that we don't need to
 	   adjust the pointer to B since packm would have simply skipped over
-	   the region that was not stored. */ \
-	if ( diagoffb < 0 ) \
-	{ \
-		j        = -diagoffb; \
-		k        = k - j; \
-		diagoffb = 0; \
-		a_cast   = a_cast + j * PACKMR; \
-	} \
-\
+	   the region that was not stored. */
+	if ( diagoffb < 0 )
+	{
+		k        += diagoffb;
+		a_cast   -= diagoffb * PACKMR * dt_size;
+		diagoffb  = 0;
+	}
+
 	/* If there is a zero region to the right of where the diagonal
 	   of B intersects the bottom of the panel, shrink it so that
 	   we can index to the correct place in C (corresponding to the
@@ -277,12 +162,12 @@ void PASTEMAC(ch,varname) \
 	   NOTE: This is NOT being done to skip over "no-op" iterations,
 	   as with the trsm_lu macro-kernel. This MUST be done for correct
 	   execution because we use n (via n_iter) to compute diagonal and
-	   index offsets for backwards movement through B. */ \
-	if ( diagoffb + k < n ) \
-	{ \
-		n = diagoffb + k; \
-	} \
-\
+	   index offsets for backwards movement through B. */
+	if ( diagoffb + k < n )
+	{
+		n = diagoffb + k;
+	}
+
 	/* Check the k dimension, which needs to be a multiple of NR. If k
 	   isn't a multiple of NR, we adjust it higher to satisfy the micro-
 	   kernel, which is expecting to perform an NR x NR triangular solve.
@@ -292,209 +177,188 @@ void PASTEMAC(ch,varname) \
 	   matrix has its diagonal extended into the zero-padded region (as
 	   identity). This allows the trsm of that bottom-right panel to
 	   proceed without producing any infs or NaNs that would infect the
-	   "good" values of the corresponding block of A. */ \
-	if ( k % NR != 0 ) k += NR - ( k % NR ); \
-\
+	   "good" values of the corresponding block of A. */
+	if ( k % NR != 0 ) k += NR - ( k % NR );
+
 	/* NOTE: We don't need to check that n is a multiple of PACKNR since we
 	   know that the underlying buffer was already allocated to have an n
 	   dimension that is a multiple of PACKNR, with the region between the
-	   last column and the next multiple of NR zero-padded accordingly. */ \
-\
+	   last column and the next multiple of NR zero-padded accordingly. */
+
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+
 	/* Compute number of primary and leftover components of the m and n
-       dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	istep_a = PACKMR * k_full; \
-	istep_b = PACKNR * k; \
-\
-	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
-	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
-\
+	   dimensions. */
+	dim_t n_iter = n / NR;
+	dim_t n_left = n % NR;
+
+	dim_t m_iter = m / MR;
+	dim_t m_left = m % MR;
+
+	if ( n_left ) ++n_iter;
+	if ( m_left ) ++m_iter;
+
+	/* Determine some increments used to step through A, B, and C. */
+	inc_t rstep_a = ps_a * dt_size;
+
+	inc_t cstep_b = ps_b * dt_size;
+
+	inc_t rstep_c = rs_c * MR * dt_size;
+	inc_t cstep_c = cs_c * NR * dt_size;
+
 	/* Save the pack schemas of A and B to the auxinfo_t object.
 	   NOTE: We swap the values for A and B since the triangular
-	   "A" matrix is actually contained within B. */ \
-	bli_auxinfo_set_schema_a( schema_b, &aux ); \
-	bli_auxinfo_set_schema_b( schema_a, &aux ); \
-\
-	/* Save the imaginary stride of A to the auxinfo_t object.
-	   NOTE: We swap the values for A and B since the triangular
-	   "A" matrix is actually contained within B. */ \
-	bli_auxinfo_set_is_b( istep_a, &aux ); \
-\
-	b1 = b_cast; \
-	c1 = c_cast; \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( jb = 0; jb < n_iter; ++jb ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b11; \
-		ctype* restrict b21; \
-		ctype* restrict b2; \
-\
-		j          = n_iter - 1 - jb; \
-		diagoffb_j = diagoffb - ( doff_t )j*NR; \
-		a1         = a_cast; \
-		c11        = c1 + (n_iter-1)*cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
+	   "A" matrix is actually contained within B. */
+	auxinfo_t aux;
+	bli_auxinfo_set_schema_a( schema_b, &aux );
+	bli_auxinfo_set_schema_b( schema_a, &aux );
+
+	const char* b1 = b_cast;
+	      char* c1 = c_cast;
+
+	/* Loop over the n dimension (NR columns at a time). */
+	for ( dim_t jb = 0; jb < n_iter; ++jb )
+	{
+		dim_t  j          = n_iter - 1 - jb;
+		doff_t diagoffb_j = diagoffb - ( doff_t )j*NR;
+
+		dim_t  n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left );
+
+		const char* a1         = a_cast;
+		      char* c11        = c1 + (n_iter-1)*cstep_c;
+
+		/* Initialize our next panel of B to be the current panel of B. */
+		const char* b2 = b1;
+
 		/* If the current panel of B intersects the diagonal, use a
 		   special micro-kernel that performs a fused gemm and trsm.
 		   If the current panel of B resides below the diagonal, use a
 		   a regular gemm micro-kernel. Otherwise, if it is above the
 		   diagonal, it was not packed (because it is implicitly zero)
-		   and so we do nothing. */ \
-		if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
-		{ \
+		   and so we do nothing. */
+		if ( bli_intersects_diag_n( diagoffb_j, k, NR ) )
+		{
 			/* Determine the offset to and length of the panel that was packed
-			   so we can index into the corresponding location in A. */ \
-			off_b11   = bli_max( -diagoffb_j, 0 ); \
-			k_b1121   = k - off_b11; \
-			k_b11     = NR; \
-			k_b21     = k_b1121 - NR; \
-			off_b21   = off_b11 + k_b11; \
-\
+			   so we can index into the corresponding location in A. */
+			dim_t off_b11   = bli_max( -diagoffb_j, 0 );
+			dim_t k_b1121   = k - off_b11;
+			dim_t k_b11     = NR;
+			dim_t k_b21     = k_b1121 - NR;
+			dim_t off_b21   = off_b11 + k_b11;
+
 			/* Compute the addresses of the triangular block B11 and the
-			   panel B21. */ \
-			b11 = b1; \
-			b21 = b1 + k_b11 * PACKNR; \
-			/*b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, 1 );*/ \
-\
-			/* Compute the panel stride for the current micro-panel. */ \
-			is_b_cur  = k_b1121 * PACKNR; \
-			is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
-			ps_b_cur  = is_b_cur; \
-\
-			/* Loop over the m dimension (MR rows at a time). */ \
-			for ( i = 0; i < m_iter; ++i ) \
-			{ \
-				if ( bli_trsm_my_iter_rr( i, thread ) ){ \
-\
-				ctype* restrict a11; \
-				ctype* restrict a12; \
-				ctype* restrict a2; \
-\
-				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-				/* Compute the addresses of the A11 block and A12 panel. */ \
-				a11  = a1 + off_b11 * PACKMR; \
-				a12  = a1 + off_b21 * PACKMR; \
-\
-				/* Compute the addresses of the next panels of A and B. */ \
-				a2 = a1; \
-				/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\
-				if ( i + bli_thread_num_threads(thread) >= m_iter ) \
-				{ \
-					a2 = a_cast; \
-					b2 = b1 + ps_b_cur; \
-					if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \
-						b2 = b_cast; \
-				} \
-\
+			   panel B21. */
+			const char* b11 = b1;
+			const char* b21 = b1 + k_b11 * PACKNR * dt_size;
+			/*b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, 1 );*/
+
+			/* Compute the panel stride for the current micro-panel. */
+			inc_t ps_b_cur  = k_b1121 * PACKNR;
+				  ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
+				  ps_b_cur *= dt_size;
+
+			/* Loop over the m dimension (MR rows at a time). */
+			for ( dim_t i = 0; i < m_iter; ++i )
+			{
+				if ( bli_trsm_my_iter_rr( i, thread ) ){
+
+				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+				/* Compute the addresses of the A11 block and A12 panel. */
+				const char* a11  = a1 + off_b11 * PACKMR * dt_size;
+				const char* a12  = a1 + off_b21 * PACKMR * dt_size;
+
+				/* Compute the addresses of the next panels of A and B. */
+				const char* a2 = a1;
+				/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */
+				if ( i + bli_thrinfo_num_threads(thread) >= m_iter )
+				{
+					a2 = a_cast;
+					b2 = b1 + ps_b_cur;
+					if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) )
+						b2 = b_cast;
+				}
+
 				/* Save addresses of next panels of A and B to the auxinfo_t
 				   object. NOTE: We swap the values for A and B since the
-				   triangular "A" matrix is actually contained within B. */ \
-				bli_auxinfo_set_next_a( b2, &aux ); \
-				bli_auxinfo_set_next_b( a2, &aux ); \
-\
-				gemmtrsm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k_b21, \
-				  alpha1_cast, \
-				  b21, \
-				  b11, \
-				  a12, \
-				  a11, \
-				  c11, cs_c, rs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				} \
-\
-				a1  += rstep_a; \
-				c11 += rstep_c; \
-			} \
-\
-			b1 += ps_b_cur; \
-		} \
-		else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
-		{ \
-			/* Loop over the m dimension (MR rows at a time). */ \
-			for ( i = 0; i < m_iter; ++i ) \
-			{ \
-				if ( bli_trsm_my_iter_rr( i, thread ) ){ \
-\
-				ctype* restrict a2; \
-\
-				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-				/* Compute the addresses of the next panels of A and B. */ \
-				a2 = a1; \
-				/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\
-				if ( i + bli_thread_num_threads(thread) >= m_iter ) \
-				{ \
-					a2 = a_cast; \
-					b2 = b1 + cstep_b; \
-					if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \
-						b2 = b_cast; \
-				} \
-\
+				   triangular "A" matrix is actually contained within B. */
+				bli_auxinfo_set_next_a( b2, &aux );
+				bli_auxinfo_set_next_b( a2, &aux );
+
+				gemmtrsm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k_b21,
+				  ( void* )alpha1_cast,
+				  ( void* )b21,
+				  ( void* )b11,
+				  ( void* )a12,
+				  ( void* )a11,
+				  c11, cs_c, rs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				}
+
+				a1  += rstep_a;
+				c11 += rstep_c;
+			}
+
+			b1 += ps_b_cur;
+		}
+		else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) )
+		{
+			/* Loop over the m dimension (MR rows at a time). */
+			for ( dim_t i = 0; i < m_iter; ++i )
+			{
+				if ( bli_trsm_my_iter_rr( i, thread ) ){
+
+				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+				/* Compute the addresses of the next panels of A and B. */
+				const char* a2 = a1;
+				/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */
+				if ( i + bli_thrinfo_num_threads(thread) >= m_iter )
+				{
+					a2 = a_cast;
+					b2 = b1 + cstep_b;
+					if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) )
+						b2 = b_cast;
+				}
+
 				/* Save addresses of next panels of A and B to the auxinfo_t
 				   object. NOTE: We swap the values for A and B since the
-				   triangular "A" matrix is actually contained within B. */ \
-				bli_auxinfo_set_next_a( b2, &aux ); \
-				bli_auxinfo_set_next_b( a2, &aux ); \
-\
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k, \
-				  minus_one, \
-				  b1, \
-				  a1, \
-				  alpha2_cast, \
-				  c11, cs_c, rs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				} \
-\
-				a1  += rstep_a; \
-				c11 += rstep_c; \
-			} \
-\
-			b1 += cstep_b; \
-		} \
-\
-		c1 -= cstep_c; \
-	} \
-}
+				   triangular "A" matrix is actually contained within B. */
+				bli_auxinfo_set_next_a( b2, &aux );
+				bli_auxinfo_set_next_b( a2, &aux );
+
+				/* Invoke the gemm micro-kernel. */
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )minus_one,
+				  ( void* )b1,
+				  ( void* )a1,
+				  ( void* )alpha2_cast,
+				  c11, cs_c, rs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				}
 
-INSERT_GENTFUNC_BASIC0( trsm_rl_ker_var2 )
+				a1  += rstep_a;
+				c11 += rstep_c;
+			}
+
+			b1 += cstep_b;
+		}
+
+		c1 -= cstep_c;
+	}
+}
 
diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c
index 447fbf8cd..6cc9a8bbb 100644
--- a/frame/3/trsm/bli_trsm_ru_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c
@@ -35,50 +35,27 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T gemm_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffb,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha1,
-       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
-       void*   alpha2,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2);
-
-
 void bli_trsm_ru_ker_var2
      (
        const obj_t*  a,
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
-             thrinfo_t* thread
+       const cntl_t* cntl,
+             thrinfo_t* thread_par
      )
 {
-	const num_t     dt_exec   = bli_obj_exec_dt( c );
+	const num_t     dt        = bli_obj_exec_dt( c );
+	const dim_t     dt_size   = bli_dt_size( dt );
 
-	const doff_t    diagoffb  = bli_obj_diag_offset( b );
+	      doff_t    diagoffb  = bli_obj_diag_offset( b );
 
 	const pack_t    schema_a  = bli_obj_pack_schema( a );
 	const pack_t    schema_b  = bli_obj_pack_schema( b );
 
-	const dim_t     m         = bli_obj_length( c );
-	const dim_t     n         = bli_obj_width( c );
-	const dim_t     k         = bli_obj_width( a );
+	      dim_t     m         = bli_obj_length( c );
+	      dim_t     n         = bli_obj_width( c );
+	      dim_t     k         = bli_obj_width( a );
 
 	const void*     buf_a     = bli_obj_buffer_at_off( a );
 	const inc_t     cs_a      = bli_obj_col_stride( a );
@@ -110,110 +87,28 @@ void bli_trsm_ru_ker_var2
 	// packing.
 	const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c );
 
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	ftypes[dt_exec]
-	(
-	  diagoffb,
-	  schema_a,
-	  schema_b,
-	  m,
-	  n,
-	  k,
-	  ( void* )buf_alpha1,
-	  ( void* )buf_a, cs_a, pd_a, ps_a,
-	  ( void* )buf_b, rs_b, pd_b, ps_b,
-	  ( void* )buf_alpha2,
-	           buf_c, rs_c, cs_c,
-	  ( cntx_t* )cntx,
-	  rntm,
-	  thread
-	);
-}
+	// Alias some constants to simpler names.
+	const dim_t     MR          = pd_a;
+	const dim_t     NR          = pd_b;
+	const dim_t     PACKMR      = cs_a;
+	const dim_t     PACKNR      = rs_b;
 
+	// Query the context for the gemmtrsm and gemm micro-kernel function pointers.
+	// NOTE: We use the lower-triangular gemmtrsm ukernel because, while
+	// the current macro-kernel targets the "ru" case (right-side/upper-
+	// triangular), it becomes lower-triangular after the kernel operation
+	// is transposed so that all kernel instances are of the "left"
+	// variety (since those are the only trsm ukernels that exist).
+	gemmtrsm_ukr_vft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx );
+	gemm_ukr_vft     gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+
+	const void* minus_one   = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE );
+	const char* a_cast      = buf_a;
+	const char* b_cast      = buf_b;
+	      char* c_cast      = buf_c;
+	const char* alpha1_cast = buf_alpha1;
+	const char* alpha2_cast = buf_alpha2;
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffb, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha1, \
-       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
-       void*   alpha2, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt          = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR          = pd_a; \
-	const dim_t     NR          = pd_b; \
-	const dim_t     PACKMR      = cs_a; \
-	const dim_t     PACKNR      = rs_b; \
-\
-	/* Cast the micro-kernel address to its function pointer type. */ \
-	/* NOTE: We use the lower-triangular gemmtrsm ukernel because, while
-	   the current macro-kernel targets the "ru" case (right-side/upper-
-	   triangular), it becomes lower-triangular after the kernel operation
-	   is transposed so that all kernel instances are of the "left"
-	   variety (since those are the only trsm ukernels that exist). */ \
-	PASTECH(ch,gemmtrsm_ukr_ft) \
-	               gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
-	PASTECH(ch,gemm_ukr_ft) \
-	                   gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-/*
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-*/ \
-\
-	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
-	ctype* restrict a_cast      = a; \
-	ctype* restrict b_cast      = b; \
-	ctype* restrict c_cast      = c; \
-	ctype* restrict alpha1_cast = alpha1; \
-	ctype* restrict alpha2_cast = alpha2; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffb_j; \
-	dim_t           k_full; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           k_b0111; \
-	dim_t           k_b01; \
-	dim_t           off_b01; \
-	dim_t           off_b11; \
-	dim_t           i, j; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	inc_t           istep_a; \
-	inc_t           istep_b; \
-	inc_t           ps_b_cur; \
-	inc_t           is_b_cur; \
-	auxinfo_t       aux; \
-\
 	/*
 	   Assumptions/assertions:
 	     rs_a == 1
@@ -234,260 +129,230 @@ void PASTEMAC(ch,varname) \
 	  transposing the operation, then A needs to be packed with NR and B
 	  needs to be packed with MR (remember: B is the triangular matrix in
 	  the right-hand side parameter case).
-	*/ \
-\
-	/* Safety trap: Certain indexing within this macro-kernel does not
-	   work as intended if both MR and NR are odd. */ \
-	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
-	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of B is entirely below its diagonal,
-	   it is implicitly zero. So we do nothing. */ \
-	if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
-\
-	/* Compute k_full as k inflated up to a multiple of NR. This is
-	   needed because some parameter combinations of trsm reduce k
-	   to advance past zero regions in the triangular matrix, and
-	   when computing the imaginary stride of B (the non-triangular
-	   matrix), which is used by 4m1/3m1 implementations, we need
-	   this unreduced value of k. */ \
-	k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
-\
-	/* If there is a zero region to the left of where the diagonal of B
-	   intersects the top edge of the panel, adjust the pointer to C and
-	   treat this case as if the diagonal offset were zero. This skips over
-	   the region that was not packed. (Note we assume the diagonal offset
-	   is a multiple of MR; this assumption will hold as long as the cache
-	   blocksizes are each a multiple of MR and NR.) */ \
-	if ( diagoffb > 0 ) \
-	{ \
-		j        = diagoffb; \
-		n        = n - j; \
-		diagoffb = 0; \
-		c_cast   = c_cast + (j  )*cs_c; \
-	} \
-\
-	/* If there is a zero region below where the diagonal of B intersects the
-	   right side of the block, shrink it to prevent "no-op" iterations from
-	   executing. */ \
-	if ( -diagoffb + n < k ) \
-	{ \
-		k = -diagoffb + n; \
-	} \
-\
-	/* Check the k dimension, which needs to be a multiple of NR. If k
-	   isn't a multiple of NR, we adjust it higher to satisfy the micro-
-	   kernel, which is expecting to perform an NR x NR triangular solve.
-	   This adjustment of k is consistent with what happened when B was
-	   packed: all of its bottom/right edges were zero-padded, and
-	   furthermore, the panel that stores the bottom-right corner of the
-	   matrix has its diagonal extended into the zero-padded region (as
-	   identity). This allows the trsm of that bottom-right panel to
-	   proceed without producing any infs or NaNs that would infect the
-	   "good" values of the corresponding block of A. */ \
-	if ( k % NR != 0 ) k += NR - ( k % NR ); \
-\
-	/* NOTE: We don't need to check that n is a multiple of PACKNR since we
-	   know that the underlying buffer was already allocated to have an n
-	   dimension that is a multiple of PACKNR, with the region between the
-	   last column and the next multiple of NR zero-padded accordingly. */ \
-\
-	/* Compute number of primary and leftover components of the m and n
-       dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	istep_a = PACKMR * k_full; \
-	istep_b = PACKNR * k; \
-\
-	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
-	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object.
-	   NOTE: We swap the values for A and B since the triangular
-	   "A" matrix is actually contained within B. */ \
-	bli_auxinfo_set_schema_a( schema_b, &aux ); \
-	bli_auxinfo_set_schema_b( schema_a, &aux ); \
-\
-	/* Save the imaginary stride of A to the auxinfo_t object.
-	   NOTE: We swap the values for A and B since the triangular
-	   "A" matrix is actually contained within B. */ \
-	bli_auxinfo_set_is_b( istep_a, &aux ); \
-\
-	b1 = b_cast; \
-	c1 = c_cast; \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = 0; j < n_iter; ++j ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b01; \
-		ctype* restrict b11; \
-		ctype* restrict b2; \
-\
-		diagoffb_j = diagoffb - ( doff_t )j*NR; \
-		a1         = a_cast; \
-		c11        = c1; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* If the current panel of B intersects the diagonal, use a
-		   special micro-kernel that performs a fused gemm and trsm.
-		   If the current panel of B resides above the diagonal, use a
-		   a regular gemm micro-kernel. Otherwise, if it is below the
-		   diagonal, it was not packed (because it is implicitly zero)
-		   and so we do nothing. */ \
-		if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
-		{ \
-			/* Determine the offset to and length of the panel that was packed
-			   so we can index into the corresponding location in A. */ \
-			off_b01   = 0; \
-			k_b0111   = bli_min( k, -diagoffb_j + NR ); \
-			k_b01     = k_b0111 - NR; \
-			off_b11   = k_b01; \
-\
-			/* Compute the addresses of the panel B10 and the triangular
-			   block B11. */ \
-			b01 = b1; \
-			b11 = b1 + k_b01 * PACKNR; \
-			/*b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, 1 );*/ \
-\
-			/* Compute the panel stride for the current micro-panel. */ \
-			is_b_cur  = k_b0111 * PACKNR; \
-			is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
-			ps_b_cur  = is_b_cur; \
-\
-			/* Loop over the m dimension (MR rows at a time). */ \
-			for ( i = 0; i < m_iter; ++i ) \
-			{ \
-				if ( bli_trsm_my_iter_rr( i, thread ) ){ \
-\
-				ctype* restrict a10; \
-				ctype* restrict a11; \
-				ctype* restrict a2; \
-\
-				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-				/* Compute the addresses of the A10 panel and A11 block. */ \
-				a10  = a1 + off_b01 * PACKMR; \
-				a11  = a1 + off_b11 * PACKMR; \
-\
-				/* Compute the addresses of the next panels of A and B. */ \
-				a2 = a1; \
-				/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\
-				if ( i + bli_thread_num_threads(thread) >= m_iter ) \
-				{ \
-					a2 = a_cast; \
-					b2 = b1 + ps_b_cur; \
-					if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \
-						b2 = b_cast; \
-				} \
-\
-				/* Save addresses of next panels of A and B to the auxinfo_t
-				   object. NOTE: We swap the values for A and B since the
-				   triangular "A" matrix is actually contained within B. */ \
-				bli_auxinfo_set_next_a( b2, &aux ); \
-				bli_auxinfo_set_next_b( a2, &aux ); \
-\
-				gemmtrsm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k_b01, \
-				  alpha1_cast, \
-				  b01, \
-				  b11, \
-				  a10, \
-				  a11, \
-				  c11, cs_c, rs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				} \
-\
-				a1  += rstep_a; \
-				c11 += rstep_c; \
-			} \
-\
-			b1 += ps_b_cur; \
-		} \
-		else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \
-		{ \
-			/* Loop over the m dimension (MR rows at a time). */ \
-			for ( i = 0; i < m_iter; ++i ) \
-			{ \
-				if ( bli_trsm_my_iter_rr( i, thread ) ){ \
-\
-				ctype* restrict a2; \
-\
-				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-				/* Compute the addresses of the next panels of A and B. */ \
-				a2 = a1; \
-				/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\
-				if ( i + bli_thread_num_threads(thread) >= m_iter ) \
-				{ \
-					a2 = a_cast; \
-					b2 = b1 + cstep_b; \
-					if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \
-						b2 = b_cast; \
-				} \
-\
-				/* Save addresses of next panels of A and B to the auxinfo_t
-				   object. NOTE: We swap the values for A and B since the
-				   triangular "A" matrix is actually contained within B. */ \
-				bli_auxinfo_set_next_a( b2, &aux ); \
-				bli_auxinfo_set_next_b( a2, &aux ); \
-\
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  m_cur, \
-				  n_cur, \
-				  k, \
-				  minus_one, \
-				  b1, \
-				  a1, \
-				  alpha2_cast, \
-				  c11, cs_c, rs_c, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				} \
-\
-				a1  += rstep_a; \
-				c11 += rstep_c; \
-			} \
-\
-			b1 += cstep_b; \
-		} \
-\
-		c1 += cstep_c; \
-	} \
-}
+	*/
+
+	// Safety trap: Certain indexing within this macro-kernel does not
+	// work as intended if both MR and NR are odd.
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) ||
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort();
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If the current panel of B is entirely below its diagonal,
+	// it is implicitly zero. So we do nothing.
+	if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return;
+
+	// If there is a zero region to the left of where the diagonal of B
+	// intersects the top edge of the panel, adjust the pointer to C and
+	// treat this case as if the diagonal offset were zero. This skips over
+	// the region that was not packed. (Note we assume the diagonal offset
+	// is a multiple of MR; this assumption will hold as long as the cache
+	// blocksizes are each a multiple of MR and NR.)
+	if ( diagoffb > 0 )
+	{
+		n        -= diagoffb;
+		c_cast   += diagoffb * cs_c * dt_size;
+		diagoffb  = 0;
+	}
+
+	// If there is a zero region below where the diagonal of B intersects the
+	// right side of the block, shrink it to prevent "no-op" iterations from
+	// executing.
+	if ( -diagoffb + n < k )
+	{
+		k = -diagoffb + n;
+	}
+
+	// Check the k dimension, which needs to be a multiple of NR. If k
+	// isn't a multiple of NR, we adjust it higher to satisfy the micro-
+	// kernel, which is expecting to perform an NR x NR triangular solve.
+	// This adjustment of k is consistent with what happened when B was
+	// packed: all of its bottom/right edges were zero-padded, and
+	// furthermore, the panel that stores the bottom-right corner of the
+	// matrix has its diagonal extended into the zero-padded region (as
+	// identity). This allows the trsm of that bottom-right panel to
+	// proceed without producing any infs or NaNs that would infect the
+	// "good" values of the corresponding block of A.
+	if ( k % NR != 0 ) k += NR - ( k % NR );
+
+	// NOTE: We don't need to check that n is a multiple of PACKNR since we
+	// know that the underlying buffer was already allocated to have an n
+	// dimension that is a multiple of PACKNR, with the region between the
+	// last column and the next multiple of NR zero-padded accordingly.
+
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	dim_t n_iter = n / NR;
+	dim_t n_left = n % NR;
+
+	dim_t m_iter = m / MR;
+	dim_t m_left = m % MR;
+
+	if ( n_left ) ++n_iter;
+	if ( m_left ) ++m_iter;
+
+	// Determine some increments used to step through A, B, and C.
+	inc_t rstep_a = ps_a * dt_size;
+
+	inc_t cstep_b = ps_b * dt_size;
+
+	inc_t rstep_c = rs_c * MR * dt_size;
+	inc_t cstep_c = cs_c * NR * dt_size;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	// NOTE: We swap the values for A and B since the triangular
+	// "A" matrix is actually contained within B.
+	auxinfo_t aux;
+	bli_auxinfo_set_schema_a( schema_b, &aux );
+	bli_auxinfo_set_schema_b( schema_a, &aux );
+
+	const char* b1 = b_cast;
+	      char* c1 = c_cast;
 
-INSERT_GENTFUNC_BASIC0( trsm_ru_ker_var2 )
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = 0; j < n_iter; ++j )
+	{
+		dim_t diagoffb_j = diagoffb - ( doff_t )j*NR;
+
+		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		const char* a1         = a_cast;
+		      char* c11        = c1;
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		// If the current panel of B intersects the diagonal, use a
+		// special micro-kernel that performs a fused gemm and trsm.
+		// If the current panel of B resides above the diagonal, use a
+		// regular gemm micro-kernel. Otherwise, if it is below the
+		// diagonal, it was not packed (because it is implicitly zero)
+		// and so we do nothing.
+		if ( bli_intersects_diag_n( diagoffb_j, k, NR ) )
+		{
+			// Determine the offset to and length of the panel that was packed
+			// so we can index into the corresponding location in A.
+			dim_t off_b01   = 0;
+			dim_t k_b0111   = bli_min( k, -diagoffb_j + NR );
+			dim_t k_b01     = k_b0111 - NR;
+			dim_t off_b11   = k_b01;
+
+			// Compute the addresses of the panel B01 and the triangular
+			// block B11.
+			const char* b01 = b1;
+			const char* b11 = b1 + k_b01 * PACKNR * dt_size;
+			//b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, 1 );
+
+			// Compute the panel stride for the current micro-panel.
+			inc_t ps_b_cur  = k_b0111 * PACKNR;
+				  ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
+				  ps_b_cur *= dt_size;
+
+			// Loop over the m dimension (MR rows at a time).
+			for ( dim_t i = 0; i < m_iter; ++i )
+			{
+				if ( bli_trsm_my_iter_rr( i, thread ) ){
+
+				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+				// Compute the addresses of the A10 panel and A11 block.
+				const char* a10  = a1 + off_b01 * PACKMR * dt_size;
+				const char* a11  = a1 + off_b11 * PACKMR * dt_size;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = a1;
+				//if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
+				if ( i + bli_thrinfo_num_threads(thread) >= m_iter )
+				{
+					a2 = a_cast;
+					b2 = b1 + ps_b_cur;
+					if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object. NOTE: We swap the values for A and B since the
+				// triangular "A" matrix is actually contained within B.
+				bli_auxinfo_set_next_a( b2, &aux );
+				bli_auxinfo_set_next_b( a2, &aux );
+
+				gemmtrsm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k_b01,
+				  ( void* )alpha1_cast,
+				  ( void* )b01,
+				  ( void* )b11,
+				  ( void* )a10,
+				  ( void* )a11,
+				  c11, cs_c, rs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				}
+
+				a1  += rstep_a;
+				c11 += rstep_c;
+			}
+
+			b1 += ps_b_cur;
+		}
+		else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) )
+		{
+			// Loop over the m dimension (MR rows at a time).
+			for ( dim_t i = 0; i < m_iter; ++i )
+			{
+				if ( bli_trsm_my_iter_rr( i, thread ) ){
+
+				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = a1;
+				//if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
+				if ( i + bli_thrinfo_num_threads(thread) >= m_iter )
+				{
+					a2 = a_cast;
+					b2 = b1 + cstep_b;
+					if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object. NOTE: We swap the values for A and B since the
+				// triangular "A" matrix is actually contained within B.
+				bli_auxinfo_set_next_a( b2, &aux );
+				bli_auxinfo_set_next_b( a2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )minus_one,
+				  ( void* )b1,
+				  ( void* )a1,
+				  ( void* )alpha2_cast,
+				  c11, cs_c, rs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				}
+
+				a1  += rstep_a;
+				c11 += rstep_c;
+			}
+
+			b1 += cstep_b;
+		}
+
+		c1 += cstep_c;
+	}
+}
 
diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h
index 7e747b4a8..a498e687e 100644
--- a/frame/3/trsm/bli_trsm_var.h
+++ b/frame/3/trsm/bli_trsm_var.h
@@ -43,12 +43,11 @@
 \
 void PASTEMAC0(opname) \
      ( \
-       const obj_t*  a, \
-       const obj_t*  b, \
-       const obj_t*  c, \
-       const cntx_t* cntx, \
-             rntm_t* rntm, \
-             cntl_t* cntl, \
+       const obj_t*     a, \
+       const obj_t*     b, \
+       const obj_t*     c, \
+       const cntx_t*    cntx, \
+       const cntl_t*    cntl, \
              thrinfo_t* thread  \
      );
 
@@ -63,36 +62,3 @@ GENPROT( trsm_lu_ker_var2 )
 GENPROT( trsm_rl_ker_var2 )
 GENPROT( trsm_ru_ker_var2 )
 
-
-//
-// Prototype BLAS-like interfaces with void pointer operands.
-//
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoff, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha1, \
-       void*   a, inc_t cs_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   alpha2, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     );
-
-INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 )
-INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 )
-INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 )
-INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 )
-
diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c
index a0a59c0a8..39c5372f3 100644
--- a/frame/3/trsm/bli_trsm_xx_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c
@@ -47,14 +47,12 @@ void bli_trsm_xx_ker_var2
        const obj_t*  b,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl,
+       const cntl_t* cntl,
              thrinfo_t* thread
      )
 {
-	dim_t      side;
-	dim_t      uplo;
-	l3_var_oft f;
+	dim_t side;
+	dim_t uplo;
 
 	// Set two bools: one based on the implied side parameter (the structure
 	// of the root object) and one based on the uplo field of the triangular
@@ -73,7 +71,7 @@ void bli_trsm_xx_ker_var2
 	}
 
 	// Index into the variant array to extract the correct function pointer.
-	f = vars[side][uplo];
+	l3_var_oft f = vars[side][uplo];
 
 	// Call the macrokernel.
 	f
@@ -82,7 +80,6 @@ void bli_trsm_xx_ker_var2
 	  b,
 	  c,
 	  cntx,
-	  rntm,
 	  cntl,
 	  thread
 	);
diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c
index 26da1b004..7a4d2c736 100644
--- a/frame/3/trsm/other/bli_trsm_ll_ker_var2.c
+++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c
@@ -411,7 +411,7 @@ void PASTEMAC(ch,varname) \
 					a2 = a_cast; \
 					b2 = b1; \
 					/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
-					if ( j + bli_thread_num_threads(thread) >= n_iter ) \
+					if ( j + bli_thrinfo_num_threads(thread) >= n_iter ) \
 						b2 = b_cast; \
 				} \
 \
@@ -476,7 +476,7 @@ void PASTEMAC(ch,varname) \
 					a2 = a_cast; \
 					b2 = b1; \
 					/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
-					if ( j + bli_thread_num_threads(thread) >= n_iter ) \
+					if ( j + bli_thrinfo_num_threads(thread) >= n_iter ) \
 						b2 = b_cast; \
 				} \
 \
diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c
index 607b40e54..ac4ab28b9 100644
--- a/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c
+++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c
@@ -349,8 +349,8 @@ void PASTEMAC(ch,varname) \
 	/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
 \
 	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
 \
 	dim_t jr_start, jr_end; \
 	dim_t jr_inc; \
diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c
index 3299b5f8e..7fa4bd2c0 100644
--- a/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c
+++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c
@@ -349,8 +349,8 @@ void PASTEMAC(ch,varname) \
 	/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
 \
 	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
 \
 	dim_t jr_start, jr_end; \
 	dim_t jr_inc; \
diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c
index b02ff0955..5379ac0ab 100644
--- a/frame/3/trsm/other/bli_trsm_lu_ker_var2.c
+++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c
@@ -421,7 +421,7 @@ void PASTEMAC(ch,varname) \
 					a2 = a_cast; \
 					b2 = b1; \
 					/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
-					if ( j + bli_thread_num_threads(thread) >= n_iter ) \
+					if ( j + bli_thrinfo_num_threads(thread) >= n_iter ) \
 						b2 = b_cast; \
 				} \
 \
@@ -486,7 +486,7 @@ void PASTEMAC(ch,varname) \
 					a2 = a_cast; \
 					b2 = b1; \
 					/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
-					if ( j + bli_thread_num_threads(thread) >= n_iter ) \
+					if ( j + bli_thrinfo_num_threads(thread) >= n_iter ) \
 						b2 = b_cast; \
 				} \
 \
diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c
index e78cef477..fadf3b92b 100644
--- a/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c
+++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c
@@ -357,8 +357,8 @@ void PASTEMAC(ch,varname) \
 	/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
 \
 	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
 \
 	dim_t jr_start, jr_end; \
 	dim_t jr_inc; \
diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c
index 93cac371a..106ab499e 100644
--- a/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c
+++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c
@@ -357,8 +357,8 @@ void PASTEMAC(ch,varname) \
 	/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
 \
 	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
 \
 	dim_t jr_start, jr_end; \
 	dim_t jr_inc; \
diff --git a/frame/3/trsm/other/bli_trsm_rl_ker_var2.c b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c
index 1e903c3c1..99e6d7984 100644
--- a/frame/3/trsm/other/bli_trsm_rl_ker_var2.c
+++ b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c
@@ -443,7 +443,7 @@ void PASTEMAC(ch,varname) \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1; \
 				/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
-				if ( i + bli_thread_num_threads(thread) >= m_iter ) \
+				if ( i + bli_thrinfo_num_threads(thread) >= m_iter ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1 + ps_b_cur; \
@@ -523,7 +523,7 @@ void PASTEMAC(ch,varname) \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1; \
 				/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
-				if ( i + bli_thread_num_threads(thread) >= m_iter ) \
+				if ( i + bli_thrinfo_num_threads(thread) >= m_iter ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1 + cstep_b; \
diff --git a/frame/3/trsm/other/bli_trsm_ru_ker_var2.c b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c
index a44d64f45..ebddbcd19 100644
--- a/frame/3/trsm/other/bli_trsm_ru_ker_var2.c
+++ b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c
@@ -436,7 +436,7 @@ void PASTEMAC(ch,varname) \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1; \
 				/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
-				if ( i + bli_thread_num_threads(thread) >= m_iter ) \
+				if ( i + bli_thrinfo_num_threads(thread) >= m_iter ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1 + ps_b_cur; \
@@ -516,7 +516,7 @@ void PASTEMAC(ch,varname) \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1; \
 				/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
-				if ( i + bli_thread_num_threads(thread) >= m_iter ) \
+				if ( i + bli_thrinfo_num_threads(thread) >= m_iter ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1 + cstep_b; \
diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c
index b22ddbee0..daa092ba7 100644
--- a/frame/base/bli_cntl.c
+++ b/frame/base/bli_cntl.c
@@ -37,7 +37,7 @@
 
 cntl_t* bli_cntl_create_node
      (
-       rntm_t* rntm,
+       pool_t* pool,
        opid_t  family,
        bszid_t bszid,
        void_fp var_func,
@@ -46,14 +46,13 @@ cntl_t* bli_cntl_create_node
      )
 {
 	cntl_t* cntl;
-	mem_t*  pack_mem;
 
 	#ifdef BLIS_ENABLE_MEM_TRACING
 	printf( "bli_cntl_create_node(): " );
 	#endif
 
 	// Allocate the cntl_t struct.
-	cntl = bli_sba_acquire( rntm, sizeof( cntl_t ) );
+	cntl = bli_sba_acquire( pool, sizeof( cntl_t ) );
 
 	bli_cntl_set_family( family, cntl );
 	bli_cntl_set_bszid( bszid, cntl );
@@ -62,19 +61,12 @@ cntl_t* bli_cntl_create_node
 	bli_cntl_set_sub_prenode( NULL, cntl );
 	bli_cntl_set_sub_node( sub_node, cntl );
 
-	// Query the address of the node's packed mem_t entry so we can initialize
-	// key fields (to NULL or 0).
-	// NOTE: This initialization is important, since it allows threads to
-	// discern whether blocks have been acquired from the memory allocator.
-	pack_mem = bli_cntl_pack_mem( cntl );
-	bli_mem_clear( pack_mem );
-
 	return cntl;
 }
 
 void bli_cntl_free_node
      (
-       rntm_t* rntm,
+       pool_t* pool,
        cntl_t* cntl
      )
 {
@@ -82,7 +74,7 @@ void bli_cntl_free_node
 	printf( "bli_cntl_free_node(): " );
 	#endif
 
-	bli_sba_release( rntm, cntl );
+	bli_sba_release( pool, cntl );
 }
 
 void bli_cntl_clear_node
@@ -90,39 +82,20 @@ void bli_cntl_clear_node
        cntl_t* cntl
      )
 {
-	mem_t* pack_mem;
-
 	// Clear various fields in the control tree. Clearing these fields
 	// actually is not needed, but we do it for debugging/completeness.
 	bli_cntl_set_var_func( NULL, cntl );
 	bli_cntl_set_params( NULL, cntl );
 	bli_cntl_set_sub_prenode( NULL, cntl );
 	bli_cntl_set_sub_node( NULL, cntl );
-
-	// Clearing these fields is potentially more important if the control
-	// tree is cached somewhere and reused.
-	pack_mem = bli_cntl_pack_mem( cntl );
-	bli_mem_clear( pack_mem );
 }
 
 // -----------------------------------------------------------------------------
 
 void bli_cntl_free
      (
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
-     )
-{
-	if ( thread != NULL ) bli_cntl_free_w_thrinfo( rntm, cntl, thread );
-	else                  bli_cntl_free_wo_thrinfo( rntm, cntl );
-}
-
-void bli_cntl_free_w_thrinfo
-     (
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
+       pool_t* pool,
+       cntl_t* cntl
      )
 {
 	// Base case: simply return when asked to free NULL nodes.
@@ -131,33 +104,13 @@ void bli_cntl_free_w_thrinfo
 	cntl_t* cntl_sub_prenode = bli_cntl_sub_prenode( cntl );
 	cntl_t* cntl_sub_node    = bli_cntl_sub_node( cntl );
 	void*   cntl_params      = bli_cntl_params( cntl );
-	mem_t*  cntl_pack_mem    = bli_cntl_pack_mem( cntl );
-
-	// Don't immediately dereference the prenode and subnode of the thrinfo_t
-	// node. In some cases, the thrinfo_t tree is not built out all the way,
-	// perhaps because there are more ways of parallelization than micropanels
-	// of data in this dimension, or because the problem is small enough that
-	// there is no gemm subproblem in bli_trsm_blk_var1(). Thus, we start with
-	// NULL values for these variables and only dereference the fields of the
-	// thrinfo_t struct if the thrinfo_t exists (ie: is non-NULL). We will also
-	// have to check the thrinfo_t pointer for NULLness before using it below,
-	// when checking if we need to free the pack_mem field of the cntl_t node
-	// (see below).
-	thrinfo_t* thread_sub_prenode = NULL;
-	thrinfo_t* thread_sub_node    = NULL;
-
-	if ( thread != NULL )
-	{
-		thread_sub_prenode = bli_thrinfo_sub_prenode( thread );
-		thread_sub_node    = bli_thrinfo_sub_node( thread );
-	}
 
 	// Only recurse into prenode branch if it exists.
 	if ( cntl_sub_prenode != NULL )
 	{
 		// Recursively free all memory associated with the sub-prenode and its
 		// children.
-		bli_cntl_free_w_thrinfo( rntm, cntl_sub_prenode, thread_sub_prenode );
+		bli_cntl_free( pool, cntl_sub_prenode );
 	}
 
 	// Only recurse into the child node if it exists.
@@ -165,7 +118,7 @@ void bli_cntl_free_w_thrinfo
 	{
 		// Recursively free all memory associated with the sub-node and its
 		// children.
-		bli_cntl_free_w_thrinfo( rntm, cntl_sub_node, thread_sub_node );
+		bli_cntl_free( pool, cntl_sub_node );
 	}
 
 	// Free the current node's params field, if it is non-NULL.
@@ -175,80 +128,19 @@ void bli_cntl_free_w_thrinfo
 		printf( "bli_cntl_free_w_thrinfo(): " );
 		#endif
 
-		bli_sba_release( rntm, cntl_params );
-	}
-
-	// Release the current node's pack mem_t entry back to the memory
-	// broker from which it originated, but only if the mem_t entry is
-	// allocated, and only if the current thread is chief for its group.
-	// Also note that we don't proceed with either of the above tests if
-	// the thrinfo_t pointer is NULL. (See above for background on when
-	// this can happen.)
-	if ( thread != NULL )
-	if ( bli_thread_am_ochief( thread ) )
-	if ( bli_mem_is_alloc( cntl_pack_mem ) )
-	{
-		#ifdef BLIS_ENABLE_MEM_TRACING
-		printf( "bli_cntl_free_w_thrinfo(): releasing mem pool block.\n" );
-		#endif
-
-		bli_pba_release( rntm, cntl_pack_mem );
+		bli_sba_release( pool, cntl_params );
 	}
 
 	// Free the current node.
-	bli_cntl_free_node( rntm, cntl );
-}
-
-void bli_cntl_free_wo_thrinfo
-     (
-       rntm_t* rntm,
-       cntl_t* cntl
-     )
-{
-	// Base case: simply return when asked to free NULL nodes.
-	if ( cntl == NULL ) return;
-
-	cntl_t* cntl_sub_prenode = bli_cntl_sub_prenode( cntl );
-	cntl_t* cntl_sub_node    = bli_cntl_sub_node( cntl );
-	void*   cntl_params      = bli_cntl_params( cntl );
-	mem_t*  cntl_pack_mem    = bli_cntl_pack_mem( cntl );
-
-	{
-		// Recursively free all memory associated with the sub-prenode and its
-		// children.
-		bli_cntl_free_wo_thrinfo( rntm, cntl_sub_prenode );
-	}
-
-	{
-		// Recursively free all memory associated with the sub-node and its
-		// children.
-		bli_cntl_free_wo_thrinfo( rntm, cntl_sub_node );
-	}
-
-	// Free the current node's params field, if it is non-NULL.
-	if ( cntl_params != NULL )
-	{
-		bli_sba_release( rntm, cntl_params );
-	}
-
-	// Release the current node's pack mem_t entry back to the memory
-	// broker from which it originated, but only if the mem_t entry is
-	// allocated.
-	if ( bli_mem_is_alloc( cntl_pack_mem ) )
-	{
-		bli_pba_release( rntm, cntl_pack_mem );
-	}
-
-	// Free the current node.
-	bli_cntl_free_node( rntm, cntl );
+	bli_cntl_free_node( pool, cntl );
 }
 
 // -----------------------------------------------------------------------------
 
 cntl_t* bli_cntl_copy
      (
-       rntm_t* rntm,
-       cntl_t* cntl
+             pool_t* pool,
+       const cntl_t* cntl
      )
 {
 	// Make a copy of the current node. Notice that the source node
@@ -257,7 +149,7 @@ cntl_t* bli_cntl_copy
 	// field.
 	cntl_t* cntl_copy = bli_cntl_create_node
 	(
-      rntm,
+	  pool,
 	  bli_cntl_family( cntl ),
 	  bli_cntl_bszid( cntl ),
 	  bli_cntl_var_func( cntl ),
@@ -273,7 +165,7 @@ cntl_t* bli_cntl_copy
 		// struct.
 		uint64_t params_size = bli_cntl_params_size( cntl );
 		void*    params_orig = bli_cntl_params( cntl );
-		void*    params_copy = bli_sba_acquire( rntm, ( size_t )params_size );
+		void*    params_copy = bli_sba_acquire( pool, ( size_t )params_size );
 
 		// Copy the original params struct to the new memory region.
 		memcpy( params_copy, params_orig, params_size );
@@ -288,7 +180,7 @@ cntl_t* bli_cntl_copy
 	{
 		cntl_t* sub_prenode_copy = bli_cntl_copy
 		(
-		  rntm,
+		  pool,
 		  bli_cntl_sub_prenode( cntl )
 		);
 
@@ -302,7 +194,7 @@ cntl_t* bli_cntl_copy
 	{
 		cntl_t* sub_node_copy = bli_cntl_copy
 		(
-		  rntm,
+		  pool,
 		  bli_cntl_sub_node( cntl )
 		);
 
diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h
index 406a350ee..2c1aeb603 100644
--- a/frame/base/bli_cntl.h
+++ b/frame/base/bli_cntl.h
@@ -45,14 +45,7 @@ struct cntl_s
 	void_fp        var_func;
 	struct cntl_s* sub_prenode;
 	struct cntl_s* sub_node;
-
-	// Optional fields (needed only by some operations such as packm).
-	// NOTE: first field of params must be a uint64_t containing the size
-	// of the struct.
 	void*          params;
-
-	// Internal fields that track "cached" data.
-	mem_t          pack_mem;
 };
 typedef struct cntl_s cntl_t;
 */
@@ -62,7 +55,7 @@ typedef struct cntl_s cntl_t;
 
 BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node
      (
-       rntm_t* rntm,
+       pool_t* pool,
        opid_t  family,
        bszid_t bszid,
        void_fp var_func,
@@ -72,7 +65,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node
 
 BLIS_EXPORT_BLIS void bli_cntl_free_node
      (
-       rntm_t* rntm,
+       pool_t* pool,
        cntl_t* cntl
      );
 
@@ -85,28 +78,14 @@ BLIS_EXPORT_BLIS void bli_cntl_clear_node
 
 BLIS_EXPORT_BLIS void bli_cntl_free
      (
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
-     );
-
-BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo
-     (
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
-     );
-
-BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo
-     (
-       rntm_t*    rntm,
-       cntl_t*    cntl
+       pool_t* pool,
+       cntl_t* cntl
      );
 
 BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy
      (
-       rntm_t* rntm,
-       cntl_t* cntl
+             pool_t* pool,
+       const cntl_t* cntl
      );
 
 BLIS_EXPORT_BLIS void bli_cntl_mark_family
@@ -163,11 +142,6 @@ BLIS_INLINE uint64_t bli_cntl_params_size( const cntl_t* cntl )
 	return *( ( uint64_t* )(cntl->params) );
 }
 
-BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl )
-{
-	return &(cntl->pack_mem);
-}
-
 // cntl_t query (complex)
 
 BLIS_INLINE bool bli_cntl_is_null( const cntl_t* cntl )
@@ -220,8 +194,3 @@ BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl )
 	cntl->params = params;
 }
 
-BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl )
-{
-	cntl->pack_mem = *pack_mem;
-}
-
diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h
index 827b19cfd..90050a5ed 100644
--- a/frame/base/bli_cntx.h
+++ b/frame/base/bli_cntx.h
@@ -357,8 +357,6 @@ BLIS_EXPORT_BLIS void bli_cntx_print( const cntx_t* cntx );
 
 BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... );
 
-BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... );
-
 
 #endif
 
diff --git a/frame/base/bli_mem.h b/frame/base/bli_mem.h
index c25511486..b46c0509d 100644
--- a/frame/base/bli_mem.h
+++ b/frame/base/bli_mem.h
@@ -136,13 +136,26 @@ BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem )
 // removed from the mem_t type definition. An alternative to the initializer is
 // calling bli_mem_clear() at runtime.
 
+#ifdef __cplusplus
+#define BLIS_MEM_INITIALIZER \
+        { \
+          .pblk        = BLIS_PBLK_INITIALIZER, \
+          /* When using C++, which is strongly typed, we avoid use of -1 as a
+             packbuf_t value since it will result in a compile-time error. */ \
+          .buf_type    = BLIS_BUFFER_FOR_GEN_USE, \
+          .pool        = NULL, \
+          .size        = 0, \
+        }
+#else // C99
 #define BLIS_MEM_INITIALIZER \
         { \
           .pblk        = BLIS_PBLK_INITIALIZER, \
           .buf_type    = -1, \
           .pool        = NULL, \
           .size        = 0, \
-        }  \
+        }
+#endif
+
 
 BLIS_INLINE void bli_mem_clear( mem_t* mem )
 {
diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c
index cabaf4ff6..abcf708e2 100644
--- a/frame/base/bli_pba.c
+++ b/frame/base/bli_pba.c
@@ -37,13 +37,13 @@
 #include "blis.h"
 
 // Statically initialize the mutex within the packing block allocator object.
-static pba_t pba = { .mutex = BLIS_PTHREAD_MUTEX_INITIALIZER };
+static pba_t global_pba = { .mutex = BLIS_PTHREAD_MUTEX_INITIALIZER };
 
 // -----------------------------------------------------------------------------
 
 pba_t* bli_pba_query( void )
 {
-    return &pba;
+    return &global_pba;
 }
 
 void bli_pba_init
@@ -92,17 +92,12 @@ void bli_pba_finalize
 
 void bli_pba_acquire_m
      (
-       rntm_t*   rntm,
+       pba_t*    pba,
        siz_t     req_size,
        packbuf_t buf_type,
        mem_t*    mem
      )
 {
-	pool_t* pool;
-	pblk_t* pblk;
-	dim_t   pi;
-	err_t   r_val;
-
 	// If the internal memory pools for packing block allocator are disabled,
 	// we spoof the buffer type as BLIS_BUFFER_FOR_GEN_USE to induce the
 	// immediate usage of bli_pba_malloc().
@@ -115,10 +110,6 @@ void bli_pba_acquire_m
 	#endif
 #endif
 
-	// Query the memory broker from the runtime.
-	pba_t* pba = bli_rntm_pba( rntm );
-
-
 	if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
 	{
 		malloc_ft malloc_fp  = bli_pba_malloc_fp( pba );
@@ -126,6 +117,7 @@ void bli_pba_acquire_m
 
 		// For general-use buffer requests, dynamically allocating memory
 		// is assumed to be sufficient.
+		err_t r_val;
 		void* buf = bli_fmalloc_align( malloc_fp, req_size, align_size, &r_val );
 
 		// Initialize the mem_t object with:
@@ -148,11 +140,11 @@ void bli_pba_acquire_m
 
 		// Map the requested packed buffer type to a zero-based index, which
 		// we then use to select the corresponding memory pool.
-		pi   = bli_packbuf_index( buf_type );
-		pool = bli_pba_pool( pi, pba );
+		dim_t   pi   = bli_packbuf_index( buf_type );
+		pool_t* pool = bli_pba_pool( pi, pba );
 
 		// Extract the address of the pblk_t struct within the mem_t.
-		pblk = bli_mem_pblk( mem );
+		pblk_t* pblk = bli_mem_pblk( mem );
 
 		// Acquire the mutex associated with the pba object.
 		bli_pba_lock( pba );
@@ -197,13 +189,10 @@ void bli_pba_acquire_m
 
 void bli_pba_release
      (
-       rntm_t* rntm,
-       mem_t*  mem
+       pba_t* pba,
+       mem_t* mem
      )
 {
-	// Query the memory broker from the runtime.
-	pba_t* pba = bli_rntm_pba( rntm );
-
 	// Extract the buffer type so we know what kind of memory was allocated.
 	packbuf_t buf_type = bli_mem_buf_type( mem );
 
diff --git a/frame/base/bli_pba.h b/frame/base/bli_pba.h
index dfda53090..0adde1941 100644
--- a/frame/base/bli_pba.h
+++ b/frame/base/bli_pba.h
@@ -132,7 +132,7 @@ void bli_pba_finalize
 
 void bli_pba_acquire_m
      (
-       rntm_t*   rntm,
+       pba_t*    pba,
        siz_t     req_size,
        packbuf_t buf_type,
        mem_t*    mem
@@ -140,20 +140,10 @@ void bli_pba_acquire_m
 
 void bli_pba_release
      (
-       rntm_t* rntm,
-       mem_t*  mem
+       pba_t* pba,
+       mem_t* mem
      );
 
-BLIS_INLINE void bli_pba_rntm_set_pba
-     (
-       rntm_t* rntm
-     )
-{
-	pba_t* pba = bli_pba_query();
-
-	bli_rntm_set_pba( pba, rntm );
-}
-
 siz_t bli_pba_pool_size
      (
        const pba_t*    pba,
diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h
index f6756c589..882ad1cc3 100644
--- a/frame/base/bli_rntm.h
+++ b/frame/base/bli_rntm.h
@@ -52,10 +52,6 @@ typedef struct rntm_s
 	bool      pack_a;
 	bool      pack_b;
 	bool      l3_sup;
-
-	pool_t*   sba_pool;
-	pba_t*    pba;
-
 } rntm_t;
 */
 
@@ -80,7 +76,7 @@ BLIS_INLINE dim_t bli_rntm_num_threads( const rntm_t* rntm )
 
 BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, const rntm_t* rntm )
 {
-	return rntm->thrloop[ bszid ];
+	return ( bszid == BLIS_NO_PART ? 1 : rntm->thrloop[ bszid ] );
 }
 
 BLIS_INLINE dim_t bli_rntm_jc_ways( const rntm_t* rntm )
@@ -122,20 +118,6 @@ BLIS_INLINE bool bli_rntm_l3_sup( const rntm_t* rntm )
 	return rntm->l3_sup;
 }
 
-//
-// -- rntm_t query (internal use only) -----------------------------------------
-//
-
-BLIS_INLINE pool_t* bli_rntm_sba_pool( const rntm_t* rntm )
-{
-	return rntm->sba_pool;
-}
-
-BLIS_INLINE pba_t* bli_rntm_pba( const rntm_t* rntm )
-{
-	return rntm->pba;
-}
-
 //
 // -- rntm_t modification (internal use only) ----------------------------------
 //
@@ -196,16 +178,6 @@ BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr,
 	bli_rntm_set_pr_ways_only(  1, rntm );
 }
 
-BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm )
-{
-	rntm->sba_pool = sba_pool;
-}
-
-BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm )
-{
-	rntm->pba = pba;
-}
-
 BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm )
 {
 	bli_rntm_set_num_threads_only( 1, rntm );
@@ -276,15 +248,6 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
 	bli_rntm_set_l3_sup( TRUE, rntm );
 }
 
-BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm )
-{
-	bli_rntm_set_sba_pool( NULL, rntm );
-}
-BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm )
-{
-	bli_rntm_set_pba( NULL, rntm );
-}
-
 //
 // -- rntm_t initialization ----------------------------------------------------
 //
@@ -302,8 +265,6 @@ BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm )
           .pack_a      = FALSE, \
           .pack_b      = FALSE, \
           .l3_sup      = TRUE, \
-          .sba_pool    = NULL, \
-          .pba         = NULL, \
         }  \
 
 BLIS_INLINE void bli_rntm_init( rntm_t* rntm )
@@ -317,9 +278,6 @@ BLIS_INLINE void bli_rntm_init( rntm_t* rntm )
 	bli_rntm_clear_pack_a( rntm );
 	bli_rntm_clear_pack_b( rntm );
 	bli_rntm_clear_l3_sup( rntm );
-
-	bli_rntm_clear_sba_pool( rntm );
-	bli_rntm_clear_pba( rntm );
 }
 
 //
diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c
index 776622bb4..5123c5b4b 100644
--- a/frame/base/bli_sba.c
+++ b/frame/base/bli_sba.c
@@ -57,7 +57,7 @@ void bli_sba_finalize( void )
 
 void* bli_sba_acquire
      (
-       rntm_t* rntm,
+       pool_t* pool,
        siz_t   req_size
      )
 {
@@ -65,7 +65,16 @@ void* bli_sba_acquire
 	err_t r_val;
 
 #ifdef BLIS_ENABLE_SBA_POOLS
-	if ( rntm == NULL )
+
+	// We don't expect NULL sba_pool pointers in the normal course of BLIS
+	// operation. However, there are rare instances where it is convenient
+	// to support use of bli_sba_acquire() without having to pass in a valid
+	// sba pool data structure. The case that inspired this branch was the
+	// gemm_ukr and related test modules in the BLIS testsuite. (There, it
+	// is convenient to not have to check out an array_t from the sba, and it
+	// does no harm since the malloc() happens outside of the region that
+	// would be timed.)
+	if ( pool == NULL )
 	{
 		block = bli_malloc_intl( req_size, &r_val );
 	}
@@ -73,43 +82,26 @@ void* bli_sba_acquire
 	{
 		pblk_t pblk;
 
-		// Query the small block pool from the rntm.
-		pool_t* pool = bli_rntm_sba_pool( rntm );
-
-		// We don't expect NULL sba_pool pointers in the normal course of BLIS
-		// operation. However, there are rare instances where it is convenient
-		// to support use of bli_sba_acquire() without having to pass in a valid
-		// sba pool data structure. The case that inspired this branch was the
-		// gemm_ukr and related test modules in the BLIS testsuite. (There, it
-		// is convenient to not have to checkout an array_t from the sba, and it
-		// does no harm since the malloc() happens outside of the region that
-		// would be timed.)
-		if ( pool == NULL )
-		{
-		    block = bli_malloc_intl( req_size, &r_val );
-		}
-		else
+		// Query the block_size of the pool_t so that we can request the exact
+		// size present.
+		const siz_t block_size = bli_pool_block_size( pool );
+
+		// Sanity check: Make sure the requested size is no larger than the
+		// block_size field of the pool.
+		if ( block_size < req_size )
 		{
-			// Query the block_size of the pool_t so that we can request the exact
-			// size present.
-			const siz_t block_size = bli_pool_block_size( pool );
-
-			// Sanity check: Make sure the requested size is no larger than the
-			// block_size field of the pool.
-			if ( block_size < req_size )
-			{
-				printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n",
-				        ( int )block_size, ( int )req_size );
-				bli_abort();
-			}
-
-			// Check out a block using the block_size queried above.
-			bli_pool_checkout_block( block_size, &pblk, pool );
-
-			// The block address is stored within the pblk_t.
-			block = bli_pblk_buf( &pblk );
+			printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n",
+			        ( int )block_size, ( int )req_size );
+			bli_abort();
 		}
+
+		// Check out a block using the block_size queried above.
+		bli_pool_checkout_block( block_size, &pblk, pool );
+
+		// The block address is stored within the pblk_t.
+		block = bli_pblk_buf( &pblk );
 	}
+
 #else
 
 	block = bli_malloc_intl( req_size, &r_val );
@@ -122,12 +114,13 @@ void* bli_sba_acquire
 
 void bli_sba_release
      (
-       rntm_t* rntm,
+       pool_t* pool,
        void*   block
      )
 {
 #ifdef BLIS_ENABLE_SBA_POOLS
-	if ( rntm == NULL )
+
+	if ( pool == NULL )
 	{
 		bli_free_intl( block );
 	}
@@ -135,32 +128,23 @@ void bli_sba_release
 	{
 		pblk_t pblk;
 
-		// Query the small block pool from the rntm.
-		pool_t* pool = bli_rntm_sba_pool( rntm );
-
-		if ( pool == NULL )
-		{
-		    bli_free_intl( block );
-		}
-		else
-		{
-			// Query the block_size field from the pool. This is not super-important
-			// for this particular application of the pool_t (that is, the "leaf"
-			// component of the sba), but it seems like good housekeeping to maintain
-			// the block_size field of the pblk_t in case its ever needed/read.
-			const siz_t block_size = bli_pool_block_size( pool );
-
-			// Embed the block's memory address into a pblk_t, along with the
-			// block_size queried from the pool.
-			bli_pblk_set_buf( block, &pblk );
-			bli_pblk_set_block_size( block_size, &pblk );
-
-			// Check the pblk_t back into the pool_t. (It's okay that the pblk_t is
-			// a local variable since its contents are copied into the pool's internal
-			// data structure--an array of pblk_t.)
-			bli_pool_checkin_block( &pblk, pool );
-		}
+		// Query the block_size field from the pool. This is not super-important
+		// for this particular application of the pool_t (that is, the "leaf"
+		// component of the sba), but it seems like good housekeeping to maintain
+		// the block_size field of the pblk_t in case it's ever needed/read.
+		const siz_t block_size = bli_pool_block_size( pool );
+
+		// Embed the block's memory address into a pblk_t, along with the
+		// block_size queried from the pool.
+		bli_pblk_set_buf( block, &pblk );
+		bli_pblk_set_block_size( block_size, &pblk );
+
+		// Check the pblk_t back into the pool_t. (It's okay that the pblk_t is
+		// a local variable since its contents are copied into the pool's internal
+		// data structure--an array of pblk_t.)
+		bli_pool_checkin_block( &pblk, pool );
 	}
+
 #else
 
 	bli_free_intl( block );
@@ -173,11 +157,11 @@ array_t* bli_sba_checkout_array
        const siz_t n_threads
      )
 {
-	#ifndef BLIS_ENABLE_SBA_POOLS
-	return NULL;
-	#endif
-
+#ifdef BLIS_ENABLE_SBA_POOLS
 	return bli_apool_checkout_array( n_threads, &sba );
+#else
+	return NULL;
+#endif
 }
 
 void bli_sba_checkin_array
@@ -185,30 +169,10 @@ void bli_sba_checkin_array
        array_t* array
      )
 {
-	#ifndef BLIS_ENABLE_SBA_POOLS
-	return;
-	#endif
-
+#ifdef BLIS_ENABLE_SBA_POOLS
 	bli_apool_checkin_array( array, &sba );
-}
-
-void bli_sba_rntm_set_pool
-     (
-       siz_t    index,
-       array_t* array,
-       rntm_t*  rntm
-     )
-{
-	#ifndef BLIS_ENABLE_SBA_POOLS
-	bli_rntm_set_sba_pool( NULL, rntm );
+#else
 	return;
-	#endif
-
-	// Query the pool_t* in the array_t corresponding to index.
-	pool_t* pool = bli_apool_array_elem( index, array );
-
-	// Embed the pool_t* into the rntm_t.
-	bli_rntm_set_sba_pool( pool, rntm );
+#endif
 }
 
-
diff --git a/frame/base/bli_sba.h b/frame/base/bli_sba.h
index 4fc3aaaee..8d9db844f 100644
--- a/frame/base/bli_sba.h
+++ b/frame/base/bli_sba.h
@@ -52,24 +52,17 @@ void bli_sba_checkin_array
        array_t* array
      );
 
-void bli_sba_rntm_set_pool
-     (
-       siz_t    index,
-       array_t* array,
-       rntm_t*  rntm
-     );
-
 void* bli_sba_acquire
      (
-       rntm_t* rntm,
+       pool_t* pool,
        siz_t   req_size
      );
+
 void bli_sba_release
      (
-       rntm_t* rntm,
+       pool_t* pool,
        void*   block
      );
 
-
 #endif
 
diff --git a/frame/compat/extra/bla_gemm3m.c b/frame/compat/extra/bla_gemm3m.c
index 258ac5bbb..1d124cbc2 100644
--- a/frame/compat/extra/bla_gemm3m.c
+++ b/frame/compat/extra/bla_gemm3m.c
@@ -242,8 +242,7 @@ void PASTEF77(ch,blasname) \
 		  &betao, \
 		  &co, \
 		  cntx, \
-		  rntm, \
-		  NULL  \
+		  rntm \
 		); \
 	} \
 \
diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h
index 42ad9c72b..71a6096e1 100644
--- a/frame/include/bli_extern_defs.h
+++ b/frame/include/bli_extern_defs.h
@@ -44,7 +44,5 @@ BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_ONE;
 BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_TWO;
 
 BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM;
-BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED;
-BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED;
 
 #endif
diff --git a/frame/include/bli_oapi_ex.h b/frame/include/bli_oapi_ex.h
index 7252fd7ff..b150b89fc 100644
--- a/frame/include/bli_oapi_ex.h
+++ b/frame/include/bli_oapi_ex.h
@@ -48,7 +48,7 @@
 // Define the macro to add expert arguments to function signatures
 // and prototypes.
 #undef  BLIS_OAPI_EX_PARAMS
-#define BLIS_OAPI_EX_PARAMS   , const cntx_t* cntx, rntm_t* rntm
+#define BLIS_OAPI_EX_PARAMS   , const cntx_t* cntx, const rntm_t* rntm
 
 // Define the macro to omit the expert variable declaration block, since
 // it is not needed when expert parameters are passed in through the API.
diff --git a/frame/include/bli_tapi_ex.h b/frame/include/bli_tapi_ex.h
index f12be24b8..e7665e779 100644
--- a/frame/include/bli_tapi_ex.h
+++ b/frame/include/bli_tapi_ex.h
@@ -48,7 +48,7 @@
 // Define the macro to add expert arguments to function signatures
 // and prototypes.
 #undef  BLIS_TAPI_EX_PARAMS
-#define BLIS_TAPI_EX_PARAMS   , const cntx_t* cntx, rntm_t* rntm
+#define BLIS_TAPI_EX_PARAMS   , const cntx_t* cntx, const rntm_t* rntm
 
 // Define the macro to omit the expert variable declaration block, since
 // it is not needed when expert parameters are passed in through the API.
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index d37e62f8a..0c5d11e6b 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -69,7 +69,7 @@
 // to be 32 bits, since explicit selection of 32 bits is prohibited at
 // configure-time (and explicit or automatic selection of 64 bits is fine
 // and would have had the same result).
-#if BLIS_BLAS_INT_SIZE == 64
+#if BLIS_BLAS_INT_TYPE_SIZE == 64
   #undef  BLIS_INT_TYPE_SIZE
   #define BLIS_INT_TYPE_SIZE 64
 #endif
@@ -1072,14 +1072,7 @@ struct cntl_s
 	void_fp        var_func;
 	struct cntl_s* sub_prenode;
 	struct cntl_s* sub_node;
-
-	// Optional fields (needed only by some operations such as packm).
-	// NOTE: first field of params must be a uint64_t containing the size
-	// of the struct.
 	void*          params;
-
-	// Internal fields that track "cached" data.
-	mem_t          pack_mem;
 };
 typedef struct cntl_s cntl_t;
 
@@ -1184,9 +1177,8 @@ typedef void (*obj_pack_fn_t)
       const struct obj_s*     a,
             struct obj_s*     ap,
       const struct cntx_s*    cntx,
-            struct rntm_s*    rntm,
-            struct cntl_s*    cntl,
-      const struct thrinfo_s* thread
+      const struct cntl_s*    cntl,
+            struct thrinfo_s* thread
     );
 
 typedef void (*obj_ker_fn_t)
@@ -1195,9 +1187,8 @@ typedef void (*obj_ker_fn_t)
       const struct obj_s*     b,
       const struct obj_s*     c,
       const struct cntx_s*    cntx,
-            struct rntm_s*    rntm,
-            struct cntl_s*    cntl,
-      const struct thrinfo_s* thread
+      const struct cntl_s*    cntl,
+            struct thrinfo_s* thread
     );
 
 typedef struct obj_s
@@ -1457,15 +1448,6 @@ typedef struct rntm_s
 	bool      pack_a; // enable/disable packing of left-hand matrix A.
 	bool      pack_b; // enable/disable packing of right-hand matrix B.
 	bool      l3_sup; // enable/disable small matrix handling in level-3 ops.
-
-	// "Internal" fields: these should not be exposed to the end-user.
-
-	// The small block pool, which is attached in the l3 thread decorator.
-	pool_t*   sba_pool;
-
-	// The packing block allocator, which is attached in the l3 thread decorator.
-	pba_t*    pba;
-
 } rntm_t;
 
 
diff --git a/frame/include/level0/1e/bli_copy1es.h b/frame/include/level0/1e/bli_copy1es.h
index 0d5c98175..7dc6a493a 100644
--- a/frame/include/level0/1e/bli_copy1es.h
+++ b/frame/include/level0/1e/bli_copy1es.h
@@ -41,18 +41,18 @@
 // - The first char encodes the type of x.
 // - The second char encodes the type of y.
 
-#define bli_sscopy1es( a, bri, bir ) {}
-#define bli_dscopy1es( a, bri, bir ) {}
-#define bli_cscopy1es( a, bri, bir ) {}
-#define bli_zscopy1es( a, bri, bir ) {}
-
-#define bli_sdcopy1es( a, bri, bir ) {}
-#define bli_ddcopy1es( a, bri, bir ) {}
-#define bli_cdcopy1es( a, bri, bir ) {}
-#define bli_zdcopy1es( a, bri, bir ) {}
-
-#define bli_sccopy1es( a, bri, bir ) {}
-#define bli_dccopy1es( a, bri, bir ) {}
+#define bli_sscopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+#define bli_dscopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+#define bli_cscopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+#define bli_zscopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+
+#define bli_sdcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+#define bli_ddcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+#define bli_cdcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+#define bli_zdcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+
+#define bli_sccopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+#define bli_dccopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
 #define bli_cccopy1es( a, bri, bir ) \
 { \
 	bli_cccopyris(  bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
@@ -64,8 +64,8 @@
 	bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
 }
 
-#define bli_szcopy1es( a, bri, bir ) {}
-#define bli_dzcopy1es( a, bri, bir ) {}
+#define bli_szcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+#define bli_dzcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
 #define bli_czcopy1es( a, bri, bir ) \
 { \
 	bli_czcopyris(  bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
diff --git a/frame/include/level0/1e/bli_copyj1es.h b/frame/include/level0/1e/bli_copyj1es.h
index f2139a883..25bb19d5b 100644
--- a/frame/include/level0/1e/bli_copyj1es.h
+++ b/frame/include/level0/1e/bli_copyj1es.h
@@ -41,18 +41,18 @@
 // - The first char encodes the type of x.
 // - The second char encodes the type of y.
 
-#define bli_sscopyj1es( a, bri, bir ) {}
-#define bli_dscopyj1es( a, bri, bir ) {}
-#define bli_cscopyj1es( a, bri, bir ) {}
-#define bli_zscopyj1es( a, bri, bir ) {}
-
-#define bli_sdcopyj1es( a, bri, bir ) {}
-#define bli_ddcopyj1es( a, bri, bir ) {}
-#define bli_cdcopyj1es( a, bri, bir ) {}
-#define bli_zdcopyj1es( a, bri, bir ) {}
-
-#define bli_sccopyj1es( a, bri, bir ) {}
-#define bli_dccopyj1es( a, bri, bir ) {}
+#define bli_sscopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+#define bli_dscopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+#define bli_cscopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+#define bli_zscopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+
+#define bli_sdcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+#define bli_ddcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+#define bli_cdcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+#define bli_zdcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+
+#define bli_sccopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+#define bli_dccopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
 #define bli_cccopyj1es( a, bri, bir ) \
 { \
 	bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
@@ -64,8 +64,8 @@
 	bli_zccopyris( bli_zimag(a),  bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
 }
 
-#define bli_szcopyj1es( a, bri, bir ) {}
-#define bli_dzcopyj1es( a, bri, bir ) {}
+#define bli_szcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
+#define bli_dzcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
 #define bli_czcopyj1es( a, bri, bir ) \
 { \
 	bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
diff --git a/frame/include/level0/1e/bli_scal21es.h b/frame/include/level0/1e/bli_scal21es.h
index f54f2fd01..cee7745e8 100644
--- a/frame/include/level0/1e/bli_scal21es.h
+++ b/frame/include/level0/1e/bli_scal21es.h
@@ -44,52 +44,52 @@
 
 // -- (axy) = (??s) ------------------------------------------------------------
 
-#define bli_sssscal21es( a, x, yri, yir ) {}
-#define bli_sdsscal21es( a, x, yri, yir ) {}
-#define bli_scsscal21es( a, x, yri, yir ) {}
-#define bli_szsscal21es( a, x, yri, yir ) {}
-
-#define bli_dssscal21es( a, x, yri, yir ) {}
-#define bli_ddsscal21es( a, x, yri, yir ) {}
-#define bli_dcsscal21es( a, x, yri, yir ) {}
-#define bli_dzsscal21es( a, x, yri, yir ) {}
-
-#define bli_cssscal21es( a, x, yri, yir ) {}
-#define bli_cdsscal21es( a, x, yri, yir ) {}
-#define bli_ccsscal21es( a, x, yri, yir ) {}
-#define bli_czsscal21es( a, x, yri, yir ) {}
-
-#define bli_zssscal21es( a, x, yri, yir ) {}
-#define bli_zdsscal21es( a, x, yri, yir ) {}
-#define bli_zcsscal21es( a, x, yri, yir ) {}
-#define bli_zzsscal21es( a, x, yri, yir ) {}
+#define bli_sssscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_sdsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_scsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_szsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+
+#define bli_dssscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_ddsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_dcsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_dzsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+
+#define bli_cssscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_cdsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_ccsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_czsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+
+#define bli_zssscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_zdsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_zcsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_zzsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 
 // -- (axy) = (??d) ------------------------------------------------------------
 
-#define bli_ssdscal21es( a, x, yri, yir ) {}
-#define bli_sddscal21es( a, x, yri, yir ) {}
-#define bli_scdscal21es( a, x, yri, yir ) {}
-#define bli_szdscal21es( a, x, yri, yir ) {}
+#define bli_ssdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_sddscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_scdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_szdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 
-#define bli_dsdscal21es( a, x, yri, yir ) {}
-#define bli_dddscal21es( a, x, yri, yir ) {}
-#define bli_dcdscal21es( a, x, yri, yir ) {}
-#define bli_dzdscal21es( a, x, yri, yir ) {}
+#define bli_dsdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_dddscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_dcdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_dzdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 
-#define bli_csdscal21es( a, x, yri, yir ) {}
-#define bli_cddscal21es( a, x, yri, yir ) {}
-#define bli_ccdscal21es( a, x, yri, yir ) {}
-#define bli_czdscal21es( a, x, yri, yir ) {}
+#define bli_csdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_cddscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_ccdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_czdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 
-#define bli_zsdscal21es( a, x, yri, yir ) {}
-#define bli_zddscal21es( a, x, yri, yir ) {}
-#define bli_zcdscal21es( a, x, yri, yir ) {}
-#define bli_zzdscal21es( a, x, yri, yir ) {}
+#define bli_zsdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_zddscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_zcdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_zzdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 
 // -- (axy) = (??c) ------------------------------------------------------------
 
-#define bli_sscscal21es( a, x, yri, yir ) {}
-#define bli_sdcscal21es( a, x, yri, yir ) {}
+#define bli_sscscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_sdcscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 #define bli_sccscal21es( a, x, yri, yir ) \
 { \
 	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
@@ -101,8 +101,8 @@
 	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
 }
 
-#define bli_dscscal21es( a, x, yri, yir ) {}
-#define bli_ddcscal21es( a, x, yri, yir ) {}
+#define bli_dscscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_ddcscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 #define bli_dccscal21es( a, x, yri, yir ) \
 { \
 	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
@@ -158,8 +158,8 @@
 
 // -- (axy) = (??z) ------------------------------------------------------------
 
-#define bli_sszscal21es( a, x, yri, yir ) {}
-#define bli_sdzscal21es( a, x, yri, yir ) {}
+#define bli_sszscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_sdzscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 #define bli_sczscal21es( a, x, yri, yir ) \
 { \
 	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
@@ -171,8 +171,8 @@
 	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
 }
 
-#define bli_dszscal21es( a, x, yri, yir ) {}
-#define bli_ddzscal21es( a, x, yri, yir ) {}
+#define bli_dszscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_ddzscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 #define bli_dczscal21es( a, x, yri, yir ) \
 { \
 	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
diff --git a/frame/include/level0/1e/bli_scal2j1es.h b/frame/include/level0/1e/bli_scal2j1es.h
index 741fbceed..a32c4f2e4 100644
--- a/frame/include/level0/1e/bli_scal2j1es.h
+++ b/frame/include/level0/1e/bli_scal2j1es.h
@@ -44,52 +44,52 @@
 
 // -- (axy) = (??s) ------------------------------------------------------------
 
-#define bli_sssscal2j1es( a, x, yri, yir ) {}
-#define bli_sdsscal2j1es( a, x, yri, yir ) {}
-#define bli_scsscal2j1es( a, x, yri, yir ) {}
-#define bli_szsscal2j1es( a, x, yri, yir ) {}
-
-#define bli_dssscal2j1es( a, x, yri, yir ) {}
-#define bli_ddsscal2j1es( a, x, yri, yir ) {}
-#define bli_dcsscal2j1es( a, x, yri, yir ) {}
-#define bli_dzsscal2j1es( a, x, yri, yir ) {}
-
-#define bli_cssscal2j1es( a, x, yri, yir ) {}
-#define bli_cdsscal2j1es( a, x, yri, yir ) {}
-#define bli_ccsscal2j1es( a, x, yri, yir ) {}
-#define bli_czsscal2j1es( a, x, yri, yir ) {}
-
-#define bli_zssscal2j1es( a, x, yri, yir ) {}
-#define bli_zdsscal2j1es( a, x, yri, yir ) {}
-#define bli_zcsscal2j1es( a, x, yri, yir ) {}
-#define bli_zzsscal2j1es( a, x, yri, yir ) {}
+#define bli_sssscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_sdsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_scsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_szsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+
+#define bli_dssscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_ddsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_dcsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_dzsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+
+#define bli_cssscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_cdsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_ccsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_czsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+
+#define bli_zssscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_zdsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_zcsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_zzsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 
 // -- (axy) = (??d) ------------------------------------------------------------
 
-#define bli_ssdscal2j1es( a, x, yri, yir ) {}
-#define bli_sddscal2j1es( a, x, yri, yir ) {}
-#define bli_scdscal2j1es( a, x, yri, yir ) {}
-#define bli_szdscal2j1es( a, x, yri, yir ) {}
+#define bli_ssdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_sddscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_scdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_szdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 
-#define bli_dsdscal2j1es( a, x, yri, yir ) {}
-#define bli_dddscal2j1es( a, x, yri, yir ) {}
-#define bli_dcdscal2j1es( a, x, yri, yir ) {}
-#define bli_dzdscal2j1es( a, x, yri, yir ) {}
+#define bli_dsdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_dddscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_dcdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_dzdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 
-#define bli_csdscal2j1es( a, x, yri, yir ) {}
-#define bli_cddscal2j1es( a, x, yri, yir ) {}
-#define bli_ccdscal2j1es( a, x, yri, yir ) {}
-#define bli_czdscal2j1es( a, x, yri, yir ) {}
+#define bli_csdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_cddscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_ccdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_czdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 
-#define bli_zsdscal2j1es( a, x, yri, yir ) {}
-#define bli_zddscal2j1es( a, x, yri, yir ) {}
-#define bli_zcdscal2j1es( a, x, yri, yir ) {}
-#define bli_zzdscal2j1es( a, x, yri, yir ) {}
+#define bli_zsdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_zddscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_zcdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_zzdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 
 // -- (axy) = (??c) ------------------------------------------------------------
 
-#define bli_sscscal2j1es( a, x, yri, yir ) {}
-#define bli_sdcscal2j1es( a, x, yri, yir ) {}
+#define bli_sscscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_sdcscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 #define bli_sccscal2j1es( a, x, yri, yir ) \
 { \
 	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
@@ -101,8 +101,8 @@
 	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x),  bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
 }
 
-#define bli_dscscal2j1es( a, x, yri, yir ) {}
-#define bli_ddcscal2j1es( a, x, yri, yir ) {}
+#define bli_dscscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_ddcscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 #define bli_dccscal2j1es( a, x, yri, yir ) \
 { \
 	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
@@ -158,8 +158,8 @@
 
 // -- (axy) = (??z) ------------------------------------------------------------
 
-#define bli_sszscal2j1es( a, x, yri, yir ) {}
-#define bli_sdzscal2j1es( a, x, yri, yir ) {}
+#define bli_sszscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_sdzscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 #define bli_sczscal2j1es( a, x, yri, yir ) \
 { \
 	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
@@ -171,8 +171,8 @@
 	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x),  bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
 }
 
-#define bli_dszscal2j1es( a, x, yri, yir ) {}
-#define bli_ddzscal2j1es( a, x, yri, yir ) {}
+#define bli_dszscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
+#define bli_ddzscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 #define bli_dczscal2j1es( a, x, yri, yir ) \
 { \
 	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
diff --git a/frame/include/level0/ri/bli_copyris.h b/frame/include/level0/ri/bli_copyris.h
index 8dd7b9b73..cd971587d 100644
--- a/frame/include/level0/ri/bli_copyris.h
+++ b/frame/include/level0/ri/bli_copyris.h
@@ -40,11 +40,13 @@
 #define bli_scopyris( ar, ai, br, bi ) \
 { \
 	(br) = (ar); \
+    ( void )ai; ( void )bi; \
 }
 
 #define bli_dcopyris( ar, ai, br, bi ) \
 { \
 	(br) = (ar); \
+    ( void )ai; ( void )bi; \
 }
 
 #define bli_ccopyris( ar, ai, br, bi ) \
@@ -59,23 +61,23 @@
 	(bi) = (ai); \
 }
 
-#define bli_sscopyris( ar, ai, br, bi )  bli_scopyris( ar, 0.0F, br, bi )
-#define bli_dscopyris( ar, ai, br, bi )  bli_scopyris( ar, 0.0,  br, bi )
+#define bli_sscopyris( ar, ai, br, bi )  { bli_scopyris( ar, 0.0F, br, bi ); ( void )ai; }
+#define bli_dscopyris( ar, ai, br, bi )  { bli_scopyris( ar, 0.0,  br, bi ); ( void )ai; }
 #define bli_cscopyris( ar, ai, br, bi )  bli_scopyris( ar, ai,   br, bi )
 #define bli_zscopyris( ar, ai, br, bi )  bli_scopyris( ar, ai,   br, bi )
 
-#define bli_sdcopyris( ar, ai, br, bi )  bli_dcopyris( ar, 0.0F, br, bi )
-#define bli_ddcopyris( ar, ai, br, bi )  bli_dcopyris( ar, 0.0,  br, bi )
+#define bli_sdcopyris( ar, ai, br, bi )  { bli_dcopyris( ar, 0.0F, br, bi ); ( void )ai; }
+#define bli_ddcopyris( ar, ai, br, bi )  { bli_dcopyris( ar, 0.0,  br, bi ); ( void )ai; }
 #define bli_cdcopyris( ar, ai, br, bi )  bli_dcopyris( ar, ai,   br, bi )
 #define bli_zdcopyris( ar, ai, br, bi )  bli_dcopyris( ar, ai,   br, bi )
 
-#define bli_sccopyris( ar, ai, br, bi )  bli_ccopyris( ar, 0.0F, br, bi )
-#define bli_dccopyris( ar, ai, br, bi )  bli_ccopyris( ar, 0.0,  br, bi )
+#define bli_sccopyris( ar, ai, br, bi )  { bli_ccopyris( ar, 0.0F, br, bi ); ( void )ai; }
+#define bli_dccopyris( ar, ai, br, bi )  { bli_ccopyris( ar, 0.0,  br, bi ); ( void )ai; }
 #define bli_cccopyris( ar, ai, br, bi )  bli_ccopyris( ar, ai,   br, bi )
 #define bli_zccopyris( ar, ai, br, bi )  bli_ccopyris( ar, ai,   br, bi )
 
-#define bli_szcopyris( ar, ai, br, bi )  bli_zcopyris( ar, 0.0F, br, bi )
-#define bli_dzcopyris( ar, ai, br, bi )  bli_zcopyris( ar, 0.0,  br, bi )
+#define bli_szcopyris( ar, ai, br, bi )  { bli_zcopyris( ar, 0.0F, br, bi ); ( void )ai; }
+#define bli_dzcopyris( ar, ai, br, bi )  { bli_zcopyris( ar, 0.0,  br, bi ); ( void )ai; }
 #define bli_czcopyris( ar, ai, br, bi )  bli_zcopyris( ar, ai,   br, bi )
 #define bli_zzcopyris( ar, ai, br, bi )  bli_zcopyris( ar, ai,   br, bi )
 
diff --git a/frame/include/level0/ri/bli_scal2jris.h b/frame/include/level0/ri/bli_scal2jris.h
index 9e99e583d..f3b71ed2e 100644
--- a/frame/include/level0/ri/bli_scal2jris.h
+++ b/frame/include/level0/ri/bli_scal2jris.h
@@ -40,6 +40,7 @@
 #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \
 { \
 	(yr) = (ar) * (xr); \
+    ( void )ai; ( void )xi; ( void )yi; \
 }
 
 #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \
@@ -51,18 +52,21 @@
 #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \
 { \
 	(yr) = (ar) * (xr) + (ai) * (xi); \
+    ( void )yi; \
 }
 
 #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \
 { \
 	(yr) = (ar) *  (xr); \
 	(yi) = (ar) * -(xi); \
+    ( void )ai; \
 }
 
 #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \
 { \
 	(yr) = (ar) * (xr); \
 	(yi) = (ai) * (xr); \
+    ( void )xi; \
 }
 
 // Notes:
diff --git a/frame/include/level0/ri/bli_scal2ris.h b/frame/include/level0/ri/bli_scal2ris.h
index 45e0ce427..e30fd9789 100644
--- a/frame/include/level0/ri/bli_scal2ris.h
+++ b/frame/include/level0/ri/bli_scal2ris.h
@@ -40,6 +40,7 @@
 #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \
 { \
 	(yr) = (ar) * (xr); \
+    ( void )ai; ( void )xi; ( void )yi; \
 }
 
 #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \
@@ -51,18 +52,21 @@
 #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \
 { \
 	(yr) = (ar) * (xr) - (ai) * (xi); \
+    ( void )yi; \
 }
 
 #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \
 { \
 	(yr) = (ar) * (xr); \
 	(yi) = (ar) * (xi); \
+    ( void )ai; \
 }
 
 #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \
 { \
 	(yr) = (ar) * (xr); \
 	(yi) = (ai) * (xr); \
+    ( void )xi; \
 }
 
 // Notes:
diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c
deleted file mode 100644
index 890c174cf..000000000
--- a/frame/thread/bli_l3_decor_openmp.c
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#ifdef BLIS_ENABLE_OPENMP
-
-//#define PRINT_THRINFO
-//#define PRINT_IMPL
-
-void bli_l3_thread_decorator_openmp
-     (
-             l3int_ft func,
-             opid_t   family,
-       const obj_t*   alpha,
-       const obj_t*   a,
-       const obj_t*   b,
-       const obj_t*   beta,
-       const obj_t*   c,
-       const cntx_t*  cntx,
-             rntm_t*  rntm,
-             cntl_t*  cntl
-     )
-{
-	// Query the total number of threads from the rntm_t object.
-	const dim_t n_threads = bli_rntm_num_threads( rntm );
-
-#ifdef PRINT_IMPL
-	const timpl_t ti = bli_rntm_thread_impl( rntm );
-	printf( "l3_decor_openmp: l3 decor with rntm.thread_impl  = %s\n",
-	        ( ti == BLIS_SINGLE ? "single" :
-	        ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
-#endif
-
-	#ifdef PRINT_THRINFO
-	err_t r_val;
-	thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ), &r_val );
-	#endif
-
-	// NOTE: The sba was initialized in bli_init().
-
-	// Check out an array_t from the small block allocator. This is done
-	// with an internal lock to ensure only one application thread accesses
-	// the sba at a time. bli_sba_checkout_array() will also automatically
-	// resize the array_t, if necessary.
-	array_t* array = bli_sba_checkout_array( n_threads );
-
-	// Access the pool_t* for thread 0 and embed it into the rntm. We do
-	// this up-front only so that we have the rntm_t.sba_pool field
-	// initialized and ready for the global communicator creation below.
-	bli_sba_rntm_set_pool( 0, array, rntm );
-
-	// Set the packing block allocator field of the rntm. This will be
-	// inherited by all of the child threads when they make local copies of
-	// the rntm below.
-	bli_pba_rntm_set_pba( rntm );
-
-	// Allocate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
-
-
-	_Pragma( "omp parallel num_threads(n_threads)" )
-	{
-		// Create a thread-local copy of the master thread's rntm_t. This is
-		// necessary since we want each thread to be able to track its own
-		// small block pool_t as it executes down the function stack.
-		rntm_t  rntm_l = *rntm;
-		rntm_t* rntm_p = &rntm_l;
-
-		// Query the thread's id from OpenMP.
-		const dim_t tid = omp_get_thread_num();
-
-		// Check for a somewhat obscure OpenMP thread-mistmatch issue.
-		bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
-
-		// Use the thread id to access the appropriate pool_t* within the
-		// array_t, and use it to set the sba_pool field within the rntm_t.
-		// If the pool_t* element within the array_t is NULL, it will first
-		// be allocated/initialized.
-		bli_sba_rntm_set_pool( tid, array, rntm_p );
-
-		obj_t      a_t, b_t, c_t;
-		cntl_t*    cntl_use;
-		thrinfo_t* thread;
-
-		// Alias thread-local copies of A, B, and C. These will be the objects
-		// we pass down the algorithmic function stack. Making thread-local
-		// aliases is highly recommended in case a thread needs to change any
-		// of the properties of an object without affecting other threads'
-		// objects.
-		bli_obj_alias_to( a, &a_t );
-		bli_obj_alias_to( b, &b_t );
-		bli_obj_alias_to( c, &c_t );
-
-		// This is part of a hack to support mixed domain in bli_gemm_front().
-		// Sometimes we need to specify a non-standard schema for A and B, and
-		// we decided to transmit them via the schema field in the obj_t's
-		// rather than pass them in as function parameters. Once the values
-		// have been read, we immediately reset them back to their expected
-		// values for unpacked objects.
-		pack_t schema_a = bli_obj_pack_schema( &a_t );
-		pack_t schema_b = bli_obj_pack_schema( &b_t );
-		bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t );
-		bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t );
-
-		// Create a default control tree for the operation, if needed.
-		bli_l3_cntl_create_if( family, schema_a, schema_b,
-		                       &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
-
-		// Create the root node of the current thread's thrinfo_t structure.
-		bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
-
-#if 1
-		func
-		(
-		  alpha,
-		  &a_t,
-		  &b_t,
-		  beta,
-		  &c_t,
-		  cntx,
-		  rntm_p,
-		  cntl_use,
-		  thread
-		);
-#else
-		bli_thrinfo_grow_tree
-		(
-		  rntm_p,
-		  cntl_use,
-		  thread
-		);
-#endif
-
-		// Free the thread's local control tree.
-		bli_l3_cntl_free( rntm_p, cntl_use, thread );
-
-		#ifdef PRINT_THRINFO
-		threads[tid] = thread;
-		#else
-		// Free the current thread's thrinfo_t structure.
-		bli_l3_thrinfo_free( rntm_p, thread );
-		#endif
-	}
-
-	// We shouldn't free the global communicator since it was already freed
-	// by the global communicator's chief thread in bli_l3_thrinfo_free()
-	// (called above).
-
-	#ifdef PRINT_THRINFO
-	if ( family != BLIS_TRSM ) bli_l3_thrinfo_print_gemm_paths( threads );
-	else                       bli_l3_thrinfo_print_trsm_paths( threads );
-	exit(1);
-	#endif
-
-	// Check the array_t back into the small block allocator. Similar to the
-	// check-out, this is done using a lock embedded within the sba to ensure
-	// mutual exclusion.
-	bli_sba_checkin_array( array );
-}
-
-// -----------------------------------------------------------------------------
-
-void bli_l3_thread_decorator_thread_check
-     (
-       dim_t      n_threads,
-       dim_t      tid,
-       thrcomm_t* gl_comm,
-       rntm_t*    rntm
-     )
-{
-	dim_t n_threads_real = omp_get_num_threads();
-
-	// Check if the number of OpenMP threads created within this parallel
-	// region is different from the number of threads that were requested
-	// of BLIS. This inequality may trigger when, for example, the
-	// following conditions are satisfied:
-	// - an application is executing an OpenMP parallel region in which
-	//   BLIS is invoked,
-	// - BLIS is configured for multithreading via OpenMP,
-	// - OMP_NUM_THREADS = t > 1,
-	// - the number of threads requested of BLIS (regardless of method)
-	//   is p <= t,
-	// - OpenMP nesting is disabled.
-	// In this situation, the application spawns t threads. Each application
-	// thread calls gemm (for example). Each gemm will attempt to spawn p
-	// threads via OpenMP. However, since nesting is disabled, the OpenMP
-	// implementation finds that t >= p threads are already spawned, and
-	// thus it doesn't spawn *any* additional threads for each gemm.
-	if ( n_threads_real != n_threads )
-	{
-		// If the number of threads active in the current region is not
-		// equal to the number requested of BLIS, we then only continue
-		// if the number of threads in the current region is 1. If, for
-		// example, BLIS requested 4 threads but only got 3, then we
-		// abort().
-		//if ( tid == 0 )
-		//{
-			if ( n_threads_real != 1 )
-			{
-				bli_print_msg( "A different number of threads was "
-				               "created than was requested.",
-				               __FILE__, __LINE__ );
-				bli_abort();
-			}
-
-			const timpl_t ti = bli_rntm_thread_impl( rntm );
-
-			//n_threads = 1; // not needed since it has no effect?
-			bli_thrcomm_init( ti, 1, gl_comm );
-			bli_rntm_set_num_threads_only( 1, rntm );
-			bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );
-		//}
-
-		// Synchronize all threads and continue.
-		_Pragma( "omp barrier" )
-	}
-}
-
-#endif
-
diff --git a/frame/thread/bli_l3_decor_pthreads.c b/frame/thread/bli_l3_decor_pthreads.c
deleted file mode 100644
index d31414d3b..000000000
--- a/frame/thread/bli_l3_decor_pthreads.c
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#ifdef BLIS_ENABLE_PTHREADS
-
-// A data structure to assist in passing operands to additional threads.
-typedef struct thread_data
-{
-	      l3int_ft   func;
-	      opid_t     family;
-	const obj_t*     alpha;
-	const obj_t*     a;
-	const obj_t*     b;
-	const obj_t*     beta;
-	const obj_t*     c;
-	const cntx_t*    cntx;
-	      rntm_t*    rntm;
-	      cntl_t*    cntl;
-	      dim_t      tid;
-	      thrcomm_t* gl_comm;
-	      array_t*   array;
-} thread_data_t;
-
-// Entry point for additional threads
-void* bli_l3_thread_entry( void* data_void )
-{
-	const thread_data_t* data     = data_void;
-
-	const l3int_ft       func     = data->func;
-	const opid_t         family   = data->family;
-	const obj_t*         alpha    = data->alpha;
-	const obj_t*         a        = data->a;
-	const obj_t*         b        = data->b;
-	const obj_t*         beta     = data->beta;
-	const obj_t*         c        = data->c;
-	const cntx_t*        cntx     = data->cntx;
-	      rntm_t*        rntm     = data->rntm;
-	      cntl_t*        cntl     = data->cntl;
-	const dim_t          tid      = data->tid;
-	      array_t*       array    = data->array;
-	      thrcomm_t*     gl_comm  = data->gl_comm;
-
-	// Create a thread-local copy of the master thread's rntm_t. This is
-	// necessary since we want each thread to be able to track its own
-	// small block pool_t as it executes down the function stack.
-	rntm_t  rntm_l = *rntm;
-	rntm_t* rntm_p = &rntm_l;
-
-	// Use the thread id to access the appropriate pool_t* within the
-	// array_t, and use it to set the sba_pool field within the rntm_t.
-	// If the pool_t* element within the array_t is NULL, it will first
-	// be allocated/initialized.
-	bli_sba_rntm_set_pool( tid, array, rntm_p );
-
-	obj_t      a_t, b_t, c_t;
-	cntl_t*    cntl_use;
-	thrinfo_t* thread;
-
-	// Alias thread-local copies of A, B, and C. These will be the objects
-	// we pass down the algorithmic function stack. Making thread-local
-	// aliases is highly recommended in case a thread needs to change any
-	// of the properties of an object without affecting other threads'
-	// objects.
-	bli_obj_alias_to( a, &a_t );
-	bli_obj_alias_to( b, &b_t );
-	bli_obj_alias_to( c, &c_t );
-
-	// This is part of a hack to support mixed domain in bli_gemm_front().
-	// Sometimes we need to specify a non-standard schema for A and B, and
-	// we decided to transmit them via the schema field in the obj_t's
-	// rather than pass them in as function parameters. Once the values
-	// have been read, we immediately reset them back to their expected
-	// values for unpacked objects.
-	pack_t schema_a = bli_obj_pack_schema( &a_t );
-	pack_t schema_b = bli_obj_pack_schema( &b_t );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t );
-
-	// Create a default control tree for the operation, if needed.
-	bli_l3_cntl_create_if( family, schema_a, schema_b,
-	                       &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
-
-	// Create the root node of the current thread's thrinfo_t structure.
-	bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
-
-	func
-	(
-	  alpha,
-	  &a_t,
-	  &b_t,
-	  beta,
-	  &c_t,
-	  cntx,
-	  rntm_p,
-	  cntl_use,
-	  thread
-	);
-
-	// Free the thread's local control tree.
-	bli_l3_cntl_free( rntm_p, cntl_use, thread );
-
-	// Free the current thread's thrinfo_t structure.
-	bli_l3_thrinfo_free( rntm_p, thread );
-
-	return NULL;
-}
-
-//#define PRINT_IMPL
-
-void bli_l3_thread_decorator_pthreads
-     (
-             l3int_ft func,
-             opid_t   family,
-       const obj_t*   alpha,
-       const obj_t*   a,
-       const obj_t*   b,
-       const obj_t*   beta,
-       const obj_t*   c,
-       const cntx_t*  cntx,
-             rntm_t*  rntm,
-             cntl_t*  cntl
-     )
-{
-	err_t r_val;
-
-	// Query the total number of threads from the rntm_t object.
-	const dim_t n_threads = bli_rntm_num_threads( rntm );
-
-#ifdef PRINT_IMPL
-	const timpl_t ti = bli_rntm_thread_impl( rntm );
-	printf( "l3_decor_pthrea: l3 decor with rntm.thread_impl  = %s\n",
-	        ( ti == BLIS_SINGLE ? "single" :
-	        ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
-#endif
-
-	// NOTE: The sba was initialized in bli_init().
-
-	// Check out an array_t from the small block allocator. This is done
-	// with an internal lock to ensure only one application thread accesses
-	// the sba at a time. bli_sba_checkout_array() will also automatically
-	// resize the array_t, if necessary.
-	array_t* array = bli_sba_checkout_array( n_threads );
-
-	// Access the pool_t* for thread 0 and embed it into the rntm. We do
-	// this up-front only so that we have the rntm_t.sba_pool field
-	// initialized and ready for the global communicator creation below.
-	bli_sba_rntm_set_pool( 0, array, rntm );
-
-	// Set the packing block allocator field of the rntm. This will be
-	// inherited by all of the child threads when they make local copies of
-	// the rntm below.
-	bli_pba_rntm_set_pba( rntm );
-
-	// Allocate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
-
-	// Allocate an array of pthread objects and auxiliary data structs to pass
-	// to the thread entry functions.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_l3_thread_decorator().pth: " );
-	#endif
-	bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_l3_thread_decorator().pth: " );
-	#endif
-	thread_data_t* datas    = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val );
-
-	// NOTE: We must iterate backwards so that the chief thread (thread id 0)
-	// can spawn all other threads before proceeding with its own computation.
-	for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
-	{
-		// Set up thread data for additional threads (beyond thread 0).
-		datas[tid].func     = func;
-		datas[tid].family   = family;
-		datas[tid].alpha    = alpha;
-		datas[tid].a        = a;
-		datas[tid].b        = b;
-		datas[tid].beta     = beta;
-		datas[tid].c        = c;
-		datas[tid].cntx     = cntx;
-		datas[tid].rntm     = rntm;
-		datas[tid].cntl     = cntl;
-		datas[tid].tid      = tid;
-		datas[tid].gl_comm  = gl_comm;
-		datas[tid].array    = array;
-
-		// Spawn additional threads for ids greater than 1.
-		if ( tid != 0 )
-			bli_pthread_create( &pthreads[tid], NULL, &bli_l3_thread_entry, &datas[tid] );
-		else
-			bli_l3_thread_entry( ( void* )(&datas[0]) );
-	}
-
-	// We shouldn't free the global communicator since it was already freed
-	// by the global communicator's chief thread in bli_l3_thrinfo_free()
-	// (called from the thread entry function).
-
-	// Thread 0 waits for additional threads to finish.
-	for ( dim_t tid = 1; tid < n_threads; tid++ )
-	{
-		bli_pthread_join( pthreads[tid], NULL );
-	}
-
-	// Check the array_t back into the small block allocator. Similar to the
-	// check-out, this is done using a lock embedded within the sba to ensure
-	// mutual exclusion.
-	bli_sba_checkin_array( array );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_l3_thread_decorator().pth: " );
-	#endif
-	bli_free_intl( pthreads );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_l3_thread_decorator().pth: " );
-	#endif
-	bli_free_intl( datas );
-}
-
-#else
-
-// Define a dummy function bli_l3_thread_entry(), which is needed for
-// consistent dynamic linking behavior when building shared objects in Linux
-// or OSX, or Windows DLLs; otherwise, we risk having an unresolved symbol.
-void* bli_l3_thread_entry( void* data_void ) { return NULL; }
-
-#endif
-
diff --git a/frame/thread/bli_l3_decor_pthreads.h b/frame/thread/bli_l3_decor_pthreads.h
deleted file mode 100644
index edf36cf6e..000000000
--- a/frame/thread/bli_l3_decor_pthreads.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_L3_DECOR_PTHREADS_H
-#define BLIS_L3_DECOR_PTHREADS_H
-
-// Definitions specific to situations when POSIX multithreading is enabled.
-#ifdef BLIS_ENABLE_PTHREADS
-
-// Thread entry point prototype.
-void* bli_l3_thread_entry( void* data_void );
-
-void bli_l3_thread_decorator_pthreads
-     (
-             l3int_ft func,
-             opid_t   family,
-       const obj_t*   alpha,
-       const obj_t*   a,
-       const obj_t*   b,
-       const obj_t*   beta,
-       const obj_t*   c,
-       const cntx_t*  cntx,
-             rntm_t*  rntm,
-             cntl_t*  cntl
-     );
-
-#endif
-
-#endif
-
diff --git a/frame/thread/bli_l3_decor_single.c b/frame/thread/bli_l3_decor_single.c
deleted file mode 100644
index 6f0f8603b..000000000
--- a/frame/thread/bli_l3_decor_single.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-//#define PRINT_IMPL
-
-void bli_l3_thread_decorator_single
-     (
-             l3int_ft func,
-             opid_t   family,
-       const obj_t*   alpha,
-       const obj_t*   a,
-       const obj_t*   b,
-       const obj_t*   beta,
-       const obj_t*   c,
-       const cntx_t*  cntx,
-             rntm_t*  rntm,
-             cntl_t*  cntl
-     )
-{
-	// For sequential execution, we use only one thread.
-	const dim_t n_threads = 1;
-
-#ifdef PRINT_IMPL
-	const timpl_t ti = bli_rntm_thread_impl( rntm );
-	printf( "l3_decor_single: l3 decor with rntm.thread_impl  = %s\n",
-	        ( ti == BLIS_SINGLE ? "single" :
-	        ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
-#endif
-
-	obj_t a_t, b_t;
-	bli_obj_alias_to( a, &a_t );
-	bli_obj_alias_to( b, &b_t );
-
-	// This is part of a hack to support mixed domain in bli_gemm_front().
-	// Sometimes we need to specify a non-standard schema for A and B, and
-	// we decided to transmit them via the schema field in the obj_t's
-	// rather than pass them in as function parameters. Once the values
-	// have been read, we immediately reset them back to their expected
-	// values for unpacked objects.
-	pack_t schema_a = bli_obj_pack_schema( &a_t );
-	pack_t schema_b = bli_obj_pack_schema( &b_t );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t );
-
-	// NOTE: The sba was initialized in bli_init().
-
-	// Check out an array_t from the small block allocator. This is done
-	// with an internal lock to ensure only one application thread accesses
-	// the sba at a time. bli_sba_checkout_array() will also automatically
-	// resize the array_t, if necessary.
-	array_t* array = bli_sba_checkout_array( n_threads );
-
-	// Access the pool_t* for thread 0 and embed it into the rntm. We do
-	// this up-front only so that we can create the global comm below.
-	bli_sba_rntm_set_pool( 0, array, rntm );
-
-	// Set the packing block allocator field of the rntm.
-	bli_pba_rntm_set_pba( rntm );
-
-	// Allcoate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
-#if 0
-	timpl_t ti2 = bli_rntm_thread_impl( rntm );
-	printf( "l3_decor_single: created thrcomm_t.ti            = %s\n",
-	        ( ti2 == BLIS_SINGLE ? "single" :
-	        ( ti2 == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
-#endif
-
-
-	{
-		// NOTE: We don't need to create another copy of the rntm_t since
-		// it was already copied in one of the high-level oapi functions.
-		rntm_t* rntm_p = rntm;
-
-		cntl_t*    cntl_use;
-		thrinfo_t* thread;
-
-		const dim_t tid = 0;
-
-		// Use the thread id to access the appropriate pool_t* within the
-		// array_t, and use it to set the sba_pool field within the rntm_t.
-		// If the pool_t* element within the array_t is NULL, it will first
-		// be allocated/initialized.
-		// NOTE: This is commented out because, in the single-threaded case,
-		// this is redundant since it's already been done above.
-		//bli_sba_rntm_set_pool( tid, array, rntm_p );
-
-		// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
-		// need to alias objects for A, B, and C since they were already aliased
-		// in bli_*_front(). However, we may add aliasing here in the future so
-		// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
-		// consistently providing local aliases, we can then eliminate aliasing
-		// elsewhere.
-
-		// Create a default control tree for the operation, if needed.
-		bli_l3_cntl_create_if( family, schema_a, schema_b,
-		                       &a_t, &b_t, c, rntm_p, cntl, &cntl_use );
-
-		// Create the root node of the thread's thrinfo_t structure.
-		bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
-
-		func
-		(
-		  alpha,
-		  &a_t,
-		  &b_t,
-		  beta,
-		  c,
-		  cntx,
-		  rntm_p,
-		  cntl_use,
-		  thread
-		);
-
-		// Free the thread's local control tree.
-		bli_l3_cntl_free( rntm_p, cntl_use, thread );
-
-		// Free the current thread's thrinfo_t structure.
-		bli_l3_thrinfo_free( rntm_p, thread );
-	}
-
-	// We shouldn't free the global communicator since it was already freed
-	// by the global communicator's chief thread in bli_l3_thrinfo_free()
-	// (called above).
-
-	// Check the array_t back into the small block allocator. Similar to the
-	// check-out, this is done using a lock embedded within the sba to ensure
-	// mutual exclusion.
-	bli_sba_checkin_array( array );
-}
-
diff --git a/frame/thread/bli_l3_sup_decor_openmp.c b/frame/thread/bli_l3_sup_decor_openmp.c
deleted file mode 100644
index 7d06ad622..000000000
--- a/frame/thread/bli_l3_sup_decor_openmp.c
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#ifdef BLIS_ENABLE_OPENMP
-
-//#define PRINT_THRINFO
-
-err_t bli_l3_sup_thread_decorator_openmp
-     (
-             l3supint_ft func,
-             opid_t      family,
-       const obj_t*      alpha,
-       const obj_t*      a,
-       const obj_t*      b,
-       const obj_t*      beta,
-       const obj_t*      c,
-       const cntx_t*     cntx,
-             rntm_t*     rntm
-     )
-{
-	// Query the total number of threads from the rntm_t object.
-	const dim_t n_threads = bli_rntm_num_threads( rntm );
-
-	// NOTE: The sba was initialized in bli_init().
-
-	// Check out an array_t from the small block allocator. This is done
-	// with an internal lock to ensure only one application thread accesses
-	// the sba at a time. bli_sba_checkout_array() will also automatically
-	// resize the array_t, if necessary.
-	array_t* array = bli_sba_checkout_array( n_threads );
-
-	// Access the pool_t* for thread 0 and embed it into the rntm. We do
-	// this up-front only so that we have the rntm_t.sba_pool field
-	// initialized and ready for the global communicator creation below.
-	bli_sba_rntm_set_pool( 0, array, rntm );
-
-	// Set the packing block allocator field of the rntm. This will be
-	// inherited by all of the child threads when they make local copies of
-	// the rntm below.
-	bli_pba_rntm_set_pba( rntm );
-
-	// Allcoate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
-
-
-	_Pragma( "omp parallel num_threads(n_threads)" )
-	{
-		// Create a thread-local copy of the master thread's rntm_t. This is
-		// necessary since we want each thread to be able to track its own
-		// small block pool_t as it executes down the function stack.
-		rntm_t  rntm_l = *rntm;
-		rntm_t* rntm_p = &rntm_l;
-
-		// Query the thread's id from OpenMP.
-		const dim_t tid = omp_get_thread_num();
-
-		// Check for a somewhat obscure OpenMP thread-mistmatch issue.
-		// NOTE: This calls the same function used for the conventional/large
-		// code path.
-		bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
-
-		// Use the thread id to access the appropriate pool_t* within the
-		// array_t, and use it to set the sba_pool field within the rntm_t.
-		// If the pool_t* element within the array_t is NULL, it will first
-		// be allocated/initialized.
-		bli_sba_rntm_set_pool( tid, array, rntm_p );
-
-		thrinfo_t* thread = NULL;
-
-		// Create the root node of the thread's thrinfo_t structure.
-		bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
-
-		func
-		(
-		  alpha,
-		  a,
-		  b,
-		  beta,
-		  c,
-		  cntx,
-		  rntm_p,
-		  thread
-		);
-
-		// Free the current thread's thrinfo_t structure.
-		bli_l3_sup_thrinfo_free( rntm_p, thread );
-	}
-
-	// We shouldn't free the global communicator since it was already freed
-	// by the global communicator's chief thread in bli_l3_thrinfo_free()
-	// (called from the thread entry function).
-
-	// Check the array_t back into the small block allocator. Similar to the
-	// check-out, this is done using a lock embedded within the sba to ensure
-	// mutual exclusion.
-	bli_sba_checkin_array( array );
-
-	return BLIS_SUCCESS;
-}
-
-#endif
-
diff --git a/frame/thread/bli_l3_sup_decor_pthreads.c b/frame/thread/bli_l3_sup_decor_pthreads.c
deleted file mode 100644
index 7be5cf8fb..000000000
--- a/frame/thread/bli_l3_sup_decor_pthreads.c
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#ifdef BLIS_ENABLE_PTHREADS
-
-// A data structure to assist in passing operands to additional threads.
-typedef struct thread_data
-{
-	      l3supint_ft func;
-	      opid_t      family;
-	const obj_t*      alpha;
-	const obj_t*      a;
-	const obj_t*      b;
-	const obj_t*      beta;
-	const obj_t*      c;
-	const cntx_t*     cntx;
-	      rntm_t*     rntm;
-	      dim_t       tid;
-	      thrcomm_t*  gl_comm;
-	      array_t*    array;
-} thread_data_t;
-
-// Entry point for additional threads
-void* bli_l3_sup_thread_entry( void* data_void )
-{
-	thread_data_t* data     = data_void;
-
-	      l3supint_ft    func     = data->func;
-	      opid_t         family   = data->family;
-	const obj_t*         alpha    = data->alpha;
-	const obj_t*         a        = data->a;
-	const obj_t*         b        = data->b;
-	const obj_t*         beta     = data->beta;
-	const obj_t*         c        = data->c;
-	const cntx_t*        cntx     = data->cntx;
-	      rntm_t*        rntm     = data->rntm;
-	      dim_t          tid      = data->tid;
-	      array_t*       array    = data->array;
-	      thrcomm_t*     gl_comm  = data->gl_comm;
-
-	( void )family;
-
-	// Create a thread-local copy of the master thread's rntm_t. This is
-	// necessary since we want each thread to be able to track its own
-	// small block pool_t as it executes down the function stack.
-	rntm_t  rntm_l = *rntm;
-	rntm_t* rntm_p = &rntm_l;
-
-	// Use the thread id to access the appropriate pool_t* within the
-	// array_t, and use it to set the sba_pool field within the rntm_t.
-	// If the pool_t* element within the array_t is NULL, it will first
-	// be allocated/initialized.
-	bli_sba_rntm_set_pool( tid, array, rntm_p );
-
-	thrinfo_t* thread = NULL;
-
-	// Create the root node of the current thread's thrinfo_t structure.
-	bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
-
-	func
-	(
-	  alpha,
-	  a,
-	  b,
-	  beta,
-	  c,
-	  cntx,
-	  rntm_p,
-	  thread
-	);
-
-	// Free the current thread's thrinfo_t structure.
-	bli_l3_sup_thrinfo_free( rntm_p, thread );
-
-	return NULL;
-}
-
-err_t bli_l3_sup_thread_decorator_pthreads
-     (
-             l3supint_ft func,
-             opid_t      family,
-       const obj_t*      alpha,
-       const obj_t*      a,
-       const obj_t*      b,
-       const obj_t*      beta,
-       const obj_t*      c,
-       const cntx_t*     cntx,
-             rntm_t*     rntm
-     )
-{
-	err_t r_val;
-
-	// Query the total number of threads from the context.
-	const dim_t n_threads = bli_rntm_num_threads( rntm );
-
-	// NOTE: The sba was initialized in bli_init().
-
-	// Check out an array_t from the small block allocator. This is done
-	// with an internal lock to ensure only one application thread accesses
-	// the sba at a time. bli_sba_checkout_array() will also automatically
-	// resize the array_t, if necessary.
-	array_t* array = bli_sba_checkout_array( n_threads );
-
-	// Access the pool_t* for thread 0 and embed it into the rntm. We do
-	// this up-front only so that we have the rntm_t.sba_pool field
-	// initialized and ready for the global communicator creation below.
-	bli_sba_rntm_set_pool( 0, array, rntm );
-
-	// Set the packing block allocator field of the rntm. This will be
-	// inherited by all of the child threads when they make local copies of
-	// the rntm below.
-	bli_pba_rntm_set_pba( rntm );
-
-	// Allocate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
-
-	// Allocate an array of pthread objects and auxiliary data structs to pass
-	// to the thread entry functions.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_l3_thread_decorator().pth: " );
-	#endif
-	bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_l3_thread_decorator().pth: " );
-	#endif
-	thread_data_t* datas    = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val );
-
-	// NOTE: We must iterate backwards so that the chief thread (thread id 0)
-	// can spawn all other threads before proceeding with its own computation.
-	for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
-	{
-		// Set up thread data for additional threads (beyond thread 0).
-		datas[tid].func     = func;
-		datas[tid].family   = family;
-		datas[tid].alpha    = alpha;
-		datas[tid].a        = a;
-		datas[tid].b        = b;
-		datas[tid].beta     = beta;
-		datas[tid].c        = c;
-		datas[tid].cntx     = cntx;
-		datas[tid].rntm     = rntm;
-		datas[tid].tid      = tid;
-		datas[tid].gl_comm  = gl_comm;
-		datas[tid].array    = array;
-
-		// Spawn additional threads for ids greater than 1.
-		if ( tid != 0 )
-			bli_pthread_create( &pthreads[tid], NULL, &bli_l3_sup_thread_entry, &datas[tid] );
-		else
-			bli_l3_sup_thread_entry( ( void* )(&datas[0]) );
-	}
-
-	// We shouldn't free the global communicator since it was already freed
-	// by the global communicator's chief thread in bli_l3_thrinfo_free()
-	// (called from the thread entry function).
-
-	// Thread 0 waits for additional threads to finish.
-	for ( dim_t tid = 1; tid < n_threads; tid++ )
-	{
-		bli_pthread_join( pthreads[tid], NULL );
-	}
-
-	// Check the array_t back into the small block allocator. Similar to the
-	// check-out, this is done using a lock embedded within the sba to ensure
-	// mutual exclusion.
-	bli_sba_checkin_array( array );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_l3_thread_decorator().pth: " );
-	#endif
-	bli_free_intl( pthreads );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_l3_thread_decorator().pth: " );
-	#endif
-	bli_free_intl( datas );
-
-	return BLIS_SUCCESS;
-}
-
-#else
-
-// Define a dummy function bli_l3_thread_entry(), which is needed for
-// consistent dynamic linking behavior when building shared objects in Linux
-// or OSX, or Windows DLLs; otherwise, we risk having an unresolved symbol.
-void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; }
-
-#endif
-
diff --git a/frame/thread/bli_l3_sup_decor_pthreads.h b/frame/thread/bli_l3_sup_decor_pthreads.h
deleted file mode 100644
index 310ea4e8b..000000000
--- a/frame/thread/bli_l3_sup_decor_pthreads.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_L3_SUP_DECOR_PTHREADS_H
-#define BLIS_L3_SUP_DECOR_PTHREADS_H
-
-// Definitions specific to situations when POSIX multithreading is enabled.
-#ifdef BLIS_ENABLE_PTHREADS
-
-// Thread entry point prototype.
-void* bli_l3_sup_thread_entry( void* data_void );
-
-err_t bli_l3_sup_thread_decorator_pthreads
-     (
-             l3supint_ft func,
-             opid_t      family,
-       const obj_t*      alpha,
-       const obj_t*      a,
-       const obj_t*      b,
-       const obj_t*      beta,
-       const obj_t*      c,
-       const cntx_t*     cntx,
-             rntm_t*     rntm
-     );
-
-#endif
-
-#endif
-
diff --git a/frame/thread/bli_l3_sup_decor_single.c b/frame/thread/bli_l3_sup_decor_single.c
deleted file mode 100644
index a419154e7..000000000
--- a/frame/thread/bli_l3_sup_decor_single.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define SKIP_THRINFO_TREE
-
-err_t bli_l3_sup_thread_decorator_single
-     (
-             l3supint_ft func,
-             opid_t      family,
-       const obj_t*      alpha,
-       const obj_t*      a,
-       const obj_t*      b,
-       const obj_t*      beta,
-       const obj_t*      c,
-       const cntx_t*     cntx,
-             rntm_t*     rntm
-     )
-{
-	// For sequential execution, we use only one thread.
-	const dim_t n_threads = 1;
-
-	// NOTE: The sba was initialized in bli_init().
-
-	// Check out an array_t from the small block allocator. This is done
-	// with an internal lock to ensure only one application thread accesses
-	// the sba at a time. bli_sba_checkout_array() will also automatically
-	// resize the array_t, if necessary.
-	array_t* array = bli_sba_checkout_array( n_threads );
-
-	// Access the pool_t* for thread 0 and embed it into the rntm.
-	bli_sba_rntm_set_pool( 0, array, rntm );
-
-	// Set the packing block allocator field of the rntm.
-	bli_pba_rntm_set_pba( rntm );
-
-#ifndef SKIP_THRINFO_TREE
-	// Allcoate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
-#endif
-
-
-	{
-		// NOTE: We don't need to create another copy of the rntm_t since
-		// it was already copied in one of the high-level oapi functions.
-		rntm_t* rntm_p = rntm;
-
-		// There is only one thread id (for the thief thread).
-		const dim_t tid = 0;
-
-		// Use the thread id to access the appropriate pool_t* within the
-		// array_t, and use it to set the sba_pool field within the rntm_t.
-		// If the pool_t* element within the array_t is NULL, it will first
-		// be allocated/initialized.
-		// NOTE: This is commented out because, in the single-threaded case,
-		// this is redundant since it's already been done above.
-		//bli_sba_rntm_set_pool( tid, array, rntm_p );
-
-#ifndef SKIP_THRINFO_TREE
-		thrinfo_t* thread = NULL;
-
-		// Create the root node of the thread's thrinfo_t structure.
-		bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
-#else
-		// This optimization allows us to use one of the global thrinfo_t
-		// objects for single-threaded execution rather than grow one from
-		// scratch. The key is that bli_thrinfo_sup_grow(), which is called
-		// from within the variants, will immediately return if it detects
-		// that the thrinfo_t* passed into it is either
-		// &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED.
-		thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED;
-
-		( void )tid;
-#endif
-
-		func
-		(
-		  alpha,
-		  a,
-		  b,
-		  beta,
-		  c,
-		  cntx,
-		  rntm_p,
-		  thread
-		);
-
-#ifndef SKIP_THRINFO_TREE
-		// Free the current thread's thrinfo_t structure.
-		bli_l3_sup_thrinfo_free( rntm_p, thread );
-#endif
-	}
-
-	// We shouldn't free the global communicator since it was already freed
-	// by the global communicator's chief thread in bli_l3_thrinfo_free()
-	// (called above).
-
-	// Check the array_t back into the small block allocator. Similar to the
-	// check-out, this is done using a lock embedded within the sba to ensure
-	// mutual exclusion.
-	bli_sba_checkin_array( array );
-
-	return BLIS_SUCCESS;
-}
-
diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c
index 6cd4325df..0547d296e 100644
--- a/frame/thread/bli_thrcomm.c
+++ b/frame/thread/bli_thrcomm.c
@@ -37,34 +37,30 @@
 
 // -- Method-agnostic functions ------------------------------------------------
 
-thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads )
+thrcomm_t* bli_thrcomm_create( timpl_t ti, pool_t* sba_pool, dim_t n_threads )
 {
 	#ifdef BLIS_ENABLE_MEM_TRACING
 	printf( "bli_thrcomm_create(): " );
 	#endif
 
-	thrcomm_t* comm = bli_sba_acquire( rntm, sizeof(thrcomm_t) );
-
-	const timpl_t ti = bli_rntm_thread_impl( rntm );
+	thrcomm_t* comm = bli_sba_acquire( sba_pool, sizeof(thrcomm_t) );
 
 	bli_thrcomm_init( ti, n_threads, comm );
 
 	return comm;
 }
 
-void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm )
+void bli_thrcomm_free( pool_t* sba_pool, thrcomm_t* comm )
 {
 	if ( comm == NULL ) return;
 
-	const timpl_t ti = bli_rntm_thread_impl( rntm );
-
-	bli_thrcomm_cleanup( ti, comm );
+	bli_thrcomm_cleanup( comm );
 
 	#ifdef BLIS_ENABLE_MEM_TRACING
 	printf( "bli_thrcomm_free(): " );
 	#endif
 
-	bli_sba_release( rntm, comm );
+	bli_sba_release( sba_pool, comm );
 }
 
 // -- Method-specific functions ------------------------------------------------
@@ -140,69 +136,46 @@ void bli_thrcomm_init( timpl_t ti, dim_t nt, thrcomm_t* comm )
 {
 	const thrcomm_init_ft fp = init_fpa[ ti ];
 
+	// Sanity check: the function pointer queried from the function pointer
+	// array should never be NULL.
 	if ( fp == NULL ) bli_abort();
 
 	// Call the threading-specific init function.
 	fp( nt, comm );
 
 	// Embed the type of threading implementation within the thrcomm_t struct.
-	// This can be used later to make sure the application doesn't use a
-	// thrcomm_t initialized with threading type A with the API for threading
-	// type B. Note that we wait until after the init function has returned
-	// in case that function zeros out the entire struct before setting the
-	// fields.
+	// Note that we wait until after the init function has returned in case
+	// that function zeros out the entire struct before setting the fields.
 	comm->ti = ti;
 }
 
-void bli_thrcomm_cleanup( timpl_t ti, thrcomm_t* comm )
+void bli_thrcomm_cleanup( thrcomm_t* comm )
 {
-	const thrcomm_cleanup_ft fp = cleanup_fpa[ ti ];
-
-	if ( fp == NULL ) bli_abort();
-
 	// If comm is BLIS_SINGLE_COMM, we return early since there is no cleanup,
 	// especially if it is being used with a threading implementation that
 	// would normally want to free its thrcomm_t resources.
 	if ( comm == &BLIS_SINGLE_COMM ) return;
 
-	// Sanity check. Make sure the threading implementation we were asked to use
-	// is the same as the implementation that initialized the thrcomm_t object.
-	if ( ti != comm->ti )
-	{
-		printf( "bli_thrcomm_cleanup(): thrcomm_t.ti = %s, but request via rntm_t.ti = %s\n",
-		        ( comm->ti == BLIS_SINGLE ? "single" :
-		        ( comm->ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ),
-		        ( ti       == BLIS_SINGLE ? "single" :
-		        ( ti       == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
-		bli_abort();
-	}
+	const timpl_t            ti = bli_thrcomm_thread_impl( comm );
+	const thrcomm_cleanup_ft fp = cleanup_fpa[ ti ];
+
+	// Sanity check: the function pointer queried from the function pointer
+	// array should never be NULL.
+	if ( fp == NULL ) bli_abort();
 
 	// Call the threading-specific cleanup function.
 	fp( comm );
 }
 
-void bli_thrcomm_barrier( timpl_t ti, dim_t tid, thrcomm_t* comm )
+void bli_thrcomm_barrier( dim_t tid, thrcomm_t* comm )
 {
+	const timpl_t            ti = bli_thrcomm_thread_impl( comm );
 	const thrcomm_barrier_ft fp = barrier_fpa[ ti ];
 
+	// Sanity check: the function pointer queried from the function pointer
+	// array should never be NULL.
 	if ( fp == NULL ) bli_abort();
 
-	// Sanity check. Make sure the threading implementation we were asked to use
-	// is the same as the implementation that initialized the thrcomm_t object.
-	// We skip this check if comm is BLIS_SINGLE_COMM since the timpl_t value
-	// embedded in comm will often be different than that of BLIS_SINGLE_COMM
-	// (but we don't return early since we still need to barrier... wait, or do
-	// we?).
-	if ( ti != comm->ti && comm != &BLIS_SINGLE_COMM )
-	{
-		printf( "bli_thrcomm_barrier(): thrcomm_t.ti = %s, but request via rntm_t.ti = %s\n",
-		        ( comm->ti == BLIS_SINGLE ? "single" :
-		        ( comm->ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ),
-		        ( ti       == BLIS_SINGLE ? "single" :
-		        ( ti       == BLIS_OPENMP ? "openmp" : "pthreads" ) ) );
-		bli_abort();
-	}
-
 	// Call the threading-specific barrier function.
 	fp( tid, comm );
 }
@@ -211,19 +184,18 @@ void bli_thrcomm_barrier( timpl_t ti, dim_t tid, thrcomm_t* comm )
 
 void* bli_thrcomm_bcast
      (
-       timpl_t    ti,
        dim_t      id,
        void*      to_send,
        thrcomm_t* comm
      )
-{   
+{
 	if ( comm == NULL || comm->n_threads == 1 ) return to_send;
 
 	if ( id == 0 ) comm->sent_object = to_send;
 
-	bli_thrcomm_barrier( ti, id, comm );
+	bli_thrcomm_barrier( id, comm );
 	void* object = comm->sent_object;
-	bli_thrcomm_barrier( ti, id, comm );
+	bli_thrcomm_barrier( id, comm );
 
 	return object;
 }
@@ -257,7 +229,7 @@ void bli_thrcomm_barrier_atomic( dim_t t_id, thrcomm_t* comm )
 	// the current barrier. The first n-1 threads will spin on this variable
 	// until it changes. The sense variable gets incremented by the last
 	// thread to enter the barrier, just before it exits. But it turns out
-	// that you don't need many unique IDs before you can wrap around. In 
+	// that you don't need many unique IDs before you can wrap around. In
 	// fact, if everything else is working, a binary variable is sufficient,
 	// which is what we do here (i.e., 0 is incremented to 1, which is then
 	// decremented back to 0, and so forth).
diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h
index 4532fd00d..7abd190c7 100644
--- a/frame/thread/bli_thrcomm.h
+++ b/frame/thread/bli_thrcomm.h
@@ -120,10 +120,15 @@ BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm )
 	return comm->n_threads;
 }
 
+BLIS_INLINE timpl_t bli_thrcomm_thread_impl( thrcomm_t* comm )
+{
+	return comm->ti;
+}
+
 
 // Threading method-agnostic function prototypes.
-thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads );
-void       bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm );
+thrcomm_t* bli_thrcomm_create( timpl_t ti, pool_t* sba_pool, dim_t n_threads );
+void       bli_thrcomm_free( pool_t* sba_pool, thrcomm_t* comm );
 
 // Threading method-specific function prototypes.
 // NOTE: These are the prototypes to the dispatcher functions and thus they
@@ -131,11 +136,11 @@ void       bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm );
 // (and do) omit the timpl_t from their function signatures since their
 // threading implementation is intrinsically known.
 void                   bli_thrcomm_init( timpl_t ti, dim_t n_threads, thrcomm_t* comm );
-void                   bli_thrcomm_cleanup( timpl_t ti, thrcomm_t* comm );
-BLIS_EXPORT_BLIS void  bli_thrcomm_barrier( timpl_t ti, dim_t thread_id, thrcomm_t* comm );
+void                   bli_thrcomm_cleanup( thrcomm_t* comm );
+BLIS_EXPORT_BLIS void  bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm );
 
 // Other function prototypes.
-BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( timpl_t ti, dim_t inside_id, void* to_send, thrcomm_t* comm );
+BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm );
 void                   bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm );
 
 #endif
diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c
index eefc20fdd..8904c88e3 100644
--- a/frame/thread/bli_thread.c
+++ b/frame/thread/bli_thread.c
@@ -35,9 +35,7 @@
 
 #include "blis.h"
 
-thrinfo_t BLIS_PACKM_SINGLE_THREADED = {};
-thrinfo_t BLIS_GEMM_SINGLE_THREADED  = {};
-thrcomm_t BLIS_SINGLE_COMM           = {};
+thrcomm_t BLIS_SINGLE_COMM = {};
 
 // The global rntm_t structure. (The definition resides in bli_rntm.c.)
 extern rntm_t global_rntm;
@@ -46,13 +44,39 @@ extern rntm_t global_rntm;
 // resides in bli_rntm.c.)
 extern bli_pthread_mutex_t global_rntm_mutex;
 
+typedef void (*thread_launch_t)
+     (
+             dim_t         nt,
+             thread_func_t func,
+       const void*         params
+     );
+
+static thread_launch_t thread_launch_fpa[ BLIS_NUM_THREAD_IMPLS ] =
+{
+	[BLIS_SINGLE] = bli_thread_launch_single,
+	[BLIS_OPENMP] =
+#if   defined(BLIS_ENABLE_OPENMP)
+	                bli_thread_launch_openmp,
+#elif defined(BLIS_ENABLE_PTHREADS)
+	                NULL,
+#else
+	                NULL,
+#endif
+	[BLIS_POSIX]  =
+#if   defined(BLIS_ENABLE_PTHREADS)
+	                bli_thread_launch_pthreads,
+#elif defined(BLIS_ENABLE_OPENMP)
+	                NULL,
+#else
+	                NULL,
+#endif
+};
+
 // -----------------------------------------------------------------------------
 
 void bli_thread_init( void )
 {
 	bli_thrcomm_init( BLIS_SINGLE, 1, &BLIS_SINGLE_COMM );
-	bli_packm_thrinfo_init_single( &BLIS_PACKM_SINGLE_THREADED );
-	bli_l3_thrinfo_init_single( &BLIS_GEMM_SINGLE_THREADED );
 
 	// Read the environment variables and use them to initialize the
 	// global runtime object.
@@ -65,6 +89,29 @@ void bli_thread_finalize( void )
 
 // -----------------------------------------------------------------------------
 
+// Invoke the thread launch function registered in thread_launch_fpa for
+// the threading implementation ti, forwarding nt, func, and params to it
+// unchanged.
+void bli_thread_launch
+     (
+             timpl_t       ti,
+             dim_t         nt,
+             thread_func_t func,
+       const void*         params
+     )
+{
+	const thread_launch_t fp = thread_launch_fpa[ ti ];
+
+	// Sanity check: the function pointer queried from the function pointer
+	// array should never be NULL.
+	if ( fp == NULL ) bli_abort();
+
+	// Call the threading-specific launch function.
+	fp( nt, func, params );
+}
+
+// -----------------------------------------------------------------------------
+
 void bli_thread_range_sub
      (
        const thrinfo_t* thread,
@@ -75,11 +112,11 @@ void bli_thread_range_sub
              dim_t*     end
      )
 {
-	dim_t      n_way      = bli_thread_n_way( thread );
+	dim_t      n_way      = bli_thrinfo_n_way( thread );
 
 	if ( n_way == 1 ) { *start = 0; *end = n; return; }
 
-	dim_t      work_id    = bli_thread_work_id( thread );
+	dim_t      work_id    = bli_thrinfo_work_id( thread );
 
 	dim_t      all_start  = 0;
 	dim_t      all_end    = n;
@@ -515,8 +552,8 @@ siz_t bli_thread_range_weighted_sub
              dim_t*     j_end_thr
      )
 {
-	dim_t      n_way   = bli_thread_n_way( thread );
-	dim_t      my_id   = bli_thread_work_id( thread );
+	dim_t      n_way   = bli_thrinfo_n_way( thread );
+	dim_t      my_id   = bli_thrinfo_work_id( thread );
 
 	dim_t      bf_left = n % bf;
 
diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h
index 88bdccda5..821e2fe7c 100644
--- a/frame/thread/bli_thread.h
+++ b/frame/thread/bli_thread.h
@@ -42,29 +42,30 @@
 
 // Include thread info (thrinfo_t) object definitions and prototypes.
 #include "bli_thrinfo.h"
-#include "bli_thrinfo_sup.h"
 
-// Include some operation-specific thrinfo_t prototypes.
-// Note that the bli_packm_thrinfo.h must be included before the others!
-#include "bli_packm_thrinfo.h"
-#include "bli_l3_thrinfo.h"
+// Thread launch prototypes. Must go before including implementation headers.
+typedef void (*thread_func_t)( thrcomm_t* gl_comm, dim_t tid, const void* params );
 
-// Include the level-3 thread decorator and related definitions and prototypes
-// for the conventional code path.
-#include "bli_l3_decor.h"
-
-// Include the level-3 thread decorator and related definitions and prototypes
-// for the sup code path.
-#include "bli_l3_sup_decor.h"
+// Include threading implementations.
+#include "bli_thread_openmp.h"
+#include "bli_thread_pthreads.h"
+#include "bli_thread_single.h"
 
 // Initialization-related prototypes.
 void bli_thread_init( void );
 void bli_thread_finalize( void );
 
+BLIS_EXPORT_BLIS void bli_thread_launch
+     (
+             timpl_t       ti,
+             dim_t         nt,
+             thread_func_t func,
+       const void*         params
+     );
+
 // Thread range-related prototypes.
 
-BLIS_EXPORT_BLIS
-void bli_thread_range_sub
+BLIS_EXPORT_BLIS void bli_thread_range_sub
      (
        const thrinfo_t* thread,
              dim_t      n,
@@ -224,8 +225,8 @@ BLIS_INLINE void bli_thread_range_jrir_rr
      )
 {
 	// Use interleaved partitioning of jr/ir loops.
-	*start = bli_thread_work_id( thread );
-	*inc   = bli_thread_n_way( thread );
+	*start = bli_thrinfo_work_id( thread );
+	*inc   = bli_thrinfo_n_way( thread );
 	*end   = n;
 }
 
@@ -295,8 +296,8 @@ BLIS_INLINE void bli_thread_range_weighted_jrir
 #else
 
 	// Use interleaved partitioning of jr/ir loops.
-	*start = bli_thread_work_id( thread );
-	*inc   = bli_thread_n_way( thread );
+	*start = bli_thrinfo_work_id( thread );
+	*inc   = bli_thrinfo_n_way( thread );
 	*end   = n;
 
 #endif
diff --git a/frame/thread/bli_l3_decor_openmp.h b/frame/thread/bli_thread_openmp.c
similarity index 69%
rename from frame/thread/bli_l3_decor_openmp.h
rename to frame/thread/bli_thread_openmp.c
index 95e1582e5..c7a74832b 100644
--- a/frame/thread/bli_l3_decor_openmp.h
+++ b/frame/thread/bli_thread_openmp.c
@@ -5,7 +5,6 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -33,35 +32,32 @@
 
 */
 
-#ifndef BLIS_L3_DECOR_OPENMP_H
-#define BLIS_L3_DECOR_OPENMP_H
+#include "blis.h"
 
-// Definitions specific to situations when OpenMP multithreading is enabled.
 #ifdef BLIS_ENABLE_OPENMP
 
-void bli_l3_thread_decorator_openmp
-     (
-             l3int_ft func,
-             opid_t   family,
-       const obj_t*   alpha,
-       const obj_t*   a,
-       const obj_t*   b,
-       const obj_t*   beta,
-       const obj_t*   c,
-       const cntx_t*  cntx,
-             rntm_t*  rntm,
-             cntl_t*  cntl
-     );
+void bli_thread_launch_openmp( dim_t n_threads, thread_func_t func, const void* params )
+{
+	const timpl_t ti = BLIS_OPENMP;
 
-void bli_l3_thread_decorator_thread_check
-     (
-       dim_t      n_threads,
-       dim_t      tid,
-       thrcomm_t* gl_comm,
-       rntm_t*    rntm
-     );
+	// Allocate a global communicator for the root thrinfo_t structures.
+	pool_t*    gl_comm_pool = NULL;
+	thrcomm_t* gl_comm      = bli_thrcomm_create( ti, gl_comm_pool, n_threads );
 
-#endif
+	_Pragma( "omp parallel num_threads(n_threads)" )
+	{
+		// Query the thread's id from OpenMP.
+		const dim_t tid = omp_get_thread_num();
+
+		// Call the thread entry point, passing the global communicator, the
+		// thread id, and the params struct as arguments.
+		func( gl_comm, tid, params );
+	}
+
+	// Free the global communicator, because the root thrinfo_t node
+	// never frees its communicator.
+	bli_thrcomm_free( gl_comm_pool, gl_comm );
+}
 
 #endif
 
diff --git a/frame/thread/bli_l3_sup_decor_openmp.h b/frame/thread/bli_thread_openmp.h
similarity index 82%
rename from frame/thread/bli_l3_sup_decor_openmp.h
rename to frame/thread/bli_thread_openmp.h
index 4c5059d00..d26023a15 100644
--- a/frame/thread/bli_l3_sup_decor_openmp.h
+++ b/frame/thread/bli_thread_openmp.h
@@ -32,23 +32,17 @@
 
 */
 
-#ifndef BLIS_L3_SUP_DECOR_OPENMP_H
-#define BLIS_L3_SUP_DECOR_OPENMP_H
+#ifndef BLIS_THREAD_OPENMP_H
+#define BLIS_THREAD_OPENMP_H
 
 // Definitions specific to situations when OpenMP multithreading is enabled.
 #ifdef BLIS_ENABLE_OPENMP
 
-err_t bli_l3_sup_thread_decorator_openmp
+void bli_thread_launch_openmp
      (
-             l3supint_ft func,
-             opid_t      family,
-       const obj_t*      alpha,
-       const obj_t*      a,
-       const obj_t*      b,
-       const obj_t*      beta,
-       const obj_t*      c,
-       const cntx_t*     cntx,
-             rntm_t*     rntm
+             dim_t         nt,
+             thread_func_t func,
+       const void*         params
      );
 
 #endif
diff --git a/frame/thread/bli_thread_pthreads.c b/frame/thread/bli_thread_pthreads.c
new file mode 100644
index 000000000..88a11cf11
--- /dev/null
+++ b/frame/thread/bli_thread_pthreads.c
@@ -0,0 +1,128 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_PTHREADS
+
+// A data structure to assist in passing operands to additional threads.
+typedef struct thread_data
+{
+	      dim_t         tid;
+	      thrcomm_t*    gl_comm;
+	      thread_func_t func;
+	const void*         params;
+} thread_data_t;
+
+// Entry point for additional threads
+static void* bli_posix_thread_entry( void* data_void )
+{
+	const thread_data_t* data     = data_void;
+
+	const dim_t          tid      = data->tid;
+	      thrcomm_t*     gl_comm  = data->gl_comm;
+	      thread_func_t  func     = data->func;
+	const void*          params   = data->params;
+
+	// Call the thread entry point, passing the global communicator, the
+	// thread id, and the params struct as arguments.
+	func( gl_comm, tid, params );
+
+	return NULL;
+}
+
+void bli_thread_launch_pthreads( dim_t n_threads, thread_func_t func, const void* params )
+{
+	err_t r_val;
+
+	const timpl_t ti = BLIS_POSIX;
+
+	// Allocate a global communicator for the root thrinfo_t structures.
+	pool_t*    gl_comm_pool = NULL;
+	thrcomm_t* gl_comm      = bli_thrcomm_create( ti, gl_comm_pool, n_threads );
+
+	// Allocate an array of pthread objects and auxiliary data structs to pass
+	// to the thread entry functions.
+
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_l3_thread_decorator().pth: " );
+	#endif
+	bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val );
+
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_l3_thread_decorator().pth: " );
+	#endif
+	thread_data_t* datas    = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val );
+
+	// NOTE: We must iterate backwards so that the chief thread (thread id 0)
+	// can spawn all other threads before proceeding with its own computation.
+	for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
+	{
+		// Set up thread data for every thread, including thread 0.
+		datas[tid].tid      = tid;
+		datas[tid].gl_comm  = gl_comm;
+		datas[tid].func     = func;
+		datas[tid].params   = params;
+
+		// Spawn additional threads for ids greater than 0; thread 0 runs here.
+		if ( tid != 0 )
+			bli_pthread_create( &pthreads[tid], NULL, &bli_posix_thread_entry, &datas[tid] );
+		else
+			bli_posix_thread_entry( &datas[0] );
+	}
+
+	// Thread 0 waits for additional threads to finish.
+	for ( dim_t tid = 1; tid < n_threads; tid++ )
+	{
+		bli_pthread_join( pthreads[tid], NULL );
+	}
+
+	// Free the global communicator, because the root thrinfo_t node
+	// never frees its communicator.
+	bli_thrcomm_free( gl_comm_pool, gl_comm );
+
+	// Free the array of pthread objects and auxiliary data structs.
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_l3_thread_decorator().pth: " );
+	#endif
+	bli_free_intl( pthreads );
+
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_l3_thread_decorator().pth: " );
+	#endif
+	bli_free_intl( datas );
+}
+
+#endif
+
diff --git a/frame/thread/bli_l3_decor_single.h b/frame/thread/bli_thread_pthreads.h
similarity index 82%
rename from frame/thread/bli_l3_decor_single.h
rename to frame/thread/bli_thread_pthreads.h
index c118ad7be..5fb82e292 100644
--- a/frame/thread/bli_l3_decor_single.h
+++ b/frame/thread/bli_thread_pthreads.h
@@ -32,22 +32,20 @@
 
 */
 
-#ifndef BLIS_L3_DECOR_SINGLE_H
-#define BLIS_L3_DECOR_SINGLE_H
+#ifndef BLIS_THREAD_PTHREADS_H
+#define BLIS_THREAD_PTHREADS_H
 
-void bli_l3_thread_decorator_single
+// Definitions specific to situations when POSIX multithreading is enabled.
+#ifdef BLIS_ENABLE_PTHREADS
+
+void bli_thread_launch_pthreads
      (
-             l3int_ft func,
-             opid_t   family,
-       const obj_t*   alpha,
-       const obj_t*   a,
-       const obj_t*   b,
-       const obj_t*   beta,
-       const obj_t*   c,
-       const cntx_t*  cntx,
-             rntm_t*  rntm,
-             cntl_t*  cntl
+             dim_t         nt,
+             thread_func_t func,
+       const void*         params
      );
 
 #endif
 
+#endif
+
diff --git a/sandbox/gemmlike/thread/bls_l3_decor_single.h b/frame/thread/bli_thread_single.c
similarity index 77%
rename from sandbox/gemmlike/thread/bls_l3_decor_single.h
rename to frame/thread/bli_thread_single.c
index 82dfbc993..323e0d8f2 100644
--- a/sandbox/gemmlike/thread/bls_l3_decor_single.h
+++ b/frame/thread/bli_thread_single.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2021, The University of Texas at Austin
+   Copyright (C) 2014, The University of Texas at Austin
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,23 +32,12 @@
 
 */
 
-#ifndef BLIS_SBX_L3_DECOR_SINGLE_H
-#define BLIS_SBX_L3_DECOR_SINGLE_H
-
-void bls_l3_thread_decorator_single
-     (
-       l3sbxint_ft func,
-       opid_t      family,
-       //pack_t      schema_a,
-       //pack_t      schema_b,
-       obj_t*      alpha,
-       obj_t*      a,
-       obj_t*      b,
-       obj_t*      beta,
-       obj_t*      c,
-       cntx_t*     cntx,
-       rntm_t*     rntm
-     );
-
-#endif
+#include "blis.h"
+
+void bli_thread_launch_single( dim_t nt, thread_func_t func, const void* params )
+{
+	// Call the thread entry point, passing the global single-threaded
+	// communicator, thread id of 0, and the params struct as arguments.
+    func( &BLIS_SINGLE_COMM, 0, params );
+}
 
diff --git a/frame/thread/bli_l3_sup_decor_single.h b/frame/thread/bli_thread_single.h
similarity index 81%
rename from frame/thread/bli_l3_sup_decor_single.h
rename to frame/thread/bli_thread_single.h
index 8ca279baf..fda91232e 100644
--- a/frame/thread/bli_l3_sup_decor_single.h
+++ b/frame/thread/bli_thread_single.h
@@ -32,20 +32,14 @@
 
 */
 
-#ifndef BLIS_L3_SUP_DECOR_SINGLE_H
-#define BLIS_L3_SUP_DECOR_SINGLE_H
+#ifndef BLIS_THREAD_SINGLE_H
+#define BLIS_THREAD_SINGLE_H
 
-err_t bli_l3_sup_thread_decorator_single
+void bli_thread_launch_single
      (
-             l3supint_ft func,
-             opid_t      family,
-       const obj_t*      alpha,
-       const obj_t*      a,
-       const obj_t*      b,
-       const obj_t*      beta,
-       const obj_t*      c,
-       const cntx_t*     cntx,
-             rntm_t*     rntm
+             dim_t         nt,
+             thread_func_t func,
+       const void*         params
      );
 
 #endif
diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c
index 3730ab946..f48e70bb6 100644
--- a/frame/thread/bli_thrinfo.c
+++ b/frame/thread/bli_thrinfo.c
@@ -35,101 +35,83 @@
 
 #include "blis.h"
 
-thrinfo_t* bli_thrinfo_create
+#define BLIS_NUM_STATIC_COMMS 80
+
+thrinfo_t* bli_thrinfo_create_root
      (
-       rntm_t*    rntm,
-       thrcomm_t* ocomm,
-       dim_t      ocomm_id,
-       dim_t      n_way,
-       dim_t      work_id,
-       bool       free_comm,
-       bszid_t    bszid,
-       thrinfo_t* sub_node
+       thrcomm_t* comm,
+       dim_t      thread_id,
+       pool_t*    sba_pool,
+       pba_t*     pba
      )
 {
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_thrinfo_create(): " );
-	#endif
-
-	thrinfo_t* thread = bli_sba_acquire( rntm, sizeof( thrinfo_t ) );
-
-	bli_thrinfo_init
+	return bli_thrinfo_create
 	(
-	  thread,
-	  ocomm, ocomm_id,
-	  n_way, work_id,
-	  free_comm,
-	  bszid,
-	  sub_node
+	  comm,
+	  thread_id,
+	  1,
+	  0,
+	  FALSE,
+	  sba_pool,
+	  pba
 	);
-
-    return thread;
 }
 
-void bli_thrinfo_init
+thrinfo_t* bli_thrinfo_create
      (
-       thrinfo_t* thread,
-       thrcomm_t* ocomm,
-       dim_t      ocomm_id,
+       thrcomm_t* comm,
+       dim_t      thread_id,
        dim_t      n_way,
        dim_t      work_id,
        bool       free_comm,
-       bszid_t    bszid,
-       thrinfo_t* sub_node
+       pool_t*    sba_pool,
+       pba_t*     pba
      )
 {
-	bli_thrinfo_set_ocomm( ocomm, thread );
-	bli_thrinfo_set_ocomm_id( ocomm_id, thread );
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_thrinfo_create(): " );
+	#endif
+
+	thrinfo_t* thread = bli_sba_acquire( sba_pool, sizeof( thrinfo_t ) );
+
+	bli_thrinfo_set_comm( comm, thread );
+	bli_thrinfo_set_thread_id( thread_id, thread );
 	bli_thrinfo_set_n_way( n_way, thread );
 	bli_thrinfo_set_work_id( work_id, thread );
 	bli_thrinfo_set_free_comm( free_comm, thread );
-	bli_thrinfo_set_bszid( bszid, thread );
+	bli_thrinfo_set_sba_pool( sba_pool, thread );
+	bli_thrinfo_set_pba( pba, thread );
+	bli_mem_clear( bli_thrinfo_mem( thread ) );
 
-	bli_thrinfo_set_sub_node( sub_node, thread );
+	bli_thrinfo_set_sub_node( NULL, thread );
 	bli_thrinfo_set_sub_prenode( NULL, thread );
-}
 
-void bli_thrinfo_init_single
-     (
-       thrinfo_t* thread
-     )
-{
-	bli_thrinfo_init
-	(
-	  thread,
-	  &BLIS_SINGLE_COMM, 0,
-	  1,
-	  0,
-	  FALSE,
-	  BLIS_NO_PART,
-	  thread
-	);
+	return thread;
 }
 
 void bli_thrinfo_free
      (
-       rntm_t*    rntm,
        thrinfo_t* thread
      )
 {
-	if ( thread == NULL ||
-	     thread == &BLIS_PACKM_SINGLE_THREADED ||
-	     thread == &BLIS_GEMM_SINGLE_THREADED
-	   ) return;
+	if ( thread == NULL ) return;
 
 	thrinfo_t* thrinfo_sub_prenode = bli_thrinfo_sub_prenode( thread );
 	thrinfo_t* thrinfo_sub_node    = bli_thrinfo_sub_node( thread );
+	pool_t*    sba_pool            = bli_thrinfo_sba_pool( thread );
+	mem_t*     cntl_mem_p          = bli_thrinfo_mem( thread );
+	pba_t*     pba                 = bli_thrinfo_pba( thread );
 
 	// Recursively free all children of the current thrinfo_t.
 	if ( thrinfo_sub_prenode != NULL )
 	{
-		bli_thrinfo_free( rntm, thrinfo_sub_prenode );
+		bli_thrinfo_free( thrinfo_sub_prenode );
 	}
 
 	// Recursively free all children of the current thrinfo_t.
 	if ( thrinfo_sub_node != NULL )
 	{
-		bli_thrinfo_free( rntm, thrinfo_sub_node );
+		bli_thrinfo_free( thrinfo_sub_node );
 	}
 
 	// Free the communicators, but only if the current thrinfo_t struct
@@ -139,198 +121,48 @@ void bli_thrinfo_free
 	if ( bli_thrinfo_needs_free_comm( thread ) )
 	{
 		// The ochief always frees his communicator.
-		if ( bli_thread_am_ochief( thread ) )
-			bli_thrcomm_free( rntm, bli_thrinfo_ocomm( thread ) );
+		if ( bli_thrinfo_am_chief( thread ) )
+			bli_thrcomm_free( sba_pool, bli_thrinfo_comm( thread ) );
 	}
 
 	#ifdef BLIS_ENABLE_MEM_TRACING
 	printf( "bli_thrinfo_free(): " );
 	#endif
 
-	// Free the thrinfo_t struct.
-	bli_sba_release( rntm, thread );
-}
-
-// -----------------------------------------------------------------------------
-
-void bli_thrinfo_grow
-     (
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
-     )
-{
-	// First, consider the prenode branch of the thrinfo_t tree, which should be
-	// expanded only if there exists a prenode branch in the cntl_t tree.
-
-	if ( bli_cntl_sub_prenode( cntl ) != NULL )
+	// Free any allocated memory from the pba.
+	if ( bli_mem_is_alloc( cntl_mem_p ) && bli_thrinfo_am_chief( thread ) )
 	{
-		// We only need to take action if the thrinfo_t sub-node is NULL; if it
-		// is non-NULL, then it has already been created and we'll use it as-is.
-		if ( bli_thrinfo_sub_prenode( thread ) == NULL )
-		{
-			// Assertion / sanity check.
-			if ( bli_cntl_bszid( cntl ) != BLIS_MC )
-			{
-				printf( "Assertion failed: Expanding prenode for non-IC loop?\n" );
-				bli_abort();
-			}
-
-			// Now we must create the packa, jr, and ir nodes that make up
-			// the prenode branch of current cntl_t node.
-
-			// Create a new node (or, if needed, multiple nodes) along the
-			// prenode branch of the tree and return the pointer to the
-			// (highest) child.
-			thrinfo_t* thread_prenode = bli_thrinfo_rgrow_prenode
-			(
-			  rntm,
-			  cntl,
-			  bli_cntl_sub_prenode( cntl ),
-			  thread
-			);
-
-			// Attach the child thrinfo_t node for the secondary branch to its
-			// parent structure.
-			bli_thrinfo_set_sub_prenode( thread_prenode, thread );
-		}
-	}
-
-	// Now, grow the primary branch of the thrinfo_t tree.
-
-	// NOTE: If bli_thrinfo_rgrow() is being called, the sub_node field will
-	// always be non-NULL, and so there's no need to check it.
-	//if ( bli_cntl_sub_node( cntl ) != NULL )
-	{
-		// We only need to take action if the thrinfo_t sub-node is NULL; if it
-		// is non-NULL, then it has already been created and we'll use it as-is.
-		if ( bli_thrinfo_sub_node( thread ) == NULL )
-		{
-			// Create a new node (or, if needed, multiple nodes) along the
-			// main sub-node branch of the tree and return the pointer to the
-			// (highest) child.
-			thrinfo_t* thread_child = bli_thrinfo_rgrow
-			(
-			  rntm,
-			  cntl,
-			  bli_cntl_sub_node( cntl ),
-			  thread
-			);
-
-			// Attach the child thrinfo_t node for the primary branch to its
-			// parent structure.
-			bli_thrinfo_set_sub_node( thread_child, thread );
-		}
-	}
-}
-
-// -----------------------------------------------------------------------------
-
-thrinfo_t* bli_thrinfo_rgrow
-     (
-       rntm_t*    rntm,
-       cntl_t*    cntl_par,
-       cntl_t*    cntl_cur,
-       thrinfo_t* thread_par
-     )
-{
-	thrinfo_t* thread_cur;
-
-	// We must handle two cases: those where the next node in the
-	// control tree is a partitioning node, and those where it is
-	// a non-partitioning (ie: packing) node.
-	if ( bli_cntl_bszid( cntl_cur ) != BLIS_NO_PART )
-	{
-		// Create the child thrinfo_t node corresponding to cntl_cur,
-		// with cntl_par being the parent.
-		thread_cur = bli_thrinfo_create_for_cntl
-		(
-		  rntm,
-		  cntl_par,
-		  cntl_cur,
-		  thread_par
-		);
-	}
-	else // if ( bli_cntl_bszid( cntl_cur ) == BLIS_NO_PART )
-	{
-		// Recursively grow the thread structure and return the top-most
-		// thrinfo_t node of that segment.
-		thrinfo_t* thread_seg = bli_thrinfo_rgrow
-		(
-		  rntm,
-		  cntl_par,
-		  bli_cntl_sub_node( cntl_cur ),
-		  thread_par
-		);
-
-		// Create a thrinfo_t node corresponding to cntl_cur. Since the
-		// corresponding cntl node, cntl_cur, is a non-partitioning node
-		// (bszid = BLIS_NO_PART), this means it's a packing node. Packing
-		// thrinfo_t nodes are formed differently than those corresponding to
-		// partitioning nodes; specifically, their work_id's are set equal to
-		// the their comm_id's. Also, notice that the free_comm field is set
-		// to FALSE since cntl_cur is a non-partitioning node. The reason:
-		// the communicator used here will be freed when thread_seg, or one
-		// of its descendents, is freed.
-		thread_cur = bli_thrinfo_create
+		bli_pba_release
 		(
-		  rntm,                                           // rntm
-		  bli_thrinfo_ocomm( thread_seg ),                // ocomm
-		  bli_thread_ocomm_id( thread_seg ),              // ocomm_id
-		  bli_cntl_calc_num_threads_in( rntm, cntl_cur ), // n_way
-		  bli_thread_ocomm_id( thread_seg ),              // work_id
-		  FALSE,                                          // free_comm
-		  BLIS_NO_PART,                                   // bszid
-		  thread_seg                                      // sub_node
+		  pba,
+		  cntl_mem_p
 		);
 	}
 
-	return thread_cur;
+	// Free the thrinfo_t struct.
+	bli_sba_release( sba_pool, thread );
 }
 
-#define BLIS_NUM_STATIC_COMMS 80
+// -----------------------------------------------------------------------------
 
-thrinfo_t* bli_thrinfo_create_for_cntl
+thrinfo_t* bli_thrinfo_split
      (
-       rntm_t*    rntm,
-       cntl_t*    cntl_par,
-       cntl_t*    cntl_chl,
+       dim_t      n_way,
        thrinfo_t* thread_par
      )
 {
-	// If we are running with a single thread, all of the code can be reduced
-	// and simplified to this.
-	if ( bli_rntm_calc_num_threads( rntm ) == 1 )
-	{
-		thrinfo_t* thread_chl = bli_thrinfo_create
-		(
-		  rntm,                        // rntm
-		  &BLIS_SINGLE_COMM,           // ocomm
-		  0,                           // ocomm_id
-		  1,                           // n_way
-		  0,                           // work_id
-		  FALSE,                       // free_comm
-		  BLIS_NO_PART,                // bszid
-		  NULL                         // sub_node
-		);
-		return thread_chl;
-	}
-
-	thrcomm_t*  static_comms[ BLIS_NUM_STATIC_COMMS ];
-	thrcomm_t** new_comms = NULL;
-
-	const bszid_t bszid_chl = bli_cntl_bszid( cntl_chl );
-
-	const dim_t parent_nt_in   = bli_thread_num_threads( thread_par );
-	const dim_t parent_n_way   = bli_thread_n_way( thread_par );
-	const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par );
-	const dim_t parent_work_id = bli_thread_work_id( thread_par );
+	      thrcomm_t* parent_comm        = bli_thrinfo_comm( thread_par );
+	const timpl_t    ti                 = bli_thrcomm_thread_impl( parent_comm );
+	const dim_t      parent_num_threads = bli_thrinfo_num_threads( thread_par );
+	const dim_t      parent_thread_id   = bli_thrinfo_thread_id( thread_par );
+	      pool_t*    sba_pool           = bli_thrinfo_sba_pool( thread_par );
+	      pba_t*     pba                = bli_thrinfo_pba( thread_par );
 
 	// Sanity check: make sure the number of threads in the parent's
 	// communicator is divisible by the number of new sub-groups.
-	if ( parent_nt_in % parent_n_way != 0 )
+	if ( parent_num_threads % n_way != 0 )
 	{
-		printf( "Assertion failed: parent_nt_in <mod> parent_n_way != 0\n" );
+		printf( "Assertion failed: parent_num_threads %% n_way != 0\n" );
 		bli_abort();
 	}
 
@@ -339,312 +171,105 @@ thrinfo_t* bli_thrinfo_create_for_cntl
 	// - the current thread's id within the new communicator,
 	// - the current thread's work id, given the ways of parallelism
 	//   to be obtained within the next loop.
-	const dim_t child_nt_in   = bli_cntl_calc_num_threads_in( rntm, cntl_chl );
-	const dim_t child_n_way   = bli_rntm_ways_for( bszid_chl, rntm );
-	const dim_t child_comm_id = parent_comm_id % child_nt_in;
-	const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );
-
-//printf( "thread %d: child_n_way = %d child_nt_in = %d parent_n_way = %d (bszid = %d->%d)\n", (int)child_comm_id, (int)child_nt_in, (int)child_n_way, (int)parent_n_way, (int)bli_cntl_bszid( cntl_par ), (int)bszid_chl );
-
-	// The parent's chief thread creates a temporary array of thrcomm_t
-	// pointers.
-	if ( bli_thread_am_ochief( thread_par ) )
-	{
-		err_t r_val;
-
-		if ( parent_n_way > BLIS_NUM_STATIC_COMMS )
-			new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val );
-		else
-			new_comms = static_comms;
-	}
-
-	// Broadcast the temporary array to all threads in the parent's
-	// communicator.
-	new_comms = bli_thread_broadcast( rntm, thread_par, new_comms );
+	const dim_t child_num_threads = parent_num_threads / n_way;
+	const dim_t child_thread_id   = parent_thread_id % child_num_threads;
+	const dim_t child_work_id     = parent_thread_id / child_num_threads;
 
-	// Chiefs in the child communicator allocate the communicator
-	// object and store it in the array element corresponding to the
-	// parent's work id.
-	if ( child_comm_id == 0 )
-		new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );
-
-	bli_thread_barrier( rntm, thread_par );
-
-	// All threads create a new thrinfo_t node using the communicator
-	// that was created by their chief, as identified by parent_work_id.
-	thrinfo_t* thread_chl = bli_thrinfo_create
-	(
-	  rntm,                        // rntm
-	  new_comms[ parent_work_id ], // ocomm
-	  child_comm_id,               // ocomm_id
-	  child_n_way,                 // n_way
-	  child_work_id,               // work_id
-	  TRUE,                        // free_comm
-	  bszid_chl,                   // bszid
-	  NULL                         // sub_node
-	);
-
-	bli_thread_barrier( rntm, thread_par );
+	thrcomm_t*  static_comms[ BLIS_NUM_STATIC_COMMS ];
+	thrcomm_t** new_comms = NULL;
+	thrcomm_t*  my_comm = NULL;
+	bool        free_comm = FALSE;
 
-	// The parent's chief thread frees the temporary array of thrcomm_t
-	// pointers.
-	if ( bli_thread_am_ochief( thread_par ) )
+	if ( n_way == 1 )
 	{
-		if ( parent_n_way > BLIS_NUM_STATIC_COMMS )
-			bli_free_intl( new_comms );
+		my_comm = parent_comm;
 	}
-
-	return thread_chl;
-}
-
-// -----------------------------------------------------------------------------
-
-thrinfo_t* bli_thrinfo_rgrow_prenode
-     (
-       rntm_t*    rntm,
-       cntl_t*    cntl_par,
-       cntl_t*    cntl_cur,
-       thrinfo_t* thread_par
-     )
-{
-	thrinfo_t* thread_cur;
-
-	// We must handle two cases: those where the next node in the
-	// control tree is a partitioning node, and those where it is
-	// a non-partitioning (ie: packing) node.
-	if ( bli_cntl_bszid( cntl_cur ) != BLIS_NO_PART )
+	else if ( n_way == parent_num_threads )
 	{
-		// Create the child thrinfo_t node corresponding to cntl_cur,
-		// with cntl_par being the parent.
-		thread_cur = bli_thrinfo_create_for_cntl_prenode
-		(
-		  rntm,
-		  cntl_par,
-		  cntl_cur,
-		  thread_par
-		);
+		my_comm = &BLIS_SINGLE_COMM;
 	}
-	else // if ( bli_cntl_bszid( cntl_cur ) == BLIS_NO_PART )
+	else
 	{
-		// Recursively grow the thread structure and return the top-most
-		// thrinfo_t node of that segment.
-		thrinfo_t* thread_seg = bli_thrinfo_rgrow_prenode
-		(
-		  rntm,
-		  cntl_par,
-		  bli_cntl_sub_node( cntl_cur ),
-		  thread_par
-		);
-
-		// Create a thrinfo_t node corresponding to cntl_cur. Since the
-		// corresponding cntl node, cntl_cur, is a non-partitioning node
-		// (bszid = BLIS_NO_PART), this means it's a packing node. Packing
-		// thrinfo_t nodes are formed differently than those corresponding to
-		// partitioning nodes; specifically, their work_id's are set equal to
-		// the their comm_id's. Also, notice that the free_comm field is set
-		// to FALSE since cntl_cur is a non-partitioning node. The reason:
-		// the communicator used here will be freed when thread_seg, or one
-		// of its descendents, is freed.
-		thread_cur = bli_thrinfo_create
-		(
-		  rntm,                                           // rntm
-		  bli_thrinfo_ocomm( thread_seg ),                // ocomm
-		  bli_thread_ocomm_id( thread_seg ),              // ocomm_id
-		  bli_cntl_calc_num_threads_in( rntm, cntl_par ), // n_way
-		  bli_thread_ocomm_id( thread_seg ),              // work_id
-		  FALSE,                                          // free_comm
-		  BLIS_NO_PART,                                   // bszid
-		  thread_seg                                      // sub_node
-		);
-	}
+		// The parent's chief thread creates a temporary array of thrcomm_t
+		// pointers.
+		if ( bli_thrinfo_am_chief( thread_par ) )
+		{
+			err_t r_val;
 
-	return thread_cur;
-}
+			if ( n_way > BLIS_NUM_STATIC_COMMS )
+				new_comms = bli_malloc_intl( n_way * sizeof( thrcomm_t* ), &r_val );
+			else
+				new_comms = static_comms;
+		}
 
-thrinfo_t* bli_thrinfo_create_for_cntl_prenode
-     (
-       rntm_t*    rntm,
-       cntl_t*    cntl_par,
-       cntl_t*    cntl_chl,
-       thrinfo_t* thread_par
-     )
-{
-	// NOTE: This function only has to work for the ic -> (pa -> jr)
-	// thrinfo_t tree branch extension. After that, the function
-	// bli_thrinfo_create_for_cntl() will be called for the last jr->ir
-	// branch extension.
+		// Broadcast the temporary array to all threads in the parent's
+		// communicator.
+		new_comms = bli_thrinfo_broadcast( thread_par, new_comms );
 
-	const bszid_t bszid_chl = bli_cntl_bszid( cntl_chl );
+		// Chiefs in the child communicator allocate the communicator
+		// object and store it in the array element corresponding to the
+		// child's work id (i.e., the index of the new sub-group).
+		if ( child_thread_id == 0 )
+			new_comms[ child_work_id ] = bli_thrcomm_create( ti, sba_pool, child_num_threads );
 
-	const dim_t parent_nt_in   = bli_thread_num_threads( thread_par );
-	const dim_t parent_n_way   = bli_thread_n_way( thread_par );
-	const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par );
-	//const dim_t parent_work_id = bli_thread_work_id( thread_par );
+		bli_thrinfo_barrier( thread_par );
 
-	// Sanity check: make sure the number of threads in the parent's
-	// communicator is divisible by the number of new sub-groups.
-	if ( parent_nt_in % parent_n_way != 0 )
-	{
-		printf( "Assertion failed: parent_nt_in (%d) <mod> parent_n_way (%d) != 0\n",
-		        ( int )parent_nt_in, ( int )parent_n_way );
-		bli_abort();
+		my_comm = new_comms[ child_work_id ];
+		free_comm = TRUE;
 	}
 
-	//dim_t child_nt_in   = bli_cntl_calc_num_threads_in( rntm, cntl_chl );
-	//dim_t child_n_way   = bli_rntm_ways_for( bszid_chl, rntm );
-	const dim_t child_nt_in   = parent_nt_in;
-	const dim_t child_n_way   = parent_nt_in;
-	const dim_t child_comm_id = parent_comm_id % child_nt_in;
-	const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );
-
-	bli_thread_barrier( rntm, thread_par );
-
-	// NOTE: Recall that parent_comm_id == child_comm_id, so checking for the
-	// parent's chief-ness is equivalent to checking for chief-ness in the new
-	// about-to-be-created communicator group.
-	thrcomm_t* new_comm = NULL;
-	if ( bli_thread_am_ochief( thread_par ) )
-		new_comm = bli_thrcomm_create( rntm, child_nt_in );
-
-	// Broadcast the new thrcomm_t address to the other threads in the
-	// parent's group.
-	new_comm = bli_thread_broadcast( rntm, thread_par, new_comm );
-
 	// All threads create a new thrinfo_t node using the communicator
 	// that was created by their chief, as identified by parent_work_id.
 	thrinfo_t* thread_chl = bli_thrinfo_create
 	(
-	  rntm,          // rntm
-	  new_comm,      // ocomm
-	  child_comm_id, // ocomm_id
-	  child_n_way,   // n_way
-	  child_work_id, // work_id
-	  TRUE,          // free_comm
-	  bszid_chl,     // bszid
-	  NULL           // sub_node
+	  my_comm,
+	  child_thread_id,
+	  n_way,
+	  child_work_id,
+	  free_comm,
+	  sba_pool,
+	  pba
 	);
 
-	bli_thread_barrier( rntm, thread_par );
+	bli_thrinfo_barrier( thread_par );
+
+	// The parent's chief thread frees the temporary array of thrcomm_t
+	// pointers.
+	if ( bli_thrinfo_am_chief( thread_par ) &&
+	     new_comms != static_comms )
+	{
+		bli_free_intl( new_comms );
+	}
 
 	return thread_chl;
 }
 
-// -----------------------------------------------------------------------------
-
-#if 0
-void bli_thrinfo_grow_tree
+void bli_thrinfo_print
      (
-       rntm_t*    rntm,
-       cntl_t*    cntl,
        thrinfo_t* thread
      )
 {
-	cntl_t*    cntl_jc     = cntl;
-	thrinfo_t* thrinfo_jc  = thread;
-
-	bli_thrinfo_grow( rntm, cntl_jc, thrinfo_jc );
-
-	// inside jc loop:
-	cntl_t*    cntl_pc     = bli_cntl_sub_node( cntl_jc );
-	thrinfo_t* thrinfo_pc  = bli_thrinfo_sub_node( thrinfo_jc );
-
-	bli_thrinfo_grow( rntm, cntl_pc, thrinfo_pc );
-
-	// inside pc loop:
-	cntl_t*    cntl_pb     = bli_cntl_sub_node( cntl_pc );
-	thrinfo_t* thrinfo_pb  = bli_thrinfo_sub_node( thrinfo_pc );
-
-	bli_thrinfo_grow( rntm, cntl_pb, thrinfo_pb );
-
-	// after pb packing:
-	cntl_t*    cntl_ic     = bli_cntl_sub_node( cntl_pb );
-	thrinfo_t* thrinfo_ic  = bli_thrinfo_sub_node( thrinfo_pb );
-
-	bli_thrinfo_grow( rntm, cntl_ic, thrinfo_ic );
-
-	// -- main branch --
-
-	// inside ic loop:
-	cntl_t*    cntl_pa     = bli_cntl_sub_node( cntl_ic );
-	thrinfo_t* thrinfo_pa  = bli_thrinfo_sub_node( thrinfo_ic );
-
-	bli_thrinfo_grow( rntm, cntl_pa, thrinfo_pa );
-
-	// after pa packing:
-	cntl_t*    cntl_jr     = bli_cntl_sub_node( cntl_pa );
-	thrinfo_t* thrinfo_jr  = bli_thrinfo_sub_node( thrinfo_pa );
-
-	bli_thrinfo_grow( rntm, cntl_jr, thrinfo_jr );
-
-	// inside jr loop:
-	//cntl_t*    cntl_ir     = bli_cntl_sub_node( cntl_jr );
-	//thrinfo_t* thrinfo_ir  = bli_thrinfo_sub_node( thrinfo_jr );
-
-	// -- trsm branch --
-
-	// inside ic loop:
-	cntl_t*    cntl_pa0    = bli_cntl_sub_prenode( cntl_ic );
-	thrinfo_t* thrinfo_pa0 = bli_thrinfo_sub_prenode( thrinfo_ic );
-
-	bli_thrinfo_grow( rntm, cntl_pa0, thrinfo_pa0 );
-
-	// after pa packing:
-	cntl_t*    cntl_jr0    = bli_cntl_sub_node( cntl_pa0 );
-	thrinfo_t* thrinfo_jr0 = bli_thrinfo_sub_node( thrinfo_pa0 );
-
-	bli_thrinfo_grow( rntm, cntl_jr0, thrinfo_jr0 );
-
-	// inside jr loop:
-	//cntl_t*    cntl_ir0   = bli_cntl_sub_node( cntl_jr0 );
-	//thrinfo_t* thrinfo_ir0= bli_thrinfo_sub_node( thrinfo_jr0 );
+	printf( " lvl   nt  tid nway wkid free\n" );
+	bli_thrinfo_print_sub( thread, 0 );
 }
 
-void bli_thrinfo_grow_tree_ic
+void bli_thrinfo_print_sub
      (
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
+       thrinfo_t* thread,
+       gint_t     level
      )
 {
-	cntl_t*    cntl_ic     = cntl;
-	thrinfo_t* thrinfo_ic  = thread;
-
-	bli_thrinfo_grow( rntm, cntl_ic, thrinfo_ic );
-
-	// -- main branch --
-
-	// inside ic loop:
-	cntl_t*    cntl_pa     = bli_cntl_sub_node( cntl_ic );
-	thrinfo_t* thrinfo_pa  = bli_thrinfo_sub_node( thrinfo_ic );
-
-	bli_thrinfo_grow( rntm, cntl_pa, thrinfo_pa );
-
-	// after pa packing:
-	cntl_t*    cntl_jr     = bli_cntl_sub_node( cntl_pa );
-	thrinfo_t* thrinfo_jr  = bli_thrinfo_sub_node( thrinfo_pa );
-
-	bli_thrinfo_grow( rntm, cntl_jr, thrinfo_jr );
-
-	// inside jr loop:
-	//cntl_t*    cntl_ir     = bli_cntl_sub_node( cntl_jr );
-	//thrinfo_t* thrinfo_ir  = bli_thrinfo_sub_node( thrinfo_jr );
-
-	// -- trsm branch --
-
-	// inside ic loop:
-	cntl_t*    cntl_pa0    = bli_cntl_sub_prenode( cntl_ic );
-	thrinfo_t* thrinfo_pa0 = bli_thrinfo_sub_prenode( thrinfo_ic );
-
-	bli_thrinfo_grow( rntm, cntl_pa0, thrinfo_pa0 );
-
-	// after pa packing:
-	cntl_t*    cntl_jr0    = bli_cntl_sub_node( cntl_pa0 );
-	thrinfo_t* thrinfo_jr0 = bli_thrinfo_sub_node( thrinfo_pa0 );
-
-	bli_thrinfo_grow( rntm, cntl_jr0, thrinfo_jr0 );
-
-	// inside jr loop:
-	//cntl_t*    cntl_ir0   = bli_cntl_sub_node( cntl_jr0 );
-	//thrinfo_t* thrinfo_ir0= bli_thrinfo_sub_node( thrinfo_jr0 );
+	if ( thread == NULL ) return; // Nothing to print for an absent subtree.
+
+	printf( "%4ld %4ld %4ld %4ld %4ld %4ld\n",
+	        ( long )level,
+	        ( long )bli_thrinfo_num_threads( thread ),
+	        ( long )bli_thrinfo_thread_id( thread ),
+	        ( long )bli_thrinfo_n_way( thread ),
+	        ( long )bli_thrinfo_work_id( thread ),
+	        ( long )bli_thrinfo_needs_free_comm( thread )); // casts match the signed %ld conversions
+
+	bli_thrinfo_print_sub( bli_thrinfo_sub_prenode( thread ), level+1 ); // sub_prenode subtree first
+	bli_thrinfo_print_sub( bli_thrinfo_sub_node( thread ), level+1 ); // then the sub_node subtree
 }
-#endif
+
diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h
index 9d234bc91..d15fb49f6 100644
--- a/frame/thread/bli_thrinfo.h
+++ b/frame/thread/bli_thrinfo.h
@@ -41,15 +41,16 @@ struct thrinfo_s
 {
 	// The thread communicator for the other threads sharing the same work
 	// at this level.
-	thrcomm_t*         ocomm;
+	thrcomm_t*         comm;
 
-	// Our thread id within the ocomm thread communicator.
-	dim_t              ocomm_id;
+	// Our thread id within the thread communicator.
+	dim_t              thread_id;
 
-	// The number of distinct threads used to parallelize the loop.
+	// The number of communicators which are "siblings" of our communicator.
 	dim_t              n_way;
 
-	// What we're working on.
+	// An id to identify what we're working on. This is the same for all threads
+	// in the same communicator, and 0 <= work_id < n_way.
 	dim_t              work_id;
 
 	// When freeing, should the communicators in this node be freed? Usually,
@@ -58,9 +59,14 @@ struct thrinfo_s
 	// to false.
 	bool               free_comm;
 
-	// The bszid_t to help identify the node. This is mostly only useful when
-	// debugging or tracing the allocation and release of thrinfo_t nodes.
-	bszid_t            bszid;
+	// The small block pool.
+	pool_t*            sba_pool;
+
+	// The packing block allocator.
+	pba_t*             pba;
+
+	// Storage for allocated memory obtained from the packing block allocator.
+	mem_t              mem;
 
 	struct thrinfo_s*  sub_prenode;
 	struct thrinfo_s*  sub_node;
@@ -69,35 +75,33 @@ typedef struct thrinfo_s thrinfo_t;
 
 //
 // thrinfo_t functions
-// NOTE: The naming of these should be made consistent at some point.
-// (ie: bli_thrinfo_ vs. bli_thread_)
 //
 
 // thrinfo_t query (field only)
 
-BLIS_INLINE dim_t bli_thread_num_threads( const thrinfo_t* t )
+BLIS_INLINE dim_t bli_thrinfo_num_threads( const thrinfo_t* t )
 {
-	return (t->ocomm)->n_threads;
+	return (t->comm)->n_threads;
 }
 
-BLIS_INLINE dim_t bli_thread_ocomm_id( const thrinfo_t* t )
+BLIS_INLINE dim_t bli_thrinfo_thread_id( const thrinfo_t* t )
 {
-	return t->ocomm_id;
+	return t->thread_id;
 }
 
-BLIS_INLINE dim_t bli_thread_n_way( const thrinfo_t* t )
+BLIS_INLINE dim_t bli_thrinfo_n_way( const thrinfo_t* t )
 {
 	return t->n_way;
 }
 
-BLIS_INLINE dim_t bli_thread_work_id( const thrinfo_t* t )
+BLIS_INLINE dim_t bli_thrinfo_work_id( const thrinfo_t* t )
 {
 	return t->work_id;
 }
 
-BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( const thrinfo_t* t )
+BLIS_INLINE thrcomm_t* bli_thrinfo_comm( const thrinfo_t* t )
 {
-	return t->ocomm;
+	return t->comm;
 }
 
 BLIS_INLINE bool bli_thrinfo_needs_free_comm( const thrinfo_t* t )
@@ -105,9 +109,19 @@ BLIS_INLINE bool bli_thrinfo_needs_free_comm( const thrinfo_t* t )
 	return t->free_comm;
 }
 
-BLIS_INLINE dim_t bli_thread_bszid( const thrinfo_t* t )
+BLIS_INLINE pool_t* bli_thrinfo_sba_pool( const thrinfo_t* t )
+{
+	return t->sba_pool;
+}
+
+BLIS_INLINE pba_t* bli_thrinfo_pba( const thrinfo_t* t )
+{
+	return t->pba;
+}
+
+BLIS_INLINE mem_t* bli_thrinfo_mem( thrinfo_t* t )
 {
-	return t->bszid;
+	return &t->mem;
 }
 
 BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( const thrinfo_t* t )
@@ -122,21 +136,21 @@ BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( const thrinfo_t* t )
 
 // thrinfo_t query (complex)
 
-BLIS_INLINE bool bli_thread_am_ochief( const thrinfo_t* t )
+BLIS_INLINE bool bli_thrinfo_am_chief( const thrinfo_t* t )
 {
-	return t->ocomm_id == 0;
+	return t->thread_id == 0;
 }
 
 // thrinfo_t modification
 
-BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t )
+BLIS_INLINE void bli_thrinfo_set_comm( thrcomm_t* comm, thrinfo_t* t )
 {
-	t->ocomm = ocomm;
+	t->comm = comm;
 }
 
-BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t )
+BLIS_INLINE void bli_thrinfo_set_thread_id( dim_t thread_id, thrinfo_t* t )
 {
-	t->ocomm_id = ocomm_id;
+	t->thread_id = thread_id;
 }
 
 BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t )
@@ -154,9 +168,14 @@ BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t )
 	t->free_comm = free_comm;
 }
 
-BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t )
+BLIS_INLINE void bli_thrinfo_set_sba_pool( pool_t* sba_pool, thrinfo_t* t )
+{
+	t->sba_pool = sba_pool;
+}
+
+BLIS_INLINE void bli_thrinfo_set_pba( pba_t* pba, thrinfo_t* t )
 {
-	t->bszid = bszid;
+	t->pba = pba;
 }
 
 BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t )
@@ -171,22 +190,14 @@ BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t*
 
 // other thrinfo_t-related functions
 
-BLIS_INLINE void* bli_thread_broadcast( const rntm_t* rntm, const thrinfo_t* t, void* p )
+BLIS_INLINE void* bli_thrinfo_broadcast( const thrinfo_t* t, void* p )
 {
-	// We can't use any bli_rntm_*() APIs here because they haven't been
-	// defined yet. So we have to manually access the timpl_t field (le ugh).
-	//const timpl_t ti = bli_rntm_thread_impl( rntm );
-
-	return bli_thrcomm_bcast( rntm->thread_impl, t->ocomm_id, p, t->ocomm );
+	return bli_thrcomm_bcast( t->thread_id, p, t->comm );
 }
 
-BLIS_INLINE void bli_thread_barrier( const rntm_t* rntm, const thrinfo_t* t )
+BLIS_INLINE void bli_thrinfo_barrier( const thrinfo_t* t )
 {
-	// We can't use any bli_rntm_*() APIs here because they haven't been
-	// defined yet. So we have to manually access the timpl_t field (le ugh).
-	//const timpl_t ti = bli_rntm_thread_impl( rntm );
-
-	bli_thrcomm_barrier( rntm->thread_impl, t->ocomm_id, t->ocomm );
+	bli_thrcomm_barrier( t->thread_id, t->comm );
 }
 
 
@@ -194,98 +205,47 @@ BLIS_INLINE void bli_thread_barrier( const rntm_t* rntm, const thrinfo_t* t )
 // Prototypes for level-3 thrinfo functions not specific to any operation.
 //
 
-thrinfo_t* bli_thrinfo_create
+thrinfo_t* bli_thrinfo_create_root
      (
-       rntm_t*    rntm,
-       thrcomm_t* ocomm,
-       dim_t      ocomm_id,
-       dim_t      n_way,
-       dim_t      work_id,
-       bool       free_comm,
-       bszid_t    bszid,
-       thrinfo_t* sub_node
+       thrcomm_t* comm,
+       dim_t      thread_id,
+       pool_t*    sba_pool,
+       pba_t*     pba
      );
 
-void bli_thrinfo_init
+thrinfo_t* bli_thrinfo_create
      (
-       thrinfo_t* thread,
-       thrcomm_t* ocomm,
-       dim_t      ocomm_id,
+       thrcomm_t* comm,
+       dim_t      thread_id,
        dim_t      n_way,
        dim_t      work_id,
        bool       free_comm,
-       bszid_t    bszid,
-       thrinfo_t* sub_node
-     );
-
-void bli_thrinfo_init_single
-     (
-       thrinfo_t* thread
+       pool_t*    sba_pool,
+       pba_t*     pba
      );
 
-void bli_thrinfo_free
+BLIS_EXPORT_BLIS void bli_thrinfo_free
      (
-       rntm_t*    rntm,
        thrinfo_t* thread
      );
 
 // -----------------------------------------------------------------------------
 
-void bli_thrinfo_grow
+thrinfo_t* bli_thrinfo_split
      (
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
-     );
-
-thrinfo_t* bli_thrinfo_rgrow
-     (
-       rntm_t*    rntm,
-       cntl_t*    cntl_par,
-       cntl_t*    cntl_cur,
-       thrinfo_t* thread_par
-     );
-
-thrinfo_t* bli_thrinfo_create_for_cntl
-     (
-       rntm_t*    rntm,
-       cntl_t*    cntl_par,
-       cntl_t*    cntl_chl,
-       thrinfo_t* thread_par
-     );
-
-thrinfo_t* bli_thrinfo_rgrow_prenode
-     (
-       rntm_t*    rntm,
-       cntl_t*    cntl_par,
-       cntl_t*    cntl_cur,
+       dim_t      n_way,
        thrinfo_t* thread_par
      );
 
-thrinfo_t* bli_thrinfo_create_for_cntl_prenode
+void bli_thrinfo_print
      (
-       rntm_t*    rntm,
-       cntl_t*    cntl_par,
-       cntl_t*    cntl_chl,
-       thrinfo_t* thread_par
-     );
-
-// -----------------------------------------------------------------------------
-
-#if 0
-void bli_thrinfo_grow_tree
-     (
-       rntm_t*    rntm,
-       cntl_t*    cntl,
        thrinfo_t* thread
      );
 
-void bli_thrinfo_grow_tree_ic
+void bli_thrinfo_print_sub
      (
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
+       thrinfo_t* thread,
+       gint_t     level
      );
-#endif
 
 #endif
diff --git a/frame/thread/bli_thrinfo_sup.c b/frame/thread/bli_thrinfo_sup.c
deleted file mode 100644
index 26a40e00f..000000000
--- a/frame/thread/bli_thrinfo_sup.c
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_thrinfo_sup_grow
-     (
-             rntm_t*    rntm,
-       const bszid_t*   bszid_par,
-             thrinfo_t* thread
-     )
-{
-	if ( thread == &BLIS_GEMM_SINGLE_THREADED ||
-	     thread == &BLIS_PACKM_SINGLE_THREADED ) return;
-
-	// NOTE: If bli_thrinfo_sup_rgrow() is being called, the sub_node field will
-	// always be non-NULL, and so there's no need to check it.
-	//if ( bli_cntl_sub_node( cntl ) != NULL )
-	{
-		// We only need to take action if the thrinfo_t sub-node is NULL; if it
-		// is non-NULL, then it has already been created and we'll use it as-is.
-		if ( bli_thrinfo_sub_node( thread ) == NULL )
-		{
-			// Create a new node (or, if needed, multiple nodes) along the
-			// main sub-node branch of the tree and return the pointer to the
-			// (highest) child.
-			thrinfo_t* thread_child = bli_thrinfo_sup_rgrow
-			(
-			  rntm,
-			  bszid_par,
-			  &bszid_par[1],
-			  thread
-			);
-
-			// Attach the child thrinfo_t node for the primary branch to its
-			// parent structure.
-			bli_thrinfo_set_sub_node( thread_child, thread );
-		}
-	}
-}
-
-// -----------------------------------------------------------------------------
-
-thrinfo_t* bli_thrinfo_sup_rgrow
-     (
-             rntm_t*    rntm,
-       const bszid_t*   bszid_par,
-       const bszid_t*   bszid_cur,
-             thrinfo_t* thread_par
-     )
-{
-	thrinfo_t* thread_cur;
-
-	// We must handle two cases: those where the next node in the
-	// control tree is a partitioning node, and those where it is
-	// a non-partitioning (ie: packing) node.
-	if ( *bszid_cur != BLIS_NO_PART )
-	{
-		// Create the child thrinfo_t node corresponding to cntl_cur,
-		// with cntl_par being the parent.
-		thread_cur = bli_thrinfo_sup_create_for_cntl
-		(
-		  rntm,
-		  bszid_par,
-		  bszid_cur,
-		  thread_par
-		);
-	}
-	else // if ( *bszid_cur == BLIS_NO_PART )
-	{
-		// Recursively grow the thread structure and return the top-most
-		// thrinfo_t node of that segment.
-		thrinfo_t* thread_seg = bli_thrinfo_sup_rgrow
-		(
-		  rntm,
-		  bszid_par,
-		  &bszid_cur[1],
-		  thread_par
-		);
-
-		// Create a thrinfo_t node corresponding to cntl_cur. Since the
-		// corresponding cntl node, cntl_cur, is a non-partitioning node
-		// (bszid = BLIS_NO_PART), this means it's a packing node. Packing
-		// thrinfo_t nodes are formed differently than those corresponding to
-		// partitioning nodes; specifically, their work_id's are set equal to
-		// the their comm_id's. Also, notice that the free_comm field is set
-		// to FALSE since cntl_cur is a non-partitioning node. The reason:
-		// the communicator used here will be freed when thread_seg, or one
-		// of its descendents, is freed.
-		thread_cur = bli_thrinfo_create
-		(
-		  rntm,                                            // rntm
-		  bli_thrinfo_ocomm( thread_seg ),                 // ocomm
-		  bli_thread_ocomm_id( thread_seg ),               // ocomm_id
-		  bli_rntm_calc_num_threads_in( bszid_cur, rntm ), // n_way
-		  bli_thread_ocomm_id( thread_seg ),               // work_id
-		  FALSE,                                           // free_comm
-		  BLIS_NO_PART,                                    // bszid
-		  thread_seg                                       // sub_node
-		);
-	}
-
-	return thread_cur;
-}
-
-#define BLIS_NUM_STATIC_COMMS 80
-
-thrinfo_t* bli_thrinfo_sup_create_for_cntl
-     (
-             rntm_t*    rntm,
-       const bszid_t*   bszid_par,
-       const bszid_t*   bszid_chl,
-             thrinfo_t* thread_par
-     )
-{
-	// If we are running with a single thread, all of the code can be reduced
-	// and simplified to this.
-	if ( bli_rntm_calc_num_threads( rntm ) == 1 )
-	{
-		thrinfo_t* thread_chl = bli_thrinfo_create
-		(
-		  rntm,               // rntm
-		  &BLIS_SINGLE_COMM,  // ocomm
-		  0,                  // ocomm_id
-		  1,                  // n_way
-		  0,                  // work_id
-		  FALSE,              // free_comm
-		  BLIS_NO_PART,       // bszid
-		  NULL                // sub_node
-		);
-
-		return thread_chl;
-	}
-
-	// The remainder of this function handles the cases involving the use of
-	// multiple BLIS threads.
-
-	if ( bli_rntm_pack_a( rntm ) == FALSE &&
-	     bli_rntm_pack_b( rntm ) == FALSE )
-	{
-		// If we are packing neither A nor B, there are no broadcasts or barriers
-		// needed to synchronize threads (since all threads can work completely
-		// independently). In this special case situation, the thrinfo_t can be
-		// created with much simpler logic.
-
-		const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par );
-
-		// Compute:
-		// - the number of threads inside the new child comm,
-		// - the current thread's id within the new communicator,
-		// - the current thread's work id, given the ways of parallelism
-		//   to be obtained within the next loop.
-		const dim_t child_nt_in   = bli_rntm_calc_num_threads_in( bszid_chl, rntm );
-		const dim_t child_n_way   = bli_rntm_ways_for( *bszid_chl, rntm );
-		const dim_t child_comm_id = parent_comm_id % child_nt_in;
-		const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );
-
-		// All threads create a new thrinfo_t node using the communicator
-		// that was created by their chief, as identified by parent_work_id.
-		thrinfo_t* thread_chl = bli_thrinfo_create
-		(
-		  rntm,                        // rntm
-		  NULL,                        // ocomm
-		  child_comm_id,               // ocomm_id
-		  child_n_way,                 // n_way
-		  child_work_id,               // work_id
-		  TRUE,                        // free_comm
-		  *bszid_chl,                  // bszid
-		  NULL                         // sub_node
-		);
-
-		return thread_chl;
-	}
-	else
-	{
-		// If we are packing at least one of A or B, then we use the general
-		// approach that employs broadcasts and barriers.
-
-		thrcomm_t*  static_comms[ BLIS_NUM_STATIC_COMMS ];
-		thrcomm_t** new_comms = NULL;
-
-		const dim_t parent_nt_in   = bli_thread_num_threads( thread_par );
-		const dim_t parent_n_way   = bli_thread_n_way( thread_par );
-		const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par );
-		const dim_t parent_work_id = bli_thread_work_id( thread_par );
-
-		// Sanity check: make sure the number of threads in the parent's
-		// communicator is divisible by the number of new sub-groups.
-		if ( parent_nt_in % parent_n_way != 0 )
-		{
-			printf( "Assertion failed: parent_nt_in <mod> parent_n_way != 0\n" );
-			bli_abort();
-		}
-
-		// Compute:
-		// - the number of threads inside the new child comm,
-		// - the current thread's id within the new communicator,
-		// - the current thread's work id, given the ways of parallelism
-		//   to be obtained within the next loop.
-		const dim_t child_nt_in   = bli_rntm_calc_num_threads_in( bszid_chl, rntm );
-		const dim_t child_n_way   = bli_rntm_ways_for( *bszid_chl, rntm );
-		const dim_t child_comm_id = parent_comm_id % child_nt_in;
-		const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );
-
-//printf( "thread %d: child_n_way = %d child_nt_in = %d parent_n_way = %d (bszid = %d->%d)\n", (int)child_comm_id, (int)child_nt_in, (int)child_n_way, (int)parent_n_way, (int)bli_cntl_bszid( cntl_par ), (int)bszid_chl );
-
-		// The parent's chief thread creates a temporary array of thrcomm_t
-		// pointers.
-		if ( bli_thread_am_ochief( thread_par ) )
-		{
-			err_t r_val;
-
-			if ( parent_n_way > BLIS_NUM_STATIC_COMMS )
-				new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val );
-			else
-				new_comms = static_comms;
-		}
-
-		// Broadcast the temporary array to all threads in the parent's
-		// communicator.
-		new_comms = bli_thread_broadcast( rntm, thread_par, new_comms );
-
-		// Chiefs in the child communicator allocate the communicator
-		// object and store it in the array element corresponding to the
-		// parent's work id.
-		if ( child_comm_id == 0 )
-			new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );
-
-		bli_thread_barrier( rntm, thread_par );
-
-		// All threads create a new thrinfo_t node using the communicator
-		// that was created by their chief, as identified by parent_work_id.
-		thrinfo_t* thread_chl = bli_thrinfo_create
-		(
-		  rntm,                        // rntm
-		  new_comms[ parent_work_id ], // ocomm
-		  child_comm_id,               // ocomm_id
-		  child_n_way,                 // n_way
-		  child_work_id,               // work_id
-		  TRUE,                        // free_comm
-		  *bszid_chl,                  // bszid
-		  NULL                         // sub_node
-		);
-
-		bli_thread_barrier( rntm, thread_par );
-
-		// The parent's chief thread frees the temporary array of thrcomm_t
-		// pointers.
-		if ( bli_thread_am_ochief( thread_par ) )
-		{
-			if ( parent_n_way > BLIS_NUM_STATIC_COMMS )
-				bli_free_intl( new_comms );
-		}
-
-		return thread_chl;
-	}
-}
-
diff --git a/frame/thread/bli_thrinfo_sup.h b/frame/thread/bli_thrinfo_sup.h
deleted file mode 100644
index 1afcd3337..000000000
--- a/frame/thread/bli_thrinfo_sup.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_THRINFO_SUP_H
-#define BLIS_THRINFO_SUP_H
-
-//
-// Prototypes for level-3 thrinfo sup functions.
-//
-
-void bli_thrinfo_sup_grow
-     (
-             rntm_t*    rntm,
-       const bszid_t*   bszid_par,
-             thrinfo_t* thread
-     );
-
-thrinfo_t* bli_thrinfo_sup_rgrow
-     (
-             rntm_t*    rntm,
-       const bszid_t*   bszid_par,
-       const bszid_t*   bszid_cur,
-             thrinfo_t* thread_par
-     );
-
-thrinfo_t* bli_thrinfo_sup_create_for_cntl
-     (
-             rntm_t*    rntm,
-       const bszid_t*   bszid_par,
-       const bszid_t*   bszid_chl,
-             thrinfo_t* thread_par
-     );
-
-#endif
diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c
index abc9c9089..5bd03882a 100644
--- a/frame/util/bli_util_tapi.c
+++ b/frame/util/bli_util_tapi.c
@@ -74,7 +74,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  ( ctype* )x, incx, \
 	            asum, \
 	  ( cntx_t* )cntx, \
-	  rntm  \
+	  ( rntm_t* )rntm  \
 	); \
 }
 
@@ -110,7 +110,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  m, \
 	  a, rs_a, cs_a, \
 	  ( cntx_t* )cntx, \
-	  rntm  \
+	  ( rntm_t* )rntm  \
 	); \
 }
 
@@ -153,7 +153,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  ( ctype* )x, incx, \
 	            norm, \
 	  ( cntx_t* )cntx, \
-	  rntm  \
+	  ( rntm_t* )rntm  \
 	); \
 }
 
@@ -204,7 +204,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	  ( ctype* )x, rs_x, cs_x, \
 	            norm, \
 	  ( cntx_t* )cntx, \
-	  rntm  \
+	  ( rntm_t* )rntm  \
 	); \
 }
 
@@ -248,7 +248,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 		  n, \
 		  x, incx, \
 		  ( cntx_t* )cntx, \
-		  rntm  \
+		  ( rntm_t* )rntm  \
 		); \
 \
 		/* Check the 1-norm of the randomzied vector. In the unlikely event that
@@ -310,7 +310,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 		  n, \
 		  x, rs_x, cs_x, \
 		  ( cntx_t* )cntx, \
-		  rntm  \
+		  ( rntm_t* )rntm  \
 		); \
 \
 		/* Check the 1-norm of the randomzied matrix. In the unlikely event that
@@ -366,7 +366,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	            scale, \
 	            sumsq, \
 	  ( cntx_t* )cntx, \
-	  rntm  \
+	  ( rntm_t* )rntm  \
 	); \
 }
 
diff --git a/sandbox/gemmlike/attic/bls_gemm_bp_var2.c b/sandbox/gemmlike/attic/bls_gemm_bp_var2.c
index 8caccf923..133786843 100644
--- a/sandbox/gemmlike/attic/bls_gemm_bp_var2.c
+++ b/sandbox/gemmlike/attic/bls_gemm_bp_var2.c
@@ -374,8 +374,8 @@ void PASTECH2(bls_,ch,varname) \
 				/* Query the number of threads and thread ids for the JR loop.
 				   NOTE: These values are only needed when computing the next
 				   micropanel of B. */ \
-				const dim_t jr_nt  = bli_thread_n_way( thread_jr ); \
-				const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
+				const dim_t jr_nt  = bli_thrinfo_n_way( thread_jr ); \
+				const dim_t jr_tid = bli_thrinfo_work_id( thread_jr ); \
 \
 				/* Compute number of primary and leftover components of the JR loop. */ \
 				dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
@@ -404,8 +404,8 @@ void PASTECH2(bls_,ch,varname) \
 					/* Query the number of threads and thread ids for the IR loop.
 					   NOTE: These values are only needed when computing the next
 					   micropanel of A. */ \
-					const dim_t ir_nt  = bli_thread_n_way( thread_ir ); \
-					const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
+					const dim_t ir_nt  = bli_thrinfo_n_way( thread_ir ); \
+					const dim_t ir_tid = bli_thrinfo_work_id( thread_ir ); \
 \
 					/* Compute number of primary and leftover components of the IR loop. */ \
 					dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
@@ -464,7 +464,7 @@ void PASTECH2(bls_,ch,varname) \
 			/* This barrier is needed to prevent threads from starting to pack
 			   the next row panel of B before the current row panel is fully
 			   computed upon. */ \
-			bli_thread_barrier( thread_pb ); \
+			bli_thrinfo_barrier( thread_pb ); \
 		} \
 	} \
 \
diff --git a/sandbox/gemmlike/bli_gemm_ex.c b/sandbox/gemmlike/bli_gemm_ex.c
index fe220e603..f8e8f86f7 100644
--- a/sandbox/gemmlike/bli_gemm_ex.c
+++ b/sandbox/gemmlike/bli_gemm_ex.c
@@ -52,7 +52,7 @@ void bli_gemm_ex
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -90,7 +90,7 @@ void bli_gemm_ex
 	bli_gemm_front
 	(
 	  ( obj_t* )alpha, ( obj_t* )a, ( obj_t* )b, ( obj_t* )beta, ( obj_t* )c,
-	  ( cntx_t* )cntx, ( rntm_t* )rntm, NULL
+	  ( cntx_t* )cntx, ( rntm_t* )rntm
 	);
 }
 
diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c
index 1e567a114..ba930ebc5 100644
--- a/sandbox/gemmlike/bls_gemm.c
+++ b/sandbox/gemmlike/bls_gemm.c
@@ -67,7 +67,7 @@ void bls_gemm_ex
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -75,8 +75,8 @@ void bls_gemm_ex
 	// Initialize a local runtime with global settings if necessary. Note
 	// that in the case that a runtime is passed in, we make a local copy.
 	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
+	else                { rntm_l = *rntm;                       }
 
 	// Set the .pack_a and .pack_b fields to TRUE. This is only needed because
 	// this sandbox uses bli_thrinfo_sup_grow(), which calls
@@ -87,8 +87,8 @@ void bls_gemm_ex
 	// while this sandbox implementation executes (and it also reinforces the
 	// fact that we *are* indeed packing A and B, albeit not in the sup context
 	// originally envisioned for the .pack_a and .pack_b fields).
-	bli_rntm_set_pack_a( TRUE, rntm );
-	bli_rntm_set_pack_b( TRUE, rntm );
+	bli_rntm_set_pack_a( TRUE, &rntm_l );
+	bli_rntm_set_pack_b( TRUE, &rntm_l );
 
 	// Obtain a valid (native) context from the gks if necessary.
 	// NOTE: This must be done before calling the _check() function, since
@@ -166,7 +166,7 @@ void bls_gemm_ex
 	  bli_obj_length( &c_local ),
 	  bli_obj_width( &c_local ),
 	  bli_obj_width( &a_local ),
-	  rntm
+	  &rntm_l
 	);
 
 	// Spawn threads (if applicable), where bls_gemm_int() is the thread entry
@@ -182,7 +182,7 @@ void bls_gemm_ex
 	  ( obj_t* )beta,
 	  ( obj_t* )&c_local,
 	  ( cntx_t* )cntx,
-	  rntm
+	  &rntm_l
 	);
 }
 
diff --git a/sandbox/gemmlike/bls_gemm.h b/sandbox/gemmlike/bls_gemm.h
index d01c6647e..7380f02ad 100644
--- a/sandbox/gemmlike/bls_gemm.h
+++ b/sandbox/gemmlike/bls_gemm.h
@@ -53,7 +53,7 @@ void bls_gemm_ex
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-             rntm_t* rntm
+       const rntm_t* rntm
      );
 
 //
diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c
index c8fd50083..28c5032bc 100644
--- a/sandbox/gemmlike/bls_gemm_bp_var1.c
+++ b/sandbox/gemmlike/bls_gemm_bp_var1.c
@@ -186,42 +186,13 @@ void PASTECH2(bls_,ch,varname) \
 \
 	auxinfo_t       aux; \
 \
-	/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
-	   needed for the matrix we will be packing (if any), but we do it
-	   unconditionally to be safe. */ \
-	mem_t mem_a = BLIS_MEM_INITIALIZER; \
-	mem_t mem_b = BLIS_MEM_INITIALIZER; \
-\
-	/* Define an array of bszid_t ids, which will act as our substitute for
-	   the cntl_t tree. */ \
-	bszid_t bszids[8] = { BLIS_NC,      /* 5th loop */ \
-	                      BLIS_KC,      /* 4th loop */ \
-	                      BLIS_NO_PART, /* pack B */ \
-	                      BLIS_MC,      /* 3rd loop */ \
-	                      BLIS_NO_PART, /* pack A */ \
-	                      BLIS_NR,      /* 2nd loop */ \
-	                      BLIS_MR,      /* 1st loop */ \
-	                      BLIS_KR };    /* microkernel loop */  \
-\
-	bszid_t* restrict bszids_jc = &bszids[0]; \
-	bszid_t* restrict bszids_pc = &bszids[1]; \
-	/*bszid_t* restrict bszids_pb = &bszids[2];*/ \
-	bszid_t* restrict bszids_ic = &bszids[3]; \
-	/*bszid_t* restrict bszids_pa = &bszids[4];*/ \
-	bszid_t* restrict bszids_jr = &bszids[5]; \
-	/*bszid_t* restrict bszids_ir = &bszids[6];*/ \
-\
-	thrinfo_t* restrict thread_jc = NULL; \
-	thrinfo_t* restrict thread_pc = NULL; \
-	thrinfo_t* restrict thread_pb = NULL; \
-	thrinfo_t* restrict thread_ic = NULL; \
-	thrinfo_t* restrict thread_pa = NULL; \
-	thrinfo_t* restrict thread_jr = NULL; \
-	thrinfo_t* restrict thread_ir = NULL; \
-\
-	/* Identify the current thrinfo_t node and then grow the tree. */ \
-	thread_jc = thread; \
-	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
+	thrinfo_t* restrict thread_jc = bli_thrinfo_sub_node( thread ); \
+	thrinfo_t* restrict thread_pc = bli_thrinfo_sub_node( thread_jc ); \
+	thrinfo_t* restrict thread_pb = bli_thrinfo_sub_node( thread_pc ); \
+	thrinfo_t* restrict thread_ic = bli_thrinfo_sub_node( thread_pb ); \
+	thrinfo_t* restrict thread_pa = bli_thrinfo_sub_node( thread_ic ); \
+	thrinfo_t* restrict thread_jr = bli_thrinfo_sub_node( thread_pa ); \
+	thrinfo_t* restrict thread_ir = bli_thrinfo_sub_node( thread_jr ); \
 \
 	/* Compute the JC loop thread range for the current thread. */ \
 	dim_t jc_start, jc_end; \
@@ -240,10 +211,6 @@ void PASTECH2(bls_,ch,varname) \
 \
 		ctype* restrict b_jc = b_00 + jj * jcstep_b; \
 		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
-\
-		/* Identify the current thrinfo_t node and then grow the tree. */ \
-		thread_pc = bli_thrinfo_sub_node( thread_jc ); \
-		bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
 \
 		/* Compute the PC loop thread range for the current thread. */ \
 		const dim_t pc_start = 0, pc_end = k; \
@@ -267,14 +234,6 @@ void PASTECH2(bls_,ch,varname) \
 \
 			ctype* b_use; \
 			inc_t  rs_b_use, cs_b_use, ps_b_use; \
-\
-			/* Identify the current thrinfo_t node. Note that the thrinfo_t
-			   node will have already been created by a previous call to
-			   bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
-			   cause the tree to grow by two (e.g. to the next bszid that is
-			   a normal bszid_t value). */ \
-			thread_pb = bli_thrinfo_sub_node( thread_pc ); \
-			/*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \
 \
 			/* Determine the packing buffer and related parameters for matrix
 			   B. Then call the packm implementation. */ \
@@ -288,18 +247,12 @@ void PASTECH2(bls_,ch,varname) \
 			  &b_use, &rs_b_use, &cs_b_use, \
 			                     &ps_b_use, \
 			  cntx, \
-			  rntm, \
-			  &mem_b, \
 			  thread_pb  \
 			); \
 \
 			/* Alias b_use so that it's clear this is our current block of
 			   matrix B. */ \
 			ctype* restrict b_pc_use = b_use; \
-\
-			/* Identify the current thrinfo_t node and then grow the tree. */ \
-			thread_ic = bli_thrinfo_sub_node( thread_pb ); \
-			bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
 \
 			/* Compute the IC loop thread range for the current thread. */ \
 			dim_t ic_start, ic_end; \
@@ -321,14 +274,6 @@ void PASTECH2(bls_,ch,varname) \
 \
 				ctype* a_use; \
 				inc_t  rs_a_use, cs_a_use, ps_a_use; \
-\
-				/* Identify the current thrinfo_t node. Note that the thrinfo_t
-				   node will have already been created by a previous call to
-				   bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
-				   cause the tree to grow by two (e.g. to the next bszid that is
-				   a normal bszid_t value). */ \
-				thread_pa = bli_thrinfo_sub_node( thread_ic ); \
-				/*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \
 \
 				/* Determine the packing buffer and related parameters for matrix
 				   A. Then call the packm implementation. */ \
@@ -342,24 +287,18 @@ void PASTECH2(bls_,ch,varname) \
 				  &a_use, &rs_a_use, &cs_a_use, \
 				                     &ps_a_use, \
 				  cntx, \
-				  rntm, \
-				  &mem_a, \
 				  thread_pa  \
 				); \
 \
 				/* Alias a_use so that it's clear this is our current block of
 				   matrix A. */ \
 				ctype* restrict a_ic_use = a_use; \
-\
-				/* Identify the current thrinfo_t node and then grow the tree. */ \
-				thread_jr = bli_thrinfo_sub_node( thread_pa ); \
-				bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
 \
 				/* Query the number of threads and thread ids for the JR loop.
 				   NOTE: These values are only needed when computing the next
 				   micropanel of B. */ \
-				const dim_t jr_nt  = bli_thread_n_way( thread_jr ); \
-				const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
+				const dim_t jr_nt  = bli_thrinfo_n_way( thread_jr ); \
+				const dim_t jr_tid = bli_thrinfo_work_id( thread_jr ); \
 \
 				/* Compute number of primary and leftover components of the JR loop. */ \
 				dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
@@ -381,15 +320,12 @@ void PASTECH2(bls_,ch,varname) \
 					/* Assume for now that our next panel of B to be the current panel
 					   of B. */ \
 					ctype* restrict b2 = b_jr; \
-\
-					/* Identify the current thrinfo_t node. */ \
-					thread_ir = bli_thrinfo_sub_node( thread_jr ); \
 \
 					/* Query the number of threads and thread ids for the IR loop.
 					   NOTE: These values are only needed when computing the next
 					   micropanel of A. */ \
-					const dim_t ir_nt  = bli_thread_n_way( thread_ir ); \
-					const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
+					const dim_t ir_nt  = bli_thrinfo_n_way( thread_ir ); \
+					const dim_t ir_tid = bli_thrinfo_work_id( thread_ir ); \
 \
 					/* Compute number of primary and leftover components of the IR loop. */ \
 					dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
@@ -446,23 +382,9 @@ void PASTECH2(bls_,ch,varname) \
 			/* This barrier is needed to prevent threads from starting to pack
 			   the next row panel of B before the current row panel is fully
 			   computed upon. */ \
-			bli_thread_barrier( rntm, thread_pb ); \
+			bli_thrinfo_barrier( thread_pb ); \
 		} \
 	} \
-\
-	/* Release any memory that was acquired for packing matrices A and B. */ \
-	PASTECH2(bls_,ch,packm_finalize_mem_a) \
-	( \
-	  rntm, \
-	  &mem_a, \
-	  thread_pa  \
-	); \
-	PASTECH2(bls_,ch,packm_finalize_mem_b) \
-	( \
-	  rntm, \
-	  &mem_b, \
-	  thread_pb  \
-	); \
 \
 /*
 PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \
diff --git a/frame/thread/bli_l3_sup_decor.c b/sandbox/gemmlike/bls_l3_decor.c
similarity index 50%
rename from frame/thread/bli_l3_sup_decor.c
rename to sandbox/gemmlike/bls_l3_decor.c
index 53c7b41be..4bf030586 100644
--- a/frame/thread/bli_l3_sup_decor.c
+++ b/sandbox/gemmlike/bls_l3_decor.c
@@ -34,51 +34,79 @@
 
 #include "blis.h"
 
-// Initialize a function pointer array containing function addresses for
-// each of the threading-specific level-3 sup thread decorators.
-
-static l3_sup_decor_ft l3_sup_decor_fpa[ BLIS_NUM_THREAD_IMPLS ] =
+struct l3_sbx_decor_params_s // bundle handed to every launched thread
 {
-	[BLIS_SINGLE] = bli_l3_sup_thread_decorator_single,
-	[BLIS_OPENMP] =
-#if   defined(BLIS_ENABLE_OPENMP)
-	                bli_l3_sup_thread_decorator_openmp,
-#elif defined(BLIS_ENABLE_PTHREADS)
-	                NULL,
-#else
-	                NULL,
-#endif
-	[BLIS_POSIX]  =
-#if   defined(BLIS_ENABLE_PTHREADS)
-	                bli_l3_sup_thread_decorator_pthreads,
-#elif defined(BLIS_ENABLE_OPENMP)
-	                NULL,
-#else
-	                NULL,
-#endif
+	l3sbxint_ft func;   // function invoked by each thread
+	opid_t      family; // operation family; not used by the entry function
+	obj_t*      alpha;  // forwarded to func
+	obj_t*      a;      // forwarded to func
+	obj_t*      b;      // forwarded to func
+	obj_t*      beta;   // forwarded to func
+	obj_t*      c;      // forwarded to func
+	cntx_t*     cntx;   // forwarded to func
+	rntm_t*     rntm;   // used to create the thrinfo_t; also forwarded to func
+	array_t*    array;  // array_t checked out from the sba by the decorator
 };
+typedef struct l3_sbx_decor_params_s l3_sbx_decor_params_t;
+
+static void bls_l3_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, const void* data_void )
+{
+	const l3_sbx_decor_params_t* data   = data_void;
+	// Thread entry point: unpack the params struct given to bli_thread_launch().
+	l3sbxint_ft func   = data->func;
+	opid_t      family = data->family;
+	obj_t*      alpha  = data->alpha;
+	obj_t*      a      = data->a;
+	obj_t*      b      = data->b;
+	obj_t*      beta   = data->beta;
+	obj_t*      c      = data->c;
+	cntx_t*     cntx   = data->cntx;
+	rntm_t*     rntm   = data->rntm;
+	array_t*    array  = data->array;
+	// The family is carried in the params struct but is not used below.
+	( void )family;
+
+	// Create the root node of the thread's thrinfo_t structure.
+	pool_t*    sba_pool = bli_apool_array_elem( tid, array );
+	thrinfo_t* thread   = bli_l3_sup_thrinfo_create( tid, gl_comm, sba_pool, rntm );
+	// Call the supplied function with the operands and this thread's thrinfo_t.
+	func
+	(
+	  alpha,
+	  a,
+	  b,
+	  beta,
+	  c,
+	  cntx,
+	  rntm,
+	  thread
+	);
 
-// Define a dispatcher that chooses a threading-specific function from the
-// above function pointer array.
+	// Free the current thread's thrinfo_t structure.
+	bli_thrinfo_free( thread );
+}
 
-err_t bli_l3_sup_thread_decorator
+void bls_l3_thread_decorator
      (
-             l3supint_ft func,
-             opid_t      family,
-       const obj_t*      alpha,
-       const obj_t*      a,
-       const obj_t*      b,
-       const obj_t*      beta,
-       const obj_t*      c,
-       const cntx_t*     cntx,
-             rntm_t*     rntm
+       l3sbxint_ft func,
+       opid_t      family,
+       obj_t*      alpha,
+       obj_t*      a,
+       obj_t*      b,
+       obj_t*      beta,
+       obj_t*      c,
+       cntx_t*     cntx,
+       rntm_t*     rntm
      )
 {
-	rntm_t rntm_l;
+	rntm_t rntm_l = *rntm;
 
 	// Query the threading implementation and the number of threads requested.
-	timpl_t ti = bli_rntm_thread_impl( rntm );
-	dim_t   nt = bli_rntm_num_threads( rntm );
+	timpl_t ti = bli_rntm_thread_impl( &rntm_l );
+	dim_t   nt = bli_rntm_num_threads( &rntm_l );
+
+	if ( bli_error_checking_is_enabled() )
+		bli_l3_thread_decorator_check( &rntm_l );
 
 #ifdef BLIS_ENABLE_NT1_VIA_SINGLE
 	if ( nt == 1 )
@@ -101,37 +129,39 @@ err_t bli_l3_sup_thread_decorator
 		// than one thread. Here, we choose to favor the requested threading
 		// implementation over the number of threads, and so reset all
 		// parallelism parameters to 1.
-		rntm_l = *rntm;
 		nt = 1;
 		bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l );
 		bli_rntm_set_num_threads_only( 1, &rntm_l );
-		rntm = &rntm_l;
 	}
 
-	// Use the timpl_t value to index into the corresponding function address
-	// from the function pointer array.
-	const l3_sup_decor_ft fp = l3_sup_decor_fpa[ ti ];
-
-	// Call the threading-specific decorator function.
-	return fp
-	(
-	  func,
-	  family,
-	  alpha,
-	  a,
-	  b,
-	  beta,
-	  c,
-	  cntx,
-	  rntm
-	);
-}
-
-void bli_l3_sup_thread_decorator_check
-     (
-       rntm_t* rntm
-     )
-{
-	bli_l3_sup_thread_decorator_check( rntm );
+	// Check out an array_t from the small block allocator. This is done
+	// with an internal lock to ensure only one application thread accesses
+	// the sba at a time. bli_sba_checkout_array() will also automatically
+	// resize the array_t, if necessary.
+	array_t* array = bli_sba_checkout_array( nt );
+
+	// Declare a params struct and embed within it all of the information
+	// that is relevant to the computation.
+	l3_sbx_decor_params_t params;
+	params.func   = func;
+	params.family = family;
+	params.alpha  = alpha;
+	params.a      = a;
+	params.b      = b;
+	params.beta   = beta;
+	params.c      = c;
+	params.cntx   = cntx;
+	params.rntm   = &rntm_l;
+	params.array  = array;
+
+	// Launch the threads using the threading implementation specified by ti,
+	// and use bls_l3_thread_decorator_entry() as their entry points. The
+	// params struct will be passed along to each thread.
+	bli_thread_launch( ti, nt, bls_l3_thread_decorator_entry, &params );
+
+	// Check the array_t back into the small block allocator. Similar to the
+	// check-out, this is done using a lock embedded within the sba to ensure
+	// mutual exclusion.
+	bli_sba_checkin_array( array );
 }
 
diff --git a/sandbox/gemmlike/thread/bls_l3_decor.h b/sandbox/gemmlike/bls_l3_decor.h
similarity index 79%
rename from sandbox/gemmlike/thread/bls_l3_decor.h
rename to sandbox/gemmlike/bls_l3_decor.h
index 58b076270..524c24f38 100644
--- a/sandbox/gemmlike/thread/bls_l3_decor.h
+++ b/sandbox/gemmlike/bls_l3_decor.h
@@ -48,21 +48,6 @@ typedef void (*l3sbxint_ft)
        thrinfo_t* thread
      );
 
-// Level-3 thread decorator function type.
-typedef void (*l3sbx_decor_ft)
-     (
-       l3sbxint_ft func,
-       opid_t      family,
-       obj_t*      alpha,
-       obj_t*      a,
-       obj_t*      b,
-       obj_t*      beta,
-       obj_t*      c,
-       cntx_t*     cntx,
-       rntm_t*     rntm
-     );
-
-// Level-3 thread decorator prototype.
 void bls_l3_thread_decorator
      (
        l3sbxint_ft func,
@@ -76,15 +61,5 @@ void bls_l3_thread_decorator
        rntm_t*     rntm
      );
 
-void bls_l3_thread_decorator_check
-     (
-       rntm_t* rntm
-     );
-
-// Include definitions specific to the method of multithreading.
-#include "bls_l3_decor_single.h"
-#include "bls_l3_decor_openmp.h"
-#include "bls_l3_decor_pthreads.h"
-
 #endif
 
diff --git a/sandbox/gemmlike/bls_l3_packm_a.c b/sandbox/gemmlike/bls_l3_packm_a.c
index 9e1f67fc5..412c6c24e 100644
--- a/sandbox/gemmlike/bls_l3_packm_a.c
+++ b/sandbox/gemmlike/bls_l3_packm_a.c
@@ -43,8 +43,6 @@ void PASTECH2(bls_,ch,opname) \
        dim_t            k, \
        dim_t            mr, \
        cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
        thrinfo_t* restrict thread  \
      ) \
 { \
@@ -61,16 +59,18 @@ void PASTECH2(bls_,ch,opname) \
 \
 	/* Barrier to make sure all threads are caught up and ready to begin the
 	   packm stage. */ \
-	bli_thread_barrier( rntm, thread ); \
+	bli_thrinfo_barrier( thread ); \
 \
 	/* Compute the size of the memory block eneded. */ \
 	siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
+\
+	mem_t* mem = bli_thrinfo_mem( thread ); \
 \
 	/* Check the mem_t entry provided by the caller. If it is unallocated,
 	   then we need to acquire a block from the packed block allocator. */ \
 	if ( bli_mem_is_unalloc( mem ) ) \
 	{ \
-		if ( bli_thread_am_ochief( thread ) ) \
+		if ( bli_thrinfo_am_chief( thread ) ) \
 		{ \
 			/* Acquire directly to the chief thread's mem_t that was passed in.
 			   It needs to be that mem_t struct, and not a local (temporary)
@@ -81,7 +81,7 @@ void PASTECH2(bls_,ch,opname) \
 			   again, I prefer to keep barriers to a minimum.) */ \
 			bli_pba_acquire_m \
 			( \
-			  rntm, \
+			  bli_thrinfo_pba( thread ), \
 			  size_needed, \
 			  pack_buf_type, \
 			  mem  \
@@ -90,13 +90,13 @@ void PASTECH2(bls_,ch,opname) \
 \
 		/* Broadcast the address of the chief thread's passed-in mem_t to all
 		   threads. */ \
-		mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
+		mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \
 \
 		/* Non-chief threads: Copy the contents of the chief thread's
 		   passed-in mem_t to the passed-in mem_t for this thread. (The
 		   chief thread already has the mem_t, so it does not need to
 		   perform any copy.) */ \
-		if ( !bli_thread_am_ochief( thread ) ) \
+		if ( !bli_thrinfo_am_chief( thread ) ) \
 		{ \
 			*mem = *mem_p; \
 		} \
@@ -115,7 +115,7 @@ void PASTECH2(bls_,ch,opname) \
 \
 		if ( mem_size < size_needed ) \
 		{ \
-			if ( bli_thread_am_ochief( thread ) ) \
+			if ( bli_thrinfo_am_chief( thread ) ) \
 			{ \
 				/* The chief thread releases the existing block associated
 				   with the mem_t, and then re-acquires a new block, saving
@@ -125,12 +125,12 @@ void PASTECH2(bls_,ch,opname) \
 				   (temporary) mem_t. */ \
 				bli_pba_release \
 				( \
-				  rntm, \
+				  bli_thrinfo_pba( thread ), \
 				  mem \
 				); \
 				bli_pba_acquire_m \
 				( \
-				  rntm, \
+				  bli_thrinfo_pba( thread ), \
 				  size_needed, \
 				  pack_buf_type, \
 				  mem \
@@ -139,13 +139,13 @@ void PASTECH2(bls_,ch,opname) \
 \
 			/* Broadcast the address of the chief thread's passed-in mem_t
 			   to all threads. */ \
-			mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
+			mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \
 \
 			/* Non-chief threads: Copy the contents of the chief thread's
 			   passed-in mem_t to the passed-in mem_t for this thread. (The
 			   chief thread already has the mem_t, so it does not need to
 			   perform any copy.) */ \
-			if ( !bli_thread_am_ochief( thread ) ) \
+			if ( !bli_thrinfo_am_chief( thread ) ) \
 			{ \
 				*mem = *mem_p; \
 			} \
@@ -165,39 +165,6 @@ GENTFUNC( scomplex, c, packm_init_mem_a )
 GENTFUNC( dcomplex, z, packm_init_mem_a )
 
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
-     ) \
-{ \
-	if ( thread != NULL ) \
-	if ( bli_thread_am_ochief( thread ) ) \
-	{ \
-		/* Check the mem_t entry provided by the caller. Only proceed if it
-		   is allocated, which it should be. */ \
-		if ( bli_mem_is_alloc( mem ) ) \
-		{ \
-			bli_pba_release \
-			( \
-			  rntm, \
-			  mem \
-			); \
-		} \
-	} \
-}
-
-//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_a )
-GENTFUNC( float,    s, packm_finalize_mem_a )
-GENTFUNC( double,   d, packm_finalize_mem_a )
-GENTFUNC( scomplex, c, packm_finalize_mem_a )
-GENTFUNC( dcomplex, z, packm_finalize_mem_a )
-
-
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
@@ -267,8 +234,6 @@ void PASTECH2(bls_,ch,opname) \
        ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                                                  inc_t* restrict ps_p, \
        cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
        thrinfo_t* restrict thread  \
      ) \
 { \
@@ -282,8 +247,6 @@ void PASTECH2(bls_,ch,opname) \
 	( \
 	  m_alloc, k_alloc, mr, \
 	  cntx, \
-	  rntm, \
-	  mem, \
 	  thread  \
 	); \
 \
@@ -295,7 +258,7 @@ void PASTECH2(bls_,ch,opname) \
 	  &m_max, &k_max, \
 	  p, rs_p,  cs_p, \
 	     &pd_p, ps_p, \
-	  mem  \
+	  bli_thrinfo_mem( thread )  \
 	); \
 \
 	/* Pack matrix A to the destination buffer chosen above. Here, the packed
@@ -311,13 +274,13 @@ void PASTECH2(bls_,ch,opname) \
 	  kappa, \
 	  a,  rs_a,  cs_a, \
 	  *p, *rs_p, *cs_p, \
-		  pd_p,  *ps_p, \
+	       pd_p, *ps_p, \
 	  cntx, \
 	  thread  \
 	); \
 \
 	/* Barrier so that packing is done before computation. */ \
-	bli_thread_barrier( rntm, thread ); \
+	bli_thrinfo_barrier( thread ); \
 }
 
 //INSERT_GENTFUNC_BASIC0( packm_a )
diff --git a/sandbox/gemmlike/bls_l3_packm_a.h b/sandbox/gemmlike/bls_l3_packm_a.h
index 201a24efa..2ab53dcbf 100644
--- a/sandbox/gemmlike/bls_l3_packm_a.h
+++ b/sandbox/gemmlike/bls_l3_packm_a.h
@@ -41,8 +41,6 @@ void PASTECH2(bls_,ch,opname) \
        dim_t            k, \
        dim_t            mr, \
        cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
        thrinfo_t* restrict thread  \
      ); \
 
@@ -53,23 +51,6 @@ GENTPROT( scomplex, c, packm_init_mem_a )
 GENTPROT( dcomplex, z, packm_init_mem_a )
 
 
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
-     ); \
-
-//INSERT_GENTPROT_BASIC0( packm_finalize_mem_a )
-GENTPROT( float,    s, packm_finalize_mem_a )
-GENTPROT( double,   d, packm_finalize_mem_a )
-GENTPROT( scomplex, c, packm_finalize_mem_a )
-GENTPROT( dcomplex, z, packm_finalize_mem_a )
-
-
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
@@ -109,8 +90,6 @@ void PASTECH2(bls_,ch,opname) \
        ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                                                  inc_t* restrict ps_p, \
        cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
        thrinfo_t* restrict thread  \
      ); \
 
diff --git a/sandbox/gemmlike/bls_l3_packm_b.c b/sandbox/gemmlike/bls_l3_packm_b.c
index cb8275fae..cc9757b1d 100644
--- a/sandbox/gemmlike/bls_l3_packm_b.c
+++ b/sandbox/gemmlike/bls_l3_packm_b.c
@@ -43,8 +43,6 @@ void PASTECH2(bls_,ch,opname) \
        dim_t            n, \
        dim_t            nr, \
        cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
        thrinfo_t* restrict thread  \
      ) \
 { \
@@ -61,16 +59,18 @@ void PASTECH2(bls_,ch,opname) \
 \
 	/* Barrier to make sure all threads are caught up and ready to begin the
 	   packm stage. */ \
-	bli_thread_barrier( rntm, thread ); \
+	bli_thrinfo_barrier( thread ); \
 \
 	/* Compute the size of the memory block eneded. */ \
 	siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
+\
+	mem_t* mem = bli_thrinfo_mem( thread ); \
 \
 	/* Check the mem_t entry provided by the caller. If it is unallocated,
 	   then we need to acquire a block from the packed block allocator. */ \
 	if ( bli_mem_is_unalloc( mem ) ) \
 	{ \
-		if ( bli_thread_am_ochief( thread ) ) \
+		if ( bli_thrinfo_am_chief( thread ) ) \
 		{ \
 			/* Acquire directly to the chief thread's mem_t that was passed in.
 			   It needs to be that mem_t struct, and not a local (temporary)
@@ -81,7 +81,7 @@ void PASTECH2(bls_,ch,opname) \
 			   again, I prefer to keep barriers to a minimum.) */ \
 			bli_pba_acquire_m \
 			( \
-			  rntm, \
+			  bli_thrinfo_pba( thread ), \
 			  size_needed, \
 			  pack_buf_type, \
 			  mem  \
@@ -90,13 +90,13 @@ void PASTECH2(bls_,ch,opname) \
 \
 		/* Broadcast the address of the chief thread's passed-in mem_t to all
 		   threads. */ \
-		mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
+		mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \
 \
 		/* Non-chief threads: Copy the contents of the chief thread's
 		   passed-in mem_t to the passed-in mem_t for this thread. (The
 		   chief thread already has the mem_t, so it does not need to
 		   perform any copy.) */ \
-		if ( !bli_thread_am_ochief( thread ) ) \
+		if ( !bli_thrinfo_am_chief( thread ) ) \
 		{ \
 			*mem = *mem_p; \
 		} \
@@ -115,7 +115,7 @@ void PASTECH2(bls_,ch,opname) \
 \
 		if ( mem_size < size_needed ) \
 		{ \
-			if ( bli_thread_am_ochief( thread ) ) \
+			if ( bli_thrinfo_am_chief( thread ) ) \
 			{ \
 				/* The chief thread releases the existing block associated
 				   with the mem_t, and then re-acquires a new block, saving
@@ -125,12 +125,12 @@ void PASTECH2(bls_,ch,opname) \
 				   (temporary) mem_t. */ \
 				bli_pba_release \
 				( \
-				  rntm, \
+				  bli_thrinfo_pba( thread ), \
 				  mem \
 				); \
 				bli_pba_acquire_m \
 				( \
-				  rntm, \
+				  bli_thrinfo_pba( thread ), \
 				  size_needed, \
 				  pack_buf_type, \
 				  mem \
@@ -139,13 +139,13 @@ void PASTECH2(bls_,ch,opname) \
 \
 			/* Broadcast the address of the chief thread's passed-in mem_t
 			   to all threads. */ \
-			mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
+			mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \
 \
 			/* Non-chief threads: Copy the contents of the chief thread's
 			   passed-in mem_t to the passed-in mem_t for this thread. (The
 			   chief thread already has the mem_t, so it does not need to
 			   perform any copy.) */ \
-			if ( !bli_thread_am_ochief( thread ) ) \
+			if ( !bli_thrinfo_am_chief( thread ) ) \
 			{ \
 				*mem = *mem_p; \
 			} \
@@ -165,39 +165,6 @@ GENTFUNC( scomplex, c, packm_init_mem_b )
 GENTFUNC( dcomplex, z, packm_init_mem_b )
 
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
-     ) \
-{ \
-	if ( thread != NULL ) \
-	if ( bli_thread_am_ochief( thread ) ) \
-	{ \
-		/* Check the mem_t entry provided by the caller. Only proceed if it
-		   is allocated, which it should be. */ \
-		if ( bli_mem_is_alloc( mem ) ) \
-		{ \
-			bli_pba_release \
-			( \
-			  rntm, \
-			  mem \
-			); \
-		} \
-	} \
-}
-
-//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_b )
-GENTFUNC( float,    s, packm_finalize_mem_b )
-GENTFUNC( double,   d, packm_finalize_mem_b )
-GENTFUNC( scomplex, c, packm_finalize_mem_b )
-GENTFUNC( dcomplex, z, packm_finalize_mem_b )
-
-
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
@@ -267,8 +234,6 @@ void PASTECH2(bls_,ch,opname) \
        ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                                                  inc_t* restrict ps_p, \
        cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
        thrinfo_t* restrict thread  \
      ) \
 { \
@@ -282,8 +247,6 @@ void PASTECH2(bls_,ch,opname) \
 	( \
 	  k_alloc, n_alloc, nr, \
 	  cntx, \
-	  rntm, \
-	  mem, \
 	  thread  \
 	); \
 \
@@ -295,7 +258,7 @@ void PASTECH2(bls_,ch,opname) \
 	  &k_max, &n_max, \
 	  p, rs_p,  cs_p, \
 	     &pd_p, ps_p, \
-	  mem  \
+	  bli_thrinfo_mem( thread )  \
 	); \
 \
 	/* Pack matrix B to the destination buffer chosen above. Here, the packed
@@ -311,13 +274,13 @@ void PASTECH2(bls_,ch,opname) \
 	  kappa, \
 	  b,  rs_b,  cs_b, \
 	  *p, *rs_p, *cs_p, \
-		  pd_p,  *ps_p, \
+	       pd_p, *ps_p, \
 	  cntx, \
 	  thread  \
 	); \
 \
 	/* Barrier so that packing is done before computation. */ \
-	bli_thread_barrier( rntm, thread ); \
+	bli_thrinfo_barrier( thread ); \
 }
 
 //INSERT_GENTFUNC_BASIC0( packm_b )
diff --git a/sandbox/gemmlike/bls_l3_packm_b.h b/sandbox/gemmlike/bls_l3_packm_b.h
index 728d21aed..791cf9b71 100644
--- a/sandbox/gemmlike/bls_l3_packm_b.h
+++ b/sandbox/gemmlike/bls_l3_packm_b.h
@@ -41,8 +41,6 @@ void PASTECH2(bls_,ch,opname) \
        dim_t            n, \
        dim_t            nr, \
        cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
        thrinfo_t* restrict thread  \
      ); \
 
@@ -53,23 +51,6 @@ GENTPROT( scomplex, c, packm_init_mem_b )
 GENTPROT( dcomplex, z, packm_init_mem_b )
 
 
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
-       thrinfo_t* restrict thread  \
-     ); \
-
-//INSERT_GENTPROT_BASIC0( packm_finalize_mem_b )
-GENTPROT( float,    s, packm_finalize_mem_b )
-GENTPROT( double,   d, packm_finalize_mem_b )
-GENTPROT( scomplex, c, packm_finalize_mem_b )
-GENTPROT( dcomplex, z, packm_finalize_mem_b )
-
-
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
@@ -109,8 +90,6 @@ void PASTECH2(bls_,ch,opname) \
        ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                                                  inc_t* restrict ps_p, \
        cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
-       mem_t*  restrict mem, \
        thrinfo_t* restrict thread  \
      ); \
 
diff --git a/sandbox/gemmlike/bls_l3_packm_var.h b/sandbox/gemmlike/bls_l3_packm_var.h
index 98300536b..4c6db2cac 100644
--- a/sandbox/gemmlike/bls_l3_packm_var.h
+++ b/sandbox/gemmlike/bls_l3_packm_var.h
@@ -41,7 +41,7 @@
 \
 void PASTECH2(bls_,ch,varname) \
      ( \
-       trans_t          transc, \
+       conj_t           conjc, \
        pack_t           schema, \
        dim_t            m, \
        dim_t            n, \
diff --git a/sandbox/gemmlike/bls_l3_packm_var1.c b/sandbox/gemmlike/bls_l3_packm_var1.c
index c0649a9ec..e4d566b44 100644
--- a/sandbox/gemmlike/bls_l3_packm_var1.c
+++ b/sandbox/gemmlike/bls_l3_packm_var1.c
@@ -43,7 +43,7 @@
 \
 void PASTECH2(bls_,ch,varname) \
      ( \
-       trans_t          transc, \
+       conj_t           conjc, \
        pack_t           schema, \
        dim_t            m, \
        dim_t            n, \
@@ -73,11 +73,6 @@ void PASTECH2(bls_,ch,varname) \
 	inc_t           incc; \
 	inc_t           ldc; \
 	inc_t           ldp; \
-	conj_t          conjc; \
-\
-\
-	/* Extract the conjugation bit from the transposition argument. */ \
-	conjc = bli_extract_conj( transc ); \
 \
 	/* Create flags to incidate row or column storage. Note that the
 	   schema bit that encodes row or column is describing the form of
@@ -126,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \
 \
 	/* Query the number of threads and thread ids from the current thread's
 	   packm thrinfo_t node. */ \
-	const dim_t nt  = bli_thread_n_way( thread ); \
-	const dim_t tid = bli_thread_work_id( thread ); \
+	const dim_t nt  = bli_thrinfo_num_threads( thread ); \
+	const dim_t tid = bli_thrinfo_thread_id( thread ); \
 \
 	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
 	( void )nt; \
diff --git a/sandbox/gemmlike/bls_l3_packm_var2.c b/sandbox/gemmlike/bls_l3_packm_var2.c
index 8d2b90cac..3e7e7888a 100644
--- a/sandbox/gemmlike/bls_l3_packm_var2.c
+++ b/sandbox/gemmlike/bls_l3_packm_var2.c
@@ -43,7 +43,7 @@
 \
 void PASTECH2(bls_,ch,varname) \
      ( \
-       trans_t          transc, \
+       conj_t           conjc, \
        pack_t           schema, \
        dim_t            m, \
        dim_t            n, \
@@ -73,11 +73,6 @@ void PASTECH2(bls_,ch,varname) \
 	inc_t           incc; \
 	inc_t           ldc; \
 	inc_t           ldp; \
-	conj_t          conjc; \
-\
-\
-	/* Extract the conjugation bit from the transposition argument. */ \
-	conjc = bli_extract_conj( transc ); \
 \
 	/* Create flags to incidate row or column storage. Note that the
 	   schema bit that encodes row or column is describing the form of
@@ -126,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \
 \
 	/* Query the number of threads and thread ids from the current thread's
 	   packm thrinfo_t node. */ \
-	const dim_t nt  = bli_thread_n_way( thread ); \
-	const dim_t tid = bli_thread_work_id( thread ); \
+	const dim_t nt  = bli_thrinfo_num_threads( thread ); \
+	const dim_t tid = bli_thrinfo_thread_id( thread ); \
 \
 	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
 	( void )nt; \
diff --git a/sandbox/gemmlike/bls_l3_packm_var3.c b/sandbox/gemmlike/bls_l3_packm_var3.c
index 5ea80ff42..4ccb1828d 100644
--- a/sandbox/gemmlike/bls_l3_packm_var3.c
+++ b/sandbox/gemmlike/bls_l3_packm_var3.c
@@ -45,7 +45,7 @@
 \
 void PASTECH2(bls_,ch,varname) \
      ( \
-       trans_t          transc, \
+       conj_t           conjc, \
        pack_t           schema, \
        dim_t            m, \
        dim_t            n, \
@@ -75,11 +75,6 @@ void PASTECH2(bls_,ch,varname) \
 	inc_t           incc; \
 	inc_t           ldc; \
 	inc_t           ldp; \
-	conj_t          conjc; \
-\
-\
-	/* Extract the conjugation bit from the transposition argument. */ \
-	conjc = bli_extract_conj( transc ); \
 \
 	/* Create flags to incidate row or column storage. Note that the
 	   schema bit that encodes row or column is describing the form of
@@ -126,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \
 \
 	/* Query the number of threads and thread ids from the current thread's
 	   packm thrinfo_t node. */ \
-	const dim_t nt  = bli_thread_n_way( thread ); \
-	const dim_t tid = bli_thread_work_id( thread ); \
+	const dim_t nt  = bli_thrinfo_num_threads( thread ); \
+	const dim_t tid = bli_thrinfo_thread_id( thread ); \
 \
 	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
 	( void )nt; \
diff --git a/sandbox/gemmlike/thread/bls_l3_decor.c b/sandbox/gemmlike/thread/bls_l3_decor.c
deleted file mode 100644
index 7fa799f14..000000000
--- a/sandbox/gemmlike/thread/bls_l3_decor.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-// Initialize a function pointer array containing function addresses for
-// each of the threading-specific level-3 thread decorators.
-
-static l3sbx_decor_ft l3_decor_fpa[ BLIS_NUM_THREAD_IMPLS ] =
-{
-	[BLIS_SINGLE] = bls_l3_thread_decorator_single,
-	[BLIS_OPENMP] =
-#if   defined(BLIS_ENABLE_OPENMP)
-	                bls_l3_thread_decorator_openmp,
-#elif defined(BLIS_ENABLE_PTHREADS)
-	                NULL,
-#else
-	                NULL,
-#endif
-	[BLIS_POSIX]  =
-#if   defined(BLIS_ENABLE_PTHREADS)
-	                bls_l3_thread_decorator_pthreads,
-#elif defined(BLIS_ENABLE_OPENMP)
-	                NULL,
-#else
-	                NULL,
-#endif
-};
-
-// Define a dispatcher that chooses a threading-specific function from the
-// above function pointer array.
-
-void bls_l3_thread_decorator
-     (
-       l3sbxint_ft func,
-       opid_t      family,
-       obj_t*      alpha,
-       obj_t*      a,
-       obj_t*      b,
-       obj_t*      beta,
-       obj_t*      c,
-       cntx_t*     cntx,
-       rntm_t*     rntm
-     )
-{
-	rntm_t rntm_l;
-
-	// Query the threading implementation and the number of threads requested.
-	timpl_t ti = bli_rntm_thread_impl( rntm );
-	dim_t   nt = bli_rntm_num_threads( rntm );
-
-	if ( bli_error_checking_is_enabled() )
-		bls_l3_thread_decorator_check( rntm );
-
-	if ( 1 < nt && ti == BLIS_SINGLE )
-	{
-		// Here, we resolve conflicting information. The caller requested
-		// a sequential threading implementation, but also requested more
-		// than one thread. Here, we choose to favor the requested threading
-		// implementation over the number of threads, and so reset all
-		// parallelism parameters to 1.
-		rntm_l = *rntm;
-		nt = 1;
-		bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l );
-		bli_rntm_set_num_threads_only( 1, &rntm_l );
-		rntm = &rntm_l;
-	}
-
-	// Use the timpl_t value to index into the corresponding function address
-	// from the function pointer array.
-	const l3sbx_decor_ft fp = l3_decor_fpa[ ti ];
-
-	// Call the threading-specific decorator function.
-	fp
-	(
-	  func,
-	  family,
-	  alpha,
-	  a,
-	  b,
-	  beta,
-	  c,
-	  cntx,
-	  rntm
-	);
-}
-
-void bls_l3_thread_decorator_check
-     (
-       rntm_t* rntm
-     )
-{
-	//err_t e_val;
-
-	//e_val = bli_check_valid_thread_impl( bli_rntm_thread_impl( rntm ) );
-	//bli_check_error_code( e_val );
-
-	const timpl_t ti = bli_rntm_thread_impl( rntm );
-
-	if (
-#ifndef BLIS_ENABLE_OPENMP
-	    ti == BLIS_OPENMP ||
-#endif
-#ifndef BLIS_ENABLE_PTHREADS
-	    ti == BLIS_POSIX ||
-#endif
-	    FALSE
-	   )
-	{
-		fprintf( stderr, "\n" );
-		fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) );
-		fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) );
-		fprintf( stderr, "libblis: %s: line %d\n", __FILE__, ( int )__LINE__ );
-		bli_abort();
-	}
-}
-
diff --git a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c b/sandbox/gemmlike/thread/bls_l3_decor_openmp.c
deleted file mode 100644
index 9c29ef27e..000000000
--- a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2021, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#ifdef BLIS_ENABLE_OPENMP
-
-//#define PRINT_THRINFO
-
-void bls_l3_thread_decorator_openmp
-     (
-       l3sbxint_ft func,
-       opid_t     family,
-       obj_t*     alpha,
-       obj_t*     a,
-       obj_t*     b,
-       obj_t*     beta,
-       obj_t*     c,
-       cntx_t*    cntx,
-       rntm_t*    rntm
-     )
-{
-	// Query the total number of threads from the rntm_t object.
-	const dim_t n_threads = bli_rntm_num_threads( rntm );
-
-	// NOTE: The sba was initialized in bli_init().
-
-	// Check out an array_t from the small block allocator. This is done
-	// with an internal lock to ensure only one application thread accesses
-	// the sba at a time. bli_sba_checkout_array() will also automatically
-	// resize the array_t, if necessary.
-	array_t* array = bli_sba_checkout_array( n_threads );
-
-	// Access the pool_t* for thread 0 and embed it into the rntm. We do
-	// this up-front only so that we have the rntm_t.sba_pool field
-	// initialized and ready for the global communicator creation below.
-	bli_sba_rntm_set_pool( 0, array, rntm );
-
-	// Set the packing block allocator field of the rntm. This will be
-	// inherited by all of the child threads when they make local copies of
-	// the rntm below.
-	bli_pba_rntm_set_pba( rntm );
-
-	// Allcoate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
-
-
-	_Pragma( "omp parallel num_threads(n_threads)" )
-	{
-		// Create a thread-local copy of the master thread's rntm_t. This is
-		// necessary since we want each thread to be able to track its own
-		// small block pool_t as it executes down the function stack.
-		rntm_t           rntm_l = *rntm;
-		rntm_t* restrict rntm_p = &rntm_l;
-
-		// Query the thread's id from OpenMP.
-		const dim_t tid = omp_get_thread_num();
-
-		// Check for a somewhat obscure OpenMP thread-mistmatch issue.
-		bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
-
-		// Use the thread id to access the appropriate pool_t* within the
-		// array_t, and use it to set the sba_pool field within the rntm_t.
-		// If the pool_t* element within the array_t is NULL, it will first
-		// be allocated/initialized.
-		bli_sba_rntm_set_pool( tid, array, rntm_p );
-
-		thrinfo_t* thread = NULL;
-
-		// Create the root node of the thread's thrinfo_t structure.
-		bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
-
-		func
-		(
-		  alpha,
-		  a,
-		  b,
-		  beta,
-		  c,
-		  cntx,
-		  rntm_p,
-		  thread
-		);
-
-		// Free the current thread's thrinfo_t structure.
-		bli_l3_sup_thrinfo_free( rntm_p, thread );
-	}
-
-	// We shouldn't free the global communicator since it was already freed
-	// by the global communicator's chief thread in bli_l3_thrinfo_free()
-	// (called from the thread entry function).
-
-	// Check the array_t back into the small block allocator. Similar to the
-	// check-out, this is done using a lock embedded within the sba to ensure
-	// mutual exclusion.
-	bli_sba_checkin_array( array );
-}
-
-#endif
-
diff --git a/sandbox/gemmlike/thread/bls_l3_decor_openmp.h b/sandbox/gemmlike/thread/bls_l3_decor_openmp.h
deleted file mode 100644
index 8198a1ba1..000000000
--- a/sandbox/gemmlike/thread/bls_l3_decor_openmp.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2021, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SBX_L3_DECOR_OPENMP_H
-#define BLIS_SBX_L3_DECOR_OPENMP_H
-
-// Definitions specific to situations when OpenMP multithreading is enabled.
-#ifdef BLIS_ENABLE_OPENMP
-
-void bls_l3_thread_decorator_openmp
-     (
-       l3sbxint_ft func,
-       opid_t      family,
-       obj_t*      alpha,
-       obj_t*      a,
-       obj_t*      b,
-       obj_t*      beta,
-       obj_t*      c,
-       cntx_t*     cntx,
-       rntm_t*     rntm
-     );
-
-#endif
-
-#endif
-
diff --git a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c
deleted file mode 100644
index 95d0e968e..000000000
--- a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2021, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#ifdef BLIS_ENABLE_PTHREADS
-
-// A data structure to assist in passing operands to additional threads.
-typedef struct thread_data
-{
-	l3sbxint_ft func;
-	opid_t      family;
-	obj_t*      alpha;
-	obj_t*      a;
-	obj_t*      b;
-	obj_t*      beta;
-	obj_t*      c;
-	cntx_t*     cntx;
-	rntm_t*     rntm;
-	dim_t       tid;
-	thrcomm_t*  gl_comm;
-	array_t*    array;
-} thread_data_t;
-
-// Entry point function for additional threads.
-void* bls_l3_thread_entry( void* data_void )
-{
-	thread_data_t* data     = data_void;
-
-	l3sbxint_ft    func     = data->func;
-	opid_t         family   = data->family;
-	obj_t*         alpha    = data->alpha;
-	obj_t*         a        = data->a;
-	obj_t*         b        = data->b;
-	obj_t*         beta     = data->beta;
-	obj_t*         c        = data->c;
-	cntx_t*        cntx     = data->cntx;
-	rntm_t*        rntm     = data->rntm;
-	dim_t          tid      = data->tid;
-	array_t*       array    = data->array;
-	thrcomm_t*     gl_comm  = data->gl_comm;
-
-	( void )family;
-
-	// Create a thread-local copy of the master thread's rntm_t. This is
-	// necessary since we want each thread to be able to track its own
-	// small block pool_t as it executes down the function stack.
-	rntm_t           rntm_l = *rntm;
-	rntm_t* restrict rntm_p = &rntm_l;
-
-	// Use the thread id to access the appropriate pool_t* within the
-	// array_t, and use it to set the sba_pool field within the rntm_t.
-	// If the pool_t* element within the array_t is NULL, it will first
-	// be allocated/initialized.
-	bli_sba_rntm_set_pool( tid, array, rntm_p );
-
-	thrinfo_t* thread = NULL;
-
-	// Create the root node of the current thread's thrinfo_t structure.
-	bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
-
-	func
-	(
-	  alpha,
-	  a,
-	  b,
-	  beta,
-	  c,
-	  cntx,
-	  rntm_p,
-	  thread
-	);
-
-	// Free the current thread's thrinfo_t structure.
-	bli_l3_sup_thrinfo_free( rntm_p, thread );
-
-	return NULL;
-}
-
-void bls_l3_thread_decorator_pthreads
-     (
-       l3sbxint_ft func,
-       opid_t      family,
-       obj_t*      alpha,
-       obj_t*      a,
-       obj_t*      b,
-       obj_t*      beta,
-       obj_t*      c,
-       cntx_t*     cntx,
-       rntm_t*     rntm
-     )
-{
-	err_t r_val;
-
-	// Query the total number of threads from the context.
-	const dim_t n_threads = bli_rntm_num_threads( rntm );
-
-	// NOTE: The sba was initialized in bli_init().
-
-	// Check out an array_t from the small block allocator. This is done
-	// with an internal lock to ensure only one application thread accesses
-	// the sba at a time. bli_sba_checkout_array() will also automatically
-	// resize the array_t, if necessary.
-	array_t* restrict array = bli_sba_checkout_array( n_threads );
-
-	// Access the pool_t* for thread 0 and embed it into the rntm. We do
-	// this up-front only so that we have the rntm_t.sba_pool field
-	// initialized and ready for the global communicator creation below.
-	bli_sba_rntm_set_pool( 0, array, rntm );
-
-	// Set the packing block allocator field of the rntm. This will be
-	// inherited by all of the child threads when they make local copies of
-	// the rntm below.
-	bli_pba_rntm_set_pba( rntm );
-
-	// Allocate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
-
-	// Allocate an array of pthread objects and auxiliary data structs to pass
-	// to the thread entry functions.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_l3_thread_decorator().pth: " );
-	#endif
-	bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_l3_thread_decorator().pth: " );
-	#endif
-	thread_data_t* datas    = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val );
-
-	// NOTE: We must iterate backwards so that the chief thread (thread id 0)
-	// can spawn all other threads before proceeding with its own computation.
-	for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
-	{
-		// Set up thread data for additional threads (beyond thread 0).
-		datas[tid].func     = func;
-		datas[tid].family   = family;
-		datas[tid].alpha    = alpha;
-		datas[tid].a        = a;
-		datas[tid].b        = b;
-		datas[tid].beta     = beta;
-		datas[tid].c        = c;
-		datas[tid].cntx     = cntx;
-		datas[tid].rntm     = rntm;
-		datas[tid].tid      = tid;
-		datas[tid].gl_comm  = gl_comm;
-		datas[tid].array    = array;
-
-		// Spawn additional threads for ids greater than 1.
-		if ( tid != 0 )
-			bli_pthread_create( &pthreads[tid], NULL, &bls_l3_thread_entry, &datas[tid] );
-		else
-			bls_l3_thread_entry( ( void* )(&datas[0]) );
-	}
-
-	// We shouldn't free the global communicator since it was already freed
-	// by the global communicator's chief thread in bli_l3_thrinfo_free()
-	// (called from the thread entry function).
-
-	// Thread 0 waits for additional threads to finish.
-	for ( dim_t tid = 1; tid < n_threads; tid++ )
-	{
-		bli_pthread_join( pthreads[tid], NULL );
-	}
-
-	// Check the array_t back into the small block allocator. Similar to the
-	// check-out, this is done using a lock embedded within the sba to ensure
-	// mutual exclusion.
-	bli_sba_checkin_array( array );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_l3_thread_decorator().pth: " );
-	#endif
-	bli_free_intl( pthreads );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_l3_thread_decorator().pth: " );
-	#endif
-	bli_free_intl( datas );
-}
-
-#else
-
-// Define a dummy function bli_l3_thread_entry(), which is needed for
-// consistent dynamic linking behavior when building shared objects in Linux
-// or OSX, or Windows DLLs; otherwise, we risk having an unresolved symbol.
-void* bli_l3_thread_entry( void* data_void ) { return NULL; }
-
-#endif
-
diff --git a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h
deleted file mode 100644
index 162086bb0..000000000
--- a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2021, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SBX_L3_DECOR_PTHREADS_H
-#define BLIS_SBX_L3_DECOR_PTHREADS_H
-
-// Definitions specific to situations when POSIX multithreading is enabled.
-#ifdef BLIS_ENABLE_PTHREADS
-
-// Thread entry point prototype.
-void* bls_l3_thread_entry( void* data_void );
-
-void bls_l3_thread_decorator_pthreads
-     (
-       l3sbxint_ft func,
-       opid_t      family,
-       obj_t*      alpha,
-       obj_t*      a,
-       obj_t*      b,
-       obj_t*      beta,
-       obj_t*      c,
-       cntx_t*     cntx,
-       rntm_t*     rntm
-     );
-
-#endif
-
-#endif
-
diff --git a/sandbox/gemmlike/thread/bls_l3_decor_single.c b/sandbox/gemmlike/thread/bls_l3_decor_single.c
deleted file mode 100644
index b5f5a6669..000000000
--- a/sandbox/gemmlike/thread/bls_l3_decor_single.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2021, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define SKIP_THRINFO_TREE
-
-void bls_l3_thread_decorator_single
-     (
-       l3sbxint_ft func,
-       opid_t      family,
-       //pack_t      schema_a,
-       //pack_t      schema_b,
-       obj_t*      alpha,
-       obj_t*      a,
-       obj_t*      b,
-       obj_t*      beta,
-       obj_t*      c,
-       cntx_t*     cntx,
-       rntm_t*     rntm
-     )
-{
-	// For sequential execution, we use only one thread.
-	const dim_t n_threads = 1;
-
-	// NOTE: The sba was initialized in bli_init().
-
-	// Check out an array_t from the small block allocator. This is done
-	// with an internal lock to ensure only one application thread accesses
-	// the sba at a time. bli_sba_checkout_array() will also automatically
-	// resize the array_t, if necessary.
-	array_t* array = bli_sba_checkout_array( n_threads );
-
-	// Access the pool_t* for thread 0 and embed it into the rntm.
-	bli_sba_rntm_set_pool( 0, array, rntm );
-
-	// Set the packing block allocator field of the rntm.
-	bli_pba_rntm_set_pba( rntm );
-
-#ifndef SKIP_THRINFO_TREE
-	// Allcoate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads );
-#endif
-
-
-	{
-		// NOTE: We don't need to create another copy of the rntm_t since
-		// it was already copied in one of the high-level oapi functions.
-		rntm_t* rntm_p = rntm;
-
-		// There is only one thread id (for the thief thread).
-		const dim_t tid = 0;
-
-		// Use the thread id to access the appropriate pool_t* within the
-		// array_t, and use it to set the sba_pool field within the rntm_t.
-		// If the pool_t* element within the array_t is NULL, it will first
-		// be allocated/initialized.
-		// NOTE: This is commented out because, in the single-threaded case,
-		// this is redundant since it's already been done above.
-		//bli_sba_rntm_set_pool( tid, array, rntm_p );
-
-#ifndef SKIP_THRINFO_TREE
-		thrinfo_t* thread = NULL;
-
-		// Create the root node of the thread's thrinfo_t structure.
-		bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
-#else
-		// This optimization allows us to use one of the global thrinfo_t
-		// objects for single-threaded execution rather than grow one from
-		// scratch. The key is that bli_thrinfo_sup_grow(), which is called
-		// from within the variants, will immediately return if it detects
-		// that the thrinfo_t* passed into it is either
-		// &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED.
-		thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED;
-
-		( void )tid;
-#endif
-
-		func
-		(
-		  alpha,
-		  a,
-		  b,
-		  beta,
-		  c,
-		  cntx,
-		  rntm_p,
-		  thread
-		);
-
-#ifndef SKIP_THRINFO_TREE
-		// Free the current thread's thrinfo_t structure.
-		bli_l3_sup_thrinfo_free( rntm_p, thread );
-#endif
-	}
-
-	// We shouldn't free the global communicator since it was already freed
-	// by the global communicator's chief thread in bli_l3_thrinfo_free()
-	// (called above).
-
-	// Check the array_t back into the small block allocator. Similar to the
-	// check-out, this is done using a lock embedded within the sba to ensure
-	// mutual exclusion.
-	bli_sba_checkin_array( array );
-}
-
diff --git a/sandbox/old/ref99/old/packm/blx_l3_packm.c b/sandbox/old/ref99/old/packm/blx_l3_packm.c
index 982e2d963..dcec1e8cb 100644
--- a/sandbox/old/ref99/old/packm/blx_l3_packm.c
+++ b/sandbox/old/ref99/old/packm/blx_l3_packm.c
@@ -51,7 +51,7 @@ void blx_l3_packm
 	siz_t     size_needed;
 
 	// FGVZ: Not sure why we need this barrier, but we do.
-	bli_thread_barrier( thread );
+	bli_thrinfo_barrier( thread );
 
 	// Every thread initializes x_pack and determines the size of memory
 	// block needed (which gets embedded into the otherwise "blank" mem_t
@@ -102,7 +102,7 @@ void blx_l3_packm
 
 		// Broadcast the address of the chief thread's local mem_t entry to
 		// all threads.
-		local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
+		local_mem_p = bli_thrinfo_broadcast( thread, &local_mem_s );
 
 		// Save the contents of the chief thread's local mem_t entry to the
 		// mem_t field in this thread's control tree node.
@@ -142,7 +142,7 @@ void blx_l3_packm
 
 			// Broadcast the address of the chief thread's local mem_t entry to
 			// all threads.
-			local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
+			local_mem_p = bli_thrinfo_broadcast( thread, &local_mem_s );
 
 			// Save the chief thread's local mem_t entry to the mem_t field in
 			// this thread's control tree node.
@@ -155,7 +155,7 @@ void blx_l3_packm
 			// will already have the cached values in their local control
 			// trees' mem_t entries, currently pointed to by cntl_mem_p.
 
-			bli_thread_barrier( thread );
+			bli_thrinfo_barrier( thread );
 		}
 	}
 
@@ -178,6 +178,6 @@ void blx_l3_packm
 	);
 
 	// Barrier so that packing is done before computation.
-	bli_thread_barrier( thread );
+	bli_thrinfo_barrier( thread );
 }
 
diff --git a/sandbox/old/ref99/old/vars/blx_gemm_blk_var3.c b/sandbox/old/ref99/old/vars/blx_gemm_blk_var3.c
index 6e8786268..6342f5ab6 100644
--- a/sandbox/old/ref99/old/vars/blx_gemm_blk_var3.c
+++ b/sandbox/old/ref99/old/vars/blx_gemm_blk_var3.c
@@ -73,14 +73,14 @@ void blx_gemm_blk_var3
 		  bli_thrinfo_sub_node( thread )
 		);
 
-		bli_thread_barrier( bli_thrinfo_sub_node( thread ) );
+		bli_thrinfo_barrier( bli_thrinfo_sub_node( thread ) );
 
 		// This variant executes multiple rank-k updates. Therefore, if the
 		// internal beta scalar on matrix C is non-zero, we must use it
 		// only for the first iteration (and then BLIS_ONE for all others).
 		// And since c is a locally aliased obj_t, we can simply overwrite
 		// the internal beta scalar with BLIS_ONE once it has been used in
-		// the first iteration. 
+		// the first iteration.
 		if ( i == 0 ) bli_obj_scalar_reset( c );
 	}
 }
diff --git a/sandbox/old/ref99/old/vars/blx_gemm_ker_var2.c b/sandbox/old/ref99/old/vars/blx_gemm_ker_var2.c
index 10c6b81ad..09d5e2c51 100644
--- a/sandbox/old/ref99/old/vars/blx_gemm_ker_var2.c
+++ b/sandbox/old/ref99/old/vars/blx_gemm_ker_var2.c
@@ -265,10 +265,10 @@ void PASTECH2(blx_,ch,varname) \
 	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
 \
 	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
+	dim_t ir_nt  = bli_thrinfo_n_way( caucus ); \
+	dim_t ir_tid = bli_thrinfo_work_id( caucus ); \
 \
 	dim_t jr_start, jr_end; \
 	dim_t ir_start, ir_end; \
diff --git a/sandbox/old/ref99/old/vars/other/blx_gemm_ker_var2rr.c b/sandbox/old/ref99/old/vars/other/blx_gemm_ker_var2rr.c
index 7cbd402e0..2095f8bd2 100644
--- a/sandbox/old/ref99/old/vars/other/blx_gemm_ker_var2rr.c
+++ b/sandbox/old/ref99/old/vars/other/blx_gemm_ker_var2rr.c
@@ -262,10 +262,10 @@ void PASTECH2(blx_,ch,varname) \
 	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
 \
 	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
+	dim_t ir_nt  = bli_thrinfo_n_way( caucus ); \
+	dim_t ir_tid = bli_thrinfo_work_id( caucus ); \
 \
 	dim_t jr_start, jr_end; \
 	dim_t ir_start, ir_end; \
diff --git a/sandbox/old/ref99/old/vars/other/blx_gemm_ker_var2sl.c b/sandbox/old/ref99/old/vars/other/blx_gemm_ker_var2sl.c
index 2d46886b7..9a1c63a29 100644
--- a/sandbox/old/ref99/old/vars/other/blx_gemm_ker_var2sl.c
+++ b/sandbox/old/ref99/old/vars/other/blx_gemm_ker_var2sl.c
@@ -262,10 +262,10 @@ void PASTECH2(blx_,ch,varname) \
 	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
 \
 	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
+	dim_t jr_nt  = bli_thrinfo_n_way( thread ); \
+	dim_t jr_tid = bli_thrinfo_work_id( thread ); \
+	dim_t ir_nt  = bli_thrinfo_n_way( caucus ); \
+	dim_t ir_tid = bli_thrinfo_work_id( caucus ); \
 \
 	dim_t jr_start, jr_end; \
 	dim_t ir_start, ir_end; \
diff --git a/sandbox/power10/bli_gemm_ex.c b/sandbox/power10/bli_gemm_ex.c
index 3334dc4a5..d136c7e1b 100644
--- a/sandbox/power10/bli_gemm_ex.c
+++ b/sandbox/power10/bli_gemm_ex.c
@@ -52,7 +52,7 @@ void bli_gemm_ex
        obj_t*  beta,
        obj_t*  c,
        cntx_t* cntx,
-       rntm_t* rntm 
+       rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -73,7 +73,7 @@ void bli_gemm_ex
 	// Invoke the operation's front end.
 	bli_gemm_front
 	(
-	  alpha, a, b, beta, c, cntx, rntm, NULL
+	  alpha, a, b, beta, c, cntx, rntm
 	);
 }
 
diff --git a/test/syrk_diagonal/syrk_diagonal_example2.c b/test/syrk_diagonal/syrk_diagonal_example2.c
index 92371f48b..710db815c 100644
--- a/test/syrk_diagonal/syrk_diagonal_example2.c
+++ b/test/syrk_diagonal/syrk_diagonal_example2.c
@@ -226,8 +226,8 @@ void packm_diag
 
 	/* Query the number of threads and thread ids from the current thread's
 	   packm thrinfo_t node. */
-	const dim_t nt  = bli_thread_n_way( thread );
-	const dim_t tid = bli_thread_work_id( thread );
+	const dim_t nt  = bli_thrinfo_n_way( thread );
+	const dim_t tid = bli_thrinfo_work_id( thread );
 
 	/* Determine the thread range and increment using the current thread's
 	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
diff --git a/test/syrk_diagonal/syrk_diagonal_example2.cxx b/test/syrk_diagonal/syrk_diagonal_example2.cxx
index 8312a07ee..cc98d97ef 100644
--- a/test/syrk_diagonal/syrk_diagonal_example2.cxx
+++ b/test/syrk_diagonal/syrk_diagonal_example2.cxx
@@ -210,8 +210,8 @@ void packm_diag
 
 	/* Query the number of threads and thread ids from the current thread's
 	   packm thrinfo_t node. */
-	const dim_t nt  = bli_thread_n_way( thread );
-	const dim_t tid = bli_thread_work_id( thread );
+	const dim_t nt  = bli_thrinfo_n_way( thread );
+	const dim_t tid = bli_thrinfo_work_id( thread );
 
 	/* Determine the thread range and increment using the current thread's
 	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
diff --git a/test/tensor_contraction/tcontract_example.cxx b/test/tensor_contraction/tcontract_example.cxx
index 0b935c54d..caff3604b 100644
--- a/test/tensor_contraction/tcontract_example.cxx
+++ b/test/tensor_contraction/tcontract_example.cxx
@@ -431,12 +431,12 @@ void packm_tensor
     }
 
     /* Wait for the scatter vectors to be done. */
-    bli_thread_barrier( thread );
+    bli_thrinfo_barrier( thread );
 
 	/* Query the number of threads and thread ids from the current thread's
 	   packm thrinfo_t node. */
-	auto nt  = bli_thread_n_way( thread );
-	auto tid = bli_thread_work_id( thread );
+	auto nt  = bli_thrinfo_n_way( thread );
+	auto tid = bli_thrinfo_work_id( thread );
 
 	/* Determine the thread range and increment using the current thread's
 	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
@@ -656,7 +656,7 @@ void gemm_tensor
     }
 
     /* Wait for the scatter vectors to be done. */
-    bli_thread_barrier( thread );
+    bli_thrinfo_barrier( thread );
 
 	/* Compute number of primary and leftover components of the m and n
 	   dimensions. */
@@ -684,10 +684,10 @@ void gemm_tensor
 	auto caucus = bli_thrinfo_sub_node( thread );
 
 	/* Query the number of threads and thread ids for each loop. */
-	auto jr_nt  = bli_thread_n_way( thread );
-	auto jr_tid = bli_thread_work_id( thread );
-	auto ir_nt  = bli_thread_n_way( caucus );
-	auto ir_tid = bli_thread_work_id( caucus );
+	auto jr_nt  = bli_thrinfo_n_way( thread );
+	auto jr_tid = bli_thrinfo_work_id( thread );
+	auto ir_nt  = bli_thrinfo_n_way( caucus );
+	auto ir_tid = bli_thrinfo_work_id( caucus );
 
 	/* Determine the thread range and increment for the 2nd and 1st loops.
 	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c
index 69ee4339d..f3b5f7b52 100644
--- a/testsuite/src/test_gemm_ukr.c
+++ b/testsuite/src/test_gemm_ukr.c
@@ -231,16 +231,12 @@ void libblis_test_gemm_ukr_experiment
 	libblis_test_mobj_randomize( params, TRUE, &c );
 	bli_copym( &c, &c_save );
 
-	rntm_t rntm;
-	bli_rntm_init( &rntm );
-	bli_pba_rntm_set_pba( &rntm );
-
 	// Transpose B to B^T for packing.
 	bli_obj_induce_trans( &b );
 
 	// Create pack objects for a and b, and pack them to ap and bp,
 	// respectively.
-	cntl_t* cntl_a = libblis_test_pobj_create
+	thrinfo_t* thread_a = libblis_test_pobj_create
 	(
 	  BLIS_MR,
 	  BLIS_KR,
@@ -248,10 +244,9 @@ void libblis_test_gemm_ukr_experiment
 	  BLIS_PACKED_ROW_PANELS,
 	  BLIS_BUFFER_FOR_A_BLOCK,
 	  &a, &ap,
-	  cntx,
-	  &rntm
+	  cntx
 	);
-	cntl_t* cntl_b = libblis_test_pobj_create
+	thrinfo_t* thread_b = libblis_test_pobj_create
 	(
 	  BLIS_NR,
 	  BLIS_KR,
@@ -259,8 +254,7 @@ void libblis_test_gemm_ukr_experiment
 	  BLIS_PACKED_COL_PANELS,
 	  BLIS_BUFFER_FOR_B_PANEL,
 	  &b, &bp,
-	  cntx,
-	  &rntm
+	  cntx
 	);
 
 	// Transpose B^T back to B and Bp^T back to Bp.
@@ -293,8 +287,8 @@ void libblis_test_gemm_ukr_experiment
 
 	// Free the control tree nodes and release their cached mem_t entries
 	// back to the pba.
-	bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED );
-	bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
+	bli_thrinfo_free( thread_a );
+	bli_thrinfo_free( thread_b );
 
 	// Free the test objects.
 	bli_obj_free( &a );
diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c
index 44ba51587..480e49c2d 100644
--- a/testsuite/src/test_gemmtrsm_ukr.c
+++ b/testsuite/src/test_gemmtrsm_ukr.c
@@ -283,13 +283,9 @@ void libblis_test_gemmtrsm_ukr_experiment
 	bli_copym( &b11, &c11 );
 	bli_copym( &c11, &c11_save );
 
-	rntm_t rntm;
-	bli_rntm_init( &rntm );
-	bli_pba_rntm_set_pba( &rntm );
-
 	// Create pack objects for a and b, and pack them to ap and bp,
 	// respectively.
-	cntl_t* cntl_a = libblis_test_pobj_create
+	thrinfo_t* thread_a = libblis_test_pobj_create
 	(
 	  BLIS_MR,
 	  BLIS_MR,
@@ -297,8 +293,7 @@ void libblis_test_gemmtrsm_ukr_experiment
 	  BLIS_PACKED_ROW_PANELS,
 	  BLIS_BUFFER_FOR_A_BLOCK,
 	  &a, &ap,
-	  cntx,
-	  &rntm
+	  cntx
 	);
 
 	// Set the diagonal offset of ap.
@@ -315,7 +310,7 @@ bli_printm( "a", &a, "%5.2f", "" );
 bli_printm( "ap", &ap, "%5.2f", "" );
 #endif
 
-	cntl_t* cntl_b = NULL;
+	thrinfo_t* thread_b = NULL;
 
 	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
@@ -325,7 +320,7 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 		// Transpose B to B^T for packing.
 		bli_obj_induce_trans( &b );
 
-		cntl_b = libblis_test_pobj_create
+		thread_b = libblis_test_pobj_create
 		(
 		  BLIS_NR,
 		  BLIS_MR,
@@ -333,8 +328,7 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 		  BLIS_PACKED_COL_PANELS,
 		  BLIS_BUFFER_FOR_B_PANEL,
 		  &b, &bp,
-		  cntx,
-		  &rntm
+		  cntx
 		);
 
 		// Transpose B^T back to B and Bp^T back to Bp.
@@ -362,9 +356,9 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 		// to perform the correctness check later.
 		if ( i < n_repeats - 1 )
 		{
-			// Free the control tree nodes and release their cached mem_t entries
+			// Free the thread control tree nodes and release their cached mem_t entries
 			// back to the memory broker.
-			bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
+			bli_thrinfo_free( thread_b );
 		}
 	}
 
@@ -401,11 +395,11 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 	// Zero out performance and residual if output matrix is empty.
 	//libblis_test_check_empty_problem( &c11, perf, resid );
 
-	// Free the control tree nodes and release their cached mem_t entries
+	// Free the thread control tree nodes and release their cached mem_t entries
 	// back to the pba.
-	bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED );
-	if ( cntl_b )
-	    bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
+	bli_thrinfo_free( thread_a );
+	if ( thread_b )
+	    bli_thrinfo_free( thread_b );
 
 	// Free the test objects.
 	bli_obj_free( &a_big );
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index a355385a3..aec9357ae 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -2652,17 +2652,20 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c
 }
 
 
-cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm )
+thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx )
 {
-	bool   does_inv_diag;
+	bool does_inv_diag;
 
 	if ( inv_diag == BLIS_NO_INVERT_DIAG ) does_inv_diag = FALSE;
 	else                                   does_inv_diag = TRUE;
 
+	rntm_t rntm;
+	bli_rntm_init( &rntm );
+
 	// Create a control tree node for the packing operation.
 	cntl_t* cntl = bli_packm_cntl_create_node
 	(
-	  NULL, // we don't need the small block allocator from the runtime.
+	  NULL, // pass NULL as the pool so that malloc() is used.
 	  NULL, // func ptr is not referenced b/c we don't call via l3 _int().
 	  bmult_id_m,
 	  bmult_id_n,
@@ -2674,12 +2677,17 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia
 	  NULL  // no child node needed
 	);
 
+	thrinfo_t* thread = bli_l3_thrinfo_create( 0, &BLIS_SINGLE_COMM, NULL, &rntm, cntl );
+
 	// Pack the contents of A to P.
-	bli_packm_blk_var1( a, p, cntx, rntm, cntl, &BLIS_PACKM_SINGLE_THREADED );
+	bli_packm_blk_var1( a, p, cntx, cntl, thread );
+
+	// Free the control tree.
+	bli_l3_cntl_free( NULL, cntl );
 
-	// Return the control tree pointer so the caller can free the cntl_t and its
+	// Return the thread control tree pointer so the caller can free the thrinfo_t and its
 	// mem_t entry later on.
-	return cntl;
+	return thread;
 }
 
 
diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h
index 9e38964ee..93c892c4f 100644
--- a/testsuite/src/test_libblis.h
+++ b/testsuite/src/test_libblis.h
@@ -420,7 +420,7 @@ void fill_string_with_n_spaces( char* str, unsigned int n_spaces );
 // --- Create object ---
 
 void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, char storage, dim_t m, dim_t n, obj_t* a );
-cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm );
+thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx );
 void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x );
 
 // --- Randomize/initialize object ---
diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c
index 5f4988e1c..ae5c9a814 100644
--- a/testsuite/src/test_trsm_ukr.c
+++ b/testsuite/src/test_trsm_ukr.c
@@ -232,13 +232,9 @@ void libblis_test_trsm_ukr_experiment
 	libblis_test_mobj_randomize( params, TRUE, &c );
 	bli_copym( &c, &c_save );
 
-	rntm_t rntm;
-	bli_rntm_init( &rntm );
-	bli_pba_rntm_set_pba( &rntm );
-
 	// Create pack objects for a and b, and pack them to ap and bp,
 	// respectively.
-	cntl_t* cntl_a = libblis_test_pobj_create
+	thrinfo_t* thread_a = libblis_test_pobj_create
 	(
 	  BLIS_MR,
 	  BLIS_MR,
@@ -246,8 +242,7 @@ void libblis_test_trsm_ukr_experiment
 	  BLIS_PACKED_ROW_PANELS,
 	  BLIS_BUFFER_FOR_A_BLOCK,
 	  &a, &ap,
-	  cntx,
-	  &rntm
+	  cntx
 	);
 
 	// Set the diagonal offset of ap.
@@ -271,7 +266,7 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 		// Transpose B to B^T for packing.
 		bli_obj_induce_trans( &b );
 
-		cntl_t* cntl_b = libblis_test_pobj_create
+		thrinfo_t* thread_b = libblis_test_pobj_create
 		(
 		  BLIS_NR,
 		  BLIS_MR,
@@ -279,8 +274,7 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 		  BLIS_PACKED_COL_PANELS,
 		  BLIS_BUFFER_FOR_B_PANEL,
 		  &b, &bp,
-		  cntx,
-		  &rntm
+		  cntx
 		);
 
 		// Transpose B^T back to B and Bp^T back to Bp.
@@ -297,7 +291,7 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 
 		// Free the control tree nodes and release their cached mem_t entries
 		// back to the memory broker.
-		bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
+		bli_thrinfo_free( thread_b );
 	}
 
 	// Estimate the performance of the best experiment repeat.
@@ -312,7 +306,7 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 
 	// Free the control tree nodes and release their cached mem_t entries
 	// back to the memory broker.
-	bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED );
+	bli_thrinfo_free( thread_a );
 
 	// Free the test objects.
 	bli_obj_free( &a );

From 29f79f030e939969d4f3876c4fdaac7b0c5daa63 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Mon, 31 Oct 2022 18:57:45 -0500
Subject: [PATCH 099/230] Fixed performance bug caused by redundant packing.
 (#680)

Details:
- Fixed a performance bug whereby multiple threads were redundantly
  packing the same (rather than separate) micropanels. This bug was
  caused by different parts of the code using the num_threads/thread_id
  field of the thrinfo_t vs. the n_way/work_id fields. The fix was to
  standardize on the latter and provide a "fake" thrinfo_t sub-prenode
  in the thrinfo tree which consists of single-member thread teams. The
  single team with multiple threads node is still required since it and
  only it can be used to perform barriers and broadcasts (e.g. of the
  packed buffer pointer).
---
 frame/1m/packm/bli_packm_blk_var1.c | 38 +++++++++++++++++------------
 frame/1m/packm/bli_packm_int.c      | 21 +++++++++++-----
 frame/3/bli_l3_thrinfo.c            | 22 ++++++++++++++---
 3 files changed, 56 insertions(+), 25 deletions(-)

diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c
index da49126a5..9ac9582db 100644
--- a/frame/1m/packm/bli_packm_blk_var1.c
+++ b/frame/1m/packm/bli_packm_blk_var1.c
@@ -54,11 +54,11 @@ static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md);
 
 void bli_packm_blk_var1
      (
-       const obj_t*   c,
-             obj_t*   p,
-       const cntx_t*  cntx,
-       const cntl_t*  cntl,
-             thrinfo_t* thread
+       const obj_t*     c,
+             obj_t*     p,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
      )
 {
 	// Extract various fields from the control tree.
@@ -67,12 +67,18 @@ void bli_packm_blk_var1
 	bool   revifup = bli_cntl_packm_params_rev_iter_if_upper( cntl );
 	bool   reviflo = bli_cntl_packm_params_rev_iter_if_lower( cntl );
 
-	// Every thread initializes p and determines the size of memory
-	// block needed (which gets embedded into the otherwise "blank" mem_t
-	// entry in the control tree node). Return early if no packing is required.
-	if ( !bli_packm_init( c, p, cntx, cntl, thread ) )
+	// Every thread initializes p and determines the size of memory block
+	// needed (which gets embedded into the otherwise "blank" mem_t entry
+	// in the control tree node). Return early if no packing is required.
+	if ( !bli_packm_init( c, p, cntx, cntl, bli_thrinfo_sub_node( thread_par ) ) )
 		return;
 
+	// Use the sub-prenode. In bli_l3_thrinfo_grow(), this node was created to
+	// represent the team of threads as a group of single-member thread teams.
+	// This is necessary since the all of the work distribution function depend
+	// on the work_id and n_way fields.
+	thrinfo_t* thread = bli_thrinfo_sub_prenode( thread_par );
+
 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
 		bli_packm_int_check( c, p, cntx );
@@ -134,11 +140,11 @@ void bli_packm_blk_var1
 		packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ];
 	}
 
-	/* Compute the total number of iterations we'll need. */
+	// Compute the total number of iterations we'll need.
 	dim_t n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 );
 
-	/* Set the initial values and increments for indices related to C and P
-	   based on whether reverse iteration was requested. */
+	// Set the initial values and increments for indices related to C and P
+	// based on whether reverse iteration was requested.
 	dim_t  ic0, ip0;
 	doff_t ic_inc, ip_inc;
 
@@ -158,10 +164,10 @@ void bli_packm_blk_var1
 		ip_inc = 1;
 	}
 
-	// Query the number of threads and thread ids from the current thread's
-	// packm thrinfo_t node.
-	const dim_t nt  = bli_thrinfo_num_threads( thread );
-	const dim_t tid = bli_thrinfo_thread_id( thread );
+	// Query the number of threads (single-member thread teams) and the thread
+	// team ids from the current thread's packm thrinfo_t node.
+	const dim_t nt  = bli_thrinfo_n_way( thread );
+	const dim_t tid = bli_thrinfo_work_id( thread );
 
 	// Determine the thread range and increment using the current thread's
 	// packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c
index fa4fcb47a..49d5a49a3 100644
--- a/frame/1m/packm/bli_packm_int.c
+++ b/frame/1m/packm/bli_packm_int.c
@@ -36,10 +36,10 @@
 
 void bli_packm_int
      (
-       const obj_t*  a,
-             obj_t*  p,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
+       const obj_t*     a,
+             obj_t*     p,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
              thrinfo_t* thread_par
      )
 {
@@ -53,14 +53,23 @@ void bli_packm_int
 	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
 	bli_thrinfo_barrier( thread );
 
-	// Invoke the variant with kappa_use.
+	// Invoke the packm variant.
+	// NOTE: The packing kernel uses two communicators: one which represents a
+	// single workgroup of many threads, and one which represents a group of
+	// many single-member workgroups. The former communicator is used for
+	// barriers and thread communication (i.e. broadcasting the pack buffer
+	// pointer), while the latter communicator is used for partitioning work.
+	// This is because all of the thread range functions rely on the work_id
+	// and number of workgroups (n_way). Thus, we pass along the parent
+	// thrinfo_t node which has these two communicators as the sub-node and
+	// sub-prenode, respectively.
 	f
 	(
 	  a,
 	  p,
 	  cntx,
 	  cntl,
-	  thread
+	  thread_par
 	);
 
 	// Barrier so that packing is done before computation.
diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c
index 402497153..0b45abbf6 100644
--- a/frame/3/bli_l3_thrinfo.c
+++ b/frame/3/bli_l3_thrinfo.c
@@ -77,7 +77,19 @@ void bli_l3_thrinfo_grow
 	thrinfo_t* thread_cur = bli_thrinfo_split( n_way, thread_par );
 	bli_thrinfo_set_sub_node( thread_cur, thread_par );
 
-	if ( sub_prenode != NULL )
+	if ( bszid == BLIS_NO_PART )
+	{
+		// A hack: the packing code needs a thread communicator which represents
+		// a group of single-member thread teams working cooperatively However,
+		// the "normal" packm thrinfo_t node has a single team of multiple
+		// threads. Our solution (for now) is to create a sub-prenode on the
+		// thrinfo_t tree which splits this single team into multiple
+		// single-member thread teams.
+		const dim_t n_threads = bli_thrinfo_num_threads( thread_par );
+		thrinfo_t* thread_pre = bli_thrinfo_split( n_threads, thread_par );
+		bli_thrinfo_set_sub_prenode( thread_pre, thread_par );
+	}
+	else if ( sub_prenode != NULL )
 	{
 		// A pre-node is only used in the IC loop of trsm. In this case,
 		// we cannot actually thread in the m dimension due to data dependencies
@@ -88,8 +100,12 @@ void bli_l3_thrinfo_grow
 		bli_rntm_set_ic_ways_only(               1, &rntm_l );
 		bli_rntm_set_jr_ways_only( ic_nway*jr_nway, &rntm_l );
 
-		// Use thread_pre instead of thread_cur since we *don't* want to
-		// do any parallelism at this level.
+		// Use thread_pre instead of thread_cur since we *don't* want to do any
+		// parallelism at this level. So the thread_pre node gets attached to
+		// thread_par and not thread_cur! This results in a split "one level
+		// higher" than in the corresponding cntl_t tree. This is intentional
+		// since two different thrinfo_t nodes will be used at the cntl_t node
+		// for trsm blocked variant 1 (one for trsm, one for gemm).
 		thrinfo_t* thread_pre = bli_thrinfo_split( 1, thread_par );
 		bli_thrinfo_set_sub_prenode( thread_pre, thread_par );
 		bli_l3_thrinfo_grow( thread_pre, &rntm_l, sub_prenode );

From 5eea6ad9eb25f37685d1ae4ae08c73cd1daca297 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 2 Nov 2022 17:07:54 -0500
Subject: [PATCH 100/230] Add mention of Wilkinson Prize to README.md. (#683)

Details:
- Added blurbs and links to Wilkinson Prize to README.md.
- Added mention of both Best Paper and Wilkinson Prizes to the top of
  README.md.
- Other minor tweaks.
---
 README.md | 66 +++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 55 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 012861366..e0e4238ca 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,8 @@
+_Recipient of the **[2023 James H. Wilkinson Prize for Numerical Software](https://www.siam.org/prizes-recognition/major-prizes-lectures/detail/james-h-wilkinson-prize-for-numerical-software)**_
+
+_Recipient of the **[2020 SIAM Activity Group on Supercomputing Best Paper Prize](https://www.siam.org/prizes-recognition/activity-group-prizes/detail/siag-sc-best-paper-prize)**_
+
+
 ![The BLIS cat is sleeping.](http://www.cs.utexas.edu/users/field/blis_cat.png)
 
 [![Build Status](https://api.travis-ci.com/flame/blis.svg?branch=master)](https://app.travis-ci.com/github/flame/blis)
@@ -22,12 +27,14 @@ Contents
 * **[Discussion](#discussion)**
 * **[Contributing](#contributing)**
 * **[Citations](#citations)**
+* **[Awards](#awards)**
 * **[Funding](#funding)**
 
 Introduction
 ------------
 
-BLIS is a portable software framework for instantiating high-performance
+BLIS is an [award-winning](#awards)
+portable software framework for instantiating high-performance
 BLAS-like dense linear algebra libraries. The framework was designed to isolate
 essential kernels of computation that, when optimized, immediately enable
 optimized implementations of most of its commonly used and computationally
@@ -99,16 +106,30 @@ all of which are available for free via the [edX platform](http://www.edx.org/).
 What's New
 ----------
 
+ * **BLIS selected for the 2023 James H. Wilkinson Prize for Numerical Software!** We
+are thrilled to announce that Field Van Zee and Devin Matthews were chosen to receive
+the [2023 James H. Wilkinson Prize for Numerical Software](https://www.siam.org/prizes-recognition/major-prizes-lectures/detail/james-h-wilkinson-prize-for-numerical-software).
+The selection committee sought to recognize the recipients "for the development of
+BLIS, a portable open-source software framework that facilitates rapid instantiation
+of high-performance BLAS and BLAS-like operations targeting modern CPUs." This prize
+is awarded once every four years to the authors of an outstanding piece of numerical
+software, or to individuals who have made an outstanding contribution to an existing
+piece of numerical software. It is awarded to an entry that best addresses all phases
+of the preparation of high-quality numerical software, and is intended to recognize
+innovative software in scientific computing and to encourage researchers in the
+earlier stages of their career. The prize will be awarded at the
+[2023 SIAM Conference on Computational Science and Engineering](https://www.siam.org/conferences/cm/conference/cse23) in Amsterdam.
+
  * **Join us on Discord!** In 2021, we soft-launched our [Discord](https://discord.com/)
 server by privately inviting current and former collaborators, attendees of our BLIS
-Retreat, as well as other participants within the BLIS ecosystem. We've been thrilled by
-the results thus far, and are happy to announce that our new community is now open to
-the broader public! If you'd like to hang out with other BLIS users and developers,
-ask a question, discuss future features, or just say hello, please feel free to join us!
-We've put together a [step-by-step guide](docs/Discord.md) for creating an account and
-joining our cozy enclave. We even have a monthly "BLIS happy hour" event where people
-can casually come together for a video chat, Q&A, brainstorm session, or whatever it
-happens to unfold into!
+Retreat, as well as other participants within the BLIS ecosystem. We've been thrilled
+by the results thus far, and are happy to announce that our new community is now open
+to the broader public! If you'd like to hang out with other BLIS users and developers,
+ask a question, discuss future features, or just say hello, please feel free to join
+us! We've put together a [step-by-step guide](docs/Discord.md) for creating an account
+and joining our cozy enclave. We even have a monthly "BLIS happy hour" event where
+people can casually come together for a video chat, Q&A, brainstorm session, or
+whatever it happens to unfold into!
 
  * **Addons feature now available!** Have you ever wanted to quickly extend BLIS's
 operation support or define new custom BLIS APIs for your application, but were
@@ -622,10 +643,10 @@ releases. The source packages may build on other rpm-based distributions.
 the source rpms may build for others.
 
  * **GNU Guix**. Guix has BLIS packages, provides builds only for the generic
-target and some specific x86_64 micro-architectures.
+target and some specific `x86_64` micro-architectures.
 
  * **Conda**. conda channel [conda-forge](https://github.com/conda-forge/blis-feedstock)
-has Linux, OSX and Windows binary packages for x86_64.
+has Linux, OSX and Windows binary packages for `x86_64`.
 
 Discussion
 ----------
@@ -795,6 +816,29 @@ within the BLIS Framework},
 }
 ```
 
+Awards
+------
+
+ * **[2023 James H. Wilkinson Prize for Numerical Software.](https://www.siam.org/prizes-recognition/major-prizes-lectures/detail/james-h-wilkinson-prize-for-numerical-software)**
+This prize is awarded once every four years to the authors of an outstanding piece of
+numerical software, or to individuals who have made an outstanding contribution to an
+existing piece of numerical software. The selection committee sought to recognize the
+recipients "for the development of [BLIS](https://github.com/flame/blis), a portable
+open-source software framework that facilitates rapid instantiation of
+high-performance BLAS and BLAS-like operations targeting modern CPUs." The prize will
+be awarded at the
+[2023 SIAM Conference on Computational Science and Engineering](https://www.siam.org/conferences/cm/conference/cse23) in Amsterdam.
+
+ * **[2020 SIAM Activity Group on Supercomputing Best Paper Prize.](https://www.siam.org/prizes-recognition/activity-group-prizes/detail/siag-sc-best-paper-prize)**
+This prize is awarded once every two years to the authors of the most outstanding
+paper, as determined by the selection committee, in the field of parallel scientific
+and engineering computing published within the four calendar years preceding the
+award year. The prize was chosen for the paper ["The BLIS Framework: Experiments in
+Portability."](#citations) and awarded at the [2020 SIAM Conference on Parallel Processing for Scientific Computing](https://www.siam.org/conferences/cm/conference/pp20) in Seattle where Robert van de Geijn delivered [a talk on BLIS](https://meetings.siam.org/sess/dsp_programsess.cfm?SESSIONCODE=68266) and accepted the prize alongside other coauthors.
+See also:
+   * [SIAM News | January 2020 Prize Spotlight](https://sinews.siam.org/Details-Page/january-2020-prize-spotlight#Field&Robert)
+   * [Oden Institute's SHPC Group Win SIAM Best Paper Prize](https://www.oden.utexas.edu/about/news/ScienceHighPerfomanceComputingSIAMBestPaperPrize/)
+
 Funding
 -------
 

From edcc2f9940449f7d9cefcfc02159d27b013e7995 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 2 Nov 2022 19:04:49 -0500
Subject: [PATCH 101/230] Support --nosup, --sup configure options. (#684)

Details:
- Added --nosup and --sup as alternative ways of requesting that sup be
  disabled or enabled. These are analogous to --disable-sup-handling and
  --enable-sup-handling, respectively. (I got tired of typing out
  --disable-sup-handling and needed a shorthand notation.)
- Tweaked message output by configure when sup is enabled/disabled for
  clarity and specificity.
- Whitespace changes.
---
 configure | 43 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index 37399fbde..49deec819 100755
--- a/configure
+++ b/configure
@@ -2570,12 +2570,15 @@ main()
 			case $opt in
 				-)
 					case "$OPTARG" in
+
 						help)
 							print_usage
 							;;
+
 						quiet)
 							quiet_flag=1
 							;;
+
 						prefix=*)
 							prefix_flag=1
 							prefix=${OPTARG#*=}
@@ -2596,6 +2599,7 @@ main()
 							sharedir_flag=1
 							sharedir=${OPTARG#*=}
 							;;
+
 						enable-debug)
 							debug_flag=1
 							debug_type=noopt
@@ -2607,78 +2611,92 @@ main()
 						disable-debug)
 							debug_flag=0
 							;;
+
 						enable-asan)
 							enable_asan='yes'
 							;;
 						disable-asan)
 							enable_asan='no'
 							;;
+
 						enable-verbose-make)
 							enable_verbose='yes'
 							;;
 						disable-verbose-make)
 							enable_verbose='no'
 							;;
+
 						enable-arg-max-hack)
 							enable_arg_max_hack='yes'
 							;;
 						disable-arg-max-hack)
 							enable_arg_max_hack='no'
 							;;
+
 						enable-static)
 							enable_static='yes'
 							;;
 						disable-static)
 							enable_static='no'
 							;;
+
 						enable-shared)
 							enable_shared='yes'
 							;;
 						disable-shared)
 							enable_shared='no'
 							;;
+
 						enable-rpath)
 							enable_rpath='yes'
 							;;
 						disable-rpath)
 							enable_rpath='no'
 							;;
+
 						export-shared=*)
 							export_shared=${OPTARG#*=}
 							;;
+
 						enable-system)
 							enable_system='yes'
 							;;
 						disable-system)
 							enable_system='no'
 							;;
+
 						enable-threading=*)
 							threading_model=${OPTARG#*=}
 							;;
 						disable-threading)
 							threading_model='single'
 							;;
+
 						thread-part-jrir=*)
 							thread_part_jrir=${OPTARG#*=}
 							;;
+
 						enable-pba-pools)
 							enable_pba_pools='yes'
 							;;
 						disable-pba-pools)
 							enable_pba_pools='no'
 							;;
+
 						enable-sba-pools)
 							enable_sba_pools='yes'
 							;;
 						disable-sba-pools)
 							enable_sba_pools='no'
 							;;
+
 						enable-mem-tracing)
 							enable_mem_tracing='yes'
 							;;
 						disable-mem-tracing)
 							enable_mem_tracing='no'
 							;;
+
 						enable-addon=*)
 							addon_flag=1
 							addon_name=${OPTARG#*=}
@@ -2688,6 +2706,7 @@ main()
 						disable-addon)
 							addon_flag=''
 							;;
+
 						enable-sandbox=*)
 							sandbox_flag=1
 							sandbox=${OPTARG#*=}
@@ -2695,69 +2714,89 @@ main()
 						disable-sandbox)
 							sandbox_flag=''
 							;;
+
 						int-size=*)
 							int_type_size=${OPTARG#*=}
 							;;
+
 						blas-int-size=*)
 							blas_int_type_size=${OPTARG#*=}
 							;;
+
 						enable-blas)
 							enable_blas='yes'
 							;;
 						disable-blas)
 							enable_blas='no'
 							;;
+
 						enable-cblas)
 							enable_cblas='yes'
 							;;
 						disable-cblas)
 							enable_cblas='no'
 							;;
+
 						enable-mixed-dt)
 							enable_mixed_dt='yes'
 							;;
 						disable-mixed-dt)
 							enable_mixed_dt='no'
 							;;
+
 						enable-mixed-dt-extra-mem)
 							enable_mixed_dt_extra_mem='yes'
 							;;
 						disable-mixed-dt-extra-mem)
 							enable_mixed_dt_extra_mem='no'
 							;;
+
+						sup)
+							enable_sup_handling='yes'
+							;;
 						enable-sup-handling)
 							enable_sup_handling='yes'
 							;;
+						nosup)
+							enable_sup_handling='no'
+							;;
 						disable-sup-handling)
 							enable_sup_handling='no'
 							;;
+
 						enable-amd-frame-tweaks)
 							enable_amd_frame_tweaks='yes'
 							;;
 						disable-amd-frame-tweaks)
 							enable_amd_frame_tweaks='no'
 							;;
+
 						with-memkind)
 							enable_memkind='yes'
 							;;
 						without-memkind)
 							enable_memkind='no'
 							;;
+
 						enable-trsm-preinversion)
 							enable_trsm_preinversion='yes'
 							;;
 						disable-trsm-preinversion)
 							enable_trsm_preinversion='no'
 							;;
+
 						force-version=*)
 							force_version=${OPTARG#*=}
 							;;
+
 						show-config-list)
 							show_config_list=1
 							;;
+
 						complex-return=*)
 							complex_return=${OPTARG#*=}
 							;;
+
 						*)
 							print_usage
 							;;
@@ -3764,10 +3803,10 @@ main()
 		enable_mixed_dt_01=0
 	fi
 	if [ "x${enable_sup_handling}" = "xyes" ]; then
-		echo "${script_name}: small matrix handling is enabled."
+		echo "${script_name}: sup (skinny/unpacked) matrix handling is enabled."
 		enable_sup_handling_01=1
 	else
-		echo "${script_name}: small matrix handling is disabled."
+		echo "${script_name}: sup (skinny/unpacked) matrix handling is disabled."
 		enable_sup_handling_01=0
 	fi
 	if [ "x${enable_trsm_preinversion}" = "xyes" ]; then

From 872898d817f35702e7678ff7f3eeff0f12e641f5 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 2 Nov 2022 21:53:22 -0500
Subject: [PATCH 102/230] Fixed trmm[3]/trsm performance bug in cf7d616. (#685)

Details:
- Fixed a performance bug in the packing of micropanels that intersect
  the diagonal of triangular matrices (i.e., those found in trmm, trmm3,
  and trsm). This bug was introduced in cf7d616 and stemmed from an
  ill-formed boolean conditional expression in bli_packm_blk_var1().
  This conditional would choose when to use round-robin parallel work
  allocation, but checked for the triangularity of the submatrix being
  packed while failing also to check for whether the current micropanel
  actually intersected the diagonal. The net result of this bug was that
  *all* micropanels of a triangular matrix, no matter where the upanels
  resided within the matrix, were assigned to threads via a round-robin
  policy. This affected some microarchitectures and threading
  configurations much worse than others, but it seems that overall the
  effect was universally negative, likely because of the reduced spatial
  locality during the packing with round-robin. Thanks to Leick Robinson
  for his tireless efforts in helping track down this issue.
---
 frame/1m/packm/bli_packm_blk_var1.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c
index 9ac9582db..05263c4b7 100644
--- a/frame/1m/packm/bli_packm_blk_var1.c
+++ b/frame/1m/packm/bli_packm_blk_var1.c
@@ -190,15 +190,16 @@ void bli_packm_blk_var1
 
 		inc_t  p_inc           = ps_p;
 
-		// NOTE: We MUST use round-robin partitioning when packing
-		// micropanels of a triangular matrix. Hermitian/symmetric
-		// and general packing may use slab or round-robin, depending
-		// on which was selected at configure-time.
-		// The definition of bli_packm_my_iter() will depend on whether slab
-		// or round-robin partitioning was requested at configure-time.
-		bool   my_iter         = bli_is_triangular( strucc )
-		    ? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt )
-		    : bli_packm_my_iter   ( it, it_start, it_end, tid, nt );
+		// NOTE: We MUST use round-robin work allocation (bli_packm_my_iter_rr())
+		// when packing micropanels of a triangular matrix. Hermitian/symmetric
+		// and general packing may use slab or round-robin (bli_packm_my_iter()),
+		// depending on which was selected at configure-time.
+		bool my_iter = ( bli_is_triangular( strucc ) &&
+		                 bli_intersects_diag_n( diagoffc_i, panel_dim_i,
+		                                        panel_len_full )
+		                 ? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt )
+		                 : bli_packm_my_iter   ( it, it_start, it_end, tid, nt )
+		               );
 
 		if ( bli_is_triangular( strucc ) &&
 		     bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len_full ) )

From 6774bf08c92fc6983706a91bbb93b960e8eef285 Mon Sep 17 00:00:00 2001
From: Lee Killough <15950023+leekillough@users.noreply.github.com>
Date: Thu, 3 Nov 2022 15:20:47 -0500
Subject: [PATCH 103/230] Fix typo in configure --help text. (#686)

Details:
- Fixed a misspelling in the --help description for the --int-size (-i)
  configure option.
---
 configure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure b/configure
index 49deec819..5bfa608cd 100755
--- a/configure
+++ b/configure
@@ -244,7 +244,7 @@ print_usage()
 	echo " "
 	echo "                 Set the size (in bits) of internal BLIS integers and"
 	echo "                 integer types used in native BLIS interfaces. The"
-	echo "                 default inteter type size is architecture dependent."
+	echo "                 default integer type size is architecture dependent."
 	echo "                 (Hint: You can always find this value printed at the"
 	echo "                 beginning of the testsuite output.)"
 	echo " "

From 8d813f7f12732d52c95570ae884d5defbfd19234 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 3 Nov 2022 19:10:47 -0500
Subject: [PATCH 104/230] Some decluttering of the top-level directory.

Details:
- Relocated 'mpi_test' directory to test/mpi_test.
- Relocated 'so_version' and 'version' files from top-level directory to
  'build' directory.
- Updated build/bump-version.sh script to accommodate relocation of
  'version' file to 'build' directory.
- Updated configure script to accommodate relocation of 'so_version'
  file to 'build' directory.
- Updated INSTALL file to replace pointers to blis-devel mailing list
  with a pointer to docs/Discord.md.
- Updated RELEASING file to contain a reminder to consider whether the
  so_version file should be updated prior to the release.
---
 INSTALL                                  | 11 +++++++----
 RELEASING                                | 22 +++++++++++++---------
 build/bump-version.sh                    |  6 +++---
 so_version => build/so_version           |  0
 version => build/version                 |  0
 configure                                | 10 +++++-----
 {mpi_test => test/mpi_test}/Makefile     |  0
 {mpi_test => test/mpi_test}/test_gemm.c  |  0
 {mpi_test => test/mpi_test}/test_hemm.c  |  0
 {mpi_test => test/mpi_test}/test_her2k.c |  0
 {mpi_test => test/mpi_test}/test_herk.c  |  0
 {mpi_test => test/mpi_test}/test_trmm.c  |  0
 {mpi_test => test/mpi_test}/test_trsm.c  |  0
 13 files changed, 28 insertions(+), 21 deletions(-)
 rename so_version => build/so_version (100%)
 rename version => build/version (100%)
 rename {mpi_test => test/mpi_test}/Makefile (100%)
 rename {mpi_test => test/mpi_test}/test_gemm.c (100%)
 rename {mpi_test => test/mpi_test}/test_hemm.c (100%)
 rename {mpi_test => test/mpi_test}/test_her2k.c (100%)
 rename {mpi_test => test/mpi_test}/test_herk.c (100%)
 rename {mpi_test => test/mpi_test}/test_trmm.c (100%)
 rename {mpi_test => test/mpi_test}/test_trsm.c (100%)

diff --git a/INSTALL b/INSTALL
index 9adc43867..75850a96b 100644
--- a/INSTALL
+++ b/INSTALL
@@ -17,11 +17,14 @@ viewing the file over GitHub via a web browser:
 This document will always contain the most up-to-date information related
 to instantiating a BLIS library from the framework source code. If you have
 any further questions or wish to provide feedback, please contact the BLIS
-community by posting your message to the BLIS developer's mailing list:
+community by either by joining our Discord community! Instructions for
+joining may be found in:
 
-  https://groups.google.com/d/forum/blis-devel
+  docs/Discord.md
 
-Thanks for your interest in the BLIS framework!
+or in rendered form at:
+
+  https://github.com/flame/blis/blob/master/docs/Discord.md
 
-Field Van Zee
+Thanks for your interest in the BLIS framework!
 
diff --git a/RELEASING b/RELEASING
index 351594c49..0996a560d 100644
--- a/RELEASING
+++ b/RELEASING
@@ -7,38 +7,42 @@ Here are the steps to follow to create a new release (version) of BLIS:
 
    If there are any commits upstream, merge them as appropriate.
 
-2. Verify that the code builds properly.
+2. Consider whether the so_version should be updated (via the so_version
+   file in the 'build' directory) due to any ABI changes since the previous
+   version. If so, commit that change now.
+
+3. Verify that the code builds properly.
 
    $ ./configure auto; make
 
-3. Verify that the code passes BLIS and BLAS tests:
+4. Verify that the code passes BLIS and BLAS tests:
 
    $ make check           # BLIS testsuite (fast) + BLAS test drivers
    $ make checkblis       # BLIS testsuite (full ex. mixed-datatype)
    $ make checkblis-md    # BLIS testsuite (mixed-datatype only)
    $ make checkblis-salt  # BLIS testsuite (fast + salt)
 
-4. Draft a new announcement to blis-devel, crediting those who
+5. Draft a new announcement to blis-devel, crediting those who
    contributed towards this version by browsing 'git log'.
 
-5. Update CREDITS file if 'git log' reveals any new contributors.
+6. Update CREDITS file if 'git log' reveals any new contributors.
 
-6. Update docs/ReleaseNotes.md file with body of finalized announcement
+7. Update docs/ReleaseNotes.md file with body of finalized announcement
    and the date of the release.
 
-7. Commit changes from steps 5 and 6.
+8. Commit changes from steps 5 and 6.
 
-8. Bump the version number:
+9. Bump the version number:
 
    $ ./build/bump-version.sh "0.3.2"
 
    This will result in two new commits: a version file update and a CHANGELOG
    file update.
 
-9. Push the new commits and new tag associated with the new version:
+10. Push the new commits and new tag associated with the new version:
 
    $ git push
    $ git push --tag
 
-10. Send finalized announcement to blis-devel.
+11. Send finalized announcement to blis-devel.
 
diff --git a/build/bump-version.sh b/build/bump-version.sh
index 65e1a2988..b72a09a40 100755
--- a/build/bump-version.sh
+++ b/build/bump-version.sh
@@ -98,10 +98,10 @@ main()
 	# The name of the CHANGELOG file.
 	changelog_file='CHANGELOG'
 
-	# The name of the default version file.
-	version_file_def='version'
+	# The name and location of the default version file.
+	version_file_def='build/version'
 
-	# The name of the specified version file.
+	# The name and location of the specified version file.
 	version_file=''
 
 	# Strings used during version query.
diff --git a/so_version b/build/so_version
similarity index 100%
rename from so_version
rename to build/so_version
diff --git a/version b/build/version
similarity index 100%
rename from version
rename to build/version
diff --git a/configure b/configure
index 5bfa608cd..fd4812b1b 100755
--- a/configure
+++ b/configure
@@ -2346,10 +2346,6 @@ main()
 	# of the distribution and the directory in which we are building.
 	cur_dirpath="."
 
-	# The file in which the version string is kept.
-	version_file="version"
-	version_filepath="${dist_path}/${version_file}"
-
 	# The name of and path to the directory named "build" in the top-level
 	# directory of the source distribution.
 	build_dir='build'
@@ -2431,9 +2427,13 @@ main()
 
 	# -- Version-related --
 
+	# The file in which the version string is kept.
+	version_file="version"
+	version_filepath="${build_dirpath}/${version_file}"
+
 	# The shared library (.so) version file.
 	so_version_file='so_version'
-	so_version_filepath="${dist_path}/${so_version_file}"
+	so_version_filepath="${build_dirpath}/${so_version_file}"
 
 	# The major and minor/build .so version numbers.
 	so_version_major=''
diff --git a/mpi_test/Makefile b/test/mpi_test/Makefile
similarity index 100%
rename from mpi_test/Makefile
rename to test/mpi_test/Makefile
diff --git a/mpi_test/test_gemm.c b/test/mpi_test/test_gemm.c
similarity index 100%
rename from mpi_test/test_gemm.c
rename to test/mpi_test/test_gemm.c
diff --git a/mpi_test/test_hemm.c b/test/mpi_test/test_hemm.c
similarity index 100%
rename from mpi_test/test_hemm.c
rename to test/mpi_test/test_hemm.c
diff --git a/mpi_test/test_her2k.c b/test/mpi_test/test_her2k.c
similarity index 100%
rename from mpi_test/test_her2k.c
rename to test/mpi_test/test_her2k.c
diff --git a/mpi_test/test_herk.c b/test/mpi_test/test_herk.c
similarity index 100%
rename from mpi_test/test_herk.c
rename to test/mpi_test/test_herk.c
diff --git a/mpi_test/test_trmm.c b/test/mpi_test/test_trmm.c
similarity index 100%
rename from mpi_test/test_trmm.c
rename to test/mpi_test/test_trmm.c
diff --git a/mpi_test/test_trsm.c b/test/mpi_test/test_trsm.c
similarity index 100%
rename from mpi_test/test_trsm.c
rename to test/mpi_test/test_trsm.c

From 713d078075a4a563a43d83fd0880ab5091c2e4a4 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 3 Nov 2022 20:00:11 -0500
Subject: [PATCH 105/230] Delete mpi_test garbage. (#689)

Details:
- tlrmchlsmth: "What even is this? No comments, no commit message, not
  used by anything. Trash."
---
 test/mpi_test/Makefile     | 283 -------------------------------------
 test/mpi_test/test_gemm.c  | 232 ------------------------------
 test/mpi_test/test_hemm.c  | 252 ---------------------------------
 test/mpi_test/test_her2k.c | 209 ---------------------------
 test/mpi_test/test_herk.c  | 200 --------------------------
 test/mpi_test/test_trmm.c  | 246 --------------------------------
 test/mpi_test/test_trsm.c  | 282 ------------------------------------
 7 files changed, 1704 deletions(-)
 delete mode 100644 test/mpi_test/Makefile
 delete mode 100644 test/mpi_test/test_gemm.c
 delete mode 100644 test/mpi_test/test_hemm.c
 delete mode 100644 test/mpi_test/test_her2k.c
 delete mode 100644 test/mpi_test/test_herk.c
 delete mode 100644 test/mpi_test/test_trmm.c
 delete mode 100644 test/mpi_test/test_trsm.c

diff --git a/test/mpi_test/Makefile b/test/mpi_test/Makefile
deleted file mode 100644
index 00ca01e47..000000000
--- a/test/mpi_test/Makefile
+++ /dev/null
@@ -1,283 +0,0 @@
-#
-#
-#  BLIS    
-#  An object-based framework for developing high-performance BLAS-like
-#  libraries.
-#
-#  Copyright (C) 2014, The University of Texas at Austin
-#
-#  Redistribution and use in source and binary forms, with or without
-#  modification, are permitted provided that the following conditions are
-#  met:
-#   - Redistributions of source code must retain the above copyright
-#     notice, this list of conditions and the following disclaimer.
-#   - Redistributions in binary form must reproduce the above copyright
-#     notice, this list of conditions and the following disclaimer in the
-#     documentation and/or other materials provided with the distribution.
-#   - Neither the name(s) of the copyright holder(s) nor the names of its
-#     contributors may be used to endorse or promote products derived
-#     from this software without specific prior written permission.
-#
-#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#
-
-#
-# Makefile
-#
-# Field G. Van Zee
-# 
-# Makefile for standalone BLIS test drivers.
-#
-
-#
-# --- Makefile PHONY target definitions ----------------------------------------
-#
-
-.PHONY: all \
-        blis essl \
-        clean cleanx
-
-
-
-# Comments:
-# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given.
-# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in
-#   the second case because CONFIG_NAME is not yet set.
-ifneq ($(strip $(BLIS_INSTALL_PATH)),)
-LIB_PATH   := $(BLIS_INSTALL_PATH)/lib
-INC_PATH   := $(BLIS_INSTALL_PATH)/include/blis
-SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis
-else
-DIST_PATH  := ..
-LIB_PATH    = ../lib/$(CONFIG_NAME)
-INC_PATH    = ../include/$(CONFIG_NAME)
-SHARE_PATH := ..
-endif
-
-
-
-#
-# --- Include common makefile definitions --------------------------------------
-#
-
-# Include the common makefile fragment.
--include $(SHARE_PATH)/common.mk
-
-
-
-#
-# --- BLAS and LAPACK implementations ------------------------------------------
-#
-
-# BLAS library path(s). This is where the BLAS libraries reside.
-BLAS_LIB_PATH  := $(HOME)/flame/lib
-MKL_LIB_PATH   := $(HOME)/intel/mkl/lib/intel64/
-ESSL_LIB_PATH  := /soft/libraries/essl/current/lib64
-
-# OpenBLAS
-OPENBLAS_LIB   := $(BLAS_LIB_PATH)/libopenblas.a
-
-# ATLAS
-ATLAS_LIB      := $(BLAS_LIB_PATH)/libf77blas.a \
-                  $(BLAS_LIB_PATH)/libatlas.a
-
-# MKL
-MKL_LIB        := -L$(MKL_LIB_PATH) \
-                  -lmkl_sequential \
-                  -lmkl_core \
-                  -lmkl_intel_lp64
-
-# ESSL
-# Note: ESSL is named differently for SMP and/or BG
-ESSL_LIB        := $(ESSL_LIB_PATH)/libesslsmpbg.a   \
-                -L$(IBM_MAIN_DIR)/xlsmp/bg/3.1/bglib64/ \
-				-L$(IBM_MAIN_DIR)/xlf/bg/14.1/bglib64/ \
-				-lxlsmp -lxlf90_r -lxlfmath -lxl
-
-# Accelerate
-MAC_LIB        := -framework Accelerate
-
-
-
-#
-# --- General build definitions ------------------------------------------------
-#
-
-TEST_SRC_PATH  := .
-TEST_OBJ_PATH  := .
-
-# Gather all local object files.
-TEST_OBJS      := $(patsubst $(TEST_SRC_PATH)/%.c, \
-                             $(TEST_OBJ_PATH)/%.o, \
-                             $(wildcard $(TEST_SRC_PATH)/*.c))
-
-# Override the value of CINCFLAGS so that the value of CFLAGS returned by
-# get-user-cflags-for() is not cluttered up with include paths needed only
-# while building BLIS.
-CINCFLAGS      := -I$(INC_PATH)
-
-# Use the "framework" CFLAGS for the configuration family.
-CFLAGS         := $(call get-user-cflags-for,$(CONFIG_NAME))
-
-# Add local header paths to CFLAGS
-CFLAGS         += -I$(TEST_SRC_PATH)
-
-# Locate the libblis library to which we will link.
-#LIBBLIS_LINK   := $(LIB_PATH)/$(LIBBLIS_L)
-
-
-
-#
-# --- Targets/rules ------------------------------------------------------------
-#
-
-# Complete list of possible targets when defining 'all':
-#
-#   blis openblas atlas mkl mac essl
-#
-all: blis essl
-
-blis: test_gemm_blis.x \
-      test_hemm_blis.x \
-      test_herk_blis.x \
-      test_her2k_blis.x \
-      test_trmm_blis.x \
-      test_trsm_blis.x
-
-essl: test_gemm_essl.x \
-      test_hemm_essl.x \
-      test_herk_essl.x \
-      test_her2k_essl.x \
-      test_trmm_essl.x \
-      test_trsm_essl.x
-
-openblas: test_gemv_openblas.x \
-      test_ger_openblas.x \
-      test_hemv_openblas.x \
-      test_her_openblas.x \
-      test_her2_openblas.x \
-      test_trmv_openblas.x \
-      test_trsv_openblas.x \
-      \
-      test_gemm_openblas.x \
-      test_hemm_openblas.x \
-      test_herk_openblas.x \
-      test_her2k_openblas.x \
-      test_trmm_openblas.x \
-      test_trsm_openblas.x
-
-atlas: test_gemv_atlas.x \
-      test_ger_atlas.x \
-      test_hemv_atlas.x \
-      test_her_atlas.x \
-      test_her2_atlas.x \
-      test_trmv_atlas.x \
-      test_trsv_atlas.x \
-      \
-      test_gemm_atlas.x \
-      test_hemm_atlas.x \
-      test_herk_atlas.x \
-      test_her2k_atlas.x \
-      test_trmm_atlas.x \
-      test_trsm_atlas.x
-
-mkl:  test_gemv_mkl.x \
-      test_ger_mkl.x \
-      test_hemv_mkl.x \
-      test_her_mkl.x \
-      test_her2_mkl.x \
-      test_trmv_mkl.x \
-      test_trsv_mkl.x \
-      \
-      test_gemm_mkl.x \
-      test_hemm_mkl.x \
-      test_herk_mkl.x \
-      test_her2k_mkl.x \
-      test_trmm_mkl.x \
-      test_trsm_mkl.x
-
-mac:  test_gemv_mac.x \
-      test_ger_mac.x \
-      test_hemv_mac.x \
-      test_her_mac.x \
-      test_her2_mac.x \
-      test_trmv_mac.x \
-      test_trsv_mac.x \
-      \
-      test_gemm_mac.x \
-      test_hemm_mac.x \
-      test_herk_mac.x \
-      test_her2k_mac.x \
-      test_trmm_mac.x \
-      test_trsm_mac.x
-
-
-
-# --Object file rules --
-
-$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c
-	$(CC) $(CFLAGS) -c $< -o $@
-
-test_%_openblas.o: test_%.c
-	$(CC) $(CFLAGS) -DBLAS=\"openblas\" -c $< -o $@
-
-test_%_atlas.o: test_%.c
-	$(CC) $(CFLAGS) -DBLAS=\"atlas\" -c $< -o $@
-
-test_%_mkl.o: test_%.c
-	$(CC) $(CFLAGS) -DBLAS=\"mkl\" -c $< -o $@
-
-test_%_essl.o: test_%.c
-	$(CC) $(CFLAGS) -DBLAS=\"essl\" -c $< -o $@
-
-test_%_mac.o: test_%.c
-	$(CC) $(CFLAGS) -DBLAS=\"mac\" -c $< -o $@
-
-test_%_blis.o: test_%.c
-	$(CC) $(CFLAGS) -DBLIS -c $< -o $@
-
-
-# -- Executable file rules --
-
-# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS
-# on the link command line in case BLIS was configured with the BLAS
-# compatibility layer. This prevents BLIS from inadvertently getting called
-# for the BLAS routines we are trying to test with.
-
-test_%_openblas.x: test_%_openblas.o $(LIBBLIS_LINK)
-	$(LINKER) $<             $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
-
-test_%_atlas.x: test_%_atlas.o $(LIBBLIS_LINK)
-	$(LINKER) $<             $(ATLAS_LIB)    $(LIBBLIS_LINK) $(LDFLAGS) -o $@
-
-test_%_mkl.x: test_%_mkl.o $(LIBBLIS_LINK)
-	$(LINKER) $<             $(MKL_LIB)      $(LIBBLIS_LINK) $(LDFLAGS) -o $@
-
-test_%_essl.x: test_%_essl.o $(LIBBLIS_LINK)
-	$(LINKER) $<             $(ESSL_LIB)     $(LIBBLIS_LINK) $(LDFLAGS) -o $@
-
-test_%_mac.x: test_%_mac.o $(LIBBLIS_LINK)
-	$(LINKER) $<             $(MAC_LIB)      $(LIBBLIS_LINK) $(LDFLAGS) -o $@
-
-test_%_blis.x: test_%_blis.o $(LIBBLIS_LINK)
-	$(LINKER) $<                             $(LIBBLIS_LINK) $(LDFLAGS) -o $@
-
-
-# -- Clean rules --
-
-clean: cleanx
-
-cleanx:
-	- $(RM_F) *.o *.x
-
diff --git a/test/mpi_test/test_gemm.c b/test/mpi_test/test_gemm.c
deleted file mode 100644
index 8c5c58c23..000000000
--- a/test/mpi_test/test_gemm.c
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <unistd.h>
-#include "blis.h"
-#include <mpi.h>
-
-//           transa transb m     n     k     alpha    a        lda   b        ldb   beta     c        ldc
-//void dgemm_( char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* );
-
-//#define PRINT
-
-int main( int argc, char** argv )
-{
-	obj_t a, b, c;
-	obj_t c_save;
-	obj_t alpha, beta;
-	dim_t m, n, k;
-	dim_t p;
-	dim_t p_begin, p_end, p_inc;
-	int   m_input, n_input, k_input;
-	num_t dt_a, dt_b, dt_c;
-	num_t dt_alpha, dt_beta;
-	int   r, n_repeats;
-
-	double dtime;
-	double dtime_save;
-	double gflops;
-
-	bli_init();
-
-	n_repeats = 3;
-
-    if( argc < 7 )
-    {
-        printf("Usage:\n");
-        printf("test_foo.x m n k p_begin p_inc p_end:\n");
-        exit;
-    }
-
-    int world_size, world_rank, provided;
-    MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
-    MPI_Comm_size( MPI_COMM_WORLD, &world_size );
-    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
-
-    m_input = strtol( argv[1], NULL, 10 );
-    n_input = strtol( argv[2], NULL, 10 );
-    k_input = strtol( argv[3], NULL, 10 );
-    p_begin = strtol( argv[4], NULL, 10 );
-    p_inc   = strtol( argv[5], NULL, 10 );
-    p_end   = strtol( argv[6], NULL, 10 );
-
-#if 1
-	dt_a = BLIS_DOUBLE;
-	dt_b = BLIS_DOUBLE;
-	dt_c = BLIS_DOUBLE;
-	dt_alpha = BLIS_DOUBLE;
-	dt_beta = BLIS_DOUBLE;
-#else
-	dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX;
-#endif
-
-	for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
-	{
-
-		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
-		else               m =     ( dim_t )    m_input;
-		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
-		else               n =     ( dim_t )    n_input;
-		if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
-		else               k =     ( dim_t )    k_input;
-
-
-		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
-		bli_obj_create( dt_beta,  1, 1, 0, 0, &beta );
-
-		bli_obj_create( dt_a, m, k, 0, 0, &a );
-		bli_obj_create( dt_b, k, n, 0, 0, &b );
-		bli_obj_create( dt_c, m, n, 0, 0, &c );
-		bli_obj_create( dt_c, m, n, 0, 0, &c_save );
-
-		bli_randm( &a );
-		bli_randm( &b );
-		bli_randm( &c );
-
-
-		bli_setsc(  (0.9/1.0), 0.2, &alpha );
-		bli_setsc(  (1.0/1.0), 0.0, &beta );
-
-
-		bli_copym( &c, &c_save );
-	
-		dtime_save = 1.0e9;
-
-		for ( r = 0; r < n_repeats; ++r )
-		{
-			bli_copym( &c_save, &c );
-
-
-			dtime = bli_clock();
-
-#ifdef BLIS
-			//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
-
-			bli_gemm( &alpha,
-			//bli_gemm4m( &alpha,
-			          &a,
-			          &b,
-			          &beta,
-			          &c );
-
-#else
-		if ( bli_is_real( dt_a ) )
-		{
-			f77_char transa = 'N';
-			f77_char transb = 'N';
-			f77_int  mm     = bli_obj_length( &c );
-			f77_int  kk     = bli_obj_width_after_trans( &a );
-			f77_int  nn     = bli_obj_width( &c );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			f77_int  ldb    = bli_obj_col_stride( &b );
-			f77_int  ldc    = bli_obj_col_stride( &c );
-			double*  alphap = bli_obj_buffer( &alpha );
-			double*  ap     = bli_obj_buffer( &a );
-			double*  bp     = bli_obj_buffer( &b );
-			double*  betap  = bli_obj_buffer( &beta );
-			double*  cp     = bli_obj_buffer( &c );
-
-			dgemm_( &transa,
-			        &transb,
-			        &mm,
-			        &nn,
-			        &kk,
-			        alphap,
-			        ap, &lda,
-			        bp, &ldb,
-			        betap,
-			        cp, &ldc );
-		}
-		else
-		{
-			f77_char transa = 'N';
-			f77_char transb = 'N';
-			f77_int  mm     = bli_obj_length( &c );
-			f77_int  kk     = bli_obj_width_after_trans( &a );
-			f77_int  nn     = bli_obj_width( &c );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			f77_int  ldb    = bli_obj_col_stride( &b );
-			f77_int  ldc    = bli_obj_col_stride( &c );
-			dcomplex*  alphap = bli_obj_buffer( &alpha );
-			dcomplex*  ap     = bli_obj_buffer( &a );
-			dcomplex*  bp     = bli_obj_buffer( &b );
-			dcomplex*  betap  = bli_obj_buffer( &beta );
-			dcomplex*  cp     = bli_obj_buffer( &c );
-
-			zgemm_( &transa,
-			//zgemm3m_( &transa,
-			        &transb,
-			        &mm,
-			        &nn,
-			        &kk,
-			        alphap,
-			        ap, &lda,
-			        bp, &ldb,
-			        betap,
-			        cp, &ldc );
-		}
-#endif
-
-			dtime_save = bli_clock_min_diff( dtime_save, dtime );
-		}
-
-		gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
-
-		if ( bli_is_complex( dt_a ) ) gflops *= 4.0;
-
-#ifdef BLIS
-		printf( "data_gemm_blis" );
-#else
-		printf( "data_gemm_%s", BLAS );
-#endif
-		printf( "( %2lu, 1:5 ) = [ %4lu %4lu %4lu  %10.3e  %6.3f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
-		        ( unsigned long )m,
-		        ( unsigned long )k,
-		        ( unsigned long )n, dtime_save, gflops );
-
-		bli_obj_free( &alpha );
-		bli_obj_free( &beta );
-
-		bli_obj_free( &a );
-		bli_obj_free( &b );
-		bli_obj_free( &c );
-		bli_obj_free( &c_save );
-	}
-
-	bli_finalize();
-
-	return 0;
-}
-
diff --git a/test/mpi_test/test_hemm.c b/test/mpi_test/test_hemm.c
deleted file mode 100644
index 1934de013..000000000
--- a/test/mpi_test/test_hemm.c
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <unistd.h>
-#include "blis.h"
-#include <mpi.h>
-
-//           side   uploa  m     n     alpha    a        lda   b        ldb   beta     c        ldc
-//void dsymm_( char*, char*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* );
-
-//#define PRINT
-
-int main( int argc, char** argv )
-{
-	obj_t a, b, c;
-	obj_t c_save;
-	obj_t alpha, beta;
-	dim_t m, n;
-	dim_t p;
-	dim_t p_begin, p_end, p_inc;
-	int   m_input, n_input;
-	num_t dt_a, dt_b, dt_c;
-	num_t dt_alpha, dt_beta;
-	int   r, n_repeats;
-	side_t side;
-	uplo_t uplo;
-
-	double dtime;
-	double dtime_save;
-	double gflops;
-
-	bli_init();
-
-	n_repeats = 3;
-
-    if( argc < 7 ) 
-    {   
-        printf("Usage:\n");
-        printf("test_foo.x m n k p_begin p_inc p_end:\n");
-        exit;
-    }   
-
-    int world_size, world_rank, provided;
-    MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
-    MPI_Comm_size( MPI_COMM_WORLD, &world_size );
-    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
-
-    m_input = strtol( argv[1], NULL, 10 );
-    n_input = strtol( argv[2], NULL, 10 );
-    p_begin = strtol( argv[4], NULL, 10 );
-    p_inc   = strtol( argv[5], NULL, 10 );
-    p_end   = strtol( argv[6], NULL, 10 );
-
-#if 1
-	dt_a = BLIS_DOUBLE;
-	dt_b = BLIS_DOUBLE;
-	dt_c = BLIS_DOUBLE;
-	dt_alpha = BLIS_DOUBLE;
-	dt_beta = BLIS_DOUBLE;
-#else
-	dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX;
-#endif
-
-	side = BLIS_LEFT;
-	//side = BLIS_RIGHT;
-
-	uplo = BLIS_LOWER;
-	//uplo = BLIS_UPPER;
-
-    for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
-	{
-
-		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
-		else               m =     ( dim_t )    m_input;
-		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
-		else               n =     ( dim_t )    n_input;
-
-
-		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
-		bli_obj_create( dt_beta,  1, 1, 0, 0, &beta );
-
-		if ( bli_is_left( side ) )
-			bli_obj_create( dt_a, m, m, 0, 0, &a );
-		else
-			bli_obj_create( dt_a, n, n, 0, 0, &a );
-		bli_obj_create( dt_b, m, n, 0, 0, &b );
-		bli_obj_create( dt_c, m, n, 0, 0, &c );
-		bli_obj_create( dt_c, m, n, 0, 0, &c_save );
-
-		bli_randm( &a );
-		bli_randm( &b );
-		bli_randm( &c );
-
-		bli_obj_set_struc( BLIS_HERMITIAN, &a );
-		bli_obj_set_uplo( uplo, &a );
-
-		// Randomize A, make it densely Hermitian, and zero the unstored
-		// triangle to ensure the implementation reads only from the stored
-		// region.
-		bli_randm( &a );
-		bli_mkherm( &a );
-		bli_mktrim( &a );
-/*
-		bli_obj_toggle_uplo( &a );
-		bli_obj_inc_diag_offset( 1, &a );
-		bli_setm( &BLIS_ZERO, &a );
-		bli_obj_inc_diag_offset( -1, &a );
-		bli_obj_toggle_uplo( &a );
-		bli_obj_set_diag( BLIS_NONUNIT_DIAG, &a );
-		bli_scalm( &BLIS_TWO, &a );
-		bli_scalm( &BLIS_TWO, &a );
-*/
-
-
-		bli_setsc(  (2.0/1.0), 1.0, &alpha );
-		bli_setsc( (1.0/1.0), 0.0, &beta );
-
-
-		bli_copym( &c, &c_save );
-	
-		dtime_save = 1.0e9;
-
-		for ( r = 0; r < n_repeats; ++r )
-		{
-			bli_copym( &c_save, &c );
-
-
-			dtime = bli_clock();
-
-#ifdef PRINT
-/*
-			obj_t ar, ai;
-			bli_obj_alias_to( &a, &ar );
-			bli_obj_alias_to( &a, &ai );
-			bli_obj_set_dt( BLIS_DOUBLE, &ar ); ar.rs *= 2; ar.cs *= 2;
-			bli_obj_set_dt( BLIS_DOUBLE, &ai ); ai.rs *= 2; ai.cs *= 2; ai.buffer = ( double* )ai.buffer + 1;
-			bli_printm( "ar", &ar, "%4.1f", "" );
-			bli_printm( "ai", &ai, "%4.1f", "" );
-*/
-
-			bli_printm( "a", &a, "%4.1f", "" );
-			bli_printm( "b", &b, "%4.1f", "" );
-			bli_printm( "c", &c, "%4.1f", "" );
-#endif
-
-#ifdef BLIS
-
-			//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
-
-			bli_hemm( side,
-			//bli_hemm4m( side,
-			          &alpha,
-			          &a,
-			          &b,
-			          &beta,
-			          &c );
-#else
-
-			f77_char side   = 'L';
-			f77_char uplo   = 'L';
-			f77_int  mm     = bli_obj_length( &c );
-			f77_int  nn     = bli_obj_width( &c );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			f77_int  ldb    = bli_obj_col_stride( &b );
-			f77_int  ldc    = bli_obj_col_stride( &c );
-			double*  alphap = bli_obj_buffer( &alpha );
-			double*  ap     = bli_obj_buffer( &a );
-			double*  bp     = bli_obj_buffer( &b );
-			double*  betap  = bli_obj_buffer( &beta );
-			double*  cp     = bli_obj_buffer( &c );
-
-			dsymm_( &side,
-			        &uplo,
-			        &mm,
-			        &nn,
-			        alphap,
-			        ap, &lda,
-			        bp, &ldb,
-			        betap,
-			        cp, &ldc );
-#endif
-
-#ifdef PRINT
-			bli_printm( "c after", &c, "%9.5f", "" );
-			exit(1);
-#endif
-
-			dtime_save = bli_clock_min_diff( dtime_save, dtime );
-		}
-
-		if ( bli_is_left( side ) )
-			gflops = ( 2.0 * m * m * n ) / ( dtime_save * 1.0e9 );
-		else
-			gflops = ( 2.0 * m * n * n ) / ( dtime_save * 1.0e9 );
-
-		if ( bli_is_complex( dt_a ) ) gflops *= 4.0;
-
-#ifdef BLIS
-		printf( "data_hemm_blis" );
-#else
-		printf( "data_hemm_%s", BLAS );
-#endif
-		printf( "( %2lu, 1:4 ) = [ %4lu %4lu  %10.3e  %6.3f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
-		        ( unsigned long )m,
-		        ( unsigned long )n, dtime_save, gflops );
-
-		bli_obj_free( &alpha );
-		bli_obj_free( &beta );
-
-		bli_obj_free( &a );
-		bli_obj_free( &b );
-		bli_obj_free( &c );
-		bli_obj_free( &c_save );
-	}
-
-	bli_finalize();
-
-	return 0;
-}
-
diff --git a/test/mpi_test/test_her2k.c b/test/mpi_test/test_her2k.c
deleted file mode 100644
index 6aa15038a..000000000
--- a/test/mpi_test/test_her2k.c
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <unistd.h>
-#include "blis.h"
-#include <mpi.h>
-
-//            uploa  transa m     k     alpha    a        lda   b        ldb   beta     c        ldc
-//void dsyr2k_( char*, char*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* );
-
-//#define PRINT
-
-int main( int argc, char** argv )
-{
-	obj_t a, b, c;
-	obj_t c_save;
-	obj_t alpha, beta;
-	dim_t m, k;
-	dim_t p;
-	dim_t p_begin, p_end, p_inc;
-	int   m_input, k_input;
-	num_t dt_a, dt_b, dt_c;
-	num_t dt_alpha, dt_beta;
-	int   r, n_repeats;
-	uplo_t uplo;
-
-	double dtime;
-	double dtime_save;
-	double gflops;
-
-	bli_init();
-
-	n_repeats = 3;
-
-    if( argc < 7 ) 
-    {   
-        printf("Usage:\n");
-        printf("test_foo.x m n k p_begin p_inc p_end:\n");
-        exit;
-    }   
-
-    int world_size, world_rank, provided;
-    MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
-    MPI_Comm_size( MPI_COMM_WORLD, &world_size );
-    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
-
-    m_input = strtol( argv[1], NULL, 10 );
-    k_input = strtol( argv[3], NULL, 10 );
-    p_begin = strtol( argv[4], NULL, 10 );
-    p_inc   = strtol( argv[5], NULL, 10 );
-    p_end   = strtol( argv[6], NULL, 10 );
-
-	dt_a = BLIS_DOUBLE;
-	dt_b = BLIS_DOUBLE;
-	dt_c = BLIS_DOUBLE;
-	dt_alpha = BLIS_DOUBLE;
-	dt_beta = BLIS_DOUBLE;
-
-	uplo = BLIS_LOWER;
-
-    for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
-	{
-
-		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
-		else               m =     ( dim_t )    m_input;
-		if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
-		else               k =     ( dim_t )    k_input;
-
-
-		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
-		bli_obj_create( dt_beta,  1, 1, 0, 0, &beta );
-
-		bli_obj_create( dt_a, m, k, 0, 0, &a );
-		bli_obj_create( dt_b, m, k, 0, 0, &b );
-		bli_obj_create( dt_c, m, m, 0, 0, &c );
-		bli_obj_create( dt_c, m, m, 0, 0, &c_save );
-
-		bli_randm( &a );
-		bli_randm( &b );
-		bli_randm( &c );
-
-		bli_obj_set_struc( BLIS_HERMITIAN, &c );
-		bli_obj_set_uplo( uplo, &c );
-
-
-		bli_setsc(  (2.0/1.0), 0.0, &alpha );
-		bli_setsc(  (1.0/1.0), 0.0, &beta );
-
-
-		bli_copym( &c, &c_save );
-	
-		dtime_save = 1.0e9;
-
-		for ( r = 0; r < n_repeats; ++r )
-		{
-			bli_copym( &c_save, &c );
-
-
-			dtime = bli_clock();
-
-#ifdef PRINT
-			bli_printm( "a", &a, "%4.1f", "" );
-			bli_printm( "b", &b, "%4.1f", "" );
-			bli_printm( "c", &c, "%4.1f", "" );
-#endif
-
-#ifdef BLIS
-
-			//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
-
-			bli_her2k( &alpha,
-			           &a,
-			           &b,
-			           &beta,
-			           &c );
-
-#else
-
-			f77_char uploa  = 'L';
-			f77_char transa = 'N';
-			f77_int  mm     = bli_obj_length( &c );
-			f77_int  kk     = bli_obj_width_after_trans( &a );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			f77_int  ldb    = bli_obj_col_stride( &b );
-			f77_int  ldc    = bli_obj_col_stride( &c );
-			double*  alphap = bli_obj_buffer( &alpha );
-			double*  ap     = bli_obj_buffer( &a );
-			double*  bp     = bli_obj_buffer( &b );
-			double*  betap  = bli_obj_buffer( &beta );
-			double*  cp     = bli_obj_buffer( &c );
-
-			dsyr2k_( &uploa,
-			         &transa,
-			         &mm,
-			         &kk,
-			         alphap,
-			         ap, &lda,
-			         bp, &ldb,
-			         betap,
-			         cp, &ldc );
-#endif
-
-#ifdef PRINT
-			bli_printm( "c after", &c, "%4.1f", "" );
-			exit(1);
-#endif
-
-
-			dtime_save = bli_clock_min_diff( dtime_save, dtime );
-		}
-
-		gflops = ( 2.0 * m * k * m ) / ( dtime_save * 1.0e9 );
-
-#ifdef BLIS
-		printf( "data_her2k_blis" );
-#else
-		printf( "data_her2k_%s", BLAS );
-#endif
-		printf( "( %2lu, 1:4 ) = [ %4lu %4lu  %10.3e  %6.3f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
-		        ( unsigned long )m,
-		        ( unsigned long )k, dtime_save, gflops );
-
-
-		bli_obj_free( &alpha );
-		bli_obj_free( &beta );
-
-		bli_obj_free( &a );
-		bli_obj_free( &b );
-		bli_obj_free( &c );
-		bli_obj_free( &c_save );
-	}
-
-	bli_finalize();
-
-	return 0;
-}
-
diff --git a/test/mpi_test/test_herk.c b/test/mpi_test/test_herk.c
deleted file mode 100644
index 06e11afe1..000000000
--- a/test/mpi_test/test_herk.c
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <unistd.h>
-#include "blis.h"
-#include <mpi.h>
-
-//           uploa  transa m     k     alpha    a        lda   beta     c        ldc
-//void dsyrk_( char*, char*, int*, int*, double*, double*, int*, double*, double*, int* );
-
-//#define PRINT
-
-int main( int argc, char** argv )
-{
-	obj_t a, c;
-	obj_t c_save;
-	obj_t alpha, beta;
-	dim_t m, k;
-	dim_t p;
-	dim_t p_begin, p_end, p_inc;
-	int   m_input, k_input;
-	num_t dt_a, dt_c;
-	num_t dt_alpha, dt_beta;
-	int   r, n_repeats;
-	uplo_t uplo;
-
-	double dtime;
-	double dtime_save;
-	double gflops;
-
-	bli_init();
-
-	n_repeats = 3;
-
-    if( argc < 7 ) 
-    {   
-        printf("Usage:\n");
-        printf("test_foo.x m n k p_begin p_inc p_end:\n");
-        exit;
-    }   
-
-    int world_size, world_rank, provided;
-    MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
-    MPI_Comm_size( MPI_COMM_WORLD, &world_size );
-    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
-
-    m_input = strtol( argv[1], NULL, 10 );
-    k_input = strtol( argv[3], NULL, 10 );
-    p_begin = strtol( argv[4], NULL, 10 );
-    p_inc   = strtol( argv[5], NULL, 10 );
-    p_end   = strtol( argv[6], NULL, 10 );
-
-	dt_a = BLIS_DOUBLE;
-	dt_c = BLIS_DOUBLE;
-	dt_alpha = BLIS_DOUBLE;
-	dt_beta = BLIS_DOUBLE;
-
-	uplo = BLIS_LOWER;
-
-    for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
-	{
-
-		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
-		else               m =     ( dim_t )    m_input;
-		if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
-		else               k =     ( dim_t )    k_input;
-
-
-		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
-		bli_obj_create( dt_beta,  1, 1, 0, 0, &beta );
-
-		bli_obj_create( dt_a, m, k, 0, 0, &a );
-		bli_obj_create( dt_c, m, m, 0, 0, &c );
-		bli_obj_create( dt_c, m, m, 0, 0, &c_save );
-
-		bli_randm( &a );
-		bli_randm( &c );
-
-		bli_obj_set_struc( BLIS_HERMITIAN, &c );
-		bli_obj_set_uplo( uplo, &c );
-
-
-		bli_setsc(  (2.0/1.0), 0.0, &alpha );
-		bli_setsc(  (1.0/1.0), 0.0, &beta );
-
-
-		bli_copym( &c, &c_save );
-	
-		dtime_save = 1.0e9;
-
-		for ( r = 0; r < n_repeats; ++r )
-		{
-			bli_copym( &c_save, &c );
-
-
-			dtime = bli_clock();
-
-#ifdef PRINT
-			bli_printm( "a", &a, "%4.1f", "" );
-			bli_printm( "c", &c, "%4.1f", "" );
-#endif
-
-#ifdef BLIS
-
-			//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
-
-			bli_herk( &alpha,
-			          &a,
-			          &beta,
-			          &c );
-
-#else
-
-			f77_char uploa  = 'L';
-			f77_char transa = 'N';
-			f77_int  mm     = bli_obj_length( &c );
-			f77_int  kk     = bli_obj_width_after_trans( &a );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			f77_int  ldc    = bli_obj_col_stride( &c );
-			double*  alphap = bli_obj_buffer( &alpha );
-			double*  ap     = bli_obj_buffer( &a );
-			double*  betap  = bli_obj_buffer( &beta );
-			double*  cp     = bli_obj_buffer( &c );
-
-			dsyrk_( &uploa,
-			        &transa,
-			        &mm,
-			        &kk,
-			        alphap,
-			        ap, &lda,
-			        betap,
-			        cp, &ldc );
-#endif
-
-#ifdef PRINT
-			bli_printm( "c after", &c, "%4.1f", "" );
-			exit(1);
-#endif
-
-
-			dtime_save = bli_clock_min_diff( dtime_save, dtime );
-		}
-
-		gflops = ( 1.0 * m * k * m ) / ( dtime_save * 1.0e9 );
-
-#ifdef BLIS
-		printf( "data_herk_blis" );
-#else
-		printf( "data_herk_%s", BLAS );
-#endif
-		printf( "( %2lu, 1:4 ) = [ %4lu %4lu  %10.3e  %6.3f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
-		        ( unsigned long )m,
-		        ( unsigned long )k, dtime_save, gflops );
-
-
-		bli_obj_free( &alpha );
-		bli_obj_free( &beta );
-
-		bli_obj_free( &a );
-		bli_obj_free( &c );
-		bli_obj_free( &c_save );
-	}
-
-	bli_finalize();
-
-	return 0;
-}
-
diff --git a/test/mpi_test/test_trmm.c b/test/mpi_test/test_trmm.c
deleted file mode 100644
index 2ba1c6a79..000000000
--- a/test/mpi_test/test_trmm.c
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <unistd.h>
-#include "blis.h"
-#include <mpi.h>
-
-//           side   uplo   trans  diag   m     n     alpha    a        lda   b        ldb
-//void dtrmm_( char*, char*, char*, char*, int*, int*, double*, double*, int*, double*, int* );
-
-//#define PRINT
-
-int main( int argc, char** argv )
-{
-	obj_t a, b, c;
-	obj_t c_save;
-	obj_t alpha, beta;
-	dim_t m, n;
-	dim_t p;
-	dim_t p_begin, p_end, p_inc;
-	int   m_input, n_input;
-	num_t dt_a, dt_b, dt_c;
-	num_t dt_alpha, dt_beta;
-	int   r, n_repeats;
-	side_t side;
-	uplo_t uplo;
-
-	double dtime;
-	double dtime_save;
-	double gflops;
-
-	bli_init();
-
-	n_repeats = 3;
-
-    if( argc < 7 ) 
-    {   
-        printf("Usage:\n");
-        printf("test_foo.x m n p_begin p_inc p_end:\n");
-        exit;
-    }   
-
-    int world_size, world_rank, provided;
-    MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
-    MPI_Comm_size( MPI_COMM_WORLD, &world_size );
-    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
-
-    m_input = strtol( argv[1], NULL, 10 );
-    n_input = strtol( argv[2], NULL, 10 );
-    p_begin = strtol( argv[4], NULL, 10 );
-    p_inc   = strtol( argv[5], NULL, 10 );
-    p_end   = strtol( argv[6], NULL, 10 );
-
-#if 1
-	dt_a = BLIS_DOUBLE;
-	dt_b = BLIS_DOUBLE;
-	dt_c = BLIS_DOUBLE;
-	dt_alpha = BLIS_DOUBLE;
-	dt_beta = BLIS_DOUBLE;
-#else
-	dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX;
-#endif
-
-	side = BLIS_LEFT;
-	//side = BLIS_RIGHT;
-
-	uplo = BLIS_LOWER;
-	//uplo = BLIS_UPPER;
-
-    for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
-	{
-
-		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
-		else               m =     ( dim_t )    m_input;
-		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
-		else               n =     ( dim_t )    n_input;
-
-
-		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
-		bli_obj_create( dt_beta,  1, 1, 0, 0, &beta );
-
-		if ( bli_is_left( side ) )
-			bli_obj_create( dt_a, m, m, 0, 0, &a );
-		else
-			bli_obj_create( dt_a, n, n, 0, 0, &a );
-		bli_obj_create( dt_b, m, n, 0, 0, &b );
-		bli_obj_create( dt_c, m, n, 0, 0, &c );
-		bli_obj_create( dt_c, m, n, 0, 0, &c_save );
-
-		bli_obj_set_struc( BLIS_TRIANGULAR, &a );
-		bli_obj_set_uplo( uplo, &a );
-
-		bli_randm( &a );
-		bli_randm( &c );
-		bli_randm( &b );
-
-/*
-		bli_obj_toggle_uplo( &a );
-		bli_obj_inc_diag_offset( -1, &a );
-		bli_setm( &BLIS_ZERO, &a );
-		bli_obj_inc_diag_offset( 1, &a );
-		bli_obj_toggle_uplo( &a );
-		bli_obj_set_diag( BLIS_NONUNIT_DIAG, &a );
-		bli_scalm( &BLIS_TWO, &a );
-		//bli_scalm( &BLIS_TWO, &a );
-*/
-
-
-
-		bli_setsc(  (2.0/1.0), 0.0, &alpha );
-		bli_setsc(  (1.0/1.0), 0.0, &beta );
-
-
-		bli_copym( &c, &c_save );
-	
-		dtime_save = 1.0e9;
-
-		for ( r = 0; r < n_repeats; ++r )
-		{
-			bli_copym( &c_save, &c );
-
-			dtime = bli_clock();
-
-
-#ifdef PRINT
-
-/*
-			obj_t ar, ai;
-			bli_obj_alias_to( &a, &ar );
-			bli_obj_alias_to( &a, &ai );
-			bli_obj_set_dt( BLIS_DOUBLE, &ar ); ar.rs *= 2; ar.cs *= 2;
-			bli_obj_set_dt( BLIS_DOUBLE, &ai ); ai.rs *= 2; ai.cs *= 2; ai.buffer = ( double* )ai.buffer + 1;
-			bli_printm( "ar", &ar, "%4.1f", "" );
-			bli_printm( "ai", &ai, "%4.1f", "" );
-*/
-			bli_printm( "a", &a, "%4.1f", "" );
-			bli_printm( "c", &c, "%4.1f", "" );
-#endif
-
-#ifdef BLIS
-			bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
-
-			bli_trmm( side,
-			//bli_trmm4m( side,
-			          &alpha,
-			          &a,
-			          &c );
-
-#else
-
-			f77_char side   = 'L';
-			f77_char uplo   = 'L';
-			f77_char transa = 'N';
-			f77_char diag   = 'N';
-			f77_int  mm     = bli_obj_length( &c );
-			f77_int  nn     = bli_obj_width( &c );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			f77_int  ldc    = bli_obj_col_stride( &c );
-			double*  alphap = bli_obj_buffer( &alpha );
-			double*  ap     = bli_obj_buffer( &a );
-			double*  cp     = bli_obj_buffer( &c );
-
-			dtrmm_( &side,
-			        &uplo,
-			        &transa,
-			        &diag,
-			        &mm,
-			        &nn,
-			        alphap,
-			        ap, &lda,
-			        cp, &ldc );
-#endif
-
-#ifdef PRINT
-			bli_printm( "c after", &c, "%4.1f", "" );
-			exit(1);
-#endif
-
-
-
-			dtime_save = bli_clock_min_diff( dtime_save, dtime );
-		}
-
-		if ( bli_is_left( side ) )
-			gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 );
-		else
-			gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 );
-
-		if ( bli_is_complex( dt_a ) ) gflops *= 4.0;
-
-#ifdef BLIS
-		printf( "data_trmm_blis" );
-#else
-		printf( "data_trmm_%s", BLAS );
-#endif
-		printf( "( %2lu, 1:4 ) = [ %4lu %4lu  %10.3e  %6.3f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
-		        ( unsigned long )m,
-		        ( unsigned long )n, dtime_save, gflops );
-
-
-		bli_obj_free( &alpha );
-		bli_obj_free( &beta );
-
-		bli_obj_free( &a );
-		bli_obj_free( &b );
-		bli_obj_free( &c );
-		bli_obj_free( &c_save );
-	}
-
-	bli_finalize();
-
-	return 0;
-}
-
diff --git a/test/mpi_test/test_trsm.c b/test/mpi_test/test_trsm.c
deleted file mode 100644
index 12fc54232..000000000
--- a/test/mpi_test/test_trsm.c
+++ /dev/null
@@ -1,282 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <unistd.h>
-#include "blis.h"
-#include <mpi.h>
-
-//           side   uplo   trans  diag   m     n     alpha    a        lda   b        ldb
-//void dtrsm_( char*, char*, char*, char*, int*, int*, double*, double*, int*, double*, int* );
-
-//#define PRINT
-
-int main( int argc, char** argv )
-{
-	obj_t a, b, c;
-	obj_t c_save;
-	obj_t alpha, beta;
-	dim_t m, n;
-	dim_t p;
-	dim_t p_begin, p_end, p_inc;
-	int   m_input, n_input;
-	num_t dt_a, dt_b, dt_c;
-	num_t dt_alpha, dt_beta;
-	int   r, n_repeats;
-	side_t side;
-	uplo_t uplo;
-
-	double dtime;
-	double dtime_save;
-	double gflops;
-
-	bli_init();
-
-	n_repeats = 3;
-
-    if( argc < 7 ) 
-    {   
-        printf("Usage:\n");
-        printf("test_foo.x m n k p_begin p_inc p_end:\n");
-        exit;
-    }   
-
-    int world_size, world_rank, provided;
-    MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
-    MPI_Comm_size( MPI_COMM_WORLD, &world_size );
-    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
-
-    m_input = strtol( argv[1], NULL, 10 );
-    n_input = strtol( argv[2], NULL, 10 );
-    p_begin = strtol( argv[4], NULL, 10 );
-    p_inc   = strtol( argv[5], NULL, 10 );
-    p_end   = strtol( argv[6], NULL, 10 );
-
-#if 1
-	dt_a = BLIS_DOUBLE;
-	dt_b = BLIS_DOUBLE;
-	dt_c = BLIS_DOUBLE;
-	dt_alpha = BLIS_DOUBLE;
-	dt_beta = BLIS_DOUBLE;
-#else
-	dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_FLOAT; 
-	//dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_SCOMPLEX; 
-#endif
-
-	side = BLIS_LEFT;
-	//side = BLIS_RIGHT;
-
-	uplo = BLIS_LOWER;
-	//uplo = BLIS_UPPER;
-
-    for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
-	{
-
-		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
-		else               m =     ( dim_t )    m_input;
-		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
-		else               n =     ( dim_t )    n_input;
-
-
-		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
-		bli_obj_create( dt_beta,  1, 1, 0, 0, &beta );
-
-		if ( bli_is_left( side ) )
-			bli_obj_create( dt_a, m, m, 0, 0, &a );
-		else
-			bli_obj_create( dt_a, n, n, 0, 0, &a );
-		bli_obj_create( dt_b, m, n, 0, 0, &b );
-		bli_obj_create( dt_c, m, n, 0, 0, &c );
-		bli_obj_create( dt_c, m, n, 0, 0, &c_save );
-
-		bli_obj_set_struc( BLIS_TRIANGULAR, &a );
-		bli_obj_set_uplo( uplo, &a );
-		//bli_obj_set_diag( BLIS_UNIT_DIAG, &a );
-
-		bli_randm( &a );
-		bli_randm( &c );
-		bli_randm( &b );
-
-/*
-		{ 
-			obj_t a2;
-
-			bli_obj_alias_to( &a, &a2 );
-			bli_obj_toggle_uplo( &a2 );
-			bli_obj_inc_diag_offset( 1, &a2 );
-			bli_setm( &BLIS_ZERO, &a2 );
-			bli_obj_inc_diag_offset( -2, &a2 );
-			bli_obj_toggle_uplo( &a2 );
-			bli_obj_set_diag( BLIS_NONUNIT_DIAG, &a2 );
-			bli_scalm( &BLIS_TWO, &a2 );
-			//bli_scalm( &BLIS_TWO, &a );
-		} 
-*/
-
-		bli_setsc(  (2.0/1.0), 0.0, &alpha );
-		bli_setsc(  (1.0/1.0), 0.0, &beta );
-
-
-		bli_copym( &c, &c_save );
-	
-		dtime_save = 1.0e9;
-
-		for ( r = 0; r < n_repeats; ++r )
-		{
-			bli_copym( &c_save, &c );
-
-			dtime = bli_clock();
-
-
-#ifdef PRINT
-/*
-			obj_t ar, ai;
-			bli_obj_alias_to( &a, &ar );
-			bli_obj_alias_to( &a, &ai );
-			bli_obj_set_dt( BLIS_DOUBLE, &ar ); ar.rs *= 2; ar.cs *= 2;
-			bli_obj_set_dt( BLIS_DOUBLE, &ai ); ai.rs *= 2; ai.cs *= 2; ai.buffer = ( double* )ai.buffer + 1;
-
-			bli_printm( "ar", &ar, "%4.1f", "" );
-			bli_printm( "ai", &ai, "%4.1f", "" );
-*/
-
-			bli_invertd( &a );
-			bli_printm( "a", &a, "%4.1f", "" );
-			bli_invertd( &a );
-			bli_printm( "c", &c, "%4.1f", "" );
-#endif
-
-#ifdef BLIS
-			//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
-
-			bli_trsm( side,
-			//bli_trsm4m( side,
-			//bli_trsm3m( side,
-			          &alpha,
-			          &a,
-			          &c );
-#else
-
-		if ( bli_is_real( dt_a ) )
-		{
-			f77_char side   = 'L';
-			f77_char uplo   = 'L';
-			f77_char transa = 'N';
-			f77_char diag   = 'N';
-			f77_int  mm     = bli_obj_length( &c );
-			f77_int  nn     = bli_obj_width( &c );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			f77_int  ldc    = bli_obj_col_stride( &c );
-			float *  alphap = bli_obj_buffer( &alpha );
-			float *  ap     = bli_obj_buffer( &a );
-			float *  cp     = bli_obj_buffer( &c );
-
-			strsm_( &side,
-			        &uplo,
-			        &transa,
-			        &diag,
-			        &mm,
-			        &nn,
-			        alphap,
-			        ap, &lda,
-			        cp, &ldc );
-		}
-		else // if ( bli_is_complex( dt_a ) )
-		{
-			f77_char  side   = 'L';
-			f77_char  uplo   = 'L';
-			f77_char  transa = 'N';
-			f77_char  diag   = 'N';
-			f77_int   mm     = bli_obj_length( &c );
-			f77_int   nn     = bli_obj_width( &c );
-			f77_int   lda    = bli_obj_col_stride( &a );
-			f77_int   ldc    = bli_obj_col_stride( &c );
-			scomplex* alphap = bli_obj_buffer( &alpha );
-			scomplex* ap     = bli_obj_buffer( &a );
-			scomplex* cp     = bli_obj_buffer( &c );
-
-			ctrsm_( &side,
-			//ztrsm_( &side,
-			        &uplo,
-			        &transa,
-			        &diag,
-			        &mm,
-			        &nn,
-			        alphap,
-			        ap, &lda,
-			        cp, &ldc );
-		}
-		
-#endif
-
-#ifdef PRINT
-			bli_printm( "c after", &c, "%4.1f", "" );
-			exit(1);
-#endif
-
-
-			dtime_save = bli_clock_min_diff( dtime_save, dtime );
-		}
-
-		if ( bli_is_left( side ) )
-			gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 );
-		else
-			gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 );
-
-		if ( bli_is_complex( dt_a ) ) gflops *= 4.0;
-
-#ifdef BLIS
-		printf( "data_trsm_blis" );
-#else
-		printf( "data_trsm_%s", BLAS );
-#endif
-		printf( "( %2lu, 1:4 ) = [ %4lu %4lu  %10.3e  %6.3f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
-		        ( unsigned long )m,
-		        ( unsigned long )n, dtime_save, gflops );
-
-
-		bli_obj_free( &alpha );
-		bli_obj_free( &beta );
-
-		bli_obj_free( &a );
-		bli_obj_free( &b );
-		bli_obj_free( &c );
-		bli_obj_free( &c_save );
-	}
-
-	bli_finalize();
-
-	return 0;
-}
-

From dc6e5f3f5770074ba38554541b8b64711a68c084 Mon Sep 17 00:00:00 2001
From: leekillough <15950023+leekillough@users.noreply.github.com>
Date: Thu, 3 Nov 2022 18:33:08 -0500
Subject: [PATCH 106/230] Enhance emacs formatting of C files to remove
 trailing whitespace and ensure a newline at the end of file

---
 .dir-locals.el | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/.dir-locals.el b/.dir-locals.el
index fccb20502..711f4a63d 100644
--- a/.dir-locals.el
+++ b/.dir-locals.el
@@ -1,9 +1,11 @@
-;; First (minimal) attempt at configuring Emacs CC mode for the BLIS
-;; layout requirements.
+;; Emacs C mode formatting for the BLIS layout requirements.
 ((c-mode . ((c-file-style . "stroustrup")
-            (c-basic-offset . 4)
-            (comment-start . "// ")
-            (comment-end . "")
-            (indent-tabs-mode . t)
-            (tab-width . 4)
-            (parens-require-spaces . nil))))
+	    (c-basic-offset . 4)
+	    (comment-start . "// ")
+	    (comment-end . "")
+	    (indent-tabs-mode . t)
+	    (tab-width . 4)
+	    (parens-require-spaces . nil)
+	    (require-final-newline . t)
+	    (eval add-hook `before-save-hook `delete-trailing-whitespace)
+	    )))

From e1ea25da43508925e33d4e57e420cfc0a9de793f Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 11 Nov 2022 12:07:51 -0600
Subject: [PATCH 107/230] Fixed subtle barrier_fpa bug in bli_thrcomm.c. (#690)

Details:
- In bli_thrcomm.c, correctly initialize the BLIS_OPENMP element of the
  barrier function pointer array (barrier_fpa) to NULL when
  BLIS_ENABLE_OPENMP is *not* defined. Similarly, initialize the
  BLIS_POSIX element of barrier_fpa to NULL when BLIS_ENABLE_PTHREADS is
  not enabled. This bug was introduced in a1a5a9b and was likely the
  result of an incomplete edit. The effects of the bug would have
  likely manifested when querying a thrcomm_t that was initialized with
  a timpl_t value corresponding to a threading implementation that was
  omitted from the -t option at configure-time.
---
 frame/thread/bli_thrcomm.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c
index 0547d296e..f0bba205a 100644
--- a/frame/thread/bli_thrcomm.c
+++ b/frame/thread/bli_thrcomm.c
@@ -115,17 +115,17 @@ static thrcomm_barrier_ft barrier_fpa[ BLIS_NUM_THREAD_IMPLS ] =
 #if   defined(BLIS_ENABLE_OPENMP)
 	                bli_thrcomm_barrier_openmp,
 #elif defined(BLIS_ENABLE_PTHREADS)
-	                bli_thrcomm_barrier_pthreads,
+	                NULL,
 #else
-	                bli_thrcomm_barrier_single,
+	                NULL,
 #endif
 	[BLIS_POSIX]  =
 #if   defined(BLIS_ENABLE_PTHREADS)
 	                bli_thrcomm_barrier_pthreads,
 #elif defined(BLIS_ENABLE_OPENMP)
-	                bli_thrcomm_barrier_openmp,
+	                NULL,
 #else
-	                bli_thrcomm_barrier_single,
+	                NULL,
 #endif
 };
 

From 2b05948ad2c9785bc53f376d53a7141cbc917447 Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Sun, 13 Nov 2022 17:40:22 -0500
Subject: [PATCH 108/230] blis support for hpx (#682)

Implement threading backend via HPX.

HPX is an asynchronous many-task runtime system used in high performance computing applications. The runtime implements the ISO C++ parallelism specification and provides a user-space thread implementation.

This PR provides BLIS with a thread backend implementation using HPX and resolves feature request #681. The configuration script, makefiles, and testsuite have been updated to support an HPX build option. The addition of HPX support provides other developers an exemplar for integrating other C++ threading backends into BLIS.

Co-authored-by: ctaylor <ctaylor@pennywise.cm.cluster>
Co-authored-by: Devin Matthews <damatthews@smu.edu>
---
 Makefile                              |  10 +
 README.md                             |   2 +-
 blastest/src/cblat1.c                 |  40 +-
 blastest/src/cblat2.c                 | 426 ++++++++---------
 blastest/src/cblat3.c                 | 602 ++++++++++++------------
 blastest/src/dblat1.c                 | 164 +++----
 blastest/src/dblat2.c                 | 444 +++++++++---------
 blastest/src/dblat3.c                 | 428 ++++++++---------
 blastest/src/sblat1.c                 | 180 ++++----
 blastest/src/sblat2.c                 | 354 +++++++-------
 blastest/src/sblat3.c                 | 380 +++++++--------
 blastest/src/zblat1.c                 |  48 +-
 blastest/src/zblat2.c                 | 518 +++++++++++----------
 blastest/src/zblat3.c                 | 636 +++++++++++++-------------
 build/bli_config.h.in                 |   7 +
 build/config.mk.in                    |   1 +
 build/libblis-symbols.def             |   2 +
 common.mk                             |  43 +-
 configure                             |  41 +-
 docs/FAQ.md                           |   4 +-
 docs/Multithreading.md                |   6 +-
 frame/3/bli_l3_decor.c                |   7 +-
 frame/base/bli_info.c                 |  19 +-
 frame/base/bli_info.h                 |   2 +
 frame/include/bli_config_macro_defs.h |  10 +-
 frame/include/bli_type_defs.h         |   4 +-
 frame/thread/bli_thrcomm.c            |  24 +-
 frame/thread/bli_thrcomm.h            |   8 +
 frame/thread/bli_thrcomm_hpx.cpp      |  92 ++++
 frame/thread/bli_thrcomm_hpx.h        |  48 ++
 frame/thread/bli_thread.c             |  17 +-
 frame/thread/bli_thread.h             |   1 +
 frame/thread/bli_thread_hpx.cpp       |  85 ++++
 frame/thread/bli_thread_hpx.h         |  54 +++
 testsuite/src/test_libblis.c          |  24 +-
 35 files changed, 2648 insertions(+), 2083 deletions(-)
 create mode 100644 frame/thread/bli_thrcomm_hpx.cpp
 create mode 100644 frame/thread/bli_thrcomm_hpx.h
 create mode 100644 frame/thread/bli_thread_hpx.cpp
 create mode 100644 frame/thread/bli_thread_hpx.h

diff --git a/Makefile b/Makefile
index 04cdca421..33641f8c8 100644
--- a/Makefile
+++ b/Makefile
@@ -552,6 +552,16 @@ else
 	@echo "Compiling $$@" $(call get-frame-text-for,$(1))
 	@$(CC) $(call get-frame-cflags-for,$(1)) -c $$< -o $$@
 endif
+
+ifneq ($(findstring hpx,$(THREADING_MODEL)),)
+$(BASE_OBJ_FRAME_PATH)/%.o: $(FRAME_PATH)/%.cpp $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CXX) $(call get-frame-cxxflags-for,$(1)) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-frame-cxxtext-for,$(1))
+	@$(CXX) $(call get-frame-cxxflags-for,$(1)) -c $$< -o $$@
+endif
+endif
 endef
 
 # first argument: a kernel set (name) being targeted (e.g. haswell).
diff --git a/README.md b/README.md
index e0e4238ca..68c937f52 100644
--- a/README.md
+++ b/README.md
@@ -286,7 +286,7 @@ writing complex kernels.
 
  * **Advanced multithreading support.** BLIS allows multiple levels of
 symmetric multithreading for nearly all level-3 operations. (Currently, users
-may choose to obtain parallelism via either OpenMP or POSIX threads). This
+may choose to obtain parallelism via OpenMP, POSIX threads, or HPX). This
 means that matrices may be partitioned in multiple dimensions simultaneously to
 attain scalable, high-performance parallelism on multicore and many-core
 architectures. The key to this innovation is a thread-specific control tree
diff --git a/blastest/src/cblat1.c b/blastest/src/cblat1.c
index 606511662..656294684 100644
--- a/blastest/src/cblat1.c
+++ b/blastest/src/cblat1.c
@@ -68,6 +68,11 @@ static real c_b52 = 0.f;
 /*  ===================================================================== */
 /* Main program */ int main(void)
 {
+#ifdef BLIS_ENABLE_HPX
+    char* program = "cblat1";
+    bli_thread_initialize_hpx( 1, &program );
+#endif
+
     /* Initialized data */
 
     static real sfac = 9.765625e-4f;
@@ -136,7 +141,12 @@ static real c_b52 = 0.f;
     }
     s_stop("", (ftnlen)0);
 
-    return 0;
+#ifdef BLIS_ENABLE_HPX
+    return bli_thread_finalize_hpx();
+#else
+	// Return peacefully.
+	return 0;
+#endif
 } /* main */
 
 /* Subroutine */ int header_(void)
@@ -230,7 +240,7 @@ static real c_b52 = 0.f;
     complex q__1;
 
     /* Builtin functions */
-    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_wsle(void);
     /* Subroutine */ int s_stop(char *, ftnlen);
 
@@ -238,15 +248,15 @@ static real c_b52 = 0.f;
     integer i__;
     complex cx[8];
     integer np1, len;
-    extern /* Subroutine */ int cscal_(integer *, complex *, complex *, 
-	    integer *), ctest_(integer *, complex *, complex *, complex *, 
+    extern /* Subroutine */ int cscal_(integer *, complex *, complex *,
+	    integer *), ctest_(integer *, complex *, complex *, complex *,
 	    real *);
     complex mwpcs[5], mwpct[5];
     extern real scnrm2_(integer *, complex *, integer *);
     extern /* Subroutine */ int itest1_(integer *, integer *), stest1_(real *,
 	     real *, real *, real *);
     extern integer icamax_(integer *, complex *, integer *);
-    extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer 
+    extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer
 	    *);
     extern real scasum_(integer *, complex *, integer *);
 
@@ -465,7 +475,7 @@ static real c_b52 = 0.f;
     complex q__1;
 
     /* Builtin functions */
-    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_wsle(void);
     /* Subroutine */ int s_stop(char *, ftnlen);
 
@@ -481,9 +491,9 @@ static real c_b52 = 0.f;
 #else
 complex cdotc_(
 #endif
- integer *, complex *, integer 
+ integer *, complex *, integer
 	    *, complex *, integer *);
-    extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, 
+    extern /* Subroutine */ int ccopy_(integer *, complex *, integer *,
 	    complex *, integer *);
     extern /* Complex */
 #ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL
@@ -491,13 +501,13 @@ complex cdotc_(
 #else
 complex cdotu_(
 #endif
- integer *, complex *, integer 
+ integer *, complex *, integer
 	    *, complex *, integer *);
-    extern /* Subroutine */ int cswap_(integer *, complex *, integer *, 
-	    complex *, integer *), ctest_(integer *, complex *, complex *, 
+    extern /* Subroutine */ int cswap_(integer *, complex *, integer *,
+	    complex *, integer *), ctest_(integer *, complex *, complex *,
 	    complex *, real *);
     integer ksize;
-    extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, 
+    extern /* Subroutine */ int caxpy_(integer *, complex *, complex *,
 	    integer *, complex *, integer *);
 
     /* Fortran I/O blocks */
@@ -691,7 +701,7 @@ complex cdotu_(
 	sfac)
 {
     real scomp[1], strue[1];
-    extern /* Subroutine */ int stest_(integer *, real *, real *, real *, 
+    extern /* Subroutine */ int stest_(integer *, real *, real *, real *,
 	    real *);
 
 /*     ************************* STEST1 ***************************** */
@@ -733,7 +743,7 @@ real sdiff_(real *sa, real *sb)
     return ret_val;
 } /* sdiff_ */
 
-/* Subroutine */ int ctest_(integer *len, complex *ccomp, complex *ctrue, 
+/* Subroutine */ int ctest_(integer *len, complex *ccomp, complex *ctrue,
 	complex *csize, real *sfac)
 {
     /* System generated locals */
@@ -745,7 +755,7 @@ real sdiff_(real *sa, real *sb)
     /* Local variables */
     integer i__;
     real scomp[20], ssize[20], strue[20];
-    extern /* Subroutine */ int stest_(integer *, real *, real *, real *, 
+    extern /* Subroutine */ int stest_(integer *, real *, real *, real *,
 	    real *);
 
 /*     **************************** CTEST ***************************** */
diff --git a/blastest/src/cblat2.c b/blastest/src/cblat2.c
index 2916a36a4..08d215aee 100644
--- a/blastest/src/cblat2.c
+++ b/blastest/src/cblat2.c
@@ -158,10 +158,15 @@ static logical c_false = FALSE_;
 /*  ===================================================================== */
 /* Main program */ int main(void)
 {
+#ifdef BLIS_ENABLE_HPX
+    char* program = "cblat2";
+    bli_thread_initialize_hpx( 1, &program );
+#endif
+
     /* Initialized data */
 
-    static char snames[6*17] = "CGEMV " "CGBMV " "CHEMV " "CHBMV " "CHPMV " 
-	    "CTRMV " "CTBMV " "CTPMV " "CTRSV " "CTBSV " "CTPSV " "CGERC " 
+    static char snames[6*17] = "CGEMV " "CGBMV " "CHEMV " "CHBMV " "CHPMV "
+	    "CTRMV " "CTBMV " "CTPMV " "CTRSV " "CTBSV " "CTPSV " "CGERC "
 	    "CGERU " "CHER  " "CHPR  " "CHER2 " "CHPR2 ";
 
     /* Format strings */
@@ -209,10 +214,10 @@ static logical c_false = FALSE_;
     cllist cl__1;
 
     /* Builtin functions */
-    integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *,
-	     char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), 
-	    s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, 
+	     char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void),
+	    s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen,
 	    ftnlen);
     /* Subroutine */ int s_stop(char *, ftnlen);
     integer f_clos(cllist *);
@@ -234,42 +239,42 @@ static logical c_false = FALSE_;
     integer ninc, nbet, ntra;
     logical rewi;
     integer nout;
-    extern /* Subroutine */ int cchk1_(char *, real *, real *, integer *, 
-	    integer *, logical *, logical *, logical *, integer *, integer *, 
-	    integer *, integer *, integer *, complex *, integer *, complex *, 
-	    integer *, integer *, integer *, integer *, complex *, complex *, 
-	    complex *, complex *, complex *, complex *, complex *, complex *, 
-	    complex *, complex *, real *, ftnlen), cchk2_(char *, real *, 
-	    real *, integer *, integer *, logical *, logical *, logical *, 
-	    integer *, integer *, integer *, integer *, integer *, complex *, 
-	    integer *, complex *, integer *, integer *, integer *, integer *, 
-	    complex *, complex *, complex *, complex *, complex *, complex *, 
-	    complex *, complex *, complex *, complex *, real *, ftnlen), 
-	    cchk3_(char *, real *, real *, integer *, integer *, logical *, 
-	    logical *, logical *, integer *, integer *, integer *, integer *, 
-	    integer *, integer *, integer *, integer *, complex *, complex *, 
-	    complex *, complex *, complex *, complex *, complex *, real *, 
-	    complex *, ftnlen), cchk4_(char *, real *, real *, integer *, 
-	    integer *, logical *, logical *, logical *, integer *, integer *, 
-	    integer *, complex *, integer *, integer *, integer *, integer *, 
-	    complex *, complex *, complex *, complex *, complex *, complex *, 
-	    complex *, complex *, complex *, complex *, real *, complex *, 
-	    ftnlen), cchk5_(char *, real *, real *, integer *, integer *, 
-	    logical *, logical *, logical *, integer *, integer *, integer *, 
-	    complex *, integer *, integer *, integer *, integer *, complex *, 
-	    complex *, complex *, complex *, complex *, complex *, complex *, 
-	    complex *, complex *, complex *, real *, complex *, ftnlen), 
-	    cchk6_(char *, real *, real *, integer *, integer *, logical *, 
-	    logical *, logical *, integer *, integer *, integer *, complex *, 
-	    integer *, integer *, integer *, integer *, complex *, complex *, 
-	    complex *, complex *, complex *, complex *, complex *, complex *, 
+    extern /* Subroutine */ int cchk1_(char *, real *, real *, integer *,
+	    integer *, logical *, logical *, logical *, integer *, integer *,
+	    integer *, integer *, integer *, complex *, integer *, complex *,
+	    integer *, integer *, integer *, integer *, complex *, complex *,
+	    complex *, complex *, complex *, complex *, complex *, complex *,
+	    complex *, complex *, real *, ftnlen), cchk2_(char *, real *,
+	    real *, integer *, integer *, logical *, logical *, logical *,
+	    integer *, integer *, integer *, integer *, integer *, complex *,
+	    integer *, complex *, integer *, integer *, integer *, integer *,
+	    complex *, complex *, complex *, complex *, complex *, complex *,
+	    complex *, complex *, complex *, complex *, real *, ftnlen),
+	    cchk3_(char *, real *, real *, integer *, integer *, logical *,
+	    logical *, logical *, integer *, integer *, integer *, integer *,
+	    integer *, integer *, integer *, integer *, complex *, complex *,
+	    complex *, complex *, complex *, complex *, complex *, real *,
+	    complex *, ftnlen), cchk4_(char *, real *, real *, integer *,
+	    integer *, logical *, logical *, logical *, integer *, integer *,
+	    integer *, complex *, integer *, integer *, integer *, integer *,
+	    complex *, complex *, complex *, complex *, complex *, complex *,
+	    complex *, complex *, complex *, complex *, real *, complex *,
+	    ftnlen), cchk5_(char *, real *, real *, integer *, integer *,
+	    logical *, logical *, logical *, integer *, integer *, integer *,
+	    complex *, integer *, integer *, integer *, integer *, complex *,
+	    complex *, complex *, complex *, complex *, complex *, complex *,
+	    complex *, complex *, complex *, real *, complex *, ftnlen),
+	    cchk6_(char *, real *, real *, integer *, integer *, logical *,
+	    logical *, logical *, integer *, integer *, integer *, complex *,
+	    integer *, integer *, integer *, integer *, complex *, complex *,
+	    complex *, complex *, complex *, complex *, complex *, complex *,
 	    complex *, complex *, real *, complex *, ftnlen), cchke_(integer *
 	    , char *, integer *, ftnlen);
     logical fatal, trace;
     integer nidim;
     extern /* Subroutine */ int cmvch_(char *, integer *, integer *, complex *
 	    , complex *, integer *, complex *, integer *, complex *, complex *
-	    , integer *, complex *, real *, complex *, real *, real *, 
+	    , integer *, complex *, real *, complex *, real *, real *,
 	    logical *, integer *, logical *, ftnlen);
     char snaps[32], trans[1];
     integer isnum;
@@ -618,7 +623,7 @@ static logical c_false = FALSE_;
 	goto L80;
     }
     for (i__ = 1; i__ <= 17; ++i__) {
-	if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) 
+	if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0)
 		{
 	    goto L70;
 	}
@@ -677,7 +682,7 @@ static logical c_false = FALSE_;
 /*     YY holds the exact result. On exit from CMVCH YT holds */
 /*     the result computed by CMVCH. */
     *(unsigned char *)trans = 'N';
-    cmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c__1, &c_b1, y, &c__1, yt, g, 
+    cmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c__1, &c_b1, y, &c__1, yt, g,
 	    yy, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1);
     same = lce_(yy, yt, &n);
     if (! same || err != 0.f) {
@@ -690,7 +695,7 @@ static logical c_false = FALSE_;
 	s_stop("", (ftnlen)0);
     }
     *(unsigned char *)trans = 'T';
-    cmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c_n1, &c_b1, y, &c_n1, yt, g, 
+    cmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c_n1, &c_b1, y, &c_n1, yt, g,
 	    yy, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1);
     same = lce_(yy, yt, &n);
     if (! same || err != 0.f) {
@@ -751,44 +756,44 @@ static logical c_false = FALSE_;
 /*           Test CGEMV, 01, and CGBMV, 02. */
 L140:
 	    cchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, 
-		    &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, 
+		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf,
+		    &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx,
 		    xs, y, yy, ys, yt, g, (ftnlen)6);
 	    goto L200;
 /*           Test CHEMV, 03, CHBMV, 04, and CHPMV, 05. */
 L150:
 	    cchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, 
-		    &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, 
+		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf,
+		    &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx,
 		    xs, y, yy, ys, yt, g, (ftnlen)6);
 	    goto L200;
 /*           Test CTRMV, 06, CTBMV, 07, CTPMV, 08, */
 /*           CTRSV, 09, CTBSV, 10, and CTPSV, 11. */
 L160:
 	    cchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, 
+		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc,
 		    &c__65, &c__2, a, aa, as, y, yy, ys, yt, g, z__, (ftnlen)
 		    6);
 	    goto L200;
 /*           Test CGERC, 12, CGERU, 13. */
 L170:
 	    cchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, 
-		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc,
+		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt,
 		    g, z__, (ftnlen)6);
 	    goto L200;
 /*           Test CHER, 14, and CHPR, 15. */
 L180:
 	    cchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, 
-		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc,
+		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt,
 		    g, z__, (ftnlen)6);
 	    goto L200;
 /*           Test CHER2, 16, and CHPR2, 17. */
 L190:
 	    cchk6_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, 
-		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc,
+		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt,
 		    g, z__, (ftnlen)6);
 
 L200:
@@ -830,15 +835,20 @@ static logical c_false = FALSE_;
 
 /*     End of CBLAT2. */
 
-    return 0;
+#ifdef BLIS_ENABLE_HPX
+    return bli_thread_finalize_hpx();
+#else
+	// Return peacefully.
+	return 0;
+#endif
 } /* main */
 
 /* Subroutine */ int cchk1_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nkb, integer *kb, integer *
-	nalf, complex *alf, integer *nbet, complex *bet, integer *ninc, 
+	nalf, complex *alf, integer *nbet, complex *bet, integer *ninc,
 	integer *inc, integer *nmax, integer *incmax, complex *a, complex *aa,
-	 complex *as, complex *x, complex *xx, complex *xs, complex *y, 
+	 complex *as, complex *x, complex *xx, complex *xs, complex *y,
 	complex *yy, complex *ys, complex *yt, real *g, ftnlen sname_len)
 {
     /* Initialized data */
@@ -867,7 +877,7 @@ static logical c_false = FALSE_;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, 
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8,
 	    i__9;
     alist al__1;
 
@@ -887,26 +897,26 @@ static logical c_false = FALSE_;
     logical same;
     integer incx, incy;
     logical full, tran, null;
-    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
-	    integer *, complex *, integer *, complex *, integer *, integer *, 
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *,
+	    integer *, complex *, integer *, complex *, integer *, integer *,
 	    integer *, logical *, complex *, ftnlen, ftnlen, ftnlen);
     complex alpha;
     logical isame[13];
     extern /* Subroutine */ int cgbmv_(char *, integer *, integer *, integer *
 	    , integer *, complex *, complex *, integer *, complex *, integer *
-	    , complex *, complex *, integer *, ftnlen), cgemv_(char *, 
-	    integer *, integer *, complex *, complex *, integer *, complex *, 
+	    , complex *, complex *, integer *, ftnlen), cgemv_(char *,
+	    integer *, integer *, complex *, complex *, integer *, complex *,
 	    integer *, complex *, complex *, integer *, ftnlen), cmvch_(char *
 	    , integer *, integer *, complex *, complex *, integer *, complex *
-	    , integer *, complex *, complex *, integer *, complex *, real *, 
-	    complex *, real *, real *, logical *, integer *, logical *, 
+	    , integer *, complex *, complex *, integer *, complex *, real *,
+	    complex *, real *, real *, logical *, integer *, logical *,
 	    ftnlen);
     integer nargs;
     logical reset;
     integer incxs, incys;
     char trans[1];
     logical banded;
-    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+    extern logical lceres_(char *, char *, integer *, integer *, complex *,
 	    complex *, integer *, ftnlen, ftnlen);
     real errmax;
     complex transl;
@@ -1089,9 +1099,9 @@ static logical c_false = FALSE_;
 				    transl.r = 0.f, transl.i = 0.f;
 				    i__7 = abs(incy);
 				    i__8 = ml - 1;
-				    cmake_("GE", " ", " ", &c__1, &ml, &y[1], 
+				    cmake_("GE", " ", " ", &c__1, &ml, &y[1],
 					    &c__1, &yy[1], &i__7, &c__0, &
-					    i__8, &reset, &transl, (ftnlen)2, 
+					    i__8, &reset, &transl, (ftnlen)2,
 					    (ftnlen)1, (ftnlen)1);
 
 				    ++nc;
@@ -1099,7 +1109,7 @@ static logical c_false = FALSE_;
 /*                             Save every datum before calling the */
 /*                             subroutine. */
 
-				    *(unsigned char *)transs = *(unsigned 
+				    *(unsigned char *)transs = *(unsigned
 					    char *)trans;
 				    ms = m;
 				    ns = n;
@@ -1110,7 +1120,7 @@ static logical c_false = FALSE_;
 				    for (i__ = 1; i__ <= i__7; ++i__) {
 					i__8 = i__;
 					i__9 = i__;
-					as[i__8].r = aa[i__9].r, as[i__8].i = 
+					as[i__8].r = aa[i__9].r, as[i__8].i =
 						aa[i__9].i;
 /* L10: */
 				    }
@@ -1119,7 +1129,7 @@ static logical c_false = FALSE_;
 				    for (i__ = 1; i__ <= i__7; ++i__) {
 					i__8 = i__;
 					i__9 = i__;
-					xs[i__8].r = xx[i__9].r, xs[i__8].i = 
+					xs[i__8].r = xx[i__9].r, xs[i__8].i =
 						xx[i__9].i;
 /* L20: */
 				    }
@@ -1129,7 +1139,7 @@ static logical c_false = FALSE_;
 				    for (i__ = 1; i__ <= i__7; ++i__) {
 					i__8 = i__;
 					i__9 = i__;
-					ys[i__8].r = yy[i__9].r, ys[i__8].i = 
+					ys[i__8].r = yy[i__9].r, ys[i__8].i =
 						yy[i__9].i;
 /* L30: */
 				    }
@@ -1166,7 +1176,7 @@ static logical c_false = FALSE_;
 					    al__1.aunit = *ntra;
 					    f_rew(&al__1);
 					}
-					cgemv_(trans, &m, &n, &alpha, &aa[1], 
+					cgemv_(trans, &m, &n, &alpha, &aa[1],
 						&lda, &xx[1], &incx, &beta, &
 						yy[1], &incy, (ftnlen)1);
 				    } else if (banded) {
@@ -1225,7 +1235,7 @@ static logical c_false = FALSE_;
 				    isame[1] = ms == m;
 				    isame[2] = ns == n;
 				    if (full) {
-					isame[3] = als.r == alpha.r && als.i 
+					isame[3] = als.r == alpha.r && als.i
 						== alpha.i;
 					isame[4] = lce_(&as[1], &aa[1], &laa);
 					isame[5] = ldas == lda;
@@ -1247,13 +1257,13 @@ static logical c_false = FALSE_;
 				    } else if (banded) {
 					isame[3] = kls == kl;
 					isame[4] = kus == ku;
-					isame[5] = als.r == alpha.r && als.i 
+					isame[5] = als.r == alpha.r && als.i
 						== alpha.i;
 					isame[6] = lce_(&as[1], &aa[1], &laa);
 					isame[7] = ldas == lda;
 					isame[8] = lce_(&xs[1], &xx[1], &lx);
 					isame[9] = incxs == incx;
-					isame[10] = bls.r == beta.r && bls.i 
+					isame[10] = bls.r == beta.r && bls.i
 						== beta.i;
 					if (null) {
 					    isame[11] = lce_(&ys[1], &yy[1], &
@@ -1295,8 +1305,8 @@ static logical c_false = FALSE_;
 
 					cmvch_(trans, &m, &n, &alpha, &a[
 						a_offset], nmax, &x[1], &incx,
-						 &beta, &y[1], &incy, &yt[1], 
-						&g[1], &yy[1], eps, &err, 
+						 &beta, &y[1], &incy, &yt[1],
+						&g[1], &yy[1], eps, &err,
 						fatal, nout, &c_true, (ftnlen)
 						1);
 					errmax = max(errmax,err);
@@ -1401,11 +1411,11 @@ static logical c_false = FALSE_;
 } /* cchk1_ */
 
 /* Subroutine */ int cchk2_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nkb, integer *kb, integer *
-	nalf, complex *alf, integer *nbet, complex *bet, integer *ninc, 
+	nalf, complex *alf, integer *nbet, complex *bet, integer *ninc,
 	integer *inc, integer *nmax, integer *incmax, complex *a, complex *aa,
-	 complex *as, complex *x, complex *xx, complex *xs, complex *y, 
+	 complex *as, complex *x, complex *xx, complex *xs, complex *y,
 	complex *yy, complex *ys, complex *yt, real *g, ftnlen sname_len)
 {
     /* Initialized data */
@@ -1438,7 +1448,7 @@ static logical c_false = FALSE_;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, 
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8,
 	    i__9;
     alist al__1;
 
@@ -1447,7 +1457,7 @@ static logical c_false = FALSE_;
 	     f_rew(alist *);
 
     /* Local variables */
-    integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, 
+    integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly,
 	    laa, lda;
     extern logical lce_(complex *, complex *, integer *);
     complex als, bls;
@@ -1458,18 +1468,18 @@ static logical c_false = FALSE_;
     integer incx, incy;
     logical full, null;
     char uplo[1];
-    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
-	    integer *, complex *, integer *, complex *, integer *, integer *, 
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *,
+	    integer *, complex *, integer *, complex *, integer *, integer *,
 	    integer *, logical *, complex *, ftnlen, ftnlen, ftnlen);
     complex alpha;
     logical isame[13];
     extern /* Subroutine */ int chbmv_(char *, integer *, integer *, complex *
 	    , complex *, integer *, complex *, integer *, complex *, complex *
-	    , integer *, ftnlen), chemv_(char *, integer *, complex *, 
-	    complex *, integer *, complex *, integer *, complex *, complex *, 
+	    , integer *, ftnlen), chemv_(char *, integer *, complex *,
+	    complex *, integer *, complex *, integer *, complex *, complex *,
 	    integer *, ftnlen), cmvch_(char *, integer *, integer *, complex *
 	    , complex *, integer *, complex *, integer *, complex *, complex *
-	    , integer *, complex *, real *, complex *, real *, real *, 
+	    , integer *, complex *, real *, complex *, real *, real *,
 	    logical *, integer *, logical *, ftnlen);
     integer nargs;
     extern /* Subroutine */ int chpmv_(char *, integer *, complex *, complex *
@@ -1478,7 +1488,7 @@ static logical c_false = FALSE_;
     integer incxs, incys;
     char uplos[1];
     logical banded, packed;
-    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+    extern logical lceres_(char *, char *, integer *, integer *, complex *,
 	    complex *, integer *, ftnlen, ftnlen);
     real errmax;
     complex transl;
@@ -1643,7 +1653,7 @@ static logical c_false = FALSE_;
 				i__8 = n - 1;
 				cmake_("GE", " ", " ", &c__1, &n, &y[1], &
 					c__1, &yy[1], &i__7, &c__0, &i__8, &
-					reset, &transl, (ftnlen)2, (ftnlen)1, 
+					reset, &transl, (ftnlen)2, (ftnlen)1,
 					(ftnlen)1);
 
 				++nc;
@@ -1795,13 +1805,13 @@ static logical c_false = FALSE_;
 					unsigned char *)uplos;
 				isame[1] = ns == n;
 				if (full) {
-				    isame[2] = als.r == alpha.r && als.i == 
+				    isame[2] = als.r == alpha.r && als.i ==
 					    alpha.i;
 				    isame[3] = lce_(&as[1], &aa[1], &laa);
 				    isame[4] = ldas == lda;
 				    isame[5] = lce_(&xs[1], &xx[1], &lx);
 				    isame[6] = incxs == incx;
-				    isame[7] = bls.r == beta.r && bls.i == 
+				    isame[7] = bls.r == beta.r && bls.i ==
 					    beta.i;
 				    if (null) {
 					isame[8] = lce_(&ys[1], &yy[1], &ly);
@@ -1814,13 +1824,13 @@ static logical c_false = FALSE_;
 				    isame[9] = incys == incy;
 				} else if (banded) {
 				    isame[2] = ks == k;
-				    isame[3] = als.r == alpha.r && als.i == 
+				    isame[3] = als.r == alpha.r && als.i ==
 					    alpha.i;
 				    isame[4] = lce_(&as[1], &aa[1], &laa);
 				    isame[5] = ldas == lda;
 				    isame[6] = lce_(&xs[1], &xx[1], &lx);
 				    isame[7] = incxs == incx;
-				    isame[8] = bls.r == beta.r && bls.i == 
+				    isame[8] = bls.r == beta.r && bls.i ==
 					    beta.i;
 				    if (null) {
 					isame[9] = lce_(&ys[1], &yy[1], &ly);
@@ -1832,12 +1842,12 @@ static logical c_false = FALSE_;
 				    }
 				    isame[10] = incys == incy;
 				} else if (packed) {
-				    isame[2] = als.r == alpha.r && als.i == 
+				    isame[2] = als.r == alpha.r && als.i ==
 					    alpha.i;
 				    isame[3] = lce_(&as[1], &aa[1], &laa);
 				    isame[4] = lce_(&xs[1], &xx[1], &lx);
 				    isame[5] = incxs == incx;
-				    isame[6] = bls.r == beta.r && bls.i == 
+				    isame[6] = bls.r == beta.r && bls.i ==
 					    beta.i;
 				    if (null) {
 					isame[7] = lce_(&ys[1], &yy[1], &ly);
@@ -1875,8 +1885,8 @@ static logical c_false = FALSE_;
 
 /*                             Check the result. */
 
-				    cmvch_("N", &n, &n, &alpha, &a[a_offset], 
-					    nmax, &x[1], &incx, &beta, &y[1], 
+				    cmvch_("N", &n, &n, &alpha, &a[a_offset],
+					    nmax, &x[1], &incx, &beta, &y[1],
 					    &incy, &yt[1], &g[1], &yy[1], eps,
 					     &err, fatal, nout, &c_true, (
 					    ftnlen)1);
@@ -1987,10 +1997,10 @@ static logical c_false = FALSE_;
 } /* cchk2_ */
 
 /* Subroutine */ int cchk3_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nkb, integer *kb, integer *
-	ninc, integer *inc, integer *nmax, integer *incmax, complex *a, 
-	complex *aa, complex *as, complex *x, complex *xx, complex *xs, 
+	ninc, integer *inc, integer *nmax, integer *incmax, complex *a,
+	complex *aa, complex *as, complex *x, complex *xx, complex *xs,
 	complex *xt, real *g, complex *z__, ftnlen sname_len)
 {
     /* Initialized data */
@@ -2040,36 +2050,36 @@ static logical c_false = FALSE_;
     integer incx;
     logical full, null;
     char uplo[1];
-    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
-	    integer *, complex *, integer *, complex *, integer *, integer *, 
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *,
+	    integer *, complex *, integer *, complex *, integer *, integer *,
 	    integer *, logical *, complex *, ftnlen, ftnlen, ftnlen);
     char diags[1];
     logical isame[13];
     extern /* Subroutine */ int cmvch_(char *, integer *, integer *, complex *
 	    , complex *, integer *, complex *, integer *, complex *, complex *
-	    , integer *, complex *, real *, complex *, real *, real *, 
+	    , integer *, complex *, real *, complex *, real *, real *,
 	    logical *, integer *, logical *, ftnlen);
     integer nargs;
-    extern /* Subroutine */ int ctbmv_(char *, char *, char *, integer *, 
-	    integer *, complex *, integer *, complex *, integer *, ftnlen, 
-	    ftnlen, ftnlen), ctbsv_(char *, char *, char *, integer *, 
-	    integer *, complex *, integer *, complex *, integer *, ftnlen, 
+    extern /* Subroutine */ int ctbmv_(char *, char *, char *, integer *,
+	    integer *, complex *, integer *, complex *, integer *, ftnlen,
+	    ftnlen, ftnlen), ctbsv_(char *, char *, char *, integer *,
+	    integer *, complex *, integer *, complex *, integer *, ftnlen,
 	    ftnlen, ftnlen);
     logical reset;
     integer incxs;
     char trans[1];
-    extern /* Subroutine */ int ctpmv_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int ctpmv_(char *, char *, char *, integer *,
 	    complex *, complex *, integer *, ftnlen, ftnlen, ftnlen), ctrmv_(
 	    char *, char *, char *, integer *, complex *, integer *, complex *
-	    , integer *, ftnlen, ftnlen, ftnlen), ctpsv_(char *, char *, char 
-	    *, integer *, complex *, complex *, integer *, ftnlen, ftnlen, 
+	    , integer *, ftnlen, ftnlen, ftnlen), ctpsv_(char *, char *, char
+	    *, integer *, complex *, complex *, integer *, ftnlen, ftnlen,
 	    ftnlen);
     char uplos[1];
-    extern /* Subroutine */ int ctrsv_(char *, char *, char *, integer *, 
-	    complex *, integer *, complex *, integer *, ftnlen, ftnlen, 
+    extern /* Subroutine */ int ctrsv_(char *, char *, char *, integer *,
+	    complex *, integer *, complex *, integer *, ftnlen, ftnlen,
 	    ftnlen);
     logical banded, packed;
-    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+    extern logical lceres_(char *, char *, integer *, integer *, complex *,
 	    complex *, integer *, ftnlen, ftnlen);
     real errmax;
     complex transl;
@@ -2197,13 +2207,13 @@ static logical c_false = FALSE_;
 			    ;
 
 		    for (icd = 1; icd <= 2; ++icd) {
-			*(unsigned char *)diag = *(unsigned char *)&ichd[icd 
+			*(unsigned char *)diag = *(unsigned char *)&ichd[icd
 				- 1];
 
 /*                    Generate the matrix A. */
 
 			transl.r = 0.f, transl.i = 0.f;
-			cmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], 
+			cmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset],
 				nmax, &aa[1], &lda, &k, &k, &reset, &transl, (
 				ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -2258,7 +2268,7 @@ static logical c_false = FALSE_;
 
 /*                       Call the subroutine. */
 
-			    if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) 
+			    if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2)
 				    == 0) {
 				if (full) {
 				    if (*trace) {
@@ -2311,7 +2321,7 @@ static logical c_false = FALSE_;
 					al__1.aunit = *ntra;
 					f_rew(&al__1);
 				    }
-				    ctbmv_(uplo, trans, diag, &n, &k, &aa[1], 
+				    ctbmv_(uplo, trans, diag, &n, &k, &aa[1],
 					    &lda, &xx[1], &incx, (ftnlen)1, (
 					    ftnlen)1, (ftnlen)1);
 				} else if (packed) {
@@ -2392,7 +2402,7 @@ static logical c_false = FALSE_;
 					al__1.aunit = *ntra;
 					f_rew(&al__1);
 				    }
-				    ctbsv_(uplo, trans, diag, &n, &k, &aa[1], 
+				    ctbsv_(uplo, trans, diag, &n, &k, &aa[1],
 					    &lda, &xx[1], &incx, (ftnlen)1, (
 					    ftnlen)1, (ftnlen)1);
 				} else if (packed) {
@@ -2434,11 +2444,11 @@ static logical c_false = FALSE_;
 
 /*                       See what data changed inside subroutines. */
 
-			    isame[0] = *(unsigned char *)uplo == *(unsigned 
+			    isame[0] = *(unsigned char *)uplo == *(unsigned
 				    char *)uplos;
-			    isame[1] = *(unsigned char *)trans == *(unsigned 
+			    isame[1] = *(unsigned char *)trans == *(unsigned
 				    char *)transs;
-			    isame[2] = *(unsigned char *)diag == *(unsigned 
+			    isame[2] = *(unsigned char *)diag == *(unsigned
 				    char *)diags;
 			    isame[3] = ns == n;
 			    if (full) {
@@ -2508,7 +2518,7 @@ static logical c_false = FALSE_;
 
 				    cmvch_(trans, &n, &n, &c_b2, &a[a_offset],
 					     nmax, &x[1], &incx, &c_b1, &z__[
-					    1], &incx, &xt[1], &g[1], &xx[1], 
+					    1], &incx, &xt[1], &g[1], &xx[1],
 					    eps, &err, fatal, nout, &c_true, (
 					    ftnlen)1);
 				} else if (s_cmp(sname + 3, "SV", (ftnlen)2, (
@@ -2520,18 +2530,18 @@ static logical c_false = FALSE_;
 				    for (i__ = 1; i__ <= i__4; ++i__) {
 					i__5 = i__;
 					i__6 = (i__ - 1) * abs(incx) + 1;
-					z__[i__5].r = xx[i__6].r, z__[i__5].i 
+					z__[i__5].r = xx[i__6].r, z__[i__5].i
 						= xx[i__6].i;
 					i__5 = (i__ - 1) * abs(incx) + 1;
 					i__6 = i__;
-					xx[i__5].r = x[i__6].r, xx[i__5].i = 
+					xx[i__5].r = x[i__6].r, xx[i__5].i =
 						x[i__6].i;
 /* L50: */
 				    }
 				    cmvch_(trans, &n, &n, &c_b2, &a[a_offset],
 					     nmax, &z__[1], &incx, &c_b1, &x[
-					    1], &incx, &xt[1], &g[1], &xx[1], 
-					    eps, &err, fatal, nout, &c_false, 
+					    1], &incx, &xt[1], &g[1], &xx[1],
+					    eps, &err, fatal, nout, &c_false,
 					    (ftnlen)1);
 				}
 				errmax = max(errmax,err);
@@ -2634,10 +2644,10 @@ static logical c_false = FALSE_;
 } /* cchk3_ */
 
 /* Subroutine */ int cchk4_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
-	ninc, integer *inc, integer *nmax, integer *incmax, complex *a, 
-	complex *aa, complex *as, complex *x, complex *xx, complex *xs, 
+	ninc, integer *inc, integer *nmax, integer *incmax, complex *a,
+	complex *aa, complex *as, complex *x, complex *xx, complex *xs,
 	complex *y, complex *yy, complex *ys, complex *yt, real *g, complex *
 	z__, ftnlen sname_len)
 {
@@ -2681,23 +2691,23 @@ static logical c_false = FALSE_;
     logical same, conj;
     integer incx, incy;
     logical null;
-    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
-	    integer *, complex *, integer *, complex *, integer *, integer *, 
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *,
+	    integer *, complex *, integer *, complex *, integer *, integer *,
 	    integer *, logical *, complex *, ftnlen, ftnlen, ftnlen), cgerc_(
-	    integer *, integer *, complex *, complex *, integer *, complex *, 
+	    integer *, integer *, complex *, complex *, integer *, complex *,
 	    integer *, complex *, integer *);
     complex alpha;
     logical isame[13];
     extern /* Subroutine */ int cmvch_(char *, integer *, integer *, complex *
 	    , complex *, integer *, complex *, integer *, complex *, complex *
-	    , integer *, complex *, real *, complex *, real *, real *, 
-	    logical *, integer *, logical *, ftnlen), cgeru_(integer *, 
-	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    , integer *, complex *, real *, complex *, real *, real *,
+	    logical *, integer *, logical *, ftnlen), cgeru_(integer *,
+	    integer *, complex *, complex *, integer *, complex *, integer *,
 	    complex *, integer *);
     integer nargs;
     logical reset;
     integer incxs, incys;
-    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+    extern logical lceres_(char *, char *, integer *, integer *, complex *,
 	    complex *, integer *, ftnlen, ftnlen);
     real errmax;
     complex transl;
@@ -2801,7 +2811,7 @@ static logical c_false = FALSE_;
 		i__3 = abs(incx);
 		i__4 = m - 1;
 		cmake_("GE", " ", " ", &c__1, &m, &x[1], &c__1, &xx[1], &i__3,
-			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, 
+			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1,
 			(ftnlen)1);
 		if (m > 1) {
 		    i__3 = m / 2;
@@ -2840,7 +2850,7 @@ static logical c_false = FALSE_;
 			transl.r = 0.f, transl.i = 0.f;
 			i__5 = m - 1;
 			i__6 = n - 1;
-			cmake_(sname + 1, " ", " ", &m, &n, &a[a_offset], 
+			cmake_(sname + 1, " ", " ", &m, &n, &a[a_offset],
 				nmax, &aa[1], &lda, &i__5, &i__6, &reset, &
 				transl, (ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -2999,9 +3009,9 @@ static logical c_false = FALSE_;
 				    r_cnjg(&q__1, w);
 				    w[0].r = q__1.r, w[0].i = q__1.i;
 				}
-				cmvch_("N", &m, &c__1, &alpha, &z__[1], nmax, 
+				cmvch_("N", &m, &c__1, &alpha, &z__[1], nmax,
 					w, &c__1, &c_b2, &a[j * a_dim1 + 1], &
-					c__1, &yt[1], &g[1], &aa[(j - 1) * 
+					c__1, &yt[1], &g[1], &aa[(j - 1) *
 					lda + 1], eps, &err, fatal, nout, &
 					c_true, (ftnlen)1);
 				errmax = max(errmax,err);
@@ -3082,10 +3092,10 @@ static logical c_false = FALSE_;
 } /* cchk4_ */
 
 /* Subroutine */ int cchk5_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
-	ninc, integer *inc, integer *nmax, integer *incmax, complex *a, 
-	complex *aa, complex *as, complex *x, complex *xx, complex *xs, 
+	ninc, integer *inc, integer *nmax, integer *incmax, complex *a,
+	complex *aa, complex *as, complex *x, complex *xx, complex *xs,
 	complex *y, complex *yy, complex *ys, complex *yt, real *g, complex *
 	z__, ftnlen sname_len)
 {
@@ -3130,24 +3140,24 @@ static logical c_false = FALSE_;
     integer ia, ja, ic, nc, jj, lj, in, ix, ns, lx, laa, lda;
     extern logical lce_(complex *, complex *, integer *);
     real err;
-    extern /* Subroutine */ int cher_(char *, integer *, real *, complex *, 
+    extern /* Subroutine */ int cher_(char *, integer *, real *, complex *,
 	    integer *, complex *, integer *, ftnlen);
     integer ldas;
     logical same;
-    extern /* Subroutine */ int chpr_(char *, integer *, real *, complex *, 
+    extern /* Subroutine */ int chpr_(char *, integer *, real *, complex *,
 	    integer *, complex *, ftnlen);
     real rals;
     integer incx;
     logical full, null;
     char uplo[1];
-    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
-	    integer *, complex *, integer *, complex *, integer *, integer *, 
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *,
+	    integer *, complex *, integer *, complex *, integer *, integer *,
 	    integer *, logical *, complex *, ftnlen, ftnlen, ftnlen);
     complex alpha;
     logical isame[13];
     extern /* Subroutine */ int cmvch_(char *, integer *, integer *, complex *
 	    , complex *, integer *, complex *, integer *, complex *, complex *
-	    , integer *, complex *, real *, complex *, real *, real *, 
+	    , integer *, complex *, real *, complex *, real *, real *,
 	    logical *, integer *, logical *, ftnlen);
     integer nargs;
     logical reset;
@@ -3156,7 +3166,7 @@ static logical c_false = FALSE_;
     char uplos[1];
     logical packed;
     real ralpha;
-    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+    extern logical lceres_(char *, char *, integer *, integer *, complex *,
 	    complex *, integer *, ftnlen, ftnlen);
     real errmax;
     complex transl;
@@ -3261,7 +3271,7 @@ static logical c_false = FALSE_;
 		i__3 = abs(incx);
 		i__4 = n - 1;
 		cmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3,
-			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, 
+			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1,
 			(ftnlen)1);
 		if (n > 1) {
 		    i__3 = n / 2;
@@ -3336,7 +3346,7 @@ static logical c_false = FALSE_;
 			    al__1.aunit = *ntra;
 			    f_rew(&al__1);
 			}
-			cher_(uplo, &n, &ralpha, &xx[1], &incx, &aa[1], &lda, 
+			cher_(uplo, &n, &ralpha, &xx[1], &incx, &aa[1], &lda,
 				(ftnlen)1);
 		    } else if (packed) {
 			if (*trace) {
@@ -3446,9 +3456,9 @@ static logical c_false = FALSE_;
 				jj = j;
 				lj = n - j + 1;
 			    }
-			    cmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, 
-				    &c__1, &c_b2, &a[jj + j * a_dim1], &c__1, 
-				    &yt[1], &g[1], &aa[ja], eps, &err, fatal, 
+			    cmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w,
+				    &c__1, &c_b2, &a[jj + j * a_dim1], &c__1,
+				    &yt[1], &g[1], &aa[ja], eps, &err, fatal,
 				    nout, &c_true, (ftnlen)1);
 			    if (full) {
 				if (upper) {
@@ -3547,10 +3557,10 @@ static logical c_false = FALSE_;
 } /* cchk5_ */
 
 /* Subroutine */ int cchk6_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
-	ninc, integer *inc, integer *nmax, integer *incmax, complex *a, 
-	complex *aa, complex *as, complex *x, complex *xx, complex *xs, 
+	ninc, integer *inc, integer *nmax, integer *incmax, complex *a,
+	complex *aa, complex *as, complex *x, complex *xx, complex *xs,
 	complex *y, complex *yy, complex *ys, complex *yt, real *g, complex *
 	z__, ftnlen sname_len)
 {
@@ -3580,7 +3590,7 @@ static logical c_false = FALSE_;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, 
+    integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5,
 	    i__6, i__7;
     complex q__1, q__2, q__3;
     alist al__1;
@@ -3603,17 +3613,17 @@ static logical c_false = FALSE_;
     logical full, null;
     char uplo[1];
     extern /* Subroutine */ int cher2_(char *, integer *, complex *, complex *
-	    , integer *, complex *, integer *, complex *, integer *, ftnlen), 
-	    chpr2_(char *, integer *, complex *, complex *, integer *, 
-	    complex *, integer *, complex *, ftnlen), cmake_(char *, char *, 
-	    char *, integer *, integer *, complex *, integer *, complex *, 
-	    integer *, integer *, integer *, logical *, complex *, ftnlen, 
+	    , integer *, complex *, integer *, complex *, integer *, ftnlen),
+	    chpr2_(char *, integer *, complex *, complex *, integer *,
+	    complex *, integer *, complex *, ftnlen), cmake_(char *, char *,
+	    char *, integer *, integer *, complex *, integer *, complex *,
+	    integer *, integer *, integer *, logical *, complex *, ftnlen,
 	    ftnlen, ftnlen);
     complex alpha;
     logical isame[13];
     extern /* Subroutine */ int cmvch_(char *, integer *, integer *, complex *
 	    , complex *, integer *, complex *, integer *, complex *, complex *
-	    , integer *, complex *, real *, complex *, real *, real *, 
+	    , integer *, complex *, real *, complex *, real *, real *,
 	    logical *, integer *, logical *, ftnlen);
     integer nargs;
     logical reset;
@@ -3621,7 +3631,7 @@ static logical c_false = FALSE_;
     logical upper;
     char uplos[1];
     logical packed;
-    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+    extern logical lceres_(char *, char *, integer *, integer *, complex *,
 	    complex *, integer *, ftnlen, ftnlen);
     real errmax;
     complex transl;
@@ -3728,7 +3738,7 @@ static logical c_false = FALSE_;
 		i__3 = abs(incx);
 		i__4 = n - 1;
 		cmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3,
-			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, 
+			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1,
 			(ftnlen)1);
 		if (n > 1) {
 		    i__3 = n / 2;
@@ -3768,7 +3778,7 @@ static logical c_false = FALSE_;
 			transl.r = 0.f, transl.i = 0.f;
 			i__5 = n - 1;
 			i__6 = n - 1;
-			cmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], 
+			cmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset],
 				nmax, &aa[1], &lda, &i__5, &i__6, &reset, &
 				transl, (ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -3956,14 +3966,14 @@ static logical c_false = FALSE_;
 			    i__5 = n;
 			    for (j = 1; j <= i__5; ++j) {
 				r_cnjg(&q__2, &z__[j + (z_dim1 << 1)]);
-				q__1.r = alpha.r * q__2.r - alpha.i * q__2.i, 
-					q__1.i = alpha.r * q__2.i + alpha.i * 
+				q__1.r = alpha.r * q__2.r - alpha.i * q__2.i,
+					q__1.i = alpha.r * q__2.i + alpha.i *
 					q__2.r;
 				w[0].r = q__1.r, w[0].i = q__1.i;
 				r_cnjg(&q__2, &alpha);
 				r_cnjg(&q__3, &z__[j + z_dim1]);
-				q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, 
-					q__1.i = q__2.r * q__3.i + q__2.i * 
+				q__1.r = q__2.r * q__3.r - q__2.i * q__3.i,
+					q__1.i = q__2.r * q__3.i + q__2.i *
 					q__3.r;
 				w[1].r = q__1.r, w[1].i = q__1.i;
 				if (upper) {
@@ -3973,8 +3983,8 @@ static logical c_false = FALSE_;
 				    jj = j;
 				    lj = n - j + 1;
 				}
-				cmvch_("N", &lj, &c__2, &c_b2, &z__[jj + 
-					z_dim1], nmax, w, &c__1, &c_b2, &a[jj 
+				cmvch_("N", &lj, &c__2, &c_b2, &z__[jj +
+					z_dim1], nmax, w, &c__1, &c_b2, &a[jj
 					+ j * a_dim1], &c__1, &yt[1], &g[1], &
 					aa[ja], eps, &err, fatal, nout, &
 					c_true, (ftnlen)1);
@@ -4079,7 +4089,7 @@ static logical c_false = FALSE_;
 
 } /* cchk6_ */
 
-/* Subroutine */ int cchke_(integer *isnum, char *srnamt, integer *nout, 
+/* Subroutine */ int cchke_(integer *isnum, char *srnamt, integer *nout,
 	ftnlen srnamt_len)
 {
     /* Format strings */
@@ -4093,40 +4103,40 @@ static logical c_false = FALSE_;
 
     /* Local variables */
     complex a[1]	/* was [1][1] */, x[1], y[1], beta;
-    extern /* Subroutine */ int cher_(char *, integer *, real *, complex *, 
+    extern /* Subroutine */ int cher_(char *, integer *, real *, complex *,
 	    integer *, complex *, integer *, ftnlen), chpr_(char *, integer *,
-	     real *, complex *, integer *, complex *, ftnlen), cher2_(char *, 
-	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	     real *, complex *, integer *, complex *, ftnlen), cher2_(char *,
+	    integer *, complex *, complex *, integer *, complex *, integer *,
 	    complex *, integer *, ftnlen), chpr2_(char *, integer *, complex *
-	    , complex *, integer *, complex *, integer *, complex *, ftnlen), 
-	    cgerc_(integer *, integer *, complex *, complex *, integer *, 
+	    , complex *, integer *, complex *, integer *, complex *, ftnlen),
+	    cgerc_(integer *, integer *, complex *, complex *, integer *,
 	    complex *, integer *, complex *, integer *);
     complex alpha;
     extern /* Subroutine */ int cgbmv_(char *, integer *, integer *, integer *
 	    , integer *, complex *, complex *, integer *, complex *, integer *
-	    , complex *, complex *, integer *, ftnlen), chbmv_(char *, 
-	    integer *, integer *, complex *, complex *, integer *, complex *, 
+	    , complex *, complex *, integer *, ftnlen), chbmv_(char *,
+	    integer *, integer *, complex *, complex *, integer *, complex *,
 	    integer *, complex *, complex *, integer *, ftnlen), cgemv_(char *
 	    , integer *, integer *, complex *, complex *, integer *, complex *
 	    , integer *, complex *, complex *, integer *, ftnlen), chemv_(
-	    char *, integer *, complex *, complex *, integer *, complex *, 
+	    char *, integer *, complex *, complex *, integer *, complex *,
 	    integer *, complex *, complex *, integer *, ftnlen), cgeru_(
-	    integer *, integer *, complex *, complex *, integer *, complex *, 
-	    integer *, complex *, integer *), ctbmv_(char *, char *, char *, 
-	    integer *, integer *, complex *, integer *, complex *, integer *, 
-	    ftnlen, ftnlen, ftnlen), chpmv_(char *, integer *, complex *, 
-	    complex *, complex *, integer *, complex *, complex *, integer *, 
-	    ftnlen), ctbsv_(char *, char *, char *, integer *, integer *, 
-	    complex *, integer *, complex *, integer *, ftnlen, ftnlen, 
-	    ftnlen), ctpmv_(char *, char *, char *, integer *, complex *, 
-	    complex *, integer *, ftnlen, ftnlen, ftnlen), ctrmv_(char *, 
-	    char *, char *, integer *, complex *, integer *, complex *, 
+	    integer *, integer *, complex *, complex *, integer *, complex *,
+	    integer *, complex *, integer *), ctbmv_(char *, char *, char *,
+	    integer *, integer *, complex *, integer *, complex *, integer *,
+	    ftnlen, ftnlen, ftnlen), chpmv_(char *, integer *, complex *,
+	    complex *, complex *, integer *, complex *, complex *, integer *,
+	    ftnlen), ctbsv_(char *, char *, char *, integer *, integer *,
+	    complex *, integer *, complex *, integer *, ftnlen, ftnlen,
+	    ftnlen), ctpmv_(char *, char *, char *, integer *, complex *,
+	    complex *, integer *, ftnlen, ftnlen, ftnlen), ctrmv_(char *,
+	    char *, char *, integer *, complex *, integer *, complex *,
 	    integer *, ftnlen, ftnlen, ftnlen), ctpsv_(char *, char *, char *,
-	     integer *, complex *, complex *, integer *, ftnlen, ftnlen, 
-	    ftnlen), ctrsv_(char *, char *, char *, integer *, complex *, 
+	     integer *, complex *, complex *, integer *, ftnlen, ftnlen,
+	    ftnlen), ctrsv_(char *, char *, char *, integer *, complex *,
 	    integer *, complex *, integer *, ftnlen, ftnlen, ftnlen);
     real ralpha;
-    extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical 
+    extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical
 	    *, logical *, ftnlen);
 
     /* Fortran I/O blocks */
@@ -4655,9 +4665,9 @@ static logical c_false = FALSE_;
 
 } /* cchke_ */
 
-/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m, 
-	integer *n, complex *a, integer *nmax, complex *aa, integer *lda, 
-	integer *kl, integer *ku, logical *reset, complex *transl, ftnlen 
+/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m,
+	integer *n, complex *a, integer *nmax, complex *aa, integer *lda,
+	integer *kl, integer *ku, logical *reset, complex *transl, ftnlen
 	type_len, ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
@@ -4718,7 +4728,7 @@ static logical c_false = FALSE_;
 	i__2 = *m;
 	for (i__ = 1; i__ <= i__2; ++i__) {
 	    if (gen || upper && i__ <= j || lower && i__ >= j) {
-		if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) 
+		if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl)
 			{
 		    i__3 = i__ + j * a_dim1;
 		    cbeg_(&q__2, reset);
@@ -4953,8 +4963,8 @@ static logical c_false = FALSE_;
 
 /* Subroutine */ int cmvch_(char *trans, integer *m, integer *n, complex *
 	alpha, complex *a, integer *nmax, complex *x, integer *incx, complex *
-	beta, complex *y, integer *incy, complex *yt, real *g, complex *yy, 
-	real *eps, real *err, logical *fatal, integer *nout, logical *mv, 
+	beta, complex *y, integer *incy, complex *yt, real *g, complex *yy,
+	real *eps, real *err, logical *fatal, integer *nout, logical *mv,
 	ftnlen trans_len)
 {
     /* Format strings */
@@ -5057,15 +5067,15 @@ static logical c_false = FALSE_;
 		i__4 = iy;
 		i__5 = j + i__ * a_dim1;
 		i__6 = jx;
-		q__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, 
+		q__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i,
 			q__2.i = a[i__5].r * x[i__6].i + a[i__5].i * x[i__6]
 			.r;
 		q__1.r = yt[i__4].r + q__2.r, q__1.i = yt[i__4].i + q__2.i;
 		yt[i__3].r = q__1.r, yt[i__3].i = q__1.i;
 		i__3 = j + i__ * a_dim1;
 		i__4 = jx;
-		g[iy] += ((r__1 = a[i__3].r, abs(r__1)) + (r__2 = r_imag(&a[j 
-			+ i__ * a_dim1]), abs(r__2))) * ((r__3 = x[i__4].r, 
+		g[iy] += ((r__1 = a[i__3].r, abs(r__1)) + (r__2 = r_imag(&a[j
+			+ i__ * a_dim1]), abs(r__2))) * ((r__3 = x[i__4].r,
 			abs(r__3)) + (r__4 = r_imag(&x[jx]), abs(r__4)));
 		jx += incxl;
 /* L10: */
@@ -5077,14 +5087,14 @@ static logical c_false = FALSE_;
 		i__4 = iy;
 		r_cnjg(&q__3, &a[j + i__ * a_dim1]);
 		i__5 = jx;
-		q__2.r = q__3.r * x[i__5].r - q__3.i * x[i__5].i, q__2.i = 
+		q__2.r = q__3.r * x[i__5].r - q__3.i * x[i__5].i, q__2.i =
 			q__3.r * x[i__5].i + q__3.i * x[i__5].r;
 		q__1.r = yt[i__4].r + q__2.r, q__1.i = yt[i__4].i + q__2.i;
 		yt[i__3].r = q__1.r, yt[i__3].i = q__1.i;
 		i__3 = j + i__ * a_dim1;
 		i__4 = jx;
-		g[iy] += ((r__1 = a[i__3].r, abs(r__1)) + (r__2 = r_imag(&a[j 
-			+ i__ * a_dim1]), abs(r__2))) * ((r__3 = x[i__4].r, 
+		g[iy] += ((r__1 = a[i__3].r, abs(r__1)) + (r__2 = r_imag(&a[j
+			+ i__ * a_dim1]), abs(r__2))) * ((r__3 = x[i__4].r,
 			abs(r__3)) + (r__4 = r_imag(&x[jx]), abs(r__4)));
 		jx += incxl;
 /* L20: */
@@ -5096,7 +5106,7 @@ static logical c_false = FALSE_;
 		i__4 = iy;
 		i__5 = i__ + j * a_dim1;
 		i__6 = jx;
-		q__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, 
+		q__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i,
 			q__2.i = a[i__5].r * x[i__6].i + a[i__5].i * x[i__6]
 			.r;
 		q__1.r = yt[i__4].r + q__2.r, q__1.i = yt[i__4].i + q__2.i;
@@ -5104,7 +5114,7 @@ static logical c_false = FALSE_;
 		i__3 = i__ + j * a_dim1;
 		i__4 = jx;
 		g[iy] += ((r__1 = a[i__3].r, abs(r__1)) + (r__2 = r_imag(&a[
-			i__ + j * a_dim1]), abs(r__2))) * ((r__3 = x[i__4].r, 
+			i__ + j * a_dim1]), abs(r__2))) * ((r__3 = x[i__4].r,
 			abs(r__3)) + (r__4 = r_imag(&x[jx]), abs(r__4)));
 		jx += incxl;
 /* L30: */
@@ -5112,7 +5122,7 @@ static logical c_false = FALSE_;
 	}
 	i__2 = iy;
 	i__3 = iy;
-	q__2.r = alpha->r * yt[i__3].r - alpha->i * yt[i__3].i, q__2.i = 
+	q__2.r = alpha->r * yt[i__3].r - alpha->i * yt[i__3].i, q__2.i =
 		alpha->r * yt[i__3].i + alpha->i * yt[i__3].r;
 	i__4 = iy;
 	q__3.r = beta->r * y[i__4].r - beta->i * y[i__4].i, q__3.i = beta->r *
@@ -5121,7 +5131,7 @@ static logical c_false = FALSE_;
 	yt[i__2].r = q__1.r, yt[i__2].i = q__1.i;
 	i__2 = iy;
 	g[iy] = ((r__1 = alpha->r, abs(r__1)) + (r__2 = r_imag(alpha), abs(
-		r__2))) * g[iy] + ((r__3 = beta->r, abs(r__3)) + (r__4 = 
+		r__2))) * g[iy] + ((r__3 = beta->r, abs(r__3)) + (r__4 =
 		r_imag(beta), abs(r__4))) * ((r__5 = y[i__2].r, abs(r__5)) + (
 		r__6 = r_imag(&y[iy]), abs(r__6)));
 	iy += incyl;
@@ -5410,7 +5420,7 @@ real sdiff_(real *x, real *y)
 
 } /* sdiff_ */
 
-/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, 
+/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout,
 	logical *lerr, logical *ok, ftnlen srnamt_len)
 {
     /* Format strings */
diff --git a/blastest/src/cblat3.c b/blastest/src/cblat3.c
index a5b870f0f..e3d5e32a3 100644
--- a/blastest/src/cblat3.c
+++ b/blastest/src/cblat3.c
@@ -140,9 +140,14 @@ static integer c_n1 = -1;
 /*  ===================================================================== */
 /* Main program */ int main(void)
 {
+#ifdef BLIS_ENABLE_HPX
+    char* program = "cblat3";
+    bli_thread_initialize_hpx( 1, &program );
+#endif
+
     /* Initialized data */
 
-    static char snames[6*9] = "CGEMM " "CHEMM " "CSYMM " "CTRMM " "CTRSM " 
+    static char snames[6*9] = "CGEMM " "CHEMM " "CSYMM " "CTRMM " "CTRSM "
 	    "CHERK " "CSYRK " "CHER2K" "CSYR2K";
 
     /* Format strings */
@@ -186,10 +191,10 @@ static integer c_n1 = -1;
     cllist cl__1;
 
     /* Builtin functions */
-    integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *,
-	     char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), 
-	    s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, 
+	     char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void),
+	    s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen,
 	    ftnlen);
     /* Subroutine */ int s_stop(char *, ftnlen);
     integer f_clos(cllist *);
@@ -209,34 +214,34 @@ static integer c_n1 = -1;
     integer nbet, ntra;
     logical rewi;
     integer nout;
-    extern /* Subroutine */ int cchk1_(char *, real *, real *, integer *, 
-	    integer *, logical *, logical *, logical *, integer *, integer *, 
-	    integer *, complex *, integer *, complex *, integer *, complex *, 
-	    complex *, complex *, complex *, complex *, complex *, complex *, 
-	    complex *, complex *, complex *, real *, ftnlen), cchk2_(char *, 
-	    real *, real *, integer *, integer *, logical *, logical *, 
-	    logical *, integer *, integer *, integer *, complex *, integer *, 
-	    complex *, integer *, complex *, complex *, complex *, complex *, 
-	    complex *, complex *, complex *, complex *, complex *, complex *, 
-	    real *, ftnlen), cchk3_(char *, real *, real *, integer *, 
-	    integer *, logical *, logical *, logical *, integer *, integer *, 
-	    integer *, complex *, integer *, complex *, complex *, complex *, 
-	    complex *, complex *, complex *, complex *, real *, complex *, 
-	    ftnlen), cchk4_(char *, real *, real *, integer *, integer *, 
-	    logical *, logical *, logical *, integer *, integer *, integer *, 
-	    complex *, integer *, complex *, integer *, complex *, complex *, 
-	    complex *, complex *, complex *, complex *, complex *, complex *, 
-	    complex *, complex *, real *, ftnlen), cchk5_(char *, real *, 
-	    real *, integer *, integer *, logical *, logical *, logical *, 
-	    integer *, integer *, integer *, complex *, integer *, complex *, 
-	    integer *, complex *, complex *, complex *, complex *, complex *, 
-	    complex *, complex *, complex *, complex *, real *, complex *, 
+    extern /* Subroutine */ int cchk1_(char *, real *, real *, integer *,
+	    integer *, logical *, logical *, logical *, integer *, integer *,
+	    integer *, complex *, integer *, complex *, integer *, complex *,
+	    complex *, complex *, complex *, complex *, complex *, complex *,
+	    complex *, complex *, complex *, real *, ftnlen), cchk2_(char *,
+	    real *, real *, integer *, integer *, logical *, logical *,
+	    logical *, integer *, integer *, integer *, complex *, integer *,
+	    complex *, integer *, complex *, complex *, complex *, complex *,
+	    complex *, complex *, complex *, complex *, complex *, complex *,
+	    real *, ftnlen), cchk3_(char *, real *, real *, integer *,
+	    integer *, logical *, logical *, logical *, integer *, integer *,
+	    integer *, complex *, integer *, complex *, complex *, complex *,
+	    complex *, complex *, complex *, complex *, real *, complex *,
+	    ftnlen), cchk4_(char *, real *, real *, integer *, integer *,
+	    logical *, logical *, logical *, integer *, integer *, integer *,
+	    complex *, integer *, complex *, integer *, complex *, complex *,
+	    complex *, complex *, complex *, complex *, complex *, complex *,
+	    complex *, complex *, real *, ftnlen), cchk5_(char *, real *,
+	    real *, integer *, integer *, logical *, logical *, logical *,
+	    integer *, integer *, integer *, complex *, integer *, complex *,
+	    integer *, complex *, complex *, complex *, complex *, complex *,
+	    complex *, complex *, complex *, complex *, real *, complex *,
 	    ftnlen), cchke_(integer *, char *, integer *, ftnlen);
     logical fatal;
-    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
-	    integer *, complex *, complex *, integer *, complex *, integer *, 
-	    complex *, complex *, integer *, complex *, real *, complex *, 
-	    integer *, real *, real *, logical *, integer *, logical *, 
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *,
+	    integer *, complex *, complex *, integer *, complex *, integer *,
+	    complex *, complex *, integer *, complex *, real *, complex *,
+	    integer *, real *, real *, logical *, integer *, logical *,
 	    ftnlen, ftnlen);
     logical trace;
     integer nidim;
@@ -508,7 +513,7 @@ static integer c_n1 = -1;
 	goto L60;
     }
     for (i__ = 1; i__ <= 9; ++i__) {
-	if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) 
+	if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0)
 		{
 	    goto L50;
 	}
@@ -571,7 +576,7 @@ static integer c_n1 = -1;
     *(unsigned char *)transa = 'N';
     *(unsigned char *)transb = 'N';
     cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
-	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal,
 	    &nout, &c_true, (ftnlen)1, (ftnlen)1);
     same = lce_(cc, ct, &n);
     if (! same || err != 0.f) {
@@ -586,7 +591,7 @@ static integer c_n1 = -1;
     }
     *(unsigned char *)transb = 'C';
     cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
-	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal,
 	    &nout, &c_true, (ftnlen)1, (ftnlen)1);
     same = lce_(cc, ct, &n);
     if (! same || err != 0.f) {
@@ -619,7 +624,7 @@ static integer c_n1 = -1;
     *(unsigned char *)transa = 'C';
     *(unsigned char *)transb = 'N';
     cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
-	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal,
 	    &nout, &c_true, (ftnlen)1, (ftnlen)1);
     same = lce_(cc, ct, &n);
     if (! same || err != 0.f) {
@@ -634,7 +639,7 @@ static integer c_n1 = -1;
     }
     *(unsigned char *)transb = 'C';
     cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
-	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal,
 	    &nout, &c_true, (ftnlen)1, (ftnlen)1);
     same = lce_(cc, ct, &n);
     if (! same || err != 0.f) {
@@ -688,34 +693,34 @@ static integer c_n1 = -1;
 /*           Test CGEMM, 01. */
 L140:
 	    cchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, 
-		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet,
+		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs,
 		    ct, g, (ftnlen)6);
 	    goto L190;
 /*           Test CHEMM, 02, CSYMM, 03. */
 L150:
 	    cchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, 
-		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet,
+		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs,
 		    ct, g, (ftnlen)6);
 	    goto L190;
 /*           Test CTRMM, 04, CTRSM, 05. */
 L160:
 	    cchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65,
 		    ab, aa, as, &ab[4225], bb, bs, ct, g, c__, (ftnlen)6);
 	    goto L190;
 /*           Test CHERK, 06, CSYRK, 07. */
 L170:
 	    cchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, 
-		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet,
+		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs,
 		    ct, g, (ftnlen)6);
 	    goto L190;
 /*           Test CHER2K, 08, CSYR2K, 09. */
 L180:
 	    cchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet,
 		    bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, ct, g, w, (
 		    ftnlen)6);
 	    goto L190;
@@ -759,14 +764,19 @@ static integer c_n1 = -1;
 
 /*     End of CBLAT3. */
 
-    return 0;
+#ifdef BLIS_ENABLE_HPX
+    return bli_thread_finalize_hpx();
+#else
+	// Return peacefully.
+	return 0;
+#endif
 } /* main */
 
 /* Subroutine */ int cchk1_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
 	nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex *
-	as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, 
+	as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc,
 	complex *cs, complex *ct, real *g, ftnlen sname_len)
 {
     /* Initialized data */
@@ -791,7 +801,7 @@ static integer c_n1 = -1;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4, i__5, i__6, i__7, i__8;
     alist al__1;
 
@@ -800,7 +810,7 @@ static integer c_n1 = -1;
 	     f_rew(alist *);
 
     /* Local variables */
-    integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, 
+    integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns,
 	    ica, icb, laa, lbb, lda, lcc, ldb, ldc;
     extern logical lce_(complex *, complex *, integer *);
     complex als, bls;
@@ -808,21 +818,21 @@ static integer c_n1 = -1;
     complex beta;
     integer ldas, ldbs, ldcs;
     logical same, null;
-    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
-	    integer *, complex *, integer *, complex *, integer *, logical *, 
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *,
+	    integer *, complex *, integer *, complex *, integer *, logical *,
 	    complex *, ftnlen, ftnlen, ftnlen);
     complex alpha;
-    extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, 
-	    integer *, complex *, complex *, integer *, complex *, integer *, 
-	    complex *, complex *, integer *, ftnlen, ftnlen), cmmch_(char *, 
-	    char *, integer *, integer *, integer *, complex *, complex *, 
-	    integer *, complex *, integer *, complex *, complex *, integer *, 
+    extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *,
+	    integer *, complex *, complex *, integer *, complex *, integer *,
+	    complex *, complex *, integer *, ftnlen, ftnlen), cmmch_(char *,
+	    char *, integer *, integer *, integer *, complex *, complex *,
+	    integer *, complex *, integer *, complex *, complex *, integer *,
 	    complex *, real *, complex *, integer *, real *, real *, logical *
 	    , integer *, logical *, ftnlen, ftnlen);
     logical isame[13], trana, tranb;
     integer nargs;
     logical reset;
-    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+    extern logical lceres_(char *, char *, integer *, integer *, complex *,
 	    complex *, integer *, ftnlen, ftnlen);
     char tranas[1], tranbs[1], transa[1], transb[1];
     real errmax;
@@ -915,7 +925,7 @@ static integer c_n1 = -1;
 		for (ica = 1; ica <= 3; ++ica) {
 		    *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1]
 			    ;
-		    trana = *(unsigned char *)transa == 'T' || *(unsigned 
+		    trana = *(unsigned char *)transa == 'T' || *(unsigned
 			    char *)transa == 'C';
 
 		    if (trana) {
@@ -943,9 +953,9 @@ static integer c_n1 = -1;
 			    ftnlen)1);
 
 		    for (icb = 1; icb <= 3; ++icb) {
-			*(unsigned char *)transb = *(unsigned char *)&ich[icb 
+			*(unsigned char *)transb = *(unsigned char *)&ich[icb
 				- 1];
-			tranb = *(unsigned char *)transb == 'T' || *(unsigned 
+			tranb = *(unsigned char *)transb == 'T' || *(unsigned
 				char *)transb == 'C';
 
 			if (tranb) {
@@ -1086,13 +1096,13 @@ static integer c_n1 = -1;
 				isame[2] = ms == m;
 				isame[3] = ns == n;
 				isame[4] = ks == k;
-				isame[5] = als.r == alpha.r && als.i == 
+				isame[5] = als.r == alpha.r && als.i ==
 					alpha.i;
 				isame[6] = lce_(&as[1], &aa[1], &laa);
 				isame[7] = ldas == lda;
 				isame[8] = lce_(&bs[1], &bb[1], &lbb);
 				isame[9] = ldbs == ldb;
-				isame[10] = bls.r == beta.r && bls.i == 
+				isame[10] = bls.r == beta.r && bls.i ==
 					beta.i;
 				if (null) {
 				    isame[11] = lce_(&cs[1], &cc[1], &lcc);
@@ -1130,9 +1140,9 @@ static integer c_n1 = -1;
 
 				    cmmch_(transa, transb, &m, &n, &k, &alpha,
 					     &a[a_offset], nmax, &b[b_offset],
-					     nmax, &beta, &c__[c_offset], 
+					     nmax, &beta, &c__[c_offset],
 					    nmax, &ct[1], &g[1], &cc[1], &ldc,
-					     eps, &err, fatal, nout, &c_true, 
+					     eps, &err, fatal, nout, &c_true,
 					    (ftnlen)1, (ftnlen)1);
 				    errmax = max(errmax,err);
 /*                             If got really bad answer, report and */
@@ -1214,10 +1224,10 @@ static integer c_n1 = -1;
 } /* cchk1_ */
 
 /* Subroutine */ int cchk2_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
 	nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex *
-	as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, 
+	as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc,
 	complex *cs, complex *ct, real *g, ftnlen sname_len)
 {
     /* Initialized data */
@@ -1243,7 +1253,7 @@ static integer c_n1 = -1;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4, i__5, i__6, i__7;
     alist al__1;
 
@@ -1252,7 +1262,7 @@ static integer c_n1 = -1;
 	    integer *, char *, ftnlen), e_wsfe(void), f_rew(alist *);
 
     /* Local variables */
-    integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, 
+    integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc,
 	    ldb, ldc;
     extern logical lce_(complex *, complex *, integer *);
     integer ics;
@@ -1265,26 +1275,26 @@ static integer c_n1 = -1;
     char side[1];
     logical conj, left, null;
     char uplo[1];
-    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
-	    integer *, complex *, integer *, complex *, integer *, logical *, 
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *,
+	    integer *, complex *, integer *, complex *, integer *, logical *,
 	    complex *, ftnlen, ftnlen, ftnlen);
     complex alpha;
-    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
-	    integer *, complex *, complex *, integer *, complex *, integer *, 
-	    complex *, complex *, integer *, complex *, real *, complex *, 
-	    integer *, real *, real *, logical *, integer *, logical *, 
-	    ftnlen, ftnlen), chemm_(char *, char *, integer *, integer *, 
-	    complex *, complex *, integer *, complex *, integer *, complex *, 
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *,
+	    integer *, complex *, complex *, integer *, complex *, integer *,
+	    complex *, complex *, integer *, complex *, real *, complex *,
+	    integer *, real *, real *, logical *, integer *, logical *,
+	    ftnlen, ftnlen), chemm_(char *, char *, integer *, integer *,
+	    complex *, complex *, integer *, complex *, integer *, complex *,
 	    complex *, integer *, ftnlen, ftnlen);
     logical isame[13];
     char sides[1];
     integer nargs;
     logical reset;
-    extern /* Subroutine */ int csymm_(char *, char *, integer *, integer *, 
-	    complex *, complex *, integer *, complex *, integer *, complex *, 
+    extern /* Subroutine */ int csymm_(char *, char *, integer *, integer *,
+	    complex *, complex *, integer *, complex *, integer *, complex *,
 	    complex *, integer *, ftnlen, ftnlen);
     char uplos[1];
-    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+    extern logical lceres_(char *, char *, integer *, integer *, complex *,
 	    complex *, integer *, ftnlen, ftnlen);
     real errmax;
 
@@ -1426,7 +1436,7 @@ static integer c_n1 = -1;
 
 /*                       Generate the matrix C. */
 
-			    cmake_("GE", " ", " ", &m, &n, &c__[c_offset], 
+			    cmake_("GE", " ", " ", &m, &n, &c__[c_offset],
 				    nmax, &cc[1], &ldc, &reset, &c_b1, (
 				    ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -1522,9 +1532,9 @@ static integer c_n1 = -1;
 
 /*                       See what data changed inside subroutines. */
 
-			    isame[0] = *(unsigned char *)sides == *(unsigned 
+			    isame[0] = *(unsigned char *)sides == *(unsigned
 				    char *)side;
-			    isame[1] = *(unsigned char *)uplos == *(unsigned 
+			    isame[1] = *(unsigned char *)uplos == *(unsigned
 				    char *)uplo;
 			    isame[2] = ms == m;
 			    isame[3] = ns == n;
@@ -1569,14 +1579,14 @@ static integer c_n1 = -1;
 
 				if (left) {
 				    cmmch_("N", "N", &m, &n, &m, &alpha, &a[
-					    a_offset], nmax, &b[b_offset], 
+					    a_offset], nmax, &b[b_offset],
 					    nmax, &beta, &c__[c_offset], nmax,
 					     &ct[1], &g[1], &cc[1], &ldc, eps,
 					     &err, fatal, nout, &c_true, (
 					    ftnlen)1, (ftnlen)1);
 				} else {
 				    cmmch_("N", "N", &m, &n, &n, &alpha, &b[
-					    b_offset], nmax, &a[a_offset], 
+					    b_offset], nmax, &a[a_offset],
 					    nmax, &beta, &c__[c_offset], nmax,
 					     &ct[1], &g[1], &cc[1], &ldc, eps,
 					     &err, fatal, nout, &c_true, (
@@ -1657,9 +1667,9 @@ static integer c_n1 = -1;
 } /* cchk2_ */
 
 /* Subroutine */ int cchk3_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
-	nmax, complex *a, complex *aa, complex *as, complex *b, complex *bb, 
+	nmax, complex *a, complex *aa, complex *as, complex *b, complex *bb,
 	complex *bs, complex *ct, real *g, complex *c__, ftnlen sname_len)
 {
     /* Initialized data */
@@ -1686,7 +1696,7 @@ static integer c_n1 = -1;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4, i__5, i__6, i__7;
     complex q__1;
     alist al__1;
@@ -1708,27 +1718,27 @@ static integer c_n1 = -1;
     char side[1];
     logical left, null;
     char uplo[1];
-    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
-	    integer *, complex *, integer *, complex *, integer *, logical *, 
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *,
+	    integer *, complex *, integer *, complex *, integer *, logical *,
 	    complex *, ftnlen, ftnlen, ftnlen);
     complex alpha;
     char diags[1];
-    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
-	    integer *, complex *, complex *, integer *, complex *, integer *, 
-	    complex *, complex *, integer *, complex *, real *, complex *, 
-	    integer *, real *, real *, logical *, integer *, logical *, 
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *,
+	    integer *, complex *, complex *, integer *, complex *, integer *,
+	    complex *, complex *, integer *, complex *, real *, complex *,
+	    integer *, real *, real *, logical *, integer *, logical *,
 	    ftnlen, ftnlen);
     logical isame[13];
     char sides[1];
     integer nargs;
     logical reset;
-    extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, 
-	    integer *, integer *, complex *, complex *, integer *, complex *, 
+    extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *,
+	    integer *, integer *, complex *, complex *, integer *, complex *,
 	    integer *, ftnlen, ftnlen, ftnlen, ftnlen), ctrsm_(char *, char *,
-	     char *, char *, integer *, integer *, complex *, complex *, 
+	     char *, char *, integer *, integer *, complex *, complex *,
 	    integer *, complex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen);
     char uplos[1];
-    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+    extern logical lceres_(char *, char *, integer *, integer *, complex *,
 	    complex *, integer *, ftnlen, ftnlen);
     char tranas[1], transa[1];
     real errmax;
@@ -1867,7 +1877,7 @@ static integer c_n1 = -1;
 
 /*                          Generate the matrix B. */
 
-				cmake_("GE", " ", " ", &m, &n, &b[b_offset], 
+				cmake_("GE", " ", " ", &m, &n, &b[b_offset],
 					nmax, &bb[1], &ldb, &reset, &c_b1, (
 					ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -1939,7 +1949,7 @@ static integer c_n1 = -1;
 				    }
 				    ctrmm_(side, uplo, transa, diag, &m, &n, &
 					    alpha, &aa[1], &lda, &bb[1], &ldb,
-					     (ftnlen)1, (ftnlen)1, (ftnlen)1, 
+					     (ftnlen)1, (ftnlen)1, (ftnlen)1,
 					    (ftnlen)1);
 				} else if (s_cmp(sname + 3, "SM", (ftnlen)2, (
 					ftnlen)2) == 0) {
@@ -1972,7 +1982,7 @@ static integer c_n1 = -1;
 				    }
 				    ctrsm_(side, uplo, transa, diag, &m, &n, &
 					    alpha, &aa[1], &lda, &bb[1], &ldb,
-					     (ftnlen)1, (ftnlen)1, (ftnlen)1, 
+					     (ftnlen)1, (ftnlen)1, (ftnlen)1,
 					    (ftnlen)1);
 				}
 
@@ -1998,7 +2008,7 @@ static integer c_n1 = -1;
 					unsigned char *)diag;
 				isame[4] = ms == m;
 				isame[5] = ns == n;
-				isame[6] = als.r == alpha.r && als.i == 
+				isame[6] = als.r == alpha.r && als.i ==
 					alpha.i;
 				isame[7] = lce_(&as[1], &aa[1], &laa);
 				isame[8] = ldas == lda;
@@ -2042,18 +2052,18 @@ static integer c_n1 = -1;
 					    cmmch_(transa, "N", &m, &n, &m, &
 						    alpha, &a[a_offset], nmax,
 						     &b[b_offset], nmax, &
-						    c_b1, &c__[c_offset], 
+						    c_b1, &c__[c_offset],
 						    nmax, &ct[1], &g[1], &bb[
-						    1], &ldb, eps, &err, 
+						    1], &ldb, eps, &err,
 						    fatal, nout, &c_true, (
 						    ftnlen)1, (ftnlen)1);
 					} else {
 					    cmmch_("N", transa, &m, &n, &n, &
 						    alpha, &b[b_offset], nmax,
 						     &a[a_offset], nmax, &
-						    c_b1, &c__[c_offset], 
+						    c_b1, &c__[c_offset],
 						    nmax, &ct[1], &g[1], &bb[
-						    1], &ldb, eps, &err, 
+						    1], &ldb, eps, &err,
 						    fatal, nout, &c_true, (
 						    ftnlen)1, (ftnlen)1);
 					}
@@ -2066,14 +2076,14 @@ static integer c_n1 = -1;
 					i__4 = n;
 					for (j = 1; j <= i__4; ++j) {
 					    i__5 = m;
-					    for (i__ = 1; i__ <= i__5; ++i__) 
+					    for (i__ = 1; i__ <= i__5; ++i__)
 						    {
 			  i__6 = i__ + j * c_dim1;
 			  i__7 = i__ + (j - 1) * ldb;
 			  c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i;
 			  i__6 = i__ + (j - 1) * ldb;
 			  i__7 = i__ + j * b_dim1;
-			  q__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, 
+			  q__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i,
 				  q__1.i = alpha.r * b[i__7].i + alpha.i * b[
 				  i__7].r;
 			  bb[i__6].r = q__1.r, bb[i__6].i = q__1.i;
@@ -2084,20 +2094,20 @@ static integer c_n1 = -1;
 
 					if (left) {
 					    cmmch_(transa, "N", &m, &n, &m, &
-						    c_b2, &a[a_offset], nmax, 
+						    c_b2, &a[a_offset], nmax,
 						    &c__[c_offset], nmax, &
-						    c_b1, &b[b_offset], nmax, 
+						    c_b1, &b[b_offset], nmax,
 						    &ct[1], &g[1], &bb[1], &
-						    ldb, eps, &err, fatal, 
+						    ldb, eps, &err, fatal,
 						    nout, &c_false, (ftnlen)1,
 						     (ftnlen)1);
 					} else {
 					    cmmch_("N", transa, &m, &n, &n, &
-						    c_b2, &c__[c_offset], 
-						    nmax, &a[a_offset], nmax, 
+						    c_b2, &c__[c_offset],
+						    nmax, &a[a_offset], nmax,
 						    &c_b1, &b[b_offset], nmax,
 						     &ct[1], &g[1], &bb[1], &
-						    ldb, eps, &err, fatal, 
+						    ldb, eps, &err, fatal,
 						    nout, &c_false, (ftnlen)1,
 						     (ftnlen)1);
 					}
@@ -2179,10 +2189,10 @@ static integer c_n1 = -1;
 } /* cchk3_ */
 
 /* Subroutine */ int cchk4_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
 	nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex *
-	as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, 
+	as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc,
 	complex *cs, complex *ct, real *g, ftnlen sname_len)
 {
     /* Initialized data */
@@ -2213,7 +2223,7 @@ static integer c_n1 = -1;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4, i__5, i__6, i__7;
     complex q__1;
     alist al__1;
@@ -2236,16 +2246,16 @@ static integer c_n1 = -1;
     real rals;
     logical tran, null;
     char uplo[1];
-    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
-	    integer *, complex *, integer *, complex *, integer *, logical *, 
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *,
+	    integer *, complex *, integer *, complex *, integer *, logical *,
 	    complex *, ftnlen, ftnlen, ftnlen);
     complex alpha;
-    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
-	    integer *, complex *, complex *, integer *, complex *, integer *, 
-	    complex *, complex *, integer *, complex *, real *, complex *, 
-	    integer *, real *, real *, logical *, integer *, logical *, 
-	    ftnlen, ftnlen), cherk_(char *, char *, integer *, integer *, 
-	    real *, complex *, integer *, real *, complex *, integer *, 
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *,
+	    integer *, complex *, complex *, integer *, complex *, integer *,
+	    complex *, complex *, integer *, complex *, real *, complex *,
+	    integer *, real *, real *, logical *, integer *, logical *,
+	    ftnlen, ftnlen), cherk_(char *, char *, integer *, integer *,
+	    real *, complex *, integer *, real *, complex *, integer *,
 	    ftnlen, ftnlen);
     real rbeta;
     logical isame[13];
@@ -2254,12 +2264,12 @@ static integer c_n1 = -1;
     logical reset;
     char trans[1];
     logical upper;
-    extern /* Subroutine */ int csyrk_(char *, char *, integer *, integer *, 
-	    complex *, complex *, integer *, complex *, complex *, integer *, 
+    extern /* Subroutine */ int csyrk_(char *, char *, integer *, integer *,
+	    complex *, complex *, integer *, complex *, complex *, integer *,
 	    ftnlen, ftnlen);
     char uplos[1];
     real ralpha;
-    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+    extern logical lceres_(char *, char *, integer *, integer *, complex *,
 	    complex *, integer *, ftnlen, ftnlen);
     real errmax;
     char transs[1], transt[1];
@@ -2402,7 +2412,7 @@ static integer c_n1 = -1;
 			    }
 			    null = n <= 0;
 			    if (conj) {
-				null = null || (k <= 0 || ralpha == 0.f) && 
+				null = null || (k <= 0 || ralpha == 0.f) &&
 					rbeta == 1.f;
 			    }
 
@@ -2481,7 +2491,7 @@ static integer c_n1 = -1;
 				    f_rew(&al__1);
 				}
 				cherk_(uplo, trans, &n, &k, &ralpha, &aa[1], &
-					lda, &rbeta, &cc[1], &ldc, (ftnlen)1, 
+					lda, &rbeta, &cc[1], &ldc, (ftnlen)1,
 					(ftnlen)1);
 			    } else {
 				if (*trace) {
@@ -2528,16 +2538,16 @@ static integer c_n1 = -1;
 
 /*                       See what data changed inside subroutines. */
 
-			    isame[0] = *(unsigned char *)uplos == *(unsigned 
+			    isame[0] = *(unsigned char *)uplos == *(unsigned
 				    char *)uplo;
-			    isame[1] = *(unsigned char *)transs == *(unsigned 
+			    isame[1] = *(unsigned char *)transs == *(unsigned
 				    char *)trans;
 			    isame[2] = ns == n;
 			    isame[3] = ks == k;
 			    if (conj) {
 				isame[4] = rals == ralpha;
 			    } else {
-				isame[4] = als.r == alpha.r && als.i == 
+				isame[4] = als.r == alpha.r && als.i ==
 					alpha.i;
 			    }
 			    isame[5] = lce_(&as[1], &aa[1], &laa);
@@ -2545,7 +2555,7 @@ static integer c_n1 = -1;
 			    if (conj) {
 				isame[7] = rbets == rbeta;
 			    } else {
-				isame[7] = bets.r == beta.r && bets.i == 
+				isame[7] = bets.r == beta.r && bets.i ==
 					beta.i;
 			    }
 			    if (null) {
@@ -2599,19 +2609,19 @@ static integer c_n1 = -1;
 				    }
 				    if (tran) {
 					cmmch_(transt, "N", &lj, &c__1, &k, &
-						alpha, &a[jj * a_dim1 + 1], 
-						nmax, &a[j * a_dim1 + 1], 
-						nmax, &beta, &c__[jj + j * 
-						c_dim1], nmax, &ct[1], &g[1], 
-						&cc[jc], &ldc, eps, &err, 
+						alpha, &a[jj * a_dim1 + 1],
+						nmax, &a[j * a_dim1 + 1],
+						nmax, &beta, &c__[jj + j *
+						c_dim1], nmax, &ct[1], &g[1],
+						&cc[jc], &ldc, eps, &err,
 						fatal, nout, &c_true, (ftnlen)
 						1, (ftnlen)1);
 				    } else {
 					cmmch_("N", transt, &lj, &c__1, &k, &
-						alpha, &a[jj + a_dim1], nmax, 
+						alpha, &a[jj + a_dim1], nmax,
 						&a[j + a_dim1], nmax, &beta, &
 						c__[jj + j * c_dim1], nmax, &
-						ct[1], &g[1], &cc[jc], &ldc, 
+						ct[1], &g[1], &cc[jc], &ldc,
 						eps, &err, fatal, nout, &
 						c_true, (ftnlen)1, (ftnlen)1);
 				    }
@@ -2720,10 +2730,10 @@ static integer c_n1 = -1;
 } /* cchk4_ */
 
 /* Subroutine */ int cchk5_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
 	nbet, complex *bet, integer *nmax, complex *ab, complex *aa, complex *
-	as, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, 
+	as, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs,
 	complex *ct, real *g, complex *w, ftnlen sname_len)
 {
     /* Initialized data */
@@ -2778,14 +2788,14 @@ static integer c_n1 = -1;
     complex bets;
     logical tran, null;
     char uplo[1];
-    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
-	    integer *, complex *, integer *, complex *, integer *, logical *, 
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *,
+	    integer *, complex *, integer *, complex *, integer *, logical *,
 	    complex *, ftnlen, ftnlen, ftnlen);
     complex alpha;
-    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
-	    integer *, complex *, complex *, integer *, complex *, integer *, 
-	    complex *, complex *, integer *, complex *, real *, complex *, 
-	    integer *, real *, real *, logical *, integer *, logical *, 
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *,
+	    integer *, complex *, complex *, integer *, complex *, integer *,
+	    complex *, complex *, integer *, complex *, real *, complex *,
+	    integer *, real *, real *, logical *, integer *, logical *,
 	    ftnlen, ftnlen);
     real rbeta;
     logical isame[13];
@@ -2795,12 +2805,12 @@ static integer c_n1 = -1;
     char trans[1];
     logical upper;
     char uplos[1];
-    extern /* Subroutine */ int cher2k_(char *, char *, integer *, integer *, 
-	    complex *, complex *, integer *, complex *, integer *, real *, 
-	    complex *, integer *, ftnlen, ftnlen), csyr2k_(char *, char *, 
-	    integer *, integer *, complex *, complex *, integer *, complex *, 
+    extern /* Subroutine */ int cher2k_(char *, char *, integer *, integer *,
+	    complex *, complex *, integer *, complex *, integer *, real *,
+	    complex *, integer *, ftnlen, ftnlen), csyr2k_(char *, char *,
+	    integer *, integer *, complex *, complex *, integer *, complex *,
 	    integer *, complex *, complex *, integer *, ftnlen, ftnlen);
-    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+    extern logical lceres_(char *, char *, integer *, integer *, complex *,
 	    complex *, integer *, ftnlen, ftnlen);
     real errmax;
     char transs[1], transt[1];
@@ -2957,7 +2967,7 @@ static integer c_n1 = -1;
 			    }
 			    null = n <= 0;
 			    if (conj) {
-				null = null || (k <= 0 || alpha.r == 0.f && 
+				null = null || (k <= 0 || alpha.r == 0.f &&
 					alpha.i == 0.f) && rbeta == 1.f;
 			    }
 
@@ -3092,9 +3102,9 @@ static integer c_n1 = -1;
 
 /*                       See what data changed inside subroutines. */
 
-			    isame[0] = *(unsigned char *)uplos == *(unsigned 
+			    isame[0] = *(unsigned char *)uplos == *(unsigned
 				    char *)uplo;
-			    isame[1] = *(unsigned char *)transs == *(unsigned 
+			    isame[1] = *(unsigned char *)transs == *(unsigned
 				    char *)trans;
 			    isame[2] = ns == n;
 			    isame[3] = ks == k;
@@ -3106,7 +3116,7 @@ static integer c_n1 = -1;
 			    if (conj) {
 				isame[9] = rbets == rbeta;
 			    } else {
-				isame[9] = bets.r == beta.r && bets.i == 
+				isame[9] = bets.r == beta.r && bets.i ==
 					beta.i;
 			    }
 			    if (null) {
@@ -3162,20 +3172,20 @@ static integer c_n1 = -1;
 					i__6 = k;
 					for (i__ = 1; i__ <= i__6; ++i__) {
 					    i__7 = i__;
-					    i__8 = (j - 1 << 1) * *nmax + k + 
+					    i__8 = (j - 1 << 1) * *nmax + k +
 						    i__;
-					    q__1.r = alpha.r * ab[i__8].r - 
-						    alpha.i * ab[i__8].i, 
+					    q__1.r = alpha.r * ab[i__8].r -
+						    alpha.i * ab[i__8].i,
 						    q__1.i = alpha.r * ab[
 						    i__8].i + alpha.i * ab[
 						    i__8].r;
-					    w[i__7].r = q__1.r, w[i__7].i = 
+					    w[i__7].r = q__1.r, w[i__7].i =
 						    q__1.i;
 					    if (conj) {
 			  i__7 = k + i__;
 			  r_cnjg(&q__2, &alpha);
 			  i__8 = (j - 1 << 1) * *nmax + i__;
-			  q__1.r = q__2.r * ab[i__8].r - q__2.i * ab[i__8].i, 
+			  q__1.r = q__2.r * ab[i__8].r - q__2.i * ab[i__8].i,
 				  q__1.i = q__2.r * ab[i__8].i + q__2.i * ab[
 				  i__8].r;
 			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
@@ -3183,7 +3193,7 @@ static integer c_n1 = -1;
 			  i__7 = k + i__;
 			  i__8 = (j - 1 << 1) * *nmax + i__;
 			  q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
-				  .i, q__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  .i, q__1.i = alpha.r * ab[i__8].i + alpha.i
 				  * ab[i__8].r;
 			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
 					    }
@@ -3194,9 +3204,9 @@ static integer c_n1 = -1;
 					i__8 = *nmax << 1;
 					cmmch_(transt, "N", &lj, &c__1, &i__6,
 						 &c_b2, &ab[jjab], &i__7, &w[
-						1], &i__8, &beta, &c__[jj + j 
+						1], &i__8, &beta, &c__[jj + j
 						* c_dim1], nmax, &ct[1], &g[1]
-						, &cc[jc], &ldc, eps, &err, 
+						, &cc[jc], &ldc, eps, &err,
 						fatal, nout, &c_true, (ftnlen)
 						1, (ftnlen)1);
 				    } else {
@@ -3205,14 +3215,14 @@ static integer c_n1 = -1;
 					    if (conj) {
 			  i__7 = i__;
 			  r_cnjg(&q__2, &ab[(k + i__ - 1) * *nmax + j]);
-			  q__1.r = alpha.r * q__2.r - alpha.i * q__2.i, 
-				  q__1.i = alpha.r * q__2.i + alpha.i * 
+			  q__1.r = alpha.r * q__2.r - alpha.i * q__2.i,
+				  q__1.i = alpha.r * q__2.i + alpha.i *
 				  q__2.r;
 			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
 			  i__7 = k + i__;
 			  i__8 = (i__ - 1) * *nmax + j;
 			  q__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
-				  .i, q__2.i = alpha.r * ab[i__8].i + alpha.i 
+				  .i, q__2.i = alpha.r * ab[i__8].i + alpha.i
 				  * ab[i__8].r;
 			  r_cnjg(&q__1, &q__2);
 			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
@@ -3220,13 +3230,13 @@ static integer c_n1 = -1;
 			  i__7 = i__;
 			  i__8 = (k + i__ - 1) * *nmax + j;
 			  q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
-				  .i, q__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  .i, q__1.i = alpha.r * ab[i__8].i + alpha.i
 				  * ab[i__8].r;
 			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
 			  i__7 = k + i__;
 			  i__8 = (i__ - 1) * *nmax + j;
 			  q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
-				  .i, q__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  .i, q__1.i = alpha.r * ab[i__8].i + alpha.i
 				  * ab[i__8].r;
 			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
 					    }
@@ -3236,9 +3246,9 @@ static integer c_n1 = -1;
 					i__7 = *nmax << 1;
 					cmmch_("N", "N", &lj, &c__1, &i__6, &
 						c_b2, &ab[jj], nmax, &w[1], &
-						i__7, &beta, &c__[jj + j * 
-						c_dim1], nmax, &ct[1], &g[1], 
-						&cc[jc], &ldc, eps, &err, 
+						i__7, &beta, &c__[jj + j *
+						c_dim1], nmax, &ct[1], &g[1],
+						&cc[jc], &ldc, eps, &err,
 						fatal, nout, &c_true, (ftnlen)
 						1, (ftnlen)1);
 				    }
@@ -3351,7 +3361,7 @@ static integer c_n1 = -1;
 
 } /* cchk5_ */
 
-/* Subroutine */ int cchke_(integer *isnum, char *srnamt, integer *nout, 
+/* Subroutine */ int cchke_(integer *isnum, char *srnamt, integer *nout,
 	ftnlen srnamt_len)
 {
     /* Format strings */
@@ -3364,34 +3374,34 @@ static integer c_n1 = -1;
     integer s_wsfe(cilist *), do_fio(integer *, char *, ftnlen), e_wsfe(void);
 
     /* Local variables */
-    complex a[2]	/* was [2][1] */, b[2]	/* was [2][1] */, c__[2]	
+    complex a[2]	/* was [2][1] */, b[2]	/* was [2][1] */, c__[2]
 	    /* was [2][1] */, beta, alpha;
-    extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, 
-	    integer *, complex *, complex *, integer *, complex *, integer *, 
-	    complex *, complex *, integer *, ftnlen, ftnlen), chemm_(char *, 
-	    char *, integer *, integer *, complex *, complex *, integer *, 
-	    complex *, integer *, complex *, complex *, integer *, ftnlen, 
-	    ftnlen), cherk_(char *, char *, integer *, integer *, real *, 
-	    complex *, integer *, real *, complex *, integer *, ftnlen, 
+    extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *,
+	    integer *, complex *, complex *, integer *, complex *, integer *,
+	    complex *, complex *, integer *, ftnlen, ftnlen), chemm_(char *,
+	    char *, integer *, integer *, complex *, complex *, integer *,
+	    complex *, integer *, complex *, complex *, integer *, ftnlen,
+	    ftnlen), cherk_(char *, char *, integer *, integer *, real *,
+	    complex *, integer *, real *, complex *, integer *, ftnlen,
 	    ftnlen);
     real rbeta;
-    extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, 
-	    integer *, integer *, complex *, complex *, integer *, complex *, 
+    extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *,
+	    integer *, integer *, complex *, complex *, integer *, complex *,
 	    integer *, ftnlen, ftnlen, ftnlen, ftnlen), csymm_(char *, char *,
 	     integer *, integer *, complex *, complex *, integer *, complex *,
-	     integer *, complex *, complex *, integer *, ftnlen, ftnlen), 
-	    ctrsm_(char *, char *, char *, char *, integer *, integer *, 
-	    complex *, complex *, integer *, complex *, integer *, ftnlen, 
-	    ftnlen, ftnlen, ftnlen), csyrk_(char *, char *, integer *, 
-	    integer *, complex *, complex *, integer *, complex *, complex *, 
-	    integer *, ftnlen, ftnlen), cher2k_(char *, char *, integer *, 
-	    integer *, complex *, complex *, integer *, complex *, integer *, 
-	    real *, complex *, integer *, ftnlen, ftnlen), csyr2k_(char *, 
-	    char *, integer *, integer *, complex *, complex *, integer *, 
-	    complex *, integer *, complex *, complex *, integer *, ftnlen, 
+	     integer *, complex *, complex *, integer *, ftnlen, ftnlen),
+	    ctrsm_(char *, char *, char *, char *, integer *, integer *,
+	    complex *, complex *, integer *, complex *, integer *, ftnlen,
+	    ftnlen, ftnlen, ftnlen), csyrk_(char *, char *, integer *,
+	    integer *, complex *, complex *, integer *, complex *, complex *,
+	    integer *, ftnlen, ftnlen), cher2k_(char *, char *, integer *,
+	    integer *, complex *, complex *, integer *, complex *, integer *,
+	    real *, complex *, integer *, ftnlen, ftnlen), csyr2k_(char *,
+	    char *, integer *, integer *, complex *, complex *, integer *,
+	    complex *, integer *, complex *, complex *, integer *, ftnlen,
 	    ftnlen);
     real ralpha;
-    extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical 
+    extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical
 	    *, logical *, ftnlen);
 
     /* Fortran I/O blocks */
@@ -3451,302 +3461,302 @@ static integer c_n1 = -1;
     }
 L10:
     infoc_1.infot = 1;
-    cgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 1;
-    cgemm_("/", "C", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("/", "C", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 1;
-    cgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 2;
-    cgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 2;
-    cgemm_("C", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 2;
-    cgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    cgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    cgemm_("N", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("N", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    cgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    cgemm_("C", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    cgemm_("C", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    cgemm_("C", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    cgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    cgemm_("T", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    cgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    cgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    cgemm_("N", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("N", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    cgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    cgemm_("C", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    cgemm_("C", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    cgemm_("C", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    cgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    cgemm_("T", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    cgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    cgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    cgemm_("N", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("N", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    cgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    cgemm_("C", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    cgemm_("C", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    cgemm_("C", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    cgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    cgemm_("T", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    cgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    cgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__2, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    cgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__2, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    cgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__2, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    cgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, 
+    cgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    cgemm_("C", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    cgemm_("C", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    cgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, 
+    cgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    cgemm_("T", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    cgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    cgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    cgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, 
+    cgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    cgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, 
+    cgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    cgemm_("N", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("N", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    cgemm_("C", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    cgemm_("T", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    cgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    cgemm_("C", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    cgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    cgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, 
+    cgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    cgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, 
+    cgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    cgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, 
+    cgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    cgemm_("C", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    cgemm_("C", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    cgemm_("C", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("C", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    cgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    cgemm_("T", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    cgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    cgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
@@ -4926,9 +4936,9 @@ static integer c_n1 = -1;
 
 } /* cchke_ */
 
-/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m, 
-	integer *n, complex *a, integer *nmax, complex *aa, integer *lda, 
-	logical *reset, complex *transl, ftnlen type_len, ftnlen uplo_len, 
+/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m,
+	integer *n, complex *a, integer *nmax, complex *aa, integer *lda,
+	logical *reset, complex *transl, ftnlen type_len, ftnlen uplo_len,
 	ftnlen diag_len)
 {
     /* System generated locals */
@@ -5114,10 +5124,10 @@ static integer c_n1 = -1;
 } /* cmake_ */
 
 /* Subroutine */ int cmmch_(char *transa, char *transb, integer *m, integer *
-	n, integer *kk, complex *alpha, complex *a, integer *lda, complex *b, 
-	integer *ldb, complex *beta, complex *c__, integer *ldc, complex *ct, 
+	n, integer *kk, complex *alpha, complex *a, integer *lda, complex *b,
+	integer *ldb, complex *beta, complex *c__, integer *ldc, complex *ct,
 	real *g, complex *cc, integer *ldcc, real *eps, real *err, logical *
-	fatal, integer *nout, logical *mv, ftnlen transa_len, ftnlen 
+	fatal, integer *nout, logical *mv, ftnlen transa_len, ftnlen
 	transb_len)
 {
     /* Format strings */
@@ -5131,7 +5141,7 @@ static integer c_n1 = -1;
 	    " \002,i3)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1,
 	    cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7;
     real r__1, r__2, r__3, r__4, r__5, r__6;
     complex q__1, q__2, q__3, q__4;
@@ -5190,9 +5200,9 @@ static integer c_n1 = -1;
     cc -= cc_offset;
 
     /* Function Body */
-    trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 
+    trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa ==
 	    'C';
-    tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 
+    tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb ==
 	    'C';
     ctrana = *(unsigned char *)transa == 'C';
     ctranb = *(unsigned char *)transb == 'C';
@@ -5220,17 +5230,17 @@ static integer c_n1 = -1;
 		    i__5 = i__;
 		    i__6 = i__ + k * a_dim1;
 		    i__7 = k + j * b_dim1;
-		    q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, 
+		    q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i,
 			    q__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[
 			    i__7].r;
-		    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + 
+		    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i +
 			    q__2.i;
 		    ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
 		    i__4 = i__ + k * a_dim1;
 		    i__5 = k + j * b_dim1;
 		    g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(
 			    &a[i__ + k * a_dim1]), abs(r__2))) * ((r__3 = b[
-			    i__5].r, abs(r__3)) + (r__4 = r_imag(&b[k + j * 
+			    i__5].r, abs(r__3)) + (r__4 = r_imag(&b[k + j *
 			    b_dim1]), abs(r__4)));
 /* L20: */
 		}
@@ -5246,15 +5256,15 @@ static integer c_n1 = -1;
 			i__5 = i__;
 			r_cnjg(&q__3, &a[k + i__ * a_dim1]);
 			i__6 = k + j * b_dim1;
-			q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, 
+			q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i,
 				q__2.i = q__3.r * b[i__6].i + q__3.i * b[i__6]
 				.r;
-			q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + 
+			q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i +
 				q__2.i;
 			ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
 			i__4 = k + i__ * a_dim1;
 			i__5 = k + j * b_dim1;
-			g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = 
+			g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 =
 				r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * ((
 				r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(
 				&b[k + j * b_dim1]), abs(r__4)));
@@ -5274,12 +5284,12 @@ static integer c_n1 = -1;
 			q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7]
 				.i, q__2.i = a[i__6].r * b[i__7].i + a[i__6]
 				.i * b[i__7].r;
-			q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + 
+			q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i +
 				q__2.i;
 			ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
 			i__4 = k + i__ * a_dim1;
 			i__5 = k + j * b_dim1;
-			g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = 
+			g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 =
 				r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * ((
 				r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(
 				&b[k + j * b_dim1]), abs(r__4)));
@@ -5298,15 +5308,15 @@ static integer c_n1 = -1;
 			i__5 = i__;
 			i__6 = i__ + k * a_dim1;
 			r_cnjg(&q__3, &b[j + k * b_dim1]);
-			q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, 
-				q__2.i = a[i__6].r * q__3.i + a[i__6].i * 
+			q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i,
+				q__2.i = a[i__6].r * q__3.i + a[i__6].i *
 				q__3.r;
-			q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + 
+			q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i +
 				q__2.i;
 			ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
 			i__4 = i__ + k * a_dim1;
 			i__5 = j + k * b_dim1;
-			g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = 
+			g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 =
 				r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * ((
 				r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(
 				&b[j + k * b_dim1]), abs(r__4)));
@@ -5326,12 +5336,12 @@ static integer c_n1 = -1;
 			q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7]
 				.i, q__2.i = a[i__6].r * b[i__7].i + a[i__6]
 				.i * b[i__7].r;
-			q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + 
+			q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i +
 				q__2.i;
 			ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
 			i__4 = i__ + k * a_dim1;
 			i__5 = j + k * b_dim1;
-			g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = 
+			g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 =
 				r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * ((
 				r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(
 				&b[j + k * b_dim1]), abs(r__4)));
@@ -5351,17 +5361,17 @@ static integer c_n1 = -1;
 			    i__5 = i__;
 			    r_cnjg(&q__3, &a[k + i__ * a_dim1]);
 			    r_cnjg(&q__4, &b[j + k * b_dim1]);
-			    q__2.r = q__3.r * q__4.r - q__3.i * q__4.i, 
-				    q__2.i = q__3.r * q__4.i + q__3.i * 
+			    q__2.r = q__3.r * q__4.r - q__3.i * q__4.i,
+				    q__2.i = q__3.r * q__4.i + q__3.i *
 				    q__4.r;
-			    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i 
+			    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i
 				    + q__2.i;
 			    ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
 			    i__4 = k + i__ * a_dim1;
 			    i__5 = j + k * b_dim1;
 			    g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 =
 				     r_imag(&a[k + i__ * a_dim1]), abs(r__2)))
-				     * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 
+				     * ((r__3 = b[i__5].r, abs(r__3)) + (r__4
 				    = r_imag(&b[j + k * b_dim1]), abs(r__4)));
 /* L120: */
 			}
@@ -5376,17 +5386,17 @@ static integer c_n1 = -1;
 			    i__5 = i__;
 			    r_cnjg(&q__3, &a[k + i__ * a_dim1]);
 			    i__6 = j + k * b_dim1;
-			    q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, 
+			    q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i,
 				    q__2.i = q__3.r * b[i__6].i + q__3.i * b[
 				    i__6].r;
-			    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i 
+			    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i
 				    + q__2.i;
 			    ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
 			    i__4 = k + i__ * a_dim1;
 			    i__5 = j + k * b_dim1;
 			    g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 =
 				     r_imag(&a[k + i__ * a_dim1]), abs(r__2)))
-				     * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 
+				     * ((r__3 = b[i__5].r, abs(r__3)) + (r__4
 				    = r_imag(&b[j + k * b_dim1]), abs(r__4)));
 /* L140: */
 			}
@@ -5403,17 +5413,17 @@ static integer c_n1 = -1;
 			    i__5 = i__;
 			    i__6 = k + i__ * a_dim1;
 			    r_cnjg(&q__3, &b[j + k * b_dim1]);
-			    q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, 
-				    q__2.i = a[i__6].r * q__3.i + a[i__6].i * 
+			    q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i,
+				    q__2.i = a[i__6].r * q__3.i + a[i__6].i *
 				    q__3.r;
-			    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i 
+			    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i
 				    + q__2.i;
 			    ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
 			    i__4 = k + i__ * a_dim1;
 			    i__5 = j + k * b_dim1;
 			    g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 =
 				     r_imag(&a[k + i__ * a_dim1]), abs(r__2)))
-				     * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 
+				     * ((r__3 = b[i__5].r, abs(r__3)) + (r__4
 				    = r_imag(&b[j + k * b_dim1]), abs(r__4)));
 /* L160: */
 			}
@@ -5429,16 +5439,16 @@ static integer c_n1 = -1;
 			    i__6 = k + i__ * a_dim1;
 			    i__7 = j + k * b_dim1;
 			    q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[
-				    i__7].i, q__2.i = a[i__6].r * b[i__7].i + 
+				    i__7].i, q__2.i = a[i__6].r * b[i__7].i +
 				    a[i__6].i * b[i__7].r;
-			    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i 
+			    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i
 				    + q__2.i;
 			    ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
 			    i__4 = k + i__ * a_dim1;
 			    i__5 = j + k * b_dim1;
 			    g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 =
 				     r_imag(&a[k + i__ * a_dim1]), abs(r__2)))
-				     * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 
+				     * ((r__3 = b[i__5].r, abs(r__3)) + (r__4
 				    = r_imag(&b[j + k * b_dim1]), abs(r__4)));
 /* L180: */
 			}
@@ -5451,17 +5461,17 @@ static integer c_n1 = -1;
 	for (i__ = 1; i__ <= i__2; ++i__) {
 	    i__3 = i__;
 	    i__4 = i__;
-	    q__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, q__2.i = 
+	    q__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, q__2.i =
 		    alpha->r * ct[i__4].i + alpha->i * ct[i__4].r;
 	    i__5 = i__ + j * c_dim1;
-	    q__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, q__3.i = 
+	    q__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, q__3.i =
 		    beta->r * c__[i__5].i + beta->i * c__[i__5].r;
 	    q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
 	    ct[i__3].r = q__1.r, ct[i__3].i = q__1.i;
 	    i__3 = i__ + j * c_dim1;
-	    g[i__] = ((r__1 = alpha->r, abs(r__1)) + (r__2 = r_imag(alpha), 
+	    g[i__] = ((r__1 = alpha->r, abs(r__1)) + (r__2 = r_imag(alpha),
 		    abs(r__2))) * g[i__] + ((r__3 = beta->r, abs(r__3)) + (
-		    r__4 = r_imag(beta), abs(r__4))) * ((r__5 = c__[i__3].r, 
+		    r__4 = r_imag(beta), abs(r__4))) * ((r__5 = c__[i__3].r,
 		    abs(r__5)) + (r__6 = r_imag(&c__[i__ + j * c_dim1]), abs(
 		    r__6)));
 /* L200: */
@@ -5772,7 +5782,7 @@ real sdiff_(real *x, real *y)
 
 } /* sdiff_ */
 
-/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, 
+/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout,
 	logical *lerr, logical *ok, ftnlen srnamt_len)
 {
     /* Format strings */
diff --git a/blastest/src/dblat1.c b/blastest/src/dblat1.c
index 14665d844..e84867178 100644
--- a/blastest/src/dblat1.c
+++ b/blastest/src/dblat1.c
@@ -70,6 +70,11 @@ static real c_b81 = 0.f;
 /*  ===================================================================== */
 /* Main program */ int main(void)
 {
+#ifdef BLIS_ENABLE_HPX
+    char* program = "dblat1";
+    bli_thread_initialize_hpx( 1, &program );
+#endif
+
     /* Initialized data */
 
     static doublereal sfac = 9.765625e-4;
@@ -85,7 +90,7 @@ static real c_b81 = 0.f;
 
     /* Local variables */
     integer ic;
-    extern /* Subroutine */ int check0_(doublereal *), check1_(doublereal *), 
+    extern /* Subroutine */ int check0_(doublereal *), check1_(doublereal *),
 	    check2_(doublereal *), check3_(doublereal *), header_(void);
 
     /* Fortran I/O blocks */
@@ -124,11 +129,11 @@ static real c_b81 = 0.f;
 	combla_1.incy = 9999;
 	if (combla_1.icase == 3 || combla_1.icase == 11) {
 	    check0_(&sfac);
-	} else if (combla_1.icase == 7 || combla_1.icase == 8 || 
+	} else if (combla_1.icase == 7 || combla_1.icase == 8 ||
 		combla_1.icase == 9 || combla_1.icase == 10) {
 	    check1_(&sfac);
-	} else if (combla_1.icase == 1 || combla_1.icase == 2 || 
-		combla_1.icase == 5 || combla_1.icase == 6 || combla_1.icase 
+	} else if (combla_1.icase == 1 || combla_1.icase == 2 ||
+		combla_1.icase == 5 || combla_1.icase == 6 || combla_1.icase
 		== 12 || combla_1.icase == 13) {
 	    check2_(&sfac);
 	} else if (combla_1.icase == 4) {
@@ -143,7 +148,12 @@ static real c_b81 = 0.f;
     }
     s_stop("", (ftnlen)0);
 
-    return 0;
+#ifdef BLIS_ENABLE_HPX
+    return bli_thread_finalize_hpx();
+#else
+	// Return peacefully.
+	return 0;
+#endif
 } /* main */
 
 /* Subroutine */ int header_(void)
@@ -201,17 +211,17 @@ static real c_b81 = 0.f;
     static doublereal dc1[8] = { .6,.8,-.6,.8,.6,1.,0.,1. };
 
     /* Builtin functions */
-    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_wsle(void);
     /* Subroutine */ int s_stop(char *, ftnlen);
 
     /* Local variables */
     integer i__, k;
     doublereal sa, sb, sc, ss, dtemp[9];
-    extern /* Subroutine */ int drotg_(doublereal *, doublereal *, doublereal 
-	    *, doublereal *), stest_(integer *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *), stest1_(doublereal *, doublereal *, 
-	    doublereal *, doublereal *), drotmg_(doublereal *, doublereal *, 
+    extern /* Subroutine */ int drotg_(doublereal *, doublereal *, doublereal
+	    *, doublereal *), stest_(integer *, doublereal *, doublereal *,
+	    doublereal *, doublereal *), stest1_(doublereal *, doublereal *,
+	    doublereal *, doublereal *), drotmg_(doublereal *, doublereal *,
 	    doublereal *, doublereal *, doublereal *);
 
     /* Fortran I/O blocks */
@@ -319,7 +329,7 @@ static real c_b81 = 0.f;
     doublereal d__1;
 
     /* Builtin functions */
-    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_wsle(void);
     /* Subroutine */ int s_stop(char *, ftnlen);
 
@@ -328,12 +338,12 @@ static real c_b81 = 0.f;
     doublereal sx[8];
     integer np1, len;
     extern doublereal dnrm2_(integer *, doublereal *, integer *);
-    extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
 	    integer *);
     extern doublereal dasum_(integer *, doublereal *, integer *);
     doublereal stemp[1], strue[8];
-    extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *), itest1_(integer *, integer *), 
+    extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *,
+	    doublereal *, doublereal *), itest1_(integer *, integer *),
 	    stest1_(doublereal *, doublereal *, doublereal *, doublereal *);
     extern integer idamax_(integer *, doublereal *, integer *);
 
@@ -375,11 +385,11 @@ static real c_b81 = 0.f;
 		stest1_(&d__1, stemp, stemp, sfac);
 	    } else if (combla_1.icase == 9) {
 /*              .. DSCAL .. */
-		dscal_(&combla_1.n, &sa[(combla_1.incx - 1) * 5 + np1 - 1], 
+		dscal_(&combla_1.n, &sa[(combla_1.incx - 1) * 5 + np1 - 1],
 			sx, &combla_1.incx);
 		i__1 = len;
 		for (i__ = 1; i__ <= i__1; ++i__) {
-		    strue[i__ - 1] = dtrue5[i__ + (np1 + combla_1.incx * 5 << 
+		    strue[i__ - 1] = dtrue5[i__ + (np1 + combla_1.incx * 5 <<
 			    3) - 49];
 /* L40: */
 		}
@@ -446,71 +456,71 @@ static real c_b81 = 0.f;
 	    -3.,-4.,5.,0.,0.,2.,-3.,0.,1.,5.,2.,0.,-4. };
     static struct {
 	doublereal e_1[448];
-	} equiv_3 = {{ .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., 
-		.6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 
-		0., 0., 0., 0., 0., 0., -.8, 0., 0., 0., 0., 0., 0., -.9, 0., 
-		0., 0., 0., 0., 0., 3.5, 0., 0., 0., 0., 0., 0., .6, .1, 0., 
-		0., 0., 0., 0., -.8, 3.8, 0., 0., 0., 0., 0., -.9, 2.8, 0., 
+	} equiv_3 = {{ .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0.,
+		.6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6,
+		0., 0., 0., 0., 0., 0., -.8, 0., 0., 0., 0., 0., 0., -.9, 0.,
+		0., 0., 0., 0., 0., 3.5, 0., 0., 0., 0., 0., 0., .6, .1, 0.,
+		0., 0., 0., 0., -.8, 3.8, 0., 0., 0., 0., 0., -.9, 2.8, 0.,
 		0., 0., 0., 0., 3.5, -.4, 0., 0., 0., 0., 0., .6, .1, -.5, .8,
 		 0., 0., 0., -.8, 3.8, -2.2, -1.2, 0., 0., 0., -.9, 2.8, -1.4,
-		 -1.3, 0., 0., 0., 3.5, -.4, -2.2, 4.7, 0., 0., 0., .6, 0., 
-		0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 
-		0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 
-		0., 0., 0., -.8, 0., 0., 0., 0., 0., 0., -.9, 0., 0., 0., 0., 
-		0., 0., 3.5, 0., 0., 0., 0., 0., 0., .6, .1, -.5, 0., 0., 0., 
+		 -1.3, 0., 0., 0., 3.5, -.4, -2.2, 4.7, 0., 0., 0., .6, 0.,
+		0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0.,
+		0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0.,
+		0., 0., 0., -.8, 0., 0., 0., 0., 0., 0., -.9, 0., 0., 0., 0.,
+		0., 0., 3.5, 0., 0., 0., 0., 0., 0., .6, .1, -.5, 0., 0., 0.,
 		0., 0., .1, -3., 0., 0., 0., 0., -.3, .1, -2., 0., 0., 0., 0.,
-		 3.3, .1, -2., 0., 0., 0., 0., .6, .1, -.5, .8, .9, -.3, -.4, 
-		-2., .1, 1.4, .8, .6, -.3, -2.8, -1.8, .1, 1.3, .8, 0., -.3, 
-		-1.9, 3.8, .1, -3.1, .8, 4.8, -.3, -1.5, .6, 0., 0., 0., 0., 
-		0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 
-		0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., 
+		 3.3, .1, -2., 0., 0., 0., 0., .6, .1, -.5, .8, .9, -.3, -.4,
+		-2., .1, 1.4, .8, .6, -.3, -2.8, -1.8, .1, 1.3, .8, 0., -.3,
+		-1.9, 3.8, .1, -3.1, .8, 4.8, -.3, -1.5, .6, 0., 0., 0., 0.,
+		0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0.,
+		0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0.,
 		-.8, 0., 0., 0., 0., 0., 0., -.9, 0., 0., 0., 0., 0., 0., 3.5,
 		 0., 0., 0., 0., 0., 0., .6, .1, -.5, 0., 0., 0., 0., 4.8, .1,
-		 -3., 0., 0., 0., 0., 3.3, .1, -2., 0., 0., 0., 0., 2.1, .1, 
-		-2., 0., 0., 0., 0., .6, .1, -.5, .8, .9, -.3, -.4, -1.6, .1, 
-		-2.2, .8, 5.4, -.3, -2.8, -1.5, .1, -1.4, .8, 3.6, -.3, -1.9, 
+		 -3., 0., 0., 0., 0., 3.3, .1, -2., 0., 0., 0., 0., 2.1, .1,
+		-2., 0., 0., 0., 0., .6, .1, -.5, .8, .9, -.3, -.4, -1.6, .1,
+		-2.2, .8, 5.4, -.3, -2.8, -1.5, .1, -1.4, .8, 3.6, -.3, -1.9,
 		3.7, .1, -2.2, .8, 3.6, -.3, -1.5, .6, 0., 0., 0., 0., 0., 0.,
-		 .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 
-		0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., -.8, 0., 
-		0., 0., 0., 0., 0., -.9, 0., 0., 0., 0., 0., 0., 3.5, 0., 0., 
-		0., 0., 0., 0., .6, .1, 0., 0., 0., 0., 0., -.8, -1., 0., 0., 
+		 .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6,
+		0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., -.8, 0.,
+		0., 0., 0., 0., 0., -.9, 0., 0., 0., 0., 0., 0., 3.5, 0., 0.,
+		0., 0., 0., 0., .6, .1, 0., 0., 0., 0., 0., -.8, -1., 0., 0.,
 		0., 0., 0., -.9, -.8, 0., 0., 0., 0., 0., 3.5, .8, 0., 0., 0.,
 		 0., 0., .6, .1, -.5, .8, 0., 0., 0., -.8, -1., 1.4, -1.6, 0.,
-		 0., 0., -.9, -.8, 1.3, -1.6, 0., 0., 0., 3.5, .8, -3.1, 4.8, 
+		 0., 0., -.9, -.8, 1.3, -1.6, 0., 0., 0., 3.5, .8, -3.1, 4.8,
 		0., 0., 0. }};
 
     static struct {
 	doublereal e_1[448];
-	} equiv_7 = {{ .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., 
-		.5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 
-		0., 0., 0., 0., 0., 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 
+	} equiv_7 = {{ .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0.,
+		.5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5,
+		0., 0., 0., 0., 0., 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0.,
 		0., 0., 0., 0., 0., -2.6, 0., 0., 0., 0., 0., 0., .5, -.9, 0.,
-		 0., 0., 0., 0., .7, -4.8, 0., 0., 0., 0., 0., 1.7, -.7, 0., 
-		0., 0., 0., 0., -2.6, 3.5, 0., 0., 0., 0., 0., .5, -.9, .3, 
-		.7, 0., 0., 0., .7, -4.8, 3., 1.1, 0., 0., 0., 1.7, -.7, -.7, 
+		 0., 0., 0., 0., .7, -4.8, 0., 0., 0., 0., 0., 1.7, -.7, 0.,
+		0., 0., 0., 0., -2.6, 3.5, 0., 0., 0., 0., 0., .5, -.9, .3,
+		.7, 0., 0., 0., .7, -4.8, 3., 1.1, 0., 0., 0., 1.7, -.7, -.7,
 		2.3, 0., 0., 0., -2.6, 3.5, -.7, -3.6, 0., 0., 0., .5, 0., 0.,
-		 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 
-		0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 
-		0., 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0., 
+		 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0.,
+		0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0.,
+		0., 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0.,
 		0., -2.6, 0., 0., 0., 0., 0., 0., .5, -.9, .3, 0., 0., 0., 0.,
-		 4., -.9, -.3, 0., 0., 0., 0., -.5, -.9, 1.5, 0., 0., 0., 0., 
+		 4., -.9, -.3, 0., 0., 0., 0., -.5, -.9, 1.5, 0., 0., 0., 0.,
 		-1.5, -.9, -1.8, 0., 0., 0., 0., .5, -.9, .3, .7, -.6, .2, .8,
-		 3.7, -.9, -1.2, .7, -1.5, .2, 2.2, -.3, -.9, 2.1, .7, -1.6, 
-		.2, 2., -1.6, -.9, -2.1, .7, 2.9, .2, -3.8, .5, 0., 0., 0., 
-		0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 
-		0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 
-		0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0., 0., 
+		 3.7, -.9, -1.2, .7, -1.5, .2, 2.2, -.3, -.9, 2.1, .7, -1.6,
+		.2, 2., -1.6, -.9, -2.1, .7, 2.9, .2, -3.8, .5, 0., 0., 0.,
+		0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0.,
+		0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0.,
+		0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0., 0.,
 		-2.6, 0., 0., 0., 0., 0., 0., .5, -.9, 0., 0., 0., 0., 0., 4.,
-		 -6.3, 0., 0., 0., 0., 0., -.5, .3, 0., 0., 0., 0., 0., -1.5, 
-		3., 0., 0., 0., 0., 0., .5, -.9, .3, .7, 0., 0., 0., 3.7, 
-		-7.2, 3., 1.7, 0., 0., 0., -.3, .9, -.7, 1.9, 0., 0., 0., 
-		-1.6, 2.7, -.7, -3.4, 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., 
-		.5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 
-		0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .7, 0., 
+		 -6.3, 0., 0., 0., 0., 0., -.5, .3, 0., 0., 0., 0., 0., -1.5,
+		3., 0., 0., 0., 0., 0., .5, -.9, .3, .7, 0., 0., 0., 3.7,
+		-7.2, 3., 1.7, 0., 0., 0., -.3, .9, -.7, 1.9, 0., 0., 0.,
+		-1.6, 2.7, -.7, -3.4, 0., 0., 0., .5, 0., 0., 0., 0., 0., 0.,
+		.5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5,
+		0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .7, 0.,
 		0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0., 0., -2.6, 0., 0.,
-		 0., 0., 0., 0., .5, -.9, .3, 0., 0., 0., 0., .7, -.9, 1.2, 
+		 0., 0., 0., 0., .5, -.9, .3, 0., 0., 0., 0., .7, -.9, 1.2,
 		0., 0., 0., 0., 1.7, -.9, .5, 0., 0., 0., 0., -2.6, -.9, -1.3,
-		 0., 0., 0., 0., .5, -.9, .3, .7, -.6, .2, .8, .7, -.9, 1.2, 
+		 0., 0., 0., 0., .5, -.9, .3, .7, -.6, .2, .8, .7, -.9, 1.2,
 		.7, -1.5, .2, 1.6, 1.7, -.9, .5, .7, -1.6, .2, 2.4, -2.6, -.9,
 		 -1.3, .7, 2.9, .2, -4. }};
 
@@ -521,7 +531,7 @@ static real c_b81 = 0.f;
     doublereal d__1;
 
     /* Builtin functions */
-    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_wsle(void);
     /* Subroutine */ int s_stop(char *, ftnlen);
 
@@ -532,7 +542,7 @@ static real c_b81 = 0.f;
     doublereal sx[7], sy[7];
     integer kni;
     doublereal stx[7], sty[7];
-    extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, 
+    extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *,
 	    integer *);
     integer kpar, lenx, leny;
 #define dt19x ((doublereal *)&equiv_3)
@@ -547,16 +557,16 @@ static real c_b81 = 0.f;
 #define dt19yc ((doublereal *)&equiv_7 + 224)
 #define dt19yd ((doublereal *)&equiv_7 + 336)
     extern doublereal dsdot_(integer *, real *, integer *, real *, integer *);
-    extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
 	    doublereal *, integer *);
     integer ksize;
-    extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, 
-	    integer *, doublereal *, integer *), drotm_(integer *, doublereal 
+    extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *,
+	    integer *, doublereal *, integer *), drotm_(integer *, doublereal
 	    *, integer *, doublereal *, integer *, doublereal *), dswap_(
 	    integer *, doublereal *, integer *, doublereal *, integer *);
     doublereal ssize[7];
-    extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *), stest1_(doublereal *, doublereal *, 
+    extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *,
+	    doublereal *, doublereal *), stest1_(doublereal *, doublereal *,
 	    doublereal *, doublereal *);
 
     /* Fortran I/O blocks */
@@ -616,7 +626,7 @@ static real c_b81 = 0.f;
 /*              .. DDOT .. */
 		d__1 = ddot_(&combla_1.n, sx, &combla_1.incx, sy, &
 			combla_1.incy);
-		stest1_(&d__1, &dt7[kn + (ki << 2) - 5], &ssize1[kn - 1], 
+		stest1_(&d__1, &dt7[kn + (ki << 2) - 5], &ssize1[kn - 1],
 			sfac);
 	    } else if (combla_1.icase == 2) {
 /*              .. DAXPY .. */
@@ -653,9 +663,9 @@ static real c_b81 = 0.f;
 		    for (i__ = 1; i__ <= 7; ++i__) {
 			sx[i__ - 1] = dx1[i__ - 1];
 			sy[i__ - 1] = dy1[i__ - 1];
-			stx[i__ - 1] = dt19x[i__ + (kpar + (kni << 2)) * 7 - 
+			stx[i__ - 1] = dt19x[i__ + (kpar + (kni << 2)) * 7 -
 				36];
-			sty[i__ - 1] = dt19y[i__ + (kpar + (kni << 2)) * 7 - 
+			sty[i__ - 1] = dt19y[i__ + (kpar + (kni << 2)) * 7 -
 				36];
 		    }
 
@@ -746,7 +756,7 @@ static real c_b81 = 0.f;
 	    1.17,1.17,1.17,1.17,1.17,1.17,1.17 };
 
     /* Builtin functions */
-    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_wsle(void);
     /* Subroutine */ int s_stop(char *, ftnlen);
 
@@ -755,13 +765,13 @@ static real c_b81 = 0.f;
     doublereal sx[7], sy[7], stx[7], sty[7];
     integer lenx, leny;
     doublereal mwpc[11];
-    extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int drot_(integer *, doublereal *, integer *,
 	    doublereal *, integer *, doublereal *, doublereal *);
     integer mwpn[11];
     doublereal mwps[11], mwpx[5], mwpy[5];
     integer ksize;
     doublereal copyx[5], copyy[5];
-    extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *,
 	    doublereal *, doublereal *);
     doublereal mwptx[55]	/* was [11][5] */, mwpty[55]	/* was [11][5]
 	     */;
@@ -1090,11 +1100,11 @@ static real c_b81 = 0.f;
 
 } /* testdsdot_ */
 
-/* Subroutine */ int stest1_(doublereal *scomp1, doublereal *strue1, 
+/* Subroutine */ int stest1_(doublereal *scomp1, doublereal *strue1,
 	doublereal *ssize, doublereal *sfac)
 {
     doublereal scomp[1], strue[1];
-    extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *,
 	    doublereal *, doublereal *);
 
 /*     ************************* STEST1 ***************************** */
diff --git a/blastest/src/dblat2.c b/blastest/src/dblat2.c
index 0cdc8f16f..7982c67c5 100644
--- a/blastest/src/dblat2.c
+++ b/blastest/src/dblat2.c
@@ -155,10 +155,15 @@ static logical c_false = FALSE_;
 /*  ===================================================================== */
 /* Main program */ int main(void)
 {
+#ifdef BLIS_ENABLE_HPX
+    char* program = "dblat2";
+    bli_thread_initialize_hpx( 1, &program );
+#endif
+
     /* Initialized data */
 
-    static char snames[6*16] = "DGEMV " "DGBMV " "DSYMV " "DSBMV " "DSPMV " 
-	    "DTRMV " "DTBMV " "DTPMV " "DTRSV " "DTBSV " "DTPSV " "DGER  " 
+    static char snames[6*16] = "DGEMV " "DGBMV " "DSYMV " "DSBMV " "DSPMV "
+	    "DTRMV " "DTBMV " "DTPMV " "DTRSV " "DTBSV " "DTPSV " "DGER  "
 	    "DSYR  " "DSPR  " "DSYR2 " "DSPR2 ";
 
     /* Format strings */
@@ -204,10 +209,10 @@ static logical c_false = FALSE_;
     cllist cl__1;
 
     /* Builtin functions */
-    integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *,
-	     char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), 
-	    s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, 
+	     char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void),
+	    s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen,
 	    ftnlen);
     /* Subroutine */ int s_stop(char *, ftnlen);
     integer f_clos(cllist *);
@@ -227,50 +232,50 @@ static logical c_false = FALSE_;
     integer ninc, nbet, ntra;
     logical rewi;
     integer nout;
-    extern /* Subroutine */ int dchk1_(char *, doublereal *, doublereal *, 
-	    integer *, integer *, logical *, logical *, logical *, integer *, 
-	    integer *, integer *, integer *, integer *, doublereal *, integer 
-	    *, doublereal *, integer *, integer *, integer *, integer *, 
-	    doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, ftnlen), dchk2_(char *, 
-	    doublereal *, doublereal *, integer *, integer *, logical *, 
-	    logical *, logical *, integer *, integer *, integer *, integer *, 
-	    integer *, doublereal *, integer *, doublereal *, integer *, 
-	    integer *, integer *, integer *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, ftnlen), dchk3_(char *, doublereal *, doublereal *, 
-	    integer *, integer *, logical *, logical *, logical *, integer *, 
-	    integer *, integer *, integer *, integer *, integer *, integer *, 
+    extern /* Subroutine */ int dchk1_(char *, doublereal *, doublereal *,
+	    integer *, integer *, logical *, logical *, logical *, integer *,
+	    integer *, integer *, integer *, integer *, doublereal *, integer
+	    *, doublereal *, integer *, integer *, integer *, integer *,
+	    doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, ftnlen), dchk2_(char *,
+	    doublereal *, doublereal *, integer *, integer *, logical *,
+	    logical *, logical *, integer *, integer *, integer *, integer *,
+	    integer *, doublereal *, integer *, doublereal *, integer *,
+	    integer *, integer *, integer *, doublereal *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, ftnlen), dchk3_(char *, doublereal *, doublereal *,
+	    integer *, integer *, logical *, logical *, logical *, integer *,
+	    integer *, integer *, integer *, integer *, integer *, integer *,
 	    integer *, doublereal *, doublereal *, doublereal *, doublereal *,
-	     doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, ftnlen), dchk4_(char *, doublereal *, doublereal *, 
-	    integer *, integer *, logical *, logical *, logical *, integer *, 
-	    integer *, integer *, doublereal *, integer *, integer *, integer 
-	    *, integer *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, ftnlen), dchk5_(char *, doublereal *, doublereal *, 
-	    integer *, integer *, logical *, logical *, logical *, integer *, 
-	    integer *, integer *, doublereal *, integer *, integer *, integer 
-	    *, integer *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, ftnlen), dchk6_(char *, doublereal *, doublereal *, 
-	    integer *, integer *, logical *, logical *, logical *, integer *, 
-	    integer *, integer *, doublereal *, integer *, integer *, integer 
-	    *, integer *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, ftnlen), dchke_(integer *, char *, integer *, 
+	     doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, ftnlen), dchk4_(char *, doublereal *, doublereal *,
+	    integer *, integer *, logical *, logical *, logical *, integer *,
+	    integer *, integer *, doublereal *, integer *, integer *, integer
+	    *, integer *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, ftnlen), dchk5_(char *, doublereal *, doublereal *,
+	    integer *, integer *, logical *, logical *, logical *, integer *,
+	    integer *, integer *, doublereal *, integer *, integer *, integer
+	    *, integer *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, ftnlen), dchk6_(char *, doublereal *, doublereal *,
+	    integer *, integer *, logical *, logical *, logical *, integer *,
+	    integer *, integer *, doublereal *, integer *, integer *, integer
+	    *, integer *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, ftnlen), dchke_(integer *, char *, integer *,
 	    ftnlen);
     logical fatal, trace;
     integer nidim;
-    extern /* Subroutine */ int dmvch_(char *, integer *, integer *, 
-	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dmvch_(char *, integer *, integer *,
+	    doublereal *, doublereal *, integer *, doublereal *, integer *,
 	    doublereal *, doublereal *, integer *, doublereal *, doublereal *,
-	     doublereal *, doublereal *, doublereal *, logical *, integer *, 
+	     doublereal *, doublereal *, doublereal *, logical *, integer *,
 	    logical *, ftnlen);
     char snaps[32], trans[1];
     integer isnum;
@@ -621,7 +626,7 @@ static logical c_false = FALSE_;
 	goto L80;
     }
     for (i__ = 1; i__ <= 16; ++i__) {
-	if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) 
+	if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0)
 		{
 	    goto L70;
 	}
@@ -668,7 +673,7 @@ static logical c_false = FALSE_;
     }
     i__1 = n;
     for (j = 1; j <= i__1; ++j) {
-	yy[j - 1] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 
+	yy[j - 1] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j -
 		1) / 3);
 /* L130: */
     }
@@ -748,44 +753,44 @@ static logical c_false = FALSE_;
 /*           Test DGEMV, 01, and DGBMV, 02. */
 L140:
 	    dchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, 
-		    &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, 
+		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf,
+		    &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx,
 		    xs, y, yy, ys, yt, g, (ftnlen)6);
 	    goto L200;
 /*           Test DSYMV, 03, DSBMV, 04, and DSPMV, 05. */
 L150:
 	    dchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, 
-		    &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, 
+		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf,
+		    &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx,
 		    xs, y, yy, ys, yt, g, (ftnlen)6);
 	    goto L200;
 /*           Test DTRMV, 06, DTBMV, 07, DTPMV, 08, */
 /*           DTRSV, 09, DTBSV, 10, and DTPSV, 11. */
 L160:
 	    dchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, 
+		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc,
 		    &c__65, &c__2, a, aa, as, y, yy, ys, yt, g, z__, (ftnlen)
 		    6);
 	    goto L200;
 /*           Test DGER, 12. */
 L170:
 	    dchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, 
-		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc,
+		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt,
 		    g, z__, (ftnlen)6);
 	    goto L200;
 /*           Test DSYR, 13, and DSPR, 14. */
 L180:
 	    dchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, 
-		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc,
+		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt,
 		    g, z__, (ftnlen)6);
 	    goto L200;
 /*           Test DSYR2, 15, and DSPR2, 16. */
 L190:
 	    dchk6_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, 
-		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc,
+		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt,
 		    g, z__, (ftnlen)6);
 
 L200:
@@ -827,16 +832,21 @@ static logical c_false = FALSE_;
 
 /*     End of DBLAT2. */
 
-    return 0;
+#ifdef BLIS_ENABLE_HPX
+    return bli_thread_finalize_hpx();
+#else
+	// Return peacefully.
+	return 0;
+#endif
 } /* main */
 
-/* Subroutine */ int dchk1_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int dchk1_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, 
-	integer *nalf, doublereal *alf, integer *nbet, doublereal *bet, 
-	integer *ninc, integer *inc, integer *nmax, integer *incmax, 
-	doublereal *a, doublereal *aa, doublereal *as, doublereal *x, 
-	doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, 
+	fatal, integer *nidim, integer *idim, integer *nkb, integer *kb,
+	integer *nalf, doublereal *alf, integer *nbet, doublereal *bet,
+	integer *ninc, integer *inc, integer *nmax, integer *incmax,
+	doublereal *a, doublereal *aa, doublereal *as, doublereal *x,
+	doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy,
 	doublereal *ys, doublereal *yt, doublereal *g, ftnlen sname_len)
 {
     /* Initialized data */
@@ -881,21 +891,21 @@ static logical c_false = FALSE_;
     logical same;
     integer incx, incy;
     logical full, tran, null;
-    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, 
-	    integer *, doublereal *, integer *, doublereal *, integer *, 
-	    integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, 
+    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *,
+	    integer *, doublereal *, integer *, doublereal *, integer *,
+	    integer *, integer *, logical *, doublereal *, ftnlen, ftnlen,
 	    ftnlen);
     doublereal alpha;
     logical isame[13];
     extern /* Subroutine */ int dgbmv_(char *, integer *, integer *, integer *
-	    , integer *, doublereal *, doublereal *, integer *, doublereal *, 
+	    , integer *, doublereal *, doublereal *, integer *, doublereal *,
 	    integer *, doublereal *, doublereal *, integer *, ftnlen), dgemv_(
-	    char *, integer *, integer *, doublereal *, doublereal *, integer 
+	    char *, integer *, integer *, doublereal *, doublereal *, integer
 	    *, doublereal *, integer *, doublereal *, doublereal *, integer *,
-	     ftnlen), dmvch_(char *, integer *, integer *, doublereal *, 
-	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
+	     ftnlen), dmvch_(char *, integer *, integer *, doublereal *,
+	    doublereal *, integer *, doublereal *, integer *, doublereal *,
 	    doublereal *, integer *, doublereal *, doublereal *, doublereal *,
-	     doublereal *, doublereal *, logical *, integer *, logical *, 
+	     doublereal *, doublereal *, logical *, integer *, logical *,
 	    ftnlen);
     integer nargs;
     logical reset;
@@ -1079,9 +1089,9 @@ static logical c_false = FALSE_;
 				    transl = 0.;
 				    i__7 = abs(incy);
 				    i__8 = ml - 1;
-				    dmake_("GE", " ", " ", &c__1, &ml, &y[1], 
+				    dmake_("GE", " ", " ", &c__1, &ml, &y[1],
 					    &c__1, &yy[1], &i__7, &c__0, &
-					    i__8, &reset, &transl, (ftnlen)2, 
+					    i__8, &reset, &transl, (ftnlen)2,
 					    (ftnlen)1, (ftnlen)1);
 
 				    ++nc;
@@ -1089,7 +1099,7 @@ static logical c_false = FALSE_;
 /*                             Save every datum before calling the */
 /*                             subroutine. */
 
-				    *(unsigned char *)transs = *(unsigned 
+				    *(unsigned char *)transs = *(unsigned
 					    char *)trans;
 				    ms = m;
 				    ns = n;
@@ -1149,7 +1159,7 @@ static logical c_false = FALSE_;
 					    al__1.aunit = *ntra;
 					    f_rew(&al__1);
 					}
-					dgemv_(trans, &m, &n, &alpha, &aa[1], 
+					dgemv_(trans, &m, &n, &alpha, &aa[1],
 						&lda, &xx[1], &incx, &beta, &
 						yy[1], &incy, (ftnlen)1);
 				    } else if (banded) {
@@ -1276,8 +1286,8 @@ static logical c_false = FALSE_;
 
 					dmvch_(trans, &m, &n, &alpha, &a[
 						a_offset], nmax, &x[1], &incx,
-						 &beta, &y[1], &incy, &yt[1], 
-						&g[1], &yy[1], eps, &err, 
+						 &beta, &y[1], &incy, &yt[1],
+						&g[1], &yy[1], eps, &err,
 						fatal, nout, &c_true, (ftnlen)
 						1);
 					errmax = max(errmax,err);
@@ -1381,13 +1391,13 @@ static logical c_false = FALSE_;
 
 } /* dchk1_ */
 
-/* Subroutine */ int dchk2_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int dchk2_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, 
-	integer *nalf, doublereal *alf, integer *nbet, doublereal *bet, 
-	integer *ninc, integer *inc, integer *nmax, integer *incmax, 
-	doublereal *a, doublereal *aa, doublereal *as, doublereal *x, 
-	doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, 
+	fatal, integer *nidim, integer *idim, integer *nkb, integer *kb,
+	integer *nalf, doublereal *alf, integer *nbet, doublereal *bet,
+	integer *ninc, integer *inc, integer *nmax, integer *incmax,
+	doublereal *a, doublereal *aa, doublereal *as, doublereal *x,
+	doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy,
 	doublereal *ys, doublereal *yt, doublereal *g, ftnlen sname_len)
 {
     /* Initialized data */
@@ -1425,7 +1435,7 @@ static logical c_false = FALSE_;
 	     f_rew(alist *);
 
     /* Local variables */
-    integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, 
+    integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly,
 	    laa, lda;
     extern logical lde_(doublereal *, doublereal *, integer *);
     doublereal als, bls, err, beta;
@@ -1434,29 +1444,29 @@ static logical c_false = FALSE_;
     integer incx, incy;
     logical full, null;
     char uplo[1];
-    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, 
-	    integer *, doublereal *, integer *, doublereal *, integer *, 
-	    integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, 
+    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *,
+	    integer *, doublereal *, integer *, doublereal *, integer *,
+	    integer *, integer *, logical *, doublereal *, ftnlen, ftnlen,
 	    ftnlen);
     doublereal alpha;
     logical isame[13];
-    extern /* Subroutine */ int dmvch_(char *, integer *, integer *, 
-	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dmvch_(char *, integer *, integer *,
+	    doublereal *, doublereal *, integer *, doublereal *, integer *,
 	    doublereal *, doublereal *, integer *, doublereal *, doublereal *,
-	     doublereal *, doublereal *, doublereal *, logical *, integer *, 
+	     doublereal *, doublereal *, doublereal *, logical *, integer *,
 	    logical *, ftnlen);
     integer nargs;
-    extern /* Subroutine */ int dsbmv_(char *, integer *, integer *, 
-	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dsbmv_(char *, integer *, integer *,
+	    doublereal *, doublereal *, integer *, doublereal *, integer *,
 	    doublereal *, doublereal *, integer *, ftnlen);
     logical reset;
     integer incxs, incys;
-    extern /* Subroutine */ int dspmv_(char *, integer *, doublereal *, 
+    extern /* Subroutine */ int dspmv_(char *, integer *, doublereal *,
 	    doublereal *, doublereal *, integer *, doublereal *, doublereal *,
 	     integer *, ftnlen);
     char uplos[1];
-    extern /* Subroutine */ int dsymv_(char *, integer *, doublereal *, 
-	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
+    extern /* Subroutine */ int dsymv_(char *, integer *, doublereal *,
+	    doublereal *, integer *, doublereal *, integer *, doublereal *,
 	    doublereal *, integer *, ftnlen);
     logical banded, packed;
     extern logical lderes_(char *, char *, integer *, integer *, doublereal *,
@@ -1619,7 +1629,7 @@ static logical c_false = FALSE_;
 				i__8 = n - 1;
 				dmake_("GE", " ", " ", &c__1, &n, &y[1], &
 					c__1, &yy[1], &i__7, &c__0, &i__8, &
-					reset, &transl, (ftnlen)2, (ftnlen)1, 
+					reset, &transl, (ftnlen)2, (ftnlen)1,
 					(ftnlen)1);
 
 				++nc;
@@ -1836,8 +1846,8 @@ static logical c_false = FALSE_;
 
 /*                             Check the result. */
 
-				    dmvch_("N", &n, &n, &alpha, &a[a_offset], 
-					    nmax, &x[1], &incx, &beta, &y[1], 
+				    dmvch_("N", &n, &n, &alpha, &a[a_offset],
+					    nmax, &x[1], &incx, &beta, &y[1],
 					    &incy, &yt[1], &g[1], &yy[1], eps,
 					     &err, fatal, nout, &c_true, (
 					    ftnlen)1);
@@ -1947,12 +1957,12 @@ static logical c_false = FALSE_;
 
 } /* dchk2_ */
 
-/* Subroutine */ int dchk3_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int dchk3_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, 
-	integer *ninc, integer *inc, integer *nmax, integer *incmax, 
-	doublereal *a, doublereal *aa, doublereal *as, doublereal *x, 
-	doublereal *xx, doublereal *xs, doublereal *xt, doublereal *g, 
+	fatal, integer *nidim, integer *idim, integer *nkb, integer *kb,
+	integer *ninc, integer *inc, integer *nmax, integer *incmax,
+	doublereal *a, doublereal *aa, doublereal *as, doublereal *x,
+	doublereal *xx, doublereal *xs, doublereal *xt, doublereal *g,
 	doublereal *z__, ftnlen sname_len)
 {
     /* Initialized data */
@@ -2002,36 +2012,36 @@ static logical c_false = FALSE_;
     integer incx;
     logical full, null;
     char uplo[1];
-    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, 
-	    integer *, doublereal *, integer *, doublereal *, integer *, 
-	    integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, 
+    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *,
+	    integer *, doublereal *, integer *, doublereal *, integer *,
+	    integer *, integer *, logical *, doublereal *, ftnlen, ftnlen,
 	    ftnlen);
     char diags[1];
     logical isame[13];
-    extern /* Subroutine */ int dmvch_(char *, integer *, integer *, 
-	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dmvch_(char *, integer *, integer *,
+	    doublereal *, doublereal *, integer *, doublereal *, integer *,
 	    doublereal *, doublereal *, integer *, doublereal *, doublereal *,
-	     doublereal *, doublereal *, doublereal *, logical *, integer *, 
+	     doublereal *, doublereal *, doublereal *, logical *, integer *,
 	    logical *, ftnlen);
     integer nargs;
-    extern /* Subroutine */ int dtbmv_(char *, char *, char *, integer *, 
-	    integer *, doublereal *, integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dtbmv_(char *, char *, char *, integer *,
+	    integer *, doublereal *, integer *, doublereal *, integer *,
 	    ftnlen, ftnlen, ftnlen);
     logical reset;
-    extern /* Subroutine */ int dtbsv_(char *, char *, char *, integer *, 
-	    integer *, doublereal *, integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dtbsv_(char *, char *, char *, integer *,
+	    integer *, doublereal *, integer *, doublereal *, integer *,
 	    ftnlen, ftnlen, ftnlen);
     integer incxs;
     char trans[1];
-    extern /* Subroutine */ int dtpmv_(char *, char *, char *, integer *, 
-	    doublereal *, doublereal *, integer *, ftnlen, ftnlen, ftnlen), 
+    extern /* Subroutine */ int dtpmv_(char *, char *, char *, integer *,
+	    doublereal *, doublereal *, integer *, ftnlen, ftnlen, ftnlen),
 	    dtrmv_(char *, char *, char *, integer *, doublereal *, integer *,
-	     doublereal *, integer *, ftnlen, ftnlen, ftnlen), dtpsv_(char *, 
-	    char *, char *, integer *, doublereal *, doublereal *, integer *, 
+	     doublereal *, integer *, ftnlen, ftnlen, ftnlen), dtpsv_(char *,
+	    char *, char *, integer *, doublereal *, doublereal *, integer *,
 	    ftnlen, ftnlen, ftnlen);
     char uplos[1];
-    extern /* Subroutine */ int dtrsv_(char *, char *, char *, integer *, 
-	    doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, 
+    extern /* Subroutine */ int dtrsv_(char *, char *, char *, integer *,
+	    doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen,
 	    ftnlen);
     logical banded, packed;
     extern logical lderes_(char *, char *, integer *, integer *, doublereal *,
@@ -2160,13 +2170,13 @@ static logical c_false = FALSE_;
 			    ;
 
 		    for (icd = 1; icd <= 2; ++icd) {
-			*(unsigned char *)diag = *(unsigned char *)&ichd[icd 
+			*(unsigned char *)diag = *(unsigned char *)&ichd[icd
 				- 1];
 
 /*                    Generate the matrix A. */
 
 			transl = 0.;
-			dmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], 
+			dmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset],
 				nmax, &aa[1], &lda, &k, &k, &reset, &transl, (
 				ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -2213,7 +2223,7 @@ static logical c_false = FALSE_;
 
 /*                       Call the subroutine. */
 
-			    if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) 
+			    if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2)
 				    == 0) {
 				if (full) {
 				    if (*trace) {
@@ -2266,7 +2276,7 @@ static logical c_false = FALSE_;
 					al__1.aunit = *ntra;
 					f_rew(&al__1);
 				    }
-				    dtbmv_(uplo, trans, diag, &n, &k, &aa[1], 
+				    dtbmv_(uplo, trans, diag, &n, &k, &aa[1],
 					    &lda, &xx[1], &incx, (ftnlen)1, (
 					    ftnlen)1, (ftnlen)1);
 				} else if (packed) {
@@ -2347,7 +2357,7 @@ static logical c_false = FALSE_;
 					al__1.aunit = *ntra;
 					f_rew(&al__1);
 				    }
-				    dtbsv_(uplo, trans, diag, &n, &k, &aa[1], 
+				    dtbsv_(uplo, trans, diag, &n, &k, &aa[1],
 					    &lda, &xx[1], &incx, (ftnlen)1, (
 					    ftnlen)1, (ftnlen)1);
 				} else if (packed) {
@@ -2389,11 +2399,11 @@ static logical c_false = FALSE_;
 
 /*                       See what data changed inside subroutines. */
 
-			    isame[0] = *(unsigned char *)uplo == *(unsigned 
+			    isame[0] = *(unsigned char *)uplo == *(unsigned
 				    char *)uplos;
-			    isame[1] = *(unsigned char *)trans == *(unsigned 
+			    isame[1] = *(unsigned char *)trans == *(unsigned
 				    char *)transs;
-			    isame[2] = *(unsigned char *)diag == *(unsigned 
+			    isame[2] = *(unsigned char *)diag == *(unsigned
 				    char *)diags;
 			    isame[3] = ns == n;
 			    if (full) {
@@ -2464,7 +2474,7 @@ static logical c_false = FALSE_;
 				    dmvch_(trans, &n, &n, &c_b128, &a[
 					    a_offset], nmax, &x[1], &incx, &
 					    c_b120, &z__[1], &incx, &xt[1], &
-					    g[1], &xx[1], eps, &err, fatal, 
+					    g[1], &xx[1], eps, &err, fatal,
 					    nout, &c_true, (ftnlen)1);
 				} else if (s_cmp(sname + 3, "SV", (ftnlen)2, (
 					ftnlen)2) == 0) {
@@ -2473,7 +2483,7 @@ static logical c_false = FALSE_;
 
 				    i__4 = n;
 				    for (i__ = 1; i__ <= i__4; ++i__) {
-					z__[i__] = xx[(i__ - 1) * abs(incx) + 
+					z__[i__] = xx[(i__ - 1) * abs(incx) +
 						1];
 					xx[(i__ - 1) * abs(incx) + 1] = x[i__]
 						;
@@ -2482,7 +2492,7 @@ static logical c_false = FALSE_;
 				    dmvch_(trans, &n, &n, &c_b128, &a[
 					    a_offset], nmax, &z__[1], &incx, &
 					    c_b120, &x[1], &incx, &xt[1], &g[
-					    1], &xx[1], eps, &err, fatal, 
+					    1], &xx[1], eps, &err, fatal,
 					    nout, &c_false, (ftnlen)1);
 				}
 				errmax = max(errmax,err);
@@ -2584,13 +2594,13 @@ static logical c_false = FALSE_;
 
 } /* dchk3_ */
 
-/* Subroutine */ int dchk4_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int dchk4_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, 
-	integer *ninc, integer *inc, integer *nmax, integer *incmax, 
-	doublereal *a, doublereal *aa, doublereal *as, doublereal *x, 
-	doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, 
-	doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__, 
+	fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf,
+	integer *ninc, integer *inc, integer *nmax, integer *incmax,
+	doublereal *a, doublereal *aa, doublereal *as, doublereal *x,
+	doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy,
+	doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__,
 	ftnlen sname_len)
 {
     /* Format strings */
@@ -2625,23 +2635,23 @@ static logical c_false = FALSE_;
     integer ia, nc, nd, im, in, ms, ix, iy, ns, lx, ly, laa, lda;
     extern logical lde_(doublereal *, doublereal *, integer *);
     doublereal als, err;
-    extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, 
-	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
+    extern /* Subroutine */ int dger_(integer *, integer *, doublereal *,
+	    doublereal *, integer *, doublereal *, integer *, doublereal *,
 	    integer *);
     integer ldas;
     logical same;
     integer incx, incy;
     logical null;
-    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, 
-	    integer *, doublereal *, integer *, doublereal *, integer *, 
-	    integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, 
+    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *,
+	    integer *, doublereal *, integer *, doublereal *, integer *,
+	    integer *, integer *, logical *, doublereal *, ftnlen, ftnlen,
 	    ftnlen);
     doublereal alpha;
     logical isame[13];
-    extern /* Subroutine */ int dmvch_(char *, integer *, integer *, 
-	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dmvch_(char *, integer *, integer *,
+	    doublereal *, doublereal *, integer *, doublereal *, integer *,
 	    doublereal *, doublereal *, integer *, doublereal *, doublereal *,
-	     doublereal *, doublereal *, doublereal *, logical *, integer *, 
+	     doublereal *, doublereal *, doublereal *, logical *, integer *,
 	    logical *, ftnlen);
     integer nargs;
     logical reset;
@@ -2748,7 +2758,7 @@ static logical c_false = FALSE_;
 		i__3 = abs(incx);
 		i__4 = m - 1;
 		dmake_("GE", " ", " ", &c__1, &m, &x[1], &c__1, &xx[1], &i__3,
-			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, 
+			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1,
 			(ftnlen)1);
 		if (m > 1) {
 		    x[m / 2] = 0.;
@@ -2782,7 +2792,7 @@ static logical c_false = FALSE_;
 			transl = 0.;
 			i__5 = m - 1;
 			i__6 = n - 1;
-			dmake_(sname + 1, " ", " ", &m, &n, &a[a_offset], 
+			dmake_(sname + 1, " ", " ", &m, &n, &a[a_offset],
 				nmax, &aa[1], &lda, &i__5, &i__6, &reset, &
 				transl, (ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -2913,9 +2923,9 @@ static logical c_false = FALSE_;
 				} else {
 				    w[0] = y[n - j + 1];
 				}
-				dmvch_("N", &m, &c__1, &alpha, &z__[1], nmax, 
+				dmvch_("N", &m, &c__1, &alpha, &z__[1], nmax,
 					w, &c__1, &c_b128, &a[j * a_dim1 + 1],
-					 &c__1, &yt[1], &g[1], &aa[(j - 1) * 
+					 &c__1, &yt[1], &g[1], &aa[(j - 1) *
 					lda + 1], eps, &err, fatal, nout, &
 					c_true, (ftnlen)1);
 				errmax = max(errmax,err);
@@ -2995,13 +3005,13 @@ static logical c_false = FALSE_;
 
 } /* dchk4_ */
 
-/* Subroutine */ int dchk5_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int dchk5_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, 
-	integer *ninc, integer *inc, integer *nmax, integer *incmax, 
-	doublereal *a, doublereal *aa, doublereal *as, doublereal *x, 
-	doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, 
-	doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__, 
+	fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf,
+	integer *ninc, integer *inc, integer *nmax, integer *incmax,
+	doublereal *a, doublereal *aa, doublereal *as, doublereal *x,
+	doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy,
+	doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__,
 	ftnlen sname_len)
 {
     /* Initialized data */
@@ -3047,21 +3057,21 @@ static logical c_false = FALSE_;
     logical same;
     integer incx;
     logical full;
-    extern /* Subroutine */ int dspr_(char *, integer *, doublereal *, 
+    extern /* Subroutine */ int dspr_(char *, integer *, doublereal *,
 	    doublereal *, integer *, doublereal *, ftnlen);
     logical null;
     char uplo[1];
-    extern /* Subroutine */ int dsyr_(char *, integer *, doublereal *, 
+    extern /* Subroutine */ int dsyr_(char *, integer *, doublereal *,
 	    doublereal *, integer *, doublereal *, integer *, ftnlen), dmake_(
-	    char *, char *, char *, integer *, integer *, doublereal *, 
-	    integer *, doublereal *, integer *, integer *, integer *, logical 
+	    char *, char *, char *, integer *, integer *, doublereal *,
+	    integer *, doublereal *, integer *, integer *, integer *, logical
 	    *, doublereal *, ftnlen, ftnlen, ftnlen);
     doublereal alpha;
     logical isame[13];
-    extern /* Subroutine */ int dmvch_(char *, integer *, integer *, 
-	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dmvch_(char *, integer *, integer *,
+	    doublereal *, doublereal *, integer *, doublereal *, integer *,
 	    doublereal *, doublereal *, integer *, doublereal *, doublereal *,
-	     doublereal *, doublereal *, doublereal *, logical *, integer *, 
+	     doublereal *, doublereal *, doublereal *, logical *, integer *,
 	    logical *, ftnlen);
     integer nargs;
     logical reset;
@@ -3173,7 +3183,7 @@ static logical c_false = FALSE_;
 		i__3 = abs(incx);
 		i__4 = n - 1;
 		dmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3,
-			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, 
+			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1,
 			(ftnlen)1);
 		if (n > 1) {
 		    x[n / 2] = 0.;
@@ -3342,9 +3352,9 @@ static logical c_false = FALSE_;
 				jj = j;
 				lj = n - j + 1;
 			    }
-			    dmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, 
+			    dmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w,
 				    &c__1, &c_b128, &a[jj + j * a_dim1], &
-				    c__1, &yt[1], &g[1], &aa[ja], eps, &err, 
+				    c__1, &yt[1], &g[1], &aa[ja], eps, &err,
 				    fatal, nout, &c_true, (ftnlen)1);
 			    if (full) {
 				if (upper) {
@@ -3442,13 +3452,13 @@ static logical c_false = FALSE_;
 
 } /* dchk5_ */
 
-/* Subroutine */ int dchk6_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int dchk6_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, 
-	integer *ninc, integer *inc, integer *nmax, integer *incmax, 
-	doublereal *a, doublereal *aa, doublereal *as, doublereal *x, 
-	doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, 
-	doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__, 
+	fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf,
+	integer *ninc, integer *inc, integer *nmax, integer *incmax,
+	doublereal *a, doublereal *aa, doublereal *as, doublereal *x,
+	doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy,
+	doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__,
 	ftnlen sname_len)
 {
     /* Initialized data */
@@ -3477,7 +3487,7 @@ static logical c_false = FALSE_;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, 
+    integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5,
 	    i__6;
     alist al__1;
 
@@ -3496,19 +3506,19 @@ static logical c_false = FALSE_;
     integer incx, incy;
     logical full, null;
     char uplo[1];
-    extern /* Subroutine */ int dspr2_(char *, integer *, doublereal *, 
-	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
-	    ftnlen), dsyr2_(char *, integer *, doublereal *, doublereal *, 
-	    integer *, doublereal *, integer *, doublereal *, integer *, 
-	    ftnlen), dmake_(char *, char *, char *, integer *, integer *, 
-	    doublereal *, integer *, doublereal *, integer *, integer *, 
+    extern /* Subroutine */ int dspr2_(char *, integer *, doublereal *,
+	    doublereal *, integer *, doublereal *, integer *, doublereal *,
+	    ftnlen), dsyr2_(char *, integer *, doublereal *, doublereal *,
+	    integer *, doublereal *, integer *, doublereal *, integer *,
+	    ftnlen), dmake_(char *, char *, char *, integer *, integer *,
+	    doublereal *, integer *, doublereal *, integer *, integer *,
 	    integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen);
     doublereal alpha;
     logical isame[13];
-    extern /* Subroutine */ int dmvch_(char *, integer *, integer *, 
-	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dmvch_(char *, integer *, integer *,
+	    doublereal *, doublereal *, integer *, doublereal *, integer *,
 	    doublereal *, doublereal *, integer *, doublereal *, doublereal *,
-	     doublereal *, doublereal *, doublereal *, logical *, integer *, 
+	     doublereal *, doublereal *, doublereal *, logical *, integer *,
 	    logical *, ftnlen);
     integer nargs;
     logical reset;
@@ -3622,7 +3632,7 @@ static logical c_false = FALSE_;
 		i__3 = abs(incx);
 		i__4 = n - 1;
 		dmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3,
-			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, 
+			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1,
 			(ftnlen)1);
 		if (n > 1) {
 		    x[n / 2] = 0.;
@@ -3657,7 +3667,7 @@ static logical c_false = FALSE_;
 			transl = 0.;
 			i__5 = n - 1;
 			i__6 = n - 1;
-			dmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], 
+			dmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset],
 				nmax, &aa[1], &lda, &i__5, &i__6, &reset, &
 				transl, (ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -3835,7 +3845,7 @@ static logical c_false = FALSE_;
 				    jj = j;
 				    lj = n - j + 1;
 				}
-				dmvch_("N", &lj, &c__2, &alpha, &z__[jj + 
+				dmvch_("N", &lj, &c__2, &alpha, &z__[jj +
 					z_dim1], nmax, w, &c__1, &c_b128, &a[
 					jj + j * a_dim1], &c__1, &yt[1], &g[1]
 					, &aa[ja], eps, &err, fatal, nout, &
@@ -3941,7 +3951,7 @@ static logical c_false = FALSE_;
 
 } /* dchk6_ */
 
-/* Subroutine */ int dchke_(integer *isnum, char *srnamt, integer *nout, 
+/* Subroutine */ int dchke_(integer *isnum, char *srnamt, integer *nout,
 	ftnlen srnamt_len)
 {
     /* Format strings */
@@ -3955,39 +3965,39 @@ static logical c_false = FALSE_;
 
     /* Local variables */
     doublereal a[1]	/* was [1][1] */, x[1], y[1], beta;
-    extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, 
-	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
-	    integer *), dspr_(char *, integer *, doublereal *, doublereal *, 
-	    integer *, doublereal *, ftnlen), dsyr_(char *, integer *, 
-	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
-	    ftnlen), dspr2_(char *, integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ int dger_(integer *, integer *, doublereal *,
+	    doublereal *, integer *, doublereal *, integer *, doublereal *,
+	    integer *), dspr_(char *, integer *, doublereal *, doublereal *,
+	    integer *, doublereal *, ftnlen), dsyr_(char *, integer *,
+	    doublereal *, doublereal *, integer *, doublereal *, integer *,
+	    ftnlen), dspr2_(char *, integer *, doublereal *, doublereal *,
 	    integer *, doublereal *, integer *, doublereal *, ftnlen), dsyr2_(
-	    char *, integer *, doublereal *, doublereal *, integer *, 
+	    char *, integer *, doublereal *, doublereal *, integer *,
 	    doublereal *, integer *, doublereal *, integer *, ftnlen);
     doublereal alpha;
     extern /* Subroutine */ int dgbmv_(char *, integer *, integer *, integer *
-	    , integer *, doublereal *, doublereal *, integer *, doublereal *, 
+	    , integer *, doublereal *, doublereal *, integer *, doublereal *,
 	    integer *, doublereal *, doublereal *, integer *, ftnlen), dgemv_(
-	    char *, integer *, integer *, doublereal *, doublereal *, integer 
+	    char *, integer *, integer *, doublereal *, doublereal *, integer
 	    *, doublereal *, integer *, doublereal *, doublereal *, integer *,
-	     ftnlen), dsbmv_(char *, integer *, integer *, doublereal *, 
-	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
-	    doublereal *, integer *, ftnlen), dtbmv_(char *, char *, char *, 
-	    integer *, integer *, doublereal *, integer *, doublereal *, 
+	     ftnlen), dsbmv_(char *, integer *, integer *, doublereal *,
+	    doublereal *, integer *, doublereal *, integer *, doublereal *,
+	    doublereal *, integer *, ftnlen), dtbmv_(char *, char *, char *,
+	    integer *, integer *, doublereal *, integer *, doublereal *,
 	    integer *, ftnlen, ftnlen, ftnlen), dtbsv_(char *, char *, char *,
-	     integer *, integer *, doublereal *, integer *, doublereal *, 
-	    integer *, ftnlen, ftnlen, ftnlen), dspmv_(char *, integer *, 
+	     integer *, integer *, doublereal *, integer *, doublereal *,
+	    integer *, ftnlen, ftnlen, ftnlen), dspmv_(char *, integer *,
 	    doublereal *, doublereal *, doublereal *, integer *, doublereal *,
-	     doublereal *, integer *, ftnlen), dtpmv_(char *, char *, char *, 
-	    integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen, 
-	    ftnlen), dtrmv_(char *, char *, char *, integer *, doublereal *, 
-	    integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen), 
-	    dtpsv_(char *, char *, char *, integer *, doublereal *, 
-	    doublereal *, integer *, ftnlen, ftnlen, ftnlen), dsymv_(char *, 
-	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
+	     doublereal *, integer *, ftnlen), dtpmv_(char *, char *, char *,
+	    integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen,
+	    ftnlen), dtrmv_(char *, char *, char *, integer *, doublereal *,
+	    integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen),
+	    dtpsv_(char *, char *, char *, integer *, doublereal *,
+	    doublereal *, integer *, ftnlen, ftnlen, ftnlen), dsymv_(char *,
+	    integer *, doublereal *, doublereal *, integer *, doublereal *,
 	    integer *, doublereal *, doublereal *, integer *, ftnlen), dtrsv_(
-	    char *, char *, char *, integer *, doublereal *, integer *, 
-	    doublereal *, integer *, ftnlen, ftnlen, ftnlen), chkxer_(char *, 
+	    char *, char *, char *, integer *, doublereal *, integer *,
+	    doublereal *, integer *, ftnlen, ftnlen, ftnlen), chkxer_(char *,
 	    integer *, integer *, logical *, logical *, ftnlen);
 
     /* Fortran I/O blocks */
@@ -4493,9 +4503,9 @@ static logical c_false = FALSE_;
 
 } /* dchke_ */
 
-/* Subroutine */ int dmake_(char *type__, char *uplo, char *diag, integer *m, 
+/* Subroutine */ int dmake_(char *type__, char *uplo, char *diag, integer *m,
 	integer *n, doublereal *a, integer *nmax, doublereal *aa, integer *
-	lda, integer *kl, integer *ku, logical *reset, doublereal *transl, 
+	lda, integer *kl, integer *ku, logical *reset, doublereal *transl,
 	ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
@@ -4553,7 +4563,7 @@ static logical c_false = FALSE_;
 	i__2 = *m;
 	for (i__ = 1; i__ <= i__2; ++i__) {
 	    if (gen || upper && i__ <= j || lower && i__ >= j) {
-		if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) 
+		if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl)
 			{
 		    a[i__ + j * a_dim1] = dbeg_(reset) + *transl;
 		} else {
@@ -4728,9 +4738,9 @@ static logical c_false = FALSE_;
 } /* dmake_ */
 
 /* Subroutine */ int dmvch_(char *trans, integer *m, integer *n, doublereal *
-	alpha, doublereal *a, integer *nmax, doublereal *x, integer *incx, 
-	doublereal *beta, doublereal *y, integer *incy, doublereal *yt, 
-	doublereal *g, doublereal *yy, doublereal *eps, doublereal *err, 
+	alpha, doublereal *a, integer *nmax, doublereal *x, integer *incx,
+	doublereal *beta, doublereal *y, integer *incy, doublereal *yt,
+	doublereal *g, doublereal *yy, doublereal *eps, doublereal *err,
 	logical *fatal, integer *nout, logical *mv, ftnlen trans_len)
 {
     /* Format strings */
@@ -4845,7 +4855,7 @@ static logical c_false = FALSE_;
     *err = 0.;
     i__1 = ml;
     for (i__ = 1; i__ <= i__1; ++i__) {
-	erri = (d__1 = yt[i__] - yy[(i__ - 1) * abs(*incy) + 1], abs(d__1)) / 
+	erri = (d__1 = yt[i__] - yy[(i__ - 1) * abs(*incy) + 1], abs(d__1)) /
 		*eps;
 	if (g[i__] != 0.) {
 	    erri /= g[i__];
@@ -5102,7 +5112,7 @@ doublereal ddiff_(doublereal *x, doublereal *y)
 
 } /* ddiff_ */
 
-/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, 
+/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout,
 	logical *lerr, logical *ok, ftnlen srnamt_len)
 {
     /* Format strings */
diff --git a/blastest/src/dblat3.c b/blastest/src/dblat3.c
index d7a85e29c..b4698f56c 100644
--- a/blastest/src/dblat3.c
+++ b/blastest/src/dblat3.c
@@ -135,9 +135,14 @@ static integer c__2 = 2;
 /*  ===================================================================== */
 /* Main program */ int main(void)
 {
+#ifdef BLIS_ENABLE_HPX
+    char* program = "dblat3";
+    bli_thread_initialize_hpx( 1, &program );
+#endif
+
     /* Initialized data */
 
-    static char snames[6*6] = "DGEMM " "DSYMM " "DTRMM " "DTRSM " "DSYRK " 
+    static char snames[6*6] = "DGEMM " "DSYMM " "DTRMM " "DTRSM " "DSYRK "
 	    "DSYR2K";
 
     /* Format strings */
@@ -179,10 +184,10 @@ static integer c__2 = 2;
     cllist cl__1;
 
     /* Builtin functions */
-    integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *,
-	     char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), 
-	    s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, 
+	     char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void),
+	    s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen,
 	    ftnlen);
     /* Subroutine */ int s_stop(char *, ftnlen);
     integer f_clos(cllist *);
@@ -200,38 +205,38 @@ static integer c__2 = 2;
     integer nbet, ntra;
     logical rewi;
     integer nout;
-    extern /* Subroutine */ int dchk1_(char *, doublereal *, doublereal *, 
-	    integer *, integer *, logical *, logical *, logical *, integer *, 
-	    integer *, integer *, doublereal *, integer *, doublereal *, 
+    extern /* Subroutine */ int dchk1_(char *, doublereal *, doublereal *,
+	    integer *, integer *, logical *, logical *, logical *, integer *,
+	    integer *, integer *, doublereal *, integer *, doublereal *,
 	    integer *, doublereal *, doublereal *, doublereal *, doublereal *,
-	     doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, ftnlen), dchk2_(char *, 
-	    doublereal *, doublereal *, integer *, integer *, logical *, 
-	    logical *, logical *, integer *, integer *, integer *, doublereal 
+	     doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, ftnlen), dchk2_(char *,
+	    doublereal *, doublereal *, integer *, integer *, logical *,
+	    logical *, logical *, integer *, integer *, integer *, doublereal
 	    *, integer *, doublereal *, integer *, doublereal *, doublereal *,
-	     doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, ftnlen), dchk3_(char *, doublereal *, doublereal *, 
-	    integer *, integer *, logical *, logical *, logical *, integer *, 
-	    integer *, integer *, doublereal *, integer *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, doublereal *, ftnlen), 
-	    dchk4_(char *, doublereal *, doublereal *, integer *, integer *, 
-	    logical *, logical *, logical *, integer *, integer *, integer *, 
-	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *, ftnlen), dchk5_(char *, doublereal *, 
-	    doublereal *, integer *, integer *, logical *, logical *, logical 
-	    *, integer *, integer *, integer *, doublereal *, integer *, 
+	     doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, ftnlen), dchk3_(char *, doublereal *, doublereal *,
+	    integer *, integer *, logical *, logical *, logical *, integer *,
+	    integer *, integer *, doublereal *, integer *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, doublereal *, ftnlen),
+	    dchk4_(char *, doublereal *, doublereal *, integer *, integer *,
+	    logical *, logical *, logical *, integer *, integer *, integer *,
+	    doublereal *, integer *, doublereal *, integer *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, doublereal *, ftnlen), dchk5_(char *, doublereal *,
+	    doublereal *, integer *, integer *, logical *, logical *, logical
+	    *, integer *, integer *, integer *, doublereal *, integer *,
 	    doublereal *, integer *, doublereal *, doublereal *, doublereal *,
-	     doublereal *, doublereal *, doublereal *, doublereal *, 
-	    doublereal *, doublereal *, doublereal *, doublereal *, ftnlen), 
+	     doublereal *, doublereal *, doublereal *, doublereal *,
+	    doublereal *, doublereal *, doublereal *, doublereal *, ftnlen),
 	    dchke_(integer *, char *, integer *, ftnlen);
     logical fatal;
-    extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
-	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
+    extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *,
+	    integer *, doublereal *, doublereal *, integer *, doublereal *,
+	    integer *, doublereal *, doublereal *, integer *, doublereal *,
 	    doublereal *, doublereal *, integer *, doublereal *, doublereal *,
 	     logical *, integer *, logical *, ftnlen, ftnlen);
     logical trace;
@@ -506,7 +511,7 @@ static integer c__2 = 2;
 	goto L60;
     }
     for (i__ = 1; i__ <= 6; ++i__) {
-	if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) 
+	if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0)
 		{
 	    goto L50;
 	}
@@ -554,7 +559,7 @@ static integer c__2 = 2;
     }
     i__1 = n;
     for (j = 1; j <= i__1; ++j) {
-	cc[j - 1] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 
+	cc[j - 1] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j -
 		1) / 3);
 /* L110: */
     }
@@ -599,7 +604,7 @@ static integer c__2 = 2;
     }
     i__1 = n;
     for (j = 1; j <= i__1; ++j) {
-	cc[n - j] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 
+	cc[n - j] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j -
 		1) / 3);
 /* L130: */
     }
@@ -672,34 +677,34 @@ static integer c__2 = 2;
 /*           Test DGEMM, 01. */
 L140:
 	    dchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, 
-		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet,
+		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs,
 		    ct, g, (ftnlen)6);
 	    goto L190;
 /*           Test DSYMM, 02. */
 L150:
 	    dchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, 
-		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet,
+		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs,
 		    ct, g, (ftnlen)6);
 	    goto L190;
 /*           Test DTRMM, 03, DTRSM, 04. */
 L160:
 	    dchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65,
 		    ab, aa, as, &ab[4225], bb, bs, ct, g, c__, (ftnlen)6);
 	    goto L190;
 /*           Test DSYRK, 05. */
 L170:
 	    dchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, 
-		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet,
+		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs,
 		    ct, g, (ftnlen)6);
 	    goto L190;
 /*           Test DSYR2K, 06. */
 L180:
 	    dchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet,
 		    bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, ct, g, w, (
 		    ftnlen)6);
 	    goto L190;
@@ -743,15 +748,20 @@ static integer c__2 = 2;
 
 /*     End of DBLAT3. */
 
-    return 0;
+#ifdef BLIS_ENABLE_HPX
+    return bli_thread_finalize_hpx();
+#else
+	// Return peacefully.
+	return 0;
+#endif
 } /* main */
 
-/* Subroutine */ int dchk1_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int dchk1_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, 
-	integer *nbet, doublereal *bet, integer *nmax, doublereal *a, 
-	doublereal *aa, doublereal *as, doublereal *b, doublereal *bb, 
-	doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs, 
+	fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf,
+	integer *nbet, doublereal *bet, integer *nmax, doublereal *a,
+	doublereal *aa, doublereal *as, doublereal *b, doublereal *bb,
+	doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs,
 	doublereal *ct, doublereal *g, ftnlen sname_len)
 {
     /* Initialized data */
@@ -775,7 +785,7 @@ static integer c__2 = 2;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4, i__5, i__6;
     alist al__1;
 
@@ -784,22 +794,22 @@ static integer c__2 = 2;
 	     f_rew(alist *);
 
     /* Local variables */
-    integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, 
+    integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns,
 	    ica, icb, laa, lbb, lda, lcc, ldb, ldc;
     extern logical lde_(doublereal *, doublereal *, integer *);
     doublereal als, bls, err, beta;
     integer ldas, ldbs, ldcs;
     logical same, null;
-    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, 
-	    integer *, doublereal *, integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *,
+	    integer *, doublereal *, integer *, doublereal *, integer *,
 	    logical *, doublereal *, ftnlen, ftnlen, ftnlen);
     doublereal alpha;
-    extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
-	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
+    extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *,
+	    integer *, doublereal *, doublereal *, integer *, doublereal *,
+	    integer *, doublereal *, doublereal *, integer *, doublereal *,
 	    doublereal *, doublereal *, integer *, doublereal *, doublereal *,
-	     logical *, integer *, logical *, ftnlen, ftnlen), dgemm_(char *, 
-	    char *, integer *, integer *, integer *, doublereal *, doublereal 
+	     logical *, integer *, logical *, ftnlen, ftnlen), dgemm_(char *,
+	    char *, integer *, integer *, integer *, doublereal *, doublereal
 	    *, integer *, doublereal *, integer *, doublereal *, doublereal *,
 	     integer *, ftnlen, ftnlen);
     logical isame[13], trana, tranb;
@@ -898,7 +908,7 @@ static integer c__2 = 2;
 		for (ica = 1; ica <= 3; ++ica) {
 		    *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1]
 			    ;
-		    trana = *(unsigned char *)transa == 'T' || *(unsigned 
+		    trana = *(unsigned char *)transa == 'T' || *(unsigned
 			    char *)transa == 'C';
 
 		    if (trana) {
@@ -926,9 +936,9 @@ static integer c__2 = 2;
 			    ftnlen)1);
 
 		    for (icb = 1; icb <= 3; ++icb) {
-			*(unsigned char *)transb = *(unsigned char *)&ich[icb 
+			*(unsigned char *)transb = *(unsigned char *)&ich[icb
 				- 1];
-			tranb = *(unsigned char *)transb == 'T' || *(unsigned 
+			tranb = *(unsigned char *)transb == 'T' || *(unsigned
 				char *)transb == 'C';
 
 			if (tranb) {
@@ -1100,9 +1110,9 @@ static integer c__2 = 2;
 
 				    dmmch_(transa, transb, &m, &n, &k, &alpha,
 					     &a[a_offset], nmax, &b[b_offset],
-					     nmax, &beta, &c__[c_offset], 
+					     nmax, &beta, &c__[c_offset],
 					    nmax, &ct[1], &g[1], &cc[1], &ldc,
-					     eps, &err, fatal, nout, &c_true, 
+					     eps, &err, fatal, nout, &c_true,
 					    (ftnlen)1, (ftnlen)1);
 				    errmax = max(errmax,err);
 /*                             If got really bad answer, report and */
@@ -1183,12 +1193,12 @@ static integer c__2 = 2;
 
 } /* dchk1_ */
 
-/* Subroutine */ int dchk2_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int dchk2_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, 
-	integer *nbet, doublereal *bet, integer *nmax, doublereal *a, 
-	doublereal *aa, doublereal *as, doublereal *b, doublereal *bb, 
-	doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs, 
+	fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf,
+	integer *nbet, doublereal *bet, integer *nmax, doublereal *a,
+	doublereal *aa, doublereal *as, doublereal *b, doublereal *bb,
+	doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs,
 	doublereal *ct, doublereal *g, ftnlen sname_len)
 {
     /* Initialized data */
@@ -1213,7 +1223,7 @@ static integer c__2 = 2;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4, i__5;
     alist al__1;
 
@@ -1222,7 +1232,7 @@ static integer c__2 = 2;
 	     f_rew(alist *);
 
     /* Local variables */
-    integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, 
+    integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc,
 	    ldb, ldc;
     extern logical lde_(doublereal *, doublereal *, integer *);
     integer ics;
@@ -1234,21 +1244,21 @@ static integer c__2 = 2;
     char side[1];
     logical left, null;
     char uplo[1];
-    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, 
-	    integer *, doublereal *, integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *,
+	    integer *, doublereal *, integer *, doublereal *, integer *,
 	    logical *, doublereal *, ftnlen, ftnlen, ftnlen);
     doublereal alpha;
-    extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
-	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
+    extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *,
+	    integer *, doublereal *, doublereal *, integer *, doublereal *,
+	    integer *, doublereal *, doublereal *, integer *, doublereal *,
 	    doublereal *, doublereal *, integer *, doublereal *, doublereal *,
 	     logical *, integer *, logical *, ftnlen, ftnlen);
     logical isame[13];
     char sides[1];
     integer nargs;
     logical reset;
-    extern /* Subroutine */ int dsymm_(char *, char *, integer *, integer *, 
-	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dsymm_(char *, char *, integer *, integer *,
+	    doublereal *, doublereal *, integer *, doublereal *, integer *,
 	    doublereal *, doublereal *, integer *, ftnlen, ftnlen);
     char uplos[1];
     extern logical lderes_(char *, char *, integer *, integer *, doublereal *,
@@ -1391,7 +1401,7 @@ static integer c__2 = 2;
 
 /*                       Generate the matrix C. */
 
-			    dmake_("GE", " ", " ", &m, &n, &c__[c_offset], 
+			    dmake_("GE", " ", " ", &m, &n, &c__[c_offset],
 				    nmax, &cc[1], &ldc, &reset, &c_b86, (
 				    ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -1472,9 +1482,9 @@ static integer c__2 = 2;
 
 /*                       See what data changed inside subroutines. */
 
-			    isame[0] = *(unsigned char *)sides == *(unsigned 
+			    isame[0] = *(unsigned char *)sides == *(unsigned
 				    char *)side;
-			    isame[1] = *(unsigned char *)uplos == *(unsigned 
+			    isame[1] = *(unsigned char *)uplos == *(unsigned
 				    char *)uplo;
 			    isame[2] = ms == m;
 			    isame[3] = ns == n;
@@ -1519,14 +1529,14 @@ static integer c__2 = 2;
 
 				if (left) {
 				    dmmch_("N", "N", &m, &n, &m, &alpha, &a[
-					    a_offset], nmax, &b[b_offset], 
+					    a_offset], nmax, &b[b_offset],
 					    nmax, &beta, &c__[c_offset], nmax,
 					     &ct[1], &g[1], &cc[1], &ldc, eps,
 					     &err, fatal, nout, &c_true, (
 					    ftnlen)1, (ftnlen)1);
 				} else {
 				    dmmch_("N", "N", &m, &n, &n, &alpha, &b[
-					    b_offset], nmax, &a[a_offset], 
+					    b_offset], nmax, &a[a_offset],
 					    nmax, &beta, &c__[c_offset], nmax,
 					     &ct[1], &g[1], &cc[1], &ldc, eps,
 					     &err, fatal, nout, &c_true, (
@@ -1606,11 +1616,11 @@ static integer c__2 = 2;
 
 } /* dchk2_ */
 
-/* Subroutine */ int dchk3_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int dchk3_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, 
-	integer *nmax, doublereal *a, doublereal *aa, doublereal *as, 
-	doublereal *b, doublereal *bb, doublereal *bs, doublereal *ct, 
+	fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf,
+	integer *nmax, doublereal *a, doublereal *aa, doublereal *as,
+	doublereal *b, doublereal *bb, doublereal *bs, doublereal *ct,
 	doublereal *g, doublereal *c__, ftnlen sname_len)
 {
     /* Initialized data */
@@ -1637,7 +1647,7 @@ static integer c__2 = 2;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4, i__5;
     alist al__1;
 
@@ -1658,25 +1668,25 @@ static integer c__2 = 2;
     char side[1];
     logical left, null;
     char uplo[1];
-    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, 
-	    integer *, doublereal *, integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *,
+	    integer *, doublereal *, integer *, doublereal *, integer *,
 	    logical *, doublereal *, ftnlen, ftnlen, ftnlen);
     doublereal alpha;
     char diags[1];
-    extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
-	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
+    extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *,
+	    integer *, doublereal *, doublereal *, integer *, doublereal *,
+	    integer *, doublereal *, doublereal *, integer *, doublereal *,
 	    doublereal *, doublereal *, integer *, doublereal *, doublereal *,
 	     logical *, integer *, logical *, ftnlen, ftnlen);
     logical isame[13];
     char sides[1];
     integer nargs;
     logical reset;
-    extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, 
-	    integer *, integer *, doublereal *, doublereal *, integer *, 
+    extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *,
+	    integer *, integer *, doublereal *, doublereal *, integer *,
 	    doublereal *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), dtrsm_(
 	    char *, char *, char *, char *, integer *, integer *, doublereal *
-	    , doublereal *, integer *, doublereal *, integer *, ftnlen, 
+	    , doublereal *, integer *, doublereal *, integer *, ftnlen,
 	    ftnlen, ftnlen, ftnlen);
     char uplos[1];
     extern logical lderes_(char *, char *, integer *, integer *, doublereal *,
@@ -1816,7 +1826,7 @@ static integer c__2 = 2;
 
 /*                          Generate the matrix B. */
 
-				dmake_("GE", " ", " ", &m, &n, &b[b_offset], 
+				dmake_("GE", " ", " ", &m, &n, &b[b_offset],
 					nmax, &bb[1], &ldb, &reset, &c_b86, (
 					ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -1882,7 +1892,7 @@ static integer c__2 = 2;
 				    }
 				    dtrmm_(side, uplo, transa, diag, &m, &n, &
 					    alpha, &aa[1], &lda, &bb[1], &ldb,
-					     (ftnlen)1, (ftnlen)1, (ftnlen)1, 
+					     (ftnlen)1, (ftnlen)1, (ftnlen)1,
 					    (ftnlen)1);
 				} else if (s_cmp(sname + 3, "SM", (ftnlen)2, (
 					ftnlen)2) == 0) {
@@ -1915,7 +1925,7 @@ static integer c__2 = 2;
 				    }
 				    dtrsm_(side, uplo, transa, diag, &m, &n, &
 					    alpha, &aa[1], &lda, &bb[1], &ldb,
-					     (ftnlen)1, (ftnlen)1, (ftnlen)1, 
+					     (ftnlen)1, (ftnlen)1, (ftnlen)1,
 					    (ftnlen)1);
 				}
 
@@ -1984,18 +1994,18 @@ static integer c__2 = 2;
 					    dmmch_(transa, "N", &m, &n, &m, &
 						    alpha, &a[a_offset], nmax,
 						     &b[b_offset], nmax, &
-						    c_b86, &c__[c_offset], 
+						    c_b86, &c__[c_offset],
 						    nmax, &ct[1], &g[1], &bb[
-						    1], &ldb, eps, &err, 
+						    1], &ldb, eps, &err,
 						    fatal, nout, &c_true, (
 						    ftnlen)1, (ftnlen)1);
 					} else {
 					    dmmch_("N", transa, &m, &n, &n, &
 						    alpha, &b[b_offset], nmax,
 						     &a[a_offset], nmax, &
-						    c_b86, &c__[c_offset], 
+						    c_b86, &c__[c_offset],
 						    nmax, &ct[1], &g[1], &bb[
-						    1], &ldb, eps, &err, 
+						    1], &ldb, eps, &err,
 						    fatal, nout, &c_true, (
 						    ftnlen)1, (ftnlen)1);
 					}
@@ -2008,10 +2018,10 @@ static integer c__2 = 2;
 					i__4 = n;
 					for (j = 1; j <= i__4; ++j) {
 					    i__5 = m;
-					    for (i__ = 1; i__ <= i__5; ++i__) 
+					    for (i__ = 1; i__ <= i__5; ++i__)
 						    {
 			  c__[i__ + j * c_dim1] = bb[i__ + (j - 1) * ldb];
-			  bb[i__ + (j - 1) * ldb] = alpha * b[i__ + j * 
+			  bb[i__ + (j - 1) * ldb] = alpha * b[i__ + j *
 				  b_dim1];
 /* L60: */
 					    }
@@ -2024,16 +2034,16 @@ static integer c__2 = 2;
 						     &c__[c_offset], nmax, &
 						    c_b86, &b[b_offset], nmax,
 						     &ct[1], &g[1], &bb[1], &
-						    ldb, eps, &err, fatal, 
+						    ldb, eps, &err, fatal,
 						    nout, &c_false, (ftnlen)1,
 						     (ftnlen)1);
 					} else {
 					    dmmch_("N", transa, &m, &n, &n, &
-						    c_b96, &c__[c_offset], 
-						    nmax, &a[a_offset], nmax, 
-						    &c_b86, &b[b_offset], 
+						    c_b96, &c__[c_offset],
+						    nmax, &a[a_offset], nmax,
+						    &c_b86, &b[b_offset],
 						    nmax, &ct[1], &g[1], &bb[
-						    1], &ldb, eps, &err, 
+						    1], &ldb, eps, &err,
 						    fatal, nout, &c_false, (
 						    ftnlen)1, (ftnlen)1);
 					}
@@ -2114,12 +2124,12 @@ static integer c__2 = 2;
 
 } /* dchk3_ */
 
-/* Subroutine */ int dchk4_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int dchk4_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, 
-	integer *nbet, doublereal *bet, integer *nmax, doublereal *a, 
-	doublereal *aa, doublereal *as, doublereal *b, doublereal *bb, 
-	doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs, 
+	fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf,
+	integer *nbet, doublereal *bet, integer *nmax, doublereal *a,
+	doublereal *aa, doublereal *as, doublereal *b, doublereal *bb,
+	doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs,
 	doublereal *ct, doublereal *g, ftnlen sname_len)
 {
     /* Initialized data */
@@ -2146,7 +2156,7 @@ static integer c__2 = 2;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4, i__5;
     alist al__1;
 
@@ -2166,13 +2176,13 @@ static integer c__2 = 2;
     doublereal bets;
     logical tran, null;
     char uplo[1];
-    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, 
-	    integer *, doublereal *, integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *,
+	    integer *, doublereal *, integer *, doublereal *, integer *,
 	    logical *, doublereal *, ftnlen, ftnlen, ftnlen);
     doublereal alpha;
-    extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
-	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
+    extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *,
+	    integer *, doublereal *, doublereal *, integer *, doublereal *,
+	    integer *, doublereal *, doublereal *, integer *, doublereal *,
 	    doublereal *, doublereal *, integer *, doublereal *, doublereal *,
 	     logical *, integer *, logical *, ftnlen, ftnlen);
     logical isame[13];
@@ -2180,7 +2190,7 @@ static integer c__2 = 2;
     logical reset;
     char trans[1];
     logical upper;
-    extern /* Subroutine */ int dsyrk_(char *, char *, integer *, integer *, 
+    extern /* Subroutine */ int dsyrk_(char *, char *, integer *, integer *,
 	    doublereal *, doublereal *, integer *, doublereal *, doublereal *,
 	     integer *, ftnlen, ftnlen);
     char uplos[1];
@@ -2312,7 +2322,7 @@ static integer c__2 = 2;
 
 /*                       Generate the matrix C. */
 
-			    dmake_("SY", uplo, " ", &n, &n, &c__[c_offset], 
+			    dmake_("SY", uplo, " ", &n, &n, &c__[c_offset],
 				    nmax, &cc[1], &ldc, &reset, &c_b86, (
 				    ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -2369,7 +2379,7 @@ static integer c__2 = 2;
 				al__1.aunit = *ntra;
 				f_rew(&al__1);
 			    }
-			    dsyrk_(uplo, trans, &n, &k, &alpha, &aa[1], &lda, 
+			    dsyrk_(uplo, trans, &n, &k, &alpha, &aa[1], &lda,
 				    &beta, &cc[1], &ldc, (ftnlen)1, (ftnlen)1)
 				    ;
 
@@ -2385,9 +2395,9 @@ static integer c__2 = 2;
 
 /*                       See what data changed inside subroutines. */
 
-			    isame[0] = *(unsigned char *)uplos == *(unsigned 
+			    isame[0] = *(unsigned char *)uplos == *(unsigned
 				    char *)uplo;
-			    isame[1] = *(unsigned char *)transs == *(unsigned 
+			    isame[1] = *(unsigned char *)transs == *(unsigned
 				    char *)trans;
 			    isame[2] = ns == n;
 			    isame[3] = ks == k;
@@ -2440,19 +2450,19 @@ static integer c__2 = 2;
 				    }
 				    if (tran) {
 					dmmch_("T", "N", &lj, &c__1, &k, &
-						alpha, &a[jj * a_dim1 + 1], 
-						nmax, &a[j * a_dim1 + 1], 
-						nmax, &beta, &c__[jj + j * 
-						c_dim1], nmax, &ct[1], &g[1], 
-						&cc[jc], &ldc, eps, &err, 
+						alpha, &a[jj * a_dim1 + 1],
+						nmax, &a[j * a_dim1 + 1],
+						nmax, &beta, &c__[jj + j *
+						c_dim1], nmax, &ct[1], &g[1],
+						&cc[jc], &ldc, eps, &err,
 						fatal, nout, &c_true, (ftnlen)
 						1, (ftnlen)1);
 				    } else {
 					dmmch_("N", "T", &lj, &c__1, &k, &
-						alpha, &a[jj + a_dim1], nmax, 
+						alpha, &a[jj + a_dim1], nmax,
 						&a[j + a_dim1], nmax, &beta, &
 						c__[jj + j * c_dim1], nmax, &
-						ct[1], &g[1], &cc[jc], &ldc, 
+						ct[1], &g[1], &cc[jc], &ldc,
 						eps, &err, fatal, nout, &
 						c_true, (ftnlen)1, (ftnlen)1);
 				    }
@@ -2544,12 +2554,12 @@ static integer c__2 = 2;
 
 } /* dchk4_ */
 
-/* Subroutine */ int dchk5_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int dchk5_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, 
-	integer *nbet, doublereal *bet, integer *nmax, doublereal *ab, 
-	doublereal *aa, doublereal *as, doublereal *bb, doublereal *bs, 
-	doublereal *c__, doublereal *cc, doublereal *cs, doublereal *ct, 
+	fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf,
+	integer *nbet, doublereal *bet, integer *nmax, doublereal *ab,
+	doublereal *aa, doublereal *as, doublereal *bb, doublereal *bs,
+	doublereal *c__, doublereal *cc, doublereal *cs, doublereal *ct,
 	doublereal *g, doublereal *w, ftnlen sname_len)
 {
     /* Initialized data */
@@ -2597,13 +2607,13 @@ static integer c__2 = 2;
     doublereal bets;
     logical tran, null;
     char uplo[1];
-    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, 
-	    integer *, doublereal *, integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dmake_(char *, char *, char *, integer *,
+	    integer *, doublereal *, integer *, doublereal *, integer *,
 	    logical *, doublereal *, ftnlen, ftnlen, ftnlen);
     doublereal alpha;
-    extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
-	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
+    extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *,
+	    integer *, doublereal *, doublereal *, integer *, doublereal *,
+	    integer *, doublereal *, doublereal *, integer *, doublereal *,
 	    doublereal *, doublereal *, integer *, doublereal *, doublereal *,
 	     logical *, integer *, logical *, ftnlen, ftnlen);
     logical isame[13];
@@ -2612,8 +2622,8 @@ static integer c__2 = 2;
     char trans[1];
     logical upper;
     char uplos[1];
-    extern /* Subroutine */ int dsyr2k_(char *, char *, integer *, integer *, 
-	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
+    extern /* Subroutine */ int dsyr2k_(char *, char *, integer *, integer *,
+	    doublereal *, doublereal *, integer *, doublereal *, integer *,
 	    doublereal *, doublereal *, integer *, ftnlen, ftnlen);
     extern logical lderes_(char *, char *, integer *, integer *, doublereal *,
 	     doublereal *, integer *, ftnlen, ftnlen);
@@ -2762,7 +2772,7 @@ static integer c__2 = 2;
 
 /*                       Generate the matrix C. */
 
-			    dmake_("SY", uplo, " ", &n, &n, &c__[c_offset], 
+			    dmake_("SY", uplo, " ", &n, &n, &c__[c_offset],
 				    nmax, &cc[1], &ldc, &reset, &c_b86, (
 				    ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -2843,9 +2853,9 @@ static integer c__2 = 2;
 
 /*                       See what data changed inside subroutines. */
 
-			    isame[0] = *(unsigned char *)uplos == *(unsigned 
+			    isame[0] = *(unsigned char *)uplos == *(unsigned
 				    char *)uplo;
-			    isame[1] = *(unsigned char *)transs == *(unsigned 
+			    isame[1] = *(unsigned char *)transs == *(unsigned
 				    char *)trans;
 			    isame[2] = ns == n;
 			    isame[3] = ks == k;
@@ -2902,7 +2912,7 @@ static integer c__2 = 2;
 				    if (tran) {
 					i__6 = k;
 					for (i__ = 1; i__ <= i__6; ++i__) {
-					    w[i__] = ab[(j - 1 << 1) * *nmax 
+					    w[i__] = ab[(j - 1 << 1) * *nmax
 						    + k + i__];
 					    w[k + i__] = ab[(j - 1 << 1) * *
 						    nmax + i__];
@@ -2913,17 +2923,17 @@ static integer c__2 = 2;
 					i__8 = *nmax << 1;
 					dmmch_("T", "N", &lj, &c__1, &i__6, &
 						alpha, &ab[jjab], &i__7, &w[1]
-						, &i__8, &beta, &c__[jj + j * 
-						c_dim1], nmax, &ct[1], &g[1], 
-						&cc[jc], &ldc, eps, &err, 
+						, &i__8, &beta, &c__[jj + j *
+						c_dim1], nmax, &ct[1], &g[1],
+						&cc[jc], &ldc, eps, &err,
 						fatal, nout, &c_true, (ftnlen)
 						1, (ftnlen)1);
 				    } else {
 					i__6 = k;
 					for (i__ = 1; i__ <= i__6; ++i__) {
-					    w[i__] = ab[(k + i__ - 1) * *nmax 
+					    w[i__] = ab[(k + i__ - 1) * *nmax
 						    + j];
-					    w[k + i__] = ab[(i__ - 1) * *nmax 
+					    w[k + i__] = ab[(i__ - 1) * *nmax
 						    + j];
 /* L60: */
 					}
@@ -2931,9 +2941,9 @@ static integer c__2 = 2;
 					i__7 = *nmax << 1;
 					dmmch_("N", "N", &lj, &c__1, &i__6, &
 						alpha, &ab[jj], nmax, &w[1], &
-						i__7, &beta, &c__[jj + j * 
-						c_dim1], nmax, &ct[1], &g[1], 
-						&cc[jc], &ldc, eps, &err, 
+						i__7, &beta, &c__[jj + j *
+						c_dim1], nmax, &ct[1], &g[1],
+						&cc[jc], &ldc, eps, &err,
 						fatal, nout, &c_true, (ftnlen)
 						1, (ftnlen)1);
 				    }
@@ -3029,7 +3039,7 @@ static integer c__2 = 2;
 
 } /* dchk5_ */
 
-/* Subroutine */ int dchke_(integer *isnum, char *srnamt, integer *nout, 
+/* Subroutine */ int dchke_(integer *isnum, char *srnamt, integer *nout,
 	ftnlen srnamt_len)
 {
     /* Format strings */
@@ -3042,24 +3052,24 @@ static integer c__2 = 2;
     integer s_wsfe(cilist *), do_fio(integer *, char *, ftnlen), e_wsfe(void);
 
     /* Local variables */
-    doublereal a[2]	/* was [2][1] */, b[2]	/* was [2][1] */, c__[2]	
+    doublereal a[2]	/* was [2][1] */, b[2]	/* was [2][1] */, c__[2]
 	    /* was [2][1] */, beta, alpha;
-    extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, 
-	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
+    extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+	    integer *, doublereal *, doublereal *, integer *, doublereal *,
 	    integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen),
-	     dtrmm_(char *, char *, char *, char *, integer *, integer *, 
-	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
+	     dtrmm_(char *, char *, char *, char *, integer *, integer *,
+	    doublereal *, doublereal *, integer *, doublereal *, integer *,
 	    ftnlen, ftnlen, ftnlen, ftnlen), dsymm_(char *, char *, integer *,
-	     integer *, doublereal *, doublereal *, integer *, doublereal *, 
+	     integer *, doublereal *, doublereal *, integer *, doublereal *,
 	    integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen),
-	     dtrsm_(char *, char *, char *, char *, integer *, integer *, 
-	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
+	     dtrsm_(char *, char *, char *, char *, integer *, integer *,
+	    doublereal *, doublereal *, integer *, doublereal *, integer *,
 	    ftnlen, ftnlen, ftnlen, ftnlen), dsyrk_(char *, char *, integer *,
-	     integer *, doublereal *, doublereal *, integer *, doublereal *, 
-	    doublereal *, integer *, ftnlen, ftnlen), dsyr2k_(char *, char *, 
-	    integer *, integer *, doublereal *, doublereal *, integer *, 
-	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
-	    ftnlen, ftnlen), chkxer_(char *, integer *, integer *, logical *, 
+	     integer *, doublereal *, doublereal *, integer *, doublereal *,
+	    doublereal *, integer *, ftnlen, ftnlen), dsyr2k_(char *, char *,
+	    integer *, integer *, doublereal *, doublereal *, integer *,
+	    doublereal *, integer *, doublereal *, doublereal *, integer *,
+	    ftnlen, ftnlen), chkxer_(char *, integer *, integer *, logical *,
 	    logical *, ftnlen);
 
     /* Fortran I/O blocks */
@@ -3113,142 +3123,142 @@ static integer c__2 = 2;
     }
 L10:
     infoc_1.infot = 1;
-    dgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 1;
-    dgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 2;
-    dgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 2;
-    dgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    dgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    dgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    dgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    dgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    dgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    dgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    dgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    dgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    dgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    dgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    dgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    dgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    dgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__2, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    dgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__2, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    dgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, 
+    dgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    dgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    dgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    dgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, 
+    dgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    dgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    dgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    dgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, 
+    dgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    dgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, 
+    dgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    dgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    dgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    dgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
@@ -3952,9 +3962,9 @@ static integer c__2 = 2;
 
 } /* dchke_ */
 
-/* Subroutine */ int dmake_(char *type__, char *uplo, char *diag, integer *m, 
+/* Subroutine */ int dmake_(char *type__, char *uplo, char *diag, integer *m,
 	integer *n, doublereal *a, integer *nmax, doublereal *aa, integer *
-	lda, logical *reset, doublereal *transl, ftnlen type_len, ftnlen 
+	lda, logical *reset, doublereal *transl, ftnlen type_len, ftnlen
 	uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
@@ -4097,8 +4107,8 @@ static integer c__2 = 2;
 } /* dmake_ */
 
 /* Subroutine */ int dmmch_(char *transa, char *transb, integer *m, integer *
-	n, integer *kk, doublereal *alpha, doublereal *a, integer *lda, 
-	doublereal *b, integer *ldb, doublereal *beta, doublereal *c__, 
+	n, integer *kk, doublereal *alpha, doublereal *a, integer *lda,
+	doublereal *b, integer *ldb, doublereal *beta, doublereal *c__,
 	integer *ldc, doublereal *ct, doublereal *g, doublereal *cc, integer *
 	ldcc, doublereal *eps, doublereal *err, logical *fatal, integer *nout,
 	 logical *mv, ftnlen transa_len, ftnlen transb_len)
@@ -4112,7 +4122,7 @@ static integer c__2 = 2;
 	    " \002,i3)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1,
 	    cc_offset, i__1, i__2, i__3;
     doublereal d__1, d__2;
 
@@ -4166,9 +4176,9 @@ static integer c__2 = 2;
     cc -= cc_offset;
 
     /* Function Body */
-    trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 
+    trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa ==
 	    'C';
-    tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 
+    tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb ==
 	    'C';
 
 /*     Compute expected result, one column at a time, in CT using data */
@@ -4190,7 +4200,7 @@ static integer c__2 = 2;
 		i__3 = *m;
 		for (i__ = 1; i__ <= i__3; ++i__) {
 		    ct[i__] += a[i__ + k * a_dim1] * b[k + j * b_dim1];
-		    g[i__] += (d__1 = a[i__ + k * a_dim1], abs(d__1)) * (d__2 
+		    g[i__] += (d__1 = a[i__ + k * a_dim1], abs(d__1)) * (d__2
 			    = b[k + j * b_dim1], abs(d__2));
 /* L20: */
 		}
@@ -4202,7 +4212,7 @@ static integer c__2 = 2;
 		i__3 = *m;
 		for (i__ = 1; i__ <= i__3; ++i__) {
 		    ct[i__] += a[k + i__ * a_dim1] * b[k + j * b_dim1];
-		    g[i__] += (d__1 = a[k + i__ * a_dim1], abs(d__1)) * (d__2 
+		    g[i__] += (d__1 = a[k + i__ * a_dim1], abs(d__1)) * (d__2
 			    = b[k + j * b_dim1], abs(d__2));
 /* L40: */
 		}
@@ -4214,7 +4224,7 @@ static integer c__2 = 2;
 		i__3 = *m;
 		for (i__ = 1; i__ <= i__3; ++i__) {
 		    ct[i__] += a[i__ + k * a_dim1] * b[j + k * b_dim1];
-		    g[i__] += (d__1 = a[i__ + k * a_dim1], abs(d__1)) * (d__2 
+		    g[i__] += (d__1 = a[i__ + k * a_dim1], abs(d__1)) * (d__2
 			    = b[j + k * b_dim1], abs(d__2));
 /* L60: */
 		}
@@ -4226,7 +4236,7 @@ static integer c__2 = 2;
 		i__3 = *m;
 		for (i__ = 1; i__ <= i__3; ++i__) {
 		    ct[i__] += a[k + i__ * a_dim1] * b[j + k * b_dim1];
-		    g[i__] += (d__1 = a[k + i__ * a_dim1], abs(d__1)) * (d__2 
+		    g[i__] += (d__1 = a[k + i__ * a_dim1], abs(d__1)) * (d__2
 			    = b[j + k * b_dim1], abs(d__2));
 /* L80: */
 		}
@@ -4520,7 +4530,7 @@ doublereal ddiff_(doublereal *x, doublereal *y)
 
 } /* ddiff_ */
 
-/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, 
+/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout,
 	logical *lerr, logical *ok, ftnlen srnamt_len)
 {
     /* Format strings */
diff --git a/blastest/src/sblat1.c b/blastest/src/sblat1.c
index 6996666a5..7bde1b108 100644
--- a/blastest/src/sblat1.c
+++ b/blastest/src/sblat1.c
@@ -69,6 +69,11 @@ static real c_b63 = 0.f;
 /*  ===================================================================== */
 /* Main program */ int main(void)
 {
+#ifdef BLIS_ENABLE_HPX
+    char* program = "sblat1";
+    bli_thread_initialize_hpx( 1, &program );
+#endif
+
     /* Initialized data */
 
     static real sfac = 9.765625e-4f;
@@ -123,11 +128,11 @@ static real c_b63 = 0.f;
 	combla_1.incy = 9999;
 	if (combla_1.icase == 3 || combla_1.icase == 11) {
 	    check0_(&sfac);
-	} else if (combla_1.icase == 7 || combla_1.icase == 8 || 
+	} else if (combla_1.icase == 7 || combla_1.icase == 8 ||
 		combla_1.icase == 9 || combla_1.icase == 10) {
 	    check1_(&sfac);
-	} else if (combla_1.icase == 1 || combla_1.icase == 2 || 
-		combla_1.icase == 5 || combla_1.icase == 6 || combla_1.icase 
+	} else if (combla_1.icase == 1 || combla_1.icase == 2 ||
+		combla_1.icase == 5 || combla_1.icase == 6 || combla_1.icase
 		== 12 || combla_1.icase == 13) {
 	    check2_(&sfac);
 	} else if (combla_1.icase == 4) {
@@ -142,7 +147,12 @@ static real c_b63 = 0.f;
     }
     s_stop("", (ftnlen)0);
 
-    return 0;
+#ifdef BLIS_ENABLE_HPX
+    return bli_thread_finalize_hpx();
+#else
+	// Return peacefully.
+	return 0;
+#endif
 } /* main */
 
 /* Subroutine */ int header_(void)
@@ -202,16 +212,16 @@ static real c_b63 = 0.f;
     static real dc1[8] = { .6f,.8f,-.6f,.8f,.6f,1.f,0.f,1.f };
 
     /* Builtin functions */
-    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_wsle(void);
     /* Subroutine */ int s_stop(char *, ftnlen);
 
     /* Local variables */
     integer i__, k;
     real sa, sb, sc, ss, dtemp[9];
-    extern /* Subroutine */ int srotg_(real *, real *, real *, real *), 
+    extern /* Subroutine */ int srotg_(real *, real *, real *, real *),
 	    stest_(integer *, real *, real *, real *, real *), stest1_(real *,
-	     real *, real *, real *), srotmg_(real *, real *, real *, real *, 
+	     real *, real *, real *), srotmg_(real *, real *, real *, real *,
 	    real *);
 
     /* Fortran I/O blocks */
@@ -322,7 +332,7 @@ static real c_b63 = 0.f;
     real r__1;
 
     /* Builtin functions */
-    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_wsle(void);
     /* Subroutine */ int s_stop(char *, ftnlen);
 
@@ -335,8 +345,8 @@ static real c_b63 = 0.f;
     real stemp[1];
     extern real sasum_(integer *, real *, integer *);
     real strue[8];
-    extern /* Subroutine */ int stest_(integer *, real *, real *, real *, 
-	    real *), itest1_(integer *, integer *), stest1_(real *, real *, 
+    extern /* Subroutine */ int stest_(integer *, real *, real *, real *,
+	    real *), itest1_(integer *, integer *), stest1_(real *, real *,
 	    real *, real *);
     extern integer isamax_(integer *, real *, integer *);
 
@@ -378,11 +388,11 @@ static real c_b63 = 0.f;
 		stest1_(&r__1, stemp, stemp, sfac);
 	    } else if (combla_1.icase == 9) {
 /*              .. SSCAL .. */
-		sscal_(&combla_1.n, &sa[(combla_1.incx - 1) * 5 + np1 - 1], 
+		sscal_(&combla_1.n, &sa[(combla_1.incx - 1) * 5 + np1 - 1],
 			sx, &combla_1.incx);
 		i__1 = len;
 		for (i__ = 1; i__ <= i__1; ++i__) {
-		    strue[i__ - 1] = dtrue5[i__ + (np1 + combla_1.incx * 5 << 
+		    strue[i__ - 1] = dtrue5[i__ + (np1 + combla_1.incx * 5 <<
 			    3) - 49];
 /* L40: */
 		}
@@ -455,87 +465,87 @@ static real c_b63 = 0.f;
 	    ;
     static struct {
 	real e_1[448];
-	} equiv_3 = {{ .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 
-		0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 
-		0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 
-		-.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f, 0.f, 0.f, 0.f, 0.f, 
-		0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, 0.f, 
+	} equiv_3 = {{ .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f,
+		0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f,
+		0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+		-.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f, 0.f, 0.f, 0.f, 0.f,
+		0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, 0.f,
 		0.f, 0.f, 0.f, 0.f, -.8f, 3.8f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f,
-		 2.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, -.4f, 0.f, 0.f, 0.f, 
-		0.f, 0.f, .6f, .1f, -.5f, .8f, 0.f, 0.f, 0.f, -.8f, 3.8f, 
-		-2.2f, -1.2f, 0.f, 0.f, 0.f, -.9f, 2.8f, -1.4f, -1.3f, 0.f, 
-		0.f, 0.f, 3.5f, -.4f, -2.2f, 4.7f, 0.f, 0.f, 0.f, .6f, 0.f, 
-		0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 
-		.6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 
-		0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, 0.f, 0.f, 
-		0.f, 0.f, 0.f, 0.f, -.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, 
-		0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, -.5f, 0.f, 0.f, 0.f, 
+		 2.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, -.4f, 0.f, 0.f, 0.f,
+		0.f, 0.f, .6f, .1f, -.5f, .8f, 0.f, 0.f, 0.f, -.8f, 3.8f,
+		-2.2f, -1.2f, 0.f, 0.f, 0.f, -.9f, 2.8f, -1.4f, -1.3f, 0.f,
+		0.f, 0.f, 3.5f, -.4f, -2.2f, 4.7f, 0.f, 0.f, 0.f, .6f, 0.f,
+		0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+		.6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f,
+		0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, 0.f, 0.f,
+		0.f, 0.f, 0.f, 0.f, -.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f,
+		0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, -.5f, 0.f, 0.f, 0.f,
 		0.f, 0.f, .1f, -3.f, 0.f, 0.f, 0.f, 0.f, -.3f, .1f, -2.f, 0.f,
 		 0.f, 0.f, 0.f, 3.3f, .1f, -2.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f,
-		 -.5f, .8f, .9f, -.3f, -.4f, -2.f, .1f, 1.4f, .8f, .6f, -.3f, 
-		-2.8f, -1.8f, .1f, 1.3f, .8f, 0.f, -.3f, -1.9f, 3.8f, .1f, 
-		-3.1f, .8f, 4.8f, -.3f, -1.5f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 
-		0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 
-		0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 
-		0.f, 0.f, 0.f, 0.f, 0.f, -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 
-		-.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, 
+		 -.5f, .8f, .9f, -.3f, -.4f, -2.f, .1f, 1.4f, .8f, .6f, -.3f,
+		-2.8f, -1.8f, .1f, 1.3f, .8f, 0.f, -.3f, -1.9f, 3.8f, .1f,
+		-3.1f, .8f, 4.8f, -.3f, -1.5f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f,
+		0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f,
+		0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f,
+		0.f, 0.f, 0.f, 0.f, 0.f, -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+		-.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f,
 		0.f, 0.f, .6f, .1f, -.5f, 0.f, 0.f, 0.f, 0.f, 4.8f, .1f, -3.f,
-		 0.f, 0.f, 0.f, 0.f, 3.3f, .1f, -2.f, 0.f, 0.f, 0.f, 0.f, 
+		 0.f, 0.f, 0.f, 0.f, 3.3f, .1f, -2.f, 0.f, 0.f, 0.f, 0.f,
 		2.1f, .1f, -2.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, -.5f, .8f, .9f,
 		 -.3f, -.4f, -1.6f, .1f, -2.2f, .8f, 5.4f, -.3f, -2.8f, -1.5f,
-		 .1f, -1.4f, .8f, 3.6f, -.3f, -1.9f, 3.7f, .1f, -2.2f, .8f, 
-		3.6f, -.3f, -1.5f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 
-		0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 
-		0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 
-		0.f, 0.f, 0.f, -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f, 0.f, 
-		0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 
-		.6f, .1f, 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, -1.f, 0.f, 0.f, 0.f, 
+		 .1f, -1.4f, .8f, 3.6f, -.3f, -1.9f, 3.7f, .1f, -2.2f, .8f,
+		3.6f, -.3f, -1.5f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f,
+		0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f,
+		0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f,
+		0.f, 0.f, 0.f, -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f, 0.f,
+		0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+		.6f, .1f, 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, -1.f, 0.f, 0.f, 0.f,
 		0.f, 0.f, -.9f, -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, .8f, 0.f,
 		 0.f, 0.f, 0.f, 0.f, .6f, .1f, -.5f, .8f, 0.f, 0.f, 0.f, -.8f,
-		 -1.f, 1.4f, -1.6f, 0.f, 0.f, 0.f, -.9f, -.8f, 1.3f, -1.6f, 
+		 -1.f, 1.4f, -1.6f, 0.f, 0.f, 0.f, -.9f, -.8f, 1.3f, -1.6f,
 		0.f, 0.f, 0.f, 3.5f, .8f, -3.1f, 4.8f, 0.f, 0.f, 0.f }};
 
     static struct {
 	real e_1[448];
-	} equiv_7 = {{ .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 
-		0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 
-		0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 
-		.7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.7f, 0.f, 0.f, 0.f, 0.f, 
+	} equiv_7 = {{ .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f,
+		0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f,
+		0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+		.7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.7f, 0.f, 0.f, 0.f, 0.f,
 		0.f, 0.f, -2.6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, 0.f,
-		 0.f, 0.f, 0.f, 0.f, .7f, -4.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 
-		1.7f, -.7f, 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 3.5f, 0.f, 0.f, 
+		 0.f, 0.f, 0.f, 0.f, .7f, -4.8f, 0.f, 0.f, 0.f, 0.f, 0.f,
+		1.7f, -.7f, 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 3.5f, 0.f, 0.f,
 		0.f, 0.f, 0.f, .5f, -.9f, .3f, .7f, 0.f, 0.f, 0.f, .7f, -4.8f,
-		 3.f, 1.1f, 0.f, 0.f, 0.f, 1.7f, -.7f, -.7f, 2.3f, 0.f, 0.f, 
-		0.f, -2.6f, 3.5f, -.7f, -3.6f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 
-		0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 
-		0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 
-		0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, 
+		 3.f, 1.1f, 0.f, 0.f, 0.f, 1.7f, -.7f, -.7f, 2.3f, 0.f, 0.f,
+		0.f, -2.6f, 3.5f, -.7f, -3.6f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f,
+		0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f,
+		0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f,
+		0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f,
 		0.f, 0.f, 0.f, 1.7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 0.f,
-		 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, 0.f, 0.f, 0.f, 0.f, 
-		4.f, -.9f, -.3f, 0.f, 0.f, 0.f, 0.f, -.5f, -.9f, 1.5f, 0.f, 
-		0.f, 0.f, 0.f, -1.5f, -.9f, -1.8f, 0.f, 0.f, 0.f, 0.f, .5f, 
+		 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, 0.f, 0.f, 0.f, 0.f,
+		4.f, -.9f, -.3f, 0.f, 0.f, 0.f, 0.f, -.5f, -.9f, 1.5f, 0.f,
+		0.f, 0.f, 0.f, -1.5f, -.9f, -1.8f, 0.f, 0.f, 0.f, 0.f, .5f,
 		-.9f, .3f, .7f, -.6f, .2f, .8f, 3.7f, -.9f, -1.2f, .7f, -1.5f,
-		 .2f, 2.2f, -.3f, -.9f, 2.1f, .7f, -1.6f, .2f, 2.f, -1.6f, 
-		-.9f, -2.1f, .7f, 2.9f, .2f, -3.8f, .5f, 0.f, 0.f, 0.f, 0.f, 
-		0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 
-		0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 
-		0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, 0.f, 0.f, 
+		 .2f, 2.2f, -.3f, -.9f, 2.1f, .7f, -1.6f, .2f, 2.f, -1.6f,
+		-.9f, -2.1f, .7f, 2.9f, .2f, -3.8f, .5f, 0.f, 0.f, 0.f, 0.f,
+		0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f,
+		0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f,
+		0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, 0.f, 0.f,
 		0.f, 1.7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 0.f, 0.f, 0.f,
-		 0.f, 0.f, 0.f, .5f, -.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 4.f, 
+		 0.f, 0.f, 0.f, .5f, -.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 4.f,
 		-6.3f, 0.f, 0.f, 0.f, 0.f, 0.f, -.5f, .3f, 0.f, 0.f, 0.f, 0.f,
-		 0.f, -1.5f, 3.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, 
-		.7f, 0.f, 0.f, 0.f, 3.7f, -7.2f, 3.f, 1.7f, 0.f, 0.f, 0.f, 
-		-.3f, .9f, -.7f, 1.9f, 0.f, 0.f, 0.f, -1.6f, 2.7f, -.7f, 
-		-3.4f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 
-		0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 
-		0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 
-		0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.7f, 0.f, 
-		0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 
+		 0.f, -1.5f, 3.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f,
+		.7f, 0.f, 0.f, 0.f, 3.7f, -7.2f, 3.f, 1.7f, 0.f, 0.f, 0.f,
+		-.3f, .9f, -.7f, 1.9f, 0.f, 0.f, 0.f, -1.6f, 2.7f, -.7f,
+		-3.4f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f,
+		0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f,
+		0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f,
+		0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.7f, 0.f,
+		0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
 		.5f, -.9f, .3f, 0.f, 0.f, 0.f, 0.f, .7f, -.9f, 1.2f, 0.f, 0.f,
-		 0.f, 0.f, 1.7f, -.9f, .5f, 0.f, 0.f, 0.f, 0.f, -2.6f, -.9f, 
-		-1.3f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, .7f, -.6f, .2f, 
-		.8f, .7f, -.9f, 1.2f, .7f, -1.5f, .2f, 1.6f, 1.7f, -.9f, .5f, 
-		.7f, -1.6f, .2f, 2.4f, -2.6f, -.9f, -1.3f, .7f, 2.9f, .2f, 
+		 0.f, 0.f, 1.7f, -.9f, .5f, 0.f, 0.f, 0.f, 0.f, -2.6f, -.9f,
+		-1.3f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, .7f, -.6f, .2f,
+		.8f, .7f, -.9f, 1.2f, .7f, -1.5f, .2f, 1.6f, 1.7f, -.9f, .5f,
+		.7f, -1.6f, .2f, 2.4f, -2.6f, -.9f, -1.3f, .7f, 2.9f, .2f,
 		-4.f }};
 
 
@@ -544,7 +554,7 @@ static real c_b63 = 0.f;
     real r__1;
 
     /* Builtin functions */
-    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_wsle(void);
     /* Subroutine */ int s_stop(char *, ftnlen);
 
@@ -568,13 +578,13 @@ static real c_b63 = 0.f;
 #define dt19yd ((real *)&equiv_7 + 336)
     integer ksize;
     real ssize[7];
-    extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, 
+    extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
 	    integer *), sswap_(integer *, real *, integer *, real *, integer *
 	    ), stest_(integer *, real *, real *, real *, real *), saxpy_(
 	    integer *, real *, real *, integer *, real *, integer *), srotm_(
 	    integer *, real *, integer *, real *, integer *, real *), stest1_(
 	    real *, real *, real *, real *);
-    extern real sdsdot_(integer *, real *, real *, integer *, real *, integer 
+    extern real sdsdot_(integer *, real *, real *, integer *, real *, integer
 	    *);
 
     /* Fortran I/O blocks */
@@ -627,7 +637,7 @@ static real c_b63 = 0.f;
 /*              .. SDOT .. */
 		r__1 = sdot_(&combla_1.n, sx, &combla_1.incx, sy, &
 			combla_1.incy);
-		stest1_(&r__1, &dt7[kn + (ki << 2) - 5], &ssize1[kn - 1], 
+		stest1_(&r__1, &dt7[kn + (ki << 2) - 5], &ssize1[kn - 1],
 			sfac);
 	    } else if (combla_1.icase == 2) {
 /*              .. SAXPY .. */
@@ -664,9 +674,9 @@ static real c_b63 = 0.f;
 		    for (i__ = 1; i__ <= 7; ++i__) {
 			sx[i__ - 1] = dx1[i__ - 1];
 			sy[i__ - 1] = dy1[i__ - 1];
-			stx[i__ - 1] = dt19x[i__ + (kpar + (kni << 2)) * 7 - 
+			stx[i__ - 1] = dt19x[i__ + (kpar + (kni << 2)) * 7 -
 				36];
-			sty[i__ - 1] = dt19y[i__ + (kpar + (kni << 2)) * 7 - 
+			sty[i__ - 1] = dt19y[i__ + (kpar + (kni << 2)) * 7 -
 				36];
 		    }
 
@@ -696,7 +706,7 @@ static real c_b63 = 0.f;
 /*              .. SDSROT .. */
 		r__1 = sdsdot_(&combla_1.n, &c_b39, sx, &combla_1.incx, sy, &
 			combla_1.incy);
-		stest1_(&r__1, &st7b[kn + (ki << 2) - 5], &ssize3[kn - 1], 
+		stest1_(&r__1, &st7b[kn + (ki << 2) - 5], &ssize3[kn - 1],
 			sfac);
 	    } else {
 		s_wsle(&io___80);
@@ -759,7 +769,7 @@ static real c_b63 = 0.f;
 	    1.17f,1.17f,1.17f,1.17f,1.17f,1.17f,1.17f,1.17f,1.17f };
 
     /* Builtin functions */
-    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_wsle(void);
     /* Subroutine */ int s_stop(char *, ftnlen);
 
@@ -770,12 +780,12 @@ static real c_b63 = 0.f;
     real mwpc[11];
     integer mwpn[11];
     real mwps[11];
-    extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, 
+    extern /* Subroutine */ int srot_(integer *, real *, integer *, real *,
 	    integer *, real *, real *);
     real mwpx[5], mwpy[5];
     integer ksize;
     real copyx[5], copyy[5];
-    extern /* Subroutine */ int stest_(integer *, real *, real *, real *, 
+    extern /* Subroutine */ int stest_(integer *, real *, real *, real *,
 	    real *);
     real mwptx[55]	/* was [11][5] */, mwpty[55]	/* was [11][5] */;
     integer mwpinx[11], mwpiny[11];
@@ -1032,7 +1042,7 @@ static real c_b63 = 0.f;
 	sfac)
 {
     real scomp[1], strue[1];
-    extern /* Subroutine */ int stest_(integer *, real *, real *, real *, 
+    extern /* Subroutine */ int stest_(integer *, real *, real *, real *,
 	    real *);
 
 /*     ************************* STEST1 ***************************** */
diff --git a/blastest/src/sblat2.c b/blastest/src/sblat2.c
index 54d0a010a..a2ce310f6 100644
--- a/blastest/src/sblat2.c
+++ b/blastest/src/sblat2.c
@@ -155,10 +155,15 @@ static logical c_false = FALSE_;
 /*  ===================================================================== */
 /* Main program */ int main(void)
 {
+#ifdef BLIS_ENABLE_HPX
+    char* program = "sblat2";
+    bli_thread_initialize_hpx( 1, &program );
+#endif
+
     /* Initialized data */
 
-    static char snames[6*16] = "SGEMV " "SGBMV " "SSYMV " "SSBMV " "SSPMV " 
-	    "STRMV " "STBMV " "STPMV " "STRSV " "STBSV " "STPSV " "SGER  " 
+    static char snames[6*16] = "SGEMV " "SGBMV " "SSYMV " "SSBMV " "SSPMV "
+	    "STRMV " "STBMV " "STPMV " "STRSV " "STBSV " "STPSV " "SGER  "
 	    "SSYR  " "SSPR  " "SSYR2 " "SSPR2 ";
 
     /* Format strings */
@@ -204,10 +209,10 @@ static logical c_false = FALSE_;
     cllist cl__1;
 
     /* Builtin functions */
-    integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *,
-	     char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), 
-	    s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, 
+	     char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void),
+	    s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen,
 	    ftnlen);
     /* Subroutine */ int s_stop(char *, ftnlen);
     integer f_clos(cllist *);
@@ -228,40 +233,40 @@ static logical c_false = FALSE_;
     integer ninc, nbet, ntra;
     logical rewi;
     integer nout;
-    extern /* Subroutine */ int schk1_(char *, real *, real *, integer *, 
-	    integer *, logical *, logical *, logical *, integer *, integer *, 
-	    integer *, integer *, integer *, real *, integer *, real *, 
+    extern /* Subroutine */ int schk1_(char *, real *, real *, integer *,
+	    integer *, logical *, logical *, logical *, integer *, integer *,
+	    integer *, integer *, integer *, real *, integer *, real *,
 	    integer *, integer *, integer *, integer *, real *, real *, real *
-	    , real *, real *, real *, real *, real *, real *, real *, real *, 
-	    ftnlen), schk2_(char *, real *, real *, integer *, integer *, 
-	    logical *, logical *, logical *, integer *, integer *, integer *, 
-	    integer *, integer *, real *, integer *, real *, integer *, 
-	    integer *, integer *, integer *, real *, real *, real *, real *, 
-	    real *, real *, real *, real *, real *, real *, real *, ftnlen), 
-	    schk3_(char *, real *, real *, integer *, integer *, logical *, 
-	    logical *, logical *, integer *, integer *, integer *, integer *, 
+	    , real *, real *, real *, real *, real *, real *, real *, real *,
+	    ftnlen), schk2_(char *, real *, real *, integer *, integer *,
+	    logical *, logical *, logical *, integer *, integer *, integer *,
+	    integer *, integer *, real *, integer *, real *, integer *,
+	    integer *, integer *, integer *, real *, real *, real *, real *,
+	    real *, real *, real *, real *, real *, real *, real *, ftnlen),
+	    schk3_(char *, real *, real *, integer *, integer *, logical *,
+	    logical *, logical *, integer *, integer *, integer *, integer *,
 	    integer *, integer *, integer *, integer *, real *, real *, real *
 	    , real *, real *, real *, real *, real *, real *, ftnlen), schk4_(
 	    char *, real *, real *, integer *, integer *, logical *, logical *
-	    , logical *, integer *, integer *, integer *, real *, integer *, 
-	    integer *, integer *, integer *, real *, real *, real *, real *, 
-	    real *, real *, real *, real *, real *, real *, real *, real *, 
-	    ftnlen), schk5_(char *, real *, real *, integer *, integer *, 
-	    logical *, logical *, logical *, integer *, integer *, integer *, 
+	    , logical *, integer *, integer *, integer *, real *, integer *,
+	    integer *, integer *, integer *, real *, real *, real *, real *,
+	    real *, real *, real *, real *, real *, real *, real *, real *,
+	    ftnlen), schk5_(char *, real *, real *, integer *, integer *,
+	    logical *, logical *, logical *, integer *, integer *, integer *,
 	    real *, integer *, integer *, integer *, integer *, real *, real *
-	    , real *, real *, real *, real *, real *, real *, real *, real *, 
+	    , real *, real *, real *, real *, real *, real *, real *, real *,
 	    real *, real *, ftnlen), schk6_(char *, real *, real *, integer *,
 	     integer *, logical *, logical *, logical *, integer *, integer *,
-	     integer *, real *, integer *, integer *, integer *, integer *, 
-	    real *, real *, real *, real *, real *, real *, real *, real *, 
+	     integer *, real *, integer *, integer *, integer *, integer *,
+	    real *, real *, real *, real *, real *, real *, real *, real *,
 	    real *, real *, real *, real *, ftnlen);
     logical fatal;
     extern /* Subroutine */ int schke_(integer *, char *, integer *, ftnlen);
     logical trace;
     integer nidim;
-    extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, 
-	    real *, integer *, real *, integer *, real *, real *, integer *, 
-	    real *, real *, real *, real *, real *, logical *, integer *, 
+    extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *,
+	    real *, integer *, real *, integer *, real *, real *, integer *,
+	    real *, real *, real *, real *, real *, logical *, integer *,
 	    logical *, ftnlen);
     char snaps[32], trans[1];
     integer isnum;
@@ -610,7 +615,7 @@ static logical c_false = FALSE_;
 	goto L80;
     }
     for (i__ = 1; i__ <= 16; ++i__) {
-	if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) 
+	if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0)
 		{
 	    goto L70;
 	}
@@ -737,44 +742,44 @@ static logical c_false = FALSE_;
 /*           Test SGEMV, 01, and SGBMV, 02. */
 L140:
 	    schk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, 
-		    &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, 
+		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf,
+		    &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx,
 		    xs, y, yy, ys, yt, g, (ftnlen)6);
 	    goto L200;
 /*           Test SSYMV, 03, SSBMV, 04, and SSPMV, 05. */
 L150:
 	    schk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, 
-		    &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, 
+		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf,
+		    &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx,
 		    xs, y, yy, ys, yt, g, (ftnlen)6);
 	    goto L200;
 /*           Test STRMV, 06, STBMV, 07, STPMV, 08, */
 /*           STRSV, 09, STBSV, 10, and STPSV, 11. */
 L160:
 	    schk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, 
+		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc,
 		    &c__65, &c__2, a, aa, as, y, yy, ys, yt, g, z__, (ftnlen)
 		    6);
 	    goto L200;
 /*           Test SGER, 12. */
 L170:
 	    schk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, 
-		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc,
+		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt,
 		    g, z__, (ftnlen)6);
 	    goto L200;
 /*           Test SSYR, 13, and SSPR, 14. */
 L180:
 	    schk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, 
-		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc,
+		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt,
 		    g, z__, (ftnlen)6);
 	    goto L200;
 /*           Test SSYR2, 15, and SSPR2, 16. */
 L190:
 	    schk6_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, 
-		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc,
+		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt,
 		    g, z__, (ftnlen)6);
 
 L200:
@@ -816,15 +821,20 @@ static logical c_false = FALSE_;
 
 /*     End of SBLAT2. */
 
-    return 0;
+#ifdef BLIS_ENABLE_HPX
+    return bli_thread_finalize_hpx();
+#else
+	// Return peacefully.
+	return 0;
+#endif
 } /* main */
 
 /* Subroutine */ int schk1_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nkb, integer *kb, integer *
 	nalf, real *alf, integer *nbet, real *bet, integer *ninc, integer *
-	inc, integer *nmax, integer *incmax, real *a, real *aa, real *as, 
-	real *x, real *xx, real *xs, real *y, real *yy, real *ys, real *yt, 
+	inc, integer *nmax, integer *incmax, real *a, real *aa, real *as,
+	real *x, real *xx, real *xs, real *y, real *yy, real *ys, real *yt,
 	real *g, ftnlen sname_len)
 {
     /* Initialized data */
@@ -872,24 +882,24 @@ static logical c_false = FALSE_;
     logical full, tran, null;
     real alpha;
     logical isame[13];
-    extern /* Subroutine */ int smake_(char *, char *, char *, integer *, 
-	    integer *, real *, integer *, real *, integer *, integer *, 
+    extern /* Subroutine */ int smake_(char *, char *, char *, integer *,
+	    integer *, real *, integer *, real *, integer *, integer *,
 	    integer *, logical *, real *, ftnlen, ftnlen, ftnlen);
     integer nargs;
     extern /* Subroutine */ int sgbmv_(char *, integer *, integer *, integer *
 	    , integer *, real *, real *, integer *, real *, integer *, real *,
-	     real *, integer *, ftnlen), smvch_(char *, integer *, integer *, 
-	    real *, real *, integer *, real *, integer *, real *, real *, 
-	    integer *, real *, real *, real *, real *, real *, logical *, 
+	     real *, integer *, ftnlen), smvch_(char *, integer *, integer *,
+	    real *, real *, integer *, real *, integer *, real *, real *,
+	    integer *, real *, real *, real *, real *, real *, logical *,
 	    integer *, logical *, ftnlen), sgemv_(char *, integer *, integer *
-	    , real *, real *, integer *, real *, integer *, real *, real *, 
+	    , real *, real *, integer *, real *, integer *, real *, real *,
 	    integer *, ftnlen);
     logical reset;
     integer incxs, incys;
     char trans[1];
     logical banded;
     real errmax;
-    extern logical lseres_(char *, char *, integer *, integer *, real *, real 
+    extern logical lseres_(char *, char *, integer *, integer *, real *, real
 	    *, integer *, ftnlen, ftnlen);
     real transl;
     char transs[1];
@@ -1066,9 +1076,9 @@ static logical c_false = FALSE_;
 				    transl = 0.f;
 				    i__7 = abs(incy);
 				    i__8 = ml - 1;
-				    smake_("GE", " ", " ", &c__1, &ml, &y[1], 
+				    smake_("GE", " ", " ", &c__1, &ml, &y[1],
 					    &c__1, &yy[1], &i__7, &c__0, &
-					    i__8, &reset, &transl, (ftnlen)2, 
+					    i__8, &reset, &transl, (ftnlen)2,
 					    (ftnlen)1, (ftnlen)1);
 
 				    ++nc;
@@ -1076,7 +1086,7 @@ static logical c_false = FALSE_;
 /*                             Save every datum before calling the */
 /*                             subroutine. */
 
-				    *(unsigned char *)transs = *(unsigned 
+				    *(unsigned char *)transs = *(unsigned
 					    char *)trans;
 				    ms = m;
 				    ns = n;
@@ -1134,7 +1144,7 @@ static logical c_false = FALSE_;
 					    al__1.aunit = *ntra;
 					    f_rew(&al__1);
 					}
-					sgemv_(trans, &m, &n, &alpha, &aa[1], 
+					sgemv_(trans, &m, &n, &alpha, &aa[1],
 						&lda, &xx[1], &incx, &beta, &
 						yy[1], &incy, (ftnlen)1);
 				    } else if (banded) {
@@ -1259,8 +1269,8 @@ static logical c_false = FALSE_;
 
 					smvch_(trans, &m, &n, &alpha, &a[
 						a_offset], nmax, &x[1], &incx,
-						 &beta, &y[1], &incy, &yt[1], 
-						&g[1], &yy[1], eps, &err, 
+						 &beta, &y[1], &incy, &yt[1],
+						&g[1], &yy[1], eps, &err,
 						fatal, nout, &c_true, (ftnlen)
 						1);
 					errmax = max(errmax,err);
@@ -1365,11 +1375,11 @@ static logical c_false = FALSE_;
 } /* schk1_ */
 
 /* Subroutine */ int schk2_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nkb, integer *kb, integer *
 	nalf, real *alf, integer *nbet, real *bet, integer *ninc, integer *
-	inc, integer *nmax, integer *incmax, real *a, real *aa, real *as, 
-	real *x, real *xx, real *xs, real *y, real *yy, real *ys, real *yt, 
+	inc, integer *nmax, integer *incmax, real *a, real *aa, real *as,
+	real *x, real *xx, real *xs, real *y, real *yy, real *ys, real *yt,
 	real *g, ftnlen sname_len)
 {
     /* Initialized data */
@@ -1407,7 +1417,7 @@ static logical c_false = FALSE_;
 	     f_rew(alist *);
 
     /* Local variables */
-    integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, 
+    integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly,
 	    laa, lda;
     real als, bls;
     extern logical lse_(real *, real *, integer *);
@@ -1419,27 +1429,27 @@ static logical c_false = FALSE_;
     char uplo[1];
     real alpha;
     logical isame[13];
-    extern /* Subroutine */ int smake_(char *, char *, char *, integer *, 
-	    integer *, real *, integer *, real *, integer *, integer *, 
+    extern /* Subroutine */ int smake_(char *, char *, char *, integer *,
+	    integer *, real *, integer *, real *, integer *, integer *,
 	    integer *, logical *, real *, ftnlen, ftnlen, ftnlen);
     integer nargs;
-    extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, 
-	    real *, integer *, real *, integer *, real *, real *, integer *, 
-	    real *, real *, real *, real *, real *, logical *, integer *, 
+    extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *,
+	    real *, integer *, real *, integer *, real *, real *, integer *,
+	    real *, real *, real *, real *, real *, logical *, integer *,
 	    logical *, ftnlen);
     logical reset;
     integer incxs, incys;
-    extern /* Subroutine */ int ssbmv_(char *, integer *, integer *, real *, 
-	    real *, integer *, real *, integer *, real *, real *, integer *, 
+    extern /* Subroutine */ int ssbmv_(char *, integer *, integer *, real *,
+	    real *, integer *, real *, integer *, real *, real *, integer *,
 	    ftnlen);
     char uplos[1];
-    extern /* Subroutine */ int sspmv_(char *, integer *, real *, real *, 
+    extern /* Subroutine */ int sspmv_(char *, integer *, real *, real *,
 	    real *, integer *, real *, real *, integer *, ftnlen), ssymv_(
-	    char *, integer *, real *, real *, integer *, real *, integer *, 
+	    char *, integer *, real *, real *, integer *, real *, integer *,
 	    real *, real *, integer *, ftnlen);
     logical banded, packed;
     real errmax;
-    extern logical lseres_(char *, char *, integer *, integer *, real *, real 
+    extern logical lseres_(char *, char *, integer *, integer *, real *, real
 	    *, integer *, ftnlen, ftnlen);
     real transl;
 
@@ -1599,7 +1609,7 @@ static logical c_false = FALSE_;
 				i__8 = n - 1;
 				smake_("GE", " ", " ", &c__1, &n, &y[1], &
 					c__1, &yy[1], &i__7, &c__0, &i__8, &
-					reset, &transl, (ftnlen)2, (ftnlen)1, 
+					reset, &transl, (ftnlen)2, (ftnlen)1,
 					(ftnlen)1);
 
 				++nc;
@@ -1816,8 +1826,8 @@ static logical c_false = FALSE_;
 
 /*                             Check the result. */
 
-				    smvch_("N", &n, &n, &alpha, &a[a_offset], 
-					    nmax, &x[1], &incx, &beta, &y[1], 
+				    smvch_("N", &n, &n, &alpha, &a[a_offset],
+					    nmax, &x[1], &incx, &beta, &y[1],
 					    &incy, &yt[1], &g[1], &yy[1], eps,
 					     &err, fatal, nout, &c_true, (
 					    ftnlen)1);
@@ -1928,10 +1938,10 @@ static logical c_false = FALSE_;
 } /* schk2_ */
 
 /* Subroutine */ int schk3_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nkb, integer *kb, integer *
 	ninc, integer *inc, integer *nmax, integer *incmax, real *a, real *aa,
-	 real *as, real *x, real *xx, real *xs, real *xt, real *g, real *z__, 
+	 real *as, real *x, real *xx, real *xs, real *xt, real *g, real *z__,
 	ftnlen sname_len)
 {
     /* Initialized data */
@@ -1971,7 +1981,7 @@ static logical c_false = FALSE_;
 	    integer *, char *, ftnlen), e_wsfe(void), f_rew(alist *);
 
     /* Local variables */
-    integer i__, k, n, nc, ik, in, nk, ks, ix, ns, lx, laa, icd, lda, ict, 
+    integer i__, k, n, nc, ik, in, nk, ks, ix, ns, lx, laa, icd, lda, ict,
 	    icu;
     extern logical lse_(real *, real *, integer *);
     real err;
@@ -1982,32 +1992,32 @@ static logical c_false = FALSE_;
     logical full, null;
     char uplo[1], diags[1];
     logical isame[13];
-    extern /* Subroutine */ int smake_(char *, char *, char *, integer *, 
-	    integer *, real *, integer *, real *, integer *, integer *, 
+    extern /* Subroutine */ int smake_(char *, char *, char *, integer *,
+	    integer *, real *, integer *, real *, integer *, integer *,
 	    integer *, logical *, real *, ftnlen, ftnlen, ftnlen);
     integer nargs;
-    extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, 
-	    real *, integer *, real *, integer *, real *, real *, integer *, 
-	    real *, real *, real *, real *, real *, logical *, integer *, 
+    extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *,
+	    real *, integer *, real *, integer *, real *, real *, integer *,
+	    real *, real *, real *, real *, real *, logical *, integer *,
 	    logical *, ftnlen);
     logical reset;
     integer incxs;
     char trans[1];
-    extern /* Subroutine */ int stbmv_(char *, char *, char *, integer *, 
-	    integer *, real *, integer *, real *, integer *, ftnlen, ftnlen, 
-	    ftnlen), stbsv_(char *, char *, char *, integer *, integer *, 
+    extern /* Subroutine */ int stbmv_(char *, char *, char *, integer *,
+	    integer *, real *, integer *, real *, integer *, ftnlen, ftnlen,
+	    ftnlen), stbsv_(char *, char *, char *, integer *, integer *,
 	    real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen);
     char uplos[1];
-    extern /* Subroutine */ int stpmv_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int stpmv_(char *, char *, char *, integer *,
 	    real *, real *, integer *, ftnlen, ftnlen, ftnlen), strmv_(char *,
-	     char *, char *, integer *, real *, integer *, real *, integer *, 
+	     char *, char *, integer *, real *, integer *, real *, integer *,
 	    ftnlen, ftnlen, ftnlen), stpsv_(char *, char *, char *, integer *,
 	     real *, real *, integer *, ftnlen, ftnlen, ftnlen), strsv_(char *
 	    , char *, char *, integer *, real *, integer *, real *, integer *,
 	     ftnlen, ftnlen, ftnlen);
     logical banded, packed;
     real errmax;
-    extern logical lseres_(char *, char *, integer *, integer *, real *, real 
+    extern logical lseres_(char *, char *, integer *, integer *, real *, real
 	    *, integer *, ftnlen, ftnlen);
     real transl;
     char transs[1];
@@ -2133,13 +2143,13 @@ static logical c_false = FALSE_;
 			    ;
 
 		    for (icd = 1; icd <= 2; ++icd) {
-			*(unsigned char *)diag = *(unsigned char *)&ichd[icd 
+			*(unsigned char *)diag = *(unsigned char *)&ichd[icd
 				- 1];
 
 /*                    Generate the matrix A. */
 
 			transl = 0.f;
-			smake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], 
+			smake_(sname + 1, uplo, diag, &n, &n, &a[a_offset],
 				nmax, &aa[1], &lda, &k, &k, &reset, &transl, (
 				ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -2186,7 +2196,7 @@ static logical c_false = FALSE_;
 
 /*                       Call the subroutine. */
 
-			    if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) 
+			    if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2)
 				    == 0) {
 				if (full) {
 				    if (*trace) {
@@ -2239,7 +2249,7 @@ static logical c_false = FALSE_;
 					al__1.aunit = *ntra;
 					f_rew(&al__1);
 				    }
-				    stbmv_(uplo, trans, diag, &n, &k, &aa[1], 
+				    stbmv_(uplo, trans, diag, &n, &k, &aa[1],
 					    &lda, &xx[1], &incx, (ftnlen)1, (
 					    ftnlen)1, (ftnlen)1);
 				} else if (packed) {
@@ -2320,7 +2330,7 @@ static logical c_false = FALSE_;
 					al__1.aunit = *ntra;
 					f_rew(&al__1);
 				    }
-				    stbsv_(uplo, trans, diag, &n, &k, &aa[1], 
+				    stbsv_(uplo, trans, diag, &n, &k, &aa[1],
 					    &lda, &xx[1], &incx, (ftnlen)1, (
 					    ftnlen)1, (ftnlen)1);
 				} else if (packed) {
@@ -2362,11 +2372,11 @@ static logical c_false = FALSE_;
 
 /*                       See what data changed inside subroutines. */
 
-			    isame[0] = *(unsigned char *)uplo == *(unsigned 
+			    isame[0] = *(unsigned char *)uplo == *(unsigned
 				    char *)uplos;
-			    isame[1] = *(unsigned char *)trans == *(unsigned 
+			    isame[1] = *(unsigned char *)trans == *(unsigned
 				    char *)transs;
-			    isame[2] = *(unsigned char *)diag == *(unsigned 
+			    isame[2] = *(unsigned char *)diag == *(unsigned
 				    char *)diags;
 			    isame[3] = ns == n;
 			    if (full) {
@@ -2437,7 +2447,7 @@ static logical c_false = FALSE_;
 				    smvch_(trans, &n, &n, &c_b128, &a[
 					    a_offset], nmax, &x[1], &incx, &
 					    c_b120, &z__[1], &incx, &xt[1], &
-					    g[1], &xx[1], eps, &err, fatal, 
+					    g[1], &xx[1], eps, &err, fatal,
 					    nout, &c_true, (ftnlen)1);
 				} else if (s_cmp(sname + 3, "SV", (ftnlen)2, (
 					ftnlen)2) == 0) {
@@ -2446,7 +2456,7 @@ static logical c_false = FALSE_;
 
 				    i__4 = n;
 				    for (i__ = 1; i__ <= i__4; ++i__) {
-					z__[i__] = xx[(i__ - 1) * abs(incx) + 
+					z__[i__] = xx[(i__ - 1) * abs(incx) +
 						1];
 					xx[(i__ - 1) * abs(incx) + 1] = x[i__]
 						;
@@ -2455,7 +2465,7 @@ static logical c_false = FALSE_;
 				    smvch_(trans, &n, &n, &c_b128, &a[
 					    a_offset], nmax, &z__[1], &incx, &
 					    c_b120, &x[1], &incx, &xt[1], &g[
-					    1], &xx[1], eps, &err, fatal, 
+					    1], &xx[1], eps, &err, fatal,
 					    nout, &c_false, (ftnlen)1);
 				}
 				errmax = max(errmax,err);
@@ -2558,10 +2568,10 @@ static logical c_false = FALSE_;
 } /* schk3_ */
 
 /* Subroutine */ int schk4_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nalf, real *alf, integer *
 	ninc, integer *inc, integer *nmax, integer *incmax, real *a, real *aa,
-	 real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys, 
+	 real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys,
 	real *yt, real *g, real *z__, ftnlen sname_len)
 {
     /* Format strings */
@@ -2599,24 +2609,24 @@ static logical c_false = FALSE_;
     real err;
     integer ldas;
     logical same;
-    extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, 
+    extern /* Subroutine */ int sger_(integer *, integer *, real *, real *,
 	    integer *, real *, integer *, real *, integer *);
     integer incx, incy;
     logical null;
     real alpha;
     logical isame[13];
-    extern /* Subroutine */ int smake_(char *, char *, char *, integer *, 
-	    integer *, real *, integer *, real *, integer *, integer *, 
+    extern /* Subroutine */ int smake_(char *, char *, char *, integer *,
+	    integer *, real *, integer *, real *, integer *, integer *,
 	    integer *, logical *, real *, ftnlen, ftnlen, ftnlen);
     integer nargs;
-    extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, 
-	    real *, integer *, real *, integer *, real *, real *, integer *, 
-	    real *, real *, real *, real *, real *, logical *, integer *, 
+    extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *,
+	    real *, integer *, real *, integer *, real *, real *, integer *,
+	    real *, real *, real *, real *, real *, logical *, integer *,
 	    logical *, ftnlen);
     logical reset;
     integer incxs, incys;
     real errmax;
-    extern logical lseres_(char *, char *, integer *, integer *, real *, real 
+    extern logical lseres_(char *, char *, integer *, integer *, real *, real
 	    *, integer *, ftnlen, ftnlen);
     real transl;
 
@@ -2718,7 +2728,7 @@ static logical c_false = FALSE_;
 		i__3 = abs(incx);
 		i__4 = m - 1;
 		smake_("GE", " ", " ", &c__1, &m, &x[1], &c__1, &xx[1], &i__3,
-			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, 
+			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1,
 			(ftnlen)1);
 		if (m > 1) {
 		    x[m / 2] = 0.f;
@@ -2752,7 +2762,7 @@ static logical c_false = FALSE_;
 			transl = 0.f;
 			i__5 = m - 1;
 			i__6 = n - 1;
-			smake_(sname + 1, " ", " ", &m, &n, &a[a_offset], 
+			smake_(sname + 1, " ", " ", &m, &n, &a[a_offset],
 				nmax, &aa[1], &lda, &i__5, &i__6, &reset, &
 				transl, (ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -2883,9 +2893,9 @@ static logical c_false = FALSE_;
 				} else {
 				    w[0] = y[n - j + 1];
 				}
-				smvch_("N", &m, &c__1, &alpha, &z__[1], nmax, 
+				smvch_("N", &m, &c__1, &alpha, &z__[1], nmax,
 					w, &c__1, &c_b128, &a[j * a_dim1 + 1],
-					 &c__1, &yt[1], &g[1], &aa[(j - 1) * 
+					 &c__1, &yt[1], &g[1], &aa[(j - 1) *
 					lda + 1], eps, &err, fatal, nout, &
 					c_true, (ftnlen)1);
 				errmax = max(errmax,err);
@@ -2966,10 +2976,10 @@ static logical c_false = FALSE_;
 } /* schk4_ */
 
 /* Subroutine */ int schk5_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nalf, real *alf, integer *
 	ninc, integer *inc, integer *nmax, integer *incmax, real *a, real *aa,
-	 real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys, 
+	 real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys,
 	real *yt, real *g, real *z__, ftnlen sname_len)
 {
     /* Initialized data */
@@ -3017,18 +3027,18 @@ static logical c_false = FALSE_;
     integer incx;
     logical full, null;
     char uplo[1];
-    extern /* Subroutine */ int sspr_(char *, integer *, real *, real *, 
-	    integer *, real *, ftnlen), ssyr_(char *, integer *, real *, real 
+    extern /* Subroutine */ int sspr_(char *, integer *, real *, real *,
+	    integer *, real *, ftnlen), ssyr_(char *, integer *, real *, real
 	    *, integer *, real *, integer *, ftnlen);
     real alpha;
     logical isame[13];
-    extern /* Subroutine */ int smake_(char *, char *, char *, integer *, 
-	    integer *, real *, integer *, real *, integer *, integer *, 
+    extern /* Subroutine */ int smake_(char *, char *, char *, integer *,
+	    integer *, real *, integer *, real *, integer *, integer *,
 	    integer *, logical *, real *, ftnlen, ftnlen, ftnlen);
     integer nargs;
-    extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, 
-	    real *, integer *, real *, integer *, real *, real *, integer *, 
-	    real *, real *, real *, real *, real *, logical *, integer *, 
+    extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *,
+	    real *, integer *, real *, integer *, real *, real *, integer *,
+	    real *, real *, real *, real *, real *, logical *, integer *,
 	    logical *, ftnlen);
     logical reset;
     integer incxs;
@@ -3036,7 +3046,7 @@ static logical c_false = FALSE_;
     char uplos[1];
     logical packed;
     real errmax;
-    extern logical lseres_(char *, char *, integer *, integer *, real *, real 
+    extern logical lseres_(char *, char *, integer *, integer *, real *, real
 	    *, integer *, ftnlen, ftnlen);
     real transl;
 
@@ -3140,7 +3150,7 @@ static logical c_false = FALSE_;
 		i__3 = abs(incx);
 		i__4 = n - 1;
 		smake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3,
-			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, 
+			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1,
 			(ftnlen)1);
 		if (n > 1) {
 		    x[n / 2] = 0.f;
@@ -3309,9 +3319,9 @@ static logical c_false = FALSE_;
 				jj = j;
 				lj = n - j + 1;
 			    }
-			    smvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, 
+			    smvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w,
 				    &c__1, &c_b128, &a[jj + j * a_dim1], &
-				    c__1, &yt[1], &g[1], &aa[ja], eps, &err, 
+				    c__1, &yt[1], &g[1], &aa[ja], eps, &err,
 				    fatal, nout, &c_true, (ftnlen)1);
 			    if (full) {
 				if (upper) {
@@ -3410,10 +3420,10 @@ static logical c_false = FALSE_;
 } /* schk5_ */
 
 /* Subroutine */ int schk6_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nalf, real *alf, integer *
 	ninc, integer *inc, integer *nmax, integer *incmax, real *a, real *aa,
-	 real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys, 
+	 real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys,
 	real *yt, real *g, real *z__, ftnlen sname_len)
 {
     /* Initialized data */
@@ -3442,7 +3452,7 @@ static logical c_false = FALSE_;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, 
+    integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5,
 	    i__6;
     alist al__1;
 
@@ -3462,19 +3472,19 @@ static logical c_false = FALSE_;
     integer incx, incy;
     logical full, null;
     char uplo[1];
-    extern /* Subroutine */ int sspr2_(char *, integer *, real *, real *, 
-	    integer *, real *, integer *, real *, ftnlen), ssyr2_(char *, 
-	    integer *, real *, real *, integer *, real *, integer *, real *, 
+    extern /* Subroutine */ int sspr2_(char *, integer *, real *, real *,
+	    integer *, real *, integer *, real *, ftnlen), ssyr2_(char *,
+	    integer *, real *, real *, integer *, real *, integer *, real *,
 	    integer *, ftnlen);
     real alpha;
     logical isame[13];
-    extern /* Subroutine */ int smake_(char *, char *, char *, integer *, 
-	    integer *, real *, integer *, real *, integer *, integer *, 
+    extern /* Subroutine */ int smake_(char *, char *, char *, integer *,
+	    integer *, real *, integer *, real *, integer *, integer *,
 	    integer *, logical *, real *, ftnlen, ftnlen, ftnlen);
     integer nargs;
-    extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, 
-	    real *, integer *, real *, integer *, real *, real *, integer *, 
-	    real *, real *, real *, real *, real *, logical *, integer *, 
+    extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *,
+	    real *, integer *, real *, integer *, real *, real *, integer *,
+	    real *, real *, real *, real *, real *, logical *, integer *,
 	    logical *, ftnlen);
     logical reset;
     integer incxs, incys;
@@ -3482,7 +3492,7 @@ static logical c_false = FALSE_;
     char uplos[1];
     logical packed;
     real errmax;
-    extern logical lseres_(char *, char *, integer *, integer *, real *, real 
+    extern logical lseres_(char *, char *, integer *, integer *, real *, real
 	    *, integer *, ftnlen, ftnlen);
     real transl;
 
@@ -3588,7 +3598,7 @@ static logical c_false = FALSE_;
 		i__3 = abs(incx);
 		i__4 = n - 1;
 		smake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3,
-			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, 
+			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1,
 			(ftnlen)1);
 		if (n > 1) {
 		    x[n / 2] = 0.f;
@@ -3623,7 +3633,7 @@ static logical c_false = FALSE_;
 			transl = 0.f;
 			i__5 = n - 1;
 			i__6 = n - 1;
-			smake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], 
+			smake_(sname + 1, uplo, " ", &n, &n, &a[a_offset],
 				nmax, &aa[1], &lda, &i__5, &i__6, &reset, &
 				transl, (ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -3801,7 +3811,7 @@ static logical c_false = FALSE_;
 				    jj = j;
 				    lj = n - j + 1;
 				}
-				smvch_("N", &lj, &c__2, &alpha, &z__[jj + 
+				smvch_("N", &lj, &c__2, &alpha, &z__[jj +
 					z_dim1], nmax, w, &c__1, &c_b128, &a[
 					jj + j * a_dim1], &c__1, &yt[1], &g[1]
 					, &aa[ja], eps, &err, fatal, nout, &
@@ -3907,7 +3917,7 @@ static logical c_false = FALSE_;
 
 } /* schk6_ */
 
-/* Subroutine */ int schke_(integer *isnum, char *srnamt, integer *nout, 
+/* Subroutine */ int schke_(integer *isnum, char *srnamt, integer *nout,
 	ftnlen srnamt_len)
 {
     /* Format strings */
@@ -3921,35 +3931,35 @@ static logical c_false = FALSE_;
 
     /* Local variables */
     real a[1]	/* was [1][1] */, x[1], y[1], beta;
-    extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, 
-	    integer *, real *, integer *, real *, integer *), sspr_(char *, 
-	    integer *, real *, real *, integer *, real *, ftnlen), ssyr_(char 
-	    *, integer *, real *, real *, integer *, real *, integer *, 
-	    ftnlen), sspr2_(char *, integer *, real *, real *, integer *, 
-	    real *, integer *, real *, ftnlen), ssyr2_(char *, integer *, 
-	    real *, real *, integer *, real *, integer *, real *, integer *, 
+    extern /* Subroutine */ int sger_(integer *, integer *, real *, real *,
+	    integer *, real *, integer *, real *, integer *), sspr_(char *,
+	    integer *, real *, real *, integer *, real *, ftnlen), ssyr_(char
+	    *, integer *, real *, real *, integer *, real *, integer *,
+	    ftnlen), sspr2_(char *, integer *, real *, real *, integer *,
+	    real *, integer *, real *, ftnlen), ssyr2_(char *, integer *,
+	    real *, real *, integer *, real *, integer *, real *, integer *,
 	    ftnlen);
     real alpha;
     extern /* Subroutine */ int sgbmv_(char *, integer *, integer *, integer *
 	    , integer *, real *, real *, integer *, real *, integer *, real *,
-	     real *, integer *, ftnlen), sgemv_(char *, integer *, integer *, 
-	    real *, real *, integer *, real *, integer *, real *, real *, 
-	    integer *, ftnlen), ssbmv_(char *, integer *, integer *, real *, 
-	    real *, integer *, real *, integer *, real *, real *, integer *, 
-	    ftnlen), stbmv_(char *, char *, char *, integer *, integer *, 
-	    real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen), 
-	    stbsv_(char *, char *, char *, integer *, integer *, real *, 
+	     real *, integer *, ftnlen), sgemv_(char *, integer *, integer *,
+	    real *, real *, integer *, real *, integer *, real *, real *,
+	    integer *, ftnlen), ssbmv_(char *, integer *, integer *, real *,
+	    real *, integer *, real *, integer *, real *, real *, integer *,
+	    ftnlen), stbmv_(char *, char *, char *, integer *, integer *,
+	    real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen),
+	    stbsv_(char *, char *, char *, integer *, integer *, real *,
 	    integer *, real *, integer *, ftnlen, ftnlen, ftnlen), sspmv_(
-	    char *, integer *, real *, real *, real *, integer *, real *, 
-	    real *, integer *, ftnlen), stpmv_(char *, char *, char *, 
-	    integer *, real *, real *, integer *, ftnlen, ftnlen, ftnlen), 
-	    strmv_(char *, char *, char *, integer *, real *, integer *, real 
-	    *, integer *, ftnlen, ftnlen, ftnlen), stpsv_(char *, char *, 
-	    char *, integer *, real *, real *, integer *, ftnlen, ftnlen, 
-	    ftnlen), ssymv_(char *, integer *, real *, real *, integer *, 
+	    char *, integer *, real *, real *, real *, integer *, real *,
+	    real *, integer *, ftnlen), stpmv_(char *, char *, char *,
+	    integer *, real *, real *, integer *, ftnlen, ftnlen, ftnlen),
+	    strmv_(char *, char *, char *, integer *, real *, integer *, real
+	    *, integer *, ftnlen, ftnlen, ftnlen), stpsv_(char *, char *,
+	    char *, integer *, real *, real *, integer *, ftnlen, ftnlen,
+	    ftnlen), ssymv_(char *, integer *, real *, real *, integer *,
 	    real *, integer *, real *, real *, integer *, ftnlen), strsv_(
-	    char *, char *, char *, integer *, real *, integer *, real *, 
-	    integer *, ftnlen, ftnlen, ftnlen), chkxer_(char *, integer *, 
+	    char *, char *, char *, integer *, real *, integer *, real *,
+	    integer *, ftnlen, ftnlen, ftnlen), chkxer_(char *, integer *,
 	    integer *, logical *, logical *, ftnlen);
 
     /* Fortran I/O blocks */
@@ -4455,9 +4465,9 @@ static logical c_false = FALSE_;
 
 } /* schke_ */
 
-/* Subroutine */ int smake_(char *type__, char *uplo, char *diag, integer *m, 
+/* Subroutine */ int smake_(char *type__, char *uplo, char *diag, integer *m,
 	integer *n, real *a, integer *nmax, real *aa, integer *lda, integer *
-	kl, integer *ku, logical *reset, real *transl, ftnlen type_len, 
+	kl, integer *ku, logical *reset, real *transl, ftnlen type_len,
 	ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
@@ -4516,7 +4526,7 @@ static logical c_false = FALSE_;
 	i__2 = *m;
 	for (i__ = 1; i__ <= i__2; ++i__) {
 	    if (gen || upper && i__ <= j || lower && i__ >= j) {
-		if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) 
+		if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl)
 			{
 		    a[i__ + j * a_dim1] = sbeg_(reset) + *transl;
 		} else {
@@ -4690,9 +4700,9 @@ static logical c_false = FALSE_;
 
 } /* smake_ */
 
-/* Subroutine */ int smvch_(char *trans, integer *m, integer *n, real *alpha, 
-	real *a, integer *nmax, real *x, integer *incx, real *beta, real *y, 
-	integer *incy, real *yt, real *g, real *yy, real *eps, real *err, 
+/* Subroutine */ int smvch_(char *trans, integer *m, integer *n, real *alpha,
+	real *a, integer *nmax, real *x, integer *incx, real *beta, real *y,
+	integer *incy, real *yt, real *g, real *yy, real *eps, real *err,
 	logical *fatal, integer *nout, logical *mv, ftnlen trans_len)
 {
     /* Format strings */
@@ -4807,7 +4817,7 @@ static logical c_false = FALSE_;
     *err = 0.f;
     i__1 = ml;
     for (i__ = 1; i__ <= i__1; ++i__) {
-	erri = (r__1 = yt[i__] - yy[(i__ - 1) * abs(*incy) + 1], abs(r__1)) / 
+	erri = (r__1 = yt[i__] - yy[(i__ - 1) * abs(*incy) + 1], abs(r__1)) /
 		*eps;
 	if (g[i__] != 0.f) {
 	    erri /= g[i__];
@@ -4903,7 +4913,7 @@ logical lse_(real *ri, real *rj, integer *lr)
 
 } /* lse_ */
 
-logical lseres_(char *type__, char *uplo, integer *m, integer *n, real *aa, 
+logical lseres_(char *type__, char *uplo, integer *m, integer *n, real *aa,
 	real *as, integer *lda, ftnlen type_len, ftnlen uplo_len)
 {
     /* System generated locals */
@@ -5064,7 +5074,7 @@ real sdiff_(real *x, real *y)
 
 } /* sdiff_ */
 
-/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, 
+/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout,
 	logical *lerr, logical *ok, ftnlen srnamt_len)
 {
     /* Format strings */
diff --git a/blastest/src/sblat3.c b/blastest/src/sblat3.c
index dc5ef5738..01d4ca4b8 100644
--- a/blastest/src/sblat3.c
+++ b/blastest/src/sblat3.c
@@ -135,9 +135,14 @@ static integer c__2 = 2;
 /*  ===================================================================== */
 /* Main program */ int main(void)
 {
+#ifdef BLIS_ENABLE_HPX
+    char* program = "sblat3";
+    bli_thread_initialize_hpx( 1, &program );
+#endif
+
     /* Initialized data */
 
-    static char snames[6*6] = "SGEMM " "SSYMM " "STRMM " "STRSM " "SSYRK " 
+    static char snames[6*6] = "SGEMM " "SSYMM " "STRMM " "STRSM " "SSYRK "
 	    "SSYR2K";
 
     /* Format strings */
@@ -179,10 +184,10 @@ static integer c__2 = 2;
     cllist cl__1;
 
     /* Builtin functions */
-    integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *,
-	     char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), 
-	    s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, 
+	     char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void),
+	    s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen,
 	    ftnlen);
     /* Subroutine */ int s_stop(char *, ftnlen);
     integer f_clos(cllist *);
@@ -200,33 +205,33 @@ static integer c__2 = 2;
     integer nbet, ntra;
     logical rewi;
     integer nout;
-    extern /* Subroutine */ int schk1_(char *, real *, real *, integer *, 
-	    integer *, logical *, logical *, logical *, integer *, integer *, 
-	    integer *, real *, integer *, real *, integer *, real *, real *, 
-	    real *, real *, real *, real *, real *, real *, real *, real *, 
-	    real *, ftnlen), schk2_(char *, real *, real *, integer *, 
-	    integer *, logical *, logical *, logical *, integer *, integer *, 
-	    integer *, real *, integer *, real *, integer *, real *, real *, 
-	    real *, real *, real *, real *, real *, real *, real *, real *, 
-	    real *, ftnlen), schk3_(char *, real *, real *, integer *, 
-	    integer *, logical *, logical *, logical *, integer *, integer *, 
-	    integer *, real *, integer *, real *, real *, real *, real *, 
-	    real *, real *, real *, real *, real *, ftnlen), schk4_(char *, 
-	    real *, real *, integer *, integer *, logical *, logical *, 
-	    logical *, integer *, integer *, integer *, real *, integer *, 
+    extern /* Subroutine */ int schk1_(char *, real *, real *, integer *,
+	    integer *, logical *, logical *, logical *, integer *, integer *,
+	    integer *, real *, integer *, real *, integer *, real *, real *,
+	    real *, real *, real *, real *, real *, real *, real *, real *,
+	    real *, ftnlen), schk2_(char *, real *, real *, integer *,
+	    integer *, logical *, logical *, logical *, integer *, integer *,
+	    integer *, real *, integer *, real *, integer *, real *, real *,
+	    real *, real *, real *, real *, real *, real *, real *, real *,
+	    real *, ftnlen), schk3_(char *, real *, real *, integer *,
+	    integer *, logical *, logical *, logical *, integer *, integer *,
+	    integer *, real *, integer *, real *, real *, real *, real *,
+	    real *, real *, real *, real *, real *, ftnlen), schk4_(char *,
+	    real *, real *, integer *, integer *, logical *, logical *,
+	    logical *, integer *, integer *, integer *, real *, integer *,
 	    real *, integer *, real *, real *, real *, real *, real *, real *,
-	     real *, real *, real *, real *, real *, ftnlen), schk5_(char *, 
-	    real *, real *, integer *, integer *, logical *, logical *, 
-	    logical *, integer *, integer *, integer *, real *, integer *, 
+	     real *, real *, real *, real *, real *, ftnlen), schk5_(char *,
+	    real *, real *, integer *, integer *, logical *, logical *,
+	    logical *, integer *, integer *, integer *, real *, integer *,
 	    real *, integer *, real *, real *, real *, real *, real *, real *,
 	     real *, real *, real *, real *, real *, ftnlen);
     logical fatal;
     extern /* Subroutine */ int schke_(integer *, char *, integer *, ftnlen);
     logical trace;
     integer nidim;
-    extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, 
-	    integer *, real *, real *, integer *, real *, integer *, real *, 
-	    real *, integer *, real *, real *, real *, integer *, real *, 
+    extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *,
+	    integer *, real *, real *, integer *, real *, integer *, real *,
+	    real *, integer *, real *, real *, real *, integer *, real *,
 	    real *, logical *, integer *, logical *, ftnlen, ftnlen);
     char snaps[32];
     integer isnum;
@@ -496,7 +501,7 @@ static integer c__2 = 2;
 	goto L60;
     }
     for (i__ = 1; i__ <= 6; ++i__) {
-	if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) 
+	if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0)
 		{
 	    goto L50;
 	}
@@ -662,34 +667,34 @@ static integer c__2 = 2;
 /*           Test SGEMM, 01. */
 L140:
 	    schk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, 
-		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet,
+		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs,
 		    ct, g, (ftnlen)6);
 	    goto L190;
 /*           Test SSYMM, 02. */
 L150:
 	    schk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, 
-		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet,
+		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs,
 		    ct, g, (ftnlen)6);
 	    goto L190;
 /*           Test STRMM, 03, STRSM, 04. */
 L160:
 	    schk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65,
 		    ab, aa, as, &ab[4225], bb, bs, ct, g, c__, (ftnlen)6);
 	    goto L190;
 /*           Test SSYRK, 05. */
 L170:
 	    schk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, 
-		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet,
+		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs,
 		    ct, g, (ftnlen)6);
 	    goto L190;
 /*           Test SSYR2K, 06. */
 L180:
 	    schk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet,
 		    bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, ct, g, w, (
 		    ftnlen)6);
 	    goto L190;
@@ -733,14 +738,19 @@ static integer c__2 = 2;
 
 /*     End of SBLAT3. */
 
-    return 0;
+#ifdef BLIS_ENABLE_HPX
+    return bli_thread_finalize_hpx();
+#else
+	// Return peacefully.
+	return 0;
+#endif
 } /* main */
 
 /* Subroutine */ int schk1_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nalf, real *alf, integer *
-	nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b, 
-	real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, 
+	nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b,
+	real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g,
 	ftnlen sname_len)
 {
     /* Initialized data */
@@ -764,7 +774,7 @@ static integer c__2 = 2;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4, i__5, i__6;
     alist al__1;
 
@@ -773,7 +783,7 @@ static integer c__2 = 2;
 	     f_rew(alist *);
 
     /* Local variables */
-    integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, 
+    integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns,
 	    ica, icb, laa, lbb, lda, lcc, ldb, ldc;
     real als, bls;
     extern logical lse_(real *, real *, integer *);
@@ -782,22 +792,22 @@ static integer c__2 = 2;
     logical same, null;
     real alpha;
     logical isame[13];
-    extern /* Subroutine */ int smake_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int smake_(char *, char *, char *, integer *,
 	    integer *, real *, integer *, real *, integer *, logical *, real *
 	    , ftnlen, ftnlen, ftnlen);
     logical trana, tranb;
-    extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, 
-	    integer *, real *, real *, integer *, real *, integer *, real *, 
-	    real *, integer *, real *, real *, real *, integer *, real *, 
+    extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *,
+	    integer *, real *, real *, integer *, real *, integer *, real *,
+	    real *, integer *, real *, real *, real *, integer *, real *,
 	    real *, logical *, integer *, logical *, ftnlen, ftnlen), sgemm_(
-	    char *, char *, integer *, integer *, integer *, real *, real *, 
-	    integer *, real *, integer *, real *, real *, integer *, ftnlen, 
+	    char *, char *, integer *, integer *, integer *, real *, real *,
+	    integer *, real *, integer *, real *, real *, integer *, ftnlen,
 	    ftnlen);
     integer nargs;
     logical reset;
     char tranas[1], tranbs[1], transa[1], transb[1];
     real errmax;
-    extern logical lseres_(char *, char *, integer *, integer *, real *, real 
+    extern logical lseres_(char *, char *, integer *, integer *, real *, real
 	    *, integer *, ftnlen, ftnlen);
 
     /* Fortran I/O blocks */
@@ -888,7 +898,7 @@ static integer c__2 = 2;
 		for (ica = 1; ica <= 3; ++ica) {
 		    *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1]
 			    ;
-		    trana = *(unsigned char *)transa == 'T' || *(unsigned 
+		    trana = *(unsigned char *)transa == 'T' || *(unsigned
 			    char *)transa == 'C';
 
 		    if (trana) {
@@ -916,9 +926,9 @@ static integer c__2 = 2;
 			    ftnlen)1);
 
 		    for (icb = 1; icb <= 3; ++icb) {
-			*(unsigned char *)transb = *(unsigned char *)&ich[icb 
+			*(unsigned char *)transb = *(unsigned char *)&ich[icb
 				- 1];
-			tranb = *(unsigned char *)transb == 'T' || *(unsigned 
+			tranb = *(unsigned char *)transb == 'T' || *(unsigned
 				char *)transb == 'C';
 
 			if (tranb) {
@@ -1090,9 +1100,9 @@ static integer c__2 = 2;
 
 				    smmch_(transa, transb, &m, &n, &k, &alpha,
 					     &a[a_offset], nmax, &b[b_offset],
-					     nmax, &beta, &c__[c_offset], 
+					     nmax, &beta, &c__[c_offset],
 					    nmax, &ct[1], &g[1], &cc[1], &ldc,
-					     eps, &err, fatal, nout, &c_true, 
+					     eps, &err, fatal, nout, &c_true,
 					    (ftnlen)1, (ftnlen)1);
 				    errmax = max(errmax,err);
 /*                             If got really bad answer, report and */
@@ -1174,10 +1184,10 @@ static integer c__2 = 2;
 } /* schk1_ */
 
 /* Subroutine */ int schk2_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nalf, real *alf, integer *
-	nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b, 
-	real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, 
+	nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b,
+	real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g,
 	ftnlen sname_len)
 {
     /* Initialized data */
@@ -1202,7 +1212,7 @@ static integer c__2 = 2;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4, i__5;
     alist al__1;
 
@@ -1211,7 +1221,7 @@ static integer c__2 = 2;
 	     f_rew(alist *);
 
     /* Local variables */
-    integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, 
+    integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc,
 	    ldb, ldc, ics;
     real als, bls;
     integer icu;
@@ -1224,22 +1234,22 @@ static integer c__2 = 2;
     char uplo[1];
     real alpha;
     logical isame[13];
-    extern /* Subroutine */ int smake_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int smake_(char *, char *, char *, integer *,
 	    integer *, real *, integer *, real *, integer *, logical *, real *
 	    , ftnlen, ftnlen, ftnlen);
     char sides[1];
-    extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, 
-	    integer *, real *, real *, integer *, real *, integer *, real *, 
-	    real *, integer *, real *, real *, real *, integer *, real *, 
+    extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *,
+	    integer *, real *, real *, integer *, real *, integer *, real *,
+	    real *, integer *, real *, real *, real *, integer *, real *,
 	    real *, logical *, integer *, logical *, ftnlen, ftnlen);
     integer nargs;
     logical reset;
     char uplos[1];
-    extern /* Subroutine */ int ssymm_(char *, char *, integer *, integer *, 
-	    real *, real *, integer *, real *, integer *, real *, real *, 
+    extern /* Subroutine */ int ssymm_(char *, char *, integer *, integer *,
+	    real *, real *, integer *, real *, integer *, real *, real *,
 	    integer *, ftnlen, ftnlen);
     real errmax;
-    extern logical lseres_(char *, char *, integer *, integer *, real *, real 
+    extern logical lseres_(char *, char *, integer *, integer *, real *, real
 	    *, integer *, ftnlen, ftnlen);
 
     /* Fortran I/O blocks */
@@ -1378,7 +1388,7 @@ static integer c__2 = 2;
 
 /*                       Generate the matrix C. */
 
-			    smake_("GE", " ", " ", &m, &n, &c__[c_offset], 
+			    smake_("GE", " ", " ", &m, &n, &c__[c_offset],
 				    nmax, &cc[1], &ldc, &reset, &c_b84, (
 				    ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -1459,9 +1469,9 @@ static integer c__2 = 2;
 
 /*                       See what data changed inside subroutines. */
 
-			    isame[0] = *(unsigned char *)sides == *(unsigned 
+			    isame[0] = *(unsigned char *)sides == *(unsigned
 				    char *)side;
-			    isame[1] = *(unsigned char *)uplos == *(unsigned 
+			    isame[1] = *(unsigned char *)uplos == *(unsigned
 				    char *)uplo;
 			    isame[2] = ms == m;
 			    isame[3] = ns == n;
@@ -1506,14 +1516,14 @@ static integer c__2 = 2;
 
 				if (left) {
 				    smmch_("N", "N", &m, &n, &m, &alpha, &a[
-					    a_offset], nmax, &b[b_offset], 
+					    a_offset], nmax, &b[b_offset],
 					    nmax, &beta, &c__[c_offset], nmax,
 					     &ct[1], &g[1], &cc[1], &ldc, eps,
 					     &err, fatal, nout, &c_true, (
 					    ftnlen)1, (ftnlen)1);
 				} else {
 				    smmch_("N", "N", &m, &n, &n, &alpha, &b[
-					    b_offset], nmax, &a[a_offset], 
+					    b_offset], nmax, &a[a_offset],
 					    nmax, &beta, &c__[c_offset], nmax,
 					     &ct[1], &g[1], &cc[1], &ldc, eps,
 					     &err, fatal, nout, &c_true, (
@@ -1594,7 +1604,7 @@ static integer c__2 = 2;
 } /* schk2_ */
 
 /* Subroutine */ int schk3_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nalf, real *alf, integer *
 	nmax, real *a, real *aa, real *as, real *b, real *bb, real *bs, real *
 	ct, real *g, real *c__, ftnlen sname_len)
@@ -1623,7 +1633,7 @@ static integer c__2 = 2;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4, i__5;
     alist al__1;
 
@@ -1647,25 +1657,25 @@ static integer c__2 = 2;
     real alpha;
     char diags[1];
     logical isame[13];
-    extern /* Subroutine */ int smake_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int smake_(char *, char *, char *, integer *,
 	    integer *, real *, integer *, real *, integer *, logical *, real *
 	    , ftnlen, ftnlen, ftnlen);
     char sides[1];
-    extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, 
-	    integer *, real *, real *, integer *, real *, integer *, real *, 
-	    real *, integer *, real *, real *, real *, integer *, real *, 
+    extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *,
+	    integer *, real *, real *, integer *, real *, integer *, real *,
+	    real *, integer *, real *, real *, real *, integer *, real *,
 	    real *, logical *, integer *, logical *, ftnlen, ftnlen);
     integer nargs;
     logical reset;
     char uplos[1];
-    extern /* Subroutine */ int strmm_(char *, char *, char *, char *, 
+    extern /* Subroutine */ int strmm_(char *, char *, char *, char *,
 	    integer *, integer *, real *, real *, integer *, real *, integer *
-	    , ftnlen, ftnlen, ftnlen, ftnlen), strsm_(char *, char *, char *, 
-	    char *, integer *, integer *, real *, real *, integer *, real *, 
+	    , ftnlen, ftnlen, ftnlen, ftnlen), strsm_(char *, char *, char *,
+	    char *, integer *, integer *, real *, real *, integer *, real *,
 	    integer *, ftnlen, ftnlen, ftnlen, ftnlen);
     char tranas[1], transa[1];
     real errmax;
-    extern logical lseres_(char *, char *, integer *, integer *, real *, real 
+    extern logical lseres_(char *, char *, integer *, integer *, real *, real
 	    *, integer *, ftnlen, ftnlen);
 
     /* Fortran I/O blocks */
@@ -1800,7 +1810,7 @@ static integer c__2 = 2;
 
 /*                          Generate the matrix B. */
 
-				smake_("GE", " ", " ", &m, &n, &b[b_offset], 
+				smake_("GE", " ", " ", &m, &n, &b[b_offset],
 					nmax, &bb[1], &ldb, &reset, &c_b84, (
 					ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -1866,7 +1876,7 @@ static integer c__2 = 2;
 				    }
 				    strmm_(side, uplo, transa, diag, &m, &n, &
 					    alpha, &aa[1], &lda, &bb[1], &ldb,
-					     (ftnlen)1, (ftnlen)1, (ftnlen)1, 
+					     (ftnlen)1, (ftnlen)1, (ftnlen)1,
 					    (ftnlen)1);
 				} else if (s_cmp(sname + 3, "SM", (ftnlen)2, (
 					ftnlen)2) == 0) {
@@ -1899,7 +1909,7 @@ static integer c__2 = 2;
 				    }
 				    strsm_(side, uplo, transa, diag, &m, &n, &
 					    alpha, &aa[1], &lda, &bb[1], &ldb,
-					     (ftnlen)1, (ftnlen)1, (ftnlen)1, 
+					     (ftnlen)1, (ftnlen)1, (ftnlen)1,
 					    (ftnlen)1);
 				}
 
@@ -1968,18 +1978,18 @@ static integer c__2 = 2;
 					    smmch_(transa, "N", &m, &n, &m, &
 						    alpha, &a[a_offset], nmax,
 						     &b[b_offset], nmax, &
-						    c_b84, &c__[c_offset], 
+						    c_b84, &c__[c_offset],
 						    nmax, &ct[1], &g[1], &bb[
-						    1], &ldb, eps, &err, 
+						    1], &ldb, eps, &err,
 						    fatal, nout, &c_true, (
 						    ftnlen)1, (ftnlen)1);
 					} else {
 					    smmch_("N", transa, &m, &n, &n, &
 						    alpha, &b[b_offset], nmax,
 						     &a[a_offset], nmax, &
-						    c_b84, &c__[c_offset], 
+						    c_b84, &c__[c_offset],
 						    nmax, &ct[1], &g[1], &bb[
-						    1], &ldb, eps, &err, 
+						    1], &ldb, eps, &err,
 						    fatal, nout, &c_true, (
 						    ftnlen)1, (ftnlen)1);
 					}
@@ -1992,10 +2002,10 @@ static integer c__2 = 2;
 					i__4 = n;
 					for (j = 1; j <= i__4; ++j) {
 					    i__5 = m;
-					    for (i__ = 1; i__ <= i__5; ++i__) 
+					    for (i__ = 1; i__ <= i__5; ++i__)
 						    {
 			  c__[i__ + j * c_dim1] = bb[i__ + (j - 1) * ldb];
-			  bb[i__ + (j - 1) * ldb] = alpha * b[i__ + j * 
+			  bb[i__ + (j - 1) * ldb] = alpha * b[i__ + j *
 				  b_dim1];
 /* L60: */
 					    }
@@ -2008,16 +2018,16 @@ static integer c__2 = 2;
 						     &c__[c_offset], nmax, &
 						    c_b84, &b[b_offset], nmax,
 						     &ct[1], &g[1], &bb[1], &
-						    ldb, eps, &err, fatal, 
+						    ldb, eps, &err, fatal,
 						    nout, &c_false, (ftnlen)1,
 						     (ftnlen)1);
 					} else {
 					    smmch_("N", transa, &m, &n, &n, &
-						    c_b94, &c__[c_offset], 
-						    nmax, &a[a_offset], nmax, 
-						    &c_b84, &b[b_offset], 
+						    c_b94, &c__[c_offset],
+						    nmax, &a[a_offset], nmax,
+						    &c_b84, &b[b_offset],
 						    nmax, &ct[1], &g[1], &bb[
-						    1], &ldb, eps, &err, 
+						    1], &ldb, eps, &err,
 						    fatal, nout, &c_false, (
 						    ftnlen)1, (ftnlen)1);
 					}
@@ -2099,10 +2109,10 @@ static integer c__2 = 2;
 } /* schk3_ */
 
 /* Subroutine */ int schk4_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nalf, real *alf, integer *
-	nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b, 
-	real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, 
+	nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b,
+	real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g,
 	ftnlen sname_len)
 {
     /* Initialized data */
@@ -2129,7 +2139,7 @@ static integer c__2 = 2;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4, i__5;
     alist al__1;
 
@@ -2151,22 +2161,22 @@ static integer c__2 = 2;
     char uplo[1];
     real alpha;
     logical isame[13];
-    extern /* Subroutine */ int smake_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int smake_(char *, char *, char *, integer *,
 	    integer *, real *, integer *, real *, integer *, logical *, real *
-	    , ftnlen, ftnlen, ftnlen), smmch_(char *, char *, integer *, 
+	    , ftnlen, ftnlen, ftnlen), smmch_(char *, char *, integer *,
 	    integer *, integer *, real *, real *, integer *, real *, integer *
-	    , real *, real *, integer *, real *, real *, real *, integer *, 
+	    , real *, real *, integer *, real *, real *, real *, integer *,
 	    real *, real *, logical *, integer *, logical *, ftnlen, ftnlen);
     integer nargs;
     logical reset;
     char trans[1];
     logical upper;
     char uplos[1];
-    extern /* Subroutine */ int ssyrk_(char *, char *, integer *, integer *, 
-	    real *, real *, integer *, real *, real *, integer *, ftnlen, 
+    extern /* Subroutine */ int ssyrk_(char *, char *, integer *, integer *,
+	    real *, real *, integer *, real *, real *, integer *, ftnlen,
 	    ftnlen);
     real errmax;
-    extern logical lseres_(char *, char *, integer *, integer *, real *, real 
+    extern logical lseres_(char *, char *, integer *, integer *, real *, real
 	    *, integer *, ftnlen, ftnlen);
     char transs[1];
 
@@ -2293,7 +2303,7 @@ static integer c__2 = 2;
 
 /*                       Generate the matrix C. */
 
-			    smake_("SY", uplo, " ", &n, &n, &c__[c_offset], 
+			    smake_("SY", uplo, " ", &n, &n, &c__[c_offset],
 				    nmax, &cc[1], &ldc, &reset, &c_b84, (
 				    ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -2350,7 +2360,7 @@ static integer c__2 = 2;
 				al__1.aunit = *ntra;
 				f_rew(&al__1);
 			    }
-			    ssyrk_(uplo, trans, &n, &k, &alpha, &aa[1], &lda, 
+			    ssyrk_(uplo, trans, &n, &k, &alpha, &aa[1], &lda,
 				    &beta, &cc[1], &ldc, (ftnlen)1, (ftnlen)1)
 				    ;
 
@@ -2366,9 +2376,9 @@ static integer c__2 = 2;
 
 /*                       See what data changed inside subroutines. */
 
-			    isame[0] = *(unsigned char *)uplos == *(unsigned 
+			    isame[0] = *(unsigned char *)uplos == *(unsigned
 				    char *)uplo;
-			    isame[1] = *(unsigned char *)transs == *(unsigned 
+			    isame[1] = *(unsigned char *)transs == *(unsigned
 				    char *)trans;
 			    isame[2] = ns == n;
 			    isame[3] = ks == k;
@@ -2421,19 +2431,19 @@ static integer c__2 = 2;
 				    }
 				    if (tran) {
 					smmch_("T", "N", &lj, &c__1, &k, &
-						alpha, &a[jj * a_dim1 + 1], 
-						nmax, &a[j * a_dim1 + 1], 
-						nmax, &beta, &c__[jj + j * 
-						c_dim1], nmax, &ct[1], &g[1], 
-						&cc[jc], &ldc, eps, &err, 
+						alpha, &a[jj * a_dim1 + 1],
+						nmax, &a[j * a_dim1 + 1],
+						nmax, &beta, &c__[jj + j *
+						c_dim1], nmax, &ct[1], &g[1],
+						&cc[jc], &ldc, eps, &err,
 						fatal, nout, &c_true, (ftnlen)
 						1, (ftnlen)1);
 				    } else {
 					smmch_("N", "T", &lj, &c__1, &k, &
-						alpha, &a[jj + a_dim1], nmax, 
+						alpha, &a[jj + a_dim1], nmax,
 						&a[j + a_dim1], nmax, &beta, &
 						c__[jj + j * c_dim1], nmax, &
-						ct[1], &g[1], &cc[jc], &ldc, 
+						ct[1], &g[1], &cc[jc], &ldc,
 						eps, &err, fatal, nout, &
 						c_true, (ftnlen)1, (ftnlen)1);
 				    }
@@ -2526,7 +2536,7 @@ static integer c__2 = 2;
 } /* schk4_ */
 
 /* Subroutine */ int schk5_(char *sname, real *eps, real *thresh, integer *
-	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal,
 	integer *nidim, integer *idim, integer *nalf, real *alf, integer *
 	nbet, real *bet, integer *nmax, real *ab, real *aa, real *as, real *
 	bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, real *
@@ -2579,22 +2589,22 @@ static integer c__2 = 2;
     char uplo[1];
     real alpha;
     logical isame[13];
-    extern /* Subroutine */ int smake_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int smake_(char *, char *, char *, integer *,
 	    integer *, real *, integer *, real *, integer *, logical *, real *
-	    , ftnlen, ftnlen, ftnlen), smmch_(char *, char *, integer *, 
+	    , ftnlen, ftnlen, ftnlen), smmch_(char *, char *, integer *,
 	    integer *, integer *, real *, real *, integer *, real *, integer *
-	    , real *, real *, integer *, real *, real *, real *, integer *, 
+	    , real *, real *, integer *, real *, real *, real *, integer *,
 	    real *, real *, logical *, integer *, logical *, ftnlen, ftnlen);
     integer nargs;
     logical reset;
     char trans[1];
     logical upper;
     char uplos[1];
-    extern /* Subroutine */ int ssyr2k_(char *, char *, integer *, integer *, 
-	    real *, real *, integer *, real *, integer *, real *, real *, 
+    extern /* Subroutine */ int ssyr2k_(char *, char *, integer *, integer *,
+	    real *, real *, integer *, real *, integer *, real *, real *,
 	    integer *, ftnlen, ftnlen);
     real errmax;
-    extern logical lseres_(char *, char *, integer *, integer *, real *, real 
+    extern logical lseres_(char *, char *, integer *, integer *, real *, real
 	    *, integer *, ftnlen, ftnlen);
     char transs[1];
 
@@ -2740,7 +2750,7 @@ static integer c__2 = 2;
 
 /*                       Generate the matrix C. */
 
-			    smake_("SY", uplo, " ", &n, &n, &c__[c_offset], 
+			    smake_("SY", uplo, " ", &n, &n, &c__[c_offset],
 				    nmax, &cc[1], &ldc, &reset, &c_b84, (
 				    ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -2821,9 +2831,9 @@ static integer c__2 = 2;
 
 /*                       See what data changed inside subroutines. */
 
-			    isame[0] = *(unsigned char *)uplos == *(unsigned 
+			    isame[0] = *(unsigned char *)uplos == *(unsigned
 				    char *)uplo;
-			    isame[1] = *(unsigned char *)transs == *(unsigned 
+			    isame[1] = *(unsigned char *)transs == *(unsigned
 				    char *)trans;
 			    isame[2] = ns == n;
 			    isame[3] = ks == k;
@@ -2880,7 +2890,7 @@ static integer c__2 = 2;
 				    if (tran) {
 					i__6 = k;
 					for (i__ = 1; i__ <= i__6; ++i__) {
-					    w[i__] = ab[(j - 1 << 1) * *nmax 
+					    w[i__] = ab[(j - 1 << 1) * *nmax
 						    + k + i__];
 					    w[k + i__] = ab[(j - 1 << 1) * *
 						    nmax + i__];
@@ -2891,17 +2901,17 @@ static integer c__2 = 2;
 					i__8 = *nmax << 1;
 					smmch_("T", "N", &lj, &c__1, &i__6, &
 						alpha, &ab[jjab], &i__7, &w[1]
-						, &i__8, &beta, &c__[jj + j * 
-						c_dim1], nmax, &ct[1], &g[1], 
-						&cc[jc], &ldc, eps, &err, 
+						, &i__8, &beta, &c__[jj + j *
+						c_dim1], nmax, &ct[1], &g[1],
+						&cc[jc], &ldc, eps, &err,
 						fatal, nout, &c_true, (ftnlen)
 						1, (ftnlen)1);
 				    } else {
 					i__6 = k;
 					for (i__ = 1; i__ <= i__6; ++i__) {
-					    w[i__] = ab[(k + i__ - 1) * *nmax 
+					    w[i__] = ab[(k + i__ - 1) * *nmax
 						    + j];
-					    w[k + i__] = ab[(i__ - 1) * *nmax 
+					    w[k + i__] = ab[(i__ - 1) * *nmax
 						    + j];
 /* L60: */
 					}
@@ -2909,9 +2919,9 @@ static integer c__2 = 2;
 					i__7 = *nmax << 1;
 					smmch_("N", "N", &lj, &c__1, &i__6, &
 						alpha, &ab[jj], nmax, &w[1], &
-						i__7, &beta, &c__[jj + j * 
-						c_dim1], nmax, &ct[1], &g[1], 
-						&cc[jc], &ldc, eps, &err, 
+						i__7, &beta, &c__[jj + j *
+						c_dim1], nmax, &ct[1], &g[1],
+						&cc[jc], &ldc, eps, &err,
 						fatal, nout, &c_true, (ftnlen)
 						1, (ftnlen)1);
 				    }
@@ -3007,7 +3017,7 @@ static integer c__2 = 2;
 
 } /* schk5_ */
 
-/* Subroutine */ int schke_(integer *isnum, char *srnamt, integer *nout, 
+/* Subroutine */ int schke_(integer *isnum, char *srnamt, integer *nout,
 	ftnlen srnamt_len)
 {
     /* Format strings */
@@ -3020,22 +3030,22 @@ static integer c__2 = 2;
     integer s_wsfe(cilist *), do_fio(integer *, char *, ftnlen), e_wsfe(void);
 
     /* Local variables */
-    real a[2]	/* was [2][1] */, b[2]	/* was [2][1] */, c__[2]	/* 
+    real a[2]	/* was [2][1] */, b[2]	/* was [2][1] */, c__[2]	/*
 	    was [2][1] */, beta, alpha;
-    extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, 
-	    integer *, real *, real *, integer *, real *, integer *, real *, 
+    extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+	    integer *, real *, real *, integer *, real *, integer *, real *,
 	    real *, integer *, ftnlen, ftnlen), strmm_(char *, char *, char *,
-	     char *, integer *, integer *, real *, real *, integer *, real *, 
+	     char *, integer *, integer *, real *, real *, integer *, real *,
 	    integer *, ftnlen, ftnlen, ftnlen, ftnlen), ssymm_(char *, char *,
-	     integer *, integer *, real *, real *, integer *, real *, integer 
-	    *, real *, real *, integer *, ftnlen, ftnlen), strsm_(char *, 
-	    char *, char *, char *, integer *, integer *, real *, real *, 
-	    integer *, real *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), 
-	    ssyrk_(char *, char *, integer *, integer *, real *, real *, 
+	     integer *, integer *, real *, real *, integer *, real *, integer
+	    *, real *, real *, integer *, ftnlen, ftnlen), strsm_(char *,
+	    char *, char *, char *, integer *, integer *, real *, real *,
+	    integer *, real *, integer *, ftnlen, ftnlen, ftnlen, ftnlen),
+	    ssyrk_(char *, char *, integer *, integer *, real *, real *,
 	    integer *, real *, real *, integer *, ftnlen, ftnlen), ssyr2k_(
-	    char *, char *, integer *, integer *, real *, real *, integer *, 
-	    real *, integer *, real *, real *, integer *, ftnlen, ftnlen), 
-	    chkxer_(char *, integer *, integer *, logical *, logical *, 
+	    char *, char *, integer *, integer *, real *, real *, integer *,
+	    real *, integer *, real *, real *, integer *, ftnlen, ftnlen),
+	    chkxer_(char *, integer *, integer *, logical *, logical *,
 	    ftnlen);
 
     /* Fortran I/O blocks */
@@ -3089,142 +3099,142 @@ static integer c__2 = 2;
     }
 L10:
     infoc_1.infot = 1;
-    sgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 1;
-    sgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 2;
-    sgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 2;
-    sgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    sgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    sgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    sgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    sgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    sgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    sgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    sgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    sgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    sgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    sgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    sgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    sgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    sgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__2, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    sgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__2, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    sgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, 
+    sgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    sgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    sgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    sgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, 
+    sgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    sgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    sgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    sgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, 
+    sgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    sgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, 
+    sgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    sgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    sgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    sgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
@@ -3928,9 +3938,9 @@ static integer c__2 = 2;
 
 } /* schke_ */
 
-/* Subroutine */ int smake_(char *type__, char *uplo, char *diag, integer *m, 
+/* Subroutine */ int smake_(char *type__, char *uplo, char *diag, integer *m,
 	integer *n, real *a, integer *nmax, real *aa, integer *lda, logical *
-	reset, real *transl, ftnlen type_len, ftnlen uplo_len, ftnlen 
+	reset, real *transl, ftnlen type_len, ftnlen uplo_len, ftnlen
 	diag_len)
 {
     /* System generated locals */
@@ -4075,7 +4085,7 @@ static integer c__2 = 2;
 /* Subroutine */ int smmch_(char *transa, char *transb, integer *m, integer *
 	n, integer *kk, real *alpha, real *a, integer *lda, real *b, integer *
 	ldb, real *beta, real *c__, integer *ldc, real *ct, real *g, real *cc,
-	 integer *ldcc, real *eps, real *err, logical *fatal, integer *nout, 
+	 integer *ldcc, real *eps, real *err, logical *fatal, integer *nout,
 	logical *mv, ftnlen transa_len, ftnlen transb_len)
 {
     /* Format strings */
@@ -4087,7 +4097,7 @@ static integer c__2 = 2;
 	    " \002,i3)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1,
 	    cc_offset, i__1, i__2, i__3;
     real r__1, r__2;
 
@@ -4141,9 +4151,9 @@ static integer c__2 = 2;
     cc -= cc_offset;
 
     /* Function Body */
-    trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 
+    trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa ==
 	    'C';
-    tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 
+    tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb ==
 	    'C';
 
 /*     Compute expected result, one column at a time, in CT using data */
@@ -4165,7 +4175,7 @@ static integer c__2 = 2;
 		i__3 = *m;
 		for (i__ = 1; i__ <= i__3; ++i__) {
 		    ct[i__] += a[i__ + k * a_dim1] * b[k + j * b_dim1];
-		    g[i__] += (r__1 = a[i__ + k * a_dim1], abs(r__1)) * (r__2 
+		    g[i__] += (r__1 = a[i__ + k * a_dim1], abs(r__1)) * (r__2
 			    = b[k + j * b_dim1], abs(r__2));
 /* L20: */
 		}
@@ -4177,7 +4187,7 @@ static integer c__2 = 2;
 		i__3 = *m;
 		for (i__ = 1; i__ <= i__3; ++i__) {
 		    ct[i__] += a[k + i__ * a_dim1] * b[k + j * b_dim1];
-		    g[i__] += (r__1 = a[k + i__ * a_dim1], abs(r__1)) * (r__2 
+		    g[i__] += (r__1 = a[k + i__ * a_dim1], abs(r__1)) * (r__2
 			    = b[k + j * b_dim1], abs(r__2));
 /* L40: */
 		}
@@ -4189,7 +4199,7 @@ static integer c__2 = 2;
 		i__3 = *m;
 		for (i__ = 1; i__ <= i__3; ++i__) {
 		    ct[i__] += a[i__ + k * a_dim1] * b[j + k * b_dim1];
-		    g[i__] += (r__1 = a[i__ + k * a_dim1], abs(r__1)) * (r__2 
+		    g[i__] += (r__1 = a[i__ + k * a_dim1], abs(r__1)) * (r__2
 			    = b[j + k * b_dim1], abs(r__2));
 /* L60: */
 		}
@@ -4201,7 +4211,7 @@ static integer c__2 = 2;
 		i__3 = *m;
 		for (i__ = 1; i__ <= i__3; ++i__) {
 		    ct[i__] += a[k + i__ * a_dim1] * b[j + k * b_dim1];
-		    g[i__] += (r__1 = a[k + i__ * a_dim1], abs(r__1)) * (r__2 
+		    g[i__] += (r__1 = a[k + i__ * a_dim1], abs(r__1)) * (r__2
 			    = b[j + k * b_dim1], abs(r__2));
 /* L80: */
 		}
@@ -4328,7 +4338,7 @@ logical lse_(real *ri, real *rj, integer *lr)
 
 } /* lse_ */
 
-logical lseres_(char *type__, char *uplo, integer *m, integer *n, real *aa, 
+logical lseres_(char *type__, char *uplo, integer *m, integer *n, real *aa,
 	real *as, integer *lda, ftnlen type_len, ftnlen uplo_len)
 {
     /* System generated locals */
@@ -4495,7 +4505,7 @@ real sdiff_(real *x, real *y)
 
 } /* sdiff_ */
 
-/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, 
+/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout,
 	logical *lerr, logical *ok, ftnlen srnamt_len)
 {
     /* Format strings */
diff --git a/blastest/src/zblat1.c b/blastest/src/zblat1.c
index b620910be..93a24f4c3 100644
--- a/blastest/src/zblat1.c
+++ b/blastest/src/zblat1.c
@@ -68,6 +68,11 @@ static doublereal c_b52 = 0.;
 /*  ===================================================================== */
 /* Main program */ int main(void)
 {
+#ifdef BLIS_ENABLE_HPX
+    char* program = "zblat1";
+    bli_thread_initialize_hpx( 1, &program );
+#endif
+
     /* Initialized data */
 
     static doublereal sfac = 9.765625e-4;
@@ -84,7 +89,7 @@ static doublereal c_b52 = 0.;
 
     /* Local variables */
     integer ic;
-    extern /* Subroutine */ int check1_(doublereal *), check2_(doublereal *), 
+    extern /* Subroutine */ int check1_(doublereal *), check2_(doublereal *),
 	    header_(void);
 
     /* Fortran I/O blocks */
@@ -136,7 +141,12 @@ static doublereal c_b52 = 0.;
     }
     s_stop("", (ftnlen)0);
 
-    return 0;
+#ifdef BLIS_ENABLE_HPX
+    return bli_thread_finalize_hpx();
+#else
+	// Return peacefully.
+	return 0;
+#endif
 } /* main */
 
 /* Subroutine */ int header_(void)
@@ -222,7 +232,7 @@ static doublereal c_b52 = 0.;
     doublecomplex z__1;
 
     /* Builtin functions */
-    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_wsle(void);
     /* Subroutine */ int s_stop(char *, ftnlen);
 
@@ -230,14 +240,14 @@ static doublereal c_b52 = 0.;
     integer i__;
     doublecomplex cx[8];
     integer np1, len;
-    extern /* Subroutine */ int zscal_(integer *, doublecomplex *, 
-	    doublecomplex *, integer *), ctest_(integer *, doublecomplex *, 
+    extern /* Subroutine */ int zscal_(integer *, doublecomplex *,
+	    doublecomplex *, integer *), ctest_(integer *, doublecomplex *,
 	    doublecomplex *, doublecomplex *, doublereal *);
     doublecomplex mwpcs[5], mwpct[5];
     extern /* Subroutine */ int itest1_(integer *, integer *);
     extern doublereal dznrm2_(integer *, doublecomplex *, integer *);
-    extern /* Subroutine */ int stest1_(doublereal *, doublereal *, 
-	    doublereal *, doublereal *), zdscal_(integer *, doublereal *, 
+    extern /* Subroutine */ int stest1_(doublereal *, doublereal *,
+	    doublereal *, doublereal *), zdscal_(integer *, doublereal *,
 	    doublecomplex *, integer *);
     extern integer izamax_(integer *, doublecomplex *, integer *);
     extern doublereal dzasum_(integer *, doublecomplex *, integer *);
@@ -433,7 +443,7 @@ static doublereal c_b52 = 0.;
 	    0.,0.},{0.,0.},{0.,0.},{0.,0.},{.7,-.8},{-.9,.5},{-.4,-.7},{0.,0.}
 	    ,{0.,0.},{0.,0.},{0.,0.},{.7,-.8},{-.9,.5},{-.4,-.7},{.1,-.5},{
 	    -.1,-.9},{-.5,-.3},{.2,-.8} };
-    static doublecomplex csize1[4] = { {0.,0.},{.9,.9},{1.63,1.73},{2.9,2.78} 
+    static doublecomplex csize1[4] = { {0.,0.},{.9,.9},{1.63,1.73},{2.9,2.78}
 	    };
     static doublecomplex csize3[14] = { {0.,0.},{0.,0.},{0.,0.},{0.,0.},{0.,
 	    0.},{0.,0.},{0.,0.},{1.17,1.17},{1.17,1.17},{1.17,1.17},{1.17,
@@ -447,7 +457,7 @@ static doublereal c_b52 = 0.;
     doublecomplex z__1;
 
     /* Builtin functions */
-    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_wsle(void);
     /* Subroutine */ int s_stop(char *, ftnlen);
 
@@ -457,7 +467,7 @@ static doublereal c_b52 = 0.;
     integer mx, my;
     doublecomplex cdot[1];
     integer lenx, leny;
-    extern /* Subroutine */ int ctest_(integer *, doublecomplex *, 
+    extern /* Subroutine */ int ctest_(integer *, doublecomplex *,
 	    doublecomplex *, doublecomplex *, doublereal *);
     extern /* Double Complex */
 #ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL
@@ -465,10 +475,10 @@ static doublereal c_b52 = 0.;
 #else
 doublecomplex zdotc_(
 #endif
- integer *, 
+ integer *,
 	    doublecomplex *, integer *, doublecomplex *, integer *);
     integer ksize;
-    extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, 
+    extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *,
 	    doublecomplex *, integer *);
     extern /* Double Complex */
 #ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL
@@ -476,10 +486,10 @@ doublecomplex zdotc_(
 #else
 doublecomplex zdotu_(
 #endif
- integer *, 
+ integer *,
 	    doublecomplex *, integer *, doublecomplex *, integer *);
-    extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, 
+    extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *,
+	    doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *,
 	    doublecomplex *, integer *, doublecomplex *, integer *);
 
     /* Fortran I/O blocks */
@@ -669,11 +679,11 @@ doublecomplex zdotu_(
 
 } /* stest_ */
 
-/* Subroutine */ int stest1_(doublereal *scomp1, doublereal *strue1, 
+/* Subroutine */ int stest1_(doublereal *scomp1, doublereal *strue1,
 	doublereal *ssize, doublereal *sfac)
 {
     doublereal scomp[1], strue[1];
-    extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *,
 	    doublereal *, doublereal *);
 
 /*     ************************* STEST1 ***************************** */
@@ -715,7 +725,7 @@ doublereal sdiff_(doublereal *sa, doublereal *sb)
     return ret_val;
 } /* sdiff_ */
 
-/* Subroutine */ int ctest_(integer *len, doublecomplex *ccomp, doublecomplex 
+/* Subroutine */ int ctest_(integer *len, doublecomplex *ccomp, doublecomplex
 	*ctrue, doublecomplex *csize, doublereal *sfac)
 {
     /* System generated locals */
@@ -727,7 +737,7 @@ doublereal sdiff_(doublereal *sa, doublereal *sb)
     /* Local variables */
     integer i__;
     doublereal scomp[20], ssize[20], strue[20];
-    extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *,
 	    doublereal *, doublereal *);
 
 /*     **************************** CTEST ***************************** */
diff --git a/blastest/src/zblat2.c b/blastest/src/zblat2.c
index 030f03b83..5550b413f 100644
--- a/blastest/src/zblat2.c
+++ b/blastest/src/zblat2.c
@@ -157,10 +157,15 @@ static logical c_false = FALSE_;
 /*  ===================================================================== */
 /* Main program */ int main(void)
 {
+#ifdef BLIS_ENABLE_HPX
+    char* program = "zblat2";
+    bli_thread_initialize_hpx( 1, &program );
+#endif
+
     /* Initialized data */
 
-    static char snames[6*17] = "ZGEMV " "ZGBMV " "ZHEMV " "ZHBMV " "ZHPMV " 
-	    "ZTRMV " "ZTBMV " "ZTPMV " "ZTRSV " "ZTBSV " "ZTPSV " "ZGERC " 
+    static char snames[6*17] = "ZGEMV " "ZGBMV " "ZHEMV " "ZHBMV " "ZHPMV "
+	    "ZTRMV " "ZTBMV " "ZTPMV " "ZTRSV " "ZTBSV " "ZTPSV " "ZGERC "
 	    "ZGERU " "ZHER  " "ZHPR  " "ZHER2 " "ZHPR2 ";
 
     /* Format strings */
@@ -208,10 +213,10 @@ static logical c_false = FALSE_;
     cllist cl__1;
 
     /* Builtin functions */
-    integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *,
-	     char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), 
-	    s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, 
+	     char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void),
+	    s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen,
 	    ftnlen);
     /* Subroutine */ int s_stop(char *, ftnlen);
     integer f_clos(cllist *);
@@ -234,53 +239,53 @@ static logical c_false = FALSE_;
     integer ninc, nbet, ntra;
     logical rewi;
     integer nout;
-    extern /* Subroutine */ int zchk1_(char *, doublereal *, doublereal *, 
-	    integer *, integer *, logical *, logical *, logical *, integer *, 
-	    integer *, integer *, integer *, integer *, doublecomplex *, 
-	    integer *, doublecomplex *, integer *, integer *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, doublecomplex *, 
+    extern /* Subroutine */ int zchk1_(char *, doublereal *, doublereal *,
+	    integer *, integer *, logical *, logical *, logical *, integer *,
+	    integer *, integer *, integer *, integer *, doublecomplex *,
+	    integer *, doublecomplex *, integer *, integer *, integer *,
+	    integer *, doublecomplex *, doublecomplex *, doublecomplex *,
 	    doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *
 	    , doublecomplex *, doublecomplex *, doublecomplex *, doublereal *,
-	     ftnlen), zchk2_(char *, doublereal *, doublereal *, integer *, 
-	    integer *, logical *, logical *, logical *, integer *, integer *, 
-	    integer *, integer *, integer *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, integer *, integer *, integer *, 
+	     ftnlen), zchk2_(char *, doublereal *, doublereal *, integer *,
+	    integer *, logical *, logical *, logical *, integer *, integer *,
+	    integer *, integer *, integer *, doublecomplex *, integer *,
+	    doublecomplex *, integer *, integer *, integer *, integer *,
 	    doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *
-	    , doublecomplex *, doublecomplex *, doublecomplex *, 
-	    doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, 
-	    ftnlen), zchk3_(char *, doublereal *, doublereal *, integer *, 
-	    integer *, logical *, logical *, logical *, integer *, integer *, 
-	    integer *, integer *, integer *, integer *, integer *, integer *, 
+	    , doublecomplex *, doublecomplex *, doublecomplex *,
+	    doublecomplex *, doublecomplex *, doublecomplex *, doublereal *,
+	    ftnlen), zchk3_(char *, doublereal *, doublereal *, integer *,
+	    integer *, logical *, logical *, logical *, integer *, integer *,
+	    integer *, integer *, integer *, integer *, integer *, integer *,
 	    doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *
 	    , doublecomplex *, doublecomplex *, doublecomplex *, doublereal *,
-	     doublecomplex *, ftnlen), zchk4_(char *, doublereal *, 
-	    doublereal *, integer *, integer *, logical *, logical *, logical 
-	    *, integer *, integer *, integer *, doublecomplex *, integer *, 
+	     doublecomplex *, ftnlen), zchk4_(char *, doublereal *,
+	    doublereal *, integer *, integer *, logical *, logical *, logical
+	    *, integer *, integer *, integer *, doublecomplex *, integer *,
 	    integer *, integer *, integer *, doublecomplex *, doublecomplex *,
-	     doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex 
-	    *, doublecomplex *, doublecomplex *, doublecomplex *, 
+	     doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex
+	    *, doublecomplex *, doublecomplex *, doublecomplex *,
 	    doublecomplex *, doublereal *, doublecomplex *, ftnlen), zchk5_(
-	    char *, doublereal *, doublereal *, integer *, integer *, logical 
-	    *, logical *, logical *, integer *, integer *, integer *, 
-	    doublecomplex *, integer *, integer *, integer *, integer *, 
+	    char *, doublereal *, doublereal *, integer *, integer *, logical
+	    *, logical *, logical *, integer *, integer *, integer *,
+	    doublecomplex *, integer *, integer *, integer *, integer *,
 	    doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *
-	    , doublecomplex *, doublecomplex *, doublecomplex *, 
-	    doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, 
-	    doublecomplex *, ftnlen), zchk6_(char *, doublereal *, doublereal 
-	    *, integer *, integer *, logical *, logical *, logical *, integer 
-	    *, integer *, integer *, doublecomplex *, integer *, integer *, 
-	    integer *, integer *, doublecomplex *, doublecomplex *, 
+	    , doublecomplex *, doublecomplex *, doublecomplex *,
+	    doublecomplex *, doublecomplex *, doublecomplex *, doublereal *,
+	    doublecomplex *, ftnlen), zchk6_(char *, doublereal *, doublereal
+	    *, integer *, integer *, logical *, logical *, logical *, integer
+	    *, integer *, integer *, doublecomplex *, integer *, integer *,
+	    integer *, integer *, doublecomplex *, doublecomplex *,
 	    doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *
-	    , doublecomplex *, doublecomplex *, doublecomplex *, 
+	    , doublecomplex *, doublecomplex *, doublecomplex *,
 	    doublecomplex *, doublereal *, doublecomplex *, ftnlen);
     logical fatal, trace;
     integer nidim;
     extern /* Subroutine */ int zchke_(integer *, char *, integer *, ftnlen);
     char snaps[32], trans[1];
-    extern /* Subroutine */ int zmvch_(char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, doublereal *, doublecomplex *, doublereal *, 
+    extern /* Subroutine */ int zmvch_(char *, integer *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, doublereal *, doublecomplex *, doublereal *,
 	    doublereal *, logical *, integer *, logical *, ftnlen);
     integer isnum;
     logical ltest[17], sfatal;
@@ -630,7 +635,7 @@ static logical c_false = FALSE_;
 	goto L80;
     }
     for (i__ = 1; i__ <= 17; ++i__) {
-	if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) 
+	if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0)
 		{
 	    goto L70;
 	}
@@ -689,7 +694,7 @@ static logical c_false = FALSE_;
 /*     YY holds the exact result. On exit from ZMVCH YT holds */
 /*     the result computed by ZMVCH. */
     *(unsigned char *)trans = 'N';
-    zmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c__1, &c_b1, y, &c__1, yt, g, 
+    zmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c__1, &c_b1, y, &c__1, yt, g,
 	    yy, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1);
     same = lze_(yy, yt, &n);
     if (! same || err != 0.) {
@@ -702,7 +707,7 @@ static logical c_false = FALSE_;
 	s_stop("", (ftnlen)0);
     }
     *(unsigned char *)trans = 'T';
-    zmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c_n1, &c_b1, y, &c_n1, yt, g, 
+    zmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c_n1, &c_b1, y, &c_n1, yt, g,
 	    yy, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1);
     same = lze_(yy, yt, &n);
     if (! same || err != 0.) {
@@ -763,44 +768,44 @@ static logical c_false = FALSE_;
 /*           Test ZGEMV, 01, and ZGBMV, 02. */
 L140:
 	    zchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, 
-		    &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, 
+		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf,
+		    &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx,
 		    xs, y, yy, ys, yt, g, (ftnlen)6);
 	    goto L200;
 /*           Test ZHEMV, 03, ZHBMV, 04, and ZHPMV, 05. */
 L150:
 	    zchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, 
-		    &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, 
+		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf,
+		    &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx,
 		    xs, y, yy, ys, yt, g, (ftnlen)6);
 	    goto L200;
 /*           Test ZTRMV, 06, ZTBMV, 07, ZTPMV, 08, */
 /*           ZTRSV, 09, ZTBSV, 10, and ZTPSV, 11. */
 L160:
 	    zchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, 
+		    trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc,
 		    &c__65, &c__2, a, aa, as, y, yy, ys, yt, g, z__, (ftnlen)
 		    6);
 	    goto L200;
 /*           Test ZGERC, 12, ZGERU, 13. */
 L170:
 	    zchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, 
-		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc,
+		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt,
 		    g, z__, (ftnlen)6);
 	    goto L200;
 /*           Test ZHER, 14, and ZHPR, 15. */
 L180:
 	    zchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, 
-		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc,
+		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt,
 		    g, z__, (ftnlen)6);
 	    goto L200;
 /*           Test ZHER2, 16, and ZHPR2, 17. */
 L190:
 	    zchk6_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, 
-		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc,
+		    inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt,
 		    g, z__, (ftnlen)6);
 
 L200:
@@ -842,16 +847,21 @@ static logical c_false = FALSE_;
 
 /*     End of ZBLAT2. */
 
-    return 0;
+#ifdef BLIS_ENABLE_HPX
+    return bli_thread_finalize_hpx();
+#else
+	// Return peacefully.
+	return 0;
+#endif
 } /* main */
 
-/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, 
-	integer *nalf, doublecomplex *alf, integer *nbet, doublecomplex *bet, 
-	integer *ninc, integer *inc, integer *nmax, integer *incmax, 
-	doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex 
-	*x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, 
+	fatal, integer *nidim, integer *idim, integer *nkb, integer *kb,
+	integer *nalf, doublecomplex *alf, integer *nbet, doublecomplex *bet,
+	integer *ninc, integer *inc, integer *nmax, integer *incmax,
+	doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex
+	*x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y,
 	doublecomplex *yy, doublecomplex *ys, doublecomplex *yt, doublereal *
 	g, ftnlen sname_len)
 {
@@ -881,7 +891,7 @@ static logical c_false = FALSE_;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, 
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8,
 	    i__9;
     alist al__1;
 
@@ -904,7 +914,7 @@ static logical c_false = FALSE_;
     logical full, tran, null;
     doublecomplex alpha;
     logical isame[13];
-    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *,
 	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
 	     integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen,
 	     ftnlen);
@@ -912,22 +922,22 @@ static logical c_false = FALSE_;
     logical reset;
     integer incxs, incys;
     extern /* Subroutine */ int zgbmv_(char *, integer *, integer *, integer *
-	    , integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    , integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *,
 	    integer *, ftnlen);
     char trans[1];
-    extern /* Subroutine */ int zgemv_(char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, ftnlen), 
-	    zmvch_(char *, integer *, integer *, doublecomplex *, 
-	    doublecomplex *, integer *, doublecomplex *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
-	    doublereal *, doublecomplex *, doublereal *, doublereal *, 
+    extern /* Subroutine */ int zgemv_(char *, integer *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+	    integer *, doublecomplex *, doublecomplex *, integer *, ftnlen),
+	    zmvch_(char *, integer *, integer *, doublecomplex *,
+	    doublecomplex *, integer *, doublecomplex *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+	    doublereal *, doublecomplex *, doublereal *, doublereal *,
 	    logical *, integer *, logical *, ftnlen);
     logical banded;
     doublereal errmax;
     doublecomplex transl;
-    extern logical lzeres_(char *, char *, integer *, integer *, 
+    extern logical lzeres_(char *, char *, integer *, integer *,
 	    doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen);
     char transs[1];
 
@@ -1108,9 +1118,9 @@ static logical c_false = FALSE_;
 				    transl.r = 0., transl.i = 0.;
 				    i__7 = abs(incy);
 				    i__8 = ml - 1;
-				    zmake_("GE", " ", " ", &c__1, &ml, &y[1], 
+				    zmake_("GE", " ", " ", &c__1, &ml, &y[1],
 					    &c__1, &yy[1], &i__7, &c__0, &
-					    i__8, &reset, &transl, (ftnlen)2, 
+					    i__8, &reset, &transl, (ftnlen)2,
 					    (ftnlen)1, (ftnlen)1);
 
 				    ++nc;
@@ -1118,7 +1128,7 @@ static logical c_false = FALSE_;
 /*                             Save every datum before calling the */
 /*                             subroutine. */
 
-				    *(unsigned char *)transs = *(unsigned 
+				    *(unsigned char *)transs = *(unsigned
 					    char *)trans;
 				    ms = m;
 				    ns = n;
@@ -1129,7 +1139,7 @@ static logical c_false = FALSE_;
 				    for (i__ = 1; i__ <= i__7; ++i__) {
 					i__8 = i__;
 					i__9 = i__;
-					as[i__8].r = aa[i__9].r, as[i__8].i = 
+					as[i__8].r = aa[i__9].r, as[i__8].i =
 						aa[i__9].i;
 /* L10: */
 				    }
@@ -1138,7 +1148,7 @@ static logical c_false = FALSE_;
 				    for (i__ = 1; i__ <= i__7; ++i__) {
 					i__8 = i__;
 					i__9 = i__;
-					xs[i__8].r = xx[i__9].r, xs[i__8].i = 
+					xs[i__8].r = xx[i__9].r, xs[i__8].i =
 						xx[i__9].i;
 /* L20: */
 				    }
@@ -1148,7 +1158,7 @@ static logical c_false = FALSE_;
 				    for (i__ = 1; i__ <= i__7; ++i__) {
 					i__8 = i__;
 					i__9 = i__;
-					ys[i__8].r = yy[i__9].r, ys[i__8].i = 
+					ys[i__8].r = yy[i__9].r, ys[i__8].i =
 						yy[i__9].i;
 /* L30: */
 				    }
@@ -1187,7 +1197,7 @@ static logical c_false = FALSE_;
 					    al__1.aunit = *ntra;
 					    f_rew(&al__1);
 					}
-					zgemv_(trans, &m, &n, &alpha, &aa[1], 
+					zgemv_(trans, &m, &n, &alpha, &aa[1],
 						&lda, &xx[1], &incx, &beta, &
 						yy[1], &incy, (ftnlen)1);
 				    } else if (banded) {
@@ -1248,7 +1258,7 @@ static logical c_false = FALSE_;
 				    isame[1] = ms == m;
 				    isame[2] = ns == n;
 				    if (full) {
-					isame[3] = als.r == alpha.r && als.i 
+					isame[3] = als.r == alpha.r && als.i
 						== alpha.i;
 					isame[4] = lze_(&as[1], &aa[1], &laa);
 					isame[5] = ldas == lda;
@@ -1270,13 +1280,13 @@ static logical c_false = FALSE_;
 				    } else if (banded) {
 					isame[3] = kls == kl;
 					isame[4] = kus == ku;
-					isame[5] = als.r == alpha.r && als.i 
+					isame[5] = als.r == alpha.r && als.i
 						== alpha.i;
 					isame[6] = lze_(&as[1], &aa[1], &laa);
 					isame[7] = ldas == lda;
 					isame[8] = lze_(&xs[1], &xx[1], &lx);
 					isame[9] = incxs == incx;
-					isame[10] = bls.r == beta.r && bls.i 
+					isame[10] = bls.r == beta.r && bls.i
 						== beta.i;
 					if (null) {
 					    isame[11] = lze_(&ys[1], &yy[1], &
@@ -1318,8 +1328,8 @@ static logical c_false = FALSE_;
 
 					zmvch_(trans, &m, &n, &alpha, &a[
 						a_offset], nmax, &x[1], &incx,
-						 &beta, &y[1], &incy, &yt[1], 
-						&g[1], &yy[1], eps, &err, 
+						 &beta, &y[1], &incy, &yt[1],
+						&g[1], &yy[1], eps, &err,
 						fatal, nout, &c_true, (ftnlen)
 						1);
 					errmax = max(errmax,err);
@@ -1423,13 +1433,13 @@ static logical c_false = FALSE_;
 
 } /* zchk1_ */
 
-/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, 
-	integer *nalf, doublecomplex *alf, integer *nbet, doublecomplex *bet, 
-	integer *ninc, integer *inc, integer *nmax, integer *incmax, 
-	doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex 
-	*x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, 
+	fatal, integer *nidim, integer *idim, integer *nkb, integer *kb,
+	integer *nalf, doublecomplex *alf, integer *nbet, doublecomplex *bet,
+	integer *ninc, integer *inc, integer *nmax, integer *incmax,
+	doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex
+	*x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y,
 	doublecomplex *yy, doublecomplex *ys, doublecomplex *yt, doublereal *
 	g, ftnlen sname_len)
 {
@@ -1463,7 +1473,7 @@ static logical c_false = FALSE_;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, 
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8,
 	    i__9;
     alist al__1;
 
@@ -1472,7 +1482,7 @@ static logical c_false = FALSE_;
 	     f_rew(alist *);
 
     /* Local variables */
-    integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, 
+    integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly,
 	    laa, lda;
     doublecomplex als, bls;
     doublereal err;
@@ -1485,31 +1495,31 @@ static logical c_false = FALSE_;
     char uplo[1];
     doublecomplex alpha;
     logical isame[13];
-    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *,
 	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
 	     integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen,
 	     ftnlen);
     integer nargs;
     logical reset;
     integer incxs, incys;
-    extern /* Subroutine */ int zhbmv_(char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, ftnlen), 
-	    zmvch_(char *, integer *, integer *, doublecomplex *, 
-	    doublecomplex *, integer *, doublecomplex *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
-	    doublereal *, doublecomplex *, doublereal *, doublereal *, 
+    extern /* Subroutine */ int zhbmv_(char *, integer *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+	    integer *, doublecomplex *, doublecomplex *, integer *, ftnlen),
+	    zmvch_(char *, integer *, integer *, doublecomplex *,
+	    doublecomplex *, integer *, doublecomplex *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+	    doublereal *, doublecomplex *, doublereal *, doublereal *,
 	    logical *, integer *, logical *, ftnlen), zhemv_(char *, integer *
-	    , doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
+	    , doublecomplex *, doublecomplex *, integer *, doublecomplex *,
 	    integer *, doublecomplex *, doublecomplex *, integer *, ftnlen);
     char uplos[1];
-    extern /* Subroutine */ int zhpmv_(char *, integer *, doublecomplex *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
+    extern /* Subroutine */ int zhpmv_(char *, integer *, doublecomplex *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
 	    doublecomplex *, integer *, ftnlen);
     logical banded, packed;
     doublereal errmax;
     doublecomplex transl;
-    extern logical lzeres_(char *, char *, integer *, integer *, 
+    extern logical lzeres_(char *, char *, integer *, integer *,
 	    doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen);
 
     /* Fortran I/O blocks */
@@ -1672,7 +1682,7 @@ static logical c_false = FALSE_;
 				i__8 = n - 1;
 				zmake_("GE", " ", " ", &c__1, &n, &y[1], &
 					c__1, &yy[1], &i__7, &c__0, &i__8, &
-					reset, &transl, (ftnlen)2, (ftnlen)1, 
+					reset, &transl, (ftnlen)2, (ftnlen)1,
 					(ftnlen)1);
 
 				++nc;
@@ -1824,13 +1834,13 @@ static logical c_false = FALSE_;
 					unsigned char *)uplos;
 				isame[1] = ns == n;
 				if (full) {
-				    isame[2] = als.r == alpha.r && als.i == 
+				    isame[2] = als.r == alpha.r && als.i ==
 					    alpha.i;
 				    isame[3] = lze_(&as[1], &aa[1], &laa);
 				    isame[4] = ldas == lda;
 				    isame[5] = lze_(&xs[1], &xx[1], &lx);
 				    isame[6] = incxs == incx;
-				    isame[7] = bls.r == beta.r && bls.i == 
+				    isame[7] = bls.r == beta.r && bls.i ==
 					    beta.i;
 				    if (null) {
 					isame[8] = lze_(&ys[1], &yy[1], &ly);
@@ -1843,13 +1853,13 @@ static logical c_false = FALSE_;
 				    isame[9] = incys == incy;
 				} else if (banded) {
 				    isame[2] = ks == k;
-				    isame[3] = als.r == alpha.r && als.i == 
+				    isame[3] = als.r == alpha.r && als.i ==
 					    alpha.i;
 				    isame[4] = lze_(&as[1], &aa[1], &laa);
 				    isame[5] = ldas == lda;
 				    isame[6] = lze_(&xs[1], &xx[1], &lx);
 				    isame[7] = incxs == incx;
-				    isame[8] = bls.r == beta.r && bls.i == 
+				    isame[8] = bls.r == beta.r && bls.i ==
 					    beta.i;
 				    if (null) {
 					isame[9] = lze_(&ys[1], &yy[1], &ly);
@@ -1861,12 +1871,12 @@ static logical c_false = FALSE_;
 				    }
 				    isame[10] = incys == incy;
 				} else if (packed) {
-				    isame[2] = als.r == alpha.r && als.i == 
+				    isame[2] = als.r == alpha.r && als.i ==
 					    alpha.i;
 				    isame[3] = lze_(&as[1], &aa[1], &laa);
 				    isame[4] = lze_(&xs[1], &xx[1], &lx);
 				    isame[5] = incxs == incx;
-				    isame[6] = bls.r == beta.r && bls.i == 
+				    isame[6] = bls.r == beta.r && bls.i ==
 					    beta.i;
 				    if (null) {
 					isame[7] = lze_(&ys[1], &yy[1], &ly);
@@ -1904,8 +1914,8 @@ static logical c_false = FALSE_;
 
 /*                             Check the result. */
 
-				    zmvch_("N", &n, &n, &alpha, &a[a_offset], 
-					    nmax, &x[1], &incx, &beta, &y[1], 
+				    zmvch_("N", &n, &n, &alpha, &a[a_offset],
+					    nmax, &x[1], &incx, &beta, &y[1],
 					    &incy, &yt[1], &g[1], &yy[1], eps,
 					     &err, fatal, nout, &c_true, (
 					    ftnlen)1);
@@ -2015,12 +2025,12 @@ static logical c_false = FALSE_;
 
 } /* zchk2_ */
 
-/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, 
-	integer *ninc, integer *inc, integer *nmax, integer *incmax, 
-	doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex 
-	*x, doublecomplex *xx, doublecomplex *xs, doublecomplex *xt, 
+	fatal, integer *nidim, integer *idim, integer *nkb, integer *kb,
+	integer *ninc, integer *inc, integer *nmax, integer *incmax,
+	doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex
+	*x, doublecomplex *xx, doublecomplex *xs, doublecomplex *xt,
 	doublereal *g, doublecomplex *z__, ftnlen sname_len)
 {
     /* Initialized data */
@@ -2060,7 +2070,7 @@ static logical c_false = FALSE_;
 	    integer *, char *, ftnlen), e_wsfe(void), f_rew(alist *);
 
     /* Local variables */
-    integer i__, k, n, nc, ik, in, nk, ks, ix, ns, lx, laa, icd, lda, ict, 
+    integer i__, k, n, nc, ik, in, nk, ks, ix, ns, lx, laa, icd, lda, ict,
 	    icu;
     doublereal err;
     extern logical lze_(doublecomplex *, doublecomplex *, integer *);
@@ -2071,7 +2081,7 @@ static logical c_false = FALSE_;
     logical full, null;
     char uplo[1], diags[1];
     logical isame[13];
-    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *,
 	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
 	     integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen,
 	     ftnlen);
@@ -2079,28 +2089,28 @@ static logical c_false = FALSE_;
     logical reset;
     integer incxs;
     char trans[1];
-    extern /* Subroutine */ int zmvch_(char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, doublereal *, doublecomplex *, doublereal *, 
+    extern /* Subroutine */ int zmvch_(char *, integer *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, doublereal *, doublecomplex *, doublereal *,
 	    doublereal *, logical *, integer *, logical *, ftnlen);
     char uplos[1];
-    extern /* Subroutine */ int ztbmv_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int ztbmv_(char *, char *, char *, integer *,
 	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
 	     ftnlen, ftnlen, ftnlen), ztbsv_(char *, char *, char *, integer *
-	    , integer *, doublecomplex *, integer *, doublecomplex *, integer 
-	    *, ftnlen, ftnlen, ftnlen), ztpmv_(char *, char *, char *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, 
-	    ftnlen, ftnlen), ztrmv_(char *, char *, char *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, 
-	    ftnlen, ftnlen), ztpsv_(char *, char *, char *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen, 
+	    , integer *, doublecomplex *, integer *, doublecomplex *, integer
+	    *, ftnlen, ftnlen, ftnlen), ztpmv_(char *, char *, char *,
+	    integer *, doublecomplex *, doublecomplex *, integer *, ftnlen,
+	    ftnlen, ftnlen), ztrmv_(char *, char *, char *, integer *,
+	    doublecomplex *, integer *, doublecomplex *, integer *, ftnlen,
+	    ftnlen, ftnlen), ztpsv_(char *, char *, char *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen,
 	    ftnlen), ztrsv_(char *, char *, char *, integer *, doublecomplex *
 	    , integer *, doublecomplex *, integer *, ftnlen, ftnlen, ftnlen);
     logical banded, packed;
     doublereal errmax;
     doublecomplex transl;
-    extern logical lzeres_(char *, char *, integer *, integer *, 
+    extern logical lzeres_(char *, char *, integer *, integer *,
 	    doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen);
     char transs[1];
 
@@ -2226,13 +2236,13 @@ static logical c_false = FALSE_;
 			    ;
 
 		    for (icd = 1; icd <= 2; ++icd) {
-			*(unsigned char *)diag = *(unsigned char *)&ichd[icd 
+			*(unsigned char *)diag = *(unsigned char *)&ichd[icd
 				- 1];
 
 /*                    Generate the matrix A. */
 
 			transl.r = 0., transl.i = 0.;
-			zmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], 
+			zmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset],
 				nmax, &aa[1], &lda, &k, &k, &reset, &transl, (
 				ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -2287,7 +2297,7 @@ static logical c_false = FALSE_;
 
 /*                       Call the subroutine. */
 
-			    if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) 
+			    if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2)
 				    == 0) {
 				if (full) {
 				    if (*trace) {
@@ -2340,7 +2350,7 @@ static logical c_false = FALSE_;
 					al__1.aunit = *ntra;
 					f_rew(&al__1);
 				    }
-				    ztbmv_(uplo, trans, diag, &n, &k, &aa[1], 
+				    ztbmv_(uplo, trans, diag, &n, &k, &aa[1],
 					    &lda, &xx[1], &incx, (ftnlen)1, (
 					    ftnlen)1, (ftnlen)1);
 				} else if (packed) {
@@ -2421,7 +2431,7 @@ static logical c_false = FALSE_;
 					al__1.aunit = *ntra;
 					f_rew(&al__1);
 				    }
-				    ztbsv_(uplo, trans, diag, &n, &k, &aa[1], 
+				    ztbsv_(uplo, trans, diag, &n, &k, &aa[1],
 					    &lda, &xx[1], &incx, (ftnlen)1, (
 					    ftnlen)1, (ftnlen)1);
 				} else if (packed) {
@@ -2463,11 +2473,11 @@ static logical c_false = FALSE_;
 
 /*                       See what data changed inside subroutines. */
 
-			    isame[0] = *(unsigned char *)uplo == *(unsigned 
+			    isame[0] = *(unsigned char *)uplo == *(unsigned
 				    char *)uplos;
-			    isame[1] = *(unsigned char *)trans == *(unsigned 
+			    isame[1] = *(unsigned char *)trans == *(unsigned
 				    char *)transs;
-			    isame[2] = *(unsigned char *)diag == *(unsigned 
+			    isame[2] = *(unsigned char *)diag == *(unsigned
 				    char *)diags;
 			    isame[3] = ns == n;
 			    if (full) {
@@ -2537,7 +2547,7 @@ static logical c_false = FALSE_;
 
 				    zmvch_(trans, &n, &n, &c_b2, &a[a_offset],
 					     nmax, &x[1], &incx, &c_b1, &z__[
-					    1], &incx, &xt[1], &g[1], &xx[1], 
+					    1], &incx, &xt[1], &g[1], &xx[1],
 					    eps, &err, fatal, nout, &c_true, (
 					    ftnlen)1);
 				} else if (s_cmp(sname + 3, "SV", (ftnlen)2, (
@@ -2549,18 +2559,18 @@ static logical c_false = FALSE_;
 				    for (i__ = 1; i__ <= i__4; ++i__) {
 					i__5 = i__;
 					i__6 = (i__ - 1) * abs(incx) + 1;
-					z__[i__5].r = xx[i__6].r, z__[i__5].i 
+					z__[i__5].r = xx[i__6].r, z__[i__5].i
 						= xx[i__6].i;
 					i__5 = (i__ - 1) * abs(incx) + 1;
 					i__6 = i__;
-					xx[i__5].r = x[i__6].r, xx[i__5].i = 
+					xx[i__5].r = x[i__6].r, xx[i__5].i =
 						x[i__6].i;
 /* L50: */
 				    }
 				    zmvch_(trans, &n, &n, &c_b2, &a[a_offset],
 					     nmax, &z__[1], &incx, &c_b1, &x[
-					    1], &incx, &xt[1], &g[1], &xx[1], 
-					    eps, &err, fatal, nout, &c_false, 
+					    1], &incx, &xt[1], &g[1], &xx[1],
+					    eps, &err, fatal, nout, &c_false,
 					    (ftnlen)1);
 				}
 				errmax = max(errmax,err);
@@ -2662,12 +2672,12 @@ static logical c_false = FALSE_;
 
 } /* zchk3_ */
 
-/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
 	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
-	alf, integer *ninc, integer *inc, integer *nmax, integer *incmax, 
-	doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex 
-	*x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, 
+	alf, integer *ninc, integer *inc, integer *nmax, integer *incmax,
+	doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex
+	*x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y,
 	doublecomplex *yy, doublecomplex *ys, doublecomplex *yt, doublereal *
 	g, doublecomplex *z__, ftnlen sname_len)
 {
@@ -2713,26 +2723,26 @@ static logical c_false = FALSE_;
     logical null;
     doublecomplex alpha;
     logical isame[13];
-    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *,
 	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
 	     integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen,
 	     ftnlen);
     integer nargs;
-    extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, 
-	    doublecomplex *, integer *, doublecomplex *, integer *, 
+    extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *,
+	    doublecomplex *, integer *, doublecomplex *, integer *,
 	    doublecomplex *, integer *);
     logical reset;
     integer incxs, incys;
-    extern /* Subroutine */ int zmvch_(char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, doublereal *, doublecomplex *, doublereal *, 
+    extern /* Subroutine */ int zmvch_(char *, integer *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, doublereal *, doublecomplex *, doublereal *,
 	    doublereal *, logical *, integer *, logical *, ftnlen), zgeru_(
 	    integer *, integer *, doublecomplex *, doublecomplex *, integer *,
 	     doublecomplex *, integer *, doublecomplex *, integer *);
     doublereal errmax;
     doublecomplex transl;
-    extern logical lzeres_(char *, char *, integer *, integer *, 
+    extern logical lzeres_(char *, char *, integer *, integer *,
 	    doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen);
 
     /* Fortran I/O blocks */
@@ -2834,7 +2844,7 @@ static logical c_false = FALSE_;
 		i__3 = abs(incx);
 		i__4 = m - 1;
 		zmake_("GE", " ", " ", &c__1, &m, &x[1], &c__1, &xx[1], &i__3,
-			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, 
+			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1,
 			(ftnlen)1);
 		if (m > 1) {
 		    i__3 = m / 2;
@@ -2873,7 +2883,7 @@ static logical c_false = FALSE_;
 			transl.r = 0., transl.i = 0.;
 			i__5 = m - 1;
 			i__6 = n - 1;
-			zmake_(sname + 1, " ", " ", &m, &n, &a[a_offset], 
+			zmake_(sname + 1, " ", " ", &m, &n, &a[a_offset],
 				nmax, &aa[1], &lda, &i__5, &i__6, &reset, &
 				transl, (ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -3032,9 +3042,9 @@ static logical c_false = FALSE_;
 				    d_cnjg(&z__1, w);
 				    w[0].r = z__1.r, w[0].i = z__1.i;
 				}
-				zmvch_("N", &m, &c__1, &alpha, &z__[1], nmax, 
+				zmvch_("N", &m, &c__1, &alpha, &z__[1], nmax,
 					w, &c__1, &c_b2, &a[j * a_dim1 + 1], &
-					c__1, &yt[1], &g[1], &aa[(j - 1) * 
+					c__1, &yt[1], &g[1], &aa[(j - 1) *
 					lda + 1], eps, &err, fatal, nout, &
 					c_true, (ftnlen)1);
 				errmax = max(errmax,err);
@@ -3114,12 +3124,12 @@ static logical c_false = FALSE_;
 
 } /* zchk4_ */
 
-/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
 	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
-	alf, integer *ninc, integer *inc, integer *nmax, integer *incmax, 
-	doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex 
-	*x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, 
+	alf, integer *ninc, integer *inc, integer *nmax, integer *incmax,
+	doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex
+	*x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y,
 	doublecomplex *yy, doublecomplex *ys, doublecomplex *yt, doublereal *
 	g, doublecomplex *z__, ftnlen sname_len)
 {
@@ -3169,32 +3179,32 @@ static logical c_false = FALSE_;
     doublereal rals;
     integer incx;
     logical full;
-    extern /* Subroutine */ int zher_(char *, integer *, doublereal *, 
+    extern /* Subroutine */ int zher_(char *, integer *, doublereal *,
 	    doublecomplex *, integer *, doublecomplex *, integer *, ftnlen);
     logical null;
     char uplo[1];
-    extern /* Subroutine */ int zhpr_(char *, integer *, doublereal *, 
+    extern /* Subroutine */ int zhpr_(char *, integer *, doublereal *,
 	    doublecomplex *, integer *, doublecomplex *, ftnlen);
     doublecomplex alpha;
     logical isame[13];
-    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *,
 	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
 	     integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen,
 	     ftnlen);
     integer nargs;
     logical reset;
     integer incxs;
-    extern /* Subroutine */ int zmvch_(char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, doublereal *, doublecomplex *, doublereal *, 
+    extern /* Subroutine */ int zmvch_(char *, integer *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, doublereal *, doublecomplex *, doublereal *,
 	    doublereal *, logical *, integer *, logical *, ftnlen);
     logical upper;
     char uplos[1];
     logical packed;
     doublereal ralpha, errmax;
     doublecomplex transl;
-    extern logical lzeres_(char *, char *, integer *, integer *, 
+    extern logical lzeres_(char *, char *, integer *, integer *,
 	    doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen);
 
     /* Fortran I/O blocks */
@@ -3297,7 +3307,7 @@ static logical c_false = FALSE_;
 		i__3 = abs(incx);
 		i__4 = n - 1;
 		zmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3,
-			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, 
+			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1,
 			(ftnlen)1);
 		if (n > 1) {
 		    i__3 = n / 2;
@@ -3372,7 +3382,7 @@ static logical c_false = FALSE_;
 			    al__1.aunit = *ntra;
 			    f_rew(&al__1);
 			}
-			zher_(uplo, &n, &ralpha, &xx[1], &incx, &aa[1], &lda, 
+			zher_(uplo, &n, &ralpha, &xx[1], &incx, &aa[1], &lda,
 				(ftnlen)1);
 		    } else if (packed) {
 			if (*trace) {
@@ -3482,9 +3492,9 @@ static logical c_false = FALSE_;
 				jj = j;
 				lj = n - j + 1;
 			    }
-			    zmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, 
-				    &c__1, &c_b2, &a[jj + j * a_dim1], &c__1, 
-				    &yt[1], &g[1], &aa[ja], eps, &err, fatal, 
+			    zmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w,
+				    &c__1, &c_b2, &a[jj + j * a_dim1], &c__1,
+				    &yt[1], &g[1], &aa[ja], eps, &err, fatal,
 				    nout, &c_true, (ftnlen)1);
 			    if (full) {
 				if (upper) {
@@ -3582,12 +3592,12 @@ static logical c_false = FALSE_;
 
 } /* zchk5_ */
 
-/* Subroutine */ int zchk6_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int zchk6_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
 	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
-	alf, integer *ninc, integer *inc, integer *nmax, integer *incmax, 
-	doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex 
-	*x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, 
+	alf, integer *ninc, integer *inc, integer *nmax, integer *incmax,
+	doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex
+	*x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y,
 	doublecomplex *yy, doublecomplex *ys, doublecomplex *yt, doublereal *
 	g, doublecomplex *z__, ftnlen sname_len)
 {
@@ -3617,7 +3627,7 @@ static logical c_false = FALSE_;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, 
+    integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5,
 	    i__6, i__7;
     doublecomplex z__1, z__2, z__3;
     alist al__1;
@@ -3639,31 +3649,31 @@ static logical c_false = FALSE_;
     integer incx, incy;
     logical full, null;
     char uplo[1];
-    extern /* Subroutine */ int zher2_(char *, integer *, doublecomplex *, 
-	    doublecomplex *, integer *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, ftnlen), zhpr2_(char *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
+    extern /* Subroutine */ int zher2_(char *, integer *, doublecomplex *,
+	    doublecomplex *, integer *, doublecomplex *, integer *,
+	    doublecomplex *, integer *, ftnlen), zhpr2_(char *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
 	    integer *, doublecomplex *, ftnlen);
     doublecomplex alpha;
     logical isame[13];
-    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *,
 	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
 	     integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen,
 	     ftnlen);
     integer nargs;
     logical reset;
     integer incxs, incys;
-    extern /* Subroutine */ int zmvch_(char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, doublereal *, doublecomplex *, doublereal *, 
+    extern /* Subroutine */ int zmvch_(char *, integer *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, doublereal *, doublecomplex *, doublereal *,
 	    doublereal *, logical *, integer *, logical *, ftnlen);
     logical upper;
     char uplos[1];
     logical packed;
     doublereal errmax;
     doublecomplex transl;
-    extern logical lzeres_(char *, char *, integer *, integer *, 
+    extern logical lzeres_(char *, char *, integer *, integer *,
 	    doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen);
 
     /* Fortran I/O blocks */
@@ -3768,7 +3778,7 @@ static logical c_false = FALSE_;
 		i__3 = abs(incx);
 		i__4 = n - 1;
 		zmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3,
-			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, 
+			 &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1,
 			(ftnlen)1);
 		if (n > 1) {
 		    i__3 = n / 2;
@@ -3808,7 +3818,7 @@ static logical c_false = FALSE_;
 			transl.r = 0., transl.i = 0.;
 			i__5 = n - 1;
 			i__6 = n - 1;
-			zmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], 
+			zmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset],
 				nmax, &aa[1], &lda, &i__5, &i__6, &reset, &
 				transl, (ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -3996,14 +4006,14 @@ static logical c_false = FALSE_;
 			    i__5 = n;
 			    for (j = 1; j <= i__5; ++j) {
 				d_cnjg(&z__2, &z__[j + (z_dim1 << 1)]);
-				z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, 
-					z__1.i = alpha.r * z__2.i + alpha.i * 
+				z__1.r = alpha.r * z__2.r - alpha.i * z__2.i,
+					z__1.i = alpha.r * z__2.i + alpha.i *
 					z__2.r;
 				w[0].r = z__1.r, w[0].i = z__1.i;
 				d_cnjg(&z__2, &alpha);
 				d_cnjg(&z__3, &z__[j + z_dim1]);
-				z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, 
-					z__1.i = z__2.r * z__3.i + z__2.i * 
+				z__1.r = z__2.r * z__3.r - z__2.i * z__3.i,
+					z__1.i = z__2.r * z__3.i + z__2.i *
 					z__3.r;
 				w[1].r = z__1.r, w[1].i = z__1.i;
 				if (upper) {
@@ -4013,8 +4023,8 @@ static logical c_false = FALSE_;
 				    jj = j;
 				    lj = n - j + 1;
 				}
-				zmvch_("N", &lj, &c__2, &c_b2, &z__[jj + 
-					z_dim1], nmax, w, &c__1, &c_b2, &a[jj 
+				zmvch_("N", &lj, &c__2, &c_b2, &z__[jj +
+					z_dim1], nmax, w, &c__1, &c_b2, &a[jj
 					+ j * a_dim1], &c__1, &yt[1], &g[1], &
 					aa[ja], eps, &err, fatal, nout, &
 					c_true, (ftnlen)1);
@@ -4119,7 +4129,7 @@ static logical c_false = FALSE_;
 
 } /* zchk6_ */
 
-/* Subroutine */ int zchke_(integer *isnum, char *srnamt, integer *nout, 
+/* Subroutine */ int zchke_(integer *isnum, char *srnamt, integer *nout,
 	ftnlen srnamt_len)
 {
     /* Format strings */
@@ -4133,47 +4143,47 @@ static logical c_false = FALSE_;
 
     /* Local variables */
     doublecomplex a[1]	/* was [1][1] */, x[1], y[1], beta;
-    extern /* Subroutine */ int zher_(char *, integer *, doublereal *, 
-	    doublecomplex *, integer *, doublecomplex *, integer *, ftnlen), 
+    extern /* Subroutine */ int zher_(char *, integer *, doublereal *,
+	    doublecomplex *, integer *, doublecomplex *, integer *, ftnlen),
 	    zhpr_(char *, integer *, doublereal *, doublecomplex *, integer *,
-	     doublecomplex *, ftnlen), zher2_(char *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
-	    integer *, doublecomplex *, integer *, ftnlen), zhpr2_(char *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
+	     doublecomplex *, ftnlen), zher2_(char *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+	    integer *, doublecomplex *, integer *, ftnlen), zhpr2_(char *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
 	    doublecomplex *, integer *, doublecomplex *, ftnlen);
     doublecomplex alpha;
-    extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, 
-	    doublecomplex *, integer *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *), zgbmv_(char *, integer *, integer *, 
+    extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *,
+	    doublecomplex *, integer *, doublecomplex *, integer *,
+	    doublecomplex *, integer *), zgbmv_(char *, integer *, integer *,
 	    integer *, integer *, doublecomplex *, doublecomplex *, integer *,
-	     doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, ftnlen), zhbmv_(char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, ftnlen), 
-	    zgemv_(char *, integer *, integer *, doublecomplex *, 
-	    doublecomplex *, integer *, doublecomplex *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, ftnlen), zhemv_(char 
-	    *, integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, ftnlen), zgeru_(integer *, integer *, doublecomplex *, 
-	    doublecomplex *, integer *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *), ztbmv_(char *, char *, char *, 
+	     doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+	    integer *, ftnlen), zhbmv_(char *, integer *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+	    integer *, doublecomplex *, doublecomplex *, integer *, ftnlen),
+	    zgemv_(char *, integer *, integer *, doublecomplex *,
+	    doublecomplex *, integer *, doublecomplex *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, ftnlen), zhemv_(char
+	    *, integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+	    integer *, ftnlen), zgeru_(integer *, integer *, doublecomplex *,
+	    doublecomplex *, integer *, doublecomplex *, integer *,
+	    doublecomplex *, integer *), ztbmv_(char *, char *, char *,
 	    integer *, integer *, doublecomplex *, integer *, doublecomplex *,
-	     integer *, ftnlen, ftnlen, ftnlen), zhpmv_(char *, integer *, 
-	    doublecomplex *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, ftnlen), ztbsv_(char 
-	    *, char *, char *, integer *, integer *, doublecomplex *, integer 
+	     integer *, ftnlen, ftnlen, ftnlen), zhpmv_(char *, integer *,
+	    doublecomplex *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, ftnlen), ztbsv_(char
+	    *, char *, char *, integer *, integer *, doublecomplex *, integer
 	    *, doublecomplex *, integer *, ftnlen, ftnlen, ftnlen), ztpmv_(
-	    char *, char *, char *, integer *, doublecomplex *, doublecomplex 
-	    *, integer *, ftnlen, ftnlen, ftnlen), ztrmv_(char *, char *, 
-	    char *, integer *, doublecomplex *, integer *, doublecomplex *, 
+	    char *, char *, char *, integer *, doublecomplex *, doublecomplex
+	    *, integer *, ftnlen, ftnlen, ftnlen), ztrmv_(char *, char *,
+	    char *, integer *, doublecomplex *, integer *, doublecomplex *,
 	    integer *, ftnlen, ftnlen, ftnlen), ztpsv_(char *, char *, char *,
-	     integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, 
-	    ftnlen, ftnlen), ztrsv_(char *, char *, char *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, 
+	     integer *, doublecomplex *, doublecomplex *, integer *, ftnlen,
+	    ftnlen, ftnlen), ztrsv_(char *, char *, char *, integer *,
+	    doublecomplex *, integer *, doublecomplex *, integer *, ftnlen,
 	    ftnlen, ftnlen);
     doublereal ralpha;
-    extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical 
+    extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical
 	    *, logical *, ftnlen);
 
     /* Fortran I/O blocks */
@@ -4702,9 +4712,9 @@ static logical c_false = FALSE_;
 
 } /* zchke_ */
 
-/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m, 
-	integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa, 
-	integer *lda, integer *kl, integer *ku, logical *reset, doublecomplex 
+/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m,
+	integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa,
+	integer *lda, integer *kl, integer *ku, logical *reset, doublecomplex
 	*transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
@@ -4765,7 +4775,7 @@ static logical c_false = FALSE_;
 	i__2 = *m;
 	for (i__ = 1; i__ <= i__2; ++i__) {
 	    if (gen || upper && i__ <= j || lower && i__ >= j) {
-		if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) 
+		if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl)
 			{
 		    i__3 = i__ + j * a_dim1;
 		    zbeg_(&z__2, reset);
@@ -4998,11 +5008,11 @@ static logical c_false = FALSE_;
 
 } /* zmake_ */
 
-/* Subroutine */ int zmvch_(char *trans, integer *m, integer *n, 
+/* Subroutine */ int zmvch_(char *trans, integer *m, integer *n,
 	doublecomplex *alpha, doublecomplex *a, integer *nmax, doublecomplex *
 	x, integer *incx, doublecomplex *beta, doublecomplex *y, integer *
-	incy, doublecomplex *yt, doublereal *g, doublecomplex *yy, doublereal 
-	*eps, doublereal *err, logical *fatal, integer *nout, logical *mv, 
+	incy, doublecomplex *yt, doublereal *g, doublecomplex *yy, doublereal
+	*eps, doublereal *err, logical *fatal, integer *nout, logical *mv,
 	ftnlen trans_len)
 {
     /* Format strings */
@@ -5105,15 +5115,15 @@ static logical c_false = FALSE_;
 		i__4 = iy;
 		i__5 = j + i__ * a_dim1;
 		i__6 = jx;
-		z__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, 
+		z__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i,
 			z__2.i = a[i__5].r * x[i__6].i + a[i__5].i * x[i__6]
 			.r;
 		z__1.r = yt[i__4].r + z__2.r, z__1.i = yt[i__4].i + z__2.i;
 		yt[i__3].r = z__1.r, yt[i__3].i = z__1.i;
 		i__3 = j + i__ * a_dim1;
 		i__4 = jx;
-		g[iy] += ((d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[j 
-			+ i__ * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r, 
+		g[iy] += ((d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[j
+			+ i__ * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r,
 			abs(d__3)) + (d__4 = d_imag(&x[jx]), abs(d__4)));
 		jx += incxl;
 /* L10: */
@@ -5125,14 +5135,14 @@ static logical c_false = FALSE_;
 		i__4 = iy;
 		d_cnjg(&z__3, &a[j + i__ * a_dim1]);
 		i__5 = jx;
-		z__2.r = z__3.r * x[i__5].r - z__3.i * x[i__5].i, z__2.i = 
+		z__2.r = z__3.r * x[i__5].r - z__3.i * x[i__5].i, z__2.i =
 			z__3.r * x[i__5].i + z__3.i * x[i__5].r;
 		z__1.r = yt[i__4].r + z__2.r, z__1.i = yt[i__4].i + z__2.i;
 		yt[i__3].r = z__1.r, yt[i__3].i = z__1.i;
 		i__3 = j + i__ * a_dim1;
 		i__4 = jx;
-		g[iy] += ((d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[j 
-			+ i__ * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r, 
+		g[iy] += ((d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[j
+			+ i__ * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r,
 			abs(d__3)) + (d__4 = d_imag(&x[jx]), abs(d__4)));
 		jx += incxl;
 /* L20: */
@@ -5144,7 +5154,7 @@ static logical c_false = FALSE_;
 		i__4 = iy;
 		i__5 = i__ + j * a_dim1;
 		i__6 = jx;
-		z__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, 
+		z__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i,
 			z__2.i = a[i__5].r * x[i__6].i + a[i__5].i * x[i__6]
 			.r;
 		z__1.r = yt[i__4].r + z__2.r, z__1.i = yt[i__4].i + z__2.i;
@@ -5152,7 +5162,7 @@ static logical c_false = FALSE_;
 		i__3 = i__ + j * a_dim1;
 		i__4 = jx;
 		g[iy] += ((d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[
-			i__ + j * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r, 
+			i__ + j * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r,
 			abs(d__3)) + (d__4 = d_imag(&x[jx]), abs(d__4)));
 		jx += incxl;
 /* L30: */
@@ -5160,7 +5170,7 @@ static logical c_false = FALSE_;
 	}
 	i__2 = iy;
 	i__3 = iy;
-	z__2.r = alpha->r * yt[i__3].r - alpha->i * yt[i__3].i, z__2.i = 
+	z__2.r = alpha->r * yt[i__3].r - alpha->i * yt[i__3].i, z__2.i =
 		alpha->r * yt[i__3].i + alpha->i * yt[i__3].r;
 	i__4 = iy;
 	z__3.r = beta->r * y[i__4].r - beta->i * y[i__4].i, z__3.i = beta->r *
@@ -5169,7 +5179,7 @@ static logical c_false = FALSE_;
 	yt[i__2].r = z__1.r, yt[i__2].i = z__1.i;
 	i__2 = iy;
 	g[iy] = ((d__1 = alpha->r, abs(d__1)) + (d__2 = d_imag(alpha), abs(
-		d__2))) * g[iy] + ((d__3 = beta->r, abs(d__3)) + (d__4 = 
+		d__2))) * g[iy] + ((d__3 = beta->r, abs(d__3)) + (d__4 =
 		d_imag(beta), abs(d__4))) * ((d__5 = y[i__2].r, abs(d__5)) + (
 		d__6 = d_imag(&y[iy]), abs(d__6)));
 	iy += incyl;
@@ -5281,8 +5291,8 @@ logical lze_(doublecomplex *ri, doublecomplex *rj, integer *lr)
 
 } /* lze_ */
 
-logical lzeres_(char *type__, char *uplo, integer *m, integer *n, 
-	doublecomplex *aa, doublecomplex *as, integer *lda, ftnlen type_len, 
+logical lzeres_(char *type__, char *uplo, integer *m, integer *n,
+	doublecomplex *aa, doublecomplex *as, integer *lda, ftnlen type_len,
 	ftnlen uplo_len)
 {
     /* System generated locals */
@@ -5459,7 +5469,7 @@ doublereal ddiff_(doublereal *x, doublereal *y)
 
 } /* ddiff_ */
 
-/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, 
+/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout,
 	logical *lerr, logical *ok, ftnlen srnamt_len)
 {
     /* Format strings */
diff --git a/blastest/src/zblat3.c b/blastest/src/zblat3.c
index 3ff3634b6..045eeba42 100644
--- a/blastest/src/zblat3.c
+++ b/blastest/src/zblat3.c
@@ -140,9 +140,14 @@ static integer c_n1 = -1;
 /*  ===================================================================== */
 /* Main program */ int main(void)
 {
+#ifdef BLIS_ENABLE_HPX
+    char* program = "zblat3";
+    bli_thread_initialize_hpx( 1, &program );
+#endif
+
     /* Initialized data */
 
-    static char snames[6*9] = "ZGEMM " "ZHEMM " "ZSYMM " "ZTRMM " "ZTRSM " 
+    static char snames[6*9] = "ZGEMM " "ZHEMM " "ZSYMM " "ZTRMM " "ZTRSM "
 	    "ZHERK " "ZSYRK " "ZHER2K" "ZSYR2K";
 
     /* Format strings */
@@ -186,10 +191,10 @@ static integer c_n1 = -1;
     cllist cl__1;
 
     /* Builtin functions */
-    integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), 
+    integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen),
 	    e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *,
-	     char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), 
-	    s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, 
+	     char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void),
+	    s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen,
 	    ftnlen);
     /* Subroutine */ int s_stop(char *, ftnlen);
     integer f_clos(cllist *);
@@ -208,44 +213,44 @@ static integer c_n1 = -1;
     integer nbet, ntra;
     logical rewi;
     integer nout;
-    extern /* Subroutine */ int zchk1_(char *, doublereal *, doublereal *, 
-	    integer *, integer *, logical *, logical *, logical *, integer *, 
+    extern /* Subroutine */ int zchk1_(char *, doublereal *, doublereal *,
+	    integer *, integer *, logical *, logical *, logical *, integer *,
 	    integer *, integer *, doublecomplex *, integer *, doublecomplex *,
-	     integer *, doublecomplex *, doublecomplex *, doublecomplex *, 
+	     integer *, doublecomplex *, doublecomplex *, doublecomplex *,
 	    doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *
 	    , doublecomplex *, doublecomplex *, doublecomplex *, doublereal *,
-	     ftnlen), zchk2_(char *, doublereal *, doublereal *, integer *, 
-	    integer *, logical *, logical *, logical *, integer *, integer *, 
+	     ftnlen), zchk2_(char *, doublereal *, doublereal *, integer *,
+	    integer *, logical *, logical *, logical *, integer *, integer *,
 	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
-	     doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex 
-	    *, doublecomplex *, doublecomplex *, doublecomplex *, 
-	    doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, 
-	    ftnlen), zchk3_(char *, doublereal *, doublereal *, integer *, 
-	    integer *, logical *, logical *, logical *, integer *, integer *, 
-	    integer *, doublecomplex *, integer *, doublecomplex *, 
+	     doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex
+	    *, doublecomplex *, doublecomplex *, doublecomplex *,
+	    doublecomplex *, doublecomplex *, doublecomplex *, doublereal *,
+	    ftnlen), zchk3_(char *, doublereal *, doublereal *, integer *,
+	    integer *, logical *, logical *, logical *, integer *, integer *,
+	    integer *, doublecomplex *, integer *, doublecomplex *,
 	    doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *
 	    , doublecomplex *, doublecomplex *, doublereal *, doublecomplex *,
-	     ftnlen), zchk4_(char *, doublereal *, doublereal *, integer *, 
-	    integer *, logical *, logical *, logical *, integer *, integer *, 
+	     ftnlen), zchk4_(char *, doublereal *, doublereal *, integer *,
+	    integer *, logical *, logical *, logical *, integer *, integer *,
 	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
-	     doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex 
-	    *, doublecomplex *, doublecomplex *, doublecomplex *, 
-	    doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, 
-	    ftnlen), zchk5_(char *, doublereal *, doublereal *, integer *, 
-	    integer *, logical *, logical *, logical *, integer *, integer *, 
+	     doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex
+	    *, doublecomplex *, doublecomplex *, doublecomplex *,
+	    doublecomplex *, doublecomplex *, doublecomplex *, doublereal *,
+	    ftnlen), zchk5_(char *, doublereal *, doublereal *, integer *,
+	    integer *, logical *, logical *, logical *, integer *, integer *,
 	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
-	     doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex 
-	    *, doublecomplex *, doublecomplex *, doublecomplex *, 
-	    doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, 
+	     doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex
+	    *, doublecomplex *, doublecomplex *, doublecomplex *,
+	    doublecomplex *, doublecomplex *, doublereal *, doublecomplex *,
 	    ftnlen);
     logical fatal, trace;
     integer nidim;
-    extern /* Subroutine */ int zchke_(integer *, char *, integer *, ftnlen), 
-	    zmmch_(char *, char *, integer *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, doublereal *, doublecomplex *, integer *, 
-	    doublereal *, doublereal *, logical *, integer *, logical *, 
+    extern /* Subroutine */ int zchke_(integer *, char *, integer *, ftnlen),
+	    zmmch_(char *, char *, integer *, integer *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, doublereal *, doublecomplex *, integer *,
+	    doublereal *, doublereal *, logical *, integer *, logical *,
 	    ftnlen, ftnlen);
     char snaps[32];
     integer isnum;
@@ -517,7 +522,7 @@ static integer c_n1 = -1;
 	goto L60;
     }
     for (i__ = 1; i__ <= 9; ++i__) {
-	if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) 
+	if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0)
 		{
 	    goto L50;
 	}
@@ -580,7 +585,7 @@ static integer c_n1 = -1;
     *(unsigned char *)transa = 'N';
     *(unsigned char *)transb = 'N';
     zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
-	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal,
 	    &nout, &c_true, (ftnlen)1, (ftnlen)1);
     same = lze_(cc, ct, &n);
     if (! same || err != 0.) {
@@ -595,7 +600,7 @@ static integer c_n1 = -1;
     }
     *(unsigned char *)transb = 'C';
     zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
-	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal,
 	    &nout, &c_true, (ftnlen)1, (ftnlen)1);
     same = lze_(cc, ct, &n);
     if (! same || err != 0.) {
@@ -628,7 +633,7 @@ static integer c_n1 = -1;
     *(unsigned char *)transa = 'C';
     *(unsigned char *)transb = 'N';
     zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
-	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal,
 	    &nout, &c_true, (ftnlen)1, (ftnlen)1);
     same = lze_(cc, ct, &n);
     if (! same || err != 0.) {
@@ -643,7 +648,7 @@ static integer c_n1 = -1;
     }
     *(unsigned char *)transb = 'C';
     zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
-	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal,
 	    &nout, &c_true, (ftnlen)1, (ftnlen)1);
     same = lze_(cc, ct, &n);
     if (! same || err != 0.) {
@@ -697,34 +702,34 @@ static integer c_n1 = -1;
 /*           Test ZGEMM, 01. */
 L140:
 	    zchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, 
-		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet,
+		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs,
 		    ct, g, (ftnlen)6);
 	    goto L190;
 /*           Test ZHEMM, 02, ZSYMM, 03. */
 L150:
 	    zchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, 
-		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet,
+		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs,
 		    ct, g, (ftnlen)6);
 	    goto L190;
 /*           Test ZTRMM, 04, ZTRSM, 05. */
 L160:
 	    zchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65,
 		    ab, aa, as, &ab[4225], bb, bs, ct, g, c__, (ftnlen)6);
 	    goto L190;
 /*           Test ZHERK, 06, ZSYRK, 07. */
 L170:
 	    zchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, 
-		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet,
+		    bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs,
 		    ct, g, (ftnlen)6);
 	    goto L190;
 /*           Test ZHER2K, 08, ZSYR2K, 09. */
 L180:
 	    zchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, &
-		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, 
+		    trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet,
 		    bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, ct, g, w, (
 		    ftnlen)6);
 	    goto L190;
@@ -768,15 +773,20 @@ static integer c_n1 = -1;
 
 /*     End of ZBLAT3. */
 
-    return 0;
+#ifdef BLIS_ENABLE_HPX
+    return bli_thread_finalize_hpx();
+#else
+	// Return peacefully.
+	return 0;
+#endif
 } /* main */
 
-/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
 	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
 	alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex *
-	a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, 
-	doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, 
+	a, doublecomplex *aa, doublecomplex *as, doublecomplex *b,
+	doublecomplex *bb, doublecomplex *bs, doublecomplex *c__,
 	doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal *
 	g, ftnlen sname_len)
 {
@@ -802,7 +812,7 @@ static integer c_n1 = -1;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4, i__5, i__6, i__7, i__8;
     alist al__1;
 
@@ -811,7 +821,7 @@ static integer c_n1 = -1;
 	     f_rew(alist *);
 
     /* Local variables */
-    integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, 
+    integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns,
 	    ica, icb, laa, lbb, lda, lcc, ldb, ldc;
     doublecomplex als, bls;
     doublereal err;
@@ -821,23 +831,23 @@ static integer c_n1 = -1;
     logical same, null;
     doublecomplex alpha;
     logical isame[13], trana, tranb;
-    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *,
 	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
 	     logical *, doublecomplex *, ftnlen, ftnlen, ftnlen);
     integer nargs;
-    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
-	    integer *, doublereal *, doublereal *, logical *, integer *, 
-	    logical *, ftnlen, ftnlen), zgemm_(char *, char *, integer *, 
+    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+	    integer *, doublecomplex *, doublereal *, doublecomplex *,
+	    integer *, doublereal *, doublereal *, logical *, integer *,
+	    logical *, ftnlen, ftnlen), zgemm_(char *, char *, integer *,
 	    integer *, integer *, doublecomplex *, doublecomplex *, integer *,
-	     doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	     doublecomplex *, integer *, doublecomplex *, doublecomplex *,
 	    integer *, ftnlen, ftnlen);
     logical reset;
     char tranas[1], tranbs[1], transa[1], transb[1];
     doublereal errmax;
-    extern logical lzeres_(char *, char *, integer *, integer *, 
+    extern logical lzeres_(char *, char *, integer *, integer *,
 	    doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen);
 
     /* Fortran I/O blocks */
@@ -928,7 +938,7 @@ static integer c_n1 = -1;
 		for (ica = 1; ica <= 3; ++ica) {
 		    *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1]
 			    ;
-		    trana = *(unsigned char *)transa == 'T' || *(unsigned 
+		    trana = *(unsigned char *)transa == 'T' || *(unsigned
 			    char *)transa == 'C';
 
 		    if (trana) {
@@ -956,9 +966,9 @@ static integer c_n1 = -1;
 			    ftnlen)1);
 
 		    for (icb = 1; icb <= 3; ++icb) {
-			*(unsigned char *)transb = *(unsigned char *)&ich[icb 
+			*(unsigned char *)transb = *(unsigned char *)&ich[icb
 				- 1];
-			tranb = *(unsigned char *)transb == 'T' || *(unsigned 
+			tranb = *(unsigned char *)transb == 'T' || *(unsigned
 				char *)transb == 'C';
 
 			if (tranb) {
@@ -1099,13 +1109,13 @@ static integer c_n1 = -1;
 				isame[2] = ms == m;
 				isame[3] = ns == n;
 				isame[4] = ks == k;
-				isame[5] = als.r == alpha.r && als.i == 
+				isame[5] = als.r == alpha.r && als.i ==
 					alpha.i;
 				isame[6] = lze_(&as[1], &aa[1], &laa);
 				isame[7] = ldas == lda;
 				isame[8] = lze_(&bs[1], &bb[1], &lbb);
 				isame[9] = ldbs == ldb;
-				isame[10] = bls.r == beta.r && bls.i == 
+				isame[10] = bls.r == beta.r && bls.i ==
 					beta.i;
 				if (null) {
 				    isame[11] = lze_(&cs[1], &cc[1], &lcc);
@@ -1143,9 +1153,9 @@ static integer c_n1 = -1;
 
 				    zmmch_(transa, transb, &m, &n, &k, &alpha,
 					     &a[a_offset], nmax, &b[b_offset],
-					     nmax, &beta, &c__[c_offset], 
+					     nmax, &beta, &c__[c_offset],
 					    nmax, &ct[1], &g[1], &cc[1], &ldc,
-					     eps, &err, fatal, nout, &c_true, 
+					     eps, &err, fatal, nout, &c_true,
 					    (ftnlen)1, (ftnlen)1);
 				    errmax = max(errmax,err);
 /*                             If got really bad answer, report and */
@@ -1226,12 +1236,12 @@ static integer c_n1 = -1;
 
 } /* zchk1_ */
 
-/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
 	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
 	alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex *
-	a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, 
-	doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, 
+	a, doublecomplex *aa, doublecomplex *as, doublecomplex *b,
+	doublecomplex *bb, doublecomplex *bs, doublecomplex *c__,
 	doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal *
 	g, ftnlen sname_len)
 {
@@ -1258,7 +1268,7 @@ static integer c_n1 = -1;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4, i__5, i__6, i__7;
     alist al__1;
 
@@ -1267,7 +1277,7 @@ static integer c_n1 = -1;
 	    integer *, char *, ftnlen), e_wsfe(void), f_rew(alist *);
 
     /* Local variables */
-    integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, 
+    integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc,
 	    ldb, ldc, ics;
     doublecomplex als, bls;
     integer icu;
@@ -1282,27 +1292,27 @@ static integer c_n1 = -1;
     doublecomplex alpha;
     logical isame[13];
     char sides[1];
-    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *,
 	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
 	     logical *, doublecomplex *, ftnlen, ftnlen, ftnlen);
     integer nargs;
-    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
-	    integer *, doublereal *, doublereal *, logical *, integer *, 
-	    logical *, ftnlen, ftnlen), zhemm_(char *, char *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+	    integer *, doublecomplex *, doublereal *, doublecomplex *,
+	    integer *, doublereal *, doublereal *, logical *, integer *,
+	    logical *, ftnlen, ftnlen), zhemm_(char *, char *, integer *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *,
 	    integer *, ftnlen, ftnlen);
     logical reset;
     char uplos[1];
-    extern /* Subroutine */ int zsymm_(char *, char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, 
+    extern /* Subroutine */ int zsymm_(char *, char *, integer *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+	    integer *, doublecomplex *, doublecomplex *, integer *, ftnlen,
 	    ftnlen);
     doublereal errmax;
-    extern logical lzeres_(char *, char *, integer *, integer *, 
+    extern logical lzeres_(char *, char *, integer *, integer *,
 	    doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen);
 
     /* Fortran I/O blocks */
@@ -1443,7 +1453,7 @@ static integer c_n1 = -1;
 
 /*                       Generate the matrix C. */
 
-			    zmake_("GE", " ", " ", &m, &n, &c__[c_offset], 
+			    zmake_("GE", " ", " ", &m, &n, &c__[c_offset],
 				    nmax, &cc[1], &ldc, &reset, &c_b1, (
 				    ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -1539,9 +1549,9 @@ static integer c_n1 = -1;
 
 /*                       See what data changed inside subroutines. */
 
-			    isame[0] = *(unsigned char *)sides == *(unsigned 
+			    isame[0] = *(unsigned char *)sides == *(unsigned
 				    char *)side;
-			    isame[1] = *(unsigned char *)uplos == *(unsigned 
+			    isame[1] = *(unsigned char *)uplos == *(unsigned
 				    char *)uplo;
 			    isame[2] = ms == m;
 			    isame[3] = ns == n;
@@ -1586,14 +1596,14 @@ static integer c_n1 = -1;
 
 				if (left) {
 				    zmmch_("N", "N", &m, &n, &m, &alpha, &a[
-					    a_offset], nmax, &b[b_offset], 
+					    a_offset], nmax, &b[b_offset],
 					    nmax, &beta, &c__[c_offset], nmax,
 					     &ct[1], &g[1], &cc[1], &ldc, eps,
 					     &err, fatal, nout, &c_true, (
 					    ftnlen)1, (ftnlen)1);
 				} else {
 				    zmmch_("N", "N", &m, &n, &n, &alpha, &b[
-					    b_offset], nmax, &a[a_offset], 
+					    b_offset], nmax, &a[a_offset],
 					    nmax, &beta, &c__[c_offset], nmax,
 					     &ct[1], &g[1], &cc[1], &ldc, eps,
 					     &err, fatal, nout, &c_true, (
@@ -1673,12 +1683,12 @@ static integer c_n1 = -1;
 
 } /* zchk2_ */
 
-/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
 	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
-	alf, integer *nmax, doublecomplex *a, doublecomplex *aa, 
-	doublecomplex *as, doublecomplex *b, doublecomplex *bb, doublecomplex 
-	*bs, doublecomplex *ct, doublereal *g, doublecomplex *c__, ftnlen 
+	alf, integer *nmax, doublecomplex *a, doublecomplex *aa,
+	doublecomplex *as, doublecomplex *b, doublecomplex *bb, doublecomplex
+	*bs, doublecomplex *ct, doublereal *g, doublecomplex *c__, ftnlen
 	sname_len)
 {
     /* Initialized data */
@@ -1705,7 +1715,7 @@ static integer c_n1 = -1;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4, i__5, i__6, i__7;
     doublecomplex z__1;
     alist al__1;
@@ -1731,27 +1741,27 @@ static integer c_n1 = -1;
     char diags[1];
     logical isame[13];
     char sides[1];
-    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *,
 	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
 	     logical *, doublecomplex *, ftnlen, ftnlen, ftnlen);
     integer nargs;
-    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
-	    integer *, doublereal *, doublereal *, logical *, integer *, 
+    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+	    integer *, doublecomplex *, doublereal *, doublecomplex *,
+	    integer *, doublereal *, doublereal *, logical *, integer *,
 	    logical *, ftnlen, ftnlen);
     logical reset;
     char uplos[1];
-    extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, 
+    extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *,
 	    integer *, integer *, doublecomplex *, doublecomplex *, integer *,
-	     doublecomplex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), 
-	    ztrsm_(char *, char *, char *, char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
+	     doublecomplex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen),
+	    ztrsm_(char *, char *, char *, char *, integer *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
 	    integer *, ftnlen, ftnlen, ftnlen, ftnlen);
     char tranas[1], transa[1];
     doublereal errmax;
-    extern logical lzeres_(char *, char *, integer *, integer *, 
+    extern logical lzeres_(char *, char *, integer *, integer *,
 	    doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen);
 
     /* Fortran I/O blocks */
@@ -1888,7 +1898,7 @@ static integer c_n1 = -1;
 
 /*                          Generate the matrix B. */
 
-				zmake_("GE", " ", " ", &m, &n, &b[b_offset], 
+				zmake_("GE", " ", " ", &m, &n, &b[b_offset],
 					nmax, &bb[1], &ldb, &reset, &c_b1, (
 					ftnlen)2, (ftnlen)1, (ftnlen)1);
 
@@ -1960,7 +1970,7 @@ static integer c_n1 = -1;
 				    }
 				    ztrmm_(side, uplo, transa, diag, &m, &n, &
 					    alpha, &aa[1], &lda, &bb[1], &ldb,
-					     (ftnlen)1, (ftnlen)1, (ftnlen)1, 
+					     (ftnlen)1, (ftnlen)1, (ftnlen)1,
 					    (ftnlen)1);
 				} else if (s_cmp(sname + 3, "SM", (ftnlen)2, (
 					ftnlen)2) == 0) {
@@ -1993,7 +2003,7 @@ static integer c_n1 = -1;
 				    }
 				    ztrsm_(side, uplo, transa, diag, &m, &n, &
 					    alpha, &aa[1], &lda, &bb[1], &ldb,
-					     (ftnlen)1, (ftnlen)1, (ftnlen)1, 
+					     (ftnlen)1, (ftnlen)1, (ftnlen)1,
 					    (ftnlen)1);
 				}
 
@@ -2019,7 +2029,7 @@ static integer c_n1 = -1;
 					unsigned char *)diag;
 				isame[4] = ms == m;
 				isame[5] = ns == n;
-				isame[6] = als.r == alpha.r && als.i == 
+				isame[6] = als.r == alpha.r && als.i ==
 					alpha.i;
 				isame[7] = lze_(&as[1], &aa[1], &laa);
 				isame[8] = ldas == lda;
@@ -2063,18 +2073,18 @@ static integer c_n1 = -1;
 					    zmmch_(transa, "N", &m, &n, &m, &
 						    alpha, &a[a_offset], nmax,
 						     &b[b_offset], nmax, &
-						    c_b1, &c__[c_offset], 
+						    c_b1, &c__[c_offset],
 						    nmax, &ct[1], &g[1], &bb[
-						    1], &ldb, eps, &err, 
+						    1], &ldb, eps, &err,
 						    fatal, nout, &c_true, (
 						    ftnlen)1, (ftnlen)1);
 					} else {
 					    zmmch_("N", transa, &m, &n, &n, &
 						    alpha, &b[b_offset], nmax,
 						     &a[a_offset], nmax, &
-						    c_b1, &c__[c_offset], 
+						    c_b1, &c__[c_offset],
 						    nmax, &ct[1], &g[1], &bb[
-						    1], &ldb, eps, &err, 
+						    1], &ldb, eps, &err,
 						    fatal, nout, &c_true, (
 						    ftnlen)1, (ftnlen)1);
 					}
@@ -2087,14 +2097,14 @@ static integer c_n1 = -1;
 					i__4 = n;
 					for (j = 1; j <= i__4; ++j) {
 					    i__5 = m;
-					    for (i__ = 1; i__ <= i__5; ++i__) 
+					    for (i__ = 1; i__ <= i__5; ++i__)
 						    {
 			  i__6 = i__ + j * c_dim1;
 			  i__7 = i__ + (j - 1) * ldb;
 			  c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i;
 			  i__6 = i__ + (j - 1) * ldb;
 			  i__7 = i__ + j * b_dim1;
-			  z__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, 
+			  z__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i,
 				  z__1.i = alpha.r * b[i__7].i + alpha.i * b[
 				  i__7].r;
 			  bb[i__6].r = z__1.r, bb[i__6].i = z__1.i;
@@ -2105,20 +2115,20 @@ static integer c_n1 = -1;
 
 					if (left) {
 					    zmmch_(transa, "N", &m, &n, &m, &
-						    c_b2, &a[a_offset], nmax, 
+						    c_b2, &a[a_offset], nmax,
 						    &c__[c_offset], nmax, &
-						    c_b1, &b[b_offset], nmax, 
+						    c_b1, &b[b_offset], nmax,
 						    &ct[1], &g[1], &bb[1], &
-						    ldb, eps, &err, fatal, 
+						    ldb, eps, &err, fatal,
 						    nout, &c_false, (ftnlen)1,
 						     (ftnlen)1);
 					} else {
 					    zmmch_("N", transa, &m, &n, &n, &
-						    c_b2, &c__[c_offset], 
-						    nmax, &a[a_offset], nmax, 
+						    c_b2, &c__[c_offset],
+						    nmax, &a[a_offset], nmax,
 						    &c_b1, &b[b_offset], nmax,
 						     &ct[1], &g[1], &bb[1], &
-						    ldb, eps, &err, fatal, 
+						    ldb, eps, &err, fatal,
 						    nout, &c_false, (ftnlen)1,
 						     (ftnlen)1);
 					}
@@ -2199,12 +2209,12 @@ static integer c_n1 = -1;
 
 } /* zchk3_ */
 
-/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
 	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
 	alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex *
-	a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, 
-	doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, 
+	a, doublecomplex *aa, doublecomplex *as, doublecomplex *b,
+	doublecomplex *bb, doublecomplex *bs, doublecomplex *c__,
 	doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal *
 	g, ftnlen sname_len)
 {
@@ -2236,7 +2246,7 @@ static integer c_n1 = -1;
 	    "ER:\002)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4, i__5, i__6, i__7;
     doublecomplex z__1;
     alist al__1;
@@ -2262,29 +2272,29 @@ static integer c_n1 = -1;
     doublecomplex alpha;
     doublereal rbeta;
     logical isame[13];
-    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *,
 	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
 	     logical *, doublecomplex *, ftnlen, ftnlen, ftnlen);
     integer nargs;
-    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
-	    integer *, doublereal *, doublereal *, logical *, integer *, 
+    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+	    integer *, doublecomplex *, doublereal *, doublecomplex *,
+	    integer *, doublereal *, doublereal *, logical *, integer *,
 	    logical *, ftnlen, ftnlen);
     doublereal rbets;
     logical reset;
-    extern /* Subroutine */ int zherk_(char *, char *, integer *, integer *, 
-	    doublereal *, doublecomplex *, integer *, doublereal *, 
+    extern /* Subroutine */ int zherk_(char *, char *, integer *, integer *,
+	    doublereal *, doublecomplex *, integer *, doublereal *,
 	    doublecomplex *, integer *, ftnlen, ftnlen);
     char trans[1];
     logical upper;
     char uplos[1];
-    extern /* Subroutine */ int zsyrk_(char *, char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
+    extern /* Subroutine */ int zsyrk_(char *, char *, integer *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
 	    doublecomplex *, integer *, ftnlen, ftnlen);
     doublereal ralpha, errmax;
-    extern logical lzeres_(char *, char *, integer *, integer *, 
+    extern logical lzeres_(char *, char *, integer *, integer *,
 	    doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen);
     char transs[1], transt[1];
 
@@ -2426,7 +2436,7 @@ static integer c_n1 = -1;
 			    }
 			    null = n <= 0;
 			    if (conj) {
-				null = null || (k <= 0 || ralpha == 0.) && 
+				null = null || (k <= 0 || ralpha == 0.) &&
 					rbeta == 1.;
 			    }
 
@@ -2505,7 +2515,7 @@ static integer c_n1 = -1;
 				    f_rew(&al__1);
 				}
 				zherk_(uplo, trans, &n, &k, &ralpha, &aa[1], &
-					lda, &rbeta, &cc[1], &ldc, (ftnlen)1, 
+					lda, &rbeta, &cc[1], &ldc, (ftnlen)1,
 					(ftnlen)1);
 			    } else {
 				if (*trace) {
@@ -2552,16 +2562,16 @@ static integer c_n1 = -1;
 
 /*                       See what data changed inside subroutines. */
 
-			    isame[0] = *(unsigned char *)uplos == *(unsigned 
+			    isame[0] = *(unsigned char *)uplos == *(unsigned
 				    char *)uplo;
-			    isame[1] = *(unsigned char *)transs == *(unsigned 
+			    isame[1] = *(unsigned char *)transs == *(unsigned
 				    char *)trans;
 			    isame[2] = ns == n;
 			    isame[3] = ks == k;
 			    if (conj) {
 				isame[4] = rals == ralpha;
 			    } else {
-				isame[4] = als.r == alpha.r && als.i == 
+				isame[4] = als.r == alpha.r && als.i ==
 					alpha.i;
 			    }
 			    isame[5] = lze_(&as[1], &aa[1], &laa);
@@ -2569,7 +2579,7 @@ static integer c_n1 = -1;
 			    if (conj) {
 				isame[7] = rbets == rbeta;
 			    } else {
-				isame[7] = bets.r == beta.r && bets.i == 
+				isame[7] = bets.r == beta.r && bets.i ==
 					beta.i;
 			    }
 			    if (null) {
@@ -2623,19 +2633,19 @@ static integer c_n1 = -1;
 				    }
 				    if (tran) {
 					zmmch_(transt, "N", &lj, &c__1, &k, &
-						alpha, &a[jj * a_dim1 + 1], 
-						nmax, &a[j * a_dim1 + 1], 
-						nmax, &beta, &c__[jj + j * 
-						c_dim1], nmax, &ct[1], &g[1], 
-						&cc[jc], &ldc, eps, &err, 
+						alpha, &a[jj * a_dim1 + 1],
+						nmax, &a[j * a_dim1 + 1],
+						nmax, &beta, &c__[jj + j *
+						c_dim1], nmax, &ct[1], &g[1],
+						&cc[jc], &ldc, eps, &err,
 						fatal, nout, &c_true, (ftnlen)
 						1, (ftnlen)1);
 				    } else {
 					zmmch_("N", transt, &lj, &c__1, &k, &
-						alpha, &a[jj + a_dim1], nmax, 
+						alpha, &a[jj + a_dim1], nmax,
 						&a[j + a_dim1], nmax, &beta, &
 						c__[jj + j * c_dim1], nmax, &
-						ct[1], &g[1], &cc[jc], &ldc, 
+						ct[1], &g[1], &cc[jc], &ldc,
 						eps, &err, fatal, nout, &
 						c_true, (ftnlen)1, (ftnlen)1);
 				    }
@@ -2743,12 +2753,12 @@ static integer c_n1 = -1;
 
 } /* zchk4_ */
 
-/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh, 
+/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh,
 	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
 	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
 	alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex *
-	ab, doublecomplex *aa, doublecomplex *as, doublecomplex *bb, 
-	doublecomplex *bs, doublecomplex *c__, doublecomplex *cc, 
+	ab, doublecomplex *aa, doublecomplex *as, doublecomplex *bb,
+	doublecomplex *bs, doublecomplex *c__, doublecomplex *cc,
 	doublecomplex *cs, doublecomplex *ct, doublereal *g, doublecomplex *w,
 	 ftnlen sname_len)
 {
@@ -2807,30 +2817,30 @@ static integer c_n1 = -1;
     doublecomplex alpha;
     doublereal rbeta;
     logical isame[13];
-    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
+    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *,
 	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
 	     logical *, doublecomplex *, ftnlen, ftnlen, ftnlen);
     integer nargs;
-    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
-	    integer *, doublereal *, doublereal *, logical *, integer *, 
+    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+	    integer *, doublecomplex *, doublereal *, doublecomplex *,
+	    integer *, doublereal *, doublereal *, logical *, integer *,
 	    logical *, ftnlen, ftnlen);
     doublereal rbets;
     logical reset;
     char trans[1];
     logical upper;
     char uplos[1];
-    extern /* Subroutine */ int zher2k_(char *, char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
-	    integer *, doublereal *, doublecomplex *, integer *, ftnlen, 
-	    ftnlen), zsyr2k_(char *, char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, 
+    extern /* Subroutine */ int zher2k_(char *, char *, integer *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+	    integer *, doublereal *, doublecomplex *, integer *, ftnlen,
+	    ftnlen), zsyr2k_(char *, char *, integer *, integer *,
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+	    integer *, doublecomplex *, doublecomplex *, integer *, ftnlen,
 	    ftnlen);
     doublereal errmax;
-    extern logical lzeres_(char *, char *, integer *, integer *, 
+    extern logical lzeres_(char *, char *, integer *, integer *,
 	    doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen);
     char transs[1], transt[1];
 
@@ -2986,7 +2996,7 @@ static integer c_n1 = -1;
 			    }
 			    null = n <= 0;
 			    if (conj) {
-				null = null || (k <= 0 || alpha.r == 0. && 
+				null = null || (k <= 0 || alpha.r == 0. &&
 					alpha.i == 0.) && rbeta == 1.;
 			    }
 
@@ -3121,9 +3131,9 @@ static integer c_n1 = -1;
 
 /*                       See what data changed inside subroutines. */
 
-			    isame[0] = *(unsigned char *)uplos == *(unsigned 
+			    isame[0] = *(unsigned char *)uplos == *(unsigned
 				    char *)uplo;
-			    isame[1] = *(unsigned char *)transs == *(unsigned 
+			    isame[1] = *(unsigned char *)transs == *(unsigned
 				    char *)trans;
 			    isame[2] = ns == n;
 			    isame[3] = ks == k;
@@ -3135,7 +3145,7 @@ static integer c_n1 = -1;
 			    if (conj) {
 				isame[9] = rbets == rbeta;
 			    } else {
-				isame[9] = bets.r == beta.r && bets.i == 
+				isame[9] = bets.r == beta.r && bets.i ==
 					beta.i;
 			    }
 			    if (null) {
@@ -3191,20 +3201,20 @@ static integer c_n1 = -1;
 					i__6 = k;
 					for (i__ = 1; i__ <= i__6; ++i__) {
 					    i__7 = i__;
-					    i__8 = (j - 1 << 1) * *nmax + k + 
+					    i__8 = (j - 1 << 1) * *nmax + k +
 						    i__;
-					    z__1.r = alpha.r * ab[i__8].r - 
-						    alpha.i * ab[i__8].i, 
+					    z__1.r = alpha.r * ab[i__8].r -
+						    alpha.i * ab[i__8].i,
 						    z__1.i = alpha.r * ab[
 						    i__8].i + alpha.i * ab[
 						    i__8].r;
-					    w[i__7].r = z__1.r, w[i__7].i = 
+					    w[i__7].r = z__1.r, w[i__7].i =
 						    z__1.i;
 					    if (conj) {
 			  i__7 = k + i__;
 			  d_cnjg(&z__2, &alpha);
 			  i__8 = (j - 1 << 1) * *nmax + i__;
-			  z__1.r = z__2.r * ab[i__8].r - z__2.i * ab[i__8].i, 
+			  z__1.r = z__2.r * ab[i__8].r - z__2.i * ab[i__8].i,
 				  z__1.i = z__2.r * ab[i__8].i + z__2.i * ab[
 				  i__8].r;
 			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
@@ -3212,7 +3222,7 @@ static integer c_n1 = -1;
 			  i__7 = k + i__;
 			  i__8 = (j - 1 << 1) * *nmax + i__;
 			  z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
-				  .i, z__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  .i, z__1.i = alpha.r * ab[i__8].i + alpha.i
 				  * ab[i__8].r;
 			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
 					    }
@@ -3223,9 +3233,9 @@ static integer c_n1 = -1;
 					i__8 = *nmax << 1;
 					zmmch_(transt, "N", &lj, &c__1, &i__6,
 						 &c_b2, &ab[jjab], &i__7, &w[
-						1], &i__8, &beta, &c__[jj + j 
+						1], &i__8, &beta, &c__[jj + j
 						* c_dim1], nmax, &ct[1], &g[1]
-						, &cc[jc], &ldc, eps, &err, 
+						, &cc[jc], &ldc, eps, &err,
 						fatal, nout, &c_true, (ftnlen)
 						1, (ftnlen)1);
 				    } else {
@@ -3234,14 +3244,14 @@ static integer c_n1 = -1;
 					    if (conj) {
 			  i__7 = i__;
 			  d_cnjg(&z__2, &ab[(k + i__ - 1) * *nmax + j]);
-			  z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, 
-				  z__1.i = alpha.r * z__2.i + alpha.i * 
+			  z__1.r = alpha.r * z__2.r - alpha.i * z__2.i,
+				  z__1.i = alpha.r * z__2.i + alpha.i *
 				  z__2.r;
 			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
 			  i__7 = k + i__;
 			  i__8 = (i__ - 1) * *nmax + j;
 			  z__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
-				  .i, z__2.i = alpha.r * ab[i__8].i + alpha.i 
+				  .i, z__2.i = alpha.r * ab[i__8].i + alpha.i
 				  * ab[i__8].r;
 			  d_cnjg(&z__1, &z__2);
 			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
@@ -3249,13 +3259,13 @@ static integer c_n1 = -1;
 			  i__7 = i__;
 			  i__8 = (k + i__ - 1) * *nmax + j;
 			  z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
-				  .i, z__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  .i, z__1.i = alpha.r * ab[i__8].i + alpha.i
 				  * ab[i__8].r;
 			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
 			  i__7 = k + i__;
 			  i__8 = (i__ - 1) * *nmax + j;
 			  z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
-				  .i, z__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  .i, z__1.i = alpha.r * ab[i__8].i + alpha.i
 				  * ab[i__8].r;
 			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
 					    }
@@ -3265,9 +3275,9 @@ static integer c_n1 = -1;
 					i__7 = *nmax << 1;
 					zmmch_("N", "N", &lj, &c__1, &i__6, &
 						c_b2, &ab[jj], nmax, &w[1], &
-						i__7, &beta, &c__[jj + j * 
-						c_dim1], nmax, &ct[1], &g[1], 
-						&cc[jc], &ldc, eps, &err, 
+						i__7, &beta, &c__[jj + j *
+						c_dim1], nmax, &ct[1], &g[1],
+						&cc[jc], &ldc, eps, &err,
 						fatal, nout, &c_true, (ftnlen)
 						1, (ftnlen)1);
 				    }
@@ -3380,7 +3390,7 @@ static integer c_n1 = -1;
 
 } /* zchk5_ */
 
-/* Subroutine */ int zchke_(integer *isnum, char *srnamt, integer *nout, 
+/* Subroutine */ int zchke_(integer *isnum, char *srnamt, integer *nout,
 	ftnlen srnamt_len)
 {
     /* Format strings */
@@ -3393,37 +3403,37 @@ static integer c_n1 = -1;
     integer s_wsfe(cilist *), do_fio(integer *, char *, ftnlen), e_wsfe(void);
 
     /* Local variables */
-    doublecomplex a[2]	/* was [2][1] */, b[2]	/* was [2][1] */, c__[2]	
+    doublecomplex a[2]	/* was [2][1] */, b[2]	/* was [2][1] */, c__[2]
 	    /* was [2][1] */, beta, alpha;
     doublereal rbeta;
-    extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, ftnlen, ftnlen), zhemm_(char *, char *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, ftnlen, ftnlen), zherk_(char *, char *, integer *, 
+    extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+	    integer *, ftnlen, ftnlen), zhemm_(char *, char *, integer *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+	    integer *, ftnlen, ftnlen), zherk_(char *, char *, integer *,
 	    integer *, doublereal *, doublecomplex *, integer *, doublereal *,
-	     doublecomplex *, integer *, ftnlen, ftnlen), ztrmm_(char *, char 
-	    *, char *, char *, integer *, integer *, doublecomplex *, 
-	    doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, 
-	    ftnlen, ftnlen, ftnlen), zsymm_(char *, char *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	     doublecomplex *, integer *, ftnlen, ftnlen), ztrmm_(char *, char
+	    *, char *, char *, integer *, integer *, doublecomplex *,
+	    doublecomplex *, integer *, doublecomplex *, integer *, ftnlen,
+	    ftnlen, ftnlen, ftnlen), zsymm_(char *, char *, integer *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *,
 	    integer *, ftnlen, ftnlen), ztrsm_(char *, char *, char *, char *,
 	     integer *, integer *, doublecomplex *, doublecomplex *, integer *
-	    , doublecomplex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), 
-	    zsyrk_(char *, char *, integer *, integer *, doublecomplex *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, ftnlen, ftnlen), zher2k_(char *, char *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublereal *, doublecomplex *, 
-	    integer *, ftnlen, ftnlen), zsyr2k_(char *, char *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    , doublecomplex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen),
+	    zsyrk_(char *, char *, integer *, integer *, doublecomplex *,
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+	    integer *, ftnlen, ftnlen), zher2k_(char *, char *, integer *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, integer *, doublereal *, doublecomplex *,
+	    integer *, ftnlen, ftnlen), zsyr2k_(char *, char *, integer *,
+	    integer *, doublecomplex *, doublecomplex *, integer *,
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *,
 	    integer *, ftnlen, ftnlen);
     doublereal ralpha;
-    extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical 
+    extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical
 	    *, logical *, ftnlen);
 
     /* Fortran I/O blocks */
@@ -3485,302 +3495,302 @@ static integer c_n1 = -1;
     }
 L10:
     infoc_1.infot = 1;
-    zgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 1;
-    zgemm_("/", "C", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("/", "C", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 1;
-    zgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 2;
-    zgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 2;
-    zgemm_("C", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 2;
-    zgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    zgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    zgemm_("N", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("N", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    zgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    zgemm_("C", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    zgemm_("C", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    zgemm_("C", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    zgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    zgemm_("T", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 3;
-    zgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    zgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    zgemm_("N", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("N", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    zgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    zgemm_("C", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    zgemm_("C", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    zgemm_("C", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    zgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    zgemm_("T", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 4;
-    zgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    zgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    zgemm_("N", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("N", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    zgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    zgemm_("C", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    zgemm_("C", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    zgemm_("C", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    zgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    zgemm_("T", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 5;
-    zgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    zgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__2, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    zgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__2, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    zgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__2, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    zgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, 
+    zgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    zgemm_("C", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    zgemm_("C", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    zgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, 
+    zgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    zgemm_("T", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 8;
-    zgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    zgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    zgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, 
+    zgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    zgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, 
+    zgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    zgemm_("N", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("N", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    zgemm_("C", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    zgemm_("T", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    zgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    zgemm_("C", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 10;
-    zgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    zgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, 
+    zgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    zgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, 
+    zgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    zgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, 
+    zgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    zgemm_("C", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    zgemm_("C", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    zgemm_("C", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("C", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    zgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    zgemm_("T", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
     infoc_1.infot = 13;
-    zgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, 
+    zgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta,
 	    c__, &c__1, (ftnlen)1, (ftnlen)1);
     chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen)
 	    6);
@@ -4960,9 +4970,9 @@ static integer c_n1 = -1;
 
 } /* zchke_ */
 
-/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m, 
-	integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa, 
-	integer *lda, logical *reset, doublecomplex *transl, ftnlen type_len, 
+/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m,
+	integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa,
+	integer *lda, logical *reset, doublecomplex *transl, ftnlen type_len,
 	ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
@@ -5148,10 +5158,10 @@ static integer c_n1 = -1;
 } /* zmake_ */
 
 /* Subroutine */ int zmmch_(char *transa, char *transb, integer *m, integer *
-	n, integer *kk, doublecomplex *alpha, doublecomplex *a, integer *lda, 
+	n, integer *kk, doublecomplex *alpha, doublecomplex *a, integer *lda,
 	doublecomplex *b, integer *ldb, doublecomplex *beta, doublecomplex *
 	c__, integer *ldc, doublecomplex *ct, doublereal *g, doublecomplex *
-	cc, integer *ldcc, doublereal *eps, doublereal *err, logical *fatal, 
+	cc, integer *ldcc, doublereal *eps, doublereal *err, logical *fatal,
 	integer *nout, logical *mv, ftnlen transa_len, ftnlen transb_len)
 {
     /* Format strings */
@@ -5165,7 +5175,7 @@ static integer c_n1 = -1;
 	    " \002,i3)";
 
     /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, 
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1,
 	    cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7;
     doublereal d__1, d__2, d__3, d__4, d__5, d__6;
     doublecomplex z__1, z__2, z__3, z__4;
@@ -5224,9 +5234,9 @@ static integer c_n1 = -1;
     cc -= cc_offset;
 
     /* Function Body */
-    trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 
+    trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa ==
 	    'C';
-    tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 
+    tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb ==
 	    'C';
     ctrana = *(unsigned char *)transa == 'C';
     ctranb = *(unsigned char *)transb == 'C';
@@ -5254,17 +5264,17 @@ static integer c_n1 = -1;
 		    i__5 = i__;
 		    i__6 = i__ + k * a_dim1;
 		    i__7 = k + j * b_dim1;
-		    z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, 
+		    z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i,
 			    z__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[
 			    i__7].r;
-		    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + 
+		    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i +
 			    z__2.i;
 		    ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
 		    i__4 = i__ + k * a_dim1;
 		    i__5 = k + j * b_dim1;
 		    g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(
 			    &a[i__ + k * a_dim1]), abs(d__2))) * ((d__3 = b[
-			    i__5].r, abs(d__3)) + (d__4 = d_imag(&b[k + j * 
+			    i__5].r, abs(d__3)) + (d__4 = d_imag(&b[k + j *
 			    b_dim1]), abs(d__4)));
 /* L20: */
 		}
@@ -5280,15 +5290,15 @@ static integer c_n1 = -1;
 			i__5 = i__;
 			d_cnjg(&z__3, &a[k + i__ * a_dim1]);
 			i__6 = k + j * b_dim1;
-			z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, 
+			z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i,
 				z__2.i = z__3.r * b[i__6].i + z__3.i * b[i__6]
 				.r;
-			z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + 
+			z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i +
 				z__2.i;
 			ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
 			i__4 = k + i__ * a_dim1;
 			i__5 = k + j * b_dim1;
-			g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = 
+			g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 =
 				d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * ((
 				d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(
 				&b[k + j * b_dim1]), abs(d__4)));
@@ -5308,12 +5318,12 @@ static integer c_n1 = -1;
 			z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7]
 				.i, z__2.i = a[i__6].r * b[i__7].i + a[i__6]
 				.i * b[i__7].r;
-			z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + 
+			z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i +
 				z__2.i;
 			ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
 			i__4 = k + i__ * a_dim1;
 			i__5 = k + j * b_dim1;
-			g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = 
+			g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 =
 				d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * ((
 				d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(
 				&b[k + j * b_dim1]), abs(d__4)));
@@ -5332,15 +5342,15 @@ static integer c_n1 = -1;
 			i__5 = i__;
 			i__6 = i__ + k * a_dim1;
 			d_cnjg(&z__3, &b[j + k * b_dim1]);
-			z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, 
-				z__2.i = a[i__6].r * z__3.i + a[i__6].i * 
+			z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i,
+				z__2.i = a[i__6].r * z__3.i + a[i__6].i *
 				z__3.r;
-			z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + 
+			z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i +
 				z__2.i;
 			ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
 			i__4 = i__ + k * a_dim1;
 			i__5 = j + k * b_dim1;
-			g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = 
+			g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 =
 				d_imag(&a[i__ + k * a_dim1]), abs(d__2))) * ((
 				d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(
 				&b[j + k * b_dim1]), abs(d__4)));
@@ -5360,12 +5370,12 @@ static integer c_n1 = -1;
 			z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7]
 				.i, z__2.i = a[i__6].r * b[i__7].i + a[i__6]
 				.i * b[i__7].r;
-			z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + 
+			z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i +
 				z__2.i;
 			ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
 			i__4 = i__ + k * a_dim1;
 			i__5 = j + k * b_dim1;
-			g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = 
+			g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 =
 				d_imag(&a[i__ + k * a_dim1]), abs(d__2))) * ((
 				d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(
 				&b[j + k * b_dim1]), abs(d__4)));
@@ -5385,17 +5395,17 @@ static integer c_n1 = -1;
 			    i__5 = i__;
 			    d_cnjg(&z__3, &a[k + i__ * a_dim1]);
 			    d_cnjg(&z__4, &b[j + k * b_dim1]);
-			    z__2.r = z__3.r * z__4.r - z__3.i * z__4.i, 
-				    z__2.i = z__3.r * z__4.i + z__3.i * 
+			    z__2.r = z__3.r * z__4.r - z__3.i * z__4.i,
+				    z__2.i = z__3.r * z__4.i + z__3.i *
 				    z__4.r;
-			    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i 
+			    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i
 				    + z__2.i;
 			    ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
 			    i__4 = k + i__ * a_dim1;
 			    i__5 = j + k * b_dim1;
 			    g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 =
 				     d_imag(&a[k + i__ * a_dim1]), abs(d__2)))
-				     * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 
+				     * ((d__3 = b[i__5].r, abs(d__3)) + (d__4
 				    = d_imag(&b[j + k * b_dim1]), abs(d__4)));
 /* L120: */
 			}
@@ -5410,17 +5420,17 @@ static integer c_n1 = -1;
 			    i__5 = i__;
 			    d_cnjg(&z__3, &a[k + i__ * a_dim1]);
 			    i__6 = j + k * b_dim1;
-			    z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, 
+			    z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i,
 				    z__2.i = z__3.r * b[i__6].i + z__3.i * b[
 				    i__6].r;
-			    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i 
+			    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i
 				    + z__2.i;
 			    ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
 			    i__4 = k + i__ * a_dim1;
 			    i__5 = j + k * b_dim1;
 			    g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 =
 				     d_imag(&a[k + i__ * a_dim1]), abs(d__2)))
-				     * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 
+				     * ((d__3 = b[i__5].r, abs(d__3)) + (d__4
 				    = d_imag(&b[j + k * b_dim1]), abs(d__4)));
 /* L140: */
 			}
@@ -5437,17 +5447,17 @@ static integer c_n1 = -1;
 			    i__5 = i__;
 			    i__6 = k + i__ * a_dim1;
 			    d_cnjg(&z__3, &b[j + k * b_dim1]);
-			    z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, 
-				    z__2.i = a[i__6].r * z__3.i + a[i__6].i * 
+			    z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i,
+				    z__2.i = a[i__6].r * z__3.i + a[i__6].i *
 				    z__3.r;
-			    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i 
+			    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i
 				    + z__2.i;
 			    ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
 			    i__4 = k + i__ * a_dim1;
 			    i__5 = j + k * b_dim1;
 			    g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 =
 				     d_imag(&a[k + i__ * a_dim1]), abs(d__2)))
-				     * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 
+				     * ((d__3 = b[i__5].r, abs(d__3)) + (d__4
 				    = d_imag(&b[j + k * b_dim1]), abs(d__4)));
 /* L160: */
 			}
@@ -5463,16 +5473,16 @@ static integer c_n1 = -1;
 			    i__6 = k + i__ * a_dim1;
 			    i__7 = j + k * b_dim1;
 			    z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[
-				    i__7].i, z__2.i = a[i__6].r * b[i__7].i + 
+				    i__7].i, z__2.i = a[i__6].r * b[i__7].i +
 				    a[i__6].i * b[i__7].r;
-			    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i 
+			    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i
 				    + z__2.i;
 			    ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
 			    i__4 = k + i__ * a_dim1;
 			    i__5 = j + k * b_dim1;
 			    g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 =
 				     d_imag(&a[k + i__ * a_dim1]), abs(d__2)))
-				     * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 
+				     * ((d__3 = b[i__5].r, abs(d__3)) + (d__4
 				    = d_imag(&b[j + k * b_dim1]), abs(d__4)));
 /* L180: */
 			}
@@ -5485,17 +5495,17 @@ static integer c_n1 = -1;
 	for (i__ = 1; i__ <= i__2; ++i__) {
 	    i__3 = i__;
 	    i__4 = i__;
-	    z__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, z__2.i = 
+	    z__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, z__2.i =
 		    alpha->r * ct[i__4].i + alpha->i * ct[i__4].r;
 	    i__5 = i__ + j * c_dim1;
-	    z__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, z__3.i = 
+	    z__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, z__3.i =
 		    beta->r * c__[i__5].i + beta->i * c__[i__5].r;
 	    z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
 	    ct[i__3].r = z__1.r, ct[i__3].i = z__1.i;
 	    i__3 = i__ + j * c_dim1;
-	    g[i__] = ((d__1 = alpha->r, abs(d__1)) + (d__2 = d_imag(alpha), 
+	    g[i__] = ((d__1 = alpha->r, abs(d__1)) + (d__2 = d_imag(alpha),
 		    abs(d__2))) * g[i__] + ((d__3 = beta->r, abs(d__3)) + (
-		    d__4 = d_imag(beta), abs(d__4))) * ((d__5 = c__[i__3].r, 
+		    d__4 = d_imag(beta), abs(d__4))) * ((d__5 = c__[i__3].r,
 		    abs(d__5)) + (d__6 = d_imag(&c__[i__ + j * c_dim1]), abs(
 		    d__6)));
 /* L200: */
@@ -5621,8 +5631,8 @@ logical lze_(doublecomplex *ri, doublecomplex *rj, integer *lr)
 
 } /* lze_ */
 
-logical lzeres_(char *type__, char *uplo, integer *m, integer *n, 
-	doublecomplex *aa, doublecomplex *as, integer *lda, ftnlen type_len, 
+logical lzeres_(char *type__, char *uplo, integer *m, integer *n,
+	doublecomplex *aa, doublecomplex *as, integer *lda, ftnlen type_len,
 	ftnlen uplo_len)
 {
     /* System generated locals */
@@ -5807,7 +5817,7 @@ doublereal ddiff_(doublereal *x, doublereal *y)
 
 } /* ddiff_ */
 
-/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, 
+/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout,
 	logical *lerr, logical *ok, ftnlen srnamt_len)
 {
     /* Format strings */
diff --git a/build/bli_config.h.in b/build/bli_config.h.in
index 5208a90f8..41e76d214 100644
--- a/build/bli_config.h.in
+++ b/build/bli_config.h.in
@@ -65,6 +65,13 @@
 #endif
 #endif
 
+#if @enable_hpx@
+#define BLIS_ENABLE_HPX
+#if @enable_hpx_as_def@
+#define BLIS_ENABLE_HPX_AS_DEFAULT
+#endif
+#endif
+
 #if @enable_jrir_slab@
 #define BLIS_ENABLE_JRIR_SLAB
 #endif
diff --git a/build/config.mk.in b/build/config.mk.in
index efb123366..4624220cf 100644
--- a/build/config.mk.in
+++ b/build/config.mk.in
@@ -123,6 +123,7 @@ LDFLAGS_PRESET    := @ldflags_preset@
 
 # The level of debugging info to generate.
 DEBUG_TYPE        := @debug_type@
+ENABLE_DEBUG      := @enable_debug@
 
 # Whether to compile and link the AddressSanitizer library.
 MK_ENABLE_ASAN    := @enable_asan@
diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def
index db20ffbca..4bc91784c 100644
--- a/build/libblis-symbols.def
+++ b/build/libblis-symbols.def
@@ -557,6 +557,8 @@ bli_info_get_enable_openmp_as_default
 bli_info_get_enable_pba_pools
 bli_info_get_enable_pthreads
 bli_info_get_enable_pthreads_as_default
+bli_info_get_enable_hpx
+bli_info_get_enable_hpx_as_default
 bli_info_get_enable_sandbox
 bli_info_get_enable_sba_pools
 bli_info_get_enable_threading
diff --git a/common.mk b/common.mk
index e69b97782..119d09e87 100644
--- a/common.mk
+++ b/common.mk
@@ -112,6 +112,7 @@ get-noopt-cxxflags-for   = $(strip $(CFLAGS_PRESET) \
                                    $(call load-var-for,CXXLANGFLAGS,$(1)) \
                                    $(call load-var-for,CPPROCFLAGS,$(1)) \
                                    $(CTHREADFLAGS) \
+                                   $(CXXTHREADFLAGS) \
                                    $(CINCFLAGS) $(VERS_DEF) \
                             )
 
@@ -151,6 +152,13 @@ get-frame-cflags-for     = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
                                    $(BUILD_SYMFLAGS) \
                             )
 
+get-frame-cxxflags-for   = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
+                                   $(call get-noopt-cxxflags-for,$(1)) \
+                                   $(BUILD_ASANFLAGS) \
+                                   $(BUILD_CPPFLAGS) \
+                                   $(BUILD_SYMFLAGS) \
+                            )
+
 get-kernel-cflags-for    = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
                                    $(call load-var-for,CKVECFLAGS,$(1)) \
                                    $(call get-noopt-cflags-for,$(1)) \
@@ -224,6 +232,7 @@ get-refinit-text-for      = "('$(1)' CFLAGS for ref. kernel init)"
 get-refkern-text-for      = "('$(1)' CFLAGS for ref. kernels)"
 get-config-text-for       = "('$(1)' CFLAGS for config code)"
 get-frame-text-for        = "('$(1)' CFLAGS for framework code)"
+get-frame-cxxtext-for     = "('$(1)' CXXFLAGS for framework code)"
 get-kernel-text-for       = "('$(1)' CFLAGS for kernels)"
 get-addon-c99text-for     = "('$(1)' CFLAGS for addons)"
 get-addon-cxxtext-for     = "('$(1)' CXXFLAGS for addons)"
@@ -348,7 +357,11 @@ REFNM              := ref
 # Source suffixes.
 CONFIG_SRC_SUFS    := c
 KERNELS_SRC_SUFS   := c s S
+ifneq ($(findstring hpx,$(THREADING_MODEL)),)
+FRAME_SRC_SUFS     := c cpp
+else
 FRAME_SRC_SUFS     := c
+endif
 
 ADDON_C99_SUFS     := c
 ADDON_CXX_SUFS     := cc cpp cxx
@@ -427,7 +440,6 @@ ADDON_FRAG_PATH    := ./obj/$(CONFIG_NAME)/$(ADDON_DIR)
 SANDBOX_FRAG_PATH  := ./obj/$(CONFIG_NAME)/$(SANDBOX_DIR)
 
 
-
 #
 # --- Library name and local paths ---------------------------------------------
 #
@@ -687,8 +699,12 @@ endif
 
 # --- Linker program ---
 
-# Use whatever compiler was chosen.
+# Use whatever compiler was chosen. A C++ compiler must be used if HPX is enabled.
+ifneq ($(findstring hpx,$(THREADING_MODEL)),)
+LINKER     := $(CXX)
+else
 LINKER     := $(CC)
+endif
 
 # --- Warning flags ---
 
@@ -798,14 +814,22 @@ endif
 CLANGFLAGS := -std=c99
 $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CLANGFLAGS,$(c))))
 
-# Enable C++11.
+# Enable C++11, or C++17 if HPX threading is enabled.
+ifneq ($(findstring hpx,$(THREADING_MODEL)),)
+CXXLANGFLAGS := -std=c++17
+else
 CXXLANGFLAGS := -std=c++11
+endif
 $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CXXLANGFLAGS,$(c))))
 
 # --- C Preprocessor flags ---
 
 # Enable clock_gettime() in time.h.
 CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
+# Enable ip_mreq on macOS, which is needed by ASIO, which in turn is needed by HPX.
+ifeq ($(OS_NAME),Darwin)
+CPPROCFLAGS += -D_DARWIN_C_SOURCE
+endif
 $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CPPROCFLAGS,$(c))))
 
 # --- AddressSanitizer flags ---
@@ -823,6 +847,7 @@ endif
 # gets added to begin with.
 
 CTHREADFLAGS :=
+CXXTHREADFLAGS :=
 
 ifeq ($(CC_VENDOR),gcc)
 #ifneq ($(findstring auto,$(THREADING_MODEL)),)
@@ -866,6 +891,18 @@ LDFLAGS      += $(LIBPTHREAD)
 endif
 endif
 
+# Threading flags for HPX
+ifneq ($(findstring hpx,$(THREADING_MODEL)),)
+HPX_CXXFLAGS := $(shell pkg-config --cflags hpx_component)
+HPX_LDFLAGS  := $(filter-out -shared,$(shell pkg-config --libs hpx_component))
+CTHREADFLAGS += $(filter-out -std=%,$(HPX_CXXFLAGS))
+LDFLAGS      += $(HPX_LDFLAGS)
+ifeq ($(OS_NAME),Darwin)
+RPATH_PREFIX := -Wl,-rpath,
+LDFLAGS      += $(patsubst -L%,$(RPATH_PREFIX)%,$(filter -L%,$(HPX_LDFLAGS)))
+endif
+endif
+
 # --- #pragma omp simd flags (used for reference kernels only) ---
 
 ifeq ($(PRAGMA_OMP_SIMD),yes)
diff --git a/configure b/configure
index fd4812b1b..f808134d3 100755
--- a/configure
+++ b/configure
@@ -170,12 +170,12 @@ print_usage()
 	echo "   -t MODEL, --enable-threading[=MODEL], --disable-threading"
 	echo " "
 	echo "                 Enable threading in the library, using threading model(s)"
-	echo "                 MODEL={single,openmp,pthreads,auto}. If multiple values"
+	echo "                 MODEL={single,openmp,pthreads,hpx,auto}. If multiple values"
 	echo "                 are specified within MODEL, they will all be compiled into"
 	echo "                 BLIS, and the choice of which to use will be determined at"
 	echo "                 runtime. If the user does not express a preference (by"
 	echo "                 setting the BLIS_THREAD_IMPL environment variable to"
-	echo "                 'single', 'openmp', or 'pthreads'; by calling the global"
+	echo "                 'single', 'openmp', 'pthreads', or 'hpx'; by calling the global"
 	echo "                 runtime API bli_thread_set_thread_impl(); or by encoding a"
 	echo "                 choice on a per-call basis within a rntm_t passed into the"
 	echo "                 expert API), then the first model listed in MODEL will be"
@@ -2478,6 +2478,7 @@ main()
 	# The user-given debug type and a flag indicating it was given.
 	debug_type=''
 	debug_flag=''
+	enable_debug='no'
 
 	# A flag indicating whether AddressSanitizer should be used.
 	enable_asan='no'
@@ -3461,8 +3462,10 @@ main()
 			debug_type='noopt'
 			echo "${script_name}: enabling debug symbols; optimizations disabled."
 		fi
+		enable_debug='yes'
 	else
 		debug_type='off'
+		enable_debug='no'
 		echo "${script_name}: debug symbols disabled."
 	fi
 
@@ -3526,14 +3529,17 @@ main()
 	enable_single='yes'
 	enable_openmp='no'
 	enable_pthreads='no'
+	enable_hpx='no'
 	enable_single_01=1
 	enable_openmp_01=0
 	enable_pthreads_01=0
+	enable_hpx_01=0
 	parsed_tm=''
 	first_tm=''
 	enable_single_as_def_01=0
 	enable_openmp_as_def_01=0
 	enable_pthreads_as_def_01=0
+	enable_hpx_as_def_01=0
 
 	# Convert whatever reasonable separator the user may have used into a space.
 	threading_model_list=$(echo "${threading_model}" | sed -e "s/[,+]/ /g")
@@ -3561,6 +3567,10 @@ main()
 
 			parsed_tm="${parsed_tm} pthreads"
 
+		elif [ "x${word}" = "xhpx" ]; then
+
+			parsed_tm="${parsed_tm} hpx"
+
 		elif [ "x${word}" = "xauto" ]; then
 
 			parsed_tm="${parsed_tm} auto"
@@ -3652,7 +3662,15 @@ main()
 			echo "${script_name}: enabling support for threading via pthreads."
 			enable_pthreads='yes'
 			enable_pthreads_01=1
+
+		elif [ "x${word}" = "xhpx" ]; then
+
+			echo "${script_name}: enabling support for threading via hpx."
+			enable_hpx='yes'
+			enable_hpx_01=1
+
 		fi
+
 	done
 
 	# Define boolean variables that can easily be interpreted with #ifdef
@@ -3662,25 +3680,37 @@ main()
 		enable_single_as_def_01=1
 		enable_openmp_as_def_01=0
 		enable_pthreads_as_def_01=0
+		enable_hpx_as_def_01=0
 
 	elif [ "x${first_tm}" = "xopenmp" ]; then
 
 		enable_single_as_def_01=0
 		enable_openmp_as_def_01=1
 		enable_pthreads_as_def_01=0
+		enable_hpx_as_def_01=0
 
 	elif [ "x${first_tm}" = "xpthreads" ]; then
 
 		enable_single_as_def_01=0
 		enable_openmp_as_def_01=0
 		enable_pthreads_as_def_01=1
+		enable_hpx_as_def_01=0
+
+	elif [ "x${first_tm}" = "xhpx" ]; then
+
+		enable_single_as_def_01=0
+		enable_openmp_as_def_01=0
+		enable_pthreads_as_def_01=0
+		enable_hpx_as_def_01=1
+
 	fi
 
 	# If either OpenMP or pthreads was enabled, given that single-threaded mode is
 	# also always enabled, remind the user which one will serve as the default
 	# (that is, absent any explicit choice at runtime).
 	if [ "x${enable_openmp}"   = "xyes" ] ||
-	   [ "x${enable_pthreads}" = "xyes" ]; then
+	   [ "x${enable_pthreads}" = "xyes" ] ||
+	   [ "x${enable_hpx}"      = "xyes" ]; then
 
 		if   [ "x${first_tm}"   = "xsingle" ]; then
 			echo "${script_name}: threading will default to single-threaded."
@@ -3688,6 +3718,8 @@ main()
 			echo "${script_name}: threading will default to OpenMP."
 		elif [ "x${first_tm}"   = "xpthreads" ]; then
 			echo "${script_name}: threading will default to pthreads."
+		elif [ "x${first_tm}"   = "xhpx" ]; then
+			echo "${script_name}: threading will default to HPX."
 		fi
 	fi
 
@@ -4102,6 +4134,7 @@ main()
 		| sed -e "s/@ldflags_preset@/${ldflags_preset_esc}/g" \
 		| sed -e "s/@enable_asan@/${enable_asan}/g" \
 		| sed -e "s/@debug_type@/${debug_type}/g" \
+		| sed -e "s/@enable_debug@/${enable_debug}/g" \
 		| sed -e "s/@enable_system@/${enable_system}/g" \
 		| sed -e "s/@threading_model@/${threading_model}/g" \
 		| sed -e "s/@prefix@/${prefix_esc}/g" \
@@ -4142,6 +4175,8 @@ main()
 		| sed   -e "s/@enable_openmp_as_def@/${enable_openmp_as_def_01}/g" \
 		| sed   -e "s/@enable_pthreads@/${enable_pthreads_01}/g" \
 		| sed   -e "s/@enable_pthreads_as_def@/${enable_pthreads_as_def_01}/g" \
+		| sed   -e "s/@enable_hpx@/${enable_hpx_01}/g" \
+		| sed   -e "s/@enable_hpx_as_def@/${enable_hpx_as_def_01}/g" \
 		| sed   -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \
 		| sed   -e "s/@enable_jrir_rr@/${enable_jrir_rr_01}/g" \
 		| sed   -e "s/@enable_pba_pools@/${enable_pba_pools_01}/g" \
diff --git a/docs/FAQ.md b/docs/FAQ.md
index 3d0852d36..aee099b37 100644
--- a/docs/FAQ.md
+++ b/docs/FAQ.md
@@ -115,7 +115,7 @@ For more information on macrokernels, please read our [ACM TOMS papers](https://
 
 As of 0.2.0, BLIS contains a new infrastructure for communicating runtime information (such as kernel addresses and blocksizes) from the highest levels of code all the way down the function stack, even into the kernels themselves. This new data structure is called a *context* (defined in code as a `cntx_t` type), and together with its API it helped us clean up some hacks and other awkwardness that existed in BLIS prior to 0.2.0. Contexts also lay the groundwork for managing kernels and related kernel information at runtime.
 
-If you are a kernel developer, you can usually ignore the `cntx_t*` argument that is passed into each kernel, since the kernels already inherently "know" this information (such as register blocksizes). And if you are a user, and the function you want to call takes a `cntx_t*` argument, you can safely pass in `NULL` and BLIS will automatically build a suitable context for you at runtime. 
+If you are a kernel developer, you can usually ignore the `cntx_t*` argument that is passed into each kernel, since the kernels already inherently "know" this information (such as register blocksizes). And if you are a user, and the function you want to call takes a `cntx_t*` argument, you can safely pass in `NULL` and BLIS will automatically build a suitable context for you at runtime.
 
 ### I'm used to thinking in terms of column-major/row-major storage and leading dimensions. What is a "row stride" / "column stride"?
 
@@ -171,7 +171,7 @@ Originally, BLIS did indeed require the application to explicitly setup (initial
 
 ### Does BLIS support multithreading?
 
-Yes! BLIS supports multithreading (via OpenMP or POSIX threads) for all of its level-3 operations. For more information on enabling and controlling multithreading, please see the [Multithreading](Multithreading.md) guide.
+Yes! BLIS supports multithreading (via OpenMP, POSIX threads, or HPX) for all of its level-3 operations. For more information on enabling and controlling multithreading, please see the [Multithreading](Multithreading.md) guide.
 
 BLIS is also thread-safe so that you can call BLIS from threads within a multithreaded library or application. BLIS derives its thread-safety via unconditional use of features present in POSIX threads (pthreads). These pthreads features are employed for thread-safety regardless of whether BLIS is configured for OpenMP multithreading, pthreads multithreading, or single-threaded execution.
 
diff --git a/docs/Multithreading.md b/docs/Multithreading.md
index 933296f79..1a46f6556 100644
--- a/docs/Multithreading.md
+++ b/docs/Multithreading.md
@@ -246,7 +246,7 @@ This will result in both OpenMP and pthreads implementations being compiled and
 ```c
 void bli_thread_set_thread_impl( timpl_t ti );
 ```
-The function takes a `timpl_t`, which is an enumerated type that has three valid values corresponding to the three possible threading implementations: `BLIS_OPENMP`, `BLIS_POSIX`, and `BLIS_SINGLE`. Forcing use of pthreads is as simple as calling:
+The function takes a `timpl_t`, which is an enumerated type that has four valid values corresponding to the four possible threading implementations: `BLIS_OPENMP`, `BLIS_POSIX`, `BLIS_HPX`, and `BLIS_SINGLE`. Forcing use of pthreads is as simple as calling:
 ```c
 bli_thread_set_thread_impl( BLIS_POSIX )
 ```
@@ -321,7 +321,7 @@ This will result in both OpenMP and pthreads implementations being compiled and
 ```c
 void bli_rntm_set_thread_impl( timpl_t ti, rntm_t* rntm );
 ```
-The function takes a `timpl_t`, which is an enumerated type that has three valid values corresponding to the three possible threading implementations: `BLIS_OPENMP`, `BLIS_POSIX`, and `BLIS_SINGLE`. Forcing use of pthreads is as simple as calling:
+The function takes a `timpl_t`, which is an enumerated type that has four valid values corresponding to the four possible threading implementations: `BLIS_OPENMP`, `BLIS_POSIX`, `BLIS_HPX`, and `BLIS_SINGLE`. Forcing use of pthreads is as simple as calling:
 ```c
 bli_rntm_set_thread_impl( BLIS_POSIX, &rntm );
 ```
@@ -366,7 +366,7 @@ Also, you may pass in `NULL` for the `rntm_t*` parameter of an expert interface.
    This situation could lead to unexpectedly low multithreaded performance. Suppose the user calls `gemm` on a problem with a large m dimension and small k and n dimensions, and explicitly requests parallelism only in the IC loop, but also suppose that the storage of C does not match that of the microkernel's preference. After BLIS transposes the operation internally, the *effective* m dimension will no longer be large; instead, it will be small (because the original m and n dimension will have been swapped). The multithreaded implementation will then proceed to parallelize this small m dimension.
 
    There are currently no good *and* easy solutions to this problem. Eventually, though, we plan to add support for two microkernels per datatype per configuration--one for use with matrices C that are row-stored, and one for those that are column-stored. This will obviate the logic within BLIS that sometimes induces the operation transposition, and the problem will go away.
-   
+
 * **Thread affinity when BLIS and MKL are used together.** Some users have reported that when running a program that links both BLIS (configured with OpenMP) and MKL, **and** when OpenMP thread affinity has been specified (e.g. via `OMP_PROC_BIND` and `OMP_PLACES`), that very poor performance is observed. This may be due to incorrect thread masking, causing all threads to run on one physical core. The exact circumstances leading to this behavior have not been identified, but unsetting the OpenMP thread affinity variables appears to be a solution.
 
 # Conclusion
diff --git a/frame/3/bli_l3_decor.c b/frame/3/bli_l3_decor.c
index e482d37a1..4160751e6 100644
--- a/frame/3/bli_l3_decor.c
+++ b/frame/3/bli_l3_decor.c
@@ -224,13 +224,16 @@ void bli_l3_thread_decorator_check
 #endif
 #ifndef BLIS_ENABLE_PTHREADS
 	     ti == BLIS_POSIX ||
+#endif
+#ifndef BLIS_ENABLE_HPX
+	     ti == BLIS_HPX ||
 #endif
 	     FALSE
 	   )
 	{
 		fprintf( stderr, "\n" );
-		fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) );
-		fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) );
+		fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", bli_thread_get_thread_impl_str( ti ) );
+		fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", bli_thread_get_thread_impl_str( ti ) );
 		fprintf( stderr, "libblis: %s: line %d\n", __FILE__, ( int )__LINE__ );
 		bli_abort();
 	}
diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c
index 9d6e181d3..1f00537d5 100644
--- a/frame/base/bli_info.c
+++ b/frame/base/bli_info.c
@@ -104,7 +104,8 @@ gint_t bli_info_get_enable_sba_pools( void )
 gint_t bli_info_get_enable_threading( void )
 {
 	if ( bli_info_get_enable_openmp() ||
-	     bli_info_get_enable_pthreads() ) return 1;
+	     bli_info_get_enable_pthreads() ||
+	     bli_info_get_enable_hpx() ) return 1;
 	else                                  return 0;
 }
 gint_t bli_info_get_enable_openmp( void )
@@ -123,6 +124,14 @@ gint_t bli_info_get_enable_pthreads( void )
 	return 0;
 #endif
 }
+gint_t bli_info_get_enable_hpx( void )
+{
+#ifdef BLIS_ENABLE_HPX
+	return 1;
+#else
+	return 0;
+#endif
+}
 gint_t bli_info_get_enable_openmp_as_default( void )
 {
 #ifdef BLIS_ENABLE_OPENMP_AS_DEFAULT
@@ -139,6 +148,14 @@ gint_t bli_info_get_enable_pthreads_as_default( void )
 	return 0;
 #endif
 }
+gint_t bli_info_get_enable_hpx_as_default( void )
+{
+#ifdef BLIS_ENABLE_HPX_AS_DEFAULT
+	return 1;
+#else
+	return 0;
+#endif
+}
 gint_t bli_info_get_thread_part_jrir_slab( void )
 {
 #ifdef BLIS_ENABLE_JRIR_SLAB
diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h
index b3514f434..08a99daea 100644
--- a/frame/base/bli_info.h
+++ b/frame/base/bli_info.h
@@ -70,8 +70,10 @@ BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void );
+BLIS_EXPORT_BLIS gint_t bli_info_get_enable_hpx( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp_as_default( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads_as_default( void );
+BLIS_EXPORT_BLIS gint_t bli_info_get_enable_hpx_as_default( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void );
diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h
index 542973b18..633d7f671 100644
--- a/frame/include/bli_config_macro_defs.h
+++ b/frame/include/bli_config_macro_defs.h
@@ -83,12 +83,20 @@
   // Default behavior is disabled.
 #endif
 
+// Enable multithreading via HPX.
+#ifdef BLIS_ENABLE_HPX
+  // No additional definitions needed.
+#else
+  // Default behavior is disabled.
+#endif
+
 // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP
 // or pthreads are enabled. This macro is useful in situations when
 // we want to detect use of either OpenMP or pthreads, or both (as
 // opposed to neither being used).
 #if defined ( BLIS_ENABLE_OPENMP ) || \
-    defined ( BLIS_ENABLE_PTHREADS )
+    defined ( BLIS_ENABLE_PTHREADS ) || \
+    defined ( BLIS_ENABLE_HPX )
   #define BLIS_ENABLE_MULTITHREADING
 #endif
 
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index 0c5d11e6b..014be18b7 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -44,9 +44,10 @@
 
 #ifdef __cplusplus
   // For C++, include stdint.h.
-  #include <stdint.h>
+  #include <cstdint>
 #elif __STDC_VERSION__ >= 199901L
   // For C99 (or later), include stdint.h.
+  #include <stddef.h>
   #include <stdint.h>
   #include <stdbool.h>
 #else
@@ -629,6 +630,7 @@ typedef enum
 	BLIS_SINGLE = 0,
 	BLIS_OPENMP,
 	BLIS_POSIX,
+	BLIS_HPX,
 
 	// BLIS_NUM_THREAD_IMPLS must be last!
 	BLIS_NUM_THREAD_IMPLS
diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c
index f0bba205a..e9f9d9dc7 100644
--- a/frame/thread/bli_thrcomm.c
+++ b/frame/thread/bli_thrcomm.c
@@ -74,16 +74,18 @@ static thrcomm_init_ft init_fpa[ BLIS_NUM_THREAD_IMPLS ] =
 	[BLIS_OPENMP] =
 #if   defined(BLIS_ENABLE_OPENMP)
 	                bli_thrcomm_init_openmp,
-#elif defined(BLIS_ENABLE_PTHREADS)
-	                NULL,
 #else
 	                NULL,
 #endif
 	[BLIS_POSIX]  =
 #if   defined(BLIS_ENABLE_PTHREADS)
 	                bli_thrcomm_init_pthreads,
-#elif defined(BLIS_ENABLE_OPENMP)
+#else
 	                NULL,
+#endif
+	[BLIS_HPX]  =
+#if   defined(BLIS_ENABLE_HPX)
+	                bli_thrcomm_init_hpx,
 #else
 	                NULL,
 #endif
@@ -94,16 +96,18 @@ static thrcomm_cleanup_ft cleanup_fpa[ BLIS_NUM_THREAD_IMPLS ] =
 	[BLIS_OPENMP] =
 #if   defined(BLIS_ENABLE_OPENMP)
 	                bli_thrcomm_cleanup_openmp,
-#elif defined(BLIS_ENABLE_PTHREADS)
-	                NULL,
 #else
 	                NULL,
 #endif
 	[BLIS_POSIX]  =
 #if   defined(BLIS_ENABLE_PTHREADS)
 	                bli_thrcomm_cleanup_pthreads,
-#elif defined(BLIS_ENABLE_OPENMP)
+#else
 	                NULL,
+#endif
+	[BLIS_HPX]  =
+#if   defined(BLIS_ENABLE_HPX)
+	                bli_thrcomm_cleanup_hpx,
 #else
 	                NULL,
 #endif
@@ -114,16 +118,18 @@ static thrcomm_barrier_ft barrier_fpa[ BLIS_NUM_THREAD_IMPLS ] =
 	[BLIS_OPENMP] =
 #if   defined(BLIS_ENABLE_OPENMP)
 	                bli_thrcomm_barrier_openmp,
-#elif defined(BLIS_ENABLE_PTHREADS)
-	                NULL,
 #else
 	                NULL,
 #endif
 	[BLIS_POSIX]  =
 #if   defined(BLIS_ENABLE_PTHREADS)
 	                bli_thrcomm_barrier_pthreads,
-#elif defined(BLIS_ENABLE_OPENMP)
+#else
 	                NULL,
+#endif
+	[BLIS_HPX]  =
+#if   defined(BLIS_ENABLE_HPX)
+	                bli_thrcomm_barrier_hpx,
 #else
 	                NULL,
 #endif
diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h
index 7abd190c7..b65cb0b7a 100644
--- a/frame/thread/bli_thrcomm.h
+++ b/frame/thread/bli_thrcomm.h
@@ -94,6 +94,13 @@ typedef struct thrcomm_s
 	#endif
 	#endif
 
+	#ifdef BLIS_ENABLE_HPX
+	#ifdef BLIS_USE_HPX_BARRIER
+	hpx::barrier<> * barrier;
+	#endif
+	#endif
+
+
 } thrcomm_t;
 
 
@@ -105,6 +112,7 @@ typedef struct thrcomm_s
 #include "bli_thrcomm_single.h"
 #include "bli_thrcomm_openmp.h"
 #include "bli_thrcomm_pthreads.h"
+#include "bli_thrcomm_hpx.h"
 
 // Define a function pointer type for each of the functions that are
 // "overloaded" by each method of multithreading.
diff --git a/frame/thread/bli_thrcomm_hpx.cpp b/frame/thread/bli_thrcomm_hpx.cpp
new file mode 100644
index 000000000..d9fb258c2
--- /dev/null
+++ b/frame/thread/bli_thrcomm_hpx.cpp
@@ -0,0 +1,92 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022 Tactical Computing Laboratories, LLC
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_HPX
+
+extern "C" {
+
+#ifdef BLIS_USE_HPX_BARRIER
+
+// Define the hpx::barrier implementations of the init, cleanup, and barrier
+// functions. init allocates a barrier sized for n_threads participants.
+
+void bli_thrcomm_init_hpx( dim_t n_threads, thrcomm_t* comm )
+{
+	if ( comm == nullptr ) return;
+	comm->barrier = new hpx::barrier<>( n_threads );
+}
+
+void bli_thrcomm_cleanup_hpx( thrcomm_t* comm )
+{
+	if ( comm == nullptr ) return;
+	delete comm->barrier;
+}
+
+void bli_thrcomm_barrier_hpx( dim_t t_id, thrcomm_t* comm )
+{
+	comm->barrier->arrive_and_wait(); // t_id is not needed by hpx::barrier.
+}
+
+#else
+
+// Define the non-hpx::barrier implementations of the init, cleanup,
+// and barrier functions. These are the default unless the hpx::barrier
+// versions are requested at compile-time.
+
+void bli_thrcomm_init_hpx( dim_t n_threads, thrcomm_t* comm )
+{
+	if ( comm == nullptr ) return;
+	comm->sent_object = nullptr;
+	comm->n_threads = n_threads;
+	comm->barrier_sense = 0;
+	comm->barrier_threads_arrived = 0;
+}
+
+void bli_thrcomm_cleanup_hpx( thrcomm_t* comm )
+{
+}
+
+void bli_thrcomm_barrier_hpx( dim_t t_id, thrcomm_t* comm )
+{
+	bli_thrcomm_barrier_atomic( t_id, comm );
+}
+
+#endif // BLIS_USE_HPX_BARRIER
+
+} // extern "C"
+
+#endif // BLIS_ENABLE_HPX
+
diff --git a/frame/thread/bli_thrcomm_hpx.h b/frame/thread/bli_thrcomm_hpx.h
new file mode 100644
index 000000000..d80cd2268
--- /dev/null
+++ b/frame/thread/bli_thrcomm_hpx.h
@@ -0,0 +1,48 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022 Tactical Computing Laboratories, LLC
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_THRCOMM_HPX_H
+#define BLIS_THRCOMM_HPX_H
+
+// Define these prototypes for situations when HPX multithreading is enabled.
+#ifdef BLIS_ENABLE_HPX
+
+void bli_thrcomm_init_hpx( dim_t nt, thrcomm_t* comm );
+void bli_thrcomm_cleanup_hpx( thrcomm_t* comm );
+void bli_thrcomm_barrier_hpx( dim_t tid, thrcomm_t* comm );
+
+#endif
+
+#endif
+
diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c
index 8904c88e3..4cba76b20 100644
--- a/frame/thread/bli_thread.c
+++ b/frame/thread/bli_thread.c
@@ -35,6 +35,10 @@
 
 #include "blis.h"
 
+#ifdef BLIS_ENABLE_HPX
+#include "bli_thread_hpx.h"
+#endif
+
 thrcomm_t BLIS_SINGLE_COMM = {};
 
 // The global rntm_t structure. (The definition resides in bli_rntm.c.)
@@ -57,16 +61,18 @@ static thread_launch_t thread_launch_fpa[ BLIS_NUM_THREAD_IMPLS ] =
 	[BLIS_OPENMP] =
 #if   defined(BLIS_ENABLE_OPENMP)
 	                bli_thread_launch_openmp,
-#elif defined(BLIS_ENABLE_PTHREADS)
-	                NULL,
 #else
 	                NULL,
 #endif
 	[BLIS_POSIX]  =
 #if   defined(BLIS_ENABLE_PTHREADS)
 	                bli_thread_launch_pthreads,
-#elif defined(BLIS_ENABLE_OPENMP)
+#else
 	                NULL,
+#endif
+	[BLIS_HPX] =
+#if   defined(BLIS_ENABLE_HPX)
+	                bli_thread_launch_hpx,
 #else
 	                NULL,
 #endif
@@ -1604,6 +1610,7 @@ static const char* bli_timpl_string[BLIS_NUM_THREAD_IMPLS] =
 	[BLIS_SINGLE] = "single",
 	[BLIS_OPENMP] = "openmp",
 	[BLIS_POSIX]  = "pthreads",
+	[BLIS_HPX]    = "hpx",
 };
 
 const char* bli_thread_get_thread_impl_str( timpl_t ti )
@@ -1713,6 +1720,7 @@ void bli_thread_init_rntm_from_env
 		else if ( !strncmp( ti_env, "pthreads", 8 ) ) ti = BLIS_POSIX;
 		else if ( !strncmp( ti_env, "pthread",  7 ) ) ti = BLIS_POSIX;
 		else if ( !strncmp( ti_env, "posix",    5 ) ) ti = BLIS_POSIX;
+		else if ( !strncmp( ti_env, "hpx",      3 ) ) ti = BLIS_HPX;
 		else                                          ti = BLIS_SINGLE;
 
 		#ifdef PRINT_IMPL
@@ -1732,6 +1740,9 @@ void bli_thread_init_rntm_from_env
 		#ifdef BLIS_ENABLE_PTHREADS_AS_DEFAULT
 		ti = BLIS_POSIX;
 		#endif
+		#ifdef BLIS_ENABLE_HPX_AS_DEFAULT
+		ti = BLIS_HPX;
+		#endif
 
 		#ifdef PRINT_IMPL
 		printf( "BLIS_THREAD_IMPL unset; defaulting to BLIS_THREAD_IMPL=%s.\n",
diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h
index 821e2fe7c..e61fc8b89 100644
--- a/frame/thread/bli_thread.h
+++ b/frame/thread/bli_thread.h
@@ -49,6 +49,7 @@ typedef void (*thread_func_t)( thrcomm_t* gl_comm, dim_t tid, const void* params
 // Include threading implementations.
 #include "bli_thread_openmp.h"
 #include "bli_thread_pthreads.h"
+#include "bli_thread_hpx.h"
 #include "bli_thread_single.h"
 
 // Initialization-related prototypes.
diff --git a/frame/thread/bli_thread_hpx.cpp b/frame/thread/bli_thread_hpx.cpp
new file mode 100644
index 000000000..38c92481d
--- /dev/null
+++ b/frame/thread/bli_thread_hpx.cpp
@@ -0,0 +1,85 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022 Tactical Computing Laboratories, LLC
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_HPX
+
+#include <hpx/local/execution.hpp>
+#include <hpx/parallel/algorithms/for_each.hpp>
+#include <hpx/hpx_start.hpp>
+
+extern "C"
+{
+// Run func once per thread id via HPX, sharing one global communicator.
+void bli_thread_launch_hpx
+     (
+             dim_t         n_threads,
+             thread_func_t func,
+       const void*         params
+     )
+{
+	const timpl_t ti = BLIS_HPX;
+
+	// Allocate a global communicator for the root thrinfo_t structures.
+	pool_t*    gl_comm_pool = nullptr;
+	thrcomm_t* gl_comm      = bli_thrcomm_create( ti, gl_comm_pool, n_threads );
+
+	auto irange = hpx::util::detail::make_counting_shape(n_threads); // presumably ids 0..n_threads-1 -- TODO confirm
+
+	hpx::for_each(hpx::execution::par, hpx::util::begin(irange), hpx::util::end(irange),
+	[&gl_comm, &func, &params](const dim_t tid)
+	{
+		func( gl_comm, tid, params );
+	}); // NOTE(review): captures are by reference; assumes for_each(par) returns only after all calls finish -- confirm.
+
+	// Free the global communicator, because the root thrinfo_t node
+	// never frees its communicator.
+	bli_thrcomm_free( gl_comm_pool, gl_comm );
+}
+
+void bli_thread_initialize_hpx( int argc, char** argv )
+{
+    hpx::start( nullptr, argc, argv );
+}
+
+int bli_thread_finalize_hpx()
+{
+    hpx::apply([]() { hpx::finalize(); });
+    return hpx::stop();
+}
+
+} // extern "C"
+
+#endif
diff --git a/frame/thread/bli_thread_hpx.h b/frame/thread/bli_thread_hpx.h
new file mode 100644
index 000000000..55d2758a9
--- /dev/null
+++ b/frame/thread/bli_thread_hpx.h
@@ -0,0 +1,54 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022 Tactical Computing Laboratories, LLC
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_THREAD_HPX_H
+#define BLIS_THREAD_HPX_H
+
+// Definitions specific to situations when HPX multithreading is enabled.
+#ifdef BLIS_ENABLE_HPX
+
+void bli_thread_launch_hpx
+     (
+             dim_t         nt,
+             thread_func_t func,
+       const void*         params
+     );
+
+void bli_thread_initialize_hpx( int argc, char** argv );
+
+int bli_thread_finalize_hpx();
+
+#endif
+
+#endif
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index aec9357ae..7ca314c5f 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -66,6 +66,10 @@ int main( int argc, char** argv )
 	test_params_t params;
 	test_ops_t    ops;
 
+#ifdef BLIS_ENABLE_HPX
+    bli_thread_initialize_hpx( 1, argv );
+#endif
+
 	// Initialize libblis.
 	//bli_init();
 
@@ -88,8 +92,12 @@ int main( int argc, char** argv )
 	// Finalize libblis.
 	bli_finalize();
 
+#ifdef BLIS_ENABLE_HPX
+    return bli_thread_finalize_hpx();
+#else
 	// Return peacefully.
 	return 0;
+#endif
 }
 
 
@@ -782,26 +790,34 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 
 	const bool    has_openmp      = bli_info_get_enable_openmp();
 	const bool    has_pthreads    = bli_info_get_enable_pthreads();
+	const bool    has_hpx         = bli_info_get_enable_hpx();
 	const bool    openmp_is_def   = bli_info_get_enable_openmp_as_default();
 	const bool    pthreads_is_def = bli_info_get_enable_pthreads_as_default();
+	const bool    hpx_is_def      = bli_info_get_enable_hpx_as_default();
 	const timpl_t ti              = bli_thread_get_thread_impl();
 
 	// List the available threading implementation(s).
-	if      ( has_openmp && has_pthreads   ) sprintf( impl_str, "openmp,pthreads,single" );
-	else if ( has_openmp                   ) sprintf( impl_str, "openmp,single" );
-	else if (               has_pthreads   ) sprintf( impl_str, "pthreads,single" );
-	else                                     sprintf( impl_str, "single only" );
+	if      ( has_hpx && has_openmp && has_pthreads   ) sprintf( impl_str, "openmp,pthreads,hpx,single" );
+	else if ( has_hpx && has_openmp                   ) sprintf( impl_str, "openmp,hpx,single" );
+	else if ( has_hpx &&               has_pthreads   ) sprintf( impl_str, "pthreads,hpx,single" );
+	else if ( has_hpx                                 ) sprintf( impl_str, "hpx,single" );
+	else if (            has_openmp && has_pthreads   ) sprintf( impl_str, "openmp,pthreads,single" );
+	else if (            has_openmp                   ) sprintf( impl_str, "openmp,single" );
+	else if (                          has_pthreads   ) sprintf( impl_str, "pthreads,single" );
+	else                                                sprintf( impl_str, "single only" );
 
 	// Describe the default threading implementation that would be active if
 	// or when BLIS_THREAD_IMPL is unset.
 	if      ( openmp_is_def   ) sprintf( def_impl_unset_str, "openmp" );
 	else if ( pthreads_is_def ) sprintf( def_impl_unset_str, "pthreads" );
+	else if ( hpx_is_def      ) sprintf( def_impl_unset_str, "hpx" );
 	else                        sprintf( def_impl_unset_str, "single" );
 
 	// Describe the default threading implementation as the testsuite was
 	// currently run.
 	if      ( ti == BLIS_OPENMP ) sprintf( def_impl_set_str, "openmp" );
 	else if ( ti == BLIS_POSIX  ) sprintf( def_impl_set_str, "pthreads" );
+	else if ( ti == BLIS_HPX    ) sprintf( def_impl_set_str, "hpx" );
 	else                          sprintf( def_impl_set_str, "single" );
 
 	// Describe the status of jrir thread partitioning.

From f0337b784d164ae505ca0e11277a1155680500d1 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Sun, 13 Nov 2022 21:36:47 -0600
Subject: [PATCH 109/230] Trivial whitespace/comment tweaks.

Details:
- Trivial whitespace and comment changes, most of which ideally would
  have been part of the previous commit pertaining to HPX (2b05948).
---
 common.mk                             |  9 ++++-
 configure                             | 56 +++++++++++++--------------
 frame/include/bli_config_macro_defs.h |  2 +-
 frame/thread/bli_thrcomm.h            |  3 +-
 frame/thread/bli_thread_hpx.cpp       |  6 +--
 testsuite/src/test_libblis.c          |  4 +-
 6 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/common.mk b/common.mk
index 119d09e87..6b7403afb 100644
--- a/common.mk
+++ b/common.mk
@@ -256,6 +256,7 @@ files-that-dont-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)
 rm-dups = $(if $1,$(firstword $1) $(call rm-dups,$(filter-out $(firstword $1),$1)))
 
 
+
 #
 # --- Include makefile configuration file --------------------------------------
 #
@@ -440,6 +441,7 @@ ADDON_FRAG_PATH    := ./obj/$(CONFIG_NAME)/$(ADDON_DIR)
 SANDBOX_FRAG_PATH  := ./obj/$(CONFIG_NAME)/$(SANDBOX_DIR)
 
 
+
 #
 # --- Library name and local paths ---------------------------------------------
 #
@@ -515,6 +517,8 @@ else
 LIBBLIS_SO_OUTPUT_NAME := $(LIBBLIS_SO_PATH)
 endif
 
+
+
 #
 # --- Utility program definitions ----------------------------------------------
 #
@@ -644,6 +648,7 @@ endif
 endif
 
 
+
 #
 # --- Include makefile definitions file ----------------------------------------
 #
@@ -826,7 +831,7 @@ $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CXXLANGFLAGS,$(c))
 
 # Enable clock_gettime() in time.h.
 CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
-# Enable ip_mreq on macOS which is needed for ASIO which is needed for HPX
+# Enable ip_mreq on macOS which is needed for ASIO which is needed for HPX.
 ifeq ($(OS_NAME),Darwin)
 CPPROCFLAGS += -D_DARWIN_C_SOURCE
 endif
@@ -891,7 +896,7 @@ LDFLAGS      += $(LIBPTHREAD)
 endif
 endif
 
-# Threading flags for HPX
+# Threading flags for HPX.
 ifneq ($(findstring hpx,$(THREADING_MODEL)),)
 HPX_CXXFLAGS := $(shell pkg-config --cflags hpx_component)
 HPX_LDFLAGS  := $(filter-out -shared,$(shell pkg-config --libs hpx_component))
diff --git a/configure b/configure
index f808134d3..286a66123 100755
--- a/configure
+++ b/configure
@@ -147,8 +147,8 @@ print_usage()
 	echo "   --enable-rpath, --disable-rpath"
 	echo " "
 	echo "                 Enable (disabled by default) setting an install_name for"
-    echo "                 dynamic libraries on macOS which starts with @rpath rather"
-    echo "                 than the absolute install path."
+	echo "                 dynamic libraries on macOS which starts with @rpath rather"
+	echo "                 than the absolute install path."
 	echo " "
 	echo "   -e SYMBOLS, --export-shared[=SYMBOLS]"
 	echo " "
@@ -175,17 +175,17 @@ print_usage()
 	echo "                 BLIS, and the choice of which to use will be determined at"
 	echo "                 runtime. If the user does not express a preference (by"
 	echo "                 setting the BLIS_THREAD_IMPL environment variable to"
-	echo "                 'single', 'openmp', 'pthreads', or 'hpx'; by calling the global"
-	echo "                 runtime API bli_thread_set_thread_impl(); or by encoding a"
-	echo "                 choice on a per-call basis within a rntm_t passed into the"
-	echo "                 expert API), then the first model listed in MODEL will be"
-	echo "                 used by default. Note that 'single' is silently appended"
-	echo "                 to whatever the user specifies in MODEL, meaning that"
-	echo "                 single-threaded functionality will always be available,"
-	echo "                 even if it is not requested and even if it is not enabled"
-	echo "                 by default. Even --disable-threading is actually shorthand"
-	echo "                 for --enable-threading=single (which is the default when"
-	echo "                 the option is not specified)."
+	echo "                 'single', 'openmp', 'pthreads', or 'hpx'; by calling the"
+	echo "                 global runtime API bli_thread_set_thread_impl(); or by"
+	echo "                 encoding a choice on a per-call basis within a rntm_t"
+	echo "                 passed into the expert API), then the first model listed"
+	echo "                 in MODEL will be used by default. Note that 'single' is"
+	echo "                 silently appended to whatever the user specifies in MODEL,"
+	echo "                 meaning that single-threaded functionality will always be"
+	echo "                 available, even if it is not requested and even if it is"
+	echo "                 not enabled by default. Even --disable-threading is"
+	echo "                 actually shorthand for --enable-threading=single (which is"
+	echo "                 the default when the option is not specified)."
 	echo " "
 	echo "   --enable-system, --disable-system"
 	echo " "
@@ -1262,9 +1262,9 @@ has_libmemkind()
 	# Depending on the return code from the compile step above, we set
 	# enable_memkind accordingly.
 	if [ "$?" == 0 ]; then
-	    rval='yes'
+		rval='yes'
 	else
-	    rval='no'
+		rval='no'
 	fi
 
 	# Remove the executable generated above.
@@ -1292,9 +1292,9 @@ has_pragma_omp_simd()
 	# Depending on the return code from the compile step above, we set
 	# enable_memkind accordingly.
 	if [ "$?" == 0 ]; then
-	    rval='yes'
+		rval='yes'
 	else
-	    rval='no'
+		rval='no'
 	fi
 
 	# Remove the executable generated above.
@@ -1514,11 +1514,11 @@ get_compiler_version()
 	# Begin parsing cc_vendor for the version string.
 
 	if [ "${cc_vendor}" = "GCC" ]; then
-	     # Conda gcc sometimes has GCC (all caps) in the version string
+		# Conda gcc sometimes has GCC (all caps) in the version string
 		cc_vendor="gcc"
 	fi
 	if [ "${cc_vendor}" = "crosstool-NG" ]; then
-	     # Treat compilers built by crosstool-NG (for eg: conda) as gcc.
+		# Treat compilers built by crosstool-NG (for eg: conda) as gcc.
 		cc_vendor="gcc"
 	fi
 	if [ "${cc_vendor}" = "icc" -o \
@@ -1561,7 +1561,7 @@ get_compiler_version()
 			cc_version=$(echo "${vendor_string}" \
 			             | egrep -o 'AOCC.LLVM.[0-9]+\.[0-9]+\.?[0-9]*' \
 			             | egrep -o           '[0-9]+\.[0-9]+\.?[0-9]*' \
-		                 | { read first rest ; echo $first ; })
+			             | { read first rest ; echo $first ; })
 		else
 
 			# Grep for the AOCC_x.y.z substring first, and then isolate the
@@ -1572,7 +1572,7 @@ get_compiler_version()
 			cc_version=$(echo "${vendor_string}" \
 			             | egrep -o 'AOCC_[0-9]+\.[0-9]+\.?[0-9]*' \
 			             | egrep -o      '[0-9]+\.[0-9]+\.?[0-9]*' \
-		                 | { read first rest ; echo $first ; })
+			             | { read first rest ; echo $first ; })
 		fi
 
 	elif [ "${cc_vendor}" = "oneAPI" ]; then
@@ -2025,9 +2025,9 @@ try_assemble()
 	${cc} ${cflags} -c ${asm_src} -o ${asm_bin} > /dev/null 2>&1
 
 	if [ "$?" == 0 ]; then
-	    rval='yes'
+		rval='yes'
 	else
-	    rval='no'
+		rval='no'
 	fi
 
 	# Remove the object file.
@@ -2501,7 +2501,7 @@ main()
 	enable_arg_max_hack='no'
 	enable_static='yes'
 	enable_shared='yes'
-    enable_rpath='no'
+	enable_rpath='no'
 	export_shared='public'
 	enable_pba_pools='yes'
 	enable_sba_pools='yes'
@@ -2944,7 +2944,7 @@ main()
 	get_binutils_version
 	check_assembler
 
-    # Check if there is any incompatibility due to the operating system.
+	# Check if there is any incompatibility due to the operating system.
 	check_os
 
 	# Remove duplicates and whitespace from the blacklist.
@@ -3473,7 +3473,7 @@ main()
 	if [ "x${enable_asan}" = "xyes" ]; then
 		echo "${script_name}: enabling AddressSanitizer support (except for optimized kernels)."
 	else
-        enable_asan='no'
+		enable_asan='no'
 		echo "${script_name}: AddressSanitizer support disabled."
 	fi
 
@@ -3665,7 +3665,7 @@ main()
 
 		elif [ "x${word}" = "xhpx" ]; then
 
-			echo "${script_name}: enabling support for threading via hpx."
+			echo "${script_name}: enabling support for threading via HPX."
 			enable_hpx='yes'
 			enable_hpx_01=1
 
@@ -3705,7 +3705,7 @@ main()
 
 	fi
 
-	# If either OpenMP or pthreads was enabled, given that single-threaded mode is
+	# If OpenMP, pthreads, or HPX was enabled, given that single-threaded mode is
 	# also always enabled, remind the user which one will serve as the default
 	# (that is, absent any explicit choice at runtime).
 	if [ "x${enable_openmp}"   = "xyes" ] ||
diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h
index 633d7f671..bf9319f4f 100644
--- a/frame/include/bli_config_macro_defs.h
+++ b/frame/include/bli_config_macro_defs.h
@@ -94,7 +94,7 @@
 // or pthreads are enabled. This macro is useful in situations when
 // we want to detect use of either OpenMP or pthreads, or both (as
 // opposed to neither being used).
-#if defined ( BLIS_ENABLE_OPENMP ) || \
+#if defined ( BLIS_ENABLE_OPENMP )   || \
     defined ( BLIS_ENABLE_PTHREADS ) || \
     defined ( BLIS_ENABLE_HPX )
   #define BLIS_ENABLE_MULTITHREADING
diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h
index b65cb0b7a..b55922acd 100644
--- a/frame/thread/bli_thrcomm.h
+++ b/frame/thread/bli_thrcomm.h
@@ -94,13 +94,14 @@ typedef struct thrcomm_s
 	#endif
 	#endif
 
+	// -- Fields specific to HPX --
+
 	#ifdef BLIS_ENABLE_HPX
 	#ifdef BLIS_USE_HPX_BARRIER
 	hpx::barrier<> * barrier;
 	#endif
 	#endif
 
-
 } thrcomm_t;
 
 
diff --git a/frame/thread/bli_thread_hpx.cpp b/frame/thread/bli_thread_hpx.cpp
index 38c92481d..a7818ffd6 100644
--- a/frame/thread/bli_thread_hpx.cpp
+++ b/frame/thread/bli_thread_hpx.cpp
@@ -71,13 +71,13 @@ void bli_thread_launch_hpx
 
 void bli_thread_initialize_hpx( int argc, char** argv )
 {
-    hpx::start( nullptr, argc, argv );
+	hpx::start( nullptr, argc, argv );
 }
 
 int bli_thread_finalize_hpx()
 {
-    hpx::apply([]() { hpx::finalize(); });
-    return hpx::stop();
+	hpx::apply([]() { hpx::finalize(); });
+	return hpx::stop();
 }
 
 } // extern "C"
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index 7ca314c5f..851102a2f 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -67,7 +67,7 @@ int main( int argc, char** argv )
 	test_ops_t    ops;
 
 #ifdef BLIS_ENABLE_HPX
-    bli_thread_initialize_hpx( 1, argv );
+	bli_thread_initialize_hpx( 1, argv );
 #endif
 
 	// Initialize libblis.
@@ -93,7 +93,7 @@ int main( int argc, char** argv )
 	bli_finalize();
 
 #ifdef BLIS_ENABLE_HPX
-    return bli_thread_finalize_hpx();
+	return bli_thread_finalize_hpx();
 #else
 	// Return peacefully.
 	return 0;

From db10dd8e11a12d85017f84455558a82c0093b1da Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 29 Nov 2022 19:10:31 -0600
Subject: [PATCH 110/230] Fixed _gemm_small() prototype; disabled gemm_small.

Details:
- Fixed a mismatch between the prototype for bli_gemm_small() in
  bli_gemm_front.h and the actual definition of bli_gemm_small() in
  kernels/zen/3/bli_gemm_small.c. The former was erroneously declaring
  the cntl_t* argument as 'const'. Thanks to Jeff Diamond for reporting
  this issue.
- Commented out BLIS_ENABLE_SMALL_MATRIX, BLIS_ENABLE_SMALL_MATRIX_TRSM
  macro definitions in config/zen3/bli_family_zen3.h. AMD's small matrix
  implementation should probably remain disabled in vanilla BLIS, at
  least for now.
---
 config/zen3/bli_family_zen3.h | 4 ++--
 frame/3/gemm/bli_gemm_front.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/config/zen3/bli_family_zen3.h b/config/zen3/bli_family_zen3.h
index 661313ca9..d03e2edc7 100644
--- a/config/zen3/bli_family_zen3.h
+++ b/config/zen3/bli_family_zen3.h
@@ -52,8 +52,8 @@
 // All zen3 specific code should be included in this macro
 #define BLIS_CONFIG_ZEN3
 
-#define BLIS_ENABLE_SMALL_MATRIX
-#define BLIS_ENABLE_SMALL_MATRIX_TRSM
+//#define BLIS_ENABLE_SMALL_MATRIX
+//#define BLIS_ENABLE_SMALL_MATRIX_TRSM
 
 
 // This will select the threshold below which small matrix code will be called.
diff --git a/frame/3/gemm/bli_gemm_front.h b/frame/3/gemm/bli_gemm_front.h
index 9465c37d9..3acf29cfb 100644
--- a/frame/3/gemm/bli_gemm_front.h
+++ b/frame/3/gemm/bli_gemm_front.h
@@ -52,7 +52,7 @@ err_t bli_gemm_small
        const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
-       const cntl_t* cntl
+             cntl_t* cntl
      );
 #endif
 

From 4833ba224eba54df3f349bcb7e188bcc53442449 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 12 Dec 2022 20:26:02 -0600
Subject: [PATCH 111/230] Fixed perf of mt sup with packing, and mt gemmlike.
 (#696)

Details:
- Brought the gemmsup code path up to date relative to the latest
  thrinfo_t semantics introduced in the October Omnibus commit
  (aeb5f0c). This was done by passing the prenode (instead of the
  current node) into the packm variant within bli_l3_sup_packm.c as well
  as creating the prenodes and attaching them to the thrinfo_t tree in
  bli_l3_sup_thrinfo_create(). These changes erase the performance
  degradation introduced in the omnibus when running multithreaded sup
  with optional packing enabled. Special thanks to Devin Matthews for
  sussing out this fix in short order.
- Fixed the gemmlike sandbox in a manner similar to that of sup with
  packing, described above. This also involved passing the prenode into
  the local gemmlike packm variant. (Recall that gemmlike recycles the
  use of bli_l3_sup_thrinfo_create(), so it automatically inherits that
  part of the sup fix described above.)
- Updated bls_l3_packm_var[123].c to use bli_thrinfo_n_way() and
  bli_thrinfo_work_id() instead of bli_thrinfo_num_threads() and
  bli_thrinfo_thread_id(), respectively.
---
 frame/3/bli_l3_sup_packm.c           | 4 ++--
 frame/3/bli_l3_thrinfo.c             | 9 +++++++++
 sandbox/gemmlike/bls_l3_packm_a.c    | 2 +-
 sandbox/gemmlike/bls_l3_packm_b.c    | 2 +-
 sandbox/gemmlike/bls_l3_packm_var1.c | 4 ++--
 sandbox/gemmlike/bls_l3_packm_var2.c | 4 ++--
 sandbox/gemmlike/bls_l3_packm_var3.c | 4 ++--
 7 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/frame/3/bli_l3_sup_packm.c b/frame/3/bli_l3_sup_packm.c
index 797335aeb..890980da3 100644
--- a/frame/3/bli_l3_sup_packm.c
+++ b/frame/3/bli_l3_sup_packm.c
@@ -394,7 +394,7 @@ void bli_packm_sup
 			  ( void* )a,  rs_a,  cs_a,
 			          *p, *rs_p, *cs_p,
 			  ( cntx_t* )cntx,
-			  thread
+			  bli_thrinfo_sub_prenode( thread )
 			);
 		}
 		else // if ( schema == BLIS_PACKED_ROW_PANELS )
@@ -415,7 +415,7 @@ void bli_packm_sup
 			          *p, *rs_p, *cs_p,
 			               pd_p, *ps_p,
 			  ( cntx_t* )cntx,
-			  thread
+			  bli_thrinfo_sub_prenode( thread )
 			);
 		}
 
diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c
index 0b45abbf6..95d2a5439 100644
--- a/frame/3/bli_l3_thrinfo.c
+++ b/frame/3/bli_l3_thrinfo.c
@@ -150,6 +150,15 @@ thrinfo_t* bli_l3_sup_thrinfo_create
 	thrinfo_t* thread_jr = bli_thrinfo_split( n_way_jr, thread_pa );
 	thrinfo_t* thread_ir = bli_thrinfo_split( n_way_ir, thread_jr );
 
+	const dim_t n_way_pb = bli_thrinfo_num_threads( thread_pb );
+	const dim_t n_way_pa = bli_thrinfo_num_threads( thread_pa );
+
+	// Create and set the prenodes for the packb and packa thrinfo_t nodes.
+	thrinfo_t* thread_pb_single = bli_thrinfo_split( n_way_pb, thread_pb );
+	thrinfo_t* thread_pa_single = bli_thrinfo_split( n_way_pa, thread_pa );
+	bli_thrinfo_set_sub_prenode( thread_pb_single, thread_pb );
+	bli_thrinfo_set_sub_prenode( thread_pa_single, thread_pa );
+
 	bli_thrinfo_set_sub_node( thread_jc,      root );
 	bli_thrinfo_set_sub_node( thread_pc, thread_jc );
 	bli_thrinfo_set_sub_node( thread_pb, thread_pc );
diff --git a/sandbox/gemmlike/bls_l3_packm_a.c b/sandbox/gemmlike/bls_l3_packm_a.c
index 412c6c24e..742c78bfb 100644
--- a/sandbox/gemmlike/bls_l3_packm_a.c
+++ b/sandbox/gemmlike/bls_l3_packm_a.c
@@ -276,7 +276,7 @@ void PASTECH2(bls_,ch,opname) \
 	  *p, *rs_p, *cs_p, \
 	       pd_p, *ps_p, \
 	  cntx, \
-	  thread  \
+	  bli_thrinfo_sub_prenode( thread )  \
 	); \
 \
 	/* Barrier so that packing is done before computation. */ \
diff --git a/sandbox/gemmlike/bls_l3_packm_b.c b/sandbox/gemmlike/bls_l3_packm_b.c
index cc9757b1d..db6bca8fc 100644
--- a/sandbox/gemmlike/bls_l3_packm_b.c
+++ b/sandbox/gemmlike/bls_l3_packm_b.c
@@ -276,7 +276,7 @@ void PASTECH2(bls_,ch,opname) \
 	  *p, *rs_p, *cs_p, \
 	       pd_p, *ps_p, \
 	  cntx, \
-	  thread  \
+	  bli_thrinfo_sub_prenode( thread )  \
 	); \
 \
 	/* Barrier so that packing is done before computation. */ \
diff --git a/sandbox/gemmlike/bls_l3_packm_var1.c b/sandbox/gemmlike/bls_l3_packm_var1.c
index e4d566b44..7c2c4e9a9 100644
--- a/sandbox/gemmlike/bls_l3_packm_var1.c
+++ b/sandbox/gemmlike/bls_l3_packm_var1.c
@@ -121,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \
 \
 	/* Query the number of threads and thread ids from the current thread's
 	   packm thrinfo_t node. */ \
-	const dim_t nt  = bli_thrinfo_num_threads( thread ); \
-	const dim_t tid = bli_thrinfo_thread_id( thread ); \
+	const dim_t nt  = bli_thrinfo_n_way( thread ); \
+	const dim_t tid = bli_thrinfo_work_id( thread ); \
 \
 	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
 	( void )nt; \
diff --git a/sandbox/gemmlike/bls_l3_packm_var2.c b/sandbox/gemmlike/bls_l3_packm_var2.c
index 3e7e7888a..94ee0efcd 100644
--- a/sandbox/gemmlike/bls_l3_packm_var2.c
+++ b/sandbox/gemmlike/bls_l3_packm_var2.c
@@ -121,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \
 \
 	/* Query the number of threads and thread ids from the current thread's
 	   packm thrinfo_t node. */ \
-	const dim_t nt  = bli_thrinfo_num_threads( thread ); \
-	const dim_t tid = bli_thrinfo_thread_id( thread ); \
+	const dim_t nt  = bli_thrinfo_n_way( thread ); \
+	const dim_t tid = bli_thrinfo_work_id( thread ); \
 \
 	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
 	( void )nt; \
diff --git a/sandbox/gemmlike/bls_l3_packm_var3.c b/sandbox/gemmlike/bls_l3_packm_var3.c
index 4ccb1828d..48cd6dd60 100644
--- a/sandbox/gemmlike/bls_l3_packm_var3.c
+++ b/sandbox/gemmlike/bls_l3_packm_var3.c
@@ -121,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \
 \
 	/* Query the number of threads and thread ids from the current thread's
 	   packm thrinfo_t node. */ \
-	const dim_t nt  = bli_thrinfo_num_threads( thread ); \
-	const dim_t tid = bli_thrinfo_thread_id( thread ); \
+	const dim_t nt  = bli_thrinfo_n_way( thread ); \
+	const dim_t tid = bli_thrinfo_work_id( thread ); \
 \
 	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
 	( void )nt; \

From 3accacf57d11e9b109339754f91bf22329b6cb6a Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 16 Dec 2022 10:26:33 -0600
Subject: [PATCH 112/230] Skip 1m optimization when forcing hemm_l/symm_l.
 (#697)

Details:
- Fixed a bug in right-sided hemm when:
  - using the 1m method,
  - #defining BLIS_DISABLE_HEMM_RIGHT in the active subconfiguration,
    and
  - the storage of C matches the gemm microkernel IO preference PRIOR to
    the right-sidedness being detected and recast in terms of the left-
    side code path.
  It turns out that bli_gemm_ind_recast_1m_params() was applying its
  optimization (recasting a complex-domain macrokernel calling a 1m
  virtual microkernel to a real-domain macrokernel calling the real-
  domain microkernel) in situations in which it should not have. The
  optimization was silently assuming that the storage of C always
  matched that of the microkernel preference, since the front-end (in
  this case, bli_hemm_front()) would have already had a chance to
  transpose the operation to bring the two into agreement. However, by
  disabling right-sided hemm, we deprive BLIS of that flexibility (as a
  transposed left-sided case would necessarily have to become a right-
  sided case), and thus the assumption was no longer holding in all
  cases. Thanks to Nisanth M P for reporting this bug in Issue #621.
- The aforementioned bug, and its bugfix, also apply to symm when
  BLIS_DISABLE_SYMM_RIGHT is defined.
- Comment updates.
- CREDITS file update.
---
 CREDITS                             |  1 +
 frame/3/gemm/bli_gemm_ker_var2.c    | 12 +++---
 frame/3/gemm/ind/bli_gemm_ind_opt.h | 64 +++++++++++++++++++++--------
 3 files changed, 54 insertions(+), 23 deletions(-)

diff --git a/CREDITS b/CREDITS
index 55c974f1b..939351c00 100644
--- a/CREDITS
+++ b/CREDITS
@@ -74,6 +74,7 @@ but many others have contributed code and feedback, including
                            @nagsingh
   Bhaskar Nallani          @BhaskarNallani     (AMD)
   Stepan Nassyr            @stepannassyr       (Jülich Supercomputing Centre)
+  Nisanth M P              @nisanthmp
   Nisanth Padinharepatt                        (AMD)
   Ajay Panyala             @ajaypanyala
   Marc-Antoine Parent      @maparent           (Conversence)
diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c
index 51dceced2..d59695081 100644
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -131,13 +131,10 @@ void bli_gemm_ker_var2
 	const char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b );
 	const char* beta_cast  = bli_obj_internal_scalar_buffer( c );
 
-	// If 1m is being employed on a column- or row-stored matrix with a
-	// real-valued beta, we can use the real domain macro-kernel, which
-	// eliminates a little overhead associated with the 1m virtual
-	// micro-kernel.
-	// Only employ this optimization if the storage datatype of C is
-	// equal to the execution/computation datatype.
 #if 1
+	// Under certain conditions, we can avoid the overhead of calling the 1m
+	// virtual microkernel by having the real-domain macrokernel execute with
+	// the real-domain microkernel. (See the function definition for details.)
 	if ( bli_cntx_method( cntx ) == BLIS_1M )
 	{
 		bli_gemm_ind_recast_1m_params
@@ -149,7 +146,8 @@ void bli_gemm_ker_var2
 		  &m, &n, &k,
 		  &pd_a, &ps_a,
 		  &pd_b, &ps_b,
-		  &rs_c, &cs_c
+		  &rs_c, &cs_c,
+		  cntx
 		);
 	}
 #endif
diff --git a/frame/3/gemm/ind/bli_gemm_ind_opt.h b/frame/3/gemm/ind/bli_gemm_ind_opt.h
index 789d5895c..a57325580 100644
--- a/frame/3/gemm/ind/bli_gemm_ind_opt.h
+++ b/frame/3/gemm/ind/bli_gemm_ind_opt.h
@@ -34,27 +34,59 @@
 
 BLIS_INLINE void bli_gemm_ind_recast_1m_params
      (
-             num_t* dt_exec,
-             num_t* dt_c,
-             pack_t schema_a,
-       const obj_t* c,
-             dim_t* m,
-             dim_t* n,
-             dim_t* k,
-             inc_t* pd_a, inc_t* ps_a,
-             inc_t* pd_b, inc_t* ps_b,
-             inc_t* rs_c, inc_t* cs_c
+             num_t*  dt_exec,
+             num_t*  dt_c,
+             pack_t  schema_a,
+       const obj_t*  c,
+             dim_t*  m,
+             dim_t*  n,
+             dim_t*  k,
+             inc_t*  pd_a, inc_t* ps_a,
+             inc_t*  pd_b, inc_t* ps_b,
+             inc_t*  rs_c, inc_t* cs_c,
+       const cntx_t* cntx
      )
 {
 	obj_t beta;
 
-	/* Detach the beta scalar from c so that we can test its imaginary
-	   component. */
+	// Detach the beta scalar from c so that we can test its imaginary
+	// component.
 	bli_obj_scalar_detach( c, &beta );
 
-	/* If beta is in the real domain, and c is row- or column-stored,
-	   then we may proceed with the optimization. */
-	if ( bli_obj_imag_is_zero( &beta ) &&
+#if 1
+	// Determine whether the storage of C matches the IO preference of the
+	// microkernel. (We cannot utilize the optimization below if there is a
+	// mismatch.)
+	const ukr_t ukr_id     = BLIS_GEMM_VIR_UKR;
+
+	const bool  row_stored = bli_is_row_stored( *rs_c, *cs_c );
+	const bool  col_stored = !row_stored;
+	const bool  row_pref   = bli_cntx_ukr_prefers_rows_dt( *dt_c, ukr_id, cntx );
+	const bool  col_pref   = !row_pref;
+
+	const bool  is_match   = ( row_stored && row_pref ) ||
+	                         ( col_stored && col_pref );
+#else
+	// This was the previous behavior, which resulted in buggy behavior
+	// when executing right-sided hemm, and:
+	// - the 1m method is enabled,
+	// - BLIS_DISABLE_HEMM_RIGHT is #defined, and
+	// - the storage of C matches the microkernel IO preference PRIOR to
+	//   detecting the right-sidedness of the operation.
+	// See Issue #621 for details.
+	const bool is_match = TRUE;
+#endif
+
+	// If (a) the storage of C matches the IO pref of the ukernel, (b) beta is
+	// in the real domain, and (c) C is row- or column-stored, then we may
+	// proceed with the optimization below, which allows 1m to be induced by
+	// executing the real-domain macrokernel with the real-domain microkernel
+	// plus a few tweaked parameters. Otherwise, we must skip the optimization
+	// and allow 1m to execute via the complex-domain macrokernel calling the
+	// 1m virtual microkernel function, which will incur a little extra
+	// overhead.
+	if ( is_match &&
+	     bli_obj_imag_is_zero( &beta ) &&
 	     !bli_is_gen_stored( *rs_c, *cs_c ) )
 	{
 		*dt_exec = bli_dt_proj_to_real( *dt_exec );
@@ -69,7 +101,7 @@ BLIS_INLINE void bli_gemm_ind_recast_1m_params
 			*pd_b *= 1; *ps_b *= 2;
 			*rs_c *= 1; *cs_c *= 2;
 		}
-		else /* if ( bli_is_1r_packed( schema_a ) ) */
+		else // if ( bli_is_1r_packed( schema_a ) )
 		{
 			*m    *= 1;
 			*n    *= 2;

From 7d23dc2a064a371dc9883e2c2c7236a70912428c Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Sun, 25 Dec 2022 19:09:14 -0600
Subject: [PATCH 113/230] Fix a race condition which manifested as incorrect
 results (rarely). (#702)

The problem occurs when there are at least two teams of threads packing different parts of a matrix, and where each team has at least two threads; call them team A and team B. The problematic sequence is:

1. The chief of team A checks out a block B and broadcasts the pointer to its teammates.
2. Team A completely packs its data and performs a barrier among its members.
3. Team A commences computing with the packed data.
4. The chief of team A finishes computing before its teammates, then calls bli_thrinfo_free on its thrinfo_t struct (which contains the mem_t object referencing the buffer B). This causes buffer B to be checked back in to the pba.
5. The chief of team B checks out the *same* block B that was just checked back in and broadcasts the pointer to its teammates.
6. DATA RACE: now the remaining threads of team A are reading *while* team B is writing to the same buffer B. If team B writes new data before team A is done computing, then an incorrect result is generated.

The solution is to place a global barrier before the call to bli_thrinfo_free at the end of the computation.

Co-authored-by: Field G. Van Zee <field@cs.utexas.edu>
---
 frame/3/bli_l3_decor.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/frame/3/bli_l3_decor.c b/frame/3/bli_l3_decor.c
index 4160751e6..88ec5def9 100644
--- a/frame/3/bli_l3_decor.c
+++ b/frame/3/bli_l3_decor.c
@@ -114,6 +114,11 @@ static void bli_l3_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, const
 	bli_l3_cntl_free( sba_pool, cntl_use );
 
 	// Free the current thread's thrinfo_t structure.
+	// NOTE: The barrier here is very important as it prevents memory being
+	// released by the chief of some thread sub-group before its peers are done
+	// using it. See PR #702 for more info [1].
+	// [1] https://github.com/flame/blis/pull/702
+	bli_thrinfo_barrier( thread );
 	bli_thrinfo_free( thread );
 }
 

From 538150c5845ad903773ca797c740048174116aa4 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Sun, 25 Dec 2022 22:28:09 -0600
Subject: [PATCH 114/230] Applied race condition fix to sup thread decorator.

Details:
- Applied the race condition bugfix in commit 7d23dc2 to the
  corresponding sup code in bli_l3_sup_decor.c. Note that in the case
  of sup, the race condition would have only manifested when optional
  packing was enabled at runtime (typically via setting BLIS_PACK_A
  and/or BLIS_PACK_B environment variables).
- Both the fix in this commit and the fix in 7d23dc2 address bugs
  that were introduced when the thrinfo_t trees/communicators were
  restructured in the October omnibus commit (aeb5f0c).
---
 frame/3/bli_l3_sup_decor.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/frame/3/bli_l3_sup_decor.c b/frame/3/bli_l3_sup_decor.c
index 5f415ac50..7cda8bdca 100644
--- a/frame/3/bli_l3_sup_decor.c
+++ b/frame/3/bli_l3_sup_decor.c
@@ -85,6 +85,11 @@ static void bli_l3_sup_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, co
 	);
 
 	// Free the current thread's thrinfo_t structure.
+	// NOTE: The barrier here is very important as it prevents memory being
+	// released by the chief of some thread sub-group before its peers are done
+	// using it. See PR #702 for more info [1].
+	// [1] https://github.com/flame/blis/pull/702
+	bli_thrinfo_barrier( thread );
 	bli_thrinfo_free( thread );
 }
 

From f956b79922da412791e4c8b8b846b3aafc0a5ee0 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Sat, 31 Dec 2022 20:18:08 -0600
Subject: [PATCH 115/230] Switch to l3 sup decorator in gemmlike sandbox.
 (#704)

Details:
- Modified the gemmlike sandbox to call bli_l3_sup_thread_decorator()
  rather than a local analogue of that code. This reduces redundant
  logic and makes it easier for the sandbox to inherit future
  improvements to the framework's threading code.
- Moved addon/gemmd to addon/old/gemmd. This code has fallen out of date
  and is taking too much effort to maintain. We will very likely
  reimplement it completely once future changes are made to the
  framework proper.
---
 .../{ => old}/gemmd/attic/bao_gemmd_bp_var2.c |  0
 addon/{ => old}/gemmd/attic/bli_gemm_ex.c     |  0
 addon/{ => old}/gemmd/bao_gemmd.c             |  0
 addon/{ => old}/gemmd/bao_gemmd.h             |  0
 addon/{ => old}/gemmd/bao_gemmd_bp_var1.c     |  0
 addon/{ => old}/gemmd/bao_gemmd_check.c       |  0
 addon/{ => old}/gemmd/bao_gemmd_check.h       |  0
 addon/{ => old}/gemmd/bao_gemmd_var.h         |  0
 addon/{ => old}/gemmd/bao_l3_packm_a.c        |  0
 addon/{ => old}/gemmd/bao_l3_packm_a.h        |  0
 addon/{ => old}/gemmd/bao_l3_packm_b.c        |  0
 addon/{ => old}/gemmd/bao_l3_packm_b.h        |  0
 addon/{ => old}/gemmd/bao_l3_packm_var.h      |  0
 addon/{ => old}/gemmd/bao_l3_packm_var1.c     |  0
 addon/{ => old}/gemmd/bao_l3_packm_var2.c     |  0
 addon/{ => old}/gemmd/bao_packm_cxk.c         |  0
 addon/{ => old}/gemmd/bao_packm_cxk.h         |  0
 addon/{ => old}/gemmd/gemmd.h                 |  0
 addon/{ => old}/gemmd/thread/bao_l3_decor.c   |  0
 addon/{ => old}/gemmd/thread/bao_l3_decor.h   |  0
 .../gemmd/thread/bao_l3_decor_openmp.c        |  0
 .../gemmd/thread/bao_l3_decor_openmp.h        |  0
 .../gemmd/thread/bao_l3_decor_pthreads.c      |  0
 .../gemmd/thread/bao_l3_decor_pthreads.h      |  0
 .../gemmd/thread/bao_l3_decor_single.c        |  0
 .../gemmd/thread/bao_l3_decor_single.h        |  0
 sandbox/gemmlike/bli_sandbox.h                |  2 --
 sandbox/gemmlike/bls_gemm.c                   | 35 ++++++++++---------
 sandbox/gemmlike/bls_gemm.h                   | 18 +++++-----
 sandbox/gemmlike/bls_gemm_bp_var1.c           | 20 +++++------
 sandbox/gemmlike/bls_gemm_var.h               | 16 ++++-----
 sandbox/gemmlike/{ => old}/bls_l3_decor.c     |  0
 sandbox/gemmlike/{ => old}/bls_l3_decor.h     |  0
 33 files changed, 42 insertions(+), 49 deletions(-)
 rename addon/{ => old}/gemmd/attic/bao_gemmd_bp_var2.c (100%)
 rename addon/{ => old}/gemmd/attic/bli_gemm_ex.c (100%)
 rename addon/{ => old}/gemmd/bao_gemmd.c (100%)
 rename addon/{ => old}/gemmd/bao_gemmd.h (100%)
 rename addon/{ => old}/gemmd/bao_gemmd_bp_var1.c (100%)
 rename addon/{ => old}/gemmd/bao_gemmd_check.c (100%)
 rename addon/{ => old}/gemmd/bao_gemmd_check.h (100%)
 rename addon/{ => old}/gemmd/bao_gemmd_var.h (100%)
 rename addon/{ => old}/gemmd/bao_l3_packm_a.c (100%)
 rename addon/{ => old}/gemmd/bao_l3_packm_a.h (100%)
 rename addon/{ => old}/gemmd/bao_l3_packm_b.c (100%)
 rename addon/{ => old}/gemmd/bao_l3_packm_b.h (100%)
 rename addon/{ => old}/gemmd/bao_l3_packm_var.h (100%)
 rename addon/{ => old}/gemmd/bao_l3_packm_var1.c (100%)
 rename addon/{ => old}/gemmd/bao_l3_packm_var2.c (100%)
 rename addon/{ => old}/gemmd/bao_packm_cxk.c (100%)
 rename addon/{ => old}/gemmd/bao_packm_cxk.h (100%)
 rename addon/{ => old}/gemmd/gemmd.h (100%)
 rename addon/{ => old}/gemmd/thread/bao_l3_decor.c (100%)
 rename addon/{ => old}/gemmd/thread/bao_l3_decor.h (100%)
 rename addon/{ => old}/gemmd/thread/bao_l3_decor_openmp.c (100%)
 rename addon/{ => old}/gemmd/thread/bao_l3_decor_openmp.h (100%)
 rename addon/{ => old}/gemmd/thread/bao_l3_decor_pthreads.c (100%)
 rename addon/{ => old}/gemmd/thread/bao_l3_decor_pthreads.h (100%)
 rename addon/{ => old}/gemmd/thread/bao_l3_decor_single.c (100%)
 rename addon/{ => old}/gemmd/thread/bao_l3_decor_single.h (100%)
 rename sandbox/gemmlike/{ => old}/bls_l3_decor.c (100%)
 rename sandbox/gemmlike/{ => old}/bls_l3_decor.h (100%)

diff --git a/addon/gemmd/attic/bao_gemmd_bp_var2.c b/addon/old/gemmd/attic/bao_gemmd_bp_var2.c
similarity index 100%
rename from addon/gemmd/attic/bao_gemmd_bp_var2.c
rename to addon/old/gemmd/attic/bao_gemmd_bp_var2.c
diff --git a/addon/gemmd/attic/bli_gemm_ex.c b/addon/old/gemmd/attic/bli_gemm_ex.c
similarity index 100%
rename from addon/gemmd/attic/bli_gemm_ex.c
rename to addon/old/gemmd/attic/bli_gemm_ex.c
diff --git a/addon/gemmd/bao_gemmd.c b/addon/old/gemmd/bao_gemmd.c
similarity index 100%
rename from addon/gemmd/bao_gemmd.c
rename to addon/old/gemmd/bao_gemmd.c
diff --git a/addon/gemmd/bao_gemmd.h b/addon/old/gemmd/bao_gemmd.h
similarity index 100%
rename from addon/gemmd/bao_gemmd.h
rename to addon/old/gemmd/bao_gemmd.h
diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/old/gemmd/bao_gemmd_bp_var1.c
similarity index 100%
rename from addon/gemmd/bao_gemmd_bp_var1.c
rename to addon/old/gemmd/bao_gemmd_bp_var1.c
diff --git a/addon/gemmd/bao_gemmd_check.c b/addon/old/gemmd/bao_gemmd_check.c
similarity index 100%
rename from addon/gemmd/bao_gemmd_check.c
rename to addon/old/gemmd/bao_gemmd_check.c
diff --git a/addon/gemmd/bao_gemmd_check.h b/addon/old/gemmd/bao_gemmd_check.h
similarity index 100%
rename from addon/gemmd/bao_gemmd_check.h
rename to addon/old/gemmd/bao_gemmd_check.h
diff --git a/addon/gemmd/bao_gemmd_var.h b/addon/old/gemmd/bao_gemmd_var.h
similarity index 100%
rename from addon/gemmd/bao_gemmd_var.h
rename to addon/old/gemmd/bao_gemmd_var.h
diff --git a/addon/gemmd/bao_l3_packm_a.c b/addon/old/gemmd/bao_l3_packm_a.c
similarity index 100%
rename from addon/gemmd/bao_l3_packm_a.c
rename to addon/old/gemmd/bao_l3_packm_a.c
diff --git a/addon/gemmd/bao_l3_packm_a.h b/addon/old/gemmd/bao_l3_packm_a.h
similarity index 100%
rename from addon/gemmd/bao_l3_packm_a.h
rename to addon/old/gemmd/bao_l3_packm_a.h
diff --git a/addon/gemmd/bao_l3_packm_b.c b/addon/old/gemmd/bao_l3_packm_b.c
similarity index 100%
rename from addon/gemmd/bao_l3_packm_b.c
rename to addon/old/gemmd/bao_l3_packm_b.c
diff --git a/addon/gemmd/bao_l3_packm_b.h b/addon/old/gemmd/bao_l3_packm_b.h
similarity index 100%
rename from addon/gemmd/bao_l3_packm_b.h
rename to addon/old/gemmd/bao_l3_packm_b.h
diff --git a/addon/gemmd/bao_l3_packm_var.h b/addon/old/gemmd/bao_l3_packm_var.h
similarity index 100%
rename from addon/gemmd/bao_l3_packm_var.h
rename to addon/old/gemmd/bao_l3_packm_var.h
diff --git a/addon/gemmd/bao_l3_packm_var1.c b/addon/old/gemmd/bao_l3_packm_var1.c
similarity index 100%
rename from addon/gemmd/bao_l3_packm_var1.c
rename to addon/old/gemmd/bao_l3_packm_var1.c
diff --git a/addon/gemmd/bao_l3_packm_var2.c b/addon/old/gemmd/bao_l3_packm_var2.c
similarity index 100%
rename from addon/gemmd/bao_l3_packm_var2.c
rename to addon/old/gemmd/bao_l3_packm_var2.c
diff --git a/addon/gemmd/bao_packm_cxk.c b/addon/old/gemmd/bao_packm_cxk.c
similarity index 100%
rename from addon/gemmd/bao_packm_cxk.c
rename to addon/old/gemmd/bao_packm_cxk.c
diff --git a/addon/gemmd/bao_packm_cxk.h b/addon/old/gemmd/bao_packm_cxk.h
similarity index 100%
rename from addon/gemmd/bao_packm_cxk.h
rename to addon/old/gemmd/bao_packm_cxk.h
diff --git a/addon/gemmd/gemmd.h b/addon/old/gemmd/gemmd.h
similarity index 100%
rename from addon/gemmd/gemmd.h
rename to addon/old/gemmd/gemmd.h
diff --git a/addon/gemmd/thread/bao_l3_decor.c b/addon/old/gemmd/thread/bao_l3_decor.c
similarity index 100%
rename from addon/gemmd/thread/bao_l3_decor.c
rename to addon/old/gemmd/thread/bao_l3_decor.c
diff --git a/addon/gemmd/thread/bao_l3_decor.h b/addon/old/gemmd/thread/bao_l3_decor.h
similarity index 100%
rename from addon/gemmd/thread/bao_l3_decor.h
rename to addon/old/gemmd/thread/bao_l3_decor.h
diff --git a/addon/gemmd/thread/bao_l3_decor_openmp.c b/addon/old/gemmd/thread/bao_l3_decor_openmp.c
similarity index 100%
rename from addon/gemmd/thread/bao_l3_decor_openmp.c
rename to addon/old/gemmd/thread/bao_l3_decor_openmp.c
diff --git a/addon/gemmd/thread/bao_l3_decor_openmp.h b/addon/old/gemmd/thread/bao_l3_decor_openmp.h
similarity index 100%
rename from addon/gemmd/thread/bao_l3_decor_openmp.h
rename to addon/old/gemmd/thread/bao_l3_decor_openmp.h
diff --git a/addon/gemmd/thread/bao_l3_decor_pthreads.c b/addon/old/gemmd/thread/bao_l3_decor_pthreads.c
similarity index 100%
rename from addon/gemmd/thread/bao_l3_decor_pthreads.c
rename to addon/old/gemmd/thread/bao_l3_decor_pthreads.c
diff --git a/addon/gemmd/thread/bao_l3_decor_pthreads.h b/addon/old/gemmd/thread/bao_l3_decor_pthreads.h
similarity index 100%
rename from addon/gemmd/thread/bao_l3_decor_pthreads.h
rename to addon/old/gemmd/thread/bao_l3_decor_pthreads.h
diff --git a/addon/gemmd/thread/bao_l3_decor_single.c b/addon/old/gemmd/thread/bao_l3_decor_single.c
similarity index 100%
rename from addon/gemmd/thread/bao_l3_decor_single.c
rename to addon/old/gemmd/thread/bao_l3_decor_single.c
diff --git a/addon/gemmd/thread/bao_l3_decor_single.h b/addon/old/gemmd/thread/bao_l3_decor_single.h
similarity index 100%
rename from addon/gemmd/thread/bao_l3_decor_single.h
rename to addon/old/gemmd/thread/bao_l3_decor_single.h
diff --git a/sandbox/gemmlike/bli_sandbox.h b/sandbox/gemmlike/bli_sandbox.h
index f3782b3db..6f33da602 100644
--- a/sandbox/gemmlike/bli_sandbox.h
+++ b/sandbox/gemmlike/bli_sandbox.h
@@ -53,7 +53,5 @@
 
 #include "bls_packm_cxk.h"
 
-#include "bls_l3_decor.h"
-
 
 #endif
diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c
index ba930ebc5..e0fb5bb8a 100644
--- a/sandbox/gemmlike/bls_gemm.c
+++ b/sandbox/gemmlike/bls_gemm.c
@@ -172,16 +172,16 @@ void bls_gemm_ex
 	// Spawn threads (if applicable), where bls_gemm_int() is the thread entry
 	// point function for each thread. This also begins the process of creating
 	// the thrinfo_t tree, which contains thread communicators.
-	bls_l3_thread_decorator
+	bli_l3_sup_thread_decorator
 	(
 	  bls_gemm_int,
 	  BLIS_GEMM, // operation family id
-	  ( obj_t* )alpha,
-	  ( obj_t* )&a_local,
-	  ( obj_t* )&b_local,
-	  ( obj_t* )beta,
-	  ( obj_t* )&c_local,
-	  ( cntx_t* )cntx,
+	  alpha,
+	  &a_local,
+	  &b_local,
+	  beta,
+	  &c_local,
+	  cntx,
 	  &rntm_l
 	);
 }
@@ -190,16 +190,16 @@ void bls_gemm_ex
 // -- Define the gemm-like operation's thread entry point ----------------------
 //
 
-void bls_gemm_int
+err_t bls_gemm_int
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
+       const obj_t*     alpha,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     beta,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const rntm_t*    rntm,
+             thrinfo_t* thread
      )
 {
 	// In this function, we choose the gemm implementation that is executed
@@ -214,9 +214,10 @@ void bls_gemm_int
 	  beta,
 	  c,
 	  cntx,
-	  rntm,
 	  thread
 	);
+
+	return BLIS_SUCCESS;
 }
 
 //
diff --git a/sandbox/gemmlike/bls_gemm.h b/sandbox/gemmlike/bls_gemm.h
index 7380f02ad..b8dba9cfd 100644
--- a/sandbox/gemmlike/bls_gemm.h
+++ b/sandbox/gemmlike/bls_gemm.h
@@ -60,16 +60,16 @@ void bls_gemm_ex
 // -- Prototype the gemm-like operation's thread entry point -------------------
 //
 
-void bls_gemm_int
+err_t bls_gemm_int
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
+       const obj_t*     alpha,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     beta,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const rntm_t*    rntm,
+             thrinfo_t* thread
      );
 
 //
diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c
index 28c5032bc..02f7458ad 100644
--- a/sandbox/gemmlike/bls_gemm_bp_var1.c
+++ b/sandbox/gemmlike/bls_gemm_bp_var1.c
@@ -49,7 +49,6 @@ typedef void (*FUNCPTR_T)
        void*   restrict beta,
        void*   restrict c, inc_t rs_c, inc_t cs_c,
        cntx_t* restrict cntx,
-       rntm_t* restrict rntm,
        thrinfo_t* restrict thread
      );
 
@@ -63,14 +62,13 @@ static FUNCPTR_T GENARRAY_PREF(ftypes,bls_,gemm_bp_var1);
 
 void bls_gemm_bp_var1
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
+       const obj_t*     alpha,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     beta,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+             thrinfo_t* thread
      )
 {
 	const num_t    dt        = bli_obj_dt( c );
@@ -114,8 +112,7 @@ void bls_gemm_bp_var1
 	  buf_b, rs_b, cs_b,
 	  buf_beta,
 	  buf_c, rs_c, cs_c,
-	  cntx,
-	  rntm,
+	  ( cntx_t* )cntx,
 	  thread
 	);
 }
@@ -140,7 +137,6 @@ void PASTECH2(bls_,ch,varname) \
        void*   restrict beta, \
        void*   restrict c, inc_t rs_c, inc_t cs_c, \
        cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
        thrinfo_t* restrict thread  \
      ) \
 { \
diff --git a/sandbox/gemmlike/bls_gemm_var.h b/sandbox/gemmlike/bls_gemm_var.h
index 7c515f8c3..0a41afde4 100644
--- a/sandbox/gemmlike/bls_gemm_var.h
+++ b/sandbox/gemmlike/bls_gemm_var.h
@@ -42,14 +42,13 @@
 \
 void PASTECH(bls_,opname) \
      ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  beta, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
+       const obj_t*     alpha, \
+       const obj_t*     a, \
+       const obj_t*     b, \
+       const obj_t*     beta, \
+       const obj_t*     c, \
+       const cntx_t*    cntx, \
+             thrinfo_t* thread  \
      );
 
 GENPROT( gemm_bp_var1 )
@@ -75,7 +74,6 @@ void PASTECH2(bls_,ch,varname) \
        void*   restrict beta, \
        void*   restrict c, inc_t rs_c, inc_t cs_c, \
        cntx_t* restrict cntx, \
-       rntm_t* restrict rntm, \
        thrinfo_t* restrict thread  \
      );
 
diff --git a/sandbox/gemmlike/bls_l3_decor.c b/sandbox/gemmlike/old/bls_l3_decor.c
similarity index 100%
rename from sandbox/gemmlike/bls_l3_decor.c
rename to sandbox/gemmlike/old/bls_l3_decor.c
diff --git a/sandbox/gemmlike/bls_l3_decor.h b/sandbox/gemmlike/old/bls_l3_decor.h
similarity index 100%
rename from sandbox/gemmlike/bls_l3_decor.h
rename to sandbox/gemmlike/old/bls_l3_decor.h

From b6735ca26b9d459d9253795dc5841ae8de9e84c9 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Fri, 6 Jan 2023 14:10:01 -0600
Subject: [PATCH 116/230] Refactor structure awareness in packm_blk_var1.c.
 (#707)

Details:
- Factored some of the structure awareness out of the loop in
  bli_packm_blk_var1(). So instead of having a single loop with
  conditionals in the body to handle various kinds of structure (and
  stored/unstored submatrix placement), we now have a conditional branch
  to handle various structure/storage scenarios with a loop in each
  section. This change was originally motivated to choose slab or round-
  robin partitioning (in the context of triangular matrices) based on
  the structure of the entire block (or panel) being packed rather than
  each micropanel individually. Previously, the code would attempt to
  limit rr to the portion of the block that intersects the diagonal and
  use slab for the remainder. However, that approach was not well thought
  out and in many situations this would lead to inferior load balancing
  when compared to using round-robin for the entire block (or panel).
  This commit has the added benefit of incurring less overhead during
  the packing process now that each of the new loops is simpler.
---
 frame/1m/packm/bli_packm_blk_var1.c | 174 ++++++++++++++--------------
 1 file changed, 87 insertions(+), 87 deletions(-)

diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c
index 05263c4b7..b8f4f945d 100644
--- a/frame/1m/packm/bli_packm_blk_var1.c
+++ b/frame/1m/packm/bli_packm_blk_var1.c
@@ -178,71 +178,98 @@ void bli_packm_blk_var1
 
 	char* p_begin = p_cast;
 
-	// Iterate over every logical micropanel in the source matrix.
-	for ( dim_t ic  = ic0,    ip  = ip0,    it  = 0; it < n_iter;
-	            ic += ic_inc, ip += ip_inc, it += 1 )
+	if ( !bli_is_triangular( strucc ) ||
+	     bli_is_stored_subpart_n( diagoffc, uploc, iter_dim, panel_len_full ) )
 	{
-		dim_t  panel_dim_i     = bli_min( panel_dim_max, iter_dim - ic );
-		dim_t  panel_dim_off_i = panel_dim_off + ic;
-
-		doff_t diagoffc_i      = diagoffc + (ip  )*diagoffc_inc;
-		char*  c_begin         = c_cast   + (ic  )*incc*dt_c_size;
-
-		inc_t  p_inc           = ps_p;
-
-		// NOTE: We MUST use round-robin work allocation (bli_packm_my_iter_rr())
-		// when packing micropanels of a triangular matrix. Hermitian/symmetric
-		// and general packing may use slab or round-robin (bli_packm_my_iter()),
-		// depending on which was selected at configure-time.
-		bool my_iter = ( bli_is_triangular( strucc ) &&
-		                 bli_intersects_diag_n( diagoffc_i, panel_dim_i,
-		                                        panel_len_full )
-		                 ? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt )
-		                 : bli_packm_my_iter   ( it, it_start, it_end, tid, nt )
-		               );
-
-		if ( bli_is_triangular( strucc ) &&
-		     bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len_full ) )
+		// This case executes if the panel is either dense, belongs
+		// to a Hermitian or symmetric matrix, which includes stored,
+		// unstored, and diagonal-intersecting panels, or belongs
+		// to a completely stored part of a triangular matrix.
+
+		// Iterate over every logical micropanel in the source matrix.
+		for ( dim_t ic  = ic0,    ip  = ip0,    it  = 0; it < n_iter;
+		            ic += ic_inc, ip += ip_inc, it += 1 )
 		{
-			// This case executes if the panel belongs to a triangular
-			// matrix AND is completely unstored (ie: zero). If the panel
-			// is unstored, we do nothing. (Notice that we don't even
-			// increment p_begin.)
+			dim_t  panel_dim_i     = bli_min( panel_dim_max, iter_dim - ic );
+			dim_t  panel_dim_off_i = panel_dim_off + ic;
 
-			continue;
+			char*  c_begin         = c_cast   + (ic  )*incc*dt_c_size;
+
+			// Hermitian/symmetric and general packing may use slab or
+			// round-robin (bli_packm_my_iter()), depending on which was
+			// selected at configure-time.
+			if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) )
+			{
+				packm_ker_cast( bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc,
+				                diagc,
+				                uploc,
+				                conjc,
+				                schema,
+				                invdiag,
+				                panel_dim_i,
+				                panel_len_full,
+				                panel_dim_max,
+				                panel_len_max,
+				                panel_dim_off_i,
+				                panel_len_off,
+				                kappa_cast,
+				                c_begin, incc, ldc,
+				                p_begin,       ldp, is_p,
+				                ( cntx_t* )cntx,
+				                params );
+			}
+
+			p_begin += ps_p*dt_p_size;
 		}
-		else if ( bli_is_triangular( strucc ) &&
-		          bli_intersects_diag_n( diagoffc_i, panel_dim_i, panel_len_full ) )
+	}
+	else
+	{
+		// This case executes if the panel belongs to a diagonal-intersecting
+		// part of a triangular matrix.
+
+		// Iterate over every logical micropanel in the source matrix.
+		for ( dim_t ic  = ic0,    ip  = ip0,    it  = 0; it < n_iter;
+		            ic += ic_inc, ip += ip_inc, it += 1 )
 		{
-			// This case executes if the panel belongs to a triangular
-			// matrix AND is diagonal-intersecting. Notice that we
-			// cannot bury the following conditional logic into
-			// packm_struc_cxk() because we need to know the value of
-			// panel_len_max_i so we can properly increment p_inc.
-
-			// Sanity check. Diagonals should not intersect the short end of
-			// a micro-panel. If they do, then somehow the constraints on
-			// cache blocksizes being a whole multiple of the register
-			// blocksizes was somehow violated.
-			if ( diagoffc_i < 0 )
+			dim_t  panel_dim_i     = bli_min( panel_dim_max, iter_dim - ic );
+			dim_t  panel_dim_off_i = panel_dim_off + ic;
+
+			doff_t diagoffc_i      = diagoffc + (ip  )*diagoffc_inc;
+			char*  c_begin         = c_cast   + (ic  )*incc*dt_c_size;
+
+			if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i,
+			                                panel_len_full ) )
+				continue;
+
+			// Sanity check. Diagonals should not intersect the short edge of
+			// a micro-panel (typically corresponding to a register blocksize).
+			// If they do, then the constraints on cache blocksizes being a
+			// whole multiple of the register blocksizes was somehow violated.
+			if ( ( diagoffc_i > -panel_dim_i &&
+			       diagoffc_i < 0 ) ||
+			     ( diagoffc_i > panel_len_full &&
+			       diagoffc_i < panel_len_full + panel_dim_i ) )
 				bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
 
-			dim_t  panel_off_i;
-			dim_t  panel_len_i;
-			dim_t  panel_len_max_i;
+			dim_t panel_off_i     = 0;
+			dim_t panel_len_i     = panel_len_full;
+			dim_t panel_len_max_i = panel_len_max;
 
-			if ( bli_is_lower( uploc ) )
+			if ( bli_intersects_diag_n( diagoffc_i, panel_dim_i, panel_len_full ) )
 			{
-				panel_off_i     = 0;
-				panel_len_i     = bli_abs( diagoffc_i ) + panel_dim_i;
-				panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max,
-				                           panel_len_max );
-			}
-			else // if ( bli_is_upper( uploc ) )
-			{
-				panel_off_i     = bli_abs( diagoffc_i );
-				panel_len_i     = panel_len_full - panel_off_i;
-				panel_len_max_i = panel_len_max  - panel_off_i;
+				if ( bli_is_lower( uploc ) )
+				{
+					panel_off_i     = 0;
+					panel_len_i     = diagoffc_i + panel_dim_i;
+					panel_len_max_i = bli_min( diagoffc_i + panel_dim_max,
+					                           panel_len_max );
+				}
+				else // if ( bli_is_upper( uploc ) )
+				{
+					panel_off_i     = diagoffc_i;
+					panel_len_i     = panel_len_full - panel_off_i;
+					panel_len_max_i = panel_len_max  - panel_off_i;
+				}
 			}
 
 			dim_t panel_len_off_i = panel_off_i + panel_len_off;
@@ -259,7 +286,9 @@ void bli_packm_blk_var1
 			// We nudge the imaginary stride up by one if it is odd.
 			is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 );
 
-			if ( my_iter )
+			// NOTE: We MUST use round-robin work allocation (bli_packm_my_iter_rr())
+			// when packing micropanels of a triangular matrix.
+			if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) )
 			{
 				packm_ker_cast( strucc,
 				                diagc,
@@ -284,37 +313,8 @@ void bli_packm_blk_var1
 			// NOTE: This value is usually LESS than ps_p because triangular
 			// matrices usually have several micro-panels that are shorter
 			// than a "full" micro-panel.
-			p_inc = is_p_use;
+			p_begin += is_p_use*dt_p_size;
 		}
-		else
-		{
-			// This case executes if the panel is either dense, or belongs
-			// to a Hermitian or symmetric matrix, which includes stored,
-			// unstored, and diagonal-intersecting panels.
-
-			if ( my_iter )
-			{
-				packm_ker_cast( bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc,
-				                diagc,
-				                uploc,
-				                conjc,
-				                schema,
-				                invdiag,
-				                panel_dim_i,
-				                panel_len_full,
-				                panel_dim_max,
-				                panel_len_max,
-				                panel_dim_off_i,
-				                panel_len_off,
-				                kappa_cast,
-				                c_begin, incc, ldc,
-				                p_begin,       ldp, is_p,
-				                ( cntx_t* )cntx,
-				                params );
-			}
-		}
-
-		p_begin += p_inc*dt_p_size;
 	}
 }
 

From 2e1ba9d13c23a06a7b6f8bd326af428f7ea68c31 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 10 Jan 2023 21:05:54 -0600
Subject: [PATCH 117/230] Tile-level partitioning in jr/ir loops (ex-trsm).
 (#695)

Details:
- Reimplemented parallelization of the JR loop in gemmt (which is
  recycled for herk, her2k, syrk, and syr2k). Previously, the
  rectangular region of the current MC x NC panel of C would be
  parallelized separately from the diagonal region of that same
  submatrix, with the rectangular portion being assigned to threads via
  slab or round-robin (rr) partitioning (as determined at configure-
  time) and the diagonal region being assigned via round-robin. This
  approach did not work well when extracting lots of parallelism from
  the JR loop and was often suboptimal even for smaller degrees of
  parallelism. This commit implements tile-level load balancing (tlb) in
  which the IR loop is effectively subjugated in service of more
  equitably dividing work in the JR loop. This approach is especially
  potent for certain situations where the diagonal region of the MC x NR
  panel of C is significant relative to the entire region. However, it
  also seems to benefit many problem sizes of other level-3 operations
  (excluding trsm, which has an inherent algorithmic dependency in the
  IR loop that prevents the application of tlb). For now, tlb is
  implemented as _var2b.c macrokernels for gemm (which forms the basis
  for gemm, hemm, and symm), gemmt (which forms the basis of herk,
  her2k, syrk, and syr2k), and trmm (which forms the basis of trmm and
  trmm3). Which function pointers (_var2() or _var2b()) are embedded in
  the control tree will depend on whether the BLIS_ENABLE_JRIR_TLB cpp
  macro is defined, which is controlled by the value passed to the
  existing --thread-part-jrir=METHOD (or -r METHOD) configure option.
  This script adds 'tlb' as a valid option alongside the previously
  supported values of 'slab' and 'rr'. ('slab' is still the default.)
  Thanks to Leick Robinson for abstractly inspiring this work, and to
  Minh Quan Ho for inquiring (in PR #562, and before that in Issue #437)
  about the possibility of improved load balance in macrokernel loops,
  and even prototyping what it might look like, long before I fully
  understood the problem.
- In bli_thread_range_weighted_sub(), tweaked the way we compute the
  area of the current MC x NC trapezoidal panel of C by better taking
  into account the microtile structure along the diagonal. Previously,
  it was an underestimate, as it assumed MR = NR = 1 (that is, it
  assumed that the microtile column of C that overlapped with microtiles
  exactly coincided with the diagonal). Now, we only assume MR = NR.
  This is still a slight underestimate when MR != NR, so the additional
  area is scaled by 1.5 in a hackish attempt to compensate for this, as
  well as other additional effects that are difficult to model (such as
  the increased cost of writing to temporary tiles before finally
  updating C). The net effect of this better estimation of the
  trapezoidal area should be (on average) slightly larger regions
  assigned to threads that have little or no overlap with the diagonal
  region (and correspondingly slightly smaller regions in the diagonal
  region), which we expect will lead to slightly better load balancing
  in most situations.
- Spun off the contents of bli_thread.[ch] that relate to computing
  thread ranges into one of three source/header file pairs:
  - bli_thread_range.[ch], which define functions that are not specific
    to the jr/ir loops;
  - bli_thread_range_slab_rr.[ch], which define functions that implement
    slab or round-robin partitioning for the jr/ir loops;
  - bli_thread_range_tlb.[ch], which define functions that implement
    tlb for the jr/ir loops.
- Fixed the computation of a_next in the last iteration of the IR loop
  in bli_gemmt_l_ker_var2(). Previously, it always "wrapped" back around
  to the first micropanel of the current MC x KC packed block of A.
  However, this is almost never actually the micropanel that is used
  next. A new macro, bli_gemmt_l_wrap_a_upanel(), computes a_next
  correctly, with a similarly named bli_gemmt_u_wrap_a_upanel() for use
  in the upper-stored case (which *does* actually always choose the
  first micropanel of A as its a_next at the end of the IR loop).
- Removed adjustments for a_next/b_next (a2/b2) for the diagonal-
  intersecting case of gemmt_l_ker_var2() and the above-diagonal case
  of gemmt_u_ker_var2() since these cases will only coincide with the
  last iteration of the IR loop in very small problems.
- Defined bli_is_last_iter_l() and bli_is_last_iter_u(), the latter of
  which explicitly considers whether the current microtile is the last
  tile that intersects the diagonal. (The former does the same, but the
  computation coincides with the original bli_is_last_iter().) These
  functions are now used in gemmt to test when a_next (or a2) should
  "wrap" (as discussed above). Also defined bli_is_last_iter_tlb_l()
  and bli_is_last_iter_tlb_u(), which are similar to the aforementioned
  functions but are used when employing tlb in gemmt.
- Redefined macros in bli_packm_thrinfo.h, which test whether an
  iteration of work is assigned to a thread, as static inline functions
  in bli_param_macro_defs.h (and then deleted bli_packm_thrinfo.h).
  In the process of redefining these macros, I also renamed them from
  bli_packm_my_iter_rr/sl() to bli_is_my_iter_rr/sl().
- Renamed
    bli_thread_range_jrir_rr() -> bli_thread_range_rr()
    bli_thread_range_jrir_sl() -> bli_thread_range_sl()
    bli_thread_range_jrir()    -> bli_thread_range_slrr()
- Renamed
    bli_is_last_iter() -> bli_is_last_iter_slrr()
- Defined
    bli_info_get_thread_jrir_tlb()
  and renamed:
  - bli_info_get_thread_part_jrir_slab() ->
    bli_info_get_thread_jrir_slab()
  - bli_info_get_thread_part_jrir_rr() ->
    bli_info_get_thread_jrir_rr()
- Modified bli_rntm_set_ways_for_op() to redirect IR loop parallelism
  into the JR loop when tlb is enabled for non-trsm level-3 operations.
- Added a sanity check to prevent bli_prune_unref_mparts() from being
  used on packed objects. This prohibition is necessary because the
  current implementation does not take into account the atomicity of
  packed micropanel widths relative to the diagonal of structured
  matrices. That is, the function prunes greedily without regard to
  whether doing so would prune off part of a micropanel *which has
  already been packed* and assigned to a thread for inclusion in the
  computation.
- Further restricted early returns in bli_prune_unref_mparts() to
  situations where the primary matrix is not only of general structure
  but also dense (in terms of its uplo_t value). The addition of the
  matrix's dense-ness to the conditional is required because gemmt is
  somewhat unusual in that its C matrix has general structure but is
  marked as lower- or upper-stored via its uplo_t. By only checking
  for general structure, attempts to prune gemmt C matrices would
  incorrectly result in early returns, even though that operation
  effectively treats the matrix as symmetric (and stored in only one
  triangle).
- Fixed a latent bug in bli_thread_range_rr() wherein incorrect ranges
  were computed when 1 < bf. Thankfully, this bug was not yet
  manifesting since all current invocations used bf == 1.
- Fixed a latent bug in some unexercised code in bli_?gemmt_l_ker_var2()
  that would perform incorrect pruning of unreferenced regions above
  where the diagonal of a lower-stored matrix intersects the right edge.
  Thankfully, the bug was not harming anything since those unreferenced
  regions were being pruned prior to the macrokernel.
- Rewrote slab/rr-based gemmt macrokernels so that they no longer carved
  C into rectangular and diagonal regions prior to parallelizing each
  separately. The new macrokernels use a unified loop structure where
  quadratic (slab) partitioning is used.
- Updated all level-3 macrokernels to have a more uniform coding style,
  such as wrt combining variable declarations with initializations as
  well as the use of const.
- Updated bls_l3_packm_var[123].c to use bli_thrinfo_n_way() and
  bli_thrinfo_work_id() instead of bli_thrinfo_num_threads() and
  bli_thrinfo_thread_id(), respectively. This change probably should
  have been included in aeb5f0c.
- Removed old prototypes in bli_gemmt_var.h and bli_trmm_var.h that
  corresponded to functions that were removed in aeb5f0c.
- Other very minor cleanups.
- Comment updates.
---
 build/bli_config.h.in                         |    4 +
 configure                                     |   61 +-
 frame/1m/packm/bli_packm.h                    |    1 -
 frame/1m/packm/bli_packm_blk_var1.c           |   16 +-
 frame/3/bli_l3_sup_packm_var.c                |   16 +-
 frame/3/bli_l3_sup_var12.c                    |    4 +-
 frame/3/bli_l3_thrinfo.h                      |   12 +-
 frame/3/gemm/bli_gemm_cntl.c                  |   23 +-
 frame/3/gemm/bli_gemm_ker_var2.c              |   93 +-
 frame/3/gemm/bli_gemm_ker_var2b.c             |  379 ++++
 frame/3/gemm/bli_gemm_var.h                   |    3 +-
 frame/3/gemmt/attic/bli_gemmt_l_ker_var2b.c   |  429 +++++
 frame/3/gemmt/attic/bli_gemmt_u_ker_var2b.c   |  418 ++++
 frame/3/gemmt/bli_gemmt_l_ker_var2.c          |  274 +--
 frame/3/gemmt/bli_gemmt_l_ker_var2b.c         |  387 ++++
 frame/3/gemmt/bli_gemmt_u_ker_var2.c          |  273 +--
 frame/3/gemmt/bli_gemmt_u_ker_var2b.c         |  386 ++++
 frame/3/gemmt/bli_gemmt_var.h                 |   45 +-
 frame/3/gemmt/bli_gemmt_x_ker_var2.c          |   14 +-
 frame/3/gemmt/bli_gemmt_x_ker_var2b.c         |   73 +
 .../3/gemmt/other/bli_gemmt_l_ker_var2.c.prev |  507 +++++
 .../other/bli_gemmt_l_ker_var2b.c.before      |  427 +++++
 .../3/gemmt/other/bli_gemmt_u_ker_var2.c.prev |  510 +++++
 .../other/bli_gemmt_u_ker_var2b.c.before      |  415 ++++
 frame/3/trmm/bli_trmm_ll_ker_var2.c           |   99 +-
 frame/3/trmm/bli_trmm_ll_ker_var2b.c          |  365 ++++
 frame/3/trmm/bli_trmm_lu_ker_var2.c           |   99 +-
 frame/3/trmm/bli_trmm_lu_ker_var2b.c          |  366 ++++
 frame/3/trmm/bli_trmm_rl_ker_var2.c           |   75 +-
 frame/3/trmm/bli_trmm_rl_ker_var2b.c          |  392 ++++
 frame/3/trmm/bli_trmm_ru_ker_var2.c           |   77 +-
 frame/3/trmm/bli_trmm_ru_ker_var2b.c          |  390 ++++
 frame/3/trmm/bli_trmm_var.h                   |   53 +-
 frame/3/trmm/bli_trmm_xx_ker_var2.c           |   14 +-
 frame/3/trmm/bli_trmm_xx_ker_var2b.c          |   87 +
 .../3/trmm/other/bli_trmm_rl_ker_var2.c.prev  |  371 ++++
 .../trmm/other/bli_trmm_rl_ker_var2.c.unified |  324 ++++
 frame/3/trmm/other/bli_trmm_ru_ker_var2.c     |    2 +-
 frame/3/trsm/bli_trsm_ll_ker_var2.c           |   65 +-
 frame/3/trsm/bli_trsm_lu_ker_var2.c           |   69 +-
 frame/3/trsm/bli_trsm_rl_ker_var2.c           |  143 +-
 frame/3/trsm/bli_trsm_ru_ker_var2.c           |   12 +-
 frame/3/trsm/bli_trsm_var.h                   |    2 +-
 frame/3/trsm/bli_trsm_xx_ker_var2.c           |   14 +-
 frame/base/bli_info.c                         |   12 +-
 frame/base/bli_info.h                         |    5 +-
 frame/base/bli_prune.c                        |   39 +-
 frame/base/bli_rntm.c                         |   40 +-
 frame/include/bli_config_macro_defs.h         |   10 +
 frame/include/bli_kernel_macro_defs.h         |    2 +
 frame/include/bli_param_macro_defs.h          |   51 +-
 frame/include/blis.h                          |    4 +
 frame/thread/bli_thread.c                     |  901 ---------
 frame/thread/bli_thread.h                     |  180 +-
 frame/thread/bli_thread_range.c               | 1121 +++++++++++
 frame/thread/bli_thread_range.h               |  128 ++
 frame/thread/bli_thread_range_slab_rr.c       |  134 ++
 frame/thread/bli_thread_range_slab_rr.h       |  116 ++
 frame/thread/bli_thread_range_tlb.c           | 1699 +++++++++++++++++
 frame/thread/bli_thread_range_tlb.h           |  192 ++
 frame/thread/old/bli_thread_range_snake.c     |  120 ++
 .../old/bli_thread_range_snake.h}             |   46 +-
 sandbox/gemmlike/bls_gemm_bp_var1.c           |    4 +-
 sandbox/gemmlike/bls_l3_packm_var1.c          |    8 +-
 sandbox/gemmlike/bls_l3_packm_var2.c          |    8 +-
 testsuite/src/test_libblis.c                  |    7 +-
 testsuite/src/test_trmm.c                     |    3 +
 67 files changed, 10597 insertions(+), 2022 deletions(-)
 create mode 100644 frame/3/gemm/bli_gemm_ker_var2b.c
 create mode 100644 frame/3/gemmt/attic/bli_gemmt_l_ker_var2b.c
 create mode 100644 frame/3/gemmt/attic/bli_gemmt_u_ker_var2b.c
 create mode 100644 frame/3/gemmt/bli_gemmt_l_ker_var2b.c
 create mode 100644 frame/3/gemmt/bli_gemmt_u_ker_var2b.c
 create mode 100644 frame/3/gemmt/bli_gemmt_x_ker_var2b.c
 create mode 100644 frame/3/gemmt/other/bli_gemmt_l_ker_var2.c.prev
 create mode 100644 frame/3/gemmt/other/bli_gemmt_l_ker_var2b.c.before
 create mode 100644 frame/3/gemmt/other/bli_gemmt_u_ker_var2.c.prev
 create mode 100644 frame/3/gemmt/other/bli_gemmt_u_ker_var2b.c.before
 create mode 100644 frame/3/trmm/bli_trmm_ll_ker_var2b.c
 create mode 100644 frame/3/trmm/bli_trmm_lu_ker_var2b.c
 create mode 100644 frame/3/trmm/bli_trmm_rl_ker_var2b.c
 create mode 100644 frame/3/trmm/bli_trmm_ru_ker_var2b.c
 create mode 100644 frame/3/trmm/bli_trmm_xx_ker_var2b.c
 create mode 100644 frame/3/trmm/other/bli_trmm_rl_ker_var2.c.prev
 create mode 100644 frame/3/trmm/other/bli_trmm_rl_ker_var2.c.unified
 create mode 100644 frame/thread/bli_thread_range.c
 create mode 100644 frame/thread/bli_thread_range.h
 create mode 100644 frame/thread/bli_thread_range_slab_rr.c
 create mode 100644 frame/thread/bli_thread_range_slab_rr.h
 create mode 100644 frame/thread/bli_thread_range_tlb.c
 create mode 100644 frame/thread/bli_thread_range_tlb.h
 create mode 100644 frame/thread/old/bli_thread_range_snake.c
 rename frame/{1m/packm/bli_packm_thrinfo.h => thread/old/bli_thread_range_snake.h} (70%)

diff --git a/build/bli_config.h.in b/build/bli_config.h.in
index 41e76d214..7dc67059f 100644
--- a/build/bli_config.h.in
+++ b/build/bli_config.h.in
@@ -80,6 +80,10 @@
 #define BLIS_ENABLE_JRIR_RR
 #endif
 
+#if @enable_jrir_tlb@
+#define BLIS_ENABLE_JRIR_TLB
+#endif
+
 #if @enable_pba_pools@
 #define BLIS_ENABLE_PBA_POOLS
 #else
diff --git a/configure b/configure
index 286a66123..06201b4fa 100755
--- a/configure
+++ b/configure
@@ -340,16 +340,36 @@ print_usage()
 	echo " "
 	echo "   -r METHOD, --thread-part-jrir=METHOD"
 	echo " "
-	echo "                 Request a method of assigning micropanels to threads in"
-	echo "                 the JR and IR loops. Valid values for METHOD are 'slab'"
-	echo "                 and 'rr'. Using 'slab' assigns (as much as possible)"
-	echo "                 contiguous regions of micropanels to each thread while"
-	echo "                 using 'rr' assigns micropanels to threads in a round-"
-	echo "                 robin fashion. The chosen method also applies during"
-	echo "                 the packing of A and B. The default method is 'slab'."
-	echo "                 NOTE: Specifying this option constitutes a request,"
-	echo "                 which may be ignored in select situations if the"
-	echo "                 implementation has a good reason to do so."
+	echo "                 Select a strategy for partitioning computation in JR and"
+	echo "                 IR loops and assigning that computation to threads. Valid"
+	echo "                 values for METHOD are 'rr', 'slab', and 'tlb':"
+	echo "                  'rr':   Assign the computation associated with whole"
+	echo "                          columns of microtiles to threads in a round-"
+	echo "                          robin fashion. When selected, round-robin"
+	echo "                          assignment is also employed during packing."
+	echo "                  'slab': Partition the computation into N contiguous"
+	echo "                          regions, where each region contains a whole"
+	echo "                          number of microtile columns, and assign one"
+	echo "                          region to each thread. For some operations, the"
+	echo "                          number of microtile columns contained within a"
+	echo "                          given region may differ from that of other"
+	echo "                          regions, depending on how much work is implied"
+	echo "                          by each region. When selected, slab assignment"
+	echo "                          is also employed during packing."
+	echo "                  'tlb':  Tile-level load balancing is similar to slab,"
+	echo "                          except that regions will be divided at a more"
+	echo "                          granular level (individual microtiles instead"
+	echo "                          of whole columns of microtiles) to ensure more"
+	echo "                          equitable assignment of work to threads. When"
+	echo "                          selected, tlb will only be employed for level-3"
+	echo "                          operations except trsm; due to practical and"
+	echo "                          algorithmic limitations, slab partitioning will"
+	echo "                          be used instead during packing and for trsm."
+	echo "                 The default strategy is 'slab'. NOTE: Specifying this"
+	echo "                 option constitutes a request, which may be ignored in"
+	echo "                 select situations if implementation has a good reason to"
+	echo "                 do so. (See description of 'tlb' above for an example of"
+	echo "                 this.)"
 	echo " "
 	echo "   --disable-trsm-preinversion, --enable-trsm-preinversion"
 	echo " "
@@ -3731,16 +3751,20 @@ main()
 
 	# Check the method of assigning micropanels to threads in the JR and IR
 	# loops.
-	enable_jrir_slab_01=0
 	enable_jrir_rr_01=0
-	if [ "x${thread_part_jrir}" = "xslab" ]; then
-		echo "${script_name}: requesting slab threading in jr and ir loops."
-		enable_jrir_slab_01=1
-	elif [ "x${thread_part_jrir}" = "xrr" ]; then
-		echo "${script_name}: requesting round-robin threading in jr and ir loops."
+	enable_jrir_slab_01=0
+	enable_jrir_tlb_01=0
+	if   [ "x${thread_part_jrir}" = "xrr" ]; then
+		echo "${script_name}: requesting round-robin (rr) work partitioning in jr and/or ir loops."
 		enable_jrir_rr_01=1
+	elif [ "x${thread_part_jrir}" = "xslab" ]; then
+		echo "${script_name}: requesting slab work partitioning in jr and/or ir loops."
+		enable_jrir_slab_01=1
+	elif [ "x${thread_part_jrir}" = "xtlb" ]; then
+		echo "${script_name}: requesting tile-level load balancing (tlb) in unified jr+ir loop."
+		enable_jrir_tlb_01=1
 	else
-		echo "${script_name}: *** Unsupported method of thread partitioning in jr and ir loops: ${thread_part_jrir}."
+		echo "${script_name}: *** Unsupported method of work partitioning in jr/ir loops: ${thread_part_jrir}."
 		exit 1
 	fi
 
@@ -4177,8 +4201,9 @@ main()
 		| sed   -e "s/@enable_pthreads_as_def@/${enable_pthreads_as_def_01}/g" \
 		| sed   -e "s/@enable_hpx@/${enable_hpx_01}/g" \
 		| sed   -e "s/@enable_hpx_as_def@/${enable_hpx_as_def_01}/g" \
-		| sed   -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \
 		| sed   -e "s/@enable_jrir_rr@/${enable_jrir_rr_01}/g" \
+		| sed   -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \
+		| sed   -e "s/@enable_jrir_tlb@/${enable_jrir_tlb_01}/g" \
 		| sed   -e "s/@enable_pba_pools@/${enable_pba_pools_01}/g" \
 		| sed   -e "s/@enable_sba_pools@/${enable_sba_pools_01}/g" \
 		| sed   -e "s/@enable_mem_tracing@/${enable_mem_tracing_01}/g" \
diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h
index 80878fba0..7d73bf903 100644
--- a/frame/1m/packm/bli_packm.h
+++ b/frame/1m/packm/bli_packm.h
@@ -39,7 +39,6 @@
 #include "bli_packm_init.h"
 #include "bli_packm_int.h"
 #include "bli_packm_scalar.h"
-#include "bli_packm_thrinfo.h"
 
 #include "bli_packm_part.h"
 
diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c
index b8f4f945d..561988e7f 100644
--- a/frame/1m/packm/bli_packm_blk_var1.c
+++ b/frame/1m/packm/bli_packm_blk_var1.c
@@ -170,11 +170,11 @@ void bli_packm_blk_var1
 	const dim_t tid = bli_thrinfo_work_id( thread );
 
 	// Determine the thread range and increment using the current thread's
-	// packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
+	// packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr()
 	// will depend on whether slab or round-robin partitioning was requested
 	// at configure-time.
 	dim_t it_start, it_end, it_inc;
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc );
+	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc );
 
 	char* p_begin = p_cast;
 
@@ -195,10 +195,10 @@ void bli_packm_blk_var1
 
 			char*  c_begin         = c_cast   + (ic  )*incc*dt_c_size;
 
-			// Hermitian/symmetric and general packing may use slab or
-			// round-robin (bli_packm_my_iter()), depending on which was
-			// selected at configure-time.
-			if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) )
+			// Hermitian/symmetric and general packing may use slab or round-
+			// robin (bli_is_my_iter()), depending on which was selected at
+			// configure-time.
+			if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) )
 			{
 				packm_ker_cast( bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc,
 				                diagc,
@@ -286,9 +286,9 @@ void bli_packm_blk_var1
 			// We nudge the imaginary stride up by one if it is odd.
 			is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 );
 
-			// NOTE: We MUST use round-robin work allocation (bli_packm_my_iter_rr())
+			// NOTE: We MUST use round-robin work allocation (bli_is_my_iter_rr())
 			// when packing micropanels of a triangular matrix.
-			if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) )
+			if ( bli_is_my_iter_rr( it, tid, nt ) )
 			{
 				packm_ker_cast( strucc,
 				                diagc,
diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c
index e47f65aea..67b33f407 100644
--- a/frame/3/bli_l3_sup_packm_var.c
+++ b/frame/3/bli_l3_sup_packm_var.c
@@ -155,10 +155,10 @@ void PASTEMAC(ch,varname) \
 	dim_t it_start, it_end, it_inc; \
 \
 	/* Determine the thread range and increment using the current thread's
-	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
+	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr()
 	   will depend on whether slab or round-robin partitioning was requested
 	   at configure-time. */ \
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
+	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
 \
 	/* Iterate over every logical micropanel in the source matrix. */ \
 	for ( ic  = ic0,    it  = 0; it < n_iter; \
@@ -175,9 +175,9 @@ void PASTEMAC(ch,varname) \
 			panel_len_i     = panel_len_full; \
 			panel_len_max_i = panel_len_max; \
 \
-			/* The definition of bli_packm_my_iter() will depend on whether slab
+			/* The definition of bli_is_my_iter() will depend on whether slab
 			   or round-robin partitioning was requested at configure-time. */ \
-			if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
+			if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \
 			{ \
 				f \
 				( \
@@ -398,10 +398,10 @@ void PASTEMAC(ch,varname) \
 	dim_t it_start, it_end, it_inc; \
 \
 	/* Determine the thread range and increment using the current thread's
-	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
+	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr()
 	   will depend on whether slab or round-robin partitioning was requested
 	   at configure-time. */ \
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
+	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
 \
 	/* Iterate over every logical micropanel in the source matrix. */ \
 	for ( it = 0; it < n_iter; it += 1 ) \
@@ -412,9 +412,9 @@ void PASTEMAC(ch,varname) \
 		ctype* p_use = p_begin; \
 \
 		{ \
-			/* The definition of bli_packm_my_iter() will depend on whether slab
+			/* The definition of bli_is_my_iter() will depend on whether slab
 			   or round-robin partitioning was requested at configure-time. */ \
-			if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
+			if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \
 			{ \
 				PASTEMAC2(ch,scal2v,BLIS_TAPI_EX_SUF) \
 				( \
diff --git a/frame/3/bli_l3_sup_var12.c b/frame/3/bli_l3_sup_var12.c
index d65482243..4162c3d33 100644
--- a/frame/3/bli_l3_sup_var12.c
+++ b/frame/3/bli_l3_sup_var12.c
@@ -357,11 +357,11 @@ void PASTEMAC(ch,varname) \
 						   object. */ \
 /*
 						ctype* a2 = bli_gemm_get_next_a_upanel( a_ir, irstep_a, ir_inc ); \
-						if ( bli_is_last_iter( i, ir_iter, 0, 1 ) ) \
+						if ( bli_is_last_iter_slrr( i, ir_iter, 0, 1 ) ) \
 						{ \
 							a2 = a_00; \
 							b2 = bli_gemm_get_next_b_upanel( b_jr, jrstep_b, jr_inc ); \
-							if ( bli_is_last_iter( j, jr_iter, 0, 1 ) ) \
+							if ( bli_is_last_iter_slrr( j, jr_iter, 0, 1 ) ) \
 								b2 = b_00; \
 						} \
 \
diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h
index b1290df50..2ea7a3fc2 100644
--- a/frame/3/bli_l3_thrinfo.h
+++ b/frame/3/bli_l3_thrinfo.h
@@ -39,22 +39,22 @@
 
 // gemm
 
-// NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to
-// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
 #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
 #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
 
 // gemmt
 
-// NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to
-// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
 #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
 #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
 
+// NOTE: Here, we assume NO parallelism in the IR loop.
+#define bli_gemmt_l_wrap_a_upanel( a0, step, doff_j, mr, nr ) \
+        ( a0 + ( (-doff_j + 1*nr) / mr ) * step )
+#define bli_gemmt_u_wrap_a_upanel( a0, step, doff_j, mr, nr ) \
+        ( a0 )
+
 // trmm
 
-// NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to
-// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
 #define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
 #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
 
diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c
index bd8d97d13..b9c231cf7 100644
--- a/frame/3/gemm/bli_gemm_cntl.c
+++ b/frame/3/gemm/bli_gemm_cntl.c
@@ -61,10 +61,25 @@ cntl_t* bli_gemmbp_cntl_create
 	void_fp macro_kernel_fp;
 
 	// Choose the default macrokernel based on the operation family...
-	if      ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2;
-	else if ( family == BLIS_GEMMT ) macro_kernel_fp = bli_gemmt_x_ker_var2;
-	else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2;
-	else /* should never execute */ macro_kernel_fp = NULL;
+	if      ( family == BLIS_GEMM )  macro_kernel_fp =
+	                                   #ifdef BLIS_ENABLE_JRIR_TLB
+	                                   bli_gemm_ker_var2b;
+	                                   #else // ifdef ( _SLAB || _RR )
+	                                   bli_gemm_ker_var2;
+	                                   #endif
+	else if ( family == BLIS_GEMMT ) macro_kernel_fp =
+	                                   #ifdef BLIS_ENABLE_JRIR_TLB
+	                                   bli_gemmt_x_ker_var2b;
+	                                   #else // ifdef ( _SLAB || _RR )
+	                                   bli_gemmt_x_ker_var2;
+	                                   #endif
+	else if ( family == BLIS_TRMM )  macro_kernel_fp =
+	                                   #ifdef BLIS_ENABLE_JRIR_TLB
+	                                   bli_trmm_xx_ker_var2b;
+	                                   #else // ifdef ( _SLAB || _RR )
+	                                   bli_trmm_xx_ker_var2;
+	                                   #endif
+	else /* should never execute */  macro_kernel_fp = NULL;
 
 	// ...unless a non-NULL kernel function pointer is passed in, in which
 	// case we use that instead.
diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c
index d59695081..3e862e6c5 100644
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -47,7 +47,7 @@ typedef void (*xpbys_mxn_vft)
 #undef GENTFUNC2
 #define GENTFUNC2(ctypex,ctypey,chx,chy,op) \
 \
-void PASTEMAC2(chx,chy,op) \
+BLIS_INLINE void PASTEMAC2(chx,chy,op) \
     ( \
       dim_t m, \
       dim_t n, \
@@ -77,31 +77,31 @@ static xpbys_mxn_vft GENARRAY2_ALL(xpbys_mxn, xpbys_mxn_fn);
 
 void bli_gemm_ker_var2
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
              thrinfo_t* thread_par
      )
 {
 	      num_t  dt_exec   = bli_obj_exec_dt( c );
 	      num_t  dt_c      = bli_obj_dt( c );
 
-	      pack_t schema_a  = bli_obj_pack_schema( a );
-	      pack_t schema_b  = bli_obj_pack_schema( b );
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
 
 	      dim_t  m         = bli_obj_length( c );
 	      dim_t  n         = bli_obj_width( c );
 	      dim_t  k         = bli_obj_width( a );
 
 	const char*  a_cast    = bli_obj_buffer_at_off( a );
-	      inc_t  is_a      = bli_obj_imag_stride( a );
+	const inc_t  is_a      = bli_obj_imag_stride( a );
 	      dim_t  pd_a      = bli_obj_panel_dim( a );
 	      inc_t  ps_a      = bli_obj_panel_stride( a );
 
 	const char*  b_cast    = bli_obj_buffer_at_off( b );
-	      inc_t  is_b      = bli_obj_imag_stride( b );
+	const inc_t  is_b      = bli_obj_imag_stride( b );
 	      dim_t  pd_b      = bli_obj_panel_dim( b );
 	      inc_t  ps_b      = bli_obj_panel_stride( b );
 
@@ -116,8 +116,7 @@ void bli_gemm_ker_var2
 	// NOTE: We know that the internal scalars of A and B are already of the
 	// target datatypes because the necessary typecasting would have already
 	// taken place during bli_packm_init().
-	obj_t     scalar_a;
-	obj_t     scalar_b;
+	obj_t scalar_a, scalar_b;
 	bli_obj_scalar_detach( a, &scalar_a );
 	bli_obj_scalar_detach( b, &scalar_b );
 	bli_mulsc( &scalar_a, &scalar_b );
@@ -217,22 +216,19 @@ void bli_gemm_ker_var2
 
 	// Compute number of primary and leftover components of the m and n
 	// dimensions.
-	dim_t n_iter = n / NR;
-	dim_t n_left = n % NR;
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+	const dim_t n_left = n % NR;
 
-	dim_t m_iter = m / MR;
-	dim_t m_left = m % MR;
-
-	if ( n_left ) ++n_iter;
-	if ( m_left ) ++m_iter;
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_size;
 
-	inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_size;
 
-	inc_t rstep_c = rs_c * MR * dt_c_size;
-	inc_t cstep_c = cs_c * NR * dt_c_size;
+	const inc_t rstep_c = rs_c * MR * dt_c_size;
+	const inc_t cstep_c = cs_c * NR * dt_c_size;
 
 	auxinfo_t aux;
 
@@ -255,20 +251,19 @@ void bli_gemm_ker_var2
 	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
 
 	// Query the number of threads and thread ids for each loop.
-	dim_t jr_nt  = bli_thrinfo_n_way( thread );
-	dim_t jr_tid = bli_thrinfo_work_id( thread );
-	dim_t ir_nt  = bli_thrinfo_n_way( caucus );
-	dim_t ir_tid = bli_thrinfo_work_id( caucus );
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	const dim_t ir_tid = bli_thrinfo_work_id( caucus );
 
-	dim_t jr_start, jr_end;
-	dim_t ir_start, ir_end;
-	dim_t jr_inc,   ir_inc;
+	dim_t jr_start, jr_end, jr_inc;
+	dim_t ir_start, ir_end, ir_inc;
 
 	// Determine the thread range and increment for the 2nd and 1st loops.
-	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// NOTE: The definition of bli_thread_range_slrr() will depend on whether
 	// slab or round-robin partitioning was requested at configure-time.
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
-	bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
+	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
 
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
@@ -276,7 +271,9 @@ void bli_gemm_ker_var2
 		const char* b1 = b_cast + j * cstep_b;
 		      char* c1 = c_cast + j * cstep_c;
 
-		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+		// Compute the current microtile's width.
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
 
 		// Initialize our next panel of B to be the current panel of B.
 		const char* b2 = b1;
@@ -287,15 +284,17 @@ void bli_gemm_ker_var2
 			const char* a1  = a_cast + i * rstep_a;
 			      char* c11 = c1     + i * rstep_c;
 
-			const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+			// Compute the current microtile's length.
+			const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+			                      ? MR : m_left );
 
 			// Compute the addresses of the next panels of A and B.
 			const char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc );
-			if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) )
+			if ( bli_is_last_iter_slrr( i, ir_end, ir_tid, ir_nt ) )
 			{
 				a2 = a_cast;
 				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc );
-				if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) )
+				if ( bli_is_last_iter_slrr( j, jr_end, jr_tid, jr_nt ) )
 					b2 = b_cast;
 			}
 
@@ -342,22 +341,20 @@ void bli_gemm_ker_var2
 				  ( cntx_t* )cntx
 				);
 
-				// Accumulate to C with type-casting.
+				// Accumulate to C with typecasting.
 				xpbys_mxn[ dt_exec ][ dt_c ]
 				(
-				    m_cur, n_cur,
-				    &ct, rs_ct, cs_ct,
-				    ( void* )beta_cast,
-				    c11, rs_c, cs_c
+				  m_cur, n_cur,
+				  &ct, rs_ct, cs_ct,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c
 				);
 			}
 		}
 	}
-
-/*
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" );
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" );
-*/
 }
 
+//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" );
+//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );
+//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" );
+
diff --git a/frame/3/gemm/bli_gemm_ker_var2b.c b/frame/3/gemm/bli_gemm_ker_var2b.c
new file mode 100644
index 000000000..50375708a
--- /dev/null
+++ b/frame/3/gemm/bli_gemm_ker_var2b.c
@@ -0,0 +1,379 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+typedef void (*xpbys_mxn_vft)
+    (
+      dim_t m,
+      dim_t n,
+      void* x, inc_t rs_x, inc_t cs_x,
+      void* b,
+      void* y, inc_t rs_y, inc_t cs_y
+    );
+
+#undef GENTFUNC2
+#define GENTFUNC2(ctypex,ctypey,chx,chy,op) \
+\
+BLIS_INLINE void PASTEMAC2(chx,chy,op) \
+    ( \
+      dim_t m, \
+      dim_t n, \
+      void* x, inc_t rs_x, inc_t cs_x, \
+      void* b, \
+      void* y, inc_t rs_y, inc_t cs_y \
+    ) \
+{ \
+	ctypex* restrict x_cast = x; \
+	ctypey* restrict b_cast = b; \
+	ctypey* restrict y_cast = y; \
+\
+	PASTEMAC3(chx,chy,chy,xpbys_mxn) \
+	( \
+	  m, n, \
+	  x_cast, rs_x, cs_x, \
+	  b_cast, \
+	  y_cast, rs_y,  cs_y \
+	); \
+}
+
+INSERT_GENTFUNC2_BASIC0(xpbys_mxnb_fn);
+INSERT_GENTFUNC2_MIXDP0(xpbys_mxnb_fn);
+
+static xpbys_mxn_vft GENARRAY2_ALL(xpbys_mxn, xpbys_mxnb_fn);
+
+
+void bli_gemm_ker_var2b
+     (
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
+     )
+{
+	      num_t  dt_exec   = bli_obj_exec_dt( c );
+	      num_t  dt_c      = bli_obj_dt( c );
+
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
+
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
+
+	const char*  a_cast    = bli_obj_buffer_at_off( a );
+	const inc_t  is_a      = bli_obj_imag_stride( a );
+	      dim_t  pd_a      = bli_obj_panel_dim( a );
+	      inc_t  ps_a      = bli_obj_panel_stride( a );
+
+	const char*  b_cast    = bli_obj_buffer_at_off( b );
+	const inc_t  is_b      = bli_obj_imag_stride( b );
+	      dim_t  pd_b      = bli_obj_panel_dim( b );
+	      inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      char*  c_cast    = bli_obj_buffer_at_off( c );
+	      inc_t  rs_c      = bli_obj_row_stride( c );
+	      inc_t  cs_c      = bli_obj_col_stride( c );
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Detach and multiply the scalars attached to A and B.
+	// NOTE: We know that the internal scalars of A and B are already of the
+	// target datatypes because the necessary typecasting would have already
+	// taken place during bli_packm_init().
+	obj_t scalar_a, scalar_b;
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	// NOTE: We know that scalar_b is of type dt_exec due to the above code
+	// that casts the scalars of A and B to dt_exec via scalar_a and scalar_b,
+	// and we know that the internal scalar in C is already of the type dt_c
+	// due to the casting in the implementation of bli_obj_scalar_attach().
+	const char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b );
+	const char* beta_cast  = bli_obj_internal_scalar_buffer( c );
+
+	// If 1m is being employed on a column- or row-stored matrix with a
+	// real-valued beta, we can use the real domain macro-kernel, which
+	// eliminates a little overhead associated with the 1m virtual
+	// micro-kernel.
+	// Only employ this optimization if the storage datatype of C is
+	// equal to the execution/computation datatype.
+#if 1
+	if ( bli_cntx_method( cntx ) == BLIS_1M )
+	{
+		bli_gemm_ind_recast_1m_params
+		(
+		  &dt_exec,
+		  &dt_c,
+		  schema_a,
+		  c,
+		  &m, &n, &k,
+		  &pd_a, &ps_a,
+		  &pd_b, &ps_b,
+		  &rs_c, &cs_c,
+		  cntx
+		);
+	}
+#endif
+
+#ifdef BLIS_ENABLE_GEMM_MD
+	// Tweak parameters in select mixed domain cases (rcc, crc, ccr).
+	if ( bli_cntx_method( cntx ) == BLIS_NAT )
+	{
+		bli_gemm_md_ker_var2_recast
+		(
+		  &dt_exec,
+		  bli_obj_dt( a ),
+		  bli_obj_dt( b ),
+		  &dt_c,
+		  &m, &n, &k,
+		  &pd_a, &ps_a,
+		  &pd_b, &ps_b,
+		  c,
+		  &rs_c, &cs_c
+		);
+	}
+#endif
+
+	const siz_t dt_size   = bli_dt_size( dt_exec );
+	const siz_t dt_c_size = bli_dt_size( dt_c );
+
+	// Alias some constants to simpler names.
+	const dim_t MR = pd_a;
+	const dim_t NR = pd_b;
+
+	// Query the context for the micro-kernel address and cast it to its
+	// function pointer type.
+	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
+
+	// Query the params field from the obj_t. If it is non-NULL, grab the ukr
+	// field of the params struct. If that function pointer is non-NULL, use it
+	// as our microkernel instead of the default microkernel queried from the
+	// cntx above.
+	const gemm_ker_params_t* params = bli_obj_ker_params( c );
+	gemm_ukr_vft user_ukr = params ? params->ukr : NULL;
+	if ( user_ukr ) gemm_ukr = user_ukr;
+
+	// Temporary C buffer for edge cases. Note that the strides of this
+	// temporary buffer are set so that they match the storage of the
+	// original C matrix. For example, if C is column-stored, ct will be
+	// column-stored as well.
+	char        ct[ BLIS_STACK_BUF_MAX_SIZE ]
+	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
+	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx );
+	const inc_t rs_ct       = ( col_pref ? 1 : NR );
+	const inc_t cs_ct       = ( col_pref ? MR : 1 );
+	const char* zero        = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO );
+
+	//
+	// Assumptions/assertions:
+	//   rs_a == 1
+	//   cs_a == PACKMR
+	//   pd_a == MR
+	//   ps_a == stride to next micro-panel of A
+	//   rs_b == PACKNR
+	//   cs_b == 1
+	//   pd_b == NR
+	//   ps_b == stride to next micro-panel of B
+	//   rs_c == (no assumptions)
+	//   cs_c == (no assumptions)
+	//
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+	const dim_t n_left = n % NR;
+
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
+
+	// Determine some increments used to step through A, B, and C.
+	const inc_t rstep_a = ps_a * dt_size;
+
+	const inc_t cstep_b = ps_b * dt_size;
+
+	const inc_t rstep_c = rs_c * MR * dt_c_size;
+	const inc_t cstep_c = cs_c * NR * dt_c_size;
+
+	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	// Save the imaginary stride of A and B to the auxinfo_t object.
+	bli_auxinfo_set_is_a( is_a, &aux );
+	bli_auxinfo_set_is_b( is_b, &aux );
+
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
+
+	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	// loop around the microkernel. Notice that this variant doesn't utilize
+	// parallelism in the 1st (ir) loop around the microkernel.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	//const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	//const dim_t ir_tid = bli_thrinfo_work_id( caucus );
+
+	// Determine the starting microtile offsets and number of microtiles to
+	// compute for each thread. Note that assignment of microtiles is done
+	// according to the tlb policy.
+	dim_t jr_st, ir_st;
+	const dim_t n_ut_for_me
+	=
+	bli_thread_range_tlb_d( jr_nt, jr_tid, m_iter, n_iter, MR, NR, &jr_st, &ir_st );
+
+	// It's possible that there are so few microtiles relative to the number
+	// of threads that one or more threads gets no work. If that happens, those
+	// threads can return early.
+	if ( n_ut_for_me == 0 ) return;
+
+	// Start the jr/ir loops with the current thread's microtile offsets computed
+	// by bli_thread_range_tlb_d().
+	dim_t i = ir_st;
+	dim_t j = jr_st;
+
+	// Initialize a counter to track the number of microtiles computed by the
+	// current thread.
+	dim_t ut = 0;
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( ; true; ++j )
+	{
+		const char* b1 = b_cast + j * cstep_b;
+		      char* c1 = c_cast + j * cstep_c;
+
+		// Compute the current microtile's width.
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		bli_auxinfo_set_next_b( b2, &aux );
+
+		// Loop over the m dimension (MR rows at a time).
+		for ( ; i < m_iter; ++i )
+		{
+			const char* a1  = a_cast + i * rstep_a;
+			      char* c11 = c1     + i * rstep_c;
+
+			// Compute the current microtile's length.
+			const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+			                      ? MR : m_left );
+
+			// Compute the addresses of the next panels of A and B.
+			const char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, 1 );
+			if ( bli_is_last_iter_sl( i, m_iter ) )
+			{
+				a2 = a_cast;
+				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, 1 );
+				bli_auxinfo_set_next_b( b2, &aux );
+			}
+
+			// Save addresses of next panels of A and B to the auxinfo_t
+			// object.
+			bli_auxinfo_set_next_a( a2, &aux );
+
+			// Edge case handling now occurs within the microkernel itself, but
+			// we must still explicitly accumulate to a temporary microtile in
+			// situations where a virtual microkernel is being used, such as
+			// during the 1m method or some cases of mixed datatypes.
+			if ( dt_exec == dt_c )
+			{
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )beta_cast,
+				           c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+			}
+			else
+			{
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  MR,
+				  NR,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )zero,
+				           &ct, rs_ct, cs_ct,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				// Accumulate to C with typecasting.
+				xpbys_mxn[ dt_exec ][ dt_c ]
+				(
+				  m_cur, n_cur,
+				  &ct, rs_ct, cs_ct,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c
+				);
+			}
+
+			ut += 1;
+			if ( ut == n_ut_for_me ) return;
+		}
+
+		i = 0;
+	}
+}
+
+//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2b: b1", k, NR, b1, NR, 1, "%4.1f", "" );
+//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2b: a1", MR, k, a1, 1, MR, "%4.1f", "" );
+//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2b: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" );
diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h
index 24f7ecfb9..f69327db0 100644
--- a/frame/3/gemm/bli_gemm_var.h
+++ b/frame/3/gemm/bli_gemm_var.h
@@ -65,6 +65,7 @@ GENPROT( gemm_blk_var1 )
 GENPROT( gemm_blk_var2 )
 GENPROT( gemm_blk_var3 )
 
-GENPROT( gemm_ker_var1 )
 GENPROT( gemm_ker_var2 )
 
+GENPROT( gemm_ker_var2b )
+
diff --git a/frame/3/gemmt/attic/bli_gemmt_l_ker_var2b.c b/frame/3/gemmt/attic/bli_gemmt_l_ker_var2b.c
new file mode 100644
index 000000000..fbfafebb0
--- /dev/null
+++ b/frame/3/gemmt/attic/bli_gemmt_l_ker_var2b.c
@@ -0,0 +1,429 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+typedef void (*xpbys_mxn_l_vft)
+    (
+      doff_t diagoff,
+      dim_t  m,
+      dim_t  n,
+      void*  x, inc_t rs_x, inc_t cs_x,
+      void*  b,
+      void*  y, inc_t rs_y, inc_t cs_y
+    );
+
+#undef GENTFUNC
+#define GENTFUNC(ctype,ch,op) \
+/* void*-based wrapper; BLIS_INLINE matches the xpbys_mxn wrappers in bli_gemm_ker_var2[b].c. */ \
+BLIS_INLINE void PASTEMAC(ch,op) \
+    ( \
+      doff_t diagoff, \
+      dim_t  m, \
+      dim_t  n, \
+      void*  x, inc_t rs_x, inc_t cs_x, \
+      void*  b, \
+      void*  y, inc_t rs_y, inc_t cs_y \
+    ) \
+{ \
+	ctype* restrict x_cast = x; \
+	ctype* restrict b_cast = b; \
+	ctype* restrict y_cast = y; \
+	/* Forward the typed pointers to the xpbys_mxn_l operation for this datatype. */ \
+	PASTEMAC3(ch,ch,ch,xpbys_mxn_l) \
+	( \
+	  diagoff, \
+	  m, n, \
+	  x_cast, rs_x, cs_x, \
+	  b_cast, \
+	  y_cast, rs_y,  cs_y \
+	); \
+}
+
+INSERT_GENTFUNC_BASIC0(xpbys_mxn_l_fn);
+
+static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn);
+
+void bli_gemmt_l_ker_var2b
+     (
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
+     )
+{
+	const num_t  dt        = bli_obj_exec_dt( c );
+	const dim_t  dt_size   = bli_dt_size( dt );
+
+	      doff_t diagoffc  = bli_obj_diag_offset( c );
+
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
+
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
+
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  is_a      = bli_obj_imag_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
+
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  is_b      = bli_obj_imag_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
+
+	// Detach the scalars attached to A and B and multiply them via bli_mulsc().
+	obj_t scalar_a, scalar_b;
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Index into the ftypes array to call the selected function. NOTE(review):
+	// ftypes, dt_exec, cs_a, rs_b, rntm, and thread are not declared in this function.
+	ftypes[dt_exec]
+	(
+	  diagoffc,
+	  schema_a,
+	  schema_b,
+	  m,
+	  n,
+	  k,
+	  ( void* )buf_alpha,
+	  ( void* )buf_a, cs_a, is_a,
+	                  pd_a, ps_a,
+	  ( void* )buf_b, rs_b, is_b,
+	                  pd_b, ps_b,
+	  ( void* )buf_beta,
+	           buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx,
+	  rntm,
+	  thread
+	);
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       doff_t  diagoffc, \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, inc_t is_a, \
+                  dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, inc_t is_b, \
+                  dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt         = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR         = pd_a; \
+	const dim_t     NR         = pd_b; \
+	/*const dim_t     PACKMR     = cs_a;*/ \
+	/*const dim_t     PACKNR     = rs_b;*/ \
+\
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer are set so that they match the storage of the
+	   original C matrix. For example, if C is column-stored, ct will be
+	   column-stored as well. */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict zero       = PASTEMAC(ch,0); \
+	ctype* restrict a_cast     = a; \
+	ctype* restrict b_cast     = b; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+\
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Safeguard: If the current panel of C is entirely above the diagonal,
+	   it is not stored. So we do nothing. */ \
+	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
+\
+	/* If there is a zero region above where the diagonal of C intersects
+	   the left edge of the panel, adjust the pointer to C and A and treat
+	   this case as if the diagonal offset were zero.
+	   NOTE: It's possible that after this pruning that the diagonal offset
+	   is still negative (though its absolute value is guaranteed to be less
+	   than MR). */ \
+	if ( diagoffc < 0 ) \
+	{ \
+		const dim_t ip = -diagoffc / MR; \
+		const dim_t i  = ip * MR; \
+\
+		m        = m - i; \
+		diagoffc = diagoffc % MR; \
+		c_cast   = c_cast + (i  )*rs_c; \
+		a_cast   = a_cast + (ip )*ps_a; \
+	} \
+\
+	/* If there is a zero region to the right of where the diagonal
+	   of C intersects the bottom of the panel, shrink it to prevent
+	   "no-op" iterations from executing. */ \
+	if ( diagoffc + m < n ) \
+	{ \
+		n = diagoffc + m; \
+	} \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */ \
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); \
+	const dim_t n_left = n % NR; \
+\
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); \
+	const dim_t m_left = m % MR; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	const inc_t rstep_a = ps_a; \
+\
+	const inc_t cstep_b = ps_b; \
+\
+	const inc_t rstep_c = rs_c * MR; \
+	const inc_t cstep_c = cs_c * NR; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_a( is_a, &aux ); \
+	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	const dim_t jr_inc = 1; \
+	const dim_t ir_inc = 1; \
+\
+	/* Determine the starting microtile offsets and number of microtiles to
+	   compute for each thread. Note that assignment of microtiles is done
+	   according to the tlb policy. */ \
+	dim_t jr_st, ir_st; \
+	const dim_t n_ut_for_me \
+	= \
+	bli_thread_range_tlb( thread, diagoffc, BLIS_LOWER, m, n, MR, NR, \
+	                      &jr_st, &ir_st ); \
+\
+	/* It's possible that there are so few microtiles relative to the number
+	   of threads that one or more threads gets no work. If that happens, those
+	   threads can return early. */ \
+	if ( n_ut_for_me == 0 ) return; \
+\
+	/* Start the jr/ir loops with the current thread's microtile offsets computed
+	   by bli_thread_range_tlb(). */ \
+	dim_t i = ir_st; \
+	dim_t j = jr_st; \
+\
+	/* Initialize a counter to track the number of microtiles computed by the
+	   current thread. */ \
+	dim_t ut = 0; \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( ; true; ++j ) \
+	{ \
+		ctype* restrict b1 = b_cast + j * cstep_b; \
+		ctype* restrict c1 = c_cast + j * cstep_c; \
+\
+		/* Compute the diagonal offset for the column of microtiles at (0,j). */ \
+		const doff_t diagoffc_j = diagoffc - (doff_t)j*NR; \
+		const dim_t  n_cur      = ( bli_is_not_edge_f( j, n_iter, n_left ) \
+		                            ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		ctype* restrict b2 = b1; \
+\
+		/* Interior loop over the m dimension (MR rows at a time). */ \
+		for ( ; i < m_iter; ++i ) \
+		{ \
+			/* Compute the diagonal offset for the microtile at (i,j). */ \
+			const doff_t diagoffc_ij = diagoffc_j + (doff_t)i*MR; \
+			const dim_t  m_cur       = ( bli_is_not_edge_f( i, m_iter, m_left ) \
+			                             ? MR : m_left ); \
+\
+			/* If the diagonal intersects the current MR x NR microtile, we
+			   compute it in the temporary buffer and then add in the elements
+			   on or below the diagonal.
+			   Otherwise, if the microtile is strictly below the diagonal,
+			   we compute and store as we normally would.
+			   And if we're strictly above the diagonal, we simply advance
+			   to the last microtile strictly above the diagonal. */ \
+			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
+			{ \
+				ctype* restrict a1  = a_cast + i * rstep_a; \
+				ctype* restrict c11 = c1     + i * rstep_c; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				ctype* restrict a2 \
+				= bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  MR, \
+				  NR, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  zero, \
+				  ct, rs_ct, cs_ct, \
+				  &aux, \
+				  cntx  \
+				); \
+\
+				/* Scale C and add the result to only the stored part. */ \
+				PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
+				                          m_cur, n_cur, \
+				                          ct,  rs_ct, cs_ct, \
+				                          beta_cast, \
+				                          c11, rs_c,  cs_c ); \
+\
+				ut += 1; \
+				if ( ut == n_ut_for_me ) return; \
+			} \
+			else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
+			{ \
+				ctype* restrict a1  = a_cast + i * rstep_a; \
+				ctype* restrict c11 = c1     + i * rstep_c; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				ctype* restrict a2 \
+				= bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+				if ( bli_is_last_iter_tlb_l( i, m_iter ) ) \
+				{ \
+					a2 = bli_gemmt_l_wrap_a_upanel( a_cast, rstep_a, \
+					                                diagoffc_j, MR, NR ); \
+					b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+					/* We don't bother computing b2 for the last iteration of the
+					   jr loop since the current thread won't know its jr_st until
+					   the next time it calls bli_thread_range_tlb(). */ \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
+\
+				ut += 1; \
+				if ( ut == n_ut_for_me ) return; \
+			} \
+			else /* if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) */ \
+			{ \
+				/* Skip ahead to the last microtile strictly above the diagonal. */ \
+				i = -diagoffc_j / MR - 1; \
+			} \
+		} \
+\
+		/* Upon reaching the end of the column of microtiles, get ready to begin at
+		   the beginning of the next column (i.e., the next jr loop iteration). */ \
+		i = 0; \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2b )
+
diff --git a/frame/3/gemmt/attic/bli_gemmt_u_ker_var2b.c b/frame/3/gemmt/attic/bli_gemmt_u_ker_var2b.c
new file mode 100644
index 000000000..311180d19
--- /dev/null
+++ b/frame/3/gemmt/attic/bli_gemmt_u_ker_var2b.c
@@ -0,0 +1,418 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemmt_fp
+// Signature of the datatype-specific kernels that GENTFUNC defines later in this file.
+typedef void (*FUNCPTR_T)
+     (
+       doff_t  diagoffc,
+       pack_t  schema_a,
+       pack_t  schema_b,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       void*   alpha,
+       void*   a, inc_t cs_a, inc_t is_a,
+                  dim_t pd_a, inc_t ps_a,
+       void*   b, inc_t rs_b, inc_t is_b,
+                  dim_t pd_b, inc_t ps_b,
+       void*   beta,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+
+static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2b); // Indexed by execution datatype below; GENARRAY presumably lists the per-type kernels (confirm).
+
+// Object-based front-end: unpacks a, b, and c, then dispatches on c's execution datatype.
+void bli_gemmt_u_ker_var2b
+     (
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             cntl_t* cntl,
+             thrinfo_t* thread
+     )
+{
+	const num_t  dt_exec   = bli_obj_exec_dt( c );
+
+	const doff_t diagoffc  = bli_obj_diag_offset( c );
+
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
+
+	const dim_t  m         = bli_obj_length( c );
+	const dim_t  n         = bli_obj_width( c );
+	const dim_t  k         = bli_obj_width( a );
+
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const inc_t  is_a      = bli_obj_imag_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
+
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const inc_t  is_b      = bli_obj_imag_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
+
+	// Detach and multiply the scalars attached to A and B (the product is read from scalar_b below).
+	obj_t  scalar_a, scalar_b;
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	ftypes[dt_exec]
+	(
+	  diagoffc,
+	  schema_a,
+	  schema_b,
+	  m,
+	  n,
+	  k,
+	  ( void* )buf_alpha,
+	  ( void* )buf_a, cs_a, is_a,
+	                  pd_a, ps_a,
+	  ( void* )buf_b, rs_b, is_b,
+	                  pd_b, ps_b,
+	  ( void* )buf_beta,
+	           buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx,
+	  NULL, // rntm: none is in scope in this function; the typed kernel in this file does not reference it.
+	  thread
+	);
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+/* Typed upper-triangular gemmt macrokernel; microtiles are assigned to threads by bli_thread_range_tlb(). */ \
+void PASTEMAC(ch,varname) \
+     ( \
+       doff_t  diagoffc, \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, inc_t is_a, \
+                  dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, inc_t is_b, \
+                  dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt         = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR         = pd_a; \
+	const dim_t     NR         = pd_b; \
+	/*const dim_t     PACKMR     = cs_a;*/ \
+	/*const dim_t     PACKNR     = rs_b;*/ \
+\
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer are set so that they match the storage of the
+	   original C matrix. For example, if C is column-stored, ct will be
+	   column-stored as well. */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict zero       = PASTEMAC(ch,0); \
+	ctype* restrict a_cast     = a; \
+	ctype* restrict b_cast     = b; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+\
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Safeguard: If the current panel of C is entirely below the diagonal,
+	   it is not stored. So we do nothing. */ \
+	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
+\
+	/* If there is a zero region to the left of where the diagonal of C
+	   intersects the top edge of the panel, adjust the pointer to C and B
+	   and treat this case as if the diagonal offset were zero.
+	   NOTE: It's possible that after this pruning that the diagonal offset
+	   is still positive (though it is guaranteed to be less than NR). */ \
+	if ( diagoffc > 0 ) \
+	{ \
+		const dim_t jp = diagoffc / NR; \
+		const dim_t j  = jp * NR; \
+\
+		n        = n - j; \
+		diagoffc = diagoffc % NR; \
+		c_cast   = c_cast + (j  )*cs_c; \
+		b_cast   = b_cast + (jp )*ps_b; \
+	} \
+\
+	/* If there is a zero region below where the diagonal of C intersects
+	   the right edge of the panel, shrink it to prevent "no-op" iterations
+	   from executing. */ \
+	if ( -diagoffc + n < m ) \
+	{ \
+		m = -diagoffc + n; \
+	} \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */ \
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); \
+	const dim_t n_left = n % NR; \
+\
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); \
+	const dim_t m_left = m % MR; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	const inc_t rstep_a = ps_a; \
+\
+	const inc_t cstep_b = ps_b; \
+\
+	const inc_t rstep_c = rs_c * MR; \
+	const inc_t cstep_c = cs_c * NR; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_a( is_a, &aux ); \
+	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	/* Save the virtual microkernel address and the params. */ \
+	/*bli_auxinfo_set_ukr( gemm_ukr, &aux );*/ \
+	/*bli_auxinfo_set_params( params, &aux );*/ \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
+\
+	const dim_t jr_inc = 1; \
+	const dim_t ir_inc = 1; \
+\
+	/* Determine the starting microtile offsets and number of microtiles to
+	   compute for each thread. Note that assignment of microtiles is done
+	   according to the tlb policy. */ \
+	dim_t jr_st, ir_st; \
+	const dim_t n_ut_for_me \
+	= \
+	bli_thread_range_tlb( thread, diagoffc, BLIS_UPPER, m, n, MR, NR, \
+	                      &jr_st, &ir_st ); \
+\
+	/* It's possible that there are so few microtiles relative to the number
+	   of threads that one or more threads gets no work. If that happens, those
+	   threads can return early. */ \
+	if ( n_ut_for_me == 0 ) return; \
+\
+	/* Start the jr/ir loops with the current thread's microtile offsets computed
+	   by bli_thread_range_tlb(). */ \
+	dim_t i = ir_st; \
+	dim_t j = jr_st; \
+\
+	/* Initialize a counter to track the number of microtiles computed by the
+	   current thread. */ \
+	dim_t ut = 0; \
+\
+	/* Loop over the n dimension (NR columns at a time); exits via return once ut == n_ut_for_me. */ \
+	for ( ; true; ++j ) \
+	{ \
+		ctype* restrict b1 = b_cast + j * cstep_b; \
+		ctype* restrict c1 = c_cast + j * cstep_c; \
+\
+		/* Compute the diagonal offset for the column of microtiles at (0,j). */ \
+		const doff_t diagoffc_j = diagoffc - (doff_t)j*NR; \
+		const dim_t  n_cur      = ( bli_is_not_edge_f( j, n_iter, n_left ) \
+		                            ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		ctype* restrict b2 = b1; \
+\
+		/* Interior loop over the m dimension (MR rows at a time). */ \
+		for ( ; i < m_iter; ++i ) \
+		{ \
+			/* Compute the diagonal offset for the microtile at (i,j). */ \
+			const doff_t diagoffc_ij = diagoffc_j + (doff_t)i*MR; \
+			const dim_t  m_cur       = ( bli_is_not_edge_f( i, m_iter, m_left ) \
+			                             ? MR : m_left ); \
+\
+			/* If the diagonal intersects the current MR x NR submatrix, we
+			   compute it in the temporary buffer and then add in the elements
+			   on or above the diagonal.
+			   Otherwise, if the submatrix is strictly above the diagonal,
+			   we compute and store as we normally would.
+			   And if we're strictly below the diagonal, we simply advance
+			   to the last microtile before the bottom of the matrix. */ \
+			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
+			{ \
+				ctype* restrict a1  = a_cast + i * rstep_a; \
+				ctype* restrict c11 = c1     + i * rstep_c; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				ctype* restrict a2 \
+				= bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+				if ( bli_is_last_iter_tlb_u( diagoffc_ij, MR, NR ) ) \
+				{ \
+					a2 = bli_gemmt_u_wrap_a_upanel( a_cast, rstep_a, \
+					                                diagoffc_j, MR, NR ); \
+					b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+					/* We don't bother computing b2 for the last iteration of the
+					   jr loop since the current thread won't know its j_st until
+					   the next time it calls bli_thread_range_tlb(). */ \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  MR, \
+				  NR, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  zero, \
+				  ct, rs_ct, cs_ct, \
+				  &aux, \
+				  cntx  \
+				); \
+\
+				/* Scale C and add the result to only the stored part. */ \
+				PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
+				                          m_cur, n_cur, \
+				                          ct,  rs_ct, cs_ct, \
+				                          beta_cast, \
+				                          c11, rs_c,  cs_c ); \
+\
+				ut += 1; \
+				if ( ut == n_ut_for_me ) return; \
+			} \
+			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
+			{ \
+				ctype* restrict a1  = a_cast + i * rstep_a; \
+				ctype* restrict c11 = c1     + i * rstep_c; \
+\
+				/* Compute the address of the next panel of A. */ \
+				ctype* restrict a2 \
+				= bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
+\
+				ut += 1; \
+				if ( ut == n_ut_for_me ) return; \
+			} \
+			else /* if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) */ \
+			{ \
+				/* Skip past the microtiles strictly below the diagonal. */ \
+				i = m_iter - 1; \
+			} \
+		} \
+\
+		i = 0; /* Begin at the top of the next column of microtiles (next jr iteration). */ \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2b )
+
diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
index 4a3a48304..fd726da6f 100644
--- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
@@ -48,7 +48,7 @@ typedef void (*xpbys_mxn_l_vft)
 #undef GENTFUNC
 #define GENTFUNC(ctype,ch,op) \
 \
-void PASTEMAC(ch,op) \
+BLIS_INLINE void PASTEMAC(ch,op) \
     ( \
       doff_t diagoff, \
       dim_t  m, \
@@ -76,18 +76,19 @@ INSERT_GENTFUNC_BASIC0(xpbys_mxn_l_fn);
 
 static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn);
 
+
 void bli_gemmt_l_ker_var2
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
              thrinfo_t* thread_par
      )
 {
-	const num_t  dt        = bli_obj_exec_dt( c );
-	const dim_t  dt_size   = bli_dt_size( dt );
+	const num_t  dt_exec   = bli_obj_exec_dt( c );
+	const num_t  dt_c      = bli_obj_dt( c );
 
 	      doff_t diagoffc  = bli_obj_diag_offset( c );
 
@@ -113,7 +114,7 @@ void bli_gemmt_l_ker_var2
 	const inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// Detach and multiply the scalars attached to A and B.
-	obj_t  scalar_a, scalar_b;
+	obj_t scalar_a, scalar_b;
 	bli_obj_scalar_detach( a, &scalar_a );
 	bli_obj_scalar_detach( b, &scalar_b );
 	bli_mulsc( &scalar_a, &scalar_b );
@@ -123,14 +124,17 @@ void bli_gemmt_l_ker_var2
 	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
 	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
+	const siz_t dt_size   = bli_dt_size( dt_exec );
+	const siz_t dt_c_size = bli_dt_size( dt_c );
+
 	// Alias some constants to simpler names.
 	const dim_t MR = pd_a;
 	const dim_t NR = pd_b;
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_vft    gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
-	xpbys_mxn_l_vft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt ];
+	gemm_ukr_vft    gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
+	xpbys_mxn_l_vft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt_exec ];
 
 	// Temporary C buffer for edge cases. Note that the strides of this
 	// temporary buffer are set so that they match the storage of the
@@ -138,11 +142,11 @@ void bli_gemmt_l_ker_var2
 	// column-stored as well.
 	      char  ct[ BLIS_STACK_BUF_MAX_SIZE ]
 	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
-	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx );
+	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx );
 	const inc_t rs_ct       = ( col_pref ? 1 : NR );
 	const inc_t cs_ct       = ( col_pref ? MR : 1 );
 
-	const void* zero       = bli_obj_buffer_for_const( dt, &BLIS_ZERO );
+	const void* zero       = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO );
 	const char* a_cast     = buf_a;
 	const char* b_cast     = buf_b;
 	      char* c_cast     = buf_c;
@@ -175,12 +179,13 @@ void bli_gemmt_l_ker_var2
 	// this case as if the diagonal offset were zero.
 	if ( diagoffc < 0 )
 	{
-		dim_t ip       = -diagoffc / MR;
-		dim_t i        = ip * MR;
-		      m        = m - i;
-		      diagoffc = -diagoffc % MR;
-		      c_cast   = c_cast + (i  )*rs_c*dt_size;
-		      a_cast   = a_cast + (ip )*ps_a*dt_size;
+		const dim_t ip = -diagoffc / MR;
+		const dim_t i  = ip * MR;
+
+		m        = m - i;
+		diagoffc = diagoffc % MR;
+		c_cast   = c_cast + (i  )*rs_c*dt_c_size;
+		a_cast   = a_cast + (ip )*ps_a*dt_size;
 	}
 
 	// If there is a zero region to the right of where the diagonal
@@ -193,25 +198,23 @@ void bli_gemmt_l_ker_var2
 
 	// Compute number of primary and leftover components of the m and n
 	// dimensions.
-	dim_t n_iter = n / NR;
-	dim_t n_left = n % NR;
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+	const dim_t n_left = n % NR;
 
-	dim_t m_iter = m / MR;
-	dim_t m_left = m % MR;
-
-	if ( n_left ) ++n_iter;
-	if ( m_left ) ++m_iter;
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_size;
 
-	inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_size;
 
-	inc_t rstep_c = rs_c * MR * dt_size;
-	inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_c_size;
+	const inc_t cstep_c = cs_c * NR * dt_c_size;
 
-	// Save the pack schemas of A and B to the auxinfo_t object.
 	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
@@ -219,9 +222,6 @@ void bli_gemmt_l_ker_var2
 	bli_auxinfo_set_is_a( is_a, &aux );
 	bli_auxinfo_set_is_b( is_b, &aux );
 
-	// Save the desired output datatype (indicating no typecasting).
-	//bli_auxinfo_set_dt_on_output( dt, &aux );*/
-
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel. Here we query the thrinfo_t node for the
 	// 1st (ir) loop around the microkernel.
@@ -229,48 +229,21 @@ void bli_gemmt_l_ker_var2
 	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
 
 	// Query the number of threads and thread ids for each loop.
-	dim_t jr_nt  = bli_thrinfo_n_way( thread );
-	dim_t jr_tid = bli_thrinfo_work_id( thread );
-	dim_t ir_nt  = bli_thrinfo_n_way( caucus );
-	dim_t ir_tid = bli_thrinfo_work_id( caucus );
-
-	dim_t jr_start, jr_end;
-	dim_t ir_start, ir_end;
-	dim_t jr_inc,   ir_inc;
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	const dim_t ir_tid = bli_thrinfo_work_id( caucus );
 
-	// Note that we partition the 2nd loop into two regions: the rectangular
-	// part of C, and the triangular portion.
-	dim_t n_iter_rct;
-	dim_t n_iter_tri;
+	dim_t jr_start, jr_end, jr_inc;
+	dim_t ir_start, ir_end, ir_inc;
 
-	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) )
-	{
-		// If the entire panel of C does not intersect the diagonal, there is
-		// no triangular region, and therefore we can skip the second set of
-		// loops.
-		n_iter_rct = n_iter;
-		n_iter_tri = 0;
-	}
-	else
-	{
-		// If the panel of C does intersect the diagonal, compute the number of
-		// iterations in the rectangular region by dividing NR into the diagonal
-		// offset. Any remainder from this integer division is discarded, which
-		// is what we want. That is, we want the rectangular region to contain
-		// as many columns of whole microtiles as possible without including any
-		// microtiles that intersect the diagonal. The number of iterations in
-		// the triangular (or trapezoidal) region is computed as the remaining
-		// number of iterations in the n dimension.
-		n_iter_rct = diagoffc / NR;
-		n_iter_tri = n_iter - n_iter_rct;
-	}
-
-	// Determine the thread range and increment for the 2nd and 1st loops for
-	// the initial rectangular region of C (if it exists).
-	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// Determine the thread range and increment for the 2nd and 1st loops.
+	// NOTE: The definition of bli_thread_range_slrr() will depend on whether
 	// slab or round-robin partitioning was requested at configure-time.
-	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc );
-	bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc );
+	bli_thread_range_quad( thread, diagoffc, BLIS_LOWER, m, n, NR,
+	                       FALSE, &jr_start, &jr_end, &jr_inc );
+	//bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
 
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
@@ -278,7 +251,12 @@ void bli_gemmt_l_ker_var2
 		const char* b1 = b_cast + j * cstep_b;
 		      char* c1 = c_cast + j * cstep_c;
 
-		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+		// Compute the diagonal offset for the column of microtiles at (0,j).
+		const doff_t diagoffc_j = diagoffc - ( doff_t )j*NR;
+
+		// Compute the current microtile's width.
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
 
 		// Initialize our next panel of B to be the current panel of B.
 		const char* b2 = b1;
@@ -286,115 +264,34 @@ void bli_gemmt_l_ker_var2
 		// Interior loop over the m dimension (MR rows at a time).
 		for ( dim_t i = ir_start; i < ir_end; i += ir_inc )
 		{
-			const char* a1  = a_cast + i * rstep_a;
-			      char* c11 = c1     + i * rstep_c;
-
-			// No need to compute the diagonal offset for the rectangular
-			// region.
-			//diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/
+			// Compute the diagonal offset for the microtile at (i,j).
+			const doff_t diagoffc_ij = diagoffc_j + ( doff_t )i*MR;
 
-			dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+			// Compute the current microtile's length.
+			const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+			                      ? MR : m_left );
 
-			// Compute the addresses of the next panels of A and B.
-			const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc );
-			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) )
-			{
-				a2 = a_cast;
-				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc );
-				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
-					b2 = b_cast;
-			}
-
-			// Save addresses of next panels of A and B to the auxinfo_t
-			// object.
-			bli_auxinfo_set_next_a( a2, &aux );
-			bli_auxinfo_set_next_b( b2, &aux );
-
-			// If the diagonal intersects the current MR x NR submatrix, we
+			// If the diagonal intersects the current MR x NR microtile, we
 			// compute it the temporary buffer and then add in the elements
 			// on or below the diagonal.
-			// Otherwise, if the submatrix is strictly below the diagonal,
+			// Otherwise, if the microtile is strictly below the diagonal,
 			// we compute and store as we normally would.
 			// And if we're strictly above the diagonal, we do nothing and
-			// continue.
+			// continue on through the IR loop to consider the next MR x NR
+			// microtile.
+			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) )
 			{
-				// Invoke the gemm micro-kernel.
-				gemm_ukr
-				(
-				  m_cur,
-				  n_cur,
-				  k,
-				  ( void* )alpha_cast,
-				  ( void* )a1,
-				  ( void* )b1,
-				  ( void* )beta_cast,
-				  c11, rs_c, cs_c,
-				  &aux,
-				  ( cntx_t* )cntx
-				);
-			}
-		}
-	}
-
-	// If there is no triangular region, then we're done.
-	if ( n_iter_tri == 0 ) return;
-
-	// Use round-robin assignment of micropanels to threads in the 2nd loop
-	// and the default (slab or rr) partitioning in the 1st loop for the
-	// remaining triangular region of C.
-	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc );
-
-	// Advance the start and end iteration offsets for the triangular region
-	// by the number of iterations used for the rectangular region.
-	jr_start += n_iter_rct;
-	jr_end   += n_iter_rct;
-
-	// Loop over the n dimension (NR columns at a time).
-	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
-	{
-		const char* b1 = b_cast + j * cstep_b;
-		      char* c1 = c_cast + j * cstep_c;
-
-		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
-
-		// Initialize our next panel of B to be the current panel of B.
-		const char* b2 = b1;
-
-		// Interior loop over the m dimension (MR rows at a time).
-		for ( dim_t i = ir_start; i < ir_end; i += ir_inc )
-		{
-			const char* a1  = a_cast + i * rstep_a;
-			      char* c11 = c1     + i * rstep_c;
+				const char* a1  = a_cast + i * rstep_a;
+				      char* c11 = c1     + i * rstep_c;
 
-			// Compute the diagonal offset for the submatrix at (i,j).
-			doff_t diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;
+				// Compute the address of the next panel of A.
+				const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc );
 
-			dim_t  m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
 
-			// Compute the addresses of the next panels of A and B.
-			const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc );
-			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) )
-			{
-				a2 = a_cast;
-				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc );
-				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) )
-					b2 = b_cast;
-			}
-
-			// Save addresses of next panels of A and B to the auxinfo_t
-			// object.
-			bli_auxinfo_set_next_a( a2, &aux );
-			bli_auxinfo_set_next_b( b2, &aux );
-
-			// If the diagonal intersects the current MR x NR submatrix, we
-			// compute it the temporary buffer and then add in the elements
-			// on or below the diagonal.
-			// Otherwise, if the submatrix is strictly below the diagonal,
-			// we compute and store as we normally would.
-			// And if we're strictly above the diagonal, we do nothing and
-			// continue.
-			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) )
-			{
 				// Invoke the gemm micro-kernel.
 				gemm_ukr
 				(
@@ -411,14 +308,35 @@ void bli_gemmt_l_ker_var2
 				);
 
 				// Scale C and add the result to only the stored part.
-				xpbys_mxn_l_ukr( diagoffc_ij,
-				                 m_cur, n_cur,
-				                 ct,  rs_ct, cs_ct,
-				                 ( void* )beta_cast,
-				                 c11, rs_c,  cs_c );
+				xpbys_mxn_l_ukr
+				(
+				  diagoffc_ij,
+				  m_cur, n_cur,
+				  ct,  rs_ct, cs_ct,
+				  ( void* )beta_cast,
+				  c11, rs_c,  cs_c
+				);
 			}
 			else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) )
 			{
+				const char* a1  = a_cast + i * rstep_a;
+				      char* c11 = c1     + i * rstep_c;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc );
+				if ( bli_is_last_iter_l( i, m_iter, ir_tid, ir_nt ) )
+				{
+					a2 = bli_gemmt_l_wrap_a_upanel( a_cast, rstep_a, diagoffc_j, MR, NR );
+					b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc );
+					if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
 				// Invoke the gemm micro-kernel.
 				gemm_ukr
 				(
diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2b.c b/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
new file mode 100644
index 000000000..7c50a4a54
--- /dev/null
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
@@ -0,0 +1,387 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+typedef void (*xpbys_mxn_l_vft) // type-erased pointer to a per-datatype xpbys_mxn_l wrapper
+    (
+      doff_t diagoff,                   // diagonal offset of the m x n tile
+      dim_t  m,                         // tile length (rows)
+      dim_t  n,                         // tile width (columns)
+      void*  x, inc_t rs_x, inc_t cs_x, // source tile and its row/column strides (ct at the call site)
+      void*  b,                         // scalar operand (beta at the call site)
+      void*  y, inc_t rs_y, inc_t cs_y  // destination tile and its row/column strides (C at the call site)
+    );
+
+#undef GENTFUNC
+#define GENTFUNC(ctype,ch,op) \
+/* void*-based wrapper: casts x, b, y to ctype* and forwards to the typed xpbys_mxn_l routine. */\
+BLIS_INLINE void PASTEMAC(ch,op) \
+    ( \
+      doff_t diagoff, \
+      dim_t  m, \
+      dim_t  n, \
+      void*  x, inc_t rs_x, inc_t cs_x, \
+      void*  b, \
+      void*  y, inc_t rs_y, inc_t cs_y \
+    ) \
+{ \
+	ctype* restrict x_cast = x; \
+	ctype* restrict b_cast = b; \
+	ctype* restrict y_cast = y; \
+\
+	PASTEMAC3(ch,ch,ch,xpbys_mxn_l) \
+	( \
+	  diagoff, \
+	  m, n, \
+	  x_cast, rs_x, cs_x, \
+	  b_cast, \
+	  y_cast, rs_y,  cs_y \
+	); \
+}
+
+INSERT_GENTFUNC_BASIC0(xpbys_mxn_l_fn);
+// Table of xpbys_mxn_l_fn wrappers; the macrokernel below indexes it by execution datatype.
+static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn);
+
+// Lower-stored gemmt macrokernel using tlb partitioning: updates only microtiles on/below C's diagonal.
+void bli_gemmt_l_ker_var2b
+     (
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
+     )
+{
+	const num_t  dt_exec   = bli_obj_exec_dt( c );
+	const num_t  dt_c      = bli_obj_dt( c );
+
+	      doff_t diagoffc  = bli_obj_diag_offset( c );
+
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
+
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
+
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  is_a      = bli_obj_imag_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
+
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  is_b      = bli_obj_imag_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
+
+	// Detach and multiply the scalars attached to A and B.
+	obj_t scalar_a, scalar_b;
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	const siz_t dt_size   = bli_dt_size( dt_exec );
+	const siz_t dt_c_size = bli_dt_size( dt_c );
+
+	// Alias some constants to simpler names.
+	const dim_t MR = pd_a;
+	const dim_t NR = pd_b;
+
+	// Query the context for the micro-kernel address and cast it to its
+	// function pointer type.
+	gemm_ukr_vft    gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
+	xpbys_mxn_l_vft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt_exec ];
+
+	// Temporary C buffer for microtiles that intersect the diagonal. Note
+	// that the strides of this temporary buffer are set according to the
+	// storage preference queried for the gemm microkernel: column-stored if
+	// it prefers columns, and row-stored otherwise.
+	      char  ct[ BLIS_STACK_BUF_MAX_SIZE ]
+	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
+	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx );
+	const inc_t rs_ct       = ( col_pref ? 1 : NR );
+	const inc_t cs_ct       = ( col_pref ? MR : 1 );
+
+	const void* zero       = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO );
+	const char* a_cast     = buf_a;
+	const char* b_cast     = buf_b;
+	      char* c_cast     = buf_c;
+	const char* alpha_cast = buf_alpha;
+	const char* beta_cast  = buf_beta;
+
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If the current panel of C is entirely above the diagonal,
+	// it is not stored. So we do nothing.
+	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return;
+
+	// If there is a zero region above where the diagonal of C intersects
+	// the left edge of the panel, adjust the pointer to C and A and treat
+	// this case as if the diagonal offset were zero.
+	// NOTE: It's possible that after this pruning that the diagonal offset
+	// is still negative (though its absolute value is guaranteed to be less
+	// than MR).
+	if ( diagoffc < 0 )
+	{
+		const dim_t ip = -diagoffc / MR;
+		const dim_t i  = ip * MR;
+
+		m        = m - i;
+		diagoffc = diagoffc % MR;
+		c_cast   = c_cast + (i  )*rs_c*dt_c_size;
+		a_cast   = a_cast + (ip )*ps_a*dt_size;
+	}
+
+	// If there is a zero region to the right of where the diagonal
+	// of C intersects the bottom of the panel, shrink it to prevent
+	// "no-op" iterations from executing.
+	if ( diagoffc + m < n )
+	{
+		n = diagoffc + m;
+	}
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+	const dim_t n_left = n % NR;
+
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
+
+	// Determine some increments used to step through A, B, and C.
+	const inc_t rstep_a = ps_a * dt_size;
+
+	const inc_t cstep_b = ps_b * dt_size;
+
+	const inc_t rstep_c = rs_c * MR * dt_c_size;
+	const inc_t cstep_c = cs_c * NR * dt_c_size;
+
+	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	// Save the imaginary stride of A and B to the auxinfo_t object.
+	bli_auxinfo_set_is_a( is_a, &aux );
+	bli_auxinfo_set_is_b( is_b, &aux );
+
+	// The 'thread_par' argument is the parent of the thrinfo_t node for the
+	// 2nd (jr) loop around the microkernel. Here we query that sub-node; the
+	// node for the 1st (ir) loop ('caucus') is not used by this variant.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	//const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	//const dim_t ir_tid = bli_thrinfo_work_id( caucus );
+
+	// Determine the starting microtile offsets and number of microtiles to
+	// compute for each thread. Note that assignment of microtiles is done
+	// according to the tlb policy.
+	dim_t jr_st, ir_st;
+	const dim_t n_ut_for_me
+	=
+	bli_thread_range_tlb_l( jr_nt, jr_tid, diagoffc, m_iter, n_iter, MR, NR,
+	                        &jr_st, &ir_st );
+
+	// It's possible that there are so few microtiles relative to the number
+	// of threads that one or more threads gets no work. If that happens, those
+	// threads can return early.
+	if ( n_ut_for_me == 0 ) return;
+
+	// Start the jr/ir loops with the current thread's microtile offsets computed
+	// by bli_thread_range_tlb_l().
+	dim_t i = ir_st;
+	dim_t j = jr_st;
+
+	// Initialize a counter to track the number of microtiles computed by the
+    // current thread.
+	dim_t ut = 0;
+
+	// Loop over the n dimension (NR columns at a time); exits via return once ut == n_ut_for_me.
+	for ( ; true; ++j )
+	{
+		const char* b1 = b_cast + j * cstep_b;
+		      char* c1 = c_cast + j * cstep_c;
+
+		// Compute the diagonal offset for the column of microtiles at (0,j).
+		const doff_t diagoffc_j = diagoffc - ( doff_t )j*NR;
+
+		// Compute the current microtile's width.
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		bli_auxinfo_set_next_b( b2, &aux );
+
+		// Interior loop over the m dimension (MR rows at a time).
+		for ( ; i < m_iter; ++i )
+		{
+			// Compute the diagonal offset for the microtile at (i,j).
+			const doff_t diagoffc_ij = diagoffc_j + ( doff_t )i*MR;
+
+			// Compute the current microtile's length.
+			const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+			                      ? MR : m_left );
+
+			// If the diagonal intersects the current MR x NR microtile, we
+			// compute it in the temporary buffer and then add in the elements
+			// on or below the diagonal.
+			// Otherwise, if the microtile is strictly below the diagonal,
+			// we compute and store as we normally would.
+			// And if we're strictly above the diagonal, we simply advance
+			// to the last microtile before the diagonal.
+			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) )
+			{
+				const char* a1  = a_cast + i * rstep_a;
+				      char* c11 = c1     + i * rstep_c;
+
+				// Compute the address of the next panel of A.
+				const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, 1 );
+
+				// Save the address of the next panel of A to the auxinfo_t
+				// object. (next_b was already saved at the top of the jr loop.)
+				bli_auxinfo_set_next_a( a2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  MR,
+				  NR,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )zero,
+				  ct, rs_ct, cs_ct,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				// Scale C and add the result to only the stored part.
+				xpbys_mxn_l_ukr
+				(
+				  diagoffc_ij,
+				  m_cur, n_cur,
+				  ct,  rs_ct, cs_ct,
+				  ( void* )beta_cast,
+				  c11, rs_c,  cs_c
+				);
+
+				// Increment the microtile counter and check if the thread is done.
+				ut += 1;
+				if ( ut == n_ut_for_me ) return;
+			}
+			else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) )
+			{
+				const char* a1  = a_cast + i * rstep_a;
+				      char* c11 = c1     + i * rstep_c;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, 1 );
+				if ( bli_is_last_iter_tlb_l( i, m_iter ) )
+				{
+					a2 = bli_gemmt_l_wrap_a_upanel( a_cast, rstep_a, diagoffc_j, MR, NR );
+					b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, 1 );
+					bli_auxinfo_set_next_b( b2, &aux );
+				}
+
+				// Save the address of the next panel of A to the auxinfo_t
+				// object. (next_b is updated above only on the last ir iteration.)
+				bli_auxinfo_set_next_a( a2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				// Increment the microtile counter and check if the thread is done.
+				ut += 1;
+				if ( ut == n_ut_for_me ) return;
+			}
+			else // if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) )
+			{
+				// Skip ahead to the last microtile strictly above the diagonal; ++i then moves past it.
+				i = -diagoffc_j / MR - 1;
+			}
+		}
+
+		// Upon reaching the end of the column of microtiles, get ready to begin
+		// at the beginning of the next column (i.e., the next jr loop iteration).
+		i = 0;
+	}
+}
+
diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
index 5b4e1ccd9..78d5b869d 100644
--- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
@@ -48,7 +48,7 @@ typedef void (*xpbys_mxn_u_vft)
 #undef GENTFUNC
 #define GENTFUNC(ctype,ch,op) \
 \
-void PASTEMAC(ch,op) \
+BLIS_INLINE void PASTEMAC(ch,op) \
     ( \
       doff_t diagoff, \
       dim_t  m, \
@@ -76,18 +76,19 @@ INSERT_GENTFUNC_BASIC0(xpbys_mxn_u_fn);
 
 static xpbys_mxn_u_vft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn);
 
+
 void bli_gemmt_u_ker_var2
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
              thrinfo_t* thread_par
      )
 {
-	const num_t  dt        = bli_obj_exec_dt( c );
-	const dim_t  dt_size   = bli_dt_size( dt );
+	const num_t  dt_exec   = bli_obj_exec_dt( c );
+	const num_t  dt_c      = bli_obj_dt( c );
 
 	      doff_t diagoffc  = bli_obj_diag_offset( c );
 
@@ -113,7 +114,7 @@ void bli_gemmt_u_ker_var2
 	const inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// Detach and multiply the scalars attached to A and B.
-	obj_t  scalar_a, scalar_b;
+	obj_t scalar_a, scalar_b;
 	bli_obj_scalar_detach( a, &scalar_a );
 	bli_obj_scalar_detach( b, &scalar_b );
 	bli_mulsc( &scalar_a, &scalar_b );
@@ -123,14 +124,17 @@ void bli_gemmt_u_ker_var2
 	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
 	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
+	const siz_t dt_size   = bli_dt_size( dt_exec );
+	const siz_t dt_c_size = bli_dt_size( dt_c );
+
 	// Alias some constants to simpler names.
 	const dim_t MR = pd_a;
 	const dim_t NR = pd_b;
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_vft    gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
-	xpbys_mxn_u_vft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt ];
+	gemm_ukr_vft    gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
+	xpbys_mxn_u_vft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt_exec ];
 
 	// Temporary C buffer for edge cases. Note that the strides of this
 	// temporary buffer are set so that they match the storage of the
@@ -138,11 +142,11 @@ void bli_gemmt_u_ker_var2
 	// column-stored as well.
 	      char  ct[ BLIS_STACK_BUF_MAX_SIZE ]
 	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
-	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx );
+	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx );
 	const inc_t rs_ct       = ( col_pref ? 1 : NR );
 	const inc_t cs_ct       = ( col_pref ? MR : 1 );
 
-	const void* zero       = bli_obj_buffer_for_const( dt, &BLIS_ZERO );
+	const void* zero       = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO );
 	const char* a_cast     = buf_a;
 	const char* b_cast     = buf_b;
 	      char* c_cast     = buf_c;
@@ -177,12 +181,13 @@ void bli_gemmt_u_ker_var2
 	// is still positive (though it is guaranteed to be less than NR).
 	if ( diagoffc > 0 )
 	{
-		dim_t jp       = diagoffc / NR;
-		dim_t j        = jp * NR;
-		      n        = n - j;
-		      diagoffc = diagoffc % NR;
-		      c_cast   = c_cast + (j  )*cs_c*dt_size;
-		      b_cast   = b_cast + (jp )*ps_b*dt_size;
+		const dim_t jp = diagoffc / NR;
+		const dim_t j  = jp * NR;
+
+		n        = n - j;
+		diagoffc = diagoffc % NR;
+		c_cast   = c_cast + (j  )*cs_c*dt_c_size;
+		b_cast   = b_cast + (jp )*ps_b*dt_size;
 	}
 
 	// If there is a zero region below where the diagonal of C intersects
@@ -195,25 +200,23 @@ void bli_gemmt_u_ker_var2
 
 	// Compute number of primary and leftover components of the m and n
 	// dimensions.
-	dim_t n_iter = n / NR;
-	dim_t n_left = n % NR;
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+	const dim_t n_left = n % NR;
 
-	dim_t m_iter = m / MR;
-	dim_t m_left = m % MR;
-
-	if ( n_left ) ++n_iter;
-	if ( m_left ) ++m_iter;
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_size;
 
-	inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_size;
 
-	inc_t rstep_c = rs_c * MR * dt_size;
-	inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_c_size;
+	const inc_t cstep_c = cs_c * NR * dt_c_size;
 
-	// Save the pack schemas of A and B to the auxinfo_t object.
 	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
@@ -221,9 +224,6 @@ void bli_gemmt_u_ker_var2
 	bli_auxinfo_set_is_a( is_a, &aux );
 	bli_auxinfo_set_is_b( is_b, &aux );
 
-	// Save the desired output datatype (indicating no typecasting).
-	//bli_auxinfo_set_dt_on_output( dt, &aux );*/
-
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel. Here we query the thrinfo_t node for the
 	// 1st (ir) loop around the microkernel.
@@ -231,47 +231,21 @@ void bli_gemmt_u_ker_var2
 	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
 
 	// Query the number of threads and thread ids for each loop.
-	dim_t jr_nt  = bli_thrinfo_n_way( thread );
-	dim_t jr_tid = bli_thrinfo_work_id( thread );
-	dim_t ir_nt  = bli_thrinfo_n_way( caucus );
-	dim_t ir_tid = bli_thrinfo_work_id( caucus );
-
-	dim_t jr_start, jr_end;
-	dim_t ir_start, ir_end;
-	dim_t jr_inc,   ir_inc;
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	//const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	//const dim_t ir_tid = bli_thrinfo_work_id( caucus );
 
-	// Note that we partition the 2nd loop into two regions: the triangular
-	// part of C, and the rectangular portion.
-	dim_t n_iter_tri;
-	dim_t n_iter_rct;
+	dim_t jr_start, jr_end, jr_inc;
+	dim_t ir_start, ir_end, ir_inc;
 
-	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) )
-	{
-		// If the entire panel of C does not intersect the diagonal, there is
-		// no triangular region, and therefore we can skip the first set of
-		// loops.
-		n_iter_tri = 0;
-		n_iter_rct = n_iter;
-	}
-	else
-	{
-		// If the panel of C does intersect the diagonal, compute the number of
-		// iterations in the triangular (or trapezoidal) region by dividing NR
-		// into the number of rows in C. A non-zero remainder means we need to
-		// add one additional iteration. That is, we want the triangular region
-		// to contain as few columns of whole microtiles as possible while still
-		// including all microtiles that intersect the diagonal. The number of
-		// iterations in the rectangular region is computed as the remaining
-		// number of iterations in the n dimension.
-		n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 );
-		n_iter_rct = n_iter - n_iter_tri;
-	}
-
-	// Use round-robin assignment of micropanels to threads in the 2nd loop
-	// and the default (slab or rr) partitioning in the 1st loop for the
-	// initial triangular region of C (if it exists).
-	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc );
-	bli_thread_range_jrir   ( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc );
+	// Determine the thread range and increment for the 2nd and 1st loops.
+	// NOTE: The definition of bli_thread_range_slrr() will depend on whether
+	// slab or round-robin partitioning was requested at configure-time.
+	bli_thread_range_quad( thread, diagoffc, BLIS_UPPER, m, n, NR,
+	                       FALSE, &jr_start, &jr_end, &jr_inc );
+	//bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
 
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
@@ -279,7 +253,12 @@ void bli_gemmt_u_ker_var2
 		const char* b1 = b_cast + j * cstep_b;
 		      char* c1 = c_cast + j * cstep_c;
 
-		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+		// Compute the diagonal offset for the column of microtiles at (0,j).
+		const doff_t diagoffc_j = diagoffc - ( doff_t )j*NR;
+
+		// Compute the current microtile's width.
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
 
 		// Initialize our next panel of B to be the current panel of B.
 		const char* b2 = b1;
@@ -287,38 +266,41 @@ void bli_gemmt_u_ker_var2
 		// Interior loop over the m dimension (MR rows at a time).
 		for ( dim_t i = ir_start; i < ir_end; i += ir_inc )
 		{
-			const char* a1  = a_cast + i * rstep_a;
-			      char* c11 = c1     + i * rstep_c;
-
-			// Compute the diagonal offset for the submatrix at (i,j).
-			doff_t diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;
+			// Compute the diagonal offset for the microtile at (i,j).
+			const doff_t diagoffc_ij = diagoffc_j + ( doff_t )i*MR;
 
-			dim_t  m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+			// Compute the current microtile's length.
+			const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+			                      ? MR : m_left );
 
-			// Compute the addresses of the next panels of A and B.
-			const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc );
-			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) )
-			{
-				a2 = a_cast;
-				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc );
-				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) )
-					b2 = b_cast;
-			}
-
-			// Save addresses of next panels of A and B to the auxinfo_t
-			// object.
-			bli_auxinfo_set_next_a( a2, &aux );
-			bli_auxinfo_set_next_b( b2, &aux );
-
-			// If the diagonal intersects the current MR x NR submatrix, we
+			// If the diagonal intersects the current MR x NR microtile, we
 			// compute it the temporary buffer and then add in the elements
 			// on or below the diagonal.
-			// Otherwise, if the submatrix is strictly above the diagonal,
+			// Otherwise, if the microtile is strictly above the diagonal,
 			// we compute and store as we normally would.
 			// And if we're strictly below the diagonal, we do nothing and
-			// continue.
+			// continue on through the IR loop to consider the next MR x NR
+			// microtile.
 			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) )
 			{
+				const char* a1  = a_cast + i * rstep_a;
+				      char* c11 = c1     + i * rstep_c;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc );
+				if ( bli_is_last_iter_u( diagoffc_ij, MR, NR, ir_inc ) )
+				{
+					a2 = bli_gemmt_u_wrap_a_upanel( a_cast, rstep_a, diagoffc_j, MR, NR );
+					b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc );
+					if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
 				// Invoke the gemm micro-kernel.
 				gemm_ukr
 				(
@@ -335,93 +317,28 @@ void bli_gemmt_u_ker_var2
 				);
 
 				// Scale C and add the result to only the stored part.
-				xpbys_mxn_u_ukr( diagoffc_ij,
-				                 m_cur, n_cur,
-				                 ct,  rs_ct, cs_ct,
-				                 ( void* )beta_cast,
-				                 c11, rs_c,  cs_c );
-			}
-			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) )
-			{
-				// Invoke the gemm micro-kernel.
-				gemm_ukr
+				xpbys_mxn_u_ukr
 				(
-				  m_cur,
-				  n_cur,
-				  k,
-				  ( void* )alpha_cast,
-				  ( void* )a1,
-				  ( void* )b1,
+				  diagoffc_ij,
+				  m_cur, n_cur,
+				  ct,  rs_ct, cs_ct,
 				  ( void* )beta_cast,
-				  c11, rs_c, cs_c,
-				  &aux,
-				  ( cntx_t* )cntx
+				  c11, rs_c,  cs_c
 				);
 			}
-		}
-	}
-
-	// If there is no rectangular region, then we're done.
-	if ( n_iter_rct == 0 ) return;
-
-	// Determine the thread range and increment for the 2nd loop of the
-	// remaining rectangular region of C (and also use default partitioning
-	// for the 1st loop).
-	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
-	// slab or round-robin partitioning was requested at configure-time.
-	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc );
-
-	// Advance the start and end iteration offsets for the rectangular region
-	// by the number of iterations used for the triangular region.
-	jr_start += n_iter_tri;
-	jr_end   += n_iter_tri;
-
-	// Loop over the n dimension (NR columns at a time).
-	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
-	{
-		const char* b1 = b_cast + j * cstep_b;
-		      char* c1 = c_cast + j * cstep_c;
-
-		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
-
-		// Initialize our next panel of B to be the current panel of B.
-		const char* b2 = b1;
-
-		// Interior loop over the m dimension (MR rows at a time).
-		for ( dim_t i = ir_start; i < ir_end; i += ir_inc )
-		{
-			const char* a1  = a_cast + i * rstep_a;
-			      char* c11 = c1     + i * rstep_c;
-
-			// No need to compute the diagonal offset for the rectangular
-			// region.
-			//diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/
-
-			dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
-
-			// Compute the addresses of the next panels of A and B.
-			const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc );
-			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) )
+			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) )
 			{
-				a2 = a_cast;
-				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc );
-				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
-					b2 = b_cast;
-			}
+				const char* a1  = a_cast + i * rstep_a;
+				      char* c11 = c1     + i * rstep_c;
 
-			// Save addresses of next panels of A and B to the auxinfo_t
-			// object.
-			bli_auxinfo_set_next_a( a2, &aux );
-			bli_auxinfo_set_next_b( b2, &aux );
+				// Compute the addresses of the next panel of A.
+				const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc );
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
 
-			// If the diagonal intersects the current MR x NR submatrix, we
-			// compute it the temporary buffer and then add in the elements
-			// on or below the diagonal.
-			// Otherwise, if the submatrix is strictly above the diagonal,
-			// we compute and store as we normally would.
-			// And if we're strictly below the diagonal, we do nothing and
-			// continue.
-			{
 				// Invoke the gemm micro-kernel.
 				gemm_ukr
 				(
diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2b.c b/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
new file mode 100644
index 000000000..91275577a
--- /dev/null
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
@@ -0,0 +1,386 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+typedef void (*xpbys_mxn_u_vft)
+    (
+      doff_t diagoff,
+      dim_t  m,
+      dim_t  n,
+      void*  x, inc_t rs_x, inc_t cs_x,
+      void*  b,
+      void*  y, inc_t rs_y, inc_t cs_y
+    );
+
+#undef GENTFUNC
+#define GENTFUNC(ctype,ch,op) \
+\
+BLIS_INLINE void PASTEMAC(ch,op) \
+    ( \
+      doff_t diagoff, \
+      dim_t  m, \
+      dim_t  n, \
+      void*  x, inc_t rs_x, inc_t cs_x, \
+      void*  b, \
+      void*  y, inc_t rs_y, inc_t cs_y \
+    ) \
+{ \
+	ctype* restrict x_cast = x; \
+	ctype* restrict b_cast = b; \
+	ctype* restrict y_cast = y; \
+\
+	PASTEMAC3(ch,ch,ch,xpbys_mxn_u) \
+	( \
+	  diagoff, \
+	  m, n, \
+	  x_cast, rs_x, cs_x, \
+	  b_cast, \
+	  y_cast, rs_y,  cs_y \
+	); \
+}
+
+INSERT_GENTFUNC_BASIC0(xpbys_mxn_u_fn);
+
+static xpbys_mxn_u_vft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn);
+
+
+void bli_gemmt_u_ker_var2b
+     (
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
+     )
+{
+	const num_t  dt_exec   = bli_obj_exec_dt( c );
+	const num_t  dt_c      = bli_obj_dt( c );
+
+	      doff_t diagoffc  = bli_obj_diag_offset( c );
+
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
+
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
+
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  is_a      = bli_obj_imag_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
+
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  is_b      = bli_obj_imag_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
+
+	// Detach and multiply the scalars attached to A and B.
+	obj_t scalar_a, scalar_b;
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	const siz_t dt_size   = bli_dt_size( dt_exec );
+	const siz_t dt_c_size = bli_dt_size( dt_c );
+
+	// Alias some constants to simpler names.
+	const dim_t MR = pd_a;
+	const dim_t NR = pd_b;
+
+	// Query the context for the micro-kernel address and cast it to its
+	// function pointer type.
+	gemm_ukr_vft    gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
+	xpbys_mxn_u_vft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt_exec ];
+
+	// Temporary C buffer for edge cases. Note that the strides of this
+	// temporary buffer are set so that they match the storage of the
+	// original C matrix. For example, if C is column-stored, ct will be
+	// column-stored as well.
+	      char  ct[ BLIS_STACK_BUF_MAX_SIZE ]
+	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
+	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx );
+	const inc_t rs_ct       = ( col_pref ? 1 : NR );
+	const inc_t cs_ct       = ( col_pref ? MR : 1 );
+
+	const void* zero       = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO );
+	const char* a_cast     = buf_a;
+	const char* b_cast     = buf_b;
+	      char* c_cast     = buf_c;
+	const char* alpha_cast = buf_alpha;
+	const char* beta_cast  = buf_beta;
+
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If the current panel of C is entirely below the diagonal,
+	// it is not stored. So we do nothing.
+	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return;
+
+	// If there is a zero region to the left of where the diagonal of C
+	// intersects the top edge of the panel, adjust the pointers to C and B
+	// and treat this case as if the diagonal offset were zero.
+	// NOTE: It's possible that after this pruning the diagonal offset
+	// is still positive (though it is guaranteed to be less than NR).
+	if ( diagoffc > 0 )
+	{
+		const dim_t jp = diagoffc / NR;
+		const dim_t j  = jp * NR;
+
+		n        = n - j;
+		diagoffc = diagoffc % NR;
+		c_cast   = c_cast + (j  )*cs_c*dt_c_size;
+		b_cast   = b_cast + (jp )*ps_b*dt_size;
+	}
+
+	// If there is a zero region below where the diagonal of C intersects
+	// the right edge of the panel, shrink it to prevent "no-op" iterations
+	// from executing.
+	if ( -diagoffc + n < m )
+	{
+		m = -diagoffc + n;
+	}
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+	const dim_t n_left = n % NR;
+
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
+
+	// Determine some increments used to step through A, B, and C.
+	const inc_t rstep_a = ps_a * dt_size;
+
+	const inc_t cstep_b = ps_b * dt_size;
+
+	const inc_t rstep_c = rs_c * MR * dt_c_size;
+	const inc_t cstep_c = cs_c * NR * dt_c_size;
+
+	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	// Save the imaginary stride of A and B to the auxinfo_t object.
+	bli_auxinfo_set_is_a( is_a, &aux );
+	bli_auxinfo_set_is_b( is_b, &aux );
+
+	// The 'thread_par' argument points to the parent of the thrinfo_t node
+	// for the 2nd (jr) loop around the microkernel. Here we query the
+	// thrinfo_t sub-node used for that jr loop (see jr_nt/jr_tid below).
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	//const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	//const dim_t ir_tid = bli_thrinfo_work_id( caucus );
+
+	// Determine the starting microtile offsets and number of microtiles to
+	// compute for each thread. Note that assignment of microtiles is done
+	// according to the tlb policy.
+	dim_t jr_st, ir_st;
+	const dim_t n_ut_for_me
+	=
+	bli_thread_range_tlb_u( jr_nt, jr_tid, diagoffc, m_iter, n_iter, MR, NR,
+	                        &jr_st, &ir_st );
+
+	// It's possible that there are so few microtiles relative to the number
+	// of threads that one or more threads gets no work. If that happens, those
+	// threads can return early.
+	if ( n_ut_for_me == 0 ) return;
+
+	// Start the jr/ir loops with the current thread's microtile offsets computed
+	// by bli_thread_range_tlb_u().
+	dim_t i = ir_st;
+	dim_t j = jr_st;
+
+	// Initialize a counter to track the number of microtiles computed by the
+	// current thread.
+	dim_t ut = 0;
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( ; true; ++j )
+	{
+		const char* b1 = b_cast + j * cstep_b;
+		      char* c1 = c_cast + j * cstep_c;
+
+		// Compute the diagonal offset for the column of microtiles at (0,j).
+		const doff_t diagoffc_j = diagoffc - ( doff_t )j*NR;
+
+		// Compute the current microtile's width.
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		bli_auxinfo_set_next_b( b2, &aux );
+
+		// Interior loop over the m dimension (MR rows at a time).
+		for ( ; i < m_iter; ++i )
+		{
+			// Compute the diagonal offset for the microtile at (i,j).
+			const doff_t diagoffc_ij = diagoffc_j + ( doff_t )i*MR;
+
+			// Compute the current microtile's length.
+			const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+			                      ? MR : m_left );
+
+			// If the diagonal intersects the current MR x NR microtile, we
+			// compute it in the temporary buffer and then add in the elements
+			// on or above the diagonal.
+			// Otherwise, if the microtile is strictly above the diagonal,
+			// we compute and store as we normally would.
+			// And if we're strictly below the diagonal, we simply advance
+			// to the last microtile before the bottom of the matrix.
+			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) )
+			{
+				const char* a1  = a_cast + i * rstep_a;
+				      char* c11 = c1     + i * rstep_c;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, 1 );
+				if ( bli_is_last_iter_tlb_u( diagoffc_ij, MR, NR ) )
+				{
+					a2 = bli_gemmt_u_wrap_a_upanel( a_cast, rstep_a, diagoffc_j, MR, NR );
+					b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, 1 );
+					bli_auxinfo_set_next_b( b2, &aux );
+				}
+
+				// Save the address of the next panel of A to the auxinfo_t
+				// object (the next panel of B was already saved above).
+				bli_auxinfo_set_next_a( a2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  MR,
+				  NR,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )zero,
+				  ct, rs_ct, cs_ct,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				// Scale C and add the result to only the stored part.
+				xpbys_mxn_u_ukr
+				(
+				  diagoffc_ij,
+				  m_cur, n_cur,
+				  ct,  rs_ct, cs_ct,
+				  ( void* )beta_cast,
+				  c11, rs_c,  cs_c
+				);
+
+				// Increment the microtile counter and check if the thread is done.
+				ut += 1;
+				if ( ut == n_ut_for_me ) return;
+			}
+			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) )
+			{
+				const char* a1  = a_cast + i * rstep_a;
+				      char* c11 = c1     + i * rstep_c;
+
+				// Compute the addresses of the next panel of A.
+				const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, 1 );
+
+				// Save the address of the next panel of A to the auxinfo_t
+				// object (the next panel of B was already saved above).
+				bli_auxinfo_set_next_a( a2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				// Increment the microtile counter and check if the thread is done.
+				ut += 1;
+				if ( ut == n_ut_for_me ) return;
+			}
+			else // if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) )
+			{
+				// Skip past the microtiles strictly below the diagonal.
+				i = m_iter - 1;
+			}
+		}
+
+		// Upon reaching the end of the column of microtiles, get ready to begin
+		// at the beginning of the next column (i.e., the next jr loop iteration).
+		i = 0;
+	}
+}
+
diff --git a/frame/3/gemmt/bli_gemmt_var.h b/frame/3/gemmt/bli_gemmt_var.h
index eb6e16018..339b93755 100644
--- a/frame/3/gemmt/bli_gemmt_var.h
+++ b/frame/3/gemmt/bli_gemmt_var.h
@@ -43,46 +43,19 @@
 \
 void PASTEMAC0(opname) \
      ( \
-       const obj_t*  a, \
-       const obj_t*  ah, \
-       const obj_t*  c, \
-       const cntx_t* cntx, \
-       const cntl_t* cntl, \
-             thrinfo_t* thread  \
+       const obj_t*     a, \
+       const obj_t*     ah, \
+       const obj_t*     c, \
+       const cntx_t*    cntx, \
+       const cntl_t*    cntl, \
+             thrinfo_t* thread_par  \
      );
 
 GENPROT( gemmt_x_ker_var2 )
-
 GENPROT( gemmt_l_ker_var2 )
 GENPROT( gemmt_u_ker_var2 )
 
-
-//
-// Prototype BLAS-like interfaces with void pointer operands.
-//
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffc, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       thrinfo_t* thread  \
-     );
-
-INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
-INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )
+GENPROT( gemmt_x_ker_var2b )
+GENPROT( gemmt_l_ker_var2b )
+GENPROT( gemmt_u_ker_var2b )
 
diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c
index 207e1c938..8081537b9 100644
--- a/frame/3/gemmt/bli_gemmt_x_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c
@@ -42,12 +42,12 @@ static l3_var_oft vars[2] =
 
 void bli_gemmt_x_ker_var2
      (
-       const obj_t*  a,
-       const obj_t*  ah,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
-             thrinfo_t* thread
+       const obj_t*     a,
+       const obj_t*     ah,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
      )
 {
 	dim_t uplo;
@@ -67,7 +67,7 @@ void bli_gemmt_x_ker_var2
 	  c,
 	  cntx,
 	  cntl,
-	  thread
+	  thread_par
 	);
 }
 
diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2b.c b/frame/3/gemmt/bli_gemmt_x_ker_var2b.c
new file mode 100644
index 000000000..132d7c13a
--- /dev/null
+++ b/frame/3/gemmt/bli_gemmt_x_ker_var2b.c
@@ -0,0 +1,73 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+static l3_var_oft vars[2] =
+{
+	bli_gemmt_l_ker_var2b, bli_gemmt_u_ker_var2b,
+};
+
+void bli_gemmt_x_ker_var2b
+     (
+       const obj_t*     a,
+       const obj_t*     ah,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
+     )
+{
+	dim_t uplo;
+
+	// Set an index (0 = lower, 1 = upper) based on the uplo field of C's root object.
+	if ( bli_obj_root_is_lower( c ) ) uplo = 0;
+	else                              uplo = 1;
+
+	// Index into the variant array to extract the correct function pointer.
+	l3_var_oft f = vars[uplo];
+
+	// Call the macrokernel.
+	f
+	(
+	  a,
+	  ah,
+	  c,
+	  cntx,
+	  cntl,
+	  thread_par
+	);
+}
+
diff --git a/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c.prev b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c.prev
new file mode 100644
index 000000000..aed0359ec
--- /dev/null
+++ b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c.prev
@@ -0,0 +1,507 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemmt_fp
+
+typedef void (*FUNCPTR_T)
+     (
+       doff_t  diagoffc,
+       pack_t  schema_a,
+       pack_t  schema_b,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       void*   alpha,
+       void*   a, inc_t cs_a, inc_t is_a,
+                  dim_t pd_a, inc_t ps_a,
+       void*   b, inc_t rs_b, inc_t is_b,
+                  dim_t pd_b, inc_t ps_b,
+       void*   beta,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+
+static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2);
+
+
+void bli_gemmt_l_ker_var2
+     (
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
+     )
+{
+	const num_t  dt_exec   = bli_obj_exec_dt( c );
+
+	const doff_t diagoffc  = bli_obj_diag_offset( c );
+
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
+
+	const dim_t  m         = bli_obj_length( c );
+	const dim_t  n         = bli_obj_width( c );
+	const dim_t  k         = bli_obj_width( a );
+
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const inc_t  is_a      = bli_obj_imag_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
+
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const inc_t  is_b      = bli_obj_imag_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
+
+	// Detach and multiply the scalars attached to A and B.
+	obj_t  scalar_a, scalar_b;
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	ftypes[dt_exec]
+	(
+	  diagoffc,
+	  schema_a,
+	  schema_b,
+	  m,
+	  n,
+	  k,
+	  ( void* )buf_alpha,
+	  ( void* )buf_a, cs_a, is_a,
+	                  pd_a, ps_a,
+	  ( void* )buf_b, rs_b, is_b,
+	                  pd_b, ps_b,
+	  ( void* )buf_beta,
+	           buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx,
+	  rntm,
+	  thread
+	);
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       doff_t  diagoffc, \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, inc_t is_a, \
+                  dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, inc_t is_b, \
+                  dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt         = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR         = pd_a; \
+	const dim_t     NR         = pd_b; \
+	/*const dim_t     PACKMR     = cs_a;*/ \
+	/*const dim_t     PACKNR     = rs_b;*/ \
+\
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer are set so that they match the storage of the
+	   original C matrix. For example, if C is column-stored, ct will be
+	   column-stored as well. */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict zero       = PASTEMAC(ch,0); \
+	ctype* restrict a_cast     = a; \
+	ctype* restrict b_cast     = b; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+	ctype* restrict b1; \
+	ctype* restrict c1; \
+\
+	doff_t          diagoffc_ij; \
+	dim_t           m_iter, m_left; \
+	dim_t           n_iter, n_left; \
+	dim_t           m_cur; \
+	dim_t           n_cur; \
+	dim_t           i, j, ip; \
+	inc_t           rstep_a; \
+	inc_t           cstep_b; \
+	inc_t           rstep_c, cstep_c; \
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Safeguard: If the current panel of C is entirely above the diagonal,
+	   it is not stored. So we do nothing. */ \
+	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
+\
+	/* If there is a zero region above where the diagonal of C intersects
+	   the left edge of the panel, adjust the pointer to C and A and treat
+	   this case as if the diagonal offset were zero. */ \
+	if ( diagoffc < 0 ) \
+	{ \
+		ip       = -diagoffc / MR; \
+		i        = ip * MR; \
+		m        = m - i; \
+		diagoffc = -diagoffc % MR; \
+		c_cast   = c_cast + (i  )*rs_c; \
+		a_cast   = a_cast + (ip )*ps_a; \
+	} \
+\
+	/* If there is a zero region to the right of where the diagonal
+	   of C intersects the bottom of the panel, shrink it to prevent
+	   "no-op" iterations from executing. */ \
+	if ( diagoffc + m < n ) \
+	{ \
+		n = diagoffc + m; \
+	} \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */ \
+	n_iter = n / NR; \
+	n_left = n % NR; \
+\
+	m_iter = m / MR; \
+	m_left = m % MR; \
+\
+	if ( n_left ) ++n_iter; \
+	if ( m_left ) ++m_iter; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	rstep_a = ps_a; \
+\
+	cstep_b = ps_b; \
+\
+	rstep_c = rs_c * MR; \
+	cstep_c = cs_c * NR; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_a( is_a, &aux ); \
+	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
+\
+	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	   loop around the microkernel. Here we query the thrinfo_t node for the
+	   1st (ir) loop around the microkernel. */ \
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
+\
+	/* Query the number of threads and thread ids for each loop. */ \
+	dim_t jr_nt  = bli_thread_n_way( thread ); \
+	dim_t jr_tid = bli_thread_work_id( thread ); \
+	dim_t ir_nt  = bli_thread_n_way( caucus ); \
+	dim_t ir_tid = bli_thread_work_id( caucus ); \
+\
+	dim_t jr_start, jr_end; \
+	dim_t ir_start, ir_end; \
+	dim_t jr_inc,   ir_inc; \
+\
+	/* Note that we partition the 2nd loop into two regions: the rectangular
+	   part of C, and the triangular portion. */ \
+	dim_t n_iter_rct; \
+	dim_t n_iter_tri; \
+\
+	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \
+	{ \
+		/* If the entire panel of C does not intersect the diagonal, there is
+		   no triangular region, and therefore we can skip the second set of
+		   loops. */ \
+		n_iter_rct = n_iter; \
+		n_iter_tri = 0; \
+	} \
+	else \
+	{ \
+		/* If the panel of C does intersect the diagonal, compute the number of
+		   iterations in the rectangular region by dividing NR into the diagonal
+		   offset. Any remainder from this integer division is discarded, which
+		   is what we want. That is, we want the rectangular region to contain
+		   as many columns of whole microtiles as possible without including any
+		   microtiles that intersect the diagonal. The number of iterations in
+		   the triangular (or trapezoidal) region is computed as the remaining
+		   number of iterations in the n dimension. */ \
+		n_iter_rct = diagoffc / NR; \
+		n_iter_tri = n_iter - n_iter_rct; \
+	} \
+\
+	/* Determine the thread range and increment for the 2nd and 1st loops for
+	   the initial rectangular region of C (if it exists).
+	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	   slab or round-robin partitioning was requested at configure-time. */ \
+	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+	bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		/* Interior loop over the m dimension (MR rows at a time). */ \
+		for ( i = ir_start; i < ir_end; i += ir_inc ) \
+		{ \
+			ctype* restrict a2; \
+\
+			a1  = a_cast + i * rstep_a; \
+			c11 = c1     + i * rstep_c; \
+\
+			/* No need to compute the diagonal offset for the rectangular
+			   region. */ \
+			/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
+\
+			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+			/* Compute the addresses of the next panels of A and B. */ \
+			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
+			{ \
+				a2 = a_cast; \
+				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+					b2 = b_cast; \
+			} \
+\
+			/* Save addresses of next panels of A and B to the auxinfo_t
+			   object. */ \
+			bli_auxinfo_set_next_a( a2, &aux ); \
+			bli_auxinfo_set_next_b( b2, &aux ); \
+\
+			/* If the diagonal intersects the current MR x NR submatrix, we
+			   compute it in the temporary buffer and then add in the elements
+			   on or below the diagonal.
+			   Otherwise, if the submatrix is strictly below the diagonal,
+			   we compute and store as we normally would.
+			   And if we're strictly above the diagonal, we do nothing and
+			   continue. */ \
+			{ \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
+			} \
+		} \
+	} \
+\
+	/* If there is no triangular region, then we're done. */ \
+	if ( n_iter_tri == 0 ) return; \
+\
+	/* Use round-robin assignment of micropanels to threads in the 2nd loop
+	   and the default (slab or rr) partitioning in the 1st loop for the
+	   remaining triangular region of C. */ \
+	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+\
+	/* Advance the start and end iteration offsets for the triangular region
+	   by the number of iterations used for the rectangular region. */ \
+	jr_start += n_iter_rct; \
+	jr_end   += n_iter_rct; \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		/* Interior loop over the m dimension (MR rows at a time). */ \
+		for ( i = ir_start; i < ir_end; i += ir_inc ) \
+		{ \
+			ctype* restrict a2; \
+\
+			a1  = a_cast + i * rstep_a; \
+			c11 = c1     + i * rstep_c; \
+\
+			/* Compute the diagonal offset for the submatrix at (i,j). */ \
+			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
+\
+			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+			/* Compute the addresses of the next panels of A and B. */ \
+			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
+			{ \
+				a2 = a_cast; \
+				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+					b2 = b_cast; \
+			} \
+\
+			/* Save addresses of next panels of A and B to the auxinfo_t
+			   object. */ \
+			bli_auxinfo_set_next_a( a2, &aux ); \
+			bli_auxinfo_set_next_b( b2, &aux ); \
+\
+			/* If the diagonal intersects the current MR x NR submatrix, we
+			   compute it the temporary buffer and then add in the elements
+			   on or below the diagonal.
+			   Otherwise, if the submatrix is strictly below the diagonal,
+			   we compute and store as we normally would.
+			   And if we're strictly above the diagonal, we do nothing and
+			   continue. */ \
+			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
+			{ \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  MR, \
+				  NR, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  zero, \
+				  ct, rs_ct, cs_ct, \
+				  &aux, \
+				  cntx  \
+				); \
+\
+				/* Scale C and add the result to only the stored part. */ \
+				PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
+				                          m_cur, n_cur, \
+				                          ct,  rs_ct, cs_ct, \
+				                          beta_cast, \
+				                          c11, rs_c,  cs_c ); \
+			} \
+			else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
+			{ \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
+			} \
+		} \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 )
+
diff --git a/frame/3/gemmt/other/bli_gemmt_l_ker_var2b.c.before b/frame/3/gemmt/other/bli_gemmt_l_ker_var2b.c.before
new file mode 100644
index 000000000..4285bd135
--- /dev/null
+++ b/frame/3/gemmt/other/bli_gemmt_l_ker_var2b.c.before
@@ -0,0 +1,427 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemmt_fp
+
+typedef void (*FUNCPTR_T) // signature of the typed kernels generated by GENTFUNC below
+     (
+       doff_t  diagoffc, // diagonal offset of C
+       pack_t  schema_a, // pack schema of A
+       pack_t  schema_b, // pack schema of B
+       dim_t   m,        // length of C
+       dim_t   n,        // width of C
+       dim_t   k,        // width of A
+       void*   alpha,    // product of the scalars attached to A and B
+       void*   a, inc_t cs_a, inc_t is_a,  // A buffer, column and imaginary strides
+                  dim_t pd_a, inc_t ps_a,  // A panel dimension and panel stride
+       void*   b, inc_t rs_b, inc_t is_b,  // B buffer, row and imaginary strides
+                  dim_t pd_b, inc_t ps_b,  // B panel dimension and panel stride
+       void*   beta,     // scalar attached to C
+       void*   c, inc_t rs_c, inc_t cs_c,  // C buffer, row and column strides
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+
+static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2b); // looked up via ftypes[dt_exec] in the wrapper below
+
+
+void bli_gemmt_l_ker_var2b // object-based wrapper: unpacks a, b, c and dispatches on c's execution datatype
+     (
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl, // not referenced in this function
+             thrinfo_t* thread
+     )
+{
+	const num_t  dt_exec   = bli_obj_exec_dt( c );
+
+	const doff_t diagoffc  = bli_obj_diag_offset( c );
+
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
+
+	const dim_t  m         = bli_obj_length( c );
+	const dim_t  n         = bli_obj_width( c );
+	const dim_t  k         = bli_obj_width( a );
+
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const inc_t  is_a      = bli_obj_imag_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
+
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const inc_t  is_b      = bli_obj_imag_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
+
+	// Detach the scalars attached to A and B and merge them (the product is read from scalar_b below).
+	obj_t  scalar_a, scalar_b;
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	ftypes[dt_exec]
+	(
+	  diagoffc,
+	  schema_a,
+	  schema_b,
+	  m,
+	  n,
+	  k,
+	  ( void* )buf_alpha,
+	  ( void* )buf_a, cs_a, is_a,
+	                  pd_a, ps_a,
+	  ( void* )buf_b, rs_b, is_b,
+	                  pd_b, ps_b,
+	  ( void* )buf_beta,
+	           buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx, // const is cast away to match the FUNCPTR_T signature
+	  rntm,
+	  thread
+	);
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       doff_t  diagoffc, \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, inc_t is_a, \
+                  dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, inc_t is_b, \
+                  dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt         = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR         = pd_a; \
+	const dim_t     NR         = pd_b; \
+	/*const dim_t     PACKMR     = cs_a;*/ \
+	/*const dim_t     PACKNR     = rs_b;*/ \
+\
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for microtiles that intersect the diagonal. Its
+	   strides are chosen from col_pref, as queried from the context below:
+	   if col_pref is true, ct is column-stored (rs_ct = 1, cs_ct = MR);
+	   otherwise it is row-stored (rs_ct = NR, cs_ct = 1). */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict zero       = PASTEMAC(ch,0); \
+	ctype* restrict a_cast     = a; \
+	ctype* restrict b_cast     = b; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+\
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Safeguard: If the current panel of C is entirely above the diagonal,
+	   it is not stored. So we do nothing. */ \
+	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
+\
+	/* If there is a zero region above where the diagonal of C intersects
+	   the left edge of the panel, adjust the pointer to C and A and treat
+	   this case as if the diagonal offset were zero.
+	   NOTE: It's possible that after this pruning that the diagonal offset
+	   is still negative (though its absolute value is guaranteed to be less
+	   than MR). */ \
+	if ( diagoffc < 0 ) \
+	{ \
+		const dim_t ip = -diagoffc / MR; \
+		const dim_t i  = ip * MR; \
+\
+		m        = m - i; \
+		diagoffc = diagoffc % MR; \
+		c_cast   = c_cast + (i  )*rs_c; \
+		a_cast   = a_cast + (ip )*ps_a; \
+	} \
+\
+	/* If there is a zero region to the right of where the diagonal
+	   of C intersects the bottom of the panel, shrink it to prevent
+	   "no-op" iterations from executing. */ \
+	if ( diagoffc + m < n ) \
+	{ \
+		n = diagoffc + m; \
+	} \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */ \
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); \
+	const dim_t n_left = n % NR; \
+\
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); \
+	const dim_t m_left = m % MR; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	const inc_t rstep_a = ps_a; \
+\
+	const inc_t cstep_b = ps_b; \
+\
+	const inc_t rstep_c = rs_c * MR; \
+	const inc_t cstep_c = cs_c * NR; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_a( is_a, &aux ); \
+	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	/* Save the virtual microkernel address and the params. */ \
+	/*bli_auxinfo_set_ukr( gemm_ukr, &aux );*/ \
+	/*bli_auxinfo_set_params( params, &aux );*/ \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
+\
+	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	   loop around the microkernel. Here we query the thrinfo_t node for the
+	   1st (ir) loop around the microkernel. */ \
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
+\
+	/* Query the number of threads and thread ids for each loop. */ \
+	const dim_t jr_nt  = bli_thread_n_way( thread ); \
+	const dim_t jr_tid = bli_thread_work_id( thread ); \
+	const dim_t ir_nt  = bli_thread_n_way( caucus ); \
+	const dim_t ir_tid = bli_thread_work_id( caucus ); \
+\
+	dim_t jr_start, jr_end; \
+	dim_t ir_start, ir_end; \
+	dim_t jr_inc,   ir_inc; \
+\
+	/* Determine the thread range and increment for the 2nd and 1st loops.
+	   The 2nd (jr) loop uses bli_thread_range_weighted_jr(); the 1st (ir) loop
+	   uses bli_thread_range_jrir() (slab or round-robin, set at configure-time). */ \
+/*
+*/ \
+	bli_thread_range_weighted_jr( thread, diagoffc, BLIS_LOWER, m, n, NR, \
+	                                          FALSE, &jr_start, &jr_end, &jr_inc ); \
+	/*bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );*/ \
+/*
+*/ \
+	bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
+/*
+	dim_t jr_st, ir_st; \
+	const dim_t n_ut_for_me \
+	= \
+	bli_thread_range_tlb( thread, diagoffc, BLIS_LOWER, m, n, MR, NR, \
+	                      &jr_st, &ir_st ); \
+*/ \
+\
+/*
+printf( "bli_gemmt_l_ker_var2b():      tid %d: m n = %d %d  st en in = %3d %3d %3d do %d\n", (int)jr_tid, (int)m, (int)n, (int)jr_start, (int)jr_end, (int)jr_inc, (int)diagoffc ); \
+*/ \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict b1 = b_cast + j * cstep_b; \
+		ctype* restrict c1 = c_cast + j * cstep_c; \
+\
+		/* Compute the diagonal offset for the column of microtiles at (0,j). */ \
+		const doff_t diagoffc_j = diagoffc - (doff_t)j*NR; \
+		const dim_t  n_cur      = ( bli_is_not_edge_f( j, n_iter, n_left ) \
+		                            ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		ctype* restrict b2 = b1; \
+\
+		/* Interior loop over the m dimension (MR rows at a time). */ \
+		for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) \
+		{ \
+			/* Compute the diagonal offset for the microtile at (i,j). */ \
+			const doff_t diagoffc_ij = diagoffc_j + (doff_t)i*MR; \
+			const dim_t  m_cur       = ( bli_is_not_edge_f( i, m_iter, m_left ) \
+			                             ? MR : m_left ); \
+\
+			/* If the diagonal intersects the current MR x NR microtile, we
+			   compute it in the temporary buffer and then add in the elements
+			   on or below the diagonal.
+			   Otherwise, if the microtile is strictly below the diagonal,
+			   we compute and store as we normally would.
+			   And if we're strictly above the diagonal, we do nothing and
+			   continue on through the IR loop to consider the next MR x NR
+			   microtile. */ \
+			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
+			{ \
+				ctype* restrict a1  = a_cast + i * rstep_a; \
+				ctype* restrict c11 = c1     + i * rstep_c; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				ctype* restrict a2 \
+				= bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+				if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
+				{ \
+					a2 = bli_gemmt_l_wrap_a_upanel( a_cast, rstep_a, \
+					                                diagoffc_j, MR, NR ); \
+					b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+						b2 = b_cast; \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  MR, \
+				  NR, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  zero, \
+				  ct, rs_ct, cs_ct, \
+				  &aux, \
+				  cntx  \
+				); \
+\
+				/* Scale C and add the result to only the stored part. */ \
+				PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
+				                          m_cur, n_cur, \
+				                          ct,  rs_ct, cs_ct, \
+				                          beta_cast, \
+				                          c11, rs_c,  cs_c ); \
+			} \
+			else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
+			{ \
+				ctype* restrict a1  = a_cast + i * rstep_a; \
+				ctype* restrict c11 = c1     + i * rstep_c; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				ctype* restrict a2 \
+				= bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+				if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
+				{ \
+					a2 = bli_gemmt_l_wrap_a_upanel( a_cast, rstep_a, \
+					                                diagoffc_j, MR, NR ); \
+					b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+						b2 = b_cast; \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
+			} \
+		} \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2b )
+
diff --git a/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c.prev b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c.prev
new file mode 100644
index 000000000..87d77ee55
--- /dev/null
+++ b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c.prev
@@ -0,0 +1,510 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemmt_fp
+
+typedef void (*FUNCPTR_T) // signature of the typed kernels generated by GENTFUNC below
+     (
+       doff_t  diagoffc, // diagonal offset of C
+       pack_t  schema_a, // pack schema of A
+       pack_t  schema_b, // pack schema of B
+       dim_t   m,        // length of C
+       dim_t   n,        // width of C
+       dim_t   k,        // width of A
+       void*   alpha,    // product of the scalars attached to A and B
+       void*   a, inc_t cs_a, inc_t is_a,  // A buffer, column and imaginary strides
+                  dim_t pd_a, inc_t ps_a,  // A panel dimension and panel stride
+       void*   b, inc_t rs_b, inc_t is_b,  // B buffer, row and imaginary strides
+                  dim_t pd_b, inc_t ps_b,  // B panel dimension and panel stride
+       void*   beta,     // scalar attached to C
+       void*   c, inc_t rs_c, inc_t cs_c,  // C buffer, row and column strides
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+
+static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2); // looked up via ftypes[dt_exec] in the wrapper below
+
+
+void bli_gemmt_u_ker_var2 // object-based wrapper: unpacks a, b, c and dispatches on c's execution datatype
+     (
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl, // not referenced in this function
+             thrinfo_t* thread
+     )
+{
+	const num_t  dt_exec   = bli_obj_exec_dt( c );
+
+	const doff_t diagoffc  = bli_obj_diag_offset( c );
+
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
+
+	const dim_t  m         = bli_obj_length( c );
+	const dim_t  n         = bli_obj_width( c );
+	const dim_t  k         = bli_obj_width( a );
+
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const inc_t  is_a      = bli_obj_imag_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
+
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const inc_t  is_b      = bli_obj_imag_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
+
+	// Detach the scalars attached to A and B and merge them (the product is read from scalar_b below).
+	obj_t  scalar_a, scalar_b;
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	ftypes[dt_exec]
+	(
+	  diagoffc,
+	  schema_a,
+	  schema_b,
+	  m,
+	  n,
+	  k,
+	  ( void* )buf_alpha,
+	  ( void* )buf_a, cs_a, is_a,
+	                  pd_a, ps_a,
+	  ( void* )buf_b, rs_b, is_b,
+	                  pd_b, ps_b,
+	  ( void* )buf_beta,
+	           buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx, // const is cast away to match the FUNCPTR_T signature
+	  rntm,
+	  thread
+	);
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       doff_t  diagoffc, \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, inc_t is_a, \
+                  dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, inc_t is_b, \
+                  dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt         = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR         = pd_a; \
+	const dim_t     NR         = pd_b; \
+	/*const dim_t     PACKMR     = cs_a;*/ \
+	/*const dim_t     PACKNR     = rs_b;*/ \
+\
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for microtiles that intersect the diagonal. Its
+	   strides are chosen from col_pref, as queried from the context below:
+	   if col_pref is true, ct is column-stored (rs_ct = 1, cs_ct = MR);
+	   otherwise it is row-stored (rs_ct = NR, cs_ct = 1). */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict zero       = PASTEMAC(ch,0); \
+	ctype* restrict a_cast     = a; \
+	ctype* restrict b_cast     = b; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+	ctype* restrict b1; \
+	ctype* restrict c1; \
+\
+	doff_t          diagoffc_ij; \
+	dim_t           m_iter, m_left; \
+	dim_t           n_iter, n_left; \
+	dim_t           m_cur; \
+	dim_t           n_cur; \
+	dim_t           i, j, jp; \
+	inc_t           rstep_a; \
+	inc_t           cstep_b; \
+	inc_t           rstep_c, cstep_c; \
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Safeguard: If the current panel of C is entirely below the diagonal,
+	   it is not stored. So we do nothing. */ \
+	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
+\
+	/* If there is a zero region to the left of where the diagonal of C
+	   intersects the top edge of the panel, adjust the pointer to C and B
+	   and treat this case as if the diagonal offset were zero.
+	   NOTE: It's possible that after this pruning that the diagonal offset
+	   is still positive (though it is guaranteed to be less than NR). */ \
+	if ( diagoffc > 0 ) \
+	{ \
+		jp       = diagoffc / NR; \
+		j        = jp * NR; \
+		n        = n - j; \
+		diagoffc = diagoffc % NR; \
+		c_cast   = c_cast + (j  )*cs_c; \
+		b_cast   = b_cast + (jp )*ps_b; \
+	} \
+\
+	/* If there is a zero region below where the diagonal of C intersects
+	   the right edge of the panel, shrink it to prevent "no-op" iterations
+	   from executing. */ \
+	if ( -diagoffc + n < m ) \
+	{ \
+		m = -diagoffc + n; \
+	} \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */ \
+	n_iter = n / NR; \
+	n_left = n % NR; \
+\
+	m_iter = m / MR; \
+	m_left = m % MR; \
+\
+	if ( n_left ) ++n_iter; \
+	if ( m_left ) ++m_iter; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	rstep_a = ps_a; \
+\
+	cstep_b = ps_b; \
+\
+	rstep_c = rs_c * MR; \
+	cstep_c = cs_c * NR; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_a( is_a, &aux ); \
+	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
+\
+	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	   loop around the microkernel. Here we query the thrinfo_t node for the
+	   1st (ir) loop around the microkernel. */ \
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
+\
+	/* Query the number of threads and thread ids for each loop. */ \
+	dim_t jr_nt  = bli_thread_n_way( thread ); \
+	dim_t jr_tid = bli_thread_work_id( thread ); \
+	dim_t ir_nt  = bli_thread_n_way( caucus ); \
+	dim_t ir_tid = bli_thread_work_id( caucus ); \
+\
+	dim_t jr_start, jr_end; \
+	dim_t ir_start, ir_end; \
+	dim_t jr_inc,   ir_inc; \
+\
+	/* Note that we partition the 2nd loop into two regions: the triangular
+	   part of C, and the rectangular portion. */ \
+	dim_t n_iter_tri; \
+	dim_t n_iter_rct; \
+\
+	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \
+	{ \
+		/* If the entire panel of C does not intersect the diagonal, there is
+		   no triangular region, and therefore we can skip the first set of
+		   loops. */ \
+		n_iter_tri = 0; \
+		n_iter_rct = n_iter; \
+	} \
+	else \
+	{ \
+		/* If the panel of C does intersect the diagonal, compute the number of
+		   iterations in the triangular (or trapezoidal) region by dividing NR
+		   into the number of rows in C. A non-zero remainder means we need to
+		   add one additional iteration. That is, we want the triangular region
+		   to contain as few columns of whole microtiles as possible while still
+		   including all microtiles that intersect the diagonal. The number of
+		   iterations in the rectangular region is computed as the remaining
+		   number of iterations in the n dimension. */ \
+		n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \
+		n_iter_rct = n_iter - n_iter_tri; \
+	} \
+\
+	/* Use round-robin assignment of micropanels to threads in the 2nd loop
+	   and the default (slab or rr) partitioning in the 1st loop for the
+	   initial triangular region of C (if it exists). */ \
+	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+	bli_thread_range_jrir   ( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		/* Interior loop over the m dimension (MR rows at a time). */ \
+		for ( i = ir_start; i < ir_end; i += ir_inc ) \
+		{ \
+			ctype* restrict a2; \
+\
+			a1  = a_cast + i * rstep_a; \
+			c11 = c1     + i * rstep_c; \
+\
+			/* Compute the diagonal offset for the submatrix at (i,j). */ \
+			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
+\
+			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+			/* Compute the addresses of the next panels of A and B. */ \
+			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
+			{ \
+				a2 = a_cast; \
+				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+					b2 = b_cast; \
+			} \
+\
+			/* Save addresses of next panels of A and B to the auxinfo_t
+			   object. */ \
+			bli_auxinfo_set_next_a( a2, &aux ); \
+			bli_auxinfo_set_next_b( b2, &aux ); \
+\
+			/* If the diagonal intersects the current MR x NR submatrix, we
+			   compute it in the temporary buffer and then add in the elements
+			   on or above the diagonal.
+			   Otherwise, if the submatrix is strictly above the diagonal,
+			   we compute and store as we normally would.
+			   And if we're strictly below the diagonal, we do nothing and
+			   continue. */ \
+			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
+			{ \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  MR, \
+				  NR, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  zero, \
+				  ct, rs_ct, cs_ct, \
+				  &aux, \
+				  cntx  \
+				); \
+\
+				/* Scale C and add the result to only the stored part. */ \
+				PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
+				                          m_cur, n_cur, \
+				                          ct,  rs_ct, cs_ct, \
+				                          beta_cast, \
+				                          c11, rs_c,  cs_c ); \
+			} \
+			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
+			{ \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
+			} \
+		} \
+	} \
+\
+	/* If there is no rectangular region, then we're done. */ \
+	if ( n_iter_rct == 0 ) return; \
+\
+	/* Determine the thread range and increment for the 2nd loop of the
+	   remaining rectangular region of C (and also use default partitioning
+	   for the 1st loop).
+	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	   slab or round-robin partitioning was requested at configure-time. */ \
+	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+\
+	/* Advance the start and end iteration offsets for the rectangular region
+	   by the number of iterations used for the triangular region. */ \
+	jr_start += n_iter_tri; \
+	jr_end   += n_iter_tri; \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		/* Interior loop over the m dimension (MR rows at a time). */ \
+		for ( i = ir_start; i < ir_end; i += ir_inc ) \
+		{ \
+			ctype* restrict a2; \
+\
+			a1  = a_cast + i * rstep_a; \
+			c11 = c1     + i * rstep_c; \
+\
+			/* No need to compute the diagonal offset for the rectangular
+			   region. */ \
+			/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
+\
+			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+			/* Compute the addresses of the next panels of A and B. */ \
+			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
+			{ \
+				a2 = a_cast; \
+				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+					b2 = b_cast; \
+			} \
+\
+			/* Save addresses of next panels of A and B to the auxinfo_t
+			   object. */ \
+			bli_auxinfo_set_next_a( a2, &aux ); \
+			bli_auxinfo_set_next_b( b2, &aux ); \
+\
+			/* NOTE: No diagonal test is performed in this loop (diagoffc_ij
+			   is not even computed). Every microtile visited here is
+			   passed directly to the microkernel with its m_cur x n_cur
+			   dimensions and is accumulated into C using beta.
+			   Presumably this is safe because n_iter_tri (computed above)
+			   places every column of microtiles that intersects the
+			   diagonal in the triangular region -- TODO confirm. */ \
+			{ \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
+			} \
+		} \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 )
+
diff --git a/frame/3/gemmt/other/bli_gemmt_u_ker_var2b.c.before b/frame/3/gemmt/other/bli_gemmt_u_ker_var2b.c.before
new file mode 100644
index 000000000..dbf8f389f
--- /dev/null
+++ b/frame/3/gemmt/other/bli_gemmt_u_ker_var2b.c.before
@@ -0,0 +1,415 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemmt_fp
+
+typedef void (*FUNCPTR_T)
+     (
+       doff_t  diagoffc,
+       pack_t  schema_a,
+       pack_t  schema_b,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       void*   alpha,
+       void*   a, inc_t cs_a, inc_t is_a,
+                  dim_t pd_a, inc_t ps_a,
+       void*   b, inc_t rs_b, inc_t is_b,
+                  dim_t pd_b, inc_t ps_b,
+       void*   beta,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+
+static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2b);
+
+
+void bli_gemmt_u_ker_var2b
+     (
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  c,
+       const cntx_t* cntx,
+             rntm_t* rntm,
+             cntl_t* cntl,
+             thrinfo_t* thread
+     )
+{
+	const num_t  dt_exec   = bli_obj_exec_dt( c );
+
+	const doff_t diagoffc  = bli_obj_diag_offset( c );
+
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
+
+	const dim_t  m         = bli_obj_length( c );
+	const dim_t  n         = bli_obj_width( c );
+	const dim_t  k         = bli_obj_width( a );
+
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const inc_t  is_a      = bli_obj_imag_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
+
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const inc_t  is_b      = bli_obj_imag_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
+
+	// Detach and multiply the scalars attached to A and B.
+	obj_t  scalar_a, scalar_b;
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	ftypes[dt_exec]
+	(
+	  diagoffc,
+	  schema_a,
+	  schema_b,
+	  m,
+	  n,
+	  k,
+	  ( void* )buf_alpha,
+	  ( void* )buf_a, cs_a, is_a,
+	                  pd_a, ps_a,
+	  ( void* )buf_b, rs_b, is_b,
+	                  pd_b, ps_b,
+	  ( void* )buf_beta,
+	           buf_c, rs_c, cs_c,
+	  ( cntx_t* )cntx,
+	  rntm,
+	  thread
+	);
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       doff_t  diagoffc, \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, inc_t is_a, \
+                  dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, inc_t is_b, \
+                  dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt         = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR         = pd_a; \
+	const dim_t     NR         = pd_b; \
+	/*const dim_t     PACKMR     = cs_a;*/ \
+	/*const dim_t     PACKNR     = rs_b;*/ \
+\
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for diagonal-intersecting microtiles. Note that the
+	   strides of this temporary buffer are set according to the storage
+	   preference of the gemm microkernel. For example, if the microkernel
+	   prefers columns, ct will be column-stored (rs_ct = 1, cs_ct = MR). */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool      col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict zero       = PASTEMAC(ch,0); \
+	ctype* restrict a_cast     = a; \
+	ctype* restrict b_cast     = b; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+\
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Safeguard: If the current panel of C is entirely below the diagonal,
+	   it is not stored. So we do nothing. */ \
+	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
+\
+	/* If there is a zero region to the left of where the diagonal of C
+	   intersects the top edge of the panel, adjust the pointer to C and B
+	   and treat this case as if the diagonal offset were zero.
+	   NOTE: It's possible that after this pruning the diagonal offset is
+	   still positive (though it is guaranteed to be less than NR). */ \
+	if ( diagoffc > 0 ) \
+	{ \
+		const dim_t jp = diagoffc / NR; \
+		const dim_t j  = jp * NR; \
+\
+		n        = n - j; \
+		diagoffc = diagoffc % NR; \
+		c_cast   = c_cast + (j  )*cs_c; \
+		b_cast   = b_cast + (jp )*ps_b; \
+	} \
+\
+	/* If there is a zero region below where the diagonal of C intersects
+	   the right edge of the panel, shrink it to prevent "no-op" iterations
+	   from executing. */ \
+	if ( -diagoffc + n < m ) \
+	{ \
+		m = -diagoffc + n; \
+	} \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */ \
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); \
+	const dim_t n_left = n % NR; \
+\
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); \
+	const dim_t m_left = m % MR; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	const inc_t rstep_a = ps_a; \
+\
+	const inc_t cstep_b = ps_b; \
+\
+	const inc_t rstep_c = rs_c * MR; \
+	const inc_t cstep_c = cs_c * NR; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_a( is_a, &aux ); \
+	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	/* Save the virtual microkernel address and the params. */ \
+	/*bli_auxinfo_set_ukr( gemm_ukr, &aux );*/ \
+	/*bli_auxinfo_set_params( params, &aux );*/ \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
+\
+	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	   loop around the microkernel. Here we query the thrinfo_t node for the
+	   1st (ir) loop around the microkernel. */ \
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
+\
+	/* Query the number of threads and thread ids for each loop. */ \
+	dim_t jr_nt  = bli_thread_n_way( thread ); \
+	dim_t jr_tid = bli_thread_work_id( thread ); \
+	dim_t ir_nt  = bli_thread_n_way( caucus ); \
+	dim_t ir_tid = bli_thread_work_id( caucus ); \
+\
+	dim_t jr_start, jr_end; \
+	dim_t ir_start, ir_end; \
+	dim_t jr_inc,   ir_inc; \
+\
+	/* Determine the thread range and increment for the 2nd and 1st loops.
+	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	   slab or round-robin partitioning was requested at configure-time. */ \
+	bli_thread_range_weighted_jr( thread, diagoffc, BLIS_UPPER, m, n, NR, \
+	                                          FALSE, &jr_start, &jr_end, &jr_inc ); \
+	/*bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );*/ \
+	bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
+\
+/*
+printf( "bli_gemmt_u_ker_var2b():      tid %d: m n = %d %d  st en in = %3d %3d %3d do %d\n", (int)jr_tid, (int)m, (int)n, (int)jr_start, (int)jr_end, (int)jr_inc, (int)diagoffc ); \
+*/ \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict b1 = b_cast + j * cstep_b; \
+		ctype* restrict c1 = c_cast + j * cstep_c; \
+\
+		/* Compute the diagonal offset for the column of microtiles at (0,j). */ \
+		const doff_t diagoffc_j = diagoffc - (doff_t)j*NR; \
+		const dim_t  n_cur      = ( bli_is_not_edge_f( j, n_iter, n_left ) \
+		                            ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		ctype* restrict b2 = b1; \
+\
+		/* Interior loop over the m dimension (MR rows at a time). */ \
+		for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) \
+		{ \
+			/* Compute the diagonal offset for the microtile at (i,j). */ \
+			const doff_t diagoffc_ij = diagoffc_j + (doff_t)i*MR; \
+			const dim_t  m_cur       = ( bli_is_not_edge_f( i, m_iter, m_left ) \
+			                             ? MR : m_left ); \
+\
+			/* If the diagonal intersects the current MR x NR submatrix, we
+			   compute it in the temporary buffer and then add in the elements
+			   on or above the diagonal.
+			   Otherwise, if the submatrix is strictly above the diagonal,
+			   we compute and store as we normally would.
+			   And if we're strictly below the diagonal, we do nothing and
+			   continue on through the IR loop to consider the next MR x NR
+			   microtile. */ \
+			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
+			{ \
+				ctype* restrict a1  = a_cast + i * rstep_a; \
+				ctype* restrict c11 = c1     + i * rstep_c; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				ctype* restrict a2 \
+				= bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+				if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
+				{ \
+					a2 = bli_gemmt_u_wrap_a_upanel( a_cast, rstep_a, \
+					                                diagoffc_j, MR, NR ); \
+					b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+						b2 = b_cast; \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  MR, \
+				  NR, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  zero, \
+				  ct, rs_ct, cs_ct, \
+				  &aux, \
+				  cntx  \
+				); \
+\
+				/* Scale C and add the result to only the stored part. */ \
+				PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
+				                          m_cur, n_cur, \
+				                          ct,  rs_ct, cs_ct, \
+				                          beta_cast, \
+				                          c11, rs_c,  cs_c ); \
+			} \
+			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
+			{ \
+				ctype* restrict a1  = a_cast + i * rstep_a; \
+				ctype* restrict c11 = c1     + i * rstep_c; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				ctype* restrict a2 \
+				= bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+				if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
+				{ \
+					a2 = bli_gemmt_u_wrap_a_upanel( a_cast, rstep_a, \
+					                                diagoffc_j, MR, NR ); \
+					b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+						b2 = b_cast; \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  m_cur, \
+				  n_cur, \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
+			} \
+		} \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2b )
+
diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c
index 3bc4e3c6b..0c5cde72c 100644
--- a/frame/3/trmm/bli_trmm_ll_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c
@@ -37,11 +37,11 @@
 
 void bli_trmm_ll_ker_var2
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
              thrinfo_t* thread_par
      )
 {
@@ -83,10 +83,10 @@ void bli_trmm_ll_ker_var2
 	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
 	// Alias some constants to simpler names.
-	const dim_t     MR         = pd_a;
-	const dim_t     NR         = pd_b;
-	const dim_t     PACKMR     = cs_a;
-	const dim_t     PACKNR     = rs_b;
+	const dim_t MR         = pd_a;
+	const dim_t NR         = pd_b;
+	const dim_t PACKMR     = cs_a;
+	const dim_t PACKNR     = rs_b;
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
@@ -140,50 +140,45 @@ void bli_trmm_ll_ker_var2
 
 	// Compute number of primary and leftover components of the m and n
 	// dimensions.
-	dim_t n_iter = n / NR;
-	dim_t n_left = n % NR;
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+	const dim_t n_left = n % NR;
 
-	dim_t m_iter = m / MR;
-	dim_t m_left = m % MR;
-
-	if ( n_left ) ++n_iter;
-	if ( m_left ) ++m_iter;
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_size;
 
-	inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_size;
 
-	inc_t rstep_c = rs_c * MR * dt_size;
-	inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_size;
+	const inc_t cstep_c = cs_c * NR * dt_size;
 
-	// Save the pack schemas of A and B to the auxinfo_t object.
 	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel. Here we query the thrinfo_t node for the
 	// 1st (ir) loop around the microkernel.
-	//thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
 
 	// Query the number of threads and thread ids for each loop.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	dim_t jr_nt  = bli_thrinfo_n_way( thread );
-	dim_t jr_tid = bli_thrinfo_work_id( thread );
-	//dim_t ir_nt  = bli_thrinfo_n_way( ir_thread );
-	//dim_t ir_tid = bli_thrinfo_work_id( ir_thread );
+	//const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	//const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	//const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	//const dim_t ir_tid = bli_thrinfo_work_id( caucus );
 
-	dim_t jr_start, jr_end;
-	//dim_t ir_start, ir_end;
-	dim_t jr_inc;
+	dim_t jr_start, jr_end, jr_inc;
 
 	// Determine the thread range and increment for the 2nd loop.
-	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// NOTE: The definition of bli_thread_range_slrr() will depend on whether
 	// slab or round-robin partitioning was requested at configure-time.
 	// NOTE: Parallelism in the 1st loop is disabled for now.
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
-	//bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
+	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
 
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
@@ -191,20 +186,24 @@ void bli_trmm_ll_ker_var2
 		const char* b1 = b_cast + j * cstep_b;
 		      char* c1 = c_cast + j * cstep_c;
 
-		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
 
 		// Initialize our next panel of B to be the current panel of B.
 		const char* b2 = b1;
 
+		// Initialize pointers for stepping through the block of A and current
+		// column of microtiles of C.
 		const char* a1  = a_cast;
 		      char* c11 = c1;
 
 		// Loop over the m dimension (MR rows at a time).
 		for ( dim_t i = 0; i < m_iter; ++i )
 		{
-			doff_t diagoffa_i = diagoffa + ( doff_t )i*MR;
+			const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR;
 
-			dim_t  m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+			const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+			                      ? MR : m_left );
 
 			// If the current panel of A intersects the diagonal, scale C
 			// by beta. If it is strictly below the diagonal, scale by one.
@@ -215,8 +214,8 @@ void bli_trmm_ll_ker_var2
 				// Determine the offset to and length of the panel that was
 				// packed so we can index into the corresponding location in
 				// b1.
-				dim_t off_a1011 = 0;
-				dim_t k_a1011   = bli_min( diagoffa_i + MR, k );
+				const dim_t off_a1011 = 0;
+				const dim_t k_a1011   = bli_min( diagoffa_i + MR, k );
 
 				// Compute the panel stride for the current diagonal-
 				// intersecting micro-panel.
@@ -230,13 +229,13 @@ void bli_trmm_ll_ker_var2
 				const char* b1_i = b1 + off_a1011 * PACKNR * dt_size;
 
 				// Compute the addresses of the next panels of A and B.
-				const char* a2 = a1;
-				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 );
+				if ( bli_is_last_iter_slrr( i, m_iter, 0, 1 ) )
 				{
 					a2 = a_cast;
-					b2 = b1;
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
-						b2 = b_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc );
+					//if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) )
+					//	b2 = b_cast;
 				}
 
 				// Save addresses of next panels of A and B to the auxinfo_t
@@ -268,13 +267,13 @@ void bli_trmm_ll_ker_var2
 				//if ( bli_trmm_my_iter( i, ir_thread ) ) {
 
 				// Compute the addresses of the next panels of A and B.
-				const char* a2 = a1;
-				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 );
+				if ( bli_is_last_iter_slrr( i, m_iter, 0, 1 ) )
 				{
 					a2 = a_cast;
-					b2 = b1;
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
-						b2 = b_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc );
+					//if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) )
+					//	b2 = b_cast;
 				}
 
 				// Save addresses of next panels of A and B to the auxinfo_t
@@ -306,6 +305,6 @@ void bli_trmm_ll_ker_var2
 	}
 }
 
-//PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );
-//PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );
+//PASTEMAC(ch,printm)( "trmm_ll_ker_var2: a1", MR, k_a1011, a1,   1, MR, "%4.1f", "" );
+//PASTEMAC(ch,printm)( "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );
 
diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2b.c b/frame/3/trmm/bli_trmm_ll_ker_var2b.c
new file mode 100644
index 000000000..bb6de00f5
--- /dev/null
+++ b/frame/3/trmm/bli_trmm_ll_ker_var2b.c
@@ -0,0 +1,365 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_trmm_ll_ker_var2b
+     (
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
+     )
+{
+	const num_t     dt        = bli_obj_exec_dt( c );
+	const dim_t     dt_size   = bli_dt_size( dt );
+
+	      doff_t    diagoffa  = bli_obj_diag_offset( a );
+
+	const pack_t    schema_a  = bli_obj_pack_schema( a );
+	const pack_t    schema_b  = bli_obj_pack_schema( b );
+
+	      dim_t     m         = bli_obj_length( c );
+	      dim_t     n         = bli_obj_width( c );
+	      dim_t     k         = bli_obj_width( a );
+
+	const void*     buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t     cs_a      = bli_obj_col_stride( a );
+	const dim_t     pd_a      = bli_obj_panel_dim( a );
+	const inc_t     ps_a      = bli_obj_panel_stride( a );
+
+	const void*     buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t     rs_b      = bli_obj_row_stride( b );
+	const dim_t     pd_b      = bli_obj_panel_dim( b );
+	const inc_t     ps_b      = bli_obj_panel_stride( b );
+
+	      void*     buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t     rs_c      = bli_obj_row_stride( c );
+	const inc_t     cs_c      = bli_obj_col_stride( c );
+
+	// Detach and multiply the scalars attached to A and B.
+	obj_t scalar_a, scalar_b;
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Alias some constants to simpler names.
+	const dim_t MR         = pd_a;
+	const dim_t NR         = pd_b;
+	const dim_t PACKMR     = cs_a;
+	const dim_t PACKNR     = rs_b;
+
+	// Query the context for the micro-kernel address and cast it to its
+	// function pointer type.
+	gemm_ukr_vft gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+
+	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const char* a_cast     = buf_a;
+	const char* b_cast     = buf_b;
+	      char* c_cast     = buf_c;
+	const char* alpha_cast = buf_alpha;
+	const char* beta_cast  = buf_beta;
+
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/
+
+	// Safety trap: Certain indexing within this macro-kernel does not work
+	// as intended if PACKMR and NR are both odd, or PACKNR and MR are both odd.
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) ||
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort();
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If the current block of A is entirely above the diagonal,
+	// it is implicitly zero. So we do nothing.
+	if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return;
+
+	// If there is a zero region above where the diagonal of A intersects the
+	// left edge of the block, adjust the pointer to C and treat this case as
+	// if the diagonal offset were zero. This skips over the region that was
+	// not packed. (Note we assume the diagonal offset is a multiple of MR;
+	// this assumption will hold as long as the cache blocksizes KC and MC are
+	// each a multiple of MR.)
+	if ( diagoffa < 0 )
+	{
+		m        += diagoffa;
+		c_cast   -= diagoffa * rs_c * dt_size;
+		diagoffa  = 0;
+	}
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+	const dim_t n_left = n % NR;
+
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
+
+	// Computing the number of MR x MR tiles in the k dimension is needed
+	// when computing the thread ranges below.
+	const dim_t k_iter = k / MR + ( k % MR ? 1 : 0 );
+
+	// Determine some increments used to step through A, B, and C.
+	const inc_t rstep_a = ps_a * dt_size;
+
+	const inc_t cstep_b = ps_b * dt_size;
+
+	const inc_t rstep_c = rs_c * MR * dt_size;
+	const inc_t cstep_c = cs_c * NR * dt_size;
+
+	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	// Query the sub-node of the 'thread_par' argument. The resulting 'thread'
+	// node is the one used below for the 2nd (jr) loop around the microkernel;
+	// no separate thrinfo_t node is queried for the 1st (ir) loop.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+
+	// Query the number of threads and thread ids for the JR loop.
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	//const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	//const dim_t ir_tid = bli_thrinfo_work_id( caucus );
+
+	dim_t jr_st, ir_st;
+	const dim_t n_ut_for_me
+	=
+	bli_thread_range_tlb_trmm_ll( jr_nt, jr_tid, diagoffa, m_iter, n_iter, k_iter,
+	                              MR, NR, &jr_st, &ir_st );
+
+#if 0
+	printf( "tid: %ld  m,n,k_iter: %ld %ld %ld\n", tid, m_iter, n_iter, k_iter );
+	printf( "tid: %ld  trmm_ll_tlb begins at: %ld %ld  (n_ut: %ld)\n",
+	        tid, jr_st, ir_st, n_ut_for_me );
+#endif
+
+	// It's possible that there are so few microtiles relative to the number
+	// of threads that one or more threads gets no work. If that happens, those
+	// threads can return early.
+	if ( n_ut_for_me == 0 ) return;
+
+	// Start the jr/ir loops with the current thread's microtile offsets computed
+	// by bli_thread_range_tlb_trmm_ll().
+	dim_t i = ir_st;
+	dim_t j = jr_st;
+
+	// Initialize a counter to track the number of microtiles computed by the
+	// current thread.
+	dim_t ut = 0;
+
+	const char* a1 = a_cast;
+
+	// Get the a1 pointer into position by stepping over the first ir_st
+	// micropanels of A, whose sizes differ when they intersect the diagonal.
+	for ( dim_t ii = 0; ii < ir_st; ++ii )
+	{
+		const doff_t diagoffa_ii = diagoffa + ( doff_t )ii*MR;
+
+		if ( bli_intersects_diag_n( diagoffa_ii, MR, k ) )
+		{
+			// Determine the length of the panel that was packed.
+			const dim_t k_a1011 = bli_min( diagoffa_ii + MR, k );
+
+			// Compute the panel stride for the current diagonal-
+			// intersecting micro-panel.
+			inc_t ps_a_cur  = k_a1011 * PACKMR;
+			      ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
+			      ps_a_cur *= dt_size;
+
+			a1 += ps_a_cur;
+		}
+		else if ( bli_is_strictly_below_diag_n( diagoffa_ii, MR, k ) )
+		{
+			a1 += rstep_a;
+		}
+	}
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( ; true; ++j )
+	{
+		const char* b1 = b_cast + j * cstep_b;
+		      char* c1 = c_cast + j * cstep_c;
+
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		bli_auxinfo_set_next_b( b2, &aux );
+
+		// Loop over the m dimension (MR rows at a time).
+		for ( ; i < m_iter; ++i )
+		{
+			char* c11 = c1 + i * rstep_c;
+
+			const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR;
+
+			const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+			                      ? MR : m_left );
+
+			// If the current panel of A intersects the diagonal, scale C
+			// by beta. If it is strictly below the diagonal, scale by one.
+			// This allows the current macro-kernel to work for both trmm
+			// and trmm3.
+			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) )
+			{
+				// Determine the offset to and length of the panel that was
+				// packed so we can index into the corresponding location in B.
+				const dim_t off_a1011 = 0;
+				const dim_t k_a1011   = bli_min( diagoffa_i + MR, k );
+
+				// Compute the panel stride for the current diagonal-
+				// intersecting micro-panel.
+				inc_t ps_a_cur  = k_a1011 * PACKMR;
+				      ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
+				      ps_a_cur *= dt_size;
+
+				const char* b1_i = b1 + off_a1011 * PACKNR * dt_size;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, ps_a_cur, 1 );
+				if ( bli_is_last_iter_sl( i, m_iter ) )
+				{
+					a2 = a_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 );
+					bli_auxinfo_set_next_b( b2, &aux );
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k_a1011,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1_i,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				// Increment the microtile counter and check if the thread is done.
+				ut += 1; if ( ut == n_ut_for_me ) return;
+
+				a1 += ps_a_cur;
+			}
+			else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) )
+			{
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 );
+				if ( bli_is_last_iter_sl( i, m_iter ) )
+				{
+					a2 = a_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 );
+					bli_auxinfo_set_next_b( b2, &aux );
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )one,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				// Increment the microtile counter and check if the thread is done.
+				ut += 1; if ( ut == n_ut_for_me ) return;
+
+				a1 += rstep_a;
+			}
+		}
+
+		// Upon reaching the end of the column of microtiles, reset the ir
+		// loop index so that we're ready to start the next pass through the
+		// m dimension (i.e., the next jr loop iteration).
+		i = 0;
+
+		// Reset the a1 pointer to the beginning of the packed matrix A.
+		a1 = a_cast;
+	}
+}
+
+//PASTEMAC(ch,printm)( "trmm_ll_ker_var2b: a1", MR, k_a1011, a1,   1, MR, "%4.1f", "" );
+//PASTEMAC(ch,printm)( "trmm_ll_ker_var2b: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );
+
+//printf( "tid: %ld  intersects diag. j,i:  %ld %ld  (ut: %ld)\n", tid, j, i, ut );
+//printf( "tid: %ld  strictbelow diag j,i:  %ld %ld  (ut: %ld)\n", tid, j, i, ut );
+
+//printf( "tid: %ld  incrementing by ps_a_cur: %ld  (k_a1011: %ld)\n",
+//        tid, ps_a_cur, k_a1011 );
+//printf( "tid: %ld  incrementing by rstep_a: %ld  (k      : %ld)\n",
+//        tid, rstep_a, k );
+
diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c
index 265e21a66..039bcc292 100644
--- a/frame/3/trmm/bli_trmm_lu_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c
@@ -37,11 +37,11 @@
 
 void bli_trmm_lu_ker_var2
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
              thrinfo_t* thread_par
      )
 {
@@ -83,10 +83,10 @@ void bli_trmm_lu_ker_var2
 	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
 	// Alias some constants to simpler names.
-	const dim_t     MR         = pd_a;
-	const dim_t     NR         = pd_b;
-	const dim_t     PACKMR     = cs_a;
-	const dim_t     PACKNR     = rs_b;
+	const dim_t MR         = pd_a;
+	const dim_t NR         = pd_b;
+	const dim_t PACKMR     = cs_a;
+	const dim_t PACKNR     = rs_b;
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
@@ -147,50 +147,45 @@ void bli_trmm_lu_ker_var2
 
 	// Compute number of primary and leftover components of the m and n
 	// dimensions.
-	dim_t n_iter = n / NR;
-	dim_t n_left = n % NR;
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+	const dim_t n_left = n % NR;
 
-	dim_t m_iter = m / MR;
-	dim_t m_left = m % MR;
-
-	if ( n_left ) ++n_iter;
-	if ( m_left ) ++m_iter;
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_size;
 
-	inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_size;
 
-	inc_t rstep_c = rs_c * MR * dt_size;
-	inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_size;
+	const inc_t cstep_c = cs_c * NR * dt_size;
 
-	// Save the pack schemas of A and B to the auxinfo_t object.
 	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel. Here we query the thrinfo_t node for the
 	// 1st (ir) loop around the microkernel.
-	//thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
 
 	// Query the number of threads and thread ids for each loop.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	dim_t jr_nt  = bli_thrinfo_n_way( thread );
-	dim_t jr_tid = bli_thrinfo_work_id( thread );
-	//dim_t ir_nt  = bli_thrinfo_n_way( ir_thread );
-	//dim_t ir_tid = bli_thrinfo_work_id( ir_thread );
+	//const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	//const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	//const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	//const dim_t ir_tid = bli_thrinfo_work_id( caucus );
 
-	dim_t jr_start, jr_end;
-	//dim_t ir_start, ir_end;
-	dim_t jr_inc;
+	dim_t jr_start, jr_end, jr_inc;
 
 	// Determine the thread range and increment for the 2nd loop.
-	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// NOTE: The definition of bli_thread_range_slrr() will depend on whether
 	// slab or round-robin partitioning was requested at configure-time.
 	// NOTE: Parallelism in the 1st loop is disabled for now.
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
-	//bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
+	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
 
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
@@ -198,20 +193,24 @@ void bli_trmm_lu_ker_var2
 		const char* b1 = b_cast + j * cstep_b;
 		      char* c1 = c_cast + j * cstep_c;
 
-		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
 
 		// Initialize our next panel of B to be the current panel of B.
 		const char* b2 = b1;
 
+		// Initialize pointers for stepping through the block of A and current
+		// column of microtiles of C.
 		const char* a1  = a_cast;
 		      char* c11 = c1;
 
 		// Loop over the m dimension (MR rows at a time).
 		for ( dim_t i = 0; i < m_iter; ++i )
 		{
-			doff_t diagoffa_i = diagoffa + ( doff_t )i*MR;
+			const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR;
 
-			dim_t  m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+			const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+			                      ? MR : m_left );
 
 			// If the current panel of A intersects the diagonal, scale C
 			// by beta. If it is strictly above the diagonal, scale by one.
@@ -222,8 +221,8 @@ void bli_trmm_lu_ker_var2
 				// Determine the offset to and length of the panel that was
 				// packed so we can index into the corresponding location in
 				// b1.
-				dim_t off_a1112 = diagoffa_i;
-				dim_t k_a1112   = k - off_a1112;
+				const dim_t off_a1112 = diagoffa_i;
+				const dim_t k_a1112   = k - off_a1112;
 
 				// Compute the panel stride for the current diagonal-
 				// intersecting micro-panel.
@@ -237,13 +236,13 @@ void bli_trmm_lu_ker_var2
 				const char* b1_i = b1 + off_a1112 * PACKNR * dt_size;
 
 				// Compute the addresses of the next panels of A and B.
-				const char* a2 = a1;
-				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 );
+				if ( bli_is_last_iter_slrr( i, m_iter, 0, 1 ) )
 				{
 					a2 = a_cast;
-					b2 = b1;
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
-						b2 = b_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc );
+					//if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) )
+					//	b2 = b_cast;
 				}
 
 				// Save addresses of next panels of A and B to the auxinfo_t
@@ -275,13 +274,13 @@ void bli_trmm_lu_ker_var2
 				//if ( bli_trmm_my_iter( i, ir_thread ) ) {
 
 				// Compute the addresses of the next panels of A and B.
-				const char* a2 = a1;
-				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 );
+				if ( bli_is_last_iter_slrr( i, m_iter, 0, 1 ) )
 				{
 					a2 = a_cast;
-					b2 = b1;
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
-						b2 = b_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc );
+					//if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) )
+					//	b2 = b_cast;
 				}
 
 				// Save addresses of next panels of A and B to the auxinfo_t
@@ -313,6 +312,6 @@ void bli_trmm_lu_ker_var2
 	}
 }
 
-//PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );
-//PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );
+//PASTEMAC(ch,printm)( "trmm_lu_ker_var2: a1", MR, k_a1112, a1,   1, MR, "%4.1f", "" );
+//PASTEMAC(ch,printm)( "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );
 
diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2b.c b/frame/3/trmm/bli_trmm_lu_ker_var2b.c
new file mode 100644
index 000000000..39640ad6b
--- /dev/null
+++ b/frame/3/trmm/bli_trmm_lu_ker_var2b.c
@@ -0,0 +1,366 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_trmm_lu_ker_var2b
+     (
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
+     )
+{
+	const num_t     dt        = bli_obj_exec_dt( c );
+	const dim_t     dt_size   = bli_dt_size( dt );
+
+	      doff_t    diagoffa  = bli_obj_diag_offset( a );
+
+	const pack_t    schema_a  = bli_obj_pack_schema( a );
+	const pack_t    schema_b  = bli_obj_pack_schema( b );
+
+	      dim_t     m         = bli_obj_length( c );
+	      dim_t     n         = bli_obj_width( c );
+	      dim_t     k         = bli_obj_width( a );
+
+	const void*     buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t     cs_a      = bli_obj_col_stride( a );
+	const dim_t     pd_a      = bli_obj_panel_dim( a );
+	const inc_t     ps_a      = bli_obj_panel_stride( a );
+
+	const void*     buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t     rs_b      = bli_obj_row_stride( b );
+	const dim_t     pd_b      = bli_obj_panel_dim( b );
+	const inc_t     ps_b      = bli_obj_panel_stride( b );
+
+	      void*     buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t     rs_c      = bli_obj_row_stride( c );
+	const inc_t     cs_c      = bli_obj_col_stride( c );
+
+	// Detach and multiply the scalars attached to A and B.
+	obj_t scalar_a, scalar_b;
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Alias some constants to simpler names.
+	const dim_t MR         = pd_a;
+	const dim_t NR         = pd_b;
+	const dim_t PACKMR     = cs_a;
+	const dim_t PACKNR     = rs_b;
+
+	// Query the context for the micro-kernel address and cast it to its
+	// function pointer type.
+	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+
+	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const char* a_cast     = buf_a;
+	const char* b_cast     = buf_b;
+	      char* c_cast     = buf_c;
+	const char* alpha_cast = buf_alpha;
+	const char* beta_cast  = buf_beta;
+
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/
+
+	// Safety trap: Certain indexing within this macro-kernel does not
+	// work as intended if both MR and NR are odd.
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) ||
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort();
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If the current block of A is entirely below the diagonal,
+	// it is implicitly zero. So we do nothing.
+	if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return;
+
+	// If there is a zero region to the left of where the diagonal of A
+	// intersects the top edge of the block, adjust the pointer to B and
+	// treat this case as if the diagonal offset were zero. Note that we
+	// don't need to adjust the pointer to A since packm would have simply
+	// skipped over the region that was not stored. (Note we assume the
+	// diagonal offset is a multiple of MR; this assumption will hold as
+	// long as the cache blocksizes KC and MC are each a multiple of MR.)
+	if ( diagoffa > 0 )
+	{
+		k        -= diagoffa;
+		b_cast   += diagoffa * PACKNR * dt_size;
+		diagoffa  = 0;
+	}
+
+	// If there is a zero region below where the diagonal of A intersects the
+	// right side of the block, shrink it to prevent "no-op" iterations from
+	// executing.
+	if ( -diagoffa + k < m )
+	{
+		m = -diagoffa + k;
+	}
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+	const dim_t n_left = n % NR;
+
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
+
+	// Computing the number of MR x MR tiles in the k dimension is needed
+	// when computing the thread ranges below.
+	const dim_t k_iter = k / MR + ( k % MR ? 1 : 0 );
+
+	// Determine some increments used to step through A, B, and C.
+	const inc_t rstep_a = ps_a * dt_size;
+
+	const inc_t cstep_b = ps_b * dt_size;
+
+	const inc_t rstep_c = rs_c * MR * dt_size;
+	const inc_t cstep_c = cs_c * NR * dt_size;
+
+	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	// The 'thread_par' argument is the parent thrinfo_t node. Here we query
+	// its sub-node, 'thread', whose n_way and work_id (read below) describe
+	// the parallelism of the 2nd (jr) loop around the microkernel.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+
+	// Query the number of threads and thread ids for each loop.
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	//const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	//const dim_t ir_tid = bli_thrinfo_work_id( caucus );
+
+	dim_t jr_st, ir_st;
+	const dim_t n_ut_for_me
+	=
+	bli_thread_range_tlb_trmm_lu( jr_nt, jr_tid, diagoffa, m_iter, n_iter, k_iter,
+	                              MR, NR, &jr_st, &ir_st );
+
+	// It's possible that there are so few microtiles relative to the number
+	// of threads that one or more threads gets no work. If that happens, those
+	// threads can return early.
+	if ( n_ut_for_me == 0 ) return;
+
+	// Start the jr/ir loops with the current thread's microtile offsets computed
+	// by bli_thread_range_tlb_trmm_lu().
+	dim_t i = ir_st;
+	dim_t j = jr_st;
+
+	// Initialize a counter to track the number of microtiles computed by the
+	// current thread.
+	dim_t ut = 0;
+
+	const char* a1 = a_cast;
+
+	// Get pointers into position by stepping through to the ith micropanel of
+	// A and ith microtile of C (within the appropriate column of microtiles).
+	for ( dim_t ii = 0; ii < ir_st; ++ii )
+	{
+		const doff_t diagoffa_ii = diagoffa + ( doff_t )ii*MR;
+
+		if ( bli_intersects_diag_n( diagoffa_ii, MR, k ) )
+		{
+			// Determine the length of the panel that was packed.
+			const dim_t k_a1112 = k - diagoffa_ii;
+
+			// Compute the panel stride for the current diagonal-
+			// intersecting micro-panel.
+			inc_t ps_a_cur  = k_a1112 * PACKMR;
+			      ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
+			      ps_a_cur *= dt_size;
+
+			a1 += ps_a_cur;
+		}
+		else if ( bli_is_strictly_above_diag_n( diagoffa_ii, MR, k ) )
+		{
+			a1 += rstep_a;
+		}
+	}
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( ; true; ++j )
+	{
+		const char* b1 = b_cast + j * cstep_b;
+		      char* c1 = c_cast + j * cstep_c;
+
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		bli_auxinfo_set_next_b( b2, &aux );
+
+		// Loop over the m dimension (MR rows at a time).
+		for ( ; i < m_iter; ++i )
+		{
+			char* c11 = c1 + i * rstep_c;
+
+			const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR;
+
+			const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+			                      ? MR : m_left );
+
+			// If the current panel of A intersects the diagonal, scale C
+			// by beta. If it is strictly above the diagonal, scale by one.
+			// This allows the current macro-kernel to work for both trmm
+			// and trmm3.
+			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) )
+			{
+				// Determine the offset to and length of the panel that was
+				// packed so we can index into the corresponding location in B.
+				const dim_t off_a1112 = diagoffa_i;
+				const dim_t k_a1112   = k - off_a1112;
+
+				// Compute the panel stride for the current diagonal-
+				// intersecting micro-panel.
+				inc_t ps_a_cur  = k_a1112 * PACKMR;
+				      ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
+				      ps_a_cur *= dt_size;
+
+				const char* b1_i = b1 + off_a1112 * PACKNR * dt_size;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, ps_a_cur, 1 );
+				if ( bli_is_last_iter_sl( i, m_iter ) )
+				{
+					a2 = a_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 );
+					bli_auxinfo_set_next_b( b2, &aux );
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k_a1112,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1_i,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				// Increment the microtile counter and check if the thread is done.
+				ut += 1; if ( ut == n_ut_for_me ) return;
+
+				a1 += ps_a_cur;
+			}
+			else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) )
+			{
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 );
+				if ( bli_is_last_iter_sl( i, m_iter ) )
+				{
+					a2 = a_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 );
+					bli_auxinfo_set_next_b( b2, &aux );
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )one,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				// Increment the microtile counter and check if the thread is done.
+				ut += 1; if ( ut == n_ut_for_me ) return;
+
+				a1 += rstep_a;
+			}
+		}
+
+		// Upon reaching the end of the column of microtiles, reset the ir
+		// loop index so that we're ready to start the next pass through the
+		// m dimension (i.e., the next jr loop iteration).
+		i = 0;
+
+		// Reset the a1 pointer to the beginning of the packed matrix A.
+		a1 = a_cast;
+	}
+}
+
+//PASTEMAC(ch,printm)( "trmm_lu_ker_var2b: a1", MR, k_a1112, a1,   1, MR, "%4.1f", "" );
+//PASTEMAC(ch,printm)( "trmm_lu_ker_var2b: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );
+
+#if 0
+	printf( "tid: %ld  m,n,k_iter: %ld %ld %ld\n", tid, m_iter, n_iter, k_iter );
+	printf( "tid: %ld  trmm_lu_tlb begins at: %ld %ld  (n_ut: %ld)\n",
+	        tid, jr_st, ir_st, n_ut_for_me );
+#endif
+
diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c
index 785f2cf5f..f8d0fc6c8 100644
--- a/frame/3/trmm/bli_trmm_rl_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c
@@ -37,11 +37,11 @@
 
 void bli_trmm_rl_ker_var2
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
              thrinfo_t* thread_par
      )
 {
@@ -147,39 +147,40 @@ void bli_trmm_rl_ker_var2
 
 	// Compute number of primary and leftover components of the m and n
 	// dimensions.
-	dim_t n_iter = n / NR;
-	dim_t n_left = n % NR;
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+	const dim_t n_left = n % NR;
 
-	dim_t m_iter = m / MR;
-	dim_t m_left = m % MR;
-
-	if ( n_left ) ++n_iter;
-	if ( m_left ) ++m_iter;
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_size;
 
-	inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_size;
 
-	inc_t rstep_c = rs_c * MR * dt_size;
-	inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_size;
+	const inc_t cstep_c = cs_c * NR * dt_size;
 
-	// Save the pack schemas of A and B to the auxinfo_t object.
 	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
+	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	// loop around the microkernel. Here we query the thrinfo_t node for the
+	// 1st (ir) loop around the microkernel.
 	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
 	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
 
-	dim_t jr_nt  = bli_thrinfo_n_way( thread );
-	dim_t jr_tid = bli_thrinfo_work_id( thread );
-	dim_t ir_nt  = bli_thrinfo_n_way( caucus );
-	dim_t ir_tid = bli_thrinfo_work_id( caucus );
+	// Query the number of threads and thread ids for each loop.
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	const dim_t ir_tid = bli_thrinfo_work_id( caucus );
 
-	dim_t jr_start, jr_end;
-	dim_t ir_start, ir_end;
-	dim_t jr_inc,   ir_inc;
+	dim_t jr_start, jr_end, jr_inc;
+	dim_t ir_start, ir_end, ir_inc;
 
 	// Note that we partition the 2nd loop into two regions: the rectangular
 	// part of B, and the triangular portion.
@@ -207,11 +208,11 @@ void bli_trmm_rl_ker_var2
 
 	// Determine the thread range and increment for the 2nd and 1st loops for
 	// the initial rectangular region of B (if it exists).
-	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// NOTE: The definition of bli_thread_range_slrr() will depend on whether
 	// slab or round-robin partitioning was requested at configure-time.
 	// NOTE: Parallelism in the 1st loop is disabled for now.
-	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc );
-	bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc );
+	bli_thread_range_slrr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_slrr( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc );
 
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
@@ -219,7 +220,7 @@ void bli_trmm_rl_ker_var2
 		const char* b1 = b_cast + j * cstep_b;
 		      char* c1 = c_cast + j * cstep_c;
 
-		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
 
 		// Initialize our next panel of B to be the current panel of B.
 		const char* b2 = b1;
@@ -231,15 +232,15 @@ void bli_trmm_rl_ker_var2
 				const char* a1  = a_cast + i * rstep_a;
 				      char* c11 = c1     + i * rstep_c;
 
-				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+				const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
 
 				// Compute the addresses of the next panels of A and B.
 				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc );
-				if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) )
+				if ( bli_is_last_iter_slrr( i, m_iter, ir_tid, ir_nt ) )
 				{
 					a2 = a_cast;
 					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc );
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+					if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) )
 						b2 = b_cast;
 				}
 
@@ -271,7 +272,7 @@ void bli_trmm_rl_ker_var2
 
 	// Use round-robin assignment of micropanels to threads in the 2nd and
 	// 1st loops for the remaining triangular region of B (if it exists).
-	// NOTE: We don't need to call bli_thread_range_jrir_rr() here since we
+	// NOTE: We don't need to call bli_thread_range_rr() here since we
 	// employ a hack that calls for each thread to execute every iteration
 	// of the jr and ir loops but skip all but the pointer increment for
 	// iterations that are not assigned to it.
@@ -285,18 +286,18 @@ void bli_trmm_rl_ker_var2
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < n_iter; ++j )
 	{
-		doff_t diagoffb_j = diagoffb - ( doff_t )j*NR;
+		const doff_t diagoffb_j = diagoffb - ( doff_t )j*NR;
 
 		// Determine the offset to the beginning of the panel that
 		// was packed so we can index into the corresponding location
 		// in A. Then compute the length of that panel.
-		dim_t off_b1121 = bli_max( -diagoffb_j, 0 );
-		dim_t k_b1121   = k - off_b1121;
+		const dim_t off_b1121 = bli_max( -diagoffb_j, 0 );
+		const dim_t k_b1121   = k - off_b1121;
 
 		const char* a1  = a_cast;
 		      char* c11 = c1;
 
-		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
 
 		// Initialize our next panel of B to be the current panel of B.
 		const char* b2 = b1;
@@ -319,7 +320,7 @@ void bli_trmm_rl_ker_var2
 			{
 				if ( bli_trmm_my_iter_rr( i, caucus ) ) {
 
-				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+				const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
 
 				const char* a1_i = a1 + off_b1121 * PACKMR * dt_size;
 
diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2b.c b/frame/3/trmm/bli_trmm_rl_ker_var2b.c
new file mode 100644
index 000000000..7f2757c3a
--- /dev/null
+++ b/frame/3/trmm/bli_trmm_rl_ker_var2b.c
@@ -0,0 +1,392 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_trmm_rl_ker_var2b
+     (
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
+     )
+{
+	const num_t     dt        = bli_obj_exec_dt( c );
+	const dim_t     dt_size   = bli_dt_size( dt );
+
+	      doff_t    diagoffb  = bli_obj_diag_offset( b );
+
+	const pack_t    schema_a  = bli_obj_pack_schema( a );
+	const pack_t    schema_b  = bli_obj_pack_schema( b );
+
+	      dim_t     m         = bli_obj_length( c );
+	      dim_t     n         = bli_obj_width( c );
+	      dim_t     k         = bli_obj_width( a );
+
+	const void*     buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t     cs_a      = bli_obj_col_stride( a );
+	const dim_t     pd_a      = bli_obj_panel_dim( a );
+	const inc_t     ps_a      = bli_obj_panel_stride( a );
+
+	const void*     buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t     rs_b      = bli_obj_row_stride( b );
+	const dim_t     pd_b      = bli_obj_panel_dim( b );
+	const inc_t     ps_b      = bli_obj_panel_stride( b );
+
+	      void*     buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t     rs_c      = bli_obj_row_stride( c );
+	const inc_t     cs_c      = bli_obj_col_stride( c );
+
+	// Detach and multiply the scalars attached to A and B.
+	obj_t scalar_a, scalar_b;
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Alias some constants to simpler names.
+	const dim_t     MR         = pd_a;
+	const dim_t     NR         = pd_b;
+	const dim_t     PACKMR     = cs_a;
+	const dim_t     PACKNR     = rs_b;
+
+	// Query the context for the micro-kernel address and cast it to its
+	// function pointer type.
+	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+
+	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const char* a_cast     = buf_a;
+	const char* b_cast     = buf_b;
+	      char* c_cast     = buf_c;
+	const char* alpha_cast = buf_alpha;
+	const char* beta_cast  = buf_beta;
+
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/
+
+	// Safety trap: Certain indexing within this macro-kernel does not work
+	// as intended if PACKMR and NR are both odd, or PACKNR and MR are both odd.
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) ||
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort();
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If the current panel of B is entirely above the diagonal,
+	// it is implicitly zero. So we do nothing.
+	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return;
+
+	// If there is a zero region above where the diagonal of B intersects
+	// the left edge of the panel, adjust the pointer to A and treat this
+	// case as if the diagonal offset were zero. Note that we don't need to
+	// adjust the pointer to B since packm would have simply skipped over
+	// the region that was not stored. (Note we assume the diagonal offset
+	// is a multiple of NR; this assumption will hold as long as the cache
+	// blocksizes KC and NC are each a multiple of NR.)
+	if ( diagoffb < 0 )
+	{
+		k        += diagoffb;
+		a_cast   -= diagoffb * PACKMR * dt_size;
+		diagoffb  = 0;
+	}
+
+	// If there is a zero region to the right of where the diagonal
+	// of B intersects the bottom of the panel, shrink it to prevent
+	// "no-op" iterations from executing.
+	if ( diagoffb + k < n )
+	{
+		n = diagoffb + k;
+	}
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+	const dim_t n_left = n % NR;
+
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
+
+	// Computing the number of NR x NR tiles in the k dimension is needed
+	// when computing the thread ranges below.
+	const dim_t k_iter = k / NR + ( k % NR ? 1 : 0 );
+
+	// Determine some increments used to step through A, B, and C.
+	const inc_t rstep_a = ps_a * dt_size;
+
+	const inc_t cstep_b = ps_b * dt_size;
+
+	const inc_t rstep_c = rs_c * MR * dt_size;
+	const inc_t cstep_c = cs_c * NR * dt_size;
+
+	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	// 'thread' (the sub-node of 'thread_par') is the thrinfo_t node for the
+	// 2nd (jr) loop around the microkernel. The 'caucus' node for the 1st (ir)
+	// loop is not queried in this variant (the lookup below is commented out).
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+
+	// Query the number of threads and thread ids for each loop.
+#if 0
+{
+	const dim_t jr_nt  = 17;
+	const dim_t jr_tid = jr_nt - 1;
+
+	const doff_t m_iter = 10;
+	const doff_t k_iter = 10;
+	const doff_t n_iter = 20;
+
+	diagoffb = 30 * NR;
+#else
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	//const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	//const dim_t ir_tid = bli_thrinfo_work_id( caucus );
+#endif
+	dim_t jr_st, ir_st;
+	const dim_t n_ut_for_me
+	=
+	bli_thread_range_tlb_trmm_rl( jr_nt, jr_tid, diagoffb, m_iter, n_iter, k_iter,
+	                              MR, NR, &jr_st, &ir_st );
+
+#if 0
+	printf( "tid %ld: final range: jr_st, ir_st: %ld %ld  (n_ut_for_me: %ld)\n",
+	        jr_tid, jr_st, ir_st, n_ut_for_me );
+	return;
+}
+const dim_t n_ut_for_me = -1; dim_t jr_st, ir_st;
+#endif
+
+	// It's possible that there are so few microtiles relative to the number
+	// of threads that one or more threads gets no work. If that happens, those
+	// threads can return early.
+	if ( n_ut_for_me == 0 ) return;
+
+	// Start the jr/ir loops with the current thread's microtile offsets computed
+	// by bli_thread_range_tlb_trmm_rl().
+	dim_t i = ir_st;
+	dim_t j = jr_st;
+
+	// Initialize a counter to track the number of microtiles computed by the
+	// current thread.
+	dim_t ut = 0;
+
+	const char* b1 = b_cast;
+
+	// Get b1 into position by stepping over the first jr_st micropanels of B
+	// (c1 is computed from j inside the loop below, so C needs no stepping).
+	for ( dim_t jj = 0; jj < jr_st; ++jj )
+	{
+		const doff_t diagoffb_jj = diagoffb - ( doff_t )jj*NR;
+
+		if ( bli_intersects_diag_n( diagoffb_jj, k, NR ) )
+		{
+			// Determine the length of the panel that was packed.
+			const dim_t off_b1121 = bli_max( -diagoffb_jj, 0 );
+			const dim_t k_b1121   = k - off_b1121;
+
+			// Compute the panel stride for the current diagonal-
+			// intersecting micro-panel.
+			inc_t ps_b_cur  = k_b1121 * PACKNR;
+			      ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
+			      ps_b_cur *= dt_size;
+
+			b1 += ps_b_cur;
+		}
+		else if ( bli_is_strictly_below_diag_n( diagoffb_jj, k, NR ) )
+		{
+			b1 += cstep_b;
+		}
+	}
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( ; true; ++j )
+	{
+		char* c1 = c_cast + j * cstep_c;
+
+		const doff_t diagoffb_j = diagoffb - ( doff_t )j*NR;
+
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
+
+		// Determine the offset to and length of the panel that was packed
+		// so we can index into the corresponding location in A.
+		const dim_t off_b1121 = bli_max( -diagoffb_j, 0 );
+		const dim_t k_b1121   = k - off_b1121;
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		bli_auxinfo_set_next_b( b2, &aux );
+
+		// If the current panel of B intersects the diagonal, scale C
+		// by beta. If it is strictly below the diagonal, scale by one.
+		// This allows the current macro-kernel to work for both trmm
+		// and trmm3.
+		if ( bli_intersects_diag_n( diagoffb_j, k, NR ) )
+		{
+			// Compute the panel stride for the current diagonal-
+			// intersecting micro-panel.
+			inc_t ps_b_cur  = k_b1121 * PACKNR;
+			      ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
+			      ps_b_cur *= dt_size;
+
+			// Loop over the m dimension (MR rows at a time).
+			for ( ; i < m_iter; ++i )
+			{
+				const char* a1  = a_cast + i * rstep_a;
+				      char* c11 = c1     + i * rstep_c;
+
+				const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+				                      ? MR : m_left );
+
+				const char* a1_i = a1 + off_b1121 * PACKMR * dt_size;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 );
+				if ( bli_is_last_iter_sl( i, m_iter ) )
+				{
+					a2 = a_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, ps_b_cur, 1 );
+					bli_auxinfo_set_next_b( b2, &aux );
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k_b1121,
+				  ( void* )alpha_cast,
+				  ( void* )a1_i,
+				  ( void* )b1,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				// Increment the microtile counter and check if the thread is done.
+				ut += 1; if ( ut == n_ut_for_me ) return;
+			}
+
+			// Upon reaching the end of the column of microtiles, reset the ir
+			// loop index so that we're ready to start the next pass through the
+			// m dimension (i.e., the next jr loop iteration).
+			i = 0;
+
+			b1 += ps_b_cur;
+		}
+		else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) )
+		{
+			// Loop over the m dimension (MR rows at a time).
+			for ( ; i < m_iter; ++i )
+			{
+				const char* a1  = a_cast + i * rstep_a;
+				      char* c11 = c1     + i * rstep_c;
+
+				const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+				                      ? MR : m_left );
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 );
+				if ( bli_is_last_iter_sl( i, m_iter ) )
+				{
+					a2 = a_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 );
+					bli_auxinfo_set_next_b( b2, &aux );
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )one,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				// Increment the microtile counter and check if the thread is done.
+				ut += 1; if ( ut == n_ut_for_me ) return;
+			}
+
+			// Upon reaching the end of the column of microtiles, reset the ir
+			// loop index so that we're ready to start the next pass through the
+			// m dimension (i.e., the next jr loop iteration).
+			i = 0;
+
+			b1 += cstep_b;
+		}
+	}
+}
+
+//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );
+//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );
+
diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c
index ca27caef1..a031b6794 100644
--- a/frame/3/trmm/bli_trmm_ru_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c
@@ -37,11 +37,11 @@
 
 void bli_trmm_ru_ker_var2
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
              thrinfo_t* thread_par
      )
 {
@@ -148,25 +148,23 @@ void bli_trmm_ru_ker_var2
 
 	// Compute number of primary and leftover components of the m and n
 	// dimensions.
-	dim_t n_iter = n / NR;
-	dim_t n_left = n % NR;
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+    const dim_t n_left = n % NR;
 
-	dim_t m_iter = m / MR;
-	dim_t m_left = m % MR;
-
-	if ( n_left ) ++n_iter;
-	if ( m_left ) ++m_iter;
+    const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+    const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_size;
 
-	inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_size;
 
-	inc_t rstep_c = rs_c * MR * dt_size;
-	inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_size;
+	const inc_t cstep_c = cs_c * NR * dt_size;
 
-	// Save the pack schemas of A and B to the auxinfo_t object.
 	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
@@ -177,14 +175,13 @@ void bli_trmm_ru_ker_var2
 	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
 
 	// Query the number of threads and thread ids for each loop.
-	dim_t jr_nt  = bli_thrinfo_n_way( thread );
-	dim_t jr_tid = bli_thrinfo_work_id( thread );
-	dim_t ir_nt  = bli_thrinfo_n_way( caucus );
-	dim_t ir_tid = bli_thrinfo_work_id( caucus );
+	//const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	//const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	const dim_t ir_tid = bli_thrinfo_work_id( caucus );
 
-	dim_t jr_start, jr_end;
-	dim_t ir_start, ir_end;
-	dim_t jr_inc,   ir_inc;
+	dim_t jr_start, jr_end, jr_inc;
+	dim_t ir_start, ir_end, ir_inc;
 
 	// Note that we partition the 2nd loop into two regions: the triangular
 	// part of C, and the rectangular portion.
@@ -212,7 +209,7 @@ void bli_trmm_ru_ker_var2
 
 	// Use round-robin assignment of micropanels to threads in the 2nd and
 	// 1st loops for the initial triangular region of B (if it exists).
-	// NOTE: We don't need to call bli_thread_range_jrir_rr() here since we
+	// NOTE: We don't need to call bli_thread_range_rr() here since we
 	// employ a hack that calls for each thread to execute every iteration
 	// of the jr and ir loops but skip all but the pointer increment for
 	// iterations that are not assigned to it.
@@ -223,17 +220,18 @@ void bli_trmm_ru_ker_var2
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = 0; j < n_iter_tri; ++j )
 	{
-		doff_t diagoffb_j = diagoffb - ( doff_t )j*NR;
+		const doff_t diagoffb_j = diagoffb - ( doff_t )j*NR;
 
 		// Determine the offset to and length of the panel that was packed
 		// so we can index into the corresponding location in A.
-		dim_t off_b0111 = 0;
-		dim_t k_b0111   = bli_min( k, -diagoffb_j + NR );
+		const dim_t off_b0111 = 0;
+		const dim_t k_b0111   = bli_min( k, -diagoffb_j + NR );
 
 		const char* a1  = a_cast;
 		      char* c11 = c1;
 
-		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
 
 		// Initialize our next panel of B to be the current panel of B.
 		const char* b2 = b1;
@@ -256,7 +254,8 @@ void bli_trmm_ru_ker_var2
 			{
 				if ( bli_trmm_my_iter_rr( i, caucus ) ) {
 
-				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+				const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+				                      ? MR : m_left );
 
 				const char* a1_i = a1 + off_b0111 * PACKMR * dt_size;
 
@@ -266,8 +265,6 @@ void bli_trmm_ru_ker_var2
 				{
 					a2 = a_cast;
 					b2 = b1;
-					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) )
-						b2 = b_cast;
 				}
 
 				// Save addresses of next panels of A and B to the auxinfo_t
@@ -307,11 +304,11 @@ void bli_trmm_ru_ker_var2
 
 	// Determine the thread range and increment for the 2nd and 1st loops for
 	// the remaining rectangular region of B.
-	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// NOTE: The definition of bli_thread_range_slrr() will depend on whether
 	// slab or round-robin partitioning was requested at configure-time.
 	// NOTE: Parallelism in the 1st loop is disabled for now.
-	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc );
-	bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc );
+	bli_thread_range_slrr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_slrr( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc );
 
 	// Advance the start and end iteration offsets for the rectangular region
 	// by the number of iterations used for the triangular region.
@@ -332,7 +329,8 @@ void bli_trmm_ru_ker_var2
 		b1 = b_cast + (j-jb0) * cstep_b;
 		c1 = c_cast +  j      * cstep_c;
 
-		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
 
 		// Initialize our next panel of B to be the current panel of B.
 		const char* b2 = b1;
@@ -348,16 +346,15 @@ void bli_trmm_ru_ker_var2
 				const char* a1  = a_cast + i * rstep_a;
 				      char* c11 = c1     + i * rstep_c;
 
-				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+				const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+				                      ? MR : m_left );
 
 				// Compute the addresses of the next panels of A and B.
 				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc );
-				if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) )
+				if ( bli_is_last_iter_slrr( i, m_iter, ir_tid, ir_nt ) )
 				{
 					a2 = a_cast;
 					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc );
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
-						b2 = b_cast;
 				}
 
 				// Save addresses of next panels of A and B to the auxinfo_t
diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2b.c b/frame/3/trmm/bli_trmm_ru_ker_var2b.c
new file mode 100644
index 000000000..8aae2386a
--- /dev/null
+++ b/frame/3/trmm/bli_trmm_ru_ker_var2b.c
@@ -0,0 +1,390 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_trmm_ru_ker_var2b
+     (
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
+     )
+{
+	const num_t     dt        = bli_obj_exec_dt( c );
+	const dim_t     dt_size   = bli_dt_size( dt );
+
+	      doff_t    diagoffb  = bli_obj_diag_offset( b );
+
+	const pack_t    schema_a  = bli_obj_pack_schema( a );
+	const pack_t    schema_b  = bli_obj_pack_schema( b );
+
+	      dim_t     m         = bli_obj_length( c );
+	      dim_t     n         = bli_obj_width( c );
+	      dim_t     k         = bli_obj_width( a );
+
+	const void*     buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t     cs_a      = bli_obj_col_stride( a );
+	const dim_t     pd_a      = bli_obj_panel_dim( a );
+	const inc_t     ps_a      = bli_obj_panel_stride( a );
+
+	const void*     buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t     rs_b      = bli_obj_row_stride( b );
+	const dim_t     pd_b      = bli_obj_panel_dim( b );
+	const inc_t     ps_b      = bli_obj_panel_stride( b );
+
+	      void*     buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t     rs_c      = bli_obj_row_stride( c );
+	const inc_t     cs_c      = bli_obj_col_stride( c );
+
+	// Detach and multiply the scalars attached to A and B.
+	obj_t scalar_a, scalar_b;
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Alias some constants to simpler names.
+	const dim_t     MR         = pd_a;
+	const dim_t     NR         = pd_b;
+	const dim_t     PACKMR     = cs_a;
+	const dim_t     PACKNR     = rs_b;
+
+	// Query the context for the micro-kernel address and cast it to its
+	// function pointer type.
+	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+
+	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const char* a_cast     = buf_a;
+	const char* b_cast     = buf_b;
+	      char* c_cast     = buf_c;
+	const char* alpha_cast = buf_alpha;
+	const char* beta_cast  = buf_beta;
+
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/
+
+	// Safety trap: Certain indexing within this macro-kernel does not work
+	// as intended if PACKMR and NR are both odd, or PACKNR and MR are both odd.
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) ||
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort();
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If the current panel of B is entirely below its diagonal,
+	// it is implicitly zero. So we do nothing.
+	if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return;
+
+	// If there is a zero region to the left of where the diagonal of B
+	// intersects the top edge of the panel, adjust the pointer to C and
+	// treat this case as if the diagonal offset were zero. This skips over
+	// the region that was not packed. (Note we assume the diagonal offset
+	// is a multiple of NR; this assumption will hold as long as the cache
+	// blocksizes KC and NC are each a multiple of NR.)
+	if ( diagoffb > 0 )
+	{
+		n        -= diagoffb;
+		c_cast   += diagoffb * cs_c * dt_size;
+		diagoffb  = 0;
+	}
+
+	// If there is a zero region below where the diagonal of B intersects the
+	// right side of the block, shrink it to prevent "no-op" iterations from
+	// executing.
+	if ( -diagoffb + n < k )
+	{
+		k = -diagoffb + n;
+	}
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+	const dim_t n_left = n % NR;
+
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
+
+	// Computing the number of NR x NR tiles in the k dimension is needed
+	// when computing the thread ranges below.
+	const dim_t k_iter = k / NR + ( k % NR ? 1 : 0 );
+
+	// Determine some increments used to step through A, B, and C.
+	const inc_t rstep_a = ps_a * dt_size;
+
+	const inc_t cstep_b = ps_b * dt_size;
+
+	const inc_t rstep_c = rs_c * MR * dt_size;
+	const inc_t cstep_c = cs_c * NR * dt_size;
+
+	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	// 'thread' (the sub-node of 'thread_par') is the thrinfo_t node for the
+	// 2nd (jr) loop around the microkernel. The 'caucus' node for the 1st (ir)
+	// loop is not queried in this variant (the lookup below is commented out).
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+
+	// Query the number of threads and thread ids for each loop.
+#if 0
+{
+	const dim_t jr_nt  = 1;
+	const dim_t jr_tid = 0; //jr_nt - 1;
+
+	const doff_t m_iter = 10;
+	const doff_t k_iter = 10;
+	const doff_t n_iter = 20;
+
+	diagoffb = 0 * NR;
+#else
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	//const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	//const dim_t ir_tid = bli_thrinfo_work_id( caucus );
+#endif
+	dim_t jr_st, ir_st;
+	const dim_t n_ut_for_me
+	=
+	bli_thread_range_tlb_trmm_ru( jr_nt, jr_tid, diagoffb, m_iter, n_iter, k_iter,
+	                              MR, NR, &jr_st, &ir_st );
+
+#if 0
+	printf( "tid %ld: final range: jr_st, ir_st: %ld %ld  (n_ut_for_me: %ld)\n",
+	        jr_tid, jr_st, ir_st, n_ut_for_me );
+	return;
+}
+const dim_t n_ut_for_me = -1; dim_t jr_st, ir_st;
+#endif
+
+	// It's possible that there are so few microtiles relative to the number
+	// of threads that one or more threads gets no work. If that happens, those
+	// threads can return early.
+	if ( n_ut_for_me == 0 ) return;
+
+	// Start the jr/ir loops with the current thread's microtile offsets computed
+	// by bli_thread_range_tlb_trmm_ru().
+	dim_t i = ir_st;
+	dim_t j = jr_st;
+
+	// Initialize a counter to track the number of microtiles computed by the
+	// current thread.
+	dim_t ut = 0;
+
+	const char* b1 = b_cast;
+
+	// Get b1 into position by stepping over the first jr_st micropanels of B
+	// (c1 is computed from j inside the loop below, so C needs no stepping).
+	for ( dim_t jj = 0; jj < jr_st; ++jj )
+	{
+		const doff_t diagoffb_jj = diagoffb - ( doff_t )jj*NR;
+
+		if ( bli_intersects_diag_n( diagoffb_jj, k, NR ) )
+		{
+			// Determine the length of the panel that was packed.
+			const dim_t k_b0111 = bli_min( k, -diagoffb_jj + NR );
+
+			// Compute the panel stride for the current diagonal-
+			// intersecting micro-panel.
+			inc_t ps_b_cur  = k_b0111 * PACKNR;
+			      ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
+			      ps_b_cur *= dt_size;
+
+			b1 += ps_b_cur;
+		}
+		else if ( bli_is_strictly_above_diag_n( diagoffb_jj, k, NR ) )
+		{
+			b1 += cstep_b;
+		}
+	}
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( ; true; ++j )
+	{
+		char* c1 = c_cast + j * cstep_c;
+
+		const doff_t diagoffb_j = diagoffb - ( doff_t )j*NR;
+
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
+
+		// Determine the offset to and length of the panel that was packed
+		// so we can index into the corresponding location in A.
+		const dim_t off_b0111 = 0;
+		const dim_t k_b0111   = bli_min( k, -diagoffb_j + NR );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		bli_auxinfo_set_next_b( b2, &aux );
+
+		// If the current panel of B intersects the diagonal, scale C
+		// by beta. If it is strictly above the diagonal, scale by one.
+		// This allows the current macro-kernel to work for both trmm
+		// and trmm3.
+		if ( bli_intersects_diag_n( diagoffb_j, k, NR ) )
+		{
+			// Compute the panel stride for the current diagonal-
+			// intersecting micro-panel.
+			inc_t ps_b_cur  = k_b0111 * PACKNR;
+			      ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
+			      ps_b_cur *= dt_size;
+
+			// Loop over the m dimension (MR rows at a time).
+			for ( ; i < m_iter; ++i )
+			{
+				const char* a1  = a_cast + i * rstep_a;
+				      char* c11 = c1     + i * rstep_c;
+
+				const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+				                      ? MR : m_left );
+
+				const char* a1_i = a1 + off_b0111 * PACKMR * dt_size;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 );
+				if ( bli_is_last_iter_sl( i, m_iter ) )
+				{
+					a2 = a_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, ps_b_cur, 1 );
+					bli_auxinfo_set_next_b( b2, &aux );
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k_b0111,
+				  ( void* )alpha_cast,
+				  ( void* )a1_i,
+				  ( void* )b1,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				// Increment the microtile counter and check if the thread is done.
+				ut += 1; if ( ut == n_ut_for_me ) return;
+			}
+
+			// Upon reaching the end of the column of microtiles, reset the ir
+			// loop index so that we're ready to start the next pass through the
+			// m dimension (i.e., the next jr loop iteration).
+			i = 0;
+
+			b1 += ps_b_cur;
+		}
+		else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) )
+		{
+			// Loop over the m dimension (MR rows at a time).
+			for ( ; i < m_iter; ++i )
+			{
+				const char* a1  = a_cast + i * rstep_a;
+				      char* c11 = c1     + i * rstep_c;
+
+				const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+				                      ? MR : m_left );
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 );
+				if ( bli_is_last_iter_sl( i, m_iter ) )
+				{
+					a2 = a_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 );
+					bli_auxinfo_set_next_b( b2, &aux );
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )one,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				// Increment the microtile counter and check if the thread is done.
+				ut += 1; if ( ut == n_ut_for_me ) return;
+			}
+
+			// Upon reaching the end of the column of microtiles, reset the ir
+			// loop index so that we're ready to start the next pass through the
+			// m dimension (i.e., the next jr loop iteration).
+			i = 0;
+
+			b1 += cstep_b;
+		}
+	}
+}
+
+//PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );
+//PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );
+
diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h
index f8c3d7ee2..0a605ba86 100644
--- a/frame/3/trmm/bli_trmm_var.h
+++ b/frame/3/trmm/bli_trmm_var.h
@@ -43,54 +43,23 @@
 \
 void PASTEMAC0(opname) \
      ( \
-       const obj_t*  a, \
-       const obj_t*  b, \
-       const obj_t*  c, \
-       const cntx_t* cntx, \
-       const cntl_t* cntl, \
-             thrinfo_t* thread  \
+       const obj_t*     a, \
+       const obj_t*     b, \
+       const obj_t*     c, \
+       const cntx_t*    cntx, \
+       const cntl_t*    cntl, \
+             thrinfo_t* thread_par  \
      );
 
-//GENPROT( trmm_blk_var1 )
-//GENPROT( trmm_blk_var2 )
-//GENPROT( trmm_blk_var3 )
-
 GENPROT( trmm_xx_ker_var2 )
-
 GENPROT( trmm_ll_ker_var2 )
 GENPROT( trmm_lu_ker_var2 )
 GENPROT( trmm_rl_ker_var2 )
 GENPROT( trmm_ru_ker_var2 )
 
-
-//
-// Prototype BLAS-like interfaces with void pointer operands.
-//
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoff, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       thrinfo_t* thread  \
-     );
-
-INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 )
-INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 )
-INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 )
-INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 )
+GENPROT( trmm_xx_ker_var2b )
+GENPROT( trmm_ll_ker_var2b )
+GENPROT( trmm_lu_ker_var2b )
+GENPROT( trmm_rl_ker_var2b )
+GENPROT( trmm_ru_ker_var2b )
 
diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c
index 60030bf4a..918b8f973 100644
--- a/frame/3/trmm/bli_trmm_xx_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c
@@ -43,12 +43,12 @@ static l3_var_oft vars[2][2] =
 
 void bli_trmm_xx_ker_var2
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
-             thrinfo_t* thread
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
      )
 {
 	dim_t side;
@@ -81,7 +81,7 @@ void bli_trmm_xx_ker_var2
 	  c,
 	  cntx,
 	  cntl,
-	  thread
+	  thread_par
 	);
 }
 
diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2b.c b/frame/3/trmm/bli_trmm_xx_ker_var2b.c
new file mode 100644
index 000000000..57894165c
--- /dev/null
+++ b/frame/3/trmm/bli_trmm_xx_ker_var2b.c
@@ -0,0 +1,87 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+static l3_var_oft vars[2][2] =
+{
+	{ bli_trmm_ll_ker_var2b, bli_trmm_lu_ker_var2b }, // side 0: uplo 0, uplo 1
+	{ bli_trmm_rl_ker_var2b, bli_trmm_ru_ker_var2b } // side 1: uplo 0, uplo 1
+};
+
+void bli_trmm_xx_ker_var2b
+     (
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
+     )
+{
+	dim_t side; // First index into vars[][]: 0 if the root of A is triangular, else 1.
+	dim_t uplo; // Second index into vars[][]: 0 if the inspected root is lower, else 1.
+
+	// Dispatcher: select the trmm var2b macrokernel that matches the operands.
+	// side records which operand has a triangular root (0 for A; otherwise 1,
+	// in which case B is inspected), and uplo records whether it is lower (0).
+	if ( bli_obj_root_is_triangular( a ) )
+	{
+		side = 0;
+		if ( bli_obj_root_is_lower( a ) ) uplo = 0;
+		else                              uplo = 1;
+	}
+	else // if ( bli_obj_root_is_triangular( b ) )
+	{
+		side = 1;
+		if ( bli_obj_root_is_lower( b ) ) uplo = 0;
+		else                              uplo = 1;
+	}
+
+	// Index into the variant array to extract the correct function pointer.
+	l3_var_oft f = vars[side][uplo];
+
+	// Call the selected macrokernel, forwarding every argument unchanged.
+	f
+	(
+	  a,
+	  b,
+	  c,
+	  cntx,
+	  cntl,
+	  thread_par
+	);
+}
+
diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.prev b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.prev
new file mode 100644
index 000000000..5aebe23c1
--- /dev/null
+++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.prev
@@ -0,0 +1,371 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_trmm_rl_ker_var2
+     (
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
+     )
+{
+	const num_t     dt        = bli_obj_exec_dt( c );
+	const dim_t     dt_size   = bli_dt_size( dt );
+
+	      doff_t    diagoffb  = bli_obj_diag_offset( b );
+
+	const pack_t    schema_a  = bli_obj_pack_schema( a );
+	const pack_t    schema_b  = bli_obj_pack_schema( b );
+
+	      dim_t     m         = bli_obj_length( c );
+	      dim_t     n         = bli_obj_width( c );
+	      dim_t     k         = bli_obj_width( a );
+
+	const void*     buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t     cs_a      = bli_obj_col_stride( a );
+	const dim_t     pd_a      = bli_obj_panel_dim( a );
+	const inc_t     ps_a      = bli_obj_panel_stride( a );
+
+	const void*     buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t     rs_b      = bli_obj_row_stride( b );
+	const dim_t     pd_b      = bli_obj_panel_dim( b );
+	const inc_t     ps_b      = bli_obj_panel_stride( b );
+
+	      void*     buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t     rs_c      = bli_obj_row_stride( c );
+	const inc_t     cs_c      = bli_obj_col_stride( c );
+
+	// Detach and multiply the scalars attached to A and B.
+	obj_t scalar_a, scalar_b;
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Alias some constants to simpler names.
+	const dim_t     MR         = pd_a;
+	const dim_t     NR         = pd_b;
+	const dim_t     PACKMR     = cs_a;
+	const dim_t     PACKNR     = rs_b;
+
+	// Query the context for the micro-kernel address and cast it to its
+	// function pointer type.
+	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+
+	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const char* a_cast     = buf_a;
+	const char* b_cast     = buf_b;
+	      char* c_cast     = buf_c;
+	const char* alpha_cast = buf_alpha;
+	const char* beta_cast  = buf_beta;
+
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/
+
+	// Safety trap: Certain indexing within this macro-kernel does not work
+	// as intended if PACKMR and NR are both odd, or PACKNR and MR are both odd.
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) ||
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort();
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If the current panel of B is entirely above the diagonal,
+	// it is implicitly zero. So we do nothing.
+	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return;
+
+	// If there is a zero region above where the diagonal of B intersects
+	// the left edge of the panel, adjust the pointer to A and treat this
+	// case as if the diagonal offset were zero. Note that we don't need to
+	// adjust the pointer to B since packm would have simply skipped over
+	// the region that was not stored.
+	if ( diagoffb < 0 )
+	{
+		k        += diagoffb;
+		a_cast   -= diagoffb * PACKMR * dt_size;
+		diagoffb  = 0;
+	}
+
+	// If there is a zero region to the right of where the diagonal
+	// of B intersects the bottom of the panel, shrink it to prevent
+	// "no-op" iterations from executing.
+	if ( diagoffb + k < n )
+	{
+		n = diagoffb + k;
+	}
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	dim_t n_iter = n / NR;
+	dim_t n_left = n % NR;
+
+	dim_t m_iter = m / MR;
+	dim_t m_left = m % MR;
+
+	if ( n_left ) ++n_iter;
+	if ( m_left ) ++m_iter;
+
+	// Determine some increments used to step through A, B, and C.
+	inc_t rstep_a = ps_a * dt_size;
+
+	inc_t cstep_b = ps_b * dt_size;
+
+	inc_t rstep_c = rs_c * MR * dt_size;
+	inc_t cstep_c = cs_c * NR * dt_size;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	auxinfo_t aux;
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); // node used for the jr (2nd) loop
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // node used for the ir (1st) loop
+
+	dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	dim_t jr_tid = bli_thrinfo_work_id( thread );
+	dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	dim_t ir_tid = bli_thrinfo_work_id( caucus );
+
+	dim_t jr_start, jr_end;
+	dim_t ir_start, ir_end;
+	dim_t jr_inc,   ir_inc;
+
+	// Note that we partition the 2nd loop into two regions: the rectangular
+	// part of B, and the triangular portion.
+	dim_t n_iter_rct;
+	dim_t n_iter_tri;
+
+	if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) // NOTE(review): the earlier diagonal test passes k, not m; confirm m is intended.
+	{
+		// If the entire panel of B does not intersect the diagonal, there is
+		// no triangular region, and therefore we can skip the second set of
+		// loops.
+		n_iter_rct = n_iter;
+		n_iter_tri = 0;
+	}
+	else
+	{
+		// If the panel of B does intersect the diagonal, compute the number of
+		// iterations in the rectangular region by dividing NR into the diagonal
+		// offset. (There should never be any remainder in this division.) The
+		// number of iterations in the triangular (or trapezoidal) region is
+		// computed as the remaining number of iterations in the n dimension.
+		n_iter_rct = diagoffb / NR;
+		n_iter_tri = n_iter - n_iter_rct;
+	}
+
+	// Determine the thread range and increment for the 2nd and 1st loops for
+	// the initial rectangular region of B (if it exists).
+	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// slab or round-robin partitioning was requested at configure-time.
+	// NOTE: Parallelism in the 1st loop is disabled for now.
+	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc );
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
+	{
+		const char* b1 = b_cast + j * cstep_b;
+		      char* c1 = c_cast + j * cstep_c;
+
+		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		{
+			// Loop over the m dimension (MR rows at a time).
+			for ( dim_t i = ir_start; i < ir_end; i += ir_inc )
+			{
+				const char* a1  = a_cast + i * rstep_a;
+				      char* c11 = c1     + i * rstep_c;
+
+				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc );
+				if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) )
+				{
+					a2 = a_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc );
+					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )one,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+			}
+		}
+	}
+
+	// If there is no triangular region, then we're done.
+	if ( n_iter_tri == 0 ) return;
+
+	// Use round-robin assignment of micropanels to threads in the 2nd and
+	// 1st loops for the remaining triangular region of B (if it exists).
+	// NOTE: We don't need to call bli_thread_range_jrir_rr() here since we
+	// employ a hack that calls for each thread to execute every iteration
+	// of the jr and ir loops but skip all but the pointer increment for
+	// iterations that are not assigned to it.
+
+	// Advance the starting b1 and c1 pointers to the positions corresponding
+	// to the start of the triangular region of B.
+	jr_start = n_iter_rct;
+	const char* b1 = b_cast + jr_start * cstep_b;
+	      char* c1 = c_cast + jr_start * cstep_c;
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = jr_start; j < n_iter; ++j )
+	{
+		doff_t diagoffb_j = diagoffb - ( doff_t )j*NR;
+
+		// Determine the offset to the beginning of the panel that
+		// was packed so we can index into the corresponding location
+		// in A. Then compute the length of that panel.
+		dim_t off_b1121 = bli_max( -diagoffb_j, 0 );
+		dim_t k_b1121   = k - off_b1121;
+
+		const char* a1  = a_cast;
+		      char* c11 = c1;
+
+		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		// Every micropanel visited by this loop belongs to the triangular
+		// (or trapezoidal) region of B, so the microkernel is always given
+		// beta here; the rectangular loop above passes one instead. This
+		// allows the current macro-kernel to work for both trmm and trmm3.
+		{
+			// Compute the panel stride for the current diagonal-
+			// intersecting micro-panel.
+			inc_t ps_b_cur  = k_b1121 * PACKNR;
+			      ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
+			      ps_b_cur *= dt_size;
+
+			if ( bli_trmm_my_iter_rr( j, thread ) ) {
+
+			// Loop over the m dimension (MR rows at a time).
+			for ( dim_t i = 0; i < m_iter; ++i )
+			{
+				if ( bli_trmm_my_iter_rr( i, caucus ) ) {
+
+				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+
+				const char* a1_i = a1 + off_b1121 * PACKMR * dt_size;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = a1;
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
+				{
+					a2 = a_cast;
+					b2 = b1;
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) )
+						b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k_b1121,
+				  ( void* )alpha_cast,
+				  ( void* )a1_i,
+				  ( void* )b1,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+				}
+
+				a1  += rstep_a;
+				c11 += rstep_c;
+			}
+			}
+
+			b1 += ps_b_cur;
+		}
+
+		c1 += cstep_c;
+	}
+}
+
+//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );
+//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );
+
diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.unified b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.unified
new file mode 100644
index 000000000..7d2aabaa4
--- /dev/null
+++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.unified
@@ -0,0 +1,324 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_trmm_rl_ker_var2
+     (
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
+     )
+{
+	const num_t     dt        = bli_obj_exec_dt( c );
+	const dim_t     dt_size   = bli_dt_size( dt );
+
+	      doff_t    diagoffb  = bli_obj_diag_offset( b );
+
+	const pack_t    schema_a  = bli_obj_pack_schema( a );
+	const pack_t    schema_b  = bli_obj_pack_schema( b );
+
+	      dim_t     m         = bli_obj_length( c );
+	      dim_t     n         = bli_obj_width( c );
+	      dim_t     k         = bli_obj_width( a );
+
+	const void*     buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t     cs_a      = bli_obj_col_stride( a );
+	const dim_t     pd_a      = bli_obj_panel_dim( a );
+	const inc_t     ps_a      = bli_obj_panel_stride( a );
+
+	const void*     buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t     rs_b      = bli_obj_row_stride( b );
+	const dim_t     pd_b      = bli_obj_panel_dim( b );
+	const inc_t     ps_b      = bli_obj_panel_stride( b );
+
+	      void*     buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t     rs_c      = bli_obj_row_stride( c );
+	const inc_t     cs_c      = bli_obj_col_stride( c );
+
+	// Detach and multiply the scalars attached to A and B.
+	obj_t scalar_a, scalar_b;
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Alias some constants to simpler names.
+	const dim_t     MR         = pd_a;
+	const dim_t     NR         = pd_b;
+	const dim_t     PACKMR     = cs_a;
+	const dim_t     PACKNR     = rs_b;
+
+	// Query the context for the micro-kernel address and cast it to its
+	// function pointer type.
+	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+
+	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const char* a_cast     = buf_a;
+	const char* b_cast     = buf_b;
+	      char* c_cast     = buf_c;
+	const char* alpha_cast = buf_alpha;
+	const char* beta_cast  = buf_beta;
+
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/
+
+	// Safety trap: Certain indexing within this macro-kernel does not work
+	// as intended if PACKMR and NR are both odd, or PACKNR and MR are both odd.
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) ||
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort();
+
+	// If any dimension is zero, return immediately.
+	if ( bli_zero_dim3( m, n, k ) ) return;
+
+	// Safeguard: If the current panel of B is entirely above the diagonal,
+	// it is implicitly zero. So we do nothing.
+	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return;
+
+	// If there is a zero region above where the diagonal of B intersects
+	// the left edge of the panel, adjust the pointer to A and treat this
+	// case as if the diagonal offset were zero. Note that we don't need to
+	// adjust the pointer to B since packm would have simply skipped over
+	// the region that was not stored.
+	if ( diagoffb < 0 )
+	{
+		k        += diagoffb;
+		a_cast   -= diagoffb * PACKMR * dt_size;
+		diagoffb  = 0;
+	}
+
+	// If there is a zero region to the right of where the diagonal
+	// of B intersects the bottom of the panel, shrink it to prevent
+	// "no-op" iterations from executing.
+	if ( diagoffb + k < n )
+	{
+		n = diagoffb + k;
+	}
+
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+	const dim_t n_left = n % NR;
+
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
+
+	// Determine some increments used to step through A, B, and C.
+	const inc_t rstep_a = ps_a * dt_size;
+
+	const inc_t cstep_b = ps_b * dt_size;
+
+	const inc_t rstep_c = rs_c * MR * dt_size;
+	const inc_t cstep_c = cs_c * NR * dt_size;
+
+	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	bli_auxinfo_set_schema_a( schema_a, &aux );
+	bli_auxinfo_set_schema_b( schema_b, &aux );
+
+	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	// loop around the microkernel while the 'caucus' points to the thrinfo_t
+	// node for the 1st loop (ir).
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+
+	// Query the number of threads and thread ids for each loop.
+	//const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	//const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	//const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	//const dim_t ir_tid = bli_thrinfo_work_id( caucus );
+
+	dim_t jr_start, jr_end, jr_inc;
+	dim_t ir_start, ir_end, ir_inc;
+
+	// Determine the thread range and increment for the 2nd and 1st loops.
+	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// slab or round-robin partitioning was requested at configure-time.
+	// NOTE: The ir range is computed here, but the active ir loops run 0..m_iter.
+	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
+
+	const char* b1 = b_cast; // NOTE(review): never offset by jr_start/jr_inc; verify when jr_start != 0.
+	//      char* c1 = c_cast;
+
+	// Loop over the n dimension (NR columns at a time).
+	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
+	{
+		const char* a1  = a_cast;
+		      char* c1  = c_cast + j * cstep_c;
+		      char* c11 = c1;
+
+		const doff_t diagoffb_j = diagoffb - ( doff_t )j*NR;
+
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
+
+		// Determine the offset to the beginning of the panel that
+		// was packed so we can index into the corresponding location
+		// in A. Then compute the length of that panel.
+		const dim_t off_b1121 = bli_max( -diagoffb_j, 0 );
+		const dim_t k_b1121   = k - off_b1121;
+
+		// Initialize our next panel of B to be the current panel of B.
+		const char* b2 = b1;
+
+		// If the current panel of B intersects the diagonal, scale C
+		// by beta. If it is strictly below the diagonal, scale by one.
+		// This allows the current macro-kernel to work for both trmm
+		// and trmm3.
+		if ( bli_intersects_diag_n( diagoffb_j, k, NR ) )
+		{
+			// Compute the panel stride for the current diagonal-
+			// intersecting micro-panel.
+			inc_t ps_b_cur  = k_b1121 * PACKNR;
+			      ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
+			      ps_b_cur *= dt_size;
+
+			// Loop over the m dimension (MR rows at a time).
+			for ( dim_t i = 0; i < m_iter; ++i )
+			//for ( dim_t i = ir_start; i < ir_end; i += ir_inc )
+			{
+				const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+				                      ? MR : m_left );
+
+				const char* a1_i = a1 + off_b1121 * PACKMR * dt_size;
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 );
+				if ( bli_is_last_iter( i, m_iter, 0, 1 ) )
+				{
+					a2 = a_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc );
+					//if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+					//	b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k_b1121,
+				  ( void* )alpha_cast,
+				  ( void* )a1_i,
+				  ( void* )b1,
+				  ( void* )beta_cast,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				a1  += rstep_a;
+				c11 += rstep_c;
+			}
+
+			b1 += ps_b_cur;
+		}
+		else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) )
+		{
+			// Loop over the m dimension (MR rows at a time).
+			for ( dim_t i = 0; i < m_iter; ++i )
+			{
+				const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+				                      ? MR : m_left );
+
+				// Compute the addresses of the next panels of A and B.
+				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 );
+				if ( bli_is_last_iter( i, m_iter, 0, 1 ) )
+				{
+					a2 = a_cast;
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc );
+					//if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+					//	b2 = b_cast;
+				}
+
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object.
+				bli_auxinfo_set_next_a( a2, &aux );
+				bli_auxinfo_set_next_b( b2, &aux );
+
+				// Invoke the gemm micro-kernel.
+				gemm_ukr
+				(
+				  m_cur,
+				  n_cur,
+				  k,
+				  ( void* )alpha_cast,
+				  ( void* )a1,
+				  ( void* )b1,
+				  ( void* )one,
+				  c11, rs_c, cs_c,
+				  &aux,
+				  ( cntx_t* )cntx
+				);
+
+				a1  += rstep_a;
+				c11 += rstep_c;
+			}
+
+			b1 += cstep_b;
+		}
+
+		//c1 += cstep_c;
+	}
+}
+
+//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );
+//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );
+
diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c
index 275d6ca47..45af76910 100644
--- a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c
+++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c
@@ -356,7 +356,7 @@ void PASTEMAC(ch,varname) \
 		b2 = b1; \
 \
 		/* If the current panel of B intersects the diagonal, scale C
-		   by beta. If it is strictly below the diagonal, scale by one.
+		   by beta. If it is strictly above the diagonal, scale by one.
 		   This allows the current macro-kernel to work for both trmm
 		   and trmm3. */ \
 		if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c
index e2128f100..786e4f343 100644
--- a/frame/3/trsm/bli_trsm_ll_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c
@@ -37,11 +37,11 @@
 
 void bli_trsm_ll_ker_var2
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
              thrinfo_t* thread_par
      )
 {
@@ -158,47 +158,44 @@ void bli_trsm_ll_ker_var2
 
 	// Compute number of primary and leftover components of the m and n
 	// dimensions.
-	dim_t n_iter = n / NR;
-	dim_t n_left = n % NR;
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+	const dim_t n_left = n % NR;
 
-	dim_t m_iter = m / MR;
-	dim_t m_left = m % MR;
-
-	if ( n_left ) ++n_iter;
-	if ( m_left ) ++m_iter;
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_size;
 
-	inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_size;
 
-	inc_t rstep_c = rs_c * MR * dt_size;
-	inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_size;
+	const inc_t cstep_c = cs_c * NR * dt_size;
 
-	// Save the pack schemas of A and B to the auxinfo_t object.
 	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
 	// We don't bother querying the thrinfo_t node for the 1st loop because
 	// we can't parallelize that loop in trsm due to the inter-iteration
 	// dependencies that exist.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
 	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
 
 	// Query the number of threads and thread ids for each loop.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	dim_t jr_nt  = bli_thrinfo_n_way( thread );
-	dim_t jr_tid = bli_thrinfo_work_id( thread );
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t jr_tid = bli_thrinfo_work_id( thread );
 
-	dim_t jr_start, jr_end;
-	dim_t jr_inc;
+	dim_t jr_start, jr_end, jr_inc;
 
 	// Determine the thread range and increment for the 2nd loop.
-	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// NOTE: The definition of bli_thread_range_slrr() will depend on whether
 	// slab or round-robin partitioning was requested at configure-time.
 	// NOTE: Parallelism in the 1st loop is unattainable due to the
 	// inter-iteration dependencies present in trsm.
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
 
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
@@ -206,7 +203,8 @@ void bli_trsm_ll_ker_var2
 		const char* b1 = b_cast + j * cstep_b;
 		      char* c1 = c_cast + j * cstep_c;
 
-		      dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
 
 		// Initialize our next panel of B to be the current panel of B.
 		const char* b2  = b1;
@@ -217,9 +215,10 @@ void bli_trsm_ll_ker_var2
 		// Loop over the m dimension (MR rows at a time).
 		for ( dim_t i = 0; i < m_iter; ++i )
 		{
-			doff_t diagoffa_i = diagoffa + ( doff_t )i*MR;
+			const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR;
 
-			dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
+			const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
+			                      ? MR : m_left );
 
 			// If the current panel of A intersects the diagonal, use a
 			// special micro-kernel that performs a fused gemm and trsm.
@@ -230,10 +229,10 @@ void bli_trsm_ll_ker_var2
 			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) )
 			{
 				// Compute various offsets into and lengths of parts of A.
-				dim_t off_a10 = 0;
-				dim_t k_a1011 = diagoffa_i + MR;
-				dim_t k_a10   = k_a1011 - MR;
-				dim_t off_a11 = k_a10;
+				const dim_t off_a10 = 0;
+				const dim_t k_a1011 = diagoffa_i + MR;
+				const dim_t k_a10   = k_a1011 - MR;
+				const dim_t off_a11 = k_a10;
 
 				// Compute the panel stride for the current diagonal-
 				// intersecting micro-panel.
@@ -258,7 +257,7 @@ void bli_trsm_ll_ker_var2
 				{
 					a2 = a_cast;
 					b2 = b1;
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+					if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) )
 						b2 = b_cast;
 				}
 
@@ -292,7 +291,7 @@ void bli_trsm_ll_ker_var2
 				{
 					a2 = a_cast;
 					b2 = b1;
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+					if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) )
 						b2 = b_cast;
 				}
 
diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c
index 314ee3070..ebf44905b 100644
--- a/frame/3/trsm/bli_trsm_lu_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c
@@ -37,11 +37,11 @@
 
 void bli_trsm_lu_ker_var2
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
              thrinfo_t* thread_par
      )
 {
@@ -169,47 +169,44 @@ void bli_trsm_lu_ker_var2
 
 	// Compute number of primary and leftover components of the m and n
 	// dimensions.
-	dim_t n_iter = n / NR;
-	dim_t n_left = n % NR;
+	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+	const dim_t n_left = n % NR;
 
-	dim_t m_iter = m / MR;
-	dim_t m_left = m % MR;
-
-	if ( n_left ) ++n_iter;
-	if ( m_left ) ++m_iter;
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_size;
 
-	inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_size;
 
-	inc_t rstep_c = rs_c * MR * dt_size;
-	inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_size;
+	const inc_t cstep_c = cs_c * NR * dt_size;
 
-	// Save the pack schemas of A and B to the auxinfo_t object.
 	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
 	// We don't bother querying the thrinfo_t node for the 1st loop because
 	// we can't parallelize that loop in trsm due to the inter-iteration
 	// dependencies that exist.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
 	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
 
 	// Query the number of threads and thread ids for each loop.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	dim_t jr_nt  = bli_thrinfo_n_way( thread );
-	dim_t jr_tid = bli_thrinfo_work_id( thread );
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t jr_tid = bli_thrinfo_work_id( thread );
 
-	dim_t jr_start, jr_end;
-	dim_t jr_inc;
+	dim_t jr_start, jr_end, jr_inc;
 
 	// Determine the thread range and increment for the 2nd loop.
-	// NOTE: The definition of bli_thread_range_jrir() will depend on whether
+	// NOTE: The definition of bli_thread_range_slrr() will depend on whether
 	// slab or round-robin partitioning was requested at configure-time.
 	// NOTE: Parallelism in the 1st loop is unattainable due to the
 	// inter-iteration dependencies present in trsm.
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
 
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
@@ -217,7 +214,8 @@ void bli_trsm_lu_ker_var2
 		const char* b1 = b_cast + j * cstep_b;
 		      char* c1 = c_cast + j * cstep_c;
 
-		dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
+		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+		                      ? NR : n_left );
 
 		// Initialize our next panel of B to be the current panel of B.
 		const char* b2 = b1;
@@ -228,10 +226,11 @@ void bli_trsm_lu_ker_var2
 		// Loop over the m dimension (MR rows at a time).
 		for ( dim_t ib = 0; ib < m_iter; ++ib )
 		{
-			dim_t  i          = m_iter - 1 - ib;
-			doff_t diagoffa_i = diagoffa + ( doff_t )i*MR;
+			const dim_t  i          = m_iter - 1 - ib;
+			const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR;
 
-			dim_t  m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left );
+			const dim_t m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left )
+			                      ? MR : m_left );
 
 			// If the current panel of A intersects the diagonal, use a
 			// special micro-kernel that performs a fused gemm and trsm.
@@ -242,11 +241,11 @@ void bli_trsm_lu_ker_var2
 			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) )
 			{
 				// Compute various offsets into and lengths of parts of A.
-				dim_t off_a11 = diagoffa_i;
-				dim_t k_a1112 = k - off_a11;;
-				dim_t k_a11   = MR;
-				dim_t k_a12   = k_a1112 - MR;
-				dim_t off_a12 = off_a11 + k_a11;
+				const dim_t off_a11 = diagoffa_i;
+				const dim_t k_a1112 = k - off_a11;;
+				const dim_t k_a11   = MR;
+				const dim_t k_a12   = k_a1112 - MR;
+				const dim_t off_a12 = off_a11 + k_a11;
 
 				// Compute the panel stride for the current diagonal-
 				// intersecting micro-panel.
@@ -271,7 +270,7 @@ void bli_trsm_lu_ker_var2
 				{
 					a2 = a_cast;
 					b2 = b1;
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+					if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) )
 						b2 = b_cast;
 				}
 
@@ -305,7 +304,7 @@ void bli_trsm_lu_ker_var2
 				{
 					a2 = a_cast;
 					b2 = b1;
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) )
+					if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) )
 						b2 = b_cast;
 				}
 
diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c
index 42e72840e..073fe3ec0 100644
--- a/frame/3/trsm/bli_trsm_rl_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c
@@ -37,11 +37,11 @@
 
 void bli_trsm_rl_ker_var2
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
              thrinfo_t* thread_par
      )
 {
@@ -131,23 +131,23 @@ void bli_trsm_rl_ker_var2
 	  the right-hand side parameter case).
 	*/
 
-	/* Safety trap: Certain indexing within this macro-kernel does not
-	   work as intended if both MR and NR are odd. */
+	// Safety trap: Certain indexing within this macro-kernel does not
+	// work as intended if both MR and NR are odd.
 	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) ||
 	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort();
 
-	/* If any dimension is zero, return immediately. */
+	// If any dimension is zero, return immediately.
 	if ( bli_zero_dim3( m, n, k ) ) return;
 
-	/* Safeguard: If the current panel of B is entirely above its diagonal,
-	   it is implicitly zero. So we do nothing. */
+	// Safeguard: If the current panel of B is entirely above its diagonal,
+	// it is implicitly zero. So we do nothing.
 	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return;
 
-	/* If there is a zero region above where the diagonal of B intersects
-	   the left edge of the panel, adjust the pointer to A and treat this
-	   case as if the diagonal offset were zero. Note that we don't need to
-	   adjust the pointer to B since packm would have simply skipped over
-	   the region that was not stored. */
+	// If there is a zero region above where the diagonal of B intersects
+	// the left edge of the panel, adjust the pointer to A and treat this
+	// case as if the diagonal offset were zero. Note that we don't need to
+	// adjust the pointer to B since packm would have simply skipped over
+	// the region that was not stored.
 	if ( diagoffb < 0 )
 	{
 		k        += diagoffb;
@@ -155,40 +155,40 @@ void bli_trsm_rl_ker_var2
 		diagoffb  = 0;
 	}
 
-	/* If there is a zero region to the right of where the diagonal
-	   of B intersects the bottom of the panel, shrink it so that
-	   we can index to the correct place in C (corresponding to the
-	   part of the panel of B that was packed).
-	   NOTE: This is NOT being done to skip over "no-op" iterations,
-	   as with the trsm_lu macro-kernel. This MUST be done for correct
-	   execution because we use n (via n_iter) to compute diagonal and
-	   index offsets for backwards movement through B. */
+	// If there is a zero region to the right of where the diagonal
+	// of B intersects the bottom of the panel, shrink it so that
+	// we can index to the correct place in C (corresponding to the
+	// part of the panel of B that was packed).
+	// NOTE: This is NOT being done to skip over "no-op" iterations,
+	// as with the trsm_lu macro-kernel. This MUST be done for correct
+	// execution because we use n (via n_iter) to compute diagonal and
+	// index offsets for backwards movement through B.
 	if ( diagoffb + k < n )
 	{
 		n = diagoffb + k;
 	}
 
-	/* Check the k dimension, which needs to be a multiple of NR. If k
-	   isn't a multiple of NR, we adjust it higher to satisfy the micro-
-	   kernel, which is expecting to perform an NR x NR triangular solve.
-	   This adjustment of k is consistent with what happened when B was
-	   packed: all of its bottom/right edges were zero-padded, and
-	   furthermore, the panel that stores the bottom-right corner of the
-	   matrix has its diagonal extended into the zero-padded region (as
-	   identity). This allows the trsm of that bottom-right panel to
-	   proceed without producing any infs or NaNs that would infect the
-	   "good" values of the corresponding block of A. */
+	// Check the k dimension, which needs to be a multiple of NR. If k
+	// isn't a multiple of NR, we adjust it higher to satisfy the micro-
+	// kernel, which is expecting to perform an NR x NR triangular solve.
+	// This adjustment of k is consistent with what happened when B was
+	// packed: all of its bottom/right edges were zero-padded, and
+	// furthermore, the panel that stores the bottom-right corner of the
+	// matrix has its diagonal extended into the zero-padded region (as
+	// identity). This allows the trsm of that bottom-right panel to
+	// proceed without producing any infs or NaNs that would infect the
+	// "good" values of the corresponding block of A.
 	if ( k % NR != 0 ) k += NR - ( k % NR );
 
-	/* NOTE: We don't need to check that n is a multiple of PACKNR since we
-	   know that the underlying buffer was already allocated to have an n
-	   dimension that is a multiple of PACKNR, with the region between the
-	   last column and the next multiple of NR zero-padded accordingly. */
+	// NOTE: We don't need to check that n is a multiple of PACKNR since we
+	// know that the underlying buffer was already allocated to have an n
+	// dimension that is a multiple of PACKNR, with the region between the
+	// last column and the next multiple of NR zero-padded accordingly.
 
 	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
 
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */
+	// Compute number of primary and leftover components of the m and n
+	// dimensions.
 	dim_t n_iter = n / NR;
 	dim_t n_left = n % NR;
 
@@ -198,7 +198,7 @@ void bli_trsm_rl_ker_var2
 	if ( n_left ) ++n_iter;
 	if ( m_left ) ++m_iter;
 
-	/* Determine some increments used to step through A, B, and C. */
+	// Determine some increments used to step through A, B, and C.
 	inc_t rstep_a = ps_a * dt_size;
 
 	inc_t cstep_b = ps_b * dt_size;
@@ -206,17 +206,18 @@ void bli_trsm_rl_ker_var2
 	inc_t rstep_c = rs_c * MR * dt_size;
 	inc_t cstep_c = cs_c * NR * dt_size;
 
-	/* Save the pack schemas of A and B to the auxinfo_t object.
-	   NOTE: We swap the values for A and B since the triangular
-	   "A" matrix is actually contained within B. */
 	auxinfo_t aux;
+
+	// Save the pack schemas of A and B to the auxinfo_t object.
+	// NOTE: We swap the values for A and B since the triangular
+	// "A" matrix is actually contained within B.
 	bli_auxinfo_set_schema_a( schema_b, &aux );
 	bli_auxinfo_set_schema_b( schema_a, &aux );
 
 	const char* b1 = b_cast;
 	      char* c1 = c_cast;
 
-	/* Loop over the n dimension (NR columns at a time). */
+	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t jb = 0; jb < n_iter; ++jb )
 	{
 		dim_t  j          = n_iter - 1 - jb;
@@ -227,50 +228,50 @@ void bli_trsm_rl_ker_var2
 		const char* a1         = a_cast;
 		      char* c11        = c1 + (n_iter-1)*cstep_c;
 
-		/* Initialize our next panel of B to be the current panel of B. */
+		// Initialize our next panel of B to be the current panel of B.
 		const char* b2 = b1;
 
-		/* If the current panel of B intersects the diagonal, use a
-		   special micro-kernel that performs a fused gemm and trsm.
-		   If the current panel of B resides below the diagonal, use a
-		   a regular gemm micro-kernel. Otherwise, if it is above the
-		   diagonal, it was not packed (because it is implicitly zero)
-		   and so we do nothing. */
+		// If the current panel of B intersects the diagonal, use a
+		// special micro-kernel that performs a fused gemm and trsm.
+		// If the current panel of B resides below the diagonal, use a
+		// regular gemm micro-kernel. Otherwise, if it is above the
+		// diagonal, it was not packed (because it is implicitly zero)
+		// and so we do nothing.
 		if ( bli_intersects_diag_n( diagoffb_j, k, NR ) )
 		{
-			/* Determine the offset to and length of the panel that was packed
-			   so we can index into the corresponding location in A. */
+			// Determine the offset to and length of the panel that was packed
+			// so we can index into the corresponding location in A.
 			dim_t off_b11   = bli_max( -diagoffb_j, 0 );
 			dim_t k_b1121   = k - off_b11;
 			dim_t k_b11     = NR;
 			dim_t k_b21     = k_b1121 - NR;
 			dim_t off_b21   = off_b11 + k_b11;
 
-			/* Compute the addresses of the triangular block B11 and the
-			   panel B21. */
+			// Compute the addresses of the triangular block B11 and the
+			// panel B21.
 			const char* b11 = b1;
 			const char* b21 = b1 + k_b11 * PACKNR * dt_size;
-			/*b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, 1 );*/
+			//b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, 1 );
 
-			/* Compute the panel stride for the current micro-panel. */
+			// Compute the panel stride for the current micro-panel.
 			inc_t ps_b_cur  = k_b1121 * PACKNR;
 				  ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
 				  ps_b_cur *= dt_size;
 
-			/* Loop over the m dimension (MR rows at a time). */
+			// Loop over the m dimension (MR rows at a time).
 			for ( dim_t i = 0; i < m_iter; ++i )
 			{
 				if ( bli_trsm_my_iter_rr( i, thread ) ){
 
 				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
 
-				/* Compute the addresses of the A11 block and A12 panel. */
+				// Compute the addresses of the A11 block and A12 panel.
 				const char* a11  = a1 + off_b11 * PACKMR * dt_size;
 				const char* a12  = a1 + off_b21 * PACKMR * dt_size;
 
-				/* Compute the addresses of the next panels of A and B. */
+				// Compute the addresses of the next panels of A and B.
 				const char* a2 = a1;
-				/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */
+				//if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
 				if ( i + bli_thrinfo_num_threads(thread) >= m_iter )
 				{
 					a2 = a_cast;
@@ -279,9 +280,9 @@ void bli_trsm_rl_ker_var2
 						b2 = b_cast;
 				}
 
-				/* Save addresses of next panels of A and B to the auxinfo_t
-				   object. NOTE: We swap the values for A and B since the
-				   triangular "A" matrix is actually contained within B. */
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object. NOTE: We swap the values for A and B since the
+				// triangular "A" matrix is actually contained within B.
 				bli_auxinfo_set_next_a( b2, &aux );
 				bli_auxinfo_set_next_b( a2, &aux );
 
@@ -310,16 +311,16 @@ void bli_trsm_rl_ker_var2
 		}
 		else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) )
 		{
-			/* Loop over the m dimension (MR rows at a time). */
+			// Loop over the m dimension (MR rows at a time).
 			for ( dim_t i = 0; i < m_iter; ++i )
 			{
 				if ( bli_trsm_my_iter_rr( i, thread ) ){
 
 				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
 
-				/* Compute the addresses of the next panels of A and B. */
+				// Compute the addresses of the next panels of A and B.
 				const char* a2 = a1;
-				/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */
+				//if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) )
 				if ( i + bli_thrinfo_num_threads(thread) >= m_iter )
 				{
 					a2 = a_cast;
@@ -328,13 +329,13 @@ void bli_trsm_rl_ker_var2
 						b2 = b_cast;
 				}
 
-				/* Save addresses of next panels of A and B to the auxinfo_t
-				   object. NOTE: We swap the values for A and B since the
-				   triangular "A" matrix is actually contained within B. */
+				// Save addresses of next panels of A and B to the auxinfo_t
+				// object. NOTE: We swap the values for A and B since the
+				// triangular "A" matrix is actually contained within B.
 				bli_auxinfo_set_next_a( b2, &aux );
 				bli_auxinfo_set_next_b( a2, &aux );
 
-				/* Invoke the gemm micro-kernel. */
+				// Invoke the gemm micro-kernel.
 				gemm_ukr
 				(
 				  m_cur,
diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c
index 6cc9a8bbb..a05e94494 100644
--- a/frame/3/trsm/bli_trsm_ru_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c
@@ -37,11 +37,11 @@
 
 void bli_trsm_ru_ker_var2
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
              thrinfo_t* thread_par
      )
 {
@@ -244,7 +244,7 @@ void bli_trsm_ru_ker_var2
 			// block B11.
 			const char* b01 = b1;
 			const char* b11 = b1 + k_b01 * PACKNR * dt_size;
-			//b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, 1 );*/
+			//b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, 1 );
 
 			// Compute the panel stride for the current micro-panel.
 			inc_t ps_b_cur  = k_b0111 * PACKNR;
diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h
index a498e687e..4d7e72b43 100644
--- a/frame/3/trsm/bli_trsm_var.h
+++ b/frame/3/trsm/bli_trsm_var.h
@@ -48,7 +48,7 @@ void PASTEMAC0(opname) \
        const obj_t*     c, \
        const cntx_t*    cntx, \
        const cntl_t*    cntl, \
-             thrinfo_t* thread  \
+             thrinfo_t* thread_par  \
      );
 
 GENPROT( trsm_blk_var1 )
diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c
index 39c5372f3..dfeefcd9d 100644
--- a/frame/3/trsm/bli_trsm_xx_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c
@@ -43,12 +43,12 @@ static l3_var_oft vars[2][2] =
 
 void bli_trsm_xx_ker_var2
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
-             thrinfo_t* thread
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
      )
 {
 	dim_t side;
@@ -81,7 +81,7 @@ void bli_trsm_xx_ker_var2
 	  c,
 	  cntx,
 	  cntl,
-	  thread
+	  thread_par
 	);
 }
 
diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c
index 1f00537d5..3fc76b978 100644
--- a/frame/base/bli_info.c
+++ b/frame/base/bli_info.c
@@ -156,7 +156,7 @@ gint_t bli_info_get_enable_hpx_as_default( void )
 	return 0;
 #endif
 }
-gint_t bli_info_get_thread_part_jrir_slab( void )
+gint_t bli_info_get_thread_jrir_slab( void )
 {
 #ifdef BLIS_ENABLE_JRIR_SLAB
 	return 1;
@@ -164,7 +164,7 @@ gint_t bli_info_get_thread_part_jrir_slab( void )
 	return 0;
 #endif
 }
-gint_t bli_info_get_thread_part_jrir_rr( void )
+gint_t bli_info_get_thread_jrir_rr( void )
 {
 #ifdef BLIS_ENABLE_JRIR_RR
 	return 1;
@@ -172,6 +172,14 @@ gint_t bli_info_get_thread_part_jrir_rr( void )
 	return 0;
 #endif
 }
+gint_t bli_info_get_thread_jrir_tlb( void )
+{
+#ifdef BLIS_ENABLE_JRIR_TLB
+	return 1;
+#else
+	return 0;
+#endif
+}
 gint_t bli_info_get_enable_memkind( void )
 {
 #ifdef BLIS_ENABLE_MEMKIND
diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h
index 08a99daea..300b3f584 100644
--- a/frame/base/bli_info.h
+++ b/frame/base/bli_info.h
@@ -74,8 +74,9 @@ BLIS_EXPORT_BLIS gint_t bli_info_get_enable_hpx( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp_as_default( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads_as_default( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_hpx_as_default( void );
-BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void );
-BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void );
+BLIS_EXPORT_BLIS gint_t bli_info_get_thread_jrir_slab( void );
+BLIS_EXPORT_BLIS gint_t bli_info_get_thread_jrir_rr( void );
+BLIS_EXPORT_BLIS gint_t bli_info_get_thread_jrir_tlb( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void );
 
diff --git a/frame/base/bli_prune.c b/frame/base/bli_prune.c
index ebe5c2365..31c3d86d2 100644
--- a/frame/base/bli_prune.c
+++ b/frame/base/bli_prune.c
@@ -38,9 +38,28 @@
 void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p,
                              obj_t* s, mdim_t mdim_s )
 {
-	// If the primary object is general, it has no structure, and
+	// NOTE: This function is not safe to use on packed objects because it does
+	// not currently take into account the atomicity of the packed micropanel
+	// widths (i.e., the register blocksize). That is, this function will prune
+	// greedily, without regard to whether doing so would prune off part of a
+	// micropanel *which has already been packed* and "assigned" to a thread for
+	// inclusion in the computation. In order to be safe for use on packed
+	// matrices, this function would need to prune only up to the nearest
+	// micropanel edge (and to the corresponding location within the secondary
+	// matrix), which may not coincide exactly with the diagonal offset.
+	if ( bli_obj_is_packed( p ) || bli_obj_is_packed( s ) ) bli_abort();
+
+	// If the primary object is general AND dense, it has no structure, and
 	// therefore, no unreferenced parts.
-	if ( bli_obj_is_general( p ) ) return;
+	// NOTE: There is at least one situation where the matrix is general but
+	// its uplo_t value is lower or upper: gemmt. This operation benefits from
+	// pruning unreferenced regions the same way herk/her2k/syrk/syr2k would.
+	// Because of gemmt, and any future similar operations, we limit early
+	// returns to situations where the primary object has a dense uplo_t value
+	// IN ADDITION TO general structure (rather than only checking for general
+	// structure).
+	if ( bli_obj_is_general( p ) &&
+	     bli_obj_is_dense( p ) ) return;
 
 	// If the primary object is BLIS_ZEROS, set the dimensions so that the
 	// matrix is empty. This is not strictly needed but rather a minor
@@ -116,21 +135,13 @@ void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p,
 		if         ( bli_is_m_dim( mdim_p ) )    q = m;
 		else /* if ( bli_is_n_dim( mdim_p ) ) */ q = n;
 
-		// Update the affected objects in case anything changed. Notice that
-		// it is okay to update the dimension and diagonal offset fields of
-		// packed primary objects, as long as we do so in tandem with the
-		// secondary object to maintain conformality. This just means that
-		// the "ignore-able" zero region is skipped over here, rather than
-		// within the macro-kernel.
+		// Update the affected objects' diagonal offset, dimensions, and row
+		// and column offsets, in case anything changed.
 		bli_obj_set_diag_offset( diagoff_p, p );
 		bli_obj_set_dim( mdim_p, q, p );
 		bli_obj_set_dim( mdim_s, q, s );
-
-		// Only update the affected offset fields if the object in question
-		// is NOT a packed object. Otherwise, bli_obj_buffer_at_off() will
-		// compute the wrong address within the macro-kernel object wrapper.
-		if ( !bli_obj_is_packed( p ) ) { bli_obj_inc_off( mdim_p, off_inc, p ); }
-		if ( !bli_obj_is_packed( s ) ) { bli_obj_inc_off( mdim_s, off_inc, s ); }
+		bli_obj_inc_off( mdim_p, off_inc, p );
+		bli_obj_inc_off( mdim_s, off_inc, s );
 	}
 }
 
diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c
index 786998f23..64124c682 100644
--- a/frame/base/bli_rntm.c
+++ b/frame/base/bli_rntm.c
@@ -143,12 +143,44 @@ void bli_rntm_set_ways_for_op
 	// kind of information is already stored in the rntm_t object.
 	bli_rntm_factorize( m, n, k, rntm );
 
-#if 0
-printf( "bli_rntm_set_ways_for_op()\n" );
-bli_rntm_print( rntm );
-#endif
+	#if 0
+	printf( "bli_rntm_set_ways_for_op()\n" );
+	bli_rntm_print( rntm );
+	#endif
 
 	// Now modify the number of ways, if necessary, based on the operation.
+
+	// Consider gemm (hemm, symm), gemmt (herk, her2k, syrk, syr2k), and
+	// trmm (trmm, trmm3).
+	if (
+#ifdef BLIS_ENABLE_JRIR_TLB
+	     l3_op == BLIS_GEMM  ||
+	     l3_op == BLIS_GEMMT ||
+	     l3_op == BLIS_TRMM  ||
+#endif
+	     FALSE
+	   )
+	{
+		dim_t jc = bli_rntm_jc_ways( rntm );
+		dim_t pc = bli_rntm_pc_ways( rntm );
+		dim_t ic = bli_rntm_ic_ways( rntm );
+		dim_t jr = bli_rntm_jr_ways( rntm );
+		dim_t ir = bli_rntm_ir_ways( rntm );
+
+		// If TLB is enabled for gemm, gemmt, or trmm, redirect any ir loop
+		// parallelism into the jr loop.
+		bli_rntm_set_ways_only
+		(
+		  jc,
+		  pc,
+		  ic,
+		  jr * ir,
+		  1,
+		  rntm
+		);
+	}
+
+	// Consider trmm, trmm3, trsm.
 	if ( l3_op == BLIS_TRMM ||
 	     l3_op == BLIS_TRSM )
 	{
diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h
index bf9319f4f..9e9d47699 100644
--- a/frame/include/bli_config_macro_defs.h
+++ b/frame/include/bli_config_macro_defs.h
@@ -36,6 +36,16 @@
 #ifndef BLIS_CONFIG_MACRO_DEFS_H
 #define BLIS_CONFIG_MACRO_DEFS_H
 
+// NOTE: This file should ONLY contain processing of macros that are set by
+// configure and output into bli_config.h. Any other macro processing --
+// especially such as for those macros that are expected to be optionally
+// set within a configuration's bli_family_<conf>.h header -- MUST be placed
+// in bli_kernel_macro_defs.h instead. The reason: bli_arch_config.h (which
+// #includes the configuration's bli_family_<conf>.h header) is #included
+// much later in blis.h than this file (bli_config_macro_defs.h), and so any
+// macros set in bli_family_<conf>.h would have no effect on the processing
+// that happens below.
+
 
 // -- INTEGER PROPERTIES -------------------------------------------------------
 
diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h
index d273c353a..8c0f1cb14 100644
--- a/frame/include/bli_kernel_macro_defs.h
+++ b/frame/include/bli_kernel_macro_defs.h
@@ -151,6 +151,7 @@
 #define BLIS_FREE_USER                   free
 #endif
 
+
 // -- Other system-related definitions -----------------------------------------
 
 // Size of a virtual memory page. This is used to align blocks within the
@@ -245,6 +246,7 @@
 #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN   0
 #endif
 
+
 // -- MR and NR blocksizes (only for reference kernels) ------------------------
 
 // The build system defines BLIS_IN_REF_KERNEL, but only when compiling
diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h
index 1822065da..0865b11e9 100644
--- a/frame/include/bli_param_macro_defs.h
+++ b/frame/include/bli_param_macro_defs.h
@@ -927,7 +927,6 @@ BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id )
 }
 
 
-
 // index-related
 
 BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
@@ -954,7 +953,7 @@ BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
 	       ( i != 0 || n_left == 0 );
 }
 
-BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
+BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter )
 {
 	return ( bool )
 	       ( i == end_iter - 1 );
@@ -966,15 +965,59 @@ BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t
 	       ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
 }
 
-BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
+BLIS_INLINE bool bli_is_last_iter_slrr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
 {
 #ifdef BLIS_ENABLE_JRIR_SLAB
-	return bli_is_last_iter_sl( i, end_iter, tid, nth );
+	return bli_is_last_iter_sl( i, end_iter ); // Slab test uses only i and end_iter; tid and nth are unused here.
 #else // BLIS_ENABLE_JRIR_RR
 	return bli_is_last_iter_rr( i, end_iter, tid, nth );
 #endif
 }
 
+BLIS_INLINE bool bli_is_last_iter_l( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
+{ // Last-iteration test under the "_l" name (presumably the lower-stored case -- confirm against callers).
+	return bli_is_last_iter_slrr( i, end_iter, tid, nth ); // Pure alias: all four arguments are forwarded unchanged.
+}
+
+BLIS_INLINE bool bli_is_last_iter_u( doff_t diagoff, dim_t mr, dim_t nr, inc_t inc )
+{ // Returns the strictly-below-diagonal test for an mr x nr block whose diagonal offset is diagoff shifted by inc*mr.
+	return bli_is_strictly_below_diag_n( diagoff + inc*mr, mr, nr ); // NOTE(review): presumably TRUE means the block inc steps ahead is outside the stored region, making this iteration the last -- confirm.
+}
+
+BLIS_INLINE bool bli_is_last_iter_tlb_l( dim_t i, dim_t end_iter )
+{ // Last-iteration test under the "_tlb_l" name; identical to the slab test.
+	return bli_is_last_iter_sl( i, end_iter ); // Forwards both arguments unchanged.
+}
+
+BLIS_INLINE bool bli_is_last_iter_tlb_u( doff_t diagoff, dim_t mr, dim_t nr )
+{ // Same expression as bli_is_last_iter_u() with the increment fixed at 1.
+	return bli_is_strictly_below_diag_n( diagoff + 1*mr, mr, nr ); // NOTE(review): presumably tests the block one mr-step ahead -- confirm against callers.
+}
+
+BLIS_INLINE bool bli_is_my_iter_sl( dim_t i, dim_t st, dim_t en )
+{ // Slab ownership test for iteration i against the range bounds st and en.
+	return ( st <= i && i < en ); // TRUE iff i lies in the half-open range [st,en).
+}
+
+BLIS_INLINE bool bli_is_my_iter_rr( dim_t i, dim_t work_id, dim_t n_way )
+{ // Round-robin ownership test for iteration i.
+	return ( i % n_way == work_id % n_way ); // TRUE iff i and work_id are congruent modulo n_way.
+}
+
+BLIS_INLINE bool bli_is_my_iter( dim_t i, dim_t st, dim_t en, dim_t work_id, dim_t n_way )
+{ // Ownership test for iteration i; the variant is selected at compile time below.
+	// NOTE: This function is (as of this writing) only called from packm.
+	// If the structure of the cpp macros below is ever changed, make sure
+	// it is still consistent with that of bli_thread_range_slrr() since
+	// these functions are used together in packm.
+
+#ifdef BLIS_ENABLE_JRIR_RR
+	return bli_is_my_iter_rr( i, work_id, n_way ); // Round-robin: st and en are ignored in this branch.
+#else // ifdef ( _SLAB || _TLB )
+	return bli_is_my_iter_sl( i, st, en ); // Range-based: work_id and n_way are ignored in this branch.
+#endif
+}
+
 
 // packbuf_t-related
 
diff --git a/frame/include/blis.h b/frame/include/blis.h
index 98ebee878..70005e57d 100644
--- a/frame/include/blis.h
+++ b/frame/include/blis.h
@@ -83,6 +83,10 @@ extern "C" {
 // -- Threading definitions --
 
 #include "bli_thread.h"
+#include "bli_thread_range.h"
+#include "bli_thread_range_slab_rr.h"
+#include "bli_thread_range_tlb.h"
+
 #include "bli_pthread.h"
 
 
diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c
index 4cba76b20..d41f37053 100644
--- a/frame/thread/bli_thread.c
+++ b/frame/thread/bli_thread.c
@@ -108,907 +108,6 @@ void bli_thread_launch
 
 // -----------------------------------------------------------------------------
 
-void bli_thread_range_sub
-     (
-       const thrinfo_t* thread,
-             dim_t      n,
-             dim_t      bf,
-             bool       handle_edge_low,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	dim_t      n_way      = bli_thrinfo_n_way( thread );
-
-	if ( n_way == 1 ) { *start = 0; *end = n; return; }
-
-	dim_t      work_id    = bli_thrinfo_work_id( thread );
-
-	dim_t      all_start  = 0;
-	dim_t      all_end    = n;
-
-	dim_t      size       = all_end - all_start;
-
-	dim_t      n_bf_whole = size / bf;
-	dim_t      n_bf_left  = size % bf;
-
-	dim_t      n_bf_lo    = n_bf_whole / n_way;
-	dim_t      n_bf_hi    = n_bf_whole / n_way;
-
-	// In this function, we partition the space between all_start and
-	// all_end into n_way partitions, each a multiple of block_factor
-	// with the exception of the one partition that recieves the
-	// "edge" case (if applicable).
-	//
-	// Here are examples of various thread partitionings, in units of
-	// the block_factor, when n_way = 4. (A '+' indicates the thread
-	// that receives the leftover edge case (ie: n_bf_left extra
-	// rows/columns in its sub-range).
-	//                                        (all_start ... all_end)
-	// n_bf_whole  _left  hel  n_th_lo  _hi   thr0  thr1  thr2  thr3
-	//         12     =0    f        0    4      3     3     3     3
-	//         12     >0    f        0    4      3     3     3     3+
-	//         13     >0    f        1    3      4     3     3     3+
-	//         14     >0    f        2    2      4     4     3     3+
-	//         15     >0    f        3    1      4     4     4     3+
-	//         15     =0    f        3    1      4     4     4     3
-	//
-	//         12     =0    t        4    0      3     3     3     3
-	//         12     >0    t        4    0      3+    3     3     3
-	//         13     >0    t        3    1      3+    3     3     4
-	//         14     >0    t        2    2      3+    3     4     4
-	//         15     >0    t        1    3      3+    4     4     4
-	//         15     =0    t        1    3      3     4     4     4
-
-	// As indicated by the table above, load is balanced as equally
-	// as possible, even in the presence of an edge case.
-
-	// First, we must differentiate between cases where the leftover
-	// "edge" case (n_bf_left) should be allocated to a thread partition
-	// at the low end of the index range or the high end.
-
-	if ( handle_edge_low == FALSE )
-	{
-		// Notice that if all threads receive the same number of
-		// block_factors, those threads are considered "high" and
-		// the "low" thread group is empty.
-		dim_t n_th_lo = n_bf_whole % n_way;
-		//dim_t n_th_hi = n_way - n_th_lo;
-
-		// If some partitions must have more block_factors than others
-		// assign the slightly larger partitions to lower index threads.
-		if ( n_th_lo != 0 ) n_bf_lo += 1;
-
-		// Compute the actual widths (in units of rows/columns) of
-		// individual threads in the low and high groups.
-		dim_t size_lo = n_bf_lo * bf;
-		dim_t size_hi = n_bf_hi * bf;
-
-		// Precompute the starting indices of the low and high groups.
-		dim_t lo_start = all_start;
-		dim_t hi_start = all_start + n_th_lo * size_lo;
-
-		// Compute the start and end of individual threads' ranges
-		// as a function of their work_ids and also the group to which
-		// they belong (low or high).
-		if ( work_id < n_th_lo )
-		{
-			*start = lo_start + (work_id  ) * size_lo;
-			*end   = lo_start + (work_id+1) * size_lo;
-		}
-		else // if ( n_th_lo <= work_id )
-		{
-			*start = hi_start + (work_id-n_th_lo  ) * size_hi;
-			*end   = hi_start + (work_id-n_th_lo+1) * size_hi;
-
-			// Since the edge case is being allocated to the high
-			// end of the index range, we have to advance the last
-			// thread's end.
-			if ( work_id == n_way - 1 ) *end += n_bf_left;
-		}
-	}
-	else // if ( handle_edge_low == TRUE )
-	{
-		// Notice that if all threads receive the same number of
-		// block_factors, those threads are considered "low" and
-		// the "high" thread group is empty.
-		dim_t n_th_hi = n_bf_whole % n_way;
-		dim_t n_th_lo = n_way - n_th_hi;
-
-		// If some partitions must have more block_factors than others
-		// assign the slightly larger partitions to higher index threads.
-		if ( n_th_hi != 0 ) n_bf_hi += 1;
-
-		// Compute the actual widths (in units of rows/columns) of
-		// individual threads in the low and high groups.
-		dim_t size_lo = n_bf_lo * bf;
-		dim_t size_hi = n_bf_hi * bf;
-
-		// Precompute the starting indices of the low and high groups.
-		dim_t lo_start = all_start;
-		dim_t hi_start = all_start + n_th_lo * size_lo
-		                           + n_bf_left;
-
-		// Compute the start and end of individual threads' ranges
-		// as a function of their work_ids and also the group to which
-		// they belong (low or high).
-		if ( work_id < n_th_lo )
-		{
-			*start = lo_start + (work_id  ) * size_lo;
-			*end   = lo_start + (work_id+1) * size_lo;
-
-			// Since the edge case is being allocated to the low
-			// end of the index range, we have to advance the
-			// starts/ends accordingly.
-			if ( work_id == 0 )   *end   += n_bf_left;
-			else                { *start += n_bf_left;
-			                      *end   += n_bf_left; }
-		}
-		else // if ( n_th_lo <= work_id )
-		{
-			*start = hi_start + (work_id-n_th_lo  ) * size_hi;
-			*end   = hi_start + (work_id-n_th_lo+1) * size_hi;
-		}
-	}
-}
-
-siz_t bli_thread_range_l2r
-     (
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const blksz_t*   bmult,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	num_t dt = bli_obj_dt( a );
-	dim_t m  = bli_obj_length_after_trans( a );
-	dim_t n  = bli_obj_width_after_trans( a );
-	dim_t bf = bli_blksz_get_def( dt, bmult );
-
-	bli_thread_range_sub( thr, n, bf,
-	                      FALSE, start, end );
-
-	return m * ( *end - *start );
-}
-
-siz_t bli_thread_range_r2l
-     (
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const blksz_t*   bmult,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	num_t dt = bli_obj_dt( a );
-	dim_t m  = bli_obj_length_after_trans( a );
-	dim_t n  = bli_obj_width_after_trans( a );
-	dim_t bf = bli_blksz_get_def( dt, bmult );
-
-	bli_thread_range_sub( thr, n, bf,
-	                      TRUE, start, end );
-
-	return m * ( *end - *start );
-}
-
-siz_t bli_thread_range_t2b
-     (
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const blksz_t*   bmult,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	num_t dt = bli_obj_dt( a );
-	dim_t m  = bli_obj_length_after_trans( a );
-	dim_t n  = bli_obj_width_after_trans( a );
-	dim_t bf = bli_blksz_get_def( dt, bmult );
-
-	bli_thread_range_sub( thr, m, bf,
-	                      FALSE, start, end );
-
-	return n * ( *end - *start );
-}
-
-siz_t bli_thread_range_b2t
-     (
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const blksz_t*   bmult,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	num_t dt = bli_obj_dt( a );
-	dim_t m  = bli_obj_length_after_trans( a );
-	dim_t n  = bli_obj_width_after_trans( a );
-	dim_t bf = bli_blksz_get_def( dt, bmult );
-
-	bli_thread_range_sub( thr, m, bf,
-	                      TRUE, start, end );
-
-	return n * ( *end - *start );
-}
-
-// -----------------------------------------------------------------------------
-
-dim_t bli_thread_range_width_l
-     (
-       doff_t diagoff_j,
-       dim_t  m,
-       dim_t  n_j,
-       dim_t  j,
-       dim_t  n_way,
-       dim_t  bf,
-       dim_t  bf_left,
-       double area_per_thr,
-       bool   handle_edge_low
-     )
-{
-	dim_t width;
-
-	// In this function, we assume that we are somewhere in the process of
-	// partitioning an m x n lower-stored region (with arbitrary diagonal
-	// offset) n_ways along the n dimension (into column panels). The value
-	// j identifies the left-to-right subpartition index (from 0 to n_way-1)
-	// of the subpartition whose width we are about to compute using the
-	// area per thread determined by the caller. n_j is the number of
-	// columns in the remaining region of the matrix being partitioned,
-	// and diagoff_j is that region's diagonal offset.
-
-	// If this is the last subpartition, the width is simply equal to n_j.
-	// Note that this statement handles cases where the "edge case" (if
-	// one exists) is assigned to the high end of the index range (ie:
-	// handle_edge_low == FALSE).
-	if ( j == n_way - 1 ) return n_j;
-
-	// At this point, we know there are at least two subpartitions left.
-	// We also know that IF the submatrix contains a completely dense
-	// rectangular submatrix, it will occur BEFORE the triangular (or
-	// trapezoidal) part.
-
-	// Here, we implement a somewhat minor load balancing optimization
-	// that ends up getting employed only for relatively small matrices.
-	// First, recall that all subpartition widths will be some multiple
-	// of the blocking factor bf, except perhaps either the first or last
-	// subpartition, which will receive the edge case, if it exists.
-	// Also recall that j represents the current thread (or thread group,
-	// or "caucus") for which we are computing a subpartition width.
-	// If n_j is sufficiently small that we can only allocate bf columns
-	// to each of the remaining threads, then we set the width to bf. We
-	// do not allow the subpartition width to be less than bf, so, under
-	// some conditions, if n_j is small enough, some of the reamining
-	// threads may not get any work. For the purposes of this lower bound
-	// on work (ie: width >= bf), we allow the edge case to count as a
-	// "full" set of bf columns.
-	{
-		dim_t n_j_bf = n_j / bf + ( bf_left > 0 ? 1 : 0 );
-
-		if ( n_j_bf <= n_way - j )
-		{
-			if ( j == 0 && handle_edge_low )
-				width = ( bf_left > 0 ? bf_left : bf );
-			else
-				width = bf;
-
-			// Make sure that the width does not exceed n_j. This would
-			// occur if and when n_j_bf < n_way - j; that is, when the
-			// matrix being partitioned is sufficiently small relative to
-			// n_way such that there is not even enough work for every
-			// (remaining) thread to get bf (or bf_left) columns. The
-			// net effect of this safeguard is that some threads may get
-			// assigned empty ranges (ie: no work), which of course must
-			// happen in some situations.
-			if ( width > n_j ) width = n_j;
-
-			return width;
-		}
-	}
-
-	// This block computes the width assuming that we are entirely within
-	// a dense rectangle that precedes the triangular (or trapezoidal)
-	// part.
-	{
-		// First compute the width of the current panel under the
-		// assumption that the diagonal offset would not intersect.
-		width = ( dim_t )bli_round( ( double )area_per_thr / ( double )m );
-
-		// Adjust the width, if necessary. Specifically, we may need
-		// to allocate the edge case to the first subpartition, if
-		// requested; otherwise, we just need to ensure that the
-		// subpartition is a multiple of the blocking factor.
-		if ( j == 0 && handle_edge_low )
-		{
-			if ( width % bf != bf_left ) width += bf_left - ( width % bf );
-		}
-		else // if interior case
-		{
-			// Round up to the next multiple of the blocking factor.
-			//if ( width % bf != 0       ) width += bf      - ( width % bf );
-			// Round to the nearest multiple of the blocking factor.
-			if ( width % bf != 0       ) width = bli_round_to_mult( width, bf );
-		}
-	}
-
-	// We need to recompute width if the panel, according to the width
-	// as currently computed, would intersect the diagonal.
-	if ( diagoff_j < width )
-	{
-		dim_t offm_inc, offn_inc;
-
-		// Prune away the unstored region above the diagonal, if it exists.
-		// Note that the entire region was pruned initially, so we know that
-		// we don't need to try to prune the right side. (Also, we discard
-		// the offset deltas since we don't need to actually index into the
-		// subpartition.)
-		bli_prune_unstored_region_top_l( &diagoff_j, &m, &n_j, &offm_inc );
-		//bli_prune_unstored_region_right_l( &diagoff_j, &m, &n_j, &offn_inc );
-
-		// We don't need offm_inc, offn_inc here. These statements should
-		// prevent compiler warnings.
-		( void )offm_inc;
-		( void )offn_inc;
-
-		// Prepare to solve a quadratic equation to find the width of the
-		// current (jth) subpartition given the m dimension, diagonal offset,
-		// and area.
-		// NOTE: We know that the +/- in the quadratic formula must be a +
-		// here because we know that the desired solution (the subpartition
-		// width) will be smaller than (m + diagoff), not larger. If you
-		// don't believe me, draw a picture!
-		const double a = -0.5;
-		const double b = ( double )m + ( double )diagoff_j + 0.5;
-		const double c = -0.5 * (   ( double )diagoff_j *
-		                          ( ( double )diagoff_j + 1.0 )
-		                        ) - area_per_thr;
-		const double r = b * b - 4.0 * a * c;
-
-		// If the quadratic solution is not imaginary, round it and use that
-		// as our width, but make sure it didn't round to zero. Otherwise,
-		// discard the quadratic solution and leave width, as previously
-		// computed, unchanged.
-		if ( r >= 0.0 )
-		{
-			const double x = ( -b + sqrt( r ) ) / ( 2.0 * a );
-
-			width = ( dim_t )bli_round( x );
-			if ( width == 0 ) width = 1;
-		}
-
-		// Adjust the width, if necessary.
-		if ( j == 0 && handle_edge_low )
-		{
-			if ( width % bf != bf_left ) width += bf_left - ( width % bf );
-		}
-		else // if interior case
-		{
-			// Round up to the next multiple of the blocking factor.
-			//if ( width % bf != 0       ) width += bf      - ( width % bf );
-			// Round to the nearest multiple of the blocking factor.
-			if ( width % bf != 0       ) width = bli_round_to_mult( width, bf );
-		}
-	}
-
-	// Make sure that the width, after being adjusted, does not cause the
-	// subpartition to exceed n_j.
-	if ( width > n_j ) width = n_j;
-
-	return width;
-}
-
-siz_t bli_find_area_trap_l
-     (
-       dim_t  m,
-       dim_t  n,
-       doff_t diagoff
-     )
-{
-	dim_t  offm_inc = 0;
-	dim_t  offn_inc = 0;
-	double tri_area;
-	double area;
-
-	// Prune away any rectangular region above where the diagonal
-	// intersects the left edge of the subpartition, if it exists.
-	bli_prune_unstored_region_top_l( &diagoff, &m, &n, &offm_inc );
-
-	// Prune away any rectangular region to the right of where the
-	// diagonal intersects the bottom edge of the subpartition, if
-	// it exists. (This shouldn't ever be needed, since the caller
-	// would presumably have already performed rightward pruning,
-	// but it's here just in case.)
-	bli_prune_unstored_region_right_l( &diagoff, &m, &n, &offn_inc );
-
-	( void )offm_inc;
-	( void )offn_inc;
-
-	// Compute the area of the empty triangle so we can subtract it
-	// from the area of the rectangle that bounds the subpartition.
-	if ( bli_intersects_diag_n( diagoff, m, n ) )
-	{
-		double tri_dim = ( double )( n - diagoff - 1 );
-		tri_area = tri_dim * ( tri_dim + 1.0 ) / 2.0;
-	}
-	else
-	{
-		// If the diagonal does not intersect the trapezoid, then
-		// we can compute the area as a simple rectangle.
-		tri_area = 0.0;
-	}
-
-	area = ( double )m * ( double )n - tri_area;
-
-	return ( siz_t )area;
-}
-
-// -----------------------------------------------------------------------------
-
-siz_t bli_thread_range_weighted_sub
-     (
-       const thrinfo_t* thread,
-             doff_t     diagoff,
-             uplo_t     uplo,
-             dim_t      m,
-             dim_t      n,
-             dim_t      bf,
-             bool       handle_edge_low,
-             dim_t*     j_start_thr,
-             dim_t*     j_end_thr
-     )
-{
-	dim_t      n_way   = bli_thrinfo_n_way( thread );
-	dim_t      my_id   = bli_thrinfo_work_id( thread );
-
-	dim_t      bf_left = n % bf;
-
-	dim_t      j;
-
-	dim_t      off_j;
-	doff_t     diagoff_j;
-	dim_t      n_left;
-
-	dim_t      width_j;
-
-	dim_t      offm_inc, offn_inc;
-
-	double     tri_dim, tri_area;
-	double     area_total, area_per_thr;
-
-	siz_t      area = 0;
-
-	// In this function, we assume that the caller has already determined
-	// that (a) the diagonal intersects the submatrix, and (b) the submatrix
-	// is either lower- or upper-stored.
-
-	if ( bli_is_lower( uplo ) )
-	{
-		// Prune away the unstored region above the diagonal, if it exists,
-		// and then to the right of where the diagonal intersects the bottom,
-		// if it exists. (Also, we discard the offset deltas since we don't
-		// need to actually index into the subpartition.)
-		bli_prune_unstored_region_top_l( &diagoff, &m, &n, &offm_inc );
-		bli_prune_unstored_region_right_l( &diagoff, &m, &n, &offn_inc );
-
-		// We don't need offm_inc, offn_inc here. These statements should
-		// prevent compiler warnings.
-		( void )offm_inc;
-		( void )offn_inc;
-
-		// Now that pruning has taken place, we know that diagoff >= 0.
-
-		// Compute the total area of the submatrix, accounting for the
-		// location of the diagonal, and divide it by the number of ways
-		// of parallelism.
-		tri_dim      = ( double )( n - diagoff - 1 );
-		tri_area     = tri_dim * ( tri_dim + 1.0 ) / 2.0;
-		area_total   = ( double )m * ( double )n - tri_area;
-		area_per_thr = area_total / ( double )n_way;
-
-		// Initialize some variables prior to the loop: the offset to the
-		// current subpartition, the remainder of the n dimension, and
-		// the diagonal offset of the current subpartition.
-		off_j     = 0;
-		diagoff_j = diagoff;
-		n_left    = n;
-
-		// Iterate over the subpartition indices corresponding to each
-		// thread/caucus participating in the n_way parallelism.
-		for ( j = 0; j < n_way; ++j )
-		{
-			// Compute the width of the jth subpartition, taking the
-			// current diagonal offset into account, if needed.
-			width_j =
-			bli_thread_range_width_l
-			(
-			  diagoff_j, m, n_left,
-			  j, n_way,
-			  bf, bf_left,
-			  area_per_thr,
-			  handle_edge_low
-			);
-
-			// If the current thread belongs to caucus j, this is his
-			// subpartition. So we compute the implied index range and
-			// end our search.
-			if ( j == my_id )
-			{
-				*j_start_thr = off_j;
-				*j_end_thr   = off_j + width_j;
-
-				area = bli_find_area_trap_l( m, width_j, diagoff_j );
-
-				break;
-			}
-
-			// Shift the current subpartition's starting and diagonal offsets,
-			// as well as the remainder of the n dimension, according to the
-			// computed width, and then iterate to the next subpartition.
-			off_j     += width_j;
-			diagoff_j -= width_j;
-			n_left    -= width_j;
-		}
-	}
-	else // if ( bli_is_upper( uplo ) )
-	{
-		// Express the upper-stored case in terms of the lower-stored case.
-
-		// First, we convert the upper-stored trapezoid to an equivalent
-		// lower-stored trapezoid by rotating it 180 degrees.
-		bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n );
-
-		// Now that the trapezoid is "flipped" in the n dimension, negate
-		// the bool that encodes whether to handle the edge case at the
-		// low (or high) end of the index range.
-		bli_toggle_bool( &handle_edge_low );
-
-		// Compute the appropriate range for the rotated trapezoid.
-		area = bli_thread_range_weighted_sub
-		(
-		  thread, diagoff, uplo, m, n, bf,
-		  handle_edge_low,
-		  j_start_thr, j_end_thr
-		);
-
-		// Reverse the indexing basis for the subpartition ranges so that
-		// the indices, relative to left-to-right iteration through the
-		// unrotated upper-stored trapezoid, map to the correct columns
-		// (relative to the diagonal). This amounts to subtracting the
-		// range from n.
-		bli_reverse_index_direction( n, j_start_thr, j_end_thr );
-	}
-
-	return area;
-}
-
-siz_t bli_thread_range_mdim
-     (
-             dir_t      direct,
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const obj_t*     b,
-       const obj_t*     c,
-       const cntl_t*    cntl,
-       const cntx_t*    cntx,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	bszid_t  bszid  = bli_cntl_bszid( cntl );
-	opid_t   family = bli_cntl_family( cntl );
-
-	// This is part of trsm's current implementation, whereby right side
-	// cases are implemented in left-side micro-kernels, which requires
-	// we swap the usage of the register blocksizes for the purposes of
-	// packing A and B.
-	if ( family == BLIS_TRSM )
-	{
-		if ( bli_obj_root_is_triangular( a ) ) bszid = BLIS_MR;
-		else                                   bszid = BLIS_NR;
-	}
-
-	const blksz_t* bmult  = bli_cntx_get_bmult( bszid, cntx );
-	const obj_t*   x;
-	bool     use_weighted;
-
-	// Use the operation family to choose the one of the two matrices
-	// being partitioned that potentially has structure, and also to
-	// decide whether or not we need to use weighted range partitioning.
-	// NOTE: It's important that we use non-weighted range partitioning
-	// for hemm and symm (ie: the gemm family) because the weighted
-	// function will mistakenly skip over unstored regions of the
-	// structured matrix, even though they represent part of that matrix
-	// that will be dense and full (after packing).
-	if      ( family == BLIS_GEMM ) { x = a; use_weighted = FALSE; }
-	else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE;  }
-	else if ( family == BLIS_TRMM ) { x = a; use_weighted = TRUE;  }
-	else    /*family == BLIS_TRSM*/ { x = a; use_weighted = FALSE; }
-
-	if ( use_weighted )
-	{
-		if ( direct == BLIS_FWD )
-			return bli_thread_range_weighted_t2b( thr, x, bmult, start, end );
-		else
-			return bli_thread_range_weighted_b2t( thr, x, bmult, start, end );
-	}
-	else
-	{
-		if ( direct == BLIS_FWD )
-			return bli_thread_range_t2b( thr, x, bmult, start, end );
-		else
-			return bli_thread_range_b2t( thr, x, bmult, start, end );
-	}
-}
-
-siz_t bli_thread_range_ndim
-     (
-             dir_t      direct,
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const obj_t*     b,
-       const obj_t*     c,
-       const cntl_t*    cntl,
-       const cntx_t*    cntx,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	bszid_t  bszid  = bli_cntl_bszid( cntl );
-	opid_t   family = bli_cntl_family( cntl );
-
-	// This is part of trsm's current implementation, whereby right side
-	// cases are implemented in left-side micro-kernels, which requires
-	// we swap the usage of the register blocksizes for the purposes of
-	// packing A and B.
-	if ( family == BLIS_TRSM )
-	{
-		if ( bli_obj_root_is_triangular( b ) ) bszid = BLIS_MR;
-		else                                   bszid = BLIS_NR;
-	}
-
-	const blksz_t* bmult  = bli_cntx_get_bmult( bszid, cntx );
-	const obj_t*   x;
-	bool     use_weighted;
-
-	// Use the operation family to choose the one of the two matrices
-	// being partitioned that potentially has structure, and also to
-	// decide whether or not we need to use weighted range partitioning.
-	// NOTE: It's important that we use non-weighted range partitioning
-	// for hemm and symm (ie: the gemm family) because the weighted
-	// function will mistakenly skip over unstored regions of the
-	// structured matrix, even though they represent part of that matrix
-	// that will be dense and full (after packing).
-	if      ( family == BLIS_GEMM ) { x = b; use_weighted = FALSE; }
-	else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE;  }
-	else if ( family == BLIS_TRMM ) { x = b; use_weighted = TRUE;  }
-	else    /*family == BLIS_TRSM*/ { x = b; use_weighted = FALSE; }
-
-	if ( use_weighted )
-	{
-		if ( direct == BLIS_FWD )
-			return bli_thread_range_weighted_l2r( thr, x, bmult, start, end );
-		else
-			return bli_thread_range_weighted_r2l( thr, x, bmult, start, end );
-	}
-	else
-	{
-		if ( direct == BLIS_FWD )
-			return bli_thread_range_l2r( thr, x, bmult, start, end );
-		else
-			return bli_thread_range_r2l( thr, x, bmult, start, end );
-	}
-}
-
-siz_t bli_thread_range_weighted_l2r
-     (
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const blksz_t*   bmult,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	siz_t area;
-
-	// This function assigns area-weighted ranges in the n dimension
-	// where the total range spans 0 to n-1 with 0 at the left end and
-	// n-1 at the right end.
-
-	if ( bli_obj_intersects_diag( a ) &&
-	     bli_obj_is_upper_or_lower( a ) )
-	{
-		num_t  dt      = bli_obj_dt( a );
-		doff_t diagoff = bli_obj_diag_offset( a );
-		uplo_t uplo    = bli_obj_uplo( a );
-		dim_t  m       = bli_obj_length( a );
-		dim_t  n       = bli_obj_width( a );
-		dim_t  bf      = bli_blksz_get_def( dt, bmult );
-
-		// Support implicit transposition.
-		if ( bli_obj_has_trans( a ) )
-		{
-			bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
-		}
-
-		area =
-		bli_thread_range_weighted_sub
-		(
-		  thr, diagoff, uplo, m, n, bf,
-		  FALSE, start, end
-		);
-	}
-	else // if dense or zeros
-	{
-		area = bli_thread_range_l2r
-		(
-		  thr, a, bmult,
-		  start, end
-		);
-	}
-
-	return area;
-}
-
-siz_t bli_thread_range_weighted_r2l
-     (
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const blksz_t*   bmult,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	siz_t area;
-
-	// This function assigns area-weighted ranges in the n dimension
-	// where the total range spans 0 to n-1 with 0 at the right end and
-	// n-1 at the left end.
-
-	if ( bli_obj_intersects_diag( a ) &&
-	     bli_obj_is_upper_or_lower( a ) )
-	{
-		num_t  dt      = bli_obj_dt( a );
-		doff_t diagoff = bli_obj_diag_offset( a );
-		uplo_t uplo    = bli_obj_uplo( a );
-		dim_t  m       = bli_obj_length( a );
-		dim_t  n       = bli_obj_width( a );
-		dim_t  bf      = bli_blksz_get_def( dt, bmult );
-
-		// Support implicit transposition.
-		if ( bli_obj_has_trans( a ) )
-		{
-			bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
-		}
-
-		bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n );
-
-		area =
-		bli_thread_range_weighted_sub
-		(
-		  thr, diagoff, uplo, m, n, bf,
-		  TRUE, start, end
-		);
-	}
-	else // if dense or zeros
-	{
-		area = bli_thread_range_r2l
-		(
-		  thr, a, bmult,
-		  start, end
-		);
-	}
-
-	return area;
-}
-
-siz_t bli_thread_range_weighted_t2b
-     (
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const blksz_t*   bmult,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	siz_t area;
-
-	// This function assigns area-weighted ranges in the m dimension
-	// where the total range spans 0 to m-1 with 0 at the top end and
-	// m-1 at the bottom end.
-
-	if ( bli_obj_intersects_diag( a ) &&
-	     bli_obj_is_upper_or_lower( a ) )
-	{
-		num_t  dt      = bli_obj_dt( a );
-		doff_t diagoff = bli_obj_diag_offset( a );
-		uplo_t uplo    = bli_obj_uplo( a );
-		dim_t  m       = bli_obj_length( a );
-		dim_t  n       = bli_obj_width( a );
-		dim_t  bf      = bli_blksz_get_def( dt, bmult );
-
-		// Support implicit transposition.
-		if ( bli_obj_has_trans( a ) )
-		{
-			bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
-		}
-
-		bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
-
-		area =
-		bli_thread_range_weighted_sub
-		(
-		  thr, diagoff, uplo, m, n, bf,
-		  FALSE, start, end
-		);
-	}
-	else // if dense or zeros
-	{
-		area = bli_thread_range_t2b
-		(
-		  thr, a, bmult,
-		  start, end
-		);
-	}
-
-	return area;
-}
-
-siz_t bli_thread_range_weighted_b2t
-     (
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const blksz_t*   bmult,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	siz_t area;
-
-	// This function assigns area-weighted ranges in the m dimension
-	// where the total range spans 0 to m-1 with 0 at the bottom end and
-	// m-1 at the top end.
-
-	if ( bli_obj_intersects_diag( a ) &&
-	     bli_obj_is_upper_or_lower( a ) )
-	{
-		num_t  dt      = bli_obj_dt( a );
-		doff_t diagoff = bli_obj_diag_offset( a );
-		uplo_t uplo    = bli_obj_uplo( a );
-		dim_t  m       = bli_obj_length( a );
-		dim_t  n       = bli_obj_width( a );
-		dim_t  bf      = bli_blksz_get_def( dt, bmult );
-
-		// Support implicit transposition.
-		if ( bli_obj_has_trans( a ) )
-		{
-			bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
-		}
-
-		bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
-
-		bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n );
-
-		area = bli_thread_range_weighted_sub
-		(
-		  thr, diagoff, uplo, m, n, bf,
-		  TRUE, start, end
-		);
-	}
-	else // if dense or zeros
-	{
-		area = bli_thread_range_b2t
-		(
-		  thr, a, bmult,
-		  start, end
-		);
-	}
-
-	return area;
-}
-
-// -----------------------------------------------------------------------------
-
 void bli_prime_factorization( dim_t n, bli_prime_factors_t* factors )
 {
 	factors->n = n;
diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h
index e61fc8b89..5002672dc 100644
--- a/frame/thread/bli_thread.h
+++ b/frame/thread/bli_thread.h
@@ -56,6 +56,8 @@ typedef void (*thread_func_t)( thrcomm_t* gl_comm, dim_t tid, const void* params
 void bli_thread_init( void );
 void bli_thread_finalize( void );
 
+// -----------------------------------------------------------------------------
+
 BLIS_EXPORT_BLIS void bli_thread_launch
      (
              timpl_t       ti,
@@ -64,91 +66,6 @@ BLIS_EXPORT_BLIS void bli_thread_launch
        const void*         params
      );
 
-// Thread range-related prototypes.
-
-BLIS_EXPORT_BLIS void bli_thread_range_sub
-     (
-       const thrinfo_t* thread,
-             dim_t      n,
-             dim_t      bf,
-             bool       handle_edge_low,
-             dim_t*     start,
-             dim_t*     end
-     );
-
-#undef  GENPROT
-#define GENPROT( opname ) \
-\
-siz_t PASTEMAC0( opname ) \
-     ( \
-             dir_t      direct, \
-       const thrinfo_t* thr, \
-       const obj_t*     a, \
-       const obj_t*     b, \
-       const obj_t*     c, \
-       const cntl_t*    cntl, \
-       const cntx_t*    cntx, \
-             dim_t*     start, \
-             dim_t*     end  \
-     );
-
-GENPROT( thread_range_mdim )
-GENPROT( thread_range_ndim )
-
-#undef  GENPROT
-#define GENPROT( opname ) \
-\
-siz_t PASTEMAC0( opname ) \
-     ( \
-       const thrinfo_t* thr, \
-       const obj_t*     a, \
-       const blksz_t*   bmult, \
-             dim_t*     start, \
-             dim_t*     end  \
-     );
-
-GENPROT( thread_range_l2r )
-GENPROT( thread_range_r2l )
-GENPROT( thread_range_t2b )
-GENPROT( thread_range_b2t )
-
-GENPROT( thread_range_weighted_l2r )
-GENPROT( thread_range_weighted_r2l )
-GENPROT( thread_range_weighted_t2b )
-GENPROT( thread_range_weighted_b2t )
-
-
-dim_t bli_thread_range_width_l
-     (
-       doff_t diagoff_j,
-       dim_t  m,
-       dim_t  n_j,
-       dim_t  j,
-       dim_t  n_way,
-       dim_t  bf,
-       dim_t  bf_left,
-       double area_per_thr,
-       bool   handle_edge_low
-     );
-siz_t bli_find_area_trap_l
-     (
-       dim_t  m,
-       dim_t  n,
-       doff_t diagoff
-     );
-siz_t bli_thread_range_weighted_sub
-     (
-       const thrinfo_t* thread,
-             doff_t     diagoff,
-             uplo_t     uplo,
-             dim_t      m,
-             dim_t      n,
-             dim_t      bf,
-             bool       handle_edge_low,
-             dim_t*     j_start_thr,
-             dim_t*     j_end_thr
-     );
-
 // -----------------------------------------------------------------------------
 
 // Factorization and partitioning prototypes
@@ -212,98 +129,5 @@ BLIS_EXPORT_BLIS void    bli_thread_set_thread_impl( timpl_t ti );
 
 void                     bli_thread_init_rntm_from_env( rntm_t* rntm );
 
-// -----------------------------------------------------------------------------
-
-BLIS_INLINE void bli_thread_range_jrir_rr
-     (
-       const thrinfo_t* thread,
-             dim_t      n,
-             dim_t      bf,
-             bool       handle_edge_low,
-             dim_t*     start,
-             dim_t*     end,
-             dim_t*     inc
-     )
-{
-	// Use interleaved partitioning of jr/ir loops.
-	*start = bli_thrinfo_work_id( thread );
-	*inc   = bli_thrinfo_n_way( thread );
-	*end   = n;
-}
-
-BLIS_INLINE void bli_thread_range_jrir_sl
-     (
-       const thrinfo_t* thread,
-             dim_t      n,
-             dim_t      bf,
-             bool       handle_edge_low,
-             dim_t*     start,
-             dim_t*     end,
-             dim_t*     inc
-     )
-{
-	// Use contiguous slab partitioning of jr/ir loops.
-	bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end );
-	*inc = 1;
-}
-
-BLIS_INLINE void bli_thread_range_jrir
-     (
-       const thrinfo_t* thread,
-             dim_t      n,
-             dim_t      bf,
-             bool       handle_edge_low,
-             dim_t*     start,
-             dim_t*     end,
-             dim_t*     inc
-     )
-{
-	// Define a general-purpose version of bli_thread_range_jrir() whose
-	// definition depends on whether slab or round-robin partitioning was
-	// requested at configure-time.
-#ifdef BLIS_ENABLE_JRIR_SLAB
-	bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc );
-#else
-	bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc );
-#endif
-}
-
-#if 0
-BLIS_INLINE void bli_thread_range_weighted_jrir
-     (
-       thrinfo_t* thread,
-       doff_t     diagoff,
-       uplo_t     uplo,
-       dim_t      m,
-       dim_t      n,
-       dim_t      bf,
-       bool       handle_edge_low,
-       dim_t*     start,
-       dim_t*     end,
-       dim_t*     inc
-     )
-{
-#ifdef BLIS_ENABLE_JRIR_SLAB
-
-	// Use contiguous slab partitioning for jr/ir loops.
-	bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf,
-	                               handle_edge_low, start, end );
-
-	*start = *start / bf; *inc = 1;
-
-	if ( *end % bf ) *end = *end / bf + 1;
-	else             *end = *end / bf;
-
-#else
 
-	// Use interleaved partitioning of jr/ir loops.
-	*start = bli_thrinfo_work_id( thread );
-	*inc   = bli_thrinfo_n_way( thread );
-	*end   = n;
-
-#endif
-}
 #endif
-
-#endif
-
diff --git a/frame/thread/bli_thread_range.c b/frame/thread/bli_thread_range.c
new file mode 100644
index 000000000..a28e529b0
--- /dev/null
+++ b/frame/thread/bli_thread_range.c
@@ -0,0 +1,1121 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_thread_range_sub
+     (
+       const thrinfo_t* thread,
+             dim_t      n,
+             dim_t      bf,
+             bool       handle_edge_low,
+             dim_t*     start,
+             dim_t*     end
+     )
+{
+	dim_t      n_way      = bli_thrinfo_n_way( thread );
+
+	if ( n_way == 1 ) { *start = 0; *end = n; return; }
+
+	dim_t      work_id    = bli_thrinfo_work_id( thread );
+
+	dim_t      all_start  = 0;
+	dim_t      all_end    = n;
+
+	dim_t      size       = all_end - all_start;
+
+	dim_t      n_bf_whole = size / bf;
+	dim_t      n_bf_left  = size % bf;
+
+	dim_t      n_bf_lo    = n_bf_whole / n_way;
+	dim_t      n_bf_hi    = n_bf_whole / n_way;
+
+	// In this function, we partition the space between all_start and
+	// all_end into n_way partitions, each a multiple of block_factor
+	// with the exception of the one partition that receives the
+	// "edge" case (if applicable).
+	//
+	// Here are examples of various thread partitionings, in units of
+	// the block_factor, when n_way = 4. (A '+' indicates the thread
+	// that receives the leftover edge case (ie: n_bf_left extra
+	// rows/columns in its sub-range).)
+	//                                        (all_start ... all_end)
+	// n_bf_whole  _left  hel  n_th_lo  _hi   thr0  thr1  thr2  thr3
+	//         12     =0    f        0    4      3     3     3     3
+	//         12     >0    f        0    4      3     3     3     3+
+	//         13     >0    f        1    3      4     3     3     3+
+	//         14     >0    f        2    2      4     4     3     3+
+	//         15     >0    f        3    1      4     4     4     3+
+	//         15     =0    f        3    1      4     4     4     3
+	//
+	//         12     =0    t        4    0      3     3     3     3
+	//         12     >0    t        4    0      3+    3     3     3
+	//         13     >0    t        3    1      3+    3     3     4
+	//         14     >0    t        2    2      3+    3     4     4
+	//         15     >0    t        1    3      3+    4     4     4
+	//         15     =0    t        1    3      3     4     4     4
+
+	// As indicated by the table above, load is balanced as equally
+	// as possible, even in the presence of an edge case.
+
+	// First, we must differentiate between cases where the leftover
+	// "edge" case (n_bf_left) should be allocated to a thread partition
+	// at the low end of the index range or the high end.
+
+	if ( handle_edge_low == FALSE )
+	{
+		// Notice that if all threads receive the same number of
+		// block_factors, those threads are considered "high" and
+		// the "low" thread group is empty.
+		dim_t n_th_lo = n_bf_whole % n_way;
+		//dim_t n_th_hi = n_way - n_th_lo;
+
+		// If some partitions must have more block_factors than others
+		// assign the slightly larger partitions to lower index threads.
+		if ( n_th_lo != 0 ) n_bf_lo += 1;
+
+		// Compute the actual widths (in units of rows/columns) of
+		// individual threads in the low and high groups.
+		dim_t size_lo = n_bf_lo * bf;
+		dim_t size_hi = n_bf_hi * bf;
+
+		// Precompute the starting indices of the low and high groups.
+		dim_t lo_start = all_start;
+		dim_t hi_start = all_start + n_th_lo * size_lo;
+
+		// Compute the start and end of individual threads' ranges
+		// as a function of their work_ids and also the group to which
+		// they belong (low or high).
+		if ( work_id < n_th_lo )
+		{
+			*start = lo_start + (work_id  ) * size_lo;
+			*end   = lo_start + (work_id+1) * size_lo;
+		}
+		else // if ( n_th_lo <= work_id )
+		{
+			*start = hi_start + (work_id-n_th_lo  ) * size_hi;
+			*end   = hi_start + (work_id-n_th_lo+1) * size_hi;
+
+			// Since the edge case is being allocated to the high
+			// end of the index range, we have to advance the last
+			// thread's end.
+			if ( work_id == n_way - 1 ) *end += n_bf_left;
+		}
+	}
+	else // if ( handle_edge_low == TRUE )
+	{
+		// Notice that if all threads receive the same number of
+		// block_factors, those threads are considered "low" and
+		// the "high" thread group is empty.
+		dim_t n_th_hi = n_bf_whole % n_way;
+		dim_t n_th_lo = n_way - n_th_hi;
+
+		// If some partitions must have more block_factors than others
+		// assign the slightly larger partitions to higher index threads.
+		if ( n_th_hi != 0 ) n_bf_hi += 1;
+
+		// Compute the actual widths (in units of rows/columns) of
+		// individual threads in the low and high groups.
+		dim_t size_lo = n_bf_lo * bf;
+		dim_t size_hi = n_bf_hi * bf;
+
+		// Precompute the starting indices of the low and high groups.
+		dim_t lo_start = all_start;
+		dim_t hi_start = all_start + n_th_lo * size_lo
+		                           + n_bf_left;
+
+		// Compute the start and end of individual threads' ranges
+		// as a function of their work_ids and also the group to which
+		// they belong (low or high).
+		if ( work_id < n_th_lo )
+		{
+			*start = lo_start + (work_id  ) * size_lo;
+			*end   = lo_start + (work_id+1) * size_lo;
+
+			// Since the edge case is being allocated to the low
+			// end of the index range, we have to advance the
+			// starts/ends accordingly.
+			if ( work_id == 0 )   *end   += n_bf_left;
+			else                { *start += n_bf_left;
+			                      *end   += n_bf_left; }
+		}
+		else // if ( n_th_lo <= work_id )
+		{
+			*start = hi_start + (work_id-n_th_lo  ) * size_hi;
+			*end   = hi_start + (work_id-n_th_lo+1) * size_hi;
+		}
+	}
+}
+
+// -----------------------------------------------------------------------------
+
+siz_t bli_thread_range_l2r
+     (
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const blksz_t*   bmult,
+             dim_t*     start,
+             dim_t*     end
+     )
+{
+	num_t dt = bli_obj_dt( a );
+	dim_t m  = bli_obj_length_after_trans( a );
+	dim_t n  = bli_obj_width_after_trans( a );
+	dim_t bf = bli_blksz_get_def( dt, bmult );
+
+	bli_thread_range_sub( thr, n, bf,
+	                      FALSE, start, end );
+
+	return m * ( *end - *start );
+}
+
+siz_t bli_thread_range_r2l
+     (
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const blksz_t*   bmult,
+             dim_t*     start,
+             dim_t*     end
+     )
+{
+	num_t dt = bli_obj_dt( a );
+	dim_t m  = bli_obj_length_after_trans( a );
+	dim_t n  = bli_obj_width_after_trans( a );
+	dim_t bf = bli_blksz_get_def( dt, bmult );
+
+	bli_thread_range_sub( thr, n, bf,
+	                      TRUE, start, end );
+
+	return m * ( *end - *start );
+}
+
+siz_t bli_thread_range_t2b
+     (
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const blksz_t*   bmult,
+             dim_t*     start,
+             dim_t*     end
+     )
+{
+	num_t dt = bli_obj_dt( a );
+	dim_t m  = bli_obj_length_after_trans( a );
+	dim_t n  = bli_obj_width_after_trans( a );
+	dim_t bf = bli_blksz_get_def( dt, bmult );
+
+	bli_thread_range_sub( thr, m, bf,
+	                      FALSE, start, end );
+
+	return n * ( *end - *start );
+}
+
+siz_t bli_thread_range_b2t
+     (
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const blksz_t*   bmult,
+             dim_t*     start,
+             dim_t*     end
+     )
+{
+	num_t dt = bli_obj_dt( a );
+	dim_t m  = bli_obj_length_after_trans( a );
+	dim_t n  = bli_obj_width_after_trans( a );
+	dim_t bf = bli_blksz_get_def( dt, bmult );
+
+	bli_thread_range_sub( thr, m, bf,
+	                      TRUE, start, end );
+
+	return n * ( *end - *start );
+}
+
+// -----------------------------------------------------------------------------
+
+dim_t bli_thread_range_width_l
+     (
+       doff_t diagoff_j,
+       dim_t  m,
+       dim_t  n_j,
+       dim_t  j,
+       dim_t  n_way,
+       dim_t  bf,
+       dim_t  bf_left,
+       double area_per_thr,
+       bool   handle_edge_low
+     )
+{
+	dim_t width;
+
+	// In this function, we assume that we are somewhere in the process of
+	// partitioning an m x n lower-stored region (with arbitrary diagonal
+	// offset) n_ways along the n dimension (into column panels). The value
+	// j identifies the left-to-right subpartition index (from 0 to n_way-1)
+	// of the subpartition whose width we are about to compute using the
+	// area per thread determined by the caller. n_j is the number of
+	// columns in the remaining region of the matrix being partitioned,
+	// and diagoff_j is that region's diagonal offset.
+
+	// If this is the last subpartition, the width is simply equal to n_j.
+	// Note that this statement handles cases where the "edge case" (if
+	// one exists) is assigned to the high end of the index range (ie:
+	// handle_edge_low == FALSE).
+	if ( j == n_way - 1 ) return n_j;
+
+	// At this point, we know there are at least two subpartitions left.
+	// We also know that IF the submatrix contains a completely dense
+	// rectangular submatrix, it will occur BEFORE the triangular (or
+	// trapezoidal) part.
+
+	// Here, we implement a somewhat minor load balancing optimization
+	// that ends up getting employed only for relatively small matrices.
+	// First, recall that all subpartition widths will be some multiple
+	// of the blocking factor bf, except perhaps either the first or last
+	// subpartition, which will receive the edge case, if it exists.
+	// Also recall that j represents the current thread (or thread group,
+	// or "caucus") for which we are computing a subpartition width.
+	// If n_j is sufficiently small that we can only allocate bf columns
+	// to each of the remaining threads, then we set the width to bf. We
+	// do not allow the subpartition width to be less than bf, so, under
+	// some conditions, if n_j is small enough, some of the remaining
+	// threads may not get any work. For the purposes of this lower bound
+	// on work (ie: width >= bf), we allow the edge case to count as a
+	// "full" set of bf columns.
+	{
+		dim_t n_j_bf = n_j / bf + ( bf_left > 0 ? 1 : 0 );
+
+		if ( n_j_bf <= n_way - j )
+		{
+			if ( j == 0 && handle_edge_low )
+				width = ( bf_left > 0 ? bf_left : bf );
+			else
+				width = bf;
+
+			// Make sure that the width does not exceed n_j. This would
+			// occur if and when n_j_bf < n_way - j; that is, when the
+			// matrix being partitioned is sufficiently small relative to
+			// n_way such that there is not even enough work for every
+			// (remaining) thread to get bf (or bf_left) columns. The
+			// net effect of this safeguard is that some threads may get
+			// assigned empty ranges (ie: no work), which of course must
+			// happen in some situations.
+			if ( width > n_j ) width = n_j;
+
+			return width;
+		}
+	}
+
+	// This block computes the width assuming that we are entirely within
+	// a dense rectangle that precedes the triangular (or trapezoidal)
+	// part.
+	{
+		// First compute the width of the current panel under the
+		// assumption that the diagonal offset would not intersect.
+		width = ( dim_t )bli_round( ( double )area_per_thr / ( double )m );
+
+		// Adjust the width, if necessary. Specifically, we may need
+		// to allocate the edge case to the first subpartition, if
+		// requested; otherwise, we just need to ensure that the
+		// subpartition is a multiple of the blocking factor.
+		if ( j == 0 && handle_edge_low )
+		{
+			if ( width % bf != bf_left ) width += bf_left - ( width % bf );
+		}
+		else // if interior case
+		{
+			// Round up to the next multiple of the blocking factor.
+			//if ( width % bf != 0       ) width += bf      - ( width % bf );
+			// Round to the nearest multiple of the blocking factor.
+			if ( width % bf != 0       ) width = bli_round_to_mult( width, bf );
+		}
+	}
+
+	// We need to recompute width if the panel, according to the width
+	// as currently computed, would intersect the diagonal.
+	if ( diagoff_j < width )
+	{
+		dim_t offm_inc, offn_inc;
+
+		// Prune away the unstored region above the diagonal, if it exists.
+		// Note that the entire region was pruned initially, so we know that
+		// we don't need to try to prune the right side. (Also, we discard
+		// the offset deltas since we don't need to actually index into the
+		// subpartition.)
+		bli_prune_unstored_region_top_l( &diagoff_j, &m, &n_j, &offm_inc );
+		//bli_prune_unstored_region_right_l( &diagoff_j, &m, &n_j, &offn_inc );
+
+		// We don't need offm_inc, offn_inc here. These statements should
+		// prevent compiler warnings.
+		( void )offm_inc;
+		( void )offn_inc;
+
+		// Prepare to solve a quadratic equation to find the width of the
+		// current (jth) subpartition given the m dimension, diagonal offset,
+		// and area.
+		// NOTE: We know that the +/- in the quadratic formula must be a +
+		// here because we know that the desired solution (the subpartition
+		// width) will be smaller than (m + diagoff), not larger. If you
+		// don't believe me, draw a picture!
+		const double a = -0.5;
+		const double b = ( double )m + ( double )diagoff_j + 0.5;
+		const double c = -0.5 * (   ( double )diagoff_j *
+		                          ( ( double )diagoff_j + 1.0 )
+		                        ) - area_per_thr;
+		const double r = b * b - 4.0 * a * c;
+
+		// If the quadratic solution is not imaginary, round it and use that
+		// as our width (but make sure it didn't round to zero). Otherwise,
+		// discard the quadratic solution and leave width, as previously
+		// computed, unchanged.
+		if ( r >= 0.0 )
+		{
+			const double x = ( -b + sqrt( r ) ) / ( 2.0 * a );
+
+			width = ( dim_t )bli_round( x );
+			if ( width == 0 ) width = 1;
+		}
+
+		// Adjust the width, if necessary.
+		if ( j == 0 && handle_edge_low )
+		{
+			if ( width % bf != bf_left ) width += bf_left - ( width % bf );
+		}
+		else // if interior case
+		{
+			// Round up to the next multiple of the blocking factor.
+			//if ( width % bf != 0       ) width += bf      - ( width % bf );
+			// Round to the nearest multiple of the blocking factor.
+			if ( width % bf != 0       ) width = bli_round_to_mult( width, bf );
+		}
+	}
+
+	// Make sure that the width, after being adjusted, does not cause the
+	// subpartition to exceed n_j.
+	if ( width > n_j ) width = n_j;
+
+	return width;
+}
+
+siz_t bli_find_area_trap_l
+     (
+       doff_t diagoff,
+       dim_t  m,
+       dim_t  n,
+       dim_t  bf
+     )
+{
+	dim_t  offm_inc = 0;
+	dim_t  offn_inc = 0;
+	double utri_area;
+	double blktri_area;
+
+	// Prune away any rectangular region above where the diagonal
+	// intersects the left edge of the subpartition, if it exists.
+	bli_prune_unstored_region_top_l( &diagoff, &m, &n, &offm_inc );
+
+	// Prune away any rectangular region to the right of where the
+	// diagonal intersects the bottom edge of the subpartition, if
+	// it exists. (This shouldn't ever be needed, since the caller
+	// would presumably have already performed rightward pruning,
+	// but it's here just in case.)
+	//bli_prune_unstored_region_right_l( &diagoff, &m, &n, &offn_inc );
+
+	( void )offm_inc;
+	( void )offn_inc;
+
+	// Compute the area of the empty triangle so we can subtract it
+	// from the area of the rectangle that bounds the subpartition.
+	if ( bli_intersects_diag_n( diagoff, m, n ) )
+	{
+		double tri_dim = ( double )( n - diagoff - 1 );
+		       tri_dim = bli_min( tri_dim, m - 1 );
+
+		utri_area   = tri_dim * ( tri_dim + 1.0 ) / 2.0;
+		blktri_area = tri_dim * ( bf      - 1.0 ) / 2.0;
+	}
+	else
+	{
+		// If the diagonal does not intersect the trapezoid, then
+		// we can compute the area as a simple rectangle.
+		utri_area   = 0.0;
+		blktri_area = 0.0;
+	}
+
+	double area = ( double )m * ( double )n - utri_area + blktri_area;
+
+	return ( siz_t )area;
+}
+
+// -----------------------------------------------------------------------------
+
+siz_t bli_thread_range_weighted_sub
+     (
+       const thrinfo_t* thread,
+             doff_t     diagoff,
+             uplo_t     uplo,
+             uplo_t     uplo_orig,
+             dim_t      m,
+             dim_t      n,
+             dim_t      bf,
+             bool       handle_edge_low,
+             dim_t*     j_start_thr,
+             dim_t*     j_end_thr
+     )
+{
+	dim_t      n_way   = bli_thrinfo_n_way( thread );
+	dim_t      my_id   = bli_thrinfo_work_id( thread );
+
+	dim_t      bf_left = n % bf;
+
+	dim_t      offm_inc, offn_inc;
+
+	siz_t      area = 0;
+
+	// In this function, we assume that the caller has already determined
+	// that (a) the diagonal intersects the submatrix, and (b) the submatrix
+	// is either lower- or upper-stored.
+
+	if ( bli_is_lower( uplo ) )
+	{
+		#if 0
+		if ( n_way > 1 )
+		printf( "thread_range_weighted_sub(): tid %d: m n = %3d %3d do %d (lower)\n",
+		        (int)my_id, (int)(m), (int)(n), (int)(diagoff) );
+		#endif
+
+		// Prune away the unstored region above the diagonal, if it exists,
+		// and then to the right of where the diagonal intersects the bottom,
+		// if it exists. (Also, we discard the offset deltas since we don't
+		// need to actually index into the subpartition.)
+		bli_prune_unstored_region_top_l( &diagoff, &m, &n, &offm_inc );
+
+		if ( !handle_edge_low )
+		{
+			// This branch handles the following two cases:
+			// - note: Edge case microtiles are marked as 'e'.
+			//
+			// uplo_orig = lower       | uplo = lower
+			// handle edge high (orig) | handle edge high
+			//
+			//     x x x x x x x              x x x x x x x
+			//     x x x x x x x x            x x x x x x x x
+			//     x x x x x x x x x      ->  x x x x x x x x x
+			//     x x x x x x x x x x        x x x x x x x x x x
+			//     x x x x x x x x x x e      x x x x x x x x x x e
+			//     x x x x x x x x x x e      x x x x x x x x x x e
+			//
+			// uplo_orig = upper       | uplo = lower
+			// handle edge low  (orig) | handle edge high
+			//
+			//     e x x x x x x x x x x      x x x x x x x
+			//     e x x x x x x x x x x      x x x x x x x x
+			//       x x x x x x x x x x  ->  x x x x x x x x x
+			//         x x x x x x x x x      x x x x x x x x x x
+			//           x x x x x x x x      x x x x x x x x x x e
+			//             x x x x x x x      x x x x x x x x x x e
+
+			// If the edge case is being handled "high", then we can employ this
+			// simple macro for pruning the region to the right of where the
+			// diagonal intersects the right side of the submatrix (which amounts
+			// to adjusting the n dimension).
+			bli_prune_unstored_region_right_l( &diagoff, &m, &n, &offn_inc );
+		}
+		else // if ( handle_edge_low )
+		{
+			// This branch handles the following two cases:
+			//
+			// uplo_orig = upper       | uplo = lower
+			// handle edge high (orig) | handle edge low
+			//
+			//     x x x x x x x x x x e      e x x x x x x
+			//     x x x x x x x x x x e      e x x x x x x x
+			//       x x x x x x x x x e  ->  e x x x x x x x x
+			//         x x x x x x x x e      e x x x x x x x x x
+			//           x x x x x x x e      e x x x x x x x x x x
+			//             x x x x x x e      e x x x x x x x x x x
+			//
+			// uplo_orig = lower       | uplo = lower
+			// handle edge low  (orig) | handle edge low
+			//
+			//     e x x x x x x              e x x x x x x
+			//     e x x x x x x x            e x x x x x x x
+			//     e x x x x x x x x      ->  e x x x x x x x x
+			//     e x x x x x x x x x        e x x x x x x x x x
+			//     e x x x x x x x x x x      e x x x x x x x x x x
+			//     e x x x x x x x x x x      e x x x x x x x x x x
+
+			// If the edge case is being handled "low", then we have to be more
+			// careful. The problem can be seen in certain situations when we're
+			// actually computing the weighted ranges for an upper-stored
+			// subpartition whose (a) diagonal offset is positive (though will
+			// always be less than NR), (b) right-side edge case exists, and (c)
+			// sum of (a) and (b) is less than NR. This is a problem because the
+			// upcoming loop that iterates over bli_thread_range_width_l()
+			// doesn't realize that the offsets associated with (a) and (b)
+			// belong on two separate columns of microtiles. If we naively use
+			// bli_prune_unstored_region_right_l() when handle_edge_low == TRUE,
+			// the loop over bli_thread_range_width_l() will only "see" p-1
+			// IR-iterations of work to assign to threads when there are
+			// actually p micropanels.
+
+			const dim_t n_inner = ( diagoff + bli_min( m, n - diagoff ) - bf_left );
+
+			const dim_t n_bf_iter_br = n_inner / bf;
+			const dim_t n_bf_left_br = n_inner % bf;
+			const dim_t n_bf_br = ( bf_left > 0 ? 1 : 0 ) +
+			                        n_bf_iter_br +
+			                      ( n_bf_left_br > 0 ? 1 : 0 );
+
+			// Compute the number of extra columns that were included in n_bf_br
+			// as a result of including a full micropanel for the part of the
+			// submatrix that contains bf_left columns. For example, if bf = 16
+			// and bf_left = 4, then bf_extra = 12. But if bf_left = 0, then we
+			// didn't include any extra columns.
+			const dim_t bf_extra = ( bf_left > 0 ? bf - bf_left : 0 );
+
+			// Subtract off bf_extra from n_bf_br to arrive at the "true" value
+			// of n that we'll use going forward.
+			n = n_bf_br * bf - bf_extra;
+
+			#if 0
+			if ( n_way > 1 )
+			{
+				//printf( "thread_range_weighted_sub(): tid %d: _iter _left = %3d %3d (lower1)\n",
+				//		(int)my_id, (int)n_bf_iter_br, (int)n_bf_left_br );
+				printf( "thread_range_weighted_sub(): tid %d: m n = %3d %3d do %d (lower2)\n",
+						(int)my_id, (int)(m), (int)(n), (int)(diagoff) );
+			}
+			#endif
+		}
+
+		// We don't need offm_inc, offn_inc here. These statements should
+		// prevent compiler warnings.
+		( void )offm_inc;
+		( void )offn_inc;
+
+		// Now that pruning has taken place, we know that diagoff >= 0.
+
+		// Compute the total area of the submatrix, accounting for the
+		// location of the diagonal. This is done by computing the area in
+		// the strictly upper triangle, subtracting it off the area of the
+		// full rectangle, and then adding the missing strictly upper
+		// triangles of the bf x bf blocks along the diagonal.
+		double tri_dim     = ( double )( n - diagoff - 1 );
+		       tri_dim     = bli_min( tri_dim, m - 1 );
+		double utri_area   = tri_dim * ( tri_dim + 1.0 ) / 2.0;
+
+		// Note that the expression below is the simplified form of:
+		//   blktri_area = ( tri_dim / bf ) * bf * ( bf - 1.0 ) / 2.0;
+		double blktri_area = tri_dim * ( bf - 1.0 ) / 2.0;
+
+		// Compute the area of the region to the right of where the diagonal
+		// intersects the bottom edge of the submatrix. If it instead intersects
+		// the right edge (or the bottom-right corner), then this region does
+		// not exist and so its area is explicitly set to zero.
+		double beyondtri_dim = n - diagoff - m;
+		double beyondtri_area;
+		if ( 0 < beyondtri_dim ) beyondtri_area = beyondtri_dim * m;
+		else                     beyondtri_area = 0.0;
+
+		// Here, we try to account for the added cost of computing columns of
+		// microtiles that intersect the diagonal. This is rather difficult to
+		// model, but this is partly due to the way non-square microtiles map
+		// onto the matrix relative to the diagonal, as well as additional
+		// overhead incurred from (potentially) computing with less-than-full
+		// columns of microtiles (i.e., columns for which diagoff_j < 0).
+		// Note that higher values for blktri_area have the net effect of
+		// increasing the relative size of slabs that share little or no overlap
+		// with the diagonal region. This is because it slightly increases the
+		// total area computation below, which in turn increases the area
+		// targeted by each thread/group earlier in the thread range, which
+		// for lower trapezoidal submatrices, corresponds to the regular
+		// rectangular region that precedes the diagonal part (if such a
+		// rectangular region exists).
+		blktri_area *= 1.5;
+		//blktri_area = 0.0;
+
+		double area_total  = ( double )m * ( double )n - utri_area + blktri_area
+		                                               - beyondtri_area;
+
+		// Divide the computed area by the number of ways of parallelism.
+		double area_per_thr = area_total / ( double )n_way;
+
+
+		// Initialize some variables prior to the loop: the offset to the
+		// current subpartition, the remainder of the n dimension, and
+		// the diagonal offset of the current subpartition.
+		dim_t  off_j     = 0;
+		doff_t diagoff_j = diagoff;
+		dim_t  n_left    = n;
+
+		#if 0
+		printf( "thread_range_weighted_sub(): tid %d: n_left = %3d       (lower4)\n",
+		        (int)my_id, (int)(n_left) );
+		#endif
+
+		// Iterate over the subpartition indices corresponding to each
+		// thread/caucus participating in the n_way parallelism.
+		for ( dim_t j = 0; j < n_way; ++j )
+		{
+			// Compute the width of the jth subpartition, taking the
+			// current diagonal offset into account, if needed.
+			dim_t width_j
+			=
+			bli_thread_range_width_l
+			(
+			  diagoff_j, m, n_left,
+			  j, n_way,
+			  bf, bf_left,
+			  area_per_thr,
+			  handle_edge_low
+			);
+
+			#if 0
+			if ( n_way > 1 )
+			printf( "thread_range_weighted_sub(): tid %d: width_j = %d doff_j = %d\n",
+			        (int)my_id, (int)width_j, (int)diagoff_j );
+			#endif
+
+			// If the current thread belongs to caucus j, this is his
+			// subpartition. So we compute the implied index range and
+			// end our search.
+			#if 0
+			// An alternate way of assigning work to threads such that regions
+			// are assigned to threads left to right *after* accounting for the
+			// fact that we recycle the same lower-trapezoidal code to also
+			// compute the upper-trapezoidal case.
+			bool is_my_range;
+			if ( bli_is_lower( uplo_orig ) ) is_my_range = ( j ==         my_id     );
+			else                             is_my_range = ( j == n_way - my_id - 1 );
+			#else
+			bool is_my_range = ( j == my_id );
+			#endif
+
+			if ( is_my_range )
+			{
+				*j_start_thr = off_j;
+				*j_end_thr   = off_j + width_j;
+
+				#if 0
+				if ( n_way > 1 )
+				printf( "thread_range_weighted_sub(): tid %d: sta end = %3d %3d\n",
+				        (int)my_id, (int)(*j_start_thr), (int)(*j_end_thr) );
+				//printf( "thread_range_weighted_sub(): tid %d: n_left = %3d\n",
+				//        (int)my_id, (int)(n) );
+				#endif
+
+				// Compute the area of the thread's current subpartition in case
+				// the caller is curious how much work they were assigned.
+				// NOTE: This area computation isn't actually needed for BLIS to
+				// function properly.
+				area = bli_find_area_trap_l( diagoff_j, m, width_j, bf );
+
+				break;
+			}
+
+			// Shift the current subpartition's starting and diagonal offsets,
+			// as well as the remainder of the n dimension, according to the
+			// computed width, and then iterate to the next subpartition.
+			off_j     += width_j;
+			diagoff_j -= width_j;
+			n_left    -= width_j;
+		}
+	}
+	else // if ( bli_is_upper( uplo ) )
+	{
+		// Express the upper-stored case in terms of the lower-stored case.
+
+		#if 0
+		if ( n_way > 1 )
+		printf( "thread_range_weighted_sub(): tid %d: m n = %3d %3d do %d (upper)\n",
+		        (int)my_id, (int)(m), (int)(n), (int)(diagoff) );
+		#endif
+
+		// First, we convert the upper-stored trapezoid to an equivalent
+		// lower-stored trapezoid by rotating it 180 degrees.
+		bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n );
+
+		// Now that the trapezoid is "flipped" in the n dimension, negate
+		// the bool that encodes whether to handle the edge case at the
+		// low (or high) end of the index range.
+		bli_toggle_bool( &handle_edge_low );
+
+		// Compute the appropriate range for the rotated trapezoid.
+		area = bli_thread_range_weighted_sub
+		(
+		  thread, diagoff, uplo, uplo_orig, m, n, bf,
+		  handle_edge_low,
+		  j_start_thr, j_end_thr
+		);
+
+		// Reverse the indexing basis for the subpartition ranges so that
+		// the indices, relative to left-to-right iteration through the
+		// unrotated upper-stored trapezoid, map to the correct columns
+		// (relative to the diagonal). This amounts to subtracting the
+		// range from n.
+		bli_reverse_index_direction( n, j_start_thr, j_end_thr );
+	}
+
+	return area;
+}
+
+// -----------------------------------------------------------------------------
+
+siz_t bli_thread_range_mdim
+     (
+             dir_t      direct,
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntl_t*    cntl,
+       const cntx_t*    cntx,
+             dim_t*     start,
+             dim_t*     end
+     )
+{
+	// Assign the calling thread its share of the m dimension. The range is
+	// written to start/end, and the value returned is that of whichever
+	// bli_thread_range_*() partitioning function ends up being called.
+	bszid_t      bm_id     = bli_cntl_bszid( cntl );
+	const opid_t op_family = bli_cntl_family( cntl );
+	const bool   forward   = ( direct == BLIS_FWD );
+
+	// trsm currently implements its right-side cases with left-side
+	// microkernels, so the register blocksizes used for packing A and B
+	// must trade places: use MR if A's root is triangular, NR otherwise.
+	if ( op_family == BLIS_TRSM )
+	{
+		bm_id = ( bli_obj_root_is_triangular( a ) ? BLIS_MR : BLIS_NR );
+	}
+
+	const blksz_t* bmult = bli_cntx_get_bmult( bm_id, cntx );
+
+	// Based on the operation family, pick whichever of the two partitioned
+	// matrices may be structured, and determine whether area-weighted
+	// partitioning is needed:
+	//   gemm:  A, unweighted
+	//   gemmt: C, weighted
+	//   trmm:  A, weighted
+	//   trsm:  A, unweighted
+	// NOTE: hemm and symm are part of the gemm family and must not use the
+	// weighted functions, which would mistakenly skip over the unstored
+	// region of the structured matrix even though that region is dense
+	// and full after packing.
+	const bool is_gemmt = ( op_family == BLIS_GEMMT );
+	const bool is_trmm  = ( op_family == BLIS_TRMM );
+	const obj_t* x        = ( is_gemmt ? c : a );
+	const bool   weighted = ( is_gemmt || is_trmm );
+
+	// Dispatch on the weighting and on the direction of iteration.
+	if ( weighted )
+	{
+		return ( forward
+		         ? bli_thread_range_weighted_t2b( thr, x, bmult, start, end )
+		         : bli_thread_range_weighted_b2t( thr, x, bmult, start, end ) );
+	}
+
+	return ( forward
+	         ? bli_thread_range_t2b( thr, x, bmult, start, end )
+	         : bli_thread_range_b2t( thr, x, bmult, start, end ) );
+}
+
+siz_t bli_thread_range_ndim
+     (
+             dir_t      direct,
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntl_t*    cntl,
+       const cntx_t*    cntx,
+             dim_t*     start,
+             dim_t*     end
+     )
+{
+	// Assign the calling thread its share of the n dimension. The range is
+	// written to start/end, and the value returned is that of whichever
+	// bli_thread_range_*() partitioning function ends up being called.
+	bszid_t      bm_id     = bli_cntl_bszid( cntl );
+	const opid_t op_family = bli_cntl_family( cntl );
+	const bool   forward   = ( direct == BLIS_FWD );
+
+	// trsm currently implements its right-side cases with left-side
+	// microkernels, so the register blocksizes used for packing A and B
+	// must trade places: use MR if B's root is triangular, NR otherwise.
+	if ( op_family == BLIS_TRSM )
+	{
+		bm_id = ( bli_obj_root_is_triangular( b ) ? BLIS_MR : BLIS_NR );
+	}
+
+	const blksz_t* bmult = bli_cntx_get_bmult( bm_id, cntx );
+
+	// Based on the operation family, pick whichever of the two partitioned
+	// matrices may be structured, and determine whether area-weighted
+	// partitioning is needed:
+	//   gemm:  B, unweighted
+	//   gemmt: C, weighted
+	//   trmm:  B, weighted
+	//   trsm:  B, unweighted
+	// NOTE: hemm and symm are part of the gemm family and must not use the
+	// weighted functions, which would mistakenly skip over the unstored
+	// region of the structured matrix even though that region is dense
+	// and full after packing.
+	const bool is_gemmt = ( op_family == BLIS_GEMMT );
+	const bool is_trmm  = ( op_family == BLIS_TRMM );
+	const obj_t* x        = ( is_gemmt ? c : b );
+	const bool   weighted = ( is_gemmt || is_trmm );
+
+	// Dispatch on the weighting and on the direction of iteration.
+	if ( weighted )
+	{
+		return ( forward
+		         ? bli_thread_range_weighted_l2r( thr, x, bmult, start, end )
+		         : bli_thread_range_weighted_r2l( thr, x, bmult, start, end ) );
+	}
+
+	return ( forward
+	         ? bli_thread_range_l2r( thr, x, bmult, start, end )
+	         : bli_thread_range_r2l( thr, x, bmult, start, end ) );
+}
+
+// -----------------------------------------------------------------------------
+
+siz_t bli_thread_range_weighted_l2r
+     (
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const blksz_t*   bmult,
+             dim_t*     start,
+             dim_t*     end
+     )
+{
+	// Assign the calling thread an area-weighted range in the n dimension,
+	// where the total range spans 0 to n-1 with 0 at the left end and n-1
+	// at the right end.
+
+	// Weighting is only applied when a intersects its diagonal and is
+	// stored as upper or lower. Anything else (dense or zeros) is handed
+	// to the unweighted counterpart of this function.
+	if ( !bli_obj_intersects_diag( a ) ||
+	     !bli_obj_is_upper_or_lower( a ) )
+	{
+		return bli_thread_range_l2r( thr, a, bmult, start, end );
+	}
+
+	// Use bmult's default value for a's datatype as the blocking factor.
+	const num_t dt = bli_obj_dt( a );
+	const dim_t bf = bli_blksz_get_def( dt, bmult );
+
+	// Query the properties that describe the stored region of a. These
+	// are kept as modifiable locals since they may be updated below.
+	doff_t diagoff = bli_obj_diag_offset( a );
+	uplo_t uplo    = bli_obj_uplo( a );
+	dim_t  m       = bli_obj_length( a );
+	dim_t  n       = bli_obj_width( a );
+
+	// Support implicit transposition by reflecting about the diagonal.
+	if ( bli_obj_has_trans( a ) )
+	{
+		bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
+	}
+
+	// Compute the weighted range. The same uplo value is passed for both
+	// the uplo and uplo_orig arguments, and handle_edge_low is FALSE.
+	const siz_t area = bli_thread_range_weighted_sub
+	(
+	  thr, diagoff, uplo, uplo, m, n, bf,
+	  FALSE, start, end
+	);
+
+	return area;
+}
+
+siz_t bli_thread_range_weighted_r2l
+     (
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const blksz_t*   bmult,
+             dim_t*     start,
+             dim_t*     end
+     )
+{
+	// Assign the calling thread an area-weighted range in the n dimension,
+	// where the total range spans 0 to n-1 with 0 at the right end and n-1
+	// at the left end.
+
+	// Weighting is only applied when a intersects its diagonal and is
+	// stored as upper or lower. Anything else (dense or zeros) is handed
+	// to the unweighted counterpart of this function.
+	if ( !bli_obj_intersects_diag( a ) ||
+	     !bli_obj_is_upper_or_lower( a ) )
+	{
+		return bli_thread_range_r2l( thr, a, bmult, start, end );
+	}
+
+	// Use bmult's default value for a's datatype as the blocking factor.
+	const num_t dt = bli_obj_dt( a );
+	const dim_t bf = bli_blksz_get_def( dt, bmult );
+
+	// Query the properties that describe the stored region of a.
+	doff_t diagoff = bli_obj_diag_offset( a );
+	uplo_t uplo    = bli_obj_uplo( a );
+	dim_t  m       = bli_obj_length( a );
+	dim_t  n       = bli_obj_width( a );
+
+	// Support implicit transposition by reflecting about the diagonal.
+	if ( bli_obj_has_trans( a ) )
+	{
+		bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
+	}
+
+	// Rotate the trapezoid by 180 degrees before computing the range.
+	bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n );
+
+	// Compute the weighted range. The same uplo value is passed for both
+	// the uplo and uplo_orig arguments, and handle_edge_low is TRUE.
+	const siz_t area = bli_thread_range_weighted_sub
+	(
+	  thr, diagoff, uplo, uplo, m, n, bf,
+	  TRUE, start, end
+	);
+
+	return area;
+}
+
+siz_t bli_thread_range_weighted_t2b
+     (
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const blksz_t*   bmult,
+             dim_t*     start,
+             dim_t*     end
+     )
+{
+	// Assign the calling thread an area-weighted range in the m dimension,
+	// where the total range spans 0 to m-1 with 0 at the top end and m-1
+	// at the bottom end.
+
+	// Weighting is only applied when a intersects its diagonal and is
+	// stored as upper or lower. Anything else (dense or zeros) is handed
+	// to the unweighted counterpart of this function.
+	if ( !bli_obj_intersects_diag( a ) ||
+	     !bli_obj_is_upper_or_lower( a ) )
+	{
+		return bli_thread_range_t2b( thr, a, bmult, start, end );
+	}
+
+	// Use bmult's default value for a's datatype as the blocking factor.
+	const num_t dt = bli_obj_dt( a );
+	const dim_t bf = bli_blksz_get_def( dt, bmult );
+
+	// Query the properties that describe the stored region of a.
+	doff_t diagoff = bli_obj_diag_offset( a );
+	uplo_t uplo    = bli_obj_uplo( a );
+	dim_t  m       = bli_obj_length( a );
+	dim_t  n       = bli_obj_width( a );
+
+	// Support implicit transposition by reflecting about the diagonal.
+	if ( bli_obj_has_trans( a ) )
+	{
+		bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
+	}
+
+	// Unconditionally reflect about the diagonal before computing the range.
+	bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
+
+	// Compute the weighted range. The same uplo value is passed for both
+	// the uplo and uplo_orig arguments, and handle_edge_low is FALSE.
+	const siz_t area = bli_thread_range_weighted_sub
+	(
+	  thr, diagoff, uplo, uplo, m, n, bf,
+	  FALSE, start, end
+	);
+
+	return area;
+}
+
+siz_t bli_thread_range_weighted_b2t
+     (
+       const thrinfo_t* thr,
+       const obj_t*     a,
+       const blksz_t*   bmult,
+             dim_t*     start,
+             dim_t*     end
+     )
+{
+	// Assign the calling thread an area-weighted range in the m dimension,
+	// where the total range spans 0 to m-1 with 0 at the bottom end and
+	// m-1 at the top end.
+
+	// Weighting is only applied when a intersects its diagonal and is
+	// stored as upper or lower. Anything else (dense or zeros) is handed
+	// to the unweighted counterpart of this function.
+	if ( !bli_obj_intersects_diag( a ) ||
+	     !bli_obj_is_upper_or_lower( a ) )
+	{
+		return bli_thread_range_b2t( thr, a, bmult, start, end );
+	}
+
+	// Use bmult's default value for a's datatype as the blocking factor.
+	const num_t dt = bli_obj_dt( a );
+	const dim_t bf = bli_blksz_get_def( dt, bmult );
+
+	// Query the properties that describe the stored region of a.
+	doff_t diagoff = bli_obj_diag_offset( a );
+	uplo_t uplo    = bli_obj_uplo( a );
+	dim_t  m       = bli_obj_length( a );
+	dim_t  n       = bli_obj_width( a );
+
+	// Support implicit transposition by reflecting about the diagonal.
+	if ( bli_obj_has_trans( a ) )
+	{
+		bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
+	}
+
+	// Unconditionally reflect about the diagonal, then rotate 180 degrees.
+	bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
+	bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n );
+
+	// Compute the weighted range. The same uplo value is passed for both
+	// the uplo and uplo_orig arguments, and handle_edge_low is TRUE.
+	const siz_t area = bli_thread_range_weighted_sub
+	(
+	  thr, diagoff, uplo, uplo, m, n, bf,
+	  TRUE, start, end
+	);
+
+	return area;
+}
+
diff --git a/frame/thread/bli_thread_range.h b/frame/thread/bli_thread_range.h
new file mode 100644
index 000000000..cf966b5a3
--- /dev/null
+++ b/frame/thread/bli_thread_range.h
@@ -0,0 +1,128 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2016, Hewlett Packard Enterprise Development LP
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_THREAD_RANGE_H
+#define BLIS_THREAD_RANGE_H
+
+// Thread range-related prototypes.
+
+BLIS_EXPORT_BLIS void bli_thread_range_sub // contiguous slab partitioning
+     (
+       const thrinfo_t* thread,
+             dim_t      n,               // size of the dimension to partition
+             dim_t      bf,              // blocking factor
+             bool       handle_edge_low, // handle the edge case at the low end?
+             dim_t*     start,           // output: start of the thread's range
+             dim_t*     end              // output: end of the thread's range
+     );
+
+#undef  GENPROT
+#define GENPROT( opname ) \
+\
+siz_t PASTEMAC0( opname ) \
+     ( \
+             dir_t      direct, \
+       const thrinfo_t* thr, \
+       const obj_t*     a, \
+       const obj_t*     b, \
+       const obj_t*     c, \
+       const cntl_t*    cntl, \
+       const cntx_t*    cntx, \
+             dim_t*     start, \
+             dim_t*     end  \
+     );
+
+GENPROT( thread_range_mdim ) // partitions the m dimension
+GENPROT( thread_range_ndim ) // partitions the n dimension
+
+#undef  GENPROT
+#define GENPROT( opname ) \
+\
+siz_t PASTEMAC0( opname ) \
+     ( \
+       const thrinfo_t* thr, \
+       const obj_t*     a, \
+       const blksz_t*   bmult, \
+             dim_t*     start, \
+             dim_t*     end  \
+     );
+
+GENPROT( thread_range_l2r ) // unweighted; also the dense/zeros fallback
+GENPROT( thread_range_r2l ) // of the weighted function with the same
+GENPROT( thread_range_t2b ) // suffix (l2r, r2l, t2b, b2t) below
+GENPROT( thread_range_b2t )
+
+GENPROT( thread_range_weighted_l2r ) // n dimension; 0 at the left end
+GENPROT( thread_range_weighted_r2l ) // n dimension; 0 at the right end
+GENPROT( thread_range_weighted_t2b ) // m dimension; 0 at the top end
+GENPROT( thread_range_weighted_b2t ) // m dimension; 0 at the bottom end
+
+
+dim_t bli_thread_range_width_l // returns the width of the jth subpartition
+     (
+       doff_t diagoff_j,        // diagonal offset of the current subpartition
+       dim_t  m,
+       dim_t  n_j,              // remainder of the n dimension
+       dim_t  j,                // index of the subpartition
+       dim_t  n_way,            // number of ways of parallelism
+       dim_t  bf,               // blocking factor
+       dim_t  bf_left,
+       double area_per_thr,     // area targeted for each thread/group
+       bool   handle_edge_low   // handle the edge case at the low end?
+     );
+siz_t bli_find_area_trap_l // returns the area of the described subpartition
+     (
+       doff_t diagoff,
+       dim_t  m,
+       dim_t  n,
+       dim_t  bf
+     );
+
+siz_t bli_thread_range_weighted_sub // returns the area of the thread's range
+     (
+       const thrinfo_t* thread,
+             doff_t     diagoff,
+             uplo_t     uplo,
+             uplo_t     uplo_orig,       // uplo prior to any rotation
+             dim_t      m,
+             dim_t      n,
+             dim_t      bf,              // blocking factor
+             bool       handle_edge_low, // handle the edge case at the low end?
+             dim_t*     j_start_thr,     // output: start of the thread's range
+             dim_t*     j_end_thr        // output: end of the thread's range
+     );
+
+#endif
diff --git a/frame/thread/bli_thread_range_slab_rr.c b/frame/thread/bli_thread_range_slab_rr.c
new file mode 100644
index 000000000..be4432309
--- /dev/null
+++ b/frame/thread/bli_thread_range_slab_rr.c
@@ -0,0 +1,134 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_thread_range_quad
+     (
+       const thrinfo_t* thread,
+             doff_t     diagoff,
+             uplo_t     uplo,
+             dim_t      m,
+             dim_t      n,
+             dim_t      bf,
+             bool       handle_edge_low,
+             dim_t*     start,
+             dim_t*     end,
+             dim_t*     inc
+     )
+{
+	// Compute the calling thread's range of micropanel indices (start, end)
+	// and the increment (inc) with which to step through that range, for a
+	// loop over n columns that are processed bf columns at a time.
+
+	// Both partitioning schemes need the number of ways of parallelism as
+	// well as the total number of micropanels, which includes one partial
+	// micropanel if n is not a multiple of bf.
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t n_iter = n / bf + ( n % bf ? 1 : 0 );
+
+#ifdef BLIS_ENABLE_JRIR_RR
+
+	// Use round-robin (interleaved) partitioning of jr/ir loops.
+	*start = bli_thrinfo_work_id( thread );
+	*end   = n_iter;
+	*inc   = jr_nt;
+
+#else // #elif defined( BLIS_ENABLE_JRIR_SLAB ) ||
+	  //       defined( BLIS_ENABLE_JRIR_TLB  )
+
+	// NOTE: While this cpp conditional branch applies to both the _SLAB and
+	// _TLB cases, this *function* should never be called when
+	// BLIS_ENABLE_JRIR_TLB is defined, since it is only called from
+	// macrokernels that were designed for slab/rr partitioning.
+
+	// If there is no parallelism in this loop, the lone thread is given
+	// every micropanel and there is nothing further to compute.
+	if ( jr_nt == 1 )
+	{
+		*start = 0;
+		*end   = n_iter;
+		*inc   = 1;
+		return;
+	}
+
+	// The thread's range, expressed in units of columns.
+	dim_t col_start, col_end;
+
+	if ( bli_intersects_diag_n( diagoff, m, n ) )
+	{
+		// The current submatrix intersects the diagonal, so try to be
+		// intelligent about how threads are assigned work by using the
+		// quadratic partitioning function. (Its return value is unused.)
+		bli_thread_range_weighted_sub
+		(
+		  thread, diagoff, uplo, uplo, m, n, bf,
+		  handle_edge_low, &col_start, &col_end
+		);
+	}
+	else
+	{
+		// The current submatrix does not intersect the diagonal, so we
+		// are free to use a uniform (and contiguous) slab partitioning.
+		bli_thread_range_sub
+		(
+		  thread, n, bf,
+		  handle_edge_low, &col_start, &col_end
+		);
+	}
+
+	// In either case, the increment in units of columns is bf.
+	const dim_t col_inc = bf;
+
+	// Convert the column indices into micropanel indices by dividing by
+	// the blocking factor (which, for the jr loop, is NR), rounding up
+	// whenever the division leaves a remainder. Rounding up is needed
+	// when t threads share t-1 or fewer micropanels of work, including
+	// an edge case. For example, with t = 3, n = 10, and bf = NR = 8:
+	//
+	//                  column index           upanel index
+	//   tid 0:  start, end =  0,  8  ->  start, end = 0, 1
+	//   tid 1:  start, end =  8, 10  ->  start, end = 1, 2
+	//   tid 2:  start, end = 10, 10  ->  start, end = 2, 2
+	//
+	// Thread (tid) 2 must receive no work, which is expressed in column
+	// indices as start = end = n (a non-existent column index).
+
+	*start = col_start / bf + ( col_start % bf ? 1 : 0 );
+	*end   = col_end   / bf + ( col_end   % bf ? 1 : 0 );
+	*inc   = col_inc   / bf;
+
+#endif
+}
diff --git a/frame/thread/bli_thread_range_slab_rr.h b/frame/thread/bli_thread_range_slab_rr.h
new file mode 100644
index 000000000..3e9797363
--- /dev/null
+++ b/frame/thread/bli_thread_range_slab_rr.h
@@ -0,0 +1,116 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_THREAD_RANGE_SLAB_RR_H
+#define BLIS_THREAD_RANGE_SLAB_RR_H
+
+BLIS_INLINE void bli_thread_range_rr
+     (
+       const thrinfo_t* thread,
+             dim_t      n,
+             dim_t      bf,
+             bool       handle_edge_low,
+             dim_t*     start,
+             dim_t*     end,
+             dim_t*     inc
+     )
+{
+	// Use round-robin (interleaved) partitioning of jr/ir loops. Note that
+	// handle_edge_low does not influence the result.
+	const dim_t n_whole   = n / bf;
+	const dim_t n_partial = ( n % bf ? 1 : 0 );
+
+	*start = bli_thrinfo_work_id( thread );
+	*end   = n_whole + n_partial;
+	*inc   = bli_thrinfo_n_way( thread );
+}
+
+BLIS_INLINE void bli_thread_range_sl
+     (
+       const thrinfo_t* thread,
+             dim_t      n,
+             dim_t      bf,
+             bool       handle_edge_low,
+             dim_t*     start,
+             dim_t*     end,
+             dim_t*     inc
+     )
+{
+	// Use contiguous slab partitioning of jr/ir loops; inc is always 1.
+	bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end );
+	*inc = 1;
+}
+
+BLIS_INLINE void bli_thread_range_slrr
+     (
+       const thrinfo_t* thread,
+             dim_t      n,
+             dim_t      bf,
+             bool       handle_edge_low,
+             dim_t*     start,
+             dim_t*     end,
+             dim_t*     inc
+     )
+{
+	// General-purpose slab/rr partitioning whose behavior is chosen at
+	// configure-time: round-robin when BLIS_ENABLE_JRIR_RR is defined, and
+	// contiguous slabs otherwise. Note that the slab code path is therefore
+	// also used when tlb is enabled. If this is ever changed, make sure to
+	// change bli_is_my_iter() as well, since packm uses the two together.
+
+#ifndef BLIS_ENABLE_JRIR_RR // i.e., _SLAB or _TLB
+	bli_thread_range_sl( thread, n, bf, handle_edge_low, start, end, inc );
+#else
+	bli_thread_range_rr( thread, n, bf, handle_edge_low, start, end, inc );
+#endif
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_thread_range_quad
+     (
+       const thrinfo_t* thread,
+             doff_t     diagoff,
+             uplo_t     uplo,
+             dim_t      m,
+             dim_t      n,
+             dim_t      bf,              // blocking factor (NR for the jr loop)
+             bool       handle_edge_low,
+             dim_t*     start,           // output: start of the micropanel range
+             dim_t*     end,             // output: end of the micropanel range
+             dim_t*     inc              // output: increment through the range
+     );
+
+#endif
+
diff --git a/frame/thread/bli_thread_range_tlb.c b/frame/thread/bli_thread_range_tlb.c
new file mode 100644
index 000000000..546ed341d
--- /dev/null
+++ b/frame/thread/bli_thread_range_tlb.c
@@ -0,0 +1,1699 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+// -----------------------------------------------------------------------------
+
+#define PRINT_MODE        // when defined, the trmm tid loop runs to nt, not nt - 1
+#define PGUARD if ( 0 )   // prefix on debug printf()s; 'if ( 0 )' keeps them from running
+//#define PRINT_RESULT    // uncomment to print each call's j_st, i_st, and n_ut_for_me
+
+
+#if 0
+dim_t bli_thread_range_tlb
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const uplo_t uplo,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     )
+{
+	// Dispatch on uplo to the lower, upper, or dense tlb range function.
+	// Each one stores the starting utile offsets to *j_st_p and *i_st_p and
+	// returns the number of utiles assigned to thread tid.
+
+	if ( bli_is_lower( uplo ) )
+	{
+		return bli_thread_range_tlb_l
+		(
+		  nt, tid, diagoff, m_iter, n_iter, mr, nr, j_st_p, i_st_p
+		);
+	}
+
+	if ( bli_is_upper( uplo ) )
+	{
+		return bli_thread_range_tlb_u
+		(
+		  nt, tid, diagoff, m_iter, n_iter, mr, nr, j_st_p, i_st_p
+		);
+	}
+
+	// Otherwise, the region is treated as dense (diagoff is not needed).
+	return bli_thread_range_tlb_d
+	(
+	  nt, tid, m_iter, n_iter, mr, nr, j_st_p, i_st_p
+	);
+}
+#endif
+
+// -----------------------------------------------------------------------------
+
+dim_t bli_thread_range_tlb_l
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     )
+{
+	// Tile-level load balancing for a lower-trapezoidal submatrix: the
+	// referenced microtiles (utiles) are divided so that per-thread counts
+	// differ by at most one utile. The starting utile column/row for thread
+	// tid is stored to *j_st_p / *i_st_p, and the number of utiles assigned to
+	// tid is returned. No effort is made to account for workload differences
+	// attributable to edge-case (or diagonal-intersecting) microtiles, which
+	// incur slightly more work since they must first write to a temporary
+	// microtile before updating the output C matrix.
+
+	// Assumption: -mr < diagoff. Make sure to prune leading rows beforehand!
+	if ( diagoff <= -mr ) bli_abort();
+
+	//
+	// -- Step 1: Compute the computational area of the region -----------------
+	//
+
+	// Compute the m and n dimensions according to m_iter and n_iter. (These
+	// m and n dims will likely be larger than the actual m and n since they
+	// "round up" the edge case microtiles into full-sized microtiles.)
+	const dim_t m = m_iter * mr;
+	const dim_t n = n_iter * nr;
+
+	// For the purposes of many computations in this function, we aren't
+	// interested in the extent to which diagoff exceeds n (if it does),
+	// so we use a new variable that is guaranteed to be no greater than n.
+	const doff_t diagoffmin = bli_min( diagoff, n );
+
+	const dim_t m_rect = m;
+	const dim_t n_rect = ( diagoffmin / nr ) * nr;
+
+	const dim_t rect_area    = m_rect * n_rect;
+	const dim_t nonrect_area = m * n - rect_area;
+
+	//const dim_t offn_rect       = 0;
+	const dim_t offn_nonrect    = n_rect;
+	const dim_t diagoff_nonrect = diagoffmin - n_rect; //diagoff % nr;
+
+	const dim_t n_nonrect       = n - n_rect;
+
+	const dim_t offn_ut_nonrect = ( diagoffmin / nr );
+
+	PGUARD printf( "---------------------------\n" );
+	PGUARD printf( "min(diagoff,n):     %7ld\n", diagoffmin );
+	PGUARD printf( "offn_ut_nonrect:    %7ld\n", offn_ut_nonrect );
+	PGUARD printf( "offn_nonrect:       %7ld\n", offn_nonrect );
+	PGUARD printf( "diagoff_nonrect:    %7ld\n", diagoff_nonrect );
+	PGUARD printf( "n_nonrect:          %7ld\n", n_nonrect );
+	PGUARD printf( "---------------------------\n" );
+
+	dim_t num_unref_ut = 0;
+
+	// Count the number of unreferenced utiles strictly above the diagonal.
+	for ( dim_t j = 0; j < n_nonrect; j += nr )
+	{
+		const dim_t diagoff_j = diagoff_nonrect - j;
+
+		// diagoff_j will always be at most nr - 1, but will typically be
+		// negative. This is because the non-rectangular region's diagonal
+		// offset will be at most nr - 1 for the first column of microtiles,
+		// since if it were more than nr - 1, that column would have already
+		// been pruned away (via the implicit pruning of diagoff_nonrect).
+		// NOTE: We use bli_max() to ensure that -diagoff_j / mr does not
+		// become negative, which can only happen if "top" pruning is not
+		// performed beforehand (and so it really isn't necessary here).
+		const dim_t num_unref_ut_j = bli_max( ( -diagoff_j / mr ), 0 );
+
+		num_unref_ut += num_unref_ut_j;
+
+		PGUARD printf( "j                   %7ld\n", j );
+		PGUARD printf( "diagoff_j           %7ld\n", diagoff_j );
+		PGUARD printf( "num_unref_ut_j      %7ld\n", num_unref_ut_j );
+		PGUARD printf( "num_unref_ut        %7ld\n", num_unref_ut );
+		PGUARD printf( "\n" );
+	}
+	PGUARD printf( "---------------------------\n" );
+
+	const dim_t tri_unref_area = num_unref_ut * mr * nr;
+	const dim_t tri_ref_area   = nonrect_area - tri_unref_area;
+	const dim_t total_ref_area = rect_area + tri_ref_area;
+
+	PGUARD printf( "gross area:         %7ld\n", m * n );
+	PGUARD printf( "rect_area:          %7ld\n", rect_area );
+	PGUARD printf( "nonrect_area:       %7ld\n", nonrect_area );
+	PGUARD printf( "tri_unref_area:     %7ld\n", tri_unref_area );
+	PGUARD printf( "tri_ref_area:       %7ld\n", tri_ref_area );
+	PGUARD printf( "total_ref_area:     %7ld\n", total_ref_area );
+	PGUARD printf( "---------------------------\n" );
+
+	//
+	// -- Step 2: Compute key utile counts (per thread, per column, etc.) ------
+	//
+
+	const dim_t n_ut_ref      = total_ref_area / ( mr * nr );
+	//const dim_t n_ut_tri_ref  = tri_ref_area   / ( mr * nr );
+	const dim_t n_ut_rect     = rect_area      / ( mr * nr );
+
+	PGUARD printf( "n_ut_ref:           %7ld\n", n_ut_ref );
+	//PGUARD printf( "n_ut_tri_ref:       %7ld\n", n_ut_tri_ref );
+	PGUARD printf( "n_ut_rect:          %7ld\n", n_ut_rect );
+	PGUARD printf( "---------------------------\n" );
+
+	// Compute the number of microtiles to allocate per thread as well as the
+	// number of leftover microtiles.
+	const dim_t n_ut_per_thr = n_ut_ref / nt;
+	const dim_t n_ut_pt_left = n_ut_ref % nt;
+
+	PGUARD printf( "n_ut_per_thr:       %7ld\n", n_ut_per_thr );
+	PGUARD printf( "n_ut_pt_left:       %7ld\n", n_ut_pt_left );
+	PGUARD printf( "---------------------------\n" );
+
+	const dim_t n_ut_per_col = m_iter;
+
+	PGUARD printf( "n_ut_per_col:       %7ld\n", n_ut_per_col );
+
+	// Allocate one of the leftover microtiles to the current thread if its
+	// tid is one of the lower thread ids.
+	const dim_t n_ut_for_me = n_ut_per_thr + ( tid < n_ut_pt_left ? 1 : 0 );
+
+	PGUARD printf( "n_ut_for_me:        %7ld (%ld+%ld)\n", n_ut_for_me,
+	               n_ut_per_thr, n_ut_for_me - n_ut_per_thr );
+
+	// Compute the number of utiles prior to the current thread's starting
+	// point. This is the sum of all n_ut_for_me for all thread ids less
+	// than tid. Notice that the second half of this expression effectively
+	// adds one extra microtile for each lower-valued thread id, up to
+	// n_ut_pt_left.
+	const dim_t n_ut_before = tid * n_ut_per_thr + bli_min( tid, n_ut_pt_left );
+
+	PGUARD printf( "n_ut_before:        %7ld\n", n_ut_before );
+	PGUARD printf( "---------------------------\n" );
+
+	//
+	// -- Step 3: Compute the starting j/i utile offset for a given tid --------
+	//
+
+	dim_t j_st;
+	dim_t i_st;
+
+	if ( n_ut_before < n_ut_rect )
+	{
+		// This branch handles scenarios where the number of microtiles
+		// assigned to lower thread ids is strictly less than the number of
+		// utiles in the rectangular region. This means that calculating the
+		// starting microtile index is easy (because it does not need to
+		// take the location of the diagonal into account).
+
+		PGUARD printf( "Rectangular region: n_ut_before < n_ut_rect\n" );
+		PGUARD printf( "\n" );
+
+		const dim_t ut_index_rect_st = n_ut_before;
+
+		PGUARD printf( "ut_index_st:        %7ld\n", ut_index_rect_st );
+		PGUARD printf( "---------------------------\n" );
+
+		j_st = ut_index_rect_st / n_ut_per_col;
+		i_st = ut_index_rect_st % n_ut_per_col;
+
+		PGUARD printf( "j_st, i_st (fnl=)      %4ld,%4ld\n", j_st, i_st );
+	}
+	else // if ( n_ut_rect <= n_ut_before )
+	{
+		// This branch handles scenarios where the number of microtiles
+		// assigned to lower thread ids exceeds (or equals) the number of
+		// utiles in the rectangular region. This means we need to observe the
+		// location of the diagonal to see how many utiles are referenced per
+		// column of utiles.
+
+		PGUARD printf( "Diagonal region: n_ut_rect <= n_ut_before\n" );
+		PGUARD printf( "\n" );
+
+		// This will be the number of microtile columns we will immediately
+		// advance past to get to the diagonal region.
+		const dim_t n_ut_col_adv = offn_ut_nonrect;
+
+		PGUARD printf( "n_ut_col_adv:       %7ld\n", n_ut_col_adv );
+
+		// In order to find j_st and i_st, we need to "allocate" n_ut_before
+		// microtiles.
+		dim_t n_ut_tba = n_ut_before;
+
+		PGUARD printf( "n_ut_tba:           %7ld\n", n_ut_tba );
+
+		// Advance past the rectangular region, decrementing n_ut_tba
+		// accordingly.
+		n_ut_tba -= n_ut_per_col * n_ut_col_adv;
+
+		PGUARD printf( "n_ut_tba_1:         %7ld\n", n_ut_tba );
+		PGUARD printf( "\n" );
+
+		// In case n_ut_tba == 0. Only happens when n_ut_before == n_ut_rect.
+		j_st = n_ut_col_adv;
+		i_st = 0;
+
+		for ( dim_t j = n_ut_col_adv; 0 < n_ut_tba; ++j )
+		{
+			const dim_t diagoff_j     = diagoffmin - j*nr;
+			const dim_t n_ut_skip_j   = bli_max( -diagoff_j / mr, 0 );
+			const dim_t n_ut_this_col = n_ut_per_col - n_ut_skip_j;
+
+			PGUARD printf( "j:                  %7ld\n", j );
+			PGUARD printf( "diagoff_j:          %7ld\n", diagoff_j );
+			PGUARD printf( "n_ut_skip_j:        %7ld\n", n_ut_skip_j );
+			PGUARD printf( "n_ut_this_col:      %7ld\n", n_ut_this_col );
+			PGUARD printf( "n_ut_tba_j0:        %7ld\n", n_ut_tba );
+
+			if ( n_ut_tba < n_ut_this_col )
+			{
+				// If the number of utiles to allocate is less than the number
+				// in this column, we know that j_st will refer to the current
+				// column. To find i_st, we first skip to the utile that
+				// intersects the diagonal and then add n_ut_tba.
+				j_st = j;
+				i_st = n_ut_skip_j + n_ut_tba;
+				PGUARD printf( "j_st, i_st (fnl<)      %4ld,%4ld\n", j_st, i_st );
+			}
+			else if ( n_ut_tba == n_ut_this_col )
+			{
+				// If the number of utiles to allocate is exactly equal to the
+				// number in this column, we know that j_st will refer to the
+				// *next* column. But to find i_st, we will have to take the
+				// location of the diagonal into account.
+				const doff_t diagoff_jp1   = diagoff_j - nr;
+				const dim_t  n_ut_skip_jp1 = bli_max( -diagoff_jp1 / mr, 0 );
+
+				j_st = j + 1;
+				i_st = n_ut_skip_jp1;
+				PGUARD printf( "j_st, i_st (fnl=)      %4ld,%4ld\n", j_st, i_st );
+			}
+
+			// No matter what (especially if the number of utiles to allocate
+			// exceeds the number in this column), we decrement n_ut_tba and
+			// attempt to continue to the next iteration. (Note: If either of the
+			// two branches above is triggered, n_ut_tba will be decremented down
+			// to zero (or less), in which case this will be the final iteration.)
+			n_ut_tba -= n_ut_this_col;
+
+			PGUARD printf( "n_ut_tba_j1:        %7ld\n", n_ut_tba );
+			PGUARD printf( "\n" );
+		}
+	}
+
+	//
+	// -- Step 4: Save the results ---------------------------------------------
+	//
+
+	*j_st_p = j_st;
+	*i_st_p = i_st;
+
+	#ifdef PRINT_RESULT
+	printf( "j_st, i_st (mem)       %4ld,%4ld  (n_ut: %4ld)\n",
+	        j_st, i_st, n_ut_for_me );
+	#endif
+
+	// Return the number of utiles that this thread was allocated.
+	return n_ut_for_me;
+}
+
+// -----------------------------------------------------------------------------
+
+dim_t bli_thread_range_tlb_u
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     )
+{
+	// Tile-level load balancing for an upper-trapezoidal submatrix: the
+	// referenced microtiles (utiles) are divided so that per-thread counts
+	// differ by at most one utile. The starting utile column/row for thread
+	// tid is stored to *j_st_p / *i_st_p, and the number of utiles assigned to
+	// tid is returned. No effort is made to account for workload differences
+	// attributable to edge-case (or diagonal-intersecting) microtiles, which
+	// incur slightly more work since they must first write to a temporary
+	// microtile before updating the output C matrix.
+
+	// Assumption: diagoff < nr. Make sure to prune leading columns beforehand!
+	if ( nr <= diagoff ) bli_abort();
+
+	//
+	// -- Step 1: Compute the computational area of the region -----------------
+	//
+
+	// Compute the m and n dimensions according to m_iter and n_iter. (These
+	// m and n dims will likely be larger than the actual m and n since they
+	// "round up" the edge case microtiles into full-sized microtiles.)
+	const dim_t m = m_iter * mr;
+	const dim_t n = n_iter * nr;
+
+	// For the purposes of many computations in this function, we aren't
+	// interested in the extent to which diagoff falls below -m (if it does),
+	// so we clamp it to be no less than -m (diagoffmin is a bli_max() here).
+	const doff_t diagoffmin = bli_max( diagoff, -m );
+
+	const dim_t m_rect = m;
+	const dim_t n_rect = ( -diagoffmin / nr ) * nr;
+
+	const dim_t rect_area    = m_rect * n_rect;
+	const dim_t nonrect_area = m * n - rect_area;
+
+	const dim_t offn_rect       = n - n_rect;
+	//const dim_t offn_nonrect    = 0;
+	const dim_t diagoff_nonrect = diagoffmin;
+
+	const dim_t n_nonrect       = n - n_rect;
+
+	const dim_t offn_ut_rect    = n_iter + ( diagoffmin / nr );
+
+	PGUARD printf( "---------------------------\n" );
+	PGUARD printf( "max(diagoff,-m):    %7ld\n", diagoffmin );
+	PGUARD printf( "offn_ut_rect:       %7ld\n", offn_ut_rect );
+	PGUARD printf( "offn_rect:          %7ld\n", offn_rect );
+	PGUARD printf( "diagoff_nonrect:    %7ld\n", diagoff_nonrect );
+	PGUARD printf( "n_nonrect:          %7ld\n", n_nonrect );
+	PGUARD printf( "---------------------------\n" );
+
+	dim_t num_unref_ut = 0;
+
+	// Count the number of unreferenced utiles strictly below the diagonal.
+	for ( dim_t j = 0; j < n_nonrect; j += nr )
+	{
+		const dim_t diagoff_j = diagoff_nonrect - j;
+
+		// diagoff_j will always be at most nr - 1, but will typically be
+		// negative. This is because the non-rectangular region's diagonal
+		// offset will be at most nr - 1 for the first column of microtiles,
+		// since if it were more than nr - 1, that column would have already
+		// been pruned away (prior to this function being called).
+		// NOTE: We use bli_max() to ensure that ( m + diagoff_j - nr ) / mr
+		// does not become negative, which can happen in some situations
+		// during the first iteration if diagoff is relatively close to -m.
+		// NOTE: We subtract nr from diagoff_j since it's really the diagonal
+		// offset of the *next* column of utiles that needs to be used to
+		// determine how many utiles are referenced in the current column.
+		const dim_t num_unref_ut_j = bli_max( ( m + diagoff_j - nr ) / mr, 0 );
+
+		num_unref_ut += num_unref_ut_j;
+
+		PGUARD printf( "j                   %7ld\n", j );
+		PGUARD printf( "diagoff_j - nr      %7ld\n", diagoff_j - nr );
+		PGUARD printf( "num_unref_ut_j      %7ld\n", num_unref_ut_j );
+		PGUARD printf( "num_unref_ut        %7ld\n", num_unref_ut );
+		PGUARD printf( "\n" );
+	}
+	PGUARD printf( "---------------------------\n" );
+
+	const dim_t tri_unref_area = num_unref_ut * mr * nr;
+	const dim_t tri_ref_area   = nonrect_area - tri_unref_area;
+	const dim_t total_ref_area = rect_area + tri_ref_area;
+
+	PGUARD printf( "gross area:         %7ld\n", m * n );
+	PGUARD printf( "rect_area:          %7ld\n", rect_area );
+	PGUARD printf( "nonrect_area:       %7ld\n", nonrect_area );
+	PGUARD printf( "tri_unref_area:     %7ld\n", tri_unref_area );
+	PGUARD printf( "tri_ref_area:       %7ld\n", tri_ref_area );
+	PGUARD printf( "total_ref_area:     %7ld\n", total_ref_area );
+	PGUARD printf( "---------------------------\n" );
+
+	//
+	// -- Step 2: Compute key utile counts (per thread, per column, etc.) ------
+	//
+
+	const dim_t n_ut_ref      = total_ref_area / ( mr * nr );
+	const dim_t n_ut_tri_ref  = tri_ref_area   / ( mr * nr );
+	//const dim_t n_ut_rect     = rect_area      / ( mr * nr );
+
+	PGUARD printf( "n_ut_ref:           %7ld\n", n_ut_ref );
+	PGUARD printf( "n_ut_tri_ref:       %7ld\n", n_ut_tri_ref );
+	//PGUARD printf( "n_ut_rect:          %7ld\n", n_ut_rect );
+	PGUARD printf( "---------------------------\n" );
+
+	// Compute the number of microtiles to allocate per thread as well as the
+	// number of leftover microtiles.
+	const dim_t n_ut_per_thr = n_ut_ref / nt;
+	const dim_t n_ut_pt_left = n_ut_ref % nt;
+
+	PGUARD printf( "n_ut_per_thr:       %7ld\n", n_ut_per_thr );
+	PGUARD printf( "n_ut_pt_left:       %7ld\n", n_ut_pt_left );
+	PGUARD printf( "---------------------------\n" );
+
+	const dim_t n_ut_per_col = m_iter;
+
+	PGUARD printf( "n_ut_per_col:       %7ld\n", n_ut_per_col );
+
+	// Allocate one of the leftover microtiles to the current thread if its
+	// tid is one of the lower thread ids.
+	const dim_t n_ut_for_me = n_ut_per_thr + ( tid < n_ut_pt_left ? 1 : 0 );
+
+	PGUARD printf( "n_ut_for_me:        %7ld (%ld+%ld)\n", n_ut_for_me,
+	               n_ut_per_thr, n_ut_for_me - n_ut_per_thr );
+
+	// Compute the number of utiles prior to the current thread's starting
+	// point. This is the sum of all n_ut_for_me for all thread ids less
+	// than tid. Notice that the second half of this expression effectively
+	// adds one extra microtile for each lower-valued thread id, up to
+	// n_ut_pt_left.
+	const dim_t n_ut_before = tid * n_ut_per_thr + bli_min( tid, n_ut_pt_left );
+
+	PGUARD printf( "n_ut_before:        %7ld\n", n_ut_before );
+	PGUARD printf( "---------------------------\n" );
+
+	//
+	// -- Step 3: Compute the starting j/i utile offset for a given tid --------
+	//
+
+	dim_t j_st;
+	dim_t i_st;
+
+	if ( n_ut_tri_ref <= n_ut_before )
+	{
+		// This branch handles scenarios where the number of microtiles
+		// assigned to lower thread ids exceeds (or equals) the number of
+		// utiles in the diagonal region. This means that calculating the
+		// starting microtile index is easy (because it does not need to
+		// take the location of the diagonal into account).
+
+		PGUARD printf( "Rectangular region: n_ut_tri_ref <= n_ut_before\n" );
+		PGUARD printf( "\n" );
+
+		const dim_t ut_index_rect_st = n_ut_before - n_ut_tri_ref;
+
+		PGUARD printf( "ut_index_rect_st:   %7ld\n", ut_index_rect_st );
+		PGUARD printf( "---------------------------\n" );
+
+		j_st = offn_ut_rect + ut_index_rect_st / n_ut_per_col;
+		i_st =                ut_index_rect_st % n_ut_per_col;
+
+		PGUARD printf( "j_st, i_st (fnl=)      %4ld,%4ld\n", j_st, i_st );
+	}
+	else // if ( n_ut_before < n_ut_tri_ref )
+	{
+		// This branch handles scenarios where the number of microtiles
+		// assigned to lower thread ids is strictly less than the number of
+		// utiles in the diagonal region. This means we need to observe the
+		// location of the diagonal to see how many utiles are referenced per
+		// column of utiles.
+
+		PGUARD printf( "Diagonal region: n_ut_before < n_ut_tri_ref\n" );
+		PGUARD printf( "\n" );
+
+		// This will be the number of microtile columns we will immediately
+		// advance past to get to the diagonal region.
+		const dim_t n_ut_col_adv = 0;
+
+		PGUARD printf( "n_ut_col_adv:       %7ld\n", n_ut_col_adv );
+
+		// In order to find j_st and i_st, we need to "allocate" n_ut_before
+		// microtiles.
+		dim_t n_ut_tba = n_ut_before;
+
+		PGUARD printf( "n_ut_tba:           %7ld\n", n_ut_tba );
+
+		// No need to advance since the upper-trapezoid begins with the
+		// diagonal region.
+		//n_ut_tba -= 0;
+
+		PGUARD printf( "n_ut_tba_1:         %7ld\n", n_ut_tba );
+		PGUARD printf( "\n" );
+
+		// In case n_ut_tba == 0. Only happens when n_ut_before == 0.
+		j_st = 0;
+		i_st = 0;
+
+		for ( dim_t j = n_ut_col_adv; 0 < n_ut_tba; ++j )
+		{
+			const dim_t diagoff_j     = diagoffmin - j*nr;
+			const dim_t n_ut_skip_j   = bli_max( ( m + diagoff_j - nr ) / mr, 0 );
+			const dim_t n_ut_this_col = n_ut_per_col - n_ut_skip_j;
+
+			PGUARD printf( "j:                  %7ld\n", j );
+			PGUARD printf( "diagoff_j:          %7ld\n", diagoff_j );
+			PGUARD printf( "n_ut_skip_j:        %7ld\n", n_ut_skip_j );
+			PGUARD printf( "n_ut_this_col:      %7ld\n", n_ut_this_col );
+			PGUARD printf( "n_ut_tba_j0:        %7ld\n", n_ut_tba );
+
+			if ( n_ut_tba < n_ut_this_col )
+			{
+				// If the number of utiles to allocate is less than the number
+				// in this column, we know that j_st will refer to the current
+				// column. To find i_st, we simply use n_ut_tba.
+				j_st = j;
+				i_st = n_ut_tba;
+				PGUARD printf( "j_st, i_st (fnl<)      %4ld,%4ld\n", j_st, i_st );
+			}
+			else if ( n_ut_tba == n_ut_this_col )
+			{
+				// If the number of utiles to allocate is exactly equal to the
+				// number in this column, we know that j_st will refer to the
+				// *next* column. In this situation, i_st will always be 0.
+
+				j_st = j + 1;
+				i_st = 0;
+				PGUARD printf( "j_st, i_st (fnl=)      %4ld,%4ld\n", j_st, i_st );
+			}
+
+			// No matter what (especially if the number of utiles to allocate
+			// exceeds the number in this column), we decrement n_ut_tba and
+			// attempt to continue to the next iteration. (Note: If either of the
+			// two branches above is triggered, n_ut_tba will be decremented down
+			// to zero (or less), in which case this will be the final iteration.)
+			n_ut_tba -= n_ut_this_col;
+
+			PGUARD printf( "n_ut_tba_j1:        %7ld\n", n_ut_tba );
+			PGUARD printf( "\n" );
+		}
+	}
+
+	//
+	// -- Step 4: Save the results ---------------------------------------------
+	//
+
+	*j_st_p = j_st;
+	*i_st_p = i_st;
+
+	#ifdef PRINT_RESULT
+	printf( "j_st, i_st (mem)       %4ld,%4ld  (n_ut: %4ld)\n",
+	         j_st, i_st, n_ut_for_me );
+	#endif
+
+	// Return the number of utiles that this thread was allocated.
+	return n_ut_for_me;
+}
+
+// -----------------------------------------------------------------------------
+
+dim_t bli_thread_range_tlb_d
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     )
+{
+	// Tile-level load balancing for a general/dense submatrix: all m_iter *
+	// n_iter microtiles (utiles) are divided among the nt threads so that
+	// per-thread counts differ by at most one utile. The starting utile
+	// column/row for thread tid is stored to *j_st_p / *i_st_p, and the number
+	// of utiles assigned to tid is returned. No effort is made to account for
+	// workload differences attributable to edge-case microtiles, which incur
+	// slightly more work since they must first write to a temporary microtile
+	// before updating the output C matrix.
+
+	//
+	// -- Step 1: Compute the computational area of the region -----------------
+	//
+
+	// Compute the m and n dimensions according to m_iter and n_iter. (These
+	// m and n dims will likely be larger than the actual m and n since they
+	// "round up" the edge case microtiles into full-sized microtiles.)
+	const dim_t m = m_iter * mr;
+	const dim_t n = n_iter * nr;
+
+	const dim_t m_rect = m;
+	const dim_t n_rect = n;
+
+	const dim_t total_ref_area = m_rect * n_rect;
+
+	PGUARD printf( "total_ref_area:     %7ld\n", total_ref_area );
+	PGUARD printf( "---------------------------\n" );
+
+	//
+	// -- Step 2: Compute key utile counts (per thread, per column, etc.) ------
+	//
+
+	const dim_t n_ut_ref = total_ref_area / ( mr * nr );
+
+	PGUARD printf( "n_ut_ref:           %7ld\n", n_ut_ref );
+	PGUARD printf( "---------------------------\n" );
+
+	// Compute the number of microtiles to allocate per thread as well as the
+	// number of leftover microtiles.
+	const dim_t n_ut_per_thr = n_ut_ref / nt;
+	const dim_t n_ut_pt_left = n_ut_ref % nt;
+
+	PGUARD printf( "n_ut_per_thr:       %7ld\n", n_ut_per_thr );
+	PGUARD printf( "n_ut_pt_left:       %7ld\n", n_ut_pt_left );
+	PGUARD printf( "---------------------------\n" );
+
+	const dim_t n_ut_per_col = m_iter;
+
+	PGUARD printf( "n_ut_per_col:       %7ld\n", n_ut_per_col );
+
+	// Allocate one of the leftover microtiles to the current thread if its
+	// tid is one of the lower thread ids.
+	const dim_t n_ut_for_me = n_ut_per_thr + ( tid < n_ut_pt_left ? 1 : 0 );
+
+	PGUARD printf( "n_ut_for_me:        %7ld (%ld+%ld)\n", n_ut_for_me,
+	               n_ut_per_thr, n_ut_for_me - n_ut_per_thr );
+
+	// Compute the number of utiles prior to the current thread's starting
+	// point. This is the sum of all n_ut_for_me for all thread ids less
+	// than tid. Notice that the second half of this expression effectively
+	// adds one extra microtile for each lower-valued thread id, up to
+	// n_ut_pt_left.
+	const dim_t n_ut_before = tid * n_ut_per_thr + bli_min( tid, n_ut_pt_left );
+
+	PGUARD printf( "n_ut_before:        %7ld\n", n_ut_before );
+	PGUARD printf( "---------------------------\n" );
+
+	//
+	// -- Step 3: Compute the starting j/i utile offset for a given tid --------
+	//
+
+	const dim_t ut_index_st = n_ut_before;
+
+	PGUARD printf( "ut_index_st:        %7ld\n", ut_index_st );
+	PGUARD printf( "---------------------------\n" );
+
+	const dim_t j_st = ut_index_st / n_ut_per_col;
+	const dim_t i_st = ut_index_st % n_ut_per_col;
+
+	//
+	// -- Step 4: Save the results ---------------------------------------------
+	//
+
+	*j_st_p = j_st;
+	*i_st_p = i_st;
+
+	#ifdef PRINT_RESULT
+	printf( "j_st, i_st (mem)       %4ld,%4ld  (n_ut: %4ld)\n",
+	        j_st, i_st, n_ut_for_me );
+	#endif
+
+	// Return the number of utiles that this thread was allocated.
+	return n_ut_for_me;
+}
+
+// -----------------------------------------------------------------------------
+
+BLIS_INLINE dim_t bli_tlb_trmm_lx_k_iter
+     (
+       const doff_t diagoff_iter,
+       const uplo_t uplo,
+       const dim_t  k_iter,
+       const dim_t  ir_iter
+     )
+{
+	// Lower: capped at k_iter. Upper (otherwise): reduced by a clamped offset.
+	return ( bli_is_lower( uplo )
+	         ? bli_min( diagoff_iter + ( ir_iter + 1 ), k_iter )
+	         : k_iter - bli_max( diagoff_iter + ir_iter, 0 ) );
+}
+
+BLIS_INLINE dim_t bli_tlb_trmm_rl_k_iter
+     (
+       const doff_t diagoff_iter,
+       const dim_t  k_iter,
+       const dim_t  jr_iter
+     )
+{
+	return k_iter - bli_max( -diagoff_iter + jr_iter, 0 ); // never exceeds k_iter
+}
+
+// -----------------------------------------------------------------------------
+
+dim_t bli_thread_range_tlb_trmm_ll
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  k_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     )
+{
+	const uplo_t uplo = BLIS_LOWER; // Fix uplo, then defer to the shared lx impl.
+	return bli_thread_range_tlb_trmm_lx_impl
+	(
+	  nt, tid, diagoff, uplo, m_iter, n_iter, k_iter, mr, nr, j_st_p, i_st_p
+	);
+}
+
+dim_t bli_thread_range_tlb_trmm_lu
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  k_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     )
+{
+	const uplo_t uplo = BLIS_UPPER; // Fix uplo, then defer to the shared lx impl.
+	return bli_thread_range_tlb_trmm_lx_impl
+	(
+	  nt, tid, diagoff, uplo, m_iter, n_iter, k_iter, mr, nr, j_st_p, i_st_p
+	);
+}
+
+dim_t bli_thread_range_tlb_trmm_lx_impl
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const uplo_t uplo,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  k_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     )
+{
+	// Assumption: 0 <= diagoff (lower); diagoff <= 0 (upper).
+	// Make sure to prune leading rows (lower) or columns (upper) beforehand!
+	if      ( bli_is_lower( uplo ) && diagoff < 0 ) bli_abort();
+	else if ( bli_is_upper( uplo ) && diagoff > 0 ) bli_abort();
+
+	// Single-threaded cases are simple and allow early returns.
+	if ( nt == 1 )
+	{
+		const dim_t n_ut_for_me = m_iter * n_iter;
+
+		*j_st_p = 0;
+		*i_st_p = 0;
+
+		return n_ut_for_me;
+	}
+
+	//
+	// -- Step 1: Compute the computational flop cost of each utile column -----
+	//
+
+	// Normalize the diagonal offset by mr so that it represents the offset in
+	// units of mr x mr chunks.
+	const doff_t diagoff_iter = diagoff / mr;
+
+	// Determine the actual k dimension, in units of mr x mr iterations, capped
+	// by the k_iter given by the caller.
+
+	PGUARD printf( "---------------------------\n" );
+	PGUARD printf( "m_iter:             %7ld\n", m_iter );
+	PGUARD printf( "n_iter:             %7ld\n", n_iter );
+	PGUARD printf( "k_iter:             %7ld\n", k_iter );
+	PGUARD printf( "mr:                 %7ld\n", mr );
+	PGUARD printf( "nr:                 %7ld\n", nr );
+	PGUARD printf( "diagoff_iter:       %7ld\n", diagoff_iter );
+
+	dim_t uops_per_col = 0;
+
+	// Compute the computation flop cost of each microtile column, normalized
+	// by the number of flops performed by each mr x nr rank-1 update. This
+	// is simply the sum of all of the k dimensions of each micropanel, up to
+	// and including (lower) or starting from (upper) the part that intersects
+	// the diagonal, or the right (lower) or left (upper) edge of the matrix,
+	// as applicable.
+	for ( dim_t i = 0; i < m_iter; ++i )
+	{
+		// Don't allow k_a1011 to exceed k_iter, which is the maximum possible
+		// k dimension (in units of mr x mr chunks of micropanel).
+		const dim_t k_i_iter
+		= bli_tlb_trmm_lx_k_iter( diagoff_iter, uplo, k_iter, i );
+
+		uops_per_col += k_i_iter;
+	}
+
+	PGUARD printf( "uops_per_col:       %7ld\n", uops_per_col );
+
+	//
+	// -- Step 2: Compute key flop counts (per thread, per column, etc.) -------
+	//
+
+	// Compute the total cost for the entire block-panel multiply.
+	const dim_t total_uops = uops_per_col * n_iter;
+
+	// Compute the number of microtile ops to allocate per thread as well as the
+	// number of leftover microtile ops.
+	const dim_t n_uops_per_thr = total_uops / nt;
+	const dim_t n_uops_pt_left = total_uops % nt;
+
+	PGUARD printf( "---------------------------\n" );
+	PGUARD printf( "total_uops:         %7ld\n", total_uops );
+	PGUARD printf( "n_uops_per_thr:     %7ld\n", n_uops_per_thr );
+	PGUARD printf( "n_uops_pt_left:     %7ld\n", n_uops_pt_left );
+
+	//
+	// -- Step 3: Compute the starting j/i utile offset for a given tid --------
+	//
+
+	PGUARD printf( "---------------------------\n" );
+	PGUARD printf( "total_utiles:       %7ld\n", m_iter * n_iter );
+	PGUARD printf( "---------------------------\n" );
+
+	dim_t j_st_cur = 0; dim_t j_en_cur = 0;
+	dim_t i_st_cur = 0; dim_t i_en_cur = 0;
+
+	PGUARD printf( "          tid %ld will start at j,i: %ld %ld\n",
+	               ( dim_t )0, j_st_cur, i_st_cur );
+
+	// Find the utile update that pushes uops_tba to 0 or less.
+#ifdef PRINT_MODE
+	for ( dim_t tid_i = 0; tid_i < nt; ++tid_i )
+#else
+	for ( dim_t tid_i = 0; tid_i < nt - 1; ++tid_i )
+#endif
+	{
+		const dim_t uops_ta     = n_uops_per_thr + ( tid_i < n_uops_pt_left ? 1 : 0 );
+		      dim_t uops_tba    = uops_ta;
+		      dim_t j           = j_st_cur;
+		      dim_t n_ut_for_me = 0;
+		      bool  done_e      = FALSE;
+
+		PGUARD printf( "tid_i: %ld  n_uops to alloc: %3ld \n", tid_i, uops_tba );
+
+		// This code begins allocating uops when the starting point is somewhere
+		// after the first microtile. Typically this will not be enough to
+		// allocate all uops, except for small matrices (and/or high numbers of
+		// threads), in which case the code signals an early finish (via done_e).
+		if ( 0 < i_st_cur )
+		{
+			dim_t i;
+
+			//PGUARD printf( "tid_i: %ld  uops left to alloc: %2ld \n", tid_i, uops_tba );
+
+			for ( i = i_st_cur; i < m_iter; ++i )
+			{
+				n_ut_for_me += 1;
+
+				const dim_t uops_tba_new
+				= uops_tba -
+				  bli_tlb_trmm_lx_k_iter( diagoff_iter, uplo, k_iter, i );
+
+				uops_tba = uops_tba_new;
+
+				PGUARD printf( "tid_i: %ld  i: %2ld  (1 n_ut_cur: %ld) (uops_alloc: %ld)\n",
+				               tid_i, i, n_ut_for_me, uops_ta - uops_tba );
+
+				if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i; done_e = TRUE;
+				                           break; }
+			}
+
+			if ( i == m_iter ) j += 1;
+		}
+
+		// This code advances over as many columns of utiles as possible and then
+		// walks down to the correct utile within the subsequent column. However,
+		// it gets skipped entirely if the previous code block was able to
+		// allocate all of the current tid's uops.
+		if ( !done_e )
+		{
+			const dim_t j_inc0  = uops_tba / uops_per_col;
+			const dim_t j_left0 = uops_tba % uops_per_col;
+
+			// We need to set a hard limit on how much j_inc can be. Namely,
+			// it should not exceed the number of utile columns that are left
+			// in the matrix. We also correctly compute j_left when the initial
+			// computation of j_inc0 above exceeds the revised j_inc, but this
+			// is mostly only so that in these situations the debug statements
+			// report the correct numbers.
+			const dim_t j_inc  = bli_min( j_inc0, n_iter - j );
+			const dim_t delta  = j_inc0 - j_inc;
+			const dim_t j_left = j_left0 + delta * uops_per_col;
+
+			// Increment j by the number of full utile columns we allocate, and
+			// set the remaining utile ops to be allocated to the remainder.
+			j       += j_inc;
+			uops_tba = j_left;
+
+			n_ut_for_me += j_inc * m_iter;
+
+			PGUARD printf( "tid_i: %ld  advanced to col: %2ld  (uops traversed: %ld)\n",
+			               tid_i, j, uops_per_col * j_inc );
+			PGUARD printf( "tid_i: %ld  j: %2ld  (  n_ut_cur: %ld) (uops_alloc: %ld)\n",
+			               tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+			PGUARD printf( "tid_i: %ld  uops left to alloc: %2ld \n", tid_i, j_left );
+
+			if ( uops_tba == 0 )
+			{
+				// If advancing j_inc columns allocated all of our uops, then
+				// designate the last iteration of the previous column as the
+				// end point.
+				j_en_cur = j - 1;
+				i_en_cur = m_iter - 1;
+			}
+			else if ( j >  n_iter ) bli_abort(); // safety check.
+			else if ( j == n_iter )
+			{
+				// If we still have at least some uops to allocate, and advancing
+				// j_inc columns landed us at the beginning of the first non-
+				// existent column (column n_iter), then we're done. (The fact
+				// that we didn't get to allocate all of our uops just means that
+				// the lower tids slightly overshot their allocations, leaving
+				// fewer uops for the last thread.)
+			}
+			else // if ( 0 < uops_tba && j < n_iter )
+			{
+				// If we have at least some uops to allocate, and we still have
+				// at least some columns to process, then we search for the
+				// utile that will put us over the top.
+
+				for ( dim_t i = 0; i < m_iter; ++i )
+				{
+					n_ut_for_me += 1;
+
+					const dim_t uops_tba_new
+					= uops_tba -
+					  bli_tlb_trmm_lx_k_iter( diagoff_iter, uplo, k_iter, i );
+
+					uops_tba = uops_tba_new;
+
+					PGUARD printf( "tid_i: %ld  i: %2ld  (4 n_ut_cur: %ld) (uops_alloc: %ld)\n",
+					               tid_i, i, n_ut_for_me, uops_ta - uops_tba );
+
+					if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i;
+					                           break; }
+				}
+			}
+		}
+
+
+		PGUARD printf( "tid_i: %ld         (5 n_ut_cur: %ld) (overshoot: %ld out of %ld)\n",
+		                tid_i, n_ut_for_me, -uops_tba, uops_ta );
+
+		if ( tid_i == tid )
+		{
+			*j_st_p = j_st_cur;
+			*i_st_p = i_st_cur;
+			return n_ut_for_me;
+		}
+
+		// Use the current tid's ending i,j values to determine the starting i,j
+		// values for the next tid.
+		j_st_cur = j_en_cur;
+		i_st_cur = i_en_cur + 1;
+		if ( i_st_cur == m_iter ) { j_st_cur += 1; i_st_cur = 0; }
+
+		PGUARD printf( "tid_i: %ld         (6 n_ut_cur: %ld)\n",
+		               tid_i, n_ut_for_me );
+		PGUARD printf( "tid_i: %ld  tid %ld will start at j,i: %ld %ld\n",
+		               tid_i, tid_i + 1, j_st_cur, i_st_cur );
+		PGUARD printf( "---------------------------\n" );
+	}
+
+#ifndef PRINT_MODE
+
+	//
+	// -- Step 4: Handle the last thread's allocation --------------------------
+	//
+
+	// An optimization: The above loop runs to nt - 1 rather than nt since it's
+	// easy to count the number of utiles allocated to the last thread.
+	const dim_t n_ut_for_me = m_iter - i_st_cur +
+	                          (n_iter - j_st_cur - 1) * m_iter;
+	*j_st_p = j_st_cur;
+	*i_st_p = i_st_cur;
+
+	PGUARD printf( "tid_i: %ld         (7 n_ut_for_me: %ld) (j,i_st: %ld %ld)\n",
+	               tid, n_ut_for_me, j_st_cur, i_st_cur );
+
+	return n_ut_for_me;
+#else
+	// This line should never execute, but we need it to satisfy the compiler.
+	return -1;
+#endif
+}
+
+// -----------------------------------------------------------------------------
+
+#if 0 // Compiled out; the two branches below mirror bli_thread_range_tlb_trmm_rl() and _ru().
+dim_t bli_thread_range_tlb_trmm_r
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const uplo_t uplo,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  k_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     )
+{
+	dim_t n_ut_for_me; // value returned by the impl call in whichever branch runs
+
+	if ( bli_is_lower( uplo ) )
+	{
+		inc_t j_en_l, i_en_l; // written by the impl; not read in this branch
+
+		n_ut_for_me = bli_thread_range_tlb_trmm_rl_impl
+		(
+		  nt, tid, diagoff, m_iter, n_iter, k_iter, mr, nr,
+		  j_st_p, i_st_p, &j_en_l, &i_en_l
+		);
+	}
+	else // if ( bli_is_upper( uplo ) )
+	{
+		inc_t j_st_l, i_st_l; // written by the impl; not read afterward
+		inc_t j_en_l, i_en_l; // reversed below to become the caller's start offsets
+
+		// Reverse the effective tid and use the diagonal offset as if the m and
+		// n dimension were reversed (similar to a 180 degree rotation). This
+		// transforms the problem into one of allocating ranges for a lower-
+		// triangular matrix, for which we already have a special routine.
+		const dim_t  tid_rev     = nt - tid - 1;
+		const doff_t diagoff_rev = nr*n_iter - ( nr*k_iter + diagoff );
+
+		n_ut_for_me = bli_thread_range_tlb_trmm_rl_impl
+		(
+		  nt, tid_rev, diagoff_rev, m_iter, n_iter, k_iter, mr, nr,
+		  &j_st_l, &i_st_l, &j_en_l, &i_en_l
+		);
+
+		// The ending j and i offsets will serve as our starting offsets
+		// returned to the caller, but first we have to reverse the offsets so
+		// that their semantics are once again relative to an upper-triangular
+		// matrix.
+		j_en_l = n_iter - j_en_l - 1;
+		i_en_l = m_iter - i_en_l - 1;
+
+		*j_st_p = j_en_l;
+		*i_st_p = i_en_l;
+	}
+
+	return n_ut_for_me;
+}
+#endif
+
+dim_t bli_thread_range_tlb_trmm_rl
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  k_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     )
+{
+	inc_t j_en_unused, i_en_unused; // ending offsets: filled in, then dropped
+
+	// Lower-triangular ranges need no remapping; defer to the shared impl.
+	const dim_t n_ut_for_me = bli_thread_range_tlb_trmm_rl_impl(
+	    nt, tid, diagoff, m_iter, n_iter, k_iter, mr, nr,
+	    j_st_p, i_st_p, &j_en_unused, &i_en_unused );
+	return n_ut_for_me;
+}
+
+dim_t bli_thread_range_tlb_trmm_ru
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  k_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     )
+{
+	// Scratch outputs for the lower-triangular routine. It writes all four,
+	// but only the ending pair is read back here.
+	inc_t j_st_lo, i_st_lo, j_en_lo, i_en_lo;
+
+	// An upper-triangular problem is handled as a lower-triangular one that
+	// has been turned around by 180 degrees, i.e. with the m and n dimensions
+	// reversed. Under that view the threads are numbered from the opposite
+	// end, and the diagonal offset is re-expressed accordingly.
+	const dim_t  flipped_tid     = nt - tid - 1;
+	const doff_t flipped_diagoff = nr*n_iter - ( nr*k_iter + diagoff );
+
+	// Let the lower-triangular routine partition the flipped problem. Its
+	// return value is the number of utiles assigned to the flipped tid.
+	const dim_t n_ut_for_me = bli_thread_range_tlb_trmm_rl_impl(
+	    nt, flipped_tid, flipped_diagoff, m_iter, n_iter, k_iter, mr, nr,
+	    &j_st_lo, &i_st_lo, &j_en_lo, &i_en_lo );
+
+	// Where the flipped range ends is where the caller's range begins. Flip
+	// the ending offsets back so that they are once again expressed relative
+	// to the upper-triangular matrix before handing them to the caller.
+	const inc_t j_st_up = n_iter - j_en_lo - 1;
+	const inc_t i_st_up = m_iter - i_en_lo - 1;
+
+	*j_st_p = j_st_up;
+	*i_st_p = i_st_up;
+
+	return n_ut_for_me;
+}
+
+dim_t bli_thread_range_tlb_trmm_rl_impl // returns the number of utiles assigned to tid
+     (
+       const dim_t  nt,      // number of threads the utiles are divided among
+       const dim_t  tid,     // id of the thread whose range is reported
+       const doff_t diagoff, // diagonal offset; a negative value aborts
+       const dim_t  m_iter,  // number of utiles in each utile column
+       const dim_t  n_iter,  // number of utile columns
+       const dim_t  k_iter,  // uops per utile within the rectangular region
+       const dim_t  mr,      // not referenced in this function body
+       const dim_t  nr,      // divisor that converts diagoff to utile-column units
+             inc_t* j_st_p,  // out: utile column at which tid's range starts
+             inc_t* i_st_p,  // out: utile row at which tid's range starts
+             inc_t* j_en_p,  // out: utile column of the last utile in tid's range
+             inc_t* i_en_p   // out: utile row of the last utile in tid's range
+     )
+{
+	// Assumption: 0 <= diagoff. Make sure to prune leading rows beforehand!
+	if ( diagoff < 0 ) bli_abort();
+
+	// Single-threaded cases are simple and allow early returns.
+	if ( nt == 1 )
+	{
+		const dim_t n_ut_for_me = m_iter * n_iter;
+
+		*j_st_p = 0;
+		*i_st_p = 0;
+		*j_en_p = n_iter - 1;
+		*i_en_p = m_iter - 1;
+
+		return n_ut_for_me;
+	}
+
+	//
+	// -- Step 1: Compute the computational volume of the region ---------------
+	//
+
+	// Normalize the diagonal offset by nr so that it represents the offset in
+	// units of nr x nr chunks.
+	const doff_t diagoff_iter = diagoff / nr;
+
+	// For the purposes of many computations in this function, we aren't
+	// interested in the extent to which diagoff exceeds n (if it does).
+	// So we use a new variable that is guaranteed to be no greater than n.
+	const doff_t diagoffmin_iter = bli_min( diagoff_iter, n_iter );
+
+	const dim_t k_rect = k_iter;
+	const dim_t n_rect = diagoffmin_iter;
+
+	const dim_t gross_area   = k_rect * n_iter;
+	const dim_t rect_area    = k_rect * n_rect;
+	const dim_t nonrect_area = gross_area - rect_area;
+
+	const dim_t offn_nonrect    = n_rect;
+	const dim_t diagoff_nonrect = 0;
+
+	const dim_t n_nonrect       = n_iter - n_rect;
+
+	const dim_t offn_ut_nonrect = diagoffmin_iter;
+
+	PGUARD printf( "---------------------------\n" );
+	PGUARD printf( "m_iter:             %7ld\n", m_iter );
+	PGUARD printf( "k_iter:             %7ld\n", k_iter );
+	PGUARD printf( "n_iter:             %7ld\n", n_iter );
+	PGUARD printf( "min(diagoff_it,n):  %7ld\n", diagoffmin_iter );
+	PGUARD printf( "offn_ut_nonrect:    %7ld\n", offn_ut_nonrect );
+	PGUARD printf( "offn_nonrect:       %7ld\n", offn_nonrect );
+	PGUARD printf( "diagoff_nonrect:    %7ld\n", diagoff_nonrect );
+	PGUARD printf( "n_nonrect:          %7ld\n", n_nonrect );
+	PGUARD printf( "---------------------------\n" );
+
+	const dim_t num_unref_ut0 = n_nonrect * ( n_nonrect - 1 ) / 2;
+	const dim_t num_unref_ut  = bli_max( 0, num_unref_ut0 );
+
+	const dim_t tri_unref_area = num_unref_ut;
+	const dim_t tri_ref_area   = nonrect_area - tri_unref_area;
+	const dim_t total_ref_area = rect_area + tri_ref_area;
+	const dim_t rect_vol       = rect_area * m_iter;
+	const dim_t tri_ref_vol    = tri_ref_area * m_iter;
+	const dim_t total_vol      = total_ref_area * m_iter;
+
+	PGUARD printf( "gross_area:         %7ld\n", gross_area );
+	PGUARD printf( "nonrect_area:       %7ld\n", nonrect_area );
+	PGUARD printf( "tri_unref_area:     %7ld\n", tri_unref_area );
+	PGUARD printf( "rect_area:          %7ld\n", rect_area );
+	PGUARD printf( "tri_ref_area:       %7ld\n", tri_ref_area );
+	PGUARD printf( "total_ref_area:     %7ld\n", total_ref_area );
+	PGUARD printf( "---------------------------\n" );
+	PGUARD printf( "rect_vol (uops):    %7ld\n", rect_vol );
+	PGUARD printf( "tri_ref_vol (uops): %7ld\n", tri_ref_vol );
+	PGUARD printf( "total_vol (uops):   %7ld\n", total_vol );
+	PGUARD printf( "---------------------------\n" );
+
+	//
+	// -- Step 2: Compute key flop counts (per thread, per column, etc.) -------
+	//
+
+	//const dim_t rect_uops    = rect_vol;
+	//const dim_t tri_ref_uops = tri_ref_vol;
+	const dim_t total_uops   = total_vol;
+
+	// Compute the number of microtile ops to allocate per thread as well as the
+	// number of leftover microtile ops.
+	const dim_t n_uops_per_thr = total_uops / nt;
+	const dim_t n_uops_pt_left = total_uops % nt;
+
+	PGUARD printf( "n_threads:          %7ld\n", nt );
+	PGUARD printf( "n_uops_per_thr:     %7ld\n", n_uops_per_thr );
+	PGUARD printf( "n_uops_pt_left:     %7ld\n", n_uops_pt_left );
+	PGUARD printf( "---------------------------\n" );
+
+	const dim_t uops_per_col_rect = m_iter * k_iter;
+
+	PGUARD printf( "uops_per_col_rect:  %7ld\n", uops_per_col_rect );
+
+	// Allocate one of the leftover uops to the current thread if its tid is
+	// one of the lower thread ids.
+	//const dim_t n_uops_for_me = n_uops_per_thr + ( tid < n_uops_pt_left ? 1 : 0 );
+
+	//PGUARD printf( "n_uops_for_me:      %7ld (%ld+%ld)\n",
+	//               n_uops_for_me, n_uops_per_thr, n_uops_for_me - n_uops_per_thr );
+
+	//
+	// -- Step 3: Compute the starting j/i utile offset for a given tid --------
+	//
+
+	PGUARD printf( "---------------------------\n" );
+	PGUARD printf( "total_utiles:       %7ld\n", m_iter * n_iter );
+	PGUARD printf( "---------------------------\n" );
+
+	dim_t j_st_cur = 0; dim_t j_en_cur = 0;
+	dim_t i_st_cur = 0; dim_t i_en_cur = 0;
+
+	// Find the utile update that pushes uops_tba to 0 or less.
+#ifdef PRINT_MODE
+	for ( dim_t tid_i = 0; tid_i < nt; ++tid_i )
+#else
+	for ( dim_t tid_i = 0; tid_i < nt - 1; ++tid_i )
+#endif
+	{
+		const dim_t uops_ta     = n_uops_per_thr + ( tid_i < n_uops_pt_left ? 1 : 0 );
+		      dim_t uops_tba    = uops_ta;
+		      dim_t j           = j_st_cur;
+		      dim_t n_ut_for_me = 0;
+		      bool  done_e      = FALSE;
+		      bool  search_tri  = FALSE;
+
+		PGUARD printf( "tid_i: %ld  n_uops_ta:    %3ld \n", tid_i, uops_tba );
+		PGUARD printf( "tid_i: %ld  j: %2ld  (  n_ut_cur: %ld) (uops_alloc: %ld)\n",
+		                tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+
+		// This code begins allocating uops when the starting point is somewhere
+		// after the first microtile. Typically this will not be enough to
+		// allocate all uops, except for situations where the number of threads
+		// is high relative to the number of utile columns, in which case the
+		// code signals an early finish (via done_e).
+		if ( 0 < i_st_cur )
+		{
+			// Compute the number of uops needed to update each utile in the current
+			// column. NOTE(review): used as a divisor below; presumably never 0 -- confirm.
+			const dim_t k_iter_j = bli_tlb_trmm_rl_k_iter( diagoff_iter, k_iter, j );
+
+			dim_t i;
+
+			#if 0
+
+			// Starting from i_st_cur within the current utile column, allocate
+			// utiles until (a) we run out of utiles in the column (which is typically
+			// what happens), or (b) we finish allocating all uops for the current
+			// thread (uops_tba drops to zero or less).
+			for ( i = i_st_cur; i < m_iter; ++i )
+			{
+				n_ut_for_me += 1;
+
+				const dim_t uops_tba_new = uops_tba - k_iter_j;
+
+				uops_tba = uops_tba_new;
+
+				PGUARD printf( "tid_i: %ld  i: %2ld  (0 n_ut_cur: %ld) (uops_alloc: %ld) (k_iter_j: %ld)\n",
+				                tid_i, i, n_ut_for_me, uops_ta - uops_tba, k_iter_j );
+
+				if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i; done_e = TRUE;
+				                           break; }
+			}
+
+			// If we traversed the entire column (regardless of whether we finished
+			// allocating utiles for the current thread), increment j to the next
+			// column, which is where we'll continue our search for the current tid
+			// (or start our search for the next tid if we finished allocating utiles).
+			// Additionally, if we finished traversing all utile columns, mark the
+			// last utile of the last column as the end point, and set the "done early"
+			// flag.
+			if ( i == m_iter )
+			{
+				j += 1;
+				if ( j == n_iter ) { j_en_cur = j - 1; i_en_cur = m_iter - 1; done_e = TRUE; }
+			}
+
+			#else
+
+			// Compute the number of utiles left to allocate (rounded up) under the
+			// (probably false) assumption that all utiles incur the same uop cost (k_iter_j)
+			// to update. Also compute the number of utiles that remain in the current column.
+			const dim_t n_ut_tba_j = uops_tba / k_iter_j + ( uops_tba % k_iter_j ? 1 : 0 );
+			const dim_t n_ut_rem_j = m_iter - i_st_cur;
+
+			// Compare the aforementioned values. If n_ut_tba_j is less than or equal to
+			// the number of remaining utiles in the column, we can finish allocating
+			// without moving to the next column. But if n_ut_tba_j exceeds n_ut_rem_j,
+			// then we aren't done yet, so allocate what we can and move on.
+			if ( n_ut_tba_j <= n_ut_rem_j )
+			{
+				n_ut_for_me += n_ut_tba_j;
+				uops_tba    -= n_ut_tba_j * k_iter_j;
+				i            = i_st_cur + n_ut_tba_j;
+
+				j_en_cur = j; i_en_cur = i - 1; done_e = TRUE;
+			}
+			else // if ( n_ut_rem_j < n_ut_tba_j )
+			{
+				n_ut_for_me += n_ut_rem_j;
+				uops_tba    -= n_ut_rem_j * k_iter_j;
+				i            = i_st_cur + n_ut_rem_j;
+			}
+
+			PGUARD printf( "tid_i: %ld  i: %2ld  (* n_ut_cur: %ld) (uops_alloc: %ld)\n",
+						   tid_i, i-1, n_ut_for_me, uops_ta - uops_tba );
+
+			// If we allocated all utiles in the column (regardless of whether we finished
+			// allocating utiles for the current thread), increment j to the next column,
+			// which is where we'll continue our search for the current tid's end point
+			// (or start our search through the next tid's range if we finished allocating
+			// the current tid's utiles). Additionally, if we allocated utiles from the
+			// last column, mark the tid's end point and set the "done early" flag.
+			if ( i == m_iter )
+			{
+				j += 1; i = 0;
+				if ( j == n_iter ) { j_en_cur = j - 1; i_en_cur = m_iter - 1; done_e = TRUE; }
+
+				PGUARD printf( "tid_i: %ld  j: %2ld  (! n_ut_cur: %ld) (uops_alloc: %ld)\n",
+							   tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+			}
+
+			#endif
+		}
+
+		// This code advances over as many columns of utiles as possible, within
+		// the rectangular region (i.e., pre-diagonal), and then walks down to
+		// the correct utile within the subsequent column. However, note that
+		// this code gets skipped entirely if the previous code block was able
+		// to allocate all of the current tid's uops.
+		if ( !done_e )
+		{
+			// If j is positioned somewhere within the rectangular region, we can
+			// skip over as many utile columns as possible with some integer math.
+			// And depending on how many uops we were able to allocate relative to
+			// the number of columns that exist, we may need to walk through the
+			// triangular region as well. But if j is already in the triangular
+			// region, we set a flag so that we execute the code that will walk
+			// through those columns.
+			if ( j < diagoff_iter )
+			{
+				const dim_t j_inc0  = uops_tba / uops_per_col_rect;
+				const dim_t j_left0 = uops_tba % uops_per_col_rect;
+
+				// We need to set a hard limit on how much j_inc can be. Namely,
+				// it should not exceed the number of utile columns that are left
+				// in the rectangular region of the matrix, nor should it exceed
+				// the total number of utile columns that are left.
+				const dim_t j_inc1 = bli_min( j_inc0, diagoff_iter - j );
+				const dim_t j_inc  = bli_min( j_inc1, n_iter - j );
+				const dim_t delta  = j_inc0 - j_inc;
+				const dim_t j_left = j_left0 + delta * uops_per_col_rect;
+
+				// Increment j by the number of full utile columns we allocate, and
+				// set the remaining utile ops to be allocated to the remainder.
+				j       += j_inc;
+				uops_tba = j_left;
+
+				n_ut_for_me += j_inc * m_iter;
+
+				PGUARD printf( "tid_i: %ld  advanced to col: %2ld  (uops traversed: %ld)\n",
+				               tid_i, j, uops_per_col_rect * j_inc );
+				PGUARD printf( "tid_i: %ld  j: %2ld  (1 n_ut_cur: %ld) (uops_alloc: %ld)\n",
+				                tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+				PGUARD printf( "tid_i: %ld  uops left to alloc: %2ld \n", tid_i, j_left );
+
+				if ( uops_tba == 0 )
+				{
+					// If advancing j_inc columns allocated all of our uops, then
+					// designate the last iteration of the previous column as the
+					// end point.
+					j_en_cur = j - 1;
+					i_en_cur = m_iter - 1;
+					search_tri = FALSE;
+
+					PGUARD printf( "tid_i: %ld  j: %2ld  (2 n_ut_cur: %ld) (uops_alloc: %ld)\n",
+					               tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+				}
+				else if ( j >  n_iter ) bli_abort(); // Safety check; should never execute.
+				else if ( j == n_iter )
+				{
+					// If we still have at least some uops to allocate, and advancing
+					// j_inc columns landed us at the beginning of the first non-
+					// existent column (column n_iter), then we're done. (The fact
+					// that we didn't get to allocate all of our uops just means that
+					// the lower tids slightly overshot their allocations, leaving
+					// fewer uops for the last thread.)
+					search_tri = FALSE;
+					PGUARD printf( "tid_i: %ld  j: %2ld  (3 n_ut_cur: %ld) (uops_alloc: %ld)\n",
+					               tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+				}
+				else if ( j < diagoff_iter )
+				{
+					// If we still have at least some uops to allocate, and advancing
+					// j_inc columns landed us at the beginning of a column that is
+					// still in the rectangular region, then we don't need to enter
+					// the triangular region (if it even exists). The code below will
+					// walk down the current column and find the utile that puts us
+					// over the top.
+					search_tri = FALSE;
+					PGUARD printf( "tid_i: %ld  j: %2ld  (4 n_ut_cur: %ld) (uops_alloc: %ld)\n",
+					               tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+				}
+				else // if ( 0 < uops_tba && j == diagoff_iter && j < n_iter )
+				{
+					// If we have at least some uops to allocate, and we still have
+					// at least some columns to process, then we set a flag to
+					// indicate that we still need to step through the triangular
+					// region.
+					search_tri = TRUE;
+					PGUARD printf( "tid_i: %ld  j: %2ld  (5 n_ut_cur: %ld) (uops_alloc: %ld)\n",
+					               tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+				}
+			}
+			else /* if ( diagoff_iter <= j ) */
+			{
+				PGUARD printf( "tid_i: %ld  j: %2ld >= diagoff_iter: %ld\n",
+				               tid_i, j, diagoff_iter );
+				search_tri = TRUE;
+			}
+
+			PGUARD printf( "tid_i: %ld  j: %2ld  search_tri: %d\n", tid_i, j, search_tri );
+
+			if ( search_tri )
+			{
+				// If we still have some uops to allocate in the triangular region,
+				// we first allocate as many full utile columns as possible without
+				// exceeding the number of uops left to be allocated.
+				for ( ; j < n_iter; ++j )
+				{
+					const dim_t k_iter_j = bli_tlb_trmm_rl_k_iter( diagoff_iter, k_iter, j );
+					const dim_t n_uops_j = k_iter_j * m_iter;
+
+					PGUARD printf( "tid_i: %ld  j: %2ld  (6 n_ut_cur: %ld) (uops_alloc: %ld) (n_uops_j: %ld)\n",
+					               tid_i, j, n_ut_for_me, uops_ta - uops_tba, n_uops_j );
+
+					if ( uops_tba == 0 )
+					{
+						PGUARD printf( "tid_i: %ld  j: %2ld  (7 n_ut_cur: %ld) (uops_alloc: %ld)\n",
+						               tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+						// If advancing over the previous column allocated all of
+						// our uops, then designate the last iteration of the
+						// previous column as the end point.
+						j_en_cur = j - 1;
+						i_en_cur = m_iter - 1;
+						break;
+					}
+					if ( n_uops_j <= uops_tba )
+					{
+						// If advancing over the current column doesn't exceed the
+						// number of uops left to allocate, then allocate them. (If
+						// n_uops_j == uops_tba, then we'll be done shortly after
+						// incrementing j.)
+						n_ut_for_me += m_iter;
+						uops_tba -= n_uops_j;
+
+						PGUARD printf( "tid_i: %ld  j: %2ld  (8 n_ut_cur: %ld) (uops_alloc: %ld)\n",
+						               tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+					}
+					else // if ( uops_tba < n_uops_j )
+					{
+						PGUARD printf( "tid_i: %ld  j: %2ld  (9 n_ut_cur: %ld) (uops_alloc: %ld)\n",
+						               tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+						// If we can finish allocating all the remaining uops
+						// with the utiles in the current column, then we break
+						// out of the loop without updating j, n_ut_for_me, or
+						// uops_tba. The remaining uops will be allocated in
+						// the loop over m_iter below.
+						break;
+					}
+				}
+			}
+
+			// If there are any uops left to allocate, and we haven't already
+			// exhausted all allocatable utiles, it means that we have to walk down
+			// the current column and find the utile that puts us over the top.
+			if ( 0 < uops_tba && j < n_iter )
+			{
+				const dim_t k_iter_j = bli_tlb_trmm_rl_k_iter( diagoff_iter, k_iter, j );
+
+				PGUARD printf( "tid_i: %ld  j: %2ld  (A n_ut_cur: %ld) (uops_alloc: %ld) (k_iter_j: %ld)\n",
+				               tid_i, j, n_ut_for_me, uops_ta - uops_tba, k_iter_j );
+
+				#if 0
+
+				dim_t i;
+				for ( i = 0; i < m_iter; ++i )
+				{
+					n_ut_for_me += 1;
+					const dim_t uops_tba_new = uops_tba - k_iter_j;
+					uops_tba = uops_tba_new;
+					PGUARD printf( "tid_i: %ld  i: %2ld  (B n_ut_cur: %ld) (uops_alloc: %ld)\n",
+					               tid_i, i, n_ut_for_me, uops_ta - uops_tba );
+					if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i; break; }
+				}
+
+				if ( i == m_iter )
+				{
+					j += 1;
+					if ( j == n_iter ) { j_en_cur = j - 1; i_en_cur = m_iter - 1; }
+				}
+
+				#else
+
+				const dim_t n_ut_j = uops_tba / k_iter_j + ( uops_tba % k_iter_j ? 1 : 0 );
+				const dim_t i      = n_ut_j - 1;
+
+				uops_tba    -= n_ut_j * k_iter_j;
+				n_ut_for_me += n_ut_j;
+
+				j_en_cur = j; i_en_cur = i;
+
+				PGUARD printf( "tid_i: %ld  i: %2ld  (b n_ut_cur: %ld) (uops_alloc: %ld)\n",
+				               tid_i, i, n_ut_for_me, uops_ta - uops_tba );
+
+				#endif
+			}
+			else // if ( uops_tba <= 0 || j == n_iter )
+			{
+				j_en_cur = j - 1;
+				i_en_cur = m_iter - 1;
+			}
+		}
+
+		PGUARD printf( "tid_i: %ld  done!  (C n_ut_cur: %ld) (overshoot: %ld out of %ld)\n",
+					   tid_i, n_ut_for_me, -uops_tba, uops_ta );
+
+		if ( tid_i == tid )
+		{
+			*j_st_p = j_st_cur;
+			*i_st_p = i_st_cur;
+			*j_en_p = j_en_cur;
+			*i_en_p = i_en_cur;
+			return n_ut_for_me;
+		}
+
+		// Use the current tid's ending i,j values to determine the starting i,j
+		// values for the next tid.
+		j_st_cur = j_en_cur;
+		i_st_cur = i_en_cur + 1;
+		if ( i_st_cur == m_iter ) { j_st_cur += 1; i_st_cur = 0; }
+
+		PGUARD printf( "tid_i: %ld         (D n_ut_cur: %ld)\n",
+					   tid_i, n_ut_for_me );
+		PGUARD printf( "tid_i: %ld  tid %ld will start at j,i: %ld %ld\n",
+					   tid_i, tid_i + 1, j_st_cur, i_st_cur );
+		PGUARD printf( "---------------------------\n" );
+	}
+
+#ifndef PRINT_MODE
+
+	//
+	// -- Step 4: Handle the last thread's allocation --------------------------
+	//
+
+	// An optimization: The above loop runs to nt - 1 rather than nt since it's
+	// easy to count the number of utiles allocated to the last thread.
+	const dim_t n_ut_for_me = m_iter - i_st_cur +
+	                          (n_iter - j_st_cur - 1) * m_iter;
+	*j_st_p = j_st_cur;
+	*i_st_p = i_st_cur;
+	*j_en_p = n_iter - 1;
+	*i_en_p = m_iter - 1;
+
+	PGUARD printf( "tid_i: %ld         (E n_ut_for_me: %ld) (j,i_st: %ld %ld)\n",
+	               tid, n_ut_for_me, j_st_cur, i_st_cur );
+
+	return n_ut_for_me;
+#else
+	// This line should never execute, but we need it to satisfy the compiler.
+	return -1;
+#endif
+}
+
diff --git a/frame/thread/bli_thread_range_tlb.h b/frame/thread/bli_thread_range_tlb.h
new file mode 100644
index 000000000..b344f09ef
--- /dev/null
+++ b/frame/thread/bli_thread_range_tlb.h
@@ -0,0 +1,192 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_THREAD_RANGE_TLB_H
+#define BLIS_THREAD_RANGE_TLB_H
+
+#if 0
+dim_t bli_thread_range_tlb
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const uplo_t uplo,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     );
+#endif
+dim_t bli_thread_range_tlb_l
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     );
+dim_t bli_thread_range_tlb_u
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     );
+dim_t bli_thread_range_tlb_d
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     );
+
+// ---
+
+dim_t bli_thread_range_tlb_trmm_ll
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  k_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     );
+dim_t bli_thread_range_tlb_trmm_lu
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  k_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     );
+dim_t bli_thread_range_tlb_trmm_lx_impl
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const uplo_t uplo,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  k_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     );
+#if 0
+dim_t bli_thread_range_tlb_trmm_r
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const uplo_t uplo,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  k_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     );
+#endif
+
+// ---
+
+dim_t bli_thread_range_tlb_trmm_rl
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  k_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     );
+dim_t bli_thread_range_tlb_trmm_ru
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  k_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p
+     );
+dim_t bli_thread_range_tlb_trmm_rl_impl
+     (
+       const dim_t  nt,
+       const dim_t  tid,
+       const doff_t diagoff,
+       const dim_t  m_iter,
+       const dim_t  n_iter,
+       const dim_t  k_iter,
+       const dim_t  mr,
+       const dim_t  nr,
+             inc_t* j_st_p,
+             inc_t* i_st_p,
+             inc_t* j_en_p,
+             inc_t* i_en_p
+     );
+
+#endif
diff --git a/frame/thread/old/bli_thread_range_snake.c b/frame/thread/old/bli_thread_range_snake.c
new file mode 100644
index 000000000..11a287659
--- /dev/null
+++ b/frame/thread/old/bli_thread_range_snake.c
@@ -0,0 +1,120 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#if 0
+void bli_thread_range_snake_jr
+     (
+       const thrinfo_t* thread,
+             doff_t     diagoff,
+             uplo_t     uplo,
+             dim_t      n,
+             dim_t      bf,
+             bool       handle_edge_low,
+             dim_t*     start,
+             dim_t*     end,
+             dim_t*     inc
+     )
+{
+	// Use snake partitioning of jr loop.
+
+	// NOTE: This function currently assumes that edge cases are handled
+	// "high" and therefore ignores handle_edge_low. This is because the
+	// function is only used by gemmt and friends (herk/her2k/syrk/syr2k).
+	// These operations, unlike trmm/trmm3 and trsm, never require
+	// low-range edge cases.
+
+	const dim_t tid = bli_thrinfo_work_id( thread );
+	const dim_t nt  = bli_thrinfo_n_way( thread );
+
+	const dim_t n_left = n % bf;
+	const dim_t n_iter = n / bf + ( n_left ? 1 : 0 );
+
+	if ( bli_is_lower( uplo ) )
+	{
+		// Use the thrinfo_t work id as the thread's starting index.
+		const dim_t st = tid;
+
+		// This increment will be too big for some threads with only one unit
+		// (NR columns, or an edge case) of work, but that's okay since all that
+		// matters is that st + in >= en, which will cause that thread's jr loop
+		// to not execute beyond the first iteration.
+		const dim_t in = 2 * ( nt - tid ) - 1;
+
+		      dim_t en = st + in + 1;
+
+		// Don't let the thread's end index exceed n_iter.
+		if ( n_iter < en ) en = n_iter;
+
+		*start = st * bf;
+		*end   = en * bf; // - ( bf - n_left );
+		*inc   = in * bf;
+	}
+	else // if ( bli_is_upper( uplo ) )
+	{
+		      dim_t st = n_iter - 2 * nt + tid;
+
+		const dim_t in = 2 * ( nt - tid ) - 1;
+
+		      dim_t en = st + in + 1;
+
+		#if 1
+		// When nt exceeds half n_iter, some threads will only get one unit
+		// (NR columns, or an edge case) of work. This manifests as st being
+		// negative, and thus we need to move their start index to their other
+		// assigned unit in the positive index range.
+		if ( st < 0 ) st += in;
+
+		// If the start index is *still* negative, which happens for some
+		// threads when nt exceeds n_iter, then manually assign this thread
+		// an empty index range.
+		if ( st < 0 ) { st = 0; en = 0; }
+		#else
+		if ( 0 <= st + in ) { st += in; }
+		else                { st = 0; en = 0; }
+		#endif
+
+		#if 0
+		printf( "thread_range_snake_jr():  tid %d: sta end = %3d %3d %3d\n",
+		        (int)tid, (int)(st), (int)(en), (int)(in) );
+		#endif
+
+		*start = st * bf;
+		*end   = en * bf;
+		*inc   = in * bf;
+	}
+}
+#endif
diff --git a/frame/1m/packm/bli_packm_thrinfo.h b/frame/thread/old/bli_thread_range_snake.h
similarity index 70%
rename from frame/1m/packm/bli_packm_thrinfo.h
rename to frame/thread/old/bli_thread_range_snake.h
index 1ac7f88df..73fd4ae73 100644
--- a/frame/1m/packm/bli_packm_thrinfo.h
+++ b/frame/thread/old/bli_thread_range_snake.h
@@ -32,34 +32,22 @@
 
 */
 
-//
-// thrinfo_t macros specific to packm.
-//
-
-/*
-#define bli_packm_thread_my_iter( index, thread ) \
-\
-	( index % thread->n_way == thread->work_id % thread->n_way )
-*/
-
-#define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \
-\
-	( i % n_way == work_id % n_way )
-
-#define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \
-\
-	( start <= i && i < end )
-
-// Define a general-purpose version of bli_packm_my_iter() whose definition
-// depends on whether slab or round-robin partitioning was requested at
-// configure-time.
-#ifdef BLIS_ENABLE_JRIR_SLAB
-
-  #define bli_packm_my_iter bli_packm_my_iter_sl
-
-#else // BLIS_ENABLE_JRIR_RR
-
-  #define bli_packm_my_iter bli_packm_my_iter_rr
-
+#ifndef BLIS_THREAD_RANGE_SNAKE_H
+#define BLIS_THREAD_RANGE_SNAKE_H
+
+#if 0
+void bli_thread_range_snake_jr
+     (
+       const thrinfo_t* thread,
+             doff_t     diagoff,
+             uplo_t     uplo,
+             dim_t      n,
+             dim_t      bf,
+             bool       handle_edge_low,
+             dim_t*     start,
+             dim_t*     end,
+             dim_t*     inc
+     );
 #endif
 
+#endif
diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c
index 02f7458ad..b61140743 100644
--- a/sandbox/gemmlike/bls_gemm_bp_var1.c
+++ b/sandbox/gemmlike/bls_gemm_bp_var1.c
@@ -344,11 +344,11 @@ void PASTECH2(bls_,ch,varname) \
 \
 						/* Compute the addresses of the next micropanels of A and B. */ \
 						a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \
-						if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
+						if ( bli_is_last_iter_slrr( i, ir_end, ir_tid, ir_nt ) ) \
 						{ \
 							a2 = a_ic_use; \
 							b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \
-							if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
+							if ( bli_is_last_iter_slrr( j, jr_end, jr_tid, jr_nt ) ) \
 								b2 = b_pc_use; \
 						} \
 \
diff --git a/sandbox/gemmlike/bls_l3_packm_var1.c b/sandbox/gemmlike/bls_l3_packm_var1.c
index 7c2c4e9a9..b37d34cce 100644
--- a/sandbox/gemmlike/bls_l3_packm_var1.c
+++ b/sandbox/gemmlike/bls_l3_packm_var1.c
@@ -131,10 +131,10 @@ void PASTECH2(bls_,ch,varname) \
 	dim_t it_start, it_end, it_inc; \
 \
 	/* Determine the thread range and increment using the current thread's
-	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
+	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr()
 	   will depend on whether slab or round-robin partitioning was requested
 	   at configure-time. */ \
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
+	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
 \
 	/* Iterate over every logical micropanel in the source matrix. */ \
 	for ( ic  = ic0,    it  = 0; it < n_iter; \
@@ -147,10 +147,10 @@ void PASTECH2(bls_,ch,varname) \
 		ctype* restrict c_use = c_begin; \
 		ctype* restrict p_use = p_begin; \
 \
-		/* The definition of bli_packm_my_iter() will depend on whether slab
+		/* The definition of bli_is_my_iter() will depend on whether slab
 		   or round-robin partitioning was requested at configure-time. (The
 		   default is slab.) */ \
-		if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
+		if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \
 		{ \
 			PASTECH2(bls_,ch,packm_cxk) \
 			( \
diff --git a/sandbox/gemmlike/bls_l3_packm_var2.c b/sandbox/gemmlike/bls_l3_packm_var2.c
index 94ee0efcd..b3efbbc28 100644
--- a/sandbox/gemmlike/bls_l3_packm_var2.c
+++ b/sandbox/gemmlike/bls_l3_packm_var2.c
@@ -131,10 +131,10 @@ void PASTECH2(bls_,ch,varname) \
 	dim_t it_start, it_end, it_inc; \
 \
 	/* Determine the thread range and increment using the current thread's
-	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
+	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr()
 	   will depend on whether slab or round-robin partitioning was requested
 	   at configure-time. */ \
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
+	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
 \
 	/* Iterate over every logical micropanel in the source matrix. */ \
 	for ( ic  = ic0,    it  = 0; it < n_iter; \
@@ -147,10 +147,10 @@ void PASTECH2(bls_,ch,varname) \
 		ctype* restrict c_use = c_begin; \
 		ctype* restrict p_use = p_begin; \
 \
-		/* The definition of bli_packm_my_iter() will depend on whether slab
+		/* The definition of bli_is_my_iter() will depend on whether slab
 		   or round-robin partitioning was requested at configure-time. (The
 		   default is slab.) */ \
-		if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
+		if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \
 		{ \
 			/* NOTE: We assume here that kappa = 1 and therefore ignore it. If
 			   we're wrong, this will get someone's attention. */ \
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index 851102a2f..8656652b3 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -786,7 +786,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	char impl_str[32];
 	char def_impl_set_str[32];
 	char def_impl_unset_str[32];
-	char jrir_str[16];
+	char jrir_str[32];
 
 	const bool    has_openmp      = bli_info_get_enable_openmp();
 	const bool    has_pthreads    = bli_info_get_enable_pthreads();
@@ -821,8 +821,9 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	else                          sprintf( def_impl_set_str, "single" );
 
 	// Describe the status of jrir thread partitioning.
-	if   ( bli_info_get_thread_part_jrir_slab() ) sprintf( jrir_str, "slab" );
-	else /*bli_info_get_thread_part_jrir_rr()*/   sprintf( jrir_str, "round-robin" );
+	if      ( bli_info_get_thread_jrir_slab() ) sprintf( jrir_str, "slab" );
+	else if ( bli_info_get_thread_jrir_rr()   ) sprintf( jrir_str, "round-robin" );
+	else    /*bli_info_get_thread_jrir_tlb()*/  sprintf( jrir_str, "tile-level (slab)" );
 
 	char nt_str[16];
 	char jc_nt_str[16];
diff --git a/testsuite/src/test_trmm.c b/testsuite/src/test_trmm.c
index 0504b3315..497ecf97e 100644
--- a/testsuite/src/test_trmm.c
+++ b/testsuite/src/test_trmm.c
@@ -271,7 +271,10 @@ void libblis_test_trmm_impl
 	switch ( iface )
 	{
 		case BLIS_TEST_SEQ_FRONT_END:
+//bli_printm( "a", a, "%5.2f", "" );
+//bli_printm( "b", b, "%5.2f", "" );
 		bli_trmm( side, alpha, a, b );
+//bli_printm( "b after", b, "%5.2f", "" );
 		break;
 
 		default:

From d220f9c436c0dae409974724d42ab6c52f12a726 Mon Sep 17 00:00:00 2001
From: Nisanth M P <nisanthmp.01@gmail.com>
Date: Wed, 11 Jan 2023 08:43:03 +0530
Subject: [PATCH 118/230] Fix k = 0 edge case in power10 microkernels (#706)

Details:
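- When the power10 sgemm and dgemm microkernels are called with k = 0,
  they become caught in infinite loops and segfault. This is now fixed
  by zero-initializing the accumulators and no longer pulling the first
  iteration out of the k loop (k_iter and k_left are computed from k
  rather than k-1), so that no iterations are executed when k = 0.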
---
 kernels/power10/3/bli_dgemm_power10_mma.c | 29 ++++++++++-------------
 kernels/power10/3/bli_sgemm_power10_mma.c | 29 ++++++++++-------------
 2 files changed, 25 insertions(+), 33 deletions(-)

diff --git a/kernels/power10/3/bli_dgemm_power10_mma.c b/kernels/power10/3/bli_dgemm_power10_mma.c
index abf66f58f..67163b5a7 100644
--- a/kernels/power10/3/bli_dgemm_power10_mma.c
+++ b/kernels/power10/3/bli_dgemm_power10_mma.c
@@ -74,12 +74,10 @@ void bli_dgemm_power10_mma_8x8
         cntx_t*             cntx
     )
 {
-
     // Typecast local copies of integers in case dim_t and inc_t are a
     // different size than is expected by load instructions.
-    // (1 is subtracted from k0 because 1 iteration of the k loop is pulled out)
-    uint64_t k_iter = (k-1) / 4;
-    uint64_t k_left = (k-1) % 4;
+    uint64_t k_iter = k / 4;
+    uint64_t k_left = k % 4;
 
     uint64_t rs_c   = rs_c0;
 
@@ -110,6 +108,16 @@ void bli_dgemm_power10_mma_8x8
     __vector_quad acc0, acc1, acc2, acc3,
                   acc4, acc5, acc6, acc7;
 
+    // initialize the accumulators to zeros
+    __builtin_mma_xxsetaccz(&acc0);
+    __builtin_mma_xxsetaccz(&acc1);
+    __builtin_mma_xxsetaccz(&acc2);
+    __builtin_mma_xxsetaccz(&acc3);
+    __builtin_mma_xxsetaccz(&acc4);
+    __builtin_mma_xxsetaccz(&acc5);
+    __builtin_mma_xxsetaccz(&acc6);
+    __builtin_mma_xxsetaccz(&acc7);
+
     /* 2 vector pairs are necessary for a double precision outer product
        instruction. */
     __vector_pair colA_1,
@@ -141,19 +149,6 @@ void bli_dgemm_power10_mma_8x8
     */
     D_ASSEMBLE_VEC_PAIR
 
-    /* Compute accumulate outer products and override accumulators with result */
-    __builtin_mma_xvf64ger (&acc0, colA_1, rb[0]);
-    __builtin_mma_xvf64ger (&acc1, colA_1, rb[1]);
-    __builtin_mma_xvf64ger (&acc2, colA_1, rb[2]);
-    __builtin_mma_xvf64ger (&acc3, colA_1, rb[3]);
-    __builtin_mma_xvf64ger (&acc4, colA_2, rb[0]);
-    __builtin_mma_xvf64ger (&acc5, colA_2, rb[1]);
-    __builtin_mma_xvf64ger (&acc6, colA_2, rb[2]);
-    __builtin_mma_xvf64ger (&acc7, colA_2, rb[3]);
-
-    /* Move A and B pointers */
-    D_INCREMENT
-
     // k loop (unrolled by 4)
     for (int k = 0; k<k_iter; k++)
     {
diff --git a/kernels/power10/3/bli_sgemm_power10_mma.c b/kernels/power10/3/bli_sgemm_power10_mma.c
index 15895e654..3ccee7cbd 100644
--- a/kernels/power10/3/bli_sgemm_power10_mma.c
+++ b/kernels/power10/3/bli_sgemm_power10_mma.c
@@ -69,9 +69,8 @@ void bli_sgemm_power10_mma_8x16
 {
     // Typecast local copies of integers in case dim_t and inc_t are a
     // different size than is expected by load instructions.
-    // (1 is subtracted from k0 because 1 iteration of the k loop is pulled out)
-    uint64_t k_iter = (k-1) / 4;
-    uint64_t k_left = (k-1) % 4;
+    uint64_t k_iter = k / 4;
+    uint64_t k_left = k % 4;
 
     uint64_t rs_c   = rs_c0;
 
@@ -84,6 +83,16 @@ void bli_sgemm_power10_mma_8x16
     __vector_quad acc0, acc1, acc2, acc3,
                   acc4, acc5, acc6, acc7;
 
+    // initialize the accumulators to zeros
+    __builtin_mma_xxsetaccz(&acc0);
+    __builtin_mma_xxsetaccz(&acc1);
+    __builtin_mma_xxsetaccz(&acc2);
+    __builtin_mma_xxsetaccz(&acc3);
+    __builtin_mma_xxsetaccz(&acc4);
+    __builtin_mma_xxsetaccz(&acc5);
+    __builtin_mma_xxsetaccz(&acc6);
+    __builtin_mma_xxsetaccz(&acc7);
+
     float* restrict A0 = a;
     float* restrict B0 = b;
     float* restrict C0 = c;
@@ -95,18 +104,6 @@ void bli_sgemm_power10_mma_8x16
     vec_t *ca = (vec_t *) A0;
     vec_t *rb = (vec_t *) B0;
 
-    /* Compute accumulate outer products and override accumulators with result */
-    __builtin_mma_xvf32ger (&acc0, ca[0], rb[0]);
-    __builtin_mma_xvf32ger (&acc1, ca[0], rb[1]);
-    __builtin_mma_xvf32ger (&acc2, ca[0], rb[2]);
-    __builtin_mma_xvf32ger (&acc3, ca[0], rb[3]);
-    __builtin_mma_xvf32ger (&acc4, ca[1], rb[0]);
-    __builtin_mma_xvf32ger (&acc5, ca[1], rb[1]);
-    __builtin_mma_xvf32ger (&acc6, ca[1], rb[2]);
-    __builtin_mma_xvf32ger (&acc7, ca[1], rb[3]);
-
-    S_INCREMENT
-
     // k loop (unrolled by 4)
     for (int k = 0; k<k_iter; k++)
     {
@@ -147,4 +144,4 @@ void bli_sgemm_power10_mma_8x16
     }
 
     GEMM_UKR_FLUSH_CT( s );
-}
\ No newline at end of file
+}

From cdb22b8ffa5b31a0c16ac1a7bcecefeb5216f669 Mon Sep 17 00:00:00 2001
From: Nisanth M P <nisanthmp.01@gmail.com>
Date: Wed, 11 Jan 2023 08:50:57 +0530
Subject: [PATCH 119/230] Disable power10 kernels other than sgemm, dgemm.
 (#705)

Details:
- There is a power10 sandbox which uses microkernels for datatypes other
  than float and double (or scomplex/dcomplex). In a regular power10-
  configured build (that is, with the sandbox disabled), there were
  compile errors for some of these other non-sgemm/non-dgemm
  microkernels. This commit protects those kernels with a new cpp macro
  guard (which is defined in sandbox/power10/bli_sandbox.h) that
  prevents that kernel code from being compiled for normal, non-sandbox
  power10 builds.
---
 kernels/power10/3/bli_i16gemm_power10_mma.c  | 5 ++++-
 kernels/power10/3/bli_i16sgemm_power10_mma.c | 5 ++++-
 kernels/power10/3/bli_i4gemm_power10_mma.c   | 3 +++
 kernels/power10/3/bli_i8gemm_power10_mma.c   | 3 +++
 kernels/power10/3/bli_sbgemm_power10_mma.c   | 5 ++++-
 kernels/power10/3/bli_shgemm_power10_mma.c   | 5 ++++-
 sandbox/power10/bli_sandbox.h                | 4 ++++
 7 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/kernels/power10/3/bli_i16gemm_power10_mma.c b/kernels/power10/3/bli_i16gemm_power10_mma.c
index d0c9390f5..cc1cd3d84 100644
--- a/kernels/power10/3/bli_i16gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i16gemm_power10_mma.c
@@ -32,6 +32,8 @@
 
 */
 
+#ifdef BLIS_SANDBOX_POWER10
+
 #include "vector_int_macros.h"
 
 #define I16_ACCUMULATE \
@@ -139,4 +141,5 @@ void bli_i16gemm_power10_mma_8x16
         SAVE_ACC_bz(iv4sf_t, &acc6, rs_c,  8+4*rs_c);
         SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
     }
-}
\ No newline at end of file
+}
+#endif // BLIS_SANDBOX_POWER10
diff --git a/kernels/power10/3/bli_i16sgemm_power10_mma.c b/kernels/power10/3/bli_i16sgemm_power10_mma.c
index 7d84e68e2..26da6cf79 100644
--- a/kernels/power10/3/bli_i16sgemm_power10_mma.c
+++ b/kernels/power10/3/bli_i16sgemm_power10_mma.c
@@ -32,6 +32,8 @@
 
 */
 
+#ifdef BLIS_SANDBOX_POWER10
+
 #include "vector_int_macros.h"
 
 #define I16S_ACCUMULATE \
@@ -139,4 +141,5 @@ void bli_i16sgemm_power10_mma_8x16
         SAVE_ACC_bz(iv4sf_t, &acc6, rs_c,  8+4*rs_c);
         SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
     }
-}
\ No newline at end of file
+}
+#endif // BLIS_SANDBOX_POWER10
diff --git a/kernels/power10/3/bli_i4gemm_power10_mma.c b/kernels/power10/3/bli_i4gemm_power10_mma.c
index 6c78a9f00..a8d25d2da 100644
--- a/kernels/power10/3/bli_i4gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i4gemm_power10_mma.c
@@ -32,6 +32,8 @@
 
 */
 
+#ifdef BLIS_SANDBOX_POWER10
+
 #include "vector_int_macros.h"
 
 #define I4_ACCUMULATE \
@@ -140,3 +142,4 @@ void bli_i4gemm_power10_mma_8x16
         SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
     }
 }
+#endif // BLIS_SANDBOX_POWER10
diff --git a/kernels/power10/3/bli_i8gemm_power10_mma.c b/kernels/power10/3/bli_i8gemm_power10_mma.c
index 8a0b158a5..2948e10bf 100644
--- a/kernels/power10/3/bli_i8gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i8gemm_power10_mma.c
@@ -32,6 +32,8 @@
 
 */
 
+#ifdef BLIS_SANDBOX_POWER10
+
 #include "vector_int_macros.h"
 
 #define I8_ACCUMULATE \
@@ -139,3 +141,4 @@ void bli_i8gemm_power10_mma_8x16
         SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
     }
 }
+#endif // BLIS_SANDBOX_POWER10
diff --git a/kernels/power10/3/bli_sbgemm_power10_mma.c b/kernels/power10/3/bli_sbgemm_power10_mma.c
index c16710f45..e68c5bed9 100644
--- a/kernels/power10/3/bli_sbgemm_power10_mma.c
+++ b/kernels/power10/3/bli_sbgemm_power10_mma.c
@@ -32,6 +32,8 @@
 
 */
 
+#ifdef BLIS_SANDBOX_POWER10
+
 #include "vector_int_macros.h"
 
 #define B_ACCUMULATE \
@@ -140,4 +142,5 @@ void bli_sbgemm_power10_mma_8x16
         SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
     }
 
-}
\ No newline at end of file
+}
+#endif // BLIS_SANDBOX_POWER10
diff --git a/kernels/power10/3/bli_shgemm_power10_mma.c b/kernels/power10/3/bli_shgemm_power10_mma.c
index dc62b5d60..9c7f9f741 100644
--- a/kernels/power10/3/bli_shgemm_power10_mma.c
+++ b/kernels/power10/3/bli_shgemm_power10_mma.c
@@ -32,6 +32,8 @@
 
 */
 
+#ifdef BLIS_SANDBOX_POWER10
+
 #include "vector_int_macros.h"
 
 #define H_ACCUMULATE \
@@ -140,4 +142,5 @@ void bli_shgemm_power10_mma_8x16
         SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
     }
 
-}
\ No newline at end of file
+}
+#endif // BLIS_SANDBOX_POWER10
diff --git a/sandbox/power10/bli_sandbox.h b/sandbox/power10/bli_sandbox.h
index 22d293d13..35f786912 100644
--- a/sandbox/power10/bli_sandbox.h
+++ b/sandbox/power10/bli_sandbox.h
@@ -35,6 +35,10 @@
 #ifndef BLIS_SANDBOX_H
 #define BLIS_SANDBOX_H
 
+#ifndef BLIS_SANDBOX_POWER10
+#define BLIS_SANDBOX_POWER10
+#endif
+
 #include "blis.h"
 #include "gemm_prototypes.h"
 

From 38d88d5c131253066cad4f98eea06fa9299cae3b Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Tue, 10 Jan 2023 21:24:58 -0600
Subject: [PATCH 120/230] Define new global scalar (obj_t) constants. (#703)

Details:
- This commit defines the following new global scalar constants:
  - BLIS_ONE_I: This constant encodes the imaginary unit.
  - BLIS_MINUS_ONE_I: This constant encodes the negative imaginary unit.
  - BLIS_NAN: This constant encodes a not-a-number value. Both real and
    imaginary parts are set to NaN for complex datatypes.
---
 frame/base/bli_const.c          | 26 ++++++++++++++++----------
 frame/include/bli_extern_defs.h |  3 +++
 frame/include/bli_type_defs.h   |  9 +++++++++
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/frame/base/bli_const.c b/frame/base/bli_const.c
index 210d6ae77..03f1f7e60 100644
--- a/frame/base/bli_const.c
+++ b/frame/base/bli_const.c
@@ -36,19 +36,25 @@
 
 // Statically initialize structs containing representations of various
 // constants for each datatype supported in BLIS.
-static constdata_t bli_two_buffer  = bli_obj_init_constdata(  2.0 );
-static constdata_t bli_one_buffer  = bli_obj_init_constdata(  1.0 );
-static constdata_t bli_zero_buffer = bli_obj_init_constdata(  0.0 );
-static constdata_t bli_mone_buffer = bli_obj_init_constdata( -1.0 );
-static constdata_t bli_mtwo_buffer = bli_obj_init_constdata( -2.0 );
+static constdata_t bli_two_buffer   = bli_obj_init_constdata(  2.0 );
+static constdata_t bli_one_buffer   = bli_obj_init_constdata(  1.0 );
+static constdata_t bli_zero_buffer  = bli_obj_init_constdata(  0.0 );
+static constdata_t bli_mone_buffer  = bli_obj_init_constdata( -1.0 );
+static constdata_t bli_mtwo_buffer  = bli_obj_init_constdata( -2.0 );
+static constdata_t bli_onei_buffer  = bli_obj_init_constdata_ri( 0.0,  1.0 );
+static constdata_t bli_monei_buffer = bli_obj_init_constdata_ri( 0.0, -1.0 );
+static constdata_t bli_nan_buffer   = bli_obj_init_constdata_ri( NAN,  NAN );
 
 // Statically initialize global scalar constants, attaching the addresses
 // of the corresponding structs above.
-const obj_t BLIS_TWO       = bli_obj_init_const( &bli_two_buffer );
-const obj_t BLIS_ONE       = bli_obj_init_const( &bli_one_buffer );
-const obj_t BLIS_ZERO      = bli_obj_init_const( &bli_zero_buffer );
-const obj_t BLIS_MINUS_ONE = bli_obj_init_const( &bli_mone_buffer );
-const obj_t BLIS_MINUS_TWO = bli_obj_init_const( &bli_mtwo_buffer );
+const obj_t BLIS_TWO         = bli_obj_init_const( &bli_two_buffer );
+const obj_t BLIS_ONE         = bli_obj_init_const( &bli_one_buffer );
+const obj_t BLIS_ZERO        = bli_obj_init_const( &bli_zero_buffer );
+const obj_t BLIS_MINUS_ONE   = bli_obj_init_const( &bli_mone_buffer );
+const obj_t BLIS_MINUS_TWO   = bli_obj_init_const( &bli_mtwo_buffer );
+const obj_t BLIS_ONE_I       = bli_obj_init_const( &bli_onei_buffer );
+const obj_t BLIS_MINUS_ONE_I = bli_obj_init_const( &bli_monei_buffer );
+const obj_t BLIS_NAN         = bli_obj_init_const( &bli_nan_buffer );
 
 #if 0
 obj_t BLIS_TWO = {};
diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h
index 71a6096e1..f157a6d56 100644
--- a/frame/include/bli_extern_defs.h
+++ b/frame/include/bli_extern_defs.h
@@ -42,6 +42,9 @@ BLIS_EXPORT_BLIS extern const obj_t BLIS_ZERO;
 //BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF;
 BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_ONE;
 BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_TWO;
+BLIS_EXPORT_BLIS extern const obj_t BLIS_ONE_I;
+BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_ONE_I;
+BLIS_EXPORT_BLIS extern const obj_t BLIS_NAN;
 
 BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM;
 
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index 014be18b7..cb933bfa4 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -1415,6 +1415,15 @@ BLIS_INLINE void bli_obj_init_subpart_from( const obj_t* a, obj_t* b )
 	.i =           ( gint_t )val, \
 }
 
+#define bli_obj_init_constdata_ri( valr, vali ) \
+{ \
+	.s =           ( float  )valr, \
+	.d =           ( double )valr, \
+	.c = { .real = ( float  )valr, .imag = ( float  )vali }, \
+	.z = { .real = ( double )valr, .imag = ( double )vali }, \
+	.i =           ( gint_t )valr, \
+}
+
 
 // -- Context type --
 

From b895ec9f1f66fb93972589c06bff171337153a31 Mon Sep 17 00:00:00 2001
From: Nisanth M P <nisanthmp.01@gmail.com>
Date: Wed, 11 Jan 2023 09:02:32 +0530
Subject: [PATCH 121/230] Fixing type-mismatch errors in power10 sandbox (#701)

Details:
- This commit fixes a mismatch between the function type signature of
  bli_gemm_ex() required by BLIS and the version of the function defined
  within the power10 sandbox. It also performs typecasting upon calling
  bli_gemm_front() to attain type consistency with the type signature
  defined by BLIS for bli_gemm_front().
---
 sandbox/power10/bli_gemm_ex.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/sandbox/power10/bli_gemm_ex.c b/sandbox/power10/bli_gemm_ex.c
index d136c7e1b..7eef0ccef 100644
--- a/sandbox/power10/bli_gemm_ex.c
+++ b/sandbox/power10/bli_gemm_ex.c
@@ -46,13 +46,13 @@
 
 void bli_gemm_ex
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+       const rntm_t* rntm
      )
 {
 	bli_init_once();
@@ -73,7 +73,7 @@ void bli_gemm_ex
 	// Invoke the operation's front end.
 	bli_gemm_front
 	(
-	  alpha, a, b, beta, c, cntx, rntm
+	  alpha, a, b, beta, c, cntx, (rntm_t* )rntm
 	);
 }
 

From 9a366b14fe52c469f4664ef5dd93d85be8d97baa Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 12 Jan 2023 13:07:22 -0600
Subject: [PATCH 122/230] Implement cntx_t pointer caching in gks. (#709)

Details:
- Refactored the gks cntx_t query functions so that: (1) there is a
  clearer pattern of similarity between functions that query a native
  context and those that query its induced (1m) counterpart; and (2)
  queried cntx_t pointers (for both native and induced cntx_t pointers)
  are cached (by default), or deep-queried upon each invocation,
  depending on whether cpp macro BLIS_ENABLE_GKS_CACHING is defined.
- Refactored query-related functions in bli_arch.c to cache the queried
  arch_t value (by default), or deep-query the arch_t value upon each
  invocation, depending on whether cpp macro BLIS_ENABLE_GKS_CACHING is
  defined.
- Tweaked the behavior of bli_gks_query_ind_cntx_impl() (formerly named
  bli_gks_query_ind_cntx()) so that the induced method cntx_t struct is
  repopulated each time the function is called. (It is still only
  allocated once on first call.) This was mostly done in preparation for
  some future in which the arch_t value might change at runtime. In such
  a scenario, the induced method context would need to be recalculated
  any time the native context changes.
- Added preprocessor logic to bli_config_macro_defs.h to handle enabling
  or disabling of cntx_t pointer caching (via BLIS_ENABLE_GKS_CACHING).
- For now, cntx_t pointer caching is enabled by default and does not
  correspond to any official configure option. Disabling can be done
  by inserting a #define for BLIS_DISABLE_GKS_CACHING into the
  appropriate bli_family_*.h header file within the configuration of
  interest.
- Thanks to Harihara Sudhan S (AMD) for suggesting that cntx_t pointers
  (and not just arch_t values) be cached.
- Comment updates.
---
 CREDITS                               |   1 +
 frame/base/bli_arch.c                 |  43 +++++++-
 frame/base/bli_arch.h                 |   5 +-
 frame/base/bli_gks.c                  | 146 ++++++++++++++++++++------
 frame/base/bli_gks.h                  |   7 +-
 frame/base/bli_ind.c                  |   6 +-
 frame/base/bli_memsys.c               |  10 +-
 frame/include/bli_config_macro_defs.h |  11 ++
 frame/thread/bli_pthread.c            |   2 +-
 9 files changed, 179 insertions(+), 52 deletions(-)

diff --git a/CREDITS b/CREDITS
index 939351c00..51afcc276 100644
--- a/CREDITS
+++ b/CREDITS
@@ -104,6 +104,7 @@ but many others have contributed code and feedback, including
   Paul Springer            @springer13         (RWTH Aachen University)
   Adam J. Stewart          @adamjstewart       (University of Illinois at Urbana-Champaign)
   Vladimir Sukarev
+  Harihara Sudhan S        @ihariharasudhan    (AMD)
   Chengguo Sun             @chengguosun
   Santanu Thangaraj                            (AMD)
   Nicholai Tukanov         @nicholaiTukanov    (The University of Texas at Austin)
diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c
index 48b50a774..bd3f24993 100644
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -67,14 +67,27 @@
 
 // The arch_t id for the currently running hardware. We initialize to -1,
 // which will be overwritten upon calling bli_arch_set_id().
-static arch_t id = -1;
+static arch_t cached_id = -1;
 
 arch_t bli_arch_query_id( void )
 {
+
+#ifdef BLIS_ENABLE_GKS_CACHING
+
+	// Deep-query the arch_t id once via bli_pthread_once(). Since we are
+	// constrained by the pthread interface to pthread_once(), the id must be
+	// "returned" indirectly via a static variable (cached_id).
 	bli_arch_set_id_once();
 
-	// Simply return the id that was previously cached.
-	return id;
+	// Return the id that was previously cached.
+	return cached_id;
+
+#else
+
+	// Deep-query and return a fresh arch_t.
+	return bli_arch_query_id_impl();
+
+#endif
 }
 
 // -----------------------------------------------------------------------------
@@ -85,6 +98,9 @@ static bli_pthread_once_t once_id = BLIS_PTHREAD_ONCE_INIT;
 
 void bli_arch_set_id_once( void )
 {
+	// When this file is being compiled as part of the configure script's
+	// hardware auto-detection driver, we avoid calling the bli_pthread APIs
+	// so that we aren't required to include those symbols in the executable.
 #ifndef BLIS_CONFIGURETIME_CPUID
 	bli_pthread_once( &once_id, bli_arch_set_id );
 #endif
@@ -94,6 +110,16 @@ void bli_arch_set_id_once( void )
 
 void bli_arch_set_id( void )
 {
+	// Deep-query the arch_t and save it in the static variable (cached_id).
+	cached_id = bli_arch_query_id_impl();
+}
+
+// -----------------------------------------------------------------------------
+
+arch_t bli_arch_query_id_impl( void )
+{
+	arch_t id;
+
 	// Check the environment variable BLIS_ARCH_DEBUG to see if the user
 	// requested that we echo the result of the subconfiguration selection.
 	bool do_logging = bli_env_get_var( "BLIS_ARCH_DEBUG", 0 );
@@ -103,6 +129,9 @@ void bli_arch_set_id( void )
 	// requested that we use a specific subconfiguration.
 	dim_t req_id = bli_env_get_var( "BLIS_ARCH_TYPE", -1 );
 
+	// When this file is being compiled as part of the configure script's
+	// hardware auto-detection driver, we avoid calling the bli_check APIs
+	// so that we aren't required to include those symbols in the executable.
 #ifndef BLIS_CONFIGURETIME_CPUID
 	if ( req_id != -1 )
 	{
@@ -243,8 +272,12 @@ void bli_arch_set_id( void )
 		fprintf( stderr, "libblis: selecting sub-configuration '%s'.\n",
 				 bli_arch_string( id ) );
 
-	//printf( "blis_arch_query_id(): id = %u\n", id );
-	//exit(1);
+	#if 0
+	printf( "blis_arch_query_id_impl(): id = %u\n", id );
+	exit(1);
+	#endif
+
+	return id;
 }
 
 // -----------------------------------------------------------------------------
diff --git a/frame/base/bli_arch.h b/frame/base/bli_arch.h
index 08af7ae79..a5f0c15d3 100644
--- a/frame/base/bli_arch.h
+++ b/frame/base/bli_arch.h
@@ -37,8 +37,9 @@
 
 BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void );
 
-void bli_arch_set_id_once( void );
-void bli_arch_set_id( void );
+void   bli_arch_set_id_once( void );
+void   bli_arch_set_id( void );
+arch_t bli_arch_query_id_impl( void );
 
 BLIS_EXPORT_BLIS const char*  bli_arch_string( arch_t id );
 
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index 094810d9d..df0abc8ed 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -35,8 +35,8 @@
 
 #include "blis.h"
 
-// The array of cntx_t* pointers to cache modified contexts used by
-// induced methods.
+// The array of cntx_t* pointers to cache modified contexts used by induced
+// methods.
 static cntx_t** gks[ BLIS_NUM_ARCHS ];
 
 // The array of function pointers holding the registered context initialization
@@ -52,6 +52,13 @@ typedef void (*nat_cntx_init_ft)( cntx_t* cntx );
 typedef void (*ref_cntx_init_ft)( cntx_t* cntx );
 typedef void (*ind_cntx_init_ft)( ind_t method, cntx_t* cntx );
 
+// Cached copies of the pointers to the native and induced contexts for the
+// active subconfiguration. When BLIS_ENABLE_GKS_CACHING is enabled, these
+// pointers will be set once and then reused to fulfill subsequent context
+// queries.
+static cntx_t* cached_cntx_nat = NULL;
+static cntx_t* cached_cntx_ind = NULL;
+
 // -----------------------------------------------------------------------------
 
 void bli_gks_init( void )
@@ -216,6 +223,18 @@ void bli_gks_init( void )
 		                                              bli_cntx_init_generic_ind );
 #endif
 	}
+
+#ifdef BLIS_ENABLE_GKS_CACHING
+	// Deep-query and cache the native and induced method contexts so they are
+	// ready to go when needed (by BLIS or the application). Notice that we use
+	// the _noinit() APIs, which skip their internal calls to bli_init_once().
+	// The reasons: (1) Skipping that call is necessary to prevent an infinite
+	// loop since the current function, bli_gks_init(), is called from within
+	// bli_init_once(); and (2) we can guarantee that the gks has been
+	// initialized given that bli_gks_init() is about to return.
+	cached_cntx_nat = ( cntx_t* )bli_gks_query_nat_cntx_noinit();
+	cached_cntx_ind = ( cntx_t* )bli_gks_query_ind_cntx_noinit( BLIS_1M );
+#endif
 }
 
 // -----------------------------------------------------------------------------
@@ -267,6 +286,12 @@ void bli_gks_finalize( void )
 
 	}
 	// END CRITICAL SECTION
+
+#ifdef BLIS_ENABLE_GKS_CACHING
+	// Clear the cached pointers to the native and induced contexts.
+	cached_cntx_nat = NULL;
+	cached_cntx_ind = NULL;
+#endif
 }
 
 // -----------------------------------------------------------------------------
@@ -475,10 +500,38 @@ const cntx_t* bli_gks_query_cntx( void )
 	return bli_gks_query_nat_cntx();
 }
 
+// -----------------------------------------------------------------------------
+
 const cntx_t* bli_gks_query_nat_cntx( void )
 {
 	bli_init_once();
 
+#ifdef BLIS_ENABLE_GKS_CACHING
+
+	// Return a pointer to the context for native execution that was deep-
+	// queried and cached at the end of bli_gks_init().
+	return cached_cntx_nat;
+
+#else
+
+	// Deep-query and return the address of a context for native execution.
+	return bli_gks_query_nat_cntx_impl();
+
+#endif
+}
+
+const cntx_t* bli_gks_query_nat_cntx_noinit( void )
+{
+	// NOTE: This function purposefully avoids calling bli_init_once() so that
+	// it is safe to call during initialization.
+
+	return bli_gks_query_nat_cntx_impl();
+}
+
+// -----------------------------------------------------------------------------
+
+const cntx_t* bli_gks_query_nat_cntx_impl( void )
+{
 	// Return the address of the native context for the architecture id
 	// corresponding to the current hardware, as determined by
 	// bli_arch_query_id().
@@ -494,18 +547,42 @@ const cntx_t* bli_gks_query_nat_cntx( void )
 
 // -----------------------------------------------------------------------------
 
-const cntx_t* bli_gks_query_cntx_noinit( void )
+const cntx_t* bli_gks_query_ind_cntx
+     (
+       ind_t ind
+     )
 {
-	// This function is identical to bli_gks_query_cntx(), except that it
-	// does not call bli_init_once().
+	bli_init_once();
 
-	// Query the architecture id.
-	arch_t id = bli_arch_query_id();
+#ifdef BLIS_ENABLE_GKS_CACHING
 
-	// Use the architecture id to look up a pointer to its context.
-	const cntx_t* cntx = bli_gks_lookup_nat_cntx( id );
+	// If for some reason the native context was requested, we return its
+	// address instead of the one for induced execution.
+	if ( ind == BLIS_NAT ) return cached_cntx_nat;
 
-	return cntx;
+	// Return a pointer to the context for the induced method that was deep-
+	// queried and cached at the end of bli_gks_init().
+	return cached_cntx_ind;
+
+#else
+
+	// Deep-query and return the address of a context for the requested induced
+	// method. (In this case, caching never takes place since it was disabled
+	// at configure-time.)
+	return bli_gks_query_ind_cntx_impl( ind );
+
+#endif
+}
+
+const cntx_t* bli_gks_query_ind_cntx_noinit
+     (
+       ind_t ind
+     )
+{
+	// NOTE: This function purposefully avoids calling bli_init_once() so that
+	// it is safe to call during initialization.
+
+	return bli_gks_query_ind_cntx_impl( ind );
 }
 
 // -----------------------------------------------------------------------------
@@ -514,16 +591,15 @@ const cntx_t* bli_gks_query_cntx_noinit( void )
 // with a new entry corresponding to a context for an ind_t value.
 static bli_pthread_mutex_t gks_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
 
-const cntx_t* bli_gks_query_ind_cntx
+const cntx_t* bli_gks_query_ind_cntx_impl
      (
        ind_t ind
      )
 {
-	bli_init_once();
-
 	cntx_t* gks_id_ind;
 	err_t r_val;
 
+
 	// Return the address of a context that will be suited for executing a
 	// level-3 operation via the requested induced method (and datatype) for
 	// the architecture id corresponding to the current hardware, as
@@ -532,10 +608,13 @@ const cntx_t* bli_gks_query_ind_cntx
 	// This function is called when a level-3 operation via induced method is
 	// called, e.g. bli_gemm1m(). If this is the first time that induced method
 	// is being executed since bli_gks_init(), the necessary context structure
-	// is allocated and initialized. If this is not the first time, then the
-	// address of a previously-allocated and initialized (cached) context is
-	// returned. Note that much of this must be done with mutual exclusion to
-	// ensure thread safety and deterministic behavior.
+	// is allocated. If this is not the first time a context for the requested
+	// induced method was queried, then the memory will already be allocated
+	// and initialized, and the previous cntx_t struct will be overwritten.
+	// The function will then return the address to the newly-initialized (or
+	// previously-allocated-but-reinitialized) cntx_t struct. Note that some of
+	// this function must be executed with mutual exclusion to ensure thread
+	// safety and deterministic behavior.
 
 	// Query the architecture id.
 	arch_t id = bli_arch_query_id();
@@ -583,23 +662,24 @@ const cntx_t* bli_gks_query_ind_cntx
 			// gks_id[ ind ].
 			gks_id_ind    = bli_calloc_intl( sizeof( cntx_t ), &r_val );
 			gks_id[ ind ] = gks_id_ind;
-
-			// Before we can call the induced method context initialization
-			// function on the newly allocated structure, we must first copy
-			// over the contents of the native context.
-			*gks_id_ind = *gks_id_nat;
-
-			// Use the architecture id to look up the function pointer to the
-			// context initialization function for induced methods.
-			ind_cntx_init_ft f = cntx_ind_init[ id ];
-
-			// Now we modify the context (so that it contains the proper values
-			// for its induced method) by calling the context initialization
-			// function for the current induced method. (That function assumes
-			// that the context is pre- initialized with values for native
-			// execution.)
-			f( ind, gks_id_ind );
 		}
+
+		// Before we can call the induced method context initialization
+		// function on the newly allocated structure, we must first copy
+		// over the contents of the native context. If a previous context
+		// was already copied, this will overwrite those previous values.
+		*gks_id_ind = *gks_id_nat;
+
+		// Use the architecture id to look up the function pointer to the
+		// context initialization function for induced methods.
+		ind_cntx_init_ft f = cntx_ind_init[ id ];
+
+		// Now we modify the context (so that it contains the proper values
+		// for its induced method) by calling the context initialization
+		// function for the current induced method. (That function assumes
+		// that the context is pre-initialized with values for native
+		// execution.)
+		f( ind, gks_id_ind );
 	}
 	// END CRITICAL SECTION
 
diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h
index 3a93fd59e..d1c715be1 100644
--- a/frame/base/bli_gks.h
+++ b/frame/base/bli_gks.h
@@ -46,11 +46,14 @@ const cntx_t* const *          bli_gks_lookup_id( arch_t id );
 void                           bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp );
 
 BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_cntx( void );
-BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_nat_cntx( void );
 
-const cntx_t*                  bli_gks_query_cntx_noinit( void );
+BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_nat_cntx( void );
+const cntx_t*                  bli_gks_query_nat_cntx_noinit( void );
+const cntx_t*                  bli_gks_query_nat_cntx_impl( void );
 
 BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_ind_cntx( ind_t ind );
+const cntx_t*                  bli_gks_query_ind_cntx_noinit( ind_t ind );
+const cntx_t*                  bli_gks_query_ind_cntx_impl( ind_t ind );
 
 BLIS_EXPORT_BLIS void          bli_gks_init_ref_cntx( cntx_t* cntx );
 
diff --git a/frame/base/bli_ind.c b/frame/base/bli_ind.c
index fbe740465..cc2810d51 100644
--- a/frame/base/bli_ind.c
+++ b/frame/base/bli_ind.c
@@ -44,9 +44,9 @@ static const char* bli_ind_impl_str[BLIS_NUM_IND_METHODS] =
 
 void bli_ind_init( void )
 {
-	// NOTE: Instead of calling bli_gks_query_cntx(), we call
-	// bli_gks_query_cntx_noinit() to avoid the call to bli_init_once().
-	const cntx_t* cntx = bli_gks_query_cntx_noinit();
+	// NOTE: We intentionally call bli_gks_query_nat_cntx_noinit() in order
+	// to avoid the internal call to bli_init_once().
+	const cntx_t* cntx = bli_gks_query_nat_cntx_noinit();
 
 	// For each precision, enable the default induced method (1m) if both of
 	// the following conditions are met:
diff --git a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c
index 7b62ded5c..a226b7b85 100644
--- a/frame/base/bli_memsys.c
+++ b/frame/base/bli_memsys.c
@@ -39,12 +39,10 @@
 void bli_memsys_init( void )
 {
 	// Query a native context so we have something to pass into
-	// bli_pba_init_pools(). We use BLIS_DOUBLE for the datatype,
-	// but the dt argument is actually only used when initializing
-	// contexts for induced methods.
-	// NOTE: Instead of calling bli_gks_query_cntx(), we call
-	// bli_gks_query_cntx_noinit() to avoid the call to bli_init_once().
-	const cntx_t* cntx_p = bli_gks_query_cntx_noinit();
+	// bli_pba_init_pools().
+	// NOTE: We intentionally call bli_gks_query_nat_cntx_noinit() in order
+	// to avoid the internal call to bli_init_once().
+	const cntx_t* cntx_p = bli_gks_query_nat_cntx_noinit();
 
 	// Initialize the packing block allocator and its data structures.
 	bli_pba_init( cntx_p );
diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h
index 9e9d47699..e7b77acbb 100644
--- a/frame/include/bli_config_macro_defs.h
+++ b/frame/include/bli_config_macro_defs.h
@@ -77,6 +77,17 @@
 #endif
 
 
+// -- GKS CACHING --------------------------------------------------------------
+
+// Enable caching of queried cntx_t pointers in the gks?
+#ifdef BLIS_DISABLE_GKS_CACHING
+  #undef BLIS_ENABLE_GKS_CACHING
+#else
+  // Default behavior is enabled.
+  #define BLIS_ENABLE_GKS_CACHING
+#endif
+
+
 // -- MULTITHREADING -----------------------------------------------------------
 
 // Enable multithreading via POSIX threads.
diff --git a/frame/thread/bli_pthread.c b/frame/thread/bli_pthread.c
index 804ace46d..b840e2b77 100644
--- a/frame/thread/bli_pthread.c
+++ b/frame/thread/bli_pthread.c
@@ -701,7 +701,7 @@ int bli_pthread_barrier_wait
 // Note that bli_pthread_switch_t has the following properties:
 //
 // 1. Access to a switch is protected by a mutex specific to that switch, and
-//    therefore state changes and thread-safe.
+//    therefore state changes are thread-safe.
 //
 // 2. An initialized switch always starts in the "off" state.
 //

From 16d2e9ea9ca0853197b416eba701b840a8587bca Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 13 Jan 2023 20:03:01 -0600
Subject: [PATCH 123/230] Defined lt, lte, gt, gte + misc. other updates.
 (#712)

Details:
- Changed invertsc operation to be a non-destructive operation; that is,
  it now takes separate input and output operands. This change applies
  to both the object and typed APIs.
- Defined an alternative square root operation, sqrtrsc, which, when
  operating on complex scalars, assumes the imaginary part of the input
  to be zero.
- Changed the semantics of addm, subm, copym, axpym, scal2m, and xpbym
  so that when the source matrix has an implicit unit diagonal, the
  operation leaves the diagonal of the destination matrix untouched.
  Previously, the operations would interpret an implicit unit diagonal
  on the source matrix as a request to manifest the unit diagonal
  *explicitly* on output (either as something to copy in the case of
  copym, or something to compute with in the cases of addm, subm, axpym,
  scal2m, and xpbym). It turns out that this behavior was too cute by
  half and could cause unintended headaches for practical use cases.
  (This change in behavior also required small modifications to the trmv
  and trsv testsuite modules so that they would properly test matrices
  with unit diagonals.)
- Added missing dependencies for copym to gemv, ger, hemv, trmv, and
  trsv testsuite modules.
- Implemented level-0-like ltsc, ltesc, gtsc, gtesc operations in
  frame/util, which use lt, lte, gt, and gte level-0 scalar macros.
- Trivial variable rename in bli_part.c to harmonize with other
  variable naming conventions.
---
 examples/oapi/04level0.c              |  2 +-
 frame/0/bli_l0_check.c                | 16 +-----
 frame/0/bli_l0_check.h                | 13 +----
 frame/0/bli_l0_fpa.c                  |  1 +
 frame/0/bli_l0_fpa.h                  |  1 +
 frame/0/bli_l0_ft.h                   | 27 +++++-----
 frame/0/bli_l0_oapi.c                 | 33 +------------
 frame/0/bli_l0_oapi.h                 | 11 +----
 frame/0/bli_l0_tapi.c                 | 26 ++++++++--
 frame/0/bli_l0_tapi.h                 | 12 +----
 frame/1m/bli_l1m_tapi.c               | 30 +++++++++++
 frame/base/bli_part.c                 | 18 +++----
 frame/include/bli_scalar_macro_defs.h |  2 +
 frame/include/level0/bli_lt.h         | 71 +++++++++++++++++++++++++++
 frame/include/level0/bli_lte.h        | 71 +++++++++++++++++++++++++++
 frame/util/bli_util_check.c           | 12 +++--
 frame/util/bli_util_check.h           |  8 ++-
 frame/util/bli_util_fpa.c             |  4 ++
 frame/util/bli_util_fpa.h             |  4 ++
 frame/util/bli_util_ft.h              | 23 +++++++--
 frame/util/bli_util_oapi.c            | 50 +++++++++++++++++++
 frame/util/bli_util_oapi.h            |  6 ++-
 frame/util/bli_util_tapi.c            | 21 ++++++++
 frame/util/bli_util_tapi.h            | 16 ++++++
 testsuite/src/test_gemv.c             |  1 +
 testsuite/src/test_ger.c              |  1 +
 testsuite/src/test_hemv.c             |  1 +
 testsuite/src/test_trmv.c             |  6 +++
 testsuite/src/test_trsv.c             |  6 +++
 29 files changed, 379 insertions(+), 114 deletions(-)
 create mode 100644 frame/include/level0/bli_lt.h
 create mode 100644 frame/include/level0/bli_lte.h

diff --git a/examples/oapi/04level0.c b/examples/oapi/04level0.c
index c876ac414..72fe98200 100644
--- a/examples/oapi/04level0.c
+++ b/examples/oapi/04level0.c
@@ -166,7 +166,7 @@ int main( int argc, char** argv )
 	bli_normfsc( &zeta, &alpha );
 	bli_printm( "alpha := normf( zeta )  # normf() = complex modulus in complex domain.", &alpha, "%4.1f", "" );
 
-	bli_invertsc( &gamma );
+	bli_invertsc( &gamma, &gamma );
 	bli_printm( "gamma := 1.0 / gamma", &gamma, "%4.2f", "" );
 
 
diff --git a/frame/0/bli_l0_check.c b/frame/0/bli_l0_check.c
index 02867a22d..a1f1c1ca1 100644
--- a/frame/0/bli_l0_check.c
+++ b/frame/0/bli_l0_check.c
@@ -55,20 +55,8 @@ GENFRONT( copysc )
 GENFRONT( divsc )
 GENFRONT( mulsc )
 GENFRONT( sqrtsc )
+GENFRONT( sqrtrsc )
 GENFRONT( subsc )
-
-
-#undef  GENFRONT
-#define GENFRONT( opname ) \
-\
-void PASTEMAC(opname,_check) \
-     ( \
-       const obj_t* chi  \
-     ) \
-{ \
-	bli_l0_xsc_check( chi ); \
-}
-
 GENFRONT( invertsc )
 
 
@@ -357,7 +345,7 @@ void bli_l0_xxbsc_check
      (
        const obj_t* chi,
        const obj_t* psi,
-       const bool*  is_eq
+       const bool*  is
      )
 {
 	err_t e_val;
diff --git a/frame/0/bli_l0_check.h b/frame/0/bli_l0_check.h
index 1bbb4a756..e5818dbde 100644
--- a/frame/0/bli_l0_check.h
+++ b/frame/0/bli_l0_check.h
@@ -51,17 +51,8 @@ GENTPROT( copysc )
 GENTPROT( divsc )
 GENTPROT( mulsc )
 GENTPROT( sqrtsc )
+GENTPROT( sqrtrsc )
 GENTPROT( subsc )
-
-
-#undef  GENTPROT
-#define GENTPROT( opname ) \
-\
-void PASTEMAC(opname,_check) \
-     ( \
-       const obj_t* chi  \
-     );
-
 GENTPROT( invertsc )
 
 
@@ -152,5 +143,5 @@ void bli_l0_xxbsc_check
      (
        const obj_t* chi,
        const obj_t* psi,
-       const bool*  is_eq
+       const bool*  is
      );
diff --git a/frame/0/bli_l0_fpa.c b/frame/0/bli_l0_fpa.c
index 4aa7ae764..b841ce5a5 100644
--- a/frame/0/bli_l0_fpa.c
+++ b/frame/0/bli_l0_fpa.c
@@ -56,6 +56,7 @@ GENFRONT( mulsc )
 GENFRONT( subsc )
 GENFRONT( invertsc )
 GENFRONT( sqrtsc )
+GENFRONT( sqrtrsc )
 GENFRONT( unzipsc )
 GENFRONT( zipsc )
 
diff --git a/frame/0/bli_l0_fpa.h b/frame/0/bli_l0_fpa.h
index 0d9b28361..623a3f69b 100644
--- a/frame/0/bli_l0_fpa.h
+++ b/frame/0/bli_l0_fpa.h
@@ -50,6 +50,7 @@ GENPROT( mulsc )
 GENPROT( subsc )
 GENPROT( invertsc )
 GENPROT( sqrtsc )
+GENPROT( sqrtrsc )
 GENPROT( unzipsc )
 GENPROT( zipsc )
 
diff --git a/frame/0/bli_l0_ft.h b/frame/0/bli_l0_ft.h
index 01d90cc3b..9ca69d534 100644
--- a/frame/0/bli_l0_ft.h
+++ b/frame/0/bli_l0_ft.h
@@ -37,7 +37,7 @@
 // -- Level-0 function types ---------------------------------------------------
 //
 
-// addsc, divsc, subsc
+// addsc, divsc, subsc, invertsc
 
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
@@ -52,18 +52,6 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \
 INSERT_GENTDEF( addsc )
 INSERT_GENTDEF( divsc )
 INSERT_GENTDEF( subsc )
-
-// invertsc
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
-\
-typedef void (*PASTECH2(ch,opname,tsuf)) \
-     ( \
-       conj_t  conjchi, \
-       ctype*  chi  \
-     );
-
 INSERT_GENTDEF( invertsc )
 
 // mulsc
@@ -119,6 +107,19 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \
 
 INSERT_GENTDEF( sqrtsc )
 
+// sqrtrsc
+
+#undef  GENTDEF
+#define GENTDEF( ctype, ch, opname, tsuf ) \
+\
+typedef void (*PASTECH2(ch,opname,tsuf)) \
+     ( \
+       const ctype* chi, \
+             ctype* psi  \
+     );
+
+INSERT_GENTDEF( sqrtrsc )
+
 // getsc
 
 #undef  GENTDEF
diff --git a/frame/0/bli_l0_oapi.c b/frame/0/bli_l0_oapi.c
index 0bfdbe3b3..612babe56 100644
--- a/frame/0/bli_l0_oapi.c
+++ b/frame/0/bli_l0_oapi.c
@@ -115,38 +115,6 @@ GENFRONT( addsc )
 GENFRONT( divsc )
 GENFRONT( mulsc )
 GENFRONT( subsc )
-
-
-#undef  GENFRONT
-#define GENFRONT( opname ) \
-\
-void PASTEMAC0(opname) \
-     ( \
-       const obj_t* chi  \
-     ) \
-{ \
-	bli_init_once(); \
-\
-	num_t     dt        = bli_obj_dt( chi ); \
-\
-	conj_t    conjchi   = bli_obj_conj_status( chi ); \
-\
-	void*     buf_chi   = bli_obj_buffer_for_1x1( dt, chi ); \
-\
-	if ( bli_error_checking_is_enabled() ) \
-		PASTEMAC(opname,_check)( chi ); \
-\
-	/* Query a type-specific function pointer, except one that uses
-	   void* for function arguments instead of typed pointers. */ \
-	PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt ); \
-\
-	f \
-	( \
-	  conjchi, \
-	  buf_chi  \
-	); \
-}
-
 GENFRONT( invertsc )
 
 
@@ -181,6 +149,7 @@ void PASTEMAC0(opname) \
 }
 
 GENFRONT( sqrtsc )
+GENFRONT( sqrtrsc )
 
 
 #undef  GENFRONT
diff --git a/frame/0/bli_l0_oapi.h b/frame/0/bli_l0_oapi.h
index a34252cf7..713da0d70 100644
--- a/frame/0/bli_l0_oapi.h
+++ b/frame/0/bli_l0_oapi.h
@@ -63,17 +63,8 @@ GENPROT( addsc )
 GENPROT( divsc )
 GENPROT( mulsc )
 GENPROT( sqrtsc )
+GENPROT( sqrtrsc )
 GENPROT( subsc )
-
-
-#undef  GENPROT
-#define GENPROT( opname ) \
-\
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
-     ( \
-       const obj_t* chi  \
-     );
-
 GENPROT( invertsc )
 
 
diff --git a/frame/0/bli_l0_tapi.c b/frame/0/bli_l0_tapi.c
index e0cdffcf3..7d6d33131 100644
--- a/frame/0/bli_l0_tapi.c
+++ b/frame/0/bli_l0_tapi.c
@@ -66,8 +66,9 @@ INSERT_GENTFUNC_BASIC( subsc, subs )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t conjchi, \
-       ctype* chi  \
+             conj_t conjchi, \
+       const ctype* chi, \
+             ctype* psi  \
      ) \
 { \
 	bli_init_once(); \
@@ -76,7 +77,7 @@ void PASTEMAC(ch,opname) \
 \
 	PASTEMAC(ch,copycjs)( conjchi, *chi, chi_conj ); \
 	PASTEMAC(ch,kername)( chi_conj ); \
-	PASTEMAC(ch,copys)( chi_conj, *chi ); \
+	PASTEMAC(ch,copys)( chi_conj, *psi ); \
 }
 
 INSERT_GENTFUNC_BASIC( invertsc, inverts )
@@ -176,6 +177,25 @@ void PASTEMAC(ch,opname) \
 INSERT_GENTFUNC_BASIC0( sqrtsc )
 
 
+#undef  GENTFUNCR
+#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
+\
+void PASTEMAC(ch,opname) \
+     ( \
+       const ctype* chi, \
+             ctype* psi  \
+     ) \
+{ \
+	bli_init_once(); \
+\
+	const ctype_r chi_r = PASTEMAC(ch,real)( *chi ); \
+\
+	PASTEMAC2(chr,ch,sqrt2s)( chi_r, *psi ); \
+}
+
+INSERT_GENTFUNCR_BASIC0( sqrtrsc )
+
+
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
diff --git a/frame/0/bli_l0_tapi.h b/frame/0/bli_l0_tapi.h
index b39303410..ead89c056 100644
--- a/frame/0/bli_l0_tapi.h
+++ b/frame/0/bli_l0_tapi.h
@@ -51,17 +51,6 @@ INSERT_GENTPROT_BASIC0( addsc )
 INSERT_GENTPROT_BASIC0( divsc )
 INSERT_GENTPROT_BASIC0( mulsc )
 INSERT_GENTPROT_BASIC0( subsc )
-
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
-     ( \
-       conj_t conjchi, \
-       ctype* chi  \
-     );
-
 INSERT_GENTPROT_BASIC0( invertsc )
 
 
@@ -88,6 +77,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      );
 
 INSERT_GENTPROT_BASIC0( sqrtsc )
+INSERT_GENTPROT_BASIC0( sqrtrsc )
 
 
 #undef  GENTPROT
diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c
index 487116329..83ccf6853 100644
--- a/frame/1m/bli_l1m_tapi.c
+++ b/frame/1m/bli_l1m_tapi.c
@@ -82,6 +82,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	/* When the diagonal of an upper- or lower-stored matrix is unit,
 	   we handle it with a separate post-processing step. */ \
+	/* NOTE: This code was disabled after I realized that when matrix A has the
+	   properties of having a unit diagonal (and being lower or upper stored),
+	   the operation should only read the strictly lower/upper triangle and
+	   leave the diagonal of B untouched. */ \
+/*
 	if ( bli_is_upper_or_lower( uplox ) && \
 	     bli_is_unit_diag( diagx ) ) \
 	{ \
@@ -98,6 +103,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 		  rntm  \
 		); \
 	} \
+*/ \
 }
 
 INSERT_GENTFUNC_BASIC( addm, addd )
@@ -146,6 +152,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	/* When the diagonal of an upper- or lower-stored matrix is unit,
 	   we handle it with a separate post-processing step. */ \
+	/* NOTE: This code was disabled after I realized that when matrix A has the
+	   properties of having a unit diagonal (and being lower or upper stored),
+	   the operation should only read the strictly lower/upper triangle and
+	   leave the diagonal of B untouched. */ \
+/*
 	if ( bli_is_upper_or_lower( uplox ) && \
 	     bli_is_unit_diag( diagx ) ) \
 	{ \
@@ -167,6 +178,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 		  rntm  \
 		); \
 	} \
+*/ \
 }
 
 INSERT_GENTFUNC_BASIC0( copym )
@@ -219,6 +231,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	/* When the diagonal of an upper- or lower-stored matrix is unit,
 	   we handle it with a separate post-processing step. */ \
+	/* NOTE: This code was disabled after I realized that when matrix A has the
+	   properties of having a unit diagonal (and being lower or upper stored),
+	   the operation should only read the strictly lower/upper triangle and
+	   leave the diagonal of B untouched. */ \
+/*
 	if ( bli_is_upper_or_lower( uplox ) && \
 	     bli_is_unit_diag( diagx ) ) \
 	{ \
@@ -236,6 +253,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 		  rntm  \
 		); \
 	} \
+*/ \
 }
 
 INSERT_GENTFUNC_BASIC0( axpym )
@@ -307,6 +325,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	/* When the diagonal of an upper- or lower-stored matrix is unit,
 	   we handle it with a separate post-processing step. */ \
+	/* NOTE: This code was disabled after I realized that when matrix A has the
+	   properties of having a unit diagonal (and being lower or upper stored),
+	   the operation should only read the strictly lower/upper triangle and
+	   leave the diagonal of B untouched. */ \
+/*
 	if ( bli_is_upper_or_lower( uplox ) && \
 	     bli_is_unit_diag( diagx ) ) \
 	{ \
@@ -327,6 +350,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 		  rntm  \
 		); \
 	} \
+*/ \
 }
 
 INSERT_GENTFUNC_BASIC0( scal2m )
@@ -441,6 +465,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	/* When the diagonal of an upper- or lower-stored matrix is unit,
 	   we handle it with a separate post-processing step. */ \
+	/* NOTE: This code was disabled after I realized that when matrix A has the
+	   properties of having a unit diagonal (and being lower or upper stored),
+	   the operation should only read the strictly lower/upper triangle and
+	   leave the diagonal of B untouched. */ \
+/*
 	if ( bli_is_upper_or_lower( uplox ) && \
 	     bli_is_unit_diag( diagx ) ) \
 	{ \
@@ -458,6 +487,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 		  rntm  \
 		); \
 	} \
+*/ \
 }
 
 INSERT_GENTFUNC_BASIC0( xpbym )
diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c
index f3a2deeb4..fd6ca3a0c 100644
--- a/frame/base/bli_part.c
+++ b/frame/base/bli_part.c
@@ -234,7 +234,7 @@ void bli_acquire_mpart_mdim
 
 
 	// Compute the diagonal offset based on the m and n offsets.
-	doff_t diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;
+	doff_t diagoff_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;
 
 
 	// Begin by copying the info, elem size, buffer, row stride, and column
@@ -250,13 +250,13 @@ void bli_acquire_mpart_mdim
 	{
 		bli_obj_set_dims( m_part, n_part, sub_obj );
 		bli_obj_inc_offs( offm_inc, offn_inc, sub_obj );
-		bli_obj_inc_diag_offset( diag_off_inc, sub_obj );
+		bli_obj_inc_diag_offset( diagoff_inc, sub_obj );
 	}
 	else // if ( bli_obj_has_trans( obj ) )
 	{
 		bli_obj_set_dims( n_part, m_part, sub_obj );
 		bli_obj_inc_offs( offn_inc, offm_inc, sub_obj );
-		bli_obj_inc_diag_offset( -diag_off_inc, sub_obj );
+		bli_obj_inc_diag_offset( -diagoff_inc, sub_obj );
 	}
 
 
@@ -457,7 +457,7 @@ void bli_acquire_mpart_ndim
 
 
 	// Compute the diagonal offset based on the m and n offsets.
-	doff_t diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;
+	doff_t diagoff_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;
 
 
 	// Begin by copying the info, elem size, buffer, row stride, and column
@@ -473,13 +473,13 @@ void bli_acquire_mpart_ndim
 	{
 		bli_obj_set_dims( m_part, n_part, sub_obj );
 		bli_obj_inc_offs( offm_inc, offn_inc, sub_obj );
-		bli_obj_inc_diag_offset( diag_off_inc, sub_obj );
+		bli_obj_inc_diag_offset( diagoff_inc, sub_obj );
 	}
 	else // if ( bli_obj_has_trans( obj ) )
 	{
 		bli_obj_set_dims( n_part, m_part, sub_obj );
 		bli_obj_inc_offs( offn_inc, offm_inc, sub_obj );
-		bli_obj_inc_diag_offset( -diag_off_inc, sub_obj );
+		bli_obj_inc_diag_offset( -diagoff_inc, sub_obj );
 	}
 
 
@@ -709,7 +709,7 @@ void bli_acquire_mpart_mndim
 
 
 	// Compute the diagonal offset based on the m and n offsets.
-	doff_t diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;
+	doff_t diagoff_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;
 
 
 	// Begin by copying the info, elem size, buffer, row stride, and column
@@ -725,13 +725,13 @@ void bli_acquire_mpart_mndim
 	{
 		bli_obj_set_dims( m_part, n_part, sub_obj );
 		bli_obj_inc_offs( offm_inc, offn_inc, sub_obj );
-		bli_obj_inc_diag_offset( diag_off_inc, sub_obj );
+		bli_obj_inc_diag_offset( diagoff_inc, sub_obj );
 	}
 	else // if ( bli_obj_has_trans( obj ) )
 	{
 		bli_obj_set_dims( n_part, m_part, sub_obj );
 		bli_obj_inc_offs( offn_inc, offm_inc, sub_obj );
-		bli_obj_inc_diag_offset( -diag_off_inc, sub_obj );
+		bli_obj_inc_diag_offset( -diagoff_inc, sub_obj );
 	}
 
 	// If the root matrix is not general (ie: has structure defined by the
diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h
index f567e7ef3..3d60e8ec3 100644
--- a/frame/include/bli_scalar_macro_defs.h
+++ b/frame/include/bli_scalar_macro_defs.h
@@ -154,6 +154,8 @@
 #include "bli_dotjs.h"
 
 #include "bli_eq.h"
+#include "bli_lt.h"
+#include "bli_lte.h"
 
 #include "bli_fprints.h"
 
diff --git a/frame/include/level0/bli_lt.h b/frame/include/level0/bli_lt.h
new file mode 100644
index 000000000..b7c68ddaa
--- /dev/null
+++ b/frame/include/level0/bli_lt.h
@@ -0,0 +1,71 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_LT_H
+#define BLIS_LT_H
+
+
+// lt (passed by value)
+
+#define bli_slt( a, b )  (          (a) <          (b) )
+#define bli_dlt( a, b )  (          (a) <          (b) )
+#define bli_clt( a, b )  ( bli_creal(a) < bli_creal(b) )
+#define bli_zlt( a, b )  ( bli_zreal(a) < bli_zreal(b) )
+#define bli_ilt( a, b )  (          (a) <          (b) )
+
+// lt0
+
+#define bli_slt0( a )  (          (a) < 0.0F )
+#define bli_dlt0( a )  (          (a) < 0.0  )
+#define bli_clt0( a )  ( bli_creal(a) < 0.0F )
+#define bli_zlt0( a )  ( bli_zreal(a) < 0.0  )
+
+// gt (passed by value)
+
+#define bli_sgt( a, b )  (          (a) >          (b) )
+#define bli_dgt( a, b )  (          (a) >          (b) )
+#define bli_cgt( a, b )  ( bli_creal(a) > bli_creal(b) )
+#define bli_zgt( a, b )  ( bli_zreal(a) > bli_zreal(b) )
+#define bli_igt( a, b )  (          (a) >          (b) )
+
+// gt0
+
+#define bli_sgt0( a )  (          (a) > 0.0F )
+#define bli_dgt0( a )  (          (a) > 0.0  )
+#define bli_cgt0( a )  ( bli_creal(a) > 0.0F )
+#define bli_zgt0( a )  ( bli_zreal(a) > 0.0  )
+
+
+
+#endif
diff --git a/frame/include/level0/bli_lte.h b/frame/include/level0/bli_lte.h
new file mode 100644
index 000000000..ab87ff800
--- /dev/null
+++ b/frame/include/level0/bli_lte.h
@@ -0,0 +1,71 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_LTE_H
+#define BLIS_LTE_H
+
+
+// lte (passed by value)
+
+#define bli_slte( a, b )  (          (a) <=          (b) )
+#define bli_dlte( a, b )  (          (a) <=          (b) )
+#define bli_clte( a, b )  ( bli_creal(a) <= bli_creal(b) )
+#define bli_zlte( a, b )  ( bli_zreal(a) <= bli_zreal(b) )
+#define bli_ilte( a, b )  (          (a) <=          (b) )
+
+// lte0
+
+#define bli_slte0( a )  (          (a) <= 0.0F )
+#define bli_dlte0( a )  (          (a) <= 0.0  )
+#define bli_clte0( a )  ( bli_creal(a) <= 0.0F )
+#define bli_zlte0( a )  ( bli_zreal(a) <= 0.0  )
+
+// gte (passed by value)
+
+#define bli_sgte( a, b )  (          (a) >=          (b) )
+#define bli_dgte( a, b )  (          (a) >=          (b) )
+#define bli_cgte( a, b )  ( bli_creal(a) >= bli_creal(b) )
+#define bli_zgte( a, b )  ( bli_zreal(a) >= bli_zreal(b) )
+#define bli_igte( a, b )  (          (a) >=          (b) )
+
+// gte0
+
+#define bli_sgte0( a )  (          (a) >= 0.0F )
+#define bli_dgte0( a )  (          (a) >= 0.0  )
+#define bli_cgte0( a )  ( bli_creal(a) >= 0.0F )
+#define bli_zgte0( a )  ( bli_zreal(a) >= 0.0  )
+
+
+
+#endif
diff --git a/frame/util/bli_util_check.c b/frame/util/bli_util_check.c
index a96f6f5e9..3fafb4e50 100644
--- a/frame/util/bli_util_check.c
+++ b/frame/util/bli_util_check.c
@@ -144,13 +144,17 @@ void PASTEMAC(opname,_check) \
      ( \
        const obj_t* chi, \
        const obj_t* psi, \
-       const bool*  is_eq  \
+       const bool*  is  \
      ) \
 { \
-	bli_l0_xxbsc_check( chi, psi, is_eq ); \
+	bli_l0_xxbsc_check( chi, psi, is ); \
 }
 
 GENFRONT( eqsc )
+GENFRONT( ltsc )
+GENFRONT( ltesc )
+GENFRONT( gtsc )
+GENFRONT( gtesc )
 
 
 #undef  GENFRONT
@@ -160,7 +164,7 @@ void PASTEMAC(opname,_check) \
      ( \
        const obj_t* x, \
        const obj_t* y, \
-       const bool*  is_eq  \
+       const bool*  is  \
      ) \
 { \
 	bli_l1v_xy_check( x, y ); \
@@ -176,7 +180,7 @@ void PASTEMAC(opname,_check) \
      ( \
        const obj_t* x, \
        const obj_t* y, \
-       const bool*  is_eq  \
+       const bool*  is  \
      ) \
 { \
 	bli_l1m_xy_check( x, y ); \
diff --git a/frame/util/bli_util_check.h b/frame/util/bli_util_check.h
index c3f4fd1aa..26986b52c 100644
--- a/frame/util/bli_util_check.h
+++ b/frame/util/bli_util_check.h
@@ -125,10 +125,14 @@ void PASTEMAC(opname,_check) \
      ( \
        const obj_t* chi, \
        const obj_t* psi, \
-       const bool*  is_eq  \
+       const bool*  is  \
      );
 
 GENTPROT( eqsc )
+GENTPROT( ltsc )
+GENTPROT( ltesc )
+GENTPROT( gtsc )
+GENTPROT( gtesc )
 
 
 #undef  GENPROT
@@ -138,7 +142,7 @@ void PASTEMAC(opname,_check) \
      ( \
        const obj_t* x, \
        const obj_t* y, \
-       const bool*  is_eq  \
+       const bool*  is  \
     );
 
 GENPROT( eqv )
diff --git a/frame/util/bli_util_fpa.c b/frame/util/bli_util_fpa.c
index fba513fae..4ed95d4c9 100644
--- a/frame/util/bli_util_fpa.c
+++ b/frame/util/bli_util_fpa.c
@@ -89,6 +89,10 @@ PASTEMAC(opname,_qfp)( num_t dt ) \
 GENFRONT( eqsc )
 GENFRONT( eqv )
 GENFRONT( eqm )
+GENFRONT( ltsc )
+GENFRONT( ltesc )
+GENFRONT( gtsc )
+GENFRONT( gtesc )
 GENFRONT( fprintv )
 GENFRONT( fprintm )
 //GENFRONT( printv )
diff --git a/frame/util/bli_util_fpa.h b/frame/util/bli_util_fpa.h
index 9ed6a4cf7..f4b67ba36 100644
--- a/frame/util/bli_util_fpa.h
+++ b/frame/util/bli_util_fpa.h
@@ -69,6 +69,10 @@ PASTEMAC(opname,_qfp)( num_t dt );
 GENPROT( eqsc )
 GENPROT( eqv )
 GENPROT( eqm )
+GENPROT( ltsc )
+GENPROT( ltesc )
+GENPROT( gtsc )
+GENPROT( gtesc )
 GENPROT( fprintv )
 GENPROT( fprintm )
 //GENPROT( printv )
diff --git a/frame/util/bli_util_ft.h b/frame/util/bli_util_ft.h
index ccdd7ae66..39c27bd9a 100644
--- a/frame/util/bli_util_ft.h
+++ b/frame/util/bli_util_ft.h
@@ -207,7 +207,7 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \
              conj_t conjchi, \
        const ctype* chi, \
        const ctype* psi, \
-             bool*  is_eq  \
+             bool*  is  \
      );
 
 INSERT_GENTDEF( eqsc )
@@ -223,7 +223,7 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \
              dim_t  n, \
        const ctype* x, inc_t incx, \
        const ctype* y, inc_t incy, \
-             bool*  is_eq  \
+             bool*  is  \
      );
 
 INSERT_GENTDEF( eqv )
@@ -243,10 +243,27 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \
              dim_t   n, \
        const ctype*  x, inc_t rs_x, inc_t cs_x, \
        const ctype*  y, inc_t rs_y, inc_t cs_y, \
-             bool*   is_eq  \
+             bool*   is  \
      );
 
 INSERT_GENTDEF( eqm )
 
+// ltsc, ltesc, gtsc, gtesc
+
+#undef  GENTDEF
+#define GENTDEF( ctype, ch, opname, tsuf ) \
+\
+typedef void (*PASTECH2(ch,opname,tsuf)) \
+     ( \
+       const ctype* chi, \
+       const ctype* psi, \
+             bool*  is  \
+     );
+
+INSERT_GENTDEF( ltsc )
+INSERT_GENTDEF( ltesc )
+INSERT_GENTDEF( gtsc )
+INSERT_GENTDEF( gtesc )
+
 #endif // #ifdef BLIS_OAPI_BASIC
 
diff --git a/frame/util/bli_util_oapi.c b/frame/util/bli_util_oapi.c
index d4e5617ee..8223ffff8 100644
--- a/frame/util/bli_util_oapi.c
+++ b/frame/util/bli_util_oapi.c
@@ -526,6 +526,56 @@ void PASTEMAC0(opname) \
 GENFRONT( eqm )
 
 
+#undef  GENFRONT
+#define GENFRONT( opname ) \
+\
+void PASTEMAC0(opname) \
+     ( \
+       const obj_t* chi, \
+       const obj_t* psi, \
+             bool*  is  \
+     ) \
+{ \
+	bli_init_once(); \
+\
+	num_t dt_chi = bli_obj_dt( chi ); \
+	num_t dt_psi = bli_obj_dt( psi ); \
+	num_t dt; \
+\
+	if ( bli_error_checking_is_enabled() ) \
+		PASTEMAC(opname,_check)( chi, psi, is ); \
+\
+	/* Decide which datatype will be used to query the buffer from the
+	   constant object (if there is one). */ \
+	if ( bli_is_constant( dt_psi ) ) dt = dt_chi; \
+	else                             dt = dt_psi; \
+\
+	/* If chi and psi are both constants, then we compare only the double
+	   (real) fields, since dt is set to BLIS_DOUBLE below. */ \
+	if ( bli_is_constant( dt ) ) dt = BLIS_DOUBLE; \
+\
+	void* buf_chi = bli_obj_buffer_for_1x1( dt, chi ); \
+	void* buf_psi = bli_obj_buffer_for_1x1( dt, psi ); \
+\
+	/* Query a type-specific function pointer, except one that uses
+	   void* for function arguments instead of typed pointers. */ \
+	PASTECH(opname,_vft) f = \
+	PASTEMAC(opname,_qfp)( dt ); \
+\
+	f \
+	( \
+	  buf_chi, \
+	  buf_psi, \
+	  is  \
+	); \
+}
+
+GENFRONT( ltsc )
+GENFRONT( ltesc )
+GENFRONT( gtsc )
+GENFRONT( gtesc )
+
+
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
diff --git a/frame/util/bli_util_oapi.h b/frame/util/bli_util_oapi.h
index ab48f841a..682a58cb3 100644
--- a/frame/util/bli_util_oapi.h
+++ b/frame/util/bli_util_oapi.h
@@ -147,12 +147,16 @@ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
      ( \
        const obj_t* x, \
        const obj_t* y, \
-             bool*  is_eq  \
+             bool*  is  \
      );
 
 GENPROT( eqsc )
 GENPROT( eqv )
 GENPROT( eqm )
+GENPROT( ltsc )
+GENPROT( ltesc )
+GENPROT( gtsc )
+GENPROT( gtesc )
 
 
 #undef  GENPROT
diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c
index 5bd03882a..8611b9164 100644
--- a/frame/util/bli_util_tapi.c
+++ b/frame/util/bli_util_tapi.c
@@ -475,6 +475,27 @@ void PASTEMAC(ch,opname) \
 INSERT_GENTFUNC_BASIC0( eqm )
 
 
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname, kername ) \
+\
+void PASTEMAC(ch,opname) \
+     ( \
+       const ctype* chi, \
+       const ctype* psi, \
+             bool*  is  \
+     ) \
+{ \
+	bli_init_once(); \
+\
+	*is = PASTEMAC(ch,kername)( *chi, *psi ); \
+}
+
+INSERT_GENTFUNC_BASIC( ltsc,  lt )
+INSERT_GENTFUNC_BASIC( ltesc, lte )
+INSERT_GENTFUNC_BASIC( gtsc,  gt )
+INSERT_GENTFUNC_BASIC( gtesc, gte )
+
+
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, varname ) \
 \
diff --git a/frame/util/bli_util_tapi.h b/frame/util/bli_util_tapi.h
index 29c67df23..b720877b5 100644
--- a/frame/util/bli_util_tapi.h
+++ b/frame/util/bli_util_tapi.h
@@ -202,6 +202,22 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
 INSERT_GENTPROT_BASIC0( eqm )
 
 
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+\
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
+     ( \
+       const ctype* chi, \
+       const ctype* psi, \
+             bool*  is  \
+     );
+
+INSERT_GENTPROT_BASIC0( ltsc )
+INSERT_GENTPROT_BASIC0( ltesc )
+INSERT_GENTPROT_BASIC0( gtsc )
+INSERT_GENTPROT_BASIC0( gtesc )
+
+
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
diff --git a/testsuite/src/test_gemv.c b/testsuite/src/test_gemv.c
index e6090e1c5..f63178564 100644
--- a/testsuite/src/test_gemv.c
+++ b/testsuite/src/test_gemv.c
@@ -104,6 +104,7 @@ void libblis_test_gemv_deps
 	libblis_test_subv( tdata, params, &(op->ops->subv) );
 	libblis_test_copyv( tdata, params, &(op->ops->copyv) );
 	libblis_test_scalv( tdata, params, &(op->ops->scalv) );
+	libblis_test_copym( tdata, params, &(op->ops->copym) );
 }
 
 
diff --git a/testsuite/src/test_ger.c b/testsuite/src/test_ger.c
index b44fe6ba6..aad507cdc 100644
--- a/testsuite/src/test_ger.c
+++ b/testsuite/src/test_ger.c
@@ -101,6 +101,7 @@ void libblis_test_ger_deps
 	libblis_test_subv( tdata, params, &(op->ops->subv) );
 	libblis_test_scal2v( tdata, params, &(op->ops->scal2v) );
 	libblis_test_dotv( tdata, params, &(op->ops->dotv) );
+	libblis_test_copym( tdata, params, &(op->ops->copym) );
 	libblis_test_gemv( tdata, params, &(op->ops->gemv) );
 }
 
diff --git a/testsuite/src/test_hemv.c b/testsuite/src/test_hemv.c
index 02e205392..06852f052 100644
--- a/testsuite/src/test_hemv.c
+++ b/testsuite/src/test_hemv.c
@@ -104,6 +104,7 @@ void libblis_test_hemv_deps
 	libblis_test_subv( tdata, params, &(op->ops->subv) );
 	libblis_test_copyv( tdata, params, &(op->ops->copyv) );
 	libblis_test_scalv( tdata, params, &(op->ops->scalv) );
+	libblis_test_copym( tdata, params, &(op->ops->copym) );
 	libblis_test_gemv( tdata, params, &(op->ops->gemv) );
 }
 
diff --git a/testsuite/src/test_trmv.c b/testsuite/src/test_trmv.c
index 71acc90ba..243216e96 100644
--- a/testsuite/src/test_trmv.c
+++ b/testsuite/src/test_trmv.c
@@ -100,6 +100,7 @@ void libblis_test_trmv_deps
 	libblis_test_subv( tdata, params, &(op->ops->subv) );
 	libblis_test_copyv( tdata, params, &(op->ops->copyv) );
 	libblis_test_scalv( tdata, params, &(op->ops->scalv) );
+	libblis_test_copym( tdata, params, &(op->ops->copym) );
 	libblis_test_gemv( tdata, params, &(op->ops->gemv) );
 }
 
@@ -325,6 +326,11 @@ void libblis_test_trmv_check
 	bli_obj_set_struc( BLIS_GENERAL, &a_local );
 	bli_obj_set_uplo( BLIS_DENSE, &a_local );
 
+	// If matrix A has an implicit unit diagonal, we have to make it explicit
+	// for the gemv below.
+	if ( bli_obj_has_unit_diag( a ) )
+		bli_setd( &BLIS_ONE, &a_local );
+
 	bli_gemv( alpha, &a_local, x_orig, &BLIS_ZERO, &y );
 
 	bli_subv( x, &y );
diff --git a/testsuite/src/test_trsv.c b/testsuite/src/test_trsv.c
index 12543cd9a..788be1b2c 100644
--- a/testsuite/src/test_trsv.c
+++ b/testsuite/src/test_trsv.c
@@ -100,6 +100,7 @@ void libblis_test_trsv_deps
 	libblis_test_subv( tdata, params, &(op->ops->subv) );
 	libblis_test_copyv( tdata, params, &(op->ops->copyv) );
 	libblis_test_scalv( tdata, params, &(op->ops->scalv) );
+	libblis_test_copym( tdata, params, &(op->ops->copym) );
 	libblis_test_gemv( tdata, params, &(op->ops->gemv) );
 }
 
@@ -330,6 +331,11 @@ void libblis_test_trsv_check
 	bli_obj_set_struc( BLIS_GENERAL, &a_local );
 	bli_obj_set_uplo( BLIS_DENSE, &a_local );
 
+	// If matrix A has an implicit unit diagonal, we have to make it explicit
+	// for the gemv below.
+	if ( bli_obj_has_unit_diag( a ) )
+		bli_setd( &BLIS_ONE, &a_local );
+
 	bli_gemv( &alpha_inv, &a_local, x, &BLIS_ZERO, &y );
 
 	bli_subv( x_orig, &y );

From 5793a77937aee9847a5692c8e44b36a6380800a1 Mon Sep 17 00:00:00 2001
From: HarshDave12 <122850830+HarshDave12@users.noreply.github.com>
Date: Tue, 17 Jan 2023 21:55:02 +0530
Subject: [PATCH 124/230] Fixed mis-mapped instruction for VEXTRACTF64X2.
 (#713)

Details:
- This commit fixes a typo in the macro definition for the extended
  inline assembly macro VEXTRACTF64X2 in bli_x86_asm_macros.h. The macro
  was previously defined (incorrectly) in terms of the vextractf64x4
  instruction rather than vextractf64x2.
- CREDITS file update.
---
 CREDITS                            | 1 +
 frame/include/bli_x86_asm_macros.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/CREDITS b/CREDITS
index 51afcc276..53904234e 100644
--- a/CREDITS
+++ b/CREDITS
@@ -24,6 +24,7 @@ but many others have contributed code and feedback, including
   Dilyn Corner             @dilyn-corner
   Mat Cross                @matcross           (NAG)
                            @decandia50
+  Harsh Dave               @HarshDave12        (AMD)
   Daniël de Kok            @danieldk           (Explosion)
   Kay Dewhurst             @jkd2016            (Max Planck Institute, Halle, Germany)
   Jeff Diamond                                 (Oracle)
diff --git a/frame/include/bli_x86_asm_macros.h b/frame/include/bli_x86_asm_macros.h
index b470d320d..1c27b8ff2 100644
--- a/frame/include/bli_x86_asm_macros.h
+++ b/frame/include/bli_x86_asm_macros.h
@@ -1205,7 +1205,7 @@
 #define VEXTRACTF128(_0, _1, _2) INSTR_(vextractf128, _0, _1, _2)
 #define VEXTRACTF32X4(_0, _1, _2) INSTR_(vextractf32x4, _0, _1, _2)
 #define VEXTRACTF32X8(_0, _1, _2) INSTR_(vextractf32x8, _0, _1, _2)
-#define VEXTRACTF64X2(_0, _1, _2) INSTR_(vextractf64x4, _0, _1, _2)
+#define VEXTRACTF64X2(_0, _1, _2) INSTR_(vextractf64x2, _0, _1, _2)
 #define VEXTRACTF64X4(_0, _1, _2) INSTR_(vextractf64x4, _0, _1, _2)
 #define VBLENDPS(_0, _1, _2, _3) INSTR_(vblendps, _0, _1, _2, _3)
 #define VBLENDPD(_0, _1, _2, _3) INSTR_(vblendpd, _0, _1, _2, _3)

From c334ec278f5e2a101625629b2e13bbf1b38dede5 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Wed, 18 Jan 2023 13:10:19 -0600
Subject: [PATCH 125/230] Merge tlb- and slab/rr-specific gemm macrokernels.
 (#711)

Details:
- Merged the tlb-specific gemm macrokernel (_var2b) with the slab/rr-
  specific one (var2) so that a single function can be compiled with
  either tlb or slab/rr support, depending on which of the macros
  BLIS_ENABLE_JRIR_TLB, _SLAB, or _RR is defined. This is done by incorporating
  information from both approaches: the start/end/inc for the JR and IR
  loops from slab or rr partitioning; and the number of assigned
  microtiles, plus the starting IR dimension offset for all iterations
  after the first (ir_next). With these changes, slab, rr, and tlb can
  all be parameterized by initializing a similar set of variables prior
  to the jr loop.
- Removed the wrap-around logic that sets the "b_next" field of the
  auxinfo_t struct, which executes during the last IR iteration of the
  last JR iteration. The potential benefit of this code is so minor
  (and hinges on the microkernel making use of the b_next field) that
  it's arguably not worth including. The code also does the wrong
  thing for some threads whenever JR_NT > 1, since only thread 0 (in the
  JR group) would even compute with the first micropanel of B.
- Re-expressed the definition of bli_is_last_iter_slrr so that slab and
  tlb use the same code rather than rr and tlb.
- Adjusted the initialization of the gemm control tree accordingly.
---
 frame/3/gemm/bli_gemm_cntl.c         |   7 +-
 frame/3/gemm/bli_gemm_ker_var2.c     |  65 ++++-
 frame/3/gemm/bli_gemm_ker_var2b.c    | 379 ---------------------------
 frame/include/bli_param_macro_defs.h |   6 +-
 4 files changed, 58 insertions(+), 399 deletions(-)
 delete mode 100644 frame/3/gemm/bli_gemm_ker_var2b.c

diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c
index b9c231cf7..10484adf3 100644
--- a/frame/3/gemm/bli_gemm_cntl.c
+++ b/frame/3/gemm/bli_gemm_cntl.c
@@ -61,12 +61,7 @@ cntl_t* bli_gemmbp_cntl_create
 	void_fp macro_kernel_fp;
 
 	// Choose the default macrokernel based on the operation family...
-	if      ( family == BLIS_GEMM )  macro_kernel_fp =
-	                                   #ifdef BLIS_ENABLE_JRIR_TLB
-	                                   bli_gemm_ker_var2b;
-	                                   #else // ifdef ( _SLAB || _RR )
-	                                   bli_gemm_ker_var2;
-	                                   #endif
+	if      ( family == BLIS_GEMM )  macro_kernel_fp = bli_gemm_ker_var2;
 	else if ( family == BLIS_GEMMT ) macro_kernel_fp =
 	                                   #ifdef BLIS_ENABLE_JRIR_TLB
 	                                   bli_gemmt_x_ker_var2b;
diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c
index 3e862e6c5..732d5ec06 100644
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -244,27 +244,66 @@ void bli_gemm_ker_var2
 	bli_auxinfo_set_ukr( gemm_ukr, &aux );
 	bli_auxinfo_set_params( params, &aux );
 
-	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	// loop around the microkernel. Here we query the thrinfo_t node for the
-	// 1st (ir) loop around the microkernel.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+	dim_t jr_start, jr_end, jr_inc;
+	dim_t ir_start, ir_end, ir_inc;
 
-	// Query the number of threads and thread ids for each loop.
+#ifdef BLIS_ENABLE_JRIR_TLB
+
+	// Query the number of threads and thread ids for the jr loop around
+	// the microkernel.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
 	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
 	const dim_t jr_tid = bli_thrinfo_work_id( thread );
+
+	const dim_t ir_nt  = 1;
+	const dim_t ir_tid = 0;
+
+	dim_t n_ut_for_me
+	=
+	bli_thread_range_tlb_d( jr_nt, jr_tid, m_iter, n_iter, MR, NR,
+	                        &jr_start, &ir_start );
+
+	// Always increment by 1 in both dimensions.
+	jr_inc = 1;
+	ir_inc = 1;
+
+	// Each thread iterates over the entire panel of C until it exhausts its
+	// assigned set of microtiles.
+	jr_end = n_iter;
+	ir_end = m_iter;
+
+	// Successive iterations of the ir loop should start at 0.
+	const dim_t ir_next = 0;
+
+#else // ifdef ( _SLAB || _RR )
+
+	// Query the number of threads and thread ids for the ir loop around
+	// the microkernel.
+	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
 	const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
 	const dim_t ir_tid = bli_thrinfo_work_id( caucus );
 
-	dim_t jr_start, jr_end, jr_inc;
-	dim_t ir_start, ir_end, ir_inc;
-
 	// Determine the thread range and increment for the 2nd and 1st loops.
 	// NOTE: The definition of bli_thread_range_slrr() will depend on whether
 	// slab or round-robin partitioning was requested at configure-time.
 	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
 	bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
 
+	// Calculate the total number of microtiles assigned to this thread.
+	dim_t n_ut_for_me = ( ( ir_end + ir_inc - 1 - ir_start ) / ir_inc ) *
+	                    ( ( jr_end + jr_inc - 1 - jr_start ) / jr_inc );
+
+	// Each successive iteration of the ir loop always starts at ir_start.
+	const dim_t ir_next = ir_start;
+
+#endif
+
+	// It's possible that there are so few microtiles relative to the number
+	// of threads that one or more threads gets no work. If that happens, those
+	// threads can return early.
+	if ( n_ut_for_me == 0 ) return;
+
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
 	{
@@ -294,8 +333,6 @@ void bli_gemm_ker_var2
 			{
 				a2 = a_cast;
 				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc );
-				if ( bli_is_last_iter_slrr( j, jr_end, jr_tid, jr_nt ) )
-					b2 = b_cast;
 			}
 
 			// Save addresses of next panels of A and B to the auxinfo_t
@@ -350,7 +387,13 @@ void bli_gemm_ker_var2
 				  c11, rs_c, cs_c
 				);
 			}
+
+			// Decrement the number of microtiles assigned to the thread; once
+			// it reaches zero, return immediately.
+			n_ut_for_me -= 1; if ( n_ut_for_me == 0 ) return;
 		}
+
+		ir_start = ir_next;
 	}
 }
 
diff --git a/frame/3/gemm/bli_gemm_ker_var2b.c b/frame/3/gemm/bli_gemm_ker_var2b.c
deleted file mode 100644
index 50375708a..000000000
--- a/frame/3/gemm/bli_gemm_ker_var2b.c
+++ /dev/null
@@ -1,379 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-typedef void (*xpbys_mxn_vft)
-    (
-      dim_t m,
-      dim_t n,
-      void* x, inc_t rs_x, inc_t cs_x,
-      void* b,
-      void* y, inc_t rs_y, inc_t cs_y
-    );
-
-#undef GENTFUNC2
-#define GENTFUNC2(ctypex,ctypey,chx,chy,op) \
-\
-BLIS_INLINE void PASTEMAC2(chx,chy,op) \
-    ( \
-      dim_t m, \
-      dim_t n, \
-      void* x, inc_t rs_x, inc_t cs_x, \
-      void* b, \
-      void* y, inc_t rs_y, inc_t cs_y \
-    ) \
-{ \
-	ctypex* restrict x_cast = x; \
-	ctypey* restrict b_cast = b; \
-	ctypey* restrict y_cast = y; \
-\
-	PASTEMAC3(chx,chy,chy,xpbys_mxn) \
-	( \
-	  m, n, \
-	  x_cast, rs_x, cs_x, \
-	  b_cast, \
-	  y_cast, rs_y,  cs_y \
-	); \
-}
-
-INSERT_GENTFUNC2_BASIC0(xpbys_mxnb_fn);
-INSERT_GENTFUNC2_MIXDP0(xpbys_mxnb_fn);
-
-static xpbys_mxn_vft GENARRAY2_ALL(xpbys_mxn, xpbys_mxnb_fn);
-
-
-void bli_gemm_ker_var2b
-     (
-       const obj_t*     a,
-       const obj_t*     b,
-       const obj_t*     c,
-       const cntx_t*    cntx,
-       const cntl_t*    cntl,
-             thrinfo_t* thread_par
-     )
-{
-	      num_t  dt_exec   = bli_obj_exec_dt( c );
-	      num_t  dt_c      = bli_obj_dt( c );
-
-	const pack_t schema_a  = bli_obj_pack_schema( a );
-	const pack_t schema_b  = bli_obj_pack_schema( b );
-
-	      dim_t  m         = bli_obj_length( c );
-	      dim_t  n         = bli_obj_width( c );
-	      dim_t  k         = bli_obj_width( a );
-
-	const char*  a_cast    = bli_obj_buffer_at_off( a );
-	const inc_t  is_a      = bli_obj_imag_stride( a );
-	      dim_t  pd_a      = bli_obj_panel_dim( a );
-	      inc_t  ps_a      = bli_obj_panel_stride( a );
-
-	const char*  b_cast    = bli_obj_buffer_at_off( b );
-	const inc_t  is_b      = bli_obj_imag_stride( b );
-	      dim_t  pd_b      = bli_obj_panel_dim( b );
-	      inc_t  ps_b      = bli_obj_panel_stride( b );
-
-	      char*  c_cast    = bli_obj_buffer_at_off( c );
-	      inc_t  rs_c      = bli_obj_row_stride( c );
-	      inc_t  cs_c      = bli_obj_col_stride( c );
-
-	// If any dimension is zero, return immediately.
-	if ( bli_zero_dim3( m, n, k ) ) return;
-
-	// Detach and multiply the scalars attached to A and B.
-	// NOTE: We know that the internal scalars of A and B are already of the
-	// target datatypes because the necessary typecasting would have already
-	// taken place during bli_packm_init().
-	obj_t scalar_a, scalar_b;
-	bli_obj_scalar_detach( a, &scalar_a );
-	bli_obj_scalar_detach( b, &scalar_b );
-	bli_mulsc( &scalar_a, &scalar_b );
-
-	// Grab the addresses of the internal scalar buffers for the scalar
-	// merged above and the scalar attached to C.
-	// NOTE: We know that scalar_b is of type dt_exec due to the above code
-	// that casts the scalars of A and B to dt_exec via scalar_a and scalar_b,
-	// and we know that the internal scalar in C is already of the type dt_c
-	// due to the casting in the implementation of bli_obj_scalar_attach().
-	const char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b );
-	const char* beta_cast  = bli_obj_internal_scalar_buffer( c );
-
-	// If 1m is being employed on a column- or row-stored matrix with a
-	// real-valued beta, we can use the real domain macro-kernel, which
-	// eliminates a little overhead associated with the 1m virtual
-	// micro-kernel.
-	// Only employ this optimization if the storage datatype of C is
-	// equal to the execution/computation datatype.
-#if 1
-	if ( bli_cntx_method( cntx ) == BLIS_1M )
-	{
-		bli_gemm_ind_recast_1m_params
-		(
-		  &dt_exec,
-		  &dt_c,
-		  schema_a,
-		  c,
-		  &m, &n, &k,
-		  &pd_a, &ps_a,
-		  &pd_b, &ps_b,
-		  &rs_c, &cs_c,
-		  cntx
-		);
-	}
-#endif
-
-#ifdef BLIS_ENABLE_GEMM_MD
-	// Tweak parameters in select mixed domain cases (rcc, crc, ccr).
-	if ( bli_cntx_method( cntx ) == BLIS_NAT )
-	{
-		bli_gemm_md_ker_var2_recast
-		(
-		  &dt_exec,
-		  bli_obj_dt( a ),
-		  bli_obj_dt( b ),
-		  &dt_c,
-		  &m, &n, &k,
-		  &pd_a, &ps_a,
-		  &pd_b, &ps_b,
-		  c,
-		  &rs_c, &cs_c
-		);
-	}
-#endif
-
-	const siz_t dt_size   = bli_dt_size( dt_exec );
-	const siz_t dt_c_size = bli_dt_size( dt_c );
-
-	// Alias some constants to simpler names.
-	const dim_t MR = pd_a;
-	const dim_t NR = pd_b;
-
-	// Query the context for the micro-kernel address and cast it to its
-	// function pointer type.
-	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
-
-	// Query the params field from the obj_t. If it is non-NULL, grab the ukr
-	// field of the params struct. If that function pointer is non-NULL, use it
-	// as our microkernel instead of the default microkernel queried from the
-	// cntx above.
-	const gemm_ker_params_t* params = bli_obj_ker_params( c );
-	gemm_ukr_vft user_ukr = params ? params->ukr : NULL;
-	if ( user_ukr ) gemm_ukr = user_ukr;
-
-	// Temporary C buffer for edge cases. Note that the strides of this
-	// temporary buffer are set so that they match the storage of the
-	// original C matrix. For example, if C is column-stored, ct will be
-	// column-stored as well.
-	char        ct[ BLIS_STACK_BUF_MAX_SIZE ]
-	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
-	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx );
-	const inc_t rs_ct       = ( col_pref ? 1 : NR );
-	const inc_t cs_ct       = ( col_pref ? MR : 1 );
-	const char* zero        = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO );
-
-	//
-	// Assumptions/assertions:
-	//   rs_a == 1
-	//   cs_a == PACKMR
-	//   pd_a == MR
-	//   ps_a == stride to next micro-panel of A
-	//   rs_b == PACKNR
-	//   cs_b == 1
-	//   pd_b == NR
-	//   ps_b == stride to next micro-panel of B
-	//   rs_c == (no assumptions)
-	//   cs_c == (no assumptions)
-	//
-
-	// Compute number of primary and leftover components of the m and n
-	// dimensions.
-	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
-	const dim_t n_left = n % NR;
-
-	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
-	const dim_t m_left = m % MR;
-
-	// Determine some increments used to step through A, B, and C.
-	const inc_t rstep_a = ps_a * dt_size;
-
-	const inc_t cstep_b = ps_b * dt_size;
-
-	const inc_t rstep_c = rs_c * MR * dt_c_size;
-	const inc_t cstep_c = cs_c * NR * dt_c_size;
-
-	auxinfo_t aux;
-
-	// Save the pack schemas of A and B to the auxinfo_t object.
-	bli_auxinfo_set_schema_a( schema_a, &aux );
-	bli_auxinfo_set_schema_b( schema_b, &aux );
-
-	// Save the imaginary stride of A and B to the auxinfo_t object.
-	bli_auxinfo_set_is_a( is_a, &aux );
-	bli_auxinfo_set_is_b( is_b, &aux );
-
-	// Save the virtual microkernel address and the params.
-	bli_auxinfo_set_ukr( gemm_ukr, &aux );
-	bli_auxinfo_set_params( params, &aux );
-
-	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	// loop around the microkernel. Notice that this variant doesn't utilize
-	// parallelism in the 1st (ir) loop around the microkernel.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
-
-	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
-	const dim_t jr_tid = bli_thrinfo_work_id( thread );
-	//const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
-	//const dim_t ir_tid = bli_thrinfo_work_id( caucus );
-
-	// Determine the starting microtile offsets and number of microtiles to
-	// compute for each thread. Note that assignment of microtiles is done
-	// according to the tlb policy.
-	dim_t jr_st, ir_st;
-	const dim_t n_ut_for_me
-	=
-	bli_thread_range_tlb_d( jr_nt, jr_tid, m_iter, n_iter, MR, NR, &jr_st, &ir_st );
-
-	// It's possible that there are so few microtiles relative to the number
-	// of threads that one or more threads gets no work. If that happens, those
-	// threads can return early.
-	if ( n_ut_for_me == 0 ) return;
-
-	// Start the jr/ir loops with the current thread's microtile offsets computed
-	// by bli_thread_range_tlb().
-	dim_t i = ir_st;
-	dim_t j = jr_st;
-
-	// Initialize a counter to track the number of microtiles computed by the
-	// current thread.
-	dim_t ut = 0;
-
-	// Loop over the n dimension (NR columns at a time).
-	for ( ; true; ++j )
-	{
-		const char* b1 = b_cast + j * cstep_b;
-		      char* c1 = c_cast + j * cstep_c;
-
-		// Compute the current microtile's width.
-		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
-		                      ? NR : n_left );
-
-		// Initialize our next panel of B to be the current panel of B.
-		const char* b2 = b1;
-
-		bli_auxinfo_set_next_b( b2, &aux );
-
-		// Loop over the m dimension (MR rows at a time).
-		for ( ; i < m_iter; ++i )
-		{
-			const char* a1  = a_cast + i * rstep_a;
-			      char* c11 = c1     + i * rstep_c;
-
-			// Compute the current microtile's length.
-			const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
-			                      ? MR : m_left );
-
-			// Compute the addresses of the next panels of A and B.
-			const char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, 1 );
-			if ( bli_is_last_iter_sl( i, m_iter ) )
-			{
-				a2 = a_cast;
-				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, 1 );
-				bli_auxinfo_set_next_b( b2, &aux );
-			}
-
-			// Save addresses of next panels of A and B to the auxinfo_t
-			// object.
-			bli_auxinfo_set_next_a( a2, &aux );
-
-			// Edge case handling now occurs within the microkernel itself, but
-			// we must still explicitly accumulate to a temporary microtile in
-			// situations where a virtual microkernel is being used, such as
-			// during the 1m method or some cases of mixed datatypes.
-			if ( dt_exec == dt_c )
-			{
-				// Invoke the gemm micro-kernel.
-				gemm_ukr
-				(
-				  m_cur,
-				  n_cur,
-				  k,
-				  ( void* )alpha_cast,
-				  ( void* )a1,
-				  ( void* )b1,
-				  ( void* )beta_cast,
-				           c11, rs_c, cs_c,
-				  &aux,
-				  ( cntx_t* )cntx
-				);
-			}
-			else
-			{
-				// Invoke the gemm micro-kernel.
-				gemm_ukr
-				(
-				  MR,
-				  NR,
-				  k,
-				  ( void* )alpha_cast,
-				  ( void* )a1,
-				  ( void* )b1,
-				  ( void* )zero,
-				           &ct, rs_ct, cs_ct,
-				  &aux,
-				  ( cntx_t* )cntx
-				);
-
-				// Accumulate to C with typecasting.
-				xpbys_mxn[ dt_exec ][ dt_c ]
-				(
-				  m_cur, n_cur,
-				  &ct, rs_ct, cs_ct,
-				  ( void* )beta_cast,
-				  c11, rs_c, cs_c
-				);
-			}
-
-			ut += 1;
-			if ( ut == n_ut_for_me ) return;
-		}
-
-		i = 0;
-	}
-}
-
-//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2b: b1", k, NR, b1, NR, 1, "%4.1f", "" );
-//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2b: a1", MR, k, a1, 1, MR, "%4.1f", "" );
-//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2b: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" );
diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h
index 0865b11e9..fea67c0af 100644
--- a/frame/include/bli_param_macro_defs.h
+++ b/frame/include/bli_param_macro_defs.h
@@ -967,10 +967,10 @@ BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t
 
 BLIS_INLINE bool bli_is_last_iter_slrr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
 {
-#ifdef BLIS_ENABLE_JRIR_SLAB
-	return bli_is_last_iter_sl( i, end_iter );
-#else // BLIS_ENABLE_JRIR_RR
+#ifdef BLIS_ENABLE_JRIR_RR
 	return bli_is_last_iter_rr( i, end_iter, tid, nth );
+#else // ifdef ( _SLAB || _TLB )
+	return bli_is_last_iter_sl( i, end_iter );
 #endif
 }
 

From ecbcf4008815035c695822fcaf106477debff89a Mon Sep 17 00:00:00 2001
From: Lee Killough <15950023+leekillough@users.noreply.github.com>
Date: Wed, 18 Jan 2023 20:35:50 -0600
Subject: [PATCH 126/230] Use here-document for 'configure --help' output.
 (#714)

Details:
- Changed the configure script function that outputs "--help" text to do
  so via so-called "here-document" syntax for improved readability and
  maintainability. The change eliminates hundreds of echo statements and
  makes it easier to change existing configure options' help text, along
  with other benefits such as eliminating the need to escape double-
  quote characters (").
---
 configure | 780 +++++++++++++++++++++++++++---------------------------
 1 file changed, 390 insertions(+), 390 deletions(-)

diff --git a/configure b/configure
index 06201b4fa..a89225107 100755
--- a/configure
+++ b/configure
@@ -46,396 +46,396 @@ print_usage()
 	fi
 
 	# Echo usage info.
-	echo " "
-	echo " ${script_name} (BLIS ${version})"
-	#echo " "
-	#echo " BLIS ${version}"
-	echo " "
-	echo " Configure BLIS's build system for compilation using a specified"
-	echo " configuration directory."
-	echo " "
-	echo " Usage:"
-	echo " "
-	echo "   ${script_name} [options] [env. vars.] confname"
-	echo " "
-	echo " Arguments:"
-	echo " "
-	echo "   confname      The name of the sub-directory inside of the 'config'"
-	echo "                 directory containing the desired BLIS configuration."
-	echo "                 Note that confname MUST be specified; if it is not,"
-	echo "                 configure will complain. To build a completely generic"
-	echo "                 implementation, use the 'generic' configuration"
-	echo " "
-	echo " Options:"
-	echo " "
-	echo "   -p PREFIX, --prefix=PREFIX"
-	echo " "
-	echo "                 The common installation prefix for all files. If given,"
-	echo "                 this option effectively implies:"
-	echo "                   --libdir=EXECPREFIX/lib"
-	echo "                   --includedir=PREFIX/include"
-	echo "                   --sharedir=PREFIX/share"
-	echo "                 where EXECPREFIX defaults to PREFIX. If this option is"
-	echo "                 not given, PREFIX defaults to '${prefix_def}'. If PREFIX"
-	echo "                 refers to a directory that does not exist, it will be"
-	echo "                 created."
-	echo " "
-	echo "   --exec-prefix=EXECPREFIX"
-	echo " "
-	echo "                 The installation prefix for libraries. Specifically, if"
-	echo "                 given, this option effectively implies:"
-	echo "                   --libdir=EXECPREFIX/lib"
-	echo "                 If not given, EXECPREFIX defaults to PREFIX, which may be"
-	echo "                 modified by the --prefix option. If EXECPREFIX refers to"
-	echo "                 a directory that does not exist, it will be created."
-	echo " "
-	echo "   --libdir=LIBDIR"
-	echo " "
-	echo "                 The path to which make will install libraries. If not"
-	echo "                 given, LIBDIR defaults to PREFIX/lib. If LIBDIR refers to"
-	echo "                 a directory that does not exist, it will be created."
-	echo " "
-	echo "   --includedir=INCDIR"
-	echo " "
-	echo "                 The path to which make will install development header"
-	echo "                 files. If not given, INCDIR defaults to PREFIX/include."
-	echo "                 If INCDIR refers to a directory that does not exist, it"
-	echo "                 will be created."
-	echo " "
-	echo "   --sharedir=SHAREDIR"
-	echo " "
-	echo "                 The path to which make will makefile fragments containing"
-	echo "                 make variables determined by configure (e.g. CC, CFLAGS,"
-	echo "                 and LDFLAGS). These files allow certain BLIS makefiles,"
-	echo "                 such as those in the examples or testsuite directories, to"
-	echo "                 operate on an installed copy of BLIS rather than a local"
-	echo "                 (and possibly uninstalled) copy. If not given, SHAREDIR"
-	echo "                 defaults to PREFIX/share. If SHAREDIR refers to a"
-	echo "                 directory that does not exist, it will be created."
-	echo " "
-	echo "   --enable-verbose-make, --disable-verbose-make"
-	echo " "
-	echo "                 Enable (disabled by default) verbose compilation output"
-	echo "                 during make."
-	echo " "
-	echo "   --enable-arg-max-hack --disable-arg-max-hack"
-	echo " "
-	echo "                 Enable (disabled by default) build system logic that"
-	echo "                 will allow archiving/linking the static/shared library"
-	echo "                 even if the command plus command line arguments exceeds"
-	echo "                 the operating system limit (ARG_MAX)."
-	echo " "
-	echo "   -d DEBUG, --enable-debug[=DEBUG]"
-	echo " "
-	echo "                 Enable debugging symbols in the library. If argument"
-	echo "                 DEBUG is given as 'opt', then optimization flags are"
-	echo "                 kept in the framework, otherwise optimization is"
-	echo "                 turned off."
-	echo " "
-	echo "   --disable-static, --enable-static"
-	echo " "
-	echo "                 Disable (enabled by default) building BLIS as a static"
-	echo "                 library. If the static library build is disabled, the"
-	echo "                 shared library build must remain enabled."
-	echo " "
-	echo "   --disable-shared, --enable-shared"
-	echo " "
-	echo "                 Disable (enabled by default) building BLIS as a shared"
-	echo "                 library. If the shared library build is disabled, the"
-	echo "                 static library build must remain enabled."
-	echo " "
-	echo "   --enable-rpath, --disable-rpath"
-	echo " "
-	echo "                 Enable (disabled by default) setting an install_name for"
-	echo "                 dynamic libraries on macOS which starts with @rpath rather"
-	echo "                 than the absolute install path."
-	echo " "
-	echo "   -e SYMBOLS, --export-shared[=SYMBOLS]"
-	echo " "
-	echo "                 Specify the subset of library symbols that are exported"
-	echo "                 within a shared library. Valid values for SYMBOLS are:"
-	echo "                 'public' (the default) and 'all'. By default, only"
-	echo "                 functions and variables that belong to public APIs are"
-	echo "                 exported in shared libraries. However, the user may"
-	echo "                 instead export all symbols in BLIS, even those that were"
-	echo "                 intended for internal use only. Note that the public APIs"
-	echo "                 encompass all functions that almost any user would ever"
-	echo "                 want to call, including the BLAS/CBLAS compatibility APIs"
-	echo "                 as well as the basic and expert interfaces to the typed"
-	echo "                 and object APIs that are unique to BLIS. Also note that"
-	echo "                 changing this option to 'all' will have no effect in some"
-	echo "                 environments, such as when compiling with clang on"
-	echo "                 Windows."
-	echo " "
-	echo "   -t MODEL, --enable-threading[=MODEL], --disable-threading"
-	echo " "
-	echo "                 Enable threading in the library, using threading model(s)"
-	echo "                 MODEL={single,openmp,pthreads,hpx,auto}. If multiple values"
-	echo "                 are specified within MODEL, they will all be compiled into"
-	echo "                 BLIS, and the choice of which to use will be determined at"
-	echo "                 runtime. If the user does not express a preference (by"
-	echo "                 setting the BLIS_THREAD_IMPL environment variable to"
-	echo "                 'single', 'openmp', 'pthreads', or 'hpx'; by calling the"
-	echo "                 global runtime API bli_thread_set_thread_impl(); or by"
-	echo "                 encoding a choice on a per-call basis within a rntm_t"
-	echo "                 passed into the expert API), then the first model listed"
-	echo "                 in MODEL will be used by default. Note that 'single' is"
-	echo "                 silently appended to whatever the user specifies in MODEL,"
-	echo "                 meaning that single-threaded functionality will always be"
-	echo "                 available, even if it is not requested and even if it is"
-	echo "                 not enabled by default. Even --disable-threading is"
-	echo "                 actually shorthand for --enable-threading=single (which is"
-	echo "                 the default when the option is not specified)."
-	echo " "
-	echo "   --enable-system, --disable-system"
-	echo " "
-	echo "                 Enable conventional operating system support, such as"
-	echo "                 pthreads for thread-safety. The default state is enabled."
-	echo "                 However, in rare circumstances you may wish to configure"
-	echo "                 BLIS for use with a minimal or nonexistent operating"
-	echo "                 system (e.g. hardware simulators). In these situations,"
-	echo "                 --disable-system may be used to jettison all compile-time"
-	echo "                 and link-time dependencies outside of the standard C"
-	echo "                 library. When disabled, this option also forces the use"
-	echo "                 of --disable-threading."
-	echo " "
-	echo "   --disable-pba-pools, --enable-pba-pools"
-	echo "   --disable-sba-pools, --enable-sba-pools"
-	echo " "
-	echo "                 Disable (enabled by default) use of internal memory pools"
-	echo "                 within the packing block allocator (pba) and/or the small"
-	echo "                 block allocator (sba). The former is used to allocate"
-	echo "                 memory used to pack submatrices while the latter is used"
-	echo "                 to allocate control/thread tree nodes and thread"
-	echo "                 communicators. Both allocations take place in the context"
-	echo "                 of level-3 operations. When the pba is disabled, the"
-	echo "                 malloc()-like function specified by BLIS_MALLOC_POOL is"
-	echo "                 called on-demand whenever a packing block is needed, and"
-	echo "                 when the sba is disabled, the malloc()-like function"
-	echo "                 specified by BLIS_MALLOC_INTL is called whenever a small"
-	echo "                 block is needed, with the two allocators calling free()-"
-	echo "                 like functions BLIS_FREE_POOL and BLIS_FREE_INTL,"
-	echo "                 respectively when blocks are released. When enabled,"
-	echo "                 either or both pools are populated via the same functions"
-	echo "                 mentioned previously, and henceforth blocks are checked"
-	echo "                 out and in. The library quickly reaches a state in which"
-	echo "                 it no longer needs to call malloc() or free(), even"
-	echo "                 across many separate level-3 operation invocations."
-	echo " "
-	echo "   --enable-mem-tracing, --disable-mem-tracing"
-	echo " "
-	echo "                 Enable (disabled by default) output to stdout that traces"
-	echo "                 the allocation and freeing of memory, including the names"
-	echo "                 of the functions that triggered the allocation/freeing."
-	echo "                 Enabling this option WILL NEGATIVELY IMPACT PERFORMANCE."
-	echo "                 Please use only for informational/debugging purposes."
-	echo " "
-	echo "   --enable-asan, --disable-asan"
-	echo " "
-	echo "                 Enable (disabled by default) compiling and linking BLIS"
-	echo "                 framework code with the AddressSanitizer (ASan) library."
-	echo "                 Optimized kernels are NOT compiled with ASan support due"
-	echo "                 to limitations of register assignment in inline assembly."
-	echo "                 WARNING: ENABLING THIS OPTION WILL NEGATIVELY IMPACT"
-	echo "                 PERFORMANCE. Please use only for informational/debugging"
-	echo "                 purposes."
-	echo " "
-	echo "   -i SIZE, --int-size=SIZE"
-	echo " "
-	echo "                 Set the size (in bits) of internal BLIS integers and"
-	echo "                 integer types used in native BLIS interfaces. The"
-	echo "                 default integer type size is architecture dependent."
-	echo "                 (Hint: You can always find this value printed at the"
-	echo "                 beginning of the testsuite output.)"
-	echo " "
-	echo "   -b SIZE, --blas-int-size=SIZE"
-	echo " "
-	echo "                 Set the size (in bits) of integer types in external"
-	echo "                 BLAS and CBLAS interfaces, if enabled. The default"
-	echo "                 integer type size used in BLAS/CBLAS is 32 bits."
-	echo " "
-	echo "   --disable-blas, --enable-blas"
-	echo " "
-	echo "                 Disable (enabled by default) building the BLAS"
-	echo "                 compatibility layer."
-	echo " "
-	echo "   --enable-cblas, --disable-cblas"
-	echo " "
-	echo "                 Enable (disabled by default) building the CBLAS"
-	echo "                 compatibility layer. This automatically enables the"
-	echo "                 BLAS compatibility layer as well."
-	echo " "
-	echo "   --disable-mixed-dt, --enable-mixed-dt"
-	echo " "
-	echo "                 Disable (enabled by default) support for mixing the"
-	echo "                 storage domain and/or storage precision of matrix"
-	echo "                 operands for the gemm operation, as well as support"
-	echo "                 for computing in a precision different from one or"
-	echo "                 both of matrices A and B."
-	echo " "
-	echo "   --disable-mixed-dt-extra-mem, --enable-mixed-dt-extra-mem"
-	echo " "
-	echo "                 Disable (enabled by default) support for additional"
-	echo "                 mixed datatype optimizations that require temporarily"
-	echo "                 allocating extra memory--specifically, a single m x n"
-	echo "                 matrix (per application thread) whose storage datatype"
-	echo "                 is equal to the computation datatype. This option may"
-	echo "                 only be enabled when mixed domain/precision support is"
-	echo "                 enabled."
-	echo " "
-	echo "   --disable-sup-handling, --enable-sup-handling"
-	echo " "
-	echo "                 Disable (enabled by default) handling of small/skinny"
-	echo "                 matrix problems via separate code branches. When disabled,"
-	echo "                 these small/skinny level-3 operations will be performed by"
-	echo "                 the conventional implementation, which is optimized for"
-	echo "                 medium and large problems. Note that what qualifies as"
-	echo "                 \"small\" depends on thresholds that may vary by sub-"
-	echo "                 configuration."
-	echo " "
-	echo "   --enable-amd-frame-tweaks, --disable-amd-frame-tweaks"
-	echo " "
-	echo "                 Enable building with certain framework files that have"
-	echo "                 been customized by AMD for Zen-based microarchitectures."
-	echo "                 The default counterparts of these files must be portable,"
-	echo "                 and so these customized files may provide some (typically"
-	echo "                 modest) performance improvement for some select operations"
-	echo "                 and/or APIs, though there may a few (tiny dimension) cases"
-	echo "                 where the improvement is more pronounced. Note that the"
-	echo "                 target configuration must be Zen-based (or 'amd64') for"
-	echo "                 this option to have any effect. (Also note that this"
-	echo "                 option is NOT to be confused with enabling AMD *kernels*,"
-	echo "                 which are determined by the BLIS subconfiguration used at"
-	echo "                 runtime.) By default, these customized files are disabled."
-	echo " "
-	echo "   -a NAME --enable-addon=NAME"
-	echo " "
-	echo "                 Enable the code provided by an addon. An addon consists"
-	echo "                 of a separate directory of code that provides additional"
-	echo "                 APIs, implementations, and/or operations that would"
-	echo "                 otherwise not be present within a build of BLIS. This"
-	echo "                 option may be used multiple times to specify the inclusion"
-	echo "                 of multiple addons. By default, no addons are enabled."
-	echo " "
-	echo "   -s NAME --enable-sandbox=NAME"
-	echo " "
-	echo "                 Enable a separate sandbox implementation of gemm. This"
-	echo "                 option disables BLIS's conventional gemm implementation"
-	echo "                 (which shares common infrastructure with other level-3"
-	echo "                 operations) and instead compiles and uses the code in"
-	echo "                 the NAME directory, which is expected to be a sub-"
-	echo "                 directory of 'sandbox'. By default, no sandboxes are"
-	echo "                 enabled."
-	echo " "
-	echo "   --with-memkind, --without-memkind"
-	echo " "
-	echo "                 Forcibly enable or disable the use of libmemkind's"
-	echo "                 hbw_malloc() and hbw_free() as substitutes for malloc()"
-	echo "                 and free(), respectively, when allocating memory for"
-	echo "                 BLIS's memory pools, which are used to manage buffers"
-	echo "                 into which matrices are packed. The default behavior"
-	echo "                 for this option is environment-dependent; if configure"
-	echo "                 detects the presence of libmemkind, libmemkind is used"
-	echo "                 by default, and otherwise it is not used by default."
-	echo " "
-	echo "   -r METHOD, --thread-part-jrir=METHOD"
-	echo " "
-	echo "                 Select a strategy for partitioning computation in JR and"
-	echo "                 IR loops and assigning that computation to threads. Valid"
-	echo "                 values for METHOD are 'rr', 'slab', and 'tlb':"
-	echo "                  'rr':   Assign the computation associated with whole"
-	echo "                          columns of microtiles to threads in a round-"
-	echo "                          robin fashion. When selected, round-robin"
-	echo "                          assignment is also employed during packing."
-	echo "                  'slab': Partition the computation into N contiguous"
-	echo "                          regions, where each region contains a whole"
-	echo "                          number of microtile columns, and assign one"
-	echo "                          region to each thread. For some operations, the"
-	echo "                          number of microtile columns contained within a"
-	echo "                          given region may differ from that of other"
-	echo "                          regions, depending on how much work is implied"
-	echo "                          by each region. When selected, slab assignment"
-	echo "                          is also employed during packing."
-	echo "                  'tlb':  Tile-level load balancing is similar to slab,"
-	echo "                          except that regions will be divided at a more"
-	echo "                          granular level (individual microtiles instead"
-	echo "                          of whole columns of microtiles) to ensure more"
-	echo "                          equitable assignment of work to threads. When"
-	echo "                          selected, tlb will only be employed for level-3"
-	echo "                          operations except trsm; due to practical and"
-	echo "                          algorithmic limitations, slab partitioning will"
-	echo "                          be used instead during packing and for trsm."
-	echo "                 The default strategy is 'slab'. NOTE: Specifying this"
-	echo "                 option constitutes a request, which may be ignored in"
-	echo "                 select situations if implementation has a good reason to"
-	echo "                 do so. (See description of 'tlb' above for an example of"
-	echo "                 this.)"
-	echo " "
-	echo "   --disable-trsm-preinversion, --enable-trsm-preinversion"
-	echo " "
-	echo "                 Disable (enabled by default) pre-inversion of triangular"
-	echo "                 matrix diagonals when performing trsm. When pre-inversion"
-	echo "                 is enabled, diagonal elements are inverted outside of the"
-	echo "                 microkernel (e.g. during packing) so that the microkernel"
-	echo "                 can use multiply instructions. When disabled, division"
-	echo "                 instructions are used within the microkernel. Executing"
-	echo "                 these division instructions within the microkernel will"
-	echo "                 incur a performance penalty, but numerical robustness will"
-	echo "                 improve for certain cases involving denormal numbers that"
-	echo "                 would otherwise result in overflow in the pre-inverted"
-	echo "                 values."
-	echo " "
-	echo "   --force-version=STRING"
-	echo " "
-	echo "                 Force configure to use an arbitrary version string"
-	echo "                 STRING. This option may be useful when repackaging"
-	echo "                 custom versions of BLIS by outside organizations."
-	echo " "
-	echo "   -c, --show-config-lists"
-	echo " "
-	echo "                 Print the config and kernel lists, and kernel-to-config"
-	echo "                 map after they are read from file. This can be useful"
-	echo "                 when debugging certain configuration issues, and/or as"
-	echo "                 a sanity check to make sure these lists are constituted"
-	echo "                 as expected."
-	echo " "
-	echo "   --complex-return=gnu|intel"
-	echo " "
-	echo "                 Specify the way in which complex numbers are returned"
-	echo "                 from Fortran functions, either \"gnu\" (return in"
-	echo "                 registers) or \"intel\" (return via hidden argument)."
-	echo "                 If not specified and the environment variable FC is set,"
-	echo "                 attempt to determine the return type from the compiler."
-	echo "                 Otherwise, the default is \"gnu\"."
-	echo " "
-	echo "   -q, --quiet   Suppress informational output. By default, configure"
-	echo "                 is verbose. (NOTE: -q is not yet implemented)"
-	echo " "
-	echo "   -h, --help    Output this information and quit."
-	echo " "
-	echo " Environment Variables:"
-	echo " "
-	echo "   CC            Specifies the C compiler to use."
-	echo "   CXX           Specifies the C++ compiler to use (sandbox only)."
-	echo "   FC            Specifies the Fortran compiler to use (only to determine --complex-return)."
-	echo "   AR            Specifies the static library archiver to use."
-	echo "   RANLIB        Specifies the ranlib (library indexer) executable to use."
-	echo "   PYTHON        Specifies the python interpreter to use."
-	echo "   CFLAGS        Specifies additional compiler flags to use (prepended)."
-	echo "   LDFLAGS       Specifies additional linker flags to use (prepended)."
-	echo "   LIBPTHREAD    Pthreads library to use."
-	echo " "
-	echo "   Environment variables are traditionally set prior to running configure:"
-	echo " "
-	echo "     CC=gcc ./configure [options] haswell"
-	echo " "
-	echo "   However, they may also be specified as command line options, e.g.:"
-	echo " "
-	echo "     ./configure [options] CC=gcc haswell"
-	echo " "
-	echo "   Note that not all compilers are compatible with a given"
-	echo "   configuration."
-	echo " "
+ 	cat <<EOF
+
+ ${script_name} (BLIS ${version})
+
+ Configure BLIS's build system for compilation using a specified
+ configuration directory.
+
+ Usage:
+
+   ${script_name} [options] [env. vars.] confname
+
+ Arguments:
+
+   confname      The name of the sub-directory inside of the 'config'
+                 directory containing the desired BLIS configuration.
+                 Note that confname MUST be specified; if it is not,
+                 configure will complain. To build a completely generic
+                 implementation, use the 'generic' configuration.
+
+ Options:
+
+   -p PREFIX, --prefix=PREFIX
+
+                 The common installation prefix for all files. If given,
+                 this option effectively implies:
+                   --libdir=EXECPREFIX/lib
+                   --includedir=PREFIX/include
+                   --sharedir=PREFIX/share
+                 where EXECPREFIX defaults to PREFIX. If this option is
+                 not given, PREFIX defaults to '${prefix_def}'. If PREFIX
+                 refers to a directory that does not exist, it will be
+                 created.
+
+   --exec-prefix=EXECPREFIX
+
+                 The installation prefix for libraries. Specifically, if
+                 given, this option effectively implies:
+                   --libdir=EXECPREFIX/lib
+                 If not given, EXECPREFIX defaults to PREFIX, which may be
+                 modified by the --prefix option. If EXECPREFIX refers to
+                 a directory that does not exist, it will be created.
+
+   --libdir=LIBDIR
+
+                 The path to which make will install libraries. If not
+                 given, LIBDIR defaults to PREFIX/lib. If LIBDIR refers to
+                 a directory that does not exist, it will be created.
+
+   --includedir=INCDIR
+
+                 The path to which make will install development header
+                 files. If not given, INCDIR defaults to PREFIX/include.
+                 If INCDIR refers to a directory that does not exist, it
+                 will be created.
+
+   --sharedir=SHAREDIR
+
+                 The path to which make will install makefile fragments
+                 containing make variables determined by configure (e.g. CC,
+                 CFLAGS, and LDFLAGS). These files allow certain BLIS
+                 makefiles, such as those in the examples or testsuite
+                 directories, to operate on an installed copy of BLIS rather
+                 than a local (and possibly uninstalled) copy. If not given,
+                 SHAREDIR defaults to PREFIX/share. If SHAREDIR refers to a
+                 directory that does not exist, it will be created.
+
+   --enable-verbose-make, --disable-verbose-make
+
+                 Enable (disabled by default) verbose compilation output
+                 during make.
+
+   --enable-arg-max-hack, --disable-arg-max-hack
+
+                 Enable (disabled by default) build system logic that
+                 will allow archiving/linking the static/shared library
+                 even if the command plus command line arguments exceeds
+                 the operating system limit (ARG_MAX).
+
+   -d DEBUG, --enable-debug[=DEBUG]
+
+                 Enable debugging symbols in the library. If argument
+                 DEBUG is given as 'opt', then optimization flags are
+                 kept in the framework, otherwise optimization is
+                 turned off.
+
+   --disable-static, --enable-static
+
+                 Disable (enabled by default) building BLIS as a static
+                 library. If the static library build is disabled, the
+                 shared library build must remain enabled.
+
+   --disable-shared, --enable-shared
+
+                 Disable (enabled by default) building BLIS as a shared
+                 library. If the shared library build is disabled, the
+                 static library build must remain enabled.
+
+   --enable-rpath, --disable-rpath
+
+                 Enable (disabled by default) setting an install_name for
+                 dynamic libraries on macOS which starts with @rpath rather
+                 than the absolute install path.
+
+   -e SYMBOLS, --export-shared[=SYMBOLS]
+
+                 Specify the subset of library symbols that are exported
+                 within a shared library. Valid values for SYMBOLS are:
+                 'public' (the default) and 'all'. By default, only
+                 functions and variables that belong to public APIs are
+                 exported in shared libraries. However, the user may
+                 instead export all symbols in BLIS, even those that were
+                 intended for internal use only. Note that the public APIs
+                 encompass all functions that almost any user would ever
+                 want to call, including the BLAS/CBLAS compatibility APIs
+                 as well as the basic and expert interfaces to the typed
+                 and object APIs that are unique to BLIS. Also note that
+                 changing this option to 'all' will have no effect in some
+                 environments, such as when compiling with clang on
+                 Windows.
+
+   -t MODEL, --enable-threading[=MODEL], --disable-threading
+
+                 Enable threading in the library, using threading model(s)
+                 MODEL={single,openmp,pthreads,hpx,auto}. If multiple values
+                 are specified within MODEL, they will all be compiled into
+                 BLIS, and the choice of which to use will be determined at
+                 runtime. If the user does not express a preference (by
+                 setting the BLIS_THREAD_IMPL environment variable to
+                 'single', 'openmp', 'pthreads', or 'hpx'; by calling the
+                 global runtime API bli_thread_set_thread_impl(); or by
+                 encoding a choice on a per-call basis within a rntm_t
+                 passed into the expert API), then the first model listed
+                 in MODEL will be used by default. Note that 'single' is
+                 silently appended to whatever the user specifies in MODEL,
+                 meaning that single-threaded functionality will always be
+                 available, even if it is not requested and even if it is
+                 not enabled by default. Even --disable-threading is
+                 actually shorthand for --enable-threading=single (which is
+                 the default when the option is not specified).
+
+   --enable-system, --disable-system
+
+                 Enable conventional operating system support, such as
+                 pthreads for thread-safety. The default state is enabled.
+                 However, in rare circumstances you may wish to configure
+                 BLIS for use with a minimal or nonexistent operating
+                 system (e.g. hardware simulators). In these situations,
+                 --disable-system may be used to jettison all compile-time
+                 and link-time dependencies outside of the standard C
+                 library. When disabled, this option also forces the use
+                 of --disable-threading.
+
+   --disable-pba-pools, --enable-pba-pools
+   --disable-sba-pools, --enable-sba-pools
+
+                 Disable (enabled by default) use of internal memory pools
+                 within the packing block allocator (pba) and/or the small
+                 block allocator (sba). The former is used to allocate
+                 memory used to pack submatrices while the latter is used
+                 to allocate control/thread tree nodes and thread
+                 communicators. Both allocations take place in the context
+                 of level-3 operations. When the pba is disabled, the
+                 malloc()-like function specified by BLIS_MALLOC_POOL is
+                 called on-demand whenever a packing block is needed, and
+                 when the sba is disabled, the malloc()-like function
+                 specified by BLIS_MALLOC_INTL is called whenever a small
+                 block is needed, with the two allocators calling free()-
+                 like functions BLIS_FREE_POOL and BLIS_FREE_INTL,
+                 respectively, when blocks are released. When enabled,
+                 either or both pools are populated via the same functions
+                 mentioned previously, and henceforth blocks are checked
+                 out and in. The library quickly reaches a state in which
+                 it no longer needs to call malloc() or free(), even
+                 across many separate level-3 operation invocations.
+
+   --enable-mem-tracing, --disable-mem-tracing
+
+                 Enable (disabled by default) output to stdout that traces
+                 the allocation and freeing of memory, including the names
+                 of the functions that triggered the allocation/freeing.
+                 Enabling this option WILL NEGATIVELY IMPACT PERFORMANCE.
+                 Please use only for informational/debugging purposes.
+
+   --enable-asan, --disable-asan
+
+                 Enable (disabled by default) compiling and linking BLIS
+                 framework code with the AddressSanitizer (ASan) library.
+                 Optimized kernels are NOT compiled with ASan support due
+                 to limitations of register assignment in inline assembly.
+                 WARNING: ENABLING THIS OPTION WILL NEGATIVELY IMPACT
+                 PERFORMANCE. Please use only for informational/debugging
+                 purposes.
+
+   -i SIZE, --int-size=SIZE
+
+                 Set the size (in bits) of internal BLIS integers and
+                 integer types used in native BLIS interfaces. The
+                 default integer type size is architecture dependent.
+                 (Hint: You can always find this value printed at the
+                 beginning of the testsuite output.)
+
+   -b SIZE, --blas-int-size=SIZE
+
+                 Set the size (in bits) of integer types in external
+                 BLAS and CBLAS interfaces, if enabled. The default
+                 integer type size used in BLAS/CBLAS is 32 bits.
+
+   --disable-blas, --enable-blas
+
+                 Disable (enabled by default) building the BLAS
+                 compatibility layer.
+
+   --enable-cblas, --disable-cblas
+
+                 Enable (disabled by default) building the CBLAS
+                 compatibility layer. This automatically enables the
+                 BLAS compatibility layer as well.
+
+   --disable-mixed-dt, --enable-mixed-dt
+
+                 Disable (enabled by default) support for mixing the
+                 storage domain and/or storage precision of matrix
+                 operands for the gemm operation, as well as support
+                 for computing in a precision different from one or
+                 both of matrices A and B.
+
+   --disable-mixed-dt-extra-mem, --enable-mixed-dt-extra-mem
+
+                 Disable (enabled by default) support for additional
+                 mixed datatype optimizations that require temporarily
+                 allocating extra memory--specifically, a single m x n
+                 matrix (per application thread) whose storage datatype
+                 is equal to the computation datatype. This option may
+                 only be enabled when mixed domain/precision support is
+                 enabled.
+
+   --disable-sup-handling, --enable-sup-handling
+
+                 Disable (enabled by default) handling of small/skinny
+                 matrix problems via separate code branches. When disabled,
+                 these small/skinny level-3 operations will be performed by
+                 the conventional implementation, which is optimized for
+                 medium and large problems. Note that what qualifies as
+                 "small" depends on thresholds that may vary by sub-
+                 configuration.
+
+   --enable-amd-frame-tweaks, --disable-amd-frame-tweaks
+
+                 Enable building with certain framework files that have
+                 been customized by AMD for Zen-based microarchitectures.
+                 The default counterparts of these files must be portable,
+                 and so these customized files may provide some (typically
+                 modest) performance improvement for some select operations
+                 and/or APIs, though there may be a few (tiny dimension) cases
+                 where the improvement is more pronounced. Note that the
+                 target configuration must be Zen-based (or 'amd64') for
+                 this option to have any effect. (Also note that this
+                 option is NOT to be confused with enabling AMD *kernels*,
+                 which are determined by the BLIS subconfiguration used at
+                 runtime.) By default, these customized files are disabled.
+
+   -a NAME, --enable-addon=NAME
+
+                 Enable the code provided by an addon. An addon consists
+                 of a separate directory of code that provides additional
+                 APIs, implementations, and/or operations that would
+                 otherwise not be present within a build of BLIS. This
+                 option may be used multiple times to specify the inclusion
+                 of multiple addons. By default, no addons are enabled.
+
+   -s NAME, --enable-sandbox=NAME
+
+                 Enable a separate sandbox implementation of gemm. This
+                 option disables BLIS's conventional gemm implementation
+                 (which shares common infrastructure with other level-3
+                 operations) and instead compiles and uses the code in
+                 the NAME directory, which is expected to be a sub-
+                 directory of 'sandbox'. By default, no sandboxes are
+                 enabled.
+
+   --with-memkind, --without-memkind
+
+                 Forcibly enable or disable the use of libmemkind's
+                 hbw_malloc() and hbw_free() as substitutes for malloc()
+                 and free(), respectively, when allocating memory for
+                 BLIS's memory pools, which are used to manage buffers
+                 into which matrices are packed. The default behavior
+                 for this option is environment-dependent; if configure
+                 detects the presence of libmemkind, libmemkind is used
+                 by default, and otherwise it is not used by default.
+
+   -r METHOD, --thread-part-jrir=METHOD
+
+                 Select a strategy for partitioning computation in JR and
+                 IR loops and assigning that computation to threads. Valid
+                 values for METHOD are 'rr', 'slab', and 'tlb':
+                  'rr':   Assign the computation associated with whole
+                          columns of microtiles to threads in a round-
+                          robin fashion. When selected, round-robin
+                          assignment is also employed during packing.
+                  'slab': Partition the computation into N contiguous
+                          regions, where each region contains a whole
+                          number of microtile columns, and assign one
+                          region to each thread. For some operations, the
+                          number of microtile columns contained within a
+                          given region may differ from that of other
+                          regions, depending on how much work is implied
+                          by each region. When selected, slab assignment
+                          is also employed during packing.
+                  'tlb':  Tile-level load balancing is similar to slab,
+                          except that regions will be divided at a more
+                          granular level (individual microtiles instead
+                          of whole columns of microtiles) to ensure more
+                          equitable assignment of work to threads. When
+                          selected, tlb will only be employed for level-3
+                          operations except trsm; due to practical and
+                          algorithmic limitations, slab partitioning will
+                          be used instead during packing and for trsm.
+                 The default strategy is 'slab'. NOTE: Specifying this
+                 option constitutes a request, which may be ignored in
+                 select situations if the implementation has a good reason to
+                 do so. (See description of 'tlb' above for an example of
+                 this.)
+
+   --disable-trsm-preinversion, --enable-trsm-preinversion
+
+                 Disable (enabled by default) pre-inversion of triangular
+                 matrix diagonals when performing trsm. When pre-inversion
+                 is enabled, diagonal elements are inverted outside of the
+                 microkernel (e.g. during packing) so that the microkernel
+                 can use multiply instructions. When disabled, division
+                 instructions are used within the microkernel. Executing
+                 these division instructions within the microkernel will
+                 incur a performance penalty, but numerical robustness will
+                 improve for certain cases involving denormal numbers that
+                 would otherwise result in overflow in the pre-inverted
+                 values.
+
+   --force-version=STRING
+
+                 Force configure to use an arbitrary version string
+                 STRING. This option may be useful when repackaging
+                 custom versions of BLIS by outside organizations.
+
+   -c, --show-config-lists
+
+                 Print the config and kernel lists, and kernel-to-config
+                 map after they are read from file. This can be useful
+                 when debugging certain configuration issues, and/or as
+                 a sanity check to make sure these lists are constituted
+                 as expected.
+
+   --complex-return=gnu|intel
+
+                 Specify the way in which complex numbers are returned
+                 from Fortran functions, either "gnu" (return in
+                 registers) or "intel" (return via hidden argument).
+                 If not specified and the environment variable FC is set,
+                 attempt to determine the return type from the compiler.
+                 Otherwise, the default is "gnu".
+
+   -q, --quiet   Suppress informational output. By default, configure
+                 is verbose. (NOTE: -q is not yet implemented)
+
+   -h, --help    Output this information and quit.
+
+ Environment Variables:
+
+   CC            Specifies the C compiler to use.
+   CXX           Specifies the C++ compiler to use (sandbox only).
+   FC            Specifies the Fortran compiler to use (only to determine --complex-return).
+   AR            Specifies the static library archiver to use.
+   RANLIB        Specifies the ranlib (library indexer) executable to use.
+   PYTHON        Specifies the python interpreter to use.
+   CFLAGS        Specifies additional compiler flags to use (prepended).
+   LDFLAGS       Specifies additional linker flags to use (prepended).
+   LIBPTHREAD    Pthreads library to use.
+
+   Environment variables are traditionally set prior to running configure:
+
+     CC=gcc ./configure [options] haswell
+
+   However, they may also be specified as command line options, e.g.:
+
+     ./configure [options] CC=gcc haswell
+
+   Note that not all compilers are compatible with a given
+   configuration.
+
+EOF
 
 	# Exit with non-zero exit status
 	exit 1

From dc5d00a6ce0350cd82859d8c24f23d98f205d8db Mon Sep 17 00:00:00 2001
From: Lee Killough <15950023+leekillough@users.noreply.github.com>
Date: Fri, 27 Jan 2023 17:36:47 -0600
Subject: [PATCH 127/230] Typecast printf() args to avoid compiler warnings.
 (#716)

Details:
- In bli_thread_range_tlb.c, typecast integer arguments passed to
  printf() -- which are typically disabled unless debugging -- to type
  "long" to guarantee a match to the "%ld" format specifiers used in
  those calls. This avoids spurious warnings with certain compilers in
  certain toolchain environments, such as 32-bit RISC-V (rv32iv).
---
 frame/thread/bli_thread_range_tlb.c | 343 +++++++++++++++-------------
 1 file changed, 186 insertions(+), 157 deletions(-)

diff --git a/frame/thread/bli_thread_range_tlb.c b/frame/thread/bli_thread_range_tlb.c
index 546ed341d..d0c767373 100644
--- a/frame/thread/bli_thread_range_tlb.c
+++ b/frame/thread/bli_thread_range_tlb.c
@@ -5,7 +5,6 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -142,11 +141,11 @@ dim_t bli_thread_range_tlb_l
 	const dim_t offn_ut_nonrect = ( diagoffmin / nr );
 
 	PGUARD printf( "---------------------------\n" );
-	PGUARD printf( "min(diagoff,n):     %7ld\n", diagoffmin );
-	PGUARD printf( "offn_ut_nonrect:    %7ld\n", offn_ut_nonrect );
-	PGUARD printf( "offn_nonrect:       %7ld\n", offn_nonrect );
-	PGUARD printf( "diagoff_nonrect:    %7ld\n", diagoff_nonrect );
-	PGUARD printf( "n_nonrect:          %7ld\n", n_nonrect );
+	PGUARD printf( "min(diagoff,n):     %7ld\n", (long) diagoffmin );
+	PGUARD printf( "offn_ut_nonrect:    %7ld\n", (long) offn_ut_nonrect );
+	PGUARD printf( "offn_nonrect:       %7ld\n", (long) offn_nonrect );
+	PGUARD printf( "diagoff_nonrect:    %7ld\n", (long) diagoff_nonrect );
+	PGUARD printf( "n_nonrect:          %7ld\n", (long) n_nonrect );
 	PGUARD printf( "---------------------------\n" );
 
 	dim_t num_unref_ut = 0;
@@ -168,10 +167,10 @@ dim_t bli_thread_range_tlb_l
 
 		num_unref_ut += num_unref_ut_j;
 
-		PGUARD printf( "j                   %7ld\n", j );
-		PGUARD printf( "diagoff_j           %7ld\n", diagoff_j );
-		PGUARD printf( "num_unref_ut_j      %7ld\n", num_unref_ut_j );
-		PGUARD printf( "num_unref_ut        %7ld\n", num_unref_ut );
+		PGUARD printf( "j                   %7ld\n", (long) j );
+		PGUARD printf( "diagoff_j           %7ld\n", (long) diagoff_j );
+		PGUARD printf( "num_unref_ut_j      %7ld\n", (long) num_unref_ut_j );
+		PGUARD printf( "num_unref_ut        %7ld\n", (long) num_unref_ut );
 		PGUARD printf( "\n" );
 	}
 	PGUARD printf( "---------------------------\n" );
@@ -180,12 +179,12 @@ dim_t bli_thread_range_tlb_l
 	const dim_t tri_ref_area   = nonrect_area - tri_unref_area;
 	const dim_t total_ref_area = rect_area + tri_ref_area;
 
-	PGUARD printf( "gross area:         %7ld\n", m * n );
-	PGUARD printf( "rect_area:          %7ld\n", rect_area );
-	PGUARD printf( "nonrect_area:       %7ld\n", nonrect_area );
-	PGUARD printf( "tri_unref_area:     %7ld\n", tri_unref_area );
-	PGUARD printf( "tri_ref_area:       %7ld\n", tri_ref_area );
-	PGUARD printf( "total_ref_area:     %7ld\n", total_ref_area );
+	PGUARD printf( "gross area:         %7ld\n", (long) m * n );
+	PGUARD printf( "rect_area:          %7ld\n", (long) rect_area );
+	PGUARD printf( "nonrect_area:       %7ld\n", (long) nonrect_area );
+	PGUARD printf( "tri_unref_area:     %7ld\n", (long) tri_unref_area );
+	PGUARD printf( "tri_ref_area:       %7ld\n", (long) tri_ref_area );
+	PGUARD printf( "total_ref_area:     %7ld\n", (long) total_ref_area );
 	PGUARD printf( "---------------------------\n" );
 
 	//
@@ -196,9 +195,9 @@ dim_t bli_thread_range_tlb_l
 	//const dim_t n_ut_tri_ref  = tri_ref_area   / ( mr * nr );
 	const dim_t n_ut_rect     = rect_area      / ( mr * nr );
 
-	PGUARD printf( "n_ut_ref:           %7ld\n", n_ut_ref );
-	//PGUARD printf( "n_ut_tri_ref:       %7ld\n", n_ut_tri_ref );
-	PGUARD printf( "n_ut_rect:          %7ld\n", n_ut_rect );
+	PGUARD printf( "n_ut_ref:           %7ld\n", (long) n_ut_ref );
+	//PGUARD printf( "n_ut_tri_ref:       %7ld\n", (long) n_ut_tri_ref );
+	PGUARD printf( "n_ut_rect:          %7ld\n", (long) n_ut_rect );
 	PGUARD printf( "---------------------------\n" );
 
 	// Compute the number of microtiles to allocate per thread as well as the
@@ -206,20 +205,20 @@ dim_t bli_thread_range_tlb_l
 	const dim_t n_ut_per_thr = n_ut_ref / nt;
 	const dim_t n_ut_pt_left = n_ut_ref % nt;
 
-	PGUARD printf( "n_ut_per_thr:       %7ld\n", n_ut_per_thr );
-	PGUARD printf( "n_ut_pt_left:       %7ld\n", n_ut_pt_left );
+	PGUARD printf( "n_ut_per_thr:       %7ld\n", (long) n_ut_per_thr );
+	PGUARD printf( "n_ut_pt_left:       %7ld\n", (long) n_ut_pt_left );
 	PGUARD printf( "---------------------------\n" );
 
 	const dim_t n_ut_per_col = m_iter;
 
-	PGUARD printf( "n_ut_per_col:       %7ld\n", n_ut_per_col );
+	PGUARD printf( "n_ut_per_col:       %7ld\n", (long) n_ut_per_col );
 
 	// Allocate one of the leftover microtiles to the current thread if its
 	// tid is one of the lower thread ids.
 	const dim_t n_ut_for_me = n_ut_per_thr + ( tid < n_ut_pt_left ? 1 : 0 );
 
-	PGUARD printf( "n_ut_for_me:        %7ld (%ld+%ld)\n", n_ut_for_me,
-	               n_ut_per_thr, n_ut_for_me - n_ut_per_thr );
+	PGUARD printf( "n_ut_for_me:        %7ld (%ld+%ld)\n", (long) n_ut_for_me,
+	               (long) n_ut_per_thr, (long) n_ut_for_me - n_ut_per_thr );
 
 	// Compute the number of utiles prior to the current thread's starting
 	// point. This is the sum of all n_ut_for_me for all thread ids less
@@ -228,7 +227,7 @@ dim_t bli_thread_range_tlb_l
 	// n_ut_pt_left.
 	const dim_t n_ut_before = tid * n_ut_per_thr + bli_min( tid, n_ut_pt_left );
 
-	PGUARD printf( "n_ut_before:        %7ld\n", n_ut_before );
+	PGUARD printf( "n_ut_before:        %7ld\n", (long) n_ut_before );
 	PGUARD printf( "---------------------------\n" );
 
 	//
@@ -251,13 +250,14 @@ dim_t bli_thread_range_tlb_l
 
 		const dim_t ut_index_rect_st = n_ut_before;
 
-		PGUARD printf( "ut_index_st:        %7ld\n", ut_index_rect_st );
+		PGUARD printf( "ut_index_st:        %7ld\n", (long) ut_index_rect_st );
 		PGUARD printf( "---------------------------\n" );
 
 		j_st = ut_index_rect_st / n_ut_per_col;
 		i_st = ut_index_rect_st % n_ut_per_col;
 
-		PGUARD printf( "j_st, i_st (fnl=)      %4ld,%4ld\n", j_st, i_st );
+		PGUARD printf( "j_st, i_st (fnl=)      %4ld,%4ld\n",
+		               (long) j_st, (long) i_st );
 	}
 	else // if ( n_ut_rect <= n_ut_before )
 	{
@@ -274,19 +274,19 @@ dim_t bli_thread_range_tlb_l
 		// advance past to get to the diagonal region.
 		const dim_t n_ut_col_adv = offn_ut_nonrect;
 
-		PGUARD printf( "n_ut_col_adv:       %7ld\n", n_ut_col_adv );
+		PGUARD printf( "n_ut_col_adv:       %7ld\n", (long) n_ut_col_adv );
 
 		// In order to find j_st and i_st, we need to "allocate" n_ut_before
 		// microtiles.
 		dim_t n_ut_tba = n_ut_before;
 
-		PGUARD printf( "n_ut_tba:           %7ld\n", n_ut_tba );
+		PGUARD printf( "n_ut_tba:           %7ld\n", (long) n_ut_tba );
 
 		// Advance past the rectangular region, decrementing n_ut_tba
 		// accordingly.
 		n_ut_tba -= n_ut_per_col * n_ut_col_adv;
 
-		PGUARD printf( "n_ut_tba_1:         %7ld\n", n_ut_tba );
+		PGUARD printf( "n_ut_tba_1:         %7ld\n", (long) n_ut_tba );
 		PGUARD printf( "\n" );
 
 		// In case n_ut_tba == 0. Only happens when n_ut_before == n_ut_rect.
@@ -299,11 +299,11 @@ dim_t bli_thread_range_tlb_l
 			const dim_t n_ut_skip_j   = bli_max( -diagoff_j / mr, 0 );
 			const dim_t n_ut_this_col = n_ut_per_col - n_ut_skip_j;
 
-			PGUARD printf( "j:                  %7ld\n", j );
-			PGUARD printf( "diagoff_j:          %7ld\n", diagoff_j );
-			PGUARD printf( "n_ut_skip_j:        %7ld\n", n_ut_skip_j );
-			PGUARD printf( "n_ut_this_col:      %7ld\n", n_ut_this_col );
-			PGUARD printf( "n_ut_tba_j0:        %7ld\n", n_ut_tba );
+			PGUARD printf( "j:                  %7ld\n", (long) j );
+			PGUARD printf( "diagoff_j:          %7ld\n", (long) diagoff_j );
+			PGUARD printf( "n_ut_skip_j:        %7ld\n", (long) n_ut_skip_j );
+			PGUARD printf( "n_ut_this_col:      %7ld\n", (long) n_ut_this_col );
+			PGUARD printf( "n_ut_tba_j0:        %7ld\n", (long) n_ut_tba );
 
 			if ( n_ut_tba < n_ut_this_col )
 			{
@@ -313,7 +313,8 @@ dim_t bli_thread_range_tlb_l
 				// intersects the diagonal and then add n_ut_tba.
 				j_st = j;
 				i_st = n_ut_skip_j + n_ut_tba;
-				PGUARD printf( "j_st, i_st (fnl<)      %4ld,%4ld\n", j_st, i_st );
+				PGUARD printf( "j_st, i_st (fnl<)      %4ld,%4ld\n",
+				               (long)  j_st, (long) i_st );
 			}
 			else if ( n_ut_tba == n_ut_this_col )
 			{
@@ -326,7 +327,8 @@ dim_t bli_thread_range_tlb_l
 
 				j_st = j + 1;
 				i_st = n_ut_skip_jp1;
-				PGUARD printf( "j_st, i_st (fnl=)      %4ld,%4ld\n", j_st, i_st );
+				PGUARD printf( "j_st, i_st (fnl=)      %4ld,%4ld\n",
+				               (long) j_st, (long) i_st );
 			}
 
 			// No matter what (especially if the number of utiles to allocate
@@ -336,7 +338,7 @@ dim_t bli_thread_range_tlb_l
 			// zero (or less), in which case this will be the final iteration.)
 			n_ut_tba -= n_ut_this_col;
 
-			PGUARD printf( "n_ut_tba_j1:        %7ld\n", n_ut_tba );
+			PGUARD printf( "n_ut_tba_j1:        %7ld\n", (long) n_ut_tba );
 			PGUARD printf( "\n" );
 		}
 	}
@@ -350,7 +352,7 @@ dim_t bli_thread_range_tlb_l
 
 	#ifdef PRINT_RESULT
 	printf( "j_st, i_st (mem)       %4ld,%4ld  (n_ut: %4ld)\n",
-	        j_st, i_st, n_ut_for_me );
+	        (long) j_st, (long) i_st, (long) n_ut_for_me );
 	#endif
 
 	// Return the number of utiles that this thread was allocated.
@@ -414,11 +416,11 @@ dim_t bli_thread_range_tlb_u
 	const dim_t offn_ut_rect    = n_iter + ( diagoffmin / nr );
 
 	PGUARD printf( "---------------------------\n" );
-	PGUARD printf( "max(diagoff,-m):    %7ld\n", diagoffmin );
-	PGUARD printf( "offn_ut_rect:       %7ld\n", offn_ut_rect );
-	PGUARD printf( "offn_rect:          %7ld\n", offn_rect );
-	PGUARD printf( "diagoff_nonrect:    %7ld\n", diagoff_nonrect );
-	PGUARD printf( "n_nonrect:          %7ld\n", n_nonrect );
+	PGUARD printf( "max(diagoff,-m):    %7ld\n", (long) diagoffmin );
+	PGUARD printf( "offn_ut_rect:       %7ld\n", (long) offn_ut_rect );
+	PGUARD printf( "offn_rect:          %7ld\n", (long) offn_rect );
+	PGUARD printf( "diagoff_nonrect:    %7ld\n", (long) diagoff_nonrect );
+	PGUARD printf( "n_nonrect:          %7ld\n", (long) n_nonrect );
 	PGUARD printf( "---------------------------\n" );
 
 	dim_t num_unref_ut = 0;
@@ -443,10 +445,10 @@ dim_t bli_thread_range_tlb_u
 
 		num_unref_ut += num_unref_ut_j;
 
-		PGUARD printf( "j                   %7ld\n", j );
-		PGUARD printf( "diagoff_j - nr      %7ld\n", diagoff_j - nr );
-		PGUARD printf( "num_unref_ut_j      %7ld\n", num_unref_ut_j );
-		PGUARD printf( "num_unref_ut        %7ld\n", num_unref_ut );
+		PGUARD printf( "j                   %7ld\n", (long) j );
+		PGUARD printf( "diagoff_j - nr      %7ld\n", (long) ( diagoff_j - nr ) );
+		PGUARD printf( "num_unref_ut_j      %7ld\n", (long) num_unref_ut_j );
+		PGUARD printf( "num_unref_ut        %7ld\n", (long) num_unref_ut );
 		PGUARD printf( "\n" );
 	}
 	PGUARD printf( "---------------------------\n" );
@@ -455,12 +457,12 @@ dim_t bli_thread_range_tlb_u
 	const dim_t tri_ref_area   = nonrect_area - tri_unref_area;
 	const dim_t total_ref_area = rect_area + tri_ref_area;
 
-	PGUARD printf( "gross area:         %7ld\n", m * n );
-	PGUARD printf( "rect_area:          %7ld\n", rect_area );
-	PGUARD printf( "nonrect_area:       %7ld\n", nonrect_area );
-	PGUARD printf( "tri_unref_area:     %7ld\n", tri_unref_area );
-	PGUARD printf( "tri_ref_area:       %7ld\n", tri_ref_area );
-	PGUARD printf( "total_ref_area:     %7ld\n", total_ref_area );
+	PGUARD printf( "gross area:         %7ld\n", (long) ( m * n ) );
+	PGUARD printf( "rect_area:          %7ld\n", (long) rect_area );
+	PGUARD printf( "nonrect_area:       %7ld\n", (long) nonrect_area );
+	PGUARD printf( "tri_unref_area:     %7ld\n", (long) tri_unref_area );
+	PGUARD printf( "tri_ref_area:       %7ld\n", (long) tri_ref_area );
+	PGUARD printf( "total_ref_area:     %7ld\n", (long) total_ref_area );
 	PGUARD printf( "---------------------------\n" );
 
 	//
@@ -471,8 +473,8 @@ dim_t bli_thread_range_tlb_u
 	const dim_t n_ut_tri_ref  = tri_ref_area   / ( mr * nr );
 	//const dim_t n_ut_rect     = rect_area      / ( mr * nr );
 
-	PGUARD printf( "n_ut_ref:           %7ld\n", n_ut_ref );
-	PGUARD printf( "n_ut_tri_ref:       %7ld\n", n_ut_tri_ref );
+	PGUARD printf( "n_ut_ref:           %7ld\n", (long) n_ut_ref );
+	PGUARD printf( "n_ut_tri_ref:       %7ld\n", (long) n_ut_tri_ref );
 	//PGUARD printf( "n_ut_rect:          %7ld\n", n_ut_rect );
 	PGUARD printf( "---------------------------\n" );
 
@@ -481,20 +483,20 @@ dim_t bli_thread_range_tlb_u
 	const dim_t n_ut_per_thr = n_ut_ref / nt;
 	const dim_t n_ut_pt_left = n_ut_ref % nt;
 
-	PGUARD printf( "n_ut_per_thr:       %7ld\n", n_ut_per_thr );
-	PGUARD printf( "n_ut_pt_left:       %7ld\n", n_ut_pt_left );
+	PGUARD printf( "n_ut_per_thr:       %7ld\n", (long) n_ut_per_thr );
+	PGUARD printf( "n_ut_pt_left:       %7ld\n", (long) n_ut_pt_left );
 	PGUARD printf( "---------------------------\n" );
 
 	const dim_t n_ut_per_col = m_iter;
 
-	PGUARD printf( "n_ut_per_col:       %7ld\n", n_ut_per_col );
+	PGUARD printf( "n_ut_per_col:       %7ld\n", (long) n_ut_per_col );
 
 	// Allocate one of the leftover microtiles to the current thread if its
 	// tid is one of the lower thread ids.
 	const dim_t n_ut_for_me = n_ut_per_thr + ( tid < n_ut_pt_left ? 1 : 0 );
 
-	PGUARD printf( "n_ut_for_me:        %7ld (%ld+%ld)\n", n_ut_for_me,
-	               n_ut_per_thr, n_ut_for_me - n_ut_per_thr );
+	PGUARD printf( "n_ut_for_me:        %7ld (%ld+%ld)\n", (long) n_ut_for_me,
+	               (long) n_ut_per_thr, (long) ( n_ut_for_me - n_ut_per_thr ) );
 
 	// Compute the number of utiles prior to the current thread's starting
 	// point. This is the sum of all n_ut_for_me for all thread ids less
@@ -503,7 +505,7 @@ dim_t bli_thread_range_tlb_u
 	// n_ut_pt_left.
 	const dim_t n_ut_before = tid * n_ut_per_thr + bli_min( tid, n_ut_pt_left );
 
-	PGUARD printf( "n_ut_before:        %7ld\n", n_ut_before );
+	PGUARD printf( "n_ut_before:        %7ld\n", (long) n_ut_before );
 	PGUARD printf( "---------------------------\n" );
 
 	//
@@ -526,13 +528,14 @@ dim_t bli_thread_range_tlb_u
 
 		const dim_t ut_index_rect_st = n_ut_before - n_ut_tri_ref;
 
-		PGUARD printf( "ut_index_rect_st:   %7ld\n", ut_index_rect_st );
+		PGUARD printf( "ut_index_rect_st:   %7ld\n", (long) ut_index_rect_st );
 		PGUARD printf( "---------------------------\n" );
 
 		j_st = offn_ut_rect + ut_index_rect_st / n_ut_per_col;
 		i_st =                ut_index_rect_st % n_ut_per_col;
 
-		PGUARD printf( "j_st, i_st (fnl=)      %4ld,%4ld\n", j_st, i_st );
+		PGUARD printf( "j_st, i_st (fnl=)      %4ld,%4ld\n",
+		               (long) j_st, (long) i_st );
 	}
 	else // if ( n_ut_before < n_ut_tri_ref )
 	{
@@ -549,19 +552,19 @@ dim_t bli_thread_range_tlb_u
 		// advance past to get to the diagonal region.
 		const dim_t n_ut_col_adv = 0;
 
-		PGUARD printf( "n_ut_col_adv:       %7ld\n", n_ut_col_adv );
+		PGUARD printf( "n_ut_col_adv:       %7ld\n", (long) n_ut_col_adv );
 
 		// In order to find j_st and i_st, we need to "allocate" n_ut_before
 		// microtiles.
 		dim_t n_ut_tba = n_ut_before;
 
-		PGUARD printf( "n_ut_tba:           %7ld\n", n_ut_tba );
+		PGUARD printf( "n_ut_tba:           %7ld\n", (long) n_ut_tba );
 
 		// No need to advance since the upper-trapezoid begins with the
 		// diagonal region.
 		//n_ut_tba -= 0;
 
-		PGUARD printf( "n_ut_tba_1:         %7ld\n", n_ut_tba );
+		PGUARD printf( "n_ut_tba_1:         %7ld\n", (long) n_ut_tba );
 		PGUARD printf( "\n" );
 
 		// In case n_ut_tba == 0. Only happens when n_ut_before == 0.
@@ -574,11 +577,11 @@ dim_t bli_thread_range_tlb_u
 			const dim_t n_ut_skip_j   = bli_max( ( m + diagoff_j - nr ) / mr, 0 );
 			const dim_t n_ut_this_col = n_ut_per_col - n_ut_skip_j;
 
-			PGUARD printf( "j:                  %7ld\n", j );
-			PGUARD printf( "diagoff_j:          %7ld\n", diagoff_j );
-			PGUARD printf( "n_ut_skip_j:        %7ld\n", n_ut_skip_j );
-			PGUARD printf( "n_ut_this_col:      %7ld\n", n_ut_this_col );
-			PGUARD printf( "n_ut_tba_j0:        %7ld\n", n_ut_tba );
+			PGUARD printf( "j:                  %7ld\n", (long) j );
+			PGUARD printf( "diagoff_j:          %7ld\n", (long) diagoff_j );
+			PGUARD printf( "n_ut_skip_j:        %7ld\n", (long) n_ut_skip_j );
+			PGUARD printf( "n_ut_this_col:      %7ld\n", (long) n_ut_this_col );
+			PGUARD printf( "n_ut_tba_j0:        %7ld\n", (long) n_ut_tba );
 
 			if ( n_ut_tba < n_ut_this_col )
 			{
@@ -587,7 +590,8 @@ dim_t bli_thread_range_tlb_u
 				// column. To find i_st, we simply use n_ut_tba.
 				j_st = j;
 				i_st = n_ut_tba;
-				PGUARD printf( "j_st, i_st (fnl<)      %4ld,%4ld\n", j_st, i_st );
+				PGUARD printf( "j_st, i_st (fnl<)      %4ld,%4ld\n",
+				               (long) j_st, (long) i_st );
 			}
 			else if ( n_ut_tba == n_ut_this_col )
 			{
@@ -597,7 +601,8 @@ dim_t bli_thread_range_tlb_u
 
 				j_st = j + 1;
 				i_st = 0;
-				PGUARD printf( "j_st, i_st (fnl=)      %4ld,%4ld\n", j_st, i_st );
+				PGUARD printf( "j_st, i_st (fnl=)      %4ld,%4ld\n",
+				               (long) j_st, (long) i_st );
 			}
 
 			// No matter what (especially if the number of utiles to allocate
@@ -607,7 +612,7 @@ dim_t bli_thread_range_tlb_u
 			// zero (or less), in which case this will be the final iteration.)
 			n_ut_tba -= n_ut_this_col;
 
-			PGUARD printf( "n_ut_tba_j1:        %7ld\n", n_ut_tba );
+			PGUARD printf( "n_ut_tba_j1:        %7ld\n", (long) n_ut_tba );
 			PGUARD printf( "\n" );
 		}
 	}
@@ -621,7 +626,7 @@ dim_t bli_thread_range_tlb_u
 
 	#ifdef PRINT_RESULT
 	printf( "j_st, i_st (mem)       %4ld,%4ld  (n_ut: %4ld)\n",
-	         j_st, i_st, n_ut_for_me );
+	        (long) j_st, (long) i_st, (long) n_ut_for_me );
 	#endif
 
 	// Return the number of utiles that this thread was allocated.
@@ -666,7 +671,7 @@ dim_t bli_thread_range_tlb_d
 
 	const dim_t total_ref_area = m_rect * n_rect;
 
-	PGUARD printf( "total_ref_area:     %7ld\n", total_ref_area );
+	PGUARD printf( "total_ref_area:     %7ld\n", (long) total_ref_area );
 	PGUARD printf( "---------------------------\n" );
 
 	//
@@ -675,7 +680,7 @@ dim_t bli_thread_range_tlb_d
 
 	const dim_t n_ut_ref = total_ref_area / ( mr * nr );
 
-	PGUARD printf( "n_ut_ref:           %7ld\n", n_ut_ref );
+	PGUARD printf( "n_ut_ref:           %7ld\n", (long) n_ut_ref );
 	PGUARD printf( "---------------------------\n" );
 
 	// Compute the number of microtiles to allocate per thread as well as the
@@ -683,20 +688,20 @@ dim_t bli_thread_range_tlb_d
 	const dim_t n_ut_per_thr = n_ut_ref / nt;
 	const dim_t n_ut_pt_left = n_ut_ref % nt;
 
-	PGUARD printf( "n_ut_per_thr:       %7ld\n", n_ut_per_thr );
-	PGUARD printf( "n_ut_pt_left:       %7ld\n", n_ut_pt_left );
+	PGUARD printf( "n_ut_per_thr:       %7ld\n", (long) n_ut_per_thr );
+	PGUARD printf( "n_ut_pt_left:       %7ld\n", (long) n_ut_pt_left );
 	PGUARD printf( "---------------------------\n" );
 
 	const dim_t n_ut_per_col = m_iter;
 
-	PGUARD printf( "n_ut_per_col:       %7ld\n", n_ut_per_col );
+	PGUARD printf( "n_ut_per_col:       %7ld\n", (long) n_ut_per_col );
 
 	// Allocate one of the leftover microtiles to the current thread if its
 	// tid is one of the lower thread ids.
 	const dim_t n_ut_for_me = n_ut_per_thr + ( tid < n_ut_pt_left ? 1 : 0 );
 
-	PGUARD printf( "n_ut_for_me:        %7ld (%ld+%ld)\n", n_ut_for_me,
-	               n_ut_per_thr, n_ut_for_me - n_ut_per_thr );
+	PGUARD printf( "n_ut_for_me:        %7ld (%ld+%ld)\n", (long) n_ut_for_me,
+	               (long) n_ut_per_thr, (long) ( n_ut_for_me - n_ut_per_thr ) );
 
 	// Compute the number of utiles prior to the current thread's starting
 	// point. This is the sum of all n_ut_for_me for all thread ids less
@@ -705,7 +710,7 @@ dim_t bli_thread_range_tlb_d
 	// n_ut_pt_left.
 	const dim_t n_ut_before = tid * n_ut_per_thr + bli_min( tid, n_ut_pt_left );
 
-	PGUARD printf( "n_ut_before:        %7ld\n", n_ut_before );
+	PGUARD printf( "n_ut_before:        %7ld\n", (long) n_ut_before );
 	PGUARD printf( "---------------------------\n" );
 
 	//
@@ -714,7 +719,7 @@ dim_t bli_thread_range_tlb_d
 
 	const dim_t ut_index_st = n_ut_before;
 
-	PGUARD printf( "ut_index_st:        %7ld\n", ut_index_st );
+	PGUARD printf( "ut_index_st:        %7ld\n", (long) ut_index_st );
 	PGUARD printf( "---------------------------\n" );
 
 	const dim_t j_st = ut_index_st / n_ut_per_col;
@@ -729,7 +734,7 @@ dim_t bli_thread_range_tlb_d
 
 	#ifdef PRINT_RESULT
 	printf( "j_st, i_st (mem)       %4ld,%4ld  (n_ut: %4ld)\n",
-	        j_st, i_st, n_ut_for_me );
+	        (long) j_st, (long) i_st, (long) n_ut_for_me );
 	#endif
 
 	// Return the number of utiles that this thread was allocated.
@@ -849,12 +854,12 @@ dim_t bli_thread_range_tlb_trmm_lx_impl
 	// by the k_iter given by the caller.
 
 	PGUARD printf( "---------------------------\n" );
-	PGUARD printf( "m_iter:             %7ld\n", m_iter );
-	PGUARD printf( "n_iter:             %7ld\n", n_iter );
-	PGUARD printf( "k_iter:             %7ld\n", k_iter );
-	PGUARD printf( "mr:                 %7ld\n", mr );
-	PGUARD printf( "nr:                 %7ld\n", nr );
-	PGUARD printf( "diagoff_iter:       %7ld\n", diagoff_iter );
+	PGUARD printf( "m_iter:             %7ld\n", (long) m_iter );
+	PGUARD printf( "n_iter:             %7ld\n", (long) n_iter );
+	PGUARD printf( "k_iter:             %7ld\n", (long) k_iter );
+	PGUARD printf( "mr:                 %7ld\n", (long) mr );
+	PGUARD printf( "nr:                 %7ld\n", (long) nr );
+	PGUARD printf( "diagoff_iter:       %7ld\n", (long) diagoff_iter );
 
 	dim_t uops_per_col = 0;
 
@@ -874,7 +879,7 @@ dim_t bli_thread_range_tlb_trmm_lx_impl
 		uops_per_col += k_i_iter;
 	}
 
-	PGUARD printf( "uops_per_col:       %7ld\n", uops_per_col );
+	PGUARD printf( "uops_per_col:       %7ld\n", (long) uops_per_col );
 
 	//
 	// -- Step 2: Compute key flop counts (per thread, per column, etc.) -------
@@ -889,23 +894,23 @@ dim_t bli_thread_range_tlb_trmm_lx_impl
 	const dim_t n_uops_pt_left = total_uops % nt;
 
 	PGUARD printf( "---------------------------\n" );
-	PGUARD printf( "total_uops:         %7ld\n", total_uops );
-	PGUARD printf( "n_uops_per_thr:     %7ld\n", n_uops_per_thr );
-	PGUARD printf( "n_uops_pt_left:     %7ld\n", n_uops_pt_left );
+	PGUARD printf( "total_uops:         %7ld\n", (long) total_uops );
+	PGUARD printf( "n_uops_per_thr:     %7ld\n", (long) n_uops_per_thr );
+	PGUARD printf( "n_uops_pt_left:     %7ld\n", (long) n_uops_pt_left );
 
 	//
 	// -- Step 3: Compute the starting j/i utile offset for a given tid --------
 	//
 
 	PGUARD printf( "---------------------------\n" );
-	PGUARD printf( "total_utiles:       %7ld\n", m_iter * n_iter );
+	PGUARD printf( "total_utiles:       %7ld\n", (long) ( m_iter * n_iter ) );
 	PGUARD printf( "---------------------------\n" );
 
 	dim_t j_st_cur = 0; dim_t j_en_cur = 0;
 	dim_t i_st_cur = 0; dim_t i_en_cur = 0;
 
 	PGUARD printf( "          tid %ld will start at j,i: %ld %ld\n",
-	               ( dim_t )0, j_st_cur, i_st_cur );
+	               (long) 0, (long) j_st_cur, (long) i_st_cur );
 
 	// Find the utile update that pushes uops_tba to 0 or less.
 #ifdef PRINT_MODE
@@ -920,7 +925,7 @@ dim_t bli_thread_range_tlb_trmm_lx_impl
 		      dim_t n_ut_for_me = 0;
 		      bool  done_e      = FALSE;
 
-		PGUARD printf( "tid_i: %ld  n_uops to alloc: %3ld \n", tid_i, uops_tba );
+		PGUARD printf( "tid_i: %ld  n_uops to alloc: %3ld \n", (long) tid_i, (long) uops_tba );
 
 		// This code begins allocating uops when the starting point is somewhere
 		// after the first microtile. Typically this will not be enough to
@@ -930,7 +935,7 @@ dim_t bli_thread_range_tlb_trmm_lx_impl
 		{
 			dim_t i;
 
-			//PGUARD printf( "tid_i: %ld  uops left to alloc: %2ld \n", tid_i, uops_tba );
+			//PGUARD printf( "tid_i: %ld  uops left to alloc: %2ld \n", (long) tid_i, (long) uops_tba );
 
 			for ( i = i_st_cur; i < m_iter; ++i )
 			{
@@ -943,7 +948,8 @@ dim_t bli_thread_range_tlb_trmm_lx_impl
 				uops_tba = uops_tba_new;
 
 				PGUARD printf( "tid_i: %ld  i: %2ld  (1 n_ut_cur: %ld) (uops_alloc: %ld)\n",
-				               tid_i, i, n_ut_for_me, uops_ta - uops_tba );
+				               (long) tid_i, (long) i, (long) n_ut_for_me,
+				               (long) ( uops_ta - uops_tba ) );
 
 				if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i; done_e = TRUE;
 				                           break; }
@@ -979,10 +985,12 @@ dim_t bli_thread_range_tlb_trmm_lx_impl
 			n_ut_for_me += j_inc * m_iter;
 
 			PGUARD printf( "tid_i: %ld  advanced to col: %2ld  (uops traversed: %ld)\n",
-			               tid_i, j, uops_per_col * j_inc );
+			               (long) tid_i, (long) j, (long) ( uops_per_col * j_inc ) );
 			PGUARD printf( "tid_i: %ld  j: %2ld  (  n_ut_cur: %ld) (uops_alloc: %ld)\n",
-			               tid_i, j, n_ut_for_me, uops_ta - uops_tba );
-			PGUARD printf( "tid_i: %ld  uops left to alloc: %2ld \n", tid_i, j_left );
+			               (long) tid_i, (long) j, (long) n_ut_for_me,
+			               (long) ( uops_ta - uops_tba ) );
+			PGUARD printf( "tid_i: %ld  uops left to alloc: %2ld \n",
+			               (long) tid_i, (long) j_left );
 
 			if ( uops_tba == 0 )
 			{
@@ -1019,7 +1027,8 @@ dim_t bli_thread_range_tlb_trmm_lx_impl
 					uops_tba = uops_tba_new;
 
 					PGUARD printf( "tid_i: %ld  i: %2ld  (4 n_ut_cur: %ld) (uops_alloc: %ld)\n",
-					               tid_i, i, n_ut_for_me, uops_ta - uops_tba );
+					               (long) tid_i, (long) i,
+					               (long) n_ut_for_me, (long) ( uops_ta - uops_tba ) );
 
 					if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i;
 					                           break; }
@@ -1029,7 +1038,7 @@ dim_t bli_thread_range_tlb_trmm_lx_impl
 
 
 		PGUARD printf( "tid_i: %ld         (5 n_ut_cur: %ld) (overshoot: %ld out of %ld)\n",
-		                tid_i, n_ut_for_me, -uops_tba, uops_ta );
+		               (long)  tid_i, (long) n_ut_for_me, -(long) uops_tba, (long) uops_ta );
 
 		if ( tid_i == tid )
 		{
@@ -1045,9 +1054,10 @@ dim_t bli_thread_range_tlb_trmm_lx_impl
 		if ( i_st_cur == m_iter ) { j_st_cur += 1; i_st_cur = 0; }
 
 		PGUARD printf( "tid_i: %ld         (6 n_ut_cur: %ld)\n",
-		               tid_i, n_ut_for_me );
+		               (long) tid_i, (long) n_ut_for_me );
 		PGUARD printf( "tid_i: %ld  tid %ld will start at j,i: %ld %ld\n",
-		               tid_i, tid_i + 1, j_st_cur, i_st_cur );
+		               (long) tid_i, (long) tid_i + 1,
+		               (long) j_st_cur, (long) i_st_cur );
 		PGUARD printf( "---------------------------\n" );
 	}
 
@@ -1065,7 +1075,8 @@ dim_t bli_thread_range_tlb_trmm_lx_impl
 	*i_st_p = i_st_cur;
 
 	PGUARD printf( "tid_i: %ld         (7 n_ut_for_me: %ld) (j,i_st: %ld %ld)\n",
-	               tid, n_ut_for_me, j_st_cur, i_st_cur );
+	               (long) tid, (long) n_ut_for_me,
+	               (long) j_st_cur, (long) i_st_cur );
 
 	return n_ut_for_me;
 #else
@@ -1263,14 +1274,14 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 	const dim_t offn_ut_nonrect = diagoffmin_iter;
 
 	PGUARD printf( "---------------------------\n" );
-	PGUARD printf( "m_iter:             %7ld\n", m_iter );
-	PGUARD printf( "k_iter:             %7ld\n", k_iter );
-	PGUARD printf( "n_iter:             %7ld\n", n_iter );
-	PGUARD printf( "min(diagoff_it,n):  %7ld\n", diagoffmin_iter );
-	PGUARD printf( "offn_ut_nonrect:    %7ld\n", offn_ut_nonrect );
-	PGUARD printf( "offn_nonrect:       %7ld\n", offn_nonrect );
-	PGUARD printf( "diagoff_nonrect:    %7ld\n", diagoff_nonrect );
-	PGUARD printf( "n_nonrect:          %7ld\n", n_nonrect );
+	PGUARD printf( "m_iter:             %7ld\n", (long) m_iter );
+	PGUARD printf( "k_iter:             %7ld\n", (long) k_iter );
+	PGUARD printf( "n_iter:             %7ld\n", (long) n_iter );
+	PGUARD printf( "min(diagoff_it,n):  %7ld\n", (long) diagoffmin_iter );
+	PGUARD printf( "offn_ut_nonrect:    %7ld\n", (long) offn_ut_nonrect );
+	PGUARD printf( "offn_nonrect:       %7ld\n", (long) offn_nonrect );
+	PGUARD printf( "diagoff_nonrect:    %7ld\n", (long) diagoff_nonrect );
+	PGUARD printf( "n_nonrect:          %7ld\n", (long) n_nonrect );
 	PGUARD printf( "---------------------------\n" );
 
 	const dim_t num_unref_ut0 = n_nonrect * ( n_nonrect - 1 ) / 2;
@@ -1283,16 +1294,16 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 	const dim_t tri_ref_vol    = tri_ref_area * m_iter;
 	const dim_t total_vol      = total_ref_area * m_iter;
 
-	PGUARD printf( "gross_area:         %7ld\n", gross_area );
-	PGUARD printf( "nonrect_area:       %7ld\n", nonrect_area );
-	PGUARD printf( "tri_unref_area:     %7ld\n", tri_unref_area );
-	PGUARD printf( "rect_area:          %7ld\n", rect_area );
-	PGUARD printf( "tri_ref_area:       %7ld\n", tri_ref_area );
-	PGUARD printf( "total_ref_area:     %7ld\n", total_ref_area );
+	PGUARD printf( "gross_area:         %7ld\n", (long) gross_area );
+	PGUARD printf( "nonrect_area:       %7ld\n", (long) nonrect_area );
+	PGUARD printf( "tri_unref_area:     %7ld\n", (long) tri_unref_area );
+	PGUARD printf( "rect_area:          %7ld\n", (long) rect_area );
+	PGUARD printf( "tri_ref_area:       %7ld\n", (long) tri_ref_area );
+	PGUARD printf( "total_ref_area:     %7ld\n", (long) total_ref_area );
 	PGUARD printf( "---------------------------\n" );
-	PGUARD printf( "rect_vol (uops):    %7ld\n", rect_vol );
-	PGUARD printf( "tri_ref_vol (uops): %7ld\n", tri_ref_vol );
-	PGUARD printf( "total_vol (uops):   %7ld\n", total_vol );
+	PGUARD printf( "rect_vol (uops):    %7ld\n", (long) rect_vol );
+	PGUARD printf( "tri_ref_vol (uops): %7ld\n", (long) tri_ref_vol );
+	PGUARD printf( "total_vol (uops):   %7ld\n", (long) total_vol );
 	PGUARD printf( "---------------------------\n" );
 
 	//
@@ -1308,14 +1319,14 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 	const dim_t n_uops_per_thr = total_uops / nt;
 	const dim_t n_uops_pt_left = total_uops % nt;
 
-	PGUARD printf( "n_threads:          %7ld\n", nt );
-	PGUARD printf( "n_uops_per_thr:     %7ld\n", n_uops_per_thr );
-	PGUARD printf( "n_uops_pt_left:     %7ld\n", n_uops_pt_left );
+	PGUARD printf( "n_threads:          %7ld\n", (long) nt );
+	PGUARD printf( "n_uops_per_thr:     %7ld\n", (long) n_uops_per_thr );
+	PGUARD printf( "n_uops_pt_left:     %7ld\n", (long) n_uops_pt_left );
 	PGUARD printf( "---------------------------\n" );
 
 	const dim_t uops_per_col_rect = m_iter * k_iter;
 
-	PGUARD printf( "uops_per_col_rect:  %7ld\n", uops_per_col_rect );
+	PGUARD printf( "uops_per_col_rect:  %7ld\n", (long) uops_per_col_rect );
 
 	// Allocate one of the leftover uops to the current thread if its tid is
 	// one of the lower thread ids.
@@ -1329,7 +1340,7 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 	//
 
 	PGUARD printf( "---------------------------\n" );
-	PGUARD printf( "total_utiles:       %7ld\n", m_iter * n_iter );
+	PGUARD printf( "total_utiles:       %7ld\n", (long) ( m_iter * n_iter ) );
 	PGUARD printf( "---------------------------\n" );
 
 	dim_t j_st_cur = 0; dim_t j_en_cur = 0;
@@ -1349,9 +1360,11 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 		      bool  done_e      = FALSE;
 		      bool  search_tri  = FALSE;
 
-		PGUARD printf( "tid_i: %ld  n_uops_ta:    %3ld \n", tid_i, uops_tba );
+		PGUARD printf( "tid_i: %ld  n_uops_ta:    %3ld \n",
+		               (long) tid_i, (long) uops_tba );
 		PGUARD printf( "tid_i: %ld  j: %2ld  (  n_ut_cur: %ld) (uops_alloc: %ld)\n",
-		                tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+		               (long) tid_i, (long) j, (long) n_ut_for_me,
+		               (long) ( uops_ta - uops_tba ) );
 
 		// This code begins allocating uops when the starting point is somewhere
 		// after the first microtile. Typically this will not be enough to
@@ -1381,7 +1394,8 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 				uops_tba = uops_tba_new;
 
 				PGUARD printf( "tid_i: %ld  i: %2ld  (0 n_ut_cur: %ld) (uops_alloc: %ld) (k_iter_j: %ld)\n",
-				                tid_i, i, n_ut_for_me, uops_ta - uops_tba, k_iter_j );
+				               (long) tid_i, (long) i, (long) n_ut_for_me,
+				               (long) ( uops_ta - uops_tba ), (long) k_iter_j );
 
 				if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i; done_e = TRUE;
 				                           break; }
@@ -1428,7 +1442,8 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 			}
 
 			PGUARD printf( "tid_i: %ld  i: %2ld  (* n_ut_cur: %ld) (uops_alloc: %ld)\n",
-						   tid_i, i-1, n_ut_for_me, uops_ta - uops_tba );
+			               (long) tid_i, (long) i-1, (long) n_ut_for_me,
+			               (long) ( uops_ta - uops_tba ) );
 
 			// If we allocated all utiles in the column (regardless of whether we finished
 			// allocating utiles for the current thread), increment j to the next column,
@@ -1442,7 +1457,8 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 				if ( j == n_iter ) { j_en_cur = j - 1; i_en_cur = m_iter - 1; done_e = TRUE; }
 
 				PGUARD printf( "tid_i: %ld  j: %2ld  (! n_ut_cur: %ld) (uops_alloc: %ld)\n",
-							   tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+				               (long) tid_i, (long) j, (long) n_ut_for_me,
+				               (long) ( uops_ta - uops_tba ) );
 			}
 
 			#endif
@@ -1484,10 +1500,12 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 				n_ut_for_me += j_inc * m_iter;
 
 				PGUARD printf( "tid_i: %ld  advanced to col: %2ld  (uops traversed: %ld)\n",
-				               tid_i, j, uops_per_col_rect * j_inc );
+				               (long) tid_i, (long) j, (long) ( uops_per_col_rect * j_inc ) );
 				PGUARD printf( "tid_i: %ld  j: %2ld  (1 n_ut_cur: %ld) (uops_alloc: %ld)\n",
-				                tid_i, j, n_ut_for_me, uops_ta - uops_tba );
-				PGUARD printf( "tid_i: %ld  uops left to alloc: %2ld \n", tid_i, j_left );
+				               (long) tid_i, (long) j, (long) n_ut_for_me,
+				               (long) ( uops_ta - uops_tba ) );
+				PGUARD printf( "tid_i: %ld  uops left to alloc: %2ld \n",
+				               (long) tid_i, (long) j_left );
 
 				if ( uops_tba == 0 )
 				{
@@ -1499,7 +1517,8 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 					search_tri = FALSE;
 
 					PGUARD printf( "tid_i: %ld  j: %2ld  (2 n_ut_cur: %ld) (uops_alloc: %ld)\n",
-					               tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+					               (long) tid_i, (long) j, (long) n_ut_for_me,
+					               (long) ( uops_ta - uops_tba ) );
 				}
 				else if ( j >  n_iter ) bli_abort(); // Safety check; should never execute.
 				else if ( j == n_iter )
@@ -1512,7 +1531,8 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 					// fewer uops for the last thread.)
 					search_tri = FALSE;
 					PGUARD printf( "tid_i: %ld  j: %2ld  (3 n_ut_cur: %ld) (uops_alloc: %ld)\n",
-					               tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+					               (long) tid_i, (long) j, (long) n_ut_for_me,
+					               (long) ( uops_ta - uops_tba ) );
 				}
 				else if ( j < diagoff_iter )
 				{
@@ -1524,7 +1544,8 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 					// over the top.
 					search_tri = FALSE;
 					PGUARD printf( "tid_i: %ld  j: %2ld  (4 n_ut_cur: %ld) (uops_alloc: %ld)\n",
-					               tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+					               (long) tid_i, (long) j, (long)  n_ut_for_me,
+					               (long) ( uops_ta - uops_tba ) );
 				}
 				else // if ( 0 < uops_tba && j == diagoff_iter && j < n_iter )
 				{
@@ -1534,17 +1555,19 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 					// region.
 					search_tri = TRUE;
 					PGUARD printf( "tid_i: %ld  j: %2ld  (5 n_ut_cur: %ld) (uops_alloc: %ld)\n",
-					               tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+					               (long) tid_i, (long) j, (long) n_ut_for_me,
+					               (long) ( uops_ta - uops_tba ) );
 				}
 			}
 			else /* if ( diagoff_iter <= j ) */
 			{
 				PGUARD printf( "tid_i: %ld  j: %2ld >= diagoff_iter: %ld\n",
-				               tid_i, j, diagoff_iter );
+				               (long) tid_i, (long) j, (long) diagoff_iter );
 				search_tri = TRUE;
 			}
 
-			PGUARD printf( "tid_i: %ld  j: %2ld  search_tri: %d\n", tid_i, j, search_tri );
+			PGUARD printf( "tid_i: %ld  j: %2ld  search_tri: %ld\n", (long) tid_i,
+			               (long) j, (long) search_tri );
 
 			if ( search_tri )
 			{
@@ -1557,12 +1580,13 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 					const dim_t n_uops_j = k_iter_j * m_iter;
 
 					PGUARD printf( "tid_i: %ld  j: %2ld  (6 n_ut_cur: %ld) (uops_alloc: %ld) (n_uops_j: %ld)\n",
-					               tid_i, j, n_ut_for_me, uops_ta - uops_tba, n_uops_j );
+					               (long) tid_i, (long) j, (long) n_ut_for_me,
+					               (long) ( uops_ta - uops_tba ), (long) n_uops_j );
 
 					if ( uops_tba == 0 )
 					{
 						PGUARD printf( "tid_i: %ld  j: %2ld  (7 n_ut_cur: %ld) (uops_alloc: %ld)\n",
-						               tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+						               (long) tid_i, (long) j, (long) n_ut_for_me, (long) ( uops_ta - uops_tba ) );
 						// If advancing over the previous column allocated all of
 						// our uops, then designate the last iteration of the
 						// previous column as the end point.
@@ -1580,12 +1604,14 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 						uops_tba -= n_uops_j;
 
 						PGUARD printf( "tid_i: %ld  j: %2ld  (8 n_ut_cur: %ld) (uops_alloc: %ld)\n",
-						               tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+						               (long) tid_i, (long) j, (long) n_ut_for_me,
+						               (long) ( uops_ta - uops_tba ) );
 					}
 					else // if ( uops_tba < n_uops_j )
 					{
 						PGUARD printf( "tid_i: %ld  j: %2ld  (9 n_ut_cur: %ld) (uops_alloc: %ld)\n",
-						               tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+						               (long) tid_i, (long) j, (long) n_ut_for_me,
+						               (long) ( uops_ta - uops_tba ) );
 						// If we can finish allocating all the remaining uops
 						// with the utiles in the current column, then we break
 						// out of the loop without updating j, n_ut_for_me, or
@@ -1604,7 +1630,8 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 				const dim_t k_iter_j = bli_tlb_trmm_rl_k_iter( diagoff_iter, k_iter, j );
 
 				PGUARD printf( "tid_i: %ld  j: %2ld  (A n_ut_cur: %ld) (uops_alloc: %ld) (k_iter_j: %ld)\n",
-				               tid_i, j, n_ut_for_me, uops_ta - uops_tba, k_iter_j );
+				               (long) tid_i, (long) j, (long) n_ut_for_me,
+				               (long) uops_ta - uops_tba, (long) k_iter_j );
 
 				#if 0
 
@@ -1615,7 +1642,7 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 					const dim_t uops_tba_new = uops_tba - k_iter_j;
 					uops_tba = uops_tba_new;
 					PGUARD printf( "tid_i: %ld  i: %2ld  (B n_ut_cur: %ld) (uops_alloc: %ld)\n",
-					               tid_i, i, n_ut_for_me, uops_ta - uops_tba );
+					               (long) tid_i, (long) i, (long) n_ut_for_me, uops_ta - uops_tba );
 					if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i; break; }
 				}
 
@@ -1636,7 +1663,8 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 				j_en_cur = j; i_en_cur = i;
 
 				PGUARD printf( "tid_i: %ld  i: %2ld  (b n_ut_cur: %ld) (uops_alloc: %ld)\n",
-				               tid_i, i, n_ut_for_me, uops_ta - uops_tba );
+				               (long) tid_i, (long) i, (long) n_ut_for_me,
+				               (long) uops_ta - uops_tba );
 
 				#endif
 			}
@@ -1648,7 +1676,7 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 		}
 
 		PGUARD printf( "tid_i: %ld  done!  (C n_ut_cur: %ld) (overshoot: %ld out of %ld)\n",
-					   tid_i, n_ut_for_me, -uops_tba, uops_ta );
+		               (long) tid_i, (long) n_ut_for_me, -(long) uops_tba, (long) uops_ta );
 
 		if ( tid_i == tid )
 		{
@@ -1666,9 +1694,10 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 		if ( i_st_cur == m_iter ) { j_st_cur += 1; i_st_cur = 0; }
 
 		PGUARD printf( "tid_i: %ld         (D n_ut_cur: %ld)\n",
-					   tid_i, n_ut_for_me );
+		               (long) tid_i, (long) n_ut_for_me );
 		PGUARD printf( "tid_i: %ld  tid %ld will start at j,i: %ld %ld\n",
-					   tid_i, tid_i + 1, j_st_cur, i_st_cur );
+		               (long) tid_i, (long) tid_i + 1,
+		               (long) j_st_cur, (long) i_st_cur );
 		PGUARD printf( "---------------------------\n" );
 	}
 
@@ -1688,7 +1717,8 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 	*i_en_p = m_iter - 1;
 
 	PGUARD printf( "tid_i: %ld         (E n_ut_for_me: %ld) (j,i_st: %ld %ld)\n",
-	               tid, n_ut_for_me, j_st_cur, i_st_cur );
+	               (long) tid, (long) n_ut_for_me,
+	               (long) j_st_cur, (long) i_st_cur );
 
 	return n_ut_for_me;
 #else
@@ -1696,4 +1726,3 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 	return -1;
 #endif
 }
-

From e730c685d09336b3bd09e86c94330c4eba967f3e Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 6 Feb 2023 15:31:54 -0600
Subject: [PATCH 128/230] Define `BLIS_VERSION_STRING` in `blis.h`. (#720)

Details:
- Previously, the version string was communicated from configure to
  config.mk (via the config.mk.in template), where it was included via
  the top-level Makefile, where it was then used to define the
  preprocessor macro BLIS_VERSION_STRING via a command line argument to
  the compiler (via -D). This macro is then used within bli_info.c to
  initialize a static string which can then be queried via the
  bli_info_get_version_str() function. However, there are some
  applications that may find utility in being able to access the version
  string by inspecting the monolithic (flattened) blis.h header file
  that is created at compile time and installed alongside the library.
  This commit moves the definition of BLIS_VERSION_STRING into
  bli_config.h (via the bli_config.h.in template) so that it is
  embedded in blis.h. The version string is now available in three
  places:
  - the static/shared library, which is installed in the 'lib'
    subdirectory of the install prefix (query-able via the
    bli_info_get_version_str() function);
  - the config.mk makefile fragment, which is installed in the 'share'
    subdirectory of the install prefix (in the VERSION variable);
  - the blis.h header file, which is installed in the 'include'
    subdirectory of the install prefix (via the BLIS_VERSION_STRING
    macro constant).
  Thanks to Mohsen Aznaveh and Tim Davis for providing the idea for this
  change.
- CREDITS file update.
---
 CREDITS               | 132 +++++++++++++++++++++---------------------
 build/bli_config.h.in |   2 +
 common.mk             |   8 +--
 configure             |   1 +
 4 files changed, 72 insertions(+), 71 deletions(-)

diff --git a/CREDITS b/CREDITS
index 53904234e..628e14f58 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3,126 +3,128 @@ BLIS framework
 Acknowledgements
 ---
 
-The BLIS framework was primarily authored by
+The BLIS framework was originally authored by
 
-  Field Van Zee            @fgvanzee           (The University of Texas at Austin)
+  Field Van Zee            @fgvanzee                  (The University of Texas at Austin)
 
-but many others have contributed code and feedback, including
+but many others have contributed code, ideas, and feedback, including
 
-  Sameer Agarwal           @sandwichmaker      (Google)
-  Murtaza Ali                                  (Texas Instruments)
-  Sajid Ali                @s-sajid-ali        (Northwestern University)
+  Sameer Agarwal           @sandwichmaker             (Google)
+  Murtaza Ali                                         (Texas Instruments)
+  Sajid Ali                @s-sajid-ali               (Northwestern University)
   Erling Andersen          @erling-d-andersen
   Alex Arslan              @ararslan
-  Vernon Austel                                (IBM, T.J. Watson Research Center)
-  Satish Balay             @balay              (Argonne National Laboratory)
+  Vernon Austel                                       (IBM, T.J. Watson Research Center)
+  Mohsen Aznaveh           @Aznaveh                   (Texas A&M University)
+  Satish Balay             @balay                     (Argonne National Laboratory)
   Kihiro Bando             @bandokihiro
-  Matthew Brett            @matthew-brett      (University of Birmingham)
+  Matthew Brett            @matthew-brett             (University of Birmingham)
   Jérémie du Boisberranger @jeremiedbb
-  Jed Brown                @jedbrown           (Argonne National Laboratory)
+  Jed Brown                @jedbrown                  (Argonne National Laboratory)
   Robin Christ             @robinchrist
   Dilyn Corner             @dilyn-corner
-  Mat Cross                @matcross           (NAG)
+  Mat Cross                @matcross                  (NAG)
                            @decandia50
-  Harsh Dave               @HarshDave12        (AMD)
-  Daniël de Kok            @danieldk           (Explosion)
-  Kay Dewhurst             @jkd2016            (Max Planck Institute, Halle, Germany)
-  Jeff Diamond                                 (Oracle)
+  Harsh Dave               @HarshDave12               (AMD)
+  Tim Davis                @DrTimothyAldenDavis       (Texas A&M University)
+  Daniël de Kok            @danieldk                  (Explosion)
+  Kay Dewhurst             @jkd2016                   (Max Planck Institute, Halle, Germany)
+  Jeff Diamond                                        (Oracle)
   Johannes Dieterich       @iotamudelta
   Krzysztof Drewniak       @krzysz00
-  Marat Dukhan             @Maratyszcza        (Google)
-  Victor Eijkhout          @VictorEijkhout     (Texas Advanced Computing Center)
-  Evgeny Epifanovsky       @epifanovsky        (Q-Chem)
+  Marat Dukhan             @Maratyszcza               (Google)
+  Victor Eijkhout          @VictorEijkhout            (Texas Advanced Computing Center)
+  Evgeny Epifanovsky       @epifanovsky               (Q-Chem)
   Isuru Fernando           @isuruf
   Roman Gareev             @gareevroman
   Richard Goldschmidt      @SuperFluffy
   Chris Goodyer
   Alexander Grund          @Flamefire
-  John Gunnels             @jagunnels          (IBM, T.J. Watson Research Center)
+  John Gunnels             @jagunnels                 (IBM, T.J. Watson Research Center)
   Ali Emre Gülcü           @Lephar
-  Jeff Hammond             @jeffhammond        (Intel)
+  Jeff Hammond             @jeffhammond               (Intel)
   Jacob Gorm Hansen        @jacobgorm
-  Shivaprashanth H                             (Global Edge)
+  Shivaprashanth H                                    (Global Edge)
   Jean-Michel Hautbois     @jhautbois
   Ian Henriksen            @insertinterestingnamehere (The University of Texas at Austin)
-  Greg Henry                                   (Intel)
+  Greg Henry                                          (Intel)
   Minh Quan Ho             @hominhquan
   Matthew Honnibal         @honnibal
   Stefan Husmann           @stefanhusmann
-  Francisco Igual          @figual             (Universidad Complutense de Madrid)
+  Francisco Igual          @figual                    (Universidad Complutense de Madrid)
   Madeesh Kannan           @shadeMe
   Tony Kelman              @tkelman
-  Lee Killough             @leekillough        (Cray)
-  Mike Kistler             @mkistler           (IBM, Austin Research Laboratory)
-  Ivan Korostelev          @ivan23kor          (University of Alberta)
-  Kyungmin Lee             @kyungminlee        (Ohio State University)
+  Lee Killough             @leekillough               (Cray)
+  Mike Kistler             @mkistler                  (IBM, Austin Research Laboratory)
+  Ivan Korostelev          @ivan23kor                 (University of Alberta)
+  Kyungmin Lee             @kyungminlee               (Ohio State University)
   Michael Lehn             @michael-lehn
   Shmuel Levine            @ShmuelLevine
                            @lschork2
   Dave Love                @loveshack
-  Tze Meng Low                                 (The University of Texas at Austin)
-  Ye Luo                   @ye-luo             (Argonne National Laboratory)
-  Ricardo Magana           @magania            (Hewlett Packard Enterprise)
-  Madan mohan Manokar      @madanm3            (AMD)
+  Tze Meng Low                                        (The University of Texas at Austin)
+  Ye Luo                   @ye-luo                    (Argonne National Laboratory)
+  Ricardo Magana           @magania                   (Hewlett Packard Enterprise)
+  Madan mohan Manokar      @madanm3                   (AMD)
   Giorgos Margaritis
-  Bryan Marker             @bamarker           (The University of Texas at Austin)
-  Simon Lukas Märtens      @ACSimon33          (RWTH Aachen University)
-  Devin Matthews           @devinamatthews     (The University of Texas at Austin)
+  Bryan Marker             @bamarker                  (The University of Texas at Austin)
+  Simon Lukas Märtens      @ACSimon33                 (RWTH Aachen University)
+  Devin Matthews           @devinamatthews            (The University of Texas at Austin)
   Stefanos Mavros          @smavros
-  Mithun Mohan             @MithunMohanKadavil (AMD)
+  Mithun Mohan             @MithunMohanKadavil        (AMD)
                            @moon-chilled
   Ilknur Mustafazade       @Runkli
                            @nagsingh
-  Bhaskar Nallani          @BhaskarNallani     (AMD)
-  Stepan Nassyr            @stepannassyr       (Jülich Supercomputing Centre)
+  Bhaskar Nallani          @BhaskarNallani            (AMD)
+  Stepan Nassyr            @stepannassyr              (Jülich Supercomputing Centre)
   Nisanth M P              @nisanthmp
-  Nisanth Padinharepatt                        (AMD)
+  Nisanth Padinharepatt                               (AMD)
   Ajay Panyala             @ajaypanyala
-  Marc-Antoine Parent      @maparent           (Conversence)
-  Devangi Parikh           @dnparikh           (The University of Texas at Austin)
-  Elmar Peise              @elmar-peise        (RWTH-Aachen)
+  Marc-Antoine Parent      @maparent                  (Conversence)
+  Devangi Parikh           @dnparikh                  (The University of Texas at Austin)
+  Elmar Peise              @elmar-peise               (RWTH-Aachen)
   Clément Pernet           @ClementPernet
   Ilya Polkovnichenko
-  Jack Poulson             @poulson            (Stanford)
+  Jack Poulson             @poulson                   (Stanford)
   Mathieu Poumeyrol        @kali
-  Christos Psarras         @ChrisPsa           (RWTH Aachen University)
+  Christos Psarras         @ChrisPsa                  (RWTH Aachen University)
                            @pkubaj
                            @qnerd
   Michael Rader            @mrader1248
-  Pradeep Rao              @pradeeptrgit       (AMD)
+  Pradeep Rao              @pradeeptrgit              (AMD)
   Aleksei Rechinskii
-  Leick Robinson           @LeickR             (Oracle)
+  Leick Robinson           @LeickR                    (Oracle)
   Karl Rupp                @karlrupp
-  Martin Schatz                                (The University of Texas at Austin)
+  Martin Schatz                                       (The University of Texas at Austin)
   Nico Schlömer            @nschloe
   Rene Sitt
-  Tony Skjellum            @tonyskjellum       (The University of Tennessee at Chattanooga)
-  Mikhail Smelyanskiy                          (Intel, Parallel Computing Lab)
+  Tony Skjellum            @tonyskjellum              (The University of Tennessee at Chattanooga)
+  Mikhail Smelyanskiy                                 (Intel, Parallel Computing Lab)
   Nathaniel Smith          @njsmith
   Shaden Smith             @ShadenSmith
-  Tyler Smith              @tlrmchlsmth        (The University of Texas at Austin)
+  Tyler Smith              @tlrmchlsmth               (The University of Texas at Austin)
   Snehith                  @ArcadioN09
-  Paul Springer            @springer13         (RWTH Aachen University)
-  Adam J. Stewart          @adamjstewart       (University of Illinois at Urbana-Champaign)
+  Paul Springer            @springer13                (RWTH Aachen University)
+  Adam J. Stewart          @adamjstewart              (University of Illinois at Urbana-Champaign)
   Vladimir Sukarev
-  Harihara Sudhan S        @ihariharasudhan    (AMD)
+  Harihara Sudhan S        @ihariharasudhan           (AMD)
   Chengguo Sun             @chengguosun
-  Santanu Thangaraj                            (AMD)
-  Nicholai Tukanov         @nicholaiTukanov    (The University of Texas at Austin)
-  Rhys Ulerich             @RhysU              (The University of Texas at Austin)
-  Robert van de Geijn      @rvdg               (The University of Texas at Austin)
-  Meghana Vankadari        @Meghana-vankadari  (AMD)
-  Kiran Varaganti          @kvaragan           (AMD)
-  Natalia Vassilieva                           (Hewlett Packard Enterprise)
+  Santanu Thangaraj                                   (AMD)
+  Nicholai Tukanov         @nicholaiTukanov           (The University of Texas at Austin)
+  Rhys Ulerich             @RhysU                     (The University of Texas at Austin)
+  Robert van de Geijn      @rvdg                      (The University of Texas at Austin)
+  Meghana Vankadari        @Meghana-vankadari         (AMD)
+  Kiran Varaganti          @kvaragan                  (AMD)
+  Natalia Vassilieva                                  (Hewlett Packard Enterprise)
                            @h-vetinari
-  Andrew Wildman           @awild82            (University of Washington)
-  Zhang Xianyi             @xianyi             (Chinese Academy of Sciences)
+  Andrew Wildman           @awild82                   (University of Washington)
+  Zhang Xianyi             @xianyi                    (Chinese Academy of Sciences)
   Benda Xu                 @heroxbd
-  Guodong Xu               @docularxu          (Linaro.org)
-  RuQing Xu                @xrq-phys           (The University of Tokyo)
+  Guodong Xu               @docularxu                 (Linaro.org)
+  RuQing Xu                @xrq-phys                  (The University of Tokyo)
   Costas Yamin             @cosstas
-  Chenhan Yu               @ChenhanYu          (The University of Texas at Austin)
-  Roman Yurchak            @rth                (Symerio)
+  Chenhan Yu               @ChenhanYu                 (The University of Texas at Austin)
+  Roman Yurchak            @rth                       (Symerio)
   Stefano Zampini          @stefanozampini
   M. Zhou                  @cdluminate
 
diff --git a/build/bli_config.h.in b/build/bli_config.h.in
index 7dc67059f..9636278d9 100644
--- a/build/bli_config.h.in
+++ b/build/bli_config.h.in
@@ -45,6 +45,8 @@
 // Enabled kernel sets (kernel_list)
 @kernel_list_defines@
 
+#define BLIS_VERSION_STRING "@version@"
+
 #if @enable_system@
 #define BLIS_ENABLE_SYSTEM
 #else
diff --git a/common.mk b/common.mk
index 6b7403afb..b9f5bf750 100644
--- a/common.mk
+++ b/common.mk
@@ -101,7 +101,7 @@ get-noopt-cflags-for     = $(strip $(CFLAGS_PRESET) \
                                    $(call load-var-for,CLANGFLAGS,$(1)) \
                                    $(call load-var-for,CPPROCFLAGS,$(1)) \
                                    $(CTHREADFLAGS) \
-                                   $(CINCFLAGS) $(VERS_DEF) \
+                                   $(CINCFLAGS) \
                             )
 
 get-noopt-cxxflags-for   = $(strip $(CFLAGS_PRESET) \
@@ -113,7 +113,7 @@ get-noopt-cxxflags-for   = $(strip $(CFLAGS_PRESET) \
                                    $(call load-var-for,CPPROCFLAGS,$(1)) \
                                    $(CTHREADFLAGS) \
                                    $(CXXTHREADFLAGS) \
-                                   $(CINCFLAGS) $(VERS_DEF) \
+                                   $(CINCFLAGS) \
                             )
 
 get-refinit-cflags-for   = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
@@ -1232,10 +1232,6 @@ BLIS_CONFIG_H   := ./bli_config.h
 # --- Special preprocessor macro definitions -----------------------------------
 #
 
-# Define a C preprocessor macro to communicate the current version so that it
-# can be embedded into the library and queried later.
-VERS_DEF       := -DBLIS_VERSION_STRING=\"$(VERSION)\"
-
 # Define a C preprocessor flag that is *only* defined when BLIS is being
 # compiled. (In other words, an application that #includes blis.h will not
 # get this cpp macro.)
diff --git a/configure b/configure
index a89225107..d45b0ba9d 100755
--- a/configure
+++ b/configure
@@ -4194,6 +4194,7 @@ main()
 		| perl -pe "s/\@config_name_define\@/${config_name_define}/g" \
 		| perl -pe "s/\@config_list_defines\@/${config_list_defines}/g" \
 		| perl -pe "s/\@kernel_list_defines\@/${kernel_list_defines}/g" \
+		| sed   -e "s/@version@/${version_esc}/g" \
 		| sed   -e "s/@enable_system@/${enable_system_01}/g" \
 		| sed   -e "s/@enable_openmp@/${enable_openmp_01}/g" \
 		| sed   -e "s/@enable_openmp_as_def@/${enable_openmp_as_def_01}/g" \

From e3d352f1fcc93e6a46fde1aa4a7f0a18fb27bd42 Mon Sep 17 00:00:00 2001
From: Nisanth M P <nisanthmp.01@gmail.com>
Date: Wed, 8 Feb 2023 06:11:41 +0530
Subject: [PATCH 129/230] Added runtime selection of 'power' config family.
 (#718)

Details:
- Created a 'power' umbrella configuration family, which, when targeted
  at configure-time, will build both 'power9' and 'power10' subconfigs.
  (With this feature, a BLIS shared library could be compiled on a
  power9 system and run on power10 and vice-versa. Unoptimised code
  will execute if it is linked and run on any other generic system.)
- This new configuration family will only work with gcc, since that is
  the only compiler supported by both power9 and power10 subconfigs in
  BLIS.
- Documented power9 and power10 as supported microarchitectures in the
  docs/HardwareSupport.md document.
---
 config/power/bli_family_power.h | 41 +++++++++++++++++
 config/power/make_defs.mk       | 82 +++++++++++++++++++++++++++++++++
 config_registry                 |  1 +
 docs/HardwareSupport.md         |  2 +
 frame/base/bli_arch.c           |  3 +-
 frame/include/bli_arch_config.h |  7 ++-
 6 files changed, 134 insertions(+), 2 deletions(-)
 create mode 100644 config/power/bli_family_power.h
 create mode 100644 config/power/make_defs.mk

diff --git a/config/power/bli_family_power.h b/config/power/bli_family_power.h
new file mode 100644
index 000000000..21b44db87
--- /dev/null
+++ b/config/power/bli_family_power.h
@@ -0,0 +1,41 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_FAMILY_H
+//#define BLIS_FAMILY_H
+
+
+
+//#endif
+
diff --git a/config/power/make_defs.mk b/config/power/make_defs.mk
new file mode 100644
index 000000000..2a366f1e2
--- /dev/null
+++ b/config/power/make_defs.mk
@@ -0,0 +1,82 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+
+# Declare the name of the current configuration and add it to the
+# running list of configurations included by common.mk.
+THIS_CONFIG    := power
+#CONFIGS_INCL   += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    :=
+CMISCFLAGS     :=
+CPICFLAGS      :=
+CWARNFLAGS     :=
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O2
+endif
+
+# Flags specific to optimized kernels.
+CKOPTFLAGS     := $(COPTFLAGS) -O3
+CKVECFLAGS     :=
+
+# Flags specific to reference kernels.
+CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
diff --git a/config_registry b/config_registry
index 93cff1523..f25d66e7f 100644
--- a/config_registry
+++ b/config_registry
@@ -14,6 +14,7 @@ amd64_legacy:   excavator steamroller piledriver bulldozer generic
 amd64:          zen3 zen2 zen generic
 arm64:          armsve firestorm thunderx2 cortexa57 cortexa53 generic
 arm32:          cortexa15 cortexa9 generic
+power:		power10 power9 generic
 
 # Intel architectures.
 skx:         skx/skx/haswell/zen
diff --git a/docs/HardwareSupport.md b/docs/HardwareSupport.md
index 944cfa8ee..66891548a 100644
--- a/docs/HardwareSupport.md
+++ b/docs/HardwareSupport.md
@@ -39,6 +39,8 @@ A few remarks / reminders:
 | ARMv8.1 A64FX (SVE)                  | `a64fx`                |  `d`   |            |
 | IBM Blue Gene/Q (QPX int)            | `bgq`                  |  `d`   |            |
 | IBM Power7 (QPX int)                 | `power7`               |  `d`   |            |
+| IBM Power9                           | `power9`               | `sdcz` |            |
+| IBM Power10                          | `power10`              | `sdcz` |            |
 | template (C99)                       | `template`             | `sdcz` | `sdcz`     |
 
 ## Level-1f kernels
diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c
index bd3f24993..b697e35f9 100644
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -175,7 +175,8 @@ arch_t bli_arch_query_id_impl( void )
 		    defined BLIS_FAMILY_AMD64   || \
 		    defined BLIS_FAMILY_X86_64  || \
 		    defined BLIS_FAMILY_ARM64   || \
-		    defined BLIS_FAMILY_ARM32
+		    defined BLIS_FAMILY_ARM32   || \
+		    defined BLIS_FAMILY_POWER
 		id = bli_cpuid_query_id();
 		#endif
 
diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h
index f804d3003..0485295df 100644
--- a/frame/include/bli_arch_config.h
+++ b/frame/include/bli_arch_config.h
@@ -237,7 +237,12 @@ CNTX_INIT_PROTS( generic )
 #include "bli_family_cortexa9.h"
 #endif
 
-// -- IBM Power --
+// -- IBM Power families --
+#ifdef BLIS_FAMILY_POWER
+#include "bli_family_power.h"
+#endif
+
+// -- IBM Power architectures --
 
 #ifdef BLIS_FAMILY_POWER10
 #include "bli_family_power10.h"

From b1d3fc7e5b0927086e336a23f16ea59aa3611ccb Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 10 Feb 2023 15:34:47 -0600
Subject: [PATCH 130/230] Redirect grep stderr to /dev/null. (#723)

Details:
- In common.mk, added a redirection of stderr to /dev/null for the grep
  command being used to gather a list of header files #included from
  bli_cntx_ref.c. The redirection is desirable because as of grep 3.8,
  regular expressions with "stray" backslashes trigger warnings [1].
  But removing the backslash seems to break the BLIS build system when
  using pre-3.8 versions of grep, so this seems to be the easiest way to
  satisfy the BLIS build system for both pre- and post-3.8 grep
  environments.

  [1] https://lists.gnu.org/archive/html/info-gnu/2022-09/msg00001.html
---
 common.mk | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/common.mk b/common.mk
index b9f5bf750..3a8cb9886 100644
--- a/common.mk
+++ b/common.mk
@@ -534,6 +534,7 @@ GREP       := grep
 EGREP      := grep -E
 XARGS      := xargs
 INSTALL    := install -c
+DEVNULL    := /dev/null
 
 # Script for creating a monolithic header file.
 #FLATTEN_H  := $(DIST_PATH)/build/flatten-headers.sh
@@ -1181,7 +1182,18 @@ CBLAS_H_FLAT    := $(BASE_INC_PATH)/$(CBLAS_H)
 # files will be needed when compiling bli_cntx_ref.c with the monolithic header.
 ifeq ($(strip $(SHARE_PATH)),.)
 REF_KER_SRC     := $(DIST_PATH)/$(REFKERN_DIR)/bli_cntx_ref.c
-REF_KER_HEADERS := $(shell $(GREP) "\#include" $(REF_KER_SRC) | sed -e "s/\#include [\"<]\([a-zA-Z0-9\_\.\/\-]*\)[\">].*/\1/g" | $(GREP) -v $(BLIS_H))
+#
+# NOTE: A redirect to /dev/null has been added to the grep command below because
+# as of version 3.8, grep outputs warnings when encountering stray backslashes
+# in regular expressions [1]. Versions older than 3.8 not only do not complain,
+# but actually seem to *require* the backslash, perhaps because of the way we
+# are invoking grep via GNU make's shell command. WHEN DEBUGGING ANYTHING
+# INVOLVING THE MAKE VARIABLE BELOW, PLEASE CONSIDER TEMPORARILY REMOVING THE
+# REDIRECT TO /dev/null SO THAT YOU SEE ANY MESSAGES SENT TO STANDARD ERROR.
+#
+# [1] https://lists.gnu.org/archive/html/info-gnu/2022-09/msg00001.html
+#
+REF_KER_HEADERS := $(shell $(GREP) "\#include" $(REF_KER_SRC) 2> $(DEVNULL) | sed -e "s/\#include [\"<]\([a-zA-Z0-9\_\.\/\-]*\)[\">].*/\1/g" | $(GREP) -v $(BLIS_H))
 endif
 
 # Match each header found above with the path to that header, and then strip

From 0b421eff130b5c896edcc09e7358d18564d177e9 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Sat, 18 Feb 2023 13:11:41 -0600
Subject: [PATCH 131/230] Added an 'arm64' entry to `.travis.yml`. (#726)

Details:
- Added a new 'arm64' entry to the .travis.yml file in an attempt to get
  Travis CI to compile both NEON and SVE kernels, even if only NEON
  kernels are exercised in the testing. With this new 'arm64' entry, the
  'cortexa57' entry becomes redundant and may be removed. Thanks to
  RuQing Xu for this suggestion.
- Previously, the macro BLIS_SIMD_MAX_SIZE was *not* being set in
  bli_kernels_arm64.h, which meant that the default value of 64 was
  being used. This caused a runtime consistency check to fail in
  bli_gks.c (in Travis CI), one which requires that

    mr * nr * dt_size <= BLIS_STACK_BUF_MAX_SIZE

  for all datatype sizes dt_size, where BLIS_STACK_BUF_MAX_SIZE is
  defined as

    BLIS_SIMD_MAX_NUM_REGISTERS * BLIS_SIMD_MAX_SIZE * 2

  This commit increases BLIS_SIMD_MAX_SIZE to 128 for the 'arm64'
  configuration, thus overriding the default and (hopefully) avoiding
  the aforementioned consistency check failures.
- Appended '|| cat ./output.testsuite' to all 'make' commands in
  travis/do_testsuite.sh. Thanks to RuQing Xu for this suggestion.
- Whitespace changes.
---
 .travis.yml                     |  9 +++++++++
 config/arm64/bli_family_arm64.h |  2 ++
 travis/do_testsuite.sh          | 18 +++++++++---------
 3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 6603ca2f3..b177bb23a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -62,6 +62,15 @@ matrix:
       CC=aarch64-linux-gnu-gcc-10 CXX=aarch64-linux-gnu-g++-10 \
       PACKAGES="gcc-10-aarch64-linux-gnu g++-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \
       TESTSUITE_WRAPPER="qemu-aarch64 -cpu max,sve=true,sve512=true -L /usr/aarch64-linux-gnu/"
+  # arm64 build and fast testsuite (qemu)
+  # NOTE: This entry omits the -cpu flag so that while both NEON and SVE kernels
+  # are compiled, only NEON kernels will be tested. (h/t to RuQing Xu)
+  - os: linux
+    compiler: aarch64-linux-gnu-gcc-10
+    env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="arm64" \
+      CC=aarch64-linux-gnu-gcc-10 CXX=aarch64-linux-gnu-g++-10 \
+      PACKAGES="gcc-10-aarch64-linux-gnu g++-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \
+      TESTSUITE_WRAPPER="qemu-aarch64 -L /usr/aarch64-linux-gnu/"
 install:
 - if [ "$CC" = "gcc"  ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-9"; fi
 - if [ -n "$PACKAGES" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y $PACKAGES; fi
diff --git a/config/arm64/bli_family_arm64.h b/config/arm64/bli_family_arm64.h
index b242d7049..3fb08fc42 100644
--- a/config/arm64/bli_family_arm64.h
+++ b/config/arm64/bli_family_arm64.h
@@ -39,6 +39,8 @@
 // -- MEMORY ALLOCATION --------------------------------------------------------
 
 #define BLIS_SIMD_ALIGN_SIZE 16
+
+#define BLIS_SIMD_MAX_SIZE 128 // Note: The default is 64.
 #define BLIS_SIMD_MAX_NUM_REGISTERS 32
 
 // SVE-specific configs.
diff --git a/travis/do_testsuite.sh b/travis/do_testsuite.sh
index 6778f81d8..c21df3a32 100755
--- a/travis/do_testsuite.sh
+++ b/travis/do_testsuite.sh
@@ -9,27 +9,27 @@ export BLIS_JR_NT=1
 export BLIS_IR_NT=1
 
 if [ "$TEST" = "FAST" -o "$TEST" = "ALL" ]; then
-    make testblis-fast
-    $DIST_PATH/testsuite/check-blistest.sh ./output.testsuite
+	make testblis-fast || cat ./output.testsuite
+	$DIST_PATH/testsuite/check-blistest.sh ./output.testsuite
 fi
 
 if [ "$TEST" = "MD" -o "$TEST" = "ALL" ]; then
-	make testblis-md
-    $DIST_PATH/testsuite/check-blistest.sh ./output.testsuite
+	make testblis-md || cat ./output.testsuite
+	$DIST_PATH/testsuite/check-blistest.sh ./output.testsuite
 fi
 
 if [ "$TEST" = "SALT" -o "$TEST" = "ALL" ]; then
 	# Disable multithreading within BLIS.
 	export BLIS_JC_NT=1 BLIS_IC_NT=1 BLIS_JR_NT=1 BLIS_IR_NT=1
-	make testblis-salt
-    $DIST_PATH/testsuite/check-blistest.sh ./output.testsuite
+	make testblis-salt || cat ./output.testsuite
+	$DIST_PATH/testsuite/check-blistest.sh ./output.testsuite
 fi
 
 if [ "$TEST" = "1" -o "$TEST" = "ALL" ]; then
-    make testblis
-    $DIST_PATH/testsuite/check-blistest.sh ./output.testsuite
+	make testblis || cat ./output.testsuite
+	$DIST_PATH/testsuite/check-blistest.sh ./output.testsuite
 fi
 
-make testblas
+make testblas || cat ./output.testsuite
 $DIST_PATH/blastest/check-blastest.sh
 

From 059f15105b1643fe56084f883c22b3cadf368b39 Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Sat, 18 Feb 2023 14:13:23 -0500
Subject: [PATCH 132/230] Updated hpx namespace for make_counting_shape. (#725)

Details:
- The hpx namespace for *counting_shape changed. This PR updates the use
  of counting_shape in blis to comply with the change in hpx.
- Co-authored-by: ctaylor <ctaylor@tactcomplabs.com>
---
 frame/thread/bli_thread_hpx.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frame/thread/bli_thread_hpx.cpp b/frame/thread/bli_thread_hpx.cpp
index a7818ffd6..f69a0f5d7 100644
--- a/frame/thread/bli_thread_hpx.cpp
+++ b/frame/thread/bli_thread_hpx.cpp
@@ -56,7 +56,7 @@ void bli_thread_launch_hpx
 	pool_t*    gl_comm_pool = nullptr;
 	thrcomm_t* gl_comm      = bli_thrcomm_create( ti, gl_comm_pool, n_threads );
 
-	auto irange = hpx::util::detail::make_counting_shape(n_threads);
+	auto irange = hpx::util::counting_shape(n_threads);
 
 	hpx::for_each(hpx::execution::par, hpx::util::begin(irange), hpx::util::end(irange),
 	[&gl_comm, &func, &params](const dim_t tid)

From 0ba6e9eafb1e667373d9dbc2aa045557921f33e2 Mon Sep 17 00:00:00 2001
From: Lee Killough <15950023+leekillough@users.noreply.github.com>
Date: Sat, 18 Feb 2023 13:15:42 -0600
Subject: [PATCH 133/230] Refined emacs handling of indentation. (#717)

Details:
- This refines the emacs autoformatting to be better in line with
  contribution guidelines.
- Removed a stray shebang in a .mk file which confuses emacs about the
  file mode, which should be makefile-mode. (emacs also removes stray
  whitespace at the ends of lines.)
---
 .dir-locals.el                  | 43 ++++++++++++++++++++++++---------
 config/old/newarch/make_defs.mk | 13 +++++-----
 2 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/.dir-locals.el b/.dir-locals.el
index 711f4a63d..c0dc5741b 100644
--- a/.dir-locals.el
+++ b/.dir-locals.el
@@ -1,11 +1,32 @@
-;; Emacs C mode formatting for the BLIS layout requirements.
-((c-mode . ((c-file-style . "stroustrup")
-	    (c-basic-offset . 4)
-	    (comment-start . "// ")
-	    (comment-end . "")
-	    (indent-tabs-mode . t)
-	    (tab-width . 4)
-	    (parens-require-spaces . nil)
-	    (require-final-newline . t)
-	    (eval add-hook `before-save-hook `delete-trailing-whitespace)
-	    )))
+;; Emacs formatting for the BLIS layout requirements.
+
+(
+ ;; Recognize *.mk files as Makefile fragments
+ (auto-mode-alist . (("\\.mk\\'" . makefile-mode)) )
+
+ ;; Makefiles require tabs and are almost always width 8
+ (makefile-mode . (
+                   (indent-tabs-mode . t)
+                   (tab-width . 8)
+                   )
+                )
+
+ ;; C code formatting roughly according to docs/CodingConventions.md
+ (c-mode . (
+            (c-file-style . "bsd")
+            (c-basic-offset . 4)
+            (comment-start . "// ")
+            (comment-end . "")
+            (parens-require-spaces . nil)
+            )
+         )
+
+ ;; Default formatting for all source files not overridden above
+ (prog-mode . (
+               (indent-tabs-mode . nil)
+               (tab-width . 4)
+               (require-final-newline . t)
+               (eval add-hook `before-save-hook `delete-trailing-whitespace)
+               )
+            )
+)
diff --git a/config/old/newarch/make_defs.mk b/config/old/newarch/make_defs.mk
index 523e0b13b..59393c56f 100644
--- a/config/old/newarch/make_defs.mk
+++ b/config/old/newarch/make_defs.mk
@@ -1,6 +1,6 @@
-#!/bin/bash
 #
-#  BLIS    
+#
+#  BLIS
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
 #
@@ -47,7 +47,7 @@ CC             := gcc
 CC_VENDOR      := gcc
 endif
 
-# Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
+# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
 CMISCFLAGS     := -std=c99
@@ -67,13 +67,13 @@ endif
 CKOPTFLAGS     := $(COPTFLAGS)
 
 ifeq ($(CC_VENDOR),gcc)
-CKVECFLAGS     := 
+CKVECFLAGS     :=
 else
 ifeq ($(CC_VENDOR),icc)
-CKVECFLAGS     := 
+CKVECFLAGS     :=
 else
 ifeq ($(CC_VENDOR),clang)
-CKVECFLAGS     := 
+CKVECFLAGS     :=
 else
 $(error gcc, icc, or clang is required for this configuration.)
 endif
@@ -83,4 +83,3 @@ endif
 # Store all of the variables here to new variables containing the
 # configuration name.
 $(eval $(call store-make-defs,$(THIS_CONFIG)))
-

From 4e18cd34f909c5045597f411340ede3a5e0bc5e1 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Sun, 19 Feb 2023 04:18:41 +0900
Subject: [PATCH 134/230] Restored ArmSVE general storage case. (#708)

Details:
- Restored general storage case in armsve kernels.
- Reason for doing this: Though real `g`-storage is difficult to
  speed up, the `g`-codepath here provides good support for
  transposed storage, i.e., it is at least good for
  `GEMM_UKR_SETUP_CT_AMBI`.
- In our experience, this solution is only *a little* slower than an
  in-register transpose. Moreover, an in-register transpose is only
  possible for a fixed VL in our case.
---
 .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c  | 77 +++++++++---------
 .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c  | 45 ++++++-----
 .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c  | 41 ++++++----
 .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c  | 78 +++++++++----------
 4 files changed, 129 insertions(+), 112 deletions(-)

diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
index 9bc7fd949..5723a10f3 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
@@ -57,8 +57,8 @@ void bli_cgemm_armsve_asm_2vx10_unindexed
        cntx_t*             cntx
      )
 {
-  void* a_next = bli_auxinfo_next_a( data );
-  void* b_next = bli_auxinfo_next_b( data );
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
 
   // Typecast local copies of integers in case dim_t and inc_t are a
   // different size than is expected by load instructions.
@@ -68,7 +68,7 @@ void bli_cgemm_armsve_asm_2vx10_unindexed
   uint64_t cs_c   = cs_c0;
   uint64_t info   = 0;
 
-  GEMM_UKR_SETUP_CT( c, m, 10, false );
+  GEMM_UKR_SETUP_CT_ANY( c, m, 10, false );
 
   __asm__ volatile (
 " whilelo         p0.s, xzr, %12                  \n\t"
@@ -117,8 +117,8 @@ BEQ(END_CCOL_PRFM)
 GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 "                                                 \n\t"
 LABEL(CCOL_PRFM)
-// " cmp             %3, #1                          \n\t"
-// BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage.
+" cmp             %3, #1                          \n\t"
+BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage.
 " mov             x16, %2                         \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, %4                    \n\t"
@@ -232,8 +232,8 @@ MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19)
 LABEL(WRITE_MEM_EXEC)
 " mov             x9, %2                          \n\t" // C address for loading.
 "                                                 \n\t" // C address for storing is %2 itself.
-// " cmp             %3, #1                          \n\t"
-// BNE(WRITE_MEM_G)
+" cmp             %3, #1                          \n\t"
+BNE(WRITE_MEM_G)
 "                                                 \n\t"
 LABEL(WRITE_MEM_C)
 " fmov            s29, wzr                        \n\t"
@@ -259,38 +259,37 @@ LABEL(ZERO_BETA_C_4_5_6_7_8_9)
 GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4)
 GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4)
 GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4)
-// BRANCH(END_WRITE_MEM)
-// "                                                 \n\t"
-// LABEL(WRITE_MEM_G)
-// " add             %3, %3, %3                      \n\t" // Skips passed to index is multiplied by 2,
-// " mov             x3, %3                          \n\t" //  s.t. 2*sizeof(float) = 2*4 = 8.
-// " index           z28.s, wzr, w3                  \n\t"
-// " fmov            s29, wzr                        \n\t"
-// " fcmp            s31, #0.0                       \n\t" // Whether Imag(beta) == 0.
-// " fccmp           s30, s29, 0, eq                 \n\t" // Whether Real(beta) == 0.
-// BEQ(ZERO_BETA_G_0_1_2_3)
-// GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
-// GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
-// GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31)
-// GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31)
-// LABEL(ZERO_BETA_G_0_1_2_3)
-// GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16)
-// GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16)
-// "                                                 \n\t"
-// BEQ(ZERO_BETA_G_4_5_6_7_8_9)
-// GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
-// GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
-// GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16)
-// GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31)
-// GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31)
-// GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31)
-// LABEL(ZERO_BETA_G_4_5_6_7_8_9)
-// GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16)
-// GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16)
-// GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16)
-// "                                                 \n\t"
-// LABEL(END_WRITE_MEM)
-// BRANCH(END_EXEC)
+BRANCH(END_WRITE_MEM)
+// General-storage case -- Mainly for Column-storage or other aligned cases.
+LABEL(WRITE_MEM_G)
+" add             %3, %3, %3                      \n\t" // Skips passed to index is multiplied by 2,
+" index           z28.s, wzr, %w3                 \n\t" //  s.t. 2*sizeof(float) = 2*4 = 8.
+" fmov            s29, wzr                        \n\t"
+" fcmp            s31, #0.0                       \n\t" // Whether Imag(beta) == 0.
+" fccmp           s30, s29, 0, eq                 \n\t" // Whether Real(beta) == 0.
+BEQ(ZERO_BETA_G_0_1_2_3)
+GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
+GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
+GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31)
+GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31)
+LABEL(ZERO_BETA_G_0_1_2_3)
+GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16)
+GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16)
+"                                                 \n\t"
+BEQ(ZERO_BETA_G_4_5_6_7_8_9)
+GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
+GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
+GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16)
+GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31)
+GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31)
+GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31)
+LABEL(ZERO_BETA_G_4_5_6_7_8_9)
+GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16)
+GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16)
+GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16)
+"                                                 \n\t"
+LABEL(END_WRITE_MEM)
+BRANCH(END_EXEC)
 "                                                 \n\t"
 LABEL(END_EXEC)
 " mov             %11, #0                         \n\t" // Return normal.
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
index 1c9d68dec..3bc6de506 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
@@ -57,8 +57,8 @@ void bli_dgemm_armsve_asm_2vx10_unindexed
        cntx_t*             cntx
      )
 {
-  void* a_next = bli_auxinfo_next_a( data );
-  void* b_next = bli_auxinfo_next_b( data );
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
 
   // Typecast local copies of integers in case dim_t and inc_t are a
   // different size than is expected by load instructions.
@@ -67,7 +67,7 @@ void bli_dgemm_armsve_asm_2vx10_unindexed
   uint64_t rs_c   = rs_c0;
   uint64_t cs_c   = cs_c0;
 
-  GEMM_UKR_SETUP_CT( d, m, 10, false );
+  GEMM_UKR_SETUP_CT_ANY( d, m, 10, false );
 
   __asm__ volatile (
 " mov             x0, xzr                         \n\t"
@@ -82,7 +82,7 @@ void bli_dgemm_armsve_asm_2vx10_unindexed
 " mov             x3, #10                         \n\t" // Row-skip of B.
 "                                                 \n\t"
 " ldr             x5, %[c]                        \n\t"
-// " ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
+" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
 " ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
 #ifdef _A64FX
 " mov             x8, 0x3                         \n\t" // Tag C address.
@@ -120,8 +120,8 @@ BEQ(END_CCOL_PRFM)
 GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0)
 "                                                 \n\t"
 LABEL(CCOL_PRFM)
-// " cmp             x6, #1                          \n\t"
-// BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage.
+" cmp             x6, #1                          \n\t"
+BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage.
 " mov             x16, x5                         \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, x7                    \n\t"
@@ -256,8 +256,8 @@ LABEL(PREFETCH_ABNEXT)
 "                                                 \n\t"
 " mov             x9, x5                          \n\t" // C address for loading.
 "                                                 \n\t" // C address for storing is x5 itself.
-// " cmp             x6, #1                          \n\t" // Preload first half of C for contiguous case.
-// BNE(WRITE_MEM)
+" cmp             x6, #1                          \n\t" // Preload first half of C for contiguous case.
+BNE(WRITE_MEM)
 GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7)
 "                                                 \n\t"
 LABEL(WRITE_MEM)
@@ -268,8 +268,8 @@ BEQ(UNIT_ALPHA)
 SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
 "                                                 \n\t"
 LABEL(UNIT_ALPHA)
-// " cmp             x6, #1                          \n\t"
-// BNE(WRITE_MEM_G)
+" cmp             x6, #1                          \n\t"
+BNE(WRITE_MEM_G)
 "                                                 \n\t"
 LABEL(WRITE_MEM_C)
 "                                                 \n\t" // Available scratch: Z[20-30].
@@ -281,17 +281,26 @@ BEQ(BETA_ZERO_C)
 GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
 GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7)
 GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
-"                                                 \n\t"
 LABEL(BETA_ZERO_C)
 GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p1,x5,x7)
 GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p1,x5,x7)
-// BRANCH(END_WRITE_MEM)
-// "                                                 \n\t"
-// LABEL(END_WRITE_MEM)
-// BRANCH(END_EXEC)
-// "                                                 \n\t"
-// LABEL(END_ERROR)
-// " mov             x0, #1                          \n\t" // Return error.
+BRANCH(END_EXEC)
+// Generic-storage case -- Mainly for transposed storage.
+LABEL(WRITE_MEM_G)
+" mov             x8, xzr                         \n\t"
+" incb            x8                              \n\t"
+" madd            x8, x8, x6, xzr                 \n\t" // C-column's logical 1-vector skip.
+" index           z30.d, xzr, x6                  \n\t" // Skips passed to index is not multiplied by 8.
+"                                                 \n\t"
+" fcmp            d31, #0.0                       \n\t" // Skip loading if *beta == 0 to override NaN.
+BEQ(BETA_ZERO_G)
+GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p1,x9,x7,x8,x16)
+GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
+GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p1,x9,x7,x8,x16)
+GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
+LABEL(BETA_ZERO_G)
+GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p1,x5,x7,x8,x16)
+GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p1,x5,x7,x8,x16)
 LABEL(END_EXEC)
 " mov             x0, #0                          \n\t" // Return normal.
 :
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
index 05005f8c3..4aec5c4df 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
@@ -57,8 +57,8 @@ void bli_sgemm_armsve_asm_2vx10_unindexed
        cntx_t*             cntx
      )
 {
-  void* a_next = bli_auxinfo_next_a( data );
-  void* b_next = bli_auxinfo_next_b( data );
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
 
   // Typecast local copies of integers in case dim_t and inc_t are a
   // different size than is expected by load instructions.
@@ -67,7 +67,7 @@ void bli_sgemm_armsve_asm_2vx10_unindexed
   uint64_t rs_c   = rs_c0;
   uint64_t cs_c   = cs_c0;
 
-  GEMM_UKR_SETUP_CT( s, m, 10, false );
+  GEMM_UKR_SETUP_CT_ANY( s, m, 10, false );
 
   __asm__ volatile (
 " mov             x0, xzr                         \n\t"
@@ -82,7 +82,7 @@ void bli_sgemm_armsve_asm_2vx10_unindexed
 " mov             x3, #10                         \n\t" // Row-skip of B.
 "                                                 \n\t"
 " ldr             x5, %[c]                        \n\t"
-// " ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
+" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
 " ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
 #ifdef _A64FX
 " mov             x8, 0x3                         \n\t" // Tag C address.
@@ -120,8 +120,8 @@ BEQ(END_CCOL_PRFM)
 GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0)
 "                                                 \n\t"
 LABEL(CCOL_PRFM)
-// " cmp             x6, #1                          \n\t"
-// BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage.
+" cmp             x6, #1                          \n\t"
+BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage.
 " mov             x16, x5                         \n\t"
 " prfm            PLDL1STRM, [x16]                \n\t"
 " add             x16, x16, x7                    \n\t"
@@ -256,8 +256,8 @@ SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1
 LABEL(UNIT_ALPHA)
 " mov             x9, x5                          \n\t" // C address for loading.
 "                                                 \n\t" // C address for storing is x5 itself.
-// " cmp             x6, #1                          \n\t"
-// BNE(WRITE_MEM_G)
+" cmp             x6, #1                          \n\t"
+BNE(WRITE_MEM_G)
 "                                                 \n\t"
 LABEL(WRITE_MEM_C)
 "                                                 \n\t" // Available scratch: Z[20-30].
@@ -268,17 +268,26 @@ GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7)
 GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
 GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7)
 GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
-"                                                 \n\t"
 LABEL(BETA_ZERO_C)
 GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p1,x5,x7)
 GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p1,x5,x7)
-// BRANCH(END_WRITE_MEM)
-// "                                                 \n\t"
-// LABEL(END_WRITE_MEM)
-// BRANCH(END_EXEC)
-// "                                                 \n\t"
-// LABEL(END_ERROR)
-// " mov             x0, #1                          \n\t" // Return error.
+BRANCH(END_EXEC)
+// Generic-storage case -- Mainly for transposed storage.
+LABEL(WRITE_MEM_G)
+" mov             x8, xzr                         \n\t"
+" incb            x8                              \n\t"
+" madd            x8, x8, x6, xzr                 \n\t" // C-column's logical 1-vector skip.
+" index           z30.s, wzr, w6                  \n\t" // Skips passed to index is not multiplied by 8.
+"                                                 \n\t"
+" fcmp            s31, #0.0                       \n\t" // Skip loading if *beta == 0 to override NaN.
+BEQ(BETA_ZERO_G)
+GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p1,x9,x7,x8,x16)
+GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
+GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p1,x9,x7,x8,x16)
+GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
+LABEL(BETA_ZERO_G)
+GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p1,x5,x7,x8,x16)
+GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p1,x5,x7,x8,x16)
 LABEL(END_EXEC)
 " mov             x0, #0                          \n\t" // Return normal.
 :
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
index 210d40f0b..2c70f486f 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
@@ -57,8 +57,8 @@ void bli_zgemm_armsve_asm_2vx10_unindexed
        cntx_t*             cntx
      )
 {
-  void* a_next = bli_auxinfo_next_a( data );
-  void* b_next = bli_auxinfo_next_b( data );
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
 
   // Typecast local copies of integers in case dim_t and inc_t are a
   // different size than is expected by load instructions.
@@ -68,7 +68,7 @@ void bli_zgemm_armsve_asm_2vx10_unindexed
   uint64_t cs_c   = cs_c0;
   uint64_t info   = 0;
 
-  GEMM_UKR_SETUP_CT( z, m, 10, false );
+  GEMM_UKR_SETUP_CT_ANY( z, m, 10, false );
 
   __asm__ volatile (
 " whilelo         p0.d, xzr, %12                  \n\t"
@@ -117,8 +117,8 @@ BEQ(END_CCOL_PRFM)
 GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
 "                                                 \n\t"
 LABEL(CCOL_PRFM)
-// " cmp             %3, #1                          \n\t"
-// BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage.
+" cmp             %3, #1                          \n\t"
+BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage.
 " mov             x16, %2                         \n\t"
 " prfm            PLDL1KEEP, [x16]                \n\t"
 " add             x16, x16, %4                    \n\t"
@@ -232,8 +232,8 @@ MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19)
 LABEL(WRITE_MEM_EXEC)
 " mov             x9, %2                          \n\t" // C address for loading.
 "                                                 \n\t" // C address for storing is %2 itself.
-// " cmp             %3, #1                          \n\t"
-// BNE(WRITE_MEM_G)
+" cmp             %3, #1                          \n\t"
+BNE(WRITE_MEM_G)
 "                                                 \n\t"
 LABEL(WRITE_MEM_C)
 " fmov            d29, xzr                        \n\t"
@@ -259,38 +259,38 @@ LABEL(ZERO_BETA_C_4_5_6_7_8_9)
 GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4)
 GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4)
 GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4)
-// BRANCH(END_WRITE_MEM)
-// "                                                 \n\t"
-// LABEL(WRITE_MEM_G)
-// " add             %3, %3, %3                      \n\t" // Skips passed to index is multiplied by 2,
-// " index           z28.d, xzr, %3                  \n\t" //  s.t. 2*sizeof(double) = 2*8 = 16.
-// " fmov            d29, xzr                        \n\t"
-// " fcmp            d31, #0.0                       \n\t" // Whether Imag(beta) == 0.
-// " fccmp           d30, d29, 0, eq                 \n\t" // Whether Real(beta) == 0.
-// BEQ(ZERO_BETA_G_0_1_2_3)
-// GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
-// GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
-// GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31)
-// GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31)
-// LABEL(ZERO_BETA_G_0_1_2_3)
-// GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16)
-// GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16)
-// "                                                 \n\t"
-// BEQ(ZERO_BETA_G_4_5_6_7_8_9)
-// GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
-// GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
-// GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16)
-// GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31)
-// GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31)
-// GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31)
-// LABEL(ZERO_BETA_G_4_5_6_7_8_9)
-// GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16)
-// GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16)
-// GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16)
-// "                                                 \n\t"
-// LABEL(END_WRITE_MEM)
-// BRANCH(END_EXEC)
-// "                                                 \n\t"
+BRANCH(END_WRITE_MEM)
+// General-storage case -- Mainly for Column-storage or other aligned cases.
+LABEL(WRITE_MEM_G)
+" add             %3, %3, %3                      \n\t" // Skips passed to index is multiplied by 2,
+" index           z28.d, xzr, %3                  \n\t" //  s.t. 2*sizeof(double) = 2*8 = 16.
+" fmov            d29, xzr                        \n\t"
+" fcmp            d31, #0.0                       \n\t" // Whether Imag(beta) == 0.
+" fccmp           d30, d29, 0, eq                 \n\t" // Whether Real(beta) == 0.
+BEQ(ZERO_BETA_G_0_1_2_3)
+GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
+GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
+GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31)
+GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31)
+LABEL(ZERO_BETA_G_0_1_2_3)
+GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16)
+GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16)
+"                                                 \n\t"
+BEQ(ZERO_BETA_G_4_5_6_7_8_9)
+GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16)
+GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16)
+GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16)
+GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31)
+GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31)
+GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31)
+LABEL(ZERO_BETA_G_4_5_6_7_8_9)
+GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16)
+GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16)
+GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16)
+"                                                 \n\t"
+LABEL(END_WRITE_MEM)
+BRANCH(END_EXEC)
+"                                                 \n\t"
 LABEL(END_EXEC)
 " mov             %11, #0                         \n\t" // Return normal.
 : "+r" (a),      // %0

From 93c63d1f469c4650df082d0fa2f29c46db0e25f5 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 20 Feb 2023 11:14:23 -0600
Subject: [PATCH 135/230] Use 'const' pointers in kernel APIs. (#722)

Details:
- Qualified all input-only data pointers in the various kernel APIs with
  the 'const' keyword while also removing 'restrict' from those kernel
  APIs. (Use of 'restrict' was maintained in kernel implementations,
  where appropriate.) This affected the function pointer types defined
  for all of the kernels, their prototypes, and the reference and
  optimized kernel definitions' signatures.
- Templatized the definitions of copys_mxn and xpbys_mxn static inline
  functions.
- Minor whitespace and style changes (e.g. combining local variable
  declaration and initialization into a single statement).
- Removed some unused kernel code left in 'old' directories.
- Thanks to Nisanth M P for helping to validate changes to the power10
  microkernels.
---
 frame/1/bli_l1v_ft_ker.h                      |   112 +-
 frame/1/bli_l1v_ker_prot.h                    |   164 +-
 frame/1f/bli_l1f_ft_ker.h                     |   112 +-
 frame/1f/bli_l1f_ker_prot.h                   |   104 +-
 frame/1m/bli_l1m_ft_ker.h                     |    94 +-
 frame/1m/bli_l1m_ker_prot.h                   |    58 +-
 frame/3/bli_l3_ft_ukr.h                       |    52 +-
 frame/3/bli_l3_ind_ukr.h                      |    52 +-
 frame/3/bli_l3_sup_ft_ker.h                   |    24 +-
 frame/3/bli_l3_sup_ker_prot.h                 |    24 +-
 frame/3/bli_l3_ukr_prot.h                     |    55 +-
 frame/3/bli_l3_ukr_tapi.c                     |    52 +-
 frame/3/gemm/bli_gemm_ker_var2.c              |    28 +-
 frame/3/gemm/bli_gemm_md_c2r_ref.c            |    72 +-
 frame/3/gemmt/bli_gemmt_l_ker_var2.c          |    36 +-
 frame/3/gemmt/bli_gemmt_l_ker_var2b.c         |    36 +-
 frame/3/gemmt/bli_gemmt_u_ker_var2.c          |    36 +-
 frame/3/gemmt/bli_gemmt_u_ker_var2b.c         |    36 +-
 frame/include/bli_edge_case_macro_defs.h      |    32 +-
 frame/include/level0/bli_copys_mxn.h          |   248 +-
 frame/include/level0/bli_xpbys_mxn.h          |   304 +-
 .../armsve/1m/bli_dpackm_armsve256_int_8xk.c  |    18 +-
 .../armsve/1m/bli_dpackm_armsve512_asm_10xk.c |    18 +-
 .../armsve/1m/bli_dpackm_armsve512_asm_16xk.c |    18 +-
 .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c  |    20 +-
 .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c  |    20 +-
 .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c  |    20 +-
 .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c  |    20 +-
 kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c   |   151 +-
 kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c   |    40 +-
 kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c |    23 +-
 kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c |    23 +-
 .../armv8a/1m/bli_packm_armv8a_int_s12xk.c    |    23 +-
 kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c |    23 +-
 kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c   |    48 +-
 kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c  |    40 +-
 .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c   |    76 +-
 .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c   |   100 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c   |    28 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c   |    28 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c   |    28 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c   |    28 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c   |    38 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c   |    28 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c   |    28 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c   |    29 +-
 .../sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c |    24 +-
 .../sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c |    24 +-
 .../sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c |    30 +-
 .../sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c |    30 +-
 .../d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c   |    33 +-
 .../d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c   |    33 +-
 kernels/bgq/1/bli_axpyv_bgq_int.c             |    12 +-
 kernels/bgq/1/bli_dotv_bgq_int.c              |    14 +-
 kernels/bgq/1f/bli_axpyf_bgq_int.c            |    36 +-
 kernels/bgq/3/bli_gemm_bgq_int_8x8.c          |    46 +-
 .../3/bli_gemm_bulldozer_asm_d4x6_fma4.c      |    88 +-
 .../haswell/1m/bli_packm_haswell_asm_c3xk.c   |    18 +-
 .../haswell/1m/bli_packm_haswell_asm_c8xk.c   |    18 +-
 .../haswell/1m/bli_packm_haswell_asm_d6xk.c   |    18 +-
 .../haswell/1m/bli_packm_haswell_asm_d8xk.c   |    18 +-
 .../haswell/1m/bli_packm_haswell_asm_s16xk.c  |    18 +-
 .../haswell/1m/bli_packm_haswell_asm_s6xk.c   |    18 +-
 .../haswell/1m/bli_packm_haswell_asm_z3xk.c   |    18 +-
 .../haswell/1m/bli_packm_haswell_asm_z4xk.c   |    18 +-
 kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c |    80 +-
 kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c |    80 +-
 .../3/bli_gemmtrsm_l_haswell_asm_d6x8.c       |    44 +-
 .../3/bli_gemmtrsm_u_haswell_asm_d6x8.c       |    68 +-
 .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c  |   108 +-
 .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c  |   126 +-
 .../3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c |   176 +-
 .../3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c |   126 +-
 .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c  |   164 +-
 .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c  |   210 +-
 .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c |   240 +-
 .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c |   210 +-
 .../sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c |    38 +-
 .../d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c    |    96 +-
 .../d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c    |    96 +-
 .../d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c    |    72 +-
 .../d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c    |    88 +-
 .../d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c    |   144 +-
 .../d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c    |   144 +-
 .../d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c    |   144 +-
 .../d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c    |   156 +-
 .../old/bli_gemmsup_rd_haswell_asm_d6x8.c     |  4566 -------
 .../old/bli_gemmsup_rv_haswell_asm_d6x8.c     | 11048 ----------------
 .../sup/old/bli_gemmsup_rd_haswell_asm_d6x8.c |  5249 --------
 .../old/bli_gemmsup_rd_haswell_asm_d6x8m.c    |  5543 --------
 .../bli_gemmsup_rd_haswell_asm_d6x8m.c.newji  |  5628 --------
 ...bli_gemmsup_rd_haswell_asm_d6x8m.c.worksij |  5634 --------
 .../old/bli_gemmsup_rd_haswell_asm_d6x8n.c    |  5836 --------
 .../s6x16/bli_gemmsup_r_haswell_ref_sMx1.c    |    74 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c   |    96 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c  |    72 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c  |    88 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c   |    96 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c   |    72 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c   |    72 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c  |   144 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c  |   156 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c   |   144 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c   |   144 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c   |   168 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c   |   144 +-
 kernels/knc/3/bli_dgemm_knc_asm_30x8.c        |    20 +-
 kernels/knc/3/bli_sgemm_knc_asm_30x16.c       |    20 +-
 kernels/knl/1m/bli_dpackm_knl_asm_24x8.c      |    36 +-
 kernels/knl/1m/bli_spackm_knl_asm_24x16.c     |    36 +-
 kernels/knl/3/bli_dgemm_knl_asm_24x8.c        |    20 +-
 kernels/knl/3/bli_sgemm_knl_asm_24x16.c       |    26 +-
 kernels/penryn/1/bli_axpyv_penryn_int.c       |    46 +-
 kernels/penryn/1/bli_dotv_penryn_int.c        |    44 +-
 kernels/penryn/1f/bli_axpy2v_penryn_int.c     |   133 +-
 kernels/penryn/1f/bli_axpyf_penryn_int.c      |    65 +-
 kernels/penryn/1f/bli_dotaxpyv_penryn_int.c   |    57 +-
 kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c  |   120 +-
 kernels/penryn/1f/bli_dotxf_penryn_int.c      |    81 +-
 kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c   |    48 +-
 .../penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c |    33 +-
 .../penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c |    33 +-
 kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c |    15 +-
 kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c |    15 +-
 .../3/bli_gemm_piledriver_asm_d8x3.c          |    96 +-
 kernels/power10/3/bli_dgemm_power10_mma.c     |    27 +-
 kernels/power10/3/bli_i16gemm_power10_mma.c   |    27 +-
 kernels/power10/3/bli_i16sgemm_power10_mma.c  |    27 +-
 kernels/power10/3/bli_i4gemm_power10_mma.c    |    27 +-
 kernels/power10/3/bli_i8gemm_power10_mma.c    |    27 +-
 kernels/power10/3/bli_sbgemm_power10_mma.c    |    27 +-
 kernels/power10/3/bli_sgemm_power10_mma.c     |    29 +-
 kernels/power10/3/bli_shgemm_power10_mma.c    |    27 +-
 kernels/power7/3/bli_gemm_power7_int_8x4.c    |    80 +-
 kernels/power9/3/bli_gemm_power9_asm_d12x6.c  |    20 +-
 .../3/bli_gemm_sandybridge_asm_d8x4.c         |    88 +-
 .../3/bli_gemm_sandybridge_int_d8x4.c         |    54 +-
 kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c    |    22 +-
 kernels/skx/3/bli_dgemm_skx_asm_16x14.c       |    22 +-
 kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c    |    22 +-
 kernels/zen/1/bli_amaxv_zen_int.c             |    32 +-
 kernels/zen/1/bli_axpyv_zen_int.c             |    38 +-
 kernels/zen/1/bli_axpyv_zen_int10.c           |    38 +-
 kernels/zen/1/bli_copyv_zen_int.c             |    20 +-
 kernels/zen/1/bli_dotv_zen_int.c              |    40 +-
 kernels/zen/1/bli_dotv_zen_int10.c            |    42 +-
 kernels/zen/1/bli_dotxv_zen_int.c             |    48 +-
 kernels/zen/1/bli_scalv_zen_int.c             |    20 +-
 kernels/zen/1/bli_scalv_zen_int10.c           |    30 +-
 kernels/zen/1/bli_setv_zen_int.c              |    22 +-
 kernels/zen/1/bli_swapv_zen_int8.c            |    30 +-
 kernels/zen/1f/bli_axpyf_zen_int_4.c          |    36 +-
 kernels/zen/1f/bli_axpyf_zen_int_5.c          |   179 +-
 kernels/zen/1f/bli_axpyf_zen_int_8.c          |   110 +-
 kernels/zen/1f/bli_dotxf_zen_int_8.c          |   136 +-
 ref_kernels/1/bli_addv_ref.c                  |    14 +-
 ref_kernels/1/bli_amaxv_ref.c                 |    20 +-
 ref_kernels/1/bli_axpbyv_ref.c                |    16 +-
 ref_kernels/1/bli_axpyv_ref.c                 |    12 +-
 ref_kernels/1/bli_copyv_ref.c                 |    10 +-
 ref_kernels/1/bli_dotv_ref.c                  |    14 +-
 ref_kernels/1/bli_dotxv_ref.c                 |    18 +-
 ref_kernels/1/bli_invertv_ref.c               |     6 +-
 ref_kernels/1/bli_invscalv_ref.c              |    10 +-
 ref_kernels/1/bli_scal2v_ref.c                |    14 +-
 ref_kernels/1/bli_scalv_ref.c                 |    12 +-
 ref_kernels/1/bli_setv_ref.c                  |    10 +-
 ref_kernels/1/bli_subv_ref.c                  |    10 +-
 ref_kernels/1/bli_swapv_ref.c                 |     8 +-
 ref_kernels/1/bli_xpbyv_ref.c                 |    12 +-
 ref_kernels/1f/bli_axpy2v_ref.c               |    18 +-
 ref_kernels/1f/bli_axpyf_ref.c                |    24 +-
 ref_kernels/1f/bli_dotaxpyv_ref.c             |    20 +-
 ref_kernels/1f/bli_dotxaxpyf_ref.c            |    28 +-
 ref_kernels/1f/bli_dotxf_ref.c                |    26 +-
 ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c   |    39 +-
 ref_kernels/1m/bli_packm_cxc_diag_ref.c       |    33 +-
 ref_kernels/1m/bli_packm_cxk_1er_ref.c        |    41 +-
 ref_kernels/1m/bli_packm_cxk_ref.c            |    26 +-
 ref_kernels/1m/bli_unpackm_cxk_ref.c          |    24 +-
 ref_kernels/3/bli_gemm_ref.c                  |    66 +-
 ref_kernels/3/bli_gemmsup_ref.c               |   540 +-
 ref_kernels/3/bli_gemmtrsm_ref.c              |    32 +-
 ref_kernels/3/bli_trsm_ref.c                  |    47 +-
 ref_kernels/ind/bli_gemm1m_ref.c              |    63 +-
 ref_kernels/ind/bli_gemmtrsm1m_ref.c          |    70 +-
 ref_kernels/ind/bli_trsm1m_ref.c              |   224 +-
 187 files changed, 5408 insertions(+), 49135 deletions(-)
 delete mode 100644 kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rd_haswell_asm_d6x8.c
 delete mode 100644 kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rv_haswell_asm_d6x8.c
 delete mode 100644 kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8.c
 delete mode 100644 kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c
 delete mode 100644 kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c.newji
 delete mode 100644 kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c.worksij
 delete mode 100644 kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8n.c

diff --git a/frame/1/bli_l1v_ft_ker.h b/frame/1/bli_l1v_ft_ker.h
index ade2c98eb..1c5575ec7 100644
--- a/frame/1/bli_l1v_ft_ker.h
+++ b/frame/1/bli_l1v_ft_ker.h
@@ -47,11 +47,11 @@
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       conj_t          conjx, \
-       dim_t           n, \
-       ctype* restrict x, inc_t incx, \
-       ctype* restrict y, inc_t incy, \
-       cntx_t*         cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      );
 
 INSERT_GENTDEF( addv )
@@ -65,10 +65,10 @@ INSERT_GENTDEF( subv )
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       dim_t           n, \
-       ctype* restrict x, inc_t incx, \
-       dim_t* restrict index, \
-       cntx_t*         cntx  \
+             dim_t   n, \
+       const ctype*  x, inc_t incx, \
+             dim_t*  index, \
+       const cntx_t* cntx  \
      );
 
 INSERT_GENTDEF( amaxv )
@@ -80,13 +80,13 @@ INSERT_GENTDEF( amaxv )
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       conj_t          conjx, \
-       dim_t           n, \
-       ctype* restrict alpha, \
-       ctype* restrict x, inc_t incx, \
-       ctype* restrict beta, \
-       ctype* restrict y, inc_t incy, \
-       cntx_t*         cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  beta, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      );
 
 INSERT_GENTDEF( axpbyv )
@@ -98,12 +98,12 @@ INSERT_GENTDEF( axpbyv )
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       conj_t          conjx, \
-       dim_t           n, \
-       ctype* restrict alpha, \
-       ctype* restrict x, inc_t incx, \
-       ctype* restrict y, inc_t incy, \
-       cntx_t*         cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      );
 
 INSERT_GENTDEF( axpyv )
@@ -116,13 +116,13 @@ INSERT_GENTDEF( scal2v )
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       conj_t          conjx, \
-       conj_t          conjy, \
-       dim_t           n, \
-       ctype* restrict x, inc_t incx, \
-       ctype* restrict y, inc_t incy, \
-       ctype* restrict rho, \
-       cntx_t*         cntx  \
+             conj_t  conjx, \
+             conj_t  conjy, \
+             dim_t   n, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  y, inc_t incy, \
+             ctype*  rho, \
+       const cntx_t* cntx  \
      );
 
 INSERT_GENTDEF( dotv )
@@ -134,15 +134,15 @@ INSERT_GENTDEF( dotv )
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       conj_t          conjx, \
-       conj_t          conjy, \
-       dim_t           n, \
-       ctype* restrict alpha, \
-       ctype* restrict x, inc_t incx, \
-       ctype* restrict y, inc_t incy, \
-       ctype* restrict beta, \
-       ctype* restrict rho, \
-       cntx_t*         cntx  \
+             conj_t  conjx, \
+             conj_t  conjy, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  y, inc_t incy, \
+       const ctype*  beta, \
+             ctype*  rho, \
+       const cntx_t* cntx  \
      );
 
 INSERT_GENTDEF( dotxv )
@@ -154,9 +154,9 @@ INSERT_GENTDEF( dotxv )
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       dim_t           n, \
-       ctype* restrict x, inc_t incx, \
-       cntx_t*         cntx  \
+             dim_t   n, \
+             ctype*  x, inc_t incx, \
+       const cntx_t* cntx  \
      );
 
 INSERT_GENTDEF( invertv )
@@ -168,11 +168,11 @@ INSERT_GENTDEF( invertv )
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       conj_t          conjalpha, \
-       dim_t           n, \
-       ctype* restrict alpha, \
-       ctype* restrict x, inc_t incx, \
-       cntx_t*         cntx  \
+             conj_t  conjalpha, \
+             dim_t   n, \
+       const ctype*  alpha, \
+             ctype*  x, inc_t incx, \
+       const cntx_t* cntx  \
      );
 
 INSERT_GENTDEF( invscalv )
@@ -186,10 +186,10 @@ INSERT_GENTDEF( setv )
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       dim_t           n, \
-       ctype* restrict x, inc_t incx, \
-       ctype* restrict y, inc_t incy, \
-       cntx_t*         cntx  \
+             dim_t   n, \
+             ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      );
 
 INSERT_GENTDEF( swapv )
@@ -201,12 +201,12 @@ INSERT_GENTDEF( swapv )
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       conj_t          conjx, \
-       dim_t           n, \
-       ctype* restrict x, inc_t incx, \
-       ctype* restrict beta, \
-       ctype* restrict y, inc_t incy, \
-       cntx_t*         cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  beta, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      );
 
 INSERT_GENTDEF( xpbyv )
diff --git a/frame/1/bli_l1v_ker_prot.h b/frame/1/bli_l1v_ker_prot.h
index 965626392..add65bba5 100644
--- a/frame/1/bli_l1v_ker_prot.h
+++ b/frame/1/bli_l1v_ker_prot.h
@@ -41,11 +41,11 @@
 \
 void PASTEMAC(ch,opname) \
       ( \
-        conj_t           conjx, \
-        dim_t            n, \
-        ctype*  restrict x, inc_t incx, \
-        ctype*  restrict y, inc_t incy, \
-        cntx_t*          cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
       );
 
 
@@ -53,10 +53,10 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       dim_t            n, \
-       ctype*  restrict x, inc_t incx, \
-       dim_t*  restrict index, \
-       cntx_t*          cntx  \
+             dim_t   n, \
+       const ctype*  x, inc_t incx, \
+             dim_t*  index, \
+       const cntx_t* cntx  \
      ); \
 
 
@@ -64,13 +64,13 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t           conjx, \
-       dim_t            n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict beta, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  beta, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      ); \
 
 
@@ -78,12 +78,12 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t           conjx, \
-       dim_t            n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      ); \
 
 
@@ -91,11 +91,11 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
       ( \
-        conj_t           conjx, \
-        dim_t            n, \
-        ctype*  restrict x, inc_t incx, \
-        ctype*  restrict y, inc_t incy, \
-        cntx_t*          cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
       );
 
 
@@ -103,13 +103,13 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t           conjx, \
-       conj_t           conjy, \
-       dim_t            n, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       ctype*  restrict rho, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             conj_t  conjy, \
+             dim_t   n, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  y, inc_t incy, \
+             ctype*  rho, \
+       const cntx_t* cntx  \
      ); \
 
 
@@ -117,15 +117,15 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t           conjx, \
-       conj_t           conjy, \
-       dim_t            n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       ctype*  restrict beta, \
-       ctype*  restrict rho, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             conj_t  conjy, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  y, inc_t incy, \
+       const ctype*  beta, \
+             ctype*  rho, \
+       const cntx_t* cntx  \
      ); \
 
 
@@ -133,9 +133,9 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       dim_t            n, \
-       ctype*  restrict x, inc_t incx, \
-       cntx_t*          cntx  \
+             dim_t   n, \
+             ctype*  x, inc_t incx, \
+       const cntx_t* cntx  \
      ); \
 
 
@@ -143,11 +143,11 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t           conjalpha, \
-       dim_t            n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       cntx_t*          cntx  \
+             conj_t  conjalpha, \
+             dim_t   n, \
+       const ctype*  alpha, \
+             ctype*  x, inc_t incx, \
+       const cntx_t* cntx  \
      ); \
 
 
@@ -155,11 +155,11 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t           conjalpha, \
-       dim_t            n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       cntx_t*          cntx  \
+             conj_t  conjalpha, \
+             dim_t   n, \
+       const ctype*  alpha, \
+             ctype*  x, inc_t incx, \
+       const cntx_t* cntx  \
      ); \
 
 
@@ -167,12 +167,12 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t           conjx, \
-       dim_t            n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      ); \
 
 
@@ -180,11 +180,11 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t           conjalpha, \
-       dim_t            n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       cntx_t*          cntx  \
+             conj_t  conjalpha, \
+             dim_t   n, \
+       const ctype*  alpha, \
+             ctype*  x, inc_t incx, \
+       const cntx_t* cntx  \
      ); \
 
 
@@ -192,11 +192,11 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
       ( \
-        conj_t           conjx, \
-        dim_t            n, \
-        ctype*  restrict x, inc_t incx, \
-        ctype*  restrict y, inc_t incy, \
-        cntx_t*          cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
       );
 
 
@@ -204,10 +204,10 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       dim_t            n, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             dim_t   n, \
+             ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      ); \
 
 
@@ -215,11 +215,11 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t           conjx, \
-       dim_t            n, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict beta, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  beta, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      ); \
 
diff --git a/frame/1f/bli_l1f_ft_ker.h b/frame/1f/bli_l1f_ft_ker.h
index c0b7cf656..54557c1da 100644
--- a/frame/1f/bli_l1f_ft_ker.h
+++ b/frame/1f/bli_l1f_ft_ker.h
@@ -47,15 +47,15 @@
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       conj_t          conjx, \
-       conj_t          conjy, \
-       dim_t           n, \
-       ctype* restrict alpha1, \
-       ctype* restrict alpha2, \
-       ctype* restrict x, inc_t incx, \
-       ctype* restrict y, inc_t incy, \
-       ctype* restrict z, inc_t incz, \
-       cntx_t*         cntx  \
+             conj_t  conjx, \
+             conj_t  conjy, \
+             dim_t   n, \
+       const ctype*  alphax, \
+       const ctype*  alphay, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  y, inc_t incy, \
+             ctype*  z, inc_t incz, \
+       const cntx_t* cntx  \
      );
 
 INSERT_GENTDEF( axpy2v )
@@ -67,15 +67,15 @@ INSERT_GENTDEF( axpy2v )
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       conj_t          conja, \
-       conj_t          conjx, \
-       dim_t           m, \
-       dim_t           b_n, \
-       ctype* restrict alpha, \
-       ctype* restrict a, inc_t inca, inc_t lda, \
-       ctype* restrict x, inc_t incx, \
-       ctype* restrict y, inc_t incy, \
-       cntx_t*         cntx  \
+             conj_t  conja, \
+             conj_t  conjx, \
+             dim_t   m, \
+             dim_t   b_n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+       const ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      );
 
 INSERT_GENTDEF( axpyf )
@@ -87,65 +87,65 @@ INSERT_GENTDEF( axpyf )
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       conj_t          conjxt, \
-       conj_t          conjx, \
-       conj_t          conjy, \
-       dim_t           m, \
-       ctype* restrict alpha, \
-       ctype* restrict x, inc_t incx, \
-       ctype* restrict y, inc_t incy, \
-       ctype* restrict rho, \
-       ctype* restrict z, inc_t incz, \
-       cntx_t*         cntx  \
+             conj_t  conjxt, \
+             conj_t  conjx, \
+             conj_t  conjy, \
+             dim_t   m, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  y, inc_t incy, \
+             ctype*  rho, \
+             ctype*  z, inc_t incz, \
+       const cntx_t* cntx  \
      );
 
 INSERT_GENTDEF( dotaxpyv )
 
-// dotxf
+// dotxaxpyf
 
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       conj_t          conjat, \
-       conj_t          conjx, \
-       dim_t           m, \
-       dim_t           b_n, \
-       ctype* restrict alpha, \
-       ctype* restrict a, inc_t inca, inc_t lda, \
-       ctype* restrict x, inc_t incx, \
-       ctype* restrict beta, \
-       ctype* restrict y, inc_t incy, \
-       cntx_t*         cntx  \
+             conj_t  conjat, \
+             conj_t  conja, \
+             conj_t  conjw, \
+             conj_t  conjx, \
+             dim_t   m, \
+             dim_t   b_n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+       const ctype*  w, inc_t incw, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  beta, \
+             ctype*  y, inc_t incy, \
+             ctype*  z, inc_t incz, \
+       const cntx_t* cntx  \
      );
 
-INSERT_GENTDEF( dotxf )
+INSERT_GENTDEF( dotxaxpyf )
 
-// dotxaxpyf
+// dotxf
 
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       conj_t          conjat, \
-       conj_t          conja, \
-       conj_t          conjw, \
-       conj_t          conjx, \
-       dim_t           m, \
-       dim_t           b_n, \
-       ctype* restrict alpha, \
-       ctype* restrict a, inc_t inca, inc_t lda, \
-       ctype* restrict w, inc_t incw, \
-       ctype* restrict x, inc_t incx, \
-       ctype* restrict beta, \
-       ctype* restrict y, inc_t incy, \
-       ctype* restrict z, inc_t incz, \
-       cntx_t*         cntx  \
+             conj_t  conjat, \
+             conj_t  conjx, \
+             dim_t   m, \
+             dim_t   b_n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  beta, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      );
 
-INSERT_GENTDEF( dotxaxpyf )
+INSERT_GENTDEF( dotxf )
 
 
diff --git a/frame/1f/bli_l1f_ker_prot.h b/frame/1f/bli_l1f_ker_prot.h
index 4393faf10..8f0967736 100644
--- a/frame/1f/bli_l1f_ker_prot.h
+++ b/frame/1f/bli_l1f_ker_prot.h
@@ -41,15 +41,15 @@
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t           conjx, \
-       conj_t           conjy, \
-       dim_t            n, \
-       ctype*  restrict alphax, \
-       ctype*  restrict alphay, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       ctype*  restrict z, inc_t incz, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             conj_t  conjy, \
+             dim_t   n, \
+       const ctype*  alphax, \
+       const ctype*  alphay, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  y, inc_t incy, \
+             ctype*  z, inc_t incz, \
+       const cntx_t* cntx  \
      );
 
 
@@ -57,15 +57,15 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t           conja, \
-       conj_t           conjx, \
-       dim_t            m, \
-       dim_t            b_n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             conj_t  conja, \
+             conj_t  conjx, \
+             dim_t   m, \
+             dim_t   b_n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+       const ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      );
 
 
@@ -73,16 +73,16 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t           conjxt, \
-       conj_t           conjx, \
-       conj_t           conjy, \
-       dim_t            n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       ctype*  restrict rho, \
-       ctype*  restrict z, inc_t incz, \
-       cntx_t*          cntx  \
+             conj_t  conjxt, \
+             conj_t  conjx, \
+             conj_t  conjy, \
+             dim_t   m, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  y, inc_t incy, \
+             ctype*  rho, \
+             ctype*  z, inc_t incz, \
+       const cntx_t* cntx  \
      );
 
 
@@ -90,20 +90,20 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t           conjat, \
-       conj_t           conja, \
-       conj_t           conjw, \
-       conj_t           conjx, \
-       dim_t            m, \
-       dim_t            b_n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict w, inc_t incw, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict beta, \
-       ctype*  restrict y, inc_t incy, \
-       ctype*  restrict z, inc_t incz, \
-       cntx_t*          cntx  \
+             conj_t  conjat, \
+             conj_t  conja, \
+             conj_t  conjw, \
+             conj_t  conjx, \
+             dim_t   m, \
+             dim_t   b_n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+       const ctype*  w, inc_t incw, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  beta, \
+             ctype*  y, inc_t incy, \
+             ctype*  z, inc_t incz, \
+       const cntx_t* cntx  \
      );
 
 
@@ -111,15 +111,15 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t           conjat, \
-       conj_t           conjx, \
-       dim_t            m, \
-       dim_t            b_n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict beta, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             conj_t  conjat, \
+             conj_t  conjx, \
+             dim_t   m, \
+             dim_t   b_n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  beta, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      );
 
diff --git a/frame/1m/bli_l1m_ft_ker.h b/frame/1m/bli_l1m_ft_ker.h
index f25c3c943..729b03d79 100644
--- a/frame/1m/bli_l1m_ft_ker.h
+++ b/frame/1m/bli_l1m_ft_ker.h
@@ -49,24 +49,24 @@
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       struc_t         strucc, \
-       diag_t          diagc, \
-       uplo_t          uploc, \
-       conj_t          conjc, \
-       pack_t          schema, \
-       bool            invdiag, \
-       dim_t           panel_dim, \
-       dim_t           panel_len, \
-       dim_t           panel_dim_max, \
-       dim_t           panel_len_max, \
-       dim_t           panel_dim_off, \
-       dim_t           panel_len_off, \
-       ctype* restrict kappa, \
-       ctype* restrict c, inc_t incc, inc_t ldc, \
-       ctype* restrict p,             inc_t ldp, \
-                          inc_t is_p, \
-       cntx_t*         cntx, \
-       void*           params \
+             struc_t strucc, \
+             diag_t  diagc, \
+             uplo_t  uploc, \
+             conj_t  conjc, \
+             pack_t  schema, \
+             bool    invdiag, \
+             dim_t   panel_dim, \
+             dim_t   panel_len, \
+             dim_t   panel_dim_max, \
+             dim_t   panel_len_max, \
+             dim_t   panel_dim_off, \
+             dim_t   panel_len_off, \
+       const ctype*  kappa, \
+       const ctype*  c, inc_t incc, inc_t ldc, \
+             ctype*  p,             inc_t ldp, \
+                        inc_t is_p, \
+       const cntx_t* cntx, \
+       const void*   params \
      );
 
 INSERT_GENTDEF( packm )
@@ -82,15 +82,15 @@ INSERT_GENTDEF( packm )
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t*          cntx  \
+             conj_t  conja, \
+             pack_t  schema, \
+             dim_t   cdim, \
+             dim_t   n, \
+             dim_t   n_max, \
+       const ctype*  kappa, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+             ctype*  p,             inc_t ldp, \
+       const cntx_t* cntx  \
      );
 
 INSERT_GENTDEF( packm_cxk )
@@ -102,14 +102,14 @@ INSERT_GENTDEF( packm_cxk )
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       ctype*  restrict kappa, \
-       ctype*  restrict p,             inc_t ldp, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       cntx_t*          cntx \
+             conj_t  conja, \
+             pack_t  schema, \
+             dim_t   cdim, \
+             dim_t   n, \
+       const ctype*  kappa, \
+       const ctype*  p,             inc_t ldp, \
+             ctype*  a, inc_t inca, inc_t lda, \
+       const cntx_t* cntx  \
      );
 
 INSERT_GENTDEF( unpackm_cxk )
@@ -121,18 +121,18 @@ INSERT_GENTDEF( unpackm_cxk )
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       struc_t          struca, \
-       diag_t           diaga, \
-       uplo_t           uploa, \
-       conj_t           conja, \
-       pack_t           schema, \
-       bool             invdiag, \
-       dim_t            cdim, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t*          cntx \
+             struc_t struca, \
+             diag_t  diaga, \
+             uplo_t  uploa, \
+             conj_t  conja, \
+             pack_t  schema, \
+             bool    invdiag, \
+             dim_t   cdim, \
+             dim_t   n_max, \
+       const ctype*  kappa, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+             ctype*  p,             inc_t ldp, \
+       const cntx_t* cntx \
      );
 
 INSERT_GENTDEF( packm_cxc_diag )
diff --git a/frame/1m/bli_l1m_ker_prot.h b/frame/1m/bli_l1m_ker_prot.h
index 8430614d2..a18eab623 100644
--- a/frame/1m/bli_l1m_ker_prot.h
+++ b/frame/1m/bli_l1m_ker_prot.h
@@ -43,15 +43,15 @@
 \
 void PASTEMAC(ch,varname) \
      ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t*          cntx  \
+             conj_t  conja, \
+             pack_t  schema, \
+             dim_t   cdim, \
+             dim_t   n, \
+             dim_t   n_max, \
+       const ctype*  kappa, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+             ctype*  p,             inc_t ldp, \
+       const cntx_t* cntx  \
      );
 
 
@@ -61,14 +61,14 @@ void PASTEMAC(ch,varname) \
 \
 void PASTEMAC(ch,varname) \
      ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       ctype*  restrict kappa, \
-       ctype*  restrict p,             inc_t ldp, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       cntx_t*          cntx \
+             conj_t  conja, \
+             pack_t  schema, \
+             dim_t   cdim, \
+             dim_t   n, \
+       const ctype*  kappa, \
+       const ctype*  p,             inc_t ldp, \
+             ctype*  a, inc_t inca, inc_t lda, \
+       const cntx_t* cntx  \
      );
 
 
@@ -78,17 +78,17 @@ void PASTEMAC(ch,varname) \
 \
 void PASTEMAC(ch,varname) \
      ( \
-       struc_t          struca, \
-       diag_t           diaga, \
-       uplo_t           uploa, \
-       conj_t           conja, \
-       pack_t           schema, \
-       bool             invdiag, \
-       dim_t            cdim, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t*          cntx \
+             struc_t struca, \
+             diag_t  diaga, \
+             uplo_t  uploa, \
+             conj_t  conja, \
+             pack_t  schema, \
+             bool    invdiag, \
+             dim_t   cdim, \
+             dim_t   n_max, \
+       const ctype*  kappa, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+             ctype*  p,             inc_t ldp, \
+       const cntx_t* cntx \
      );
 
diff --git a/frame/3/bli_l3_ft_ukr.h b/frame/3/bli_l3_ft_ukr.h
index e7952409f..2a42d859b 100644
--- a/frame/3/bli_l3_ft_ukr.h
+++ b/frame/3/bli_l3_ft_ukr.h
@@ -47,16 +47,16 @@
 \
 typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
      ( \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a, \
+       const ctype*     b, \
+       const ctype*     beta, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      );
 
 INSERT_GENTDEF( gemm )
@@ -69,17 +69,17 @@ INSERT_GENTDEF( gemm )
 \
 typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
      ( \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a1x, \
-       ctype*     restrict a11, \
-       ctype*     restrict bx1, \
-       ctype*     restrict b11, \
-       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a1x, \
+       const ctype*     a11, \
+       const ctype*     bx1, \
+             ctype*     b11, \
+             ctype*     c11, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      );
 
 INSERT_GENTDEF( gemmtrsm )
@@ -92,11 +92,11 @@ INSERT_GENTDEF( gemmtrsm )
 \
 typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
      ( \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+       const ctype*     a, \
+             ctype*     b, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      );
 
 INSERT_GENTDEF( trsm )
diff --git a/frame/3/bli_l3_ind_ukr.h b/frame/3/bli_l3_ind_ukr.h
index 243ff818d..16b2c1173 100644
--- a/frame/3/bli_l3_ind_ukr.h
+++ b/frame/3/bli_l3_ind_ukr.h
@@ -43,16 +43,16 @@
 \
 void PASTEMAC(ch,opname) \
      ( \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a, \
+       const ctype*     b, \
+       const ctype*     beta, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      );
 
 INSERT_GENTPROT_BASIC0( gemm1m_ukr_name )
@@ -63,17 +63,17 @@ INSERT_GENTPROT_BASIC0( gemm1m_ukr_name )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a1x, \
-       ctype*     restrict a11, \
-       ctype*     restrict bx1, \
-       ctype*     restrict b11, \
-       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a1x, \
+       const ctype*     a11, \
+       const ctype*     bx1, \
+             ctype*     b11, \
+             ctype*     c11, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      );
 
 INSERT_GENTPROT_BASIC0( gemmtrsm1m_l_ukr_name )
@@ -85,11 +85,11 @@ INSERT_GENTPROT_BASIC0( gemmtrsm1m_u_ukr_name )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+       const ctype*     a, \
+             ctype*     b, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      );
 
 INSERT_GENTPROT_BASIC0( trsm1m_l_ukr_name )
diff --git a/frame/3/bli_l3_sup_ft_ker.h b/frame/3/bli_l3_sup_ft_ker.h
index dbeafb404..e78edf800 100644
--- a/frame/3/bli_l3_sup_ft_ker.h
+++ b/frame/3/bli_l3_sup_ft_ker.h
@@ -47,18 +47,18 @@
 \
 typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
      ( \
-       conj_t              conja, \
-       conj_t              conjb, \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
-       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             conj_t     conja, \
+             conj_t     conjb, \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a, inc_t rs_a, inc_t cs_a, \
+       const ctype*     b, inc_t rs_b, inc_t cs_b, \
+       const ctype*     beta, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      );
 
 INSERT_GENTDEF( gemmsup )
diff --git a/frame/3/bli_l3_sup_ker_prot.h b/frame/3/bli_l3_sup_ker_prot.h
index 30cad5257..efb216c22 100644
--- a/frame/3/bli_l3_sup_ker_prot.h
+++ b/frame/3/bli_l3_sup_ker_prot.h
@@ -40,17 +40,17 @@
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t              conja, \
-       conj_t              conjb, \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
-       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             conj_t     conja, \
+             conj_t     conjb, \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a, inc_t rs_a, inc_t cs_a, \
+       const ctype*     b, inc_t rs_b, inc_t cs_b, \
+       const ctype*     beta, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      );
 
diff --git a/frame/3/bli_l3_ukr_prot.h b/frame/3/bli_l3_ukr_prot.h
index 44a59bd4c..655d45a18 100644
--- a/frame/3/bli_l3_ukr_prot.h
+++ b/frame/3/bli_l3_ukr_prot.h
@@ -36,22 +36,23 @@
 // Define template prototypes for level-3 micro-kernels.
 //
 
-#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname)
+#define GEMM_UKR_PROT( ctype, ch, opname ) \
+        GEMM_UKR_PROT2( ctype, ctype, ch, opname )
 
 #define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype_out* restrict alpha, \
-       ctype_in*  restrict a, \
-       ctype_in*  restrict b, \
-       ctype_out* restrict beta, \
-       ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype_out* alpha, \
+       const ctype_in*  a, \
+       const ctype_in*  b, \
+       const ctype_out* beta, \
+             ctype_out* c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      );
 
 
@@ -59,17 +60,17 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a1x, \
-       ctype*     restrict a11, \
-       ctype*     restrict bx1, \
-       ctype*     restrict b11, \
-       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a1x, \
+       const ctype*     a11, \
+       const ctype*     bx1, \
+             ctype*     b11, \
+             ctype*     c11, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      );
 
 
@@ -77,10 +78,10 @@ void PASTEMAC(ch,opname) \
 \
 void PASTEMAC(ch,opname) \
      ( \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+       const ctype*     a, \
+             ctype*     b, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      );
 
diff --git a/frame/3/bli_l3_ukr_tapi.c b/frame/3/bli_l3_ukr_tapi.c
index c2e8ed5d5..2145fb4c5 100644
--- a/frame/3/bli_l3_ukr_tapi.c
+++ b/frame/3/bli_l3_ukr_tapi.c
@@ -39,16 +39,16 @@
 \
 void PASTEMAC(ch,opname) \
      ( \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a, \
+       const ctype*     b, \
+       const ctype*     beta, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      ) \
 { \
 	bli_init_once(); \
@@ -83,17 +83,17 @@ INSERT_GENTFUNC_BASIC2( gemm_ukernel, gemm, BLIS_GEMM_UKR )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a1x, \
-       ctype*     restrict a11, \
-       ctype*     restrict bx1, \
-       ctype*     restrict b11, \
-       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a1x, \
+       const ctype*     a11, \
+       const ctype*     bx1, \
+             ctype*     b11, \
+             ctype*     c11, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      ) \
 { \
 	bli_init_once(); \
@@ -130,11 +130,11 @@ INSERT_GENTFUNC_BASIC2( gemmtrsm_u_ukernel, gemmtrsm, BLIS_GEMMTRSM_U_UKR )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+       const ctype*     a, \
+             ctype*     b, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      ) \
 { \
 	bli_init_once(); \
diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c
index 732d5ec06..69700e6c3 100644
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -37,28 +37,28 @@
 
 typedef void (*xpbys_mxn_vft)
     (
-      dim_t m,
-      dim_t n,
-      void* x, inc_t rs_x, inc_t cs_x,
-      void* b,
-      void* y, inc_t rs_y, inc_t cs_y
+            dim_t m,
+            dim_t n,
+      const void* x, inc_t rs_x, inc_t cs_x,
+      const void* b,
+            void* y, inc_t rs_y, inc_t cs_y
     );
 
-#undef GENTFUNC2
+#undef  GENTFUNC2
 #define GENTFUNC2(ctypex,ctypey,chx,chy,op) \
 \
 BLIS_INLINE void PASTEMAC2(chx,chy,op) \
     ( \
-      dim_t m, \
-      dim_t n, \
-      void* x, inc_t rs_x, inc_t cs_x, \
-      void* b, \
-      void* y, inc_t rs_y, inc_t cs_y \
+            dim_t m, \
+            dim_t n, \
+      const void* x, inc_t rs_x, inc_t cs_x, \
+      const void* b, \
+            void* y, inc_t rs_y, inc_t cs_y \
     ) \
 { \
-	ctypex* restrict x_cast = x; \
-	ctypey* restrict b_cast = b; \
-	ctypey* restrict y_cast = y; \
+	const ctypex* restrict x_cast = x; \
+	const ctypey* restrict b_cast = b; \
+	      ctypey* restrict y_cast = y; \
 \
 	PASTEMAC3(chx,chy,chy,xpbys_mxn) \
 	( \
diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.c b/frame/3/gemm/bli_gemm_md_c2r_ref.c
index 086a3b1df..0fbb0cc49 100644
--- a/frame/3/gemm/bli_gemm_md_c2r_ref.c
+++ b/frame/3/gemm/bli_gemm_md_c2r_ref.c
@@ -41,16 +41,16 @@
 \
 void PASTEMAC2(ch,opname,suf) \
      ( \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a, \
+       const ctype*     b, \
+       const ctype*     beta, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      ) \
 { \
 	const num_t       dt        = PASTEMAC(ch,type); \
@@ -64,37 +64,28 @@ void PASTEMAC2(ch,opname,suf) \
 	const dim_t       mr        = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
 	const dim_t       nr        = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
 \
-	dim_t             mr_r      = mr; \
-	dim_t             nr_r      = nr; \
+	      dim_t       mr_r      = mr; \
+	      dim_t       nr_r      = nr; \
 \
-	ctype             ct[ BLIS_STACK_BUF_MAX_SIZE \
+	      ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                      / sizeof( ctype_r ) ] \
 	                      __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	inc_t             rs_ct; \
-	inc_t             cs_ct; \
+	      inc_t       rs_ct; \
+	      inc_t       cs_ct; \
 \
-	ctype_r* restrict a_r       = ( ctype_r* )a; \
+	const ctype_r*    a_r       = ( ctype_r* )a; \
 \
-	ctype_r* restrict b_r       = ( ctype_r* )b; \
+	const ctype_r*    b_r       = ( ctype_r* )b; \
 \
-	ctype_r* restrict zero_r    = PASTEMAC(chr,0); \
+	const ctype_r*    zero_r    = PASTEMAC(chr,0); \
 \
-	ctype_r* restrict alpha_r   = &PASTEMAC(ch,real)( *alpha ); \
-/*
-	ctype_r* restrict alpha_i   = &PASTEMAC(ch,imag)( *alpha ); \
-*/ \
-\
-	ctype_r* restrict beta_r    = &PASTEMAC(ch,real)( *beta ); \
-	ctype_r* restrict beta_i    = &PASTEMAC(ch,imag)( *beta ); \
+	const ctype_r*    alpha_r   = &PASTEMAC(ch,real)( *alpha ); \
+	   /* ctype_r*    alpha_i   = &PASTEMAC(ch,imag)( *alpha ); */ \
 \
-	dim_t             m_use; \
-	dim_t             n_use; \
+	const ctype_r*    beta_r    = &PASTEMAC(ch,real)( *beta ); \
+	const ctype_r*    beta_i    = &PASTEMAC(ch,imag)( *beta ); \
 \
-	ctype_r*          c_use; \
-	inc_t             rs_c_use; \
-	inc_t             cs_c_use; \
-\
-	bool              using_ct; \
+	      bool        using_ct; \
 \
 	/* This virtual microkernel is used by ccr and crc mixed-domain cases
 	   when any of the following conditions are met:
@@ -150,9 +141,9 @@ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \
 		if ( col_pref ) { rs_ct = 1;  cs_ct = mr; } \
 		else            { rs_ct = nr; cs_ct = 1; } \
 \
-		c_use    = ( ctype_r* )ct; \
-		rs_c_use = rs_ct; \
-		cs_c_use = cs_ct; \
+		ctype_r* c_use    = ( ctype_r* )ct; \
+		inc_t    rs_c_use = rs_ct; \
+		inc_t    cs_c_use = cs_ct; \
 \
 		/* Convert the strides and corresponding microtile dimension from being
 		   in units of complex elements to be in units of real elements. */ \
@@ -209,11 +200,12 @@ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \
 		/* In the typical cases, we use the real part of beta and
 		   accumulate directly into the output matrix c. */ \
 \
-		c_use    = ( ctype_r* )c; \
-		rs_c_use = rs_c; \
-		cs_c_use = cs_c; \
-		m_use    = m; \
-		n_use    = n; \
+		ctype_r* c_use    = ( ctype_r* )c; \
+		inc_t    rs_c_use = rs_c; \
+		inc_t    cs_c_use = cs_c; \
+\
+		dim_t    m_use    = m; \
+		dim_t    n_use    = n; \
 \
 		/* Convert the strides and corresponding microtile dimension from being
 		   in units of complex elements to be in units of real elements. */ \
diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
index fd726da6f..ecf0265a1 100644
--- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
@@ -37,35 +37,36 @@
 
 typedef void (*xpbys_mxn_l_vft)
     (
-      doff_t diagoff,
-      dim_t  m,
-      dim_t  n,
-      void*  x, inc_t rs_x, inc_t cs_x,
-      void*  b,
-      void*  y, inc_t rs_y, inc_t cs_y
+            doff_t diagoff,
+            dim_t  m,
+            dim_t  n,
+      const void*  x, inc_t rs_x, inc_t cs_x,
+      const void*  b,
+            void*  y, inc_t rs_y, inc_t cs_y
     );
 
-#undef GENTFUNC
+#undef  GENTFUNC
 #define GENTFUNC(ctype,ch,op) \
 \
 BLIS_INLINE void PASTEMAC(ch,op) \
     ( \
-      doff_t diagoff, \
-      dim_t  m, \
-      dim_t  n, \
-      void*  x, inc_t rs_x, inc_t cs_x, \
-      void*  b, \
-      void*  y, inc_t rs_y, inc_t cs_y \
+            doff_t diagoff, \
+            dim_t  m, \
+            dim_t  n, \
+      const void*  x, inc_t rs_x, inc_t cs_x, \
+      const void*  b, \
+            void*  y, inc_t rs_y, inc_t cs_y \
     ) \
 { \
-	ctype* restrict x_cast = x; \
-	ctype* restrict b_cast = b; \
-	ctype* restrict y_cast = y; \
+	const ctype* restrict x_cast = x; \
+	const ctype* restrict b_cast = b; \
+	      ctype* restrict y_cast = y; \
 \
 	PASTEMAC3(ch,ch,ch,xpbys_mxn_l) \
 	( \
 	  diagoff, \
-	  m, n, \
+	  m, \
+	  n, \
 	  x_cast, rs_x, cs_x, \
 	  b_cast, \
 	  y_cast, rs_y,  cs_y \
@@ -76,6 +77,7 @@ INSERT_GENTFUNC_BASIC0(xpbys_mxn_l_fn);
 
 static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn);
 
+// -----------------------------------------------------------------------------
 
 void bli_gemmt_l_ker_var2
      (
diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2b.c b/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
index 7c50a4a54..d7af21d19 100644
--- a/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
@@ -37,35 +37,36 @@
 
 typedef void (*xpbys_mxn_l_vft)
     (
-      doff_t diagoff,
-      dim_t  m,
-      dim_t  n,
-      void*  x, inc_t rs_x, inc_t cs_x,
-      void*  b,
-      void*  y, inc_t rs_y, inc_t cs_y
+            doff_t diagoff,
+            dim_t  m,
+            dim_t  n,
+      const void*  x, inc_t rs_x, inc_t cs_x,
+      const void*  b,
+            void*  y, inc_t rs_y, inc_t cs_y
     );
 
-#undef GENTFUNC
+#undef  GENTFUNC
 #define GENTFUNC(ctype,ch,op) \
 \
 BLIS_INLINE void PASTEMAC(ch,op) \
     ( \
-      doff_t diagoff, \
-      dim_t  m, \
-      dim_t  n, \
-      void*  x, inc_t rs_x, inc_t cs_x, \
-      void*  b, \
-      void*  y, inc_t rs_y, inc_t cs_y \
+            doff_t diagoff, \
+            dim_t  m, \
+            dim_t  n, \
+      const void*  x, inc_t rs_x, inc_t cs_x, \
+      const void*  b, \
+            void*  y, inc_t rs_y, inc_t cs_y \
     ) \
 { \
-	ctype* restrict x_cast = x; \
-	ctype* restrict b_cast = b; \
-	ctype* restrict y_cast = y; \
+	const ctype* restrict x_cast = x; \
+	const ctype* restrict b_cast = b; \
+	      ctype* restrict y_cast = y; \
 \
 	PASTEMAC3(ch,ch,ch,xpbys_mxn_l) \
 	( \
 	  diagoff, \
-	  m, n, \
+	  m, \
+	  n, \
 	  x_cast, rs_x, cs_x, \
 	  b_cast, \
 	  y_cast, rs_y,  cs_y \
@@ -76,6 +77,7 @@ INSERT_GENTFUNC_BASIC0(xpbys_mxn_l_fn);
 
 static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn);
 
+// -----------------------------------------------------------------------------
 
 void bli_gemmt_l_ker_var2b
      (
diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
index 78d5b869d..7d396555d 100644
--- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
@@ -37,35 +37,36 @@
 
 typedef void (*xpbys_mxn_u_vft)
     (
-      doff_t diagoff,
-      dim_t  m,
-      dim_t  n,
-      void*  x, inc_t rs_x, inc_t cs_x,
-      void*  b,
-      void*  y, inc_t rs_y, inc_t cs_y
+            doff_t diagoff,
+            dim_t  m,
+            dim_t  n,
+      const void*  x, inc_t rs_x, inc_t cs_x,
+      const void*  b,
+            void*  y, inc_t rs_y, inc_t cs_y
     );
 
-#undef GENTFUNC
+#undef  GENTFUNC
 #define GENTFUNC(ctype,ch,op) \
 \
 BLIS_INLINE void PASTEMAC(ch,op) \
     ( \
-      doff_t diagoff, \
-      dim_t  m, \
-      dim_t  n, \
-      void*  x, inc_t rs_x, inc_t cs_x, \
-      void*  b, \
-      void*  y, inc_t rs_y, inc_t cs_y \
+            doff_t diagoff, \
+            dim_t  m, \
+            dim_t  n, \
+      const void*  x, inc_t rs_x, inc_t cs_x, \
+      const void*  b, \
+            void*  y, inc_t rs_y, inc_t cs_y \
     ) \
 { \
-	ctype* restrict x_cast = x; \
-	ctype* restrict b_cast = b; \
-	ctype* restrict y_cast = y; \
+	const ctype* restrict x_cast = x; \
+	const ctype* restrict b_cast = b; \
+	      ctype* restrict y_cast = y; \
 \
 	PASTEMAC3(ch,ch,ch,xpbys_mxn_u) \
 	( \
 	  diagoff, \
-	  m, n, \
+	  m, \
+	  n, \
 	  x_cast, rs_x, cs_x, \
 	  b_cast, \
 	  y_cast, rs_y,  cs_y \
@@ -76,6 +77,7 @@ INSERT_GENTFUNC_BASIC0(xpbys_mxn_u_fn);
 
 static xpbys_mxn_u_vft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn);
 
+// -----------------------------------------------------------------------------
 
 void bli_gemmt_u_ker_var2
      (
diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2b.c b/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
index 91275577a..e1f939c43 100644
--- a/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
@@ -37,35 +37,36 @@
 
 typedef void (*xpbys_mxn_u_vft)
     (
-      doff_t diagoff,
-      dim_t  m,
-      dim_t  n,
-      void*  x, inc_t rs_x, inc_t cs_x,
-      void*  b,
-      void*  y, inc_t rs_y, inc_t cs_y
+            doff_t diagoff,
+            dim_t  m,
+            dim_t  n,
+      const void*  x, inc_t rs_x, inc_t cs_x,
+      const void*  b,
+            void*  y, inc_t rs_y, inc_t cs_y
     );
 
-#undef GENTFUNC
+#undef  GENTFUNC
 #define GENTFUNC(ctype,ch,op) \
 \
 BLIS_INLINE void PASTEMAC(ch,op) \
     ( \
-      doff_t diagoff, \
-      dim_t  m, \
-      dim_t  n, \
-      void*  x, inc_t rs_x, inc_t cs_x, \
-      void*  b, \
-      void*  y, inc_t rs_y, inc_t cs_y \
+            doff_t diagoff, \
+            dim_t  m, \
+            dim_t  n, \
+      const void*  x, inc_t rs_x, inc_t cs_x, \
+      const void*  b, \
+            void*  y, inc_t rs_y, inc_t cs_y \
     ) \
 { \
-	ctype* restrict x_cast = x; \
-	ctype* restrict b_cast = b; \
-	ctype* restrict y_cast = y; \
+	const ctype* restrict x_cast = x; \
+	const ctype* restrict b_cast = b; \
+	      ctype* restrict y_cast = y; \
 \
 	PASTEMAC3(ch,ch,ch,xpbys_mxn_u) \
 	( \
 	  diagoff, \
-	  m, n, \
+	  m, \
+	  n, \
 	  x_cast, rs_x, cs_x, \
 	  b_cast, \
 	  y_cast, rs_y,  cs_y \
@@ -76,6 +77,7 @@ INSERT_GENTFUNC_BASIC0(xpbys_mxn_u_fn);
 
 static xpbys_mxn_u_vft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn);
 
+// -----------------------------------------------------------------------------
 
 void bli_gemmt_u_ker_var2b
      (
diff --git a/frame/include/bli_edge_case_macro_defs.h b/frame/include/bli_edge_case_macro_defs.h
index 6fc4e46c8..ad72e7514 100644
--- a/frame/include/bli_edge_case_macro_defs.h
+++ b/frame/include/bli_edge_case_macro_defs.h
@@ -43,14 +43,15 @@
 
 #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \
 \
-	PASTEMAC(ch,ctype)* restrict _beta   = beta; \
-	PASTEMAC(ch,ctype)* restrict _c      = c; \
-	const inc_t                  _rs_c   = rs_c; \
-	const inc_t                  _cs_c   = cs_c; \
-	PASTEMAC(ch,ctype)           _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,ctype) ) ] \
-	                                  __attribute__((aligned(alignment))); \
-	const inc_t                  _rs_ct  = row_major ? nr :  1; \
-	const inc_t                  _cs_ct  = row_major ?  1 : mr;
+	const PASTEMAC(ch,ctype)* restrict _beta   = beta; /* keep the enclosing scope's beta (now read-only) under a private name */ \
+	      PASTEMAC(ch,ctype)* restrict _c      = c; /* likewise for the output pointer c */ \
+	const inc_t                        _rs_c   = rs_c; /* ... and for its row stride */ \
+	const inc_t                        _cs_c   = cs_c; /* ... and for its column stride */ \
+	      PASTEMAC(ch,ctype)           _ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                                        / sizeof( PASTEMAC(ch,ctype) ) \
+	                                      ] __attribute__((aligned(alignment))); /* local buffer: BLIS_STACK_BUF_MAX_SIZE bytes expressed in elements, aligned as requested */ \
+	const inc_t                        _rs_ct  = row_major ? nr :  1; /* _ct row stride: nr if row_major, otherwise 1 */ \
+	const inc_t                        _cs_ct  = row_major ?  1 : mr; /* _ct column stride: 1 if row_major, otherwise mr */
 
 #define GEMM_UKR_SETUP_CT_POST(ch) \
 \
@@ -134,13 +135,14 @@
 
 #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \
 \
-	PASTEMAC(ch,ctype)* restrict _c      = c11; \
-	const inc_t                  _rs_c   = rs_c; \
-	const inc_t                  _cs_c   = cs_c; \
-	PASTEMAC(ch,ctype)           _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,ctype) ) ] \
-	                                  __attribute__((aligned(alignment))); \
-	const inc_t                  _rs_ct  = row_major ? nr :  1; \
-	const inc_t                  _cs_ct  = row_major ?  1 : mr;
+	      PASTEMAC(ch,ctype)* restrict _c      = c11; /* keep the enclosing scope's c11 pointer under a private name */ \
+	const inc_t                        _rs_c   = rs_c; /* ... together with its row stride */ \
+	const inc_t                        _cs_c   = cs_c; /* ... and its column stride */ \
+	      PASTEMAC(ch,ctype)           _ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                                        / sizeof( PASTEMAC(ch,ctype) ) \
+	                                      ] __attribute__((aligned(alignment))); /* local buffer: BLIS_STACK_BUF_MAX_SIZE bytes expressed in elements, aligned as requested */ \
+	const inc_t                        _rs_ct  = row_major ? nr :  1; /* _ct row stride: nr if row_major, otherwise 1 */ \
+	const inc_t                        _cs_ct  = row_major ?  1 : mr; /* _ct column stride: 1 if row_major, otherwise mr */
 
 #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \
 \
diff --git a/frame/include/level0/bli_copys_mxn.h b/frame/include/level0/bli_copys_mxn.h
index a8ead1c30..9dc688ac0 100644
--- a/frame/include/level0/bli_copys_mxn.h
+++ b/frame/include/level0/bli_copys_mxn.h
@@ -41,10 +41,81 @@
 // - The first char encodes the type of x.
 // - The second char encodes the type of y.
 
+#undef  BLIS_ENABLE_CR_CASES
+#define BLIS_ENABLE_CR_CASES 0
+// The flag is tested as a value inside if() below; defined as 0, both unit-stride branches are never taken.
+// -- bli_??copys_mxn --
+
+#undef  GENTFUNC2
+#define GENTFUNC2( ctypex, ctypey, chx, chy, opname, kername ) \
+/* Template: invoke the chx/chy scalar kernel 'kername' on every (x, y) element pair of an m x n matrix. */ \
+BLIS_INLINE void PASTEMAC2(chx,chy,opname) \
+     ( \
+       const dim_t   m, \
+       const dim_t   n, \
+       const ctypex* x, inc_t rs_x, inc_t cs_x, \
+             ctypey* y, inc_t rs_y, inc_t cs_y  \
+     ) \
+{ \
+	if      ( BLIS_ENABLE_CR_CASES && rs_x == 1 && rs_y == 1 ) \
+	{ /* Both row strides are 1: ii is the inner loop and needs no stride multiply. */ \
+		for ( dim_t jj = 0; jj < n; ++jj ) \
+		for ( dim_t ii = 0; ii < m; ++ii ) \
+		PASTEMAC2(chx,chy,kername)( *(x + ii + jj*cs_x), \
+		                            *(y + ii + jj*cs_y) ); \
+	} \
+	else if ( BLIS_ENABLE_CR_CASES && cs_x == 1 && cs_y == 1 ) \
+	{ /* Both column strides are 1: jj is the inner loop and needs no stride multiply. */ \
+		for ( dim_t ii = 0; ii < m; ++ii ) \
+		for ( dim_t jj = 0; jj < n; ++jj ) \
+		PASTEMAC2(chx,chy,kername)( *(x + ii*rs_x + jj), \
+		                            *(y + ii*rs_y + jj) ); \
+	} \
+	else \
+	{ /* General strides: element (ii,jj) is at base + ii*rs + jj*cs; jj outer, ii inner. */ \
+		for ( dim_t jj = 0; jj < n; ++jj ) \
+		for ( dim_t ii = 0; ii < m; ++ii ) \
+		PASTEMAC2(chx,chy,kername)( *(x + ii*rs_x + jj*cs_x), \
+		                            *(y + ii*rs_y + jj*cs_y) ); \
+	} \
+}
+
+INSERT_GENTFUNC2_BASIC ( copys_mxn, copys ) // presumably opname = copys_mxn, kername = copys; the INSERT macros are defined elsewhere
+INSERT_GENTFUNC2_MIX_DP( copys_mxn, copys ) // same template; by its name this covers the mixed-precision type pairs (confirm)
+
+
+// -- bli_?copys_mxn --
+// Single-type form: x and y share one type, and the body just forwards to the two-type function.
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+/* Generates the one-character-prefixed opname for type ch. */ \
+BLIS_INLINE void PASTEMAC(ch,opname) \
+     ( \
+       const dim_t  m, \
+       const dim_t  n, \
+       const ctype* x, inc_t rs_x, inc_t cs_x, \
+             ctype* y, inc_t rs_y, inc_t cs_y  \
+     ) \
+{ \
+	PASTEMAC2(ch,ch,opname)( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); /* same type char passed for both x and y */ \
+}
+
+INSERT_GENTFUNC_BASIC0( copys_mxn ) // opname = copys_mxn; the set of types instantiated is decided by the INSERT macro (defined elsewhere)
+
+
+
+
+#if 0
+
 // xy = ?s
 
-BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                           float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_sscopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+             float*    restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 #ifdef BLIS_ENABLE_CR_CASES
 	if ( rs_x == 1 && rs_y == 1 )
@@ -70,8 +141,13 @@ BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float*    restri
 		             *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                           float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_dscopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+             float*    restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 #ifdef BLIS_ENABLE_CR_CASES
 	if ( rs_x == 1 && rs_y == 1 )
@@ -97,8 +173,13 @@ BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double*   restri
 		             *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                           float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_cscopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+             float*    restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 #ifdef BLIS_ENABLE_CR_CASES
 	if ( rs_x == 1 && rs_y == 1 )
@@ -124,8 +205,13 @@ BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restri
 		             *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                           float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_zscopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+             float*    restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 #ifdef BLIS_ENABLE_CR_CASES
 	if ( rs_x == 1 && rs_y == 1 )
@@ -154,8 +240,13 @@ BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restri
 
 // xy = ?d
 
-BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                           double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_sdcopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+             double*   restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 #ifdef BLIS_ENABLE_CR_CASES
 	if ( rs_x == 1 && rs_y == 1 )
@@ -181,8 +272,13 @@ BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float*    restri
 		             *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                           double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_ddcopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+             double*   restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 #ifdef BLIS_ENABLE_CR_CASES
 	if ( rs_x == 1 && rs_y == 1 )
@@ -208,8 +304,13 @@ BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double*   restri
 		             *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                           double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_cdcopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+             double*   restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 #ifdef BLIS_ENABLE_CR_CASES
 	if ( rs_x == 1 && rs_y == 1 )
@@ -235,8 +336,13 @@ BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restri
 		             *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                           double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_zdcopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+             double*   restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 #ifdef BLIS_ENABLE_CR_CASES
 	if ( rs_x == 1 && rs_y == 1 )
@@ -265,8 +371,13 @@ BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restri
 
 // xy = ?c
 
-BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                           scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_sccopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 #ifdef BLIS_ENABLE_CR_CASES
 	if ( rs_x == 1 && rs_y == 1 )
@@ -292,8 +403,13 @@ BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float*    restri
 		             *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                           scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_dccopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 #ifdef BLIS_ENABLE_CR_CASES
 	if ( rs_x == 1 && rs_y == 1 )
@@ -319,8 +435,13 @@ BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double*   restri
 		             *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                           scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_cccopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 #ifdef BLIS_ENABLE_CR_CASES
 	if ( rs_x == 1 && rs_y == 1 )
@@ -346,8 +467,13 @@ BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restri
 		             *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                           scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_zccopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 #ifdef BLIS_ENABLE_CR_CASES
 	if ( rs_x == 1 && rs_y == 1 )
@@ -376,8 +502,13 @@ BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restri
 
 // xy = ?c
 
-BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                           dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_szcopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 #ifdef BLIS_ENABLE_CR_CASES
 	if ( rs_x == 1 && rs_y == 1 )
@@ -403,8 +534,13 @@ BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float*    restri
 		             *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                           dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_dzcopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 #ifdef BLIS_ENABLE_CR_CASES
 	if ( rs_x == 1 && rs_y == 1 )
@@ -430,8 +566,13 @@ BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double*   restri
 		             *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                           dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_czcopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 #ifdef BLIS_ENABLE_CR_CASES
 	if ( rs_x == 1 && rs_y == 1 )
@@ -457,8 +598,13 @@ BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restri
 		             *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                           dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_zzcopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 #ifdef BLIS_ENABLE_CR_CASES
 	if ( rs_x == 1 && rs_y == 1 )
@@ -485,26 +631,46 @@ BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restri
 	}
 }
 
-
-BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_scopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+             float*    restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
 }
-BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_dcopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+             double*   restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
 }
-BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_ccopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
 }
-BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_zcopys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
 }
+#endif
 
 #endif
diff --git a/frame/include/level0/bli_xpbys_mxn.h b/frame/include/level0/bli_xpbys_mxn.h
index f23df17a2..446e45d7f 100644
--- a/frame/include/level0/bli_xpbys_mxn.h
+++ b/frame/include/level0/bli_xpbys_mxn.h
@@ -41,13 +41,100 @@
 // - The first char encodes the type of x.
 // - The second char encodes the type of b.
 // - The third char encodes the type of y.
+// - We only implement cases where typeof(b) == typeof(y).
+
+#undef  BLIS_ENABLE_CR_CASES
+#define BLIS_ENABLE_CR_CASES 0
+// The flag is tested as a value inside if() below; defined as 0, both unit-stride branches are never taken.
+// -- bli_???xpbys_mxn --
+
+#undef  GENTFUNC2
+#define GENTFUNC2( ctypex, ctypey, chx, chy, opname, kername ) \
+/* Template: invoke the scalar kernel 'kername' on (x element, *beta, y element) for every entry of an m x n matrix. */ \
+BLIS_INLINE void PASTEMAC3(chx,chy,chy,opname) \
+     ( \
+       const dim_t   m, \
+       const dim_t   n, \
+       const ctypex* x, inc_t rs_x, inc_t cs_x, \
+       const ctypey* beta, /* scalar; always has the same type as y */ \
+             ctypey* y, inc_t rs_y, inc_t cs_y  \
+     ) \
+{ \
+	/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
+	if ( PASTEMAC(chy,eq0)( *beta ) ) \
+	{ \
+		PASTEMAC2(chx,chy,copys_mxn)( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
+		return; \
+	} \
+/* From here on beta is nonzero, so the kernel is applied to each element pair. */ \
+	if      ( BLIS_ENABLE_CR_CASES && rs_x == 1 && rs_y == 1 ) \
+	{ /* Both row strides are 1: ii is the inner loop and needs no stride multiply. */ \
+		for ( dim_t jj = 0; jj < n; ++jj ) \
+		for ( dim_t ii = 0; ii < m; ++ii ) \
+		PASTEMAC3(chx,chy,chy,kername) \
+		( \
+		  *(x + ii + jj*cs_x), *beta, \
+		  *(y + ii + jj*cs_y) \
+		); \
+	} \
+	else if ( BLIS_ENABLE_CR_CASES && cs_x == 1 && cs_y == 1 ) \
+	{ /* Both column strides are 1: jj is the inner loop and needs no stride multiply. */ \
+		for ( dim_t ii = 0; ii < m; ++ii ) \
+		for ( dim_t jj = 0; jj < n; ++jj ) \
+		PASTEMAC3(chx,chy,chy,kername) \
+		( \
+		  *(x + ii*rs_x + jj), *beta, \
+		  *(y + ii*rs_y + jj) \
+		); \
+	} \
+	else \
+	{ /* General strides: element (ii,jj) is at base + ii*rs + jj*cs; jj outer, ii inner. */ \
+		for ( dim_t jj = 0; jj < n; ++jj ) \
+		for ( dim_t ii = 0; ii < m; ++ii ) \
+		PASTEMAC3(chx,chy,chy,kername) \
+		( \
+		  *(x + ii*rs_x + jj*cs_x), *beta, \
+		  *(y + ii*rs_y + jj*cs_y) \
+		); \
+	} \
+}
+
+INSERT_GENTFUNC2_BASIC ( xpbys_mxn, xpbys ) // presumably opname = xpbys_mxn, kername = xpbys; the INSERT macros are defined elsewhere
+INSERT_GENTFUNC2_MIX_DP( xpbys_mxn, xpbys ) // same template; by its name this covers the mixed-precision type pairs (confirm)
+
+
+// -- bli_?xpbys_mxn --
+// Single-type form: x, beta and y share one type, and the body just forwards to the three-character function.
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+/* Generates the one-character-prefixed opname for type ch. */ \
+BLIS_INLINE void PASTEMAC(ch,opname) \
+     ( \
+       const dim_t  m, \
+       const dim_t  n, \
+       const ctype* x, inc_t rs_x, inc_t cs_x, \
+       const ctype* beta, \
+             ctype* y, inc_t rs_y, inc_t cs_y  \
+     ) \
+{ \
+    PASTEMAC3(ch,ch,ch,opname)( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); /* same type char passed for x, beta and y */ \
+}
+
+INSERT_GENTFUNC_BASIC0( xpbys_mxn ) // opname = xpbys_mxn; the set of types instantiated is decided by the INSERT macro (defined elsewhere)
 
 
+
+#if 0
 // -- (xby) = (?ss) ------------------------------------------------------------
 
-BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                            float*    restrict beta,
-                                                            float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_sssxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+       const float*    restrict beta,
+             float*    restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
 	if ( bli_seq0( *beta ) )
@@ -80,9 +167,14 @@ BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float*    restr
 		              *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                            float*    restrict beta,
-                                                            float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_dssxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+       const float*    restrict beta,
+             float*    restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
 	if ( bli_seq0( *beta ) )
@@ -115,9 +207,14 @@ BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double*   restr
 		              *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                            float*    restrict beta,
-                                                            float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_cssxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+       const float*    restrict beta,
+             float*    restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
 	if ( bli_seq0( *beta ) )
@@ -150,9 +247,14 @@ BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restr
 		              *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                            float*    restrict beta,
-                                                            float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_zssxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+       const float*    restrict beta,
+             float*    restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
 	if ( bli_seq0( *beta ) )
@@ -188,9 +290,14 @@ BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restr
 
 // -- (xby) = (?dd) ------------------------------------------------------------
 
-BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                            double*   restrict beta,
-                                                            double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_sddxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+       const double*   restrict beta,
+             double*   restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
 	if ( bli_deq0( *beta ) )
@@ -223,9 +330,14 @@ BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float*    restr
 		              *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                            double*   restrict beta,
-                                                            double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_dddxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+       const double*   restrict beta,
+             double*   restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
 	if ( bli_deq0( *beta ) )
@@ -258,9 +370,14 @@ BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double*   restr
 		              *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                            double*   restrict beta,
-                                                            double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_cddxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+       const double*   restrict beta,
+             double*   restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
 	if ( bli_deq0( *beta ) )
@@ -293,9 +410,14 @@ BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restr
 		              *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                            double*   restrict beta,
-                                                            double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_zddxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+       const double*   restrict beta,
+             double*   restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
 	if ( bli_deq0( *beta ) )
@@ -331,9 +453,14 @@ BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restr
 
 // -- (xby) = (?cc) ------------------------------------------------------------
 
-BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                            scomplex* restrict beta,
-                                                            scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_sccxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+       const scomplex* restrict beta,
+             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
 	if ( bli_ceq0( *beta ) )
@@ -366,9 +493,14 @@ BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float*    restr
 		              *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                            scomplex* restrict beta,
-                                                            scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_dccxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+       const scomplex* restrict beta,
+             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
 	if ( bli_ceq0( *beta ) )
@@ -401,9 +533,14 @@ BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double*   restr
 		              *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                            scomplex* restrict beta,
-                                                            scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_cccxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+       const scomplex* restrict beta,
+             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
 	if ( bli_ceq0( *beta ) )
@@ -436,9 +573,14 @@ BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restr
 		              *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                            scomplex* restrict beta,
-                                                            scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_zccxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+       const scomplex* restrict beta,
+             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
 	if ( bli_ceq0( *beta ) )
@@ -474,9 +616,14 @@ BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restr
 
 // -- (xby) = (?zz) ------------------------------------------------------------
 
-BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                            dcomplex* restrict beta,
-                                                            dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_szzxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+       const dcomplex* restrict beta,
+             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
 	if ( bli_zeq0( *beta ) )
@@ -509,9 +656,14 @@ BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float*    restr
 		              *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                            dcomplex* restrict beta,
-                                                            dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_dzzxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+       const dcomplex* restrict beta,
+             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
 	if ( bli_zeq0( *beta ) )
@@ -544,9 +696,14 @@ BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double*   restr
 		              *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                            dcomplex* restrict beta,
-                                                            dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_czzxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+       const dcomplex* restrict beta,
+             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
 	if ( bli_zeq0( *beta ) )
@@ -579,9 +736,14 @@ BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restr
 		              *(y + ii*rs_y + jj*cs_y) );
 	}
 }
-BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                            dcomplex* restrict beta,
-                                                            dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_zzzxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+       const dcomplex* restrict beta,
+             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
 	if ( bli_zeq0( *beta ) )
@@ -617,30 +779,52 @@ BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restr
 
 
-BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          float*    restrict beta,
-                                                          float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+
+BLIS_INLINE void bli_sxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+       const float*    restrict beta,
+             float*    restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
 }
-BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          double*   restrict beta,
-                                                          double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_dxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+       const double*   restrict beta,
+             double*   restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
 }
-BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          scomplex* restrict beta,
-                                                          scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_cxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+       const scomplex* restrict beta,
+             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
 }
-BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          dcomplex* restrict beta,
-                                                          dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+BLIS_INLINE void bli_zxpbys_mxn
+     (
+       const dim_t m,
+       const dim_t n,
+       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+       const dcomplex* restrict beta,
+             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
+     )
 {
 	bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
 }
+#endif
 
 
 #endif
diff --git a/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c b/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c
index 8f1122b45..fc6755ae5 100644
--- a/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c
@@ -44,15 +44,15 @@
 
 void bli_dpackm_armsve256_int_8xk
      (
-       conj_t           conja,
-       pack_t           schema,
-       dim_t            cdim_,
-       dim_t            n_,
-       dim_t            n_max_,
-       double* restrict kappa,
-       double* restrict a, inc_t inca_, inc_t lda_,
-       double* restrict p,              inc_t ldp_,
-       cntx_t*          cntx
+             conj_t  conja,
+             pack_t  schema,
+             dim_t   cdim_,
+             dim_t   n_,
+             dim_t   n_max_,
+       const double* kappa,
+       const double* a, inc_t inca_, inc_t lda_,
+             double* p,              inc_t ldp_,
+       const cntx_t* cntx
      )
 {
     const int64_t cdim  = cdim_;
diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
index 5866ed26f..b17aacfbb 100644
--- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
@@ -43,15 +43,15 @@
 
 void bli_dpackm_armsve512_asm_10xk
      (
-       conj_t           conja,
-       pack_t           schema,
-       dim_t            cdim_,
-       dim_t            n_,
-       dim_t            n_max_,
-       double* restrict kappa,
-       double* restrict a, inc_t inca_, inc_t lda_,
-       double* restrict p,              inc_t ldp_,
-       cntx_t*          cntx
+             conj_t  conja,
+             pack_t  schema,
+             dim_t   cdim_,
+             dim_t   n_,
+             dim_t   n_max_,
+       const double* kappa,
+       const double* a, inc_t inca_, inc_t lda_,
+             double* p,              inc_t ldp_,
+       const cntx_t* cntx
      )
 {
     const int64_t cdim  = cdim_;
diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
index 88ccb4b8e..370b400d2 100644
--- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
@@ -42,15 +42,15 @@
 
 void bli_dpackm_armsve512_asm_16xk
      (
-       conj_t           conja,
-       pack_t           schema,
-       dim_t            cdim_,
-       dim_t            n_,
-       dim_t            n_max_,
-       double* restrict kappa,
-       double* restrict a, inc_t inca_, inc_t lda_,
-       double* restrict p,              inc_t ldp_,
-       cntx_t*          cntx
+             conj_t  conja,
+             pack_t  schema,
+             dim_t   cdim_,
+             dim_t   n_,
+             dim_t   n_max_,
+       const double* kappa,
+       const double* a, inc_t inca_, inc_t lda_,
+             double* p,              inc_t ldp_,
+       const cntx_t* cntx
      )
 {
     const int64_t cdim  = cdim_;
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
index 5723a10f3..452fbaeef 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
@@ -45,16 +45,16 @@
 
 void bli_cgemm_armsve_asm_2vx10_unindexed
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       scomplex*  restrict alpha,
-       scomplex*  restrict a,
-       scomplex*  restrict b,
-       scomplex*  restrict beta,
-       scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const scomplex*  alpha, \
+       const scomplex*  a, \
+       const scomplex*  b, \
+       const scomplex*  beta, \
+             scomplex*  c, inc_t rs_c0, inc_t cs_c0, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      )
 {
   const void* a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
index 3bc6de506..0226a252d 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
@@ -45,16 +45,16 @@
 
 void bli_dgemm_armsve_asm_2vx10_unindexed
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const double*    alpha, \
+       const double*    a, \
+       const double*    b, \
+       const double*    beta, \
+             double*    c, inc_t rs_c0, inc_t cs_c0, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      )
 {
   const void* a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
index 4aec5c4df..ede6e170a 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
@@ -45,16 +45,16 @@
 
 void bli_sgemm_armsve_asm_2vx10_unindexed
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       float*     restrict alpha,
-       float*     restrict a,
-       float*     restrict b,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const float*     alpha, \
+       const float*     a, \
+       const float*     b, \
+       const float*     beta, \
+             float*     c, inc_t rs_c0, inc_t cs_c0, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      )
 {
   const void* a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
index 2c70f486f..0af877cc9 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
@@ -45,16 +45,16 @@
 
 void bli_zgemm_armsve_asm_2vx10_unindexed
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       dcomplex*  restrict alpha,
-       dcomplex*  restrict a,
-       dcomplex*  restrict b,
-       dcomplex*  restrict beta,
-       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const dcomplex*  alpha, \
+       const dcomplex*  a, \
+       const dcomplex*  b, \
+       const dcomplex*  beta, \
+             dcomplex*  c, inc_t rs_c0, inc_t cs_c0, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      )
 {
   const void* a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c
index 4dec190e0..4bfd77d56 100644
--- a/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c
+++ b/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c
@@ -34,30 +34,41 @@
 
 #include "blis.h"
 
-extern
-void bli_sgemm_armv7a_ker_4x4
-     (
-       uint32_t            k,
-       float*     restrict alpha,
-       float*     restrict a,
-       float*     restrict b,
-       float*     restrict beta,
-       float*     restrict c, uint32_t rs_c, uint32_t cs_c,
-       auxinfo_t*          data
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname, suf ) \
+\
+extern \
+void PASTEMAC2(ch,opname,suf) \
+     ( \
+             uint32_t   k, \
+       const ctype*     alpha, \
+       const ctype*     a, \
+       const ctype*     b, \
+       const ctype*     beta, \
+             ctype*     c, uint32_t rs_c, uint32_t cs_c, \
+             auxinfo_t* data  \
      );
 
+GENTPROT( float,    s, gemm_armv7a_ker_, 4x4 )
+GENTPROT( double,   d, gemm_armv7a_ker_, 4x4 )
+GENTPROT( scomplex, c, gemm_armv7a_ker_, 2x2 )
+GENTPROT( dcomplex, z, gemm_armv7a_ker_, 2x2 )
+
+
+
+
 void bli_sgemm_armv7a_asm_4x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       float*     restrict alpha,
-       float*     restrict a,
-       float*     restrict b,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const float*     alpha,
+       const float*     a,
+       const float*     b,
+       const float*     beta,
+             float*     c, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
@@ -68,31 +79,18 @@ void bli_sgemm_armv7a_asm_4x4
 }
 
 
-
-extern
-void bli_dgemm_armv7a_ker_4x4
-     (
-       uint32_t            k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, uint32_t rs_c, uint32_t cs_c,
-       auxinfo_t*          data
-     );
-
 void bli_dgemm_armv7a_asm_4x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
@@ -103,31 +101,18 @@ void bli_dgemm_armv7a_asm_4x4
 }
 
 
-
-extern
-void bli_cgemm_armv7a_ker_2x2
-     (
-       uint32_t            k,
-       scomplex*  restrict alpha,
-       scomplex*  restrict a,
-       scomplex*  restrict b,
-       scomplex*  restrict beta,
-       scomplex*  restrict c, uint32_t rs_c, uint32_t cs_c,
-       auxinfo_t*          data
-     );
-
 void bli_cgemm_armv7a_asm_2x2
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       scomplex*  restrict alpha,
-       scomplex*  restrict a,
-       scomplex*  restrict b,
-       scomplex*  restrict beta,
-       scomplex*  restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const scomplex*  alpha,
+       const scomplex*  a,
+       const scomplex*  b,
+       const scomplex*  beta,
+             scomplex*  c, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
@@ -137,32 +122,18 @@ void bli_cgemm_armv7a_asm_2x2
 	GEMM_UKR_FLUSH_CT( c );
 }
 
-
-
-extern
-void bli_zgemm_armv7a_ker_2x2
-     (
-       uint32_t            k,
-       dcomplex*  restrict alpha,
-       dcomplex*  restrict a,
-       dcomplex*  restrict b,
-       dcomplex*  restrict beta,
-       dcomplex*  restrict c, uint32_t rs_c, uint32_t cs_c,
-       auxinfo_t*          data
-     );
-
 void bli_zgemm_armv7a_asm_2x2
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       dcomplex*  restrict alpha,
-       dcomplex*  restrict a,
-       dcomplex*  restrict b,
-       dcomplex*  restrict beta,
-       dcomplex*  restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const dcomplex*  alpha,
+       const dcomplex*  a,
+       const dcomplex*  b,
+       const dcomplex*  beta,
+             dcomplex*  c, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
diff --git a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
index b1e9481a3..7678e2c59 100644
--- a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
+++ b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
@@ -37,16 +37,16 @@
 
 void bli_sgemm_armv7a_int_4x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       float*     restrict alpha,
-       float*     restrict a,
-       float*     restrict b,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const float*     alpha,
+       const float*     a,
+       const float*     b,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
@@ -243,16 +243,16 @@ void bli_sgemm_armv7a_int_4x4
 
 void bli_dgemm_armv7a_int_4x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c
index 3eefd9ddc..90b8b9b2b 100644
--- a/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c
+++ b/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c
@@ -49,15 +49,15 @@
 
 void bli_dpackm_armv8a_int_6xk
      (
-       conj_t              conja,
-       pack_t              schema,
-       dim_t               cdim0,
-       dim_t               k0,
-       dim_t               k0_max,
-       double*    restrict kappa,
-       double*    restrict a, inc_t inca0, inc_t lda0,
-       double*    restrict p,              inc_t ldp0,
-       cntx_t*             cntx
+             conj_t  conja,
+             pack_t  schema,
+             dim_t   cdim0,
+             dim_t   k0,
+             dim_t   k0_max,
+       const double* kappa,
+       const double* a, inc_t inca0, inc_t lda0,
+             double* p,              inc_t ldp0,
+       const cntx_t* cntx
      )
 {
   // This is the panel dimension assumed by the packm kernel.
@@ -67,8 +67,9 @@ void bli_dpackm_armv8a_int_6xk
   // different size than is expected by load instructions.
   uint64_t       k_iter = k0 / 2;
   uint64_t       k_left = k0 % 2;
-  double*        a_loc  = a;
-  double*        p_loc  = p;
+
+  const double*  a_loc  = a;
+        double*  p_loc  = p;
 
   // NOTE: For the purposes of the comments in this packm kernel, we
   // interpret inca and lda as rs_a and cs_a, respectively, and similarly
diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c
index 51b064a24..ae9e090e2 100644
--- a/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c
+++ b/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c
@@ -49,15 +49,15 @@
 
 void bli_dpackm_armv8a_int_8xk
      (
-       conj_t              conja,
-       pack_t              schema,
-       dim_t               cdim0,
-       dim_t               k0,
-       dim_t               k0_max,
-       double*    restrict kappa,
-       double*    restrict a, inc_t inca0, inc_t lda0,
-       double*    restrict p,              inc_t ldp0,
-       cntx_t*             cntx
+             conj_t  conja,
+             pack_t  schema,
+             dim_t   cdim0,
+             dim_t   k0,
+             dim_t   k0_max,
+       const double* kappa,
+       const double* a, inc_t inca0, inc_t lda0,
+             double* p,              inc_t ldp0,
+       const cntx_t* cntx
      )
 {
   // This is the panel dimension assumed by the packm kernel.
@@ -67,8 +67,9 @@ void bli_dpackm_armv8a_int_8xk
   // different size than is expected by load instructions.
   uint64_t       k_iter = k0 / 2;
   uint64_t       k_left = k0 % 2;
-  double*        a_loc  = a;
-  double*        p_loc  = p;
+
+  const double*  a_loc  = a;
+        double*  p_loc  = p;
 
   // NOTE: For the purposes of the comments in this packm kernel, we
   // interpret inca and lda as rs_a and cs_a, respectively, and similarly
diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c
index f915215e1..f4a793db0 100644
--- a/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c
+++ b/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c
@@ -49,15 +49,15 @@
 
 void bli_spackm_armv8a_int_12xk
      (
-       conj_t              conja,
-       pack_t              schema,
-       dim_t               cdim0,
-       dim_t               k0,
-       dim_t               k0_max,
-       float*     restrict kappa,
-       float*     restrict a, inc_t inca0, inc_t lda0,
-       float*     restrict p,              inc_t ldp0,
-       cntx_t*             cntx
+             conj_t  conja,
+             pack_t  schema,
+             dim_t   cdim0,
+             dim_t   k0,
+             dim_t   k0_max,
+       const float*  kappa,
+       const float*  a, inc_t inca0, inc_t lda0,
+             float*  p,              inc_t ldp0,
+       const cntx_t* cntx
      )
 {
   // This is the panel dimension assumed by the packm kernel.
@@ -67,8 +67,9 @@ void bli_spackm_armv8a_int_12xk
   // different size than is expected by load instructions.
   uint64_t       k_iter = k0 / 4;
   uint64_t       k_left = k0 % 4;
-  float*         a_loc  = a;
-  float*         p_loc  = p;
+
+  const float*   a_loc  = a;
+        float*   p_loc  = p;
 
   // NOTE: For the purposes of the comments in this packm kernel, we
   // interpret inca and lda as rs_a and cs_a, respectively, and similarly
diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c
index b508b2a0e..2fd1ec9d3 100644
--- a/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c
+++ b/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c
@@ -49,15 +49,15 @@
 
 void bli_spackm_armv8a_int_8xk
      (
-       conj_t              conja,
-       pack_t              schema,
-       dim_t               cdim0,
-       dim_t               k0,
-       dim_t               k0_max,
-       float*     restrict kappa,
-       float*     restrict a, inc_t inca0, inc_t lda0,
-       float*     restrict p,              inc_t ldp0,
-       cntx_t*             cntx
+             conj_t  conja,
+             pack_t  schema,
+             dim_t   cdim0,
+             dim_t   k0,
+             dim_t   k0_max,
+       const float*  kappa,
+       const float*  a, inc_t inca0, inc_t lda0,
+             float*  p,              inc_t ldp0,
+       const cntx_t* cntx
      )
 {
   // This is the panel dimension assumed by the packm kernel.
@@ -67,8 +67,9 @@ void bli_spackm_armv8a_int_8xk
   // different size than is expected by load instructions.
   uint64_t       k_iter = k0 / 4;
   uint64_t       k_left = k0 % 4;
-  float*         a_loc  = a;
-  float*         p_loc  = p;
+
+  const float*   a_loc  = a;
+        float*   p_loc  = p;
 
   // NOTE: For the purposes of the comments in this packm kernel, we
   // interpret inca and lda as rs_a and cs_a, respectively, and similarly
diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
index 12c670a9f..c4970ebb5 100644
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
@@ -67,20 +67,20 @@
 
 void bli_sgemm_armv8a_asm_8x12
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       float*     restrict alpha,
-       float*     restrict a,
-       float*     restrict b,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const float*     alpha,
+       const float*     a,
+       const float*     b,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
-	void* a_next = bli_auxinfo_next_a( data );
-	void* b_next = bli_auxinfo_next_b( data );
+	const void* a_next = bli_auxinfo_next_a( data );
+	const void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
@@ -790,16 +790,16 @@ void bli_sgemm_armv8a_asm_8x12
  */
 void bli_dgemm_armv8a_asm_6x8
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 #ifdef DISPLAY_DEBUG_INFO
@@ -817,8 +817,8 @@ void bli_dgemm_armv8a_asm_6x8
 #endif
 
 
-	void* a_next = bli_auxinfo_next_a( data );
-	void* b_next = bli_auxinfo_next_b( data );
+	const void* a_next = bli_auxinfo_next_a( data );
+	const void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c
index b0df23fb0..0737c7719 100644
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c
@@ -134,16 +134,16 @@
 
 void bli_sgemm_armv8a_asm_12x8r
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       float*     restrict alpha,
-       float*     restrict a,
-       float*     restrict b,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const float*     alpha,
+       const float*     a,
+       const float*     b,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   const void* a_next = bli_auxinfo_next_a( data );
@@ -379,16 +379,16 @@ LABEL(SEND_WRITE_MEM)
  */
 void bli_dgemm_armv8a_asm_8x6r
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   const void* a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c
index 847bfe8da..9af32439d 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c
@@ -112,18 +112,18 @@
 BLIS_INLINE
 void bli_dgemmsup_rd_armv8a_inline_3x4m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   assert( n0 == 4 );
@@ -154,18 +154,18 @@ void bli_dgemmsup_rd_armv8a_inline_3x4m
 BLIS_INLINE
 void bli_dgemmsup_rd_armv8a_inline_3xcm
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   for ( ; m0 > 0; m0 -= 3 )
@@ -187,18 +187,18 @@ void bli_dgemmsup_rd_armv8a_inline_3xcm
 
 void bli_dgemmsup_rd_armv8a_asm_6x8m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   if ( n0 != 8 )
@@ -277,8 +277,8 @@ void bli_dgemmsup_rd_armv8a_asm_6x8m
   // LLVM has very bad routing ability for inline asm.
   // Limit number of registers in case of Clang compilation.
 #ifndef __clang__
-  void*    a_next = bli_auxinfo_next_a( data );
-  void*    b_next = bli_auxinfo_next_b( data );
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
 #endif
 
   // Typecast local copies of integers in case dim_t and inc_t are a
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c
index c4fb7cac6..51b6f75c0 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c
@@ -105,18 +105,18 @@
 BLIS_INLINE
 void bli_dgemmsup_rd_armv8a_inline_4x8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   assert( m0 == 4 );
@@ -145,18 +145,18 @@ void bli_dgemmsup_rd_armv8a_inline_4x8n
 BLIS_INLINE
 void bli_dgemmsup_rd_armv8a_inline_3x8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   assert( m0 == 3 );
@@ -186,18 +186,18 @@ void bli_dgemmsup_rd_armv8a_inline_3x8n
 BLIS_INLINE
 void bli_dgemmsup_rd_armv8a_inline_rx8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   assert( m0 <= 2 );
@@ -219,18 +219,18 @@ void bli_dgemmsup_rd_armv8a_inline_rx8n
 
 void bli_dgemmsup_rd_armv8a_asm_6x8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   if ( m0 != 6 )
@@ -286,8 +286,8 @@ void bli_dgemmsup_rd_armv8a_asm_6x8n
   // LLVM has very bad routing ability for inline asm.
   // Limit number of registers in case of Clang compilation.
 #ifndef __clang__
-  void*    a_next = bli_auxinfo_next_a( data );
-  void*    b_next = bli_auxinfo_next_b( data );
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
 #endif
 
   // Typecast local copies of integers in case dim_t and inc_t are a
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c
index b7d1a7d0f..9669400ce 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c
@@ -106,18 +106,18 @@
  */
 void bli_dgemmsup_rv_armv8a_asm_4x8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   // Fixme: This uker has no dispatching for unalighed sizes.
@@ -128,8 +128,8 @@ void bli_dgemmsup_rv_armv8a_asm_4x8n
   // LLVM has very bad routing ability for inline asm.
   // Limit number of registers in case of Clang compilation.
 #ifndef __clang__
-  void*    a_next = bli_auxinfo_next_a( data );
-  void*    b_next = bli_auxinfo_next_b( data );
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
 #endif
   uint64_t ps_b   = bli_auxinfo_ps_b( data );
 
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c
index eaddfd076..aa57d5850 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c
@@ -122,18 +122,18 @@
 
 void bli_dgemmsup_rv_armv8a_asm_5x8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   assert( m0 == 5 );
@@ -141,8 +141,8 @@ void bli_dgemmsup_rv_armv8a_asm_5x8n
   // LLVM has very bad routing ability for inline asm.
   // Limit number of registers in case of Clang compilation.
 #ifndef __clang__
-  void*    a_next = bli_auxinfo_next_a( data );
-  void*    b_next = bli_auxinfo_next_b( data );
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
 #endif
   uint64_t ps_b   = bli_auxinfo_ps_b( data );
 
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c
index 91d6ca596..040431e1e 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c
@@ -122,18 +122,18 @@
 
 void bli_dgemmsup_rv_armv8a_asm_6x5m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   assert( n0 == 5 );
@@ -141,8 +141,8 @@ void bli_dgemmsup_rv_armv8a_asm_6x5m
   // LLVM has very bad routing ability for inline asm.
   // Limit number of registers in case of Clang compilation.
 #ifndef __clang__
-  void*    a_next = bli_auxinfo_next_a( data );
-  void*    b_next = bli_auxinfo_next_b( data );
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
 #endif
   uint64_t ps_a   = bli_auxinfo_ps_a( data );
 
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c
index 4273030dd..43f1c193c 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c
@@ -112,18 +112,18 @@
 
 void bli_dgemmsup_rv_armv8a_asm_6x6m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   assert( n0 == 6 );
@@ -131,8 +131,8 @@ void bli_dgemmsup_rv_armv8a_asm_6x6m
   // LLVM has very bad routing ability for inline asm.
   // Limit number of registers in case of Clang compilation.
 #ifndef __clang__
-  void*    a_next = bli_auxinfo_next_a( data );
-  void*    b_next = bli_auxinfo_next_b( data );
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
 #endif
   uint64_t ps_a   = bli_auxinfo_ps_a( data );
 
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c
index afdd13e28..78801c8ef 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c
@@ -84,10 +84,12 @@
 #define DGEMM_LOAD1V_load(V1,ADDR,IMM) \
 " ldr  q"#V1", ["#ADDR", #"#IMM"] \n\t"
 
-// #define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM)
-// #define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \
-//   DGEMM_LOAD1V_load(V1,ADDR,IMM) \
-//   DGEMM_LOAD1V_load(V2,ADDR,IMM+16)
+#if 0
+ #define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM)
+ #define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \
+   DGEMM_LOAD1V_load(V1,ADDR,IMM) \
+   DGEMM_LOAD1V_load(V2,ADDR,IMM+16)
+#endif
 
 #define DGEMM_LOAD1V_G_noload(V1,ADDR,ST)
 #define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \
@@ -138,18 +140,18 @@
 
 void bli_dgemmsup_rv_armv8a_asm_6x7m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   assert( n0 == 7 );
@@ -157,8 +159,8 @@ void bli_dgemmsup_rv_armv8a_asm_6x7m
   // LLVM has very bad routing ability for inline asm.
   // Limit number of registers in case of Clang compilation.
 #ifndef __clang__
-  void*    a_next = bli_auxinfo_next_a( data );
-  void*    b_next = bli_auxinfo_next_b( data );
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
 #endif
   uint64_t ps_a   = bli_auxinfo_ps_a( data );
 
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c
index b912480fa..a3d25f860 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c
@@ -129,18 +129,18 @@
  */
 void bli_dgemmsup_rv_armv8a_asm_6x8m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   if ( n0 != 8 )
@@ -215,8 +215,8 @@ void bli_dgemmsup_rv_armv8a_asm_6x8m
   // LLVM has very bad routing ability for inline asm.
   // Limit number of registers in case of Clang compilation.
 #ifndef __clang__
-  void*    a_next = bli_auxinfo_next_a( data );
-  void*    b_next = bli_auxinfo_next_b( data );
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
 #endif
   uint64_t ps_a   = bli_auxinfo_ps_a( data );
 
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c
index 910e07dbb..9c8ccdd12 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c
@@ -129,18 +129,18 @@
  */
 void bli_dgemmsup_rv_armv8a_asm_6x8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   if ( m0 != 6 )
@@ -201,8 +201,8 @@ void bli_dgemmsup_rv_armv8a_asm_6x8n
   // LLVM has very bad routing ability for inline asm.
   // Limit number of registers in case of Clang compilation.
 #ifndef __clang__
-  void*    a_next = bli_auxinfo_next_a( data );
-  void*    b_next = bli_auxinfo_next_b( data );
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
 #endif
   uint64_t ps_b   = bli_auxinfo_ps_b( data );
 
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c
index d3af5781c..ea93ecf5f 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c
@@ -100,18 +100,18 @@
  */
 void bli_dgemmsup_rv_armv8a_asm_8x4m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   // Fixme: This uker has no dispatching for unalighed sizes.
@@ -119,8 +119,9 @@ void bli_dgemmsup_rv_armv8a_asm_8x4m
   //  and cannot be registered in configurations.
   assert( n0 == 4 );
 
-  void*    a_next = bli_auxinfo_next_a( data );
-  void*    b_next = bli_auxinfo_next_b( data );
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
+
   uint64_t ps_a   = bli_auxinfo_ps_a( data );
 
   // Typecast local copies of integers in case dim_t and inc_t are a
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c
index aa53de55c..42cde2a8b 100644
--- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c
+++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c
@@ -84,18 +84,18 @@
 
 void bli_dgemmsup_rd_armv8a_asm_3x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   assert( m0 == 3 );
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c
index b10546764..84276879b 100644
--- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c
+++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c
@@ -108,18 +108,18 @@
 
 void bli_dgemmsup_rd_armv8a_asm_6x3
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   assert( m0 == 6 );
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c
index 5438fdfc2..d4a17e064 100644
--- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c
+++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c
@@ -59,26 +59,26 @@
  */
 void bli_dgemmsup_rd_armv8a_int_2x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a, inc_t cs_a,
-       double*    restrict b, inc_t rs_b, inc_t cs_b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a, inc_t cs_a,
+       const double*    b, inc_t rs_b, inc_t cs_b,
+       const double*    beta,
+             double*    c, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   assert( m0 <= 2 );
   assert( n0 <= 8 );
 
-  double *a_loc = a;
-  double *b_loc = b;
-  double *c_loc = c;
+  const double *a_loc = a;
+  const double *b_loc = b;
+        double *c_loc = c;
 
   uint64_t k_mker = k0 / 2;
   uint64_t k_left = k0 % 2;
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c
index 89817d6d5..c58ecc2fc 100644
--- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c
+++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c
@@ -59,18 +59,18 @@
  */
 void bli_dgemmsup_rd_armv8a_int_3x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a, inc_t cs_a,
-       double*    restrict b, inc_t rs_b, inc_t cs_b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a, inc_t cs_a,
+       const double*    b, inc_t rs_b, inc_t cs_b,
+       const double*    beta,
+             double*    c, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   // if ( m0 == 3 && n0 == 4 )
@@ -88,9 +88,9 @@ void bli_dgemmsup_rd_armv8a_int_3x4
   assert( m0 <= 3 );
   assert( n0 <= 4 );
 
-  double *a_loc = a;
-  double *b_loc = b;
-  double *c_loc = c;
+  const double *a_loc = a;
+  const double *b_loc = b;
+        double *c_loc = c;
 
   uint64_t k_mker = k0 / 2;
   uint64_t k_left = k0 % 2;
diff --git a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c
index 931f3ed66..9e6c38352 100644
--- a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c
+++ b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c
@@ -59,26 +59,29 @@
  */
 void bli_dgemmsup_rv_armv8a_int_3x8mn
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a0, inc_t rs_a, inc_t cs_a,
-       double*    restrict b0, inc_t rs_b, inc_t cs_b,
-       double*    restrict beta,
-       double*    restrict c0, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a0, inc_t rs_a, inc_t cs_a,
+       const double*    b0, inc_t rs_b, inc_t cs_b,
+       const double*    beta,
+             double*    c0, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   // Unlike the rd case, this rv case does not impose restriction upon
   //  maximal m & n.
 
-  double *a_loc;
-  double *b_loc, *b_in;
-  double *c_loc, *c_in;
+  const double *a_loc;
+  const double *b_loc;
+        double *c_loc;
+
+  const double *b_in;
+        double *c_in;
 
   dim_t n;
   dim_t k;
diff --git a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c
index f850b0fa6..1ddd582ae 100644
--- a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c
+++ b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c
@@ -59,26 +59,29 @@
  */
 void bli_dgemmsup_rv_armv8a_int_6x4mn
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a0, inc_t rs_a, inc_t cs_a,
-       double*    restrict b0, inc_t rs_b, inc_t cs_b,
-       double*    restrict beta,
-       double*    restrict c0, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a0, inc_t rs_a, inc_t cs_a,
+       const double*    b0, inc_t rs_b, inc_t cs_b,
+       const double*    beta,
+             double*    c0, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
   // Unlike the rd case, this rv case does not impose restriction upon
   //  maximal m & n.
 
-  double *a_loc;
-  double *b_loc, *b_in;
-  double *c_loc, *c_in;
+  const double *a_loc;
+  const double *b_loc;
+        double *c_loc;
+
+  const double *b_in;
+        double *c_in;
 
   dim_t n;
   dim_t k;
diff --git a/kernels/bgq/1/bli_axpyv_bgq_int.c b/kernels/bgq/1/bli_axpyv_bgq_int.c
index 1d233f5c1..6822c57b2 100644
--- a/kernels/bgq/1/bli_axpyv_bgq_int.c
+++ b/kernels/bgq/1/bli_axpyv_bgq_int.c
@@ -36,12 +36,12 @@
 
 void bli_daxpyv_bgq_int
      (
-       conj_t           conjx,
-       dim_t            n,
-       double* restrict alpha,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       cntx_t*          cntx
+             conj_t  conjx,
+             dim_t   n,
+       const double* alpha,
+       const double* x, inc_t incx,
+             double* y, inc_t incy,
+       const cntx_t* cntx
      )
 {
 	if ( bli_zero_dim1( n ) ) return;
diff --git a/kernels/bgq/1/bli_dotv_bgq_int.c b/kernels/bgq/1/bli_dotv_bgq_int.c
index eb6805a4c..a4329df32 100644
--- a/kernels/bgq/1/bli_dotv_bgq_int.c
+++ b/kernels/bgq/1/bli_dotv_bgq_int.c
@@ -36,13 +36,13 @@
 
 void bli_ddotv_bgq_int
      (
-       conj_t           conjx,
-       conj_t           conjy,
-       dim_t            n,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       double* restrict rho,
-       cntx_t*          cntx
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   n,
+       const double* x, inc_t incx,
+       const double* y, inc_t incy,
+             double* rho,
+       const cntx_t* cntx
      )
 {
 	bool   use_ref = FALSE;
diff --git a/kernels/bgq/1f/bli_axpyf_bgq_int.c b/kernels/bgq/1f/bli_axpyf_bgq_int.c
index cf0fe633c..52f9378be 100644
--- a/kernels/bgq/1f/bli_axpyf_bgq_int.c
+++ b/kernels/bgq/1f/bli_axpyf_bgq_int.c
@@ -37,15 +37,15 @@
 
 void bli_daxpyf_bgq_int
      (
-       conj_t           conja,
-       conj_t           conjx,
-       dim_t            m,
-       dim_t            b_n,
-       double* restrict alpha,
-       double* restrict a, inc_t inca, inc_t lda,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       cntx_t*          cntx
+             conj_t  conja,
+             conj_t  conjx,
+             dim_t   m,
+             dim_t   b_n,
+       const double* alpha,
+       const double* a, inc_t inca, inc_t lda,
+       const double* x, inc_t incx,
+             double* y, inc_t incy,
+       const cntx_t* cntx
      )
 {
 	const dim_t fusefac = 8;
@@ -70,15 +70,15 @@ void bli_daxpyf_bgq_int
 	dim_t m_run       =  m / 4;
 	dim_t m_left      =  m % 4;
 
-	double * a0   = a + 0*lda;
-	double * a1   = a + 1*lda;
-	double * a2   = a + 2*lda;
-	double * a3   = a + 3*lda;
-	double * a4   = a + 4*lda;
-	double * a5   = a + 5*lda;
-	double * a6   = a + 6*lda;
-	double * a7   = a + 7*lda;
-	double * y0   = y;
+	const double* a0   = a + 0*lda;
+	const double* a1   = a + 1*lda;
+	const double* a2   = a + 2*lda;
+	const double* a3   = a + 3*lda;
+	const double* a4   = a + 4*lda;
+	const double* a5   = a + 5*lda;
+	const double* a6   = a + 6*lda;
+	const double* a7   = a + 7*lda;
+	      double* y0   = y;
 
 	double chi0 = *(x + 0*incx);
 	double chi1 = *(x + 1*incx);
diff --git a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c
index 2adbc4c36..4e9dc27d2 100644
--- a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c
+++ b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c
@@ -56,16 +56,16 @@
 
 void bli_dgemm_bgq_int_8x8
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
     GEMM_UKR_SETUP_CT_ANY( d, 8, 8, false );
@@ -220,23 +220,23 @@ void printvec(vector4double v)
 
 void bli_zgemm_bgq_int_4x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       dcomplex*  restrict alpha,
-       dcomplex*  restrict a,
-       dcomplex*  restrict b,
-       dcomplex*  restrict beta,
-       dcomplex*  restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const dcomplex*  alpha,
+       const dcomplex*  a,
+       const dcomplex*  b,
+       const dcomplex*  beta,
+             dcomplex*  c, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
     GEMM_UKR_SETUP_CT_ANY( z, 4, 4, false );
 
-    double* a_d = ( double* )a;
-    double* b_d = ( double* )b;
-    double* c_d = ( double* )c;
+    const double* a_d = ( double* )a;
+    const double* b_d = ( double* )b;
+          double* c_d = ( double* )c;
 
     //Registers for storing C.
     //2 2x4 subblocks of C, c0, and c1
diff --git a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
index bef7232dd..d26c2c6b8 100644
--- a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
+++ b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
@@ -90,16 +90,16 @@
 
 void bli_sgemm_bulldozer_asm_8x8_fma4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       float*     restrict alpha,
-       float*     restrict a,
-       float*     restrict b,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const float*     alpha,
+       const float*     a,
+       const float*     b,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
@@ -582,16 +582,16 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
 
 void bli_dgemm_bulldozer_asm_4x6_fma4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
@@ -802,20 +802,20 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
 
 void bli_cgemm_bulldozer_asm_8x4_fma4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       scomplex*  restrict alpha,
-       scomplex*  restrict a,
-       scomplex*  restrict b,
-       scomplex*  restrict beta,
-       scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const scomplex*  alpha,
+       const scomplex*  a,
+       const scomplex*  b,
+       const scomplex*  beta,
+             scomplex*  c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
-	//void*   a_next = bli_auxinfo_next_a( data );
-	void*   b_next = bli_auxinfo_next_b( data );
+	//const void* a_next = bli_auxinfo_next_a( data );
+	const void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
@@ -1326,20 +1326,20 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
 
 void bli_zgemm_bulldozer_asm_4x4_fma4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       dcomplex*  restrict alpha,
-       dcomplex*  restrict a,
-       dcomplex*  restrict b,
-       dcomplex*  restrict beta,
-       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const dcomplex*  alpha,
+       const dcomplex*  a,
+       const dcomplex*  b,
+       const dcomplex*  beta,
+             dcomplex*  c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
-	//void*   a_next = bli_auxinfo_next_a( data );
-	//void*   b_next = bli_auxinfo_next_b( data );
+	//const void* a_next = bli_auxinfo_next_a( data );
+	//const void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c
index e5d077409..843874c9a 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c
@@ -43,15 +43,15 @@ PACKM_KER_PROT( scomplex, c, packm_3xk_haswell_ref )
 
 void bli_cpackm_haswell_asm_3xk
      (
-       conj_t              conja,
-       pack_t              schema,
-       dim_t               cdim0,
-       dim_t               k0,
-       dim_t               k0_max,
-       scomplex*  restrict kappa,
-       scomplex*  restrict a, inc_t inca0, inc_t lda0,
-       scomplex*  restrict p,              inc_t ldp0,
-       cntx_t*             cntx
+             conj_t    conja,
+             pack_t    schema,
+             dim_t     cdim0,
+             dim_t     k0,
+             dim_t     k0_max,
+       const scomplex* kappa,
+       const scomplex* a, inc_t inca0, inc_t lda0,
+             scomplex* p,              inc_t ldp0,
+       const cntx_t*   cntx
      )
 {
 #if 0
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c
index fa8fabe9d..25fc8bf05 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c
@@ -43,15 +43,15 @@ PACKM_KER_PROT( scomplex, c, packm_8xk_haswell_ref )
 
 void bli_cpackm_haswell_asm_8xk
      (
-       conj_t              conja,
-       pack_t              schema,
-       dim_t               cdim0,
-       dim_t               k0,
-       dim_t               k0_max,
-       scomplex*  restrict kappa,
-       scomplex*  restrict a, inc_t inca0, inc_t lda0,
-       scomplex*  restrict p,              inc_t ldp0,
-       cntx_t*             cntx
+             conj_t    conja,
+             pack_t    schema,
+             dim_t     cdim0,
+             dim_t     k0,
+             dim_t     k0_max,
+       const scomplex* kappa,
+       const scomplex* a, inc_t inca0, inc_t lda0,
+             scomplex* p,              inc_t ldp0,
+       const cntx_t*   cntx
      )
 {
 #if 0
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c
index 47fc5b98d..4cfc241d3 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c
@@ -43,15 +43,15 @@ PACKM_KER_PROT( double,   d, packm_6xk_haswell_ref )
 
 void bli_dpackm_haswell_asm_6xk
      (
-       conj_t              conja,
-       pack_t              schema,
-       dim_t               cdim0,
-       dim_t               k0,
-       dim_t               k0_max,
-       double*    restrict kappa,
-       double*    restrict a, inc_t inca0, inc_t lda0,
-       double*    restrict p,              inc_t ldp0,
-       cntx_t*             cntx
+             conj_t  conja,
+             pack_t  schema,
+             dim_t   cdim0,
+             dim_t   k0,
+             dim_t   k0_max,
+       const double* kappa,
+       const double* a, inc_t inca0, inc_t lda0,
+             double* p,              inc_t ldp0,
+       const cntx_t* cntx
      )
 {
 #if 0
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c
index 9f07e37a4..7fdb9b14f 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c
@@ -43,15 +43,15 @@ PACKM_KER_PROT( double,   d, packm_8xk_haswell_ref )
 
 void bli_dpackm_haswell_asm_8xk
      (
-       conj_t              conja,
-       pack_t              schema,
-       dim_t               cdim0,
-       dim_t               k0,
-       dim_t               k0_max,
-       double*    restrict kappa,
-       double*    restrict a, inc_t inca0, inc_t lda0,
-       double*    restrict p,              inc_t ldp0,
-       cntx_t*             cntx
+             conj_t  conja,
+             pack_t  schema,
+             dim_t   cdim0,
+             dim_t   k0,
+             dim_t   k0_max,
+       const double* kappa,
+       const double* a, inc_t inca0, inc_t lda0,
+             double* p,              inc_t ldp0,
+       const cntx_t* cntx
      )
 {
 #if 0
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c
index 27b2c71ee..cc7d52134 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c
@@ -43,15 +43,15 @@ PACKM_KER_PROT( double,   d, packm_16xk_haswell_ref )
 
 void bli_spackm_haswell_asm_16xk
      (
-       conj_t              conja,
-       pack_t              schema,
-       dim_t               cdim0,
-       dim_t               k0,
-       dim_t               k0_max,
-       float*     restrict kappa,
-       float*     restrict a, inc_t inca0, inc_t lda0,
-       float*     restrict p,              inc_t ldp0,
-       cntx_t*             cntx
+             conj_t  conja,
+             pack_t  schema,
+             dim_t   cdim0,
+             dim_t   k0,
+             dim_t   k0_max,
+       const float*  kappa,
+       const float*  a, inc_t inca0, inc_t lda0,
+             float*  p,              inc_t ldp0,
+       const cntx_t* cntx
      )
 {
 #if 0
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c
index a073eca62..bb464a6b5 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c
@@ -43,15 +43,15 @@ PACKM_KER_PROT( double,   d, packm_6xk_haswell_ref )
 
 void bli_spackm_haswell_asm_6xk
      (
-       conj_t              conja,
-       pack_t              schema,
-       dim_t               cdim0,
-       dim_t               k0,
-       dim_t               k0_max,
-       float*     restrict kappa,
-       float*     restrict a, inc_t inca0, inc_t lda0,
-       float*     restrict p,              inc_t ldp0,
-       cntx_t*             cntx
+             conj_t  conja,
+             pack_t  schema,
+             dim_t   cdim0,
+             dim_t   k0,
+             dim_t   k0_max,
+       const float*  kappa,
+       const float*  a, inc_t inca0, inc_t lda0,
+             float*  p,              inc_t ldp0,
+       const cntx_t* cntx
      )
 {
 #if 0
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c
index 5e65565d5..bf63592d0 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c
@@ -43,15 +43,15 @@ PACKM_KER_PROT( dcomplex, z, packm_3xk_haswell_ref )
 
 void bli_zpackm_haswell_asm_3xk
      (
-       conj_t              conja,
-       pack_t              schema,
-       dim_t               cdim0,
-       dim_t               k0,
-       dim_t               k0_max,
-       dcomplex*  restrict kappa,
-       dcomplex*  restrict a, inc_t inca0, inc_t lda0,
-       dcomplex*  restrict p,              inc_t ldp0,
-       cntx_t*             cntx
+             conj_t    conja,
+             pack_t    schema,
+             dim_t     cdim0,
+             dim_t     k0,
+             dim_t     k0_max,
+       const dcomplex* kappa,
+       const dcomplex* a, inc_t inca0, inc_t lda0,
+             dcomplex* p,              inc_t ldp0,
+       const cntx_t*   cntx
      )
 {
 #if 0
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c
index d118081cc..87b596ad8 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c
@@ -43,15 +43,15 @@ PACKM_KER_PROT( dcomplex, z, packm_4xk_haswell_ref )
 
 void bli_zpackm_haswell_asm_4xk
      (
-       conj_t              conja,
-       pack_t              schema,
-       dim_t               cdim0,
-       dim_t               k0,
-       dim_t               k0_max,
-       dcomplex*  restrict kappa,
-       dcomplex*  restrict a, inc_t inca0, inc_t lda0,
-       dcomplex*  restrict p,              inc_t ldp0,
-       cntx_t*             cntx
+             conj_t    conja,
+             pack_t    schema,
+             dim_t     cdim0,
+             dim_t     k0,
+             dim_t     k0_max,
+       const dcomplex* kappa,
+       const dcomplex* a, inc_t inca0, inc_t lda0,
+             dcomplex* p,              inc_t ldp0,
+       const cntx_t*   cntx
      )
 {
 #if 0
diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
index b7be1c674..617690e4d 100644
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
@@ -79,16 +79,16 @@
 
 void bli_sgemm_haswell_asm_6x16
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       float*     restrict alpha,
-       float*     restrict a,
-       float*     restrict b,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const float*     alpha,
+       const float*     a,
+       const float*     b,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -759,16 +759,16 @@ void bli_sgemm_haswell_asm_6x16
 
 void bli_dgemm_haswell_asm_6x8
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -1318,16 +1318,16 @@ void bli_dgemm_haswell_asm_6x8
 
 void bli_cgemm_haswell_asm_3x8
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       scomplex*  restrict alpha,
-       scomplex*  restrict a,
-       scomplex*  restrict b,
-       scomplex*  restrict beta,
-       scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const scomplex*  alpha,
+       const scomplex*  a,
+       const scomplex*  b,
+       const scomplex*  beta,
+             scomplex*  c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -1711,16 +1711,16 @@ void bli_cgemm_haswell_asm_3x8
 
 void bli_zgemm_haswell_asm_3x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       dcomplex*  restrict alpha,
-       dcomplex*  restrict a,
-       dcomplex*  restrict b,
-       dcomplex*  restrict beta,
-       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const dcomplex*  alpha,
+       const dcomplex*  a,
+       const dcomplex*  b,
+       const dcomplex*  beta,
+             dcomplex*  c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
index 261054499..5f1ca3e97 100644
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
@@ -78,16 +78,16 @@
 
 void bli_sgemm_haswell_asm_16x6
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       float*     restrict alpha,
-       float*     restrict a,
-       float*     restrict b,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const float*     alpha,
+       const float*     a,
+       const float*     b,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -462,16 +462,16 @@ void bli_sgemm_haswell_asm_16x6
 
 void bli_dgemm_haswell_asm_8x6
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -832,16 +832,16 @@ void bli_dgemm_haswell_asm_8x6
 
 void bli_cgemm_haswell_asm_8x3
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       scomplex*  restrict alpha,
-       scomplex*  restrict a,
-       scomplex*  restrict b,
-       scomplex*  restrict beta,
-       scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const scomplex*  alpha,
+       const scomplex*  a,
+       const scomplex*  b,
+       const scomplex*  beta,
+             scomplex*  c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -1223,16 +1223,16 @@ void bli_cgemm_haswell_asm_8x3
 
 void bli_zgemm_haswell_asm_4x3
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       dcomplex*  restrict alpha,
-       dcomplex*  restrict a,
-       dcomplex*  restrict b,
-       dcomplex*  restrict beta,
-       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const dcomplex*  alpha,
+       const dcomplex*  a,
+       const dcomplex*  b,
+       const dcomplex*  beta,
+             dcomplex*  c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
index 915fbf08f..f97dc6c67 100644
--- a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
@@ -58,17 +58,17 @@
 
 void bli_sgemmtrsm_l_haswell_asm_6x16
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a10,
-       float*     restrict a11,
-       float*     restrict b01,
-       float*     restrict b11,
-       float*     restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k0, \
+       const float*     alpha, \
+       const float*     a10, \
+       const float*     a11, \
+       const float*     b01, \
+             float*     b11, \
+             float*     c11, inc_t rs_c0, inc_t cs_c0, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -849,17 +849,17 @@ void bli_sgemmtrsm_l_haswell_asm_6x16
 
 void bli_dgemmtrsm_l_haswell_asm_6x8
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a10,
-       double*    restrict a11,
-       double*    restrict b01,
-       double*    restrict b11,
-       double*    restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k0, \
+       const double*    alpha, \
+       const double*    a10, \
+       const double*    a11, \
+       const double*    b01, \
+             double*    b11, \
+             double*    c11, inc_t rs_c0, inc_t cs_c0, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
index 63c42785c..7cbd4cb12 100644
--- a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
@@ -58,17 +58,17 @@
 
 void bli_sgemmtrsm_u_haswell_asm_6x16
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a10,
-       float*     restrict a11,
-       float*     restrict b01,
-       float*     restrict b11,
-       float*     restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k0, \
+       const float*     alpha, \
+       const float*     a12, \
+       const float*     a11, \
+       const float*     b21, \
+             float*     b11, \
+             float*     c11, inc_t rs_c0, inc_t cs_c0, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -90,8 +90,8 @@ void bli_sgemmtrsm_u_haswell_asm_6x16
 	vzeroall() // zero all xmm/ymm registers.
 
 
-	mov(var(a10), rax) // load address of a.
-	mov(var(b01), rbx) // load address of b.
+	mov(var(a12), rax) // load address of a.
+	mov(var(b21), rbx) // load address of b.
 
 	add(imm(32*4), rbx)
 	 // initialize loop by pre-loading
@@ -285,7 +285,7 @@ void bli_sgemmtrsm_u_haswell_asm_6x16
 
 	label(.SPOSTACCUM)
 
-	 // ymm4..ymm15 = -a10 * b01
+	 // ymm4..ymm15 = -a12 * b21
 
 
@@ -304,7 +304,7 @@ void bli_sgemmtrsm_u_haswell_asm_6x16
 	mov(rdx, r14) // save rdx = b11+8*cs_b for later
 
 
-	 // b11 := alpha * b11 - a10 * b01
+	 // b11 := alpha * b11 - a12 * b21
 	vfmsub231ps(mem(rcx), ymm3, ymm4)
 	add(rdi, rcx)
 	vfmsub231ps(mem(rdx), ymm3, ymm5)
@@ -816,8 +816,8 @@ void bli_sgemmtrsm_u_haswell_asm_6x16
 	: // input operands
 	  [k_iter] "m" (k_iter), // 0
 	  [k_left] "m" (k_left), // 1
-	  [a10]    "m" (a10),    // 2
-	  [b01]    "m" (b01),    // 3
+	  [a12]    "m" (a12),    // 2
+	  [b21]    "m" (b21),    // 3
 	  [beta]   "m" (beta),   // 4
 	  [alpha]  "m" (alpha),  // 5
 	  [a11]    "m" (a11),    // 6
@@ -854,17 +854,17 @@ void bli_sgemmtrsm_u_haswell_asm_6x16
 
 void bli_dgemmtrsm_u_haswell_asm_6x8
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a10,
-       double*    restrict a11,
-       double*    restrict b01,
-       double*    restrict b11,
-       double*    restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k0, \
+       const double*    alpha, \
+       const double*    a12, \
+       const double*    a11, \
+       const double*    b21, \
+             double*    b11, \
+             double*    c11, inc_t rs_c0, inc_t cs_c0, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -886,8 +886,8 @@ void bli_dgemmtrsm_u_haswell_asm_6x8
 	vzeroall() // zero all xmm/ymm registers.
 
 
-	mov(var(a10), rax) // load address of a.
-	mov(var(b01), rbx) // load address of b.
+	mov(var(a12), rax) // load address of a.
+	mov(var(b21), rbx) // load address of b.
 
 	add(imm(32*4), rbx)
 	 // initialize loop by pre-loading
@@ -1083,7 +1083,7 @@ void bli_dgemmtrsm_u_haswell_asm_6x8
 
 	label(.DPOSTACCUM)
 
-	 // ymm4..ymm15 = -a10 * b01
+	 // ymm4..ymm15 = -a12 * b21
 
 
@@ -1103,7 +1103,7 @@ void bli_dgemmtrsm_u_haswell_asm_6x8
 	mov(rdx, r14) // save rdx = b11+4*cs_b for later
 
 
-	 // b11 := alpha * b11 - a10 * b01
+	 // b11 := alpha * b11 - a12 * b21
 	vfmsub231pd(mem(rcx), ymm3, ymm4)
 	add(rdi, rcx)
 	vfmsub231pd(mem(rdx), ymm3, ymm5)
@@ -1575,8 +1575,8 @@ void bli_dgemmtrsm_u_haswell_asm_6x8
 	: // input operands
 	  [k_iter] "m" (k_iter), // 0
 	  [k_left] "m" (k_left), // 1
-	  [a10]    "m" (a10),    // 2
-	  [b01]    "m" (b01),    // 3
+	  [a12]    "m" (a12),    // 2
+	  [b21]    "m" (b21),    // 3
 	  [beta]   "m" (beta),   // 4
 	  [alpha]  "m" (alpha),  // 5
 	  [a11]    "m" (a11),    // 6
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
index 637e5917b..a5e912dd0 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
@@ -68,18 +68,18 @@ GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
 
 void bli_dgemmsup_rd_haswell_asm_6x8m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	uint64_t n_left = n0 % 8;
@@ -88,9 +88,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 	// dispatch other 6x?m kernels, as needed.
 	if ( n_left )
 	{
-		double* restrict cij = c;
-		double* restrict bj  = b;
-		double* restrict ai  = a;
+		      double* cij = c;
+		const double* bj  = b;
+		const double* ai  = a;
 
 		if ( 4 <= n_left )
 		{
@@ -690,12 +690,12 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 8;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 8;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
+		      double* cij = c + i_edge*rs_c;
+		const double* bj  = b;
+		const double* ai  = a + i_edge*rs_a;
 
 		if ( 2 == m_left )
 		{
@@ -725,18 +725,18 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 
 void bli_dgemmsup_rd_haswell_asm_6x4m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1262,12 +1262,12 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 4;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 4;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
+		      double* cij = c + i_edge*rs_c;
+		const double* bj  = b;
+		const double* ai  = a + i_edge*rs_a;
 
 		if ( 2 == m_left )
 		{
@@ -1297,18 +1297,18 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 
 void bli_dgemmsup_rd_haswell_asm_6x2m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1875,12 +1875,12 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 2;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 2;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
+		      double* cij = c + i_edge*rs_c;
+		const double* bj  = b;
+		const double* ai  = a + i_edge*rs_a;
 
 		if ( 3 <= m_left )
 		{
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
index d9dad5fea..fdbbcaa2d 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
@@ -68,18 +68,18 @@ GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
 
 void bli_dgemmsup_rd_haswell_asm_6x8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	uint64_t m_left = m0 % 6;
@@ -88,9 +88,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	// dispatch other ?x8m kernels, as needed.
 	if ( m_left )
 	{
-		double* restrict cij = c;
-		double* restrict bj  = b;
-		double* restrict ai  = a;
+		      double* cij = c;
+		const double* bj  = b;
+		const double* ai  = a;
 
 #if 1
 		// We add special handling for slightly inflated MR blocksizes
@@ -751,9 +751,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 		const dim_t      mr_cur = 6;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		double* restrict cij = c + j_edge*cs_c;
-		double* restrict ai  = a;
-		double* restrict bj  = b + j_edge*cs_b;
+		      double* cij = c + j_edge*cs_c;
+		const double* ai  = a;
+		const double* bj  = b + j_edge*cs_b;
 
 		if ( 2 <= n_left )
 		{
@@ -794,18 +794,18 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 
 void bli_dgemmsup_rd_haswell_asm_3x8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1337,9 +1337,9 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 		const dim_t      mr_cur = 3;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		double* restrict cij = c + j_edge*cs_c;
-		double* restrict ai  = a;
-		double* restrict bj  = b + j_edge*cs_b;
+		      double* cij = c + j_edge*cs_c;
+		const double* ai  = a;
+		const double* bj  = b + j_edge*cs_b;
 
 		if ( 2 <= n_left )
 		{
@@ -1381,18 +1381,18 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 
 void bli_dgemmsup_rd_haswell_asm_2x8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1868,9 +1868,9 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 		const dim_t      mr_cur = 2;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		double* restrict cij = c + j_edge*cs_c;
-		double* restrict ai  = a;
-		double* restrict bj  = b + j_edge*cs_b;
+		      double* cij = c + j_edge*cs_c;
+		const double* ai  = a;
+		const double* bj  = b + j_edge*cs_b;
 
 		if ( 2 <= n_left )
 		{
@@ -1911,18 +1911,18 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 
 void bli_dgemmsup_rd_haswell_asm_1x8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2342,9 +2342,9 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 		const dim_t      mr_cur = 1;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		double* restrict cij = c + j_edge*cs_c;
-		double* restrict ai  = a;
-		double* restrict bj  = b + j_edge*cs_b;
+		      double* cij = c + j_edge*cs_c;
+		const double* ai  = a;
+		const double* bj  = b + j_edge*cs_b;
 
 		if ( 2 <= n_left )
 		{
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c
index fcf448423..ee4761005 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c
@@ -68,18 +68,18 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 
 void bli_sgemmsup_rd_haswell_asm_6x16m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	uint64_t n_left = n0 % 16;
@@ -88,9 +88,9 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 	// dispatch other 6x?m kernels, as needed.
 	if ( n_left )
 	{
-		float* restrict cij = c;
-		float* restrict bj  = b;
-		float* restrict ai  = a;
+		      float* cij = c;
+		const float* bj  = b;
+		const float* ai  = a;
 
 		if ( 12 <= n_left )
 		{
@@ -725,12 +725,12 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 16;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 16;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		float* restrict cij = c + i_edge*rs_c;
-		float* restrict bj  = b;
-		float* restrict ai  = a + i_edge*rs_a;
+		      float* cij = c + i_edge*rs_c;
+		const float* bj  = b;
+		const float* ai  = a + i_edge*rs_a;
 
 		if ( 2 == m_left )
 		{
@@ -760,18 +760,18 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 
 void bli_sgemmsup_rd_haswell_asm_6x12m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1336,12 +1336,12 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 12;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 12;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		float* restrict cij = c + i_edge*rs_c;
-		float* restrict bj  = b;
-		float* restrict ai  = a + i_edge*rs_a;
+		      float* cij = c + i_edge*rs_c;
+		const float* bj  = b;
+		const float* ai  = a + i_edge*rs_a;
 
 		if ( 2 == m_left )
 		{
@@ -1373,18 +1373,18 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 
 void bli_sgemmsup_rd_haswell_asm_6x8m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1949,12 +1949,12 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 8;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 8;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		float* restrict cij = c + i_edge*rs_c;
-		float* restrict bj  = b;
-		float* restrict ai  = a + i_edge*rs_a;
+		      float* cij = c + i_edge*rs_c;
+		const float* bj  = b;
+		const float* ai  = a + i_edge*rs_a;
 
 		if ( 2 == m_left )
 		{
@@ -1984,18 +1984,18 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 
 void bli_sgemmsup_rd_haswell_asm_6x4m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2533,12 +2533,12 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 4;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 4;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		float* restrict cij = c + i_edge*rs_c;
-		float* restrict bj  = b;
-		float* restrict ai  = a + i_edge*rs_a;
+		      float* cij = c + i_edge*rs_c;
+		const float* bj  = b;
+		const float* ai  = a + i_edge*rs_a;
 
 		if ( 2 == m_left )
 		{
@@ -2569,18 +2569,18 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 
 void bli_sgemmsup_rd_haswell_asm_6x2m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -3165,12 +3165,12 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 2;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 2;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		float* restrict cij = c + i_edge*rs_c;
-		float* restrict bj  = b;
-		float* restrict ai  = a + i_edge*rs_a;
+		      float* cij = c + i_edge*rs_c;
+		const float* bj  = b;
+		const float* ai  = a + i_edge*rs_a;
 
 		if ( 3 <= m_left )
 		{
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
index 33b2df4b4..b288eab33 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
@@ -68,18 +68,18 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 
 void bli_sgemmsup_rd_haswell_asm_6x16n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	uint64_t m_left = m0 % 6;
@@ -88,9 +88,9 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 	// dispatch other ?x8m kernels, as needed.
 	if ( m_left )
 	{
-		float* restrict cij = c;
-		float* restrict bj  = b;
-		float* restrict ai  = a;
+		      float* cij = c;
+		const float* bj  = b;
+		const float* ai  = a;
 
 #if 1
 		// We add special handling for slightly inflated MR blocksizes
@@ -763,9 +763,9 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 		const dim_t      mr_cur = 6;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		float* restrict cij = c + j_edge*cs_c;
-		float* restrict ai  = a;
-		float* restrict bj  = b + j_edge*cs_b;
+		      float* cij = c + j_edge*cs_c;
+		const float* ai  = a;
+		const float* bj  = b + j_edge*cs_b;
 
 		if ( 2 <= n_left )
 		{
@@ -806,18 +806,18 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 
 void bli_sgemmsup_rd_haswell_asm_3x16n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1359,9 +1359,9 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 		const dim_t      mr_cur = 3;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		float* restrict cij = c + j_edge*cs_c;
-		float* restrict ai  = a;
-		float* restrict bj  = b + j_edge*cs_b;
+		      float* cij = c + j_edge*cs_c;
+		const float* ai  = a;
+		const float* bj  = b + j_edge*cs_b;
 
 		if ( 2 <= n_left )
 		{
@@ -1403,18 +1403,18 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 
 void bli_sgemmsup_rd_haswell_asm_2x16n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1896,9 +1896,9 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 		const dim_t      mr_cur = 2;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		float* restrict cij = c + j_edge*cs_c;
-		float* restrict ai  = a;
-		float* restrict bj  = b + j_edge*cs_b;
+		      float* cij = c + j_edge*cs_c;
+		const float* ai  = a;
+		const float* bj  = b + j_edge*cs_b;
 
 		if ( 2 <= n_left )
 		{
@@ -1939,18 +1939,18 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 
 void bli_sgemmsup_rd_haswell_asm_1x16n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2373,9 +2373,9 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 		const dim_t      mr_cur = 1;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		float* restrict cij = c + j_edge*cs_c;
-		float* restrict ai  = a;
-		float* restrict bj  = b + j_edge*cs_b;
+		      float* cij = c + j_edge*cs_c;
+		const float* ai  = a;
+		const float* bj  = b + j_edge*cs_b;
 
 		if ( 2 <= n_left )
 		{
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
index 4e6b75572..5603f9cb4 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
@@ -83,18 +83,18 @@ GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
 
 void bli_dgemmsup_rv_haswell_asm_6x8m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	uint64_t n_left = n0 % 8;
@@ -103,9 +103,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	// dispatch other 6x?m kernels, as needed.
 	if ( n_left )
 	{
-		double* restrict cij = c;
-		double* restrict bj  = b;
-		double* restrict ai  = a;
+		      double* cij = c;
+		const double* bj  = b;
+		const double* ai  = a;
 
 		if ( 6 <= n_left )
 		{
@@ -176,8 +176,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 				dim_t m_iter = ( m0 + mr - 1 ) / mr;
 				dim_t m_left =   m0            % mr;
 
-				double* restrict ai_ii  = ai;
-				double* restrict cij_ii = cij;
+				const double* ai_ii  = ai;
+				      double* cij_ii = cij;
 
 				for ( dim_t ii = 0; ii < m_iter; ii += 1 )
 				{
@@ -904,14 +904,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 8;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 8;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		double* restrict cij = c + i_edge*rs_c;
-		//double* restrict ai  = a + i_edge*rs_a;
-		//double* restrict ai  = a + ( i_edge / 6 ) * ps_a;
-		double* restrict ai  = a + m_iter * ps_a;
-		double* restrict bj  = b;
+		      double* cij = c + i_edge*rs_c;
+		//const double* ai  = a + i_edge*rs_a;
+		//const double* ai  = a + ( i_edge / 6 ) * ps_a;
+		const double* ai  = a + m_iter * ps_a;
+		const double* bj  = b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
@@ -985,18 +985,18 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 
 void bli_dgemmsup_rv_haswell_asm_6x6m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1683,14 +1683,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 6;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 6;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		double* restrict cij = c + i_edge*rs_c;
-		//double* restrict ai  = a + i_edge*rs_a;
-		//double* restrict ai  = a + ( i_edge / 6 ) * ps_a;
-		double* restrict ai  = a + m_iter * ps_a;
-		double* restrict bj  = b;
+		      double* cij = c + i_edge*rs_c;
+		//const double* ai  = a + i_edge*rs_a;
+		//const double* ai  = a + ( i_edge / 6 ) * ps_a;
+		const double* ai  = a + m_iter * ps_a;
+		const double* bj  = b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
@@ -1764,18 +1764,18 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 
 void bli_dgemmsup_rv_haswell_asm_6x4m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2340,14 +2340,14 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 4;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 4;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		double* restrict cij = c + i_edge*rs_c;
-		//double* restrict ai  = a + i_edge*rs_a;
-		//double* restrict ai  = a + ( i_edge / 6 ) * ps_a;
-		double* restrict ai  = a + m_iter * ps_a;
-		double* restrict bj  = b;
+		      double* cij = c + i_edge*rs_c;
+		//const double* ai  = a + i_edge*rs_a;
+		//const double* ai  = a + ( i_edge / 6 ) * ps_a;
+		const double* ai  = a + m_iter * ps_a;
+		const double* bj  = b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
@@ -2421,18 +2421,18 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 
 void bli_dgemmsup_rv_haswell_asm_6x2m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2973,14 +2973,14 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 2;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-		double* restrict cij = c + i_edge*rs_c;
-		//double* restrict ai  = a + i_edge*rs_a;
-		//double* restrict ai  = a + ( i_edge / 6 ) * ps_a;
-		double* restrict ai  = a + m_iter * ps_a;
-		double* restrict bj  = b;
+		const dim_t   nr_cur = 2;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
+
+		      double* cij = c + i_edge*rs_c;
+		//const double* ai  = a + i_edge*rs_a;
+		//const double* ai  = a + ( i_edge / 6 ) * ps_a;
+		const double* ai  = a + m_iter * ps_a;
+		const double* bj  = b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
index 2533a7825..efa997764 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
@@ -83,18 +83,18 @@ GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
 
 void bli_dgemmsup_rv_haswell_asm_6x8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	uint64_t m_left = m0 % 6;
@@ -103,9 +103,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 	// dispatch other ?x8m kernels, as needed.
 	if ( m_left )
 	{
-		double* restrict cij = c;
-		double* restrict bj  = b;
-		double* restrict ai  = a;
+		      double* cij = c;
+		const double* bj  = b;
+		const double* ai  = a;
 
 #if 1
 		// We add special handling for slightly inflated MR blocksizes
@@ -882,11 +882,11 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 		const dim_t      mr_cur = 6;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		double* restrict cij = c + j_edge*cs_c;
-		double* restrict ai  = a;
-		//double* restrict bj  = b + j_edge*cs_b;
-		//double* restrict bj  = b + ( j_edge / 8 ) * ps_b;
-		double* restrict bj  = b + n_iter * ps_b;
+		      double* cij = c + j_edge*cs_c;
+		const double* ai  = a;
+		//const double* bj  = b + j_edge*cs_b;
+		//const double* bj  = b + ( j_edge / 8 ) * ps_b;
+		const double* bj  = b + n_iter * ps_b;
 
 		if ( 6 <= n_left )
 		{
@@ -949,18 +949,18 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 
 void bli_dgemmsup_rv_haswell_asm_5x8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1630,11 +1630,11 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 		const dim_t      mr_cur = 5;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		double* restrict cij = c + j_edge*cs_c;
-		double* restrict ai  = a;
-		//double* restrict bj  = b + j_edge*cs_b;
-		//double* restrict bj  = b + ( j_edge / 8 ) * ps_b;
-		double* restrict bj  = b + n_iter * ps_b;
+		      double* cij = c + j_edge*cs_c;
+		const double* ai  = a;
+		//const double* bj  = b + j_edge*cs_b;
+		//const double* bj  = b + ( j_edge / 8 ) * ps_b;
+		const double* bj  = b + n_iter * ps_b;
 
 		if ( 6 <= n_left )
 		{
@@ -1697,18 +1697,18 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 
 void bli_dgemmsup_rv_haswell_asm_4x8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2287,11 +2287,11 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 		const dim_t      mr_cur = 4;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		double* restrict cij = c + j_edge*cs_c;
-		double* restrict ai  = a;
-		//double* restrict bj  = b + j_edge*cs_b;
-		//double* restrict bj  = b + ( j_edge / 8 ) * ps_b;
-		double* restrict bj  = b + n_iter * ps_b;
+		      double* cij = c + j_edge*cs_c;
+		const double* ai  = a;
+		//const double* bj  = b + j_edge*cs_b;
+		//const double* bj  = b + ( j_edge / 8 ) * ps_b;
+		const double* bj  = b + n_iter * ps_b;
 
 		if ( 6 <= n_left )
 		{
@@ -2345,18 +2345,18 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 
 void bli_dgemmsup_rv_haswell_asm_3x8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2965,11 +2965,11 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 		const dim_t      mr_cur = 3;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		double* restrict cij = c + j_edge*cs_c;
-		double* restrict ai  = a;
-		//double* restrict bj  = b + j_edge*cs_b;
-		//double* restrict bj  = b + ( j_edge / 8 ) * ps_b;
-		double* restrict bj  = b + n_iter * ps_b;
+		      double* cij = c + j_edge*cs_c;
+		const double* ai  = a;
+		//const double* bj  = b + j_edge*cs_b;
+		//const double* bj  = b + ( j_edge / 8 ) * ps_b;
+		const double* bj  = b + n_iter * ps_b;
 
 		if ( 6 <= n_left )
 		{
@@ -3023,18 +3023,18 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 
 void bli_dgemmsup_rv_haswell_asm_2x8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -3524,11 +3524,11 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 		const dim_t      mr_cur = 2;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		double* restrict cij = c + j_edge*cs_c;
-		double* restrict ai  = a;
-		//double* restrict bj  = b + j_edge*cs_b;
-		//double* restrict bj  = b + ( j_edge / 8 ) * ps_b;
-		double* restrict bj  = b + n_iter * ps_b;
+		      double* cij = c + j_edge*cs_c;
+		const double* ai  = a;
+		//const double* bj  = b + j_edge*cs_b;
+		//const double* bj  = b + ( j_edge / 8 ) * ps_b;
+		const double* bj  = b + n_iter * ps_b;
 
 		if ( 6 <= n_left )
 		{
@@ -3582,18 +3582,18 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 
 void bli_dgemmsup_rv_haswell_asm_1x8n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -4047,11 +4047,11 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
 		const dim_t      mr_cur = 1;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		double* restrict cij = c + j_edge*cs_c;
-		double* restrict ai  = a;
-		//double* restrict bj  = b + j_edge*cs_b;
-		//double* restrict bj  = b + ( j_edge / 8 ) * ps_b;
-		double* restrict bj  = b + n_iter * ps_b;
+		      double* cij = c + j_edge*cs_c;
+		const double* ai  = a;
+		//const double* bj  = b + j_edge*cs_b;
+		//const double* bj  = b + ( j_edge / 8 ) * ps_b;
+		const double* bj  = b + n_iter * ps_b;
 
 		if ( 6 <= n_left )
 		{
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
index b5424f09a..170f81fab 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
@@ -83,18 +83,18 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 
 void bli_sgemmsup_rv_haswell_asm_6x16m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	uint64_t n_left = n0 % 16;
@@ -103,9 +103,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 	// dispatch other 6x?m kernels, as needed.
 	if ( n_left )
 	{
-		float*  restrict cij = c;
-		float*  restrict bj  = b;
-		float*  restrict ai  = a;
+		      float* cij = c;
+		const float* bj  = b;
+		const float* ai  = a;
 
 		if ( 12 <= n_left )
 		{
@@ -200,8 +200,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 				dim_t m_iter = ( m0 + mr - 1 ) / mr;
 				dim_t m_left =   m0            % mr;
 
-				float*  restrict ai_ii  = ai;
-				float*  restrict cij_ii = cij;
+				const float* ai_ii  = ai;
+				      float* cij_ii = cij;
 
 				for ( dim_t ii = 0; ii < m_iter; ii += 1 )
 				{
@@ -1038,14 +1038,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 16;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 16;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		float* restrict cij = c + i_edge*rs_c;
-		//float* restrict ai  = a + i_edge*rs_a;
-		//float* restrict ai  = a + ( i_edge / 6 ) * ps_a;
-		float* restrict ai  = a + m_iter * ps_a;
-		float* restrict bj  = b;
+		      float* cij = c + i_edge*rs_c;
+		//const float* ai  = a + i_edge*rs_a;
+		//const float* ai  = a + ( i_edge / 6 ) * ps_a;
+		const float* ai  = a + m_iter * ps_a;
+		const float* bj  = b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
@@ -1119,18 +1119,18 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 
 void bli_sgemmsup_rv_haswell_asm_6x12m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1905,14 +1905,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 12;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 12;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		float* restrict cij = c + i_edge*rs_c;
-		//float* restrict ai  = a + i_edge*rs_a;
-		//float* restrict ai  = a + ( i_edge / 6 ) * ps_a;
-		float* restrict ai  = a + m_iter * ps_a;
-		float* restrict bj  = b;
+		      float* cij = c + i_edge*rs_c;
+		//const float* ai  = a + i_edge*rs_a;
+		//const float* ai  = a + ( i_edge / 6 ) * ps_a;
+		const float* ai  = a + m_iter * ps_a;
+		const float* bj  = b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
@@ -1986,18 +1986,18 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 
 void bli_sgemmsup_rv_haswell_asm_6x8m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2618,14 +2618,14 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 8;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 8;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		float* restrict cij = c + i_edge*rs_c;
-		//float* restrict ai  = a + i_edge*rs_a;
-		//float* restrict ai  = a + ( i_edge / 6 ) * ps_a;
-		float* restrict ai  = a + m_iter * ps_a;
-		float* restrict bj  = b;
+		      float* cij = c + i_edge*rs_c;
+		//const float* ai  = a + i_edge*rs_a;
+		//const float* ai  = a + ( i_edge / 6 ) * ps_a;
+		const float* ai  = a + m_iter * ps_a;
+		const float* bj  = b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
@@ -2699,18 +2699,18 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 
 void bli_sgemmsup_rv_haswell_asm_6x6m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -3364,14 +3364,14 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 6;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 6;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		float* restrict cij = c + i_edge*rs_c;
-		//float* restrict ai  = a + i_edge*rs_a;
-		//float* restrict ai  = a + ( i_edge / 6 ) * ps_a;
-		float* restrict ai  = a + m_iter * ps_a;
-		float* restrict bj  = b;
+		      float* cij = c + i_edge*rs_c;
+		//const float* ai  = a + i_edge*rs_a;
+		//const float* ai  = a + ( i_edge / 6 ) * ps_a;
+		const float* ai  = a + m_iter * ps_a;
+		const float* bj  = b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
@@ -3445,18 +3445,18 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 
 void bli_sgemmsup_rv_haswell_asm_6x4m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -4033,14 +4033,14 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 4;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 4;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		float* restrict cij = c + i_edge*rs_c;
-		//float* restrict ai  = a + i_edge*rs_a;
-		//float* restrict ai  = a + ( i_edge / 6 ) * ps_a;
-		float* restrict ai  = a + m_iter * ps_a;
-		float* restrict bj  = b;
+		      float* cij = c + i_edge*rs_c;
+		//const float* ai  = a + i_edge*rs_a;
+		//const float* ai  = a + ( i_edge / 6 ) * ps_a;
+		const float* ai  = a + m_iter * ps_a;
+		const float* bj  = b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
@@ -4114,18 +4114,18 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 
 void bli_sgemmsup_rv_haswell_asm_6x2m
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -4673,14 +4673,14 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 2;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-		float* restrict cij = c + i_edge*rs_c;
-		//float* restrict ai  = a + i_edge*rs_a;
-		//float* restrict ai  = a + ( i_edge / 6 ) * ps_a;
-		float* restrict ai  = a + m_iter * ps_a;
-		float* restrict bj  = b;
+		const dim_t   nr_cur = 2;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
+
+		      float* cij = c + i_edge*rs_c;
+		//const float* ai  = a + i_edge*rs_a;
+		//const float* ai  = a + ( i_edge / 6 ) * ps_a;
+		const float* ai  = a + m_iter * ps_a;
+		const float* bj  = b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c
index da768ebf1..14bbad5fe 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c
@@ -83,18 +83,18 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 
 void bli_sgemmsup_rv_haswell_asm_6x16n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	uint64_t m_left = m0 % 6;
@@ -103,9 +103,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 	// dispatch other ?x8m kernels, as needed.
 	if ( m_left )
 	{
-		float* restrict cij = c;
-		float* restrict bj  = b;
-		float* restrict ai  = a;
+		      float* cij = c;
+		const float* bj  = b;
+		const float* ai  = a;
 
 #if 1
 		// We add special handling for slightly inflated MR blocksizes
@@ -993,11 +993,11 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 		const dim_t      mr_cur = 6;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		float* restrict cij = c + j_edge*cs_c;
-		float* restrict ai  = a;
-		//float* restrict bj  = b + j_edge*cs_b;
-		//float* restrict bj  = b + ( j_edge / 8 ) * ps_b;
-		float* restrict bj  = b + n_iter * ps_b;
+		      float* cij = c + j_edge*cs_c;
+		const float* ai  = a;
+		//const float* bj  = b + j_edge*cs_b;
+		//const float* bj  = b + ( j_edge / 8 ) * ps_b;
+		const float* bj  = b + n_iter * ps_b;
 
 		if ( 12 <= n_left )
 		{
@@ -1084,18 +1084,18 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 
 void bli_sgemmsup_rv_haswell_asm_5x16n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1925,11 +1925,11 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
 		const dim_t      mr_cur = 5;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		float* restrict cij = c + j_edge*cs_c;
-		float* restrict ai  = a;
-		//float* restrict bj  = b + j_edge*cs_b;
-		//float* restrict bj  = b + ( j_edge / 8 ) * ps_b;
-		float* restrict bj  = b + n_iter * ps_b;
+		      float* cij = c + j_edge*cs_c;
+		const float* ai  = a;
+		//const float* bj  = b + j_edge*cs_b;
+		//const float* bj  = b + ( j_edge / 8 ) * ps_b;
+		const float* bj  = b + n_iter * ps_b;
 
 		if ( 12 <= n_left )
 		{
@@ -2016,18 +2016,18 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
 
 void bli_sgemmsup_rv_haswell_asm_4x16n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2683,11 +2683,11 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 		const dim_t      mr_cur = 4;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		float* restrict cij = c + j_edge*cs_c;
-		float* restrict ai  = a;
-		//float* restrict bj  = b + j_edge*cs_b;
-		//float* restrict bj  = b + ( j_edge / 8 ) * ps_b;
-		float* restrict bj  = b + n_iter * ps_b;
+		      float* cij = c + j_edge*cs_c;
+		const float* ai  = a;
+		//const float* bj  = b + j_edge*cs_b;
+		//const float* bj  = b + ( j_edge / 8 ) * ps_b;
+		const float* bj  = b + n_iter * ps_b;
 
 		if ( 12 <= n_left )
 		{
@@ -2774,18 +2774,18 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 
 void bli_sgemmsup_rv_haswell_asm_3x16n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -3502,11 +3502,11 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
 		const dim_t      mr_cur = 3;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		float* restrict cij = c + j_edge*cs_c;
-		float* restrict ai  = a;
-		//float* restrict bj  = b + j_edge*cs_b;
-		//float* restrict bj  = b + ( j_edge / 8 ) * ps_b;
-		float* restrict bj  = b + n_iter * ps_b;
+		      float* cij = c + j_edge*cs_c;
+		const float* ai  = a;
+		//const float* bj  = b + j_edge*cs_b;
+		//const float* bj  = b + ( j_edge / 8 ) * ps_b;
+		const float* bj  = b + n_iter * ps_b;
 
 		if ( 12 <= n_left )
 		{
@@ -3593,18 +3593,18 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
 
 void bli_sgemmsup_rv_haswell_asm_2x16n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -4140,11 +4140,11 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 		const dim_t      mr_cur = 2;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		float* restrict cij = c + j_edge*cs_c;
-		float* restrict ai  = a;
-		//float* restrict bj  = b + j_edge*cs_b;
-		//float* restrict bj  = b + ( j_edge / 8 ) * ps_b;
-		float* restrict bj  = b + n_iter * ps_b;
+		      float* cij = c + j_edge*cs_c;
+		const float* ai  = a;
+		//const float* bj  = b + j_edge*cs_b;
+		//const float* bj  = b + ( j_edge / 8 ) * ps_b;
+		const float* bj  = b + n_iter * ps_b;
 
 		if ( 12 <= n_left )
 		{
@@ -4231,18 +4231,18 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 
 void bli_sgemmsup_rv_haswell_asm_1x16n
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -4791,11 +4791,11 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
 		const dim_t      mr_cur = 1;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		float* restrict cij = c + j_edge*cs_c;
-		float* restrict ai  = a;
-		//float* restrict bj  = b + j_edge*cs_b;
-		//float* restrict bj  = b + ( j_edge / 8 ) * ps_b;
-		float* restrict bj  = b + n_iter * ps_b;
+		      float* cij = c + j_edge*cs_c;
+		const float* ai  = a;
+		//const float* bj  = b + j_edge*cs_b;
+		//const float* bj  = b + ( j_edge / 8 ) * ps_b;
+		const float* bj  = b + n_iter * ps_b;
 
 		if ( 12 <= n_left )
 		{
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c
index 67b3ec8bf..bd4da7804 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c
@@ -94,38 +94,38 @@ GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t              conja, \
-       conj_t              conjb, \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
-       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx \
+             conj_t     conja, \
+             conj_t     conjb, \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a, inc_t rs_a, inc_t cs_a, \
+       const ctype*     b, inc_t rs_b, inc_t cs_b, \
+       const ctype*     beta, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx \
      ) \
 { \
 	for ( dim_t i = 0; i < mdim; ++i ) \
 	{ \
-		ctype* restrict ci = &c[ i*rs_c ]; \
-		ctype* restrict ai = &a[ i*rs_a ]; \
+		      ctype* ci = &c[ i*rs_c ]; \
+		const ctype* ai = &a[ i*rs_a ]; \
 \
 		/* for ( dim_t j = 0; j < 1; ++j ) */ \
 		{ \
-			ctype* restrict cij = ci /*[ j*cs_c ]*/ ; \
-			ctype* restrict bj  = b  /*[ j*cs_b ]*/ ; \
-			ctype           ab; \
+			      ctype* cij = ci /*[ j*cs_c ]*/ ; \
+			const ctype* bj  = b  /*[ j*cs_b ]*/ ; \
+			ctype        ab; \
 \
 			PASTEMAC(ch,set0s)( ab ); \
 \
 			/* Perform a dot product to update the (i,j) element of c. */ \
 			for ( dim_t l = 0; l < k; ++l ) \
 			{ \
-				ctype* restrict aij = &ai[ l*cs_a ]; \
-				ctype* restrict bij = &bj[ l*rs_b ]; \
+				const ctype* aij = &ai[ l*cs_a ]; \
+				const ctype* bij = &bj[ l*rs_b ]; \
 \
 				PASTEMAC(ch,dots)( *aij, *bij, ab ); \
 			} \
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
index 929f9ea47..883bbb236 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
@@ -62,18 +62,18 @@ GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
 
 void bli_dgemmsup_rd_haswell_asm_6x1
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -576,18 +576,18 @@ void bli_dgemmsup_rd_haswell_asm_6x1
 
 void bli_dgemmsup_rd_haswell_asm_3x1
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -985,18 +985,18 @@ void bli_dgemmsup_rd_haswell_asm_3x1
 
 void bli_dgemmsup_rd_haswell_asm_2x1
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1359,18 +1359,18 @@ void bli_dgemmsup_rd_haswell_asm_2x1
 
 void bli_dgemmsup_rd_haswell_asm_1x1
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c
index 397d932e4..e9be29bcb 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c
@@ -62,18 +62,18 @@ GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
 
 void bli_dgemmsup_rd_haswell_asm_6x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -618,18 +618,18 @@ void bli_dgemmsup_rd_haswell_asm_6x2
 
 void bli_dgemmsup_rd_haswell_asm_3x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1051,18 +1051,18 @@ void bli_dgemmsup_rd_haswell_asm_3x2
 
 void bli_dgemmsup_rd_haswell_asm_2x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1443,18 +1443,18 @@ void bli_dgemmsup_rd_haswell_asm_2x2
 
 void bli_dgemmsup_rd_haswell_asm_1x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
index 75e84650c..2e82c6ee0 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
@@ -62,18 +62,18 @@ GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
 
 void bli_dgemmsup_rd_haswell_asm_6x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -595,18 +595,18 @@ void bli_dgemmsup_rd_haswell_asm_6x4
 
 void bli_dgemmsup_rd_haswell_asm_2x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1045,18 +1045,18 @@ void bli_dgemmsup_rd_haswell_asm_2x4
 
 void bli_dgemmsup_rd_haswell_asm_1x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c
index b2e3d83af..bad647c34 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c
@@ -62,18 +62,18 @@ GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
 
 void bli_dgemmsup_rd_haswell_asm_6x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	uint64_t n_left = n0 % 8;
@@ -82,9 +82,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 	// dispatch other 6x?m kernels, as needed.
 	if ( n_left )
 	{
-		double* restrict cij = c;
-		double* restrict bj  = b;
-		double* restrict ai  = a;
+		      double* cij = c;
+		const double* bj  = b;
+		const double* ai  = a;
 
 		if ( 4 <= n_left )
 		{
@@ -688,12 +688,12 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 8;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 8;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
+		      double* cij = c + i_edge*rs_c;
+		const double* bj  = b;
+		const double* ai  = a + i_edge*rs_a;
 
 		if ( 2 == m_left )
 		{
@@ -725,18 +725,18 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 
 void bli_dgemmsup_rd_haswell_asm_2x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1199,18 +1199,18 @@ void bli_dgemmsup_rd_haswell_asm_2x8
 
 void bli_dgemmsup_rd_haswell_asm_1x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c
index 5843d5e40..dd12186cb 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c
@@ -83,18 +83,18 @@ GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
 
 void bli_dgemmsup_rv_haswell_asm_6x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -556,18 +556,18 @@ void bli_dgemmsup_rv_haswell_asm_6x2
 
 void bli_dgemmsup_rv_haswell_asm_5x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1006,18 +1006,18 @@ void bli_dgemmsup_rv_haswell_asm_5x2
 
 void bli_dgemmsup_rv_haswell_asm_4x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1414,18 +1414,18 @@ void bli_dgemmsup_rv_haswell_asm_4x2
 
 void bli_dgemmsup_rv_haswell_asm_3x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1819,18 +1819,18 @@ void bli_dgemmsup_rv_haswell_asm_3x2
 
 void bli_dgemmsup_rv_haswell_asm_2x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2169,18 +2169,18 @@ void bli_dgemmsup_rv_haswell_asm_2x2
 
 void bli_dgemmsup_rv_haswell_asm_1x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c
index 6fb5eaf8a..cea208958 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c
@@ -83,18 +83,18 @@ GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
 
 void bli_dgemmsup_rv_haswell_asm_6x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -578,18 +578,18 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 
 void bli_dgemmsup_rv_haswell_asm_5x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1049,18 +1049,18 @@ void bli_dgemmsup_rv_haswell_asm_5x4
 
 void bli_dgemmsup_rv_haswell_asm_4x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1469,18 +1469,18 @@ void bli_dgemmsup_rv_haswell_asm_4x4
 
 void bli_dgemmsup_rv_haswell_asm_3x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1896,18 +1896,18 @@ void bli_dgemmsup_rv_haswell_asm_3x4
 
 void bli_dgemmsup_rv_haswell_asm_2x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2259,18 +2259,18 @@ void bli_dgemmsup_rv_haswell_asm_2x4
 
 void bli_dgemmsup_rv_haswell_asm_1x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c
index 2b7222a34..a78232eb3 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c
@@ -83,18 +83,18 @@ GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
 
 void bli_dgemmsup_rv_haswell_asm_6x6
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -694,18 +694,18 @@ void bli_dgemmsup_rv_haswell_asm_6x6
 
 void bli_dgemmsup_rv_haswell_asm_5x6
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1269,18 +1269,18 @@ void bli_dgemmsup_rv_haswell_asm_5x6
 
 void bli_dgemmsup_rv_haswell_asm_4x6
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1767,18 +1767,18 @@ void bli_dgemmsup_rv_haswell_asm_4x6
 
 void bli_dgemmsup_rv_haswell_asm_3x6
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2282,18 +2282,18 @@ void bli_dgemmsup_rv_haswell_asm_3x6
 
 void bli_dgemmsup_rv_haswell_asm_2x6
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2713,18 +2713,18 @@ void bli_dgemmsup_rv_haswell_asm_2x6
 
 void bli_dgemmsup_rv_haswell_asm_1x6
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c
index b3a7c17ca..543eebf0e 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c
@@ -98,18 +98,18 @@ static FUNCPTR_T kmap[NUM_MR][NUM_NR] =
 
 void bli_dgemmsup_rv_haswell_asm_6x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 #if 0
@@ -135,9 +135,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 	);
 	return;
 #endif
-		dim_t            n_left = n0;
-		double* restrict cj     = c;
-		double* restrict bj     = b;
+		dim_t         n_left = n0;
+		      double* cj     = c;
+		const double* bj     = b;
 
 		// Iterate across columns (corresponding to elements of nrs) until
 		// n_left is zero.
@@ -149,9 +149,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 			// n_left, we use the kernels in that column.
 			if ( nr_cur <= n_left )
 			{
-				dim_t            m_left = m0;
-				double* restrict cij    = cj;
-				double* restrict ai     = a;
+				dim_t         m_left = m0;
+				      double* cij    = cj;
+				const double* ai     = a;
 
 				// Iterate down the current column (corresponding to elements
 				// of mrs) until m_left is zero.
@@ -810,18 +810,18 @@ void bli_dgemmsup_rv_haswell_asm_6x8
 
 void bli_dgemmsup_rv_haswell_asm_5x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1405,18 +1405,18 @@ void bli_dgemmsup_rv_haswell_asm_5x8
 
 void bli_dgemmsup_rv_haswell_asm_4x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1913,18 +1913,18 @@ void bli_dgemmsup_rv_haswell_asm_4x8
 
 void bli_dgemmsup_rv_haswell_asm_3x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2447,18 +2447,18 @@ void bli_dgemmsup_rv_haswell_asm_3x8
 
 void bli_dgemmsup_rv_haswell_asm_2x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2870,18 +2870,18 @@ void bli_dgemmsup_rv_haswell_asm_2x8
 
 void bli_dgemmsup_rv_haswell_asm_1x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a, inc_t rs_a0, inc_t cs_a0,
+       const double*    b, inc_t rs_b0, inc_t cs_b0,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rd_haswell_asm_d6x8.c b/kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rd_haswell_asm_d6x8.c
deleted file mode 100644
index 87ef7309b..000000000
--- a/kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rd_haswell_asm_d6x8.c
+++ /dev/null
@@ -1,4566 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-/*
-   rrc:
-     --------        ------        | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------   +=   ------ ...    | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------        ------              :
-     --------        ------              :
-
-   Assumptions:
-   - C is row-stored and B is column-stored;
-   - A is row-stored;
-   - m0 and n0 are at most MR and NR, respectively.
-   Therefore, this (r)ow-preferential microkernel is well-suited for
-   a dot-product-based accumulation that performs vector loads from
-   both A and B.
-*/
-
-// Prototype reference microkernels.
-GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
-
-#if 0
-// Define parameters and variables for edge case kernel map.
-#define NUM_MR 4
-#define NUM_NR 4
-#define FUNCPTR_T dgemmsup_ker_ft
-
-static dim_t mrs[NUM_MR] = { 6, 3, 2, 1 };
-static dim_t nrs[NUM_NR] = { 8, 4, 2, 1 };
-static FUNCPTR_T kmap[NUM_MR][NUM_NR] =
-{     /*  8                                4                                2                                1  */
-/* 6 */ { bli_dgemmsup_rd_haswell_asm_6x8m, bli_dgemmsup_rd_haswell_asm_6x4m, bli_dgemmsup_rd_haswell_asm_6x2m, bli_dgemmsup_r_haswell_ref_6x1 },
-/* 3 */ { bli_dgemmsup_rd_haswell_asm_3x8m, bli_dgemmsup_rd_haswell_asm_3x4m, bli_dgemmsup_rd_haswell_asm_3x2m, bli_dgemmsup_r_haswell_ref_3x1 },
-/* 2 */ { bli_dgemmsup_rd_haswell_asm_2x8m, bli_dgemmsup_rd_haswell_asm_2x4m, bli_dgemmsup_rd_haswell_asm_2x2m, bli_dgemmsup_r_haswell_ref_2x1 },
-/* 1 */ { bli_dgemmsup_rd_haswell_asm_1x8m, bli_dgemmsup_rd_haswell_asm_1x4m, bli_dgemmsup_rd_haswell_asm_1x2m, bli_dgemmsup_r_haswell_ref_1x1 }
-};
-#endif
-
-
-void bli_dgemmsup_rd_haswell_asm_6x8
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	uint64_t n_left = n0 % 8;
-
-	// First check whether this is a edge case in the n dimension. If so,
-	// dispatch other 6x?m kernels, as needed.
-	if ( n_left )
-	{
-		double* restrict cij = c;
-		double* restrict bj  = b;
-		double* restrict ai  = a;
-
-		if ( 4 <= n_left )
-		{
-			const dim_t nr_cur = 4;
-
-			bli_dgemmsup_rd_haswell_asm_6x4
-			(
-			  conja, conjb, m0, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-		}
-		if ( 2 <= n_left )
-		{
-			const dim_t nr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_6x2
-			(
-			  conja, conjb, m0, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-		}
-		if ( 1 == n_left )
-		{
-#if 0
-			const dim_t nr_cur = 1;
-
-			bli_dgemmsup_r_haswell_ref
-			(
-			  conja, conjb, m0, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-#else
-			bli_dgemv_ex
-			(
-			  BLIS_NO_TRANSPOSE, conjb, m0, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-			  beta, cij, rs_c0, cntx, NULL
-			);
-#endif
-		}
-		return;
-	}
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t m_iter = m0 / 3;
-	uint64_t m_left = m0 % 3;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( m_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	//mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
-
-	//mov(var(c), r12)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = m dim index ii
-	// r15 = n dim index jj
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-
-	mov(var(a), r14)                   // load address of a
-	mov(var(c), r12)                   // load address of c
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(imm(1*8), rsi)                // rsi *= cs_c*sizeof(double) = 1*8
-	lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(r11, rsi)                     // rsi *= cs_b;
-	lea(mem(rdx, rsi, 1), rdx)         // rbx = b + 4*jj*cs_b;
-
-
-
-	mov(var(m_iter), r9)               // ii = m_iter;
-
-	label(.DLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
-	lea(mem(r14), rax)                 // rax = a_ii;
-	lea(mem(rdx), rbx)                 // rbx = b_jj;
-
-
-#if 1
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-#endif
-	lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
-
-	
-
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-#if 0
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-#if 0
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-#if 0
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	lea(mem(r12, rdi, 2), r12)         //
-	lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-	lea(mem(r14, r8,  2), r14)         //
-	lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-	dec(r9)                            // ii -= 1;
-	jne(.DLOOP3X4I)                    // iterate again if ii != 0.
-
-
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( m_left )
-	{
-		const dim_t      nr_cur = 8;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
-
-		if ( 2 == m_left )
-		{
-			const dim_t mr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_2x8
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			//cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 1 == m_left )
-		{
-			const dim_t mr_cur = 1;
-
-			bli_dgemmsup_rd_haswell_asm_1x8
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x8
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-	// Computes the 2x8 block c := beta*c + alpha*a*b, forming each element of c
-	// as a dot product over k0. conja, conjb, m0, n0, data and cntx are not read.
-	// Local uint64_t copies are made in case dim_t and inc_t are a different size than the 64-bit asm loads expect.
-	uint64_t k_iter16 = k0 / 16;       // trips of the main loop (4 unrolled steps x 4 doubles)
-	uint64_t k_left16 = k0 % 16;       // k elements left after the main loop
-	uint64_t k_iter4  = k_left16 / 4;  // trips of the single 4-wide (ymm) edge loop
-	uint64_t k_left1  = k_left16 % 4;  // trips of the one-element edge loop
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;           // never loaded by the asm; a is advanced by a fixed 8 bytes per k
-	uint64_t rs_b   = rs_b0;           // never loaded by the asm; b is advanced by a fixed 8 bytes per k
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;           // never loaded by the asm; columns of c are taken 8 bytes apart
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b (in bytes)
-	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
-
-	mov(var(c), r12)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-	// r8  = rs_a, r11 = cs_b, r13 = 3*cs_b, rdi = rs_c (all scaled to bytes)
-	// r12 = base of c; rcx = c at column jj
-	// r14 = base of a; rax = running pointer into a
-	// rdx = base of b; rbx = running pointer into b
-	// r9  = unused
-	// r15 = n dim index jj
-	// r10 = unused
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 4 ]; label says 3X4 but only two rows are computed here
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)         // zero the eight accumulators used below
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-#endif
-
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = jj;
-	imul(imm(1*8), rsi)                // rsi *= cs_c*sizeof(double) = 1*8 (cs_c hard-coded to 1)
-	lea(mem(r12, rsi, 1), rcx)         // rcx = c + jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = jj;
-	imul(r11, rsi)                     // rsi *= cs_b; (r11 is already in bytes)
-	lea(mem(rdx, rsi, 1), rbx)         // rbx = b + jj*cs_b;
-
-	lea(mem(r14), rax)                 // rax = a;
-
-
-#if 0
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-#else
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-#endif
-
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)     // ymm0 = 4 consecutive k elements of row 0 of a
-	vmovupd(mem(rax, r8, 1), ymm1)     // ymm1 = 4 consecutive k elements of row 1 of a
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)    // ymm3 = 4 consecutive k elements of column jj+0 of b
-	vfmadd231pd(ymm0, ymm3, ymm4)      // ymm4 += ymm0 * ymm3 (lane-wise partial sums)
-	vfmadd231pd(ymm1, ymm3, ymm5)      // ymm5 += ymm1 * ymm3
-
-	vmovupd(mem(rbx, r11, 1), ymm3)    // column jj+1 of b
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)    // column jj+2 of b
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)    // column jj+3 of b
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destroy intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)      // one element of row 0 of a (the VEX load zeroes the upper lanes)
-	vmovsd(mem(rax, r8, 1), xmm1)      // one element of row 1 of a
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)     // one element of column jj+0 of b
-	vfmadd231pd(ymm0, ymm3, ymm4)      // only lane 0 receives a nonzero product
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)                 // reduce each accumulator's four lanes to one sum
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	
-	vhaddpd( ymm7, ymm4, ymm0 )        // pairwise sums of ymm4 and ymm7, interleaved per 128-bit half
-	vextractf128(imm(1), ymm0, xmm1 )  // xmm1 = upper half of ymm0
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // ymm4 = [ xmm0, xmm2 ] (row 0 of the result)
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm5); xmm0[1] = sum(ymm8)
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm11); xmm2[1] = sum(ymm14)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // ymm5 = [ xmm0, xmm2 ] (row 1 of the result)
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)                 // beta != 0: read c, scale it by beta and add
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)  // ymm4 += beta * (4 elements of row 0 of c)
-	vmovupd(ymm4, mem(rcx))            // store row 0
-	add(rdi, rcx)                      // rcx += rs_c (next row of c)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)  // ymm5 += beta * (4 elements of row 1 of c)
-	vmovupd(ymm5, mem(rcx))            // store row 1
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)                  // beta == 0: c is overwritten without being read
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))            // store row 0
-	add(rdi, rcx)                      // rcx += rs_c (next row of c)
-	
-	vmovupd(ymm5, mem(rcx))            // store row 1
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x8
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = m dim index ii
-	// r15 = n dim index jj
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm13, ymm13, ymm13)
-#endif
-
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(imm(1*8), rsi)                // rsi *= cs_c*sizeof(double) = 1*8
-	lea(mem(r12, rsi, 1), rcx)         // rcx = c + 4*jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(r11, rsi)                     // rsi *= cs_b;
-	lea(mem(rdx, rsi, 1), rbx)         // rbx = b + 4*jj*cs_b;
-
-	lea(mem(r14), rax)                 // rax = a;
-
-
-#if 0
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-#else
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-#endif
-
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_6x4
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t m_iter = m0 / 3;
-	uint64_t m_left = m0 % 3;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( m_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
-
-	mov(var(c), r12)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = m dim index ii
-	// r15 = n dim index jj
-
-	mov(var(m_iter), r9)               // ii = m_iter;
-
-	label(.DLOOP3X4I)                  // LOOP OVER ii = [ m_iter .. 1 0 ]
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(r12), rcx)                 // rcx = c + 3*ii*rs_c;
-	lea(mem(r14), rax)                 // rax = a + 3*ii*rs_a;
-	lea(mem(rdx), rbx)                 // rbx = b;
-
-
-#if 1
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-#endif
-	lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-#if 0
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-#if 0
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	lea(mem(r12, rdi, 2), r12)         //
-	lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-	lea(mem(r14, r8,  2), r14)         //
-	lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-	dec(r9)                            // ii -= 1;
-	jne(.DLOOP3X4I)                    // iterate again if ii != 0.
-
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( m_left )
-	{
-		const dim_t      nr_cur = 4;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
-
-		if ( 2 == m_left )
-		{
-			const dim_t mr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_2x4
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			//cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 1 == m_left )
-		{
-			const dim_t mr_cur = 1;
-
-			bli_dgemmsup_rd_haswell_asm_1x4
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x4
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x4
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | | | |
-     --------        -- -- -- ...    | | | |
-     --------   +=   -- -- --        | | | |
-     --------                        | | | |
-     --------                           :
-     --------                           :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm13, ymm13, ymm13)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), rcx)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_6x2
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t m_iter = m0 / 6;
-	uint64_t m_left = m0 % 6;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( m_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-    // r14 = rax = a
-    // rdx = rbx = b
-    // r9  = m dim index ii
-
-	mov(var(m_iter), r9)               // ii = m_iter;
-
-    label(.DLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-
-#if 0
-    vzeroall()                         // zero all xmm/ymm registers.
-#else
-                                       // skylake can execute 3 vxorpd ipc with
-                                       // a latency of 1 cycle, while vzeroall
-                                       // has a latency of 12 cycles.
-    vxorpd(ymm4,  ymm4,  ymm4)
-    vxorpd(ymm5,  ymm5,  ymm5)
-    vxorpd(ymm6,  ymm6,  ymm6)
-    vxorpd(ymm7,  ymm7,  ymm7)
-    vxorpd(ymm8,  ymm8,  ymm8)
-    vxorpd(ymm9,  ymm9,  ymm9)
-    vxorpd(ymm10, ymm10, ymm10)
-    vxorpd(ymm11, ymm11, ymm11)
-    vxorpd(ymm12, ymm12, ymm12)
-    vxorpd(ymm13, ymm13, ymm13)
-    vxorpd(ymm14, ymm14, ymm14)
-    vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(r12), rcx)                 // rcx = c + 6*ii*rs_c;
-    lea(mem(r14), rax)                 // rax = a + 6*ii*rs_a;
-    lea(mem(rdx), rbx)                 // rbx = b;
-
-
-	lea(mem(rcx, rdi, 2), r10)         //
-	lea(mem(r10, rdi, 1), r10)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(r10, 1*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovsd(mem(rax, r8,  2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovsd(mem(rax, r13, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rax, r8,  4), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovsd(mem(rax, r15, 1), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	                                   // ymm8  ymm9
-	                                   // ymm10 ymm11
-	                                   // ymm12 ymm13
-	                                   // ymm14 ymm15
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	vhaddpd( ymm9, ymm8, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm8 )
-
-	vhaddpd( ymm11, ymm10, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm10 )
-
-	vhaddpd( ymm13, ymm12, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm12 )
-
-	vhaddpd( ymm15, ymm14, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm14 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-	                                   // xmm8  = sum(ymm8)  sum(ymm9)
-	                                   // xmm10 = sum(ymm10) sum(ymm11)
-	                                   // xmm12 = sum(ymm12) sum(ymm13)
-	                                   // xmm14 = sum(ymm14) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	vmulpd(xmm0, xmm8,  xmm8)
-	vmulpd(xmm0, xmm10, xmm10)
-	vmulpd(xmm0, xmm12, xmm12)
-	vmulpd(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm8)
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm10)
-	vmovupd(xmm10, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm12)
-	vmovupd(xmm12, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm14)
-	vmovupd(xmm14, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm10, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm12, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm14, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-
-
-
-	lea(mem(r12, rdi, 4), r12)         //
-    lea(mem(r12, rdi, 2), r12)         // c_ii = r12 += 6*rs_c
-
-    lea(mem(r14, r8,  4), r14)         //
-    lea(mem(r14, r8,  2), r14)         // a_ii = r14 += 6*rs_a
-
-    dec(r9)                            // ii -= 1;
-    jne(.DLOOP3X4I)                    // iterate again if ii != 0.
-
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( m_left )
-	{
-		const dim_t      nr_cur = 2;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
-
-		if ( 3 <= m_left )
-		{
-			const dim_t mr_cur = 3;
-
-			bli_dgemmsup_rd_haswell_asm_3x2
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 2 <= m_left )
-		{
-			const dim_t mr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_2x2
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 1 == m_left )
-		{
-			const dim_t mr_cur = 1;
-
-			bli_dgemmsup_rd_haswell_asm_1x2
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_3x2
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | |
-     --------        -- -- -- ...    | |
-     --------   +=   -- -- --        | |
-     --------        -- -- --        | |
-     --------        -- -- --         :
-     --------        -- -- --         :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovsd(mem(rax, r8,  2), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	                                   // ymm8  ymm9
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	vhaddpd( ymm9, ymm8, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm8 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-	                                   // xmm8  = sum(ymm8)  sum(ymm9)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	vmulpd(xmm0, xmm8,  xmm8)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm8)
-	vmovupd(xmm8, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm8, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x2
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | |
-     --------        -- -- -- ...    | |
-     --------   +=   -- -- --        | |
-     --------        -- -- --        | |
-     --------        -- -- --         :
-     --------        -- -- --         :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x2
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | |
-     --------        -- -- -- ...    | |
-     --------   +=   -- -- --        | |
-     --------        -- -- --        | |
-     --------        -- -- --         :
-     --------        -- -- --         :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
diff --git a/kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rv_haswell_asm_d6x8.c b/kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rv_haswell_asm_d6x8.c
deleted file mode 100644
index fe61fbc31..000000000
--- a/kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rv_haswell_asm_d6x8.c
+++ /dev/null
@@ -1,11048 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-/*
-   rrr:
-	 --------        ------        --------      
-	 --------        ------        --------      
-	 --------   +=   ------ ...    --------      
-	 --------        ------        --------      
-	 --------        ------            :         
-	 --------        ------            :         
-
-   rcr:
-	 --------        | | | |       --------      
-	 --------        | | | |       --------      
-	 --------   +=   | | | | ...   --------      
-	 --------        | | | |       --------      
-	 --------        | | | |           :         
-	 --------        | | | |           :         
-
-   Assumptions:
-   - B is row-stored;
-   - A is row- or column-stored;
-   - m0 and n0 are at most MR and NR, respectively.
-   Therefore, this (r)ow-preferential kernel is well-suited for contiguous
-   (v)ector loads on B and single-element broadcasts from A.
-
-   NOTE: These kernels explicitly support column-oriented IO, implemented
-   via an in-register transpose. And thus they also support the crr and
-   ccr cases, though only crr is ever utilized (because ccr is handled by
-   transposing the operation and executing rcr, which does not incur the
-   cost of the in-register transpose).
-
-   crr:
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |  +=   ------ ...    --------      
-	 | | | | | | | |       ------        --------      
-	 | | | | | | | |       ------            :         
-	 | | | | | | | |       ------            :         
-*/
-
-// Prototype reference microkernels.
-GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
-
-// Define parameters and variables for edge case kernel map.
-#define NUM_MR 4
-#define NUM_NR 4
-#define FUNCPTR_T dgemmsup_ker_ft
-
-static dim_t mrs[NUM_MR] = { 6, 4, 2, 1 };
-static dim_t nrs[NUM_NR] = { 8, 4, 2, 1 };
-static FUNCPTR_T kmap[NUM_MR][NUM_NR] =
-{     /*  8                                4                                2                                1  */
-/* 6 */ { bli_dgemmsup_rv_haswell_asm_6x8, bli_dgemmsup_rv_haswell_asm_6x4, bli_dgemmsup_rv_haswell_asm_6x2, bli_dgemmsup_r_haswell_ref_6x1 },
-/* 4 */	{ bli_dgemmsup_rv_haswell_asm_4x8, bli_dgemmsup_rv_haswell_asm_4x4, bli_dgemmsup_rv_haswell_asm_4x2, bli_dgemmsup_r_haswell_ref_4x1 },
-/* 2 */	{ bli_dgemmsup_rv_haswell_asm_2x8, bli_dgemmsup_rv_haswell_asm_2x4, bli_dgemmsup_rv_haswell_asm_2x2, bli_dgemmsup_r_haswell_ref_2x1 },
-/* 1 */	{ bli_dgemmsup_rv_haswell_asm_1x8, bli_dgemmsup_rv_haswell_asm_1x4, bli_dgemmsup_rv_haswell_asm_1x2, bli_dgemmsup_r_haswell_ref_1x1 },
-};
-
-
-void bli_dgemmsup_rv_haswell_asm_6x8
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-
-	// Use a reference kernel if this is an edge case in the m or n
-	// dimensions.
-	if ( m0 < 6 || n0 < 8 )
-	{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-		dim_t            n_left = n0;
-		double* restrict cj     = c;
-		double* restrict bj     = b;
-
-		// Iterate across columns (corresponding to elements of nrs) until
-		// n_left is zero.
-		for ( dim_t j = 0; n_left != 0; ++j )
-		{
-			const dim_t nr_cur = nrs[ j ];
-
-			// Once we find the value of nrs that is less than (or equal to)
-			// n_left, we use the kernels in that column.
-			if ( nr_cur <= n_left )
-			{
-				dim_t            m_left = m0;
-				double* restrict cij    = cj;
-				double* restrict ai     = a;
-
-				// Iterate down the current column (corresponding to elements
-				// of mrs) until m_left is zero.
-				for ( dim_t i = 0; m_left != 0; ++i )
-				{
-					const dim_t mr_cur = mrs[ i ];
-
-					// Once we find the value of mrs that is less than (or equal
-					// to) m_left, we select that kernel.
-					if ( mr_cur <= m_left )
-					{
-						FUNCPTR_T ker_fp = kmap[i][j];
-
-						//printf( "executing %d x %d sup kernel.\n", (int)mr_cur, (int)nr_cur );
-
-						// Call the kernel using current mrs and nrs values.
-						ker_fp
-						(
-						  conja, conjb, mr_cur, nr_cur, k0,
-						  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-						  beta, cij, rs_c0, cs_c0, data, cntx
-						);
-
-						// Advance C and A pointers by the mrs and nrs we just
-						// used, and decrement m_left.
-						cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-					} 
-				}
-
-				// Advance C and B pointers by the mrs and nrs we just used, and
-				// decrement n_left.
-				cj += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-			}
-		}
-
-		return;
-	}
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 4*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 4*8))         // prefetch c + 3*cs_c
-	prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 4*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 5*cs_c
-	lea(mem(rdx, rsi, 2), rdx)         // rdx = c + 5*cs_c;
-	prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 6*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 7*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-
-
-#if 1
-	lea(mem(rax, r9,  8), rdx)         //
-	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
-#endif
-	
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-#if 1
-	prefetch(0, mem(rdx, 5*8))
-#endif
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-#if 0
-	prefetch(0, mem(rdx, r9, 1, 5*8))
-#endif
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
-	// ---------------------------------- iteration 2
-
-#if 1
-	prefetch(0, mem(rdx, r9, 2, 5*8))
-#endif
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-
-	// ---------------------------------- iteration 3
-
-#if 1
-	lea(mem(rdx, r9,  4), rdx)         // a_prefetch += 4*cs_a;
-#endif
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-	
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	vmulpd(ymm0, ymm7, ymm7)
-	vmulpd(ymm0, ymm8, ymm8)
-	vmulpd(ymm0, ymm9, ymm9)
-	vmulpd(ymm0, ymm10, ymm10)
-	vmulpd(ymm0, ymm11, ymm11)
-	vmulpd(ymm0, ymm12, ymm12)
-	vmulpd(ymm0, ymm13, ymm13)
-	vmulpd(ymm0, ymm14, ymm14)
-	vmulpd(ymm0, ymm15, ymm15)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm7)
-	vmovupd(ymm7, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm8)
-	vmovupd(ymm8, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm9)
-	vmovupd(ymm9, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm10)
-	vmovupd(ymm10, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm11)
-	vmovupd(ymm11, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm12)
-	vmovupd(ymm12, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm13)
-	vmovupd(ymm13, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm14)
-	vmovupd(ymm14, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm15)
-	vmovupd(ymm15, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
-	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
-	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)
-
-	vunpcklpd(ymm14, ymm12, ymm0)
-	vunpckhpd(ymm14, ymm12, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vfmadd231pd(mem(rdx), xmm3, xmm0)
-	vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
-	vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)
-	vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4)
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	vmovupd(xmm2, mem(rdx, rsi, 2))
-	vmovupd(xmm4, mem(rdx, rax, 1))
-
-	lea(mem(rdx, rsi, 4), rdx)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7)
-	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9)
-	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11)
-	vmovupd(ymm5, mem(rcx))
-	vmovupd(ymm7, mem(rcx, rsi, 1))
-	vmovupd(ymm9, mem(rcx, rsi, 2))
-	vmovupd(ymm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vunpcklpd(ymm15, ymm13, ymm0)
-	vunpckhpd(ymm15, ymm13, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vfmadd231pd(mem(rdx), xmm3, xmm0)
-	vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
-	vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)
-	vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4)
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	vmovupd(xmm2, mem(rdx, rsi, 2))
-	vmovupd(xmm4, mem(rdx, rax, 1))
-
-	//lea(mem(rdx, rsi, 4), rdx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-
-	vmovupd(ymm6, mem(rcx))
-	vmovupd(ymm7, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm8, mem(rcx))
-	vmovupd(ymm9, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm10, mem(rcx))
-	vmovupd(ymm11, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm12, mem(rcx))
-	vmovupd(ymm13, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm14, mem(rcx))
-	vmovupd(ymm15, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)
-
-	vunpcklpd(ymm14, ymm12, ymm0)
-	vunpckhpd(ymm14, ymm12, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	vmovupd(xmm2, mem(rdx, rsi, 2))
-	vmovupd(xmm4, mem(rdx, rax, 1))
-
-	lea(mem(rdx, rsi, 4), rdx)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vmovupd(ymm5, mem(rcx))
-	vmovupd(ymm7, mem(rcx, rsi, 1))
-	vmovupd(ymm9, mem(rcx, rsi, 2))
-	vmovupd(ymm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vunpcklpd(ymm15, ymm13, ymm0)
-	vunpckhpd(ymm15, ymm13, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	vmovupd(xmm2, mem(rdx, rsi, 2))
-	vmovupd(xmm4, mem(rdx, rax, 1))
-
-	//lea(mem(rdx, rsi, 4), rdx)
-
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_5x8
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 4*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 4*8))         // prefetch c + 3*cs_c
-	prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 4*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 5*cs_c
-	lea(mem(rdx, rsi, 2), rdx)         // rdx = c + 5*cs_c;
-	prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 6*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 7*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-	
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-	
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	vmulpd(ymm0, ymm7, ymm7)
-	vmulpd(ymm0, ymm8, ymm8)
-	vmulpd(ymm0, ymm9, ymm9)
-	vmulpd(ymm0, ymm10, ymm10)
-	vmulpd(ymm0, ymm11, ymm11)
-	vmulpd(ymm0, ymm12, ymm12)
-	vmulpd(ymm0, ymm13, ymm13)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm7)
-	vmovupd(ymm7, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm8)
-	vmovupd(ymm8, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm9)
-	vmovupd(ymm9, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm10)
-	vmovupd(ymm10, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm11)
-	vmovupd(ymm11, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm12)
-	vmovupd(ymm12, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm13)
-	vmovupd(ymm13, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
-	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
-	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)
-
-#if 0
-	vunpcklpd(ymm14, ymm12, ymm0)
-	vunpckhpd(ymm14, ymm12, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vfmadd231pd(mem(rdx), xmm3, xmm0)
-	vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
-	vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)
-	vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4)
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	vmovupd(xmm2, mem(rdx, rsi, 2))
-	vmovupd(xmm4, mem(rdx, rax, 1))
-#else
-	vmovlpd(mem(rdx), xmm0, xmm0)
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)
-	vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1)
-	vmovhpd(mem(rdx, rax, 1), xmm1, xmm1)
-	vperm2f128(imm(0x20), ymm1, ymm0, ymm0)
-
-	vfmadd213pd(ymm12, ymm3, ymm0)
-	vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rdx))
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-	vmovlpd(xmm1, mem(rdx, rsi, 2))
-	vmovhpd(xmm1, mem(rdx, rax, 1))
-#endif
-
-	lea(mem(rdx, rsi, 4), rdx)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7)
-	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9)
-	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11)
-	vmovupd(ymm5, mem(rcx))
-	vmovupd(ymm7, mem(rcx, rsi, 1))
-	vmovupd(ymm9, mem(rcx, rsi, 2))
-	vmovupd(ymm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-#if 0
-	vunpcklpd(ymm15, ymm13, ymm0)
-	vunpckhpd(ymm15, ymm13, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vfmadd231pd(mem(rdx), xmm3, xmm0)
-	vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
-	vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)
-	vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4)
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	vmovupd(xmm2, mem(rdx, rsi, 2))
-	vmovupd(xmm4, mem(rdx, rax, 1))
-#else
-	vmovlpd(mem(rdx), xmm0, xmm0)
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)
-	vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1)
-	vmovhpd(mem(rdx, rax, 1), xmm1, xmm1)
-	vperm2f128(imm(0x20), ymm1, ymm0, ymm0)
-
-	vfmadd213pd(ymm13, ymm3, ymm0)
-	vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rdx))
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-	vmovlpd(xmm1, mem(rdx, rsi, 2))
-	vmovhpd(xmm1, mem(rdx, rax, 1))
-#endif
-
-	//lea(mem(rdx, rsi, 4), rdx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-
-	vmovupd(ymm6, mem(rcx))
-	vmovupd(ymm7, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm8, mem(rcx))
-	vmovupd(ymm9, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm10, mem(rcx))
-	vmovupd(ymm11, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm12, mem(rcx))
-	vmovupd(ymm13, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)
-
-#if 0
-	vunpcklpd(ymm14, ymm12, ymm0)
-	vunpckhpd(ymm14, ymm12, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	vmovupd(xmm2, mem(rdx, rsi, 2))
-	vmovupd(xmm4, mem(rdx, rax, 1))
-#else
-	vmovupd(ymm12, ymm0)
-
-	vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rcx))
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	vmovlpd(xmm1, mem(rcx, rsi, 2))
-	vmovhpd(xmm1, mem(rcx, rax, 1))
-#endif
-
-	lea(mem(rdx, rsi, 4), rdx)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vmovupd(ymm5, mem(rcx))
-	vmovupd(ymm7, mem(rcx, rsi, 1))
-	vmovupd(ymm9, mem(rcx, rsi, 2))
-	vmovupd(ymm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-#if 0
-	vunpcklpd(ymm15, ymm13, ymm0)
-	vunpckhpd(ymm15, ymm13, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	vmovupd(xmm2, mem(rdx, rsi, 2))
-	vmovupd(xmm4, mem(rdx, rax, 1))
-#else
-	vmovupd(ymm13, ymm0)
-
-	vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rdx))
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-	vmovlpd(xmm1, mem(rdx, rsi, 2))
-	vmovhpd(xmm1, mem(rdx, rax, 1))
-#endif
-
-	//lea(mem(rdx, rsi, 4), rdx)
-
-	
-	
-	
-	label(.DDONE)
-
-
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_4x8
-     (
-       conj_t              conja,                       // not referenced in this body
-       conj_t              conjb,                       // not referenced in this body
-       dim_t               m0,                          // not referenced in this body
-       dim_t               n0,                          // not referenced in this body
-       dim_t               k0,                          // inner (k) dimension; split into k0/4 and k0%4 below
-       double*    restrict alpha,                       // pointer to the scalar that scales the accumulated A*B
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0, // A and its row/column strides, in elements
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0, // B and its strides, in elements (cs_b0 is never read by the asm)
-       double*    restrict beta,                        // pointer to the scalar that scales the existing C
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0, // C and its row/column strides, in elements
-       auxinfo_t* restrict data,                        // only mentioned in the commented-out lines below
-       cntx_t*    restrict cntx                         // not referenced in this body
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-	// Purpose: update a 4x8 tile of doubles, C := beta*C + alpha*(A*B), using AVX/FMA inline assembly.
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;          // number of passes through the 4x-unrolled main loop
-	uint64_t k_left = k0 % 4;          // leftover k iterations, handled one at a time
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-	// Accumulators: ymm4/5 = row 0, ymm6/7 = row 1, ymm8/9 = row 2, ymm10/11 = row 3 (columns 0-3 / 4-7).
-	// -------------------------------------------------------------------------
-	// NOTE(review): B rows are read as 8 contiguous doubles, so unit column stride in B appears to be assumed.
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds of allocated
-	                                   // memory (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-	// Row case: touch byte offset 7*8 (the 8th double) of each of the 4 rows.
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-	// Column case: touch byte offset 3*8 (the 4th double) of each of the 8 columns.
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 3*8))         // prefetch c + 3*cs_c
-	prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 4*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 5*cs_c
-	lea(mem(rdx, rsi, 2), rdx)         // rdx = c + 5*cs_c;
-	prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 6*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 7*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-	
-	
-	
-	// rsi is reused from here on as the loop counter; cs_c is reloaded into it after the loops.
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-	
-	vmovupd(mem(rbx, 0*32), ymm0)      // ymm0 = columns 0-3 of the current row of b
-	vmovupd(mem(rbx, 1*32), ymm1)      // ymm1 = columns 4-7 of the current row of b
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2) // broadcast a(0,k)
-	vbroadcastsd(mem(rax, r8,  1), ymm3) // broadcast a(1,k)
-	vfmadd231pd(ymm0, ymm2, ymm4)      // row 0, columns 0-3 += a(0,k) * b
-	vfmadd231pd(ymm1, ymm2, ymm5)      // row 0, columns 4-7
-	vfmadd231pd(ymm0, ymm3, ymm6)      // row 1, columns 0-3
-	vfmadd231pd(ymm1, ymm3, ymm7)      // row 1, columns 4-7
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2) // broadcast a(2,k)
-	vbroadcastsd(mem(rax, r13, 1), ymm3) // broadcast a(3,k); r13 = 3*rs_a
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)      // row 2, columns 0-3
-	vfmadd231pd(ymm1, ymm2, ymm9)      // row 2, columns 4-7
-	vfmadd231pd(ymm0, ymm3, ymm10)     // row 3, columns 0-3
-	vfmadd231pd(ymm1, ymm3, ymm11)     // row 3, columns 4-7
-	
-	
-	// ---------------------------------- iteration 1
-	// (same instruction sequence as iteration 0, applied to the next k index)
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-
-	// ---------------------------------- iteration 2
-	// (same instruction sequence as iteration 0, applied to the next k index)
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-
-	// ---------------------------------- iteration 3
-	// (same instruction sequence as iteration 0, applied to the next k index)
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	// One k index per pass; same instruction sequence as a single main-loop iteration.
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-	// rax and rbx are no longer needed as a/b cursors; reuse them for the alpha/beta pointers.
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	vmulpd(ymm0, ymm7, ymm7)
-	vmulpd(ymm0, ymm8, ymm8)
-	vmulpd(ymm0, ymm9, ymm9)
-	vmulpd(ymm0, ymm10, ymm10)
-	vmulpd(ymm0, ymm11, ymm11)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	// NOTE(review): vucomisd also sets ZF on an unordered compare, so a NaN beta appears to take the beta == 0 path - confirm intended.
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORED)
-	// Each row: acc += beta * C, then store; rcx steps down one row (rs_c) at a time.
-	// NOTE(review): each row is accessed as two contiguous 4-double vectors, so cs_c == 1 appears to be assumed here.
-	vfmadd231pd(mem(rcx), ymm3, ymm4)  // row 0, columns 0-3
-	vmovupd(ymm4, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm5) // row 0, columns 4-7 (c + 4*cs_c)
-	vmovupd(ymm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)                      // c += rs_c;
-
-
-	vfmadd231pd(mem(rcx), ymm3, ymm6)  // row 1
-	vmovupd(ymm6, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm7)
-	vmovupd(ymm7, mem(rcx, rsi, 4))
-	add(rdi, rcx)                      // c += rs_c;
-
-
-	vfmadd231pd(mem(rcx), ymm3, ymm8)  // row 2
-	vmovupd(ymm8, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm9)
-	vmovupd(ymm9, mem(rcx, rsi, 4))
-	add(rdi, rcx)                      // c += rs_c;
-
-
-	vfmadd231pd(mem(rcx), ymm3, ymm10) // row 3
-	vmovupd(ymm10, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm11)
-	vmovupd(ymm11, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-
-
-	label(.DCOLSTORED)
-
-	// Transpose the left 4x4 block (ymm4/6/8/10 hold rows) so that each register holds one column.
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)   // ymm4  = column 0
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)   // ymm6  = column 1
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)   // ymm8  = column 2
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)  // ymm10 = column 3
-
-	vbroadcastsd(mem(rbx), ymm3)       // reload beta; ymm3 was used as scratch by the transpose
-
-	vfmadd231pd(mem(rcx), ymm3, ymm4)  // column += beta * C column
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
-	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
-	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)         // c += 4*cs_c; move on to columns 4-7
-
-	// Transpose the right 4x4 block (ymm5/7/9/11) the same way.
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)   // ymm5  = column 4
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)   // ymm7  = column 5
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)   // ymm9  = column 6
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)  // ymm11 = column 7
-
-	vbroadcastsd(mem(rbx), ymm3)       // reload beta again after the second transpose
-
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7)
-	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9)
-	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11)
-	vmovupd(ymm5, mem(rcx))
-	vmovupd(ymm7, mem(rcx, rsi, 1))
-	vmovupd(ymm9, mem(rcx, rsi, 2))
-	vmovupd(ymm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-
-
-	
-	
-	label(.DBETAZERO)
-	
-	// beta == 0: C is overwritten with the alpha-scaled result and is never read.
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))            // row 0, columns 0-3
-	vmovupd(ymm5, mem(rcx, rsi, 4))    // row 0, columns 4-7
-	add(rdi, rcx)                      // c += rs_c;
-
-	vmovupd(ymm6, mem(rcx))            // row 1
-	vmovupd(ymm7, mem(rcx, rsi, 4))
-	add(rdi, rcx)                      // c += rs_c;
-
-	vmovupd(ymm8, mem(rcx))            // row 2
-	vmovupd(ymm9, mem(rcx, rsi, 4))
-	add(rdi, rcx)                      // c += rs_c;
-
-	vmovupd(ymm10, mem(rcx))           // row 3
-	vmovupd(ymm11, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-
-	// Same 4x4 transposes as in .DCOLSTORED, followed by plain column stores.
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vmovupd(ymm4, mem(rcx))            // column 0
-	vmovupd(ymm6, mem(rcx, rsi, 1))    // column 1
-	vmovupd(ymm8, mem(rcx, rsi, 2))    // column 2
-	vmovupd(ymm10, mem(rcx, rax, 1))   // column 3
-
-	lea(mem(rcx, rsi, 4), rcx)         // c += 4*cs_c;
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vmovupd(ymm5, mem(rcx))            // column 4
-	vmovupd(ymm7, mem(rcx, rsi, 1))    // column 5
-	vmovupd(ymm9, mem(rcx, rsi, 2))    // column 6
-	vmovupd(ymm11, mem(rcx, rax, 1))   // column 7
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b), // listed as an input, but its only load above is commented out
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_3x8
-     (
-       conj_t              conja,                       // not referenced in this body
-       conj_t              conjb,                       // not referenced in this body
-       dim_t               m0,                          // not referenced in this body
-       dim_t               n0,                          // not referenced in this body
-       dim_t               k0,                          // inner (k) dimension; split into k0/4 and k0%4 below
-       double*    restrict alpha,                       // pointer to the scalar that scales the accumulated A*B
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0, // A and its row/column strides, in elements
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0, // B and its strides, in elements (cs_b0 is never read by the asm)
-       double*    restrict beta,                        // pointer to the scalar that scales the existing C
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0, // C and its row/column strides, in elements
-       auxinfo_t* restrict data,                        // only mentioned in the commented-out lines below
-       cntx_t*    restrict cntx                         // not referenced in this body
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-	// Purpose: update a 3x8 tile of doubles, C := beta*C + alpha*(A*B), using AVX/FMA inline assembly.
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;          // number of passes through the 4x-unrolled main loop
-	uint64_t k_left = k0 % 4;          // leftover k iterations, handled one at a time
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-	// Accumulators: ymm4/5 = row 0, ymm6/7 = row 1, ymm8/9 = row 2 (columns 0-3 / 4-7); ymm10/11 stay zero after vzeroall.
-	// -------------------------------------------------------------------------
-	// NOTE(review): B rows are read as 8 contiguous doubles, so unit column stride in B appears to be assumed.
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds of allocated
-	                                   // memory (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-	// Row case: touch byte offset 7*8 (the 8th double) of each of the 3 rows.
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-	// Column case: touch byte offset 2*8 (the 3rd double) of each of the 8 columns.
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 2*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 2*8))         // prefetch c + 3*cs_c
-	prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 4*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 5*cs_c
-	lea(mem(rdx, rsi, 2), rdx)         // rdx = c + 5*cs_c;
-	prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 6*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 7*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-	
-	
-	
-	// rsi is reused from here on as the loop counter; cs_c is reloaded into it after the loops.
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-	
-	vmovupd(mem(rbx, 0*32), ymm0)      // ymm0 = columns 0-3 of the current row of b
-	vmovupd(mem(rbx, 1*32), ymm1)      // ymm1 = columns 4-7 of the current row of b
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2) // broadcast a(0,k)
-	vbroadcastsd(mem(rax, r8,  1), ymm3) // broadcast a(1,k)
-	vfmadd231pd(ymm0, ymm2, ymm4)      // row 0, columns 0-3 += a(0,k) * b
-	vfmadd231pd(ymm1, ymm2, ymm5)      // row 0, columns 4-7
-	vfmadd231pd(ymm0, ymm3, ymm6)      // row 1, columns 0-3
-	vfmadd231pd(ymm1, ymm3, ymm7)      // row 1, columns 4-7
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2) // broadcast a(2,k)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)      // row 2, columns 0-3
-	vfmadd231pd(ymm1, ymm2, ymm9)      // row 2, columns 4-7
-	
-	
-	// ---------------------------------- iteration 1
-	// (same instruction sequence as iteration 0, applied to the next k index)
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-
-	// ---------------------------------- iteration 2
-	// (same instruction sequence as iteration 0, applied to the next k index)
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-
-	// ---------------------------------- iteration 3
-	// (same instruction sequence as iteration 0, applied to the next k index)
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	// One k index per pass; same instruction sequence as a single main-loop iteration.
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-	// rax and rbx are no longer needed as a/b cursors; reuse them for the alpha/beta pointers.
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	vmulpd(ymm0, ymm7, ymm7)
-	vmulpd(ymm0, ymm8, ymm8)
-	vmulpd(ymm0, ymm9, ymm9)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	// NOTE(review): vucomisd also sets ZF on an unordered compare, so a NaN beta appears to take the beta == 0 path - confirm intended.
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORED)
-	// Each row: acc += beta * C, then store; rcx steps down one row (rs_c) at a time.
-	// NOTE(review): each row is accessed as two contiguous 4-double vectors, so cs_c == 1 appears to be assumed here.
-	vfmadd231pd(mem(rcx), ymm3, ymm4)  // row 0, columns 0-3
-	vmovupd(ymm4, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm5) // row 0, columns 4-7 (c + 4*cs_c)
-	vmovupd(ymm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)                      // c += rs_c;
-
-
-	vfmadd231pd(mem(rcx), ymm3, ymm6)  // row 1
-	vmovupd(ymm6, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm7)
-	vmovupd(ymm7, mem(rcx, rsi, 4))
-	add(rdi, rcx)                      // c += rs_c;
-
-
-	vfmadd231pd(mem(rcx), ymm3, ymm8)  // row 2
-	vmovupd(ymm8, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm9)
-	vmovupd(ymm9, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-
-
-	label(.DCOLSTORED)
-
-	// Transpose the left block; ymm10 is still zero, so lane 3 of every resulting column is zero padding.
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)   // ymm4  = column 0 (rows 0,1,2 + padding)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)   // ymm6  = column 1
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)   // ymm8  = column 2
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)  // ymm10 = column 3
-	// Split each column: the low xmm half holds rows 0-1; the extracted high half holds row 2 in its low lane.
-	vextractf128(imm(0x1), ymm4, xmm12)
-	vextractf128(imm(0x1), ymm6, xmm13)
-	vextractf128(imm(0x1), ymm8, xmm14)
-	vextractf128(imm(0x1), ymm10, xmm15)
-
-	vbroadcastsd(mem(rbx), ymm3)       // reload beta; ymm3 was used as scratch by the transpose
-
-	vfmadd231pd(mem(rcx), xmm3, xmm4)  // rows 0-1 of each column: acc += beta * C
-	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm6)
-	vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm8)
-	vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm10)
-	vmovupd(xmm4, mem(rcx))
-	vmovupd(xmm6, mem(rcx, rsi, 1))
-	vmovupd(xmm8, mem(rcx, rsi, 2))
-	vmovupd(xmm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)         // c += 4*cs_c; move on to columns 4-7
-
-	vfmadd231sd(mem(rdx), xmm3, xmm12) // row 2 of each column (scalar): acc += beta * C
-	vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)
-	vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14)
-	vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15)
-	vmovsd(xmm12, mem(rdx))
-	vmovsd(xmm13, mem(rdx, rsi, 1))
-	vmovsd(xmm14, mem(rdx, rsi, 2))
-	vmovsd(xmm15, mem(rdx, rax, 1))
-	
-	lea(mem(rdx, rsi, 4), rdx)         // row-2 pointer += 4*cs_c;
-
-
-	// Right block (ymm5/7/9; ymm11 is still zero): same transpose, split and update as above.
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)   // ymm5  = column 4
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)   // ymm7  = column 5
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)   // ymm9  = column 6
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)  // ymm11 = column 7
-
-	vextractf128(imm(0x1), ymm5, xmm12)
-	vextractf128(imm(0x1), ymm7, xmm13)
-	vextractf128(imm(0x1), ymm9, xmm14)
-	vextractf128(imm(0x1), ymm11, xmm15)
-
-	vbroadcastsd(mem(rbx), ymm3)       // reload beta again after the second transpose
-
-	vfmadd231pd(mem(rcx), xmm3, xmm5)
-	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm7)
-	vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm9)
-	vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm11)
-	vmovupd(xmm5, mem(rcx))
-	vmovupd(xmm7, mem(rcx, rsi, 1))
-	vmovupd(xmm9, mem(rcx, rsi, 2))
-	vmovupd(xmm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vfmadd231sd(mem(rdx), xmm3, xmm12)
-	vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)
-	vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14)
-	vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15)
-	vmovsd(xmm12, mem(rdx))
-	vmovsd(xmm13, mem(rdx, rsi, 1))
-	vmovsd(xmm14, mem(rdx, rsi, 2))
-	vmovsd(xmm15, mem(rdx, rax, 1))
-	
-	//lea(mem(rdx, rsi, 4), rdx)
-
-
-	jmp(.DDONE)                        // jump to end.
-
-
-	
-	
-	label(.DBETAZERO)
-	
-	// beta == 0: C is overwritten with the alpha-scaled result and is never read.
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))            // row 0, columns 0-3
-	vmovupd(ymm5, mem(rcx, rsi, 4))    // row 0, columns 4-7
-	add(rdi, rcx)                      // c += rs_c;
-
-	vmovupd(ymm6, mem(rcx))            // row 1
-	vmovupd(ymm7, mem(rcx, rsi, 4))
-	add(rdi, rcx)                      // c += rs_c;
-
-	vmovupd(ymm8, mem(rcx))            // row 2
-	vmovupd(ymm9, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-
-	// Same transpose and split as in .DCOLSTORED, followed by plain stores (rows 0-1 as a pair, row 2 as a scalar).
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vextractf128(imm(0x1), ymm4, xmm12)
-	vextractf128(imm(0x1), ymm6, xmm13)
-	vextractf128(imm(0x1), ymm8, xmm14)
-	vextractf128(imm(0x1), ymm10, xmm15)
-
-	vmovupd(xmm4, mem(rcx))            // rows 0-1 of columns 0-3
-	vmovupd(xmm6, mem(rcx, rsi, 1))
-	vmovupd(xmm8, mem(rcx, rsi, 2))
-	vmovupd(xmm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)         // c += 4*cs_c;
-
-	vmovsd(xmm12, mem(rdx))            // row 2 of columns 0-3
-	vmovsd(xmm13, mem(rdx, rsi, 1))
-	vmovsd(xmm14, mem(rdx, rsi, 2))
-	vmovsd(xmm15, mem(rdx, rax, 1))
-
-	lea(mem(rdx, rsi, 4), rdx)         // row-2 pointer += 4*cs_c;
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vextractf128(imm(0x1), ymm5, xmm12)
-	vextractf128(imm(0x1), ymm7, xmm13)
-	vextractf128(imm(0x1), ymm9, xmm14)
-	vextractf128(imm(0x1), ymm11, xmm15)
-
-	vmovupd(xmm5, mem(rcx))            // rows 0-1 of columns 4-7
-	vmovupd(xmm7, mem(rcx, rsi, 1))
-	vmovupd(xmm9, mem(rcx, rsi, 2))
-	vmovupd(xmm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vmovsd(xmm12, mem(rdx))            // row 2 of columns 4-7
-	vmovsd(xmm13, mem(rdx, rsi, 1))
-	vmovsd(xmm14, mem(rdx, rsi, 2))
-	vmovsd(xmm15, mem(rdx, rax, 1))
-
-	//lea(mem(rdx, rsi, 4), rdx)
-
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b), // listed as an input, but its only load above is commented out
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_2x8 // Computes the 2x8 double-precision tile C := beta*C + alpha*A*B.
-     (
-       conj_t              conja, // not referenced by this kernel
-       conj_t              conjb, // not referenced by this kernel
-       dim_t               m0, // not referenced; the tile is always 2 rows
-       dim_t               n0, // not referenced; the tile is always 8 columns
-       dim_t               k0, // inner dimension: k0/4 unrolled trips + k0%4 leftover steps
-       double*    restrict alpha, // scalar applied to the accumulated A*B product
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0, // A and its row/column strides (in elements)
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0, // B and its strides; cs_b0 is never loaded by the asm
-       double*    restrict beta, // scalar applied to C; beta == 0 skips reading C
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0, // C and its strides; rs_c0 == 1 selects the column-stored paths
-       auxinfo_t* restrict data, // only referenced by commented-out code
-       cntx_t*    restrict cntx // not referenced by this kernel
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4; // trips through the 4x-unrolled main loop
-	uint64_t k_left = k0 % 4; // single-step iterations left over afterwards
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0; // passed as an asm operand, but its load below is commented out
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds of allocated
-	                                   // memory (likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 1*8))         // prefetch c + 3*cs_c
-	prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 4*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 5*cs_c
-	lea(mem(rdx, rsi, 2), rdx)         // rdx = c + 5*cs_c;
-	prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 6*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 7*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-	// Accumulators (zeroed by vzeroall above): ymm4/ymm5 hold row 0 of the
-	// 2x8 product (columns 0-3 / 4-7) and ymm6/ymm7 hold row 1.
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	// Each step loads 8 contiguous doubles of b, broadcasts one element from
-	// each of the two rows of a, and applies a rank-1 update to the tile.
-	// ---------------------------------- iteration 0
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	// The loop below performs the k0 % 4 steps that the 4x-unrolled main
-	// loop did not cover, one step per trip.
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-
-	// rax and rbx are free to reuse: a and b are not read past this point.
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	vmulpd(ymm0, ymm7, ymm7)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	// NOTE(review): vucomisd also sets ZF when an operand is NaN, so a NaN
-	// beta appears to take the beta == 0 path below -- confirm this is intended.
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	// Each row of the tile is two 4-double vectors at c and c + 4*cs_c.
-	// NOTE(review): presumably cs_c == 1 on this path; it is not checked here.
-	vfmadd231pd(mem(rcx), ymm3, ymm4)  // ymm4 += beta * c[0, 0:3]
-	vmovupd(ymm4, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)                      // c += rs_c (advance to row 1)
-
-
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm7)
-	vmovupd(ymm7, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)                 // transpose each 2x4 half into four 2-element columns
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)        // ymm0 = ( ab00 ab10 | ab02 ab12 )
-	vunpckhpd(ymm6, ymm4, ymm1)        // ymm1 = ( ab01 ab11 | ab03 ab13 )
-	vextractf128(imm(0x1), ymm0, xmm2) // xmm2 = ( ab02 ab12 )
-	vextractf128(imm(0x1), ymm1, xmm4) // xmm4 = ( ab03 ab13 )
-
-	vfmadd231pd(mem(rcx), xmm3, xmm0)  // column j += beta * c[0:1, j]
-	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1)
-	vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm2)
-	vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm4)
-	vmovupd(xmm0, mem(rcx))
-	vmovupd(xmm1, mem(rcx, rsi, 1))
-	vmovupd(xmm2, mem(rcx, rsi, 2))
-	vmovupd(xmm4, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)         // c += 4*cs_c (advance to columns 4-7)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)        // same transpose for columns 4-7
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vfmadd231pd(mem(rcx), xmm3, xmm0)
-	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1)
-	vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm2)
-	vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm4)
-	vmovupd(xmm0, mem(rcx))
-	vmovupd(xmm1, mem(rcx, rsi, 1))
-	vmovupd(xmm2, mem(rcx, rsi, 2))
-	vmovupd(xmm4, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-
-
-	// beta == 0: the two paths below mirror the ones above but store the
-	// alpha-scaled product directly, without ever reading c.
-	label(.DBETAZERO)
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)                      // c += rs_c (advance to row 1)
-
-	vmovupd(ymm6, mem(rcx))
-	vmovupd(ymm7, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)                 // same transpose as .DCOLSTORED, store only
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vmovupd(xmm0, mem(rcx))
-	vmovupd(xmm1, mem(rcx, rsi, 1))
-	vmovupd(xmm2, mem(rcx, rsi, 2))
-	vmovupd(xmm4, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)         // c += 4*cs_c (advance to columns 4-7)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vmovupd(xmm0, mem(rcx))
-	vmovupd(xmm1, mem(rcx, rsi, 1))
-	vmovupd(xmm2, mem(rcx, rsi, 2))
-	vmovupd(xmm4, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", // NOTE(review): r11-r15 are not written by this kernel's asm
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15", // vzeroall writes every vector register
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_1x8 // Computes the 1x8 double-precision tile C := beta*C + alpha*A*B.
-     (
-       conj_t              conja, // not referenced by this kernel
-       conj_t              conjb, // not referenced by this kernel
-       dim_t               m0, // not referenced; the tile is always 1 row
-       dim_t               n0, // not referenced; the tile is always 8 columns
-       dim_t               k0, // inner dimension: k0/4 unrolled trips + k0%4 leftover steps
-       double*    restrict alpha, // scalar applied to the accumulated A*B product
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0, // A and its strides (in elements); only cs_a0 is used for addressing
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0, // B and its strides; cs_b0 is never loaded by the asm
-       double*    restrict beta, // scalar applied to C; beta == 0 skips reading C
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0, // C and its strides; rs_c0 == 1 selects the column-stored paths
-       auxinfo_t* restrict data, // only referenced by commented-out code
-       cntx_t*    restrict cntx // not referenced by this kernel
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4; // trips through the 4x-unrolled main loop
-	uint64_t k_left = k0 % 4; // single-step iterations left over afterwards
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0; // passed as an asm operand, but its load below is commented out
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a (r8 is never used as an address below)
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds of allocated
-	                                   // memory (likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 0*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 0*8))         // prefetch c + 3*cs_c
-	prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 4*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 5*cs_c
-	lea(mem(rdx, rsi, 2), rdx)         // rdx = c + 5*cs_c;
-	prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 6*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 7*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-	// Accumulators (zeroed by vzeroall above): ymm4 holds columns 0-3 and
-	// ymm5 holds columns 4-7 of the single-row product.
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	// Each step loads 8 contiguous doubles of b, broadcasts one element of
-	// a, and accumulates their product into ymm4/ymm5.
-	// ---------------------------------- iteration 0
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	// The loop below performs the k0 % 4 steps that the 4x-unrolled main
-	// loop did not cover, one step per trip.
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-
-
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	// NOTE(review): vucomisd also sets ZF when an operand is NaN, so a NaN
-	// beta appears to take the beta == 0 path below -- confirm this is intended.
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	// The row is two 4-double vectors at c and c + 4*cs_c.
-	// NOTE(review): presumably cs_c == 1 on this path; it is not checked here.
-	vfmadd231pd(mem(rcx), ymm3, ymm4)  // ymm4 += beta * c[0, 0:3]
-	vmovupd(ymm4, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)                 // one element per column: gather, update, scatter
-
-
-	vmovlpd(mem(rcx), xmm0, xmm0)      // xmm0 = ( c0 c1 )
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) // xmm1 = ( c2 c3 )
-	vmovhpd(mem(rcx, rax, 1), xmm1, xmm1)
-	vperm2f128(imm(0x20), ymm1, ymm0, ymm0) // ymm0 = ( c0 c1 c2 c3 )
-
-	vfmadd213pd(ymm4, ymm3, ymm0)      // ymm0 = beta*ymm0 + ymm4
-
-	vextractf128(imm(1), ymm0, xmm1)   // split back into two pairs for the scalar stores
-	vmovlpd(xmm0, mem(rcx))
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	vmovlpd(xmm1, mem(rcx, rsi, 2))
-	vmovhpd(xmm1, mem(rcx, rax, 1))
-	
-	lea(mem(rcx, rsi, 4), rcx)         // c += 4*cs_c (advance to columns 4-7)
-
-
-	vmovlpd(mem(rcx), xmm0, xmm0)
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1)
-	vmovhpd(mem(rcx, rax, 1), xmm1, xmm1)
-	vperm2f128(imm(0x20), ymm1, ymm0, ymm0)
-
-	vfmadd213pd(ymm5, ymm3, ymm0)      // ymm0 = beta*ymm0 + ymm5
-
-	vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rcx))
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	vmovlpd(xmm1, mem(rcx, rsi, 2))
-	vmovhpd(xmm1, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	// beta == 0: the two paths below mirror the ones above but store the
-	// alpha-scaled product directly, without ever reading c.
-	
-	label(.DBETAZERO)
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm5, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)                 // scatter one element to each of the 8 columns
-
-
-	vmovupd(ymm4, ymm0)
-
-	vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rcx))
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	vmovlpd(xmm1, mem(rcx, rsi, 2))
-	vmovhpd(xmm1, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)         // c += 4*cs_c (advance to columns 4-7)
-
-
-	vmovupd(ymm5, ymm0)
-
-	vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rcx))
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	vmovlpd(xmm1, mem(rcx, rsi, 2))
-	vmovhpd(xmm1, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-	
-
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", // NOTE(review): r11-r15 are not written by this kernel's asm
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15", // vzeroall writes every vector register
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_6x6
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 5*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 5*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 5*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 5*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 5*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 5*8))         // prefetch c + 3*cs_c
-	prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-
-
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
-	// ---------------------------------- iteration 2
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-	
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(xmm0, xmm5, xmm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	vmulpd(xmm0, xmm7, xmm7)
-	vmulpd(ymm0, ymm8, ymm8)
-	vmulpd(xmm0, xmm9, xmm9)
-	vmulpd(ymm0, ymm10, ymm10)
-	vmulpd(xmm0, xmm11, xmm11)
-	vmulpd(ymm0, ymm12, ymm12)
-	vmulpd(xmm0, xmm13, xmm13)
-	vmulpd(ymm0, ymm14, ymm14)
-	vmulpd(xmm0, xmm15, xmm15)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm5)
-	vmovupd(xmm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm7)
-	vmovupd(xmm7, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm8)
-	vmovupd(ymm8, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm9)
-	vmovupd(xmm9, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm10)
-	vmovupd(ymm10, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm11)
-	vmovupd(xmm11, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm12)
-	vmovupd(ymm12, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm13)
-	vmovupd(xmm13, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm14)
-	vmovupd(ymm14, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm15)
-	vmovupd(xmm15, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
-	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
-	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)
-
-	vunpcklpd(ymm14, ymm12, ymm0)
-	vunpckhpd(ymm14, ymm12, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vfmadd231pd(mem(rdx), xmm3, xmm0)
-	vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
-	vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)
-	vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4)
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	vmovupd(xmm2, mem(rdx, rsi, 2))
-	vmovupd(xmm4, mem(rdx, rax, 1))
-
-	lea(mem(rdx, rsi, 4), rdx)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	//vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	//vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7)
-	//vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9)
-	//vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11)
-	vmovupd(ymm5, mem(rcx))
-	vmovupd(ymm7, mem(rcx, rsi, 1))
-	//vmovupd(ymm9, mem(rcx, rsi, 2))
-	//vmovupd(ymm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vunpcklpd(ymm15, ymm13, ymm0)
-	vunpckhpd(ymm15, ymm13, ymm1)
-	//vextractf128(imm(0x1), ymm0, xmm2)
-	//vextractf128(imm(0x1), ymm1, xmm4)
-
-	vfmadd231pd(mem(rdx), xmm3, xmm0)
-	vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
-	//vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)
-	//vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4)
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	//vmovupd(xmm2, mem(rdx, rsi, 2))
-	//vmovupd(xmm4, mem(rdx, rax, 1))
-
-	//lea(mem(rdx, rsi, 4), rdx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(xmm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-
-	vmovupd(ymm6, mem(rcx))
-	vmovupd(xmm7, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm8, mem(rcx))
-	vmovupd(xmm9, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm10, mem(rcx))
-	vmovupd(xmm11, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm12, mem(rcx))
-	vmovupd(xmm13, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm14, mem(rcx))
-	vmovupd(xmm15, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)
-
-	vunpcklpd(ymm14, ymm12, ymm0)
-	vunpckhpd(ymm14, ymm12, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	vmovupd(xmm2, mem(rdx, rsi, 2))
-	vmovupd(xmm4, mem(rdx, rax, 1))
-
-	lea(mem(rdx, rsi, 4), rdx)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	//vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	//vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vmovupd(ymm5, mem(rcx))
-	vmovupd(ymm7, mem(rcx, rsi, 1))
-	//vmovupd(ymm9, mem(rcx, rsi, 2))
-	//vmovupd(ymm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vunpcklpd(ymm15, ymm13, ymm0)
-	vunpckhpd(ymm15, ymm13, ymm1)
-	//vextractf128(imm(0x1), ymm0, xmm2)
-	//vextractf128(imm(0x1), ymm1, xmm4)
-
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	//vmovupd(xmm2, mem(rdx, rsi, 2))
-	//vmovupd(xmm4, mem(rdx, rax, 1))
-
-	//lea(mem(rdx, rsi, 4), rdx)
-
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_5x6
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 5*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 5*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 5*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 4*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 4*8))         // prefetch c + 3*cs_c
-	prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 4*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 5*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-
-
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-	
-	// ---------------------------------- iteration 2
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-	
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(xmm0, xmm5, xmm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	vmulpd(xmm0, xmm7, xmm7)
-	vmulpd(ymm0, ymm8, ymm8)
-	vmulpd(xmm0, xmm9, xmm9)
-	vmulpd(ymm0, ymm10, ymm10)
-	vmulpd(xmm0, xmm11, xmm11)
-	vmulpd(ymm0, ymm12, ymm12)
-	vmulpd(xmm0, xmm13, xmm13)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm5)
-	vmovupd(xmm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm7)
-	vmovupd(xmm7, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm8)
-	vmovupd(ymm8, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm9)
-	vmovupd(xmm9, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm10)
-	vmovupd(ymm10, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm11)
-	vmovupd(xmm11, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm12)
-	vmovupd(ymm12, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm13)
-	vmovupd(xmm13, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
-	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
-	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)
-
-#if 0
-	vunpcklpd(ymm14, ymm12, ymm0)
-	vunpckhpd(ymm14, ymm12, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vfmadd231pd(mem(rdx), xmm3, xmm0)
-	vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
-	vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)
-	vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4)
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	vmovupd(xmm2, mem(rdx, rsi, 2))
-	vmovupd(xmm4, mem(rdx, rax, 1))
-#else
-	vmovlpd(mem(rdx), xmm0, xmm0)
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)
-	vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1)
-	vmovhpd(mem(rdx, rax, 1), xmm1, xmm1)
-	vperm2f128(imm(0x20), ymm1, ymm0, ymm0)
-
-	vfmadd213pd(ymm12, ymm3, ymm0)
-	vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rdx))
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-	vmovlpd(xmm1, mem(rdx, rsi, 2))
-	vmovhpd(xmm1, mem(rdx, rax, 1))
-#endif
-
-	lea(mem(rdx, rsi, 4), rdx)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	//vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	//vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7)
-	//vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9)
-	//vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11)
-	vmovupd(ymm5, mem(rcx))
-	vmovupd(ymm7, mem(rcx, rsi, 1))
-	//vmovupd(ymm9, mem(rcx, rsi, 2))
-	//vmovupd(ymm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-#if 0
-	vunpcklpd(ymm15, ymm13, ymm0)
-	vunpckhpd(ymm15, ymm13, ymm1)
-	//vextractf128(imm(0x1), ymm0, xmm2)
-	//vextractf128(imm(0x1), ymm1, xmm4)
-
-	vfmadd231pd(mem(rdx), xmm3, xmm0)
-	vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
-	//vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)
-	//vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4)
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	//vmovupd(xmm2, mem(rdx, rsi, 2))
-	//vmovupd(xmm4, mem(rdx, rax, 1))
-#else
-	vmovlpd(mem(rdx), xmm0, xmm0)
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)
-	//vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1)
-	//vmovhpd(mem(rdx, rax, 1), xmm1, xmm1)
-	//vperm2f128(imm(0x20), ymm1, ymm0, ymm0)
-
-	vfmadd213pd(xmm13, xmm3, xmm0)
-	//vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rdx))
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-	//vmovlpd(xmm1, mem(rdx, rsi, 2))
-	//vmovhpd(xmm1, mem(rdx, rax, 1))
-#endif
-
-	//lea(mem(rdx, rsi, 4), rdx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(xmm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-
-	vmovupd(ymm6, mem(rcx))
-	vmovupd(xmm7, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm8, mem(rcx))
-	vmovupd(xmm9, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm10, mem(rcx))
-	vmovupd(xmm11, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm12, mem(rcx))
-	vmovupd(xmm13, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)
-
-#if 0
-	vunpcklpd(ymm14, ymm12, ymm0)
-	vunpckhpd(ymm14, ymm12, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	vmovupd(xmm2, mem(rdx, rsi, 2))
-	vmovupd(xmm4, mem(rdx, rax, 1))
-#else
-	vmovupd(ymm12, ymm0)
-
-	vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rdx))
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-	vmovlpd(xmm1, mem(rdx, rsi, 2))
-	vmovhpd(xmm1, mem(rdx, rax, 1))
-#endif
-
-	lea(mem(rdx, rsi, 4), rdx)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	//vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	//vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vmovupd(ymm5, mem(rcx))
-	vmovupd(ymm7, mem(rcx, rsi, 1))
-	//vmovupd(ymm9, mem(rcx, rsi, 2))
-	//vmovupd(ymm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-#if 0
-	vunpcklpd(ymm15, ymm13, ymm0)
-	vunpckhpd(ymm15, ymm13, ymm1)
-	//vextractf128(imm(0x1), ymm0, xmm2)
-	//vextractf128(imm(0x1), ymm1, xmm4)
-
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	//vmovupd(xmm2, mem(rdx, rsi, 2))
-	//vmovupd(xmm4, mem(rdx, rax, 1))
-#else
-	vmovupd(ymm13, ymm0)
-
-	//vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rdx))
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-	//vmovlpd(xmm1, mem(rdx, rsi, 2))
-	//vmovhpd(xmm1, mem(rdx, rax, 1))
-#endif
-
-	//lea(mem(rdx, rsi, 4), rdx)
-
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_4x6
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 5*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 5*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 5*8))         // prefetch c + 3*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 3*8))         // prefetch c + 3*cs_c
-	prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 4*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 5*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-
-
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	
-	// ---------------------------------- iteration 2
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-	
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(xmm0, xmm5, xmm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	vmulpd(xmm0, xmm7, xmm7)
-	vmulpd(ymm0, ymm8, ymm8)
-	vmulpd(xmm0, xmm9, xmm9)
-	vmulpd(ymm0, ymm10, ymm10)
-	vmulpd(xmm0, xmm11, xmm11)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm5)
-	vmovupd(xmm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm7)
-	vmovupd(xmm7, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm8)
-	vmovupd(ymm8, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm9)
-	vmovupd(xmm9, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm10)
-	vmovupd(ymm10, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm11)
-	vmovupd(xmm11, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
-	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
-	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	//vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	//vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7)
-	//vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9)
-	//vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11)
-	vmovupd(ymm5, mem(rcx))
-	vmovupd(ymm7, mem(rcx, rsi, 1))
-	//vmovupd(ymm9, mem(rcx, rsi, 2))
-	//vmovupd(ymm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(xmm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-
-	vmovupd(ymm6, mem(rcx))
-	vmovupd(xmm7, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm8, mem(rcx))
-	vmovupd(xmm9, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm10, mem(rcx))
-	vmovupd(xmm11, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	//vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	//vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vmovupd(ymm5, mem(rcx))
-	vmovupd(ymm7, mem(rcx, rsi, 1))
-	//vmovupd(ymm9, mem(rcx, rsi, 2))
-	//vmovupd(ymm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	
-	
-	
-	label(.DDONE)
-
-
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_3x6
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 5*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 5*8)) // prefetch c + 2*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 2*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 2*8))         // prefetch c + 3*cs_c
-	prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 4*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 5*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-	
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-	
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(xmm0, xmm5, xmm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	vmulpd(xmm0, xmm7, xmm7)
-	vmulpd(ymm0, ymm8, ymm8)
-	vmulpd(xmm0, xmm9, xmm9)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm5)
-	vmovupd(xmm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-
-
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm7)
-	vmovupd(xmm7, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-
-
-	vfmadd231pd(mem(rcx), ymm3, ymm8)
-	vmovupd(ymm8, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm9)
-	vmovupd(xmm9, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-
-
-	label(.DCOLSTORED)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vextractf128(imm(0x1), ymm4, xmm12)
-	vextractf128(imm(0x1), ymm6, xmm13)
-	vextractf128(imm(0x1), ymm8, xmm14)
-	vextractf128(imm(0x1), ymm10, xmm15)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm6)
-	vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm8)
-	vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm10)
-	vmovupd(xmm4, mem(rcx))
-	vmovupd(xmm6, mem(rcx, rsi, 1))
-	vmovupd(xmm8, mem(rcx, rsi, 2))
-	vmovupd(xmm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)
-
-	vfmadd231sd(mem(rdx), xmm3, xmm12)
-	vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)
-	vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14)
-	vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15)
-	vmovsd(xmm12, mem(rdx))
-	vmovsd(xmm13, mem(rdx, rsi, 1))
-	vmovsd(xmm14, mem(rdx, rsi, 2))
-	vmovsd(xmm15, mem(rdx, rax, 1))
-	
-	lea(mem(rdx, rsi, 4), rdx)
-
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vextractf128(imm(0x1), ymm5, xmm12)
-	vextractf128(imm(0x1), ymm7, xmm13)
-	//vextractf128(imm(0x1), ymm9, xmm14)
-	//vextractf128(imm(0x1), ymm11, xmm15)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), xmm3, xmm5)
-	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm7)
-	vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm9)
-	vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm11)
-	vmovupd(xmm5, mem(rcx))
-	vmovupd(xmm7, mem(rcx, rsi, 1))
-	//vmovupd(xmm9, mem(rcx, rsi, 2))
-	//vmovupd(xmm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vfmadd231sd(mem(rdx), xmm3, xmm12)
-	vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)
-	//vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14)
-	//vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15)
-	vmovsd(xmm12, mem(rdx))
-	vmovsd(xmm13, mem(rdx, rsi, 1))
-	//vmovsd(xmm14, mem(rdx, rsi, 2))
-	//vmovsd(xmm15, mem(rdx, rax, 1))
-	
-	//lea(mem(rdx, rsi, 4), rdx)
-
-
-	jmp(.DDONE)                        // jump to end.
-
-
-	
-	
-	label(.DBETAZERO)
-	
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(xmm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-
-	vmovupd(ymm6, mem(rcx))
-	vmovupd(xmm7, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-
-	vmovupd(ymm8, mem(rcx))
-	vmovupd(xmm9, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vextractf128(imm(0x1), ymm4, xmm12)
-	vextractf128(imm(0x1), ymm6, xmm13)
-	vextractf128(imm(0x1), ymm8, xmm14)
-	vextractf128(imm(0x1), ymm10, xmm15)
-
-	vmovupd(xmm4, mem(rcx))
-	vmovupd(xmm6, mem(rcx, rsi, 1))
-	vmovupd(xmm8, mem(rcx, rsi, 2))
-	vmovupd(xmm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)
-
-	vmovsd(xmm12, mem(rdx))
-	vmovsd(xmm13, mem(rdx, rsi, 1))
-	vmovsd(xmm14, mem(rdx, rsi, 2))
-	vmovsd(xmm15, mem(rdx, rax, 1))
-
-	lea(mem(rdx, rsi, 4), rdx)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vextractf128(imm(0x1), ymm5, xmm12)
-	vextractf128(imm(0x1), ymm7, xmm13)
-	//vextractf128(imm(0x1), ymm9, xmm14)
-	//vextractf128(imm(0x1), ymm11, xmm15)
-
-	vmovupd(xmm5, mem(rcx))
-	vmovupd(xmm7, mem(rcx, rsi, 1))
-	//vmovupd(xmm9, mem(rcx, rsi, 2))
-	//vmovupd(xmm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vmovsd(xmm12, mem(rdx))
-	vmovsd(xmm13, mem(rdx, rsi, 1))
-	//vmovsd(xmm14, mem(rdx, rsi, 2))
-	//vmovsd(xmm15, mem(rdx, rax, 1))
-
-	//lea(mem(rdx, rsi, 4), rdx)
-
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_2x6
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 5*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 1*8))         // prefetch c + 3*cs_c
-	prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 4*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 5*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-
-
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	
-	// ---------------------------------- iteration 2
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-	
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(xmm0, xmm5, xmm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	vmulpd(xmm0, xmm7, xmm7)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm5)
-	vmovupd(xmm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm7)
-	vmovupd(xmm7, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm6)
-	vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm8)
-	vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm10)
-	vmovupd(xmm4, mem(rcx))
-	vmovupd(xmm6, mem(rcx, rsi, 1))
-	vmovupd(xmm8, mem(rcx, rsi, 2))
-	vmovupd(xmm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	//vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	//vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), xmm3, xmm5)
-	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm7)
-	//vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9)
-	//vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11)
-	vmovupd(xmm5, mem(rcx))
-	vmovupd(xmm7, mem(rcx, rsi, 1))
-	//vmovupd(ymm9, mem(rcx, rsi, 2))
-	//vmovupd(ymm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(xmm5, mem(rcx, rsi, 4))
-	add(rdi, rcx)
-	
-
-	vmovupd(ymm6, mem(rcx))
-	vmovupd(xmm7, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vmovupd(xmm4, mem(rcx))
-	vmovupd(xmm6, mem(rcx, rsi, 1))
-	vmovupd(xmm8, mem(rcx, rsi, 2))
-	vmovupd(xmm10, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)
-
-
-	vunpcklpd(ymm7, ymm5, ymm0)
-	vunpckhpd(ymm7, ymm5, ymm1)
-	vunpcklpd(ymm11, ymm9, ymm2)
-	vunpckhpd(ymm11, ymm9, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
-	//vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
-	//vperm2f128(imm(0x31), ymm3, ymm1, ymm11)
-
-	vmovupd(xmm5, mem(rcx))
-	vmovupd(xmm7, mem(rcx, rsi, 1))
-	//vmovupd(ymm9, mem(rcx, rsi, 2))
-	//vmovupd(ymm11, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	
-	
-	
-	label(.DDONE)
-
-
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-/*
- * Double-precision kernel, written with the inline-assembly macros, that
- * updates a 1x6 block of c:
- *
- *   c := beta * c + alpha * a * b
- *
- * The product is accumulated over k0 steps. Each step broadcasts one
- * element of a, multiplies it by six contiguous doubles of b, and adds the
- * result to the accumulators ymm4 (columns 0-3) and xmm5 (columns 4-5).
- *
- * Parameters read by the body:
- *   k0            - number of accumulation steps; handled as k0/4 passes of
- *                   a 4x-unrolled loop followed by k0%4 single steps.
- *   alpha, beta   - addresses of the two scalars.
- *   a, cs_a0      - a is advanced by cs_a0 elements after every step.
- *   b, rs_b0      - b is advanced by rs_b0 elements after every step.
- *   c, rs_c0,     - output block. rs_c0 == 1 selects the column-storage
- *   cs_c0           path (one element every cs_c0 doubles); any other
- *                   value selects the row-storage path, which accesses
- *                   four doubles at c and two doubles at c + 4*cs_c0.
- *                   NOTE(review): the row path presumably expects
- *                   cs_c0 == 1 -- confirm against the caller.
- *
- * conja, conjb, m0, n0, data and cntx are not referenced. rs_a0 and cs_b0
- * are copied to locals (rs_a is also loaded into r8) but play no part in
- * the computation.
- *
- * If beta compares equal to zero, c is written without being read.
- */
-void bli_dgemmsup_rv_haswell_asm_1x6
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;          // passes through the 4x-unrolled loop
-	uint64_t k_left = k0 % 4;          // leftover single steps
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a (r8 is not used below)
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-	
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds of allocated
-	                                   // memory (the likely result: a segmentation
-	                                   // fault). For the same reason, only the single
-	                                   // row of a that this kernel consumes is read:
-	                                   // nothing at a + rs_a is ever loaded.
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-#if 1
-	prefetch(0, mem(rcx, 5*8))         // prefetch c + 0*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 5*8))         // prefetch c + 0*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 0*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 0*8))         // prefetch c + 3*cs_c
-	prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 4*cs_c
-	prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 5*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-
-
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx,  0*32), ymm0)     // ymm0 = four doubles of b (columns 0-3)
-	vmovupd(mem(rbx,  1*32), xmm1)     // xmm1 = two doubles of b (columns 4-5)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2) // ymm2 = current element of a, all lanes
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)      // ymm4 += ymm2 * ymm0
-	vfmadd231pd(ymm1, ymm2, ymm5)      // ymm5 += ymm2 * ymm1
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-	
-	// ---------------------------------- iteration 2
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), xmm1)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-	
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(xmm0, xmm5, xmm5)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)  // ymm4 += beta * c[0:3]
-	vmovupd(ymm4, mem(rcx))
-
-	vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm5) // xmm5 += beta * (two doubles at c + 4*cs_c)
-	vmovupd(xmm5, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-	                                   // gather columns 0-3 of c, one element
-	                                   // per column, into ymm0.
-	vmovlpd(mem(rcx), xmm0, xmm0)
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1)
-	vmovhpd(mem(rcx, rax, 1), xmm1, xmm1)
-	vperm2f128(imm(0x20), ymm1, ymm0, ymm0)
-
-	vfmadd213pd(ymm4, ymm3, ymm0)      // ymm0 = beta * ymm0 + ymm4
-
-	vextractf128(imm(1), ymm0, xmm1)   // scatter the result back to columns 0-3.
-	vmovlpd(xmm0, mem(rcx))
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	vmovlpd(xmm1, mem(rcx, rsi, 2))
-	vmovhpd(xmm1, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)         // rcx = c + 4*cs_c;
-
-	                                   // same gather / update / scatter for
-	                                   // columns 4-5.
-	vmovlpd(mem(rcx), xmm0, xmm0)
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
-	//vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1)
-	//vmovhpd(mem(rcx, rax, 1), xmm1, xmm1)
-	//vperm2f128(imm(0x20), ymm1, ymm0, ymm0)
-
-	vfmadd213pd(xmm5, xmm3, xmm0)      // xmm0 = beta * xmm0 + xmm5
-
-	//vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rcx))
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	//vmovlpd(xmm1, mem(rcx, rsi, 2))
-	//vmovhpd(xmm1, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))            // store without reading c.
-	vmovupd(xmm5, mem(rcx, rsi, 4))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-
-
-	vmovupd(ymm4, ymm0)
-
-	vextractf128(imm(1), ymm0, xmm1)   // scatter columns 0-3.
-	vmovlpd(xmm0, mem(rcx))
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	vmovlpd(xmm1, mem(rcx, rsi, 2))
-	vmovhpd(xmm1, mem(rcx, rax, 1))
-
-	lea(mem(rcx, rsi, 4), rcx)         // rcx = c + 4*cs_c;
-
-
-	vmovupd(xmm5, xmm0)
-
-	//vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rcx))            // scatter columns 4-5.
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	//vmovlpd(xmm1, mem(rcx, rsi, 2))
-	//vmovhpd(xmm1, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	
-	
-	
-	label(.DDONE)
-
-
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_6x4
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-	/*
-	 * Double-precision kernel, written with the inline-assembly macros,
-	 * that updates a 6x4 block of c:
-	 *
-	 *   c := beta * c + alpha * a * b
-	 *
-	 * The product is accumulated over k0 steps. Each step loads four
-	 * contiguous doubles of b, broadcasts six elements of a (spaced rs_a0
-	 * apart) and accumulates one row of the block per register:
-	 * ymm4, ymm6, ymm8, ymm10, ymm12, ymm14 hold rows 0..5.
-	 *
-	 * Parameters read by the body:
-	 *   k0             - number of accumulation steps (k0/4 unrolled
-	 *                    passes plus k0%4 single steps).
-	 *   alpha, beta    - addresses of the two scalars.
-	 *   a, rs_a0, cs_a0 - a is advanced by cs_a0 elements per step.
-	 *   b, rs_b0       - b is advanced by rs_b0 elements per step.
-	 *   c, rs_c0, cs_c0 - rs_c0 == 1 selects the column-storage path;
-	 *                    any other value selects the row-storage path,
-	 *                    which accesses four contiguous doubles per row.
-	 *                    NOTE(review): the row path presumably expects
-	 *                    cs_c0 == 1 -- confirm against the caller.
-	 *
-	 * conja, conjb, m0, n0, data and cntx are not referenced; cs_b0 is
-	 * passed to the asm block as an operand but never read by it.
-	 *
-	 * If beta compares equal to zero, c is written without being read.
-	 */
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;          // passes through the 4x-unrolled loop
-	uint64_t k_left = k0 % 4;          // leftover single steps
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds of allocated
-	                                   // memory (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 3*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(rdx, rdi, 1, 3*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(rdx, rdi, 2, 3*8)) // prefetch c + 5*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 5*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 5*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 5*8))         // prefetch c + 3*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-	
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-	
-	vmovupd(mem(rbx,  0*32), ymm0)     // ymm0 = four contiguous doubles of b
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2) // element of a for row 0
-	vbroadcastsd(mem(rax, r8,  1), ymm3) // element of a for row 1
-	vfmadd231pd(ymm0, ymm2, ymm4)      // row 0 accumulator += ymm2 * ymm0
-	vfmadd231pd(ymm0, ymm3, ymm6)      // row 1 accumulator += ymm3 * ymm0
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2) // rows 2 and 3
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2) // rows 4 and 5
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm6, ymm6)
-	vmulpd(ymm0, ymm8, ymm8)
-	vmulpd(ymm0, ymm10, ymm10)
-	vmulpd(ymm0, ymm12, ymm12)
-	vmulpd(ymm0, ymm14, ymm14)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	
-	// Each row: accumulator += beta * (row of c), store, step to next row.
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)                      // c += rs_c;
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm8)
-	vmovupd(ymm8, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm10)
-	vmovupd(ymm10, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm12)
-	vmovupd(ymm12, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm14)
-	vmovupd(ymm14, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-	// Transpose rows 0-3 so that ymm4/6/8/10 each hold one column of c.
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vbroadcastsd(mem(rbx), ymm3)       // reload beta (ymm3 was used as scratch above)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
-	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
-	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vunpcklpd(ymm14, ymm12, ymm0)      // rows 4-5: pair up the two rows per column
-	vunpckhpd(ymm14, ymm12, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2) // xmm0..xmm2, xmm4 = columns 0..3 of rows 4-5
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vfmadd231pd(mem(rdx), xmm3, xmm0)  // rdx = c + 4*rs_c (set before the branch)
-	vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
-	vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)
-	vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4)
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	vmovupd(xmm2, mem(rdx, rsi, 2))
-	vmovupd(xmm4, mem(rdx, rax, 1))
-
-	//lea(mem(rdx, rsi, 4), rdx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	// Store each row accumulator directly; c is not read.
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)                      // c += rs_c;
-	
-	vmovupd(ymm6, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm10, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm12, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm14, mem(rcx))
-	//add(rdi, rcx)
-
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-
-	// Same transposition as the beta != 0 column path, without reading c.
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vunpcklpd(ymm14, ymm12, ymm0)      // rows 4-5, two elements per column
-	vunpckhpd(ymm14, ymm12, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vmovupd(xmm0, mem(rdx))            // rdx = c + 4*rs_c
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	vmovupd(xmm2, mem(rdx, rsi, 2))
-	vmovupd(xmm4, mem(rdx, rax, 1))
-
-	//lea(mem(rdx, rsi, 4), rdx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_5x4
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 3*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(rdx, rdi, 1, 3*8)) // prefetch c + 4*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 4*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 4*8))         // prefetch c + 3*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-
-
-
-#if 0
-	lea(mem(rax, r9,  8), rdx)         // use rdx for prefetching b.
-	lea(mem(rdx, r9,  8), rdx)         // rdx = b + 16*rs_b;
-#else
-	#if 1
-	mov(r9, rsi)                       // rsi = rs_b;
-	sal(imm(5), rsi)                   // rsi = 16*rs_b;
-	lea(mem(rax, rsi, 1), rdx)         // rdx = b + 16*rs_b;
-	#endif
-#endif
-	
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	
-
-	// ---------------------------------- iteration 2
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm6, ymm6)
-	vmulpd(ymm0, ymm8, ymm8)
-	vmulpd(ymm0, ymm10, ymm10)
-	vmulpd(ymm0, ymm12, ymm12)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm8)
-	vmovupd(ymm8, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm10)
-	vmovupd(ymm10, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm12)
-	vmovupd(ymm12, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
-	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
-	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-#if 0
-	vunpcklpd(ymm14, ymm12, ymm0)
-	vunpckhpd(ymm14, ymm12, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vfmadd231pd(mem(rdx), xmm3, xmm0)
-	vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
-	vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)
-	vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4)
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	vmovupd(xmm2, mem(rdx, rsi, 2))
-	vmovupd(xmm4, mem(rdx, rax, 1))
-#else
-	vmovlpd(mem(rdx), xmm0, xmm0)
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)
-	vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1)
-	vmovhpd(mem(rdx, rax, 1), xmm1, xmm1)
-	vperm2f128(imm(0x20), ymm1, ymm0, ymm0)
-
-	vfmadd213pd(ymm12, ymm3, ymm0)
-	vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rdx))
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-	vmovlpd(xmm1, mem(rdx, rsi, 2))
-	vmovhpd(xmm1, mem(rdx, rax, 1))
-#endif
-
-	//lea(mem(rdx, rsi, 4), rdx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm10, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm12, mem(rcx))
-	//add(rdi, rcx)
-
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-#if 0
-	vunpcklpd(ymm14, ymm12, ymm0)
-	vunpckhpd(ymm14, ymm12, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-	vmovupd(xmm2, mem(rdx, rsi, 2))
-	vmovupd(xmm4, mem(rdx, rax, 1))
-#else
-	vmovupd(ymm12, ymm0)
-
-	vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rdx))
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-	vmovlpd(xmm1, mem(rdx, rsi, 2))
-	vmovhpd(xmm1, mem(rdx, rax, 1))
-#endif
-
-	//lea(mem(rdx, rsi, 4), rdx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_4x4 // c := beta*c + alpha*(a*b) on a 4x4 block of doubles
-     (
-       conj_t              conja, // not referenced in this body
-       conj_t              conjb, // not referenced in this body
-       dim_t               m0,    // not referenced in this body
-       dim_t               n0,    // not referenced in this body
-       dim_t               k0,    // inner dimension; split into k0/4 unrolled passes plus k0%4 leftovers
-       double*    restrict alpha, // pointer to the scalar applied to a*b
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0, // a and its row/column strides, in elements
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0, // b and its strides; only rs_b is read, each row of b is loaded as 4 contiguous doubles
-       double*    restrict beta,  // pointer to the scalar applied to c; c is not loaded when *beta == 0
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0, // c and its strides, in elements; rs_c == 1 selects the column-stored path
-       auxinfo_t* restrict data,  // only referenced by the commented-out a_next/b_next lines
-       cntx_t*    restrict cntx   // not referenced in this body
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds of allocated
-	                                   // memory (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 3*8))         // prefetch c + 3*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 3*8))         // prefetch c + 3*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 3*8))         // prefetch c + 3*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-	
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	// Each iteration loads one row of b (4 doubles) into ymm0, broadcasts one
-	// element from each of 4 rows of a, and accumulates into ymm4/6/8/10.
-	// ---------------------------------- iteration 0
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx, 0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm6, ymm6)
-	vmulpd(ymm0, ymm8, ymm8)
-	vmulpd(ymm0, ymm10, ymm10)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	//lea(mem(rcx, rdi, 4), r14)         // load address of c +  4*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-
-
-
-	label(.DROWSTORED)
-	// Row-stored c: each accumulator holds one row of the result.
-	// For each row: ymmN += beta * c_row, store, then advance rcx by rs_c.
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm8)
-	vmovupd(ymm8, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm10)
-	vmovupd(ymm10, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-	// Transpose the 4x4 accumulators so ymm4/6/8/10 each hold one column of c.
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vbroadcastsd(mem(rbx), ymm3)       // reload beta; ymm3 was overwritten by the transpose
-
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
-	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
-	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-
-	// beta == 0: store alpha*(a*b) without reading c.
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vmovupd(ymm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm10, mem(rcx))
-	//add(rdi, rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-	
-	// Same transpose as the beta != 0 column-stored path, followed by plain stores.
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-	vmovupd(ymm8, mem(rcx, rsi, 2))
-	vmovupd(ymm10, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b), // passed in, but the only load of it above is commented out
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_3x4 // c := beta*c + alpha*(a*b) on a 3x4 block of doubles
-     (
-       conj_t              conja, // not referenced in this body
-       conj_t              conjb, // not referenced in this body
-       dim_t               m0,    // not referenced in this body
-       dim_t               n0,    // not referenced in this body
-       dim_t               k0,    // inner dimension; split into k0/4 unrolled passes plus k0%4 leftovers
-       double*    restrict alpha, // pointer to the scalar applied to a*b
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0, // a and its row/column strides, in elements
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0, // b and its strides; only rs_b is read, each row of b is loaded as 4 contiguous doubles
-       double*    restrict beta,  // pointer to the scalar applied to c; c is not loaded when *beta == 0
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0, // c and its strides, in elements; rs_c == 1 selects the column-stored path
-       auxinfo_t* restrict data,  // only referenced by the commented-out a_next/b_next lines
-       cntx_t*    restrict cntx   // not referenced in this body
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds of allocated
-	                                   // memory (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 2*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 2*8))         // prefetch c + 3*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-	
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	// Each iteration loads one row of b (4 doubles) into ymm0, broadcasts one
-	// element from each of 3 rows of a, and accumulates into ymm4/6/8.
-	// ---------------------------------- iteration 0
-	
-	vmovupd(mem(rbx, 0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx, 0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx, 0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm6, ymm6)
-	vmulpd(ymm0, ymm8, ymm8)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORED)
-	// Row-stored c: each accumulator holds one row of the result.
-	// For each row: ymmN += beta * c_row, store, then advance rcx by rs_c.
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-
-
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	add(rdi, rcx)
-
-
-	vfmadd231pd(mem(rcx), ymm3, ymm8)
-	vmovupd(ymm8, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-
-
-	label(.DCOLSTORED)
-
-	// Transpose the accumulators (ymm10 is still zero from vzeroall) so ymm4/6/8/10 each hold one column.
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-	// Copy the upper half of each column into xmm12-15; the low element is row 2.
-	vextractf128(imm(0x1), ymm4, xmm12)
-	vextractf128(imm(0x1), ymm6, xmm13)
-	vextractf128(imm(0x1), ymm8, xmm14)
-	vextractf128(imm(0x1), ymm10, xmm15)
-
-	vbroadcastsd(mem(rbx), ymm3)       // reload beta; ymm3 was overwritten by the transpose
-	// Rows 0-1 of each column: 128-bit update with beta*c, then store.
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm6)
-	vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm8)
-	vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm10)
-	vmovupd(xmm4, mem(rcx))
-	vmovupd(xmm6, mem(rcx, rsi, 1))
-	vmovupd(xmm8, mem(rcx, rsi, 2))
-	vmovupd(xmm10, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-	// Row 2 of each column (rdx = c + 2*rs_c): scalar update with beta*c, then store.
-	vfmadd231sd(mem(rdx), xmm3, xmm12)
-	vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)
-	vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14)
-	vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15)
-	vmovsd(xmm12, mem(rdx))
-	vmovsd(xmm13, mem(rdx, rsi, 1))
-	vmovsd(xmm14, mem(rdx, rsi, 2))
-	vmovsd(xmm15, mem(rdx, rax, 1))
-	
-	//lea(mem(rdx, rsi, 4), rdx)
-
-
-	jmp(.DDONE)                        // jump to end.
-
-
-	
-	
-	label(.DBETAZERO)
-	
-	// beta == 0: store alpha*(a*b) without reading c.
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-
-	vmovupd(ymm6, mem(rcx))
-	add(rdi, rcx)
-
-	vmovupd(ymm8, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-
-	// Same transpose and row-2 extraction as the beta != 0 column-stored path, followed by plain stores.
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vextractf128(imm(0x1), ymm4, xmm12)
-	vextractf128(imm(0x1), ymm6, xmm13)
-	vextractf128(imm(0x1), ymm8, xmm14)
-	vextractf128(imm(0x1), ymm10, xmm15)
-
-	vmovupd(xmm4, mem(rcx))
-	vmovupd(xmm6, mem(rcx, rsi, 1))
-	vmovupd(xmm8, mem(rcx, rsi, 2))
-	vmovupd(xmm10, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vmovsd(xmm12, mem(rdx))
-	vmovsd(xmm13, mem(rdx, rsi, 1))
-	vmovsd(xmm14, mem(rdx, rsi, 2))
-	vmovsd(xmm15, mem(rdx, rax, 1))
-
-	//lea(mem(rdx, rsi, 4), rdx)
-
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b), // passed in, but the only load of it above is commented out
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_2x4
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 1*8))         // prefetch c + 3*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-	
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	//lea(mem(rcx, rdi, 4), r14)         // load address of c +  4*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vfmadd231pd(mem(rcx), xmm3, xmm0)
-	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1)
-	vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm2)
-	vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm4)
-	vmovupd(xmm0, mem(rcx))
-	vmovupd(xmm1, mem(rcx, rsi, 1))
-	vmovupd(xmm2, mem(rcx, rsi, 2))
-	vmovupd(xmm4, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-	
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vextractf128(imm(0x1), ymm0, xmm2)
-	vextractf128(imm(0x1), ymm1, xmm4)
-
-	vmovupd(xmm0, mem(rcx))
-	vmovupd(xmm1, mem(rcx, rsi, 1))
-	vmovupd(xmm2, mem(rcx, rsi, 2))
-	vmovupd(xmm4, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_1x4
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 0*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c
-	prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rdx, 0*8))         // prefetch c + 3*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-	
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	//lea(mem(rcx, rdi, 4), r14)         // load address of c +  4*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-
-	vmovlpd(mem(rcx), xmm0, xmm0)
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
-	vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1)
-	vmovhpd(mem(rcx, rax, 1), xmm1, xmm1)
-	vperm2f128(imm(0x20), ymm1, ymm0, ymm0)
-
-	vfmadd213pd(ymm4, ymm3, ymm0)
-
-	vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rcx))
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	vmovlpd(xmm1, mem(rcx, rsi, 2))
-	vmovhpd(xmm1, mem(rcx, rax, 1))
-	
-	//lea(mem(rcx, rsi, 4), rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-	
-
-	vmovupd(ymm4, ymm0)
-
-	vextractf128(imm(1), ymm0, xmm1)
-	vmovlpd(xmm0, mem(rcx))
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	vmovlpd(xmm1, mem(rcx, rsi, 2))
-	vmovhpd(xmm1, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-	
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_6x2
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 1*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(rdx, rdi, 1, 1*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(rdx, rdi, 2, 1*8)) // prefetch c + 5*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	//lea(mem(rcx, rsi, 2), rdx)         //
-	//lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 5*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-	
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-	
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	vfmadd231pd(xmm0, xmm3, xmm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm12)
-	vfmadd231pd(xmm0, xmm3, xmm14)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	vfmadd231pd(xmm0, xmm3, xmm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm12)
-	vfmadd231pd(xmm0, xmm3, xmm14)
-	
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	vfmadd231pd(xmm0, xmm3, xmm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm12)
-	vfmadd231pd(xmm0, xmm3, xmm14)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	vfmadd231pd(xmm0, xmm3, xmm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm12)
-	vfmadd231pd(xmm0, xmm3, xmm14)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	vfmadd231pd(xmm0, xmm3, xmm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	vbroadcastsd(mem(rax, r15, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm12)
-	vfmadd231pd(xmm0, xmm3, xmm14)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
-	vmulpd(xmm0, xmm6, xmm6)
-	vmulpd(xmm0, xmm8, xmm8)
-	vmulpd(xmm0, xmm10, xmm10)
-	vmulpd(xmm0, xmm12, xmm12)
-	vmulpd(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
-
-	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm8)
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm10)
-	vmovupd(xmm10, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm12)
-	vmovupd(xmm12, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm14)
-	vmovupd(xmm14, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-
-	vunpcklpd(xmm6, xmm4, xmm0)
-	vunpckhpd(xmm6, xmm4, xmm1)
-	vunpcklpd(xmm10, xmm8, xmm2)
-	vunpckhpd(xmm10, xmm8, xmm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vunpcklpd(xmm14, xmm12, xmm0)
-	vunpckhpd(xmm14, xmm12, xmm1)
-
-	vfmadd231pd(mem(rdx), xmm3, xmm0)
-	vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-
-	//lea(mem(rdx, rsi, 4), rdx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm10, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vmovupd(xmm12, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm14, mem(rcx))
-	//add(rdi, rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-	
-
-	vunpcklpd(xmm6, xmm4, xmm0)
-	vunpckhpd(xmm6, xmm4, xmm1)
-	vunpcklpd(xmm10, xmm8, xmm2)
-	vunpckhpd(xmm10, xmm8, xmm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vunpcklpd(xmm14, xmm12, xmm0)
-	vunpckhpd(xmm14, xmm12, xmm1)
-
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-
-	//lea(mem(rdx, rsi, 4), rdx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_5x2
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 1*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(rdx, rdi, 1, 1*8)) // prefetch c + 4*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	prefetch(0, mem(rcx, 4*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-
-	
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-	
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	vfmadd231pd(xmm0, xmm3, xmm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm12)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	vfmadd231pd(xmm0, xmm3, xmm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm12)
-	
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	vfmadd231pd(xmm0, xmm3, xmm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm12)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	vfmadd231pd(xmm0, xmm3, xmm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm12)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	vfmadd231pd(xmm0, xmm3, xmm10)
-	
-	vbroadcastsd(mem(rax, r8,  4), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm12)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
-	vmulpd(xmm0, xmm6, xmm6)
-	vmulpd(xmm0, xmm8, xmm8)
-	vmulpd(xmm0, xmm10, xmm10)
-	vmulpd(xmm0, xmm12, xmm12)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
-
-	//lea(mem(rsi, rsi, 2), rax)         // r13 = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm8)
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm10)
-	vmovupd(xmm10, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm12)
-	vmovupd(xmm12, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-
-	vunpcklpd(xmm6, xmm4, xmm0)
-	vunpckhpd(xmm6, xmm4, xmm1)
-	vunpcklpd(xmm10, xmm8, xmm2)
-	vunpckhpd(xmm10, xmm8, xmm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-#if 0
-	vunpcklpd(xmm14, xmm12, xmm0)
-	vunpckhpd(xmm14, xmm12, xmm1)
-
-	vfmadd231pd(mem(rdx), xmm3, xmm0)
-	vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-#else
-	vmovlpd(mem(rdx), xmm0, xmm0)
-	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)
-
-	vfmadd213pd(xmm12, xmm3, xmm0)
-	vmovlpd(xmm0, mem(rdx))
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-#endif
-
-	//lea(mem(rdx, rsi, 4), rdx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm10, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vmovupd(xmm12, mem(rcx))
-	//add(rdi, rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-	
-
-	vunpcklpd(xmm6, xmm4, xmm0)
-	vunpckhpd(xmm6, xmm4, xmm1)
-	vunpcklpd(xmm10, xmm8, xmm2)
-	vunpckhpd(xmm10, xmm8, xmm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-#if 0
-	vunpcklpd(xmm14, xmm12, xmm0)
-	vunpckhpd(xmm14, xmm12, xmm1)
-
-	vmovupd(xmm0, mem(rdx))
-	vmovupd(xmm1, mem(rdx, rsi, 1))
-#else
-	vmovupd(xmm12, xmm0)
-
-	vmovlpd(xmm0, mem(rdx))
-	vmovhpd(xmm0, mem(rdx, rsi, 1))
-#endif
-
-	//lea(mem(rdx, rsi, 4), rdx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_4x2
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 1*8))         // prefetch c + 3*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rcx, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-	
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-	
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	vfmadd231pd(xmm0, xmm3, xmm10)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	vfmadd231pd(xmm0, xmm3, xmm10)
-	
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	vfmadd231pd(xmm0, xmm3, xmm10)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	vfmadd231pd(xmm0, xmm3, xmm10)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vbroadcastsd(mem(rax, r13, 1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	vfmadd231pd(xmm0, xmm3, xmm10)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
-	vmulpd(xmm0, xmm6, xmm6)
-	vmulpd(xmm0, xmm8, xmm8)
-	vmulpd(xmm0, xmm10, xmm10)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	//lea(mem(rcx, rdi, 4), r14)         // load address of c +  4*rs_c;
-
-	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm8)
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm10)
-	vmovupd(xmm10, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-
-	vunpcklpd(xmm6, xmm4, xmm0)
-	vunpckhpd(xmm6, xmm4, xmm1)
-	vunpcklpd(xmm10, xmm8, xmm2)
-	vunpckhpd(xmm10, xmm8, xmm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm10, mem(rcx))
-	//add(rdi, rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-	
-	
-	vunpcklpd(xmm6, xmm4, xmm0)
-	vunpckhpd(xmm6, xmm4, xmm1)
-	vunpcklpd(xmm10, xmm8, xmm2)
-	vunpckhpd(xmm10, xmm8, xmm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm6, mem(rcx, rsi, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-	
-
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_3x2
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	//lea(mem(rcx, rsi, 2), rdx)         //
-	//lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 2*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-	
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-	
-	vmovupd(mem(rbx, 0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx, 0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx, 0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx, 0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx, 0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm8)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
-	vmulpd(xmm0, xmm6, xmm6)
-	vmulpd(xmm0, xmm8, xmm8)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
-
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-
-
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-
-
-	vfmadd231pd(mem(rcx), xmm3, xmm8)
-	vmovupd(xmm8, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-
-
-	label(.DCOLSTORED)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	//vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	//vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vextractf128(imm(0x1), ymm4, xmm12)
-	vextractf128(imm(0x1), ymm6, xmm13)
-	//vextractf128(imm(0x1), ymm8, xmm14)
-	//vextractf128(imm(0x1), ymm10, xmm15)
-
-	vbroadcastsd(mem(rbx), ymm3)
-
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm6)
-	//vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm8)
-	//vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm10)
-	vmovupd(xmm4, mem(rcx))
-	vmovupd(xmm6, mem(rcx, rsi, 1))
-	//vmovupd(xmm8, mem(rcx, rsi, 2))
-	//vmovupd(xmm10, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vfmadd231sd(mem(rdx), xmm3, xmm12)
-	vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)
-	//vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14)
-	//vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15)
-	vmovsd(xmm12, mem(rdx))
-	vmovsd(xmm13, mem(rdx, rsi, 1))
-	//vmovsd(xmm14, mem(rdx, rsi, 2))
-	//vmovsd(xmm15, mem(rdx, rax, 1))
-	
-	//lea(mem(rdx, rsi, 4), rdx)
-
-
-	jmp(.DDONE)                        // jump to end.
-
-
-	
-	
-	label(.DBETAZERO)
-	
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-
-	vmovupd(xmm8, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-
-
-	vunpcklpd(ymm6, ymm4, ymm0)
-	vunpckhpd(ymm6, ymm4, ymm1)
-	vunpcklpd(ymm10, ymm8, ymm2)
-	vunpckhpd(ymm10, ymm8, ymm3)
-	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
-	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
-	//vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
-	//vperm2f128(imm(0x31), ymm3, ymm1, ymm10)
-
-	vextractf128(imm(0x1), ymm4, xmm12)
-	vextractf128(imm(0x1), ymm6, xmm13)
-	//vextractf128(imm(0x1), ymm8, xmm14)
-	//vextractf128(imm(0x1), ymm10, xmm15)
-
-	vmovupd(xmm4, mem(rcx))
-	vmovupd(xmm6, mem(rcx, rsi, 1))
-	//vmovupd(xmm8, mem(rcx, rsi, 2))
-	//vmovupd(xmm10, mem(rcx, rax, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-	vmovsd(xmm12, mem(rdx))
-	vmovsd(xmm13, mem(rdx, rsi, 1))
-	//vmovsd(xmm14, mem(rdx, rsi, 2))
-	//vmovsd(xmm15, mem(rdx, rax, 1))
-
-	//lea(mem(rdx, rsi, 4), rdx)
-
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_2x2
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	//lea(mem(rcx, rsi, 2), rdx)         //
-	//lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-	
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-	
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-	
-	vbroadcastsd(mem(rax        ), ymm2)
-	vbroadcastsd(mem(rax, r8,  1), ymm3)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	vfmadd231pd(xmm0, xmm3, xmm6)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
-	vmulpd(xmm0, xmm6, xmm6)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	//lea(mem(rcx, rdi, 4), r14)         // load address of c +  4*rs_c;
-
-	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-
-	vunpcklpd(xmm6, xmm4, xmm0)
-	vunpckhpd(xmm6, xmm4, xmm1)
-
-	vfmadd231pd(mem(rcx), xmm3, xmm0)
-	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1)
-	vmovupd(xmm0, mem(rcx))
-	vmovupd(xmm1, mem(rcx, rsi, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	label(.DBETAZERO)
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	//add(rdi, rcx)
-	
-
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORBZ)
-	
-
-	vunpcklpd(xmm6, xmm4, xmm0)
-	vunpckhpd(xmm6, xmm4, xmm1)
-
-	vmovupd(xmm0, mem(rcx))
-	vmovupd(xmm1, mem(rcx, rsi, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rv_haswell_asm_1x2
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(var(rs_b), r10)                // load rs_b
-	//mov(var(cs_b), r11)                // load cs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-#if 0
-	lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-#else
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLPFETCH)                    // jump to column storage case
-	label(.DROWPFETCH)                 // row-stored prefetching on c
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-
-	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
-	label(.DCOLPFETCH)                 // column-stored prefetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	//lea(mem(rcx, rsi, 2), rdx)         //
-	//lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-	prefetch(0, mem(rcx, 0*8))         // prefetch c + 0*cs_c
-	prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c
-
-	label(.DPOSTPFETCH)                // done prefetching c
-#endif
-	
-	
-	
-	
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-	
-	
-	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-	
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm4)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	
-	
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKLEFT)
-	
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-	
-	
-	label(.DLOOPKLEFT)                 // EDGE LOOP
-	
-	vmovupd(mem(rbx,  0*32), xmm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	add(r9, rax)                       // a += cs_a;
-	vfmadd231pd(xmm0, xmm2, xmm4)
-	
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
-	label(.DPOSTACCUM)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
-	//lea(mem(rcx, rdi, 4), r14)         // load address of c +  4*rs_c;
-
-	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORED)                    // jump to column storage case
-
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	jmp(.DDONE)                        // jump to end.
-
-
-
-	label(.DCOLSTORED)
-
-
-	vmovlpd(mem(rcx), xmm0, xmm0)
-	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
-
-	vfmadd213pd(xmm4, xmm3, xmm0)
-
-	vmovlpd(xmm0, mem(rcx))
-	vmovhpd(xmm0, mem(rcx, rsi, 1))
-	
-	//lea(mem(rcx, rsi, 4), rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-
-
-	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	//add(rdi, rcx)
-
-
-	jmp(.DDONE)                        // jump to end.
-	
-
-
-	label(.DCOLSTORBZ)
-	
-
-	vmovlpd(xmm4, mem(rcx))
-	vmovhpd(xmm4, mem(rcx, rsi, 1))
-
-	//lea(mem(rcx, rsi, 4), rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-// -----------------------------------------------------------------------------
-
-// NOTE: Normally, for any "?x1" kernel, we would call the reference kernel.
-// However, at least one other subconfiguration (zen) uses this kernel set, so
-// we need to be able to call a set of "?x1" kernels that we know will actually
-// exist regardless of which subconfiguration these kernels were used by. Thus,
-// the compromise employed here is to inline the reference kernel so it gets
-// compiled as part of the haswell kernel set, and hence can unconditionally be
-// called by other kernels within that kernel set.
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, mdim ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-       conj_t              conja, \
-       conj_t              conjb, \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
-       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict data, \
-       cntx_t*    restrict cntx \
-     ) \
-{ \
-	for ( dim_t i = 0; i < mdim; ++i ) \
-	{ \
-		ctype* restrict ci = &c[ i*rs_c ]; \
-		ctype* restrict ai = &a[ i*rs_a ]; \
-\
-		/* for ( dim_t j = 0; j < 1; ++j ) */ \
-		{ \
-			ctype* restrict cij = ci /*[ j*cs_c ]*/ ; \
-			ctype* restrict bj  = b  /*[ j*cs_b ]*/ ; \
-			ctype           ab; \
-\
-			PASTEMAC(ch,set0s)( ab ); \
-\
-			/* Perform a dot product to update the (i,j) element of c. */ \
-			for ( dim_t l = 0; l < k; ++l ) \
-			{ \
-				ctype* restrict aij = &ai[ l*cs_a ]; \
-				ctype* restrict bij = &bj[ l*rs_b ]; \
-\
-				PASTEMAC(ch,dots)( *aij, *bij, ab ); \
-			} \
-\
-			/* If beta is one, add ab into c. If beta is zero, overwrite c
-			   with the result in ab. Otherwise, scale by beta and accumulate
-			   ab to c. */ \
-			if ( PASTEMAC(ch,eq1)( *beta ) ) \
-			{ \
-				PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
-			} \
-			else if ( PASTEMAC(d,eq0)( *beta ) ) \
-			{ \
-				PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
-			} \
-			else \
-			{ \
-				PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
-			} \
-		} \
-	} \
-}
-
-GENTFUNC( double, d, gemmsup_r_haswell_ref_6x1, 6 )
-GENTFUNC( double, d, gemmsup_r_haswell_ref_5x1, 5 )
-GENTFUNC( double, d, gemmsup_r_haswell_ref_4x1, 4 )
-GENTFUNC( double, d, gemmsup_r_haswell_ref_3x1, 3 )
-GENTFUNC( double, d, gemmsup_r_haswell_ref_2x1, 2 )
-GENTFUNC( double, d, gemmsup_r_haswell_ref_1x1, 1 )
-
diff --git a/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8.c b/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8.c
deleted file mode 100644
index c5addd9cf..000000000
--- a/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8.c
+++ /dev/null
@@ -1,5249 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-/*
-   rrc:
-     --------        ------        | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------   +=   ------ ...    | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------        ------              :
-     --------        ------              :
-
-   Assumptions:
-   - C is row-stored and B is column-stored;
-   - A is row-stored;
-   - m0 and n0 are at most MR and NR, respectively.
-   Therefore, this (r)ow-preferential microkernel is well-suited for
-   a dot-product-based accumulation that performs vector loads from
-   both A and B.
-*/
-
-// Prototype reference microkernels.
-GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
-GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
-GEMMSUP_KER_PROT( scomplex, c, gemmsup_r_haswell_ref )
-GEMMSUP_KER_PROT( dcomplex, z, gemmsup_r_haswell_ref )
-
-// Define parameters and variables for edge case kernel map.
-#define NUM_MR 4
-#define NUM_NR 4
-#define FUNCPTR_T dgemmsup_ker_ft
-
-static dim_t mrs[NUM_MR] = { 6, 3, 2, 1 };
-static dim_t nrs[NUM_NR] = { 8, 4, 2, 1 };
-static FUNCPTR_T kmap[NUM_MR][NUM_NR] =
-{     /*  8                                4                                2                                1  */
-/* 6 */ { bli_dgemmsup_rd_haswell_asm_6x8, bli_dgemmsup_rd_haswell_asm_6x4, bli_dgemmsup_rd_haswell_asm_6x2, bli_dgemmsup_r_haswell_ref },
-/* 3 */ { bli_dgemmsup_rd_haswell_asm_3x8, bli_dgemmsup_rd_haswell_asm_3x4, bli_dgemmsup_rd_haswell_asm_3x2, bli_dgemmsup_r_haswell_ref },
-/* 2 */ { bli_dgemmsup_rd_haswell_asm_2x8, bli_dgemmsup_rd_haswell_asm_2x4, bli_dgemmsup_rd_haswell_asm_2x2, bli_dgemmsup_r_haswell_ref },
-/* 1 */ { bli_dgemmsup_rd_haswell_asm_1x8, bli_dgemmsup_rd_haswell_asm_1x4, bli_dgemmsup_rd_haswell_asm_1x2, bli_dgemmsup_r_haswell_ref }
-};
-
-
-void bli_dgemmsup_rd_haswell_asm_6x8
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	// Use a reference kernel if this is an edge case in the m or n
-	// dimensions.
-	if ( m0 < 6 || n0 < 8 )
-	{
-		dim_t            n_left = n0;
-		double* restrict cj     = c;
-		double* restrict bj     = b;
-
-		// Iterate across columns (corresponding to elements of nrs) until
-		// n_left is zero.
-		for ( dim_t j = 0; n_left != 0; ++j )
-		{
-			const dim_t nr_cur = nrs[ j ];
-
-			// Once we find the value of nrs that is less than (or equal to)
-			// n_left, we use the kernels in that column.
-			if ( nr_cur <= n_left )
-			{
-				dim_t            m_left = m0;
-				double* restrict cij    = cj;
-				double* restrict ai     = a;
-
-				// Iterate down the current column (corresponding to elements
-				// of mrs) until m_left is zero.
-				for ( dim_t i = 0; m_left != 0; ++i )
-				{
-					const dim_t mr_cur = mrs[ i ];
-
-					// Once we find the value of mrs that is less than (or equal
-					// to) m_left, we select that kernel.
-					if ( mr_cur <= m_left )
-					{
-						FUNCPTR_T ker_fp = kmap[i][j];
-
-						//printf( "executing %d x %d sup kernel.\n", (int)mr_cur, (int)nr_cur );
-
-						// Call the kernel using current mrs and nrs values.
-						ker_fp
-						(
-						  conja, conjb, mr_cur, nr_cur, k0,
-						  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-						  beta, cij, rs_c0, cs_c0, data, cntx
-						);
-
-						// Advance C and A pointers by the mrs and nrs we just
-						// used, and decrement m_left.
-						cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-					} 
-				}
-
-				// Advance C and B pointers by the mrs and nrs we just used, and
-				// decrement n_left.
-				cj += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-			}
-		}
-
-		return;
-	}
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r12)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), r14)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r10)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r10 = rcx = c
-	// r12 = rax = a
-	// r14 = rbx = b
-	// r9  = m dim index ii
-	// r15 = n dim index jj
-
-#if 1
-	mov(imm(0), r9)                    // ii = 0;
-
-	label(.DLOOP3X4I)                  // LOOP OVER ii = [ 0 1 ... ]
-
-
-
-	lea(mem(   , r9,  1), rsi)         // rsi = r9 = 3*ii;
-	imul(rdi, rsi)                     // rsi *= rs_c;
-	lea(mem(r10, rsi, 1), rdx)         // rdx = c_jj + 3*ii*rs_c;
-
-	lea(mem(   , r9,  1), rsi)         // rsi = r9 = 3*ii;
-	imul(r8,  rsi)                     // rsi *= rs_a;
-	lea(mem(r12, rsi, 1), r12)         // rax = a + 3*ii*rs_a;
-
-
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-	vzeroall()                         // zero all xmm/ymm registers.
-
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(imm(1*8), rsi)                // rsi *= cs_c = 1*8
-	lea(mem(rdx, rsi, 1), rcx)         // rcx = c + 4*jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(r11, rsi)                     // rsi *= cs_b;
-	lea(mem(r14, rsi, 1), rbx)         // rbx = b + 4*jj*cs_b;
-
-	lea(mem(   , r12, 1), rax)         // rax = a_ii;
-#endif
-
-#if 0
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-#else
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-#endif
-
-	
-
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-#if 1
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-	add(imm(3), r9)                    // ii += 3;
-	cmp(imm(3), r9)                    // compare ii to 3
-	jle(.DLOOP3X4I)                    // if ii <= 3, jump to beginning
-#endif
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_3x8
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r12)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), r14)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r10)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r10 = rcx = c
-	// r12 = rax = a
-	// r14 = rbx = b
-	// r9  = m dim index ii
-	// r15 = n dim index jj
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-
-	vzeroall()                         // zero all xmm/ymm registers.
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(imm(1*8), rsi)                // rsi *= cs_c = 1*8
-	lea(mem(r10, rsi, 1), rcx)         // rcx = c + 4*jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(r11, rsi)                     // rsi *= cs_b;
-	lea(mem(r14, rsi, 1), rbx)         // rbx = b + 4*jj*cs_b;
-
-	lea(mem(   , r12, 1), rax)         // rax = a;
-
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x8
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r12)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), r14)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r10)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r10 = rcx = c
-	// r12 = rax = a
-	// r14 = rbx = b
-	// r9  = m dim index ii
-	// r15 = n dim index jj
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-
-	vzeroall()                         // zero all xmm/ymm registers.
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(imm(1*8), rsi)                // rsi *= cs_c = 1*8
-	lea(mem(r10, rsi, 1), rcx)         // rcx = c + 4*jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(r11, rsi)                     // rsi *= cs_b;
-	lea(mem(r14, rsi, 1), rbx)         // rbx = b + 4*jj*cs_b;
-
-	lea(mem(   , r12, 1), rax)         // rax = a;
-
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x8
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;       // trips of the 16-element (4x unrolled) main loop
-	uint64_t k_left16 = k0 % 16;       // elements left over after the main loop
-	uint64_t k_iter4  = k_left16 / 4;  // trips of the 4-element (ymm) edge loop
-	uint64_t k_left1  = k_left16 % 4;  // trips of the 1-element (scalar) edge loop
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-	// NOTE(review): conja, conjb, m0, n0, data and cntx are used only in the #if 0 call above.
-	begin_asm()
-	// NOTE(review): rs_a, cs_a, rs_b and rs_c are operands but are never loaded below.
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r12)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // r8 *= 8; NOTE(review): r8 is never loaded or read in this kernel
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), r14)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r10)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	// Each pass of the jj loop below produces four adjacent doubles of c from
-	// one row of a and four columns of b (cs_b apart). Register roles:
-	// r10 = rcx = c
-	// r12 = rax = a
-	// r14 = rbx = b
-	// r9  = m dim index ii (not used by this kernel)
-	// r15 = n dim index jj
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 4 ]
-
-
-
-	vzeroall()                         // zero all xmm/ymm registers.
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = jj (jj advances in steps of 4);
-	imul(imm(1*8), rsi)                // rsi *= 1*8 (cs_c is hard-coded as one double)
-	lea(mem(r10, rsi, 1), rcx)         // rcx = c + jj*8;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = jj;
-	imul(r11, rsi)                     // rsi *= cs_b;
-	lea(mem(r14, rsi, 1), rbx)         // rbx = b + jj*cs_b;
-
-	lea(mem(   , r12, 1), rax)         // rax = a;
-
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)     // ymm0 = next 4 doubles of the a row
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)    // ymm3 = 4 doubles of b column jj+0
-	vfmadd231pd(ymm0, ymm3, ymm4)      // ymm4 += ymm0 * ymm3
-
-	vmovupd(mem(rbx, r11, 1), ymm3)    // ymm3 = 4 doubles of b column jj+1
-	vfmadd231pd(ymm0, ymm3, ymm7)      // ymm7 += ymm0 * ymm3
-
-	vmovupd(mem(rbx, r11, 2), ymm3)    // ymm3 = 4 doubles of b column jj+2
-	vfmadd231pd(ymm0, ymm3, ymm10)     // ymm10 += ymm0 * ymm3
-
-	vmovupd(mem(rbx, r13, 1), ymm3)    // ymm3 = 4 doubles of b column jj+3
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)     // ymm13 += ymm0 * ymm3
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: The FMAs must use ymm registers here
-	                                   // because using the xmm forms would zero out
-	                                   // the high bits of the destination registers,
-	                                   // which would destroy intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)      // load one double of a
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)     // load one double of b column jj+0
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	                                   // accumulators, one per column jj+0 .. jj+3:
-                                       // ymm4  ymm7  ymm10 ymm13  
-	                                   // reduce each accumulator to a single sum:
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // ymm4 = [ xmm0 | xmm2 ]
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
-	mov(var(cs_c), rsi)                // load cs_c; NOTE(review): rsi is not read again before it is overwritten
-	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	                                   // NOTE(review): an unordered compare (NaN beta) also sets ZF.
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)  // ymm4 += beta * (4 doubles at rcx)
-	vmovupd(ymm4, mem(rcx))            // store 4 contiguous doubles of c
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))            // store without reading c
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_6x4
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;       // trips of the 16-element (4x unrolled) main loop
-	uint64_t k_left16 = k0 % 16;       // elements left over after the main loop
-	uint64_t k_iter4  = k_left16 / 4;  // trips of the 4-element (ymm) edge loop
-	uint64_t k_left1  = k_left16 % 4;  // trips of the 1-element (scalar) edge loop
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-	// NOTE(review): conja, conjb, m0, n0, data and cntx are used only in the #if 0 call above.
-	begin_asm()
-	// NOTE(review): cs_a, rs_b and cs_c are operands but are never loaded below.
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r12)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), r14)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r10)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	// Each pass of the ii loop below produces a 3x4 block of c from three rows
-	// of a (rs_a apart) and four columns of b (cs_b apart). Register roles:
-	// r10 = rcx = c
-	// r12 = rax = a
-	// r14 = rbx = b
-	// r9  = m dim index ii
-	// r15 = n dim index jj (not used by this kernel)
-
-	mov(imm(0), r9)                    // ii = 0;
-
-	label(.DLOOP3X4I)                  // LOOP OVER ii = [ 0 3 ]
-
-
-
-	vzeroall()                         // zero all xmm/ymm registers.
-
-	lea(mem(   , r9,  1), rsi)         // rsi = r9 = ii (ii advances in steps of 3);
-	imul(rdi, rsi)                     // rsi *= rs_c;
-	lea(mem(r10, rsi, 1), rcx)         // rcx = c + ii*rs_c;
-
-	lea(mem(   , r9,  1), rsi)         // rsi = r9 = ii;
-	imul(r8,  rsi)                     // rsi *= rs_a;
-	lea(mem(r12, rsi, 1), rax)         // rax = a + ii*rs_a;
-
-	lea(mem(   , r14, 1), rbx)         // rbx = b;
-
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)     // ymm0 = next 4 doubles of a row ii+0
-	vmovupd(mem(rax, r8, 1), ymm1)     // ymm1 = next 4 doubles of a row ii+1
-	vmovupd(mem(rax, r8, 2), ymm2)     // ymm2 = next 4 doubles of a row ii+2
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)    // ymm3 = 4 doubles of b column 0
-	vfmadd231pd(ymm0, ymm3, ymm4)      // ymm4 += ymm0 * ymm3
-	vfmadd231pd(ymm1, ymm3, ymm5)      // ymm5 += ymm1 * ymm3
-	vfmadd231pd(ymm2, ymm3, ymm6)      // ymm6 += ymm2 * ymm3
-
-	vmovupd(mem(rbx, r11, 1), ymm3)    // ymm3 = 4 doubles of b column 1
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)    // ymm3 = 4 doubles of b column 2
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)    // ymm3 = 4 doubles of b column 3
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: The FMAs must use ymm registers here
-	                                   // because using the xmm forms would zero out
-	                                   // the high bits of the destination registers,
-	                                   // which would destroy intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)      // load one double from each of the 3 a rows
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)     // load one double of b column 0
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	                                   // accumulators (rows ii+0..ii+2 by columns 0..3):
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15
-	                                   // reduce each accumulator to a single sum:
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // ymm4 = [ xmm0 | xmm2 ]
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm5); xmm0[1] = sum(ymm8)
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm11); xmm2[1] = sum(ymm14)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // ymm5 = [ xmm0 | xmm2 ]
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm6); xmm0[1] = sum(ymm9)
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm12); xmm2[1] = sum(ymm15)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm6 = [ xmm0 | xmm2 ]
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	                                   // NOTE(review): an unordered compare (NaN beta) also sets ZF.
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)  // ymm4 += beta * (4 doubles at rcx)
-	vmovupd(ymm4, mem(rcx))            // store 4 contiguous doubles of c row ii+0
-	add(rdi, rcx)                      // rcx += rs_c
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))            // store c row ii+1
-	add(rdi, rcx)                      // rcx += rs_c
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))            // store c row ii+2
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))            // store c row ii+0 without reading c
-	add(rdi, rcx)                      // rcx += rs_c
-	
-	vmovupd(ymm5, mem(rcx))            // store c row ii+1
-	add(rdi, rcx)                      // rcx += rs_c
-	
-	vmovupd(ymm6, mem(rcx))            // store c row ii+2
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(3), r9)                    // ii += 3;
-	cmp(imm(3), r9)                    // compare ii to 3
-	jle(.DLOOP3X4I)                    // if ii <= 3, jump to beginning
-	                                   // of ii loop; otherwise, loop ends.
-
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_3x4
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x4
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x4
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | | | |
-     --------        -- -- -- ...    | | | |
-     --------   +=   -- -- --        | | | |
-     --------                        | | | |
-     --------                           :
-     --------                           :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-                                       // ymm4  ymm7  ymm10 ymm13  
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_6x2
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | |
-     --------        -- -- -- ...    | |
-     --------   +=   -- -- --        | |
-     --------        -- -- --        | |
-     --------        -- -- --         :
-     --------        -- -- --         :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	lea(mem(rcx, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovsd(mem(rax, r8,  2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovsd(mem(rax, r13, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rax, r8,  4), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovsd(mem(rax, r15, 1), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	                                   // ymm8  ymm9
-	                                   // ymm10 ymm11
-	                                   // ymm12 ymm13
-	                                   // ymm14 ymm15
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	vhaddpd( ymm9, ymm8, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm8 )
-
-	vhaddpd( ymm11, ymm10, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm10 )
-
-	vhaddpd( ymm13, ymm12, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm12 )
-
-	vhaddpd( ymm15, ymm14, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm14 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-	                                   // xmm8  = sum(ymm8)  sum(ymm9)
-	                                   // xmm10 = sum(ymm10) sum(ymm11)
-	                                   // xmm12 = sum(ymm12) sum(ymm13)
-	                                   // xmm14 = sum(ymm14) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	vmulpd(xmm0, xmm8,  xmm8)
-	vmulpd(xmm0, xmm10, xmm10)
-	vmulpd(xmm0, xmm12, xmm12)
-	vmulpd(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm8)
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm10)
-	vmovupd(xmm10, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm12)
-	vmovupd(xmm12, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm14)
-	vmovupd(xmm14, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm10, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm12, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm14, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_3x2
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | |
-     --------        -- -- -- ...    | |
-     --------   +=   -- -- --        | |
-     --------        -- -- --        | |
-     --------        -- -- --         :
-     --------        -- -- --         :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovsd(mem(rax, r8,  2), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	                                   // ymm8  ymm9
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	vhaddpd( ymm9, ymm8, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm8 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-	                                   // xmm8  = sum(ymm8)  sum(ymm9)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	vmulpd(xmm0, xmm8,  xmm8)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm8)
-	vmovupd(xmm8, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm8, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x2
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | |
-     --------        -- -- -- ...    | |
-     --------   +=   -- -- --        | |
-     --------        -- -- --        | |
-     --------        -- -- --         :
-     --------        -- -- --         :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x2
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | |
-     --------        -- -- -- ...    | |
-     --------   +=   -- -- --        | |
-     --------        -- -- --        | |
-     --------        -- -- --         :
-     --------        -- -- --         :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
diff --git a/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c
deleted file mode 100644
index 55ae6d0f9..000000000
--- a/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c
+++ /dev/null
@@ -1,5543 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-/*
-   rrc:
-     --------        ------        | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------   +=   ------ ...    | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------        ------              :
-     --------        ------              :
-
-   Assumptions:
-   - C is row-stored and B is column-stored;
-   - A is row-stored;
-   - m0 and n0 are at most MR and NR, respectively.
-   Therefore, this (r)ow-preferential microkernel is well-suited for
-   a dot-product-based accumulation that performs vector loads from
-   both A and B.
-*/
-
-// Prototype reference microkernels.
-GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
-GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
-GEMMSUP_KER_PROT( scomplex, c, gemmsup_r_haswell_ref )
-GEMMSUP_KER_PROT( dcomplex, z, gemmsup_r_haswell_ref )
-
-// Define parameters and variables for edge case kernel map.
-#define NUM_MR 4
-#define NUM_NR 4
-#define FUNCPTR_T dgemmsup_ker_ft
-
-#if 0
-static dim_t mrs[NUM_MR] = { 6, 3, 2, 1 };
-static dim_t nrs[NUM_NR] = { 8, 4, 2, 1 };
-static FUNCPTR_T kmap[NUM_MR][NUM_NR] =
-{     /*  8                                4                                2                                1  */
-/* 6 */ { bli_dgemmsup_rd_haswell_asm_6x8m, bli_dgemmsup_rd_haswell_asm_6x4m, bli_dgemmsup_rd_haswell_asm_6x2m, bli_dgemmsup_r_haswell_ref },
-/* 3 */ { bli_dgemmsup_rd_haswell_asm_3x8m, bli_dgemmsup_rd_haswell_asm_3x4m, bli_dgemmsup_rd_haswell_asm_3x2m, bli_dgemmsup_r_haswell_ref },
-/* 2 */ { bli_dgemmsup_rd_haswell_asm_2x8m, bli_dgemmsup_rd_haswell_asm_2x4m, bli_dgemmsup_rd_haswell_asm_2x2m, bli_dgemmsup_r_haswell_ref },
-/* 1 */ { bli_dgemmsup_rd_haswell_asm_1x8m, bli_dgemmsup_rd_haswell_asm_1x4m, bli_dgemmsup_rd_haswell_asm_1x2m, bli_dgemmsup_r_haswell_ref }
-};
-#endif
-
-
-void bli_dgemmsup_rd_haswell_asm_6x8m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	uint64_t n_left = n0 % 8;
-
-	// First check whether this is a edge case in the n dimension. If so,
-	// dispatch other 6x?m kernels, as needed.
-	if ( n_left )
-	{
-		double* restrict cij = c;
-		double* restrict bj  = b;
-		double* restrict ai  = a;
-
-		if ( 4 <= n_left )
-		{
-			const dim_t nr_cur = 4;
-
-			bli_dgemmsup_rd_haswell_asm_6x4m
-			(
-			  conja, conjb, m0, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-		}
-		if ( 2 <= n_left )
-		{
-			const dim_t nr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_6x2m
-			(
-			  conja, conjb, m0, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-		}
-		if ( 1 == n_left )
-		{
-#if 0
-			const dim_t nr_cur = 1;
-
-			bli_dgemmsup_r_haswell_ref
-			(
-			  conja, conjb, m0, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-#else
-			bli_dgemv_ex
-			(
-			  BLIS_NO_TRANSPOSE, conjb, m0, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-			  beta, cij, rs_c0, cntx, NULL
-			);
-#endif
-		}
-		return;
-	}
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t m_iter = m0 / 3;
-	uint64_t m_left = m0 % 3;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( m_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	//mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
-
-	//mov(var(c), r12)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = m dim index ii
-	// r15 = n dim index jj
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-
-	mov(var(a), r14)                   // load address of a
-	mov(var(c), r12)                   // load address of c
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(imm(1*8), rsi)                // rsi *= cs_c = 1*8
-	lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(r11, rsi)                     // rsi *= cs_b;
-	lea(mem(rdx, rsi, 1), rdx)         // rbx = b + 4*jj*cs_b;
-
-
-
-	mov(var(m_iter), r9)               // ii = m_iter;
-
-	label(.DLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
-	lea(mem(r14), rax)                 // rax = a_ii;
-	lea(mem(rdx), rbx)                 // rbx = b_jj;
-
-
-#if 0
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-#endif
-	lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
-
-	
-
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-#if 1
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-#if 1
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-#if 1
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	lea(mem(r12, rdi, 2), r12)         //
-	lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-	lea(mem(r14, r8,  2), r14)         //
-	lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-	dec(r9)                            // ii -= 1;
-	jne(.DLOOP3X4I)                    // iterate again if ii != 0.
-
-
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( m_left )
-	{
-		const dim_t      nr_cur = 8;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
-
-		if ( 2 == m_left )
-		{
-			const dim_t mr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_2x8m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			//cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 1 == m_left )
-		{
-			const dim_t mr_cur = 1;
-
-			bli_dgemmsup_rd_haswell_asm_1x8m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_3x8m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = unused
-	// r15 = n dim index jj
-	// r10 = unused
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(imm(1*8), rsi)                // rsi *= cs_c = 1*8
-	lea(mem(r12, rsi, 1), rcx)         // rcx = c + 4*jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(r11, rsi)                     // rsi *= cs_b;
-	lea(mem(rdx, rsi, 1), rbx)         // rbx = b + 4*jj*cs_b;
-
-	lea(mem(r14), rax)                 // rax = a;
-
-
-#if 0
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-#else
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-#endif
-
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x8m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = unused
-	// r15 = n dim index jj
-	// r10 = unused
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-#endif
-
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(imm(1*8), rsi)                // rsi *= cs_c = 1*8
-	lea(mem(r12, rsi, 1), rcx)         // rcx = c + 4*jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(r11, rsi)                     // rsi *= cs_b;
-	lea(mem(rdx, rsi, 1), rbx)         // rbx = b + 4*jj*cs_b;
-
-	lea(mem(r14), rax)                 // rax = a;
-
-
-#if 0
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-#else
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-#endif
-
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x8m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = m dim index ii
-	// r15 = n dim index jj
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm13, ymm13, ymm13)
-#endif
-
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(imm(1*8), rsi)                // rsi *= cs_c = 1*8
-	lea(mem(r12, rsi, 1), rcx)         // rcx = c + 4*jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(r11, rsi)                     // rsi *= cs_b;
-	lea(mem(rdx, rsi, 1), rbx)         // rbx = b + 4*jj*cs_b;
-
-	lea(mem(r14), rax)                 // rax = a;
-
-
-#if 0
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-#else
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-#endif
-
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_6x4m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-	// Summary: for each 3-row panel, c := beta*c + alpha*(3 rows of a dotted with 4 columns of b over k0).
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-	// The asm loop covers m0 / 3 full 3-row panels; the m0 % 3 leftover rows go to the edge kernels.
-	uint64_t m_iter = m0 / 3;
-	uint64_t m_left = m0 % 3;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-	// With fewer than 3 rows there is no full panel, so the asm region is skipped entirely.
-	if ( m_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
-
-	mov(var(c), r12)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = m dim index ii
-	// r15 = not used by this kernel
-	// r10 = 3*rs_a in bytes (prefetch offset)
-
-	mov(var(m_iter), r9)               // ii = m_iter;
-
-	label(.DLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 2 1 ]
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(r12), rcx)                 // rcx = c_ii (current 3-row panel of c)
-	lea(mem(r14), rax)                 // rax = a_ii (current 3-row panel of a)
-	lea(mem(rdx), rbx)                 // rbx = b;
-
-
-#if 0
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-#endif
-	lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
-	                                   // (rdi is reloaded with rs_c after the k loops.)
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	                                   // Each pass consumes 16 k-elements: four unrolled steps of 4 doubles.
-	
-	// ---------------------------------- iteration 0
-
-#if 1
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-#if 1
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destroy intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // Each accumulator holds 4 partial sums of one dot
-	                                   // product; reduce them and pack one row of c per register.
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	                                   // c row := beta * c row + alpha * (a*b) row, 4 contiguous doubles each.
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	                                   // beta == 0: overwrite c without reading it.
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	lea(mem(r12, rdi, 2), r12)         //
-	lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-	lea(mem(r14, r8,  2), r14)         //
-	lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-	dec(r9)                            // ii -= 1;
-	jne(.DLOOP3X4I)                    // iterate again if ii != 0.
-
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( m_left )
-	{
-		const dim_t      nr_cur = 4;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-		// The edge kernels start at row i_edge of a and c; b is passed through unchanged.
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
-
-		if ( 2 == m_left )
-		{
-			const dim_t mr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_2x4m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			//cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 1 == m_left )
-		{
-			const dim_t mr_cur = 1;
-
-			bli_dgemmsup_rd_haswell_asm_1x4m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_3x4m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-	// Summary: one 3x4 tile, c := beta*c + alpha*(3 rows of a dotted with 4 columns of b over k0); m0 and n0 are not read.
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	                                   // Each pass consumes 16 k-elements: four unrolled steps of 4 doubles.
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destroy intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // Each accumulator holds 4 partial sums of one dot
-	                                   // product; reduce them and pack one row of c per register.
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	                                   // c row := beta * c row + alpha * (a*b) row, 4 contiguous doubles each.
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	                                   // beta == 0: overwrite c without reading it.
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x4m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-	/*
-	 * Double-precision 2x4 dot-product kernel. Each of the eight results is
-	 * the sum over p = 0..k0-1 of a[i*rs_a + p] * b[j*cs_b + p], for
-	 * i = 0..1 and j = 0..3. The block is scaled by *alpha and written to
-	 * the rows at c and c + rs_c as four contiguous doubles each. When
-	 * *beta does not compare equal to zero, (*beta) * C is added first;
-	 * otherwise C is stored without being read.
-	 *
-	 * The assembly advances a and b by 8 bytes per k element and stores each
-	 * row of C contiguously, i.e. it behaves as if cs_a, rs_b and cs_c were
-	 * 1. Those three operands are handed to the asm block but are only
-	 * referenced from commented-out instructions. conja, conjb, m0, n0, data
-	 * and cntx are not referenced anywhere in this body.
-	 */
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destroy intermediate results.
-	                                   // The vmovsd loads leave the upper lanes zero, so they add 0.
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // Sum the lanes of each accumulator; pack one row of C per ymm.
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x4m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-	/*
-	 * Double-precision 1x4 dot-product kernel. Each of the four results is
-	 * the sum over p = 0..k0-1 of a[p] * b[j*cs_b + p], for j = 0..3. The
-	 * row is scaled by *alpha and written to c as four contiguous doubles.
-	 * When *beta does not compare equal to zero, (*beta) * C is added first;
-	 * otherwise C is stored without being read.
-	 *
-	 * The assembly advances a and b by 8 bytes per k element and stores the
-	 * row of C contiguously, i.e. it behaves as if cs_a, rs_b and cs_c were
-	 * 1. Of the stride operands only cs_b is read by live instructions;
-	 * rs_a, cs_a, rs_b, rs_c and cs_c are handed to the asm block but are
-	 * only referenced from commented-out instructions. conja, conjb, m0, n0,
-	 * data and cntx are not referenced anywhere in this body.
-	 */
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | | | |
-     --------        -- -- -- ...    | | | |
-     --------   +=   -- -- --        | | | |
-     --------                        | | | |
-     --------                           :
-     --------                           :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm13, ymm13, ymm13)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), rcx)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destroy intermediate results.
-	                                   // The vmovsd loads leave the upper lanes zero, so they add 0.
-	vmovsd(mem(rax       ), xmm0)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // Sum the lanes of each accumulator; pack the row of C into ymm4.
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_6x2m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t m_iter = m0 / 6;
-	uint64_t m_left = m0 % 6;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( m_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-    // r14 = rax = a
-    // rdx = rbx = b
-    // r9  = m dim index ii
-
-	mov(var(m_iter), r9)               // ii = m_iter;
-
-    label(.DLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-
-#if 0
-    vzeroall()                         // zero all xmm/ymm registers.
-#else
-                                       // skylake can execute 3 vxorpd ipc with
-                                       // a latency of 1 cycle, while vzeroall
-                                       // has a latency of 12 cycles.
-    vxorpd(ymm4,  ymm4,  ymm4)
-    vxorpd(ymm5,  ymm5,  ymm5)
-    vxorpd(ymm6,  ymm6,  ymm6)
-    vxorpd(ymm7,  ymm7,  ymm7)
-    vxorpd(ymm8,  ymm8,  ymm8)
-    vxorpd(ymm9,  ymm9,  ymm9)
-    vxorpd(ymm10, ymm10, ymm10)
-    vxorpd(ymm11, ymm11, ymm11)
-    vxorpd(ymm12, ymm12, ymm12)
-    vxorpd(ymm13, ymm13, ymm13)
-    vxorpd(ymm14, ymm14, ymm14)
-    vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(r12), rcx)                 // rcx = c + 6*ii*rs_c;
-    lea(mem(r14), rax)                 // rax = a + 6*ii*rs_a;
-    lea(mem(rdx), rbx)                 // rbx = b;
-
-
-	lea(mem(rcx, rdi, 2), r10)         //
-	lea(mem(r10, rdi, 1), r10)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(r10, 1*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovsd(mem(rax, r8,  2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovsd(mem(rax, r13, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rax, r8,  4), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovsd(mem(rax, r15, 1), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	                                   // ymm8  ymm9
-	                                   // ymm10 ymm11
-	                                   // ymm12 ymm13
-	                                   // ymm14 ymm15
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	vhaddpd( ymm9, ymm8, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm8 )
-
-	vhaddpd( ymm11, ymm10, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm10 )
-
-	vhaddpd( ymm13, ymm12, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm12 )
-
-	vhaddpd( ymm15, ymm14, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm14 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-	                                   // xmm8  = sum(ymm8)  sum(ymm9)
-	                                   // xmm10 = sum(ymm10) sum(ymm11)
-	                                   // xmm12 = sum(ymm12) sum(ymm13)
-	                                   // xmm14 = sum(ymm14) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	vmulpd(xmm0, xmm8,  xmm8)
-	vmulpd(xmm0, xmm10, xmm10)
-	vmulpd(xmm0, xmm12, xmm12)
-	vmulpd(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm8)
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm10)
-	vmovupd(xmm10, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm12)
-	vmovupd(xmm12, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm14)
-	vmovupd(xmm14, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm10, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm12, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm14, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-
-
-
-	lea(mem(r12, rdi, 4), r12)         //
-    lea(mem(r12, rdi, 2), r12)         // c_ii = r12 += 6*rs_c
-
-    lea(mem(r14, r8,  4), r14)         //
-    lea(mem(r14, r8,  2), r14)         // a_ii = r14 += 6*rs_a
-
-    dec(r9)                            // ii -= 1;
-    jne(.DLOOP3X4I)                    // iterate again if ii != 0.
-
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( m_left )
-	{
-		const dim_t      nr_cur = 2;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
-
-		if ( 3 <= m_left )
-		{
-			const dim_t mr_cur = 3;
-
-			bli_dgemmsup_rd_haswell_asm_3x2m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 2 <= m_left )
-		{
-			const dim_t mr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_2x2m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 1 == m_left )
-		{
-			const dim_t mr_cur = 1;
-
-			bli_dgemmsup_rd_haswell_asm_1x2m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_3x2m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovsd(mem(rax, r8,  2), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	                                   // ymm8  ymm9
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	vhaddpd( ymm9, ymm8, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm8 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-	                                   // xmm8  = sum(ymm8)  sum(ymm9)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	vmulpd(xmm0, xmm8,  xmm8)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm8)
-	vmovupd(xmm8, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm8, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x2m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | |
-     --------        -- -- -- ...    | |
-     --------   +=   -- -- --        | |
-     --------        -- -- --        | |
-     --------        -- -- --         :
-     --------        -- -- --         :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x2m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | |
-     --------        -- -- -- ...    | |
-     --------   +=   -- -- --        | |
-     --------        -- -- --        | |
-     --------        -- -- --         :
-     --------        -- -- --         :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
diff --git a/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c.newji b/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c.newji
deleted file mode 100644
index c1cb37214..000000000
--- a/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c.newji
+++ /dev/null
@@ -1,5628 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-/*
-   rrc:
-     --------        ------        | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------   +=   ------ ...    | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------        ------              :
-     --------        ------              :
-
-   Assumptions:
-   - C is row-stored and B is column-stored;
-   - A is row-stored;
-   - m0 and n0 are at most MR and NR, respectively.
-   Therefore, this (r)ow-preferential microkernel is well-suited for
-   a dot-product-based accumulation that performs vector loads from
-   both A and B.
-*/
-
-// Prototype reference microkernels.
-GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
-GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
-GEMMSUP_KER_PROT( scomplex, c, gemmsup_r_haswell_ref )
-GEMMSUP_KER_PROT( dcomplex, z, gemmsup_r_haswell_ref )
-
-// Define parameters and variables for edge case kernel map.
-#define NUM_MR 4
-#define NUM_NR 4
-#define FUNCPTR_T dgemmsup_ker_ft
-
-#if 0
-static dim_t mrs[NUM_MR] = { 6, 3, 2, 1 };
-static dim_t nrs[NUM_NR] = { 8, 4, 2, 1 };
-static FUNCPTR_T kmap[NUM_MR][NUM_NR] =
-{     /*  8                                4                                2                                1  */
-/* 6 */ { bli_dgemmsup_rd_haswell_asm_6x8m, bli_dgemmsup_rd_haswell_asm_6x4m, bli_dgemmsup_rd_haswell_asm_6x2m, bli_dgemmsup_r_haswell_ref },
-/* 3 */ { bli_dgemmsup_rd_haswell_asm_3x8m, bli_dgemmsup_rd_haswell_asm_3x4m, bli_dgemmsup_rd_haswell_asm_3x2m, bli_dgemmsup_r_haswell_ref },
-/* 2 */ { bli_dgemmsup_rd_haswell_asm_2x8m, bli_dgemmsup_rd_haswell_asm_2x4m, bli_dgemmsup_rd_haswell_asm_2x2m, bli_dgemmsup_r_haswell_ref },
-/* 1 */ { bli_dgemmsup_rd_haswell_asm_1x8m, bli_dgemmsup_rd_haswell_asm_1x4m, bli_dgemmsup_rd_haswell_asm_1x2m, bli_dgemmsup_r_haswell_ref }
-};
-#endif
-
-
-void bli_dgemmsup_rd_haswell_asm_6x8m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	uint64_t n_left = n0 % 8;
-
-	// First check whether this is a edge case in the n dimension. If so,
-	// dispatch other 6x?m kernels, as needed.
-	if ( n_left )
-	{
-		double* restrict cij = c;
-		double* restrict bj  = b;
-		double* restrict ai  = a;
-
-		if ( 4 <= n_left )
-		{
-			const dim_t nr_cur = 4;
-
-			bli_dgemmsup_rd_haswell_asm_6x4m
-			(
-			  conja, conjb, m0, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-		}
-		if ( 2 <= n_left )
-		{
-			const dim_t nr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_6x2m
-			(
-			  conja, conjb, m0, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-		}
-		if ( 1 == n_left )
-		{
-#if 0
-			const dim_t nr_cur = 1;
-
-			bli_dgemmsup_r_haswell_ref
-			(
-			  conja, conjb, m0, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-#else
-			bli_dgemv_ex
-			(
-			  BLIS_NO_TRANSPOSE, conjb, m0, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-			  beta, cij, rs_c0, cntx, NULL
-			);
-#endif
-		}
-		return;
-	}
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t m_iter = m0 / 3;
-	uint64_t m_left = m0 % 3;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( m_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	//mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
-
-	//mov(var(c), r12)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = m dim index ii
-	// r15 = n dim index jj
-	// r10 = unused
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-
-	mov(var(a), r14)                   // load address of a
-	mov(var(c), r12)                   // load address of c
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(imm(1*8), rsi)                // rsi *= cs_c = 1*8
-	lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(r11, rsi)                     // rsi *= cs_b;
-	lea(mem(rdx, rsi, 1), rdx)         // rbx = b + 4*jj*cs_b;
-
-
-
-	mov(var(m_iter), r9)               // ii = m_iter;
-
-	label(.DLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
-	lea(mem(r14), rax)                 // rax = a_ii;
-	lea(mem(rdx), rbx)                 // rbx = b_jj;
-
-
-#if 0
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-#endif
-	lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
-
-	
-
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-#if 1
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-#if 1
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-#endif
-
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	lea(mem(r12, rdi, 2), r12)         //
-	lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-	lea(mem(r14, r8,  2), r14)         //
-	lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-	dec(r9)                            // ii -= 1;
-	jne(.DLOOP3X4I)                    // iterate again if ii != 0.
-
-
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( m_left )
-	{
-		const dim_t      nr_cur = 8;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
-
-		if ( 2 == m_left )
-		{
-			const dim_t mr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_2x8m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			//cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 1 == m_left )
-		{
-			const dim_t mr_cur = 1;
-
-			bli_dgemmsup_rd_haswell_asm_1x8m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_3x8m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = unused
-	// r15 = n dim index jj
-	// r10 = unused
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(imm(1*8), rsi)                // rsi *= cs_c = 1*8
-	lea(mem(r12, rsi, 1), rcx)         // rcx = c + 4*jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(r11, rsi)                     // rsi *= cs_b;
-	lea(mem(rdx, rsi, 1), rbx)         // rbx = b + 4*jj*cs_b;
-
-	lea(mem(r14), rax)                 // rax = a;
-
-
-#if 0
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-#else
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-#endif
-
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x8m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = unused
-	// r15 = n dim index jj
-	// r10 = unused
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-#endif
-
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(imm(1*8), rsi)                // rsi *= cs_c = 1*8
-	lea(mem(r12, rsi, 1), rcx)         // rcx = c + 4*jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(r11, rsi)                     // rsi *= cs_b;
-	lea(mem(rdx, rsi, 1), rbx)         // rbx = b + 4*jj*cs_b;
-
-	lea(mem(r14), rax)                 // rax = a;
-
-
-#if 0
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-#else
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-#endif
-
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x8m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = m dim index ii
-	// r15 = n dim index jj
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm13, ymm13, ymm13)
-#endif
-
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(imm(1*8), rsi)                // rsi *= cs_c = 1*8
-	lea(mem(r12, rsi, 1), rcx)         // rcx = c + 4*jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(r11, rsi)                     // rsi *= cs_b;
-	lea(mem(rdx, rsi, 1), rbx)         // rbx = b + 4*jj*cs_b;
-
-	lea(mem(r14), rax)                 // rax = a;
-
-
-#if 0
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-#else
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-#endif
-
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_6x4m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t m_iter = m0 / 3;
-	uint64_t m_left = m0 % 3;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( m_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = m dim index ii
-	// r15 = n dim index jj
-	// r10 = unused
-
-	mov(var(m_iter), r9)               // ii = m_iter;
-
-	label(.DLOOP3X4I)                  // LOOP OVER ii = [ m_iter .. 1 0 ]
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(r12), rcx)                 // rcx = c + 3*ii*rs_c;
-	lea(mem(r14), rax)                 // rax = a + 3*ii*rs_a;
-	lea(mem(rdx), rbx)                 // rbx = b;
-
-
-#if 0
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-#else
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-#endif
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	lea(mem(r12, rdi, 2), r12)         //
-	lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-	lea(mem(r14, r8,  2), r14)         //
-	lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-	dec(r9)                            // ii -= 1;
-	jne(.DLOOP3X4I)                    // iterate again if ii != 0.
-
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( m_left )
-	{
-		const dim_t      nr_cur = 4;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
-
-		if ( 2 == m_left )
-		{
-			const dim_t mr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_2x4m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			//cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 1 == m_left )
-		{
-			const dim_t mr_cur = 1;
-
-			bli_dgemmsup_rd_haswell_asm_1x4m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_3x4m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	// Fixed 3x4 kernel: c(3x4) := beta*c + alpha*a*b, each element of c formed as a k0-length dot product.
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-	// NOTE: only k0 sizes the work; m0/n0, conja/conjb, data and cntx are unread outside the #if 0 block.
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;         // passes through the 4x-unrolled main loop (16 k-values each)
-	uint64_t k_left16 = k0 % 16;         // k-values remaining after the main loop
-	uint64_t k_iter4  = k_left16 / 4;    // passes through the single 4-wide (ymm) edge loop
-	uint64_t k_left1  = k_left16 % 4;    // final k-values, handled one at a time
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;             // listed as an asm operand, but only read by commented-out code
-	uint64_t rs_b   = rs_b0;             // listed as an asm operand, but only read by commented-out code
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;             // listed as an asm operand, but only read by commented-out code
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	// Loads/stores below step by fixed 8-byte increments along k in a and b, and along each row of c.
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)         // clear the 12 accumulators (ymm4-ymm15).
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destroy intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // pack the four row-0 sums into ymm4
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // pack the four row-1 sums into ymm5
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // pack the four row-2 sums into ymm6
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	// NOTE(review): vucomisd also sets ZF on an unordered compare, so a NaN beta would jump to .DBETAZERO too - confirm intended.
-
-	
-	label(.DROWSTORED)                 // beta != 0: c := beta*c + alpha*a*b, one row at a time.
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)  // ymm4 += beta * (current row of c)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)                      // advance to the next row of c (rcx += rs_c bytes)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)                 // beta == 0: store the results without reading c.
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list (a superset of the registers the code above actually writes)
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x4m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-// 1x4 dot-product kernel: c[0:4] := beta*c[0:4] + alpha*(a . b_j), j = 0..3, where a and each b_j hold k0 contiguous doubles.
-void bli_dgemmsup_rd_haswell_asm_1x4m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-	// NOTE: of the dimensions and strides, the asm below reads only k0 (via k_iter16/k_iter4/k_left1) and cs_b.
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | | | |
-     --------        -- -- -- ...    | | | |
-     --------   +=   -- -- --        | | | |
-     --------                        | | | |
-     --------                           :
-     --------                           :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm13, ymm13, ymm13)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), rcx)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4 doubles = 4*8 bytes;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4 doubles = 4*8 bytes;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4 doubles = 4*8 bytes;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4 doubles = 4*8 bytes;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4 doubles = 4*8 bytes;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destroy intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	add(imm(1*8), rax)                 // a += 1 double = 1*8 bytes;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	                                   // ymm4 += beta * c[0:4], then write the four results back to c.
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	                                   // beta == 0: store the alpha-scaled results without reading c.
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-// 6x2 dot-product kernel looped over m: each full panel of six rows of c is updated in asm; the m0 % 6 leftover rows go to the 3x2m/2x2m/1x2m kernels.
-void bli_dgemmsup_rd_haswell_asm_6x2m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	// NOTE: the asm below reads rs_a, cs_b and rs_c only; cs_a, rs_b, cs_c and n0 are not read by it.
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t m_iter = m0 / 6;
-	uint64_t m_left = m0 % 6;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( m_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-    // r14 = rax = a
-    // rdx = rbx = b
-    // r9  = m dim index ii
-
-	mov(var(m_iter), r9)               // ii = m_iter;
-
-    label(.DLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 2 1 ]
-
-
-#if 0
-    vzeroall()                         // zero all xmm/ymm registers.
-#else
-                                       // skylake can execute 3 vxorpd ipc with
-                                       // a latency of 1 cycle, while vzeroall
-                                       // has a latency of 12 cycles.
-    vxorpd(ymm4,  ymm4,  ymm4)
-    vxorpd(ymm5,  ymm5,  ymm5)
-    vxorpd(ymm6,  ymm6,  ymm6)
-    vxorpd(ymm7,  ymm7,  ymm7)
-    vxorpd(ymm8,  ymm8,  ymm8)
-    vxorpd(ymm9,  ymm9,  ymm9)
-    vxorpd(ymm10, ymm10, ymm10)
-    vxorpd(ymm11, ymm11, ymm11)
-    vxorpd(ymm12, ymm12, ymm12)
-    vxorpd(ymm13, ymm13, ymm13)
-    vxorpd(ymm14, ymm14, ymm14)
-    vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(r12), rcx)                 // rcx = c + 6*ii*rs_c;
-    lea(mem(r14), rax)                 // rax = a + 6*ii*rs_a;
-    lea(mem(rdx), rbx)                 // rbx = b;
-
-
-	lea(mem(rcx, rdi, 2), r10)         //
-	lea(mem(r10, rdi, 1), r10)         // r10 = c + 3*rs_c;
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(r10, 3*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(r10, rdi, 2, 3*8)) // prefetch c + 5*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destroy intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovsd(mem(rax, r8,  2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovsd(mem(rax, r13, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rax, r8,  4), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovsd(mem(rax, r15, 1), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	                                   // ymm8  ymm9
-	                                   // ymm10 ymm11
-	                                   // ymm12 ymm13
-	                                   // ymm14 ymm15
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	vhaddpd( ymm9, ymm8, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm8 )
-
-	vhaddpd( ymm11, ymm10, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm10 )
-
-	vhaddpd( ymm13, ymm12, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm12 )
-
-	vhaddpd( ymm15, ymm14, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm14 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-	                                   // xmm8  = sum(ymm8)  sum(ymm9)
-	                                   // xmm10 = sum(ymm10) sum(ymm11)
-	                                   // xmm12 = sum(ymm12) sum(ymm13)
-	                                   // xmm14 = sum(ymm14) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	vmulpd(xmm0, xmm8,  xmm8)
-	vmulpd(xmm0, xmm10, xmm10)
-	vmulpd(xmm0, xmm12, xmm12)
-	vmulpd(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	                                   // per row: xmmN += beta * c[0:2], store, then advance rcx by rs_c.
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm8)
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm10)
-	vmovupd(xmm10, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm12)
-	vmovupd(xmm12, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm14)
-	vmovupd(xmm14, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	                                   // beta == 0: store the alpha-scaled results without reading c.
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm10, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm12, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm14, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-
-
-
-	lea(mem(r12, rdi, 4), r12)         //
-    lea(mem(r12, rdi, 2), r12)         // c_ii = r12 += 6*rs_c
-
-    lea(mem(r14, r8,  4), r14)         //
-    lea(mem(r14, r8,  2), r14)         // a_ii = r14 += 6*rs_a
-
-    dec(r9)                            // ii -= 1;
-    jne(.DLOOP3X4I)                    // iterate again if ii != 0.
-
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-	// Reached directly when m_iter == 0 (m0 < 6), otherwise after the asm loop over full six-row panels.
-	// Handle edge cases in the m dimension, if they exist.
-	if ( m_left )
-	{
-		const dim_t      nr_cur = 2;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
-
-		if ( 3 <= m_left )
-		{
-			const dim_t mr_cur = 3;
-
-			bli_dgemmsup_rd_haswell_asm_3x2m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 2 <= m_left )
-		{
-			const dim_t mr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_2x2m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 1 == m_left )
-		{
-			const dim_t mr_cur = 1;
-
-			bli_dgemmsup_rd_haswell_asm_1x2m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_3x2m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovsd(mem(rax, r8,  2), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	                                   // ymm8  ymm9
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	vhaddpd( ymm9, ymm8, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm8 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-	                                   // xmm8  = sum(ymm8)  sum(ymm9)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	vmulpd(xmm0, xmm8,  xmm8)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm8)
-	vmovupd(xmm8, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm8, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x2m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | |
-     --------        -- -- -- ...    | |
-     --------   +=   -- -- --        | |
-     --------        -- -- --        | |
-     --------        -- -- --         :
-     --------        -- -- --         :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x2m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | |
-     --------        -- -- -- ...    | |
-     --------   +=   -- -- --        | |
-     --------        -- -- --        | |
-     --------        -- -- --         :
-     --------        -- -- --         :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
diff --git a/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c.worksij b/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c.worksij
deleted file mode 100644
index fd1c2ae65..000000000
--- a/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c.worksij
+++ /dev/null
@@ -1,5634 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-/*
-   rrc:
-     --------        ------        | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------   +=   ------ ...    | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------        ------              :
-     --------        ------              :
-
-   Assumptions:
-   - C is row-stored and B is column-stored;
-   - A is row-stored;
-   - m0 and n0 are at most MR and NR, respectively.
-   Therefore, this (r)ow-preferential microkernel is well-suited for
-   a dot-product-based accumulation that performs vector loads from
-   both A and B.
-*/
-
-// Prototype reference microkernels.
-GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
-GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
-GEMMSUP_KER_PROT( scomplex, c, gemmsup_r_haswell_ref )
-GEMMSUP_KER_PROT( dcomplex, z, gemmsup_r_haswell_ref )
-
-// Define parameters and variables for edge case kernel map.
-#define NUM_MR 4
-#define NUM_NR 4
-#define FUNCPTR_T dgemmsup_ker_ft
-
-#if 0
-static dim_t mrs[NUM_MR] = { 6, 3, 2, 1 };
-static dim_t nrs[NUM_NR] = { 8, 4, 2, 1 };
-static FUNCPTR_T kmap[NUM_MR][NUM_NR] =
-{     /*  8                                4                                2                                1  */
-/* 6 */ { bli_dgemmsup_rd_haswell_asm_6x8m, bli_dgemmsup_rd_haswell_asm_6x4m, bli_dgemmsup_rd_haswell_asm_6x2m, bli_dgemmsup_r_haswell_ref },
-/* 3 */ { bli_dgemmsup_rd_haswell_asm_3x8m, bli_dgemmsup_rd_haswell_asm_3x4m, bli_dgemmsup_rd_haswell_asm_3x2m, bli_dgemmsup_r_haswell_ref },
-/* 2 */ { bli_dgemmsup_rd_haswell_asm_2x8m, bli_dgemmsup_rd_haswell_asm_2x4m, bli_dgemmsup_rd_haswell_asm_2x2m, bli_dgemmsup_r_haswell_ref },
-/* 1 */ { bli_dgemmsup_rd_haswell_asm_1x8m, bli_dgemmsup_rd_haswell_asm_1x4m, bli_dgemmsup_rd_haswell_asm_1x2m, bli_dgemmsup_r_haswell_ref }
-};
-#endif
-
-
-void bli_dgemmsup_rd_haswell_asm_6x8m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-	uint64_t n_left = n0 % 8;
-
-	// First check whether this is a edge case in the n dimension. If so,
-	// dispatch other 6x?m kernels, as needed.
-	if ( n_left )
-	{
-		double* restrict cij = c;
-		double* restrict bj  = b;
-		double* restrict ai  = a;
-
-		if ( 4 <= n_left )
-		{
-			const dim_t nr_cur = 4;
-
-			bli_dgemmsup_rd_haswell_asm_6x4m
-			(
-			  conja, conjb, m0, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-		}
-		if ( 2 <= n_left )
-		{
-			const dim_t nr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_6x2m
-			(
-			  conja, conjb, m0, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-		}
-		if ( 1 == n_left )
-		{
-#if 0
-			const dim_t nr_cur = 1;
-
-			bli_dgemmsup_r_haswell_ref
-			(
-			  conja, conjb, m0, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-#else
-			bli_dgemv_ex
-			(
-			  BLIS_NO_TRANSPOSE, conjb, m0, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-			  beta, cij, rs_c0, cntx, NULL
-			);
-#endif
-		}
-		return;
-	}
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t m_iter = m0 / 3;
-	uint64_t m_left = m0 % 3;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( m_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
-
-	mov(var(c), r12)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = m dim index ii
-	// r15 = n dim index jj
-
-	mov(var(m_iter), r9)               // ii = m_iter;
-
-	label(.DLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-
-
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(imm(1*8), rsi)                // rsi *= cs_c = 1*8
-	lea(mem(r12, rsi, 1), rcx)         // rcx = c + 4*jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(r11, rsi)                     // rsi *= cs_b;
-	lea(mem(rdx, rsi, 1), rbx)         // rbx = b + 4*jj*cs_b;
-
-	lea(mem(r14), rax)                 // rax = a_ii;
-
-
-#if 0
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-#endif
-	lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
-
-	
-
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-#if 1
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-#if 1
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	lea(mem(r12, rdi, 2), r12)         //
-	lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-	lea(mem(r14, r8,  2), r14)         //
-	lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-	dec(r9)                            // ii -= 1;
-	jne(.DLOOP3X4I)                    // iterate again if ii != 0.
-
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( m_left )
-	{
-		const dim_t      nr_cur = 8;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
-
-		if ( 2 == m_left )
-		{
-			const dim_t mr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_2x8m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			//cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 1 == m_left )
-		{
-			const dim_t mr_cur = 1;
-
-			bli_dgemmsup_rd_haswell_asm_1x8m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_3x8m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = unused
-	// r15 = n dim index jj
-	// r10 = unused
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(imm(1*8), rsi)                // rsi *= cs_c = 1*8
-	lea(mem(r12, rsi, 1), rcx)         // rcx = c + 4*jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-	imul(r11, rsi)                     // rsi *= cs_b;
-	lea(mem(rdx, rsi, 1), rbx)         // rbx = b + 4*jj*cs_b;
-
-	lea(mem(r14), rax)                 // rax = a;
-
-
-#if 0
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
-#else
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-#endif
-
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x8m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0, // not read: the assembly is hard-wired to 2 rows
-       dim_t               n0, // not read: the assembly is hard-wired to 8 columns
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-	// Purpose: update a 2x8 block of doubles, c = beta*c + alpha*(a*b), using dot products along k; the 8 columns are handled as two 4-column panels (jj = 0, 4). a, b and c rows are stepped in fixed 8-byte units (cs_a, rs_b, cs_c are never loaded).
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16; // trips of the 4x-unrolled ymm loop (16 k values each)
-	uint64_t k_left16 = k0 % 16; // k values remaining after that loop
-	uint64_t k_iter4  = k_left16 / 4; // trips of the single ymm loop (4 k values each)
-	uint64_t k_left1  = k_left16 % 4; // trips of the one-element tail loop
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0; // operand only; its load in the asm is commented out
-	uint64_t rs_b   = rs_b0; // operand only; its load in the asm is commented out
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0; // operand only; its load in the asm is commented out
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = base of c;  rcx = c pointer for the current jj panel
-	// r14 = base of a;  rax = a pointer advanced along k
-	// rdx = base of b;  rbx = b pointer advanced along k
-	// r9  = unused
-	// r15 = n dim index jj
-	// r10 = unused
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 4 ] (2 rows here, despite the label name)
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-#endif
-
-
-	lea(mem(   , r15, 1), rsi)         // rsi = jj (0 or 4);
-	imul(imm(1*8), rsi)                // rsi *= cs_c*sizeof(double), cs_c taken as 1
-	lea(mem(r12, rsi, 1), rcx)         // rcx = c + jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = jj (0 or 4);
-	imul(r11, rsi)                     // rsi *= cs_b (already in bytes);
-	lea(mem(rdx, rsi, 1), rbx)         // rbx = b + jj*cs_b;
-
-	lea(mem(r14), rax)                 // rax = a;
-
-
-#if 0
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
-#else
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-#endif
-
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-	// load 4 consecutive k values from rows 0 and 1 of a
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	// b column jj+0 -> accumulators ymm4 (row 0) / ymm5 (row 1)
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	// b column jj+1 -> accumulators ymm7 / ymm8
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	// b column jj+2 -> accumulators ymm10 / ymm11
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	// b column jj+3 (r13 = 3*cs_b) -> accumulators ymm13 / ymm14
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: The FMAs must use ymm registers here bc
-	                                   // VEX-encoded xmm forms would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destroy intermediate results.
-	                                   // vmovsd loads one double and zeroes the rest of the register, so only lane 0 contributes.
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	// Reduce each accumulator's 4 partial sums to one scalar, then pack the 4 column sums of each row into one ymm.
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	// beta != 0: each row becomes beta*c + alpha*(a*b); one 4-double row of c per load/store.
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	// beta == 0: overwrite c with alpha*(a*b) without reading it.
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x8m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0, // not read: the assembly is hard-wired to 1 row
-       dim_t               n0, // not read: the assembly is hard-wired to 8 columns
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-	// Purpose: update a 1x8 row of doubles, c = beta*c + alpha*(a*b), using dot products along k; the 8 columns are handled as two 4-column panels (jj = 0, 4). Of the six strides only cs_b is loaded by the assembly.
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16; // trips of the 4x-unrolled ymm loop (16 k values each)
-	uint64_t k_left16 = k0 % 16; // k values remaining after that loop
-	uint64_t k_iter4  = k_left16 / 4; // trips of the single ymm loop (4 k values each)
-	uint64_t k_left1  = k_left16 % 4; // trips of the one-element tail loop
-
-	uint64_t rs_a   = rs_a0; // operand only; its load in the asm is commented out
-	uint64_t cs_a   = cs_a0; // operand only; its load in the asm is commented out
-	uint64_t rs_b   = rs_b0; // operand only; its load in the asm is commented out
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0; // operand only; its load in the asm is commented out
-	uint64_t cs_c   = cs_c0; // operand only; its load in the asm is commented out
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = base of c;  rcx = c pointer for the current jj panel
-	// r14 = base of a;  rax = a pointer advanced along k
-	// rdx = base of b;  rbx = b pointer advanced along k
-	// r9  = unused (this kernel has no m loop)
-	// r15 = n dim index jj
-
-	mov(imm(0), r15)                   // jj = 0;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 4 ] (1 row here, despite the label name)
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm13, ymm13, ymm13)
-#endif
-
-
-	lea(mem(   , r15, 1), rsi)         // rsi = jj (0 or 4);
-	imul(imm(1*8), rsi)                // rsi *= cs_c*sizeof(double), cs_c taken as 1
-	lea(mem(r12, rsi, 1), rcx)         // rcx = c + jj*cs_c;
-
-	lea(mem(   , r15, 1), rsi)         // rsi = jj (0 or 4);
-	imul(r11, rsi)                     // rsi *= cs_b (already in bytes);
-	lea(mem(rdx, rsi, 1), rbx)         // rbx = b + jj*cs_b;
-
-	lea(mem(r14), rax)                 // rax = a;
-
-
-#if 0
-	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
-#else
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-#endif
-
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-	// load 4 consecutive k values from the single row of a
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	// b column jj+0 -> accumulator ymm4
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	// b column jj+1 -> accumulator ymm7
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	// b column jj+2 -> accumulator ymm10
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	// b column jj+3 (r13 = 3*cs_b) -> accumulator ymm13
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: The FMAs must use ymm registers here bc
-	                                   // VEX-encoded xmm forms would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destroy intermediate results.
-	                                   // vmovsd loads one double and zeroes the rest of the register, so only lane 0 contributes.
-	vmovsd(mem(rax       ), xmm0)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	// Reduce each accumulator's 4 partial sums to one scalar, then pack the 4 column sums into one ymm.
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	// beta != 0: the row becomes beta*c + alpha*(a*b); 4 doubles of c per load/store.
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	// beta == 0: overwrite c with alpha*(a*b) without reading it.
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4), r15)                   // jj += 4;
-	cmp(imm(4), r15)                   // compare jj to 4
-	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
-	                                   // of jj loop; otherwise, loop ends.
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_6x4m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t m_iter = m0 / 3;
-	uint64_t m_left = m0 % 3;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( m_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
-
-	mov(var(c), r12)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = m dim index ii
-	// r15 = n dim index jj
-	// r10 = unused
-
-	mov(var(m_iter), r9)               // ii = m_iter;
-
-	label(.DLOOP3X4I)                  // LOOP OVER ii = [ m_iter .. 1 0 ]
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(r12), rcx)                 // rcx = c + 3*ii*rs_c;
-	lea(mem(r14), rax)                 // rax = a + 3*ii*rs_a;
-	lea(mem(rdx), rbx)                 // rbx = b;
-
-
-#if 0
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-#endif
-	lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-#if 1
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-#if 1
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	lea(mem(r12, rdi, 2), r12)         //
-	lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-	lea(mem(r14, r8,  2), r14)         //
-	lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-	dec(r9)                            // ii -= 1;
-	jne(.DLOOP3X4I)                    // iterate again if ii != 0.
-
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( m_left )
-	{
-		const dim_t      nr_cur = 4;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
-
-		if ( 2 == m_left )
-		{
-			const dim_t mr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_2x4m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			//cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 1 == m_left )
-		{
-			const dim_t mr_cur = 1;
-
-			bli_dgemmsup_rd_haswell_asm_1x4m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_3x4m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x4m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x4m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-	// Purpose: compute one 1x4 row of C as c[0..3] := beta*c[0..3] + alpha*dot( a, b(:,j) ), j = 0..3, over k0 terms; m0 and n0 are not read by the active code.
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-	// NOTE: of these strides only cs_b is loaded by the asm below; a and b advance by a hard-coded 8 bytes per k step and c is stored as four contiguous doubles.
-/*
-   rrc:
-     --------        -- -- --        | | | |
-     --------        -- -- -- ...    | | | |
-     --------   +=   -- -- --        | | | |
-     --------                        | | | |
-     --------                           :
-     --------                           :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)         // zero the four accumulators, one per
-	vxorpd(ymm7,  ymm7,  ymm7)         // column of b (ymm4, ymm7, ymm10, ymm13).
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm13, ymm13, ymm13)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), rcx)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP (4 unrolled steps of 4 doubles = 16 k values)
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: The FMAs must use ymm registers here
-	                                   // because using the xmm forms would zero out the
-	                                   // high bits of the destination (accumulator)
-	                                   // registers, which would destroy intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // (each holds four partial sums to be reduced)
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)  // ymm4 += beta * c[0..3]
-	vmovupd(ymm4, mem(rcx))            // store the updated row of c.
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))            // beta == 0: store without reading c.
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_6x2m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	// Purpose: compute C := beta*C + alpha*A*B for an m0 x 2 panel of C; the asm loop handles m0/6 six-row blocks (each element a k0-term dot product) and the m0 % 6 leftover rows go to the 3x2m/2x2m/1x2m kernels.
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t m_iter = m0 / 6;
-	uint64_t m_left = m0 % 6;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-	// With no full six-row block, skip the asm entirely; only the edge-case kernels below run.
-	if ( m_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = base of c for the current row block (copied to rcx each iteration)
-    // r14 = base of a for the current row block (copied to rax each iteration)
-    // rdx = base of b (copied to rbx each iteration)
-    // r9  = m dim index ii
-
-	mov(var(m_iter), r9)               // ii = m_iter;
-
-    label(.DLOOP3X4I)                  // LOOP OVER ii, counting down from m_iter to 1
-
-
-#if 0
-    vzeroall()                         // zero all xmm/ymm registers.
-#else
-                                       // skylake can execute 3 vxorpd ipc with
-                                       // a latency of 1 cycle, while vzeroall
-                                       // has a latency of 12 cycles.
-    vxorpd(ymm4,  ymm4,  ymm4)         // zero the twelve accumulators
-    vxorpd(ymm5,  ymm5,  ymm5)         // (6 rows of a x 2 columns of b).
-    vxorpd(ymm6,  ymm6,  ymm6)
-    vxorpd(ymm7,  ymm7,  ymm7)
-    vxorpd(ymm8,  ymm8,  ymm8)
-    vxorpd(ymm9,  ymm9,  ymm9)
-    vxorpd(ymm10, ymm10, ymm10)
-    vxorpd(ymm11, ymm11, ymm11)
-    vxorpd(ymm12, ymm12, ymm12)
-    vxorpd(ymm13, ymm13, ymm13)
-    vxorpd(ymm14, ymm14, ymm14)
-    vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(r12), rcx)                 // rcx = c + 6*ii*rs_c;
-    lea(mem(r14), rax)                 // rax = a + 6*ii*rs_a;
-    lea(mem(rdx), rbx)                 // rbx = b;
-
-
-	lea(mem(rcx, rdi, 2), r10)         //
-	lea(mem(r10, rdi, 1), r10)         // r10 = c + 3*rs_c;
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(r10, 3*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(r10, rdi, 2, 3*8)) // prefetch c + 5*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP (4 unrolled steps of 4 doubles = 16 k values)
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: The FMAs must use ymm registers here
-	                                   // because using the xmm forms would zero out the
-	                                   // high bits of the destination (accumulator)
-	                                   // registers, which would destroy intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovsd(mem(rax, r8,  2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovsd(mem(rax, r13, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rax, r8,  4), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovsd(mem(rax, r15, 1), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	                                   // ymm8  ymm9
-	                                   // ymm10 ymm11
-	                                   // ymm12 ymm13
-	                                   // ymm14 ymm15
-	                                   // (one row pair per row of the 6x2 block of c)
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	vhaddpd( ymm9, ymm8, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm8 )
-
-	vhaddpd( ymm11, ymm10, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm10 )
-
-	vhaddpd( ymm13, ymm12, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm12 )
-
-	vhaddpd( ymm15, ymm14, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm14 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-	                                   // xmm8  = sum(ymm8)  sum(ymm9)
-	                                   // xmm10 = sum(ymm10) sum(ymm11)
-	                                   // xmm12 = sum(ymm12) sum(ymm13)
-	                                   // xmm14 = sum(ymm14) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	vmulpd(xmm0, xmm8,  xmm8)
-	vmulpd(xmm0, xmm10, xmm10)
-	vmulpd(xmm0, xmm12, xmm12)
-	vmulpd(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)  // xmm4 += beta * c[0..1] for this row
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)                      // advance to the next row of c.
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm8)
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm10)
-	vmovupd(xmm10, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm12)
-	vmovupd(xmm12, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm14)
-	vmovupd(xmm14, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))            // beta == 0: store without reading c.
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm10, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm12, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm14, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-
-
-
-	lea(mem(r12, rdi, 4), r12)         //
-    lea(mem(r12, rdi, 2), r12)         // c_ii = r12 += 6*rs_c
-
-    lea(mem(r14, r8,  4), r14)         //
-    lea(mem(r14, r8,  2), r14)         // a_ii = r14 += 6*rs_a
-
-    dec(r9)                            // ii -= 1;
-    jne(.DLOOP3X4I)                    // iterate again if ii != 0.
-
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( m_left )
-	{
-		const dim_t      nr_cur = 2;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-		// Point at the first row not covered by the six-row asm loop.
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
-		// The leftover 1..5 rows are peeled in chunks of 3, then 2, then 1.
-		if ( 3 <= m_left )
-		{
-			const dim_t mr_cur = 3;
-
-			bli_dgemmsup_rd_haswell_asm_3x2m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 2 <= m_left )
-		{
-			const dim_t mr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_2x2m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 1 == m_left )
-		{
-			const dim_t mr_cur = 1;
-
-			bli_dgemmsup_rd_haswell_asm_1x2m
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_3x2m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovsd(mem(rax, r8,  2), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	                                   // ymm8  ymm9
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	vhaddpd( ymm9, ymm8, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm8 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-	                                   // xmm8  = sum(ymm8)  sum(ymm9)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	vmulpd(xmm0, xmm8,  xmm8)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm8)
-	vmovupd(xmm8, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm8, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x2m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | |
-     --------        -- -- -- ...    | |
-     --------   +=   -- -- --        | |
-     --------        -- -- --        | |
-     --------        -- -- --         :
-     --------        -- -- --         :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x2m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | |
-     --------        -- -- -- ...    | |
-     --------   +=   -- -- --        | |
-     --------        -- -- --        | |
-     --------        -- -- --         :
-     --------        -- -- --         :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
diff --git a/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8n.c
deleted file mode 100644
index a23764f8d..000000000
--- a/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8n.c
+++ /dev/null
@@ -1,5836 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-/*
-   rrc:
-     --------        ------        | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------   +=   ------ ...    | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------        ------              :
-     --------        ------              :
-
-   Assumptions:
-   - C is row-stored and B is column-stored;
-   - A is row-stored;
-   - m0 and n0 are at most MR and NR, respectively.
-   Therefore, this (r)ow-preferential microkernel is well-suited for
-   a dot-product-based accumulation that performs vector loads from
-   both A and B.
-*/
-
-// Prototype reference microkernels.
-GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
-GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
-GEMMSUP_KER_PROT( scomplex, c, gemmsup_r_haswell_ref )
-GEMMSUP_KER_PROT( dcomplex, z, gemmsup_r_haswell_ref )
-
-// Define parameters and variables for edge case kernel map.
-#define NUM_MR 4
-#define NUM_NR 4
-#define FUNCPTR_T dgemmsup_ker_ft
-
-#if 0
-static dim_t mrs[NUM_MR] = { 6, 3, 2, 1 };
-static dim_t nrs[NUM_NR] = { 8, 4, 2, 1 };
-static FUNCPTR_T kmap[NUM_MR][NUM_NR] =
-{     /*  8                                4                                2                                1  */
-/* 6 */ { bli_dgemmsup_rd_haswell_asm_6x8n, bli_dgemmsup_rd_haswell_asm_6x4n, bli_dgemmsup_rd_haswell_asm_6x2n, bli_dgemmsup_r_haswell_ref },
-/* 3 */ { bli_dgemmsup_rd_haswell_asm_3x8n, bli_dgemmsup_rd_haswell_asm_3x4n, bli_dgemmsup_rd_haswell_asm_3x2n, bli_dgemmsup_r_haswell_ref },
-/* 2 */ { bli_dgemmsup_rd_haswell_asm_2x8n, bli_dgemmsup_rd_haswell_asm_2x4n, bli_dgemmsup_rd_haswell_asm_2x2n, bli_dgemmsup_r_haswell_ref },
-/* 1 */ { bli_dgemmsup_rd_haswell_asm_1x8n, bli_dgemmsup_rd_haswell_asm_1x4n, bli_dgemmsup_rd_haswell_asm_1x2n, bli_dgemmsup_r_haswell_ref }
-};
-#endif
-
-
-void bli_dgemmsup_rd_haswell_asm_6x8n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	); return;
-#endif
-	uint64_t m_left = m0 % 6;
-
-	// First check whether this is a edge case in the n dimension. If so,
-	// dispatch other ?x8m kernels, as needed.
-	if ( m_left )
-	{
-		double* restrict cij = c;
-		double* restrict bj  = b;
-		double* restrict ai  = a;
-
-		if ( 3 <= m_left )
-		{
-			const dim_t mr_cur = 3;
-
-			bli_dgemmsup_rd_haswell_asm_3x8n
-			(
-			  conja, conjb, mr_cur, n0, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 2 <= m_left )
-		{
-			const dim_t mr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_2x8n
-			(
-			  conja, conjb, mr_cur, n0, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 1 == m_left )
-		{
-#if 0
-			const dim_t mr_cur = 1;
-
-			//bli_dgemmsup_r_haswell_ref
-			bli_dgemmsup_rd_haswell_asm_1x8n
-			(
-			  conja, conjb, mr_cur, n0, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-#else
-			bli_dgemv_ex
-			(
-			  BLIS_TRANSPOSE, conja, k0, n0,
-			  alpha, bj, rs_b0, cs_b0, ai, cs_a0,
-			  beta, cij, cs_c0, cntx, NULL
-			);
-#endif
-		}
-		return;
-	}
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t n_iter = n0 / 4;
-	uint64_t n_left = n0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( n_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rdx)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	//mov(var(b), r14)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	//mov(var(c), r12)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// rdx = rax = a
-	// r14 = rbx = b
-	// r9  = m dim index ii
-	// r15 = n dim index jj
-
-	mov(imm(0), r9)                    // ii = 0;
-
-	label(.DLOOP3X4I)                  // LOOP OVER ii = [ 0 1 ... ]
-
-
-
-	mov(var(b), r14)                   // load address of b
-	mov(var(c), r12)                   // load address of c
-
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	lea(mem(   , r9,  1), rsi)         // rsi = r9 = 3*ii;
-	imul(rdi, rsi)                     // rsi *= rs_c
-	lea(mem(r12, rsi, 1), r12)         // r12 = c + 3*ii*rs_c;
-
-	lea(mem(   , r9,  1), rsi)         // rsi = r9 = 3*ii;
-	imul(r8,  rsi)                     // rsi *= rs_a;
-	lea(mem(rdx, rsi, 1), rdx)         // rax = a + 3*ii*rs_a;
-
-
-
-	mov(var(n_iter), r15)              // jj = n_iter;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
-
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
-	lea(mem(rdx), rax)                 // rax = a_ii;
-	lea(mem(r14), rbx)                 // rbx = b_jj;
-
-
-#if 1
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-#endif
-	lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
-	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
-
-	
-
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-#if 0
-	prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
-	prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
-	prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
-	prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
-	add(imm(8*8), r10)                 // r10 += 8*rs_b = 8*8;
-#else
-	prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
-	prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-#if 1
-	prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
-	prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-#if 1
-	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
-	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-#if 1
-	prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
-	prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
-	add(imm(16*8), r10)                 // r10 += 8*rs_b = 8*8;
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4*8), r12)                 // c_jj = r12 += 4*cs_c
-
-	lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
-
-	dec(r15)                           // jj -= 1;
-	jne(.DLOOP3X4J)                    // iterate again if jj != 0.
-
-
-
-	add(imm(3), r9)                    // ii += 3;
-	cmp(imm(3), r9)                    // compare ii to 3
-	jle(.DLOOP3X4I)                    // if ii <= 3, jump to beginning
-	                                   // of ii loop; otherwise, loop ends.
-
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [n_iter] "m" (n_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( n_left )
-	{
-		const dim_t      mr_cur = 6;
-		const dim_t      j_edge = n0 - ( dim_t )n_left;
-
-		double* restrict cij = c + j_edge*cs_c;
-		double* restrict ai  = a;
-		double* restrict bj  = b + j_edge*cs_b;
-
-		if ( 2 <= n_left )
-		{
-			const dim_t nr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_6x2n
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-		}
-		if ( 1 == n_left )
-		{
-#if 0
-			const dim_t nr_cur = 1;
-
-			//bli_dgemmsup_rd_haswell_asm_6x1n
-			bli_dgemmsup_r_haswell_ref
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-#else
-			bli_dgemv_ex
-			(
-			  BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-			  beta, cij, rs_c0, cntx, NULL
-			);
-#endif
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_3x8n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t n_iter = n0 / 4;
-	uint64_t n_left = n0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( n_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rdx)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), r14)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// rdx = rax = a
-	// r14 = rbx = b
-	// r9  = unused
-	// r15 = n dim index jj
-
-	mov(var(n_iter), r15)              // jj = n_iter;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
-
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
-	lea(mem(rdx), rax)                 // rax = a_ii;
-	lea(mem(r14), rbx)                 // rbx = b_jj;
-
-
-#if 1
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-#endif
-	lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
-	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
-
-	
-
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-#if 0
-	prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
-	prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
-	prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
-	prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
-	add(imm(8*8), r10)                 // r10 += 8*rs_b = 8*8;
-#else
-	prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
-	prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-#if 1
-	prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
-	prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-#if 1
-	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
-	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-#if 1
-	prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
-	prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
-	add(imm(16*8), r10)                 // r10 += 8*rs_b = 8*8;
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4*8), r12)                 // c_jj = r12 += 4*cs_c
-
-	lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
-
-	dec(r15)                           // jj -= 1;
-	jne(.DLOOP3X4J)                    // iterate again if jj != 0.
-
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [n_iter] "m" (n_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( n_left )
-	{
-		const dim_t      mr_cur = 3;
-		const dim_t      j_edge = n0 - ( dim_t )n_left;
-
-		double* restrict cij = c + j_edge*cs_c;
-		double* restrict ai  = a;
-		double* restrict bj  = b + j_edge*cs_b;
-
-		if ( 2 <= n_left )
-		{
-			const dim_t nr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_3x2n
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-		}
-		if ( 1 == n_left )
-		{
-#if 0
-			const dim_t nr_cur = 1;
-
-			//bli_dgemmsup_rd_haswell_asm_3x1n
-			bli_dgemmsup_r_haswell_ref
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-#else
-			bli_dgemv_ex
-			(
-			  BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-			  beta, cij, rs_c0, cntx, NULL
-			);
-#endif
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x8n
-     (
-       conj_t              conja,   // not read by the asm body; forwarded to the 2x2n edge-case kernel
-       conj_t              conjb,   // not read by the asm body; forwarded to both edge-case calls
-       dim_t               m0,      // only referenced in the disabled #if 0 path; row count is fixed at 2 (mr_cur)
-       dim_t               n0,      // number of columns; split into n0/4 asm iterations plus n0%4 leftovers
-       dim_t               k0,      // inner (dot-product) dimension; split into 16/4/1-sized chunks
-       double*    restrict alpha,   // pointer to the scalar applied to the a*b product
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,   // asm uses rs_a only and steps a by 8 bytes per k
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,   // asm uses cs_b only and steps b by 8 bytes per k
-       double*    restrict beta,    // pointer to the scalar applied to c (c is not loaded when *beta == 0)
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,   // asm uses rs_c only; 4 results per row stored contiguously
-       auxinfo_t* restrict data,    // forwarded to the 2x2n edge-case kernel only
-       cntx_t*    restrict cntx     // forwarded to the edge-case calls only
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	// Overview: per group of 4 columns of b, form the 2x4 dot products with 2 rows of a over k0, then update c with alpha/beta.
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-	// (a_next/b_next are disabled here and in the asm operand list below.)
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;          // trips through the main loop (4 unrolled steps x 4 doubles)
-	uint64_t k_left16 = k0 % 16;          // k-values left after the main loop
-	uint64_t k_iter4  = k_left16 / 4;     // trips through the 4-wide (ymm) edge loop
-	uint64_t k_left1  = k_left16 % 4;     // trips through the scalar edge loop
-
-	uint64_t n_iter = n0 / 4;             // number of 4-column groups handled by the asm
-	uint64_t n_left = n0 % 4;             // leftover columns handled after the asm (0..3)
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( n_iter == 0 ) goto consider_edge_cases;   // fewer than 4 columns: skip the asm block entirely
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rdx)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), r14)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// rdx = rax = a
-	// r14 = rbx = b
-	// r9  = unused
-	// r15 = n dim index jj
-
-	mov(var(n_iter), r15)              // jj = n_iter;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
-
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-#endif
-	// ymm4/7/10/13 accumulate row 0 of a, ymm5/8/11/14 accumulate row 1 (one register per column of b).
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
-	lea(mem(rdx), rax)                 // rax = a_ii;
-	lea(mem(r14), rbx)                 // rbx = b_jj;
-
-
-#if 1
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-#endif
-	lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b (not read below; r13 is used for 3*cs_b and rdi is reloaded with rs_c after the k loops)
-	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
-
-	
-
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-#if 0
-	prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
-	prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
-	prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
-	prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
-	add(imm(8*8), r10)                 // r10 += 8*rs_b = 8*8;
-#else
-	prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
-	prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4 doubles (4*8 bytes) along k
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4 doubles (4*8 bytes) along k
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	// ---------------------------------- iteration 1
-
-#if 1
-	prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
-	prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4 doubles (4*8 bytes) along k
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4 doubles (4*8 bytes) along k
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 2
-	
-#if 1
-	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
-	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4 doubles (4*8 bytes) along k
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4 doubles (4*8 bytes) along k
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 3
-
-#if 1
-	prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
-	prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
-	add(imm(16*8), r10)                 // r10 += 16 doubles (16*8 bytes), matching the 16 k-values consumed per main-loop trip
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4 doubles (4*8 bytes) along k
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4 doubles (4*8 bytes) along k
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4 doubles (4*8 bytes) along k
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4 doubles (4*8 bytes) along k
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: The fmas below must use ymm registers bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destroy intermediate results.
-	                                   // Only the loads (vmovsd) use xmm registers.
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	add(imm(1*8), rax)                 // a += 1 double (1*8 bytes) along k
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1 double (1*8 bytes) along k
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	                                   // accumulator layout (rows of a x columns of b):
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // (ymm6, ymm9, ymm12, ymm15 are not accumulated in this 2-row kernel)
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm5); xmm0[1] = sum(ymm8)
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm11); xmm2[1] = sum(ymm14)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-
-
-	
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)           // NOTE(review): ymm6 is never zeroed, accumulated or stored in this kernel; presumably a leftover from the 3-row variant
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	                                   // c row = beta * c row + alpha * (a row . b columns), 4 contiguous doubles per row
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	                                   // beta == 0: overwrite c without reading it
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4*8), r12)                 // c_jj = r12 += 4 doubles (4*8 bytes); equals 4*cs_c only if cs_c == 1 -- presumably guaranteed by the caller
-
-	lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
-
-	dec(r15)                           // jj -= 1;
-	jne(.DLOOP3X4J)                    // iterate again if jj != 0.
-
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [n_iter] "m" (n_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the n dimension (the n0 % 4 leftover columns), if they exist.
-	if ( n_left )
-	{
-		const dim_t      mr_cur = 2;                            // this kernel always processes 2 rows
-		const dim_t      j_edge = n0 - ( dim_t )n_left;         // index of the first leftover column
-
-		double* restrict cij = c + j_edge*cs_c;
-		double* restrict ai  = a;
-		double* restrict bj  = b + j_edge*cs_b;
-
-		if ( 2 <= n_left )   // n_left is 2 or 3: hand two columns to the 2x2n kernel
-		{
-			const dim_t nr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_2x2n
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-		}
-		if ( 1 == n_left )   // one column remains (n_left was 1, or 3 before the block above)
-		{
-#if 0
-			const dim_t nr_cur = 1;
-
-			//bli_dgemmsup_rd_haswell_asm_2x1n
-			bli_dgemmsup_r_haswell_ref
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-#else
-			bli_dgemv_ex   // presumably cij := beta*cij + alpha * ai * bj as a matrix-vector product -- confirm against the gemv API
-			(
-			  BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-			  beta, cij, rs_c0, cntx, NULL
-			);
-#endif
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x8n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t n_iter = n0 / 4;
-	uint64_t n_left = n0 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( n_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rdx)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), r14)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), r12)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// rdx = rax = a
-	// r14 = rbx = b
-	// r9  = unused
-	// r15 = n dim index jj
-
-	mov(var(n_iter), r15)              // jj = n_iter;
-
-	label(.DLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
-
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm13, ymm13, ymm13)
-#endif
-
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
-	lea(mem(rdx), rax)                 // rax = a_ii;
-	lea(mem(r14), rbx)                 // rbx = b_jj;
-
-
-#if 1
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-#endif
-	lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
-	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
-
-	
-
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-#if 0
-	prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
-	prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
-	prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
-	prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
-	add(imm(8*8), r10)                 // r10 += 8*rs_b = 8*8;
-#else
-	prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
-	prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	// ---------------------------------- iteration 1
-
-#if 1
-	prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
-	prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 2
-	
-#if 1
-	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
-	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 3
-
-#if 1
-	prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
-	prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
-	add(imm(16*8), r10)                 // r10 += 8*rs_b = 8*8;
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-
-
-	
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	add(imm(4*8), r12)                 // c_jj = r12 += 4*cs_c
-
-	lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
-
-	dec(r15)                           // jj -= 1;
-	jne(.DLOOP3X4J)                    // iterate again if jj != 0.
-
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [n_iter] "m" (n_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( n_left )
-	{
-		const dim_t      mr_cur = 1;
-		const dim_t      j_edge = n0 - ( dim_t )n_left;
-
-		double* restrict cij = c + j_edge*cs_c;
-		double* restrict ai  = a;
-		double* restrict bj  = b + j_edge*cs_b;
-
-		if ( 2 <= n_left )
-		{
-			const dim_t nr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_1x2n
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-		}
-		if ( 1 == n_left )
-		{
-#if 0
-			const dim_t nr_cur = 1;
-
-			//bli_dgemmsup_rd_haswell_asm_1x1n
-			bli_dgemmsup_r_haswell_ref
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-#else
-			bli_ddotxv_ex
-			(
-			  conja, conjb, k0,
-			  alpha, ai, cs_a0, bj, rs_b0,
-			  beta, cij, cntx, NULL
-			);
-#endif
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_6x4n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t m_iter = m0 / 3;
-	uint64_t m_left = m0 % 3;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( m_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rdx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
-
-	mov(var(c), r12)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-	// r12 = rcx = c
-	// r14 = rax = a
-	// rdx = rbx = b
-	// r9  = m dim index ii
-	// r15 = n dim index jj
-	// r10 = unused
-
-	mov(var(m_iter), r9)               // ii = m_iter;
-
-	label(.DLOOP3X4I)                  // LOOP OVER ii = [ m_iter .. 1 0 ]
-
-
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(r12), rcx)                 // rcx = c + 3*ii*rs_c;
-	lea(mem(r14), rax)                 // rax = a + 3*ii*rs_a;
-	lea(mem(rdx), rbx)                 // rbx = b;
-
-
-#if 0
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-#endif
-	lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-#if 1
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-#if 1
-	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-#endif
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-
-	lea(mem(r12, rdi, 2), r12)         //
-	lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-	lea(mem(r14, r8,  2), r14)         //
-	lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-	dec(r9)                            // ii -= 1;
-	jne(.DLOOP3X4I)                    // iterate again if ii != 0.
-
-
-
-
-	label(.DRETURN)
-
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( m_left )
-	{
-		const dim_t      nr_cur = 4;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-		double* restrict cij = c + i_edge*rs_c;
-		double* restrict bj  = b;
-		double* restrict ai  = a + i_edge*rs_a;
-
-		if ( 2 == m_left )
-		{
-			const dim_t mr_cur = 2;
-
-			bli_dgemmsup_rd_haswell_asm_2x4n
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			//cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-		}
-		if ( 1 == m_left )
-		{
-			const dim_t mr_cur = 1;
-
-			bli_dgemmsup_rd_haswell_asm_1x4n
-			(
-			  conja, conjb, mr_cur, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-		}
-	}
-}
-
-void bli_dgemmsup_rd_haswell_asm_3x4n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm12, ymm12, ymm12)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-	vxorpd(ymm15, ymm15, ymm15)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	vmovupd(mem(rax, r8, 2), ymm2)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	vmovsd(mem(rax, r8, 2), xmm2)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-	vfmadd231pd(ymm2, ymm3, ymm6)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-	vfmadd231pd(ymm2, ymm3, ymm9)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-	vfmadd231pd(ymm2, ymm3, ymm12)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-	vfmadd231pd(ymm2, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	                                   // ymm6  ymm9  ymm12 ymm15
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-
-	vhaddpd( ymm9, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm15, ymm12, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm6)
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x4n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm11, ymm11, ymm11)
-	vxorpd(ymm13, ymm13, ymm13)
-	vxorpd(ymm14, ymm14, ymm14)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	vmovupd(mem(rax, r8, 1), ymm1)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	vmovsd(mem(rax, r8, 1), xmm1)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-	vfmadd231pd(ymm1, ymm3, ymm8)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-	vfmadd231pd(ymm1, ymm3, ymm14)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	                                   // ymm5  ymm8  ymm11 ymm14
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-
-	vhaddpd( ymm8, ymm5, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )
-
-	vhaddpd( ymm14, ymm11, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm5)
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm5, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x4n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | | | |
-     --------        -- -- -- ...    | | | |
-     --------   +=   -- -- --        | | | |
-     --------                        | | | |
-     --------                           :
-     --------                           :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm10, ymm10, ymm10)
-	vxorpd(ymm13, ymm13, ymm13)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), rcx)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-	
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rax       ), ymm0)
-	add(imm(4*8), rax)                 // a += 4*cs_b = 4*8;
-	
-	vmovupd(mem(rbx        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovupd(mem(rbx, r11, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovupd(mem(rbx, r11, 2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovupd(mem(rbx, r13, 1), ymm3)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rax       ), xmm0)
-	add(imm(1*8), rax)                 // a += 1*cs_b = 1*8;
-	
-	vmovsd(mem(rbx        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-
-	vmovsd(mem(rbx, r11, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm7)
-
-	vmovsd(mem(rbx, r11, 2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vmovsd(mem(rbx, r13, 1), xmm3)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm13)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	
-	vhaddpd( ymm7, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-	vhaddpd( ymm13, ymm10, ymm2 )
-	vextractf128(imm(1), ymm2, xmm1 )
-	vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
-	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), ymm3, ymm4)
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(ymm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_6x2n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-	//vzeroall()                         // zero all xmm/ymm registers.
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-
-
-#if 0
-    vzeroall()                         // zero all xmm/ymm registers.
-#else
-                                       // skylake can execute 3 vxorpd ipc with
-                                       // a latency of 1 cycle, while vzeroall
-                                       // has a latency of 12 cycles.
-    vxorpd(ymm4,  ymm4,  ymm4)
-    vxorpd(ymm5,  ymm5,  ymm5)
-    vxorpd(ymm6,  ymm6,  ymm6)
-    vxorpd(ymm7,  ymm7,  ymm7)
-    vxorpd(ymm8,  ymm8,  ymm8)
-    vxorpd(ymm9,  ymm9,  ymm9)
-    vxorpd(ymm10, ymm10, ymm10)
-    vxorpd(ymm11, ymm11, ymm11)
-    vxorpd(ymm12, ymm12, ymm12)
-    vxorpd(ymm13, ymm13, ymm13)
-    vxorpd(ymm14, ymm14, ymm14)
-    vxorpd(ymm15, ymm15, ymm15)
-#endif
-
-
-	lea(mem(rcx, rdi, 2), r10)         //
-	lea(mem(r10, rdi, 1), r10)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
-	prefetch(0, mem(r10, 1*8))         // prefetch c + 3*rs_c
-	prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c
-	prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovupd(mem(rax, r13, 1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovupd(mem(rax, r8,  4), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovupd(mem(rax, r15, 1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovsd(mem(rax, r8,  2), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	vmovsd(mem(rax, r13, 1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vmovsd(mem(rax, r8,  4), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm12)
-	vfmadd231pd(ymm1, ymm3, ymm13)
-
-	vmovsd(mem(rax, r15, 1), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	                                   // ymm8  ymm9
-	                                   // ymm10 ymm11
-	                                   // ymm12 ymm13
-	                                   // ymm14 ymm15
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	vhaddpd( ymm9, ymm8, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm8 )
-
-	vhaddpd( ymm11, ymm10, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm10 )
-
-	vhaddpd( ymm13, ymm12, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm12 )
-
-	vhaddpd( ymm15, ymm14, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm14 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-	                                   // xmm8  = sum(ymm8)  sum(ymm9)
-	                                   // xmm10 = sum(ymm10) sum(ymm11)
-	                                   // xmm12 = sum(ymm12) sum(ymm13)
-	                                   // xmm14 = sum(ymm14) sum(ymm15)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	vmulpd(xmm0, xmm8,  xmm8)
-	vmulpd(xmm0, xmm10, xmm10)
-	vmulpd(xmm0, xmm12, xmm12)
-	vmulpd(xmm0, xmm14, xmm14)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm8)
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm10)
-	vmovupd(xmm10, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm12)
-	vmovupd(xmm12, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm14)
-	vmovupd(xmm14, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm10, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm12, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm14, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_3x2n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-	vxorpd(ymm8,  ymm8,  ymm8)
-	vxorpd(ymm9,  ymm9,  ymm9)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovupd(mem(rax, r8,  2), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vmovsd(mem(rax, r8,  2), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm8)
-	vfmadd231pd(ymm1, ymm3, ymm9)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	                                   // ymm8  ymm9
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	vhaddpd( ymm9, ymm8, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm8 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-	                                   // xmm8  = sum(ymm8)  sum(ymm9)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	vmulpd(xmm0, xmm8,  xmm8)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm8)
-	vmovupd(xmm8, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm8, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_2x2n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | |
-     --------        -- -- -- ...    | |
-     --------   +=   -- -- --        | |
-     --------        -- -- --        | |
-     --------        -- -- --         :
-     --------        -- -- --         :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-	vxorpd(ymm6,  ymm6,  ymm6)
-	vxorpd(ymm7,  ymm7,  ymm7)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovupd(mem(rax, r8,  1), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	vmovsd(mem(rax, r8,  1), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	                                   // ymm6  ymm7
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	vhaddpd( ymm7, ymm6, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm6 )
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-	                                   // xmm6  = sum(ymm6)  sum(ymm7)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	vmulpd(xmm0, xmm6,  xmm6)
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm6)
-	vmovupd(xmm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(xmm6, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
-void bli_dgemmsup_rd_haswell_asm_1x2n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-#if 0
-	bli_dgemmsup_r_haswell_ref
-	(
-	  conja, conjb, m0, n0, k0,
-	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	  beta, c, rs_c0, cs_c0, data, cntx
-	);
-	return;
-#endif
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	uint64_t k_iter16 = k0 / 16;
-	uint64_t k_left16 = k0 % 16;
-	uint64_t k_iter4  = k_left16 / 4;
-	uint64_t k_left1  = k_left16 % 4;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-/*
-   rrc:
-     --------        -- -- --        | |
-     --------        -- -- -- ...    | |
-     --------   +=   -- -- --        | |
-     --------        -- -- --        | |
-     --------        -- -- --         :
-     --------        -- -- --         :
-*/
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-	
-#if 0
-	vzeroall()                         // zero all xmm/ymm registers.
-#else
-	                                   // skylake can execute 3 vxorpd ipc with
-	                                   // a latency of 1 cycle, while vzeroall
-	                                   // has a latency of 12 cycles.
-	vxorpd(ymm4,  ymm4,  ymm4)
-	vxorpd(ymm5,  ymm5,  ymm5)
-#endif
-	
-	mov(var(a), rax)                   // load address of a.
-	//mov(var(rs_a), r8)                 // load rs_a
-	//mov(var(cs_a), r9)                 // load cs_a
-	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
-	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(var(rs_b), r10)                // load rs_b
-	mov(var(cs_b), r11)                // load cs_b
-	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
-	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-
-	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-	
-	                                   // initialize loop by pre-loading
-	                                   // a column of a.
-
-	mov(var(c), rcx)                   // load address of c
-	//mov(var(rs_c), rdi)                // load rs_c
-	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
-
-	//lea(mem(rcx, rdi, 2), rdx)         //
-	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
-	
-
-	
-	
-	mov(var(k_iter16), rsi)            // i = k_iter16;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
-	                                   // contains the k_iter4 loop.
-	
-	
-	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-
-	// ---------------------------------- iteration 2
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-
-	// ---------------------------------- iteration 3
-
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
-	label(.DCONSIDKITER4)
-	
-	mov(var(k_iter4), rsi)             // i = k_iter4;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
-	                                   // considers k_left1 loop.
-	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
-	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
-	vmovupd(mem(rbx        ), ymm0)
-	vmovupd(mem(rbx, r11, 1), ymm1)
-	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
-
-	vmovupd(mem(rax        ), ymm3)
-	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
-
-	label(.DCONSIDKLEFT1)
-	
-	mov(var(k_left1), rsi)             // i = k_left1;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left1 loop.
-	
-	
-
-
-	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
-	                                   // NOTE: We must use ymm registers here bc
-	                                   // using the xmm registers would zero out the
-	                                   // high bits of the destination registers,
-	                                   // which would destory intermediate results.
-	
-	vmovsd(mem(rbx        ), xmm0)
-	vmovsd(mem(rbx, r11, 1), xmm1)
-	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
-
-	vmovsd(mem(rax        ), xmm3)
-	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	vfmadd231pd(ymm0, ymm3, ymm4)
-	vfmadd231pd(ymm1, ymm3, ymm5)
-
-	
-	dec(rsi)                           // i -= 1;
-	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
-
-
-
-
-	label(.DPOSTACCUM)
-
-	                                   // ymm4  ymm5
-	
-	vhaddpd( ymm5, ymm4, ymm0 )
-	vextractf128(imm(1), ymm0, xmm1 )
-	vaddpd( xmm0, xmm1, xmm4 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5)
-
-	                                   // xmm4  = sum(ymm4)  sum(ymm5)
-
-
-	
-	mov(var(alpha), rax)               // load address of alpha
-	mov(var(beta), rbx)                // load address of beta
-	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
-	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
-	vmulpd(xmm0, xmm4,  xmm4)          // scale by alpha
-	
-	
-	
-	
-	
-	
-	//mov(var(cs_c), rsi)                // load cs_c
-	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
-	                                   // now avoid loading C if beta == 0
-	
-	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
-
-	
-	label(.DROWSTORED)
-	
-	
-	vfmadd231pd(mem(rcx), xmm3, xmm4)
-	vmovupd(xmm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
-	label(.DBETAZERO)
-	
-
-	
-	label(.DROWSTORBZ)
-	
-	
-	vmovupd(xmm4, mem(rcx))
-	//add(rdi, rcx)
-	
-	
-	
-	
-	label(.DDONE)
-	
-	
-
-    end_asm(
-	: // output operands (none)
-	: // input operands
-      [k_iter16] "m" (k_iter16),
-      [k_iter4] "m" (k_iter4),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-}
-
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c
index 98b557fae..a0653373c 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c
@@ -93,38 +93,38 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t              conja, \
-       conj_t              conjb, \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
-       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx \
+             conj_t     conja, \
+             conj_t     conjb, \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a, inc_t rs_a, inc_t cs_a, \
+       const ctype*     b, inc_t rs_b, inc_t cs_b, \
+       const ctype*     beta, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx \
      ) \
 { \
 	for ( dim_t i = 0; i < mdim; ++i ) \
 	{ \
-		ctype* restrict ci = &c[ i*rs_c ]; \
-		ctype* restrict ai = &a[ i*rs_a ]; \
+		      ctype* ci = &c[ i*rs_c ]; \
+		const ctype* ai = &a[ i*rs_a ]; \
 \
 		/* for ( dim_t j = 0; j < 1; ++j ) */ \
 		{ \
-			ctype* restrict cij = ci /*[ j*cs_c ]*/ ; \
-			ctype* restrict bj  = b  /*[ j*cs_b ]*/ ; \
-			ctype           ab; \
+			      ctype* cij = ci /*[ j*cs_c ]*/ ; \
+			const ctype* bj  = b  /*[ j*cs_b ]*/ ; \
+			ctype        ab; \
 \
 			PASTEMAC(ch,set0s)( ab ); \
 \
 			/* Perform a dot product to update the (i,j) element of c. */ \
 			for ( dim_t l = 0; l < k; ++l ) \
 			{ \
-				ctype* restrict aij = &ai[ l*cs_a ]; \
-				ctype* restrict bij = &bj[ l*rs_b ]; \
+				const ctype* aij = &ai[ l*cs_a ]; \
+				const ctype* bij = &bj[ l*rs_b ]; \
 \
 				PASTEMAC(ch,dots)( *aij, *bij, ab ); \
 			} \
@@ -165,29 +165,29 @@ GENTFUNC( float,  s, gemmsup_r_haswell_ref_1x1, 1 )
 \
 void PASTEMAC(ch,opname) \
      ( \
-       conj_t              conja, \
-       conj_t              conjb, \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
-       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx \
+             conj_t     conja, \
+             conj_t     conjb, \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a, inc_t rs_a, inc_t cs_a, \
+       const ctype*     b, inc_t rs_b, inc_t cs_b, \
+       const ctype*     beta, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx \
      ) \
 { \
 	for ( dim_t i = 0; i < m; ++i ) \
 	{ \
-		ctype* restrict ci = &c[ i*rs_c ]; \
-		ctype* restrict ai = &a[ i*rs_a ]; \
+		      ctype* ci = &c[ i*rs_c ]; \
+		const ctype* ai = &a[ i*rs_a ]; \
 \
 		for ( dim_t j = 0; j < n; ++j ) \
 		{ \
-			ctype* restrict cij = &ci[ j*cs_c ]; \
-			ctype* restrict bj  = &b [ j*cs_b ]; \
+			      ctype* cij = &ci[ j*cs_c ]; \
+			const ctype* bj  = &b [ j*cs_b ]; \
 			ctype           ab; \
 \
 			PASTEMAC(ch,set0s)( ab ); \
@@ -195,8 +195,8 @@ void PASTEMAC(ch,opname) \
 			/* Perform a dot product to update the (i,j) element of c. */ \
 			for ( dim_t l = 0; l < k; ++l ) \
 			{ \
-				ctype* restrict aij = &ai[ l*cs_a ]; \
-				ctype* restrict bij = &bj[ l*rs_b ]; \
+				const ctype* aij = &ai[ l*cs_a ]; \
+				const ctype* bij = &bj[ l*rs_b ]; \
 \
 				PASTEMAC(ch,dots)( *aij, *bij, ab ); \
 			} \
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c
index c17b0b275..ac2cf4602 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c
@@ -62,18 +62,18 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 
 void bli_sgemmsup_rd_haswell_asm_6x1
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -588,18 +588,18 @@ void bli_sgemmsup_rd_haswell_asm_6x1
 
 void bli_sgemmsup_rd_haswell_asm_3x1
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1003,18 +1003,18 @@ void bli_sgemmsup_rd_haswell_asm_3x1
 
 void bli_sgemmsup_rd_haswell_asm_2x1
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1381,18 +1381,18 @@ void bli_sgemmsup_rd_haswell_asm_2x1
 
 void bli_sgemmsup_rd_haswell_asm_1x1
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c
index 5fb91e634..646280375 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c
@@ -62,18 +62,18 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 
 void bli_sgemmsup_rd_haswell_asm_6x12
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -634,18 +634,18 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 
 void bli_sgemmsup_rd_haswell_asm_2x12
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1111,18 +1111,18 @@ void bli_sgemmsup_rd_haswell_asm_2x12
 
 void bli_sgemmsup_rd_haswell_asm_1x12
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c
index 1398c3da7..0fce13c0e 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c
@@ -62,18 +62,18 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 
 void bli_sgemmsup_rd_haswell_asm_6x16
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	uint64_t n_left = n0 % 16;
@@ -82,9 +82,9 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 	// dispatch other 6x?m kernels, as needed.
 	if ( n_left )
 	{
-		float* restrict cij = c;
-		float* restrict bj  = b;
-		float* restrict ai  = a;
+		      float* cij = c;
+		const float* bj  = b;
+		const float* ai  = a;
 
 		if ( 8 <= n_left )
 		{
@@ -711,12 +711,12 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 	// Handle edge cases in the m dimension, if they exist.
 	if ( m_left )
 	{
-		const dim_t      nr_cur = 16;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
+		const dim_t   nr_cur = 16;
+		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		float* restrict cij = c + i_edge*rs_c;
-		float* restrict bj  = b;
-		float* restrict ai  = a + i_edge*rs_a;
+		      float* cij = c + i_edge*rs_c;
+		const float* bj  = b;
+		const float* ai  = a + i_edge*rs_a;
 
 		if ( 2 == m_left )
 		{
@@ -748,18 +748,18 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 
 void bli_sgemmsup_rd_haswell_asm_2x16
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1228,18 +1228,18 @@ void bli_sgemmsup_rd_haswell_asm_2x16
 
 void bli_sgemmsup_rd_haswell_asm_1x16
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c
index 75c687267..d81ef6442 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c
@@ -62,18 +62,18 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 
 void bli_sgemmsup_rd_haswell_asm_6x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -636,18 +636,18 @@ void bli_sgemmsup_rd_haswell_asm_6x2
 
 void bli_sgemmsup_rd_haswell_asm_3x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1078,18 +1078,18 @@ void bli_sgemmsup_rd_haswell_asm_3x2
 
 void bli_sgemmsup_rd_haswell_asm_2x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1475,18 +1475,18 @@ void bli_sgemmsup_rd_haswell_asm_2x2
 
 void bli_sgemmsup_rd_haswell_asm_1x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c
index 80be4e932..7302b697c 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c
@@ -62,18 +62,18 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 
 void bli_sgemmsup_rd_haswell_asm_6x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -606,18 +606,18 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 
 void bli_sgemmsup_rd_haswell_asm_2x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1063,18 +1063,18 @@ void bli_sgemmsup_rd_haswell_asm_2x4
 
 void bli_sgemmsup_rd_haswell_asm_1x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c
index 3a82e9b3e..52d3ccd3d 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c
@@ -62,18 +62,18 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 
 void bli_sgemmsup_rd_haswell_asm_6x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -634,18 +634,18 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 
 void bli_sgemmsup_rd_haswell_asm_2x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1111,18 +1111,18 @@ void bli_sgemmsup_rd_haswell_asm_2x8
 
 void bli_sgemmsup_rd_haswell_asm_1x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c
index 65d8664da..bcf5744c9 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c
@@ -83,18 +83,18 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 
 void bli_sgemmsup_rv_haswell_asm_6x12
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -781,18 +781,18 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 
 void bli_sgemmsup_rv_haswell_asm_5x12
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1480,18 +1480,18 @@ void bli_sgemmsup_rv_haswell_asm_5x12
 
 void bli_sgemmsup_rv_haswell_asm_4x12
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2037,18 +2037,18 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 
 void bli_sgemmsup_rv_haswell_asm_3x12
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2625,18 +2625,18 @@ void bli_sgemmsup_rv_haswell_asm_3x12
 
 void bli_sgemmsup_rv_haswell_asm_2x12
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -3071,18 +3071,18 @@ void bli_sgemmsup_rv_haswell_asm_2x12
 
 void bli_sgemmsup_rv_haswell_asm_1x12
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c
index 26eec0c09..23231a3be 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c
@@ -98,18 +98,18 @@ static FUNCPTR_T kmap[NUM_MR][NUM_NR] =
 
 void bli_sgemmsup_rv_haswell_asm_6x16
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 #if 0
@@ -135,9 +135,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 	);
 	return;
 #endif
-		dim_t            n_left = n0;
-		float*  restrict cj     = c;
-		float*  restrict bj     = b;
+		dim_t        n_left = n0;
+		      float* cj     = c;
+		const float* bj     = b;
 
 		// Iterate across columns (corresponding to elements of nrs) until
 		// n_left is zero.
@@ -149,9 +149,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 			// n_left, we use the kernels in that column.
 			if ( nr_cur <= n_left )
 			{
-				dim_t            m_left = m0;
-				float*  restrict cij    = cj;
-				float*  restrict ai     = a;
+				dim_t        m_left = m0;
+				      float* cij    = cj;
+				const float* ai     = a;
 
 				// Iterate down the current column (corresponding to elements
 				// of mrs) until m_left is zero.
@@ -922,18 +922,18 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 
 void bli_sgemmsup_rv_haswell_asm_5x16
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1678,18 +1678,18 @@ void bli_sgemmsup_rv_haswell_asm_5x16
 
 void bli_sgemmsup_rv_haswell_asm_4x16
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2263,18 +2263,18 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 
 void bli_sgemmsup_rv_haswell_asm_3x16
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2904,18 +2904,18 @@ void bli_sgemmsup_rv_haswell_asm_3x16
 
 void bli_sgemmsup_rv_haswell_asm_2x16
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -3373,18 +3373,18 @@ void bli_sgemmsup_rv_haswell_asm_2x16
 
 void bli_sgemmsup_rv_haswell_asm_1x16
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c
index efb336395..9117e6b17 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c
@@ -83,18 +83,18 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 
 void bli_sgemmsup_rv_haswell_asm_6x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -560,18 +560,18 @@ void bli_sgemmsup_rv_haswell_asm_6x2
 
 void bli_sgemmsup_rv_haswell_asm_5x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1022,18 +1022,18 @@ void bli_sgemmsup_rv_haswell_asm_5x2
 
 void bli_sgemmsup_rv_haswell_asm_4x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1433,18 +1433,18 @@ void bli_sgemmsup_rv_haswell_asm_4x2
 
 void bli_sgemmsup_rv_haswell_asm_3x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1836,18 +1836,18 @@ void bli_sgemmsup_rv_haswell_asm_3x2
 
 void bli_sgemmsup_rv_haswell_asm_2x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2188,18 +2188,18 @@ void bli_sgemmsup_rv_haswell_asm_2x2
 
 void bli_sgemmsup_rv_haswell_asm_1x2
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c
index 2d6165710..b06a638df 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c
@@ -83,18 +83,18 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 
 void bli_sgemmsup_rv_haswell_asm_6x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -590,18 +590,18 @@ void bli_sgemmsup_rv_haswell_asm_6x4
 
 void bli_sgemmsup_rv_haswell_asm_5x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1087,18 +1087,18 @@ void bli_sgemmsup_rv_haswell_asm_5x4
 
 void bli_sgemmsup_rv_haswell_asm_4x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1520,18 +1520,18 @@ void bli_sgemmsup_rv_haswell_asm_4x4
 
 void bli_sgemmsup_rv_haswell_asm_3x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1949,18 +1949,18 @@ void bli_sgemmsup_rv_haswell_asm_3x4
 
 void bli_sgemmsup_rv_haswell_asm_2x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2314,18 +2314,18 @@ void bli_sgemmsup_rv_haswell_asm_2x4
 
 void bli_sgemmsup_rv_haswell_asm_1x4
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c
index f2cb1df42..b7cfe9321 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c
@@ -83,18 +83,18 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 
 void bli_sgemmsup_rv_haswell_asm_6x6
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -668,18 +668,18 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 
 void bli_sgemmsup_rv_haswell_asm_5x6
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1242,18 +1242,18 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 
 void bli_sgemmsup_rv_haswell_asm_4x6
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1729,18 +1729,18 @@ void bli_sgemmsup_rv_haswell_asm_4x6
 
 void bli_sgemmsup_rv_haswell_asm_3x6
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2217,18 +2217,18 @@ void bli_sgemmsup_rv_haswell_asm_3x6
 
 void bli_sgemmsup_rv_haswell_asm_2x6
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2617,18 +2617,18 @@ void bli_sgemmsup_rv_haswell_asm_2x6
 
 void bli_sgemmsup_rv_haswell_asm_1x6
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -3008,18 +3008,18 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 
 void bli_sgemmsup_rv_haswell_asm_1x6
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c
index 603ba7554..d2e145e76 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c
@@ -83,18 +83,18 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 
 void bli_sgemmsup_rv_haswell_asm_6x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -634,18 +634,18 @@ void bli_sgemmsup_rv_haswell_asm_6x8
 
 void bli_sgemmsup_rv_haswell_asm_5x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1185,18 +1185,18 @@ void bli_sgemmsup_rv_haswell_asm_5x8
 
 void bli_sgemmsup_rv_haswell_asm_4x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -1642,18 +1642,18 @@ void bli_sgemmsup_rv_haswell_asm_4x8
 
 void bli_sgemmsup_rv_haswell_asm_3x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2122,18 +2122,18 @@ void bli_sgemmsup_rv_haswell_asm_3x8
 
 void bli_sgemmsup_rv_haswell_asm_2x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
@@ -2508,18 +2508,18 @@ void bli_sgemmsup_rv_haswell_asm_2x8
 
 void bli_sgemmsup_rv_haswell_asm_1x8
      (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             conj_t     conja,
+             conj_t     conjb,
+             dim_t      m0,
+             dim_t      n0,
+             dim_t      k0,
+       const float*     alpha,
+       const float*     a, inc_t rs_a0, inc_t cs_a0,
+       const float*     b, inc_t rs_b0, inc_t cs_b0,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*    a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/knc/3/bli_dgemm_knc_asm_30x8.c b/kernels/knc/3/bli_dgemm_knc_asm_30x8.c
index a53b763da..4e8c4cac8 100644
--- a/kernels/knc/3/bli_dgemm_knc_asm_30x8.c
+++ b/kernels/knc/3/bli_dgemm_knc_asm_30x8.c
@@ -256,16 +256,16 @@ extern int offsets[16];
 //#define LOOPMON
 void bli_dgemm_knc_asm_30x8
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
     double * a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/knc/3/bli_sgemm_knc_asm_30x16.c b/kernels/knc/3/bli_sgemm_knc_asm_30x16.c
index 7374abfe0..c9d16e049 100644
--- a/kernels/knc/3/bli_sgemm_knc_asm_30x16.c
+++ b/kernels/knc/3/bli_sgemm_knc_asm_30x16.c
@@ -256,16 +256,16 @@ int offsets[16] __attribute__((aligned(0x1000))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9
 //#define LOOPMON
 void bli_sgemm_knc_asm_30x16
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       float*     restrict alpha,
-       float*     restrict a,
-       float*     restrict b,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const float*     alpha,
+       const float*     a,
+       const float*     b,
+       const float*     beta,
+             float*     c, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
     float * a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c
index 2464ecf0a..1f99e7a39 100644
--- a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c
+++ b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c
@@ -108,15 +108,15 @@ static int32_t offsets[32] __attribute__((aligned(64))) =
 
 void bli_dpackm_knl_asm_8xk
      (
-       conj_t           conja,
-       pack_t           schema,
-       dim_t            cdim_,
-       dim_t            n_,
-       dim_t            n_max_,
-       double* restrict kappa_,
-       double* restrict a_, inc_t inca_, inc_t lda_,
-       double* restrict p_,              inc_t ldp_,
-       cntx_t*          cntx
+             conj_t  conja,
+             pack_t  schema,
+             dim_t   cdim_,
+             dim_t   n_,
+             dim_t   n_max_,
+       const double* kappa_,
+       const double* a_, inc_t inca_, inc_t lda_,
+             double* p_,              inc_t ldp_,
+       const cntx_t* cntx
      )
 {
     const int32_t* offsetPtr = &offsets[0];
@@ -359,15 +359,15 @@ void bli_dpackm_knl_asm_8xk
 
 void bli_dpackm_knl_asm_24xk
      (
-       conj_t           conja,
-       pack_t           schema,
-       dim_t            cdim_,
-       dim_t            n_,
-       dim_t            n_max_,
-       double* restrict kappa_,
-       double* restrict a_, inc_t inca_, inc_t lda_,
-       double* restrict p_,              inc_t ldp_,
-       cntx_t*          cntx
+             conj_t  conja,
+             pack_t  schema,
+             dim_t   cdim_,
+             dim_t   n_,
+             dim_t   n_max_,
+       const double* kappa_,
+       const double* a_, inc_t inca_, inc_t lda_,
+             double* p_,              inc_t ldp_,
+       const cntx_t* cntx
      )
 {
     const int32_t* offsetPtr = &offsets[0];
diff --git a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c
index 4326a00dd..493b0259b 100644
--- a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c
+++ b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c
@@ -110,15 +110,15 @@ static int32_t offsets[32] __attribute__((aligned(64))) =
 
 void bli_spackm_knl_asm_16xk
      (
-       conj_t           conja,
-       pack_t           schema,
-       dim_t            cdim_,
-       dim_t            n_,
-       dim_t            n_max_,
-       float*  restrict kappa_,
-       float*  restrict a_, inc_t inca_, inc_t lda_,
-       float*  restrict p_,              inc_t ldp_,
-       cntx_t*          cntx
+             conj_t  conja,
+             pack_t  schema,
+             dim_t   cdim_,
+             dim_t   n_,
+             dim_t   n_max_,
+       const float*  kappa_,
+       const float*  a_, inc_t inca_, inc_t lda_,
+             float*  p_,              inc_t ldp_,
+       const cntx_t* cntx
      )
 {
     const int32_t* offsetPtr = &offsets[0];
@@ -377,15 +377,15 @@ void bli_spackm_knl_asm_16xk
 
 void bli_spackm_knl_asm_24xk
      (
-       conj_t           conja,
-       pack_t           schema,
-       dim_t            cdim_,
-       dim_t            n_,
-       dim_t            n_max_,
-       float*  restrict kappa_,
-       float*  restrict a_, inc_t inca_, inc_t lda_,
-       float*  restrict p_,              inc_t ldp_,
-       cntx_t*          cntx
+             conj_t  conja,
+             pack_t  schema,
+             dim_t   cdim_,
+             dim_t   n_,
+             dim_t   n_max_,
+       const float*  kappa_,
+       const float*  a_, inc_t inca_, inc_t lda_,
+             float*  p_,              inc_t ldp_,
+       const cntx_t* cntx
      )
 {
     const int32_t* offsetPtr = &offsets[0];
diff --git a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
index 11a480997..579ac61f5 100644
--- a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
+++ b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
@@ -185,16 +185,16 @@ static int32_t offsets[32] __attribute__((aligned(64))) =
 //#define LOOPMON
 void bli_dgemm_knl_asm_24x8
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k_,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c_, inc_t cs_c_,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k_,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c_, inc_t cs_c_,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
     (void)data;
diff --git a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
index cbef0cb82..184f44951 100644
--- a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
+++ b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
@@ -182,25 +182,25 @@ static int32_t offsets[32] __attribute__((aligned(64))) =
 //#define LOOPMON
 void bli_sgemm_knl_asm_24x16
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k_,
-       float*     restrict alpha,
-       float*     restrict a,
-       float*     restrict b,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c_, inc_t cs_c_,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k_,
+       const float*     alpha,
+       const float*     a,
+       const float*     b,
+       const float*     beta,
+             float*     c, inc_t rs_c_, inc_t cs_c_,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
     (void)data;
     (void)cntx;
 
-    const double * a_next = bli_auxinfo_next_a( data );
-    const double * b_next = bli_auxinfo_next_b( data );
+    const double* a_next = bli_auxinfo_next_a( data );
+    const double* b_next = bli_auxinfo_next_b( data );
 
-    int32_t * offsetPtr = &offsets[0];
+    int32_t* offsetPtr = &offsets[0];
     int64_t k = k_;
     int64_t rs_c = rs_c_;
     int64_t cs_c = cs_c_;
diff --git a/kernels/penryn/1/bli_axpyv_penryn_int.c b/kernels/penryn/1/bli_axpyv_penryn_int.c
index c329912b4..690af0452 100644
--- a/kernels/penryn/1/bli_axpyv_penryn_int.c
+++ b/kernels/penryn/1/bli_axpyv_penryn_int.c
@@ -45,35 +45,33 @@ typedef union
 
 void bli_daxpyv_penryn_int
      (
-       conj_t           conjx,
-       dim_t            n,
-       double* restrict alpha,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       cntx_t*          cntx
+             conj_t  conjx,
+             dim_t   n,
+       const double* alpha,
+       const double* x, inc_t incx,
+             double* y, inc_t incy,
+       const cntx_t* cntx
      )
 {
-	double*  restrict alpha_cast = alpha;
-	double*  restrict x_cast = x;
-	double*  restrict y_cast = y;
-	dim_t             i;
+	const double*  restrict alpha_cast = alpha;
+	const double*  restrict x_cast = x;
+	      double*  restrict y_cast = y;
+	      dim_t             i;
 
-	const dim_t       n_elem_per_reg = 2;
-	const dim_t       n_iter_unroll  = 4;
+	const dim_t             n_elem_per_reg = 2;
+	const dim_t             n_iter_unroll  = 4;
 
-	dim_t             n_pre;
-	dim_t             n_run;
-	dim_t             n_left;
+	      dim_t             n_pre;
+	      dim_t             n_run;
+	      dim_t             n_left;
 
-	double*  restrict x1;
-	double*  restrict y1;
-	double            alpha1c, x1c;
+	      double            alpha1c, x1c;
 
-	v2df_t            alpha1v;
-	v2df_t            x1v, x2v, x3v, x4v;
-	v2df_t            y1v, y2v, y3v, y4v;
+	      v2df_t            alpha1v;
+	      v2df_t            x1v, x2v, x3v, x4v;
+	      v2df_t            y1v, y2v, y3v, y4v;
 
-	bool              use_ref = FALSE;
+	      bool              use_ref = FALSE;
 
 
 	if ( bli_zero_dim1( n ) ) return;
@@ -122,8 +120,8 @@ void bli_daxpyv_penryn_int
 
 	alpha1c = *alpha_cast;
 
-	x1 = x_cast;
-	y1 = y_cast;
+	const double* restrict x1 = x_cast;
+	      double* restrict y1 = y_cast;
 
 	if ( n_pre == 1 )
 	{
diff --git a/kernels/penryn/1/bli_dotv_penryn_int.c b/kernels/penryn/1/bli_dotv_penryn_int.c
index 6d63a9cf0..72a5e5653 100644
--- a/kernels/penryn/1/bli_dotv_penryn_int.c
+++ b/kernels/penryn/1/bli_dotv_penryn_int.c
@@ -45,33 +45,31 @@ typedef union
 
 void bli_ddotv_penryn_int
      (
-       conj_t           conjx,
-       conj_t           conjy,
-       dim_t            n,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       double* restrict rho,
-       cntx_t*          cntx
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   n,
+       const double* x, inc_t incx,
+       const double* y, inc_t incy,
+             double* rho,
+       const cntx_t* cntx
      )
 {
-	double*  restrict x_cast   = x;
-	double*  restrict y_cast   = y;
-	double*  restrict rho_cast = rho;
-	dim_t             i;
+	const double*  restrict x_cast   = x;
+	const double*  restrict y_cast   = y;
+	      double*  restrict rho_cast = rho;
+	      dim_t             i;
 
-	dim_t             n_pre;
-	dim_t             n_run;
-	dim_t             n_left;
+	      dim_t             n_pre;
+	      dim_t             n_run;
+	      dim_t             n_left;
 
-	double*  restrict x1;
-	double*  restrict y1;
-	double            rho1;
-	double            x1c, y1c;
+	      double            rho1;
+	      double            x1c, y1c;
 
-	v2df_t            rho1v;
-	v2df_t            x1v, y1v;
+	      v2df_t            rho1v;
+	      v2df_t            x1v, y1v;
 
-	bool              use_ref = FALSE;
+	      bool              use_ref = FALSE;
 
 	// If the vector lengths are zero, set rho to zero and return.
 	if ( bli_zero_dim1( n ) )
@@ -122,8 +120,8 @@ void bli_ddotv_penryn_int
 	n_run       = ( n - n_pre ) / 2;
 	n_left      = ( n - n_pre ) % 2;
 
-	x1 = x_cast;
-	y1 = y_cast;
+	const double* restrict x1 = x_cast;
+	const double* restrict y1 = y_cast;
 
 	PASTEMAC(d,set0s)( rho1 );
 
diff --git a/kernels/penryn/1f/bli_axpy2v_penryn_int.c b/kernels/penryn/1f/bli_axpy2v_penryn_int.c
index 350a0af5f..29ca050f5 100644
--- a/kernels/penryn/1f/bli_axpy2v_penryn_int.c
+++ b/kernels/penryn/1f/bli_axpy2v_penryn_int.c
@@ -45,41 +45,38 @@ typedef union
 
 void bli_daxpy2v_penryn_int
      (
-       conj_t           conjx,
-       conj_t           conjy,
-       dim_t            n,
-       double* restrict alpha,
-       double* restrict beta,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       double* restrict z, inc_t incz,
-       cntx_t*          cntx
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   n,
+       const double* alphax,
+       const double* alphay,
+       const double* x, inc_t incx,
+       const double* y, inc_t incy,
+             double* z, inc_t incz,
+       const cntx_t* cntx
      )
 {
-	double*  restrict alpha_cast  = alpha;
-	double*  restrict beta_cast   = beta;
-	double*  restrict x_cast      = x;
-	double*  restrict y_cast      = y;
-	double*  restrict z_cast      = z;
-	dim_t             i;
+	const double*  restrict alphax_cast  = alphax;
+	const double*  restrict alphay_cast  = alphay;
+	const double*  restrict x_cast      = x;
+	const double*  restrict y_cast      = y;
+	      double*  restrict z_cast      = z;
+	      dim_t             i;
 
-	const dim_t       n_elem_per_reg = 2;
-	const dim_t       n_iter_unroll  = 4;
+	const dim_t             n_elem_per_reg = 2;
+	const dim_t             n_iter_unroll  = 4;
 
-	dim_t             n_pre;
-	dim_t             n_run;
-	dim_t             n_left;
+	      dim_t             n_pre;
+	      dim_t             n_run;
+	      dim_t             n_left;
 
-	double*  restrict x1;
-	double*  restrict y1;
-	double*  restrict z1;
-	double            alphac, betac, x1c, y1c;
+	      double            alphaxc, alphayc, x1c, y1c;
 
-	v2df_t            alphav, betav;
-	v2df_t            x1v, y1v, z1v;
-	v2df_t            x2v, y2v, z2v;
+	      v2df_t            alphaxv, alphayv;
+	      v2df_t            x1v, y1v, z1v;
+	      v2df_t            x2v, y2v, z2v;
 
-	bool              use_ref = FALSE;
+	      bool              use_ref = FALSE;
 
 
 	if ( bli_zero_dim1( n ) ) return;
@@ -117,8 +114,8 @@ void bli_daxpy2v_penryn_int
 		  conjx,
 		  conjy,
 		  n,
-		  alpha,
-		  beta,
+		  alphax,
+		  alphay,
 		  x, incx,
 		  y, incy,
 		  z, incz,
@@ -131,69 +128,31 @@ void bli_daxpy2v_penryn_int
 	n_run       = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll );
 	n_left      = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll );
 
-	alphac = *alpha_cast;
-	betac  = *beta_cast;
+	alphaxc = *alphax_cast;
+	alphayc = *alphay_cast;
 
-	x1 = x_cast;
-	y1 = y_cast;
-	z1 = z_cast;
+	const double* restrict x1 = x_cast;
+	const double* restrict y1 = y_cast;
+	      double* restrict z1 = z_cast;
 
 	if ( n_pre == 1 )
 	{
 		x1c = *x1;
 		y1c = *y1;
 
-		*z1 += alphac * x1c +
-		       betac  * y1c;
+		*z1 += alphaxc * x1c +
+		       alphayc * y1c;
 
 		x1 += incx;
 		y1 += incy;
 		z1 += incz;
 	}
 
-	alphav.v = _mm_loaddup_pd( ( double* )alpha_cast );
-	betav.v  = _mm_loaddup_pd( ( double* )beta_cast );
+	alphaxv.v = _mm_loaddup_pd( ( double* )alphax_cast );
+	alphayv.v = _mm_loaddup_pd( ( double* )alphay_cast );
 
 	for ( i = 0; i < n_run; ++i )
 	{
-/*
-		z1v.v = _mm_load_pd( ( double* )z1 + 0*n_elem_per_reg );
-		x1v.v = _mm_load_pd( ( double* )x1 + 0*n_elem_per_reg );
-		y1v.v = _mm_load_pd( ( double* )y1 + 0*n_elem_per_reg );
-
-		z1v.v += alphav.v * x1v.v;
-		z1v.v += betav.v  * y1v.v;
-
-		_mm_store_pd( ( double* )(z1 + 0*n_elem_per_reg ), z1v.v );
-
-		z1v.v = _mm_load_pd( ( double* )z1 + 1*n_elem_per_reg );
-		x1v.v = _mm_load_pd( ( double* )x1 + 1*n_elem_per_reg );
-		y1v.v = _mm_load_pd( ( double* )y1 + 1*n_elem_per_reg );
-
-		z1v.v += alphav.v * x1v.v;
-		z1v.v += betav.v  * y1v.v;
-
-		_mm_store_pd( ( double* )(z1 + 1*n_elem_per_reg ), z1v.v );
-*/
-/*
-		z1v.v = _mm_load_pd( ( double* )z1 + 0*n_elem_per_reg );
-		x1v.v = _mm_load_pd( ( double* )x1 + 0*n_elem_per_reg );
-		y1v.v = _mm_load_pd( ( double* )y1 + 0*n_elem_per_reg );
-
-		z2v.v = _mm_load_pd( ( double* )z1 + 1*n_elem_per_reg );
-		x2v.v = _mm_load_pd( ( double* )x1 + 1*n_elem_per_reg );
-		y2v.v = _mm_load_pd( ( double* )y1 + 1*n_elem_per_reg );
-
-		z1v.v += alphav.v * x1v.v;
-		z1v.v += betav.v  * y1v.v;
-
-		_mm_store_pd( ( double* )(z1 + 0*n_elem_per_reg ), z1v.v );
-
-		z2v.v += alphav.v * x2v.v;
-		z2v.v += betav.v  * y2v.v;
-
-		_mm_store_pd( ( double* )(z1 + 1*n_elem_per_reg ), z2v.v );
-*/
 		z1v.v = _mm_load_pd( ( double* )z1 + 0*n_elem_per_reg );
 		x1v.v = _mm_load_pd( ( double* )x1 + 0*n_elem_per_reg );
 		y1v.v = _mm_load_pd( ( double* )y1 + 0*n_elem_per_reg );
@@ -202,8 +161,8 @@ void bli_daxpy2v_penryn_int
 		x2v.v = _mm_load_pd( ( double* )x1 + 1*n_elem_per_reg );
 		y2v.v = _mm_load_pd( ( double* )y1 + 1*n_elem_per_reg );
 
-		z1v.v += alphav.v * x1v.v;
-		z1v.v += betav.v  * y1v.v;
+		z1v.v += alphaxv.v * x1v.v;
+		z1v.v += alphayv.v  * y1v.v;
 
 		_mm_store_pd( ( double* )(z1 + 0*n_elem_per_reg ), z1v.v );
 
@@ -211,8 +170,8 @@ void bli_daxpy2v_penryn_int
 		x1v.v = _mm_load_pd( ( double* )x1 + 2*n_elem_per_reg );
 		y1v.v = _mm_load_pd( ( double* )y1 + 2*n_elem_per_reg );
 
-		z2v.v += alphav.v * x2v.v;
-		z2v.v += betav.v  * y2v.v;
+		z2v.v += alphaxv.v * x2v.v;
+		z2v.v += alphayv.v  * y2v.v;
 
 		_mm_store_pd( ( double* )(z1 + 1*n_elem_per_reg ), z2v.v );
 
@@ -220,13 +179,13 @@ void bli_daxpy2v_penryn_int
 		x2v.v = _mm_load_pd( ( double* )x1 + 3*n_elem_per_reg );
 		y2v.v = _mm_load_pd( ( double* )y1 + 3*n_elem_per_reg );
 
-		z1v.v += alphav.v * x1v.v;
-		z1v.v += betav.v  * y1v.v;
+		z1v.v += alphaxv.v * x1v.v;
+		z1v.v += alphayv.v  * y1v.v;
 
 		_mm_store_pd( ( double* )(z1 + 2*n_elem_per_reg ), z1v.v );
 
-		z2v.v += alphav.v * x2v.v;
-		z2v.v += betav.v  * y2v.v;
+		z2v.v += alphaxv.v * x2v.v;
+		z2v.v += alphayv.v  * y2v.v;
 
 		_mm_store_pd( ( double* )(z1 + 3*n_elem_per_reg ), z2v.v );
 
@@ -244,8 +203,8 @@ void bli_daxpy2v_penryn_int
 			x1c = *x1;
 			y1c = *y1;
 
-			*z1 += alphac * x1c +
-			       betac  * y1c;
+			*z1 += alphaxc * x1c +
+			       alphayc * y1c;
 
 			x1 += incx;
 			y1 += incy;
diff --git a/kernels/penryn/1f/bli_axpyf_penryn_int.c b/kernels/penryn/1f/bli_axpyf_penryn_int.c
index f52c05d67..924782a36 100644
--- a/kernels/penryn/1f/bli_axpyf_penryn_int.c
+++ b/kernels/penryn/1f/bli_axpyf_penryn_int.c
@@ -45,43 +45,38 @@ typedef union
 
 void bli_daxpyf_penryn_int
      (
-       conj_t           conja,
-       conj_t           conjx,
-       dim_t            m,
-       dim_t            b_n,
-       double* restrict alpha,
-       double* restrict a, inc_t inca, inc_t lda,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       cntx_t*          cntx
+             conj_t  conja,
+             conj_t  conjx,
+             dim_t   m,
+             dim_t   b_n,
+       const double* alpha,
+       const double* a, inc_t inca, inc_t lda,
+       const double* x, inc_t incx,
+             double* y, inc_t incy,
+       const cntx_t* cntx
      )
 {
-	double*  restrict alpha_cast = alpha;
-	double*  restrict a_cast = a;
-	double*  restrict x_cast = x;
-	double*  restrict y_cast = y;
-	dim_t             i;
+	const double*  restrict alpha_cast = alpha;
+	const double*  restrict a_cast = a;
+	const double*  restrict x_cast = x;
+	      double*  restrict y_cast = y;
+	      dim_t             i;
 
-	const dim_t       n_elem_per_reg = 2;
-	const dim_t       n_iter_unroll  = 2;
+	const dim_t             n_elem_per_reg = 2;
+	const dim_t             n_iter_unroll  = 2;
 
-	dim_t             m_pre;
-	dim_t             m_run;
-	dim_t             m_left;
+	      dim_t             m_pre;
+	      dim_t             m_run;
+	      dim_t             m_left;
 
-    double*  restrict a0;
-    double*  restrict a1;
-    double*  restrict a2;
-    double*  restrict a3;
-    double*  restrict y0;
-    double            a0c, a1c, a2c, a3c;
-    double            chi0, chi1, chi2, chi3;
+          double            a0c, a1c, a2c, a3c;
+          double            chi0, chi1, chi2, chi3;
 
-	v2df_t            a00v, a01v, a02v, a03v, y0v;
-	v2df_t            a10v, a11v, a12v, a13v, y1v;
-	v2df_t            chi0v, chi1v, chi2v, chi3v;
+	      v2df_t            a00v, a01v, a02v, a03v, y0v;
+	      v2df_t            a10v, a11v, a12v, a13v, y1v;
+	      v2df_t            chi0v, chi1v, chi2v, chi3v;
 
-	bool              use_ref = FALSE;
+	      bool              use_ref = FALSE;
 
 
 	if ( bli_zero_dim2( m, b_n ) ) return;
@@ -136,11 +131,11 @@ void bli_daxpyf_penryn_int
 	m_run       = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
 	m_left      = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );
 
-	a0   = a_cast + 0*lda;
-	a1   = a_cast + 1*lda;
-	a2   = a_cast + 2*lda;
-	a3   = a_cast + 3*lda;
-	y0   = y_cast;
+	const double* restrict a0   = a_cast + 0*lda;
+	const double* restrict a1   = a_cast + 1*lda;
+	const double* restrict a2   = a_cast + 2*lda;
+	const double* restrict a3   = a_cast + 3*lda;
+	      double* restrict y0   = y_cast;
 
 	chi0 = *(x_cast + 0*incx);
 	chi1 = *(x_cast + 1*incx);
diff --git a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c
index 244e3f11c..bc2473249 100644
--- a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c
+++ b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c
@@ -45,39 +45,36 @@ typedef union
 
 void bli_ddotaxpyv_penryn_int
      (
-       conj_t           conjxt,
-       conj_t           conjx,
-       conj_t           conjy,
-       dim_t            n,
-       double* restrict alpha,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       double* restrict rho,
-       double* restrict z, inc_t incz,
-       cntx_t*          cntx
+             conj_t  conjxt,
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   n,
+       const double* alpha,
+       const double* x, inc_t incx,
+       const double* y, inc_t incy,
+             double* rho,
+             double* z, inc_t incz,
+       const cntx_t* cntx
      )
 {
-	double*  restrict alpha_cast = alpha;
-	double*  restrict x_cast     = x;
-	double*  restrict y_cast     = y;
-	double*  restrict rho_cast   = rho;
-	double*  restrict z_cast     = z;
+	const double*  restrict alpha_cast = alpha;
+	const double*  restrict x_cast     = x;
+	const double*  restrict y_cast     = y;
+	      double*  restrict rho_cast   = rho;
+	      double*  restrict z_cast     = z;
 
-	dim_t             n_pre;
-	dim_t             n_run;
-	dim_t             n_left;
+	      dim_t             n_pre;
+	      dim_t             n_run;
+	      dim_t             n_left;
 
-	double*  restrict chi1;
-	double*  restrict psi1;
-	double*  restrict zeta1;
-	double            alpha1c, chi1c, psi1c, rho1c;
-	dim_t             i;
-	//inc_t             stepx, stepy, stepz;
+	      double            alpha1c, chi1c, psi1c, rho1c;
+	      dim_t             i;
+	      //inc_t             stepx, stepy, stepz;
 
-	v2df_t            alphav, rhov;
-	v2df_t            x1v, y1v, z1v;
+	      v2df_t            alphav, rhov;
+	      v2df_t            x1v, y1v, z1v;
 
-	bool              use_ref = FALSE;
+	      bool              use_ref = FALSE;
 
 	// If the vector lengths are zero, set rho to zero and return.
 	if ( bli_zero_dim1( n ) )
@@ -142,9 +139,9 @@ void bli_ddotaxpyv_penryn_int
 
 	alpha1c = *alpha_cast;
 
-	chi1  = x_cast;
-	psi1  = y_cast;
-	zeta1 = z_cast;
+	const double* restrict chi1  = x_cast;
+	const double* restrict psi1  = y_cast;
+	      double* restrict zeta1 = z_cast;
 
 	if ( n_pre == 1 )
 	{
diff --git a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c
index 3ff80319a..0aa68b5bc 100644
--- a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c
+++ b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c
@@ -45,58 +45,52 @@ typedef union
 
 void bli_ddotxaxpyf_penryn_int
      (
-       conj_t           conjat,
-       conj_t           conja,
-       conj_t           conjw,
-       conj_t           conjx,
-       dim_t            m,
-       dim_t            b_n,
-       double* restrict alpha,
-       double* restrict a, inc_t inca, inc_t lda,
-       double* restrict w, inc_t incw,
-       double* restrict x, inc_t incx,
-       double* restrict beta,
-       double* restrict y, inc_t incy,
-       double* restrict z, inc_t incz,
-       cntx_t*          cntx
+             conj_t  conjat,
+             conj_t  conja,
+             conj_t  conjw,
+             conj_t  conjx,
+             dim_t   m,
+             dim_t   b_n,
+       const double* alpha,
+       const double* a, inc_t inca, inc_t lda,
+       const double* w, inc_t incw,
+       const double* x, inc_t incx,
+       const double* beta,
+             double* y, inc_t incy,
+             double* z, inc_t incz,
+       const cntx_t* cntx
      )
 {
-	double*  restrict alpha_cast = alpha;
-	double*  restrict beta_cast  = beta;
-	double*  restrict a_cast     = a;
-	double*  restrict w_cast     = w;
-	double*  restrict x_cast     = x;
-	double*  restrict y_cast     = y;
-	double*  restrict z_cast     = z;
-	dim_t             i;
-
-	const dim_t       n_elem_per_reg = 2;
-	const dim_t       n_iter_unroll  = 2;
-
-	dim_t             m_pre;
-	dim_t             m_run;
-	dim_t             m_left;
-
-	double*  restrict a0;
-	double*  restrict a1;
-	double*  restrict a2;
-	double*  restrict a3;
-	double*  restrict w1;
-	double*  restrict z1;
-	double            rho0, rho1, rho2, rho3;
-	double            chi0, chi1, chi2, chi3;
-	double            a0c, a1c, a2c, a3c, w1c, z1c;
-
-	v2df_t            rho0v, rho1v, rho2v, rho3v;
-	v2df_t            chi0v, chi1v, chi2v, chi3v;
-	//v2df_t            a0v, a1v, a2v, a3v, w1v, z1v;
-	v2df_t            a00v, a01v, a02v, a03v;
-	v2df_t            a10v, a11v, a12v, a13v;
-	v2df_t            w1v, z1v;
-	v2df_t            w2v, z2v;
-	v2df_t            psi0v, psi1v, betav, alphav;
-
-	bool              use_ref = FALSE;
+	const double*  restrict alpha_cast = alpha;
+	const double*  restrict beta_cast  = beta;
+	const double*  restrict a_cast     = a;
+	const double*  restrict w_cast     = w;
+	const double*  restrict x_cast     = x;
+	      double*  restrict y_cast     = y;
+	      double*  restrict z_cast     = z;
+	      dim_t             i;
+
+	const dim_t             n_elem_per_reg = 2;
+	const dim_t             n_iter_unroll  = 2;
+
+	      dim_t             m_pre;
+	      dim_t             m_run;
+	      dim_t             m_left;
+
+	      double            rho0, rho1, rho2, rho3;
+	      double            chi0, chi1, chi2, chi3;
+	      double            a0c, a1c, a2c, a3c, w1c, z1c;
+
+	      v2df_t            rho0v, rho1v, rho2v, rho3v;
+	      v2df_t            chi0v, chi1v, chi2v, chi3v;
+	      //v2df_t            a0v, a1v, a2v, a3v, w1v, z1v;
+	      v2df_t            a00v, a01v, a02v, a03v;
+	      v2df_t            a10v, a11v, a12v, a13v;
+	      v2df_t            w1v, z1v;
+	      v2df_t            w2v, z2v;
+	      v2df_t            psi0v, psi1v, betav, alphav;
+
+	      bool              use_ref = FALSE;
 
 
 	if ( bli_zero_dim1( b_n ) ) return;
@@ -117,19 +111,19 @@ void bli_ddotxaxpyf_penryn_int
 		return;
 	}
 
-    m_pre = 0;
+	m_pre = 0;
 
-    // If there is anything that would interfere with our use of aligned
-    // vector loads/stores, call the reference implementation.
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
 	if ( b_n < bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_XF, cntx ) )
 	{
 		use_ref = TRUE;
 	}
-    else if ( inca != 1 || incw != 1 || incx != 1 || incy != 1 || incz != 1 ||
+	else if ( inca != 1 || incw != 1 || incx != 1 || incy != 1 || incz != 1 ||
 	          bli_is_unaligned_to( ( siz_t )(lda*sizeof(double)), 16 ) )
-    {
-        use_ref = TRUE;
-    }
+	{
+		use_ref = TRUE;
+	}
 	else if ( bli_is_unaligned_to( ( siz_t )a, 16 ) ||
 	          bli_is_unaligned_to( ( siz_t )w, 16 ) ||
 	          bli_is_unaligned_to( ( siz_t )z, 16 ) ||
@@ -174,12 +168,12 @@ void bli_ddotxaxpyf_penryn_int
 	m_run       = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
 	m_left      = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );
 
-	a0 = a_cast + 0*lda;
-	a1 = a_cast + 1*lda;
-	a2 = a_cast + 2*lda;
-	a3 = a_cast + 3*lda;
-	w1 = w_cast;
-	z1 = z_cast;
+	const double* restrict a0 = a_cast + 0*lda;
+	const double* restrict a1 = a_cast + 1*lda;
+	const double* restrict a2 = a_cast + 2*lda;
+	const double* restrict a3 = a_cast + 3*lda;
+	const double* restrict w1 = w_cast;
+	      double* restrict z1 = z_cast;
 
 	chi0 = *(x_cast + 0*incx);
 	chi1 = *(x_cast + 1*incx);
diff --git a/kernels/penryn/1f/bli_dotxf_penryn_int.c b/kernels/penryn/1f/bli_dotxf_penryn_int.c
index e8775bd0c..06e46a3b2 100644
--- a/kernels/penryn/1f/bli_dotxf_penryn_int.c
+++ b/kernels/penryn/1f/bli_dotxf_penryn_int.c
@@ -45,44 +45,39 @@ typedef union
 
 void bli_ddotxf_penryn_int
      (
-       conj_t           conjat,
-       conj_t           conjx,
-       dim_t            m,
-       dim_t            b_n,
-       double* restrict alpha,
-       double* restrict a, inc_t inca, inc_t lda,
-       double* restrict x, inc_t incx,
-       double* restrict beta,
-       double* restrict y, inc_t incy,
-       cntx_t*          cntx
+             conj_t  conjat,
+             conj_t  conjx,
+             dim_t   m,
+             dim_t   b_n,
+       const double* alpha,
+       const double* a, inc_t inca, inc_t lda,
+       const double* x, inc_t incx,
+       const double* beta,
+             double* y, inc_t incy,
+       const cntx_t* cntx
      )
 {
-	double*  restrict alpha_cast = alpha;
-	double*  restrict beta_cast = beta;
-	double*  restrict a_cast = a;
-	double*  restrict x_cast = x;
-	double*  restrict y_cast = y;
-	dim_t             i;
+	const double*  restrict alpha_cast = alpha;
+	const double*  restrict beta_cast = beta;
+	const double*  restrict a_cast = a;
+	const double*  restrict x_cast = x;
+	      double*  restrict y_cast = y;
+	      dim_t             i;
 
-	const dim_t       n_elem_per_reg = 2;
-	const dim_t       n_iter_unroll  = 4;
+	const dim_t             n_elem_per_reg = 2;
+	const dim_t             n_iter_unroll  = 4;
 
-	dim_t             m_pre;
-	dim_t             m_run;
-	dim_t             m_left;
+	      dim_t             m_pre;
+	      dim_t             m_run;
+	      dim_t             m_left;
 
-	double*  restrict x0;
-	double*  restrict x1;
-	double*  restrict x2;
-	double*  restrict x3;
-	double*  restrict y0;
-	double            rho0, rho1, rho2, rho3;
-	double            x0c, x1c, x2c, x3c, y0c;
+	      double            rho0, rho1, rho2, rho3;
+	      double            x0c, x1c, x2c, x3c, y0c;
 
-	v2df_t            rho0v, rho1v, rho2v, rho3v;
-	v2df_t            x0v, x1v, x2v, x3v, y0v, betav, alphav;
+	      v2df_t            rho0v, rho1v, rho2v, rho3v;
+	      v2df_t            x0v, x1v, x2v, x3v, y0v, betav, alphav;
 
-	bool              use_ref = FALSE;
+	      bool              use_ref = FALSE;
 
 
 	if ( bli_zero_dim1( b_n ) ) return;
@@ -103,19 +98,19 @@ void bli_ddotxf_penryn_int
 		return;
 	}
 
-    m_pre = 0;
+	m_pre = 0;
 
-    // If there is anything that would interfere with our use of aligned
-    // vector loads/stores, call the reference implementation.
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
 	if ( b_n < bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_DF, cntx ) )
 	{
 		use_ref = TRUE;
 	}
-    else if ( inca != 1 || incx != 1 || incy != 1 ||
+	else if ( inca != 1 || incx != 1 || incy != 1 ||
 	          bli_is_unaligned_to( ( siz_t )(lda*sizeof(double)), 16 ) )
-    {
-        use_ref = TRUE;
-    }
+	{
+		use_ref = TRUE;
+	}
 	else if ( bli_is_unaligned_to( ( siz_t )a, 16 ) ||
 	          bli_is_unaligned_to( ( siz_t )x, 16 ) ||
 	          bli_is_unaligned_to( ( siz_t )y, 16 ) )
@@ -155,11 +150,11 @@ void bli_ddotxf_penryn_int
 	m_run       = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
 	m_left      = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );
 
-	x0 = a_cast;
-	x1 = a_cast +   lda;
-	x2 = a_cast + 2*lda;
-	x3 = a_cast + 3*lda;
-	y0 = x_cast;
+	const double* restrict x0 = a_cast;
+	const double* restrict x1 = a_cast +   lda;
+	const double* restrict x2 = a_cast + 2*lda;
+	const double* restrict x3 = a_cast + 3*lda;
+	const double* restrict y0 = x_cast;
 
 	PASTEMAC(d,set0s)( rho0 );
 	PASTEMAC(d,set0s)( rho1 );
diff --git a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c
index 8a3ec077f..bbfdd413f 100644
--- a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c
@@ -39,20 +39,20 @@
 
 void bli_sgemm_penryn_asm_8x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       float*     restrict alpha,
-       float*     restrict a,
-       float*     restrict b,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const float*     alpha,
+       const float*     a,
+       const float*     b,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
-	//void*   a_next = bli_auxinfo_next_a( data );
-	void*   b_next = bli_auxinfo_next_b( data );
+	//const void* a_next = bli_auxinfo_next_a( data );
+	const void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
@@ -514,20 +514,20 @@ void bli_sgemm_penryn_asm_8x4
 
 void bli_dgemm_penryn_asm_4x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
-	void*   a_next = bli_auxinfo_next_a( data );
-	void*   b_next = bli_auxinfo_next_b( data );
+	const void* a_next = bli_auxinfo_next_a( data );
+	const void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
diff --git a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c
index aa8dcf858..091cfde96 100644
--- a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c
@@ -40,15 +40,6 @@
 #if 0
 void bli_sgemmtrsm_l_penryn_asm_8x4
      (
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a10,
-       float*     restrict a11,
-       float*     restrict b01,
-       float*     restrict b11,
-       float*     restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
      )
 {
 }
@@ -56,20 +47,20 @@ void bli_sgemmtrsm_l_penryn_asm_8x4
 
 void bli_dgemmtrsm_l_penryn_asm_4x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a10,
-       double*    restrict a11,
-       double*    restrict b01,
-       double*    restrict b11,
-       double*    restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a10,
+       const double*    a11,
+       const double*    b01,
+             double*    b11,
+             double*    c11, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
-	void*   b_next  = bli_auxinfo_next_b( data );
+	const void* b_next  = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
diff --git a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c
index 2efc037cc..988c8303e 100644
--- a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c
@@ -40,15 +40,6 @@
 #if 0
 void bli_sgemmtrsm_u_penryn_asm_8x4
      (
-       dim_t               k0,
-       float*     restrict alpha,
-       float*     restrict a12,
-       float*     restrict a11,
-       float*     restrict b21,
-       float*     restrict b11,
-       float*     restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
      )
 {
 }
@@ -56,20 +47,20 @@ void bli_sgemmtrsm_u_penryn_asm_8x4
 
 void bli_dgemmtrsm_u_penryn_asm_4x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k0,
-       double*    restrict alpha,
-       double*    restrict a12,
-       double*    restrict a11,
-       double*    restrict b21,
-       double*    restrict b11,
-       double*    restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k0,
+       const double*    alpha,
+       const double*    a12,
+       const double*    a11,
+       const double*    b21,
+             double*    b11,
+             double*    c11, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
-	void*   b_next  = bli_auxinfo_next_b( data );
+	const void* b_next  = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
diff --git a/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c
index 69341320e..fc131b55d 100644
--- a/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c
@@ -40,11 +40,6 @@
 #if 0
 void bli_strsm_l_penryn_asm_8x4
      (
-       float*     restrict a11,
-       float*     restrict b11,
-       float*     restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
      )
 {
 }
@@ -52,11 +47,11 @@ void bli_strsm_l_penryn_asm_8x4
 
 void bli_dtrsm_l_penryn_asm_4x4
      (
-       double*    restrict a11,
-       double*    restrict b11,
-       double*    restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+       const double*    a11,
+             double*    b11,
+             double*    c11, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
diff --git a/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c
index 0befb4e4e..eb366cc9d 100644
--- a/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c
@@ -40,11 +40,6 @@
 #if 0
 void bli_strsm_u_penryn_asm_8x4
      (
-       float*     restrict a11,
-       float*     restrict b11,
-       float*     restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
      )
 {
 }
@@ -52,11 +47,11 @@ void bli_strsm_u_penryn_asm_8x4
 
 void bli_dtrsm_u_penryn_asm_4x4
      (
-       double*    restrict a11,
-       double*    restrict b11,
-       double*    restrict c11, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+       const double*    a11,
+             double*    b11,
+             double*    c11, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
diff --git a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
index 95ce7edeb..d6329e6c5 100644
--- a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
+++ b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
@@ -42,20 +42,20 @@
 
 void bli_sgemm_piledriver_asm_16x3
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       float*     restrict alpha,
-       float*     restrict a,
-       float*     restrict b,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const float*     alpha,
+       const float*     a,
+       const float*     b,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
-	void*   a_next = bli_auxinfo_next_a( data );
-	void*   b_next = bli_auxinfo_next_b( data );
+	const void* a_next = bli_auxinfo_next_a( data );
+	const void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
@@ -523,20 +523,20 @@ void bli_sgemm_piledriver_asm_16x3
 
 void bli_dgemm_piledriver_asm_8x3
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
-	void*   a_next = bli_auxinfo_next_a( data );
-	void*   b_next = bli_auxinfo_next_b( data );
+	const void* a_next = bli_auxinfo_next_a( data );
+	const void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
@@ -979,20 +979,20 @@ void bli_dgemm_piledriver_asm_8x3
 
 void bli_cgemm_piledriver_asm_4x2
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       scomplex*  restrict alpha,
-       scomplex*  restrict a,
-       scomplex*  restrict b,
-       scomplex*  restrict beta,
-       scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const scomplex*  alpha,
+       const scomplex*  a,
+       const scomplex*  b,
+       const scomplex*  beta,
+             scomplex*  c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
-	void*   a_next = bli_auxinfo_next_a( data );
-	void*   b_next = bli_auxinfo_next_b( data );
+	const void* a_next = bli_auxinfo_next_a( data );
+	const void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
@@ -1389,20 +1389,20 @@ void bli_cgemm_piledriver_asm_4x2
 
 void bli_zgemm_piledriver_asm_2x2
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       dcomplex*  restrict alpha,
-       dcomplex*  restrict a,
-       dcomplex*  restrict b,
-       dcomplex*  restrict beta,
-       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const dcomplex*  alpha,
+       const dcomplex*  a,
+       const dcomplex*  b,
+       const dcomplex*  beta,
+             dcomplex*  c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
-	void*   a_next = bli_auxinfo_next_a( data );
-	void*   b_next = bli_auxinfo_next_b( data );
+	const void* a_next = bli_auxinfo_next_a( data );
+	const void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
diff --git a/kernels/power10/3/bli_dgemm_power10_mma.c b/kernels/power10/3/bli_dgemm_power10_mma.c
index 67163b5a7..1b7196b43 100644
--- a/kernels/power10/3/bli_dgemm_power10_mma.c
+++ b/kernels/power10/3/bli_dgemm_power10_mma.c
@@ -62,16 +62,16 @@
 
 void bli_dgemm_power10_mma_8x8
     (
-        dim_t               m,
-        dim_t               n,
-        dim_t               k,
-        double*    restrict alpha,
-        double*    restrict a,
-        double*    restrict b,
-        double*    restrict beta,
-        double*    restrict c, inc_t rs_c0, inc_t cs_c,
-        auxinfo_t*          data,
-        cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
     )
 {
     // Typecast local copies of integers in case dim_t and inc_t are a
@@ -80,12 +80,13 @@ void bli_dgemm_power10_mma_8x8
     uint64_t k_left = k % 4;
 
     uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
 
     GEMM_UKR_SETUP_CT( d, 8, 8, true );
 
-    double* restrict A0 = a;
-    double* restrict B0 = b;
-    double* restrict C0 = c;
+    const double* restrict A0 = a;
+    const double* restrict B0 = b;
+          double* restrict C0 = c;
 
     double alpha_ = *alpha,
            beta_ = *beta;
diff --git a/kernels/power10/3/bli_i16gemm_power10_mma.c b/kernels/power10/3/bli_i16gemm_power10_mma.c
index cc1cd3d84..2a181a3c3 100644
--- a/kernels/power10/3/bli_i16gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i16gemm_power10_mma.c
@@ -57,16 +57,16 @@
 
 void bli_i16gemm_power10_mma_8x16
     (
-        dim_t               m,
-        dim_t               n,
-        dim_t               k,
-        int32_t*       restrict alpha,
-        short*     restrict a,
-        short*     restrict b,
-        int32_t*       restrict beta,
-        int32_t*       restrict c, inc_t rs_c0, inc_t cs_c0,
-        auxinfo_t*          data,
-        cntx_t*             cntx
+              dim_t      m,
+              dim_t      n,
+              dim_t      k,
+        const int32_t*   alpha,
+        const short*     a,
+        const short*     b,
+        const int32_t*   beta,
+              int32_t*   c, inc_t rs_c0, inc_t cs_c0,
+              auxinfo_t* data,
+        const cntx_t*    cntx
     )
 {
 
@@ -74,10 +74,11 @@ void bli_i16gemm_power10_mma_8x16
     uint64_t k_left = (k-1) % 4;
 
     uint64_t rs_c   = rs_c0;
+    //uint64_t cs_c   = cs_c0;
 
-    short* restrict A0 = a;
-    short* restrict B0 = b;
-    int*   restrict C0 = c;
+    const short* restrict A0 = a;
+    const short* restrict B0 = b;
+          int*   restrict C0 = c;
 
     int alpha_ = *alpha,
         beta_ = *beta;
diff --git a/kernels/power10/3/bli_i16sgemm_power10_mma.c b/kernels/power10/3/bli_i16sgemm_power10_mma.c
index 26da6cf79..cb4afbf16 100644
--- a/kernels/power10/3/bli_i16sgemm_power10_mma.c
+++ b/kernels/power10/3/bli_i16sgemm_power10_mma.c
@@ -57,16 +57,16 @@
 
 void bli_i16sgemm_power10_mma_8x16
     (
-        dim_t               m,
-        dim_t               n,
-        dim_t               k,
-        int32_t*       restrict alpha,
-        short*     restrict a,
-        short*     restrict b,
-        int32_t*       restrict beta,
-        int32_t*       restrict c, inc_t rs_c0, inc_t cs_c0,
-        auxinfo_t*          data,
-        cntx_t*             cntx
+              dim_t      m,
+              dim_t      n,
+              dim_t      k,
+        const int32_t*   alpha,
+        const short*     a,
+        const short*     b,
+        const int32_t*   beta,
+              int32_t*   c, inc_t rs_c0, inc_t cs_c0,
+              auxinfo_t* data,
+        const cntx_t*    cntx
     )
 {
 
@@ -74,10 +74,11 @@ void bli_i16sgemm_power10_mma_8x16
     uint64_t k_left = (k-1) % 4;
 
     uint64_t rs_c   = rs_c0;
+    //uint64_t cs_c   = cs_c0;
 
-    short* restrict A0 = a;
-    short* restrict B0 = b;
-    int*   restrict C0 = c;
+    const short* restrict A0 = a;
+    const short* restrict B0 = b;
+          int*   restrict C0 = c;
 
     int alpha_ = *alpha,
         beta_ = *beta;
diff --git a/kernels/power10/3/bli_i4gemm_power10_mma.c b/kernels/power10/3/bli_i4gemm_power10_mma.c
index a8d25d2da..da83c4b99 100644
--- a/kernels/power10/3/bli_i4gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i4gemm_power10_mma.c
@@ -57,16 +57,16 @@
 
 void bli_i4gemm_power10_mma_8x16
     (
-        dim_t               m,
-        dim_t               n,
-        dim_t               k,
-        int32_t*       restrict alpha,
-        nibbles*   restrict a,
-        nibbles*   restrict b,
-        int32_t*       restrict beta,
-        int32_t*       restrict c, inc_t rs_c0, inc_t cs_c0,
-        auxinfo_t*          data,
-        cntx_t*             cntx
+              dim_t      m,
+              dim_t      n,
+              dim_t      k,
+        const int32_t*   alpha,
+        const nibbles*   a,
+        const nibbles*   b,
+        const int32_t*   beta,
+              int32_t*   c, inc_t rs_c0, inc_t cs_c0,
+              auxinfo_t* data,
+        const cntx_t*    cntx
     )
 {
 
@@ -74,10 +74,11 @@ void bli_i4gemm_power10_mma_8x16
     uint64_t k_left = (k-1) % 4;
 
     uint64_t rs_c   = rs_c0;
+    //uint64_t cs_c   = cs_c0;
 
-    nibbles* restrict A0 = a;
-    nibbles* restrict B0 = b;
-    int*     restrict C0 = c;
+    const nibbles* restrict A0 = a;
+    const nibbles* restrict B0 = b;
+          int*     restrict C0 = c;
 
     int alpha_ = *alpha,
         beta_ = *beta;
diff --git a/kernels/power10/3/bli_i8gemm_power10_mma.c b/kernels/power10/3/bli_i8gemm_power10_mma.c
index 2948e10bf..f7609fed8 100644
--- a/kernels/power10/3/bli_i8gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i8gemm_power10_mma.c
@@ -57,26 +57,27 @@
 
 void bli_i8gemm_power10_mma_8x16
     (
-        dim_t               m,
-        dim_t               n,
-        dim_t               k,
-        int32_t*       restrict alpha,
-        int8_t*    restrict a,
-        int8_t*    restrict b,
-        int32_t*       restrict beta,
-        int32_t*       restrict c, inc_t rs_c0, inc_t cs_c0,
-        auxinfo_t*          data,
-        cntx_t*             cntx
+              dim_t      m,
+              dim_t      n,
+              dim_t      k,
+        const int32_t*   alpha,
+        const int8_t*    a,
+        const int8_t*    b,
+        const int32_t*   beta,
+              int32_t*   c, inc_t rs_c0, inc_t cs_c0,
+              auxinfo_t* data,
+        const cntx_t*    cntx
     )
 {
     uint64_t k_iter = (k-1) / 4;
     uint64_t k_left = (k-1) % 4;
 
     uint64_t rs_c   = rs_c0;
+    //uint64_t cs_c   = cs_c0;
 
-    int8_t* restrict A0 = a;
-    int8_t* restrict B0 = b;
-    int*    restrict C0 = c;
+    const int8_t* restrict A0 = a;
+    const int8_t* restrict B0 = b;
+          int*    restrict C0 = c;
 
     int alpha_ = *alpha,
         beta_ = *beta;
diff --git a/kernels/power10/3/bli_sbgemm_power10_mma.c b/kernels/power10/3/bli_sbgemm_power10_mma.c
index e68c5bed9..881529927 100644
--- a/kernels/power10/3/bli_sbgemm_power10_mma.c
+++ b/kernels/power10/3/bli_sbgemm_power10_mma.c
@@ -58,16 +58,16 @@
 
 void bli_sbgemm_power10_mma_8x16
     (
-        dim_t               m,
-        dim_t               n,
-        dim_t               k,
-        float*     restrict alpha,
-        bfloat16*  restrict a,
-        bfloat16*  restrict b,
-        float*     restrict beta,
-        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-        auxinfo_t*          data,
-        cntx_t*             cntx
+              dim_t      m,
+              dim_t      n,
+              dim_t      k,
+        const float*     alpha,
+        const bfloat16*  a,
+        const bfloat16*  b,
+        const float*     beta,
+              float*     c, inc_t rs_c0, inc_t cs_c0,
+              auxinfo_t* data,
+        const cntx_t*    cntx
     )
 {
 
@@ -75,10 +75,11 @@ void bli_sbgemm_power10_mma_8x16
     uint64_t k_left = (k-1)%4;
 
     uint64_t rs_c   = rs_c0;
+    //uint64_t cs_c   = cs_c0;
 
-    bfloat16* restrict A0 = a;
-    bfloat16* restrict B0 = b;
-    float* restrict C0 = c;
+    const bfloat16* restrict A0 = a;
+    const bfloat16* restrict B0 = b;
+          float*    restrict C0 = c;
 
     float alpha_= *alpha,
           beta_ = *beta;
diff --git a/kernels/power10/3/bli_sgemm_power10_mma.c b/kernels/power10/3/bli_sgemm_power10_mma.c
index 3ccee7cbd..fd64c8cc0 100644
--- a/kernels/power10/3/bli_sgemm_power10_mma.c
+++ b/kernels/power10/3/bli_sgemm_power10_mma.c
@@ -55,16 +55,16 @@
 
 void bli_sgemm_power10_mma_8x16
     (
-        dim_t               m,
-        dim_t               n,
-        dim_t               k,
-        float*     restrict alpha,
-        float*     restrict a,
-        float*     restrict b,
-        float*     restrict beta,
-        float*     restrict c, inc_t rs_c0, inc_t cs_c,
-        auxinfo_t*          data,
-        cntx_t*             cntx
+              dim_t      m,
+              dim_t      n,
+              dim_t      k,
+        const float*     alpha,
+        const float*     a,
+        const float*     b,
+        const float*     beta,
+              float*     c, inc_t rs_c0, inc_t cs_c0,
+              auxinfo_t* data,
+        const cntx_t*    cntx
     )
 {
     // Typecast local copies of integers in case dim_t and inc_t are a
@@ -73,11 +73,12 @@ void bli_sgemm_power10_mma_8x16
     uint64_t k_left = k % 4;
 
     uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
 
     GEMM_UKR_SETUP_CT( s, 8, 16, true );
 
     fv4sf_t result[4];
-      fv4sf_t *rowC;
+    fv4sf_t *rowC;
 
     // accumulators that will hold the matrix product
     __vector_quad acc0, acc1, acc2, acc3,
@@ -93,9 +94,9 @@ void bli_sgemm_power10_mma_8x16
     __builtin_mma_xxsetaccz(&acc6);
     __builtin_mma_xxsetaccz(&acc7);
 
-    float* restrict A0 = a;
-    float* restrict B0 = b;
-    float* restrict C0 = c;
+    const float* restrict A0 = a;
+    const float* restrict B0 = b;
+          float* restrict C0 = c;
 
     float alpha_ = *alpha,
           beta_  = *beta;
diff --git a/kernels/power10/3/bli_shgemm_power10_mma.c b/kernels/power10/3/bli_shgemm_power10_mma.c
index 9c7f9f741..af11befcc 100644
--- a/kernels/power10/3/bli_shgemm_power10_mma.c
+++ b/kernels/power10/3/bli_shgemm_power10_mma.c
@@ -58,16 +58,16 @@
 
 void bli_shgemm_power10_mma_8x16
     (
-        dim_t              m,
-        dim_t              n,
-        dim_t              k,
-        float*     restrict alpha,
-        float16*  restrict a,
-        float16*  restrict b,
-        float*     restrict beta,
-        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-        auxinfo_t*          data,
-        cntx_t*             cntx
+              dim_t      m,
+              dim_t      n,
+              dim_t      k,
+        const float*     alpha,
+        const float16*   a,
+        const float16*   b,
+        const float*     beta,
+              float*     c, inc_t rs_c0, inc_t cs_c0,
+              auxinfo_t* data,
+        const cntx_t*    cntx
     )
 {
 
@@ -75,10 +75,11 @@ void bli_shgemm_power10_mma_8x16
     uint64_t k_left = (k-1)%4;
 
     uint64_t rs_c   = rs_c0;
+    //uint64_t cs_c   = cs_c0;
 
-    float16* restrict A0 = a;
-    float16* restrict B0 = b;
-    float* restrict C0 = c;
+    const float16* restrict A0 = a;
+    const float16* restrict B0 = b;
+          float* restrict C0 = c;
 
     float alpha_= *alpha,
           beta_ = *beta;
diff --git a/kernels/power7/3/bli_gemm_power7_int_8x4.c b/kernels/power7/3/bli_gemm_power7_int_8x4.c
index 8ca0c891e..c1f0cf7c5 100644
--- a/kernels/power7/3/bli_gemm_power7_int_8x4.c
+++ b/kernels/power7/3/bli_gemm_power7_int_8x4.c
@@ -50,16 +50,16 @@
  */
 void bli_sgemm_power7_int_8x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       float*     restrict alpha,
-       float*     restrict a,
-       float*     restrict b,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+              dim_t      m,
+              dim_t      n,
+              dim_t      k,
+        const float*     alpha,
+        const float*     a,
+        const float*     b,
+        const float*     beta,
+              float*     c, inc_t rs_c, inc_t cs_c,
+              auxinfo_t* data,
+        const cntx_t*    cntx
      )
 {
 #if 1 || defined(UTEST)
@@ -92,16 +92,16 @@ void bli_sgemm_power7_int_8x4
  */
 void bli_dgemm_power7_int_8x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
     if ( cs_c == 1 )
@@ -449,16 +449,16 @@ void bli_dgemm_power7_int_8x4
  */
 void bli_cgemm_power7_int_8x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       scomplex*  restrict alpha,
-       scomplex*  restrict a,
-       scomplex*  restrict b,
-       scomplex*  restrict beta,
-       scomplex*  restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const scomplex*  alpha,
+       const scomplex*  a,
+       const scomplex*  b,
+       const scomplex*  beta,
+             scomplex*  c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 #if 1 || defined(UTEST)
@@ -502,16 +502,16 @@ void bli_cgemm_power7_int_8x4
  */
 void bli_zgemm_power7_int_8x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       scomplex*  restrict alpha,
-       scomplex*  restrict a,
-       scomplex*  restrict b,
-       scomplex*  restrict beta,
-       scomplex*  restrict c, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const dcomplex*  alpha,
+       const dcomplex*  a,
+       const dcomplex*  b,
+       const dcomplex*  beta,
+             dcomplex*  c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 #if 1 || defined(UTEST)
diff --git a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c
index 70af2b17e..3d785bcdf 100644
--- a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c
+++ b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c
@@ -37,16 +37,16 @@
 
 void bli_dgemm_power9_asm_12x6
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	// Typecast local copies of integers in case dim_t and inc_t are a
diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c
index 051af62e7..111d02a0e 100644
--- a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c
+++ b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c
@@ -42,16 +42,16 @@
 
 void bli_sgemm_sandybridge_asm_8x8
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       float*     restrict alpha,
-       float*     restrict a,
-       float*     restrict b,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const float*     alpha,
+       const float*     a,
+       const float*     b,
+       const float*     beta,
+             float*     c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
@@ -534,20 +534,20 @@ void bli_sgemm_sandybridge_asm_8x8
 
 void bli_dgemm_sandybridge_asm_8x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
-	//void*   a_next = bli_auxinfo_next_a( data );
-	void*   b_next = bli_auxinfo_next_b( data );
+	//const void*   a_next = bli_auxinfo_next_a( data );
+	const void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
@@ -996,20 +996,20 @@ void bli_dgemm_sandybridge_asm_8x4
 
 void bli_cgemm_sandybridge_asm_8x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       scomplex*  restrict alpha,
-       scomplex*  restrict a,
-       scomplex*  restrict b,
-       scomplex*  restrict beta,
-       scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const scomplex*  alpha,
+       const scomplex*  a,
+       const scomplex*  b,
+       const scomplex*  beta,
+             scomplex*  c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
-	//void*   a_next = bli_auxinfo_next_a( data );
-	void*   b_next = bli_auxinfo_next_b( data );
+	//const void*   a_next = bli_auxinfo_next_a( data );
+	const void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
@@ -1699,16 +1699,16 @@ void bli_cgemm_sandybridge_asm_8x4
 
 void bli_zgemm_sandybridge_asm_4x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       dcomplex*  restrict alpha,
-       dcomplex*  restrict a,
-       dcomplex*  restrict b,
-       dcomplex*  restrict beta,
-       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const dcomplex*  alpha,
+       const dcomplex*  a,
+       const dcomplex*  b,
+       const dcomplex*  beta,
+             dcomplex*  c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 	//void*   a_next = bli_auxinfo_next_a( data );
diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c
index cb1cdc7c2..e2685da2b 100644
--- a/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c
+++ b/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c
@@ -40,16 +40,6 @@
 #if 0
 void bli_sgemm_sandybridge_int_8x8
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       float*     restrict alpha,
-       float*     restrict a,
-       float*     restrict b,
-       float*     restrict beta,
-       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
      )
 {
 }
@@ -57,21 +47,21 @@ void bli_sgemm_sandybridge_int_8x8
 
 void bli_dgemm_sandybridge_int_8x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       double*    restrict alpha,
-       double*    restrict a,
-       double*    restrict b,
-       double*    restrict beta,
-       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c0, inc_t cs_c0,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
 
-	//void* a_next = bli_auxinfo_next_a( data );
-	void* b_next = bli_auxinfo_next_b( data );
+	//const void* a_next = bli_auxinfo_next_a( data );
+	const void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
@@ -495,16 +485,6 @@ void bli_dgemm_sandybridge_int_8x4
 #if 0
 void bli_cgemm_sandybridge_int_8x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       scomplex*  restrict alpha,
-       scomplex*  restrict a,
-       scomplex*  restrict b,
-       scomplex*  restrict beta,
-       scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
      )
 {
 }
@@ -515,16 +495,6 @@ void bli_cgemm_sandybridge_int_8x4
 #if 0
 void bli_zgemm_sandybridge_int_4x4
      (
-       dim_t               m,
-       dim_t               n,
-       dim_t               k,
-       dcomplex*  restrict alpha,
-       dcomplex*  restrict a,
-       dcomplex*  restrict b,
-       dcomplex*  restrict beta,
-       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*          data,
-       cntx_t*             cntx
      )
 {
 }
diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
index 2579ac4b5..05e7c59e2 100644
--- a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
+++ b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
@@ -282,23 +282,25 @@ ahead*/
     VMOVAPD(ZMM(0), MEM(RAX,(16*n+0)*8)) \
     VMOVAPD(ZMM(1), MEM(RAX,(16*n+8)*8))
 
+#if 0
 //This is an array used for the scatter/gather instructions.
 static int64_t offsets[16] __attribute__((aligned(64))) =
     { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15};
+#endif
 
 
 void bli_dgemm_skx_asm_16x12_l2
      (
-       dim_t            m,
-       dim_t            n,
-       dim_t            k_,
-       double* restrict alpha,
-       double* restrict a,
-       double* restrict b,
-       double* restrict beta,
-       double* restrict c, inc_t rs_c_, inc_t cs_c_,
-       auxinfo_t*       data,
-       cntx_t*          cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k_,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c_, inc_t cs_c_,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
     (void)data;
diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c
index babb89a1d..0b5f178c6 100644
--- a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c
+++ b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c
@@ -149,22 +149,24 @@
     VMOVAPD(ZMM(0), MEM(RAX,(16*n+0)*8)) \
     VMOVAPD(ZMM(1), MEM(RAX,(16*n+8)*8))
 
+#if 0
 //This is an array used for the scatter/gather instructions.
 static int64_t offsets[16] __attribute__((aligned(64))) =
     { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15};
+#endif
 
 void bli_dgemm_skx_asm_16x14
      (
-       dim_t            m,
-       dim_t            n,
-       dim_t            k_,
-       double* restrict alpha,
-       double* restrict a,
-       double* restrict b,
-       double* restrict beta,
-       double* restrict c, inc_t rs_c_, inc_t cs_c_,
-       auxinfo_t*       data,
-       cntx_t*          cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k_,
+       const double*    alpha,
+       const double*    a,
+       const double*    b,
+       const double*    beta,
+             double*    c, inc_t rs_c_, inc_t cs_c_,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
     (void)data;
diff --git a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
index 99b850d1d..b04ced575 100644
--- a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
+++ b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
@@ -313,22 +313,24 @@ ahead*/
     VMOVAPD(ZMM(0), MEM(RAX,(32*n+0)*4)) \
     VMOVAPD(ZMM(1), MEM(RAX,(32*n+16)*4))
 
+#if 0
 //This is an array used for the scatter/gather instructions.
 static int64_t offsets[16] __attribute__((aligned(64))) =
     { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15};
+#endif
 
 void bli_sgemm_skx_asm_32x12_l2
      (
-       dim_t            m,
-       dim_t            n,
-       dim_t            k_,
-       float* restrict alpha,
-       float* restrict a,
-       float* restrict b,
-       float* restrict beta,
-       float* restrict c, inc_t rs_c_, inc_t cs_c_,
-       auxinfo_t*       data,
-       cntx_t*          cntx
+             dim_t      m,
+             dim_t      n,
+             dim_t      k_,
+       const float*     alpha,
+       const float*     a,
+       const float*     b,
+       const float*     beta,
+             float*     c, inc_t rs_c_, inc_t cs_c_,
+             auxinfo_t* data,
+       const cntx_t*    cntx
      )
 {
     (void)data;
diff --git a/kernels/zen/1/bli_amaxv_zen_int.c b/kernels/zen/1/bli_amaxv_zen_int.c
index d1263a6c1..8aafa542f 100644
--- a/kernels/zen/1/bli_amaxv_zen_int.c
+++ b/kernels/zen/1/bli_amaxv_zen_int.c
@@ -101,14 +101,14 @@ typedef union
 
 void bli_samaxv_zen_int
      (
-       dim_t            n,
-       float*  restrict x, inc_t incx,
-       dim_t*  restrict i_max,
-       cntx_t*          cntx
+             dim_t   n,
+       const float*  x, inc_t incx,
+             dim_t*  i_max,
+       const cntx_t* cntx
      )
 {
-	float*  minus_one = PASTEMAC(s,m1);
-	dim_t*  zero_i    = PASTEMAC(i,0);
+	const float* restrict minus_one = PASTEMAC(s,m1);
+	const dim_t* restrict zero_i    = PASTEMAC(i,0);
 
 	float   chi1_r;
 	//float   chi1_i;
@@ -139,7 +139,7 @@ void bli_samaxv_zen_int
 	{
 		for ( i = 0; i < n; ++i )
 		{
-			float* chi1 = x + (i  )*incx;
+			const float* restrict chi1 = x + (i  )*incx;
 
 			/* Get the real and imaginary components of chi1. */
 			chi1_r = *chi1;
@@ -229,7 +229,7 @@ void bli_samaxv_zen_int
 
 		for ( i = n - n_left; i < n; i++ )
 		{
-			float* chi1 = x;
+			const float* restrict chi1 = x;
 
 			/* Get the real and imaginary components of chi1. */
 			chi1_r = *chi1;
@@ -266,14 +266,14 @@ void bli_samaxv_zen_int
 
 void bli_damaxv_zen_int
      (
-       dim_t            n,
-       double* restrict x, inc_t incx,
-       dim_t*  restrict i_max,
-       cntx_t*          cntx
+             dim_t   n,
+       const double* x, inc_t incx,
+             dim_t*  i_max,
+       const cntx_t* cntx
      )
 {
-	double* minus_one = PASTEMAC(d,m1);
-	dim_t*  zero_i    = PASTEMAC(i,0);
+	const double* restrict minus_one = PASTEMAC(d,m1);
+	const dim_t*  restrict zero_i    = PASTEMAC(i,0);
 
 	double  chi1_r;
 	//double  chi1_i;
@@ -304,7 +304,7 @@ void bli_damaxv_zen_int
 	{
 		for ( i = 0; i < n; ++i )
 		{
-			double* chi1 = x + (i  )*incx;
+			const double* restrict chi1 = x + (i  )*incx;
 
 			/* Get the real and imaginary components of chi1. */
 			chi1_r = *chi1;
@@ -386,7 +386,7 @@ void bli_damaxv_zen_int
 
 		for ( i = n - n_left; i < n; i++ )
 		{
-			double* chi1 = x;
+			const double* restrict chi1 = x;
 
 			/* Get the real and imaginary components of chi1. */
 			chi1_r = *chi1;
diff --git a/kernels/zen/1/bli_axpyv_zen_int.c b/kernels/zen/1/bli_axpyv_zen_int.c
index b842c59ed..af2925c2f 100644
--- a/kernels/zen/1/bli_axpyv_zen_int.c
+++ b/kernels/zen/1/bli_axpyv_zen_int.c
@@ -57,12 +57,12 @@ typedef union
 
 void bli_saxpyv_zen_int
      (
-       conj_t           conjx,
-       dim_t            n,
-       float*  restrict alpha,
-       float*  restrict x, inc_t incx,
-       float*  restrict y, inc_t incy,
-       cntx_t*          cntx
+             conj_t  conjx,
+             dim_t   n,
+       const float*  alpha,
+       const float*  x, inc_t incx,
+             float*  y, inc_t incy,
+       const cntx_t* cntx
      )
 {
 	const dim_t      n_elem_per_reg = 8;
@@ -72,9 +72,6 @@ void bli_saxpyv_zen_int
 	dim_t            n_viter;
 	dim_t            n_left;
 
-	float*  restrict x0;
-	float*  restrict y0;
-
 	v8sf_t           alphav;
 	v8sf_t           x0v, x1v, x2v, x3v;
 	v8sf_t           y0v, y1v, y2v, y3v;
@@ -97,8 +94,8 @@ void bli_saxpyv_zen_int
 	}
 
 	// Initialize local pointers.
-	x0 = x;
-	y0 = y;
+	const float* restrict x0 = x;
+	      float* restrict y0 = y;
 
 	// Broadcast the alpha scalar to all elements of a vector register.
 	alphav.v = _mm256_broadcast_ss( alpha );
@@ -161,12 +158,12 @@ void bli_saxpyv_zen_int
 
 void bli_daxpyv_zen_int
      (
-       conj_t           conjx,
-       dim_t            n,
-       double* restrict alpha,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       cntx_t*          cntx
+             conj_t  conjx,
+             dim_t   n,
+       const double* alpha,
+       const double* x, inc_t incx,
+             double* y, inc_t incy,
+       const cntx_t* cntx
      )
 {
 	const dim_t       n_elem_per_reg = 4;
@@ -176,9 +173,6 @@ void bli_daxpyv_zen_int
 	dim_t             n_viter;
 	dim_t             n_left;
 
-	double*  restrict x0;
-	double*  restrict y0;
-
 	v4df_t            alphav;
 	v4df_t            x0v, x1v, x2v, x3v;
 	v4df_t            y0v, y1v, y2v, y3v;
@@ -201,8 +195,8 @@ void bli_daxpyv_zen_int
 	}
 
 	// Initialize local pointers.
-	x0 = x;
-	y0 = y;
+	const double* restrict x0 = x;
+	      double* restrict y0 = y;
 
 	// Broadcast the alpha scalar to all elements of a vector register.
 	alphav.v = _mm256_broadcast_sd( alpha );
diff --git a/kernels/zen/1/bli_axpyv_zen_int10.c b/kernels/zen/1/bli_axpyv_zen_int10.c
index 6ad6d30cf..abd63a52f 100644
--- a/kernels/zen/1/bli_axpyv_zen_int10.c
+++ b/kernels/zen/1/bli_axpyv_zen_int10.c
@@ -57,21 +57,18 @@ typedef union
 
 void bli_saxpyv_zen_int10
      (
-       conj_t           conjx,
-       dim_t            n,
-       float*  restrict alpha,
-       float*  restrict x, inc_t incx,
-       float*  restrict y, inc_t incy,
-       cntx_t*          cntx
+             conj_t  conjx,
+             dim_t   n,
+       const float*  alpha,
+       const float*  x, inc_t incx,
+             float*  y, inc_t incy,
+       const cntx_t* cntx
      )
 {
 	const dim_t      n_elem_per_reg = 8;
 
 	dim_t            i;
 
-	float*  restrict x0;
-	float*  restrict y0;
-
 	__m256           alphav;
 	__m256           xv[10];
 	__m256           yv[10];
@@ -81,8 +78,8 @@ void bli_saxpyv_zen_int10
 	if ( bli_zero_dim1( n ) || PASTEMAC(s,eq0)( *alpha ) ) return;
 
 	// Initialize local pointers.
-	x0 = x;
-	y0 = y;
+	const float* restrict x0 = x;
+	      float* restrict y0 = y;
 
 	if ( incx == 1 && incy == 1 )
 	{
@@ -263,21 +260,18 @@ void bli_saxpyv_zen_int10
 
 void bli_daxpyv_zen_int10
      (
-       conj_t           conjx,
-       dim_t            n,
-       double* restrict alpha,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       cntx_t*          cntx
+             conj_t  conjx,
+             dim_t   n,
+       const double* alpha,
+       const double* x, inc_t incx,
+             double* y, inc_t incy,
+       const cntx_t* cntx
      )
 {
 	const dim_t      n_elem_per_reg = 4;
 
 	dim_t            i;
 
-	double* restrict x0 = x;
-	double* restrict y0 = y;
-
 	__m256d          alphav;
 	__m256d          xv[10];
 	__m256d          yv[10];
@@ -287,8 +281,8 @@ void bli_daxpyv_zen_int10
 	if ( bli_zero_dim1( n ) || PASTEMAC(d,eq0)( *alpha ) ) return;
 
 	// Initialize local pointers.
-	x0 = x;
-	y0 = y;
+	const double* restrict x0 = x;
+	      double* restrict y0 = y;
 
 	if ( incx == 1 && incy == 1 )
 	{
diff --git a/kernels/zen/1/bli_copyv_zen_int.c b/kernels/zen/1/bli_copyv_zen_int.c
index 6307b5341..ddf16e15d 100644
--- a/kernels/zen/1/bli_copyv_zen_int.c
+++ b/kernels/zen/1/bli_copyv_zen_int.c
@@ -39,11 +39,11 @@
 
 void bli_scopyv_zen_int
      (
-       conj_t           conjx,
-       dim_t            n,
-       float*  restrict x, inc_t incx,
-       float*  restrict y, inc_t incy,
-       cntx_t*          cntx
+             conj_t  conjx,
+             dim_t   n,
+       const float*  x, inc_t incx,
+             float*  y, inc_t incy,
+       const cntx_t* cntx
      )
 {
 	const dim_t num_elem_per_reg = 8;
@@ -188,11 +188,11 @@ void bli_scopyv_zen_int
 
 void bli_dcopyv_zen_int
      (
-       conj_t           conjx,
-       dim_t            n,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       cntx_t*          cntx
+             conj_t  conjx,
+             dim_t   n,
+       const double* x, inc_t incx,
+             double* y, inc_t incy,
+       const cntx_t* cntx
      )
 {
 	const dim_t num_elem_per_reg = 4;
diff --git a/kernels/zen/1/bli_dotv_zen_int.c b/kernels/zen/1/bli_dotv_zen_int.c
index 03c448f85..3a03a6eed 100644
--- a/kernels/zen/1/bli_dotv_zen_int.c
+++ b/kernels/zen/1/bli_dotv_zen_int.c
@@ -56,13 +56,13 @@ typedef union
 
 void bli_sdotv_zen_int
      (
-       conj_t           conjx,
-       conj_t           conjy,
-       dim_t            n,
-       float*  restrict x, inc_t incx,
-       float*  restrict y, inc_t incy,
-       float*  restrict rho,
-       cntx_t*          cntx
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   n,
+       const float*  x, inc_t incx,
+       const float*  y, inc_t incy,
+             float*  rho,
+       const cntx_t* cntx
      )
 {
 	const dim_t      n_elem_per_reg = 8;
@@ -72,8 +72,6 @@ void bli_sdotv_zen_int
 	dim_t            n_viter;
 	dim_t            n_left;
 
-	float*  restrict x0;
-	float*  restrict y0;
 	float            rho0;
 
 	v8sf_t           rho0v, rho1v, rho2v, rho3v;
@@ -104,8 +102,8 @@ void bli_sdotv_zen_int
 	}
 
 	// Initialize local pointers.
-	x0 = x;
-	y0 = y;
+	const float* restrict x0 = x;
+	const float* restrict y0 = y;
 
 	// Initialize the local scalar rho1 to zero.
 	PASTEMAC(s,set0s)( rho0 );
@@ -178,13 +176,13 @@ void bli_sdotv_zen_int
 
 void bli_ddotv_zen_int
      (
-       conj_t           conjx,
-       conj_t           conjy,
-       dim_t            n,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       double* restrict rho,
-       cntx_t*          cntx
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   n,
+       const double* x, inc_t incx,
+       const double* y, inc_t incy,
+             double* rho,
+       const cntx_t* cntx
      )
 {
 	const dim_t      n_elem_per_reg = 4;
@@ -194,8 +192,6 @@ void bli_ddotv_zen_int
 	dim_t            n_viter;
 	dim_t            n_left;
 
-	double* restrict x0;
-	double* restrict y0;
 	double           rho0;
 
 	v4df_t           rho0v, rho1v, rho2v, rho3v;
@@ -226,8 +222,8 @@ void bli_ddotv_zen_int
 	}
 
 	// Initialize local pointers.
-	x0 = x;
-	y0 = y;
+	const double* restrict x0 = x;
+	const double* restrict y0 = y;
 
 	// Initialize the local scalar rho1 to zero.
 	PASTEMAC(d,set0s)( rho0 );
diff --git a/kernels/zen/1/bli_dotv_zen_int10.c b/kernels/zen/1/bli_dotv_zen_int10.c
index f3fe5ea71..62d95b500 100644
--- a/kernels/zen/1/bli_dotv_zen_int10.c
+++ b/kernels/zen/1/bli_dotv_zen_int10.c
@@ -57,22 +57,19 @@ typedef union
 
 void bli_sdotv_zen_int10
      (
-       conj_t           conjx,
-       conj_t           conjy,
-       dim_t            n,
-       float*  restrict x, inc_t incx,
-       float*  restrict y, inc_t incy,
-       float*  restrict rho,
-       cntx_t*          cntx
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   n,
+       const float*  x, inc_t incx,
+       const float*  y, inc_t incy,
+             float*  rho,
+       const cntx_t* cntx
      )
 {
 	const dim_t      n_elem_per_reg = 8;
 
 	dim_t            i;
 
-	float*  restrict x0;
-	float*  restrict y0;
-
 	float            rho0 = 0.0;
 
 	__m256           xv[10];
@@ -87,8 +84,8 @@ void bli_sdotv_zen_int10
 	}
 
 	// Initialize local pointers.
-	x0 = x;
-	y0 = y;
+	const float* restrict x0 = x;
+	const float* restrict y0 = y;
 
 	PASTEMAC(s,set0s)( rho0 );
 
@@ -248,22 +245,19 @@ void bli_sdotv_zen_int10
 
 void bli_ddotv_zen_int10
      (
-       conj_t           conjx,
-       conj_t           conjy,
-       dim_t            n,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       double* restrict rho,
-       cntx_t*          cntx
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   n,
+       const double* x, inc_t incx,
+       const double* y, inc_t incy,
+             double* rho,
+       const cntx_t* cntx
      )
 {
 	const dim_t      n_elem_per_reg = 4;
 
 	dim_t            i;
 
-	double* restrict x0;
-	double* restrict y0;
-
 	double           rho0 = 0.0;
 
 	__m256d          xv[10];
@@ -278,8 +272,8 @@ void bli_ddotv_zen_int10
 	}
 
 	// Initialize local pointers.
-	x0 = x;
-	y0 = y;
+	const double* restrict x0 = x;
+	const double* restrict y0 = y;
 
 	PASTEMAC(d,set0s)( rho0 );
 
diff --git a/kernels/zen/1/bli_dotxv_zen_int.c b/kernels/zen/1/bli_dotxv_zen_int.c
index 48a9878a7..2ae22cdf2 100644
--- a/kernels/zen/1/bli_dotxv_zen_int.c
+++ b/kernels/zen/1/bli_dotxv_zen_int.c
@@ -56,15 +56,15 @@ typedef union
 
 void bli_sdotxv_zen_int
      (
-       conj_t           conjx,
-       conj_t           conjy,
-       dim_t            n,
-       float*  restrict alpha,
-       float*  restrict x, inc_t incx,
-       float*  restrict y, inc_t incy,
-       float*  restrict beta,
-       float*  restrict rho,
-       cntx_t*          cntx
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   n,
+       const float*  alpha,
+       const float*  x, inc_t incx,
+       const float*  y, inc_t incy,
+       const float*  beta,
+             float*  rho,
+       const cntx_t* cntx
      )
 {
 	const dim_t      n_elem_per_reg = 8;
@@ -74,8 +74,6 @@ void bli_sdotxv_zen_int
 	dim_t            n_viter;
 	dim_t            n_left;
 
-	float*  restrict x0;
-	float*  restrict y0;
 	float            rho0;
 
 	v8sf_t           rho0v, rho1v, rho2v, rho3v;
@@ -113,8 +111,8 @@ void bli_sdotxv_zen_int
 	}
 
 	// Initialize local pointers.
-	x0 = x;
-	y0 = y;
+	const float* restrict x0 = x;
+	const float* restrict y0 = y;
 
 	// Initialize the unrolled iterations' rho vectors to zero.
 	rho0v.v = _mm256_setzero_ps();
@@ -184,15 +182,15 @@ void bli_sdotxv_zen_int
 
 void bli_ddotxv_zen_int
      (
-       conj_t           conjx,
-       conj_t           conjy,
-       dim_t            n,
-       double* restrict alpha,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       double* restrict beta,
-       double* restrict rho,
-       cntx_t*          cntx
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   n,
+       const double* alpha,
+       const double* x, inc_t incx,
+       const double* y, inc_t incy,
+       const double* beta,
+             double* rho,
+       const cntx_t* cntx
      )
 {
 	const dim_t      n_elem_per_reg = 4;
@@ -202,8 +200,6 @@ void bli_ddotxv_zen_int
 	dim_t            n_viter;
 	dim_t            n_left;
 
-	double* restrict x0;
-	double* restrict y0;
 	double           rho0;
 
 	v4df_t           rho0v, rho1v, rho2v, rho3v;
@@ -241,8 +237,8 @@ void bli_ddotxv_zen_int
 	}
 
 	// Initialize local pointers.
-	x0 = x;
-	y0 = y;
+	const double* restrict x0 = x;
+	const double* restrict y0 = y;
 
 	// Initialize the unrolled iterations' rho vectors to zero.
 	rho0v.v = _mm256_setzero_pd();
diff --git a/kernels/zen/1/bli_scalv_zen_int.c b/kernels/zen/1/bli_scalv_zen_int.c
index f92cb0c6c..2c521d165 100644
--- a/kernels/zen/1/bli_scalv_zen_int.c
+++ b/kernels/zen/1/bli_scalv_zen_int.c
@@ -57,11 +57,11 @@ typedef union
 
 void bli_sscalv_zen_int
      (
-       conj_t           conjalpha,
-       dim_t            n,
-       float*  restrict alpha,
-       float*  restrict x, inc_t incx,
-       cntx_t*          cntx
+             conj_t  conjalpha,
+             dim_t   n,
+       const float*  alpha,
+             float*  x, inc_t incx,
+       const cntx_t* cntx
      )
 {
 	const dim_t      n_elem_per_reg = 8;
@@ -156,11 +156,11 @@ void bli_sscalv_zen_int
 
 void bli_dscalv_zen_int
      (
-       conj_t           conjalpha,
-       dim_t            n,
-       double* restrict alpha,
-       double* restrict x, inc_t incx,
-       cntx_t*          cntx
+             conj_t  conjalpha,
+             dim_t   n,
+       const double* alpha,
+             double* x, inc_t incx,
+       const cntx_t* cntx
      )
 {
 	const dim_t       n_elem_per_reg = 4;
diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c
index 7487880b8..fbde870e3 100644
--- a/kernels/zen/1/bli_scalv_zen_int10.c
+++ b/kernels/zen/1/bli_scalv_zen_int10.c
@@ -57,11 +57,11 @@ typedef union
 
 void bli_sscalv_zen_int10
      (
-       conj_t           conjalpha,
-       dim_t            n,
-       float*  restrict alpha,
-       float*  restrict x, inc_t incx,
-       cntx_t*          cntx
+             conj_t  conjalpha,
+             dim_t   n,
+       const float*  alpha,
+             float*  x, inc_t incx,
+       const cntx_t* cntx
      )
 {
 	const dim_t      n_elem_per_reg = 8;
@@ -251,11 +251,11 @@ void bli_sscalv_zen_int10
 
 void bli_dscalv_zen_int10
      (
-       conj_t           conjalpha,
-       dim_t            n,
-       double* restrict alpha,
-       double* restrict x, inc_t incx,
-       cntx_t*          cntx
+             conj_t  conjalpha,
+             dim_t   n,
+       const double* alpha,
+             double* x, inc_t incx,
+       const cntx_t* cntx
      )
 {
 	const dim_t      n_elem_per_reg = 4;
@@ -450,11 +450,11 @@ void bli_dscalv_zen_int10
 
 void bli_cscalv_zen_int10
      (
-       conj_t             conjalpha,
-       dim_t              n,
-       scomplex* restrict alpha,
-       scomplex* restrict x, inc_t incx,
-       cntx_t*   restrict cntx
+             conj_t  conjalpha,
+             dim_t   n,
+       const scomplex* alpha,
+             scomplex* x, inc_t incx,
+       const cntx_t* cntx
      )
 {
 	const num_t dt = BLIS_SCOMPLEX;
diff --git a/kernels/zen/1/bli_setv_zen_int.c b/kernels/zen/1/bli_setv_zen_int.c
index 0fbc24cfd..c706f39ef 100644
--- a/kernels/zen/1/bli_setv_zen_int.c
+++ b/kernels/zen/1/bli_setv_zen_int.c
@@ -39,11 +39,11 @@
 
 void bli_ssetv_zen_int
      (
-       conj_t           conjalpha,
-       dim_t            n,
-       float*  restrict alpha,
-       float*  restrict x, inc_t incx,
-       cntx_t*          cntx
+             conj_t  conjalpha,
+             dim_t   n,
+       const float*  alpha,
+             float*  x, inc_t incx,
+       const cntx_t* cntx
      )
 {
 	const dim_t num_elem_per_reg = 8;
@@ -132,13 +132,13 @@ void bli_ssetv_zen_int
 	}
 }
 
-void  bli_dsetv_zen_int
+void bli_dsetv_zen_int
      (
-       conj_t           conjalpha,
-       dim_t            n,
-       double* restrict alpha,
-       double* restrict x, inc_t incx,
-       cntx_t*          cntx
+             conj_t  conjalpha,
+             dim_t   n,
+       const double* alpha,
+             double* x, inc_t incx,
+       const cntx_t* cntx
      )
 {
 	const dim_t num_elem_per_reg = 4;
diff --git a/kernels/zen/1/bli_swapv_zen_int8.c b/kernels/zen/1/bli_swapv_zen_int8.c
index 824fd0fb8..e62474be2 100644
--- a/kernels/zen/1/bli_swapv_zen_int8.c
+++ b/kernels/zen/1/bli_swapv_zen_int8.c
@@ -56,27 +56,24 @@ typedef union
 
 void bli_sswapv_zen_int8
      (
-       dim_t            n,
-       float*  restrict x, inc_t incx,
-       float*  restrict y, inc_t incy,
-       cntx_t*          cntx
+             dim_t   n,
+             float*  x, inc_t incx,
+             float*  y, inc_t incy,
+       const cntx_t* cntx
      )
 {
 
 	const dim_t     n_elem_per_reg = 8;
 	dim_t           i = 0;
 
-	float* restrict x0;
-	float* restrict y0;
-
 	__m256          xv[8];
 	__m256          yv[8];
 
 	// If the vector dimension is zero, return early.
 	if ( bli_zero_dim1( n ) ) return;
 
-	x0 = x;
-	y0 = y;
+	float* restrict x0 = x;
+	float* restrict y0 = y;
 
 	if ( incx == 1 && incy == 1 )
 	{
@@ -202,26 +199,23 @@ void bli_sswapv_zen_int8
 
 void bli_dswapv_zen_int8
      (
-       dim_t            n,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       cntx_t*          cntx
+             dim_t   n,
+             double* x, inc_t incx,
+             double* y, inc_t incy,
+       const cntx_t* cntx
      )
 {
 	const dim_t      n_elem_per_reg = 4;
 	dim_t            i = 0;
 
-	double* restrict x0;
-	double* restrict y0;
-
 	__m256d          xv[8];
 	__m256d          yv[8];
 
 	// If the vector dimension is zero, return early.
 	if ( bli_zero_dim1( n ) ) return;
 
-	x0 = x;
-	y0 = y;
+	double* restrict x0 = x;
+	double* restrict y0 = y;
 
 	if ( incx == 1 && incy == 1 )
 	{
diff --git a/kernels/zen/1f/bli_axpyf_zen_int_4.c b/kernels/zen/1f/bli_axpyf_zen_int_4.c
index ddebc5ee0..24f66be0b 100644
--- a/kernels/zen/1f/bli_axpyf_zen_int_4.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_4.c
@@ -38,15 +38,15 @@
 
 void bli_caxpyf_zen_int_4
      (
-       conj_t           conja,
-       conj_t           conjx,
-       dim_t            m,
-       dim_t            b_n,
-       scomplex* restrict alpha,
-       scomplex* restrict a, inc_t inca, inc_t lda,
-       scomplex* restrict x, inc_t incx,
-       scomplex* restrict y, inc_t incy,
-       cntx_t* restrict cntx
+             conj_t    conja,
+             conj_t    conjx,
+             dim_t     m,
+             dim_t     b_n,
+       const scomplex* alpha,
+       const scomplex* a, inc_t inca, inc_t lda,
+       const scomplex* x, inc_t incx,
+             scomplex* y, inc_t incy,
+       const cntx_t*   cntx
      )
 {
     inc_t fuse_fac = 4;
@@ -85,10 +85,10 @@ void bli_caxpyf_zen_int_4
 
         for ( i = 0; i < b_n; ++i )
         {
-            scomplex* a1   = a + (0  )*inca + (i  )*lda;
-            scomplex* chi1 = x + (i  )*incx;
-            scomplex* y1   = y + (0  )*incy;
-            scomplex  alpha_chi1;
+            const scomplex* restrict a1   = a + (0  )*inca + (i  )*lda;
+            const scomplex* restrict chi1 = x + (i  )*incx;
+                  scomplex* restrict y1   = y + (0  )*incy;
+                  scomplex           alpha_chi1;
 
             bli_ccopycjs( conjx, *chi1, alpha_chi1 );
             bli_cscals( *alpha, alpha_chi1 );
@@ -118,10 +118,10 @@ void bli_caxpyf_zen_int_4
     }
     else
     {
-        scomplex *pchi0 = x + 0*incx ;
-        scomplex *pchi1 = x + 1*incx ;
-        scomplex *pchi2 = x + 2*incx ;
-        scomplex *pchi3 = x + 3*incx ;
+        const scomplex* restrict pchi0 = x + 0*incx ;
+        const scomplex* restrict pchi1 = x + 1*incx ;
+        const scomplex* restrict pchi2 = x + 2*incx ;
+        const scomplex* restrict pchi3 = x + 3*incx ;
 
         bli_ccopycjs( conjx, *pchi0, chi0 );
         bli_ccopycjs( conjx, *pchi1, chi1 );
@@ -217,7 +217,7 @@ void bli_caxpyf_zen_int_4
         for ( i = 0; (i + 0) < n2 ; ++i )
         {
 
-	    scomplex       y0c = *(scomplex*)y0;
+            scomplex       y0c = *(scomplex*)y0;
 
             const scomplex a0c = *(scomplex*)ap[0];
             const scomplex a1c = *(scomplex*)ap[1];
diff --git a/kernels/zen/1f/bli_axpyf_zen_int_5.c b/kernels/zen/1f/bli_axpyf_zen_int_5.c
index 9c8a370e1..d4427e86e 100644
--- a/kernels/zen/1f/bli_axpyf_zen_int_5.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_5.c
@@ -61,15 +61,15 @@ typedef union
 
 void bli_saxpyf_zen_int_5
      (
-       conj_t           conja,
-       conj_t           conjx,
-       dim_t            m,
-       dim_t            b_n,
-       float* restrict alpha,
-       float* restrict a, inc_t inca, inc_t lda,
-       float* restrict x, inc_t incx,
-       float* restrict y, inc_t incy,
-       cntx_t* restrict cntx
+             conj_t  conja,
+             conj_t  conjx,
+             dim_t   m,
+             dim_t   b_n,
+       const float*  alpha,
+       const float*  a, inc_t inca, inc_t lda,
+       const float*  x, inc_t incx,
+             float*  y, inc_t incy,
+       const cntx_t* cntx
      )
 {
     const dim_t      fuse_fac       = 5;
@@ -79,14 +79,6 @@ void bli_saxpyf_zen_int_5
 
     dim_t            i;
 
-    float* restrict a0;
-    float* restrict a1;
-    float* restrict a2;
-    float* restrict a3;
-    float* restrict a4;
-
-    float* restrict y0;
-
     v8sf_t           chi0v, chi1v, chi2v, chi3v;
     v8sf_t           chi4v;
 
@@ -114,10 +106,10 @@ void bli_saxpyf_zen_int_5
 
         for ( i = 0; i < b_n; ++i )
         {
-            float* a1   = a + (0  )*inca + (i  )*lda;
-            float* chi1 = x + (i  )*incx;
-            float* y1   = y + (0  )*incy;
-            float  alpha_chi1;
+            const float* restrict a1   = a + (0  )*inca + (i  )*lda;
+            const float* restrict chi1 = x + (i  )*incx;
+                  float* restrict y1   = y + (0  )*incy;
+                  float           alpha_chi1;
 
             bli_scopycjs( conjx, *chi1, alpha_chi1 );
             bli_sscals( *alpha, alpha_chi1 );
@@ -138,12 +130,12 @@ void bli_saxpyf_zen_int_5
 
     // At this point, we know that b_n is exactly equal to the fusing factor.
 
-    a0   = a + 0*lda;
-    a1   = a + 1*lda;
-    a2   = a + 2*lda;
-    a3   = a + 3*lda;
-    a4   = a + 4*lda;
-    y0   = y;
+    const float* restrict a0   = a + 0*lda;
+    const float* restrict a1   = a + 1*lda;
+    const float* restrict a2   = a + 2*lda;
+    const float* restrict a3   = a + 3*lda;
+    const float* restrict a4   = a + 4*lda;
+          float*          y0   = y;
 
     chi0 = *( x + 0*incx );
     chi1 = *( x + 1*incx );
@@ -313,15 +305,15 @@ void bli_saxpyf_zen_int_5
 
 void bli_daxpyf_zen_int_5
      (
-       conj_t           conja,
-       conj_t           conjx,
-       dim_t            m,
-       dim_t            b_n,
-       double* restrict alpha,
-       double* restrict a, inc_t inca, inc_t lda,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       cntx_t* restrict cntx
+             conj_t  conja,
+             conj_t  conjx,
+             dim_t   m,
+             dim_t   b_n,
+       const double* alpha,
+       const double* a, inc_t inca, inc_t lda,
+       const double* x, inc_t incx,
+             double* y, inc_t incy,
+       const cntx_t* cntx
      )
 {
     const dim_t      fuse_fac       = 5;
@@ -331,14 +323,6 @@ void bli_daxpyf_zen_int_5
 
     dim_t            i;
 
-    double* restrict a0;
-    double* restrict a1;
-    double* restrict a2;
-    double* restrict a3;
-    double* restrict a4;
-
-    double* restrict y0;
-
     v4df_t           chi0v, chi1v, chi2v, chi3v;
     v4df_t           chi4v;
 
@@ -366,10 +350,10 @@ void bli_daxpyf_zen_int_5
 
         for ( i = 0; i < b_n; ++i )
         {
-            double* a1   = a + (0  )*inca + (i  )*lda;
-            double* chi1 = x + (i  )*incx;
-            double* y1   = y + (0  )*incy;
-            double  alpha_chi1;
+            const double* restrict a1   = a + (0  )*inca + (i  )*lda;
+            const double* restrict chi1 = x + (i  )*incx;
+                  double* restrict y1   = y + (0  )*incy;
+                  double           alpha_chi1;
 
             bli_dcopycjs( conjx, *chi1, alpha_chi1 );
             bli_dscals( *alpha, alpha_chi1 );
@@ -390,12 +374,12 @@ void bli_daxpyf_zen_int_5
 
     // At this point, we know that b_n is exactly equal to the fusing factor.
 
-    a0   = a + 0*lda;
-    a1   = a + 1*lda;
-    a2   = a + 2*lda;
-    a3   = a + 3*lda;
-    a4   = a + 4*lda;
-    y0   = y;
+    const double* restrict a0   = a + 0*lda;
+    const double* restrict a1   = a + 1*lda;
+    const double* restrict a2   = a + 2*lda;
+    const double* restrict a3   = a + 3*lda;
+    const double* restrict a4   = a + 4*lda;
+          double* restrict y0   = y;
 
     chi0 = *( x + 0*incx );
     chi1 = *( x + 1*incx );
@@ -564,15 +548,15 @@ void bli_daxpyf_zen_int_5
 
 void bli_daxpyf_zen_int_16x2
      (
-       conj_t           conja,
-       conj_t           conjx,
-       dim_t            m,
-       dim_t            b_n,
-       double* restrict alpha,
-       double* restrict a, inc_t inca, inc_t lda,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       cntx_t* restrict cntx
+             conj_t  conja,
+             conj_t  conjx,
+             dim_t   m,
+             dim_t   b_n,
+       const double* alpha,
+       const double* a, inc_t inca, inc_t lda,
+       const double* x, inc_t incx,
+             double* y, inc_t incy,
+       const cntx_t* cntx
      )
 {
     const dim_t      fuse_fac       = 2;
@@ -582,11 +566,6 @@ void bli_daxpyf_zen_int_16x2
 
     dim_t            i;
 
-    double* restrict a0;
-    double* restrict a1;
-
-    double* restrict y0;
-
     v4df_t           chi0v, chi1v;
 
     v4df_t           a00v, a01v;
@@ -604,6 +583,7 @@ void bli_daxpyf_zen_int_16x2
     v2df_t           a40v, a41v;
 
     v2df_t           y4v; 
+
     // If either dimension is zero, or if alpha is zero, return early.
     if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return;
 
@@ -615,10 +595,10 @@ void bli_daxpyf_zen_int_16x2
 
         for ( i = 0; i < b_n; ++i )
         {
-            double* a1   = a + (0  )*inca + (i  )*lda;
-            double* chi1 = x + (i  )*incx;
-            double* y1   = y + (0  )*incy;
-            double  alpha_chi1;
+            const double* restrict a1   = a + (0  )*inca + (i  )*lda;
+            const double* restrict chi1 = x + (i  )*incx;
+                  double* restrict y1   = y + (0  )*incy;
+                  double           alpha_chi1;
 
             bli_dcopycjs( conjx, *chi1, alpha_chi1 );
             bli_dscals( *alpha, alpha_chi1 );
@@ -639,10 +619,10 @@ void bli_daxpyf_zen_int_16x2
 
     // At this point, we know that b_n is exactly equal to the fusing factor.
 
-    a0   = a + 0*lda;
-    a1   = a + 1*lda;
+    const double* restrict a0   = a + 0*lda;
+    const double* restrict a1   = a + 1*lda;
 
-    y0   = y;
+          double* restrict y0   = y;
 
     chi0 = *( x + 0*incx );
     chi1 = *( x + 1*incx );
@@ -849,15 +829,15 @@ void bli_daxpyf_zen_int_16x2
 
 void bli_daxpyf_zen_int_16x4
      (
-       conj_t           conja,
-       conj_t           conjx,
-       dim_t            m,
-       dim_t            b_n,
-       double* restrict alpha,
-       double* restrict a, inc_t inca, inc_t lda,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       cntx_t* restrict cntx
+             conj_t  conja,
+             conj_t  conjx,
+             dim_t   m,
+             dim_t   b_n,
+       const double* alpha,
+       const double* a, inc_t inca, inc_t lda,
+       const double* x, inc_t incx,
+             double* y, inc_t incy,
+       const cntx_t* cntx
      )
 {
     const dim_t      fuse_fac       = 4;
@@ -867,13 +847,6 @@ void bli_daxpyf_zen_int_16x4
 
     dim_t            i;
 
-    double* restrict a0;
-    double* restrict a1;
-    double* restrict a2;
-    double* restrict a3;
-
-    double* restrict y0;
-
     v4df_t           chi0v, chi1v, chi2v, chi3v;
 
     v4df_t           a00v, a01v, a02v, a03v;
@@ -905,10 +878,10 @@ void bli_daxpyf_zen_int_16x4
 
         for ( i = 0; i < b_n; ++i )
         {
-            double* a1   = a + (0  )*inca + (i  )*lda;
-            double* chi1 = x + (i  )*incx;
-            double* y1   = y + (0  )*incy;
-            double  alpha_chi1;
+            const double* restrict a1   = a + (0  )*inca + (i  )*lda;
+            const double* restrict chi1 = x + (i  )*incx;
+                  double* restrict y1   = y + (0  )*incy;
+                  double           alpha_chi1;
 
             bli_dcopycjs( conjx, *chi1, alpha_chi1 );
             bli_dscals( *alpha, alpha_chi1 );
@@ -929,12 +902,12 @@ void bli_daxpyf_zen_int_16x4
 
     // At this point, we know that b_n is exactly equal to the fusing factor.
 
-    a0   = a + 0*lda;
-    a1   = a + 1*lda;
-    a2   = a + 2*lda;
-    a3   = a + 3*lda;
+    const double* restrict a0   = a + 0*lda;
+    const double* restrict a1   = a + 1*lda;
+    const double* restrict a2   = a + 2*lda;
+    const double* restrict a3   = a + 3*lda;
 
-    y0   = y;
+          double* restrict y0   = y;
 
     chi0 = *( x + 0*incx );
     chi1 = *( x + 1*incx );
@@ -985,7 +958,7 @@ void bli_daxpyf_zen_int_16x4
             a23v.v = _mm256_loadu_pd( a3 + 2*n_elem_per_reg );
             a33v.v = _mm256_loadu_pd( a3 + 3*n_elem_per_reg );
 
-        // perform : y += alpha * x;
+            // perform : y += alpha * x;
             y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
             y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v );
             y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v );
@@ -1149,7 +1122,7 @@ void bli_daxpyf_zen_int_16x4
         for ( ; (i + 1) < m; i += 2)
         {
 
-	    // Load the input values.
+            // Load the input values.
             y4v.v  = _mm_loadu_pd( y0 + 0*n_elem_per_reg );
 
             a40v.v = _mm_loadu_pd( a0 + 0*n_elem_per_reg );
@@ -1227,7 +1200,7 @@ void bli_daxpyf_zen_int_16x4
             a2 += inca;
             a3 += inca;
 
-	    y0 += incy;
+            y0 += incy;
         }
 
     }
diff --git a/kernels/zen/1f/bli_axpyf_zen_int_8.c b/kernels/zen/1f/bli_axpyf_zen_int_8.c
index 24e6ee5e2..254cbe573 100644
--- a/kernels/zen/1f/bli_axpyf_zen_int_8.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_8.c
@@ -56,15 +56,15 @@ typedef union
 
 void bli_saxpyf_zen_int_8
      (
-       conj_t           conja,
-       conj_t           conjx,
-       dim_t            m,
-       dim_t            b_n,
-       float*  restrict alpha,
-       float*  restrict a, inc_t inca, inc_t lda,
-       float*  restrict x, inc_t incx,
-       float*  restrict y, inc_t incy,
-       cntx_t*          cntx
+             conj_t  conja,
+             conj_t  conjx,
+             dim_t   m,
+             dim_t   b_n,
+       const float*  alpha,
+       const float*  a, inc_t inca, inc_t lda,
+       const float*  x, inc_t incx,
+             float*  y, inc_t incy,
+       const cntx_t* cntx
      )
 {
 	const dim_t      fuse_fac       = 8;
@@ -76,17 +76,6 @@ void bli_saxpyf_zen_int_8
 	dim_t            m_viter;
 	dim_t            m_left;
 
-	float*  restrict a0;
-	float*  restrict a1;
-	float*  restrict a2;
-	float*  restrict a3;
-	float*  restrict a4;
-	float*  restrict a5;
-	float*  restrict a6;
-	float*  restrict a7;
-
-	float*  restrict y0;
-
 	v8sf_t           chi0v, chi1v, chi2v, chi3v;
 	v8sf_t           chi4v, chi5v, chi6v, chi7v;
 
@@ -108,10 +97,10 @@ void bli_saxpyf_zen_int_8
 
 		for ( i = 0; i < b_n; ++i )
 		{
-			float* a1   = a + (0  )*inca + (i  )*lda;
-			float* chi1 = x + (i  )*incx;
-			float* y1   = y + (0  )*incy;
-			float  alpha_chi1;
+			const float* restrict a1   = a + (0  )*inca + (i  )*lda;
+			const float* restrict chi1 = x + (i  )*incx;
+			      float* restrict y1   = y + (0  )*incy;
+			      float           alpha_chi1;
 
 			PASTEMAC(s,copycjs)( conjx, *chi1, alpha_chi1 );
 			PASTEMAC(s,scals)( *alpha, alpha_chi1 );
@@ -146,15 +135,15 @@ void bli_saxpyf_zen_int_8
 		m_left  = m;
 	}
 
-	a0   = a + 0*lda;
-	a1   = a + 1*lda;
-	a2   = a + 2*lda;
-	a3   = a + 3*lda;
-	a4   = a + 4*lda;
-	a5   = a + 5*lda;
-	a6   = a + 6*lda;
-	a7   = a + 7*lda;
-	y0   = y;
+	const float* restrict a0   = a + 0*lda;
+	const float* restrict a1   = a + 1*lda;
+	const float* restrict a2   = a + 2*lda;
+	const float* restrict a3   = a + 3*lda;
+	const float* restrict a4   = a + 4*lda;
+	const float* restrict a5   = a + 5*lda;
+	const float* restrict a6   = a + 6*lda;
+	const float* restrict a7   = a + 7*lda;
+	      float* restrict y0   = y;
 
 	chi0 = *( x + 0*incx );
 	chi1 = *( x + 1*incx );
@@ -265,15 +254,15 @@ void bli_saxpyf_zen_int_8
 
 void bli_daxpyf_zen_int_8
      (
-       conj_t           conja,
-       conj_t           conjx,
-       dim_t            m,
-       dim_t            b_n,
-       double* restrict alpha,
-       double* restrict a, inc_t inca, inc_t lda,
-       double* restrict x, inc_t incx,
-       double* restrict y, inc_t incy,
-       cntx_t*          cntx
+             conj_t  conja,
+             conj_t  conjx,
+             dim_t   m,
+             dim_t   b_n,
+       const double* alpha,
+       const double* a, inc_t inca, inc_t lda,
+       const double* x, inc_t incx,
+             double* y, inc_t incy,
+       const cntx_t* cntx
      )
 {
 	const dim_t      fuse_fac       = 8;
@@ -285,17 +274,6 @@ void bli_daxpyf_zen_int_8
 	dim_t            m_viter;
 	dim_t            m_left;
 
-	double* restrict a0;
-	double* restrict a1;
-	double* restrict a2;
-	double* restrict a3;
-	double* restrict a4;
-	double* restrict a5;
-	double* restrict a6;
-	double* restrict a7;
-
-	double* restrict y0;
-
 	v4df_t           chi0v, chi1v, chi2v, chi3v;
 	v4df_t           chi4v, chi5v, chi6v, chi7v;
 
@@ -317,10 +295,10 @@ void bli_daxpyf_zen_int_8
 
 		for ( i = 0; i < b_n; ++i )
 		{
-			double* a1   = a + (0  )*inca + (i  )*lda;
-			double* chi1 = x + (i  )*incx;
-			double* y1   = y + (0  )*incy;
-			double  alpha_chi1;
+			const double* restrict a1   = a + (0  )*inca + (i  )*lda;
+			const double* restrict chi1 = x + (i  )*incx;
+			      double* restrict y1   = y + (0  )*incy;
+			      double           alpha_chi1;
 
 			PASTEMAC(d,copycjs)( conjx, *chi1, alpha_chi1 );
 			PASTEMAC(d,scals)( *alpha, alpha_chi1 );
@@ -355,15 +333,15 @@ void bli_daxpyf_zen_int_8
 		m_left  = m;
 	}
 
-	a0   = a + 0*lda;
-	a1   = a + 1*lda;
-	a2   = a + 2*lda;
-	a3   = a + 3*lda;
-	a4   = a + 4*lda;
-	a5   = a + 5*lda;
-	a6   = a + 6*lda;
-	a7   = a + 7*lda;
-	y0   = y;
+	const double* restrict a0   = a + 0*lda;
+	const double* restrict a1   = a + 1*lda;
+	const double* restrict a2   = a + 2*lda;
+	const double* restrict a3   = a + 3*lda;
+	const double* restrict a4   = a + 4*lda;
+	const double* restrict a5   = a + 5*lda;
+	const double* restrict a6   = a + 6*lda;
+	const double* restrict a7   = a + 7*lda;
+	      double* restrict y0   = y;
 
 	chi0 = *( x + 0*incx );
 	chi1 = *( x + 1*incx );
diff --git a/kernels/zen/1f/bli_dotxf_zen_int_8.c b/kernels/zen/1f/bli_dotxf_zen_int_8.c
index 50ca92561..76309de93 100644
--- a/kernels/zen/1f/bli_dotxf_zen_int_8.c
+++ b/kernels/zen/1f/bli_dotxf_zen_int_8.c
@@ -56,16 +56,16 @@ typedef union
 
 void bli_sdotxf_zen_int_8
      (
-       conj_t           conjat,
-       conj_t           conjx,
-       dim_t            m,
-       dim_t            b_n,
-       float*  restrict alpha,
-       float*  restrict a, inc_t inca, inc_t lda,
-       float*  restrict x, inc_t incx,
-       float*  restrict beta,
-       float*  restrict y, inc_t incy,
-       cntx_t*          cntx
+             conj_t  conjat,
+             conj_t  conjx,
+             dim_t   m,
+             dim_t   b_n,
+       const float*  alpha,
+       const float*  a, inc_t inca, inc_t lda,
+       const float*  x, inc_t incx,
+       const float*  beta,
+             float*  y, inc_t incy,
+       const cntx_t* cntx
      )
 {
 	const dim_t fuse_fac       = 8;
@@ -99,9 +99,9 @@ void bli_sdotxf_zen_int_8
 
 		for ( dim_t i = 0; i < b_n; ++i )
 		{
-			float* a1   = a + (0  )*inca + (i  )*lda;
-			float* x1   = x + (0  )*incx;
-			float* psi1 = y + (i  )*incy;
+			const float* restrict a1   = a + (0  )*inca + (i  )*lda;
+			const float* restrict x1   = x + (0  )*incx;
+			      float* restrict psi1 = y + (i  )*incy;
 
 			f
 			(
@@ -147,15 +147,15 @@ void bli_sdotxf_zen_int_8
 		dim_t m_viter = ( m ) / ( n_elem_per_reg * n_iter_unroll );
 
 		// Set up pointers for x and the b_n columns of A (rows of A^T).
-		float* restrict x0 = x;
-		float* restrict a0 = a + 0*lda;
-		float* restrict a1 = a + 1*lda;
-		float* restrict a2 = a + 2*lda;
-		float* restrict a3 = a + 3*lda;
-		float* restrict a4 = a + 4*lda;
-		float* restrict a5 = a + 5*lda;
-		float* restrict a6 = a + 6*lda;
-		float* restrict a7 = a + 7*lda;
+		const float* restrict x0 = x;
+		const float* restrict a0 = a + 0*lda;
+		const float* restrict a1 = a + 1*lda;
+		const float* restrict a2 = a + 2*lda;
+		const float* restrict a3 = a + 3*lda;
+		const float* restrict a4 = a + 4*lda;
+		const float* restrict a5 = a + 5*lda;
+		const float* restrict a6 = a + 6*lda;
+		const float* restrict a7 = a + 7*lda;
 
 		// Initialize b_n rho vector accumulators to zero.
 		v8sf_t rho0v; rho0v.v = _mm256_setzero_ps();
@@ -268,8 +268,8 @@ void bli_sdotxf_zen_int_8
 		dim_t m_viter = ( m ) / ( n_iter_unroll );
 
 		// Initialize pointers for x and A.
-		float* restrict x0 = x;
-		float* restrict a0 = a;
+		const float* restrict x0 = x;
+		const float* restrict a0 = a;
 
 		// Initialize rho vector accumulators to zero.
 		v8sf_t rho0v; rho0v.v = _mm256_setzero_ps();
@@ -332,15 +332,15 @@ void bli_sdotxf_zen_int_8
 	// Scalar edge case.
 	{
 		// Initialize pointers for x and the b_n columns of A (rows of A^T).
-		float* restrict x0 = x;
-		float* restrict a0 = a + 0*lda;
-		float* restrict a1 = a + 1*lda;
-		float* restrict a2 = a + 2*lda;
-		float* restrict a3 = a + 3*lda;
-		float* restrict a4 = a + 4*lda;
-		float* restrict a5 = a + 5*lda;
-		float* restrict a6 = a + 6*lda;
-		float* restrict a7 = a + 7*lda;
+		const float* restrict x0 = x;
+		const float* restrict a0 = a + 0*lda;
+		const float* restrict a1 = a + 1*lda;
+		const float* restrict a2 = a + 2*lda;
+		const float* restrict a3 = a + 3*lda;
+		const float* restrict a4 = a + 4*lda;
+		const float* restrict a5 = a + 5*lda;
+		const float* restrict a6 = a + 6*lda;
+		const float* restrict a7 = a + 7*lda;
 
 		// If there are leftover iterations, perform them with scalar code.
 		for ( dim_t i = 0; i < m ; ++i )
@@ -446,20 +446,20 @@ void bli_sdotxf_zen_int_8
 
 void bli_ddotxf_zen_int_8
      (
-       conj_t           conjat,
-       conj_t           conjx,
-       dim_t            m,
-       dim_t            b_n,
-       double* restrict alpha,
-       double* restrict a, inc_t inca, inc_t lda,
-       double* restrict x, inc_t incx,
-       double* restrict beta,
-       double* restrict y, inc_t incy,
-       cntx_t*          cntx
+             conj_t  conjat,
+             conj_t  conjx,
+             dim_t   m,
+             dim_t   b_n,
+       const double* alpha,
+       const double* a, inc_t inca, inc_t lda,
+       const double* x, inc_t incx,
+       const double* beta,
+             double* y, inc_t incy,
+       const cntx_t* cntx
      )
 {
-	const dim_t      fuse_fac       = 8;
-	const dim_t      n_elem_per_reg = 4;
+	const dim_t fuse_fac       = 8;
+	const dim_t n_elem_per_reg = 4;
 
 	// If the b_n dimension is zero, y is empty and there is no computation.
 	if ( bli_zero_dim1( b_n ) ) return;
@@ -489,9 +489,9 @@ void bli_ddotxf_zen_int_8
 
 		for ( dim_t i = 0; i < b_n; ++i )
 		{
-			double* a1   = a + (0  )*inca + (i  )*lda;
-			double* x1   = x + (0  )*incx;
-			double* psi1 = y + (i  )*incy;
+			const double* restrict a1   = a + (0  )*inca + (i  )*lda;
+			const double* restrict x1   = x + (0  )*incx;
+			      double* restrict psi1 = y + (i  )*incy;
 
 			f
 			(
@@ -537,15 +537,15 @@ void bli_ddotxf_zen_int_8
 		dim_t m_viter = ( m ) / ( n_elem_per_reg * n_iter_unroll );
 
 		// Set up pointers for x and the b_n columns of A (rows of A^T).
-		double* restrict x0 = x;
-		double* restrict a0 = a + 0*lda;
-		double* restrict a1 = a + 1*lda;
-		double* restrict a2 = a + 2*lda;
-		double* restrict a3 = a + 3*lda;
-		double* restrict a4 = a + 4*lda;
-		double* restrict a5 = a + 5*lda;
-		double* restrict a6 = a + 6*lda;
-		double* restrict a7 = a + 7*lda;
+		const double* restrict x0 = x;
+		const double* restrict a0 = a + 0*lda;
+		const double* restrict a1 = a + 1*lda;
+		const double* restrict a2 = a + 2*lda;
+		const double* restrict a3 = a + 3*lda;
+		const double* restrict a4 = a + 4*lda;
+		const double* restrict a5 = a + 5*lda;
+		const double* restrict a6 = a + 6*lda;
+		const double* restrict a7 = a + 7*lda;
 
 		// Initialize b_n rho vector accumulators to zero.
 		v4df_t rho0v; rho0v.v = _mm256_setzero_pd();
@@ -643,8 +643,8 @@ void bli_ddotxf_zen_int_8
 		dim_t m_viter = ( m ) / ( n_reg_per_row * n_iter_unroll );
 
 		// Initialize pointers for x and A.
-		double* restrict x0 = x;
-		double* restrict a0 = a;
+		const double* restrict x0 = x;
+		const double* restrict a0 = a;
 
 		// Initialize rho vector accumulators to zero.
 		v4df_t rho0v; rho0v.v = _mm256_setzero_pd();
@@ -713,15 +713,15 @@ void bli_ddotxf_zen_int_8
 	// Scalar edge case.
 	{
 		// Initialize pointers for x and the b_n columns of A (rows of A^T).
-		double* restrict x0 = x;
-		double* restrict a0 = a + 0*lda;
-		double* restrict a1 = a + 1*lda;
-		double* restrict a2 = a + 2*lda;
-		double* restrict a3 = a + 3*lda;
-		double* restrict a4 = a + 4*lda;
-		double* restrict a5 = a + 5*lda;
-		double* restrict a6 = a + 6*lda;
-		double* restrict a7 = a + 7*lda;
+		const double* restrict x0 = x;
+		const double* restrict a0 = a + 0*lda;
+		const double* restrict a1 = a + 1*lda;
+		const double* restrict a2 = a + 2*lda;
+		const double* restrict a3 = a + 3*lda;
+		const double* restrict a4 = a + 4*lda;
+		const double* restrict a5 = a + 5*lda;
+		const double* restrict a6 = a + 6*lda;
+		const double* restrict a7 = a + 7*lda;
 
 		// If there are leftover iterations, perform them with scalar code.
 		for ( dim_t i = 0; i < m ; ++i )
diff --git a/ref_kernels/1/bli_addv_ref.c b/ref_kernels/1/bli_addv_ref.c
index bb637d7e6..004220a59 100644
--- a/ref_kernels/1/bli_addv_ref.c
+++ b/ref_kernels/1/bli_addv_ref.c
@@ -39,17 +39,17 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjx, \
-       dim_t            n, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
 \
-	ctype* restrict chi1 = x; \
-	ctype* restrict psi1 = y; \
+	const ctype* restrict chi1 = x; \
+	      ctype* restrict psi1 = y; \
 \
 	if ( bli_is_conj( conjx ) ) \
 	{ \
diff --git a/ref_kernels/1/bli_amaxv_ref.c b/ref_kernels/1/bli_amaxv_ref.c
index cdfae9568..4d249feb4 100644
--- a/ref_kernels/1/bli_amaxv_ref.c
+++ b/ref_kernels/1/bli_amaxv_ref.c
@@ -43,14 +43,14 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       dim_t            n, \
-       ctype*  restrict x, inc_t incx, \
-       dim_t*  restrict i_max, \
-       cntx_t*          cntx  \
+             dim_t   n, \
+       const ctype*  x, inc_t incx, \
+             dim_t*  index, \
+       const cntx_t* cntx  \
      ) \
 { \
-	ctype_r* minus_one = PASTEMAC(chr,m1); \
-	dim_t*   zero_i    = PASTEMAC(i,0); \
+	const ctype_r* minus_one = PASTEMAC(chr,m1); \
+	const dim_t*   zero_i    = PASTEMAC(i,0); \
 \
 	ctype_r  chi1_r; \
 	ctype_r  chi1_i; \
@@ -62,7 +62,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	   the behavior of netlib BLAS's i?amax() routines. */ \
 	if ( bli_zero_dim1( n ) ) \
 	{ \
-		PASTEMAC(i,copys)( *zero_i, *i_max ); \
+		PASTEMAC(i,copys)( *zero_i, *index ); \
 		return; \
 	} \
 \
@@ -76,7 +76,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 	if ( incx == 1 ) \
 	{ \
-		ctype* chi1 = x; \
+		const ctype* restrict chi1 = x; \
 \
 		for ( dim_t i = 0; i < n; ++i ) \
 		{ \
@@ -110,7 +110,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	{ \
 		for ( dim_t i = 0; i < n; ++i ) \
 		{ \
-			ctype* chi1 = x + (i  )*incx; \
+			const ctype* restrict chi1 = x + (i  )*incx; \
 \
 			/* Get the real and imaginary components of chi1. */ \
 			PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
@@ -138,7 +138,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 \
 	/* Store the final index to the output variable. */ \
-	PASTEMAC(i,copys)( i_max_l, *i_max ); \
+	PASTEMAC(i,copys)( i_max_l, *index ); \
 }
 
 INSERT_GENTFUNCR_BASIC2( amaxv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
diff --git a/ref_kernels/1/bli_axpbyv_ref.c b/ref_kernels/1/bli_axpbyv_ref.c
index fb48070a5..56354bf48 100644
--- a/ref_kernels/1/bli_axpbyv_ref.c
+++ b/ref_kernels/1/bli_axpbyv_ref.c
@@ -39,13 +39,13 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjx, \
-       dim_t            n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict beta, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  beta, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
@@ -56,7 +56,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		{ \
 			/* If alpha is zero and beta is zero, set to zero. */ \
 \
-			ctype* zero = PASTEMAC(ch,0); \
+			const ctype* zero = PASTEMAC(ch,0); \
 \
 			/* Query the context for the kernel function pointer. */ \
 			const num_t             dt     = PASTEMAC(ch,type); \
diff --git a/ref_kernels/1/bli_axpyv_ref.c b/ref_kernels/1/bli_axpyv_ref.c
index 295fcf24c..547aa55cf 100644
--- a/ref_kernels/1/bli_axpyv_ref.c
+++ b/ref_kernels/1/bli_axpyv_ref.c
@@ -130,12 +130,12 @@ GENTFUNC( double,   d, axpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjx, \
-       dim_t            n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_copyv_ref.c b/ref_kernels/1/bli_copyv_ref.c
index 1202aa896..6ec81b69d 100644
--- a/ref_kernels/1/bli_copyv_ref.c
+++ b/ref_kernels/1/bli_copyv_ref.c
@@ -39,11 +39,11 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjx, \
-       dim_t            n, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_dotv_ref.c b/ref_kernels/1/bli_dotv_ref.c
index d17c71dd3..f9e95f68b 100644
--- a/ref_kernels/1/bli_dotv_ref.c
+++ b/ref_kernels/1/bli_dotv_ref.c
@@ -39,13 +39,13 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjx, \
-       conj_t           conjy, \
-       dim_t            n, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       ctype*  restrict rho, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             conj_t  conjy, \
+             dim_t   n, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  y, inc_t incy, \
+             ctype*  rho, \
+       const cntx_t* cntx  \
      ) \
 { \
 	ctype dotxy; \
diff --git a/ref_kernels/1/bli_dotxv_ref.c b/ref_kernels/1/bli_dotxv_ref.c
index caea62176..d390c1602 100644
--- a/ref_kernels/1/bli_dotxv_ref.c
+++ b/ref_kernels/1/bli_dotxv_ref.c
@@ -39,15 +39,15 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjx, \
-       conj_t           conjy, \
-       dim_t            n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       ctype*  restrict beta, \
-       ctype*  restrict rho, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             conj_t  conjy, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  y, inc_t incy, \
+       const ctype*  beta, \
+             ctype*  rho, \
+       const cntx_t* cntx  \
      ) \
 { \
 	ctype dotxy; \
diff --git a/ref_kernels/1/bli_invertv_ref.c b/ref_kernels/1/bli_invertv_ref.c
index 914663c82..2c5d04187 100644
--- a/ref_kernels/1/bli_invertv_ref.c
+++ b/ref_kernels/1/bli_invertv_ref.c
@@ -39,9 +39,9 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       dim_t            n, \
-       ctype*  restrict x, inc_t incx, \
-       cntx_t*          cntx  \
+             dim_t   n, \
+             ctype*  x, inc_t incx, \
+       const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_invscalv_ref.c b/ref_kernels/1/bli_invscalv_ref.c
index a2263ee58..993c79dcf 100644
--- a/ref_kernels/1/bli_invscalv_ref.c
+++ b/ref_kernels/1/bli_invscalv_ref.c
@@ -39,11 +39,11 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjalpha, \
-       dim_t            n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       cntx_t*          cntx  \
+             conj_t  conjalpha, \
+             dim_t   n, \
+       const ctype*  alpha, \
+             ctype*  x, inc_t incx, \
+       const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_scal2v_ref.c b/ref_kernels/1/bli_scal2v_ref.c
index f4785c228..40d25cae3 100644
--- a/ref_kernels/1/bli_scal2v_ref.c
+++ b/ref_kernels/1/bli_scal2v_ref.c
@@ -39,12 +39,12 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjx, \
-       dim_t            n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
@@ -53,7 +53,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	{ \
 		/* If alpha is zero, use setv. */ \
 \
-		ctype* zero = PASTEMAC(ch,0); \
+		const ctype* zero = PASTEMAC(ch,0); \
 \
 		/* Query the context for the kernel function pointer. */ \
 		const num_t             dt     = PASTEMAC(ch,type); \
diff --git a/ref_kernels/1/bli_scalv_ref.c b/ref_kernels/1/bli_scalv_ref.c
index 6ca9a88a5..f096dd80a 100644
--- a/ref_kernels/1/bli_scalv_ref.c
+++ b/ref_kernels/1/bli_scalv_ref.c
@@ -39,11 +39,11 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjalpha, \
-       dim_t            n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       cntx_t*          cntx  \
+             conj_t  conjalpha, \
+             dim_t   n, \
+       const ctype*  alpha, \
+             ctype*  x, inc_t incx, \
+       const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
@@ -54,7 +54,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	/* If alpha is zero, use setv. */ \
 	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
 	{ \
-		ctype* zero = PASTEMAC(ch,0); \
+		const ctype* zero = PASTEMAC(ch,0); \
 \
 		/* Query the context for the kernel function pointer. */ \
 		const num_t             dt     = PASTEMAC(ch,type); \
diff --git a/ref_kernels/1/bli_setv_ref.c b/ref_kernels/1/bli_setv_ref.c
index be6e76cbb..1e1252d7a 100644
--- a/ref_kernels/1/bli_setv_ref.c
+++ b/ref_kernels/1/bli_setv_ref.c
@@ -39,11 +39,11 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjalpha, \
-       dim_t            n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       cntx_t*          cntx  \
+             conj_t  conjalpha, \
+             dim_t   n, \
+       const ctype*  alpha, \
+             ctype*  x, inc_t incx, \
+       const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_subv_ref.c b/ref_kernels/1/bli_subv_ref.c
index ce1ec2079..e4d4896fb 100644
--- a/ref_kernels/1/bli_subv_ref.c
+++ b/ref_kernels/1/bli_subv_ref.c
@@ -39,11 +39,11 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjx, \
-       dim_t            n, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_swapv_ref.c b/ref_kernels/1/bli_swapv_ref.c
index 73a90c87b..43bff0e46 100644
--- a/ref_kernels/1/bli_swapv_ref.c
+++ b/ref_kernels/1/bli_swapv_ref.c
@@ -39,10 +39,10 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       dim_t            n, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             dim_t   n, \
+             ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1/bli_xpbyv_ref.c b/ref_kernels/1/bli_xpbyv_ref.c
index 0a6844bf1..6baddf16c 100644
--- a/ref_kernels/1/bli_xpbyv_ref.c
+++ b/ref_kernels/1/bli_xpbyv_ref.c
@@ -39,12 +39,12 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjx, \
-       dim_t            n, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict beta, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             dim_t   n, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  beta, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1f/bli_axpy2v_ref.c b/ref_kernels/1f/bli_axpy2v_ref.c
index 0563322ae..c87f976e7 100644
--- a/ref_kernels/1f/bli_axpy2v_ref.c
+++ b/ref_kernels/1f/bli_axpy2v_ref.c
@@ -40,15 +40,15 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjx, \
-       conj_t           conjy, \
-       dim_t            n, \
-       ctype*  restrict alphax, \
-       ctype*  restrict alphay, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       ctype*  restrict z, inc_t incz, \
-       cntx_t*          cntx  \
+             conj_t  conjx, \
+             conj_t  conjy, \
+             dim_t   n, \
+       const ctype*  alphax, \
+       const ctype*  alphay, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  y, inc_t incy, \
+             ctype*  z, inc_t incz, \
+       const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
diff --git a/ref_kernels/1f/bli_axpyf_ref.c b/ref_kernels/1f/bli_axpyf_ref.c
index 873cee563..b72fea107 100644
--- a/ref_kernels/1f/bli_axpyf_ref.c
+++ b/ref_kernels/1f/bli_axpyf_ref.c
@@ -40,15 +40,15 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conja, \
-       conj_t           conjx, \
-       dim_t            m, \
-       dim_t            b_n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             conj_t  conja, \
+             conj_t  conjx, \
+             dim_t   m, \
+             dim_t   b_n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+       const ctype*  x, inc_t incx, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( m ) ) return; \
@@ -101,9 +101,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 		for ( dim_t i = 0; i < b_n; ++i ) \
 		{ \
-			ctype* restrict a1   = a + (0  )*inca + (i  )*lda; \
-			ctype* restrict chi1 = x + (i  )*incx; \
-			ctype* restrict y1   = y + (0  )*incy; \
+			const ctype* restrict a1   = a + (0  )*inca + (i  )*lda; \
+			const ctype* restrict chi1 = x + (i  )*incx; \
+			      ctype* restrict y1   = y + (0  )*incy; \
 \
 			ctype alpha_chi1; \
 \
diff --git a/ref_kernels/1f/bli_dotaxpyv_ref.c b/ref_kernels/1f/bli_dotaxpyv_ref.c
index b83b927c9..5fe620b12 100644
--- a/ref_kernels/1f/bli_dotaxpyv_ref.c
+++ b/ref_kernels/1f/bli_dotaxpyv_ref.c
@@ -40,16 +40,16 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjxt, \
-       conj_t           conjx, \
-       conj_t           conjy, \
-       dim_t            m, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       ctype*  restrict rho, \
-       ctype*  restrict z, inc_t incz, \
-       cntx_t*          cntx  \
+             conj_t  conjxt, \
+             conj_t  conjx, \
+             conj_t  conjy, \
+             dim_t   m, \
+       const ctype*  alpha, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  y, inc_t incy, \
+             ctype*  rho, \
+             ctype*  z, inc_t incz, \
+       const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( m ) ) return; \
diff --git a/ref_kernels/1f/bli_dotxaxpyf_ref.c b/ref_kernels/1f/bli_dotxaxpyf_ref.c
index 249b9a6de..6fb3c9aa8 100644
--- a/ref_kernels/1f/bli_dotxaxpyf_ref.c
+++ b/ref_kernels/1f/bli_dotxaxpyf_ref.c
@@ -40,20 +40,20 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjat, \
-       conj_t           conja, \
-       conj_t           conjw, \
-       conj_t           conjx, \
-       dim_t            m, \
-       dim_t            b_n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict w, inc_t incw, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict beta, \
-       ctype*  restrict y, inc_t incy, \
-       ctype*  restrict z, inc_t incz, \
-       cntx_t*          cntx  \
+             conj_t  conjat, \
+             conj_t  conja, \
+             conj_t  conjw, \
+             conj_t  conjx, \
+             dim_t   m, \
+             dim_t   b_n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+       const ctype*  w, inc_t incw, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  beta, \
+             ctype*  y, inc_t incy, \
+             ctype*  z, inc_t incz, \
+       const cntx_t* cntx  \
      ) \
 { \
 	/* A is m x n.                   */ \
diff --git a/ref_kernels/1f/bli_dotxf_ref.c b/ref_kernels/1f/bli_dotxf_ref.c
index 2d2da1318..7986cd86f 100644
--- a/ref_kernels/1f/bli_dotxf_ref.c
+++ b/ref_kernels/1f/bli_dotxf_ref.c
@@ -40,16 +40,16 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conjat, \
-       conj_t           conjx, \
-       dim_t            m, \
-       dim_t            b_n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict beta, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
+             conj_t  conjat, \
+             conj_t  conjx, \
+             dim_t   m, \
+             dim_t   b_n, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+       const ctype*  x, inc_t incx, \
+       const ctype*  beta, \
+             ctype*  y, inc_t incy, \
+       const cntx_t* cntx  \
      ) \
 { \
 	if ( inca == 1 && incx == 1 && incy == 1 && b_n == ff ) \
@@ -117,9 +117,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 		for ( dim_t i = 0; i < b_n; ++i ) \
 		{ \
-			ctype* restrict a1   = a + (0  )*inca + (i  )*lda; \
-			ctype* restrict x1   = x + (0  )*incx; \
-			ctype* restrict psi1 = y + (i  )*incy; \
+			const ctype* restrict a1   = a + (0  )*inca + (i  )*lda; \
+			const ctype* restrict x1   = x + (0  )*incx; \
+			      ctype* restrict psi1 = y + (i  )*incy; \
 \
 			kfp_dv \
 			( \
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
index e2008d255..42bf454c8 100644
--- a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
+++ b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
@@ -34,6 +34,7 @@
 
 #include "blis.h"
 
+
 #define PACKM_SET1_1E( chr, mnk ) \
 do { \
 	PASTEMAC(chr,set1s)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \
@@ -42,12 +43,14 @@ do { \
 	PASTEMAC(chr,set1s)( *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \
 } while (0)
 
+
 #define PACKM_SET1_1R( chr, mnk ) \
 do { \
 	PASTEMAC(chr,set1s)( *(pi1_r + mnk*dfac + d + mnk*ldp2) ); \
 	PASTEMAC(chr,set0s)( *(pi1_i + mnk*dfac + d + mnk*ldp2) ); \
 } while (0)
 
+
 #define PACKM_SCAL_1E( ch, mn, k, op ) \
 do { \
 	PASTEMAC(ch,op)(  kappa_r, kappa_i, *(alpha1 +  mn       *inca2 + 0 + k*lda2), \
@@ -60,6 +63,7 @@ do { \
 	                                    *(pi1_ir + (mn*2 + 1)*dfac  + d + k*ldp2) ); \
 } while (0)
 
+
 #define PACKM_SCAL_1R( ch, mn, k, op ) \
 do { \
 	PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0 + k*lda2), \
@@ -68,6 +72,7 @@ do { \
 	                                   *(pi1_i  + mn*dfac  + d + k*ldp2) ); \
 } while (0)
 
+
 #define PACKM_DIAG_1E_BODY( ch, mn_min, mn_max, inca2_lu, lda2_lu, op ) \
 \
 do \
@@ -81,12 +86,14 @@ do \
 		PACKM_SCAL_1E( ch, mn, k, op ); \
 } while(0)
 
+
 #define PACKM_DIAG_BODY_1E_L( ch, op ) \
 	PACKM_DIAG_1E_BODY( ch, k+1, cdim, inca_l2, lda_l2, op )
 
 #define PACKM_DIAG_BODY_1E_U( ch, op ) \
 	PACKM_DIAG_1E_BODY( ch, 0, k, inca_u2, lda_u2, op )
 
+
 #define PACKM_DIAG_1R_BODY( ch, mn_min, mn_max, inca2_lu, lda2_lu, op ) \
 \
 do \
@@ -100,29 +107,31 @@ do \
 		PACKM_SCAL_1R( ch, mn, k, op ); \
 } while(0)
 
+
 #define PACKM_DIAG_BODY_1R_L( ch, op ) \
 	PACKM_DIAG_1R_BODY( ch, k+1, cdim, inca_l2, lda_l2, op )
 
 #define PACKM_DIAG_BODY_1R_U( ch, op ) \
 	PACKM_DIAG_1R_BODY( ch, 0, k, inca_u2, lda_u2, op )
 
+
 #undef  GENTFUNCCO
 #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr0, bb0, arch, suf ) \
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       struc_t          struca, \
-       diag_t           diaga, \
-       uplo_t           uploa, \
-       conj_t           conja, \
-       pack_t           schema, \
-       bool             invdiag, \
-       dim_t            cdim, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t*          cntx \
+             struc_t struca, \
+             diag_t  diaga, \
+             uplo_t  uploa, \
+             conj_t  conja, \
+             pack_t  schema, \
+             bool    invdiag, \
+             dim_t   cdim, \
+             dim_t   n_max, \
+       const ctype*  kappa, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+             ctype*  p,             inc_t ldp, \
+       const cntx_t* cntx \
      ) \
 { \
 	const num_t dt_r      = PASTEMAC(chr,type); \
@@ -141,9 +150,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	const inc_t       lda2    = 2 * lda; \
 	const inc_t       ldp2    = 2 * ldp; \
 \
-	ctype_r           kappa_r = ( ( ctype_r* )kappa )[0]; \
-	ctype_r           kappa_i = ( ( ctype_r* )kappa )[1]; \
-	ctype_r* restrict alpha1  = ( ctype_r* )a; \
+	      ctype_r           kappa_r = ( ( const ctype_r* )kappa )[0]; /* first ctype_r element of kappa */ \
+	      ctype_r           kappa_i = ( ( const ctype_r* )kappa )[1]; /* second ctype_r element of kappa */ \
+	const ctype_r* restrict alpha1  = ( const ctype_r* )a; /* read-only ctype_r view of a */ \
 \
 	if ( bli_is_1e_packed( schema ) ) \
 	{ \
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ref.c
index d12ff59ab..060ba3fdb 100644
--- a/ref_kernels/1m/bli_packm_cxc_diag_ref.c
+++ b/ref_kernels/1m/bli_packm_cxc_diag_ref.c
@@ -34,6 +34,7 @@
 
 #include "blis.h"
 
+
 #define PACKM_DIAG_BODY( ctype, ch, mn_min, mn_max, inca, lda, op ) \
 \
 do \
@@ -44,29 +45,31 @@ do \
 		PASTEMAC(ch,op)( kappa_cast, *(alpha1 + mn*inca + k*lda), *(pi1 + mn*dfac + d + k*ldp) ); \
 } while(0)
 
+
 #define PACKM_DIAG_BODY_L( ctype, ch, op ) \
 	PACKM_DIAG_BODY( ctype, ch, k+1, cdim, inca_l, lda_l, op )
 
 #define PACKM_DIAG_BODY_U( ctype, ch, op ) \
 	PACKM_DIAG_BODY( ctype, ch, 0, k, inca_u, lda_u, op )
 
+
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       struc_t          struca, \
-       diag_t           diaga, \
-       uplo_t           uploa, \
-       conj_t           conja, \
-       pack_t           schema, \
-       bool             invdiag, \
-       dim_t            cdim, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t*          cntx \
+             struc_t struca, \
+             diag_t  diaga, \
+             uplo_t  uploa, \
+             conj_t  conja, \
+             pack_t  schema, \
+             bool    invdiag, \
+             dim_t   cdim, \
+             dim_t   n_max, \
+       const ctype*  kappa, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+             ctype*  p,             inc_t ldp, \
+       const cntx_t* cntx  \
      ) \
 { \
 	const num_t dt        = PASTEMAC(ch,type); \
@@ -82,9 +85,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	  p, 1, ldp  \
 	); \
 \
-	ctype           kappa_cast = *( ctype* )kappa; \
-	ctype* restrict alpha1     = a; \
-	ctype* restrict pi1        = p; \
+	      ctype           kappa_cast = *kappa; /* by-value copy; kappa is already const ctype* */ \
+	const ctype* restrict alpha1     = a; \
+	      ctype* restrict pi1        = p; \
 \
 	/* write the strictly lower part if it exists */ \
 	if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \
diff --git a/ref_kernels/1m/bli_packm_cxk_1er_ref.c b/ref_kernels/1m/bli_packm_cxk_1er_ref.c
index f3dd3d78f..53067983c 100644
--- a/ref_kernels/1m/bli_packm_cxk_1er_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_1er_ref.c
@@ -34,6 +34,7 @@
 
 #include "blis.h"
 
+
 #define PACKM_1E_BODY( ctype, ch, pragma, cdim, inca2, op ) \
 \
 do \
@@ -56,6 +57,7 @@ do \
 	} \
 } while(0)
 
+
 #define PACKM_1R_BODY( ctype, ch, pragma, cdim, inca2, op ) \
 \
 do \
@@ -74,20 +76,21 @@ do \
 	} \
 } while(0)
 
+
 #undef  GENTFUNCCO
 #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr0, bb0, arch, suf ) \
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t*          cntx  \
+             conj_t  conja, \
+             pack_t  schema, \
+             dim_t   cdim, \
+             dim_t   n, \
+             dim_t   n_max, \
+       const ctype*  kappa, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+             ctype*  p,             inc_t ldp, \
+       const cntx_t* cntx  \
      ) \
 { \
 	const dim_t dfac = PASTECH2(bb0, _, chr); \
@@ -103,11 +106,11 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		const inc_t       lda2    = 2 * lda; \
 		const inc_t       ldp2    = 2 * ldp; \
 \
-		ctype_r           kappa_r = ( ( ctype_r* )kappa )[0]; \
-		ctype_r           kappa_i = ( ( ctype_r* )kappa )[1]; \
-		ctype_r* restrict alpha1  = ( ctype_r* )a; \
-		ctype_r* restrict pi1_ri  = ( ctype_r* )p; \
-		ctype_r* restrict pi1_ir  = ( ctype_r* )p + ldp; \
+		      ctype_r           kappa_r = ( ( const ctype_r* )kappa )[0]; /* first ctype_r element of kappa */ \
+		      ctype_r           kappa_i = ( ( const ctype_r* )kappa )[1]; /* second ctype_r element of kappa */ \
+		const ctype_r* restrict alpha1  = ( const ctype_r* )a; /* read-only ctype_r view of a */ \
+		      ctype_r* restrict pi1_ri  = ( ctype_r* )p; \
+		      ctype_r* restrict pi1_ir  = ( ctype_r* )p + ldp; \
 \
 		if ( cdim == mnr && mnr != -1 ) \
 		{ \
@@ -144,11 +147,11 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		const inc_t       lda2    = 2 * lda; \
 		const inc_t       ldp2    = 2 * ldp; \
 \
-		ctype_r           kappa_r = ( ( ctype_r* )kappa )[0]; \
-		ctype_r           kappa_i = ( ( ctype_r* )kappa )[1]; \
-		ctype_r* restrict alpha1  = ( ctype_r* )a; \
-		ctype_r* restrict pi1_r   = ( ctype_r* )p; \
-		ctype_r* restrict pi1_i   = ( ctype_r* )p + ldp; \
+		      ctype_r           kappa_r = ( ( const ctype_r* )kappa )[0]; /* first ctype_r element of kappa */ \
+		      ctype_r           kappa_i = ( ( const ctype_r* )kappa )[1]; /* second ctype_r element of kappa */ \
+		const ctype_r* restrict alpha1  = ( const ctype_r* )a; /* read-only ctype_r view of a */ \
+		      ctype_r* restrict pi1_r   = ( ctype_r* )p; \
+		      ctype_r* restrict pi1_i   = ( ctype_r* )p + ldp; \
 \
 		if ( cdim == mnr && mnr != -1 ) \
 		{ \
diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c
index efbbc95e4..fcf9c5360 100644
--- a/ref_kernels/1m/bli_packm_cxk_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_ref.c
@@ -34,6 +34,7 @@
 
 #include "blis.h"
 
+
 #define PACKM_BODY( ctype, ch, pragma, cdim, inca, op ) \
 \
 do \
@@ -50,20 +51,21 @@ do \
 	} \
 } while(0)
 
+
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       ctype*  restrict p,             inc_t ldp, \
-       cntx_t*          cntx \
+             conj_t  conja, \
+             pack_t  schema, \
+             dim_t   cdim, \
+             dim_t   n, \
+             dim_t   n_max, \
+       const ctype*  kappa, \
+       const ctype*  a, inc_t inca, inc_t lda, \
+             ctype*  p,             inc_t ldp, \
+       const cntx_t* cntx  \
      ) \
 { \
 	const dim_t     mnr        = PASTECH2(mnr0, _, ch); \
@@ -71,9 +73,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	const dim_t     cdim_max   = bli_cntx_get_blksz_def_dt( dt, mnr0, cntx ); \
 	const dim_t     dfac       = PASTECH2(bb0, _, ch); \
 \
-	ctype           kappa_cast = *( ctype* )kappa; \
-	ctype* restrict alpha1     = a; \
-	ctype* restrict pi1        = p; \
+	      ctype           kappa_cast = *kappa; /* by-value copy; kappa is already const ctype* */ \
+	const ctype* restrict alpha1     = a; \
+	      ctype* restrict pi1        = p; \
 \
 	if ( cdim == mnr && mnr != -1 ) \
 	{ \
diff --git a/ref_kernels/1m/bli_unpackm_cxk_ref.c b/ref_kernels/1m/bli_unpackm_cxk_ref.c
index 172e93bdf..9a63f6971 100644
--- a/ref_kernels/1m/bli_unpackm_cxk_ref.c
+++ b/ref_kernels/1m/bli_unpackm_cxk_ref.c
@@ -34,6 +34,7 @@
 
 #include "blis.h"
 
+
 #define UNPACKM_BODY( ctype, ch, pragma, cdim, inca, op ) \
 \
 do \
@@ -49,28 +50,29 @@ do \
 	} \
 } while(0)
 
+
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t           conja, \
-       pack_t           schema, \
-       dim_t            cdim, \
-       dim_t            n, \
-       ctype*  restrict kappa, \
-       ctype*  restrict p,             inc_t ldp, \
-       ctype*  restrict a, inc_t inca, inc_t lda, \
-       cntx_t*          cntx \
+             conj_t  conja, \
+             pack_t  schema, \
+             dim_t   cdim, \
+             dim_t   n, \
+       const ctype*  kappa, \
+       const ctype*  p,             inc_t ldp, \
+             ctype*  a, inc_t inca, inc_t lda, \
+       const cntx_t* cntx  \
      ) \
 { \
 	const dim_t     mnr        = PASTECH2(mnr0, _, ch); \
     /* It's not clear if unpack needs to care about BB storage... */ \
 	const dim_t     dfac       = PASTECH2(bb0, _, ch); \
 \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict alpha1     = a; \
-	ctype* restrict pi1        = p; \
+	const ctype* restrict kappa_cast = kappa; \
+	const ctype* restrict pi1        = p; \
+	      ctype* restrict alpha1     = a; \
 \
 	if ( cdim == mnr && mnr != -1 ) \
 	{ \
diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c
index 26eda0c65..e8f4364cc 100644
--- a/ref_kernels/3/bli_gemm_ref.c
+++ b/ref_kernels/3/bli_gemm_ref.c
@@ -42,16 +42,16 @@
 \
 static void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a, \
+       const ctype*     b, \
+       const ctype*     beta, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      ) \
 { \
 	const num_t dt     = PASTEMAC(ch,type); \
@@ -70,33 +70,27 @@ static void PASTEMAC3(ch,opname,arch,suf) \
 	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 	const inc_t rs_ab  = 1; \
 	const inc_t cs_ab  = m; \
-\
-	dim_t       l, j, i; \
-\
-	ctype       ai; \
-	ctype       bj; \
-\
 \
 	/* Initialize the accumulator elements in ab to zero. */ \
-	for ( i = 0; i < m * n; ++i ) \
+	for ( dim_t i = 0; i < m * n; ++i ) \
 	{ \
 		PASTEMAC(ch,set0s)( *(ab + i) ); \
 	} \
 \
 	/* Perform a series of k rank-1 updates into ab. */ \
-	for ( l = 0; l < k; ++l ) \
+	for ( dim_t l = 0; l < k; ++l ) \
 	{ \
 		ctype* restrict abij = ab; \
 \
 		/* In an optimized implementation, these two loops over MR and NR
 		   are typically fully unrolled. */ \
-		for ( j = 0; j < n; ++j ) \
+		for ( dim_t j = 0; j < n; ++j ) \
 		{ \
-			bj = *(b + j*cs_b); \
+			ctype bj = *(b + j*cs_b); \
 \
-			for ( i = 0; i < m; ++i ) \
+			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
-				ai = *(a + i*rs_a); \
+				ctype ai = *(a + i*rs_a); \
 \
 				PASTEMAC(ch,dots)( ai, bj, *abij ); \
 \
@@ -109,7 +103,7 @@ static void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 \
 	/* Scale the result in ab by alpha. */ \
-	for ( i = 0; i < m * n; ++i ) \
+	for ( dim_t i = 0; i < m * n; ++i ) \
 	{ \
 		PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \
 	} \
@@ -151,22 +145,26 @@ INSERT_GENTFUNC_BASIC2( gemm_gen, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a, \
+       const ctype*     b, \
+       const ctype*     beta, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      ) \
 { \
 \
 	const dim_t mr = PASTECH(BLIS_MR_,ch); \
 	const dim_t nr = PASTECH(BLIS_NR_,ch); \
 \
+	/* If either BLIS_MR_? or BLIS_NR_? was left undefined by the subconfig,
+	   the compiler can't fully unroll the MR and NR loop iterations below,
+	   which means there's no benefit to using this kernel over a general-
+	   purpose implementation instead. */ \
 	if ( mr == -1 || nr == -1 ) \
 	{ \
 		PASTEMAC3(ch,gemm_gen,arch,suf) \
@@ -185,7 +183,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		return; \
 	} \
 \
-	ctype       ab[ BLIS_STACK_BUF_MAX_SIZE \
+	      ctype ab[ BLIS_STACK_BUF_MAX_SIZE \
 	                / sizeof( ctype ) ] \
 	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 	const inc_t rs_ab  = nr; \
diff --git a/ref_kernels/3/bli_gemmsup_ref.c b/ref_kernels/3/bli_gemmsup_ref.c
index 9cadb3bd6..aaa6ff742 100644
--- a/ref_kernels/3/bli_gemmsup_ref.c
+++ b/ref_kernels/3/bli_gemmsup_ref.c
@@ -43,18 +43,18 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t              conja, \
-       conj_t              conjb, \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
-       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             conj_t     conja, \
+             conj_t     conjb, \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a, inc_t rs_a, inc_t cs_a, \
+       const ctype*     b, inc_t rs_b, inc_t cs_b, \
+       const ctype*     beta, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      ) \
 { \
 	/* NOTE: This microkernel can actually handle arbitrarily large
@@ -65,22 +65,22 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		/* Traverse c by rows. */ \
 		for ( dim_t i = 0; i < m; ++i ) \
 		{ \
-			ctype* restrict ci = &c[ i*rs_c ]; \
-			ctype* restrict ai = &a[ i*rs_a ]; \
+			      ctype* restrict ci = &c[ i*rs_c ]; \
+			const ctype* restrict ai = &a[ i*rs_a ]; \
 \
 			for ( dim_t j = 0; j < n; ++j ) \
 			{ \
-				ctype* restrict cij = &ci[ j*cs_c ]; \
-				ctype* restrict bj  = &b [ j*cs_b ]; \
-				ctype           ab; \
+				      ctype* restrict cij = &ci[ j*cs_c ]; \
+				const ctype* restrict bj  = &b [ j*cs_b ]; \
+				      ctype           ab; \
 \
 				PASTEMAC(ch,set0s)( ab ); \
 \
 				/* Perform a dot product to update the (i,j) element of c. */ \
 				for ( dim_t l = 0; l < k; ++l ) \
 				{ \
-					ctype* restrict aij = &ai[ l*cs_a ]; \
-					ctype* restrict bij = &bj[ l*rs_b ]; \
+					const ctype* restrict aij = &ai[ l*cs_a ]; \
+					const ctype* restrict bij = &bj[ l*rs_b ]; \
 \
 					PASTEMAC(ch,dots)( *aij, *bij, ab ); \
 				} \
@@ -108,22 +108,22 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		/* Traverse c by rows. */ \
 		for ( dim_t i = 0; i < m; ++i ) \
 		{ \
-			ctype* restrict ci = &c[ i*rs_c ]; \
-			ctype* restrict ai = &a[ i*rs_a ]; \
+			      ctype* restrict ci = &c[ i*rs_c ]; \
+			const ctype* restrict ai = &a[ i*rs_a ]; \
 \
 			for ( dim_t j = 0; j < n; ++j ) \
 			{ \
-				ctype* restrict cij = &ci[ j*cs_c ]; \
-				ctype* restrict bj  = &b [ j*cs_b ]; \
-				ctype           ab; \
+				      ctype* restrict cij = &ci[ j*cs_c ]; \
+				const ctype* restrict bj  = &b [ j*cs_b ]; \
+				      ctype           ab; \
 \
 				PASTEMAC(ch,set0s)( ab ); \
 \
 				/* Perform a dot product to update the (i,j) element of c. */ \
 				for ( dim_t l = 0; l < k; ++l ) \
 				{ \
-					ctype* restrict aij = &ai[ l*cs_a ]; \
-					ctype* restrict bij = &bj[ l*rs_b ]; \
+					const ctype* restrict aij = &ai[ l*cs_a ]; \
+					const ctype* restrict bij = &bj[ l*rs_b ]; \
 \
 					PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
 				} \
@@ -151,22 +151,22 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		/* Traverse c by rows. */ \
 		for ( dim_t i = 0; i < m; ++i ) \
 		{ \
-			ctype* restrict ci = &c[ i*rs_c ]; \
-			ctype* restrict ai = &a[ i*rs_a ]; \
+			      ctype* restrict ci = &c[ i*rs_c ]; \
+			const ctype* restrict ai = &a[ i*rs_a ]; \
 \
 			for ( dim_t j = 0; j < n; ++j ) \
 			{ \
-				ctype* restrict cij = &ci[ j*cs_c ]; \
-				ctype* restrict bj  = &b [ j*cs_b ]; \
-				ctype           ab; \
+				      ctype* restrict cij = &ci[ j*cs_c ]; \
+				const ctype* restrict bj  = &b [ j*cs_b ]; \
+				      ctype           ab; \
 \
 				PASTEMAC(ch,set0s)( ab ); \
 \
 				/* Perform a dot product to update the (i,j) element of c. */ \
 				for ( dim_t l = 0; l < k; ++l ) \
 				{ \
-					ctype* restrict aij = &ai[ l*cs_a ]; \
-					ctype* restrict bij = &bj[ l*rs_b ]; \
+					const ctype* restrict aij = &ai[ l*cs_a ]; \
+					const ctype* restrict bij = &bj[ l*rs_b ]; \
 \
 					PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
 				} \
@@ -194,22 +194,22 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		/* Traverse c by rows. */ \
 		for ( dim_t i = 0; i < m; ++i ) \
 		{ \
-			ctype* restrict ci = &c[ i*rs_c ]; \
-			ctype* restrict ai = &a[ i*rs_a ]; \
+			      ctype* restrict ci = &c[ i*rs_c ]; \
+			const ctype* restrict ai = &a[ i*rs_a ]; \
 \
 			for ( dim_t j = 0; j < n; ++j ) \
 			{ \
-				ctype* restrict cij = &ci[ j*cs_c ]; \
-				ctype* restrict bj  = &b [ j*cs_b ]; \
-				ctype           ab; \
+				      ctype* restrict cij = &ci[ j*cs_c ]; \
+				const ctype* restrict bj  = &b [ j*cs_b ]; \
+				      ctype           ab; \
 \
 				PASTEMAC(ch,set0s)( ab ); \
 \
 				/* Perform a dot product to update the (i,j) element of c. */ \
 				for ( dim_t l = 0; l < k; ++l ) \
 				{ \
-					ctype* restrict aij = &ai[ l*cs_a ]; \
-					ctype* restrict bij = &bj[ l*rs_b ]; \
+					const ctype* restrict aij = &ai[ l*cs_a ]; \
+					const ctype* restrict bij = &bj[ l*rs_b ]; \
 \
 					PASTEMAC(ch,dots)( *aij, *bij, ab ); \
 				} \
@@ -248,18 +248,18 @@ INSERT_GENTFUNC_BASIC2( gemmsup_r, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       conj_t              conja, \
-       conj_t              conjb, \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
-       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             conj_t     conja, \
+             conj_t     conjb, \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a, inc_t rs_a, inc_t cs_a, \
+       const ctype*     b, inc_t rs_b, inc_t cs_b, \
+       const ctype*     beta, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      ) \
 { \
 	/* NOTE: This microkernel can actually handle arbitrarily large
@@ -270,22 +270,22 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		/* Traverse c by columns. */ \
 		for ( dim_t j = 0; j < n; ++j ) \
 		{ \
-			ctype* restrict cj = &c[ j*cs_c ]; \
-			ctype* restrict bj = &b[ j*cs_b ]; \
+			      ctype* restrict cj = &c[ j*cs_c ]; \
+			const ctype* restrict bj = &b[ j*cs_b ]; \
 \
 			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
-				ctype* restrict cij = &cj[ i*rs_c ]; \
-				ctype* restrict ai  = &a [ i*rs_a ]; \
-				ctype           ab; \
+				      ctype* restrict cij = &cj[ i*rs_c ]; \
+				const ctype* restrict ai  = &a [ i*rs_a ]; \
+				      ctype           ab; \
 \
 				PASTEMAC(ch,set0s)( ab ); \
 \
 				/* Perform a dot product to update the (i,j) element of c. */ \
 				for ( dim_t l = 0; l < k; ++l ) \
 				{ \
-					ctype* restrict aij = &ai[ l*cs_a ]; \
-					ctype* restrict bij = &bj[ l*rs_b ]; \
+					const ctype* restrict aij = &ai[ l*cs_a ]; \
+					const ctype* restrict bij = &bj[ l*rs_b ]; \
 \
 					PASTEMAC(ch,dots)( *aij, *bij, ab ); \
 				} \
@@ -313,22 +313,22 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		/* Traverse c by columns. */ \
 		for ( dim_t j = 0; j < n; ++j ) \
 		{ \
-			ctype* restrict cj = &c[ j*cs_c ]; \
-			ctype* restrict bj = &b[ j*cs_b ]; \
+			      ctype* restrict cj = &c[ j*cs_c ]; \
+			const ctype* restrict bj = &b[ j*cs_b ]; \
 \
 			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
-				ctype* restrict cij = &cj[ i*rs_c ]; \
-				ctype* restrict ai  = &a [ i*rs_a ]; \
-				ctype           ab; \
+				      ctype* restrict cij = &cj[ i*rs_c ]; \
+				const ctype* restrict ai  = &a [ i*rs_a ]; \
+				      ctype           ab; \
 \
 				PASTEMAC(ch,set0s)( ab ); \
 \
 				/* Perform a dot product to update the (i,j) element of c. */ \
 				for ( dim_t l = 0; l < k; ++l ) \
 				{ \
-					ctype* restrict aij = &ai[ l*cs_a ]; \
-					ctype* restrict bij = &bj[ l*rs_b ]; \
+					const ctype* restrict aij = &ai[ l*cs_a ]; \
+					const ctype* restrict bij = &bj[ l*rs_b ]; \
 \
 					PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
 				} \
@@ -356,22 +356,22 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		/* Traverse c by columns. */ \
 		for ( dim_t j = 0; j < n; ++j ) \
 		{ \
-			ctype* restrict cj = &c[ j*cs_c ]; \
-			ctype* restrict bj = &b[ j*cs_b ]; \
+			      ctype* restrict cj = &c[ j*cs_c ]; \
+			const ctype* restrict bj = &b[ j*cs_b ]; \
 \
 			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
-				ctype* restrict cij = &cj[ i*rs_c ]; \
-				ctype* restrict ai  = &a [ i*rs_a ]; \
-				ctype           ab; \
+				      ctype* restrict cij = &cj[ i*rs_c ]; \
+				const ctype* restrict ai  = &a [ i*rs_a ]; \
+				      ctype           ab; \
 \
 				PASTEMAC(ch,set0s)( ab ); \
 \
 				/* Perform a dot product to update the (i,j) element of c. */ \
 				for ( dim_t l = 0; l < k; ++l ) \
 				{ \
-					ctype* restrict aij = &ai[ l*cs_a ]; \
-					ctype* restrict bij = &bj[ l*rs_b ]; \
+					const ctype* restrict aij = &ai[ l*cs_a ]; \
+					const ctype* restrict bij = &bj[ l*rs_b ]; \
 \
 					PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
 				} \
@@ -399,22 +399,22 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		/* Traverse c by columns. */ \
 		for ( dim_t j = 0; j < n; ++j ) \
 		{ \
-			ctype* restrict cj = &c[ j*cs_c ]; \
-			ctype* restrict bj = &b[ j*cs_b ]; \
+			      ctype* restrict cj = &c[ j*cs_c ]; \
+			const ctype* restrict bj = &b[ j*cs_b ]; \
 \
 			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
-				ctype* restrict cij = &cj[ i*rs_c ]; \
-				ctype* restrict ai  = &a [ i*rs_a ]; \
-				ctype           ab; \
+				      ctype* restrict cij = &cj[ i*rs_c ]; \
+				const ctype* restrict ai  = &a [ i*rs_a ]; \
+				      ctype           ab; \
 \
 				PASTEMAC(ch,set0s)( ab ); \
 \
 				/* Perform a dot product to update the (i,j) element of c. */ \
 				for ( dim_t l = 0; l < k; ++l ) \
 				{ \
-					ctype* restrict aij = &ai[ l*cs_a ]; \
-					ctype* restrict bij = &bj[ l*rs_b ]; \
+					const ctype* restrict aij = &ai[ l*cs_a ]; \
+					const ctype* restrict bij = &bj[ l*rs_b ]; \
 \
 					PASTEMAC(ch,dots)( *aij, *bij, ab ); \
 				} \
@@ -450,383 +450,3 @@ INSERT_GENTFUNC_BASIC2( gemmsup_c, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
 INSERT_GENTFUNC_BASIC2( gemmsup_g, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
-
-
-
-
-
-
-
-#if 0
-
-//
-// -- Row storage case ---------------------------------------------------------
-//
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t              conja, \
-       conj_t              conjb, \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
-       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
-     ) \
-{ \
-	ctype       ab[ BLIS_STACK_BUF_MAX_SIZE \
-	                / sizeof( ctype ) ] \
-	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-\
-	const dim_t mn    = m * n; \
-	const inc_t rs_ab = n; \
-	const inc_t cs_ab = 1; \
-\
-\
-	/* Assumptions: m <= mr, n <= nr so that the temporary array ab is
-	   sufficiently large enough to hold the m x n microtile.
-
-	   The ability to handle m < mr and n < nr is being provided so that
-	   optimized ukernels can call one of these reference implementations
-	   for their edge cases, if they choose. When they do so, they will
-	   need to call the function directly, by its configuration-mangled
-	   name, since it will have been overwritten in the context when
-	   the optimized ukernel functions are registered. */ \
-\
-\
-	/* Initialize the accumulator elements in ab to zero. */ \
-	for ( dim_t i = 0; i < mn; ++i ) \
-	{ \
-		PASTEMAC(ch,set0s)( ab[i] ); \
-	} \
-\
-	/* Perform a series of k rank-1 updates into ab. */ \
-	for ( dim_t l = 0; l < k; ++l ) \
-	{ \
-		/* Traverse ab by rows; assume cs_ab = 1. */ \
-		for ( dim_t i = 0; i < m; ++i ) \
-		{ \
-			for ( dim_t j = 0; j < n; ++j ) \
-			{ \
-				PASTEMAC(ch,dots) \
-				( \
-				  a[ i*rs_a ], \
-				  b[ j*cs_b ], \
-				  ab[ i*rs_ab + j*cs_ab ]  \
-				); \
-			} \
-		} \
-\
-		a += cs_a; \
-		b += rs_b; \
-	} \
-\
-	/* Scale the result in ab by alpha. */ \
-	for ( dim_t i = 0; i < mn; ++i ) \
-	{ \
-		PASTEMAC(ch,scals)( *alpha, ab[i] ); \
-	} \
-\
-\
-	/* If beta is one, add ab into c. If beta is zero, overwrite c with the
-	   result in ab. Otherwise, scale by beta and accumulate ab to c. */ \
-	if ( PASTEMAC(ch,eq1)( *beta ) ) \
-	{ \
-		/* Traverse ab and c by rows; assume cs_a = cs_a = 1. */ \
-		for ( dim_t i = 0; i < m; ++i ) \
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			PASTEMAC(ch,adds) \
-			( \
-			  ab[ i*rs_ab + j*1 ], \
-			  c[  i*rs_c  + j*1 ]  \
-			) \
-		} \
-	} \
-	else if ( PASTEMAC(ch,eq0)( *beta ) ) \
-	{ \
-\
-		/* Traverse ab and c by rows; assume cs_a = cs_a = 1. */ \
-		for ( dim_t i = 0; i < m; ++i ) \
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			PASTEMAC(ch,copys) \
-			( \
-			  ab[ i*rs_ab + j*1 ], \
-			  c[  i*rs_c  + j*1 ]  \
-			) \
-		} \
-	} \
-	else /* beta != 0 && beta != 1 */ \
-	{ \
-		/* Traverse ab and c by rows; assume cs_a = cs_a = 1. */ \
-		for ( dim_t i = 0; i < m; ++i ) \
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			PASTEMAC(ch,xpbys) \
-			( \
-			  ab[ i*rs_ab + j*1 ], \
-			  *beta, \
-			  c[  i*rs_c  + j*1 ]  \
-			) \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC2( gemmsup_r, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-//
-// -- Column storage case ------------------------------------------------------
-//
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t              conja, \
-       conj_t              conjb, \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
-       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
-     ) \
-{ \
-	ctype       ab[ BLIS_STACK_BUF_MAX_SIZE \
-	                / sizeof( ctype ) ] \
-	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-\
-	const dim_t mn    = m * n; \
-	const inc_t rs_ab = 1; \
-	const inc_t cs_ab = m; \
-\
-\
-	/* Assumptions: m <= mr, n <= nr so that the temporary array ab is
-	   sufficiently large enough to hold the m x n microtile.
-
-	   The ability to handle m < mr and n < nr is being provided so that
-	   optimized ukernels can call one of these reference implementations
-	   for their edge cases, if they choose. When they do so, they will
-	   need to call the function directly, by its configuration-mangled
-	   name, since it will have been overwritten in the context when
-	   the optimized ukernel functions are registered. */ \
-\
-\
-	/* Initialize the accumulator elements in ab to zero. */ \
-	for ( dim_t i = 0; i < mn; ++i ) \
-	{ \
-		PASTEMAC(ch,set0s)( ab[i] ); \
-	} \
-\
-	/* Perform a series of k rank-1 updates into ab. */ \
-	for ( dim_t l = 0; l < k; ++l ) \
-	{ \
-		/* Traverse ab by columns; assume rs_ab = 1. */ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				PASTEMAC(ch,dots) \
-				( \
-				  a[ i*rs_a ], \
-				  b[ j*cs_b ], \
-				  ab[ i*rs_ab + j*cs_ab ]  \
-				); \
-			} \
-		} \
-\
-		a += cs_a; \
-		b += rs_b; \
-	} \
-\
-	/* Scale the result in ab by alpha. */ \
-	for ( dim_t i = 0; i < mn; ++i ) \
-	{ \
-		PASTEMAC(ch,scals)( *alpha, ab[i] ); \
-	} \
-\
-\
-	/* If beta is one, add ab into c. If beta is zero, overwrite c with the
-	   result in ab. Otherwise, scale by beta and accumulate ab to c. */ \
-	if ( PASTEMAC(ch,eq1)( *beta ) ) \
-	{ \
-		/* Traverse ab and c by columns; assume rs_a = rs_a = 1. */ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		for ( dim_t i = 0; i < m; ++i ) \
-		{ \
-			PASTEMAC(ch,adds) \
-			( \
-			  ab[ i*1 + j*cs_ab ], \
-			  c[  i*1 + j*cs_c  ]  \
-			) \
-		} \
-	} \
-	else if ( PASTEMAC(ch,eq0)( *beta ) ) \
-	{ \
-		/* Traverse ab and c by columns; assume rs_a = rs_a = 1. */ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		for ( dim_t i = 0; i < m; ++i ) \
-		{ \
-			PASTEMAC(ch,copys) \
-			( \
-			  ab[ i*1 + j*cs_ab ], \
-			  c[  i*1 + j*cs_c  ]  \
-			) \
-		} \
-	} \
-	else /* beta != 0 && beta != 1 */ \
-	{ \
-		/* Traverse ab and c by columns; assume rs_a = rs_a = 1. */ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		for ( dim_t i = 0; i < m; ++i ) \
-		{ \
-			PASTEMAC(ch,xpbys) \
-			( \
-			  ab[ i*1 + j*cs_ab ], \
-			  *beta, \
-			  c[  i*1 + j*cs_c  ]  \
-			) \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC2( gemmsup_c, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-//
-// -- General storage case -----------------------------------------------------
-//
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t              conja, \
-       conj_t              conjb, \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
-       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
-     ) \
-{ \
-	ctype       ab[ BLIS_STACK_BUF_MAX_SIZE \
-	                / sizeof( ctype ) ] \
-	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-\
-	const dim_t mn    = m * n; \
-	const inc_t rs_ab = 1; \
-	const inc_t cs_ab = m; \
-\
-\
-	/* Assumptions: m <= mr, n <= nr so that the temporary array ab is
-	   sufficiently large enough to hold the m x n microtile.
-
-	   The ability to handle m < mr and n < nr is being provided so that
-	   optimized ukernels can call one of these reference implementations
-	   for their edge cases, if they choose. When they do so, they will
-	   need to call the function directly, by its configuration-mangled
-	   name, since it will have been overwritten in the context when
-	   the optimized ukernel functions are registered. */ \
-\
-\
-	/* Initialize the accumulator elements in ab to zero. */ \
-	for ( dim_t i = 0; i < mn; ++i ) \
-	{ \
-		PASTEMAC(ch,set0s)( ab[i] ); \
-	} \
-\
-	/* Perform a series of k rank-1 updates into ab. */ \
-	for ( dim_t l = 0; l < k; ++l ) \
-	{ \
-		/* General storage: doesn't matter how we traverse ab. */ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				PASTEMAC(ch,dots) \
-				( \
-				  a[ i*rs_a ], \
-				  b[ j*cs_b ], \
-				  ab[ i*rs_ab + j*cs_ab ]  \
-				); \
-			} \
-		} \
-\
-		a += cs_a; \
-		b += rs_b; \
-	} \
-\
-	/* Scale the result in ab by alpha. */ \
-	for ( dim_t i = 0; i < mn; ++i ) \
-	{ \
-		PASTEMAC(ch,scals)( *alpha, ab[i] ); \
-	} \
-\
-\
-	/* If beta is one, add ab into c. If beta is zero, overwrite c with the
-	   result in ab. Otherwise, scale by beta and accumulate ab to c. */ \
-	if ( PASTEMAC(ch,eq1)( *beta ) ) \
-	{ \
-		/* General storage: doesn't matter how we traverse ab and c. */ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		for ( dim_t i = 0; i < m; ++i ) \
-		{ \
-			PASTEMAC(ch,adds) \
-			( \
-			  ab[ i*rs_ab + j*cs_ab ], \
-			  c[  i*rs_c  + j*cs_c  ]  \
-			) \
-		} \
-	} \
-	else if ( PASTEMAC(ch,eq0)( *beta ) ) \
-	{ \
-		/* General storage: doesn't matter how we traverse ab and c. */ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		for ( dim_t i = 0; i < m; ++i ) \
-		{ \
-			PASTEMAC(ch,copys) \
-			( \
-			  ab[ i*rs_ab + j*cs_ab ], \
-			  c[  i*rs_c  + j*cs_c  ]  \
-			) \
-		} \
-	} \
-	else /* beta != 0 && beta != 1 */ \
-	{ \
-		/* General storage: doesn't matter how we traverse ab and c. */ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		for ( dim_t i = 0; i < m; ++i ) \
-		{ \
-			PASTEMAC(ch,xpbys) \
-			( \
-			  ab[ i*rs_ab + j*cs_ab ], \
-			  *beta, \
-			  c[  i*rs_c  + j*cs_c  ]  \
-			) \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC2( gemmsup_g, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
-#endif
diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c
index 0a11aa052..14ff03780 100644
--- a/ref_kernels/3/bli_gemmtrsm_ref.c
+++ b/ref_kernels/3/bli_gemmtrsm_ref.c
@@ -42,17 +42,17 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a1x, \
-       ctype*     restrict a11, \
-       ctype*     restrict bx1, \
-       ctype*     restrict b11, \
-       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a1x, \
+       const ctype*     a11, \
+       const ctype*     bx1, \
+             ctype*     b11, \
+             ctype*     c11, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      ) \
 { \
 	const num_t dt     = PASTEMAC(ch,type); \
@@ -69,7 +69,7 @@ printf( "bli_gemmtrsm_ref(): cs_b = %d\n", (int)cs_b ); \
 printf( "bli_gemmtrsm_ref(): k nr = %d %d\n", (int)k, (int)nr ); \
 */ \
 \
-	ctype*      minus_one = PASTEMAC(ch,m1); \
+	const ctype* minus_one = PASTEMAC(ch,m1); \
 \
 	PASTECH(ch,gemm_ukr_ft) \
 	            gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
@@ -83,7 +83,7 @@ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \
                      (double*)b11, rs_b, 1, "%5.2f", "" ); \
 */ \
 \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	      ctype     ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 	/* to FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR
@@ -100,9 +100,9 @@ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \
 \
 	const bool      use_ct   = ( m < mr || n < nr ); \
 \
-	ctype* restrict c11_use  = c11; \
-	inc_t           rs_c_use = rs_c; \
-	inc_t           cs_c_use = cs_c; \
+	      ctype*    c11_use  = c11; \
+	      inc_t     rs_c_use = rs_c; \
+	      inc_t     cs_c_use = cs_c; \
 \
 	if ( use_ct ) \
 	{ \
diff --git a/ref_kernels/3/bli_trsm_ref.c b/ref_kernels/3/bli_trsm_ref.c
index f115e2a60..5a726b1da 100644
--- a/ref_kernels/3/bli_trsm_ref.c
+++ b/ref_kernels/3/bli_trsm_ref.c
@@ -42,11 +42,11 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+       const ctype*     a, \
+             ctype*     b, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      ) \
 { \
 	const num_t     dt     = PASTEMAC(ch,type); \
@@ -66,22 +66,19 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	const inc_t     rs_b   = packnr; \
 	const inc_t     cs_b   = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \
 \
-	dim_t           iter, i, j, l; \
-	dim_t           n_behind; \
-\
-	for ( iter = 0; iter < m; ++iter ) \
+	for ( dim_t iter = 0; iter < m; ++iter ) \
 	{ \
-		i        = iter; \
-		n_behind = i; \
+		dim_t i        = iter; \
+		dim_t n_behind = i; \
 \
-		ctype* restrict alpha11 = a + (i  )*rs_a + (i  )*cs_a; \
-		ctype* restrict a10t    = a + (i  )*rs_a + (0  )*cs_a; \
-		ctype* restrict B0      = b + (0  )*rs_b + (0  )*cs_b; \
-		ctype* restrict b1      = b + (i  )*rs_b + (0  )*cs_b; \
+		const ctype* restrict alpha11 = a + (i  )*rs_a + (i  )*cs_a; \
+		const ctype* restrict a10t    = a + (i  )*rs_a + (0  )*cs_a; \
+		      ctype* restrict B0      = b + (0  )*rs_b + (0  )*cs_b; \
+		      ctype* restrict b1      = b + (i  )*rs_b + (0  )*cs_b; \
 \
 		/* b1 = b1 - a10t * B0; */ \
 		/* b1 = b1 / alpha11; */ \
-		for ( j = 0; j < n; ++j ) \
+		for ( dim_t j = 0; j < n; ++j ) \
 		{ \
 			ctype* restrict b01     = B0 + (0  )*rs_b + (j  )*cs_b; \
 			ctype* restrict beta11  = b1 + (0  )*rs_b + (j  )*cs_b; \
@@ -91,10 +88,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 			/* beta11 = beta11 - a10t * b01; */ \
 			PASTEMAC(ch,set0s)( rho11 ); \
-			for ( l = 0; l < n_behind; ++l ) \
+			for ( dim_t l = 0; l < n_behind; ++l ) \
 			{ \
-				ctype* restrict alpha10 = a10t + (l  )*cs_a; \
-				ctype* restrict beta01  = b01  + (l  )*rs_b; \
+				const ctype* restrict alpha10 = a10t + (l  )*cs_a; \
+				      ctype* restrict beta01  = b01  + (l  )*rs_b; \
 \
 				PASTEMAC(ch,axpys)( *alpha10, *beta01, rho11 ); \
 			} \
@@ -158,10 +155,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		dim_t i        = m - iter - 1; \
 		dim_t n_behind = iter; \
 \
-		ctype* restrict alpha11 = a + (i  )*rs_a + (i  )*cs_a; \
-		ctype* restrict a12t    = a + (i  )*rs_a + (i+1)*cs_a; \
-		ctype* restrict b1      = b + (i  )*rs_b + (0  )*cs_b; \
-		ctype* restrict B2      = b + (i+1)*rs_b + (0  )*cs_b; \
+		const ctype* restrict alpha11 = a + (i  )*rs_a + (i  )*cs_a; \
+		const ctype* restrict a12t    = a + (i  )*rs_a + (i+1)*cs_a; \
+		      ctype* restrict b1      = b + (i  )*rs_b + (0  )*cs_b; \
+		      ctype* restrict B2      = b + (i+1)*rs_b + (0  )*cs_b; \
 \
 		/* b1 = b1 - a12t * B2; */ \
 		/* b1 = b1 / alpha11; */ \
@@ -177,8 +174,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			PASTEMAC(ch,set0s)( rho11 ); \
 			for ( dim_t l = 0; l < n_behind; ++l ) \
 			{ \
-				ctype* restrict alpha12 = a12t + (l  )*cs_a; \
-				ctype* restrict beta21  = b21  + (l  )*rs_b; \
+				const ctype* restrict alpha12 = a12t + (l  )*cs_a; \
+				      ctype* restrict beta21  = b21  + (l  )*rs_b; \
 \
 				PASTEMAC(ch,axpys)( *alpha12, *beta21, rho11 ); \
 			} \
diff --git a/ref_kernels/ind/bli_gemm1m_ref.c b/ref_kernels/ind/bli_gemm1m_ref.c
index 317cf2604..7dd55e358 100644
--- a/ref_kernels/ind/bli_gemm1m_ref.c
+++ b/ref_kernels/ind/bli_gemm1m_ref.c
@@ -39,16 +39,16 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a, \
+       const ctype*     b, \
+       const ctype*     beta, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      ) \
 { \
 	const num_t       dt        = PASTEMAC(ch,type); \
@@ -67,29 +67,30 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 	const dim_t       k2        = 2 * k; \
 \
-	ctype             ct[ BLIS_STACK_BUF_MAX_SIZE \
+	      ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                      / sizeof( ctype_r ) ] \
 	                      __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	inc_t             rs_ct; \
-	inc_t             cs_ct; \
+	      inc_t       rs_ct; \
+	      inc_t       cs_ct; \
 \
-	ctype_r* restrict a_r       = ( ctype_r* )a; \
+	const ctype_r*    a_r       = ( ctype_r* )a; \
 \
-	ctype_r* restrict b_r       = ( ctype_r* )b; \
+	const ctype_r*    b_r       = ( ctype_r* )b; \
 \
-	ctype_r* restrict zero_r    = PASTEMAC(chr,0); \
+	const ctype_r*    zero_r    = PASTEMAC(chr,0); \
 \
-	ctype_r* restrict alpha_r   = &PASTEMAC(ch,real)( *alpha ); \
-	ctype_r* restrict alpha_i   = &PASTEMAC(ch,imag)( *alpha ); \
+	const ctype_r*    alpha_r   = &PASTEMAC(ch,real)( *alpha ); \
+	const ctype_r*    alpha_i   = &PASTEMAC(ch,imag)( *alpha ); \
 \
-	ctype_r* restrict beta_r    = &PASTEMAC(ch,real)( *beta ); \
-	ctype_r* restrict beta_i    = &PASTEMAC(ch,imag)( *beta ); \
+	const ctype_r*    beta_r    = &PASTEMAC(ch,real)( *beta ); \
+	const ctype_r*    beta_i    = &PASTEMAC(ch,imag)( *beta ); \
 \
-	ctype_r*          c_use; \
-	inc_t             rs_c_use; \
-	inc_t             cs_c_use; \
+	      ctype_r*    c_use; \
 \
-	bool              using_ct; \
+	      inc_t       rs_c_use; \
+	      inc_t       cs_c_use; \
+\
+	      bool        using_ct; \
 \
 /*
 	PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: a", mr, 2*k, \
@@ -170,14 +171,12 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		  data, \
 		  cntx  \
 		); \
-\
-		dim_t i, j; \
 \
 		/* Accumulate the final result in ct back to c. */ \
 		if ( PASTEMAC(ch,eq1)( *beta ) ) \
 		{ \
-			for ( j = 0; j < n; ++j ) \
-			for ( i = 0; i < m; ++i ) \
+			for ( dim_t j = 0; j < n; ++j ) \
+			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
 				PASTEMAC(ch,adds)( *(ct + i*rs_ct + j*cs_ct), \
 				                   *(c  + i*rs_c  + j*cs_c ) ); \
@@ -185,8 +184,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		} \
 		else if ( PASTEMAC(ch,eq0)( *beta ) ) \
 		{ \
-			for ( j = 0; j < n; ++j ) \
-			for ( i = 0; i < m; ++i ) \
+			for ( dim_t j = 0; j < n; ++j ) \
+			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
 				PASTEMAC(ch,copys)( *(ct + i*rs_ct + j*cs_ct), \
 				                    *(c  + i*rs_c  + j*cs_c ) ); \
@@ -194,8 +193,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		} \
 		else \
 		{ \
-			for ( j = 0; j < n; ++j ) \
-			for ( i = 0; i < m; ++i ) \
+			for ( dim_t j = 0; j < n; ++j ) \
+			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
 				PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \
 				                    *beta, \
diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
index 1688b688d..dafc14f2c 100644
--- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c
+++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
@@ -39,17 +39,17 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       dim_t               m, \
-       dim_t               n, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a1x, \
-       ctype*     restrict a11, \
-       ctype*     restrict bx1, \
-       ctype*     restrict b11, \
-       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a1x, \
+       const ctype*     a11, \
+       const ctype*     bx1, \
+             ctype*     b11, \
+             ctype*     c11, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      ) \
 { \
 	const num_t       dt          = PASTEMAC(ch,type); \
@@ -69,14 +69,14 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	const dim_t       mr_r        = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \
 	const dim_t       nr_r        = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \
 \
-	ctype             bt[ BLIS_STACK_BUF_MAX_SIZE \
+	      ctype       bt[ BLIS_STACK_BUF_MAX_SIZE \
 	                      / sizeof( ctype ) ] \
 	                      __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	inc_t             rs_bt; \
-	inc_t             cs_bt; \
+	      inc_t       rs_bt; \
+	      inc_t       cs_bt; \
 \
-	inc_t             rs_bt_r; \
-	inc_t             cs_bt_r; \
+	      inc_t       rs_bt_r; \
+	      inc_t       cs_bt_r; \
 \
 	const dim_t       packnr      = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
 \
@@ -84,24 +84,25 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 	const dim_t       k2          = 2 * k; \
 \
-	ctype_r* restrict a1x_r       = ( ctype_r* )a1x; \
+	const ctype_r*    a1x_r       = ( ctype_r* )a1x; \
 \
-	ctype_r* restrict bx1_r       = ( ctype_r* )bx1; \
+	      ctype_r*    bx1_r       = ( ctype_r* )bx1; \
 \
 	const inc_t       rs_b        = packnr; \
 	const inc_t       cs_b        = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBN, cntx ); \
 \
-	ctype_r* restrict zero_r      = PASTEMAC(chr,0); \
-	ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
+	const ctype_r*    zero_r      = PASTEMAC(chr,0); \
+	const ctype_r*    minus_one_r = PASTEMAC(chr,m1); \
 \
 	const ctype_r     alpha_r     = PASTEMAC(ch,real)( *alpha ); \
 	const ctype_r     alpha_i     = PASTEMAC(ch,imag)( *alpha ); \
 \
-	ctype_r*          b_use; \
-	inc_t             rs_b_use; \
-	inc_t             cs_b_use; \
+	      ctype_r*    b_use; \
 \
-	ctype             ct[ BLIS_STACK_BUF_MAX_SIZE \
+	      inc_t       rs_b_use; \
+	      inc_t       cs_b_use; \
+\
+	      ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                      / sizeof( ctype ) ] \
 	                      __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 	/* FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR
@@ -112,9 +113,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 	const bool        use_ct      = ( m < mr || n < nr ); \
 \
-	ctype* restrict   c11_use     = c11; \
-	inc_t             rs_c_use    = rs_c; \
-	inc_t             cs_c_use    = cs_c; \
+	      ctype*      c11_use     = c11; \
+\
+	      inc_t       rs_c_use    = rs_c; \
+	      inc_t       cs_c_use    = cs_c; \
 \
 	if ( use_ct ) \
 	{ \
@@ -192,9 +194,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 	if ( bli_is_1e_packed( schema_b ) ) \
 	{ \
-		const inc_t       ld_b   =     rs_b; \
-		const inc_t       rs_b2  = 2 * rs_b; \
-		const inc_t       cs_b2  = 2 * cs_b; \
+		const inc_t ld_b  =     rs_b; \
+		const inc_t rs_b2 = 2 * rs_b; \
+		const inc_t cs_b2 = 2 * cs_b; \
 \
 		ctype_r* restrict b11_ri = ( ctype_r* )b11; \
 		ctype_r* restrict b11_ir = ( ctype_r* )b11 + ld_b; \
@@ -207,6 +209,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			ctype*   restrict beta11t     = bt     + i*rs_bt + j*cs_bt; \
 			ctype_r* restrict beta11t_r   = &PASTEMAC(ch,real)( *beta11t ); \
 			ctype_r* restrict beta11t_i   = &PASTEMAC(ch,imag)( *beta11t ); \
+\
 			ctype_r* restrict beta11_ri_r = b11_ri + i*rs_b2 + j*cs_b2 + 0*cs_b + d; \
 			ctype_r* restrict beta11_ri_i = b11_ri + i*rs_b2 + j*cs_b2 + 1*cs_b + d; \
 			ctype_r* restrict beta11_ir_r = b11_ir + i*rs_b2 + j*cs_b2 + 0*cs_b + d; \
@@ -228,9 +231,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 	else /* if ( bli_is_1r_packed( schema_b ) ) */ \
 	{ \
-		const inc_t       ld_b  =     rs_b; \
-		const inc_t       rs_b2 = 2 * rs_b; \
-		const inc_t       cs_b2 =     cs_b; \
+		const inc_t ld_b  =     rs_b; \
+		const inc_t rs_b2 = 2 * rs_b; \
+		const inc_t cs_b2 =     cs_b; \
 \
 		ctype_r* restrict b11_r = ( ctype_r* )b11; \
 		ctype_r* restrict b11_i = ( ctype_r* )b11 + ld_b; \
@@ -243,6 +246,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			ctype*   restrict beta11t   = bt    + i*rs_bt + j*cs_bt; \
 			ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \
 			ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \
+\
 			ctype_r* restrict beta11_r  = b11_r + i*rs_b2 + j*cs_b2 + d; \
 			ctype_r* restrict beta11_i  = b11_i + i*rs_b2 + j*cs_b2 + d; \
 \
diff --git a/ref_kernels/ind/bli_trsm1m_ref.c b/ref_kernels/ind/bli_trsm1m_ref.c
index 37551b399..9f2e20ffe 100644
--- a/ref_kernels/ind/bli_trsm1m_ref.c
+++ b/ref_kernels/ind/bli_trsm1m_ref.c
@@ -40,11 +40,11 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+       const ctype*     a, \
+             ctype*     b, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      ) \
 { \
 	const num_t       dt     = PASTEMAC(ch,type); \
@@ -69,40 +69,36 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	const inc_t       ld_b  = rs_b; \
 \
 	const pack_t      schema_b = bli_auxinfo_schema_b( data ); \
-\
-	dim_t             iter, i, j, l; \
-	dim_t             n_behind; \
-\
 \
 	if ( bli_is_1e_packed( schema_b ) ) \
 	{ \
-		const inc_t       rs_a2 = 1 * rs_a; \
-		const inc_t       cs_a2 = 2 * cs_a; \
-		const inc_t       rs_b2 = 2 * rs_b; \
-		const inc_t       cs_b2 = 2 * cs_b; \
+		const inc_t rs_a2 = 1 * rs_a; \
+		const inc_t cs_a2 = 2 * cs_a; \
+		const inc_t rs_b2 = 2 * rs_b; \
+		const inc_t cs_b2 = 2 * cs_b; \
 \
-		ctype_r* restrict a_r   = ( ctype_r* )a; \
-		ctype_r* restrict a_i   = ( ctype_r* )a + ld_a; \
+		const ctype_r* restrict a_r  = ( ctype_r* )a; \
+		const ctype_r* restrict a_i  = ( ctype_r* )a + ld_a; \
 \
-		ctype_r* restrict b_ri  = ( ctype_r* )b; \
-		ctype_r* restrict b_ir  = ( ctype_r* )b + ld_b; \
+		      ctype_r* restrict b_ri = ( ctype_r* )b; \
+		      ctype_r* restrict b_ir = ( ctype_r* )b + ld_b; \
 \
-		for ( iter = 0; iter < m; ++iter ) \
+		for ( dim_t iter = 0; iter < m; ++iter ) \
 		{ \
-			i         = iter; \
-			n_behind  = i; \
+			dim_t i         = iter; \
+			dim_t n_behind  = i; \
 \
-			ctype_r* restrict alpha11_r = a_r  + (i  )*rs_a2 + (i  )*cs_a2; \
-			ctype_r* restrict alpha11_i = a_i  + (i  )*rs_a2 + (i  )*cs_a2; \
-			ctype_r* restrict a10t_r    = a_r  + (i  )*rs_a2 + (0  )*cs_a2; \
-			ctype_r* restrict a10t_i    = a_i  + (i  )*rs_a2 + (0  )*cs_a2; \
-			ctype_r* restrict b1_ri     = b_ri + (i  )*rs_b2 + (0  )*cs_b2; \
-			ctype_r* restrict b1_ir     = b_ir + (i  )*rs_b2 + (0  )*cs_b2; \
-			ctype_r* restrict B0_ri     = b_ri + (0  )*rs_b2 + (0  )*cs_b2; \
+			const ctype_r* restrict alpha11_r = a_r  + (i  )*rs_a2 + (i  )*cs_a2; \
+			const ctype_r* restrict alpha11_i = a_i  + (i  )*rs_a2 + (i  )*cs_a2; \
+			const ctype_r* restrict a10t_r    = a_r  + (i  )*rs_a2 + (0  )*cs_a2; \
+			const ctype_r* restrict a10t_i    = a_i  + (i  )*rs_a2 + (0  )*cs_a2; \
+			      ctype_r* restrict b1_ri     = b_ri + (i  )*rs_b2 + (0  )*cs_b2; \
+			      ctype_r* restrict b1_ir     = b_ir + (i  )*rs_b2 + (0  )*cs_b2; \
+			      ctype_r* restrict B0_ri     = b_ri + (0  )*rs_b2 + (0  )*cs_b2; \
 \
 			/* b1 = b1 - a10t * B0; */ \
 			/* b1 = b1 / alpha11; */ \
-			for ( j = 0; j < n; ++j ) \
+			for ( dim_t j = 0; j < n; ++j ) \
 			{ \
 				ctype_r* restrict beta11_ri_r = b1_ri + (0  )*rs_b2 + (j  )*cs_b2 + 0*cs_b; \
 				ctype_r* restrict beta11_ri_i = b1_ri + (0  )*rs_b2 + (j  )*cs_b2 + 1*cs_b; \
@@ -118,12 +114,12 @@ void PASTEMAC3(ch,opname,arch,suf) \
 				/* beta11 = beta11 - a10t * b01; */ \
 				PASTEMAC(ch,set0ris)( rho11_r, \
 				                      rho11_i ); \
-				for ( l = 0; l < n_behind; ++l ) \
+				for ( dim_t l = 0; l < n_behind; ++l ) \
 				{ \
-					ctype_r* restrict alpha10_r = a10t_r  + (l  )*cs_a2; \
-					ctype_r* restrict alpha10_i = a10t_i  + (l  )*cs_a2; \
-					ctype_r* restrict beta01_r  = b01_ri  + (l  )*rs_b2 + 0*cs_b; \
-					ctype_r* restrict beta01_i  = b01_ri  + (l  )*rs_b2 + 1*cs_b; \
+					const ctype_r* restrict alpha10_r = a10t_r  + (l  )*cs_a2; \
+					const ctype_r* restrict alpha10_i = a10t_i  + (l  )*cs_a2; \
+					const ctype_r* restrict beta01_r  = b01_ri  + (l  )*rs_b2 + 0*cs_b; \
+					const ctype_r* restrict beta01_i  = b01_ri  + (l  )*rs_b2 + 1*cs_b; \
 \
 					PASTEMAC(ch,axpyris)( *alpha10_r, \
 					                      *alpha10_i, \
@@ -161,32 +157,32 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 	else /* ( bli_is_1r_packed( schema_b ) ) */ \
 	{ \
-		const inc_t       rs_b2 = 2 * rs_b; \
-		const inc_t       cs_b2 = 1 * cs_b; \
+		const inc_t rs_b2 = 2 * rs_b; \
+		const inc_t cs_b2 = 1 * cs_b; \
 \
-		ctype*   restrict a_ri  = ( ctype*   )a; \
-		/*ctype*   restrict a_ir  = ( ctype*   )a + ld_a/2;*/ \
+		const ctype*   restrict a_ri  = ( ctype*   )a; \
+		    /*ctype*   restrict a_ir  = ( ctype*   )a + ld_a/2;*/ \
 \
-		ctype_r* restrict b_r   = ( ctype_r* )b; \
-		ctype_r* restrict b_i   = ( ctype_r* )b + ld_b; \
+		      ctype_r* restrict b_r   = ( ctype_r* )b; \
+		      ctype_r* restrict b_i   = ( ctype_r* )b + ld_b; \
 \
-		for ( iter = 0; iter < m; ++iter ) \
+		for ( dim_t iter = 0; iter < m; ++iter ) \
 		{ \
-			i         = iter; \
-			n_behind  = i; \
-\
-			ctype*   restrict alpha11_ri = a_ri + (i  )*rs_a  + (i  )*cs_a; \
-			ctype_r* restrict alpha11_r  = &PASTEMAC(ch,real)( *alpha11_ri ); \
-			ctype_r* restrict alpha11_i  = &PASTEMAC(ch,imag)( *alpha11_ri ); \
-			ctype*   restrict a10t_ri    = a_ri + (i  )*rs_a  + (0  )*cs_a; \
-			ctype_r* restrict b1_r       = b_r  + (i  )*rs_b2 + (0  )*cs_b2; \
-			ctype_r* restrict b1_i       = b_i  + (i  )*rs_b2 + (0  )*cs_b2; \
-			ctype_r* restrict B0_r       = b_r  + (0  )*rs_b2 + (0  )*cs_b2; \
-			ctype_r* restrict B0_i       = b_i  + (0  )*rs_b2 + (0  )*cs_b2; \
+			dim_t i         = iter; \
+			dim_t n_behind  = i; \
+\
+			const ctype*   restrict alpha11_ri = a_ri + (i  )*rs_a  + (i  )*cs_a; \
+			const ctype_r* restrict alpha11_r  = &PASTEMAC(ch,real)( *alpha11_ri ); \
+			const ctype_r* restrict alpha11_i  = &PASTEMAC(ch,imag)( *alpha11_ri ); \
+			const ctype*   restrict a10t_ri    = a_ri + (i  )*rs_a  + (0  )*cs_a; \
+			      ctype_r* restrict b1_r       = b_r  + (i  )*rs_b2 + (0  )*cs_b2; \
+			      ctype_r* restrict b1_i       = b_i  + (i  )*rs_b2 + (0  )*cs_b2; \
+			      ctype_r* restrict B0_r       = b_r  + (0  )*rs_b2 + (0  )*cs_b2; \
+			      ctype_r* restrict B0_i       = b_i  + (0  )*rs_b2 + (0  )*cs_b2; \
 \
 			/* b1 = b1 - a10t * B0; */ \
 			/* b1 = b1 / alpha11; */ \
-			for ( j = 0; j < n; ++j ) \
+			for ( dim_t j = 0; j < n; ++j ) \
 			{ \
 				ctype_r* restrict beta11_r  = b1_r + (0  )*rs_b2 + (j  )*cs_b2; \
 				ctype_r* restrict beta11_i  = b1_i + (0  )*rs_b2 + (j  )*cs_b2; \
@@ -201,13 +197,13 @@ void PASTEMAC3(ch,opname,arch,suf) \
 				/* beta11 = beta11 - a10t * b01; */ \
 				PASTEMAC(ch,set0ris)( rho11_r, \
 				                      rho11_i ); \
-				for ( l = 0; l < n_behind; ++l ) \
+				for ( dim_t l = 0; l < n_behind; ++l ) \
 				{ \
-					ctype*   restrict alpha10_ri = a10t_ri + (l  )*cs_a; \
-					ctype_r* restrict alpha10_r  = &PASTEMAC(ch,real)( *alpha10_ri ); \
-					ctype_r* restrict alpha10_i  = &PASTEMAC(ch,imag)( *alpha10_ri ); \
-					ctype_r* restrict beta01_r   = b01_r   + (l  )*rs_b2; \
-					ctype_r* restrict beta01_i   = b01_i   + (l  )*rs_b2; \
+					const ctype*   restrict alpha10_ri = a10t_ri + (l  )*cs_a; \
+					const ctype_r* restrict alpha10_r  = &PASTEMAC(ch,real)( *alpha10_ri ); \
+					const ctype_r* restrict alpha10_i  = &PASTEMAC(ch,imag)( *alpha10_ri ); \
+					      ctype_r* restrict beta01_r   = b01_r   + (l  )*rs_b2; \
+					      ctype_r* restrict beta01_i   = b01_i   + (l  )*rs_b2; \
 \
 					PASTEMAC(ch,axpyris)( *alpha10_r, \
 					                      *alpha10_i, \
@@ -258,11 +254,11 @@ INSERT_GENTFUNCCO_BASIC3( trsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalri
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+       const ctype*     a, \
+             ctype*     b, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      ) \
 { \
 	const num_t       dt     = PASTEMAC(ch,type); \
@@ -287,40 +283,36 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	const inc_t       ld_b  = rs_b; \
 \
 	const pack_t      schema_b = bli_auxinfo_schema_b( data ); \
-\
-	dim_t             iter, i, j, l; \
-	dim_t             n_behind; \
-\
 \
 	if ( bli_is_1e_packed( schema_b ) ) \
 	{ \
-		const inc_t       rs_a2 = 1 * rs_a; \
-		const inc_t       cs_a2 = 2 * cs_a; \
-		const inc_t       rs_b2 = 2 * rs_b; \
-		const inc_t       cs_b2 = 2 * cs_b; \
+		const inc_t rs_a2 = 1 * rs_a; \
+		const inc_t cs_a2 = 2 * cs_a; \
+		const inc_t rs_b2 = 2 * rs_b; \
+		const inc_t cs_b2 = 2 * cs_b; \
 \
-		ctype_r* restrict a_r   = ( ctype_r* )a; \
-		ctype_r* restrict a_i   = ( ctype_r* )a + ld_a; \
+		const ctype_r* restrict a_r  = ( ctype_r* )a; \
+		const ctype_r* restrict a_i  = ( ctype_r* )a + ld_a; \
 \
-		ctype_r* restrict b_ri  = ( ctype_r* )b; \
-		ctype_r* restrict b_ir  = ( ctype_r* )b + ld_b; \
+		      ctype_r* restrict b_ri = ( ctype_r* )b; \
+		      ctype_r* restrict b_ir = ( ctype_r* )b + ld_b; \
 \
-		for ( iter = 0; iter < m; ++iter ) \
+		for ( dim_t iter = 0; iter < m; ++iter ) \
 		{ \
-			i         = m - iter - 1; \
-			n_behind  = iter; \
+			dim_t i         = m - iter - 1; \
+			dim_t n_behind  = iter; \
 \
-			ctype_r* restrict alpha11_r  = a_r  + (i  )*rs_a2 + (i  )*cs_a2; \
-			ctype_r* restrict alpha11_i  = a_i  + (i  )*rs_a2 + (i  )*cs_a2; \
-			ctype_r* restrict a12t_r     = a_r  + (i  )*rs_a2 + (i+1)*cs_a2; \
-			ctype_r* restrict a12t_i     = a_i  + (i  )*rs_a2 + (i+1)*cs_a2; \
-			ctype_r* restrict b1_ri      = b_ri + (i  )*rs_b2 + (0  )*cs_b2; \
-			ctype_r* restrict b1_ir      = b_ir + (i  )*rs_b2 + (0  )*cs_b2; \
-			ctype_r* restrict B2_ri      = b_ri + (i+1)*rs_b2 + (0  )*cs_b2; \
+			const ctype_r* restrict alpha11_r  = a_r  + (i  )*rs_a2 + (i  )*cs_a2; \
+			const ctype_r* restrict alpha11_i  = a_i  + (i  )*rs_a2 + (i  )*cs_a2; \
+			const ctype_r* restrict a12t_r     = a_r  + (i  )*rs_a2 + (i+1)*cs_a2; \
+			const ctype_r* restrict a12t_i     = a_i  + (i  )*rs_a2 + (i+1)*cs_a2; \
+			      ctype_r* restrict b1_ri      = b_ri + (i  )*rs_b2 + (0  )*cs_b2; \
+			      ctype_r* restrict b1_ir      = b_ir + (i  )*rs_b2 + (0  )*cs_b2; \
+			      ctype_r* restrict B2_ri      = b_ri + (i+1)*rs_b2 + (0  )*cs_b2; \
 \
 			/* b1 = b1 - a12t * B2; */ \
 			/* b1 = b1 / alpha11; */ \
-			for ( j = 0; j < n; ++j ) \
+			for ( dim_t j = 0; j < n; ++j ) \
 			{ \
 				ctype_r* restrict beta11_ri_r = b1_ri + (0  )*rs_b2 + (j  )*cs_b2 + 0*cs_b; \
 				ctype_r* restrict beta11_ri_i = b1_ri + (0  )*rs_b2 + (j  )*cs_b2 + 1*cs_b; \
@@ -336,12 +328,12 @@ void PASTEMAC3(ch,opname,arch,suf) \
 				/* beta11 = beta11 - a10t * b01; */ \
 				PASTEMAC(ch,set0ris)( rho11_r, \
 				                      rho11_i ); \
-				for ( l = 0; l < n_behind; ++l ) \
+				for ( dim_t l = 0; l < n_behind; ++l ) \
 				{ \
-					ctype_r* restrict alpha12_r = a12t_r + (l  )*cs_a2; \
-					ctype_r* restrict alpha12_i = a12t_i + (l  )*cs_a2; \
-					ctype_r* restrict beta21_r  = b21_ri + (l  )*rs_b2 + 0*cs_b; \
-					ctype_r* restrict beta21_i  = b21_ri + (l  )*rs_b2 + 1*cs_b; \
+					const ctype_r* restrict alpha12_r = a12t_r + (l  )*cs_a2; \
+					const ctype_r* restrict alpha12_i = a12t_i + (l  )*cs_a2; \
+					      ctype_r* restrict beta21_r  = b21_ri + (l  )*rs_b2 + 0*cs_b; \
+					      ctype_r* restrict beta21_i  = b21_ri + (l  )*rs_b2 + 1*cs_b; \
 \
 					PASTEMAC(ch,axpyris)( *alpha12_r, \
 					                      *alpha12_i, \
@@ -379,32 +371,32 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 	else /* if ( bli_is_1r_packed( schema_b ) ) */ \
 	{ \
-		const inc_t       rs_b2 = 2 * rs_b; \
-		const inc_t       cs_b2 = 1 * cs_b; \
+		const inc_t rs_b2 = 2 * rs_b; \
+		const inc_t cs_b2 = 1 * cs_b; \
 \
-		ctype*   restrict a_ri  = ( ctype*   )a; \
-		/*ctype*   restrict a_ir  = ( ctype*   )a + ld_a/2;*/ \
+		const ctype*   restrict a_ri  = ( ctype*   )a; \
+		    /*ctype*   restrict a_ir  = ( ctype*   )a + ld_a/2;*/ \
 \
-		ctype_r* restrict b_r   = ( ctype_r* )b; \
-		ctype_r* restrict b_i   = ( ctype_r* )b + ld_b; \
+		      ctype_r* restrict b_r   = ( ctype_r* )b; \
+		      ctype_r* restrict b_i   = ( ctype_r* )b + ld_b; \
 \
-		for ( iter = 0; iter < m; ++iter ) \
+		for ( dim_t iter = 0; iter < m; ++iter ) \
 		{ \
-			i         = m - iter - 1; \
-			n_behind  = iter; \
-\
-			ctype*   restrict alpha11_ri = a_ri + (i  )*rs_a  + (i  )*cs_a; \
-			ctype_r* restrict alpha11_r  = &PASTEMAC(ch,real)( *alpha11_ri ); \
-			ctype_r* restrict alpha11_i  = &PASTEMAC(ch,imag)( *alpha11_ri ); \
-			ctype*   restrict a12t_ri    = a_ri + (i  )*rs_a  + (i+1)*cs_a; \
-			ctype_r* restrict b1_r       = b_r  + (i  )*rs_b2 + (0  )*cs_b2; \
-			ctype_r* restrict b1_i       = b_i  + (i  )*rs_b2 + (0  )*cs_b2; \
-			ctype_r* restrict B2_r       = b_r  + (i+1)*rs_b2 + (0  )*cs_b2; \
-			ctype_r* restrict B2_i       = b_i  + (i+1)*rs_b2 + (0  )*cs_b2; \
+			dim_t i         = m - iter - 1; \
+			dim_t n_behind  = iter; \
+\
+			const ctype*   restrict alpha11_ri = a_ri + (i  )*rs_a  + (i  )*cs_a; \
+			const ctype_r* restrict alpha11_r  = &PASTEMAC(ch,real)( *alpha11_ri ); \
+			const ctype_r* restrict alpha11_i  = &PASTEMAC(ch,imag)( *alpha11_ri ); \
+			const ctype*   restrict a12t_ri    = a_ri + (i  )*rs_a  + (i+1)*cs_a; \
+			      ctype_r* restrict b1_r       = b_r  + (i  )*rs_b2 + (0  )*cs_b2; \
+			      ctype_r* restrict b1_i       = b_i  + (i  )*rs_b2 + (0  )*cs_b2; \
+			      ctype_r* restrict B2_r       = b_r  + (i+1)*rs_b2 + (0  )*cs_b2; \
+			      ctype_r* restrict B2_i       = b_i  + (i+1)*rs_b2 + (0  )*cs_b2; \
 \
 			/* b1 = b1 - a12t * B2; */ \
 			/* b1 = b1 / alpha11; */ \
-			for ( j = 0; j < n; ++j ) \
+			for ( dim_t j = 0; j < n; ++j ) \
 			{ \
 				ctype_r* restrict beta11_r  = b1_r + (0  )*rs_b2 + (j  )*cs_b2; \
 				ctype_r* restrict beta11_i  = b1_i + (0  )*rs_b2 + (j  )*cs_b2; \
@@ -419,13 +411,13 @@ void PASTEMAC3(ch,opname,arch,suf) \
 				/* beta11 = beta11 - a10t * b01; */ \
 				PASTEMAC(ch,set0ris)( rho11_r, \
 				                      rho11_i ); \
-				for ( l = 0; l < n_behind; ++l ) \
+				for ( dim_t l = 0; l < n_behind; ++l ) \
 				{ \
-					ctype*   restrict alpha12_ri = a12t_ri + (l  )*cs_a; \
-					ctype_r* restrict alpha12_r  = &PASTEMAC(ch,real)( *alpha12_ri ); \
-					ctype_r* restrict alpha12_i  = &PASTEMAC(ch,imag)( *alpha12_ri ); \
-					ctype_r* restrict beta21_r   = b21_r   + (l  )*rs_b2; \
-					ctype_r* restrict beta21_i   = b21_i   + (l  )*rs_b2; \
+					const ctype*   restrict alpha12_ri = a12t_ri + (l  )*cs_a; \
+					const ctype_r* restrict alpha12_r  = &PASTEMAC(ch,real)( *alpha12_ri ); \
+					const ctype_r* restrict alpha12_i  = &PASTEMAC(ch,imag)( *alpha12_ri ); \
+					      ctype_r* restrict beta21_r   = b21_r   + (l  )*rs_b2; \
+					      ctype_r* restrict beta21_i   = b21_i   + (l  )*rs_b2; \
 \
 					PASTEMAC(ch,axpyris)( *alpha12_r, \
 					                      *alpha12_i, \

From fab18dca46618799bb0b4f652820b33d36a5d4d4 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 22 Feb 2023 16:50:00 -0600
Subject: [PATCH 136/230] Use 'void*' datatypes in kernel APIs. (#727)

Details:
- Migrated all kernel APIs to use void* pointers instead of float*,
  double*, scomplex*, and dcomplex* pointers. This allows us to define
  many fewer kernel function pointer types, which also makes it much
  easier to know which function pointer type to use at any given time.
  (For example, whereas before there was ?axpyv_ker_ft, ?axpyv_ker_vft,
  and axpyv_ker_vft, now there is just axpyv_ker_ft, which is equivalent
  to what axpyv_ker_vft used to be.)
- Refactored how kernel function prototypes and kernel function types
  are defined so as to reduce redundant code. Specifically, the
  function signatures (excluding cntx_t* and, in the case of level-3
  microkernels, auxinfo_t*) are defined in new headers named, for
  example, bli_l1v_ker_params.h. Those signatures are reused via macro
  instantiation when defining both kernel prototypes and kernel function
  types. This will hopefully make it a little easier to update, add, and
  manage kernel APIs going forward.
- Updated all reference kernels according to the aforementioned switch
  to void* pointers.
- Updated all optimized kernels according to the aforementioned switch
  to void* pointers. This sometimes required renaming variables,
  inserting typecasting so that pointer arithmetic could continue to
  function as intended, and related tweaks.
- Updated sandbox/gemmlike according to the aforementioned switch to
  void* pointers.
- Renamed:
  - frame/1/bli_l1v_ft_ker.h    -> frame/1/bli_l1v_ker_ft.h
  - frame/1f/bli_l1f_ft_ker.h   -> frame/1f/bli_l1f_ker_ft.h
  - frame/1m/bli_l1m_ft_ker.h   -> frame/1m/bli_l1m_ker_ft.h
  - frame/3/bli_l3_ft_ukr.h     -> frame/3/bli_l3_ukr_ft.h
  - frame/3/bli_l3_sup_ft_ker.h -> frame/3/bli_l3_sup_ker_ft.h
  to better align with naming of neighboring files.
- Added the missing "void* params" argument to bli_?packm_struc_cxk() in
  frame/1m/packm/bli_packm_struc_cxk.c. This argument is being passed
  into the function from bli_packm_blk_var1(), but wasn't being "caught"
  by the function definition itself. The function prototype for
  bli_?packm_struc_cxk() also needed updating.
- Reordered the last two parameters in bli_?packm_struc_cxk().
  (Previously, the "void* params" was passed in after the
  "const cntx_t* cntx", although because of the above bug the params
  argument wasn't actually present in the function definition.)
---
 frame/1/bli_l1v.h                             |   3 +-
 frame/1/bli_l1v_ft_ker.h                      | 216 -----
 frame/1/bli_l1v_ker.h                         | 133 ----
 .../bli_l3_sup_ker.h => 1/bli_l1v_ker_ft.h}   |  47 +-
 frame/1/bli_l1v_ker_params.h                  | 152 ++++
 frame/1/bli_l1v_ker_prot.h                    | 210 +----
 frame/1/bli_l1v_tapi.c                        |  20 +-
 frame/1d/bli_l1d_tapi.c                       |  14 +-
 frame/1f/bli_l1f.h                            |   2 +-
 frame/1f/bli_l1f_ker_ft.h                     |  59 ++
 ...{bli_l1f_ft_ker.h => bli_l1f_ker_params.h} | 119 +--
 frame/1f/bli_l1f_ker_prot.h                   |  92 +--
 frame/1f/bli_l1f_tapi.c                       |  10 +-
 frame/1m/bli_l1m.h                            |   2 +-
 frame/1m/bli_l1m_ker.h                        |  88 --
 frame/{3/bli_l3_ukr.h => 1m/bli_l1m_ker_ft.h} |  39 +-
 ...{bli_l1m_ft_ker.h => bli_l1m_ker_params.h} |  73 +-
 frame/1m/bli_l1m_ker_prot.h                   |  59 +-
 frame/1m/bli_l1m_unb_var1.c                   |   8 +-
 frame/1m/packm/bli_packm_blk_var1.c           |  78 +-
 frame/1m/packm/bli_packm_blk_var1.h           |   4 +-
 frame/1m/packm/bli_packm_struc_cxk.c          |   5 +-
 frame/1m/packm/bli_packm_struc_cxk.h          |   1 +
 frame/1m/packm/bli_packm_struc_cxk_md.c       |  48 +-
 frame/1m/packm/bli_packm_struc_cxk_md.h       |  48 +-
 frame/1m/unpackm/bli_unpackm_blk_var1.c       |   2 +-
 frame/2/gemv/amd/bli_gemv_unf_var2_amd.c      |   4 +-
 frame/2/gemv/bli_gemv_unb_var1.c              |   4 +-
 frame/2/gemv/bli_gemv_unb_var2.c              |   4 +-
 frame/2/gemv/bli_gemv_unf_var1.c              |   4 +-
 frame/2/gemv/bli_gemv_unf_var2.c              |   4 +-
 frame/2/ger/bli_ger_unb_var1.c                |   4 +-
 frame/2/ger/bli_ger_unb_var2.c                |   4 +-
 frame/2/hemv/bli_hemv_unb_var1.c              |   6 +-
 frame/2/hemv/bli_hemv_unb_var2.c              |   4 +-
 frame/2/hemv/bli_hemv_unb_var3.c              |   6 +-
 frame/2/hemv/bli_hemv_unb_var4.c              |   4 +-
 frame/2/hemv/bli_hemv_unf_var1.c              |   4 +-
 frame/2/hemv/bli_hemv_unf_var1a.c             |   4 +-
 frame/2/hemv/bli_hemv_unf_var3.c              |   4 +-
 frame/2/hemv/bli_hemv_unf_var3a.c             |   4 +-
 frame/2/her/bli_her_unb_var1.c                |   4 +-
 frame/2/her/bli_her_unb_var2.c                |   4 +-
 frame/2/her2/bli_her2_unb_var1.c              |   4 +-
 frame/2/her2/bli_her2_unb_var2.c              |   4 +-
 frame/2/her2/bli_her2_unb_var3.c              |   4 +-
 frame/2/her2/bli_her2_unb_var4.c              |   4 +-
 frame/2/her2/bli_her2_unf_var1.c              |   4 +-
 frame/2/her2/bli_her2_unf_var4.c              |   4 +-
 frame/2/trmv/bli_trmv_unb_var1.c              |   4 +-
 frame/2/trmv/bli_trmv_unb_var2.c              |   4 +-
 frame/2/trmv/bli_trmv_unf_var1.c              |   4 +-
 frame/2/trmv/bli_trmv_unf_var2.c              |   4 +-
 frame/2/trsv/bli_trsv_unb_var1.c              |   4 +-
 frame/2/trsv/bli_trsv_unb_var2.c              |   4 +-
 frame/2/trsv/bli_trsv_unf_var1.c              |   4 +-
 frame/2/trsv/bli_trsv_unf_var2.c              |   4 +-
 frame/3/bli_l3.h                              |   5 +-
 frame/3/bli_l3_ft_ukr.h                       | 106 ---
 ...li_l3_sup_ft_ker.h => bli_l3_sup_ker_ft.h} |  27 +-
 frame/3/bli_l3_sup_ker_params.h               |  54 ++
 frame/3/bli_l3_sup_ker_prot.h                 |  28 +-
 frame/3/bli_l3_sup_packm_var.c                |   2 +-
 frame/3/bli_l3_sup_var1n2m.c                  |   7 +-
 frame/3/bli_l3_ukr_fpa.c                      |   4 +-
 frame/3/bli_l3_ukr_fpa.h                      |   2 +-
 frame/{1f/bli_l1f_ker.h => 3/bli_l3_ukr_ft.h} |  48 +-
 frame/3/bli_l3_ukr_oapi.c                     |  10 +-
 frame/3/bli_l3_ukr_params.h                   |  70 ++
 frame/3/bli_l3_ukr_prot.h                     |  55 +-
 frame/3/bli_l3_ukr_tapi.c                     |   6 +-
 frame/3/bli_l3_ukr_tapi.h                     |  64 +-
 frame/3/gemm/bli_gemm_ker_var2.c              |   4 +-
 frame/3/gemm/bli_gemm_md_c2r_ref.c            |  48 +-
 frame/3/gemm/bli_gemm_md_c2r_ref.h            |  38 +-
 frame/3/gemm/bli_gemm_var.h                   |   2 +-
 frame/3/gemmt/bli_gemmt_l_ker_var2.c          |   2 +-
 frame/3/gemmt/bli_gemmt_l_ker_var2b.c         |   2 +-
 frame/3/gemmt/bli_gemmt_u_ker_var2.c          |   2 +-
 frame/3/gemmt/bli_gemmt_u_ker_var2b.c         |   2 +-
 frame/3/trmm/bli_trmm_ll_ker_var2.c           |   2 +-
 frame/3/trmm/bli_trmm_ll_ker_var2b.c          |   2 +-
 frame/3/trmm/bli_trmm_lu_ker_var2.c           |   2 +-
 frame/3/trmm/bli_trmm_lu_ker_var2b.c          |   2 +-
 frame/3/trmm/bli_trmm_rl_ker_var2.c           |   2 +-
 frame/3/trmm/bli_trmm_rl_ker_var2b.c          |   2 +-
 frame/3/trmm/bli_trmm_ru_ker_var2.c           |   2 +-
 frame/3/trmm/bli_trmm_ru_ker_var2b.c          |   2 +-
 frame/3/trsm/bli_trsm_ll_ker_var2.c           |   4 +-
 frame/3/trsm/bli_trsm_lu_ker_var2.c           |   4 +-
 frame/3/trsm/bli_trsm_rl_ker_var2.c           |   4 +-
 frame/3/trsm/bli_trsm_ru_ker_var2.c           |   4 +-
 frame/include/bli_pre_ker_params.h            |  45 ++
 frame/include/blis.h                          |   7 +
 .../armsve/1m/bli_dpackm_armsve256_int_8xk.c  |  28 +-
 .../armsve/1m/bli_dpackm_armsve512_asm_10xk.c |  12 +-
 .../armsve/1m/bli_dpackm_armsve512_asm_16xk.c |  12 +-
 .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c  |  10 +-
 .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c  |  10 +-
 .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c  |  10 +-
 .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c  |  10 +-
 kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c   |  40 +-
 kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c   |  36 +-
 kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c |  12 +-
 kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c |  12 +-
 .../armv8a/1m/bli_packm_armv8a_int_s12xk.c    |  12 +-
 kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c |  12 +-
 kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c   |  21 +-
 kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c  |  20 +-
 .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c   |  62 +-
 .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c   |  74 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c   |  14 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c   |  14 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c   |  18 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c   |  18 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c   |  18 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c   |  34 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c   |  30 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c   |  14 +-
 .../sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c |  10 +-
 .../sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c |  10 +-
 .../sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c |  12 +-
 .../sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c |  12 +-
 .../d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c   |  16 +-
 .../d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c   |  16 +-
 kernels/bgq/1/bli_axpyv_bgq_int.c             |  10 +-
 kernels/bgq/1/bli_dotv_bgq_int.c              |  12 +-
 kernels/bgq/1f/bli_axpyf_bgq_int.c            |  85 +-
 kernels/bgq/3/bli_gemm_bgq_int_8x8.c          |  32 +-
 .../3/bli_gemm_bulldozer_asm_d4x6_fma4.c      |  46 +-
 .../haswell/1m/bli_packm_haswell_asm_c3xk.c   |  12 +-
 .../haswell/1m/bli_packm_haswell_asm_c8xk.c   |  12 +-
 .../haswell/1m/bli_packm_haswell_asm_d6xk.c   |  12 +-
 .../haswell/1m/bli_packm_haswell_asm_d8xk.c   |  12 +-
 .../haswell/1m/bli_packm_haswell_asm_s16xk.c  |  12 +-
 .../haswell/1m/bli_packm_haswell_asm_s6xk.c   |  12 +-
 .../haswell/1m/bli_packm_haswell_asm_z3xk.c   |  12 +-
 .../haswell/1m/bli_packm_haswell_asm_z4xk.c   |  12 +-
 kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c |  40 +-
 kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c |  40 +-
 .../3/bli_gemmtrsm_l_haswell_asm_d6x8.c       |  24 +-
 .../3/bli_gemmtrsm_u_haswell_asm_d6x8.c       |  24 +-
 .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c  |  54 +-
 .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c  |  74 +-
 .../3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c |  86 +-
 .../3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c |  74 +-
 .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c  | 118 +--
 .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c  | 134 ++--
 .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c | 174 ++--
 .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c | 134 ++--
 .../sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c |  16 +-
 .../d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c    |  40 +-
 .../d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c    |  40 +-
 .../d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c    |  30 +-
 .../d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c    |  42 +-
 .../d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c    |  60 +-
 .../d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c    |  60 +-
 .../d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c    |  60 +-
 .../d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c    |  62 +-
 .../s6x16/bli_gemmsup_r_haswell_ref_sMx1.c    |  83 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c   |  40 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c  |  30 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c  |  42 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c   |  40 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c   |  30 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c   |  30 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c  |  60 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c  |  62 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c   |  60 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c   |  60 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c   |  70 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c   |  60 +-
 kernels/knc/3/bli_dgemm_knc_asm_30x8.c        |  16 +-
 kernels/knc/3/bli_sgemm_knc_asm_30x16.c       |  17 +-
 kernels/knl/1m/bli_dpackm_knl_asm_24x8.c      |  20 +-
 kernels/knl/1m/bli_spackm_knl_asm_24x16.c     |  20 +-
 kernels/knl/3/bli_dgemm_knl_asm_24x8.c        |  16 +-
 kernels/knl/3/bli_sgemm_knl_asm_24x16.c       |  10 +-
 kernels/penryn/1/bli_axpyv_penryn_int.c       |  33 +-
 kernels/penryn/1/bli_dotv_penryn_int.c        |  33 +-
 kernels/penryn/1f/bli_axpy2v_penryn_int.c     |  24 +-
 kernels/penryn/1f/bli_axpyf_penryn_int.c      |  24 +-
 kernels/penryn/1f/bli_dotaxpyv_penryn_int.c   |  15 +-
 kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c  |  30 +-
 kernels/penryn/1f/bli_dotxf_penryn_int.c      |  31 +-
 kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c   |  20 +-
 .../penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c |  12 +-
 .../penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c |  12 +-
 kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c |   6 +-
 kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c |   6 +-
 .../3/bli_gemm_piledriver_asm_d8x3.c          |  40 +-
 kernels/power10/3/bli_dgemm_power10_mma.c     |  14 +-
 kernels/power10/3/bli_i16gemm_power10_mma.c   |  23 +-
 kernels/power10/3/bli_i16sgemm_power10_mma.c  |  19 +-
 kernels/power10/3/bli_i4gemm_power10_mma.c    |  19 +-
 kernels/power10/3/bli_i8gemm_power10_mma.c    |  19 +-
 kernels/power10/3/bli_sbgemm_power10_mma.c    |  19 +-
 kernels/power10/3/bli_sgemm_power10_mma.c     |  19 +-
 kernels/power10/3/bli_shgemm_power10_mma.c    |  21 +-
 kernels/power7/3/bli_gemm_power7_int_8x4.c    |  64 +-
 kernels/power9/3/bli_gemm_power9_asm_d12x6.c  |  10 +-
 .../3/bli_gemm_sandybridge_asm_d8x4.c         |  40 +-
 .../3/bli_gemm_sandybridge_int_d8x4.c         |  44 +-
 kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c    |  10 +-
 kernels/skx/3/bli_dgemm_skx_asm_16x14.c       |  10 +-
 kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c    |  10 +-
 kernels/zen/1/bli_amaxv_zen_int.c             |  28 +-
 kernels/zen/1/bli_axpyv_zen_int.c             | 100 +--
 kernels/zen/1/bli_axpyv_zen_int10.c           | 372 ++++-----
 kernels/zen/1/bli_copyv_zen_int.c             |  14 +-
 kernels/zen/1/bli_dotv_zen_int.c              | 106 +--
 kernels/zen/1/bli_dotv_zen_int10.c            | 286 +++----
 kernels/zen/1/bli_dotxv_zen_int.c             | 114 +--
 kernels/zen/1/bli_scalv_zen_int.c             |  74 +-
 kernels/zen/1/bli_scalv_zen_int10.c           | 256 +++---
 kernels/zen/1/bli_setv_zen_int.c              |  14 +-
 kernels/zen/1/bli_swapv_zen_int8.c            | 337 ++++----
 kernels/zen/1f/bli_axpyf_zen_int_4.c          |  48 +-
 kernels/zen/1f/bli_axpyf_zen_int_5.c          | 750 +++++++++---------
 kernels/zen/1f/bli_axpyf_zen_int_8.c          | 226 +++---
 kernels/zen/1f/bli_dotxf_zen_int_8.c          | 306 +++----
 ref_kernels/1/bli_addv_ref.c                  |  24 +-
 ref_kernels/1/bli_amaxv_ref.c                 |   4 +-
 ref_kernels/1/bli_axpbyv_ref.c                |  73 +-
 ref_kernels/1/bli_axpyv_ref.c                 | 109 +--
 ref_kernels/1/bli_copyv_ref.c                 |   7 +-
 ref_kernels/1/bli_dotv_ref.c                  |  10 +-
 ref_kernels/1/bli_dotxv_ref.c                 |  16 +-
 ref_kernels/1/bli_invertv_ref.c               |   4 +-
 ref_kernels/1/bli_invscalv_ref.c              |   7 +-
 ref_kernels/1/bli_scal2v_ref.c                |  24 +-
 ref_kernels/1/bli_scalv_ref.c                 |  13 +-
 ref_kernels/1/bli_setv_ref.c                  |   7 +-
 ref_kernels/1/bli_subv_ref.c                  |   7 +-
 ref_kernels/1/bli_swapv_ref.c                 |   7 +-
 ref_kernels/1/bli_xpbyv_ref.c                 |  26 +-
 ref_kernels/1f/bli_axpy2v_ref.c               |  22 +-
 ref_kernels/1f/bli_axpyf_ref.c                |  19 +-
 ref_kernels/1f/bli_dotaxpyv_ref.c             |  26 +-
 ref_kernels/1f/bli_dotxaxpyf_ref.c            |  32 +-
 ref_kernels/1f/bli_dotxf_ref.c                |  22 +-
 ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c   |   6 +-
 ref_kernels/1m/bli_packm_cxc_diag_ref.c       |   6 +-
 ref_kernels/1m/bli_packm_cxk_1er_ref.c        |   6 +-
 ref_kernels/1m/bli_packm_cxk_ref.c            |   6 +-
 ref_kernels/1m/bli_unpackm_cxk_ref.c          |   6 +-
 ref_kernels/3/bli_gemm_ref.c                  |  31 +-
 ref_kernels/3/bli_gemmsup_ref.c               |  32 +-
 ref_kernels/3/bli_gemmtrsm_ref.c              |  25 +-
 ref_kernels/3/bli_trsm_ref.c                  |  24 +-
 ref_kernels/bli_cntx_ref.c                    | 196 +++--
 ref_kernels/ind/bli_gemm1m_ref.c              |  19 +-
 ref_kernels/ind/bli_gemmtrsm1m_ref.c          |  27 +-
 ref_kernels/ind/bli_trsm1m_ref.c              |  20 +-
 sandbox/gemmlike/bls_gemm_bp_var1.c           |   3 +-
 sandbox/gemmlike/bls_packm_cxk.c              |   4 +-
 256 files changed, 4810 insertions(+), 5112 deletions(-)
 delete mode 100644 frame/1/bli_l1v_ft_ker.h
 delete mode 100644 frame/1/bli_l1v_ker.h
 rename frame/{3/bli_l3_sup_ker.h => 1/bli_l1v_ker_ft.h} (68%)
 create mode 100644 frame/1/bli_l1v_ker_params.h
 create mode 100644 frame/1f/bli_l1f_ker_ft.h
 rename frame/1f/{bli_l1f_ft_ker.h => bli_l1f_ker_params.h} (52%)
 delete mode 100644 frame/1m/bli_l1m_ker.h
 rename frame/{3/bli_l3_ukr.h => 1m/bli_l1m_ker_ft.h} (71%)
 rename frame/1m/{bli_l1m_ft_ker.h => bli_l1m_ker_params.h} (64%)
 delete mode 100644 frame/3/bli_l3_ft_ukr.h
 rename frame/3/{bli_l3_sup_ft_ker.h => bli_l3_sup_ker_ft.h} (74%)
 create mode 100644 frame/3/bli_l3_sup_ker_params.h
 rename frame/{1f/bli_l1f_ker.h => 3/bli_l3_ukr_ft.h} (68%)
 create mode 100644 frame/3/bli_l3_ukr_params.h
 create mode 100644 frame/include/bli_pre_ker_params.h

diff --git a/frame/1/bli_l1v.h b/frame/1/bli_l1v.h
index 99ceb3a3f..9f0e7734a 100644
--- a/frame/1/bli_l1v.h
+++ b/frame/1/bli_l1v.h
@@ -35,8 +35,7 @@
 #include "bli_l1v_check.h"
 
 // Define kernel function types.
-//#include "bli_l1v_ft_ex.h"
-#include "bli_l1v_ft_ker.h"
+#include "bli_l1v_ker_ft.h"
 
 // Prototype object APIs (expert and non-expert).
 #include "bli_oapi_ex.h"
diff --git a/frame/1/bli_l1v_ft_ker.h b/frame/1/bli_l1v_ft_ker.h
deleted file mode 100644
index 1c5575ec7..000000000
--- a/frame/1/bli_l1v_ft_ker.h
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_L1V_FT_KER_H
-#define BLIS_L1V_FT_KER_H
-
-
-//
-// -- Level-1v kernel function types -------------------------------------------
-//
-
-// addv, copyv, subv
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
-\
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
-             conj_t  conjx, \
-             dim_t   n, \
-       const ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-     );
-
-INSERT_GENTDEF( addv )
-INSERT_GENTDEF( copyv )
-INSERT_GENTDEF( subv )
-
-// amaxv
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
-\
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
-             dim_t   n, \
-       const ctype*  x, inc_t incx, \
-             dim_t*  index, \
-       const cntx_t* cntx  \
-     );
-
-INSERT_GENTDEF( amaxv )
-
-// axpbyv
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
-\
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
-             conj_t  conjx, \
-             dim_t   n, \
-       const ctype*  alpha, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  beta, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-     );
-
-INSERT_GENTDEF( axpbyv )
-
-// axpyv, scal2v
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
-\
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
-             conj_t  conjx, \
-             dim_t   n, \
-       const ctype*  alpha, \
-       const ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-     );
-
-INSERT_GENTDEF( axpyv )
-INSERT_GENTDEF( scal2v )
-
-// dotv
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
-\
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
-             conj_t  conjx, \
-             conj_t  conjy, \
-             dim_t   n, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  y, inc_t incy, \
-             ctype*  rho, \
-       const cntx_t* cntx  \
-     );
-
-INSERT_GENTDEF( dotv )
-
-// dotxv
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
-\
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
-             conj_t  conjx, \
-             conj_t  conjy, \
-             dim_t   n, \
-       const ctype*  alpha, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  y, inc_t incy, \
-       const ctype*  beta, \
-             ctype*  rho, \
-       const cntx_t* cntx  \
-     );
-
-INSERT_GENTDEF( dotxv )
-
-// invertv
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
-\
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
-             dim_t   n, \
-             ctype*  x, inc_t incx, \
-       const cntx_t* cntx  \
-     );
-
-INSERT_GENTDEF( invertv )
-
-// invscalv, scalv, setv
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
-\
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
-             conj_t  conjalpha, \
-             dim_t   n, \
-       const ctype*  alpha, \
-             ctype*  x, inc_t incx, \
-       const cntx_t* cntx  \
-     );
-
-INSERT_GENTDEF( invscalv )
-INSERT_GENTDEF( scalv )
-INSERT_GENTDEF( setv )
-
-// swapv
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
-\
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
-             dim_t   n, \
-             ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-     );
-
-INSERT_GENTDEF( swapv )
-
-// xpybv
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
-\
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
-             conj_t  conjx, \
-             dim_t   n, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  beta, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-     );
-
-INSERT_GENTDEF( xpbyv )
-
-
-#endif
-
diff --git a/frame/1/bli_l1v_ker.h b/frame/1/bli_l1v_ker.h
deleted file mode 100644
index 4ebbffa82..000000000
--- a/frame/1/bli_l1v_ker.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-
-//
-// Define template prototypes for level-1v kernels.
-//
-
-// Note: Instead of defining function prototype macro templates and then
-// instantiating those macros to define the individual function prototypes,
-// we simply alias the official operations' prototypes as defined in
-// bli_l1v_ker_prot.h.
-
-#undef  GENTPROT
-#define GENTPROT ADDV_KER_PROT
-
-INSERT_GENTPROT_BASIC0( addv_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT AMAXV_KER_PROT
-
-INSERT_GENTPROT_BASIC0( amaxv_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT AXPBYV_KER_PROT
-
-INSERT_GENTPROT_BASIC0( axpbyv_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT AXPYV_KER_PROT
-
-INSERT_GENTPROT_BASIC0( axpyv_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT COPYV_KER_PROT
-
-INSERT_GENTPROT_BASIC0( copyv_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT DOTV_KER_PROT
-
-INSERT_GENTPROT_BASIC0( dotv_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT DOTXV_KER_PROT
-
-INSERT_GENTPROT_BASIC0( dotxv_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT INVERTV_KER_PROT
-
-INSERT_GENTPROT_BASIC0( invertv_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT INVSCALV_KER_PROT
-
-INSERT_GENTPROT_BASIC0( invscalv_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT SCALV_KER_PROT
-
-INSERT_GENTPROT_BASIC0( scalv_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT SCAL2V_KER_PROT
-
-INSERT_GENTPROT_BASIC0( scal2v_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT SETV_KER_PROT
-
-INSERT_GENTPROT_BASIC0( setv_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT SUBV_KER_PROT
-
-INSERT_GENTPROT_BASIC0( subv_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT SWAPV_KER_PROT
-
-INSERT_GENTPROT_BASIC0( swapv_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT XPBYV_KER_PROT
-
-INSERT_GENTPROT_BASIC0( xpbyv_ker_name )
-
diff --git a/frame/3/bli_l3_sup_ker.h b/frame/1/bli_l1v_ker_ft.h
similarity index 68%
rename from frame/3/bli_l3_sup_ker.h
rename to frame/1/bli_l1v_ker_ft.h
index 6c77fffe0..3c0673f48 100644
--- a/frame/3/bli_l3_sup_ker.h
+++ b/frame/1/bli_l1v_ker_ft.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2014, The University of Texas at Austin
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,25 +32,38 @@
 
 */
 
-//
-// Define template prototypes for level-3 kernels on small/unpacked matrices.
-//
+#ifndef BLIS_L1V_KER_FT_H
+#define BLIS_L1V_KER_FT_H
 
-// Note: Instead of defining function prototype macro templates and then
-// instantiating those macros to define the individual function prototypes,
-// we simply alias the official operations' prototypes as defined in
-// bli_l3_ker_prot.h.
 
-#undef  GENTPROT
-#define GENTPROT GEMMSUP_KER_PROT
+//
+// -- Level-1v kernel function types -------------------------------------------
+//
 
-INSERT_GENTPROT_BASIC0( gemmsup_rv_ukr_name )
-INSERT_GENTPROT_BASIC0( gemmsup_rg_ukr_name )
-INSERT_GENTPROT_BASIC0( gemmsup_cv_ukr_name )
-INSERT_GENTPROT_BASIC0( gemmsup_cg_ukr_name )
+#undef  GENTDEF
+#define GENTDEF( opname ) \
+/* <opname>_ker_ft: pointer to a void function taking <opname>_params, then BLIS_CNTX_PARAM. */ \
+typedef void (*PASTECH(opname,_ker_ft)) \
+     ( \
+       PASTECH(opname,_params), \
+       BLIS_CNTX_PARAM  \
+     );
 
-INSERT_GENTPROT_BASIC0( gemmsup_rd_ukr_name )
-INSERT_GENTPROT_BASIC0( gemmsup_cd_ukr_name )
+GENTDEF( addv )
+GENTDEF( amaxv )
+GENTDEF( axpbyv )
+GENTDEF( axpyv )
+GENTDEF( copyv )
+GENTDEF( dotv )
+GENTDEF( dotxv )
+GENTDEF( invertv )
+GENTDEF( invscalv )
+GENTDEF( scalv )
+GENTDEF( scal2v )
+GENTDEF( setv )
+GENTDEF( subv )
+GENTDEF( swapv )
+GENTDEF( xpbyv )
 
-INSERT_GENTPROT_BASIC0( gemmsup_gx_ukr_name )
+#endif
 
diff --git a/frame/1/bli_l1v_ker_params.h b/frame/1/bli_l1v_ker_params.h
new file mode 100644
index 000000000..3afe84a86
--- /dev/null
+++ b/frame/1/bli_l1v_ker_params.h
@@ -0,0 +1,152 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_L1V_KER_PARAMS_H
+#define BLIS_L1V_KER_PARAMS_H
+
+// Level-1v kernel parameter lists, operands as void*. BLIS_CNTX_PARAM is appended by users.
+#define addv_params \
+/* Identical to copyv_params and subv_params. */ \
+             conj_t  conjx, \
+             dim_t   n, \
+       const void*   x, inc_t incx, \
+             void*   y, inc_t incy
+
+#define amaxv_params \
+\
+             dim_t   n, \
+       const void*   x, inc_t incx, \
+             dim_t*  index
+
+#define axpbyv_params \
+\
+             conj_t  conjx, \
+             dim_t   n, \
+       const void*   alpha, \
+       const void*   x, inc_t incx, \
+       const void*   beta, \
+             void*   y, inc_t incy
+
+#define axpyv_params \
+/* Identical to scal2v_params. */ \
+             conj_t  conjx, \
+             dim_t   n, \
+       const void*   alpha, \
+       const void*   x, inc_t incx, \
+             void*   y, inc_t incy
+
+#define copyv_params \
+/* Identical to addv_params and subv_params. */ \
+             conj_t  conjx, \
+             dim_t   n, \
+       const void*   x, inc_t incx, \
+             void*   y, inc_t incy
+
+#define dotv_params \
+\
+             conj_t  conjx, \
+             conj_t  conjy, \
+             dim_t   n, \
+       const void*   x, inc_t incx, \
+       const void*   y, inc_t incy, \
+             void*   rho
+
+#define dotxv_params \
+\
+             conj_t  conjx, \
+             conj_t  conjy, \
+             dim_t   n, \
+       const void*   alpha, \
+       const void*   x, inc_t incx, \
+       const void*   y, inc_t incy, \
+       const void*   beta, \
+             void*   rho
+
+#define invertv_params \
+\
+             dim_t   n, \
+             void*   x, inc_t incx
+
+#define invscalv_params \
+/* Identical to scalv_params and setv_params. */ \
+             conj_t  conjalpha, \
+             dim_t   n, \
+       const void*   alpha, \
+             void*   x, inc_t incx
+
+#define scalv_params \
+/* Identical to invscalv_params and setv_params. */ \
+             conj_t  conjalpha, \
+             dim_t   n, \
+       const void*   alpha, \
+             void*   x, inc_t incx
+
+#define scal2v_params \
+/* Identical to axpyv_params. */ \
+             conj_t  conjx, \
+             dim_t   n, \
+       const void*   alpha, \
+       const void*   x, inc_t incx, \
+             void*   y, inc_t incy
+
+#define setv_params \
+/* Identical to invscalv_params and scalv_params. */ \
+             conj_t  conjalpha, \
+             dim_t   n, \
+       const void*   alpha, \
+             void*   x, inc_t incx
+
+#define subv_params \
+/* Identical to addv_params and copyv_params. */ \
+             conj_t  conjx, \
+             dim_t   n, \
+       const void*   x, inc_t incx, \
+             void*   y, inc_t incy
+
+#define swapv_params \
+\
+             dim_t   n, \
+             void*   x, inc_t incx, \
+             void*   y, inc_t incy
+
+#define xpbyv_params \
+\
+             conj_t  conjx, \
+             dim_t   n, \
+       const void*   x, inc_t incx, \
+       const void*   beta, \
+             void*   y, inc_t incy
+
+#endif
+
diff --git a/frame/1/bli_l1v_ker_prot.h b/frame/1/bli_l1v_ker_prot.h
index add65bba5..8c2c63eb9 100644
--- a/frame/1/bli_l1v_ker_prot.h
+++ b/frame/1/bli_l1v_ker_prot.h
@@ -32,194 +32,38 @@
 
 */
 
+#ifndef BLIS_L1V_KER_PROT_H
+#define BLIS_L1V_KER_PROT_H
 
 //
 // Define template prototypes for level-1v kernels.
 //
 
-#define ADDV_KER_PROT( ctype, ch, opname ) \
+#undef  L1VTPROT
+#define L1VTPROT( ctype, ch, funcname, opname ) \
 \
-void PASTEMAC(ch,opname) \
-      ( \
-             conj_t  conjx, \
-             dim_t   n, \
-       const ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-      );
-
-
-#define AMAXV_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             dim_t   n, \
-       const ctype*  x, inc_t incx, \
-             dim_t*  index, \
-       const cntx_t* cntx  \
-     ); \
-
-
-#define AXPBYV_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             conj_t  conjx, \
-             dim_t   n, \
-       const ctype*  alpha, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  beta, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-     ); \
-
-
-#define AXPYV_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             conj_t  conjx, \
-             dim_t   n, \
-       const ctype*  alpha, \
-       const ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-     ); \
-
-
-#define COPYV_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-      ( \
-             conj_t  conjx, \
-             dim_t   n, \
-       const ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-      );
-
-
-#define DOTV_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             conj_t  conjx, \
-             conj_t  conjy, \
-             dim_t   n, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  y, inc_t incy, \
-             ctype*  rho, \
-       const cntx_t* cntx  \
-     ); \
-
-
-#define DOTXV_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             conj_t  conjx, \
-             conj_t  conjy, \
-             dim_t   n, \
-       const ctype*  alpha, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  y, inc_t incy, \
-       const ctype*  beta, \
-             ctype*  rho, \
-       const cntx_t* cntx  \
-     ); \
-
-
-#define INVERTV_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             dim_t   n, \
-             ctype*  x, inc_t incx, \
-       const cntx_t* cntx  \
-     ); \
-
-
-#define INVSCALV_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             conj_t  conjalpha, \
-             dim_t   n, \
-       const ctype*  alpha, \
-             ctype*  x, inc_t incx, \
-       const cntx_t* cntx  \
-     ); \
-
-
-#define SCALV_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             conj_t  conjalpha, \
-             dim_t   n, \
-       const ctype*  alpha, \
-             ctype*  x, inc_t incx, \
-       const cntx_t* cntx  \
-     ); \
-
-
-#define SCAL2V_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             conj_t  conjx, \
-             dim_t   n, \
-       const ctype*  alpha, \
-       const ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-     ); \
-
-
-#define SETV_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             conj_t  conjalpha, \
-             dim_t   n, \
-       const ctype*  alpha, \
-             ctype*  x, inc_t incx, \
-       const cntx_t* cntx  \
-     ); \
-
-
-#define SUBV_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-      ( \
-             conj_t  conjx, \
-             dim_t   n, \
-       const ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-      );
-
-
-#define SWAPV_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             dim_t   n, \
-             ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-     ); \
-
-
-#define XPBYV_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
+void PASTEMAC(ch,funcname) \
      ( \
-             conj_t  conjx, \
-             dim_t   n, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  beta, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-     ); \
+       PASTECH(opname,_params), \
+       BLIS_CNTX_PARAM  \
+     );
+// Per-operation aliases of L1VTPROT, which already ends in ';' (so none is added here).
+#define ADDV_KER_PROT(     ctype, ch, fn )  L1VTPROT( ctype, ch, fn, addv )
+#define AMAXV_KER_PROT(    ctype, ch, fn )  L1VTPROT( ctype, ch, fn, amaxv )
+#define AXPBYV_KER_PROT(   ctype, ch, fn )  L1VTPROT( ctype, ch, fn, axpbyv )
+#define AXPYV_KER_PROT(    ctype, ch, fn )  L1VTPROT( ctype, ch, fn, axpyv )
+#define COPYV_KER_PROT(    ctype, ch, fn )  L1VTPROT( ctype, ch, fn, copyv )
+#define DOTV_KER_PROT(     ctype, ch, fn )  L1VTPROT( ctype, ch, fn, dotv )
+#define DOTXV_KER_PROT(    ctype, ch, fn )  L1VTPROT( ctype, ch, fn, dotxv )
+#define INVERTV_KER_PROT(  ctype, ch, fn )  L1VTPROT( ctype, ch, fn, invertv )
+#define INVSCALV_KER_PROT( ctype, ch, fn )  L1VTPROT( ctype, ch, fn, invscalv )
+#define SCALV_KER_PROT(    ctype, ch, fn )  L1VTPROT( ctype, ch, fn, scalv )
+#define SCAL2V_KER_PROT(   ctype, ch, fn )  L1VTPROT( ctype, ch, fn, scal2v )
+#define SETV_KER_PROT(     ctype, ch, fn )  L1VTPROT( ctype, ch, fn, setv )
+#define SUBV_KER_PROT(     ctype, ch, fn )  L1VTPROT( ctype, ch, fn, subv )
+#define SWAPV_KER_PROT(    ctype, ch, fn )  L1VTPROT( ctype, ch, fn, swapv )
+#define XPBYV_KER_PROT(    ctype, ch, fn )  L1VTPROT( ctype, ch, fn, xpbyv )
+
+
+#endif
 
diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c
index b22ba365f..9c3f4a30c 100644
--- a/frame/1/bli_l1v_tapi.c
+++ b/frame/1/bli_l1v_tapi.c
@@ -61,7 +61,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
@@ -98,7 +98,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
@@ -135,7 +135,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
@@ -175,7 +175,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	if ( cntx == NULL ) \
 		cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
@@ -215,7 +215,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
@@ -257,7 +257,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
@@ -295,7 +295,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
@@ -329,7 +329,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
@@ -366,7 +366,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
@@ -401,7 +401,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c
index 907afb703..60d5cf1d6 100644
--- a/frame/1d/bli_l1d_tapi.c
+++ b/frame/1d/bli_l1d_tapi.c
@@ -101,7 +101,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Query the context for the operation's kernel address. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the kernel with the appropriate parameters. */ \
 	f \
@@ -181,7 +181,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Query the context for the operation's kernel address. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the kernel with the appropriate parameters. */ \
 	f \
@@ -241,7 +241,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Query the context for the operation's kernel address. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the kernel with the appropriate parameters. */ \
 	f \
@@ -299,7 +299,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Query the context for the operation's kernel address. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the kernel with the appropriate parameters. */ \
 	f \
@@ -377,7 +377,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Query the context for the operation's kernel address. */ \
-	PASTECH2(chr,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt_r, kerid, cntx ); \
+	PASTECH(kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt_r, kerid, cntx ); \
 \
 	/* Invoke the kernel with the appropriate parameters. */ \
 	f \
@@ -436,7 +436,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Query the context for the operation's kernel address. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the kernel with the appropriate parameters. */ \
 	f \
@@ -514,7 +514,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Query the context for the operation's kernel address. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the kernel with the appropriate parameters. */ \
 	f \
diff --git a/frame/1f/bli_l1f.h b/frame/1f/bli_l1f.h
index 43676ec4e..fde1bec33 100644
--- a/frame/1f/bli_l1f.h
+++ b/frame/1f/bli_l1f.h
@@ -35,7 +35,7 @@
 #include "bli_l1f_check.h"
 
 // Define kernel function types.
-#include "bli_l1f_ft_ker.h"
+#include "bli_l1f_ker_ft.h"
 
 // Prototype object APIs (expert and non-expert).
 #include "bli_oapi_ex.h"
diff --git a/frame/1f/bli_l1f_ker_ft.h b/frame/1f/bli_l1f_ker_ft.h
new file mode 100644
index 000000000..bb88a7105
--- /dev/null
+++ b/frame/1f/bli_l1f_ker_ft.h
@@ -0,0 +1,59 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_L1F_KER_FT_H
+#define BLIS_L1F_KER_FT_H
+
+
+//
+// -- Level-1f kernel function types -------------------------------------------
+//
+
+#undef  GENTDEF
+#define GENTDEF( opname ) \
+/* <opname>_ker_ft: pointer to a void function taking <opname>_params, then BLIS_CNTX_PARAM. */ \
+typedef void (*PASTECH(opname,_ker_ft)) \
+     ( \
+       PASTECH(opname,_params), \
+       BLIS_CNTX_PARAM  \
+     );
+
+GENTDEF( axpy2v )
+GENTDEF( axpyf )
+GENTDEF( dotaxpyv )
+GENTDEF( dotxaxpyf )
+GENTDEF( dotxf )
+
+#endif
+
diff --git a/frame/1f/bli_l1f_ft_ker.h b/frame/1f/bli_l1f_ker_params.h
similarity index 52%
rename from frame/1f/bli_l1f_ft_ker.h
rename to frame/1f/bli_l1f_ker_params.h
index 54557c1da..c6963c995 100644
--- a/frame/1f/bli_l1f_ft_ker.h
+++ b/frame/1f/bli_l1f_ker_params.h
@@ -32,122 +32,71 @@
 
 */
 
-#ifndef BLIS_L1F_FT_KER_H
-#define BLIS_L1F_FT_KER_H
+#ifndef BLIS_L1F_KER_PARAMS_H
+#define BLIS_L1F_KER_PARAMS_H
 
 
-//
-// -- Level-1f kernel function types -------------------------------------------
-//
-
-// axpy2v
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
+#define axpy2v_params /* axpy2v kernel arguments; operands are void*, cntx is not included */ \
 \
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
              conj_t  conjx, \
              conj_t  conjy, \
              dim_t   n, \
-       const ctype*  alphax, \
-       const ctype*  alphay, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  y, inc_t incy, \
-             ctype*  z, inc_t incz, \
-       const cntx_t* cntx  \
-     );
-
-INSERT_GENTDEF( axpy2v )
-
-// axpyf
+       const void*   alphax, \
+       const void*   alphay, \
+       const void*   x, inc_t incx, \
+       const void*   y, inc_t incy, \
+             void*   z, inc_t incz
 
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
+#define axpyf_params /* axpyf kernel arguments; operands are void*, cntx is not included */ \
 \
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
              conj_t  conja, \
              conj_t  conjx, \
              dim_t   m, \
              dim_t   b_n, \
-       const ctype*  alpha, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-       const ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-     );
+       const void*   alpha, \
+       const void*   a, inc_t inca, inc_t lda, \
+       const void*   x, inc_t incx, \
+             void*   y, inc_t incy
 
-INSERT_GENTDEF( axpyf )
-
-// dotaxpyv
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
+#define dotaxpyv_params /* dotaxpyv kernel arguments; operands are void*, cntx is not included */ \
 \
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
              conj_t  conjxt, \
              conj_t  conjx, \
              conj_t  conjy, \
              dim_t   m, \
-       const ctype*  alpha, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  y, inc_t incy, \
-             ctype*  rho, \
-             ctype*  z, inc_t incz, \
-       const cntx_t* cntx  \
-     );
-
-INSERT_GENTDEF( dotaxpyv )
+       const void*   alpha, \
+       const void*   x, inc_t incx, \
+       const void*   y, inc_t incy, \
+             void*   rho, \
+             void*   z, inc_t incz
 
-// dotxaxpyf
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
+#define dotxaxpyf_params /* dotxaxpyf kernel arguments; operands are void*, cntx is not included */ \
 \
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
              conj_t  conjat, \
              conj_t  conja, \
              conj_t  conjw, \
              conj_t  conjx, \
              dim_t   m, \
              dim_t   b_n, \
-       const ctype*  alpha, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-       const ctype*  w, inc_t incw, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  beta, \
-             ctype*  y, inc_t incy, \
-             ctype*  z, inc_t incz, \
-       const cntx_t* cntx  \
-     );
-
-INSERT_GENTDEF( dotxaxpyf )
-
-// dotxf
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
+       const void*   alpha, \
+       const void*   a, inc_t inca, inc_t lda, \
+       const void*   w, inc_t incw, \
+       const void*   x, inc_t incx, \
+       const void*   beta, \
+             void*   y, inc_t incy, \
+             void*   z, inc_t incz
+
+#define dotxf_params /* dotxf kernel arguments; operands are void*, cntx is not included */ \
 \
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
              conj_t  conjat, \
              conj_t  conjx, \
              dim_t   m, \
              dim_t   b_n, \
-       const ctype*  alpha, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  beta, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-     );
-
-INSERT_GENTDEF( dotxf )
-
-
+       const void*   alpha, \
+       const void*   a, inc_t inca, inc_t lda, \
+       const void*   x, inc_t incx, \
+       const void*   beta, \
+             void*   y, inc_t incy
 
 #endif
 
diff --git a/frame/1f/bli_l1f_ker_prot.h b/frame/1f/bli_l1f_ker_prot.h
index 8f0967736..3abd7ca69 100644
--- a/frame/1f/bli_l1f_ker_prot.h
+++ b/frame/1f/bli_l1f_ker_prot.h
@@ -32,94 +32,28 @@
 
 */
 
+#ifndef BLIS_L1F_KER_PROT_H
+#define BLIS_L1F_KER_PROT_H
 
 //
 // Define template prototypes for level-1f kernels.
 //
 
-#define AXPY2V_KER_PROT( ctype, ch, opname ) \
+#undef  L1FTPROT
+#define L1FTPROT( ctype, ch, funcname, opname ) /* ctype is accepted but not used in the expansion */ \
 \
-void PASTEMAC(ch,opname) \
+void PASTEMAC(ch,funcname) /* prototype name is built from ch and funcname */ \
      ( \
-             conj_t  conjx, \
-             conj_t  conjy, \
-             dim_t   n, \
-       const ctype*  alphax, \
-       const ctype*  alphay, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  y, inc_t incy, \
-             ctype*  z, inc_t incz, \
-       const cntx_t* cntx  \
+       PASTECH(opname,_params), /* pastes to the <opname>_params argument-list macro */ \
+       BLIS_CNTX_PARAM  \
      );
 
-
-#define AXPYF_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             conj_t  conja, \
-             conj_t  conjx, \
-             dim_t   m, \
-             dim_t   b_n, \
-       const ctype*  alpha, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-       const ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-     );
-
-
-#define DOTAXPYV_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             conj_t  conjxt, \
-             conj_t  conjx, \
-             conj_t  conjy, \
-             dim_t   m, \
-       const ctype*  alpha, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  y, inc_t incy, \
-             ctype*  rho, \
-             ctype*  z, inc_t incz, \
-       const cntx_t* cntx  \
-     );
+#define AXPY2V_KER_PROT(    ctype, ch, fn )  L1FTPROT( ctype, ch, fn, axpy2v )  /* no ';' here: L1FTPROT ends in one */
+#define AXPYF_KER_PROT(     ctype, ch, fn )  L1FTPROT( ctype, ch, fn, axpyf )
+#define DOTAXPYV_KER_PROT(  ctype, ch, fn )  L1FTPROT( ctype, ch, fn, dotaxpyv )
+#define DOTXAXPYF_KER_PROT( ctype, ch, fn )  L1FTPROT( ctype, ch, fn, dotxaxpyf )
+#define DOTXF_KER_PROT(     ctype, ch, fn )  L1FTPROT( ctype, ch, fn, dotxf )
 
 
-#define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             conj_t  conjat, \
-             conj_t  conja, \
-             conj_t  conjw, \
-             conj_t  conjx, \
-             dim_t   m, \
-             dim_t   b_n, \
-       const ctype*  alpha, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-       const ctype*  w, inc_t incw, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  beta, \
-             ctype*  y, inc_t incy, \
-             ctype*  z, inc_t incz, \
-       const cntx_t* cntx  \
-     );
-
-
-#define DOTXF_KER_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             conj_t  conjat, \
-             conj_t  conjx, \
-             dim_t   m, \
-             dim_t   b_n, \
-       const ctype*  alpha, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  beta, \
-             ctype*  y, inc_t incy, \
-       const cntx_t* cntx  \
-     );
+#endif
 
diff --git a/frame/1f/bli_l1f_tapi.c b/frame/1f/bli_l1f_tapi.c
index 04d100cb3..b6811dbc2 100644
--- a/frame/1f/bli_l1f_tapi.c
+++ b/frame/1f/bli_l1f_tapi.c
@@ -65,7 +65,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
@@ -109,7 +109,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
@@ -154,7 +154,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
@@ -204,7 +204,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
@@ -254,7 +254,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
-	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	f \
 	( \
diff --git a/frame/1m/bli_l1m.h b/frame/1m/bli_l1m.h
index 925b9b376..fb17ae492 100644
--- a/frame/1m/bli_l1m.h
+++ b/frame/1m/bli_l1m.h
@@ -35,7 +35,7 @@
 #include "bli_l1m_check.h"
 
 // Define kernel function types.
-#include "bli_l1m_ft_ker.h"
+#include "bli_l1m_ker_ft.h"
 
 // Define object function types for variants.
 #include "bli_l1m_oft_var.h"
diff --git a/frame/1m/bli_l1m_ker.h b/frame/1m/bli_l1m_ker.h
deleted file mode 100644
index 970c5f040..000000000
--- a/frame/1m/bli_l1m_ker.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-
-//
-// Define template prototypes for level-1m kernels.
-//
-
-// Note: Instead of defining function prototype macro templates and then
-// instantiating those macros to define the individual function prototypes,
-// we simply alias the official operations' prototypes as defined in
-// bli_l1m_ker_prot.h.
-
-// native packm kernels
-
-#undef  GENTPROT
-#define GENTPROT PACKM_KER_PROT
-
-INSERT_GENTPROT_BASIC0( packm_mrxk_ker_name )
-INSERT_GENTPROT_BASIC0( packm_nrxk_ker_name )
-
-
-// native unpackm kernels
-
-#undef  GENTPROT
-#define GENTPROT UNPACKM_KER_PROT
-
-INSERT_GENTPROT_BASIC0( unpackm_mrxk_ker_name )
-INSERT_GENTPROT_BASIC0( unpackm_nrxk_ker_name )
-
-
-// 1e/1r packm kernels
-
-#undef  GENTPROT
-#define GENTPROT PACKM_KER_PROT
-
-INSERT_GENTPROT_BASIC0( packm_mrxk_1er_ker_name )
-INSERT_GENTPROT_BASIC0( packm_nrxk_1er_ker_name )
-
-
-// packm kernels for diagonal blocks
-
-#undef  GENTPROT
-#define GENTPROT PACKM_DIAG_KER_PROT
-
-INSERT_GENTPROT_BASIC0( packm_mrxmr_diag_ker_name )
-INSERT_GENTPROT_BASIC0( packm_nrxnr_diag_ker_name )
-
-
-// 1e/1r packm kernels for diagonal blocks
-
-#undef  GENTPROT
-#define GENTPROT PACKM_DIAG_KER_PROT
-
-INSERT_GENTPROT_BASIC0( packm_mrxmr_diag_1er_ker_name )
-INSERT_GENTPROT_BASIC0( packm_nrxnr_diag_1er_ker_name )
-
diff --git a/frame/3/bli_l3_ukr.h b/frame/1m/bli_l1m_ker_ft.h
similarity index 71%
rename from frame/3/bli_l3_ukr.h
rename to frame/1m/bli_l1m_ker_ft.h
index e7cc8c71d..6080d1ebe 100644
--- a/frame/3/bli_l3_ukr.h
+++ b/frame/1m/bli_l1m_ker_ft.h
@@ -32,31 +32,28 @@
 
 */
 
-//
-// Define template prototypes for level-3 micro-kernels.
-//
-
-// Note: Instead of defining function prototype macro templates and then
-// instantiating those macros to define the individual function prototypes,
-// we simply alias the official operations' prototypes as defined in
-// bli_l3_ukr_prot.h.
-
-#undef  GENTPROT
-#define GENTPROT GEMM_UKR_PROT
+#ifndef BLIS_L1M_KER_FT_H
+#define BLIS_L1M_KER_FT_H
 
-INSERT_GENTPROT_BASIC0( gemm_ukr_name )
 
+//
+// -- Level-1m kernel function types -------------------------------------------
+//
 
-#undef  GENTPROT
-#define GENTPROT GEMMTRSM_UKR_PROT
-
-INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name )
-INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name )
+#undef  GENTDEF
+#define GENTDEF( opname ) \
+/* Declares <opname>_ker_ft: pointer to a void function taking <opname>_params, then BLIS_CNTX_PARAM. */ \
+typedef void (*PASTECH(opname,_ker_ft)) \
+     ( \
+       PASTECH(opname,_params), \
+       BLIS_CNTX_PARAM  \
+     );
 
+GENTDEF( packm )
+GENTDEF( packm_cxk )
+GENTDEF( unpackm_cxk )
+GENTDEF( packm_cxc_diag )
 
-#undef  GENTPROT
-#define GENTPROT TRSM_UKR_PROT
 
-INSERT_GENTPROT_BASIC0( trsm_l_ukr_name )
-INSERT_GENTPROT_BASIC0( trsm_u_ukr_name )
+#endif
 
diff --git a/frame/1m/bli_l1m_ft_ker.h b/frame/1m/bli_l1m_ker_params.h
similarity index 64%
rename from frame/1m/bli_l1m_ft_ker.h
rename to frame/1m/bli_l1m_ker_params.h
index 729b03d79..b0b383b21 100644
--- a/frame/1m/bli_l1m_ft_ker.h
+++ b/frame/1m/bli_l1m_ker_params.h
@@ -32,8 +32,8 @@
 
 */
 
-#ifndef BLIS_L1M_FT_KER_H
-#define BLIS_L1M_FT_KER_H
+#ifndef BLIS_L1M_KER_PARAMS_H
+#define BLIS_L1M_KER_PARAMS_H
 
 
 //
@@ -44,11 +44,8 @@
 
 // NOTE: This is the function type for the structure-aware "kernel".
 
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
+#define packm_params /* packm kernel arguments; operands are void*, cntx is not included */ \
 \
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
              struc_t strucc, \
              diag_t  diagc, \
              uplo_t  uploc, \
@@ -61,66 +58,44 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
              dim_t   panel_len_max, \
              dim_t   panel_dim_off, \
              dim_t   panel_len_off, \
-       const ctype*  kappa, \
-       const ctype*  c, inc_t incc, inc_t ldc, \
-             ctype*  p,             inc_t ldp, \
+       const void*   kappa, \
+       const void*   c, inc_t incc, inc_t ldc, \
+             void*   p,             inc_t ldp, \
                         inc_t is_p, \
-       const cntx_t* cntx, \
-       const void*   params \
-     );
+       const void*   params
 
-INSERT_GENTDEF( packm )
 
+// packm_cxk (packm microkernel)
 
-// NOTE: the following macros generate packm kernel function type definitions
-// that are "ctyped" and void-typed, for each of the floating-point datatypes.
-
-// packm_ker
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
+#define packm_cxk_params /* packm_cxk kernel arguments; operands are void*, cntx is not included */ \
 \
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
              conj_t  conja, \
              pack_t  schema, \
              dim_t   cdim, \
              dim_t   n, \
              dim_t   n_max, \
-       const ctype*  kappa, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-             ctype*  p,             inc_t ldp, \
-       const cntx_t* cntx  \
-     );
+       const void*   kappa, \
+       const void*   a, inc_t inca, inc_t lda, \
+             void*   p,             inc_t ldp
 
-INSERT_GENTDEF( packm_cxk )
 
-// unpackm_ker
+// unpackm_cxk kernel
 
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
+#define unpackm_cxk_params /* unpackm_cxk kernel arguments; operands are void*, cntx is not included */ \
 \
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
              conj_t  conja, \
              pack_t  schema, \
              dim_t   cdim, \
              dim_t   n, \
-       const ctype*  kappa, \
-       const ctype*  p,             inc_t ldp, \
-             ctype*  a, inc_t inca, inc_t lda, \
-       const cntx_t* cntx  \
-     );
+       const void*   kappa, \
+       const void*   p,             inc_t ldp, \
+             void*   a, inc_t inca, inc_t lda
 
-INSERT_GENTDEF( unpackm_cxk )
 
-// packm_diag_ker
+// packm_cxc_diag kernel
 
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
+#define packm_cxc_diag_params /* packm_cxc_diag kernel arguments; operands are void*, cntx is not included */ \
 \
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
-     ( \
              struc_t struca, \
              diag_t  diaga, \
              uplo_t  uploa, \
@@ -129,13 +104,9 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
              bool    invdiag, \
              dim_t   cdim, \
              dim_t   n_max, \
-       const ctype*  kappa, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-             ctype*  p,             inc_t ldp, \
-       const cntx_t* cntx \
-     );
-
-INSERT_GENTDEF( packm_cxc_diag )
+       const void*   kappa, \
+       const void*   a, inc_t inca, inc_t lda, \
+             void*   p,             inc_t ldp
 
 
 #endif
diff --git a/frame/1m/bli_l1m_ker_prot.h b/frame/1m/bli_l1m_ker_prot.h
index a18eab623..1889370fa 100644
--- a/frame/1m/bli_l1m_ker_prot.h
+++ b/frame/1m/bli_l1m_ker_prot.h
@@ -32,63 +32,26 @@
 
 */
 
+#ifndef BLIS_L1M_KER_PROT_H
+#define BLIS_L1M_KER_PROT_H
 
 //
 // Define template prototypes for level-1m kernels.
 //
 
-// packm kernels
-
-#define PACKM_KER_PROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-             conj_t  conja, \
-             pack_t  schema, \
-             dim_t   cdim, \
-             dim_t   n, \
-             dim_t   n_max, \
-       const ctype*  kappa, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-             ctype*  p,             inc_t ldp, \
-       const cntx_t* cntx  \
-     );
-
-
-// unpackm kernels
-
-#define UNPACKM_KER_PROT( ctype, ch, varname ) \
+#undef  L1MTPROT
+#define L1MTPROT( ctype, ch, funcname, opname ) /* ctype is accepted but not used in the expansion */ \
 \
-void PASTEMAC(ch,varname) \
+void PASTEMAC(ch,funcname) /* prototype name is built from ch and funcname */ \
      ( \
-             conj_t  conja, \
-             pack_t  schema, \
-             dim_t   cdim, \
-             dim_t   n, \
-       const ctype*  kappa, \
-       const ctype*  p,             inc_t ldp, \
-             ctype*  a, inc_t inca, inc_t lda, \
-       const cntx_t* cntx  \
+       PASTECH(opname,_params), /* pastes to the <opname>_params argument-list macro */ \
+       BLIS_CNTX_PARAM  \
      );
 
+#define PACKM_KER_PROT(      ctype, ch, fn )  L1MTPROT( ctype, ch, fn, packm_cxk );
+#define UNPACKM_KER_PROT(    ctype, ch, fn )  L1MTPROT( ctype, ch, fn, unpackm_cxk );
+#define PACKM_DIAG_KER_PROT( ctype, ch, fn )  L1MTPROT( ctype, ch, fn, packm_cxc_diag );
 
-// packm kernels for diagonal blocks
 
-#define PACKM_DIAG_KER_PROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-             struc_t struca, \
-             diag_t  diaga, \
-             uplo_t  uploa, \
-             conj_t  conja, \
-             pack_t  schema, \
-             bool    invdiag, \
-             dim_t   cdim, \
-             dim_t   n_max, \
-       const ctype*  kappa, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-             ctype*  p,             inc_t ldp, \
-       const cntx_t* cntx \
-     );
+#endif
 
diff --git a/frame/1m/bli_l1m_unb_var1.c b/frame/1m/bli_l1m_unb_var1.c
index 9d051c169..79e4d9efd 100644
--- a/frame/1m/bli_l1m_unb_var1.c
+++ b/frame/1m/bli_l1m_unb_var1.c
@@ -79,7 +79,7 @@ void PASTEMAC(ch,opname) \
 	conjx = bli_extract_conj( transx ); \
 \
 	/* Query the kernel needed for this operation. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Handle dense and upper/lower storage cases separately. */ \
 	if ( bli_is_dense( uplox_eff ) ) \
@@ -195,7 +195,7 @@ void PASTEMAC(ch,opname) \
 	conjx = bli_extract_conj( transx ); \
 \
 	/* Query the kernel needed for this operation. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Handle dense and upper/lower storage cases separately. */ \
 	if ( bli_is_dense( uplox_eff ) ) \
@@ -307,7 +307,7 @@ void PASTEMAC(ch,opname) \
 	if ( bli_is_zeros( uplox_eff ) ) return; \
 \
 	/* Query the kernel needed for this operation. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Handle dense and upper/lower storage cases separately. */ \
 	if ( bli_is_dense( uplox_eff ) ) \
@@ -420,7 +420,7 @@ void PASTEMAC(ch,opname) \
 	conjx = bli_extract_conj( transx ); \
 \
 	/* Query the kernel needed for this operation. */ \
-	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Handle dense and upper/lower storage cases separately. */ \
 	if ( bli_is_dense( uplox_eff ) ) \
diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c
index 561988e7f..c0b6869f7 100644
--- a/frame/1m/packm/bli_packm_blk_var1.c
+++ b/frame/1m/packm/bli_packm_blk_var1.c
@@ -121,7 +121,7 @@ void bli_packm_blk_var1
 	func_t* packm_kers = &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ];
 
 	// Query the datatype-specific function pointer from the func_t object.
-	packm_ker_vft packm_ker_cast = bli_func_get_dt( dt_p, packm_kers );
+	packm_ker_ft packm_ker_cast = bli_func_get_dt( dt_p, packm_kers );
 
 	// For mixed-precision gemm, select the proper kernel (only dense panels).
 	if ( dt_c != dt_p )
@@ -200,23 +200,26 @@ void bli_packm_blk_var1
 			// configure-time.
 			if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) )
 			{
-				packm_ker_cast( bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc,
-				                diagc,
-				                uploc,
-				                conjc,
-				                schema,
-				                invdiag,
-				                panel_dim_i,
-				                panel_len_full,
-				                panel_dim_max,
-				                panel_len_max,
-				                panel_dim_off_i,
-				                panel_len_off,
-				                kappa_cast,
-				                c_begin, incc, ldc,
-				                p_begin,       ldp, is_p,
-				                ( cntx_t* )cntx,
-				                params );
+				packm_ker_cast
+				(
+				  bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc,
+				  diagc,
+				  uploc,
+				  conjc,
+				  schema,
+				  invdiag,
+				  panel_dim_i,
+				  panel_len_full,
+				  panel_dim_max,
+				  panel_len_max,
+				  panel_dim_off_i,
+				  panel_len_off,
+				  kappa_cast,
+				  c_begin, incc, ldc,
+				  p_begin,       ldp, is_p,
+				  params,
+				  ( cntx_t* )cntx
+				);
 			}
 
 			p_begin += ps_p*dt_p_size;
@@ -290,24 +293,27 @@ void bli_packm_blk_var1
 			// when packing micropanels of a triangular matrix.
 			if ( bli_is_my_iter_rr( it, tid, nt ) )
 			{
-				packm_ker_cast( strucc,
-				                diagc,
-				                uploc,
-				                conjc,
-				                schema,
-				                invdiag,
-				                panel_dim_i,
-				                panel_len_i,
-				                panel_dim_max,
-				                panel_len_max_i,
-				                panel_dim_off_i,
-				                panel_len_off_i,
-				                kappa_cast,
-				                c_use, incc, ldc,
-				                p_use,       ldp,
-				                       is_p_use,
-				                ( cntx_t* )cntx,
-				                params );
+				packm_ker_cast
+				(
+				  strucc,
+				  diagc,
+				  uploc,
+				  conjc,
+				  schema,
+				  invdiag,
+				  panel_dim_i,
+				  panel_len_i,
+				  panel_dim_max,
+				  panel_len_max_i,
+				  panel_dim_off_i,
+				  panel_len_off_i,
+				  kappa_cast,
+				  c_use, incc, ldc,
+				  p_use,       ldp,
+				         is_p_use,
+				  params,
+				  ( cntx_t* )cntx
+				);
 			}
 
 			// NOTE: This value is usually LESS than ps_p because triangular
diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h
index 870988fec..3273b5bb5 100644
--- a/frame/1m/packm/bli_packm_blk_var1.h
+++ b/frame/1m/packm/bli_packm_blk_var1.h
@@ -39,8 +39,8 @@
 
 typedef struct
 {
-	//                   Type of C          Type of P
-	packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
+	//                  Type of C          Type of P
+	packm_ker_ft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
 } packm_blk_var1_params_t;
 
 //
diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c
index 3f0d48dbf..93801ebe9 100644
--- a/frame/1m/packm/bli_packm_struc_cxk.c
+++ b/frame/1m/packm/bli_packm_struc_cxk.c
@@ -55,6 +55,7 @@ void PASTEMAC(ch,varname) \
        ctype*  c, inc_t incc, inc_t ldc, \
        ctype*  p,             inc_t ldp, \
                   inc_t is_p, \
+       void*   params, \
        cntx_t* cntx  \
      ) \
 { \
@@ -79,8 +80,8 @@ void PASTEMAC(ch,varname) \
 		                                         : BLIS_PACKM_MRXMR_DIAG_1ER_KER; \
 	} \
 \
-	PASTECH2(ch,cxk_kername,_ker_ft) f_cxk = bli_cntx_get_ukr_dt( dt, cxk_ker_id, cntx ); \
-	PASTECH2(ch,cxc_kername,_ker_ft) f_cxc = bli_cntx_get_ukr_dt( dt, cxc_ker_id, cntx ); \
+	PASTECH(cxk_kername,_ker_ft) f_cxk = bli_cntx_get_ukr_dt( dt, cxk_ker_id, cntx ); \
+	PASTECH(cxc_kername,_ker_ft) f_cxc = bli_cntx_get_ukr_dt( dt, cxc_ker_id, cntx ); \
 \
 	/* For general matrices, pack and return early */ \
 	if ( bli_is_general( strucc ) ) \
diff --git a/frame/1m/packm/bli_packm_struc_cxk.h b/frame/1m/packm/bli_packm_struc_cxk.h
index f0293330b..58bef1d76 100644
--- a/frame/1m/packm/bli_packm_struc_cxk.h
+++ b/frame/1m/packm/bli_packm_struc_cxk.h
@@ -53,6 +53,7 @@ void PASTEMAC(ch,varname) \
        ctype*  c, inc_t incc, inc_t ldc, \
        ctype*  p,             inc_t ldp, \
                   inc_t is_p, \
+       void*   params, \
        cntx_t* cntx  \
      );
 
diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.c b/frame/1m/packm/bli_packm_struc_cxk_md.c
index b83a0271f..2b8a07bd5 100644
--- a/frame/1m/packm/bli_packm_struc_cxk_md.c
+++ b/frame/1m/packm/bli_packm_struc_cxk_md.c
@@ -41,24 +41,24 @@
 \
 void PASTEMAC2(chc,chp,varname) \
      ( \
-       struc_t           strucc, \
-       diag_t            diagc, \
-       uplo_t            uploc, \
-       conj_t            conjc, \
-       pack_t            schema, \
-       bool              invdiag, \
-       dim_t             panel_dim, \
-       dim_t             panel_len, \
-       dim_t             panel_dim_max, \
-       dim_t             panel_len_max, \
-       dim_t             panel_dim_off, \
-       dim_t             panel_len_off, \
-       ctype_p* restrict kappa, \
-       ctype_c* restrict c, inc_t incc, inc_t ldc, \
-       ctype_p* restrict p,             inc_t ldp, \
-                            inc_t is_p, \
-       cntx_t*           cntx, \
-       void*             params \
+       struc_t  strucc, \
+       diag_t   diagc, \
+       uplo_t   uploc, \
+       conj_t   conjc, \
+       pack_t   schema, \
+       bool     invdiag, \
+       dim_t    panel_dim, \
+       dim_t    panel_len, \
+       dim_t    panel_dim_max, \
+       dim_t    panel_len_max, \
+       dim_t    panel_dim_off, \
+       dim_t    panel_len_off, \
+       ctype_p* kappa, \
+       ctype_c* c, inc_t incc, inc_t ldc, \
+       ctype_p* p,             inc_t ldp, \
+                   inc_t is_p, \
+       void*    params, \
+       cntx_t*  cntx  \
      ) \
 { \
 	if ( bli_is_nat_packed( schema ) ) \
@@ -288,12 +288,12 @@ INSERT_GENTFUNC2_MIXDP0( packm_struc_cxk_md )
 \
 void PASTEMAC2(cha,chp,opname) \
      ( \
-       conj_t            conja, \
-       dim_t             m, \
-       dim_t             n, \
-       ctype_p* restrict kappa, \
-       ctype_a* restrict a, inc_t inca, inc_t lda, \
-       ctype_p* restrict p,             inc_t ldp  \
+       conj_t   conja, \
+       dim_t    m, \
+       dim_t    n, \
+       ctype_p* kappa, \
+       ctype_a* a, inc_t inca, inc_t lda, \
+       ctype_p* p,             inc_t ldp  \
      ) \
 { \
 	const inc_t                    inca2    = 2 * inca; \
diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.h b/frame/1m/packm/bli_packm_struc_cxk_md.h
index f493838b3..8c3fa0335 100644
--- a/frame/1m/packm/bli_packm_struc_cxk_md.h
+++ b/frame/1m/packm/bli_packm_struc_cxk_md.h
@@ -37,24 +37,24 @@
 \
 void PASTEMAC2(chc,chp,varname) \
      ( \
-       struc_t           strucc, \
-       diag_t            diagc, \
-       uplo_t            uploc, \
-       conj_t            conjc, \
-       pack_t            schema, \
-       bool              invdiag, \
-       dim_t             panel_dim, \
-       dim_t             panel_len, \
-       dim_t             panel_dim_max, \
-       dim_t             panel_len_max, \
-       dim_t             panel_dim_off, \
-       dim_t             panel_len_off, \
-       ctype_p* restrict kappa, \
-       ctype_c* restrict c, inc_t incc, inc_t ldc, \
-       ctype_p* restrict p,             inc_t ldp, \
-                            inc_t is_p, \
-       cntx_t*           cntx, \
-       void*             params \
+       struc_t  strucc, \
+       diag_t   diagc, \
+       uplo_t   uploc, \
+       conj_t   conjc, \
+       pack_t   schema, \
+       bool     invdiag, \
+       dim_t    panel_dim, \
+       dim_t    panel_len, \
+       dim_t    panel_dim_max, \
+       dim_t    panel_len_max, \
+       dim_t    panel_dim_off, \
+       dim_t    panel_len_off, \
+       ctype_p* kappa, \
+       ctype_c* c, inc_t incc, inc_t ldc, \
+       ctype_p* p,             inc_t ldp, \
+                   inc_t is_p, \
+       void*    params, \
+       cntx_t*  cntx  \
      );
 
 INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
@@ -66,12 +66,12 @@ INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )
 \
 void PASTEMAC2(cha,chp,opname) \
      ( \
-       conj_t            conja, \
-       dim_t             m, \
-       dim_t             n, \
-       ctype_p* restrict kappa, \
-       ctype_a* restrict a, inc_t inca, inc_t lda, \
-       ctype_p* restrict p,             inc_t ldp  \
+       conj_t   conja, \
+       dim_t    m, \
+       dim_t    n, \
+       ctype_p* kappa, \
+       ctype_a* a, inc_t inca, inc_t lda, \
+       ctype_p* p,             inc_t ldp  \
      );
 
 INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c
index f9f7f511c..8659ad5e2 100644
--- a/frame/1m/unpackm/bli_unpackm_blk_var1.c
+++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c
@@ -220,7 +220,7 @@ void PASTEMAC(ch,varname) \
 \
 	/* Query the context for the unpackm kernel corresponding to the current
 	   panel dimension, or kernel id. */ \
-	PASTECH2(ch,unpackm_cxk,_ker_ft) f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
+	PASTECH(unpackm_cxk,_ker_ft) f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
 \
 	/* Compute the total number of iterations we'll need. */ \
 	num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
diff --git a/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c b/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c
index 8f0f31479..397189971 100644
--- a/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c
+++ b/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c
@@ -83,11 +83,9 @@ void PASTEMAC(ch,varname) \
 \
 	/* If alpha == 0, then we are done. */ \
 	if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \
-\
-	/*PASTECH(ch,axpyf_ker_ft) kfp_af;*/ \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	/*kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );*/ \
+	/*axpyf_ker_ft kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );*/ \
 	/*b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );*/ \
 	b_fuse = fusefac; \
 \
diff --git a/frame/2/gemv/bli_gemv_unb_var1.c b/frame/2/gemv/bli_gemv_unb_var1.c
index 840b96901..606c867ba 100644
--- a/frame/2/gemv/bli_gemv_unb_var1.c
+++ b/frame/2/gemv/bli_gemv_unb_var1.c
@@ -66,11 +66,9 @@ void PASTEMAC(ch,varname) \
 	                              &n_iter, &n_elem, &rs_at, &cs_at ); \
 \
 	conja = bli_extract_conj( transa ); \
-\
-	PASTECH(ch,dotxv_ker_ft) kfp_dv; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
+	dotxv_ker_ft kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
 \
 	for ( i = 0; i < n_iter; ++i ) \
 	{ \
diff --git a/frame/2/gemv/bli_gemv_unb_var2.c b/frame/2/gemv/bli_gemv_unb_var2.c
index 7fc4fcfe4..cfee006f9 100644
--- a/frame/2/gemv/bli_gemv_unb_var2.c
+++ b/frame/2/gemv/bli_gemv_unb_var2.c
@@ -96,11 +96,9 @@ void PASTEMAC(ch,varname) \
 		  NULL  \
 		); \
 	} \
-\
-	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < n_iter; ++i ) \
 	{ \
diff --git a/frame/2/gemv/bli_gemv_unf_var1.c b/frame/2/gemv/bli_gemv_unf_var1.c
index 0dceed4cf..0f7eeaf1d 100644
--- a/frame/2/gemv/bli_gemv_unf_var1.c
+++ b/frame/2/gemv/bli_gemv_unf_var1.c
@@ -67,11 +67,9 @@ void PASTEMAC(ch,varname) \
 	                              &n_iter, &n_elem, &rs_at, &cs_at ); \
 \
 	conja = bli_extract_conj( transa ); \
-\
-	PASTECH(ch,dotxf_ker_ft) kfp_df; \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \
+	dotxf_ker_ft kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \
 	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
 \
 	for ( i = 0; i < n_iter; i += f ) \
diff --git a/frame/2/gemv/bli_gemv_unf_var2.c b/frame/2/gemv/bli_gemv_unf_var2.c
index 4c43657ad..c16511da2 100644
--- a/frame/2/gemv/bli_gemv_unf_var2.c
+++ b/frame/2/gemv/bli_gemv_unf_var2.c
@@ -96,11 +96,9 @@ void PASTEMAC(ch,varname) \
 		  NULL  \
 		); \
 	} \
-\
-	PASTECH(ch,axpyf_ker_ft) kfp_af; \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \
+	axpyf_ker_ft kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \
 	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
 \
 	for ( i = 0; i < n_iter; i += f ) \
diff --git a/frame/2/ger/bli_ger_unb_var1.c b/frame/2/ger/bli_ger_unb_var1.c
index d8ddd1247..ceab85012 100644
--- a/frame/2/ger/bli_ger_unb_var1.c
+++ b/frame/2/ger/bli_ger_unb_var1.c
@@ -57,11 +57,9 @@ void PASTEMAC(ch,varname) \
 	ctype*  y1; \
 	ctype   alpha_chi1; \
 	dim_t   i; \
-\
-	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/ger/bli_ger_unb_var2.c b/frame/2/ger/bli_ger_unb_var2.c
index 9c49e336b..5bed4116b 100644
--- a/frame/2/ger/bli_ger_unb_var2.c
+++ b/frame/2/ger/bli_ger_unb_var2.c
@@ -57,11 +57,9 @@ void PASTEMAC(ch,varname) \
 	ctype*  psi1; \
 	ctype   alpha_psi1; \
 	dim_t   j; \
-\
-	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( j = 0; j < n; ++j ) \
 	{ \
diff --git a/frame/2/hemv/bli_hemv_unb_var1.c b/frame/2/hemv/bli_hemv_unb_var1.c
index 71c27a326..f592bdf6c 100644
--- a/frame/2/hemv/bli_hemv_unb_var1.c
+++ b/frame/2/hemv/bli_hemv_unb_var1.c
@@ -118,12 +118,10 @@ void PASTEMAC(ch,varname) \
 		); \
 	} \
 \
-	PASTECH(ch,axpyv_ker_ft) kfp_av; \
-	PASTECH(ch,dotxv_ker_ft) kfp_dv; \
 \
 	/* Query the context for the kernel function pointers. */ \
-	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
-	kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
+	axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	dotxv_ker_ft kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/hemv/bli_hemv_unb_var2.c b/frame/2/hemv/bli_hemv_unb_var2.c
index 3753c8d3b..9465659fa 100644
--- a/frame/2/hemv/bli_hemv_unb_var2.c
+++ b/frame/2/hemv/bli_hemv_unb_var2.c
@@ -119,11 +119,9 @@ void PASTEMAC(ch,varname) \
 		  NULL  \
 		); \
 	} \
-\
-	PASTECH(ch,dotxv_ker_ft) kfp_dv; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
+	dotxv_ker_ft kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/hemv/bli_hemv_unb_var3.c b/frame/2/hemv/bli_hemv_unb_var3.c
index d592251d5..a93b78b81 100644
--- a/frame/2/hemv/bli_hemv_unb_var3.c
+++ b/frame/2/hemv/bli_hemv_unb_var3.c
@@ -118,12 +118,10 @@ void PASTEMAC(ch,varname) \
 		); \
 	} \
 \
-	PASTECH(ch,axpyv_ker_ft) kfp_av; \
-	PASTECH(ch,dotxv_ker_ft) kfp_dv; \
 \
 	/* Query the context for the kernel function pointers. */ \
-	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
-	kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
+	axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	dotxv_ker_ft kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/hemv/bli_hemv_unb_var4.c b/frame/2/hemv/bli_hemv_unb_var4.c
index 10cf953b6..810900dc2 100644
--- a/frame/2/hemv/bli_hemv_unb_var4.c
+++ b/frame/2/hemv/bli_hemv_unb_var4.c
@@ -118,11 +118,9 @@ void PASTEMAC(ch,varname) \
 		  NULL  \
 		); \
 	} \
-\
-	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointers. */ \
-	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/hemv/bli_hemv_unf_var1.c b/frame/2/hemv/bli_hemv_unf_var1.c
index a449909a5..65ddeb3e6 100644
--- a/frame/2/hemv/bli_hemv_unf_var1.c
+++ b/frame/2/hemv/bli_hemv_unf_var1.c
@@ -126,11 +126,9 @@ void PASTEMAC(ch,varname) \
 		  NULL  \
 		); \
 	} \
-\
-	PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	kfp_xf = bli_cntx_get_ukr_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
+	dotxaxpyf_ker_ft kfp_xf = bli_cntx_get_ukr_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
 	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
 \
 	for ( i = 0; i < m; i += f ) \
diff --git a/frame/2/hemv/bli_hemv_unf_var1a.c b/frame/2/hemv/bli_hemv_unf_var1a.c
index d0af57393..5377f20a2 100644
--- a/frame/2/hemv/bli_hemv_unf_var1a.c
+++ b/frame/2/hemv/bli_hemv_unf_var1a.c
@@ -117,11 +117,9 @@ void PASTEMAC(ch,varname) \
 		  NULL  \
 		); \
 	} \
-\
-	PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_vf = bli_cntx_get_ukr_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
+	dotaxpyv_ker_ft kfp_vf = bli_cntx_get_ukr_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/hemv/bli_hemv_unf_var3.c b/frame/2/hemv/bli_hemv_unf_var3.c
index baaff098d..97a7a5a66 100644
--- a/frame/2/hemv/bli_hemv_unf_var3.c
+++ b/frame/2/hemv/bli_hemv_unf_var3.c
@@ -126,11 +126,9 @@ void PASTEMAC(ch,varname) \
 		  NULL  \
 		); \
 	} \
-\
-	PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	kfp_xf = bli_cntx_get_ukr_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
+	dotxaxpyf_ker_ft kfp_xf = bli_cntx_get_ukr_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
 	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
 \
 	for ( i = 0; i < m; i += f ) \
diff --git a/frame/2/hemv/bli_hemv_unf_var3a.c b/frame/2/hemv/bli_hemv_unf_var3a.c
index 55c1929ff..c7aa38550 100644
--- a/frame/2/hemv/bli_hemv_unf_var3a.c
+++ b/frame/2/hemv/bli_hemv_unf_var3a.c
@@ -117,11 +117,9 @@ void PASTEMAC(ch,varname) \
 		  NULL  \
 		); \
 	} \
-\
-	PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_vf = bli_cntx_get_ukr_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
+	dotaxpyv_ker_ft kfp_vf = bli_cntx_get_ukr_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/her/bli_her_unb_var1.c b/frame/2/her/bli_her_unb_var1.c
index 8cd6bd397..b1d91269e 100644
--- a/frame/2/her/bli_her_unb_var1.c
+++ b/frame/2/her/bli_her_unb_var1.c
@@ -99,11 +99,9 @@ void PASTEMAC(ch,varname) \
 	   conjugation for the scalar and vector subproblems. */ \
 	conj0 = conjx; \
 	conj1 = bli_apply_conj( conjh, conjx ); \
-\
-	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/her/bli_her_unb_var2.c b/frame/2/her/bli_her_unb_var2.c
index f68798dce..adf86635a 100644
--- a/frame/2/her/bli_her_unb_var2.c
+++ b/frame/2/her/bli_her_unb_var2.c
@@ -99,11 +99,9 @@ void PASTEMAC(ch,varname) \
 	   conjugation for the scalar and vector subproblems. */ \
 	conj0 = bli_apply_conj( conjh, conjx ); \
 	conj1 = conjx; \
-\
-	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/her2/bli_her2_unb_var1.c b/frame/2/her2/bli_her2_unb_var1.c
index b5c182639..5ae70e321 100644
--- a/frame/2/her2/bli_her2_unb_var1.c
+++ b/frame/2/her2/bli_her2_unb_var1.c
@@ -102,11 +102,9 @@ void PASTEMAC(ch,varname) \
 	   the effective conjugation for the vector subproblems. */ \
 	conj0 = bli_apply_conj( conjh, conjy ); \
 	conj1 = bli_apply_conj( conjh, conjx ); \
-\
-	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/her2/bli_her2_unb_var2.c b/frame/2/her2/bli_her2_unb_var2.c
index 602e922a8..14135e894 100644
--- a/frame/2/her2/bli_her2_unb_var2.c
+++ b/frame/2/her2/bli_her2_unb_var2.c
@@ -109,11 +109,9 @@ void PASTEMAC(ch,varname) \
 	conj0       = conjx; \
 	conj1       = bli_apply_conj( conjh, conjx ); \
 	conjh_conjy = bli_apply_conj( conjh, conjy ); \
-\
-	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/her2/bli_her2_unb_var3.c b/frame/2/her2/bli_her2_unb_var3.c
index 1d5872d5d..ffcaf434f 100644
--- a/frame/2/her2/bli_her2_unb_var3.c
+++ b/frame/2/her2/bli_her2_unb_var3.c
@@ -109,11 +109,9 @@ void PASTEMAC(ch,varname) \
 	conj0       = bli_apply_conj( conjh, conjy ); \
 	conj1       = conjy; \
 	conjh_conjx = bli_apply_conj( conjh, conjx ); \
-\
-	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/her2/bli_her2_unb_var4.c b/frame/2/her2/bli_her2_unb_var4.c
index 922fe7db7..8b8be1c57 100644
--- a/frame/2/her2/bli_her2_unb_var4.c
+++ b/frame/2/her2/bli_her2_unb_var4.c
@@ -110,11 +110,9 @@ void PASTEMAC(ch,varname) \
 	conj1       = conjy; \
 	conjh_conjx = bli_apply_conj( conjh, conjx ); \
 	conjh_conjy = bli_apply_conj( conjh, conjy ); \
-\
-	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/her2/bli_her2_unf_var1.c b/frame/2/her2/bli_her2_unf_var1.c
index 3824880c6..c2250b87a 100644
--- a/frame/2/her2/bli_her2_unf_var1.c
+++ b/frame/2/her2/bli_her2_unf_var1.c
@@ -102,11 +102,9 @@ void PASTEMAC(ch,varname) \
 	   the effective conjugation for the vector subproblems. */ \
 	conj0 = bli_apply_conj( conjh, conjy ); \
 	conj1 = bli_apply_conj( conjh, conjx ); \
-\
-	PASTECH(ch,axpy2v_ker_ft) kfp_2v; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_2v = bli_cntx_get_ukr_dt( dt, BLIS_AXPY2V_KER, cntx ); \
+	axpy2v_ker_ft kfp_2v = bli_cntx_get_ukr_dt( dt, BLIS_AXPY2V_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/her2/bli_her2_unf_var4.c b/frame/2/her2/bli_her2_unf_var4.c
index 6b2b0e9ac..7ad36e951 100644
--- a/frame/2/her2/bli_her2_unf_var4.c
+++ b/frame/2/her2/bli_her2_unf_var4.c
@@ -110,11 +110,9 @@ void PASTEMAC(ch,varname) \
 	conj1       = conjy; \
 	conjh_conjx = bli_apply_conj( conjh, conjx ); \
 	conjh_conjy = bli_apply_conj( conjh, conjy ); \
-\
-	PASTECH(ch,axpy2v_ker_ft) kfp_2v; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_2v = bli_cntx_get_ukr_dt( dt, BLIS_AXPY2V_KER, cntx ); \
+	axpy2v_ker_ft kfp_2v = bli_cntx_get_ukr_dt( dt, BLIS_AXPY2V_KER, cntx ); \
 \
 	for ( i = 0; i < m; ++i ) \
 	{ \
diff --git a/frame/2/trmv/bli_trmv_unb_var1.c b/frame/2/trmv/bli_trmv_unb_var1.c
index 367a34e6c..9ed74b037 100644
--- a/frame/2/trmv/bli_trmv_unb_var1.c
+++ b/frame/2/trmv/bli_trmv_unb_var1.c
@@ -79,11 +79,9 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	conja = bli_extract_conj( transa ); \
-\
-	PASTECH(ch,dotv_ker_ft) kfp_dv; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \
+	dotv_ker_ft kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
 	if      ( bli_is_upper( uploa_trans ) ) \
diff --git a/frame/2/trmv/bli_trmv_unb_var2.c b/frame/2/trmv/bli_trmv_unb_var2.c
index fa21776b3..5674c4d74 100644
--- a/frame/2/trmv/bli_trmv_unb_var2.c
+++ b/frame/2/trmv/bli_trmv_unb_var2.c
@@ -79,11 +79,9 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	conja = bli_extract_conj( transa ); \
-\
-	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
 	if      ( bli_is_upper( uploa_trans ) ) \
diff --git a/frame/2/trmv/bli_trmv_unf_var1.c b/frame/2/trmv/bli_trmv_unf_var1.c
index 9e576fc77..1a8199cae 100644
--- a/frame/2/trmv/bli_trmv_unf_var1.c
+++ b/frame/2/trmv/bli_trmv_unf_var1.c
@@ -87,11 +87,9 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	conja = bli_extract_conj( transa ); \
-\
-	PASTECH(ch,dotxf_ker_ft) kfp_df; \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \
+	dotxf_ker_ft kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \
 	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
diff --git a/frame/2/trmv/bli_trmv_unf_var2.c b/frame/2/trmv/bli_trmv_unf_var2.c
index 052595935..5d9d37aa8 100644
--- a/frame/2/trmv/bli_trmv_unf_var2.c
+++ b/frame/2/trmv/bli_trmv_unf_var2.c
@@ -86,11 +86,9 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	conja = bli_extract_conj( transa ); \
-\
-	PASTECH(ch,axpyf_ker_ft) kfp_af; \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \
+	axpyf_ker_ft kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \
 	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
diff --git a/frame/2/trsv/bli_trsv_unb_var1.c b/frame/2/trsv/bli_trsv_unb_var1.c
index 2f24b10a8..e50162956 100644
--- a/frame/2/trsv/bli_trsv_unb_var1.c
+++ b/frame/2/trsv/bli_trsv_unb_var1.c
@@ -90,11 +90,9 @@ void PASTEMAC(ch,varname) \
 	  cntx, \
 	  NULL  \
 	); \
-\
-	PASTECH(ch,dotv_ker_ft) kfp_tv; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_tv = bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \
+	dotv_ker_ft kfp_tv = bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
 	if      ( bli_is_upper( uploa_trans ) ) \
diff --git a/frame/2/trsv/bli_trsv_unb_var2.c b/frame/2/trsv/bli_trsv_unb_var2.c
index 1a8e81634..661489f1a 100644
--- a/frame/2/trsv/bli_trsv_unb_var2.c
+++ b/frame/2/trsv/bli_trsv_unb_var2.c
@@ -90,11 +90,9 @@ void PASTEMAC(ch,varname) \
 	  cntx, \
 	  NULL  \
 	); \
-\
-	PASTECH(ch,axpyv_ker_ft) kfp_av; \
 \
 	/* Query the context for the kernel function pointer. */ \
-	kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+	axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
 	if      ( bli_is_upper( uploa_trans ) ) \
diff --git a/frame/2/trsv/bli_trsv_unf_var1.c b/frame/2/trsv/bli_trsv_unf_var1.c
index 824f26d15..88e9101ce 100644
--- a/frame/2/trsv/bli_trsv_unf_var1.c
+++ b/frame/2/trsv/bli_trsv_unf_var1.c
@@ -99,11 +99,9 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	conja = bli_extract_conj( transa ); \
-\
-	PASTECH(ch,dotxf_ker_ft) kfp_df; \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \
+	dotxf_ker_ft kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \
 	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
diff --git a/frame/2/trsv/bli_trsv_unf_var2.c b/frame/2/trsv/bli_trsv_unf_var2.c
index bd1f8e3b0..edde2bf5b 100644
--- a/frame/2/trsv/bli_trsv_unf_var2.c
+++ b/frame/2/trsv/bli_trsv_unf_var2.c
@@ -98,11 +98,9 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	conja = bli_extract_conj( transa ); \
-\
-	PASTECH(ch,axpyf_ker_ft) kfp_af; \
 \
 	/* Query the context for the kernel function pointer and fusing factor. */ \
-	kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \
+	axpyf_ker_ft kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \
 	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h
index a55091539..3e50275b3 100644
--- a/frame/3/bli_l3.h
+++ b/frame/3/bli_l3.h
@@ -43,8 +43,7 @@
 #include "bli_l3_packab.h"
 
 // Define function types.
-//#include "bli_l3_ft_ex.h"
-#include "bli_l3_ft_ukr.h"
+#include "bli_l3_ukr_ft.h"
 #include "bli_l3_oft.h"
 #include "bli_l3_oft_var.h"
 
@@ -63,7 +62,7 @@
 
 // Define function types for small/unpacked handlers/kernels.
 #include "bli_l3_sup_oft.h"
-#include "bli_l3_sup_ft_ker.h"
+#include "bli_l3_sup_ker_ft.h"
 
 // Define static edge case logic for use in small/unpacked kernels.
 //#include "bli_l3_sup_edge.h"
diff --git a/frame/3/bli_l3_ft_ukr.h b/frame/3/bli_l3_ft_ukr.h
deleted file mode 100644
index 2a42d859b..000000000
--- a/frame/3/bli_l3_ft_ukr.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_L3_FT_UKR_H
-#define BLIS_L3_FT_UKR_H
-
-
-//
-// -- Level-3 micro-kernel function types --------------------------------------
-//
-
-// gemm
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
-\
-typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
-     ( \
-             dim_t      m, \
-             dim_t      n, \
-             dim_t      k, \
-       const ctype*     alpha, \
-       const ctype*     a, \
-       const ctype*     b, \
-       const ctype*     beta, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
-       const cntx_t*    cntx  \
-     );
-
-INSERT_GENTDEF( gemm )
-
-
-// gemmtrsm_[lu]
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
-\
-typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
-     ( \
-             dim_t      m, \
-             dim_t      n, \
-             dim_t      k, \
-       const ctype*     alpha, \
-       const ctype*     a1x, \
-       const ctype*     a11, \
-       const ctype*     bx1, \
-             ctype*     b11, \
-             ctype*     c11, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
-       const cntx_t*    cntx  \
-     );
-
-INSERT_GENTDEF( gemmtrsm )
-
-
-// trsm_[lu]
-
-#undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
-\
-typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
-     ( \
-       const ctype*     a, \
-             ctype*     b, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
-       const cntx_t*    cntx  \
-     );
-
-INSERT_GENTDEF( trsm )
-
-
-#endif
-
diff --git a/frame/3/bli_l3_sup_ft_ker.h b/frame/3/bli_l3_sup_ker_ft.h
similarity index 74%
rename from frame/3/bli_l3_sup_ft_ker.h
rename to frame/3/bli_l3_sup_ker_ft.h
index e78edf800..1fbb4f639 100644
--- a/frame/3/bli_l3_sup_ft_ker.h
+++ b/frame/3/bli_l3_sup_ker_ft.h
@@ -32,36 +32,25 @@
 
 */
 
-#ifndef BLIS_L3_SUP_FT_KER_H
-#define BLIS_L3_SUP_FT_KER_H
+#ifndef BLIS_L3_SUP_KER_FT_H
+#define BLIS_L3_SUP_KER_FT_H
 
 
 //
 // -- Level-3 small/unpacked kernel function types -----------------------------
 //
 
-// gemmsup
-
 #undef  GENTDEF
-#define GENTDEF( ctype, ch, opname, tsuf ) \
+#define GENTDEF( opname ) \
 \
-typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
+typedef void (*PASTECH(opname,_ker_ft)) \
      ( \
-             conj_t     conja, \
-             conj_t     conjb, \
-             dim_t      m, \
-             dim_t      n, \
-             dim_t      k, \
-       const ctype*     alpha, \
-       const ctype*     a, inc_t rs_a, inc_t cs_a, \
-       const ctype*     b, inc_t rs_b, inc_t cs_b, \
-       const ctype*     beta, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
-       const cntx_t*    cntx  \
+       PASTECH(opname,_params), \
+       BLIS_AUXINFO_PARAM, \
+       BLIS_CNTX_PARAM  \
      );
 
-INSERT_GENTDEF( gemmsup )
+GENTDEF( gemmsup )
 
 
 #endif
diff --git a/frame/3/bli_l3_sup_ker_params.h b/frame/3/bli_l3_sup_ker_params.h
new file mode 100644
index 000000000..043b0a280
--- /dev/null
+++ b/frame/3/bli_l3_sup_ker_params.h
@@ -0,0 +1,54 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_L3_SUP_KER_PARAMS_H
+#define BLIS_L3_SUP_KER_PARAMS_H
+
+
+#define gemmsup_params \
+/* Parameter list pasted in via PASTECH(opname,_params) for gemmsup; operands are void* rather than ctype*. */ \
+             conj_t conja, \
+             conj_t conjb, \
+             dim_t  m, \
+             dim_t  n, \
+             dim_t  k, \
+       const void*  alpha, \
+       const void*  a, inc_t rs_a, inc_t cs_a, \
+       const void*  b, inc_t rs_b, inc_t cs_b, \
+       const void*  beta, \
+             void*  c, inc_t rs_c, inc_t cs_c /* no trailing comma: users append ", BLIS_AUXINFO_PARAM, BLIS_CNTX_PARAM" */
+
+
+#endif
+
diff --git a/frame/3/bli_l3_sup_ker_prot.h b/frame/3/bli_l3_sup_ker_prot.h
index efb216c22..c4c0e824c 100644
--- a/frame/3/bli_l3_sup_ker_prot.h
+++ b/frame/3/bli_l3_sup_ker_prot.h
@@ -32,25 +32,25 @@
 
 */
 
+#ifndef BLIS_L3_SUP_KER_PROT_H
+#define BLIS_L3_SUP_KER_PROT_H
+
 //
 // Define template prototypes for level-3 kernels on small/unpacked matrices.
 //
 
-#define GEMMSUP_KER_PROT( ctype, ch, opname ) \
+#undef  SUPTPROT
+#define SUPTPROT( ctype, ch, funcname, opname ) \
 \
-void PASTEMAC(ch,opname) \
+void PASTEMAC(ch,funcname) \
      ( \
-             conj_t     conja, \
-             conj_t     conjb, \
-             dim_t      m, \
-             dim_t      n, \
-             dim_t      k, \
-       const ctype*     alpha, \
-       const ctype*     a, inc_t rs_a, inc_t cs_a, \
-       const ctype*     b, inc_t rs_b, inc_t cs_b, \
-       const ctype*     beta, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
-       const cntx_t*    cntx  \
+       PASTECH(opname,_params), \
+       BLIS_AUXINFO_PARAM, \
+       BLIS_CNTX_PARAM  \
      );
 
+#define GEMMSUP_KER_PROT( ctype, ch, fn )  SUPTPROT( ctype, ch, fn, gemmsup ) /* SUPTPROT already supplies the terminating ';' */
+
+
+#endif
+
diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c
index 67b33f407..fa31468cb 100644
--- a/frame/3/bli_l3_sup_packm_var.c
+++ b/frame/3/bli_l3_sup_packm_var.c
@@ -129,7 +129,7 @@ void PASTEMAC(ch,varname) \
 \
 	/* Query the context for the unpackm kernel corresponding to the current
 	   panel dimension, or kernel id. */ \
-	PASTECH2(ch,packm_cxk,_ker_ft) f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
+	PASTECH(packm_cxk,_ker_ft) f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
 \
 	/* Compute the total number of iterations we'll need. */ \
 	n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c
index 5d7ea345c..0fc4a8e82 100644
--- a/frame/3/bli_l3_sup_var1n2m.c
+++ b/frame/3/bli_l3_sup_var1n2m.c
@@ -215,13 +215,14 @@ void bli_gemmsup_ref_var1n
 	const inc_t jrstep_c = rs_c * MR * dt_size;
 
 	//const inc_t jrstep_a = rs_a * MR;
+	//( void )jrstep_a;
 
 	//const inc_t irstep_c = cs_c * NR;
 	//const inc_t irstep_b = cs_b * NR;
 
 	// Query the context for the sup microkernel address and cast it to its
 	// function pointer type.
-	gemmsup_ker_vft gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx );
+	gemmsup_ker_ft gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx );
 
 	const char* a_00       = buf_a;
 	const char* b_00       = buf_b;
@@ -623,14 +624,14 @@ void bli_gemmsup_ref_var2m
 
 	// Query the context for the sup microkernel address and cast it to its
 	// function pointer type.
-	gemmsup_ker_vft gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx );
+	gemmsup_ker_ft gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx );
 
 	const char* a_00       = buf_a;
 	const char* b_00       = buf_b;
 	      char* c_00       = buf_c;
 	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
 
-	auxinfo_t       aux;
+	auxinfo_t aux;
 
 	// Determine whether we are using more than one thread.
 	const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 );
diff --git a/frame/3/bli_l3_ukr_fpa.c b/frame/3/bli_l3_ukr_fpa.c
index b174d5739..701ed54aa 100644
--- a/frame/3/bli_l3_ukr_fpa.c
+++ b/frame/3/bli_l3_ukr_fpa.c
@@ -41,10 +41,10 @@
 #undef  GENFRONT
 #define GENFRONT( tname, opname ) \
 \
-GENARRAY_FPA( PASTECH2(tname,_ukr,_vft), \
+GENARRAY_FPA( PASTECH(tname,_ukr_ft), \
               opname ); \
 \
-PASTECH2(tname,_ukr,_vft) \
+PASTECH(tname,_ukr_ft) \
 PASTEMAC(opname,_qfp)( num_t dt ) \
 { \
 	return PASTECH(opname,_fpa)[ dt ]; \
diff --git a/frame/3/bli_l3_ukr_fpa.h b/frame/3/bli_l3_ukr_fpa.h
index 65bdc6964..6c17c37a6 100644
--- a/frame/3/bli_l3_ukr_fpa.h
+++ b/frame/3/bli_l3_ukr_fpa.h
@@ -39,7 +39,7 @@
 #undef  GENPROT
 #define GENPROT( tname, opname ) \
 \
-PASTECH2(tname,_ukr,_vft) \
+PASTECH(tname,_ukr_ft) \
 PASTEMAC(opname,_qfp)( num_t dt );
 
 GENPROT( gemm,     gemm_ukernel )
diff --git a/frame/1f/bli_l1f_ker.h b/frame/3/bli_l3_ukr_ft.h
similarity index 68%
rename from frame/1f/bli_l1f_ker.h
rename to frame/3/bli_l3_ukr_ft.h
index efedc5cc8..ea008a470 100644
--- a/frame/1f/bli_l1f_ker.h
+++ b/frame/3/bli_l3_ukr_ft.h
@@ -32,42 +32,28 @@
 
 */
 
+#ifndef BLIS_L3_UKR_FT_H
+#define BLIS_L3_UKR_FT_H
+
 
 //
-// Define template prototypes for level-1f kernels.
+// -- Level-3 micro-kernel function types --------------------------------------
 //
 
-// Note: Instead of defining function prototype macro templates and then
-// instantiating those macros to define the individual function prototypes,
-// we simply alias the official operations' prototypes as defined in
-// bli_l1f_ker_prot.h.
-
-#undef  GENTPROT
-#define GENTPROT AXPY2V_KER_PROT
-
-INSERT_GENTPROT_BASIC0( axpy2v_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT AXPYF_KER_PROT
-
-INSERT_GENTPROT_BASIC0( axpyf_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT DOTAXPYV_KER_PROT
-
-INSERT_GENTPROT_BASIC0( dotaxpyv_ker_name )
-
-
-#undef  GENTPROT
-#define GENTPROT DOTXAXPYF_KER_PROT
-
-INSERT_GENTPROT_BASIC0( dotxaxpyf_ker_name )
+#undef  GENTDEF
+#define GENTDEF( opname ) \
+\
+typedef void (*PASTECH(opname,_ukr_ft)) \
+     ( \
+       PASTECH(opname,_params), \
+       BLIS_AUXINFO_PARAM, \
+       BLIS_CNTX_PARAM  \
+     );
 
+GENTDEF( gemm )
+GENTDEF( gemmtrsm )
+GENTDEF( trsm )
 
-#undef  GENTPROT
-#define GENTPROT DOTXF_KER_PROT
 
-INSERT_GENTPROT_BASIC0( dotxf_ker_name )
+#endif
 
diff --git a/frame/3/bli_l3_ukr_oapi.c b/frame/3/bli_l3_ukr_oapi.c
index e500bab71..8494100fa 100644
--- a/frame/3/bli_l3_ukr_oapi.c
+++ b/frame/3/bli_l3_ukr_oapi.c
@@ -72,7 +72,7 @@ void PASTEMAC0(opname) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(tname,_ukr,_vft) f = \
+	PASTECH(tname,_ukr_ft) f = \
 	PASTEMAC(opname,_qfp)( dt ); \
 \
 	f \
@@ -137,7 +137,7 @@ void PASTEMAC0(opname) \
 	{ \
 		/* Query a type-specific function pointer, except one that uses
 		   void* for function arguments instead of typed pointers. */ \
-		PASTECH2(tname,_ukr,_vft) f = \
+		PASTECH(tname,_ukr_ft) f = \
 		PASTEMAC(opnamel,_qfp)( dt ); \
 \
 		f \
@@ -159,7 +159,7 @@ void PASTEMAC0(opname) \
 	{ \
 		/* Query a type-specific function pointer, except one that uses
 		   void* for function arguments instead of typed pointers. */ \
-		PASTECH2(tname,_ukr,_vft) f = \
+		PASTECH(tname,_ukr_ft) f = \
 		PASTEMAC(opnameu,_qfp)( dt ); \
 \
 		f \
@@ -216,7 +216,7 @@ void PASTEMAC0(opname) \
 	{ \
 		/* Query a type-specific function pointer, except one that uses
 		   void* for function arguments instead of typed pointers. */ \
-		PASTECH2(tname,_ukr,_vft) f = \
+		PASTECH(tname,_ukr_ft) f = \
 		PASTEMAC(opnamel,_qfp)( dt ); \
 \
 		f \
@@ -232,7 +232,7 @@ void PASTEMAC0(opname) \
 	{ \
 		/* Query a type-specific function pointer, except one that uses
 		   void* for function arguments instead of typed pointers. */ \
-		PASTECH2(tname,_ukr,_vft) f = \
+		PASTECH(tname,_ukr_ft) f = \
 		PASTEMAC(opnameu,_qfp)( dt ); \
 \
 		f \
diff --git a/frame/3/bli_l3_ukr_params.h b/frame/3/bli_l3_ukr_params.h
new file mode 100644
index 000000000..b55115998
--- /dev/null
+++ b/frame/3/bli_l3_ukr_params.h
@@ -0,0 +1,70 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_L3_UKR_PARAMS_H
+#define BLIS_L3_UKR_PARAMS_H
+
+
+#define gemm_params \
+\
+             dim_t  m, \
+             dim_t  n, \
+             dim_t  k, \
+       const void*  alpha, \
+       const void*  a, \
+       const void*  b, \
+       const void*  beta, \
+             void*  c, inc_t rs_c, inc_t cs_c
+
+#define gemmtrsm_params \
+\
+             dim_t  m, \
+             dim_t  n, \
+             dim_t  k, \
+       const void*  alpha, \
+       const void*  a1x, \
+       const void*  a11, \
+       const void*  bx1, \
+             void*  b11, \
+             void*  c11, inc_t rs_c, inc_t cs_c
+
+#define trsm_params \
+\
+       const void*  a, \
+             void*  b, \
+             void*  c, inc_t rs_c, inc_t cs_c
+
+
+#endif
+
diff --git a/frame/3/bli_l3_ukr_prot.h b/frame/3/bli_l3_ukr_prot.h
index 655d45a18..ea26ee7f4 100644
--- a/frame/3/bli_l3_ukr_prot.h
+++ b/frame/3/bli_l3_ukr_prot.h
@@ -32,56 +32,27 @@
 
 */
 
+#ifndef BLIS_L3_UKR_PROT_H
+#define BLIS_L3_UKR_PROT_H
+
 //
 // Define template prototypes for level-3 micro-kernels.
 //
 
-#define GEMM_UKR_PROT( ctype, ch, opname ) \
-        GEMM_UKR_PROT2( ctype, ctype, ch, opname )
-
-#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
+#undef  L3TPROT
+#define L3TPROT( ctype, ch, funcname, opname ) \
 \
-void PASTEMAC(ch,opname) \
+void PASTEMAC(ch,funcname) \
      ( \
-             dim_t      m, \
-             dim_t      n, \
-             dim_t      k, \
-       const ctype_out* alpha, \
-       const ctype_in*  a, \
-       const ctype_in*  b, \
-       const ctype_out* beta, \
-             ctype_out* c, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
-       const cntx_t*    cntx  \
+       PASTECH(opname,_params), \
+       BLIS_AUXINFO_PARAM, \
+       BLIS_CNTX_PARAM  \
      );
 
+#define GEMM_UKR_PROT(     ctype, ch, fn )  L3TPROT( ctype, ch, fn, gemm )
+#define GEMMTRSM_UKR_PROT( ctype, ch, fn )  L3TPROT( ctype, ch, fn, gemmtrsm )
+#define TRSM_UKR_PROT(     ctype, ch, fn )  L3TPROT( ctype, ch, fn, trsm )
 
-#define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             dim_t      m, \
-             dim_t      n, \
-             dim_t      k, \
-       const ctype*     alpha, \
-       const ctype*     a1x, \
-       const ctype*     a11, \
-       const ctype*     bx1, \
-             ctype*     b11, \
-             ctype*     c11, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
-       const cntx_t*    cntx  \
-     );
 
-
-#define TRSM_UKR_PROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-       const ctype*     a, \
-             ctype*     b, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
-       const cntx_t*    cntx  \
-     );
+#endif
 
diff --git a/frame/3/bli_l3_ukr_tapi.c b/frame/3/bli_l3_ukr_tapi.c
index 2145fb4c5..7ea68a9d2 100644
--- a/frame/3/bli_l3_ukr_tapi.c
+++ b/frame/3/bli_l3_ukr_tapi.c
@@ -57,7 +57,7 @@ void PASTEMAC(ch,opname) \
 \
 	/* Query the context for the function address of the current
 	   datatype's micro-kernel. */ \
-	PASTECH2(ch,tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the typed function for the given datatype. */ \
 	f \
@@ -102,7 +102,7 @@ void PASTEMAC(ch,opname) \
 \
 	/* Query the context for the function address of the current
 	   datatype's micro-kernel. */ \
-	PASTECH2(ch,tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the typed function for the given datatype. */ \
 	f \
@@ -143,7 +143,7 @@ void PASTEMAC(ch,opname) \
 \
 	/* Query the context for the function address of the current
 	   datatype's micro-kernel. */ \
-	PASTECH2(ch,tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the typed function for the given datatype. */ \
 	f \
diff --git a/frame/3/bli_l3_ukr_tapi.h b/frame/3/bli_l3_ukr_tapi.h
index dc4d22651..68335c731 100644
--- a/frame/3/bli_l3_ukr_tapi.h
+++ b/frame/3/bli_l3_ukr_tapi.h
@@ -37,20 +37,60 @@
 // Generate prototypes for level-3 micro-kernel wrappers.
 //
 
-#undef  gemm_ukr_name
-#define gemm_ukr_name       gemm_ukernel
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+\
+void PASTEMAC(ch,opname) \
+     ( \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a, \
+       const ctype*     b, \
+       const ctype*     beta, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
+     );
 
-#undef  gemmtrsm_l_ukr_name
-#define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel
-#undef  gemmtrsm_u_ukr_name
-#define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel
+INSERT_GENTPROT_BASIC0( gemm_ukernel )
 
-#undef  trsm_l_ukr_name
-#define trsm_l_ukr_name     trsm_l_ukernel
-#undef  trsm_u_ukr_name
-#define trsm_u_ukr_name     trsm_u_ukernel
 
-// Include the level-3 micro-kernel API template.
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+\
+void PASTEMAC(ch,opname) \
+     ( \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const ctype*     alpha, \
+       const ctype*     a1x, \
+       const ctype*     a11, \
+       const ctype*     bx1, \
+             ctype*     b11, \
+             ctype*     c11, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
+     );
 
-#include "bli_l3_ukr.h"
+INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukernel )
+INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukernel )
+
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+\
+void PASTEMAC(ch,opname) \
+     ( \
+       const ctype*     a, \
+             ctype*     b, \
+             ctype*     c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
+     );
+
+INSERT_GENTPROT_BASIC0( trsm_l_ukernel )
+INSERT_GENTPROT_BASIC0( trsm_u_ukernel )
 
diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c
index 69700e6c3..cc0a633e2 100644
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -179,14 +179,14 @@ void bli_gemm_ker_var2
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
 
 	// Query the params field from the obj_t. If it is non-NULL, grab the ukr
 	// field of the params struct. If that function pointer is non-NULL, use it
 	// as our microkernel instead of the default microkernel queried from the
 	// cntx above.
 	const gemm_ker_params_t* params = bli_obj_ker_params( c );
-	gemm_ukr_vft user_ukr = params ? params->ukr : NULL;
+	gemm_ukr_ft user_ukr = params ? params->ukr : NULL;
 	if ( user_ukr ) gemm_ukr = user_ukr;
 
 	// Temporary C buffer for edge cases. Note that the strides of this
diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.c b/frame/3/gemm/bli_gemm_md_c2r_ref.c
index 0fbb0cc49..4c77872f4 100644
--- a/frame/3/gemm/bli_gemm_md_c2r_ref.c
+++ b/frame/3/gemm/bli_gemm_md_c2r_ref.c
@@ -44,11 +44,11 @@ void PASTEMAC2(ch,opname,suf) \
              dim_t      m, \
              dim_t      n, \
              dim_t      k, \
-       const ctype*     alpha, \
-       const ctype*     a, \
-       const ctype*     b, \
-       const ctype*     beta, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
+       const void*      alpha, \
+       const void*      a, \
+       const void*      b, \
+       const void*      beta, \
+             void*      c, inc_t rs_c, inc_t cs_c, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
@@ -56,8 +56,7 @@ void PASTEMAC2(ch,opname,suf) \
 	const num_t       dt        = PASTEMAC(ch,type); \
 	const num_t       dt_r      = PASTEMAC(chr,type); \
 \
-	PASTECH(chr,gemm_ukr_ft) \
-	                  rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
+	      gemm_ukr_ft rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
 	const bool        col_pref  = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
 	const bool        row_pref  = !col_pref; \
 \
@@ -79,11 +78,11 @@ void PASTEMAC2(ch,opname,suf) \
 \
 	const ctype_r*    zero_r    = PASTEMAC(chr,0); \
 \
-	const ctype_r*    alpha_r   = &PASTEMAC(ch,real)( *alpha ); \
-	   /* ctype_r*    alpha_i   = &PASTEMAC(ch,imag)( *alpha ); */ \
+	const ctype_r*    alpha_r   = &PASTEMAC(ch,real)( *(( ctype* )alpha) ); \
+	   /* ctype_r*    alpha_i   = &PASTEMAC(ch,imag)( *(( ctype* )alpha) ); */ \
 \
-	const ctype_r*    beta_r    = &PASTEMAC(ch,real)( *beta ); \
-	const ctype_r*    beta_i    = &PASTEMAC(ch,imag)( *beta ); \
+	const ctype_r*    beta_r    = &PASTEMAC(ch,real)( *(( ctype* )beta) ); \
+	const ctype_r*    beta_i    = &PASTEMAC(ch,imag)( *(( ctype* )beta) ); \
 \
 	      bool        using_ct; \
 \
@@ -166,22 +165,28 @@ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \
 		); \
 \
 		/* Accumulate the final result in ct back to c. */ \
-		if ( PASTEMAC(ch,eq1)( *beta ) ) \
+		if ( PASTEMAC(ch,eq1)( *(( ctype* )beta) ) ) \
 		{ \
 			for ( dim_t j = 0; j < n; ++j ) \
 			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
-				PASTEMAC(ch,adds)( *(ct + i*rs_ct + j*cs_ct), \
-				                   *(c  + i*rs_c  + j*cs_c ) ); \
+				PASTEMAC(ch,adds) \
+				( \
+				  *(          ct + i*rs_ct + j*cs_ct), \
+				  *(( ctype* )c  + i*rs_c  + j*cs_c )  \
+				); \
 			} \
 		} \
-		else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+		else if ( PASTEMAC(ch,eq0)( *(( ctype* )beta )) ) \
 		{ \
 			for ( dim_t j = 0; j < n; ++j ) \
 			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
-				PASTEMAC(ch,copys)( *(ct + i*rs_ct + j*cs_ct), \
-				                    *(c  + i*rs_c  + j*cs_c ) ); \
+				PASTEMAC(ch,copys) \
+				( \
+				  *(          ct + i*rs_ct + j*cs_ct), \
+				  *(( ctype* )c  + i*rs_c  + j*cs_c )  \
+				); \
 			} \
 		} \
 		else \
@@ -189,9 +194,12 @@ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \
 			for ( dim_t j = 0; j < n; ++j ) \
 			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
-				PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \
-				                    *beta, \
-				                    *(c  + i*rs_c  + j*cs_c ) ); \
+				PASTEMAC(ch,xpbys) \
+				( \
+				  *(          ct + i*rs_ct + j*cs_ct), \
+				  *(( ctype* )beta                  ), \
+				  *(( ctype* )c  + i*rs_c  + j*cs_c )  \
+				); \
 			} \
 		} \
 	} \
diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.h b/frame/3/gemm/bli_gemm_md_c2r_ref.h
index 43dc20ac2..c3a1dc8b1 100644
--- a/frame/3/gemm/bli_gemm_md_c2r_ref.h
+++ b/frame/3/gemm/bli_gemm_md_c2r_ref.h
@@ -34,8 +34,38 @@
 
 // -- Level-3 native micro-kernel prototype redefinitions ----------------------
 
-#undef  gemm_ukr_name
-#define gemm_ukr_name   gemm_md_c2r_ref
+#ifdef BLIS_ENABLE_GEMM_MD
 
-// Include the native micro-kernel API template.
-#include "bli_l3_ukr.h"
+#if 0
+#undef  GENTFUNCCO
+#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, suf ) \
+\
+void PASTEMAC2(ch,opname,suf) \
+     ( \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const void*      alpha, \
+       const void*      a, \
+       const void*      b, \
+       const void*      beta, \
+             void*      c, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
+     )
+#endif
+
+#undef  GENTPROTCO
+#define GENTPROTCO( ctype, ctype_r, ch, chr, funcname, opname ) \
+\
+void PASTEMAC(ch,funcname) \
+     ( \
+       PASTECH(opname,_params), \
+       BLIS_AUXINFO_PARAM, \
+       BLIS_CNTX_PARAM  \
+     );
+
+INSERT_GENTPROTCO_BASIC( gemm_md_c2r_ref, gemm )
+
+
+#endif
diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h
index f69327db0..36500cb6a 100644
--- a/frame/3/gemm/bli_gemm_var.h
+++ b/frame/3/gemm/bli_gemm_var.h
@@ -40,7 +40,7 @@
 
 typedef struct
 {
-	gemm_ukr_vft ukr;
+	gemm_ukr_ft ukr;
 } gemm_ker_params_t;
 
 
diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
index ecf0265a1..3fcace7f8 100644
--- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
@@ -135,7 +135,7 @@ void bli_gemmt_l_ker_var2
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_vft    gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft     gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
 	xpbys_mxn_l_vft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt_exec ];
 
 	// Temporary C buffer for edge cases. Note that the strides of this
diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2b.c b/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
index d7af21d19..b580bcaf8 100644
--- a/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
@@ -135,7 +135,7 @@ void bli_gemmt_l_ker_var2b
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_vft    gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft     gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
 	xpbys_mxn_l_vft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt_exec ];
 
 	// Temporary C buffer for edge cases. Note that the strides of this
diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
index 7d396555d..01843b28d 100644
--- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
@@ -135,7 +135,7 @@ void bli_gemmt_u_ker_var2
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_vft    gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft     gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
 	xpbys_mxn_u_vft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt_exec ];
 
 	// Temporary C buffer for edge cases. Note that the strides of this
diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2b.c b/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
index e1f939c43..99139b309 100644
--- a/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
@@ -135,7 +135,7 @@ void bli_gemmt_u_ker_var2b
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_vft    gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft     gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
 	xpbys_mxn_u_vft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt_exec ];
 
 	// Temporary C buffer for edge cases. Note that the strides of this
diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c
index 0c5cde72c..43998f8be 100644
--- a/frame/3/trmm/bli_trmm_ll_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c
@@ -90,7 +90,7 @@ void bli_trmm_ll_ker_var2
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_vft gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
 
 	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
 	const char* a_cast     = buf_a;
diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2b.c b/frame/3/trmm/bli_trmm_ll_ker_var2b.c
index bb6de00f5..4bc7d2fa0 100644
--- a/frame/3/trmm/bli_trmm_ll_ker_var2b.c
+++ b/frame/3/trmm/bli_trmm_ll_ker_var2b.c
@@ -90,7 +90,7 @@ void bli_trmm_ll_ker_var2b
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_vft gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
 
 	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
 	const char* a_cast     = buf_a;
diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c
index 039bcc292..969f06941 100644
--- a/frame/3/trmm/bli_trmm_lu_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c
@@ -90,7 +90,7 @@ void bli_trmm_lu_ker_var2
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
 
 	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
 	const char* a_cast     = buf_a;
diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2b.c b/frame/3/trmm/bli_trmm_lu_ker_var2b.c
index 39640ad6b..1c1714c8b 100644
--- a/frame/3/trmm/bli_trmm_lu_ker_var2b.c
+++ b/frame/3/trmm/bli_trmm_lu_ker_var2b.c
@@ -90,7 +90,7 @@ void bli_trmm_lu_ker_var2b
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
 
 	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
 	const char* a_cast     = buf_a;
diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c
index f8d0fc6c8..a49c1949b 100644
--- a/frame/3/trmm/bli_trmm_rl_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c
@@ -90,7 +90,7 @@ void bli_trmm_rl_ker_var2
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
 
 	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
 	const char* a_cast     = buf_a;
diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2b.c b/frame/3/trmm/bli_trmm_rl_ker_var2b.c
index 7f2757c3a..ab0a126bd 100644
--- a/frame/3/trmm/bli_trmm_rl_ker_var2b.c
+++ b/frame/3/trmm/bli_trmm_rl_ker_var2b.c
@@ -90,7 +90,7 @@ void bli_trmm_rl_ker_var2b
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
 
 	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
 	const char* a_cast     = buf_a;
diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c
index a031b6794..f8db83db2 100644
--- a/frame/3/trmm/bli_trmm_ru_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c
@@ -90,7 +90,7 @@ void bli_trmm_ru_ker_var2
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
 
 	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
 	const char* a_cast     = buf_a;
diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2b.c b/frame/3/trmm/bli_trmm_ru_ker_var2b.c
index 8aae2386a..8d8d3eea2 100644
--- a/frame/3/trmm/bli_trmm_ru_ker_var2b.c
+++ b/frame/3/trmm/bli_trmm_ru_ker_var2b.c
@@ -90,7 +90,7 @@ void bli_trmm_ru_ker_var2b
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
 
 	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
 	const char* a_cast     = buf_a;
diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c
index 786e4f343..028f02139 100644
--- a/frame/3/trsm/bli_trsm_ll_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c
@@ -94,8 +94,8 @@ void bli_trsm_ll_ker_var2
 	const dim_t PACKNR = rs_b;
 
 	// Cast the micro-kernel address to its function pointer type.
-	gemmtrsm_ukr_vft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx );
-	gemm_ukr_vft     gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemmtrsm_ukr_ft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx );
+	gemm_ukr_ft     gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
 
 	const void* minus_one   = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE );
 	const char* a_cast      = buf_a;
diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c
index ebf44905b..72f97b11e 100644
--- a/frame/3/trsm/bli_trsm_lu_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c
@@ -94,8 +94,8 @@ void bli_trsm_lu_ker_var2
 	const dim_t     PACKNR      = rs_b;
 
 	// Cast the micro-kernel address to its function pointer type.
-	gemmtrsm_ukr_vft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx );
-	gemm_ukr_vft     gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemmtrsm_ukr_ft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx );
+	gemm_ukr_ft     gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
 
 	const void* minus_one   = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE );
 	const char* a_cast      = buf_a;
diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c
index 073fe3ec0..d4b93c7c4 100644
--- a/frame/3/trsm/bli_trsm_rl_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c
@@ -99,8 +99,8 @@ void bli_trsm_rl_ker_var2
 	// triangular), it becomes upper-triangular after the kernel operation
 	// is transposed so that all kernel instances are of the "left"
 	// variety (since those are the only trsm ukernels that exist).
-	gemmtrsm_ukr_vft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx );
-	gemm_ukr_vft     gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemmtrsm_ukr_ft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx );
+	gemm_ukr_ft     gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
 
 	const void* minus_one   = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE );
 	const char* a_cast      = buf_a;
diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c
index a05e94494..ae82b1ee0 100644
--- a/frame/3/trsm/bli_trsm_ru_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c
@@ -99,8 +99,8 @@ void bli_trsm_ru_ker_var2
 	// triangular), it becomes lower-triangular after the kernel operation
 	// is transposed so that all kernel instances are of the "left"
 	// variety (since those are the only trsm ukernels that exist).
-	gemmtrsm_ukr_vft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx );
-	gemm_ukr_vft     gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemmtrsm_ukr_ft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx );
+	gemm_ukr_ft     gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
 
 	const void* minus_one   = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE );
 	const char* a_cast      = buf_a;
diff --git a/frame/include/bli_pre_ker_params.h b/frame/include/bli_pre_ker_params.h
new file mode 100644
index 000000000..6bd039b7e
--- /dev/null
+++ b/frame/include/bli_pre_ker_params.h
@@ -0,0 +1,45 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_PRE_KER_PARAMS_H
+#define BLIS_PRE_KER_PARAMS_H
+
+// These macros spell the trailing auxinfo_t and cntx_t kernel parameters once,
+// so bli_*_ker_prot.h and bli_*_ker_ft.h pick up any future change to them.
+
+#define BLIS_AUXINFO_PARAM        auxinfo_t* data
+#define BLIS_CNTX_PARAM     const cntx_t*    cntx
+
+
+#endif
diff --git a/frame/include/blis.h b/frame/include/blis.h
index 70005e57d..d87018d00 100644
--- a/frame/include/blis.h
+++ b/frame/include/blis.h
@@ -97,6 +97,13 @@ extern "C" {
 
 // -- BLIS architecture/kernel definitions --
 
+#include "bli_pre_ker_params.h"
+#include "bli_l1v_ker_params.h"
+#include "bli_l1f_ker_params.h"
+#include "bli_l1m_ker_params.h"
+#include "bli_l3_ukr_params.h"
+#include "bli_l3_sup_ker_params.h"
+
 #include "bli_l1v_ker_prot.h"
 #include "bli_l1f_ker_prot.h"
 #include "bli_l1m_ker_prot.h"
diff --git a/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c b/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c
index fc6755ae5..a6a288613 100644
--- a/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c
@@ -49,9 +49,9 @@ void bli_dpackm_armsve256_int_8xk
              dim_t   cdim_,
              dim_t   n_,
              dim_t   n_max_,
-       const double* kappa,
-       const double* a, inc_t inca_, inc_t lda_,
-             double* p,              inc_t ldp_,
+       const void*   kappa,
+       const void*   a, inc_t inca_, inc_t lda_,
+             void*   p,              inc_t ldp_,
        const cntx_t* cntx
      )
 {
@@ -63,13 +63,13 @@ void bli_dpackm_armsve256_int_8xk
     const int64_t lda   = lda_;
     const int64_t ldp   = ldp_;
 
-    double* restrict alpha1     = a;
-    double* restrict alpha1_4   = alpha1 + 4 * inca;
-    double* restrict pi1        = p;
-    const   svbool_t all_active = svptrue_b64();
-    svfloat64_t      z_a0;
-    svfloat64_t      z_a4;
-    svuint64_t       z_index;
+    const double* restrict alpha1     = a;
+    const double* restrict alpha1_4   = alpha1 + 4 * inca;
+          double* restrict pi1        = p;
+    const svbool_t         all_active = svptrue_b64();
+    svfloat64_t            z_a0;
+    svfloat64_t            z_a4;
+    svuint64_t             z_index;
 
     // creating index for gather/scatter
     //   with each element as: 0, 1*inca, 2*inca, 3*inca
@@ -77,7 +77,7 @@ void bli_dpackm_armsve256_int_8xk
 
     if ( cdim == mnr )
     {
-        if ( bli_deq1( *kappa ) )
+        if ( bli_deq1( *(( double* )kappa) ) )
         {
             if ( inca == 1 )  // continous memory. packA style
             {
@@ -129,7 +129,7 @@ void bli_dpackm_armsve256_int_8xk
             // load kappa into vector
             svfloat64_t z_kappa;
 
-            z_kappa = svdup_f64( *kappa );
+            z_kappa = svdup_f64( *(( double* )kappa) );
 
             if ( inca == 1 )  // continous memory. packA style
             {
@@ -201,7 +201,7 @@ void bli_dpackm_armsve256_int_8xk
             const dim_t      i      = cdim;
             const dim_t      m_edge = mnr - i;
             const dim_t      n_edge = n_max;
-            double* restrict p_edge = p + (i  )*1;
+            double* restrict p_edge = ( double* )p + (i  )*1;
 
             bli_dset0s_mxn
             (
@@ -217,7 +217,7 @@ void bli_dpackm_armsve256_int_8xk
         const dim_t      j      = n;
         const dim_t      m_edge = mnr;
         const dim_t      n_edge = n_max - j;
-        double* restrict p_edge = p + (j  )*ldp;
+        double* restrict p_edge = ( double* )p + (j  )*ldp;
 
         bli_dset0s_mxn
         (
diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
index b17aacfbb..61bd7734a 100644
--- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
@@ -48,9 +48,9 @@ void bli_dpackm_armsve512_asm_10xk
              dim_t   cdim_,
              dim_t   n_,
              dim_t   n_max_,
-       const double* kappa,
-       const double* a, inc_t inca_, inc_t lda_,
-             double* p,              inc_t ldp_,
+       const void*   kappa,
+       const void*   a, inc_t inca_, inc_t lda_,
+             void*   p,              inc_t ldp_,
        const cntx_t* cntx
      )
 {
@@ -62,7 +62,7 @@ void bli_dpackm_armsve512_asm_10xk
     const int64_t lda   = lda_;
     const int64_t ldp   = ldp_;
     const bool    gs    = inca != 1 && lda != 1;
-    const bool    unitk = bli_deq1( *kappa );
+    const bool    unitk = bli_deq1( *(( double* )kappa) );
 
 #ifdef _A64FX
     {
@@ -337,7 +337,7 @@ void bli_dpackm_armsve512_asm_10xk
             const dim_t      i      = cdim;
             const dim_t      m_edge = mnr - i;
             const dim_t      n_edge = n_max;
-            double* restrict p_edge = p + (i  )*1;
+            double* restrict p_edge = ( double* )p + (i  )*1;
 
             bli_dset0s_mxn
             (
@@ -353,7 +353,7 @@ void bli_dpackm_armsve512_asm_10xk
         const dim_t      j      = n;
         const dim_t      m_edge = mnr;
         const dim_t      n_edge = n_max - j;
-        double* restrict p_edge = p + (j  )*ldp;
+        double* restrict p_edge = ( double* )p + (j  )*ldp;
 
         bli_dset0s_mxn
         (
diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
index 370b400d2..b637f8c80 100644
--- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
@@ -47,9 +47,9 @@ void bli_dpackm_armsve512_asm_16xk
              dim_t   cdim_,
              dim_t   n_,
              dim_t   n_max_,
-       const double* kappa,
-       const double* a, inc_t inca_, inc_t lda_,
-             double* p,              inc_t ldp_,
+       const void*   kappa,
+       const void*   a, inc_t inca_, inc_t lda_,
+             void*   p,              inc_t ldp_,
        const cntx_t* cntx
      )
 {
@@ -61,7 +61,7 @@ void bli_dpackm_armsve512_asm_16xk
     const int64_t lda   = lda_;
     const int64_t ldp   = ldp_;
     const bool    gs    = inca != 1 && lda != 1;
-    const bool    unitk = bli_deq1( *kappa );
+    const bool    unitk = bli_deq1( *(( double* )kappa) );
 
 #ifdef _A64FX
     {
@@ -335,7 +335,7 @@ void bli_dpackm_armsve512_asm_16xk
             const dim_t      i      = cdim;
             const dim_t      m_edge = mnr - i;
             const dim_t      n_edge = n_max;
-            double* restrict p_edge = p + (i  )*1;
+            double* restrict p_edge = ( double* )p + (i  )*1;
 
             bli_dset0s_mxn
             (
@@ -351,7 +351,7 @@ void bli_dpackm_armsve512_asm_16xk
         const dim_t      j      = n;
         const dim_t      m_edge = mnr;
         const dim_t      n_edge = n_max - j;
-        double* restrict p_edge = p + (j  )*ldp;
+        double* restrict p_edge = ( double* )p + (j  )*ldp;
 
         bli_dset0s_mxn
         (
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
index 452fbaeef..7219d19c4 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
@@ -48,11 +48,11 @@ void bli_cgemm_armsve_asm_2vx10_unindexed
              dim_t      m, \
              dim_t      n, \
              dim_t      k, \
-       const scomplex*  alpha, \
-       const scomplex*  a, \
-       const scomplex*  b, \
-       const scomplex*  beta, \
-             scomplex*  c, inc_t rs_c0, inc_t cs_c0, \
+       const void*      alpha, \
+       const void*      a, \
+       const void*      b, \
+       const void*      beta, \
+             void*      c, inc_t rs_c0, inc_t cs_c0, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      )
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
index 0226a252d..505d3b4b7 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
@@ -48,11 +48,11 @@ void bli_dgemm_armsve_asm_2vx10_unindexed
              dim_t      m, \
              dim_t      n, \
              dim_t      k, \
-       const double*    alpha, \
-       const double*    a, \
-       const double*    b, \
-       const double*    beta, \
-             double*    c, inc_t rs_c0, inc_t cs_c0, \
+       const void*      alpha, \
+       const void*      a, \
+       const void*      b, \
+       const void*      beta, \
+             void*      c, inc_t rs_c0, inc_t cs_c0, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      )
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
index ede6e170a..88d7eb4bd 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
@@ -48,11 +48,11 @@ void bli_sgemm_armsve_asm_2vx10_unindexed
              dim_t      m, \
              dim_t      n, \
              dim_t      k, \
-       const float*     alpha, \
-       const float*     a, \
-       const float*     b, \
-       const float*     beta, \
-             float*     c, inc_t rs_c0, inc_t cs_c0, \
+       const void*      alpha, \
+       const void*      a, \
+       const void*      b, \
+       const void*      beta, \
+             void*      c, inc_t rs_c0, inc_t cs_c0, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      )
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
index 0af877cc9..812e92e20 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
@@ -48,11 +48,11 @@ void bli_zgemm_armsve_asm_2vx10_unindexed
              dim_t      m, \
              dim_t      n, \
              dim_t      k, \
-       const dcomplex*  alpha, \
-       const dcomplex*  a, \
-       const dcomplex*  b, \
-       const dcomplex*  beta, \
-             dcomplex*  c, inc_t rs_c0, inc_t cs_c0, \
+       const void*      alpha, \
+       const void*      a, \
+       const void*      b, \
+       const void*      beta, \
+             void*      c, inc_t rs_c0, inc_t cs_c0, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      )
diff --git a/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c
index 4bfd77d56..50d0dfcf3 100644
--- a/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c
+++ b/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c
@@ -62,11 +62,11 @@ void bli_sgemm_armv7a_asm_4x4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const float*     alpha,
-       const float*     a,
-       const float*     b,
-       const float*     beta,
-             float*     c, inc_t rs_c, inc_t cs_c,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c, inc_t cs_c,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -84,11 +84,11 @@ void bli_dgemm_armv7a_asm_4x4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c, inc_t cs_c,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c, inc_t cs_c,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -106,11 +106,11 @@ void bli_cgemm_armv7a_asm_2x2
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const scomplex*  alpha,
-       const scomplex*  a,
-       const scomplex*  b,
-       const scomplex*  beta,
-             scomplex*  c, inc_t rs_c, inc_t cs_c,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c, inc_t cs_c,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -127,11 +127,11 @@ void bli_zgemm_armv7a_asm_2x2
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const dcomplex*  alpha,
-       const dcomplex*  a,
-       const dcomplex*  b,
-       const dcomplex*  beta,
-             dcomplex*  c, inc_t rs_c, inc_t cs_c,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c, inc_t cs_c,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
index 7678e2c59..b37d85399 100644
--- a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
+++ b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
@@ -40,15 +40,21 @@ void bli_sgemm_armv7a_int_4x4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const float*     alpha,
-       const float*     a,
-       const float*     b,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha0,
+       const void*      a0,
+       const void*      b0,
+       const void*      beta0,
+             void*      c0, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
+	const float* alpha = alpha0;
+	const float* a     = a0;
+	const float* b     = b0;
+	const float* beta  = beta0;
+	      float* c     = c0;
+
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
 	uint32_t k_iter = k / 4;
@@ -59,8 +65,8 @@ void bli_sgemm_armv7a_int_4x4
 
     GEMM_UKR_SETUP_CT( s, 4, 4, false );
 
-	void* a_next = bli_auxinfo_next_a( data );
-	void* b_next = bli_auxinfo_next_b( data );
+	const void* a_next = bli_auxinfo_next_a( data );
+	const void* b_next = bli_auxinfo_next_b( data );
 
 	float32x4_t alphav;
 	alphav = vmovq_n_f32( *alpha );
@@ -246,15 +252,21 @@ void bli_dgemm_armv7a_int_4x4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha_,
+       const void*      a_,
+       const void*      b_,
+       const void*      beta_,
+             void*      c_, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
+	const double* alpha = alpha_;
+	const double* a     = a_;
+	const double* b     = b_;
+	const double* beta  = beta_;
+	      double* c     = c_;
+
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
 	//uint32_t k_iter = k0 / 4;
diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c
index 90b8b9b2b..7ceaa726a 100644
--- a/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c
+++ b/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c
@@ -54,9 +54,9 @@ void bli_dpackm_armv8a_int_6xk
              dim_t   cdim0,
              dim_t   k0,
              dim_t   k0_max,
-       const double* kappa,
-       const double* a, inc_t inca0, inc_t lda0,
-             double* p,              inc_t ldp0,
+       const void*   kappa,
+       const void*   a, inc_t inca0, inc_t lda0,
+             void*   p,              inc_t ldp0,
        const cntx_t* cntx
      )
 {
@@ -87,7 +87,7 @@ void bli_dpackm_armv8a_int_6xk
 
   // NOTE: If/when this kernel ever supports scaling by kappa within the
   // assembly region, this constraint should be lifted.
-  const bool     unitk  = bli_deq1( *kappa );
+  const bool     unitk  = bli_deq1( *(( double* )kappa) );
 
 
   // -------------------------------------------------------------------------
@@ -291,7 +291,7 @@ void bli_dpackm_armv8a_int_6xk
       const dim_t      i      = cdim0;
       const dim_t      m_edge = mnr - cdim0;
       const dim_t      n_edge = k0_max;
-      double* restrict p_edge = p + (i  )*1;
+      double* restrict p_edge = ( double* )p + (i  )*1;
 
       bli_dset0s_mxn
       (
@@ -311,7 +311,7 @@ void bli_dpackm_armv8a_int_6xk
     const dim_t      j      = k0;
     const dim_t      m_edge = mnr;
     const dim_t      n_edge = k0_max - k0;
-    double* restrict p_edge = p + (j  )*ldp;
+    double* restrict p_edge = ( double* )p + (j  )*ldp;
 
     bli_dset0s_mxn
     (
diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c
index ae9e090e2..b177c7b28 100644
--- a/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c
+++ b/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c
@@ -54,9 +54,9 @@ void bli_dpackm_armv8a_int_8xk
              dim_t   cdim0,
              dim_t   k0,
              dim_t   k0_max,
-       const double* kappa,
-       const double* a, inc_t inca0, inc_t lda0,
-             double* p,              inc_t ldp0,
+       const void*   kappa,
+       const void*   a, inc_t inca0, inc_t lda0,
+             void*   p,              inc_t ldp0,
        const cntx_t* cntx
      )
 {
@@ -87,7 +87,7 @@ void bli_dpackm_armv8a_int_8xk
 
   // NOTE: If/when this kernel ever supports scaling by kappa within the
   // assembly region, this constraint should be lifted.
-  const bool     unitk  = bli_deq1( *kappa );
+  const bool     unitk  = bli_deq1( *(( double* )kappa) );
 
 
   // -------------------------------------------------------------------------
@@ -321,7 +321,7 @@ void bli_dpackm_armv8a_int_8xk
       const dim_t      i      = cdim0;
       const dim_t      m_edge = mnr - cdim0;
       const dim_t      n_edge = k0_max;
-      double* restrict p_edge = p + (i  )*1;
+      double* restrict p_edge = ( double* )p + (i  )*1;
 
       bli_dset0s_mxn
       (
@@ -341,7 +341,7 @@ void bli_dpackm_armv8a_int_8xk
     const dim_t      j      = k0;
     const dim_t      m_edge = mnr;
     const dim_t      n_edge = k0_max - k0;
-    double* restrict p_edge = p + (j  )*ldp;
+    double* restrict p_edge = ( double* )p + (j  )*ldp;
 
     bli_dset0s_mxn
     (
diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c
index f4a793db0..3b6b38181 100644
--- a/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c
+++ b/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c
@@ -54,9 +54,9 @@ void bli_spackm_armv8a_int_12xk
              dim_t   cdim0,
              dim_t   k0,
              dim_t   k0_max,
-       const float*  kappa,
-       const float*  a, inc_t inca0, inc_t lda0,
-             float*  p,              inc_t ldp0,
+       const void*   kappa,
+       const void*   a, inc_t inca0, inc_t lda0,
+             void*   p,              inc_t ldp0,
        const cntx_t* cntx
      )
 {
@@ -87,7 +87,7 @@ void bli_spackm_armv8a_int_12xk
 
   // NOTE: If/when this kernel ever supports scaling by kappa within the
   // assembly region, this constraint should be lifted.
-  const bool     unitk  = bli_seq1( *kappa );
+  const bool     unitk  = bli_seq1( *(( float* )kappa) );
 
 
   // -------------------------------------------------------------------------
@@ -405,7 +405,7 @@ void bli_spackm_armv8a_int_12xk
       const dim_t     i      = cdim0;
       const dim_t     m_edge = mnr - cdim0;
       const dim_t     n_edge = k0_max;
-      float* restrict p_edge = p + (i  )*1;
+      float* restrict p_edge = ( float* )p + (i  )*1;
 
       bli_sset0s_mxn
       (
@@ -423,7 +423,7 @@ void bli_spackm_armv8a_int_12xk
     const dim_t     j      = k0;
     const dim_t     m_edge = mnr;
     const dim_t     n_edge = k0_max - k0;
-    float* restrict p_edge = p + (j  )*ldp;
+    float* restrict p_edge = ( float* )p + (j  )*ldp;
 
     bli_sset0s_mxn
     (
diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c
index 2fd1ec9d3..c0d31b35d 100644
--- a/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c
+++ b/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c
@@ -54,9 +54,9 @@ void bli_spackm_armv8a_int_8xk
              dim_t   cdim0,
              dim_t   k0,
              dim_t   k0_max,
-       const float*  kappa,
-       const float*  a, inc_t inca0, inc_t lda0,
-             float*  p,              inc_t ldp0,
+       const void*   kappa,
+       const void*   a, inc_t inca0, inc_t lda0,
+             void*   p,              inc_t ldp0,
        const cntx_t* cntx
      )
 {
@@ -87,7 +87,7 @@ void bli_spackm_armv8a_int_8xk
 
   // NOTE: If/when this kernel ever supports scaling by kappa within the
   // assembly region, this constraint should be lifted.
-  const bool     unitk  = bli_seq1( *kappa );
+  const bool     unitk  = bli_seq1( *(( float* )kappa) );
 
 
   // -------------------------------------------------------------------------
@@ -343,7 +343,7 @@ void bli_spackm_armv8a_int_8xk
       const dim_t     i      = cdim0;
       const dim_t     m_edge = mnr - cdim0;
       const dim_t     n_edge = k0_max;
-      float* restrict p_edge = p + (i  )*1;
+      float* restrict p_edge = ( float* )p + (i  )*1;
 
       bli_sset0s_mxn
       (
@@ -361,7 +361,7 @@ void bli_spackm_armv8a_int_8xk
     const dim_t     j      = k0;
     const dim_t     m_edge = mnr;
     const dim_t     n_edge = k0_max - k0;
-    float* restrict p_edge = p + (j  )*ldp;
+    float* restrict p_edge = ( float* )p + (j  )*ldp;
 
     bli_sset0s_mxn
     (
diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
index c4970ebb5..3e00df345 100644
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
@@ -70,11 +70,11 @@ void bli_sgemm_armv8a_asm_8x12
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const float*     alpha,
-       const float*     a,
-       const float*     b,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -793,11 +793,11 @@ void bli_dgemm_armv8a_asm_6x8
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -816,7 +816,6 @@ void bli_dgemm_armv8a_asm_6x8
 
 #endif
 
-
 	const void* a_next = bli_auxinfo_next_a( data );
 	const void* b_next = bli_auxinfo_next_b( data );
 
diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c
index 0737c7719..f80b03ed6 100644
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c
@@ -137,11 +137,11 @@ void bli_sgemm_armv8a_asm_12x8r
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const float*     alpha,
-       const float*     a,
-       const float*     b,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -382,11 +382,11 @@ void bli_dgemm_armv8a_asm_8x6r
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c
index 9af32439d..8ff5c1754 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c
@@ -117,11 +117,11 @@ void bli_dgemmsup_rd_armv8a_inline_3x4m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -136,8 +136,8 @@ void bli_dgemmsup_rd_armv8a_inline_3x4m
       alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
       beta, c, rs_c0, cs_c0, data, cntx
     );
-    a += 3 * rs_a0;
-    c += 3 * rs_c0;
+    a = ( double* )a + 3 * rs_a0;
+    c = ( double* )c + 3 * rs_c0;
   }
 
   if ( m0 > 0 )
@@ -159,11 +159,11 @@ void bli_dgemmsup_rd_armv8a_inline_3xcm
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -179,8 +179,8 @@ void bli_dgemmsup_rd_armv8a_inline_3xcm
       beta, c, rs_c0, cs_c0, data, cntx
     );
 
-    a += 3 * rs_a0;
-    c += 3 * rs_c0;
+    a = ( double* )a + 3 * rs_a0;
+    c = ( double* )c + 3 * rs_c0;
   }
 }
 
@@ -192,11 +192,11 @@ void bli_dgemmsup_rd_armv8a_asm_6x8m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -206,10 +206,10 @@ void bli_dgemmsup_rd_armv8a_asm_6x8m
     assert( n0 <= 13 );
 
     // Manual separation.
-    dgemmsup_ker_ft ker_fp1 = NULL;
-    dgemmsup_ker_ft ker_fp2 = NULL;
-    dgemmsup_ker_ft ker_fp3 = NULL;
-    dim_t           nr1, nr2, nr3;
+    gemmsup_ker_ft ker_fp1 = NULL;
+    gemmsup_ker_ft ker_fp2 = NULL;
+    gemmsup_ker_ft ker_fp3 = NULL;
+    dim_t          nr1, nr2, nr3;
 
     switch ( n0 )
     {
@@ -250,8 +250,8 @@ void bli_dgemmsup_rd_armv8a_asm_6x8m
       alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
       beta, c, rs_c0, cs_c0, data, cntx
     );
-    b += nr1 * cs_b0;
-    c += nr1 * cs_c0;
+    b = ( double* )b + nr1 * cs_b0;
+    c = ( double* )c + nr1 * cs_c0;
     if ( ker_fp2 )
     {
       ker_fp2
@@ -260,8 +260,8 @@ void bli_dgemmsup_rd_armv8a_asm_6x8m
         alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
         beta, c, rs_c0, cs_c0, data, cntx
       );
-      b += nr2 * cs_b0;
-      c += nr2 * cs_c0;
+      b = ( double* )b + nr2 * cs_b0;
+      c = ( double* )c + nr2 * cs_c0;
     }
     if ( ker_fp3 )
       ker_fp3
@@ -582,8 +582,8 @@ LABEL(END_EXEC)
   // TODO: Implement optimized kernel for this.
   //
   // Forward address.
-  a = a + m_iter * 3 * rs_a;
-  c = c + m_iter * 3 * rs_c;
+  a = ( double* )a + m_iter * 3 * rs_a;
+  c = ( double* )c + m_iter * 3 * rs_c;
   for ( ; m_left > 0; m_left -= 2 )
   {
     dim_t m_loc = ( m_left < 2 ) ? m_left : 2;
@@ -594,8 +594,8 @@ LABEL(END_EXEC)
       alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
       beta, c, rs_c0, cs_c0, data, cntx
     );
-    a += 2 * rs_a0;
-    c += 2 * rs_c0;
+    a = ( double* )a + 2 * rs_a0;
+    c = ( double* )c + 2 * rs_c0;
   }
 }
 
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c
index 51b6f75c0..fc40bd591 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c
@@ -110,11 +110,11 @@ void bli_dgemmsup_rd_armv8a_inline_4x8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -137,8 +137,8 @@ void bli_dgemmsup_rd_armv8a_inline_4x8n
-      alpha, a + 2 * rs_a0, rs_a0, cs_a0, b, rs_b0, cs_b0,
-      beta, c + 2 * rs_c0, rs_c0, cs_c0, data, cntx
+      alpha, ( double* )a + 2 * rs_a0, rs_a0, cs_a0, b, rs_b0, cs_b0,
+      beta, ( double* )c + 2 * rs_c0, rs_c0, cs_c0, data, cntx
     );
-    b += 8 * cs_b0;
-    c += 8 * cs_c0;
+    b = ( double* )b + 8 * cs_b0;
+    c = ( double* )c + 8 * cs_c0;
   }
 }
 
@@ -150,11 +150,11 @@ void bli_dgemmsup_rd_armv8a_inline_3x8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -169,8 +169,8 @@ void bli_dgemmsup_rd_armv8a_inline_3x8n
       alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
       beta, c, rs_c0, cs_c0, data, cntx
     );
-    b += 4 * cs_b0;
-    c += 4 * cs_c0;
+    b = ( double* )b + 4 * cs_b0;
+    c = ( double* )c + 4 * cs_c0;
   }
   if ( n0 > 0 )
   {
@@ -191,11 +191,11 @@ void bli_dgemmsup_rd_armv8a_inline_rx8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -211,8 +211,8 @@ void bli_dgemmsup_rd_armv8a_inline_rx8n
       alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
       beta, c, rs_c0, cs_c0, data, cntx
     );
-    b += 8 * cs_b0;
-    c += 8 * cs_c0;
+    b = ( double* )b + 8 * cs_b0;
+    c = ( double* )c + 8 * cs_c0;
   }
 }
 
@@ -224,11 +224,11 @@ void bli_dgemmsup_rd_armv8a_asm_6x8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -238,9 +238,9 @@ void bli_dgemmsup_rd_armv8a_asm_6x8n
     assert( m0 <= 9 );
 
     // Manual separation.
-    dgemmsup_ker_ft ker_fp1 = NULL;
-    dgemmsup_ker_ft ker_fp2 = NULL;
-    dim_t           mr1, mr2;
+    gemmsup_ker_ft ker_fp1 = NULL;
+    gemmsup_ker_ft ker_fp2 = NULL;
+    dim_t          mr1, mr2;
 
     switch ( m0 )
     {
@@ -270,8 +270,8 @@ void bli_dgemmsup_rd_armv8a_asm_6x8n
       alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
       beta, c, rs_c0, cs_c0, data, cntx
     );
-    a += mr1 * rs_a0;
-    c += mr1 * rs_c0;
+    a = ( double* )a + mr1 * rs_a0;
+    c = ( double* )c + mr1 * rs_c0;
     if ( ker_fp2 )
       ker_fp2
       (
@@ -596,8 +596,8 @@ LABEL(END_EXEC)
   // TODO: Implement optimized kernel for this.
   //
   // Forward address.
-  b = b + n_iter * 4 * cs_b;
-  c = c + n_iter * 4 * cs_c;
+  b = ( double* )b + n_iter * 4 * cs_b;
+  c = ( double* )c + n_iter * 4 * cs_c;
   if ( n_left >= 3 )
   {
     bli_dgemmsup_rd_armv8a_asm_6x3
@@ -606,8 +606,8 @@ LABEL(END_EXEC)
       alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
       beta, c, rs_c0, cs_c0, data, cntx
     );
-    b = b + 3 * cs_b;
-    c = c + 3 * cs_c;
+    b = ( double* )b + 3 * cs_b;
+    c = ( double* )c + 3 * cs_c;
     n_left -= 3;
   }
 
@@ -622,8 +622,8 @@ LABEL(END_EXEC)
       alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
       beta, c, rs_c0, cs_c0, data, cntx
     );
-    a = a + 3 * rs_a;
-    c = c + 3 * rs_c;
+    a = ( double* )a + 3 * rs_a;
+    c = ( double* )c + 3 * rs_c;
 
     bli_dgemmsup_rd_armv8a_int_3x4
     (
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c
index 9669400ce..a8f4f5e12 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c
@@ -111,11 +111,11 @@ void bli_dgemmsup_rv_armv8a_asm_4x8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -440,8 +440,8 @@ LABEL(END_EXEC)
   // TODO: Implement optimized kernel for this.
   //
   // Forward address.
-  b = b + n_iter * ps_b;
-  c = c + n_iter * 8 * cs_c;
+  b = ( double* )b + n_iter * ps_b;
+  c = ( double* )c + n_iter * 8 * cs_c;
   if ( n_left )
   {
     auxinfo_t data_d6x4mn = *data;
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c
index aa57d5850..348f750a5 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c
@@ -127,11 +127,11 @@ void bli_dgemmsup_rv_armv8a_asm_5x8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -461,8 +461,8 @@ LABEL(END_EXEC)
 
 consider_edge_cases:
   // Forward address.
-  b = b + n_iter * ps_b;
-  c = c + n_iter * 8 * cs_c;
+  b = ( double* )b + n_iter * ps_b;
+  c = ( double* )c + n_iter * 8 * cs_c;
   if ( n_left )
   {
     // Set panel stride to unpacked mode.
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c
index 040431e1e..3d1e8c0a0 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c
@@ -127,11 +127,11 @@ void bli_dgemmsup_rv_armv8a_asm_6x5m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -460,15 +460,15 @@ LABEL(END_EXEC)
 
 consider_edge_cases:
   // Forward address.
-  a = a + m_iter * ps_a;
-  c = c + m_iter * 6 * rs_c;
+  a = ( double* )a + m_iter * ps_a;
+  c = ( double* )c + m_iter * 6 * rs_c;
   auxinfo_t data_d6x4mn = *data;
   bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn );
   bli_dgemmsup_rv_armv8a_int_6x4mn
   (
     conja, conjb, m_left, 5, k0,
-      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-      beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
+    alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+    beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
   );
 
 }
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c
index 43f1c193c..a1a945740 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c
@@ -117,11 +117,11 @@ void bli_dgemmsup_rv_armv8a_asm_6x6m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -462,15 +462,15 @@ LABEL(END_EXEC)
 
 consider_edge_cases:
   // Forward address.
-  a = a + m_iter * ps_a;
-  c = c + m_iter * 6 * rs_c;
+  a = ( double* )a + m_iter * ps_a;
+  c = ( double* )c + m_iter * 6 * rs_c;
   auxinfo_t data_d6x4mn = *data;
   bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn );
   bli_dgemmsup_rv_armv8a_int_6x4mn
   (
     conja, conjb, m_left, 6, k0,
-      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-      beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
+    alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+    beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
   );
 
 }
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c
index 78801c8ef..2e00676c4 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c
@@ -145,11 +145,11 @@ void bli_dgemmsup_rv_armv8a_asm_6x7m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -500,15 +500,15 @@ LABEL(END_EXEC)
 
 consider_edge_cases:
   // Forward address.
-  a = a + m_iter * ps_a;
-  c = c + m_iter * 6 * rs_c;
+  a = ( double* )a + m_iter * ps_a;
+  c = ( double* )c + m_iter * 6 * rs_c;
   auxinfo_t data_d6x4mn = *data;
   bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn );
   bli_dgemmsup_rv_armv8a_int_6x4mn
   (
     conja, conjb, m_left, 7, k0,
-      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-      beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
+    alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+    beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
   );
 
 }
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c
index a3d25f860..74fdcf77f 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c
@@ -134,11 +134,11 @@ void bli_dgemmsup_rv_armv8a_asm_6x8m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -148,9 +148,9 @@ void bli_dgemmsup_rv_armv8a_asm_6x8m
     assert( n0 <= 13 );
 
     // Manual separation.
-    dgemmsup_ker_ft ker_fp1 = NULL;
-    dgemmsup_ker_ft ker_fp2 = NULL;
-    dim_t           nr1, nr2;
+    gemmsup_ker_ft ker_fp1 = NULL;
+    gemmsup_ker_ft ker_fp2 = NULL;
+    dim_t          nr1, nr2;
 
     if ( n0 == 13 )
     {
@@ -200,14 +200,14 @@ void bli_dgemmsup_rv_armv8a_asm_6x8m
       alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
       beta, c, rs_c0, cs_c0, data, cntx
     );
-    b += nr1 * cs_b0;
-    c += nr1 * cs_c0;
+    b = ( double* )b + nr1 * cs_b0;
+    c = ( double* )c + nr1 * cs_c0;
     if ( ker_fp2 )
       ker_fp2
       (
-	conja, conjb, m0, nr2, k0,
-	alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	beta, c, rs_c0, cs_c0, data, cntx
+        conja, conjb, m0, nr2, k0,
+        alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+        beta, c, rs_c0, cs_c0, data, cntx
       );
     return;
   }
@@ -554,15 +554,15 @@ LABEL(END_EXEC)
 
 consider_edge_cases:
   // Forward address.
-  a = a + m_iter * ps_a;
-  c = c + m_iter * 6 * rs_c;
+  a = ( double* )a + m_iter * ps_a;
+  c = ( double* )c + m_iter * 6 * rs_c;
   auxinfo_t data_d6x4mn = *data;
   bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn );
   bli_dgemmsup_rv_armv8a_int_6x4mn
   (
     conja, conjb, m_left, 8, k0,
-      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-      beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
+    alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+    beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
   );
 
 }
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c
index 9c8ccdd12..3dad7dd91 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c
@@ -134,11 +134,11 @@ void bli_dgemmsup_rv_armv8a_asm_6x8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -148,9 +148,9 @@ void bli_dgemmsup_rv_armv8a_asm_6x8n
     assert( m0 <= 9 );
 
     // Manual separation.
-    dgemmsup_ker_ft ker_fp1 = NULL;
-    dgemmsup_ker_ft ker_fp2 = NULL;
-    dim_t           mr1, mr2;
+    gemmsup_ker_ft ker_fp1 = NULL;
+    gemmsup_ker_ft ker_fp2 = NULL;
+    dim_t          mr1, mr2;
     
     if ( m0 == 9 )
     {
@@ -186,14 +186,14 @@ void bli_dgemmsup_rv_armv8a_asm_6x8n
       alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
       beta, c, rs_c0, cs_c0, data, cntx
     );
-    a += mr1 * rs_a0;
-    c += mr1 * rs_c0;
+    a = ( double* )a + mr1 * rs_a0;
+    c = ( double* )c + mr1 * rs_c0;
     if ( ker_fp2 )
       ker_fp2
       (
-	conja, conjb, mr2, n0, k0,
-	alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
-	beta, c, rs_c0, cs_c0, data, cntx
+        conja, conjb, mr2, n0, k0,
+        alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+        beta, c, rs_c0, cs_c0, data, cntx
       );
     return;
   }
@@ -540,8 +540,8 @@ LABEL(END_EXEC)
 
 consider_edge_cases:
   // Forward address.
-  b = b + n_iter * ps_b;
-  c = c + n_iter * 8 * cs_c;
+  b = ( double* )b + n_iter * ps_b;
+  c = ( double* )c + n_iter * 8 * cs_c;
   if ( n_left )
   {
     // Set panel stride to unpacked mode.
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c
index ea93ecf5f..8376d418a 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c
@@ -105,11 +105,11 @@ void bli_dgemmsup_rv_armv8a_asm_8x4m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -394,8 +394,8 @@ LABEL(END_EXEC)
   );
 
 consider_edge_cases:
-  a = a + m_iter * ps_a;
-  c = c + m_iter * 8 * rs_c;
+  a = ( double* )a + m_iter * ps_a;
+  c = ( double* )c + m_iter * 8 * rs_c;
   // Edge case is within 1 millikernel loop of THIS kernel.
   // Regarding the 6x?m kernel, the panel stride should be always local.
   auxinfo_t data_6xkm = *data;
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c
index 42cde2a8b..8cefaed4a 100644
--- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c
+++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c
@@ -89,11 +89,11 @@ void bli_dgemmsup_rd_armv8a_asm_3x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c
index 84276879b..1919aa694 100644
--- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c
+++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c
@@ -113,11 +113,11 @@ void bli_dgemmsup_rd_armv8a_asm_6x3
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c
index d4a17e064..331e5bbda 100644
--- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c
+++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c
@@ -64,11 +64,11 @@ void bli_dgemmsup_rd_armv8a_int_2x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a, inc_t cs_a,
-       const double*    b, inc_t rs_b, inc_t cs_b,
-       const double*    beta,
-             double*    c, inc_t rs_c, inc_t cs_c,
+       const void*      alpha,
+       const void*      a, inc_t rs_a, inc_t cs_a,
+       const void*      b, inc_t rs_b, inc_t cs_b,
+       const void*      beta,
+             void*      c, inc_t rs_c, inc_t cs_c,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -82,7 +82,7 @@ void bli_dgemmsup_rd_armv8a_int_2x8
 
   uint64_t k_mker = k0 / 2;
   uint64_t k_left = k0 % 2;
-  uint64_t b_iszr = ( *beta == 0.0 );
+  uint64_t b_iszr = ( *(( double* )beta) == 0.0 );
 
   assert( cs_a == 1 );
   assert( rs_b == 1 );
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c
index c58ecc2fc..911cb9256 100644
--- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c
+++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c
@@ -64,11 +64,11 @@ void bli_dgemmsup_rd_armv8a_int_3x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a, inc_t cs_a,
-       const double*    b, inc_t rs_b, inc_t cs_b,
-       const double*    beta,
-             double*    c, inc_t rs_c, inc_t cs_c,
+       const void*      alpha,
+       const void*      a, inc_t rs_a, inc_t cs_a,
+       const void*      b, inc_t rs_b, inc_t cs_b,
+       const void*      beta,
+             void*      c, inc_t rs_c, inc_t cs_c,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -94,7 +94,7 @@ void bli_dgemmsup_rd_armv8a_int_3x4
 
   uint64_t k_mker = k0 / 2;
   uint64_t k_left = k0 % 2;
-  uint64_t b_iszr = ( *beta == 0.0 );
+  uint64_t b_iszr = ( *(( double* )beta) == 0.0 );
 
   assert( cs_a == 1 );
   assert( rs_b == 1 );
diff --git a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c
index 9e6c38352..4c2173092 100644
--- a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c
+++ b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c
@@ -64,11 +64,11 @@ void bli_dgemmsup_rv_armv8a_int_3x8mn
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a0, inc_t rs_a, inc_t cs_a,
-       const double*    b0, inc_t rs_b, inc_t cs_b,
-       const double*    beta,
-             double*    c0, inc_t rs_c, inc_t cs_c,
+       const void*      alpha,
+       const void*      a0, inc_t rs_a, inc_t cs_a,
+       const void*      b0, inc_t rs_b, inc_t cs_b,
+       const void*      beta,
+             void*      c0, inc_t rs_c, inc_t cs_c,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -87,7 +87,7 @@ void bli_dgemmsup_rv_armv8a_int_3x8mn
   dim_t k;
   uint64_t ps_a   = bli_auxinfo_ps_a( data );
   uint64_t ps_b   = bli_auxinfo_ps_b( data );
-  uint64_t b_iszr = ( *beta == 0.0 );
+  uint64_t b_iszr = ( *(( double* )beta) == 0.0 );
   assert( cs_b == 1 );
 
   // Registers used to store a 3x8 block of C.
@@ -389,8 +389,8 @@ void bli_dgemmsup_rv_armv8a_int_3x8mn
       c_in += 8 * cs_c;
     }
 
-    a0 += ps_a;
-    c0 += 3 * rs_c;
+    a0 = ( double* )a0 + ps_a;
+    c0 = ( double* )c0 + 3 * rs_c;
   }
 }
 
diff --git a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c
index 1ddd582ae..2d259adb3 100644
--- a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c
+++ b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c
@@ -64,11 +64,11 @@ void bli_dgemmsup_rv_armv8a_int_6x4mn
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a0, inc_t rs_a, inc_t cs_a,
-       const double*    b0, inc_t rs_b, inc_t cs_b,
-       const double*    beta,
-             double*    c0, inc_t rs_c, inc_t cs_c,
+       const void*      alpha,
+       const void*      a0, inc_t rs_a, inc_t cs_a,
+       const void*      b0, inc_t rs_b, inc_t cs_b,
+       const void*      beta,
+             void*      c0, inc_t rs_c, inc_t cs_c,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -87,7 +87,7 @@ void bli_dgemmsup_rv_armv8a_int_6x4mn
   dim_t k;
   uint64_t ps_a   = bli_auxinfo_ps_a( data );
   uint64_t ps_b   = bli_auxinfo_ps_b( data );
-  uint64_t b_iszr = ( *beta == 0.0 );
+  uint64_t b_iszr = ( *(( double* )beta) == 0.0 );
   assert( cs_b == 1 );
 
   // Registers used to store a 6x4 block of C.
@@ -477,8 +477,8 @@ void bli_dgemmsup_rv_armv8a_int_6x4mn
       c_in += 4 * cs_c;
     }
 
-    a0 += ps_a;
-    c0 += 6 * rs_c;
+    a0 = ( double* )a0 + ps_a;
+    c0 = ( double* )c0 + 6 * rs_c;
   }
 }
 
diff --git a/kernels/bgq/1/bli_axpyv_bgq_int.c b/kernels/bgq/1/bli_axpyv_bgq_int.c
index 6822c57b2..9822c3341 100644
--- a/kernels/bgq/1/bli_axpyv_bgq_int.c
+++ b/kernels/bgq/1/bli_axpyv_bgq_int.c
@@ -38,12 +38,16 @@ void bli_daxpyv_bgq_int
      (
              conj_t  conjx,
              dim_t   n,
-       const double* alpha,
-       const double* x, inc_t incx,
-             double* y, inc_t incy,
+       const void*   alpha0,
+       const void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	const double* alpha = alpha0;
+	const double* x     = x0;
+	      double* y     = y0; // y0 is the only non-const operand; keep the typed alias writable.
+
 	if ( bli_zero_dim1( n ) ) return;
 
 	// If there is anything that would interfere with our use of aligned
diff --git a/kernels/bgq/1/bli_dotv_bgq_int.c b/kernels/bgq/1/bli_dotv_bgq_int.c
index a4329df32..177485003 100644
--- a/kernels/bgq/1/bli_dotv_bgq_int.c
+++ b/kernels/bgq/1/bli_dotv_bgq_int.c
@@ -39,13 +39,17 @@ void bli_ddotv_bgq_int
              conj_t  conjx,
              conj_t  conjy,
              dim_t   n,
-       const double* x, inc_t incx,
-       const double* y, inc_t incy,
-             double* rho,
+       const void*   x0, inc_t incx,
+       const void*   y0, inc_t incy,
+             void*   rho0,
        const cntx_t* cntx
      )
 {
-	bool   use_ref = FALSE;
+	const double* x   = x0;
+	const double* y   = y0;
+	      double* rho = rho0; // rho0 is the non-const output (set to zero below when n == 0).
+
+	bool use_ref = FALSE;
 
 	// If the vector lengths are zero, set rho to zero and return.
 	if ( bli_zero_dim1( n ) ) {
diff --git a/kernels/bgq/1f/bli_axpyf_bgq_int.c b/kernels/bgq/1f/bli_axpyf_bgq_int.c
index 52f9378be..f366dbe86 100644
--- a/kernels/bgq/1f/bli_axpyf_bgq_int.c
+++ b/kernels/bgq/1f/bli_axpyf_bgq_int.c
@@ -41,13 +41,18 @@ void bli_daxpyf_bgq_int
              conj_t  conjx,
              dim_t   m,
              dim_t   b_n,
-       const double* alpha,
-       const double* a, inc_t inca, inc_t lda,
-       const double* x, inc_t incx,
-             double* y, inc_t incy,
+       const void*   alpha0,
+       const void*   a0, inc_t inca, inc_t lda,
+       const void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	const double* alpha = alpha0;
+	const double* a     = a0;
+	const double* x     = x0;
+	      double* y     = y0;
+
 	const dim_t fusefac = 8;
 
     if ( bli_zero_dim2( m, b_n ) ) return;
@@ -63,22 +68,38 @@ void bli_daxpyf_bgq_int
 	{
 //        printf("%d\t%d\t%d\t%d\t%d\t%d\n", fusefac, inca, incx, incy, bli_is_unaligned_to( ( siz_t )a, 32 ), bli_is_unaligned_to( ( siz_t )y, 32));
 //        printf("DEFAULTING TO REFERENCE IMPLEMENTATION\n");
-		BLIS_DAXPYF_KERNEL_REF( conja, conjx, m, b_n, alpha, a, inca, lda, x, incx, y, incy, cntx );
+		#if 0
+		axpyf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx );
+
+		f
+		(
+		  conja,
+		  conjx,
+		  m,
+		  b_n,
+		  alpha0,
+		  a0, inca, lda,
+		  x0, incx,
+		  y0, incy,
+		  cntx
+		);
+		#endif
+		bli_abort();
 		return;
 	}
 
 	dim_t m_run       =  m / 4;
 	dim_t m_left      =  m % 4;
 
-	const double* a0   = a + 0*lda;
-	const double* a1   = a + 1*lda;
-	const double* a2   = a + 2*lda;
-	const double* a3   = a + 3*lda;
-	const double* a4   = a + 4*lda;
-	const double* a5   = a + 5*lda;
-	const double* a6   = a + 6*lda;
-	const double* a7   = a + 7*lda;
-	      double* y0   = y;
+	const double* ap0   = a + 0*lda;
+	const double* ap1   = a + 1*lda;
+	const double* ap2   = a + 2*lda;
+	const double* ap3   = a + 3*lda;
+	const double* ap4   = a + 4*lda;
+	const double* ap5   = a + 5*lda;
+	const double* ap6   = a + 6*lda;
+	const double* ap7   = a + 7*lda;
+	      double* yp0   = y;
 
 	double chi0 = *(x + 0*incx);
 	double chi1 = *(x + 1*incx);
@@ -112,16 +133,16 @@ void bli_daxpyf_bgq_int
 
     for ( dim_t i = 0; i < m_run; i += 1 )
 	{
-		yv  = vec_lda( 0 * sizeof(double), &y0[i*4]);
+		yv  = vec_lda( 0 * sizeof(double), &yp0[i*4]);
 
-		a0v = vec_lda( 0 * sizeof(double), &a0[i*4]);
-		a1v = vec_lda( 0 * sizeof(double), &a1[i*4]);
-		a2v = vec_lda( 0 * sizeof(double), &a2[i*4]);
-		a3v = vec_lda( 0 * sizeof(double), &a3[i*4]);
-		a4v = vec_lda( 0 * sizeof(double), &a4[i*4]);
-		a5v = vec_lda( 0 * sizeof(double), &a5[i*4]);
-		a6v = vec_lda( 0 * sizeof(double), &a6[i*4]);
-		a7v = vec_lda( 0 * sizeof(double), &a7[i*4]);
+		a0v = vec_lda( 0 * sizeof(double), &ap0[i*4]);
+		a1v = vec_lda( 0 * sizeof(double), &ap1[i*4]);
+		a2v = vec_lda( 0 * sizeof(double), &ap2[i*4]);
+		a3v = vec_lda( 0 * sizeof(double), &ap3[i*4]);
+		a4v = vec_lda( 0 * sizeof(double), &ap4[i*4]);
+		a5v = vec_lda( 0 * sizeof(double), &ap5[i*4]);
+		a6v = vec_lda( 0 * sizeof(double), &ap6[i*4]);
+		a7v = vec_lda( 0 * sizeof(double), &ap7[i*4]);
 
         yv = vec_madd( chi0v, a0v, yv );
         yv = vec_madd( chi1v, a1v, yv );
@@ -132,19 +153,19 @@ void bli_daxpyf_bgq_int
         yv = vec_madd( chi6v, a6v, yv );
         yv = vec_madd( chi7v, a7v, yv );
 
-        vec_sta( yv, 0 * sizeof(double), &y0[i*4]);
+        vec_sta( yv, 0 * sizeof(double), &yp0[i*4]);
 	}
 
     for ( dim_t i = 0; i < m_left; ++i )
     {
-        y0[4*m_run + i] += chi0 * a0[4*m_run + i]
-                      +  chi1 * a1[4*m_run + i]
-                      +  chi2 * a2[4*m_run + i]
-                      +  chi3 * a3[4*m_run + i]
-                      +  chi4 * a4[4*m_run + i]
-                      +  chi5 * a5[4*m_run + i]
-                      +  chi6 * a6[4*m_run + i]
-                      +  chi7 * a7[4*m_run + i];
+        yp0[4*m_run + i] += chi0 * ap0[4*m_run + i]
+                         +  chi1 * ap1[4*m_run + i]
+                         +  chi2 * ap2[4*m_run + i]
+                         +  chi3 * ap3[4*m_run + i]
+                         +  chi4 * ap4[4*m_run + i]
+                         +  chi5 * ap5[4*m_run + i]
+                         +  chi6 * ap6[4*m_run + i]
+                         +  chi7 * ap7[4*m_run + i];
     }
 
 }
diff --git a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c
index 4e9dc27d2..d1dcac3a6 100644
--- a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c
+++ b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c
@@ -59,15 +59,21 @@ void bli_dgemm_bgq_int_8x8
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c, inc_t cs_c,
+       const void*      alpha0,
+       const void*      a0,
+       const void*      b0,
+       const void*      beta0,
+             void*      c0, inc_t rs_c, inc_t cs_c,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
+	const double* alpha = alpha0;
+	const double* a     = a0;
+	const double* b     = b0;
+	const double* beta  = beta0;
+	      double* c     = c0;
+
     GEMM_UKR_SETUP_CT_ANY( d, 8, 8, false );
 
     //Registers for storing C.
@@ -223,15 +229,21 @@ void bli_zgemm_bgq_int_4x4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const dcomplex*  alpha,
-       const dcomplex*  a,
-       const dcomplex*  b,
-       const dcomplex*  beta,
-             dcomplex*  c, inc_t rs_c, inc_t cs_c,
+       const void*      alpha0,
+       const void*      a0,
+       const void*      b0,
+       const void*      beta0,
+             void*      c0, inc_t rs_c, inc_t cs_c,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
+	const dcomplex* alpha = alpha0;
+	const dcomplex* a     = a0;
+	const dcomplex* b     = b0;
+	const dcomplex* beta  = beta0;
+	      dcomplex* c     = c0;
+
     GEMM_UKR_SETUP_CT_ANY( z, 4, 4, false );
 
     const double* a_d = ( double* )a;
diff --git a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
index d26c2c6b8..dbdb5ef3b 100644
--- a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
+++ b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
@@ -93,15 +93,18 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const float*     alpha,
-       const float*     a,
-       const float*     b,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
+	//const void* a_next = bli_auxinfo_next_a( data );
+	//const void* b_next = bli_auxinfo_next_b( data );
+
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
 	uint64_t k_iter = k / 4;
@@ -585,15 +588,18 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
+	//const void* a_next = bli_auxinfo_next_a( data );
+	//const void* b_next = bli_auxinfo_next_b( data );
+
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
 	uint64_t k_iter = k / 12;
@@ -805,11 +811,11 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const scomplex*  alpha,
-       const scomplex*  a,
-       const scomplex*  b,
-       const scomplex*  beta,
-             scomplex*  c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1329,11 +1335,11 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const dcomplex*  alpha,
-       const dcomplex*  a,
-       const dcomplex*  b,
-       const dcomplex*  beta,
-             dcomplex*  c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c
index 843874c9a..b23fc2497 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c
@@ -48,9 +48,9 @@ void bli_cpackm_haswell_asm_3xk
              dim_t     cdim0,
              dim_t     k0,
              dim_t     k0_max,
-       const scomplex* kappa,
-       const scomplex* a, inc_t inca0, inc_t lda0,
-             scomplex* p,              inc_t ldp0,
+       const void*     kappa,
+       const void*     a, inc_t inca0, inc_t lda0,
+             void*     p,              inc_t ldp0,
        const cntx_t*   cntx
      )
 {
@@ -99,7 +99,7 @@ void bli_cpackm_haswell_asm_3xk
 
 	// NOTE: If/when this kernel ever supports scaling by kappa within the
 	// assembly region, this constraint should be lifted.
-	const bool     unitk  = bli_ceq1( *kappa );
+	const bool     unitk  = bli_ceq1( *(( scomplex* )kappa) );
 
 
 	// -------------------------------------------------------------------------
@@ -364,7 +364,7 @@ void bli_cpackm_haswell_asm_3xk
 			const dim_t        i      = cdim0;
 			const dim_t        m_edge = mnr - cdim0;
 			const dim_t        n_edge = k0_max;
-			scomplex* restrict p_edge = p + (i  )*1;
+			scomplex* restrict p_edge = ( scomplex* )p + (i  )*1;
 
 			bli_cset0s_mxn
 			(
@@ -384,7 +384,7 @@ void bli_cpackm_haswell_asm_3xk
 		const dim_t        j      = k0;
 		const dim_t        m_edge = mnr;
 		const dim_t        n_edge = k0_max - k0;
-		scomplex* restrict p_edge = p + (j  )*ldp;
+		scomplex* restrict p_edge = ( scomplex* )p + (j  )*ldp;
 
 		bli_cset0s_mxn
 		(
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c
index 25fc8bf05..22dfe8e4a 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c
@@ -48,9 +48,9 @@ void bli_cpackm_haswell_asm_8xk
              dim_t     cdim0,
              dim_t     k0,
              dim_t     k0_max,
-       const scomplex* kappa,
-       const scomplex* a, inc_t inca0, inc_t lda0,
-             scomplex* p,              inc_t ldp0,
+       const void*     kappa,
+       const void*     a, inc_t inca0, inc_t lda0,
+             void*     p,              inc_t ldp0,
        const cntx_t*   cntx
      )
 {
@@ -99,7 +99,7 @@ void bli_cpackm_haswell_asm_8xk
 
 	// NOTE: If/when this kernel ever supports scaling by kappa within the
 	// assembly region, this constraint should be lifted.
-	const bool     unitk  = bli_ceq1( *kappa );
+	const bool     unitk  = bli_ceq1( *(( scomplex* )kappa) );
 
 
 	// -------------------------------------------------------------------------
@@ -384,7 +384,7 @@ void bli_cpackm_haswell_asm_8xk
 			const dim_t        i      = cdim0;
 			const dim_t        m_edge = mnr - cdim0;
 			const dim_t        n_edge = k0_max;
-			scomplex* restrict p_edge = p + (i  )*1;
+			scomplex* restrict p_edge = ( scomplex* )p + (i  )*1;
 
 			bli_cset0s_mxn
 			(
@@ -402,7 +402,7 @@ void bli_cpackm_haswell_asm_8xk
 		const dim_t        j      = k0;
 		const dim_t        m_edge = mnr;
 		const dim_t        n_edge = k0_max - k0;
-		scomplex* restrict p_edge = p + (j  )*ldp;
+		scomplex* restrict p_edge = ( scomplex* )p + (j  )*ldp;
 
 		bli_cset0s_mxn
 		(
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c
index 4cfc241d3..7722b5ef4 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c
@@ -48,9 +48,9 @@ void bli_dpackm_haswell_asm_6xk
              dim_t   cdim0,
              dim_t   k0,
              dim_t   k0_max,
-       const double* kappa,
-       const double* a, inc_t inca0, inc_t lda0,
-             double* p,              inc_t ldp0,
+       const void*   kappa,
+       const void*   a, inc_t inca0, inc_t lda0,
+             void*   p,              inc_t ldp0,
        const cntx_t* cntx
      )
 {
@@ -99,7 +99,7 @@ void bli_dpackm_haswell_asm_6xk
 
 	// NOTE: If/when this kernel ever supports scaling by kappa within the
 	// assembly region, this constraint should be lifted.
-	const bool     unitk  = bli_deq1( *kappa );
+	const bool     unitk  = bli_deq1( *(( double* )kappa) );
 
 
 	// -------------------------------------------------------------------------
@@ -368,7 +368,7 @@ void bli_dpackm_haswell_asm_6xk
 			const dim_t      i      = cdim0;
 			const dim_t      m_edge = mnr - cdim0;
 			const dim_t      n_edge = k0_max;
-			double* restrict p_edge = p + (i  )*1;
+			double* restrict p_edge = ( double* )p + (i  )*1;
 
 			bli_dset0s_mxn
 			(
@@ -388,7 +388,7 @@ void bli_dpackm_haswell_asm_6xk
 		const dim_t      j      = k0;
 		const dim_t      m_edge = mnr;
 		const dim_t      n_edge = k0_max - k0;
-		double* restrict p_edge = p + (j  )*ldp;
+		double* restrict p_edge = ( double* )p + (j  )*ldp;
 
 		bli_dset0s_mxn
 		(
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c
index 7fdb9b14f..94de87d97 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c
@@ -48,9 +48,9 @@ void bli_dpackm_haswell_asm_8xk
              dim_t   cdim0,
              dim_t   k0,
              dim_t   k0_max,
-       const double* kappa,
-       const double* a, inc_t inca0, inc_t lda0,
-             double* p,              inc_t ldp0,
+       const void*   kappa,
+       const void*   a, inc_t inca0, inc_t lda0,
+             void*   p,              inc_t ldp0,
        const cntx_t* cntx
      )
 {
@@ -99,7 +99,7 @@ void bli_dpackm_haswell_asm_8xk
 
 	// NOTE: If/when this kernel ever supports scaling by kappa within the
 	// assembly region, this constraint should be lifted.
-	const bool     unitk  = bli_deq1( *kappa );
+	const bool     unitk  = bli_deq1( *(( double* )kappa) );
 
 
 	// -------------------------------------------------------------------------
@@ -378,7 +378,7 @@ void bli_dpackm_haswell_asm_8xk
 			const dim_t      i      = cdim0;
 			const dim_t      m_edge = mnr - cdim0;
 			const dim_t      n_edge = k0_max;
-			double* restrict p_edge = p + (i  )*1;
+			double* restrict p_edge = ( double* )p + (i  )*1;
 
 			bli_dset0s_mxn
 			(
@@ -396,7 +396,7 @@ void bli_dpackm_haswell_asm_8xk
 		const dim_t      j      = k0;
 		const dim_t      m_edge = mnr;
 		const dim_t      n_edge = k0_max - k0;
-		double* restrict p_edge = p + (j  )*ldp;
+		double* restrict p_edge = ( double* )p + (j  )*ldp;
 
 		bli_dset0s_mxn
 		(
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c
index cc7d52134..21f514b25 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c
@@ -48,9 +48,9 @@ void bli_spackm_haswell_asm_16xk
              dim_t   cdim0,
              dim_t   k0,
              dim_t   k0_max,
-       const float*  kappa,
-       const float*  a, inc_t inca0, inc_t lda0,
-             float*  p,              inc_t ldp0,
+       const void*   kappa,
+       const void*   a, inc_t inca0, inc_t lda0,
+             void*   p,              inc_t ldp0,
        const cntx_t* cntx
      )
 {
@@ -99,7 +99,7 @@ void bli_spackm_haswell_asm_16xk
 
 	// NOTE: If/when this kernel ever supports scaling by kappa within the
 	// assembly region, this constraint should be lifted.
-	const bool     unitk  = bli_seq1( *kappa );
+	const bool     unitk  = bli_seq1( *(( float* )kappa) );
 
 
 	// -------------------------------------------------------------------------
@@ -537,7 +537,7 @@ void bli_spackm_haswell_asm_16xk
 			const dim_t      i      = cdim0;
 			const dim_t      m_edge = mnr - cdim0;
 			const dim_t      n_edge = k0_max;
-			float*  restrict p_edge = p + (i  )*1;
+			float*  restrict p_edge = ( float* )p + (i  )*1;
 
 			bli_sset0s_mxn
 			(
@@ -555,7 +555,7 @@ void bli_spackm_haswell_asm_16xk
 		const dim_t      j      = k0;
 		const dim_t      m_edge = mnr;
 		const dim_t      n_edge = k0_max - k0;
-		float*  restrict p_edge = p + (j  )*ldp;
+		float*  restrict p_edge = ( float* )p + (j  )*ldp;
 
 		bli_sset0s_mxn
 		(
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c
index bb464a6b5..bf5dbdf88 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c
@@ -48,9 +48,9 @@ void bli_spackm_haswell_asm_6xk
              dim_t   cdim0,
              dim_t   k0,
              dim_t   k0_max,
-       const float*  kappa,
-       const float*  a, inc_t inca0, inc_t lda0,
-             float*  p,              inc_t ldp0,
+       const void*   kappa,
+       const void*   a, inc_t inca0, inc_t lda0,
+             void*   p,              inc_t ldp0,
        const cntx_t* cntx
      )
 {
@@ -99,7 +99,7 @@ void bli_spackm_haswell_asm_6xk
 
 	// NOTE: If/when this kernel ever supports scaling by kappa within the
 	// assembly region, this constraint should be lifted.
-	const bool     unitk  = bli_seq1( *kappa );
+	const bool     unitk  = bli_seq1( *(( float* )kappa) );
 
 
 	// -------------------------------------------------------------------------
@@ -410,7 +410,7 @@ void bli_spackm_haswell_asm_6xk
 			const dim_t      i      = cdim0;
 			const dim_t      m_edge = mnr - cdim0;
 			const dim_t      n_edge = k0_max;
-			float*  restrict p_edge = p + (i  )*1;
+			float*  restrict p_edge = ( float* )p + (i  )*1;
 
 			bli_sset0s_mxn
 			(
@@ -428,7 +428,7 @@ void bli_spackm_haswell_asm_6xk
 		const dim_t      j      = k0;
 		const dim_t      m_edge = mnr;
 		const dim_t      n_edge = k0_max - k0;
-		float*  restrict p_edge = p + (j  )*ldp;
+		float*  restrict p_edge = ( float* )p + (j  )*ldp;
 
 		bli_sset0s_mxn
 		(
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c
index bf63592d0..eb9417f6c 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c
@@ -48,9 +48,9 @@ void bli_zpackm_haswell_asm_3xk
              dim_t     cdim0,
              dim_t     k0,
              dim_t     k0_max,
-       const dcomplex* kappa,
-       const dcomplex* a, inc_t inca0, inc_t lda0,
-             dcomplex* p,              inc_t ldp0,
+       const void*     kappa,
+       const void*     a, inc_t inca0, inc_t lda0,
+             void*     p,              inc_t ldp0,
        const cntx_t*   cntx
      )
 {
@@ -99,7 +99,7 @@ void bli_zpackm_haswell_asm_3xk
 
 	// NOTE: If/when this kernel ever supports scaling by kappa within the
 	// assembly region, this constraint should be lifted.
-	const bool     unitk  = bli_zeq1( *kappa );
+	const bool     unitk  = bli_zeq1( *(( dcomplex* )kappa) );
 
 
 	// -------------------------------------------------------------------------
@@ -370,7 +370,7 @@ void bli_zpackm_haswell_asm_3xk
 			const dim_t        i      = cdim0;
 			const dim_t        m_edge = mnr - cdim0;
 			const dim_t        n_edge = k0_max;
-			dcomplex* restrict p_edge = p + (i  )*1;
+			dcomplex* restrict p_edge = ( dcomplex* )p + (i  )*1;
 
 			bli_zset0s_mxn
 			(
@@ -388,7 +388,7 @@ void bli_zpackm_haswell_asm_3xk
 		const dim_t        j      = k0;
 		const dim_t        m_edge = mnr;
 		const dim_t        n_edge = k0_max - k0;
-		dcomplex* restrict p_edge = p + (j  )*ldp;
+		dcomplex* restrict p_edge = ( dcomplex* )p + (j  )*ldp;
 
 		bli_zset0s_mxn
 		(
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c
index 87b596ad8..762e2e87c 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c
@@ -48,9 +48,9 @@ void bli_zpackm_haswell_asm_4xk
              dim_t     cdim0,
              dim_t     k0,
              dim_t     k0_max,
-       const dcomplex* kappa,
-       const dcomplex* a, inc_t inca0, inc_t lda0,
-             dcomplex* p,              inc_t ldp0,
+       const void*     kappa,
+       const void*     a, inc_t inca0, inc_t lda0,
+             void*     p,              inc_t ldp0,
        const cntx_t*   cntx
      )
 {
@@ -99,7 +99,7 @@ void bli_zpackm_haswell_asm_4xk
 
 	// NOTE: If/when this kernel ever supports scaling by kappa within the
 	// assembly region, this constraint should be lifted.
-	const bool     unitk  = bli_zeq1( *kappa );
+	const bool     unitk  = bli_zeq1( *(( dcomplex* )kappa) );
 
 
 	// -------------------------------------------------------------------------
@@ -380,7 +380,7 @@ void bli_zpackm_haswell_asm_4xk
 			const dim_t        i      = cdim0;
 			const dim_t        m_edge = mnr - cdim0;
 			const dim_t        n_edge = k0_max;
-			dcomplex* restrict p_edge = p + (i  )*1;
+			dcomplex* restrict p_edge = ( dcomplex* )p + (i  )*1;
 
 			bli_zset0s_mxn
 			(
@@ -398,7 +398,7 @@ void bli_zpackm_haswell_asm_4xk
 		const dim_t        j      = k0;
 		const dim_t        m_edge = mnr;
 		const dim_t        n_edge = k0_max - k0;
-		dcomplex* restrict p_edge = p + (j  )*ldp;
+		dcomplex* restrict p_edge = ( dcomplex* )p + (j  )*ldp;
 
 		bli_zset0s_mxn
 		(
diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
index 617690e4d..322e9a2e7 100644
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
@@ -82,11 +82,11 @@ void bli_sgemm_haswell_asm_6x16
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const float*     alpha,
-       const float*     a,
-       const float*     b,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -762,11 +762,11 @@ void bli_dgemm_haswell_asm_6x8
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1321,11 +1321,11 @@ void bli_cgemm_haswell_asm_3x8
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const scomplex*  alpha,
-       const scomplex*  a,
-       const scomplex*  b,
-       const scomplex*  beta,
-             scomplex*  c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1714,11 +1714,11 @@ void bli_zgemm_haswell_asm_3x4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const dcomplex*  alpha,
-       const dcomplex*  a,
-       const dcomplex*  b,
-       const dcomplex*  beta,
-             dcomplex*  c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
index 5f1ca3e97..6d3d125fe 100644
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
@@ -81,11 +81,11 @@ void bli_sgemm_haswell_asm_16x6
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const float*     alpha,
-       const float*     a,
-       const float*     b,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -465,11 +465,11 @@ void bli_dgemm_haswell_asm_8x6
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -835,11 +835,11 @@ void bli_cgemm_haswell_asm_8x3
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const scomplex*  alpha,
-       const scomplex*  a,
-       const scomplex*  b,
-       const scomplex*  beta,
-             scomplex*  c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1226,11 +1226,11 @@ void bli_zgemm_haswell_asm_4x3
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const dcomplex*  alpha,
-       const dcomplex*  a,
-       const dcomplex*  b,
-       const dcomplex*  beta,
-             dcomplex*  c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
index f97dc6c67..88e152fa0 100644
--- a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
@@ -61,12 +61,12 @@ void bli_sgemmtrsm_l_haswell_asm_6x16
              dim_t      m, \
              dim_t      n, \
              dim_t      k0, \
-       const float*     alpha, \
-       const float*     a10, \
-       const float*     a11, \
-       const float*     b01, \
-             float*     b11, \
-             float*     c11, inc_t rs_c0, inc_t cs_c0, \
+       const void*      alpha, \
+       const void*      a10, \
+       const void*      a11, \
+       const void*      b01, \
+             void*      b11, \
+             void*      c11, inc_t rs_c0, inc_t cs_c0, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      )
@@ -852,12 +852,12 @@ void bli_dgemmtrsm_l_haswell_asm_6x8
              dim_t      m, \
              dim_t      n, \
              dim_t      k0, \
-       const double*    alpha, \
-       const double*    a10, \
-       const double*    a11, \
-       const double*    b01, \
-             double*    b11, \
-             double*    c11, inc_t rs_c0, inc_t cs_c0, \
+       const void*      alpha, \
+       const void*      a10, \
+       const void*      a11, \
+       const void*      b01, \
+             void*      b11, \
+             void*      c11, inc_t rs_c0, inc_t cs_c0, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      )
diff --git a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
index 7cbd4cb12..1518a56eb 100644
--- a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
@@ -61,12 +61,12 @@ void bli_sgemmtrsm_u_haswell_asm_6x16
              dim_t      m, \
              dim_t      n, \
              dim_t      k0, \
-       const float*     alpha, \
-       const float*     a12, \
-       const float*     a11, \
-       const float*     b21, \
-             float*     b11, \
-             float*     c11, inc_t rs_c0, inc_t cs_c0, \
+       const void*      alpha, \
+       const void*      a12, \
+       const void*      a11, \
+       const void*      b21, \
+             void*      b11, \
+             void*      c11, inc_t rs_c0, inc_t cs_c0, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      )
@@ -857,12 +857,12 @@ void bli_dgemmtrsm_u_haswell_asm_6x8
              dim_t      m, \
              dim_t      n, \
              dim_t      k0, \
-       const double*    alpha, \
-       const double*    a12, \
-       const double*    a11, \
-       const double*    b21, \
-             double*    b11, \
-             double*    c11, inc_t rs_c0, inc_t cs_c0, \
+       const void*      alpha, \
+       const void*      a12, \
+       const void*      a11, \
+       const void*      b21, \
+             void*      b11, \
+             void*      c11, inc_t rs_c0, inc_t cs_c0, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      )
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
index a5e912dd0..055f99489 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
@@ -73,11 +73,11 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -88,9 +88,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 	// dispatch other 6x?m kernels, as needed.
 	if ( n_left )
 	{
-		      double* cij = c;
-		const double* bj  = b;
-		const double* ai  = a;
+		      double* cij = ( double* )c;
+		const double* bj  = ( double* )b;
+		const double* ai  = ( double* )a;
 
 		if ( 4 <= n_left )
 		{
@@ -693,9 +693,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 		const dim_t   nr_cur = 8;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      double* cij = c + i_edge*rs_c;
-		const double* bj  = b;
-		const double* ai  = a + i_edge*rs_a;
+		      double* cij = ( double* )c + i_edge*rs_c;
+		const double* bj  = ( double* )b;
+		const double* ai  = ( double* )a + i_edge*rs_a;
 
 		if ( 2 == m_left )
 		{
@@ -730,11 +730,11 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1265,9 +1265,9 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 		const dim_t   nr_cur = 4;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      double* cij = c + i_edge*rs_c;
-		const double* bj  = b;
-		const double* ai  = a + i_edge*rs_a;
+		      double* cij = ( double* )c + i_edge*rs_c;
+		const double* bj  = ( double* )b;
+		const double* ai  = ( double* )a + i_edge*rs_a;
 
 		if ( 2 == m_left )
 		{
@@ -1302,11 +1302,11 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1878,9 +1878,9 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
 		const dim_t   nr_cur = 2;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      double* cij = c + i_edge*rs_c;
-		const double* bj  = b;
-		const double* ai  = a + i_edge*rs_a;
+		      double* cij = ( double* )c + i_edge*rs_c;
+		const double* bj  = ( double* )b;
+		const double* ai  = ( double* )a + i_edge*rs_a;
 
 		if ( 3 <= m_left )
 		{
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
index fdbbcaa2d..2f45aec08 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
@@ -73,11 +73,11 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -88,17 +88,17 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	// dispatch other ?x8m kernels, as needed.
 	if ( m_left )
 	{
-		      double* cij = c;
-		const double* bj  = b;
-		const double* ai  = a;
+		      double* cij = ( double* )c;
+		const double* bj  = ( double* )b;
+		const double* ai  = ( double* )a;
 
 #if 1
 		// We add special handling for slightly inflated MR blocksizes
 		// at edge cases, up to a maximum of 9.
 		if ( 6 < m0 )
 		{
-			dgemmsup_ker_ft ker_fp1 = NULL;
-			dgemmsup_ker_ft ker_fp2 = NULL;
+			gemmsup_ker_ft  ker_fp1 = NULL;
+			gemmsup_ker_ft  ker_fp2 = NULL;
 			dim_t           mr1, mr2;
 
 			// These kernels don't make any attempt to optimize the cases of
@@ -751,9 +751,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 		const dim_t      mr_cur = 6;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      double* cij = c + j_edge*cs_c;
-		const double* ai  = a;
-		const double* bj  = b + j_edge*cs_b;
+		      double* cij = ( double* )c + j_edge*cs_c;
+		const double* ai  = ( double* )a;
+		const double* bj  = ( double* )b + j_edge*cs_b;
 
 		if ( 2 <= n_left )
 		{
@@ -799,11 +799,11 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1337,9 +1337,9 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 		const dim_t      mr_cur = 3;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      double* cij = c + j_edge*cs_c;
-		const double* ai  = a;
-		const double* bj  = b + j_edge*cs_b;
+		      double* cij = ( double* )c + j_edge*cs_c;
+		const double* ai  = ( double* )a;
+		const double* bj  = ( double* )b + j_edge*cs_b;
 
 		if ( 2 <= n_left )
 		{
@@ -1386,11 +1386,11 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1868,9 +1868,9 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 		const dim_t      mr_cur = 2;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      double* cij = c + j_edge*cs_c;
-		const double* ai  = a;
-		const double* bj  = b + j_edge*cs_b;
+		      double* cij = ( double* )c + j_edge*cs_c;
+		const double* ai  = ( double* )a;
+		const double* bj  = ( double* )b + j_edge*cs_b;
 
 		if ( 2 <= n_left )
 		{
@@ -1916,11 +1916,11 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2342,9 +2342,9 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 		const dim_t      mr_cur = 1;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      double* cij = c + j_edge*cs_c;
-		const double* ai  = a;
-		const double* bj  = b + j_edge*cs_b;
+		      double* cij = ( double* )c + j_edge*cs_c;
+		const double* ai  = ( double* )a;
+		const double* bj  = ( double* )b + j_edge*cs_b;
 
 		if ( 2 <= n_left )
 		{
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c
index ee4761005..0bb9563f1 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c
@@ -73,11 +73,11 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -88,9 +88,9 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 	// dispatch other 6x?m kernels, as needed.
 	if ( n_left )
 	{
-		      float* cij = c;
-		const float* bj  = b;
-		const float* ai  = a;
+		      float* cij = ( float* )c;
+		const float* bj  = ( float* )b;
+		const float* ai  = ( float* )a;
 
 		if ( 12 <= n_left )
 		{
@@ -728,9 +728,9 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 		const dim_t   nr_cur = 16;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      float* cij = c + i_edge*rs_c;
-		const float* bj  = b;
-		const float* ai  = a + i_edge*rs_a;
+		      float* cij = ( float* )c + i_edge*rs_c;
+		const float* bj  = ( float* )b;
+		const float* ai  = ( float* )a + i_edge*rs_a;
 
 		if ( 2 == m_left )
 		{
@@ -765,11 +765,11 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1339,9 +1339,9 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 		const dim_t   nr_cur = 12;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      float* cij = c + i_edge*rs_c;
-		const float* bj  = b;
-		const float* ai  = a + i_edge*rs_a;
+		      float* cij = ( float* )c + i_edge*rs_c;
+		const float* bj  = ( float* )b;
+		const float* ai  = ( float* )a + i_edge*rs_a;
 
 		if ( 2 == m_left )
 		{
@@ -1378,11 +1378,11 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1952,9 +1952,9 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 		const dim_t   nr_cur = 8;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      float* cij = c + i_edge*rs_c;
-		const float* bj  = b;
-		const float* ai  = a + i_edge*rs_a;
+		      float* cij = ( float* )c + i_edge*rs_c;
+		const float* bj  = ( float* )b;
+		const float* ai  = ( float* )a + i_edge*rs_a;
 
 		if ( 2 == m_left )
 		{
@@ -1989,11 +1989,11 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2536,9 +2536,9 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 		const dim_t   nr_cur = 4;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      float* cij = c + i_edge*rs_c;
-		const float* bj  = b;
-		const float* ai  = a + i_edge*rs_a;
+		      float* cij = ( float* )c + i_edge*rs_c;
+		const float* bj  = ( float* )b;
+		const float* ai  = ( float* )a + i_edge*rs_a;
 
 		if ( 2 == m_left )
 		{
@@ -2574,11 +2574,11 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -3168,9 +3168,9 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
 		const dim_t   nr_cur = 2;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      float* cij = c + i_edge*rs_c;
-		const float* bj  = b;
-		const float* ai  = a + i_edge*rs_a;
+		      float* cij = ( float* )c + i_edge*rs_c;
+		const float* bj  = ( float* )b;
+		const float* ai  = ( float* )a + i_edge*rs_a;
 
 		if ( 3 <= m_left )
 		{
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
index b288eab33..41ab6ed63 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
@@ -73,11 +73,11 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -88,17 +88,17 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 	// dispatch other ?x8m kernels, as needed.
 	if ( m_left )
 	{
-		      float* cij = c;
-		const float* bj  = b;
-		const float* ai  = a;
+		      float* cij = ( float* )c;
+		const float* bj  = ( float* )b;
+		const float* ai  = ( float* )a;
 
 #if 1
 		// We add special handling for slightly inflated MR blocksizes
 		// at edge cases, up to a maximum of 9.
 		if ( 6 < m0 )
 		{
-			sgemmsup_ker_ft ker_fp1 = NULL;
-			sgemmsup_ker_ft ker_fp2 = NULL;
+			gemmsup_ker_ft  ker_fp1 = NULL;
+			gemmsup_ker_ft  ker_fp2 = NULL;
 			dim_t           mr1, mr2;
 
 			// These kernels don't make any attempt to optimize the cases of
@@ -763,9 +763,9 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 		const dim_t      mr_cur = 6;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      float* cij = c + j_edge*cs_c;
-		const float* ai  = a;
-		const float* bj  = b + j_edge*cs_b;
+		      float* cij = ( float* )c + j_edge*cs_c;
+		const float* ai  = ( float* )a;
+		const float* bj  = ( float* )b + j_edge*cs_b;
 
 		if ( 2 <= n_left )
 		{
@@ -811,11 +811,11 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1359,9 +1359,9 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 		const dim_t      mr_cur = 3;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      float* cij = c + j_edge*cs_c;
-		const float* ai  = a;
-		const float* bj  = b + j_edge*cs_b;
+		      float* cij = ( float* )c + j_edge*cs_c;
+		const float* ai  = ( float* )a;
+		const float* bj  = ( float* )b + j_edge*cs_b;
 
 		if ( 2 <= n_left )
 		{
@@ -1408,11 +1408,11 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1896,9 +1896,9 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 		const dim_t      mr_cur = 2;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      float* cij = c + j_edge*cs_c;
-		const float* ai  = a;
-		const float* bj  = b + j_edge*cs_b;
+		      float* cij = ( float* )c + j_edge*cs_c;
+		const float* ai  = ( float* )a;
+		const float* bj  = ( float* )b + j_edge*cs_b;
 
 		if ( 2 <= n_left )
 		{
@@ -1944,11 +1944,11 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2373,9 +2373,9 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 		const dim_t      mr_cur = 1;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      float* cij = c + j_edge*cs_c;
-		const float* ai  = a;
-		const float* bj  = b + j_edge*cs_b;
+		      float* cij = ( float* )c + j_edge*cs_c;
+		const float* ai  = ( float* )a;
+		const float* bj  = ( float* )b + j_edge*cs_b;
 
 		if ( 2 <= n_left )
 		{
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
index 5603f9cb4..c5927c3da 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
@@ -88,11 +88,11 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -103,9 +103,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	// dispatch other 6x?m kernels, as needed.
 	if ( n_left )
 	{
-		      double* cij = c;
-		const double* bj  = b;
-		const double* ai  = a;
+		      double* cij = ( double* )c;
+		const double* bj  = ( double* )b;
+		const double* ai  = ( double* )a;
 
 		if ( 6 <= n_left )
 		{
@@ -907,19 +907,19 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 		const dim_t   nr_cur = 8;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      double* cij = c + i_edge*rs_c;
-		//const double* ai  = a + i_edge*rs_a;
-		//const double* ai  = a + ( i_edge / 6 ) * ps_a;
-		const double* ai  = a + m_iter * ps_a;
-		const double* bj  = b;
+		      double* cij = ( double* )c + i_edge*rs_c;
+		//const double* ai  = ( double* )a + i_edge*rs_a;
+		//const double* ai  = ( double* )a + ( i_edge / 6 ) * ps_a;
+		const double* ai  = ( double* )a + m_iter * ps_a;
+		const double* bj  = ( double* )b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
 		// at edge cases, up to a maximum of 9.
 		if ( 6 < m_left )
 		{
-			dgemmsup_ker_ft ker_fp1 = NULL;
-			dgemmsup_ker_ft ker_fp2 = NULL;
+			gemmsup_ker_ft  ker_fp1 = NULL;
+			gemmsup_ker_ft  ker_fp2 = NULL;
 			dim_t           mr1, mr2;
 
 			if ( m_left == 7 )
@@ -960,7 +960,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 		}
 #endif
 
-		dgemmsup_ker_ft ker_fps[6] =
+		gemmsup_ker_ft  ker_fps[6] =
 		{
 		  NULL,
 		  bli_dgemmsup_rv_haswell_asm_1x8,
@@ -970,7 +970,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 		  bli_dgemmsup_rv_haswell_asm_5x8
 		};
 
-		dgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
+		gemmsup_ker_ft  ker_fp = ker_fps[ m_left ];
 
 		ker_fp
 		(
@@ -990,11 +990,11 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1686,19 +1686,19 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 		const dim_t   nr_cur = 6;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      double* cij = c + i_edge*rs_c;
-		//const double* ai  = a + i_edge*rs_a;
-		//const double* ai  = a + ( i_edge / 6 ) * ps_a;
-		const double* ai  = a + m_iter * ps_a;
-		const double* bj  = b;
+		      double* cij = ( double* )c + i_edge*rs_c;
+		//const double* ai  = ( double* )a + i_edge*rs_a;
+		//const double* ai  = ( double* )a + ( i_edge / 6 ) * ps_a;
+		const double* ai  = ( double* )a + m_iter * ps_a;
+		const double* bj  = ( double* )b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
 		// at edge cases, up to a maximum of 9.
 		if ( 6 < m_left )
 		{
-			dgemmsup_ker_ft ker_fp1 = NULL;
-			dgemmsup_ker_ft ker_fp2 = NULL;
+			gemmsup_ker_ft  ker_fp1 = NULL;
+			gemmsup_ker_ft  ker_fp2 = NULL;
 			dim_t           mr1, mr2;
 
 			if ( m_left == 7 )
@@ -1739,7 +1739,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 		}
 #endif
 
-		dgemmsup_ker_ft ker_fps[6] =
+		gemmsup_ker_ft  ker_fps[6] =
 		{
 		  NULL,
 		  bli_dgemmsup_rv_haswell_asm_1x6,
@@ -1749,7 +1749,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 		  bli_dgemmsup_rv_haswell_asm_5x6
 		};
 
-		dgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
+		gemmsup_ker_ft  ker_fp = ker_fps[ m_left ];
 
 		ker_fp
 		(
@@ -1769,11 +1769,11 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2343,19 +2343,19 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 		const dim_t   nr_cur = 4;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      double* cij = c + i_edge*rs_c;
-		//const double* ai  = a + i_edge*rs_a;
-		//const double* ai  = a + ( i_edge / 6 ) * ps_a;
-		const double* ai  = a + m_iter * ps_a;
-		const double* bj  = b;
+		      double* cij = ( double* )c + i_edge*rs_c;
+		//const double* ai  = ( double* )a + i_edge*rs_a;
+		//const double* ai  = ( double* )a + ( i_edge / 6 ) * ps_a;
+		const double* ai  = ( double* )a + m_iter * ps_a;
+		const double* bj  = ( double* )b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
 		// at edge cases, up to a maximum of 9.
 		if ( 6 < m_left )
 		{
-			dgemmsup_ker_ft ker_fp1 = NULL;
-			dgemmsup_ker_ft ker_fp2 = NULL;
+			gemmsup_ker_ft  ker_fp1 = NULL;
+			gemmsup_ker_ft  ker_fp2 = NULL;
 			dim_t           mr1, mr2;
 
 			if ( m_left == 7 )
@@ -2396,7 +2396,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 		}
 #endif
 
-		dgemmsup_ker_ft ker_fps[6] =
+		gemmsup_ker_ft  ker_fps[6] =
 		{
 		  NULL,
 		  bli_dgemmsup_rv_haswell_asm_1x4,
@@ -2406,7 +2406,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 		  bli_dgemmsup_rv_haswell_asm_5x4
 		};
 
-		dgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
+		gemmsup_ker_ft  ker_fp = ker_fps[ m_left ];
 
 		ker_fp
 		(
@@ -2426,11 +2426,11 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2976,19 +2976,19 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 		const dim_t   nr_cur = 2;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      double* cij = c + i_edge*rs_c;
-		//const double* ai  = a + i_edge*rs_a;
-		//const double* ai  = a + ( i_edge / 6 ) * ps_a;
-		const double* ai  = a + m_iter * ps_a;
-		const double* bj  = b;
+		      double* cij = ( double* )c + i_edge*rs_c;
+		//const double* ai  = ( double* )a + i_edge*rs_a;
+		//const double* ai  = ( double* )a + ( i_edge / 6 ) * ps_a;
+		const double* ai  = ( double* )a + m_iter * ps_a;
+		const double* bj  = ( double* )b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
 		// at edge cases, up to a maximum of 9.
 		if ( 6 < m_left )
 		{
-			dgemmsup_ker_ft ker_fp1 = NULL;
-			dgemmsup_ker_ft ker_fp2 = NULL;
+			gemmsup_ker_ft  ker_fp1 = NULL;
+			gemmsup_ker_ft  ker_fp2 = NULL;
 			dim_t           mr1, mr2;
 
 			if ( m_left == 7 )
@@ -3029,7 +3029,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 		}
 #endif
 
-		dgemmsup_ker_ft ker_fps[6] =
+		gemmsup_ker_ft  ker_fps[6] =
 		{
 		  NULL,
 		  bli_dgemmsup_rv_haswell_asm_1x2,
@@ -3039,7 +3039,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 		  bli_dgemmsup_rv_haswell_asm_5x2
 		};
 
-		dgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
+		gemmsup_ker_ft  ker_fp = ker_fps[ m_left ];
 
 		ker_fp
 		(
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
index efa997764..cb784d6a1 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
@@ -88,11 +88,11 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -103,17 +103,17 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 	// dispatch other ?x8m kernels, as needed.
 	if ( m_left )
 	{
-		      double* cij = c;
-		const double* bj  = b;
-		const double* ai  = a;
+		      double* cij = ( double* )c;
+		const double* bj  = ( double* )b;
+		const double* ai  = ( double* )a;
 
 #if 1
 		// We add special handling for slightly inflated MR blocksizes
 		// at edge cases, up to a maximum of 9.
 		if ( 6 < m0 )
 		{
-			dgemmsup_ker_ft ker_fp1 = NULL;
-			dgemmsup_ker_ft ker_fp2 = NULL;
+			gemmsup_ker_ft  ker_fp1 = NULL;
+			gemmsup_ker_ft  ker_fp2 = NULL;
 			dim_t           mr1, mr2;
 
 			if ( m0 == 7 )
@@ -154,7 +154,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 		}
 #endif
 
-		dgemmsup_ker_ft ker_fps[6] =
+		gemmsup_ker_ft  ker_fps[6] =
 		{
 		  NULL,
 		  bli_dgemmsup_rv_haswell_asm_1x8n,
@@ -164,7 +164,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 		  bli_dgemmsup_rv_haswell_asm_5x8n
 		};
 
-		dgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
+		gemmsup_ker_ft  ker_fp = ker_fps[ m_left ];
 
 		ker_fp
 		(
@@ -882,11 +882,11 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 		const dim_t      mr_cur = 6;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      double* cij = c + j_edge*cs_c;
-		const double* ai  = a;
-		//const double* bj  = b + j_edge*cs_b;
-		//const double* bj  = b + ( j_edge / 8 ) * ps_b;
-		const double* bj  = b + n_iter * ps_b;
+		      double* cij = ( double* )c + j_edge*cs_c;
+		const double* ai  = ( double* )a;
+		//const double* bj  = ( double* )b + j_edge*cs_b;
+		//const double* bj  = ( double* )b + ( j_edge / 8 ) * ps_b;
+		const double* bj  = ( double* )b + n_iter * ps_b;
 
 		if ( 6 <= n_left )
 		{
@@ -954,11 +954,11 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1630,11 +1630,11 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 		const dim_t      mr_cur = 5;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      double* cij = c + j_edge*cs_c;
-		const double* ai  = a;
-		//const double* bj  = b + j_edge*cs_b;
-		//const double* bj  = b + ( j_edge / 8 ) * ps_b;
-		const double* bj  = b + n_iter * ps_b;
+		      double* cij = ( double* )c + j_edge*cs_c;
+		const double* ai  = ( double* )a;
+		//const double* bj  = ( double* )b + j_edge*cs_b;
+		//const double* bj  = ( double* )b + ( j_edge / 8 ) * ps_b;
+		const double* bj  = ( double* )b + n_iter * ps_b;
 
 		if ( 6 <= n_left )
 		{
@@ -1702,11 +1702,11 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2287,11 +2287,11 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 		const dim_t      mr_cur = 4;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      double* cij = c + j_edge*cs_c;
-		const double* ai  = a;
-		//const double* bj  = b + j_edge*cs_b;
-		//const double* bj  = b + ( j_edge / 8 ) * ps_b;
-		const double* bj  = b + n_iter * ps_b;
+		      double* cij = ( double* )c + j_edge*cs_c;
+		const double* ai  = ( double* )a;
+		//const double* bj  = ( double* )b + j_edge*cs_b;
+		//const double* bj  = ( double* )b + ( j_edge / 8 ) * ps_b;
+		const double* bj  = ( double* )b + n_iter * ps_b;
 
 		if ( 6 <= n_left )
 		{
@@ -2350,11 +2350,11 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2965,11 +2965,11 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 		const dim_t      mr_cur = 3;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      double* cij = c + j_edge*cs_c;
-		const double* ai  = a;
-		//const double* bj  = b + j_edge*cs_b;
-		//const double* bj  = b + ( j_edge / 8 ) * ps_b;
-		const double* bj  = b + n_iter * ps_b;
+		      double* cij = ( double* )c + j_edge*cs_c;
+		const double* ai  = ( double* )a;
+		//const double* bj  = ( double* )b + j_edge*cs_b;
+		//const double* bj  = ( double* )b + ( j_edge / 8 ) * ps_b;
+		const double* bj  = ( double* )b + n_iter * ps_b;
 
 		if ( 6 <= n_left )
 		{
@@ -3028,11 +3028,11 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -3524,11 +3524,11 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 		const dim_t      mr_cur = 2;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      double* cij = c + j_edge*cs_c;
-		const double* ai  = a;
-		//const double* bj  = b + j_edge*cs_b;
-		//const double* bj  = b + ( j_edge / 8 ) * ps_b;
-		const double* bj  = b + n_iter * ps_b;
+		      double* cij = ( double* )c + j_edge*cs_c;
+		const double* ai  = ( double* )a;
+		//const double* bj  = ( double* )b + j_edge*cs_b;
+		//const double* bj  = ( double* )b + ( j_edge / 8 ) * ps_b;
+		const double* bj  = ( double* )b + n_iter * ps_b;
 
 		if ( 6 <= n_left )
 		{
@@ -3587,11 +3587,11 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -4047,11 +4047,11 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
 		const dim_t      mr_cur = 1;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      double* cij = c + j_edge*cs_c;
-		const double* ai  = a;
-		//const double* bj  = b + j_edge*cs_b;
-		//const double* bj  = b + ( j_edge / 8 ) * ps_b;
-		const double* bj  = b + n_iter * ps_b;
+		      double* cij = ( double* )c + j_edge*cs_c;
+		const double* ai  = ( double* )a;
+		//const double* bj  = ( double* )b + j_edge*cs_b;
+		//const double* bj  = ( double* )b + ( j_edge / 8 ) * ps_b;
+		const double* bj  = ( double* )b + n_iter * ps_b;
 
 		if ( 6 <= n_left )
 		{
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
index 170f81fab..de2b71c4b 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
@@ -88,11 +88,11 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -103,9 +103,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 	// dispatch other 6x?m kernels, as needed.
 	if ( n_left )
 	{
-		      float* cij = c;
-		const float* bj  = b;
-		const float* ai  = a;
+		      float* cij = ( float* )c;
+		const float* bj  = ( float* )b;
+		const float* ai  = ( float* )a;
 
 		if ( 12 <= n_left )
 		{
@@ -1041,19 +1041,19 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 		const dim_t   nr_cur = 16;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      float* cij = c + i_edge*rs_c;
-		//const float* ai  = a + i_edge*rs_a;
-		//const float* ai  = a + ( i_edge / 6 ) * ps_a;
-		const float* ai  = a + m_iter * ps_a;
-		const float* bj  = b;
+		      float* cij = ( float* )c + i_edge*rs_c;
+		//const float* ai  = ( float* )a + i_edge*rs_a;
+		//const float* ai  = ( float* )a + ( i_edge / 6 ) * ps_a;
+		const float* ai  = ( float* )a + m_iter * ps_a;
+		const float* bj  = ( float* )b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
 		// at edge cases, up to a maximum of 9.
 		if ( 6 < m_left )
 		{
-			sgemmsup_ker_ft ker_fp1 = NULL;
-			sgemmsup_ker_ft ker_fp2 = NULL;
+			gemmsup_ker_ft  ker_fp1 = NULL;
+			gemmsup_ker_ft  ker_fp2 = NULL;
 			dim_t           mr1, mr2;
 
 			if ( m_left == 7 )
@@ -1094,7 +1094,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 		}
 #endif
 
-		sgemmsup_ker_ft ker_fps[6] =
+		gemmsup_ker_ft  ker_fps[6] =
 		{
 		  NULL,
 		  bli_sgemmsup_rv_haswell_asm_1x16,
@@ -1104,7 +1104,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 		  bli_sgemmsup_rv_haswell_asm_5x16
 		};
 
-		sgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
+		gemmsup_ker_ft  ker_fp = ker_fps[ m_left ];
 
 		ker_fp
 		(
@@ -1124,11 +1124,11 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1908,19 +1908,19 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 		const dim_t   nr_cur = 12;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      float* cij = c + i_edge*rs_c;
-		//const float* ai  = a + i_edge*rs_a;
-		//const float* ai  = a + ( i_edge / 6 ) * ps_a;
-		const float* ai  = a + m_iter * ps_a;
-		const float* bj  = b;
+		      float* cij = ( float* )c + i_edge*rs_c;
+		//const float* ai  = ( float* )a + i_edge*rs_a;
+		//const float* ai  = ( float* )a + ( i_edge / 6 ) * ps_a;
+		const float* ai  = ( float* )a + m_iter * ps_a;
+		const float* bj  = ( float* )b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
 		// at edge cases, up to a maximum of 9.
 		if ( 6 < m_left )
 		{
-			sgemmsup_ker_ft ker_fp1 = NULL;
-			sgemmsup_ker_ft ker_fp2 = NULL;
+			gemmsup_ker_ft  ker_fp1 = NULL;
+			gemmsup_ker_ft  ker_fp2 = NULL;
 			dim_t           mr1, mr2;
 
 			if ( m_left == 7 )
@@ -1961,7 +1961,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 		}
 #endif
 
-		sgemmsup_ker_ft ker_fps[6] =
+		gemmsup_ker_ft  ker_fps[6] =
 		{
 		  NULL,
 		  bli_sgemmsup_rv_haswell_asm_1x12,
@@ -1971,7 +1971,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 		  bli_sgemmsup_rv_haswell_asm_5x12
 		};
 
-		sgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
+		gemmsup_ker_ft  ker_fp = ker_fps[ m_left ];
 
 		ker_fp
 		(
@@ -1991,11 +1991,11 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2621,19 +2621,19 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 		const dim_t   nr_cur = 8;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      float* cij = c + i_edge*rs_c;
-		//const float* ai  = a + i_edge*rs_a;
-		//const float* ai  = a + ( i_edge / 6 ) * ps_a;
-		const float* ai  = a + m_iter * ps_a;
-		const float* bj  = b;
+		      float* cij = ( float* )c + i_edge*rs_c;
+		//const float* ai  = ( float* )a + i_edge*rs_a;
+		//const float* ai  = ( float* )a + ( i_edge / 6 ) * ps_a;
+		const float* ai  = ( float* )a + m_iter * ps_a;
+		const float* bj  = ( float* )b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
 		// at edge cases, up to a maximum of 9.
 		if ( 6 < m_left )
 		{
-			sgemmsup_ker_ft ker_fp1 = NULL;
-			sgemmsup_ker_ft ker_fp2 = NULL;
+			gemmsup_ker_ft  ker_fp1 = NULL;
+			gemmsup_ker_ft  ker_fp2 = NULL;
 			dim_t           mr1, mr2;
 
 			if ( m_left == 7 )
@@ -2674,7 +2674,7 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 		}
 #endif
 
-		sgemmsup_ker_ft ker_fps[6] =
+		gemmsup_ker_ft  ker_fps[6] =
 		{
 		  NULL,
 		  bli_sgemmsup_rv_haswell_asm_1x8,
@@ -2684,7 +2684,7 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 		  bli_sgemmsup_rv_haswell_asm_5x8
 		};
 
-		sgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
+		gemmsup_ker_ft  ker_fp = ker_fps[ m_left ];
 
 		ker_fp
 		(
@@ -2704,11 +2704,11 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -3367,19 +3367,19 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 		const dim_t   nr_cur = 6;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      float* cij = c + i_edge*rs_c;
-		//const float* ai  = a + i_edge*rs_a;
-		//const float* ai  = a + ( i_edge / 6 ) * ps_a;
-		const float* ai  = a + m_iter * ps_a;
-		const float* bj  = b;
+		      float* cij = ( float* )c + i_edge*rs_c;
+		//const float* ai  = ( float* )a + i_edge*rs_a;
+		//const float* ai  = ( float* )a + ( i_edge / 6 ) * ps_a;
+		const float* ai  = ( float* )a + m_iter * ps_a;
+		const float* bj  = ( float* )b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
 		// at edge cases, up to a maximum of 9.
 		if ( 6 < m_left )
 		{
-			sgemmsup_ker_ft ker_fp1 = NULL;
-			sgemmsup_ker_ft ker_fp2 = NULL;
+			gemmsup_ker_ft  ker_fp1 = NULL;
+			gemmsup_ker_ft  ker_fp2 = NULL;
 			dim_t           mr1, mr2;
 
 			if ( m_left == 7 )
@@ -3420,7 +3420,7 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 		}
 #endif
 
-		sgemmsup_ker_ft ker_fps[6] =
+		gemmsup_ker_ft  ker_fps[6] =
 		{
 		  NULL,
 		  bli_sgemmsup_rv_haswell_asm_1x6,
@@ -3430,7 +3430,7 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 		  bli_sgemmsup_rv_haswell_asm_5x6
 		};
 
-		sgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
+		gemmsup_ker_ft  ker_fp = ker_fps[ m_left ];
 
 		ker_fp
 		(
@@ -3450,11 +3450,11 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -4036,19 +4036,19 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 		const dim_t   nr_cur = 4;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      float* cij = c + i_edge*rs_c;
-		//const float* ai  = a + i_edge*rs_a;
-		//const float* ai  = a + ( i_edge / 6 ) * ps_a;
-		const float* ai  = a + m_iter * ps_a;
-		const float* bj  = b;
+		      float* cij = ( float* )c + i_edge*rs_c;
+		//const float* ai  = ( float* )a + i_edge*rs_a;
+		//const float* ai  = ( float* )a + ( i_edge / 6 ) * ps_a;
+		const float* ai  = ( float* )a + m_iter * ps_a;
+		const float* bj  = ( float* )b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
 		// at edge cases, up to a maximum of 9.
 		if ( 6 < m_left )
 		{
-			sgemmsup_ker_ft ker_fp1 = NULL;
-			sgemmsup_ker_ft ker_fp2 = NULL;
+			gemmsup_ker_ft  ker_fp1 = NULL;
+			gemmsup_ker_ft  ker_fp2 = NULL;
 			dim_t           mr1, mr2;
 
 			if ( m_left == 7 )
@@ -4089,7 +4089,7 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 		}
 #endif
 
-		sgemmsup_ker_ft ker_fps[6] =
+		gemmsup_ker_ft  ker_fps[6] =
 		{
 		  NULL,
 		  bli_sgemmsup_rv_haswell_asm_1x4,
@@ -4099,7 +4099,7 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 		  bli_sgemmsup_rv_haswell_asm_5x4
 		};
 
-		sgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
+		gemmsup_ker_ft  ker_fp = ker_fps[ m_left ];
 
 		ker_fp
 		(
@@ -4119,11 +4119,11 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -4676,19 +4676,19 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 		const dim_t   nr_cur = 2;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      float* cij = c + i_edge*rs_c;
-		//const float* ai  = a + i_edge*rs_a;
-		//const float* ai  = a + ( i_edge / 6 ) * ps_a;
-		const float* ai  = a + m_iter * ps_a;
-		const float* bj  = b;
+		      float* cij = ( float* )c + i_edge*rs_c;
+		//const float* ai  = ( float* )a + i_edge*rs_a;
+		//const float* ai  = ( float* )a + ( i_edge / 6 ) * ps_a;
+		const float* ai  = ( float* )a + m_iter * ps_a;
+		const float* bj  = ( float* )b;
 
 #if 0
 		// We add special handling for slightly inflated MR blocksizes
 		// at edge cases, up to a maximum of 9.
 		if ( 6 < m_left )
 		{
-			sgemmsup_ker_ft ker_fp1 = NULL;
-			sgemmsup_ker_ft ker_fp2 = NULL;
+			gemmsup_ker_ft  ker_fp1 = NULL;
+			gemmsup_ker_ft  ker_fp2 = NULL;
 			dim_t           mr1, mr2;
 
 			if ( m_left == 7 )
@@ -4729,7 +4729,7 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 		}
 #endif
 
-		sgemmsup_ker_ft ker_fps[6] =
+		gemmsup_ker_ft  ker_fps[6] =
 		{
 		  NULL,
 		  bli_sgemmsup_rv_haswell_asm_1x2,
@@ -4739,7 +4739,7 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 		  bli_sgemmsup_rv_haswell_asm_5x2
 		};
 
-		sgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
+		gemmsup_ker_ft  ker_fp = ker_fps[ m_left ];
 
 		ker_fp
 		(
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c
index 14bbad5fe..456866b2f 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c
@@ -88,11 +88,11 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -103,17 +103,17 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 	// dispatch other ?x8m kernels, as needed.
 	if ( m_left )
 	{
-		      float* cij = c;
-		const float* bj  = b;
-		const float* ai  = a;
+		      float* cij = ( float* )c;
+		const float* bj  = ( float* )b;
+		const float* ai  = ( float* )a;
 
 #if 1
 		// We add special handling for slightly inflated MR blocksizes
 		// at edge cases, up to a maximum of 9.
 		if ( 6 < m0 )
 		{
-			sgemmsup_ker_ft ker_fp1 = NULL;
-			sgemmsup_ker_ft ker_fp2 = NULL;
+			gemmsup_ker_ft  ker_fp1 = NULL;
+			gemmsup_ker_ft  ker_fp2 = NULL;
 			dim_t           mr1, mr2;
 
 			if ( m0 == 7 )
@@ -154,7 +154,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 		}
 #endif
 
-		sgemmsup_ker_ft ker_fps[6] =
+		gemmsup_ker_ft  ker_fps[6] =
 		{
 		  NULL,
 		  bli_sgemmsup_rv_haswell_asm_1x16n,
@@ -164,7 +164,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 		  bli_sgemmsup_rv_haswell_asm_5x16n
 		};
 
-		sgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
+		gemmsup_ker_ft  ker_fp = ker_fps[ m_left ];
 
 		ker_fp
 		(
@@ -993,11 +993,11 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 		const dim_t      mr_cur = 6;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      float* cij = c + j_edge*cs_c;
-		const float* ai  = a;
-		//const float* bj  = b + j_edge*cs_b;
-		//const float* bj  = b + ( j_edge / 8 ) * ps_b;
-		const float* bj  = b + n_iter * ps_b;
+		      float* cij = ( float* )c + j_edge*cs_c;
+		const float* ai  = ( float* )a;
+		//const float* bj  = ( float* )b + j_edge*cs_b;
+		//const float* bj  = ( float* )b + ( j_edge / 8 ) * ps_b;
+		const float* bj  = ( float* )b + n_iter * ps_b;
 
 		if ( 12 <= n_left )
 		{
@@ -1089,11 +1089,11 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1925,11 +1925,11 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
 		const dim_t      mr_cur = 5;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      float* cij = c + j_edge*cs_c;
-		const float* ai  = a;
-		//const float* bj  = b + j_edge*cs_b;
-		//const float* bj  = b + ( j_edge / 8 ) * ps_b;
-		const float* bj  = b + n_iter * ps_b;
+		      float* cij = ( float* )c + j_edge*cs_c;
+		const float* ai  = ( float* )a;
+		//const float* bj  = ( float* )b + j_edge*cs_b;
+		//const float* bj  = ( float* )b + ( j_edge / 8 ) * ps_b;
+		const float* bj  = ( float* )b + n_iter * ps_b;
 
 		if ( 12 <= n_left )
 		{
@@ -2021,11 +2021,11 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2683,11 +2683,11 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 		const dim_t      mr_cur = 4;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      float* cij = c + j_edge*cs_c;
-		const float* ai  = a;
-		//const float* bj  = b + j_edge*cs_b;
-		//const float* bj  = b + ( j_edge / 8 ) * ps_b;
-		const float* bj  = b + n_iter * ps_b;
+		      float* cij = ( float* )c + j_edge*cs_c;
+		const float* ai  = ( float* )a;
+		//const float* bj  = ( float* )b + j_edge*cs_b;
+		//const float* bj  = ( float* )b + ( j_edge / 8 ) * ps_b;
+		const float* bj  = ( float* )b + n_iter * ps_b;
 
 		if ( 12 <= n_left )
 		{
@@ -2779,11 +2779,11 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -3502,11 +3502,11 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
 		const dim_t      mr_cur = 3;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      float* cij = c + j_edge*cs_c;
-		const float* ai  = a;
-		//const float* bj  = b + j_edge*cs_b;
-		//const float* bj  = b + ( j_edge / 8 ) * ps_b;
-		const float* bj  = b + n_iter * ps_b;
+		      float* cij = ( float* )c + j_edge*cs_c;
+		const float* ai  = ( float* )a;
+		//const float* bj  = ( float* )b + j_edge*cs_b;
+		//const float* bj  = ( float* )b + ( j_edge / 8 ) * ps_b;
+		const float* bj  = ( float* )b + n_iter * ps_b;
 
 		if ( 12 <= n_left )
 		{
@@ -3598,11 +3598,11 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -4140,11 +4140,11 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 		const dim_t      mr_cur = 2;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      float* cij = c + j_edge*cs_c;
-		const float* ai  = a;
-		//const float* bj  = b + j_edge*cs_b;
-		//const float* bj  = b + ( j_edge / 8 ) * ps_b;
-		const float* bj  = b + n_iter * ps_b;
+		      float* cij = ( float* )c + j_edge*cs_c;
+		const float* ai  = ( float* )a;
+		//const float* bj  = ( float* )b + j_edge*cs_b;
+		//const float* bj  = ( float* )b + ( j_edge / 8 ) * ps_b;
+		const float* bj  = ( float* )b + n_iter * ps_b;
 
 		if ( 12 <= n_left )
 		{
@@ -4236,11 +4236,11 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -4791,11 +4791,11 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
 		const dim_t      mr_cur = 1;
 		const dim_t      j_edge = n0 - ( dim_t )n_left;
 
-		      float* cij = c + j_edge*cs_c;
-		const float* ai  = a;
-		//const float* bj  = b + j_edge*cs_b;
-		//const float* bj  = b + ( j_edge / 8 ) * ps_b;
-		const float* bj  = b + n_iter * ps_b;
+		      float* cij = ( float* )c + j_edge*cs_c;
+		const float* ai  = ( float* )a;
+		//const float* bj  = ( float* )b + j_edge*cs_b;
+		//const float* bj  = ( float* )b + ( j_edge / 8 ) * ps_b;
+		const float* bj  = ( float* )b + n_iter * ps_b;
 
 		if ( 12 <= n_left )
 		{
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c
index bd4da7804..dc4ae1c8e 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c
@@ -99,15 +99,21 @@ void PASTEMAC(ch,opname) \
              dim_t      m, \
              dim_t      n, \
              dim_t      k, \
-       const ctype*     alpha, \
-       const ctype*     a, inc_t rs_a, inc_t cs_a, \
-       const ctype*     b, inc_t rs_b, inc_t cs_b, \
-       const ctype*     beta, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
+       const void*      alpha0, \
+       const void*      a0, inc_t rs_a, inc_t cs_a, \
+       const void*      b0, inc_t rs_b, inc_t cs_b, \
+       const void*      beta0, \
+             void*      c0, inc_t rs_c, inc_t cs_c, \
              auxinfo_t* data, \
        const cntx_t*    cntx \
      ) \
 { \
+	const ctype* alpha = alpha0; \
+	const ctype* a     = a0; \
+	const ctype* b     = b0; \
+	const ctype* beta  = beta0; \
+	      ctype* c     = c0; \
+\
 	for ( dim_t i = 0; i < mdim; ++i ) \
 	{ \
 		      ctype* ci = &c[ i*rs_c ]; \
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
index 883bbb236..d9953583c 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
@@ -67,11 +67,11 @@ void bli_dgemmsup_rd_haswell_asm_6x1
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -581,11 +581,11 @@ void bli_dgemmsup_rd_haswell_asm_3x1
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -990,11 +990,11 @@ void bli_dgemmsup_rd_haswell_asm_2x1
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1364,11 +1364,11 @@ void bli_dgemmsup_rd_haswell_asm_1x1
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c
index e9be29bcb..6110d7cdb 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c
@@ -67,11 +67,11 @@ void bli_dgemmsup_rd_haswell_asm_6x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -623,11 +623,11 @@ void bli_dgemmsup_rd_haswell_asm_3x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1056,11 +1056,11 @@ void bli_dgemmsup_rd_haswell_asm_2x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1448,11 +1448,11 @@ void bli_dgemmsup_rd_haswell_asm_1x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
index 2e82c6ee0..f8a9f5f25 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
@@ -67,11 +67,11 @@ void bli_dgemmsup_rd_haswell_asm_6x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -600,11 +600,11 @@ void bli_dgemmsup_rd_haswell_asm_2x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1050,11 +1050,11 @@ void bli_dgemmsup_rd_haswell_asm_1x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c
index bad647c34..c5c6c2854 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c
@@ -67,11 +67,11 @@ void bli_dgemmsup_rd_haswell_asm_6x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -82,9 +82,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 	// dispatch other 6x?m kernels, as needed.
 	if ( n_left )
 	{
-		      double* cij = c;
-		const double* bj  = b;
-		const double* ai  = a;
+		      double* cij = ( double* )c;
+		const double* bj  = ( double* )b;
+		const double* ai  = ( double* )a;
 
 		if ( 4 <= n_left )
 		{
@@ -691,9 +691,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 		const dim_t   nr_cur = 8;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      double* cij = c + i_edge*rs_c;
-		const double* bj  = b;
-		const double* ai  = a + i_edge*rs_a;
+		      double* cij = ( double* )c + i_edge*rs_c;
+		const double* bj  = ( double* )b;
+		const double* ai  = ( double* )a + i_edge*rs_a;
 
 		if ( 2 == m_left )
 		{
@@ -730,11 +730,11 @@ void bli_dgemmsup_rd_haswell_asm_2x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1204,11 +1204,11 @@ void bli_dgemmsup_rd_haswell_asm_1x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c
index dd12186cb..068ee71ed 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c
@@ -88,11 +88,11 @@ void bli_dgemmsup_rv_haswell_asm_6x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -561,11 +561,11 @@ void bli_dgemmsup_rv_haswell_asm_5x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1011,11 +1011,11 @@ void bli_dgemmsup_rv_haswell_asm_4x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1419,11 +1419,11 @@ void bli_dgemmsup_rv_haswell_asm_3x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1824,11 +1824,11 @@ void bli_dgemmsup_rv_haswell_asm_2x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2174,11 +2174,11 @@ void bli_dgemmsup_rv_haswell_asm_1x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c
index cea208958..a48eb39eb 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c
@@ -88,11 +88,11 @@ void bli_dgemmsup_rv_haswell_asm_6x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -583,11 +583,11 @@ void bli_dgemmsup_rv_haswell_asm_5x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1054,11 +1054,11 @@ void bli_dgemmsup_rv_haswell_asm_4x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1474,11 +1474,11 @@ void bli_dgemmsup_rv_haswell_asm_3x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1901,11 +1901,11 @@ void bli_dgemmsup_rv_haswell_asm_2x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2264,11 +2264,11 @@ void bli_dgemmsup_rv_haswell_asm_1x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c
index a78232eb3..9c229a962 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c
@@ -88,11 +88,11 @@ void bli_dgemmsup_rv_haswell_asm_6x6
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -699,11 +699,11 @@ void bli_dgemmsup_rv_haswell_asm_5x6
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1274,11 +1274,11 @@ void bli_dgemmsup_rv_haswell_asm_4x6
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1772,11 +1772,11 @@ void bli_dgemmsup_rv_haswell_asm_3x6
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2287,11 +2287,11 @@ void bli_dgemmsup_rv_haswell_asm_2x6
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2718,11 +2718,11 @@ void bli_dgemmsup_rv_haswell_asm_1x6
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c
index 543eebf0e..0ada5dc31 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c
@@ -83,7 +83,7 @@ GEMMSUP_KER_PROT( double,   d, gemmsup_r_haswell_ref )
 // Define parameters and variables for edge case kernel map.
 #define NUM_MR 4
 #define NUM_NR 4
-#define FUNCPTR_T dgemmsup_ker_ft
+#define FUNCPTR_T gemmsup_ker_ft
 
 static dim_t mrs[NUM_MR] = { 6, 4, 2, 1 };
 static dim_t nrs[NUM_NR] = { 8, 4, 2, 1 };
@@ -103,11 +103,11 @@ void bli_dgemmsup_rv_haswell_asm_6x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -815,11 +815,11 @@ void bli_dgemmsup_rv_haswell_asm_5x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1410,11 +1410,11 @@ void bli_dgemmsup_rv_haswell_asm_4x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1918,11 +1918,11 @@ void bli_dgemmsup_rv_haswell_asm_3x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2452,11 +2452,11 @@ void bli_dgemmsup_rv_haswell_asm_2x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2875,11 +2875,11 @@ void bli_dgemmsup_rv_haswell_asm_1x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a, inc_t rs_a0, inc_t cs_a0,
-       const double*    b, inc_t rs_b0, inc_t cs_b0,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c
index a0653373c..a8facadbb 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c
@@ -98,15 +98,21 @@ void PASTEMAC(ch,opname) \
              dim_t      m, \
              dim_t      n, \
              dim_t      k, \
-       const ctype*     alpha, \
-       const ctype*     a, inc_t rs_a, inc_t cs_a, \
-       const ctype*     b, inc_t rs_b, inc_t cs_b, \
-       const ctype*     beta, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
+       const void*      alpha0, \
+       const void*      a0, inc_t rs_a, inc_t cs_a, \
+       const void*      b0, inc_t rs_b, inc_t cs_b, \
+       const void*      beta0, \
+             void*      c0, inc_t rs_c, inc_t cs_c, \
              auxinfo_t* data, \
        const cntx_t*    cntx \
      ) \
 { \
+	const ctype* alpha = alpha0; \
+	const ctype* a     = a0; \
+	const ctype* b     = b0; \
+	const ctype* beta  = beta0; \
+	      ctype* c     = c0; \
+\
 	for ( dim_t i = 0; i < mdim; ++i ) \
 	{ \
 		      ctype* ci = &c[ i*rs_c ]; \
@@ -155,70 +161,3 @@ GENTFUNC( float,  s, gemmsup_r_haswell_ref_3x1, 3 )
 GENTFUNC( float,  s, gemmsup_r_haswell_ref_2x1, 2 )
 GENTFUNC( float,  s, gemmsup_r_haswell_ref_1x1, 1 )
 
-// -----------------------------------------------------------------------------
-
-#if 0
-// Temporary definition of general-purpose sup kernel.
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-             conj_t     conja, \
-             conj_t     conjb, \
-             dim_t      m, \
-             dim_t      n, \
-             dim_t      k, \
-       const ctype*     alpha, \
-       const ctype*     a, inc_t rs_a, inc_t cs_a, \
-       const ctype*     b, inc_t rs_b, inc_t cs_b, \
-       const ctype*     beta, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
-       const cntx_t*    cntx \
-     ) \
-{ \
-	for ( dim_t i = 0; i < m; ++i ) \
-	{ \
-		      ctype* ci = &c[ i*rs_c ]; \
-		const ctype* ai = &a[ i*rs_a ]; \
-\
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			      ctype* cij = &ci[ j*cs_c ]; \
-			const ctype* bj  = &b [ j*cs_b ]; \
-			ctype           ab; \
-\
-			PASTEMAC(ch,set0s)( ab ); \
-\
-			/* Perform a dot product to update the (i,j) element of c. */ \
-			for ( dim_t l = 0; l < k; ++l ) \
-			{ \
-				const ctype* aij = &ai[ l*cs_a ]; \
-				const ctype* bij = &bj[ l*rs_b ]; \
-\
-				PASTEMAC(ch,dots)( *aij, *bij, ab ); \
-			} \
-\
-			/* If beta is one, add ab into c. If beta is zero, overwrite c
-			   with the result in ab. Otherwise, scale by beta and accumulate
-			   ab to c. */ \
-			if ( PASTEMAC(ch,eq1)( *beta ) ) \
-			{ \
-				PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
-			} \
-			else if ( PASTEMAC(d,eq0)( *beta ) ) \
-			{ \
-				PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
-			} \
-			else \
-			{ \
-				PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
-			} \
-		} \
-	} \
-}
-
-GENTFUNC( float,  s, gemmsup_r_haswell_ref )
-#endif
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c
index ac2cf4602..ce385b9e1 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c
@@ -67,11 +67,11 @@ void bli_sgemmsup_rd_haswell_asm_6x1
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -593,11 +593,11 @@ void bli_sgemmsup_rd_haswell_asm_3x1
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1008,11 +1008,11 @@ void bli_sgemmsup_rd_haswell_asm_2x1
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1386,11 +1386,11 @@ void bli_sgemmsup_rd_haswell_asm_1x1
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c
index 646280375..63049d2d5 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c
@@ -67,11 +67,11 @@ void bli_sgemmsup_rd_haswell_asm_6x12
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -639,11 +639,11 @@ void bli_sgemmsup_rd_haswell_asm_2x12
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1116,11 +1116,11 @@ void bli_sgemmsup_rd_haswell_asm_1x12
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c
index 0fce13c0e..6fe365afb 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c
@@ -67,11 +67,11 @@ void bli_sgemmsup_rd_haswell_asm_6x16
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -82,9 +82,9 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 	// dispatch other 6x?m kernels, as needed.
 	if ( n_left )
 	{
-		      float* cij = c;
-		const float* bj  = b;
-		const float* ai  = a;
+		      float* cij = ( float* )c;
+		const float* bj  = ( float* )b;
+		const float* ai  = ( float* )a;
 
 		if ( 8 <= n_left )
 		{
@@ -714,9 +714,9 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 		const dim_t   nr_cur = 16;
 		const dim_t   i_edge = m0 - ( dim_t )m_left;
 
-		      float* cij = c + i_edge*rs_c;
-		const float* bj  = b;
-		const float* ai  = a + i_edge*rs_a;
+		      float* cij = ( float* )c + i_edge*rs_c;
+		const float* bj  = ( float* )b;
+		const float* ai  = ( float* )a + i_edge*rs_a;
 
 		if ( 2 == m_left )
 		{
@@ -753,11 +753,11 @@ void bli_sgemmsup_rd_haswell_asm_2x16
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1233,11 +1233,11 @@ void bli_sgemmsup_rd_haswell_asm_1x16
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c
index d81ef6442..a5ce50cdc 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c
@@ -67,11 +67,11 @@ void bli_sgemmsup_rd_haswell_asm_6x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -641,11 +641,11 @@ void bli_sgemmsup_rd_haswell_asm_3x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1083,11 +1083,11 @@ void bli_sgemmsup_rd_haswell_asm_2x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1480,11 +1480,11 @@ void bli_sgemmsup_rd_haswell_asm_1x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c
index 7302b697c..f69d81596 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c
@@ -67,11 +67,11 @@ void bli_sgemmsup_rd_haswell_asm_6x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -611,11 +611,11 @@ void bli_sgemmsup_rd_haswell_asm_2x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1068,11 +1068,11 @@ void bli_sgemmsup_rd_haswell_asm_1x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c
index 52d3ccd3d..e325c777a 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c
@@ -67,11 +67,11 @@ void bli_sgemmsup_rd_haswell_asm_6x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -639,11 +639,11 @@ void bli_sgemmsup_rd_haswell_asm_2x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1116,11 +1116,11 @@ void bli_sgemmsup_rd_haswell_asm_1x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c
index bcf5744c9..9809e0012 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c
@@ -88,11 +88,11 @@ void bli_sgemmsup_rv_haswell_asm_6x12
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -786,11 +786,11 @@ void bli_sgemmsup_rv_haswell_asm_5x12
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1485,11 +1485,11 @@ void bli_sgemmsup_rv_haswell_asm_4x12
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2042,11 +2042,11 @@ void bli_sgemmsup_rv_haswell_asm_3x12
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2630,11 +2630,11 @@ void bli_sgemmsup_rv_haswell_asm_2x12
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -3076,11 +3076,11 @@ void bli_sgemmsup_rv_haswell_asm_1x12
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c
index 23231a3be..7afe0d7a2 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c
@@ -83,7 +83,7 @@ GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )
 // Define parameters and variables for edge case kernel map.
 #define NUM_MR 4
 #define NUM_NR 6
-#define FUNCPTR_T sgemmsup_ker_ft
+#define FUNCPTR_T gemmsup_ker_ft
 
 static dim_t mrs[NUM_MR] = { 6, 4, 2, 1 };
 static dim_t nrs[NUM_NR] = { 16, 12, 8, 4, 2, 1 };
@@ -103,11 +103,11 @@ void bli_sgemmsup_rv_haswell_asm_6x16
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -927,11 +927,11 @@ void bli_sgemmsup_rv_haswell_asm_5x16
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1683,11 +1683,11 @@ void bli_sgemmsup_rv_haswell_asm_4x16
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2268,11 +2268,11 @@ void bli_sgemmsup_rv_haswell_asm_3x16
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2909,11 +2909,11 @@ void bli_sgemmsup_rv_haswell_asm_2x16
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -3378,11 +3378,11 @@ void bli_sgemmsup_rv_haswell_asm_1x16
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c
index 9117e6b17..5f37fac5a 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c
@@ -88,11 +88,11 @@ void bli_sgemmsup_rv_haswell_asm_6x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -565,11 +565,11 @@ void bli_sgemmsup_rv_haswell_asm_5x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1027,11 +1027,11 @@ void bli_sgemmsup_rv_haswell_asm_4x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1438,11 +1438,11 @@ void bli_sgemmsup_rv_haswell_asm_3x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1841,11 +1841,11 @@ void bli_sgemmsup_rv_haswell_asm_2x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2193,11 +2193,11 @@ void bli_sgemmsup_rv_haswell_asm_1x2
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c
index b06a638df..56e744d52 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c
@@ -88,11 +88,11 @@ void bli_sgemmsup_rv_haswell_asm_6x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -595,11 +595,11 @@ void bli_sgemmsup_rv_haswell_asm_5x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1092,11 +1092,11 @@ void bli_sgemmsup_rv_haswell_asm_4x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1525,11 +1525,11 @@ void bli_sgemmsup_rv_haswell_asm_3x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1954,11 +1954,11 @@ void bli_sgemmsup_rv_haswell_asm_2x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2319,11 +2319,11 @@ void bli_sgemmsup_rv_haswell_asm_1x4
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c
index b7cfe9321..bc3d07561 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c
@@ -88,11 +88,11 @@ void bli_sgemmsup_rv_haswell_asm_6x6
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -673,11 +673,11 @@ void bli_sgemmsup_rv_haswell_asm_5x6
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1247,11 +1247,11 @@ void bli_sgemmsup_rv_haswell_asm_4x6
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1734,11 +1734,11 @@ void bli_sgemmsup_rv_haswell_asm_3x6
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2222,11 +2222,11 @@ void bli_sgemmsup_rv_haswell_asm_2x6
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2622,11 +2622,11 @@ void bli_sgemmsup_rv_haswell_asm_1x6
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -3013,11 +3013,11 @@ void bli_sgemmsup_rv_haswell_asm_1x6
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c
index d2e145e76..972150db5 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c
@@ -88,11 +88,11 @@ void bli_sgemmsup_rv_haswell_asm_6x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -639,11 +639,11 @@ void bli_sgemmsup_rv_haswell_asm_5x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1190,11 +1190,11 @@ void bli_sgemmsup_rv_haswell_asm_4x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1647,11 +1647,11 @@ void bli_sgemmsup_rv_haswell_asm_3x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2127,11 +2127,11 @@ void bli_sgemmsup_rv_haswell_asm_2x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -2513,11 +2513,11 @@ void bli_sgemmsup_rv_haswell_asm_1x8
              dim_t      m0,
              dim_t      n0,
              dim_t      k0,
-       const float*     alpha,
-       const float*     a, inc_t rs_a0, inc_t cs_a0,
-       const float*     b, inc_t rs_b0, inc_t cs_b0,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a, inc_t rs_a0, inc_t cs_a0,
+       const void*      b, inc_t rs_b0, inc_t cs_b0,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/knc/3/bli_dgemm_knc_asm_30x8.c b/kernels/knc/3/bli_dgemm_knc_asm_30x8.c
index 4e8c4cac8..989708ee4 100644
--- a/kernels/knc/3/bli_dgemm_knc_asm_30x8.c
+++ b/kernels/knc/3/bli_dgemm_knc_asm_30x8.c
@@ -259,19 +259,19 @@ void bli_dgemm_knc_asm_30x8
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c, inc_t cs_c,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c, inc_t cs_c,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
-    double * a_next = bli_auxinfo_next_a( data );
-    double * b_next = bli_auxinfo_next_b( data );
+    double* a_next = bli_auxinfo_next_a( data );
+    double* b_next = bli_auxinfo_next_b( data );
 
-    int * offsetPtr = &offsets[0];
+    int* offsetPtr = &offsets[0];
 
     uint64_t k64 = k;
 
diff --git a/kernels/knc/3/bli_sgemm_knc_asm_30x16.c b/kernels/knc/3/bli_sgemm_knc_asm_30x16.c
index c9d16e049..c2a359f11 100644
--- a/kernels/knc/3/bli_sgemm_knc_asm_30x16.c
+++ b/kernels/knc/3/bli_sgemm_knc_asm_30x16.c
@@ -259,19 +259,19 @@ void bli_sgemm_knc_asm_30x16
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const float*     alpha,
-       const float*     a,
-       const float*     b,
-       const float*     beta,
-             float*     c, inc_t rs_c, inc_t cs_c,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c, inc_t cs_c,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
-    float * a_next = bli_auxinfo_next_a( data );
-    float * b_next = bli_auxinfo_next_b( data );
+    float* a_next = bli_auxinfo_next_a( data );
+    float* b_next = bli_auxinfo_next_b( data );
 
-    int * offsetPtr = &offsets[0];
+    int* offsetPtr = &offsets[0];
 
     uint64_t k64 = k;
 
diff --git a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c
index 1f99e7a39..ba472fac1 100644
--- a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c
+++ b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c
@@ -113,9 +113,9 @@ void bli_dpackm_knl_asm_8xk
              dim_t   cdim_,
              dim_t   n_,
              dim_t   n_max_,
-       const double* kappa_,
-       const double* a_, inc_t inca_, inc_t lda_,
-             double* p_,              inc_t ldp_,
+       const void*   kappa_,
+       const void*   a_, inc_t inca_, inc_t lda_,
+             void*   p_,              inc_t ldp_,
        const cntx_t* cntx
      )
 {
@@ -330,7 +330,7 @@ void bli_dpackm_knl_asm_8xk
 			const dim_t      i      = cdim;
 			const dim_t      m_edge = mnr - i;
 			const dim_t      n_edge = n_max;
-			double* restrict p_edge = p + (i  )*1;
+			double* restrict p_edge = ( double* )p + (i  )*1;
 
 			bli_dset0s_mxn
 			(
@@ -346,7 +346,7 @@ void bli_dpackm_knl_asm_8xk
 		const dim_t      j      = n;
 		const dim_t      m_edge = mnr;
 		const dim_t      n_edge = n_max - j;
-		double* restrict p_edge = p + (j  )*ldp;
+		double* restrict p_edge = ( double* )p + (j  )*ldp;
 
 		bli_dset0s_mxn
 		(
@@ -364,9 +364,9 @@ void bli_dpackm_knl_asm_24xk
              dim_t   cdim_,
              dim_t   n_,
              dim_t   n_max_,
-       const double* kappa_,
-       const double* a_, inc_t inca_, inc_t lda_,
-             double* p_,              inc_t ldp_,
+       const void*   kappa_,
+       const void*   a_, inc_t inca_, inc_t lda_,
+             void*   p_,              inc_t ldp_,
        const cntx_t* cntx
      )
 {
@@ -634,7 +634,7 @@ void bli_dpackm_knl_asm_24xk
 			const dim_t      i      = cdim;
 			const dim_t      m_edge = mnr - i;
 			const dim_t      n_edge = n_max;
-			double* restrict p_edge = p + (i  )*1;
+			double* restrict p_edge = ( double* )p + (i  )*1;
 
 			bli_dset0s_mxn
 			(
@@ -650,7 +650,7 @@ void bli_dpackm_knl_asm_24xk
 		const dim_t      j      = n;
 		const dim_t      m_edge = mnr;
 		const dim_t      n_edge = n_max - j;
-		double* restrict p_edge = p + (j  )*ldp;
+		double* restrict p_edge = ( double* )p + (j  )*ldp;
 
 		bli_dset0s_mxn
 		(
diff --git a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c
index 493b0259b..f02a28823 100644
--- a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c
+++ b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c
@@ -115,9 +115,9 @@ void bli_spackm_knl_asm_16xk
              dim_t   cdim_,
              dim_t   n_,
              dim_t   n_max_,
-       const float*  kappa_,
-       const float*  a_, inc_t inca_, inc_t lda_,
-             float*  p_,              inc_t ldp_,
+       const void*   kappa_,
+       const void*   a_, inc_t inca_, inc_t lda_,
+             void*   p_,              inc_t ldp_,
        const cntx_t* cntx
      )
 {
@@ -348,7 +348,7 @@ void bli_spackm_knl_asm_16xk
 			const dim_t      i      = cdim;
 			const dim_t      m_edge = mnr - i;
 			const dim_t      n_edge = n_max;
-			float*  restrict p_edge = p + (i  )*1;
+			float*  restrict p_edge = ( float* )p + (i  )*1;
 
 			bli_sset0s_mxn
 			(
@@ -364,7 +364,7 @@ void bli_spackm_knl_asm_16xk
 		const dim_t      j      = n;
 		const dim_t      m_edge = mnr;
 		const dim_t      n_edge = n_max - j;
-		float*  restrict p_edge = p + (j  )*ldp;
+		float*  restrict p_edge = ( float* )p + (j  )*ldp;
 
 		bli_sset0s_mxn
 		(
@@ -382,9 +382,9 @@ void bli_spackm_knl_asm_24xk
              dim_t   cdim_,
              dim_t   n_,
              dim_t   n_max_,
-       const float*  kappa_,
-       const float*  a_, inc_t inca_, inc_t lda_,
-             float*  p_,              inc_t ldp_,
+       const void*   kappa_,
+       const void*   a_, inc_t inca_, inc_t lda_,
+             void*   p_,              inc_t ldp_,
        const cntx_t* cntx
      )
 {
@@ -651,7 +651,7 @@ void bli_spackm_knl_asm_24xk
 			const dim_t      i      = cdim;
 			const dim_t      m_edge = mnr - i;
 			const dim_t      n_edge = n_max;
-			float*  restrict p_edge = p + (i  )*1;
+			float*  restrict p_edge = ( float* )p + (i  )*1;
 
 			bli_sset0s_mxn
 			(
@@ -667,7 +667,7 @@ void bli_spackm_knl_asm_24xk
 		const dim_t      j      = n;
 		const dim_t      m_edge = mnr;
 		const dim_t      n_edge = n_max - j;
-		float*  restrict p_edge = p + (j  )*ldp;
+		float*  restrict p_edge = ( float* )p + (j  )*ldp;
 
 		bli_sset0s_mxn
 		(
diff --git a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
index 579ac61f5..888ed2874 100644
--- a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
+++ b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
@@ -188,11 +188,11 @@ void bli_dgemm_knl_asm_24x8
              dim_t      m,
              dim_t      n,
              dim_t      k_,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c_, inc_t cs_c_,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c_, inc_t cs_c_,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -200,10 +200,10 @@ void bli_dgemm_knl_asm_24x8
     (void)data;
     (void)cntx;
 
-    const double * a_next = bli_auxinfo_next_a( data );
-    const double * b_next = bli_auxinfo_next_b( data );
+    const double* a_next = bli_auxinfo_next_a( data );
+    const double* b_next = bli_auxinfo_next_b( data );
 
-    int32_t * offsetPtr = &offsets[0];
+    int32_t* offsetPtr = &offsets[0];
     int64_t k = k_;
     int64_t rs_c = rs_c_;
     int64_t cs_c = cs_c_;
diff --git a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
index 184f44951..504a5b1e8 100644
--- a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
+++ b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
@@ -185,11 +185,11 @@ void bli_sgemm_knl_asm_24x16
              dim_t      m,
              dim_t      n,
              dim_t      k_,
-       const float*     alpha,
-       const float*     a,
-       const float*     b,
-       const float*     beta,
-             float*     c, inc_t rs_c_, inc_t cs_c_,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c_, inc_t cs_c_,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/penryn/1/bli_axpyv_penryn_int.c b/kernels/penryn/1/bli_axpyv_penryn_int.c
index 690af0452..ddfabc095 100644
--- a/kernels/penryn/1/bli_axpyv_penryn_int.c
+++ b/kernels/penryn/1/bli_axpyv_penryn_int.c
@@ -47,16 +47,15 @@ void bli_daxpyv_penryn_int
      (
              conj_t  conjx,
              dim_t   n,
-       const double* alpha,
-       const double* x, inc_t incx,
-             double* y, inc_t incy,
+       const void*   alpha0,
+       const void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
-	const double*  restrict alpha_cast = alpha;
-	const double*  restrict x_cast = x;
-	      double*  restrict y_cast = y;
-	      dim_t             i;
+	const double*  restrict alpha_cast = alpha0;
+	const double*  restrict x_cast     = x0;
+	      double*  restrict y_cast     = y0;
 
 	const dim_t             n_elem_per_reg = 2;
 	const dim_t             n_iter_unroll  = 4;
@@ -84,13 +83,13 @@ void bli_daxpyv_penryn_int
 	{
 		use_ref = TRUE;
 	}
-	else if ( bli_is_unaligned_to( ( siz_t )x, 16 ) ||
-	          bli_is_unaligned_to( ( siz_t )y, 16 ) )
+	else if ( bli_is_unaligned_to( ( siz_t )x_cast, 16 ) ||
+	          bli_is_unaligned_to( ( siz_t )y_cast, 16 ) )
 	{
 		use_ref = TRUE;
 
-		if ( bli_is_unaligned_to( ( siz_t )x, 16 ) &&
-		     bli_is_unaligned_to( ( siz_t )y, 16 ) )
+		if ( bli_is_unaligned_to( ( siz_t )x_cast, 16 ) &&
+		     bli_is_unaligned_to( ( siz_t )y_cast, 16 ) )
 		{
 			use_ref = FALSE;
 			n_pre   = 1;
@@ -100,15 +99,15 @@ void bli_daxpyv_penryn_int
 	// Call the reference implementation if needed.
 	if ( use_ref == TRUE )
 	{
-		daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+		axpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
 
 		f
 		(
 		  conjx,
 		  n,
-		  alpha,
-		  x, incx,
-		  y, incy,
+		  alpha0,
+		  x0, incx,
+		  y0, incy,
 		  cntx
 		);
 		return;
@@ -135,7 +134,7 @@ void bli_daxpyv_penryn_int
 
 	alpha1v.v = _mm_loaddup_pd( ( double* )&alpha1c );
 
-	for ( i = 0; i < n_run; ++i )
+	for ( dim_t i = 0; i < n_run; ++i )
 	{
 		y1v.v = _mm_load_pd( ( double* )y1 );
 		x1v.v = _mm_load_pd( ( double* )x1 );
@@ -172,7 +171,7 @@ void bli_daxpyv_penryn_int
 
 	if ( n_left > 0 )
 	{
-		for ( i = 0; i < n_left; ++i )
+		for ( dim_t i = 0; i < n_left; ++i )
 		{
 			x1c = *x1;
 
diff --git a/kernels/penryn/1/bli_dotv_penryn_int.c b/kernels/penryn/1/bli_dotv_penryn_int.c
index 72a5e5653..83f44309f 100644
--- a/kernels/penryn/1/bli_dotv_penryn_int.c
+++ b/kernels/penryn/1/bli_dotv_penryn_int.c
@@ -48,16 +48,15 @@ void bli_ddotv_penryn_int
              conj_t  conjx,
              conj_t  conjy,
              dim_t   n,
-       const double* x, inc_t incx,
-       const double* y, inc_t incy,
-             double* rho,
+       const void*   x0, inc_t incx,
+       const void*   y0, inc_t incy,
+             void*   rho0,
        const cntx_t* cntx
      )
 {
-	const double*  restrict x_cast   = x;
-	const double*  restrict y_cast   = y;
-	      double*  restrict rho_cast = rho;
-	      dim_t             i;
+	const double*  restrict x_cast   = x0;
+	const double*  restrict y_cast   = y0;
+	      double*  restrict rho_cast = rho0;
 
 	      dim_t             n_pre;
 	      dim_t             n_run;
@@ -86,13 +85,13 @@ void bli_ddotv_penryn_int
 	{
 		use_ref = TRUE;
 	}
-	else if ( bli_is_unaligned_to( ( siz_t )x, 16 ) ||
-	          bli_is_unaligned_to( ( siz_t )y, 16 ) )
+	else if ( bli_is_unaligned_to( ( siz_t )x_cast, 16 ) ||
+	          bli_is_unaligned_to( ( siz_t )y_cast, 16 ) )
 	{
 		use_ref = TRUE;
 
-		if ( bli_is_unaligned_to( ( siz_t )x, 16 ) &&
-		     bli_is_unaligned_to( ( siz_t )y, 16 ) )
+		if ( bli_is_unaligned_to( ( siz_t )x_cast, 16 ) &&
+		     bli_is_unaligned_to( ( siz_t )y_cast, 16 ) )
 		{
 			use_ref = FALSE;
 			n_pre = 1;
@@ -102,16 +101,16 @@ void bli_ddotv_penryn_int
 	// Call the reference implementation if needed.
 	if ( use_ref == TRUE )
 	{
-		ddotv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTV_KER, cntx );
+		dotv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTV_KER, cntx );
 
 		f
 		(
 		  conjx,
 		  conjy,
 		  n,
-		  x, incx,
-		  y, incy,
-		  rho,
+		  x0, incx,
+		  y0, incy,
+		  rho0,
 		  cntx
 		);
 		return;
@@ -138,7 +137,7 @@ void bli_ddotv_penryn_int
 
 	rho1v.v = _mm_setzero_pd();
 
-	for ( i = 0; i < n_run; ++i )
+	for ( dim_t i = 0; i < n_run; ++i )
 	{
 		x1v.v = _mm_load_pd( ( double* )x1 );
 		y1v.v = _mm_load_pd( ( double* )y1 );
@@ -155,7 +154,7 @@ void bli_ddotv_penryn_int
 
 	if ( n_left > 0 )
 	{
-		for ( i = 0; i < n_left; ++i )
+		for ( dim_t i = 0; i < n_left; ++i )
 		{
 			x1c = *x1;
 			y1c = *y1;
diff --git a/kernels/penryn/1f/bli_axpy2v_penryn_int.c b/kernels/penryn/1f/bli_axpy2v_penryn_int.c
index 29ca050f5..a5d9d1669 100644
--- a/kernels/penryn/1f/bli_axpy2v_penryn_int.c
+++ b/kernels/penryn/1f/bli_axpy2v_penryn_int.c
@@ -48,20 +48,19 @@ void bli_daxpy2v_penryn_int
              conj_t  conjx,
              conj_t  conjy,
              dim_t   n,
-       const double* alphax,
-       const double* alphay,
-       const double* x, inc_t incx,
-       const double* y, inc_t incy,
-             double* z, inc_t incz,
+       const void*   alphax,
+       const void*   alphay,
+       const void*   x, inc_t incx,
+       const void*   y, inc_t incy,
+             void*   z, inc_t incz,
        const cntx_t* cntx
      )
 {
-	const double*  restrict alphax_cast  = alphax;
-	const double*  restrict alphay_cast  = alphay;
+	const double*  restrict alphax_cast = alphax;
+	const double*  restrict alphay_cast = alphay;
 	const double*  restrict x_cast      = x;
 	const double*  restrict y_cast      = y;
 	      double*  restrict z_cast      = z;
-	      dim_t             i;
 
 	const dim_t             n_elem_per_reg = 2;
 	const dim_t             n_iter_unroll  = 4;
@@ -107,7 +106,8 @@ void bli_daxpy2v_penryn_int
 	// Call the reference implementation if needed.
 	if ( use_ref == TRUE )
 	{
-		daxpy2v_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPY2V_KER, cntx );
+		#if 0
+		axpy2v_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPY2V_KER, cntx );
 
 		f
 		(
@@ -121,6 +121,8 @@ void bli_daxpy2v_penryn_int
 		  z, incz,
 		  cntx
 		);
+		#endif
+		bli_abort();
 		return;
 	}
 
@@ -151,7 +153,7 @@ void bli_daxpy2v_penryn_int
 	alphaxv.v = _mm_loaddup_pd( ( double* )alphax_cast );
 	alphayv.v = _mm_loaddup_pd( ( double* )alphay_cast );
 
-	for ( i = 0; i < n_run; ++i )
+	for ( dim_t i = 0; i < n_run; ++i )
 	{
 		z1v.v = _mm_load_pd( ( double* )z1 + 0*n_elem_per_reg );
 		x1v.v = _mm_load_pd( ( double* )x1 + 0*n_elem_per_reg );
@@ -198,7 +200,7 @@ void bli_daxpy2v_penryn_int
 
 	if ( n_left > 0 )
 	{
-		for ( i = 0; i < n_left; ++i )
+		for ( dim_t i = 0; i < n_left; ++i )
 		{
 			x1c = *x1;
 			y1c = *y1;
diff --git a/kernels/penryn/1f/bli_axpyf_penryn_int.c b/kernels/penryn/1f/bli_axpyf_penryn_int.c
index 924782a36..6097ab945 100644
--- a/kernels/penryn/1f/bli_axpyf_penryn_int.c
+++ b/kernels/penryn/1f/bli_axpyf_penryn_int.c
@@ -49,18 +49,17 @@ void bli_daxpyf_penryn_int
              conj_t  conjx,
              dim_t   m,
              dim_t   b_n,
-       const double* alpha,
-       const double* a, inc_t inca, inc_t lda,
-       const double* x, inc_t incx,
-             double* y, inc_t incy,
+       const void*   alpha,
+       const void*   a, inc_t inca, inc_t lda,
+       const void*   x, inc_t incx,
+             void*   y, inc_t incy,
        const cntx_t* cntx
      )
 {
 	const double*  restrict alpha_cast = alpha;
-	const double*  restrict a_cast = a;
-	const double*  restrict x_cast = x;
-	      double*  restrict y_cast = y;
-	      dim_t             i;
+	const double*  restrict a_cast     = a;
+	const double*  restrict x_cast     = x;
+	      double*  restrict y_cast     = y;
 
 	const dim_t             n_elem_per_reg = 2;
 	const dim_t             n_iter_unroll  = 2;
@@ -110,7 +109,8 @@ void bli_daxpyf_penryn_int
 	// Call the reference implementation if needed.
 	if ( use_ref == TRUE )
 	{
-		daxpyf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx );
+		#if 0
+		axpyf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx );
 
 		f
 		(
@@ -124,6 +124,8 @@ void bli_daxpyf_penryn_int
 		  y_cast, incy,
 		  cntx
 		);
+		#endif
+		bli_abort();
 		return;
 	}
 
@@ -171,7 +173,7 @@ void bli_daxpyf_penryn_int
 	chi2v.v = _mm_loaddup_pd( ( double* )&chi2 );
 	chi3v.v = _mm_loaddup_pd( ( double* )&chi3 );
 
-	for ( i = 0; i < m_run; ++i )
+	for ( dim_t i = 0; i < m_run; ++i )
 	{
 		y0v.v = _mm_load_pd( ( double* )(y0 + 0*n_elem_per_reg) );
 
@@ -216,7 +218,7 @@ void bli_daxpyf_penryn_int
 
 	if ( m_left > 0 )
 	{
-		for ( i = 0; i < m_left; ++i )
+		for ( dim_t i = 0; i < m_left; ++i )
 		{
 			a0c = *a0;
 			a1c = *a1;
diff --git a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c
index bc2473249..eab3c0bb0 100644
--- a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c
+++ b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c
@@ -49,11 +49,11 @@ void bli_ddotaxpyv_penryn_int
              conj_t  conjx,
              conj_t  conjy,
              dim_t   n,
-       const double* alpha,
-       const double* x, inc_t incx,
-       const double* y, inc_t incy,
-             double* rho,
-             double* z, inc_t incz,
+       const void*   alpha,
+       const void*   x, inc_t incx,
+       const void*   y, inc_t incy,
+             void*   rho,
+             void*   z, inc_t incz,
        const cntx_t* cntx
      )
 {
@@ -109,7 +109,8 @@ void bli_ddotaxpyv_penryn_int
 	// Call the reference implementation if needed.
 	if ( use_ref == TRUE )
 	{
-		ddotaxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTAXPYV_KER, cntx );
+		#if 0
+		dotaxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTAXPYV_KER, cntx );
 
 		f
 		(
@@ -124,6 +125,8 @@ void bli_ddotaxpyv_penryn_int
 		  z, incz,
 		  cntx
 		);
+		#endif
+		bli_abort();
 		return;
 	}
 
diff --git a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c
index 0aa68b5bc..6ea503509 100644
--- a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c
+++ b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c
@@ -51,24 +51,23 @@ void bli_ddotxaxpyf_penryn_int
              conj_t  conjx,
              dim_t   m,
              dim_t   b_n,
-       const double* alpha,
-       const double* a, inc_t inca, inc_t lda,
-       const double* w, inc_t incw,
-       const double* x, inc_t incx,
-       const double* beta,
-             double* y, inc_t incy,
-             double* z, inc_t incz,
+       const void*   alpha,
+       const void*   a, inc_t inca, inc_t lda,
+       const void*   w, inc_t incw,
+       const void*   x, inc_t incx,
+       const void*   beta,
+             void*   y, inc_t incy,
+             void*   z, inc_t incz,
        const cntx_t* cntx
      )
 {
 	const double*  restrict alpha_cast = alpha;
-	const double*  restrict beta_cast  = beta;
 	const double*  restrict a_cast     = a;
 	const double*  restrict w_cast     = w;
 	const double*  restrict x_cast     = x;
+	const double*  restrict beta_cast  = beta;
 	      double*  restrict y_cast     = y;
 	      double*  restrict z_cast     = z;
-	      dim_t             i;
 
 	const dim_t             n_elem_per_reg = 2;
 	const dim_t             n_iter_unroll  = 2;
@@ -98,7 +97,7 @@ void bli_ddotxaxpyf_penryn_int
 	// If the vector lengths are zero, scale y by beta and return.
 	if ( bli_zero_dim1( m ) )
 	{
-		dscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx );
+		scalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx );
 
 		f
 		(
@@ -143,7 +142,8 @@ void bli_ddotxaxpyf_penryn_int
 
 	if ( use_ref == TRUE )
 	{
-		ddotxaxpyf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXAXPYF_KER, cntx );
+		#if 0
+		dotxaxpyf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXAXPYF_KER, cntx );
 		f
 		(
 		  conjat,
@@ -161,6 +161,8 @@ void bli_ddotxaxpyf_penryn_int
 		  z_cast, incz,
 		  cntx
 		);
+		#endif
+		bli_abort();
 		return;
 	}
 
@@ -230,8 +232,8 @@ void bli_ddotxaxpyf_penryn_int
 
 	/* y = beta * y + alpha * A^T w; */ \
 	/* z =        z + alpha * A   x; */ \
-	//for ( i = 0; i < m_run; ++i )
-	for ( i = m_run; i != 0; --i )
+	//for ( dim_t i = 0; i < m_run; ++i )
+	for ( dim_t i = m_run; i != 0; --i )
 	{
 		z1v.v = _mm_load_pd( ( double* )(z1 + 0*n_elem_per_reg) );
 		w1v.v = _mm_load_pd( ( double* )(w1 + 0*n_elem_per_reg) );
@@ -302,7 +304,7 @@ void bli_ddotxaxpyf_penryn_int
 
 	if ( m_left > 0 )
 	{
-		for ( i = 0; i < m_left; ++i )
+		for ( dim_t i = 0; i < m_left; ++i )
 		{
 			a0c = *a0;
 			//a1c = *a1;
diff --git a/kernels/penryn/1f/bli_dotxf_penryn_int.c b/kernels/penryn/1f/bli_dotxf_penryn_int.c
index 06e46a3b2..8f7018d96 100644
--- a/kernels/penryn/1f/bli_dotxf_penryn_int.c
+++ b/kernels/penryn/1f/bli_dotxf_penryn_int.c
@@ -49,20 +49,19 @@ void bli_ddotxf_penryn_int
              conj_t  conjx,
              dim_t   m,
              dim_t   b_n,
-       const double* alpha,
-       const double* a, inc_t inca, inc_t lda,
-       const double* x, inc_t incx,
-       const double* beta,
-             double* y, inc_t incy,
+       const void*   alpha,
+       const void*   a, inc_t inca, inc_t lda,
+       const void*   x, inc_t incx,
+       const void*   beta,
+             void*   y, inc_t incy,
        const cntx_t* cntx
      )
 {
 	const double*  restrict alpha_cast = alpha;
-	const double*  restrict beta_cast = beta;
-	const double*  restrict a_cast = a;
-	const double*  restrict x_cast = x;
-	      double*  restrict y_cast = y;
-	      dim_t             i;
+	const double*  restrict a_cast     = a;
+	const double*  restrict x_cast     = x;
+	const double*  restrict beta_cast  = beta;
+	      double*  restrict y_cast     = y;
 
 	const dim_t             n_elem_per_reg = 2;
 	const dim_t             n_iter_unroll  = 4;
@@ -85,7 +84,7 @@ void bli_ddotxf_penryn_int
 	// If the vector lengths are zero, scale r by beta and return.
 	if ( bli_zero_dim1( m ) )
 	{
-		dscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx );
+		scalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx );
 
 		f
 		(
@@ -129,8 +128,8 @@ void bli_ddotxf_penryn_int
 	// Call the reference implementation if needed.
 	if ( use_ref == TRUE )
 	{
-		ddotxf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXF_KER, cntx );
-
+		#if 0
+		dotxf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXF_KER, cntx );
 		f
 		( conjat,
 		  conjx,
@@ -143,6 +142,8 @@ void bli_ddotxf_penryn_int
 		  y_cast, incy,
 		  cntx
 		);
+		#endif
+		bli_abort();
 		return;
 	}
 
@@ -186,7 +187,7 @@ void bli_ddotxf_penryn_int
 	rho2v.v = _mm_setzero_pd();
 	rho3v.v = _mm_setzero_pd();
 
-	for ( i = 0; i < m_run; ++i )
+	for ( dim_t i = 0; i < m_run; ++i )
 	{
 		x0v.v = _mm_load_pd( ( double* )(x0 + 0*n_elem_per_reg) );
 		x1v.v = _mm_load_pd( ( double* )(x1 + 0*n_elem_per_reg) );
@@ -247,7 +248,7 @@ void bli_ddotxf_penryn_int
 
 	if ( m_left > 0 )
 	{
-		for ( i = 0; i < m_left; ++i )
+		for ( dim_t i = 0; i < m_left; ++i )
 		{
 			x0c = *x0;
 			x1c = *x1;
diff --git a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c
index bbfdd413f..aac7e6950 100644
--- a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c
@@ -42,11 +42,11 @@ void bli_sgemm_penryn_asm_8x4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const float*     alpha,
-       const float*     a,
-       const float*     b,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -517,11 +517,11 @@ void bli_dgemm_penryn_asm_4x4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c
index 091cfde96..791234a9d 100644
--- a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c
@@ -50,12 +50,12 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
              dim_t      m,
              dim_t      n,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a10,
-       const double*    a11,
-       const double*    b01,
-             double*    b11,
-             double*    c11, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a10,
+       const void*      a11,
+       const void*      b01,
+             void*      b11,
+             void*      c11, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c
index 988c8303e..024e8ba61 100644
--- a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c
@@ -50,12 +50,12 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
              dim_t      m,
              dim_t      n,
              dim_t      k0,
-       const double*    alpha,
-       const double*    a12,
-       const double*    a11,
-       const double*    b21,
-             double*    b11,
-             double*    c11, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a12,
+       const void*      a11,
+       const void*      b21,
+             void*      b11,
+             void*      c11, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c
index fc131b55d..295aceca5 100644
--- a/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c
@@ -47,9 +47,9 @@ void bli_strsm_l_penryn_asm_8x4
 
 void bli_dtrsm_l_penryn_asm_4x4
      (
-       const double*    a11,
-             double*    b11,
-             double*    c11, inc_t rs_c0, inc_t cs_c0,
+       const void*      a11,
+             void*      b11,
+             void*      c11, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c
index eb366cc9d..3ee464f57 100644
--- a/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c
@@ -47,9 +47,9 @@ void bli_strsm_u_penryn_asm_8x4
 
 void bli_dtrsm_u_penryn_asm_4x4
      (
-       const double*    a11,
-             double*    b11,
-             double*    c11, inc_t rs_c0, inc_t cs_c0,
+       const void*      a11,
+             void*      b11,
+             void*      c11, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
index d6329e6c5..cd577a863 100644
--- a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
+++ b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
@@ -45,11 +45,11 @@ void bli_sgemm_piledriver_asm_16x3
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const float*     alpha,
-       const float*     a,
-       const float*     b,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -526,11 +526,11 @@ void bli_dgemm_piledriver_asm_8x3
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -982,11 +982,11 @@ void bli_cgemm_piledriver_asm_4x2
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const scomplex*  alpha,
-       const scomplex*  a,
-       const scomplex*  b,
-       const scomplex*  beta,
-             scomplex*  c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1392,11 +1392,11 @@ void bli_zgemm_piledriver_asm_2x2
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const dcomplex*  alpha,
-       const dcomplex*  a,
-       const dcomplex*  b,
-       const dcomplex*  beta,
-             dcomplex*  c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/power10/3/bli_dgemm_power10_mma.c b/kernels/power10/3/bli_dgemm_power10_mma.c
index 1b7196b43..121f8c112 100644
--- a/kernels/power10/3/bli_dgemm_power10_mma.c
+++ b/kernels/power10/3/bli_dgemm_power10_mma.c
@@ -65,11 +65,11 @@ void bli_dgemm_power10_mma_8x8
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
     )
@@ -88,8 +88,8 @@ void bli_dgemm_power10_mma_8x8
     const double* restrict B0 = b;
           double* restrict C0 = c;
 
-    double alpha_ = *alpha,
-           beta_ = *beta;
+    double alpha_ = *((double*)alpha),
+           beta_  = *((double*)beta);
 
     dv4sf_t result[4];
     dv4sf_t *rowC;
diff --git a/kernels/power10/3/bli_i16gemm_power10_mma.c b/kernels/power10/3/bli_i16gemm_power10_mma.c
index 2a181a3c3..c8a183e74 100644
--- a/kernels/power10/3/bli_i16gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i16gemm_power10_mma.c
@@ -60,11 +60,16 @@ void bli_i16gemm_power10_mma_8x16
               dim_t      m,
               dim_t      n,
               dim_t      k,
-        const int32_t*   alpha,
-        const short*     a,
-        const short*     b,
-        const int32_t*   beta,
-              int32_t*   c, inc_t rs_c0, inc_t cs_c0,
+        //const int32_t*   alpha,
+        //const short*     a,
+        //const short*     b,
+        //const int32_t*   beta,
+        //      int32_t*   c, inc_t rs_c0, inc_t cs_c0,
+        const void*      alpha,
+        const void*      a,
+        const void*      b,
+        const void*      beta,
+              void*      c, inc_t rs_c0, inc_t cs_c0,
               auxinfo_t* data,
         const cntx_t*    cntx
     )
@@ -80,8 +85,8 @@ void bli_i16gemm_power10_mma_8x16
     const short* restrict B0 = b;
           int*   restrict C0 = c;
 
-    int alpha_ = *alpha,
-        beta_ = *beta;
+    int alpha_ = *((int32_t*)alpha),
+        beta_  = *((int32_t*)beta);
 
     iv4sf_t result[4];
     iv4sf_t *rowC;
@@ -90,8 +95,8 @@ void bli_i16gemm_power10_mma_8x16
     __vector_quad acc0, acc1, acc2, acc3,
                   acc4, acc5, acc6, acc7;
 
-    vec_t *ca = (vec_t *) A0;
-    vec_t *rb = (vec_t *) B0;
+    vec_t *ca = (vec_t*) A0;
+    vec_t *rb = (vec_t*) B0;
 
     __builtin_mma_xvi16ger2 (&acc0, ca[0], rb[0]);
     __builtin_mma_xvi16ger2 (&acc1, ca[0], rb[1]);
diff --git a/kernels/power10/3/bli_i16sgemm_power10_mma.c b/kernels/power10/3/bli_i16sgemm_power10_mma.c
index cb4afbf16..ff2db46c9 100644
--- a/kernels/power10/3/bli_i16sgemm_power10_mma.c
+++ b/kernels/power10/3/bli_i16sgemm_power10_mma.c
@@ -60,11 +60,16 @@ void bli_i16sgemm_power10_mma_8x16
               dim_t      m,
               dim_t      n,
               dim_t      k,
-        const int32_t*   alpha,
-        const short*     a,
-        const short*     b,
-        const int32_t*   beta,
-              int32_t*   c, inc_t rs_c0, inc_t cs_c0,
+        //const int32_t*   alpha,
+        //const short*     a,
+        //const short*     b,
+        //const int32_t*   beta,
+        //      int32_t*   c, inc_t rs_c0, inc_t cs_c0,
+        const void*      alpha,
+        const void*      a,
+        const void*      b,
+        const void*      beta,
+              void*      c, inc_t rs_c0, inc_t cs_c0,
               auxinfo_t* data,
         const cntx_t*    cntx
     )
@@ -80,8 +85,8 @@ void bli_i16sgemm_power10_mma_8x16
     const short* restrict B0 = b;
           int*   restrict C0 = c;
 
-    int alpha_ = *alpha,
-        beta_ = *beta;
+    int alpha_ = *((int32_t*)alpha),
+        beta_  = *((int32_t*)beta);
 
     iv4sf_t result[4];
     iv4sf_t *rowC;
diff --git a/kernels/power10/3/bli_i4gemm_power10_mma.c b/kernels/power10/3/bli_i4gemm_power10_mma.c
index da83c4b99..5816b1d06 100644
--- a/kernels/power10/3/bli_i4gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i4gemm_power10_mma.c
@@ -60,11 +60,16 @@ void bli_i4gemm_power10_mma_8x16
               dim_t      m,
               dim_t      n,
               dim_t      k,
-        const int32_t*   alpha,
-        const nibbles*   a,
-        const nibbles*   b,
-        const int32_t*   beta,
-              int32_t*   c, inc_t rs_c0, inc_t cs_c0,
+        //const int32_t*   alpha,
+        //const nibbles*   a,
+        //const nibbles*   b,
+        //const int32_t*   beta,
+        //      int32_t*   c, inc_t rs_c0, inc_t cs_c0,
+        const void*      alpha,
+        const void*      a,
+        const void*      b,
+        const void*      beta,
+              void*      c, inc_t rs_c0, inc_t cs_c0,
               auxinfo_t* data,
         const cntx_t*    cntx
     )
@@ -80,8 +85,8 @@ void bli_i4gemm_power10_mma_8x16
     const nibbles* restrict B0 = b;
           int*     restrict C0 = c;
 
-    int alpha_ = *alpha,
-        beta_ = *beta;
+    int alpha_ = *((int32_t*)alpha),
+        beta_  = *((int32_t*)beta);
 
     iv4sf_t result[4];
     iv4sf_t *rowC;
diff --git a/kernels/power10/3/bli_i8gemm_power10_mma.c b/kernels/power10/3/bli_i8gemm_power10_mma.c
index f7609fed8..357c9af5e 100644
--- a/kernels/power10/3/bli_i8gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i8gemm_power10_mma.c
@@ -60,11 +60,16 @@ void bli_i8gemm_power10_mma_8x16
               dim_t      m,
               dim_t      n,
               dim_t      k,
-        const int32_t*   alpha,
-        const int8_t*    a,
-        const int8_t*    b,
-        const int32_t*   beta,
-              int32_t*   c, inc_t rs_c0, inc_t cs_c0,
+        //const int32_t*   alpha,
+        //const int8_t*    a,
+        //const int8_t*    b,
+        //const int32_t*   beta,
+        //      int32_t*   c, inc_t rs_c0, inc_t cs_c0,
+        const void*      alpha,
+        const void*      a,
+        const void*      b,
+        const void*      beta,
+              void*      c, inc_t rs_c0, inc_t cs_c0,
               auxinfo_t* data,
         const cntx_t*    cntx
     )
@@ -79,8 +84,8 @@ void bli_i8gemm_power10_mma_8x16
     const int8_t* restrict B0 = b;
           int*    restrict C0 = c;
 
-    int alpha_ = *alpha,
-        beta_ = *beta;
+    int alpha_ = *((int32_t*)alpha),
+        beta_  = *((int32_t*)beta);
 
     iv4sf_t result[4];
     iv4sf_t *rowC;
diff --git a/kernels/power10/3/bli_sbgemm_power10_mma.c b/kernels/power10/3/bli_sbgemm_power10_mma.c
index 881529927..c5edd60db 100644
--- a/kernels/power10/3/bli_sbgemm_power10_mma.c
+++ b/kernels/power10/3/bli_sbgemm_power10_mma.c
@@ -61,11 +61,16 @@ void bli_sbgemm_power10_mma_8x16
               dim_t      m,
               dim_t      n,
               dim_t      k,
-        const float*     alpha,
-        const bfloat16*  a,
-        const bfloat16*  b,
-        const float*     beta,
-              float*     c, inc_t rs_c0, inc_t cs_c0,
+        //const float*     alpha,
+        //const bfloat16*  a,
+        //const bfloat16*  b,
+        //const float*     beta,
+        //      float*     c, inc_t rs_c0, inc_t cs_c0,
+        const void*      alpha,
+        const void*      a,
+        const void*      b,
+        const void*      beta,
+              void*      c, inc_t rs_c0, inc_t cs_c0,
               auxinfo_t* data,
         const cntx_t*    cntx
     )
@@ -81,8 +86,8 @@ void bli_sbgemm_power10_mma_8x16
     const bfloat16* restrict B0 = b;
           float*    restrict C0 = c;
 
-    float alpha_= *alpha,
-          beta_ = *beta;
+    float alpha_= *((float*)alpha),
+          beta_ = *((float*)beta);
 
     fv4sf_t result[4];
     fv4sf_t *rowC;
diff --git a/kernels/power10/3/bli_sgemm_power10_mma.c b/kernels/power10/3/bli_sgemm_power10_mma.c
index fd64c8cc0..13b8dbab6 100644
--- a/kernels/power10/3/bli_sgemm_power10_mma.c
+++ b/kernels/power10/3/bli_sgemm_power10_mma.c
@@ -58,11 +58,16 @@ void bli_sgemm_power10_mma_8x16
               dim_t      m,
               dim_t      n,
               dim_t      k,
-        const float*     alpha,
-        const float*     a,
-        const float*     b,
-        const float*     beta,
-              float*     c, inc_t rs_c0, inc_t cs_c0,
+        //const float*     alpha,
+        //const float*     a,
+        //const float*     b,
+        //const float*     beta,
+        //      float*     c, inc_t rs_c0, inc_t cs_c0,
+        const void*      alpha,
+        const void*      a,
+        const void*      b,
+        const void*      beta,
+              void*      c, inc_t rs_c0, inc_t cs_c0,
               auxinfo_t* data,
         const cntx_t*    cntx
     )
@@ -98,8 +103,8 @@ void bli_sgemm_power10_mma_8x16
     const float* restrict B0 = b;
           float* restrict C0 = c;
 
-    float alpha_ = *alpha,
-          beta_  = *beta;
+    float alpha_= *((float*)alpha),
+          beta_ = *((float*)beta);
 
     /* Load elements into vector registers */
     vec_t *ca = (vec_t *) A0;
diff --git a/kernels/power10/3/bli_shgemm_power10_mma.c b/kernels/power10/3/bli_shgemm_power10_mma.c
index af11befcc..1a4624ecf 100644
--- a/kernels/power10/3/bli_shgemm_power10_mma.c
+++ b/kernels/power10/3/bli_shgemm_power10_mma.c
@@ -61,11 +61,16 @@ void bli_shgemm_power10_mma_8x16
               dim_t      m,
               dim_t      n,
               dim_t      k,
-        const float*     alpha,
-        const float16*   a,
-        const float16*   b,
-        const float*     beta,
-              float*     c, inc_t rs_c0, inc_t cs_c0,
+        //const float*     alpha,
+        //const float16*   a,
+        //const float16*   b,
+        //const float*     beta,
+        //      float*     c, inc_t rs_c0, inc_t cs_c0,
+        const void*      alpha,
+        const void*      a,
+        const void*      b,
+        const void*      beta,
+              void*      c, inc_t rs_c0, inc_t cs_c0,
               auxinfo_t* data,
         const cntx_t*    cntx
     )
@@ -79,10 +84,10 @@ void bli_shgemm_power10_mma_8x16
 
     const float16* restrict A0 = a;
     const float16* restrict B0 = b;
-          float* restrict C0 = c;
+          float*   restrict C0 = c;
 
-    float alpha_= *alpha,
-          beta_ = *beta;
+    float alpha_= *((float*)alpha),
+          beta_ = *((float*)beta);
 
     fv4sf_t result[4];
     fv4sf_t *rowC;
diff --git a/kernels/power7/3/bli_gemm_power7_int_8x4.c b/kernels/power7/3/bli_gemm_power7_int_8x4.c
index c1f0cf7c5..73e17ab03 100644
--- a/kernels/power7/3/bli_gemm_power7_int_8x4.c
+++ b/kernels/power7/3/bli_gemm_power7_int_8x4.c
@@ -53,11 +53,11 @@ void bli_sgemm_power7_int_8x4
               dim_t      m,
               dim_t      n,
               dim_t      k,
-        const float*     alpha,
-        const float*     a,
-        const float*     b,
-        const float*     beta,
-              float*     c, inc_t rs_c, inc_t cs_c,
+        const void*      alpha0,
+        const void*      a0,
+        const void*      b0,
+        const void*      beta0,
+              void*      c0, inc_t rs_c, inc_t cs_c,
               auxinfo_t* data,
         const cntx_t*    cntx
      )
@@ -68,6 +68,12 @@ void bli_sgemm_power7_int_8x4
     long i, j, kk;
     float c00;
 
+    const float* alpha = alpha0;
+    const float* a     = a0;
+    const float* b     = b0;
+    const float* beta  = beta0;
+          float* c     = c0;
+
     for (i=0; i < m; i++) {
         for (j=0; j < n; j++) {
             c00 = c[BLIS_INDEX(i,j,rs_c,cs_c)] * *beta;
@@ -95,15 +101,21 @@ void bli_dgemm_power7_int_8x4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c, inc_t cs_c,
+       const void*      alpha0,
+       const void*      a0,
+       const void*      b0,
+       const void*      beta0,
+             void*      c0, inc_t rs_c, inc_t cs_c,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
+    const double* alpha = alpha0;
+    const double* a     = a0;
+    const double* b     = b0;
+    const double* beta  = beta0;
+          double* c     = c0;
+
     if ( cs_c == 1 )
     {
         // Optimized code for case where C rows are contiguous (i.e. C is row-major)
@@ -452,11 +464,11 @@ void bli_cgemm_power7_int_8x4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const scomplex*  alpha,
-       const scomplex*  a,
-       const scomplex*  b,
-       const scomplex*  beta,
-             scomplex*  c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha0,
+       const void*      a0,
+       const void*      b0,
+       const void*      beta0,
+             void*      c0, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -467,6 +479,12 @@ void bli_cgemm_power7_int_8x4
     int i, j, kk;
     scomplex c00;
 
+    const scomplex* alpha = alpha0;
+    const scomplex* a     = a0;
+    const scomplex* b     = b0;
+    const scomplex* beta  = beta0;
+          scomplex* c     = c0;
+
     for (i=0; i < m; i++) {
         for (j=0; j < n; j++) {
             scomplex tmpc, tmpa, tmpb, tmp;
@@ -505,11 +523,11 @@ void bli_zgemm_power7_int_8x4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const dcomplex*  alpha,
-       const dcomplex*  a,
-       const dcomplex*  b,
-       const dcomplex*  beta,
-             dcomplex*  c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha0,
+       const void*      a0,
+       const void*      b0,
+       const void*      beta0,
+             void*      c0, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -520,6 +538,12 @@ void bli_zgemm_power7_int_8x4
     int i, j, kk;
     dcomplex c00;
 
+    const dcomplex* alpha = alpha0;
+    const dcomplex* a     = a0;
+    const dcomplex* b     = b0;
+    const dcomplex* beta  = beta0;
+          dcomplex* c     = c0;
+
     for (i=0; i < m; i++) {
         for (j=0; j < n; j++) {
             dcomplex tmpc, tmpa, tmpb, tmp;
diff --git a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c
index 3d785bcdf..2a4f6f025 100644
--- a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c
+++ b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c
@@ -40,11 +40,11 @@ void bli_dgemm_power9_asm_12x6
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c
index 111d02a0e..5faf8b8dd 100644
--- a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c
+++ b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c
@@ -45,11 +45,11 @@ void bli_sgemm_sandybridge_asm_8x8
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const float*     alpha,
-       const float*     a,
-       const float*     b,
-       const float*     beta,
-             float*     c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -537,11 +537,11 @@ void bli_dgemm_sandybridge_asm_8x4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -999,11 +999,11 @@ void bli_cgemm_sandybridge_asm_8x4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const scomplex*  alpha,
-       const scomplex*  a,
-       const scomplex*  b,
-       const scomplex*  beta,
-             scomplex*  c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
@@ -1702,11 +1702,11 @@ void bli_zgemm_sandybridge_asm_4x4
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const dcomplex*  alpha,
-       const dcomplex*  a,
-       const dcomplex*  b,
-       const dcomplex*  beta,
-             dcomplex*  c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c
index e2685da2b..38572285d 100644
--- a/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c
+++ b/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c
@@ -36,29 +36,25 @@
 #include <immintrin.h>
 #include "blis.h"
 
-
-#if 0
-void bli_sgemm_sandybridge_int_8x8
-     (
-     )
-{
-}
-#endif
-
 void bli_dgemm_sandybridge_int_8x4
      (
              dim_t      m,
              dim_t      n,
              dim_t      k,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c0, inc_t cs_c0,
+       const void*      alpha0,
+       const void*      a0,
+       const void*      b0,
+       const void*      beta0,
+             void*      c0, inc_t rs_c0, inc_t cs_c0,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
+	const double* alpha = alpha0;
+	const double* a     = a0;
+	const double* b     = b0;
+	const double* beta  = beta0;
+	      double* c     = c0;
 
 	//const void* a_next = bli_auxinfo_next_a( data );
 	const void* b_next = bli_auxinfo_next_b( data );
@@ -480,23 +476,3 @@ void bli_dgemm_sandybridge_int_8x4
 	GEMM_UKR_FLUSH_CT( d );
 }
 
-
-
-#if 0
-void bli_cgemm_sandybridge_int_8x4
-     (
-     )
-{
-}
-#endif
-
-
-
-#if 0
-void bli_zgemm_sandybridge_int_4x4
-     (
-     )
-{
-}
-#endif
-
diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
index 05e7c59e2..58fb8b78b 100644
--- a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
+++ b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
@@ -294,11 +294,11 @@ void bli_dgemm_skx_asm_16x12_l2
              dim_t      m,
              dim_t      n,
              dim_t      k_,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c_, inc_t cs_c_,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c_, inc_t cs_c_,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c
index 0b5f178c6..9f5f42074 100644
--- a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c
+++ b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c
@@ -160,11 +160,11 @@ void bli_dgemm_skx_asm_16x14
              dim_t      m,
              dim_t      n,
              dim_t      k_,
-       const double*    alpha,
-       const double*    a,
-       const double*    b,
-       const double*    beta,
-             double*    c, inc_t rs_c_, inc_t cs_c_,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c_, inc_t cs_c_,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
index b04ced575..d8a4637ce 100644
--- a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
+++ b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
@@ -324,11 +324,11 @@ void bli_sgemm_skx_asm_32x12_l2
              dim_t      m,
              dim_t      n,
              dim_t      k_,
-       const float*     alpha,
-       const float*     a,
-       const float*     b,
-       const float*     beta,
-             float*     c, inc_t rs_c_, inc_t cs_c_,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c_, inc_t cs_c_,
              auxinfo_t* data,
        const cntx_t*    cntx
      )
diff --git a/kernels/zen/1/bli_amaxv_zen_int.c b/kernels/zen/1/bli_amaxv_zen_int.c
index 8aafa542f..79c608f39 100644
--- a/kernels/zen/1/bli_amaxv_zen_int.c
+++ b/kernels/zen/1/bli_amaxv_zen_int.c
@@ -102,11 +102,13 @@ typedef union
 void bli_samaxv_zen_int
      (
              dim_t   n,
-       const float*  x, inc_t incx,
-             dim_t*  i_max,
+       const void*   x0, inc_t incx,
+             dim_t*  index,
        const cntx_t* cntx
      )
 {
+	const float* restrict x = x0;
+
 	const float* restrict minus_one = PASTEMAC(s,m1);
 	const dim_t* restrict zero_i    = PASTEMAC(i,0);
 
@@ -121,7 +123,7 @@ void bli_samaxv_zen_int
 	   the behavior of netlib BLAS's i?amax() routines. */
 	if ( bli_zero_dim1( n ) )
 	{
-		PASTEMAC(i,copys)( *zero_i, *i_max );
+		PASTEMAC(i,copys)( *zero_i, *index );
 		return;
 	}
 
@@ -259,7 +261,7 @@ void bli_samaxv_zen_int
 	_mm256_zeroupper();
 
 	/* Store final index to output variable. */
-	*i_max = i_max_l;
+	*index = i_max_l;
 }
 
 // -----------------------------------------------------------------------------
@@ -267,11 +269,13 @@ void bli_samaxv_zen_int
 void bli_damaxv_zen_int
      (
              dim_t   n,
-       const double* x, inc_t incx,
-             dim_t*  i_max,
+       const void*   x0, inc_t incx,
+             dim_t*  index,
        const cntx_t* cntx
      )
 {
+	const double* restrict x = x0;
+
 	const double* restrict minus_one = PASTEMAC(d,m1);
 	const dim_t*  restrict zero_i    = PASTEMAC(i,0);
 
@@ -286,7 +290,7 @@ void bli_damaxv_zen_int
 	   the behavior of netlib BLAS's i?amax() routines. */
 	if ( bli_zero_dim1( n ) )
 	{
-		PASTEMAC(i,copys)( *zero_i, *i_max );
+		PASTEMAC(i,copys)( *zero_i, *index );
 		return;
 	}
 
@@ -415,7 +419,7 @@ void bli_damaxv_zen_int
 	_mm256_zeroupper();
 
 	/* Store final index to output variable. */
-	*i_max = i_max_l;
+	*index = i_max_l;
 }
 
 // -----------------------------------------------------------------------------
@@ -428,7 +432,7 @@ void PASTEMAC(ch,varname) \
      ( \
        dim_t    n, \
        ctype*   x, inc_t incx, \
-       dim_t*   i_max, \
+       dim_t*   index, \
        cntx_t*  cntx  \
      ) \
 { \
@@ -442,7 +446,7 @@ void PASTEMAC(ch,varname) \
 	dim_t    i; \
 \
 	/* Initialize the index of the maximum absolute value to zero. */ \
-	PASTEMAC(i,copys)( zero_i, *i_max ); \
+	PASTEMAC(i,copys)( zero_i, *index ); \
 \
 	/* If the vector length is zero, return early. This directly emulates
 	   the behavior of netlib BLAS's i?amax() routines. */ \
@@ -477,7 +481,7 @@ void PASTEMAC(ch,varname) \
 			if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
 			{ \
 				abs_chi1_max = abs_chi1; \
-				*i_max       = i; \
+				*index       = i; \
 			} \
 		} \
 	} \
@@ -507,7 +511,7 @@ void PASTEMAC(ch,varname) \
 			if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
 			{ \
 				abs_chi1_max = abs_chi1; \
-				*i_max       = i; \
+				*index       = i; \
 			} \
 		} \
 	} \
diff --git a/kernels/zen/1/bli_axpyv_zen_int.c b/kernels/zen/1/bli_axpyv_zen_int.c
index af2925c2f..6212f1745 100644
--- a/kernels/zen/1/bli_axpyv_zen_int.c
+++ b/kernels/zen/1/bli_axpyv_zen_int.c
@@ -59,12 +59,16 @@ void bli_saxpyv_zen_int
      (
              conj_t  conjx,
              dim_t   n,
-       const float*  alpha,
-       const float*  x, inc_t incx,
-             float*  y, inc_t incy,
+       const void*   alpha0,
+       const void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	const float*     alpha = alpha0;
+	const float*     x     = x0;
+	      float*     y     = y0;
+
 	const dim_t      n_elem_per_reg = 8;
 	const dim_t      n_iter_unroll  = 4;
 
@@ -94,8 +98,8 @@ void bli_saxpyv_zen_int
 	}
 
 	// Initialize local pointers.
-	const float* restrict x0 = x;
-	      float* restrict y0 = y;
+	const float* restrict xp = x;
+	      float* restrict yp = y;
 
 	// Broadcast the alpha scalar to all elements of a vector register.
 	alphav.v = _mm256_broadcast_ss( alpha );
@@ -105,17 +109,17 @@ void bli_saxpyv_zen_int
 	for ( i = 0; i < n_viter; ++i )
 	{
 		// Load the input values.
-		y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-		x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+		y0v.v = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
+		x0v.v = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
 
-		y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
-		x1v.v = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+		y1v.v = _mm256_loadu_ps( yp + 1*n_elem_per_reg );
+		x1v.v = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
 
-		y2v.v = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
-		x2v.v = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
+		y2v.v = _mm256_loadu_ps( yp + 2*n_elem_per_reg );
+		x2v.v = _mm256_loadu_ps( xp + 2*n_elem_per_reg );
 
-		y3v.v = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
-		x3v.v = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+		y3v.v = _mm256_loadu_ps( yp + 3*n_elem_per_reg );
+		x3v.v = _mm256_loadu_ps( xp + 3*n_elem_per_reg );
 
 		// perform : y += alpha * x;
 		y0v.v = _mm256_fmadd_ps( alphav.v, x0v.v, y0v.v );
@@ -124,13 +128,13 @@ void bli_saxpyv_zen_int
 		y3v.v = _mm256_fmadd_ps( alphav.v, x3v.v, y3v.v );
 
 		// Store the output.
-		_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
-		_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), y1v.v );
-		_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), y2v.v );
-		_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), y3v.v );
+		_mm256_storeu_ps( (yp + 0*n_elem_per_reg), y0v.v );
+		_mm256_storeu_ps( (yp + 1*n_elem_per_reg), y1v.v );
+		_mm256_storeu_ps( (yp + 2*n_elem_per_reg), y2v.v );
+		_mm256_storeu_ps( (yp + 3*n_elem_per_reg), y3v.v );
 
-		x0 += n_elem_per_reg * n_iter_unroll;
-		y0 += n_elem_per_reg * n_iter_unroll;
+		xp += n_elem_per_reg * n_iter_unroll;
+		yp += n_elem_per_reg * n_iter_unroll;
 	}
 
 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
@@ -145,12 +149,12 @@ void bli_saxpyv_zen_int
 	// If there are leftover iterations, perform them with scalar code.
 	for ( i = 0; i < n_left; ++i )
 	{
-		const float x0c = *x0;
+		const float x0c = *xp;
 
-		*y0 += alphac * x0c;
+		*yp += alphac * x0c;
 
-		x0 += incx;
-		y0 += incy;
+		xp += incx;
+		yp += incy;
 	}
 }
 
@@ -160,12 +164,16 @@ void bli_daxpyv_zen_int
      (
              conj_t  conjx,
              dim_t   n,
-       const double* alpha,
-       const double* x, inc_t incx,
-             double* y, inc_t incy,
+       const void*   alpha0,
+       const void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	const double*     alpha = alpha0;
+	const double*     x     = x0;
+	      double*     y     = y0;
+
 	const dim_t       n_elem_per_reg = 4;
 	const dim_t       n_iter_unroll  = 4;
 
@@ -195,8 +203,8 @@ void bli_daxpyv_zen_int
 	}
 
 	// Initialize local pointers.
-	const double* restrict x0 = x;
-	      double* restrict y0 = y;
+	const double* restrict xp = x;
+	      double* restrict yp = y;
 
 	// Broadcast the alpha scalar to all elements of a vector register.
 	alphav.v = _mm256_broadcast_sd( alpha );
@@ -206,17 +214,17 @@ void bli_daxpyv_zen_int
 	for ( i = 0; i < n_viter; ++i )
 	{
 		// Load the input values.
-		y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-		x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+		y0v.v = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+		x0v.v = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
 
-		y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
-		x1v.v = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+		y1v.v = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
+		x1v.v = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
 
-		y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
-		x2v.v = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+		y2v.v = _mm256_loadu_pd( yp + 2*n_elem_per_reg );
+		x2v.v = _mm256_loadu_pd( xp + 2*n_elem_per_reg );
 
-		y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
-		x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+		y3v.v = _mm256_loadu_pd( yp + 3*n_elem_per_reg );
+		x3v.v = _mm256_loadu_pd( xp + 3*n_elem_per_reg );
 
 		// perform : y += alpha * x;
 		y0v.v = _mm256_fmadd_pd( alphav.v, x0v.v, y0v.v );
@@ -225,13 +233,13 @@ void bli_daxpyv_zen_int
 		y3v.v = _mm256_fmadd_pd( alphav.v, x3v.v, y3v.v );
 
 		// Store the output.
-		_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
-		_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), y1v.v );
-		_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), y2v.v );
-		_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), y3v.v );
+		_mm256_storeu_pd( (yp + 0*n_elem_per_reg), y0v.v );
+		_mm256_storeu_pd( (yp + 1*n_elem_per_reg), y1v.v );
+		_mm256_storeu_pd( (yp + 2*n_elem_per_reg), y2v.v );
+		_mm256_storeu_pd( (yp + 3*n_elem_per_reg), y3v.v );
 
-		x0 += n_elem_per_reg * n_iter_unroll;
-		y0 += n_elem_per_reg * n_iter_unroll;
+		xp += n_elem_per_reg * n_iter_unroll;
+		yp += n_elem_per_reg * n_iter_unroll;
 	}
 
 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
@@ -246,12 +254,12 @@ void bli_daxpyv_zen_int
 	// If there are leftover iterations, perform them with scalar code.
 	for ( i = 0; i < n_left; ++i )
 	{
-		const double x0c = *x0;
+		const double x0c = *xp;
 
-		*y0 += alphac * x0c;
+		*yp += alphac * x0c;
 
-		x0 += incx;
-		y0 += incy;
+		xp += incx;
+		yp += incy;
 	}
 }
 
diff --git a/kernels/zen/1/bli_axpyv_zen_int10.c b/kernels/zen/1/bli_axpyv_zen_int10.c
index abd63a52f..96b8e5f70 100644
--- a/kernels/zen/1/bli_axpyv_zen_int10.c
+++ b/kernels/zen/1/bli_axpyv_zen_int10.c
@@ -59,12 +59,16 @@ void bli_saxpyv_zen_int10
      (
              conj_t  conjx,
              dim_t   n,
-       const float*  alpha,
-       const float*  x, inc_t incx,
-             float*  y, inc_t incy,
+       const void*   alpha0,
+       const void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	const float*     alpha = alpha0;
+	const float*     x     = x0;
+	      float*     y     = y0;
+
 	const dim_t      n_elem_per_reg = 8;
 
 	dim_t            i;
@@ -78,8 +82,8 @@ void bli_saxpyv_zen_int10
 	if ( bli_zero_dim1( n ) || PASTEMAC(s,eq0)( *alpha ) ) return;
 
 	// Initialize local pointers.
-	const float* restrict x0 = x;
-	      float* restrict y0 = y;
+	const float* restrict xp = x;
+	      float* restrict yp = y;
 
 	if ( incx == 1 && incy == 1 )
 	{
@@ -89,27 +93,27 @@ void bli_saxpyv_zen_int10
 		for ( i = 0; (i + 79) < n; i += 80 )
 		{
 			// 80 elements will be processed per loop; 10 FMAs will run per loop.
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
-			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
-			xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg );
-			xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg );
-			xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg );
-			xv[8] = _mm256_loadu_ps( x0 + 8*n_elem_per_reg );
-			xv[9] = _mm256_loadu_ps( x0 + 9*n_elem_per_reg );
-
-			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
-			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
-			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
-			yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
-			yv[5] = _mm256_loadu_ps( y0 + 5*n_elem_per_reg );
-			yv[6] = _mm256_loadu_ps( y0 + 6*n_elem_per_reg );
-			yv[7] = _mm256_loadu_ps( y0 + 7*n_elem_per_reg );
-			yv[8] = _mm256_loadu_ps( y0 + 8*n_elem_per_reg );
-			yv[9] = _mm256_loadu_ps( y0 + 9*n_elem_per_reg );
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( xp + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_ps( xp + 4*n_elem_per_reg );
+			xv[5] = _mm256_loadu_ps( xp + 5*n_elem_per_reg );
+			xv[6] = _mm256_loadu_ps( xp + 6*n_elem_per_reg );
+			xv[7] = _mm256_loadu_ps( xp + 7*n_elem_per_reg );
+			xv[8] = _mm256_loadu_ps( xp + 8*n_elem_per_reg );
+			xv[9] = _mm256_loadu_ps( xp + 9*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( yp + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_ps( yp + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_ps( yp + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_ps( yp + 4*n_elem_per_reg );
+			yv[5] = _mm256_loadu_ps( yp + 5*n_elem_per_reg );
+			yv[6] = _mm256_loadu_ps( yp + 6*n_elem_per_reg );
+			yv[7] = _mm256_loadu_ps( yp + 7*n_elem_per_reg );
+			yv[8] = _mm256_loadu_ps( yp + 8*n_elem_per_reg );
+			yv[9] = _mm256_loadu_ps( yp + 9*n_elem_per_reg );
 
 			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
 			zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
@@ -122,34 +126,34 @@ void bli_saxpyv_zen_int10
 			zv[8] = _mm256_fmadd_ps( xv[8], alphav, yv[8] );
 			zv[9] = _mm256_fmadd_ps( xv[9], alphav, yv[9] );
 
-			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
-			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
-			_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
-			_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );
-			_mm256_storeu_ps( (y0 + 4*n_elem_per_reg), zv[4] );
-			_mm256_storeu_ps( (y0 + 5*n_elem_per_reg), zv[5] );
-			_mm256_storeu_ps( (y0 + 6*n_elem_per_reg), zv[6] );
-			_mm256_storeu_ps( (y0 + 7*n_elem_per_reg), zv[7] );
-			_mm256_storeu_ps( (y0 + 8*n_elem_per_reg), zv[8] );
-			_mm256_storeu_ps( (y0 + 9*n_elem_per_reg), zv[9] );
-
-			x0 += 10*n_elem_per_reg;
-			y0 += 10*n_elem_per_reg;
+			_mm256_storeu_ps( (yp + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (yp + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_ps( (yp + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_ps( (yp + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_ps( (yp + 4*n_elem_per_reg), zv[4] );
+			_mm256_storeu_ps( (yp + 5*n_elem_per_reg), zv[5] );
+			_mm256_storeu_ps( (yp + 6*n_elem_per_reg), zv[6] );
+			_mm256_storeu_ps( (yp + 7*n_elem_per_reg), zv[7] );
+			_mm256_storeu_ps( (yp + 8*n_elem_per_reg), zv[8] );
+			_mm256_storeu_ps( (yp + 9*n_elem_per_reg), zv[9] );
+
+			xp += 10*n_elem_per_reg;
+			yp += 10*n_elem_per_reg;
 		}
 
 		for ( ; (i + 39) < n; i += 40 )
 		{
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
-			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
-
-			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
-			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
-			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
-			yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( xp + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_ps( xp + 4*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( yp + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_ps( yp + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_ps( yp + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_ps( yp + 4*n_elem_per_reg );
 
 			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
 			zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
@@ -157,72 +161,72 @@ void bli_saxpyv_zen_int10
 			zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] );
 			zv[4] = _mm256_fmadd_ps( xv[4], alphav, yv[4] );
 
-			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
-			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
-			_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
-			_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );
-			_mm256_storeu_ps( (y0 + 4*n_elem_per_reg), zv[4] );
+			_mm256_storeu_ps( (yp + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (yp + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_ps( (yp + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_ps( (yp + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_ps( (yp + 4*n_elem_per_reg), zv[4] );
 
-			x0 += 5*n_elem_per_reg;
-			y0 += 5*n_elem_per_reg;
+			xp += 5*n_elem_per_reg;
+			yp += 5*n_elem_per_reg;
 		}
 
 		for ( ; (i + 31) < n; i += 32 )
 		{
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( xp + 3*n_elem_per_reg );
 
-			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
-			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
-			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
+			yv[0] = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( yp + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_ps( yp + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_ps( yp + 3*n_elem_per_reg );
 
 			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
 			zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
 			zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] );
 			zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] );
 
-			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
-			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
-			_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
-			_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_ps( (yp + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (yp + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_ps( (yp + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_ps( (yp + 3*n_elem_per_reg), zv[3] );
 
-			x0 += 4*n_elem_per_reg;
-			y0 += 4*n_elem_per_reg;
+			xp += 4*n_elem_per_reg;
+			yp += 4*n_elem_per_reg;
 		}
 
 		for ( ; (i + 15) < n; i += 16 )
 		{
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
 
-			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+			yv[0] = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( yp + 1*n_elem_per_reg );
 
 			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
 			zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
 
-			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
-			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_ps( (yp + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (yp + 1*n_elem_per_reg), zv[1] );
 
-			x0 += 2*n_elem_per_reg;
-			y0 += 2*n_elem_per_reg;
+			xp += 2*n_elem_per_reg;
+			yp += 2*n_elem_per_reg;
 		}
 
 		for ( ; (i + 7) < n; i += 8 )
 		{
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
 
-			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+			yv[0] = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
 
 			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
 
-			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (yp + 0*n_elem_per_reg), zv[0] );
 
-			x0 += 1*n_elem_per_reg;
-			y0 += 1*n_elem_per_reg;
+			xp += 1*n_elem_per_reg;
+			yp += 1*n_elem_per_reg;
 		}
 
 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
@@ -234,10 +238,10 @@ void bli_saxpyv_zen_int10
 
 		for ( ; (i + 0) < n; i += 1 )
 		{
-			*y0 += (*alpha) * (*x0);
+			*yp += (*alpha) * (*xp);
 
-			x0 += 1;
-			y0 += 1;
+			xp += 1;
+			yp += 1;
 		}
 	}
 	else
@@ -246,12 +250,12 @@ void bli_saxpyv_zen_int10
 
 		for ( i = 0; i < n; ++i )
 		{
-			const float x0c = *x0;
+			const float xpc = *xp;
 
-			*y0 += alphac * x0c;
+			*yp += alphac * xpc;
 
-			x0 += incx;
-			y0 += incy;
+			xp += incx;
+			yp += incy;
 		}
 	}
 }
@@ -262,12 +266,16 @@ void bli_daxpyv_zen_int10
      (
              conj_t  conjx,
              dim_t   n,
-       const double* alpha,
-       const double* x, inc_t incx,
-             double* y, inc_t incy,
+       const void*   alpha0,
+       const void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	const double*    alpha = alpha0;
+	const double*    x     = x0;
+	      double*    y     = y0;
+
 	const dim_t      n_elem_per_reg = 4;
 
 	dim_t            i;
@@ -281,8 +289,8 @@ void bli_daxpyv_zen_int10
 	if ( bli_zero_dim1( n ) || PASTEMAC(d,eq0)( *alpha ) ) return;
 
 	// Initialize local pointers.
-	const double* restrict x0 = x;
-	      double* restrict y0 = y;
+	const double* restrict xp = x;
+	      double* restrict yp = y;
 
 	if ( incx == 1 && incy == 1 )
 	{
@@ -292,27 +300,27 @@ void bli_daxpyv_zen_int10
 		for ( i = 0; (i + 39) < n; i += 40 )
 		{
 			// 40 elements will be processed per loop; 10 FMAs will run per loop.
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
-			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
-			xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg );
-			xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg );
-			xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg );
-			xv[8] = _mm256_loadu_pd( x0 + 8*n_elem_per_reg );
-			xv[9] = _mm256_loadu_pd( x0 + 9*n_elem_per_reg );
-
-			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
-			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
-			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
-			yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
-			yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg );
-			yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg );
-			yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg );
-			yv[8] = _mm256_loadu_pd( y0 + 8*n_elem_per_reg );
-			yv[9] = _mm256_loadu_pd( y0 + 9*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( xp + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_pd( xp + 4*n_elem_per_reg );
+			xv[5] = _mm256_loadu_pd( xp + 5*n_elem_per_reg );
+			xv[6] = _mm256_loadu_pd( xp + 6*n_elem_per_reg );
+			xv[7] = _mm256_loadu_pd( xp + 7*n_elem_per_reg );
+			xv[8] = _mm256_loadu_pd( xp + 8*n_elem_per_reg );
+			xv[9] = _mm256_loadu_pd( xp + 9*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_pd( yp + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_pd( yp + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_pd( yp + 4*n_elem_per_reg );
+			yv[5] = _mm256_loadu_pd( yp + 5*n_elem_per_reg );
+			yv[6] = _mm256_loadu_pd( yp + 6*n_elem_per_reg );
+			yv[7] = _mm256_loadu_pd( yp + 7*n_elem_per_reg );
+			yv[8] = _mm256_loadu_pd( yp + 8*n_elem_per_reg );
+			yv[9] = _mm256_loadu_pd( yp + 9*n_elem_per_reg );
 
 			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
 			zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
@@ -325,34 +333,34 @@ void bli_daxpyv_zen_int10
 			zv[8] = _mm256_fmadd_pd( xv[8], alphav, yv[8] );
 			zv[9] = _mm256_fmadd_pd( xv[9], alphav, yv[9] );
 
-			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
-			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
-			_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
-			_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );
-			_mm256_storeu_pd( (y0 + 4*n_elem_per_reg), zv[4] );
-			_mm256_storeu_pd( (y0 + 5*n_elem_per_reg), zv[5] );
-			_mm256_storeu_pd( (y0 + 6*n_elem_per_reg), zv[6] );
-			_mm256_storeu_pd( (y0 + 7*n_elem_per_reg), zv[7] );
-			_mm256_storeu_pd( (y0 + 8*n_elem_per_reg), zv[8] );
-			_mm256_storeu_pd( (y0 + 9*n_elem_per_reg), zv[9] );
-
-			x0 += 10*n_elem_per_reg;
-			y0 += 10*n_elem_per_reg;
+			_mm256_storeu_pd( (yp + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (yp + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_pd( (yp + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_pd( (yp + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_pd( (yp + 4*n_elem_per_reg), zv[4] );
+			_mm256_storeu_pd( (yp + 5*n_elem_per_reg), zv[5] );
+			_mm256_storeu_pd( (yp + 6*n_elem_per_reg), zv[6] );
+			_mm256_storeu_pd( (yp + 7*n_elem_per_reg), zv[7] );
+			_mm256_storeu_pd( (yp + 8*n_elem_per_reg), zv[8] );
+			_mm256_storeu_pd( (yp + 9*n_elem_per_reg), zv[9] );
+
+			xp += 10*n_elem_per_reg;
+			yp += 10*n_elem_per_reg;
 		}
 
 		for ( ; (i + 19) < n; i += 20 )
 		{
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
-			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
-
-			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
-			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
-			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
-			yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( xp + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_pd( xp + 4*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_pd( yp + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_pd( yp + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_pd( yp + 4*n_elem_per_reg );
 
 			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
 			zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
@@ -360,72 +368,72 @@ void bli_daxpyv_zen_int10
 			zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );
 			zv[4] = _mm256_fmadd_pd( xv[4], alphav, yv[4] );
 
-			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
-			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
-			_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
-			_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );
-			_mm256_storeu_pd( (y0 + 4*n_elem_per_reg), zv[4] );
+			_mm256_storeu_pd( (yp + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (yp + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_pd( (yp + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_pd( (yp + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_pd( (yp + 4*n_elem_per_reg), zv[4] );
 
-			x0 += 5*n_elem_per_reg;
-			y0 += 5*n_elem_per_reg;
+			xp += 5*n_elem_per_reg;
+			yp += 5*n_elem_per_reg;
 		}
 
 		for ( ; (i + 15) < n; i += 16 )
 		{
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( xp + 3*n_elem_per_reg );
 
-			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
-			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
-			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+			yv[0] = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_pd( yp + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_pd( yp + 3*n_elem_per_reg );
 
 			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
 			zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
 			zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] );
 			zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );
 
-			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
-			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
-			_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
-			_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_pd( (yp + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (yp + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_pd( (yp + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_pd( (yp + 3*n_elem_per_reg), zv[3] );
 
-			x0 += 4*n_elem_per_reg;
-			y0 += 4*n_elem_per_reg;
+			xp += 4*n_elem_per_reg;
+			yp += 4*n_elem_per_reg;
 		}
 
 		for ( ; i + 7 < n; i += 8 )
 		{
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
 
-			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+			yv[0] = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
 
 			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
 			zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
 
-			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
-			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_pd( (yp + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (yp + 1*n_elem_per_reg), zv[1] );
 
-			x0 += 2*n_elem_per_reg;
-			y0 += 2*n_elem_per_reg;
+			xp += 2*n_elem_per_reg;
+			yp += 2*n_elem_per_reg;
 		}
 
 		for ( ; i + 3 < n; i += 4 )
 		{
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
 
-			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+			yv[0] = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
 
 			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
 
-			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (yp + 0*n_elem_per_reg), zv[0] );
 
-			x0 += 1*n_elem_per_reg;
-			y0 += 1*n_elem_per_reg;
+			xp += 1*n_elem_per_reg;
+			yp += 1*n_elem_per_reg;
 		}
 
 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
@@ -437,10 +445,10 @@ void bli_daxpyv_zen_int10
 
 		for ( ; i < n; i += 1 )
 		{
-			*y0 += (*alpha) * (*x0);
+			*yp += (*alpha) * (*xp);
 
-			y0 += 1;
-			x0 += 1;
+			yp += 1;
+			xp += 1;
 		}
 	}
 	else
@@ -449,12 +457,12 @@ void bli_daxpyv_zen_int10
 
 		for ( i = 0; i < n; ++i )
 		{
-			const double x0c = *x0;
+			const double xpc = *xp;
 
-			*y0 += alphac * x0c;
+			*yp += alphac * xpc;
 
-			x0 += incx;
-			y0 += incy;
+			xp += incx;
+			yp += incy;
 		}
 	}
 }
diff --git a/kernels/zen/1/bli_copyv_zen_int.c b/kernels/zen/1/bli_copyv_zen_int.c
index ddf16e15d..8e105e601 100644
--- a/kernels/zen/1/bli_copyv_zen_int.c
+++ b/kernels/zen/1/bli_copyv_zen_int.c
@@ -41,11 +41,14 @@ void bli_scopyv_zen_int
      (
              conj_t  conjx,
              dim_t   n,
-       const float*  x, inc_t incx,
-             float*  y, inc_t incy,
+       const void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	const float* x = x0;
+	      float* y = y0;
+
 	const dim_t num_elem_per_reg = 8;
 	dim_t       i = 0;
 	__m256      xv[16];
@@ -190,11 +193,14 @@ void bli_dcopyv_zen_int
      (
              conj_t  conjx,
              dim_t   n,
-       const double* x, inc_t incx,
-             double* y, inc_t incy,
+       const void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	const double* x = x0;
+	      double* y = y0;
+
 	const dim_t num_elem_per_reg = 4;
 	dim_t       i = 0;
 	__m256d     xv[16];
diff --git a/kernels/zen/1/bli_dotv_zen_int.c b/kernels/zen/1/bli_dotv_zen_int.c
index 3a03a6eed..866817b5d 100644
--- a/kernels/zen/1/bli_dotv_zen_int.c
+++ b/kernels/zen/1/bli_dotv_zen_int.c
@@ -59,12 +59,16 @@ void bli_sdotv_zen_int
              conj_t  conjx,
              conj_t  conjy,
              dim_t   n,
-       const float*  x, inc_t incx,
-       const float*  y, inc_t incy,
-             float*  rho,
+       const void*   x0, inc_t incx,
+       const void*   y0, inc_t incy,
+             void*   rho0,
        const cntx_t* cntx
      )
 {
+	const float*  x   = x0;
+	const float*  y   = y0;
+	      float*  rho = rho0;
+
 	const dim_t      n_elem_per_reg = 8;
 	const dim_t      n_iter_unroll  = 4;
 
@@ -72,7 +76,7 @@ void bli_sdotv_zen_int
 	dim_t            n_viter;
 	dim_t            n_left;
 
-	float            rho0;
+	float            rho_l;
 
 	v8sf_t           rho0v, rho1v, rho2v, rho3v;
 	v8sf_t           x0v, y0v;
@@ -102,11 +106,11 @@ void bli_sdotv_zen_int
 	}
 
 	// Initialize local pointers.
-	const float* restrict x0 = x;
-	const float* restrict y0 = y;
+	const float* restrict xp = x;
+	const float* restrict yp = y;
 
 	// Initialize the local scalar rho1 to zero.
-	PASTEMAC(s,set0s)( rho0 );
+	PASTEMAC(s,set0s)( rho_l );
 
 	// Initialize the unrolled iterations' rho vectors to zero.
 	rho0v.v = _mm256_setzero_ps();
@@ -117,17 +121,17 @@ void bli_sdotv_zen_int
 	for ( i = 0; i < n_viter; ++i )
 	{
 		// Load the x and y input vector elements.
-		x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-		y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+		x0v.v = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+		y0v.v = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
 
-		x1v.v = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
-		y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+		x1v.v = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
+		y1v.v = _mm256_loadu_ps( yp + 1*n_elem_per_reg );
 
-		x2v.v = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
-		y2v.v = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
+		x2v.v = _mm256_loadu_ps( xp + 2*n_elem_per_reg );
+		y2v.v = _mm256_loadu_ps( yp + 2*n_elem_per_reg );
 
-		x3v.v = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
-		y3v.v = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
+		x3v.v = _mm256_loadu_ps( xp + 3*n_elem_per_reg );
+		y3v.v = _mm256_loadu_ps( yp + 3*n_elem_per_reg );
 
 		// Compute the element-wise product of the x and y vectors,
 		// storing in the corresponding rho vectors.
@@ -136,8 +140,8 @@ void bli_sdotv_zen_int
 		rho2v.v = _mm256_fmadd_ps( x2v.v, y2v.v, rho2v.v );
 		rho3v.v = _mm256_fmadd_ps( x3v.v, y3v.v, rho3v.v );
 
-		x0 += ( n_elem_per_reg * n_iter_unroll );
-		y0 += ( n_elem_per_reg * n_iter_unroll );
+		xp += ( n_elem_per_reg * n_iter_unroll );
+		yp += ( n_elem_per_reg * n_iter_unroll );
 	}
 
 	// Accumulate the unrolled rho vectors into a single vector.
@@ -146,8 +150,8 @@ void bli_sdotv_zen_int
 	rho0v.v += rho3v.v;
 
 	// Accumulate the final rho vector into a single scalar result.
-	rho0 += rho0v.f[0] + rho0v.f[1] + rho0v.f[2] + rho0v.f[3] +
-	        rho0v.f[4] + rho0v.f[5] + rho0v.f[6] + rho0v.f[7];
+	rho_l += rho0v.f[0] + rho0v.f[1] + rho0v.f[2] + rho0v.f[3] +
+	         rho0v.f[4] + rho0v.f[5] + rho0v.f[6] + rho0v.f[7];
 
 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
@@ -159,17 +163,17 @@ void bli_sdotv_zen_int
 	// If there are leftover iterations, perform them with scalar code.
 	for ( i = 0; i < n_left; ++i )
 	{
-		const float x0c = *x0;
-		const float y0c = *y0;
+		const float x0c = *xp;
+		const float y0c = *yp;
 
-		rho0 += x0c * y0c;
+		rho_l += x0c * y0c;
 
-		x0 += incx;
-		y0 += incy;
+		xp += incx;
+		yp += incy;
 	}
 
 	// Copy the final result into the output variable.
-	PASTEMAC(s,copys)( rho0, *rho );
+	PASTEMAC(s,copys)( rho_l, *rho );
 }
 
 // -----------------------------------------------------------------------------
@@ -179,12 +183,16 @@ void bli_ddotv_zen_int
              conj_t  conjx,
              conj_t  conjy,
              dim_t   n,
-       const double* x, inc_t incx,
-       const double* y, inc_t incy,
-             double* rho,
+       const void*   x0, inc_t incx,
+       const void*   y0, inc_t incy,
+             void*   rho0,
        const cntx_t* cntx
      )
 {
+	const double* x   = x0;
+	const double* y   = y0;
+	      double* rho = rho0;
+
 	const dim_t      n_elem_per_reg = 4;
 	const dim_t      n_iter_unroll  = 4;
 
@@ -192,7 +200,7 @@ void bli_ddotv_zen_int
 	dim_t            n_viter;
 	dim_t            n_left;
 
-	double           rho0;
+	double           rho_l;
 
 	v4df_t           rho0v, rho1v, rho2v, rho3v;
 	v4df_t           x0v, y0v;
@@ -222,11 +230,11 @@ void bli_ddotv_zen_int
 	}
 
 	// Initialize local pointers.
-	const double* restrict x0 = x;
-	const double* restrict y0 = y;
+	const double* restrict xp = x;
+	const double* restrict yp = y;
 
 	// Initialize the local scalar rho1 to zero.
-	PASTEMAC(d,set0s)( rho0 );
+	PASTEMAC(d,set0s)( rho_l );
 
 	// Initialize the unrolled iterations' rho vectors to zero.
 	rho0v.v = _mm256_setzero_pd();
@@ -237,17 +245,17 @@ void bli_ddotv_zen_int
 	for ( i = 0; i < n_viter; ++i )
 	{
 		// Load the x and y input vector elements.
-		x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-		y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+		x0v.v = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+		y0v.v = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
 
-		x1v.v = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
-		y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+		x1v.v = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
+		y1v.v = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
 
-		x2v.v = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
-		y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+		x2v.v = _mm256_loadu_pd( xp + 2*n_elem_per_reg );
+		y2v.v = _mm256_loadu_pd( yp + 2*n_elem_per_reg );
 
-		x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
-		y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+		x3v.v = _mm256_loadu_pd( xp + 3*n_elem_per_reg );
+		y3v.v = _mm256_loadu_pd( yp + 3*n_elem_per_reg );
 
 		// Compute the element-wise product of the x and y vectors,
 		// storing in the corresponding rho vectors.
@@ -256,8 +264,8 @@ void bli_ddotv_zen_int
 		rho2v.v = _mm256_fmadd_pd( x2v.v, y2v.v, rho2v.v );
 		rho3v.v = _mm256_fmadd_pd( x3v.v, y3v.v, rho3v.v );
 
-		x0 += ( n_elem_per_reg * n_iter_unroll );
-		y0 += ( n_elem_per_reg * n_iter_unroll );
+		xp += ( n_elem_per_reg * n_iter_unroll );
+		yp += ( n_elem_per_reg * n_iter_unroll );
 	}
 
 	// Accumulate the unrolled rho vectors into a single vector.
@@ -266,7 +274,7 @@ void bli_ddotv_zen_int
 	rho0v.v += rho3v.v;
 
 	// Accumulate the final rho vector into a single scalar result.
-	rho0 += rho0v.d[0] + rho0v.d[1] + rho0v.d[2] + rho0v.d[3];
+	rho_l += rho0v.d[0] + rho0v.d[1] + rho0v.d[2] + rho0v.d[3];
 
 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
@@ -278,16 +286,16 @@ void bli_ddotv_zen_int
 	// If there are leftover iterations, perform them with scalar code.
 	for ( i = 0; i < n_left; ++i )
 	{
-		const double x0c = *x0;
-		const double y0c = *y0;
+		const double x0c = *xp;
+		const double y0c = *yp;
 
-		rho0 += x0c * y0c;
+		rho_l += x0c * y0c;
 
-		x0 += incx;
-		y0 += incy;
+		xp += incx;
+		yp += incy;
 	}
 
 	// Copy the final result into the output variable.
-	PASTEMAC(d,copys)( rho0, *rho );
+	PASTEMAC(d,copys)( rho_l, *rho );
 }
 
diff --git a/kernels/zen/1/bli_dotv_zen_int10.c b/kernels/zen/1/bli_dotv_zen_int10.c
index 62d95b500..9d8efdec3 100644
--- a/kernels/zen/1/bli_dotv_zen_int10.c
+++ b/kernels/zen/1/bli_dotv_zen_int10.c
@@ -60,17 +60,21 @@ void bli_sdotv_zen_int10
              conj_t  conjx,
              conj_t  conjy,
              dim_t   n,
-       const float*  x, inc_t incx,
-       const float*  y, inc_t incy,
-             float*  rho,
+       const void*   x0, inc_t incx,
+       const void*   y0, inc_t incy,
+             void*   rho0,
        const cntx_t* cntx
      )
 {
+	const float*  x   = x0;
+	const float*  y   = y0;
+	      float*  rho = rho0;
+
 	const dim_t      n_elem_per_reg = 8;
 
 	dim_t            i;
 
-	float            rho0 = 0.0;
+	float            rho_l = 0.0;
 
 	__m256           xv[10];
 	__m256           yv[10];
@@ -84,10 +88,10 @@ void bli_sdotv_zen_int10
 	}
 
 	// Initialize local pointers.
-	const float* restrict x0 = x;
-	const float* restrict y0 = y;
+	const float* restrict xp = x;
+	const float* restrict yp = y;
 
-	PASTEMAC(s,set0s)( rho0 );
+	PASTEMAC(s,set0s)( rho_l );
 
 	if ( incx == 1 && incy == 1 )
 	{
@@ -105,27 +109,27 @@ void bli_sdotv_zen_int10
 		for ( i = 0 ; (i + 79) < n; i += 80 )
 		{
 			// 80 elements will be processed per loop; 10 FMAs will run per loop.
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
-			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
-			xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg );
-			xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg );
-			xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg );
-			xv[8] = _mm256_loadu_ps( x0 + 8*n_elem_per_reg );
-			xv[9] = _mm256_loadu_ps( x0 + 9*n_elem_per_reg );
-
-			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
-			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
-			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
-			yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
-			yv[5] = _mm256_loadu_ps( y0 + 5*n_elem_per_reg );
-			yv[6] = _mm256_loadu_ps( y0 + 6*n_elem_per_reg );
-			yv[7] = _mm256_loadu_ps( y0 + 7*n_elem_per_reg );
-			yv[8] = _mm256_loadu_ps( y0 + 8*n_elem_per_reg );
-			yv[9] = _mm256_loadu_ps( y0 + 9*n_elem_per_reg );
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( xp + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_ps( xp + 4*n_elem_per_reg );
+			xv[5] = _mm256_loadu_ps( xp + 5*n_elem_per_reg );
+			xv[6] = _mm256_loadu_ps( xp + 6*n_elem_per_reg );
+			xv[7] = _mm256_loadu_ps( xp + 7*n_elem_per_reg );
+			xv[8] = _mm256_loadu_ps( xp + 8*n_elem_per_reg );
+			xv[9] = _mm256_loadu_ps( xp + 9*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( yp + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_ps( yp + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_ps( yp + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_ps( yp + 4*n_elem_per_reg );
+			yv[5] = _mm256_loadu_ps( yp + 5*n_elem_per_reg );
+			yv[6] = _mm256_loadu_ps( yp + 6*n_elem_per_reg );
+			yv[7] = _mm256_loadu_ps( yp + 7*n_elem_per_reg );
+			yv[8] = _mm256_loadu_ps( yp + 8*n_elem_per_reg );
+			yv[9] = _mm256_loadu_ps( yp + 9*n_elem_per_reg );
 
 			rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
 			rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
@@ -138,8 +142,8 @@ void bli_sdotv_zen_int10
 			rhov[8].v = _mm256_fmadd_ps( xv[8], yv[8], rhov[8].v );
 			rhov[9].v = _mm256_fmadd_ps( xv[9], yv[9], rhov[9].v );
 
-			x0 += 10*n_elem_per_reg;
-			y0 += 10*n_elem_per_reg;
+			xp += 10*n_elem_per_reg;
+			yp += 10*n_elem_per_reg;
 		}
 
 		rhov[0].v += rhov[5].v;
@@ -150,17 +154,17 @@ void bli_sdotv_zen_int10
 
 		for ( ; (i + 39) < n; i += 40 )
 		{
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
-			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
-
-			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
-			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
-			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
-			yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( xp + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_ps( xp + 4*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( yp + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_ps( yp + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_ps( yp + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_ps( yp + 4*n_elem_per_reg );
 
 			rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
 			rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
@@ -168,8 +172,8 @@ void bli_sdotv_zen_int10
 			rhov[3].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[3].v );
 			rhov[4].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[4].v );
 
-			x0 += 5*n_elem_per_reg;
-			y0 += 5*n_elem_per_reg;
+			xp += 5*n_elem_per_reg;
+			yp += 5*n_elem_per_reg;
 		}
 
 		rhov[0].v += rhov[2].v;
@@ -178,44 +182,44 @@ void bli_sdotv_zen_int10
 
 		for ( ; (i + 15) < n; i += 16 )
 		{
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
 
-			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+			yv[0] = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( yp + 1*n_elem_per_reg );
 
 			rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
 			rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
 
-			x0 += 2*n_elem_per_reg;
-			y0 += 2*n_elem_per_reg;
+			xp += 2*n_elem_per_reg;
+			yp += 2*n_elem_per_reg;
 		}
 
 		rhov[0].v += rhov[1].v;
 
 		for ( ; (i + 7) < n; i += 8 )
 		{
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
 
-			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+			yv[0] = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
 
 			rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
 
-			x0 += 1*n_elem_per_reg;
-			y0 += 1*n_elem_per_reg;
+			xp += 1*n_elem_per_reg;
+			yp += 1*n_elem_per_reg;
 		}
 
 		for ( ; (i + 0) < n; i += 1 )
 		{
-			rho0 += (*x0) * (*y0);
-			x0 += 1;
-			y0 += 1;
+			rho_l += (*xp) * (*yp);
+			xp += 1;
+			yp += 1;
 		}
 
-		rho0 += rhov[0].f[0] + rhov[0].f[1] +
-		        rhov[0].f[2] + rhov[0].f[3] +
-		        rhov[0].f[4] + rhov[0].f[5] +
-		        rhov[0].f[6] + rhov[0].f[7];
+		rho_l += rhov[0].f[0] + rhov[0].f[1] +
+		         rhov[0].f[2] + rhov[0].f[3] +
+		         rhov[0].f[4] + rhov[0].f[5] +
+		         rhov[0].f[6] + rhov[0].f[7];
 
 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
@@ -227,18 +231,18 @@ void bli_sdotv_zen_int10
 	{
 		for ( i = 0; i < n; ++i )
 		{
-			const float x0c = *x0;
-			const float y0c = *y0;
+			const float xpc = *xp;
+			const float ypc = *yp;
 
-			rho0 += x0c * y0c;
+			rho_l += xpc * ypc;
 
-			x0 += incx;
-			y0 += incy;
+			xp += incx;
+			yp += incy;
 		}
 	}
 
 	// Copy the final result into the output variable.
-	PASTEMAC(s,copys)( rho0, *rho );
+	PASTEMAC(s,copys)( rho_l, *rho );
 }
 
 // -----------------------------------------------------------------------------
@@ -248,17 +252,21 @@ void bli_ddotv_zen_int10
              conj_t  conjx,
              conj_t  conjy,
              dim_t   n,
-       const double* x, inc_t incx,
-       const double* y, inc_t incy,
-             double* rho,
+       const void*   x0, inc_t incx,
+       const void*   y0, inc_t incy,
+             void*   rho0,
        const cntx_t* cntx
      )
 {
+	const double* x   = x0;
+	const double* y   = y0;
+	      double* rho = rho0;
+
 	const dim_t      n_elem_per_reg = 4;
 
 	dim_t            i;
 
-	double           rho0 = 0.0;
+	double           rho_l = 0.0;
 
 	__m256d          xv[10];
 	__m256d          yv[10];
@@ -272,10 +280,10 @@ void bli_ddotv_zen_int10
 	}
 
 	// Initialize local pointers.
-	const double* restrict x0 = x;
-	const double* restrict y0 = y;
+	const double* restrict xp = x;
+	const double* restrict yp = y;
 
-	PASTEMAC(d,set0s)( rho0 );
+	PASTEMAC(d,set0s)( rho_l );
 
 	if ( incx == 1 && incy == 1 )
 	{
@@ -293,27 +301,27 @@ void bli_ddotv_zen_int10
 		for ( i = 0; (i + 39) < n; i += 40 )
 		{
 			// 80 elements will be processed per loop; 10 FMAs will run per loop.
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
-			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
-			xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg );
-			xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg );
-			xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg );
-			xv[8] = _mm256_loadu_pd( x0 + 8*n_elem_per_reg );
-			xv[9] = _mm256_loadu_pd( x0 + 9*n_elem_per_reg );
-
-			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
-			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
-			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
-			yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
-			yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg );
-			yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg );
-			yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg );
-			yv[8] = _mm256_loadu_pd( y0 + 8*n_elem_per_reg );
-			yv[9] = _mm256_loadu_pd( y0 + 9*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( xp + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_pd( xp + 4*n_elem_per_reg );
+			xv[5] = _mm256_loadu_pd( xp + 5*n_elem_per_reg );
+			xv[6] = _mm256_loadu_pd( xp + 6*n_elem_per_reg );
+			xv[7] = _mm256_loadu_pd( xp + 7*n_elem_per_reg );
+			xv[8] = _mm256_loadu_pd( xp + 8*n_elem_per_reg );
+			xv[9] = _mm256_loadu_pd( xp + 9*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_pd( yp + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_pd( yp + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_pd( yp + 4*n_elem_per_reg );
+			yv[5] = _mm256_loadu_pd( yp + 5*n_elem_per_reg );
+			yv[6] = _mm256_loadu_pd( yp + 6*n_elem_per_reg );
+			yv[7] = _mm256_loadu_pd( yp + 7*n_elem_per_reg );
+			yv[8] = _mm256_loadu_pd( yp + 8*n_elem_per_reg );
+			yv[9] = _mm256_loadu_pd( yp + 9*n_elem_per_reg );
 
 			rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
 			rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
@@ -326,8 +334,8 @@ void bli_ddotv_zen_int10
 			rhov[8].v = _mm256_fmadd_pd( xv[8], yv[8], rhov[8].v );
 			rhov[9].v = _mm256_fmadd_pd( xv[9], yv[9], rhov[9].v );
 
-			x0 += 10*n_elem_per_reg;
-			y0 += 10*n_elem_per_reg;
+			xp += 10*n_elem_per_reg;
+			yp += 10*n_elem_per_reg;
 		}
 
 		rhov[0].v += rhov[5].v;
@@ -338,17 +346,17 @@ void bli_ddotv_zen_int10
 
 		for ( ; (i + 19) < n; i += 20 )
 		{
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
-			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
-
-			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
-			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
-			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
-			yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( xp + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_pd( xp + 4*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_pd( yp + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_pd( yp + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_pd( yp + 4*n_elem_per_reg );
 
 			rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
 			rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
@@ -356,31 +364,31 @@ void bli_ddotv_zen_int10
 			rhov[3].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[3].v );
 			rhov[4].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[4].v );
 
-			x0 += 5*n_elem_per_reg;
-			y0 += 5*n_elem_per_reg;
+			xp += 5*n_elem_per_reg;
+			yp += 5*n_elem_per_reg;
 		}
 
 		rhov[0].v += rhov[4].v;
 
 		for ( ; (i + 15) < n; i += 16 )
 		{
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( xp + 3*n_elem_per_reg );
 
-			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
-			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
-			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+			yv[0] = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_pd( yp + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_pd( yp + 3*n_elem_per_reg );
 
 			rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
 			rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
 			rhov[2].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[2].v );
 			rhov[3].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[3].v );
 
-			x0 += 4*n_elem_per_reg;
-			y0 += 4*n_elem_per_reg;
+			xp += 4*n_elem_per_reg;
+			yp += 4*n_elem_per_reg;
 		}
 
 		rhov[0].v += rhov[2].v;
@@ -388,43 +396,43 @@ void bli_ddotv_zen_int10
 
 		for ( ; (i + 7) < n; i += 8 )
 		{
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
 
-			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+			yv[0] = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
 
 			rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
 			rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
 
-			x0 += 2*n_elem_per_reg;
-			y0 += 2*n_elem_per_reg;
+			xp += 2*n_elem_per_reg;
+			yp += 2*n_elem_per_reg;
 		}
 
 		rhov[0].v += rhov[1].v;
 
 		for ( ; (i + 3) < n; i += 4 )
 		{
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
 
-			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+			yv[0] = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
 
 			rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
 
-			x0 += 1*n_elem_per_reg;
-			y0 += 1*n_elem_per_reg;
+			xp += 1*n_elem_per_reg;
+			yp += 1*n_elem_per_reg;
 		}
 
 		for ( ; (i + 0) < n; i += 1 )
 		{
-			rho0 += (*x0) * (*y0);
+			rho_l += (*xp) * (*yp);
 
-			x0 += 1;
-			y0 += 1;
+			xp += 1;
+			yp += 1;
 		}
 
 		// Manually add the results from above to finish the sum.
-		rho0 += rhov[0].d[0] + rhov[0].d[1] + rhov[0].d[2] + rhov[0].d[3];
+		rho_l += rhov[0].d[0] + rhov[0].d[1] + rhov[0].d[2] + rhov[0].d[3];
 
 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
@@ -436,17 +444,17 @@ void bli_ddotv_zen_int10
 	{
 		for ( i = 0; i < n; ++i )
 		{
-			const double x0c = *x0;
-			const double y0c = *y0;
+			const double xpc = *xp;
+			const double ypc = *yp;
 
-			rho0 += x0c * y0c;
+			rho_l += xpc * ypc;
 
-			x0 += incx;
-			y0 += incy;
+			xp += incx;
+			yp += incy;
 		}
 	}
 
 	// Copy the final result into the output variable.
-	PASTEMAC(d,copys)( rho0, *rho );
+	PASTEMAC(d,copys)( rho_l, *rho );
 }
 
diff --git a/kernels/zen/1/bli_dotxv_zen_int.c b/kernels/zen/1/bli_dotxv_zen_int.c
index 2ae22cdf2..3e41be879 100644
--- a/kernels/zen/1/bli_dotxv_zen_int.c
+++ b/kernels/zen/1/bli_dotxv_zen_int.c
@@ -59,14 +59,20 @@ void bli_sdotxv_zen_int
              conj_t  conjx,
              conj_t  conjy,
              dim_t   n,
-       const float*  alpha,
-       const float*  x, inc_t incx,
-       const float*  y, inc_t incy,
-       const float*  beta,
-             float*  rho,
+       const void*   alpha0,
+       const void*   x0, inc_t incx,
+       const void*   y0, inc_t incy,
+       const void*   beta0,
+             void*   rho0,
        const cntx_t* cntx
      )
 {
+	const float*  alpha = alpha0;
+	const float*  x     = x0;
+	const float*  y     = y0;
+	const float*  beta  = beta0;
+	      float*  rho   = rho0;
+
 	const dim_t      n_elem_per_reg = 8;
 	const dim_t      n_iter_unroll  = 4;
 
@@ -74,7 +80,7 @@ void bli_sdotxv_zen_int
 	dim_t            n_viter;
 	dim_t            n_left;
 
-	float            rho0;
+	float            rho_l;
 
 	v8sf_t           rho0v, rho1v, rho2v, rho3v;
 	v8sf_t           x0v, y0v;
@@ -111,8 +117,8 @@ void bli_sdotxv_zen_int
 	}
 
 	// Initialize local pointers.
-	const float* restrict x0 = x;
-	const float* restrict y0 = y;
+	const float* restrict xp = x;
+	const float* restrict yp = y;
 
 	// Initialize the unrolled iterations' rho vectors to zero.
 	rho0v.v = _mm256_setzero_ps();
@@ -123,17 +129,17 @@ void bli_sdotxv_zen_int
 	for ( i = 0; i < n_viter; ++i )
 	{
 		// Load the x and y input vector elements.
-		x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-		y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+		x0v.v = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+		y0v.v = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
 
-		x1v.v = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
-		y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+		x1v.v = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
+		y1v.v = _mm256_loadu_ps( yp + 1*n_elem_per_reg );
 
-		x2v.v = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
-		y2v.v = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
+		x2v.v = _mm256_loadu_ps( xp + 2*n_elem_per_reg );
+		y2v.v = _mm256_loadu_ps( yp + 2*n_elem_per_reg );
 
-		x3v.v = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
-		y3v.v = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
+		x3v.v = _mm256_loadu_ps( xp + 3*n_elem_per_reg );
+		y3v.v = _mm256_loadu_ps( yp + 3*n_elem_per_reg );
 
 		// Compute the element-wise product of the x and y vectors,
 		// storing in the corresponding rho vectors.
@@ -142,8 +148,8 @@ void bli_sdotxv_zen_int
 		rho2v.v = _mm256_fmadd_ps( x2v.v, y2v.v, rho2v.v );
 		rho3v.v = _mm256_fmadd_ps( x3v.v, y3v.v, rho3v.v );
 
-		x0 += ( n_elem_per_reg * n_iter_unroll );
-		y0 += ( n_elem_per_reg * n_iter_unroll );
+		xp += ( n_elem_per_reg * n_iter_unroll );
+		yp += ( n_elem_per_reg * n_iter_unroll );
 	}
 
 	// Accumulate the unrolled rho vectors into a single vector.
@@ -152,8 +158,8 @@ void bli_sdotxv_zen_int
 	rho0v.v += rho3v.v;
 
 	// Accumulate the final rho vector into a single scalar result.
-	rho0 = rho0v.f[0] + rho0v.f[1] + rho0v.f[2] + rho0v.f[3] +
-	       rho0v.f[4] + rho0v.f[5] + rho0v.f[6] + rho0v.f[7];
+	rho_l = rho0v.f[0] + rho0v.f[1] + rho0v.f[2] + rho0v.f[3] +
+	        rho0v.f[4] + rho0v.f[5] + rho0v.f[6] + rho0v.f[7];
 
 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
@@ -165,17 +171,17 @@ void bli_sdotxv_zen_int
 	// If there are leftover iterations, perform them with scalar code.
 	for ( i = 0; i < n_left; ++i )
 	{
-		const float x0c = *x0;
-		const float y0c = *y0;
+		const float x0c = *xp;
+		const float y0c = *yp;
 
-		rho0 += x0c * y0c;
+		rho_l += x0c * y0c;
 
-		x0 += incx;
-		y0 += incy;
+		xp += incx;
+		yp += incy;
 	}
 
 	// Accumulate the final result into the output variable.
-	PASTEMAC(s,axpys)( *alpha, rho0, *rho );
+	PASTEMAC(s,axpys)( *alpha, rho_l, *rho );
 }
 
 // -----------------------------------------------------------------------------
@@ -185,14 +191,20 @@ void bli_ddotxv_zen_int
              conj_t  conjx,
              conj_t  conjy,
              dim_t   n,
-       const double* alpha,
-       const double* x, inc_t incx,
-       const double* y, inc_t incy,
-       const double* beta,
-             double* rho,
+       const void*   alpha0,
+       const void*   x0, inc_t incx,
+       const void*   y0, inc_t incy,
+       const void*   beta0,
+             void*   rho0,
        const cntx_t* cntx
      )
 {
+	const double*  alpha = alpha0;
+	const double*  x     = x0;
+	const double*  y     = y0;
+	const double*  beta  = beta0;
+	      double*  rho   = rho0;
+
 	const dim_t      n_elem_per_reg = 4;
 	const dim_t      n_iter_unroll  = 4;
 
@@ -200,7 +212,7 @@ void bli_ddotxv_zen_int
 	dim_t            n_viter;
 	dim_t            n_left;
 
-	double           rho0;
+	double           rho_l;
 
 	v4df_t           rho0v, rho1v, rho2v, rho3v;
 	v4df_t           x0v, y0v;
@@ -237,8 +249,8 @@ void bli_ddotxv_zen_int
 	}
 
 	// Initialize local pointers.
-	const double* restrict x0 = x;
-	const double* restrict y0 = y;
+	const double* restrict xp = x;
+	const double* restrict yp = y;
 
 	// Initialize the unrolled iterations' rho vectors to zero.
 	rho0v.v = _mm256_setzero_pd();
@@ -249,17 +261,17 @@ void bli_ddotxv_zen_int
 	for ( i = 0; i < n_viter; ++i )
 	{
 		// Load the x and y input vector elements.
-		x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-		y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+		x0v.v = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+		y0v.v = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
 
-		x1v.v = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
-		y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+		x1v.v = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
+		y1v.v = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
 
-		x2v.v = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
-		y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+		x2v.v = _mm256_loadu_pd( xp + 2*n_elem_per_reg );
+		y2v.v = _mm256_loadu_pd( yp + 2*n_elem_per_reg );
 
-		x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
-		y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+		x3v.v = _mm256_loadu_pd( xp + 3*n_elem_per_reg );
+		y3v.v = _mm256_loadu_pd( yp + 3*n_elem_per_reg );
 
 		// Compute the element-wise product of the x and y vectors,
 		// storing in the corresponding rho vectors.
@@ -268,8 +280,8 @@ void bli_ddotxv_zen_int
 		rho2v.v = _mm256_fmadd_pd( x2v.v, y2v.v, rho2v.v );
 		rho3v.v = _mm256_fmadd_pd( x3v.v, y3v.v, rho3v.v );
 
-		x0 += ( n_elem_per_reg * n_iter_unroll );
-		y0 += ( n_elem_per_reg * n_iter_unroll );
+		xp += ( n_elem_per_reg * n_iter_unroll );
+		yp += ( n_elem_per_reg * n_iter_unroll );
 	}
 
 	// Accumulate the unrolled rho vectors into a single vector.
@@ -278,7 +290,7 @@ void bli_ddotxv_zen_int
 	rho0v.v += rho3v.v;
 
 	// Accumulate the final rho vector into a single scalar result.
-	rho0 = rho0v.d[0] + rho0v.d[1] + rho0v.d[2] + rho0v.d[3];
+	rho_l = rho0v.d[0] + rho0v.d[1] + rho0v.d[2] + rho0v.d[3];
 
 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
@@ -290,16 +302,16 @@ void bli_ddotxv_zen_int
 	// If there are leftover iterations, perform them with scalar code.
 	for ( i = 0; i < n_left; ++i )
 	{
-		const double x0c = *x0;
-		const double y0c = *y0;
+		const double x0c = *xp;
+		const double y0c = *yp;
 
-		rho0 += x0c * y0c;
+		rho_l += x0c * y0c;
 
-		x0 += incx;
-		y0 += incy;
+		xp += incx;
+		yp += incy;
 	}
 
 	// Accumulate the final result into the output variable.
-	PASTEMAC(d,axpys)( *alpha, rho0, *rho );
+	PASTEMAC(d,axpys)( *alpha, rho_l, *rho );
 }
 
diff --git a/kernels/zen/1/bli_scalv_zen_int.c b/kernels/zen/1/bli_scalv_zen_int.c
index 2c521d165..4dd8b0b5e 100644
--- a/kernels/zen/1/bli_scalv_zen_int.c
+++ b/kernels/zen/1/bli_scalv_zen_int.c
@@ -59,11 +59,14 @@ void bli_sscalv_zen_int
      (
              conj_t  conjalpha,
              dim_t   n,
-       const float*  alpha,
-             float*  x, inc_t incx,
+       const void*   alpha0,
+             void*   x0, inc_t incx,
        const cntx_t* cntx
      )
 {
+	const float*     alpha = alpha0;
+	      float*     x     = x0;
+
 	const dim_t      n_elem_per_reg = 8;
 	const dim_t      n_iter_unroll  = 4;
 
@@ -71,8 +74,6 @@ void bli_sscalv_zen_int
 	dim_t            n_viter;
 	dim_t            n_left;
 
-	float*  restrict x0;
-
 	v8sf_t           alphav;
 	v8sf_t           x0v, x1v, x2v, x3v;
 
@@ -82,8 +83,8 @@ void bli_sscalv_zen_int
 	// If alpha is zero, use setv (in case y contains NaN or Inf).
 	if ( PASTEMAC(s,eq0)( *alpha ) )
 	{
-		float*       zero = bli_s0;
-		ssetv_ker_ft f    = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
+		void*       zero = bli_s0;
+		setv_ker_ft f    = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
 
 		f
 		(
@@ -111,7 +112,7 @@ void bli_sscalv_zen_int
 	}
 
 	// Initialize local pointers.
-	x0 = x;
+	float* restrict xp = x;
 
 	// Broadcast the alpha scalar to all elements of a vector register.
 	alphav.v = _mm256_broadcast_ss( alpha );
@@ -121,10 +122,10 @@ void bli_sscalv_zen_int
 	for ( i = 0; i < n_viter; ++i )
 	{
 		// Load the input values.
-		x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-		x1v.v = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
-		x2v.v = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
-		x3v.v = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+		x0v.v = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+		x1v.v = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
+		x2v.v = _mm256_loadu_ps( xp + 2*n_elem_per_reg );
+		x3v.v = _mm256_loadu_ps( xp + 3*n_elem_per_reg );
 
 		// perform : x := alpha * x;
 		x0v.v = _mm256_mul_ps( alphav.v, x0v.v );
@@ -133,12 +134,12 @@ void bli_sscalv_zen_int
 		x3v.v = _mm256_mul_ps( alphav.v, x3v.v );
 
 		// Store the output.
-		_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), x0v.v );
-		_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), x1v.v );
-		_mm256_storeu_ps( (x0 + 2*n_elem_per_reg), x2v.v );
-		_mm256_storeu_ps( (x0 + 3*n_elem_per_reg), x3v.v );
+		_mm256_storeu_ps( (xp + 0*n_elem_per_reg), x0v.v );
+		_mm256_storeu_ps( (xp + 1*n_elem_per_reg), x1v.v );
+		_mm256_storeu_ps( (xp + 2*n_elem_per_reg), x2v.v );
+		_mm256_storeu_ps( (xp + 3*n_elem_per_reg), x3v.v );
 
-		x0 += n_elem_per_reg * n_iter_unroll;
+		xp += n_elem_per_reg * n_iter_unroll;
 	}
 
 	const float alphac = *alpha;
@@ -146,9 +147,9 @@ void bli_sscalv_zen_int
 	// If there are leftover iterations, perform them with scalar code.
 	for ( i = 0; i < n_left; ++i )
 	{
-		*x0 *= alphac;
+		*xp *= alphac;
 
-		x0 += incx;
+		xp += incx;
 	}
 }
 
@@ -158,11 +159,14 @@ void bli_dscalv_zen_int
      (
              conj_t  conjalpha,
              dim_t   n,
-       const double* alpha,
-             double* x, inc_t incx,
+       const void*   alpha0,
+             void*   x0, inc_t incx,
        const cntx_t* cntx
      )
 {
+	const double*     alpha = alpha0;
+	      double*     x     = x0;
+
 	const dim_t       n_elem_per_reg = 4;
 	const dim_t       n_iter_unroll  = 4;
 
@@ -170,8 +174,6 @@ void bli_dscalv_zen_int
 	dim_t             n_viter;
 	dim_t             n_left;
 
-	double*  restrict x0;
-
 	v4df_t            alphav;
 	v4df_t            x0v, x1v, x2v, x3v;
 
@@ -181,8 +183,8 @@ void bli_dscalv_zen_int
 	// If alpha is zero, use setv (in case y contains NaN or Inf).
 	if ( PASTEMAC(d,eq0)( *alpha ) )
 	{
-		double*      zero = bli_d0;
-		dsetv_ker_ft f    = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
+		void*       zero = bli_d0;
+		setv_ker_ft f    = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
 
 		f
 		(
@@ -210,7 +212,7 @@ void bli_dscalv_zen_int
 	}
 
 	// Initialize local pointers.
-	x0 = x;
+	double* restrict xp = x;
 
 	// Broadcast the alpha scalar to all elements of a vector register.
 	alphav.v = _mm256_broadcast_sd( alpha );
@@ -220,10 +222,10 @@ void bli_dscalv_zen_int
 	for ( i = 0; i < n_viter; ++i )
 	{
 		// Load the input values.
-		x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-		x1v.v = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
-		x2v.v = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
-		x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+		x0v.v = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+		x1v.v = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
+		x2v.v = _mm256_loadu_pd( xp + 2*n_elem_per_reg );
+		x3v.v = _mm256_loadu_pd( xp + 3*n_elem_per_reg );
 
 		// perform : y += alpha * x;
 		x0v.v = _mm256_mul_pd( alphav.v, x0v.v );
@@ -232,12 +234,12 @@ void bli_dscalv_zen_int
 		x3v.v = _mm256_mul_pd( alphav.v, x3v.v );
 
 		// Store the output.
-		_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), x0v.v );
-		_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), x1v.v );
-		_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), x2v.v );
-		_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), x3v.v );
+		_mm256_storeu_pd( (xp + 0*n_elem_per_reg), x0v.v );
+		_mm256_storeu_pd( (xp + 1*n_elem_per_reg), x1v.v );
+		_mm256_storeu_pd( (xp + 2*n_elem_per_reg), x2v.v );
+		_mm256_storeu_pd( (xp + 3*n_elem_per_reg), x3v.v );
 
-		x0 += n_elem_per_reg * n_iter_unroll;
+		xp += n_elem_per_reg * n_iter_unroll;
 	}
 
 	const double alphac = *alpha;
@@ -245,9 +247,9 @@ void bli_dscalv_zen_int
 	// If there are leftover iterations, perform them with scalar code.
 	for ( i = 0; i < n_left; ++i )
 	{
-		*x0 *= alphac;
+		*xp *= alphac;
 
-		x0 += incx;
+		xp += incx;
 	}
 }
 
diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c
index fbde870e3..06099b8e0 100644
--- a/kernels/zen/1/bli_scalv_zen_int10.c
+++ b/kernels/zen/1/bli_scalv_zen_int10.c
@@ -59,16 +59,19 @@ void bli_sscalv_zen_int10
      (
              conj_t  conjalpha,
              dim_t   n,
-       const float*  alpha,
-             float*  x, inc_t incx,
+       const void*   alpha0,
+             void*   x0, inc_t incx,
        const cntx_t* cntx
      )
 {
+	const float*     alpha = alpha0;
+	      float*     x     = x0;
+
 	const dim_t      n_elem_per_reg = 8;
 
 	dim_t            i;
 
-	float*  restrict x0;
+	float*  restrict xp;
 
 	__m256           alphav;
 	__m256           xv[10];
@@ -80,11 +83,10 @@ void bli_sscalv_zen_int10
 	// If alpha is zero, use setv.
 	if ( PASTEMAC(s,eq0)( *alpha ) )
 	{
-		float* zero = bli_s0;
-
 		if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
 
-		ssetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
+		void*       zero = bli_s0;
+		setv_ker_ft f    = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
 
 		f
 		(
@@ -99,7 +101,7 @@ void bli_sscalv_zen_int10
 	}
 
 	// Initialize local pointers.
-	x0 = x;
+	xp = x;
 
 	if ( incx == 1 )
 	{
@@ -109,16 +111,16 @@ void bli_sscalv_zen_int10
 		for ( i = 0; (i + 79) < n; i += 80 )
 		{
 			// Load the input values.
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
-			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
-			xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg );
-			xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg );
-			xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg );
-			xv[8] = _mm256_loadu_ps( x0 + 8*n_elem_per_reg );
-			xv[9] = _mm256_loadu_ps( x0 + 9*n_elem_per_reg );
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( xp + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_ps( xp + 4*n_elem_per_reg );
+			xv[5] = _mm256_loadu_ps( xp + 5*n_elem_per_reg );
+			xv[6] = _mm256_loadu_ps( xp + 6*n_elem_per_reg );
+			xv[7] = _mm256_loadu_ps( xp + 7*n_elem_per_reg );
+			xv[8] = _mm256_loadu_ps( xp + 8*n_elem_per_reg );
+			xv[9] = _mm256_loadu_ps( xp + 9*n_elem_per_reg );
 
 			// perform : x := alpha * x;
 			zv[0] = _mm256_mul_ps( alphav, xv[0] );
@@ -133,28 +135,28 @@ void bli_sscalv_zen_int10
 			zv[9] = _mm256_mul_ps( alphav, xv[9] );
 
 			// Store the output.
-			_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] );
-			_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), zv[1] );
-			_mm256_storeu_ps( (x0 + 2*n_elem_per_reg), zv[2] );
-			_mm256_storeu_ps( (x0 + 3*n_elem_per_reg), zv[3] );
-			_mm256_storeu_ps( (x0 + 4*n_elem_per_reg), zv[4] );
-			_mm256_storeu_ps( (x0 + 5*n_elem_per_reg), zv[5] );
-			_mm256_storeu_ps( (x0 + 6*n_elem_per_reg), zv[6] );
-			_mm256_storeu_ps( (x0 + 7*n_elem_per_reg), zv[7] );
-			_mm256_storeu_ps( (x0 + 8*n_elem_per_reg), zv[8] );
-			_mm256_storeu_ps( (x0 + 9*n_elem_per_reg), zv[9] );
-
-			x0 += 10*n_elem_per_reg;
+			_mm256_storeu_ps( (xp + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (xp + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_ps( (xp + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_ps( (xp + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_ps( (xp + 4*n_elem_per_reg), zv[4] );
+			_mm256_storeu_ps( (xp + 5*n_elem_per_reg), zv[5] );
+			_mm256_storeu_ps( (xp + 6*n_elem_per_reg), zv[6] );
+			_mm256_storeu_ps( (xp + 7*n_elem_per_reg), zv[7] );
+			_mm256_storeu_ps( (xp + 8*n_elem_per_reg), zv[8] );
+			_mm256_storeu_ps( (xp + 9*n_elem_per_reg), zv[9] );
+
+			xp += 10*n_elem_per_reg;
 		}
 
 		for ( ; (i + 39) < n; i += 40 )
 		{
 			// Load the input values.
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
-			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( xp + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_ps( xp + 4*n_elem_per_reg );
 
 			// perform : x := alpha * x;
 			zv[0] = _mm256_mul_ps( alphav, xv[0] );
@@ -164,22 +166,22 @@ void bli_sscalv_zen_int10
 			zv[4] = _mm256_mul_ps( alphav, xv[4] );
 
 			// Store the output.
-			_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] );
-			_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), zv[1] );
-			_mm256_storeu_ps( (x0 + 2*n_elem_per_reg), zv[2] );
-			_mm256_storeu_ps( (x0 + 3*n_elem_per_reg), zv[3] );
-			_mm256_storeu_ps( (x0 + 4*n_elem_per_reg), zv[4] );
+			_mm256_storeu_ps( (xp + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (xp + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_ps( (xp + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_ps( (xp + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_ps( (xp + 4*n_elem_per_reg), zv[4] );
 
-			x0 += 5*n_elem_per_reg;
+			xp += 5*n_elem_per_reg;
 		}
 
 		for ( ; (i + 31) < n; i += 32 )
 		{
 			// Load the input values.
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( xp + 3*n_elem_per_reg );
 
 			// perform : x := alpha * x;
 			zv[0] = _mm256_mul_ps( alphav, xv[0] );
@@ -188,50 +190,50 @@ void bli_sscalv_zen_int10
 			zv[3] = _mm256_mul_ps( alphav, xv[3] );
 
 			// Store the output.
-			_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] );
-			_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), zv[1] );
-			_mm256_storeu_ps( (x0 + 2*n_elem_per_reg), zv[2] );
-			_mm256_storeu_ps( (x0 + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_ps( (xp + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (xp + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_ps( (xp + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_ps( (xp + 3*n_elem_per_reg), zv[3] );
 
-			x0 += 4*n_elem_per_reg;
+			xp += 4*n_elem_per_reg;
 		}
 
 		for ( ; (i + 15) < n; i += 16 )
 		{
 			// Load the input values.
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
 
 			// perform : x := alpha * x;
 			zv[0] = _mm256_mul_ps( alphav, xv[0] );
 			zv[1] = _mm256_mul_ps( alphav, xv[1] );
 
 			// Store the output.
-			_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] );
-			_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_ps( (xp + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (xp + 1*n_elem_per_reg), zv[1] );
 
-			x0 += 2*n_elem_per_reg;
+			xp += 2*n_elem_per_reg;
 		}
 
 		for ( ; (i + 7) < n; i += 8 )
 		{
 			// Load the input values.
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
 
 			// perform : x := alpha * x;
 			zv[0] = _mm256_mul_ps( alphav, xv[0] );
 
 			// Store the output.
-			_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (xp + 0*n_elem_per_reg), zv[0] );
 
-			x0 += 1*n_elem_per_reg;
+			xp += 1*n_elem_per_reg;
 		}
 
 		for ( ; (i + 0) < n; i += 1 )
 		{
-			*x0 *= *alpha;
+			*xp *= *alpha;
 
-			x0 += 1;
+			xp += 1;
 		}
 	}
 	else
@@ -240,9 +242,9 @@ void bli_sscalv_zen_int10
 
 		for ( i = 0; i < n; ++i )
 		{
-			*x0 *= alphac;
+			*xp *= alphac;
 
-			x0 += incx;
+			xp += incx;
 		}
 	}
 }
@@ -253,16 +255,19 @@ void bli_dscalv_zen_int10
      (
              conj_t  conjalpha,
              dim_t   n,
-       const double* alpha,
-             double* x, inc_t incx,
+       const void*   alpha0,
+             void*   x0, inc_t incx,
        const cntx_t* cntx
      )
 {
+	const double*    alpha = alpha0;
+	      double*    x     = x0;
+
 	const dim_t      n_elem_per_reg = 4;
 
 	dim_t            i;
 
-	double* restrict x0;
+	double* restrict xp;
 
 	__m256d          alphav;
 	__m256d          xv[10];
@@ -274,11 +279,10 @@ void bli_dscalv_zen_int10
 	// If alpha is zero, use setv.
 	if ( PASTEMAC(d,eq0)( *alpha ) )
 	{
-		double* zero = bli_d0;
-
 		if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
 
-		dsetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
+		void*       zero = bli_d0;
+		setv_ker_ft f    = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
 
 		f
 		(
@@ -293,7 +297,7 @@ void bli_dscalv_zen_int10
 	}
 
 	// Initialize local pointers.
-	x0 = x;
+	xp = x;
 
 	if ( incx == 1 )
 	{
@@ -303,16 +307,16 @@ void bli_dscalv_zen_int10
 		for ( i = 0; (i + 39) < n; i += 40 )
 		{
 			// Load the input values.
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
-			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
-			xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg );
-			xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg );
-			xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg );
-			xv[8] = _mm256_loadu_pd( x0 + 8*n_elem_per_reg );
-			xv[9] = _mm256_loadu_pd( x0 + 9*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( xp + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_pd( xp + 4*n_elem_per_reg );
+			xv[5] = _mm256_loadu_pd( xp + 5*n_elem_per_reg );
+			xv[6] = _mm256_loadu_pd( xp + 6*n_elem_per_reg );
+			xv[7] = _mm256_loadu_pd( xp + 7*n_elem_per_reg );
+			xv[8] = _mm256_loadu_pd( xp + 8*n_elem_per_reg );
+			xv[9] = _mm256_loadu_pd( xp + 9*n_elem_per_reg );
 
 			// perform : x := alpha * x;
 			zv[0] = _mm256_mul_pd( alphav, xv[0] );
@@ -327,28 +331,28 @@ void bli_dscalv_zen_int10
 			zv[9] = _mm256_mul_pd( alphav, xv[9] );
 
 			// Store the output.
-			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] );
-			_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), zv[1] );
-			_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), zv[2] );
-			_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), zv[3] );
-			_mm256_storeu_pd( (x0 + 4*n_elem_per_reg), zv[4] );
-			_mm256_storeu_pd( (x0 + 5*n_elem_per_reg), zv[5] );
-			_mm256_storeu_pd( (x0 + 6*n_elem_per_reg), zv[6] );
-			_mm256_storeu_pd( (x0 + 7*n_elem_per_reg), zv[7] );
-			_mm256_storeu_pd( (x0 + 8*n_elem_per_reg), zv[8] );
-			_mm256_storeu_pd( (x0 + 9*n_elem_per_reg), zv[9] );
-
-			x0 += 10*n_elem_per_reg;
+			_mm256_storeu_pd( (xp + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (xp + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_pd( (xp + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_pd( (xp + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_pd( (xp + 4*n_elem_per_reg), zv[4] );
+			_mm256_storeu_pd( (xp + 5*n_elem_per_reg), zv[5] );
+			_mm256_storeu_pd( (xp + 6*n_elem_per_reg), zv[6] );
+			_mm256_storeu_pd( (xp + 7*n_elem_per_reg), zv[7] );
+			_mm256_storeu_pd( (xp + 8*n_elem_per_reg), zv[8] );
+			_mm256_storeu_pd( (xp + 9*n_elem_per_reg), zv[9] );
+
+			xp += 10*n_elem_per_reg;
 		}
 
 		for ( ; (i + 19) < n; i += 20 )
 		{
 			// Load the input values.
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
-			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( xp + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_pd( xp + 4*n_elem_per_reg );
 
 			// perform : x := alpha * x;
 			zv[0] = _mm256_mul_pd( alphav, xv[0] );
@@ -358,22 +362,22 @@ void bli_dscalv_zen_int10
 			zv[4] = _mm256_mul_pd( alphav, xv[4] );
 
 			// Store the output.
-			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] );
-			_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), zv[1] );
-			_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), zv[2] );
-			_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), zv[3] );
-			_mm256_storeu_pd( (x0 + 4*n_elem_per_reg), zv[4] );
+			_mm256_storeu_pd( (xp + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (xp + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_pd( (xp + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_pd( (xp + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_pd( (xp + 4*n_elem_per_reg), zv[4] );
 
-			x0 += 5*n_elem_per_reg;
+			xp += 5*n_elem_per_reg;
 		}
 
 		for ( ; (i + 15) < n; i += 16 )
 		{
 			// Load the input values.
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( xp + 3*n_elem_per_reg );
 
 			// perform : x := alpha * x;
 			zv[0] = _mm256_mul_pd( alphav, xv[0] );
@@ -382,50 +386,50 @@ void bli_dscalv_zen_int10
 			zv[3] = _mm256_mul_pd( alphav, xv[3] );
 
 			// Store the output.
-			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] );
-			_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), zv[1] );
-			_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), zv[2] );
-			_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_pd( (xp + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (xp + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_pd( (xp + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_pd( (xp + 3*n_elem_per_reg), zv[3] );
 
-			x0 += 4*n_elem_per_reg;
+			xp += 4*n_elem_per_reg;
 		}
 
 		for ( ; (i + 7) < n; i += 8 )
 		{
 			// Load the input values.
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
 
 			// perform : x := alpha * x;
 			zv[0] = _mm256_mul_pd( alphav, xv[0] );
 			zv[1] = _mm256_mul_pd( alphav, xv[1] );
 
 			// Store the output.
-			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] );
-			_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_pd( (xp + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (xp + 1*n_elem_per_reg), zv[1] );
 
-			x0 += 2*n_elem_per_reg;
+			xp += 2*n_elem_per_reg;
 		}
 
 		for ( ; (i + 3) < n; i += 4 )
 		{
 			// Load the input values.
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
 
 			// perform : x := alpha * x;
 			zv[0] = _mm256_mul_pd( alphav, xv[0] );
 
 			// Store the output.
-			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (xp + 0*n_elem_per_reg), zv[0] );
 
-			x0 += 1*n_elem_per_reg;
+			xp += 1*n_elem_per_reg;
 		}
 
 		for ( ; (i + 0) < n; i += 1 )
 		{
-			*x0 *= *alpha;
+			*xp *= *alpha;
 
-			x0 += 1;
+			xp += 1;
 		}
 	}
 	else
@@ -434,9 +438,9 @@ void bli_dscalv_zen_int10
 
 		for ( i = 0; i < n; ++i )
 		{
-			*x0 *= alphac;
+			*xp *= alphac;
 
-			x0 += incx;
+			xp += incx;
 		}
 	}
 }
@@ -452,14 +456,12 @@ void bli_cscalv_zen_int10
      (
              conj_t  conjalpha,
              dim_t   n,
-       const scomplex* alpha,
-             scomplex* x, inc_t incx,
+       const void*   alpha,
+             void*   x, inc_t incx,
        const cntx_t* cntx
      )
 {
-	const num_t dt = BLIS_SCOMPLEX;
-
-	cscalv_ker_ft f = bli_cntx_get_ukr_dt( dt, BLIS_SCALV_KER, cntx );
+	scalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_SCOMPLEX, BLIS_SCALV_KER, cntx );
 
 	f
 	(
diff --git a/kernels/zen/1/bli_setv_zen_int.c b/kernels/zen/1/bli_setv_zen_int.c
index c706f39ef..8bf5065d5 100644
--- a/kernels/zen/1/bli_setv_zen_int.c
+++ b/kernels/zen/1/bli_setv_zen_int.c
@@ -41,11 +41,14 @@ void bli_ssetv_zen_int
      (
              conj_t  conjalpha,
              dim_t   n,
-       const float*  alpha,
-             float*  x, inc_t incx,
+       const void*   alpha0,
+             void*   x0, inc_t incx,
        const cntx_t* cntx
      )
 {
+	const float* alpha = alpha0;
+	      float* x     = x0;
+
 	const dim_t num_elem_per_reg = 8;
 	dim_t       i = 0;
 	__m256      alphav;
@@ -136,11 +139,14 @@ void bli_dsetv_zen_int
      (
              conj_t  conjalpha,
              dim_t   n,
-       const double* alpha,
-             double* x, inc_t incx,
+       const void*   alpha0,
+             void*   x0, inc_t incx,
        const cntx_t* cntx
      )
 {
+	const double* alpha = alpha0;
+	      double* x     = x0;
+
 	const dim_t num_elem_per_reg = 4;
 	dim_t       i = 0;
 	__m256d     alphav;
diff --git a/kernels/zen/1/bli_swapv_zen_int8.c b/kernels/zen/1/bli_swapv_zen_int8.c
index e62474be2..09ed1cf83 100644
--- a/kernels/zen/1/bli_swapv_zen_int8.c
+++ b/kernels/zen/1/bli_swapv_zen_int8.c
@@ -57,11 +57,13 @@ typedef union
 void bli_sswapv_zen_int8
      (
              dim_t   n,
-             float*  x, inc_t incx,
-             float*  y, inc_t incy,
+             void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	      float*    x = x0;
+	      float*    y = y0;
 
 	const dim_t     n_elem_per_reg = 8;
 	dim_t           i = 0;
@@ -72,109 +74,109 @@ void bli_sswapv_zen_int8
 	// If the vector dimension is zero, return early.
 	if ( bli_zero_dim1( n ) ) return;
 
-	float* restrict x0 = x;
-	float* restrict y0 = y;
+	float* restrict xp = x;
+	float* restrict yp = y;
 
 	if ( incx == 1 && incy == 1 )
 	{
 		for ( i = 0; ( i + 63 ) < n; i += 64 )
 		{
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
-			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
-			xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg );
-			xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg );
-			xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg );
-
-			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
-			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
-			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
-			yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
-			yv[5] = _mm256_loadu_ps( y0 + 5*n_elem_per_reg );
-			yv[6] = _mm256_loadu_ps( y0 + 6*n_elem_per_reg );
-			yv[7] = _mm256_loadu_ps( y0 + 7*n_elem_per_reg );
-
-			_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]);
-			_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), yv[1]);
-			_mm256_storeu_ps( (x0 + 2*n_elem_per_reg), yv[2]);
-			_mm256_storeu_ps( (x0 + 3*n_elem_per_reg), yv[3]);
-			_mm256_storeu_ps( (x0 + 4*n_elem_per_reg), yv[4]);
-			_mm256_storeu_ps( (x0 + 5*n_elem_per_reg), yv[5]);
-			_mm256_storeu_ps( (x0 + 6*n_elem_per_reg), yv[6]);
-			_mm256_storeu_ps( (x0 + 7*n_elem_per_reg), yv[7]);
-
-			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]);
-			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), xv[1]);
-			_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), xv[2]);
-			_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), xv[3]);
-			_mm256_storeu_ps( (y0 + 4*n_elem_per_reg), xv[4]);
-			_mm256_storeu_ps( (y0 + 5*n_elem_per_reg), xv[5]);
-			_mm256_storeu_ps( (y0 + 6*n_elem_per_reg), xv[6]);
-			_mm256_storeu_ps( (y0 + 7*n_elem_per_reg), xv[7]);
-
-			x0 += 8*n_elem_per_reg;
-			y0 += 8*n_elem_per_reg;
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( xp + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_ps( xp + 4*n_elem_per_reg );
+			xv[5] = _mm256_loadu_ps( xp + 5*n_elem_per_reg );
+			xv[6] = _mm256_loadu_ps( xp + 6*n_elem_per_reg );
+			xv[7] = _mm256_loadu_ps( xp + 7*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( yp + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_ps( yp + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_ps( yp + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_ps( yp + 4*n_elem_per_reg );
+			yv[5] = _mm256_loadu_ps( yp + 5*n_elem_per_reg );
+			yv[6] = _mm256_loadu_ps( yp + 6*n_elem_per_reg );
+			yv[7] = _mm256_loadu_ps( yp + 7*n_elem_per_reg );
+
+			_mm256_storeu_ps( (xp + 0*n_elem_per_reg), yv[0]);
+			_mm256_storeu_ps( (xp + 1*n_elem_per_reg), yv[1]);
+			_mm256_storeu_ps( (xp + 2*n_elem_per_reg), yv[2]);
+			_mm256_storeu_ps( (xp + 3*n_elem_per_reg), yv[3]);
+			_mm256_storeu_ps( (xp + 4*n_elem_per_reg), yv[4]);
+			_mm256_storeu_ps( (xp + 5*n_elem_per_reg), yv[5]);
+			_mm256_storeu_ps( (xp + 6*n_elem_per_reg), yv[6]);
+			_mm256_storeu_ps( (xp + 7*n_elem_per_reg), yv[7]);
+
+			_mm256_storeu_ps( (yp + 0*n_elem_per_reg), xv[0]);
+			_mm256_storeu_ps( (yp + 1*n_elem_per_reg), xv[1]);
+			_mm256_storeu_ps( (yp + 2*n_elem_per_reg), xv[2]);
+			_mm256_storeu_ps( (yp + 3*n_elem_per_reg), xv[3]);
+			_mm256_storeu_ps( (yp + 4*n_elem_per_reg), xv[4]);
+			_mm256_storeu_ps( (yp + 5*n_elem_per_reg), xv[5]);
+			_mm256_storeu_ps( (yp + 6*n_elem_per_reg), xv[6]);
+			_mm256_storeu_ps( (yp + 7*n_elem_per_reg), xv[7]);
+
+			xp += 8*n_elem_per_reg;
+			yp += 8*n_elem_per_reg;
 		}
 
 		for ( ; ( i + 31 ) < n; i += 32 )
 		{
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
-
-			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
-			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
-			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
-
-			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]);
-			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), xv[1]);
-			_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), xv[2]);
-			_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), xv[3]);
-
-			_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]);
-			_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), yv[1]);
-			_mm256_storeu_ps( (x0 + 2*n_elem_per_reg), yv[2]);
-			_mm256_storeu_ps( (x0 + 3*n_elem_per_reg), yv[3]);
-
-			x0 += 4*n_elem_per_reg;
-			y0 += 4*n_elem_per_reg;
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( xp + 3*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( yp + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_ps( yp + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_ps( yp + 3*n_elem_per_reg );
+
+			_mm256_storeu_ps( (yp + 0*n_elem_per_reg), xv[0]);
+			_mm256_storeu_ps( (yp + 1*n_elem_per_reg), xv[1]);
+			_mm256_storeu_ps( (yp + 2*n_elem_per_reg), xv[2]);
+			_mm256_storeu_ps( (yp + 3*n_elem_per_reg), xv[3]);
+
+			_mm256_storeu_ps( (xp + 0*n_elem_per_reg), yv[0]);
+			_mm256_storeu_ps( (xp + 1*n_elem_per_reg), yv[1]);
+			_mm256_storeu_ps( (xp + 2*n_elem_per_reg), yv[2]);
+			_mm256_storeu_ps( (xp + 3*n_elem_per_reg), yv[3]);
+
+			xp += 4*n_elem_per_reg;
+			yp += 4*n_elem_per_reg;
 		}
 
 		for ( ; ( i + 15 ) < n; i += 16 )
 		{
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( xp + 1*n_elem_per_reg );
 
-			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+			yv[0] = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( yp + 1*n_elem_per_reg );
 
-			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]);
-			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), xv[1]);
+			_mm256_storeu_ps( (yp + 0*n_elem_per_reg), xv[0]);
+			_mm256_storeu_ps( (yp + 1*n_elem_per_reg), xv[1]);
 
-			_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]);
-			_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), yv[1]);
+			_mm256_storeu_ps( (xp + 0*n_elem_per_reg), yv[0]);
+			_mm256_storeu_ps( (xp + 1*n_elem_per_reg), yv[1]);
 
-			x0 += 2*n_elem_per_reg;
-			y0 += 2*n_elem_per_reg;
+			xp += 2*n_elem_per_reg;
+			yp += 2*n_elem_per_reg;
 		}
 
 		for ( ; ( i + 7 ) < n; i += 8 )
 		{
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			xv[0] = _mm256_loadu_ps( xp + 0*n_elem_per_reg );
 
-			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+			yv[0] = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
 
-			_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]);
+			_mm256_storeu_ps( (xp + 0*n_elem_per_reg), yv[0]);
 
-			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]);
+			_mm256_storeu_ps( (yp + 0*n_elem_per_reg), xv[0]);
 
-			x0 += 1*n_elem_per_reg;
-			y0 += 1*n_elem_per_reg;
+			xp += 1*n_elem_per_reg;
+			yp += 1*n_elem_per_reg;
 		}
 
 		for ( ; (i + 0) < n; i += 1 )
@@ -186,10 +188,10 @@ void bli_sswapv_zen_int8
 	{
 		for ( i = 0; i < n; ++i )
 		{
-			PASTEMAC(s,swaps)( (*x0), (*y0) );
+			PASTEMAC(s,swaps)( (*xp), (*yp) );
 
-			x0 += incx;
-			y0 += incy;
+			xp += incx;
+			yp += incy;
 		}
 	}
 
@@ -200,11 +202,14 @@ void bli_sswapv_zen_int8
 void bli_dswapv_zen_int8
      (
              dim_t   n,
-             double* x, inc_t incx,
-             double* y, inc_t incy,
+             void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	      double*   x = x0;
+	      double*   y = y0;
+
 	const dim_t      n_elem_per_reg = 4;
 	dim_t            i = 0;
 
@@ -214,109 +219,109 @@ void bli_dswapv_zen_int8
 	// If the vector dimension is zero, return early.
 	if ( bli_zero_dim1( n ) ) return;
 
-	double* restrict x0 = x;
-	double* restrict y0 = y;
+	double* restrict xp = x;
+	double* restrict yp = y;
 
 	if ( incx == 1 && incy == 1 )
 	{
 		for ( ; ( i + 31 ) < n; i += 32 )
 		{
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
-			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
-			xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg );
-			xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg );
-			xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg );
-
-			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
-			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
-			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
-			yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
-			yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg );
-			yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg );
-			yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg );
-
-			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]);
-			_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), yv[1]);
-			_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), yv[2]);
-			_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), yv[3]);
-			_mm256_storeu_pd( (x0 + 4*n_elem_per_reg), yv[4]);
-			_mm256_storeu_pd( (x0 + 5*n_elem_per_reg), yv[5]);
-			_mm256_storeu_pd( (x0 + 6*n_elem_per_reg), yv[6]);
-			_mm256_storeu_pd( (x0 + 7*n_elem_per_reg), yv[7]);
-
-			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]);
-			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), xv[1]);
-			_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), xv[2]);
-			_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), xv[3]);
-			_mm256_storeu_pd( (y0 + 4*n_elem_per_reg), xv[4]);
-			_mm256_storeu_pd( (y0 + 5*n_elem_per_reg), xv[5]);
-			_mm256_storeu_pd( (y0 + 6*n_elem_per_reg), xv[6]);
-			_mm256_storeu_pd( (y0 + 7*n_elem_per_reg), xv[7]);
-
-			x0 += 8*n_elem_per_reg;
-			y0 += 8*n_elem_per_reg;
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( xp + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_pd( xp + 4*n_elem_per_reg );
+			xv[5] = _mm256_loadu_pd( xp + 5*n_elem_per_reg );
+			xv[6] = _mm256_loadu_pd( xp + 6*n_elem_per_reg );
+			xv[7] = _mm256_loadu_pd( xp + 7*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_pd( yp + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_pd( yp + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_pd( yp + 4*n_elem_per_reg );
+			yv[5] = _mm256_loadu_pd( yp + 5*n_elem_per_reg );
+			yv[6] = _mm256_loadu_pd( yp + 6*n_elem_per_reg );
+			yv[7] = _mm256_loadu_pd( yp + 7*n_elem_per_reg );
+
+			_mm256_storeu_pd( (xp + 0*n_elem_per_reg), yv[0]);
+			_mm256_storeu_pd( (xp + 1*n_elem_per_reg), yv[1]);
+			_mm256_storeu_pd( (xp + 2*n_elem_per_reg), yv[2]);
+			_mm256_storeu_pd( (xp + 3*n_elem_per_reg), yv[3]);
+			_mm256_storeu_pd( (xp + 4*n_elem_per_reg), yv[4]);
+			_mm256_storeu_pd( (xp + 5*n_elem_per_reg), yv[5]);
+			_mm256_storeu_pd( (xp + 6*n_elem_per_reg), yv[6]);
+			_mm256_storeu_pd( (xp + 7*n_elem_per_reg), yv[7]);
+
+			_mm256_storeu_pd( (yp + 0*n_elem_per_reg), xv[0]);
+			_mm256_storeu_pd( (yp + 1*n_elem_per_reg), xv[1]);
+			_mm256_storeu_pd( (yp + 2*n_elem_per_reg), xv[2]);
+			_mm256_storeu_pd( (yp + 3*n_elem_per_reg), xv[3]);
+			_mm256_storeu_pd( (yp + 4*n_elem_per_reg), xv[4]);
+			_mm256_storeu_pd( (yp + 5*n_elem_per_reg), xv[5]);
+			_mm256_storeu_pd( (yp + 6*n_elem_per_reg), xv[6]);
+			_mm256_storeu_pd( (yp + 7*n_elem_per_reg), xv[7]);
+
+			xp += 8*n_elem_per_reg;
+			yp += 8*n_elem_per_reg;
 		}
 
 		for ( ; ( i + 15 ) < n; i += 16 )
 		{
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
-
-			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
-			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
-			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
-
-			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]);
-			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), xv[1]);
-			_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), xv[2]);
-			_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), xv[3]);
-
-			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]);
-			_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), yv[1]);
-			_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), yv[2]);
-			_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), yv[3]);
-
-			x0 += 4*n_elem_per_reg;
-			y0 += 4*n_elem_per_reg;
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( xp + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( xp + 3*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_pd( yp + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_pd( yp + 3*n_elem_per_reg );
+
+			_mm256_storeu_pd( (yp + 0*n_elem_per_reg), xv[0]);
+			_mm256_storeu_pd( (yp + 1*n_elem_per_reg), xv[1]);
+			_mm256_storeu_pd( (yp + 2*n_elem_per_reg), xv[2]);
+			_mm256_storeu_pd( (yp + 3*n_elem_per_reg), xv[3]);
+
+			_mm256_storeu_pd( (xp + 0*n_elem_per_reg), yv[0]);
+			_mm256_storeu_pd( (xp + 1*n_elem_per_reg), yv[1]);
+			_mm256_storeu_pd( (xp + 2*n_elem_per_reg), yv[2]);
+			_mm256_storeu_pd( (xp + 3*n_elem_per_reg), yv[3]);
+
+			xp += 4*n_elem_per_reg;
+			yp += 4*n_elem_per_reg;
 		}
 
 		for ( ; ( i + 7 ) < n; i += 8 )
 		{
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( xp + 1*n_elem_per_reg );
 
-			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+			yv[0] = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
 
-			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]);
-			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), xv[1]);
+			_mm256_storeu_pd( (yp + 0*n_elem_per_reg), xv[0]);
+			_mm256_storeu_pd( (yp + 1*n_elem_per_reg), xv[1]);
 
-			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]);
-			_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), yv[1]);
+			_mm256_storeu_pd( (xp + 0*n_elem_per_reg), yv[0]);
+			_mm256_storeu_pd( (xp + 1*n_elem_per_reg), yv[1]);
 
-			x0 += 2*n_elem_per_reg;
-			y0 += 2*n_elem_per_reg;
+			xp += 2*n_elem_per_reg;
+			yp += 2*n_elem_per_reg;
 		}
 
 		for ( ; ( i + 3 ) < n; i += 4 )
 		{
-			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			xv[0] = _mm256_loadu_pd( xp + 0*n_elem_per_reg );
 
-			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+			yv[0] = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
 
-			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]);
+			_mm256_storeu_pd( (yp + 0*n_elem_per_reg), xv[0]);
 
-			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]);
+			_mm256_storeu_pd( (xp + 0*n_elem_per_reg), yv[0]);
 
-			x0 += 1*n_elem_per_reg;
-			y0 += 1*n_elem_per_reg;
+			xp += 1*n_elem_per_reg;
+			yp += 1*n_elem_per_reg;
 		}
 
 		for ( ; (i + 0) < n; i += 1 )
@@ -328,10 +333,10 @@ void bli_dswapv_zen_int8
 	{
 		for ( i = 0; i < n; ++i )
 		{
-			PASTEMAC(d,swaps)( (*x0), (*y0) );
+			PASTEMAC(d,swaps)( (*xp), (*yp) );
 
-			x0 += incx;
-			y0 += incy;
+			xp += incx;
+			yp += incy;
 		}
 	}
 }
diff --git a/kernels/zen/1f/bli_axpyf_zen_int_4.c b/kernels/zen/1f/bli_axpyf_zen_int_4.c
index 24f66be0b..4e50b4f1c 100644
--- a/kernels/zen/1f/bli_axpyf_zen_int_4.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_4.c
@@ -38,17 +38,22 @@
 
 void bli_caxpyf_zen_int_4
      (
-             conj_t    conja,
-             conj_t    conjx,
-             dim_t     m,
-             dim_t     b_n,
-       const scomplex* alpha,
-       const scomplex* a, inc_t inca, inc_t lda,
-       const scomplex* x, inc_t incx,
-             scomplex* y, inc_t incy,
-       const cntx_t*   cntx
+             conj_t  conja,
+             conj_t  conjx,
+             dim_t   m,
+             dim_t   b_n,
+       const void*   alpha0,
+       const void*   a0, inc_t inca, inc_t lda,
+       const void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
+       const cntx_t* cntx
      )
 {
+	const scomplex* restrict alpha = alpha0;
+	const scomplex* restrict a     = a0;
+	const scomplex* restrict x     = x0;
+	      scomplex* restrict y     = y0;
+
     inc_t fuse_fac = 4;
     inc_t i;
 
@@ -58,7 +63,7 @@ void bli_caxpyf_zen_int_4
     __m256 ymm12, ymm13;
 
     float* ap[4];
-    float* y0 = (float*)y;
+    float* yp = (float*)y;
 
     scomplex            chi0;
     scomplex            chi1;
@@ -72,6 +77,7 @@ void bli_caxpyf_zen_int_4
     {
         setPlusOne = -1;
     }
+
     // If either dimension is zero, or if alpha is zero, return early.
     if ( bli_zero_dim2( m, b_n ) || bli_ceq0( *alpha ) ) return;
 
@@ -79,9 +85,9 @@ void bli_caxpyf_zen_int_4
     // operation as a loop over axpyv.
     if ( b_n != fuse_fac )
     {
-	if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
+        if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
 
-        caxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx );
+        axpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx );
 
         for ( i = 0; i < b_n; ++i )
         {
@@ -186,7 +192,7 @@ void bli_caxpyf_zen_int_4
             ymm13 = _mm256_fmadd_ps(ymm10, ymm7, ymm13);
 
 	    //load Y vector
-            ymm10 = _mm256_loadu_ps(y0 + 0);
+            ymm10 = _mm256_loadu_ps(yp + 0);
 
             if(bli_is_noconj(conja))
             {
@@ -203,9 +209,9 @@ void bli_caxpyf_zen_int_4
 
             ymm12 = _mm256_add_ps(ymm8, ymm10);
 
-            _mm256_storeu_ps((float*)(y0), ymm12);
+            _mm256_storeu_ps((float*)(yp), ymm12);
 
-            y0 += 8;
+            yp += 8;
             ap[0] += 8;
             ap[1] += 8;
             ap[2] += 8;
@@ -217,7 +223,7 @@ void bli_caxpyf_zen_int_4
         for ( i = 0; (i + 0) < n2 ; ++i )
         {
 
-            scomplex       y0c = *(scomplex*)y0;
+            scomplex       y0c = *(scomplex*)yp;
 
             const scomplex a0c = *(scomplex*)ap[0];
             const scomplex a1c = *(scomplex*)ap[1];
@@ -234,13 +240,13 @@ void bli_caxpyf_zen_int_4
             y0c.imag += chi2.imag * a2c.real + chi2.real * a2c.imag * setPlusOne;
             y0c.imag += chi3.imag * a3c.real + chi3.real * a3c.imag * setPlusOne;
 
-            *(scomplex*)y0 = y0c;
+            *(scomplex*)yp = y0c;
 
             ap[0] += 2;
             ap[1] += 2;
             ap[2] += 2;
             ap[3] += 2;
-            y0 += 2;
+            yp += 2;
         }
     //PASTEMAC(c,fprintm)(stdout, "Y after A*x in axpyf",m, 1, (scomplex*)y, 1, 1, "%4.1f", "");
 
@@ -249,7 +255,7 @@ void bli_caxpyf_zen_int_4
     {
         for (i = 0 ; (i + 0) < m ; ++i )
         {
-            scomplex       y0c = *(scomplex*)y0;
+            scomplex       y0c = *(scomplex*)yp;
             const scomplex a0c = *(scomplex*)ap[0];
             const scomplex a1c = *(scomplex*)ap[1];
             const scomplex a2c = *(scomplex*)ap[2];
@@ -265,13 +271,13 @@ void bli_caxpyf_zen_int_4
             y0c.imag += chi2.imag * a2c.real + chi2.real * a2c.imag * setPlusOne;
             y0c.imag += chi3.imag * a3c.real + chi3.real * a3c.imag * setPlusOne;
 
-            *(scomplex*)y0 = y0c;
+            *(scomplex*)yp = y0c;
 
             ap[0] += inca;
             ap[1] += inca;
             ap[2] += inca;
             ap[3] += inca;
-            y0 += incy;
+            yp += incy;
         }
     }
 }
diff --git a/kernels/zen/1f/bli_axpyf_zen_int_5.c b/kernels/zen/1f/bli_axpyf_zen_int_5.c
index d4427e86e..78477d3fa 100644
--- a/kernels/zen/1f/bli_axpyf_zen_int_5.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_5.c
@@ -65,13 +65,18 @@ void bli_saxpyf_zen_int_5
              conj_t  conjx,
              dim_t   m,
              dim_t   b_n,
-       const float*  alpha,
-       const float*  a, inc_t inca, inc_t lda,
-       const float*  x, inc_t incx,
-             float*  y, inc_t incy,
+       const void*   alpha0,
+       const void*   a0, inc_t inca, inc_t lda,
+       const void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	const float* restrict alpha = alpha0;
+	const float* restrict a     = a0;
+	const float* restrict x     = x0;
+	      float* restrict y     = y0;
+
     const dim_t      fuse_fac       = 5;
 
     const dim_t      n_elem_per_reg = 8;
@@ -102,11 +107,11 @@ void bli_saxpyf_zen_int_5
     {
         if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
 
-        saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
+        axpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
 
         for ( i = 0; i < b_n; ++i )
         {
-            const float* restrict a1   = a + (0  )*inca + (i  )*lda;
+            const float* restrict ap1   = a + (0  )*inca + (i  )*lda;
             const float* restrict chi1 = x + (i  )*incx;
                   float* restrict y1   = y + (0  )*incy;
                   float           alpha_chi1;
@@ -119,7 +124,7 @@ void bli_saxpyf_zen_int_5
               conja,
               m,
               &alpha_chi1,
-              a1, inca,
+              ap1, inca,
               y1, incy,
               cntx
             );
@@ -130,12 +135,12 @@ void bli_saxpyf_zen_int_5
 
     // At this point, we know that b_n is exactly equal to the fusing factor.
 
-    const float* restrict a0   = a + 0*lda;
-    const float* restrict a1   = a + 1*lda;
-    const float* restrict a2   = a + 2*lda;
-    const float* restrict a3   = a + 3*lda;
-    const float* restrict a4   = a + 4*lda;
-          float*          y0   = y;
+    const float* restrict ap0   = a + 0*lda;
+    const float* restrict ap1   = a + 1*lda;
+    const float* restrict ap2   = a + 2*lda;
+    const float* restrict ap3   = a + 3*lda;
+    const float* restrict ap4   = a + 4*lda;
+          float*          yp   = y;
 
     chi0 = *( x + 0*incx );
     chi1 = *( x + 1*incx );
@@ -165,23 +170,23 @@ void bli_saxpyf_zen_int_5
         for ( i = 0; (i + 15) < m; i += 16 )
         {
             // Load the input values.
-            y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-            y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+            y0v.v = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_ps( yp + 1*n_elem_per_reg );
 
-            a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
-            a10v.v = _mm256_loadu_ps( a0 + 1*n_elem_per_reg );
+            a00v.v = _mm256_loadu_ps( ap0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_ps( ap0 + 1*n_elem_per_reg );
 
-            a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg );
-            a11v.v = _mm256_loadu_ps( a1 + 1*n_elem_per_reg );
+            a01v.v = _mm256_loadu_ps( ap1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_ps( ap1 + 1*n_elem_per_reg );
 
-            a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg );
-            a12v.v = _mm256_loadu_ps( a2 + 1*n_elem_per_reg );
+            a02v.v = _mm256_loadu_ps( ap2 + 0*n_elem_per_reg );
+            a12v.v = _mm256_loadu_ps( ap2 + 1*n_elem_per_reg );
 
-            a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg );
-            a13v.v = _mm256_loadu_ps( a3 + 1*n_elem_per_reg );
+            a03v.v = _mm256_loadu_ps( ap3 + 0*n_elem_per_reg );
+            a13v.v = _mm256_loadu_ps( ap3 + 1*n_elem_per_reg );
 
-            a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg );
-            a14v.v = _mm256_loadu_ps( a4 + 1*n_elem_per_reg );
+            a04v.v = _mm256_loadu_ps( ap4 + 0*n_elem_per_reg );
+            a14v.v = _mm256_loadu_ps( ap4 + 1*n_elem_per_reg );
 
             // perform : y += alpha * x;
             y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v );
@@ -201,27 +206,27 @@ void bli_saxpyf_zen_int_5
 
 
             // Store the output.
-            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
-            _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), y1v.v );
-
-            y0 += n_iter_unroll * n_elem_per_reg;
-            a0 += n_iter_unroll * n_elem_per_reg;
-            a1 += n_iter_unroll * n_elem_per_reg;
-            a2 += n_iter_unroll * n_elem_per_reg;
-            a3 += n_iter_unroll * n_elem_per_reg;
-            a4 += n_iter_unroll * n_elem_per_reg;
+            _mm256_storeu_ps( (yp + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_ps( (yp + 1*n_elem_per_reg), y1v.v );
+
+            yp += n_iter_unroll * n_elem_per_reg;
+            ap0 += n_iter_unroll * n_elem_per_reg;
+            ap1 += n_iter_unroll * n_elem_per_reg;
+            ap2 += n_iter_unroll * n_elem_per_reg;
+            ap3 += n_iter_unroll * n_elem_per_reg;
+            ap4 += n_iter_unroll * n_elem_per_reg;
         }
 
         for( ; (i + 7) < m; i += 8 )
         {
             // Load the input values.
-            y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+            y0v.v = _mm256_loadu_ps( yp + 0*n_elem_per_reg );
 
-            a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
-            a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg );
-            a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg );
-            a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg );
-            a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg );
+            a00v.v = _mm256_loadu_ps( ap0 + 0*n_elem_per_reg );
+            a01v.v = _mm256_loadu_ps( ap1 + 0*n_elem_per_reg );
+            a02v.v = _mm256_loadu_ps( ap2 + 0*n_elem_per_reg );
+            a03v.v = _mm256_loadu_ps( ap3 + 0*n_elem_per_reg );
+            a04v.v = _mm256_loadu_ps( ap4 + 0*n_elem_per_reg );
 
 
             // perform : y += alpha * x;
@@ -232,26 +237,26 @@ void bli_saxpyf_zen_int_5
             y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v );
 
             // Store the output.
-            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
-
-            y0 += n_elem_per_reg;
-            a0 += n_elem_per_reg;
-            a1 += n_elem_per_reg;
-            a2 += n_elem_per_reg;
-            a3 += n_elem_per_reg;
-            a4 += n_elem_per_reg;
+            _mm256_storeu_ps( (yp + 0*n_elem_per_reg), y0v.v );
+
+            yp += n_elem_per_reg;
+            ap0 += n_elem_per_reg;
+            ap1 += n_elem_per_reg;
+            ap2 += n_elem_per_reg;
+            ap3 += n_elem_per_reg;
+            ap4 += n_elem_per_reg;
         }
 
         // If there are leftover iterations, perform them with scalar code.
         for ( ; (i + 0) < m ; ++i )
         {
-            double       y0c = *y0;
+            double       y0c = *yp;
 
-            const float a0c = *a0;
-            const float a1c = *a1;
-            const float a2c = *a2;
-            const float a3c = *a3;
-            const float a4c = *a4;
+            const float a0c = *ap0;
+            const float a1c = *ap1;
+            const float a2c = *ap2;
+            const float a3c = *ap3;
+            const float a4c = *ap4;
 
             y0c += chi0 * a0c;
             y0c += chi1 * a1c;
@@ -259,27 +264,27 @@ void bli_saxpyf_zen_int_5
             y0c += chi3 * a3c;
             y0c += chi4 * a4c;
 
-            *y0 = y0c;
+            *yp = y0c;
 
-            a0 += 1;
-            a1 += 1;
-            a2 += 1;
-            a3 += 1;
-            a4 += 1;
-            y0 += 1;
+            ap0 += 1;
+            ap1 += 1;
+            ap2 += 1;
+            ap3 += 1;
+            ap4 += 1;
+            yp += 1;
         }
     }
     else
     {
         for ( i = 0; (i + 0) < m ; ++i )
         {
-            double       y0c = *y0;
+            double       y0c = *yp;
 
-            const float a0c = *a0;
-            const float a1c = *a1;
-            const float a2c = *a2;
-            const float a3c = *a3;
-            const float a4c = *a4;
+            const float a0c = *ap0;
+            const float a1c = *ap1;
+            const float a2c = *ap2;
+            const float a3c = *ap3;
+            const float a4c = *ap4;
 
             y0c += chi0 * a0c;
             y0c += chi1 * a1c;
@@ -287,14 +292,14 @@ void bli_saxpyf_zen_int_5
             y0c += chi3 * a3c;
             y0c += chi4 * a4c;
 
-            *y0 = y0c;
+            *yp = y0c;
 
-            a0 += inca;
-            a1 += inca;
-            a2 += inca;
-            a3 += inca;
-            a4 += inca;
-            y0 += incy;
+            ap0 += inca;
+            ap1 += inca;
+            ap2 += inca;
+            ap3 += inca;
+            ap4 += inca;
+            yp += incy;
         }
 
     }
@@ -309,13 +314,18 @@ void bli_daxpyf_zen_int_5
              conj_t  conjx,
              dim_t   m,
              dim_t   b_n,
-       const double* alpha,
-       const double* a, inc_t inca, inc_t lda,
-       const double* x, inc_t incx,
-             double* y, inc_t incy,
+       const void*   alpha0,
+       const void*   a0, inc_t inca, inc_t lda,
+       const void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	const double* restrict alpha = alpha0;
+	const double* restrict a     = a0;
+	const double* restrict x     = x0;
+	      double* restrict y     = y0;
+
     const dim_t      fuse_fac       = 5;
 
     const dim_t      n_elem_per_reg = 4;
@@ -346,11 +356,11 @@ void bli_daxpyf_zen_int_5
     {
         if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
 
-        daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+        axpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
 
         for ( i = 0; i < b_n; ++i )
         {
-            const double* restrict a1   = a + (0  )*inca + (i  )*lda;
+            const double* restrict ap1   = a + (0  )*inca + (i  )*lda;
             const double* restrict chi1 = x + (i  )*incx;
                   double* restrict y1   = y + (0  )*incy;
                   double           alpha_chi1;
@@ -363,7 +373,7 @@ void bli_daxpyf_zen_int_5
               conja,
               m,
               &alpha_chi1,
-              a1, inca,
+              ap1, inca,
               y1, incy,
               cntx
             );
@@ -374,12 +384,12 @@ void bli_daxpyf_zen_int_5
 
     // At this point, we know that b_n is exactly equal to the fusing factor.
 
-    const double* restrict a0   = a + 0*lda;
-    const double* restrict a1   = a + 1*lda;
-    const double* restrict a2   = a + 2*lda;
-    const double* restrict a3   = a + 3*lda;
-    const double* restrict a4   = a + 4*lda;
-          double* restrict y0   = y;
+    const double* restrict ap0   = a + 0*lda;
+    const double* restrict ap1   = a + 1*lda;
+    const double* restrict ap2   = a + 2*lda;
+    const double* restrict ap3   = a + 3*lda;
+    const double* restrict ap4   = a + 4*lda;
+          double* restrict yp   = y;
 
     chi0 = *( x + 0*incx );
     chi1 = *( x + 1*incx );
@@ -409,23 +419,23 @@ void bli_daxpyf_zen_int_5
         for ( i = 0; (i + 7) < m; i += 8 )
         {
             // Load the input values.
-            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+            y0v.v = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
 
-            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
-            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
+            a00v.v = _mm256_loadu_pd( ap0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( ap0 + 1*n_elem_per_reg );
 
-            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
-            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
+            a01v.v = _mm256_loadu_pd( ap1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( ap1 + 1*n_elem_per_reg );
 
-            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
-            a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg );
+            a02v.v = _mm256_loadu_pd( ap2 + 0*n_elem_per_reg );
+            a12v.v = _mm256_loadu_pd( ap2 + 1*n_elem_per_reg );
 
-            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
-            a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg );
+            a03v.v = _mm256_loadu_pd( ap3 + 0*n_elem_per_reg );
+            a13v.v = _mm256_loadu_pd( ap3 + 1*n_elem_per_reg );
 
-            a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
-            a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg );
+            a04v.v = _mm256_loadu_pd( ap4 + 0*n_elem_per_reg );
+            a14v.v = _mm256_loadu_pd( ap4 + 1*n_elem_per_reg );
 
             // perform : y += alpha * x;
             y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
@@ -445,27 +455,27 @@ void bli_daxpyf_zen_int_5
 
 
             // Store the output.
-            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
-            _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v );
-
-            y0 += n_iter_unroll * n_elem_per_reg;
-            a0 += n_iter_unroll * n_elem_per_reg;
-            a1 += n_iter_unroll * n_elem_per_reg;
-            a2 += n_iter_unroll * n_elem_per_reg;
-            a3 += n_iter_unroll * n_elem_per_reg;
-            a4 += n_iter_unroll * n_elem_per_reg;
+            _mm256_storeu_pd( (double *)(yp + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (double *)(yp + 1*n_elem_per_reg), y1v.v );
+
+            yp += n_iter_unroll * n_elem_per_reg;
+            ap0 += n_iter_unroll * n_elem_per_reg;
+            ap1 += n_iter_unroll * n_elem_per_reg;
+            ap2 += n_iter_unroll * n_elem_per_reg;
+            ap3 += n_iter_unroll * n_elem_per_reg;
+            ap4 += n_iter_unroll * n_elem_per_reg;
         }
 
         for( ; (i + 3) < m; i += 4 )
         {
             // Load the input values.
-            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+            y0v.v = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
 
-            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
-            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
-            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
-            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
-            a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
+            a00v.v = _mm256_loadu_pd( ap0 + 0*n_elem_per_reg );
+            a01v.v = _mm256_loadu_pd( ap1 + 0*n_elem_per_reg );
+            a02v.v = _mm256_loadu_pd( ap2 + 0*n_elem_per_reg );
+            a03v.v = _mm256_loadu_pd( ap3 + 0*n_elem_per_reg );
+            a04v.v = _mm256_loadu_pd( ap4 + 0*n_elem_per_reg );
 
 
             // perform : y += alpha * x;
@@ -476,26 +486,26 @@ void bli_daxpyf_zen_int_5
             y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
 
             // Store the output.
-            _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
-
-            y0 += n_elem_per_reg;
-            a0 += n_elem_per_reg;
-            a1 += n_elem_per_reg;
-            a2 += n_elem_per_reg;
-            a3 += n_elem_per_reg;
-            a4 += n_elem_per_reg;
+            _mm256_storeu_pd( (yp + 0*n_elem_per_reg), y0v.v );
+
+            yp += n_elem_per_reg;
+            ap0 += n_elem_per_reg;
+            ap1 += n_elem_per_reg;
+            ap2 += n_elem_per_reg;
+            ap3 += n_elem_per_reg;
+            ap4 += n_elem_per_reg;
         }
 
         // If there are leftover iterations, perform them with scalar code.
         for ( ; (i + 0) < m ; ++i )
         {
-            double       y0c = *y0;
+            double       y0c = *yp;
 
-            const double a0c = *a0;
-            const double a1c = *a1;
-            const double a2c = *a2;
-            const double a3c = *a3;
-            const double a4c = *a4;
+            const double a0c = *ap0;
+            const double a1c = *ap1;
+            const double a2c = *ap2;
+            const double a3c = *ap3;
+            const double a4c = *ap4;
 
             y0c += chi0 * a0c;
             y0c += chi1 * a1c;
@@ -503,27 +513,27 @@ void bli_daxpyf_zen_int_5
             y0c += chi3 * a3c;
             y0c += chi4 * a4c;
 
-            *y0 = y0c;
+            *yp = y0c;
 
-            a0 += 1;
-            a1 += 1;
-            a2 += 1;
-            a3 += 1;
-            a4 += 1;
-            y0 += 1;
+            ap0 += 1;
+            ap1 += 1;
+            ap2 += 1;
+            ap3 += 1;
+            ap4 += 1;
+            yp += 1;
         }
     }
     else
     {
         for ( i = 0; (i + 0) < m ; ++i )
         {
-            double       y0c = *y0;
+            double       y0c = *yp;
 
-            const double a0c = *a0;
-            const double a1c = *a1;
-            const double a2c = *a2;
-            const double a3c = *a3;
-            const double a4c = *a4;
+            const double a0c = *ap0;
+            const double a1c = *ap1;
+            const double a2c = *ap2;
+            const double a3c = *ap3;
+            const double a4c = *ap4;
 
             y0c += chi0 * a0c;
             y0c += chi1 * a1c;
@@ -531,14 +541,14 @@ void bli_daxpyf_zen_int_5
             y0c += chi3 * a3c;
             y0c += chi4 * a4c;
 
-            *y0 = y0c;
+            *yp = y0c;
 
-            a0 += inca;
-            a1 += inca;
-            a2 += inca;
-            a3 += inca;
-            a4 += inca;
-            y0 += incy;
+            ap0 += inca;
+            ap1 += inca;
+            ap2 += inca;
+            ap3 += inca;
+            ap4 += inca;
+            yp += incy;
         }
 
     }
@@ -552,13 +562,18 @@ void bli_daxpyf_zen_int_16x2
              conj_t  conjx,
              dim_t   m,
              dim_t   b_n,
-       const double* alpha,
-       const double* a, inc_t inca, inc_t lda,
-       const double* x, inc_t incx,
-             double* y, inc_t incy,
+       const void*   alpha0,
+       const void*   a0, inc_t inca, inc_t lda,
+       const void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	const double* restrict alpha = alpha0;
+	const double* restrict a     = a0;
+	const double* restrict x     = x0;
+	      double* restrict y     = y0;
+
     const dim_t      fuse_fac       = 2;
 
     const dim_t      n_elem_per_reg = 4;
@@ -591,11 +606,13 @@ void bli_daxpyf_zen_int_16x2
     // operation as a loop over axpyv.
     if ( b_n != fuse_fac )
     {
-        daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+        if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
+
+        axpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
 
         for ( i = 0; i < b_n; ++i )
         {
-            const double* restrict a1   = a + (0  )*inca + (i  )*lda;
+            const double* restrict ap1   = a + (0  )*inca + (i  )*lda;
             const double* restrict chi1 = x + (i  )*incx;
                   double* restrict y1   = y + (0  )*incy;
                   double           alpha_chi1;
@@ -608,7 +625,7 @@ void bli_daxpyf_zen_int_16x2
               conja,
               m,
               &alpha_chi1,
-              a1, inca,
+              ap1, inca,
               y1, incy,
               cntx
             );
@@ -619,10 +636,10 @@ void bli_daxpyf_zen_int_16x2
 
     // At this point, we know that b_n is exactly equal to the fusing factor.
 
-    const double* restrict a0   = a + 0*lda;
-    const double* restrict a1   = a + 1*lda;
+    const double* restrict ap0   = a + 0*lda;
+    const double* restrict ap1   = a + 1*lda;
 
-          double* restrict y0   = y;
+          double* restrict yp   = y;
 
     chi0 = *( x + 0*incx );
     chi1 = *( x + 1*incx );
@@ -643,20 +660,20 @@ void bli_daxpyf_zen_int_16x2
         for ( i = 0; (i + 15) < m; i += 16 )
         {
             // Load the input values.
-            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
-            y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
-            y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+            y0v.v = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
+            y2v.v = _mm256_loadu_pd( yp + 2*n_elem_per_reg );
+            y3v.v = _mm256_loadu_pd( yp + 3*n_elem_per_reg );
 
-            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
-            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
-            a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg );
-            a30v.v = _mm256_loadu_pd( a0 + 3*n_elem_per_reg );
+            a00v.v = _mm256_loadu_pd( ap0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( ap0 + 1*n_elem_per_reg );
+            a20v.v = _mm256_loadu_pd( ap0 + 2*n_elem_per_reg );
+            a30v.v = _mm256_loadu_pd( ap0 + 3*n_elem_per_reg );
 
-            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
-            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
-            a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg );
-            a31v.v = _mm256_loadu_pd( a1 + 3*n_elem_per_reg );
+            a01v.v = _mm256_loadu_pd( ap1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( ap1 + 1*n_elem_per_reg );
+            a21v.v = _mm256_loadu_pd( ap1 + 2*n_elem_per_reg );
+            a31v.v = _mm256_loadu_pd( ap1 + 3*n_elem_per_reg );
 
             // perform : y += alpha * x;
             y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
@@ -670,30 +687,30 @@ void bli_daxpyf_zen_int_16x2
             y3v.v = _mm256_fmadd_pd( a31v.v, chi1v.v, y3v.v );
 
             // Store the output.
-            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
-            _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v );
-            _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v );
-            _mm256_storeu_pd( (double *)(y0 + 3*n_elem_per_reg), y3v.v );
-
-            y0 += n_iter_unroll * n_elem_per_reg;
-            a0 += n_iter_unroll * n_elem_per_reg;
-            a1 += n_iter_unroll * n_elem_per_reg;
+            _mm256_storeu_pd( (double *)(yp + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (double *)(yp + 1*n_elem_per_reg), y1v.v );
+            _mm256_storeu_pd( (double *)(yp + 2*n_elem_per_reg), y2v.v );
+            _mm256_storeu_pd( (double *)(yp + 3*n_elem_per_reg), y3v.v );
+
+            yp += n_iter_unroll * n_elem_per_reg;
+            ap0 += n_iter_unroll * n_elem_per_reg;
+            ap1 += n_iter_unroll * n_elem_per_reg;
         }
 
         for ( ; (i + 11) < m; i += 12 )
         {
             // Load the input values.
-            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
-            y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+            y0v.v = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
+            y2v.v = _mm256_loadu_pd( yp + 2*n_elem_per_reg );
 
-            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
-            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
-            a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg );
+            a00v.v = _mm256_loadu_pd( ap0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( ap0 + 1*n_elem_per_reg );
+            a20v.v = _mm256_loadu_pd( ap0 + 2*n_elem_per_reg );
 
-            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
-            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
-            a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg );
+            a01v.v = _mm256_loadu_pd( ap1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( ap1 + 1*n_elem_per_reg );
+            a21v.v = _mm256_loadu_pd( ap1 + 2*n_elem_per_reg );
 
             // perform : y += alpha * x;
             y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
@@ -705,25 +722,25 @@ void bli_daxpyf_zen_int_16x2
             y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v );
 
             // Store the output.
-            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
-            _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v );
-            _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v );
+            _mm256_storeu_pd( (double *)(yp + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (double *)(yp + 1*n_elem_per_reg), y1v.v );
+            _mm256_storeu_pd( (double *)(yp + 2*n_elem_per_reg), y2v.v );
 
-            y0 += 3 * n_elem_per_reg;
-            a0 += 3 * n_elem_per_reg;
-            a1 += 3 * n_elem_per_reg;
+            yp += 3 * n_elem_per_reg;
+            ap0 += 3 * n_elem_per_reg;
+            ap1 += 3 * n_elem_per_reg;
         }
         for ( ; (i + 7) < m; i += 8 )
         {
             // Load the input values.
-            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+            y0v.v = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
 
-            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
-            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
+            a00v.v = _mm256_loadu_pd( ap0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( ap0 + 1*n_elem_per_reg );
 
-            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
-            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
+            a01v.v = _mm256_loadu_pd( ap1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( ap1 + 1*n_elem_per_reg );
 
             // perform : y += alpha * x;
             y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
@@ -733,22 +750,22 @@ void bli_daxpyf_zen_int_16x2
             y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v );
 
             // Store the output.
-            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
-            _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v );
+            _mm256_storeu_pd( (double *)(yp + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (double *)(yp + 1*n_elem_per_reg), y1v.v );
 
-            y0 += 2 * n_elem_per_reg;
-            a0 += 2 * n_elem_per_reg;
-            a1 += 2 * n_elem_per_reg;
+            yp += 2 * n_elem_per_reg;
+            ap0 += 2 * n_elem_per_reg;
+            ap1 += 2 * n_elem_per_reg;
         }
 
         for ( ; (i + 3) < m; i += 4 )
         {
             // Load the input values.
-            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+            y0v.v = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
 
-            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+            a00v.v = _mm256_loadu_pd( ap0 + 0*n_elem_per_reg );
 
-            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+            a01v.v = _mm256_loadu_pd( ap1 + 0*n_elem_per_reg );
 
             // perform : y += alpha * x;
             y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
@@ -756,21 +773,21 @@ void bli_daxpyf_zen_int_16x2
             y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
 
             // Store the output.
-            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (double *)(yp + 0*n_elem_per_reg), y0v.v );
 
-            y0 += n_elem_per_reg;
-            a0 += n_elem_per_reg;
-            a1 += n_elem_per_reg;
+            yp += n_elem_per_reg;
+            ap0 += n_elem_per_reg;
+            ap1 += n_elem_per_reg;
         }
 
         for ( ; (i + 1) < m; i += 2 )
         {
             // Load the input values.
-            y4v.v = _mm_loadu_pd( y0 + 0*n_elem_per_reg );
+            y4v.v = _mm_loadu_pd( yp + 0*n_elem_per_reg );
 
-            a40v.v = _mm_loadu_pd( a0 + 0*n_elem_per_reg );
+            a40v.v = _mm_loadu_pd( ap0 + 0*n_elem_per_reg );
 
-            a41v.v = _mm_loadu_pd( a1 + 0*n_elem_per_reg );
+            a41v.v = _mm_loadu_pd( ap1 + 0*n_elem_per_reg );
 
             // perform : y += alpha * x;
             y4v.v = _mm_fmadd_pd( a40v.v, chi0v.xmm[0], y4v.v );
@@ -778,48 +795,48 @@ void bli_daxpyf_zen_int_16x2
             y4v.v = _mm_fmadd_pd( a41v.v, chi1v.xmm[0], y4v.v );
 
             // Store the output.
-            _mm_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y4v.v );
+            _mm_storeu_pd( (double *)(yp + 0*n_elem_per_reg), y4v.v );
 
-            y0 += 2;
-            a0 += 2;
-            a1 += 2;
+            yp += 2;
+            ap0 += 2;
+            ap1 += 2;
         }
 
         // If there are leftover iterations, perform them with scalar code.
         for ( ; (i + 0) < m ; ++i )
         {
-            double       y0c = *y0;
+            double       y0c = *yp;
 
-            const double a0c = *a0;
-            const double a1c = *a1;
+            const double a0c = *ap0;
+            const double a1c = *ap1;
 
             y0c += chi0 * a0c;
             y0c += chi1 * a1c;
 
-            *y0 = y0c;
+            *yp = y0c;
 
-            a0 += 1;
-            a1 += 1;
-            y0 += 1;
+            ap0 += 1;
+            ap1 += 1;
+            yp += 1;
         }
     }
     else
     {
         for ( i = 0; (i + 0) < m ; ++i )
         {
-            double       y0c = *y0;
+            double       y0c = *yp;
 
-            const double a0c = *a0;
-            const double a1c = *a1;
+            const double a0c = *ap0;
+            const double a1c = *ap1;
 
             y0c += chi0 * a0c;
             y0c += chi1 * a1c;
 
-            *y0 = y0c;
+            *yp = y0c;
 
-            a0 += inca;
-            a1 += inca;
-            y0 += incy;
+            ap0 += inca;
+            ap1 += inca;
+            yp += incy;
         }
 
     }
@@ -833,13 +850,18 @@ void bli_daxpyf_zen_int_16x4
              conj_t  conjx,
              dim_t   m,
              dim_t   b_n,
-       const double* alpha,
-       const double* a, inc_t inca, inc_t lda,
-       const double* x, inc_t incx,
-             double* y, inc_t incy,
+       const void*   alpha0,
+       const void*   a0, inc_t inca, inc_t lda,
+       const void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	const double* restrict alpha = alpha0;
+	const double* restrict a     = a0;
+	const double* restrict x     = x0;
+	      double* restrict y     = y0;
+
     const dim_t      fuse_fac       = 4;
 
     const dim_t      n_elem_per_reg = 4;
@@ -874,11 +896,11 @@ void bli_daxpyf_zen_int_16x4
     {
         if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
 
-        daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+        axpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
 
         for ( i = 0; i < b_n; ++i )
         {
-            const double* restrict a1   = a + (0  )*inca + (i  )*lda;
+            const double* restrict ap1   = a + (0  )*inca + (i  )*lda;
             const double* restrict chi1 = x + (i  )*incx;
                   double* restrict y1   = y + (0  )*incy;
                   double           alpha_chi1;
@@ -891,7 +913,7 @@ void bli_daxpyf_zen_int_16x4
               conja,
               m,
               &alpha_chi1,
-              a1, inca,
+              ap1, inca,
               y1, incy,
               cntx
             );
@@ -902,12 +924,12 @@ void bli_daxpyf_zen_int_16x4
 
     // At this point, we know that b_n is exactly equal to the fusing factor.
 
-    const double* restrict a0   = a + 0*lda;
-    const double* restrict a1   = a + 1*lda;
-    const double* restrict a2   = a + 2*lda;
-    const double* restrict a3   = a + 3*lda;
+    const double* restrict ap0   = a + 0*lda;
+    const double* restrict ap1   = a + 1*lda;
+    const double* restrict ap2   = a + 2*lda;
+    const double* restrict ap3   = a + 3*lda;
 
-          double* restrict y0   = y;
+          double* restrict yp   = y;
 
     chi0 = *( x + 0*incx );
     chi1 = *( x + 1*incx );
@@ -933,30 +955,30 @@ void bli_daxpyf_zen_int_16x4
         for ( i = 0; (i + 15) < m; i += 16 )
         {
             // Load the input values.
-            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
-            y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
-            y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
-
-            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
-            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
-            a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg );
-            a30v.v = _mm256_loadu_pd( a0 + 3*n_elem_per_reg );
-
-            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
-            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
-            a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg );
-            a31v.v = _mm256_loadu_pd( a1 + 3*n_elem_per_reg );
-
-            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
-            a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg );
-            a22v.v = _mm256_loadu_pd( a2 + 2*n_elem_per_reg );
-            a32v.v = _mm256_loadu_pd( a2 + 3*n_elem_per_reg );
-
-            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
-            a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg );
-            a23v.v = _mm256_loadu_pd( a3 + 2*n_elem_per_reg );
-            a33v.v = _mm256_loadu_pd( a3 + 3*n_elem_per_reg );
+            y0v.v = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
+            y2v.v = _mm256_loadu_pd( yp + 2*n_elem_per_reg );
+            y3v.v = _mm256_loadu_pd( yp + 3*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( ap0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( ap0 + 1*n_elem_per_reg );
+            a20v.v = _mm256_loadu_pd( ap0 + 2*n_elem_per_reg );
+            a30v.v = _mm256_loadu_pd( ap0 + 3*n_elem_per_reg );
+
+            a01v.v = _mm256_loadu_pd( ap1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( ap1 + 1*n_elem_per_reg );
+            a21v.v = _mm256_loadu_pd( ap1 + 2*n_elem_per_reg );
+            a31v.v = _mm256_loadu_pd( ap1 + 3*n_elem_per_reg );
+
+            a02v.v = _mm256_loadu_pd( ap2 + 0*n_elem_per_reg );
+            a12v.v = _mm256_loadu_pd( ap2 + 1*n_elem_per_reg );
+            a22v.v = _mm256_loadu_pd( ap2 + 2*n_elem_per_reg );
+            a32v.v = _mm256_loadu_pd( ap2 + 3*n_elem_per_reg );
+
+            a03v.v = _mm256_loadu_pd( ap3 + 0*n_elem_per_reg );
+            a13v.v = _mm256_loadu_pd( ap3 + 1*n_elem_per_reg );
+            a23v.v = _mm256_loadu_pd( ap3 + 2*n_elem_per_reg );
+            a33v.v = _mm256_loadu_pd( ap3 + 3*n_elem_per_reg );
 
             // perform : y += alpha * x;
             y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
@@ -980,40 +1002,40 @@ void bli_daxpyf_zen_int_16x4
             y3v.v = _mm256_fmadd_pd( a33v.v, chi3v.v, y3v.v );
 
             // Store the output.
-            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
-            _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v );
-            _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v );
-            _mm256_storeu_pd( (double *)(y0 + 3*n_elem_per_reg), y3v.v );
-
-            y0 += n_iter_unroll * n_elem_per_reg;
-            a0 += n_iter_unroll * n_elem_per_reg;
-            a1 += n_iter_unroll * n_elem_per_reg;
-            a2 += n_iter_unroll * n_elem_per_reg;
-            a3 += n_iter_unroll * n_elem_per_reg;
+            _mm256_storeu_pd( (double *)(yp + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (double *)(yp + 1*n_elem_per_reg), y1v.v );
+            _mm256_storeu_pd( (double *)(yp + 2*n_elem_per_reg), y2v.v );
+            _mm256_storeu_pd( (double *)(yp + 3*n_elem_per_reg), y3v.v );
+
+            yp += n_iter_unroll * n_elem_per_reg;
+            ap0 += n_iter_unroll * n_elem_per_reg;
+            ap1 += n_iter_unroll * n_elem_per_reg;
+            ap2 += n_iter_unroll * n_elem_per_reg;
+            ap3 += n_iter_unroll * n_elem_per_reg;
         }
 
         for ( ; (i + 11) < m; i += 12 )
         {
             // Load the input values.
-            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
-            y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+            y0v.v = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
+            y2v.v = _mm256_loadu_pd( yp + 2*n_elem_per_reg );
 
-            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
-            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
-            a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg );
+            a00v.v = _mm256_loadu_pd( ap0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( ap0 + 1*n_elem_per_reg );
+            a20v.v = _mm256_loadu_pd( ap0 + 2*n_elem_per_reg );
 
-            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
-            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
-            a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg );
+            a01v.v = _mm256_loadu_pd( ap1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( ap1 + 1*n_elem_per_reg );
+            a21v.v = _mm256_loadu_pd( ap1 + 2*n_elem_per_reg );
 
-            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
-            a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg );
-            a22v.v = _mm256_loadu_pd( a2 + 2*n_elem_per_reg );
+            a02v.v = _mm256_loadu_pd( ap2 + 0*n_elem_per_reg );
+            a12v.v = _mm256_loadu_pd( ap2 + 1*n_elem_per_reg );
+            a22v.v = _mm256_loadu_pd( ap2 + 2*n_elem_per_reg );
 
-            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
-            a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg );
-            a23v.v = _mm256_loadu_pd( a3 + 2*n_elem_per_reg );
+            a03v.v = _mm256_loadu_pd( ap3 + 0*n_elem_per_reg );
+            a13v.v = _mm256_loadu_pd( ap3 + 1*n_elem_per_reg );
+            a23v.v = _mm256_loadu_pd( ap3 + 2*n_elem_per_reg );
 
             // perform : y += alpha * x;
             y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
@@ -1033,34 +1055,34 @@ void bli_daxpyf_zen_int_16x4
             y2v.v = _mm256_fmadd_pd( a23v.v, chi3v.v, y2v.v );
 
             // Store the output.
-            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
-            _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v );
-            _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v );
-
-            y0 += 3 * n_elem_per_reg;
-            a0 += 3 * n_elem_per_reg;
-            a1 += 3 * n_elem_per_reg;
-            a2 += 3 * n_elem_per_reg;
-            a3 += 3 * n_elem_per_reg;
+            _mm256_storeu_pd( (double *)(yp + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (double *)(yp + 1*n_elem_per_reg), y1v.v );
+            _mm256_storeu_pd( (double *)(yp + 2*n_elem_per_reg), y2v.v );
+
+            yp += 3 * n_elem_per_reg;
+            ap0 += 3 * n_elem_per_reg;
+            ap1 += 3 * n_elem_per_reg;
+            ap2 += 3 * n_elem_per_reg;
+            ap3 += 3 * n_elem_per_reg;
         }
 
         for ( ; (i + 7) < m; i += 8 )
         {
             // Load the input values.
-            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+            y0v.v = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( yp + 1*n_elem_per_reg );
 
-            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
-            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
+            a00v.v = _mm256_loadu_pd( ap0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( ap0 + 1*n_elem_per_reg );
 
-            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
-            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
+            a01v.v = _mm256_loadu_pd( ap1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( ap1 + 1*n_elem_per_reg );
 
-            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
-            a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg );
+            a02v.v = _mm256_loadu_pd( ap2 + 0*n_elem_per_reg );
+            a12v.v = _mm256_loadu_pd( ap2 + 1*n_elem_per_reg );
 
-            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
-            a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg );
+            a03v.v = _mm256_loadu_pd( ap3 + 0*n_elem_per_reg );
+            a13v.v = _mm256_loadu_pd( ap3 + 1*n_elem_per_reg );
 
             // perform : y += alpha * x;
             y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
@@ -1076,29 +1098,29 @@ void bli_daxpyf_zen_int_16x4
             y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v );
 
             // Store the output.
-            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
-            _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v );
-
-            y0 += 2 * n_elem_per_reg;
-            a0 += 2 * n_elem_per_reg;
-            a1 += 2 * n_elem_per_reg;
-            a2 += 2 * n_elem_per_reg;
-            a3 += 2 * n_elem_per_reg;
+            _mm256_storeu_pd( (double *)(yp + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (double *)(yp + 1*n_elem_per_reg), y1v.v );
+
+            yp += 2 * n_elem_per_reg;
+            ap0 += 2 * n_elem_per_reg;
+            ap1 += 2 * n_elem_per_reg;
+            ap2 += 2 * n_elem_per_reg;
+            ap3 += 2 * n_elem_per_reg;
         }
 
 
         for ( ; (i + 3) < m; i += 4)
         {
             // Load the input values.
-            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+            y0v.v = _mm256_loadu_pd( yp + 0*n_elem_per_reg );
 
-            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+            a00v.v = _mm256_loadu_pd( ap0 + 0*n_elem_per_reg );
 
-            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+            a01v.v = _mm256_loadu_pd( ap1 + 0*n_elem_per_reg );
 
-            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
+            a02v.v = _mm256_loadu_pd( ap2 + 0*n_elem_per_reg );
 
-            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
+            a03v.v = _mm256_loadu_pd( ap3 + 0*n_elem_per_reg );
 
             // perform : y += alpha * x;
             y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
@@ -1110,28 +1132,28 @@ void bli_daxpyf_zen_int_16x4
             y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
 
             // Store the output.
-            _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (double *)(yp + 0*n_elem_per_reg), y0v.v );
 
-            y0 += n_elem_per_reg;
-            a0 += n_elem_per_reg;
-            a1 += n_elem_per_reg;
-            a2 += n_elem_per_reg;
-            a3 += n_elem_per_reg;
+            yp += n_elem_per_reg;
+            ap0 += n_elem_per_reg;
+            ap1 += n_elem_per_reg;
+            ap2 += n_elem_per_reg;
+            ap3 += n_elem_per_reg;
         }
 #if 1
         for ( ; (i + 1) < m; i += 2)
         {
 
             // Load the input values.
-            y4v.v  = _mm_loadu_pd( y0 + 0*n_elem_per_reg );
+            y4v.v  = _mm_loadu_pd( yp + 0*n_elem_per_reg );
 
-            a40v.v = _mm_loadu_pd( a0 + 0*n_elem_per_reg );
+            a40v.v = _mm_loadu_pd( ap0 + 0*n_elem_per_reg );
 
-            a41v.v = _mm_loadu_pd( a1 + 0*n_elem_per_reg );
+            a41v.v = _mm_loadu_pd( ap1 + 0*n_elem_per_reg );
 
-            a42v.v = _mm_loadu_pd( a2 + 0*n_elem_per_reg );
+            a42v.v = _mm_loadu_pd( ap2 + 0*n_elem_per_reg );
 
-            a43v.v = _mm_loadu_pd( a3 + 0*n_elem_per_reg );
+            a43v.v = _mm_loadu_pd( ap3 + 0*n_elem_per_reg );
 
             // perform : y += alpha * x;
             y4v.v = _mm_fmadd_pd( a40v.v, chi0v.xmm[0], y4v.v );
@@ -1143,64 +1165,64 @@ void bli_daxpyf_zen_int_16x4
             y4v.v = _mm_fmadd_pd( a43v.v, chi3v.xmm[0], y4v.v );
 
             // Store the output.
-            _mm_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y4v.v );
+            _mm_storeu_pd( (double *)(yp + 0*n_elem_per_reg), y4v.v );
 
-            y0 += 2;
-            a0 += 2;
-            a1 += 2;
-            a2 += 2;
-            a3 += 2;
+            yp += 2;
+            ap0 += 2;
+            ap1 += 2;
+            ap2 += 2;
+            ap3 += 2;
         }
 #endif
         // If there are leftover iterations, perform them with scalar code.
         for ( ; (i + 0) < m ; ++i )
         {
-            double       y0c = *y0;
+            double       y0c = *yp;
 
-            const double a0c = *a0;
-            const double a1c = *a1;
-            const double a2c = *a2;
-            const double a3c = *a3;
+            const double a0c = *ap0;
+            const double a1c = *ap1;
+            const double a2c = *ap2;
+            const double a3c = *ap3;
 
             y0c += chi0 * a0c;
             y0c += chi1 * a1c;
             y0c += chi2 * a2c;
             y0c += chi3 * a3c;
 
-            *y0 = y0c;
+            *yp = y0c;
 
-            a0 += 1;
-            a1 += 1;
-            a2 += 1;
-            a3 += 1;
+            ap0 += 1;
+            ap1 += 1;
+            ap2 += 1;
+            ap3 += 1;
 
-            y0 += 1;
+            yp += 1;
         }
     }
     else
     {
         for ( i = 0; (i + 0) < m ; ++i )
         {
-            double       y0c = *y0;
+            double       y0c = *yp;
 
-            const double a0c = *a0;
-            const double a1c = *a1;
-            const double a2c = *a2;
-            const double a3c = *a3;
+            const double a0c = *ap0;
+            const double a1c = *ap1;
+            const double a2c = *ap2;
+            const double a3c = *ap3;
 
             y0c += chi0 * a0c;
             y0c += chi1 * a1c;
             y0c += chi2 * a2c;
             y0c += chi3 * a3c;
 
-            *y0 = y0c;
+            *yp = y0c;
 
-            a0 += inca;
-            a1 += inca;
-            a2 += inca;
-            a3 += inca;
+            ap0 += inca;
+            ap1 += inca;
+            ap2 += inca;
+            ap3 += inca;
 
-            y0 += incy;
+            yp += incy;
         }
 
     }
diff --git a/kernels/zen/1f/bli_axpyf_zen_int_8.c b/kernels/zen/1f/bli_axpyf_zen_int_8.c
index 254cbe573..d495ad4ac 100644
--- a/kernels/zen/1f/bli_axpyf_zen_int_8.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_8.c
@@ -60,13 +60,18 @@ void bli_saxpyf_zen_int_8
              conj_t  conjx,
              dim_t   m,
              dim_t   b_n,
-       const float*  alpha,
-       const float*  a, inc_t inca, inc_t lda,
-       const float*  x, inc_t incx,
-             float*  y, inc_t incy,
+       const void*   alpha0,
+       const void*   a0, inc_t inca, inc_t lda,
+       const void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	const float* restrict alpha = alpha0;
+	const float* restrict a     = a0;
+	const float* restrict x     = x0;
+	      float* restrict y     = y0;
+
 	const dim_t      fuse_fac       = 8;
 
 	const dim_t      n_elem_per_reg = 8;
@@ -93,7 +98,7 @@ void bli_saxpyf_zen_int_8
 	// operation as a loop over axpyv.
 	if ( b_n != fuse_fac )
 	{
-		saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
+		axpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
 
 		for ( i = 0; i < b_n; ++i )
 		{
@@ -135,15 +140,15 @@ void bli_saxpyf_zen_int_8
 		m_left  = m;
 	}
 
-	const float* restrict a0   = a + 0*lda;
-	const float* restrict a1   = a + 1*lda;
-	const float* restrict a2   = a + 2*lda;
-	const float* restrict a3   = a + 3*lda;
-	const float* restrict a4   = a + 4*lda;
-	const float* restrict a5   = a + 5*lda;
-	const float* restrict a6   = a + 6*lda;
-	const float* restrict a7   = a + 7*lda;
-	      float* restrict y0   = y;
+	const float* restrict ap0   = a + 0*lda;
+	const float* restrict ap1   = a + 1*lda;
+	const float* restrict ap2   = a + 2*lda;
+	const float* restrict ap3   = a + 3*lda;
+	const float* restrict ap4   = a + 4*lda;
+	const float* restrict ap5   = a + 5*lda;
+	const float* restrict ap6   = a + 6*lda;
+	const float* restrict ap7   = a + 7*lda;
+	      float* restrict yp0   = y;
 
 	chi0 = *( x + 0*incx );
 	chi1 = *( x + 1*incx );
@@ -179,15 +184,15 @@ void bli_saxpyf_zen_int_8
 	for ( i = 0; i < m_viter; ++i )
 	{
 		// Load the input values.
-		y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-		a0v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
-		a1v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg );
-		a2v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg );
-		a3v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg );
-		a4v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg );
-		a5v.v = _mm256_loadu_ps( a5 + 0*n_elem_per_reg );
-		a6v.v = _mm256_loadu_ps( a6 + 0*n_elem_per_reg );
-		a7v.v = _mm256_loadu_ps( a7 + 0*n_elem_per_reg );
+		y0v.v = _mm256_loadu_ps( yp0 + 0*n_elem_per_reg );
+		a0v.v = _mm256_loadu_ps( ap0 + 0*n_elem_per_reg );
+		a1v.v = _mm256_loadu_ps( ap1 + 0*n_elem_per_reg );
+		a2v.v = _mm256_loadu_ps( ap2 + 0*n_elem_per_reg );
+		a3v.v = _mm256_loadu_ps( ap3 + 0*n_elem_per_reg );
+		a4v.v = _mm256_loadu_ps( ap4 + 0*n_elem_per_reg );
+		a5v.v = _mm256_loadu_ps( ap5 + 0*n_elem_per_reg );
+		a6v.v = _mm256_loadu_ps( ap6 + 0*n_elem_per_reg );
+		a7v.v = _mm256_loadu_ps( ap7 + 0*n_elem_per_reg );
 
 		// perform : y += alpha * x;
 		y0v.v = _mm256_fmadd_ps( a0v.v, chi0v.v, y0v.v );
@@ -200,32 +205,32 @@ void bli_saxpyf_zen_int_8
 		y0v.v = _mm256_fmadd_ps( a7v.v, chi7v.v, y0v.v );
 
 		// Store the output.
-		_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
-
-		y0 += n_elem_per_reg;
-		a0 += n_elem_per_reg;
-		a1 += n_elem_per_reg;
-		a2 += n_elem_per_reg;
-		a3 += n_elem_per_reg;
-		a4 += n_elem_per_reg;
-		a5 += n_elem_per_reg;
-		a6 += n_elem_per_reg;
-		a7 += n_elem_per_reg;
+		_mm256_storeu_ps( (yp0 + 0*n_elem_per_reg), y0v.v );
+
+		yp0 += n_elem_per_reg;
+		ap0 += n_elem_per_reg;
+		ap1 += n_elem_per_reg;
+		ap2 += n_elem_per_reg;
+		ap3 += n_elem_per_reg;
+		ap4 += n_elem_per_reg;
+		ap5 += n_elem_per_reg;
+		ap6 += n_elem_per_reg;
+		ap7 += n_elem_per_reg;
 	}
 
 	// If there are leftover iterations, perform them with scalar code.
 	for ( i = 0; i < m_left ; ++i )
 	{
-		float       y0c = *y0;
+		float       y0c = *yp0;
 
-		const float a0c = *a0;
-		const float a1c = *a1;
-		const float a2c = *a2;
-		const float a3c = *a3;
-		const float a4c = *a4;
-		const float a5c = *a5;
-		const float a6c = *a6;
-		const float a7c = *a7;
+		const float a0c = *ap0;
+		const float a1c = *ap1;
+		const float a2c = *ap2;
+		const float a3c = *ap3;
+		const float a4c = *ap4;
+		const float a5c = *ap5;
+		const float a6c = *ap6;
+		const float a7c = *ap7;
 
 		y0c += chi0 * a0c;
 		y0c += chi1 * a1c;
@@ -236,17 +241,17 @@ void bli_saxpyf_zen_int_8
 		y0c += chi6 * a6c;
 		y0c += chi7 * a7c;
 
-		*y0 = y0c;
-
-		a0 += inca;
-		a1 += inca;
-		a2 += inca;
-		a3 += inca;
-		a4 += inca;
-		a5 += inca;
-		a6 += inca;
-		a7 += inca;
-		y0 += incy;
+		*yp0 = y0c;
+
+		ap0 += inca;
+		ap1 += inca;
+		ap2 += inca;
+		ap3 += inca;
+		ap4 += inca;
+		ap5 += inca;
+		ap6 += inca;
+		ap7 += inca;
+		yp0 += incy;
 	}
 }
 
@@ -258,13 +263,18 @@ void bli_daxpyf_zen_int_8
              conj_t  conjx,
              dim_t   m,
              dim_t   b_n,
-       const double* alpha,
-       const double* a, inc_t inca, inc_t lda,
-       const double* x, inc_t incx,
-             double* y, inc_t incy,
+       const void*   alpha0,
+       const void*   a0, inc_t inca, inc_t lda,
+       const void*   x0, inc_t incx,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	const double* restrict alpha = alpha0;
+	const double* restrict a     = a0;
+	const double* restrict x     = x0;
+	      double* restrict y     = y0;
+
 	const dim_t      fuse_fac       = 8;
 
 	const dim_t      n_elem_per_reg = 4;
@@ -291,7 +301,7 @@ void bli_daxpyf_zen_int_8
 	// operation as a loop over axpyv.
 	if ( b_n != fuse_fac )
 	{
-		daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+		axpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
 
 		for ( i = 0; i < b_n; ++i )
 		{
@@ -333,15 +343,15 @@ void bli_daxpyf_zen_int_8
 		m_left  = m;
 	}
 
-	const double* restrict a0   = a + 0*lda;
-	const double* restrict a1   = a + 1*lda;
-	const double* restrict a2   = a + 2*lda;
-	const double* restrict a3   = a + 3*lda;
-	const double* restrict a4   = a + 4*lda;
-	const double* restrict a5   = a + 5*lda;
-	const double* restrict a6   = a + 6*lda;
-	const double* restrict a7   = a + 7*lda;
-	      double* restrict y0   = y;
+	const double* restrict ap0   = a + 0*lda;
+	const double* restrict ap1   = a + 1*lda;
+	const double* restrict ap2   = a + 2*lda;
+	const double* restrict ap3   = a + 3*lda;
+	const double* restrict ap4   = a + 4*lda;
+	const double* restrict ap5   = a + 5*lda;
+	const double* restrict ap6   = a + 6*lda;
+	const double* restrict ap7   = a + 7*lda;
+	      double* restrict yp0   = y;
 
 	chi0 = *( x + 0*incx );
 	chi1 = *( x + 1*incx );
@@ -377,15 +387,15 @@ void bli_daxpyf_zen_int_8
 	for ( i = 0; i < m_viter; ++i )
 	{
 		// Load the input values.
-		y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
-		a0v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
-		a1v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
-		a2v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
-		a3v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
-		a4v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
-		a5v.v = _mm256_loadu_pd( a5 + 0*n_elem_per_reg );
-		a6v.v = _mm256_loadu_pd( a6 + 0*n_elem_per_reg );
-		a7v.v = _mm256_loadu_pd( a7 + 0*n_elem_per_reg );
+		y0v.v = _mm256_loadu_pd( yp0 + 0*n_elem_per_reg );
+		a0v.v = _mm256_loadu_pd( ap0 + 0*n_elem_per_reg );
+		a1v.v = _mm256_loadu_pd( ap1 + 0*n_elem_per_reg );
+		a2v.v = _mm256_loadu_pd( ap2 + 0*n_elem_per_reg );
+		a3v.v = _mm256_loadu_pd( ap3 + 0*n_elem_per_reg );
+		a4v.v = _mm256_loadu_pd( ap4 + 0*n_elem_per_reg );
+		a5v.v = _mm256_loadu_pd( ap5 + 0*n_elem_per_reg );
+		a6v.v = _mm256_loadu_pd( ap6 + 0*n_elem_per_reg );
+		a7v.v = _mm256_loadu_pd( ap7 + 0*n_elem_per_reg );
 
 		// perform : y += alpha * x;
 		y0v.v = _mm256_fmadd_pd( a0v.v, chi0v.v, y0v.v );
@@ -398,32 +408,32 @@ void bli_daxpyf_zen_int_8
 		y0v.v = _mm256_fmadd_pd( a7v.v, chi7v.v, y0v.v );
 
 		// Store the output.
-		_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
-
-		y0 += n_elem_per_reg;
-		a0 += n_elem_per_reg;
-		a1 += n_elem_per_reg;
-		a2 += n_elem_per_reg;
-		a3 += n_elem_per_reg;
-		a4 += n_elem_per_reg;
-		a5 += n_elem_per_reg;
-		a6 += n_elem_per_reg;
-		a7 += n_elem_per_reg;
+		_mm256_storeu_pd( (yp0 + 0*n_elem_per_reg), y0v.v );
+
+		yp0 += n_elem_per_reg;
+		ap0 += n_elem_per_reg;
+		ap1 += n_elem_per_reg;
+		ap2 += n_elem_per_reg;
+		ap3 += n_elem_per_reg;
+		ap4 += n_elem_per_reg;
+		ap5 += n_elem_per_reg;
+		ap6 += n_elem_per_reg;
+		ap7 += n_elem_per_reg;
 	}
 
 	// If there are leftover iterations, perform them with scalar code.
 	for ( i = 0; i < m_left ; ++i )
 	{
-		double       y0c = *y0;
+		double       y0c = *yp0;
 
-		const double a0c = *a0;
-		const double a1c = *a1;
-		const double a2c = *a2;
-		const double a3c = *a3;
-		const double a4c = *a4;
-		const double a5c = *a5;
-		const double a6c = *a6;
-		const double a7c = *a7;
+		const double a0c = *ap0;
+		const double a1c = *ap1;
+		const double a2c = *ap2;
+		const double a3c = *ap3;
+		const double a4c = *ap4;
+		const double a5c = *ap5;
+		const double a6c = *ap6;
+		const double a7c = *ap7;
 
 		y0c += chi0 * a0c;
 		y0c += chi1 * a1c;
@@ -434,17 +444,17 @@ void bli_daxpyf_zen_int_8
 		y0c += chi6 * a6c;
 		y0c += chi7 * a7c;
 
-		*y0 = y0c;
-
-		a0 += inca;
-		a1 += inca;
-		a2 += inca;
-		a3 += inca;
-		a4 += inca;
-		a5 += inca;
-		a6 += inca;
-		a7 += inca;
-		y0 += incy;
+		*yp0 = y0c;
+
+		ap0 += inca;
+		ap1 += inca;
+		ap2 += inca;
+		ap3 += inca;
+		ap4 += inca;
+		ap5 += inca;
+		ap6 += inca;
+		ap7 += inca;
+		yp0 += incy;
 	}
 }
 
diff --git a/kernels/zen/1f/bli_dotxf_zen_int_8.c b/kernels/zen/1f/bli_dotxf_zen_int_8.c
index 76309de93..db62c3c59 100644
--- a/kernels/zen/1f/bli_dotxf_zen_int_8.c
+++ b/kernels/zen/1f/bli_dotxf_zen_int_8.c
@@ -60,14 +60,20 @@ void bli_sdotxf_zen_int_8
              conj_t  conjx,
              dim_t   m,
              dim_t   b_n,
-       const float*  alpha,
-       const float*  a, inc_t inca, inc_t lda,
-       const float*  x, inc_t incx,
-       const float*  beta,
-             float*  y, inc_t incy,
+       const void*   alpha0,
+       const void*   a0, inc_t inca, inc_t lda,
+       const void*   x0, inc_t incx,
+       const void*   beta0,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	const float* restrict alpha = alpha0;
+	const float* restrict a     = a0;
+	const float* restrict x     = x0;
+	const float* restrict beta  = beta0;
+	      float* restrict y     = y0;
+
 	const dim_t fuse_fac       = 8;
 	const dim_t n_elem_per_reg = 8;
 
@@ -78,7 +84,7 @@ void bli_sdotxf_zen_int_8
 	// simplifies to updating y.
 	if ( bli_zero_dim1( m ) || PASTEMAC(s,eq0)( *alpha ) )
 	{
-		sscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SCALV_KER, cntx );
+		scalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SCALV_KER, cntx );
 
 		f
 		(
@@ -95,7 +101,7 @@ void bli_sdotxf_zen_int_8
 	// operation as a loop over dotxv.
 	if ( b_n != fuse_fac )
 	{
-		sdotxv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_DOTXV_KER, cntx );
+		dotxv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_DOTXV_KER, cntx );
 
 		for ( dim_t i = 0; i < b_n; ++i )
 		{
@@ -147,15 +153,15 @@ void bli_sdotxf_zen_int_8
 		dim_t m_viter = ( m ) / ( n_elem_per_reg * n_iter_unroll );
 
 		// Set up pointers for x and the b_n columns of A (rows of A^T).
-		const float* restrict x0 = x;
-		const float* restrict a0 = a + 0*lda;
-		const float* restrict a1 = a + 1*lda;
-		const float* restrict a2 = a + 2*lda;
-		const float* restrict a3 = a + 3*lda;
-		const float* restrict a4 = a + 4*lda;
-		const float* restrict a5 = a + 5*lda;
-		const float* restrict a6 = a + 6*lda;
-		const float* restrict a7 = a + 7*lda;
+		const float* restrict xp0 = x;
+		const float* restrict ap0 = a + 0*lda;
+		const float* restrict ap1 = a + 1*lda;
+		const float* restrict ap2 = a + 2*lda;
+		const float* restrict ap3 = a + 3*lda;
+		const float* restrict ap4 = a + 4*lda;
+		const float* restrict ap5 = a + 5*lda;
+		const float* restrict ap6 = a + 6*lda;
+		const float* restrict ap7 = a + 7*lda;
 
 		// Initialize b_n rho vector accumulators to zero.
 		v8sf_t rho0v; rho0v.v = _mm256_setzero_ps();
@@ -175,16 +181,16 @@ void bli_sdotxf_zen_int_8
 		for ( dim_t i = 0; i < m_viter; ++i )
 		{
 			// Load the input values.
-			x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			x0v.v = _mm256_loadu_ps( xp0 + 0*n_elem_per_reg );
 
-			a0v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
-			a1v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg );
-			a2v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg );
-			a3v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg );
-			a4v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg );
-			a5v.v = _mm256_loadu_ps( a5 + 0*n_elem_per_reg );
-			a6v.v = _mm256_loadu_ps( a6 + 0*n_elem_per_reg );
-			a7v.v = _mm256_loadu_ps( a7 + 0*n_elem_per_reg );
+			a0v.v = _mm256_loadu_ps( ap0 + 0*n_elem_per_reg );
+			a1v.v = _mm256_loadu_ps( ap1 + 0*n_elem_per_reg );
+			a2v.v = _mm256_loadu_ps( ap2 + 0*n_elem_per_reg );
+			a3v.v = _mm256_loadu_ps( ap3 + 0*n_elem_per_reg );
+			a4v.v = _mm256_loadu_ps( ap4 + 0*n_elem_per_reg );
+			a5v.v = _mm256_loadu_ps( ap5 + 0*n_elem_per_reg );
+			a6v.v = _mm256_loadu_ps( ap6 + 0*n_elem_per_reg );
+			a7v.v = _mm256_loadu_ps( ap7 + 0*n_elem_per_reg );
 
 			// perform: rho?v += a?v * x0v;
 			rho0v.v = _mm256_fmadd_ps( a0v.v, x0v.v, rho0v.v );
@@ -196,15 +202,15 @@ void bli_sdotxf_zen_int_8
 			rho6v.v = _mm256_fmadd_ps( a6v.v, x0v.v, rho6v.v );
 			rho7v.v = _mm256_fmadd_ps( a7v.v, x0v.v, rho7v.v );
 
-			x0 += n_elem_per_reg * n_iter_unroll;
-			a0 += n_elem_per_reg * n_iter_unroll;
-			a1 += n_elem_per_reg * n_iter_unroll;
-			a2 += n_elem_per_reg * n_iter_unroll;
-			a3 += n_elem_per_reg * n_iter_unroll;
-			a4 += n_elem_per_reg * n_iter_unroll;
-			a5 += n_elem_per_reg * n_iter_unroll;
-			a6 += n_elem_per_reg * n_iter_unroll;
-			a7 += n_elem_per_reg * n_iter_unroll;
+			xp0 += n_elem_per_reg * n_iter_unroll;
+			ap0 += n_elem_per_reg * n_iter_unroll;
+			ap1 += n_elem_per_reg * n_iter_unroll;
+			ap2 += n_elem_per_reg * n_iter_unroll;
+			ap3 += n_elem_per_reg * n_iter_unroll;
+			ap4 += n_elem_per_reg * n_iter_unroll;
+			ap5 += n_elem_per_reg * n_iter_unroll;
+			ap6 += n_elem_per_reg * n_iter_unroll;
+			ap7 += n_elem_per_reg * n_iter_unroll;
 		}
 
 #if 0
@@ -268,8 +274,8 @@ void bli_sdotxf_zen_int_8
 		dim_t m_viter = ( m ) / ( n_iter_unroll );
 
 		// Initialize pointers for x and A.
-		const float* restrict x0 = x;
-		const float* restrict a0 = a;
+		const float* restrict xp0 = x;
+		const float* restrict ap0 = a;
 
 		// Initialize rho vector accumulators to zero.
 		v8sf_t rho0v; rho0v.v = _mm256_setzero_ps();
@@ -283,15 +289,15 @@ void bli_sdotxf_zen_int_8
 		for ( dim_t i = 0; i < m_viter; ++i )
 		{
 			// Load the input values.
-			a0v.v = _mm256_loadu_ps( a0 + 0*inca );
-			a1v.v = _mm256_loadu_ps( a0 + 1*inca );
-			a2v.v = _mm256_loadu_ps( a0 + 2*inca );
-			a3v.v = _mm256_loadu_ps( a0 + 3*inca );
+			a0v.v = _mm256_loadu_ps( ap0 + 0*inca );
+			a1v.v = _mm256_loadu_ps( ap0 + 1*inca );
+			a2v.v = _mm256_loadu_ps( ap0 + 2*inca );
+			a3v.v = _mm256_loadu_ps( ap0 + 3*inca );
 
-			x0v.v = _mm256_broadcast_ss( x0 + 0*incx );
-			x1v.v = _mm256_broadcast_ss( x0 + 1*incx );
-			x2v.v = _mm256_broadcast_ss( x0 + 2*incx );
-			x3v.v = _mm256_broadcast_ss( x0 + 3*incx );
+			x0v.v = _mm256_broadcast_ss( xp0 + 0*incx );
+			x1v.v = _mm256_broadcast_ss( xp0 + 1*incx );
+			x2v.v = _mm256_broadcast_ss( xp0 + 2*incx );
+			x3v.v = _mm256_broadcast_ss( xp0 + 3*incx );
 
 			// perform : rho?v += a?v * x?v;
 			rho0v.v = _mm256_fmadd_ps( a0v.v, x0v.v, rho0v.v );
@@ -299,8 +305,8 @@ void bli_sdotxf_zen_int_8
 			rho2v.v = _mm256_fmadd_ps( a2v.v, x2v.v, rho2v.v );
 			rho3v.v = _mm256_fmadd_ps( a3v.v, x3v.v, rho3v.v );
 
-			x0 += incx * n_iter_unroll;
-			a0 += inca * n_iter_unroll;
+			xp0 += incx * n_iter_unroll;
+			ap0 += inca * n_iter_unroll;
 		}
 
 		// Combine the 8 accumulators into one vector register.
@@ -332,29 +338,29 @@ void bli_sdotxf_zen_int_8
 	// Scalar edge case.
 	{
 		// Initialize pointers for x and the b_n columns of A (rows of A^T).
-		const float* restrict x0 = x;
-		const float* restrict a0 = a + 0*lda;
-		const float* restrict a1 = a + 1*lda;
-		const float* restrict a2 = a + 2*lda;
-		const float* restrict a3 = a + 3*lda;
-		const float* restrict a4 = a + 4*lda;
-		const float* restrict a5 = a + 5*lda;
-		const float* restrict a6 = a + 6*lda;
-		const float* restrict a7 = a + 7*lda;
+		const float* restrict xp0 = x;
+		const float* restrict ap0 = a + 0*lda;
+		const float* restrict ap1 = a + 1*lda;
+		const float* restrict ap2 = a + 2*lda;
+		const float* restrict ap3 = a + 3*lda;
+		const float* restrict ap4 = a + 4*lda;
+		const float* restrict ap5 = a + 5*lda;
+		const float* restrict ap6 = a + 6*lda;
+		const float* restrict ap7 = a + 7*lda;
 
 		// If there are leftover iterations, perform them with scalar code.
 		for ( dim_t i = 0; i < m ; ++i )
 		{
-			const float x0c = *x0;
+			const float x0c = *xp0;
 
-			const float a0c = *a0;
-			const float a1c = *a1;
-			const float a2c = *a2;
-			const float a3c = *a3;
-			const float a4c = *a4;
-			const float a5c = *a5;
-			const float a6c = *a6;
-			const float a7c = *a7;
+			const float a0c = *ap0;
+			const float a1c = *ap1;
+			const float a2c = *ap2;
+			const float a3c = *ap3;
+			const float a4c = *ap4;
+			const float a5c = *ap5;
+			const float a6c = *ap6;
+			const float a7c = *ap7;
 
 			rho0 += a0c * x0c;
 			rho1 += a1c * x0c;
@@ -365,15 +371,15 @@ void bli_sdotxf_zen_int_8
 			rho6 += a6c * x0c;
 			rho7 += a7c * x0c;
 
-			x0 += incx;
-			a0 += inca;
-			a1 += inca;
-			a2 += inca;
-			a3 += inca;
-			a4 += inca;
-			a5 += inca;
-			a6 += inca;
-			a7 += inca;
+			xp0 += incx;
+			ap0 += inca;
+			ap1 += inca;
+			ap2 += inca;
+			ap3 += inca;
+			ap4 += inca;
+			ap5 += inca;
+			ap6 += inca;
+			ap7 += inca;
 		}
 	}
 
@@ -450,14 +456,20 @@ void bli_ddotxf_zen_int_8
              conj_t  conjx,
              dim_t   m,
              dim_t   b_n,
-       const double* alpha,
-       const double* a, inc_t inca, inc_t lda,
-       const double* x, inc_t incx,
-       const double* beta,
-             double* y, inc_t incy,
+       const void*   alpha0,
+       const void*   a0, inc_t inca, inc_t lda,
+       const void*   x0, inc_t incx,
+       const void*   beta0,
+             void*   y0, inc_t incy,
        const cntx_t* cntx
      )
 {
+	const double* restrict alpha = alpha0;
+	const double* restrict a     = a0;
+	const double* restrict x     = x0;
+	const double* restrict beta  = beta0;
+	      double* restrict y     = y0;
+
 	const dim_t fuse_fac       = 8;
 	const dim_t n_elem_per_reg = 4;
 
@@ -468,7 +480,7 @@ void bli_ddotxf_zen_int_8
 	// simplifies to updating y.
 	if ( bli_zero_dim1( m ) || PASTEMAC(d,eq0)( *alpha ) )
 	{
-		dscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx );
+		scalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx );
 
 		f
 		(
@@ -485,7 +497,7 @@ void bli_ddotxf_zen_int_8
 	// operation as a loop over dotxv.
 	if ( b_n != fuse_fac )
 	{
-		ddotxv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXV_KER, cntx );
+		dotxv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXV_KER, cntx );
 
 		for ( dim_t i = 0; i < b_n; ++i )
 		{
@@ -537,15 +549,15 @@ void bli_ddotxf_zen_int_8
 		dim_t m_viter = ( m ) / ( n_elem_per_reg * n_iter_unroll );
 
 		// Set up pointers for x and the b_n columns of A (rows of A^T).
-		const double* restrict x0 = x;
-		const double* restrict a0 = a + 0*lda;
-		const double* restrict a1 = a + 1*lda;
-		const double* restrict a2 = a + 2*lda;
-		const double* restrict a3 = a + 3*lda;
-		const double* restrict a4 = a + 4*lda;
-		const double* restrict a5 = a + 5*lda;
-		const double* restrict a6 = a + 6*lda;
-		const double* restrict a7 = a + 7*lda;
+		const double* restrict xp0 = x;
+		const double* restrict ap0 = a + 0*lda;
+		const double* restrict ap1 = a + 1*lda;
+		const double* restrict ap2 = a + 2*lda;
+		const double* restrict ap3 = a + 3*lda;
+		const double* restrict ap4 = a + 4*lda;
+		const double* restrict ap5 = a + 5*lda;
+		const double* restrict ap6 = a + 6*lda;
+		const double* restrict ap7 = a + 7*lda;
 
 		// Initialize b_n rho vector accumulators to zero.
 		v4df_t rho0v; rho0v.v = _mm256_setzero_pd();
@@ -565,16 +577,16 @@ void bli_ddotxf_zen_int_8
 		for ( dim_t i = 0; i < m_viter; ++i )
 		{
 			// Load the input values.
-			x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			x0v.v = _mm256_loadu_pd( xp0 + 0*n_elem_per_reg );
 
-			a0v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
-			a1v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
-			a2v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
-			a3v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
-			a4v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
-			a5v.v = _mm256_loadu_pd( a5 + 0*n_elem_per_reg );
-			a6v.v = _mm256_loadu_pd( a6 + 0*n_elem_per_reg );
-			a7v.v = _mm256_loadu_pd( a7 + 0*n_elem_per_reg );
+			a0v.v = _mm256_loadu_pd( ap0 + 0*n_elem_per_reg );
+			a1v.v = _mm256_loadu_pd( ap1 + 0*n_elem_per_reg );
+			a2v.v = _mm256_loadu_pd( ap2 + 0*n_elem_per_reg );
+			a3v.v = _mm256_loadu_pd( ap3 + 0*n_elem_per_reg );
+			a4v.v = _mm256_loadu_pd( ap4 + 0*n_elem_per_reg );
+			a5v.v = _mm256_loadu_pd( ap5 + 0*n_elem_per_reg );
+			a6v.v = _mm256_loadu_pd( ap6 + 0*n_elem_per_reg );
+			a7v.v = _mm256_loadu_pd( ap7 + 0*n_elem_per_reg );
 
 			// perform: rho?v += a?v * x0v;
 			rho0v.v = _mm256_fmadd_pd( a0v.v, x0v.v, rho0v.v );
@@ -586,15 +598,15 @@ void bli_ddotxf_zen_int_8
 			rho6v.v = _mm256_fmadd_pd( a6v.v, x0v.v, rho6v.v );
 			rho7v.v = _mm256_fmadd_pd( a7v.v, x0v.v, rho7v.v );
 
-			x0 += n_elem_per_reg * n_iter_unroll;
-			a0 += n_elem_per_reg * n_iter_unroll;
-			a1 += n_elem_per_reg * n_iter_unroll;
-			a2 += n_elem_per_reg * n_iter_unroll;
-			a3 += n_elem_per_reg * n_iter_unroll;
-			a4 += n_elem_per_reg * n_iter_unroll;
-			a5 += n_elem_per_reg * n_iter_unroll;
-			a6 += n_elem_per_reg * n_iter_unroll;
-			a7 += n_elem_per_reg * n_iter_unroll;
+			xp0 += n_elem_per_reg * n_iter_unroll;
+			ap0 += n_elem_per_reg * n_iter_unroll;
+			ap1 += n_elem_per_reg * n_iter_unroll;
+			ap2 += n_elem_per_reg * n_iter_unroll;
+			ap3 += n_elem_per_reg * n_iter_unroll;
+			ap4 += n_elem_per_reg * n_iter_unroll;
+			ap5 += n_elem_per_reg * n_iter_unroll;
+			ap6 += n_elem_per_reg * n_iter_unroll;
+			ap7 += n_elem_per_reg * n_iter_unroll;
 		}
 
 #if 0
@@ -643,8 +655,8 @@ void bli_ddotxf_zen_int_8
 		dim_t m_viter = ( m ) / ( n_reg_per_row * n_iter_unroll );
 
 		// Initialize pointers for x and A.
-		const double* restrict x0 = x;
-		const double* restrict a0 = a;
+		const double* restrict xp0 = x;
+		const double* restrict ap0 = a;
 
 		// Initialize rho vector accumulators to zero.
 		v4df_t rho0v; rho0v.v = _mm256_setzero_pd();
@@ -660,16 +672,16 @@ void bli_ddotxf_zen_int_8
 		for ( dim_t i = 0; i < m_viter; ++i )
 		{
 			// Load the input values.
-			a0v.v = _mm256_loadu_pd( a0 + 0*inca + 0*n_elem_per_reg );
-			a1v.v = _mm256_loadu_pd( a0 + 0*inca + 1*n_elem_per_reg );
-			a2v.v = _mm256_loadu_pd( a0 + 1*inca + 0*n_elem_per_reg );
-			a3v.v = _mm256_loadu_pd( a0 + 1*inca + 1*n_elem_per_reg );
-			a4v.v = _mm256_loadu_pd( a0 + 2*inca + 0*n_elem_per_reg );
-			a5v.v = _mm256_loadu_pd( a0 + 2*inca + 1*n_elem_per_reg );
+			a0v.v = _mm256_loadu_pd( ap0 + 0*inca + 0*n_elem_per_reg );
+			a1v.v = _mm256_loadu_pd( ap0 + 0*inca + 1*n_elem_per_reg );
+			a2v.v = _mm256_loadu_pd( ap0 + 1*inca + 0*n_elem_per_reg );
+			a3v.v = _mm256_loadu_pd( ap0 + 1*inca + 1*n_elem_per_reg );
+			a4v.v = _mm256_loadu_pd( ap0 + 2*inca + 0*n_elem_per_reg );
+			a5v.v = _mm256_loadu_pd( ap0 + 2*inca + 1*n_elem_per_reg );
 
-			x0v.v = _mm256_broadcast_sd( x0 + 0*incx );
-			x1v.v = _mm256_broadcast_sd( x0 + 1*incx );
-			x2v.v = _mm256_broadcast_sd( x0 + 2*incx );
+			x0v.v = _mm256_broadcast_sd( xp0 + 0*incx );
+			x1v.v = _mm256_broadcast_sd( xp0 + 1*incx );
+			x2v.v = _mm256_broadcast_sd( xp0 + 2*incx );
 
 			// perform : rho?v += a?v * x?v;
 			rho0v.v = _mm256_fmadd_pd( a0v.v, x0v.v, rho0v.v );
@@ -679,8 +691,8 @@ void bli_ddotxf_zen_int_8
 			rho4v.v = _mm256_fmadd_pd( a4v.v, x2v.v, rho4v.v );
 			rho5v.v = _mm256_fmadd_pd( a5v.v, x2v.v, rho5v.v );
 
-			x0 += incx * n_iter_unroll;
-			a0 += inca * n_iter_unroll;
+			xp0 += incx * n_iter_unroll;
+			ap0 += inca * n_iter_unroll;
 		}
 
 		// Combine the 8 accumulators into one vector register.
@@ -713,29 +725,29 @@ void bli_ddotxf_zen_int_8
 	// Scalar edge case.
 	{
 		// Initialize pointers for x and the b_n columns of A (rows of A^T).
-		const double* restrict x0 = x;
-		const double* restrict a0 = a + 0*lda;
-		const double* restrict a1 = a + 1*lda;
-		const double* restrict a2 = a + 2*lda;
-		const double* restrict a3 = a + 3*lda;
-		const double* restrict a4 = a + 4*lda;
-		const double* restrict a5 = a + 5*lda;
-		const double* restrict a6 = a + 6*lda;
-		const double* restrict a7 = a + 7*lda;
+		const double* restrict xp0 = x;
+		const double* restrict ap0 = a + 0*lda;
+		const double* restrict ap1 = a + 1*lda;
+		const double* restrict ap2 = a + 2*lda;
+		const double* restrict ap3 = a + 3*lda;
+		const double* restrict ap4 = a + 4*lda;
+		const double* restrict ap5 = a + 5*lda;
+		const double* restrict ap6 = a + 6*lda;
+		const double* restrict ap7 = a + 7*lda;
 
 		// If there are leftover iterations, perform them with scalar code.
 		for ( dim_t i = 0; i < m ; ++i )
 		{
-			const double x0c = *x0;
+			const double x0c = *xp0;
 
-			const double a0c = *a0;
-			const double a1c = *a1;
-			const double a2c = *a2;
-			const double a3c = *a3;
-			const double a4c = *a4;
-			const double a5c = *a5;
-			const double a6c = *a6;
-			const double a7c = *a7;
+			const double a0c = *ap0;
+			const double a1c = *ap1;
+			const double a2c = *ap2;
+			const double a3c = *ap3;
+			const double a4c = *ap4;
+			const double a5c = *ap5;
+			const double a6c = *ap6;
+			const double a7c = *ap7;
 
 			rho0 += a0c * x0c;
 			rho1 += a1c * x0c;
@@ -746,15 +758,15 @@ void bli_ddotxf_zen_int_8
 			rho6 += a6c * x0c;
 			rho7 += a7c * x0c;
 
-			x0 += incx;
-			a0 += inca;
-			a1 += inca;
-			a2 += inca;
-			a3 += inca;
-			a4 += inca;
-			a5 += inca;
-			a6 += inca;
-			a7 += inca;
+			xp0 += incx;
+			ap0 += inca;
+			ap1 += inca;
+			ap2 += inca;
+			ap3 += inca;
+			ap4 += inca;
+			ap5 += inca;
+			ap6 += inca;
+			ap7 += inca;
 		}
 	}
 
diff --git a/ref_kernels/1/bli_addv_ref.c b/ref_kernels/1/bli_addv_ref.c
index 004220a59..7195db512 100644
--- a/ref_kernels/1/bli_addv_ref.c
+++ b/ref_kernels/1/bli_addv_ref.c
@@ -41,15 +41,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              dim_t   n, \
-       const ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
+       const void*   x0, inc_t incx, \
+             void*   y0, inc_t incy, \
        const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
 \
-	const ctype* restrict chi1 = x; \
-	      ctype* restrict psi1 = y; \
+	const ctype* restrict x = x0; \
+	      ctype* restrict y = y0; \
 \
 	if ( bli_is_conj( conjx ) ) \
 	{ \
@@ -58,17 +58,17 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,addjs)( chi1[i], psi1[i] ); \
+				PASTEMAC(ch,addjs)( x[i], y[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,addjs)( *chi1, *psi1 ); \
+				PASTEMAC(ch,addjs)( *x, *y ); \
 \
-				chi1 += incx; \
-				psi1 += incy; \
+				x += incx; \
+				y += incy; \
 			} \
 		} \
 	} \
@@ -79,17 +79,17 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,adds)( chi1[i], psi1[i] ); \
+				PASTEMAC(ch,adds)( x[i], y[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,adds)( *chi1, *psi1 ); \
+				PASTEMAC(ch,adds)( *x, *y ); \
 \
-				chi1 += incx; \
-				psi1 += incy; \
+				x += incx; \
+				y += incy; \
 			} \
 		} \
 	} \
diff --git a/ref_kernels/1/bli_amaxv_ref.c b/ref_kernels/1/bli_amaxv_ref.c
index 4d249feb4..0fef14c73 100644
--- a/ref_kernels/1/bli_amaxv_ref.c
+++ b/ref_kernels/1/bli_amaxv_ref.c
@@ -44,11 +44,13 @@
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
              dim_t   n, \
-       const ctype*  x, inc_t incx, \
+       const void*   x0, inc_t incx, \
              dim_t*  index, \
        const cntx_t* cntx  \
      ) \
 { \
+	const ctype*   x         = x0; \
+\
 	const ctype_r* minus_one = PASTEMAC(chr,m1); \
 	const dim_t*   zero_i    = PASTEMAC(i,0); \
 \
diff --git a/ref_kernels/1/bli_axpbyv_ref.c b/ref_kernels/1/bli_axpbyv_ref.c
index 56354bf48..8c4340161 100644
--- a/ref_kernels/1/bli_axpbyv_ref.c
+++ b/ref_kernels/1/bli_axpbyv_ref.c
@@ -41,14 +41,19 @@ void PASTEMAC3(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              dim_t   n, \
-       const ctype*  alpha, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  beta, \
-             ctype*  y, inc_t incy, \
+       const void*   alpha0, \
+       const void*   x0, inc_t incx, \
+       const void*   beta0, \
+             void*   y0, inc_t incy, \
        const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
+\
+	const ctype* alpha = alpha0; \
+	const ctype* x     = x0; \
+	const ctype* beta  = beta0; \
+	      ctype* y     = y0; \
 \
 	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
 	{ \
@@ -59,15 +64,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			const ctype* zero = PASTEMAC(ch,0); \
 \
 			/* Query the context for the kernel function pointer. */ \
-			const num_t             dt     = PASTEMAC(ch,type); \
-			PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \
+			const num_t dt     = PASTEMAC(ch,type); \
+			setv_ker_ft setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \
 \
 			setv_p \
 			( \
 			  BLIS_NO_CONJUGATE, \
 			  n, \
 			  zero, \
-			  y, incy, \
+			  y0, incy, \
 			  cntx  \
 			); \
 			return; \
@@ -82,15 +87,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			/* If alpha is zero, scale by beta. */ \
 \
 			/* Query the context for the kernel function pointer. */ \
-			const num_t              dt      = PASTEMAC(ch,type); \
-			PASTECH(ch,scalv_ker_ft) scalv_p = bli_cntx_get_ukr_dt( dt, BLIS_SCALV_KER, cntx ); \
+			const num_t  dt      = PASTEMAC(ch,type); \
+			scalv_ker_ft scalv_p = bli_cntx_get_ukr_dt( dt, BLIS_SCALV_KER, cntx ); \
 \
 			scalv_p \
 			( \
 			  BLIS_NO_CONJUGATE, \
 			  n, \
-			  beta, \
-			  y, incy, \
+			  beta0, \
+			  y0, incy, \
 			  cntx  \
 			); \
 			return; \
@@ -104,15 +109,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			/* If alpha is one and beta is zero, use copyv. */ \
 \
 			/* Query the context for the kernel function pointer. */ \
-			const num_t              dt      = PASTEMAC(ch,type); \
-			PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \
+			const num_t  dt      = PASTEMAC(ch,type); \
+			copyv_ker_ft copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \
 \
 			copyv_p \
 			( \
 			  conjx, \
 			  n, \
-			  x, incx, \
-			  y, incy, \
+			  x0, incx, \
+			  y0, incy, \
 			  cntx  \
 			); \
 			return; \
@@ -122,15 +127,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			/* If alpha is one and beta is one, use addv. */ \
 \
 			/* Query the context for the kernel function pointer. */ \
-			const num_t             dt     = PASTEMAC(ch,type); \
-			PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \
+			const num_t dt     = PASTEMAC(ch,type); \
+			addv_ker_ft addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \
 \
 			addv_p \
 			( \
 			  conjx, \
 			  n, \
-			  x, incx, \
-			  y, incy, \
+			  x0, incx, \
+			  y0, incy, \
 			  cntx  \
 			); \
 			return; \
@@ -140,16 +145,16 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			/* If alpha is one and beta is something else, use xpbyv. */ \
 \
 			/* Query the context for the kernel function pointer. */ \
-			const num_t              dt      = PASTEMAC(ch,type); \
-			PASTECH(ch,xpbyv_ker_ft) xpbyv_p = bli_cntx_get_ukr_dt( dt, BLIS_XPBYV_KER, cntx ); \
+			const num_t  dt      = PASTEMAC(ch,type); \
+			xpbyv_ker_ft xpbyv_p = bli_cntx_get_ukr_dt( dt, BLIS_XPBYV_KER, cntx ); \
 \
 			xpbyv_p \
 			( \
 			  conjx, \
 			  n, \
-			  x, incx, \
-			  beta, \
-			  y, incy, \
+			  x0, incx, \
+			  beta0, \
+			  y0, incy, \
 			  cntx  \
 			); \
 			return; \
@@ -162,16 +167,16 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			/* If alpha is something else and beta is zero, use scal2v. */ \
 \
 			/* Query the context for the kernel function pointer. */ \
-			const num_t               dt       = PASTEMAC(ch,type); \
-			PASTECH(ch,scal2v_ker_ft) scal2v_p = bli_cntx_get_ukr_dt( dt, BLIS_SCAL2V_KER, cntx ); \
+			const num_t   dt       = PASTEMAC(ch,type); \
+			scal2v_ker_ft scal2v_p = bli_cntx_get_ukr_dt( dt, BLIS_SCAL2V_KER, cntx ); \
 \
 			scal2v_p \
 			( \
 			  conjx, \
 			  n, \
-			  alpha, \
-			  x, incx, \
-			  y, incy, \
+			  alpha0, \
+			  x0, incx, \
+			  y0, incy, \
 			  cntx  \
 			); \
 			return; \
@@ -181,16 +186,16 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			/* If alpha is something else and beta is one, use axpyv. */ \
 \
 			/* Query the context for the kernel function pointer. */ \
-			const num_t              dt      = PASTEMAC(ch,type); \
-			PASTECH(ch,axpyv_ker_ft) axpyv_p = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+			const num_t  dt      = PASTEMAC(ch,type); \
+			axpyv_ker_ft axpyv_p = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 			axpyv_p \
 			( \
 			  conjx, \
 			  n, \
-			  alpha, \
-			  x, incx, \
-			  y, incy, \
+			  alpha0, \
+			  x0, incx, \
+			  y0, incy, \
 			  cntx  \
 			); \
 			return; \
diff --git a/ref_kernels/1/bli_axpyv_ref.c b/ref_kernels/1/bli_axpyv_ref.c
index 547aa55cf..52b6fd44e 100644
--- a/ref_kernels/1/bli_axpyv_ref.c
+++ b/ref_kernels/1/bli_axpyv_ref.c
@@ -34,97 +34,6 @@
 
 #include "blis.h"
 
-#if 0
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       conj_t           conjx, \
-       dim_t            n, \
-       ctype*  restrict alpha, \
-       ctype*  restrict x, inc_t incx, \
-       ctype*  restrict y, inc_t incy, \
-       cntx_t*          cntx  \
-     ) \
-{ \
-	if ( bli_zero_dim1( n ) ) return; \
-\
-	/* If alpha is zero, return. */ \
-	if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \
-\
-	/* If alpha is one, use addv. */ \
-	if ( PASTEMAC(ch,eq1)( *alpha ) ) \
-	{ \
-		/* Query the context for the kernel function pointer. */ \
-		const num_t             dt     = PASTEMAC(ch,type); \
-		PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \
-\
-		addv_p \
-		( \
-		  conjx, \
-		  n, \
-		  x, incx, \
-		  y, incy, \
-		  cntx  \
-		); \
-		return; \
-	} \
-\
-	ctype* restrict chi1 = x; \
-	ctype* restrict psi1 = y; \
-\
-	if ( bli_is_conj( conjx ) ) \
-	{ \
-		if ( incx == 1 && incy == 1 ) \
-		{ \
-			PRAGMA_SIMD \
-			for ( dim_t i = 0; i < n; ++i ) \
-			{ \
-				/*PASTEMAC(ch,axpyjs)( *alpha, chi1[i], psi1[i] );*/ \
-				psi1[i] = fma( *alpha, chi1[i], psi1[i] ); \
-			} \
-		} \
-		else \
-		{ \
-			for ( dim_t i = 0; i < n; ++i ) \
-			{ \
-				PASTEMAC(ch,axpyjs)( *alpha, *chi1, *psi1 ); \
-\
-				chi1 += incx; \
-				psi1 += incy; \
-			} \
-		} \
-	} \
-	else \
-	{ \
-		if ( incx == 1 && incy == 1 ) \
-		{ \
-			PRAGMA_SIMD \
-			for ( dim_t i = 0; i < n; ++i ) \
-			{ \
-				/*PASTEMAC(ch,axpys)( *alpha, chi1[i], psi1[i] );*/ \
-				psi1[i] = fma( *alpha, chi1[i], psi1[i] ); \
-			} \
-		} \
-		else \
-		{ \
-			for ( dim_t i = 0; i < n; ++i ) \
-			{ \
-				PASTEMAC(ch,axpys)( *alpha, *chi1, *psi1 ); \
-\
-				chi1 += incx; \
-				psi1 += incy; \
-			} \
-		} \
-	} \
-}
-
-//INSERT_GENTFUNC_BASIC2( axpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-GENTFUNC( float,    s, axpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-GENTFUNC( double,   d, axpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-#endif
-
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
@@ -132,13 +41,17 @@ void PASTEMAC3(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              dim_t   n, \
-       const ctype*  alpha, \
-       const ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
+       const void*   alpha0, \
+       const void*   x0, inc_t incx, \
+             void*   y0, inc_t incy, \
        const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
+\
+	const ctype* alpha = alpha0; \
+	const ctype* x     = x0; \
+	      ctype* y     = y0; \
 \
 	/* If alpha is zero, return. */ \
 	if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \
@@ -147,15 +60,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	if ( PASTEMAC(ch,eq1)( *alpha ) ) \
 	{ \
 		/* Query the context for the kernel function pointer. */ \
-		const num_t             dt     = PASTEMAC(ch,type); \
-		PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \
+		const num_t dt     = PASTEMAC(ch,type); \
+		addv_ker_ft addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \
 \
 		addv_p \
 		( \
 		  conjx, \
 		  n, \
-		  x, incx, \
-		  y, incy, \
+		  x0, incx, \
+		  y0, incy, \
 		  cntx  \
 		); \
 		return; \
diff --git a/ref_kernels/1/bli_copyv_ref.c b/ref_kernels/1/bli_copyv_ref.c
index 6ec81b69d..8285d9956 100644
--- a/ref_kernels/1/bli_copyv_ref.c
+++ b/ref_kernels/1/bli_copyv_ref.c
@@ -41,12 +41,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              dim_t   n, \
-       const ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
+       const void*   x0, inc_t incx, \
+             void*   y0, inc_t incy, \
        const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
+\
+	const ctype* x = x0; \
+	      ctype* y = y0; \
 \
 	if ( bli_is_conj( conjx ) ) \
 	{ \
diff --git a/ref_kernels/1/bli_dotv_ref.c b/ref_kernels/1/bli_dotv_ref.c
index f9e95f68b..f7436f88e 100644
--- a/ref_kernels/1/bli_dotv_ref.c
+++ b/ref_kernels/1/bli_dotv_ref.c
@@ -42,12 +42,16 @@ void PASTEMAC3(ch,opname,arch,suf) \
              conj_t  conjx, \
              conj_t  conjy, \
              dim_t   n, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  y, inc_t incy, \
-             ctype*  rho, \
+       const void*   x0, inc_t incx, \
+       const void*   y0, inc_t incy, \
+             void*   rho0, \
        const cntx_t* cntx  \
      ) \
 { \
+	const ctype* x   = x0; \
+	const ctype* y   = y0; \
+	      ctype* rho = rho0; \
+\
 	ctype dotxy; \
 \
 	if ( bli_zero_dim1( n ) ) \
diff --git a/ref_kernels/1/bli_dotxv_ref.c b/ref_kernels/1/bli_dotxv_ref.c
index d390c1602..56d3f457d 100644
--- a/ref_kernels/1/bli_dotxv_ref.c
+++ b/ref_kernels/1/bli_dotxv_ref.c
@@ -42,14 +42,20 @@ void PASTEMAC3(ch,opname,arch,suf) \
              conj_t  conjx, \
              conj_t  conjy, \
              dim_t   n, \
-       const ctype*  alpha, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  y, inc_t incy, \
-       const ctype*  beta, \
-             ctype*  rho, \
+       const void*   alpha0, \
+       const void*   x0, inc_t incx, \
+       const void*   y0, inc_t incy, \
+       const void*   beta0, \
+             void*   rho0, \
        const cntx_t* cntx  \
      ) \
 { \
+	const ctype* alpha = alpha0; \
+	const ctype* x     = x0; \
+	const ctype* y     = y0; \
+	const ctype* beta  = beta0; \
+	      ctype* rho   = rho0; \
+\
 	ctype dotxy; \
 \
 	/* If beta is zero, clear rho. Otherwise, scale by beta. */ \
diff --git a/ref_kernels/1/bli_invertv_ref.c b/ref_kernels/1/bli_invertv_ref.c
index 2c5d04187..fa914e653 100644
--- a/ref_kernels/1/bli_invertv_ref.c
+++ b/ref_kernels/1/bli_invertv_ref.c
@@ -40,11 +40,13 @@
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
              dim_t   n, \
-             ctype*  x, inc_t incx, \
+             void*   x0, inc_t incx, \
        const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
+\
+	ctype* x = x0; \
 \
 	if ( incx == 1 ) \
 	{ \
diff --git a/ref_kernels/1/bli_invscalv_ref.c b/ref_kernels/1/bli_invscalv_ref.c
index 993c79dcf..6096ff20b 100644
--- a/ref_kernels/1/bli_invscalv_ref.c
+++ b/ref_kernels/1/bli_invscalv_ref.c
@@ -41,12 +41,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
      ( \
              conj_t  conjalpha, \
              dim_t   n, \
-       const ctype*  alpha, \
-             ctype*  x, inc_t incx, \
+       const void*   alpha0, \
+             void*   x0, inc_t incx, \
        const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
+\
+	const ctype* alpha = alpha0; \
+	      ctype* x     = x0; \
 \
 	/* If alpha is one, return. */ \
 	if ( PASTEMAC(ch,eq1)( *alpha ) ) return; \
diff --git a/ref_kernels/1/bli_scal2v_ref.c b/ref_kernels/1/bli_scal2v_ref.c
index 40d25cae3..6aa519e97 100644
--- a/ref_kernels/1/bli_scal2v_ref.c
+++ b/ref_kernels/1/bli_scal2v_ref.c
@@ -41,13 +41,17 @@ void PASTEMAC3(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              dim_t   n, \
-       const ctype*  alpha, \
-       const ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
+       const void*   alpha0, \
+       const void*   x0, inc_t incx, \
+             void*   y0, inc_t incy, \
        const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
+\
+	const ctype* alpha = alpha0; \
+	const ctype* x     = x0; \
+	      ctype* y     = y0; \
 \
 	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
 	{ \
@@ -56,15 +60,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		const ctype* zero = PASTEMAC(ch,0); \
 \
 		/* Query the context for the kernel function pointer. */ \
-		const num_t             dt     = PASTEMAC(ch,type); \
-		PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \
+		const num_t dt     = PASTEMAC(ch,type); \
+		setv_ker_ft setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \
 \
 		setv_p \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  n, \
 		  zero, \
-		  y, incy, \
+		  y0, incy, \
 		  cntx  \
 		); \
 		return; \
@@ -74,15 +78,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		/* If alpha is one, use copyv. */ \
 \
 		/* Query the context for the kernel function pointer. */ \
-		const num_t              dt      = PASTEMAC(ch,type); \
-		PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \
+		const num_t  dt      = PASTEMAC(ch,type); \
+		copyv_ker_ft copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \
 \
 		copyv_p \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  n, \
-		  x, incx, \
-		  y, incy, \
+		  x0, incx, \
+		  y0, incy, \
 		  cntx  \
 		); \
 		return; \
diff --git a/ref_kernels/1/bli_scalv_ref.c b/ref_kernels/1/bli_scalv_ref.c
index f096dd80a..84a7ec83f 100644
--- a/ref_kernels/1/bli_scalv_ref.c
+++ b/ref_kernels/1/bli_scalv_ref.c
@@ -41,12 +41,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
      ( \
              conj_t  conjalpha, \
              dim_t   n, \
-       const ctype*  alpha, \
-             ctype*  x, inc_t incx, \
+       const void*   alpha0, \
+             void*   x0, inc_t incx, \
        const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
+\
+	const ctype* alpha = alpha0; \
+	      ctype* x     = x0; \
 \
 	/* If alpha is one, return. */ \
 	if ( PASTEMAC(ch,eq1)( *alpha ) ) return; \
@@ -57,15 +60,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		const ctype* zero = PASTEMAC(ch,0); \
 \
 		/* Query the context for the kernel function pointer. */ \
-		const num_t             dt     = PASTEMAC(ch,type); \
-		PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \
+		const num_t dt     = PASTEMAC(ch,type); \
+		setv_ker_ft setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \
 \
 		setv_p \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  n, \
 		  zero, \
-		  x, incx, \
+		  x0, incx, \
 		  cntx  \
 		); \
 		return; \
diff --git a/ref_kernels/1/bli_setv_ref.c b/ref_kernels/1/bli_setv_ref.c
index 1e1252d7a..5e39faff4 100644
--- a/ref_kernels/1/bli_setv_ref.c
+++ b/ref_kernels/1/bli_setv_ref.c
@@ -41,12 +41,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
      ( \
              conj_t  conjalpha, \
              dim_t   n, \
-       const ctype*  alpha, \
-             ctype*  x, inc_t incx, \
+       const void*   alpha0, \
+             void*   x0, inc_t incx, \
        const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
+\
+	const ctype* alpha = alpha0; \
+	      ctype* x     = x0; \
 \
 	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
 	{ \
diff --git a/ref_kernels/1/bli_subv_ref.c b/ref_kernels/1/bli_subv_ref.c
index e4d4896fb..3e5ddcf1b 100644
--- a/ref_kernels/1/bli_subv_ref.c
+++ b/ref_kernels/1/bli_subv_ref.c
@@ -41,12 +41,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              dim_t   n, \
-       const ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
+       const void*   x0, inc_t incx, \
+             void*   y0, inc_t incy, \
        const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
+\
+	const ctype* restrict x = x0; \
+	      ctype* restrict y = y0; \
 \
 	if ( bli_is_conj( conjx ) ) \
 	{ \
diff --git a/ref_kernels/1/bli_swapv_ref.c b/ref_kernels/1/bli_swapv_ref.c
index 43bff0e46..4e6cedd8a 100644
--- a/ref_kernels/1/bli_swapv_ref.c
+++ b/ref_kernels/1/bli_swapv_ref.c
@@ -40,12 +40,15 @@
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
              dim_t   n, \
-             ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
+             void*   x0, inc_t incx, \
+             void*   y0, inc_t incy, \
        const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
+\
+	ctype* x = x0; \
+	ctype* y = y0; \
 \
 	if ( incx == 1 && incy == 1 ) \
 	{ \
diff --git a/ref_kernels/1/bli_xpbyv_ref.c b/ref_kernels/1/bli_xpbyv_ref.c
index 6baddf16c..46342c182 100644
--- a/ref_kernels/1/bli_xpbyv_ref.c
+++ b/ref_kernels/1/bli_xpbyv_ref.c
@@ -41,27 +41,31 @@ void PASTEMAC3(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              dim_t   n, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  beta, \
-             ctype*  y, inc_t incy, \
+       const void*   x0, inc_t incx, \
+       const void*   beta0, \
+             void*   y0, inc_t incy, \
        const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
+\
+	const ctype* x    = x0; \
+	const ctype* beta = beta0; \
+	      ctype* y    = y0; \
 \
 	/* If beta is zero, use copyv. */ \
 	if ( PASTEMAC(ch,eq0)( *beta ) ) \
 	{ \
 		/* Query the context for the kernel function pointer. */ \
-		const num_t              dt      = PASTEMAC(ch,type); \
-		PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \
+		const num_t  dt      = PASTEMAC(ch,type); \
+		copyv_ker_ft copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \
 \
 		copyv_p \
 		( \
 		  conjx, \
 		  n, \
-		  x, incx, \
-		  y, incy, \
+		  x0, incx, \
+		  y0, incy, \
 		  cntx  \
 		); \
 		return; \
@@ -70,15 +74,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	else if ( PASTEMAC(ch,eq1)( *beta ) ) \
 	{ \
 		/* Query the context for the kernel function pointer. */ \
-		const num_t             dt     = PASTEMAC(ch,type); \
-		PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \
+		const num_t dt     = PASTEMAC(ch,type); \
+		addv_ker_ft addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \
 \
 		addv_p \
 		( \
 		  conjx, \
 		  n, \
-		  x, incx, \
-		  y, incy, \
+		  x0, incx, \
+		  y0, incy, \
 		  cntx  \
 		); \
 		return; \
diff --git a/ref_kernels/1f/bli_axpy2v_ref.c b/ref_kernels/1f/bli_axpy2v_ref.c
index c87f976e7..9fb7a839f 100644
--- a/ref_kernels/1f/bli_axpy2v_ref.c
+++ b/ref_kernels/1f/bli_axpy2v_ref.c
@@ -43,15 +43,21 @@ void PASTEMAC3(ch,opname,arch,suf) \
              conj_t  conjx, \
              conj_t  conjy, \
              dim_t   n, \
-       const ctype*  alphax, \
-       const ctype*  alphay, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  y, inc_t incy, \
-             ctype*  z, inc_t incz, \
+       const void*   alphax0, \
+       const void*   alphay0, \
+       const void*   x0, inc_t incx, \
+       const void*   y0, inc_t incy, \
+             void*   z0, inc_t incz, \
        const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( n ) ) return; \
+\
+	const ctype* restrict alphax = alphax0; \
+	const ctype* restrict alphay = alphay0; \
+	const ctype* restrict x      = x0; \
+	const ctype* restrict y      = y0; \
+	      ctype* restrict z      = z0; \
 \
 	if ( incz == 1 && incx == 1 && incy == 1 ) \
 	{ \
@@ -107,10 +113,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	else \
 	{ \
 		/* Query the context for the kernel function pointer. */ \
-		const num_t              dt     = PASTEMAC(ch,type); \
-		PASTECH(ch,axpyv_ker_ft) kfp_av \
-		= \
-		bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+		const num_t  dt     = PASTEMAC(ch,type); \
+		axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 		kfp_av \
 		( \
diff --git a/ref_kernels/1f/bli_axpyf_ref.c b/ref_kernels/1f/bli_axpyf_ref.c
index b72fea107..ff8dd6bb4 100644
--- a/ref_kernels/1f/bli_axpyf_ref.c
+++ b/ref_kernels/1f/bli_axpyf_ref.c
@@ -44,14 +44,19 @@ void PASTEMAC3(ch,opname,arch,suf) \
              conj_t  conjx, \
              dim_t   m, \
              dim_t   b_n, \
-       const ctype*  alpha, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-       const ctype*  x, inc_t incx, \
-             ctype*  y, inc_t incy, \
+       const void*   alpha0, \
+       const void*   a0, inc_t inca, inc_t lda, \
+       const void*   x0, inc_t incx, \
+             void*   y0, inc_t incy, \
        const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( m ) ) return; \
+\
+	const ctype* restrict alpha = alpha0; \
+	const ctype* restrict a     = a0; \
+	const ctype* restrict x     = x0; \
+	      ctype* restrict y     = y0; \
 \
 	if ( inca == 1 && incx == 1 && incy == 1 && b_n == ff ) \
 	{ \
@@ -94,10 +99,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	else \
 	{ \
 		/* Query the context for the kernel function pointer. */ \
-		const num_t              dt     = PASTEMAC(ch,type); \
-		PASTECH(ch,axpyv_ker_ft) kfp_av \
-		= \
-		bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+		const num_t  dt     = PASTEMAC(ch,type); \
+		axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 		for ( dim_t i = 0; i < b_n; ++i ) \
 		{ \
diff --git a/ref_kernels/1f/bli_dotaxpyv_ref.c b/ref_kernels/1f/bli_dotaxpyv_ref.c
index 5fe620b12..105463ac4 100644
--- a/ref_kernels/1f/bli_dotaxpyv_ref.c
+++ b/ref_kernels/1f/bli_dotaxpyv_ref.c
@@ -44,15 +44,21 @@ void PASTEMAC3(ch,opname,arch,suf) \
              conj_t  conjx, \
              conj_t  conjy, \
              dim_t   m, \
-       const ctype*  alpha, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  y, inc_t incy, \
-             ctype*  rho, \
-             ctype*  z, inc_t incz, \
+       const void*   alpha0, \
+       const void*   x0, inc_t incx, \
+       const void*   y0, inc_t incy, \
+             void*   rho0, \
+             void*   z0, inc_t incz, \
        const cntx_t* cntx  \
      ) \
 { \
 	if ( bli_zero_dim1( m ) ) return; \
+\
+	const ctype* restrict alpha = alpha0; \
+	const ctype* restrict x     = x0; \
+	const ctype* restrict y     = y0; \
+	      ctype* restrict rho   = rho0; \
+	      ctype* restrict z     = z0; \
 \
 	if ( incz == 1 && incx == 1 && incy == 1 ) \
 	{ \
@@ -129,13 +135,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	{ \
 \
 		/* Query the context for the kernel function pointer. */ \
-		const num_t              dt     = PASTEMAC(ch,type); \
-		PASTECH(ch,dotv_ker_ft)  kfp_dv \
-		= \
-		bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \
-		PASTECH(ch,axpyv_ker_ft) kfp_av \
-		= \
-		bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
+		const num_t  dt     = PASTEMAC(ch,type); \
+		dotv_ker_ft  kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \
+		axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
 \
 		kfp_dv \
 		( \
diff --git a/ref_kernels/1f/bli_dotxaxpyf_ref.c b/ref_kernels/1f/bli_dotxaxpyf_ref.c
index 6fb3c9aa8..73d2c036b 100644
--- a/ref_kernels/1f/bli_dotxaxpyf_ref.c
+++ b/ref_kernels/1f/bli_dotxaxpyf_ref.c
@@ -46,19 +46,27 @@ void PASTEMAC3(ch,opname,arch,suf) \
              conj_t  conjx, \
              dim_t   m, \
              dim_t   b_n, \
-       const ctype*  alpha, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-       const ctype*  w, inc_t incw, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  beta, \
-             ctype*  y, inc_t incy, \
-             ctype*  z, inc_t incz, \
+       const void*   alpha0, \
+       const void*   a0, inc_t inca, inc_t lda, \
+       const void*   w0, inc_t incw, \
+       const void*   x0, inc_t incx, \
+       const void*   beta0, \
+             void*   y0, inc_t incy, \
+             void*   z0, inc_t incz, \
        const cntx_t* cntx  \
      ) \
 { \
 	/* A is m x n.                   */ \
 	/* y = beta * y + alpha * A^T w; */ \
 	/* z =        z + alpha * A   x; */ \
+\
+	const ctype* restrict alpha = alpha0; \
+	const ctype* restrict a     = a0; \
+	const ctype* restrict w     = w0; \
+	const ctype* restrict x     = x0; \
+	const ctype* restrict beta  = beta0; \
+	      ctype* restrict y     = y0; \
+	      ctype* restrict z     = z0; \
 \
 	if ( 1 && inca == 1 && incw == 1 && incx == 1 && \
 	     incy == 1 && incz == 1 && b_n == ff ) \
@@ -162,13 +170,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	else \
 	{ \
 		/* Query the context for the kernel function pointer. */ \
-		const num_t              dt     = PASTEMAC(ch,type); \
-		PASTECH(ch,dotxf_ker_ft) kfp_df \
-		= \
-		bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \
-		PASTECH(ch,axpyf_ker_ft) kfp_af \
-		= \
-		bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \
+		const num_t  dt     = PASTEMAC(ch,type); \
+		dotxf_ker_ft kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \
+		axpyf_ker_ft kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \
 \
 		kfp_df \
 		( \
diff --git a/ref_kernels/1f/bli_dotxf_ref.c b/ref_kernels/1f/bli_dotxf_ref.c
index 7986cd86f..54fcd3ed6 100644
--- a/ref_kernels/1f/bli_dotxf_ref.c
+++ b/ref_kernels/1f/bli_dotxf_ref.c
@@ -44,14 +44,20 @@ void PASTEMAC3(ch,opname,arch,suf) \
              conj_t  conjx, \
              dim_t   m, \
              dim_t   b_n, \
-       const ctype*  alpha, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-       const ctype*  x, inc_t incx, \
-       const ctype*  beta, \
-             ctype*  y, inc_t incy, \
+       const void*   alpha0, \
+       const void*   a0, inc_t inca, inc_t lda, \
+       const void*   x0, inc_t incx, \
+       const void*   beta0, \
+             void*   y0, inc_t incy, \
        const cntx_t* cntx  \
      ) \
 { \
+	const ctype* restrict alpha = alpha0; \
+	const ctype* restrict a     = a0; \
+	const ctype* restrict x     = x0; \
+	const ctype* restrict beta  = beta0; \
+	      ctype* restrict y     = y0; \
+\
 	if ( inca == 1 && incx == 1 && incy == 1 && b_n == ff ) \
 	{ \
 		ctype r[ ff ]; \
@@ -110,10 +116,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	else \
 	{ \
 		/* Query the context for the kernel function pointer. */ \
-		const num_t              dt     = PASTEMAC(ch,type); \
-		PASTECH(ch,dotxv_ker_ft) kfp_dv \
-		= \
-		bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
+		const num_t  dt     = PASTEMAC(ch,type); \
+		dotxv_ker_ft kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
 \
 		for ( dim_t i = 0; i < b_n; ++i ) \
 		{ \
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
index 42bf454c8..c3385032e 100644
--- a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
+++ b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
@@ -128,9 +128,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
              bool    invdiag, \
              dim_t   cdim, \
              dim_t   n_max, \
-       const ctype*  kappa, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-             ctype*  p,             inc_t ldp, \
+       const void*   kappa, \
+       const void*   a, inc_t inca, inc_t lda, \
+             void*   p,             inc_t ldp, \
        const cntx_t* cntx \
      ) \
 { \
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ref.c
index 060ba3fdb..1285f82da 100644
--- a/ref_kernels/1m/bli_packm_cxc_diag_ref.c
+++ b/ref_kernels/1m/bli_packm_cxc_diag_ref.c
@@ -66,9 +66,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
              bool    invdiag, \
              dim_t   cdim, \
              dim_t   n_max, \
-       const ctype*  kappa, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-             ctype*  p,             inc_t ldp, \
+       const void*   kappa, \
+       const void*   a, inc_t inca, inc_t lda, \
+             void*   p,             inc_t ldp, \
        const cntx_t* cntx  \
      ) \
 { \
diff --git a/ref_kernels/1m/bli_packm_cxk_1er_ref.c b/ref_kernels/1m/bli_packm_cxk_1er_ref.c
index 53067983c..b77ef6965 100644
--- a/ref_kernels/1m/bli_packm_cxk_1er_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_1er_ref.c
@@ -87,9 +87,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
              dim_t   cdim, \
              dim_t   n, \
              dim_t   n_max, \
-       const ctype*  kappa, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-             ctype*  p,             inc_t ldp, \
+       const void*   kappa, \
+       const void*   a, inc_t inca, inc_t lda, \
+             void*   p,             inc_t ldp, \
        const cntx_t* cntx  \
      ) \
 { \
diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c
index fcf9c5360..b49856a21 100644
--- a/ref_kernels/1m/bli_packm_cxk_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_ref.c
@@ -62,9 +62,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
              dim_t   cdim, \
              dim_t   n, \
              dim_t   n_max, \
-       const ctype*  kappa, \
-       const ctype*  a, inc_t inca, inc_t lda, \
-             ctype*  p,             inc_t ldp, \
+       const void*   kappa, \
+       const void*   a, inc_t inca, inc_t lda, \
+             void*   p,             inc_t ldp, \
        const cntx_t* cntx  \
      ) \
 { \
diff --git a/ref_kernels/1m/bli_unpackm_cxk_ref.c b/ref_kernels/1m/bli_unpackm_cxk_ref.c
index 9a63f6971..5c7c9c430 100644
--- a/ref_kernels/1m/bli_unpackm_cxk_ref.c
+++ b/ref_kernels/1m/bli_unpackm_cxk_ref.c
@@ -60,9 +60,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
              pack_t  schema, \
              dim_t   cdim, \
              dim_t   n, \
-       const ctype*  kappa, \
-       const ctype*  p,             inc_t ldp, \
-             ctype*  a, inc_t inca, inc_t lda, \
+       const void*   kappa, \
+       const void*   p,             inc_t ldp, \
+             void*   a, inc_t inca, inc_t lda, \
        const cntx_t* cntx  \
      ) \
 { \
diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c
index e8f4364cc..6bab6c812 100644
--- a/ref_kernels/3/bli_gemm_ref.c
+++ b/ref_kernels/3/bli_gemm_ref.c
@@ -45,15 +45,21 @@ static void PASTEMAC3(ch,opname,arch,suf) \
              dim_t      m, \
              dim_t      n, \
              dim_t      k, \
-       const ctype*     alpha, \
-       const ctype*     a, \
-       const ctype*     b, \
-       const ctype*     beta, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
+       const void*      alpha0, \
+       const void*      a0, \
+       const void*      b0, \
+       const void*      beta0, \
+             void*      c0, inc_t rs_c, inc_t cs_c, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
+	const ctype* alpha = alpha0; \
+	const ctype* a     = a0; \
+	const ctype* b     = b0; \
+	const ctype* beta  = beta0; \
+	      ctype* c     = c0; \
+\
 	const num_t dt     = PASTEMAC(ch,type); \
 \
 	const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
@@ -148,15 +154,20 @@ void PASTEMAC3(ch,opname,arch,suf) \
              dim_t      m, \
              dim_t      n, \
              dim_t      k, \
-       const ctype*     alpha, \
-       const ctype*     a, \
-       const ctype*     b, \
-       const ctype*     beta, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
+       const void*      alpha0, \
+       const void*      a0, \
+       const void*      b0, \
+       const void*      beta0, \
+             void*      c0, inc_t rs_c, inc_t cs_c, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
+	const ctype* alpha = alpha0; \
+	const ctype* a     = a0; \
+	const ctype* b     = b0; \
+	const ctype* beta  = beta0; \
+	      ctype* c     = c0; \
 \
 	const dim_t mr = PASTECH(BLIS_MR_,ch); \
 	const dim_t nr = PASTECH(BLIS_NR_,ch); \
diff --git a/ref_kernels/3/bli_gemmsup_ref.c b/ref_kernels/3/bli_gemmsup_ref.c
index aaa6ff742..934fb10ad 100644
--- a/ref_kernels/3/bli_gemmsup_ref.c
+++ b/ref_kernels/3/bli_gemmsup_ref.c
@@ -48,15 +48,21 @@ void PASTEMAC3(ch,opname,arch,suf) \
              dim_t      m, \
              dim_t      n, \
              dim_t      k, \
-       const ctype*     alpha, \
-       const ctype*     a, inc_t rs_a, inc_t cs_a, \
-       const ctype*     b, inc_t rs_b, inc_t cs_b, \
-       const ctype*     beta, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
+       const void*      alpha0, \
+       const void*      a0, inc_t rs_a, inc_t cs_a, \
+       const void*      b0, inc_t rs_b, inc_t cs_b, \
+       const void*      beta0, \
+             void*      c0, inc_t rs_c, inc_t cs_c, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
+	const ctype* restrict alpha = alpha0; \
+	const ctype* restrict a     = a0; \
+	const ctype* restrict b     = b0; \
+	const ctype* restrict beta  = beta0; \
+	      ctype* restrict c     = c0; \
+\
 	/* NOTE: This microkernel can actually handle arbitrarily large
 	   values of m, n, and k. */ \
 \
@@ -253,15 +259,21 @@ void PASTEMAC3(ch,opname,arch,suf) \
              dim_t      m, \
              dim_t      n, \
              dim_t      k, \
-       const ctype*     alpha, \
-       const ctype*     a, inc_t rs_a, inc_t cs_a, \
-       const ctype*     b, inc_t rs_b, inc_t cs_b, \
-       const ctype*     beta, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
+       const void*      alpha0, \
+       const void*      a0, inc_t rs_a, inc_t cs_a, \
+       const void*      b0, inc_t rs_b, inc_t cs_b, \
+       const void*      beta0, \
+             void*      c0, inc_t rs_c, inc_t cs_c, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
+	const ctype* restrict alpha = alpha0; \
+	const ctype* restrict a     = a0; \
+	const ctype* restrict b     = b0; \
+	const ctype* restrict beta  = beta0; \
+	      ctype* restrict c     = c0; \
+\
 	/* NOTE: This microkernel can actually handle arbitrarily large
 	   values of m, n, and k. */ \
 \
diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c
index 14ff03780..c57ea5ae8 100644
--- a/ref_kernels/3/bli_gemmtrsm_ref.c
+++ b/ref_kernels/3/bli_gemmtrsm_ref.c
@@ -45,16 +45,23 @@ void PASTEMAC3(ch,opname,arch,suf) \
              dim_t      m, \
              dim_t      n, \
              dim_t      k, \
-       const ctype*     alpha, \
-       const ctype*     a1x, \
-       const ctype*     a11, \
-       const ctype*     bx1, \
-             ctype*     b11, \
-             ctype*     c11, inc_t rs_c, inc_t cs_c, \
+       const void*      alpha0, \
+       const void*      a1x0, \
+       const void*      a110, \
+       const void*      bx10, \
+             void*      b110, \
+             void*      c110, inc_t rs_c, inc_t cs_c, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
+	const ctype* alpha = alpha0; \
+	const ctype* a1x   = a1x0; \
+	const ctype* a11   = a110; \
+	const ctype* bx1   = bx10; \
+	      ctype* b11   = b110; \
+	      ctype* c11   = c110; \
+\
 	const num_t dt     = PASTEMAC(ch,type); \
 \
 	const dim_t mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
@@ -71,10 +78,8 @@ printf( "bli_gemmtrsm_ref(): k nr = %d %d\n", (int)k, (int)nr ); \
 \
 	const ctype* minus_one = PASTEMAC(ch,m1); \
 \
-	PASTECH(ch,gemm_ukr_ft) \
-	            gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	PASTECH(ch,trsm_ukr_ft) \
-	            trsm_ukr = bli_cntx_get_ukr_dt( dt, trsmkerid, cntx ); \
+	gemm_ukr_ft gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	trsm_ukr_ft trsm_ukr = bli_cntx_get_ukr_dt( dt, trsmkerid, cntx ); \
 \
 /*
 PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b01", k, nr, \
diff --git a/ref_kernels/3/bli_trsm_ref.c b/ref_kernels/3/bli_trsm_ref.c
index 5a726b1da..547582190 100644
--- a/ref_kernels/3/bli_trsm_ref.c
+++ b/ref_kernels/3/bli_trsm_ref.c
@@ -42,13 +42,17 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       const ctype*     a, \
-             ctype*     b, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
+       const void*      a0, \
+             void*      b0, \
+             void*      c0, inc_t rs_c, inc_t cs_c, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
+	const ctype*    a      = a0; \
+	      ctype*    b      = b0; \
+	      ctype*    c      = c0; \
+\
 	const num_t     dt     = PASTEMAC(ch,type); \
 \
 	const dim_t     mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
@@ -126,13 +130,17 @@ INSERT_GENTFUNC_BASIC3( trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals )
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t*          data, \
-       cntx_t*             cntx  \
+       const void*      a0, \
+             void*      b0, \
+             void*      c0, inc_t rs_c, inc_t cs_c, \
+             auxinfo_t* data, \
+       const cntx_t*    cntx  \
      ) \
 { \
+	const ctype*    a      = a0; \
+	      ctype*    b      = b0; \
+	      ctype*    c      = c0; \
+\
 	const num_t     dt     = PASTEMAC(ch,type); \
 \
 	const dim_t     mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c
index 11c3091e9..41135f5ea 100644
--- a/ref_kernels/bli_cntx_ref.c
+++ b/ref_kernels/bli_cntx_ref.c
@@ -38,159 +38,181 @@
 // -- Instantiate kernel prototypes for the current architecture ---------------
 
 // Define macros to construct the full symbol name from the operation name.
-#undef  GENARNAME             // architecture, _ref (no bli_)
+#undef  GENARNAME             // opname, architecture, _ref (no bli_)
 #define GENARNAME(opname)     PASTECH2(opname,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX)
-#undef  GENBARNAME            // bli_, architecture, _ref
+#undef  GENTARNAME            // bli_, ch, opname, architecture, _ref
+#define GENTARNAME(ch,opname) PASTEMAC3(ch,opname,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX)
+#undef  GENBARNAME            // bli_, opname, architecture, _ref
 #define GENBARNAME(opname)    PASTEMAC2(opname,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX)
-#undef  GENBAINAME            // bli_, architecture, _ind
+#undef  GENBAINAME            // bli_, opname, architecture, _ind
 #define GENBAINAME(opname)    PASTEMAC2(opname,BLIS_CNAME_INFIX,BLIS_IND_SUFFIX)
 
+// Define a prototype-inserting template that uses an arbitrary prototype-
+// generating macro.
+
+#undef  INSERT_PROTMAC_BASIC0
+#define INSERT_PROTMAC_BASIC0( protmac, kername ) \
+\
+protmac( float,    s, kername ) \
+protmac( double,   d, kername ) \
+protmac( scomplex, c, kername ) \
+protmac( dcomplex, z, kername )
+
+
 // -- Level-3 native micro-kernel prototype redefinitions ----------------------
 
-// -- Prototypes for completely generic level-3 microkernels --
+// -- Construct arch-specific names for reference level-3 microkernels --
 
-#undef  gemm_ukr_name
 #define gemm_ukr_name       GENARNAME(gemm)
-#undef  gemmtrsm_l_ukr_name
 #define gemmtrsm_l_ukr_name GENARNAME(gemmtrsm_l)
-#undef  gemmtrsm_u_ukr_name
 #define gemmtrsm_u_ukr_name GENARNAME(gemmtrsm_u)
-#undef  trsm_l_ukr_name
 #define trsm_l_ukr_name     GENARNAME(trsm_l)
-#undef  trsm_u_ukr_name
 #define trsm_u_ukr_name     GENARNAME(trsm_u)
 
-// Instantiate prototypes for above functions via the native micro-kernel API
-// template.
-#include "bli_l3_ukr.h"
+// Instantiate prototypes for above functions using the pre-defined level-3
+// microkernel prototype-generating macros.
+
+INSERT_PROTMAC_BASIC0( GEMM_UKR_PROT,     gemm_ukr_name )
+INSERT_PROTMAC_BASIC0( GEMMTRSM_UKR_PROT, gemmtrsm_l_ukr_name )
+INSERT_PROTMAC_BASIC0( GEMMTRSM_UKR_PROT, gemmtrsm_u_ukr_name )
+INSERT_PROTMAC_BASIC0( TRSM_UKR_PROT,     trsm_l_ukr_name )
+INSERT_PROTMAC_BASIC0( TRSM_UKR_PROT,     trsm_u_ukr_name )
+
 
 // -- Level-3 virtual micro-kernel prototype redefinitions ---------------------
 
-// -- Prototypes for induced method level-3 microkernels --
+// -- Construct arch-specific names for reference virtual level-3 microkernels --
 
 // -- 1m --
 
-#undef  gemm1m_ukr_name
 #define gemm1m_ukr_name        GENARNAME(gemm1m)
-#undef  gemmtrsm1m_l_ukr_name
 #define gemmtrsm1m_l_ukr_name  GENARNAME(gemmtrsm1m_l)
-#undef  gemmtrsm1m_u_ukr_name
 #define gemmtrsm1m_u_ukr_name  GENARNAME(gemmtrsm1m_u)
-#undef  trsm1m_l_ukr_name
 #define trsm1m_l_ukr_name      GENARNAME(trsm1m_l)
-#undef  trsm1m_u_ukr_name
 #define trsm1m_u_ukr_name      GENARNAME(trsm1m_u)
 
-// Instantiate prototypes for above functions via the virtual micro-kernel API
-// template.
-#include "bli_l3_ind_ukr.h"
+// Instantiate prototypes for above functions using the pre-defined level-3
+// microkernel prototype-generating macros.
 
-// -- Level-3 small/unpacked micro-kernel prototype definitions ----------------
+// -- 1m --
 
-// NOTE: This results in redundant prototypes for gemmsup_r and gemmsup_c
-// kernels, but since they will be identical the compiler won't complain.
+INSERT_PROTMAC_BASIC0( GEMM_UKR_PROT,     gemm1m_ukr_name )
+INSERT_PROTMAC_BASIC0( GEMMTRSM_UKR_PROT, gemmtrsm1m_l_ukr_name )
+INSERT_PROTMAC_BASIC0( GEMMTRSM_UKR_PROT, gemmtrsm1m_u_ukr_name )
+INSERT_PROTMAC_BASIC0( TRSM_UKR_PROT,     trsm1m_l_ukr_name )
+INSERT_PROTMAC_BASIC0( TRSM_UKR_PROT,     trsm1m_u_ukr_name )
 
-#undef  gemmsup_rv_ukr_name
-#define gemmsup_rv_ukr_name   GENARNAME(gemmsup_r)
-#undef  gemmsup_rg_ukr_name
-#define gemmsup_rg_ukr_name   GENARNAME(gemmsup_r)
-#undef  gemmsup_cv_ukr_name
-#define gemmsup_cv_ukr_name   GENARNAME(gemmsup_c)
-#undef  gemmsup_cg_ukr_name
-#define gemmsup_cg_ukr_name   GENARNAME(gemmsup_c)
 
-#undef  gemmsup_gx_ukr_name
-#define gemmsup_gx_ukr_name   GENARNAME(gemmsup_g)
+// -- Level-3 small/unpacked micro-kernel prototype definitions ----------------
 
-// Include the small/unpacked kernel API template.
-#include "bli_l3_sup_ker.h"
+// -- Construct arch-specific names for reference gemmsup kernels --
 
-// -- Level-1m (packm/unpackm) kernel prototype redefinitions ------------------
+#define gemmsup_rv_ukr_name  GENARNAME(gemmsup_r)
+#define gemmsup_rg_ukr_name  GENARNAME(gemmsup_r)
+#define gemmsup_cv_ukr_name  GENARNAME(gemmsup_c)
+#define gemmsup_cg_ukr_name  GENARNAME(gemmsup_c)
+#define gemmsup_gx_ukr_name  GENARNAME(gemmsup_g)
+
+// Instantiate prototypes for above functions using the pre-defined gemmsup
+// kernel prototype-generating macros.
+
+INSERT_PROTMAC_BASIC0( GEMMSUP_KER_PROT, gemmsup_rv_ukr_name )
+INSERT_PROTMAC_BASIC0( GEMMSUP_KER_PROT, gemmsup_rg_ukr_name )
+INSERT_PROTMAC_BASIC0( GEMMSUP_KER_PROT, gemmsup_cv_ukr_name )
+INSERT_PROTMAC_BASIC0( GEMMSUP_KER_PROT, gemmsup_cg_ukr_name )
+INSERT_PROTMAC_BASIC0( GEMMSUP_KER_PROT, gemmsup_gx_ukr_name )
 
-#undef  packm_mrxk_ker_name
-#define packm_mrxk_ker_name  GENARNAME(packm_mrxk)
-#undef  packm_nrxk_ker_name
-#define packm_nrxk_ker_name  GENARNAME(packm_nrxk)
 
-#undef  packm_mrxk_1er_ker_name
-#define packm_mrxk_1er_ker_name  GENARNAME(packm_mrxk_1er)
-#undef  packm_nrxk_1er_ker_name
-#define packm_nrxk_1er_ker_name  GENARNAME(packm_nrxk_1er)
+// -- Level-1m (packm/unpackm) kernel prototype redefinitions ------------------
 
-#undef  packm_mrxmr_diag_ker_name
-#define packm_mrxmr_diag_ker_name  GENARNAME(packm_mrxmr_diag)
-#undef  packm_nrxnr_diag_ker_name
-#define packm_nrxnr_diag_ker_name  GENARNAME(packm_nrxnr_diag)
+// -- Construct arch-specific names for reference packm kernels --
 
-#undef  packm_mrxmr_diag_1er_ker_name
+#define packm_mrxk_ker_name            GENARNAME(packm_mrxk)
+#define packm_nrxk_ker_name            GENARNAME(packm_nrxk)
+#define packm_mrxk_1er_ker_name        GENARNAME(packm_mrxk_1er)
+#define packm_nrxk_1er_ker_name        GENARNAME(packm_nrxk_1er)
+#define packm_mrxmr_diag_ker_name      GENARNAME(packm_mrxmr_diag)
+#define packm_nrxnr_diag_ker_name      GENARNAME(packm_nrxnr_diag)
 #define packm_mrxmr_diag_1er_ker_name  GENARNAME(packm_mrxmr_diag_1er)
-#undef  packm_nrxnr_diag_1er_ker_name
 #define packm_nrxnr_diag_1er_ker_name  GENARNAME(packm_nrxnr_diag_1er)
+#define unpackm_mrxk_ker_name          GENARNAME(unpackm_mrxk)
+#define unpackm_nrxk_ker_name          GENARNAME(unpackm_nrxk)
+
+// Instantiate prototypes for above functions using the pre-defined packm
+// kernel prototype-generating macros.
 
-#undef  unpackm_mrxk_ker_name
-#define unpackm_mrxk_ker_name  GENARNAME(unpackm_mrxk)
-#undef  unpackm_nrxk_ker_name
-#define unpackm_nrxk_ker_name  GENARNAME(unpackm_nrxk)
+INSERT_PROTMAC_BASIC0( PACKM_KER_PROT,      packm_mrxk_ker_name )
+INSERT_PROTMAC_BASIC0( PACKM_KER_PROT,      packm_nrxk_ker_name )
+INSERT_PROTMAC_BASIC0( PACKM_KER_PROT,      packm_mrxk_1er_ker_name )
+INSERT_PROTMAC_BASIC0( PACKM_KER_PROT,      packm_nrxk_1er_ker_name )
+INSERT_PROTMAC_BASIC0( PACKM_DIAG_KER_PROT, packm_mrxmr_diag_ker_name )
+INSERT_PROTMAC_BASIC0( PACKM_DIAG_KER_PROT, packm_nrxnr_diag_ker_name )
+INSERT_PROTMAC_BASIC0( PACKM_DIAG_KER_PROT, packm_mrxmr_diag_1er_ker_name )
+INSERT_PROTMAC_BASIC0( PACKM_DIAG_KER_PROT, packm_nrxnr_diag_1er_ker_name )
+INSERT_PROTMAC_BASIC0( UNPACKM_KER_PROT,    unpackm_mrxk_ker_name )
+INSERT_PROTMAC_BASIC0( UNPACKM_KER_PROT,    unpackm_nrxk_ker_name )
 
-// Instantiate prototypes for above functions via the level-1m kernel API
-// template.
-#include "bli_l1m_ker.h"
 
 // -- Level-1f kernel prototype redefinitions ----------------------------------
 
-#undef  axpy2v_ker_name
+// -- Construct arch-specific names for reference level-1f kernels --
+
 #define axpy2v_ker_name     GENARNAME(axpy2v)
-#undef  dotaxpyv_ker_name
-#define dotaxpyv_ker_name   GENARNAME(dotaxpyv)
-#undef  axpyf_ker_name
 #define axpyf_ker_name      GENARNAME(axpyf)
-#undef  dotxf_ker_name
-#define dotxf_ker_name      GENARNAME(dotxf)
-#undef  dotxaxpyf_ker_name
+#define dotaxpyv_ker_name   GENARNAME(dotaxpyv)
 #define dotxaxpyf_ker_name  GENARNAME(dotxaxpyf)
+#define dotxf_ker_name      GENARNAME(dotxf)
+
+// Instantiate prototypes for above functions using the pre-defined level-1f
+// kernel prototype-generating macros.
+
+INSERT_PROTMAC_BASIC0( AXPY2V_KER_PROT,     axpy2v_ker_name )
+INSERT_PROTMAC_BASIC0( AXPYF_KER_PROT,      axpyf_ker_name )
+INSERT_PROTMAC_BASIC0( DOTAXPYV_KER_PROT,   dotaxpyv_ker_name )
+INSERT_PROTMAC_BASIC0( DOTXAXPYF_KER_PROT,  dotxaxpyf_ker_name )
+INSERT_PROTMAC_BASIC0( DOTXF_KER_PROT,      dotxf_ker_name )
 
-// Instantiate prototypes for above functions via the level-1f kernel API
-// template.
-#include "bli_l1f_ker.h"
 
 // -- Level-1v kernel prototype redefinitions ----------------------------------
 
-// -- prototypes for completely generic level-1v kernels --
+// -- Construct arch-specific names for reference level-1v kernels --
 
-#undef  addv_ker_name
 #define addv_ker_name      GENARNAME(addv)
-#undef  amaxv_ker_name
 #define amaxv_ker_name     GENARNAME(amaxv)
-#undef  axpbyv_ker_name
 #define axpbyv_ker_name    GENARNAME(axpbyv)
-#undef  axpyv_ker_name
 #define axpyv_ker_name     GENARNAME(axpyv)
-#undef  copyv_ker_name
 #define copyv_ker_name     GENARNAME(copyv)
-#undef  dotv_ker_name
 #define dotv_ker_name      GENARNAME(dotv)
-#undef  dotxv_ker_name
 #define dotxv_ker_name     GENARNAME(dotxv)
-#undef  invertv_ker_name
 #define invertv_ker_name   GENARNAME(invertv)
-#undef  invscalv_ker_name
 #define invscalv_ker_name  GENARNAME(invscalv)
-#undef  scalv_ker_name
 #define scalv_ker_name     GENARNAME(scalv)
-#undef  scal2v_ker_name
 #define scal2v_ker_name    GENARNAME(scal2v)
-#undef  setv_ker_name
 #define setv_ker_name      GENARNAME(setv)
-#undef  subv_ker_name
 #define subv_ker_name      GENARNAME(subv)
-#undef  swapv_ker_name
 #define swapv_ker_name     GENARNAME(swapv)
-#undef  xpbyv_ker_name
 #define xpbyv_ker_name     GENARNAME(xpbyv)
 
-// Instantiate prototypes for above functions via the level-1v kernel API
-// template.
-#include "bli_l1v_ker.h"
+// Instantiate prototypes for above functions using the pre-defined level-1v
+// kernel prototype-generating macros.
+
+INSERT_PROTMAC_BASIC0( ADDV_KER_PROT,     addv_ker_name )
+INSERT_PROTMAC_BASIC0( AMAXV_KER_PROT,    amaxv_ker_name )
+INSERT_PROTMAC_BASIC0( AXPBYV_KER_PROT,   axpbyv_ker_name )
+INSERT_PROTMAC_BASIC0( AXPYV_KER_PROT,    axpyv_ker_name )
+INSERT_PROTMAC_BASIC0( COPYV_KER_PROT,    copyv_ker_name )
+INSERT_PROTMAC_BASIC0( DOTV_KER_PROT,     dotv_ker_name )
+INSERT_PROTMAC_BASIC0( DOTXV_KER_PROT,    dotxv_ker_name )
+INSERT_PROTMAC_BASIC0( INVERTV_KER_PROT,  invertv_ker_name )
+INSERT_PROTMAC_BASIC0( INVSCALV_KER_PROT, invscalv_ker_name )
+INSERT_PROTMAC_BASIC0( SCALV_KER_PROT,    scalv_ker_name )
+INSERT_PROTMAC_BASIC0( SCAL2V_KER_PROT,   scal2v_ker_name )
+INSERT_PROTMAC_BASIC0( SETV_KER_PROT,     setv_ker_name )
+INSERT_PROTMAC_BASIC0( SUBV_KER_PROT,     subv_ker_name )
+INSERT_PROTMAC_BASIC0( SWAPV_KER_PROT,    swapv_ker_name )
+INSERT_PROTMAC_BASIC0( XPBYV_KER_PROT,    xpbyv_ker_name )
+
+
 
 // -- Macros to help concisely instantiate bli_func_init() ---------------------
 
diff --git a/ref_kernels/ind/bli_gemm1m_ref.c b/ref_kernels/ind/bli_gemm1m_ref.c
index 7dd55e358..424b9c44f 100644
--- a/ref_kernels/ind/bli_gemm1m_ref.c
+++ b/ref_kernels/ind/bli_gemm1m_ref.c
@@ -42,20 +42,25 @@ void PASTEMAC3(ch,opname,arch,suf) \
              dim_t      m, \
              dim_t      n, \
              dim_t      k, \
-       const ctype*     alpha, \
-       const ctype*     a, \
-       const ctype*     b, \
-       const ctype*     beta, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
+       const void*      alpha0, \
+       const void*      a0, \
+       const void*      b0, \
+       const void*      beta0, \
+             void*      c0, inc_t rs_c, inc_t cs_c, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
+	const ctype*      alpha     = alpha0; \
+	const ctype*      a         = a0; \
+	const ctype*      b         = b0; \
+	const ctype*      beta      = beta0; \
+	      ctype*      c         = c0; \
+\
 	const num_t       dt        = PASTEMAC(ch,type); \
 	const num_t       dt_r      = PASTEMAC(chr,type); \
 \
-	PASTECH(chr,gemm_ukr_ft) \
-	                  rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
+	      gemm_ukr_ft rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
 	const bool        col_pref  = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
 	const bool        row_pref  = !col_pref; \
 \
diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
index dafc14f2c..903173e9d 100644
--- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c
+++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
@@ -42,24 +42,29 @@ void PASTEMAC3(ch,opname,arch,suf) \
              dim_t      m, \
              dim_t      n, \
              dim_t      k, \
-       const ctype*     alpha, \
-       const ctype*     a1x, \
-       const ctype*     a11, \
-       const ctype*     bx1, \
-             ctype*     b11, \
-             ctype*     c11, inc_t rs_c, inc_t cs_c, \
+       const void*      alpha0, \
+       const void*      a1x0, \
+       const void*      a110, \
+       const void*      bx10, \
+             void*      b110, \
+             void*      c110, inc_t rs_c, inc_t cs_c, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
+	const ctype*      alpha       = alpha0; \
+	const ctype*      a1x         = a1x0; \
+	const ctype*      a11         = a110; \
+	const ctype*      bx1         = bx10; \
+	      ctype*      b11         = b110; \
+	      ctype*      c11         = c110; \
+\
 	const num_t       dt          = PASTEMAC(ch,type); \
 	const num_t       dt_r        = PASTEMAC(chr,type); \
 \
-	PASTECH(chr,gemm_ukr_ft) \
-	                  rgemm_ukr   = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
+	      gemm_ukr_ft rgemm_ukr   = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
 \
-	PASTECH(ch,trsm_ukr_ft) \
-	                ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \
+	      trsm_ukr_ft ctrsm_vukr  = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \
 \
 	const bool        col_pref_r  = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
 \
@@ -265,7 +270,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 	/* b11 = inv(a11) * b11;
 	   c11 = b11; */ \
-	ctrsm_vir_ukr \
+	ctrsm_vukr \
 	( \
 	  a11, \
 	  b11, \
diff --git a/ref_kernels/ind/bli_trsm1m_ref.c b/ref_kernels/ind/bli_trsm1m_ref.c
index 9f2e20ffe..caff4688d 100644
--- a/ref_kernels/ind/bli_trsm1m_ref.c
+++ b/ref_kernels/ind/bli_trsm1m_ref.c
@@ -40,13 +40,17 @@
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       const ctype*     a, \
-             ctype*     b, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
+       const void*      a0, \
+             void*      b0, \
+             void*      c0, inc_t rs_c, inc_t cs_c, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
+	const ctype*      a      = a0; \
+	      ctype*      b      = b0; \
+	      ctype*      c      = c0; \
+\
 	const num_t       dt     = PASTEMAC(ch,type); \
 	const num_t       dt_r   = PASTEMAC(chr,type); \
 \
@@ -254,13 +258,17 @@ INSERT_GENTFUNCCO_BASIC3( trsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalri
 \
 void PASTEMAC3(ch,opname,arch,suf) \
      ( \
-       const ctype*     a, \
-             ctype*     b, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
+       const void*      a0, \
+             void*      b0, \
+             void*      c0, inc_t rs_c, inc_t cs_c, \
              auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
+	const ctype*      a      = a0; \
+	      ctype*      b      = b0; \
+	      ctype*      c      = c0; \
+\
 	const num_t       dt     = PASTEMAC(ch,type); \
 	const num_t       dt_r   = PASTEMAC(chr,type); \
 \
diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c
index b61140743..189ada459 100644
--- a/sandbox/gemmlike/bls_gemm_bp_var1.c
+++ b/sandbox/gemmlike/bls_gemm_bp_var1.c
@@ -151,8 +151,7 @@ void PASTECH2(bls_,ch,varname) \
 \
 	/* Query the context for the microkernel address and cast it to its
 	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-               gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	gemm_ukr_ft gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
 \
 	/* Compute partitioning step values for each matrix of each loop. */ \
 	const inc_t jcstep_c = cs_c; \
diff --git a/sandbox/gemmlike/bls_packm_cxk.c b/sandbox/gemmlike/bls_packm_cxk.c
index 2ed178c65..75dc66649 100644
--- a/sandbox/gemmlike/bls_packm_cxk.c
+++ b/sandbox/gemmlike/bls_packm_cxk.c
@@ -57,13 +57,11 @@ void PASTECH2(bls_,ch,opname) \
 	num_t dt     = PASTEMAC(ch,type); \
 	ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \
 	                                           : BLIS_PACKM_MRXK_KER; \
-\
-	PASTECH2(ch,opname,_ker_ft) f; \
 \
 	/* Query the context for the packm kernel corresponding to the current
 	   panel dimension, or kernel id. If the id is invalid, the function will
 	   return NULL. */ \
-	f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
+	packm_cxk_ker_ft f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
 \
 	/* If there exists a kernel implementation for the micro-panel dimension
 	   provided, we invoke the implementation. Otherwise, we use scal2m. */ \

From 60f36347c16e6336215cd52b4e5f3c0f96e7c253 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 22 Feb 2023 20:37:30 -0600
Subject: [PATCH 137/230] Fixed bugs in scal2v ref kernel when alpha == 1.
 (#728)

Details:
- Fixed a typo in ref_kernels/1/bli_scal2v_ref.c where the conditional
  that was supposed to check whether alpha is equal to 1.0 (so that
  copyv could be used instead of scal2v) was instead erroneously
  comparing alpha against 0.0.
- Fixed another bug in the same function whereby BLIS_NO_CONJUGATE was
  erroneously being passed into copyv instead of the kernel's conjx
  parameter. This second bug was inert, however, due to the first bug
  since the "alpha == 0.0" case was already being handled, resulting in
  the code block never executing.
---
 ref_kernels/1/bli_scal2v_ref.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ref_kernels/1/bli_scal2v_ref.c b/ref_kernels/1/bli_scal2v_ref.c
index 6aa519e97..1ac66be07 100644
--- a/ref_kernels/1/bli_scal2v_ref.c
+++ b/ref_kernels/1/bli_scal2v_ref.c
@@ -73,7 +73,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		); \
 		return; \
 	} \
-	else if ( PASTEMAC(ch,eq0)( *alpha ) ) \
+	else if ( PASTEMAC(ch,eq1)( *alpha ) ) \
 	{ \
 		/* If alpha is one, use copyv. */ \
 \
@@ -83,7 +83,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 		copyv_p \
 		( \
-		  BLIS_NO_CONJUGATE, \
+		  conjx, \
 		  n, \
 		  x0, incx, \
 		  y0, incy, \

From 72c37eb80f964b7840377076e5009aec5b29d320 Mon Sep 17 00:00:00 2001
From: Lee Killough <15950023+leekillough@users.noreply.github.com>
Date: Thu, 23 Mar 2023 16:01:55 -0500
Subject: [PATCH 138/230] Updated configure to pass all shellcheck checks.
 (#729)

Details:
- Modified configure so that it passes all 'shellcheck' checks,
  disabling those that we violate but that are merely stylistic or
  that concern special cases in our code.
- Miscellaneous other minor changes, such as rearranged redirections in
  long sed/perl pipes to look more natural.
- Whitespace tweaks.
---
 configure | 892 ++++++++++++++++++++++++------------------------------
 1 file changed, 395 insertions(+), 497 deletions(-)

diff --git a/configure b/configure
index d45b0ba9d..a953c25c5 100755
--- a/configure
+++ b/configure
@@ -32,6 +32,7 @@
 #  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 #
+# shellcheck disable=2001,2249,2034,2154,2181,2312,2250,2292
 
 #
 # -- Helper functions ----------------------------------------------------------
@@ -42,11 +43,11 @@ print_usage()
 	# Use the version string in the 'version' file since we don't have
 	# the patched version string yet.
 	if [ -z "${version}" ]; then
-		version=$(cat "${version_filepath}")
+		version=$(<"${version_filepath}")
 	fi
 
 	# Echo usage info.
- 	cat <<EOF
+	cat <<EOF
 
  ${script_name} (BLIS ${version})
 
@@ -562,7 +563,7 @@ pass_config_kernel_registries()
 
 		# We've stripped out leading whitespace and trailing comments. If
 		# the line is now empty, then we can skip it altogether.
-		if [ "x${curline}" = "x" ]; then
+		if [[ -z ${curline} ]]; then
 			continue;
 		fi
 
@@ -572,12 +573,13 @@ pass_config_kernel_registries()
 
 		# If we encounter a slash, it means the name of the configuration
 		# and the kernel set needed by that configuration are different.
-		if [[ "${list}" == *[/]* ]]; then
+		if [[ ${list} = */* ]]; then
 
 			#echo "Slash found."
 			klist=""
 			clist=""
-			for item in "${list}"; do
+
+			for item in ${list}; do
 
 				# The sub-configuration name is always the first sub-word in
 				# the slash-separated compound word.
@@ -590,7 +592,7 @@ pass_config_kernel_registries()
 
 				# Replace the slashes with spaces to transform the string
 				# into a space-separated list of kernel names.
-				kernels=$(echo -e ${kernels} | sed -e "s/\// /g")
+				kernels=$(echo -e "${kernels}" | sed -e "s/\// /g")
 
 				clist="${clist} ${config}"
 				klist="${klist} ${kernels}"
@@ -615,7 +617,7 @@ pass_config_kernel_registries()
 		#   to their respective registries, as appropriate.
 
 		# Handle singleton and umbrella configuration entries separately.
-		if [ $(is_singleton_family "${cname}" "${clist}") == "true" ]; then
+		if [[ $(is_singleton_family "${cname}" "${clist}") == "true" ]]; then
 
 			# Singleton configurations/families.
 			# Note: for singleton families, clist contains one item, which
@@ -625,7 +627,7 @@ pass_config_kernel_registries()
 			# Only consider updating the indirect blacklist (pass 0) or
 			# committing clist and klist to the registries (pass 1) if the
 			# configuration name (cname) is not blacklisted.
-			if [ $(is_in_list "${cname}" "${all_blist}") == "false" ]; then
+			if [[ $(is_in_list "${cname}" "${all_blist}") == "false" ]]; then
 
 				if [ "${passnum}" == "0" ]; then
 					# Even if the cname isn't blacklisted, one of the requisite
@@ -637,7 +639,7 @@ pass_config_kernel_registries()
 					# system architecture.) Thus, we add cname to the indirect
 					# blacklist.
 					for item in ${klist}; do
-						if [ $(is_in_list "${item}" "${config_blist}") == "true" ]; then
+						if [[ $(is_in_list "${item}" "${config_blist}") == "true" ]]; then
 							indirect_blist="${indirect_blist} ${cname}"
 							break
 						fi
@@ -664,7 +666,7 @@ pass_config_kernel_registries()
 
 			# First we check cname, which should generally not be blacklisted
 			# for umbrella families, but we check anyway just to be safe.
-			if [ $(is_in_list "${cname}" "${all_blist}") == "false" ]; then
+			if [[ $(is_in_list "${cname}" "${all_blist}") == "false" ]]; then
 
 				if [ "${passnum}" == "1" ]; then
 
@@ -673,7 +675,7 @@ pass_config_kernel_registries()
 					# omit it from clist and klist.
 					for item in ${clist}; do
 
-						if [ $(is_in_list "${item}" "${all_blist}") == "true" ]; then
+						if [[ $(is_in_list "${item}" "${all_blist}") == "true" ]]; then
 							clist=$(remove_from_list "${item}" "${clist}")
 							klist=$(remove_from_list "${item}" "${klist}")
 						fi
@@ -695,7 +697,7 @@ pass_config_kernel_registries()
 
 	if [ "${passnum}" == "0" ]; then
 		# Assign the final indirect blacklist (with whitespace removed).
-		indirect_blist="$(canonicalize_ws ${indirect_blist})"
+		indirect_blist=$(canonicalize_ws "${indirect_blist}")
 	fi
 }
 
@@ -736,11 +738,11 @@ read_registry_file()
 
 			config=${cr_var##config_registry_}
 
-			clist=$(query_array "config_registry" ${config})
+			clist=$(query_array "config_registry" "${config}")
 
 			# The entries that define singleton families should never need
 			# any substitution.
-			if [ $(is_singleton_family "${config}" "${clist}") == "true" ]; then
+			if [[ $(is_singleton_family "${config}" "${clist}") == "true" ]]; then
 				continue
 			fi
 
@@ -749,7 +751,7 @@ read_registry_file()
 			for mem in ${clist}; do
 
 				#mems_mem="${config_registry[${mem}]}"
-				mems_mem=$(query_array "config_registry" ${mem})
+				mems_mem=$(query_array "config_registry" "${mem}")
 
 				# If mems_mem is empty string, then mem was not found as a key
 				# in the config list associative array. In that case, we continue
@@ -762,7 +764,7 @@ read_registry_file()
 				if [ "${mem}" != "${mems_mem}" ]; then
 
 					#clist="${config_registry[$config]}"
-					clisttmp=$(query_array "config_registry" ${config})
+					clisttmp=$(query_array "config_registry" "${config}")
 
 					# Replace the current config with its constituent config set,
 					# canonicalize whitespace, and then remove duplicate config
@@ -789,7 +791,7 @@ read_registry_file()
 					# but only if the config (mem) value is NOT present
 					# in the list of sub-configs. If it is present, then further
 					# substitution may not necessarily be needed this round.
-					if [ $(is_in_list "${mem}" "${mems_mem}") == "false" ]; then
+					if [[ $(is_in_list "${mem}" "${mems_mem}") == "false" ]]; then
 						iterate_again="1"
 					fi
 				fi
@@ -814,7 +816,7 @@ read_registry_file()
 
 			config=${kr_var##kernel_registry_}
 
-			klist=$(query_array "kernel_registry" ${config})
+			klist=$(query_array "kernel_registry" "${config}")
 
 			# The entries that define singleton families should never need
 			# any substitution. In the kernel registry, we know it's a
@@ -822,7 +824,7 @@ read_registry_file()
 			# (This is slightly different than the same test in the config
 			# registry, where we test that clist is one word and that
 			# clist == cname.)
-			if [ $(is_in_list "${config}" "${klist}") == "true" ]; then
+			if [[ $(is_in_list "${config}" "${klist}") == "true" ]]; then
 				#echo "debug: '${config}' not found in '${klist}'; skipping."
 				continue
 			fi
@@ -832,7 +834,7 @@ read_registry_file()
 			for ker in ${klist}; do
 
 				#kers_ker="${kernel_registry[${ker}]}"
-				kers_ker=$(query_array "kernel_registry" ${ker})
+				kers_ker=$(query_array "kernel_registry" "${ker}")
 
 				# If kers_ker is empty string, then ker was not found as a key
 				# in the kernel registry. While not common, this can happen
@@ -853,7 +855,7 @@ read_registry_file()
 				if [ "${ker}" != "${kers_ker}" ]; then
 
 					#klisttmp="${kernel_registry[$config]}"
-					klisttmp=$(query_array "kernel_registry" ${config})
+					klisttmp=$(query_array "kernel_registry" "${config}")
 
 					# Replace the current config with its requisite kernels,
 					# canonicalize whitespace, and then remove duplicate kernel
@@ -880,7 +882,7 @@ read_registry_file()
 					# unless we just substituted using a singleton family
 					# definition, in which case we don't necessarily need to
 					# iterate further this round.
-					if [ $(is_in_list "${ker}" "${kers_ker}") == "false" ]; then
+					if [[ $(is_in_list "${ker}" "${kers_ker}") == "false" ]]; then
 						iterate_again="1"
 					fi
 				fi
@@ -916,13 +918,13 @@ build_kconfig_registry()
 	familyname="$1"
 
 	#clist="${config_registry[${familyname}]}"
-	clist=$(query_array "config_registry" ${familyname})
+	clist=$(query_array "config_registry" "${familyname}")
 
 	for config in ${clist}; do
 
 		# Look up the kernels for the current sub-configuration.
 		#kernels="${kernel_registry[${config}]}"
-		kernels=$(query_array "kernel_registry" ${config})
+		kernels=$(query_array "kernel_registry" "${config}")
 
 		for kernel in ${kernels}; do
 
@@ -931,7 +933,7 @@ build_kconfig_registry()
 
 			# Query the current sub-configs for the current ${kernel}.
 			#cur_configs="${kconfig_registry[${kernel}]}"
-			cur_configs=$(query_array "kconfig_registry" ${kernel})
+			cur_configs=$(query_array "kconfig_registry" "${kernel}")
 
 			# Add the current sub-configuration to the list of sub-configs
 			# we just queried.
@@ -995,7 +997,7 @@ is_singleton_family()
 
 	rval="false"
 
-	if [ $(is_singleton "${memberlist}") ]; then
+	if [[ -n $(is_singleton "${memberlist}") ]]; then
 
 		if [ "${memberlist}" == "${familyname}" ]; then
 			rval="true"
@@ -1016,7 +1018,7 @@ remove_from_list()
 	for item in ${list}; do
 
 		# Filter out any list item that matches any of the strike words.
-		if [ $(is_in_list "${item}" "${strike_words}") == "false" ]; then
+		if [[ $(is_in_list "${item}" "${strike_words}") == "false" ]]; then
 			flist="${flist} ${item}"
 		fi
 	done
@@ -1171,7 +1173,7 @@ auto_detect()
 		filename=${pair#*:}
 		rootdir=${pair%:*}
 
-		filepath=$(find ${dist_path}/${rootdir} -name "${filename}")
+		filepath=$(find "${dist_path}/${rootdir}" -name "${filename}")
 		c_src_filepaths="${c_src_filepaths} ${filepath}"
 	done
 
@@ -1197,7 +1199,7 @@ auto_detect()
 		filename=${pair#*:}
 		rootdir=${pair%:*}
 
-		filepath=$(find ${dist_path}/${rootdir} -name "${filename}")
+		filepath=$(find "${dist_path}/${rootdir}" -name "${filename}")
 		path=${filepath%/*}
 		c_hdr_paths="${c_hdr_paths} -I${path}"
 	done
@@ -1206,9 +1208,9 @@ auto_detect()
 	autodetect_x="auto-detect.x"
 
 	# Create #defines for all of the BLIS_CONFIG_ macros in bli_cpuid.c.
-	bli_cpuid_c_filepath=$(find ${dist_path}/frame -name "bli_cpuid.c")
-	config_defines=$(grep BLIS_CONFIG_ ${bli_cpuid_c_filepath} \
-	                 | sed -e 's/#ifdef /-D/g')
+	bli_cpuid_c_filepath=$(find "${dist_path}/frame" -name "bli_cpuid.c")
+	config_defines=$(grep BLIS_CONFIG_ "${bli_cpuid_c_filepath}" \
+	                 | sed -Ee 's/#ifdef[[:space:]]+/-D/g')
 
 	# Set the linker flags. We typically need pthreads (or BLIS's homerolled
 	# equiavlent) because it is needed for parts of bli_arch.c unrelated to
@@ -1240,6 +1242,7 @@ auto_detect()
 	if [ "${debug_auto_detect}" == "no" ]; then
 
 		# Execute the compilation command.
+		# shellcheck disable=2086
 		eval ${cmd}
 
 	else
@@ -1252,10 +1255,10 @@ auto_detect()
 	fi
 
 	# Run the auto-detect program.
-	detected_config=$(./${autodetect_x})
+	detected_config=$("./${autodetect_x}")
 
 	# Remove the executable file.
-	rm -f ./${autodetect_x}
+	rm -f "./${autodetect_x}"
 
 	# Return the detected sub-configuration name.
 	echo "${detected_config}"
@@ -1267,7 +1270,7 @@ has_libmemkind()
 
 	# Path to libmemkind detection source file.
 	main_c="libmemkind_detect.c"
-	main_c_filepath=$(find ${dist_path}/build -name "${main_c}")
+	main_c_filepath=$(find "${dist_path}/build" -name "${main_c}")
 
 	# Add libmemkind to LDFLAGS.
 	LDFLAGS_mk="${LDFLAGS} -lmemkind"
@@ -1277,7 +1280,8 @@ has_libmemkind()
 
 	# Attempt to compile a simple main() program that contains a call
 	# to hbw_malloc() and that links to libmemkind.
-	${found_cc} -o ${binname} ${main_c_filepath} ${LDFLAGS_mk} 2> /dev/null
+	# shellcheck disable=2086
+	"${found_cc}" -o "${binname}" "${main_c_filepath}" ${LDFLAGS_mk} 2> /dev/null
 
 	# Depending on the return code from the compile step above, we set
 	# enable_memkind accordingly.
@@ -1288,7 +1292,7 @@ has_libmemkind()
 	fi
 
 	# Remove the executable generated above.
-	rm -f ./${binname}
+	rm -f "./${binname}"
 
 	echo "${rval}"
 }
@@ -1299,15 +1303,15 @@ has_pragma_omp_simd()
 
 	# Path to omp-simd detection source file.
 	main_c="omp_simd_detect.c"
-	main_c_filepath=$(find ${dist_path}/build -name "${main_c}")
+	main_c_filepath=$(find "${dist_path}/build" -name "${main_c}")
 
 	# Binary executable filename.
 	binname="omp_simd-detect.x"
 
 	# Attempt to compile a simple main() program that contains a
 	# #pragma omp simd.
-	${found_cc} -std=c99 -O3 -march=native -fopenmp-simd \
-	            -o ${binname} ${main_c_filepath} 2> /dev/null
+	"${found_cc}" -std=c99 -O3 -march=native -fopenmp-simd \
+	            -o "${binname}" "${main_c_filepath}" 2> /dev/null
 
 	# Depending on the return code from the compile step above, we set
 	# enable_memkind accordingly.
@@ -1318,7 +1322,7 @@ has_pragma_omp_simd()
 	fi
 
 	# Remove the executable generated above.
-	rm -f ./${binname}
+	rm -f "./${binname}"
 
 	echo "${rval}"
 }
@@ -1337,7 +1341,7 @@ blacklistcc_add()
 {
 	# Check whether we've already blacklisted the given sub-config so
 	# we don't output redundant messages.
-	if [ $(is_in_list "$1" "${config_blist}") == "false" ]; then
+	if [[ $(is_in_list "$1" "${config_blist}") == "false" ]]; then
 
 		echowarn "${cc_vendor} ${cc_version} does not support '$1'; adding to blacklist."
 		config_blist="${config_blist} $1"
@@ -1348,7 +1352,7 @@ blacklistbu_add()
 {
 	# Check whether we've already blacklisted the given sub-config so
 	# we don't output redundant messages.
-	if [ $(is_in_list "$1" "${config_blist}") == "false" ]; then
+	if [[ $(is_in_list "$1" "${config_blist}") == "false" ]]; then
 
 		echowarn "assembler ('as' ${bu_version}) does not support '$1'; adding to blacklist."
 		config_blist="${config_blist} $1"
@@ -1359,7 +1363,7 @@ blacklistos_add()
 {
 	# Check whether we've already blacklisted the given sub-config so
 	# we don't output redundant messages.
-	if [ $(is_in_list "$1" "${config_blist}") == "false" ]; then
+	if [[ $(is_in_list "$1" "${config_blist}") == "false" ]]; then
 
 		echowarn "The operating system does not support building '$1'; adding to blacklist."
 		config_blist="${config_blist} $1"
@@ -1411,7 +1415,8 @@ get_binutils_version()
 	# Query the binutils version number.
 	# The last part ({ read first rest ; echo $first ; }) is a workaround
 	# to OS X's egrep only returning the first match.
-	bu_version=$(echo "${bu_string}" | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' | { read first rest ; echo ${first} ; })
+	bu_version=$(echo "${bu_string}" | grep -oE '[0-9]+\.[0-9]+\.?[0-9]*' |
+	             { read -r first rest ; echo "${first}"; })
 
 	# Parse the version number into its major, minor, and revision
 	# components.
@@ -1477,20 +1482,20 @@ check_python()
 	#
 
 	# Python 1.x is unsupported.
-	if [ ${python_major} -eq 1 ]; then
+	if [[ ${python_major} -eq 1 ]]; then
 		echoerr_unsupportedpython
 	fi
 
 	# Python 2.6.x or older is unsupported.
-	if [ ${python_major} -eq 2 ]; then
-		if [ ${python_minor} -lt 7 ]; then
+	if [[ ${python_major} -eq 2 ]]; then
+		if [[ ${python_minor} -lt 7 ]]; then
 			echoerr_unsupportedpython
 		fi
 	fi
 
 	# Python 3.3.x or older is unsupported.
-	if [ ${python_major} -eq 3 ]; then
-		if [ ${python_minor} -lt 4 ]; then
+	if [[ ${python_major} -eq 3 ]]; then
+		if [[ ${python_minor} -lt 4 ]]; then
 			echoerr_unsupportedpython
 		fi
 	fi
@@ -1515,19 +1520,19 @@ get_compiler_version()
 	# isolate the version number.
 	# The last part ({ read first rest ; echo $first ; }) is a workaround
 	# to OS X's egrep only returning the first match.
-	cc_vendor=$(echo "${vendor_string}" | egrep -o 'icc|gcc|clang|emcc|pnacl|IBM|oneAPI|crosstool-NG|GCC' | { read first rest ; echo $first ; })
+	cc_vendor=$(echo "${vendor_string}" |
+	            grep -oE 'icc|gcc|clang|emcc|pnacl|IBM|oneAPI|crosstool-NG|GCC' |
+	            { read -r first rest ; echo "${first}"; })
 
 	# AOCC version strings contain both "clang" and "AOCC" substrings, and
 	# so we have perform a follow-up check to make sure cc_vendor gets set
 	# correctly.
-	aocc_grep=$(echo "${vendor_string}" | grep 'AOCC')
-	if [ -n "${aocc_grep}" ]; then
+	if [[ ${vendor_string} = *AOCC* ]]; then
 		cc_vendor="aocc"
 	fi
 
 	# Detect armclang, which doesn't have a nice, unambiguous, one-word tag
-	armclang_grep=$(echo "${vendor_string}" | grep 'Arm C/C++/Fortran Compiler')
-	if [ -n "${armclang_grep}" ]; then
+	if [[ ${vendor_string} = *'Arm C/C++/Fortran Compiler'* ]]; then
 		cc_vendor="armclang"
 	fi
 
@@ -1541,8 +1546,7 @@ get_compiler_version()
 		# Treat compilers built by crosstool-NG (for eg: conda) as gcc.
 		cc_vendor="gcc"
 	fi
-	if [ "${cc_vendor}" = "icc" -o \
-	     "${cc_vendor}" = "gcc" ]; then
+	if [[ ${cc_vendor} = icc || ${cc_vendor} = gcc ]]; then
 
 		cc_version=$(${cc} -dumpversion)
 
@@ -1551,19 +1555,17 @@ get_compiler_version()
 		# Treat armclang as regular clang.
 		cc_vendor="clang"
 		cc_version=$(echo "${vendor_string}" \
-		             | egrep -o 'based on LLVM [0-9]+\.[0-9]+\.?[0-9]*' \
-		             | egrep -o               '[0-9]+\.[0-9]+\.?[0-9]*')
+		             | grep -oE 'based on LLVM [0-9]+\.[0-9]+\.?[0-9]*' \
+		             | grep -oE               '[0-9]+\.[0-9]+\.?[0-9]*')
 
 	elif [ "${cc_vendor}" = "clang" ]; then
 
 		cc_version=$(echo "${vendor_string}" \
-		             | egrep -o '(clang|LLVM) version [0-9]+\.[0-9]+\.?[0-9]*' \
-		             | egrep -o                      '[0-9]+\.[0-9]+\.?[0-9]*')
+		             | grep -oE '(clang|LLVM) version [0-9]+\.[0-9]+\.?[0-9]*' \
+		             | grep -oE                      '[0-9]+\.[0-9]+\.?[0-9]*')
 
 	elif [ "${cc_vendor}" = "aocc" ]; then
 
-		aocc_ver21=$(echo "${vendor_string}" | grep 'AOCC.LLVM.2')
-
 		# Versions 2.0 and 2.1 had different version string formats from
 		# 2.2 and later, so we have to handle them separately.
 		# Examples:
@@ -1573,15 +1575,15 @@ get_compiler_version()
 		# AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0)
 		# AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0)
 
-		if [ -n "${aocc_ver21}" ]; then
+		if [[ ${vendor_string} = *AOCC.LLVM.2* ]]; then
 
 			# Grep for the AOCC.LLVM.x.y.z substring first, and then isolate the
 			# version number. Also, the string may contain multiple instances of
 			# the version number, so only use the first occurrence.
 			cc_version=$(echo "${vendor_string}" \
-			             | egrep -o 'AOCC.LLVM.[0-9]+\.[0-9]+\.?[0-9]*' \
-			             | egrep -o           '[0-9]+\.[0-9]+\.?[0-9]*' \
-			             | { read first rest ; echo $first ; })
+			             | grep -oE 'AOCC.LLVM.[0-9]+\.[0-9]+\.?[0-9]*' \
+			             | grep -oE           '[0-9]+\.[0-9]+\.?[0-9]*' \
+			             | { read -r first rest ; echo "${first}"; })
 		else
 
 			# Grep for the AOCC_x.y.z substring first, and then isolate the
@@ -1590,9 +1592,9 @@ get_compiler_version()
 			# take only the first occurrence as a future-oriented safety
 			# measure.
 			cc_version=$(echo "${vendor_string}" \
-			             | egrep -o 'AOCC_[0-9]+\.[0-9]+\.?[0-9]*' \
-			             | egrep -o      '[0-9]+\.[0-9]+\.?[0-9]*' \
-			             | { read first rest ; echo $first ; })
+			             | grep -oE 'AOCC_[0-9]+\.[0-9]+\.?[0-9]*' \
+			             | grep -oE      '[0-9]+\.[0-9]+\.?[0-9]*' \
+			             | { read -r first rest ; echo "${first}"; })
 		fi
 
 	elif [ "${cc_vendor}" = "oneAPI" ]; then
@@ -1600,14 +1602,14 @@ get_compiler_version()
 		# Treat Intel oneAPI's clang as clang, not icc.
 		cc_vendor="clang"
 		cc_version=$(echo "${vendor_string}" \
-		             | egrep -o '[0-9]+\.[0-9]+\.[0-9]+\.?[0-9]*' \
-		             | { read first rest ; echo ${first} ; })
+		             | grep -oE '[0-9]+\.[0-9]+\.[0-9]+\.?[0-9]*' \
+		             | { read -r first rest ; echo "${first}"; })
 
 	else
 
 		cc_version=$(echo "${vendor_string}" \
-		             | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' \
-		             | { read first rest ; echo ${first} ; })
+		             | grep -oE '[0-9]+\.[0-9]+\.?[0-9]*' \
+		             | { read -r first rest ; echo "${first}"; })
 	fi
 
 	# Parse the version number into its major, minor, and revision
@@ -1622,7 +1624,7 @@ get_compiler_version()
 	# always output the major, minor, and revision numbers. Thus, if we're
 	# using gcc and its version is 7 or later, we re-query and re-parse the
 	# version string.
-	if [ "${cc_vendor}" = "gcc" -a ${cc_major} -ge 7 ]; then
+	if [[ ${cc_vendor} = "gcc" && ${cc_major} -ge 7 ]]; then
 
 		# Re-query the version number using -dumpfullversion.
 		cc_version=$(${cc} -dumpfullversion)
@@ -1689,25 +1691,25 @@ check_compiler()
 	# Fixme: check on a64fx, neoverse, and others
 
 	# gcc
-	if [ "x${cc_vendor}" = "xgcc" ]; then
+	if [[ ${cc_vendor} = gcc ]]; then
 
-		if [ ${cc_major} -lt 4 ]; then
+		if [[ ${cc_major} -lt 4 ]]; then
 			echoerr_unsupportedcc
 		fi
-		if [ ${cc_major} -eq 4 ]; then
+		if [[ ${cc_major} -eq 4 ]]; then
 			blacklistcc_add "knl"
-			if [ ${cc_minor} -lt 7 ]; then
+			if [[ ${cc_minor} -lt 7 ]]; then
 				echoerr_unsupportedcc
 			fi
-			if [ ${cc_minor} -lt 9 ]; then
+			if [[ ${cc_minor} -lt 9 ]]; then
 				blacklistcc_add "excavator"
 				blacklistcc_add "zen"
 			fi
 		fi
-		if [ ${cc_major} -lt 5 ]; then
+		if [[ ${cc_major} -lt 5 ]]; then
 			blacklistcc_add "knl"
 		fi
-		if [ ${cc_major} -lt 6 ]; then
+		if [[ ${cc_major} -lt 6 ]]; then
 			# Normally, zen would be blacklisted for gcc prior to 6.0.
 			# However, we have a workaround in place in the zen
 			# configuration's make_defs.mk file that starts with bdver4
@@ -1718,72 +1720,72 @@ check_compiler()
 			# gcc 5.x may support POWER9 but it is unverified.
 			blacklistcc_add "power9"
 		fi
-		if [ ${cc_major} -lt 10 ]; then
+		if [[ ${cc_major} -lt 10 ]]; then
 			blacklistcc_add "armsve"
 		fi
 	fi
 
 	# icc
-	if [ "x${cc_vendor}" = "xicc" ]; then
+	if [[ ${cc_vendor} = icc ]]; then
 
-		if [ ${cc_major} -lt 15 ]; then
+		if [[ ${cc_major} -lt 15 ]]; then
 			echoerr_unsupportedcc
 		fi
-		if [ ${cc_major} -eq 15 ]; then
-			if [ ${cc_revision} -lt 1 ]; then
+		if [[ ${cc_major} -eq 15 ]]; then
+			if [[ ${cc_revision} -lt 1 ]]; then
 				blacklistcc_add "skx"
 			fi
 		fi
-		if [ ${cc_major} -eq 18 ]; then
+		if [[ ${cc_major} -eq 18 ]]; then
 			echo "${script_name}: ${cc} ${cc_version} is known to cause erroneous results. See https://github.com/flame/blis/issues/371 for details."
 			blacklistcc_add "knl"
 			blacklistcc_add "skx"
 		fi
-		if [ ${cc_major} -ge 19 ]; then
+		if [[ ${cc_major} -ge 19 ]]; then
 			echo "${script_name}: ${cc} ${cc_version} is known to cause erroneous results. See https://github.com/flame/blis/issues/371 for details."
 			echoerr_unsupportedcc
 		fi
 	fi
 
 	# clang
-	if [ "x${cc_vendor}" = "xclang" ]; then
-		if [ "$(echo ${vendor_string} | grep -o Apple)" = "Apple" ]; then
-			if [ ${cc_major} -lt 5 ]; then
+	if [[ ${cc_vendor} = clang ]]; then
+		if [[ ${vendor_string} = *Apple* ]]; then
+			if [[ ${cc_major} -lt 5 ]]; then
 				echoerr_unsupportedcc
 			fi
 			# See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions
-			if [ ${cc_major} -eq 5 ]; then
+			if [[ ${cc_major} -eq 5 ]]; then
 				# Apple clang 5.0 is clang 3.4svn
 				blacklistcc_add "excavator"
 				blacklistcc_add "zen"
 			fi
-			if [ ${cc_major} -lt 7 ]; then
+			if [[ ${cc_major} -lt 7 ]]; then
 				blacklistcc_add "knl"
 				blacklistcc_add "skx"
 			fi
 		else
-			if [ ${cc_major} -lt 3 ]; then
+			if [[ ${cc_major} -lt 3 ]]; then
 				echoerr_unsupportedcc
 			fi
-			if [ ${cc_major} -eq 3 ]; then
-				if [ ${cc_minor} -lt 3 ]; then
+			if [[ ${cc_major} -eq 3 ]]; then
+				if [[ ${cc_minor} -lt 3 ]]; then
 					echoerr_unsupportedcc
 				fi
-				if [ ${cc_minor} -lt 5 ]; then
+				if [[ ${cc_minor} -lt 5 ]]; then
 					blacklistcc_add "excavator"
 					blacklistcc_add "zen"
 				fi
-				if [ ${cc_minor} -lt 9 ]; then
+				if [[ ${cc_minor} -lt 9 ]]; then
 					blacklistcc_add "knl"
 					blacklistcc_add "skx"
 				fi
 			fi
-			if [ ${cc_major} -lt 4 ]; then
+			if [[ ${cc_major} -lt 4 ]]; then
 				# See comment above regarding zen support.
 				#blacklistcc_add "zen"
 				: # explicit no-op since bash can't handle empty loop bodies.
 			fi
-			if [ ${cc_major} -lt 11 ]; then
+			if [[ ${cc_major} -lt 11 ]]; then
 				blacklistcc_add "armsve"
 			fi
 		fi
@@ -1871,24 +1873,24 @@ check_compiler_version_ranges()
 	echo "${script_name}: checking ${cc} ${cc_version} against known consequential version ranges."
 
 	# gcc
-	if [ "x${cc_vendor}" = "xgcc" ]; then
+	if [[ ${cc_vendor} = gcc ]]; then
 
 		# Check for gcc < 4.9.0 (ie: 4.8.5 or older).
-		if [ ${cc_major} -eq 4 ]; then
-			if [ ${cc_minor} -lt 9 ]; then
+		if [[ ${cc_major} -eq 4 ]]; then
+			if [[ ${cc_minor} -lt 9 ]]; then
 				echo "${script_name}: note: found ${cc} version older than 4.9.0."
 				gcc_older_than_4_9_0='yes'
 			fi
 		fi
 
 		# Check for gcc < 6.1.0 (ie: 5.5 or older).
-		if [ ${cc_major} -lt 6 ]; then
+		if [[ ${cc_major} -lt 6 ]]; then
 			echo "${script_name}: note: found ${cc} version older than 6.1."
 			gcc_older_than_6_1_0='yes'
 		fi
 
 		# Check for gcc < 9.1.0 (ie: 8.3 or older).
-		if [ ${cc_major} -lt 9 ]; then
+		if [[ ${cc_major} -lt 9 ]]; then
 			echo "${script_name}: note: found ${cc} version older than 9.1."
 			gcc_older_than_9_1_0='yes'
 		fi
@@ -1901,37 +1903,37 @@ check_compiler_version_ranges()
 	fi
 
 	# icc
-	if [ "x${cc_vendor}" = "xicc" ]; then
+	if [[ ${cc_vendor} = icc ]]; then
 		:
 	fi
 
 	# clang
-	if [ "x${cc_vendor}" = "xclang" ]; then
+	if [[ ${cc_vendor} = clang ]]; then
 
 		# Check for clang < 9.0.0.
-		if [ ${cc_major} -lt 9 ]; then
+		if [[ ${cc_major} -lt 9 ]]; then
 			echo "${script_name}: note: found ${cc} version older than 9.0."
 			clang_older_than_9_0_0='yes'
 		fi
 
 		# Check for clang < 12.0.0.
-		if [ ${cc_major} -lt 12 ]; then
+		if [[ ${cc_major} -lt 12 ]]; then
 			echo "${script_name}: note: found ${cc} version older than 12.0."
 			clang_older_than_12_0_0='yes'
 		fi
 	fi
 
 	# aocc
-	if [ "x${cc_vendor}" = "xaocc" ]; then
+	if [[ ${cc_vendor} = aocc ]]; then
 
 		# Check for aocc < 2.0.0.
-		if [ ${cc_major} -lt 2 ]; then
+		if [[ ${cc_major} -lt 2 ]]; then
 			echo "${script_name}: note: found ${cc} version older than 2.0."
 			aocc_older_than_2_0_0='yes'
 		fi
 
 		# Check for aocc < 3.0.0.
-		if [ ${cc_major} -lt 3 ]; then
+		if [[ ${cc_major} -lt 3 ]]; then
 			echo "${script_name}: note: found ${cc} version older than 3.0."
 			aocc_older_than_3_0_0='yes'
 		fi
@@ -1955,30 +1957,30 @@ check_assembler()
 	#
 	# Check support for FMA4 (amd: bulldozer).
 	#
-	asm_fp=$(find ${asm_dir} -name "fma4.s")
+	asm_fp=$(find "${asm_dir}" -name "fma4.s")
 	knows_fma4=$(try_assemble "${cc}" "${cflags}" "${asm_fp}")
 
-	if [ "x${knows_fma4}" == "xno" ]; then
+	if [[ ${knows_fma4} = no ]]; then
 		blacklistbu_add "bulldozer"
 	fi
 
 	#
 	# Check support for AVX (intel: sandybridge+, amd: piledriver+).
 	#
-	asm_fp=$(find ${asm_dir} -name "avx.s")
+	asm_fp=$(find "${asm_dir}" -name "avx.s")
 	knows_avx=$(try_assemble "${cc}" "${cflags}" "${asm_fp}")
 
-	if [ "x${knows_avx}" == "xno" ]; then
+	if [[ ${knows_avx} = no ]]; then
 		blacklistbu_add "sandybridge"
 	fi
 
 	#
 	# Check support for FMA3 (intel: haswell+, amd: piledriver+).
 	#
-	asm_fp=$(find ${asm_dir} -name "fma3.s")
+	asm_fp=$(find "${asm_dir}" -name "fma3.s")
 	knows_fma3=$(try_assemble "${cc}" "${cflags}" "${asm_fp}")
 
-	if [ "x${knows_fma3}" == "xno" ]; then
+	if [[ ${knows_fma3} = no ]]; then
 		blacklistbu_add "haswell"
 		blacklistbu_add "piledriver"
 		blacklistbu_add "steamroller"
@@ -1995,10 +1997,10 @@ check_assembler()
 		cflags="-march=knl"
 	fi
 
-	asm_fp=$(find ${asm_dir} -name "avx512f.s")
+	asm_fp=$(find "${asm_dir}" -name "avx512f.s")
 	knows_avx512f=$(try_assemble "${cc}" "${cflags}" "${asm_fp}")
 
-	if [ "x${knows_avx512f}" == "xno" ]; then
+	if [[ ${knows_avx512f} = no ]]; then
 		blacklistbu_add "knl"
 		blacklistbu_add "skx"
 	fi
@@ -2012,10 +2014,10 @@ check_assembler()
 		cflags="-march=skylake-avx512"
 	fi
 
-	asm_fp=$(find ${asm_dir} -name "avx512dq.s")
+	asm_fp=$(find "${asm_dir}" -name "avx512dq.s")
 	knows_avx512dq=$(try_assemble "${cc}" "${cflags}" "${asm_fp}")
 
-	if [ "x${knows_avx512dq}" == "xno" ]; then
+	if [[ ${knows_avx512dq} = no ]]; then
 		blacklistbu_add "skx"
 	fi
 }
@@ -2042,7 +2044,8 @@ try_assemble()
 	asm_bin="${asm_base}.o"
 
 	# Try to assemble the file.
-	${cc} ${cflags} -c ${asm_src} -o ${asm_bin} > /dev/null 2>&1
+	# shellcheck disable=2086
+	"${cc}" ${cflags} -c "${asm_src}" -o "${asm_bin}" > /dev/null 2>&1
 
 	if [ "$?" == 0 ]; then
 		rval='yes'
@@ -2081,14 +2084,14 @@ set_default_version()
 		# followed by a number signifying how many commits have transpired
 		# since the tag, followed by a 'g' and a shortened hash tab. Capture
 		# stderr to a file.
-		git_describe_str=$(git -C ${dist_path} describe --tags 2> ${gd_stderr})
+		git_describe_str=$(git -C "${dist_path}" describe --tags 2> "${gd_stderr}")
 
 		# Pull in whatever error message was generated, if any, and delete
 		# the file.
-		git_error=$(cat ${gd_stderr})
+		git_error=$(<"${gd_stderr}")
 
 		# Remove the stderr file.
-		rm -f ${gd_stderr}
+		rm -f "${gd_stderr}"
 
 		# If git returned an error, don't do anything.
 		if [ -n "${git_error}" ]; then
@@ -2097,13 +2100,13 @@ set_default_version()
 			echo "${script_name}: using string from unmodified version file."
 
 			# Use what's in the version file as-is.
-			version=$(cat "${version_file}")
+			version=$(<"${version_file}")
 		else
 
 			echo "${script_name}: got back ${git_describe_str}."
 
 			# Strip off the commit hash label.
-			new_version_str=$(echo ${git_describe_str} | cut -d- -f-2)
+			new_version_str=$(echo "${git_describe_str}" | cut -d- -f-2)
 
 			echo "${script_name}: truncating to ${new_version_str}."
 
@@ -2118,7 +2121,7 @@ set_default_version()
 		echo "${script_name}: could not find '${gitdir}' directory; using unmodified version file."
 
 		# Use what's in the version file as-is.
-		version=$(cat "${version_file}")
+		version=$(<"${version_file}")
 	fi
 }
 
@@ -2270,8 +2273,7 @@ get_tool_checkflags()
 		# If we are on Darwin/OSX/BSD or something else, we sometimes skip flag
 		# checks. (Note that when the list of flags to check is empty, we end
 		# up testing for the existence of the tool instead.)
-		if   [ "${env_str}" = "AR" -o \
-		       "${env_str}" = "RANLIB" ]; then
+		if   [[ ${env_str} = AR || ${env_str} = RANLIB ]]; then
 
 			# AR, RANLIB may not respond to the normal flags on Darwin/OSX/BSD,
 			# so all we can really do is check for their existence.
@@ -2312,7 +2314,7 @@ check_tool()
 		for opt in ${the_flags}; do
 
 			# See if the tool responds to the current flag.
-			${tool} ${opt} > /dev/null 2>&1
+			"${tool}" "${opt}" > /dev/null 2>&1
 
 			# If the tool responded to the flag with a nominal error code of
 			# 0, we found one that works and set rval accoringly.
@@ -2327,7 +2329,7 @@ check_tool()
 		# request to instead check for the existence of the tool.
 
 		# Use 'which' to determine if the tool exists.
-		toolpath="$(which ${tool} 2> /dev/null)"
+		toolpath="$(command -v "${tool}" 2> /dev/null)"
 
 		# If the tool doesn't exist, we set rval accordingly.
 		if [ -n "${toolpath}" ]; then
@@ -2359,7 +2361,7 @@ main()
 	# The path to the script. We need this to find the top-level directory
 	# of the source distribution in the event that the user has chosen to
 	# build elsewhere.
-	dist_path=${0%/${script_name}}
+	dist_path=${0%"/${script_name}"}
 
 	# The path to the directory in which we are building. We do this to
 	# make explicit that we distinguish between the top-level directory
@@ -2472,21 +2474,25 @@ main()
 
 	# The installation exec_prefix, assigned its default value, and a flag to
 	# track whether or not it was given by the user.
+	# shellcheck disable=2016
 	exec_prefix='${prefix}'
 	exec_prefix_flag=''
 
 	# The installation libdir, assigned its default value, and a flag to
 	# track whether or not it was given by the user.
+	# shellcheck disable=2016
 	libdir='${exec_prefix}/lib'
 	libdir_flag=''
 
 	# The installation includedir, assigned its default value, and a flag to
 	# track whether or not it was given by the user.
+	# shellcheck disable=2016
 	includedir='${prefix}/include'
 	includedir_flag=''
 
 	# The installation sharedir, assigned its default value, and a flag to
 	# track whether or not it was given by the user.
+	# shellcheck disable=2016
 	sharedir='${prefix}/share'
 	sharedir_flag=''
 
@@ -2583,7 +2589,7 @@ main()
 	# -- Command line option/argument parsing ----------------------------------
 
 	found=true
-	while $found = true; do
+	while [[ $found = true ]]; do
 
 		# Process our command line options.
 		unset OPTIND
@@ -2869,17 +2875,16 @@ main()
 					;;
 			esac
 		done
-		shift $(($OPTIND - 1))
+		shift $((OPTIND - 1))
 
 		# Parse environment variables
 		found=false
 		while [ $# -gt 0 ]; do
 			case $1 in
 				*=*)
-					var=`expr "$1" : '\([^=]*\)='`
-					value=`expr "$1" : '[^=]*=\(.*\)'`
-					eval $var=\$value
-					export $var
+					var=$(expr "$1" : '\([^=]*\)=')
+					value=$(expr "$1" : '[^=]*=\(.*\)')
+					eval "export $var=\$value"
 					shift
 					found=true
 					;;
@@ -2914,6 +2919,7 @@ main()
 
 	# Select a python interpreter from the default list, or from PYTHON if it
 	# refers to a valid binary.
+	# shellcheck disable=2153
 	select_tool_w_env "${python_search_list}" "${PYTHON}" "PYTHON" \
 	                  "python interpreter" "yes" found_python
 
@@ -2935,11 +2941,11 @@ main()
 	                  "C compiler" "yes" found_cc
 
 	# Also check the compiler to see if we are (cross-)compiling for Windows
-	if ${found_cc} -dM -E - < /dev/null 2> /dev/null | grep -q _WIN32; then
+	if "${found_cc}" -dM -E - < /dev/null 2> /dev/null | grep -q _WIN32; then
 		is_win=yes
 	fi
 	is_msvc=no
-	if ${found_cc} -dM -E - < /dev/null 2> /dev/null | grep -q _MSC_VER; then
+	if "${found_cc}" -dM -E - < /dev/null 2> /dev/null | grep -q _MSC_VER; then
 		is_msvc=yes
 	fi
 
@@ -3043,7 +3049,7 @@ main()
 	# Read the registered configuration names and lists into associative
 	# arrays.
 	echo -n "${script_name}: reading configuration registry..."
-	read_registry_file ${registry_filepath}
+	read_registry_file "${registry_filepath}"
 	echo "done."
 
 	# Report if additional configurations needed to be blacklisted.
@@ -3068,7 +3074,7 @@ main()
 	echo "${script_name}: starting configuration of BLIS ${version}."
 
 	# Check if the user requested a custom version string.
-	if [ "x${force_version}" = "xno" ]; then
+	if [[ ${force_version} = no ]]; then
 		echo "${script_name}: configuring with official version string."
 	else
 		echo "${script_name}: configuring with custom version string '${force_version}'."
@@ -3079,11 +3085,11 @@ main()
 	# -- Acquire the shared library (.so) versions -----------------------------
 
 	# The first line of the 'so_version' file contains the .so major version.
-	so_version_major=$(cat ${so_version_filepath} | sed -n "1p")
+	so_version_major=$(sed -n "1p" < "${so_version_filepath}")
 
 	# The second line contains the minor and build .so version numbers
 	# (separated by a '.').
-	so_version_minorbuild=$(cat ${so_version_filepath} | sed -n "2p")
+	so_version_minorbuild=$(sed -n "2p" < "${so_version_filepath}")
 
 	echo "${script_name}: found shared library .so version '${so_version_major}.${so_version_minorbuild}'."
 	echo "${script_name}:   .so major version: ${so_version_major}"
@@ -3124,7 +3130,7 @@ main()
 
 	fi
 
-	if [ $1 = "auto" ]; then
+	if [[ $1 = auto ]]; then
 
 		echo "${script_name}: automatic configuration requested."
 
@@ -3168,8 +3174,8 @@ main()
 	# and kernels associated with that name.
 	#config_list=${config_registry[${config_name}]}
 	#kernel_list=${kernel_registry[${config_name}]}
-	config_list=$(query_array "config_registry" ${config_name})
-	kernel_list=$(query_array "kernel_registry" ${config_name})
+	config_list=$(query_array "config_registry" "${config_name}")
+	kernel_list=$(query_array "kernel_registry" "${config_name}")
 
 	# Use the config_registry and kernel_registry to build a kconfig_registry
 	# for the selected config_name.
@@ -3222,13 +3228,14 @@ main()
 
 	# We use a sorted version of kernel_list so that it ends up matching the
 	# display order of the kconfig_registry above.
+	# shellcheck disable=2086
 	kernel_list_sort=$(echo ${kernel_list} | xargs -n1 | sort -u)
 
 	kconfig_map=""
 	for kernel in ${kernel_list_sort}; do
 
 		#configs="${kconfig_registry[$kernel]}"
-		configs=$(query_array "kconfig_registry" ${kernel})
+		configs=$(query_array "kconfig_registry" "${kernel}")
 
 		has_one_kernel=$(is_singleton "${configs}")
 		contains_kernel=$(is_in_list "${kernel}" "${configs}")
@@ -3273,7 +3280,7 @@ main()
 
 		# NOTE: This branch should never execute when using auto-detection,
 		# but we have it here just in case.
-		if [ $1 = "auto" ]; then
+		if [[ $1 = auto ]]; then
 
 			echo "${script_name}: 'auto-detected configuration '${config_name}' is NOT registered!"
 			echo "${script_name}: "
@@ -3290,7 +3297,7 @@ main()
 			# subconfig implied by config_name is blacklisted. Thus, we cannot
 			# proceed.
 
-			if [ $(is_in_list "${config_name}" "${config_blist}") == "true" ]; then
+			if [[ $(is_in_list "${config_name}" "${config_blist}") = true ]]; then
 
 				echo "${script_name}: 'user-specified configuration '${config_name}' is blacklisted!"
 				echo "${script_name}: "
@@ -3333,7 +3340,7 @@ main()
 
 		# First confirm that the current configuration is registered.
 		#this_clist=${config_registry[${conf}]}
-		this_clist=$(query_array "config_registry" ${conf})
+		this_clist=$(query_array "config_registry" "${conf}")
 
 		# If the config_list associated with conf is empty, then it was
 		# never entered into the config_registry to begin with. Thus,
@@ -3430,11 +3437,11 @@ main()
 
 	# Echo the installation directories that we settled on.
 	echo "${script_name}: final installation directories:"
-	echo "${script_name}:   prefix:      "${prefix}
-	echo "${script_name}:   exec_prefix: "${exec_prefix}
-	echo "${script_name}:   libdir:      "${libdir}
-	echo "${script_name}:   includedir:  "${includedir}
-	echo "${script_name}:   sharedir:    "${sharedir}
+	echo "${script_name}:   prefix:      $(eval echo "${prefix}")"
+	echo "${script_name}:   exec_prefix: $(eval echo "${exec_prefix}")"
+	echo "${script_name}:   libdir:      $(eval echo "$(eval echo "${libdir}")")"
+	echo "${script_name}:   includedir:  $(eval echo "${includedir}")"
+	echo "${script_name}:   sharedir:    $(eval echo "${sharedir}")"
 	echo "${script_name}: NOTE: the variables above can be overridden when running make."
 
 	# Check if CFLAGS is non-empty.
@@ -3458,24 +3465,24 @@ main()
 	fi
 
 	# Check if the verbose make flag was specified.
-	if [ "x${enable_verbose}" = "xyes" ]; then
+	if [[ ${enable_verbose} = yes ]]; then
 		echo "${script_name}: enabling verbose make output. (disable with 'make V=0'.)"
 	else
 		echo "${script_name}: disabling verbose make output. (enable with 'make V=1'.)"
 	fi
 
 	# Check if the ARG_MAX hack was requested.
-	if [ "x${enable_arg_max_hack}" = "xyes" ]; then
+	if [[ ${enable_arg_max_hack} = yes ]]; then
 		echo "${script_name}: enabling ARG_MAX hack."
 	else
 		echo "${script_name}: disabling ARG_MAX hack."
 	fi
 
 	# Check if the debug flag was specified.
-	if [ -n "${debug_flag}" ]; then
-		if [ "x${debug_type}" = "xopt" ]; then
+	if [[ -n ${debug_flag} ]]; then
+		if [[ ${debug_type} = opt ]]; then
 			echo "${script_name}: enabling debug symbols with optimizations."
-		elif [ "x${debug_type}" = "xsde" ]; then
+		elif [[ ${debug_type} = sde ]]; then
 			debug_type='sde'
 			echo "${script_name}: enabling SDE processor emulation."
 		else
@@ -3490,7 +3497,7 @@ main()
 	fi
 
 	# Check if the AddressSanitizer flag was specified.
-	if [ "x${enable_asan}" = "xyes" ]; then
+	if [[ ${enable_asan} = yes ]]; then
 		echo "${script_name}: enabling AddressSanitizer support (except for optimized kernels)."
 	else
 		enable_asan='no'
@@ -3498,13 +3505,13 @@ main()
 	fi
 
 	# Check if the static lib flag was specified.
-	if   [ "x${enable_static}" = "xyes" -a "x${enable_shared}" = "xyes" ]; then
+	if   [[ ${enable_static} = yes && ${enable_shared} = yes ]]; then
 		echo "${script_name}: building BLIS as both static and shared libraries."
 		enable_shared_01=1
-	elif [ "x${enable_static}" = "xno"  -a "x${enable_shared}" = "xyes" ]; then
+	elif [[ ${enable_static} = no && ${enable_shared} = yes ]]; then
 		echo "${script_name}: building BLIS as a shared library (static library disabled)."
 		enable_shared_01=1
-	elif [ "x${enable_static}" = "xyes" -a "x${enable_shared}" = "xno"  ]; then
+	elif [[ ${enable_static} = yes && ${enable_shared} = no ]]; then
 		echo "${script_name}: building BLIS as a static library (shared library disabled)."
 		enable_shared_01=0
 	else
@@ -3514,14 +3521,14 @@ main()
 	fi
 
 	# Check if the "export shared" flag was specified.
-	if [ "x${export_shared}" = "xall" ]; then
-		if [ "x${enable_shared}" = "xyes" ]; then
+	if [[ ${export_shared} = all ]]; then
+		if [[ ${enable_shared} = yes ]]; then
 			echo "${script_name}: exporting all symbols within shared library."
 		else
 			echo "${script_name}: ignoring request to export all symbols within shared library."
 		fi
-	elif [ "x${export_shared}" = "xpublic" ]; then
-		if [ "x${enable_shared}" = "xyes" ]; then
+	elif [[ ${export_shared} = public ]]; then
+		if [[ ${enable_shared} = yes ]]; then
 			echo "${script_name}: exporting only public symbols within shared library."
 		fi
 	else
@@ -3531,7 +3538,7 @@ main()
 	fi
 
 	# Check if we are building with or without operating system support.
-	if [ "x${enable_system}" = "xyes" ]; then
+	if [[ ${enable_system} = yes ]]; then
 		echo "${script_name}: enabling operating system support."
 		enable_system_01=1
 	else
@@ -3569,29 +3576,29 @@ main()
 	# the strings in the same order as they originally appeared.
 	for word in ${threading_model_list}; do
 
-		if [ "x${word}" = "xsingle" ] ||
-		   [ "x${word}" = "xnone"   ] ||
-		   [ "x${word}" = "xoff"    ] ||
-		   [ "x${word}" = "xno"     ]; then
+		if [[ ${word} = single ]] ||
+		   [[ ${word} = none   ]] ||
+		   [[ ${word} = off    ]] ||
+		   [[ ${word} = no     ]]; then
 
 			parsed_tm="${parsed_tm} single"
 
-		elif [ "x${word}" = "xopenmp" ] ||
-			 [ "x${word}" = "xomp"    ]; then
+		elif [[ ${word} = openmp ]] ||
+			 [[ ${word} = omp    ]]; then
 
 			parsed_tm="${parsed_tm} openmp"
 
-		elif [ "x${word}" = "xpthreads" ] ||
-			 [ "x${word}" = "xpthread"  ] ||
-			 [ "x${word}" = "xposix"    ]; then
+		elif [[ ${word} = pthreads ]] ||
+			 [[ ${word} = pthread  ]] ||
+			 [[ ${word} = posix    ]]; then
 
 			parsed_tm="${parsed_tm} pthreads"
 
-		elif [ "x${word}" = "xhpx" ]; then
+		elif [[ ${word} = hpx ]]; then
 
 			parsed_tm="${parsed_tm} hpx"
 
-		elif [ "x${word}" = "xauto" ]; then
+		elif [[ ${word} = auto ]]; then
 
 			parsed_tm="${parsed_tm} auto"
 
@@ -3665,25 +3672,25 @@ main()
 	# forward.
 	for word in ${parsed_tm}; do
 
-		if [ "x${word}" = "xsingle" ]; then
+		if [[ ${word} = single ]]; then
 
 			echo "${script_name}: enabling support for single-threading."
 			enable_single='yes'
 			enable_single_01=1
 
-		elif [ "x${word}" = "xopenmp" ]; then
+		elif [[ ${word} = openmp ]]; then
 
 			echo "${script_name}: enabling support for threading via OpenMP."
 			enable_openmp='yes'
 			enable_openmp_01=1
 
-		elif [ "x${word}" = "xpthreads" ]; then
+		elif [[ ${word} = pthreads ]]; then
 
 			echo "${script_name}: enabling support for threading via pthreads."
 			enable_pthreads='yes'
 			enable_pthreads_01=1
 
-		elif [ "x${word}" = "xhpx" ]; then
+		elif [[ ${word} = hpx ]]; then
 
 			echo "${script_name}: enabling support for threading via HPX."
 			enable_hpx='yes'
@@ -3695,28 +3702,28 @@ main()
 
 	# Define boolean variables that can easily be interpreted with #ifdef
 	# directives.
-	if [ "x${first_tm}" = "xsingle" ]; then
+	if [[ ${first_tm} = single ]]; then
 
 		enable_single_as_def_01=1
 		enable_openmp_as_def_01=0
 		enable_pthreads_as_def_01=0
 		enable_hpx_as_def_01=0
 
-	elif [ "x${first_tm}" = "xopenmp" ]; then
+	elif [[ ${first_tm} = openmp ]]; then
 
 		enable_single_as_def_01=0
 		enable_openmp_as_def_01=1
 		enable_pthreads_as_def_01=0
 		enable_hpx_as_def_01=0
 
-	elif [ "x${first_tm}" = "xpthreads" ]; then
+	elif [[ ${first_tm} = pthreads ]]; then
 
 		enable_single_as_def_01=0
 		enable_openmp_as_def_01=0
 		enable_pthreads_as_def_01=1
 		enable_hpx_as_def_01=0
 
-	elif [ "x${first_tm}" = "xhpx" ]; then
+	elif [[ ${first_tm} = hpx ]]; then
 
 		enable_single_as_def_01=0
 		enable_openmp_as_def_01=0
@@ -3728,17 +3735,17 @@ main()
 	# If OpenMP, pthreads, or HPX was enabled, given that single-threaded mode is
 	# also always enabled, remind the user which one will serve as the default
 	# (that is, absent any explicit choice at runtime).
-	if [ "x${enable_openmp}"   = "xyes" ] ||
-	   [ "x${enable_pthreads}" = "xyes" ] ||
-	   [ "x${enable_hpx}"      = "xyes" ]; then
+	if [[ ${enable_openmp}   = yes ]] ||
+	   [[ ${enable_pthreads} = yes ]] ||
+	   [[ ${enable_hpx}      = yes ]]; then
 
-		if   [ "x${first_tm}"   = "xsingle" ]; then
+		if   [[ ${first_tm}   = single ]]; then
 			echo "${script_name}: threading will default to single-threaded."
-		elif [ "x${first_tm}"   = "xopenmp" ]; then
+		elif [[ ${first_tm}   = openmp ]]; then
 			echo "${script_name}: threading will default to OpenMP."
-		elif [ "x${first_tm}"   = "xpthreads" ]; then
+		elif [[ ${first_tm}   = pthreads ]]; then
 			echo "${script_name}: threading will default to pthreads."
-		elif [ "x${first_tm}"   = "xhpx" ]; then
+		elif [[ ${first_tm}   = hpx ]]; then
 			echo "${script_name}: threading will default to HPX."
 		fi
 	fi
@@ -3754,13 +3761,13 @@ main()
 	enable_jrir_rr_01=0
 	enable_jrir_slab_01=0
 	enable_jrir_tlb_01=0
-	if   [ "x${thread_part_jrir}" = "xrr" ]; then
+	if   [[ ${thread_part_jrir} = rr ]]; then
 		echo "${script_name}: requesting round-robin (rr) work partitioning in jr and/or ir loops."
 		enable_jrir_rr_01=1
-	elif [ "x${thread_part_jrir}" = "xslab" ]; then
+	elif [[ ${thread_part_jrir} = slab ]]; then
 		echo "${script_name}: requesting slab work partitioning in jr and/or ir loops."
 		enable_jrir_slab_01=1
-	elif [ "x${thread_part_jrir}" = "xtlb" ]; then
+	elif [[ ${thread_part_jrir} = tlb ]]; then
 		echo "${script_name}: requesting tile-level load balancing (tlb) in unified jr+ir loop."
 		enable_jrir_tlb_01=1
 	else
@@ -3769,29 +3776,29 @@ main()
 	fi
 
 	# Convert 'yes' and 'no' flags to booleans.
-	if [ "x${enable_pba_pools}" = "xyes" ]; then
+	if [[ ${enable_pba_pools} = yes ]]; then
 		echo "${script_name}: internal memory pools for packing blocks are enabled."
 		enable_pba_pools_01=1
 	else
 		echo "${script_name}: internal memory pools for packing blocks are disabled."
 		enable_pba_pools_01=0
 	fi
-	if [ "x${enable_sba_pools}" = "xyes" ]; then
+	if [[ ${enable_sba_pools} = yes ]]; then
 		echo "${script_name}: internal memory pools for small blocks are enabled."
 		enable_sba_pools_01=1
 	else
 		echo "${script_name}: internal memory pools for small blocks are disabled."
 		enable_sba_pools_01=0
 	fi
-	if [ "x${enable_mem_tracing}" = "xyes" ]; then
+	if [[ ${enable_mem_tracing} = yes ]]; then
 		echo "${script_name}: memory tracing output is enabled."
 		enable_mem_tracing_01=1
 	else
 		echo "${script_name}: memory tracing output is disabled."
 		enable_mem_tracing_01=0
 	fi
-	if [ "x${has_memkind}" = "xyes" ]; then
-		if [ "x${enable_memkind}" = "x" ]; then
+	if [[ ${has_memkind} = yes ]]; then
+		if [[ -z ${enable_memkind} ]]; then
 			# If no explicit option was given for libmemkind one way or the other,
 			# we use the value returned previously by has_libmemkind(), in this
 			# case "yes", to determine the default.
@@ -3799,7 +3806,7 @@ main()
 			enable_memkind="yes"
 			enable_memkind_01=1
 		else
-			if [ "x${enable_memkind}" = "xyes" ]; then
+			if [[ ${enable_memkind} = yes ]]; then
 				echo "${script_name}: received explicit request to enable libmemkind."
 				enable_memkind="yes"
 				enable_memkind_01=1
@@ -3811,27 +3818,27 @@ main()
 		fi
 	else
 		echo "${script_name}: libmemkind not found; disabling."
-		if [ "x${enable_memkind}" = "xyes" ]; then
+		if [[ ${enable_memkind} = yes ]]; then
 			echo "${script_name}: cannot honor explicit request to enable libmemkind."
 		fi
 		enable_memkind="no"
 		enable_memkind_01=0
 	fi
-	if [ "x${pragma_omp_simd}" = "xyes" ]; then
+	if [[ ${pragma_omp_simd} = yes ]]; then
 		echo "${script_name}: compiler appears to support #pragma omp simd."
 		enable_pragma_omp_simd_01=1
 	else
 		echo "${script_name}: compiler appears to not support #pragma omp simd."
 		enable_pragma_omp_simd_01=0
 	fi
-	if [ "x${enable_blas}" = "xyes" ]; then
+	if [[ ${enable_blas} = yes ]]; then
 		echo "${script_name}: the BLAS compatibility layer is enabled."
 		enable_blas_01=1
 	else
 		echo "${script_name}: the BLAS compatibility layer is disabled."
 		enable_blas_01=0
 	fi
-	if [ "x${enable_cblas}" = "xyes" ]; then
+	if [[ ${enable_cblas} = yes ]]; then
 		echo "${script_name}: the CBLAS compatibility layer is enabled."
 		enable_cblas_01=1
 		# Force BLAS layer when CBLAS is enabled
@@ -3840,10 +3847,10 @@ main()
 		echo "${script_name}: the CBLAS compatibility layer is disabled."
 		enable_cblas_01=0
 	fi
-	if [ "x${enable_mixed_dt}" = "xyes" ]; then
+	if [[ ${enable_mixed_dt} = yes ]]; then
 		echo "${script_name}: mixed datatype support is enabled."
 
-		if [ "x${enable_mixed_dt_extra_mem}" = "xyes" ]; then
+		if [[ ${enable_mixed_dt_extra_mem} = yes ]]; then
 			echo "${script_name}: mixed datatype optimizations requiring extra memory are enabled."
 			enable_mixed_dt_extra_mem_01=1
 		else
@@ -3858,14 +3865,14 @@ main()
 		enable_mixed_dt_extra_mem_01=0
 		enable_mixed_dt_01=0
 	fi
-	if [ "x${enable_sup_handling}" = "xyes" ]; then
+	if [[ ${enable_sup_handling} = yes ]]; then
 		echo "${script_name}: sup (skinny/unpacked) matrix handling is enabled."
 		enable_sup_handling_01=1
 	else
 		echo "${script_name}: sup (skinny/unpacked) matrix handling is disabled."
 		enable_sup_handling_01=0
 	fi
-	if [ "x${enable_trsm_preinversion}" = "xyes" ]; then
+	if [[ ${enable_trsm_preinversion} = yes ]]; then
 		echo "${script_name}: trsm diagonal element pre-inversion is enabled."
 		enable_trsm_preinversion_01=1
 	else
@@ -3874,16 +3881,16 @@ main()
 	fi
 
 	# Report integer sizes.
-	if [ "x${int_type_size}" = "x32" ]; then
+	if [[ ${int_type_size} = 32 ]]; then
 		echo "${script_name}: the BLIS API integer size is 32-bit."
-	elif [ "x${int_type_size}" = "x64" ]; then
+	elif [[ ${int_type_size} = 64 ]]; then
 		echo "${script_name}: the BLIS API integer size is 64-bit."
 	else
 		echo "${script_name}: the BLIS API integer size is automatically determined."
 	fi
-	if [ "x${blas_int_type_size}" = "x32" ]; then
+	if [[ ${blas_int_type_size} = 32 ]]; then
 		echo "${script_name}: the BLAS/CBLAS API integer size is 32-bit."
-	elif [ "x${blas_int_type_size}" = "x64" ]; then
+	elif [[ ${blas_int_type_size} = 64 ]]; then
 		echo "${script_name}: the BLAS/CBLAS API integer size is 64-bit."
 	else
 		echo "${script_name}: the BLAS/CBLAS API integer size is automatically determined."
@@ -3891,23 +3898,21 @@ main()
 
 	# Disallow the simultaneous use of 64-bit integers in the BLAS and
 	# 32-bit integers in BLIS.
-	if [ "x${blas_int_type_size}" = "x64" -a "x${int_type_size}" = "x32" ]; then
+	if [[ ${blas_int_type_size} = 64 && ${int_type_size} = 32 ]]; then
 		echo "${script_name}: *** To avoid the possibility of truncation, we do not allow use of 64-bit integers in the BLAS API with 32-bit integers in BLIS. Please use a different configuration of integers."
 		exit 1
 	fi
 
 	# Check whether we should use AMD-customized versions of certain framework
 	# files.
-	if [ "x${enable_amd_frame_tweaks}" = "xyes" ]; then
+	if [[ ${enable_amd_frame_tweaks} = yes ]]; then
 
 		echo "${script_name}: AMD-specific framework files will be considered."
 		echo "${script_name}:   checking eligibility of target configuration."
 
 		# Make sure we are targeting either one of the zen subconfigs or the
 		# amd64 umbrella family.
-		uconf=$(echo ${config_name} | grep -c 'zen\|amd64')
-
-		if [[ $uconf == 0 ]]; then
+		if [[ ${config_name} != *zen* && ${config_name} != *amd64* ]]; then
 			echo "${script_name}:   target configuration '${config_name}' is not eligible."
 			echo "${script_name}:   disabling AMD-specific framework files."
 			enable_amd_frame_tweaks='no'
@@ -3971,7 +3976,7 @@ main()
 	fi
 
 	# Check the method used for returning complex numbers.
-	if [ "x${complex_return}" = "xdefault" ]; then
+	if [[ ${complex_return} = default ]]; then
 
 		# If we prevoiusly found a Fortran compiler, let's query it to see what
 		# kind of complex return type it uses (gnu or intel). The 'gnu' style
@@ -3986,16 +3991,17 @@ main()
 			# clutter.
 			# NOTE: This maybe should use merged stdout/stderr rather than only
 			# stdout. But it works for now.
-			vendor_string="$(${FC} --version 2>/dev/null)"
+			vendor_string="$(${FC} --version 2>/dev/null || :)"
 
 			# Query the compiler "vendor" (ie: the compiler's simple name).
 			# The last part ({ read first rest ; echo $first ; }) is a workaround
 			# to OS X's egrep only returning the first match.
-			fc_vendor=$(echo "${vendor_string}" | egrep -o 'IFORT|GNU' | { read first rest ; echo $first ; })
+			fc_vendor=$(echo "${vendor_string}" | grep -oE 'IFORT|GNU' |
+			            { read -r first rest ; echo "${first}"; })
 
-			if [ "x${fc_vendor}" = "xIFORT" ]; then
+			if [[ ${fc_vendor} = IFORT ]]; then
 				complex_return='intel'
-			elif [ "x${fc_vendor}" = "xGNU" ]; then
+			elif [[ ${fc_vendor} = GNU ]]; then
 				complex_return='gnu'
 			else
 				echo "${script_name}: unable to determine Fortran compiler vendor!"
@@ -4006,9 +4012,9 @@ main()
 		fi
 	fi
 
-	if [ "x${complex_return}" = "xgnu"  ]; then
+	if [[ ${complex_return} = gnu ]]; then
 		complex_return_intel01='0'
-	elif [ "x${complex_return}" = "xintel" ]; then
+	elif [[ ${complex_return} = intel ]]; then
 		complex_return_intel01='1'
 	else
 		echo "${script_name}: unknown complex return type \"${complex_return}\"! Cannot continue."
@@ -4059,7 +4065,7 @@ main()
 	version_esc=$(echo "${version}" | sed 's/\//\\\//g')
 
 	# Create a #define for the configuration family (config_name).
-	uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]')
+	uconf=$(echo "${config_name}" | tr '[:lower:]' '[:upper:]')
 	config_name_define="#define BLIS_FAMILY_${uconf}\n"
 
 	# Create a list of #defines, one for each configuration in config_list.
@@ -4067,7 +4073,7 @@ main()
 	for conf in ${config_list}; do
 
 		# Convert the current config name to uppercase.
-		uconf=$(echo ${conf} | tr '[:lower:]' '[:upper:]')
+		uconf=$(echo "${conf}" | tr '[:lower:]' '[:upper:]')
 
 		# Create a #define and add it to the running list.
 		config_define="BLIS_CONFIG_${uconf}"
@@ -4079,7 +4085,7 @@ main()
 	for kern in ${kernel_list}; do
 
 		# Convert the current config name to uppercase.
-		uconf=$(echo ${kern} | tr '[:lower:]' '[:upper:]')
+		uconf=$(echo "${kern}" | tr '[:lower:]' '[:upper:]')
 
 		# Create a #define and add it to the running list.
 		kernel_define="BLIS_KERNELS_${uconf}"
@@ -4127,60 +4133,59 @@ main()
 	# Begin substituting information into the config_mk_in file, outputting
 	# to config_mk_out.
 	echo "${script_name}: creating ${config_mk_out_path} from ${config_mk_in_path}"
-	cat "${config_mk_in_path}" \
-		| sed -e "s/@version@/${version_esc}/g" \
-		| sed -e "s/@so_version_major@/${so_version_major}/g" \
-		| sed -e "s/@so_version_minorbuild@/${so_version_minorbuild}/g" \
-		| sed -e "s/@config_name@/${config_name}/g" \
-		| sed -e "s/@config_list@/${config_list}/g" \
-		| sed -e "s/@kernel_list@/${kernel_list}/g" \
-		| sed -e "s/@kconfig_map@/${kconfig_map}/g" \
-		| sed -e "s/@os_name@/${os_name_esc}/g" \
-		| sed -e "s/@is_win@/${is_win}/g" \
-		| sed -e "s/@is_msvc@/${is_msvc}/g" \
-		| sed -e "s/@dist_path@/${dist_path_esc}/g" \
-		| sed -e "s/@CC_VENDOR@/${cc_vendor}/g" \
-		| sed -e "s/@gcc_older_than_4_9_0@/${gcc_older_than_4_9_0}/g" \
-		| sed -e "s/@gcc_older_than_6_1_0@/${gcc_older_than_6_1_0}/g" \
-		| sed -e "s/@gcc_older_than_9_1_0@/${gcc_older_than_9_1_0}/g" \
-		| sed -e "s/@gcc_older_than_10_3_0@/${gcc_older_than_10_3_0}/g" \
-		| sed -e "s/@clang_older_than_9_0_0@/${clang_older_than_9_0_0}/g" \
-		| sed -e "s/@clang_older_than_12_0_0@/${clang_older_than_12_0_0}/g" \
-		| sed -e "s/@aocc_older_than_2_0_0@/${aocc_older_than_2_0_0}/g" \
-		| sed -e "s/@aocc_older_than_3_0_0@/${aocc_older_than_3_0_0}/g" \
-		| sed -e "s/@CC@/${cc_esc}/g" \
-		| sed -e "s/@CXX@/${cxx_esc}/g" \
-		| sed -e "s/@AR@/${ar_esc}/g" \
-		| sed -e "s/@RANLIB@/${ranlib_esc}/g" \
-		| sed -e "s/@PYTHON@/${python_esc}/g" \
-		| sed -e "s/@libpthread@/${libpthread_esc}/g" \
-		| sed -e "s/@cflags_preset@/${cflags_preset_esc}/g" \
-		| sed -e "s/@ldflags_preset@/${ldflags_preset_esc}/g" \
-		| sed -e "s/@enable_asan@/${enable_asan}/g" \
-		| sed -e "s/@debug_type@/${debug_type}/g" \
-		| sed -e "s/@enable_debug@/${enable_debug}/g" \
-		| sed -e "s/@enable_system@/${enable_system}/g" \
-		| sed -e "s/@threading_model@/${threading_model}/g" \
-		| sed -e "s/@prefix@/${prefix_esc}/g" \
-		| sed -e "s/@exec_prefix@/${exec_prefix_esc}/g" \
-		| sed -e "s/@libdir@/${libdir_esc}/g" \
-		| sed -e "s/@includedir@/${includedir_esc}/g" \
-		| sed -e "s/@sharedir@/${sharedir_esc}/g" \
-		| sed -e "s/@enable_verbose@/${enable_verbose}/g" \
-		| sed -e "s/@configured_oot@/${configured_oot}/g" \
-		| sed -e "s/@enable_arg_max_hack@/${enable_arg_max_hack}/g" \
-		| sed -e "s/@enable_static@/${enable_static}/g" \
-		| sed -e "s/@enable_shared@/${enable_shared}/g" \
-		| sed -e "s/@enable_rpath@/${enable_rpath}/g" \
-		| sed -e "s/@export_shared@/${export_shared}/g" \
-		| sed -e "s/@enable_blas@/${enable_blas}/g" \
-		| sed -e "s/@enable_cblas@/${enable_cblas}/g" \
-		| sed -e "s/@enable_amd_frame_tweaks@/${enable_amd_frame_tweaks}/g" \
-		| sed -e "s/@enable_memkind@/${enable_memkind}/g" \
-		| sed -e "s/@pragma_omp_simd@/${pragma_omp_simd}/g" \
-		| sed -e "s/@addon_list@/${addon_list}/g" \
-		| sed -e "s/@sandbox@/${sandbox}/g" \
-		> "${config_mk_out_path}"
+	sed <"${config_mk_in_path}" >"${config_mk_out_path}"          \
+	-e "s/@version@/${version_esc}/g"                             \
+	-e "s/@so_version_major@/${so_version_major}/g"               \
+	-e "s/@so_version_minorbuild@/${so_version_minorbuild}/g"     \
+	-e "s/@config_name@/${config_name}/g"                         \
+	-e "s/@config_list@/${config_list}/g"                         \
+	-e "s/@kernel_list@/${kernel_list}/g"                         \
+	-e "s/@kconfig_map@/${kconfig_map}/g"                         \
+	-e "s/@os_name@/${os_name_esc}/g"                             \
+	-e "s/@is_win@/${is_win}/g"                                   \
+	-e "s/@is_msvc@/${is_msvc}/g"                                 \
+	-e "s/@dist_path@/${dist_path_esc}/g"                         \
+	-e "s/@CC_VENDOR@/${cc_vendor}/g"                             \
+	-e "s/@gcc_older_than_4_9_0@/${gcc_older_than_4_9_0}/g"       \
+	-e "s/@gcc_older_than_6_1_0@/${gcc_older_than_6_1_0}/g"       \
+	-e "s/@gcc_older_than_9_1_0@/${gcc_older_than_9_1_0}/g"       \
+	-e "s/@gcc_older_than_10_3_0@/${gcc_older_than_10_3_0}/g"     \
+	-e "s/@clang_older_than_9_0_0@/${clang_older_than_9_0_0}/g"   \
+	-e "s/@clang_older_than_12_0_0@/${clang_older_than_12_0_0}/g" \
+	-e "s/@aocc_older_than_2_0_0@/${aocc_older_than_2_0_0}/g"     \
+	-e "s/@aocc_older_than_3_0_0@/${aocc_older_than_3_0_0}/g"     \
+	-e "s/@CC@/${cc_esc}/g"                                       \
+	-e "s/@CXX@/${cxx_esc}/g"                                     \
+	-e "s/@AR@/${ar_esc}/g"                                       \
+	-e "s/@RANLIB@/${ranlib_esc}/g"                               \
+	-e "s/@PYTHON@/${python_esc}/g"                               \
+	-e "s/@libpthread@/${libpthread_esc}/g"                       \
+	-e "s/@cflags_preset@/${cflags_preset_esc}/g"                 \
+	-e "s/@ldflags_preset@/${ldflags_preset_esc}/g"               \
+	-e "s/@enable_asan@/${enable_asan}/g"                         \
+	-e "s/@debug_type@/${debug_type}/g"                           \
+	-e "s/@enable_debug@/${enable_debug}/g"                       \
+	-e "s/@enable_system@/${enable_system}/g"                     \
+	-e "s/@threading_model@/${threading_model}/g"                 \
+	-e "s/@prefix@/${prefix_esc}/g"                               \
+	-e "s/@exec_prefix@/${exec_prefix_esc}/g"                     \
+	-e "s/@libdir@/${libdir_esc}/g"                               \
+	-e "s/@includedir@/${includedir_esc}/g"                       \
+	-e "s/@sharedir@/${sharedir_esc}/g"                           \
+	-e "s/@enable_verbose@/${enable_verbose}/g"                   \
+	-e "s/@configured_oot@/${configured_oot}/g"                   \
+	-e "s/@enable_arg_max_hack@/${enable_arg_max_hack}/g"         \
+	-e "s/@enable_static@/${enable_static}/g"                     \
+	-e "s/@enable_shared@/${enable_shared}/g"                     \
+	-e "s/@enable_rpath@/${enable_rpath}/g"                       \
+	-e "s/@export_shared@/${export_shared}/g"                     \
+	-e "s/@enable_blas@/${enable_blas}/g"                         \
+	-e "s/@enable_cblas@/${enable_cblas}/g"                       \
+	-e "s/@enable_amd_frame_tweaks@/${enable_amd_frame_tweaks}/g" \
+	-e "s/@enable_memkind@/${enable_memkind}/g"                   \
+	-e "s/@pragma_omp_simd@/${pragma_omp_simd}/g"                 \
+	-e "s/@addon_list@/${addon_list}/g"                           \
+	-e "s/@sandbox@/${sandbox}/g"
 
 	# -- Instantiate bli_config.h file from template ---------------------------
 
@@ -4190,38 +4195,38 @@ main()
 	# intuitively, which was used when constructing ${config_name_define},
 	# ${config_list_defines}, and ${kernel_list_defines}.
 	echo "${script_name}: creating ${bli_config_h_out_path} from ${bli_config_h_in_path}"
-	cat "${bli_config_h_in_path}" \
-		| perl -pe "s/\@config_name_define\@/${config_name_define}/g" \
-		| perl -pe "s/\@config_list_defines\@/${config_list_defines}/g" \
-		| perl -pe "s/\@kernel_list_defines\@/${kernel_list_defines}/g" \
-		| sed   -e "s/@version@/${version_esc}/g" \
-		| sed   -e "s/@enable_system@/${enable_system_01}/g" \
-		| sed   -e "s/@enable_openmp@/${enable_openmp_01}/g" \
-		| sed   -e "s/@enable_openmp_as_def@/${enable_openmp_as_def_01}/g" \
-		| sed   -e "s/@enable_pthreads@/${enable_pthreads_01}/g" \
-		| sed   -e "s/@enable_pthreads_as_def@/${enable_pthreads_as_def_01}/g" \
-		| sed   -e "s/@enable_hpx@/${enable_hpx_01}/g" \
-		| sed   -e "s/@enable_hpx_as_def@/${enable_hpx_as_def_01}/g" \
-		| sed   -e "s/@enable_jrir_rr@/${enable_jrir_rr_01}/g" \
-		| sed   -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \
-		| sed   -e "s/@enable_jrir_tlb@/${enable_jrir_tlb_01}/g" \
-		| sed   -e "s/@enable_pba_pools@/${enable_pba_pools_01}/g" \
-		| sed   -e "s/@enable_sba_pools@/${enable_sba_pools_01}/g" \
-		| sed   -e "s/@enable_mem_tracing@/${enable_mem_tracing_01}/g" \
-		| sed   -e "s/@int_type_size@/${int_type_size}/g" \
-		| sed   -e "s/@blas_int_type_size@/${blas_int_type_size}/g" \
-		| sed   -e "s/@enable_blas@/${enable_blas_01}/g" \
-		| sed   -e "s/@enable_cblas@/${enable_cblas_01}/g" \
-		| sed   -e "s/@enable_mixed_dt@/${enable_mixed_dt_01}/g" \
-		| sed   -e "s/@enable_mixed_dt_extra_mem@/${enable_mixed_dt_extra_mem_01}/g" \
-		| sed   -e "s/@enable_sup_handling@/${enable_sup_handling_01}/g" \
-		| sed   -e "s/@enable_memkind@/${enable_memkind_01}/g" \
-		| sed   -e "s/@enable_trsm_preinversion@/${enable_trsm_preinversion_01}/g" \
-		| sed   -e "s/@enable_pragma_omp_simd@/${enable_pragma_omp_simd_01}/g" \
-		| sed   -e "s/@enable_sandbox@/${enable_sandbox_01}/g" \
-		| sed   -e "s/@enable_shared@/${enable_shared_01}/g" \
-		| sed   -e "s/@complex_return_intel@/${complex_return_intel01}/g" \
-		> "${bli_config_h_out_path}"
+	<"${bli_config_h_in_path}" perl -p                                   \
+	-e "s/\@config_name_define\@/${config_name_define}/g;"               \
+	-e "s/\@config_list_defines\@/${config_list_defines}/g;"             \
+	-e "s/\@kernel_list_defines\@/${kernel_list_defines}/g;"             \
+	| sed >"${bli_config_h_out_path}"                                    \
+	-e "s/@version@/${version_esc}/g"                                    \
+	-e "s/@enable_system@/${enable_system_01}/g"                         \
+	-e "s/@enable_openmp@/${enable_openmp_01}/g"                         \
+	-e "s/@enable_openmp_as_def@/${enable_openmp_as_def_01}/g"           \
+	-e "s/@enable_pthreads@/${enable_pthreads_01}/g"                     \
+	-e "s/@enable_pthreads_as_def@/${enable_pthreads_as_def_01}/g"       \
+	-e "s/@enable_hpx@/${enable_hpx_01}/g"                               \
+	-e "s/@enable_hpx_as_def@/${enable_hpx_as_def_01}/g"                 \
+	-e "s/@enable_jrir_rr@/${enable_jrir_rr_01}/g"                       \
+	-e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g"                   \
+	-e "s/@enable_jrir_tlb@/${enable_jrir_tlb_01}/g"                     \
+	-e "s/@enable_pba_pools@/${enable_pba_pools_01}/g"                   \
+	-e "s/@enable_sba_pools@/${enable_sba_pools_01}/g"                   \
+	-e "s/@enable_mem_tracing@/${enable_mem_tracing_01}/g"               \
+	-e "s/@int_type_size@/${int_type_size}/g"                            \
+	-e "s/@blas_int_type_size@/${blas_int_type_size}/g"                  \
+	-e "s/@enable_blas@/${enable_blas_01}/g"                             \
+	-e "s/@enable_cblas@/${enable_cblas_01}/g"                           \
+	-e "s/@enable_mixed_dt@/${enable_mixed_dt_01}/g"                     \
+	-e "s/@enable_mixed_dt_extra_mem@/${enable_mixed_dt_extra_mem_01}/g" \
+	-e "s/@enable_sup_handling@/${enable_sup_handling_01}/g"             \
+	-e "s/@enable_memkind@/${enable_memkind_01}/g"                       \
+	-e "s/@enable_trsm_preinversion@/${enable_trsm_preinversion_01}/g"   \
+	-e "s/@enable_pragma_omp_simd@/${enable_pragma_omp_simd_01}/g"       \
+	-e "s/@enable_sandbox@/${enable_sandbox_01}/g"                       \
+	-e "s/@enable_shared@/${enable_shared_01}/g"                         \
+	-e "s/@complex_return_intel@/${complex_return_intel01}/g"
 
 	# -- Instantiate bli_addon.h file from template ----------------------------
 
@@ -4230,10 +4235,8 @@ main()
 	# of sed used on OS X is old and does not handle the '\n' character
 	# intuitively, which was used when constructing ${addon_list_includes}.
 	echo "${script_name}: creating ${bli_addon_h_out_path} from ${bli_addon_h_in_path}"
-	cat "${bli_addon_h_in_path}" \
-		| perl -pe "s/\@addon_list_includes\@/${addon_list_includes}/g" \
-		| sed   -e "s/@enable_addons@/${enable_addons_01}/g" \
-		> "${bli_addon_h_out_path}"
+	perl -pe "s/\@addon_list_includes\@/${addon_list_includes}/g" "${bli_addon_h_in_path}" \
+	| sed -e "s/@enable_addons@/${enable_addons_01}/g" > "${bli_addon_h_out_path}"
 
 	# -- Create top-level object directories -----------------------------------
 
@@ -4241,40 +4244,40 @@ main()
 	base_obj_dirpath="${obj_dirpath}/${config_name}"
 
 	echo "${script_name}: creating ${base_obj_dirpath}"
-	mkdir -p ${base_obj_dirpath}
+	mkdir -p "${base_obj_dirpath}"
 
 
 	obj_config_dirpath="${base_obj_dirpath}/${config_dir}"
 
-	mkdir -p ${obj_config_dirpath}
+	mkdir -p "${obj_config_dirpath}"
 	for conf in ${config_list}; do
 		echo "${script_name}: creating ${obj_config_dirpath}/${conf}"
-		mkdir -p ${obj_config_dirpath}/${conf}
+		mkdir -p "${obj_config_dirpath}/${conf}"
 	done
 
 
 	obj_kernels_dirpath="${base_obj_dirpath}/${kernels_dir}"
 
-	mkdir -p ${obj_kernels_dirpath}
+	mkdir -p "${obj_kernels_dirpath}"
 	for kern in ${kernel_list}; do
 		echo "${script_name}: creating ${obj_kernels_dirpath}/${kern}"
-		mkdir -p ${obj_kernels_dirpath}/${kern}
+		mkdir -p "${obj_kernels_dirpath}/${kern}"
 	done
 
 
 	obj_refkern_dirpath="${base_obj_dirpath}/${refkern_dir}"
 
-	mkdir -p ${obj_refkern_dirpath}
+	mkdir -p "${obj_refkern_dirpath}"
 	for conf in ${config_list}; do
 		echo "${script_name}: creating ${obj_refkern_dirpath}/${conf}"
-		mkdir -p ${obj_refkern_dirpath}/${conf}
+		mkdir -p "${obj_refkern_dirpath}/${conf}"
 	done
 
 
 	obj_frame_dirpath="${base_obj_dirpath}/${frame_dir}"
 
 	echo "${script_name}: creating ${obj_frame_dirpath}"
-	mkdir -p ${obj_frame_dirpath}
+	mkdir -p "${obj_frame_dirpath}"
 
 
 	if [ -n "${addon_flag}" ]; then
@@ -4283,7 +4286,7 @@ main()
 
 		for addon in ${addon_list}; do
 			echo "${script_name}: creating ${obj_addon_dirpath}/${addon}"
-			mkdir -p ${obj_addon_dirpath}/${addon}
+			mkdir -p "${obj_addon_dirpath}/${addon}"
 		done
 	fi
 
@@ -4293,34 +4296,34 @@ main()
 		obj_sandbox_dirpath="${base_obj_dirpath}/${sandbox_dir}"
 
 		echo "${script_name}: creating ${obj_sandbox_dirpath}/${sandbox}"
-		mkdir -p ${obj_sandbox_dirpath}/${sandbox}
+		mkdir -p "${obj_sandbox_dirpath}/${sandbox}"
 	fi
 
 
 	obj_blastest_dirpath="${base_obj_dirpath}/${blastest_dir}"
 
 	echo "${script_name}: creating ${obj_blastest_dirpath}"
-	mkdir -p ${obj_blastest_dirpath}
+	mkdir -p "${obj_blastest_dirpath}"
 
 
 	obj_testsuite_dirpath="${base_obj_dirpath}/${testsuite_dir}"
 
 	echo "${script_name}: creating ${obj_testsuite_dirpath}"
-	mkdir -p ${obj_testsuite_dirpath}
+	mkdir -p "${obj_testsuite_dirpath}"
 
 
 	# Create lib directory (if it does not already exist).
 	base_lib_dirpath="${lib_dirpath}/${config_name}"
 
 	echo "${script_name}: creating ${base_lib_dirpath}"
-	mkdir -p ${base_lib_dirpath}
+	mkdir -p "${base_lib_dirpath}"
 
 
 	# Create include directory (if it does not already exist).
 	base_include_dirpath="${include_dirpath}/${config_name}"
 
 	echo "${script_name}: creating ${base_include_dirpath}"
-	mkdir -p ${base_include_dirpath}
+	mkdir -p "${base_include_dirpath}"
 
 
 	# -- Mirror source directory hierarchies to object directories -------------
@@ -4332,7 +4335,7 @@ main()
 	for conf in ${config_list_plus_name}; do
 
 		echo "${script_name}: mirroring ${config_dirpath}/${conf} to ${obj_config_dirpath}/${conf}"
-		${mirror_tree_sh} "${config_dirpath}/${conf}" "${obj_config_dirpath}/${conf}"
+		"${mirror_tree_sh}" "${config_dirpath}/${conf}" "${obj_config_dirpath}/${conf}"
 	done
 
 	# Mirror optimized kernels source tree to its object sub-directory.
@@ -4357,147 +4360,95 @@ main()
 
 	# Mirror reference kernel source tree to its object sub-directory.
 	echo "${script_name}: mirroring ${refkern_dirpath} to ${obj_refkern_dirpath}"
-	${mirror_tree_sh} ${refkern_dirpath} ${obj_refkern_dirpath}
+	"${mirror_tree_sh}" "${refkern_dirpath}" "${obj_refkern_dirpath}"
 
 	# Mirror reference kernels source tree to its object sub-directory.
 	for conf in ${config_list}; do
 
 		echo "${script_name}: mirroring ${refkern_dirpath} to ${obj_refkern_dirpath}/${conf}"
-		${mirror_tree_sh} "${refkern_dirpath}" "${obj_refkern_dirpath}/${conf}"
+		"${mirror_tree_sh}" "${refkern_dirpath}" "${obj_refkern_dirpath}/${conf}"
 	done
 
 	# Mirror framework source tree to its object sub-directory.
 	echo "${script_name}: mirroring ${frame_dirpath} to ${obj_frame_dirpath}"
-	${mirror_tree_sh} ${frame_dirpath} ${obj_frame_dirpath}
+	"${mirror_tree_sh}" "${frame_dirpath}" "${obj_frame_dirpath}"
 
 	# Mirror the chosen addon source tree to its object sub-directory.
-	if [ -n "${addon_flag}" ]; then
+	if [[ -n ${addon_flag} ]]; then
 
 		for addon in ${addon_list}; do
 
 			echo "${script_name}: mirroring ${addon_dirpath}/${addon} to ${obj_addon_dirpath}/${addon}"
-			${mirror_tree_sh} "${addon_dirpath}/${addon}" "${obj_addon_dirpath}/${addon}"
+			"${mirror_tree_sh}" "${addon_dirpath}/${addon}" "${obj_addon_dirpath}/${addon}"
 		done
 	fi
 
 	# Mirror the chosen sandbox source tree to its object sub-directory.
-	if [ -n "${sandbox_flag}" ]; then
+	if [[ -n ${sandbox_flag} ]]; then
 
 		echo "${script_name}: mirroring ${sandbox_dirpath}/${sandbox} to ${obj_sandbox_dirpath}/${sandbox}"
-		${mirror_tree_sh} "${sandbox_dirpath}/${sandbox}" "${obj_sandbox_dirpath}/${sandbox}"
+		"${mirror_tree_sh}" "${sandbox_dirpath}/${sandbox}" "${obj_sandbox_dirpath}/${sandbox}"
 	fi
 
 
 	# -- Generate makefile fragements ------------------------------------------
 
+	create_makefile_fragment() {
+		echo "${script_name}: creating makefile fragments in $3"
+		"${gen_make_frags_sh}"                           \
+			-h -r -v0                                \
+			-o "${script_name}"                      \
+			-p "$1" "$2" "$3"                        \
+			"${gen_make_frags_dirpath}/fragment.mk"  \
+			"${gen_make_frags_dirpath}/suffix_list"  \
+			"${gen_make_frags_dirpath}/ignore_list"
+	}
+
 	clist_contains_cname=$(is_in_list "${config_name}" "${config_list}")
 
 	# If the config_list does not already contain the config_name (i.e.,
 	# if config_name is an umbrella family), generate makefiles in that
 	# directory. (In the next step, we will loop over the actual sub-
 	# configurations and create fragments there as well.)
-	if [ "${clist_contains_cname}" == "false" ]; then
-
-		echo "${script_name}: creating makefile fragments in ${obj_config_dirpath}/${config_name}"
-		${gen_make_frags_sh} \
-				 -h -r -v0 \
-				 -o ${script_name} \
-				 -p 'CONFIG' \
-				 ${config_dirpath}/${config_name} \
-				 ${obj_config_dirpath}/${config_name} \
-				 ${gen_make_frags_dirpath}/fragment.mk \
-				 ${gen_make_frags_dirpath}/suffix_list \
-				 ${gen_make_frags_dirpath}/ignore_list
+	if [[ ${clist_contains_cname} = false ]]; then
+		create_makefile_fragment CONFIG "${config_dirpath}/${config_name}" \
+		                         "${obj_config_dirpath}/${config_name}"
 	fi
 
 	# Generate makefile fragments for each of the sub-configurations present
 	# in the configuration list.
 	for conf in ${config_list}; do
-
-		echo "${script_name}: creating makefile fragments in ${obj_config_dirpath}/${conf}"
-		${gen_make_frags_sh} \
-				 -h -r -v0 \
-				 -o ${script_name} \
-				 -p 'CONFIG' \
-				 ${config_dirpath}/${conf} \
-				 ${obj_config_dirpath}/${conf} \
-				 ${gen_make_frags_dirpath}/fragment.mk \
-				 ${gen_make_frags_dirpath}/suffix_list \
-				 ${gen_make_frags_dirpath}/ignore_list
+		create_makefile_fragment CONFIG "${config_dirpath}/${conf}" \
+		                         "${obj_config_dirpath}/${conf}"
 	done
 
 	# Generate makefile fragments for each of the kernel sets required by
 	# the configuration list (in the kernel list).
 	for kern in ${kernel_list}; do
-
-		echo "${script_name}: creating makefile fragments in ${obj_kernels_dirpath}/${kern}"
-		${gen_make_frags_sh} \
-				 -h -r -v0 \
-				 -o ${script_name} \
-				 -p 'KERNELS' \
-				 ${kernels_dirpath}/${kern} \
-				 ${obj_kernels_dirpath}/${kern} \
-				 ${gen_make_frags_dirpath}/fragment.mk \
-				 ${gen_make_frags_dirpath}/suffix_list \
-				 ${gen_make_frags_dirpath}/ignore_list
+		create_makefile_fragment KERNELS "${kernels_dirpath}/${kern}" \
+		                         "${obj_kernels_dirpath}/${kern}"
 	done
 
 	# Generate makefile fragments in the reference kernels directory.
-	echo "${script_name}: creating makefile fragments in ${obj_refkern_dirpath}"
-	${gen_make_frags_sh} \
-			 -h -r -v0 \
-			 -o ${script_name} \
-			 -p 'REFKERN' \
-			 ${refkern_dirpath} \
-			 ${obj_refkern_dirpath} \
-			 ${gen_make_frags_dirpath}/fragment.mk \
-			 ${gen_make_frags_dirpath}/suffix_list \
-			 ${gen_make_frags_dirpath}/ignore_list
+	create_makefile_fragment REFKERN "${refkern_dirpath}" \
+	                         "${obj_refkern_dirpath}"
 
 	# Generate makefile fragments in the framework directory.
-	echo "${script_name}: creating makefile fragments in ${obj_frame_dirpath}"
-	${gen_make_frags_sh} \
-			 -h -r -v0 \
-			 -o ${script_name} \
-			 -p 'FRAME' \
-			 ${frame_dirpath} \
-			 ${obj_frame_dirpath} \
-			 ${gen_make_frags_dirpath}/fragment.mk \
-			 ${gen_make_frags_dirpath}/suffix_list \
-			 ${gen_make_frags_dirpath}/ignore_list
+	create_makefile_fragment FRAME "${frame_dirpath}" \
+	                         "${obj_frame_dirpath}"
 
 	# Generate makefile fragments in the addon sub-directory.
-	if [ -n "${addon_flag}" ]; then
-
+	if [[ -n ${addon_flag} ]]; then
 		for addon in ${addon_list}; do
-
-			echo "${script_name}: creating makefile fragments in ${obj_addon_dirpath}/${addon}"
-			${gen_make_frags_sh} \
-					 -h -r -v0 \
-					 -o ${script_name} \
-					 -p 'ADDON' \
-					 ${addon_dirpath}/${addon} \
-					 ${obj_addon_dirpath}/${addon} \
-					 ${gen_make_frags_dirpath}/fragment.mk \
-					 ${gen_make_frags_dirpath}/suffix_list \
-					 ${gen_make_frags_dirpath}/ignore_list
+			create_makefile_fragment ADDON "${addon_dirpath}/${addon}" \
+			                         "${obj_addon_dirpath}/${addon}"
 		done
 	fi
 
-
 	# Generate makefile fragments in the sandbox sub-directory.
-	if [ -n "${sandbox_flag}" ]; then
-
-		echo "${script_name}: creating makefile fragments in ${obj_sandbox_dirpath}/${sandbox}"
-		${gen_make_frags_sh} \
-				 -h -r -v0 \
-				 -o ${script_name} \
-				 -p 'SANDBOX' \
-				 ${sandbox_dirpath}/${sandbox} \
-				 ${obj_sandbox_dirpath}/${sandbox} \
-				 ${gen_make_frags_dirpath}/fragment.mk \
-				 ${gen_make_frags_dirpath}/suffix_list \
-				 ${gen_make_frags_dirpath}/ignore_list
+	if [[ -n ${sandbox_flag} ]]; then
+		create_makefile_fragment SANDBOX "${sandbox_dirpath}/${sandbox}" \
+		                         "${obj_sandbox_dirpath}/${sandbox}"
 	fi
 
 
@@ -4505,75 +4456,23 @@ main()
 
 	# Under some circumstances, we need to create some symbolic links to
 	# properly handle out-of-tree builds.
-	if [ "${configured_oot}" = "yes" ]; then
-
-		# If 'Makefile' symlink does not already exist in the current
-		# directory, create a symbolic link to it. If one does exist, we
-		# use -f to force creation of a new link.
-		if [ ! -e "./Makefile" ]; then
-
-			echo "${script_name}: creating symbolic link to Makefile."
-			ln -s "${dist_path}/Makefile"
-
-		elif [ -h "./Makefile" ]; then
-			echo "${script_name}: symbolic link to Makefile already exists; forcing creation of new link."
-			ln -sf "${dist_path}/Makefile"
-		else
-			echo "${script_name}: Non-symbolic link file or directory 'Makefile' blocks creation of symlink."
-			echo "${script_name}: *** Please remove this entity and re-run configure."
-			exit 1
-		fi
-
-		# If 'blis.pc.in' symlink does not already exist in the current
-		# directory, create a symbolic link to it. If one does exist, we
-		# use -f to force creation of a new link.
-		if [ ! -e "./blis.pc.in" ]; then
-
-			echo "${script_name}: creating symbolic link to blis.pc.in."
-			ln -s "${dist_path}/blis.pc.in"
-
-		elif [ -h "./blis.pc.in" ]; then
-			echo "${script_name}: symbolic link to blis.pc.in already exists; forcing creation of new link."
-			ln -sf "${dist_path}/blis.pc.in"
-		else
-			echo "${script_name}: Non-symbolic link file or directory 'blis.pc.in' blocks creation of symlink."
-			echo "${script_name}: *** Please remove this entity and re-run configure."
-			exit 1
-		fi
-
-		# If 'common.mk' symlink does not already exist in the current
-		# directory, create a symbolic link to it. If one does exist, we
-		# use -f to force creation of a new link.
-		if [ ! -e "./common.mk" ]; then
-
-			echo "${script_name}: creating symbolic link to common.mk."
-			ln -s "${dist_path}/common.mk"
-
-		elif [ -h "./common.mk" ]; then
-			echo "${script_name}: symbolic link to common.mk already exists; forcing creation of new link."
-			ln -sf "${dist_path}/common.mk"
-		else
-			echo "${script_name}: Non-symbolic link file or directory 'common.mk' blocks creation of symlink."
-			echo "${script_name}: *** Please remove this entity and re-run configure."
-			exit 1
-		fi
-
-		# If 'config' symlink does not already exist in the current
-		# directory, create a symbolic link to it. If one does exist, we
-		# use -f to force creation of a new link.
-		if [ ! -e "./config" ]; then
-
-			echo "${script_name}: creating symbolic link to 'config' directory."
-			ln -s "${dist_path}/config"
-
-		elif [ -h "./config" ]; then
-			echo "${script_name}: symbolic link to 'config' directory already exists; forcing creation of new link."
-			ln -sf "${dist_path}/config"
-		else
-			echo "${script_name}: Non-symbolic link file or directory 'config' blocks creation of symlink."
-			echo "${script_name}: *** Please remove this entity and re-run configure."
-			exit 1
-		fi
+	if [[ ${configured_oot} = yes ]]; then
+		for file in Makefile blis.pc.in common.mk config; do
+			# If ${file} does not already exist in the current directory,
+			# create a symbolic link to ${dist_path}/${file}. If a symlink
+			# does already exist, use -f to force creation of a new link.
+			if [[ ! -e ${file} ]]; then
+				echo "${script_name}: creating symbolic link to ${file}."
+				ln -s "${dist_path}/${file}" .
+			elif [[ -h ${file} ]]; then
+				echo "${script_name}: symbolic link to ${file} already exists; forcing creation of new link."
+				ln -sf "${dist_path}/${file}" .
+			else
+				echo "${script_name}: Non-symbolic link file or directory '${file}' blocks creation of symlink."
+				echo "${script_name}: *** Please remove this entity and re-run configure."
+				exit 1
+			fi
+		done
 
 		echo "${script_name}: configured to build outside of source distribution."
 	else
@@ -4601,4 +4500,3 @@ main()
 
 # The script's main entry point, passing all parameters given.
 main "$@"
-

From 5f841307f668f65b7ed5a479bd8374d2581208cf Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 24 Mar 2023 20:05:13 -0500
Subject: [PATCH 139/230] Omit -fPIC if shared library build is disabled.
 (#732)

Details:
- Updated common.mk so that when the --disable-shared option is given
  to configure:
  1. The -fPIC compiler flag is omitted from the individual
     configuration family members' CPICFLAGS variables (which are
     initialized in each subconfig's make_defs.mk file); and
  2. The BUILD_SYMFLAGS variable, which contains compiler flags needed
     to control the symbol export behavior, is left blank.
- The net result of these changes is that flags specific to shared
  library builds are only used when a shared library is actually
  scheduled to be built. Thanks to Nick Knight for reporting this issue.
- CREDITS file update.
---
 CREDITS   |  1 +
 common.mk | 11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/CREDITS b/CREDITS
index 628e14f58..9cc846d5c 100644
--- a/CREDITS
+++ b/CREDITS
@@ -56,6 +56,7 @@ but many others have contributed code, ideas, and feedback, including
   Tony Kelman              @tkelman
   Lee Killough             @leekillough               (Cray)
   Mike Kistler             @mkistler                  (IBM, Austin Research Laboratory)
+  Nick Knight              @nick-knight               (SiFive)
   Ivan Korostelev          @ivan23kor                 (University of Alberta)
   Kyungmin Lee             @kyungminlee               (Ohio State University)
   Michael Lehn             @michael-lehn
diff --git a/common.mk b/common.mk
index 3a8cb9886..2fdb0aa10 100644
--- a/common.mk
+++ b/common.mk
@@ -731,6 +731,9 @@ $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CWARNFLAGS,$(c))))
 
 # --- Position-independent code flags (shared libraries only) ---
 
+
+ifeq ($(MK_ENABLE_SHARED),yes)
+
 # Emit position-independent code for dynamic linking.
 ifeq ($(IS_MSVC),yes)
 # Note: Don't use any fPIC flags for Windows builds since all code is position-
@@ -814,6 +817,14 @@ endif
 endif
 endif
 
+else #ifeq ($(MK_ENABLE_SHARED),no)
+
+# Don't modify CPICFLAGS for the various configuration family members.
+# Don't use any special symbol export flags.
+BUILD_SYMFLAGS :=
+
+endif
+
 # --- Language flags ---
 
 # Enable C99.

From 04090df01175477394d1e73af2e5769751d47cd6 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 27 Mar 2023 14:13:10 -0500
Subject: [PATCH 140/230] Fixed compile errors with `BLIS_DISABLE_BLAS_DEFS`.
 (#730)

* Fixed compile errors with BLIS_DISABLE_BLAS_DEFS.

Details:
- This commit fixes a compile-time error related to the type definition
  (prototype) of dsdot_() when BLIS_DISABLE_BLAS_DEFS is defined by the
  application (or the configuration), which is actually a symptom of a
  larger design issue when disabling BLAS prototypes. The macro was
  intended to allow applications to bring their own BLAS prototypes and
  suppress the inclusion of duplicate (or possibly conflicting)
  prototypes within blis.h. However, prototypes are still needed during
  compilation even if they are ultimately omitted from blis.h. The
  problem is that almost every source file in BLIS--including the BLAS
  compatibility layer--only includes one header (blis.h), and if we
  were to #include a new header in the BLAS source files (to isolate
  only the BLAS prototypes), we would also have to make the build system
  aware of the location of those headers. Thanks to Edward Smyth of AMD
  for reporting this issue.
- The solution I settled upon was to remove all cpp guards from all BLAS
  headers (by changing them to #if 1, for easy search-and-replace
  anchoring in the future if we ever need to re-insert guards) and
  modifying bli_blas.h so that the BLAS prototypes are #included if
  either (a) BLIS_ENABLE_BLAS_DEFS is defined, or (b)
  BLIS_ENABLE_BLAS_DEFS is *not* defined but BLIS_IS_BUILDING_LIBRARY
  *is* defined. (Thanks to Devin Matthews for steering me away from an
  inferior solution.)
- This commit also spins off the actual BLAS prototypes/definitions to
  a separate file, bli_blas_defs.h.
- CREDITS file update.
---
 CREDITS                               |   1 +
 frame/compat/bla_amax.h               |   3 +
 frame/compat/bla_asum.h               |   3 +
 frame/compat/bla_axpy.h               |   3 +
 frame/compat/bla_copy.h               |   3 +
 frame/compat/bla_dot.h                |   2 +-
 frame/compat/bla_gemm.h               |   3 +
 frame/compat/bla_gemv.h               |   3 +
 frame/compat/bla_ger.h                |   3 +
 frame/compat/bla_hemm.h               |   3 +
 frame/compat/bla_hemv.h               |   3 +
 frame/compat/bla_her.h                |   3 +
 frame/compat/bla_her2.h               |   3 +
 frame/compat/bla_her2k.h              |   3 +
 frame/compat/bla_herk.h               |   3 +
 frame/compat/bla_nrm2.h               |   3 +
 frame/compat/bla_scal.h               |   3 +
 frame/compat/bla_swap.h               |   3 +
 frame/compat/bla_symm.h               |   3 +
 frame/compat/bla_symv.h               |   3 +
 frame/compat/bla_syr.h                |   3 +
 frame/compat/bla_syr2.h               |   3 +
 frame/compat/bla_syr2k.h              |   3 +
 frame/compat/bla_syrk.h               |   3 +
 frame/compat/bla_trmm.h               |   3 +
 frame/compat/bla_trmv.h               |   3 +
 frame/compat/bla_trsm.h               |   3 +
 frame/compat/bla_trsv.h               |   3 +
 frame/compat/bli_blas.h               | 187 +++---------------------
 frame/compat/bli_blas_defs.h          | 197 ++++++++++++++++++++++++++
 frame/compat/check/bla_gemm3m_check.h |   2 +-
 frame/compat/check/bla_gemm_check.h   |   2 +-
 frame/compat/check/bla_gemmt_check.h  |   2 +-
 frame/compat/check/bla_gemv_check.h   |   2 +-
 frame/compat/check/bla_ger_check.h    |   2 +-
 frame/compat/check/bla_hemm_check.h   |   2 +-
 frame/compat/check/bla_hemv_check.h   |   2 +-
 frame/compat/check/bla_her2_check.h   |   2 +-
 frame/compat/check/bla_her2k_check.h  |   2 +-
 frame/compat/check/bla_her_check.h    |   2 +-
 frame/compat/check/bla_herk_check.h   |   2 +-
 frame/compat/check/bla_symm_check.h   |   2 +-
 frame/compat/check/bla_symv_check.h   |   2 +-
 frame/compat/check/bla_syr2_check.h   |   2 +-
 frame/compat/check/bla_syr2k_check.h  |   2 +-
 frame/compat/check/bla_syr_check.h    |   2 +-
 frame/compat/check/bla_syrk_check.h   |   2 +-
 frame/compat/check/bla_trmm_check.h   |   2 +-
 frame/compat/check/bla_trmv_check.h   |   2 +-
 frame/compat/check/bla_trsm_check.h   |   2 +-
 frame/compat/check/bla_trsv_check.h   |   2 +-
 frame/compat/f2c/bla_cabs1.h          |   2 +-
 frame/compat/f2c/bla_gbmv.h           |   2 +-
 frame/compat/f2c/bla_hbmv.h           |   2 +-
 frame/compat/f2c/bla_hpmv.h           |   2 +-
 frame/compat/f2c/bla_hpr.h            |   2 +-
 frame/compat/f2c/bla_hpr2.h           |   2 +-
 frame/compat/f2c/bla_lsame.h          |   2 +-
 frame/compat/f2c/bla_rot.h            |   2 +-
 frame/compat/f2c/bla_rotg.h           |   2 +-
 frame/compat/f2c/bla_rotm.h           |   2 +-
 frame/compat/f2c/bla_rotmg.h          |   2 +-
 frame/compat/f2c/bla_sbmv.h           |   2 +-
 frame/compat/f2c/bla_spmv.h           |   2 +-
 frame/compat/f2c/bla_spr.h            |   2 +-
 frame/compat/f2c/bla_spr2.h           |   2 +-
 frame/compat/f2c/bla_tbmv.h           |   2 +-
 frame/compat/f2c/bla_tbsv.h           |   2 +-
 frame/compat/f2c/bla_tpmv.h           |   2 +-
 frame/compat/f2c/bla_tpsv.h           |   2 +-
 frame/compat/f2c/bla_xerbla.h         |   2 +-
 frame/compat/f2c/bla_xerbla_array.h   |   2 +-
 frame/compat/f2c/util/bla_c_abs.h     |   2 +-
 frame/compat/f2c/util/bla_c_div.h     |   2 +-
 frame/compat/f2c/util/bla_d_abs.h     |   2 +-
 frame/compat/f2c/util/bla_d_cnjg.h    |   2 +-
 frame/compat/f2c/util/bla_d_imag.h    |   2 +-
 frame/compat/f2c/util/bla_d_sign.h    |   2 +-
 frame/compat/f2c/util/bla_f__cabs.h   |   2 +-
 frame/compat/f2c/util/bla_r_abs.h     |   2 +-
 frame/compat/f2c/util/bla_r_cnjg.h    |   2 +-
 frame/compat/f2c/util/bla_r_imag.h    |   2 +-
 frame/compat/f2c/util/bla_r_sign.h    |   2 +-
 frame/compat/f2c/util/bla_z_abs.h     |   2 +-
 frame/compat/f2c/util/bla_z_div.h     |   2 +-
 85 files changed, 354 insertions(+), 221 deletions(-)
 create mode 100644 frame/compat/bli_blas_defs.h

diff --git a/CREDITS b/CREDITS
index 9cc846d5c..689afd599 100644
--- a/CREDITS
+++ b/CREDITS
@@ -104,6 +104,7 @@ but many others have contributed code, ideas, and feedback, including
   Nathaniel Smith          @njsmith
   Shaden Smith             @ShadenSmith
   Tyler Smith              @tlrmchlsmth               (The University of Texas at Austin)
+  Edward Smyth             @edwsmyth                  (AMD)
   Snehith                  @ArcadioN09
   Paul Springer            @springer13                (RWTH Aachen University)
   Adam J. Stewart          @adamjstewart              (University of Illinois at Urbana-Champaign)
diff --git a/frame/compat/bla_amax.h b/frame/compat/bla_amax.h
index 1f13715dc..e765ecfcb 100644
--- a/frame/compat/bla_amax.h
+++ b/frame/compat/bla_amax.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -49,3 +50,5 @@ BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
 INSERT_GENTPROT_BLAS( amax )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_asum.h b/frame/compat/bla_asum.h
index a9ef27a03..fd859f26b 100644
--- a/frame/compat/bla_asum.h
+++ b/frame/compat/bla_asum.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -49,3 +50,5 @@ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
 INSERT_GENTPROTR2_BLAS( asum )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_axpy.h b/frame/compat/bla_axpy.h
index 294a385c7..0a4a17c0f 100644
--- a/frame/compat/bla_axpy.h
+++ b/frame/compat/bla_axpy.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -51,3 +52,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROT_BLAS( axpy )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_copy.h b/frame/compat/bla_copy.h
index 679017b19..a4b3886dd 100644
--- a/frame/compat/bla_copy.h
+++ b/frame/compat/bla_copy.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -50,3 +51,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROT_BLAS( copy )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_dot.h b/frame/compat/bla_dot.h
index 87d773321..14221071e 100644
--- a/frame/compat/bla_dot.h
+++ b/frame/compat/bla_dot.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
diff --git a/frame/compat/bla_gemm.h b/frame/compat/bla_gemm.h
index 77111dbd8..7b14fc8df 100644
--- a/frame/compat/bla_gemm.h
+++ b/frame/compat/bla_gemm.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -57,3 +58,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROT_BLAS( gemm )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_gemv.h b/frame/compat/bla_gemv.h
index 22c8bf1c0..e5a9e12bb 100644
--- a/frame/compat/bla_gemv.h
+++ b/frame/compat/bla_gemv.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -55,3 +56,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROT_BLAS( gemv )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_ger.h b/frame/compat/bla_ger.h
index a31548f61..88517c015 100644
--- a/frame/compat/bla_ger.h
+++ b/frame/compat/bla_ger.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -53,3 +54,5 @@ BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \
 INSERT_GENTPROTDOT_BLAS( ger )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_hemm.h b/frame/compat/bla_hemm.h
index 711877ede..61084d21f 100644
--- a/frame/compat/bla_hemm.h
+++ b/frame/compat/bla_hemm.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -56,3 +57,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROTCO_BLAS( hemm )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_hemv.h b/frame/compat/bla_hemv.h
index 4e8230114..b2218f560 100644
--- a/frame/compat/bla_hemv.h
+++ b/frame/compat/bla_hemv.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -54,3 +55,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROTCO_BLAS( hemv )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_her.h b/frame/compat/bla_her.h
index b9ae30d90..f4e9665f1 100644
--- a/frame/compat/bla_her.h
+++ b/frame/compat/bla_her.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -52,3 +53,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROTCO_BLAS( her )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_her2.h b/frame/compat/bla_her2.h
index 7cf0bb867..431cbe32d 100644
--- a/frame/compat/bla_her2.h
+++ b/frame/compat/bla_her2.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -53,3 +54,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROTCO_BLAS( her2 )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_her2k.h b/frame/compat/bla_her2k.h
index c771f78d4..50b36b36b 100644
--- a/frame/compat/bla_her2k.h
+++ b/frame/compat/bla_her2k.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -56,3 +57,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROTCO_BLAS( her2k )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_herk.h b/frame/compat/bla_herk.h
index e649a74ab..5b80a3d92 100644
--- a/frame/compat/bla_herk.h
+++ b/frame/compat/bla_herk.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -55,3 +56,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROTCO_BLAS( herk )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_nrm2.h b/frame/compat/bla_nrm2.h
index a8bc25ef4..fb5955356 100644
--- a/frame/compat/bla_nrm2.h
+++ b/frame/compat/bla_nrm2.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -49,3 +50,5 @@ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
 INSERT_GENTPROTR2_BLAS( nrm2 )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_scal.h b/frame/compat/bla_scal.h
index c8e898b6b..ef55118bf 100644
--- a/frame/compat/bla_scal.h
+++ b/frame/compat/bla_scal.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -50,3 +51,5 @@ BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
 INSERT_GENTPROTSCAL_BLAS( scal )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_swap.h b/frame/compat/bla_swap.h
index 54c0613a9..e1bb7eb7c 100644
--- a/frame/compat/bla_swap.h
+++ b/frame/compat/bla_swap.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -50,3 +51,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROT_BLAS( swap )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_symm.h b/frame/compat/bla_symm.h
index b186e4b43..cf2464e08 100644
--- a/frame/compat/bla_symm.h
+++ b/frame/compat/bla_symm.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -56,3 +57,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROT_BLAS( symm )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_symv.h b/frame/compat/bla_symv.h
index 9d1662fad..2f493a9d9 100644
--- a/frame/compat/bla_symv.h
+++ b/frame/compat/bla_symv.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -54,3 +55,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROTRO_BLAS( symv )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_syr.h b/frame/compat/bla_syr.h
index 0d2a1e031..662d07328 100644
--- a/frame/compat/bla_syr.h
+++ b/frame/compat/bla_syr.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -52,3 +53,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROTRO_BLAS( syr )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_syr2.h b/frame/compat/bla_syr2.h
index b45876794..1b44a669e 100644
--- a/frame/compat/bla_syr2.h
+++ b/frame/compat/bla_syr2.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -53,3 +54,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROTRO_BLAS( syr2 )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_syr2k.h b/frame/compat/bla_syr2k.h
index 91d9a3acf..385af3596 100644
--- a/frame/compat/bla_syr2k.h
+++ b/frame/compat/bla_syr2k.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -56,3 +57,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROT_BLAS( syr2k )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_syrk.h b/frame/compat/bla_syrk.h
index b6ca938a6..8cfcc2eab 100644
--- a/frame/compat/bla_syrk.h
+++ b/frame/compat/bla_syrk.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -55,3 +56,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROT_BLAS( syrk )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_trmm.h b/frame/compat/bla_trmm.h
index 4f0c20b1b..0e71ea6c4 100644
--- a/frame/compat/bla_trmm.h
+++ b/frame/compat/bla_trmm.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -56,3 +57,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROT_BLAS( trmm )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_trmv.h b/frame/compat/bla_trmv.h
index 4096ffe79..8b9a68a86 100644
--- a/frame/compat/bla_trmv.h
+++ b/frame/compat/bla_trmv.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -53,3 +54,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROT_BLAS( trmv )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_trsm.h b/frame/compat/bla_trsm.h
index 5694db52a..9d6cf2602 100644
--- a/frame/compat/bla_trsm.h
+++ b/frame/compat/bla_trsm.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -56,3 +57,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROT_BLAS( trsm )
 #endif
 
+#endif
+
diff --git a/frame/compat/bla_trsv.h b/frame/compat/bla_trsv.h
index 6edb435f1..f0dc7ff42 100644
--- a/frame/compat/bla_trsv.h
+++ b/frame/compat/bla_trsv.h
@@ -32,6 +32,7 @@
 
 */
 
+#if 1
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -53,3 +54,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 INSERT_GENTPROT_BLAS( trsv )
 #endif
 
+#endif
+
diff --git a/frame/compat/bli_blas.h b/frame/compat/bli_blas.h
index c88a2e3c3..fca75e0f9 100644
--- a/frame/compat/bli_blas.h
+++ b/frame/compat/bli_blas.h
@@ -33,13 +33,15 @@
 
 */
 
+#ifndef BLIS_BLAS_H
+#define BLIS_BLAS_H
+
+
 // If the CBLAS compatibility layer was enabled while the BLAS layer
 // was not enabled, we must enable it here.
-#ifdef BLIS_ENABLE_CBLAS
-#ifndef BLIS_ENABLE_BLAS
-#define BLIS_ENABLE_BLAS
+#if defined(BLIS_ENABLE_CBLAS) && !defined(BLIS_ENABLE_BLAS)
+  #define BLIS_ENABLE_BLAS
 #endif
-#endif // BLIS_ENABLE_CBLAS
 
 // By default, if the BLAS compatibility layer is enabled, we define
 // (include) all of the BLAS prototypes. However, if the user is
@@ -47,181 +49,36 @@
 // declares the BLAS functions, then we provide an opportunity to
 // #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below).
 #ifdef BLIS_ENABLE_BLAS
-#define BLIS_ENABLE_BLAS_DEFS
+  #define BLIS_ENABLE_BLAS_DEFS
 #else
-#undef  BLIS_ENABLE_BLAS_DEFS
+  #undef  BLIS_ENABLE_BLAS_DEFS
 #endif
 
 // Skip prototyping all of the BLAS if the BLAS test drivers are being
 // compiled.
 #ifdef BLIS_VIA_BLASTEST
-#undef BLIS_ENABLE_BLAS_DEFS
+  #undef BLIS_ENABLE_BLAS_DEFS
 #endif
 
 // Skip prototyping all of the BLAS if the environment has defined the
 // macro BLIS_DISABLE_BLAS_DEFS.
 #ifdef BLIS_DISABLE_BLAS_DEFS
-#undef BLIS_ENABLE_BLAS_DEFS
+  #undef BLIS_ENABLE_BLAS_DEFS
 #endif
 
-// Begin including all BLAS prototypes.
+// Begin including all BLAS prototypes, if appropriate.
 #ifdef BLIS_ENABLE_BLAS_DEFS
+  // If BLIS_ENABLE_BLAS_DEFS is defined, then we should #include the BLAS
+  // prototypes.
+  #include "bli_blas_defs.h"
+#else
+  // Even if BLAS prototypes are not to be #included into blis.h, we still
+  // need to #include the prototypes when compiling BLIS.
+  #ifdef BLIS_IS_BUILDING_LIBRARY
+    #include "bli_blas_defs.h"
+  #endif
+#endif
 
 
-// -- System headers needed by BLAS compatibility layer --
-
-#include <ctype.h>  // for toupper(), used in xerbla()
-
-
-// -- Constants --
-
-#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)
-
-
-// -- Utility macros --
-
-#include "bla_r_sign.h"
-#include "bla_d_sign.h"
-
-#include "bla_r_cnjg.h"
-#include "bla_d_cnjg.h"
-
-#include "bla_r_imag.h"
-#include "bla_d_imag.h"
-
-#include "bla_c_div.h"
-#include "bla_z_div.h"
-
-#include "bla_f__cabs.h" // needed by c_abs, z_abs
-#include "bla_r_abs.h"
-#include "bla_d_abs.h"
-#include "bla_c_abs.h"
-#include "bla_z_abs.h"
-
-#include "bla_lsame.h"
-#include "bla_xerbla.h"
-#include "bla_xerbla_array.h"
-
-
-// -- Level-0 BLAS prototypes --
-
-#include "bla_cabs1.h"
-
-
-// -- Level-1 BLAS prototypes --
-
-#include "bla_amax.h"
-#include "bla_asum.h"
-#include "bla_axpy.h"
-#include "bla_copy.h"
-#include "bla_dot.h"
-#include "bla_nrm2.h"
-#include "bla_rot.h"
-#include "bla_rotg.h"
-#include "bla_rotm.h"
-#include "bla_rotmg.h"
-#include "bla_scal.h"
-#include "bla_swap.h"
-
-#include "f77_amax_sub.h"
-#include "f77_asum_sub.h"
-#include "f77_dot_sub.h"
-#include "f77_nrm2_sub.h"
-
-
-// -- Level-2 BLAS prototypes --
-
-// dense
-
-#include "bla_gemv.h"
-#include "bla_ger.h"
-#include "bla_hemv.h"
-#include "bla_her.h"
-#include "bla_her2.h"
-#include "bla_symv.h"
-#include "bla_syr.h"
-#include "bla_syr2.h"
-#include "bla_trmv.h"
-#include "bla_trsv.h"
-
-#include "bla_gemv_check.h"
-#include "bla_ger_check.h"
-#include "bla_hemv_check.h"
-#include "bla_her_check.h"
-#include "bla_her2_check.h"
-#include "bla_symv_check.h"
-#include "bla_syr_check.h"
-#include "bla_syr2_check.h"
-#include "bla_trmv_check.h"
-#include "bla_trsv_check.h"
-
-// packed
-
-#include "bla_hpmv.h"
-#include "bla_hpr.h"
-#include "bla_hpr2.h"
-#include "bla_spmv.h"
-#include "bla_spr.h"
-#include "bla_spr2.h"
-#include "bla_tpmv.h"
-#include "bla_tpsv.h"
-
-// banded
-
-#include "bla_gbmv.h"
-#include "bla_hbmv.h"
-#include "bla_sbmv.h"
-#include "bla_tbmv.h"
-#include "bla_tbsv.h"
-
-
-// -- Level-3 BLAS prototypes --
-
-#include "bla_gemm.h"
-#include "bla_hemm.h"
-#include "bla_herk.h"
-#include "bla_her2k.h"
-#include "bla_symm.h"
-#include "bla_syrk.h"
-#include "bla_syr2k.h"
-#include "bla_trmm.h"
-#include "bla_trsm.h"
-
-#include "bla_gemm_check.h"
-#include "bla_hemm_check.h"
-#include "bla_herk_check.h"
-#include "bla_her2k_check.h"
-#include "bla_symm_check.h"
-#include "bla_syrk_check.h"
-#include "bla_syr2k_check.h"
-#include "bla_trmm_check.h"
-#include "bla_trsm_check.h"
-
-
-// -- BLAS extension prototypes --
-
-// unique to BLIS
-
-#include "bla_axpby.h"
-
-// level-3
-
-#include "bla_gemmt.h"
-#include "bla_gemmt_check.h"
-
-// batch
-
-#include "bla_gemm_batch.h"
-
-// 3m
-
-#include "bla_gemm3m.h"
-#include "bla_gemm3m_check.h"
-
-
-// -- Fortran-compatible APIs to BLIS functions --
-
-#include "b77_thread.h"
-
+#endif // BLIS_BLAS_H
 
-#endif // BLIS_ENABLE_BLAS
diff --git a/frame/compat/bli_blas_defs.h b/frame/compat/bli_blas_defs.h
new file mode 100644
index 000000000..bfcddda0e
--- /dev/null
+++ b/frame/compat/bli_blas_defs.h
@@ -0,0 +1,197 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_BLAS_DEFS_H
+#define BLIS_BLAS_DEFS_H
+
+
+// -- System headers needed by BLAS compatibility layer --
+
+#include <ctype.h>  // for toupper(), used in xerbla()
+
+
+// -- Constants --
+
+#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)
+
+
+// -- Utility macros --
+
+#include "bla_r_sign.h"
+#include "bla_d_sign.h"
+
+#include "bla_r_cnjg.h"
+#include "bla_d_cnjg.h"
+
+#include "bla_r_imag.h"
+#include "bla_d_imag.h"
+
+#include "bla_c_div.h"
+#include "bla_z_div.h"
+
+#include "bla_f__cabs.h" // needed by c_abs, z_abs
+#include "bla_r_abs.h"
+#include "bla_d_abs.h"
+#include "bla_c_abs.h"
+#include "bla_z_abs.h"
+
+#include "bla_lsame.h"
+#include "bla_xerbla.h"
+#include "bla_xerbla_array.h"
+
+
+// -- Level-0 BLAS prototypes --
+
+#include "bla_cabs1.h"
+
+
+// -- Level-1 BLAS prototypes --
+
+#include "bla_amax.h"
+#include "bla_asum.h"
+#include "bla_axpy.h"
+#include "bla_copy.h"
+#include "bla_dot.h"
+#include "bla_nrm2.h"
+#include "bla_rot.h"
+#include "bla_rotg.h"
+#include "bla_rotm.h"
+#include "bla_rotmg.h"
+#include "bla_scal.h"
+#include "bla_swap.h"
+
+#include "f77_amax_sub.h"
+#include "f77_asum_sub.h"
+#include "f77_dot_sub.h"
+#include "f77_nrm2_sub.h"
+
+
+// -- Level-2 BLAS prototypes --
+
+// dense
+
+#include "bla_gemv.h"
+#include "bla_ger.h"
+#include "bla_hemv.h"
+#include "bla_her.h"
+#include "bla_her2.h"
+#include "bla_symv.h"
+#include "bla_syr.h"
+#include "bla_syr2.h"
+#include "bla_trmv.h"
+#include "bla_trsv.h"
+
+#include "bla_gemv_check.h"
+#include "bla_ger_check.h"
+#include "bla_hemv_check.h"
+#include "bla_her_check.h"
+#include "bla_her2_check.h"
+#include "bla_symv_check.h"
+#include "bla_syr_check.h"
+#include "bla_syr2_check.h"
+#include "bla_trmv_check.h"
+#include "bla_trsv_check.h"
+
+// packed
+
+#include "bla_hpmv.h"
+#include "bla_hpr.h"
+#include "bla_hpr2.h"
+#include "bla_spmv.h"
+#include "bla_spr.h"
+#include "bla_spr2.h"
+#include "bla_tpmv.h"
+#include "bla_tpsv.h"
+
+// banded
+
+#include "bla_gbmv.h"
+#include "bla_hbmv.h"
+#include "bla_sbmv.h"
+#include "bla_tbmv.h"
+#include "bla_tbsv.h"
+
+
+// -- Level-3 BLAS prototypes --
+
+#include "bla_gemm.h"
+#include "bla_hemm.h"
+#include "bla_herk.h"
+#include "bla_her2k.h"
+#include "bla_symm.h"
+#include "bla_syrk.h"
+#include "bla_syr2k.h"
+#include "bla_trmm.h"
+#include "bla_trsm.h"
+
+#include "bla_gemm_check.h"
+#include "bla_hemm_check.h"
+#include "bla_herk_check.h"
+#include "bla_her2k_check.h"
+#include "bla_symm_check.h"
+#include "bla_syrk_check.h"
+#include "bla_syr2k_check.h"
+#include "bla_trmm_check.h"
+#include "bla_trsm_check.h"
+
+
+// -- BLAS extension prototypes --
+
+// unique to BLIS
+
+#include "bla_axpby.h"
+
+// level-3
+
+#include "bla_gemmt.h"
+#include "bla_gemmt_check.h"
+
+// batch
+
+#include "bla_gemm_batch.h"
+
+// 3m
+
+#include "bla_gemm3m.h"
+#include "bla_gemm3m_check.h"
+
+
+// -- Fortran-compatible APIs to BLIS functions --
+
+#include "b77_thread.h"
+
+
+#endif // BLIS_BLAS_DEFS_H
+
diff --git a/frame/compat/check/bla_gemm3m_check.h b/frame/compat/check/bla_gemm3m_check.h
index f565b5d29..f4ede64c8 100644
--- a/frame/compat/check/bla_gemm3m_check.h
+++ b/frame/compat/check/bla_gemm3m_check.h
@@ -33,7 +33,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
 { \
diff --git a/frame/compat/check/bla_gemm_check.h b/frame/compat/check/bla_gemm_check.h
index f500e092e..4ee47040e 100644
--- a/frame/compat/check/bla_gemm_check.h
+++ b/frame/compat/check/bla_gemm_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
 { \
diff --git a/frame/compat/check/bla_gemmt_check.h b/frame/compat/check/bla_gemmt_check.h
index 93908e07d..a447210a3 100644
--- a/frame/compat/check/bla_gemmt_check.h
+++ b/frame/compat/check/bla_gemmt_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
 { \
diff --git a/frame/compat/check/bla_gemv_check.h b/frame/compat/check/bla_gemv_check.h
index e827c048f..67e718b55 100644
--- a/frame/compat/check/bla_gemv_check.h
+++ b/frame/compat/check/bla_gemv_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
 { \
diff --git a/frame/compat/check/bla_ger_check.h b/frame/compat/check/bla_ger_check.h
index cdf008d8f..44e51df32 100644
--- a/frame/compat/check/bla_ger_check.h
+++ b/frame/compat/check/bla_ger_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
 { \
diff --git a/frame/compat/check/bla_hemm_check.h b/frame/compat/check/bla_hemm_check.h
index a450391c0..5e5884d4f 100644
--- a/frame/compat/check/bla_hemm_check.h
+++ b/frame/compat/check/bla_hemm_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \
 { \
diff --git a/frame/compat/check/bla_hemv_check.h b/frame/compat/check/bla_hemv_check.h
index d5865f2eb..014e28e0f 100644
--- a/frame/compat/check/bla_hemv_check.h
+++ b/frame/compat/check/bla_hemv_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
 { \
diff --git a/frame/compat/check/bla_her2_check.h b/frame/compat/check/bla_her2_check.h
index 5be7299f2..3eb873c94 100644
--- a/frame/compat/check/bla_her2_check.h
+++ b/frame/compat/check/bla_her2_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
 { \
diff --git a/frame/compat/check/bla_her2k_check.h b/frame/compat/check/bla_her2k_check.h
index bdca4019b..bfaa19f81 100644
--- a/frame/compat/check/bla_her2k_check.h
+++ b/frame/compat/check/bla_her2k_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
 { \
diff --git a/frame/compat/check/bla_her_check.h b/frame/compat/check/bla_her_check.h
index e1a21709e..7734d5b9e 100644
--- a/frame/compat/check/bla_her_check.h
+++ b/frame/compat/check/bla_her_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
 { \
diff --git a/frame/compat/check/bla_herk_check.h b/frame/compat/check/bla_herk_check.h
index 029ad38fe..8c48cf123 100644
--- a/frame/compat/check/bla_herk_check.h
+++ b/frame/compat/check/bla_herk_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
 { \
diff --git a/frame/compat/check/bla_symm_check.h b/frame/compat/check/bla_symm_check.h
index 14ae3e1bf..b5cb0f807 100644
--- a/frame/compat/check/bla_symm_check.h
+++ b/frame/compat/check/bla_symm_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_symm_check bla_hemm_check
 
diff --git a/frame/compat/check/bla_symv_check.h b/frame/compat/check/bla_symv_check.h
index 712b90b76..eeab93536 100644
--- a/frame/compat/check/bla_symv_check.h
+++ b/frame/compat/check/bla_symv_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_symv_check bla_hemv_check
 
diff --git a/frame/compat/check/bla_syr2_check.h b/frame/compat/check/bla_syr2_check.h
index a5b682020..0351a1fa5 100644
--- a/frame/compat/check/bla_syr2_check.h
+++ b/frame/compat/check/bla_syr2_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_syr2_check bla_her2_check
 
diff --git a/frame/compat/check/bla_syr2k_check.h b/frame/compat/check/bla_syr2k_check.h
index d290d3f8b..a1e1f2eb8 100644
--- a/frame/compat/check/bla_syr2k_check.h
+++ b/frame/compat/check/bla_syr2k_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
 { \
diff --git a/frame/compat/check/bla_syr_check.h b/frame/compat/check/bla_syr_check.h
index 41070a035..fae8cc1df 100644
--- a/frame/compat/check/bla_syr_check.h
+++ b/frame/compat/check/bla_syr_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_syr_check bla_her_check
 
diff --git a/frame/compat/check/bla_syrk_check.h b/frame/compat/check/bla_syrk_check.h
index ea140682c..9332e61d4 100644
--- a/frame/compat/check/bla_syrk_check.h
+++ b/frame/compat/check/bla_syrk_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
 { \
diff --git a/frame/compat/check/bla_trmm_check.h b/frame/compat/check/bla_trmm_check.h
index 5dba6b051..ab7036d56 100644
--- a/frame/compat/check/bla_trmm_check.h
+++ b/frame/compat/check/bla_trmm_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \
 { \
diff --git a/frame/compat/check/bla_trmv_check.h b/frame/compat/check/bla_trmv_check.h
index 108a2c70b..67e6e28ee 100644
--- a/frame/compat/check/bla_trmv_check.h
+++ b/frame/compat/check/bla_trmv_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
 { \
diff --git a/frame/compat/check/bla_trsm_check.h b/frame/compat/check/bla_trsm_check.h
index 7f30cec0f..39a387e71 100644
--- a/frame/compat/check/bla_trsm_check.h
+++ b/frame/compat/check/bla_trsm_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_trsm_check bla_trmm_check
 
diff --git a/frame/compat/check/bla_trsv_check.h b/frame/compat/check/bla_trsv_check.h
index 68f690664..497374932 100644
--- a/frame/compat/check/bla_trsv_check.h
+++ b/frame/compat/check/bla_trsv_check.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #define bla_trsv_check bla_trmv_check
 
diff --git a/frame/compat/f2c/bla_cabs1.h b/frame/compat/f2c/bla_cabs1.h
index 753765a1d..e0313daa1 100644
--- a/frame/compat/f2c/bla_cabs1.h
+++ b/frame/compat/f2c/bla_cabs1.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS bla_real   PASTEF77(s,cabs1)(bla_scomplex *z);
 BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);
diff --git a/frame/compat/f2c/bla_gbmv.h b/frame/compat/f2c/bla_gbmv.h
index eb8ce2534..67b9e5c3f 100644
--- a/frame/compat/f2c/bla_gbmv.h
+++ b/frame/compat/f2c/bla_gbmv.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
 BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
diff --git a/frame/compat/f2c/bla_hbmv.h b/frame/compat/f2c/bla_hbmv.h
index 1ddb83807..cf610ba73 100644
--- a/frame/compat/f2c/bla_hbmv.h
+++ b/frame/compat/f2c/bla_hbmv.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
 BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);
diff --git a/frame/compat/f2c/bla_hpmv.h b/frame/compat/f2c/bla_hpmv.h
index 26d055eff..24b1a1965 100644
--- a/frame/compat/f2c/bla_hpmv.h
+++ b/frame/compat/f2c/bla_hpmv.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *ap, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
 BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *ap, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);
diff --git a/frame/compat/f2c/bla_hpr.h b/frame/compat/f2c/bla_hpr.h
index cfce9e177..1d4bb56ae 100644
--- a/frame/compat/f2c/bla_hpr.h
+++ b/frame/compat/f2c/bla_hpr.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(c,hpr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_scomplex *x, const bla_integer *incx, bla_scomplex *ap);
 BLIS_EXPORT_BLAS int PASTEF77(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_dcomplex *x, const bla_integer *incx, bla_dcomplex *ap);
diff --git a/frame/compat/f2c/bla_hpr2.h b/frame/compat/f2c/bla_hpr2.h
index 16f929d61..91bbe3749 100644
--- a/frame/compat/f2c/bla_hpr2.h
+++ b/frame/compat/f2c/bla_hpr2.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *y, const bla_integer *incy, bla_scomplex *ap);
 BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *y, const bla_integer *incy, bla_dcomplex *ap);
diff --git a/frame/compat/f2c/bla_lsame.h b/frame/compat/f2c/bla_lsame.h
index 656032688..0d00ca0ba 100644
--- a/frame/compat/f2c/bla_lsame.h
+++ b/frame/compat/f2c/bla_lsame.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 #ifdef LAPACK_ILP64
 long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
diff --git a/frame/compat/f2c/bla_rot.h b/frame/compat/f2c/bla_rot.h
index 609355560..ca4a4f9ac 100644
--- a/frame/compat/f2c/bla_rot.h
+++ b/frame/compat/f2c/bla_rot.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s);
 BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s);
diff --git a/frame/compat/f2c/bla_rotg.h b/frame/compat/f2c/bla_rotg.h
index b968ebbea..e9cc9ceb9 100644
--- a/frame/compat/f2c/bla_rotg.h
+++ b/frame/compat/f2c/bla_rotg.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s);
 BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s);
diff --git a/frame/compat/f2c/bla_rotm.h b/frame/compat/f2c/bla_rotm.h
index 21906358b..569220978 100644
--- a/frame/compat/f2c/bla_rotm.h
+++ b/frame/compat/f2c/bla_rotm.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam);
 BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam);
diff --git a/frame/compat/f2c/bla_rotmg.h b/frame/compat/f2c/bla_rotmg.h
index 63e9710da..54d429587 100644
--- a/frame/compat/f2c/bla_rotmg.h
+++ b/frame/compat/f2c/bla_rotmg.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam);
 BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam);
diff --git a/frame/compat/f2c/bla_sbmv.h b/frame/compat/f2c/bla_sbmv.h
index c3f3fc24f..a23c61f97 100644
--- a/frame/compat/f2c/bla_sbmv.h
+++ b/frame/compat/f2c/bla_sbmv.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
 BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);
diff --git a/frame/compat/f2c/bla_spmv.h b/frame/compat/f2c/bla_spmv.h
index 7db7d4a8b..32f1fb1e2 100644
--- a/frame/compat/f2c/bla_spmv.h
+++ b/frame/compat/f2c/bla_spmv.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(d,spmv)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *ap, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
 BLIS_EXPORT_BLAS int PASTEF77(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *ap, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);
diff --git a/frame/compat/f2c/bla_spr.h b/frame/compat/f2c/bla_spr.h
index 6712d7c16..fbb31da8b 100644
--- a/frame/compat/f2c/bla_spr.h
+++ b/frame/compat/f2c/bla_spr.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(d,spr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, bla_double *ap);
 BLIS_EXPORT_BLAS int PASTEF77(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, bla_real *ap);
diff --git a/frame/compat/f2c/bla_spr2.h b/frame/compat/f2c/bla_spr2.h
index 5a1d60747..89422a546 100644
--- a/frame/compat/f2c/bla_spr2.h
+++ b/frame/compat/f2c/bla_spr2.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(d,spr2)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, const bla_double *y, const bla_integer *incy, bla_double *ap);
 BLIS_EXPORT_BLAS int PASTEF77(s,spr2)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, const bla_real *y, const bla_integer *incy, bla_real *ap);
diff --git a/frame/compat/f2c/bla_tbmv.h b/frame/compat/f2c/bla_tbmv.h
index f34654762..809dccb15 100644
--- a/frame/compat/f2c/bla_tbmv.h
+++ b/frame/compat/f2c/bla_tbmv.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);
 BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);
diff --git a/frame/compat/f2c/bla_tbsv.h b/frame/compat/f2c/bla_tbsv.h
index 5e84f5c36..3653a6a10 100644
--- a/frame/compat/f2c/bla_tbsv.h
+++ b/frame/compat/f2c/bla_tbsv.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);
 BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);
diff --git a/frame/compat/f2c/bla_tpmv.h b/frame/compat/f2c/bla_tpmv.h
index 2376ecfe3..800fbd568 100644
--- a/frame/compat/f2c/bla_tpmv.h
+++ b/frame/compat/f2c/bla_tpmv.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);
 BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);
diff --git a/frame/compat/f2c/bla_tpsv.h b/frame/compat/f2c/bla_tpsv.h
index 77bd55979..3cf5e80ff 100644
--- a/frame/compat/f2c/bla_tpsv.h
+++ b/frame/compat/f2c/bla_tpsv.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);
 BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);
diff --git a/frame/compat/f2c/bla_xerbla.h b/frame/compat/f2c/bla_xerbla.h
index f9f0a4641..15635ddad 100644
--- a/frame/compat/f2c/bla_xerbla.h
+++ b/frame/compat/f2c/bla_xerbla.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);
 
diff --git a/frame/compat/f2c/bla_xerbla_array.h b/frame/compat/f2c/bla_xerbla_array.h
index 6a4b4e059..b6248c029 100644
--- a/frame/compat/f2c/bla_xerbla_array.h
+++ b/frame/compat/f2c/bla_xerbla_array.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);
 
diff --git a/frame/compat/f2c/util/bla_c_abs.h b/frame/compat/f2c/util/bla_c_abs.h
index b4eb510dd..432e53aa4 100644
--- a/frame/compat/f2c/util/bla_c_abs.h
+++ b/frame/compat/f2c/util/bla_c_abs.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 double bla_c_abs(const bla_scomplex *z);
 
diff --git a/frame/compat/f2c/util/bla_c_div.h b/frame/compat/f2c/util/bla_c_div.h
index 14497715d..8887407f0 100644
--- a/frame/compat/f2c/util/bla_c_div.h
+++ b/frame/compat/f2c/util/bla_c_div.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp);
 
diff --git a/frame/compat/f2c/util/bla_d_abs.h b/frame/compat/f2c/util/bla_d_abs.h
index e9b3f1dc1..e97e1fa63 100644
--- a/frame/compat/f2c/util/bla_d_abs.h
+++ b/frame/compat/f2c/util/bla_d_abs.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 double bla_d_abs(const bla_double *x);
 
diff --git a/frame/compat/f2c/util/bla_d_cnjg.h b/frame/compat/f2c/util/bla_d_cnjg.h
index 38c810910..4beed81fa 100644
--- a/frame/compat/f2c/util/bla_d_cnjg.h
+++ b/frame/compat/f2c/util/bla_d_cnjg.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src);
 
diff --git a/frame/compat/f2c/util/bla_d_imag.h b/frame/compat/f2c/util/bla_d_imag.h
index 913b84c16..bfed3e465 100644
--- a/frame/compat/f2c/util/bla_d_imag.h
+++ b/frame/compat/f2c/util/bla_d_imag.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 double bla_d_imag(const bla_dcomplex *z);
 
diff --git a/frame/compat/f2c/util/bla_d_sign.h b/frame/compat/f2c/util/bla_d_sign.h
index 25076140c..d2a78c957 100644
--- a/frame/compat/f2c/util/bla_d_sign.h
+++ b/frame/compat/f2c/util/bla_d_sign.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 double bla_d_sign(const bla_double *a, const bla_double *b);
 
diff --git a/frame/compat/f2c/util/bla_f__cabs.h b/frame/compat/f2c/util/bla_f__cabs.h
index ffa439518..92b603678 100644
--- a/frame/compat/f2c/util/bla_f__cabs.h
+++ b/frame/compat/f2c/util/bla_f__cabs.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 double bla_f__cabs(double real, double imag);
 
diff --git a/frame/compat/f2c/util/bla_r_abs.h b/frame/compat/f2c/util/bla_r_abs.h
index 636c0ed21..b5d3aab03 100644
--- a/frame/compat/f2c/util/bla_r_abs.h
+++ b/frame/compat/f2c/util/bla_r_abs.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 double bla_r_abs(const bla_real *x);
 
diff --git a/frame/compat/f2c/util/bla_r_cnjg.h b/frame/compat/f2c/util/bla_r_cnjg.h
index 5ee38843f..65b68d02e 100644
--- a/frame/compat/f2c/util/bla_r_cnjg.h
+++ b/frame/compat/f2c/util/bla_r_cnjg.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src);
 
diff --git a/frame/compat/f2c/util/bla_r_imag.h b/frame/compat/f2c/util/bla_r_imag.h
index 6918660ed..685b8ab4c 100644
--- a/frame/compat/f2c/util/bla_r_imag.h
+++ b/frame/compat/f2c/util/bla_r_imag.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 bla_real bla_r_imag(const bla_scomplex *z);
 
diff --git a/frame/compat/f2c/util/bla_r_sign.h b/frame/compat/f2c/util/bla_r_sign.h
index 032377708..524db767e 100644
--- a/frame/compat/f2c/util/bla_r_sign.h
+++ b/frame/compat/f2c/util/bla_r_sign.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 double bla_r_sign(const bla_real *a, const bla_real *b);
 
diff --git a/frame/compat/f2c/util/bla_z_abs.h b/frame/compat/f2c/util/bla_z_abs.h
index b84b073fe..49af56d78 100644
--- a/frame/compat/f2c/util/bla_z_abs.h
+++ b/frame/compat/f2c/util/bla_z_abs.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 double bla_z_abs(const bla_dcomplex *z);
 
diff --git a/frame/compat/f2c/util/bla_z_div.h b/frame/compat/f2c/util/bla_z_div.h
index bec56bb5f..3072d3f29 100644
--- a/frame/compat/f2c/util/bla_z_div.h
+++ b/frame/compat/f2c/util/bla_z_div.h
@@ -32,7 +32,7 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
+#if 1
 
 void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp);
 

From 9d778e0f7c94d8752dd578101e4fc6893a1f54ef Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 29 Mar 2023 17:36:49 -0500
Subject: [PATCH 141/230] Move -fPIC insertion to subconfigs' make_defs.mk.
 (#738)

* Move -fPIC insertion to subconfigs' make_defs.mk.

Details:
- Previously, common.mk was appending -fPIC to the CPICFLAGS variables
  set within the various subconfigurations' make_defs.mk files. This
  seemed somewhat unintuitive, and so now the -fPIC flag is assigned to
  the various subconfigs' CPICFLAGS variables in the respective
  make_defs.mk files.
- This commit also changes the logic in common.mk so that instead of
  appending, the variable is overwritten, but now *only* in the case
  of Windows (since apparently -fPIC needs to be omitted there). Thanks
  to Nick Knight for catching and reporting this weirdness.
---
 common.mk                         | 14 +++++---------
 config/a64fx/make_defs.mk         |  2 +-
 config/amd64/make_defs.mk         |  2 +-
 config/amd64_legacy/make_defs.mk  |  2 +-
 config/arm32/make_defs.mk         |  2 +-
 config/arm64/make_defs.mk         |  2 +-
 config/armsve/make_defs.mk        |  2 +-
 config/bgq/make_defs.mk           |  2 +-
 config/bulldozer/make_defs.mk     |  2 +-
 config/cortexa15/make_defs.mk     |  2 +-
 config/cortexa53/make_defs.mk     |  2 +-
 config/cortexa57/make_defs.mk     |  2 +-
 config/cortexa9/make_defs.mk      |  2 +-
 config/excavator/make_defs.mk     |  2 +-
 config/firestorm/make_defs.mk     |  2 +-
 config/generic/make_defs.mk       |  2 +-
 config/haswell/make_defs.mk       |  2 +-
 config/intel64/make_defs.mk       |  2 +-
 config/knc/make_defs.mk           |  2 +-
 config/knl/make_defs.mk           |  2 +-
 config/old/haswellbb/make_defs.mk |  2 +-
 config/old/pnacl/make_defs.mk     |  2 +-
 config/penryn/make_defs.mk        |  2 +-
 config/piledriver/make_defs.mk    |  2 +-
 config/power/make_defs.mk         |  2 +-
 config/power10/make_defs.mk       |  2 +-
 config/power7/make_defs.mk        |  2 +-
 config/power9/make_defs.mk        |  2 +-
 config/sandybridge/make_defs.mk   |  2 +-
 config/skx/make_defs.mk           |  2 +-
 config/steamroller/make_defs.mk   |  2 +-
 config/template/make_defs.mk      |  2 +-
 config/thunderx2/make_defs.mk     |  2 +-
 config/x86_64/make_defs.mk        |  2 +-
 config/zen/amd_config.mk          |  2 +-
 config/zen/make_defs.mk           |  2 +-
 config/zen2/make_defs.mk          |  2 +-
 config/zen3/make_defs.mk          |  2 +-
 config/zen3/make_defs.mk.old      |  2 +-
 39 files changed, 43 insertions(+), 47 deletions(-)

diff --git a/common.mk b/common.mk
index 2fdb0aa10..33f39d529 100644
--- a/common.mk
+++ b/common.mk
@@ -731,21 +731,17 @@ $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CWARNFLAGS,$(c))))
 
 # --- Position-independent code flags (shared libraries only) ---
 
-
-ifeq ($(MK_ENABLE_SHARED),yes)
-
-# Emit position-independent code for dynamic linking.
-ifeq ($(IS_MSVC),yes)
-# Note: Don't use any fPIC flags for Windows builds since all code is position-
+# Note: Avoid -fPIC flags for Windows builds since all code is position-
 # independent.
+ifeq ($(IS_MSVC),yes)
 CPICFLAGS :=
-else
-CPICFLAGS := -fPIC
 endif
-$(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CPICFLAGS,$(c))))
+$(foreach c, $(CONFIG_LIST_FAM), $(eval $(call store-var-for,CPICFLAGS,$(c))))
 
 # --- Symbol exporting flags (shared libraries only) ---
 
+ifeq ($(MK_ENABLE_SHARED),yes)
+
 # NOTE: These flags are only applied when building BLIS and not used by
 # applications that import BLIS compilation flags via the
 # $(get-user-cflags-for ...) function.
diff --git a/config/a64fx/make_defs.mk b/config/a64fx/make_defs.mk
index d6871fac3..5cc8162ba 100644
--- a/config/a64fx/make_defs.mk
+++ b/config/a64fx/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := a64fx
 # may specify additional flags here as needed.
 CPPROCFLAGS    := -D_GNU_SOURCE -D_A64FX
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/amd64/make_defs.mk b/config/amd64/make_defs.mk
index ebb7a569f..bbe4d8d5f 100644
--- a/config/amd64/make_defs.mk
+++ b/config/amd64/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := amd64
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/amd64_legacy/make_defs.mk b/config/amd64_legacy/make_defs.mk
index 37ccbdae2..914f533ae 100644
--- a/config/amd64_legacy/make_defs.mk
+++ b/config/amd64_legacy/make_defs.mk
@@ -48,7 +48,7 @@ THIS_CONFIG    := amd64_legacy
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/arm32/make_defs.mk b/config/arm32/make_defs.mk
index e6818a19d..ee9529638 100644
--- a/config/arm32/make_defs.mk
+++ b/config/arm32/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := arm32
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     := -mfloat-abi=hard -mfpu=neon
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/arm64/make_defs.mk b/config/arm64/make_defs.mk
index fc1a062e6..1f8c2e84b 100644
--- a/config/arm64/make_defs.mk
+++ b/config/arm64/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := arm64
 # may specify additional flags here as needed.
 CPPROCFLAGS    := -D_GNU_SOURCE
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/armsve/make_defs.mk b/config/armsve/make_defs.mk
index d3495efbb..340b52f31 100644
--- a/config/armsve/make_defs.mk
+++ b/config/armsve/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := armsve
 # may specify additional flags here as needed.
 CPPROCFLAGS    := -D_GNU_SOURCE
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk
index 0cbbf439d..fa4479956 100644
--- a/config/bgq/make_defs.mk
+++ b/config/bgq/make_defs.mk
@@ -58,7 +58,7 @@ CMISCFLAGS     := -fopenmp
 else
 $(error xlc or bgclang is required for this configuration.)
 endif
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     := -w
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk
index 1f80f2ab6..e3e208862 100644
--- a/config/bulldozer/make_defs.mk
+++ b/config/bulldozer/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := bulldozer
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/cortexa15/make_defs.mk b/config/cortexa15/make_defs.mk
index abbee599d..3a9a83b39 100644
--- a/config/cortexa15/make_defs.mk
+++ b/config/cortexa15/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := cortexa15
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     := -mfloat-abi=hard -mfpu=neon
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/cortexa53/make_defs.mk b/config/cortexa53/make_defs.mk
index b5b2220a6..6036ea55a 100644
--- a/config/cortexa53/make_defs.mk
+++ b/config/cortexa53/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := cortexa53
 # may specify additional flags here as needed.
 CPPROCFLAGS    := -D_GNU_SOURCE
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/cortexa57/make_defs.mk b/config/cortexa57/make_defs.mk
index 83565b8a7..d84f8538a 100644
--- a/config/cortexa57/make_defs.mk
+++ b/config/cortexa57/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := cortexa57
 # may specify additional flags here as needed.
 CPPROCFLAGS    := -D_GNU_SOURCE
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/cortexa9/make_defs.mk b/config/cortexa9/make_defs.mk
index ea9dc29ac..f5f19e530 100644
--- a/config/cortexa9/make_defs.mk
+++ b/config/cortexa9/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := cortexa9
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     := -mfloat-abi=hard -mfpu=neon
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/excavator/make_defs.mk b/config/excavator/make_defs.mk
index 6e73e6058..7977806b2 100644
--- a/config/excavator/make_defs.mk
+++ b/config/excavator/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := excavator
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/firestorm/make_defs.mk b/config/firestorm/make_defs.mk
index dc4286e6a..2353e0040 100644
--- a/config/firestorm/make_defs.mk
+++ b/config/firestorm/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := firestorm
 # may specify additional flags here as needed.
 CPPROCFLAGS    := -D_GNU_SOURCE
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/generic/make_defs.mk b/config/generic/make_defs.mk
index ee77b6cf0..b0dcec044 100644
--- a/config/generic/make_defs.mk
+++ b/config/generic/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := generic
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk
index a8135c107..6f7b5b49a 100644
--- a/config/haswell/make_defs.mk
+++ b/config/haswell/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := haswell
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/intel64/make_defs.mk b/config/intel64/make_defs.mk
index 95f21f6f9..3f62cef57 100644
--- a/config/intel64/make_defs.mk
+++ b/config/intel64/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := intel64
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/knc/make_defs.mk b/config/knc/make_defs.mk
index 0a1d43a64..243eb8f19 100644
--- a/config/knc/make_defs.mk
+++ b/config/knc/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := knc
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     := -mmic -fasm-blocks
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk
index d4b0da4aa..5458745b9 100644
--- a/config/knl/make_defs.mk
+++ b/config/knl/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := knl
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/old/haswellbb/make_defs.mk b/config/old/haswellbb/make_defs.mk
index 6752dde29..3e4868a1f 100644
--- a/config/old/haswellbb/make_defs.mk
+++ b/config/old/haswellbb/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := haswell
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/old/pnacl/make_defs.mk b/config/old/pnacl/make_defs.mk
index f82493f8b..28f5e2a2e 100644
--- a/config/old/pnacl/make_defs.mk
+++ b/config/old/pnacl/make_defs.mk
@@ -49,7 +49,7 @@ CC_VENDOR      := pnacl-clang
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
 CMISCFLAGS     := -std=gnu11 -I$(NACL_SDK_ROOT)/include
-CPICFLAGS      := 
+CPICFLAGS      := -fPIC
 CDBGFLAGS      := -g
 CWARNFLAGS     := -Wall -Wno-unused-function -Wfatal-errors
 COPTFLAGS      := -O3
diff --git a/config/penryn/make_defs.mk b/config/penryn/make_defs.mk
index a3474e9ce..d070b7f1a 100644
--- a/config/penryn/make_defs.mk
+++ b/config/penryn/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := penryn
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk
index ab42872fb..56b7d0fc5 100644
--- a/config/piledriver/make_defs.mk
+++ b/config/piledriver/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := piledriver
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/power/make_defs.mk b/config/power/make_defs.mk
index 2a366f1e2..8350a0a5c 100644
--- a/config/power/make_defs.mk
+++ b/config/power/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := power
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/power10/make_defs.mk b/config/power10/make_defs.mk
index 2c3f7cd7b..191a3e42a 100644
--- a/config/power10/make_defs.mk
+++ b/config/power10/make_defs.mk
@@ -48,7 +48,7 @@ THIS_CONFIG    := power10
 # may specify additional flags here as needed.
 CPPROCFLAGS    := 
 CMISCFLAGS     :=  
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk
index f80774e48..a732cfe9c 100644
--- a/config/power7/make_defs.mk
+++ b/config/power7/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := power7
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     := -mcpu=power7
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/power9/make_defs.mk b/config/power9/make_defs.mk
index 85fa592d8..9f604a607 100644
--- a/config/power9/make_defs.mk
+++ b/config/power9/make_defs.mk
@@ -48,7 +48,7 @@ THIS_CONFIG    := power9
 # may specify additional flags here as needed.
 CPPROCFLAGS    := 
 CMISCFLAGS     :=  
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk
index d3ceb3483..6047787cd 100644
--- a/config/sandybridge/make_defs.mk
+++ b/config/sandybridge/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := sandybridge
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/skx/make_defs.mk b/config/skx/make_defs.mk
index 00ae94a36..589e73dda 100644
--- a/config/skx/make_defs.mk
+++ b/config/skx/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := skx
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/steamroller/make_defs.mk b/config/steamroller/make_defs.mk
index 5220c3540..122472c85 100644
--- a/config/steamroller/make_defs.mk
+++ b/config/steamroller/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := steamroller
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk
index 7b5b532a3..d4e70d574 100644
--- a/config/template/make_defs.mk
+++ b/config/template/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := template
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/thunderx2/make_defs.mk b/config/thunderx2/make_defs.mk
index b43fea87c..fd7df2eee 100644
--- a/config/thunderx2/make_defs.mk
+++ b/config/thunderx2/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := thunderx2
 # may specify additional flags here as needed.
 CPPROCFLAGS    := -D_GNU_SOURCE
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/x86_64/make_defs.mk b/config/x86_64/make_defs.mk
index 6a05a1f8f..3c912370e 100644
--- a/config/x86_64/make_defs.mk
+++ b/config/x86_64/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := x86_64
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/zen/amd_config.mk b/config/zen/amd_config.mk
index def1cadba..b76bdebad 100644
--- a/config/zen/amd_config.mk
+++ b/config/zen/amd_config.mk
@@ -39,7 +39,7 @@
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/zen/make_defs.mk b/config/zen/make_defs.mk
index 8bdafd5ca..389a313b6 100644
--- a/config/zen/make_defs.mk
+++ b/config/zen/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := zen
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/zen2/make_defs.mk b/config/zen2/make_defs.mk
index c14b8cba0..1eebf7fa7 100644
--- a/config/zen2/make_defs.mk
+++ b/config/zen2/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := zen2
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/zen3/make_defs.mk b/config/zen3/make_defs.mk
index cfeca4f5d..88f39c3d1 100644
--- a/config/zen3/make_defs.mk
+++ b/config/zen3/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := zen3
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
diff --git a/config/zen3/make_defs.mk.old b/config/zen3/make_defs.mk.old
index e0794ab0c..9af3a90d4 100644
--- a/config/zen3/make_defs.mk.old
+++ b/config/zen3/make_defs.mk.old
@@ -50,7 +50,7 @@ THIS_CONFIG    := zen3
 # may specify additional flags here as needed.
 CPPROCFLAGS    :=
 CMISCFLAGS     :=
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)

From 17cd260cb504b2f3997c32daec77f4c828fbb32b Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 29 Mar 2023 21:47:12 -0500
Subject: [PATCH 142/230] Added mm_algorithm pptx files (bp and pb).

Details:
- Added two PowerPoint files that contain slides depicting the classic
  Goto algorithm for matrix multiplication as well as its sister
  "panel-block" algorithm. These files reside in docs/diagrams.
---
 docs/diagrams/mmbp_algorithm_color.pptx | Bin 0 -> 90217 bytes
 docs/diagrams/mmpb_algorithm_color.pptx | Bin 0 -> 85453 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 docs/diagrams/mmbp_algorithm_color.pptx
 create mode 100644 docs/diagrams/mmpb_algorithm_color.pptx

diff --git a/docs/diagrams/mmbp_algorithm_color.pptx b/docs/diagrams/mmbp_algorithm_color.pptx
new file mode 100644
index 0000000000000000000000000000000000000000..a436820f72bfe2aa37c0a03ff1fc96f06e74d23d
GIT binary patch
literal 90217
zcmeFYWmsHY(<MxR5Zv9}-Q6L$ySqzphv4q+7Tn$4-7Q#fg1gH!$$dZb&O38m-}ir}
zf6!O=*}c!Hs#U9M?Y)tc1OY_`0s{gE0s<lcD$q-kn)n0+Gz<*{gaiZ*tSMw`<7jN-
zsH^1m&DcSk*45eyKMxd`JO>CEF#i9&{x9~xc=DJ`A3eOtz1WM;E{(<76?HKM6fTtx
z;>9P@)JCwO#uM{gpDAJ*5+&d1xG*b1j=MDv?x;+=^BWrF9?*<r(XZu5jp{(0tfBLL
zAw0ZVjjhxd<c@?1Gw?MOuFgL<H;#CuXdb@jHAF9i&t!c&+d=rz8$9S&nB{9%!f(+^
z&_w8sAoQ#4^o}4pQgnK`SIYm^<A5ICFtBF;t0OCxTNAzZmlZ4hby$iNlW)6!@*umM
zH-)HLnGto2$q*T*I-%u6iD20H&_is!ckr}&i_qdVN~sxrbX$QVXDa_UGVG0zgz1)s
z;R+Dhy2;4i>_iE<2|av6F)3~taboJQf+I;Lsla)>_y^cQD#CVDT=U;<xicl{<vFiZ
zg&Y$m>?#Dy>tdv(;I&R24uemv>u9~ophyr^SOjX|stEX*gx%K%n|%ISg|qV96wN~=
zem}3HvS*c%A5~fq6`2MfX;WC<vH}X`^7QIy=S-Z3tU8YwPZv5&Er#d3K~iBchaH#s
z3-iGkov?--*Z2!%+Ct^to#GT%Qr>4vuAHmg4R}X)sauUpI6nm=q6XTlN49+X0qo|~
zS73lv>mdLv9?CX#PVnji0Py=e2$0<W!UARwj=7%!EU*noawq@`=-L}wIndF5{QZB~
z|9^4H{}<EC<5nd=7~q4>KW`F@d$Fxj63$xL3Lj<;KLHsswqdPfixtjxcI5uj*QfF8
z$e82Py_#~bbIErtO0d3wOj3sNYlj_t(&=${u5SJui_y`;;HvN~2ReQK%HUO)$R1<V
zKT;hTB^w2L{8DDL{EK}`8T?4H1X`RBt^Sd*@M{pyh_d$P@Qor!T4>=i=4W%uO8)v_
zHP-j9nw$pA!{kn4^&urOB#nDCc?0uERP+LTNsheEN`aWW+|x(x{M;vN;a|BXb37MH
zm&9Ie`Nz6I0xRp%&ipqxuDA{TGMqh`28J0)M~zR+o=fmrdCA~UQ`-p~elkZ~#*4|&
z);<0ovqtik=cEg8g$<B7_<;9s-yG>2tjvv!9q9fh(4P;CA2sw(Dg{LMK=#tZ3!nSF
z!{6B`@Pvpd(h-7Z3G?ZRH(FOq3$lj2dN`EhyJq0*Psn(=M*F+U8=$oZHI(zmu=Q|m
znm2FCl;MC+9@4<}#0Q0<!%o=C>za3K39hvb=gDzf9y&yM6fEB`6OO@;pcTQG_iGs)
zDG6V9N6i<OHd0PtOnWj6tXPPdwt?BD8<@XM!CD_f<M_Kp?OsPi4*1tO_E2y;^cWnG
zTgKRXAG9nFU-1?fs^~hT!~GwPH`F4k@A>~Up5eb5Zvm(t!smf6{>vW3T)_%q#=3&|
zF^YD)da;X*Av6#bh(}#jQLLcuf9#)orLFsGzlI9_aJpV59?RPE*d-LG=?&_^FY!JJ
zXl;)NGjFp&EkW*<KlZ2CM{P>fq?!pN5ya6=!q57(3^SL8?E4axC@OoE)*JJ;8HSZD
z#8ftd%~aJezfZyT{esT)ccb0CPKO+*t8?sQ`LG|80${(#K}E5#8=lf?@%q{w!2bW8
zF)5jNm$3nHR|W+F!UF>T^bz~NLH}lN>;Pz>^c~G@ZT@3DfJHw6dMLmT{_lR3C-g}4
z(*sz<?;L*LW==(0gmpa}+yh>C>C15A9BT?GL1>wmhfrAqm1M^*;=_B453fm3Ia_un
z@t1lTvjKcb1^i+OZHNVW*L5DPHDJ4}vM|Yt7=dV5)`7>BCk&GNtys%IJK`VYLpiq_
zYw|~<s9a}=)^dUA8mm#y_|`%&bepdJ%(8PR4cE}*QH&#v26{~kBr)2yf_v|EPTJv!
z=T46sxmRQoa%3Wb1Tm{%JpKZ$GZXRrkT=Zz(o*14_lUZlbV5{Q38^ihaYLZ$hoX(3
zu_D2)yOa5^r-S^gX`4G31B8o|q)BjC+^Y!!vs+q5%~2!Y;2$oH8d57pjA$kfC`kiq
z4&V5t7U4j7o}k0%0Fe0K0k%CRu%iel$EASA92qd;17QChS|>@XVXGhBg!1~5yBc@<
zSWP-9s0j?E2`+v2$;6t3foe=tVb&<SlSo~&jUvzsXj45c)#rz-4>AoNEe)9|sq$QC
z9GD9)iIkG|dWevCw#%`IOz14W#ji{~k`6`@Je6!dn&-S^DTZ7DLka<mTuAcY^2ePQ
z+{!rWP)C)=sol$+@ib=GqyDOr4}n)S_kx?U9jKTB+8r-qUQL^9D9;<pMX0NAgmHFl
zuufL)x0Q=N!q%<fUCBRwjW-sd4cxRMX%OJF_F7ZvyaN@adX{G=bfywWYtPb$)Uij;
zqOXkfj_cX*cPz|4%RVX{)!+)pIV#|!S!lkHWk~nVkn<+~?2J89m}QqX<dY;XsU(q(
z=MGcQoJPx=lWHkvmbKp;x@J?q7r;*UJ=Lh=$!OlF3Ew!PGA$!PW^Y;%yT5fHg7~t7
zPKr9E*_L7s$#<PIl!3adC|>jV0;Dxk(T|bKb)~bzrJu*#MiHGmwCZX>?yL1WI9vY_
zQKKM;wM^b@U&*qG2?}H)@ERCNl@z8>{=mh2jvS8dkyBp_4!T3FtyUb99-9{i>;dVR
zGs)A>oja=fw`3LSX4Z6nzaHuDJ@Ccz(m4u5uv!G}K#)Uy;9I8bd7<&+)UNx6x$L|%
z!=}}H;|Jp~$U|hfkD&6XKk9tPt48JQzQ;l2aF3tf(r@wyg@GhQeeb3J3NoPz&p8aZ
zRtu3s9ktK(^UV%}5>}2En9=d#lwNn%)TWr7{~n3oJZ>Vn-YF*6KzHDsm*KL6+bgZg
z8|424!uaW4>qcQ9ph0|4Ae4VV_)mrV^Y(|{PrO#`cLtO8o`uKx=pTN{Td&cD5xFQW
z)Rf5vOmRd}dhBC}$Rn;IkC4jOmf}4D_@39U(`a!Llo1I(24A39;(H^Gz9J$7;z-9*
zXSvtoSQigZxoPy+tA4|}*3`O>(|+BaNlk~k^;zXX*ZbXdx)Cqe?#e!?rY+mp@#E5-
z{+lh|+tmAI^7?|l7gJvuTX5NlyLa|MTN$1=JM~#I+{%`2Q`;VxPuuz#UM)7?*|~S~
zimA_bDcmt0_fV*}>#$0?+vU?MMb+lUMkVT4mkpXOZc@6Ij!~DEDF^T4mUjbJg6e9f
zeZ#u9Teoq485Q2|uWKvYb5B8UKFhC-y0pJ%_AmRq@6GRzbzUCvqPxya%dj$*JR3)F
zzfoV~h4CTFBJtT=FmpfT-d(j_Nvtf)oxW`ibs4EPH?7RwVh6t`zJ;S1tBw_UHpX;m
z9d6Q|y{}xX4ZO<2OuglLf_MVo77hrFNq-Ji-B=8My&JPZB|f9SKWOPh3iELEVw2iA
ziUf<r8p56YZo>wd!3+cZ2<<dCfi~^3=^UCA&wjGYfbCNwu$^prVfSq8P}iR7rA3?b
z#pRe}vQP0qfR5*>cI+4@pgUpH?<p<)JnyBG>zttN=PBG{@ZO!iC|u>iLmVsH&E-BZ
z+sY1Ia0c$$I&yF7`pb}~^K*4j8n4N5DpuRJmL2(k!)0+hamPiqc9o5{PQ}vpm)XO;
z=UeZfC7r3d0bq-rQ5TK#Gp0vNg1Y92X&-mi{!ZuITa!DoPE|g~{%~vCIKB}!H6OQ!
z=R46(A!JUOT#N@63f~9r=8Tty(}~NbTT$}77tx%R$0ZizbnkaNHz@6(Jhv-{hRyRW
z-Zu5orUDo1PE$L9)+-uVIc?32Xiglr*ehxXqWSCJ1!Jq4p6N#e1HLRi^p6I&ke8FI
zk=~bm7Io`v>noU>2A7jpk5}J!drm5^id<IHs~lUV9{4V7UYjPi+_PE*fKVm|-~`BT
zoNV9r>K>E(Y%aZ_N_wd79$P(AtWobZ_1*|KC0}*$u4HdIAJ}7rFY(&-8$12Q6=XGR
zY`iD#R=z%wPxl=^+F$SA%MQ@|5&>$tV%o`7vi5aXDTb0@q05~tqUr^G$rtv#+*y&s
zbrS#w{}k6^lRzp?ey&8}b{Om%%tlrY<+XX~2MmL<xKNc-T^-=wF0jLbFF9MOq`wL5
z%)bpxiP=pXIUjoSiED?oomi%C-WfjYxTOY2)d>kR4gGXK!Wozn;}mmdvT*C$PC?NV
z7XIP-ojI!5y(B))@7suOB$euKw^H5td`P4cJL-Ai_NHs}Hwsl3$Dqq7a>{<0SnjzX
z4XAntk22dXC9VU>v7$#xU-quG%@?8JgwR3VkP0H@oKL5`f<m?~vUO`;XPFg^f?(U4
zbgxdQ%Cx+}%prcm4WUAjC1{^o4w^eZ(4|PZLe`}(@S^FwoLt0o>C!Q!H1=kW_&^W8
z^_lkh>=~adJ!Rk|9ixaCc778&L7O_CYHPiXpIlr$wMl}AcDSq7IBBwdQ#)fDb2{bG
zh;Zt2L*BHBf2BP|8_{VZ2?mmYprdJt=^dkKIVQHDmToy9D^Ua^DRBH!-B<bLl~56v
zz2;aFDoL9{(c2MX%V-)YfyTBagwcObJt2)eGfjbMtAhWEoDfFA30AsMD;uKYX1IZ(
zYN_z|pnO2(4#zF{WIGfe%w<Htct<M~>pphtA-aOZ;4#ZSW)ls*S`cIUkOGIaakRwr
zoQbwDCP=yygy(0A#BQ<)P6a(sPpie~iB?^vFdW)mHxn-(hmD0Hi^ar%WE1VN`!A_t
zJhcV`=ZUNQtaUw>fegte>-R`(kO8u#bCte$P+yhBn$8~ti0ixy)!{6Y;j>H!%A2%=
zG5Y5wT50f7!=@@7-wydDcFmZM2faYekI0<rY&HUudk5}CWcb%Vr#rD8TP@0tiv2Wj
zObnkN)G{R!4}Kmxv^yI3d`mb}`JK{w%zrpHfMnGk^caSq)g`NeXDZ27CzM9je1A6S
zc>c<`kvo!yIEi8v7ql}Py(7u!F6Mji1mf(epUREa<zyjEmF_mXFu<F&enqSHB3NvJ
zhlEt~n@L#r2w)}Wi`S=#ZDJSk+cgr5t>g7}otf9CRXoDyI4Na_kk(`2<D9+sV^$m~
z1H}EeC_fi=uJEHSpd0u6Tx_cM&K!PT33+cUOht_8N;GO;iWg9zG3@NEI2)x2;ziL1
za0wHxCl*kOs-1D@C2Zx#=BEixj$zkJG~#LJ_QbL2?05=gbsxlJ2>BDp3LmvK5wO`3
ziEYC}gLD_~;m1v18YA+rgAVOt$2&-Q+eyLxD7oC1bC1<)TZ`h7IVx}k33Or+!X=s#
z)rT3{onUxu+Q}Z9mQfL?x2vkwaxy3QYK?C$fFTejs1=9G=a^4lZfqW#0)ucWZf@7d
ze>IQlF)eRYBe34*E)Dxl{V^Bnq*MD6G_a^HvAEvmE@7Bd0pY<U$!DF$w(j#N;l`Qj
ziaYPBar)^eKEELE17Vr(@5%4IeajVB1FusneHPMW5(TdJD{s}pJl68bHSExy;+|;a
z@VRgq@68-5lFTyvrimLld4(P2Mn6oiYMe<M3FGEujJ3Yxo~tHE-ISY4$6*?)AZP2G
zr9YE9L5$p!`lCGQU@oxNF$<SieiE>QpK%DBD)qdchUhJ>cjZ-3K7ueNkh&&84ol=;
zQwEFw-bbnJ@GGNywJ4sUM~#~|e_GnpJ+xat1kcDW+<8RXb<0~G9NEK;8c4(I#xS@Y
z<a^{3OMbOZQq+NLt;TwJ4BTj%BT(Mfv*=#crMS|kR`}{stPJ%mS?64ZU(wmkgF1NC
zFKLq%HSe?{SEmU|e&~(+Z*c~xob%anf=$tAJ5Qjb0?_m&k4$3vpK0q|$<ObheN6mS
zn$DFVpThYxjSg?v`;b+RB?}B1mtj{`D;zMu^mLN?=sz)qAz8R~4?WX<r|_`$Yq<a)
zZ_Go)Da$FTP7tWGGD6f{leCg@H_s6#@QOuYW-du%frodmJeFsd2}P2JTfATRVR$d}
zh*2&+bQ*`47@owxNPL45RxbFJmqc0EZ6q)E&71wck0#!h0HtHWYL3c@ul;=2`+G;R
zN7}>PXDOWwGZr3c_OucsiF&Lni7tzK53dDE$x_XucqmD=iGD!|x3Ij<uS-olhGv~z
z&*e`$n@>Ksc}>L43l}MQJ9ST+(UskwSXZPIds0NdXW}k+D#$;{RVNF>=5az+BN3k+
zLEEQrW~m|%z|3?qL;-npwQJ(oL$^QfBQ(yW7VQ9oe}A+NAL7~Z>z3ZmQ6ag2QH+a>
zY=ZV6+>sNiIQkWBI}LQdEodYnZ#JZJf2*Z}i`Ehf!UA6%tf4o+kWk5ws22td9KMvO
z`om7ME_koc`TdsTd)2P-$tUgI(*4~lk7y&`WXkxXY*}Z+3u%$BXoGA8ox0gAzzW<n
zhZuM~Qpqv|?5i8MsX6&9s;+#=lSF3w)A~*!@o&k;NFqYopXmZL-JNSWZj{5y6F?$j
ziqU(acDuhW<j|RO(&gFrV%7lNMuX5Mpo{1B3rUlTybAARM9xV7Q%>*@-xE>ey6JKu
zX|u~=M3NL*XE;;taDsbvd0l=bL3M)k<NuAQx(3IvB-<N>M}|J3eB9on1<yx;1D9|?
zmW*nqeE6IeZ0<CueFL1u8X&vfb}5<>0B_RF8FsFf)D?|)(LwgRfoUj3M^$z`kF_{Y
z0uB9oUf{S?%HY9{|9v*m=`1PoF{KnYn00@LQM2@+PUCQS%AZ#PI}SJPuB>bJkrq=%
z%Y0N2)GTFGQ<)RG3=~2;V0w;{MJ0Cv`K8}IKm9IU<3(7~_p(_@dedANPTrVO3a%!t
z9Hp}REctXh8iYfFGML~Q<mSY7)O4g@Y=;<g3M!914x<bR*FXbL1PKgcP+!ssiO3}|
zA9C63c)2sDIss>Bui}DE$Dm*i9cPFObC#H;H)ZUHX-a4d^!_hVDHavyle`jmrw+mG
zCW}FG9I;gLQBbH3N>54UN^j4=UihP)$cZI-$2st<sO7N(atUgt8j{`Nd^&<*?)xA}
zw<ks^yzNwY*u<P$TR|f1!O`5OxHp>&120mMFTDbXVfzJf->+hElawNin|UTOlg}+Z
zEgz7zmU$UAa6qO;Qx=%d7iRAc;NKER8<#8;rLHG?qnEuc(vg^hnVy}?;H)6hgA{yb
zLrg`zrJ$qo47Zk8Qv$OC@r2H3H=gBN`FEcjTeFeXl-FgYg>??S4H{Kusv{!>z>fnI
z^A(dneebebcGjMvDoQfHXX-J4$J`Zqqp<LJsr3<U%br-0i&At$oW;=Mww6$R;o+h7
zE;9+v-z?=0mwn{bTwgz^YkGdYn_783H33PsY|BRB$s}B)p(WJoD_3Wv^@g|2BU&(+
zNj%TRpAgzm+fWxKMxo80D!drg{&I^2oH9>?(;$x<7G26~Sd$U5h%ath>4^IqhPI*?
zR5q((k<Z#j*!JO;`)e5oO<<a$-NBYVuO=PC3Lf09X7(?RtZK|+x1wnXOW#hXK40e?
z+h`X^dA}bC{I`17W@6z8nfQX_^9rGPPksaet<3GNZg2!QKj+KJkvWCH&2jv}>H0-0
zwg%4X@dHFk5EhX67RV*iN;wr1Oqz}t&Nj`NKYx<+wPMNT6m4YheG*D__*R1~GWo^)
zD;sJ#W}TPa>F#&*0`?Tqo>|B7R6pY|#V`y3X<8sdC1adi;WdPGE)h27u(*AkN#p{P
zh*C%tLu85!6cbTuwbI#E`D(|~SE-&-J*Dk7aGj`L<iTQd{RBd2Oc8OWFQ#D%P@>8K
zVGI_d=ml|?k$werSEi6jzVgdlnNJb&_4}?r%|Q}>+8~o;?(ixmU^0}FDn-a64U${Y
zH0|X(PCiKWFfxqZnwMff%OedHU$Bqa{9q+8BTNp}I!w)*kC0c+HHb)K6jgWFhi}XB
zGd4hWKQ<UF9(k5`k8_I0Glp)!1<fxN96)DLiJ<(wA0KirXFRnBi6Din0>pxSnMFD&
z-=oZAZbJ=H7Rx>*+a5ghdy{Az=YR#&o9VK%eW1`0w0As-wWc5?!x+L5zc?96zs%jP
z*Hr<+(5kpPbhaB}$AaYx7lNS^VFYDv<95z;8hui@JZ-sLp8+oEMnPtVV{ghG)yUP<
zAiwEyfSq(PdOqog-Tj1WLc|d~36o5hw4RURhDt`#=t9%(t-FyBJ85zZqS6@o<q(D;
z+Oj=)L{wC-sjhU_Fmr?Cmaz?5$csZHw*BO!8&ouLbEwpbq&D``M43#y@uBn#rK7P&
zj&uk4q>0gCQFSL^m$<i+RcwYz%EimkX|;I96RAQcU>EtN*c68I!pFBYpJ(D&LxLt!
z=YAOFr};Bjl%qSLUdrSTQsSK3Ul(3Xej(ZrS_(Jd*ivzE>f+w!;gosDML1`#BDhEo
zy=R0s+3pB+jjn(&1V7pn9PWP}QUD{vVH#QtU;wQsB4+{PWXUxkGboqohv?@5`=r^f
z_+b!Z90N;K0!U^k7-tQOJ~f#PToQdH1d8E&0>fw;eb6$Due?+{nn?ZJ-G>WW!VLH$
zO2PlVJosaKjG=Tewib^?nGY~L>Wd7zY8eN5eqn?>rSzI``;foXUa|d;2zkYU;vMQX
zf%c(}>*L5r@-E6hz$)%fV~P$QPF-SG7%UDoK$bqTsaguoxpf>#X9x?uUvTI7#5hR2
zL4$>cn8?Pq7|Fjak(#G|vR%U=+JQa2!6<~2?<Z)%n2T@}6+Yz1GSM)k=BW*f8AZqz
zr_$&(ebH%H5LJV4V5UY&2eVV4U;i1m?lE|Iv%0OeByJl<N_keO{kb^q08DKhgsee2
zd5t`8VX~e)!s)jO2a<75uD=hsBM{{%1w0?iITd^E$&TjKiq~FxI+94@OOV37%W9E&
zikWu+bICiRdU>EU;!$PbWWcAw4JcEs)gmBSu~-BA%&YwQWUBn<)uM?z(`IDiNKicc
z2JyEW8;|S6Z@-z|qT3!FvQH}y(rJ=d#-FGa=hHKU<r+MmNrqNYEk$=drtuboUliY*
zI*`HCe-YmKn5ZtlYp%EbY`0Qrr+mNEtrV}Z&z0NWb1yO<Q~p7@{rU}buOfUd$BjGO
z{#H~|kw-eUiM_j{Z0Gl6xC#AIej>Y=S`s@|oR|hFH#>8^bcRu$a8eRZQPHW$bo!b3
zmvP;?zK(#(z9o=Xx$kH+<~_miSbX6H3>rbOHzHwlPWdkjqQq^FLJtlPlWAL3u@rN3
zoJdbMP<sc1rulH{<=U~4&v0cE&U8tN<gQ6gd~1*a{Srp}Kk(zd&}+k{fv~mkQ38Om
z`eQZ_@`e?JE-nZQ2KPHC8hdXxn8jy?Jg#}@LdQfjks(4o^OB<3Lp^I911M{bJ?v%O
zeI)y0r=M?ifW@6K#!`FLyzJo6JzzF8n^qcUnduvoiRfpc2oS-o8WYznk39;<kJ0X8
z?mta5CdP=nBQXZ81WiBagyp1;Hy(q<wwDGOJKxFZFuz;>7&K^99H>h%(VV058h1Vn
z(ARGuz>$~RoU}<hC67>0WskRN<oN9u;DYV}Kk~ZC8hq?_1XkaFTlbLrFs9$++NG_C
z%rVFCJht2^l|sH_C6t__SCDy@FPVik$iM=RuncL82q6#CSd-c#9QXCEzJ{@eF;XiP
zxu%dYWtvVgk#<yXu!d}1r#KpolaQ@6F-$S#dZkH9|0i|=sS0^V1d5=}r*ezSR~I+(
z3^hFsu_uC--@y@cLh|)7zjKvB2|m@BDp7&M734{=zztj@8C^m~CGsHes|70ZeKJ<4
zPb-m1lRn-#%QjBs=A#9(t8SY*dmCQ&5@Aal{h>a{JBr*YBX|QZXi9gr(_Mi=sdO!e
z&FOFnXPi!|mx7E^CUiiFz$JBfGGHDvddY&#X`0^l&&moNnWGA5UT8@g+lhb4#a&I*
z=d5Vp1xzx!48#|0C8gS?sf{<Kb}x}~z>OZ~c}I!cz3?<B6Cs>l8)`Y`gxqXw`FyT~
zMOL21`Zk-`X3@ZZsX)B}7uf3JzPWi&`QRjWiLT{BP78Aw7WkQTq`pEd_J-TRY)4c1
zC{@KL&qFs~Qfp;VK18%m6{hh#)>tl4Ny_y<Z<uEErinp9<t+)<YKy5Y0e_~C8TK>J
zF3bzE(Qw@THa#J5U+jN2Y>r<p-XoP%@s*QZTwJ^Y)<C}gHy||r>@Z&#?2+{=aD->8
zB@@Jrs$(#Sv?Y4UV}*2==@-HhRz{BJu5Rq9N~WnCPj+Mdw96$Ku}-+`0w$51uW*w}
zFH%7YYZJd&w9a=w#`|gKNhW-j8u>-hr*yt{l|L*n7*PBDWY_LapK&D^Y?9BOcITyl
zal|`JbWF+UO)WJ>a92r39fi+5i_sqWK=&#kR?+x{?ia-l{Wpc?SbBb5)cR&~>oC~l
zhfd^=A-dM0lgfinYs?9F28`|Ff{$XGtpbsSj)rW70=K(4dT#RIrk`yuaTHoR!*jdG
zqvi?SAGL1|e|W-V_MI+V#m@0R#(QgTp9Jf`&$z=2DyWc1yGm@w?d+EfPQV24{azv|
zMJ_lDN_^BRRtVYPY0a2Cs8#;CI{=RBd{eY@*anlqQxHrmUjXVr7e0&8C?Sdgt66ZQ
z*Bol>uueLP`{Wc#*%Fm4`Xm2ji)BK|5Y9FaH*2+jS*VA0mSp@Lm1#sRSuL5~QeiS>
zUXg2|d$weyfOHTS(m9s)aXCHc_kmS%bAPHuw{9dY3&Y)+aYP4}0eRgNFYYM9GHnN`
zwgPs356y_LgMqtUb|Vg*2rhlXj-ekrOh+~AWv|qZzG^TY<hmEGy|^gZPz6`%*P@@T
z`JC<-=1OByn#ZsXieo{L9L>z^-1~)?!84l86iuP~i5M3frQmg_`cxU;5d8Zg=;_n6
zCL1F-C18bJ@3~9Ow3GT;`&;X$co$Jc#Hh{HPxR&rNVCu&O@cJ@iuLZtoG*ubH$;D<
zPn_}@b}C8_czz*b(NG@$tWJ#;qBbzkp>%3>4I4T*fMmQcFENq|Qc^}|;^5TSVzKSB
zPWk?fO_@ZfV2EPLh`L>$ik1aSCl#M{3=&VoQ7}@i0<IL)U~pI(j?@BMwjystHK<*J
z^jqQ<S-0Gxe9sZ2Ko4y}?YV|b^N79xTXRb|OXwJRNv$gd)3Pe`IN%$%Y<Em+NL^@S
zk8?0_X?S2NE*7rEy-T*T;*REfmCj9)P28;Z>h00KX*nV>pCDk_GN*lHDsE^1p|1tO
zf2rSB@KOo~DS9fTlVa&4SY(yaIafr@B;3+0LZm?;AP!_@gx<4uWxg=GpYQZ%e|*-N
zdS=j=U971cG7kTcloKX7&=4=s(g5*T-YmXz?@v(jfL#QbFNvpuYVt&HZ#ec^<%Z>k
zE2*5$O`BeVVX3v9c!#-Kx$O@_L~%1^Sk%1tN~+g?*W?9mC1|L0?2MWss6vnDMC!`y
zi)?F9n+sLtF(IQBHR*mLODjW2RE4cwZD$3+@cLF%S&&xBneA`C6#*9hyA{)_^bbu2
zE)_@A>f|`#Sw8LW_DLQp{`k2NJKd`9b|ZbTAuO{qMn0v}X%zGF^_ate$JH(jb1UfE
zs2%XG<&MhoN~3bdukZM!o82qk`m3{qKf4^QHp;ZSrt~)y`AC8_z)7LH{DcUC;`a)5
zGA`IIo2EZxRD>{p+hgp7CD!;boP#oa#YN;VZK>}muduzkA94y@clgMgvarFCk7evh
zfU{hXZoFRn(!=Q$^`7a{?}wSTY{qhltZ(;jFsNtzH$Ls9>s{jP6N$ELbmwS|uDnii
zSkK=Wb4(KVb&r^%uhg=6-&(U-3#i_*aB*9<ccCTA9w;_V%i5kj@0`0f-#9$F)~%Al
zJ==m^D!ItQ!?AIq?JoUY<B68mhWJNf7*&R6!O*EwhhtRQ_5)SVIL)cUOXq2fRoh+(
z-mBMm#}9jMU*tTMZZQgGjiS4-?LC>>t37@1z=-g&+N;k^G*nWF#*{8jD5q48vsu~d
zUXE?tc&6^DsCCuo4w$W9F6gr>lCduNJ8vP7`emmtO~%yLdU_G#y^l*_c*I_v+eY|k
zYr-%I)#fL6Vl}v`aIJ7*%o*|op~cF>0m91^0aRHGM@O!`Js95I`be46#1wqXy+Iw)
z*PXP+AGwbF7KLPhmqLk2M0`%pG;H?7x#Q=nhxw1OA41B-QL!{p<UV8qo1x)Hr_PRs
zJ*Bx<1Bu2U1_^mun4cvb=u9C|F$xtj6vzTAs9_46KcVO2GEo~;N%VzGWaeQ3wA3{$
zKua-+DhGx!NVxhT-R3Jfcl<|sA#)ldv?EV)8zw9mM55;->rVsp9|yZ)f(CslA)xP7
z93)SSutlcMOypCnhK{((gohc9kRPUy;j+$ASdGm4;jjjnok(=yAEh?i4^V0jg8-!#
zDt|}<gOML{pTaCEPoYXPX0xB~ez26zQ2I|l-JQl1ysVwYbkCp<ErmMdt~f+)If?DJ
zsF;xUcZ}gcah#1xe{-ZOoYGx8{su@UJzkWnZ2m>@uhLZJ_pE-8Zy9eqgd27`Jc*Mx
z45_YPsJ!g$Ca8!vKrDOF{oGsN880i1zWwx#V$p{dIHr(@lo6ILUJgYsL(*4*Xh8Y$
z86Y>DwQi89`{WFKk&w=3Wn{2Elmobith~iwap`~5lsXjkpx%uvx(9it>9#iVs4Ql_
zT$z<P`dHQrZ{ED0Jj#cbq!-O4SR_=dU1^Bc!|+y$B--1~2}S*)57{7{0QTRs#@N<D
zvVl?}k#R&Cr6`W+3e2|l<j6UZM5K3uE(+yfF@<3Qiz%&U)-!n0@2|=n0L4*a0Qd$M
zHl3+T*@0d(5+#3<aKqd!qMVN^#Knq(sxHGH;T+w3pY(QtnJ(HDE_$x(Yd_ay8F$uO
z3+I{ZCi!eU*G75}UBHLJy0s@LxBGD-EHxh*@~^qjjuwIgGW}|q;KJnEAO>dD&K|!x
z`zwp|FnnQ=&PQcr(&ocHL={Da?D$#)KxY|Z?^7XA0~~F@1}dt|u5g!Mc=S~rX;A*$
z?*#QjcRkB1e(10rvk*c{(hc3vmO9did|#wNsrD{hK8b?R^pJb8g9tjYFcpr!KmOwf
zo})}%+nf9p*Q$t^EN8?XfanVPkLXgenn*R|QJkt50N}ZdgMAUpvfLE$JV~3YW;ETW
zAKi(8-V;^eKf=qn^N;Xazn&w{8?+mja{+NDSI~41YCRwmhaQtWj?AOKB*AWs#@yJ+
zUolZzWySF7L_+CocHL9ApCPG&sf#ds2OD!)^jXEOEUIhfQW^Ixk!tO2PiE97;uo{D
zP*3Gv?Irh}yI+B+I?NglJ5#G6DP%N{`w`9fxO|UFSc)_nU@mV%h;3(`x=nnAq()mS
zl@-0WPPjFaG$NnQlNir)I-#amClyA=m?%Ccj7Fu1*-XKj->qz9{4<@4IXq6VQ3}ca
z8u{y;yj7li(-f+tuyh(WmwKkjGYzUpOTAkleo}qWa@AI_c}PMOy|S$z`dn}6D2WQ4
zq)_1NLCr}+BRb{Nk8S^-z!X1s6`>N4H;xw+Z!$&|maFOEwwLO2Rjj(ZNy|{n^o0!`
z`rCQMs^ETV@5!Dwvg!T~;oEsv6M4Z|oL6tU9t>M7ljz#N28($kmM|QJ?}jnlY6|3`
zZx#&Xr-&bq;OyZA+fto3RDNJNaIL)HTBa9c7vI!@c(nn#P$lp|^dQ75!l43rmW5P6
zf8MOJlu4aWM{Bq;g5It=oGpE@%({`fg!3o*{WT=Z9X0|w@RtnwP%T76fzJwI{~Y>f
zPO#sq88*sKEU6x~H&|xph|!XgBH|lFe6Ut#=Y_sjPlmGDnVGNDnw$DX=Y*SRHCy}p
zOIF!l!t>Q&i7q_4HXAQ*p>JSzE1{81#;yVcU%VCHT&~6PB>Aw?ZIDE`Rrj=+CdtM;
z5Dr+`xw3>j#AA<@w{^z|Qzv1=`2xpIj{8)DnqdjvT<XQvS)qC5LEUEJVJp51LWs1K
zlkr4W3*wXzgLg(F%MdHF|D3XWCAMGpivZ6hvWJb}fayF%bccDq6an=&NP#25@Sm4;
zS~7Yg=3rx5CE{icWqW8Cy%Q!3i<Qf-B>X<)>R(GPAQ)Ld?b|pv=FCf^8(Edd{TS~%
zL|;$dbxn9;Wy^Q59PP!PCHxk0U5TgMQrWS5-g!<g`Q7R@B2UZHWL86?@2t&^Jpa+5
zsVy8|7yls_=*Ymu?Gv~?J}uLae2D@QX=F0tcS4H$%;B%uV3I9jf{L&jK&;5ckdhP5
zHvtgvo1Es=lb_GFz-*e*wb0+~#NCkY!DqU}M*aKkE(dB8>Jp}EC$_!Y*tT<S3EmTY
zwCMu(0$tcrn#``%m??^(DC8yVV-pNT;7f_rM@e#r)!tqUwBmh&67F-}q`J5-w__y8
zG(iaN#zgQ3aL1Z@?I*4<6gK>Kn0{dE*Bx_Vdv$I+ttG1r`~F-C!j^=ioJz1~yW`O{
zaY&EXh}etR+nlFc`jP3omn`qBqq*Vqm@tL9jCeL^BXxV6A#Sxz$v)&q{Jx^3oytXa
z_9@kv%QBYosM3gr+5FyDXKi{x^+iWJJMQMsfr}m>S32o+;v^ho?`@7wGFq!zRNOV}
z6x6s^d?0Na>aZS-2j0>2FyQEE2c9K5TK!toNHqC!5jWJhuZO&moWcG(6gyYpGWoYE
z$nfmuQVIUt&|mH)Y^|_^(4I>qH9;sI?E2`%p`Pa0BL*$&Genkh`qd;w`z|8bXAJE3
zFQ8|nU%G(2Oh`13*I6=UBc9Lov=+-I6tws}UiM;ki0vRrD1?S6)lnVCQR+Tt1sijG
z>Pi!8ppfMbA!t5DVi!M=h~!eNmbx%kOd_?Tta~H*o@lQS<3cSF23yitCRTAY^ODY|
z?h1+D=tp{+V;pXgL3!5%kxwjmm(^`PKn_=#xfZ6c#th;EDAhr6@r7axi76TFG0Gi&
zIy&$gR|9_*NLDW0QF`_K3c!~D3%%pV{OAiLG7<7SfVRbgH7u@>Z3?--hHun`TlOxE
zL6oN+MI;)~O2jUctEI_vD^~w}DW<hEh`^*1rD&{_PhhroDJ0MQE^oK{Ao6Q-H=_=Z
z#fDD;!3fYeq}DQhE<?%Z><?wIXh$zlD;AZXb({w**G9?<d{;8z@ffMmX;h<TYu)|U
z?pF|!4R{*p=~TakAbIWWe!xntykKx6OT%LX2Vi?PU*=m6Mm65TED+aEH8_@W)E>eb
zEOyh4KTi%=;mdII(j^aFod!-6tt3)FoM<VA_2t}sjtEjLPCbs(@E~=jYa6Xwcq*Eh
zpnh-F)S0?|m4I%~RJ2_Dvj9IM9sVT_c&kOYbwYTKjpVj+6!frXe}){Fo->3cRxyD9
zeQm<H-{VlUgesj(UI^Z`*0NL`oB{s&GyJ&{^Gs_l{h(4oR_O}PVT$o*Kq;OGg?Ff>
z_FLfcvw?DK?JohjPIdKZ>G!aTOl62UU7;@xr`QX)EJhl1PY_dN1T$Cy_?5m?G6Ow-
z5c5z^A!s6+vCttSb?NvnzcmHWFW4E*+ksYW&tinGcmETv0dmXCRE8}>*N7uM35@)~
zNcqK5Cd1-EfCKlxKH_|MP$!~xF%=`83b0&n!w0yfaRertsCwCC+=jBL_5Yk^lG|-C
z!wpSw6rIH;qE;o7#Q$Kt?QhVWGMK86&TF)omWqrkCJqCo+Oa@z(mBzK7X3VMFDnt@
zNiHirR%wYg^5^qet&dd6wsHE7Dd_IvLR~O}eojV<PA!)#3?$SdF<6iRQ>|F}vA7ng
zB(v8bF=&tx)((!_-|OPt=W=1p4J0s%MgrP31m=DlJ%zGAo(Pv;Ou^7>@CZ|26Ln%J
z06=uGcnAC^{zm@C^Vc{evonZBqUIY|7OBj)O`d@Mb%bc7d`-g<w87HcLxX`*@W&Xm
z{BkD~`1-65<Y`L*mb{x8BuKol!poh&9>&9>SR;-Q$p(urKYFsLnZ+pp{{9XcO=C(n
zJbfbOT>U3D;qpTq*MNP2V^$AB<uZPT%gO$>f*oy)$vD@g#w*5co={IVL1fC%dvc%(
zm>KVpeh)K1!1FtbES$e^0w(1`$af|ze*4jp2l6-MU6L7q4z>cxH}i;9{s@Q3#Zc*=
zI4DH%e*lAxUeqZR$#CA4VHA^T%n8NIVBaR<$l!0lhY;!o+JpUvCjaEipJ)em9FNO%
z7sD1NFj%ai8&_~bd8lSK47jgdZp3b2oz}0j*Dt1i3~FWg3kVib5ddyN*1KzT0CB-%
z(clYXd^s(V{^**I28(I`ndoD~z#UR09q%t!d}Q1s!pE*cMiV45(m77n<MElRs`xCx
z5wA^X2<N^#!OY#c_!}MgsClmdG;9K%zQc1Wp8m{0VkH#40clH<$P92-8R37?P(3*Q
zCqy`hjvukkA1v<16!?2n-JgHcoZNE!Pv^gm4#3nOyhIV1`Mbh_MQ8qlSK|sMDEFmQ
z03S)UPmNGiS2|$QX<`-!NGN_X%&DUSaf)ZW<kgl3q=mqA)<iVGkt@-dO?V%Ea{uEe
zfG54@U0(s(h5Jz3bn*1-KfwF=0-gIW-{cRF*N-oT`$vV`=eX>*PaIwrKUU(OI{_N&
z7P?TE(<xoic0+TIU3^HIuM=ZOP03R29*Zd}$AIv1A^eL1?gAs}5%du4DezX9YrxZI
zhRkpxWU-h8<=g_Y^*<qA1jqcZ<nWOae<u0|Cl(P`(H}0j|JMce=zE&S<M?Jg6YCJ@
z1O3kl0M~0cfz=#Ss9Adh0u@iP;bobL@v$xA^k)w5`DTE%3m6~bpg~{h4+eAqJ1R`~
z*$k!||E48}gU>~V+n<C(%a3x{|JvWS$>p$Na+Bg}y8TL*B!4YIF1f8O`OCP!ExY+!
zGx8;F3ow-~<K>x#p-Gd=_}ZVSo(M<$4E<K|k>kJki~fQB7GT7G16yPcfiutNvI)A=
zfAi8L$@|x}6zspir&Kei)N^#&WmIA-*>ST^*K=6z{fF*aKCu1zAKX@OI(#I+<KL2^
z5pe}z0-JNrx_>+jAd9RIvS9mGzVt7~{;Jl0^8-Mb4*MIwIIjOGdN>E;(IGBXA8}_e
z-Odbk!TN^=9bl6Y&h=B=5frKiqVA^=^42qYAc4QYTp9hSD;s#_v$ZvqU2>qiQ$*{t
zIE3sRA)!6<aUV5>;vR6CMaNMBv;%onpZu>K+k74_>G)16&+gh}+jQ^ww!u+l(`C8b
ztKFy(9k!d^U0<B|Gy!L3^XE~SzM7w>w5*2GpWCc_v;FH`D+K0HZQ1ks{|Tz7|6ic`
z{XsqefJ`Q|iDv>>_Rux?lW6vTCC7$IXurj|H9&5yUGkcL6bgM%J=q?MfnwOd{eb%M
zc*cg4$)D`BFK0VblO^#=heXH8{DfQlz2+ZEZ_IEz;WnIX|5J@vL|Z;SLuWCae?)r5
z09nI?>+6Kj8kqAqpkQ|Ndx&!*^-!amiF2U^xF<Tx#57U%D=J)s#u9Xd_}$OWOB;yi
zd<n&SLS)gU|D3JH<msrulUUES`+HkzB@?}lD|Y%UQ2MUAJ&P@XoA?7$G&FN0GAA+-
z3z=dut~H?w=g_)<;$jEbHQX5_*X)c2RG<G;1Dd|%o2iSBG%@{K8uy;#-X1$y`Ti7Y
zQ9v$2>(27^xd4kgeFIDYV5}g|Y+59Vps5?X^Jk^tNwP)th!KZLgOhEkKLwid_i92k
zrZ=kLzgeb&lFofVJxDwGm#rsK;rPP;brhh|{5gn_`a9(wz_KK8DSkd3$S-F{a?BoO
zZ?+4+Gxe%<1hzc2Xs@gsnsg=+4Zr1kLgb~b)2$BRec4^_Ii!2LRMWAzemK^7v(NW-
zfw$f_K6Z0%plMz1z?Z7Y5xVKg59OAy@Vd}P=VF#gbzfeehBAU9gAP8?pQ?0z7JAcf
z@k*KTKYql`RzsdP{a9*yLVlw70Umtn?sUx*BluF?tjkNiS#@7GWaY*4?n(Caa1l0I
zx0t??J#UB?T+6s=ANW$T1=+mePBbZo3p~mD-l{YEMa`C%)^<im_VuN)*5^|Hye_@q
zB?qwO8FHQzt+C<%Ag&dY<weyar?c9it*KM!%G=@S<`FKWgm)5<Me)BJ>Fs#tq?0{T
z;nl^B;Z9js{=2zJdynxx1nzhHv0uhPiVIroIR-e>i#~VJ(D9f&wXX}?Ht<Z-sp_uK
zNsDKaKJe=7!=t<F!71no5&VMGQT6ai<CAX>mdSBj<mHdOzryoJG;m~d79E?d_q&qk
z7gq%4oR@J|^2D;b#?Ss>WeqfgVq;?#nym|zko2{=g(~~5p^a3Bmx%hyjxZrcC21%$
z3igyD?3&a`>D=&nHE&lJ_otCBUP@uCoyl}?%*TSoq&J-++mUP($6>qtP*~k1%pDNl
zS~2f6fwkZZ5{_^nH}M?;Y!s?7nxQ?J_bU5w!74?6)fPwLx|o+nRkGnKIBnBIGhCG{
zu<0%6r^6k3%Cl}3a+Xg2l*2KMI>;&A`5`}EO2m-8ZhV!g+umlSh{NJNpoF785IKxA
z=p7P*Y`pbJ%P?(x5jnPYLBEWNv@E}vS<&exA(fyaDmpe`7^&t+jy1Gm@~|Q*4_10%
zMw`{YR0fd)w+xocifGjKCpN1{R9b2Nb4ZX<X19w!cnu1dm6g+=Ak4DT*VvxIBeOdf
zxrI<9K9i5x5z6L-wff3{u^3&`fKtu*zWWcW1XoNVmq+F0NtflDYQva2$<1yBmUCJ;
zks3Iql|odz>~NIJAb*BUTCZK$f?hQn&7rP|iWY9o;4;w2JSZUzSzC@&b6P%%7(7b2
zwL<P#P7qr{DL*^AUTO1c<nN(%9INNwwo^4ev+5FosW>K9vLS=w8xz!KlUf5!g1rPB
z4a>!whjQ`{xKkhDY@`SB4|8D-`*iHodLutThC=e4W}#JyO|xvUVgW&6pn`mx*%Ca|
zJ-4Z95&mhfD!SBU0#lH36lE_X*nmMrvE~fobHhYpGa2%n_pyYQcAcjN$(aj9K|^R)
zDa-@RSkH!aHvZGX=ywd|{aIe-5_oM1etSi+4m`cFG`h&3<G|%2(t*-7!5|EKoiHUf
z@s0z*Baxt0!dsk$-|5!!?GzRv#yFPhD&-Qn?`x)jJ<%<`E~DoH^%ZI*<NG_wsBq-O
z^IBI8@?QDa>n!h%ss*Mg*mozM;JYbPD$c5v35)EVjf?9o0-r7&lj6`M#`ejpg}(<;
zKd~(VNJqIbYsz`W#G^Yp>Y&|$@Jssfd>8xRI;HN~dj?PWA}G<rz&7tHUQ&04pe|al
zFJ0(4@V!`5+Iuqr_vmNJk#xR2P+rsEJ@<wlnhsx}=@F?Tsg0${9|e;gv}j3+2(zr1
zThsScyLkn1n_zsUoq~O`4Xt^6Fa7+;_OZFtz+S?nig3k7H~KdiY=lft3ECoFGhbMh
zz@{<1N<UM+&J0QNL<|JFh+H7yUn7HS8NTE;8!&^(xbgYSQ@@J8We-fs_7EApv>^K^
zWkwHXTO>|>e%NuhLl?gva`2y*UU|o`yBT;O^4DS_H2yQHCIK+2M9JsH4ryv20(a}(
zH8wiKl;+t#zyKScWOOB)!N>=9`JKX4lsr4;@<8Eb2E`QRtU$xT8SEZ0O#F-T(Et<>
zA0sSOJTcITQr&cJq0aQ~0p!zsoWIGwh_q*{BKfE}(0DsDG($}RuhWE@$U~X3AsGZ8
zNu=cDKqPjur~&@zcL@eQnLz&IdXVt8luQK)8l*^}$&;GUw$kg_PLqbwvaYJo2Uds>
zapK&sNF>eul!G1u)GB3C(@Po1+53y_QWJE_`=aK&6LI<;#!{FId&mQNC=LoVU${mx
zke#SwzZz4cn2ttjy{LdW4V#mhYe`aKEPmcI(5JNDIFVi;9ks)OdWpp)+M?bkku%mB
zZRaNnXhy3;c}_=Tb40Xqg5M_rD!~@*{brs$D!C5t3sS_`9Am7d^0k1GH_OshHLp8#
zf{wCVg^m0PxNX(Q`;#OoSf;*rx&np(BH4geY4;_OZl&-nyYOQ2Xzwr#tTrD)F0rOA
z;GZX;jT+Q9cNl5a&vG!oK5ObueTi0w=75!}-%DN?(GndEME*8PDyK0f+#4ti+z__}
z*<utt0RI5ef61@ZUHn8}T0(}dVC}f_c)&Spw-j4@?L+ZW%vl+2awdIOM-I-}_gF-&
z$#aj|%*-UT4gZv)J)&fR&8TQwgZE6xG&IS_^*(j>Qbw7#x%qNu+?6_V$|WvGL&7Qa
zf@Pq~XRlCnxPOs>@3?~*dF~zxa#knSOHza6DBPuM@2PSre~?g*z>{$E>#Xn_Np1Q&
zzry96ex~+Rji`PQDho$)&J#B1+n~cJ?Z}Y>JgyV(m(q%CB>Y1Po9P$sZj4uKRYA~6
z6b&6uPi{(0k%5jArkYBT@`{cu+s1Sf_l?HKcb&ca?6W$Yr&=SX{q;}JI{j3zb?d0t
zQCN!ekMV1D_A{fQZT^qEq3rZe(iJSqRq-XAInBqDs=tWmrB7>La66nHc~s)c@4{~t
zJa?3;FIJN@4ouN|6}4PQIbA>BjrtcP&!q2_-70XSjmd5#$i9{1jhSwue37Y&Nw=Ma
zhs}Se2b?2z*P+391cM^ZD%)&4XDc;_)0`Sz_r8Z{e>ns67rwZoIx6rOl<UQmMZ70X
zy3TdP(=3VuR8bd;&2i10RwvX;6A76|^1I-)AX`z%gZ#M~&GEoyioXa+FudrTGV<cR
z?A~Qt5=LFLiOIshLj>efJ%^feP0GqXTKTGM?^}}V7M!dlyI4zdl^|kDKPiowJ{4tB
zHf?=Qnb5LaVYBUpBAlX<lo&;#@qOrD*PzSyNR6>JkZ$cV3Fk9m@y_q?`ik2vF^v^p
zTH(5S3{DL_h0JJMrW4Fa(Za6ELh5w=e*XTl>2Tf!2Gcu-rOUVT%Zz^3p%@q_Q1)VM
zl4`erG|BzSt$0*4z*k4<MDbGER3cWl{tC_}rr7GGQAZ3xx^z9Jmju_qZxqxk=Dl(S
z0|yKnjdjt+M=Pwrww$Mv;?)E4n=r9KxMK_$MSWR>qoc@<NNf}yJNQH{C-QIp2EV7P
zHRb{qJ>wt#>YKeiY>l2^S;qvjREo9ei%5B(81kwlv*zuCkFheYS{tO-LX>T5e6uBM
zU}wWzkdG1Rzr+qcN1JVn1tU7~b(Z;_n)ht<6J=^w38f~~`BWD1m85rZ&!!5tPfg1~
zLj^$Ph4~y$=Xf=<9w-fVqxNTH%~{|ON;+g&5a4sO#`<<OKLd$$#imd|@GY=)NWp&k
z%EA{f)SX^M*d!Pbg#oMqj{DPLW_lh~CAc1kL7POMG#K7gwSTytts%F4>Qk(;%A%f?
zg0%vc>Y|dl=2_~SD9>DODV5?&;fWP4K44pkr!Z#h<_jr|hq1GeW+HQ5V%nS}^&;Yf
z;y26Vl@}X0nE}?S2}Q*L3yp$OCyQ4kbKXMu2LwmdER9`GhDD1o#~g(cVsS#(uyO?=
zbeG9k1EC)ecn!peI}yb*l&{kH{8UPxH84#bLs2_&_$K>1lXDOoP)ugQ35fc}$<t<z
zD|9o;M1iyQ2f-6hL&>`Nc`liiJ`*obbPp6m^<uDvRd?m9eEQwSkmScO9RX~1M9@3H
z&s5pH<!5wX29p?doF+%hAf>SC;5CmiOlU`UWtR4NNTT+@Bw!2yY$%Ch*Qhix@Nmuw
z*OkO%MY?l$?5JOENoPHDMnorhK!VqT(3fR46v0w$ITYg@mQ!Ywnb>%;=v=q#tABtM
zj@fsfK38V5=4dFynMeaVVLc`3H+vNm?lxKus=4n>8HpuE#++13p+~Vc(()~|kN7?7
zI!Y~Pj{=zbnMfoi)rM?Q$Ddcpk*-GN!8_54jK39lIPQ>38>areL`t*ONZ>xiUEjCN
z&~3rRi4N-Pf%b3l-eZ+3*WIMI+UTFBT+;m%>17vn<TMzoz}U=Lhn>XFByV|1B4<tx
zF2S=zG02!IRx0LWGmX{fO;tI`oPkxxS+_bNFJe)Up0BE=v%5d-*Bs-h+RTAa$hR<9
z=??~HPMm-(GJCX-oG&J|$Gny?XnFI^A?i3x4uTnZFs4amLo`0!a1FGLqs)qDs2*@y
z@%yDp@u}BAPG_uzXSff6e4BD-^E-!!T1B9hK)_kutA6N;?7(!zfIh30UCWAXk9v8~
z;nfQ@Y8wYRFQY_Mv1-fNKD4rZ0FS}RzLef9a~lVZO;iCypoHF$JBw0eWHgR=CzTg=
zd5IKHRbwSUtS#5qnd9vY`>>vdJ$*gU%tjEJv7?Au6JX&df^S7#`@U$}gL2nP4@d;%
z{V*o<j2>43y9C-Fze}XialDMCgG<?Zx|O=rRQy|6Q|>e-dkryD%zs%4eYy~XGHa=v
z04FH_AMCwVRAk+fEsVPr?(XjH?i5hCQ@Fdkd*PZu;hMM=?hb{!L*eeu<$Pz{+o!v~
ze{}z^eecMJJ@(3-4;f>v*fVBC%y6~6a{Ltu&=v}qA4D+A9o3@t#|R5Ts=~D$#QFSW
z7$vo&pFSXXZjZg;s9C6J34uSdJV9T11jAl?fCQP}Xu2^F_Ir5pPz2kku)q@TN~#h;
zVnDEfLQQcyGB7eXD?G<Bg*#F2i!<d@ALPzj0o-3~NSc`<LNRWYZ_z?U)~5WC3EEKm
z*}C{&eZATG%#d7k!r7bfHTo7pKO>L`{a0Ls#8-|DabOyXy=6P-d(&Xnj`W_$*2mN0
zor+)jk1?IohyG!&UzB2c;`+Hm#=1b{Cn{b>$NNnSdMKbUkYaDAM>a~gQjKBI7-DJt
z3!juLpe&pxw=QyXfYTxm1G9F>@5}aaBh2d<_6rk~yo{&>2DDP1$NmrPJ}+;H1V^yc
z(F_`ZOh=2_YVNM1eMxpy_vrMv@Z!VE$8uZK`k*cGGo9#0M}o~rD!5G`_i6)t1NGFp
zZ7zmR?RpA@nsFw&#7BpD7`nvhHN#1v+fOb7>MLFVTVEeenXktMmQvDq?;W2f^4D-H
zER}X>8@VOOAHo;nTiGbflG0aDeU98VP?YzQEgsQgusZ@MQlep*Y+z`8BUA5TnGaL%
zNZ;|;*bDRc6*}BQbiYbcU>um3&!i>nXMN@U%zjYSqRHCipV4Z^o4%`ZzG<WOGy73(
z)pJ24?owOUWtoGqNw95`iQ#xB#E9@SLn17!F(j(U;3x6^s5#?zN%O7;|Dw#ruohzP
z-Hpo=j%SFt*cM?{3vdNfMW`#yKO)s1;DWtZt%Bi>lvR}liV-K#D*ZUt{fcVexKC7n
zmFgE{J*kpst<Nx98Q-VgsOm=YB_?uu&s!@8E4<g+b?)Qv7%2<%FnDAL{nyms<&Qp7
z>c?oJnCC~8^pCZSkL0IYN^r^|QC`kYw4zywusI`DWop2)KM0kA3ai9WG0jyWk(^86
z9ZHOk$*r{}ayzJN{w$%KJ+gHHTJ~Zi5Bsk{bb>5>=Wo*qi+o^fG<e~(Kw$}*bMcZ{
z-YG`&0#|nbe>@=aj|aXy201W0^(mfEn`naODY{I`xUgUJp%d`Ij44Y<E%TBjt~Y3B
z6BwR=;AM!9!tomEP=%z+QPtA$a!%})=99?)K->EwCaG=Pk!oDEha89j@P_al_B|*B
z!)Wm#+cJFl;R?^Z;dVr_B|T}!HrYru*<TF%CcCO!iItcrKf9%3r9gMMU9`hPi<}j6
z3!HK9C8H0Q4xXhTZB$&s7<<DImhb|_BB-|IBhN&rY_fr~%e>~sCx-<VXf^Xg(}_o}
z;@!pKLpKZH&2IU#piRzTHnuikM;v}QIwh#tc_-UBFb+l~uDqN*Wp%K$Do8l*I(E`%
za+=R8pXofnN&YZlHp-V{`TB67&%E|9y{{oW^I}MdSZ%_W{>3xo!UDrBB(adn`cU&2
z1W3V3CdI0dEDcs=+;qa^*B|<GTRUrcMQdO8Bbj-UWGBqasisenTSU$^WNzQVOpKx3
zf0g%<*CSZ5kA{%-vzupdj{TgM!~q;OB?m%Pk3r;|sEK3Dv$_->gK*VHn2BRJohO;T
zzmrbPV7nyAzjmTWD@9XF?V*j3q^Aj`1m#j^^9LWe!PgER<bedX?yG4e6J=>k%!<8<
zSJ-^F*3Wc}&G1*~I#fvvcjl^YH5U?gtm$D{d4f2eCm18E6>h<*1^H#BC66Y<MCH|g
z5VI8z&KT!a9Sp*BTwud5Aej<rit4Pz?Bb=i-S%8hFQ5^us6;OR8%(67z&sGrYzC|%
z!xz$PLGG%;<4qvgp_!=OWmOW+<tGxuVN*!oB1KKoO86hmSm<f&9uj<{T7&W|eC}L}
zeqk;&!1GquMSg=j7Y}FU(;}s5WQpv`{VrQ)a(C%EI$R#5@=k&7(dt5*;O2Y0Krt@&
z67*U)<HI|*c!;hi{sFSZ8J7&PG~AX5KDQnIySw|8o4JJ!IBh8sR?#ELB({F~sK`n2
z(`R{G&H~PycKr%m8Yy#ws=e(S&(9#+<9g#sSySkk!G{7LA~@}x6joI26Ym%GW-Yz3
z=Y^M&O`8WL_cn~L=_`U-#alCzB0d_}(=;W@sK1XRjQ7QTDpFH_o``*8+zUwYPrH>6
z$&E8u?{^Kqu28ZO!WF|!exKLEqbM<W0n5p4WVv+6l23qGn<5y~m2JF34zCvq8|i^U
zs7t-A0Qv@lssaWESXW3H0bH&nr*eYP;-{y+_3}s}H%@xG-3#3=U4cKHCg4pWEpf<o
zb-EPIkOrD`-0UqJQYRW<Ut%Y-64Q$-=iFxM(_mJ&>`zCx&3>MWX;BZ{7Ck)WmHuS4
z@CcD;a9e?=T@~cKCw?2&`SO(LSD?h+GJ1hY>M0IFg{8-F`trSyFx8`$mqsEzArBe-
zx3=|G8|X_z;1Z_=5IS&ytD9ZT{CUi#3-2X<ilr^)%-x#l`^);un5G|>KejrL;>7({
z#LxQ)nsGYB01cPP+L1}=^l(ueOkw)7fRA)S&e&ghBXkWDTZL|bk!^0uTS*N*AK&uF
z%+#Nb?(P#^IxiG4D(TZ+yXKoLI}-T&%rMkRxdhK7*q17EI$JlOtz4qy;`n$qSvKtW
z%G%Hy?&TeEU^6EjRq$^PJIV_h-r%>**n7eUeDCwPP{Uh2qTyRY3VAnDoC)uEA&fA>
z>1kX|0Tj50V3ceoNUTiTp?I>7f%-N9SZy)J-}$sx8>~_+lZ}~*V`RRDMocyL5k<q8
z+yBW$Ru(C-f3D-Yh<myV9=1ypwUYGA1-xq_r`InN8~Tg4QFP-aJu=pqVX0%OO<Q?g
zwOM6ZcXt~F8&&BuVupA4TFYgVk2z7%)`h$5SEkOAp7_Q_vn%4d&e0mLbJh%r#htRH
zvRxL<k#MHkF|S=cub0p>XnG-vKpPoQT)=eGe%shzxnLX2V4*-cM!9Wedn+AUEeZH3
z;PGC#=PK+42POGu*XJy_Ue)ni7kRwd{N1mJFQbf9W^8C8k4ZOQy9u6tn*?!-aBUZI
zUA?a6d>HW`>un+hZ`cOx)@R9h__95|8*qa={qD-dWNBy6dUo9B2PQVO_x`wG!W7-p
zNJ|?(j~I1t2c|7|cyLXqr(F`$mt2|V2n<T(FsM2%y9+!xVKB_XdjfVsBe>#(i>q{@
z);Kv~%NO$DXO%``1n_09oRdGUv8-FvzD2cklhFH(rX2ykfkrJzlwNq83;nIyPQ>0U
zIL4>ij_9YH)87f5{3YU5_-(}QQ*NgN>re=#eJs&<z!73B5ZEsf*aVw-3yvMt6yZQ8
zV^95jL7<Xa|Ag`rD{^c+i_1QT{nv_2Am=!5m0IYHNZ1$EZ(O=S+Se1Kgcug5oYF8D
zpl0a9_;$t7ffAZ3?=<nSdKEc_T?dzGChrQX%$hJ>@s?c~kesl|Erm1H22?HTmVQjK
zj9hnHtn*E}Q=gH?bTPAvLbVgTRvra;TyHG)WO`g`D#jGa(D>e3wrYMA&fEizVa9tp
zq?rvlwYEkji)mpkix02b-{zct2N5PbffV(&=V*He1+y8h*D#?|#^6r6aE$>7P?UyM
z$k!{4yWfVDDKz2xVc{;Ft7PfzO`6)oMU_9>3Bt^c8G7fe^L01tCeW+2sj0o!fB$A|
zT1y~@>ZV<jeBkbn1glQWWYWM>4fXMphdQ$TJ5<JXeqpKvP1k*DK%e{fc!2iV?F@2h
zswLO&xkH9}$s6t&e!fRzG4ar78}{*NWxp?>Af4R|<xBOU1%9I6mM;3&><k$TA)vj?
zPCQJyowM(mM9CS<!9>xoDi=nB(U8=WZwb!O^i%r<feK)@*cLAd@aGTH2~x=K8ZWc3
zD3v;heo{iq7aY*_>e%G5#kZ-{L&0YWY+C|Ux0nUYp=O8%s6Kkx`cg44&BUVosLykE
zu#$*<g0L8KL2H6}s~stHlc4V)v;Kd(kLITnuR1@!`G+JN{l5z4$QyauySn^4VYq)T
zasD5TuzxQPH{*qpP^>S0xg~K6o@%lRwMuSMaJZTE9kOTll-b@b^UWG}APKc?i({@1
zOk@M%#HV`OO`xiYn7g2v8%haxqrb*JEqY&+Y#()}M*=f+N2@R1d5vbk>?f>Nm~<z%
zo;2D%scJQ8Cf|MGf?A`p<z&EpG>xLV81FJ}{~dwcjlbI=ulXU$_;=D=|8LZU*+pzl
z#oubuy+&~h0h^wTR$3d-^PI=lb}yg*^gpXt{r|TE*Z;G_Ut5Ixf4m(S{#)mQh53KH
zA^v*~g6Ds{9sZje;@@dW{)>?2|81V~|JmW6Z_fWOVb%ZJ_5<7hcn9&{bCm4=Bkf?B
zZ$Lu$iLr*D{{x~_FmiS=b^3RR@}Eop<{$d!*RA@9(V;J2dNxER-;5KRj-WX@aDscQ
zBp*Ofl>{~O^WEs}g=i2^spCYuvjU<HULWc9J)UvAu44z?WLbw3(P2hqBNaazA(-r$
z_=p!AX*{aQS+S>k%Y6KJ{S1F+Z<?$s>g-fVkNtF2B%*Fqiw19woS}Y`1&()T6J>^3
z>HA(mS>_Z&a?Ij>owQ1<H#D5=edvl=aewb>DnF^})lk!ePCi$Jw|Y_B;v<@$f2vz!
zKg1><aoJFWZtK4Z+(-i&kE)X_a~9j+C7M1ZvL96AOd?%#irH_{?`bAq9Rf$i>TE^=
z2vZ>0=!{3B49y(H;ES6gGgc_mGyP9?1%O((b-?d43na^e#Wo*Q1@)WuPw;<-SgF4T
z=Tm=1f7PFQtE7L2Q2r9U{HNz(`OCx&8mn=uoG89bQ~tzr=D^A<Q&?;0&NAwnWxamI
z4V*}Qv^?>YLIOUCfLHYwo~J}4n(3^&*3$L$&f!#cegKcT8F#B+eFgRkDhW+>c%1*d
zGfEk}{>-z{VY|WpdAEqlc58pkqRHWDq{~K(-}_PP2sxeoWsW^MSd6xr$6;lP73=S$
zfy{53`d49dmv#dZp%`mX(ADUVY=#=+@LnCA8b;cD>t2qNUY*+0LE|2^Wzs-x6cKFo
ztuEbJN$rMx{02!BSZ1?3)wH?JK#M@^#=@usM?FS&B-VRlczYjn2y}FK^?Y27%PsKq
z889}J(=W_nsU{^c88)dwC>=jH-f1%>oSDXtSI;_x8|EwsqIO-Gc-fqFf^lB2;!=e)
z$pL4dKiAtuSQ2T=11CMd!@%4|00>LG)Oh%4S6(PCVV*p}TK~2KA;jl|e9Pj@KiG!g
z1dyq`ig}#jB4Bg1Kawj?kQh#g;h8(OHJ0*vbSTwJ0x4xrwuu?3sTXDEfL02K3T}<;
z-*i5ZA)aJVD2%Ku;1bDWc1v^zjPOHhgV~K7kB5RqEbeQBYowyrOsu%J|LCK=een}f
z%4QP_Hze?m9xc=^6(EyDB1SYv`J&sMZlq^k=(fr9bCQ<B6he%^ZKiBGd3py;1wSE?
z_7+o40)iDmSPg8C!=iwggI8K5?Ll+PU1~V<8=z-$tNCs*<Y(lYIZodJ>#<3zc?+@i
zq^twQ)E7|#&5rk^{LB_Ikvv*rQ(InL;>z6n&)Tl3v*ap2d~!>$KUe;+@JC@wpY16=
zc~hBo1@%6pE_B7p=!|Z1eG8Dk8i^sp83%?_YuL78=dik&7QX?0*d3pM-2&`Gv~$mV
z=G1qVo%UQ@Q3+O#DpEB~<ZQw)JXN&9;?vaC%YIh)1SX->r_9bLA2Afm91mK(zb-K-
z2&_^XWn~`e{$5>zQxx*`d1FDm@gThOb>|%Mwwue6I44K)b;pGcUiLs4r;@pyKf?8Z
z&7;*$00`PZ{8~L^LroSs)+%YuyYME#fAV`yui6UIe`_rlKCnl=130fc_@8=ECh3=t
z$wsQDZ*pmNBdKTkTY98f+_c8rGLXSb$_3eR%@2|3kHbEz=o=P7945`B-`|W{;ypmW
zc_MipQ+aMl))KYuT4uJ^d!9pY)2CDIE|b_6#kE6SOinCZY`(w#d1>y_C()SDvXzqN
zMzbRaJjG_5at4XMDNtGpr6W<E_y2I>$}EcHn|!Iw8ZtsxpbEZX2lzt{;R%hxiHx_u
z=qyjHUtTU(>Q0L~jA$H}Z>20Et?Ty*p-nqj?b~%NB3|eq3}q7)zfqq`SX%9q?`Byt
zF6h9BAdi`88tyRIoi|PSIzZ*Z#(${Zlm;S$OFP;=TjYOlCc-V~`rF8uAWfKl_ET(k
z>eCDTmwx827^qVH-}{;9j;0zn!)0}4!}_+;>`pfHMpDs8S{XjKXpkr1z4J8%fyz$t
z26q703D$DH8It)V0`kHBMCZPRRFQFhhS&Cdw^##7zTw2~oOkocTWxsqphDm;i^Dm-
zuYW%EEM8|7vS{_!c7m6zAh2U)ujVQm>}H9-7UpxPYvdOp`p|_|)`n?3VWXq^L!4$p
zPa87BS&idXKcpQzM%S5yFhVXbG)xmj6yjS)7^qdMApr%TfhpfWW9|fzWQeqfF(an4
zI>IZn*Lq|RCx~6^_93kt5aIEPc_f5Up21^qdhl%z^>x_h?Z}G;^OoOeS6yF-fe-8F
zBBbir4TJ;m&&VAazP8;6?-0M&hzL*>ABrl?xws+Kr~HmZu8{U&6}dx+QI0T27S&YN
zb89H(k3(FyI0ImEI73^T^&~fo(SEt`s`Qpr@~R?KH3#{8E&UQ-BGV=JW9<w-ePp@2
z8RJ;?Ct_EJ-oy0u_9=#g)yT|?JUVpJqxWNDH1{9b>bpQyIhc@GRgiG#so%9n4l9or
z3sc*T`VLXs5Vw0r%Ry|{pq7CaJ?vi5DaEMEhFkm~dx$j!dE5fzwo%aZR~NtI75unE
zvq>zlNM6sVUj-9t$*xOFroe|n)Pt~4WL;pL5Y-4#6MIMwTi1g-Ly8?!JYjbwB|$=s
z(HbJp(LcU2_Pf7E9%?M~vHxNS=Pj2S`b9xVXeXC72d{0Y5E<Y+aTIsWwq+}!57JeZ
z&QBz8jxCT?t!q<MHHjqXNyAU4lplPqdrMA!PEKE2ECt<-3Z`<}up2&CH0AL45mW(o
zGt=0a0z$_s8KqyuNui4-dGMj%|KyMoBkG^=2V4k;9!&^uFs}<+a`%r9KCtd&FKJOu
zAGFL$eHI{KN71pFIR^23oY4S41u`QTw|wSF7u`ZpVjfH@SY6acLYZ41{f+9fZ&zBA
zwcSFWoTDjDx2$f+T5ky`TItEihK4iigYw&K-`;VU`s83Z$8|w`6LIN;u<Ta1E%>$n
zOqFcA5<ZPE$GklmvtFfD@W<?!Hniq2<*uk_jN?U|ui3p_Kjj*9`n*4Yby!t@<Ff9_
zcBt60vO9xmNI{r1NL7%WHU>OtrSJLG0d!w(eAsVqi~!`(Hv9(t6-p=`c$72=b4Vdw
z7;lDf(VGZK9Aqlg%Exr1$3L@VgVlWs>oCTVJ{2f(?cA}e+Nc5Ij{H$#i#;es7+6>g
zg@(nlXq?;E40adpKLxo%d2pZojncH!0TbemCdN+gNsp(0Pd<+|K+pa8gdVzI{xOm1
zU*<^sZ_@BzpvPZee_V6bZuj4ihus{8Bj%OF8Q3y;X+2T5v<p4GD~*Kr87BOxePeFD
z!3>UUUA7J8NwH)+mO<(E<#-0$-dA&bt|DS<#?|~P$*`M2PMGRCUZ(PwgsW{%4zl2e
zC^a55*8!=))aJp}n_Pch>qPavG#j02bWkF-teS+{Z{;%d6%(oLCTCf#+Zg;hb9E+g
z@1x#yX{Tt9jG=#4+zeAHOJn>x`OW+~G|_Q#7naNSQjP)AjOk#nXLS}UJvz=Y&63xc
zYlq9oBxSbX(@p#@vGr@jD_^UOhFK;_?ms7HsMMlD?eq5yz~VZU<u*+K_wdik@2cxE
zxVTi5TD3Q&RrU@Gb8R%=f;Z$pfZYgwnm>v+k#jWbw1%%=V<@_zmSYg4S1^T)P~{#a
z6Dsk>?+cc%pJ7*Rc)WX8a4|t3+9XIe6|kn8w_C)xn?q3n+-;@2s!*y-C#T=#%5iu4
zFWWBbD(<!rzDqt#B)ZJexy-M+B6AQ0`&RG}1e~W|21R%g5X{}#9JeCNT<rwR1mHrH
zIr8(}u4^)}dIU-EE-K`6?(%Mu7P(&{W_(v`suU?IzoF6Q(9i+(G>g=Pher8Ad7c*o
zQ5rzOWAKV3c9u&PyuumKIve>Lo4GsWFzw7-w6$j}P>MK<LKxO30IjEn(9_f}T}&X=
zij+Gz^f|fWCX%}DhxmNRK`7~kRz(YWrf8{tcf>}DzF6CynqMEem`8XH3mtu6N?<67
zmjaC0{>#yW+mY1SI!x-7DZ@|f2s5H`)8U)?<bYGWube-RM7V#WhTsi09avz!(0YEB
z_1Kowb5{lu!R?&C1W{+S&p~Q`<2hx2M056PZVfv7ws;aKZzy+4*xX2c-GBmSduKOF
zC7Q^A_OlbwP9%OZUu9kTaJQY&lka=>D7X;H*XBgO8rAI7s_2=scoiWCbm50D4%vs0
z+oX~Rc=6ueEw$h@e3*8ZK@MNOBd48j^XeZGN9`UC3^#!23nRkR4V@5e#je35y*^)(
z4innf44t%onR>y!`p1n&09WD0)t~!BFX4EB4T~RGJ%k4KOp#b3_a3|8?SVv{lU`!+
z>x!?B<5IUZBWFnN1PeFbApD80@Kb_^pgGQyVyknF?W)C(!Ok0fvj-nJH@|`wpfZan
z(4x27FXA|CLOApuzgK0Ptgv?+Pn99~W(ul>tIK{&cg%tDx}A1>WO|ed6$E})94fO_
z@s^#az2MpJcP}`S*qa<Ad9%5-wLr=%&)>6j>`Hv~Dnj_S%yQ&_KNkEFiFV-$4a5D?
zu%0GlLIJcseMWj=@``GwPd-Jx8H$q8G#Z{j&aC}N%$j9IMTi;Ci{9C@Qri&N84!YG
za`=E;R8lE;(f2jr$+meThBKo%oP$+eEbQFPz)IrgEUBM!>?q@dXDa4fZo>A2Jd9I2
zS!DA6xV{?`qM9o2WJY)}Sn_4mY4Kox)^1INoZIxkhdw9A&H5f*Oqo5H=a~T2nej|m
zl;r5ba%Gka>ajaTo}_Cqx%uZ!4h;Ks-`D^%qjJ?NZ^OjZ($yZOviQB<R8=e~M#AF3
z9@ho>>}jqbfe2a~Dgf*2G2ib1%&^O#>SlJ5&H8;Wpu3Sr3FvdUB7PXIM>Hs^)NBXr
zpkO-&&j;4ug=Y@Ni773guqopcHvLyV@-JejQeDq(l^ey+!0<xYA{Rr;KaQcFk$yy*
zKAut}n=l}uR9Z)NFR{VOUHRwYih!b0otUg_)*K|D(ZD1+^NIUr;&NBFW0?cpReU(J
z5q_*AT{EJAhwk{^(Ws;A_i09GVXti<jrB<H3Yuky*8B4q#t(9as;eAZ46qn=Ne?46
z*{fkk6a56JG5A3l_)S?#+{!IAe%{b-BG9(HIIax&Y^X&b$7A&OS1qjt9vQ7pTj>ff
z6cK85l&(rEMk#hb1Y9g}u$U`9!o(Hl;Z%WFOOo&8ydJKf)l}K`7J=Gp-Mzj*YLRXv
zS<MQN5o~EcCYb4B+lRI-JU^<KO$B$VuX+}Mo=vAP<lqcX+a82>Zl*ONDRTTX*=hP}
z2MCj&>%BplXLo1-2XQyJKX1nGwj9-V$V>b4&jF2Wb|7E)ZQ=gx`$`HG@5Ub8y0j>1
z-4$c&R%^YB<fn?{@V*IlycXm2xJ=^aXq1}kTbe<)y%%hMZ%FGdBau5ZRzG#!LB6!c
zbQCXIDHWgyLxI7Q0zpFlnQOehRM@y_|Fae^<!eyh74-ESYo6;nC{38OGTlFetDbHa
zkYPGQDG^5~=^0G-&)|wF-qnvgD4XPvMf7o1_0gI1Enzq>Yl>iirwp5c80Laq`vff0
z*yJ4{Midm~>=4bBinVfe0fj!7Wv$+#G-RlyNM;v9Vm%(vgHu#Ub_?UtcT2Z*QK60h
z<NSoT6HMT{PF{IIFtR?pK^pY5h|J{5YR}RtjryL}cRW#Hbt#TV121K2?#W!?8x{rz
z6s`5s{jYe-Q#($He#yj7+P^*&U+HzBRTmX^ZH%Fl*lTvD+|S5w&5o@0fxBZ{^&AnJ
zpk?K)BwBAR7<+CN)zGD{@U`6JVpS)P4!?fg*q_0XSX|twQ{Y!5vJ|JBd-t$pyIAST
zQaty*HV%NfVOu*(Qt&d7%_bxu0^x(~nkM?-^?PSSTDmq3J-`lw=dho6oL9qUmtC?&
za$D0f%%Ss6Rx3;dn`d43z5I@WOd3b3tFqai@Y4jVK!dK=o7F|8kDNS>2?`!XB?l^9
zxfYvWI95&hVK-PqH)Uz##(ZwU<47}4MIEEXdD)?UDCAc*DUT<=QMlcMsj>Ry-Tcw}
zLSUnX<U<v*;VHGs7@Zcl-#m^S?h{}pt=NoOlZ@ogn04LH(G@?|av@dMp-`B_dEPTE
zjAWD|cuxUQ&FPzNpV%lU5c^q(MGjwbYvY9vE&GLAh3Ru9-QVO9eRYZFaz3FE^nU{y
z{YA?B3p7IcBxU3yTS-SqJ4>nK6@qwQKx4#*dR5X?8W!zpyWc-anHLIy!b0we;w2X&
zbUzuBDEyoZHnE)P<yAe3N`^GO<+%e+PqtbH{R0K%1>}SKm$vap+X~f6#&ioSKf4Q0
z^RAA^tMGcxGjYd(`w2;{8fAAHzUD>Q#<CN)r%a7X@7ad-!8SEAy5dV!!In@HBBM6c
zDz1zWyMfUa!iBNLN_+JR)`J_$&2R1M9bZNuRl7HVV@-I##_HMy*^QnUtv4lUtIb{=
zHQFBAG5I7N1eDL>o&z~$O^ym?hZ1v5Ic3bX_Tv@4kzYCA?EO>E>N--)2JY_FwN3LF
z5a?{(Jy_(~gskQJf?sT&`Ya-g+EOYRCpDvlHQSu=e9rC+=#6PMY3MGXZ~81cJ3Wo;
zI`!L`R|&X4f`$}CTvxqs8~HX46^UT357mTHjE>Zj{0}Qz19Mm2HacFt_;6<ht#8g&
zjtg?=EGrRJUmhsQcM~T=T>GD~5ppT7$l1SKQL+>7Y3aU_7qS;N6%~k4laRwqP!>?w
z%w*ZAHfC`S=u6!Zejw7(V`OWWcWhCvM8L`g;`?_44O@*p&e{#~jcGa10D1XP5XZ)p
zqp!h5xY1ZebsM#cj+Qh@0@<9NVN-O>Lg{<1j$`C6cNwMx{Hey~!~|b(r+5QCCKH9~
z8rIexyuF5q%U9PP#7XKC+{l`%sp#bs<B3Cs!-<}fh_5*+#YkWDrFTIN5si#am9Kq6
zEDMb3V6F$t)7x1?RTm(mHu+hvK<eT>sEwIBe+?R$$D{i|BHbD;{v57Xfw>XOb?~7q
zY%~Qs=vO2{Ag-xY6GDYdirgl2$=Lm>#JML>8|+@-%a8QfK)~+>UCJKept-Vev^`G@
z-Tn@0FG?+-(2lzokVbfkDnz<%3AR8D-Q9!MU5hnT0;^eo{~2U<=U|a_Q)Tqx{sH0k
z0^`nqk6Zvq@_@u42XL&QG4RGZfdKeH@u8+8F|#Et<BqRd7Y~0ptnRkHb}{-`0Q7Sw
zGj<k$hCBvb`2l;(T=e9prCptJc^GiG2kz81l<F;M<9{|c-LzXW50Ao&U`?va9I^<>
z$GF7j;jXo<{|K4+VbWVj1D@EkxLsIFCfP3|+2Gqf{BXRZEyabFgKg-y%4z9xQjqaR
z>^Vml+B+n(zU(|(P)IwW7zO;)AkqkfB%{-swRn0+#_~cCX$lHp>3?l9LgkBD^AXE<
z!?KIR`x0>{sH6aGs%KyRV(thV_XVq=ocYbzA_j$XV&H0q6K2;Tfka6r*%Jx(%jPct
zMI@VnN&n5k25BomsMxl|dkFVXKMQCe`0+HG_S>I;L_^_#`M$z}u$tF}7-&hxRVV(Y
zP$F|Uu~WuI!R~y>9|;(vM*z@Y(qt>_j%9Q`LToYQSu*}QD)6TX^CJv4xZ6_87%R3`
zKUUcIX16)sW_yz`))eTrAi<3z2#0NS25BU1EF|x>Y+sYHZ!Qjx<-%2EbRgRK{cHl$
zoJ;QN3G@khA~p|`2tR(u>)0sj-uE@Y@rvs+s{7AH5*Ztu<W!$ycn_KX1E%@k)c#*_
z-@lOhr`mdUyOOAW$TjZ7pIrd4k4m!GJT)eCINcf8wax*U%XKZq(qT3{Zb2Z<;cJHW
zwQcr1qX2RPS1y{-R47&(@cSiZ!-Vw#V^`lhrjCz$uM8ZG_VQU$qKtdKF?J6^ESXG$
znDN}-B^|@;Rz24trV^7<`-s&O_Q>y76E9Ah<|D|JnC-0HWYys|CX6A};ctEfOpl54
z7g31^OIcX+iLh(xSZ{^KT|p15Cw!?Gw`uef3U5A*zi$S7at2>ZBQ)f4;0FqKaM0qr
z^b1Yb9q1mm)z#af@SbkNq@lSDd{QZHdD^~y`0qHADOBZAjMYs9KiE+J9--1Q)vdKL
znPjx7_rL5^`VpXMFx}_ssqd($@igzQ=Bb{e?t<pu&I{~DqgNm^mpe?u9!6B4w0DlD
z8N~Zxzz$ft{OnVbAuSyW2ob&td*3DD+*|C8m^W_MkHr$Q%ZtSt(kXqJfOlxPFLI%E
z&2JlW%-jW(knRpz4jOk-*FiLqs*x*|5(Z<AC6RVk#<q87$Y30oepT#CpH!-*#R_Yo
z$)gal;mBSIG4Yf$kbC14ZPs?wU99CV!=-Wnzex4t_eOyNJ<V<~(FvY%6n%>0D5xtR
zaBtVdbzxzpBZRkvIXy1KB<IBABuDk7kmf;O+$H@*!ff^kv46?U-$h-%DNptizewh6
z=#4Uray@t>2F<1BZr)T&Rwo0R1pa00`z20BZD}DrPy3kzMU-bbg-Uf{nt3BP8W&EQ
z8R+E_iq(J+Ofrj;N6v$vU~nJP0QoKgs7Qr0zI+{fC%d@OB{@dey*Dg)FSzE+bm+GL
zlaZUfWVO>09YAALp&f2E2#=|B+r)_NM5N6xYXv0G+ppH7kQpnDM8@BmE})tOh%v3J
zS8Hg0-?dL6<#ee{!=9T;UviVRC1TS2_z><R1r+Jc<Aq7QOXQdNPtck6w}dm~UbPIP
zk?m^muLgHC)Oyzl85GXHuj!dGdY@QpSJKul>3Cdpp3Xk0pEDmyxmt!8rCi3PFUg~*
zkRZDri4#Y?g~5!~{c^K~Es#e`mOsy60$Z&k+lxb7Yywg{kpe<c!%lQMNiFLC@Z#_<
z`jGsXHGy=ZZ>dqJ8n1&F{}?Z94EaqHm&9v}ZtDbOtYgn8){=y19wl}JmXjsyi(*LT
z*>Cf-Ba~Z4&aY=8-_K?3g<*>8oixmQz4ObN%-_EBXFm_=8$QGhs#Y9mN$K04wz=8w
z@J&w}?cx6ryFeya(_$yDVgJ;OW$(~bdmT!VU4J1LT0L*0<~Zx&)fR?8J#ftKwFek$
zqkq$IXUlHr2yY$0=LmwhPQ$3UggV2W`E&jUw@Jw1O*kcE)wd5mIFQJ%qjNf0=l5?u
z!bQs9LCW7-S8I<g+LLx##Y#DEU&qARU>8<nRB_)sVrE?hhJF}&!;G!Q-dH2Am7{t1
z6)!{C2}K0$(vUV1C}!sztlC{TuwviF#g#Pe67mdfZ!6=qSl6}VFNl0`&-HkFn)Uy9
zdYb(pB>7YsTj=16QfcrsIr)}TRqkT)E=Y(BW{w)`{e3m-zP&(Xi4OqfXMbEUmm5>i
z7wP>94Z&n<kFBArh$~3$wG%kNry6C7t0AE`KG4>pc|YG$V1U_mT}~hWl?GAE%Z4-@
z$qhv^^{lnZMQj*;IiBNBNit^I1UcDcIgzHNp<>_~BeSz`Ibp^KMU@#A5-`sZYq3R^
z(Q#l0yYI`_!)P<e7ptNo$1DsND=T>DS*(eu%}V02GT=`wkr*u#OFvlCsc_5j=u_{;
zDcv&cU~0!+yQq}Mcnfk~x;k<P8>U-Q_6P8W-(tAHZ$zj9&Px&}suk-~$8k<n3@-CC
z$11K=47am0$FChpn3>j=uHx%5(TVj%DHG(CF2D>#fyXH~Il3VngP)K{47M&(yIDS8
z^mVkCul)j<%xD>a4DdZnDE1(W2*JEVv)$3wJfo!}1b^^FH&jBFDvfBA@FGGEY}~@&
z)Z?^Z?DLAL0NH!HaH-I`mQm>^I`Q<(s6D0EE-6Fl^M{cJI6o}wq*N;&*%$6?byl0&
z>gT0m<V-!+BYV>?11>14E#_M7X@Fz<u6Z$Ni9hSgv%lAHvVJtax-+3cVAHC&SqHN<
zepzhh+{xs0OIgp-9*{;8OXQ=~uRDeS{y0E5G_;IkMpV1Y3@wiKuzC|qIeE_y?r9K)
z79HqPnKeqO8)uhI#o}l%<5^gOqesr2Ai}SZmuOF~GtE?1m6})&QsWq^y`{Hl$~2T<
z{4xGJ_r0KDRRl+DkV=$iGt_BgHPC6}g>-aA3*YJyJ&`27&|-=7NL(<J3&Lg{*>6Jn
z-4j}d$_rE))ra@=mo>TVs`Cg9?^j-BVfbp$(Kf{+eyu}a@e=Y^_GEo%*=|a)(G@y_
zFUdOXDl2!YCeu<OiI=lqpRK70#*m9hIz_2Jo<G(fpHbt<AK4Q{pk+4iO}KqSrS0gv
zN+G;6XYRjb|Ks^E0%~XyvlsiaI-Kh+7;0D{MAL>a6%IoCubtHYxcl=H+!P(My7_on
zrMoi(MDB59`@6aJS0wn_Ej2zq;S!&6kVOIKhZ3qYVbLV^di?nuL79h{qct^LkNp%1
zU!jWiqS;?#1_&KT(D!v6M0WdrKQ`Yw*<Y5YO<EuIW@clc^)rK-n(a9Kj!7!|x!_l(
zzl`Md>%fS@tW2AS<lrK9X-&l)6Wrwh;@W8}q!jQUw{_xE4YV6l%$i&JHP%n6AsvJ5
z5$Bmwkl&&H4rOGuKL`Ez456%1{{zbS8+Q4x5{gqzolkZd{f%i#0Oq)vRGDc^G*RUB
zt1@vC#O^p4c4XZAUwLisC{Qz<Gbb~Oj>3L^e8?uMC)(?z;koVkjswAwUF4G}ZMt5G
zkz+JG0?#IiV*rM!ueY&(=Om!NDWu;ukfvfJvV)}e-QoF!X-I+A!ZD7^hK3<tIT}58
z37#fv&Q+q57z0zPl7yo@i%s7MR_F<+<KP2bM|b?3gVlz?Xx+B@`(gR_{tYV?S}GHr
z%9uI~6}sG%0sW{Xw&jH246Y(X#B^J3IXM@iCf&u-pNZV2B1a!cx?w@c*&7^&N?eT^
z2j8JemI{i9>1uK330UQyUUz)<W)m2Te2nTEYqanbd8?Lf>(NNX{tF<%(C*eRz=V+5
zJAg*X5H}=c<y|Bq(rRf}4Qz=C1F&6hoJjp<z!aI&K;C<*>IuZc;Y0b-byoX!4SzG;
zSx;7$-o;)GexT+K@W&uTjjcvCV>2_&pfZNpv>tVnzB0~@qP(w>iGv7=DL_>rRriNx
zt#a6Aw|_^3D(#-U9rbibzzR3y_9gUTiKYZcS@v(Xh-29}njVxp0?%s~`LWk%iVTS;
z6>Xy0_z)eHRf^={?}k!A&b$eQj8)7-xlQDBFAi`9P(`s}mrE*-tO<c6b<vb3Q9A&7
zYzAT257&mM>UhNTN!tiv806hxtQIvFTL~OdF(w$-(mZSmk{MD$FeFdVC;?`XP(5tK
z-hQP9@K|yM;S;kynd-&4=A$UmY))>5VcqG9+XplyKDZWNhybZBXgt9|ToA)x!paA1
zYmflM-0#z91PCBC(~Om%Ae<Ya93b6DZbpb8H6#C6aBG2I1w&ud<E0rov<b?_lx|VV
z(lAS-)#CltpLA-)X7Q4wH+S|nf-|C}r<q&{vo6*?SqFYRaF@ULB8U7<6rWh&yBs5y
zHysfnZ71MCHoVYpwiQaP;4InKRlynTZH_${e04xwZ}xo_D5{40cG+TnJI}GH6a;$9
zOs#Vfs=&oKeD=`m_BH?h2jWHOSrN;LzseG+>Jp`9BGqgr<A9LC{0d1+Jad8FgzXtf
z;UrWC<avCa2ND&wEB=!=zR5IuOadE@X%3oK(iDyasMBCVuKXmbKQC^`=BXVkQ};J5
z_0Rf^G=MJSd<L}k$^yh1mk!w;l!LBVU$`IwZRxuGs-5v}N(>7}r%+CQ!J@)6&|p8Y
zIqu0ilE<fKe2y>dSj^|9DFy+7j7=M}zg1J6OQ*fxz6V|xI^a;>%I)@#Sm`h1Rcgf@
zODM-EsEmt{FK~eRD~Ch*Vw~&XNFf4%!g-%M^_caP-KNFc)m!eSpU=kwqOls0&v**R
zZHO}{)v#plTv~g`D_(3^t&pPRBur=qnqnCE0ev(XRdf(D{*T0ZN9H%EC@yMwXT@8+
z0U`-^bFk>&=RD=C2l6$)_(!$=G+|NN^_|1mYnENe$1m!&S2B-ZiEtzI5{uL-B2Qs8
z%*`fEW2Vl9hO6+Eypq{5YB1|>qgQzlzYlqp1<Daz76b_jC!g&n9^U?au54k_gA(-9
z&n4mi*WQfd9|d8jTK^|k_DKj0<e#JN(C_Fd6K6p*K7d<%I<pk<l>BUyXvq8KmEX%u
zNAak(%eC%*$dNsVqYI?NrTVtVw=F#0w@H3CaD1}5(VcOV@_J+v@<|OXCnvdPXk=53
zQM;r)I1JJT{^Tj|+dGQ}lVhc$iFB{h8oJ<d`B4rA22T2t9r_qdGI14b+KFH}4Af&R
z;r`2dOh%7{EjN|s8O4iZj@pM49t)=Ydh|2tF&6EkGCkI)lC0xB?^YS9DaLedH7POQ
zgvq2XGW&_-(iyr+>cf`>Ix;?rtWn1o9Po~?m_Ix+DYOHfTUZJ`Bek@rMFq=7amd6C
zqwX#$0jbUMNG<xCtPBbeB5YqTIFNDEUz<U&b#L#(HfbpF1?V;X9OK9?!|taRjiw0)
zm=Joucy4LyZpJjb@BWzpwQ0vE5`LJ2?+%)fGwK-gl2j`BUUoFDlJf=8)=W)|1)(GV
z>Ohu`dR$e$D@Wg_L*2x9GN@D6D)t0YEx3Y6)}Pvk!nYh@-J`e8cFd)akN)>PKp>Z!
z`c?p0RNDh8BhOx|(<huX$bD|yw4Vry-4mTKyU*YkDfcL?;GZ8jl?=MoY;&3k@dLSc
znT>YAc&iSVgmH#(m*C3rMhS+6%z_FeJx+q-Fkuijr+={e=wpSf0aoNPzwW<i0GMOt
z9@KjSoDC09$Pg)L5_QIzv1|M2*J8*#*v921q!g7~o*%tdvhL7(T(Y*)=`B+OAQ!CZ
zftMvno**LSq}y6_G5DMJr(e?>5WUB4<u@OWDemKg@h#a}@)R?PDr>@`J!C|}_Bl+w
z?=lA#oW`q6dW$5$MoAb4Ru3wy<E1R63E;KwimG+uV9R(kW^D$-+8c<ya7BmiM38%E
z^fV_*DuZN~K+RTk7EpL*f*32QJw|Cuan;SoqHKsf^<seuZHy?oh``l+#7g0nsS$0<
zZbsmil|=^w&=fM@>~KYF0j?@+!G8J~iFyLw0#ZW^&&Cqc{DFdR3!-O!DvUw<_nkq5
zNsyeC3?Efteq@^PoNb|8=i3t;6@(O=ZexMNZ)KJxsJU}phv9hwl6nJ56VkRcZXI@3
zb-9?O>Y}Xvfp1>&wp6mdNOs@MwmvHf6&uOEHxaEApTH!w9i}Je-HQq=mo*)KQ*8oq
zDC;i-nvjNaYC?ikCPrYY(~dw@fCQ*{ZVpD3t{Gy0)X--9F#g9y?4OsjxWDew{<^pZ
zZ5>kI+nU7FLv2Y>0N4DLsO5^uizV_+O%&?DVOy1$ZgwD^msw5Mmrap!y9M*;2Gg=*
z)1KVmd|SDPks3!T)Xl4A8khZCDH+0$(bUw1DlV%k*hn1wtC@rA>ZiGnY)f+#r@5Iz
z)-}|3hpagE-~y<JHmw`a@jkqPP#kxG$L30hSvxCU)${#ahI02=#=B>8RzE?9tT`V3
zG_F)7&!@~e7mf3(RNEGvcIcz6hRaVW*goi26C54AUPhaD{q(9{6o&`C<E}kS=l~ae
zifqZoM<%68A6Shl@QPr=p?1y>sLQ*`;^6Sf|1=SK)O5i{40ga<I7O%435j{{#1SCl
z6!i47<2&zRgD*-BPBjKin;OPKciyPkw>HDdr<Cwy6uQ$%+NtYeh3Kt-!Q9zcoD?{`
zc!E-X!WbP1F|?KqN6O@=4Xd^>8t+~vU;!3B?Lmnt`?Jj0O>(2B-j9$`vRQhC2)B;W
z18+ZAeKP&+C74p=)_p45WL4dISe-@q_A7`T^e4122VE1qv_58b$z9mG(*%xtCaJ6%
zVE;ILO_g<n-^cxh{btwO16(cb%gpf-%za1&+{#N4|Aba8`n5eXjDR&ZLJX)AWf&Wr
z@2=C%ufhIC0&{K{#_;<`I&6JXswkWUb8Ua}iDle1A1A`p5|#oCz03Oa{f=itpvH`M
zuWaUMJ%<S=w2L(YqTnj$*_3kvO&`c9B;B8ppgK&uNH!KDH$d%=mUgXR&0Ux{Fp|Eo
zYjxOg|C%_k*3gJw6@C5DAAehZrQhpmEcBUbX#M2f{;NFVU#(gHsJ@zIM)8%}A*#Ft
zC^(-`tYC7;1jnbYW9SKXbDwke*2bQtVupOai4QI*VzN!JQYEw_I~yVe*m9irW2g?8
zkQv0h%U#GG*WiCk66OOUTDO~>FV+?QC<zoI?sJ$0&Hi#PT=D+p@qVgnmN}30NbLqC
zf>6cA|IjozOb^4pJcD_{MnV(b5&M;a^RnD7SbJLurOi}?9XB=@lWXndZC>?tz-$&p
z8?MfphD#4c0*5ocYlBk-d3sMZUq*r>#BsyF*dz{hDszz8YhZa=Z1UTg5z8%RKu)Dr
zEVrG}_MkS2xtV)IyK?dhRi}T^p0+;Sfy1vZ_0lrifQ@PR=h%($4kzZ|@U6g?GZa|!
z@K`IX18#7}DWh|dk+^#-fP*A5RDxZ?T6N;tJXw=Tn^!+CHc|k>InrmzrnrA!BosFR
z@-{*FBzo{kC@~TH4w!@XBLTnDFQOo=anAO&{LP4qvsQ(nA;aoGRjE{-y?jCKt&8nV
zhMgG8dQK@x0+M*nNn-9-kT(P!J=OCro!}}RRpuy>2LVqr<(e=bVM$_bE~t*4m9L=y
zxS#OEdptGYe{%aukDgdpUamqXFbD#AM&fVdN%-B+-?N52i0k^p@~y*Q*WuK~pr&cK
zfr!HI2)2RCgWx1VvZByy%GU6g2Ke?ri7H#W&>d5%5*-iIJUOnDlrtVZal`!*Bxjv?
zV%49>pN{WBt3b;wwY52#`6Q~{2EDDydYL2gKKhn@6^J2Ed{@fJ#nfm?Dsla)<$R)A
zZDNk$ZXClg0L*h{he>FM9}O{u;AGYdbLe{KD-XyXGKxL$<5)0n)Ereriu9{4`~=ms
zQ<mIiq(U-%O9q(r83tA5%u!<&99pUFb@P}!;ioy}3%Q$}U5`&^L$D3UjEP3Q-LC$5
z+oFl*JDX)t&rH9~cT3av{Rk;ja+B*LyDA6E42m2R(Vtwj`YWpATM&|BsE@-oPg;q3
zYaSQ3Z1D0!F!7uv#qnIwxZ`)w)Ku8*V8RRh5-(V?ypyKM<*AZHy5bAkw8L~k0|;_B
zEqmwi=&SD?9<~rVv4!74brLUQr<5YS(V~4G_cdQXn`!?u<DHH@T#WL0`m8>S+5fB4
z_g6z;rJAi>mN3ejL7BVoE!Xzy;INxX_qUXp{_-Sn8^cItL-)2ew=?wwC*VCdkBm8%
z^a^<j>Ab%Uwwc*M2Nxfkz<P6RZJ+L3^0@p%KMSHfoc6+=nVG)czPbsnD5*+*7%D#-
zvSxb8&BH}8J*I+Z7?KMe;;@K#jM=`VEaqwxHD`;&P6(^~mX3sHG<o|qUfbRw-ku%a
zsxu&AJwG$um)~EGA6VYnjl<ByKv^f(>ljQv$Hd8jCdPT6Y7_Gv)Y9d4lVSuMykQZM
zw7EEM{SgVFiU)dcpp^zONSm2_MGObBZ4bMGs_tcYITP4&FR%%0QBI?6`oVJcE_P9D
z0Nc_N{6_z3HXO3!JFxnMJ|G-eo15`-PUKA#CF)8aSHhe=r*dF_2V>K?-e+Ky5GeqT
z5LxZ>;4y7`fCCU@Y$|cTZ6Vsqjw3#NF~kZb9>Za<3l*UjR+o2?j=ous*XYrps*Rmi
zHjuSwzaS>HqD6ZIcc_A@z)_ieHj2O!Zr)cuN7E)mr&_$Mj-|E>(H#>D=FkX$8Ni?}
z6@N;2_H03rtV*Hy^&L_XQ&Z-!O&!-uxhbOElojcGJdZkn_5}0bZX$Iz8W`dx%mN*l
zhq%>iXsid!J6A78lyu1R^4wc<wIynu&z29gfE_qNZEz{nC5h|gm&?Z%WtoFxhL*ov
zI=Zq~dlg9msK9E!mPNZG=+Tbw8Fkuy2}gsng;L-n`H;i&`6d|@nBc0Kw#2>JwD{ZO
z>PHW}yqNSGEk~yN#PHP$C%#01DU$DTobzuu(Zy!WBNJoOv9-I`ah*7<<?pL-p#Wi5
z`?H1cms}#kbAN(74c9TG;4j5<83Fs|Ui!A4h!0FMlw?e?sC16OYjAZVOWJz{Q=?)+
zFf-)Zh40r$aOQR>SokN(Z)K+%P}l}BPfd*67!8S3khW)b21_x1eSm50YJ|n3he;$3
zHa3TRhg0$I?lVdhr7D}PdD;O#p*m^XEFR7K8QqwDzarovJqyiKLh%7}EtMGIUTVk=
z{=Sx&i(@}S9})y);J-ob{MA=F{%_O{%5k%?sy)06)Kvv_yiCwaJ&1PxcUN-dB8#CM
z<A7!LHh`846U}r2z7hk~)8P+>vGDK*Unb8dqpTteG0%*e*U@e)U+`&uW5+}xeVp8V
zRW|0YG}44g$$xG~m!@YP9o}w~(j+vCuNanV8-=?Q(nx->s*9o<RclDnWhPt)VX%jP
zq{f(wcPqUwMe}%qoztIby>%{*Nhjjnd2N^q=rATl#A!85S}@Y0DyotSzdNOTPL`!>
zt-%1K3Kyt1OQ5!aHJC;V?SD>Qz~1bJx6uvWpQevhR%G>HG5!Stb(<W4ftw&Yn#XL)
za*!<R()pBvRlgHhQRJQl9ha!7GLJ4<{az4Omdn$<mFHgbs?3()(6+aHM*-VIYT<1f
zu<Le>hM5p=2N_kKHhr^d;pz{82c(pm66gSxuGvKQ(f@|q@oi}4cpNQ3CFDU{?JJm^
zz1Hn7aUO7iqURK^>2=8@nQRK*TG5oJ(86-Z`P@-LTm#xHE0KrP8k^PQmr?vwbEDHn
z+aSGFM?hd;p(*oVRkU7XIAxFk?LH~`DoM2aOg{W0)KCJyHVe$0#6U)>F{K+#OGd~M
z<p41X=@FTU3Y>^=Z#tuYxW=C}I`Ccs-G-^Ci>s~g9(I=jp)xRT0mzz}s2LILTtXAX
zU)We=X+WP>e(!iCQ%;+cW9K`PPD!c2jV9FL6*5RuE{Cp&DbMix5d)<2GRJ2y$V0M4
z+=2D1!7f8TeC@1rPv||QWyCe;KSwFcWx_Z~ygZD<NE=5O5Rk8D{{SCz1BN{WH$5;^
zU#ud%GId;dabMPovhTi0ZD^FuyQrOAgDZ*M(mY6ask4xPURumwt;9tAjGPDQUXiL%
zM5_P|aE&uGZBWy<1=*Sw2>o35DyhK*(fmMH2M56e^jd)!LYLHF2(LUdhZ|CWh56RC
zvy_-9sL`l3IhBv%rSM<oHGd2Amc;#S(SMCy<RUhUbW7Vj`7^f~bt8MdN~O5Iqu6F5
z@5L2kgC|W;i_kQj^Py^fe0tu7QE#2zbd<Q7$yALki2y>DyCDceCJkyF<m*;4nVM=6
zP)T6?TmK1RVb)CTd6@0y!zu5a%b0y5*U*_5hYFOxT%Z`vcC%H#^fh)d!4tLhim@-W
zSRQS3Y-X5_tL%Nh<Xc4<{$iRMs_TyR{A@6;)(v6}nfuw`VtFjIYr{VHtZ_Bn_Lh(p
za38)>IQ3Z(tmzTy*m1&K3pC;Zw=*~s_~awi2)y~jS9-#cOMY@1pV|rCXmI0M^}*}^
zD)ijv>6}tQH=Pc%QZ7!LvZ)<ue3g&1ZKWMw5?c@|9qZ)9IydWJO{gL?ciI}#m%zXE
z%28Kx-hWlYL#X&tJWIs4Kh;>v;{-JHLDNjiyYS&S))InJlSrJlyl&aQ^<(%Jdc+F$
zeXV|*I0E{2ESh*c$ARxla7hQNsl4Ly1X7+Niv8V~%H=4Fe-&&v!7*W=l6=p><GwEg
z*moCKxSS#x`RR3D9sDJU?2Zj9tQHH9<AP$IX`$aN6M{B+OqZ0Z5vr$346}~4wi_`}
z;LiLR#$$v!Xdh<(i@W!ZYU2IgMT4jah><QJL`9_&3q=u<k0MP3qzeeKAtgdoTEvhj
zNH0+lQ4k~2n+OC@N+MlFM7jhe1rceID1?!;JABVw=iar>=XcgQ_pbZTjmuflNoHoh
z@4NT&>}T(pv_L0Zf}h2OADV1BIlqblI~D~K$ffNSbGM7(D$fP<N7p{<4UZH0<lj-5
z*BD*}=L5IZ95&A5yfwuXKSMiCOJa?-hFWvK>uvEnua$Phy>S<{Yb9iB3+{T{gMG3A
z`+QPTll)ufbc;P-X?>7>R64QaYjt(F$;Qo?h1U(!$1+O3ZFDKuv9Xudeb=lI;duJ5
zjmuZe@GD5R&32LIO8M>iJ?w(J2+P7>YiqM>sDG{QH|Z|UDuTiJ!2d#U{?9$$m5=8`
zCrlK2sL(IRw$lQ-ick8=mLy?1ZW~GdSQZ0y_xL%Jl*mPyO>5p0Q_FUd+xCrnJZN}X
z08ci63ByDAe{TE@%`LjDc{xkr@4M&9mT^hux~#nfRTY@Vk7CTdQylXjLHASd*T`pa
ze$9=c40~Ulno^_ajix{}t4|;fd7R2(o<C!ycb_x&LfdK^)j;~m^-ZYhp<R#o@~`T`
z&&wqK+BIbtYBV*iGnG1hD7llIH2UP7yUhJP(^C9u4?C}V@^1xi9ji?{ESlyY(6Y9N
zJUI#Vi3>dZ=8nYh>)3Y%R;rm#1K12i^02qmZyjg7L(F|^`61=Mk4xPd{re-z%kaak
zvc$>IU;FGXo;2Qa=`;WH<Jsz?kJ}R4>X=Rj$%zw1A72{Y+@i93;=Wk?1QWHR)3U~W
zZ#m35v1DNGm*PaA@vW}t&lHIH7pg+A->7Ws)}G*wo3?JcTfW)2=ib;6?RpZX_dt2O
zRst$f`CF*>j_x$zhSl7%-bv%g6`~oeDyArmX4cey@*fqM$Bq(ViIl}J-wMzAF%JLO
zvC(zvfXQVV;|MF~Tte_MLzeoLWm%ym6RS}r6*%0Z`otqRKT_x6lf>EmMU3stS^1BJ
z3OTnEYt9NKiRDzA&E#Vbf7Zz0Fi_)!ew&@|4nTK9*%ArGJGEYb-M!L}lb+-h#)?zQ
z>Y8#2?UwJC#wHzc{%%qI@ZQN8nPLNMLU<89|Nb80(wpx;hYzaQw^?Eje{W1zL5z+F
zBl{K`ek|(0cDlLO!%8(yUa5G?k5fMMIy1=r=VEo6yx%Nl*R8$f#Po70Lz^QRTN8(E
zQeD>|ADgVusif;S3uZ1p=-H(Y$U4Br8lOLl9(K0>@>#x5EtjL(Y-g}gpxU()TJ=uH
zychXncOi3JLgVzd$U{(r`Lx@FEkW3Yw*?J0-!ktUk>LJQX!Os_mY+U|?(5#2>bqQ2
zpWlLbKZ0(1w&~<4@#T!$8bevfIxN8ti6AFwvsSZr(;jZ#u~W>q=UAP#?LZhKe(#E&
z`t$yk4Qsd2BQ>Fc2k{l5&(FpA#r}GUZMwSmq}~I+<Kbzq$%f&tOT!~HNCxFL_Q{s&
zz1t`yyPpNTiJYJt+oH7}Kx=7+`vkUI7Ee2$Bjki0aXk@e^I+0&H+iVa!q)b4U2Hhk
zVvB9!SR5XF^Uel*6CIAFD?tNKZ$Q7pPraB}<_lzk6DB9!@&tWHZ|(RZF?zyi`c(Mc
zB&N%X$L^F7X_XiC*3bDZ<(6wzi-rsCZ$8uVSCj3JBrz_qNXZsaS3VZ~2sel2Ts`9d
z<4aB1(d+tz=tZBpnjVB&AH^);jQfM+o4+>(WH;9zI%DjpBT*!-w$0b*$anQ~ZwCa1
z&`_VOo6d|3F80o~wbEXf_e}=~Rd4Q^lAAxmt{x^7n+UFSq7T>FW}dw_5y1uY;i0gb
zcPuTQqa6C6kNQb=eUuROv8aHspRRL#2AM~PO{%qypLYxh3m$hx|L}EM4e`0~{U|mn
zxDhY-y@UZ9LdPtt=i*x|KF6(nb-PxoyK?lFf&62_)OCipJmG4FPHyg1u#UO%HR9u&
zONUKVcw_RnxGFp0I?|dt%jq4k>tTPyps24Nm+khZmY*$8Eq%9nrM~C6K0?w<DVg&3
zdB=~n%N@th#^s0n-L~MeyxFI@m>5=7w_VK7C(z1%`j+2h2zg6~g$!AA5~<6)viU<&
zv4s5A*4EiKYp{RIAOH6sEAyWkfZ+}Qt6~3tdc4jX!mqGLU=`$*8U&*7f8H?^9_Sz9
z5C6~hKb^1(UQQSgPCzt?+7*tuq!p<u5s~txCb!B#tur)HBPjXNS=Sf;4BEx0(WY*T
zbu?Yb7HeKSa8>W~hURe6exJlD@ij%R?Wx)wy|#m!Cth_Wp4>ygiRe?B5rXc5$N83f
z_D<VPRG_MEChbK)1LYc8Z*>T>WT1BOnUKTJ?lqWd4mQ`U2ycyXic@S)Z?AOvzCZNV
zeV5rEA5xYt?C7m9-zWJo{Y_F$tf%bDjT>NM-yT<0jt0-`9V|{xl-nsj-aK`1MN&2R
z!K8;{OAo`?*#!!m-f9%Ysyy)Jyvx{@qnW<@KkvR00tGhm18p^Qpa=C8Pm^17{@&E{
z8S`XcX-F#Dhqj+z`;lGMtF$rGy~FixL2OVfJ<xYT=cgLPHTAV$_1NNwboA!iF)*Y0
z+J`|VYicWwj&&PriK*n~`YTmTmi_XW(&f+yv%D*MnO$<myLeyDzQqiWs!ddEo4l=0
zd2<^R-q8FlLIU%T5|wg$35BjQJ=Y>;m9{@~6FD-FN^*wpy7VbhkEsTsm-@4hO1;QW
zGVHtfQNCV(^dkRWDgt73?}C+l;T0W~ORyKOQ&rR@%T*W-vG#`}-(^c%<|sd)JTD8r
z1uNch=%V9NU+)sV1(z||f{Ww>U7RUW2Fd&=o?lQjt^c*l?yMiMHqkHSuU%U1`GfI8
z3SOwfdjS*y_?}HdR&PGBxo=<e%5)vw&-~WPNC}m&O5Q**V3%%%4L-IFPu!U=t`Qg7
zec<Ky%X>dHUfglP+pi|?uAJ7{u(IcL*ZXml!yQ{a8D^dJj2JP+LMd$6+Gknp_T;h;
z=Y9o7>ZPbNG-fJ0tcnyvgZ%a#hL+f!?K^k6_qBK$v;rG-odq<SI^>5B>OT0EbmZc%
z!%ww$KDeu0I0M|hCw0r>`Ebp=#>%E_T;*r+6tl-3IhSp19F6*v7YBFn4^_x*=P!P_
zvV@qs=25h=?P&P@dy0?Uw;=a^JUHcG?tbLuo}--y_0;$#9?n_!;-0X3Uq5qW%-%bY
z5iNGVHn`7MZ0*rgymYxu{lw}Q{mA<#2J963CLaGebR*HJNj76hcj5MaXBFO^r~f2f
zB=@^3;)u;EFX(O1BVKdAcn^>KLbR%WEy&j#H2-J$qnjJ>dq2~WzwOG^v}7IQGh=x-
z)Q7&;Mv~q{Vx+yFm^W+bau0odZ}x3^&vTLRP-|Std&HfOXTMe$RxJBxafN@}qrDI)
zT*$NROQpk2QMC?@L2Xk->cHbeCT9*?huRO_wf}NA7O{0M;>WUGioBO<TfwKx=NXI>
zcluQKzFtcz58TdFPz9PwlJ6eUR6q6bhNOhe;d6FAQ_Stp7i#LtpnDr`*j93}=M%@4
zui<}bzuQrEBJ6daN>$K?0RM(a15K%>R++jcrMZt^=d6uf71TrHKhQdV+}^ey?iSb9
zp7Z(JuAj+oKg?efj#V-laVnDJZ&kz_jXU7y3R5rN6@T#8cE0szLDO{?4L#|UK#Qvf
zep-Il@zA@D4ys*xyQuG|TZh)akbP^(x8<P?`>Xvwi<ix63!gsKKW-CJJ#sE2`dv@Z
zgj&ZF6^$u{7Q+b3WUnpCP82zl1t*6qnbTpIh8ta9_m*}TC)M*}+a1+y<K2#(wKg&O
z=%%zmf8c2)zrx|byp_%w=k!neyJ(v?b|Cj}`Z5q<ph8Tp{_x2#J#b=ttn%USvOO2K
zZ?XyOEM`ICcoC(s2(hedD;IQQ@po>Sg@hcQR+`bD{m^)6b3o5Qd(Ed=K6x*0?gxKg
z-8O#f5AgYVKRtAsoqzGmA3vn-a{q9Fk94(K(z+&7qZ9}Qt~ux*7<@9weSJnYQTL(s
z@RD@k&eMyx1Mf8ZJx+KY+Z+4`b<a{j|M9fw0@!+zId?n3_RJTe+ukkXiM5wsEGb-E
zxw=%l*US3dNjJN5PrPevzTbP5|MJd!G-mb8#{AH7JCDs<e}O+tvE;Qm{lvHL5C8O)
zUGADhhcT9Z;jSKva|u;9k#PtWo#=dF@p!2`veoO^Qmen!a%W3l@OLk<kEeW&>)G=f
z6_`(bs;bUPt5%zYG2f=9s&<?o*j{w)$=zkx7f~tuiQsp>>F{pkZCVP$GGYJb3*yc(
zT_zf~%C^Uj#CE^mMms-PSQ%6|YUg7>uY|uIG60?(Xn3@sFGQ5or2hEmKl7bsrFyLH
z)g|<?hT&nxc&3h&<rII{rJa`$PqL`NoB_*g%iD0s$A|Vg9B4DuyOW=&@!0(K_r{%l
zgH_c3TaF|JzF&Xsh7?^wWq&J#K-T|)w|zk_V=kS!fw>k2!dOJe#n1~uR}F)&`3H!;
zh!~LVr>yO)A!1@;kn`Xlh-d<0x&E*JSFiPh{J%QD`j<s-A+lS*VpF}i*dE9RSut^0
zF;OQ3<S$~I{@bzFZC!tfZ4lqMX|u!@NhxXY1NGY>8^pxLH*6H&v}q&QpCuLp?uTrY
z-6XgB;PK7!&KD&1Tvs@B=YIYc%@Z|kiY^1}y@xN}h?SI5Qr@AWx^KUhwvMimv5D!C
zqh^*Tt*mWq?M}J6xqEn?_42-S+5bvFU{G*actm6r_GWb4-S~vWq~w$bnGdrbWj}tB
z^P-@zsJNuGth~04T;I_6`b|@NM`u@e&%5^@28V`6M#sL4)0nKO@6$hKe$LKue*als
z;jRLI*VgS4gNXn4A^TUmWI?+&Y}_cmQDWULu?<n{hRbf;wEN&@x#P|f7p}|iIdo@>
z!ioF&HEoibhh5l;7jFzmDeX04?&GYR_8-gs*9?pOe`MKzH|+niiw4;$E(T7XxGV$-
z5z@>pc_+UfXBiMWNHX}%{42nmP=0v&GtRY&Z-j|kGffD0lAW;JxR#hmXrvtC`vMM^
zUe3`@`FeL%_AqV67Rl)+yvQNea}(Y}#_7i#--{p#aX`HYVtttBN?`i(<O$0c26!5O
z*ah8(G%jayiv_P+dd!>uA{(-MCeB9H{LTH)5G##cm9O1$G5Pw=z=FA8$lm~=D)>vz
zf*t*f4kAc=)B?C2NVMS_!Mdu~4w7iOB8b$|7jRp!t+_u9w!@ehAHsZX<aCFwN->_q
zrt&UK4l#uH8R&YpfRbj^e`uyNr=$~}1lzW(j^v$i0H3dLOFIG${<wlk)6>yW8FVq~
z-0>+Sdz7?Y1o_!P2!Zm&1=)l(?Nagbf3-&hk&)n_g=qoYViBahnY(}eYt@N3`3R0?
z8;ZP|N)0Tfh>0L=J`p^a0@kUs$(+6gcY!m-&`K3<*xx(_mjehMt)t7RJ2^dz+a_Ub
zr2>WncptACo8GZ#-$vO}GqgFAMNG2vVCQr8*Hr+Goz!!L4sV<#Xta!gjATQ(zTin;
zyf7QcKYFXaM6zkwv!&E6VNbnwyK%UCzDK&**~f0vCHtf}B`#TJ!*j=v2DwyT{9D%T
z%yl>TFxT!@HXQ0UNuF^moICUMXo*I&OP*Plj-!0G`@b9K3IuT^x+uqlG!f)(2_v8j
z87qQ}qsFNhz!}P};LDE?nEQB96T%TeDmXn+Q_5XLO&)9^QUsx)*!M+{^Eauq2G}@^
zM)4~X<d7s0`-71RWEoel5sCuN4LC+9#Ky*KAV}kU2Kfdh=v6<Dtb#cXI{MtC6o-(3
zGp@kmf2m(NcEniySf}TH3s0v}ql$M17;^7p<q>UJc47xo6&&z?E&xYaq$H#=UXBkN
z#qam|i9)jz-$KPcQMPidM35vS`i|hpwS@Ev{)th_LB1`3-b6~`?^YsRXNGvF2g4V5
zPh34Y##wnIi(_}l+mH-U{p8`WED@w<6s3%0Jc%x8g8|kES!_wSP(D)d5--iSM#1>D
zLsd9)oErGcZ}S~FRvGtSoCBs@UUTwNu^rAMg@b%p2-D!08^?omA!N#Tlf+Mf;@!Xx
zhbo0Netu(O=88w&4_@zZf6#N|iV-2==G4!A6~{^hnRDc5%jV5_IQ6UM*Nv{5%wU;Q
z?>8-6cjFlE&2;G+EPmyT)wgUdh_7&x(2dYgyigO>J%#R2-0FVnYzi`0Rcp~$1lgo8
zY-|u<H$r7RHfa*i(`z3?$a8jzAPw}k7@M|6Z%W1UbHPIG!ItY`C0Iv{D&RtIkA5l(
zcvv{KQ^bIW=Etl2e0;~pX+Q|Zc*5~l{}g<2stl(CpjeNQ7pY=D3G(LdEJBhJRS}_z
zrE*E^>Ae27{>?!Moe`t{#I+yERRHp>(2U*K?IaHrbR3-3*w@OgL}&pQiq6(|O=jPm
zGSFQ!O2#j|Xsqh>a1BOu%V(L)I_I73J`ivFsYu%`%Z{~MmE)LoDy4B$XXs&3LRBL8
z!6QRyv)y_Vj)hFgBA3Dgzcb5pjdf0tpO@+$%2VmPxcijbGo_;7@KH^uUu>2nKIAKY
z0LgYytMa1_ofbi2F@UMVfJCejb;q3Q4=OL1Z;``CSFn6_d%XdhVr-Zd5D}%=pK+Nz
z{VBhE7QeTZvp<(Ty1C{J50&x~zI$ldH*y(9hVd0VmV~m&Frnl)qfe=))AP((^r3!U
z03h$p(drloiz-bv{zm9fqRWI+q1a5jA)^X=`zH(LI-LDWXNQgZ9D9$~$I|V5?YmV`
zKkF59N{TfOm$?)Mj%J-&eNfC9E=s9RS}Wdz#SWBNr{sI@*fQKZbEmsUIw9Zrq6k9P
z&{R(vba><W(7<jaU&`&xCDr@w9_gy#ls6}KB#ow@J6V3>Y}rtEk8*58`EYDoCA_Y~
z^}|?%bgSmcQDa}^i4mMLp*YApA>W^wClw+`_u(jh%PJf*p(zosvqCO)P<F<-kNikC
z5+E?!v2iB`_B6kaU-7+k`jywx`M=Ho7Nt?Hp4ogaAd|qRb~LE|$Y$5E61xPJ&SM7Q
z$1sxo{evyWWk9k_S4L5(T?MLl3Z{hB>M(Om^xynM-_!e11?$*e(=qSR+~92Qgo2<D
z!&vi@e>=^;g25XUfObj(0f*X<C=?U+QUwa`zmX}Sl)Y$htVEEGC7NSD1IAZuw8lu%
zQRmR4Eo2lCCT4^gpXpqS#+h@J?~r}vI|-0nZsLf!3d{bOuVt56V#YoSxrI$izQXAu
zCK@lI{WvDMOr#7DWo!H+rTzuI6+Z24p!7TI+v2a6?xuYo^CcVkX0=ZAc%4sK7FoeH
zqef4;C7)hqSr;i9y6=7Qb!t`l->{4b1R_&W%a@S#rb0z}jVTDC>NlwWBPvx4A{6*H
zIm(?Rg|AqYpCCsB2_(iLP>D+`Jd`ZoV-TT4lnzFX(=yUaoL>0M(Pb$5I9rovyK;G<
zYV$J3yc!0(OG?h7P0ett$Z_SG&Mj^K8z@&+`W%`aB7&fa@oV~ZK~z<PmHzRP&a$1R
zCKMH5@g^gE+mzSPOb|<zn9yR+5J6OcDduX*?uOS5Idr!M_G((ah0}@qm3znZjV40S
zCrGY(Y`AcnXOtyR>EV?V9o)37Zdv-uLIuaqwwk*_uLcxPWfZ4C1Z)8j`TYUVEc}$q
z{UL&kjtibw%mx1q;F^md5A0b>JTuG$NC|ubxUtn`rQ*MHxz&IKTaWtyboLB<7ik+c
zkQ7Um#3OKsSAt|JqrlI&h7Io~C2Bx{62?4y$4?PNoyy3L*0|4_aN3HUu^tPz$H)go
zYo*|v3TA1`COYM3XYrb2WFES-$~fkP8qr$>Npu83N1r;YN_Zt9+)Z7W7ePM#ASTg&
zKcs)53qOhrrMyItDn$`wV_r4&r#J%&gfH>sdWH8(cn|_hP56n*{}uRO9Hxmnj^W<t
zG<KsDaV~6nCrm0B0Ua@dD`KBwF&$=!ZB9fHL^1*<2?TVQJ<4TPFT{j(qIUAN7y{VT
z!=|p`0c?d0Fu*o!C&V>Txs+D+>OZ7&P6-Ea&Oi;f$hgYP*wrOITA^a-!Kk6}`CR9m
zRb4aJnS!H(S*H$FIO4mjyn~JR`VCb$1vyu^q_Kuv(w;B8dA?{CdhL3JYQ!Z+@`WQj
z@9-I{4Lg&25*+4WLK^~34I}`%r=dJW{Kv>hzny{#a|^a*2hESjM!k*RmiinA<B<6B
zD6Tk%luA+k8Lg50B52L}KBHeApt6)R_^#D~W-(dy2~{lk4s5m;B_YPfi~5?hf@JhB
zB=X?+PqfPDnYY!q2~tA2d08yf2JqD=V!$N=g=#HMiGCWR#_;Wdcqy;t`M}E>YJ(B2
z0q!GLjh^FsF)^jSUu{(1AI@ZwYN-6Do_xhrVZR8%>=!om3METWgh9v)s)kUDe;Kr;
zbr!?fiB;wX0@ekNz(7a=)<C%PTp*!7Zv_k_9>8^`;KZ$7kSpP00C3VbhJ5B9E`}#v
zu`lEVB;w)1&#L4}-H{d_h7L}>ByduCsKvir*gX5dueW&icksc#W_KTX5DCheiqtGD
z+j`7d@Jz<$Y^+RXHfp~*$}L6zD$$u@rF^J~(9+@DS$)I!Cj;G{<#dBu*E0=V?cU}_
zYL&1)k`>?C?Ges-SmAPxofiDDjMr37>|XTx5H46dlchTOzHP+sdyy+Z&NDsYnB7vG
z^U(8qNva6qPZ*Em7Jx>+5DrC>$o&gUwi-93o-MH;16s23n!T^F2NfrRK=Fn?fbLWm
z7(9`DMJc#aKT{h=VA8kSFyFK|!{W^iU`a&-kNQ@~?9tmfnde@nmq=r7{uis6n1}?O
zVnTD>DI`n&n^SDMSWQjdC8tmWxmTcaQR4XP^6$%PkGHPu3Uk%^@HB5xVPZP1a3O0s
z&GjKEu*+mJ*#tIri2?f;A40P*fDVQ)Z!eVWVNZY?9)S-+I1hsFoD>7%7_cw7+zL+h
z9b5#Xdfb)a6yI`*LEQoPFr0|S{adk(?S5LH`4X3^fUwTVoudm^Bhn@O29_#U95mhP
z>W7VZ6}OtPSDc2a<2{{pb-rgY|MZCP7|wi*qT>4%=RC3(z%Vhgetd&rV;Lkf?U^dO
z;^%1XH@*(1zf*nxFV>hhVt1l5Wz@KH)NsX_u&*_~I?*~-t%Ja*MEy>M13CeM#}bPm
zORk*fsFoEM*fskVZM6tc7%s9K%Xh5H_sn`-4n(<rxu~h)xV_r3Q1fWF?hTVHTM?wc
zX{0O+qry|ID_RVy=QvlqUkvPbjCQ?COT|?zt?|3TX-?t8LH1tF3gF92Go89oiTF)G
zBfGhq(Ab|=#lOIwU}5ftLp87xU60n3*T+ywkM&IsDhC|v^2S#%zeTFUy8K|DaSlaR
z<w0Y1B|zNCLE~JjMCYNaFE0p=R^LWmCH_594eVVKkVKG9j36SLuTXnWf^QZjl=K%t
zY=;E5ksNSs1)zjcMud(M0^<>CIUV^$Tm;!p=SGMiGhryg*bH$|o#(^%1T1)$@$X;|
zz8)LH3P`1>;KG_Y+ejQ3%Q}%Q(Tz9^++-emKNx6!5SmN~Jm!={kz;GMN%t8nw>r6-
z?SI2?N~NuK(i?&h>I2J5iSPDtu6{?}Om1Ud;$Mx_oO>M+5abz-_DI{eHz)RC;0N6s
zWKAt)mote*EZX(_)Xr9I#<h`ykDr7xPdv?6={xQhHXX6&`}5qpN6Y&kWtnBBiy;5D
z0GZWE1omkRptY}>uTa9ZU`J;1w&34^OQC3NA%Yi;a|GjW95sLx?<YNI3{N6#6<UDU
zad?@U=w~!?6{p1vQKh?cwC;q{lb%`Bupd>G7LCvot8hWA$KhoLuD-+Z{Y(3BVjMN0
zOD<0hOulXt+U$`lFQ|-6@|GpsZqRlA3KIEZtk?Vq25H=hfr@YOhjw`(j#{xopiFuq
zSrCj&bM-=-g;V7Qg_C_+D3$k@SO$>;^&m=z9ta-eJc<1JPHjO~`#E%)B-{qVTduwc
z(hM3r3AKW;Z;bKB0|Z|&LUy#a_-gOJ0&m4M>B}FX9Xyq~FiQ=^2tr6SDN6Bk=xu~1
z;e#vT5`lLB3a)k`*^@69NfAM+A5#$c9`MqsK~C~~B!}8r2(%Vq$C#0EQ%h2SRyS%B
zAqYit;$GogdBVh`m?d@yP(70>6pq%yHx8PKsAs{q0ofjaJA2-}^)uf-rZVVyu%GO}
z(!yfMOGZ#3hYm!N)?wL7jVpqPAYC<tHdn&hA#L%cH9ZiH|9vMI!-7C;UUsHpL=e7@
zGG7s7V+DI|LqSk?)Dud9Au&J%Vb6#lzTo=aI8;rY{}a_tW!uBXd*0$Tf%`1N2L35f
z>d-6Y`_v8>0C8LbpiNxtdI{<UcGbanN%m3)5pY_O?%#pC#o1p4J~+XiZ5qdiF;Oyn
z%UqYtE{gK!7ENY|UPn{$lMBqn{j{N3OdPV#^e#nWu4YcC0i^S+v7fMfi2}Zb5><s7
zgiNQZE*B3OIb3oXty-BiBBf}Ua;!;OM7W``UF8Z9*7}BnuP>MgYBDLQ_&S^8pH7Cr
zteveC#MlxYq`#*y>+$Cbs9J0s0_jx>7&mG>=_|jLyWwBZ42d3iiYZLh9A(<GB+Wlf
zWkc`ccLWRf1IoPf!ZB){8j$*yvVj97@;w<<(p}!3;Oz7>trJ=G?3YxI6Kj8dez3;g
zA!7$02LybystPkz_l9TocZq+@Y2E){r1J<O2<r=#QAQQ0D2S~JaNvb(0#lraUhAuT
z;Vl?q1Ocd|VL}H;<O5J^+Vn;dL}-vIlmbcL!2Pob9q@wPgzul3v3rvEvQxzE_z8+J
z+cPa(9o0@bz{Y^M$0((<T}kqQQx`=E3@FzNv2h&;ZT2LX)1d+>yV<{53nzVNy(beZ
zzDx#1H{?cyT`2Yb>V3{$`Zhje>@BRpW}P+2MXj?2mgzcckn#fK);m;v$-*kfW-7gZ
z;Tvlymac>oXUDOk7Wz4BBFGdezMhl<+swD(486tgia~S31kb^N7N9#+MUXonL^N{o
z9Eg|sQQq-+ja`E)dB<25Pli^g(4hO@W}~a(M}{Hd^R)G~>WO5#(bvq0>v2=Dgl9!q
z{!$96FJa(FIe^Ra-FlSuCXhWv5Qmi82++Cy)$54=cASb%z9kQI_O{b2aPV9+i@QXS
z`mqI_m)LBkX9qHkzA3w-m$wN=U|)%kVJ}V5GYA0$4X}~_1A$SXfuLrC&rcLV>QJ#)
zaj??lO0tDUzX!)2?F~)-(K_g32@#-QP%PMqi6RJQo}k!+B;pN(Jxz^w61%+M#e@9a
zykoc^_G-!m<4*0U2VA<R8jY-1PF}tXlsDadrjg^MVVdxrojUF{Z_>-2)*7x1C0D;H
z75@u5Vca9?k6z@jDZ*;pfDM0Vy$E6gQtl`)K(=UrX@iO2-Y<kU?iyK8WDaLy(kS`>
zj6uHx>L1mMaqLMi#7^K6GqQdPsth!H3Gyks`KXbV7$;BQ4Fj3nv?(!hfecR;$_<34
zYud5Xi}SfWm>g<Ob!9m^A=^Jc24p*1|AJB=G50@Gpv`}zK=bNH1V%RU&wVG}ser}n
z;FYXVUA4uhNkBk9nBd{)V1g%yTu<<hPs7;#Bqi$4ztmt1e=p%8Mi}?uZymRUtpMAJ
z#K6au8K_iT3}?OrCS8viSL3b#bs$G!U>5awmiUij8oM~F9fULz+mRpvDEn}<l92(h
zWKe%=ac3RD9c0qeU~-j4t@7tam-t9|pjIeYE*ZY?s+s(Y%A^tMss##|iS?+c1O~MA
zHT3h{b0|1^3Al9$*QRe>!qr@ta5bF&-A+*W{MSH9{4e6F_$Yx%LKqMR{@{n{fUU5T
zE|AEYBgi0N*v7Y{2^9MwKMW}06!1KGrVB`R?J_Yo+LNz|y1*mQs5JzzDya@Qfl>FW
zpq9?F2M2Lc1kvN^cJMTLdLKuSWXe&&?d4n5;=qznYBxK2qMNGJ62KtD(i{5YTU6Lx
zh{L!bpqT^007a3VolIbmNUsvRWl$a0aCX4-6efuvXRNM-N`}evV_2yc$If!D6oeFh
z1#;H<?J|1o?S0e;133^t*vs7qSiJx&S&@m%bRQP<E+4}QmM%4S=L~gB?yai_zC!u?
zy-6A7$cPf4ea5Qg714zqT3z<;naT(zTuopnr#Nh!+luZq^)7BKy(!T~1hgWH*f&6x
zaSY0fCbU`Nb!pfMrg92iaom!V%1vzKRQ1PV%ze?!c{NfpWqVeO@`d6cx*Y#t2-Eiv
z0RlX5P`!yeR*puP4cBY)PYU(7&ms$!J@lYfj;&UXy4a^OryJ@#&wWu%-C2@ockBd|
z=LFk3Tl&fPpnI_OsR9>I-K_nuI8UEGQdV|SKW=gJU^u0)Y?Sb6YG?DNO{z~nbI%5c
zKlL>Ng-{bCCt&|~Z<C_RuXa2vq0MfkO1?ANGoGAxt38Ll=yfZs_?rzgWW$t|&o{MB
zrAcTV5YMpi>{K4IVi6VQP%<rO#zYkwRo9r|rOEy?8K@ldjkF-k+zYs}sZ5VLo|SMM
zzc-Z`l`PzaZG!Kd6b>Q|utg9eLNk~yi*qf<mbf(nLpK@PEifiERj4+2dF0o!6BIby
zgI6BK!Gf5aP4U0tOM9h!$FhYZFlZI!h;UK_sa0-+DPzHiyc93o5lpC6Ui`!H^kR>_
z?Wdu+se&h{PJ}sdlaniy>|(;i=bvef5dxqoC@GvBD}qf;q!{3AUJWeg5zY>ik6E(5
zxp0i=jk&i<5d=XI;UXac3B?}<11Q3L9}}8JX#ADQ6hSgb^{eR!MZW7RFm(3=I?rC<
zoER3GTnRy=&}5dTIvGr@Z(ZTWj*1}ilx?`995}Zz;UzgH)Svw)Fl^2R2=f|SPWKF^
zQy<QF={_jay^$Fdq-(6{kr`{J8=7Z(;yO3jB+D!W?YQ&xP>GcDU7eC0?ecd+bWD3)
z(ob91=$K6F3>SG+2DmtUa;&oZ;25E$wcORim-p;wN~jL0L?atK`3m+vk9ylwQSp__
z6HBzI?(NuUn0z2*-(ZVV5#J}XBibXk_$LS|a+NR1H39N-IWenu@Le|#H{Qv0B@fga
z7%s!uj!38$UsG-p{{_ZA6LaOyKTWX?ejylcJ?CXv5wH$OOU~5S;3Q_?R3ff`sb#&4
znrjO+X)dS$u(zZ(4;f9sx8@zkX_=HgpF`;j!KhS&?106QHk&K6=!wW3AaKy3*f>u?
zHI+evNv*82x!7U?h*$ToGmsgi#-4={wms`vBG-bzZ5+9J0VB^p2*iWSRmQMtQ>FK0
zEbY~pxp7&bP;-b}<<|**I}Q(Sx8gV9+h7f=-AnPL9aIc`E6xcx%G-?}Bq`!|VzWTf
zP)1<88pN1eO!X&HH-l_^X$zW|kIm&ajN(kI3^Q5knvT>Etxi=$MgH80uj6yHkd?b*
zO0_>q2U9w_T;pG56(x0)D2$j5b$NEXJ!oB8L|;Ug*JK;%elBHaY3lDuS1inZTJgc*
z&BZ`VX)n6hh1r3U6M6gi_e-SQW}L=<zrLus^4d5hv`$bm7S3x$og6i)LiDoYS
z?*x)RD@00NZpw$f_-(Opbd>gsD3MGi$YO8Ac-rQ;4I2Tt<4i%+$p^nC6-L5i9NS82
zNk5kU1&E?xFvOur+*Y<PDXo$shdT!h?dwS*FK$C^=F77mQVw(4X_K@$WlytVRl8>$
zj;i=<#XAqUAt<kbkitKEYF;(BjA6plmEHONz&yva6Iq|V7{-$U;I6~StNr6kOw(@o
zc46PD-$B3-q0J8$WC*os<2ZXkJUGVHF!JUqCPw2uTaV=#Ywll+O|{{W-jeP-!XF-5
zFb@a|DFS+!epQU|&|ixNEtcrgAn2rZOMZEWTgTu_?)PW9dBG1A^PY~_UYJJ&hFKeK
zJA6Ge<?>MR&$6E!g}Kp!D@A>d>9&LA6X}2VNq&-kP@?}eK&9JI&V^8k`X3P(`9Ffx
zcRG6!IX)&#F$Ez<1W5&{X>73wvV7TdM&ph0f+mv=Xs2iM<@?=%3SNlK)S@NxO6pXp
z_mEHugquxi!!2PUFVSmHw+B7)ZoC`5l%9>+wu};+45B5Z%8=4loH9%(hD{v5Dy4K-
z`+a5wx_akH8+zx3S2e3OM`-ohOzmpcMYMA}>-vcHK*ox0rJ=mMmuvdWk--PC537`n
zrfg)Bg36D0htIKsy?zfAxyoBgN@b}6L4r&2C!Q7yzJ&eh>wX%bEo=hiD+CVA?FxPf
z%Bbx7pcc9Y)&4ik_ssk+$Tms?=P^_TPTMmSc{)2yC^IPp1DL8gvV_nECooE2WOxz<
z!nYOFf--SG%>|w!v=BT&?83iQt?rP~YFo2el;@zPsv1GqjPmEmvtY5y7U;HTI~}~I
zakrvEZu!6LeiXT7ivTQB{W}#AaIWJo3*Hcdl_I7+2ol0BjQD|RkIQk~6Ae#TS{tkq
zeoT6tPMS%6cb@<=LIMUL;5Q6|SIj&tc#6F2v=z99F-xddUR>pPI-$?VwmeiNc98(B
z>^6GBT9~iXG_B{>6g>Hv%;)Ei0{gg)K`L-m1F4R>D9!-OnfA<a&QiQlzZ~Glv*W;%
z_!3hf4Z<I-eKjcDg);>1Grk=sQ<VVm9@Q&Z<>=_ss+0G_cQ@A!hZ|sg+0`k0)3QJW
zn3^QvY#3BI;8v&VV~O9*;38pg8dJ@WYrUn40aLCG99S}CGtNWs0<Sy?d>T95IDljt
z*9}t?gx%B~Mh5slmbC^KiqHpuTP#Y}b8M`^f#ZdAG)<$Za@H&|y*M~BXa7^rgWzGu
zK!Loz3=|B8a40ad136plMdyED6cpQ@^!Htfpy)97HJIYZpvtn=3H8P|BFNJe0?ieP
zCJJUo)}H(WgW#;eOsrVgErM{*!`LqUf<RF1S&R`uQ2+I=e_%U>y@UoN$hCtFF0mdx
zVcx1ve8X1GFn4C6rD2<Km)TBT7zMDfXtLz+nQv9X4`kD0)BE|_m=szh^_KedSUXg1
zE_$@lpF!G=)$$x(K||HP@hyQMcJo4Tf#3<~e?f$+U^@5(TxWtr-eI2S!aYI|;mTj|
zl)`>qR5rKzm7qePweSpce@4jb{euW@bwGX_aKQgI(s8Zchxe{8>vGM*7SpvxZ+h$r
z&wgWF<uy<geI^d=Zq^VLS8;TZ+9l<RV;AWKPQsorr(71t;<d+eO3Z4DoEZNTG2nl)
zZ+0uN8I(X)7u+jRz^Xl4q6^GSiQd&6$elibcz2W5LN<Gbg<33QVYbeVdCO25H<z%g
zC4@bbvcN2BNgV5kCM^=AEm<g7pr7JksCf_*r2zO%vV5IvALt-f8(wGpqD{{2%iHrN
zRxfGv2kpm)=}S4r_mx<merv-Gw4~nJheT{NJz%S<!^&{GtRcohacu;TC>CDdf~XqQ
z08}hVo!nf#1DYgm3rw*4+hNK^x1g#}=|G_rSo+Vj8bs^~0#o^L8TS^w=Rw|jUjrkX
zZax^DJi#n(X?-+BU#zOCtg2dASY(jv$gz58AD_U2SQ?GCWmMZ>k&*18eLi9DSyqbi
z*{O7Q*%P<LON5J7?qkZTM|O<nS!lae?WAt4ld4IpKBD{NMOUxPwkwZh2ExviPdjdM
zVwKpRlJU5a?hxGWHz9bYd!4Lnbw;TFRj1LNw*AbxK|V!LhKsq*?hr~z@(y8t^mZWj
zb25tZL0Wc<z$i*D04}v_$aA97>-*3?R6!w?@nG#rvcGa7UpgO~-6bEiR6aT0&{#T4
z+xPpY_ntY`#osn1vtzC{GeS*ZfljtbK{AN+?V)^Q7E}@&3kS5dI;qmY*hvmFAp*X|
z$$zSU8+a~VDx**dZ`?4%Klal18<uD{xL|G@p?-+&z&Z5=TVbEmJ#&h)Kf46&VcJk5
zkpEXj4?hn3eI3QNqw|np*8&}Ub`+rFuZv%(F(kCO2MxsU;OpKkb9g_}Jo9k+$fv5K
z*4wg<TDyHa>;6n5?Zr%h>M5P>YZWf;`)-v#EO9*O_9kZ1>qOxtXSd8;`<$nmn_O<(
z2O|TR@MlH<pnd?lJ?Qv(BFGr%<jaJ%v(#xN6zJ!mS~sC7f_ROwjMv@%uPG`K$pw`f
z@VW%im<5c@dJV-Hl)%;xRR<B&B3JV8nrAdY;W-KnGYP`})yV^bN+cs!c$Dqb15bl(
z;74)fxwSCqYI9#UrHApTt~5|JpdHjywW|4wD3(NG{Sq_?rH0rF3e&GR>VsHb4@G)}
zj$#X`&%a{ryT<-_2Pz>B4YnA7+zu-2^!%aBR|s$h3JUpPEiTqC7`CZJodJXK9UBBx
z9B9}U>RnI(u;VDVL&fH0;C%VRLMh|TWMgnxClvr!PDiJPnh$iFePwWyPY_+&+#@0;
zw8!DI;n>NVk<b?V0!S2{-AyNZ3ihRfoD)P(aJ)8x$Tk}^YF);Y#-YayjeJ_sLaB6K
zc<`#9+K8tgd+}2l8_Xl@OV0(GvA&>!(3L;yz<!;!!~U7}Bc7*6j4h5vs42y=&jy>0
zUc}UW`)aeF%GU=AXBdGBbwLc|7y5!1)HPQ!p}pt75siUd;aeC`+fUOIL5hyqxUF?N
zKZtRAF<h{Z=KR1V)w?*;`}waXV{Ph@_eVa!n;!b4WeyY%4J7gpHZCfPAa)l8RB+n9
zg4du=UGt7$AY_4~9C@Y|UlGndkAewDL=YG?7=+?4(n2YX@I+vi**}iySY$XCt~XBD
zUoVBn+bs0+jD??Ou@r6LaE;j#fB*cHQX^vL0F|kN*_O;dVZ4L~^(Z2^92`J?VY;qI
z`IFZ5s8f+G672|0&^wQVJM9IJp^FFEVGL-yO`YI2l>s^^`!4(~mFCLQTHH?A?L(w#
z2_-@4Rq`Om6$^sS89_ENF4y(|78__^sFjhJlu%Zgcf18Ih`VTREw<**dC}hE4jkiL
zvLDf}GB!Bu0t8Jp%h)(l8xnqZ3%mcc)2dp(Wi1x9+hNup-@o_Xxe2G8_xiUI6-wWI
zEctCO+!9oJZVt7P8h9+XqHJj&AqmVs(rTTq>-8CBE0fq({lGqt{8A_V+Qa5`lfhBh
zjbmZ9efzqmN4h#+{{HZbUJX+se9z+BgIOkF3NElQ*5wC+6DQHnC5`jAvDwvpU)UEe
zoCk8jjqBw_PjDQ_bHf-(r@sMPKqhsnKm?hPys<i^!Ffn!D{m4(8ey3*wlCqYo;OL5
zXWAu#2rgoP{reOMZ4n~K_o`q5H=M34f~<h0?8gt?6!3ouZH9!IJtXeKej3O}Zm!^q
zJ*P>0xf$UNP#O(7EqIAsC3(*ILvU6+YvCu9M$0YEJSZe7^UQEJ084gU4{8fyyKn%x
zDZ;>X%5!0H3Q3c1$Lq04i;`2w@!TG>^p%V}28I^E^&d4q%z|&7ht(P0!T8_0UJYbB
z42RR@=C?Z)O!)eE`DH{275hQWTu9X9f*#WqBpg5qOmum|6bK;xue*@UbKuf+CH(x2
zJP+l2kk;2Mc*7|5=X2;i>SgNq5{vkDo<y_hqO0Qd0SVCcJC|serdASrkP7oiX`u{|
z%0S)bBi7&xGdu*oZw;Pc1QiFWr!b4+tPM3l7#BCdKVFVDBX-{EC`IFU4^NQ?mn)=w
z&O*n?&i`yu0143pD%<%d-W+UgDgf)D?*aAdpE+WH9y@^h46xwH@w#DR;n0j{l7oC%
z4!xUNOUl^KF>J+D;=ht!eHF^YD0>Q?hWBF4Pf`py&(d<PRK1jmvKi<!!<U1XjA4NZ
z0%1`w1s%6%^~4Dc`O>v)cpEGOCP&f7DzgA^QQT~Uw|jCLQrV{52ZF2?&*#7g)D}Y9
zHN-xk`W$(vr;DnAy%va99yCgKbpq0he|jfWRCO`?ps2wJ;?J-yKRI?u23fCP8kp$N
z(BKe3hFh0;hP{+R_BHJ^Q(Vr8SZOFLd4IETjyskU)B7bj(jz)O*CBKf9TOB1C1#(r
z=YAL{UC%I=+DTg{uzJj>+k6%FL|a32!kcm$*8VL*dB9wqnd|G?Nt?`Pq2z)+Xwbx~
z;fuO5mYo`!BlW(7b1f+&p~fK?*I{FyzlQK?!+?9fn3>De#a<KslJk+G5`Ea=9}mw@
zhn~GbC((p|a?dsyJ{4|!05%kScvO4i?(C=F;b}+1>K`RN(D7qJDI3meTMqmV&Yy=C
zcB+Qn&&Qzz&xFR~U$Fr~$yN@l7p#<^>QFuX2_!P=j=tfQ5dxzuXQU1wxO1rOpoWe&
zAg6p%KLwUu8gu$Gh!ufmt$i8IV2Yx(0K(lSO+gVtjDJm#XCAeF1~boC1cJW{$Y#5C
zdzaj$GIHq}`sVJ%fG4XY&bWF5kO0Pi)3qMH;$hUFev4&5A-XYJ)nKwBAMG8|P^3;C
zqd67?KN#;@Yqi=S1xRpbK*^(Ph}HkaiA8Tm-N7RTV60>2QPE)KlkWn=v;TlH7eWe)
z?g@DNY`-9p8Unq&eLiO^9fmjH%L9Zm^}`?lsNps@J$nKtUR@zBpsQ?LhKvNZvvm_B
zD%gv28_BFi23D_~Tv}X5r$n{?lw61O3xsN#T(F8$=qRLv=z5(B)?5XLKJbK8*>Iu;
zp)D9MACuUzR`Pa93O^8yZ)EdWC27WRwf8LgIIk10!-f;#d@E)JCjQyx+LYdI<SSG{
z%f)a9N!*2(biO|6ZH{4jm`qQixq2D?8!e%zIM@#{b~xgaYWdRwXRX98mPL7yOOdv~
zC8umbtw^PRe#gVV2Od6zHI^xF9IF3y;71zDNZCY7I)PjiWHaxsj>PQcc3fEOPpJ?=
z41?-TRrDt2slZhtO;33L7!TweOfdV1^&%|y>fL5<F5>-zyDp&MC2?p@)Pg@4R=HPz
z05*a9kDvyxG_SE2n4Q9mOZXu6aOgdlG!f)gR4PtBAMmjj<bufoiYAY`XT=0_q9vAr
z@J$(zYR{e|Uico!ZCHz6`BnOe{TqzWo<uO2<-+PryY1<5B8XT7T^<Jp-}vUD4)(1M
z)K-KJzISp9XP7RrWbRppjxsAQn4%NCqdJ{pqm^ok1G%FbTSv+2u1rk&$`*4yv*1Z}
zjzkLX`pa{3FeoK`kP1QsZ4YV_PPqy7g)rrmio3+oYLE0Lrmo09yHR2|c{ZJ?4BovR
zW3eFI4^4w931K+Q&_YpyF=s0WE6woi{DwX?Xz0=Ipv&u2`W-|hYc8tWmhAYj<D<#8
z!5|%b#XA{@N18PBsb@0B`(<1e{fbo2^qlED|6R~3bwA?}N&kYW%I^;uhnmUfincH~
zwQoN?%~u<anE!-4+kU1*Wu%kgztvy6Nj{A<g!~@>4y0RgI77q`&lLap#tO(;x_EkB
zbm3V-8w~8D`m8~D=BL*PTqRsZL4;#S_L=H!xRZbu@Ax>rm%1Yyvjga5!aM27bh0p;
z)60yH?};~;h`CS(i>1nh*rec`d8T0J^s1>zR5Z1NqyhZ+Fwu$MGeWILC;`fC6je4M
zY5CM2C}8SHAcIFF?|t$Kaa8z8HbD5A(6*1tcybR9UO@5Pm|_QeHMSu1sN;UOyphw!
zr>)fva=)A!2s&l_YUJlgzQ>`oU~9KU?>yD7%R19uJoCYq%B(eVos?{aQjxu^l<_sc
ze%TX+KJnh}IwNs2CAtqsoPr$lylY}BkRPi7Ln=S3$_eaB12IWhW6GD81Cs<$Wclq;
zj|6+tQbmxpPCEPaDZP6?ksj0wR5|RQvp^LO#g}Bm(hY?w_ESrMb_rj>hZT?{-1U<N
zOK7oV<(YO+C09t}aUIa-fMb7mj#E<0ZvRSj(^z+i64wFUiBRTP<UPxJ9&B<wu=-}%
z#GEQ&c9nm425gW=VAdPtCBcpt5#;%ADoD&3Pr=K%g(PQ#M1rsQM+EU0<I7*9;!4H;
zgiVoBglhPYlFVde5C+iePay;(@vqX_v4@;8@J67=UB~TnQbs0PKbqX-wAGYI^<RR%
zY@x}`N9&=bi`43hWl;8}FT@BzKlnUQYo+ow69(H+l2p!qurGncXH*O#(J&g6p|vm$
zhTGkO6!Tj=D}E?QIHcdwN*1iZ|J=pScrt}dN6CQ|n(FpwVyZEW2xHs(1~{?1h)FFG
zRe|0X^rUJs0g7I$dX|d0xxXl)_-AzMvQj^~c(NB-P3Qm%$6!avgZ}l(sJ!NSMR_Lu
z9~fwQ2S(UNT7H$?j^W#ZlEsV&azq!&Yy{iU``6CeGfjni$M|v}p?|6fZnlDX8==-Q
znRg0znM2K>GD@gyM|Df4Q;M<rq24x6XF{7Zeh0RQ^tK64EOjUd^dQxRjK!O)vF91o
z<d%IbpC|FF#1>Od9E*@FJP<_LGZu=_^eZC93%3FG%)~ErHWv=;%NFddTc<^x?*w^-
zzf-P$D?5$-OGXc{U(VWjS4o)e{dsYBtJ(6`5Es>=@F|^k=ck#GRmnwHMq#p^P}SUy
z?PXpu7t^w)%q^m#AKwfM`;Z#f>DqBIra>tI+J=$kBLsyZ)o?-pIGx$4LW}X)LEo-Y
zU%nD2kuRUi%BoKT+wiHO8NUhQ_}-=kk6JH62pEXacHdMdcs>q)Vq==d`gnTGl8z-M
zKpsRzIZ_5eL1;t>Uawjw3YGlsvz<Udy^CHzVtAfhIA4bzN0sD@Ad931pLI#5-9|v2
zCitHvTqgM6%qih!GfNika9rDvWE5Y4$Q|Vn7wS1njMa1;l7&h_k?(S0ij<vmLM32`
zxjJY_I8Oq0_upAoztv%u9glMpsth}(a{}5`N2*DszGyRz{3!M+3m&WbNhXwjX4w4L
z+IyZcf4iU<za5*&lIiJzO$2l!>r3Jy7M0sfP*5Wj47UwXWOsBCp!MmhOWn$TrenrP
zWt<F~)FDvr5%1=ppD7PjlHc<spZj1>b+jpU(;R$U#_C&7MX390lUW5j-aDeUPH?mD
zx}7U7(y;w=wEKWy5#e=eBqq=9#vnJjte<v5qutc~roXned$q9?;#1DU663{~iu9x9
z;U%Vyibtv?Kz?f>lm*n<o*8`(hbJL5_Hf41Tl8)+DoE*790L}R#U#qWl2ymLEW{mw
zsKPISqbLc=&g!@p=a-H`wd^*6RJhk&5AK-v{Cv)Y-`wg`!v9xg!xavyi+USXpPhz3
z7zmR^C<!Mi2Y{pOKaQM$j$?foh7`V^0HN%{ig)5SvKy0(d{HdZ#LFC1GVUO>TSgDe
zRuq;i4AAW*-3>6=Y%8`kv8w0Rb3TI9>Fs9rt>41jc#B8;5$AMW><6e;r-%2M_UYF&
z;P1r}@26@NJB@;8j=nr5$2QWND);DOaTEf_x<^v-K859(x`fR<+eQvMZJ`zAK{+OE
zFJ><z<$@kRRa=lX{pFnb^{k-tvrP6`oY9<1hie)%p0baxCrGFI6XpGa2xKR|QXKaI
zaKap21ccd8fLpwOz*(`SsPm76_?xRS1NDnr*rWL@z#Psw+?LZ&8_}oMjfW58l>utI
za&$wL32|R<2Q&4r(J_Hj+$xy0I#4srU~QOjG!w;FEaq_&fpjKZj_(SR?-2Y>ur}Hv
z&w^)ARg95}bI*1IMAkcsHa3b;on{nBISK@{N1Nx&jbo2>Q6xALab(oS*`bE&{7K6a
z`_dkexS(ii?W8)epNz-4QVk|`GzqRZuzRX1jG4W-#3V`pRPG(1|2h7XhIk$5kV|bO
z_VEDrRu`b6s|L%u#F}^70Kj`_q(p<M0pnwE4Qw;BI_=k0qbl)#$YU4S{jEW;2IYmn
zoPJg|U$%e=k4Ii5flUTkg6A!=ytBZ(J8=I6{u7eiAB#W+R}^BwcD`F+z?Ns%FJ-`g
z3>iC5`MTPhCDbu<VcP*ZsOQ9?V_4MfJ~tbpFVjXzCC`$<E)sM1Q}6A%i>19vOI$+6
zUKUv!San?pNbKmRU6|6=%`8&+kafzaz<l>91<eyiBQKcAvE=X58%u{e-uF$q-sp7c
z8cQv6FB`^B2D@wN4id*#VQdG~wmH?-hMG#@IEw9T5b6Wp7=%}<8^Tc0#2v?(KXDE(
zfb3HoL^dbXNoE$jn;;*>KULc_*!9$@JZN=t6HoE`r^(GBjziYN<mS(Wwj+FV5<AWQ
zCht6<?f6P=--bC!<D2zV+E{l}arTtJM6cCaIMo44SpFlqe8jL(h~saeW?3<jX|PSW
zbb9D&%b|!(?5DKixH5cTAbmTM@d{MK2=PJ%Ad?#lwtyn#fT_-=>h}JGNzG;;FN7zH
z<#DaX5Ei(9ng?4{UI6e#SRJtqq@x|&901CuPr<tS(6%6<C9$k4;EReeY+k1ZoPClm
z-*S*;U}kaA*Jd@9dlQV8OEgo4XFHsTP#9XVfE@>#*?-!fA#1>A9bGl1Z-dGG<Tho|
z6O31CbAI$0g8+?k3VS|(pLXsmt*3l;?Eb`8C#k~foC3##{|8-P8rD>@bxpUehzL;`
z1tHxkA|k{AL=cl!TTu`qvj!opG(w1&MnIy3oU2SSNh=^%rYKXGBeNhRGPiI6nIR}4
ziHc0-L?s-_;8))7-~0CahleMGlbkwLyY^md?N#lCfI}vH>bNwV5ST*}lnM8o@mZ17
zRk@Fz@6FT-aHHDaf4KQn!jNOX`@3DQt1NzrDHoML{qkY>iPHnc>EW0?A<V_E7x!;o
zeXZANbz$|!bLN}*)m>GEq}7jQ{0KST9;j$B8Ug1%Wa#nYu8D$n)z^=QQf(dck;L}Y
z`KI57ik>jr1t`=))6R?=a8X(Q{0+6EX+X+CyApRoo=r2N2t{0d#fB&1gEP3>*Otq0
zY6N6hcnD1VpOz6K-;}XAP`hD;OTSGK;Pe$eO@=9l>LUFL=ffY~atlw63?Va#uTQM$
z^#4<=_?M?(MW*emu>GDJ@g&e{gW@^BjFo+I-R~%d`wqNr;7U<T8|kPuON~dFRjbq6
zb5P{PuX9qo|C06BqXpI#y#xAhg5T?Mif8sui2Rr6%;*k5ONNzk?P|-`F$qmp^L7@r
zl)lX{6ZM*202SQ-PK=5Hx5bl3;pQ%3qIpOvfO-cMf4al(=yWP$CcN7OM@dno4`zl(
zW0lEhwQ{z>6wM^1)2fM;Ou<?rlwX@OBg2XH^{+(lh7|fA=muHmA1IDhAUDB7zMEOG
z^gg99P~*hRyOG63$fpCb@`$0;r`t-3w&z-*w;67)uzfVO#-4i7{k$(3vt)Ae)xBNG
zKyYl6=59lcBOEC6$gwrw@tKB;Pg`*9vcGwD_kddT!(B<nkGkch1*YespT453(JpP$
zGIcL_vZ1=fW`E&J-I4S_Vlxk(0>zfUe6JCl1tbGxg^D$3iP+GRF~a~YgXr3ZyLEVF
z06)POo)4rb%wB!DVCzsp^_pRHaFAV$PU2>gp+UlGw?hVT)B^t_{Xbj@llh@4@K$aX
z^GxZRsgsv8xG~B3PUhw__wc>9-pJF(yT0R%Tj4<3;jH#UyiGiXN)>ov9sY-l)G?mh
zbNH&^LCQU#klN3`+3ZEm!;>p6B3I1*oM5+53cS4nQ%MfMW5?DiNHg($uvH9a{=Hpt
zKh!{PT*j3_HPDqvA~(`^Yg`+s!llPK#>Nd2MjOKlgL&TfpXb%)IK@=opN;+!{3yEy
z)ZY*&gMwZz{yAEOZd=VUzee$UDLXxtmF0DvGr>y0DbY>a;a<VsYC0ICI<=8tsz&F~
z(Q<``Jkdin4Ud}N`0DH!r&_D7r}IG6w%BQbO56VZj+$BCkTx{K<HhD46?ZQVix<3L
z))HC_@hw|2x;P-OcEsw!o^fm?vXAblr79XRYLIE<o~T$*77$y}hDhT_k#Fvl(}<Y`
zUN?Z?OX3DGCBUAcq!%jnq4mNMGa}hGv{0ljy9FCQ5pcMu_9k31+Zdoo933ygN(XC_
z)#gxvG<d<`F|`BeK*kR=N-0eli5KEG(EFO&dxEgKWb}IBJ(N1>geSE2gJP>VGWlIS
zL#_t3T5v9x4NUk-!k!_)0!K<a_q#&e01>PR!aI6d>rNE6&K$+~P}T#uDFgWN&+M;g
zU&jgO<KrL|tgh@362V-^lzDIm65hit<B1f|Mu%;(waS;N8>8Z0yc-4h3Rl<m%69gV
zHH1yA_ATez?x9itK3+cWmb1Z4cvDbZ8k|&UO}HMM<Z@lQL5qC3s+Jsy*NV+|P)T5#
zT+X-8O1EEr>cZ4LZW-vvwb;|x<DBIAEH$VO{mx&pfgxVgf!;!~R14{ZE~R#4DlZL4
zePw3jm@mB^PoIukV(vg|px<#(;XcOo!vu0_p`D<Fwr+Z-;%^;KQbBzw_iA{!Gq<86
zCKwGn<G&W;06L58u~Mua)#B%tNZ*0j1xk|CNkBy{&T(c~jMaL8IcZ7X5pRRi1NwTD
z!F2$orD6kMG|RwRsv@%8fLynO0_3`j03ZJ@*Zoeti~*qMj?*}A`RhsPdBp(d2OJ59
zn4lJ&XE0pMOk5i}g}rX*o6==KA}J@<4y>aEBIKEYgj|i`<_k<3K-ljPK5(J<McNTz
zg^u^&lI=;~d?b!IRK8pmad_F(+v?)z;_r<P-u4ocmI#Hxk^j$t6p+4i5&#mbVSQ6N
zlBopnT>$U?AAiO1hdHnVX7v=~JHK(4BDMLZ^ttiz8i^GcfbU<w3Ia1(2BO6{ko;OK
z0164LxjP{Mj`fCdf^eA@itpb_i3RN($KdWZMuR9y2fRTY<Yq_O*-9`vcH9fR{|eM<
zO>?L)zs*Of%yIE?U&2OZcI69tieif;>;!S)R%K!7o39Py<F37{5x=5SAPvO<sMY67
z19Hz+Rcb-o;mo83)DJdmgG-xR;JE#(0C_5-9flQ8j)KP(%f`w;b$RE|2m01wosNS>
zL)03uN<asm8iAq}IBf=Nvek!cDQoAZ4~#<IUtVJThC{zv>dJDCJDA#W5??0pdYY*_
z$i<3*b@}DT*1hayrK+<^Pih~sE9Pxad~|KliSZF0`4cRXbeqF^AsucjWn(2#z3_y}
zSXz3%PSQgIVetMWm)rv_fqji5X^&iarKPkDdun}8<))>PJtRGI(_uyqNGQ`KB%6f0
z@^&bht%M|hQ!;UsgP%Z0JyXb79R#}#_aB=!n&Tu@at8fJ`Va)WpeOQQ5NnC8sAwFT
zrdFuv0F)p~hm^Y!8HLAN8Ctf+Olpl2@}1MD8w9d!jR_>Yn_&JtpG!^XwiBy!3NI{p
zNP_1*nyxjLO|)Kr*p$BLdJ_Pm0~4u|{qjtRj6;kdpd<!5SIH&h2cfsWWEy$GHz0PQ
ze^UG)+k?0Zay;9@0enJ4c2_n;@RJ=n41a=iqsoZXTc0n$im0+T0S%tl=Nj~K;|qwr
zS62u@X-^+xlfV<o>wQb_Wveq3RK*((Z2uAx@xT<4*g|ujxS4dv=L(fM>0YZ9w!vv0
zBi>SY9tXS6(NZS$Ph>Gur9q%9FfSZnW`N~oBmawtE2M`i`oqDP5Y@mi)L3;;COD^H
zQ3qNTv6CDbb-E9eJF!0_p4#`t#=@d1A;(}c;IE57X-+ppLIf2aykDy*>f~GZo`Y0?
zCACF-PYo_vn%SV%$q3?g91W@s**#hBTsGgs`Fh7^`*gkZH;4IfSn&0<lEI_}$G&qC
zO_M>UOT|_9K;2tz?_6q~>uj7#0}nE{nhZw!x)hK?dN(Ju@wcbVA0CP(^=>K?>>16m
zIZQL!c4}9}>EZlcke%gQoF3%Q`xmD+BW7>2iu5FHGjc-w`NitB7LuUiNGnH=u7?~H
zFk<%F;xAn)ne)KzsD?%NLcPOJ*NvCg_K`0PS6JTND;zN^w+wRX*N<-CG~FXU!0OT*
z!Dz=IECDL!9g-qF5<uKYTZ>LwmZQ5QF4QP~Z1q(@1;w)UUQ<Ilu$wrrsEeaRMsH#b
zB=E)Z$|q&yhv*;u;L7`zK67(0Qvkp!l*~-POKh=<!dfZDZs|nR&5>RKZZSm6=^`Z1
zT&jpM=nb?rLezu=b=QSvHCF4ecK0=!gVceF<Ht?fZY`d8-OHM1??CEM33W@J-pzOv
z^|GV#+2MB5VX5_4bRqh6m%2;LeB1RBdB#hpq!N?TA@-G#clgX?^y2Fw_tN25y)6|3
zPCjG%vUeB_kh}6E2bkZKcBo&Yj)n3X<&4slgN*U(-<19+jr6+?BKSYHG}--Q|C431
zcpaK+DqwVRCb~OaWj`YCW-zMA5#l(2vYm$8IkjVPe2hwfGqvojSUW0@>MB@?qwC3T
zy$a<euFzEy@vm$z2{0;nctMD3|15Cy1TW|amhg^IojyCC$#wJ7I~}qzGA$PdYmN2p
z2j}ro0R=g{qxir);ns5{_L>XlIy{p-O4xtAw!YcuAD8d*DJgHbe7?@H%yR4TY*J&}
z_1Nv{XZq7=M-5LqHM~H>bG+}E)eiiBF)OR@nAPj?Dw?J|9`Z#`mUXPABy;?6plrO1
ztPvjdJ@yQ)tN>jQ3?+pY%~Xsn;U~77I6L}LW?j6XSPK_Ll}>-KB2*^k&xh+B?R(us
z<Hn{I`MkU~en`9e<d-F_9UFG%{R?-S^Boi&K?4E3OiTZpQlue3Rr461&p}Hj4A_k#
z|MK4+B!$YrHzng5=^1Ep7-}WrmV=<yL#8V|oQI+q9nbqTiahU2FL`P~m7)t)^lzLX
zD7HzK8MHK=6%jteOMn{2Wt*q?oO`u*!&Fr&MCGy1X9l#$+`Z!-o}0Ye*UBDvIcoMi
zz<f-J$6ZdDTyqw*h?IQ5RiPJK1-wmjtB078`BXYk%qZ6`Ft#ils&^e=e)zPo%87p0
z@HFdu%jv6c$1Ha}H1&K&o~qeikX7VzE~#iFGwEVxl99EgR(S7v6Q}FdL;I2njSF0Z
ze(AKf-|yk%c+%Q_U-DG-`|NCAt!bCQ{e=1ljURB^z-$8rqQB&^p$@v11Jtf-BzFy!
zZ3f}<$Sh3Zk9St_*9eG<4CHi5Om~XRT7X(y1NP<L1Bg?^N!%ZbQAOtv{SBC`{ujRD
zaR0}gJ^xMVN8BHjXf_7^SP>~b4n_@tz?VdK42>E_7J~o=OI1CC_=<bF-G;r|SC96W
zluVOXpqha;;v%zN?AX<kH+Ms&VE(}TE%l5-<e|iS%Qq!LBU~Bx<xq@NpZ+&iXY324
zLG_oo$+Mb|2yp82P4wq{z=j17*U)uIdebo#b2@*{>v>e=&o@CaOQJ7*o@Y4?cUJHt
zdzp7W_IzA_EvV>T!2R(vCtC}?dId+fOr8oHPRMj@eQ9E}lez9|Ne`TvB-~x%;^J5~
zXJU80$6TD6SpC4UXgK)6iod>-y7L1HKx-M*{<2N1nX#%8BwUcO=m7RGA&k3Q`**Y1
zb=#wrica2|pt_}Ny<I-rhir|W1Y*?(Gx!#n8TY=_i?J;QAAoPhOqn<21do;6dYEg}
zIAnC@p^c&G_h$SV9`OfqQ+G@=Bn635t6}pG0tV{!8}0&IM?BlgndoixF&ZqYR!ork
znc&Vf5VHN~;-vsvTm_Ht=Hg^`P-GD)BeNoHT<@w`7?J5?p1}`qC8Ohs+c35U%(<q1
z7BgTABJIGc!p4$@4(Kj0zj^ygruO7wCqEgq>#}iU6+{p0HW(W#bbB5v^wjf6-e`L3
z!}Jei>0Wvlu{KFf>?`nJbcP9-c=oOc6UFxPvWv(LnCuED8Ii`+)G_mXJ}Mr&^Hsh$
zvZ$_@JLL`e2VVa$eOMfKn?~j*+b!9k?%MjoD;*NT!<q#5?5X!fCC3P)Hv9Z^pYS+I
z(1?9rN>y={?R`eo?Bm%*d4=v+k>K$tqdxUomW$H{i`VXWvX)_hb?INBg(fE*+e(W{
zr8bpmKx%veJcu{)e~U0Uf7ayZ4o($naRn-;@;Vv_aW6zsw+3E7(JPs(j$~uIq4MTS
z!8hUIRx6vI#;2`-A`SIht$6QnK=s1dEo@5Ru~Lg0gvh)TOW5`DV(1Dl{yriQ^1Hiv
zQMtIMi2svT06sT0+)?bLp6Jv=-wxGJOwlClDiJJ&|28TLg;0b7pxq6;c&IEga4f35
zsW!u5qsR588EUHJfIOwyHzYDb02ZX3ge-P{&#Xikoi^#bc<fpkX52gsr9tKDF3J1_
zrgO_URd0vzN~CFhUR1|wvuM{7R=Z~Ke1lFvLjEAl5_XvBj<Uq)Enrc|Nw~c&v%4cp
zP2T}YEJesKu$r&xFlbV)!LZ081vNKq$sYQW<;BvR2q1Vofr<jgGT$~xR<AH3Z~FOd
z(;h!~O0;q>-Lh}}_-c7%`Bx(w2fevMD<{TL>*l>=>%%4P?*2|2%-Y<ymGoSnO(ogT
zd2bS~9b7G~$V>a`ho;!tw2b2Xo$8!?2v76-h}}UD1wV1)bDJ1>&y#e@Gl(x%HXjS|
z6M7&ka6Lvj@S&&Gk6mhZr^IL3CMn0qY8%6TIdm_xG99|fG`K$C{7Nw_XB7o%d-Q^&
zXI8pt*FD;?5a}iQe{AS?^#ED6tEr(Iiq$)rc(Q1L2;Y0--5p#?%^k81{apU<@`|wC
zy`<GB@AJKvLP98zpILW^06$5V{2@-WdB*{)nm_Kg;hqNZ(by~yW>8ybzaUWM34y^u
zJNC1YFUdzPCc47QLxE#@9);oI#iaW1g<y(rWLR)Gd7JCN&g7>!|ABw`b`r+##k~N?
z>6$LZ&h@q@g`2@_is~dF_lVg271Zbe4tKTqO#Gv@%WwI9*vtH{d5qfGsItH3K3~~V
zvVqnUXkKTR?h<px$K1JHs&_Lc+<uRHj{U?{+h25Fr8|ukm%KT0$=#&S#Mo$qgFYa`
zpVcz0JxgNp^@c+Er_%N6=9_<8;M20Yh|16<{M76TcqN{_gSK6s&bbOi9BbUf<|*G5
z!csn{ceu*!K02e8=f7x+)0ua|L>d9Pdx!S=f!2d|3EPCC=CkKwpq^Upqw*txdZ@{(
zj}cG}{j?&#&0dFyP5?!Vs{}3o3yY#T!Y@X;V-?$FX7Wt>MjBSki;JzxpEood1KpY;
z6{_iJ@wx|=L2EolEso+B^<p?vUm_jO4VyPlnfBh@xq9RULtTNEnZfUvQHrfpomQTD
zv(fJk=Mk?Lxw?b29sJ&1`#U$29W~k|HuOhV&z?K6c^~)5{`M{ChZbw1kw;P+ML(o%
zdA`|8oH&DT#l_$Hn4Z7uzA^|zwEZw9Da`vwyTr<BHJfmeFrE;4)c#ZNshZ}Ix}(ST
z4)OO^e58mzju}0i=fB<7l5K30nzHw3!RxQymU?G;oUc2D?d3c5Tz_9%P-vvP>&ccq
zE^qCz(&WQE`)Myn-z6}NpX>DQ$?nf4Tu-{_YvOdKp9m0oZ@*8FuPTW%o3RA4{hzv1
zhd+X@66g<)1;o=1ypR>dgPkaoW}Ft7y-T*dbah;~vXqmLsCvnx*=QLR2CUkY<Q(E)
zo+!OaQy#;b)7%jQ<GAJLoXLopxJ9A#0__M=SRwNlA1cOONr{HG<X4GPV%R!;dDOCF
zRd7*i5DWx2n`?P`)Txd>_+xix6FYqw@RG(755z=_k^q%rIW1g%hyDYwCwOROxP4HD
z0ij0^l*Ts!+%CJ0L<E(z@M7pkod9YR@`O6v-UM|#V^!7}C?6BoGr-pIYr~N2*q<U_
z7U^wRzxm-?wCTl`@&f*@^wXtj0Zw~Dc^oH_m0sHXse^^i_UWhF%8l1QXBz0y>d3A)
z!kygSmb~n&Fjfy>+y4c+j<})k4%;7qfaTcxhHlR7|8`cH64iRaW(G3FI9xykZ!>z6
zXb6;%VCmPWGsE~FY7Ks#-{dF&`)qs@5aaMr6R6GeE)!I8`zx;nPavXUKgKNU2OVt7
zw2P-XJdwU(E*_I=z9A4^zm#sJ|L2g)MX&ZFlirE}hBA9R;8Rz%WySa;ZGT9RB=Lr?
zCeOd|S=4_{ee!>JwA#vMCEo-Sdr%V8f-&_jo_;-l#Od7oB%^@7F30VDA8mpkE|dDp
z%}W1_9<@F+I--~A5&hbKH=EkoK8>x+V}lJSb}ML#_%ML;W610VqB*j^$6I;zoI9M$
z=*c4ld6s2g#0+}zml;YNW0PV(L54$Om<UheDbEpE_W2(ly?Q&&6^=`ig@}cc8$-6g
zSjCAH1BQiueBN5c5NC_*{OVXn5u!0uR23Xl@}|{1IH|bndXm;vC!GzYPR=nlA)Nvf
zeo24%+i|nqBW21i4v)HY9wgSq=DjALF~<1DbqFqA&dL_Xw2yQ+X$~RaxT!)t=Rv3Z
zBrne<c}5=lTs8Rmmn!W0hn<Hxe?XHUiNka1??~Ddy)(vbs%1x}jubzL@Oti0Y-_Sp
z=g|1R)2YX7&Bcxy1^1ut`0*;p3^#(t%rE|o>H)=$**2o$kHB|ZkGmx033oSaJ#_{M
zPj3W2xIro^ANItbvL_V`J6dFt-G)ak#*!^-utopJ5JuBSIMuwxK{!QnMV^N>mF)sT
zE*05cBnNJv3GamNGEj<NXqJfg4C4pta2x#qY<JsI1d=1dAkC5QsBwBM$f#;8^tow#
z(9q|_TS+QLv4xay=Ie`I_OX!h|Ag0oO!6e|{bK+Qy5d0FxAeo8bISAnZLF^tU`T`m
zHAF2-(LA77UqP(pz%Cpu*%_F%m{ZaYCw9J~DhqTva4DRs@E6IdT`T*M9;&U84)CT3
zP=IPf$#P(`0~rv|%55q7JoEc}Q9PkyMG_uMH$pT#M$T^Un(8E&JhOVGFr)1OofqGf
z#Hos*M^eC;FZ>52hY6$7ga042`zPu34_~L?l^MKDlPDp}s}!0tW!QhIQF;`)gGuJp
zKuaWX?keEE7kj0FG<*0;3ItPx#*4S}=VR!GH1MGB)KW`!iQ|$jA+8TFC&1ba%JBTg
z+E%bf_7Yyne1nBv-ma}ckV_5e<iT#7P4p24D2=WJOY!s<9uzor2DLaY9&Dxn_zRZj
zUDN3>mHtNiO<vZll&fX973j+fehrO<gXI|v&!A8w&dqSUp^dj~09OTWWQ)<9e{K{(
zt3&ThIQ4eG<V2={2L*vCuu%9FZALvcrP#1v>N3x(&v=2p3vB~Jaf0YJU0LP_(8?<e
z-4BL`z&!=XLXjxYRGEoN#`)nUpG%TQShN!}Eg^~x5!xw!d3}D;Q|P!A1!@$Pcy;-7
z^A5~ib4!cF4i0=TfM`R*B7h@WyA}bl=cmLWh=Plu1I+P`zw98q4g{~CkXm`Y1!fZB
zugEk~h5k{t`}5{*L5rhgab^3=$5a7J9cn#o@kYn7p?YZXYV%r>P#+h&Z)%Egfjj>c
zF)~{cywdmrHm!fgF=tR;M>7;K5T?9XcoNb4(Z(iWbMaWVxL|fq;0+|*+O2q|FihF{
z*<_EWVROTHD&xWGrwAh8nV<}A#GZ?i23!rZkA8aU1Dfml-$ypd1704sKxkdNvba{b
zS%6C6Rj0hDDges%c1b)uS3}{(NeU2a=|N@!rxqpC+To)Eq+&=t+bT$PaUWU}6PFTu
zs)pZQma=IMh<$)AeXRq4(U;D^ArgPw_p1UxSX-f_7-AUc0MQ*RLUDJ!q1ke3I34&&
zyB8?FE1!Yf`fb*eqRZgMScZ+;-3_hc56*ZD_b0Q^nyMo8h!vS;jlyx9Kk(fV23RJ1
zQ}O|(C4>Li>YyZ=WC~K2&VVe@4)p$lMN->vkNpD;pUQ_`U?%N^%qUw53|K$#>GBQq
zSGdhMASt6pwXu!i?i3CQ;`-IazRHsPF&Gw!_8DqE$$u*Bv?*dl<n><-!W6EG1guWX
zTA3CSDS#5dz7!ULQl=j7IU>ITp#VZRZ9D$fA@O+-w!il{?D?-yj0Q5*nK_nU>_a$D
zqN8C}TwPAMJG=4DfYWegrP1lBx0rvnGRkEegL2OfN8+vFbOh&v-;dJGK?<=R>Pn=<
zuDWqHPYKWo<zX#9QsNBJ;*j}zZ6551iekX-#Jdl<+2P#3MsgPf8Ev9FbXBaD*e<#}
z$kshlIEB-sx;iE{jyyEX&fg%~DBhQT+TN|v183Ij5@er!b2hQ>X(-=ur06}<+Jumu
zOln(lI+t!Aoaz3HQ*g|Mldfe1T1N7zG}5^;6L*RGFZM6{A_iSbq?e&qw@}UxiXN;D
z@=m&&)W9|t6Ju$=NPj~f^wF`30jM^kov6|s0;477NQ?BsOQz2p{Rn`5lYN~VNBv+7
zkUoLJgLT1&w30cHboI~~0WJ=EVDJlO7KBjQHj*^xHBuumZiTk;pjKPZ_hp9a9OJF0
zA4rBIN06y9gwnoh2Q&g?$dLXr*N6ke6Nx}3Z7EZkQ3V!kb_#9W-AYS0(6TZv&;bUk
z4IeUm4ki*B6#2Lr7W*f70FZ`8aZ4N)m1Wlw>Hh*<MO^X~A8BRc$C@uZ_NGEzRf-<;
zR$GfJW|uHA%SWAh0Di&gnART!&_m=t65)MHId3j^2Rw3rz$^`VA1OfJ<<yMewCVdD
z`JyfDVKXV}pbh2N_%_9BLu@@m$cQQjsxY+Ha{aiiaRq2h2jmA)e37UQ5SV)wedLZ<
zHJx^di`&8><=`*83LDsjXmoLB8=~6zv$khVUG9XY?dbKX^V~fWW5b^8<ma#5-BkO!
zd{`2?wALn-_9=T?zRTvaK9v$9kF=pOhu0Yvb9ILPMn>vRSGd>C)ahF9?p^`=GZ6*^
z!O~6Gfj@h<mJ>7mgZvJ58;OmEDNB*!^)z#sDAN!Mp>~8PP+t5j4#C4WDrQvQtoc>W
zUfOHHy6a;E-*`;7rg`vugCk(Zi#rCMgv8FdP)xqO0z^izRkiwnX*?Ar83HylwldEV
z&J~7+%mVIir=%F}gn0>c6b*$i;D}B_KNF6XN^}^%itiW$h9MB0l1VQFmQuUor56`i
z{Q5+PA(o{+^wS5$uIjOO)#KN`IO34g;_wbstQsGUo`BlOGp65^Hsjhjh>0Rj&Lm53
z$$ZBw96h_oIr_DzEv&nXp=BKIkHT|eR20^NW`>6E%B<k7Cq-`-ICZ!V(+eQAKGHsI
z3HLK7f@ho}9Ab5(lddKP$f2nqY$aOewXP+|c(GOL;e$k+sFjy2^SEKQMlkT=A%eRF
zGS)WsaGk@_chA~K05^M97eTqJhf1U@@$75H>rzyE`7Nf-<KQ!AL8+xbr4!TU+UAjC
z6Y4hS+IlK0T71pfNmIDHcr=xnw{kJ4SF2a=kaNIvfk|<{^SSre4hC7*&pLm0OJ(14
zNh>sYDcZ#yTU?D$sLDa$RY4fieTwm3cxXys6c>Zt@_|@w+)7FCWb0>jVs{NM10W%z
zo2bhPbVPY&IKlM$;Gc};w;}$)#^EmmQ9jz*tKo59@JYGG2m1?7@b7N<n@s#gF@S@;
zz7v3V=*qy=GCYdI{|V`PLW@8W#g24Ck|*>dkk_7)9jm7Ro1$@rIyO0f9=#cwspT4X
zs6l5MtB`QzPU0q%GNywSheQ1$u4L_mG?sB2Co0RcfD@Ec9E%4g$?-5X&-PpR&ayZ5
z1Cm|xq*{yTh<At-<GltQ|J?_0_gM^oDTFWTUho%Fx|!lVb6p0;l3U0nsBn&_MQ<%3
z0pRai$1Nzj2h@}&ksU1AS%f6DzCcao$93YgcD0BdBx&$Fd4CLyQ>c8dkVVFID758l
z4hglHOwgbjcM;Nj4v*GpC44dNp!`HL8EH0?9;TVpk65BKr+tux_7L*4R>SJWxE;OX
zn^*T+e#-VZR9?NhoHvKpI&NuNK7UU=d-VL+pMNHs*lc;*F5PoIt#q__*2x~HoOR{d
zf3P-gyWbzCnOOqKfzOb|o*qq~3$7Z6BdlFrX48Kwv&(F#&A$A^X~V(9jE}il-;|U!
zz^J%{;iDTEn}gWeivBJ0HvWsO2&oI%`MlyKA-4q>JQKFz*1bglhDhHh7R7RADl^CL
z*=nH@fB)^33>_gv-Ql8>Y)@=^K$Z~IG6(RZ6&4DVBp1xoD#bWj&jPc>OYw?VJ-c{P
z>~IE<W}sf-+8{h3Y7xWFInjnzZN4OS6{6Y!KbjfW&PnF#$h7SQ8;Ppy1Bk~!mos%Z
z=w1zg?sB))zY3idLrHAycBl}^8~8epP~Zb|z*f=Fa#W#1<*IVlgWEaC-h<?IcTB4w
z4pJ3dRO}<6Xj0#~d8iI3zS#hEq49K^*W3&yn=+J<IDuJr0^LSNNO5ffE>CQO^CyIk
z>Y~?QIFo|UXMx~p!KR^=7tg)Y5(KVd%s;sz`x%TH;Jo#$+7}Yb7*i=TsxR~AxFE=>
zW(~v{>LcBib|GuC8S7-8L4qLFQ(i$>;NUTv((KN!z@MxdjwpuTyAt9oo<-=sFLA1~
z9?$^trHkeon-dD}rn#JPI^47=XMIr`yTYdSZCal2rS0bfPrR@DpQm!me;&25(8wlV
z4mg*i+rLuNKAINT>#5_j??E#GYC`|SnGvF<n1YOCnll{7{|Txpl&7=~L}3qzKR(+F
z@Fz5|jR`7{S&y?fBA0tlz(p)R2Ql-HSLhN0DuB;N;}}au77QdBu7^KnDc@mz3DTzM
zEMO|V5LcIWg*vDW?&Nq+Q>!lkvjzQ{R~D#^Kv?j_+8lRz;%71Q-dltzASUJNztVUy
zy`BT!JUAz&em*m9jz!I@!4fH-5if&SGCOj`sYuf{n;W;8pp)FkQf3QG4MlfzjYjFa
zNuF#Sz{^=Q%d>E#hyfIr761cf$gIFWc6q|~7wNBx)`lkjpyEhi$a$IuqOdS9{jDyl
z6WT=IM}pvZ+1A=&3(Q84CX3=NMRBXK_e*IJOkm-#`8@4xO)cmjJR#NSGW-cS36)GO
z5Z7>Ka-5N|evu%fgThT1wi}`UltXlpQaW;Mgo@3LSF!rWFAQ+4*u%&II&L=7$^-bl
ztQ<$&nHOy))z9as^zQ5y_+Jq287*0=sx3V9^i!Grf#`+HyO=76$2;s@lRfRvWzP*>
z&#|<qxf)d3dF7e?rh=!nbHi>=>oi@<oN~^baxOf+t@`3fK-a#|(UZH6hi!@GEmI;v
z#QX%tuLN{0i_tO5=os7@%yPv+z*n*jKCrjKjK#~`*$y>Vf*9S3iNfh&EaZ6pGz<LO
zBe)^H!ZKz3gzOS%=Do~TZ$RQ_L>uP_`bgPD@%tRg&0>;|p?nyDq^D?Z@-o~^GO;$J
zi<n%GRep**34!w!8*uP=r!&wGyszP-*1hh7H3ax26Ikc$XedJ!iRuPQ*=6r_j^y7E
zQqh85<s+*OyiO~*oENx%g6wHSt(#InXf5c<%VkbH{b%?$#yZ6wB9%tN%X4KKB>yKg
zRd8pafjq;|7JMrM2%i>XSl-BW^iR|%SNvA8hFJ`-QOZV+@E?}&S^okO_s|ymg)YNA
zBh3&cOd#6WF8`a}DG`85*ivdw*sDRPs9?&p36nwBE)UH-tKUQ)=hVMGuEHPG3O#=%
z8k4b?aJ(`@zs%@P;;~IERQ#uGb>i{sZSVe_q#En*xuMjPMPSuN`<O@%cU7di37rxp
zr5`W$s}ILGtWK)?CUm(+lZ79Iny}`=GnagO#@@_!`X?tnrnOV4<l5_cm$3se9F_mY
z^MN}|fclPsvH<wKICEA8Kd|Dl1@g0q`!^*c5DbjJI~b$`-N+*lt+dJ{3yKX}XDIQx
zSb0|YUd3jb)wlvfooa=ELSYf?WVqOA&2eBj71D{_3pXr&mWI+nHza*KNwIa5Z3?Uh
z2ipH`K7no@h*g+04+hcPg$&iXrrbhuP$9@vy<YpYJ{RloJ!Ki+l#>42ZRrV$8-Gwo
zda&l#c#73;0~l;^U`ZZg_X@a-El1!`A9&o=oQGL*7AsKo;Q4?||0(~trPY>CQ?D&^
zvq<l!FP3ZQg0mu)91DAIWLWT>tcEiS&vfM+wrieuOS`n5bt@#zSM%+V`h&<{R7SGq
z^g?Im-Li8BV-BBk&Cbc!J=x>9a}xOsLJg-bw?!I2`%#{61~d+wdKp~>@s#ePk3(y1
zj}=~&odZL8**Jkka}`Sk4U2X{mU5t;Wg5`Xl4=tHzmSPv92I6!G`xxArW=L+C4q`9
zarQ%DWnN2HJ|D9+7Kck}UO_>q6i%%lPqw8&!b@$%Y2mJ*CN*Z5abL<P326ANo!C7i
zfvzO;kT=+d2@OtMsLoJqFjK!z;mh%7<b^b~AbwVvLAdh>%hGH|=hOKnr$n-Ez7LsK
zWrKa5M@{H4!?&+?rB}|?wEG3Gdy+q|otcw1wt6qeublL`Gu+PtKBJ~cEuLE~Oe^`t
zp~6PdBUu+gE;foJXr{Jg(yFtK{3D#Qv-cdEA{`G-GuA2h)7iRn_(X6T!6h*<@W7d(
ztRw+b#q>+sAB8%#dBdKL2gPwSI-NZ30H4@NnF{Il0Xxzi55?3<6CSv%;Ss!$c)OzZ
zzxU>l>~I>ckCN&r{$|wVOfv)%6wXW@H>@J%?oKhF%5~@K$C@nPw-j+=!f+d>d&^Fg
zO$Rn<Cw^9*fP#2Ph_dVmLtzjpB|>eB0805C>~fkQ`vmh?m7-7qsrQC5g*HQWfFaw>
zK~Cf9f%Iqw4TFi+#s(xsgJKtv%YdVxHc%?BpIs%&wnJP<Y|Q#FB#`tEE^TjmUSI!E
zF}vuo`88e>6LC`#kvLFI*qKxj5+ZSMC}VBk^hut}6-2Tx9yrigx#TjhuhV#&a_#5F
zriL0)zE{c8)wgxApWf?VZ@ch&-NV%`(?|F94)1KPJ9B{`*fj5w<o<4VXUHj?)H5Od
z&PhAMnHGhIPwk&}Npo?0>0R98G?<k2E~MI#eY&L4sn_Rxv18-av|Upx-W)v~nFHn~
zqgxb%)qpWkobPi}@u(~S_kBs)i7P+OcJMTDGyi^g>uRpHJd*>OAc?j9_d=zlS{Aaq
zzAjYp^C&POVs+?wMpyXN#5uCX?jMu=%ICT2_k$xV=eiTI=0GU2vh)O2l_Y_Set&*q
z#F$S6S--bTQ@k!cM0`S_ayNHRix}T6O5jzY?=>3~2aJK1o|8pG=C-&Z60XMcUQn}R
z7`PrcJ-4t7u{SV1Ggd*V%j#2>c&JVtDWUvc|GfTLmSdxX*(qJ)1kzQEA;4H2943S}
zBD^w&MT?XNDsA>-;q@86Y_!_+yz$zS&pcVP@ir@HroQP}&4qk&u^DA&Q*8fx@}FJj
zA2&VhS~Vr*Xj{LnX*&HVbNctFw9A6R-JSgxpE?%I8f|s#GLgBQi!SsF@7?O~2iN{U
zpySE9wSBu%1NUD~x>+#Z)>x?fW-`Ry^v3<KgVKG+#Lv4q_0W<7*o*At8ElN=KXer_
z6O4eSOcQoW7qdD!HN?9zcL7IJrVU!qZXyfat1&Sdi&N5H>8}XbM|`=}8Tn(ywkd))
zmErv=%na|+IQpb&=t|vCVKX#1zq@p^cvhx~<1>@#7C`#fVAVPh%l(Z2#A?JuT-EH7
zB|PxLX5%V5@i9;$k9R|BX+eC7DoKMpZ8Q=JsR!H-B)l)X<iCr>n&<i?78P`Tb|1M~
zeku1ohM&=agYAJ+5Bs_JgaFgwK%7_+C?4s^%+RWcXr}Ig4746$pz7Kn9f=I6jNCYJ
z^c|hXO9N^dcP$Rnw1i~UxK|c;mv9Zu;rx^zk<&<2I~xVB7vqy~gH|3xp%sw*$QQHl
zv%||3D8jHPG{^tcXn94QnW@K6ohfPWvcW;uVb+dJ06XG2wV7nEDbr%V`lkAar;YEO
z&PkIEL6;a43}ti`Z?PKxJJ3-@KlZQ3%eEV%FQT7RW@8W3Z+4S7^^ij5cJcOJNpB<s
zRo)o#TuJlJI{nP)9LKb7Mdle+r|@F5a^i6_e-U9qB)(0tA1+K;<Vc+ym}H_FjsnO~
z-+2xi`kk^wl7WM2WS0jO2W9Ati(^Z`^6({2P$I@8lvn;P&zJd<3VT87aP|fcB}4Hm
z{8!lfh*J=<1HX7e8bR9#R={oQiV#}5022kc+cYyF`VOPEVv&tOQib@*6t`#3kQi}@
z3wU491aKizK`1kEOGm}C#HhAg7vzuVKl^+j6@TDjllkb4=nVl4?CoP=eLjaT$?@_>
z&<rE~F~D`op*rP70CW8`G|EQLJMde4`M>d0=o<s|L@7bElaZy=zY(dUco8%$>|<-z
z!f#?@(CuE(S}&T#V6#zyV+AIq&VjX_dP~jDp$h*BplUTKi=W|$3(v*qHvGSM>*(6w
z#ieI6eq)cbx5><5yE~RyZ}cLbANl$vvBkCB#_&;ONReYCleF?+!tn_AOt>!)_iwFu
z96DX^n^0jN8fb5BskOVub|^Jvl($n$UhsrzG<<O|IK#rAz}tTKdfAzmPWwXzrFL19
zBPrEPj{`+-vJza+J;?Um?VfCZJ=$;U4kQyaxjv#DEuy9<Hmr&C3+H1tQ0rHri4@7<
zr**Z!g*Zq^<c6J0Us+NE38S)Nq`n!6n>p(1Bqjm}oQ08wI-)MZc=7NSqZ7Zmb6pw_
zbva?7t%;fQRfQ$nHoOc0{0%hQ^O>z`As%(esML|VD-z=d9?VS{8MzG)IMoh96ciWj
z^mIrBDmc5Y%``x^!V9NI2>vh{Y$MFR;NtEuNg^fLa0l2WPIL^tIUxrHLu()?QdqaA
zv9MJu5xoG0bHf5h-x6dkhT6!P0@-<>C4VF}roU3`PJBV|{1^L>%$XR=6~*HM(Xr6x
zTF#HOz3lN&vQ&QJ{E)?0OM@LABeQX*BGrkSfr_7NB>2f;g-Qu{{4+>~6iZV<Qa~w+
z>;iPC6UG1mOviXTN)H*sDA8Z?YS4FWm30zn?lZ}#i{fo`mGN52Mx^2Nm~4k(RU|c6
zfAkAW3(0aEslDWP(^p^+I2KT6%E;>t`QZ1+8%<7k?rbaHZw=K_xnE{+ekpBN!o2P6
zHIjZ8dgQEKUuRzsSy1YlHS7BI><#C=H*E4_v6Y6Qvvym1p4R%WZ{(ZK{Qktb=fHI*
z4Zx|@M!iik*6I-|LOB~0TS-l#R?0Ri!wFVy``y+6+zm@C;ouVLQYLl?;#%d`dp@6&
zKVa64Uc(y70*7cI*IRGZ$<Uc^;1>9I@$?jXx%xRCA03Ji%g6Qcf;VRt<xQBFTO0pU
z3@Z-C@j0z(xL=3qX7s;(wxqg!uj2lY`64Feybr4bOLefRVyB)?>lAToA8^2<?Z`C4
z!t;h24G}3^Wue6tygHIMd#bAHl|MtS3Gs&k67OI;t1u{$D$QCC$o7NNCbF0NA^yc#
zYZ1r@)W1Un7t|TBBW^+|4E^cKcU?9eB^%BaMiHC^vZH(GeQ(nW<mpuFNN5YorBZ)$
zY_iv3ds#v)ac?%mNOvgTW>f0HlJ;~aHZ0REMYQ`Rrqr4K&gP40kB&=9*W4MOJ*JuE
zrEW17c0$?XFNkpjr;Ku-Uo{f7b7BeCa)7l3M^f0Z8=A>PZ?3StBydPkp<R<lu`DK=
z$<pnCK6`PhKsHnGvV5)&e%_8##Z=JLW{v=IMpZ`;CSjJ&V|Q0of1O7~bA2A=Q!D#t
z`MB1>n?f`Q0!quh-rDU7l^pR($GA>Cx1lqID>^kMqVKrqLoG(S0ogm|2fucfIf(6E
zJokFoG+a(I<D(}({bvrXTDffCDYxP9hk3ArIRp6kO{_5p2DK2-piBC3@3SQ@Wqm;A
zC<5d){=0GQ5BHt{Pz5L*BZRomP7yQo<3+R!#{&}-odxN;JrdYQwtQR|l7yrNnk*RG
zR0s8h`5!9lFF)G$q`fY09=Bwl!gWf^d77q`u9xuVy`_LOQo)0m!<{zgm^eVJ*l-dU
zKh;V(DK_J6V~8KDabff^B7%+EQs+Q8Znb7x0;j`kIV6ASHhMSdQqeq7N47&4j#R<j
zFIIi>!z`w?5FO?l$KvUQpyB~>{>3`LH5(mba8nk8$r3d3i_Vg@Z*?EG8!gkmHhv|?
zU!KEcPJ;adJ$2onUJqh~N%Soz6TNV?uOlBS{dM!x;%EIYD22gMF!!s43QKS>N$?90
zB*5)cMcpzlh6`TkMXY7DK{1>)U{&Wpbyk3(;HcUD07vmNV!&|{46Ym1pFiFvT8D&#
z;^(%ljU_9NjTNguwZN=L@J>TCTmeKTuhy4Uf@5h`BRNyc$@p_4ADgG9dLO-=>91L}
zuCw`z`LO$^*_4u`*?W76ekl!+y#9Cil1aBceX?%%^<yvl?UVdm&Mx$Py44uUw-y#A
zoNH}Y>)o@dAX~WDSZ1v9a+?0qcx%W@bGLcR_4!|wz*_hpMI)%iO};5vDg!bWMCw!D
zlz4V|?0^;2$UqG76A^R)0IHZol;32?HWN74LG0wMpThy05Pzv9kLI7{BbH4c1s!>b
zLVS{^rx+h!u_SX8qP0FWd3NM@;l*n!)5}P&$ci7JO$!+}fb6-^FdzOzQEk7(*B7bz
zP_K9K9P#b=fA}mK=Mv-V0SfTV@AH=cv|`kOuZJDA_L?sANWSGFbz9)ix3LZDteW?{
zVmajmTJ9hXm92M+N)l>k2Kn?=2J~;K8@JeVvhB}E;b8!g_&i9;CXi~Wq9x_;k5nFu
z3q<LX0MJUu$aEPV;-Ywd&W<+hwlNw`yhFgeQ{HTyN4?>A08*bKVb$T3<WP!IX+St(
zSA2-yT2Tz<>bsyFPInYzlKwYA2{OP9faJQBsB3$0W<VLZCadZs01?~g8AcS&pR$$|
z4sD_DBXsk5(P&himp8c$yY(r<5xMPm1M{Wgcn{Z396j7CjC5MBXfLDQQ*^$qhhE(>
zaZdVE?A+dYE-5%{|Gb4uq1L?$yn%hC<?ic+iDeEJ|0x;iVDX%CeeXpV2@^Z^+&|n^
z-dK9d(r7vGyg8}UiRk*oR*RtP@X~5I(DJ|{yKoP>jcL1X!=HB^YDdL&7_I|fI+)%C
zsbfv4YlIomY?K2Dv`;1T;xV@1Bt|zRRZQ+s?70Vw^dh|B9GTDdQCnwW-xbQX3naF#
z_)rU&)$8Zjhr#xm=mcxuFTFTH)f;cG@7!@lb`lKoTlw8)S0PbTc4`#N8{(Q2eMHoC
zU3Of&a`($4u;Z%pWWS2DGjG%sRJMHIB<<>Q%u(CC33Jmwj|PQ)Xm(d;kqZku9E+y7
zFa3U(@UA+$%(XtVT5siaF6WR?d#*osPpnNgKkQ?V?rU?MttzMb50_hR{Ebj{DqAq*
z?7l-YUvKcwGR^YR)}e}HU3mm2^LyJ{bhdiixG%Ui7@YxKsfd?i{r5hNDL5$hen9|K
z4a!eWRj31g)5OFFn}KA-9&j|<l5zrb#DCl=|1rYozSWgdanYx;0=kls*_SSV_b~PF
zVCdJS(7R@fd6Ja%fd0{tuAvztSrzm?Mh!FZ<KnU^BJ+gc#F<A~9qfH#&Rzeel(WO3
z(AqilX4|lQs>5-I0FmJ7aCs##LFW93xVAt1^O)Im6hBA(2VbO4`YQ;3WQ7@bz$za2
zCFDzjk)!aKU>c=?5QG8|h)Ic1E&6VlD0=ndlZpwN6_O=98V>aG#)}N#8|^LXGBwyR
z+YO=Unir_!0%nIUTRRZYK+NPI+77GaFD>haOOfw7he^=JF%S*96s@A0e-QW1gun16
zOA-*Gl&%<597d{UaB)NfOEdkhT$KDiha|d#u32e~G;X7BLrzMs({_tRZBk4^?i%@%
zFRTQxu4rH%yD4Tr=i{`I2q)OELK8z`O_tRpEsPe)(`k-6u`C~x2*Sk8%O-+%r^;8#
z90et=9i?G6=OVA<*nDD|wC;;0VJ@i*Mt|UoF*^2%9?XrumGsBhkBkH@mbpFgs8AVW
z_PP(2xiD~h-!c~I88Z|$`iRY;3^A^oy+K~iUQ7O&Y^c)!tCxVH;XLw7c@dDMY+1Om
z8V<TG*D3mdIZE!2X-JW^m-paukduI@@5nEEqv0fWWG9CecJi0D&&u`#rO&GVcN=;q
z$`pXJ^3*vy6-uB_`M&%PhYV>cdU-nZU7(4c(H_Bx!&N_Qcyd8zfsmzmsLh$*fRykl
zh)HDseLfVcSO=b_7kz9sNyDy~-bvRV!~l)h@|!hnH6+8mH7&p^Y{Mjib`={|t%`wL
z7m&&z?^j>bcgfQrFme^O-Usd3%95yI_J;FCz}+N}u$0!M9dF!*Oa7*$BV><!Ni^Hu
zesT2L*2k}09mS_&g8VALF^|-Y-(X3mTXz2XPoG?qxIqPiypaMX_WVdX|5jBIuO^4E
z@6R}3v+?hdk|B40<K6b0YM1IDr*e^fcD|#J=|lT{*?DO~`-0bni)Ho}0DDQnZuzG4
z>J4zg7lhEKaPm!|P`e=wK#otIe|P)ucrUMqR)3xcnVgsftfXr^7k#A{6@$4d!?gy!
zU62xV1(U+ilkF%#N_pp=!(Al;C<fZl>?y2R<0LjHN-g=&R52D6QTpM~LvAPgQ8~#b
znoDcgJQ;XH$IM`a*qzeBK&*`=AYepTb$A~QgC>|AHUsTERmFr}H1|f9RGa>4DvY5I
z5^SACwmy+|boiqHgu1vlOiAVQ8EHKGFtus6_aPo^(i@<fI=O?R_$R_&Wba~r+2tKB
zZuwhJJxB!okD*KbUoSr>Iow!3TA}VgTBdhuOU`Q-T7AoFkB>d$fvaCU{Urcr-gpzv
zP{fM9yUS=~NkB^RAu3;?`<aiCt9?Jwz}RXDq;iAG#4i61;GsPrk-~J2_{cH%BU7qI
zofh)nKN6>Q-0-M}s?ptYjF&hrimof$iR>32Y^NU?wAc#GJke=*K(!KFKx&G;0|?*{
z_f}uHfj<k-9s7o+KTwao=&c||7(w6tqD4!sQWEWyJeO`dPprYmEA*(o{D$t7EmTS;
zPMNJN!y`up75=WsiAbsHz+-BwMi=P0MRoV#T#z8K#&7b<uyJ_mWZ|4=W-{kaD5P<L
ze0`GKaBCMMY~SI`{be?P-AUeh%2Kc5_>DBio-@Y_vX=*+ne3kX&uI4^*My+Lfakbd
z81E)4=t$7W3J);>)lMw5##T#uc=`iTgF{jU2fy+pe__>M(F_DQE>4n!wDZyNog9Bo
zisF}%W_=+8C3BPKuZseki*om+$GzXhk@^>aLHTH?H}Th3wI3{|^dqm&cGwsLL<@L_
zpcMUR1L(47@_Bde8Zo8oS?GN@@r94}XecKJst4nNYU6A~x?9;gLo`ca%37Mmb6o9>
zilX^x6&RBk`o_1l$*>T7*d`?ZSb)W@nx|s(>Z|XcQ@SX~VW7jZ6#LF(bX6!z>;{_;
zf~{gnjB5WIt0wzF{`gA{M)LP4+ZJ&U+-nqvgHsit0Fm;C49*(GFETx(6@E;!!Kzdv
z#+M9Rgx+2qh8u@N6Y*&8IfXF-c&QMdEc+Ju4KOpE>92WPab)~$sG1nBFrkJ>ZVp~!
zV`w|&HI_#Kj@&clO^nr}#6y3e0R?zco>ePl#Bd+x3o$wu<{2BCgMqmQiJ-VQ_{Q7T
z^VZd-SVp<`2kvMtqcFJiqvNaDW|Ebq>Y2m+1*FeCqBnug$(Y#HqTJ%SvRDaqoB1!V
z()<V&;mh}KU5_CEfPD|`{<T7zw9>)OL#4Yvp9;?Q+?!UoLivYKn1c8MA?Oz?Iyki=
z8yqI_P3bp-NXL^0?}DR>|Mv>8`-dyAi}L=3y#UGzFo@rjPK19y**LyWdKFlR9|N@%
z?3NP6)p-F;>;Us51?F|5Yu7f!{|*nt(G2#|_sfA>0120nt2bhMQ0&m1tBH6Ey^Y;q
z&rW=T!UDdLZ^)bh!bbAL8!Mg!&CJ-;n`ZT81>O_$hi<HxNbQIZA(+7Uddh#v^qW#B
z<(m>HW7|rV6yW>(&!#Hykp4&B{P>?U<bTRqWorKiCj(B4+6v@xyTBQai(>#ile@7u
zKSq@l=W$G3B^Y^GZIEZ%4Fy`xq+u=<l62Vn2ZKP`v<C4P;M*JkudM2X+<c5+W*&)g
z?Zv=7{xm$kWFlm#O-@o9?8Ftb2{K2p6s%Ga!IZ%NHWrPTb^~4dP9I~Fg0H^}nA)XV
z?>^b%{=xZOT3ztD7?PmW;eKIUyXU#tj^;z?%>_GDaMNy*;FXUf7rF1A5X${}!>B!l
zhtCC?yWd<GPBU>!3Or0NaVFlu?gis>R`FVo2P_G}hy!;<FS-eBd(${LkjcM|tvAZn
z2N`B(HPy7U{#JLV0i}z$slr!GS!x%&2`7%*`i6^LJraU|U7~TbIQj2ETjO$Mr5nA)
z)*oC=8DX{aLRpJX#69DZG~|+4f=}kDH3K34dgi}qs2P;DH#K8MlxT+CN4zlCM6E@#
zly9@Qb7_Gb(9TVX<!LV;i9FKfV}anAGNmAVhvAVyTYbS%vS9QBs9c8eaM8$!v5ASj
zTmCY4f@<@Z;{Zy*+i1SO)U{;8pA}xJ(W{+%z>x&IbSzs1+d$^=i2qdAv40O^@6mkf
z9%^R)0ecJbC%@(Wx`#jS9$|ST1s|Rm&at`p@61W>z);RKtO0hx0}yOSoE&zLmtl8J
zBR1ef6JW;Y><0vf3YC{2tkjB~m;z2rrcQeVJ^_1Rl;#Rvl*lt_Ml*dH=a3wM=ttyv
zr(zqd(|)(vN*Htvy%wYja9><fvz>5-Eso|Evf?ayRiG0Jb9x`KiUBrVyrLsl^}BS5
zEYhY~y=wNCf>gcLp1zZI8ceBRsUOlSQ4{03cwArb4Yhz-*bSX=y_62jZJZXzKkLZp
zFrG4%d@j`<LG#}baBJDgGv2rkE4uDDuxNiTzDl%!TI)U00m!WQuURY*S9vN_)}V&}
zmM78mPNcN=u8x1?E#Z{AL_g5MwpcyBv=ttg_#gn;r2_F=jq*|j@RQ_vw&JxFee`W&
z&lp=*F%uAQTZJs8KphF-jF*0pVn3`h;+D;fB{U>CC_Idx9PTqx+h;#@CRTbTcHBKd
zZ2f25(R#yV;w*D(<cH$tokznQLV&X*x|LUD+<`-O14)>jaA1-w>=P|H&+uZ<)#HmA
z<p?C$u6P6KK|vJ<+aT@c6G9HMhj>TknST+H3Us<tlBB@!m7fv6*p)SN)Qgbi2s<)T
zmccPu<n~o{rdXdiW_xUjqoDx!W~<M#9deJRDs_<UxKu0)07qbGGt|v$ag*k&?fg<1
z=#pm~%>jXpj+JcyE9GSXeb=yl$G*Z+D!!ew9_KG7ptVR?Gw~vO|41lBk2I||)RHCn
zJxUm!0)bG-%>dCaU5WuFpZ^95HvD9i!~`*!$uSiY6QL`d&A``)Zi1MJy(FK-<0;Fk
zs+h?J$-m@}Ya<V~uU<=)9+O$u*EWf<&mYedatlMsPWTq|kqaFF>XJWN>uE;0mfgs%
zg!|Q|Q<;0|UjKIN$y)KVcG$~|4I$M<cL+_|ABLsd-!kEv|G)OWJRa(`{hv}uNZBH$
zl_Ew(N|;I!LiVyvj${wnMy8pvO|}%36QPK)WGUoinaSQJJ0WJP?3qtxd@{@V-8#?n
zoRiaee!af0*Y~gAPcO>MeBRybbzj$YU)TFm-<9H@VLQ7+CB&<xu-I%n+ur)7X-dc@
z<KDzxNsA6Nta2s{m{Sd4%&ED%(Aq;<V8Kzd$W`ycl0O)Pk5QDTDUk3v`F4U?x8KLQ
zmzTy~B^dQ=$_{<k<9j{k0sey?6h@bM8n*>*BEnl*anhuow#z7LAx{A?-xT6T(K1zr
z16t2Xx}G&W0)58dD1K;*RjN}rNr$5lPie}MVBW5*tK}BpRE-K?avMn=B1C=}K{fYN
zCV@-`QDu${@>O)X_DloQnifg`bbzR4N+RNs;EWnjs0L!M5_8AN%xmy09h4rDgDo0$
z5Uh?XA)?9$yYW?Zhl(dyh@lG~iUEZs!jo-BEI2iIRPPli+}PX!Z}s3v3$|r5D`$K`
za>0=5p-v8-W&v+UU9RM=Y@>H@)_ocpKE@=>I76f+#{8wqswaqQwU<W=Cl&WbI;%2D
z$qH3Z`$!vxCU7g5uag4<SCdrrU!z@8PL5gGc-D}fH!8l!K)0U1;l(yC^AIX|8=jvM
z@3_0X&?<l8$&>SqcO)l=g0fYOUs#3Cr#tne*q>Wa&yFU`08M05A19oB7#V}L$e9S!
zrikLblH8gWX2xb(An{fQr7CK@56g*rql6>QjB<pwk}J=aJb*v98oLuQwG9iiv6^#`
zKY^#jv0~0g^CX7B8EMR{pq+i<N)w-$;)%Y7&{5dM;WdFXF9Kt)q0G@3T(0+}krX>;
zP~I2ASeX{WSXJ@>SPH3RW?AKo726qZfzIN^Fr{iP!}OI=^kH@wAk~<0Yy1TJoAic&
zkY~<*Cbuk>w-e3<a?lfOnpVyUD6Lg3%!!+-uQPg*bDSAA!^iYiru$Ee65*N-I<n3G
z61jrdlyqJe1Xd2e_}FFyy9UycmjhuZjOWneoyidx5r~SJ`B;@Lfw^3T7va4Ji&YNg
zS~fgS4Fgp#1Q{JO_DmyiR@TYW$x-M^Y_*FvaNvs%X4oPDg8{H{IP2oOJD!3ngg8N7
zPqt#nON-T~l&7m9<rB<lxvcBCUi#9^N=5muk9B}(#v$#de_vAbi7bTTJ%<~4R#nPo
zhM7Y9TeZ~RhAX}Gt*W%PvOn)*;GLQf<7lG($*%I34CB(*6QNbJwWxh4>KNk61Qj_<
zZq|4Uv)gzgJzD;6&@8>0el^`KkRZ#3`7x$+!a;ck7$GYJZ!s}VG{TI0m`5qE%dW=P
z{uK))z*BrX=*d$ztv4B<E9JR~U@YHpmB3)4u*YBm5eL|7wUa=$3965$4-Yq^fh?mw
zs?L(`Dw|7=c3GC}lxzn+oQu`zH@c$hQ-bkmmZpes)bd@1j=uM{N3FusG&(Em9qk|L
z3o<EJI9jW{Q5tyan7<?@tO>=mtmQ9&iD+i_6)pUmAeL1{7c!U(tTIKZ?rmMU+<Peh
zqfj6y3rzNE@+!nSGTZCyFKqTE3uC29G5E%;UGb2bzKVB$IWp=b55$Z#teW$ao(JWQ
z^@tNV%^uy5Ko90io`_n>^Rkoe)WE<pPsPQqm0<LAk&L+DS%Mf=-0eO*%U<r(t0|wo
zq6Kj|WF67F3!`rQ#ZN;BcR0mVJ`VfPtU%u#GFAze%F7>eq?^AyO58oat7<D8w$(fj
z<a=5@Vo7bdr6|+-!B$Pop)-yA>h0U@Bl}-+bJR`Jxik=Hh)+bU;mLYs?Evm-J-i1x
z&fZ}*DHy~`Sz(VeJ6hg#e{mLRP3IA)vulu7>$bQNJ;SCLWWk9c#8rFK_}8kG)yWd#
zqTZuti|>yf<8>UB8XkUr6kThqtla&{C*7{{9A&4Q{823VRPEzDtNY#Eor#ksfq5dT
zHNkG%OfoVu?ad`Ziu#JitlR-xcjGr=(<;=b#Ev0*-Z+H^;<aLI7jFM=sJfOYSJY5M
zv%?rrHPit!D@ekRf%9mY%c<xlG(Qj%fiyEWmTMRGKrx&^tB(0HOBmIRGARc~nK5R0
z>o1g}P<CtxS7^kWw62@pyRBy{xx4-2^&qzbXPh%f<%dZ^3%7wd0E_a*S)k<1pF+*y
zIkrG!Ce!ZnK>55x9xz1gaiDyjEbvJj;(#mo-1S^g&ZiWlel2KBO?rPTH|Y7ddh!Tn
zCmxg=Y{dgVk183I8%)$$NRJ1(xMb?c?3GbfW>{-kLJtNF__)9wz-$C)1kl_fEr&)A
zF^BEoJ$WzhmJp`nn4h-0_c<~ITPQJmKO0>LYxCCJKX?0jNj^K+&eLy{u>h{RVxty|
z2paBLEF$Q<SVYiu00P#h0FIj*Ooo-SoAE;tj_`F5Dh?Q7GCdA)(@|&9gE=5*U>Wmx
z_FAlUJ|Lri=pQvF4oI2@06&T2NlJtJV$47D7R+CzCuj@DT%>gf`6$W`Xx95%KvM0$
z{uaz5YJt;vcKT=zXE!VixqPMzHk(;DQ7X#_xJ_^d4b&l@XN7Qccv_ebtTvk96<sZ$
zQFp=_PqP6tJ;xZbh(rQ{VMfc4;AP!_EVUlZh>J3<VH330CmB}u(O8yg`sr=Atdow1
zu9Yg6+_Jjy=JBJlwoiMb-<w#r9tM$}T2^Zbvp102Yqg)m=v=emPP9a5))O^T-bF;r
z<35tx-QEJv($Yx`>BFcpWST9cFd4@?LnwoQ6u8M5ER30KZ>TH@F!rmXVGr;#J1ks~
zl}T(W8!~O<?M`DGei&6y>R=-p9UiwI=5?DHOv_oJy+g4mP^lpR_Zn3-S_X+UQI>Oz
z80=XI&d#1Xi3cDk8oidc4A+SO?wYt@zjgGZ9Z9}e@zMqwWoS7K-z4he*jZ<UXENIC
zq;>i)yA^r;0@nCYeNxL?nb6jCb9oV!#a7MMG2T*M{kfJw*2N})h30`gPjk^$)7NUb
zo-g*q-`JWbQ*^`OhQr778vGV8>uYg?xb4i2WR4NFHXGF9?&=LLk!cAVmFU8TgSQF5
zT02U!xU$ya6q-^NU$<#*41~5Kb?dHj>(&z4p+$**oV_|^{hT8(`Nl7ZJrN@zcVgR`
z@J!2Wi78f=Y@a%wnn`9_c8G!;`D+gFISLw<qQ1czNsVM*;TX_nLecV`c4(=xR9aRl
zF;jZjwtG%oaneDwEk4-KoMPV@JY1s0FeLZ$m~aVX@RTfolL2=dU66q@S_HcXK1xN%
z0*6u@-J_?<&l!7wiUvM8O9-71(%&5{FKX}HJ8-O1Gt=ONPG~~Dw@%e~_1>Y7nD_6>
zho67d0;jy75D@E15B#UUWXukU(35Tz7r?n@O#SS<QCthE+WB!Vp5f=wOwwe!BvD_=
zuLsF*zkV#`I2Hp!26D5%kQYv_I^N9360&caRltP4ZuN6(;B9`?<Gf1iNzQw#$D4Px
zoG7+Z$=vsZ=0zQMopcQ^?4DPSYqly&&f2EhNZM>u&`pfavk<Gyu>QabW|+el%9ApU
zKmDR&TVf|g(d3R>THT8!8kqa4$z}ED9d~w5%$+AbMGoPeLD3<Qf{wQ4-U7)T;+F&)
zFW6`gT`guGgts(~%64n{um?5A%DZ(HN-<m7fTMpGeQ?06UMKcpP(DnUUN{n|r!X|;
zR0Sz!(mDd+1O`F^v*k!tA9fFu*hY~>e$+at`4#*!WeGdy`-JXI8A3ZfFCB5hI>M>U
z>Zas8M>DUL*}8jOHrDkI&00T<@KdBW>Y2=2aAGoeVrq*={BR2hfdHn~q)><-^5lSK
zVLos(_>JcP0zq-iasv>lBI;w7VlmhN?pKd>;w0hhCX}RBy?I~Wqi#^+FZm!edQT5T
z?3HNS)y&UcA~R2Rz{<p#Jy32Vz5=BFCc5bMlhk2~4Wt-HBa48WiPPL-p3Kmw4{mKY
zX{V3k#;$W5TgV#J8Ld;u^=3&^_<34G>bFbaD05QXfW9R5FmohY#$_|}R(0Os)N~PX
z|GwU;-aFxPF{;)CP1lc~i~NX|Diy_6x-&YD;*;(v<;vDRZ@az!dBkV)pgYdywsKe~
zI@hcwc;c|hDwlTS=PxHL;GdH1U#nR_8;yL1Z_oxC1i!XU8&Kr8adR0k3PQAuu6zV9
zR;gT_98FQs-(+@?nrVcF#ABvI272|SvAdzx>#ho~YiSm1CZJQ3Ix+e8+w(ziLCb?{
z7V6d0!2Vfrj(Zv9gXi-G`8#0A;*)wfE}dUGV2isI4|_-(K;XPNnSRlYb~S;PEdeut
z-O>dvnzWy4D{dU%9Jq(+Xsj76aXVu&kuF{BM^`ZSQClWIV%~4S+c4kpa~=F_0q-ER
zSi~U&e&=&Tfx4|cX$%*{4G|hgcf<VZ^$eJUX79m>+cO6mGp$((u*E%2P#u$yfcMR`
zKgvz}l6=J)E7{B2cnZ50pCorfQM?wK8;G4&GA*7emIqN4dMHqCvWhSx%r-Y*yP+D&
zhHBdA)dY?z6|l3LzzPiHun5R?Tl<+6LuR7dP;!_7MmwR!2Q3TOa)o9g$Ag{OwAa<t
z85A`dB?b!PWg(ei$-2gjJ_z_Rs=R_tBVr!ke%`t+J-TRLE2u(`)O~B)yLoK;Artei
zWL=tkRr~hhsuRh&48iRUCYA|zKHd{Mo2GfkCOy$V++la9`SW`5KL4K_td7C46WrQ5
z%Yt#7GF%Us1-3!jO<r|J=|V_#vd4%xom5IvXS!tPQoKeLL50a;IJMRKBu2YcTVH#(
zGyI})flW^%Jm8@2H_+79VnQf0NZGf{fRjMDotl#`Ij^>d-o{zyeNkpc2LDEXdlWTo
zMzvrB)U?Dr@-ee$SQv{28`kqTHT930dh0A|IzMt;-5E3bKH1VvX5oR%*`t-6TRW>7
zcYi1(#N^4^=TEZFmYFB&#ps%_CUyG?+q>_~m=5ZYMiZImMC4WWUFZs%9<x+>t74e_
zsLs~-iG4BQ1MwBZ+=gM5a-)cS)a*owspNg;>yUf1Aa>?&iVvp;Y>mfQTpeUTjR0BB
z8c<pTB7wXDYhMEIc{-}ctOL}KI1wLdmmSRc%XlyH;LD~_jkM$SAoC%a-W)39urd!)
zZrGt!7H1f_5bZ!sJED^_QPal4kIzmI2C@U!&x!aCE%dO?u2tCmY2KoSd&iHoRXxTJ
zuN{ZH+pQVaOWHQ|*=MM)g&gBoQdV~e!xLlHN9&Yut4K-uB|;=;)MpW%2vBUdrU55e
z#VHtQRVCoi7HBT&?@BWc8gzD!(5c$~GJQw2{EItx_NTCJWa{*OE$Aa!d6|4Py=55~
zXr@IZn@jz|*oHAXZ<DOOHE(({sH^a~%T0#>4lsh2`g(s9BN3VW!w3nGS07IS+V@~F
zZ%TPl`;M>r>cK4L0q=I}x4PHB)m*j*{1>YBlO7xiN1o+vwibTm42W+4De-ofP}bXX
z;`tW8c}IXXmWN{cBbxzLww%5y){&cyMKKSw)u3Cl*I?K!9X<@aEXIvG+p?JK)X|Du
z+r&V{ryE6cyy#_vNLB@ta3#3inyy(h+uX4dGudGPTR;)Cb9q3ulo{8Thi;V-C$rox
z$TZBtN@*M^Vk1e4o27-29l^aIuhAk8PFZk^c;Z++*dG)pi>YO*5t;zC&lm%6dhkyo
z#IUG)n9r;WpfeZ7v(`ZZJSiajrdyy}bTfXQAZDNE=;8~JmE*d$bm3srXAE9ZL7AX@
z)Be5g(XRqz-|BBBiR_K>#!PRm*zjVX_w{BF*fIJdN!Muy&E2Z<;sauIlJMu(#a1pu
zWhgygwbe&eDt!)H=bh5BQtU2Dypf(-y>meIynWq%mxamMMG6CfPux_pw?!mgf$T*|
zqEtRk2JvehP!tV0{e)T8kR=1JhZ5+@ts$3j0+=>dDdc|zXd&ySKu!dR67#44*)@1i
z=XK^?m6@uBkWd?;%Jk)kdGFgHhivaC-l~giDIXc0UN?sy0;(UQznIz41`@)d(m3`u
z&VCT?=Z(o|(y8i?@-Jy9go_(Mv8?%MoDQb;>8MQl7#rQ#38%FU$;1R_B+Fs!Tl8hB
ze@nCO-WqQ~E*{H{2Od%sC;%||dddu*W9x7v;hDS<%v0<^IipreEN%__B=-%b6XR&a
zwC8M1SHBLeI|eCb-1RP@IWITqa$y)$^}M?E_>+vN*8Aydh1>eJk<ZG1JzlKV)Fz%v
z4M5v9(rg{(in<E9_tMOFU)>^MS9Pi}%c%mB=Mdtq>+B?#nzYR-PcNWX1NPO`_;ODt
z;QkY@ZOT@=qP_Yn<qE#Odw;5n%P77O+-LvX%6yj-dWOdAKy&Y+PLqd`R7#}F0e?x3
zFZVj$GdUKwHuYq8ofYIb1yo#pQA|_gkjP|>GJ&I42KVJASoZtLu}!;lca*^Qi(Y!=
z!x2_}i8{WN*0LBf6`U8Az~#9Q5MktS1dDM^yI?Ich|?jUtHNSk`*{a=eToY8&|Y96
zu^w=r;5`qhtx`p<<2A^MRp(DAY9_Pld5xXA?qT;T!{o#-2X`o*gz?tp@MWL-7$0_@
zpZ?bFVUwPd<DGezd!`Ylk=^U=H$Miabd{gZsH4~P=Dpju?7jBVODiYbGI!WmXx9dc
z(0DaYj!BHw-H$KvR@_GF)YZ!wPa|#VyPxi7kFM>m9QKh<^07{*(?8Lj4Q$f|Np&d^
zwS`$9!sQ&}-c}}kv~00$dbw`SJ0(w0MZY6Ub4;hxe_ea~#}}IK36{AXkBT7G!4a#>
zQa~ylqwaz5+K2L+9(W?$V*~Oi=kIvG2+_OwB;CeiCVWF7+k%=6ZmFWqfO<|BN^gVG
z10eXOg0-v~{3SxOZ?%Zk594T}X4Zj<*&H=sPo_Q1ezScl3(eVe9Df>o=ps%y87#HI
zEcaQ~LuePEN{jtf6Ad4)^6!~_qs$ZPU}HIaToNdG)`OQTL!AMf4mc6G7;>bC#rXp5
zKM3w%P!y0HB^H@!qi1c{`2{4eXFtkr>=2*KvbX}WlrFd=7fy!pke$4aT<i`qhycsJ
z7{L(1i(-NnYI6!1(D|g3HPyCdGxin4Sr@>@`Vhmuo_KNC@8a;;tbRYO_WRXcJ~_^{
zN7W8o$ZiB2%)omO9|E$lU;%g;7t83anrB<HR&di}@iee~8}IPm09tZ!c0gb)Ptg@f
zficj}RQ7?H+rwknv%FSzD7K7%7#gm-xB%W=+2Vy;1CF*iwhPuNh0=uE1p(9ZF(nSG
zD%cX%uSvc#s#eRa#6Pl_Mh7AZo8VGPC3=Pck}3q`G7G#kqq#)fHm`ndR%BN$Gp02g
z+(OAjllPFRbQC|nF%W2fMm>TY)l^7?jbsS`GARq#DvGhDi!{tWf_sbFOMyf29LdwL
z?<Cr-K?xaUaEjN#XAH@+x0hWa+oGs%`~-Ot$%ERgCgAg;Dd?{SeVeBvU?QHj3Lys0
zJ5X6j0oO90VH71e-NRM{^u-C}O1$T3UORA`iskr=mA!}S(8*U6cQU>3&pISP8kXDF
z6Z&l+ig&9y*jlciK@GXv*XIcw=`m+!SS%nHL97S&AB9@Adg6v~278mYbb5ab;0wrJ
zd6rq0P(t>l46-Z;qx<@J%Am~beqiIMHt@um%E?Btb?%II(`Nii+{TG+%oYP^Me4ZO
zw%)K%@^zqE7>Qp2L9F`Qr%=_rLf`{x1G#F_^+Z5mApG8);lBYZ$1TCCvMmp0i6t}a
zXG|lgYVr*_lxYh#VA>(=VWuUKq~UlA1*{=7n(RZvZ<x{<i_roDt~t7?OH-&y#BPyB
zBl4+FmMU|Mvu=u^SVqlG2%`vKjl`4iXN!H%8@yapxpz6D4@L*Q(N)L>zdeqy*Il@l
zWd%oo>yr3R>>;T7a9uA+WLyT^2IwmSy1KXrFt0`JLC)id;$id)G`vj^_<RrsZtPGW
z@>;TQEPRnRyBv08sRNR=Y;ffRWK(1s*0r0o>I!KauX6zCka6@(N8YMHW*SQk$^sSm
zhE_Hm=_1KEPJoy<U88X7a0w`bU9DbS&{C8=?dRBCc&APDaH+NKk%^w>76#GeBy81W
zvhO6xyQ+Lov$bt~I+<;dyS*!^x2VUc&0xjl5?|}$%i($Zjl`oHUtHZBp69KV&Po=s
zk~QwM0pV_qm>0In>MkZ)VJ>r!8!da<ES%cG-v)tP@iI0ZYwfx7V|yeA*@gF^Y<P<S
z*DA^QO2tBdsnH6CI08@)Wx33^*psy3Vy(wynxjZ9QW!HuOK@^b5QP2m1~2-t0l^t7
zggG{&zyMYZP;Dt7dT+W6QlF*S&gBy!pEjI~oCgSV$4J5*O$@UxkSMNA_m2-g#Z&V6
zG%;kDTk2Iqh?&NQc;#W%@2qM+JJoo>{t~k8&=_eO99Ikznr^8vgLopQfO&dvCr6cN
zy0@y-h)Rk2olh34jmc&{0ugl)f#D2P0z#34E~)Rk6$dS+jx}W(_A}@!@2N*GgkFDO
z=AYjIp_}Az318-?6Lrn#78@{Qtq#f@$%rQWV|-(hwpnVELmE(NR%3#wO1Xkflz5ko
z83&;@R_1K8e{_Xu>8WAus1SLPGEuBHBy`Pd2$e`V8D&++d`A-M#%i};Z+gQHk=?0V
zNZ)f|&c@Y9T-LJn#A_zAMeF8^y4<<RH$@l+OnSRb3m1p+?GZPWoo*y8&(N&$a-Gzg
zi?NHfHIb63D4S8GLhJP|yz%VNV40&Wlh)IHPUNjO65gqx^Yz^3s^dy4ZylJO{HB+J
z^Yr<)UNG4GIH2xKfxE$mU5mV~%kkf8YZ-^2*jm{Mptja93e?t`&_aPXme9k3)%sJB
z45T;UlP0`gR0XOL4=Z;}v#E%9e5KJ-a%`4P*B1ER-U=!i!Es~?g4>__1|=L%IEgMQ
z?QGGMwHaEO@!+IU@wiTDTFHceh{xJVIm0~HLWlLfWojQ%Klmr2L`Z;*fv3g28!Q0|
zN#;Re@%mc;3)lvfr#Jy2N%oi>taj$+=xXJRQ{(~5O<UWCmkyxfeHasD{`c!D)A5&q
zoaeYw>kqflbrqGmNZT+Xj`~}eejS&S3A5Nh$gq%#OsW~pEARmU<%ppk`+hB-;Sy8Z
zz7PvoJA$Q72Hbgu3t>;{$-T%@PKa9H`<eHz9yvKXJH6aGzwX(mms_4qH!oiRM`i2>
z;5;4&?q0xqb_@x4&jx@H6>SHu+b$aKB?`WV7RG-qEqkBMlL`ZM#x^5B(AOR!c>A-T
z5l3Y<0sZ9&3b>U$ELh-r;=|A-RADlF&i_pYFIk)g4z%XT#mJZT5Cror<)JoL5it!+
znYa~;>5b`=i=ikqdo1u{=WGHFV}=g?q4w{5dwcPXrHfwVBL<5BcdfzT4Djdq6)a!)
z_5U~be{Xc>f>{0B8z|}}G}96l2O?YA*>0Et)+x9FQf<sy)<XikTwB2e&<1!2dGX~F
z`kNr__)4xgdkf9KZC1cj8|^Vr=SDY$3SB0`bn3`MhW$QYch2nen;~7|CV<prP~Oc#
zJjQ|2fPC|r5jI@`5($#8a3YGa=KWY;GKz-e!@&1V+BX155io8x17cN>5q&<X&VawB
zvre8?i<lmEb`agnpP!HrA$0EJEU|{O1!zppkPq()VKK5QaRM;dafZSAT~$jT;7W)u
z1}(1d0qgtg<Gbp3LsX@BD;ne~%@{Fr(XcX9!MYFVh)tPOph{0<$c(93C*CJ%Ijyx;
ztQNSz0Wo?8)j7xj8fZes<HVq`Sgq{Un9mKP>ml`<fTp#1=qN{f!R$95!sBpiS`$eK
zW6cv9Wa~FT=ZBw?5rArR?IDN)US|?o3V>ce;?OZ;2#)>arihLWllrTeCG#B3wzlMi
zvoD(*r0+q<whPfZE3XA}l|HhYhhje5^9l<%oq_=J8@8o@+nXzkVQ8<xX$Q7b6@pnz
z0yjcy!0iu|iFyP2nc>}-FC*fRQ|u^uxWv{ZZ-rH!Tu?zlpIz}$bIbEATj`rSC@DV1
zN)q$lhe1??x1UakTb_xZW>54$y>z<S`Kls|poL|XkRWHRw1S8ovy}*0vo?S$tf~m|
znpYreK%3#;=pVnJl#q|f|JV)6gERf3Dgi%%Cgh%ciMRgwMR)&jB4-ElxFP}u3a)Ue
zro}3+?zeC2MjNP<p&*`c19Kt-&~dR3<3Rl|XEa;W`z4i<fLNxhQA1>D#e1b2v95G!
zu_Yq7H}hN5aT%;^?diSSZ`~r<tlENjHTwf$ll+5`IXCithn~CE;XSD^al=k!Q_=N$
z@5;YS_7V>rC^}KN&-#7hyE_G~Dk{4YZQLC!jko6RJaSL>NY1V9GM(VDQzhOn*4+4d
z#~R(+#?jVZ9BU&28-#z+4=qLjf2aXj&D4izIejsXFT9^|fD(n4)|bONJ_2+SO^{qq
zi#>idHjL)4I0T=aYR-O~%~^+IG(8^O2<Xv^i&}D>eELC^lno0zbrOb+iwtPobB<D(
zn#I7F;r(bO+ZH2Zv)9pnvo7E)=RVLfQ}-OJ==+j@f;Wm!cTkP`zxc4jeXB06VziJ~
z;daBz=}JjNpLF+@{?sB+Wph0oWcyq9EOKY6b*$=bgB1v6aGoyVN3<Wu1Mdo~G~ebw
zYy|j3ia~KaMGdCb`++v}RSJT%QF&$^HCdV?)1M6}qy-5$AuqYs4C1(=a;LBQxhkc8
zeUVAt0ca;qo^XmO7;_&^xDO*5U38#l%-6&RT?Y}r51uexE+EObl23?_k8cxSL4RQ5
zp=Er0ek=I+B)}^H0l~<C^X@?>&ie-hsbl<nAJ{Zp&Cpm=Z$F7T^0xj>?3=A7j~?wh
zcC)5fuM$CxOx?IX(buEGwhv!+`Ao+2hjM}_VYP0@s|f#<cJs)REIrjJxt@|O!GgKB
zCsnMqN6sm5IhK(-)Au859lU!AL%ujZ8OME|ds&2L+3oKOpD>tA&5Svm>G&d~a-M{4
zeOSyYy7}?-mm-lj{cr3R0(`;({oe1=^bASdv^7vQLtW?naPqs1na71UH2Us|3gmCL
zo46h~tX+S=ehOtZW7Q^=XSQa7^+6W!DJ6FgJv_?XAy;nwi*8nrR?mgiQKL@7?Iy9y
zUlRgYJLhyB6<uAXNT}<%j4O2EgpCn6Yud#gpEAP7Z;*Hq!?*s0#<BIpOH$!ttFjvJ
zACnyu<{tQBG+|{*obGhpjnciK{F|SO&z-@A+8343JEyl9xsU3-3_q{Aqokn`8ah99
zGw@U+GvgRqQ(jAeG*B71W1rHsWk)I`?;ma8Cnl`!jO?h0mW+A!<~ED^A*x9Jc0}<A
z@mjX7p)$cJv|*xxyy?!${Q;$w7t0A@-lsNexEfs6-pjNpPkhU_DrJSA_N5ecPZNLs
z^;-le%bqPG=Jxrgr@eQ2)l!m}5<R7KP(mRIV;{fqeCW7uBR;t_^jb>Y$@{C<{Q7R=
z(H$$X5=tflCflQhc5`Z?LKrGuQ~o7y3v<rZU30<Fk6Ey_6F*&`9f_3P`s?L#_tp1>
zUHJCa?Y&dstLPLPB+wyZUtFT=i8?QN@pIf44bo;zeNY<umU8V^K|a3Ue_zh`{d#-D
zO-g<M*4bn5z3yM@E!g!JUw7-^a9{UtD{l2GW3fmL*@LwDxG+!NMKuG<t&)u!Rl_Fv
zRc>sWIOv(XmNDY~^-~J69c7Y}c5?cR=JMaE{A+8pM(zu)Opb_hJJ=sM66jTVx_p&L
z_+v_*ZlFbYP_d-_RA;_tM!?G4prFH7{EY7XruXZq@tL`^mT&!@tiJCENA%p>dU;GC
zj-I-*H!iC|yC`yFPUTbL%lB8jjxBe(pxGK|)8F@`kgKv2ckmGkH2v?L^O${*caxuw
z&v5yln<>=GJ?uZ5DbdQzzj3213GiHQd|QR+Ik)$*(#uDwgl#n=!gY2j(n5Fb@LU&%
z%#h&*`8q_0%qtl`>r}my%2g_!yRYuxZ>hRgq%uCXaPiCGxX;Ih?R-|O(2LAX_yoD0
z`&1v@MA#;D*Xi}Dnz_+;mhWI`xeu*Fml{V6hhxL*hE&$xQdsj_VXo%YiUL=wuFs@;
z548W@l%|XiwJUGT9@^RaDi>3Iv(>rtm7Zs7q`|M}*oIbdA8f8U<L9KShq7F5JJWU=
zc9z@i{Ui_>dq@Bpiagoax-P77HSv^OkLMdlpPs6-sXM0}y3`K+8d4?M`8h@v-S<mC
zt)ha5){UTBwYR7V?OEF;j>(1nc2bX5vUxcC<-4I5CMw<1!8eqI^*=tm+vB_T3tg7{
z$>u<BMX>&riPr-5yVQ?M_uOsvwC~i!eY8Vvao%<NOvRp2sjZj9cdjt5j=jmE_IEB6
zM_*1ym!^&epMG*u@y0Stc6TR|=lkVC-0y<&gx~L#K9E#x+Q+kN-1Wc%Yb`*%(urW#
zu||I1P6jje_c_GKH==#OJNJQ5l;9KMKjZKAZTgUI{^#8n1>(jW0sifG`M^j1-~Pg;
z0OM<;Y|N|~M_hDXp&Y5;RePkp)ag<4`lK*{QNLzsxu+rQ`NP%yYH~>02F2GA8#53^
zy9c8}UW{+(;y)zp>3Y9=ctd@u%!HoFoiT?`-JZ|7BsQeWRBPWZxUC^=*WY`(ep#A8
zO6c+4{X^mCJ&Bob4z-MI+!wq?yVdS3A#a49o?t;*p}X}=yeiY_d`F-_eu%{pkNc8(
zk9d#VMXoI~5SH?s)lfLWVa%OYogcew_awmX1!790XaC1-j`68)YQ@dxTJybDwihHx
ze6r?{usAOxvzu3)g%2K8TJiZs+KC6pmxsFFU_E~kDH8WY+}-21up?#VPwOAH96q$e
zv1B;TEbZ-Acg&5XWnTlPkCS7@tIF*mtpsRiQ`%I3+3?c|^=rnz#%0Ql->B|l^jx?c
zzv*7j4KeEr)k@K3XA`d_%T7*W??9E7nYmnxJ9#(q&(ta1=n|GkuRU<EONqp@Z_)mt
zlwtH@g*agOB?<8Ht@-N;bqxsk{|ofbO?k)0)PGoGHA(ypUIXiYBY4nGAU=N6(d$i*
zR$dv-v`x4g^E_o^(WzQAYAAjkQFLXnJ_OxPPQDM%1-U&^5a@mV`tx(oJ07P^iIpR-
zKeOp9x=M$JN%8$a=?`B;PUF<{anvcQZgc_vo>i`aSp&Nk*1MhT?>x5Q`fv*I{Jrbf
zd&}KRi-Lu<OR?Th4MX=U;$4{=8dOfm@YZNpNO;u3<U==AhZmUlFpy0-H7223vW9<G
z=J9<8ia11(RV@RmuTz@I;r_upwn%KYt644PELBlk`o(t!XK?a>-xd#pja~f5^g_rZ
zp=t>&>kJ11LR(t~c0cpGMlyuHrff92>7QtKDe^%?-0uO`SG&DVd79?1qdWo`5|s0~
z&)c<fmRbKyJ5RHGqv4TlqRV3RW#P4F4$M^b@~@z@42r!NKODOt-X4+ek-YZa`@pQ(
zGo9Ur^op2md8tWlvPTX@5#PKsy`U`SYH-S8#ZB{*Yk9)QAJ&|!IBc{WzAD*Fe<E43
ztIGI^h48T*A64Xxb(`9Vn-I=kaY;7sai*hd0`I>24K)~`7OPM%7}4EU{6eM_>Ly)j
zbS*(_^Qga!A7IIhvKvR932_bY$bHSf#V?uZJ&TfqQ*Legb?>vBTcILFZFM1)17EJW
z$n3g(we?E~p=Wq#YMdeg$^2WU^Dk55`%}%$JALbTJJ>VNR{x=P{JBaN|K(`le=gYl
zTrk=uJitB3d2th+F#R6D6Ar$&0szp#u^{r@(<{L?M4xx}4MM6f{#^Q?>73p)89bm3
z9^UxfgStRLT2!}#e7(-N2O<B!`1Z^0#TNV!&CO{`Wy25A_5*$X$7s9$5lv-}{+kV;
zUACYLi0|6APxueCKN_`Y5P;!v4fgUs_m5!omxmbEV04PW5KDdsb{h=XzwL^@!Ym!C
z)Xqk~$4B`1S~m*tNqvW)DE3|Fz75j1ZY_nl@~&sO6lla@(4+4m1g-xL@ok)!LiBZ|
zK%{>S@y_pBVPX3hh~uu|{vpBNP1_$Y?fu6Q`e$T&yPrYU{vRNt?SBS&-%lZ{eK#5&
zCw~Tcm&Q*)e?QOxr+x-{|4$)*-)M}(&miym=PmQkb9mYDr=WkT(cce8oYT)B|Fh8e
z=Qdw={u$&w|ARK)cKI1(&3{Dp%0X{Z1fr=82+8k9;~x4i$Y!oV!S3h3+l7C;^v^T6
z{X|sGb|4<gR`c;~{%%DJyZ#0F-~0QUtoh@YKbFN(d7+SYaAyxN_&R}<`yMaQ{r`gZ
zZEq~ads!*i`yRB<3Akf6eb>Hdum6tsZJw6G1^kk7^Ec4EMIrn>T*0~j4);Upc+qKN
zZx(2s;NM!8eg410{ZL-t9^JcD8Jt|>8cSP;4f_k+zo+@zWc+1<mv%OzZXq%mw9aSk
zQaC#PFK|DY+ixpzDO~?+j1vD!KEBiqOX13+|2v%4;=*1E=h7Jd3kpm{4e0RqQ^Ab?
z?{MGFl%;UL%0A0q1K_;D7Wp3T+|~aM_w68A3a730;kFC#0=E2Z?CP)ocern-$5OZ&
zm6-`C0H^UcIES?V4);y2FNOPO8CH7d2XKE;rv8}hZxixEmFfxpW6&)y$>s7(^_It(
ze;@~Yy8F2!zy18nB3|{cy}5}m&kdb6a-WZH@f+x$fBC|JSN`Kb2=)x|`{kUgm#?~a
zfV;=i4lZ@X{)uwUnEyZO(|<fkOCMkASNrpEzT<y9zQoJ66kw_E>Q8`?e*t{w$y)mO
z_imFvALiq`XU_lU82sCbvh?xqZQ8#+ZeaPB$Ny#Fo-kbjCXA0y8T=UqaxKJ$AN=-z
E09zog*Z=?k

literal 0
HcmV?d00001

diff --git a/docs/diagrams/mmpb_algorithm_color.pptx b/docs/diagrams/mmpb_algorithm_color.pptx
new file mode 100644
index 0000000000000000000000000000000000000000..f660fb96c1f1725e69d7805ed657b1dd53643e76
GIT binary patch
literal 85453
zcmeFYV{~R+vo0LlM#r{o+et?qcWgW9*tTukwr$%<$4*YV-{;-?>~r>y@AtRQy~fBr
z=D2HSuBy7~nzd@yl9vJoK>+{*fB*mhAOI-PO_ZJh1OOO@0RTV-fB@1IvbJ(Cv~ti<
zcC|6I*P?Z?w7|~=0V2-^0Q!>uf1m%s78p+&lkKBN5P1}T6WXUSTfd<$rhvw!(nk6X
zNSe|JHq>}-n&Ul1OhcmVGaVahLCEp2?#>;VVS9B)qtXMCo+K((j@+mYz{wgi-xtio
z`=hay`Zu`)Vf+k24TX!->el8dk2KBGuiS>HWr&$fo69}K{NA8J-@;5E+Y)}WR)Qu%
zFGQj3wu=XXs0h*N<z8vOd-o%H1cQK{0ql;<7;a6B+HDI~`rFWCX(pd`zobESc`pi4
zwX&bo(MCgLoa%(;6D5M7<3mp|^<F{K>McTxx2UBi^igdE4xB0cHe}y6gX5=L8ip%?
z<?1FQdb1KF<tKFU4aB9nWhIEILkmu&n4|;d@#3D~2B`?!(Qr*qTythh(#o^nsR}tJ
zjM%>uEN_UDmO|7zcGwTTv~HmHs(>IvR$&vUL8v0)XApMZ9&PdY{V1GO;HGFED)C*t
zjm(-=L3#e(f~3SW_)MG3@{#FZFqf-aPdjJiG-T0v#(1&NVQe-$=LMVshc)c5%wL!X
z#^{JW?6A&XDBBhy|LGX3w37TeTXN%6?WWH=x=-C|Si%VygoGAgrykK_lMm$T*jJ#B
zUh6IZBoV?kbw%**`~~38PhbG~|H1+$PYyYtUo5cu739!gETCg&XkkxB`{(cfi~j$?
zA^*RrULLz93Cw^Hcm=veFz(5=MoBnpVJ&=|G5i8xz}SYp@lCvNwzDH=TThS1w<CRy
zPv>UJt<E{mr6}I=H&mhutZzHq;EQ&T{cCkIXbfgY4}*)MO*Ty0;f?;gFp(YRs9%IS
z3ThTA%=oqJXgRoDav8!%k|cVp5Ut*+q40Yk&xnfF*6^JYaB4{5G8U+*c_n{+kQ(c!
zm?oz_^Dw!icztk5G)dzDP42)vG8Mf5U!nuAlX3voKKJx#J3sgNdYBm3WVXjb>5}-n
zHUC%_a6n~U>ZRW%#|^iEZ@QBQ)4(tz>8Rnk$!iH-D=!(sMM^t?{VH?#b)2{?ZQb*K
z8#M+$i0XU3eqsBH9Q?0u8yg2YdkfQ_hW2#-3ZTCO#y@-L-%#?8>M{EwP~oe9H@{_f
zVy+;?P(vL-{AeXxUfr0*#$X!A3Z&Dnsz_E4w{)Dt3E8=K+Bz2nef0J~jqmuwX}TGB
zEbFgh*U%uwcW6i35`B^|TJBFKUM7P-1i4#=bLF|slkFn6Bx_Pk1QH2i=_V0oeOrc^
zOM?%6h)R@HJWJ~h`P&Ra%NC+58^LC(YM4K#;QF><GW=X=_ixjn2I}e@`dGN_d-Sm=
z%%knxk1C1{UGbFHiZ|9D;Qu?)`2VIp!~b8^r)1(?{`Td06%YUbJOBv5Ki>bB+imO&
z?Y~keJqJ^3tN+mci$?*!vZ${K|Fb{k@ja6L^j|#UdxdalHTPXhgmohf!W}_)34FM5
zjy0K-Af(LGU8t;qN~&WY>FG1to7X6?oGmMZ7`$HAWB^}E5x-bc3vz+pWrOF(I*@H<
zS*TP+v_KRb>%jBM3nt0qPK<egEpa~iQ1<=iy29xw8rLO~rF=k|##-bnzNHW>-Ij|V
zv)mkN!z~PXB;!b<zHZY3Nwk)=;K65|qgEKwmE-ef&JCHQJef!ULG&6JkDoy6%tRbN
z)E#rbj5Ng5Ba)5>oe&jSd`b&Q>=20hv1lVmj7X5n{$$?!#UMXx>ee3S0O2AfX(Bu}
z_gcKb?9Pv$&5<KE2v65P8&WDpe$q@FQIh)C9DndjFT#WHyugIgeL>>?-jr3IpQ{zW
zHssQ;)EwnY;t#<7PiUPdu7$4su_m-9c+Ohv{c|<xq@X4cv?hei1E7&52?N!bsN(F;
ztWF|z%{GbvPk=4;*>S!+Id2piJX#tuV^Wp5kXSHhUJ_|#t&LzIi7e+c5!sMge6#Hg
zU6Kw)5xnnNd^E4QNzx2C0tS=<m^o17CkkhszqwU#)S*u+&r-S@Jjpbt+2g^h5`RM<
zXkG?%X1UQY`?h*K$Gll}SyMiAl*!U0;t3P)HNrbuxZPJSdJ9{&hIMHw+8C|Ph3UI(
zKvqP<Y3($kR(b!LmF8WPliV7QE~GWZ=vPk{GYvogsb@sbn!Rau@=oSSd9MafD8V@b
zC(|hNQOAuMC3;;l{}7>^6gy~&^{Or?;YeeO7N~$g>X&)UtVByW5x4$${U$T-i7;fO
zTefHcPgd!Cp4;>?x?%<iI%&x|>-oMe-rK_-7&36X$jSt}V~l%;G3V<+Ns$QP#ouHr
zDQ^T$x7PF++H5#s3E@X}t<s5gT3ms>6PWDTuV|7zA))NXj;cBCkRwmgZ7F{<n$N(j
z0V4+swQyai&6d9A!$9^rNWua9eJ*Lo#wyoBCAW&ls!Q<9+jPL!ypDq$W1X5F1C7$L
zPQC;Y+$k9MSMN2w;1gz~$u2Py?34#J$`BdWGfZRpb9#%<?o&Te7XGNO18elN3MM6-
zJR@LyP)i5yA@;Dm?XOuxd5lAo>}M!J@ISw?0>%%_c!;g>(FzR!j*9UI!gb;tO1+H3
zqJkfL0g=npKF2n~F)E*zJ#CSZ@+K6?-7Z-Fw;1&0+*)xH1pr`w0R}+*2ZR4?aeu%5
zvirH`n%&;_*o)T?_RsF^GykDx9ZHSN;!-D9&fW_rWydM5&|G3%C1Pyh1eJICs9V4z
z<f)|e1a|`+fL%e~XzJ*d%W8@ckdTS-1}4?YjOU_H7q=f%owT>_W^FAgZJ(Fp#$yxu
z?rp2<TjQVCkMLV0w=12)tW7I!ZR(rjT{MrkpHDm9pH+Mpo99bSOzm!U-f<h_c%k}_
zhp$(UVL=B5jh~MfRxLeSmyOQ`-?xX}K3><jUvX4De|TSU$lkphG_H=1EgcQDd$~I|
zX;6Q*Yk9c-u!3@p@o1TZ^0;~LGiy>+UBj_&(CS>@W87Uxh`Iy5w753)l=|Gb`pBe9
zx-+@GFY<UaeKe>2cpF>Nxm`I$J7dYRtQDgkdwacsKbY%}!sdpB@2u*0)7GMJzja9B
z`}kdS4tr_&;J`Gr>z(JlC$f@upv0g#tV26)jSlZ^?sB=iGZbj-U9}02#otA-k<gyk
zw+1<X@rGmlVfD&<d+#|DXFP!HCE@I8<9@2)UL$<!c^}at`>hpM8j=l3R6AKz@1FF1
zI>gU`H;?OnogM<{rLEKFe1dihosXs|ipkA}X(}m9U$!jX^W9ZvxTv&fx{7TLzU)5L
zOYV`Xi}mHMY3sNu>sn`h^2!xVbLI0^Dv|D}u3fEbtJ3uOgyMB$jSiGnQCqXq_oI3f
zJuB4ujHEI`PUO>9W?QFular`O-Kz6-h>U%r%jz)1&)R`xeBUVneOEL60x#_G%n-~v
z?n)S~Jq2~kX{^lxP{}Q=s`lQaag241$IF5HC<B=T@T2UbXQ^HjH7p2zWzQ@k?B!`-
zmtgEss$ANXC+vMVso&YhJztZ?%O&5s5Me8-{f~F;8+9sG{Ry^QbA+h!mi)DOdkmk{
z)WN7B#{nM-&!v-bTQu7G0!DXudo%TT&JLt8c=3;us`izt83p0}h<G9oU1c$fGHsao
zWPa0EaB5%-+6q1Zwb^En-|rrC%dJ^2m1)|sz92n5zaKrGc|5TA5xtXtsA_lI)ZZID
zaBXge>4EMVCYVp7)y}I@WP+^gI#x(+7u^@`uDjnz=!vC8jdA^AvKfs-uhOQ|r2h3<
z`8Fry)E1AINUhim*Ae3uQ2m=lE4tDnC~ba;mM%M!f-bj`=DOH}x5L5JIfzcJtGWHG
z68%gy*m=`xJPc1Y&2rPh*nC~(dGhAj&S$?Dqi+6p)5n;RH<R23{x3ST-5~sMp10^V
z?`Mlo!!FHueAAr+>NRTI#^ru=5!#9i1Y*<I;l)ROp7;EN!-pm6Dc;3srOn=vN))`-
z*lLRl$@=Kmjj6Q(Yj-#3v&SM=wbQoW)CYPByCEuUua3teYKkX%Iag0PY=wR*mWQ_;
zFQYU^_*=T4wd>6z94lQ|O_YeFqdZ4ssD)5+K5>fOVHqM|?R*DobZsfsyw{I6E5FN2
z+0<1Lml6qtX)f;-x1_2O6<i>;%zjg_-2+)9)|+Eyjcy*BG~Yv!GQwe30X8S$f&I$n
z(_Ko0ol2t8Z9Gr@_G53P2kFjYrSl8~aiG(Z?P}2L+`zV!t%~nyq{bGM@=n{BO9H<!
z={{+w%gXxTil~7H=tn2FA{1fc6aV{*?oxSsNI%h5giwq1wUakf)gTldfEa0)E9xr?
zOy{N8h^yEBKu8t<@p2S`hy8N+uXpv{L)Aku+THtBqL6g!-+}Y3NtB|a5gY6eu0pjV
zB$IqcMTc45zlcikqk35oFC6__D@mQl)KsT|4M5t0t${JmeHWmSi$rIdWwDS|C9iSq
z?59wv%W=*Jd25}B+w13A^pf)pq0D?Y4$Y5~6)J>AjQK2i6ViXXyQzJAuUqej7Y}|`
z-d{QW$Vu+lvSQU`ZhRhR(^cwx!Ut%iZAC{<Vt72a>R5e8ugQx~O+sm2a<azqC|?M%
zedJkTVTjy)p%Jcuks<m$HHDdq+k?h3CoM)VLMOwNGMu}vW0@Laf@&!9(sNj52#C)*
z_JeU6Zc+DpHAjMS0v+#Sc_3Dt{-?54=_sV(fDdd5gNpLo+%FI#ZlT$7;Wz@!$X#Kf
z(;>1cAp$9chro}zn51h+u1SY$NCK;~M)Y7$F=1e`R26LAdraiLxbm*)7t{KffJ`DM
ziFYWsXWNh+mnVNS{x*UXtk;mh`6#*0Jy(XkIEv#H;-b}-!Z5j3xAppq6waFvqlG{@
z&%ypDW_1ri#??qU!4jsQd0aqQyTjKa_x^iiQaAE6u+r#R4icS^S<<#%)E=h{q=Q8x
zyK$8*teCMSotY-$?@uToC20=T_UT(PoLzGy!=!bL1H;G@r8HirOkct!yU5LZkxwU>
z&({d)KE&M|Uajp<6z97`diF%Od;hG?xj(dZ`265p77DJ3y_P-;(aYLu#>9G`Jl9#y
zd@7hnox`Jq3SWd-yYMnJQNhj(cn>qF8;#{H%enf#=h3Jw*b)xjjN{KOW-?V(y;~GI
zDu%k$AdxuWXy9F-ubiCcd6V<j(dm{K8(+VoO)jEKo-vfMwpT`8>8_<-%f>EsDQ9J|
z1AxAC?-~b9pd-#dJ9^S{C@T8OYj7&yPFvZvy2}X{?T9hEP2GZ*^>Ugc-6@^oaouKT
z<Z5CoK9LjQaq?GHQZ_HuL>MaU=9&9FB^{plYtz}MeB^jAI}BE~>GIXzB~!^5S)^fd
zT9l05)uHDQ9;YQijJz^fS*^^5AG$K?RJ`{X$a`}?uS=vv0w)=P*;>6kPRss_$iJNa
zM<fhSOw-@Euc*_Fd~YuQ&p0V{p5eKO*FfJRm6{jlX77Htl`omZdhK$WKPK!fgCtIh
zOV(Y{R!a%a&~~F5<_LAklbMw&4LWqUmD71snb!+-z_E!+D)HzvDBHu^ZcY{cU|Vmr
z$8RlxsqYYMFyGQ-%UJpww=|23O<V0xwH5v^kp9Q=RK|f5u*_ww9)G<)va6(;cn<eH
z=06SLkj!MB@i>mdqj~ucE?B6nRa#o7Ty93M{@MXEYkIb0qg>aXXz1UtJXMSt=>eST
z<GWuF!ptI#kW1aRL?Ckuyf}zEGr|~0#T^E&FV;?{p<(Z!^;*X+Cp??F`tnb>T-;z0
zt=BhJHoLd^uY2Z`VfFgL*x7)iHNDiPsIi?bj@Uz0WAi3Mxty4m(OuMeC!xN}X&UXX
z3Euk3)RHyNDVjFu4E*qJy<pKx)7W5pl0qGz$f}@HWDk@U1?pmka~K+)($_-$?}0b-
zoqD(=q(yHhqBT=18V~adkV!11@HBN&ZUc72$QE$6#8a@d@~|^@*xVx#gq#%OOsLCo
zF2@G}Fdg}_WU-&$k<$#h+&Ty{7-~7h-$4Y`R?~ck!)7(``T>+uO<_n~<M9eQxkp(&
ztU2}cZ^7b*XJUtN72Woq6q)P-S$FD;xz7Lx6LKaYfJ``Vx;B&tfeloxGo8%_@%IAM
zx)Mi<JWD)WFeuYfvCjA3(xpJC@R_dziWIhm8*~RA7yJk9baUD>66Ft?Bk_!c>1Fnw
z>>ItKaoQiZ9`1lQP`-z+GU&Mugm2hkHK)96FejJ9K)1fjtVcI%&-HH&lHMMQkTssL
z(9HvaZ}bmmoN)Q7(;(J7=;py-)`Gv{2D%XXFU*uL&!*+AEZ_6EO4_pP+c3XMrlM)N
zGMBM1y1}#dV%+30J#5e0iuc0w!}7c-t@(DK>}?<dPNSXcaShI&$c|q~!OJcCK7PK%
z9xVHdZOT7>wx*CJ_wj1ZsL|IkeSc8x9dZ0#&YG*frPZjmIlh&vx8=sWW|($?%g32{
zI~npVc>bPGZE&MhjoV`wSJj*vBeHQ44PL!d2hD|-<{N4Gg;*zXH%*e~X(%3RgAIO>
zbH9$Uc|BHp3LV?l``(<#cTJzE6)y?RgNZjTx_rp{qz2L}bMsj=LYywjIYv19F0CA$
zA==6SWi0n_w^Q!(>$h$RLQr096exM7ww5Gf34as$U+_pE>Jzw~X;}cgj33@$$F>1M
z3pK_mM?et|Q3J2{j6y{6m{76^ewzz#6*zw91;;=+&>HEVN}AvH(t{9alEjX@mjXEZ
z)t1!2i5j@z9pCUF-yj%ZosOsb?gBVZg(;eY&G|E+`O})t<+rM<kkJLS!q%E><tbQ#
ze8UI`aeM~DJn3?f6w~+$6p(ZR__U@)RiaW|(VJ$eHwUaOr%&=CDEzHerlmqvv!`t+
za?ZG9`L!ki1-2vNUb=rAMl6wRl&5S*&fbpw9_;v@SAYNuq?{bT+7_r8p*4MACPX`i
zdpPEuun?$S2ADP-$s!716SzQ3eknxKr)Cf#<aY3V4iSwAKK|jx*6~w*eDh%2&3HQ>
zrCq*~0{2m((&!nDy6WtgCTVEU?ES^Y#yMnxC0jWDE}`GG4@XY+6wDYZm@n{LEusR!
z8Gg&LeP|TLW}c%4tmm)F8`OwQwDjJ;$w|SjnD6Jnh41^v%@do)Th$6ymc~0)mZVfG
zNSa9p;Kv}7IXNO#XGK!HDyrO7BMO<JHSb;*{tj#!`xt9j5Dj^kNK9OT-pRuNnOn{i
zV97VLI~S40PH@!XtodHF9Tv5>#8j`KlQT1~fjb#YZ;-jibO<Q;k>9l!(B5-!#GYlG
z(1(l~k}};74a@?lEV-rhJK?@&yFs|AqGp`JKbE_BRF%6f5`QRU*aQ?!!Zuh)k<F(J
zz_F*;wY6;dU{`&&$Iwz%Ua;4d+Uht7V>ewGMPPDZ2!?}X5|n0h$d!M{V9Hifcn(sa
zeC2fnNf{HFLDkl<%|b;Q#feb)D8DH(xg!x&tB|SYE-95|Ux?#oyYU@%A`xIwulzao
z*4H1^|5`((FiXXRh_eIbVgYB0eNeNY_9XmK@}fUsAB%%q<kDdkqURxR6E#TLytf{X
z+#d^IYcK>|qtv4qE>$zxC1X#PXzGyR%|(WAMp*->tQH6`GeD;kWdu?_4H9b^J_ag(
zwww=Fya*}0XmQ(80?olD@=KL*45DzL_zH9V?3*f5jYJZatC#N{qGBtGtwn7-BxKFf
zw=@tv>56l^SMfFDYoX_)cdKhJ=*Fx}<?@BTrEX;<Blm5|LLlzRO64$#^YE41y{C0Q
z_;iL_iCk6ryAVgub*SIc>K?&Vu0DG&Bs(-CMH;_`+Gg#P;vuhu1_3t<KwLyQb%ELe
zdjY9bZ)^@Rp?n^GglL6mNti$eI*_^y0A|JzVyI%_3s?Ur5#+*m13ptG$zbm`P;C`f
zogD?!T=2U^Cqst6@MX>F<t7#;DF7eMaWY$vt;b2=s>#1dAQmHr^3TamRaun&?IIY<
zy4y*tko}X#wxN>u$Kqbjd$fTCF3E#iVqXRlY5v)Bg_9M1(X<a2iM$ep24>AS(IYz-
z_LbA@8C!=mG)GRc<%#!D7It-cY;zW0(X3Lm9hf`Y8R(qzT?(mU!Wm`~<*f*V)Vsk7
zBvKH;b;_JY+@NT&``VQSgjJU#0xT`ZGXH)47C`*ky!V~zJ#dA1d0=m?Hsj?Kub@P?
zDgTG%BPgzV`g7$;rzWX@VrT=FB_M<dU@qHju#nlX7VYG3-*UDik>bu|RQXuCiw!`z
z))jWu=1t{Q07ER+5u@ZdxBzL9?Tsh~uaVg9&rX%w<hyezX?1kOpE+l8DF?pmjy1fS
zxUBx#;IjL+qBkM6N1kw-45QiOd$mOPB#U*&Uk$ScCp(~25BY$c9?I}b!+2Ce7W@nW
z2R>81K!e-WzDH0)3l69Nyuyj&*GS{AFx6_upXWo_6<6Dp{hS*y&pQ!HIT?-tH#G4`
zA}o^OD6O_k&OJfrdSi+i?hX`ct7xPirm1VeW4vyNF=D8T=X88jh1eclu5yl2@i8&M
zgO<sgAE_4E14ql+Dc`^S5XeX@h6)J`V%a8^H;j4J2zHvk(T%?*uOA69<&7`yv8j4f
z-K6@g`L1JfL=;)(sNMVai&Jb)#h$}Tg|-IBBkPJchg}*L%PTiXxdsWAdZxzIG3*3M
z!}ngP^Sw4r;zf}=&*MWt+)_*IhZb7T{Exia5x=8W)6&Mf`5>SI?TG8xX)%^yfZ-tu
zClvo{*<_Au{)+bLka6f;<^p4cCB(cnRgF+Wba*~q)H={iU+qrVCH`?rKA3s_7htCb
zY+1N?_S%UDh>JifHlOdC0vcm<=bGJ=?fc8WjMLTim-HQ!o}>UWuFJO3r<dOD80RHn
zqIDS}fp&ly;P6Leivx#zw4-yiEF`Q~$Xlry=7}hCxC38%0q(}_rD&mCKHAU#okY0M
ztLPQ5xdIq3M7EU8WyY@sC_WakUj;1kfbu&a;+64pG*$1BP?MqePoDujkdojVzyZsD
z51W=(J&q$kHTwx&*wDF$ajqor-xS!0jkL4bc%4gn9G(D8aT5S?I~3T90GjMU!7t?g
z(ug#5M+yT{L$M(UC{_*wlJ+1wN$9!O`j&@ROH-vMr=byt+?-#wx~H=k^Pmh(72gil
zX$bDz_YBTF8&p`hkB79B_dL;DOChW^wLTB6`!cAyr_-HVo}5}kR_=(vsKJq|S6GUT
z3CAShtEiA7U1++?)c}UIxiIe74iHqT>a<q2;gnI5mudr<Kqi_(r1DsTwy-QCg5bLh
zIH6#nvZ73UBW|Id5*uLFpwrlSLzp`AW3Eg=4$-k2Ggm<_-FDv5lsRqcaX@bDVlNzb
zecb+%qL)X6k}N<ki#{n%?p}y6XPCrX@Ef;&_C=7O*AE6y|KAe67YWOS^u%JtI4*S-
zTLZ`i%<-0ZT4vwPQ!Q2*#+472Zq~d{Qh-DOK0;qlAOz%sGAR}`a7&D1b8~L*r8|Lj
zvMWH1uEZ3A3l$f*(O{kBB~{%D*;TA^?xlHew(HXePRNcWNusKwcO}kJu&Q!A#S(Z5
z0?UXdd83haRV+X-aE(>12_y7LMiuRV+%dJ)WV_u+rvz{;dDkjJ2l|4}Z5v1p2I&5#
zPK49)83T@cP_oC^Xa{%l?gMV5fHeh~kftf2LJoy|8fj7N_djawYAvxvQGsf*PmDfT
zbquS1%uf1}Qe~-dxi@8xIM%}mwV=ALvU9a1$zMZzL3cqRY~q_&fU<Y`_0_1_>i<kJ
zSZ7##0cx(Fk&FB>z3Rd3(|m+XY)K|gKcQs#z<5BIteC8viHgX#YDqh~g}iw}gavLU
z1s{z361(t7lx|X1_wuj<xn(}lLaeR;iB_IoW)~=|wYQ%>=2(x_qg%%>Pb13CcBa@g
z6Yr?b)}y{u7UlVbA|bRmP$bUU)Me!Q5ObVCuwOMCTB<b(k${2{SO5MEVz>SbX;(nD
zO=Gr_SAV4dS75eHIoY5e+Bkqiz`szt7F1?`Fb91BsS1T@T7q1T%y17M2%ZL+X@WXP
z79sIHQD{rf)B!hrlQc1njsbs|p+SY`)gNteFxgM7>PR}`=}xiZ8{rj<4Tg+|+rZH;
zipTexUmD@O1cEC$d@p)5_u-3pQwzVya{E&|5zFc(d&u3CylOH7UiUw^tj#h4WO2OS
zM_av`m3b<z<`$4HetTj;>?Vj5BG{M5P$_p%NDI#J%iq2KMjc~X&tP4HCpYia6VMwG
zvTtJ>m}{fSu_&Cc?K)byH2Sa@;!(;?OOtblBBt<qTFIiR9q7C9_wRC3Xzc;KcS(|6
z9IaY4nLl|Qx_8hf$6Xt_hWv$@)2V$`{@eE#1@xj%cVojsgLD&FnG?<2#CUb$YSF-`
zMzmP4nAR8ub6Wo6EJBjA^YS4+DLxrh*}bCb@8(qt`<A8dO`403eE}u(21~ip#RhQK
z*!=cB|3cvepE*1PH*q=6^UtA7;@!ehn$H@}loIz?clmTjf{aGz{JLKFq{$5;6-c4_
ztC&+d`_(<kKKLtmD8BVRUxLo}nNBktQ^M(fVTMR%MAKI9dDV(C`o)jntF4dPsTDjW
zK7cX#_s}es+3_nrbA{iUpHtd*iJR$!Ll<t2QtIFrFyE+<DyAEFwDro5aTj;@L4-5x
zGwS+sbehX0^qHW6t3*`;O7>hOU%go%ojTifF2W2uy{}=?BK5xgT&Dk3V?$>XhjC6#
zYn+>GJkU`}=iP~pFj=ii+AdahRE7B95FjXpuy1;1t2;#zhLV8gCz&v_#z^eC>UbR*
zwPMEP{)5-*)_MxiqmjD0iB|-fh3cMSY2I+MR4#|C1o$~ZdA)F3<cz4YAH<T_^KPRp
z2Ru=(w}4i@WtowPenwCe<4M8SEfPSPsL!+$Ojqtcp+eZR;g|7wYlOyQrym+^#4g9W
zN9m8wZ8|4DYxJ%Q3a^Dua`>9J)G-~a&WGyG3iD#_YG=^1AUJm@L;B{}RaFJn!^XfL
zgBnM6+M)^mt@9U!cuo!hdDLNjWV*Ax<5Rvkzr}6U8mo)um9mNz!SK3wH)?4wTXAtb
zDFj3uqh;yAcw#!lfP`&Lm?|=_4OZUkpRv%+G3bhYdv{IqUd+0Rf{-ZDx;#|G@Q>pp
zOrDhYQ`jS(S*M!1rZ%nFu$wnovsx;1s>Z^C6Y9(b76%MnC-C)GOw)v;bArNy{MRJD
z#VD$j`)QB`jGJWB#G#%&m?c-!@s)mr+x--NxIgmrG2ab11@$*GvP2PtsanxZc5c-8
zY;MW;Q&mrM^+AS7d%d@lEy(^`XtL@DerAXy55;4+(C<(niKoDu$fpre&-1E&HdBu@
zOq>p!pHTE(p%?E1PNKupjn+<EgQXw7H>*R@oP<usFHf6*8my=LdiY>e<k&1{o++Q1
zWTc`HN7C#KF=A=ctaAF42x5S+nYZvM^}!+3mZrfTb(s(8kf)!!C>|1hNLO!|VPW}3
z#RuEQJg{N%Gw_UeX4qm*G6UtcWK~*bxO;g)h^ed83bZIC!6*!SV)WZC`%HOiO_`Z)
zxYnFyh;3?&X=>}tFpQ38Zj4Gb)%C*2v1DuCvSZ6~0mYVR9V68JSLF`0jS_5d*^y?s
zAmu=mh>7_&MAS$Ref2EcCvQI)vyF6(<g}hfA?_%`$WZO>OCGV!>>v4)lP~!n_h0e`
zNoR8{B$Jp$8#|ZG=bS$uEj!KWtG2?mRDcc6TanIlcZwR2Vl96f*xZ{;T@8k7Z6`{7
zoSKk2w0b0MjV$yX3r~Qv<RLfp8=aS8VHhlt=%P1{Dx&M+p6E10raH|il=s&uQUXXR
zH##LM|3Z<(u%#oSM+YaA`%bPWZHAIyeC#(Z)BeOyIbBxengB(j6SuZl1F*6>p-AY1
z$EA}q-x?U|eXb=yLMv}Mqn@LTirI0O$rKW2MS|qm9a*@EdXCZf&b<st;1+?HA9F)`
zhs;-r<74(6eTC_=EWtv_in(|A&|Gc6V*L}`6GVqvv_wR4;FXmk*n1cUk34>wM~&bv
zWad`}ex0C12&K4w?h4O$au&#=Y4#4HGoP7E>*#X%R^FYotr6BStM_vG-W18io}4nU
zn^Xcu_EV*W$>nYkvF^&!U;X|tl%tEiO<pZ@&|8s(?qJ8SL&%*9@au4OSV@)-rC@{M
z6D0JkiNftPIQ+M1v_#l`mJjqPF{qO9m|E~_;WI8cMkHLc5$Mh;u;Ddl8r4!vV=^jU
z92vI3F)PGiRPmLU)yml;x1x1es+f~dBr&eqSuv(=e?X$@2?nT%8f`P_;;{NSi<)$8
zH@F?W(bP}WqpRK3GOEKuAK6w(a(TLcs^@!jtMx~rO5WrBQ@@ymopO1~vRHIsX$-Ac
zR|kK=5Y8VGxef79-AJhZ8Dyx{ZsefaSj1#h@s#i0y8!k}booJw*&~)xImBF-A@N!d
z-!Vo0xTw^r7pkD08i2JxPwW^~n^U_JCIBfytD-kjZ*9sqLlV9KqfTI6+0F3x#A)tQ
zt~?Zc%N}q5(xNQE9VR3rRlIUX8meR%@6NZE5r=qY-TEVJ;xC2i&LFu~j8^(h#1Zpi
zYDzFMm{?VNm;Ey=n51n$&P$=hlPx=e8ad8?rs+8VPn2S|eHr;rI4q2?_JM$7rKlVj
z2h5@lYGqi|rMog|uMa>Kj2Wcq;dF5186~EbzUAbkOyM)A<;?eMB*rC`8K`_L+b8U)
zJPXNWvZXFGkJ@}E*D4YWnH*@s<NF1fiRb*R0tk5&t{TlG8g$-SAn!Z#RY)C7uQAe1
zXVR9K#Bj8`r|<C*H983YGI3`*!ykkH>&Db_99%i_W=K-R#K_}K;50@49TDA`H)tRN
z-#C2bn1txk8W<9lvvVsY6i*`r@glxhmcy##8S6V!oa%}Y#q54#fqXA|&f<FuisS<p
zWsm3;lpTD*rR{LJ{Ave9=BA}}*9a{*SLy=kD%gmRKS^*r^7=e;ow(9f0X-6L&iE(w
zatV37BD&wES@mrzswDG#Z6elJ3{*Rrl}zG4s^#=Uk%%xp)=nspt-<8+Y~h;&AsJaz
z`qZ5KV_=4}fBQgQXkazbY|hu9dZGc1EnG|WmzM3HN!X}ue^d>hKveN!{&mq!o4n0W
zRJUyOWk|fWZ_c$7wP*BH@y*VL1b1}XaY1*L!b7jL=Zws^n12y2n}cG0(=L%iJVYHH
z_y?!50gC1Sa24kAm;288DutZ3atU=Efcu_BJHt>VRYSEhet2?&sZK!^P>o}9wY<3e
zsO&V-9beD^QnhxFIGjOL^bx*&EKtU0jel2H1Y|gfFdCj|E*S3lG*k33vJ5Yfzv-Qq
z5!|NH0^CFF-4}EtCrMpzM3H<VRQ6!Z6)bUtRnMtcH4bkMLV}@l2R23g0EJX&?A%%(
zM;DO=gB02M*ID8&-I;~GH`CYl`*jlhf=$TZ4q;>8Do8Oi?+(fJ8`ALOs0WU-Vt}|9
zAv?xRp~G?Q>^nJ0(E7EHAf$Ow;%PWY3aVtN^8v182UJ0RoMqr2>R|zdRDG_>9?f#7
zouF^^F<Ka2o`(sq%aj^+i!8n&0yr*?@A(3{dco*b{<56mbWDG^*3z#(cn{|kD~n$s
zw)q<SevR!J!PdsV%x91mC)=n#YeA&1Y8Z}hu#}*yTAFfDNXaMt`>nZ*6o04r3~IMh
zIqpct2s97d<a-2%!9+JkTP0{4GtZxGtc=(1mtnx{0!94nmmd_#3;bN){~`VcT~#tw
zc;s)*0vaBwGW@-^ODz3=T*FDLs>4J-SSi&_n<vE>6)8_%klIC8092B5J{1SGthBj1
z4K1rmKA^qdygX093h|Ebm>;V5!V|A3W2Fg2;(=&(hM|-*-xydfuOBd*cHjA=?4e1g
z=R#L5FUfLWsXJ7CanvLG&Ap_EpGZ=xq?x&5O}j~uf@ePx9z{}(D*(lq_$xrZrq=Mp
za~J)mVU2l`vhH1E(buF!zs`KHFw;_fY!~ez)#bVbN9<ddjTrw#G6U<E3k;T6bXEV6
zORWiWC)YB7Cz(3YPvbF$Y5ek<YLWa6|3E{S)5!7Q0PT134t1TJ7*nH`oKZPa_UYmB
ze+-MYOZ$gc67*)|_6M={D!<m7bfMwMD=fl}7`Is<!%B(J`r?;At1S_<S))j1G9@or
z4_21?FgR&JI<;v1=r{`RSy%!g{u^&rU)IjU#+YO?K$mlE<L(?gjvdeQ3Wz;2`_PsI
z)x}y}oQ#9w&p|1{l`KIf{-SxYAFlbWvG!!)f+3v{2Yco6FVz}&8wP7>O6yhkqIFd(
zFGiJxm0^55KbxeU&!XL2XE^iTqr5;L<$HbG%fF2BAJ(4CN0rQUAI4cim8^GyqL>tq
zR{!FTf;?17tN&z<Nw-RZv;TuRmU@@JkbH?@!Su!MRRCvBUxQ`~`@awaX=|5l#3W_6
z@vEma{If-!-*B!NVQlc%nMJlKei2BX;K<y#ol*kcAEQ#AmqLoO8@KzW$H?L}!dX6i
zF~UCyj_p!IzgB9dRX=T?N|r$PchE0;eFeo|#=Zf&eVJHRVfn(Nc~7Nm<tDaY*`Gb)
zFg)W&2iyOJRM)>q&HO@YEL3x4)Q(j%Cnx^1=_&iYt&|?d5HpXF4;lY>yqg_d8OtG!
zIKvw3HykYjrwxADpT&3bW}RSnWOB~WdCB~aSy%bH;B)f~ab}N3w<_=LPX}d4u&$}r
zjlx1|*a?@@QvG|d+P>gw2KE^|AL6cc9#iKN50_RC!f_|5e4SQ0^S3($SR?;gq_{oE
zMX*ahKbv~><qdF!>iG0()83G@jUwreSxujM^hLrqjC?QuQ-@333L{uQX%oJlw}Jd&
z(}8$Kpeis`UNtVOkBSImBzF-y474C$e?D<^alw|F`H(d6VU*b_0ANnn4=+G2&DZpK
zv+^<K)#TC@fCgPP9M<5LY>5WHww^lkU||`{nLe`A-g+S&D4X;FF17yNuJzWKbO~Y{
z-$gdXxjH0B#y>_3is*#`5!mY>ZH7xya^3SZW#5*3HAvvT%vl8mK@v!Fb@|Jp^ytb-
z2;Q*s<?mCag&ApHdv~k6ngdxD9YOIzpLRyrF_zB(;csl}V2mpI$}oQdMT1q5IC9^<
z>9*=HWf}zwFKWXC?9sDj5r=?2Cq$)Xj3tqIS|oNWr0Otgyrio$-3i*8Qzt~NiW;qc
z6w}NRksrhcF*d@RA};6xP(s7<g^X2`A`TTJUvu@bmPC5RXHkpUo(b9@t_WpTKFaJ7
zwb<&k`s2q29T9O%Ky8z2!>Yridqo@;roLvJV=sx^(IV|AgSJ7m4O7`7wpGuIe4c6?
zZYi9)G<l}Fc2elZmW>AkiUs6LAIfdJ$GAP22G@gIpe1j0`{`En8i872$oqMfQa&r3
z?uEA#W2)F5_NR^-Nr9raf-FQ8PtJ93eP$ZBr9B(9sZWC2K9dq6a6DXIBmr4#es}q)
zE91?+e{*eH%f_LCP&@HH;nDJZiI;U)x2B@K{eT#H`L_+JF$b@o5tZ+44K^VHN`V`R
zPRGj1d*bHY;7n!zCC4d{M7{vJ2CE)T%PCtbY!FV`_F9@t9BJ7Hrt!KP)Lf|nn9pGs
zK!m3jIQ<nC2yqIA>_e{F@13vfCkM6OzP00LRhp~d#JTD|`(jf2xWexq>vKBIWT~&?
zzPjBtE7v5NM7G<eXG+Z}<2H-BjXkj;`%R8)ziXX)c)NoCIPkN>r<~&b`Ro;PYG5kx
zuf``J=tB4(Z`CC$Ms=P`-*^9ZJz=lUI^#CrH%~V_t7)dOcJX*Q78(*CdVxB@b`vS>
z>1^<!)H!pk1G&%rqWq-Z>Lu#^g{3{*>#BbHI6H5zuO1EK3j*~Ei&jAZsV5(SrytRg
z_;@q(I4%^Pv&g!v;I|y=lZKwo<5n~X>cZu~()+=6)tcsWn>c&eGTNdJ?kCucugN9g
zem~EMTtO9K;<_SSw`mml>)7KwdWoIk4GwKyOZf_h4h^E{tZyxi?7&Nzp{obQ`@OU2
zd!^q-vDHUq8Yjd<{xMn4r|`CFK}f=L&~ie6v%~qQ#Rs?{#tEBqu;f^13vCu&#r0wp
zzBoDbF%PIS(MO6=56?8HGfsFfk$+3lXxxs3#4ebre|`{QAv2uT_;6o9#Ed=li_t{Q
z@wxA=L!^kxBsV=;KyS7w8<^xuG%@aD4beqg%^7|?G@u7iGLmp1tcd1j1e=D`U=MjM
zr3s+`sNtD=wP+Y2T~n=5ECg8S_Qjnt^>i<29k!|=bf2@qQs2IGs))q*D4$e`41vd4
za-YQ?ib4YWBYm?t0wz)CFCb>(ivCmlFfAy-c=ST(-4GpBHMkey(H$UK@zbExqos}H
zfmkI-J;I<PhBNWi_<*Fy;VIZ>K&kGWy9CHfS>vYgXAa=QTXsz}@GSt|^Sqj!ySil!
zmBf`>F8zH{p2MId8-5mx@IUEHkzazeK~%c>Q>UHZ<g?Qq2M3_zhA9qO04JM&@x%tV
zxjbE$B-6-~n05vZQmG`wG=#wM@|u+1N@;soOxT`joz&4qoaJ7f95_6C+#TeUbW*ZC
z0CM+~^u9rJy&&ONvYYV24k?)ksZ;?M=je=TXfO@)lkagg(*vbK36mS!8Ss(LMu-~%
zGfwSE*WTOPh?7E7A~b>NFB8Ur3duT0$>q7~KF{qiF!K`K2DRowN*Pz}0`O-$f+QY;
z!8s+;2_W*5=`JS~N3{_+%F$l@4v~%Jmp*LAgrb3cRaA+vz3D_DwLMPK_{QbH@*0ah
z^u%=~CCa6o3X>IC7`%;|6e~Qrl%izD;xkGS5xL!-jueg(T6}<p`eH@dD;B5J3ikpP
zkHnUaQRG8Lc&WiH4~qwxhyKbgrHz)qPA@$h`^ds!c+282tTsV`UK;m)Xuc8?hBi1_
zMPNrhQ&5tlYN31BAE=#sJL?_O9I`7(Qe)D_1<SSg(%(s?WaViNUP2-f)PJuzfI3e|
zAc!c(J+?|w5xrPUJ*u`O+~i`MD49Hg6jjf20$veVkWoHU9X^y{q9!yUaiXu$pKl?_
zIb=Tj0+Wp_?Y~;Bn*LKVWU-4mH8!Rv-9+*o1C%{Vkbuu}&}NVv3RF8T^MJW;b~Ki>
z&v&K7<6gMWtgo2YUOzmwlEPGlKaT>_wmYP7K1$(F2bDqm4td%`6{JCYWfL1#B1K1*
ziJB$Gg0XD4!rTD4mXswkiIN)6OnhW_F}vMcYW?h_0moOR;#<MkTtuMZWZsu}1;&EG
zKjHy@#3R4NUl=GM6xk44h>zT0Y4V9VSa;YZlZdfl$L2}pn9EtiPnpODi%;~G$&53(
ziHvc51>`xI566paI!~E-OXHa2M32T_$3qy*YB3UU?GC|jXASF#Mtz7Z-R99oPquU2
zV9a#(F>$aL>U4hI|4u1&Kh}t@Smd*^5>W*aH=F*cXWu3I3XTH%7<CL@+s^^T9i0QO
z3gly00M?t0g?r-G_Ep)^NSrYrFgkr9AW1C!ZDq!Ggu1zh;%Hw03r%2&sJ8NLxGS>P
zeLjUr%+LPtLwG0=)ds=VRsiI_*dz-Wq7J*I_{&H<$x{K3OPVf4ao0G7%<RUOL&cqv
z6v%^dq~Up!Fjz)xjH34`$OiL2ku!>!j01CEnG3waPs`#NMP;z^^fStt5eMnpd;%9l
zF&_VlF;HI4r}RvNrOqN^EGSVJhNrTMI+jlAIswC$q2{^$u}CY%LJ5<Q&bZGX4|*mj
z4i=jlC>Nc=1i^q~eaHUSat;h%HcTQsCO9bn_j1^IoE-My-Z+?fFaP>4j>9Q<D4Pe(
zcLd@KjVHD{I7Q)H&=g5C_;w~cTF3~7J2(k0pr&6&QF5@z*Tc!}L4>-x>3=e(Z5QHO
zQkWN<QfZ(kGt+<Z3;xm=95D`#(4<kpy+L`Db*TT1*M1jYzg7TbGTk|4c00WVFYkr2
z;0sZM#gi}^jhq<B|GtZb6sA17%esMoYcj6~Rsp)hME9+n(|j!BF9iN73=MuZ{<25j
zRirqI!D1PHs+<wvr(*mM^5O=Hi<OMg;1*Nv4U}~X;u*|lF#>itaEO7%t#p5F1}b<S
z_G7SUg`e(^AH)|D)C=&5-m6Gr<`qULWTd+?t2r+IhY08XI6TzVjP7CkLiSVt>jK)Q
z9OrAMSw5B2?-IbKTgl&<XC%zTzM>GIK4&AVofJKD>uy*Cv|vj?xQ8$D`^K=0_6$SR
zltuUkNE-3QbGL;j9=9yj_rp%BW+e0U#4DaUuwu2V6Gf{M6+QH7Sg4_+N!_{sDoS6Q
zD{{ZxsA!d1=awE6RyY?-wm+`V2{b*o8z=u%b$8`(myp%&i7xo+R5eVR4K}8`hji9%
zJy<R=nEr@fk$UvBM`m!%mHt3kPmV+~?ukg_w1r5Bl79oyi`Vx^`GtJ&9q?@wl!tVC
z@Sb}CesH{5oPL<1=ZI)R9tpft)hV38X3ir5ty%qUqFEfCN(DT|Cw-oQIH(4H=rNZ5
z4&d%)GFMojLIXRX*KQm%Crxk~xPC{Q{Q`MG8w+U=+c4T`u~E0!jEGyo!w55H=};Ru
zJUY$Z!A$aVx&*`2*b+LBbvhn|>^AFughwThQ@D}+#w5G}z?1C3#cs-<p|q~?Yu7YU
zV-+os4ee00j3))r4S$iG!!c~d&mUGQeDv`3capGh)3!VV$SV~czz~63YVnpq%kwBI
zYZNhJ0~00TWvSy9SHj4nh0bd=)BI@yEK|Y}DM|Gu>o|f@fVmY2CWwFo$3jgNd4rQm
zH-!-}QgRSHHP8n-3${yT6_Mv5osNT?XZDZ0>8@xvWBRoX@2K-T{m<~$4wrDwpDPg-
zc*%^kqAC%g45zUSrD05u84T?9TlO2t45v5$8ryQ*rYbOq%An@Ow~7QSI2?F(nJcwC
zMH)?i!qpycJeUUhHf?^2k1<kZ{x+wq4IX|+QI|S@p=iLuZ;?xOi?pz{*yr`Z22;k)
zRSb{1X{SS)co4MDDmScq2av(%B~8?eijsAf$@+M}_RRXH4ZpE1#kw(#WU%nfu&yg5
zj|L(n66#tAVG!4A_@c|B@~z;jIp6n*wyyb4Z~kUuwzJBWz*XZv&G}wYz8vq+c;KJC
zWNHRZWB)ZBzD|4{+I@FR(TkQUaijElYQk<cA)uwv?p$C0NIh5?S}Yrm@_Fn-D%M0V
z6EL6A1Ftj!sK?P2hpa4Qk?Fg@zfPY9g_blzaH&C$CxjZ1eaS&qGC*Slh9@QX{2(-{
z{ya*e4kExXgz=PkweOmL|9U4Nzc{&K*t-16CFN+B?ZH{*s(CVWK@{e(WejaB?O7A%
zN&SCMier$syH4@wDcV$OT%MwNDi_t*@w(Kwzp`R=6JU+=uMW6QWDI)G?cE@m3Ffqk
zwnn6^y7yShd$!vv{Es|T#s_Z!^V^z6R!b}Qw^Qjc@$q-4V=Mr%YNHWC&wL8aKZ&t^
z;9EP81W@J+YrcACfLJr!TMSGjnw`5dTOQ5?WDU*8Y-MzwU@GOd1|aeXZ44z;0e`S@
zFrV`_lBe>u#6Vm9Bv4+|bU`;UY}8Yy<$b|#wCfZ6(Kl=>eq~mGj9OoRxq>re{fP>;
z?kyKW3`^g7Jhnj)un*%Eqw8u=ak|hSSbaO9vI1Xav?0-ote-HQwQYN^y}M1xU2szx
zM$yleM(z3Zv(B(1_sAub-V^ZL*>5}>1<jqp<`kZ+fmdK&(h~#yk>^D*biQ23$tw4Q
zx|RW3>YV4(etYrgHopDW!sYyOf3v^s9}`K_lYUh~-kq1-!uBkpfihG?mp{fTOs9A?
z1GU;p#96^iHBk#>mzo^1!_J{qY0w+mRXHDy+(}A`xy?IrLcBY}c(aDGgX{UupoRo|
zP`u5#W3t2{dHP>@|5i%mv(27GOY;fFJ6j@h{2LR^$Hv+nD!h>xPb-(T>SP<kFlr?-
z#a-agXS}MMq}i@xK@>998COL5NxvRR8L`ch!c15voDhX_BaR7c;-D1$9=^Mq|3t4j
z@NEx!17s+IYJF+OpMd-xS4Bxj4phAtTZI7rOfHv0*D>DFj(?|wz(WXfA2(eUCm4f{
z%)^fjnHln_)W3viZhOfRa9tf3ul%D#q@zRka-l{A;gzZcyw4x&p3V}bFGYR|{;fZW
zY_^vi0(m-<-aJm<#q<=B$QD%j7*syHYr&*5y>k>e@Y5x>N)agATqYXLOGh&f&FH7o
zTdA};!T7tAEuYp?-u*6?H-LnG6U|BF&eL8S?E*r7B^9e$lM~@0^ezZvfK)%8l1Veb
zzOCGSZDj9Q!*dea4E87E==1G^GuvQ_eRKiJf+!%$bEY`M#K$D6b=$5nS1yE~cI~cA
z(AF3da;srBahIr%#LaT+p-d$MP|bO6f&{;i+X%JCKBHsZuZ?~Tx<~1Hb+1qLBQWcC
zKKHMa*reKEJxWA|1`9J2l}XLQOyEiREJ+%8#QbyTut&w{o1uNh+(@B#K`ERuq1Wu$
zIb}b1zb3v6rJ<X0FTn^e)W90QT=j{na!$0$?O5D{Zl(&s?|UrF$T{o@onC_TocVy(
zrJxrvzYFgxQNUN)X<sr$D}F|3O&I0J7VxQ<8|2sCJ0O*4QeJAQCsj6Rf&#5v7@wPD
zFL43GP*Ro1g?_8=7E-x(e(roSZShQ>@k@K=Ff_7yaI8@|r3ggQf?Lf8zRnZ@2MQ^T
z;}7#dKq<b6!?`0D!5kCk6fsUZ%$A`<xl&gq*9Pfu>4l@5Cn?-ZOPH*il*(6Jz<9=r
zy{^QmmjEV8DZUn3r*~qg3wD<OhN2HXvAJ%^HZXLsVIgwF`uByJ_<rp+Kdg<20dZ9o
zHUkFX<2gipHzROE@D8FheQ=c`1WqS;9VtJH*kVdJtO)_3rN<Qkj{bqh5x7v`^>``2
zC6&+3V*zL*Bfltn=^z#~mx_u^%n5-x^2^c_Mu~o6#1U|I@CF>^B;N`g<xJjhJm9Fe
zdYr=O=h=U*`n=UIO+#Zn)d!uYi9<#o=p#;Ho#el+=~h41k>LI3w;3E}VJ_)$f}gR$
z5`>ciB_YcS942wG5F06(MBowaC`Tj90w+HilGKw2cqRqX;)Sy{vib!2K*&m$1yTkK
z(Nr6EsGa5C#PG0>DVv#1L-$dpX8g3IpL6##DtanCMahD{aUh<$65lE&U<V^{L~U!C
z?{tISMYht<;0;=r`G##2A=1E|Zzyx)Zc_H!?=nZf8}y(T5RlsMg&xWmEEFPcxHC+n
zdtcZ{U!vs70heZ4+4X3*UZsXCOWpfOS8J;#4OVW996M4D>ZqK5Gkp6wX=way>%R9_
zq2gFyE`>j*#U)N&Y%08swB_I|2-co_)SZUGx0M8mApJgUrEQGDtav4wT>4NCj<y9|
zZnnZ?QB@l&X#sE_M9*|DkQl*qzd#Th&KjTi3Y|fus^FjC+`-36JM0P_w=85_Kiu|a
zH7oAWm_t;z8>SAk3*ML@2&c+eLJhuLYzSV#WfJECs{LvKr@;ybsW|m0yl4#c{XZNK
zqUw8e;$v|8oRg*dTRA~g?>{GrAVsGhV6uSInC~6?e;d^XZ1<~FZDJmjuZ9pCx7?-0
zR#vj2!TO=rLt}ze1u{jXjmTxPV(2%?nX?B*X!>Ub-hR1Pqdmm_Y>8T+D$mAm_3`Dx
z7<9C2dNqEvUsoenaZGtKE%xu#Bd6)*+YRC9fq!A<0?zCwOQPR*5th8e>;pI~F$kBj
znX}S4qBvn`BnAm(q7XqEkXpFwF^TU}LP!a4sm(cf_BbfJNSAlR5&N9}CCn*ph4av5
zi~&26JbdE%Nut66oG8uFq{BB6A*QbW8%-NGw3^aFRYOGQhlKmU#rigQ0SN=}+`X57
z-x-7Z&Zwvqr8r^&y$2}e6ib8qm;sezEeyHmN9>3c!tIsHDXTG7c=8PbYTWK;`QBL5
zo*zCtTDV;-ukgUA%ee}+;Z9v%P|)#hD2+knn|v7deliiv%j|>KVlok1hyqSRMmaZ(
zO}Jqnb=^TP=5)BMM(5Q)$jj^Qj<vHhlRPXS7>O&hEM^)jXBQ`d(RlB>3BqiY+<=8o
z5wnhvdlYB&3fCd|JnBy#ZFp#d*Z1Gcd3M;%L{k*jLIr0JGA{Y4C=fZ;yZbGoX*4SO
z*E<6>Dy~-paoi)N;28g^Fn(m|)Ov6<iP}g+?tT>*(J79xPEqfIz7Af5{}+4j6lCkt
zW(${X+qP}ncCE7QT4mcdSJ}3$Rkm$ct<tHzJ5K-o_1O{K|81XhF=s|(%$xZ{zT+Lw
z7?~qkrEh5ksADe0-=XJ&Z6Z;Olyoz)c{_f+Cij>|Zksk?ru>AiZBG{_^i`a=v7JS7
zp1kV}g6!`QjC53Vbtt#eS(y6j`Fias-QpJ>kfG~K78I7oF}uv@njm7^uhg(H9Rj&?
zbs7MMPY>4OH|}pJ_dJQZp$RW6q8wM*O-Cn85PVH$YD9;dVOxg<H-pEoA6dkTjCD0`
z7!ppoPEEsi@AZ9mvjJukTgJ}R%V94Y47DWqpd?6Oko5kAG7I55Ed?l8{u>iHz1Psd
zMuXorp<NG%Poy>F+8+n8t(5h2$t|9<fId_La1-LVO8RL~3J|*{zk$37>4P#)O#>sU
z?2J0ZNam2OvseC$-)p&fy)|CMH+Z(gi0vAu7kDK{2EibZhApwvi9}OPC>%u6%oOpW
z=en$0nTRyLp1isLKy*n#*0rHSF1vhwQN~qe6~l|5yDmDNB2T$G^EF(}w7`$xW9n@k
z*T@SCy4#-Lj@MFwE&>#`SO9(T^`3HP-BHB(>~yuN4;weM8PE@!sWVILmx=K(v_<6T
z^o`Ju3pJssC|)xu4?^Vmw)9Q4bDCUElV(~=k^$YBE+=9Ve4Y<}jZwr<Ksf7<$w8b%
zN&1%XQw`OsWM6riW4~6C-)Hm<)obE;(faBV%A$aC81^bhuaL-@&$1;G@VYo3)Wi@p
zFoR1XvWi>g0iocsB1Pl9FZJc_{e@dB&#z4DNQ%CNOYf_qWaunxi7O(-d*Wg&#L8?F
zXqmk-tCD7)uW2iTWl)0y8bK<Y#X1&El9%Df!G9XrvnDV=h2+g(5>*2i?iH>4#7W^+
zipwBQRe~B(Wl@3rcWrGOYaRz9YD+)eIi7e=s%p>GTidF!$+<lvA!D`=?a17Uy0)gq
z=1?aXIZ+%kc3yf<I%#vK4m^kZ9a^4{O~;!hfp<7yo@Q4@eCZMcT2VaU2_GA0(?m%W
zk3EJNVuYsO$XRld@{$t|sTImRl;^w$qdXu<DGchf!DOO<?mAiWa)XjOKzZTjS^AEk
zSD`8QV0cgf^SGv1?g_DF`Xy3|=am#{&s>qY6kk5##UB1B`Ue<=_X4zW76a4gG)Erx
zGhz<|vGo(J;I*BK2mJzM$^Z``lH33>W8IHFm%9jsIzc>rZQaiOc&%yAU#~)|hhE)#
zPQR}mYskcmX>h1zPJ-vdN?P7i)?A!|pZFon-}q#HnGIBVb~W#!smEsu$4@BEo6pCb
z#&1`V?|tswmcH96L%C1)pmx@kso?RmUGN+vJ}wM4_w^-Xq3?azT+Tb1?_mq)i!GU$
zX@FRa&Ms;2rI8tit^x;<&94l#I>5j2gtL7jVsU>JTsmPek-QSxswMF(R++zmf<M$_
zTXvAKDXl~wpQ;;V(b4&2V5-uR*47YafpkRe;yBrJX2~&#uyUeD8Wk>DJQpjY=KAQv
zl(#0Lf<I-^HT}Bvows%**y%Flkf9GLofyi!=as#p{2esw(V#c&6kw|6a4~dmVyUgF
zd@pi;Ah5v<>dTu(MEF{A*qGuya~KJh(mJY8%40?qDdg3*k_}{c2ZR6cN7m=-t;+BD
z+tBk70Ow&p0RUkd%DQM|gUKT-rmFE;H{zl4apNKOUfC-9QrGbUHNow6t)EMPiP~R*
z=$luoPb_Tg3Cm9J8jCFE)T?^qvb?!hmx-rSFm_*luK0Pt;=cY4i}3oUQ^ZL8Q^Y(~
zF03=D@5ha>r^<o|=L%`EjpqtU+TKaZu>8m~Hn5R?T`VHO#GeG>2J(%@H4Bg(4B#w{
zu>;lRW7p7GjgTFQHpj7`TUk$q&(96L3_j3i+y>m!Y=%heqyTyNqdXrSd%kNaqCPdO
zKaB@qtts9ThVlBKIoBtpjdlmI6ENK&bu1*AHODd73?YGwvTc^;gXT6KSqrKbPN$pI
zo<2(lR&A}kpTUB}B-3sSg?9~8NmT~T3+yQNP_!G8oHjy-5?iLCv-A@7=#7>c0Lkp$
zbRu0WwwhDWvF+9&qAXxzpPP9_{umKm{5*maWMdz$&)Kj665ytuku|*cc|IYIp@e8}
z`OPytz(#14%dNr7jcu~;(y-r$Z?Sm-!IiTx7Fpn2A7Nd4FQoxgoF3%7RRU39kW)>?
zg3^KjEzLve2_CYaMU45*fI=9Swib^BF5!+?YjMMhh$~5m{stnbV|PEQ{QlZ}z3Gjr
znsuoF3-1M;)+Ef@7%awXjLoxV+aZrW&DH73{S;&)AAJK_S1iB+P0wiMLxr|yH8>~V
zq029g@VtJVYnv|q%b7yT4V&Ju$M;1kF$%}4x&9JrD}54y<B?|~71~i@NV-mcunk5?
zCKz7xgF>w2(@xfRlv`H8MsG<p9=a=8)cOQ6ZFSTUudO#8rA$D3A^Zlh96?$av!qwN
z%VzCHxQ|vRzTLD3t_oGfT7b22nvBMAZa%Ept~3#vw&aos-A40Lf!O95xIjzeR9G0%
zodYnf0dq5wmwL+I2I?@Alruh3Y`JH(vPE<eH0tX{flmkj)DP3~*1BDf<n;6a$Z+DR
z3RkEtURt%VM8Su><tD`AYoK+=m9g@XeT0?*lAv{H6A7+HhCj@?kHMA@rC@E>dvy8G
zPT+^Fobk#mw4#R$o=KGQZl{DOC~6~fD(L9u#2DNXu0!v6Xv2*A385xrm?~<>lPM|B
z!N8%v#uKYX0GZS7ki{~qyQ7nA*w(Mfm6r>#OjyWagd4#o%sGnTn>8l<u-uBtP{{xZ
zI8GBj1SKDLfdGlZM?om+5ZSMQ(dx;5?ip?Tr(!1jteniWnjv@4a3aXi6JzdFliw{H
z4d;rmH7btHr9W`T%P2c2R-++&?GxScIvkL}AbEDXz(a8rgK$&%Abz4?BaktK#f`Xm
zh*x=nR+d6WZa%OSz@Cm>4A0k<!nqP#j=}~)de%@y8b1O<o@+o5RxL<Ypc_sqoC<c|
zQ@RASZO6dL3Xyj2^!Sf?6_+6+lcuJH->x8+;3O3G;!qZrdcZC5Yt2i)SCUBW3}m0P
zdP@`-fD=z%i8!~1$M8o*zz}mS4k6=K1;0F=%_|ots|k@;l5^x<G~*RefA|6|8pZ+T
zRmP=V6%9GP3Nh_~Ng@dks!TqSgaIE;KM1TV2ZrK_xzw6yyr8i~TR_&<?x|P~beG7w
zwAWZDX8VzTGwc31Y_0_*CSG*>vvIiWv0$Vv{VSj5wq<rze_5TdiW*fhIf4wDCE07S
zfzCW;WOO9r2PPc|t4+F1F`ocOsOdeSl{mq8Sp~x3P6D+fDbc{2Si?MH)cEt@@CbAZ
zR84tkG~_c2BM{`@0KbQVqH1FR_DN9c(&83jQEUgoXZmg3;6U<7T*+ZEVT_IA#!>)f
zL{9MKf`TY7!rvcK?I7*P|JY`T<6u{G4ctDE_xdoGU$2>+gS^E{+Sskt8A5gJ*eEgf
z?vSp^-*Z$xE~g)9;=)>5#uq_ZVuc@Q3S5aY2GdwL?$;h>cag=#4>1-igs+Ui{DyT=
z$_}}m-qt4An(v1|ST0DWLtqC=VI;71>6hop4`KAK;f1mt6o7&}bipSp8*V(mGhHI%
z$vBLI^-h1EqJ}k%8Mdz)W+I$y2D9P`X<(4CAJ)&!&LnsrEt#;#7=?p2&;tr0*3nPR
zEk8^kx+3uP?jim<;88u#T-A4TvpCo!cT;zxd2vVeme{XYk5zPkM09};c<VTL+WPwR
zMF3Btac8ed`PHLr?W@C~&;Hu%6H0Gq8Y`>Z-O+t?w_EucL%ngnm;Y+3XI&kisy&ms
zY;tR&nG6atyNqWUu47mK=t@>!O)@&Tq)2Azz3XwC#50wbgQh>_JBH3<5~t2@aya9w
zX57i9xk*EPF+Zq&=-E;57%e?Ty5RPH_4SWoHP3ED)+pbBNrK;rT>lhS^LG+kk&3qb
zIs=MNZsV81E3d4OuzEqsFIeY{Mp%{|aEuWf8t=@@t)QoEw+j{%0sOcpr1g>Gx9LkK
zvQ%!$*$~Z<rgD3}QHCE8XCW1;!T49`V;IE8;g(^u5vuMpINe!3pDc{U$gn_-j-+Mw
z=UtTZ3v(&oX7m~ZsGGD6>8*WZMPr>Ybfr>}nI@YQ=`Ixh%@eVEmyY9jHH)rviMRNV
zbOJmkH(K^;GiH24<`mAn@RMcAHZ?oytX;|^iE9yLHM1*PBg0ZQ{i-(R7B;=FjSa0q
zY;zp32#FJTIe2vN7I&2$u*?To??vW#$CgG%E<#e2a92XEwNQFLi7{8axoHXg%Em6v
zUFr36<Sh$)045Hw;16j{g#C_t!bb!f8;{`J<NL@tP2L>x`%Vp#p~az8<7SL(^2mEb
z#fAIZN<6(UNSpD<1T8e}*Q!(yjO}PaN6V&pY5qK|l1(1v!j?&_seOt2i@Zo!L{z%r
z!phN+0iFlI-@yxiSjnJEd<0Zh@CP@n4QPr$06k65&6x<=;67p_iw<S{B8hSqwLB2|
z1;Z1%C;kK_k=DsuB?_Q|mHQk7y!}KuPz?A|{cr|?T&e<VE7ZSu&InYahQW=_cu1ib
z2|SBuCg3H1Kr3VjGKGbQ#8Y38^*ji3h9(*$&X?EC&YNdAPgKd*_PYe!JhkcA!`T66
z_<;VW+^xmAgxl`#1@>30AI*QvJd!c=uyb+#mvFy-n_~O-l(+AQKdAn{BK`ut5T0y*
zdm_Y_=!x9yh&}+P7_Wn^lNcACY-bvQ_U&IV*tuqWSYZt%B6aMrF4O}FZh@S8*X+9T
zRyPxH6t-}H$zyE|*4m{;9tja2A?@{vq6P114#YWaP!5^qL28CbbaUuRARiGa*AQiJ
zKNT&hHYr+6`#nWc%Bcx+tzr#6;!5B9x}I>Eoghpa5#{=-P~c@3Gdq^3)TVikV-;m>
zd(c^GZh<edomttwegD({Gk@&=e;wHWUmgBlA{_tY>p=V8dM+3l{>K;MzXuSU|KscM
z-&_#?k_P&JF;f1&I{e$4^IyVo|1Vw-%>Uy7;=c!!EdL|xU{Rn?i1*F0M!^3Qqm(st
zayD`N7e@JSQ~w`63@&b34iKP#-@Ns02~L0fNN7F<XYIla?5`Gk20)VM(<mr#rLp6u
zgh8T+5$etKi#UFNp+0i|gXwt}J?tvQG@5_{F)kG@#|FbkZ%5Bfu;f7LUPHo!G1p(_
z?Zf3`Fq6G)ye_A`S0OQx=OTws(WDv)+!Fqq;zO!xvNsz)Bg9hg=Pj5;P6;UMJeI<=
zWkQ32!F2y~Pt=;*XHRqad3C?KsxElal^nF?o7@gJ{^H_m{RYbk2FaN7mK=D;;JyD=
zYSoW%HNsW45^L-Plh*{6;~LCqxH~psyKR~SjilR?s&V0Z>oGsPWKd@6AL9`QrVhf;
zCC%aKYh-B|zUTYARhn4!RYt#;2v_+^tiQ+$8@BCU|B<<u8Mcw)eK&vA--)?I|HM%K
z&Mf^m<YD}~#Et6fG3#syKj~+E2^P$%uIfoDaYxZnLD@PYSH`%B=d6g5eknB?l|saA
zTk@0Z|0X2$XlRaH(|s0gK26`9<aC<9%(NMs4G9amvoz;mZ}yi+z7J577vBFMVkQP=
zMN$Vf?IaR5635z~Iyug@JufmG!ri)U5-}*WMB-slQ(6i)VSSBAYHizPE!*iKMJG+U
zQRdMM?#=CU9iv%p7Oog<Wa9ei5W}ZWXCYU@qn}!oP=zRnk+S34P~bAJT`^Oq2L0O)
zv+CnFX<G%M7JwMtFJqGLsDmW|Ej{ZU{1FU5kk>5KU;U&w-T;ajMj?$nK!dXZ*2QR6
zXTby_lq#9}qP#Wu!AsrSJWwLgCRGhTG&b?G(>^u*F}&;G7IHnMIacXJIopRI^8%ZN
zfh9J|$SHO~vIIQ5()e?aDngO#UoBmQ*3Rew;Ga@~zFe^995`(8rvWLZ`8w6W1EaZf
zd@U7*2n~XTvCVvXPOHjy7I`0`nzAH@Lj7CwdUd1~OBW+7(>yQ5gg4)Mz(+A#KYO8B
zWpXaPTpy4sdZBgb8RCFw9*n#YEEs%23Yv7t9l4K&<N!Yq`TXURnS}!lP`#{@Ssm%z
zqRhDfSh55}ZU78JRI}s9z4=4As1Jh>VO&-e$;FYy?&T#MMWCqWv^OM(;=};Tkz}?4
zq>+>gnI+5j5TW0%dh3Bu$?cg9=HH<8wSm6U#jexDl4W9Hj*W*j0jpc7gnk9))O-#8
z0=n{3_yr}g%7;rwuoP|sUk>TcJye0*8t4vic<YM@eHyY7a*63pItOJt<#Zo#@_oGu
z1<xO9RxR{{rK|&-b`)al{%MAqvhLpZAEHCAOI^(>)5FyJR)p$i+}HY+Idt+`p{|~b
zNcV3@nLi};R8qhWnvzu|PN?(^kbX3?v}agyr)Q9s*WtNH>(2a4Jju+s(iyR_hMbp=
zuHh%MK=ayVVV*bNOZ)82PB$W#k=+iTH^3>4XO8T_(gmg`P?3~Qm@91}fVW~Lo;eSx
zJA7fd%YNb1?E@|TsjHTE-y+Prufw{5&z&<}w0u{eR6u(4%!+0&JTuk9(kb2AMn}Xz
zr7dz4g$A-&GZ72k3bLFAQmYhjnmBv%RM_z7X~8Yq(2H4A#~Dqu{!r*_I=tm>8N%?f
z>5neE=iV;AE+(qy?{;i@{%q&_dH1oduRf~ETk(d9WZJI_%-pk-xv8b_tLK&jZc~ya
zr2L+FK6e<Hd8)s65@_a`e4irQy`g5u{qjYUE^N!qm%6<R8!p!MFySbG-J8<RQBUjg
zx6R@^Ga_rL=e6sFt;e<jhMOEpyP-jg7fLI%+}YM|ZfJ2!^`#CVWIKnCTK}U!WOr%5
zhuo7wtSxAQf#j2hFsLry8H^9uf9i+r2jwtkfdK$!zp>}PM3=uipGviVK^Ge7C$0W9
ze{V%agNBY@+1<=2O+-TBRFd4TkpQn%PfmBFxJr8^TO5Au=jf{iA)(A>rf|ncQ{6}A
z;>AA-(mi(<dxaYb3k;_A7ra`=KEgwjhGqTR%ug1$6%M=7Ki=nMGpY18_5xQdK`^4F
zZWk&VZRZITiV9fO)C-DXy{UsM>q69DF;I|vK`t`D2l@<f7SdStdh~M|^pi-5ohe6~
zOQNp?NDeguzsi49IhtQ`npYJ;8C+Z$r9R{boBLyQt$K~Fjp0@XZNUhUTj}M2E!Wl>
zW+;Pz<n#BYAx_SPTs5&OUy3PNvaxjNn{?RX7j6D*@rnyW1*;Oj4fqtcGu6jt2;}c;
zxQvB^EPYd2YQW9{qBiG$B=7*g50d2oB}X_#8(UUY+RUvbT|5hN*?|Lq&w&f>Y|$0j
z4q<lQbuJ2$S9B@CXx@Ul6U-MA$Rc~E<8f?pG9WVr-d6i1?a%i9<hm(75!#^MTSh8<
zeO$ni*1J7WqdQ2c?pXz&r6tsBlLf@V<L4728$3Vk+Y$m*^BABEgFBm8Y6d+$3SI<S
z06Kp{tCk^UobB%eFp6y;F6`(jv4#S#vA3N|nE7H4MIJamE&Q^s`0|s8So}ayHXl3&
zvI>O0H+h$IlvTw;LIO;7R<#4%1yXdM=^kk~Itl=4XxL(Zlk$^LH|+Wad9G$Ml+rO1
z&Lt@}#z~roXCt2N7kW}}1`^=UsneJ{<{cX`J%FCFG#-53GYsC$8XfE6>S<R#A4(o-
z`GUYJokrvz%aGEQr;5NDLIM%luJ3W}C|DU^^sQ73^tz|okhMfcMJKA3%KfZ{&Y#z!
zDdcnR>2IsXh<|P$?GI-L9F9f|6QYkK*&x-&$ZJwNfrJ(-OE|qYq&$0AQ^!0Cl0gh%
z+<v5a)2iF|U36QkiP2HFA&8-W(aSU^=w+rVP0%s=+SUaj>)85;q;?mIq?#CCU}!q2
zDLA)OZS(J&I79Y@q(kS)D;XUg_nUM~-YL3glYTlfVZPjwXunFb=>OTDW5CiAPZUj{
z$sJcqAH}}8+OYh%d(GAF_WpbZYO<_RVYlkbwy)T+w7rCAOosVm$0t8LFk}p;i+=3W
zDO3GHgCG0ujWTrmgKrFN?hKh@E5t#HqzQP}PLw~be~*2@cq$A<D(O!|7}rV3?>toe
zGHcL=KnsqO(skUBb2_O62CS(5_+;7}^v=e;z0P1#W>u^f$W%KAuO8sXKFY~Cg)iui
zh5iF;vIt^i<c`civ#)>D$F5DVsUE)BhBo3~7is;6>p%XtPWT_##^3yYQe)kA|KG`m
zsGBCz9*uYSI%vIQ?X(X7@myW)LNUr^lQvCl&!W$?&)9Ta!KJbC!c>W%o_Bu80JD?n
zsBm`os!CR6`pI|+p4L{l6{6a)tm2ZJ@cF>hq#Vs}NiFmwc2;aH?IlgSX`j#2#wjj6
zomrcP1Q729U9r$5rExZ=EIR%A2v>612}}28a}iO}cT_hy!V~fvU5633&6P59P3;eF
zhsk%-ItnK?n>*DZ%d%{U;WCi-&3bF@{VLvJZE6j0+l7m_xP#_kbMBO{VRJ)76N)8<
z15GmoFTP{YG7C^(r<DCY5ZH7D`Hm+sh7i}sKk9SS>sI+>y45wHl+ez<<(soU1?@^c
zmkhx8v%u%CqsTLC&<eP?i!qG@twYe_nup}mM3|f7NtOd&eOUDMcz<Nl&*0a&P6GKE
zv{2k^o`rS8kFf7<BqcEKT^9T<K`GH=c^!#afE^6qJ-@k|XmSW)EPQj7bUq_(Kf8Po
zPKGzyF3ZBo>yQC)Kmy~!HN^3Z=WoqTB8V1tH$aq$Mh7!Y0<%QiheVsJUYD^QmaQ8~
z+UjR#5I^@`6-bu4OmuUkYr%0Z!PIwxM?f6E&Wq+y0i1$rc83?<#22P^)&+2>2%Dia
z_H*2#U!MVK@0;)!mtdPBAiV*U;~4TWZu&5U`_LGJG($l<qf9kJ);q!vw}zmWlH8h;
zV-i$>%J%{!_Dr>eJxo{}Vns#S#q|_ZGY6v<Mw5A<A{?FMpU1yA8CmN%VlkTt5oSif
z4-B$p<l!$vVt7Mqgk5N)%pyk>*%{WuD9zxRCw+z{lG0(9@-%f$#8pOUSvw~DSwcWX
zMW+&&;sVGG`za!4r=D7yUGBq6fxLqBxL>_%!br_2uHBSb4~cFp%c-&r)<TfiP+5$J
zDZ(M2*ut!)g7*UW<5B9Yulh{o^p8e6mIVeZ0BF%0L@S=Lv>>O(k#Arm2BWnA-o<W-
z9z>T5$;J!lF3~WaZENT`C@{PtPO4s%3~Dd53UI*1tr)mFGQVJgKE58v>VEXI`%ixV
zhi<V>i24z3j$Uqv!2R`0(sMw;C-+2NJrFPNo-UHVekconGd#EB@`N3*z<oQwj6XL(
z*22A^sv*A&;66(UIlDOzAkTR$$YwFdU7WDVeViQfXU?fRj=)BMhL%#lH>Pmf>t|&?
zg4$_6zEd(M?R-)@gVfLoi%WS%gT{xVmL*^WZqsva;v5a^-svPBDe?>)_-7mM(_TET
z=P3nNY2hHZJim4q3@w3*rkb_dRuxExmLzn=1qPSW2=uPcUh!KeFi6%%=NUCq<zxn!
z>URDolp^Xo1wAw+3u8~sFF#Zm_AjF0PiXd;%=i^!V>qga_z6{)Hv;-OrvuQdo}E8)
z3ly8s<F}j>u3Q<STLwvM${Y!o44B#L77RIS4hP_$1QKKxT${-@q4CRpS1Xnue-3N(
zyM12YFfCOfO`X1iH;)SfBgHGKi>VcbzM@AgF60v!opm`H1}wO0#Tz*4%@h$r5*?T_
zK#p?(-OqzH>C)P>dT9Tof-t;n4s}yG%{fmA<s8T!p0I<+t$Y6{un{0JLT>-a1S`(_
zxedFq-~=lTprq>oqY(yh{ao(U>Mp)s(X<w{n+%r358;4exl0PB!jW+$%iw&kbZ~+F
z4T%4zrNChp>eTj4q0+x8)PDt;e@jD^YS%HD90*@NO51&Vd<{W2<bG^r5Rj#b%^ek{
zZvc7>&{cg|m$OBx#ocPx4u444*=jFsl_|Z((2*u5XPA31zj#t>+*n0%cME<@ZPe0}
zff30L-<5l$VP9V<F*C%)wT>~Gz@}q+5k)OWs{86R$LGDGZZ%Yi_|1hu%tU|VWj#%N
za?$W!KvR*w9q2x5%FQ@_W!8SIpRTeMSNe-ubj2{n`zzyTB}I8urABj=B(WEQ97PI3
z4@HGql=&s2IW0MmLF4m$+(CD+_3FoWPzLc<Uxf_~!L_xp)4~oG+7*=~B#^5@KNh6K
znn0sl7J|u|v+u3z{FdXT+vcXD$|0254SVJ8fI-RM+-ZsVITR66ydz*aQW^&k)0~|>
z&(!M?gd8V6TST)b?L%W~JQKKCvyIQdCK_7+BsNp1ZJn=Be?C3t^uc*0R`an0L#IaL
zNiZ)&Fo)`cuj`^6yH8~rs+(4Erc-4S&h<llYQ&u0Pgyo)gI2R!J^Y(I=?x;HIubu8
znF|qE;{g6eo+4C@5{JE8{D-SegdjtP@kbLQgVa{RNic~#NawI=NOmQ_fkyxc=>2ga
zhcF2w6SsfFZ1V}Lz3xJ|hUASv?ApoiUWYr25rx_cwV;?48MA~El$6eC9nPW<-<uhr
zK>3-A<Ck&A8!k3y7~BT!=7`FY@(k0uV9`^Yg&4z3Q;_23i}zZU=mYqwtG&BBi!a`S
zo=~nkYU1H`zA3$wbA-W3IWBFvj%?`}O6IUMlYtn!s04l$qqha^zeBR%&-Vkwt4>wJ
zH>JRm(vhEv118iYy*VM3=I_@iF~cFPJDbfnwKaaMIA1b6r5NI@oZmz;hH1c+jhB;I
zJk}l_m{wGS|9WSx;~){vH|el%TiuOy4Ms?M_Nds4R+Yq5mU?d3Ig#UPq6<FwY52MN
z1CBeYiL($XI~CE_*jTtCEAQOq=T{W^H4EX@H9JNQz#jruEuZO)<_eC-3r(Di3WBsW
zTZEdaS<Pyh?M$xubYv34R!?i~d?q`BcB78Rw`DtDpRYgHn4YTo3HuKqZvDxA6U~dl
zQz`29pD7cwxnzAva@pT~Ayrno!<m=q8E17SO(2KEl0L!Bl51fUGVkg~Has#J%_kG;
zlIn1}L2&1&hRx%C@nU^!P)p@+<Pu)Y;RUz>mt&dEbpCr@dKDXhTcM==ni#Z{g`i*k
z(-gje>zQy1;NlDj!e8W`Wx<H8&!1x=hkP*$k^k(>Gu#jvWqng6@c#x?`dgQo)VQ==
zXG8hmH@Frs?@`r8B8?PE=*PchAdn5<{0SgdV3fZwLb2AwjdEgT6FX17N=Q1Smys`K
z2=i$qjpE^Q#rvFae71Ml9A`|dlBj-Iyx)b%C6Z*3e0VrteE(^9H4JCLLeETBp`^pb
zJ;z~J^G%it<gn5=Z*-fMK-k8~c2lRN&bcsQGMq`FwmQo!757%_l|YwURpkR^Zewe8
zNB#M6!~z43i|4D<M=EPlCU{UAsC3Z_7OD_QNtS!JEAQG*s}@HzDw53&hO0l>NxCX_
ztEQV!;OgcVz<_g{KxgKICnd(9VR+IojS25%R>WmL@`4x=$ERw_^%kDTt6MiWs1wYF
zg<5`NI>WbHJT(up=)maF_@2?;8h5A+ErQA;K-70R_byMr|3H)<R7-5cm3b~aJ-bS7
zJ3af8yc+lkDsc0zq@de^_wzhvyJH@#-|MM7d%W(UM2erKsB)*&p~r>3Yx7|^YG&BG
zi^yg{xvH|s*79v5q)7kwO6`+IZXlup8ay#J;&!PZ&RWamK2mMi?=Y@Gkr8N%3uf##
z;Tr9Emx7i=s`d0=e0$sV{m82)!CR7sh0$p@AfvaywxsN+#8S^6K&Z+mWhgkPi;RUe
zB~0@C2)c;zhZ&vw(P$ConY_`A9)Ji~lWt_9>6=bNzP^U5AhG1`%>8r3gC-L*z6~f3
zG_*^xm)sx92#b~Y+68f|5*<0~`;d_skvu9OCwanX?!uM^xClq$EO7o?NNDqM9{gfy
zj8yEug}34YDzLRVJks-DE=g16382G6iNN;HwXI#zB*{{CdSTd5BfN8@E$k-Y$V-&s
zVN+Mgn@x=X{5{In;}tn@o)j)!&B?Uy;4b{exA^F|y@%oGK6q8IVrn(H4&HCD;&8VN
zVpBbma>(_7>j$Kt-Mb79M2{g?@-6^E!TpT*V`h+R1y=!Mp8S6Di}}qk1;dWR-JAg2
zgn^l9If0pgb8G>#na?zyz>LE&a0B237c$dw1^%{O{BW?F&E07nad`E`eVOH`;>&iR
z9ql?O(@<p!Jb;jMxG}@RIz~oeOO?)&e!8)f=O)&%d6*pP2m3vWT#@^faL8R?3F%?e
zc$YhWn*};&eHF7`cKkcu_foFY{6Z-<dH@o8WQ+Cf!nvL?u!4(zkjGbx+M#7BX7>`q
zW*7`llzO+W3j!FV;ZGwI&4p&VdUCvPXs5<u;8MZz5PwltDgr}F8vl~cE-WwmwxX#5
zm$;MDT9`m(i-#pQg<}UDp8g;yk7##+YVfJLuu*WW4V_$SbHgpqgP76Lw7pCR3!Z)l
zP1@uJ#RIS}%AoSF>5RpC+EyJ%yHUgAd1{Z(>z~7G<b(1{oIgIOQ~R=ka7E2F2U>@}
zdd7l8KUs3}&Q3Opnb{mi?kSv)4Fyh}kzz~x9?bx1cr%KmR`~w3RGuF*zR1Gs9h8-4
zSe9)UsWO^pz6$F8L@BS%%a%oi>xczX(|9usLj&vrWVQh46j=SzZp2{PZ1vBUdW>0i
z!@k($={p8L;AdeEvGle#SI3uO?0&{)sQ&!NF0euqN|^HdF6Co}|3qp2x3>Rx7x*7k
z{R?ef+kG)4ANX1~g4S5*=t2GX{!xDt?J<Dq!LV%L#<7$kR#B#@6eDs72DeezC$x^Y
zl)_2LJ5{&D1ZxX|0XBnJSYV}w4vmZInor7}uP4uVZFQXmO-&e^W8Il%_xL0WH657C
z+?kT>(PqoBh@?Z4zR)MwnoB$Q&)catONHZ8_{tO=jUB}WV7A6%CAC1?7P%wuzB4zI
zA?V5}sX7Cs)V+dCs3&T7`n&4&^WaU;38?iA`lq&@tf~4S{FE`rKIpP^i0`<;kLCNy
za%|g0z748At6D`jJL^+`V`;hjr0yB|hhuqfQ<GEHqJ?wLr#>$>h2w)n9SwCF{Iz5B
zm`z^y?P6^KYcXdh-E7Ed3l*K8<mK~cBD11*w`U@L<Qi*geQ0s?j0(^pGP-C4^HJ=!
zJ?0#BeUCCU=Hdvk!MH#}*nb9Pt$S;VlTXHsSu-bnH-s`LW1AJ+GQoGuFNIH4{)o86
z?z4?>P8d&DC1#O&>m5-#i-t>ND{O;Ns*RX9YHEvyGbd6!C=zCcFlOiCsZP;)C<{s@
ztYC3<g=M4C{;WD!w{B04IQOufQ7>>PT!FP48*$t7!4TS(l!{ruFX&aD(O@hWq3zi}
z>@Vn>MdOK=LR?vNptV@Ho{gh;coZx5@5D<zh4pucu`_^+9Qe3bdN<X64K2W1tJ{C`
zlP!nlKj~;iIHx|wre?}bQs6|WBLd9L&z8u4QtDk3lh&MOY9RiQ@O0OkQ1(c4Dn<i{
zg(Xn)HW20{4m=h;K32wf7m{<F-a&{8JLn^sNsyLKL6c{;!0b?G=;iez#nq((q)6$M
zkWVw<-`uknDKefP1sSyvU8|DQtB@<2IA1$J5`bSMLyEJJc_WlM3{u<~BCONU+ze5B
zRj6Hw&h1f~gS;x1sO6yUQNk*V`4Si+%1YK-#EuYs64jo^pCV2kX$_^zy(^+oqCQsV
z*$(XbUFTKHpK12P_bNv%wd;_THdSc)d8_lz;CbbpCDY~*F4#vUF$8-F@H%d63Lqck
zP9af_v~$Jorj{lbb6$<1oI$YfAm<jy6ng#AE~9xql8C)ulET0}9zUq>zU3m?Jzu6N
z9LiXO!7x-N{Sf`;vrX2CGB+Nd+<}PdX+cN#NRw;oE(^l8Sny8EOq_|jNG|G+ApG;L
zVeNK`hJMwHfh@lyZCdn69i(>E;mn<iiENL&*!&7Kd}+2KivyR>Ht$(fw*7+tJTB<I
zI|`@xEj~m!m4!sU1(WuTg=-zk`$#d|<{L;)>-mcWYqt9#Z7t~9^gfp8EXGhH4P}j9
ztn+vrz9ky-0R+^jeu~Tv<S<Lp*yaRF7Qs^hfm+*w>$+dtpy1beQ#9G8r_a)DD;Fjk
zmj_hj@zTiQ-g}BERq3<Bln5i_(t48e@lMj#^?3{Lj9VZ?J9tZ{E@zMHNENmJacM@?
zJ|B)Oa4CNTcc|jR`ODnNFJ!O%0XdA;ob~4dNgVv<m=3Zj>O9;7hv(bnT>g*e-0!bz
z#Lgxvxr{YjNy>4aa&`({-3V**FFw4HeD3h!Fz(OBef;5%X>%;FY>S=OPh*^DvaLD%
z75BrJKoR>zi<h{J7;v&{XG%0pqVe{uF*nJ!lQUQG7AoJv0xCoWN|;BPbrDuvBJvAy
z!4ozU=H`&RvgTiGk%sf55{|dP6*P^M2y3KTaA6C=++Bkw@BPJ`3zv6Tg%3!?DI_P+
ze*G{Rn3d=0DZ^H#3cpTCvnF6N((}><<L0FenSs%nXBzg9>htK=Gm>AQJqs`u@AA0h
zMlMQp#^K8Vfou!Y6*2wKUw-o63E}5U!u)C=;wvgStc_&Llj_K41wEA08Z9qoO+Wh{
zON&`|y~t@vmVaY~wkM)tnn_obPr;5^j}rPdYHNL;Vf9DjXl$Ir4Y#`scC)egj<p8u
zHZN;zj2Q>+5$FWeDO|8BVCbE&S{(Z&Nvnb_q?-`9B4bD?qir8#cJYn^S$S5Eu%*N<
z&%sSM43q(`Qx>j~CLK39hbUNjXRJwLzL*xGpBz<ICoWrYK*D%xleyMhaXb-wl2U%*
zIR#nR2i%>iqfqLyEwg4kRl$QvUKCwk<QmewHv74}x1FtnWFR52S_L}&Af&XdC1I(x
zlI&t9g^DZtAaNDu-Z#idg})trYM=>753A_%eNS?jyXC8JQpMd5*g%VLpdd$I+LHKD
z4O?wg>TxWk^R_Mr`is{W+fM@xRkFm_B-;8cORn<EBbf!95vHoz`$G_BOHXs(&j)<k
zP1dA2BX6RlNL>o_Y~1e;eL-c^P_d^x@hW)20jxI3CWXKySg6f-Uk&CM|24s>7(IYw
zz)UNzU2xI<3m)bWR+25s??EfSyK2RD`6EyKISNRA;Nl)IQUhf=6O-f`h;drAvWGvE
z1t-Kpk}sGPfv8Eg^e5D!-oXl=k6#_1kE8^WH<%;?tR%oA>Q@h_L~XRE`6w3_O6%e*
zQvEXmtNMXVSKL&yJ%VLg1N1<DO<`8$Ed-BfFU@yWdmt(x`GAeP!;Vux>Dw(ekoy0G
zg!?vHr$(A=1z9N$vm%?_{G?t*fNd?09`$n?VIhToKR7yj{2^JK&^oLh=w+UvIen~T
zOl=z1{V+I@<(o~te6*i`{>~A2E<T-!A(}tumS4bgw*XqxYv+6^4jo6@W-%8REy_T$
zql2I`7tfx1cCmVq4RkfW9(`7EJCGINcHCn)>EX0^{9Q1`vYaU0d8XHeR>b~1#ZQ6r
z{*Ug{C2<CN+;<CQh4i0z#y_adfA6BW_*b=AeIg;wcvo5lWt2=X`{S<;%7s#q)M8}o
zhyTa7_w*#qRU|_0(Aczjz(^l1gBxQyt;3Nu4}Ts}fVx{?{Fr{2-HJx!G96Lh&ELSc
zY3gcCNa&zzFmCB+Y7<t=m-Y2NZ;=S4rLzaCEd|148NbJDMQD=96*G!n)V~+RS`^~u
zG)zsiOWxxK6@xF}DvG=9LR{K!V&n4mO8B<={dP7UzcUfFT!q>V`T_YTQgju9xQbIm
zYakqe(ev3U`Je>Q>JYf2`)xo7Yec<vPxIKoldll8+yNB21s+CLgH|xghcrW5E(~a~
zLf?bg%C>dOp{afM7fn)-ZRw#iJ=eFT4Ge>qm3{dvh9tM_q?et$hmt06&J2yytmtcq
zEH|cA(5X9#uYNz|0?($e^N@#o4>)V^rg-Brul4X7XFJVFTS_XfrKvpl$BO6J-3-to
z%!SHP%h91aHL=WU<sdus!d$h8OY2kd(ZCL<efe0zIU_aO_zm|<4Z4F=aMl<sX$FD<
zb}0eQC%^|v#V@j^4+}6w-7!c~jDl`r`5bxkbYHOqLZu|6pMye3F-6VN6!E3KF(Ajz
zM3TB21~I<d@s$??l-`(0Y*_t9(z~(YFfj4DEcw_GPW1+j0Hp8Bb5z{|YzAB{1VDs5
zQSd&!RRvb$<2>{Zz+pNNRCXLVBDVsR4;c1%ozXfBg29`;j|m7qjUtT+EPz#+u;g8O
z%0j0Ro?JHeT5)X!>ZjKu)vhS^Zy*4%KJaYb5v%~cV7wa7luG~r*xiGkGtLYEy>L&4
zumH7le3gIzuC4G^0DjTWe@%?>Swk!6xSbZ^6gEJ*62;ZZW~!T}QfYFv4K`d@GMhgq
za@og9oM4V=>Jmt)1>q6snXMx3*`bs2>$J;-5f_<Rj~(tA91gG%!@G*BO?wp!EA?3D
zLVs0MTgdJ3u!i=<j||D_&VSB}2bZ(_**#64Q(yMxhbuVfs_jB=e(0*T!Onl~cNG|5
zV+gj1AoW<PI#qiAs?b^oK}Ygj$nLJP@mtu;Yv~mncwBLOYF2Nzm>+q;v1K^px?46*
zA{Atrx=`e~y|6#gUooP`TFp<50^2u6@j@R{uQ;ECW>8H*9UFfmC0~|yIsVS6Dh0ke
zuR{LTfZLBqV~-hr<+=7;Iv``#7)O;oeORDZWiBoMF`@G~PvcKb8$g$l?<y_h8)mng
z7@cKcDl`$CU94|k-zD|jyex=T#{l;`(X)KYnix@NfL%YOz;+){{2{-;(lp@iY)Ibc
zZYd7Te-kRmBbji@W*uDL33dq=z4V>@|9SY<;=4tl=0eCw3~26354u5s{Zm>04(ZO0
zM~=uBDVFfOe-f^AY!=UCu6G(t5JPVG)*MO48*~VpUdBo8p>nUKATzypFOpMS^(YIH
zEtQ<JBFoiakLAMuCM4O@bgyl5$0RsE_8F*E3s$$a{}elON!CqXOq)RvmM3YA^AO*`
zsVm?Ks+3{R|0VaN7%YTqM;!EL5XJgb+{wHCKPthPL1{Q(U;qG#IR7=AvHrCw>_YS3
z%`=1rv!RkJri5?vED~sy<X0DduY_}DHAy+jB?&Fs%J)tkvz{X6q?3-0h{VvJqmlcg
zq2dFC$CLK%pSzSZQKZPK<B`ybSuV1HsEqQz_jdc=S#Y)UEmRWFAgA2>=qAn7((&nh
zKTkU^R5cxCb6<t9dKFc8Jq@ReX}q%KP-isdDwy@^fqXI@;*}x4B0}YePKfP~HAk*o
zKjh@*`>lr~+4f(pQ--Yv_eH0nG!r3G6d|9~?;9&wv{yr_P%442SUuNCUUmAd4~cX6
zWbt1wKk!TPdw)R2hI>HX$C7LQKw$|57tTplDMF=RpjYKYjUp`3_@o+_^dMjG%RJ0x
z_zeP?;cr^{n(trc3+>mJIz4>@J|*HM&m6CxGmqy!;NH$)IG1ya3^s@7l^~|WAGIvM
z|9S~%&C|o>cXfj92U?NZ@0t1-^;Bwfek!A!&KOu}swzwg({*%vJWjzeiCn5LPv5O+
z$&|(XJJqSsfmeyO^^f==d~{8cd{aKP{7Q_Go9^Mau4(N%LQWSBv>i$tokQrl49TnB
zC5+t1ZFy8ro(sm~Ar`wepx;E)n5ybXl*_E4{n9-tVI<mb+Ource2cNs3JHEahc}7H
zXMk>u0vv_9R0;{AK^KZyh~O99b?2B?d}}7`7)V;i!uMN>$xz~O3ksG4+!fNRDBoDr
zQ5+PMxH)ZZsuA2S#N+h!7iB?3Y)xr{mi&f{8)o!smKmgmLC?_d0)E*r;FTgz@~p=G
zL2{pya!?&Rua0xKz}}kF75J=K*R|?e!ASRM!H_G5!KX9UNSgx{oeDqcNRTAp9-;>y
z2jIJ3VvAmVzdqX;Y+9i6J!(qZ>ev_5v0debPdoun2sw&jt=w`^%aX<>CN&$x2364-
z4M}(TsFheNuQHN30B_f^w251E{d8cfS_HAz^6T@Ho5uZYv<@uIKUk}5Lj7z`*Mygd
z)P=FFXTj(wrl`&1EAnBN8%I}t#^5*r+pyNm1DBoy)M+stDQQ=gS-c@ZMx@#o*5%bX
zP;f}?7gPGP3x33bIlI!444$RLQ7Zc3R5Q;)JE4d@XPGvL4JXl*zZY!;g?kn7l+1@u
z(}N=?<GTFh|HUe9c1?@vq<Gp{TNQ(gOuvENai(Gx7OeuN-;`$kgE2@=0~Vk%B@DJw
zDFIat7@%U`2RX9CF?bQ6_soDV;;#XM_wS*W`|m-6>#t!6u>Me=l&qrd0|T)r0`O&4
zu;T6-dOQI#(jy)NQ@j2`!0+mJMXNjDhjUKE(*;$CelOn<G~sbov>jCBQUIHr8Te{G
zo5%dw<a|5ZX4Q1kP-&!=uKV&LCe%hQG*#Y(HA^*Jp~BjlRTjSBlj`D2omR9vM6cs}
zmzq`X%eEEkuQGTQer};{l;tO{st~hY*j2lj^s3^^8;b{6E7n=R{icji)c(`@W_pvy
z^e+|4hS&uwD@w1L<=Nx)#~H6K*P_z<9>qp&g|20*6`>0Gz>e-$#q%ZstMxFcW*3M@
zeR6A;t`rG#0$Y4>$)6=$RQ6JGF8ntJzQ_pCUS_ig!zqv31E1d+igPPxp~64f{C}Fj
zhHbaxm99O5De{x3&!IP*ZjmvlrXN{n@zH>Ee4pZhTyF#|fOE*Nk}r6?1(NBVv(pw8
zMsA(vFq*DwX<t25F}s~dmg}NphD^Tqxgy-Q@aQ(S*;*owe;Or`MmF7`xqHLy`2{y<
z^#QajCh&8Xb=#t_bC9*dhWjj*7wLi63S?7s0`^T^?EpqB<$5&+_Hx${C0cF0xj!A<
zKzDuf&*eFPkN4Z3YwQcv#gye`u{h*vYV+&VGd43qrLdkpg1bvY4!>VcBPrYwKo$H?
zyjB;S)!aBiANfv?pfL2|>ye<!ZdyI0Q=3>k8y)Zy^BOl1GY6WJ_i{h$mMC9|kxAtj
zH?dCH(4D2BVEKeLeo0%i|K`T1RAGl+(@F!{3NO>5MaH-A=m4oJ;cbNqg1{~pC)N4v
z<05i~?Q1)q7@YWLTfrx?{!QrHPVD^F-~KB*@$WNQ-*%$yJOjd4%pkYMu2V{h<T{BK
zcd<oa+%VhREkMqG^&HOnimip9L?;>#oP7VP<nT%eOSsQi?)~PgPfy4F>x1<+Lt~-T
za)5l-jqwiQc=#_%AMQs!y;Pq)LK}t}2IMTcF5tzj)r-dp<#Y?R?ESzMED*ld>6?Xh
z4B`fYSz}@BO8KB7oTVip)|K9xc37%l{=n-H(7!fgv9uoPnbqk1pkF%Am<=Xa4{@C5
zQcN-gzCD+vb-9-99?Vox3B)%0aFaM90IVv5ElkbA<vCDUgbT`N2YxUfq8fAn4zQ{S
z&dSi8yVFv%?0LDWN}%>A&gvVwlFDzdhvO2aKSASSeV!16<qHr$YUv`+x?C<#&)$P+
z6?iKHr{rEC$JmX!f+ebodyh|;I%IA)Ztxc4K4JHRxMFYmr6nT23**H>3%|=-u7(<T
z+D71p<p9V*`;La!w(>YlV|uP<I}5zW-Q_xWJ9loivk-6c<fMp8bA#>9O*oy#M(t#<
zfCyx`50f<jd}6k;#CrHSg1X>h1SQ6}@KZb|9kr6oZ4^SJVlD4k1G_w~0_X(D-UI3a
zLqhI2;qDBI`h7RvEE3+VmKkJI93q!C_=}9`EL_Jxcn)<S$QGo6AYhk~qY6gg6UfGY
zbuCaRhzb(dUc5@S*Vmyl#HV)gqiJ-41&dyaUp`_smVBU?AAY0_Kh%ORqtpY5Zr)BW
z^#m#}qdni`XNT&8qg7{W2^DNV#vgu}uLBUF+NF^xsSUGYlzz2=fliT;(TTNRY>c_h
z8hvm-ZU69b3Vq<G#w*T@Ml7O=qOTtvA=2M6y8FKZRp%BDKMurCxW=az+9#BL5=nyj
zWF1KJM;}4^@?eP#NG73Jw71f0+r&GCT@^3>Ie&{~@9F;H5vpnrEXrlD-^jJ%h%4fL
zX(<o+%$%9Ot(7>Mb#XWyiFDpOK6F1wEu@Hv^hiLTyTb;d8{i<0Xy1LVJ_})h$c33M
zA(Jw%V<Q(~5#;s<rsPv56}=3x=QtsIg=;8d2k94^BgmGq#uVs8F!^$w3mA=&$7VA;
z<4sBB=J;t3<_7lbt3RA%14-)~oC?j$_dM<6o2&hwe*R=v(+|Y&2hj3+WBY&g0RDYA
zuu^s1Hd6rMXU&4Iz(cwLT{GZnOlw-XQig`h9U!*G50sV7t7_d2)uMN`H_q3DX}Y*Z
zNh;BDeQOL;)8i~=Zf5Symgu?xorR=H*<)j4tTdFy%CV`9&i2WwUX~z{A`w_pcU!z#
z26^=B?I{hCj5`?I6E&>3z%YoZspuq{dK*P+`^Z5^leFHJh&wb%_d|Bqu{rk9O}1uk
zEZGaR#legHSM5Cu)kV;0^Ew;GcwM9zY?6GRy0UbP`nnclBOGXn!on{)Nj!|aE?a~)
zR@lp7Hv{`gPy+fTSqp@P0Kz%Eaf%hv5e?q>P4Vd^>!GIWSKFcJc9puHQ5Mhmp6nk|
zy|8!}X1nadWGI)^TXd!rKu}R18z9ASXXpV6LEp*ITt>IXv~1`#FKC&E)LT#X04g``
zzl0VEJt0SMcZ?-|7%k=j*_kW<u;qva=%4`6oHlo$h7kg+|4|^Um31A`3DwONZW9&r
zM@eY^Cpe;_a(XYmpA(D5RGi}yyl^K#J8P_C?(AJ2M_Y)mjff)gffgQCWgCzPMY(_=
z>3A&5w6I;mp&Cw6%%S{daWY5Z%3U@C9=Rb7$HdJvP6W`edRqs*ZEMc%6)AIY6bA&1
zH2^vovZY0=Gh3520}%A`RB<|?_1k>yY<n;fX$rm6ykg=*D2InbO{5`&h&(kvPJ@6E
zxme$dJ=pStB&pdwdEE`86D7$l$a|Majw}XqX7e@3SN*k}LZ&_VsLIUt!Qt2DE}!8@
zHyAwwETJ>?vbjzxv6$s%&8*mL3mh#?geV&RuvE1Z(sWkLjvVgC02Y{rk!DmVPXDv5
zwJP{$etw>fFD{{)vlLt~Vwo~#z^R#s&b2$NEB%BZWOi7jRu>=JS51zlJ#VJwySn2y
z3#7ramd?Qb^nG#A5l<Cgs<&joVRWPJDrh;-ssl+u?T&SHw!&=&v&JRr0nA4)v(St-
zNc7Fe0DCUDw=xjbGi`PiXoQ7cbrW>huc(zFxDxmXg8{*>(zQW@(nsdYF;PQ&*MmLw
zf9#3^F;4xhK>+}U{u?6C--k>mHMGCC6rz04&+;Oiwfs=F!~8~=K~aAhq=g*NStQ&F
z;&3WauM0}BG@GW0^yG&G6x?+OOU^NIUnEgZuJ#$H8uwR2-2-z`kL<d;@qTn&=s6}s
z3pGAH&D;_r?s^4UiS&f5<cIOK*|k6RANTU9sx{N|+VzIg{(gm(UI*h^5Q>jn*<6~l
z0SoAr1*GqG)9!*G?_>Dfl;rw;9i!ZGE!fUb3CU{rJa+dynKKPRF&gzW=9DaurJRTc
zzFni`YqhF2D^zk;d5ZGanIvqe?h~fIGRtU64BY|PEA3!2iIO=52`KhPqf&q{C(>kS
z<`PsEq5TVsNy=1nhiB?~vgP2_cwO4O_DRJF<@vQ-Q~X$Ie;PnAbe!Svd|70QT5|69
z{v<pQ5Sjlp5!_?DBj*wEbf#4(up^UqOC0(@w61UE47f`BZi&{v-3sc<IVU%q{b?3t
zPEq&s<PaqHhZNFWD;1_2<@TCQ=_|DmnLjkA6`;R&(G$x1PJPSWF}$mOSi)?I^*iX6
zo|oBkD4vMV`N-5>d*+~PONfDS$6y-;uRLlntMk_3)jAUsa|Le4aVU{Ng9UTVXL-<)
z{YVtfkfhf?#DEbZ@ePt2&xeqpFTUU2Rr`1x4{!`cnTp!i<~hKqc5^4_F9LV)!@n_?
zGbaB1`;*0AjAoFb6yzk6C-U4dhcq|dAgm-^0AG@pLY&HWw%H)v82t=4A|d4zcvl3g
zX@W*s(DTMLtmYPz&ZSv@y3NYbwta_?98$NI2b35GljsCBahsW%Ksw(?Hoj4wF9v7X
zO>epz*;#lQ+T{rt$2}jr9E$mWarfR)O>FPKa1;RnA#|ihMS4>K5d>lb1O%iv0TGoF
z0Tqx4AyImjqaY|P(xpZqfYKuUC<lSigA$UcC`A&5Fp|O@zjv+s-nH)et#$8v|9IDW
z1I$WhCc}RA-cS2HpV@Px;^RzI&=LP;oEO5#)$Tp?gz$7VQTbOb;@sXPV(>~-yJndW
zhzHG>#c$nf;)8f>?00i3{IDVd(d9UM@IY?e76fr3VTb<!M;hRXC(V<l9|V}~b2xH!
z?&OGktW8L5NKFi`I+FbI>qFSrr?t1of@lJE9?#2)8iN+&eYa+0HzzHI7Fs&JK6(u!
z{>f9Q^7E#3M|^Mj?Rf8C#76DSLvk@y4mWC}D=+bW-g2r+d0uqr5apPl^iiz$E1m~$
zYcAB5s;XG|Tyhz-Wjt-q)K)AEn;V|n;4>+@`FSJDNRQbe_DEi3T)tmxpNLc<xmHS9
z%X#kJ=SqXnwyuQA%kQX<swif~PxS9t?aFD6K`U*_+)%MGF)tnq`(>K_p1FsLB_Usw
z3&ZZ6SHP=e?w&^o`+dIPwBjxMdN}&UurQW(;p!p+5HVPwCzZ%~FE2#DSQW+|MHK&K
zhJoVAbY3*jfnvkI@)#7)tGevI8~ygB`@NBk!IAhM8jpz|+BF?DOTLTFEX1=(r!+5w
zJk?iH_>__>`0`Vf2+Zmy&oa&*qtMwKbZfi(a9j8In<oBK-xib}vABnRSOKbB@F9I@
z!N&CZ7qqoI@4j=K)RTf+_6{qq^`;zpgymCvcF1r!Zu$Jx<`XwBh0S+L^yQ1QX1f02
zolW~4VSTEr^tl)?AdRvQTQoZI%5pW$_+3kGS?7)8r&BZJ?8CE!6bv9+6XQkCu*-=z
zz1zyUbHo1dHmgJPZ8BToYjcX3Yj2~ks703hD`dCb@U|VZePuu6T^Pb)a5ntVsNH2n
zkjR*3eEk6f<K3U)TiL_OH%;KmAE3UKwdDC`5`HVT{1l!=d)q$zQwVt~b?5x{SNVg!
z{<!?xwf1MxwuO@iU;kd)SNXU2hmeypG6GsAJL*-=O?tWR+RoL-UoR+puivk|_*(wL
z++brJiD!*hYdSBWKFRy1wneM`2l$f}bY&jzDbx5JSgp5|g3MVZl}&*oaYn-qv%BDu
zBL8dXqsF|ODhrzGWB(hb`aidS*L$q~p)_&-JEcjgu){Xh84!o^5RX2a<iC2<M7nn2
ze<m~;=LUJ(HFrGX&WT%~O=mY0e*XM)a_sNq3=^G<$7l7D`6w1wRD&4?C(KR%8GU=G
z^}0o0bMumNz%YXLX&Ohk;^tVaVsy*BQDskN*`v9?J*+(~63!~(kRMhunyI`f@4O>{
z;=Y5Ehs4$-t}f5Nl@sYc`+?XOG@Ic(lRt7p?w6-5F2eV3z6<nB>%R!Tm97mFwa(*U
zng4-fUFVT^KD;EiVD>A+)B@F$$IfJFjhp&j7trt!;n@z>o{i6OsJZf3qwcbZVRfU@
zzU5P$b8OvOizvM*;=*Y6q2Tz-I*G@F;P}ZY#Gc6|fBL7NYFPti>rS8D^R@S8{I&i!
zWO$t}Bi)o5u98;R3mlWf{qTJzk8AXMW;QPWU?0n_ciFXkA|gt=;lp`Nefw{|bvHE=
zDvfTin^Y|XV#;>1Vi@a|38f$Q`!!q)K&M?^@A_KaoKoVaRTy%1J8yr@`SY)I)qjZi
zWb>hC)$?F{k7#oH>}bip7j;@w_~OmvK5IT%uztQaUY1RVJ_@kt(mZ?_CjWPRp3s|-
zx1-wcPs;K>+&@xsDCflfz0xYE-)aY@u~vMEr(b__)at=P&6Agv6<!_7cq>BZvDHsb
z`&;N&|6cLq`IDLY55hUxxGGOM%+cnfrq(L=$3oumI^Ijb^S%@HsWp>4s`n&T^`Kx>
zW@h76v9$voh~lTR<BB-e#~Jfk&)e9hXSny;dk*J19(KoL#?m1d&+s(!p7R_Y@w~HS
z+{p8F<FlpO0fmJ&9<rICh~#+FB>q=~YM6^|mBrqH`0Q`*hPQNHReMEAik}>~&>7IF
zwA@#obZ6>}abtGN-x^4_+nFhcbV}mhCVW0)Ta-LYI=~Z|bNFJ$vI70(y7R{MGygnP
zoq>otdY#$cf9%!XDVl-6=?AIAy)pb}Mg-5iZGnhY@@{5B;;iofI3NJNKOpiT^<Iaw
ziKeoR&_Sg=F^!liXVEV=Y*zA36zmkC9(9lI_YdvAuN8jChsSU4Mx#gDEscuO`}Hqv
ztWX~6QHGRNLHacf#+BHE$?6|Vd+wb&dK-n+{C?QKV1*5@yg=!CelAGOqKS*2xUHHr
zE~3(141V^Ceeu_;K)&xwTW1a_b}0en39~$Lk@T3W*&>UppC5kQmrDr0;_|G7Og<F&
zl3Dd*(`b3~*f{^mVs@tW;^8CJ&?7dV_2tU!gi%jZBst%8FGZIqc3jr0b-2=Zb{|v%
z<1qYdoOv@MLg7-)cVMQ~xO8zY(G9s~-*@K>u3yh`!Cmb|?WG8F2kg??-S#}b<{^zA
z`}wCHo|h4Z-s2aORAG`lV@O)Rng<>+U!QrYyWYTiO~Zorj4*TQtw?Cv3wV$|X2^&m
zeaX-J`r}Fgwf0pu<8&u7Za_<6WzUYqB>M%5n;FdXKP^Azvyy#2g3k{7eIE^>k5|*5
zjL=#5xqekHxg~CH=H>l8#A3Myau4^{LOX7kDOy+i*D8<Mt1&y_lv<nOmf0go?7~au
z#tQf({x%5=Giez&8jct<Ty9)HHE9jf&1e12(9=yEwO@DK0hEKo&9BnuLoG^%%IZfd
zyHZC`<5yx2)c%x^aeZD}2sOEdb2xb*b-J`?{dAtpRmlgh4^?p8`}#zEy<g5~N^E#W
zwL2a?ux%j9bvAVA9lg|IVt(oD^p@S&H0EDJ;Q#YgXZ}+j(7k)(R>c3GR^NGp`4N!^
zCf%<<CcPxsQ@3wN>O}_K2)*&&xBp?uu6a6ygD!;tor^vc86Kc^PmVt-eXgmn!A^c4
zEJZ0eEziTb><rO5PM$=2u&2MnKYvf>`mtMD<9j<Jv8vuF4Lmzi%!@Y72S+Xv`4(Oc
zrd&9JLG6-h%2Ax5@@J*zj~xACy-?@Za6k2^IW$PTtvj}#`&1Zeos<L7FZrv@NSWB#
zw8f2`WmKhKyd+xh@L4r1_Oa9VukX`0{SS`RnW_l<ll3OGDc)VIao=9po=?vj>Ssb$
zwT@S%rHCu=%y-g`ZwbhSJXv(J?;2h*baa9Omkt;NQ|ph-U3Hq>Z=B<!I)3<OC=}So
z4!WqM0X?oQbqU{H@cX`&_pCepW?O2F3c_}E=WBk$i1fZ3*M8^3^7!Cxa*)r0#y5G0
zbH;05-0b?aPz>LLIG90eb8hg-9r-PL`<8vp*%|oG)|(9!s_o{i^!2bPlj56NIfLSc
zhgfqSZ^Nf%<QM8h79VK$ym=5F+1B|fia-2}G@<9gh99EA=t|cfi%iuVUi`fv0@hLQ
zP{4<KS`>K*`Q;6|v0zzgs_y8ue<WJ9XRfjT$}oo*{N-;UQE^j4HUL)kIzv`bpjLLt
zF5XuE-rxB`=L=+>^t`AEiG@`iJaNr_V{~ML+=Y6y*oC^s1_l{4>?5qHF`8Xo`9=HV
zptXlDu(L46W$y%R4*v@Gg>%Z}^hN+b4DdN03&-)D;~Q0p+4|B#_BD;&nyw~bw()y=
zbm%V+z=+Q-My4o~@+c*Q4IOLzeEsN$_G<_Iy?mRB6UEg$B5GccogXLk==UFRUoshJ
zU5eWyRUwFs*clf?il)`Pzw#sKo>sczlG1X0zeS}~Sg@~(KD64}WAw_Uk=Hz#&^l!F
z9V*anWLFwV)O_+O_0+W=`bFvrPZHHDmVv~-1Y^x!Of{`4ZSm%#>c@G~O`f?GT)${#
zZ!jvePCUpyQ710SUZ1<UVZL(Pt#V7mIP&pdQqNrX!;k)RoMvb0daCh=@xXB{dG<*+
z$ESZKJg1MmE^%4<{@1ZbF?$|2hm88{*~u$H3)Nb+E^N<f-+O#+!dhx{;n}YfcT*g|
zKz>ql?SZPJEbC#>nbd3eF;^*6cBgC^xfgoMbL9ss_tX#bZn=-;rOHIpGn@aoxBx#@
zDfVoUo3}F4Gz>4#7T;Bz{M>vG`{rJ_kk@n5PBl&DiI4A0K7BdzVwZcOJE8iW`NMxa
zKGx~hZQgjw<o<Gv@ig~Cg_h(8yqxNYZnkR=?xj^K0?$sIysU2-W;>Z^JC_)5eqbf)
z>!x+Ogr{6@`G@OQmzK^w9F;rzdMC9uNR%Qe2Xs`YC7w`Lw8_0Iz;C60#oC)j5q+`N
z)KUXI+IIJ1Jrj8~Wp?v6`iJ`82W!qnydITp2;LibqwSuKvS3HIa7%~u%0C}hEDf9`
z6~mI=lLo#%5K+~0N$Bk>82@zWTiV<AtGBtc^^~OqSpoc~hU~lT2lcL0WL!_=dGh<>
zYWKJDjyq0DT0-eTX19)gJO5e3P3sOKxOwC4y0*P$3qspJKX${XE7ywt(e}*xb(7|b
zq9X0HR-w4*E1@xe4+k&E_dl0aqDgk?Mx9Ud+%MzMBYtwt!R}_xmx%DTyMrH(zU((l
zZDqyx*(+X5ayji`dD7q?7wNs)6GipxI=f@578;iwvp%Q}lKA%Z!&Q0bCZcp?v(s?z
zKj>x!EzHl>=l-lYa!r)iDrlgJ3Q1r^y^J^C^Yr$Xzh*r8VXR4LsQwq}W$o|p+XMIl
zhmYGT7d`bZF1xSl{aWK<(t)oHN0q+0X>>aNVAnp&B^+{n@6U!iTWqM^7H*Obf&#bg
zw2u*=6Pd3si=}AhT25^U1u0xwe-QMr)Aw2Oi};a{Uw(g`=a9b^Rr(X<>~mHgBwxHd
zm+f+N|9ndG^|B4gYg@NAnvZ%~{(Zs4`pR>!CaceXy((>dxEd3_eR*GLSgo}i-+>?3
z*>5&H`LfP^`keb~wC4KYA|hgG;|J>2i3F!G#goEzVY}xB%FLc^)ZXj%EZOM3VX-;T
zH5&5SbI(6E-e<LJS?!XPBJYL<4<R|+$q35Zj10N{s}rJ?x1T3&!sd2g(w}pFmKseR
zhCd*sFP%?T<@4upj2k?ubW!Hw=~MAT??gyfi52z1Ei=~MI^=r2*ONLx$+5P)HEphW
zbyLRIe{L*)rdr6IZg~}eINdfiwKSijA$Xp~J`|u3VE+6mA%rn;{`RH_3jXYbirukZ
zBdv#}DN4^wAAD|C7$r6k{wIGf2)_UGIt-y0B;7mq7|fHvKN)b(_3(hpcf)T-fNm@*
z^jesI@Gaet+cyGt=XRGMqBfS+mXJMr_CT(JKakx8$oW5?{;yttF7khMfj?i~eG3uW
z57NIYdG;KE>=oO?BerLE00M(R_VE7qYyXk;&uh<Ko_)N0{QCt2g}@V9MIn3l@bK*2
z$HU9J59Ds`i35*A_KER|A3lDTPr}ik|HvK56AvGk?pHq7)GOsQK|iX0?QXn)ptQ_E
zSveI|HFXV51H+R>r;JU`U$C&Wx@c|V?BeR??&0YbaQ(*3z@Xreh{&jW(a8HT35iL`
zDXD4cPjYgf=H)+oUQkwEQCU^}vZl7V1>f4%{`yTvU;n`1(D2{y-V-OMre|j7=1CMP
z?emwf%iq4QFn<2p++uD6zjyw~wFkoUKf3H+<q`ws+PiNb&p!S?a_!k0{YP-IeY}T{
z^NF8z<oCZLapc6q{gUS%mp1hZDC;}XrLNtb05KQ?iVEY8X#bJye^0RZ|5uXz55fK)
zxk!)$JbS>w;}L_vA>4VxG17^3)LE)9Kc)IDCd*HOO0U92QO3l9Y)TiFFVaEiJ6d%z
z5kp#3Oz{9l7}=B)h@HF)?V7*_qolIgNRF1PcenV?jw0gHbR<iG3<SY5{5AvNtwvla
za+A?6#8MH~v<w$u`wr)YlM0!iKYm&WTyJ2Vx$xY~er`hf3TMjVY|>@^si{3b$Jhq2
zLEO%9EU9o8BDgUJZp%fo|K*Bk40;5MH)2bAGba<Mvl(bb_Ek<gmk)5K%%=AU0&c^7
zTm#M{b7S^h24BT4L>hh#J8#SE#=3Me9;L8QZ2rkoAC+B5FabD2o%E+0>sQC`LMTV#
zF2Y1H0YPLJOy8SbNPN{UWa2aWEevRYleBgrRfA`gT6Q7A{0sy)Gmr@$?dxQ!?n1ux
zmviNy)LG8Kv30v$NK*6~xSdGx1-F9BTtQF@tH&$|hKk>X1mU=aQup<S;0l;tEBMV~
z7-e{Xkcz>}tl2RF-WIg4>$8;D5)914E?Jf^`yOCL{bbcoz%Q@+)BTu;=~f3}_W4=(
zQW^Om9wUPAm23f8^)m3wQ(QS@<FM14@$!iuw;IZ(yd7I0Eah{+7Y5vYW;0IfGA9cY
z3=^6QiD|@EF9L-=46C(-J7vgwMCK29L~*o7JAG{3ovJ4`Rv9e%_fB#tG=&XsT)j*3
z{-?k#yM~OHcBag&>_Xa9t65ga*^6;J6ln4;<R)BrMTr+9j7;&GX!WtB$_$vn+~_`|
z!&(00QD#Ea!2vnlCJI4m|Jp~OpOTWY$#^jw(qYh}_j-CG3N94qZJ`xA^(C6s@wg_u
z#oP*Q^EkL>GLM)62AZ{qu%Hhj3#1AaJN$MgsLg89)%GEVht)qEf5ILj?m|Mafci6_
z=>;I+3yzfUB8hE0Wq$bEq9_3*Kd^k+5?K3;KIc_)CHyQ}4cDJ#GVIOJeO~>&N4{{#
zSGSE`f~!CO68vhPVFh4L_QVb<=@Hk<-c+EcFmy+)tZ;{xnO(?IUVedZN-Z+%2P3K*
z&YT8#>aPSTX_;%~oFi^RrN8|O-Z{VUtr>~3poNAHEQ)qBx)<{+x(e){Qn2F4JQbn}
zM+f&VJC%$ltS8Yh3k8^YC=<nqUYn-pTTOOpF<67{z9zssI+U6+yl(&7oDVq76&R)R
z5!W}N`?yo^mK%*liz?lm?l6}Srx4OJVgp*Xch6*$0LoQmF`-z_{*gRyN>rU&p4b?P
zT6fA;;+drfytePu?7?&COxK2I=a43+Oj?F?y|%QjdiNT+cD&BK5jm|+eWR#tsNQ!9
z#q&t#+D&@!kSn&e5_N!<*^rTVEbD!yMu6lU#7t!PBR{%hocEc6pW93!Ab=7YXRd+h
zk?NXg#V+vl?<93&`#5@iz9O$Dd$f_~9qHSBpX}kOM!w37o;sT1X-nWfbqbKF8>Em0
zNh?Z6W=xIa8oj-&a_+D$>_U1khPB2b?DV+;gr!eL7nJrB*#R#R1-p>-Iyw`<XvC#O
z*6AQ9O3x{yGL-3$?&iih{rM)zb(O%A?SX>ueh;~!<#n{?>#h`siKxYVuGAP?%7~k-
z%!KSh27|d1ew>q<aLN_%A30;b{e)kIvfZ%jQoE4JYjvMFUo?v*w+|n5NzspFD7-<u
zt(nds99`5IG0A;}gN{8nw6yWmG4Zr8+=w;7|Hf`ea*FM}LaLOUhizrXY-|MVh7VAl
z_8qKDtyB<JpLW@Wv}#_{*PItckAsuPn*dlI#Q<u72?pLUF`u*k^&{|g?2;W~01LGk
zT`W5~8?_5L;EggQvdyS=YL|!&I=X?c=+y&--2s<p4NvRv%?QP*MClz2#@s?#FtSs-
z#LB!@dSn4-+D4|#wp_g&3j7fDlC3*w*eP!vs@HgZHmuQHhiwgbQ-+BRRrmyl5_vsQ
zUqonh&0Kd|_8otv%GheQKpcag`4f<3Y%@*hDZ{WVc(c)vLoyjp9>&SB1!p#~N$_U=
zA%_Pgr|8{7X{@8@$z2FO?=kh0-Vj=k(?JMo;M<X$CIr^Og;snOJApX)#qW#iZ>Drw
zIVYEG(uFoA3p9qGoUnUeY3)|yOq$C1nx*kF)>J#VKL;vz>b_;g_bC_iH;%JKcaZgV
zCLSLC?@oR$K)rIQ>2=EVA6Y-NynbA_JY&G7@vu!%h;vklbU`_PQosMQvxMd@M0*#~
zf0pe9dIVWc4T0XsNh9nW`?d>7P5@d#C!^1D#!!4%5|~Z@+0i_>C?;I*5eA02<+mRt
z0!*@$QSv}2>jWx>?*6tx!Ww8`9YS3PwrSatR}h$HtF>O1bqH5y+BA>?U8rM5jo<Q*
z^JXS>(R&Ih3l1qnv%F$TZ0NaVzp2J3&#O&82>&{-YI$Zblownb%tR)4iVV1;2gsa*
zk(mFr*{kpV{Q@`@V3?K-s4uTgma|U*`3xvCo|6XZmV_Ip3KKw%N9)mz65#%XN96r%
z@0UO{vxrk~F2o&;X{(_SQe&1uhe(sori>!`@-WhsjC(~p*nS78xfOR4$r!WIr+BZ7
zkds$op8xw@IW={e|6M-3&9Bdo|5Ow3o3E3lvpTz9jTi~bHd@?E#17=0X};nMcn|dO
zFop+Udw?v1?-v{qZeCUyVOyd57QZ_*d?%qyK4j}K{1W`=cW2r2ft-)wTzZT;&-Mt@
zbQki(mb$?*310x+q<0`QzOfmTy<_>D5jaRbU}{JIMI|R1zy*M_sDNFF(Bwdg5KC{3
z)!;S%z1oxE`#hAPJLXJir!ST+6>El?{POMu(y9Qp{*HdXj~+-FA4YNw1*0$rxaV-P
z;h&!Xo!k$F%&)tUnR(8Ox)ozF6{~a?Yvsps#>$V+SEMf0m6z|W&#b7^+IvuZ|Dkc*
zKP3#C=&T+!_1A-K#l!^EcdZDU<4&j|df{ZAt*hF*5tT0EkaKq@s<5WYn)a&~r*g|j
zoQ~yUWC%;?hq9C5c;HaHxej`0DgM}57~LWpP&s*4F%>sXAd#6*>C0=S^Z-|&mFk_H
zGIPwT0l@SXxK%S^sk6!EJgCFOuCvs-E#junc`9}YBeSq#0`JH0oJD@;G*R~(KxKeO
z=a@V21XCytb8w{qnq-I>g9ArW*f~&AEcgA`TIepM(F632PP6Me7p6$rN2lcKES<tk
z!G|TeNz~cQUg;s_23i%HhoHDbjrSTxGE!rsGFn93{@9OQPpd6&m%4uYO*@CLnR%)g
zy<jPlQzZGD_6{jC<mEm96$mQPo4Cm0Yq|Av|IEq|C$w=r?B=VfHFqasBMu+uc+2p#
zcc;oU-tN59c&lr>Tyk2?ikItX@z(Glf$lW++sH2k&cy6O=+<OTO!!*3AM$(0&BF5e
z^HtyF;vPM%_P&n|d^H(*sHr(&^yzZ`yVll6>PLPB=2=(F%WdRYJ#PLnz*gLa@Pbw$
zCklwzg#_+`W`AGtE@ZZfJG9kmea=<WrpWB6vAt80c_2KHUzH{j9cX@$WIC#*<+Jin
zP0dh|vJ;wE;eYT$wPp&wAl(Ujq$69u>y2}WQS$^0Gy6F|6SMRR<lA-ebE;4l^i^u(
zlQt|~1V`UH*rW0hrN_z6aht|}g4^_D`_egMY)L<=(VBArLsgc<4QgbcrmOy<$)w?f
zju5wm9qi*r29@OUojIZPqed1JoA;*tE)pY0HMX+s;}GtlC;MgwTYavrs2}Q>Zk29L
zSB|Us*pCQ3u)S^RT~pQ`Z>(}X`PiGuj)B^zFmqY|+_?BF%_W+~ZFXtQM`ieO`}_7X
z;0>Nne{0^%ZMu_KOVK#x>OBH|XU)<+KVTet@6FV0;)wn&_>n5k?5Uf`%<5xym174w
zs^pxqOzkZ6)PW{OrMdVjTm-*>P4biLITYMu%HSPJt3zTb@P1Y{>>F8VJ>U^Z%R!uN
z(L@X%&NxEnme;Sy`6h)e3em^1q6&@m!cf-r-$?Ec*hVy%3^L;9Eag#M6#`E8+z?$1
zmdi@63zq5k1JtXYs>~GL5048~s(t~;aD|u0TStwTTc3D50j>C~!6z%r>H7Jkq2k&(
z-|w@J`=b%2$GIeTD&c{cGB}qf5vy`XL(r0-S3gssnJy`lPpM|hztL_y_dw{Ka@P|h
z_j`fYJkxYgCu4f<+ZG}$UaI%%G$^_22Hv-EeoEF%Jha$gSa#VF5&5~=$-b&~#VM0w
zH)w7l=NO`Sw?>1T!=1+BVaf0tdh;VRnRO|eEdG?K89l#O4wwFeJV5$TLm3n99)102
z*1DYmO^t*L0kkXQ_|x6jR{Cb$W(*wm0VAH1w^Uzu5m72spL6n|(!a^g?dy9eW28D}
zBygH#fwVjw)Md<weY<0oi7%}lp~@F)P^QF))|;ovGDzFEN?KE<@Q6DHW|d%5Y`Z1+
zKEzH*hE8&6Jrdq;k~BJ$7O=5pWjAT?N$I_!(Z$KN$Oe;ze^_S8g~7a8xk|Tirag8I
z^>@z76q+{Mss25SYBtTvzjrhl>$$wmsh;X-2#;9d4zKP)e3yZqrY&-Keyom0oWqI~
zVbQ#s`_Gq~FYi{%@TW6>u-|voZNY0R>WgI(BkvTA8pj2!KK|;$t&?$H+E`HL+G}wI
zPX9;o81lOipBsO)5l2{Eg8rAszKPNK*TQ!pYG4$`du<m2=2CxDx3L)<*x=b2vJ6)X
zd3q_kKU|va#Q4!v6;yJN{^&V#q^`l+ZPMVhcO9!P)!>uR1+X+5R*&zDvmJ<8|62GW
zSm=KAKK+%Br^Rm`UwSe74&Fpy7rC>gGPq-4PB+Hw7~u+3`(b9H0-PeN1zpXutSyEN
zU6dUs!|joMXj+l$N8_m%m4T=1HTvA`FK1n}sW7WN^+7lCcad3zv;F1e>~5tS$X`0H
zKUz&!Df|Pr&<uj0xg5%sNNs7K<Q_!|_4yj(+a|XVap<8Lm{ib7ITFcpwe7D3(yH4G
zA@n4DWoCx%y>fi0{VxjzXHDp1%u*%xXBa^Qv-ZbmE#(k)Hpz2~J-ZOx1Ngrl>d>;0
zsB?>~iIsD%F?KGi)=YnDZ2j8VRc&MJiH=o??`FgOkohdbQ<K$lIn0(!W!b4Omzg<^
zHqM{3JC&mE-yOcVIC3Fw7h)mlb^|*CXDfga(@8K2hz19)ODP?`^ac03d5ra%%vsvJ
z%)gPpc7GRgG=GrHj3l=kaofh$dHtN-zpAbEGcu^?2FgYTmj{rb<9c@?qIzg~^o(Cy
z*kcC&fS)*8YXu|FrBQ(_^y4(3Pa@y<_Z1ipr<hBl{25A}4GT!C0dsi<U*k_8;Ntk&
z8r!z)Y;Z$}NkYg>j1`eU&qpu7H^Z^bV_a1b2jD&|fUt)+*<hXtTc}$~d9(|;<(G`%
z!65wRCT-~s$=nm{y%TLHOU_?C%Isjq;PmoC!csAzNv%%{%+^J5{TqO~QSaY9iaund
zJ0(9kY@xU0Fr8Ui7jnYRXh8V1OG01CN&44>vN%wZf6dfm34_Ak%opZLK$Jf{yALaX
z-%vni=}J(Q+G)bxz6@OdSaSY2(rMYzwY9bB4d6(FDFsFQYGg&q3nT2CCZe9_exH@g
zPQW7+{amI|hkuO???Om=%niVn7PJXZB+wnoSq^}RnUK|V6g=Hro>Re<K^<+M#F2Ew
z^~Ao-5TK?3GO6rn+-Ox3<0LLXH~(7T)Qk3sE;IjXa)^<0NEEohWwqwzrzXUxx}iiL
z6GP1*T@RGo;psZfJGns<>3tn1>RkKa9t}xlI8%$$-{nx_HHId^PSu(%nN{T{pWf0r
z*jJ<y>{jDQjjITra8;2zg>v<(Ez(c}b(`LZS*jsy-iN*7=W01H({~}?o7_h*t=oO!
zfVyQReS|_|dvfaF{$vRhd>nI24^Sg#fyoUN8NfURRMU9|VrJQgVD0&9pJ-Wnm&Uo8
zY%uYQ&tUu0OA=}q5xz>3FF<#KmIulm8Lj=pOAaCnb3o2Ohh?4r25EWS7_<|@uX6gs
z%Rzg84CaULY6tYyGNj<4@P`FAYhGMIl6z+gHU<|$#XoFckc&<uP%a{Cy$L*BvoW*D
z&Qy!(G6Yx3!{GE|Q*UWM7-6Z@tMLuh+XJ&y!WF0g$PT9yTo}<0aa{o{f2811hRlFK
zm%~!ARzFs-D-MjTcxm1n$)t};3R9?PYPJ-R&{x1egx%+5Md_Beq3=S{w>S<yh*-1F
zZ#1bIW;c@R(#u^3e-U%(NR?-9$e8i3<(ZWx(0K}iqUm0qJw(=dJ2h}qgW@JyP;<cS
zV$Dlx4W-uEx!Nh0rr>*gDOlS=^KsH8m5XjyfC7)#j&<?93&)1eCGV-X<p`G%jJIGU
zA+F%@n+3v+g<!0+O1C7uBm=a5bGg~@FC07@N*Ox<ME4hPiW?R?X+W4aB_$PQ(Fi!a
zmC1kdtswI@fN)@JXE<ARGejQJsS{Dc@P2AN91mwpCK+VeTY)2U2T=OuPT#y|eMgee
z*8X7gR9A$y+>K8|?NM|l37YJy@n}9QYQN4c)bVQ6g&9-HFzn?IggPX8X2_eBsp9Ee
z>DF-3Exp{6vND;a`E;eWsWPcS?}o#HI#W9-g&XSiHzgL9YF(2xv$QR2o@rkXLLB@x
zRc<eyu`>v(cPheJVw;y6>|%qjfooOHh9!R4x)jn+lT@*+c*V$y(XIY2g+7M5#VG=D
zyEu8SU;urLN?2QFo#lSOLZckgVSI#J#et4Qv@u(lp*JMYF&T56BrN4ELT0s1&6WuK
z+-X9~SX!5NumPdPFMg{A6<GbLw}O4T(6f`iordp5(}sP;rtt77!ag8@y4X(78yP6k
z0U{`5e)hF)I<w*BJ9^Kr;cMIda#m^WA<$B!y9<y*0p~lAI#Q+B5n9qf(^2l-$*q%Y
z_Zq||6u3G-^+tP4fh3kCApXHGV;3UUvj;s`qR<LpN>@)3*%ui`>-QN)rn_V)7_r4~
zCy>wO2ez@Vuvp55@Jf3DG<n1H68RRMcvFt==}Uy&=WlJ>(%ll7-``(;SuQ)7TkdGt
zAV|+FBR$DN8G6lyOxPLJjyJd^S!@vNoC-5PI)yY>eXALDy&RcS?we&@n|HehCT5eT
zn)|t+TW&MhJrS#7bNOYZRMvlC{$T%V0P3u0dJAqIM2{W_`aRL&i1y=mu=NavKkbRp
zp2rijOgYVOE9bQ|G~dPx@L_i$0@3TBqqqFc0%LWm|DO#0n`}+Am*Gu(@aFz^)*2Qy
zmew{WGCH1we6PX>H#mD4fS?DQf?4l|t*e3a{0qZsc!T7?Gy{^BviG6qN_1w{*&6f_
zrZ3p9Cy}#`vxbB@2N0XX2z^^r0iLp=&=V)k4Px!ohB`(+4fpRtL;yaR_(xK|QKp1n
z)3~WDRY=EyDbjOxS`UM0xVw6`dxmfmI?wOkO>cX(c$hx8Si|x_*-hzo8ROa>*CFFt
zNBO85Oz}J)WsguEzu8F3sm<;7ZWGJay5K5IQ;5-DqfP%V=8M30N>U8>^}{SfG!fR`
z{RO)w#<J&*D+zF3z=O#U)GZ*Kp}d66CioXZh5V3S{Y+U-R?prQQdngLJBHJY=0lw$
zqAV)d=Kxt+hIgGejz?IRW-Di2Q@JXaXp*4Qj>Gg{M^E@|rjZx^1QQ}gZ1E8gsrK{*
z`~2T}zMLz*u@K13`o#qO|3LpA>||X3j1mY{3qNLAV2ElMFhFCUBGYa4xX*7OW!B|E
z5TcaxvP+HLNl^uT@xG5799kL*Ajot2)Di+if?7QQf?fCNutZZKmN^=1Mhkg;4@2H-
z_AubXie_I15@Kc+_W%J@zqMlhzLGtRlMS`2p;h0TJj#5mqr;i)Y!!0b7)O424Qz8-
z3InkVd2urvuqA|IcZ|m7!OTtq1lM;VXGS4$b{44rBf($~3I8+Zpfx~PS_)4mEoZRL
zvUE|Vz{90ZzZ9~RDQ_jpj{!?Xoh}Eexe_>`MZjUW9WJbGdYFn6LU!Y!0*fm#ZgsUt
z=um+{8;P7`dc1Ki1T9Bk*=HIo6e~MVy@zn#fEE<Q8$VdU*EO)M!`xphoJ+V7IG{4d
z{_M;pK4)FQY?$M~GXePlY&$GH0BB{jQ{a7ibK%T4obo~@nlYPVE{n2aZ1-VB(E~jP
z>1NpME^B%?#cw#()Ua}pbq+o6$Ah|GJ6g>?4UEybyAY@8tMp@_FJeBbi>Um*(^DE%
zxuO*gWuLVA19o04<`6(XPQ&zS6LxOoqK9@NKkxX_-Sk-ST}TRiXH^eSEsl$3ID-im
zR|u))1UzKGJYP?A8CQ)2+$^^n?>bK<i>=0dXowamE(QF07X|3|p5lPXVYEI&wdPJ}
z1=73MT!_vkP|1=ABexE#luaL7r;hg4$HpoM{F}=CmOo~Tt<|05dwv#uIe1RX)5hkw
z9X>fHx;bnj_fyWj%p=Rj<`ap^8rI#)lAlS|EuS5LTiGNn?&H%e5Fem`4UYH3Y>sHj
zcyI`&(Mz#7-YaZ>aA!0aEAn<hX<)U9x90$1U_%xhIf36)KLMosAOY5+3#cuH_Xo4B
zD^M^_Cs&PqG2dbeeH3+`UXuNC5eeRHsrrT(+A3>^7?C8dg*P_rLeAkE-0fx~yN$dj
za1Bv*Fwa*l(~%AKL)YIlPepz@w|Kf!eih{STA4L`s-L*xJt}{~Rx<UQ%_~DXhAr8`
zwVS1kaX^E|7LNwAt3sCi^!pQul>$SKMb~NS?@qlxk%2E)?=(C*X=fO@c^%vxpx&zf
zPQ%hM#4*>a1N;rHj-1H~$tHX&OMje%Mxj5#@NS(;I_{L^wRQS7Q*Q<yhDnFZp(Ge^
zYIjQSJJSHVE$Dq7Dg#k0b6|GKGKuCOhHM#xDYULap(h(Ek^Qu>zPjxF-%gq+y@`G~
zrqoc|`XPN0)}FV$sYEb0SN}ws!}MO>g<Ka}T}Tbpt|u&EVRUD$&d$0tY57=$cQ{F&
zD{$6uaNy?~m-K_zte$u(#+Pa7i|#ShFVg4(A(ms+b|yJRq|C!|XjYAe%|v>No_<cS
z<;PjKH;$Rr>V0Cl)vDzO2aRo<a+))pI>YSdk2Wqen;QXfEEIZ<%wy;fkKspafZ?4?
z&ta4d9Wy{yK-N*<y>Jnf0H*@(zkX+fZC#0i12v5HivWh=m(owuduS?9`mPG44!F{A
zVylaX8CC-hDT|U_e*0HX1U=TvFg;6wrFab!eUF0^C6jGW=TGgC1e^y!O|TtX*tTNu
zgVPEY0vF;+2M-Tn=$96LQ3*=HaG{=~?BJ<L-cBN6soZ_WMA;j495~OAAB2-iR5~W`
z*6#gwWxX}FR5QOb6I&s0d2?Mro|ZkR2OJF?fT!e}D;M?`VA+x~)U2+{$o{uzGr*5J
zB3}rafMK>|n}!Z(ZBuq3lBW7g8!+TeYEY(JR-%$J{lin;nliuO7;Zs+j4G}wL<aqw
zbsh6=5YwynCr9KeV^+bTseD>NF}ZuEM&R5%yP-RGo>t_7j^Cb`;p9*o+#oBKdmSfT
zk&_!PUGO!r=!Se_(~~<-7EjonHuy&;g1HM}Y&_I8m563wxB`=`Ft!5SDwC_r76WwD
z1_&}h9amt33Kv8=4}!JGAs7!z0EoCx^--hftuHgQsM;M;vjcvqi;|O32!aq&Vqlx(
znzgo5-+`F-^s$xs_N=?bD7#vyGvK9FsN!1Q5o8wCb5#4)`FqS>_EuY7kXo|w{5}Wm
z6GM%@Mm$ScKqHV{i0jAxF+#x64Q{s)(+F3Fie(w2P5?;^s3p)@tCqRxmtG=E&#PI&
z3IQ9vWO36tirM+IbSBPl^vhPFmr@4xv>)H<$$YPhwt_)!Ar*UIb;q)`HppDMV-~;e
z&K2{6supU|s!DP{u6x;f_F=VgCb#cpS=Zb}|8TC9wJO|h3icsTrrQ}V5-$okA8R%L
z3N!TiT`J(jeC&Ey_PvyX%G!#B4V31v7o4A$T49;(i>F{M+iSwCTPP%bte+F`c89N%
zhT})TUR5WUo>s*%B$lH497y{8(go;3#xQdcu%=}^j-uz&vilvx?4s2YmYU<^V(b(f
zz1@hdh|{_|;S1pNiQALCGeB%uW_^2~DENR5Q?fW1wu+rUa9rcQO5UHiJg?SDA*lYx
zk6y>KHxKPa{8Hu$<g?sRW}Je0cnCQeCT%Xyon<bbB=+cW-?w$lJO<6hUQP$skOag!
z(Z|^0MqB|b=p?R;g9!m!np13kI7mr}p^Bw~Js7emH*1K1iQ2vBNuUYEnPB!KJNsgo
z?c0Pw>_Rm&0%)Rd48lYzCqS&C(w7g<2u%<60;^J>d)WsjBJq#msinm09ql=8lRD0z
zV!aB_LK)1I9GVR1pmSffbLD%U;^XYu!9Tz6JjPEo=+~}T7JX9x<@zP0;NF+z7>oD6
zs*x5w!Ig$;l2hSYYLQbnoCGW099ot#Ui~rHI;wY1KSrxtS&gI@&<56hL&qNNe0^Sx
z@X}+8cQP5%o1wl#lP<)lP(5lsTfFY_q@kVu<@QHE`W+{6>+q#239rSuMbfC$Xyqq-
z^TNl45ry%;868cfjc790jIJ^qAI8-Kht~uXN=_)*&%h3n<Kes&Kt6RBB8BNSV_Q)i
z1d%vqJkY-s{tzw_ZfaY>R^fDURZtfB=Q@B4hJ0T^SVJ@=d@r(uWz*D+b<MYn)zYfD
z_7gVW&U_09v&`AYnt%Wdm}a$8yL&QfR(Q@bU|E=@94y`C)K)RPul7(geJuHa9EgKU
zXYDiL%eWC3v`XRV_xC8~vDVv7ph+m5N{8SV!QA_-TYmy9KJ(+CVKSyK_xTEOaQJLd
zWD1i5j)Z4xaw)@^E5Kl(P(pOY*As-LDonE_;XdaL!+{nKR-Lyg*~xMUMa{3m8Yo*T
zN2PT=%*UwT>);g1sy<lGb=$&~_q21qfr7s-F3&0Ph6#$8qz1oXr0F|3Kk0hktRlGA
z+TsSkYWG&KhP_FsitPL2FQ4ST32@VJ$y6wMQBiL6&vRDnAD9U?1Z69LA*Najy^}i{
z*bCoj>oJ<J`j>rYU_E|xTkP+@t2veW%-6e+#W=s3r@z6lZyy*?6{TZH&TvFFXL))D
zysHcHUqmpjjGU$6hdri9!!9A(He<B$k)e;#75p?>mhAyF(BZ=|<_(n>*Aa0Payo1u
zJDAR*x+mX!BHM^xKhG80kPgnM`_{gbmsF?oBdcab+6Z34{r@O%cUB(?*04NbbGg9D
zy3!;f%O=9TyVAZQ;FK-$!|`0NCznE;vW>LvEQ6@Q8tH4LS5b{ywHC@{g;@06C7~<O
z{%&GUW&tJysoW>6#6urVN!lD)4|HyQRp|W2Bs(=q)-diachzGW*V=IIm&>%u1?=Un
zH&cf4&#O9L3rC!WKfj|riGM;E#Qa;8#_C_A%BM?y`0xQZDTP@^$M$U`7QopO5zKDD
z15UwVu8yV0fQg1fI#vucAH~aW(V}#4xiwZecMOB)9|%QUNuflqlNsl!!mHj$esDb}
zOmpOzAd-IoToSIKt*~nFT3VFKr=BOd)f)D9bC-?5P$JAJw_-!X#(w?1UE4xv+mCmd
z0sDtU(=(6c1{|zBICR`CGeGM46YHZ-n!S{_eiC5pv*29%&Z&$_W@z9!o#uyqFt^ae
zg6$zhIAC81v{ITvDT)p@<7?OGm1&?F-ACO9w*iZbzMrhCO}&b7L!(yA3_4g-2uo9a
zOj%Gow>)U%dx-O(tgOJLNlOMw(>~ERWdGxM{Aly7AL5RH&&8XvB0f$ho;-=9xMh}W
zMpa9@+VsFyS`Ev$Qqu0YXXJj#yE%V>`nmOU3BSqkWU#^1vVHvxx!lsD?#nRHgbD{O
z>Vr+l;f|O?8B+M!5*WjeEs@O?OrbQ~BQEYj9Epv(_|RzDEL%L^c53~_v_VFvu(w@7
ztX2&bgz0R1nx}4k%A>yjvH0~=mBX-4H1pmxVX58^zdTIfL)kE*6V^7z4j{{E+|u9(
zhAqV?g%H%O+-uGc$g)D%C{AjRG|C9Dpr=riE2HVoZy75j>{=ygMDzv;sif?OU3Ltk
zVWq<Wmf|BgHqZe}V$1rM&ELbc5ZK}^!!Wvy-$F|ttTp^CCV?!DxrN!!KFfL1V~QTb
zh_mHc`Y0m~$hW#t2v6=&?f?eC!u&<Rhk6r;rEX+mTlnMIb`dlm{XVkPCQb$zyENTp
zOHI8-Pw8(E--XyGSjP{2)u^#A4v8rDD$*p@2{_l;R7ifGs&>-VPox~BowC;%a(r?p
z*W1ouRAZ{fra}``j_K-C;0Z;Qg1IYaYtG#+^2${cP*H!ca|&M&T)wKz5<riFuFGl=
z3#h{SvGXZ?1+x#iFk}u_@HB`ED)+^-XPVm4o<$<&;Y)dNy0dr2)+>fgR;vzXi1gQV
zRI#lmgO@X#Vd_Zrab{V0ylKHt=@<y<$XF12`Z&$-&r|{k^#3V&iV?PE{OC^ae<#p3
z2Fau`YV~0FT=uXp3><Jb?gG=xVQm`3gj;BSk3w6q=`i{w+c;ZydRsS3hC7RCDXzj&
z)Nf&y%1Bua+o{To_TdfwkoJ=uqhAxMs*@teBzy<!?CR#fdr9L_aX=@_lkEbs9mIfD
z5D1~$kQdM+)|>)DFvy8RAsD0Jg2d;!0%_qjAzbNM0$7tFZ7?J7lWY~?Y>-Rf*xJL`
zhdv2#me$?A&U$u@M^4cjpHmTYt--CaW?JWF+KO$S-WYR>s4@3A`=O#}S;bzgTE_U@
z8~ZDNrRRSAV%+)uZf;??`Z>v2?ZXGH3L?HFy0{x!zmItO#O7B~rLox;jgwoqb}(E)
z&{L?c(wF~KZ2nVK!r5`MRyTIo4>{#MN6=G#t<c0jz&Pm83jM$!x^DvjQI0t+YJRHu
zVYU<S7*ufZVd#kxeM<W{SV9<VewhYZowx{}6iUiEg>jn7`@MzE->)a$mS2T*c<ZY=
zW9W1JJ1MlMgm?1e*&WVhN1&gc!R!Q_>HJI(VTgYToF4X-;8bs|dpmR5xiZKC(#BHt
zniVSuP3)1*u<HWPxrXUAiF_H_3RU2Kz(9I5k!lwidwqi1pL{~vx-xk4)9WMW@x?c{
zE!&BP>w+vtP*5dLmI=rWH2#y7Gn@%@7cw^?=641Kw%*E;MUTQ;v1}>$Tab1Wy&l0j
zj)E{6TfkslkAco!+J$6cmAFzUO@@5~<@5%iYQ&bnoo{AX7J@mTB7i$Pjq5snzX2KR
zN7RYx^tO&%$Z|eEV*oVzZd~e4hgM@KYoMRi+p!K@+XLb(kN-5}*U$@||DsJiOcLj{
z`2mza;7^aBj3stSvHZR(9b(9=|EJR4bW$z_%zM<@*MHHC-sbH>=E20#VIJnw$^8UW
zGFq6yKpcIf?<+!4_p7irGJlX{J!|+paziADKvFt9y;-cgy$S~Zu`(cD#eL%WhaPzy
z{ei${)P2@l2;d4rS^H2?Kma|O64(n{h-IDwh8d>^b|LK>Z1GXHA)Q9?Ti2ooQf0DD
z&lNz~k|WHE4BZYA2tp~-?ED7gn!oiZOtfM|Lq`3puLz&MKy1}<qxbs<M46s}wdXoo
zKrjd4^gDs<lCi!GmLYcEE@XuSkS$>UvJZm(jp^0JEFZx#bH_;Fyt==Iu5w@6UNfYI
zD~1px3AgkfD)BIqhWov7sW7@DSR26(5JcJF;+YOmWXeE?Lo!+d0dIjN2adB<ssV!C
ztf@*c4nbNT^4a@s6|GEPPB&Opls`A4J$bh9^0aoVj@IS|SN0dkD1{mQVU#9j|6!Dl
z-#-I$cJwRogXPd1-Xgo?BLW$PTtRQf%;Z2v>vVgn8o)!3W$s~A5QI@iRh1P5xGeUK
z%2h&3eqvO&w{<i4P%Q+-FWD;)bjORT$i^jlU_Va$3h7Eetj7Z-2-MVgc!rz=`xv$!
zpxDi+Y+Vy0142wgB94~4B3W{@o4DmL_hHO=GHm*&>zh|op*rDfU!mdAp}YfkFI4!M
z2CA>rOt~yCp2)rO;v-rw&^fOn*tNa6&K;UwZhyBfaiA~f$?eFg9N!u78oSmCngGkB
zsCoW<QIT1_#B#M9Ud`!v@{z>axPbqFd<TAmG~4XL|Ilo${zJ1hE&ta}$*y^c%fnEx
z<{<Iv7aAm9UF3qqEB;(ezc~s-o6Eoc;fp=~!xw|9^U%+OQ%Rx>J&l!tFTDV<%8j%r
z+!_>$TwG<A6Qdd&xT53~f+(t35A5;yO)xWXLV3r6kmP>9L98IK3f5FW?R+s=F-d4a
zA40;0(4sl(g<Bxlip;UI&63;RJm2p}x5JSf`gXFjBDKKDz@K4=iC<J^$fRQ^vVbEM
zo<b1YI@kkan@{)fWItR~1dCxWz0i{dfr&pFyUcE7QJ-&`Sy4WcTLza?4TT**_qz6p
z0#BXcN&smTy$?7*1<al0fokSEPE({XjF|zvThg)WGr_I<RIMs$)AQ0;=8MRL;aMTD
ze)a?uqqL%ckkQZ;MOPatk!qd_3lopC59;ElV0nYJVqY#=g;y?ouko**<L>-RpJ`pP
zRFZ6xQd2@YNIV<~cE4TYq(scp^M5u`tv(cD+BDE}-&4&=*QM5YqNt`d1!pAaWQS?F
zcIRg~a_CA9PHs-_2hq;j0UCp9`I;G7BpITj!oY$t4xk27J&kD&8Stj|L2?SFbvbD*
z%3<EYb|y0J8Z;HtvYccZ%;2G-7LQNokq5aNL~~=H(4B!v?GYd&%w?s|50%IO{!1MA
zS)e3RK77xoD}XYkt@K7I;QbAr>DN3wh$CrvGQv~1CP)_33wTJETU>T$sP$r!W7^kz
z7mqM7{Tr$DZ!vk#>8r!sV?YAKh?xKubg~a3{Dh-?oE#*-ZG(_HTAVG!Fd$7AXWm=X
zAR4sPe&4}WfE2p_b#{SlMARQd-D>m?8&UZW8&SzkN~5n)$;0dM*B2{aqjDu~XKLP@
zE>9o)@T6*~#x+^PR^lO!SHaTT4MQ*X7P2qi?^tU$UaXnRnJ1R!`@bML_rSC&zWJ+9
zIe&GnF*)&6`?{9K@QK8V7O9M{Puas<fj2BowlgOab5mGtNpEm0jk_0Ql)*%RRjN`=
zIZ#N!^g#tt`{+`Hgs2S(pA{u3q5&*XiN6I;uYUMB;ZkJJ?B=J#t;eIOSytZDtuRq!
zq5D)6RCEOs<DS7b0laD1`w-DyxHLjAGz)f6Ti2jw6X?=s$eiPp7>tS!{qXLTf3~&Y
z$rVTO0^U@k>CjR<)*)V=Eb1GS6<k9$`;dJHqf{7M{hKd?rO$Rvp+X1rz_^Op46f$n
zwQ4duSO*g2gz*Q<=^Onp$TvR;17RLp;u6FB`OA<+bw<j7p9GS}g{WtU6Igi#(0a{<
zC*<8;x6z0Du?H^Al9H<%mBmsAT#ITg^=l4v78HBlbxJ~63V#7i%|$}8jMUu*{R2}6
zZKc+Hlx_*%@D1rvO|L07ad~;pw20lR&%drQ_`c%1<B(~Xo<v4w-IPt_D<H`x@%G}p
zYnyI<b9Pgy>m*Alo&A~~xYh~klhHWaTeqAB1vKwv(+GkNrKq#<X8X|n;4+V@r07&b
zuw1Zkk<Lpmyu`3zmezVA^#_&I>D8G(u7x6Oxugt=Zz09(oP4K)5|HmP*&!^!yb5Y)
z@_dNbdjtpw*fxas$1FajAP!5yn!?}uC=dBcOhsnxrOpm?j6GyqD^*8RQguyzsQ2(Q
z6)lX7esXG?oKlAHSW0B)o%>PHH0_qmqV86TdaP!onn)o3R+?K>T&0|8PQK11y_SSV
zhEiW{-jV$25b6~~t)VKFntNrc+Dq55`LauQzKV6kYwKaLb+t7YW#i#F*PQ`%-;_3v
zS{cCvgduNaFZi3nmRRzj?rOL{QU!|@cr2_swY3vEk6Q}Of7o*fSvSBv#c)VRDN?gV
z0dwZ82czwAD1Fj$+EkCC2<znv0B<Pag2-$;dQN)={?oOE0h8oHk+|)(8W!^7s;~CU
zpUx;nzCZZ&HT~0Af4Im9OPxEW)^9Fau+YErHH!uVF8NR1GQf(WY!m!Cr|J!HBA&KX
z2f;bOl4T#=)ylX|430KYa;kGPpRPBv9x6Kap&;=a%jQ=@`T%9MPprnN=*X+wpz&ds
zVlO<WQC`+YQrTJ7K_aodB0#{zdg^nOny0;U<{NeAMD_QXD<){--KWt`m4<=_D=g!2
ztSJXfvuk#&El|o}KsHZc62{qaOBgYrohv9q%T6ufoA`lWAS<%<r$K5I=0VB9W=0X|
zlF8M0vLz$>K_pZN+1a~uvobFH<qG<6)9a}&7gABX$-TnhJC#7pq{jrxYSU=IFWKCT
zk=KrBKsqopfp+>6(xgrNFbHV!0uf89?G!s3u2AP{c>@AQZ|)I7ZVWj@-B+CMDUL&w
z0h8w@3mqo4KVXBaR_?CI6>nUiQrvDkRl5~MCV7Q5)EXKOwhH2FY>wT@GM>DP|6D~g
ziL4Z|)r?rKs?>}M`mi%u`?Awwvij)Mx0(8)+%HbK)V182h;y!oPyNlc!7Ne7ew$&F
z$V`N}z%FFEmC=*Vfc^oIVWKD-P8nAU(W5073lyFOj?<t?q^%!02$kf*h7~d|9I-)a
zS^Bb_R*#<#3qMK>Wjo(lvSKL2oc1MSuItut3iQIIr>?yJZ;ZWrJXCA{KCYuqn&ec(
z)b1d}R7g3@Jk^$vCZ`lJyV}K=m@Q<=G^?B$nqA0_m?R`L$zT&E#$lIA&KWTWot&Dr
z!dTlJ`rV#?KhN*=`u>q*re)1q_qy-veZ8;ix;=I(!E+W9{!{JMq^g?)G)S;dr_v8B
zswv7JoR>n>m;g}Q6mahZf+czxec>zs{QtlanyM*mz}&||O@vcR<SvYXk>$!vdq*&@
z5SB084JHr6SfP#JrpzCL9tt=>ausnfN*1Ops490QK8sbYOqcskbQJGZT<zihGkX!`
zo!$Ov{z}1lu=?M<SIjqTdn&xa(+51p)5cB82aPtWcZ~g@$GKAxfsk%Au4TMq>kCgH
z(y0y!PCZD4h3CLBXTa!bT-G&NZ^xm7-xza<w(o0;ZSMC%xgDASX^t6fgRSoy=~pH4
zed@3&3TNY#NKl6>@8-%~&~Vua1gp>)k_;q=;<2ZiunKL8s59QGoG6)V<*cDEM-La$
zVp~b@F)X^{R@TFDthUM$Z4|n9sSHs4)$PP7*1CfaXuOuodNo~&jUvbZZTEUlHz}0|
zPAW&u%_Qmy?O0hk!boxYV9iF2c?vDxgZjg#gymJkoQ2-|vY`JLNiRP}_9w@~XlD0^
zq_dP4g_9G#4=Q#S^>EqND@{rq_fuYfDx{@Q2YU-#7*haC(CYS(RnwSN)6K7+P#krV
zifw(lAM?$2qz4)zX0i%!jc(J;5ccH1B9bPf6<eV(QXzL%1oQ)2*V<7>@baH@T3t^B
zPdA9P2ydh6ipm<X$djv9Qi_JjBSz^jg=+PIMScN6{Q;YM*{#{E1gn1|k|Ls~d;+>`
ziAJhkfN!&!R@g6*2yd~U<0jJ{DJ&2|J7=&|L`|CI#R+RTZNy|FJgjU7W~{^X-OBrP
z8;P6N%f{Ucqf&u|rO9G|*c(m1Wq+wD6e?WcoDQ*E!@CUkrPrgoSz7l$tNs5i?EAA5
zH2WIYya%uHo?}vfN6wJe)?b&9(k|1CB~{X<P8~<UjqF0%@_<e;q&qm|_pv>Elt~)2
zX(kzjHg2R*JOUfJ-Stk$5zI1j)|@~uDdb$%n=We0ull*9vzTi&6|{I~dT2_UXzzDg
z`&AM)`@)B#Z8jMbqdPrQ{j#+M#m*-m_nO#_@CB|%<)kO)!@g@Y=vl0`Xg^nI9MpVE
zm;0%kNx%@!u>?4=@gz}xjqx!G%N<CX#Ux}Kr{H&}`T}$+5$l0C#o2N+$_4SBQ)m-p
zRmu{32Qf!G7{BR#B!oIvQS}uoluz(liX-r@Ycu<FM|Q)?k=m;dK8~~&{4=dgmUN;P
zyqosXPv~1<>7*lE+|1*ur_iBCjQ6U|NPB|nfFb~aCd<neCS3sFuth!zWmM~sReafX
zR4%z5Ae4N>8qo=o&V;%e#nyR3f_3K#vTyk;FR8e1iW6Z@m|=s$O`-KwhF2EXpUNYk
z9q`3=Uj*sCWVO1}5cCFx>liw;A>a2_S@l^*-_zZI??AN+c1ogq=YQ)7)|(Cz5R$D8
zecZYO8|re7S6AP?>d(868A_KCm3ffpE|B~O0#3g_O+wHfPgnf|$QBQt5$#1AX+z_V
zrYCeq^fvTxp&W^cz$A<;LY&$#8@B2CVq)sC%Zv4H(wFHLRWo$q<C)hV)0jSbzTay`
ziCPmv_sm_a-EVfUwwj+cb@$rB@+z3xS*E*W(awQVxA&<Pw<?Ph;*WL}9xA(kU%d8;
zh#mV^hjr6$6?a}LJ;FmDJ-b!#sbsb+z``QaHm~2EZyFr=b)|QS?|l#V`?Wrnq*B5X
zo~Q)NB>;CIVX#N!(?%N96Seap45JHgkJRwfu<g(q9;kjc(a-gR$AGw!z6gDb%&)A%
zEkR3VR;c0ms}F;-ZJPY4|H+;oXg7h1gReDa{*iq=Ke&eBG?fNz!B%v({?)jj0py)W
zdC3y&%F_X~Qdv66c^o`Xh_R18ixBwP%gb>JEsAV5`t92G_UwSHhhvS#s&^NUvQPCy
ze294Q!*S&HzWhg>7fJ%4fZ2iedpk({e-%Fz6~uA2XOqOFwtv2z&~H+g*R7NF!Zz}k
zQQOP0e`M%q_%Yq4;ailncYA50dlbCW^Y4C1?IlMWdAqpSKQFEi)QjBKO&+$&e&&-v
zCo3NlF0uyex|@NPB3bzmzns3bq7V9{7mXB4gf|$vg5c9xudu(>BArQAJE;Wv8qznZ
zvD7JR4&{V%zG%s?=t?nXXj+z_e5Tq-Cj;QB#CgQFXin^PgFD-Oi}_T&;Az0P&P_Rq
zh0DJMY(?*hH47Dod+R~fE-L2Z;<0NO`U)Hu(Y*%?o%^h$Njxnn-Rv<6bW@?F<);*Z
z5DumG&WB4olL)XoBv|jq7O+}D4VyJ71!|T+beY@={VEpT2vHfMm9im}DI#bpe(PoY
zqIx5&<V+oSL}6>xZ-jefhRS@pNeS)H93RU~1PjWN76M%LInh`3s!!n*wmeNNtsL3c
z9{*Df+<J6paVp>d<sy8nT%W<$Yut%mg-Ie}dm6FD9cpq`9XLZXI1SrOX-b4Fl?J>D
z4fJ~s#`zilVB5Tk<puKnv@h}&&5uEM^&J{nrKqjcvA=3;*5O<a|EHQdn$sMhtBh9!
zBH*%Ls6w^CQa*}y$gL-U0%oz|Fls3x3I=5NP$8I3=zHKSKH&pa=B-SvcMz9{H-on~
z*+aonjbQ!wTWNAUMQbe27OP#QLGgt<Q{C=XQO+{vp9e%!Cr5#fX&6nGA5}yEPElQ<
z0ZuUtZ9jtGCg<cPjCVC`Oud5~V)W?m$Q7-I27jux0}aou8Mkvn-vvD4E;tBLC32Dg
ze|`M`|4P87o_v^##du5x*VWOo?KIT<Tyq!^qW+P$O~xCIIrbOtXY_o&Z@P28XT;Ts
zJ0qKFYIZd|%yKTVa9UoI`ao!#dH;gW>rX}h;?sa+4cK&=jyCF`oiaL%OX8hleMpd=
z0$N}kg|v*W9`<nIWxTyVxsJrN!rfp41RnIFL#jsvn(n1lJ)vAF##uzY5n$P4&atB4
z)&&zKZ=Lx1^txYd9b~1Ppj=}X6bub@h~kVEA=Gx_pq}hVH$#uc9vuKZL7SFq!e;8w
zz^HKx3rk~+%I+!i8#gfC0j2hiWD#it)O0|FL8Dq=G`y9h6X6!_*DY0Tr@4KGl?hM!
z^`?VO$Y-Ozk~Z1{e6!3AtwHo;MC4y()=#zHuozG$;}j5rZ9lkBGPe+d%KcPY=@QIf
zpe$3FO%UtafSGc_-%_Q?evT1rMu+6#lL1@g2Voj4l-Xn9KYyxGzg@?)L4cuaItr2f
z^~5|r$w<{ns?clZEg2(lrITR@r_E?ToH*5kj7oSZGr~2DH@qrN6H!3)@2R(j<$zs|
z2~&V{6K4*MeGGLA7yIuJdZD+)=J@WPYQC`^^}T5g8Fy>cu%+AuYH$ezpJe}wQ*HXG
z_Tm|^6x~yeSh<4_2Uv4Jr-(b{FEGlh?5R=5|5AeoeyV+vL1Y{-v7u&PRplIXM=RrB
z9}yLJkX6J$8Tc=iMc@n?p>YBD>ynQ&Q;oi8p^REkc%0%bC_KP#T99~TN6^91GKy4#
zSlMq_*o~kJ;-F?azY*oT83Lol=&<|*K-u?!`<XNt+r(I@EXL!2lsuqR#7=`Mh^^E#
zya`;KQhxvl4gr;388Vfyu<k8ZjX;%O(o1NNXBPm%!>e0l)menKAywv<9s^aBja4BY
z{RRg-J7-9PcB)lQ`E)v(C3hTDfzz#6nrDNSz_B8ziMVboiU7MwSs1Y&VU=Pr!k$_=
z%2_V_*1rd8^4!ulRe!J~7?5Mdx26<sQP-c2+lGA4DWtg=nmKocUb1Gsr7al%5`Nad
z`#C`Lv7nS{p(-P@6xz=LV*ek4kj>yEjG?BZDpOjgEE*X|f<mAVF`YijY!z;Tq_0>>
zEr6PA8twhab)1wc;SFdLA<(r0qss?R#j<0)p;QF{8-vHA9bVPkE~%BFv%Pl_dU4+0
zH{_QN@Ap+t6D4|+9C@@NtQuC2W7seTc+YhU;dU_pV|f>ew;0>I8g*1Jc^eyzXwxk?
z`Ce5Qdo@JXSk2eR-m>=tzrzr;4uL5Vm#lDti;u#p@-m@fKSHkTvNm^db=W0T-leG#
zh?K7cEaM;S^P1NDRSJ?am$8O^-jx8<N7~%K8u!o-z?@cEy#uoPWM%iHHC7k(#U=R$
zCU1p{D#8OFl22H-Ck(i*sNVfomqEBqSB5+G%EMBf4)24W-dhVDS4jT&`$Lmk^y}3d
zZ#JFlrCvyG5>pDN{+J0|?3%YWW5B`2*ygxv_S(yQ$sc!eV(YKwX>DXS*ZN8^DJ@;p
z!GY3^ItsvDHvOi+ig8%g2W$l{wjO91)}jN=wm;P-Z-Y;%Svf#AlMHJeL$Ha~rF#$-
zkdo}@_HiA~^-mw`4T*c-x;y4ae^C90g7nZC4?@0+qFq@Jp8d`gs0wN&49X>Q%~*l@
zOYBm6clhv2ELLSOa$0dz$yPWbU;l%I-bB8(kSYg~r4}J&iaqi}s!o8}XIUEK1TbWE
zs@~MqB$ZW};WUHyxE7!83=Fxf`l>Oy`!wU~-BHU8;i8=bR6VzD?>?9H#rwVD9=~WS
zyI9z<(|P}E^P=q9PhbB&bJE77{p=qepIiGL-8Bm6>i%51u{rF{<EmI@+UWKtWa|>}
zRM{&Iz&6diAy?H(2uQ~QHgwk;iYZjV=&1iNJ8@>eM$zr!DxpFvQ8s&}x@VV8TI0Q<
zp<i0bMPz;Y6~ReXv7dl^%{%vQE}25nn@uMrla>NZDC7pl@W`Y=-e^r5Rxl?jZlP+Z
z-sKSxUQ3$YOGpDc&oiWN?pL^T&=&y5_Q8t<g1Pp9ok}i#-v@FXKC_5U>8p2g;2JHd
zYv|>0{cYPEoZM>izMPj+0(eG&w6&W%KmT>%>7!==1&PH|WZS*0zOg!bN)GidXovrH
zH^pSzos;(8HD4pOM@~HZOGPZJKe?m+qiM7^j%tm+N}M|G@g-yE#pRIdp}&2md%$9{
zH|H%<Gxdb<^=>Jw!gY+2rjkB@m9TVPm-DGowX#8ON^_O{16E{lO`781Rk3s}>NMKk
z#7U0w_&~E1Lus5U1NI7qx3XH&!yPgsG8<O_&e~!5<hXucl@i3vD$H!s?9HuXf`yo>
z&gE#o#_Bc(78OQk4JTTUi}UR|GCd1?bdKa+lRCY5)_cL;>3K?c>10DDfe{wi=ahA~
zGM&uFB$cdav+pkT&D_|VpXifuUOl;7pB8!Yve%exV}HS?IM!I{Xy;2MRu%*A1U+R8
zmQF^a3L2}5DK^>dFR7_UcBsy1sc^ETcAvl<Wx^e&VCgOHu&!V2gNEKqR=9QKtv=DK
zDH<nThu4QVx$>k6O>qYx0ccMA5xqY93ldI}&%ExvkfM+15Ge8CG^p8M;#2J(>xrxe
zE<GS|=*_<WihJhmBb0Kq=3e@{Xp+O{yTsS5WuRjSKvjs@uMO}33lIqfs(py+UR3yK
z!x6rFx_F?4jXEm~OHlS`aU$Lly#lQW0Nix}=(97w)f?y~OxkbB??8N-ggVxpE61)}
zxk6H(mBt0E<srp4>EziR9J#%!3sLy?;w@;FvJ~_};kbU_OFK?1X)WDmlyrlwiQi0v
z_;szS5_rJB0jWtXq6!N5j4jCc+fvl7<+*E-FC}jcvLpY3goF`LMjZMBVfRL4+o6+j
zzVW_cEoa4PE$#uGg2Nl0-D6p-|09iB+S`8U)TK&wPt8@*l^wS-3Q|j*%mW00SG_)k
zYRy*#qro~#=><=WeSp7SQh~ACxs$q8rhmV=6x7e}<{Uq@vS*%4IsZ`DJ+c>Vg<0Z(
z<VGXZOoC}8BpNM6vRWH1P6Im*efucR01f0D?Lc!RhPESSZx0_tqaRFM%e_@P>UOcM
zn`XhW;%Nwiefq1X9qr+59$T?aoOlIMZl~&n*3}c>&<+V^$_XBk`@^_4&Pw`k=*_4F
z3a^p`CnngF)Nd+$#D-D)k>xNh;VB_VxIP#JhwHh$S2(UM3?G&1(W$+m3Zf*Ms_olR
zJc#-^!tdn|xhXCg_rL1-On4WpXd-5fQ#4{S_3wO1sUn{Xyj8frXrMCx2-!(M+4T0^
zzPJMQ#JJ*jk}nm6(`|<y>21rFK_`o2%Qc24++66dy-Kv*9i5$EfQSAs&cOa(JaR&P
zyk>H`WTFOJ<1OSn7`HGs!+G}*z(ZPsBgCW7s0Ov!_t88i<sn=Yo6mSf*zmo~3cGGl
zyU(5w`81flikr4-`t-ue{K6dT(aU!3y+nPZw+HC#uYsBWt>`UD#Jj~%Cxgbwi$^Hb
ze&^mSMw`OZqB|p;c7Zj&G4(Ajq25{ULPW|V;?u0`Jc|V8aWddqy;zSJCrm{^x<q9?
zyEAJNP@yK4s5h|f90T|V5HRC6P#aQj>yC7wgzj0`K`VBClI}Um{y-E*CD*G1#G(#{
z0Ab1ib_J%wSs?2T*a_6`K+VjeGHmhrBK8W<De%#!wsbN<8`_ZMnoHm0cc`(Bd&_S4
z(8e7vUeu<0{QY-B)+_ssU{Vt_z)x>;zFkY4T4-G_!y;)<D@QqtXl>?)TiD#Dtd@Pg
z(9NOttZ>xAJhY=pmVO!~wF$CX%!KH1b9*~X;_rPkq~&yru8@(J=spP$cPsZitn}L1
zc4qv(c4I|itG^?Kw9+<m_cSH&m8n+_vwlbZhTc?3*@4&IoAVF2wR)5v!Ox_}+yA<K
zRYlyx-uySk`wG=J9=aT9@^|M5!NP(d%DN&*<ogK|Y=KT#U9~E#Z0z~q_!GL9Gg
z_!p!)O}F-Oa2A+5awQIS;;FcieQrL-`Y;KSO0*EeiV$ji>$jzktbPvnH>?qSRJC&$
z&3K)DI*7Gb^~lnc+<NO08oOmKV0*vSb#+-jnlz%DVo<tzLQ9NWNsB&H*FTBhKm&<&
z$9oH5PICr~&nX!nF#xd&<wYg?vgtq7idClcV_ghQ)Rwq$Lcpjg71SixFA+^>Y~|nW
z_qjI0kx-NK9N+;yNu7LBGuSsFy?9r47L)Gx@U(OI<KQd0`%VzeKM(irgO85FFZew8
z4?r1_#md!f_{CsQ90DLDQni_;*P4yLC!sDwr<<@aafqmiTDFa74U|3Tvp*n_u0Rus
z=-cfH4Pw^%pt+OR-rX!4JSxjB%8VvozcjVxTiX-BbD!xutA9t<O97kW+IV78_~#!_
z+mz)KH^w$gyiEeq%Max`sc#QEJV$;K(sMjB*u*n*QwE|PnE0gRhu@&fscV@#O*3ju
z52b!SzCLxE<hmK`D%*z|j<TmT!1^{d4F=Fgq^$&IwOXP?36UAe5)??{hXI(Kcx&8}
zx1&E|$oz2Y$X57mQRXOSDSG&1A8>Gd83;giA|Ih<s`WTv5r1j4z2BOe%vgio3!%!g
zUg8GL4k_7m9|=i&3<8BgFzA?*y7RjtrqpFO(J=`-&DSYra%pyC-#n<F+eS|y4srAS
zg~e&Er|Np^9ZNs@JMX%7&2?>J0UGYy1z{T3gX`(b0OE><erf$O@C<;bFZRzj*<Xq!
z8m^Ngw#p2=wZgZ%eqWw(ZI{$KL|y!yiJX@eBGoLI*A&3G`N$9luq~g7U4<I_=hiFp
z84QMe_lKJ{yMR!UsL+XShN=necAbMire)i}{bQ>B!P{b2D$XMyq=!(WKG{hIavK#M
zX7i;R=$0@sZ8NMfp*V;D%04-gr9r{!)i2voMF}ZvCl?J;sxOZShu&T-9vbn8(^(Z!
ztMh0yrKN{_JWH15edwu0mBocO;@P_6Pj%j!W^CP25S(6aTT<F5d-}1rp?r;1_vib6
zxSUF%T?>8JmEP>*MQPUP2}|8+CWg@<54nNAG7}7LreDAP!N;@$+3gzMK&1cSQw_0)
zl+Sq->FRHP`j1D22YlogfCYxIiOX6`i*iW;ypVD=fBxjXhh$ymT6Q5>c2XDibIb(M
zJy+o@Rf2ROqf9GDHVFJ|v7Bh6lVy*AdCfo*7YodX3~ltP2)kk|eImdN;oSljE}Cnz
z(T1+6|JeA}B+*?tBjkAtW|80GHFNWsQ&nY~W4D#wChY)qUk!_o`S7Bt(Ex#%_87!f
z(m`vmI$&=p=EcY$OqD~R53_~6LA8v&{C#n(JDP^*iP^E6t9u)_tGa}%B9bgP^wX3}
z)z^kcI93(pZ11TzJ}Ztt$}nm3-tqbPs&&ka`{T8y?(QieuM6V(_6w%N?xciTw0zbN
z`;v1a?TKr4$Ps(4m*HvuLEk$iZoSb5y<#(K3m_ZiWBWz0jm!@<kqFa#+W>Z1wg}~6
z<3GKhy2Q|<FIPS#Tmpp?{Wqiy46$gp_q%3Ti&6ZvEEk@7S_Op)m=xP$L1g*99v2t{
z)lzEa{y0UB^58tDQDi>oVF`6D4QlxlWy+rRh>_*Vg_N*U{;CmdwZ?2;#<lV|Jw$M3
z1mxXh4cHT|hFz*z43xX()KgvGw;jnk)@QxiIOaR`D^mt)$`}C{`g<TMD;|iFDzkw1
zZnf$oNwaYSZA5wqi8I>3lK4#CgX?<E*b<x0%8!!g%ks=&&{_$6nVnH*lm(N#D$!WZ
z&@e5IcCEv;Z^jI#<LC4wc;4b(B#0YY1Ug{24~b$bpA#=!uEH{2vul2zUw%(a*`P28
z13h%v;vQ+F|5_+!LhkyiEI{Zg52Z!@TAeZ^>MspMF#dPU9JU^V-{q8js|zTgJRG;J
zPAfQb<^AhQpEacFj<D}#2pYV9?J=V*{PBrZ4OPa$rbNSCmxSCx3tAbW*MDWS%U_S3
z?mj)6HGTH_ExQ%@n|ADYYQxrabneTfT@Q#T_i(4V(0n~ZOJ#dyvu15!rN^}cF+wbQ
zzB7(DW3wQSxewfSKh=IUWGn$!N~>`x4J*N<5kd_7qtBpSXZ7MDrAxle`_Z<_Sbzo9
z|2XUmHGfo=Cgw=&cnQF~LN~iQ>Oj=(x?P4ghD4oSJFvE9NNOQ~25}z>WP_*~7#Sy&
zxs5x}jwUPyzn4ycw`Dh>1PMn&apDCGnAX6@V?0g#5``(+RY(~zMATn)GD(_~ySR`R
z-58w}l@pKj&p?~T80KA7W~CEXC%NV%7bn6$iIe8w*jvCY{+-c!Ei0|j6m{}Z*4Vq2
z(LGRik@N;(iL{Kd18Uk+zZKCEQ=6zscyl+Z_)9(j6bV^E_yl-hI}p9mDGn3AWvphz
z7@b0L#JUOgCgKK5BD;;T7(ULvfKW&XmGHb}r=;Xiyi*bg!#M6Wjte}g<&#D^;#o*`
zY>SG`2-!|4HRPT#ef)awmtyOcrWpfVR%JR<c026KozL{H;BnW$Cj?Gs3^3LbE<!BT
zn)G&{>a9O34{;U$rEr!&EAP=G<t|W@i)!5jrX009T?fx4Jxpxo=`nVq2CY82y1spB
z-*ea2E@$pYEID&}SOe$Y;CJ=3L!fHIv>2z<5LVf!jBOUIwO#6@cbv&N0GifAP4<A_
z!7d*I{{RgbNKo8<SA%ei9(jR-4vo$MAIOQy?p?s>o|sVi$W?+%1kErg7CakA1R9jK
zI%ilK|9xtPx{j79ot>h%RU_N#d{V9b2{MWd@NY4t^hMxnvYTWz6g#pCEfhNe(qnX3
z=cyv(v0^1cOsvY0CS<DvLJRC;i%Ya8%L(*9(9fdl!47sul|_wO|8)D(GU6M0XLEOi
zEiV;M`Mw_Z&iA;(%rxy{neTiZNp{H{uXE3IeEn~MTkZ5K4{x*R{qmH?ElxwdXDW|}
zx`zeyKS^MrjP8K8AHnwBvNN|lJ{5hD63!n{y~ZAe_(lso*8CR>nz5EI%4%qSGk9bZ
z>`~XBYKNm1lyu2&<zSkcIXKYUd-50j>Hqy=&o(7XW#G8!P7p|uAOv4G@c#9D@i<@h
zPFZe$C}QYk@jngvdZ~?;Xjb#J+M4uByg?Pn!>KG74*9|1PLy7u<&0<VT%s5!T;dqf
zVkLREfkNjv^7}KQgq<W@f;R(nQl((1ogA`{2f*aefUFByUTI!ccJN$vjGL2tEiFW{
zc>j4beYGdcblMp2Nk5#Svuq$z)Pp82sXj#fH=(2N{|8e;yTGjHqj=OQ_5pSZ)oRb5
zYEPDkh9OUFwG&sDhIejSj}aYNffs$c@-Riej$B$?;O1{N=e7vhQ7&Ie-}GL!p@!cX
zqmRD_-yfuhgXZ?HXxQvF(9D2Vpy3i0pS@6?-F}QFI>Y;_Mu+zf9cn;thZRor6Er!B
zjisoQ7m|=a3hB~bv_uJbDb@`>?!5#8PjoMD0p{HXfs0Kxz-1Vfw^&)qJBQPt=}Tq5
zC^JIo8^uCBs0j}&9KzOY^|-GnMr1;_dx)~zu7RA-PO*w5k!q5C?FK(g1Q>VMFB7GK
z65>H^-zEB1ajD<mTncXZwaOh5u>;nDUu5aeXd+bTbhc8_?NY@qLW&LcId*b@(82}^
zO|eu^I*L;m<MBD7&X`XKD-{w`QL0QEVI?xmpk_CTZf_(9J#C9&)_XrhzfSSs^fAV^
zfnAYDt?eF49lQ6W{=x$cOWZFnj;u>PzoC4q_QN~Qn?t{QTzeg!d-~P*SC6vK9_7vs
zpKUX@o4gAJ&Q}xX+I^ivJk*B(B){|atPf!B)hPys>zNOwFTp416lC!Mk>JG}yus+f
zZcwQy{GvNQfVc;s3SV)Uz8Y|zzcAiSh(FgT9|C3xyfq!8g9EEDAY27;XlqqCA@Nq6
z;4rF9iy&Y7r&q5PJJfH?XA_`>7Rl0`Kh+)``Kk7`j#mv1=-SMdTYO_bt4;&+sWYkr
zAdRPd8^kwj50@`ejga6<PnF>i5-S5I88<6dJ|u{;v2a{XCK4#Simnjq0uEf~G(Et+
z6T@Mu-yn#$50&n_L8lgJTv{@h<@Y`Ur$v7LO|`8ng1wHc^)@gJpIJWEf2}&lx-IuA
z_3_wr$9YJjwJiPAxbEE=+2wl2DUNyh_Xoe{Z5W2sAyE~fa!0zDJ?PoYUe8$C4)a8_
z1ANvdRXb@hm#yKa*qS15PJ4h(t5#9vD^cD{)sb{i$Zw?Y(+`b@N89t3x&4m%zVogb
zm?42}v>baPG3vqdEk>r#=5cKV_!M*_V2P>+{|g$4_&EC!1?oQNqI1Ia-|>2?_E)9B
zf>3rL$%#W_faF#jVjwfAK-1SuK~_EtLsaW3!fX*(X_Buc6c-=7AYo~^mh^zpHsK_`
z2grN2)>-6fERcfw#BNtUApEwo{;U{6f33oRYu@Z8KK)<8>2m>%$rr*m$83-Do}i<0
zs*_D!pkhm1Ky3MVfQj5krgMw+#Gll9o)F+M>ZzG|Zj1|~PyRi=BQqe-*{kr@%g<Na
z)|Wj3;LQ8=^ckM!J4vyNweo&1rK{Y9ZGk^o@R#^^VRduQo5$Wo1yc^Wmv&h0ejRqg
z6%Eg@T*KLz7`!*^2|ONlxxgdM8Q80gn=XJvgvX3`Dg!vQ5}s?-XQe8ffTZOz8g*e5
zGx$~VwBiX(?*Br04{xt9X@o<Wmw44{bn`aD3zMf0{G6(W=J4-MqlcY=?z^WYo52d-
zkp%}fU85$Bmct+}RU=DAgTE$J5fkw{dh5yMjMt=13XLMr1*W5w5>V|JQEJ%F0n5mV
zx8=k{tniV&c%Im5WB}JN@wy)s1i(9&4}in9Lbn38_Hdb*-FOYu#tr`H)wn2#gsJy~
z;A}`+fkRJ|%Yw1%80%1yA4q5YfRm6~PI2rPNr1G)yT9rw{XaQJDwWw)euh!40*i_u
zp@F@z*l@YI=W#S=)Q#c~^oRl>T^r`asWy8N)IekiMynIfK$&7z^N?Gxw4JoV9uUTL
zPLo)~NfHqSk_><ZKl|&ce*Bhzu|FAa1@-uCNCLAAlmyLFaC>}&XQ?mU11YNFDE#}k
z@KBr>T17w1>DfK<7Oc{j4=?4g%8zHsz4AL$V^dgOit_bbVNh?_{o1n%YM{>qR`ay0
zvND^NrD}KaCJgr0mQOY0Rq$1_@{0QWNFy?|I*<KKC51!-R|ONYZ732g!>%1W5sc-s
zn#+mDyXYnhA+%1wF`HZhrd=zZD9<-BvEGuLkMi1hn)D-;==p=SJ#+#9x9rr}8jvqk
zBt-Luxf2fOhK_=99gAI##vbmatHV#f$VV~!4xT3M@Jmt!CSiuK9)s@8YK2ygv)5b-
zZ8Qh8_O=H2O9{+wX<Yi^g)KYwEjPPis=bLq;l;Bn-^D*9O<YUkr2@>h+O@tHswbe<
z9A{V+E5|4pUG+HXBgaQ$>?*w*mjn|B(MSn3UVmRh`C%}1eN*1hYYo`l1FZhoRWxkN
z-F+hm5Fgn<YL8!Uy-5Mo9HMNbN1k8SmMihI8{&vvn{*q^`az0<4f1QhaGkXsZV}yB
znfyz)YOgQo;8YtY(lz0B;N1MFCbt35*8KYUr;6ph_SkSqu3bxl9xO|T?N0UDYr;I|
z9*fzjB~EBALU`$WrQkYpn<vZrgX0WwB}492!68Sgt|&7`^^@D|+Me`e3B64?BTmxi
z?73a$9BPS}E#2HZ`M6!eOcR??wsnU^@yws*jab6U(jm0EW7N30+{_)x0@Fp1oTm%a
zA)-eR7hm~E;hf$YV5!WASZRfF%PpJt(u2@bjTYSMT!o#On3l~9R$0D)RmG!FYIX%0
z1=l#gw<b+^sx*cW9;iut1DTPfQ-O!O6^0Kcc-3)NE{_SZyCAZItsS&X)y{)|?bMpE
zy+krJpR~fPrGd~f$M+!Q!la5cS?X`rq|&YHio~s;h4k=8412agpZ9(lY8N<HBNSS+
z#3z21WB83!)lgLOc%02OR%jb`wo^*oAlZ@F)?_drG{=g(2u4=9K&~5D%{s<x_pzN~
zr_?(Z6H67rh%ZVJ7coV{e)F?nY+X{=MTs3qe(!PiTP4&*&KJs5JuD1xFpdhf=*NI+
zyNt@afI+t`&o3f3ZUe=zVynpE@`Nw*cwLq4tu*7<dWRyK%h)5Bl{oXQo{&$zgkF4I
z(CmC9qPL-H`$kiZoyTe9zh7|zUEpiZe1T`=uFuzmZLEBo4S%~l@^%R;b$h4TV6$KF
zBjs^EExc{a*}rt!BKnM~q;Z|%JQ|36eR<8Mg%{Uo(On-Xnz~H*7;Jm#R*n93MOe##
zKt0K-G$Ym<9!QBGZ}%gtqxq4Z_tYPJ5WL!gT}JNrsc>FAe?i+P*;rS);_uQUV;wmo
zpU+I;PYEpO?=++SV4lbr-(TAiv&uciBg@WB{_%-<n>6blE{c8k_2rb*3wE!q##$f$
z`Y^@Rh46Mj2C9fVuJSd2tS&=e{5K_)|BwZ;pqz1&mLMFhQX*A`u!q*Bs40H{YuD!;
z&?+7XAc1|^$`jb-6Z3y@KC@*2CuH1Hn1c3f5!4h6&HjgOz!^VxW7W=wH&!XOf;H?S
z4WJFjXz;`VdtuD=5!!0mQ)N=UwTQy|z?8Yy!hwl&fAH!&iU4>QWTeFw5pER1-<c9<
z<2NDfNl2V0FzX!)gQS7y%V}&ESTDwE81jhu>Rkb|63Rm*U1hu|vws~I4i53v0k^bN
z)k1>rL407`663&Pa#)!Sz614RJb05PsPmeO0scr=X)lyfH3ab~s2S9sOt4_AnwEP`
zk>Gf!`7iJTB7jIj8C7A%K_3w+zAceS|7R97j0Ff*RtScvLI(%B>)iG>!VQA%$h2$$
znSx5Fd7Th^cafLGuB=0^hJXi?jc(^Dw((nCM*+&>$@1}gLe6xGNAvgnsaj@as2}j$
zdKuy*n@xtlb_$WFy!p;K)pZ-9G7A#tf=@C=Sj-du2YgTlI99?EWj#-94!!{uN#dh_
zb9mza!m1Bg_z8IL=bvhggRChYdYmVAW=NwHTX=9Ko{IW#7s<PdzSZGx37BL0$L|&5
zD@r=cgQto<_<pD^E%SF=s^?q%u)pG0YL(qzAtCvBs&5^URwrSnZDoJ9>HVlttqxyX
z69#UZ6v^c!L~58FMF3sL1xn9m?Amd`cUF|d9OF7r2(Ca1<Mr)t^LMMnQv`pwLDlf+
zRIqtSWXZR?cw>0@OZf%u`Rkk{tJB}N_Lf7ZDL)KUtIL(?z#h?y<}A3`RuW|xg_Zi`
zGG#5eA2`JhA$%V?5AmCTaGW<qL=S;a=?k^K0=M+VBpy<1CBaWYomj6DfF)py5OD{W
ze&C|-6szNIiWRco^pE4A_s_ayx_FH>9k=Z+cDcWG#bq;(IFA??lT9Cn|M+X*;%D3a
zPws%&&Rmz<CAR5d*SvC_UwdX*HaNek-S=rw1?s#t+`&m}>J8hqw~<>o{ouR<Oc@pF
z<Qrgh6ibnec}_|MaTDW>&;Y-ZHYe7LKi>_J-<lFMX$Jvm;mQYLWT{qzQhSH!QT9eh
z@er6UW1H{>@b`+Rl)lD|V{<3<Ahj7gT<Qt5#s}$RO^Qc3iTeNns5#0U_H4<vxkb2$
zOXDq7ZBcB2DWI0*Rwt=;z;U+fO*jmrBhU1CHfhZ`!yd^nGN^>VREek<8syU-IT9=S
znxOE?A}!`HHlspW4jNuQ2Jq@4QG*p44NBRVKSzXUw5T)rr`mZa$@U0yl(ovW03{ng
z?^R$CkP+iUTB<OFRiC*SU0QJ)Pv6Q-zYDt9alnFe7PWamdw3GnDnW<Z*<22PDv&rp
z2n2M&w=APgJp|Ds{rmHJO)OnT8?-KhhoQ|iN9kh|*Hb9Mm2}^~(fnxG72MIaQosKo
zP%=FPDR6v0l_k5xjea5Q4)P$5+awr<pAUZFicL!64PPzwlKs^>*-MC?EcS0J{!YGJ
zm)BNYjiyv>SM^Ezt~B=yt#%w&)|l_#e>TL}?y%rc+8B@GdGD}*KL~}B;NYnSC1+9(
zNzrifWS-2#6X*u}YIdX0Nw|G*2Eb|X^uxyo=?4lveiIBN(@EEL3&c_kK1;0hRYs1@
z42OI7;39{$h_H%TGNv->t`o*X@DWbjR#`T3Bp&gUtx@Jo@-7;#pL6A*ZW5)P479#t
zPNOeQB4v&1kY&Tdq+NLXZzl=QczkM-(F&N;;$WxJoFcre!|^d%^rOh(u`2<)>(O)s
z)2x51_F+)nMJI*+d^d0MtlY_=lUeVBELMVsFJ@U|#VAjo#pJgbSt3gS8nXimiG2t*
z<-UT(?NOQFnGy~9xGyKJaA0Q2(j}+a##g&hy~j$bMu;zapvo|trnm0A__r)OV{1x#
z&k0Xi#-q;xE;`<)UV9sxo(#>%yz0rejP$<HL9Kq-c<IGzgUjz|bZup^S^TNXhwgt}
zcHG%BrHtcJ7U-0>j{2caXZ?K!XoV#bK`vAs#OH#OSrgf=*u$Hg7~20+EjxV3c-mSj
zTn7E#IY^RSq6{c=p^MO<Tx2lrO3!A}bx>>gtN1*rs3_t}@7>vY4ROPYVT%XA$|fi6
z`f~6}g!!g-+iWjn-Ja3VX^C8>?f##IKiDr5@`KQ;?aBmbvVUPQDX)~D?Q)u0wvaX|
z+X#q@(@4}p+-Ty0u!`}P^dp@RPXeY()SRe#jERj@9`ixw+3Tt}aZnX81#gCi8XQnK
z!tF8%xNR!Mq4&S_p1aEZw~DiDL4do~-np>#`WmW1N8ol@&SL)~caBIGMf!Lg#*C2(
zfl}sVN}#fqM;0IOwp>kE&S-t+d-c&0<H6tRl((hd&(;XpznC%QgQLt~x2W*V<oip;
zQ_McI(Lg{}&d?XwxAlaD?M)7T#=@O!UEa5UdBIf0Hs~Bx75iFkKAX1_3grD)P9?Wi
zEJ3kyTbh<^K$#RtT1sDx9)x>D#j%k@P)aXIm`G@;2k5);mIpO1g8@29=FC8->BXCm
z0?>r%qwmYUTbDMN9qNi0%5`zQR~C_L+b5|^oM8Pwp3D!hj{INVp@gBgU^uo>Sliqr
zkRbsHSWOlo|55A)UnfL?F(8czCnKUvIbjXsJysLCNCmE~RwENc56}*mAOS^w=^GDz
zQ@1=l|6AShcyTHqPRdB6@d5UQ=aohkVD|lgHE?1*ef}ZU1WxjwGCn|+#N@`MAS`ZQ
zCY;k!m(_y3fn86x8y${*>BH8Valrbci{yvsyOa;CnEsTP*$@lAK?v9=WO+W^EaYFy
zY+$kImRBi(rEeem3hzy=7hH$_-!4;Kq%lYe=t+-vSqR<mYy0C)#_#o`+p=eN`~fn7
z?@K_E*NOW9-KpuR=|9L^XdB)NpxCq4ticJskv3305>%^dMQ-Zixh5V)VGP%#{Epa6
z9vo%9T_VBfikR3Wk~(7(8M|&~c&kikJCGFV;M7FgG>f}=4WBLw7b_2+fZui|D6GXD
zJFO>}L(0HhWey|~T;&B}6R}Gf_6jgoVx9zz>_h!2ARdx-<SpG3EbKD0U33_60wJ<I
z1N<_<7)e*5Im#KrHeuIs%ZdEF^*~;QiP00#eSP>76WHZJ^H<nK0`gpp+^-*c0BmgP
z&773R#mZF1W?H{|w0KHS?id78C!~|Jl7-eC;uOmZOt#-PvJ}pm(t&}&%NA)*84$5S
zCYez380`5WT=8wbv!%gNVIb+Sdz{|Pw5T9Ip<S?Uu>C?rjcxc+0ZW2?$sBfVV*^`z
z7%Xhw11w7dRugYsupYaDlUmkjO5ce94Id{dU=>mv6EZ7}feh(q^A)?R0K}f5uBO|w
z+ITuuxsd1k2<^sdc;F^|FWo`>g6e$nufN~NSA2+1ijUggGj~RN)0S%`e+vZ%noNTK
z?R^pstuN+&xq4w^xHl~rgW37SGV`YzZ9Mu%>LxE<{6C+7wdul!oeJ%L9TNW=JFs-D
z9}?YTR|-?{>J*;AjFc7CXaY7u;^_OKH7=-siDG+YcLZUX>K(RHIyoq2X+X^fN)<ln
ztLW=j)f0GuO5-!Vvz@5Rl)1&M1LH^MG5AYz(U|N{!ng))p*(Bc4<D(Ce1hi4{zSWw
znC4LzHj&9EXhz@$M~`*w3jO?|qL$@f#TFaCB*iywy?#<);5>tFX+V~JN$Steo&NZ3
zP7fKYse|rQNQ)YG4;Qx?x`O?II#U#866%KGiRbhkduZ1tS{N%(CnF9d8osFE<N-&e
zC<1{t@i4Nis8me(d(B>2O*4>x*6(>LTxfY}*qblUmry#jP&9jXsI_0_+0u8){^H(M
zmc{yiy@@bAt4tuNdz>gfUTuAn<FP-xr#SxMvmWm!N4$gDu5(R1NGA(B^{w)ZS106z
z-7iRJcJoR*<LB;?dGzc7a`^k0$noY!g-0xN34w&tZ=4R|az+Qp&&bv9Y3s)Wxk*gP
zSf1tIeF_Y@fM(WWB40~i56_HfBEY-)(xzqvm%|6PO2NQsIU$~IGU~Rf=NPZQxSMK?
zVMf)p2>cTP?$;1d9LzbBd_ZTey*1S4dtIo|S?-q;;uA11D_t6ls9M>J%j-{z<W~oX
zM%wU|^6Sb<n>dBPsC;{VZ%)f=dyH6UgtDjlJ08&w!oNK`AMNc|^k8)3$uG?VEzTaF
zLDa}8Q0a+`rwX0z15EFKX6D<57A@a@&3kKd+garJUHzhnN8{ns&0Yt>tb!c-{$O-x
z`f8QhUvfyDuukCj#{JQk?|yir>FkzW`1Oq&y;?k#J03RbB6^cUa-ZM;9h6v1gJeoX
z8gEYo^@&K9CSq?4UPsqJEz7~KFWA=WHm;{a88);|kdXhP6Fa6CW&3Gg-6sOZUIz)}
z=V{#8R`<T%5tGc#ktY#XUYDCPMr>>cnAU$0M2`dZCLDNbZxap=@t><+6DnW9opC#=
zVO4DbtzD(HA!0}anD^F;2ENalX;AogJ<#;#YP@D+Pp;IQe%dPA|6QU^p{@5G_jsG+
z;)C63`rPe0vm^9S+-i@AEy?rTlJ0`U{KPkYt=9v};J?PL#Ezq5=bm|4WSJj5dp{@N
z7P(x$s>|)SpK4)=r5k)QSNg~@{avb_b)WK1D%Ahm_l|e*p~6*tkn>JX933mSQ4P#v
zS@jqRVPR!FqzFT_UJ4xgbD~Fi3D<a&Qdo<x8(^8^oOo=V-?tG9c<oIwRgXqE*2^iM
zH6*s|v(e8=$??4V^>|N+g%uCuOVP(A%}a;W6mBrK85jV$7``+Obam`EM#3Zzyvo>o
zN`ZR@&owoi7Z<0_auOMv5SvsnX~QcCFFs&1nq95hBH#Y>X1S9XlgwCuikss~=3eNo
zN@(nrR^2~i`fioic+agrOWpdajuh{&&Oe!Ha%AnEV%yM@_cJ~Q9?!@pJ06(s@yN90
z23{xkdHFva>$wvgK{<Kc<#LHj*cBJA+I-VL@Ba=h=W+EEcSs)sK~`3)aDIqTZK(Z!
zJ3qi0c?>Q1skWUm3Mq4K{u`!M<|fKkb(W${|NDi<Y7f4uFEM^*zee_|seM5GThPPA
zC&5#sCjgumJ+1HYe~-;$Yf}j!tFbq;!sth1U)mULVF4TbrtE<v!Ub!zf5fa-nQIND
zjjYU)*y}sRobLhv#b69DS)3c+t8i=3^pjw*tLj8u6=vrp>|}kw{!Vz+_b@hrO82Hl
z32iq3nHG_{fVqg7!vVE-k8IhCJKrbXfaQ~^N32r5D1!n^9O2xX*vbse)2jD>zR7a`
zn;0V-@w)xGM3gk>BwI$hCx}vL`MtE^ap!J<!W^3tMNr3EqvzYR@D}w@TpnJT&RFU<
zNVA;_=#40yvh##_X^JiRXkbgD`2bIHn%4?swGYxqYf`8<3yx4+T?Mm_4bc4x(2V2L
zspcIz&MHK)66BDsaCzr_!I$U!!)0&w{fWM!Pld*~gmT9~V{d2owBEw02^;tBhLS^f
z4V~ZiL~iLgVwz_J;!V2==8Qe;!6f>x$_%Gw9149^n8{G3WRKkAI|)5-pN+a>;SPwK
z56k$H8<C(ZMXrz^_@vn1o+=CV1MI>oq@ITPz<ci1sDZ74ZfsFzCn~gsD(y?`=T1|H
zaqU|2ONYVic7d9}m$VQz{8T$Pqs(hh00c=)dj9*ZND~|<<lg2cihpI65g$BlRGB&N
zDxW#Rl7+WqU(7G3MFn)tu-63#tnZ5q(evuD>eLD97dU+9h;t9A+M@k{;`JnBOHX9(
zVyQ>h(+ez*is}KY>po6-<AR-44*TX`eXYuZ3sf3PB&!8Vf+~NiwMlhpQb}NHePA#>
zP`=x5Ol62zJOrDh9Yvy81QD<(R6tW(5VmC5t@0=3b1B56MoY!-$_I*&QR1c#A19{K
zGn5Wxtr&!7f6Epo*<W~!-VQf=JWJ-*Ka{XPeriNWe%E+)!Y>ST;7hwg=k>ItoHc2H
zm_o50itxlXbHr8VR@DoLEvzoF4Y-s}`UtC}FPYf1E)=8E{tq?v+;?isoDqR}lA0i*
zRoL6kzaSs0yup7}e-L4+R&|ArY*)1^^B6`gfRrJ-Mf0!uC^pvxCh*9yxTY0W<!k-S
z&&^}OTmg8_^?2;^!NZZ({INU}5J)Na2+r>N++S1dPa_dy`TCJxlX+frp!ZQ%5aFlA
zB9_s&eT*IO(`WOS>Tabm0{yOj?V9y740o<Q9a2<N8#Ce7AMuvpSy5M=XTbEEu#5id
zcjt2_SD6i+-Hpb5cG>@Chq~4Z*N-hdg@1kC7#iaIPXC8j0nzf-+J^g?yV})n@r$h!
zy-odXg+A?GiIwr>-}d~K?J4_H_Ls}--pL{C-QEO5#-;RjTu6?gKYdDKHVx2Ony)a^
zo?Z1mU=S7X`c&G!6zpoAsCIrk$cVRwz3JCWG3%fvO6Uxz3ROG1>$f1Kd}zaV<a%07
z^3z$C<_B7-<N$gAu{ed_n8Tig5)k%M)I~zrK$I}$K@midl^AS6RcQ?ZqHzr*s`HJk
zt`!qwZliOjbAY+bH+iyNQ!<QgV(ZsiBQc|GcdtQ7#=TW?!UnnvVEhgsd%s}gpSkHK
z!=L?{0@eo6{cVyt=DbEKxC75RZ$z@<@mAeHt76lv+Jsg;9wT26@z&8g+p*drxIgjn
znE*j+c<DIlgP@x8gs5Z}RZaC3W~Nou&&_rDo!(KNe-$*^PXg*t13Wr}*nN46;COZm
zF+rcs*#G>!)eufA7blr(ks6*RmYyCeEPp#r<9XJg(bpv_Id2ZlPka0g_M6D2bR;*A
z{rRwj0X-@1lq-A%g!b%j!{Y1zdZvAT60-d&;S!r$`Kr|&8oZ}1C^=a`H02A4=fC%N
zn|h6paGuB8$q7P#clV~ts%WUGUcj32DN(gS>n|#wZc#qjjC>>*&a8G=juU*R3XHx=
z4xJJ({t-S;C1~{=u2mLi&`)H&d?k0qwa4|G-}@0g&L@$y$c%NhIlop^2dj*L=tU}<
zN3aPDEUlY3WT#r~5N=d1=6o2LXmJzs(4EI~K=e#@bty8@PFfOy(K$07V39aleLEW7
z>WL&|Axr$Zs4LvjF2Gkxg1+>FeSAPj*!akLn8D(lIt7)q-@s#m#y>C~6h2E71X#&V
zus16_Gog3a)`5A(UNZnq^y{TNKOLzb2tVV0W+ZrC#d$)}PaA$Sea3%WZ_dZ^=#STL
zp!OyFe!vhGcH!4Sl+FjI6o(VV?UVPYV!65HuV6mAI6k7)KPhGGqA#mObjy+Tr|AbD
zqAw01M)!W3zjZH4EKR*lOKcx8@4*j9aA_)I*s1xRYAY>ZXsE-ayXt6PvFW(B=;0^P
zaPRLK#Rr;w(#bBbKo8^ci|JF-4i7m2RbclE48_CSA#=l%2=esXjyoB<dQF-YTv2X;
zc7FJb75xuoL;0l;T%fX&C*pQeB+zntAfkIGl7yPr#9L{sHDfP9=VVlvlgMe@NcYVe
z4H<}of((}&Wr9Gm8RhW_et;ny+1X0eMVU>Fz&lqoI2_byjOb=6A45%l8=0Yf66~O*
z?9VQhE!|65sxqe)ycAab00wHXVGK@h1bs2unD$@-s%W1KM|Om$`}y$<6-S9HgSu&)
z_7M}rB349!^u{mK&ULn2>^!}PmRZ%pnXaiO#T@n+csdgY&$cQ%6QLloYHQFnE(ANH
z^gZI*uLID<9J|%T1S1?d5bVv^-m~lX{z%r^2_t>jDb_lTG2@RqiHzdK)i2PR(P*rc
zq|(4MU%qEF3AuEF26fiWr^%f$_}=N^xbFxHF5u5MsDvLVvQ7GFBgOd&tpR&ic-pyv
zeNwOc%$Uq*OMks1#mov%=#}C(b5e}2Pp>$Urag8-c3IXtU_GY5zdrM7r)O+vIQP-h
zKHW^b*PykR*YyvB)8?Vop9g?Zu#Zz&mDxG2o3fL$>FcV017%;pZeKf;&wxC=9}2=L
zy&5ezC8eL)LpiErIWqnNf24fCoAp5JZV+zr{zrp^5QO`88}C`CGP{$>o2`Z}+3s92
zYbjwVGC5$HSxA})3shCu#8BjT&ZzqT`1!n<G8?~W0g_LS$_Wa8G<LL8K=MMQt;878
zD#liH$fKKg3D{$r^#k%H!c8=-gNRL%G(xx%wHdv5Q;c2n!Jnd^`$@*Y!j16-v}mUq
z+Kke`^;u8()YLgOrPf29X1#$NNHv=F+h4`--8}g!0(d6ww~KzNr2;(aFXkjbI`+<h
zv0e5WCcgZsrZ-M7sCO0f)agdGJq$av#|=~vO^-p_+^$8n7rvDyqUr{q;{(l&JL(bq
zjk@EN5!nr~os(A>pyv{EZu|sA?;2_9vjyjZxFaoe2qZfb6gsi8q`oqH6XS%Z%d(RE
zTC?V(rhFpF;vg$?=c$?N0TV4+<m$RBwbzS+qQV_Z$Fn0dduyAV556<KoceXcf$het
zLQNm_`n^cqBr4ukuD5^M`pwfRivS8ffadw7^7Ge{x0B-QwO?OZ9F<y}K5b{mD0L^@
z4cLMhE3|OUzMW|bI|*m#A6j4wcFlL`CH%Ia0MHGJ;VPi6NOt0Y;9}0c%EyB=;jvyG
z#egDQ<v%v>SU+DrRo$>GuD|Y*7U}qHLE8@w1s3w5XNheNKJwLK%oKT)n3660vxfBH
zzpH@gF_tU5WlgHYvzM3EU<Xwjl#i=id0V(IO8uQL{GeABe{7;{7<#)YRn!Oa>w_N+
zuc8&Mi0-BZb3gmu-EzbxG~Q)*u5aWWS$Ofv<9oxH!P|GXhP$|@dXvw7eAG=k6_>y9
z{fnNVBi<%$UXA--KhjDG2ZQX0*}s6Sj1BoJcbbexo*es1ZRHJK0lmF}8pV1L>&(35
zBs$kFCY9n%m6?rtD79I&4be#(m`WZs7+!L)-@25r0`-x7RzC9sTwGoEjGQ?Et)+#~
z56j(X(f<jFK67_zk5%M=wG!iG8{hYL?`+!Mj?RWym=DmKGKU~GR_3a#4<e*yfNbXY
z@&-GJ`peZCW`6M3DGg*mR1*^kfcg?u6Yn|kWoI&eC6f=WP+6lcfUr)6maA}dUF4_Q
z4Td=h?1r;$c+6H=m>AOzLQDLb^ELw|NxR48(+Rq{>|HSJ-<rUVx2py(r%-<vl-TI>
znfP6Fr4Aj8h#0RKFLcDcW%uSD42hy#HE;_IU)Jj!78(3w$ZZo+-j~+fZ`1z<u{8a7
zdSvM1>ET01?~IJEEdSzl*VqyEcar%75i%|=U=OM>%F~z->Nc)d9Q@mo)&m+7n;2Ua
zZs=5qWasLjABI~`^fvCHwYTdJ(JsBsCl6rZl=hN#F^9{XU{7+m!2)g;&t`%IHX%ff
zSxKzK|F69-kB55u{~uRT6teH5B4w{gp_xz-A^R2+l4UFjA<NwCvQ@HVo9szUL?R5v
za*->0$TFjnkQrGfX6F75eeUN*b$|W7zdwJ!dhp;poaemGdA-i-ea`E=-p|*d6*Z*>
zT4YYPY=N<mx*`_+=loj}$UfQwHT1dDt2|*Y(iC}`6buU6)m!c=({&z=e_8DUiq9Yw
zGIu*Pt_E9MGsmIF*33|>uh%Y0IS1%sN(&?PTLyy5LW)8g3ymA`xqU}>o$hCPUQ?E#
z7*&L=YM<}wnk604(=|5XG;HYc7&9UnzsZWibn;y~*b7YQDc^vXV^FBM_0UI?>#1=W
zKCY;O@ex&jyFhytQxl(OKKi1|gV&SglyM%e9?ZF~^q6Z5Pfv@w{ZXH6X*E1{w%7bk
zgE#v{D~p<Ey&5V=Z#!>=*rlf`=i4i}eLyi8p9^<IE31Q%eu-jH-6&<?R)WugchubG
z6lNsHhft0Dp~50beD7&JfCE)XE0En^TVPIE2i?HqGba1|1=$-GjVwg4vUX1JQ53`@
zN@Vy~PP@XE4DdlOLCw7R2vGLJ(1xg8zj_i}`@saJVLUY;fhflp*+i4<_SRv>tvp*X
z>0-DEu3j!yk}|_kNTIi#dB{|aNU4fZ?g>?&oR21oCP`1#@N)u>K3-Ej(_^%E+sEqe
z^aC?}MFzU@{TWPm3Yd(tw7E;K7Ark14)%5IZay<E&DG|o#U&P*Twd<j7xSRdFb8^h
z>;PAmL_(p3XQPUIp<4P?a$aQOMaBF<q7`th<YPrNra-)VcpGtHsufz=8aavBMf-D%
zzoL2o>m<u%H9f`?GU48Y3PZCkc}@s00}j68fhv5ga_b;`7vQqrjqt5(3KF0VleTw7
z*NHfxxV$^W{IRo`A7txlP!3NuB31>~?+bL@N5GtCvs6E{MdII#@;yJ%+<09-p!=RZ
zH~L7R=s;@GsB?lTUpvpc>(x)Yl?w`t*3hl6LO=mNj%qNdkY;VDHlXNGlZq?ccTSJ<
z&AGuu#OJICr6k#=K{$`Xe6U4U8aM(!19~;GK)K@zoY0I2+}Pe~l5koHU?&-d-zxDE
z=;EEW-rf6_7*|9E4~2$(aNF@XL@BfZ9Qt>YDM74f2whR*&~kLtjRR>!?PgP&EHN_y
z4+WQ)sHftyYc|`LbKv!;p~`b*gG(nZC#)+8%Eio$sGW<lJ3pjeugr5AR{vvfrht11
z3Lg&xuco{j1^jpfPf)>w$_DD=exJ4bamLQWEK<q)QB^~G`mjgvN}-|GgL&QpZ(UvG
zwUPF@11F3N6`Zs_P7inZmKZHs)ltU^`ua>kT(K3s6ZOFjFg}t_U|rF)NG169AL-qG
zB~S+;MSC;808-_kU^lRS1@$Pox$j4^%;4eVgL*h2@d-qIhQ%Z}vBm@)S3$7{aIBdI
zD3eO=gKhG#G9ZX=et@)Qk6)&zVqp7mmjV<>D1v)j4ZL7%gxrN64_S(K_1Ef^3Q%{E
zFZ6n?V78cHYW}1T0>6#=zuBp*rhwbDw{2LvE|4HR1d_mwWkZo)9DW!K%7@#azZ`LY
z{TT$AmIDEAmNuRrLgU5S{;|kPcI<kceNs_}8a}vuM{n40SqGfTLJEAHL86vFjV(>u
z0v<EYA~1=iNQ6?;r{K?mol_CvgRKdaJ>U=B<+1#eBzsJ7yWi@^QBm}O%zdK6a#%k_
zXA;2+ig&n!i;6}Pv{_PiCxxH5oc3sjfDmv_6%U8jkF$Aqaaep@9mLK54h0$VIga3U
zO)vCLiv2{!5m4TS#2w+9?Tp@|5G&Kkk_c9u1<f9H84jTYMiW?P6^N|}>TT3S5kF4@
zdRqXB8pWYQ_96Y&(c)nqAo&W=<+_tW3ABA{gn}qp)C5RvxjDv-C1UE=x|lwI3&cWs
z=rct4oI)3<ebbb*s@IHW(++g-dki=nt(B>1D(WCdzPc{0gjSE>sFN@5tZeu&2NU)k
zS#IsG*?%liF_gv~^s1WWaW!kKH+&d3DW$GWEGZ;(WH2}0FhSc->@abU73LfK^savi
z9>G(0h#2ZX{=;^E7$!JJ;#!sz*Umso#>kFGo<_~RID&(VnAAY6Sr>Cuk=`D2wM4<Q
zd;>YrNgUgkVyuMIW#}$_)CxOq*rZk~b@pg#o%r^cSf%nO{YX5XNAU7N!$b-44NqKS
zo^f<m<UL87C;cSkZnerhz!;}7SBMEXPqvwqjTYVkJb|e*O{9A%_C%1>^e}6qY?Q@d
ztjb}qYit_ZkF6#}+^S<EuQaib%q-VQklrPQ54%*uW8^g{vJYpJyh|RfzAT;?HPj<v
z#_L!Bo561sC6eHnG{{7A$768wY@;8?_L3O#3VnfV;t4@mfIIGqvI-Jols8#L(LUe8
zH>7DeBQLBry>9=&1a@HZss9UYf3oY+ZZ%1C3p@l-S}_%&f>@KE!2p}_A+q9P;SEp@
z6`@BXzQHTYQ^_q@P8glFQ>J89zk3bpq4SxFlpq;j0nP9cT@u3`;(IvPs1`^)4&W0U
z<IiB<t%omI+-`N~>~Z^p3oUwkETYdDE@w9S3dZvououo*AMlcxq~`hP=81=!wR_uZ
zu#woB-c`X9&a@j>IqY1(v8p}3IJURJBu?^J<Jk*g7ZoHpXAB$EVi}bT&7y3qcujCE
zAvn9wdL3A@R6&!FIvz#gSRshVNP+iXk|#z(p#pH*4Eh#0hl&_Uya~?ejb)-7CSm5V
zYbVI!g#MlYLt-WtkJv+|KVJhbJ)t~KK_VpGI|QrW<~51b<wCFOy3B`zhiTxlyg`bl
zL^(<Vyxd3}4Fb4+94V)$c{TJ8gPK77d7c>90be~cP?+KfZOEGcRSg1<h{Kk0M-wZX
zOmCPrtw+6ir2-gBWMU^#(fn@F51l+5d2TF){RN41%}+3hIL~ZHJ*|(Bjs2tac-_9X
zr>`CEm+<UPywIXweCHL$fX~E9%9v>}%Mz+;Wc0DGvHdaf!oBIv#ytLOy3z+S&P#o?
zlOyLvVRSk78ysnD)VWad@vOc<k-)I7;G0)9R(YKB4@}>=Q@GOzEO|sMZRa@YXp0Gy
zQ^k;IJ1{t@F{_^q0!$7eu=Q#Cvlj0j5hO=_j&+b_47iX9zZJwvVhyEePgu$7lRYuy
z0u2&Rq&%AagXa+0N54OyQ)^)K{q1Wo5n$Y2w4Q35@4exmQ8Zf#_8A<6_9hi7DdGkc
z3*zf%#JGYy$LoiE4Zxf=sMH+?FAL4P%ev+EW=iW0PF~4bJ41#EoK!rWph=vI%km_o
zVc3H1Q2fcBj{`1(=oc%#H6p;Wn9SG|Bte|(jEGQi$JZjwiJ%}nl&x9R6!5Pt1|n6f
zpY=&82tkc2%Ihlbm1O1DJdsiw3nNH#>!_>t)h=}!GM>$?DzrV!tHR}+;#(n+;+#^0
zmTn;&RH}>-a$&KFZnZy(-Jfy3%&f?;AgW-;4Nhk&0xCXh6PXp-LfF64sIsi3x8eP3
ziZq{+&bdd&DGVHpDY;)J`OWX*6i?tk>LT75PIQqEj?Zy{+@*+Pu=V3pM$ljszF1>Y
zkHQ{uhUAirAOZ?0t;QaB1d0pm;J(K<diibUn!a9KQdgeS4ZY^Hzyl)$8LUu$`uO&W
zXCaNl)_D`!{`oxz<YhOIVRuMcu6i^k?=@>_S~Agc;y}g39q_6KV)^c~Bt&E?c-sg(
zO2|1RwcW7o<Y~46Z%fn&0-PqiitSD%DSrSDNQNnO@(G5tJg}Od6Je#aryvkPL*wBr
zj)0|B;6k7z&f-8Ko_csG(9&pd$)tA?$tyo|pl@mP+UE-c6~(H>#o3D0&q`5&kDN^p
zXm_eDy#M5sA6Px*>s05yoH~=5E>b)7>AAmhCz7Oh=QFHk;gf%9mmFysPK-|K7_ucD
zPre~Xot2~_lXn}gSafGCMf&r;cu2JfyZhFJ<^`V^Ld1@pB!6CG4$vhz#DnFhNqw+%
z5<XdgbCp(cBB@V`#|E814qtf+QurFcIo$>$MA0Yafl2{>1T@c#l6^@c3DlS3ELoD>
zHu5oP)igxKw2+*jTH0N%SJsg?f)YoRVduaS+`fZcK)}B4z$FA+1FFg)%NmmhNomW<
zWWS=ISHkk$aCgc0V0@55fEC$%!pgrzZxSN}K2y8Kok-toEkrs~5RHis&jaa&mfb4S
zlhYNk;yT2LFc`-h<gjoN*x@cNVipVlw0D3g5tgtKVK`C~Xuq~Ez?q1SDxd;yJCR|L
zjuoyz55bosh!|W^N2ky7Ju)A69>Ec&>>IL7By}}Pj!L%fd}w9yL0Gh$n&LIb+OjaS
z>Wv8n>1Jb-h;B@P^z5ZWSi4)goHI(k=d8WaSYa~-@D65ma$(*|*!YNL<{7P9wJyev
z-QcYSF_Xtms@Hn6_&VDRbMLw2=qT(E&Aw-#H3I!OIU!4D;ma~;X||N%-gE<d%iABm
zhg^Udd{-K%Zb=dp<{}P;ep+$}IbYiA)v4lqER(zRMWC#9>C^7TnW%#DZYlXrVaeSx
z9W~jv2*dIdb^Jz-_-T#YgZOE+FeZJgy;`xy4MR=tgzo3cK5?UMDSgVo{qZ9}B5ah7
zHt#D0rABVF8vtIE*ZOBTA*SyDD}Mn^lQ9DRX%@aTgszMQ1n}QCu5-%&$~7LKZg6OZ
zevw3gc=YTv7Er2EZQZvGE*)Oa@oh^8YDF&ty7+n8BgBru&#r4{ov^!LyL?-hqhAHd
z0sDeCBI10;E6{VzBxU^dC=y(aTv<ehHlw#=T`(Znh{xRGu}GL0zX=l|<Ft}K*9)_j
zS3l`XQ!DFPDpqV+jf=&1v73{NaKqH`u(~}}<V0_O4-db<H5*`2vY2_dsigM{tLudr
zrccf-z3W?$ujKJb2L3{O$j$B3Co!K?!h3?bL#Bh@l{;^*+tRHT69s6y+p5XZ1o#lz
z3=R^xY@<8SL*s<x*+CLNDyohN7@k8Ls7Zw;17qJsB1zUm*-8e@s1M-yrz*x`5z%#W
z8WZQhZd$zsGjS5J=A#D^zC{yopQ>{#Ff*(XpkS|3K}3K@;SBw93l&oV@)bu%M|^Y6
zl>s_4RJ8Cxas*zFe}s&uoL{cvC!KvdxrPl(LNUuwRK|&+)_`+$LSa7MP5zMVVk@~m
zK(fR)1QhgG2m$O(jI@$T+D>o?yEOyqAzbYKzG*PyX8X0-XNF14yZLo<N#0_eN)-u(
zv9UV{S8}Yq?fA4m_KE50pL>;YurF)&k+trj-Y|)yxL#e(N4AcHDc*Lc>P3NN!>RXE
z=RD78rQR(tkZ#Ep=kBJ4Q<$l@q-YGj<65LQs0zh74C(rsb~&joeVDOTZ&=cb96J#U
zHd=Wt5gvm4nBWrmS~xD5Cx`_GEsJR2r&t7Hqso!~#n%Kf+zk+WfcV1*<ls7i$gd14
zi>pmHmUn!}=0h*;RfsEygli<p#g2@Qd5g|VR1_H0n(J0rB#OCPBtEKjwv{w$GC_kM
z+uA*oyGCP-b9DEFDLKpIceJj(&Boruf?_{QK2V}IoMMUq`GfD{w>G3e$HlnTN%tB-
zl&0u8FbH#y{aX!^u?OW*KWL+)F6blb$0<_K)fb>N8k!~vJXEKr#@2>Y02d`Z{uHEL
zR!pLXRwfa%{UoK8<u$-R4BZv<CzijCo?;P=Q&Oa`_by)t^;V(H%c$RA`+e03-1qSv
z=wT>soP8c9?Uag%5+w%aClF)>N1cahRVC_yg&l;H0DXqSqUlw16Pox0_3;xNP<@h1
zK)$m=Cv<DWMQ2?mEsk`f-k842ggl;_%Ly0TmUKUWS}DgcIhDpkEATokNitaUVZiWA
zJ;44JT%<O8{{{I{;`pOR=WVEfd#1mr&{vErZHS}|;yMvw!T%7~T?rafpnwngeu5PW
z^8-w=y><hSi?Ob7viO1MJTe9sBtnEo57whg;W02ia)cF8JjUw<nw8wu_{w*r9#Aj~
zF*aB42FxEP15r6+)*n(U5-Ds}<?Ep(kLuq8qJNOyiEyPjQ*T$y51VR{h1}P+na{w>
z(IEI3jAhIx+XJ>#UzApVc|tz9)e@BWKusPivi0iXwQz)qmUYj`)Jcx|dg2P2hOZ<L
zVoh2b7kqE6ovG~h#;`cAPM?DQHFQCnej6u*0kQ)Ef$W6@zb}e%qk}-IeuF^vfR?LQ
zy=1Rmv-iAk&HbvUjIXQ9<2xO?I62PKE1DGbx3$RDolXJ@h3Wa4nj)pYv&u#FTHJzG
zUuWJynolth<`UqZE%81%d33UQ;aq&1b+GAx(ZP%h_cVKltncfDeVUgc?Gl6S=#B2?
zH_vjI?k<DDEuZ(?T<qDlCwr0muH)r^LlWh?8xljwZju#5j+xx37Y)JOdxdS(Z?MIa
z=uWbqby6+Az#z0l<f#>6=x-ait}H|cJAW#UksWLNC#t#%`<(jHBiN|*_UI19Cq+Ej
zi%<N_VsFnjo9)f4csvo=$Gi~Onz(T2;u^6e+OUk?I4gOn+^l%&MC<lWljC}x;b~v0
z8_uL%G1M|_A6n!(qm!CkyO8laO)EO~;)+E1zDT>n9#qK~6CpGA2)XIU2cX4l4p6b<
zMJ&y^i?2W3OclPl|IH!0w7hign91pY7LUx*6Yq6K2>JM@(b<<4_vImCuCcv1EXZqF
zK-&{v%2EG%)&cE7`J!EU0Si>{*I)FI&B}RAjg5Z{l=2LGar{R)UN*Ka_JASN#ePFQ
zyI!7S36|qMg|ChBaij%GtLuw$Hi}64e58}Ow|DxKV-EY|sQs6PyRw~dt?Xp8*~{|u
ze@)P_S1XL>GBU*bhuEDO@EG-QD!=rEiP`Vjc%F)fo}XtCr|HbAe8;q_3^|^jYLTuQ
zS<g?Vn17gCx}yKawQ5(cCE33>QS|1NP!u77p)V@4LGe+rNOpOBLC3pDCr$c`H{@G9
zj0XCv3aJuOL8r>#K=khiw*#tPeGd#<gZ>}OyY6J~^Iyx0G0@S&iSX8?k)b<QhD!c)
zLk>tEZb@KF(3#&h_Docot$t{3aLpipE{0JrY-RFGcAC|A#oHYN=ZdwIu-b>{nk#NU
zl9?XC`#sBU^2^_btO&lFS>HXnynMy(R!(S_sW#iGs^G+O(irDCf-k+0r_b>3jn;Xt
ztt!^889O>)d!8c`q;x*#+8K0r)KR<YN+jCw=RU`?+>00Cws@~!gUt09cYF_bYzbCR
zv02mV;qELHc@Y@TE#TS}|MHUu_x=xNvdVr8syLBgUsu(dQ}3db?ZR38l6o!=D?1)7
zABa|gTC^#srg#@~vVCbxoVPq&6|2!|&OEYP@}gcNa4xp{g6?p16u{g6@EJN9_N*-F
z)}mO3^rxd*x?)!OMIyWfWKAQ~g9kiQA*uNDS2;d6Iv!@xin%V<S@uqSkzXU65O4=0
z>$_hQg7#&)RU#xg*C6y@B_}j0b^KY*bp3*zpVW3DZ*QLrLzp%8^Ir`uc=$u4Wb>F@
z999ya($5a%qVv^69&f3M!?Bi8fqVE~g*9KK@3;UJ=59ET9dwu7?KZS0Yv+E=e%*P9
zWN)K8_y7X_<p1q|0eAKFLgYBZ#&syHx$_s8loWrH(tj$hquJybfE<2NyvL$vpp@#*
zyz6pz3b7}%7c-T4w<O;CozN+Ptjpn=Z2MS64KMy-6>i-Ti#?rLYIM!w&`v(3o2oOE
zRSA2Y#(UvvMGWO!msB+F9<b2~UmiHNvn6~kEC0Y%mg%PA9)$|GLZ9bZvk9Ld8aB`C
z70lWT^PbbMcAh@%AnL$T_NB*|bz+zLb_GqO+|C^|lEtfvFP~3gUZz((DQM~Ric@y*
zJQkU-=Gb)EW#pLSBrLo`kGS@^R~!CF_VOSX{k?*8F_YI!3u+Y)n8k9Jn0B{_<qsrg
z5Zb!^?lU+i{;Bwz#29i?chKuhY~c0yGOqf0y`1UFSAGhDr&m%>nCJP{T_0TBy~~Gg
z<<0rFSM?8TnpxxDF}%~)<c38BlpV}D?twTDtG>W9^@1-Vc2#f=MQCR=g;gDDI}-Kj
z?ug@AK{`J)c>&x0V;8C@c;g)v*k9~62!svtjfvV^z54$x^q*CkZUk-`<#yGvzDCIf
zxWDx>uiPGUYwwvmxH5*wkqqPLxUiwSB8BEp5%A$#91gpg%3&nZ{rLFYiPfsY1ZgEq
zOlIDse$s)5%=;6^UiPd}mI=sIvPd~+MfY)SpAWrA8j1dKdg+H?HS~oYa}G{i!5K&1
zM>7fFHQo-qJrs*PEz<9ra!A{FBKA-wf8R~3D??nksoyQKYo0{>?i6!;G<=02G2e0K
zGwYW>4H^Wmah%R5V)$}=jbZj-qwHkv6r$w!Gdt@nEsNB)?F*GP#>XhyYksQkNl5$>
z;%0x3r9S*rHvP$^KI-+6ikDM*uoQ;Zd_0dsIRwOA-hG)mtQsfr7%A5MS^JXz&Y;v&
z;k(Q@FJ<MPu3hOF7gv%>{%kSCG9LXz=5(y5ZD?}#vSt=5gV_Vef?D==Kb8@>QQJK+
z+BK`%pEGmYa)_@Nj%!yo9?GI84K-%>v{fuMkkT3SO51prbOjs3%TljOWIPeAVTviO
zc1ySz#!nS6V2hqIC~tXv;2lg%B>%ajlPG={6dfMmm3QWtZQ`1=pK8<1yOT6gTm|p-
zMef;i(7%jfim_ZbKve_DrGvjOF_5zs-BIqIp8J1W7VvNm^%fJ<QtBT=$=3%bHT1jQ
z_7huVu#fkfcEn8Yt&<5*;j>@#j_4g3p7}7ohsOH1SEu_;8uHJkFz-^*`A)EC{@C?(
z-|>%`tp8-G?(XJg@8*Rt^1Ev9X|=wJE@*ECa3p(WZ3h4Z@L*ux)SCgUL&P<E7f)H4
z_3v9d+FjgV-T*o%g3cnFI;t>!-JE*5I9;~)l>Lga@yn5Q3H}e!l)>(IeXILD+Tov~
zNgRc}76ft`gCRgS$)>dPE816~*0;}KdThL$+}(Z(X8h&L69X_ikH8djZURdK6ZZEm
z{9Bl<Qx&W<K6K+O1af?p9>TQ=0m`yza2sRX7}i#Z6`y@BoInUwFrv*6FO9y3*qG<7
z5dGbENp;}U*A*tUNeT|rZy?Uw__=#~ZBpCUrei;yp`RlQnEeW};{O17zxl5qAO9s}
z=}ogCY4Iz_hvj|=dh<lfTmB04iC;qAEVRnSUqL?n^Ck0h9iFlJCFoxgdh>MXSpN$0
z&z;84Wj03q3i8qafy`Eyeg#?nr^rs(h`qvK1-1ewxp_A1ZN5R)vGMe>zqV-=er@`>
z1`BPf#Dsx+DA@&p?Av6~ownZ~|GmC9e9hNizM92We{r=*m-iJo@x20GZZn>w!~X_v
zV{L53o3|H`SOoG}g5bs8P4X!t{~d2bPg~)RJ4AR^0rBnw@ixP`UHx~sA9_bQ;y$}{
zARXg((p~ZTceo$=>mqX|^+VvvH6^!II#s`K;Qm(ghBCfU@Ycc3?N8;o4WvV|Z-pzo
z`3>9;bh}}RTjBUZQv?|pAdm#Xt#H2K{|=|HZrEGlf}+dT)PO2#z<@WaqBiE=;WnNr
zTj5^TR@m+VaL!<fY=(o~{&%>I2gz2r5Z>vVF54lH=I`b%F5%zdHl7|^;l!7(Me+eS
zx$odk-2ZpD4Zpq>?&m)2PUa8bzS&HDRrZELez>KoLVfZ~0!n@&uyx;ZAN>{C%hBG|
zUUuXAH%83#pS8Icf|n1wE_VkE6@2k*TCq0y5XgT+2wslfuC{JAPA)ReSM43P4sdJO
z@*kWQuoL-v`}E&X($?-<18e_uw>$B*`<5`<R)DR+tA7AaDSZXl6w2D#eRJUCpUw~n
vy$anwbMSW<Wo!4%huUwu=fl3~{+}c61?}xXVGxKo_&)~x+K?(8`0alI#UL?s

literal 0
HcmV?d00001


From 38fc5237520a2f20914a9de8bb14d5999009b3fb Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 30 Mar 2023 17:30:07 -0500
Subject: [PATCH 143/230] Added mm_algorithm pdf files (bp and pb).

Details:
- Added PDF versions of the PowerPoint files added in 17cd260.
---
 docs/diagrams/mmbp_algorithm_color.pdf | Bin 0 -> 172097 bytes
 docs/diagrams/mmpb_algorithm_color.pdf | Bin 0 -> 162478 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 docs/diagrams/mmbp_algorithm_color.pdf
 create mode 100644 docs/diagrams/mmpb_algorithm_color.pdf

diff --git a/docs/diagrams/mmbp_algorithm_color.pdf b/docs/diagrams/mmbp_algorithm_color.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..b4fb268df7980421aebd209290c19b9dcafb21ab
GIT binary patch
literal 172097
zcmbrl1yo#3_OBfXPH=a3cXxMpcXxLW8VCe;*Wm8%?(PuWU2~hfGxLAv&Yk<+wZ2{r
zUFV$IwRi1bN%!feN#un^X&7i(V4z6q0S_1`27G#aJ3~ttC~j^#We<B3Iw1pR18X}o
zI(Y*#6DNE|z-2``X#-m`auZt`6(tIMIwfaE7b9n7M-vl8J3D85CXPQ<WDFdwBy3IX
z0L^?9O^opY)l7H+4;UyDTjM`2J{tQsUw?ei3E8>WI^#3YNm&>>Y2h<{G^d0Ace{^M
z|6vCR(9r~7_g939CQf!Pjz%U<0RJNH&SFZ=2F@mcvX~MJKFgl~#gtg_SwBj=ymX>=
zw$6Zyq73+Kf3Ar#;{VkPQ6_wbkBlhNi8A9e{4oSnSC=!iG%@<Hkgx${g#mDh`A?-k
zhl~vPfahNg%Nsa5n>gA6YRWSJn)=fhI(bHVe8BUgRe44>e8BThk%<W(@cgUDhR^ip
zEWnw(qn(kGiL(|RKq)$96L;sooC^N+BJ|gb#7BbYWK4`L3<T}mwE(Tq<1?|c(K6z*
zFtF1y>)->DYU>P0j}!i1k%~Fmx!423`Xlk513D!-Wk&;BC;N{y8hOwODbWd=xLO#Q
zD2fRJ+Ez9&bOKcQOYFZ=?SxPNcU3+b{--LU7S@1%(}`LGvMy|5WM^zbCv9SD=4_77
z&c?y`VddoPXkuUk1LdB6rXvwU((KTIHuakzMuqVirg0B=2)It8$^fW_<tyB0955D|
zaq&tm#Yu=jy7#l_5M#&jJ*(mCZcb@A2HYnR5T-z@?`ym-{r7Jde)l(czi;GNuY&Nb
z`JRVk8u>ar5Yw6<3Gx>eZC6^-R$PzlOkW<)$7lU~?OrZ_9o1&NId1yioTZU9zmcNK
zx5|I5y@=QIqP#r0X!m$-WRqK!^LyX5p+X-Sl3%+&n+!IGR9LHnT%F7gUh{3`8BTMV
zux<U$M%VaY2ip@JH8!q(K(KGuV||V)Y)^6b{7d3uoAh3|>2r4}R+m^{!>1i2Rapco
z>hWOvRRK0y84#}@Jts7VM`?-Q`D!94$Z<l#LCA4Tz6BSH^+l8nK}q_({W?kt(p0Km
z_(UyEuAkrJ5qTE}3FhL!*6Jm`qy>={&z2)3sZf8~n2oqSahI=zc^5bmF3uqmrP~Y@
zfImN8fEhv@pA=vv5Tw`|G!`o(Eig?kPP|qfEkBkVjVYX;OvrlkJXOs*1Y4l^=<&r{
zyGpyHi}DRDF^qi}nF|C7!S|K}eVcEX#yaaa<{4&`rx5%jaq3BrZC0n45m_Yo=eM2Z
zwpYuSSG^7HH{_AR>jv7&aT)!2u1k2zn2y}WNxe?@_wy5htFP^_#{1x2_(sd+6sgeY
zkjIIj9qt{ShX<RTZR<~2d~K(bt4VlA7ku5Dcw3#7-|yek)Uu-uP?SsT^fxE2d(;|Q
zCSF?I=<c6(L(v{yVb#~oNOEeVm1>mhUR=Fwg$&cK_UyQPZD=0Sa()paWeWgXA#TNQ
zJ?LZFhOV@GIP1&C>U|O+1AYo==SYGuUt=`dN3M?e1_O^zQzxZMG8`=ONqGQ@jm%Yl
z<fkE&Ptf0#kfbnWaf9Qshl4>sIp2nQFi!_Q0ZM&cptvnnGACKi-wY&fC1`zroG+C_
zUPJR145{hT8z&R|?DY%Dw(_RprEepi`wnvR3B<<^P8?Uu=v0shXcfv4_*Ki`1{UX&
zZ+J3cnyUW~2;*7&7)g*ii-hlrflqBU51)WmcqwEY$%y$^VP%KJyYvhzhoTIj-U39z
zgSoiE<c^Agi3&h&QoMx7+kv?3N>O~Er;y1X`G3t|<)b$R!LV6XA$CRS9KasW;9xVm
z5&8oAxP!PM;#J`EM5dKI>9GfgmNuvan1s4}P~F|6s150?85dWZ=@M+G39YMnEVc=@
z;JlSkdK{xD-rH6S5rN$|wx$_1iU;$<<b~*tb^jK}W2SqLJk|VOO}{;%8z@7GRAr&Y
zxV?v59iH(8kvqyKSB)M>bb>iBLY++l7F9z5YQ3kZFT*?7gjg^x%Rw<>c`QS)v^2(l
z8ZByQPgdshCn~oyWX~;RC@`0WA1nBdC&<?`6wDC*GN8(a^d4KtUg36~t<F&Gjjwd+
zf~{#+DgoeLklD1Dm5$7mbQ~F2l}FzeH`CS9HVyo9hTWOEyYoNu=0UmM+<$dq7vD|}
zKr#;)Fp@%oG!GzMi;@D72F=QjMt1bvALIAm4hsJPPWlPW?t#WImVf}YI-*34vFiJ*
zn`B;oEPQ>KJOc_xM)IX@%H{cV26j_GU<t&;^ld%Nsw<{_?~ygjx&Lrh0O$NtNIjU(
z3bA0j;EOjhw*`Vvmu<%w6syTp;g_ZOOgJRs@t%>u1JZp;pKkjq_mT==ILFJ%Y&?i7
zp#vAoJV*x>*cIc$kc2=?wd$?T$$ZvS>5?%o+IxnUWjFBOoCe!^cnvX^H1R*!pFo^|
zfb{$JS;l_Z$@bQ9`?x=At6_w)qRJkGk8ss_o_<m`!~Xqzy})~NCbE36nAG_EB=xw+
z?FS>%*Q#L@T!;tN6zQozxELAaQRr`LFxcs%KsWWnyj_uQHmVmiJkRKEtLsuH8=>D1
zdO{z;GRvW3gvq#6ln2%i7~AjBS5<J6-=~L-{IY_#rwY%wM1Nw(=UO^W#5a%{Wu%!}
zEXly7E9Ub!)JiK&2lQiuGZiYsIQuL6OFm|oLH!}`*r3~#&O2CYqg}Sg3+nrC9GBe9
z%sG(gxIRT1<r}?){v53#(1Ie2P|28I!})ar_+>Au+iyvm-k)T4zhi1WJ#>ooK0fL(
z&9mHxlB^{>cUaUKuPaN{ZrVRW^m5zXpi60ClD+lvO+1sAwJNR;l_rsByRai#YHAFj
zM;lDPB;bnmge@8gxKU%hxpUsfEtMr`$n`JeHsGW8?)oovnS+uBu@^5fM|G$SkrF$w
za4k=J+;0aKw*4eSXX_z{PeOj((3)5LrlIMt5C~>;;8Rxz%cSBlSthN5dDuN=2p&EM
zCb*T#CUnmP%!97gW!<HTJ^SQyb~As{J7Ks=5Yl~zkM2eOi23be^5#otBFBjv66zN_
zW+4H)%;jyug)Hc3oeF|p?zwLTUC>^qMMsrF(yRJ`=h_hqs-AP*7eZ@uq;Wo!+Ed+k
z*fC|i7tnz{E*V^i_p7|b^08dsextB+-3BYpT26UQ1cRVBQt0e$DR$z9t3{DK3Aqc5
z)-R1+2l5k6Ngny+njGp|2QDufL?!nUTbu$%SsMge7H9-0;6&wfY!Jv0ruQ7e4v-|j
z0iiR6O3XBYCbtJD2sY0A>{{aZBJ~5Zh`u5SvYy$Bz8nROULv*X)2t>#iwe;=@L1+g
zc8<ot-^(+r4vdwO2j`H=`S-4e_1`GlaSlu3X&uMfeJ-$xJV~m8zBbt_`8PP0>K~v_
zXVm0c(PtQ8(@S8sewyX_`Vor-4w1(`fedkaZ(Vfk+R=AY$U>JWU{Z-)RDI9Esd8qj
zKL*qvu`V@u$ela04<Slh^@B>sky3ZnsiwEJe>VjcW!s>YnQh!{aF|bU1EE#)YK4&5
zq`fb+L)&h%Ys@eru4D!rpGr8Hat(@~8>e=>VHY|@eJ(s-oiy+ql>hLu-bCh6VDWtK
zlx(~9V{Mq(=Y=BbWPk@;dc85L_cl65%R*J3bC6yC_`}v_qn0bB(2g(EqP)q!<mHbM
zQu{|Fg__dSK*C*+am~x0&Su7Y$G{saf`1nEByohc#g#~5*%#tMU!Zb&Y}R<Rj4<!n
z*RulOKtd{BfPL0G*0`Uc4DWsmt$7`x440C<8|*uVhSBc~tXWlCMIsN*!~SdwTaJIW
zC#7xq4dmTEPoIm9)}dVjMZAf!K$hRc0v?K4g`K8Pg5LL6_NCI0YoJf2-EB!5XK>~I
zLeot%+FcHdtpUwkC~pu*ZTOCecvP`r52Uf9s&;<=n^<_OyaFF2t-W&JtVf~&&S{7c
z2qh(~Qc<N^b1(NOeEZs4DL21@aFMytrYcW1ZRZZq#=K~O0*X+qk{PQ@oK0~Ro=Q=0
zH>B#ago~@j*Tdn+JnaqaO^unl)1_DTQHCf3iOeW0(h(r&m{q;yFv;P`+Aq`lYw+<-
zSsTa;`)XeZ<{Tg&5$^M$)D+5Xv^%)Q*_4|P%fPiFZhe%qTezMAe_#uJEmWrV$;-Ag
zmD|Um^>)p2NH-bEwWJZG9C2hhJOFrYU=R?a*(*8Sr518_Gu%SDbx0&=v9tUN+Dnc>
zW2VA1%+v=5<7*Ly^yrYdO<D)PUR)5P6xPZa;ELnvrebR2cR=2`Q$ZhW?k34?=Yy^t
z84^G`FbRBB>|Y1$Yn_;$fW9bZNgNgItCw}-1rX3Qb_E=J*g~#=!`h*5hjKPq#1i4>
zLB@j;O$FmOjeOMZ-3o;!eTM{#F+vt-pwr0<H0qBFd))!qI$KVtYAFNMj-PYp#<hJS
z%cPA^2gf{+8FN?$CWAlak*45xJ4PNBlyM7H)T|5(ce{<RYKN=|EVPW*MtKb7URV9R
z7F)I?vv_|&S7+%$PjLt1e)ReB6KhLx72$I0fx+IZ!np1sT8TD7>LpkJrC~<psX$c|
z*9s`F34RXo>r`v2Mkib;0c?$|<2R~Z^-s{6cEck%jd?~SP3dGZMtg1U_L+8LKT`#^
zG7GkCMTTuBANCFy?yFr{moGr+xzWFOnPPS9S{r40h$EQ5xPts<{_TN%PldUZ2L-m;
zQTxT{unm0>tMg^t7D^62wXWETLZ5*SWRzE;ImX1G*i`#vysRgAKxC_qJl!v=5>jgZ
zY@J;@mC%$Sj1^Hh>|kf$_%4mjtH%HZ&EZ+(xILB(k=9;xm93uJiv8{VTOZ4_V5WFe
z7lzH2a60)UrfjGDnWo1Y)Py;7QLGuL*n<~)OClKpz$A@5fjg4@&CtbsACiiF-IC>E
zJpP(`o7gKFmsY=O3n~T;T}|`RJ*n`ueiAy|1GZ?8E&=L@a4>xJWP&HoZKm$rUabb0
zN3C+RKU_EAR=e+LjcD80BiTNLX=SivU!(Vk%`w-_6mTM7R;-G+a^(8w_ALQ*MZ#eD
zR@A*CrbnXv^EkH7C=#R^d)|7wWTr?uZxl)0budagZ8S-X55Xj2blDp2BFsp%uBYDp
zsZE@dnruuc{zx?Lc+=_O(*sh>g`r`MQEE$`*xnDYa_ER*Sd-)EcVM#%qt<)Tk0c!N
z2(bdZM&-613Z=E=b@pI{xQV2feiZuMSgxB;(iQyGh$ayy$a~7-{v?G!_aiBJ=#`?G
zRe?-U*Oq%4-PL!B!0vTFioD1fHQy`-7|M*VCE2wVxYkRWhnP&idlQ`ZZ4p%3x2Q2q
zFn}$xxHk*1fZ-jW2a#1i?WPqqIvYj8Z>_NEE(B^N^{>-B#7A+I??Ee+#(7^v272~o
zJ(i-iJZ(+(w^P>zLHl9ymfgS0kb(p(vgwbUe~;@zMdBsVC7_Zf6p}WKj2?tiQhmg7
zy9(;rKS&1#uhy=4!>EiHA}Qxg-b(ngfTvA=rljd)u87Q9pq{jNbY3x*!!Ke3eYRgT
z&zhiJX0t}sU}8fCOr?LA&t4|FO0ZoF4sOw=Ml%07U*;X%J*sc-d8|ZveSK0lAJchD
zw+Gjn-jg5Mn!a3-GlaUTj>CVurX_~7iH$FWayPSNYrqcea3?lfx~ra0dYa=lea1PM
z8G&D+`z9<XyNVf)E--#zCR&qPR^jORK-+n96nT>r5mJ<V%)1hT>-^>WQgTn;=}#5u
z_6hF|9M#9{Q7WTH#INeIkva)zJSIr_u9$0(aMy)T;>7kKDEVEC^d#=wMUKY_p~4Kc
zTSy)J^eO%nBa=XZ6e8uq)H=VZW8yM3hk|0s**8=0VY}T|c}K93@~2alp9W4d?Yq&+
zWq?CM2?Jp(g@M0HZ^!zHpH)a-)l-Lm(wjd$#Y*wFDx5MH&QM%3fQ;!~P0pRUo2)L*
zN178GISNEM-@SHG0tI4&R6ng7AJy!in_Cf7!%D2^2;Z+_?dme#NCRtZFBxMXd*r2-
z6$`j9!JO|-t9MpKnIuc8Ax+I{CSEj<ts-R@&)U*0y!u?-bhKN|%D_?<!YJqfkN+vr
z(ndFDVeU#9dU>4nz~a67lgAfh(DhixgFEL;lP_6f{1aOwDn*w?8UvReU-ohsc*OWd
z8Nen5)SABR$xTnzMr|t<#SlR_JSSvLd>oJ!C3Na!XYHd09awPKq2ihxU`5n&8-cC=
z1i4td^rnyHGBsbQ9uKg1<IR$3vQ>?<FJ5AdzDP79f)%YVg%aCXV;)m^w)ME58hrw{
zZ@DWXhHEm1B$^3GVvufpRW3Hc>pDuzLJuYNQ(=L&tS)3<Cw`Hq!WB^OJtAFJz)dGf
zyh8#_-*NtN8m_Cxj^y<90=;rvlARrs^XU}HDY`$=!7&@pE|&y8W8v{qktqS<Au!uE
z5i<0Zm9A_SIiTShGedmXD^^Ij$2ykA+1Bt7Si0ps<_wW7>~k{>-Z<vTa<fE_!_n9x
zrtAS30ytdEDM~aq$1I`H0a=LdHBkD5TpaffsY`ee@WdITv#;VI3U1fYYp?_PL|Lhs
zgKLl6*9ke;_F%tHMK?dl;hOq&SLBfG1M`FzPvve~dQz5{TE3R;T^+GsqnjB{RxfeN
z;<6Q7V-~SEG>43wuwM;>B&&@uPe<i4ZCgbf*JIDl7YjYFM`9Bql2%j-9}v0|ZdQp^
zm2rft;l_^2>IjEVh#*EsHoDR7IMX6Jz%I(Mo6(j5>ird~NF4X_9nnO|DCKmsc#fL-
zU~EPtz3et2<iVIp!m0M&6jjSRpr3>anZ0)y2R$Yxi+lfwvRTxEk9g~~bu^r3(C~zP
zIMd046*IX)(LlFRlH$@q)-2^=?|Q)jb+$BT=dFQ5tmV@6*L95(j2M~C6w(ek^dRz@
z&7_!*q-#a2y*4IK#kO+=f<s@Xs0O6i0L%bE^s#o{)F`+AY%|y{KbsqLLmxt&5L^F|
z_U;rF?vqyUkv7q=k-fxG7Xi>g;Zi?&g6A-}J?CN=`xYdpGCT1Wa*@?>k!QxP^l8dk
zQ5_!Q<zB1YDF>?7;IP<NSdQF9ui^c0`$`A~#9!obL<Y#wn;dN|3Zb{HXdJ@CW7?f?
zUe`^@lC$1|u$g%w*sL2&pfA})VLOUHI>Bb2z<VK9s;CW&t)~a*GrWv545gaEW<A8>
zMYq~ZK)=Y27a<^*M#@Xlv!2=i>~`)Dpl7})FF<HY<_rOZFE7dWY1RWwsyD3*TpHO9
zYL?~JsDM_#Wh<W;X3R?pIsM0Ec_p+Ar*q9cP;%Q2X5`77E9gdySx-AYIkx(bMbc`U
z^L+uQ5hF|$`FgLb)~(yI$yRnU8s}R`+_~NUtJY@BC^rLHNYZadXa<^DVf5@XEGS>X
z*u*S!a;GeC@?FA+B{9dz;%^k((Grx+k#YSkZ~_gY8SK7t-~_obNQ5eK$Lj@K@DNJ=
z5W7bBWeBn%LM#*C5{Qv@$q+3^f3SN&i(4Wu$@}St8=Rp7Ya+6tkc^pZzrqWFy`vqC
zx8sFqD0A2(M+MlGj`oj4Z^k6OQ-16E-L)*>t6!u`{VXQfXy8tf>=|Oh*teflwujio
zF?OA49S-~RvRKm3<1(pZx~P_+v61aJmYo}K<2{=JvR3M2Uk5^orvH4UTJ`A9LA}7I
z-f^}?=)W@d1BV@D3pvufuU17PN75o~EY_#jHn0s^FW?MOB<1%1Vcq<lNO)CAI}_p|
zLO<CanZbg*B4JOuP(`f&?Is;C$;ZxG(>$4VDkhaQQ4uc84ct_2I3(Rv<OUolm@Qz6
zoMOU$9GqhuI8UtWEGQkDXr?z)xHR4gryHGT+D{y;t-J7q=yJR1^&Mdo5C8bpgm7Ru
zamI;8aT^D>^)cbD*LZf*Cz@MfcAIq%*pa+=f1N`fcu%a`peK2zfZNCMpgBHzyGh?J
zFRkN&;6OHuXSQ0klP%iKNjN>E?^>Sot_CG`vC?k#!SJ8$&~B^u50g4mH$*e72CLqy
zMiT9M2^l31<NkFjNEKK}1PVL;q7T^AYf9T1$E*ST8=H&D{Lh)?5{3zDxYS6R-CzlE
zP@P>hA39X~2qwsq!@HKQl$6@yc;KV8*SrU;{!qmUUjH94ttJGHpFr&TnYMM|uUot*
zbE65oRy@$WNQAy9e(M}&vV*(hBA?hfSK}_QVb(jM=ZN7RWnXB6n83*j<lVBDLa;-!
zm_V9jjC#T$i=xJrNfpX6K^J=YQqF*S`NcES;7i7&&F*I%vnewS(J9Ug4<kH+H;P;C
zUUW{q?JXRz@hpK!rl_175?FK$UrPcqjDe{q%+RXcsqMdvE3Ke1E6}(DGR@G1$oy2$
z34K);%>b1=pcR65D!vl@X~~O756uINXOjM_ZEs~HcqKLuR74W#B~zE|u6DxgObHg>
zP_cC^{i`8n(Ut2bUuK#@m{2Q8?Y>^*hRQ@;tVjw<t+#Y3MQjVn(+*>#9csEl6kfbq
z5(|~62@@UmJI=DGQ8Xrd=)v7(sL-H@dy=sF#nhP+%m`RMKd#S<nI;)`uUDWH70gxl
zrU?`*>hV~U+3K~VY;&&gzX-bN@V4bsH?XJOI}{3&S{5<~<8B{G=JwAkDA%@R1YI_u
zyXO<O%)K7t;D;`5i2?_g^wT}vJ?=)JXcdPzFG9AS745wtuatB{1Ad9o@xmIB@aQzW
zb4F25Z1K3`l4Z3tbH*s~K8K3IH~4g7BwVD50dEwSdZZJudCSaP572h>O_ZqdY7#L9
zXp*-?aqWl3=S6y`TylV-5E>rcn^BGC7MkuuZMcz!1t#>D9}v4rSF5gJM#CN)61Ej`
zAq`A^+2QHp9a!3yaSQh^fu|>7;G`(zuA7@1Aq&hq0J|XRmW)ipCdUsMA?F;SaEg#z
z<SWW;(GQ{HQT^$<<3k6Tsb0O*1Yg*Yk5Xj%CVb<(z15D>(gmL7Rpe&aob|YToyTP^
zJ~y}ST73(fn%GZK@4Yjv@fL1IQf+**dRj%iAsv#0gb`9kd9jHN^6f=baoR?az5Y6W
ziXuR?4EP&Fvb|YgjhHLvz}E-YX&W^L6NeIp#0tNGr*(z7r?`Y)HtRtsw1>vPg@TD3
zq~du@7neyi<QwOmyY*m4f*bu4sGF}{Cuh>rQgT_5Ghk}SrEO>nh%Kz~D?M`NLgVMJ
z`7mdiUoaNuj3cVUK+H)f_L#5_Z0KX*2Dc7^0y>6^vd^XE2dl}&r>AD7?}8{ys->a|
zMze6%-%}z`$+C4lXwQs?At7EI2rxz8PbGfE8z#s~i|9m;nqQFMe!qeHhFE3!c$(FG
zY&-AZKl#(8@z_zj@M@^G$V@81w{L(&w`5W-*3!dZ_mMQ(=2{n`0*r=m%rPm7I|KfA
zaPRJwqDlytECW9qB6G!$Eup?8DGE<f96f`6uOs%m2)Gx)mn29xLUb!6po`GKRkwLr
zGb<zDSKh>O-##pc7{~4;@5?)Ph3oD}O_5(gZ;9iK2goJPru+Rk7dv!Z1Q~wYp6{n<
zdcF_qtuc5Fi*^A-V{n~5Z~iY1n0Xz}_)fl)Ww!*xz$$n&R>Fa-&Qt-g-swDrZT;1}
zD)>nD2r~{x;#1}>2+%=3--iI-07&%?_Nl!HdfJg7-J_?(_w_5q)(iom#I)*?2o&)L
z70U}`qs)l&$AYxrA4pQf{Z6PC1GaLfg9wP`5&P+%!`^&Mf2BER6E*vks)oMR`g@+w
z)|JpPyVGy3^GGb?xyE9ksg6l&yo_W$J@<0&5qEV@uMOQ(OHF9=PPOIy$n6xu6JN-T
zi!vJz7D{uiryP35Cv!xx4IAy%^qW!FOFd<Dd$Fm!y_!w;1$2pD*sr%Blz~LBvaIaV
zyhbrO6`t!E$bQNT5wm7JUu$nA;8wbo-^OFVGp~!r7b!15e@|PXkDNJ2hUDgK>gnrg
zUe=?KOvR{D;`pE+7(_%fId$_<JH+RE_B#xB047i&3MgI;9kR->oB~SLr-Xx2Tn`#f
zfrN*hmlmp*8FuNOdfFeEOBt(43#pGu9(0c{zVuC;yGKQX{Vmh`=HKO>cBd%LTSx;7
ztB}7T8Dcwg$b|Ed*l!EY*3-rU=7gG<tLDSNb|x1hY7uMo<y9HNSUfbQCI}rm#<4iG
zsJD^mgk1_vWV#}yYt<}lzK--o2M1`z()i`cNbP|A96BT?od!yqdwf%d-(7MhtU}tl
zVLQ(jM?~drw^@;+)x#<YO?Iy<OD{CZO0zn+#nE>U&}CWBP)>HuV|Q&(F*h}yT)|e&
z4XlvQ1gtQ>fAvQdz#<-O(bG*$$#F8-l!0?@`Ig(U>yl4bq{GE$)5xJjn7V5~x1~}d
z%UJQ{Vp!&s_`&|bHOme;s=pjhjLajiw`4P(CNm!w_G)=KxHYPy{!Bx}T?Lf)AXV9w
z7Nw8Befa)9xYi>t5b$GX@U^G)>uw0@eM^1cw@<e5csEh}mFxPE86hdKI1a_cCv9kf
zdah?nV?hj5QLRpBI$Q%HoO%cCTbr~!uqzth)`N5X8^xl~r4>NVA818|uSUulPeSIl
z9lOq+?&)R3;Ap|t@F`sHO*Wj~f3@)SN$q1_jPWqX32zoP)_#dbqCFlTOsa|ico{v3
z$b*J+V6>?ejdOoXxCJcu?@2hx{dqYNT058WR-ZE{Q0mNG;p*OC_4qB>b<G`?gE-or
zNMI4oye5Tc+HxL&Ed}<7G`B#T3b);J`Vm$<PbI;yW5v;T25R@XU`B`p&b+;t2JW92
zNxWCogn>ol`^tNyyR0@0u45Wq#CHN(<2H(r6Ds^BL}XIrL5aL&hK2$W8s?m{s35;}
z%ka{wF2$#wNu)Mloqi_QPyU)ik6u+Y$}?O~4kcANAl2q{Mu`rT#Ac6GIp%dm5%V}O
z{8?t|PGe1av~tsbl+v7w)a_j-9$n4A!w)NQAX5D-cym^zkO5k+c8Q0t+jYb+6>tlV
zncAM*bJ#G^Vu&!ySh|1&xz(6ql-ZQP3Ki_(^c6JH0$?y-1>d-9QHc>Mk;(9Rou0q4
zY0e<gVytJ5nt8l$f@ADD3gZ;!voG9MdN=W8azHr-dB?`@_-^XSWC6c`i{lx&)*y!U
z(~8ET_^mUFFW&t=PytMur;#9b3i6m(KHKKMBrLq6Ee_17us;>W57;7lMTt)#Ye$7#
z5&WX0F`rISdxu!ximf6HwdZcIg&fN#W{2wTd$A&s>eCu@_OlsEJLbAbhtF9hc-AXD
z+SOhA?Uul$WxgsW=w5%|p9mk*yRC;l)&+uz;vrcefK<dBdPOS838V{u34$WRwrwO@
zOz?GdDgDo9e)0}<Rboofk-<oDsK5p-L{46vAxZ^9R`iv~Pe}u8I4`8W<dqbRWE%^)
zPr9U}YjA;53S%Qv?{0GJ5$G+1oy1cipki|hwUJPp5Ma`s>#z4WXV6gN*>=ggqQ5(l
z98IlMwPiXl-(91k5Zu{BvGCvU{2q@-(QaPXTMK-!+8l)dqsj#e@$lH2`xr@Y)+h43
z?67*k?in;4=p>tbV^6>RPL4<RX%~jDVb71{dAw**tzn@g0bA@~zOce5J~%#lWA7Ie
z&=LehL{J2XGh0q15+)E4`;kc0dQ*BXbWGQ#A*RsNW41CWdz53eygSs?wap2-LqEl>
z^I@F{=^C`4C23wAdRI{ycH6lMBKCRCMlCOiK~V}il00$LgCaIrvLfeKYY|{7cfJ`g
z({1nY)&hl&o(F+sCFyok3!?39`tToi8sH+YZ&>d(o!fP9t<m>`p(w!fvbvF4QK>~<
zZ&fqan$T%qz0Y5z)lwdE!-og!XSDjD@X#Qeu0q8WtJ`mHt86f>;pkVuU4@WNB?VBp
z78-a{-{~?RWPOU?N!L(1k&z2_&(=0}iuy?(S}Vc(u_oyGyS5&JZt|OXc@fL$(EcC3
zZJ$t`q!6tO^x+yEW$RM*@OwD#=k<HN+{xSoDtq}!JOIK<LdX05&i|Vc)<0b|n9V0z
z$_JL_4f(Si?u96DbMe+T0YYF%cUa+&m36zK<iH>~ArPE{1SJk6tGu^QUyXauqD`!d
zbZk)XB!$nUMY7~I3Xp>O$qtG9E0^tipOZ=YR|aAF5({835`k?-SK_TmF2*g75_o6X
z>ccrz1OiGoItb+NsMS@Z`D>2#3di-u>V)2k+8dwo*j#x1ekc3Y#%SDf|Ni`5CTRPI
zGWsWE@=xIC1K{~5qVk7)`4<{QCn!TFYv*WVU`=OafY0!EuH^#~`u{W%68Lwc4|?a{
zj6M(}IwcoF=MSbz8Gu6l@h@v&V*<dA=mZU%O#ajm5EK#?5up+?u(mLCw4f8Q1+X&~
zwq|r{7PbPmP8R<<{-`W$;$-A#Vef3`h|dmi2Vj&e9G#qn%ncmznHT|#)<4P&j0^yC
zV`p;!`@{MNF#3D%J}mzu(En-q4;u9+%Kt4g0NP{b^x;PWfQMNa3D}xho8Z&a2{;*j
zKv--n^Z<J3uQClI;6TX0UfjgO%>1uQ0<LC%BEn|@#3p0l{?`Em8#B`%e`<f#WTvO5
zQ~!t6M;l7cCN`=7lRtF?|8Pz;Ec8tH9}e&t7@6rm+yYRikK6zNBojtJfB(=<fA8-H
z*YzL${cplB{!JK13j=HFzdHN>EDOV5l-GaJg@N&ZbomEb_CIXgAJhK`^#3T#-y{1!
z%0dJ1^bcV;=vn?I3_bhbg<)i;`<F1x%zqb#osISXCXBR+!N(v4;A{WTh?)7{G-6-_
zXhhEr;NMs{@YxvI|DERh1K9mXPyd@FO#hoC|L<D-1qS{XEf`t<M~gq`-`~d!!+(H^
z|33ZxboQ@F@}b2C_V$078~-i=!@tdq|0)3^D=W+Y4+#L)00{tNijkfkFirkLf{$h4
z-z4}4xCnqM{{W01ixq&#lsB-jb^b?L*}@v2`JV&8H2D{G$iVUeN`4TBG6v3$7Jpbo
z0AR|*%)pBOj~9A;W=2+87IpwC%1+C{3aF|MnA-LqW^@|=D*c_{{BOYJ2TlrD_5K@c
z$i(`g;y*V5_jD>{dS;esrca+i;e~fe501TlkrozVrjj@9QGoz|+CP8-@Q8q|;-3cp
z+%^8do&T1df5h{zG5Ux5{A&RH>BT=5t&a`~*;(5;D%l$tnfwn<NhfIG>?CjEC}d}2
zZ)f|lUb53k0Psvld%F){65rIo+6k~N{MUZ5u`)6Ld(X}?!ridv69#VNCOB$|og7W$
zq!Ag%y6B+|dcGI~8-qbebp@|WNJ9*W`UiX(KsVZf;nD}UVi09SZ3u27SREQlWPWJB
zSbsU!Ds3%oX(=O7KERD$duu7Ez}s|7Dy?`PU|VRsvuSEbAZ=*i>sxfT3SQ^jL~RT}
z&?Z~$EVucZQhugW^W4x7p0KjAAyKApg{81xeul%EueGY~2{`2;R9F8kcewn_XL)7e
zKKf#J;l2XrcH#d0{AJ-D7I$jl-WK<8;eO(Lmvv~p>8DYO5D)thpn=WTg-#z(T`AJU
zN}B!M<NBVWLlvp97C)`k*3Rm3JT9^xu`2jTWVNJDC_{=bzmdze+kWX`b)O9`utw-M
zg1Ld7ZG~kUHadqGI#>kkj#jn=|IAm@F_`LM!(ZJpQy=F~TlqZtMFHXV5oBtgD$FVg
z9-5by7UXwd;3(;s6tiJK+IO2TR;%iicHg{@AqXwV2oIMS=psBK=qh~_XwCPzJK^ns
zJ0LY;65cZlrJHPoO&T}&D9#w{isT7}kON_}rkcI)fCrzX1k!iIaFgzCHMgnk0pA@8
zmJ`m4CO5#8zAPY%A>=9M29bk=WL$E3P4}flMw0&W85;-x-3Ls)@RLRRJE&^8s<_Ij
z8_egkeJ2`?=nds|^v_`k0)rFRj92|T2L@xh5QLtI3zLvmqim#U`ge+V_pM+73-9a=
z>L@&^J|u?1l*mLpS%|ko9jV*wKD=Q{*wec;z6-(-NyETpN>tLx1(*zH{a&l0Xm<5{
zVd99MebFt>B=C7(lL0mpK6e;9rzm~Sw0j!Vg=!g!-b6JFs<D<`bwIHVFjIDS@9!qn
zstI^X(U8I--<N@AB_fzec~u+?X(8T_QM2aaX|!cnds0nW^riGlVl5-<Y@$rd9+IrA
z5?<B&5F|l_XB7D9g;`k|>F=7uC2(r7j|Hgn?l76#wfGl|>Mg$|>vljD#>pA9nH#GT
zO=0@c`%DaJmLN3sJ;mP7G$-dF5yeBmM4bD>4e?exq#F+O4>IW$5=`PABz>yuav1?t
zRV_;@rAg86L}Sld!K7f_@Dx`=kW3)?Y&HMHumiy`0FNRN2rNA)+nSusK2Ib2au@9g
zq!BYO$zn*kQT8o~SBEaeIU&35d_-eLN^zc}M?qSJ9ZDDh9|Ee?0m~>$rx1OfVLS)7
zkFibo#$^XoBT^;Jeni=l7@ba-GNrEJ1eR}PO6Ri#rd7QQ>~-FOdH>-lvLtet^wf4*
zhj;vL-VcY@r&+0^pc>q4AT&EF^&C}k7Q+Z{Nk1n_MPo&Wb<aYsbaIzm>2(z?s%xVW
zg;V8th_E~YE<sU;zJYBd9pKSR#POsJrl9%vn0Q7J-UMKZq5KUOj&HCeQ!V5i3c2<w
zakmSWkt&atUyWdM2ObFe=L+BoX$jw%HPCCmF8HDMj2q5NlZBMf_$Ux{yNl%Sv?ySf
zLpe9ee`Sf0iZ8X7A)$C;*cUfN7XaxFThEb`#{RPVwtF*8q>g%pQUjzKTUw(xTvQ@n
zu(tnB0-_mvSe0E@s;x|}<mRFgM{;zGO$D?a76*n+6o$YHh}wT=_vvjtZ~?pO1W2=2
zrW$v@uhfNeSXv~X_Xp+TJU?2fXby|KV8S7}QT%BIhQtJQz9+~a1R;c{_u#Jwhf|*W
z{tZYpFZMkIPgT{KGUl})$c3;Hce_~eBc~kZ2>%&hbllPM+f||<82M6R3UUf01>V5=
zxEumLQx40p)NJ{e-KOvdcq3ZZn5V2O<V47Qf9o(7ljG|W)ROW1*pov3`uzs++L_8<
zro{Y5q@UBtr-Q;;EhTa%j{2r45rF3&U9l|DTw`?I)%q*wZ<Xq1e&RuI2{y=Effp9X
zD#mgy!UUwYkOOI|WS8e^%c^h`DXicM%NNE8`$n+~MgcPrn!Dq)Uq*k+z*9&UYa6eg
z4b>7oaoZmPrpm`;5aE>(IS$B#^q<&AX<0*^q9p$|_}szBumn-yre-XWxM#3x<-B7~
z5SYEiH1uw52tKGK#z-(}c(sa2I~1*x{SZ@7nmn^9>X#FSoHEyJ3tQZvJ>Q^=`v!wp
zKxOE$l9z_?&5=`>;@nd?2U&tsPX+rMdX6yoG!n7mYD_wl!{b^Sww*V6Ft@B#6}tBo
z>8&k1bs5VPMO&kI{)7w4)DgD)k&p$Q!L6D^_UF9tk%JKxwzCv96zhD+5)Z=oI1&Ph
zISyi&aRfu>cM5ovrbP-~`m0d52~<$jwhH)Jt}sWQh3^)`io^LD*|!)*A-c-^49~ln
zCiKjszZ0vjtAkprFv4M6N%=9FPhQ3xDj!RKCG&#_^xUpwNw-C?JKra0PwlGv_rux(
zG=sWW9|eq?offg)7V;YH#Rm0kQ$=I5<~zz<-#l&ABSp08<a==@M2u7SFvN3D`5a=7
zCC<S|-9Cm8apzA~s{#>BbW&}2V0<0pAUER%-;!IrQ^S<C(fFsoizc4D$YZ>PpiF;4
zG=E|n96OLyf1)4hpDo1*|H3L6KcK@QB^WmpQ`i&E{uMqP8om9tBODE`30>ec%gNw3
z#iSua@3(5s5?qc!X&qh{!hLB>paQx0;T+$pEm6M~LPp5C9q}AL<|9AKl&56pW_UGT
zbcAe-a7U8}MKcL;#3zIfOnbmG0l%eNRqdje)WwJora_LT&Z6W_SE`3vGMP^?MIM@K
z)2&aSgs=>vIdAt8taz+`QGLG^R_SfLLgZr-sA#EnQIQT%1QB95m>V8jBnnvJ(k5q#
z>!B@tlI?=x3t-!r8li!z7{p|N5jkcm0ghl!1y0~a{*kM65Ev15pYUii|J=aWqwHeM
zs&l<n0YUPnheIa0U$LTDkBcLgNFYGKiGR12IqCOX3~jO_3DV=918&hEM3ZG!jYf$<
zH!Pl!;w=P|{5bcgAdWoO^9@R@WcGyJ3+dzxxyu!jmvuU=87lpm08<=`x?1Z>pjZtt
zarkR7hoF)_IS5VV@}|ORrcWXJp@!zo&v1Y0cB~CmCJ|ng*j&zQ{=w_S$Oh0Kgi1RT
zU?>V(jU653s<13lN@#onIAz?bMN%pE$sIGSL<o{4I@=Z1#I@KMgwsGJG8Yr?iH3_`
z`t2M8ip*iko$WEG+O<aFeeH-9n=0m7DCD1=#pd$G)r5B%>yn&(%0_d*EE2uT_KP1C
zdezkif6b6Ys9Uxe#V_b9+z6A`6K7O{VP||{(+J$UJ8_e?&R??Fv4DMb+U2qh=mJ&s
z#Peh`Xfdh0(WRMz0OWx@lrx)KT^x!0ETy>P+ei*rt=aH{L`caX?}m|C#-J%x)|*2`
z9XXtteDve%q4(YLLmMI};S$f?y+s5Wh3j4quhdmtU|BV1VT1Zg58?PH9&cN*I3bzU
zxbm=5<Bg~C3OfbZJF!fXP2mAh&1A(_eRQgxIb5vl*QL7zAjvYx40586S#M|@mbeu?
zD`(IBj@n-`*yFwytS59rvoH(zZmVRz<VnY{4ZcJ+Z^aR4?k>k59P6pP(oTf~d8D>_
zr(uuFn29u)!YHda`rAh~ZWEL*l|aj25?zysC9{zw2>AqKwjeMV1%ghe&q#gm3M0h;
zi#7Ep?a=wsc_yF5eI<kv@Cati1Nqg4C0=9_+j#!GXIAo1TH1RGuSAb0N97SzXr&0e
zEA=RY;mF((@NsedDg;TA;dr8982FI&uWY<R+^?SS*G-ouKQRgk6`#)#<ZYVGwRR%H
z4t#eSce#7c@a1&Y664KcdggdciKUYmgbbo<=s(BQA1UTBkB5yU@Jm=ggTkFD(nws3
zPEJFOPZ@Z_DZFmi>aDn6)?47Xf6w#@+vYVK#w=XrQS$VoQT)VGpkUtoiPER|vt{Vb
zik*&6+K(dRN&ZZk+e{L@vafs<gQxh?6u!Bf3!XnElE@9G)jAUQM#f9GfFvn3iT|V|
zEbsy-Lapdx5|roNKk%fwMwTQiP8?-QBY(P_LVh(gw~qzV`4sq7b+J(q)@3+z&C8H^
z&MsBJC=TV<vTcttZe&1ArFT;x49r4q9Vpxy%G2}{C40>mNTjQl3lyYI_x(M(hKx40
zFP_1;gMlv?RCAISLKk2cBD{C}ZsVT-JArcZEn6r`?1R>GzRsQpE0J8(SIT<I_9bii
z-@!bl3IoUn$DFL!>dL$`QQ{RKk|!0g1>$iapge-%_z;#4ZPq)s5lOr-J0=hDK~}?p
z<x_x#oSE@<6{C1E$6TucVXDv0#RqGciN?0zA%3`gPOyHwVN8nNXmXB^`!yD~wwAs*
z+GV}qL0#)98zV<j@j!B=K#ox|Hh(s62O{wI!%PwK@0Z;0EF#f0ICpsHbJL$a$ebvE
z9krf62z$h4G^`@b8Qk(@b>cdHLn-DfW4-*Q`Oem2Li%_vL%8P{BVeSLQQ^%3Vn<f7
zg_G}5;}QERetM4ibUP7?<ks;$G8XBUx*SN=Gl9N63$V@F!qFs^trZg@cGA$v@nK48
zCfLJn@}*9gI$TP9yY_w~`g~uF*c1PelG}td<z=GMxpPZm3G7A=p19jDeQ2_40@ZhY
zlMR`VSu`oxgGb9;)XS|uO*3R|R9Oh?hmk9&pjh||Y^qTd4gu@a86_ay_6-us&<3zE
zy~YV+4U3knbs8A^PtT#De2OEekV?l(Lq^k4+$fmTXT}a{fVbkv(f*IK2Sv7x4p@e@
zFF<fJwSg$oIZA*{>LF*!@rX`fmC!d*qhhX6<T`#wRa8h3`ba>}vf++$l`+!8TwcEx
z=LGLIxip>zx$s_nQOyzb1cp`k$;qm8>v5Jw+WuzFkB{w6Z&IdEAm(#LcZ<of+0@Yx
zq$ggOvVX*W?npr=h!6RzDrO>~$Ma5Nj4$fEU~;b+)heb@Fk__4%&pOVRjLcC7iELS
z7eD8i6jN#JJwd6O@L3W{Jmo4ls-X787AnQPjGu$6Kn}G5?k5*yeulezSuntWj)0w*
z*1o$gl!VQFPQbP-(^eK1&%Q--?A_VNFrF-akRHER)^8}bc9>$qAi51Qb(q-`O8EiG
z8u=^+=a(NZ6GA~F5l_o9;zd2uicUgIfemz?xao6k6k#U4_8RpPfSWT|&7T}04tmH#
zFa;C7nQJgSI*Z!aZ7x197`vQX(=RYtsA!hbuM<ZqcXQ3ECI4H#h_t|3H94l!1rs;Y
z*@9Btriy>K!;>m9_B%qy?<M1%O*Wjx0C2kobpjPnl(wV|aDId4TdhO4^-9FzyVW*s
zABWXy)Dg7?%aKK?V`@1i&hZKfrgGEzans|hhj9x{i-GvWKChoPqQ%$J3UPg1SE#?l
zx8Bbh2Y=tOk=pKb@f|tn7bG;&D|v=;PpQd1gOp+>O5g8oFja(bCfk(EX*8Uinq;Gq
z$fUm_;rw=FHlf$juRUaNp}>M>l1Ngn>(XZ#!J~@qJJ<7lW9a{eC9_sF{pJsDrzm}-
z0wk|~J`_$KFtho6WEQQF`$TFSHGyWJ!Qz<Nbkr<axTJO?ta7ZT2}F`}oL1sTO!5}H
zsc~&FKvjtXB1Thc>67aCm0~E7Wg*fM1;H49agg6n7D&Hu=NLfVJw6+S$g;>AH8+sA
z^pZ}saogPIchaBosL&uM+t@1?NF6Kd6yaT|PzR4p*NJt<A&SG@ju6Y}jOUKev0}~h
zaT;+NtRMh!qnb>X6G)09Vnc4;$>F{XlEu7*1^E_=`QLs0rkf8uw<oWK%+cvHEggX`
zm%#&Vl~Z>`ox~Z0pr;6tz{jbY1Jd$KHsD^Yn=^@DDf$<$*z7&2cOOWQhH@?XJ}9e|
zm$c4^C}4?@YL=7V5GSV*<8M0;4?n@pdL#gUlaBdBRi2Z;MwxxfyJBbLQM!ZLUc{>P
z<yn5&&^j`Y^44RzXbZj}k3*#bJ`Sl!Dl1}2N&pKx6Bf?gR1#kzRV*(5=og=1Ez^pr
z!pL(}3dD~Hj1hl&bpelcq(O*HINY?ycO#_Vm&MS}DYy*kS(hJU2bS$%k760A-^RS5
zw5HxEhusBNLa~T4IJY<BdCP^)#6}{O-P!R-(m@~%Yf=(2SaueN*<rBL+b%G_q)bEl
zOWk;GyvRWQsVJc!6R<&#K1RXrW4qX3=R?`m1@BHEAaz~w3+&kojv1RPh>Jr%YRMv2
zyR1Fpwy3mZGY*yZ&@o_3vQMa~gXfw_QfgFjq%9&$r!tYJ;wj*x1Gm@O%x&ehfgS7+
zNylMIQ}0+ub;3~#uhi41az^mfc@SGKxEbFcum*nH_M!*+zDJu#PHJX@@w7+idMT01
zIaA%X_E{Iytbfm!y`@0!aia`&|GG7~4^1Yhk*bmwZ^>M+oPHt5q&C&kEWkW9OENX!
zrfs+LgH)|{CRv=qONvl>B-}V(Gy#<a;q|`PXwx@r*Ps`fztfK9K`fj*mgnd#V|hhY
zM6Ds6(<UnpS<tf&?TdA-i4DT|(=&}(46}(~S&aN3v&T|26U<CtgAio!Qw*^TGikA(
ze(g{)arMcv(XcGeNa(=(eNx`0=M*Sl46tX%MEBZNHeL(g2QTNM_PnYplKm3QfX*rY
z*}n|I=u4Lyo~Tq<AC%jSJUtc-8)1HKn7T|MFN{>XoDH(GK`frC{o6gw#2~ID4iR#b
z4G;S*4Lth&?oI5da}#7L^xM8*W}WUBYCXIribO_>T&y4j)2d+VwlKs^dj)eIkl-(I
zNhRhVMh1)!luVpUz0A9&N-(P-zh<SGXw4%n2rVMU6=h#(p>MQM_x0A@Zs{(VeivjN
zAq@Rkn381BlE^SI=j~z6lC?O`iahX=c9L5L5;VmIFiWwjJ=K}m&}meK6l8*<2IH@X
z*_7i5h0V*;@rZ|C4O8<a1)tC!e1!q6MVfEGqY55!;*7_FnV-YLIq$?D5vDqAJ5*oy
zw!7F=kl0ilj92PxXjfV+>dUll*i|(9I{Bd1jhG2goj9$qFc;})@O`cM6`k{C`f9xV
zjkALyqmZMc*<~De6ML^297}Du8QcQ0PIH0>ea}OHz2E0?5<0ijDIpV0`6#`8T6A)G
zm7e+F)W-Zay&X0$NOxixtL4VxF8sXLo0sN+RLF(*Y-`eGakHqxr{YCNCjI2x(Zv0)
z5-n3Z0r#R$!SqEA;8zh|<Zfh}Le1r&7ddN*-f;WpnVq3EMopceirNq=zoXT=li1W=
z-1-R9%CB&$aiy|V9^S@p%bMO^-V3qNe)eD3Rm4Bqy4c&KRCc?SaY5AXxs~0yn5I74
zx+pF3T%w`Mjy1fyz`44%o_CMJ=saxg+b;4fEw8!2$v(AOGyARwVV2a!TEz)3`e=S$
zBn~&N?e&(4+o(jVt6aMa%+#Y=gkE9mTZGn>s5EKurK$AQ9W6_eI!o}FS7h8Na|Rmw
ztXF$-ic{>1A<Z*YW>%{*-mu3)Pqt7QkfQzBDR+d0{$$yjq&5NYYaBZ4c3+<r*dFQ@
zZ)c(hQHmQ)pR+?)G?yg6`*uAp&K6f^PXSgRi-*Dr9Gy<oh>4vtCqTRd7$S=<NiUgS
zPFv2kyItEWUY?#8U9#+MTF$Aa<15z+*1b9pdTz>gGNF;$H=S};8f7`Be$q|ub@=LB
zggWZWY{H3nSr@(JFs~=i<zZ|VGqfL=cDF}F&$X*f59wd15XWI3o#~3?&0na@otF1W
zcFgNv5V;o4zl3ER4f^ywyOG7??rH4c`qiF1JW{#DbPj}QsAQfzY+7Hwr+tI2MU+8q
zJb4(<!2<kIkcXzW+ucPIGy(n@wAi_*q$c<;v=+S{mUa}q3UBl5i>cl4?5pj(^!4AB
z6U&zZF;86)hKO1?p9apgFwBBb4cT%Xq6;(&U?Y1+>x0B@+`y=uwUt|{Jxe*~#xG~6
zlH^21EdA&p3BL%1jw$yH+Vy0n0_rEv#1=Z%Vv}oXI>bzbIFuf6oNgG(4|84FfFuLl
z#7L#^(2qbd)~i>2m88|>*+Xqh3~J)pKCB^@QnbtzT8lFN0=SnUk)@-Fm@hR~9dxzi
z$shF0n#2!Ej^h490yd!=_&8TQe<&Yl7F(47x=ADe6u+3jl;fD1B90q58X{L)B5MN1
z2*ZR`XQl(we{dm>0?@lD>ZzkXN22T%7ljkTAC`qfMT9CbcVWZM{x5}suU~;fmu#H~
zL*Q@Wx62~(I3oS6)XpkI$a6-H05@o8sTQi+XMch?OOVK8z@tzo`{&xkN+|5LfG@kV
zDDAehW(k&FWGgBfyU}kl|GDQoZFNQ3D$2n}8I2)P(>hWI_&gR*r4-M2k73;MQ^u*E
zY$}^D&WS}Sy8}+eMS*xmu<-lw5@?YsD)rVmbMge2+&jb^F?tq!9sJSyAx}N-LoF`c
zK5=*|IMSV56GpiFL<o7obxA61TCN7#p`!B60Mvd1D8GviF=2vqX$t0r$hSUqr5eQ6
zo+&vI5F-Xs>{t?!xRzg6<dde#Dah3cR9RpOs-P`nC<-9TS`}7W#)`~lq$z_=gfzw@
zitBD;DxF@Wz)cq#LZYX*H+;Jk-DVIgP=ox&N@raN8j8dgYYE_PvukRmFsgY!m7kcS
zYY@jF$P2>Ji*2+Bv&Hl5b;6w2WYKG+iGIqLeGjpIkZoFG+Ae8kzYqDjHYpi}Z_2@G
zRUa4JN~mBNe+Wp-OpLO~GF<ANKP@W7?G7QSNb>$MGbQ2O02U()i(;6{5STsJ*Da(#
zguY%#v1(O1i45|ytqEzbrb=lQje5Kzyb(_RB%{%J8)#8K&TkRoBtJ}DQVmgRFe_m3
zXLGIs^r0AL@@iER{$wF%l4M+bK%c7S^uw$do6?6IwtfnH(PAz<o~gtL)056JAENAg
z!3K~v^Cq(~OgUO0gYa;SWK7njvH!cE+GQ0W?w0P1V=AkI5uog!l1w1S&r?$9Zy^=X
zv67@DFC$DeT~IZcwg>Rug7hKi4(W)|65T?6ovv!n&L20|6hH6ln=gE7$$gfWj-b_R
zE}SJ7wv@{2a%Y(xUS9GAE&I-cc|D&!3vTj8Ueu4C%Q8QVEv8;J8aJZDq0nLxbDJO(
zS%a2sdy7`6&{Q#zKj9uyGE($|S|vs$_Rii#+Nl_P5HNW19}vPelsWh;qYcXmCC)0y
z^~S#}$ao!V_jr9L_xgm1++weTP%@ft-BvdUoS|8WS*BFRV(?uAVpG;(rE+i-N8qct
zp)>X*s{I~dYAK%|k1R?CwW2C4fR#e&IOtH#47m#xR)|Vg_XMZ>96|+>Rv4-s{9oL?
z1$Z3EvMwlQW@ct)u$Y;dnVHFACW|a4i!Ekm$zqF{C0i_uE!vjn%zZO+XU@C#?%VI%
zT~&8iW@J`oWL8wRI^vJW2Xg9OAd<u32$KZBWb-I}ruonq^f}^~CXpl=XLsUwZeB9s
z$iiJi#+V!os>jWj+?;zZSOtbq_{4Puqk=?yPvTOoT4FP}D5=sbJj7kexB_-AQkA6)
z%ORC|9#^}(KwV*>58MDch<t>s=6-KEVT=3jKtG0hn6Y{yd?AX?s}a%zALYZ+(ev{d
zow8kr)~7=jO_4f~#UEm_hr&67J1Uhz#NR%Y5*18Jaf>gkjLFHYKwvqz35JgrtpMin
zN<UL!bb+MuQ=!zzBvltm30J|enV%>VXyTPg-;AaWqn0xlfT>rnhRZiyguXUZ5sjuv
z9h^ou_$jIP!mbJ!xG|A<3L$A+W0hOFlJW8PV5h+`o|^}@!H-Gcy2Fbksp*-BFOlGy
zbk#RtE=cIZW*>MuUGSVgEO|eNL{VFlEiuZ>o6MUU4fbh-9~6r$faW5hMOgB5U~vm_
zxM@^338k##+?UeidOxaZvTGq~T%BTEWx)g~prS=w;}VM-9J0B3G&LcsZG3!Kf>tkC
zNTBO)^hN%cnGU1|)t6YPP&5ci#SUusXqQSygty1GDccD!N5E^LNKQ&4(!CaJSg5ur
zBu!00*KO==tZU};I0<5axsWlsF9}(n|6o`T<z<5_W};(o8@HZ{+F}J&$p&T+0Mu4>
zx(+ZvPIaO?jPaxb?rZSc%EqPN8bmv(&c@2dGAO{XIpMdtve6q1$Zq-pN*nF$?am)5
znw#$SYi#XYzFFD$9KwHJ;7I+Vv&w6WZfi$M)7aRVtiHLxQ9HhH%b*3TvkJFJd~2ga
z+0xYP=Dzeoh#Awq_4jt@{{&z91sVDeP!YgJ&-9Pz2q{y5o!QFOi(c|y_zzPf07?dc
zlyM;fSmxi*L#hsH_EvA9dH}GQe+5SR103Z)P*6HV05}Jrz8=vZO8*FV`d?V4Utoye
z*%Ofxee3QGFu?L#*r8wC1IVfl5>ldaMvjcX^T;TwLBtFIDlh^HRlHmPu%EX`Qbf%E
zLOJ~z`o#QOq$Ujj*T2gD1l;%|x=F#=)XW(GS)ls88$j$TWityafVtm`O32i~*z7Hs
zm7AlZt(o1MP5w{8jehm=Ck((F_x}jT`Tqt4<QJgiccj#>oBGGe`xT<<PecBngF*lz
z!_3V2pFts<zkxykCjc~slZ}A|019COgso!#U!V|X&fh^HfYiSMh5RAx)_)rm!nXKV
zPzW~7zd#|eFjjy2F#Zbh^{3hSx1bOJ@4tgW0LlL+Kq0K$9DjkkoMu~TplP9mJ-)ui
za@SvvyT*4~wl})3T33{A!PwK2M35A;)RguN#GZ<;k<$ig=Z)}P3bE8Dcu$U@IwhA^
zI&^(+B5gQ5@xi@e<#JqoNpvs00_bw$Sq?eanwrZ@SIbq``W_@QTwsf2ouh4Rj*k}>
zF~Cc<0q(Wwgy@s}i8Klnx1mS0b7@nZ(UW@;$d1M9u-B!_CpHx->&WF8Wx=;zs7J7i
z;>Fh?6L>8m>oHz)S+{j+yV6?d<<wKLjG3y(xn&$UG75zf&@R#1;C)A2Cm}Y5Cc2^J
zBIQhrEeneri-L`%jdFe?Vfop4=94G~{ZOsYbi-=(IAt`-<V_Ga$kxF}v*FI)p|?#z
z*YYtYXK-;2M8I3&HKQtu%aO@*5;q>*O5^FHCa=f+YEtS$S}FIG>SdeZPA1(|g&@>R
zKYNuUuUMBNR@e}3G?vb@^9Z5=b7n2V9kT2t1cK};q3%~<XuH`0wI>Ftl|z*M@O{NN
zN?tgUZav|?K$6f2w!5+=cAzqK{vj0O#NF`O3%uq8I1vCvSG<uV?cpTcY^36%2c0af
z=fmZeNou_8jrYqlUZZVT>X@!g(nQr5rBRaeVhHfXJ?dXw{H$J+l)c@se~}R4zDMV^
zz~quNGrDq*W9*x-gdsc@$?nvs3a{MT0(O&g1V3L<>G&#LDal)Ko5jB;95XmXN|9Ap
z%wR58VMyKpUXw#Z>_%qrnW34AYl0=dG?aCpJ5*L4Lqjq~3~lLLT)3rY0jf8=3EKAl
zrexEjxO^Hz!+^1C*BjXedhBdwc8fTU!|8zUE>Jg$xx#ZKWVHsoYI!=!q!1G8<SV2^
zN&?Lgcxkg@b?HHU-?NM^M0*}b7U`Y`NJh4;i7dw?6lD$;&T3Ui6=xN+ooi*_ev9%(
z1Puv#4%Ef|6}pDYrzc|Y$%YixeXTycWIFj<W{v{M%=I2<4d+L8llH`Vv*C{eD=o^3
z_+ML723t9Gr#3oD(eS>`e&F^ajB0?bm-^9ehPkVY7UpQ_9^53FjLHFHyL~4w7oCqj
zj8lWQD9KKw7s+RlpPG+RnqwaNXyB-|f_FZ$EMiV6vRlu=QBX5s&qZMW!)~$(Y317z
z9-Wd}5sb@A_2TE8^dI49o}zMDE+adp&SE&b)?u(AkbUl}lZO0D@4wkvih&c*S)`QF
zj2`Bv3iIoKtU$xtae=!U$+$Wpsz{cLVeLC#91CHEMGQQ3PdT;jTgc!Dao@&otU{|a
zPf0-Olu)eB5#qP4l&C%s7F!v}=&}H2;nsqxkBz009CYjZ9#t4%>tU&fVNvsGjYNmN
zDnxxX^?mUOb~*C*lm4`NL}UK!NVB_sD}+YpR3=tas{kAmlie6cW9aOKpFEOeUy9io
z>Fhka?0OeIVu&2~8tYP5h^i%$3=JyIMhB?DAY5{@5;Ptxhj0sH^vTGhaVgC~UGeD>
zko`o~<D|J*5jt3nIrSM!4SkoN?N=BVYe-!YMg!XM3UUNJ*D1X{<A^iEk@ek2d}L-6
z&jce>!zq-hnrn@y)<LXD`{GNeSzJbMd2471k{oVtm;YI>c=;V>%HggLZ2b9<w<~4p
zI${a6ooCB_Wr7kW^U%zYS6CIft2V8S=eef~{oa@2Mo_p|v!>~hqUcUXTaf}yaD+-l
zH>N|v4SO0c;Ao{;k32r!kv-GM+6ZAaUD9#dxPp+o$_QKS3+)kc@SNVHwSzkHS!2u5
zS`0d~dE(BSC*XP2;tQ%QbH!mkT2nXY2<DZRydAcFBkkzmkVHV78~sDJM<h;qGHu2E
z9bZ@x`Gb%iccJC4hI#qq=%$7#hj4Rycyny?aw;w1ag$gp<Rf#y+(Q?jJ@G;zm~=)q
z&{^TzQpG6pXhA9i_Lr$V+C20{U*;S8cgk<*YuW46nur_rG(lu8No!bDB0fk7WOiMi
z_!n;}asf|?PP)Kgb4N4C#-uP+ENO#4_p4AfaDQ+V5Ie~qPASZkg)I&3D<xu0zoF+t
zW?aQuA**Hms0{3R{?oQJQhHk{_#<gHg_a3jh}hs{zZ1cNkT97o+IC5V9z+Xu+tSPz
z?TAxZbEab*Q`p;G260>R27=0b9%diTx!Lwb;q{<4>m!_VL`o>@12`{vVwadmM<npl
zCC&Hdk!jMJ<hsfs-<{eK9E06j3(03etcdZ^@pPQukEn<C<dgRbUx47rn&jbly|_I}
ztMo!AatMh}7DQv25I?<F&GO_zildDqGV)Z`N4Br+-~_b*6=84t-e9<jU5W1s;Ua%w
zt5je99%@0c^X4#FF-=4+qk!La&N(hY43li{L(Pq2)`D>+Yt)5`D07d0h^1K=S)1uP
zU7M3O5`DTm1?y7q05q3V#&jJFQe`oYq`tQ-JG9%(hZDpUXM_YbOtjs1_X{o}OYS@w
z;YxjFBwv6LAew;fSwS<u4Vh^#x8dor(tFZ7^4bhz{0O5#su4z)5@>sn01U$sCtJbm
zfNT|O({~}b#>&MVB~E3)`Ci77x#twqNlHbgFd9@u#MKLh6u<|DdCdUy*TDq(@P%MA
zZ%zijy`Rt9ZYmeW1j>`xd(-zsLvb~Y0}T!?MNqtAb0Bo2ArM?vj&_;aP793-jWdH*
z#PcAN&1}KgP3bzMU~0gk%<sw5yZO`ecpPBvdWgm76ABqBsaX2Bs*Eya(0WxwakdZv
z(*a@6oo^;*CzJV6@6%13y%a1@uoC0$ns-w&inbwoSx#;%gxhE3LyN7D5o}^eJymFv
zl4L7-4KM}dSYUZfr?suG?9QMY1acFTZ8zU2wDYjX?-y!Prk#z^Pg*l76uafad3gN>
z(R@cA6YGV$kckVaD#;sWw;NKa3NW^~ga@!&HnwOybux%2O}<y!R2>&)S_%1RMbeWo
zy%+NaKFD|sP|oHE_1T(6-^L@>fDV_36ns|z#aV(5XKQY#J75)Ta~_W7U}i2k5h!CQ
z$b?ofrp@#vw22^lYq{^7m;Cn2@*~{sb>~>g7w~YfAAa)A@|yHU=***0R1iyXNKP5L
zKSTrs*>>G3pxXpe$KkwG>gyTc;flZY3C~k;{#>Xno|ZOdBwdoBJQvn`BBx&=<bpM2
z5s#=Pry~e^Ws^)tQ-ZCDLvn}UfS-H-K7?!yTWIJijcR6M)nt`a$&|~~w$}SNP>JCw
zFS*>u*iNBdy790ZGUr71fP#R0n7$Xo1aEt>Sw^xoyhYuI%Vp<S|GBq}@W(^GRyW7g
zYKG@*sN*198`y!?A_AO1Tbc7bbRWi7&+lZlW<{8Bn_U+!>cbAG)?Dz(bj%cemeBVH
zks;B*smLVbJ#Dh~7b0z4Og|^m9NIA)oD=g+!<deTf`Y0tMbjDsJLv%T2|k~^M)Jp9
zuD#IntqI+Vh{Se7G$qxLESDw-uU<O2BC=KBM8Xg-!$Z&^qzkk$;!k#S$;%%Dt12Dn
ztVN)~+8W}zCE%_gwV|@g#u-P5Du?-T89$9++)*s#>yW}6Q21=o@@;r*40(lBpQpvU
zJJp8?92OQmci=TVZg_&RNAG`Xwv239gjt698cG&Ky9h@xJS^10lTG1g^4)ch8p!pg
z)v&&Magv7a2VL%GktqV66o)#5<P;pRv2@~_Jx25d*DTqrY@fe4x^dA{41w)v>{3Ur
z&cGg57M?%~XC~B}1Nn)OjkIKJ47zmgkQ4fIa_)}Ph!XET>J}Rb9$gSY*9j;dyzT|4
z-XZgqh+KE^*-%=5nNy;Ho&!5adRLS2LLam~XVOE4P^#4o93N92VEM{Mp|`Y8IrZS2
zdb%6RcwOU)1%C}>G_oM0!=@0E3>@rn4<QoVB4NlNT=P`#Yqw>np^T0W+s=<@?$Bp7
z5y`J!w&8MTPn>|Wh9JONYGZCAn%@F5moco)eYv|)(Y!#sM{(~XridYE0*-@Vc<BAE
zv50v9qtMu{c46PptCRcX#|_T*Jh)g@^`#^SvFF5ip1I<&vIHr8pf-1N4@Rj!uM^6V
zf##-sPg)Hsv`JXJTpdZxa9L0x;nNcm;}14hPCeZZ7oYk?%s5ur^#jN$yXO@9eIzb^
zNXrfCuz-qTg2AE$=H&?2bU$6KL<Z=Fz7Ob03t}?Gq9X>Ld<NQF`>ruupX!+lv+<x`
zzf5#tuA()&wDpJ+#kmE)!+&}K!HSpp{_<TRtP^?D@GC(S^eRw&5dX%kP`789#Blx6
zx;Gr+vMmQ#hc|v5$MqTJ`6U`78%qMcY0A>eQ!}liC>I!)Bv-$WFOg^3Ny;d~Iq|^i
zs8Q+JskYAMrF-yl&ZYZUR1gF0&ju=gXD-Y^1&D5{vt#n?hkh3+3xBFK*EZNB&`zRd
zJ$b^d{p;)%!eI4wL5P%3b@S`b%%xr&$1fjRn+46>10o2&5$?37CBj{rk*!DP-QJLt
z?nIZ)o-K5SR97=@AL|G5Ok?K7ZSBu4%gn})jFn_QO*dn{3i4VrzDzL+!0lGiqDt&n
z(ViZLzjR*Gbm?43c3xVS_^fULZE$Gn8N_U6>X%4dd|I>)#vN|9z1j~&?lkkUu#k1&
zGF(|+n|9z*gulRAXAoKb3NIDr)gD6tOpun@IUajv&Nu-<kg0=?-6YbfrbOH$u`RCi
zy=ol7v#{!eqeo^-*9HXxG2a0vx9&xABHkA{gjt~GZn4ATGmLB>lC>9z`f9{ocyk+D
zNVni+x*mz|w<As2@F%w;*xNrRI_njbzuBL4YV@V(%<fV$;Q!Q_<*Yq@t@;id5<dq?
zJ15vyU|@^dLA`zXs+mwNa_{*m>tgZpMG@C1lhOZil7q4`ctA6NOyyM~R1cZp5fKfS
zVl~S#f7XpKoD-^VJ)9figX5i2Q9tRLU&2`+lm{Uwzh+-CmItA6*Jc&oZWX<WiF{qK
zEw@b$a%1s&VFYE$n021?#8}X}YK`eiFxHB1#UNKp0o1ai{#_RJJo8fQjg+!=*=OAH
zy(N$(P{I|{v4BaEy`_6Ni<Ejfb~V#88HEUHwMOmo$y)v%0?np2b|G^;%Uzc&dTZPh
zg2tvYM5CfGn!$OlTcqGd9e5TVt^V;Wl{w+ephcV6i}6J9a<s)-pbDMK$P>lgdL!}4
zBY25S>q2GkzE3{3Og)${lQ@eoi~V2Jn~&-Xm{<?NZZJrkr>(^-eiXtf=W7ppkMyoo
zZF6hSRU#BkfP7ceNi89=3*k{WOs=9l#BUmoOIq0g{PcDA{(<LE{;&Wek=vvsxTs{0
zKc8=B5_bK%-$?lU_d0_=`Von{*_*gpIoSX5pkZYcb+7|C*StmUW(Md!gaIMY--4er
zF|Y!zVE`!p7&riKQ@=SonE_OAZ}tAy8&L9lbAXFjemVIOvHbFVBVu{e8Y&a9{8H)t
z+4JgG#ou)pzc=tNkDfpCBf|W~3l}>RH$bBXaNXg0YaDQ9V&w+(EyDCx%E`gR`Buus
z%*_Gdje(nmo9%5Z@_+V>{KeLPqtkzC_P0*i**UlXBV*uX0x<Q~IX4r_-}L!M&FOz(
zp#RY7e{Y=sp)m!7S^qEe`R~X1PgDQb$^D}u_K&_>EQ~6aznt^{%>1qP>0ibFy89vy
z4$g1#2XLxn07R{4VPa)qWBWbF%=V_J1I#kOdluj$rSmJY|E~`K?e~!qvAhX8Ah`aY
zREmGp$NpaPfAat^y$S!@oBK^E_{#zIfBOLZQV9bziN7SD^;a-{fUD|T?k^9(AH6~U
z#1G3~9>})`fcfnJ@bD%Ezo-8)|G&EYFAMkYjr=B8Z;#L0Q^m;&_ye3-+5T3Re+}oq
z+y!9W_=jfy-6;NXo%)|n%OCv<{||^IGZPm;B<KNK8?bW$G|Rto%&e@xITQYSseF^b
zzepwP+d}!bGy2cV@|zs|MHc@c{BPS<{I=|}{>`%ctLSh0`=3tle`~4vpGn>S;e~~n
ziJjriA()$ilLc@C-!dGmfK`R{ZC?M~vhwR@{q+X^aamEda<w(1QgrYzb5?W!xT_K=
zI-9uw>^VknU$y`XkC2<IrGqn-xRsf$DIhBX*oNY*L=@n7NhQj|!o<wY%*@0J_+)3}
zq-WxwWMZNORFiWs{om98?9*ZDW@6?{_3{cN2V@0g0^|(j0OSIAn*+H5eFh=|QUs(u
zfXo2&x0Dr-{lDleKukc)Kuo_g17Pld-ozhQTed%b>%FbAj0(oqDz0{bZ)7$WfT#Ui
zw177sT+V;vYRb;U^;cI@J-zsSab!P<9l?9#oFTROTohqpUnn$Wv1Lg<B7!heNp)NI
z0wa>%?h^S&PY={HyQvlkeND2Rji&atF@ciu)kAsS<vNBkI;~Vy%*2MGCdn!pea!i$
zqAtr*E<35zN*V3hr04RiG+X}^&KqT`m)GKSgRiMyT}R!@4m56y-34nICSDC(zZb|P
z7N_lLFxHjl%KD%9H{}HgW<M`IcOZ%y%@3PZWS;t09xIzA?lpB%m8BFPkt|E-t?N5o
z&99~%-Pkv`!JfsaHlM3~UGsNqU+=Ws=<=aF%lUjniYEATI+LEpU#6q<>+09KO=!;q
z4EE;Da>Df%kE+q(&5wl7f*Yac-69jG=*^p*eoiHAt5rLP>-IPD>+E);Q|;?#{twSH
zXY$u=OV<8=I<o5BpC6spvfP;YV6~p9OEYg)o6;Yu66c><@0SK2>*pMu^x-^2o<_0k
z!z=AE??gHn2{{?u$0l?lTK2;p>FirG4Nl-IJ@}xnBu@1aB_norPC~<?$CB7~ciPAZ
z3o!Ah!XBypq+SFA9OKG6cA$A`SO^k%+@snlk5Vfh!3xXzgxtp95q_@A!XuRHoH<V0
zuoA?T20MlZH}T+X6ZL#@oF2|bNB9W(b&DtgGB1jo0~L>L3k~6;2yefTOE5UXM-QcS
zz01_rK7@n~!{M1EVD#ZkOv7P6TH2GM6YwRcA~b5yXavVKDoSOgyvb3OA<ryXvzTEo
z)gez7M|;p#{Xl#R73vM)EQR8}>L9<E;ZZ=zyU)x7hicfnmvP5f@yX?Jmt3PqSn-?7
zJVU#p6)<zW^)!}r;3t|+wX<wNid#ZMvreh=JMU1L;4jbx^(yj*=z}MAWGkxjhT6YH
z;WV%DT4y2B5@bDYUR-2*>syvO(NK3d=CMh%VGh7vfK<wa;amwz#9q;o3UfVb19Pm2
z(&Dq*x0=tj^nNP6WYETKaKC+sMI)3p0B4`9Xv`nViU^d0WW5ALl=54{@F>#}mUD9k
zX%KjygQSoZY(E8!An-Hz4&=Rt5ImMmsnb|v;ShcEdjKEkfL4R=CWT6z#OJqa3qBWy
zd?Vf}G9SPPE&+El{2Z$p!j*w?1knxdhFu)RgVatomml9&fZ%t}D)Iz!y4?eXf={rO
zUkMaX!`0CTxh7UEil2@!Qc8kAdPbE~gNo!ff(#*eA(=&3dQ8=eC;~Qe0qW=^31|TA
zKIrf`LJ0q-<wgIrL@;4bjv(%wS_tC35fn7-lVcqqBr+sWQOh9=(c`pYc!bdfsKN}R
zIR1|Gd%{W8c%}<d;yly)Al(x3_>ovkP{AJl;owEH>zOSHaYB55q7AEX!k)CM3sTZP
zB4zJ2321+y$1o>L@eSb}<-mD3f787hFhTLX(F;&f7E?v~2nBF|pa=CW$elv9`=Aa>
zsAL#*?O;MuPWOA}_p>m936%JQ`kX9eAklGI(7IH@PRjSdzCHwE`0v%<6OLq4_r@bb
zvOZxYR`wt!kfYx#%}9h2+KCF>&zSC+045AUu%}~g8&bG=3PUjs8FCx4Kp28`p8J#-
z**|cI081%c)AFK_^qTxW2q~HM>0Rmwwm%WY9*7{%&v#UdaRM|?Sow{;@o@qPj_AM?
z9{1Z%x0>Jmu~3w;AJBTMiHKIBT42`VEY+qb7myWUkC>(|biR`h$ayTupeu0{B3v%u
z`at`A;Hcvv`TS_aBK$cI!LK2V=XwV9h{;9k*NtsMjs#?Kh46&3E|kUeBviu)w_1nY
zt8sfVM~8Y`&N1A=P@aIM_`3=b{0s|BNs~10fpZoi1bbZKFH1!L6TSqsa{<cdAo&WB
z!#Uu9>x9?Aa?#Pt2s)M7i-=p#1gOn9j?84w*o%0vCQ`Z>E2=d71mUL&pg&d#zsUgL
zVtoI=L%&eMEiZ#@NY9N#4!aQl0dCMKUPtpSFHS(sJ1{_h0Z^s5+#rCOojr&b9hh%q
z7{Q*ajV%d+GEe}S;J$Ox>#P^fA8njb<9?g(jt@joyy;yV^I>ZV+FSk}LNM!4A0R|7
z3XB&fMLBSSFb7rG{;JnXTM}Y~>9*Qpgy{Z2hXO9b_;enCJDJJ5QylfQt1uS~Cq1ds
zAEU_>TbpQ<a0ORyBE;6mHzmUSbE!_HUu!%iS<tMXPAj1g{>oekfpa_^TxyNU0;3WU
zq8dME1LVp)9N|;Jx8{7r@r-9;wQ*^sqx4jox^zLE{&B5gIt)OtoF<m>0`!b80QSD6
z&$yd5#k3_Zcf{{?d6pTwN2ALR$-0<|A;Rkm6Am(`s9$f-;ldIyKFs#Vj9w9ZsyZGC
z6xe7e;M}9vyo39;NJf~E_B8r9;8lt3sIK^<Jsa#8YOfaA30{eoyPbd1G;3lc-ro&p
znN=-oFbWhKPQ{PWElZ4W@Z3Q8I;i6un1EbDL-~4!sal-u26Bs34InL(1&HH6W=EoI
zhL)pZqO`hAdS%cpc%3U=dAZv==bY3N%uvl=ptUgWTunFjrs%O3M0ay|Fgo^@Bq7Fj
zb8t=`=-jZenTRWjLpE?IUZ7KPePzDE&%Uz#_!h+;(tM{Zq9?9D^nSi}3lV7ynGjK#
z?<m0m85kw?v)7tbrJ2iniH$EVZo=6l8oO4peCMoZwV!S(?Mg))G}%7I2ryv+J>rpR
z9+_T;k?VXhsXFYUsA(rUreNwa{GQNr^?un92fT#(?A<UOqP+(hmM8zGbjNAG)P2nE
zmIpZch5f5c4bJMWc0Zr1!S1g1cCVZJi|4fXZa<g1(+LX$MvlpVy2|dSeT9G@{S@6k
z&o_679-Q6YSNHX~jIT4d$%2eO()P0>Q77D1o-BKAyQD0Vrx<RMK6Uy!-AxU5Z@gY9
ziv+Cr%Z(4>IoEdBB-tGp^Pd_=wJ!-X+}<%PWJK5H+|~~Iec=8YpM~bPION5B^>xx)
z>l(ZxOJ#q0>}g|+@1_m_HqgDQ8=rCcNW4!z?UG31mL1oUi|>?jAK^CHw0n#BpgcQn
zl3fy<G5qjxxKY-JZ=y-tV2i-OR44m@CHpXWG-4H&jJ0!Q_0ueJuDO1_4XM2g-URB`
z%HawwYZtui18W~(yk)V^P1JF=Ia3s*XBtj(B^eX)_3hp7%<H*l<5#Cxcby)&9o`ZL
zfCOOMiz#lmHUG(k{#)wwcxA`pq=Rq#_JjilFguLkmuc_sWrUuJiC&bE0=Of(@dgYK
zb{HWq(}M5Egr4?^UYvg?L+@0BU243)rxkiiA$pNU3LuZ@Mj0^pjU0SmA@np)^kR(^
zz#h?!GhhI}Q>{Nh=z1~wp)*Hca+{NRo7)H8xCxwj^?kaX(0CJ(xeikODT#UhfgIDU
zeD_MfgLPlJRWH}-d#ATbjK=B*4zG{j2Rk|j9$yItybyOfAe}aXYg-{rTo8?$Am6HB
zXy1dm)a5LgT=?`ceKUm@x6lf%NnwxB(F*PXoOHE<rvWEDt>CQ`b}Xm;cbdNMgQ!r*
zqXx#xiT%a6OfG^}I87Y=g1%=<qIE*D?4ZJl&iH68`NRcLc^3+2>#85+m=Hy6fTEv4
zLlih&zf<#hyB<-sjuI<^<nmoV4~L^xm^Yv{3L@F3HySeys+1~aQmG=DQn6%#a-wJ>
zNo155idr-o4W^VTV^XOhnbM$SK~kb9J85K;IEq><84a$KDrZuuBbm~^WWiU!6@UT>
z6t#FV8bT>m!KBhaG9_@S0+b|CcCyGQNffn2G8$4TRf(k11TrOZsRHFB(MCXl6pC6h
z84abBszOp}9$6ZC@!=ac<%m+ML`kK>WJ-~e1vrVKVkE7VDE!0k&?1#4Bnk@RMb!vf
z=^puh5s)PcXyQc8@FI`6P|%&p%J(Eyza)-nPtL-B3#z;V1D)xHjy__+nKxrAAG1_V
zn<-@|RMHbKY>N?h!;byE;9IYO3xDrh`>|q$IML$l7_ruLIP;opPs56TC6(7&s+!D{
z4i)_Wx}dgn$XF?MumC4oj2$D|nhs-Lldim0Q`Mw)3@3WRW3&#`$n1v&uDf3@U8cha
zvqO)~`ePll<tIedlXpEC?%p?us+Drhdi9_)77JGGZI>S>?ndgaR$rEH^y+eg21+MQ
z)7&UB9Wu=h?|>UHm${DXILst;EV5tfJ=?py7F@*rn1M8NMNZxge2n`^Gf-pqqha<6
z?o7jEp4DDWm9u`V4ck2^r_Jvt<8jUGHGA`-*A`DVJ5f_z*UER`a(PxPA2WkfK>YHO
zX0!qGHO9d5F<`@N`xwJRPUr!OZ4HNcK6QMkuzLA@@5-bH`G7-C%mI!=4c<ntl9DTq
zTcz%m@2-S8yhyAi+4}3F80`%Ty!`ljHK<SX@`!=10&U;OzT*RZ_jR2)e)koWC_A2k
zXL)H0NV4P*Z7KHEx&-na4`HHfoyHiRBxK)ZrAj*jStjWBceXk1-uRLnkY9t|@t<Dv
zB-kPpfhzf|d<T<Mn2gn6yL=E=aVH8<kaz~sLDp22XfBPjSHfQ};As<cbB#XwhA}@w
zUtXy5mxA_!!o!E6mv?-*p@B~*$v<gV9-j))+=*HgB>Xzom$xw$zd&xjqrTeT4q_6-
zweku=gjq7v+q@m^tKh8eZfnQeasuz7<3(`eZ8&Ap-p#Dr&096HsR;rhOvVqd(%ZnX
zZH~&_(QJzSoNA{*<KOx|H@T<8*l9^%J`iyru7APMM`26>j&8MG)_$|wwi=|BUEr?b
zkmASv_%WBQhJE`d##P=_GoRc#TSmbPp+%>qp}J02%_dH;U6aqNN}j3&vBLu<2elWP
zUbcK+bEg01_?q*dkC~jm9Qp;>qP&srP;9>GZoFR+5{LZ|hnqErnP(AZC_@PEj&KbO
z*5hmyprjADBE*QXLM=ZC^M({7B`8~lkGv&IkqVWq!VS?2)SxuV8YI@Fi}^t`aZZu?
zgyCcg@!x9!lJj3JOHSRnKFj-nwYjd4?t~^_8tX%O6CZ`vVD9Kg`jDTcv}pVvPZaX+
z7(@n;zogu2{6}I)pB(W6pW=^qqoCx7Y_kLpd3(W0h|&U7e@Mj4O6ya3<?n?iVg5rR
z=0w_`>LGtGJc;nXk`VXDdNA4xPbB<*bwY^STO-0_#(h!A!^*|C+J(2yU}s~+pZ^iC
ziH620%Gdj|1NS|yNSiMN?I^X;pu4yqR})o)cTyNj?(PO3Y1f{sj$R5}0`6M65!N=R
zPZ=j}?=&APil6PKUO>!!W1V@Ovj*fy>`|POZ=va&k|Q!3rY!HMPX$WI-)Iq;qHAP3
zVaH{cdZVY3yaUjSpu4My$HmK!kS?c;1ZQUg-Q58l4^J->>x2&7IvfV}0_}b+KD?6*
znHqp7Uh5m}^|?QHwv*cf=!IvyU347I3D4~X1_S&bZ!7Lzmlm<B&$@g&0P%uOH9Qb?
zu<dSt5N>pNJ>Q<qb2eiU)A-*k(60$}bp?zC2=aBktX5$Tb-%d2UfgwWY`#7paNZgC
zzgERt%{}dAwzo^b{kS}cZ#EEE3%EEp2-tmn-rDT;@_RgZIYy*unjd+zY4(4rt0uJX
z-Zu{K^8fkV{CZ!%nK+=~UYb*_eQy}hV7;7k5U);ed(p2pwy-5pky-`Z+F7mLMBoo7
z@-}oQ%^2PHo3GQ8B4Kq%92!?AJ0DgX3-Fs4<}J~ne=I+=obS8cR)al#no_54xN@5B
z+Xa-nKebpd#}R7KuU_t!oAJJMS87{^`)(l$-)&l+31)WGSa07vmWOf$bXyzo?)((g
zpwIF{9ezrged#YmRHx56MJ;*S9(&RG@O()8$jEU^sC3e8vlL)mQ}NnP)NQlZZre#a
z+TQZmTOPXmlv}NBdAMF4db(Mqy<~6D_;gMA3URa--t@ON%iltV{?b+h%t61bh;L?<
zKObywW<h|x<xLO$cCa&X{-^ifUGZnyA9HNK(C2@o{BxR(^UaI{P-y~UgR&F-#tR!K
z^RFDsFN+KtJ3GKC3J842&HanY4%l<~tuBCPHg13llo=4Z60k4yFUng#Z0rC%^snjo
zHSpiUi2jGh{vYG=aj~&*vHfeKV}O<&U>)W9<wWp%Kta!}6JKau6^);_+;<QtlENrN
zw!kP6fiMwJG%fE;(UH-KVQ64L?5m<$bnL3!iyJHSY<a8JZ7po=BD9s-mli(H4t{sd
z_-taa^4+@mdpUa7;TmUn!R+_zOb7q&m%~@5z0s?zJG#5IyPT^HPV_M@lkqr2{7hwK
z6{U^V`{y#*)>F_Sl-my&DRUd1OeXv$FOe~!eCpb4wfEVw95J;%h$cN^`{c!D=YqE>
zUp{1Zo5(&l_<q^@7)oEr$$T4_a+xW+&%QyThJVeka<qd@>*MDvw@`Dw+4ib7bf4=u
zKXfoCXGh9O*OgT(lo&(J$Wg7m{bSwpUTo;d1PeAQcabCDl5<lv150MuTX^SylitDP
zay<6<yw~&iiFI+(#+&gwkV(A@w#=m7^vc{1yTwpMqZchp!df7RtInTSikneC1Hq#;
zDz)5lAD{qRr)eG?TQoE=W^{HmCe~&m=1-|vFxc6z=sGSu#@OlhXznx&%Gs<Wi6gY2
z+Ua4PYkzHMVSm{$J7j6<Ol5WvTK&3D>y38;QYkThZf)aW_joIMH5&uJxb2)Md(ZI=
zsbbXNx*u;Wy;`d>DAM_2_ht&l-;8n9Z|8ngMp$5GGu|3X1q(d})_gsN5MkYa=l=4P
zIM<T(hPutzs20vVmQIqDYWz!`Cap<Rg`=L5svb`_NL5&#oe+0AOQI%!AA@3g5Ope%
zn38LG0q>U&1c>b=;h)De)ndm6`C{Derg5%0Y0DUf9A+riG0`Z_(CWjVTX0@L9$!C|
z!k^^i)<JgvoV|VdYVg8|d1J<Lyotk^HgQk>?AjXP<Tspc2>&{H^HPF-!-V@H^K<*9
zS!%+-=<s!QvHkgjSJSg<?iNN<)%nef5xrOUNyg3XIcwUF@v^IOsmh;e<M}7XQm>qg
zPXW}L=L@d`d`I+j4}#1J0pA&)i@UGWUN;KXMnnQm*`IWkQtQ7O_`sK7x^cDdE$a8b
z4!u7=X%BQa6ktNLd~RI4-c+3<)Lgc@S?*rY{wec&3jntNHuIz-22Uh#k?9d25^Ljz
z?*qXX*qZ06Xr4O$Rf__5a;T1GVL*FXcwtDruF;maebMan`^-@0jKTF6@#@HL^Wz8g
zteQT#`W=ThL$2G_Wj_Pu$9*bvVRGkYEBRg)TZ2+syce;mdqW%HzPCHOz4ykaH6&mA
zvg=8b+h=hB;ZI@kMdOE>FKBnV!I$bS&WldE^nilc`<?4e22nleCx(JP|E>#xMZT(U
zLk0Q#HKis;B}W!VhewztH`sUe^}7r&gE#vuHS%o+HSWV(0(UVVJ8X_#_ZTzpH0X{3
z+<6qd#<G^->&+)itFIOoWUF#myp-n$zCfezj*UsnL`4sI!N3m1Ve``nI_yPkYl*P1
zLVWrJ=SazGLSm;Ly^MupJPir?T{W~mM%hEFa4c=*+iY>$aQqaSn+GbKoF81Q0H&;2
z$?at;PNV&26(8?umkld^{<+!N)npw#lcn@=IE}K^eV@+*r$;-i<;Qu=A6ky}x=cQP
z{@TExR*|kGCEW+5VKx39B`uz_P1?__0>kwKN2+0$yoqIPdkZ@oYp`1w4sonvSI}9L
zyg*H$jWD(*Pq~T_M14!8f*iG-^`%_dXIgqEGV*yF4)w~3eOQLu?~Sdxu!A3$$v;oV
zh~THWYb~!0xvE6B7h9=kE7<krAv87=?h5D9uXB#*?cg??tz)e@S1|VLDIdvrnACkK
z>%`pZ%0sJIoxf!7-&r3P;GX}ARs&X|*=;TyrE}tjx>9VeiK`y*C?kzx(^@pve^9Wq
zs;MX8hM9SM!K$F{-Kk)OMjvh(y-usm@g;R*JNAt1vQ!!uRImbL;GSdB&CI;#aYX^f
zfBNi4Sy`-0CMJ}<^{HdphlEY*W8WMUC<T=*eHq?x^ZWo6QhVckM-jJd#<BH-Xje=n
zNG#nQY`V&_*)XuM2a;&N%35xwFKKGpBn$gAr<Qd2;8Cq>^KOq?@>4UDx@jgJ2xDe9
z#`s;O#(W><zQ(nsdI?w*7J@&z<Wx*1?p@9G``w(iEqGskSX1B3D{RqCau>5hJ3DRI
zsH<;r@r&wJE8&{+zHBgbOxf<&V}pQK=VPd%$+4Wl(!lp%T87tvzD7Fw;H9Xf5;Yu6
zp$11!a4uP{Q1F9Q#u;n@?ZXc1b;cNs$oa=_Jg=@+b2@aE2^9^EMN=RO8EjNFJw3L9
zo$_M$B#uTgOcBG}`W$7{T&*?K>Ef)BC0M&ZYN<c-cvQ4+#PaWDC{v!v-D~@FrLdoU
zIl^{ZnMF8BT2Zl{%aLFwlo774q#9fbHv5k7pxNl%s(xM-Cs@@g%+byRclPBY0&^=b
z6S2!S*!HJj721ysqp(whGk|?qUDDWF_0%=ABvdq}hFbBn2^1uBtajfk86$Xz$%MCm
zvX2qbIi72Sk8|5m-mXm2SimqdC%Q#*YkLrz&nEC!nuuw8R_}ObEU&tv)upzlD?i5z
zY-)RE0P~nAhpJ$Tijq+skX2D)wtU8a<d3nN({}qPRKe-t^l2IH>yVa9nlML219i>!
zk%JlAf-T}fy1-~;dl^0Yoy*E<gXtMfMg+|0zQO^-J0}m|E9$fN@}I#nee=&2NOEWN
z$WB80^d)Z%jTTKzE~pR=Bhu$)zKLh`g_WBi$m=MftF*Owds`nT<+$KgDD<gkf{&M8
zg%*+CLKe%u>vj$jPapHc&CDF&NpF^qnJ-agTQphy0zFgQhxr~`7=$%XB7;NRRszm?
zsJ<6;ZpcA&u(@_aBbR|eySYYxbyA%W_naVfSl$JqI11Km-Xo+v+{yw~f|zYzScq5?
zx1x+R8dOcUGAFBu(`bLfo{gQeZOabd=5w!li6+hwkqoVH0YjsjNGa*`$belLwYW<u
zqbU@{K>YL&@s1=!@wek+t4r<EzITwit4FNHVr721*jPa4d1b<t;DVUOWd+Bi#vhLu
zIOaPc62<7WGo|9qbGeHPpnG$NaE+V}M|&ZVNn?Vz5t7vy&(^@YE2taw_&r(|DQ>7U
zu<b9Mk#Li_Z%s_<f5y$icV&Ffzz&<OGM`wRF=BeX2KtfKyEfMMjb>0Mm^mQg=ty~T
z6&0nQ@||s}aWdMWbgfd9OA(h&6ufd}+72D{f>f<~6uWX|s&NV$ZLuctpfh!<)C|R-
zGIgqCt!5N!F_%b`m@?f5;~=!F;&e)}3N(6UHW_A(C<<zv;&jqMTWXx5bn-!2YI8{?
zMzL(P+2VAHP>m>UYU`qO8nH$+m7;XIK{D#{qI4>;Vl>pE(gfoUw0-3dQc5&pk!XX;
zXj07bF!5-i%4m|z>QT(fF{#EM&>occq?nbXkd@)mNHn7GsKcZZ=?4|5>x!^Q#6F>6
zD;LQyYeoqv!+ju;i&CXlF2W)T)rm5tPAYPg+WtVI6a_}TQ|u^-Oet20#;hzVgG?T(
z7bQteQUsOOM-i$96OCr1oL<Z%W%PkWH42FuPWf9gR4R!iOg36MS`?Zt3<gq%M3g<X
zy!2%-8f6g|!{D8=FXdn}Ab{`SqOvdf;AZi?auj3Hy>8T0(Y<PviLx)r;IOhU8DPUT
zU!uWB<tMqQyCP4GsBY;?OzMrqol>-b)EztOjienp>WzdQH){Rl9R=kMnn7CS4!S`Z
z<tL>m%;NiCG>IZlp{T=RPx&Z|;`>gtDap$QGzF>4A~Xfb%NDfoVo%v9iDFODC=q2o
zx<SAP)gY)c-iMt)w8!FaNyfQocQUmaQT)m$seO!MC1?WFXUWECXm`?1@o2oo_t|LO
zDLeMmXGzB4Xdco|;b_e=PMK)U(zSX~Iz=s7QJa!Z$!MI?PQhraMJ@VK$3-pbQ8q<O
z0KllSF8N@JvM%x9zOpXGAO<x<$_@%OPf?3zRCG~`Viar<mjO)1t>b_Hb>U8J`qFRq
z|LAL_-|V+nQ@`0iyrMl|gDRJEHMc}3PZS!0Ih0Oy19qsU#mlorT$==Ez*gQh-iTTt
zSCMKTD_}ut?<!D1R8eS)H6*R5E81}@@nWYvphms36P1ZfRZJTcwLr<<j%Ng?C()K{
z2(%QK;S2DEE={qf7o8AwiToIW6M29%@yQ8ZHB*c&*%CsDUkQK4wZKYLzd*YHr=U!9
zUQ{K50#%NroaCdVsuWxAc`1+@QtTZsMc|jnFHdpOie;P3$g0AsvZ|tXyOvqur3pjm
zNU`KNBK#7lu?5=?qF$n;k>W`3C8N%f&X%c0qRXNpR&{g*#B2=7RKZl)mP|851wo=#
zY;H74FcaK?BXS>PRVa~LEwhrT5~+fyaz=#E4&yVyqR68zhvgu|P{k5w<k6&tpb8vG
za2vHE=}h5BSx_@dCnN#En1$^iNcfd%1l?h%MhJD_zap2Js+JB#ri*ukm+Bn%DJY*p
z7NCj_m!0~Q`$=5W&v1$wJepVw89bO+zY8c?NfR|d@{_mARXSBEFcHmFTpboAOuQ5+
zK#bU+{E6%46>oj6d|Ivg2s^_oDp7ViCF+oTNngMjxk2Qe-PR%AD2Uv76rY3OJxIt8
zf6X!@WBNoo6D=A*#3QVyPvo5wGlk4AbL~@LQ93p&s*rdoRX`a*K*=L}jdkPu!8#rZ
zpTs+Tg@D96AqEi%pN!{SLvCrb_gUe}v#3wxEz;?&1p51@m(q+pc@n}IKhgGxGxD}@
zOUjw^C#8tRr)IXXbx{wcRfGcG$TOn0>~9<eI+0DN>%z(Fvh`14-zcox6Td+e*oiLA
z1|FBX)6cMr>Xe;Yf83`jpe;LvDHw}nAYBo5rM~G1vBnq0MAZ{@C7uEI_d)&U+wI5j
zjjv$B|B-K+qQK?Zozb#zGtO{)>T*>gaFSx>UH{^2OMS$`!zq1UBbzV$jCFyUC~c%V
z5`9V8s?|M90Y`TycEbk>9btY<)NCnNh#8WCP&Rwu8DH&XKAa8dIbr_H$O9C1WOkGZ
z;*WBi>+fjaWYOLmLQYQsvz+v!{8+X)=8vy*jiT(LDv{;Si3HPV0Yxo+rU6Au8_7?J
znL=tWBeK(?N6G;CB#i6`)KIkJQ{u)RdrGtjmVQped@uc!co-P<oQNwV_Hw~8%|(K!
zM647fh<ENGkZNu6)Z{&FS)k;fxr74{SAQLD%c4znNSy8F2_X2ACfx$OZhBC*7;p1O
zyPcu1pLRR8=E(5*+ZnE*a0RX(Z2|K8(UwRFM~Ox$YLlXa4IsfT6th`UY#Gt!PAKGG
zTfhoDNg$Q*lqxq@fTAvSEP@N37r67NNO+aVl)#j@;A>`pXtC)<jYQ2!P;c0jrxaPj
zkgy}JMSJI1BfO7BN>Qjqvm(5_>0Giw)>F+7rDUS7AitsZIBoeXbp@a5Y~gO%Y|$>g
zBKSr&F)F^|-NW6ZaX~y%_6%D-Lfs$+AqL(Bb%#!PbYV92JTP2nY%%um*5HG`Vte9x
z!rpN9IsjFJRs#pH<Zk=UXPx?jKh-_R9?Bf39ufv&=D!-sdk_HY1La6Oy43L7=$}0C
zU$agthzOeL@A-c1n0^TH3c7N-@;x+)@q@eKhv5V9^hVgeGTI{E5}IW>1pVZGC^YMQ
z2s{gO=${9>1o`;fqu+2Uv}(Wf^p0{1^o;eh+ZoGemkk@;^N!(`-qu;qrYnyghoJVL
z_CWle>>=4h{H@iXRi*>=3-?{@Nu4~0uorko)a%&vV~Fuaco#jyRkxnT9`+vfE$c0s
zn0AyaJ!GCyFRyQ=hi3U#**e1KuKGCMX;;4NgI+lw57)-8vNAhxI}gv+zVQ#Y_xjvU
z!t3us*spCZDYkA`!t-)=)b^qh;I(bEPx>rOx@|OW%x|#sYWN^hVb*wkdB|bjP~+9`
zcB3WL_^esx-<Wi+J1V38BcHalPhDH1O}+dR`iA@YkL*+Z1%YBellE#o&Xq;~20f!w
zzJ`j!=`(i;W9+5+u60{_oddliyVcgYcGC^xa}qj7qYdBl_t{sSS3k5ns(5ueCcIoa
z<-B4yxV_*vmU(eI>VKGfTAXEVS4~x1$xFOic{S>vPii~J={I^DO<JGtbx3VU)oARi
zc~hQMUdxZ*o>wh#Z|LM`9MBI=)O*@=(CRlvfy+h>OQ#28!n=9mN#N)5x{hxHBD|Dt
zr*c_jv5cUMMN<y{D6Tym&?l(SHr9_&;$seG`9e3tDEi9%v))oKgwlIDfd#`X)>*8~
z-N`{ot<ekzGZo1Kaq+B?=1aD=d96$Ix|N1>w)>>5gkA2c>SO-`W@Xf1AAKr8^iFMF
zJNY>Fr-=9*rg_v<{<?8v_mAk8GbZYVeCC=1F;<_b8_h5ZXG=X{W0PZ}hBY;H517pn
z=uK!moEZFThX>U_f1Yi-TV(ts!t8cO8{q1GKmSR?9eZg`I@jonEMfOZnlh;X|2(U2
zFXltK$Iy72{3K(=`JS{>6{GS~XqCEPz!*lVqUv{<(z2_H)n-bxd5xbOFsGN6Z4#DY
z3^(8F)2elIa=MH;1k4!?5<frZ4FUdZY-5Vb-KSN=3{barOMR&iZ3{Y}lh7iLrS<op
zd%f-pI0b*YOOvPYpE1;pcP6k=;sDJIbP0;>;qF=PfiUEE#Hc~ggslWt2dxN9>47oi
zu7R@zO$Ctwc%x<XVE2$164wBwBaVY=2Vvx4r^B)V$M)bEa!dn_gP4P`272cqq$9|J
zcD$ok1eFCZ52EfdHAGPamj$B<f_?|iL_h`kB~UpJj0z$u&?yj-3JfN&wg=0QjR|H1
z@<(7|50oKS0_+}8f1s@)RRTC1FiQ{|5k3|i90&x6)jOI55ICT<cPt5DEkKkBP-H-t
zfuIqf#z9y_I1wNx@5qpVLqIutU^D}ndf?K5NRePdfFRx>f`JMJ+Up_;sJ#Q`>d}M^
zkM#hh3?dBtwH9shZvAU1+B)w^>%r_f?6K%6>0#bN-7?uC-4fXX-SXRV=<(@k?=cAK
z2090l2oeN!0CfO%0QuJn1fmbF53CQN5Ap)l0rdpT1=0b;hsX!X2m3_e3E_$33FC?4
z3F3*z1;+=$2UuwEKnNfRKnTDHfDk|+1E1ev)*!YJF2SAvyMy}Yp<V#r0k;SG1o;GJ
z_nZfAf_Q*>0P_a&2Jr?0mXko8pwk}uo{1iZz|270AlyK^o|~ROtRq{}TQFPBTP%jK
zj?|9O)4;lLx?s9cx<FY|+Cz4GT(5YCxw}s;SpjiZQ%*13lY+w!hyP4$28{s5V-6&R
za2x=N!|Fi><uV46fZO`NN&Jug{Nn1ToNl=%1%{UoxgPN>a(7Q$vH-+ao8j%(AIh&b
zj9c9u1LGMJ=`ZF(ZPfz%X{9AKVz%6?5rUXC^e5K0Og_OAW|{d-wWqd_kMviy5=*lu
z^RIgj_8!&n+T*fE0vO{YSjXHOE|orz)97dm!_Ms1$UKYjsRTAtB_+!Or(JEsE9>ZZ
zlfsR)=KErruagS5c*BqAH#x`!pLL<^X2gB-yIUW-&f4X#gxE8C&FcVqSoS)KwE$S1
z9*jO#iyGEt<0jDxdZATc-$@a>xZ87lZMJ8V>f%axOAFd1=_Z(fmAo`a>jJwBb%Rm$
z-6@==)G8B;+8bMuEtf`ft@u<oZ9jNN4qixw1KlPmm88+SjO^jbVU_j-y#a=o<(0wC
z@?gC_hC!FE7`+#Tn^YBRjjgGa6PlCosuLV{`gVfH9dY+rtA%5>NB9$Sci7t)%XWxV
z9X4`pLPLd`PFu|O`1R_CI&6K?c**YZS>GiH!pdQM`?GrEADc(|4%x9jn0E`vEwk)3
z4uBumYCZ7<hvwHsTHm-Mv^cU3B555Fivl`G%k>J^U*i|L^dGn@u7~CP_*@FrQIA}Y
z1!ouQHn+ewq7=5(^6i5u+@e1{t~i{Xw(C`FPM#Th>}ahOQ;(W11N&jN<61m=*iyDb
z2KdkUp<gn%SKsdWRwh3z+c!Zf{P1szceA?9wRxm|JPTakv{{})s0m&oMywTyV2j_0
zZ{UMk?FT+Knh?K{ST-q*)>rj;&mReO1NX@9liJxsAaUw-dPO}KSA)4sxLmOYcLRA#
zETB+FDs*1Q#DQSYg4i>~S-X6X8dBPz>xZl)YUZ6fRlY>VaRqEXbSw}RyW?}WA|nl#
z@iY8_Mk5_|G!WQeA(|f}+js*LQ{&qPC!c4ub;Y-x3?G;qorD979N)*iD=o@R0gRkn
zw-}4go5|I4&2G<))ZCP-Ntts~#Qi3+%&-wAvFdm@cS=O1u#)1qkx^{*Jh&-2H_CA8
zkMte`BOvS+ngs>o(lS9LN4kEO;n+ue$-Q5e&-$#mg(96&C0!g7PzX_TxG4s;Sy|1-
z6Tmvl5!Hj?eg=_7C(%yP$@%L!$ka%Xhv33(Lo3~aWxnj;3K<P|!@=+>Rq|d9sIz-s
zSM~uP-Q+`vVdAZ&V4?DQx+)K{$%Lp`#MG=5!2NX0v@#s6OvYT=9mKnI=bdn?i*mXQ
zfYqQ4Cegyf?ap2H5m>U@SG)b3Gv%QJV!LpkxlR!)xMYV`xpZ5DG^Xl<sc;VW*(`Jz
z27nwnw`8n3$*11=l$@jzz6#=D*Gvv=pn4=cPE3g?fyg3CF+4yahI5aM<(iTh7lCB)
z(LSxW2odIMCLE<beC=R#z^t3X)a8~d-#YUNL40FvUe!BhM0}j=6m%Xk3X0Fs3pkjX
zHoAsYPC<=B;0rk6jov7brkS$&%-;yC8zT^1;o!xpr?Fh2`2)`-Di0GVG0-^-MH`tZ
zR<h8O`N@{JYgWF)-$8h35lefzcF{v}{#52!|IVZn*Ql}lW7vFuZ^^kXoTF$Y+}`Di
zNv)Latmy{FWzb(rwZIEum}2G$HOoH2bx5324C})g-n^wkL<6NbPRf}xfI2wTLUN5A
z2mU!9)L8ibh{!*Pc`^g1eQ76xX;3Qn`$6tfIpTZ;kDnzI$6%BGV_Yw&tP?1{psa=m
z8A2R;m?+tj6PEc6o(k-|g(kukQ|_=CwKIkpdf8CIH(k_@B-*EBTHR!uM$LzTHHMce
z%9!F}Ei3>&PKZOg1y@f3nllsBmTZG`DCS5RE=N-5dY5kTV?~U>hcPXNsR?`*dq+$U
zPdbStFXuXj?LuPIp)RZPueEfx#7;*zamxdTb<6Nv0UuouOQ!5crbr|Z$EFDWFVfxu
zsE%e`{|yizc(C9Q+}&XT!5xA-1b252Avg<ncXxMpcbA1bEZpJd+k4-0_PO`md-gv6
zs<)nc`nOi~bkB56&D8YU&4c%bG%zhKOTZ7{YPz@EL!2bYd2#!I;FMx`+hY(JK7zwx
zkf&(|bnWBr71Ptu>1gAr{n5XUuqbXUKSH#s9Bs{#B;@2FK3(VDEQ>W?uu+|#mp`N-
zV7xX{%}k+WS2fI*TSb>|wb2w}jjlN2P>ASC)`X6@pe}7Cl*UU;((CDRwsknxBWTPn
zO1{*{3erh@V#AwYI1e}77SJ*{@QZ|(XWk;No}Pi^<)W|h@6LozEP&RAdy_zvI8h8I
zbX3Jg3G?1Of@6<#MLygf$q0r!9$elQ5DQEiG&Oa)1WiQIe5^{Gi8Lnyl=|K6>u+(M
zeTTUZ5$Se;*E=^V5)u8#M6cT%_tpC}X{4pBz)~wHQmdhyzrDrCO-7;$&88s_cp0JT
zEf-U$FY{K{;}pAO)3|J4y}uOq+ip^$RW0Kk+%mn;w3}2FoCIbwo$1Q+ZY{8_kS2;c
z=MW(a_U&E%)VcPXjs>&&2`#eLj*XM&s$X{$%anQ30+R?V(y`?62>W<HbX_xKsR3bX
z44oE_x#atobEeMDm<rBgG4x3h#^O+a>6aP%Xr;Y=UR9D6!qsWuD|&zh<D&8;?H8N6
zdEa%n{=GLR$8#q!B}ED^1i-G?E@>MUiLq7ZwK}#hU5J9uaX-Qx%jtO&J&~lY<~9_u
zjYJ;cOlo-6JUc)Ryp7B6-v%5zaYdGv7N}UCr?qZ_-1qYPLltbl49=AaXlSx}kFDKi
z#K@K#_QGPBBG|HbX|39c&1x{L?%|}d=h*elpj!M!*a)`khDJq|#8GYfD(prp+{7%*
zTfT&_c|kAPNvb0m6nmFr>QTW^_2}fsS1=F6owc8bdnYG11~e<U2FP*e?T9;R7luuG
zusREki39bm^`+E@#4lH1EI*~q8J}GRBYV=ObKs3_Lr-;8H^)3#(LNWa%2yWxr|?VE
z8uR-zZYUG#RXjCmYGgbNt4fMIc7n}E0n!&1MC>NZcl)A?Y$oU57b^&POTTI;#MR}E
zu+IiA?VLVUE!jE#?l%ethES<5*4y;IIZ8_>Y^q~bo1a95l#}*%Y7=aD*LzeyVbFfl
zWeOs=T<T>V?SG@;?1J87u>6>*o}siff1Pe8ssF>&FLHx11^zacn9oF-E0b$EyB1N!
z@y26|(@Hq#0S}5*#3W*4YwE!0xbO0AH`Gywl0o61#z1`?gb|6FW`I+oc1_V{Q}Wnl
zV>_lKfY0hvReD(82FI%BC~Jc81YbYodWu;s?XK!E;kbP$c3_&FSGfA4jz2LX)9KOA
z>7%ZTxae~IdTPp-@K}6Ig7l)5XEFj(RElU|u>8ubWN(g}jfJf`-FFfPrP-ODxo@)V
zTs;mIygur^1Cpev1Y6TPVqV&79L^HwIJ^n;8yuXyDRE~b6SxW6_))wkYa#w4>1!6X
z&aRCdi1d@8HtETH{Io45a<@Shg3QDR<t0}1c15&n6A-ht6y?Rk62b69F^7aXziA{5
zPsFk6+-sEpx6P8!?%dVY=&S=A1qEyvD&?i)h|FZpzRZ>#w3P-wUI7O~mz1WFG|Gt8
z$^I}agP4?v^q-Tv8!~o|@3M@ou(etXnG#VVKK+IOAFTL9e9D7B^{e}u6XtsmBF7f`
z%@D)1pGpp>z{?~y{bdATX5c2Si7I_>EQkI53$FqKe!%vT9RS)oY&PuADyI6g8-VU7
z{z|vx`*3W*OT$*;HF6SvViSq}sb9C`MHZFL-XD`kD~pos7y9zl{im~Lw}h*e#~E<G
zAMm+{)P3B;kr7Hpq}(hm!{)u%ra&c~xZs{B^{+`vC$E;i$84c&O>CJj>f+_50YQ}$
zW>+`3=R)%G(SYiT{6|>6MDAW>l6gWTk}N-;zkM%+CGD>yJ`Jf?HG6a4Q}_6a{L#ST
zsdxhJfkddyuFs^8VHdiP_bZ9vO_7tIE>_XO(NW;7*I`k&{}sgJ?&8kyL5(G#pa<3K
zJICbR?`xVi_(-$SG34JZR+mp~=4Ambr~Xp7(k2C6>q(G!y%<e8N+1_m1J*uKV-ptp
z+&b!<7@!~&?e?gZgYfSILt#*VXxlCV{RhJM`*qJ9^B2kns6tz4xBFx6$WX;aJd&6l
z{Bvw+yL<OS&)<u<J%-Vgl&ivH3FlB=A<Nj54L?gbgTRcf<L^z(K5o&K@a#cFpR$Mx
zur2TX&M{K?K&-kSGMAH0_4IFc5A&yK5Z3tr9<lPNA-)Bu7WA{{TwpKXtLC?z+VI!G
zd}vivr;iu4)TgA8q-!*CrCk2Ss<|QZGGVBh)=KUgnX#*$dHof@Pp2!v-!${m5BvJ4
zhWh&S<HjxB+Y@FiVR<oZd%TqZ9J=iJU+I~wGf$7NXKNV8iOpdjT|d#P5}sD3{S={H
zG9={8&Z1+jBylvb$~<8F)i6eW1-1DJEXoX?|JDb4#;E7ZAn{ULq+eKsO0YyHC{E=j
zf=$9I6}ZdD$lf*iP>~fmLPZ$UnZudzF=$X+oLv1Dow0JLlo7{bysUzbh{;<e;CirV
zdqRM|<Rw>>Z}Laq7kFk|RZq%6V<29V4ht9LmPt-{i`%c>CDq7)jI4)GA3?L3ieJw&
z-q<eZ@I%C+PZaRK*h<5XYKn+GcYLGz;<tpJ!mV$2`V;=h<1*tE*zfm)uKg7OvgTC}
zdRLcHSM8P&@`^g*mp`s3;7juRD){?HX+u}fZ3Ng$zb&sQU}1`TNk}2CTS*3r>f_e{
zN*J`2zMs8lQ2MmYQQ71$_z6^KwS8sZ32~`AI>q2!MMLqKSn%6P0wzgNsIs@8Pxui^
zEqV<@F+1Ijl0gHFzbCS2#CYq8K`f8;-}*|vZ56(X!+V2L<78at8^!568@-RxrNwp`
zJ59OWb{WLGbA~Yyq+{>{!YCs3l()Ap=$Cy3ji;AASGv)=PG~vC$0yTki!FKz3SQQh
zQ$eZL>ss@PTdCbvbgNBRg5*c<RrmB@1r@bU&&gzN(~achw#}e6;jeI8sGmN=AW8Mm
zeiRfDq>vZ8!lpo%_zJ>6omNa$l%~%tAG%eve4TsxW?`N`0$nPkK%M`QemfOLk{(zr
zwEukJJt0H!lm6A`b$#Q)m36d!a^tzI{m!?t{%$8NdiY~5F1Z-HRCxFNU{S=zP~+Jl
zfCKa=+PGYJBawI`H3q)r0MQ2`r;g$6w>M)|qSb9uIpH*%L9+$agT;pWo}2<#u!Cvd
z@z@IcI#cj5d}7=<uQpcW4aVHYv*|ec-DsHUhfxlulgR=tPaUa_8R3MjL)3}eU3v|7
zPcvGf=!b=+_v_BN^w!!>-^}gk=X^qwP(O;u4-U<&9WJM4SfpM8SE!?#7b9_&s()O@
zt;jjo&%GoGWG?~^aO7)Al+i_t5<)(u&<NG*e`JmK=OSIM2q2GSN{#-g&NOR2R5}zI
zoG_ZZjK3X*wb^kcWSX4C9S7kHloT0{WX}<-jFg=Sj$p*si}88P8&1<@p<ZSoAzd!R
zEc#9mYru((_*C?=+@F}y;jpsCo=wLqG=&F*u_VE}eCgsIM-6XE!?#aCFp*7Vw_QlD
z%}B{ltw6SGvG6-O4o{vSEF+z-k!?>|iqCK;Fu^d!NJd|Vj~)7wDAE8TtC^>X9?$B-
zGAHugGc-zVTOQ{0!Klr$O?jlQ9Y}kZtG<-KlX`Y&9b~{*4Lf62YAK{VcAz!jR}exl
z0hng~zJ63(<;KhW;oglUjI7G%_C*m`iZsEvXRY7C^+3gM<~7Ex;AnJO19U{N%}~HL
zoC~EGj^nhR?u#}S4OyNT3Aj#QZdN%dVKGVA6_bX%^T;)mNd2zMG?R>%zEh^z)Rin-
z&WgKrvGh<BJ~#9+HI9msXNJ<q1UM=wJ4&m=X53cl6}OCP%_`ovbb84_O~FsW(#}Nf
z=@pU2;V>dQhD8c_z%IM;PDZ}laNrQa=Ur!UJzc^mR@#AGUSW_sUxYvYYYN{I$D6_1
zq-Ib1bd1>z@;y0^PAEuY38<3UibC{VSITUj)7V_2dy%U<Nl9Mn=*VSn`gc6LtVe2E
zDv_sfIjg(yZ{fnm!$sIg0-{=1u882sFx&8PiO`u7M4q$qVRMdB!LhV?9yP;wRuc5n
z8GATIO@AC5Pa&f~zz=PY)^4*Yjh&??q93+YzrB~X8n~6JE0b$9J{s448d<1Sf)Y?O
zT77`{^=2K(GOMUknpdLQU7tBAo1kwZi6pn6%>HLP&bQRr&)hA5)x0pqL1tmjhm>FA
zlb~S|3`I3>S!TDYUE15<+uc}SP$VOE50{KuxlDOChw`Ynk(E@4w3uqEoYoPkX)swA
z^*^22T7H>GpRu#8r=aKJWz<P>eZEZ&!?IZU#XiPwuHMV5pNXh=>llN5=o~TlBNW}-
zh6K11)XbG)oc?3p2+s^MRhb-)&4;8ScE{Ypy^XnMjZ2z0a!e6LVV0?=E#Do(^E=@i
z9x>GpO~ly9NP))AoZ)8zykR08!yK~lB^P@cg<n>S!RaumOyHP}gy@)<FUC^}G&f;)
zY~0kn49Ec6+LJK_RUXL`Ciq6tK@I0${5=lTdX9jZJpV!h5n-SozlT&KqUpA_fwyjA
zGS{FB)Q@(@-+XGNE?;VdCK(dj2j27hbaN@W$$00*3n|G9#81itAJXQO0S~y_4oXUn
z+emfvP1^{JDFi0zW(}#GDQqN+(?(Kbc+3V)ZNnG>xE$$%nX|i=1u>rtydxwE95s77
zJiljNh*L$%oW3?wZ0ZxK?{@{A&uB62b{ZGvnHc8>-+er-*u*$y=XJ8~z9chQH)}=R
z;nv3B&j7*rYRJme?K4b&Ik5u?k}Y7flW^FG#Q7nwvF24UNiX5!e}1)Q;$$*0OCJwJ
z<r{vxOJhjbjYUbv!J%d&O*SE$z7a`@{>HjZjS!=rGKwOGj8iI3?3;>~om|@%g_g`0
zuv>B>Rnc<3Tf$1GmagA?+&5xudtXj%U#5^2l25_QE0GOXF_;k`*ZTCvUgjg9<Y!qq
z`|}I$*&N}~l5`<os&0H3Yhj*Ot^(WHjDAN+?c`P)l)K?$uI|RWzO)D7mO`n|93x=v
z0b4&&M8SPaF|9ab>+EIkxHo5-M5jfTB~@hl$F$N(Vsv$|?6gr9X}yVf8kCYEAWUK=
zzS>!<hFpXP?G$FK?><~pTB?k1!sLq{TAgM|yv(-RCP&w0RRh+7j|AthxNT*d;q-6U
zie@$}<5JmjCJX8&s+2(eph4`^Wg*i+Cf5Gb`S1-^4s;QU;)aZyoP9Y1#c)Fd+gh1Z
zBb!Vv$plLB^ZZB8fq~#h1S>9`y^r*Hr+&~c_DSg!$hE-8%$g=(!pta8315Ro`z}XU
zS?{w2S%|OnP1M9^{U){zTmoXaflt?JoUN4HF#~E5JC#S|v3c`{H}<i%8r<&7jS2gM
zYfa62RVHQpJ$a>eWTLoaT2eB~QdVUFxVEbcy6K6Wg=F;fl(H8RAp<6+zzK;+Jm1w7
zocxoH*SPzp?(RtfB=zgYZlAE?^8>D~3s(x}So&<k8>SI#7ftNmGmgMz>oMJNw?g~#
zUuDgXJ9er+Lj>$i1j)kMqn&d&Myb`&ZyP||{*5w5^yu?ZvcLB-DLrDJWIsHSsIs$_
zThwVK?~rW4oEY7q8Pvp5Qzyx|AC?fGvek?X-wod-!Fd)Bo73qr9W&OH(dvEn-=1^|
zi({4QZkFWOONt4K=ZI{E^OORo_2@Cyz^V<BX@TJc<aO{x-byH(G^g`WeCE;0np4no
zl|Jfg6<=J&|Jgf~A!I7-*SrCCXIB1c%SugM8#ICALbqCsSq-kL&H=-u95$z~SBRxc
zG-?DUTMmDXk$#xtaDtZjXudRs-ligViiho^i?_X#L&-dDJbrpKndyjmK0;V17Vj(*
zUk>RGX&&1FeLwdey5g+kj5BHoGitSqCD}`wuhJ?|Y5kfqnQi7_Bwn2u;qvCGfdPJ!
z=y8|URMLch99Onhd^+4JC0dj<)=ZT|_5eZI1b;j{UxS&AHpN7Anhq~wlTBEhwT_K3
z`f7e8yG%9La62j_+WL5veXo4w2y02JG4I7|b)!qvl!eU7>S<YBWNFE-X8;+4!zHxp
zRmgC#Hb+&Xw9XSs+`sZ5UF^<BS4%xAJ!z7lY#dL&7Drqe2kS~0V6wjCC@Sg#Ys|QQ
zJ?Dt|Omk(IaTPCb2cvtJJ(5i0Hh>~4%hROVKjp+D<i<NvP5<z^lV59+Dq4sNMDv!$
zN@k%k?n}a0<?y`eB2DqrI*Ked6<PRZ3uY3u&Y>q~U?JyUAmLhi3vx`^+U|B5FHDqL
z3NP&qU#+M$=zpr8jnR>*r38x``c<lX#zB{=>%@mHPY?h8&S<hZv)^^YT{NzLO)|(+
zS(JY`L6S|CoXq)G$Tt~rY{lr!h4<sUqnoJTk*{69c<b)!-erw#d=?+j-wt<35s%l7
zT5gPPQMld-lu6G&Sem0A%5B&=4(G1ANn&xw*kL(Ao=6}L^1OQ^PWOebu-Z8N$}8ke
zKxft2{a!mj4dj#!{_Knk_#`zD&DmnAdy$-($=B&PJ=d=xX<s~?QvCEOmtllCDos~+
zsixH!ZN1q*O*({+f5M`HlIt|Y#oNo_m-fPU1&`-CWk;a`3WMKPT_j1-tYE%$leQCn
z^(UAas?+I#C`h#1v7vB`yNlGZ9Dj^(KCdIN`mGVu_K12HA;(SWGhfbh3DHt%Bc-S~
z`D!$)dN_#-D<A#hYHV&!>?xfo^AX#`kVudZ2;%Neeo!b|;X)$ZDGQk!U(;#n-Y&7c
z4Ywv-4#S5k4jn;~Y<Lp26vKK&94I1vDzWqFndKu2;s`3mDHVnxks_K8Hl97)B#VN}
zIJCI3o9<^jlF?xaX}l(GN^){aR^U7Ln)n;g$tUVixcBCQWW20XxU)0nw@E-9>O0oq
zc6{Eo<J0OU>1AtL1WL@!FQ5I-JU-4WZ1ztQ#5?F}G1Ld0qMq-}$+1D^NpGEn$G*(#
zuZ+2i?*~!Fx$N{L6gjI4x9?Q13dDa>dy!`U|KfG>>suHZ#`}8YoaC#BNORozr!)}{
zDllt_;2ZvouwU$)7`xxK#3HGz;HzFVA^b~#YR(&(q812p=Hrms491Z1j1*dklQ+mQ
zj~DV{np}Got)d*Yd_&Lz0bexbvm2(l6i7{C5!T1A4j~2@bs_xZ1tMLca)X+j;i$+4
zTB?s<m%97#&2-LqPM4q@MRZ|Fpb&Rrw&ROiAfySVe17=wrvl-;ct9sVSo|(g*3XER
zPbA!l)`Pq2ZN?m=lotfK?gY!`6+SmG`*H5#XJGB0gLy=qe`=F=&6=+~B9`a~nuDne
z`uYergHgxxyF-24L$`OpIr#jWUXWX4(A1rEcUXr<ol^bjdx25rH11t|=M|@nr8|wC
zyoP(o9(x~+nIxF;%;Xa?dnHorbTpU#uN17_WAYl!q|*IcBeHWUcc|z9D2t7vWt<q!
zXwe>6iwzku3+cHw(d#S@DnoMMcK?)K4}FYkk-HR`c~HvYXGvd<2Tt1%VWOAO<`pI!
z0%p9A?2Li)(ndAJLgOcrb#poUNPQ+n->=0q|Edzui^gtCF&)@C-xe^z@zJ=<=vVZ+
zu|c_DeQk`|Y1u9&FRgq3gp@8GMedTuaG3sWXnK5`fhl?ApAxN$>eYWKzQILHqEeF?
z4!P~_gY}v(W4_G5S5WfL=}h_(NIU|+Vp*&WvB!F?CClLovJ?yHKxz1SmbWPfSrX)_
zk(LMS-kL3bO17A5Cf?e2V6a1Xns-<>iPKbA9J`=CTmSaeUnj~@SQu8yGU_^;G5MWB
zf&B^?70<oTQCG3o=|0z0Ust5*9c8BYY=p)#u+;Nhw$o|dH=sB%s2Pci^*&C(NE9$u
z$-pnQP{|U3yN1)x&Uu+4Rxu5YM}6C^o03%+tu>aLV<jewexWiwJaB{{+~situKws7
zD_Py}EB$s35@G0NF&MWAbM56mXz7CI(JchHgmo2Uz?zShY-p=GINxKfHBJtT8Wb6<
zQJ&tYB-W0$=S7IJwK2B8lF}52Y!k#Wf4y~QLZ~#LH+WflN$YcYW%ffYUwq#ZhX;^7
zO@)o8oU9tRP_p8U9k87}Pt4ft7T)xImCZ=I+pGct4eO#)>`Wk2+zRR*v(<4ApQ_nl
zE^KbdzLc4RP%|{z$2;%YEwdhgrol)n%>QX+-@>s{5NIhcks?~LeQ7eZc)65}6oMPm
zYn_6StXXj(w$Tu9a_<Mg7KboHD)izn47^LMv|#+!;V=FJO8m?(5P6OzZ2lUxR)U~}
zU0rX|H)L#dhrH<G3qK6G6i<_@7Pk;l<c=!;Mce`l5mYLa^P0cShB(2!KD#Ym-2!6G
z=bem-hTFh(g?)MY#$iyJdhXk=#n8$lHEPynT(>IVY83~DVf{--khp}ny0-g+MC$yI
zqSO3k+<4sR;P^Q$D+_FSQlUDVa*#(0Zpl6^n_eprrXE&JR6}Rc3*U7->UM9W8dfcK
z)Ls_3_5)tXpE)P-NayGqQ=vMei)AJqF3lY-d7<e#!d4|?4`0|-#PWAn-FE9Q6Y9O*
zYNzJ{ZkUF}_$}Wug5FfF4eaDF?AWZ3sc3!q@8g5&-bEN_@L6>;b}B9#>Kf*rcTcq1
zGQL!f9@v;qBo=aVjHsLkVs@xDmsHeGw6;`mynCqSwyvBcte4kF+gKErUvrF>SM00e
z+^^p{w3iOAFA-PTKdTOvThDh4@GP&+ENGwD*qFs@eLXI<6dK*vY%b_u)jqHuPaO4j
z@J=6H+%LC`Z+ET`TRsyeVlU6#U7*1FJl6yms-=*%2J$z;(xk4Y9JfVTm0#OKm)8ug
zLarfl)Nyh&ltrt3XgamIYO?UEY1O!^w*P>WGW2Ij+M9VrMzvPFCWCxZJ)Kq*;5zNe
zm9cZua=AlnX|r&Jc`pcatpoG0(*}6ztWT%Q83+ulplBZNZSfm`nX7AA6k1#S1yODy
z>@aMpy+M)BXtJMz$-ww?-u=4t7<UC%XCn$$>X#1VFMB3@cQ2pI2V${%cb$4?S%wQK
z4#+^0DNm%T`XH6Xx*wqNKRFLC%hv4N6${$U4RRgRt25hLTi5PV@bC*|8nK%gR*5K=
zka|Id`Q0a1WrtKnm-CHm&zdR<36-wd&+#(n2?-oYdBQf9<-!La`fNrS+_ENCckdpL
zE%uI)ku3*$mS^o3M-jmC)~VYe?6hm_v^^`^$2Jw_R;8j!>SYV-$L1oiW2%Iec#!N-
zC6;%4qum`^DO*GMt`*vg%Hz05B^Jo=dYGtKrrDnw>z>-qamjUs+H0BMj#I`UcAKUV
zgc8Q#7!gia64=Lq^c7$^zvcX|0XxjvOG(BnUKt!jV;w6xN`pqKbL<`rcvJNczdUoY
z8Y(sdK$EWZ9xK9=vSgpdIrPr31noPp>|i)!BuJnk-L_|n<ney*gI4kZ=<n(B#V#$&
zGQ`67x}|OXJ-h6zl`8xszeY2gRNYg7!w9q$DjPNjSFejSL^yEn+HjsvGY;-|H_1X$
z(Ft3{Qo?9OK%<$DiV{jg%7Ev|Jn12!mN;dlQ{uV+rx^|hMjctgtdrs18{I3ebt&B}
zon2!95>dfqtN_UU0V?~~iv~CbV(VF%HZ9j=ixrnKtx!XVk%0CknWtXq2>;B<gZ1ks
zKL$#9Gom~$I2<M;q{|TsJO-O#+%@|NK+xrg0`@|2bAg<c_cQ&o#t<Yx{O18mm%pjY
zkKzws811|=d45ttxYhWX;)7*iGn%^`X~Aq+zLVbpDUH2gbw5gUw}B6n%AjN<K$u@4
zgqteVBn9Zte8f}8j03-e#V$u|H#vw&$}UGg{>hD=@1@@bWBp7qz!(sdMj&d~j8wlu
zWH(K}1H7(ldeAaIM&trPASI-m!_Pm5<MIn?FKtG0Q-z%50X1W}`FsEfKxBA#UDJT^
zp)&6M4gg>;XaEc-1SSI!gaZp8?{ee=D?>ygxB<ExDZy(f0N$SmzDWI~DZSvOpOh$W
z6p)jMAdQaz_O4lYK<dwf)~;(R@SnLzl@OB`eg{yX=5MW3`MgD#ZZUqQWZ<|DlXReD
z6oB;417wf}0^lBE5*g%<?{@cLQd;;@qB3Z!8cLoYECZ1N5Bk%}^A8!$e|5qg+sy|C
zQ2Aj}67=}dZE2MP&Mn7}lE7^Ma?%N;0RtceX&?c7K#$0Oog!1e11fMEEa1SeFtc3P
zgMR+xWu#E8ve09Pw(`Z+%LfdlATC<0w%K7U-CC5NGtnyo@F19%_D~%?P7XGe?%dpr
z@KCV2IMzJBw>sJr@_C=FWTEEE_bAV#(y2NZ>Fl&TcuVJPjpIEQpHrG{I^ewD?wzLr
z&J%+5`Jhln?}Rk7a_;J8K7&LeeSNdf>hj2A7M4%?3Zoj0W>6}oPJ)Y8Yw~Lp6VfND
z<U%sxMN0YTKUH;4r1KINVkKV0Ef;2D0km#BQcn>v0Umr%zl8Mljg8eh#|IEz*Icv!
zJWgae<#Y%<Iq1)l@87R+i#Bd7-z#5RzoGnW_{qRpTa>h%l?#)^ZD#6X=&n|2@Gi*f
zNd2yYWId$_g;=h+@~xQI=*V|rZtPM1qH}D{gI;LR-B~M4P%Fq4&&0~#z^ZR3t6Bk@
ztwHfS5ZLp_e!|t7L6wYat~aJX!Ss|Cs13%(H|^|}K-zzO0Bn2pH3Mf=qw;dmPO=Ex
zGzc>SZoWh+<cYPbmfl*g*O||%W|ApNZg!Z@igtFQH<A^{E)PVexG;opc%oPt5E-YH
z?7fN+u(MQJ={H=g805`ynR#{^WyL&dT*0_@2(^)_s8r3$N-<iz`n8wkCrFi$x@826
zIHaD{2}&dK%wtXnRs3w+JZ7!&V=?W*vkQv02=D&!QJv;{`FWkHG|&KxACRORG_)2K
zDxM@QM1GF$AFQY?H!+s1#c+K4lCh1kj~Q}Eo;uF)VL?hH4(1cikNp^-FHL4_3c1Gb
zSFfX}Fua|lzTzDyb#Qsex~s!wm0Tgb17wPKj1-mUIB<_hW7=;MKs?^&qW2SR!x;s1
z*qo;Ukz(pG&Ucyvro%I<nq$w(Wp0WRX6SU*K3<A*f?PSnxl!VK@Q2^|e^QOL5iwWw
z$=CrqLWJA>ayON0#tbtkEfO=VSj??REknN3M%e|yulCP7F8ilrM=8tfv9cGfx2`9g
z$G0T&w7NZ1>Vhfi11oW|Y|?X9Qy?XWUNJ8ls;wOK(WKlFiWD*op^TNK;e1Rr>s=z_
zf{)&NoMCfiD2@vh_<rMC8&@Z?%C@hMZ~g$cUwPu3fmu?zN$LYN@E>lG?{PN6l`|jb
zRbW}+6~+px4xR2CE4e8h=tFz(q-4gtzN;m~LP3qxBwLJSlZ)S`MR-ooF;8Zz#b#I4
zj23voxHWsDm*p9E;L-KE<Bk`a}iDl3a=EEjyhAGo01n5}wNb0(_I_jw1Ic3I_{
ztRCgfO13LSMDR?z<y_7$+qT*NILR)rOwwb<BeP&~BE;xNf<+<J7?*nbGx1*M6(cM|
zGaQmX7Ef0{GjJG&FRyTp2*6e4>wAmAO1tj<IdkF_&(0UhwreHh=@4_zaQb_g%e#+|
zkR2UbJ+}Hj2#?akQ@-tTKv0F^)R0L@xUYDY2#XM3sLa(+f}K&e_~6kag52#QS4jp|
zD@-m8*`kjp-&UaN-N`e?-I6!;dHZVbbu@lbq(Vw%m1)`PD-6_Xs^zC#xGb{o_(f6b
zY`9@u!5BL^nkDF-&Ls7Rn?H!YGUHJ;WCkk<K<G}pe|EtTb=2Ov>V~%u4PNn2H-Vxl
z<ANB-s-#ntxA_Vy*~N$^rtJaMyId^MD=FqL>&*KiM-Fw}dPQbT*2)$Z@_ly8noI#{
zqa{8sZ+g#?qSii$p{PY}fa+5QU>QA5f{sjzUV#LaHe~-+3J6m;`{`>av5>p*o=l7a
zyKRIFylqJed2K9BqW)wJ2imAKq(@ayon=S}mqeT93@U4e8@hx6uy9_OaPMdb;GI(F
z>DCfi$XHo+%@M2S?(w^D(9I*Ju+aSpRo-){^ve~ubn(#a$Bg~!8=ee??IQwN+EmpU
zB*6OE?6Z4IdZEDfkxd3ajt9xBICcAgGK%zeQ*VcYNQU?9-9_na7fOQf(+Ekd_d)l@
zj4j#f5W^LsdvcrTMAYPEnE{A&zBKoY`1Dx};w*>Bl*+Rri+sd+)MW3e_pOQ}xN*`2
ziCXEpKik-HVX9`GxemYC(2U#hhZ*-l%`+)_V8S3Knm<LG7x8I_#34{AFu}A|4~~hX
zz)^zyIpT{oM~`wqij+g)zQXv$u-sNq56Uoex%~5|$Mkb+dxSrk?%@nijyrr@#%&f2
z$);u%oSPPJ(FMJVmd(P~(xd4K<tb3AK$S}O(Wii0gmI!zxA)Ae!GrXk6;k4Yl=MWu
zr#0dubORXFXNBURXdvSK&OtYzJtJ<5bz|>|2A%wL{TnV)-eww{vv}f`$+vI8PZ$nl
zIssRLJ2@`Dr*q*ic@q`pSQRP*4<&BQr^Lez>eNWZeZom8DDOI$JbuoOs^#Za>%mYn
z&@d2-7}Ka|$J21rD^>~*R~E>ee=Rg5R@CgoTf`G!>EL)1zl>yV(e`|aRU;MkeA#$l
zrP0AATabQ9Y-1GI*<C&&Cr(M%#mv!+KmYJJd+|fyHF0ZfY?4m86Uy9Ye^jH=gy_JD
z*c5IkHWeB--jkw{$cw`D$8M>+fI*xmg{|Jka$9X~U2a)!j#&FAo&-Jkk}*9+aFkRh
z?ha8Z`(3R4Dn&R{{FgyYMrxbPhj6K93c@tEG9xlGgW>(*!h+DWmV%D4BZH&gGnC3J
zthXd1@^N=piWh9-Y_dn_vtryAuIH&40*1_o>v(E1XNr0DXDWFk(#hJ7r=fMBJ)tw9
zgC@rIC27vEg?AdMrhLEaycaC9W@{HG?k0U)BVg;i&mAOGCiqzHhsqYCG7Pw<lAHM)
zJJSvh>O42_(l*i;p?q9*_3~YGiO%np8Hxca?k0b3`sGrAYv=CXBk3je*H0z<MNd}x
zdpZ7LSd7kevYILLS=Q<^b~#(4Am@InDc13ZeKz@4(_GWegYd$^*+!|>5;K+Mk|&i6
z6-za1je75S^wPUBo~mQ|%E8w0=cM~<)tO_n^kvQUmde{E+qBA+`|qnc4fNKX7Ohp9
z+I$wbNTr$TuhS_t?0OB9^_x>2ZR@LV-=Aa8{(RHpu{l|qwa?iZt>3R_wxq0Rx3NB3
zS+|rg!)vavTCZ_Nsqpk#FLw4$pO~mme_jYXNk3^n$<+0=d+kSp!&9|dYR^8Hh~Ztm
z@S4$kWVf8p7+uWGE(QADeyw)y>vW8Y!`!vp*X`e9wK)VRki3?Z%GvF~<z`z3y@S41
z&v*M;Ma4P$m1j2$?jR5F|8#yZGG|nk?SQ$aUC{C*Us!}KZ?#&b7Z<jqFgo)-sxK=y
zj=b&eu2-r?Q$1kmc%xA?U(gXesn%@x;<y}p%Acr4|Eb-Rj_3#ZX22K9=7!vFjRcFa
zX)vxqwTWl&#`@1)Ai1irtqy}*-<Qz&&7z9O`^(#m$}4RYO|8)=U#2Fs9-_$Nx1N=}
zGJ%n&f+ti<64h_}^(#}y*f&%QO-=CcI12_-gGVZFtZ%GS19=P}UE`PJn-W?>$P-X?
z73M9?iPb}k$6iWF#Tu+T8+Ga7Dd=f<!FeuA%~$@#u&6k+G>ysTJQ};aCi0T@u55`h
zSDv&4zk8jssgx_mpw6E+3N7Bsc4tPD!mojGkZXxRwKq?_opjiN4ZGMLE`sdzC-OqZ
zUi?b8H=7rwvHC+r@p-B~Zy^*e!zLvWb`39~it9kpSC6%K#iR6Ffl`(yCv6-sVl{s)
zZY?talf%%TL)C<&wg8R#sJ_Y-`kJCvMB8`{`TSxq$+pMDS@I<o+tMzGufXi~zVGps
z55AgkiM>74Jzy_R!thTIOnk(mdq$8G75ZHBnGAT-YFaN}FFPf@<dJe|$sgqK#Ev8I
za%i=x((f`!JFxUv16)okvq$Ossp=wqkuK9;Q$jL1Dxw)0kp*&pPsiWwm?knA*jKte
zW#-bdzZ*I&iiM^xFZZ`tmZ(Q_+~(|&@!r}UZ?lu|9)T6rL~(hcitoR13CbE3QsgI;
zztRl`XGVOvj>Y5^zD>Bhx<-`~A8H5Ghh@sWWGf-&u3x`6%8y^yq9Zb%sZMfB`Ebe!
znyMr5PFJ}&s%%pd5((f5CTy;c&pc2Px2L8TRB0NG^Yu^aoYAOUrO>$?Z|9FR7cRBr
zZ9r}9ss`8M@@7}Nq`D*bO_~$>Wcbqw6bDRL*3uN3H;FQ^XpP-dCnV8aU4E$k<+Na_
z&3R(Zy4Zh<oF1a)!3IFCiKHbbnR)e#?Mdt6*C!77`~|%hg!H2EfJse(yT%<l69iZ!
zr9<@kMm57240B}ryWcO@P3YYmM8f^KNg^>;;+RCGE~Rf4m*v+28n2GzX%s%}u(^;S
zh+HW9z&j_P45!9rD23Q^_lf`aoN3>;#AvcL`N7wKrp)l?qp5f}9nz^Mfu9FD214gJ
zpn8P#u4;ZTBsk&I1#AbS*Eje86v=2X^DsI;QX6$z|186-55~&mXL<)*{mU-3Z`U8#
z+J#mmKNf%Gu!9>WkL|Aa<CG+5B3#3LNP2+8lE8CgcSC9-SQ8vGvIf!DAg{scqHKKV
zKse*{yMVEU*@W7F?7;FOdPoJ8gEC`JhQdHTyRcmlG172Jzi5Yzc!;#2bw6~#ol;yM
z3dV$@wg!kAh1$_hBQ*Ns4S(5^iy`syAL`K|6MeP=(VxkINkNSg6?fKPX7EMVV0R~M
z=7)C(FSIozerTVd-D4|1PUk`>+oLy;u@=JJNk300+YlZ_;{9!&gxyedJ8Q%?Y7W2D
zZhBBd*Wk7VlND(X2Z<GF1#~aRwvtiJ1KR^M7|W=JEhbOO9keQ0QHQUMqzirbi?_=M
z%-K~HJqP`U{Gbme-Q_tdY|4Q0g*pF|)c3;ufWTx`R0*;8>!iz?Z2sGjkuZkjUiCBT
z_R5fBX3vRUl$pp`ZSaaIs<lRN!%=VpFt`CaxWPh1i#ul=?noS!@v$eT>$PXxuRb{I
zGC1`@u-9Qr&2PQuIk*-j<g3?j?3cJdE3KSDNu>8MvvScUME_!XWzF_wznT2`W+1`n
zcMhBN3|)tdu)do8ERbbObVE;~YQ~L1e~{e|L%%#VgGdiOrTBr~eJTbi?m|cjnQdN3
z8kG$&ApV56n?Yhfb&2NG_K|@5=B_JjO6db%qwplxp~eTk*&<%B)*($63d~*39%Gg~
zf_JH386gj;-zkIHbAcU&9mJt7I)As-Yh3u3i4xWY<Z{yNd8G3GDIO?I)+}b&l?{`3
z11MHoWU=O`q&dueG*z^QnwOtzn1{rvU9SF|{udCo5Ss`cC@->LRj{R>&lS%D5?CLM
z3bySQ@3#H53E7Fn5A#Ow@bMx2+s`(HH_|^Bh!2bde)d}&h6SN?XTr7&>oH9-{(Of&
zhw_CuA3VO}Tzf}*!F(WjKy&+`4(UfI)fHD6dVwJQS-%l_0%6onhmfn%n|3PK8_x^z
zg%@lOzUZdx^8Jwc(bZqxZxg~7?Qqcto*&^2y$RL}`32_z(1i733O)dJ{$%{=C(8Q)
z`J?V4H)N&s#J5g1C~b%g=nT9E^%l6r01$bTFRdHS!`-*Yzm1v@&r-YFU*1EN5RLx4
z-_QHzBTqQHQlmRz5d|bR2kH?PFMf80y+DC3{2Ki`QC?AUeXy<~N#2!qGlkx;emqG0
ziX{6>^3K0~<ADzRM`70Nn(zS;n?Hsh$w}I@+qXyEqw4@tJb$w5z`hcz<sI=2-wVR{
zLGHNs79nnn)ElYki#DvU`Oqs_=!G_9Gf{QxXXY^%Mgpn#5JhOIz2GaT9$QJm?4mPL
z<KQtH5fe_5TD<6G;Lxd<kqaqHD{Auk-*KLRSl%t^%oD2lYjB;9h^7EZIg!GgfAEoT
z-wm<B(`RPyLtBBTlP?Kq=^#io(q9&k7o-*~sO;;(DK;XqoH+$}3bVkXBe6NBf5w1i
zSoyo|SZ}ZaXWJ6e<ZZX*I#`plZ5~$swp(c(EXCP2jZ}Kyd;D7EAyBq~^2e6&Rav0W
zt-yxeb?10j8E}>@!P2_)hOYmynbnjbAGPELWflxCUdit^4gzwQEI0cYey<0v-qVRa
zP>4McHri53*;A2sci_E;HF`!ymNMM)b+E)zn6qptj?{nsEe*X(;@V8bS^iVeR!ekE
zt5#*|>(^g<IfYYbzkxrYQK?9NrIKJ-D;OyB1wzu(Q~TA)&alzTt~g1sq~}SduqQWw
zF4|08cC!xB7P;Tkt_KI3%h^xdJ2ns8eLC6mJf8#Er$rZraZikZ&NTx(8iwg*)p85!
zl$K?&?0?6!=~m@{IZgMX4D0#Ya*qa;Nt?Oa>I~qmRhg+FAD)&x<<W9ohV04TM8iis
zv&JT)NS^a}mchG4A=8Dt6IE8$2R%`|QC%FRE`rdEAl=}stlDKko!7D~oqb^f*U}VN
zUpug(VfatjgSi}Q_Ak-8)h#>Fywxtvkk;C<D6?l@Xv96y0y>wARpymjvoACV-%71K
zsph_E&HI)<*{ZykAN%G$Z5F*5-V$lOQ)s+X6u;M{*s5Gtr^stTS(f>-FLdIb=*ebp
z^nNNAKb`c!^%?WK9=itaKO)Md4TcPQdlXu{i#WumD_cHj)zCkQO}9HooI5E8VO|$*
ziB)!Nt*K1toUPZUJu4kBIG#gKSUYd1<<wmAKh=<~;x8W$A%s4c3XKC4UJ9hWd<rHv
z!S1b-^7pch&o}oSNBEm_yQ%!z03U1n%%<#z&gP;Tqs7Cnm@s?pimrP*%#O&?OQ@GV
zsZP(~?V=SGo4d2c=ZwWr&s_1`_E<&{N^-sJ_kur*ncZQXqETuz-`7e5$d*l6Zo<Cq
zw9`*ZE?k%PN>40}W?X@KQqEsSbFa&PV3j^riXBjPs;+sb6P=ONP?Mzocd|V;Jkd6j
z9GmWLd^u)es>)ltk~}|A=3-LMTAHqL_BZ5M!W!wguWU-F24LMAi(^%qkm3GkGxv~8
z8IkFpBh!IPaI_pP3d9Tb{LO4pPW!_9%?mkm5DUzxkl(KLAF<ch<6B6d4<?*=JbjQe
zl$_25Dp(G0=_?JRa-k9RC~)Bpg4DgR2e;hPexlo>$qi6He3I)o=OQo#<vw6f^l5P6
z9e}jG@H4yEywDD|ytIGHZ_PG+p1s6*fkfF#e842^&V68iLyy!aBpkE_f6VApa>MQH
zzhELl03-TD3V<}&K#}TZjSWjX(%H4U#%;gI)kD?VJL&4;=>{#$df?j#HyF$Z@Z4sR
zw+&or^$_UurO(K8`&s_+YsCJP>J!WcqBXh;N-BlSK*kb+ry<cP9!LCmw||!4a%In|
zaVMPCXhv788@&8Z<c4p5?z+jc;-r?4*0RN*t6K&_>Ig~~tld{+A=xEOA137PY%WA4
zVXa44d$&V60Z5B%ZTh(D_7}HGdGPYl*4Y&+z=z+eT;jJZrevG}<jD63co`A$k|-f|
zm+p!sq^t6z-+W&$;N8`RT7@<+uPUNk`+bsry%N8{U#bOn3cX;mb72R8zT*cL8}RVA
z>DWWcAz5P<ZaH0~2be#Qb(}ZZy0y)jZn~_weh(UD>NRDxC$;IJ8@qfnjPUKbln<N;
zEo}3_CEW<?EeIixI_cvRS`~4ZSz~SEyZp+2lZ$p7_r35p5BVH*wj7+ZRFTql_}>vr
zzpr&WFNHqJf1IPR*HaCFpHTLnE$sf$>q|lWbB5<n_#r4;RQNzcJ39XG9Xo(rR|ppz
z<qn3|$HHy4v<OzPg4drZeROF+M>_`o2>Bnl{2|wN{0v=%;G-h&yL4`e5f8tAPp8B?
zK7S-5NDnM(?+d3lAo>7r66hC$XOC{8&tW6RXVV>NPixcb+x03oyslriOZb9#zkSez
za=!zC@y7Qg{6Ov#AP%PT0lxPcCIt;%;;BPO|I^_4v5=5dPwv}QYt}_ny+r2=$GC$@
z*R4p`>(CZRQPw3OAI%u(%bCwugM4anm8nm#G=C$|JEyB*u^pe{mQSHu9HRE$+m<fA
z5!MI3x1XldKLrek-|%6iG6(g*kZ+-)W_@Am;fY`t-EK5C@!}ayut{7>m8UYdzJoHm
zzWixGdo{`(uw>*M5z5##z-k$P^5Wjh=8)cVDy7eVo6U31)PdS_&D4Ma-7z_7ROYD0
zv}eC&b4MeSJ&S2C!|^4v?Y?dD{TEi-8*QHk#l-fF*1mJ>q^g_Oy63nXlx+^VRD8LM
z^}UPn`Ps-ydm}|SCZ%!V(%NMFEzcsKw8nyC>j_KiNlm%kq~d-?{urP1GDkRxn4FEF
zzT+c<ewLToY2bRbJT~nn*0VnR-5`X5@#9~^zGsd~<xLPjg48bUKH}aDvaDuLb@4r=
zMWKWQRYiqNAry{En45+PHNHp)etrqM)qwLfLUIWzHg_bHuNv!afPd+4^#yVLX*KZ#
zyuIoi=m-8UVS8+TY#18;UBa`!hEiLDQW4k0b*KC`I-dYeA&8Vqg|KjIHbd{zeKTgJ
z{f238Oz7m+^@9|4=|c+_qw|`8y!m}A=D90vMXwC=T5u}U=w)MT!yBXA*)!k7Z9B>~
zmfI)!hYrfid+Il>i_0n!m-7iG4rcgN%7i6amEWf#9&%+jO|AF$!HWIEHUTM3jrVum
z9ZZpw=Xwu1HB(1B&n3w`b>&Uso{#PKl{arl{hJE#&*j}_LfdWN%3*|FT9?B3fXxOk
zj?V<~4|YRja(&cX=m!DuA=ka3eT8<}I*(YrFs)&?E?6hDjPSHHj04uhN{>WM1OnXz
zP3W%O?b>_zYd}vh>snam4*yytEECq>8r|8~`%gDT-Jv{w3>F2&yyia9^N>mx6z`S?
z!^^>tSs~3A`)%#W#vouwW|Qlqg;U+5a~-LgvJY_{?K`wyPv>^3zTuv66TaFu&C7?z
z0Ck({HG}4Uhp@dbqx1<{bdCQOP}BL$q3R8vIS`N(F$t=D&|adtPYNph&5rNEr*Oke
z$N1*pmbI1{cOM&483-L>y9K{S_+|Eo{nne-x%|khWU~PEJ>_19`gz!Y3T-6{0*!4~
zBom$hGD2Uyx^`Iqf0f)??(c(sJk_^$S#n;6G-k@sz|dN0G6Iq#37<>b+~KgdFIqY!
z+>T}Q`6;}E5t}=V3OczZmdZ{9IVYcWf)05^7KvMsC+6%6B3PZ%ig#oPOpm#g29tUI
zH;8IV&<`2j>3fzxBmu@@);Cya9xf%Ubm;(1nz^fO_CK;y-{(FJQIpB=e6-WqxWVir
z;7x{km(v5Myc7mc8U8Nyf~ng+P`c=Li(9FiJK$s?Q}~}EM$c|htRM~4w3t)OO~9?6
zE6V?0q1j^Qblm@k)GIV=V#nJ{4cresXyVAnYynx?7*-O+qz`L<bDY(g7xt3j%3K|_
zU+Iy&lmA_86Mb7>R`)g;-d!FW!*X@-&$i95#9O&m`llHpT<q&k;#<Id0^_aIhlhO!
zV@+Y-O(%3r2E4HndtJktp4&AzXUCeT%Zi>9hLm49$NGRxaJ5W$oVkRJpur|g6T|pr
zbz=_IWT49b%QC1YhpFrv#5b!O|A%cF6;aL^8KafClVQG7k1V`((~K-UQf-pYJf2nY
z;4F8Y{fE%ssh?Ia8CDM(MD`Q@$!Pba``>_`u@<Vb{JRqjpQH9<m7yY99$hgc{>|NM
zW}CKNplj1*gkdYb*47eDx+3+gp_5d;RV2<k(xD1+dvCU@(el?(7!8i08o>ml8mlN{
ze>%MX1N1qaEY@R#?qK{%M5j)`H(?Z%%JZWYd1!d|OY$k-+vwOvfS!rb)wiuhx()h+
zb%%dQqDp(&v9StcsfWh=Xo;@sZB5Dl3F7Z4S{fvZ?oZu$A2&}ay1=}xRfzXW-CLzw
zG;u$pF68re$n;9to2AP%d1M<c<ff4x1D(|Cdv3RYUADhC+pA&Atf@WZueb7|HU8FC
z-u7)27v~~LY|<57`}yZ7YkQ%H#6Gq50?|9<8R5M#rDvkgU*9zT)%s7&Lp*#y?h`EP
z?tVxxWa!R+;OOw@(GI-WcF^{tz2wslxY)7K_9yxQQ_=6-M92vW(l=$!Bp^AH97Ak(
zSWbIzUrMnn#L`J?(yZ6>$JuDtP2dC-OvNMxEH~jaig;4kAWf3(bDf`0r8mI}^tLDA
zWh8E1tAv0(RM7I1ezL^nLZT~#(Bypw>Qw!CV?6)!=*EQOX~1|2^sZvO>y2ZQ=oQ38
zp0SGx^k>7*+xm(GTPlY@uTr>!=!HSlo6Ln^x`Uq<7P*6-6ezz9NQ<-`-a*q2wcX(@
z>NS2~U>L+~Lfh<BdEl=YV1Ixx0cmOno!CS9U}r>1sZ7AoR1|Cv^u6k<)32k6%?+#H
zXfW(HM;&w_BV*85<ziBr_78KDi0=X7uT_)MFeyv}hBfpZ{$|Br0k_w4lS%^e|E_WU
z?hAYSUnEF{!8CqUW6wdgIqg3!zVr3mxDu!QI^bV7l?N3n>jAoHlCyX!_w>^YbLZK2
z6c)C?iX3}VAfAdxCTZeqOV9#e2$2UL>kY4^$eyW+kLq8NfO2t_9gPC?!j^ksqnm}W
zbn3z-s1k!Mo0hIjMXJ0i0*YG$EOr%0BXQm5hH}LS*{_PC#ex4@#0NOuQb39HaGUGR
zB_zBB7l{WMIZLSi&yp|*1la7_lSbmXuMO>p5prHNMGsm7Y<BGNB3Iqk26i|I8!wxH
z{{j+q#+Pwq8t=A#@ktKT;$9qXrccA$UHf_f!_7JLj`5zkr<>rp+()p{8aC_L<Ar=@
zUlr-^jeoiW`T!d=KTN{lVqYB=romv+oU~_<@OQIskP>FjP|%$8@0ppei7?G<L-~RX
zNDk&v(*D}GyZ;SROpw-KFm8_9`%i<F*wQrDpU=SHY+oEEt)qRst@uBq>7eGg9mR@2
zJDlwGP+<i&GmZJ-!@}Ro_RbL$oHIEG74yoh>mTnIrRLg1v~d5shE*nA{}@rpG>NBo
z4yRy($vH5WR5s96CeYQ%cfmt9xk>no>qriE`@y|o#qFEm$sI;(%d7nd{(rm#b?hZc
z+fjD_FS?Lp77XhMb8oTB2hI4zPM9?#r#-^|TQu!zwl03ck7_EKHV=)X6E*#pP#?eF
zBWFFoD?hq`LwXqT-k&WlcEpJUo*t$LBl&%9E^{M^yk8FgHI(}!;g<%y33XEb|83@x
zaMq)y|7UsZJPbq(V!<D=qw3B6mkq|^L-w|B03caSL)Ta~wah%Wz&qg%8d%FVF~cRj
zL-+3)1a6bPq<BNs2#x=5vs@xXKh{;>gnTUf*WYs2$F?7cJjL?Y6;5x`l2e`qZiCyO
z>+g!rWt_F78sm=|fIQan>8d!gzP34IRk)LV!1GaC`|{P^s1ZU3CtX`)jdbyJXT$(U
z3_|bC3Udi(l;U&ZdA_`sAm#h1_d;{lyr)3r&8TBMMQ`B}TZvnw;*RY1!q%Lgb*T)+
z(T!GH9AM1O#a?qA>l2A}s@ZB^C&U}lRD+%A>~IL&%~`&!t^M2h<<r4A>UphF%1Jw{
zFr$v=(Ten<Zp+<FePTtslXZ?Ix=!lhipU~G3&(VQNQLabiWHTgEG)jsA{7-wu(M|s
zz|774Ou^0aWL)<=+K|52ZMmOmPdsmTy2$^RP&v&mH`L-bk9H*9pIt1q=G>m$D0g%D
z&=7Ll(HNKQJIWaJ;A+uEdtyVZ?{MssVwVUtV<=3o%v>u615INn18+Qi%U|-%8g~<O
z=N>jO?(!D`IJr0`?-{yAc|h?~De5^hi3Z7*4EIeWWzub1a*gxzD}aVv#+3hyx3_?5
ztLyrGYe0*Z;@(1WcZcFF?(P<fyGv<{TU(%bXz^gdg1fsEf<y2?g1hJNJkR%j=Y8)z
z<Bl`#8TXFy+cPu&y+_7QR<^9U*P1hRdj?b~?mVh$hb`La%7eKy;I3_M=7Z`B2X44o
z7#%-ElM<yl{QPH=igD(q=zj~JnWW-_4*Z%~N|l0WY^4zj)Byix{PS^;zdpB?kCa})
z*5&QP#FHQpMl&$R>^k1hA^^b*LtzKg)aF0*)mvs(&Q7Q|lkAU{6-ztG!J-wTCuiH<
zi|@8oox1e5%Yl^WB^TYMu0HyE`xV#f2WG_SC3`z1Jvu=~q-CwFjwP*lcx#nhq$XVD
z_Kexb+9op4$HlnnXH><`1@;&^GwNOxe;FKpk15LEThHkgn1rPV@L8M-EtPvxqq)&A
zm}Wn8@Dm!#bf{vM|2D@!6a7nw57Deh{QQ)V^4H3ve-u?rTy@t^pB;(>-?cnO^i0Jt
z3%;vNwdC%xRP0HLX6Anf!dHw_xBse1mqh-1>Uq9A6TkF>R7R#(oT&NqVGANHmV&AG
zo~~;<hq>LIhXIQG#T{9M*El-0Zw0=(lvI;N-5*9G?lmo^R09GoQ$+&!)3B9(*sK-~
zXxD2(>gX9G(l<gU@&~#)@wD$d2CO=TwF8vT)U-VC=p~7rm4Q^PBKI=}CIg%i3W+3P
zLqEJMG*%B%tj|bzcj!yHS@nsR-Fxb;Dm#cK%f}ClA_CVLVD&njS>-7bi1@>g@hyb9
zA|>OTV3L45&g)Ordcr6C0B2$&Lr0y~-;h~6#&BIgE1{{!^unnBC|Q7zRW8zt=w`;m
zt1<#_s!$;y$Z>Y1)LDZg3Rm4=AsWD;VW3oL+*9g$PElM8<)|953YaYw5h#ftVX>+n
zoGs!JD2|RRw#+D=EnukyMK(wGmpqu+qF2W(Z@{kvJe+yqwQglxS2sB(-C7LCe&rW6
z$a{nGF)P1WshP7=<-&42VVLCFjY1&mJwgf`Lgd~Gc+287UDOZu?k$jBT6iJp^ph=V
zi1l|*6Y>@7$oJ3T6w=|&tU(9luqKAaejT`w|4$%-S$8PAOIa^_XV^k<_g6PAY+NSl
zUDU);D|vpmT@D@FVH^$%i2l2dcycNw$A|r%xQ9Ry?u(hXcs&*tO2@7eG@-c-3Ktt`
z5|T}qbwhnoTS6@}GD{kWcti^|W`d@Wy3lqDHOp|G=dW%`#o^!Gly@^SdsI`)QV$0A
z8mxdiQxC#``lvfFY$`Ad@bbgn8h#?_<Ok7Hn<?-vw;Ot{v1fbvEpCenggF%F9@?LM
zt@zc5y;*eR&%(j`+crY@VR%-CXz>AFMlnsh@k87;0!<@1no=V!Ln!F;&Pn;-aLlMp
z>>MCYeY~!l8n2h^LV^jKhbMv~&t*ngcyq%?iY$2T_4jk#X_9Y>nj<#E-yXbaz#Z`F
z4<r*X3m^GuQEfBpNTv(xvK^|>SLhM8*2T^s>ay)|{gBT(s{8oDBZT`Rr1vhHp4hDh
zpEJxRie9%TU!Ry-3r&=uq#0IRE7BR;J%2bO%Q$TzzT6q^YLqjy&Tx7;Izm0UEoa7O
z1mSzj19Xwn-Nf;w?4EX2v3%YIOfWYp;>|Er-p5UBlEDZWi1nwdoCF-KELAWa2_|%~
z7=M0-c~YVcQ8{cL&Md~R{-Fab>?G>(t1anUs(6T%x(k^=fQ5!*T4itpBNvr@l5!>0
zqq%z_u4~1r-t`Wl?qN~Fh_Db9)wAm_LW@W`*F3nw>b7e++0_BVP+<AU*1Ti+i0&Rq
zdWsLdn)fjC<@<VJzv8}0SO0ouzwo|<HLF#aXOG`NwV0%+aT#wb-#tn7iC4eZKum7X
z4>K7P-^su`lVXmVJdQ3Mi$ab$kW6rajCb*mxnk>Dmb9$&y_1Zizavgr6Q-c1L#<)A
z+o0tp8a!Wd61;0*MDu-epF$@Q$ZVZ^7^|*;8gIs6#92{%@66>$bQo{a6UxWBsKVk}
zi<#*Z;##7!Uob_}H3ACr9d#QEI6&*WgPzFQo!-9!r}%|595@{>V>a)%*fxg}!yvCp
zD&~(IC>KH@63DMRWsLyj{6`SZDp<rPUFVcxDFC>i^%HRfBv%G?Of9HKfFw>MdYHXZ
zH6uW5Q|9)p;arYSVqe4kB-j+4`f$f!Vd+XBVJ!<?NMnhv4|JfSsBvojD_2)is)gck
zQTd^N&W^?le*G-Wn39sp82Cd3^5Ebh1le_yC=-#~+NB87XsJ6`=;OYzlvM7x7izD>
zxW0WoRe#^H%xj^NYXB7tBc_sQx~{bY+neg0G^Wo$eO6Q#H>(4>yhK83Kk$#_rQ2$%
z@S@pf=uQ^X9E;<{+~C8mQV+7>XX}?Zz~?iv`~}_6f^M2<){~*0vp~>?@sf4;_HM}J
z9G2G}Z{s<?gLA2s*;J)?qkF!y<DamYhf*mf7ca8T__I!9;~SLv*r41kjT;gsRfZ&d
z#rJ(WD1bJ~5^8ptg2`BNDVXb$s;3sU;iFrs69p*M#2Wc38Sro^bj%9q0Vqi~?aVoE
z8amqI=r<4>KE;q-a5*!ydmwm(;7Ze@*lfckRZdBE$zs`&$3-Bjd8XN!=t`Hj2|Db|
zul6%CxYd9Q#+J%Cn3Z!IXfbAWW|(~KlX(84VjQ#ffkDh#br;<2k;kmFEO%E)_mtz=
zYCB_vr*JLZ3LTJ9=7rT_sHjX6XxX{A4mt9;WRB$lg4U(4E;AWw1|!6Xk;2nIfnJhk
zSz~&WchiZw$PXordvygh#n51{Mp=_lt(23gO)<zUcv=7iQL%SQoJZ&x-68A~8|U-N
zf9l5=LZ@D_E=u#CB@sC%C+C_34QtWdA*>RqHfX)ZBU5(Ae}+kF`By_N@(qRh_{?0=
z!FqVCMe{09;^Nw(D9N-%3Z|yTJX5u~%_Ng@f!d<DI+b;3!IAx>FN9oJF)lW(DyzV;
z(9obn^$&wX^}xJil|V_Fz@%px^Sp(7xd6UidL%2TH4TvgYMFax{P9US-($V>Ppm5K
z$w^C&h%&x2-E>b@l^R^F(n&%jUyq1#rL9)!+%qIs$8@^6gGvdcK&NnL>Z6EPx@uN|
zR`t%5qX>V7s)I_tsusv2kcB@(KC3_@;$uy^=#a|KGln654tDss`=rg6ovMy!bLNJn
zg6z{7*hrP|GlnjIRfubGm==3MCpv+-c%h*XH0NZ-!@gnE479F?xJ(ua&X51cwOv{+
zT@dvOD0DuU+8b$X(W(Q(En_&$z(X_0<Fm%VmVh;hruEL&>+VBU&mSDDi+~_Kg%PNR
zhu73lATZyd_T)@3&}qfti}S!CBm#`(+xV#v%Tv~rWW$d;hNRl_9gwD4JJ2+FOlQ+W
z=gQc0SaZ`)2dMY=8)$QuDb{w)JARi!=o7=U@nQuXmqNV+#bwoO^lQHH!cYnEV!tN)
zWz}4Ca{h5ZC}N~!xq-!}>7j#rsaUVn6F*@7U2mbAI-(G|6>cY)=5)fsULBlvHsttT
zcVz3GyzK~&1M!l;1b<Aqf#-tk3|p}i?L0$HF#cIgks05DY_$sCUCs`PN;MPaMb+Iz
zXgOiiA45IGCe8WeYdigV1t&L%hMt2Pgjg@eq^t*OXw2>nAvTO1x9rxjGK{Tv-yQs@
zr`qJ6PE1!$-Q>TE_^7AW<R4Q)vX3{pk1vp#ve21P6+D@&VB*$daNZo%)Z5LzNOKDk
zavP%L7YW8oCPN7-_c3rr$8)mG>?NK_>~OTQEH3P^A%rY$OYy-~t!+yecndf_vy-;W
z%`(8`q)4s~V9h_5TWVaHRx6o_w1;8403zmYA##;7Dy5)fz;)bZd7IGM-ox||l9`PD
zKxPFrmM$~vGvT?w#CST-#dm!;o@|R|F27)rr1!Mp`Ih=D;rTVUeK-5ev#6j87=OH^
zHmB;@WyLzpk?AV1Np|$h#PS*i@nl76?L&&VOHLspIT&Yo+1flyeA(J8OL$qi`0jky
zgK0D}_K86{yS1>F*Y!YZ8%HY~9HVg0_Bi|Rb<#LPU`bB-CxLo{M1}S?N1SrOV*8th
zb2CKuIq-u?h29~PGc>)R4jr>gGgV+`9>~r3Ap`W8ekv)R=y^6{Z}%EY<A{WLQvi~u
zHJIC&|9*zbrSVuVI)vN5xn5A{Sk2hM+iaZUo&<C{e8ceG!e;x|R97=cX_Jc<(!ySx
zEt@0=N!9A~m?UFwX`cC#3yo&WnwN-X8lOre=S%L$ae#)NXTRGQO2K}&r+OKC9rJ%t
zH;?-0?&;c)HZ*de5>F0uv-8#8=Nt@pnvxMSX@v=$(Fig*nu(kxoTE0yKYO$}6XpA^
zI}GNEUepplkCYXD-T9aWoO>+tKMf|>BmXfbICozJQU14?)TsnDbs-A<we=}YV_)fZ
zP<bmZ<mN5)^MJ!!5OMa+<eqqmb&_N?4`{|4F!z-vS*^F>+wK~K@2;QdHpeD&Smjex
z|B0Ie!^^zM0#2KQdVY!fEh7YMy(p?1<K}$edrc?0wcM-=sX=|elG&{exb5=Es{h0q
znDpjEO&{;kUgbj`9xIy6&a~_mf1wPF{gOabI4E0kXXjq^!QHce@E1&DZsWs5IYVp!
za9e0*CsHwJ%oe6~Il^z&|E?Fb#`!$0^C&5`G+PzZ?|>vY+3V|xfLwsTPJgiWd2SUA
zyKQh~OXrOA!VM2i@|7Jok~o325))ZLU6nXjG%Rl;Ku)z`?=mh}a6o=t?uW{1Z~&Ol
zt1QeC{VD$zeBgXLx5gR#alW_4L}uZVW`BIfcS+-xsKU2Ub%DPeSg@CV0V`9W=Sj$w
zNrVDba1&<PapwOfdPkUBT%l|>9t4XA9U|u+W!27-;kd6*;okmy+U7+gJP>5P3Z}zq
z0tP?eCJykKbop-tCtU<5If3w9>+3yB^q$j9D!I0QSdtCV=17;UKNip%Zdw!2TiCZ}
zHFlEoj8qf+Q>GW|&{IGCz2OK}fr^k}oSVxkFCQ!DY93eWqT*XT0xFI9gFBFL@mBLf
z$ng4&iP5?j#{K@XQT&egZ9&C3jqzfP{gO7az)kr~RqUw#D3Yh?bT^4;0qQ!6)v
zd}(UCKwgl)k$FNT%9+Je>vsm`lJ<7|c_Q4%Jy0p$Wx%b3(rB~ZFur87eqI%#?p3l`
zJzvgarI+@7$jJL=7^X3M3e?@ei81YapS@q{&psn}UnSg<%?d-@k{DF<@wbhcT$!AP
ziMq}&6ZG|jGhY>QX~F|?Xja=S7@TF*QYoC|$?4+fsQ2T&%^8<FnpXFJbjc|F9}n-C
zk_$*-!{z+hOn9$-{MRFKww|hrB5Pq&83Ns}a`_2g0~Bd`z!SS?6ut!R>~|E>^#PsE
zs~s;AI~QMg&k^iaG0xhk%~IT$n|VeAFj!OZe){dIHis`nmJ)$6RNkI^Np1U8VW=E!
zX=v-~&#IplMZ;8TS5p;5w_k+BZb7%tDH8~qC7zN*G;Esn%yceo;+a!7wk$`gtQh!(
z_|}MhDU+Z$6Cv3zfB*Go^jk&l$2JvOr4OV=Dr&z8v)}z}Qc2<6_jWf_t1BmZ^3ZDx
zCF%x$%pjLszN+O_TvA+85+No#@WGhKSXYm*&&!}o@g<SmyLL1=SM-68q%R3D-;cij
zm4{FGjT^taJw18$-6Yp@Y>QMf`)32ZrwuOr>!v1dUKjf(#@Bvj`^AZfDPa^UX!p+@
zX<wlb$7U*M8KU-(XnP*>rwolG+hDOu&O>S;S+w`z(c_QEJCnzsmw#^hO<(hQdK`T}
zwAPh#5d*JXX53`NW^?p!;$J?U8nS$i{kendSvW{p*Mp%%I1!pTU_Z*&TcpyfPm00W
z;hXx1z~Ys1I5QVfL5F9mS_|4?^xy_)3};s}<4M39iq^2ZOv%n8CPLRcp}@X|82iMJ
zpV6dTf4$NU_gAM$#Yprs_p(k`5jGt=Vu<%??;0m-e5dzb$HPGQzoF9?rc>UZ*$6g}
zN;^-yUyS@<ACro$WzM4fWvAlVPicl`^r01{OU%}A7jpu~aP|tK+79o?cHBtmctaIs
z{lB@C4#X>5;mZuIb~P<~t6KWs@VP^Y*GV^|BnThvcOU+G+4_aR^_44IFY_U?M+%g@
zTL&esY_~jkC|MWN;#@gaHjThS2FLIHdTJvg4S@Xd|2+28{55v?8!k#<#6){=bt+wq
zaMfSNQnIYZbhAEkT`?Z{$*&?6Bjzu2K5zeTknBP@O9PocNh5%3VkZw1m_a6V*c$TB
z+Mi)LKL3o`TYBSq2TuL^){fyvkIJ}7es%Ap!TL_=QE2Ir>Az#i#Ez}?vGjy*+K(J5
z7Uu<I0OIF&S07S};pMKP`QrDsKeUyDtXCzm`xF;&`yaG{b_{BKiHzw31S1<!wwNf2
zys;8&pgp=8|Chq_4y}=K+Y+jI%2ccqa|yOjdMi5m&tuHF*+TQby#ROc=;MOBC@P-l
zcCz>q=*kc`=X-C*$uYM?y`Ro-R}K}`X$($5fMGaJSBqM4e*!?-cQ4-Sfb43UY!vt(
zEqT}NcDjAK*2L8|+9byRW+dm=Km~1js@JL|#G_y{C}a1%E9bXZ=TQ-A=^f>XoPpIR
zCM)c(Qe33r?dqiHJ6x1J=FeikUa`daewS~4&s!U%+51<6&%S7SjoI`DJ;0pP`svAl
z{g^of@Y3PW(}!6Gx_4Zb%7(Aagt}8a*OWw#=-JiZ4yL}+H?lQg!th#AOtuwRn|&6{
z`6G(?;}%Y{n$Avlg6Eoo$Pr!ECHJ?#n25QY7_w$h>-q$-9@P{bo{I{RYU^rzQY?d0
z^g`tqWYD@0$LcR^^n>J>OK8N<a?M{2uiOU4gCh~U1=tS42NzoRk0_r_Q;Gsu{(><v
zEAb;0iPX>+>A~zZ5o7{Yl0_C#{_e4TgvsL{ziG8U52~XTOY@tT3RpqNN=Eee4Am(R
z?@Wp?mh4JI#W2YNsUOop<9`)+3+?HI6h5^s(gwV(aG^Yp>uPnI6~w*iwSWozbA*Q8
zcrkPlreaQ-iwu8C<HEA-dT3Y9E-~}qOteGHW=t%L?@UCB5v$)G)0w~Js(Z5~aLQNF
zCbN6+@<z1#9+!Lc<vRBN()7Nx!sh-D;|YG1|Ksz@yM9!@SVLOY4;j8=*cD7%Q-emC
z&x4?i!~8Rvz-62H-n-|E-GZ}Cvb-c{FJ&Hgt3To<e24wNaB0niA4qyft@s*`{#(SW
zewh~=zn>keKk;J<75kU0khClHH*cu*fZH!xR$jah<J-njlcw3my8G3J_#nF{^Y8FE
zN&N?2kp;LDQ*Y8mWVIsf5HD4zazHYCe~ErRoc}k#hp1LM0@E)x74q5)$GQwC{|&Q*
zTjci;@nf*U{aJq-_0<gBe~2~vN`J!|zh>jH`uEwU-R-|lHku=^t2p2MJH>eP|AyS8
z!o;ZtUl5@R&HoQBbRU1f(5Rkd|I3X<ZS=P&TI-zWX-v`#{|Byb6kBt*|9>D$Y)Xs8
z`8_P9zf;NjqZyXM4Z1*w?drE4<Em68VvjT1`!<cuOF+C&@ptR$=?9`5r!|PSmwq$O
z?I{?jsA-{D*KS$Yn*L*bk6u)@`OB1R(G20xeh8L5Ux>|yMXSL$Sbl6C_q%KEcf!mD
zJs<O!EOq-RZ@Eci5A(jOw-!!ega6nxuF6#U=`rBWVK7QR)bKAc$&*@91dl!|;>W1W
z2l(&26ZJoSOgm!#DU<Og0fxUCa8Ycv8~c)Nbp%2`56?aD1eP3jFxcs1^IeXrsz`>|
zt3L<VVCqebh_yxp8{*sl6h$Uw0RG#`cv(BYdBr^oB1TWg)2RPLT(4o_8f`Za!ho+n
zA(Y~-VC)~38TbqF{$YF`53^e{LKI04O718`;~$s4+w}#vu{1Tu8+mjWe4#wCOlj=G
zmvo62uM;nxa}_+YqRp*)o!e9&uK#)zkT0h730>s<>W>{0N;zx={4R}dS3+){Hl-~e
zH)8Hgx#3dzIo4meaKurA#7hS^GVUZPtfWu}fh`e-mtN|)Gx#!ZYTmu67fK)<r3*ia
zTotqY$93Nh{TCLm6jauVK2Y-CbUh+1^3P#-;QH4Cw1z8yT&Ae!inNxaN`Z7*RSlTS
zih-<JRaY<i*jccW@wxqeUM?5;>AuYsqGgJI9!>XoEpjES`moc$clGQ6rf<)KeZ_-?
zh8jNz`L3}AhDLB69->jVws(>2%H^|lkoaCc!GruEf&7_4<YAUZ{H`XCmOL!gUal@I
zT|Rt!8roVjhgZ+p+-2JQCBo<M5SJRWPij@j_c<Qks(}@gq`*<xXX@g<4mID)7kHAZ
ztBhjW<LcVqzF-!|f)CZDWi+I5mpi+Re0{O-Tvk`v#hSw+q!BUy!htGhugmpW<QZ`#
zHI(n^U(`H%0K!KInIeJw<7}*)<@(++nMy{_B&_}Kxy7uX5z)UIX^YCA|L?i8Gu#ER
zt~iz<XLJ+0CW`E)uG~XXxW8*4rN1<%ujc0OaayLD=a}r4l5$xuGR+j{lp)K(aTh}?
zHPPr{@8vt>J}(+oWDa8LzKr+C9(3D}!1&c-A6MnD=ec(GQfCD_%=1!eCvNS|gUhg<
zNrI#Yo^xqTm>sj`p%KUtJtlY5@c&Xo(GX?d5cKFt#p($<WXZ9XmDH;R$YM|2Gu-ER
z5ePag^?bR`%4u2F5_`P>TCff{V{Fi6|4yOJ>ZcwQD!|4W<$4^M{?-EuI8H5MIXN`C
zLc|um>FpDY-n7rSY?_xx)+Gf}?4H#G6P20CA?xhLp=9nz3;&Lh)xmj??dhLPQ}n?z
z`>a43fo`mg2!xaSBF9zbwXC!D`p%t!zXT&*N$%-#>@wqcQ{IK&X{*Mpd!pgnIxdFH
z=EZ`>fj#@USapZOc^Tbd`9co8Y@S**=Bt+t9t8OSbrBQim_Ft^%=`n|gjSWfj;YkF
zOXofZGAH!8Tg<8p6Mt47KQ*Y<0efC6c2={-oeD~MWa=VHa=UP#e^ZXxugs8b1}lXv
zh~4LqK|Cm&CfS|B)FZ+_hK=ginUD|R8?C|=YkPyW5A;6X+SKdi$3ofQcn3U8nKJyx
zTpdp+9<I#Kji1ZBO43BiezYq%kIYyhgD8q&jdh%Gk(+)ZvA~WEbK?RE8|T(WL+ncm
zO2pHjJ$>@mMf+Jt=*zFa`I17b)Nu!8{^~`jzhIH^DtgTSn|LZ*M4bd76=e75qR6p~
zeG%)T<L>W!iHEzOm5;vYz%ckj1{565c3Fr~#<N5VOy#hTTvWD2Oa4|6MHfuwaov`}
zwj#@GQX;G5x^)m?#b~sFRf+E_w9-m}(n?D|1|aKL72PPa5-=e<nPe@%`a6$}+ig*E
zk*0O#iK33clOy9U?K-lbRoJ4z)6aV^NaK0p)NhMciwWt5uig&gs5``lay>`ydP$1e
z^_nVN20ixE_rv0OE4Ht46{J7gHT%ELn3J(Z%&g+*KW4QcYJO4j25b3q-Ku!9Q)wuq
z0(ZLWrsA>0C%jd<OHzSOm_>%6jbp^~-A`E+SUD9GmK7N1(zS*}OqveLFG|gLe@&;^
z2><*dGX6Zh{QydRf*!xFT`Npwdz^N|M{)wZ;clQw{L6UI!*x6C`#<|w;J{}(MfCAv
zm#lZUQkqeity_136)yz3RA=+OGq}sC!UBaJGp+^*p%t!{4%YtoMVJ)MqyAy9mPLlB
z2ow4XG;g)^C<w~(ue;6&y(wJ>;l677<fV5u3wj{&bACz+4@Vp)U(~mGC!ckHA74V#
zja%i>vCnf{7(Zpc&r=byi0X&a-LUqzV(Nz4+b2w_^kkZT!)*DDhyCchsVmrL_5k;0
z$)Zhk*9T5dibr5h%Vi4SQ4R%h;S_XXr@p8*7Z?t0IOG|Ax<6s?2;5#$@sb#%^SiBB
zf7{vaMcFU@n$Un>?0F>@(p_(!%D2Dw#gy-uJjDLL{P7N7#YVi~!vFVsRizfVX}&H!
z8Z_Vo?$@b+&ewXSw8!uPs<nsQy_Km9;UHn!e;V|JB|Id4HQVYEPI@mK9&&y-Mtwim
zB^huDz(eh>W{cleNBn;v!b7l9&Y9P5m5Cmb0Ueg2`A$C(b9gM_$?!m{hv?rlwZ7kE
zxJP|Yxi?e5IITuJnqyNW&Y%;FA>SCfv#xOStrz|mPsEHLQ;%M#ao=iTqX@fr{d^hl
z#x_>d#KaOK0LO>#Jr1Qj4w*d;-+COre1Mwz_am2;)u)(;kI{7Lh^osK18D${=V3F)
z+@9XpPE;~jedF!ir{IYDv+*|S>lqo8|F#+49rAaG`2T}gg0>BPa#|cA&t8G?=Ul7V
z3=!gxemtH*?O**`A9y!qIwET|Dy!wJpylktt6eVoM80^@)z9m-@k6f#ed?=5rdbj0
z^Q_Wi5kD#_#+hh}O_8ID+jo4)G|`H~2TjLTNmKF@@0?$ir6H#;uOCIP5;q4Kkm1j8
zAHZ@bi*r42-31i*q&aKkIj2$@*1YHM>NgImz&V8pQ|^OmA5I&h9a_|hkg~7ftt*qZ
zr~u}Wdy_^(weRj%2a`5pDzj9AlMYJ9t0u9lf6^GNhpIDM;v5^|9n(@S<JSUFGfcup
zacXsM?^P>lZIImPsJiFzktNJ-IN1U*g!>g@GsPa@`?^+$9<2Xy4%s?t)-K+3QustT
zyETWe;f0ERf1#s9el%A$$Kj60#Jh)z(Ert0fBMGt4fcoASyk<CRO7x`Z;}HHBXDnD
zW?g-sx5s|T-20C7*%yoF*kKmmaQ`^~zJE?p>!>bVnbSJlX7)WL6!XKp7ezbB`V3S3
zaC<LD>J<<TtZIm%JAMkxLpd<efPB>Jt`=_|ZCSIg`)6w|vR_aW+d$ZJ;%jyIY;8q&
z@j`x5F+nvvld=e4+%p;l#R<1A^gq5#NeP=6a{?kMw)@sU%*YIsayb*ukUcb@ZAosC
zY8>KpGO1Ab-Be!TdN3*q*h$~!aGbU)yBEO>OJ1j=#Hi@3En2eMcyfAl)qOq3>}dbN
z;=1jt^p3S6gBH*G4+*crXbm=A%1x-RaW#Ty#IMFYzs<+*bF6?;XxE%oYDiXJ?EfNm
zx{el7*CkqCjX65(@!hZ#bLXaRAWbliy<jgG^JbYrk*L+oMrK*k8!r4-jf$4ssV3kq
zVa;=Md8)pa0evB_*B|6tT90`elx~<k?h+Y1c4y!A=30CkC4g0yoDX}%4lL+-VkjmT
zBV{P6;ZD-CTkV~B*2Y;upxDZKx<2?wBQ0G@-cj=en?D9bX3ORlGF@cQUEYuH6Di|L
zI$B-D`3|43OD;!7G*c-6s_#u}D8|;<UrtH0ni_tw)8~s03Ue1iS~gzNAy;i}1F^&f
zl*BTe5Sy|t_>e?#J?a#X-_~))fbm*<40`fsgVCa;xa0UDG3e9S`p<5z0v`$%uGe4+
z`$Vsk^=^frtlrRxj)`XjtmEnrPGU>(FHFL2&gsHO?nD-m8VyQGs-=x*Q<tga5k(qd
z|Co?G&yT%<Z!3>$LA1R=@#zt-Ahc01`t5fiR0acC!rrSw-->+Irx@0VURhDnTG`NA
zk*zIVcXJ{x$2jGg2hya0GNKQWz+Ie&vK2pULUf_#$4mIVSxO0gNh$g+Hz1!#f84T#
zno<vX29}t8#8+QJ>M!?Vciv)SxUnU?*w!}J`5yZ83aLA}u{)&qJPEqT1BoMYJ?}nx
z(W*b{gTCM;`twUr?8Q@ymwOD!w+!k@Kdovg6Ym;ILc4Fji1ktGPQ39}UvrM?Y8am2
zKJ)5~(X$Seiv>p>5hX9SpnL{z>}+Vtwrl=Ga{nwyuC94iNQBDRaxxTJK~dwcvs(Vw
zw%o0>y?Wi3(cRFTCOc;8us-utyElI<gsj`2Pkg3Unpi)GG<a-V&(6)-&Q?cn%WhMk
z$2)mYttx4iwcI1(#GC^9usEq_9eJH1FSC5CkQkF*ufb;(C%~ItQp9am!$sWAnj`nC
z#*S=utn$;k+ofs^7x*jkjZO{U<X_+J{?bw2tH63`n;t2zCdL|qo&d^?YKznFs82~s
zQsiJx9<CQxG)=jrh_td7Wu@b9#pm>;JgB1q`%t>n!R$yShYgkD0y5b*lz&gCFG-hx
zj)IP{PmE>vEG@g_AOp{fsxLb|<lkrIY%nM;1B-6(_3GShSpk`{CTga?+GuSzbLK<)
zVjt?&X$q=>sC);6=(Qr1Ku2_^uln+Vmey#;Vtus!?6x+h#Rke?x5XW1d1LcgN-BB$
zwcGrk(G6cFOB=NV_Y8hdD84D{{wOyg?VRJHWjDo<=bbqpo!&riRsScu@KUzd_1nBx
zQA@-;U<1v&RPx$#`bbXKm@@0vlw7a-w`s)qJhh>y)ia&9ZhCKk)vSu^Nhfw!v{wFI
zV>yb2Jsz=iM!jLGWge$lPG9|C>d<4QZb7Xzrn@U%qTFtc`BMP1JOzxq33b}+drdzU
zHQ#m|_{xT0EUc>SSggJeweIm6c%N~imUP8ACVFv+b&^%@KIcdLLEaCakA<sMv6H)a
z+)xLhEyHWvaf2`}>43}|mkh!EjZ4NL=o?SF-WNm^8<!1KQjc}>51zchv(U~%v7D+W
z18-}KJzCM>4sNX+aHJd1+ZQwVV;j7fp8ob>)gTW?R!Kyh^{Dn&#T=1!R;7Wi8iTB(
zomjQPX^UKj83!q+(V2W@@lhx|=qXBX;3IuT<+V={MKYNK5kAw$9&pyk<(=qMoXEqz
zK$IX}`*mX4=6m$KTZ%PCVF94QNwB<`-sMHl8M!fd;B4L)Jk;QC3?4f}U!?Krs0O`j
z8ptero;;=6`x4-dOAht@spmQdW!Cz_!{1<vWv^9$YlF}%kgy^DHg^l(VAz8EU>Xy;
zB@^uv8sv?Cx$a8fVU#iEMZoJD&z%IoeuDx%PpiLRp+5=yczVA(&>dt}tf9|iZ<2Ja
zHuJ1$+UoxPqe;aAm}oV`$4R}UvLg6nkShepbOAG@t2r0lzm_y+oN{IwyfvG_EL+@$
zO@shi4&Syai}j)GM&7#1T*6EQql%W*fl)McKfLMJwCRc=KS<WXWFc++YrfPIg`yf|
z3>sx=_>nYpIo|Z-IuGl4J|9Wv%$vokizW0=Wfkcr7EKQ`(MWd18jr*p55yXO)L!$j
zITN*g8>PW-p^M;UJ&Q$Tf~6YLlR#*YE8_@vFJ`?2_cP|1PdWtM4Gq5KggdD6F?kJK
zh!J(Z5sA)`LQN-QQHy32jM%Wmi&*fZNzdLqon`5{VEmHn{@#@F)g#jlDG2c<x8RYI
zm)?DhX7*dftG3V@R=4vAwUYaz4}AkJ;}Y5>uMTd_>O}jSH0w7?!d{LX=EZeq`|=a}
z#N5ad9DVZrR*m<2)0sFS#%&F3*{sh1By{LOmk8>2_;IPZOPDB^G!ytePSoIa{GT-k
zL!GjR0fRa-<-`k+pN6d8b0HJ^BoPo?L&oo5uZhOz54~9=2PQ<jtnSj3qG*H=P&3zD
zZ0F&p1Km0*E@&xWe;E2GF(u%@f7*NPH4pLEx<lU+X-QjFZZd6?Q=4!eN0ru%J5RaH
zxXd{JIhZ<_cAhrcDXTF$tGy1I&pyRCJvYQ^$z9fHk~zrnnb@CdLMGVz?1-OI&oB9_
zwbV^(9u(sJ*@s4K=Rz>fNX}AfsIQ*|HU2Cfn*w}%dMbCFG5=az2_+k#i&BPxM#gjf
z<7>4g3*Zh0{=e_D&f|kMB}-Zgvu9#za2kI|-@Hd?+Ee7ycg-_aC9#zRu@4O$&l9Jy
zmDm~Cup1aHFe3;34YdWn>DhB%iTXi)vV)Un>|uip$0AN`jx69A+(Fx8hP?_$4ws4e
zL8FI!``msR%_>8Q?ABQ9X0<<@YM%JDIf97>ajP=C6$=LW*g0+nIvT%qZMrTc(QQ0m
z`Z2kQz4ViH19R!S%Rt@fw?Mttk|y`K#iEE!h6W|@TfD)*mRd2o<a^PF@b=CO#b#Xi
zb(y?JZo~2@sd?-)4OcIz3$i8lA{OV`bDhGf`o5}iM5vUL%N-}bjmY&${JmE@PW`oD
z=%3K4>w@<4cIO}ma~JbAYMhN{?x(x$-`nwn%w7w+Dx(Cx(OjxZz7!%80@2`P;$}iU
zp2DBP&u-so{Cdy%+?em_wc8|5;wi7~$Lbnt!J%{_Pa9pH5~YvRJRzJ^CwN;l8$??;
zTU1qzbG?p?V162b?4L6kW9ef<s#`Hz`g1**=BnR9cp1|YZ3I2ig;gZ%N4emGeZy#5
zPtxo;_(%r%JSrRX&jnI4430mXP<gaaS_Yd3vnix!r)LiUhB=3+j)gr^$dVIBQppmv
zMuN8L+|&K0h7);*32ZAftzIalD6tO}Gp87wl@%NxOvmYZG%Phj8+GK%B#nIfm!{3i
zrrool)Ap<$r8i|an{SYhk&pdm5$0as6-u3TeGJUcYWx+){5#2MvC1r_pZiVt50cw#
zPoGTNbyf80Tj5Nmn~(50I@IznFgc7sw#YY2{iXwRn<4#%)^Cl5%Dm*|9CFXXCdTLO
z_xQ>o=lYCslKO?ESn{kTvzonk-a$>R6U)waf;`k!;7TyR$ehYvfjr<wzA0^@r*+QR
zS<Dh<53~2IKBz(2d)67vv&T_;J2b@|)dyuSjc(gWT1#5z%`NRx%e&lsXaY~bsBQ6h
z(dWp#!?|aJ2ZQHHEfZ?Z`2Orwo(oxh74HC=i`l5vGE(I3?wnJue+&R+eQY-8tV^M=
z+9f*2v?>kybH7Xq$gWkRSl{zF7q*iA)(%j{CYS0;rL6fJ&rQFL&PvYvIr&R+Tv$hO
z2ckE}e8gOZZn9x)$GmyV)2or(MPw;!zt<k_R#^CbQ*2etqIoY*xFyafQ$m$yT^#U@
zw_kr9+asLJA!~K?y;T<Pyx-Dg`_=f>qR{TH>p3~FdQ8|c(`_`OnDj_JYrOm2$oR%M
zYv|E$A8CpwC(;=*j8BzJhRO&p2U`a3lwL159Y|0M!6jDDxxY9|`g}4Nq}ssc9%}4b
zw6I0<slUlg+ihKm7j$SOSc<ZJ>Qf~Ob=+3<AxYJ105^iuXPRObXGSO%llP6b%J!S~
zMNYggf-ZtioG+eUJgv^Z`F_(dH+VO{WLxV&=iI%!vG19?)#W^VH@%CXc5UOO@o4iI
zOZ&2j<y<Me7#9&H%u^cSb+S`|FE9e=^cHfIpy8lW73d3N4P#By>eT6k4ue(^tBqT+
zu6M3M9;Uw5zInouVl20r4X<ydFX=tpd^LTe8^pxe#g>fmfkZ%?a?G(8W85(8Hp<{0
zV=7ZpQ~TD8JVyF7Wa1!$4qw~*;&jij2H_pYKWH8q<5Z_jFZZ2n_VID#RF2H9bEk_b
zy=3`x8=7GrbFbX^70Nbrih2)ssG;9QvJW}xHvG#6*V`}@C9<vzf>8EAksf3S*dS#G
z9*}xnvaS+w5#nx3BmVZL<#vrGlVsDogm$^f`dF2A7@nCOe-zQwpqy=1ayO_28}F3b
zKi>w&9SPo6e)RJHfQ--%y_Tioxbi#(w7_L*uXUaD;8SLD-x?g=+TU&a-_D3=75aG>
z+O38tjNazzbC{Tv5@V6vA7P_F=War!mNk_L36DK>gWRwV?BAB(n>cBT8c3E>1>S7l
zm=&1noCvi(Vn&ejtK+OJC2T$H3EZS(%)Sp=h+tyf);c3vUlG2nzL+#bG2NVjmBZQ#
zF~2AI3bt;<DlEWmD+EHCSrkVz*GAQNE&`@5w(m4E?FY3k6%JS=kG>v<Fx^#OqajEG
zhHor-@7l;zC@=2ohrw}8dB(ep+1K|Qlkm1OoO`|xNt^~B?McSQben?n;nkA!3o6|-
zA^E3BmPyzIViG1+kUcHA2F48;x!)a(hpA_XYK<{2x15aK7~Jg%P66VI>R!1yWTO`E
zmn0DoS-zk+zq>1%8$pPA(KJKOmWwe-%E#3W7c-p{kjX0xXf3Mfytq5W=j!`<+dcfE
z?Kg@p#XXJ1k}3%0a<OthRD!dzEXyfr-TXAglzzs+xL`E7+5WKcm@(H_(|s{;=Zq5}
z*x-2gW6mZylR+fwKL0U+WI_pnY{gxa_$C<P5p|;VDy5_Z4~kB7#Cy#-#d$w*Q<=ka
zgp!)*g}tGNR}J7GgQ;Us8im^m2aG@b6r-AC^_Ia*!YN(5PIsH7*D|OmBJr4OsT{So
zAMf|G&B}B!LCbeN*QjR9r$vR-nIWK!YdC^;3pMmZO)>#S5_}qr4PlMZ?bMTIriU-<
zIoPenneC#+H^=(?EQ15Jopc<aCJ<(3aTMSzn<1KA*QJP^<v4e;(N|xLuJ_2fIduh<
zL3Sj<>J#Qm;#IHe_;QB+UU${MQe&4;Fmh?FqTYxS$~R$(EcPP0b346=e%zWu@+FXj
z6CK%d%EwJ8Nlip{O)ahvJMM92%xyR}dD4T`DoA2=%6o0Vt;+c)rrQ^m5%c0fN<7?p
z!!0-cjlWxY?t=uI?Lhf4K6jsQpRdAYxo(tSXjN#{n!}0HiPKTsRcnKkBhN)@Ko7B2
znxf;0c-HKGfCxThzj8j>5jGj+a^#5@eb-t@fDxp|=>D)A2R<whucJ3W)$SutEB?@o
zjJ&jILWct4Q}Ha_!3$5rkFd#)Hl<<Hv}KtGJ2jZCg_DT&9aVe1^{u4o-k;v@kai2T
zxn8v<q8|dfIjkOLi7pWBMjPTaMVqJ~ZS?9K>_2)_H36slT6<I{uGi<Oq3CO)rsLr&
zqrLWBfvEia8{m72{Qz2ax+LoKqPXL%c!4hVJ>5Uk4d5oQLx_*Z;z5mCAo#b_MN4??
z!Ug%fDa^qRc5N!Na1emQ-uEDI|0P_R%80H8;Or$G@Ym476zqCh6UI5NC8TE(KYw_$
zq*(k-Tf^taD80$}9ZJPQ&$7oZiYT|*vS6LbHgc|XN7xb{Kzvfw{i-sS4U&0g%`lxf
z;BPoq9+0iG`Bpc%tN8l!iCJ{y^C+rjPu)3H?K9^}60c--+^RlzeGf>thTCVVkRN(Z
zM=e6Eh)9oSMD)p8P7&i;X+P@szR`+f=`<?ar{$()w`B&{#I2X&2KYm26-o9++w;4p
za3?8I`Y=6ccyw1P?rpNCH2uXSvxvJQ%T@CFdqD;1AB%}bKi<CeL&45=#V3^szi7cx
zqX6-MJ-A6-%Y}a%@}@cEK=wFRd5a8m5}PDOM{m3FGWz9JYRV6|4dK?NZ8VWlH*qwv
zQIB^qvv(vFA!BQIpvu}+djAQ3Jlp{X2!0`EmK#P#89gn9Vn(mlZMV!NPi-PTzC2<!
z^|7MvI}}sx7TcG>IcrTFiQ~$+q05dDUW$Z!@@$>j{xLQ(Hv8_n=f1}<3F-6t)Liaj
z5&`;&#F)Xy^r7;~`ZZsk&3!|0xC<VtmP`YL@QmGC1Mdb>?@}~kp>qgj(~=ly*uxkt
zN<8%V2<Jo3y2dV#WG^CvY3MZ6+Gu7)2o$lNhk-DS$n6<Xh|$Y>{3hFrAU=(YlDAIX
zq$B~F(peU)D`{`Oxxy5^k<0WWffh$$BCF`3aEx?XQ|a%+MW-c|PIB9&FQ)sDeYv5F
z)%PcB@0(~K^7?cuoRx~|5}&Nb`uczcqlLeQ^^q2*qND!L0%+=}BLJWIA47l@z;|hk
zWgiZeYZ<KGYLzk{u3WxN5RsS|<#QmnSE(!Ll3YlS^WM_;P<IFN;{j7juZnJ}&UjiQ
z(59sWdBFxQmYFeSJ(sG>Rm+`4_f9P_6Xg{W?=rIWQ`4pkiMR63(jfR{K^9K=BGUp$
zr9ei}GVyH(FeE{<6kK+9$<S7bs~&_jLK>a=j*EQdrm4lv-N2j1n+@m|d4keFGReCS
zU*qr5)Wzp+W|wA<AHP68lQWVulFYqGoc=K7#ino<qO<VJI*g&>9fAI@7i{Cti0z(W
z@r9zLw?9!a4`s-EtU~>n${~UNH&>W3rq+9{ROwf#JWp!7wG{C&4c=o^l``(!4L5}b
zE|0ZDFiHQ$-odnCqGmdwmmA7*XYA<*Z|H3#Hf?Z3CpT?6?=I0@sAn#+UTFSUWNeeV
z|0MS57&hplvGq$Vk_R?)@LiHE<kvGlS!$`Y++)%~%9Os!3r4@s&BuA3OrwR;NV_nz
z3XW{7>BBTaG{S07!Pwr3I!^WsT<*ipiN-5ec5)!885=#5XRiJ)qbH6zoG)zbuk5vA
z!SP_+c@R=bsuc*P9-O&=Dp1d2NLJth;enImL!L0~sq~$UsGoqFtzRQ;SNE#?gycWU
z$G_1Ml50MDiOhb(m-$IDYvN(po~qQn!OE{uA-gANX41hDEst*)ENUO)q|mlulNL=j
zwM$V6T{6FXsCb|QKe-TCf7nw0iyPNZxA86^{X{pe_=jhEZd{>m>h`OKLRU4*hAy*j
z3?&?Y3lMsz3SfI@3d9G#8$iD2Qfv(UQP;tebjqXHNJig9N?_F;T$xQvhLKB(k=rx2
z2EYV7iyb#<jLG>FBydc<B=g<hrA3f_r|vMz5%cB;*^NE&?lkxiez@d>iIGT#;edR4
zYQ$Xm1j8p0_nDZoK*hl`Khm>Tq-T~;10%|8s<L<?e^G<1_kh^yvY}8n4s&Cl4Wh=K
zSZ9V@qfD<dFLp7^B~PS3(tp-;8ixtQHQKXHJhZ)K;i>ef@r>~!P6c}Vvi(x@ah*|J
z7j6}L2TNWx*^bzoJPKQ!FL3JDSp%d1qaIyu#5KF*U<#cH^RZMcg_HQFW@(X(@2Y^j
z6aDqFqoT8-Gg2|kGe>!GGU=A6Y1vDn^SRid7qR{#xxgWR+@it=b(e7dr&npy%CG0y
z9Eb;5DW}LV2v}DCYLBNFBy=?0t@P7v-%xcBa$SpToZW@UX6Y94SpUjce`@)b^jLql
zSpTC~|A$ZfbG>{`4Iw=5;W_W&KBxcs@OyBn0%8+uZ5{G3CI)#IwUnL5?tnKIN@DaI
zXGu>ont>WXjl(3r!Tu2;`euR_wM6sosPC{x4zwFS$1ml$TV=Vbx*r+D(}pPDzh7-n
zN)#F;Dv-VQ%vN1}M-bCVs*ppfVEO5_MnCJbsB`EkRF*7^io4a?jOP|2Gpj+u&a@k8
zxf^M|8)>~8`NUf+LZI-bAa~jk=9rZIbSlzv>gC%%c*VTnG9QzlAC|VaU~5;X5XVt!
z09f(2&w?t659mZJupHdzbf68W7dkc_JZNJ&NZdoHpEj8FWK+gCdwL5sl>jG%J=!(_
z+nG6tgu_l}RUcU(;!M!$K+AqtD(`N&Av*<R2S9;g!mvu5D|}Y!Ucsg=;ZNFbwqVV-
z<XWI}`K0-Gh-L+q4QT-BP%cu^?nFDbz`gCqeP=6J&BwcA&Ii#T4F6WWbVV7U+JNa+
zZuC%tq4;3xh#(!BJW67|>n|6Ka=9BEoz@z&H!a#b)pH%ycZlarG<!(9c6gzq36IDm
z`F`_SyCBdOM0be+Uw03IT+a=7c=mA~Kn=J69{H~lj;~&Ol?SrIy)F5Y@mWsqfO`i+
zfHI%V2IDiq;Tpv~5rs^G_kGX?4mg};Gx(r!3N?F<L30AA6xJHcbx)YAsry21g*z>h
zpc@k6pd4F<sYa%y*7TTS^qPF_3QW{y4<0OHp$PyOporj30IBWkcd@)nei<!h#n6^x
zM3CyO#nTEmij#<?VHMJmsq_uUE@ZPh=xJcfo=l$4L;fm6nuC0mFN>sL6Qx53#wg=^
zt<}A)QZEThRO9QI71sF>(EWQgva4m51794kdxL>Q_^|hJRs_@_7+@DxM-}K&w7x%8
z5}#Mz0Q6_~%&sQPwpkAo)f|F<G3@(fzS(^E6w@TT?4iJldk+NPD{d2;9R-(;J2Dmw
zRFVh|k~VRXusPrY_$60$%#7$SwQBQjP+s3?kN9sH-HL-Kr|X=4yWmVWbQ$!tU1<DK
zs}8?n+Dj6^TM@cs@bvnC;htpXB|^R!aF1NzocUd13T6W^M+Y1=wh)A1sICHy(gJ{6
zc6}3pp_NF_1@NKc9`lXb?L5IT%*$U=2QK0d5d@g?vpzehBt<42A?sovfs4N`VvXjQ
z$6q9$@U|{TnIT3zNS9bgDc%jwNFV$Z_4u$vbJ{pa_gFa8awzoGCflXauLFfQna@on
zu|)GnE;45_eWaawi+tXM5P7<0A`*TgSDCyXs@>~RH#zDx9M|*ocau#>sGrdJCcNLi
z6z+~(oZ0C(SbpQh4}QKQvB4muRWZFpVxRw~h5ZtJ+5HyIHE&$lowm6_SBI@|#s+w~
z-j%B#XS*5@+?R&H?w!9bYeetQcC6%urQz=i*Vw-m8VZJPIRnIV23Ni>6w{KdWsN3u
zc%H{LC@w@?`GV_Nix4b5NKd(qp<y2i*oFF4e9V^4f&Nu)ZLyx)6>fpcsMxD}*}}z;
zGX+N-x`)_~C2mLv!|jF{{IgsmbdKc&j}nJv@ERN3=9b@-v(b7&13t;*?erT@8@@{5
zh5?F2ZKRRv-#G7A*kt{0Zm(B={`u@vm8nNzCel>#>i9Ym=1H<;fd<FDM~q6j#(yca
z`UKraQVRLx&(olV=)}scu7pS*)P~1rj~Agb*RMP?c*h%-4A$6aI}Q{x145_ozQnfs
zEeVJFtY<Z4DrNe&Phi6(;PEHhto<~|%o$&qRKnPgtowqQ23`kYi<!bCzi{{iW-v6U
zZqJj<vM&x?r*Eq;Q4JLGCus4)67VSDQre#A_w@aql^5c9uzJ0-gk`SfJ{FpW6}=eL
z@X_sgc&=0GnrerrNmf%w{PgNJ)_y?59?59NDEnwvzxci*=JjYL_+m7q8M=Gqx_Q%l
z?^A~vAFzFUP%gIAvQOfURV8zH0IoooeBvbrG%8&AAc32)?xtHFH)h!ZYRR>iSbqi$
zoK~daBgR>b9Nq6n#}%IMv^$U8G~c3KCy!1IrI^>EHt9Ayoy^4}IA;Y3yr|dfU-sS5
z_=>L?+tT@t#hZ_eCsJ*~<#dm{!gbn=8eVIAOP$@^<<@!lp|2$g1fSUVA55)QpCObr
zxL)A`CqVVYyH_ojsbhP+S`ul{gA`<TmzL?Mv#DsX-_O$9%*DB0Z_~!+>y~?v(Qboo
z>3uSzrVoD+@(<DXmJ{LymrrAWX7Z!9+RzKs;lh}*HjB@0o8-^xg;Cr$V_rIvd)1nN
z-I#;dmz#4^t(F?x)nbKZ+NQc$C5kyluETdPjCA&?WpQNH0PO~?uTB^<4?J3%J~=rR
zj+1RpO1MpZ&34z+8<&(AS>}b50dwa~O{>NF0GD+UVw*v=>#f23<%$Q?*ZCvIl|^uf
zDp|5WOG#Lxq6ddY6ED<n*}(XSQlF`iRBAhOtB+{i&TY^)8yp;>gUIg6k{%^*WjV6w
z324Q59ylq6WDNowoi4ka2F(K!$J7BN*$_=uVXMb&M)6jhk=Q`kdP;N7SFM}s0Miq`
zv$4(l2nfPYNmb$V!Cd%gI!Z$1tAvs!^d^q0t<^(fl`q-;bK*XZiOM{&Sb<;prcOv6
z;F7X~)iF88tFX-@`@{!8jzrg9oLu+xF$LKU4WUSTOM5RD?t~cCd~}3;<&N}P4@Lut
z;KqWj3JhO|N8cPq$*d?)WX83<{$5QZ;Wbhp=RwyNLd0v{GCAX?*W$D6c8@4k2Mqf?
zl)Il_Hr@wMN0xgXXKAR`jf;+2pyRIHo{TyIn6^5!D<D1kemYTdqYk5;<Hcp8qkWp>
zW**f0*cN7o!QCHMpkpAI4Ct-)BAWK5@wXGJtI1m^`N8C9Oe7TeqQ$5uNX;hGtOpj_
zc-`18yn7pi^(=cHZ^`0idQVv~44DnN(*yQOKvMQLBZ->#Zvjql_bkk|Q$g*WQkgL6
z&bd<YiM)MS1MuWSQs`_o?#vx#={l>a$}3R*33I&qS>4>+n(^m#o0lIL>&qk7v?;SX
zQj2Z3?^u7c5`AU?rrM&E+D?QomofViyf{43|LkmUy!P@z*Q_Bh{xZbRr}(W=XfKl@
zb374e^k$Ty)kvDE@}`2p39kn{Qp63;9I2l=$X*h<tG5-YwlFL8ZPV{@*V!-KCK_Cn
z76;+vSF0*NUDmp)4p1$3Yi3M1R&3~QnDo;rvrujKd$&1`sApM1KX}}9LfI?Y&D;&=
z+z=FSmmwAWAAG%Ikflw~HaKnD*0kL{ZBE;^ZQHhO+qP}nHmB|GJ<t2?yZdKjBQm3|
zyfRPKiKr8GpQyVklVC6T9py8nQx-h8$TwU0?*!YoNp`28b(V0H>qgH@PWu$4^Ya59
zPz3outAq6}+*|D~&2PG0`Q`yim_m%EXIM^`Zb43$!gC2}GnO+}l~SCU3nxSMY@4hV
z%}Ys?Caxx~v)U@jmn4^1)|e8r!{+fny5)=W2E{U>wv^7C8U;(|XFpB8<^+pe#L`W9
z%EW?33l;#U<#6$Vf<`I8t7u<*CX*YUe_*pg5fW*z@tZ7++&nUk<YW&1&XqB@2Q_l&
z^0C=Xu}ge6jP$^olQ1RH^_=U@|JAPt;|HnotWJ%VH=o3yT1|3v$fG{TXQ}w6@X#X&
zUzj{;0GFeRL9~j&^z7;BJ!2T!k+pKd{Yi>P@u9v#3$t=UxUNDA^>SSJvinK139~yl
zr*OE!^tg$=_`QUbVwtVF{nB}LQ^OWu!pTY4Zd^;8OXHXcVa9I|l8l6{-pF~_YUJ0C
zuhnc<KJGLjN!&_P%L6F>^KNsJ<ef~`k=OoLaZfSxF7sGGDPZe{1-Hc#Y5)iUfD;M;
z$N_*I6>QrLE{Q{FP3vctQL|i1XSRHR?kil_49EfWzn0ZBtx-5_!5KwfVpUp*TAMAt
z$Z1=Mr}&Z#JDng6M+l|(jEqh`!lt0Qee+Yph*FqsUY@ceyfV2U^DG~ATPR1t31#@=
zjUA~jm{zFVdLsqX+|#-i(>in$=>UjpBnHXYB@2&!WNqA~0n>7|S#fAk!d{xY0Cjc9
zt+<U0zg2UMb_hknRa#1^!}379xRWfWfolzK*yul`(qU;(v7n7IOS#R$sC`}&uKvvL
z8JAY=v^unVrT@%u7oebvLQnO?!l+ta8}9x*_&jLP!?S}=uhT-%N*)F#;Vs=&u(Uc<
z`5!0;0L|yh=S_p~?j;@Soz~UO+;HjV(`O8W$nNzW+Llev>)T<U;t8@CDrHvtDVME`
zSNZaWa1@DT>CY;qb*E{(+nVB!B>YVLYZ-gC6~nFRt?4Hr*A6c2+H0F*o8ndbR+rYr
z%Z|qF?8lIsnVT7piq6Up=r3YdHdj}kY47F_4W4c9#1E71<jy`GCEM%r!}3~Fz<hKU
zD!4AzDp^>2uobVAU{E%*cB_>)pBE-+WPaDRroO0n=XhAi`bq-0pOh5XRaL5N)-Of(
zNZcv{B~I&i@#4i32g+>L)MBR%Eqs7+5AdtP0f2-8r*(}H08|tLT+>Im9J9$40E8t`
zI=!rN-qcQ~3<!Y#&yojNHGtk~@Bx-{_JX<NQ^MvkysB++!^uUV(u$UM@<fKEpMtt0
zoC=j?4Hv+o=93CquL)qlLO<ysr=g-5VbP+jY`;O^!gg6ggDkb1R+bRh1X_X-9Kw)6
zDw6;G?G!P<c~!t(wGFQF1pbe0C|5YT4tgUAI>`wHVKW{SzvAa%F6Rd%L|}?RWXg{4
zB)ZHQ<(t?)(|g{4uOpdr&bn_drf)7^58m~#76q<wUnwx+4nxuhI`340&<Yn(MWO3t
zae%@<7g1)3*smP}$PuT1QO#1yTmz+=@^iY$WULf8@(?Rh^H*eT@HLRo&45S#ordn7
z1Lxs?BDr@D`6}5IN+F0-87-WN08)<PzZ9!j!ur=ULc=oAZ-W-IS(Cw{Rc}cK#G*Is
zQ%lbkyY54~3WB-`4UpGI_9LvJDVhJlO2iT9&#a#{m=qh<A_?Xqn4t%#p$DwtyK&Sx
z26IWPzLYF*Nv~M*uoh!#!Z7z8LIw2+$gv{ke{%WjIJNJapsokq(A#_D-AM1tS5Fn_
z?()~2>YxwsR_$<`ZlrxLzMZZD$Tq~I?bz9F+I%n49A8Yv#hnK?%Oc7n=tY$Z@KHUo
z97+{vv>wi-v>ePOP;tf6ujY*lYyHjh>pC{g*%0g6R#B`fxH>hQJ^5J;-5G%P&#D4i
zRY0rcI?O3Aaji|FjSwHQYyceGnPVqiAgPt8tw^!))MSYbsBh{D$j(utUgIm*W!ZDd
zBGDId&sj{(B0nK_32j{N99*QKsz(w9d0{4m9ls<{O_vn2Y>an4QaVzqG^%D>Rcd$0
ze8}v8g%dS9-dE8Gl`}nJ;?{zxImXn2tvSxxg0)!|Qop?>qti?lFIz26(LTB+7(b$8
zNH45!{GRd0u=ftmgImu_a(?9RA>p!;C7Q=)|EvE&6v+tE1ZxboC}WG^2xHW#=%MIQ
zEY4{B&-iQj1`Zz%U*U0LNuHS^oA%PqAF04URd(q5^$gbc={Adk`z5Q72{san3R&mO
z-bvOW>G~9b%5q<-uFTlZhEv#OIHn_oUIFQ1@+JxCQ7M+`&0>^!E5Xu?igN-yjerd*
zw#z~<kVeJ{HpWaj>BjPP)Nd0LW{3206RO~JjP&XG@8Of-vP_eb{AP3$m?UwwMfuj^
z^@Jh~`*~H%MwU<OsTPX$8LLqAhADWXG`wj#iuL6yi4^yCca9OSYIi(dRoMsdrrp)6
z*d51dptEBYN5Ty(qaAU)DIyah=Rc8C`e%3e4>}h{JEp)qQwEE&^(S8^UzpdB%zm>9
z7BNCxg;Vlo`b`|zwA^~U{5lVRAO1cnn55)SVLHNk0D<HQ4gaEmgWZFgU?9&xDK9<%
zLPfv_B;71cQ>5tk+5m-51H4mLNa0w{im>$z;Q(<d;fP#F2B(<_q|OU=KjB_Wm6}p;
z?BO7A)@8+z!+6L&;hVpwKB-*m9McNwsLiY${0V8tB;937%{$^cQMmd_ZI5E=XHe<;
z$$7(PftY>$b<FpNFX)uyfh0-G+$Soba(Wqamiz=eN!_w$7;-f|*I?>$`pGH&E&UCv
znUyUKM-BrQthpskbM9Wnk=o*#)wQsh=Y%)uXX6w5xR=?NKb_<R$qPqxx)5`d@d0$0
z8efpM-_UC;R6RaPUs!!U$!u7Sg>o^XF`=wE*=ji-U5_K0V$#BjC7-&wM1|SHv{<A3
zpO%rf)VfMWatOMH8m%1DwuESUii+gbbR8mB0FJgmbybUs=Hv2=Md-b{3i`8&=K7hW
zIkZ^>vwX*aRNdbHha}~(qmWX~fWpiOVNlKlakHZsQ_cw8NmQm}Y~o$Vl!2XqBBY^A
z_;K_HaLAV!$Sh$`LB;#=jNuEd@5u(E+F-1^=EJI?UIYmDBcKN-kX*-`nI`c-zs*0J
zz4n(e;4@Gq`EL1SF<n7DgDwHr7NqPcC+?+N8Lq4=UYW-Kbbh%$J_URMYnP{?bL(jw
z$xFeY|F5{ze&K0b5Huj<r`}%Q&cmx=6jQDQcNubB_}aFIceJszirhu`%M)X!fmwE1
zTO5ywnA^l|26j1x_J?%Ve|L~<Atjy1IEnV7T*s8qxJGe>XmnV%S<1C=t!SfsySZ{3
zx1nr7ImL6$XNP$ua6xtd?o6z2P3c7E-g6lX1eC+vbE*6Pl_UAB7&@@l&KzRyZtm`}
z$F03~pf?;IHe%nzQ?eu5=f@Jv;m<wyG3L;qQ`?->IEWuX{7(eqz8|?0(OQuK>Mkg!
zbOZk-va;<wHA1LiVEukzecSj}xrQr}4>`Pb)#qZdb=l{_4JOeOIi7Xh=Tg*kYDe)@
zkfZA{gd`AopE%qgrodypq;|!f@K3~a*(YOv8vb+v^&lp}W4WttP543$y_A=JJ}vlA
zf=^_ZAY33#VvwZYlE-2d)S~0_6mNcBzeov^jL+<=yg+mA?irh4;Ujd&Cc^7vCwdE}
zesf?M<`9?WWtSc^kMU9G3cr;*P~FdtpU$29;ehijojhL>n~7sC3pc1)!klu3XG!!;
z^evpZcYh<kcRbcgN+&<nzn#1{UIV+&O`_6`uhqM)bRc@}q1(+XxtDIo%xOK+{O5N&
z5O6#2ElF1he!uI$G0{b>Dku1s<Ap%Fn|fQAntX5QrZUN5hGK<zU7LET&zFcJXHxhH
z?8!DRZD!iI74WIjp3Kyj&|27x=K#T?pp|anPaRwGdl#e!f^t~(WUv)<hdV{V8{y-h
z_S?wEh@$mwVUOs(`v6;f##_!Kk9am(ZU}@|h5}kHV6sZZDR|*v{u61&2w6qWG;N5J
zMeTH17?Xv+J6bboXngR7KDZ*j{koXUhtz3Uu7)PK!&9$RCL{GzQI<^1F#uavymwUZ
z3qSM86L#g6QLT4$@SO+z@&&VM_t5YQKk$EP@~r<sP^SOG)fj!{+*IwNOuzG}Hu}u6
zRP8#_UcNl)edT~MIMcf|%04iP#+IsR1f=|euTJy8VR*7Er**u$Y{qHo?W=8b<Fd|U
zMTGspzs6tlWZ`prfMmE|(K|Av#L@cHKO&gN|LZ<HrWqJ7TLQm#pjb|#_A0Qp+O%4h
z*dpc$;fV&*8?OFpJT<_@408*No+HB9XxEMWv(n#7Z}{y^`s(UxvM-@OHViEoU&Vkr
z4lT%{rcaT;@$Nh|PmhEUX}_dH`Jq^gsHoc2mF}kAB_%zCvv>TI7SE<9vLoj8n-};;
zMXReujbx?ohw{}`TZL8vJvVY%?+CD&uU4#=G*MwAJuGvF#@#t=qgH&a5}6!*5otRF
zbd=m9FL;;D9@M44VL{`AOdhAIA=*4FzAs0s!Ywh+qD@%q6pMWkVnbNF6sw^2Rn~K}
z-2E>o@QclKCwHccFVNjrmPcMTK)3$q#Dk~qC-Nyo?`JywtMpv@sbsA5qP&jwF=Yuk
zp5O9l^UV5;_zQUmWSRrtEZ>v}->gQrLO0@BnR_ma#2<$pxh?+xZVdfu^nMa4V7q*=
zg&7ZH4M%dvqI#n(kuZI^7`~5_u02+p6F3=fZ?(h|$MvL3>{g5-Or%TjaiN?J#qvll
znVnDF@@NnIn~fY`^{{eDz!^Be>)62|4WDfpKVq3(u(|s0U9md%<-2x`MsyRgeA!Gs
z<)>Z`)9yxbLZb)kA{JeZWo#KsV+~|>lR3H>r0qN+*Sm(TcyrV}RZlL_nV%?3F7Bpg
zI+^HO(^5wn>2IOwV%k=%;6-ehFk%hpc36f6n5STqX8$nD>)7AbObrBPZ5rfQGq&~w
zv^%q%tjJGQ*BaR1O>T9NIbO1#tlUlI98TYzxY<pJPK)-Fp?TU&MsZAL8Oa3bGM=bR
zrEz5}PMPn;M^Kw)H1;JorcskKsm`$GvYiHu&VoqF7^QgeJRDj%=C>m@qe{;FXR6)f
zlzou3IS@M@vrZ*9PWxs~&JT2+bW+r|Igm9T37L#rr;(a6OWs)307=dmq-@2j|HoUI
zjN_$|y6UHFO>9}&jOYx<22Qmlx9@U8#vY7XrxCOo`*?M`FK|PJRHWIXGap(p9A=DB
z%WU~`*o#)yEc<aXGT%&1y^@)Xq%tDhjB&q`o%Cm?Ly&7!SFhAX0J-XeZ>@JMb)Z^P
z3D?`LATz5;(qt(Q4X6uG>7g+qREB8}QW=0(hGD75R+4+DY8Zf9h6guBsJW^jMVg@1
z<5v-<^idf=RS}!iMWprrPw0PcCsMR}I~Amdx(GG)$}rOZ<J$*BDJdW^s|o)%1nmFm
z0aWFG31q7BFsv$K^cgAx9yP%5Ukrr_K56~$`n)R$Fs5G&flq@TN(%_gYJ4<VPlK@k
z5BdKhz-m%=Q$>J2{jLvKP1abd2wZ3{L!b1%>Ofci`&<>*WvKSxyFL%qc_>yDK}z4d
zKDa@enMGt+F_81oYf<9%m`%`HUi~(vR&V-t)W%x*vv3$&#^&_CY~!MUy<|zYO=}Sy
zZnx1!x(kjP+MrX*=KA$qOoTq?y7u*?er;W6VRPkGrThu@2eyT=ZloC5=8gV~P>@}s
zH_CH`TS;KH;i(=s$Q4i-nv#JMl8mVkyZf1>FuVS<_Oo1*W2-lt#kS$DVCwqI`b)o}
zLFxgZBuFXkC~J7tR@yj{Ov+c<7iLw2V~1T<qA{N_2r2u__z@)eW{!<)sA&riPIskQ
zDYGaS+dBP*qrBuW_xdy6m2ckE<TLP^D$<+VHSKe1oi5SIox_wP$0&F9)!?&!Q<XtZ
z%d_mW?5h*i|2TL?x$!to#t+QM6AQCjaIcLzyu<7o5ch=?AY$w|9fKUPMA@|w_lJm~
zE6!sC$K8b3t=U_|kK9U74$?)=iYX7U=nv9i*Q8uE*Q|yZbYk?o!t`r`mFFW<3o+<L
z=~GEO&e0;&l&Tz<58`%k>mTnP{ZCb9tCw~XPsaoC4zPeV_bB~n?@{~VA7BHOHJswB
zgfYf7oLJN%Yrrk%g&WYde~a>Rj<T-0qjXa`g|}wDa9Z4UPhXbmR4JVvn2iQQmSPn^
zq7D8z#4HK#!ef-ub`PBbQB<{;)Q(ZhATpp7>Ji%s9^2P(r{|_pDJQ>8eHm0FmP9Hp
zdo2PQC7)*JH4k_-!sU)7lrSHVmmqm-^TIt=_F9nQB_-FtH5ww1M$F}oUQzN|@H<S@
zGg62pNjz~V$%5k2oC#5m7Y7$5<2jBD17z!9a)<i%9eYa}i5G{7k<xH5O7I^$Qg%b1
zSUn?d7o!sxXFJaPSNVEm&<=Vh?)cW?h2xn@!X$o}_B+rpA?IH34e7Zisd7&>UR<(W
zrAAmBEzxLf_E_Y#mtSfDnQ`2tsRb-voHj;k;fS>z?yKVUclKdo<Z&u8ZtOCUqIzso
z`4;)hU||xl;t`NLqI)sMTHF(`q8dFz^8Yk-te5dC34D{e`pGVO6;8h~38^7hAQ&M)
zjPUw5s-!g{mq8QNGn>#Yp;?Wj`;}J6tS?|5f(L6K!EoLSyyyQ>B=?V0F{y20oJ#iT
z6tkpFXavHkTh=jJ!mYd%zbe$JQo^$u4fms};8xvEJ{-$y#|&W;BI69lW1a-w6733x
z&5aPhTlCGVO2%}05Z`z}zlrxyCXE2>UFIIsIMs<@IQ4dh^kf)C_4YbOY7ADRV*w2F
z@X!R2fKLf;Tx-&cU7T{h1^yx;F~Rr{)h-KR$2)9(4@7g^1<)dKF~xWjqHe!G1m;8w
z21UhU)Nv}r)qgkqnu3SM5IE$2va^Q9mc=gPcSD^J&mk9}3bg!<NP>xm2}uc;p<;+y
z_iejMz&9tP5lEFw;S@Y~MBB6DJHSRKolU^_3~dmKaN^%Vzw%WX9Z<-t&43mFZ?TnL
z1I!T?{N;Rj7WU=7-58<^<e1`rE1)XiDpc#O%RjYqAH(xw<9_Biz)<x(=&hG%#W;i#
zy~T?VM?-ms-1UEp6bJKO3Kda`Eed>`20fdGD4=`2;yEs#ss}Vx9W;XSJrEa>icQAt
zh-w5k!JCsTuoh|OuYM)#OimIAaH4x`V^+9UEqo^mfoO^07eN-O6&2~~EZ?#bf|cK?
z_eRKP`@B^znc~cGKWeJ7kt#`Z_l|;n?7Jr;(u{to$Y6<C`?^<~k*w8VI)i=U#dU<7
z5In=RU8Jr8@x<G>2tRM8>{c5&|3JWmSC6X<#85{CWXoYfaFy4gUo8#1*EK?Crc|jk
zOME=~<}*|~@%L)K@kp$^Z07nJj+cBtz%Z|yC?8><yHKqcJ*U9mxKi}Wj>mjQ!qif%
zBTa=~)qet!dcM=sc7<5euc2k>_P-?uck8dBK8J40u_L-j?#CtYRH&E(UFjXko?s;5
zp7l1`QC#&5?AJlSfRI8Y^*X7=Q2VNl?|+Lz69n<dv)3%;9*)1H;?AN4qpZmoaG|Zt
zT*Kn1u*0yH`~zDH8hER7hQ>(2Rc)3KJz=HM!cEISiRWYTvu&|6_j53K7zLFwmK;6$
zeH@E0w)HrGR%Z>Z3SM-d6oc?C$V%#KKuA?;|DD9IW%Dt>g4W`=AHtt?pDF{pe)6^m
z-0aYaW~1MU(gJa6OS1*BS^AIg%5Q+AE)YgmX)S<Qo|Dv7Enb_GmYl>_CeNp)mx4wv
zSJM*vK8hxSvUNg{lf2aWdy8M=CanOp#d+dkLYI@RRPZ~1-{j`CAjDrPnO-|?F`okM
zt>!HfSiK!3a$&utd`y<o0)eZ#jQJ|7z^2YI(xZ=#q9gvn8(6)<=DZ-Q@;()m^Z0Q}
z7rl+xRes(Ny0gfP|2$#KSU3<w^=K>W$7e4i%jd1q4HnEQRQYI^{@(C+>v1j2<Ak>I
zpN4@NYHXCV<iylo#<`D};$f2Y7=12$7vVdhwd7$%y;W4%z->8h2p7q_Eq+_&$|>K`
znFBCm5=X#!7O?lzt;}9(hJP{M!;yLj==A!P_o5N0VBi(q#)b|eN4E%Ru2~d5e0nG0
zpI^p@fvl!Z-}!xU+?{S{o6dFmSqO(-Qg<S2K?l}#&JnKttTa0Wm<1Ne(m^~SpKG6q
z{`v0Z!=ZYMk+J<Plp&&TlO;#oDtpkdvzE82EseX1thA@KU^)DRwV8YZ>s^dlvBB5-
zvH{)mxxkyFyO=J1y&PA1z9NKq7Rl4&)c;y(GczBgW@7pR-3LGMfDulnD8eA46kU|Q
zvQA+PKv3>KziaA9zEr`;C}h_I^7xE&sl=eACfth&Nf}sdqk2t)@osyuM&Un=BL$wL
z14jJS-BG^2)^=3la}Q<%vrp&)vJZPPTxxtluYiBd)`Y+7jV13>UJ0o2;vQW7KJ>x3
zPk#E+YK~M-Be?Ip@PTwns2@{$@|n8-717h*;rxMZnN2-S?zK{$2?etTiF{yv*M+80
zd24WX-~6mivYt}co+GOJV~f$_uJ(h2_V)GNs*muNI*9+^3+cJg%USm*lj_S;HQGyA
zr?gDFnbO#kXQ?MVQxZ@KOnn>m@Afr0`so&`YHD46j=7G^RJMQGAI_ER4WW)lD_-`F
z0KsaELvD_~?s#V8MKRdgrx$B2!Lbcc4tQoOTKI5XCg{ue%G*~CXFdDWk_aX`be;ir
zuntxbGlU@sQ6Nl^Jq1w!Qvgy>O+byFNKO7n$yHLUT1Apo06BpM<)9F=cJ=NZcl7k>
zjbbz(7vq#Y>ku(F_Nf6iKWufhgFV8HQh+_e4qo3I{&?$!M;x{#aq6@FXM4G|C)Y-&
z*XPT}Yv<*|JJv?+_vvIzq95Q?j2{B-DG&iC9Wbwp)-(NnComhW$LMn3HAw7Lf$wtE
z75q<#aqS_e9#ubVz<NyPH=Gv>gEi|K%TLPsURyL$uI4@NB2Xg7@Xdp*CN!Qa<c{Y>
z*HArV+iX3o+PNA?l|A=A+rv&^*D3R7IeycbrWc5)6pvBCKC)56K9y3ywE;)lKC_v6
z1aPSox_~HV7Q5LjTRwcT*034Cc%To!=O<pKL|I%}q<$HW1bMUS;fq8+1*At+{z*Qj
z-g@g=Mto>v4X~V<zhb#GLDWwaKGc6`M*1?2lN^2b{M)yDfbMghUH(EH%Yc&<(Tpq{
zsu}m9j<Kw6)belBu?zP}g4VL*%`vlo<sWA#g4~MFOv61}`U6kGmB#9sLD+J90Fh@0
z`Z3QO`nhJ8{yNPz1-j*;2gTP^$x03Ipg0xDTgy7^0#>u^1zWSq4f3erss*w!_)Qsk
z!ulbVH+4VeekeY4{#UJq(-sWcEYOiD&ElqR7fxx8aweT0j2FQqjVmk8fOYd!ywkvq
z&t9gKL-iMP_N5=kjC7yZDM=qO7*GloxEWW^OMZIeE`m~~@rOiS44tY%&Hxo4yP2uV
z)uq2k<7y9<S-vISC<9eG*IM2YsK%7fM<GAgcGVF`3(}Qb#II=ya|WEz-~HSfiW5Y<
z{%eG4Vv`cex6suT&3-z?L?``$q-zM^FY`Xd>$!rWNvyu`&qDnY9{lHhI^v8V0ku#o
zbAr&P^rf;COs^Pe^!*<1O$D#147P#Nj~v67yCtD$e0yQY;r29Z5u<Evx%kT9z+flf
z#*m2bL~Fval;n1~6s16hw0NpBKr$4A{^$qxDaGDAlzZNF$GVABo(rE1N{fk5YP*Rj
zt+NPq$y}du78m0$xm<SDrI3i5>7xpkvXm_xJjZ@|;sqY&zOra)u{AjwKe+ne#5~1O
zpjaL%Sa1=KSxo3ob1)Nba)qf$;XCdg%IO!~vbGeBe+iD#1GzOP1Q>*DsU`b;L`ti`
zA+QP?kho{klB#!}K%^&HdQ-$*&jU)+Xi^#7p8I@K$9JcWIu-&k;88mo0#f0V+8n+(
zr2H!h?J_A{&>(6VjB6V9n$^1cmD;ER*wQRioo``dC<0p3I90Xn63m@g4&?2ce{*v!
z9$1jCL9M|%XX<lmr|fBu93TfgMC~;|+%mhQVr%Iut>*7>I*y^i+Kd=%*q&+0haDK$
zqlPP_<y}$?)%><GDyT8^6V)asiWuB-dxVRGiQ_oo+zm}jtcmmnKZ&|@(1muyKjqGI
z1r)J0qU)%6DIO+fdoW*}-oZL)xMZEOM=oSi)n4qT-W_w;KKCBda@`Irg{cuf3(c%C
z?%S+k@#ST{_V~rPCcG+GHjB4v*ymZn%Wk%#@FMUQX<?yD)>vf%YHEk@c+6mF;S#m!
z{Wb-{f-vD;hW<X8K58MZi&Uxl@{~M6Eag(Gv93B(Qo-V_?56X$+<;}lBXo{f`NA?3
zH?5LsfaIq8Kxaj=_BY3XY^>hh!txecv+U+p+f;I?W=9%V{$>1%H#t0)8noNZ=jrW`
z2kOpFH)!uyjq)y{W_g6SV7+4i`5N}}zO3P;GTecFSj2}+qjUv5I><}*X7C_ldBwL^
zi!vObLU&l<N>4mygSLe)jlwmKF_!b3?nK0eJ-=8eYN!VldNa7?&R`EC_vOb3l*~}v
z@=_yWq=3){+wl7m45GV$)&|>T$dcr@NsDeb@R?lg-zkpj9ltf?*hwwe^}kOly8)!K
z*+J{kw2k3BH8XxMQ;N3(Z<NkGw&PiaJEQo2wa+z;XDcrKA+Erj8*Kx-i+C?K4Qkuu
zcUe8Bts&13TC&w_9YeWDjf`yD7W=S0x6_Ffr;r<3w>a{hTo$qkCJaoop7OPxYdR^E
z5F>$T3OZ4}fbN*CxLY<E38r^kQ$r)x(Oi|wqO?bMQnsB|16DyQ(JJbu+HeHWu1H3+
z<(iM%IyvHoiCi@)sBco~+qi(qX7*SQ)dr^DQrl2lqr5vb?`$2-ZEOj%C!Id_+aGQq
zpLK2O{Iy~{Ar~ID$$`(E%0HY26zzm>9P>GAmUv*D1Jw%Xqp4kE0l$MwfoQE{&p#&S
zAg*m1FPKS-d-xFazj)b}23R(0gEs1Op`9J9Z^Gk(FYRQ5L1(UV;MVPXgOwC!h+-UP
zg9Eazbo^N0*Z#@<LBH06bs1(rT3&0OlaH!Tnqv$gx5pDOh0HlceNo^ab>FT8A~_wM
zhm4U5(AFC2M&g3-g@APRr-#%YQekt7$FZ{=K8Tn*W*49isNRb6#Q6l{AiwSy%bs)c
z*Scvk2?_%SxRr0ROd^Dl<SOxCvzxT$7C?r#>Qcbi3yw#)d6PgeJ7%ISz!>QXI>w{T
zhy9ssw5UOXoW#=Ss0?&?ojfsQ4EfMH1poc+_uNWSf$PEHb~B65T)ya23?x9Hb#&A)
z%!}8Y)A*Iz<nqs8TEjFi`|%CCAL1@YeVN7|+C~rs*m*0(1fj`pGvh+{z<DDdP|D-n
zJnSZUDCb11@#xGPWXXf`Q!t@+^y;)o4&jtd?2E_Y586G2opaqN&U9SN%j^C{d5+j0
zP0meb#OXx3N7#OgGpkvsA3vw^Q>UhXeRnyIKD|!GrtBKT+Lu`2H;FcnQ6kj+hB0rm
z;;kMdgI^0qvqDWG8RzK{E2An95M-VvUxo|YIqhbNe|*m*nv^?s8xgnZs~>6lIc<tf
zoyp;KZ`Z?I#4~v`?Y_%Qxx9o$c(s$_;dl*|k`%p4$^>N|ibYvC7E{rl#yNVn%qUub
z(Qv%M>P{l6oyIZ*W?s-lKMX6m+aVQD+m}?qE$RL}&HugYswL*6Kzpw!96ff>)qYIU
zI<}08FkqZ<*PB~Cc$sMKc|KcT<DBHYxs(EwAh(oi_phSH88+@Zcu;rA@tG(8mC&59
zb=5c+LD@3m4r@7O-(GE6SSwNWTrTGmdMo!BVD${9pAE)?*!|nF$BbKrgKV;=@Y}KB
zfa@eTvAZX6p1NVzmCS>-1wV<T=V2b_-tX4n5k1DSAn`u#mZ_>--xx2O$W4`ZzY|&q
zL}hM|{#W4xCWw(VUUzIRvUYgB*M=*eh7<gXRs{D|=lRRQldY3B{@r#=t}U`x&|xtN
z#ZF?K2aiD*$<l>b1ouTIZvrwGc^X;l+s&)&*kO-nokuXsRlJn@Kr5zZJ^uK5@Y{j)
zF4N;$k}aGaI*E=U_{*qV&I3<aHFYfNI80CIx4irK3nt<Mk#Vysfu<oo<#zBfdjyw)
za2;#Ny~wSO8ppu<omgPkawhbpkeV>Qc}lA%^JgQgwk>O3J=-jwDf1J$Naw#cTyv5&
zd%e2W-yJrRx7()#jmA$Z@1<7l-7Im9d96FU{W#5fXN>Axh`vHr7**<SiRf^bZ%)@{
zZnf)qc$bogjZKMbu4bkK<z93m?0f_>doWFr`&NCETxwO?aHn_18&7sbx<Ia2hL)T?
z8l6~l`<G3d``%uKtavp?noV5$*t{^CvH=KhF4sD{;4LZp9s$xRaV?l)r(Dg)cGR}>
zD+3?X#~Xot#PntegW6r%p13BE5o<Cbcz4ZcdTNKtvAkE2)s3%$Io<n+1dl}_A8%#k
zB-~>JPe!4!52M`fe&jsC`-*EOA+yL`eYtl@LG(a3FlVn1ITQZ?N##q)yhu+JUng_k
z|FL{mlu~*H_THGo?H<^AJcp9MZRXVUP#FkEe>R2QIey~BV&?4My5sJ5ein$r%w`(L
z(#ws~dkXkl_)IU}5=6UGMD2f=@VN=HzrcpT10Jxf%Z}V#1APCopldVZPp#Y%eb3qv
zeTMj+wV-)UB6(B?+yizTX|;&2p0>gWiIyFy4G@{Q6XQ{dHkl6zknfuk<D0gNp@MTd
z<E8S4m<h&_p51Rm?}H^~C5pt#?y5v3f<0zUinPq`xg(UIs&!Dy(Vjiw;=%JLGm*{F
zF>SH)&ZeV^hUfRE@93d2*!UL6Ub}?%;}GvtL^y+dY~6=wU_B~{v+TAsapHp&cS6F5
zvz}N*yWG4;eD(v@al#=}yR74Zh&EwZb=wYDL_-V=$+aFF;!y_RYH@K<ZQPT`%=H&8
z!Lsh&Nlt@0?JnyWZt)<CGjY1=29R$&Wq;MU$%={uIp4OAmDQc&fqNa5M@6Z}yB68@
zTl|Z8hg(6tRWsVkIKhjRQe*B$@mbF}%#M&!f9P(ESGSCJrDHOsZJvEpR-f#K*_W_P
zcBO3i#i2?BI9yNCfa)31+~0|&-puOR8pyoT64)^ovQCKl-dcqR5gkwKxcpa<aMgl0
z4S&7G0iqH9vg*EnYfz+;_!S;!+8UytOS2my5<!s0wT@=a=82$+J<BE<w~9B*7>`}e
zm3@`dsqW4WNY|yV*cRk)|6r|R*W*C%W1s2rJTN<y+V0Vz4O^5l^zt9w#Qw-y6bTom
z=ph%H>`V;KT{mz8vt8S;$4wroR@m3azDpN4-&Xj#+ZysF9`W;jzOxz^)N!KN6sLn6
z<kmh6{04~YK_{D$?l7l?D=zvg1jO>-GhjGQmw$8mXtB<9Xy<d4OW;qvNCiHU)<k$u
zVefbgioL7yV|kl}%z39yvzH2U4;@O|l4oGs)E%&o1$2@x*dF4krJg9=Rn$w~pxp%@
zF*jUi%S9nY^aWEqPo6z~4bK#wS`?nMvM-;g$7%d?p;GN7u5vZy!%N{A(hyRi^Q%&~
z89o65C7UHIk06U3<}lmA8=qhci9bGi0`vu^bCP9nJh2<`VBT1aXPRXwU!H%$h|M6s
zaE0gj@^$mXAL*BL9{I{pFTfr>FN819A`1So$%uUJ{C=WZ{(GmyYFu2PgtRP+SBhPh
zxa3+^tMC*2j`h@BbW84?yYN`CBklh9*E@`}>~_^R^8S2xxmObGX?Nv^cDeMY=KHoR
z%ty#K{k`P7;#+h{{!8|eFF)k*NBiexsq@Hl>hN9tYqs3>65}y)Y4;-askcDp(zp7D
z#aZP?^Q%|k2dP(BG<)~qJ}0&Or|b086JK#KgeS!7B4(d<P;;qdpCknNqBTSohpK#c
zpPsvH7l+rjZ1EiK2~f=ud@vO*n=^4A2V5l8dJi^&Zgr3K)tP|VYmuSsqe*|vPU8h@
z&=n5DDf)F1O1n~j=wEC{EoKSaB4?@o2t%nN>_dpVcC7w<UD*SckGb0N)4@x<lXu82
zCabgjV9I`d4IdV1Z=NR`2kjf$wrirUw2$J8>p0kW?t-?=w^l<Hj;sqQ!D2P(kM#ZS
z@8aLSTyIVbFyp(D_~IT|Kf23Bs)@d7UbK{4?lf<u;vBEuc|`$7z7jc-8>wFO?i}UB
zZ>r-z8ILXXeQu=X)32GQJ7Vv_=dLBY=XWm-OcxMRe>q+)4>u<5Spv`tf!7!Q$)R6h
z_qeszD78#hc$MkP!%xg}wi}D}Q0b=Hu0)Tw9-F_#c**J{rDi^}9*>J={&`#5jJAcj
z7k`WL(&?elwVCkjgjXwuh_OYC`9YUox-H)&!DHY=BkgZq9nUNid*RuBK`j30x%sfW
zZn%M<5Pvb;j-AK9C#(4T!n{2j7w-l2oON^B!DUA84#5)K<^3LTgVCW&h~j?xD7;<k
z+#9m)ze^)6S&&ec{e#^_KJLdfmTH(TjwOzDHk-hd$Rv;MuK3toG*Be)$oWP^Ltai*
z4t<<p+qWZDA^wg#L83c+l<3OJ3;QISzaTcNFSI~=lxQQKxDe%}nvr@VNxjg~M9Gs7
z-$aRXgvud3^E-p&bW(VsLyIbPQrV$;i;!*7+9A{`g+o0BZJqk#IE<SRVVwME@t&h#
z@IH*2Dt7$vK8TxicEYi9c!iFBO!B^>tDZ#jzD3%U`35<Wc00v*GI%L~yl7G7mwTkO
zaSOCN7g39aW_C3aqGaSWvF#e@m}RTFRmF82ty$zoj6gbAi(yM*3v<iEoX-UZhYia=
zwoCeW-f>WIfptO;CBGd3PdP?0rg!!*B;^3^&<g(N0*km)c|4EIF70p9I?mV5NsM{j
zeE6h@4*rF{8-_FE-+o0T7z*PUsi9-7DM0rk?wH2@JRTU`Aw6*SobL#P9A|v_FlT~i
zo`kk}NO9bcBynu>zF^~`eBf^k@3v>?OSZ^Gai45gv3VSivt^K9vT+8F^n?V0y`YqT
z5$0_3ZV={w{^s(>{^b=vFEeV2-GFJSP3YqLKGpkBkVkX(?aG+Q-{H?K-YMD<$|c|R
zT=8eIBGPqo$@93lOv3U8q!vYgB)z9|M|tabz>_F3T9R^B+&-I3%3P3xS(Xfzqh6b#
z*g#l$$ULhpvy~T}Uo9iDfb`Dx+Fl1sdBo=AkCrzKD;X)zB@W5R`^L9l;*<OF1HB+9
zr!*q{Ozwk5C0<uEq7m3Wy5<hOLm6Q@`fYpE9+f;IQG{I?y*wt}^Lt_J!xg`9{T0q>
zP7KF<3B<57ss-|8@=iAeQS|R9|L9+G{?Y?ZoK=oFu*cMcG<G_fFvWu^c5=Ef$%BeE
zA{!tZNzQRdgM@`*8cJ2FxQJq=a)FAJiz1P7VT)8w(#YRpF5=1X;(sZr@RDQ1$%~m6
z+~Xn_jL$2av^yxbQ%Z`t7I@6%HBwfJAQlAA>$m7HkGmZzvWQG4-5v6^h;I|AkFn!C
z+Qn^>Ujm)Rqi81J)f2rPAdTdB>2e2t{yN1P$D(EJtx9Vj-BNdb*^b=0^6UJjktL<^
z4X^t&9=LZJxL1XoWCjS-WFzFHE(y&n8O<ya%@3l!JDt8ezW!Tg(D7T)F##D3Jt2*6
zWZfr;8FW#Y-B)4I<#5m?Y0xDBxfVUC)(`0ZJ2})UADGr^zQe$+EQ-$$*M&0`gj;O7
zws6`P*1(m;z4wqQRHh|lCZFh<mgL%}yPhNcl>aX@KMud%gjZmFM-Y8SlBd-r-9fLu
zEE~XO_%6Y<EvdCF0coIE&2n=KZ13S$<wI5Z%=*DK{~n7b?CMv~>m|L5Ur~zCk6#aA
z4lY2q6a3Wtm47b}%%k&Y^RDx<*`<FH+Gf@1roYWe-A|IZ42l~is_a)JU}ULeP%g9K
z7Ld;>91tqkj;vBxznc!Y4JsR@w{EZET6Ppa8`qBiNN&nBRw<$jkBpCx@Kyhg6<0?W
zE~P2uGM?X83bT-@%>v7)TpI`EG9bgCMi9zfqz}nLYQN$k<x{0V81^~!)B57oDCJJU
z(e|*P;}CjmdkAuyaM!R2r{o>APTbs8Tv%zGadaqsz~x2m-os6yy{%=>bsOg(*0GP7
z7)Yejaf~6I*seZ+J8iS_l^xcc2}dZ1p}S6EKhHuo&1&?OyX8Es<HWqCQL*1?|L6bM
z<0!6WU&C8?IRjS(KhHv8!`aFk#)-QDb{Fr6QyGMcCm(1qbTQW?`MPH9s+%+wAuJ&;
zrI3;E-Nfm&fkvq~?!o6A)MQx{g-e=h)}ug(Mma2NR<7_Il|qdsYnH6=<=?DQo`{_^
zV%nr`9>0<FY}zDv?j>N~$>%pwVOcvw8StC>nXk-Fn)gT|Ij{ycK~{bVp-AUA?npIY
zmV%BR+G8o1RygSlLjEm{S{7R_=H@KU!AkV*DKXDfEBNTMt5T@vY=b;oPA1`<xA@*z
zm6U_8I?J6exwgaN=(6UWe1g3+%U2*YIdYbw=cS}l99y1lvQ@kKi2ZuwC-mY$)LE&W
zDw=XNci#TE<WK$<VE-=Ze4A?5-kH?u!$va;uwTe}lM}gK|2LA(z7=nMfBeQ?I!dx2
zHbpxoyGPs6yz^4XN}O*EHSHs&KMl?fH+EW=saA$keaQCLE}rRMdNVd;FBh!<5^f0H
zRDw3IdqGt(pR0@3bM3&j-nHkBm_w&6R?UjbB-%k|^i5UGmP=s3dd<B`s+oTY{S4HC
zyvky=DW2(n+u~h=1NC85>9pOj+(u^KY5i1#wT00_wyJH#WuGpjsr}IMRrAq8w;B-*
zC^0OwS~9)wVYH|e^pW~XxRp8=%1ZD$-p1D$-1jr|vA%+E3|j%N1>WwWC0(DbuvoXd
ze%IVNVpb|uWuKey53EgQjcsG5BE854V<~HhJ@Iqxd2kJ-7nV1+={g}JcF{_;FI05|
z^}}+jSJq5%ir&yy_>+;F9FUzep5k9<(={c5WDb}`=gFhtFfZ%ADvCwtKJGxrAxZsg
z)W^tn&pi|Y)=MH(^ENOaq@k1-!K>hAks~biKJBw3n?Z+RBfmAX1ro=OvZVEc1&27q
zhkj?R%DT#u#%9Nc-lkHfIA_k_TI!xV%md^s`6fy8bj%XGRpb@;veb#N0shc6@jiM|
z;B^)cn3bYeOxzuCXYj4;TA%63isFOCXwUBp1Y^BUb6G_W?vrOJ<(+nd)6h4QS$Q<i
zo{Qn97L0Nq&vI-cR@_#I4Ly2Qh%4m_#P-{-E7>lc=Z!Wz*{Qp;8tyyJ#1>xTSl5lf
zYde9TwZ`u-o@Tb8*y#!5tr9%%4Vjyk<TYM0t0S9CNOy4#c7z^qZ#`Q_7LFRTR@~Jl
z@hbL~{zK`544e&Eaga>oT<1*O)~3$z*I~x;=}nRg_Ig7^S9X)8kKZM~GEE5NRj+-V
zX^>FZ0o9;}O7wqX55cW2Pnw;t3h`c=rAH&4$9QXOxK?<^>dzV3@!b`&#u88dxG`l@
z=HMbH4qubDV~m>IwS^CHhhu`JqS1rsI+nQK*LM#+ZG>MV+pnE?vZk34hlG#fx6$Cn
zaxn7<v+vlSxJjVAB!14}-!ngAq?6CFn)Z}t745e%=HId3L3wkcM~(W(nsw>s#$39f
zjv4@%8p$CuUqV(#^x;QuLuOph>zH1<spPOumtnQGnj@Vo@*VQoDcUsOXZWWf=7)(Y
zKo%A?7a1PY)Mb6ENaVMx2+ja^>z9hcAXz9bzlS{zh)<GHAm;(zBANIHl#8mbCeBdj
z7q7-#MU1-WI)qBepxnZX5oUYXGH<DEVrZ`+i4&q0O%xkE4w?4oaE`FkAn%!M8(llo
z>nT2~z?7}_p_Qr*m@AwcgO8L>*pHl`gXaSl^mN}jlaH3p0?pGJGULFnGuDnvF|Nb6
zPU)MG8z)yu*Obm~uGQ^uYk61y%?0f%^p44`=NstOh%bUW|25vfQghnZyi3}r@CKe+
z+h;KeEHJMFH|Q_(H!d&rH!?5cTe{omFX=Y@-uSgB9m+4pTe91sTdMZoH+df(u1nsH
zA7vk`F+lZSvTG+dhBxOoi8owA`P^5g46U8BZMXYAx4gGHS3PGsBixrgS2`rDup{*!
z6(2zDF56#mN%o6hn49-En;AWx8k?KmqLSA=rN&P57VF!S8agi-Dr;Z-)cs$ZUvitG
zSC}?GhSkUfcplZ@5B>G3m99k=(lM~r%kkCMyZZ=3d+gtKN9Niw0@3zKX2jyLn_8p1
z)B9yhf0%q2WN}$CX60*xaiukW;8jsB{eofx97QWx!yee5El2IK7l<ZMS>T8se|!V)
zf97oER^|d<d=UDqh|;GG9M^$c(r68-*DhFjG)Gw*X#K9O$Q#j6STU$A^}$TI$f~lH
z*9EQOq8mil4X?zXVLfxY$8>CPjA|QJH*M<J);F*ISVl}T7Pzi?T6t)puN%8Wd-w4g
z5jc9-(5Frq+pTNitBxhxeY8o`gjAY3L;2%MYf-o6UK&I*?qkXv;l|05une&baNMzc
za=QWVocit)+kHg-kbUIc_HDf~jY%32{E5B_?2&r@t4s8ehb6{O?145;sBXme$kKrG
z&bbcvg}4qkP1>1n8@D~jHg03a3!oU*;k+ZSLeLYo=VAdgF(6XD0Y?MY(FQs>&pb{R
zz%c<haez_+_!ysXpJ3kTZwxWgb(Am12J&|-fT}l;YY09I@reRd_E?UiEaob_NBUXl
zay;h@V=TuE1Q1KSj$-}Wvp|>Fvp`yj9*uumiC{VOJK!q3C4pzYHV&YGui%n)&L`w6
z{uTC)bU_FRCIKk9X92_`$h$2O^P6NK`WMSrEF#E*2eAD++E;DduLoC{_XGi*Bj4D{
zKTT;plCz61=V%^yMu|S4c(!^9X@AHP$B70QnF<+nMi7wo!PAINKmoY|M`4*sgznRc
zOOhx}E5eqLt@JX0Hq+ooqQD+RUgY<tM-RHhNoGf*=KC?b!LItX<^_LJfjk*hVS}j0
z`;uDy1>I&pIrKy!>>wWw-3MY(8)UzMn<RnB^jV95F?PUs_3?{<VCa73GJq`1Bk=1G
z!HyBax}lByp#MX^4gJB2dgB8b{8!d1BEpk-<=+cW(i`D0hIy(`&M;ydH$P2s_)dm3
z?{)FZScHI#<Q+Y#3<OO9JIV}c{rH>08C9e{E$U!&j3OnD=Q>uAuc#(pK?rLyTDfzi
zQ0xa;`>}~3G>uSOB9%OdGOzML;iHK`zQf-p2FfwXrf@UOFz!Byfjphr2_g(DXmB8l
zTr^z;c|fbEdluR;J!!A_%RYQ4s9geK4rI!?FW)q{T|75W_I8XTi;Zs39?M;$ieyFJ
zBMDy;vh{E5sPTb$eayUKxT*T+xq6%<pzFGiGg=RLP(E9rNV&1Jy*8m6!hsq-O->%7
zr9@;hWFE=dks;2%wsSZu`hC@=!<=B(n>+C)BCRANX`lW%9n(_GWMtl=Hw@}RH{Gi!
zQ2OZUS|x-;MnkFT2c<@Ek3fndMZib;++U295LG+o^32z!FHFv}(3ul|8nC2b7Gr{^
zO*jn)P8eOp>KV{m;_9!cOhwm_J(7`JhGxp3!k>7%e~|!xnovWz;;dn};5$ZH5AgK2
z82HqE>{mrLwWJxV#V?8dwvJ8?N@tQ3F`wWx)h4u3Bkt~%@bHLs?%Sda{-!RiREx$p
z@t&I`V+DP>G==b=G<jzKw>_C-;B=+FcOO@r+uf(s=Lq3Xjx^>O-aVjG*t(8$MeBlW
z0hsuqnK+dU{UMu0q-VM#Gn3Qx4>|Ol<|5(WrdBn3^U-tGSI|qocpF_%=OA&lxF*}b
zJ|Jf#*6HBqm9zXM_ZQlv=t3vS9ouQiw2)LIPN`LAl>L_+;21QI*P@Oy+F9@axFN6p
z6=i46NyJTp<uI1n@5;)JoFi}ZfBhU}woAh~Fxtxq|3?c`lcR^74t?_{aX9-hq#daw
z5hwUAc-{b_ozW&M(=UhG<RVXoJk8uDA~Sf)kq|}-i%kSGOjCRW`V<6cWKZWHg<mzD
zGi^|C3%#gvi?#L;-Hilc_GIymhBuXdFEYPYxCizjyLPYgjk71-wjVb%htzPU74V;}
z1-d6O(H$nNZ!_J3#!mWp@>OsyK%@k(1~grUH^Lp-Y(MsX^fmDli!PT>w)P$a%fA=a
ztLd=iPWA2bpA-y(((rYo&0OIMtv}EtgUxoxox0!i5|D0)P_`K;H3LmXpMsPf`6~-!
z_Ds<zO`=z18tKV_J6x9k4l>M;L46bsI`xo+9z_eTrrxyOEv)8fm0>;ZV;lp?%3t-h
z9VGC?EJkV&*pLR3VUhQsHG$Z$>qpSSKrchNT)29a8Id=cd4br*WUq8NO6US@rEs(;
zO{TlJWHptZ0Zz+ov_J#Yt1X48m2lBv!z1&5zwdr`Q|0upz}g9NljLB~N2CVBBGGW}
zVl`?FxnNaS9ys4oEK=?avFzIpu<PGA-M7A(Pvxf71<Kt*eZ#kifAU}+3YUP1m}0pp
z!#?Mw=?3=j&hqJ1yoG)mcBs$6=((Pe5-E?>NokM;TVeJ;^ku?rP#-}B3jpq&l%ZN+
z!nFE*WaArC1$pF$?;vUfe{Chqfi;@`f{hwm+kb=EBS;O#PU|PDQ%7$*K>X7`Y6L};
zuZxKqv*<HIk%c@1jd7;FpI|54Hj);EBUd5bMk=-7D0d&g0OHK_SNjI~VZd8oSPg?U
zKSoimC4R>*E4FIPA`ha9QqOupkB$YzmT5o7U{4~L1>&{SWV%!|NWdpr*AU`O*uy`w
z8C_kn`fUz@)3UvJLSs@|9j9t~h1D{2+3L6oPKpHA*k-aqtvha$9`=55CCu4RzWOJ%
z+tI(KqwV?U_ERSuQgjHFNC;a|81yf!7*Km2s{awU8!DABRXi~Ul63?sQafZAtAemW
zKk+?PF{UBEkPtpl{Cxs(Eioanv_CNzEHou?aAN0+sgHl!h;rx6d+n3%<&N{pcg}PU
zkEhA=)jIP77kMZpLeJyg6vQc9`lnt$NFcj3%SSKH$xi{!2_ki>Ez8pKk2G0nAN01Z
zCMCziu`)&N@Vy3E?LpRo&;5jYYNJhG5737L>!hXT469M+akH)t6?Ocoyt{aJPqDY8
zqs+vmQ_PP3wH^1C_D0!s^vHpTp<}8&`$NZ;F}DpPuj$Tj8^-Z1gPOe)2U51wj4b!T
zEce=jDa}l-$>lATntFj|h-IH3#=GvpEEN?Ba;n?wqwZ8s=T<Nd?H)K|(@D?xxdhw1
z<B*0AoQswd%>|Eq+T}TT!F7@&@7dPEEBkvN!aPABJmw_lxh_*#j0{?@a=CRrMVI`p
z+@IB?ZmBZ0D-EDG0hV4FKxQ7}<#K_iZ#WAICqb5_BJ55H+6uW#TT%NvPZiI6nO**E
z*x?t<i3nfG3)-bx><LEHRQE~j&ua=g$4nNQynUc(CR9F?9YfOl9!ayRxgp<@wjutN
zB46pdhQ5&`KON)@_23g_K`>>r`QE>h!73_(mcgtlf|`|cHMXMbqQ~C4@Rx-e2@O|?
z0;M3`B)bVQFy1<F*DzLOM_*U1g<;gU7<7s)-syDbOYl3F(8c2F_s1<%ioZ$B_&#eX
z7u3q$RxeFUmGYM?#4cMBUYy)*F}iluy?uMa8*reu72u4ao6AEuy|Rh5sg7?a-dc!t
z!QE+)r`wZsiMt(oa`>nYitf9>zk7}fdy>68cUM20C`*-QfY2=2)igUym4>`tJd_RI
zt0-e(k+G?gEFFSw=;IkhY78T!5GR=5SVT`H?~RNq=3pEDFXrCzD~>1X7XAg-5Zs*v
zhv4q+?(UZ0E<<p4cXzko4g&<Y!QFKRcX)Z8`>y*p+%NT^y8Bdht+V#&t~zILc=1EH
zr?;+%hw#T@3Lz<Tr6>lLy%+|xS29ixqe=zoi+{e7RLqh?9_S~ZGJPb-uY`|kXN^jl
z=O$rvPdew~jAmlvg^fnyc5rW%yh<8}$8))=EJ|LM4$&YMzbH_Z`bQT&@`oYDx!*O|
zdQezXJt9?0Oyl8zcyT6I_QkG5;2tAIvykrPtmN(5zfp;E;_2CNnd<18MBApfGr1|a
zFelJsrM2+NI7a3@{9i6siU1AWi;}A&SsF+63;fyNC><r06|2HQ1*XVYB#=b?SGwy$
zVCw2DzZ9TSo6_rtgUy>9PyOj?Wn(Sf^>|533hu4H>CdAirX?vOke>Q|>9Jhtr&e-O
zVFx@higRMD$bJ5CVg6?CKM{=^Pd8p{zuF(9Ms#vL@>}U=?;3&%AG~jBxz*Y9Z1e}C
zvQOVOZiUNYt^y8^FENBRvhe1bVy0SiZ8};J%rWR2rUvXtkf)e84t`+?5bm>N%wH1d
zVJZklRe%?b`ZMx<TFKH*-al01kJ#VM<1U%*JEFSNZ(BQu6g(Q140fwKSCjG11UX0Z
zu%Db&9677Fxlntvr5do8^W!iTdWwxSZX8pHd{0o+62jcFZRN(PN;{M))csX|=7NAm
z+3n`PHjI$WFX&ZTZOcz}ewGm+z3SfSdqe;=3oDO0BH30y_J93lKRP?AYvD<oHb|M)
z<`LOQN%2*6`#ts?Wk2~;=7znI;&Kv=z3ApQvtKCpbQ&IUc6>#aOSfO?!Wk~)m)3=W
z3^I@hX*2vPzxIwQNhd;=<G8}NQ=jCKq;Z!M&Ilb1vh7uXZ0=kWEO4MFbu)pA=AJRT
zm*Xb9qsB;x32E4e&IPc=1?kuUb1Wky=mhA?sZK+Z@Ovn?X3k|%0+hGeGf_PIh0Ctw
z?SsRI$k>^0N)i0DR)Nc=1r|R@M<NxnPYv<(sGD`H^|tFaL|-Na$lVtnY2D|JN6f+D
zIheBPZ25;Dk?TZh0%V4R@vCNe3CpodoPfA-eD+++U4~qf*YX=Y2cl#Sv|r7v?fgtu
zLv5dVxtTuu{5WYY+WJ!(PCQD(@c19VFgP!HAn7#frc|b9__z)YlU07(Ir`OD+Mbk4
zYcBk*Gfw1ZuOL@EJP#MsQGjPcL){q_lVIdHK3PwxYPIG%G~GST&fZy!qHk2fNh^zO
zO;yXzM^DUE?}<*v+K@qAUiqqCH5UUttwDLh45Mg<=<i@&b#E|3GUI~Sv6Sdr2pkeB
zy0^G>@QihVljt;p0QGI*jBCM)=uHR`v@S#QCp=o`%vcuu7VQkVM@~T?MqTB9LIszF
zOu-gEBaX|u$MCm@DqrL~PzQP7i#Zz<uos|E_TFx4m$#qTIaiIFDcV!JmMP;Fls6=;
zk2j~SYaI&~hy?hSE7#6Qn{jzX&%hM$7mSIjg;*j?V(XHrV|6iC+!_}2>RT&71i8vv
zOI4pf5nYro_#T1y8?Wn=!CR8AeEIE`XB|W$<7%&0ZbHQ3)I|D%&FFi;Wdoh-=4<N>
zHW>Sk{@ic}c@gDMZ(nPn{S_b~P!I|tOt4SzEcPfiqM~NSBf!BY!N<lY%)raQ%E0iV
zWo2V!R{f@fr9+^DuS2+kw?eS;z0NNoD1kCzFJYR}iW0Zbot=juwp3iL;MSV*-7l((
zwdiv-#rdR~iUbOUBIP6Q<FLfxCBwu?10=<zq(&t2A_Q^$Sa=0^`FMqN@eCL~3~?A^
zYoqLA?V~PZFXK|h<0O?Nm87O6Fj=@XDK+^txl(b)!#QF(;#wkGqBii)FrsYRM}3B$
zftOLJk*G1BMaw|o&<|m+hOvf<#kuI^_2t0PdTkYmE=Aqx<H5BqrXJ}U#cS+#3$^=y
zyi|2&*K{!{!&4(QZD@<pO9=u6fR^4iW7yud_z7mFg>771+v_M~6M$0zXK$Nwn32Bp
z4leHLXV=oT9`fDb$<A6e+)nDwlx7n|X;7>>cE@BVPtPMvDwwpDVy9t;x+ZrgVyCQz
zH1uY$s#<Voafbuc204Tv+Cla9)Ng<6<%Wp5iMIP1S(LmxT;peak+B>*{-%8Nr;+|j
zw$YXqfCZ^s6557`W2Yh;>%dQRW1~8=lm<<D{+RJ&n$cQ`N_iI$@XRBgd6oT*1NUM0
zi(_(=qK<gQrxDGZG;xUqqU>JL^{;VSsydvs!p3k#kHU@OjslgEE4(>klu%MvT-S;;
zvN_H*kz=fKf+@)jKW=|i-(Z;exArSAbkgA$_s)c4=|L#?t4BkZqb9fI_bVSljzQ=F
zXbTb&6d`j1ToMw%f-G9x*N?Om^xSWNf3{dmUvc$f2En0QE%smYQ1Y5HUxogX$gZEg
z3)km~OVK!e_eFhN+ja<YIC01Pbr2&g`0dGII}dZ>b~6|Z=Wx0Sw-waUio_|k`OgoA
zPnQex<!vK1a<lt+gW91PLTgVg0mcoKo5?7;FbwW(%M_Fse*fkx_m~Q)k#~S5LVYJ9
zZt%KdwpZ&<G!|E&?M^dfQEmOyhKkQcQ`uaB&y%l^s9B2Vm)}50tlH*NePF3JkWk_c
zB;l=@#b;QC>VfqwUHkf^{0Pz({`$9=J!uZy=|SU-Hk;&bO}9126BBzi=%P&z%$ypo
zvH#*6^t;dS7-is)^;aa<dG2cA#>6<f?`qQG@3>$v!sSgUa;!#9JgTBt9$+=KkPUCr
zW>X0k*Z9UZCHS6>lvm6~@Y$p21IiZ(H1+5Eah}Hqjq}<D+4A^CO$`kxZ-{w)r&fLU
zskRli*vhunwyng72pck=pSqAM9x^Nhnf4+Q!1JuI9T#}k*E%b-^DfU`v`HE)ROV#&
zve`L6txiI4{2QHrUzTdQqS7n}0z`f{U~y%SN8rs^mVuowfB7#R&G0N$&AS(}hJ67I
z8Q%Wtn7L+jG%ECk2^zVdzh0fCBDZ<*-vn8v(#ta;9rO-gU<(uj%#Co34`b#TtwHps
zi#13eW_oEEI^t_190ru+pIR00gzBb>fGuK>9&K)Yzr4yiijdL5-1Mo|H$_|2q)B0+
zsubUKZ{wdH!ndLPkFl2cI4(nFVk_Q9C_h2yRO2%77b4!B-`EV$N}f%9`i}oeQ+ZMk
z5421y@vzJF?R|-*h6CCBB9k#D4+w$5!4MxNmqjDz!IIWQ;*CD=b`W4`!k&PI|HFqJ
z5sZOp_D`<3q9q4Hy)R$q_bp<BNf<w>Tc&71Q7kc(DrPaKG!>TX4}HW01^%8jW}bE~
z)FHf^n*qP<gi+N8W@|GC^8c73>`TgD<rm(7_B4YBx%Vjr<zeP~J0QQEXB2~M@!1pL
zdi(NQBQqu~NLp@DUJ*Zp76##3v~V!m=jlo*w)!V7g&Tu8<`CqmkvAj(KC2E4k5t!6
zV-gk+=F_hj&-6SssbgW4_?0_vW!oI)O%dBIo@VI0t@Sv9!C*uj7#`*Ov~TwpR1*GW
zQ#~K_ccD^DZY~Sd@6HErddVKK=xeY;GCyWJQ*_6>zw0g;h}Si{mX96vSWcBjHgbA2
zmAW^@y1K0+<SCsP1+fUq`opO`xZN6!Oq=6@)1ReOoLdSr>MVQC<L;$z?fUDa{C~v@
zAdNwLb@0{0!BghJ8KQ1V)BE&a7u)k@$w)L;m3W*njQ`UR{4U($<rAM|g=`d;z8*nE
zybk7?dHTLJMV<>LcVJ#*IK80w;Za_?l)fyd88MLqbc{T_nbB;DJg8;hgSu&y6JYQ%
z{?76@QNzX&`pn{cTL7`Ri(4fK+rLMy=BhgXYN+7vHS$GDQb~zo>KI*Zwub0Z1CUR}
zuKnZtlgQng^{0%RhO@d>hxlVyN7KyB1ldQll$u@!+F{VyYgY0@p4EZ-l0j&S&!4&7
z|0a<%dRn)V=aQrq=lt&+?flCQdA<4MJO*dr&m$^s?<uaiuAbZ=C(#G-xyk1wwX0i)
zp|Ei$C$qJ6G&dZ%)rC!*K`c|Cmcw<Rpl4HI(q-fjfONbr-wSTsKlFpmSL@tCV;Z^g
z;MenOcwr0PJGW<?*5I5=$ZBs;hHYTTZ_AHGm*i~BOX?S!V5Qw#=XK(LH0+x_QYR?h
zJxPBx%@{SLPTys^cXluzK4pfBFTsiiEr-74T|V%>xIn#sG(A*WzGJfTC3)sPo!UOf
z&8FvCFx`u}=6zec|5iAfBUF0#{@MI=rS@BsHz?e&Kl(b)p;)u&4fofFFAq`C4U98R
zoIX-=Nfx8@g1zHmb}|3o;wsVkdhDe95;lJ|jfzK3Ng3<apQK<?!xE6ABNB__ga(*#
zK-3JvwHz_{5!f(2Fr9<9r)=woMDic0dfXVToD_?QQq6!-fl5Eg?N1Qg-1O%1Rd*UY
zG|X&yk)DMA)C(7lw^bin|KccH->*r;!-<MacjWq<)-*ryC#FNJ$ef(*ALqXYrhc6J
zM!y^1!dDGVNz9SRtY_<{v`*P7!3e=<QBMESdec2Lm!q#jb^bgFR!jx8y!1V=bgiQ6
z$M8dUeU%}&8uWtB3Tx1W!}){UPod?}SG#;3Ir)E^w2n%PvItvV1xOz+Lv@fp;rnmm
z5KlMGH!a`sXf3akwO{8Gr67EU@?zAtSJpnpYK99jX0WDa)zp-ymQt2o&|1LX#{P}L
z%merQ%v7E+(w$#Md3wTJ5Wnihk#`+OG;Q&X9%I~7CaHJYw#5{hJ2qE+KtUm&y4(8i
zO(dO@<^!|`*SyoXI_IlCt>du8DggP}GpBcF5;qTuBH|^jPexA6(7vJNZ{}QbLAR6$
zc!CpouO+0ZeMR%4m2Z<d5KNoy?g*~KU5*s)uWSco)1n_gAHQq=XTMKnv`t!qosxXk
zHTp{D&n|pO0BDaON|B~AXA4@HXj?C;iI~dUH^hIaRlhyh^mgj39g12L$gnuVR+QCS
zptjfB@K}dW_Sl5+kLePgHoGKhq<D7b1i8$5R1ogy-+s#kpZLvh6zi<+ydU4rzBKA(
z&9#=_`3x7sjh>nAKxqd9zB;jL9u0Wu`JE93gOUEKyUx4CuA`VzqP=-Ul*eIpKOHwy
z(Wz?8d!=T)p#W$~xYTsxtWTn0U9=dXI%hSd6k%PKzjT(5WDlN`dF=!5&It(be0L`f
z#CbCWtVxarS)_9h)zu<%IJWH}Vi3x6qA#rhaSdF4LxP?9vp#45hPJ?S?^J=~G;e@Q
zmUH9_&30Nqh%t$FWr_FAb?n{;1G`<|?>AE9OhSAvaVv>GpV!K9HpN}AS~fK!@@2it
zhdSfRkEeAWSbjnsI<pc@JvAYFs4CXmQ|?+}|1LP$w_$QiKscp|HF*O5_rIoTKwVjP
zil?7n)4#4yWozggIB{QTN8c6{OwQv5nUViD1gcc-dF%Tf)M^Tz@P1g-*R7VN66QaL
z@dc_C0~~psPYE%%8Ghmt{f8mv?R;bR6D%as6m=*GmUKvhlyyD87)ka}|4+zk`_iSc
z7c&<l%}L><Io&Zu*dZ$RF|5-;{z$rQmd!Zd4&QQc?u}fwJ0`whCVp-%0KaNb|BCav
zw{Dm=*QylnU+BJ$@FrA_(JJ!ie`9F*+1Zbalox~U!RNZn^d)Q)(*D(yZGA<AJ#8u4
zY*K*Y>vZvL(SKF}?c4u3Li6q_J$U~f_EE~CN2;7cvaR`})DLZSH3z7RZ<$e+Lv=P2
zsN~XoNSEzeT*dUjf2H+I<93=Q-ODk?K{>r}>)bRPCM3XKTX(uupacaPiA-xL+csLm
zLu8pHX6Bx*PhSK=^2(gS-V{%mkSnq@+e6wPX@^B@h;K{UPI(QnwNyoa$)TtJ7?t2_
z@5<VG*=3}~Ebrf+c7-2=HGr@H8JqqGp;aaA);RP(>Ocref7^b)!Hx5?$*``x$F196
zYZr{@h&VLy+QYXK^C34t=s*k)HWeDtIUdapOw$$m_Cu_=9btlf29GKS!=kipS#AbM
zmzFK5ft@E8dILgAQR7#0umaHBb@4fz0Ze{+B_8Gv9@_{KVbs)9I3M^MVC~^2h(nQ3
zSjTkJ+>QD!+U&1wq2O16Te&{LZT*jGQ&pjjlbNS(NkvuY#|~q;Y6D?^I=|Y2&LBOh
zJhk>#?~0+g>%aGtd=*V#kjX_f%Y(vh!RT7GQ=#*NqhlF}IyGe7X|=Jo^OPiEA*TH$
zKBGg-t2b0jPIkJ%s%D0;LA>rTcuF^OAtuM7w)V%E7JWbJeo0FHWx)!+lx^fI*>%vU
z61h;|E9L%M!#!e6de#!2J8)wg>Gu--Ecz3FoLj2nUhW79m%-hI(D9pY<NAO6uiyPZ
zT??AvRRE)=A8nOSOrfVUz1N|pcVS!196b-NK?ODGSQxGja?>cDE-%gma$^{1Rc_N$
z3%pUt2K|KctkiRzhXug*>sk9v_NXVpMf|~RSxm~tM>*S0J)<rob}0fr!`rC3i#>;^
zuduZK^;=)%OiIe@L$jIiVY{?o#$>$XACmyEw)E@=Lqa!4x(jO88P>=4m!-;I5Zv;D
zm}mZ;gXedJ4=uj7zKcX3r>#4BG*ReNO`p}Q|CSX3OMJNK7He&aKeBjpc>v%XTuFeU
zb`WZB!$g;&+JD%WEgj4=gGv!DE?GTM(#QXn9o!-<^xqa@g`kLPj;Gvk6M5VuUWL6J
zlDQ%cD};QC%ml&O3Y<613ts9TEX=_<tB?M$0<YbhojGrivfICx>u_5d4rfzm(Fb%F
z5hF;>V5hSvbIv!?qihV{xdds)xU(|;cQ7~RW?icre$IVaz~)Mh%(5@L@SO=GgrvL7
zhrW;WOkm(CEOw(V3cj&Dh_p~Mc5^-l+*eo79R2<pL&7BT4x49DTxInNY~K7r)aMx%
zP!rTta#?v-(W2pSd7pQCc=?L)v2gTcf+)~MI$`OaYe7QEYlqPtcrQw2bL#@vn(kX)
zw9{U&-vqjDKiNO4k(E98vAZ!8;mw$-6yV^al!TUo7K9$89d1k>7*cDU>bzuEiP1`%
z*ez|@#=1v-y2|xu+R^Bp{NaGq8QE_9?b4!~UWO{XM*wCr&boo=XxdWSuGDQ~3bKbv
zfnNvupP2n$otYKxv?m|_hO(A@$dvrAzr#KF2JjHXZXekXd|1vb(<WL<MvLN-Q~C#z
zmUEAL_W&guEEAIkkyo~FJ%oVcW2vhYQ;$pV%iTLuQTS;Svj_Esq>c!w*$xV*utU$L
zj_QV{XLpmvGmlSXzi704J?#KcQ!*WBBhA}}88YqG;xHK>@33)}9yQ&^jFrO}xydQf
ziqCC$<~?b1z_&aBwH9#GkZR={5~5~o#{bIwSZ~=?tZ*8z&;8Cn(hVlhofE3|O`NXv
z=uP$J+Sy^Wb`SLzekxY5W#p-)#$@jIt7qI}ZI}(!xt!>l2v+JB^>H;^I)YY=ecn>A
zz`?;JTsmVqBwC#1LEf<&jFUJbTaMPnH%4mG(K_?KaRqiijp`LX(KWqRU*B6vfAX65
zR}eT@gO*m3xxF%p%G;o`Bp_UKM_9g(!T%&~h`I$vkPecbua~Oh50ZP)dX<I7;n>EF
zLVEEh=9R{R=9c=rj^jePw9zJ{>`$2jje=V!AedBnePA)EefMc+KZ;cpUjiaTcbJC&
zGi<THg&Lt$n9W!@{gC-S*zLtHb$JBmC71YAPum*{-d_Ym<}e`q7`hEU<UFtr=dERT
zP1{n*0N7e!W{@JzGW^rdpCv&oGM(^ZdS9NF%sx`sr2vN@>Y3<)yr732Ok)9va$!U+
z$_d&tXiH-kZ@M_3f)0`8bVgv?eII^1s-O564}2%T{vAg*qg{FP5@8DNC{yRpGz_C7
zK^s{?*?bG|gATF#OK!m}G=maXQBW_)lOzX*cPXoZjJ7~aYR3c4tUb!<yWGa?0&x)r
z@<(v4I2#+f6&Bm^6N)!qP!H`PB@cA2uyc2z#UuMqY<|(Gm_AnsCq!+P75$Oo)u?|#
zoVtG#vh=eFQ$fCR-4MoKPSjzGbf8sCQGHn^QZV6AsPwqDZ$RSi&qw|BCeEREz?J+H
z{j&|_b1!Q{Knw24VA`>7NS*g-Wip3G+@n^uspZqIje%3NtN9<!bS6*6WppCB6Yc64
zjfh9J1A}<0K+QiV9n5`7$jTgho<0uSPp+0GoaL_@bRLHJ&C$B8^J17g7SRYFVfR`1
zCdDrD{DWwsR<wqz7L8%g|2ASqCY}`nlm^+(W{lF>)beEe<A$b~x2<S-t4*n+5ctEk
zu4yee#g@p03ItIM@}~ti<t)U<JZTGPoGd(vDSMFKkl39({r{XS<irp!ZpBU-B=^AW
zDxJAV`VnQr0+azU+Y&%+=&>xwQhynnoxpbM?2TD6NXw8_GC=yW_*TUcuB7Sezu8Z)
zbI|mz^q%O!Z`VNw2h|xrePQ7Lb083qmdus$oBqsOpz2P|GN2m6`RbK;nI*%K@Uite
z^;G7O@dtb%58@;^gGid5FQMsq-Kjc90b-Qukpjr{GbdR&6uV3LpTC^_lI16iTzPAw
zFPyM!6PXfLn2(C_Irg~bWeC3|ttxb8$?*GfBhz^En+Vyxg4$YEaEGd#%N(^|3bgun
z^2e}a`nUe{3JXH@U~(rT_!2$Y^&942@sn>aLR*55;{<tzNUZO%Eayi)?)rQk`Lt0I
zr^OstjSe7HeK!jmCWpu*N}-DVXmER;Ck(!fvRZ>y0e%=(En%Z*W-dr=-x!cUbvAO8
z{+J2=so1md=aIg|zJ#|UW-)%$%>f|HAviv>aQVvxDHo1eC{kWI&P`~e;4bE_v8st4
zBWdwPxIBtMN5LyX(@FDU!Q1<b>x^f7Oq_q1y{27(1u@%R*N)7rW1<j2_UWWBJ64<~
zr4kq~j+|xC#0-gyPuEgzwKI+v<7+*y@MQPbOr0;4AyBfoaOqP$Y$&9q^GcI)C&~7(
zTc!4Bxh$zyC_*vRF*fgdg*t4H?lAzq64b4+_R*$;tUJBff{RFZ^1KiG?aMLD${8br
zH_Itqdvo7aQ;++)g_J33?zQCBtstR`%e~rvR)*e!t<Nhlu7<Y}!Sh3Y%6cCDWmYH$
z7pf1)F_IET&7FO7M{<QW#L6hB1)5wEb@5gNg3If~MRu6Tu(jDfIq%wS`6U1c+dC+|
ztG8em<HoE?woCp35*JWa?DakO8#1m>dSTKT(NLzfZWgu}eko3(LFA>31b)aa|B)9l
ztUK?S8mv!IgP4txgaZ`LQ~-Rx%r=Hp-VwV>a};s+M%h{SreIic>{1-yv~|?Ba%i=r
z%S~A2Hf5{&+0f^_Sgk6gXlu)EUw&!D<8Zc>!mm`J=DRT;RgA`#o>&c~h*EPibK=UB
z^(5ygA58fcr8BhM&4S$R?~LmLjOV>&a)~;5|2w$NzrRz0LCTd|!uI&QLtK`3dcOtE
zWdbK}E20d3_2%m=y_ejtzjxQ#9b8zQTfvO1KA-%pc`op+Pb<9n#ykJ3=ge9Oy=&1+
z6Qb~_m#x)6>3fWlxzGYGy+npA8DB(8+8j?e12%gkml|tSW|wNEgzW5I*L?puyil}{
z_sA|a`X$v`ETRqf=SG6Iu!-T3H|jwnwRJr6o3RQ9)2m@_OMjR}*&;(=+<KAyv3!|}
zNepF4W5ar=GbhEKgVnAJrI~_n)efTyST~T*aWb_ooP2HTdKr8u)C=EKP&O=<G$Ok7
zJ!)r++@@jY`wk;b6=rx*gkY;NdBYDsi?bo{qD5|8{fGB6zWo&iac)9$x!QTAJK$B!
zmq3e?)|riI>^H2(-oB;XwG*uYf6^yH>7m#cJ)>V8Z+%nSG7?94ZOXG%5wC%T7lG51
zHqlc`Sa03doL`EPE&I@d!BK}QRHGX*kus<|4u!V{^(crn-2TD-{{e`Wp~qkxlTE`n
zSBYu-NPB|cACLEgLXz$O_+!4a)$=`??MIKcOB<N|rhmkGBzq*!tl7p-yFt@^<$3jf
zZi45J-S^NY6tCg5wif{iyiz>sJeu>onr#z7EN$O7W!y=712TvYHj<3nGbj(bGo_jO
zwrma&#-c1slRW@XUtwGt-19P+ZNF#e1dUL?)3VteGsDEGW)`F`d>iY$z0hROgAOD3
zn0ZD!-qGE|Oc~8B@4CVRxE}>v*q}Rv_2gqItqyCGopx1hOWqy@cJ|?omK$K&G#wvl
z-#QFI2|f(m20S(pqWfs&`@`fc?eG=am{w!rY03i(&VKlZ0eFvHhX?qKk*?vQJ6Lj8
z0^iwXbto_GZ8wDV9zhwlAY#FFIRoD_MJR`XDZS(7KEd~9aN(uuel_d#(CN2TR{Dm5
z2!V6o|KdvD7+MtbZ|r8o5jI^yv!qKm&aM_IhG%f44GS1C!&-wp{s>fbLic>fVeMDy
zuZtHF(-R8$Aa}=0lR7iBW2ZK=;^oh69YeeHb);H`C!XKU6UHR3qVYp8H_U!+eq{SL
zO-=4JJ+o`Q|D&t{&Y-D7r+rgH`t*-ETv)kdy=9iXgl2<~R9?p=0)(UfzxJ^h$do%(
z);8K4uhT)j`}zC=#VLPN=0vz=ks9<{hV(>NoyaD#JnTu6*wpP!y^XhodK+m_HEr55
z!mP`OjIBVxyO*bCAs^g&ST>Ivq<4vwhv(mSyY`_V1x*k5Ao^KmN=qQR&<`W*z<r81
zw|)MQA@eEK(@Y+VO=8P`)T!A%BA`q*SSs(NzqCIvu}NOiq?%tm`63!#k~@z1_G!u(
zmksn<8M=eodKHe#B>tcE;Zx3hdmdhofnI4FrdPkwBw^rg9+X4LEsrD3BeZQz$p*W(
zwfB_ghL^F=DeLJBZZa!HXL`p(4;S`&_`AL%E*`W3+GKQ=Qw;2<H<~>RESr;DNDlR3
zACrn4xjn{OXSQF$sM2T5?Sp5O;B#uVJ>lgGTnYFSPA(;n;28Gs!`}HER92z*Jb^vP
zPpbRkbylXv2q5|3EbX`dczExdq!pE)?c!G&u7cQzd|~Y@NgUw8|M{qPd^%do|L<r)
zlb@A2-}r9Vi?HiBGLstEh)IlAsu?^4xiQXzVr{OJ@-yZ&k_BOQWy%B@-%*BqWTe;G
znz;$u$Kb^4>_D(AV1tn2ucMjH=~D1^e|7e%rCwZaE6!s7GlE8r&#Hz3|B`A%d?%~H
zMEr_wpI3_Gw40I6MX4L3h<a;9!NS%F=f(UJyLkHR3UGR`^aT6n5`fZ(q;DxCKK({r
zaR>8!w4=j!iRVb_e7Mm6Vd%eKZJ#DxKNfsx?D(Bmyv2@HhPEq%)^fi8LlAv;9U0?h
z3M+nDZdLM?ZHeS*ol?`!H+?AFSJvQ`_&R7_bM|@peRQsjVy*ZV?uKYEgCWs=y%YXy
z9?ptH38RrFr^cV~toZC<KKq1Uu0g}}1NF%~<qVgB;mr;?<yPy~owj2#Ax(uVZO$z6
z-SM&Cn5#O4Kw<TQkbknRf6fZan=3nJ|71Ht%H{p9VqT&7&Fen@w3w<Vr)sco52(e1
z3}6044#J5e+n-?k#jLKvI&KiU`wlUpJE07xW?PR3Y;CaE@XyQ8NdfXIKJy%P>Y1^8
zB2hENPi|9|IKy!%pRk0Q8-sO)QXGCn9vFc;e#fH`cl|rlXXoW~1GHJ-du;y*NOli3
zLhVJHWPyx9E@<AMTVg3I%&N(Gl4r8s8IKnVG?5+2N_<dMGm-pr=h7M+p|7_n9Z<J|
zRY~84cKmIg^L>vg=iz0gX!DLW0Gbp9=!xGo{rcY^O(`C=yG%y4$FBb=QSzV(5l;%y
zdv{`Od~+sbC}h`RQHL`^`YXI8YQ?$?lP!HB{rnRfK<4{}F<01%{nb)7%%lwNektH4
zDA!IYltL%A7%v-!Rke(;#Dr=Oqw3LUdGR_kSP)Tu<eE~1R2?9EG<uS6O`Ly8;fMtC
z+%+K;nbUJymqdK4)s>P3%XZPF*B4UM#Z3LCYY^h{177oA1o;yuF8&KEB?G5;s_m)W
zZQLba&Z^>fPAMudJct^iWiiiuz#SZw<Y69PY@MtDY7P!qQ;SBAFx<>qG|M@D$AnJz
zDMk&!3W~Jlzr#?|{<D#7fRQ2KmD)L@;T6i)s!4B&Y34v04>~c4C<O1W0e^`8d$C@q
zh&7P<{`|t1NUmdH|C>0<2<@%tpO3*zMMQC4gjiROBt6HKJn^0OF}~ymSHZMp*z6$G
zBSC}@I}qc!`G(-9;u+x;a03FZ!KN-*g9EXeT@MeTg-nxg1#$hQa0uQ<DS48<H)AA~
zkz!Y(l?9sg$P>+d(3m|T`_YhN$WtjTs*E#klNp+B_B&0e<3A%Zu$R@iC;ue_$i}y-
zB)`_)Bw`=akdVW<n|Os?`ER3Mcv%5e1M5yH>R|6BT(oB{VL`*NreO92WA;S{)ReSk
zK?SO+DZgzjs)zgxS-lEt`y!B}tL2|@+8FpVE;Zop{835t74*K%cIT~C_^f$XT)IkC
zsW`e)O$3b)<dnaVz@gkqRIL6L!^;?TQxCVf#GN&xqT`2}N>#G%*^K<R3&t6JS@TjF
zu$8tJx=mHO)5V8;Q(+KCA?TS+it7*0EyO(j>a=OenqDu6gU>(~H>;%@M5WgwUdLKt
zab2mKJF2#;YWK~FL;B(NI(OhByqqugt5+Y(*)h*4@InaVsVZH8cuzNpaMl;Y%Yk1=
zL*;so&N;*qn~f6In6>02x&w1h06`Is*|?qU#*I4tizPg7<^d4yraa?fDU<CxiDpBl
z;deSFh|DFd=^fOSZ$P{9I}v~o_uTX@;kexD=iI#|F6B`&E8lRWjJOI$XS55-qzIW$
z_FX%$AUi;a96t94ylv=-g3uaz31j#Uu{cM&H2YCO#j^t$X{aUFBf|C5tuLA{`ZZI;
zhA~qWxH)&MC1vRLT5Lj#*`Q{^Cfdm&-H-7KUa;;X_C7UN$(Z59`d-%Yu)scZQTbOX
z92@=XaER?n<6{oj_+sxoVlF$KE=T;5a=S$Zl~WyHoOf2K7cp-eEP}!Dp4AQhZ@O^#
zS%|IaWh!gKPf1joESRFZRXCj825PQ2f}K|MIFtfLxtJypOC{k)w+!YtbaX7(6NOA^
z@SG1g9+D%V(@y0q)V}1?d-U;2n&)-Rr{->T^lEvSow4D^MHIK|i8T_{0k)x*>;%uL
z0dS}C#;*=9U(7DEp0JEYuEvRakAWLTS2OM7>vijnfaVuYvPHJI-bv@EUWc2as7o-y
zf9qTA)S>OYhteo6{%DD9qqs+a1Igac1{g!l;IS9+=9kxI)IO`V@$5%!u%N{an5i+!
zJ9vbSr6yJjYoH-r32aDt*&uBFjX09Hdj1j}mp64){8`DE<=_l(pF(p|Ni(}ZVl_1J
zgLgbJm7IK<zDdA-)WzlTXIs(<c9CUeli_)2zvU1F&=@(fHM!<BkQ#OKlV-$CyTvg~
zTn!L0@2-tLdGO@Moyq*09%HPSzTzix7`i)mo2}!SGP=Hc`FwZ^f1t$=41f-^^USpi
z|38b;n^v~xzvD>&bQG+@Xl+MmZU6poO$EBeNpoKFZQ?HD+2aLf>1U?a$G=DJA-7Lq
z6OjT(d<VG`Z*X0FsYe{z7+re@L~7hdOjH0&br>X0*<LPr#nh7Dvt8Db`uuhTy~?uB
z>}>{|)5eOkhXBzT_)iSA<a8#Cq+>@v>OqJ}+<?S0{lO~%qTY!T{z8m<mnr<(5kF}7
zK8&EZBaz{r4UVq^tL~qB_P3ur-&=?{UoKt^EA+_OW~O#$DG}h|edE70iqK#Mn<_*7
z6XCG_PoW}~7xJ|WzF~%hSIh%%pP2MNRD5#2;!x3VVlByGr!(I&XEn}59gdY!i=bTt
zNPeb&j2O@HAOITidB{+Z25*#+=;Gb%VJ1Ze=x5+J!$1(YE<^zngyJt6E$$a|$~E>i
zX>4NS-fmS^p$+<Sy6=|9F6LP-sxd0c1ct$B+ZNU+bwPn$%4p96|5-yj&0uc^7WgWD
zGY{3@_U^#Vr&tK~g^(Xf2_sy^!%~oQE$U%Kis6^TXyoH`!7-L^Jnd*4!$TJ^x(3@r
zI8knN<B@)iBAt+<Eq+6_C}33&S)GB=4Mc>^7wkeN73MpJxtbY&OSSoxVBb(FXvPNQ
zjLyOH`FS7n8y&Y!(3oh7JJ$(r4+;o2)pND0(PXnXUkA2DUH!M$u)G{=uwVnsVj@Un
z31-1G0{|0ic2`?D=-5EU)oYb%N3oJ(K+EVFJiedqCg=UjoeE1h4U+yqlV~%%GS(>z
z{fV|fI(w5}|Hq)Pe%U2)j9GLUJ~OMKRo8g>aiPtSTWky)Bk&s#0VuD&yf`tHp4RZw
z48_a_h!m~C+j+NOUOlp$e&zN#XQyZbK9ApvW4=)h%zJ;B?BA7vJD=lFvnpC;54PnK
z-X*lBJK^uy0%M}r@E%<4wkl%0!mqgZbb#a0T=-y*GrYB7_-Gt6JhLCb^=OF>H+rH2
z`bq7C1~@YT;E(7Ugs8_|yghs$wy(xSK<>!;atDe%6yQ^I6u!T?z8v1g+CW>96X_m0
zFe&<|)6HRJ(j(=Hb59Jn#r&4dwDr=U<K!n$8aNTngMY(@6G8pM%?ZdFjg4Q$0<x^0
zKCT6RU&*ca7op2MIw2qI#m0QynfV}AXvF)Wqk{VWq5W@LT~$ARQ7$6DN?gKy>d=Ad
zBPE^e?9CO$&oX(PGSSve&9`;r4(*ceDuZQ_%?v1HJj!NfX<uPVn=7mpM12EMQb{P2
zg+~7{WndhkdMc~_NSuIM`S_jz?fj0c8c+S^^6H`p;{_2K*|r*>(fvJ?v_uNaX3SC@
zV&I5jk0cfxaZaJDj&&zakIoDiu3{Rh%>art*e!0OMa_@_#dq&3^B7L0jPF}|3FoFg
zq?Ax-7;30$sKsc&FUp~6>@DhN23@wi%9v>`$|u-oPzW$cJnvD@Nk%%8D%|+x4`bTU
z^Lnjj<i7fL<#fH*1IOlf;=A4<^;jp_w6S%G6dP0$ydi=+@QG#f`-%y*ITUZ2pS#)V
znO*#Qabxp5*JaYUz7z^K%u+j82X8!#V}ijyY|^XT6DM+t+eOTnP**Cm4KKeckwErk
zlnnh=XT6F9bbv`3{5s5D^w(b%YAt%w_fG;N#^RV4etPlJNibv4CQ=aRP;^sDcw0r@
z4r85+kIhSn<>6UiMIg#0#L56Z`CPG1k`ESDi|vY~m*h92zE`4Cuwc?liV?CT)clr1
z5@vJ5LSP$`AMT;R@>SH(2dnqDv|(te#VYAzq#d_a?b0DQ8R)~)+DGW+*Yl^sLmLg@
zjyZJfiB!t|yI`gjv*12x>m?Iy6`&Kg!YOtoSdDQFiKr)u?e{~M{0&j78p!4)%b`;=
z7^s}053Y45H&0@lw=ej4;U&ALqs$t->Kc3CU7VAOQg#cQdgg>CEqg(!9|w=Xl~{X;
z5EESO8NJ$(DC=+5P7;@-uW*9Nuq5QPF9RM`+e|3yrkwVadf(hUUEd^F-%M-o?oC_R
zPt?^{eHp`8pI}@cU|a_>zIL0#r=9O78t7Aq$RV*${yOPtC#Db!UvxCp;7Q3z3%^Tg
z!|+`wT^ve=TG~S%_DDZw=E~X1Yd%#PGe$>w6E7Y`0AGJJ5s?u@{rIUo6cPIgqhSZi
zSEb=$Xv4}ZSc2$g!`ygGwGnndMb~j!O^Q9qNcw0ztoy7_k_7Q6iBI~hSX1O-%`6Bq
z;DBSdtOC(#E9RW(WH+PyKUHv2TTBw8(>6>(&S`xHWE<bRjZ;1~Sq5x*guni2agHZ~
zIoqe<K?03W?_1T2OvU?{6d}!&dzBYzuv1kcRZ{S)YG`PXFH&*H&`@rs|3F~_MPo}$
zfGq7)7Jk|N#h$hFY3io2bZ6^~z-#JUG2nN&XZMNRJ*?~LFATnmTol!BfLjrz;w4x7
z{`B`IlT8PJZH;|Q<^b~y8#?wvGh6qX4%<|2JQ2F0j&RsVM2CGipihT&V@#4laAP8f
z9SU3j2Wdo9Te_s5CR){og`h|S251~D8#dg~7CDrtty(fNr>$$z$Eq!9G32bRXfc4K
zEg}$x-B9>1TBAPlU*KeY#=p~dIg=FBHDkgQ;x!Y#6lgwEsT52;GpiKjtsyQA`}5ch
z4U+Q+0S!Z6!Y$3GdeVEfrz-(pCG5TcZe{GDer|2-fdX!IY+xLO5<4Vso4OJqwhz6M
zCU!uDj$mXPPXlFSR~tKM&#B9sgnCz8gNS<1Tm!0XS6u_6Y|m8#$!Z%#gTQKs;SOr5
zpQjRKYPh|UVQS>5l8C7fwvzv|Ag2oH5fxBmIOK4pr-7cftEz$6I|A0Qc>Ut9%)Eo7
zUr@PCr(aUJOQ2s=xx=epR=Eeao^QMTW4+iAn0IBRf!VsNse#<O=dMBcspg_i{EF?b
z;pW73sadyexL$R-16Z#!91ikSUK}a%ROjfE^OWHjs`Av}7&!AZ-~eWMs&$99Ud7sq
z2DQSjF&mG6AM$g`t6rwO3a<{MwIS`hnb`L_-1926D2|5douF$Hjh|q22`Z_KcWatV
zwUl$5(6ppSNdG;-`Nxv4i5Mo!V6yg`DKP6Q{@<sB;2-6Q=nQfW%i(JFktQ>DgpurY
z%sFHG2~-;v=ED$#iMS_nVw|DCZxJC4KmE&kXw3UewNY4iZyx#02QajuatBqlky!VT
zAEC_q8?{rbQ|$bk8r~*jnRn#kp|ecYMk5;VklX7y{)VnE<E)92L=SS0M&;CF?hi73
zD)pH01I50a2MFYo^@KQxke^5S75wap0*hjO?faq(1HVH?6Dj71AinpL2yPf+_KIf;
zQG3P89ZBHEh}^R#Lj4Y5dqt7m<tBnJafDJN^ASL#?nmZ^EfdDi4xP_u0L12r@nlD^
z74T01SCqf>_AzfuWQX14Qvf22L_f*5kPBFu1|%RDOu%7f*tBho^2n_n4`tM}9oF*L
zaVMSy<Ch24wcbo~z9mt{#+<VPPJ5~|9eA#!mYOKN(bb;7wb513AhEfH(QuBr#n~QH
zV<V~~+Ea9BTb!>XCBLv>5erE_C4|7AvIo(ypMD0bZimA<lnVmKKBDIt1@`%VQ3oU5
zR5K}NHjbqrq4v+7Q8(ykDuJBHGlnl>{;S2S#pu@$D@oFozFw)Vt)gi{zXbm0>y9eM
z*-Z}!nC1Qf?*%P}#l$`T&NqCNu1Le8h}`^7qVktY5d&-amyoZ`0I&r8c)?F7;Gk<d
za+EQCb?9V0=GQ(aZnPyE3$y|1?EF??SU2QIH18~nhkQc@ADHgJMk{5~#4D_`aaO@{
z((%v4gvN99Taxiy5pdGE;s~lSJ}G<r=gnx^_Xq?-jaD<?5(SRi3K;f+1kG0jtntQ4
zL`WOX@oy2wa$VOQQ?Eqw8qZ1C11;k66BvBp?%-rf-Vi(CQGa|IWigrV?HjcQb?MFE
z#br$MTLQf&qHogi!7Ob3X6A(>qf~JTv?P>j0+OX((ee*?PwQspr6Z#ZaS1CVlrH>|
zJrKN%BNo;zvwgX^1XvPEBd_RainmT!1xrcGpOh{i6qH6j(G1skPe3#ChLO=$5=uD%
z$(|eFzXuYwaehkxB>idD%(8uC0x2#bk6%*LE4um~PiK#%5oBhbGcpPnmk>ijnZ+;3
z=@b2W&e9l<uQNbt@tv*lotd?9(abz+WYj$_ftZA{*E9O{l%;Xp%)E7Elq4>pn}qU3
zK=P|+G|?^aA9-BHAPJikzh$iV1kp|U)1=vc@W`l4T!I=2Wto8FlxOrW6G?Yg$`{XK
zqK~NP>>E5DFpG1)+5YGIe~(L`C!y35kgWBJ)_=fj-(hhcH{1U^GD;km&`Co1z%NPS
z8O?u-Cj-Hg`5dZ<%926a6o?@+q0JhBFXuIpZ6V3TM#+_t@Ny~b^r2L=X5W(q`gR?j
z;~ZI8U|YdiS#z4dsl1`?v8dKv^IYC<8%|Ius6MzL)r>4yt5}|Nn#@}>Ro--(&0EJ*
z9(9`7QC(YJb(+~xYhRvqs!0ZX6OuIzTNc4hKY!=d($HEIsm!eWL9-~^0w>m@cF|wf
zQCYjKbwx)KsE)Q6qzUCv(b}ikt>NIPZgeLIvjD^KMYYPpmOdCZ5;pS6AXK_OS{~>k
zX-OHCusCXI&gG296Qf^ixlpL*>mgv%o!H;Wl`zd%<dS%4fQ<d{7{0!V{<PtvMi(pF
zhrqVZ{O5qvIp}T3OJ)g`Z6JnSQkQZ-+Dg1*tEb59!^r?~g?t%-LzS>}hpl;sD-dP8
z9(4o(&l-l1Jk+OmoNVJ+g>u8C_$p8Zj;N?hg(Uu;DVgY`s>_1!U5aiMsN&8xXtkWP
zV)E3gMdG&~!QSNtD~H--F@aTQubt-20{^a000p1yL`}gNu@)}Ll3-G7rc}xJ0@|rm
zi++u=P(_Muif#>vzb3BzUdI+^QBZ2qX=_ydrsBR1OUKG~N3(Kcaj>SFELflgRCe5-
zP5i*^GVKGL9X@I=3YcQ!8ykiQqWlSfd_f89{~3IE7xpvyqw|>hCy`)@jv$6)zcn6o
zUcWY;lnVe#K3L`_ercb2x+KoFOZpc!6P$@q9tiA8kk`-8lOk9MHbZdT7_TpM)Q+Au
zl-v&AWA3kEGzcX_7&Zj6I#|P;2h^vl4S!|={VQ0(TnZ4%Sdaa;-+lQ@Z@k76wtK(*
zGPI8g={Yr_%eSxnCfra`J9w{>I7cwq!D@~KQ++zzQujwt`oVIJ__ci+ZIYm4xb<K?
zN5acKgEshwoiF^sFw((N?9n0BlxP36%mtn*tC<N$LmLx*F&HH&n5(VGbEege<205;
zr_`9+tw{b&xiI%x5$Pe{GnZPCaZfea&vLH(lvo*uzRu^TLK~;NF6aN@pEPoS=%t#Q
zSayK#rJ0*_bO7t6qH~?tRsG9L!Z2lN-_1*GGr4vDx0lpr%EP|jldxbiBCCoZ1?GOw
z@4Dmy`2#iGCyn^ZqSHz(b%8~34vo4hoXR3^HB@EjMR?nIf(l?ogl)mIu3r&O6?oyn
zHfy!kz1(O)@vdsM#Jx(u_A`<3z6$y@pSSpH73FC;Zz+IH7QbZUW6k!Q)t%5Mx*I5H
z*6Y;AZB4}bl~5|W2*|R>wb$k}z-^Pwsz_)R-2~*>>vHOEv({&2BGihm0&?tiI1RSh
zth3q^dPH{u1@?Nb`aG?v@FIYeY{sFd*ByZX6|IF7SBtmA+QO<++m4Rf0wWNI8>4~V
z(yZGZeMc@pV`&N1Rcu3=3siUVqocyUH0x~14S7R#MAcfMfHCC0BM2ZiauVB-bXnbb
z%;l!Fp)hiG*%5YGu{Gy#W{Bl|CfHGTS+=zZK6!uk6A(sQAUziGt|uuBJ4L(u`Ihjm
z?OzynrgVoOpc*$<bcgqrK~gnxa^)SETTXp4<{g<^S$1;r>>?mGJ%4kT<{j`Z%e-K8
zCh<(zt?gfx*7otD;a`__LV6qL-ELUmKSzH@*R5MOYj?-nZTLxhaQCBI=cBsyxW+qc
zv)1F-=vMKqYV$L<UckF<v+Usv`Z-^)810Pmxm>W+e}3c+(N8sRuIvuqPcv`+=nmFT
z#b_?=4%<(|XnyI=z)xvw3GG%aCm-{UQs^-xx5DaV#XI(0bb68NR>4nh3*=GVTHRV>
z2;$RpsES)$I<@My>$bXU-E4)^hOR+7_c-5lC+)1;m<B8XrY{|O9nk89>fu*{F39hd
zU&MUU-xA&$-Xa9nNgR~wsaKQE$L_IwDg`EU?fvbi>#bIs&R6b<UXXoi1*YHU42}IQ
zn45e5;;w8hM_o8v<ejJ7)A_V?0}bu!nwHkGF7hsF?m2xry9eKgH*G#_!HphkKIeY-
z4)=&J$$mgVE2H|l<)w4Ndxe)&zqKB-trqag!v*}k7~d>l{|_b63C)P3{K!$Zqatrv
z_WTt7QMJny4_M!ki8n2Kd<y%h(q*#!a^3Nh^DBF8iukD3Wx5^A??}Rns91PI^WPWr
z_%GN$p)C7wXTJE8(HO&?|G`9n`T30q0kIe<7)}*_0BR?IJ)k)tT)0EnoXn3*jLeKo
zZM$9gP&ic>B>Y$SUN}~GsxP>2sIRzhzc0COp|82`w(lqaKOiGOCtxLj=k+!Nd~qXi
z4{(m8LjLD5RaRlUzvWStHr5N)h*`9&H~IwC9WtN!!OEypD5=rCZg{Ch1P&<0@&;Rm
zZ1ZVA<QvH5Ue9XTA;+TEO_5PWZin9|JW3L=sQAAgXl8}+W^og?Ary2K-?;8@FxQri
zlO`@`c9(U_q1Obp@rSg{WHTVqrbYo|pc4%4Xx)FgXc}EJ3L^{uQ@wv<H-}{X$pegS
zo^yh6FC5<%VRplFC|HCTz#iYf`KdX6FFEjb|6?HNK6PZ|vSgVKGVe_R#X5^b<}f!;
z>C>C#KuYzxA}HUsC@9;u(X*c$v11>T4y-|+@8EDYK-512Pg@7lw0vb8K((f(M?6gY
z#w3$cO%*6N=OjawCXG;NNH6cFrQh+vSdq*CtE!orA~IIlNQc*$S`AyuF-L$XXjjpK
z)$F#|A$up|68n14yKk1{v;5KfcJ>hy=H_?Y^gQ`dI*qbe<4U-p*egaddY7FtY6UWO
z&<x|BANfS=#5mV(vqXnqZ_$Pg4|$J+Lf(T;`|uw-U-E95KI*&%M>b<$ceSs#l)Q(p
zo1Yz*NMz{IJT_FaG)W%1+(w{v-PjAUN6Zaf<O{Gz{0Lg^Y>GKOMh?I|YVPuvTshyU
zQ!r19-3PKAsbi8r5sq^_35wp-K}oY?`}{}d?v30!IlmO$sl$@y#}4-0uerVQPJhD-
zZfu!;)3p{#TFUv5>W%8h3#@qG_TyS6yihm~YUN)<27edo+7h%T{Tugw8OMTJYLVxA
zlIML8Ec7WQ-RFw;<luE=-m@a}C<K|}2*KKEPKm*4CzoHqxg#SKa+zk#i~z+eHGd|o
zms>NhiZy$RxdxjSHXXwbOE|C)&_!OEVz6UwMPAIPx~E*tNZrT#Z5ym8^}q)myf}aJ
zr3o92Gj4kokNq?Idh44_L^-y4L*n1)tWG1ZVnT;vS;0=nJfZodLH?*BQ4v|6DDcj7
z|JdpE@OLEL{#V$IB^_Hz_)mwdRteslS}!f}%!kPrite~$418ZPU($%^9wlE&3C%6*
zE$xJHu1zkP{Tqd=N$e2c8^0(C+AWMNZ<X2B*;cWx)3v=d%FL(xm#>kBJ%c?(h2LvX
zF~se*D0K;MmsmQI9~FD^$0M=d`QN)tQ-peZ<F)tVuB9SS-|_sTmyelun8`WYr>BQ3
z{dxR(u9|uSdjpqu>(1*K{ytYdS4IAQCwM0??d;9%&0XGc*mfWTTyfoV;qbjpzBTM>
z`TO|$G%$z%eKYM{oQuSUuLf!IIgOqg4@a44qE&^p2?N{%+oS>a)J{>a=LQMZaxWYe
z7p3mZD+3w3L}_nOq_Dto4jFLkjJ@j@AtxKURZkwd+);F84ByRFO6nS$!V2FXu(-H0
zmBv5hxLVfb7|Rw+kS_H&pTq?TIAA42oIQpmoGNd=Ut#d1ofh)qnO5VCJFNT)#<48U
z*qku7WM|8m&d8W#u+(JB%Gj7NIb?UqXrJV_G-S)m*kUloVMopgnWSk7*HM3^bC1|C
zja#O_k9gbt!%g!wVsJN{n=UqDYskq-G#jgI$i_)B8~bR;%Sl9^J=vsu?KHNd>aybH
zG{CzYL%q=ZR_hu?bvWL)tbf~xzIfM12ptQ{Jm2`6OS|;^>vm6?W_VzpHlxhh&>R*6
zjaYwZq%tB=x#E|CFCoU(FflfM%h94V&4uv<t11XV_TSKUAe^^>%`<VLyqP%|6cY6a
z0k`Z@l;lODRIDOD0DOl$nRPAXY?DjC_>jC)%Y%$eg`o)U(JAow{<jSYv{BmUa6$I6
zuyFEtDPNjU5knSzl(1L!pb@bz{~ven93)xO?TxmLY1=lYZTGZo+tapf+qP{?+dXaD
zthRCcdCxiVJ>QMEh`9gVsM@<Sb62id_^q|8c4n^3__Hk}Sg%+WAY|CR0;FIVxdI~k
z;-kbe(}k(4q_*EFJM~;Me;x}6u70=*Mk6d?9;d_-r5VP=+W%yeRxLs^V%3ONjdC{8
z(PUzagYPSHmb1E^jW~waWI|{O3E0Lqg@;_vFqsPZaSm(67o_P~)g3pyW{Vd9bzbHi
zur$|w*%KGsH3Bqx<pD7&oMV7};CRV_O3d8@SupBB5MpTjCsqh<^%~`xXZ1!&KI>7{
zEDvn;)=@s;;su;il<fogA^z=CjcWCvo<5UT2PT0&J>gVQC?(HGCpxK0#uF=$m05#?
zpc;T3XxjbW<l%`Dpw1Ba08^)%U!|7T+^6{qo30^6z+;K&=Xu&9mr-_L))M$R%;}+}
zW*lp@p^TCG{NBNr>D;G2+a>dO!&0*PdFm*`X2{t=GP1eZo~5#i%XLmqN3VnR5H^~F
zGpe)u*1$gQj%T`J6Urg}v4n!$L8<b}{=PW`PjKX11B?AGv!NY9=4527`|M<7t~+#r
zv#nBH8eXR7ra=sdDoG_0;pOC;5aVT!nEZBZ1GxaiS|FYO#ggli+z(;ydK&rR4%5xx
zrKS)}Fg%N9MqH2CGkv~$itR7DD001C0&@Fp-x2RGUia&5eXLkN0ynVtQ4)R5SU)r#
z5T8Ayx&)p#G<DIf59}&(#9Sb&(cCwEoN9YweZ*?3hCU?CiG$$zChGcK=4yJyKdiSm
z@lBwFQop#1*nv0jNR{mX%mR2%-E?k4J|pE|>nB-JhPb_lWuLPoe-qs4CJSpGQFs-f
z33x#s@+S9rpbwu<-M=L#rL-zw4Xo*!Tl-abnR6MVH-6~tqUhORCk5lFmg<x|>O0af
zm%eKq>USBZ&t0^FfMp#j-6G7_BAjs<mwPGyW1T4SGH%Lc)POt4Iy7JvA-E0?#)La5
z<5C`TJ>RZH*kTo-y_ZH;2AiWHLQ_ooebo_@iWW(-dXoA!g1DfljIbj7d*jrUg1{4E
zkH2fhy_mFo=!GX0N<*|vdbG&f;b?}{E3_^@*h9lY8}}O8sIC%ko+QLW#-Y+BOjF&%
zH!<T=n`UIRQT12Tx^ihUBamq`!xLyTBNb`&Fp4!=elXNoQ;pX}p_yp36r>-IFBR}i
zuoVzbJQgrdaNL&FT+mw9<Jy)u7uOF)>9;RPA4_`ewCY%tJaX6EPhCT*(?5|+pndA6
zK!5&73I9~4C3q3mk*G{vpq+q#?U99y52Y1kd~ug3e-DZtTko`jY=A5xix!+?inw-G
z2&xOFlw48#%N5{6Beo(r_@f1V2BifFZNrs+FfzzCOfty6NjI)Y1+AaHfO<Ex0NX%K
zA<(YNE#0o5OoHPN9H_)=2ns#xCN}}%`vOxDRkY>YJfZeg=>x>+iQ~5EtyRBBZ|-$G
zKNRVySS5k4#;9;(1@y~0$8BS`0T5iE0wa=XpO-oue*(ceNaxOv%XfykjmMnh*<TTP
z^*PzYiUvp|lJn*j#Fh!c*3urWM07{UTKnPo?zEW^FW*{Z&|t%WRqvtiF<e!oHO8Xp
zF<lj+HC&bS8fa)%pKOTcFQkzek**=7%Qb4>$W1?l@kReuQxz@B$ErHtt(hyCMOL7R
z22=W*EzHw~@>RRVhU0wuG2by?<{xjiL{EG4rw!|?cFqk)y|#~CM5Ez>^c$8*H%(KX
z04%wYC~@Zf)ASp-$u=Mo%;)OgLoFa~C6W416xdg(URJl<5AFF-FNOfwq+bI@bab)t
zP)}>W>4-P?)wx|s+gnmE#xU_|qOMSwk%Z7;L^v)jz_f_R9)}(*kQha}sIkMu3i8Vs
z)->S9-dJQi7-%~udK~I-tA{HY7KpvVpwq%)M|)%&$hDNJ%9mKIIv(0t1FbKqt(u#M
zHLa=}m$Y!IqTL|Ls9FAs5Ln)8JypFefu83l>yv55S~Gh-mOesx7V(bb+r-xMH;%0B
zUywa2zHz^KdZvE_eiU|Z=o%2zA+CyF7(Lp*6?w1mS>?7etfgL7K01qY`%sM5<F4Ch
z`31_>CC-ciQ0-v2g0uQ%lavOj&P2F}vM`U`OWTzCsZNs|B)J(lOtKneIZL~g{<zPU
zpC}7Kf`(y6lrzBr_ISHuPnkeFW(XZ`YY^Q^&F04Qxn7zOm}9zJcrH(HS3_N#fPLPu
z+`rH#I~#uVJY`6FSNDX-Z>#kL$E~$4w4(NVyL!2rZ;E-M{W`x=G52;H<N|l!z=N$e
zg-8U@OL#uo?uxG2079{w_W7U~8UZz9J)P)pS8ErY=XHRZiJmvK$IXWo*WU4-PQJG*
zh>K2~weW*x98RKxa;=8atAU(X9XJZP55CiOIBb5Y*iTR1XT8(XZO>6dHvd&PLww=%
zk^4y_m!&QsEBt|sWrL~n8|-4ix#^jMuY2(u%bl&|YM}HS+%GEe8-9XV38PTUUqd9|
zZbg(=1_VzO4!#hMz9DwLC3~N6ho90@A6Ua5SkoWrA#XqFp3K0W%($**ex0>8#cf%2
z+w7rKi)-3?{9MYu<B|0rDc`ZDcF&>)Xpg}8>}^14CiAp_^v*c7;PSw_k!JEFu*Mqu
z&e5xCqqGiz+Ooz;mc*Z$u%(Nuuo_J*nQAq5uTEY3ejKDtgLl-YpA3Ad148eoj2gzI
zjUlUNN%`ZT4oqW+W*t&p2j4Ofvx2ofc(sDEJ?!(0kMKS4fy&=6X9pJ3&%}W~vM-Vq
zrDV_b&tn+K<2^5JKfQhSXFB>Jzzw9ff2AFK%>aBGwC`|r8{YRI#P{gK(1+WP*gdkY
zh~*pdt<SgW+urT=cf@W-Dx#pJcbmzDXFo5WRWpy$EYpIM9q#)<oIal?9V|d~#@7R0
z`UqfF9`6=>_2MRnm#UOYBX9euOK~HJyGe|JJaQh1K~a}ZLwqSU+8AXu!deBR!NYsQ
zOj8c_%CU+9bTNSk^~=aE*Mk>p!^~I?o}mOPd6U9?OsJY4;>lty(ckHX!v9^cL4b<_
zF**l*)Rz=?Q1SL})ByinwPH{UD%X4B-$RQ(pSh3IJXsZ&U;Ad4jxp{PM&mp)G2RG5
zIziH7Kd8yl%@FLz<u^GbgUTkUg(Vb+B~vGpSR{%&;1iTZL`125-e_AL0IhG`-<ufb
zym@`S@V?o5Ev+42bU$w8K74lZdGdQ!uWmD_FT8#<93Vzc0#dhQY<s%xV848A_vEf*
zfHqKQeLjPRJYb_5@OC*U^~CuNVuBhFQ$r>O8*+&^B(ynGz7}>f>{ILD${<|{xP2+T
zmfmuG&OX(0TfU}_5dprF+jzVEpC-(A_*qClG5QR%2`BN>W;lT4hi}cnA|J8RvY{Io
zxp5fH0DKW9x=<XD!)YXa0j=si%)rNeZisyPkEqX3Mn@ErpKzv3o(Klh0s(|zlgD9D
zu%?Z~lL|_lNVH#KZM$ChUQj7UB#}ryljs6qOj==_{E+BsqD)$F@x#_wU=tfZIFBX?
z)T2~}t<gll8&!~0CRK$O>?W~f7ML=l7|r&^gl`$&7#%PhV+q9>-zc|mWA-e=2lmyF
zBqr_m!d7mOMZ-!a<Ke@PICy3QPNs27Gf!eyfhRTm>(%l6SK}v<3d20ZJQ=xjR-%li
z?KT2dtF6<R8;R7>!|SN(AoqYge{hR=T>RN6=IHZ|z?tx9pJ>x!>`d+iZDcSgf^j2d
zobEGX;e`CjBc1Gyq+9&rnzfrOXouh1o&J_B2xR;N2n&qXEf7@z0yyr!t9?&=|JDAl
zHUWrqTOtq%Fac!XI1wOHB%uHQ6koXbMq6iv2GjQ&ZwEB9+{nj&@Q?P_AloAG55cR)
zgM_%@sy4#{`}K!6H2zR=zYc>!`c4gDSJnUoqJKi@C%@xYBe;mZ5QZi(81`p`lYiIR
z9>2{_8;ZPOi-YTN#H-85Pm*B;(|_9%r=jApWbPvEOtvdYz+LCbp!yOV&Okg#&|DPj
zSqa<?j1Wo!#maEse(Oms!u?fYWk)ROP0I=x$x02(n%sxZKRBq$IWO+7=p+bExKX!f
zTQ2`pD!x$Rbw1Ceq$+Y{cdMEm)Kny~e*}{i$1jlYWP=7oyF`3WREcqQf8!WkJNO@A
zEtC`5zh?)3_H51auV`)hU*Y3Yo0u5}NkZ8xr4)FSA&81=sY%(As4nQpix^dY#i!Wk
z<4!(gk#@ybLyICM=Q2qAm3|lg=o%<e<w}!G=USniU!2n}uPHZbQQnLSuv9ouSbYIS
ze%l4+bPKh)d%73H5-{C`Hl5}i>Jx#fJQlzN+Cez!eqh;bqE7wm@n~N(FxF}^Jyl_*
zmEZ!d-mbxcJ7w9J`q%k7iktt_5-xVB+w<%g$?bG<OL2xTn8&-hYLGX}j1N1;>!4{r
ztdN!Y**$l8%Fg4tgx+*FOro4a&-d(+2=`CuT1%<Z)Sx-0o@79tKB3y`(x={fjS}`&
zWyr#wf3@Z%iUJlaH65R9sY?Z6yVjnQN_=%(u~5v$O?V=rT8>Uj*Flg;E;8~8#f-sv
zBmY3Ewep3f#cHYjOV7X@bQeGtz4OpUuEU5LA1#S|s3=83lMdg_auUjkX8V(JNYY@_
zui5HmMOF2qqu7Qj0DyfVy2bIkYZOnhWjAT6z_gME3jbVe1k(?Xn?ytzm%Dh62Q&IE
za<)w;A|NUj^FEwCZj3>zv1!gl5Cb`t45~1I>AgX79`RyIh*M0>YwfXIthqs^GX%rg
z*heryKUy<C1^fuS<c8eG$>W3>9)6~d(BXRdT;vfBdI{=#cKVWKP)M+w%|lVBl~-rf
zWF-^>@llfT1x*<WEY1=`xQHqcROPHn+0Zg`#NgNcIs!2<u}>2ypw73^giF$}MJ=b#
z>CZM>xY?Vq@8HkwpraPDvJ?gKjv!w}_0enLR&s#WmCd~lKo}4=v92bSA+c#mKmlm_
z0n@#Z?7VtjXP#QwKCiPZhY#F}s0aHOtBq79rygpiXO4&U_v+hPi>UUjJRMaTpekwo
zEIW!qv1&N+e5N6k0~OL5TlZ&cZW;!Ewk}So!ugWS%?=K{A>?MJ=--6K#f=azc+)GC
zWUH7KJ+uao!y-D}pET}2pHN&YfANl7nh3IQy?GJ7-u!d`tjfq1)ytH^C5Q3ZeOxbb
zh7Pj5k9r5cqe41!m021qL<MA2*o}sOy3UDQIX%a+#78~wgcb<%E>Vy6=J`sXBT<TI
zxwa-u`npu6SxumIss%Eu1C<yG!zx#KQ`&QJE&Vn}s8E0FZyXLUUzzy|Q>Dy@xy#md
z*!5trrGoU;!){S2iIGrHfB}z4N2rj>mshZ!&A?~`h$el;HUU~6p7}MtQZ19}00G)K
zR+hh|llsiSTW1WQEq?-HOvsEg7V$gJGRo?u6KU-U9Zbc`NkOwBd#D|dwoM*8NX*Vq
z1icIqN3!6m?fTVvg%#Z7C2o0%Hb)(e^JEF#laeYwMA#RYj9*5vlw1}XikZ-+b~`#Q
z$Ccee^arbpUhgNYJ&VN1WG#C)=U$Uy8rPQZPWsNbMpu*4fBuGkh41~9;%Ut4d{HAw
z1E`kozI>K=TRzu9=!r|iSw(hp##m>2RqA0ycir0KqEjU>TEySz`!E^VoyNd{eCz^D
z9VT!1?!ElZPuZ&RvbTGQ6oo#`Gfim%ckWQBfl}K%`HLp5<Pw87Upyqe6dU*L!YYNB
z7f}~!sW|Q>8(y%Jm8*TNIT3n<YHFUmF56D?I5~GVn+x+AdaCPU<6@WTl^95jbX>n<
zG8EesreFRF)wMJ?mYU})o5ufAEVeA546ZmUkF<0*_w1~+I(0WMw^CqrUUX(nRb1?L
zYNq15Qm$x8i9H68{tY-{H5c!i;vg$3BypsFHeOaG-Jc=54uwHO#zs*2;fOIA%fv8@
zn|9h;Gl}S|ERMGrjC|4~0J2<wSw$NsFyFiJM-A<ry)&;+dom<3ZQh2M@(OViYo@OO
zA6;-P^jzzb%fu#fNn{Hr`_rL&_xIg;Y{9u09@ocIxa)dt49{EEe%5ZsFSYV?b4ia3
zoMgM%(hGwRE8}YG>K^MN1-qAft&QRPEN{c*m7g7sL#hSELjkGhliAyqmSl&eT&GQ|
zVJf3eH#)sDJ2Z><45_Em$=pbp3#F8g8{#~(emt%CwwpqcG<iLIT7?F!)4|1(gzZWj
zjci#~dF-10YGm_GLD*(-7Ij<9LTvUe5rEqsK{kI@yb2tuX%FCZidYITQ|jO@8wP^k
zjA$K41}L7>3WHw0F7|dmWp3u03XG4-l&Q=X*KxAiPiH`4o=4E=yD?~4JCCol*MqBy
zFB)Ip=~my1r*4*qG3iXK4yA);G%muU0G}(4gLY$}pM$AXFFp>?J`++5BP*0$#)-O@
zFNfDez*bX^=Ns05>Q_<MFWeXV<a7JPKAL#;$}>=<{eZ`iY;5|^q%ZX%H9+HXvOZ|b
zA3S>%GkW~Wb)TP(n>XgAm621t9z`R3UkkFkS#ylvsXL(U8-G^mq?>B>r7t`;&WBF)
zcy1ElPyGf+_h1`tZA;4%RKLoHL&Soq=irWSEUoq}@pZm?ySE`oJ#=JBT`ly=0oG{P
zkWhfEXCj9~mN#5dn6!^bH&U;MTQ?g-_T#-0K98=Ds*@oH8Sr=3h8Z!fdTVum2|Q|f
z;d(}G3}P4v`k}#s7~Ki_4e0!pgSG5kA%I>&8>fPN6AT^{MzPe@4=2RG6B|cXK5<RH
z*v>+86kc@SOnR%yd?X@Db2F?t2kZk*WuH=**Cd}{q3=q?nTGHd1)Jq%U{thFOcJP*
zK_W9K8=rfHPkQH`-uv?^dP315Rb0RlQt#P$a85=wNUU6shK&qxIJekaC4X1*35yq_
zcbj&frxEF=(_4Dg@pz(CT+`5BFFuCEi>Pi<;R`*w?XiwkW;l4pN_X9%DZO=lm;^ju
zOJy1b(={OIQ2i`nU+rX9b5g@biQvnD;yUn@JB?2lHt~AOOTf~&aq7C%^sG5{YnB7H
zu%NU796;QUPR?{|u+|ne`s7>@$uul-fvxAV%1pHORWm<n6qQ{#GM~TLq^zij61b+)
zG9R0n9RToka5!NdhAvkgcb?@NXV)kiOdl@KVqVaY<HeG~tlYgP_7TuJNR9{FEBWLO
zRRVFCRbw0p!fmP*v-BRi;kZf2C;{w4L<Mda+aSDOL^{leT3M+AeeVqTheifpbmUM#
zrtk>SQ#s2=D=aK*WFg-Rg9mJexQ40(-RI1beGo#U;AdhnJn&t=Mm7`wU^t7_*{KNL
zq%ZEu0@NjpZ5Mv?QK}cH<5DwHF-EYzziGeQIAgYmmn04N?X(dRs^^)WjDrIUKJgnE
z9@(c&+CJFcGP=~fG(0xj^w(;?x}994hlR2iEU5=lGkZrmGGdfpi0wZvz#ec%MkWOE
z(pQPsto@3=aKIO@b!+MA>8Dbw(!(9ht%=S0b%w8ju*G8~eMsn1qRxZk`oqmy%jNoI
zBFV3liH>28LY?JtClsM<bS%i{_4>u#7$My=;z8p?)F6}O4V1H=nI82F?@<MyKJfFR
zeoUT`TVnhgs}kyApLM*#2Gx6!ae8`J{-d22x2Q)I;pt=li5{xEMPh9+zq;c^ef~U>
zn=`s~+a4;>C(vb4+{4xZLJ0Srhqw3EtoQ2_gP;`KIXac#X;4qA@oF*t?|t69M<u{3
z%#w=?_cEVHHL0{}2$^WlUQ~}*J>h##T8-+@5{apuk@)Omh?v|WE4{kY72-~H1Ds>I
z$1yFojPz^0s`B!t$ID6foXGHm7*1-x*Dhw=E{&qQvM8t4vA8q&5AH}SOV=wUBUPF*
z_0jg<xVr2vdlJ*}H6}#x^$|`fqP8s(?1xO6sCYAK;3pL5za9CdXh^foxlH(aI1VQG
zEae}pw=<@VS84d*#Rs40ABFB0Y1oQ%-RODUY8oc)tcyr)Vvts^oIFhIjWuKhk5uC!
zA)gXj#?9sB8fIa#5)U(BhFx4Go_bR1x)a`pZ%<LL5hM;aR!_HI0@$<@oC6DZM&@_B
zPJ5cocR#SA3quZ?1N_x~jr8QfEtw|?-ali*xXpR!Q}%r^!H3B_ix;Iu6x`t{z5tv?
z0a(D`us9vc%38L3=Cr}0FypkJGe6NGZfom74TcfGBJtF7X>f9(_fbf4+vT{2b{dx|
zGC2UG*|xM5$Q(_xR}0|iI!FN~n>B3o;UirBsp#9sxXU(E2IUtEov_bT!>uUog<X%I
zSy$v3%JvJl*KylNIXyg~mI%!)5s<|dhqD)GPt5F2?cqN{y;`~p)@W##+%$$wT^mhJ
zswX~Nn0>6$w+|L~#!NC{z_-&Ec^4nJWbWo-cocTVDt$bM@=yIKsy1BNMbWgVdk=oo
z5{j~voX2`&%WsZSSai#lQ*vFL?wK^FWCCJiGH}B@>$*KWyG}p@D~was!dp5;$zfR8
zc;vKdbZbr*6s#Dfx^)7VYb$Cv%HAAj;xU__*v=7tSxvN_&UU$QF5n&UHj21BExI~C
zKPOQw-~4_nIk@k}$Sgj+t?8FMuBD<r9XC6#*7R@`U;N|1w3?upi{v`y=R-C@QUg76
zpz~Ww3u~n{SId?%l%N;wvdUh~g>WI*HCjVtilydj&mw5|wN4|iMDTuEvoh5qxJ$Sp
z1tXZDYQx~QtC)`V)nY}ey@$LpE2dtx{sD1cQLKt?Hlg9en`HV0Etf#;mB(r<Vv}!;
z+wx&fu|25Bsld3@b~4f%KfY!WyAx2&-&IZ}ByoCaZkMT8p?L;_x1RFJOM|k$s^+>u
z#WtH~COun1Qk=E$i~cuOJ48!Hr*F0N4-bchBOTL2(Z+zhi=5JK=&^0XV@h=HP@f;4
zJ&-d5M`ThxAASTxb`o|wxOQ6h_#7x%vw;1xUa-SpEV;B-=AZ=ATUh?t65qB1=ED^0
z0`v}hzBVR6-Ws|(hWe1!r#QQJoloS0W>uq)l%%$Tj2bmsNc$9Qe-^K0=o&4Q<FGHo
z<VWV^{OpRC>3%U(y@t6p7q;ilx&-n5><Y)i_4(Bg!m<}NixjmmGY%Ky&p%fN3OV_^
z#{3?P9ee8#Ka`~csL5iqqVI-Z_BQMVIn=Z%yUH(O?*>H;$d0stuC5^*86QWo==^BO
zJ_|)AU5R@*-pY-Vpk5xmJ8Grz&5jCuW&$~f)44qh+U7L7bjB<p=4}jG1NKdE;PvdC
z=xmu8c-c<B{x(X8b#hD$dAhDmlm6^%eriX!Izg&O&U_!^_<qECE4AYXOryaG2<|bz
z$#&dBI`x<d>8hz(h>c3gCcWf|7mJm0cU1vqQnJ__QS<Fe8ck+DW8W8_rj%z=hIVSW
zPcnWgwV9mu!hfpabe@xb7&|`PgkLjadAa=qGc<cOlE{O`DXFo##PgbkVjbR{V57Wi
zwvTtlo)w$Zys+OkIO}mL{i6R67zzSyC7tb5u7xV<qSCFD<x|iiJ<i|hAzVN}S5%a;
z)e<o308&PY#Rw;{#FsV2x&(Mrat#lLx?rJOWor`}dQ;Dg>=yURY&)K-dhXGqD5U^{
zbwqos#V~X3wZCD(wln4JTJ|FgI7F9PRF(J?bA58$kEIVs#(s;UzoYDS->YUyc*wU?
zI`Y=G&DR(R8LKjMp@Hqm!W{dZ<sh!2Tc2+c6!ch}Q&8}JVxEo_2{+atchtZHh?Qjr
zU(z(;_3UO<G&WYJppaa#tZ5ZnvOSNCNS~3Y<unTUNLBL^6Nj#RdU~*df8@#YJs;)5
zSuaU@U=EItESsA(62nyq$(7aRzM4}^i);N}4u}IgZ@N$8zMjQtNDI;{a@fiO1_`NM
zk4RxJzOGZs2$f73Xon396g~B<2U$a~jqv_yT}0TDO~u#C$;6%0-+cWCb}0ftCJTeu
zVOI@A{q_+G219P68i;(X%7q%N`yhYM#j=uc@OBVMc-Fi_p`GMhX5=p!g*=VzQpH#m
z$&|bKtiLG%a~$T(Vp@7PbhgPffrz0}_Jic9ly{wpnq`oEl6QeY@(i&)y$^`vBb0Qw
zyJm9?-0ZA^2HxgZr0qsC(}mDV%4Qaq-BFqPgB>ztwQt*v6lo+G^V`X5wA{&PJ>u_n
z4+<7eHsvGb#<6AoMp&i_lrF)Ebwcy%w((fH;_S_c8mGTxh9t1(`CcQ-NSV`m>LiKY
z_eri*=T^+e4*O(zYKAVMw@P(^b7YR(g)%p3T4k^pTEoe?ppQL9HV!;Jt?+t-Hhj4C
z&nzQ+zQsDwGcP#z<B|by;B6|l$qh+&YsiT(qpek?jSX4&Y(w-+9;+TS*wq<mnF>*x
z1ST#Y-ln%(1BDg)>=0h&TNB5Mq~rikjb6+R-Q@ylj(g^H$|8wDf2((cU>dzI^lc%C
zf}q{-jVG%?Ap=4X^+n>7jJGed@aI^9Cc0}03}v@`DF+{4eW<!@UoGLH>wcUO%`)`l
zg6INGoUb9084qk;2R;INUgvCTWRxxCw)^Oh6#_-_lG^9Z#3%TP)+~HZM$(KDqWhL(
zgV&QK|02Q0QWvZz5W``eEvH;s^Cqp&mU)YKPJ)^j{D!559<KGXKRQ-;DI&H5Cr7)u
z=tR{z(~i-c$15ku&5!O)Y4GbRKU8m`3o9I`qB=l-Y8P<Jp#li?j=CtFCGBjWF$fU%
z10qj=x;q!d7bzvY5SFu1e%ws({=hh8FJ`Pxp|;7oVE@_7xi4i;!CksX|Bh6A#9pn<
zWfOi__%L(GT;4%b{IYOIU)$kz7XP;#ti{Mx7!;G7VH#rStt2AwKxhh2Z4dXL+BD+L
zA~R;I$Z|&W4WBlu>cCfL71N1Bbv0Qo{pX`p(eV+af-M2h$nGa%K0#m_JdDY2V@D?k
z69b$76m1PH;bB<VSP2;k|5MZ=WM=31uKa&{v9Yle{&TQ1GymUG`hU9qf98Vuo8CVM
zE8{;`IDY;lWMyUhE`6VM2>&h5KaKy9hlhtj%);8q#DPJ~+Q7*~#Kg$f*n~ma<hPlV
zIU(~;7Ct_BnEz3STb53ojO8FBT<{ZLaLW9U{zxNrP#<G)oT#t)8Yo9g0yg$K<uq|?
z?FCkeygxi*w;Ydz?rSOEGu&-<0UqPPK_H3tz8Jc};40)SrQOz1W<{@#G;+eOf!<Gg
z@3aDQtvG@W?Vp!+9Gx6{fpNA6ab9V=*Unh#h+2~TZTh2mBFp{*;gH{ZL0F=LY1A75
z+5mSx#z!+Z?G#`-&W`X#g#)Zuk;eT|1uEKjO*7ireLJ5I56RdRZ#HEeau=$Sl`$%g
zT#a*6g*6vbDkVuK0Hp848mDKXmQ~23kX;0HL&{~hKER<m_N)IFC}978r8O27W=@X(
zRv{B169*GBBjf+B#*1udeN@rMFFh648Sjt{EM{ITW&sGOAQE8W^ejIlPGrVH5oAJ|
zykCrdZi3pPjQEu24Pl*o^_6bS(4z=>u$48Hl>O9^BNc=V0X0yscL%?|^d6+`kXdnk
zC!rS1%4~INJM@;7mzCqCA)$as3&P~}l-ayaib7dpfpuQc1a3WrBO|1K^LmZg5fVf}
z=FK4Du{K<E*vnM`-R8qx8{htIs0SxNRgQ-65T_RyZ@8Nnmnt!nUetfPbX!<}9iIrH
zi|jlz^1BBBO)ip5FgIFzAD!Q*b5+%1tGEfi%$|p!?%j0i`}DOHAfXkeNWWp{G33nn
z%oLq2HtVIz(E>F9t<MKYIGT<ja&5g5#2u;unUDJ_d9L0s*W55bHbBkPw%<0Kb!h$q
z<Hct{k2UMOh_LecP9eeq9|y_yJ&ao-ox}I!>TK%^Bk%O%SI=)rnTLb!&nrzJ2X5{!
zRNAmptE<o#45n?9@p}zND_2W|o%T|oHb6($Q3VZ=(hEu)d%E`-+D~#7faeF{E5Sy{
z$HT?_S$i{+T6%}!o3}cHo(6BX=|&iX!|bO7&_LZ9mUCkgoI1?GN3FW|r3!2FDYuw4
zm;=QU&KJZ*cbvm;7q8<{1oYBItus|sCJ_E>nH5CjcAJ^cZN7>)L!<w6kVd!VOrJV;
z(aZNVtUviGp*78a>N6Pk4yy*Wp*aN>L15TMgY4DsNOMw&pdv?g2>oR>=^utbi-HVy
z^{#QS3lqx}NEAcbV40I-7+d8ZfT#!*C<76Vmyu7Wj%JH$U7##(ffZ1}W3S@b>S(|f
zP+Y<hc+f%a>Do|bk?Ij#axB-C1n)R#tt6!OcRpJl=J<F2By!@v`u1RhM)f;Ehs@<R
zfvVr5*kClI@gcx2xDh&W<~#C1V`N9g;vyU$^kI1#H+k>aaeP;t1XZce3oi%4iHoy1
zdmK1$Wm$;ws+JAvfOFY7aS=iXcK`X>KOfWhlgXMwqbM0m#TmJ5s_F1M`t>9Qlc3nH
zYOE?_Mf~`LN9x6o_inG}fw<d2ca{63TdbcDxqNmh;VrQLO$jdq2*lu5fB<DpAae0c
zROSwMHVD^Ws;NB4ZU<C2eF4@0q(v}t<)3K^bQPL^0xUskD)1hluIetzvI!wr(Yf-6
z^YKI6!bD~KI2J?}gyX=V(_}+PrAq~vD})!P;{q~Z5Dq=<DyW6clrcCI<^{x@3w<DU
z`hqhMEXwc~Z1ZW_6gmYl7Or*#r{ou5b&_rKAf1FS%Pkp31*X6lENxH1*B~FPMgw%q
zyopen3GTl)8S^HY2mTAmit}1<B*1_8rJq+3n8$ynui|I`bmkrx%?0Klf2W^!OZKe)
z!;%k9jLuJcL1=*?4{BPNE`+FzEUpwcD1S#758|b4ARp+#OGPUG#{&qZ5FRkuIB=W%
zVmu?NGb)@g9{fP^j6qa|i5;IMXA<?B0d153Ug?>r9g!t?lrkSEf0R%+rZc}bNFft1
z{<u6%KJJ5}3Pqp#p|3pnzI^2_@j`f6Za?XDp=?N`g4GW4!ginAAxfL{B9(kN7aG0F
z;~$o?{`<oUUEsyCc?04UJ?KPvvocNC#N=*96ifNqfC>3?<!o@Jq*!|^el%wUY2j|{
zhh9budhu?El0scrwF2GJvj;}d2Z%D!Y{;L<yo`E_9%aJaNT&Ip=%zFTe`FXTA52v+
z`WDB3w}BrO&<9z`WBh3@YwdR)5B&?YT(||nu0M8JM$nJPi1;9z7TbaOTcjJ-8Dj&!
zS5yatJ^vHox2j7&Z^b7*dLDgnaG9^4sCp-yGsOmMYXN;I#35)K;&0&(l8u;G3Kc*o
zw}M)q;R56i{Q{)Ic4+9%?}bnU=(F5i*D_y!&!ndvr04Ei0yor@f|?-yJid~%{W`b@
z=r)W8i8jOsDwW(2{(_nS{(OiXp9Qf6*L-NzyqXY5`Q<+0aqeq4o~kZ*E0qtLcPv&Y
zUqKzn*t{AwXV5AUZ*+X6t$^bl`-R@}4x}7`O)$BFPrMu<Z~PoFZ(vm_?j2F6yiZW<
zWZwa)icgdbdA*RuaTho6rM%xEX7YN$&1JQJ^b}eJempp<hjyqe+SK7c=v~vE;aq3k
zl6Qr_LRg3P=6~X_6<qnrjQ8%)sceTFD{T8?$ZrQ@D0~&3d0rzfJnfP#AnxRjv#pe$
z(YuMR!VvVEE^J;?pNUso`7g_V!D`9#hb${R^|Kv1uaurqTr(`l?ZTVDz9Vjld`K^T
z)ZsliuH<(?K<>~j1XlO}+ZOsj1M)rzx*}hbEA$3D$1AVlyJBDIy!>AA&P<>2T?+66
zbjm_^g%?OF@PlKM)bg+%^Qf=&l=(`(ZXI7S%$~D8dr@-c`CrmGKN1P1PTzb?Ww+l7
zIY0JM(&nFVGv{k9SxbHIe>%R3pJjc{#va{oo@MoL8_I51PoH)!a(cTCW_|8{H|u;i
zOJ&cyX8Sw7aw}xax88AP&zzcVWcifT&d+RcejG2IddVI;zRJy<GUS3ga_vOH9#I>0
zA%OWuRDX4rbm#h<_b<ltFD&_qp;(dxHxNTnIMw~cHfqA#$pX0_l$lxEUJnBq#3Zo6
z`0s$m1)@}d|D-0A`Jv(oE7jdG1jDa<oz^XeV~MimA$nCiy{1w@wT56P>z}xyck08*
zFwI#;#k5BM-vQAH?8q22tbnkY-o{qS7|ym@6Hwc<`;p8>O8j1D#BHTkeq_$w#|a*^
zpV?%$6th7oFf}&o;{RU(wUnf;1l!)1ta(+hxwUTaBiVl_MD>wNr!1N+DVg|DEgq-7
z5yt92yi?ToGJgkD#{hXN%O;)wMD5UlQz+|(@XOjYFY$_Qx_Hj_p~6vHf>Xg##AfKP
zvmMU*-UiWi;AyJ9Cp88CwFC3czW)m2^GGZ6lY|mqUvD(t=O4-BhQ7{AHHw$BE`E+M
zVI%Ao{``Fh_kx!e|KjuHpnm65a{dTb?UIhRdY1uT)aPv!2*Ri=nr}fHl>dErSEEz%
z64eHhqqI-rqV^f|dz_#tk>X!!vd3b3w0>ZFt9(RHaRGvjq^rJ&9+t;>2vre#6!?R}
zwSpz1{XdfKAu@7$L)l~fi-BJkc%YiqHB$^=Ev<DIsGoZZ2eu>We~h7<71))~|BCVi
zFglR9sQw|~@Fpk5+b#nC1WKF_ja%XV7YkINhO3QV|It>kiIAHW+RM7VYMr{U4Q+lE
z-yTv>%lzI+ZCuiS>?ON5qjSfwlFb*C1pF^HPIxH4!Sc%XMx(_|hup7KIl3Lw=)Z_=
zX#{NokMvgA=#sp~`9Em6`KSNjbYQ_H$ba2IWyJ~KkOgbmTihW;rMc-8zxxM;F>|k6
z<l7|CC9#U}1YB3?6p#PM5KT|5jq5+&Cfx0(iN<^WWxF<*gq6>Vsh^ZFjlB_O_6M!8
z7y7Z`D|$m&IBzs+|C<WKY6AY%u_zY+6!>T5$RGXNhS@+8v@!3G3balUupef=&uWk|
zNWKr$pa0ysNqGN}0C@98Msw#MOZ+-L7GDSduU`RKF@%NM5Q$@R-(;ZqZL0Pl?CLn0
zlfAWt%4EKY&>_j0$Z_%Je7^1=X7Qm|0Bp@#{{tr?lcayq$Xj+}Pn$y5I-fgm(JZRu
z@3;I|w;zj(!?jHOF|J91f8+BFNYDR(SD&Lt0`?u4ycAKU_U2!f7|pl<jvQM0Mr4eO
zrW2xVbw^W!oI}tWX>h}1Sg*%Fh;A&n1Ug3L2>(cakL0w!${)+X{a<b$0IcQ&S-H4%
zgoRrEEyvu?uI96~aXvEE1#+^a=<sJ0&07omdo`%9Kt40#;cxGad;G^1`FM7s|MEYG
zsWV_Utx`6`rnq&Bc&shWZ@>5zGVM?%V@bJAwI^qu-CU{s3F$p8BK<#(m5;|l^Dj^a
zw;BtU%HG)IBrR?AGi!Yyp$h<1A1zMv*(~y{y#|$5Nqql0t14jr<63@)&*XG&zW>A^
zr{GqR|7DXrCkuqZA+n}#TzSe3b2I!C3ASS<faJu=qA8MOBHRW&W9RozYh&mB18!@I
z-)nEFx4XC^dL1nP;>9Eu&Oa!qh;1g%95CiKbQW6&<e;@`b`R5h6Veb9Cf@bLM;G%6
zLlxNqnx)j|vsd!*A70>0V*ka9zE>S)LSmv$wbWb7e1Y;m0twQ*>3R#>Fbiv#4U%ll
ztc?g<L(1lcJ8^xRHt;_rr1t~=W!toO2PzKS82V}{$2AKD^v}K7I@L!F&U`kF{3@C~
zB!Jna5#cjv>5Rx;jO|<i0K;)_V}R%7@PA=l5NdY^sNg7f5VzzPN|+r+T`BN*nqh3s
z2`Gs^YE-agng2vSdQE@^BO?CaXKg7|(dPF*hI6rWNwGdhtf#;9kH@c9*MrS>(y;PT
z{aU`WCm<Csnbcq6*10~LImhFS6H{Vgukq-P7Qrzb7Z@ckj=0_iA<0&+)?Vq{pvBP5
zJRQPwGPqvn`E#y*ygf{v&2u-Z&ERiEU2S%F%yqWSYd8FCZX_Y&q+%lCqi<1)+B9uW
zlMxHlEnWOoAOL+&>5{5E1}vu{aNiaRe?E45Wl38WLs<qhs;)e@Moy?!Zt7qqMWFdv
z5?#z-e9CPqVeLSF@OVLl(RtXHLfnc=iV<7WcaJCfyBBw*wnV(_v~!^eYn~p1EiMwJ
z6m<+(kqiS;@<`f(vWB{UfE61)_^KaUcn1HkIiulRSh+wzXG=UO9Fb)JIVz%MQQpr&
zC4XaA1$)9Vok1QbsMDdvM9@MpHB^k^Hto+GUXUWY1W!Yt4RGK}HCBJ22Nqx%1A)pM
zv7GT5h~J4gp4OIayyk)7D!TK=PX}i=Xu$bvGt&l7d+w1S(92g6??jQ1u~b#+m>QYH
zET|N+$r>&B2Kl&d`c9(8)sf}fu_JvgEZs#F`CxM1+%AR|=}~5du<vNV4M1{KjWh;o
zXdOM+u1qxL8RU@nS0GPGAr>mnOQh^z`jf0RB?rdaTqYQ#7!&Y^Xus768HO%v=^a>x
zg$B~BR4HB#ik4V;Jz3T+@?&h8)CZswIhhq4t7K)$k7!l6j?`z6kK#Q!5W@U;B*f&X
zH{4`6r7a!u3=|PEh<pO7K@xB9(y{+ysIsm=)H^Kk5f-m+I1DWNk@R(cH7=1Y?jp-{
zTw3W%u*t^2vsT$kKz3~jh(X{GuL*g{&5jUR_V~`=I|eLV)-;b{8a(}>yjZvV%9FnE
zNsa9O(+cj*i1gF{brvnZ3l2poqc8RJ0n!|;a&<uwg*|0)p>+Y*O80O$zDuloXdi@Y
zrruXo%d2#cDiDOUC;Kcz-rFL*)9HQ)&B0z+rOj4qLhDfIfpg@nzg(cSABHG6M2SUI
z-pXD+hTn9O8Lh-GE#`~mEH4Ll;ml6Cf=q-SdZi*Hc)`}LUA9g^o^w>Qd=_!4|2G7L
zyVfye>v*ff%mQKFKtX6qir&UweRGviVLGS&-CZbgo+P!lgMOf>A?}}eU*O3yoV3qM
zAbb}MLZ*KE49cj}EzHu;r5{i@*}R@fJ0i<}<@Zvguq&`D0wq60TCE+~s|si4k8vy`
zD>Wfx<u!b@B=Zhj_#YQhnW1c9V8Ek)H<&vW#?n%Ewi#x_^HZQFtFb5u!B)b&N9)wp
z7-mN$c4|s$P|+A;rJLj>?TE4|WLxGoOhd}oE=5@SXXK)i=B1o!jzdYq%*m%$7e^VU
zV^ZIho4~P+pr)va#Na=q+&_V-LmmpXF6M|Fx0nTA)xm<Ib+Eg0b1y^d1W(HZX7}Xn
z=;EQInoI$xM%|{RRZJGJFjWm!DC^{ApoOSY*5!|l%0}v72E^dj8Fw^6lNZM^P4(6N
z4KON^xL{+*S1l$0)T9RMq+Jk|aX^ral$c<41*V3W;RYZfLzt@if6H8|j3Gi^Fxd<_
z%9fskf=bA|XP<&f))&klnrIppteq*jdlaW?dPWH%#G?>~L<fXT1BFbzUxOn&(Sr7B
zc!lhxJydxk;g70hUT%bVbd7nqbR#8CBK66@vabDpKmx-l{|%Dz5}G`t&#6Mg0Y?fG
z00&9@Q*Mp?D{(dF`+qPDFA&pCQSSUlK6Zz1^H5GR@t0Qz&bNqRf_a@c4@4h7j^=f$
zW)$uitNQAY22|l78O=}Vy)FlrA@4!!=AMX9gz-b5zeQOI+XaH4^Cnmn+cvsOtf0*f
z^`Co^x4Gwc)A$l>o^xe-p!dem@oc2>)L-Ia-JOD?$U5B>5t8IULdiudvSsn!#uHD1
z+_u0~zzJf(1n~}TD!J#I&Wl?~;$FlmBB)O3o`$^joWK0h*N0xTelxi)ILrtmj25B^
zhb?{Cw}4PKVuuj|$q2@{Ln+*B5O-<{S8K2W+|BgN7}cLwxSny%BG}P9rnEB|TUtBs
zD+KcRjQ-ky-7BW2xUY7~zwnF}rNp!)t!F9eEYyQ^4!pj?*kX_s{($`Pc1F)6OSUb`
zZn({<@RRAl;J>ZM;zS~2m2naEC(zn&ic4UE{8=QLz(3S!C-dmu9mXDLT$(3vdH{7C
z#yvRnl}8FNM2j+$EyXxA=y-RTX1Le>C??Hxqs;`VB+ZjpY7h~Aya)Z=jHMAnl|<P6
zRjbp%)hMnjd*DUWjqQ1Ac~M+hpIQSEuREoC-975nSxZF%>S^6Rv(7u9(VMftFLgE&
z_Zs@k8@@bO{LGg0p}pu+Q4v7)fS{NwdFEZ@13LDnri`M8wYmY+JG{u}XSr|0!g9O^
zV!0f}gU+vR=>~7XGfV6(wdjaNb7#KJQ${{psA)dcvwnQ%-89$TtU4rT8VJ#;m%qPc
zVX-KR8#rv7#dImRt>l-Y-VN@-t)D@9;2HX4;fQ+#8Fm;wfDvy&@cSY%Lcf77gNdLo
z&uIRa3m#9essfGmzYud|#Nj7c*x?8S8jNLtp;jJa5|WKS=5P=0RlGYPs?#&g7CL2x
zL;k%A%ZeH+y`pH&IxKeNP%uiz3=#Qm&30<r+X}+FCud6nL)grV<O*gaGP4J80heGR
zlLJ5f&T$L*3(bBZ-hYSa6!TahI|G1-fEW4ZAIRr#ubOCt%ht%|?!mE)0&CU0^wMn<
zdxs((XXub?6nIC+Ntr-?+*^MT-aLI7qm^B24B@PPTSf7UQInyv<Q$iCx|FT5gxzrH
zmi(9pz1n$e2lK3JBVTnPh?>57AZz9fcfWmw6E0TmAnoLSahGYC>kc=S%TZYMLnE_!
zdg~;BJ!w+Ftdn~ou|sJQG-e7RfvN|8t#}RQ^4BGhO-f7XQn9lox0O;;eFtW?jM;3u
z6GXOb>1;Sx1*nd-{OaLdL$-C*@Gx=osIu}r+jx?V$7+$d6Y6xR07ZUMN(x%sTwjTV
zJeLB3m(mdeGH8t|sVgw_V3Lr9yCC6+7WM+Q5y<M+oS(}A8CrDE#_7l$kWNtU+##A^
zL0thjm9<_S0#rRi8%glcCi=$ild$1?R@Y!x8quRanJ7PziowIDfMKQ6ao^lrLqy+x
z_J==hiaghw6lkvzPN?&NltW#cIo?Zv=x2m;|BqXf8%$MtII7z>sX*f_OGxGykW0Hk
zul%*s>%jC?|0*f?RXW7gVk6I{^M2x{`Ew$jW~t?K74CS(&fyOFA<<}^jxsiA=7n-L
zXK06_sUxQ*2{|>Jy>o*`%SO(goK>)pGI9jSj3O>&lms}0#X-RYl|*~~G~*DN-@K*{
zl^*2&@A~^tt7)#yD`~Fz(N7}fRM#G3XMa8=_E$?P$91#=^QrCDv#7%TU&9+VIK~LJ
zqTMUVWi$Hs&NBSAb(B!k4(JnbAY;G6&u*$b>N?AihA~prSYn<QQCZ<1;ptt}FkaLU
z@fI067u$dHy43g-`t0c!uKanecZ8q%3_r70<xR?_$^7SSRc2e}K`%e@ToxgQhx%+E
zT{~P^KI^yNOg(hS5Z1bgpa2QbSKI4KdT7(sfEL@hO|yHm9`38~IC`2iQ*o24r-ohE
z2(3guhIc8hqM#X2VWVy7gze+Da@n;0MVq6E`l=SHKU=k0H6VOe{l-~WgULQ7>f8t<
z35sP>?9Zb;#b(*8t~o?c#aLE6X-+%b>|IuefsSAp9AGyYmwed3+|6hGy#M6u!a;BB
zD#*K8m+`DPMq73V>%TDP5TPXK?grb4zpQw%pFbUop)TH4RC9=F(AupFb?C3-D3!*_
zodxqWRAq;287_0C*>Qkb)z~%SVU3Reqv#<nh)BZ&H9-WC6ok@~>B7kVPT7Q@(v0Z?
zm7+tR1E?yfbX3_5hOPhZfUrd~7{{NrFGWD~TD%Sa%0KwI-i0Y;1IFl@%;1HWR6+wZ
z{hH9Hju%Sn?;~*<<T+6eDRu7mYD2=V@M?5FtT{pgu|IHaf!cYKAvbkOUC<4?AZ^H&
z65WdV8|AbLpK!kZ&ixbP3XDR`(*4WmhmNN<hrB1@bUuhr{jv=rUvSUYv=8Pl6m2jm
zu*t*<^hyZ%ko0ZX-aib$tcfzqFk*E<Jtc_Sbtwl6d8)vYFlNVrlNrHN20Z3+;fm=k
zx|oGlL2le)+T9jB^P!jaXrN~PbPkf@DSuYs29cfs58QYx^_?3h{`I;aKH|Axg7z%u
zh8xVlx@Lc-uqOF^>&hj}ETNGg*P*(KQ2$r_1283Ce>ifW#f7MM7o0kXTnt3J!r<Dt
z72?|$N}6t*5O0ioDQoukEx1x3)a)X#c8n42%@Yg%0~A-xsyzHR8a{gN4@4_2c7P)7
zbBx;#-!sa4cG``kiK4iVluKN!IZ?I%ps+L8b8uvky=l>HF}2^R4K&BkRswE}sBnzT
z7^Xdkmee)Ta}2#9`l9fqDN9!t5lSg2z2DIgxMzHj>xTRk{giF0=P2R`BlBzVy*i<I
z1EkQNXCrLu3;$|_MhjAF7tiZ5Xj6n(%<xaDw6-yAJA&TbDBb~4LjsT3(vC(;{N@45
z`>)pw+>&PrDOMJ641R0qgA%n}m^=Je@eVY-^%JvuPa?rzC|yFnL~7TtKmS$+_v<`+
zh&{_oqqj!DJUXT{DQgl{C*{QDq#{^KuWj#{y<3Un^JUoI2b!AeE_5}um%B&b5)JNp
zhUMb7?T(8ADsNcfw*kc*d#Zmj2ZOv*rI|;6H~>FRf-y%=k~-E?*pg3-=_zmLlFt}2
zbc-V7&6u9adtZsZ&4X|u>!r?NbD>}uNu6MK1mYt%1t`?P*b9X)Dwm;J25I!O_A3$*
zc9C~6T%h+L#q3BS!51h=qwp4};Zn=~FpGSO^c?D*b30Wz?fqVQ-f@I3BYy{8V;N#D
zZXs?((;V4s)x7Sk!oj8IXSP!t|0OQ%0?iIH)c@0t;*y4Z7yZZ?cLqhjfiYn3S24{P
zu?MJzaOW>a*1%JJ2O?Zl-j!vBS+si`E_F;nNeL|JLT@4Q`N4=-qa>Ohptm8jn>F1L
zl9|P{Q63KgqX2Sk68J&KPvH|fk6^W39RrHl2*iG05l{)jx6{IYN?h=naN!~tJ80ja
z(%|$12)7V)P)vmL-y`;3MDDo9!43(cJt(N!5PD&Slv>YFzXK*t*XsxX`=Zy7%o<&A
zS3x}o9xJf1J36yE9g2zh;Uu+x%MC#<MV5fj2nH%~es%m1j0nz@D6Z3X^y>j_G^jN9
zLkyyreXG8=FbHy!#}+Qx!LtM10Qs4Wvx{9W<Q1q50fGg2x<{Y2S%c9af-!t&uZk7x
zxTe|FxxCL4#}L(*AcD(@vD1bb`E$_0iw^YhF0<Xub$IW?o2(1Lisg_6DpdkKpb^3E
z+42svP2C;rN`*d<CYkIn%5h-u_=DSE^0l5i9L-q3(Rm(vYXV@W+uL7Rs8y0oQf^IP
zJKw~~;`|B2lRKNVzx3Gy`=@Xb{9nQzaFe*wjhc5HaaZ;%13++ioBn2?+6!lnL0TR3
zE1TXgoMhyD^jcSnC{=uwtU(~9BpQ8E9$4R5N%`uBJx^Rh+nonpNLHsPW~nlZoQRJ)
zOwT`|-K;prxmxzDRkCi#Z}k!AeI~A}^9pu{aN@=W^GLa1p@Qh_^|@H!f7geesYv|s
zf15AVeVJPp5Q3jl=PIODGMzw%w`T|j8DDXBK;FjqWTnoX$XoHMO9>h=_E4WdN^ly|
zALM1nNtH`Oxi9K7Zk>31r`hsiX!2G`=k9OWZh=IsA9>_FIE=2t`@mG|sU*`*`=M1s
zw+sNxcP8ostT2K;z4sqY0T)l#st&<<6hl)uF<$FlGbwbF%TmFM_m+!lB$K+K1o7L|
zvQhvF@MM6OMb6bn6gH>}@>MpegweZp?L0IHcs2d9{^Ig^IldZ94@YBj?k6a5SbNg*
z<f9Mi7|Y$P$WZ~i>)@>Rx`M6A)vWgDgo!NPlqk_-nm?$<$$X|;cXpbmke&))d~NRT
zgHOu?wo;=1&-a!WK}j~^E(MLD#WMPSDtsFH$L-o#av1ZT&U=Z(vw>m-0lLKGl59AB
z?Iw2ZZx0atpJoWHJXY3hT<9pduNSH`CAj1$6+_mlQi!h!RjO0!b?MU5Q>F{Xi3bBT
zi7WNA7wX9IH%ifx62lLqskA^Y;hxlH9^Ear{Z>embu`25BnE`_jg^%_lC={y%jQiV
zkZ_|{Qi!!<zK!pQoj=XOm*LyDUhe-+p{z914PcE#0Q{q`EQ+j~sz9@Eqi06F7BnVZ
zxKbdJXO`>0I_}zQair1>)til3p*;xy4*-ckcE41oH(Efarv{^jU8&K=uFM!z4y|QY
zMjKw_$1Q#E-5RwEg36IK>h))$>swSo&o^zdZQHiV9+NQ8>s8+#FQkNqu+OQLjYiE_
zjjDpGqN<Xrx}fr<&&^WQq*`cLZ&C|Wy@{zoRW0*?b+L{OGAB!U@QsrVG5xHnNG1V@
z#Rxhbp$2y0>}YC2J;(++4q~vc#`(;ko(R+vLCax*uxt4$^DA^cFE=kYH*_(Tp{t2l
zd#3p21GVa6X^KvqW4gukGIphDxM{dm=q{$4dsr%jOT;<mxt4`yosmkqf#w00k+f7O
z()x5G%mo(1GJ3DDRJ&BSQg~FWiJ?}DrME<5C5h@xW^->zhe@4jjCBn05lVGBy}@WS
zn=KX_d!MRe>SJikdb$$Kq)(G1>w=_@&tTFUWS?oikr>zGMOugvYiN)deR?Ya*;;EO
zwjdq%S6Ql(>LmfEjFqeHgZ+u#B%BU#LP*5BZGUo-Z9k@yk7f2$LSkNCu6p~mypnDE
z_b-%sc^BUB;=*2uEb7x6HvvZ61f=37;5jfUhj9$<0IKZB$;t8K=HG;M-LP&Qn3eAu
zEe2MqZUoHA_O-5omL9Hw<{&1s2U@bS)#Nihv9#yFP?!6!npp`mD@mo_-xsIN_=Ex3
z#ASEeami?3hM!-T^o~#Jk1G#pL-V**%@xx6!(ZM#Y}9?ikwYcotB3oGyAI1ZTiuVJ
zD@XB8G2`b^ldngO{qk5^YAcPs#CA!nXf&nbdjJWEsgzLA#dvfEHokSX@aX)?m%;{$
zE?G_{_%AupEUVLl9KJ8UX}K@HSreWY4u$T)@G!&tn(#<F=rgE8tukrw_{4|_0Oz+0
zZ8Ixbe#Cyfn=l_UFcUR27vPm=C8W~0Sn6`wF+HGvKTlVW+jT$fzGTD|OZ=ZVztX&j
zT=(q#l^686x%pOUy(Ok*?eq=JM~^-!klW{vzbVel=8FfFy;2(XLM*)FGkB~OVukJ%
zofyw%s`yTE{rZW6btU5Yx;fTIq<vPc38+0tH>}a<V?8*Je5PI{WjCvlT_&}$OQh_-
zg&gY5TPepAIet_;t%|RWua6hv%^ul64C<VUM>nY5WoYRxLrZrVj&+wo4BxQdhHhAP
zo2%VrsElKym3=a_blCn8w#v$xYX7kg+n=ZQlou*VUwbSS-DzEDfBXoGjgRk}HNY-b
zJwLhm@b=f657j<D^r;(mtd-UuX?(x=$isIM^C!Z{BTXBhnfN@3WuI&3EBi!;I0V-P
z*|evxhu*BuNHQm9bT?;Y;Ef!YoiZq6ct)kUGGm%~YDQJ>2J<c5m&f0i{FpheTTt1j
zPB~pM;Y(tvN0J`zwl--)w-=MPc6%f4{r|__w}3}gUH`7V&+Id=Gm}XONeBs<gd{w|
zGYOAi3IhV7@_q>jF@Yo`AtaeeLcsc<iiiqAE!z4hB2B4Mst}PPjfxhnSgntsRY5HZ
zMN||OQBX4XxA!>{MvMLT``^cR@BPSTowd$BYpuQZT5G>%l0#3&=mgeHCRD8KU|gKM
z3N^V6l75^askFqjq?|sz^9yVReJ-_)>oe0aJEz=HnX|;W+1O{C_MFa%9g^>5HedhD
z{KVY!_@t|P)%N1OQu=#Gc-MOG_lCXVe(xjR&%Bz~kdF3Z@lrFmS2o<)=hJ6+4e3>{
zF4r!vH$_X_#kcHCx-&i{#ZHnbS!L^(9<JOJ&E2cSS3<f<?5y-mnFmstB+NX>w8_}v
z{xCYPh?B|-APxrHvbjvvP^Mfrq~gq7{K|m0M}V?@+5YSS*^gw40wl=}ub0or-W7gh
zry>T*x`3w}7SMt{1$<)xD@c^Kjg%S_vyw9UXFjFBtn)N|gwA!Z?5S=@d-Nn}k8Vg~
zbTt5Vuk5a_7HNIp(5Pki_{rlrKCHf{WJYx)SK-;5BS&OT9nKkX{On<DIQmEUgSe3!
z5GL0KOnviYJqrR+FiY3FXJik`$AYcOpnMF`4Bg&iIM2&XNJvcR+&MlWv3s_r+r1dd
zGBLrV6_!3;^+@X__2ULDdV3+u8?);AWvSbeYF@cz)t1RVXJSTcO5*&zwR3Y9S5|Dw
zPQ7u)*x%nY@n;j`y`JREELTn6pUe&>1#i1NP;yznr6<0<=_f<kJ3Ui;Jty=ZSA6B9
zpEP0APY!>m9Y;ndvpME2A&~FAK8D4(S%4;EAFYusF2!w6O0nSt+S%@q?zbyf+70PS
zyHC2(u6*#){^z7)j{Ek`%at&2K-a(}PJ^YTT-@p6#A%(TB^GxoPP~`jtKH*i_BAIP
z4o??X6|dB)M59sXS?<|vJm%c#ddx5quyB6BHE+gMF|{$*$7nGONf=l*fTU5yFl!xc
zq<_!}Y=vWD+;q;y6qugrb*K!^=!*Vjx^vPP!{(I*WVBg8W(XT6qs@{f8@n{6bLPu-
zmS!Je=XS4Tvb!XcT@66{z^?gwt;rZ6Hq&EX$oh)+abohT?c<@7Imbh)??{K(g8sgF
zhw)FQK#<J~&Sr@+`V^ZVHyFWAv}eoEn?cqJx2Jyg@Z0TQhmPIyhl6R4bh&=csx8ec
zs%~dDCI0bcmdadP8UNWM4|H8r{pYqfp1DEJLSqrL_b^k(QYxDfXm)X%C(D!X8SN2+
z;s>Q%!Kb^X#!pLG$V)|;b3uG@%AU0S;`L7NbUD)LNc?AsA9p#D`j515T0&Y{PI6&F
zVe;k4b!qF;?EQGAr(eP_KFD)9ALAJte`(4Uu9=>No+J9vgfr}<*T*_*UbioXy2977
z<m!y!pOnWaD>f#}=X)iV`C<dH#j(p{ZD|3S6KMgNC9!dGLWot<njD6)y3C;1B=t-!
z7t-UgUg`1J$ZjlFPVS>*bYdIgGM}=)Y=6%lw%cTgCfPN6H<fZ~B-^{0si?xHCN8_0
zpX^<_<xh@U6y^md96x*R^SyaNY-4<93lFQsK|-t?xiVl(?kLg+$<d18Y6c8z1yj+&
z%8JNoL(BGF|9WH9{u_($>i_6j|JKGOzkTqhOCMOdVfDAaxu0pbP8rF)XU6il7hib#
zxwl{3D`R#!R*r5MXPptVX@SHvO6kmJX!FE8=L~n5wn)@E%iNC6a-LM(1`(JlsnisC
z?Gg8uI1_&=*)}k4SeJn*BjY9{k4%{oH@C~wl#;l`$t5XE^`)Ip@l#1YN?<Xb#Kg%7
za-G!@QexKmHu|{FXX~2cveRz9MP{s;q?w#TebVnfjE*}yVe}*hu){j27G94!8=z^A
zTp^@OJ(Azqqj&x`5A!6a;qK9_?0ks>BjrS##?lh<e3|w@X7Bt+r2J8lN>PzA2Uv=V
zQG$vWA~hsBQaKaO9>#VjC+AeKV*x*lIrOj^T-Xc+&7)pi<R~QP*idAM`6w@!V&m=U
zYHerf*=m~Aigx$;^zmcupRxFZuQM<E?n770O$%0^eVb1)hR(d@nuplT#NX^-X-s1V
z>)HNp`)Qy5k=+&S&XpHeY{o3I3J3IWFyh?2B+yki^Gu(q&v9xoo-f5I9b0_qQe7@@
zYO@j{oDo8k%f31T&d{1%JWls_O3%mk@x-HXJ@UbIVh3=XP!;K_BCrBAy3HnR`Vi+O
zHj$<Gb<J`$X^pP8wGVXrW}S7{v+Y@q0)41+glCdxwr#dP%RbwAt!<gO+xeXSn(Yn!
zuzt+`wSL;sInL!0nr7oTGdi6P+~CliW!K~Fc3rdCvV;rA3YW`?8yt_wCUm>Q;dYbD
zwu{9CoWiE|A{h>Om+n_HrCKoU$?K2|?kwV2jIAR!f+iu?cZJ&n1LZI$bzmg57bKU=
z1&ULfUNyCnnx?6X;W?CkNxA&|;pDmDDL;ldjx;!xGj9SG-{aOvUmm?;6AR=i!*;E2
zFK|+h7lhsCD0CER>S`S<J(oLKnsbH5ok=*?6O_(I!r*Wba5?*=7C0TLsf7sGyDg~&
zK>J(#3T;m}j~27lxgto`_Y!(|PfL0Mg1Dz8L85nCd<D9R)SaQwc6a1l!ptu;0&(xy
zm?J&`t;NR|stcq}wIoU5Pque8*>QX^_X?nbIeBun$zwRe*kf1iU|Wv0SFxwwZGYf;
zY)f0&w)Q1wOL^MW?N`cK<>vNETRLWyIO@jc2W~ffeSJUiUGB4u@Ne_;G=DFndundy
z+|*I2b^djJ$FRg<T`x<#tm|yYmB!q}xm~Lqi;PO&;>1N=_xRi5-${BWxvksb_`}`)
z;Sc*0y4!MmIh_aDhWW7T_s#JgaetiJ?sLa_F{z~JcBUt!;56;+lKF~@`CI{4v1_@@
z=2tVnKVWU${~2(r-F#AHD<2sg9jEV;JLK-qB?m`$wXJVpo$`2ITo%zDWE9)Twy_h;
zmc~Y~Nf?~iB?eMuW-*mnOl1~RTM(w!045I(YKfPu2~s&zTWA)i7W*z~mkdc_(T4@(
zMF`n;_T*vT*>hF$m~|ZgVhgNQ0bCU96r0z1uv`%maM+S7M2}c4YF)UwdDtBlt6r&U
zeD9}o*7l3tykzO`f7ejIy}eRAaqE;RtHZzgP5ZaEjURUQTdnz}y?=fEuP^*vp1W>B
zAfH1FV`*RDqW+zj&&Imj@@*H}rrFAE4L02w>u@@po=&k&4`~kOR?a7vv*$Vob7c5C
zu}(ZAHjAl`r+j&X^H0dH+5Pki_D44L@kQhVnn0`?lWIp!x*Cmg*DbxJHulO}*2f;^
z(3}`|$=;}gsQM`CEOs_0Ls&SKcM4+FcDJB_eEV+psssPL5W|twbQHEaa+9lfj=gEq
z@X8Ta77ZUY>Y}3fZno?Pg5!q$uE!-KibH4jOD|-Gzu>*a-IPdgK29#IE#0&8)dzti
zK`ifr9h~8DF^v*@&YTz*l23ET_%bMidE&AR7PdRaIL8#*>+H+z>+CkN`|TU;+w6Pn
zuh@0FOgHHRyP0l^_(B~htPhPWLakCVN!9L8&cJTz5xZ`EzVxA55AC~o6(zC3+smUp
z1J@vdj^j-3JAATG?JCX|%H)a7%k%A1r_Y?ML~~OS+kH@8>=1aUyLkfTzU1+R^Q-%;
zSn=p%k9ErF+3kV*eZ$K(@dc}yy}EtP>a)L?&?i~?=nCG~HdAY)1ey_uce`x~UET2s
zHjh8YxX9oJ^UIGU175c`pL~AjeuMVM`?XzsR=_YkMsgCPE{Va_YNajoohqdM6HX$Q
zCy)Dbj{D{vmx1dvNbPpi$~mO7I<Uy_T~W2QaK+VQ#&jQf+u*>BQwtX#pA-o4zAr89
zo}U^w@uK~OgHx)8Nc$$yukG8kYWbgqFdS*1!QOrFkn}<6+1WkxPB%R>bHVPPO(<G;
z`dR&=@r4&@)j$7bR^I`4uG?2#R#LPC`;y%(i#^ENaOOyQf;j(-82=c9<HUBs`upTo
z4Ss|+_F#J)`;=upOs0(J`d`b4u4nESCFf+2Pd_4a<Xk({zMG9bC*$}JWE}MMZby`i
z?>~_7(Pz7Tri}bUYO^e&r~X}GegAt!{*EnwMjO*n{(k-<0wSO%>nH7P=ZZ5eQd|sW
zg=>X7;XI}MLivBJth`VzlndqmNZI(mrR=^?E|mWZ%YOH#hT}rHP%e}U<wChoE|d%9
zLb*^blndoTxlk^Y3*|!j|Dl+F-c8BsPrVIA$5VW`g7QcUA5ed6GmJk;kr#+nsjH@>
zT<T8JoV@fAMc(dKL@!#jEs94_66<ZzI(gYBi?-AJ4ju;$fF_f6(yeT=MSJ)b_IAhH
z!3Nn@STrMHd)%Tq*=?sSTBH89k1X02#UscNT`XEByU4O=I}PmMaZr-&Hj8%B7}3X~
zJ#2;u%D2_oH0de*IYkT5KK%_v>#F{ringozFBI*Nw4E#3Y5CBkO&^-H=|ht?eQ46Q
zC?3;?CT;rAq)i{1wCO{WHhpN)rVl0Uin8CWY>QL05moP%v^_)7K53ghSJ9n7$Jqml
zj*sH+tom!x2~qW3l<ZVRcU5hfI#Z)~(xT{0#b2!G-ioeKbYDeZBWXvJ{f?-%jHr4e
z(wB$GPr2~OK*|R-ohqmdcmmZ@4WNORQJt!}7<VDi@>+uWO2yF+PexKTil3&UZXx6w
zs9xQb0hfV$39hB!90|G-vXZw_@st1>l)h5%ECvqIBGlDV`G3onA94=+UYpXQh13XI
zwlj=oD(mVkX+I5uhJiE~R8MHCqy>0di|5js2I_TQ{#=xEZA^g8|7z{@4r;Wrw+Y-e
zu*6T3pt)SNB%k(GHr66jE0z9<>PZD^q{a2r2X&KGe?#g?rSji2TpPh(YFQ`WFIXU7
zu}`xh*@%1T*JZ$sDjw3~6_)?Ym8}h`t`^r)RbQvtTc+Y8_4?I23M(zn1(r;ibzh=1
z)v4AOgR4P3lT!1Qjs`1=)mA@iI;=1yB6ftLc<YqsOJVT>)rRShCS|wu$oczV-b>yE
zXsuCsq*P_-4?Q6zsuk4}{Jnr>_RY8KKELjo|B1ez(_Y#U`;ZzRkyu7@_<TKxwEnMl
zT@)1~*%Pxb4XVA!=#cu&zLuh<N%g!|jktf82_@f)dYOt&t#vhfWzvnft5a9MvUW*F
zHk*1RXEiwfT@Lko$e%l4V7`BPMVWs>ZB1>%vbr+=#kHZj+E7VDWo=DA|H$fU|J2Hb
z6%F<Nsb%$Lp(SOd{YHi=ORD{q_5KonL#U**Y;j3wk-xV5$F)Q%hnbR7%N90PmxPAR
zEDP1+v42p%frI@$CsZy7)z;URH}pDJpPQ>BCQR>8J6&Ezhf10%YZm$^m6um8DD(IA
zPpzF_S>vBrxuBx9x}?63e{xAfsB%GNiGNy2V@)a4`3DvZ&7ED_=wDp2%->jF=5MHg
z{pGba4gQ8&e`#fXT{Rw*)Rg+`LY1gofG1_ZCH4Ngve4qnhK90I|NLbTEAv;Q%^Il!
z52TKes;vvvmNqVE@WYO#3K$aA0&r!`g6hUn_|6|OqPC`bnZIXcFMrwM`OqE3UGr~S
zSL~&-_o1?S*-Po+a}7xG4qX?i9`&q5D-C6fWjI2WXtlJqsiwNNr1bkvC^5TU7V@J<
zwP*_0#)i7a27hVU66t2}RFqZMecwa<Fjv;9F(_x@8f3eiMVB!TGOP;s$JC4*c|Hv}
zXO0><I!d*BwC&mx+EakXwcXm*C|xCL){5L8Qo75&*H+p0v?+bIZnlB8%Wap~eu8TO
zxJxi9<ajezg$lNfJ%BBnoCV~{5mK|a)M34VHr#{mqVxY~BzH=&#KK|ot-urbQyIL#
zmQ56RTkOWY-%RevPZ&QmBHTW5>iDSx23Y?^h1{(fKquHY(1sl(Urmg!;dhh9@8S1=
zzL(z%`aXUi=wI^<pzr4=K!47^0exD71zMbjJ(1Q)8w>gp?Q+oLwd+7%uU$`ETdti1
z{gw6|=(Aco=&=0DeH+>8u~%uZHGpokEdzbE?P}0Jwf!9Q9kyS9UT<3u`cB)Opzjj7
z#6+IRCru0zLqHD|7lAI+M-$V>=xA3TuTKCyQJ)5Sy8MGyeWrc|=vn$K(6jX`LC@72
zKsV}*pqJ=PpqJ`55!YAhH-lcK-vatpdowZnZ|%P&&A!?G80ejjk;EON9N1Mku63Yy
zj_V!wf&R7QQ_!C|PJ;f*iB_DmolT@Um%6cIb-Uai(%fEmPtd*GI3>9A-M<6<p!*Tf
z+uTose#X5Q^t0~2f_~Bc5^?w6+{ZwF<UWr2Pu*XFKIuLM`fK;spucg$^X}8`GoZgk
z@HB%Nf5t9uukjbq&l_KW{?a%}-1y4lBj$<qbRo@??3p89R%OMUQ@ZlJ>2uTDmY>nb
zsp!#k2mI!k<v@=da~wsWOB@SuEqB!6y2P;z*Q?<T>4%@;dV}Kz)ZFN}5%dbjO3*hu
zZUKF(<2KOi;4A5+FD##Mcq0dNA9sKBWq^Bt>hH&(KXHGe`tmHU&l%XC8h=5*WG@nM
zP4pz9FG-#x&|PG|tp2#@F4jt-Bvdk={0o+as%iMbP}w4ySWz}VL`Bsl4K*13E@CsL
zj`qv<PGc-^c*AW`IO{0ikQz<$yyJ<wXE-x>JMI}q9yE=gKF&`G76+Gqq-D_>p86<u
zQCX;lD%7<`T^rQ(YPmSka&=v$uItqGE_L0au76Y4_ZKf(yogSzYrDKMU0uEEnxL-T
zh&oPpKfH45g=7)T<QolPyO5_j*y}>?4RwNqwG>C4;Fr$mQ6eR&H=cB*6iSt^E5s^&
ze!272$&;JydpJh^uI!KCUf2=O#e7(eS??NJL91y!-AB!|m3B}o?ZvrnKOLmQ^a-7$
z@0g7lESdFWgV|^{o=s<SS%}@i?qQqQ7S_UcvuD@~>@{e15MwLRIwKZ005$@rR=`qV
zKbbiFusIeoR+xBjyNQRqV&Z~+CLU(y1-s;wiO0Qb;!B?~@f1JV<g4GNY#|+PXIEWA
zIueML95&luut8cwtTcpsJMadxzI2y~%leskVS?hbRraeKRXMY=%DQ{A@{=m6iZ}17
z-l+Phs(n$MdADf!qV<a&T-0jHS6^r1#Z@M*8LhY+wcToSYsb|V)i%`LTD!4!r>gbT
z-B-7*Zg1V2b)N(&7$58zEDTN#mIjvwSDV&`<ogvRE;6-+%1vA!Fmb~U6E}Wj@--D%
zxLjo$Yr36SOr7#jB^_is;+D$-tccaI8`z7S^L!rS*YI2UdcX$W%v<>L{38q<ujU6_
zrqyXnwCA<gutp@?X4yivRklsGts+-!)SeeF=zhIQuhTc{2Q{x<x5q;Q#XibD%U)zJ
zwQsQ>b_{cDbv);I#c{@&;>>jhoaN5-&Qq>@SBrbRyT-lReV2QqdyD(95oe4xW*X~_
zH$3EVc?Nh!dFnj(c$z&ep8cLvUWYf=+u*&!yUY86_f7BnF}9eDm_9L=A=@*<YiUUM
z02PK0u+PJ5*=fMH;kBFroZ$o91&G0Oram2VlBW18r~oi5yaRF-G#k(70OkUA;$EYe
za4N+Dq;@-Gc0|ckD4Dsa+li+(^?6fF`!^H|$ix$yvZnwr%(M$5TJb=i4-!&uDxjOv
zUqQKeHX8I;z&OBoXr2l@19&FzEZ{kin+xzj%V<l>XlU61EjyH!(SUKN9}jI)fu*#R
zly(}BS_`QIkXj3=1CTlZsU48o0VyeQ01^ivA-(V^W`szud}v4ZEESU7!at2_YqVwQ
zXqtxm>A<tVGaJC^Qt3&`Ql4C^THQ(G<xB(4SOD*+XY?3&HRa=(%KryQ@V?<<J{T|_
zFeTi|r-xfH9%902kV(T1sTpw|jW~}+oJaHI@PoVu%_4!?1E@WKS{eB#5&0)cL+!I2
zcQ%$A{(z^1-{P6!eSCHJ19Gu`;SX4Uz(7DAp85cZ06!oDkPXNIr<3&wzs~wXn+U&-
z?5cp)3TUn131|;Kg--cKU9?5<;9UdWHPAj5n#V%(SXjOV)>Od83fNcyJ!`lpyn)Ao
z?iAj^lYo<fy8@>I{Nb_a*?itBJeD~0zKYgf#SAaA7@7JX*`rHt$!T%j65*m4)IJUQ
zwXpFhJa80t9)+Dp!TB`2a1>rR%DVv40DeF=pcfz~eDq)RcC_9Rhu6Os2i?l4Gsvei
z(Ow{qm~22SHc+}XQdFFgb7{!AG_-L5Rvv(rX>0&sARtfWLF*5^c>vx#0L#-j<avB}
zB0Mn>UaeAIO$F|TdBKlo-NTdNowYm*bv>w+_XJ-r)b|cg#CjXC)Ca%AS}Xh4{zy!J
zXvOzqngIIzab2e3D)X-e-fw}%7HDjN{VnkNTi|Me$6LVF0*|*~PXN3Am*-U+G&Q09
z9njW<NNj<|CfLvf=>xES3#1Rgo(<4&02*XwZGna+*wzFMO|Y#A(c6N|$G9TTzhu+-
zGAZK+*(9Y7LFy2s4nayr@esHVf%_1+Uq!@ThQuLA972>{hSVYDyM2(_2dRCK+6Sq9
zXloxN_d#+WB=<pLA7+8bxR+zz{iAXbnQT)It?a|TB_^yRBXxQx+(cUdE#WrITsy<X
z>e_@|+fI0VIF05b!^ij->dVK4+xVrRF9(*h*o5$AJ`pq76wp_oZVn~#YT(7-sR7+Y
zeaXv*p+;&LrzDR-vKf-ELh=}&glAK5kGT*Uj`7)mGQeWk(ixJid^k7)N<%9jqqMX_
zODnWAK})M@e-i94wXA`LR$c_YazHg`X-h322zpug7<QfK={H(xLQ73(r46mD;bURn
zIN-~nRcf3AdM=;{JXZncgDwM<0~P`*P*(-K7`PF531BH;8T9LX0(>!1^>jBcfj=v7
zzZkyZs=Y<9%4K<|&GZnSPk_It1Ef!{LOxX}uN)(X<%NhZ+u(&`%9m3>!+%(5qP=al
zy<5J^4tH0omJ`WoNgOlv1Efuh%%||xF~lyB+$wgF2*`FP08fHX%(mL#k2V!Aco_To
znD%Qivs{arr42Jn8@#)w!?OW!o%3!~UsX1=S=rF6w9Zj6EJnXKqu-l(6Y7>>wT$Vo
zM`a|qBAUj7o}e;ejkUJ!R{11*R}3$rM>uIjc0dn>H^a8gR^(-lwekQsO|5Ov*sSu(
zw51u5-wa#UAnKcWDWD8><;u2VUI{ECzeq)Y4Lq@#*8>^>O8`p&%ff3Y8{Rnq@0{rH
zj%h_RtT<+QsoC;StIG0;YD}1Zz7inwZytC|&ouF3JTFmJHS-0aOM%N!Uk<zwAak-3
zxC*ccbS-clAOxrfECtB?bXZ<(RV|N)#t9t}-VKedWLI`YMxJTcZYv|(kj3NF*q8Y{
zCz9E629YBNr#8$eQF+vAd3iTt*Jfpbj9;FWSH;#m(2VR;aY61!qCE*xX4bVLs)^Dz
zHO}R@Hmpc)wsN&OYK*M0bjz%2wxV|oC#w=QN0{?f5Iu=Ov|m*`SD~gDP@?i%@~d$w
z`^#%ozPG9wr2){SENCMiq%pz(a{f7|LC$EeDm$ge7Im~~Y6?PogEcp~B6AZowOMww
zSsL0PEp2F1oE$Q3YK%Iq7TcoiZH=1w+N7Q6>y=Ssy8#h$cgS7UVdXgs)eLUUnrI30
z6XFXUebro1B7Gk<XH;9dr9E8LN^fea1*uuiIF2YA&Hh9@Uu$_^a=dCi-TlL-s@FCZ
z^@^w&$`u*Y%J(ual;1JOnjW)y15O_}^T3%$TOdEjnhVcKBq|9rN=GqfOiAg3CM(12
z9UhCeK4P1*Bl;14o{M(Gmu=|551ztHS6O$9mG`PK0)lE(s@O<xN8)d-uC*QZL_8X?
zJn~ew#5?+H?^yB9%{;}Jd5YDF?^Gv+|LT;$X^{HtN&QBG{9AL9-wyXe(WpO)jq*?g
zxCUWQJOsr~1vCujn?e*9U5etS87KzLra3tC%|(f!$8d)7(O#4|mc#l|C)S_!rv#km
z^C*#h&OWCu>?`&)C9~7)G^Mg{*|*e<%a?wpalr+pb31oY1~<5evUv=Tp`JXECsHro
zg?FLeyem(k9G=G0k-3>XlLqi?o=pRJFW!rCc@EE^Jl>D@qkNvvVLKnjN74{JnvbOs
zd>kJ~qxfZfGF{B4VJ*Il&*ZacJfDO7pTtXfIbFdk5S2N+ir3MVyq?!%_ql{Gr3HK?
zUrFVB6<<XQ`D(tJDv0T&`WE$j9q&>;QQdujO~gV6fgc1s4ElZG9e~|{Jr+C%c+rB_
zh=LWsZvx&09EQw?z#jol08RnE0}lr{0A4^mAQ|8XWCQv@=2PHYz)<y!g-)vHEOZ8V
zIAAp3GQecOOk(v2dvGpbK2hUlz*ayDvBq7%PXhL`lDbWyVIf=nb#;3}Q_G7&rF9>L
z>J)qzS`u`GHiF(7oLg@wJip!ueiFI{u&VCCP@wK%z>ZL0;eb#e__zA$b$V!Q-R{uX
z4xTqdvrrd6UBJ}8a9yxDR21A2Dyn-fG*vx+5%^u;bM0OeMHkhb00sm(<h<ZL1^6zs
z5%(K|{?KM+m&9*|wxW$EasNsOJ{5XH!FQp93O)+G5BM;c9Xc570~i{5AMhdM-w#f%
z*A<Koor%zqzLyu(caspjy1uu9mGy(+x2eI^p?hQ>g4<x{CfK*Lo`S9Qh0+%h9|S+C
zp93hCeYboJe(0Cr51onlF9Pt_c<IB4-;}QX=k(RRhW;Op^4p%!EETusV8fdW@9L=k
z;qxdz!Y6eS5Z|IGU*8ZN`yb(15{yT`y`kFy>p`!V@c?~~s*gaj86O#^Xizb8m~n$m
zEf(Ho#cvmE-TTA%1#=N68L#1p(QqqP61LT^P~eBpCj(1Bef_QVx&(`7or36k8Q;qQ
zeULAkth`2yq1(*wnI=a4LDa`Z@?PNzh0owV3HWy4xJZ4l2KliZxgjB%Za7CHhcb~r
z0~D-=Ki>>h1@D0Geg#nX3-1bUKu(>D)hab!=E@k<kBWfGi3rHtc-x}?0r+Qh?nTBS
z#+`ijNqsr)mjG7-LcuRWp9a6FUm9$;uo;II`2PjfuXE$%Q@}}?_weU9_-&$muHe2<
zz8aG<54aWM=Mj@<Fb*tSx2OI(mH#Ry)i{Xc>@2h=afY<ftOt(F4beGk<)F+*#Hdc@
zoXRO{>^FQPbpTFajGqX7*wBvg{wCm*dEaP5KPI2MH}j)0LqSxoMEa$2!C}T--8<wx
z^hfS_eVhcubZ}!&^j*yf_16VgH};c#Y0QUx7^jWHfZvt<lJiF>LjjMPzhFC77q|N6
zl+<^o4)txRQ+=oDRo`a%)c2V2>Kn|i>buKy^{r)h^^IY+`o6CpwEUTW#<8AgX<8a_
zEnQ0|P3xicBAb?@<&a(LtM!HU{#t)>Y6G=_<kIrAJaTJ;w83O(Khtg^uXeL`8+Fpw
zXlp4+`?>aW>Z<)hyOUD1UuwUkH0^HfZt`pQYWGsQ_G|5a%FrIrenVN>Z?)f1PwgS?
zA<EHyul=6-Xn)ZDKz+4Gv`46)wq4s!{k2E6M`?ienD!VA)c&abk#e=iwZ|z>`;+!3
z%GaLIo}fY6Q`%EBSbJJ~nuci4YR}S8ZJ+ik6=-eR>oiLHyY?21(GF+_Xq@(rc8D(3
zKF~g(iP{nE2u;$CYDa0Zc1$}?Q?$>tFKMReDf-ZrqEL*cVlh^XrAjeQTt-!5yckc5
z#Y8cYYQ$tQnQFyUF_r4XbTOTRVy2i$Au&tLqIxk$%%KJ`SInhGQ6!3Ji6|Du)FkGM
z`LtA&ic(r8%0&fTEviH{T`Out9W56j(LgtfC1NS96jzID=oWFExQ<qf<zhLl5jTh%
z=ytI}te~}GrC3Sp#453helBhmx6&PAwOCER5VwolX}wq{*3q5f4si$FCDx1e^h<G<
zxQl)zQ0Q)PkGO~K5%-Dv=w7iwY@qwZMzN88EjEcwv_UkBX1ZT&7Mp3Kcu+h@4~Q*d
z3vCix#a8-_cvw74&0?F_M!yv;qJ=h#9byOlPV5waq6ftj;wgGqJT3lA+r_iuFSJAK
z6Mv;YikHO8v|GF?UZp2Qn`oma#p~jAdP@9V{GIlQx5QiYv^XFR&@<v4@eci2yeHnH
zz2cBKM9+#3#0T`8I3kYFU&K*yl%5yI#4*|@J{BL-3*xvqPJb1jiO=Xo@wxb%UJ_r5
zFX?aMEAbV*EWQ?B(<|b%I8CpLZ^gItnm8-M)TT3Cqc?P++vzRcsk`Z*?$JH;o*tve
z&_DE8J(dpXo%BxhzTR2yOdsfpdLkXxyXal%h@Prv(1&`ao=KnT*?Km8ruWo)(h0q{
z-kUzx`{;e>3%#G-kG|9g=mY4ao~sY0uk@k%P&%Uz(=VcL_2K%(bXK3M&t;lkq!%%p
zUaS{0q0iUnGab9eh0Ly3>Xpo`FVd@-p@;N(=5;&WPUdrOckg1c2DZPfi(xZtEZNWv
zopqJJP0UgZm*HZm27XvK!)th1n&C5i%x}aQaV*`4H{w}_kzgdS?naW4#4?R!Bbj9x
zDMkv*Ho6&p*2Bm!GFfjU+sI~pjh;qN*3al|^k)5yK1Lrl!02c6V*`x=Mj^{Jh8qDk
z!Wd;tW}}R$##A=Nm~KpGQ;nI%Og7D!Wz1sJjXA~~Hp7_vKgjzYps2Pp&(l|I{<VY<
z26>1CTON<_43ALQgGhK3ePiTyHz5x6zh`*J;~^o&IBx%^>Au~xuFJZvOS!CNu@-B&
z#Ig*rT*e|S!xH1LE=9S97?)*?>o^QcIgDiyhP4>h?C+euw`uy}GfC~Hwrc9s?|k1q
z_x$<3?|kPw_g-#uRfiQuRimm=VNx}zniOVLv#MEPQMIUA6joKMs!j2cs$JEtXjC0j
z9aDU)I<7jdXd()(jPtEuKYxnr(NCL+;wY}8KgBiir^mrx0kr`hK$LJAI4hxtkk$gw
zi@;^zD%#-M`_t>dP2e^#3fuwi0r!E2z%1|-m<JZoW)O4<c*V{sjt;T&ilex0|MWG+
zhy#*<R3HQ8xQ=hl2GoSKw*tq2lSI+(0`&nQvV)Y71EdajM)d6+7YUNHq?h#LUUi6!
zk$YsCJSOwxC0SL(DN+=f3bmqCae%&0Y`y)F+B*7Cf%VQuh}_n0>l16Abq<_peQNEo
z&VzGUC#^y218~QzQ&yLC8eFsWwzbWAADr1bY;Ckof~&P&vDRC!f#a=Lt%4PIE7o%B
zIqL!I1#rdI^VSM$FSvZ`W$PYm54dcr*Q&Oj1D9rPwPsq6flENYDc0lQGOUNJaaJ=p
zm9^foYPEvntd*9RR-SN{SJo=ay!8OM7uG_{V{0+EIqM$Fw6z4>9h|*q%?3AYt+0$)
zb4Z-!iB&+2I@B;)X3$3*asj1vC^cH1TMwglx8;fDyaoEV-LyQloNl}Qe{E0P%HB63
z>`kNaJ3Wc)eVXm;ZJ2k60<#jtov{k@wTF;q_)#<bs2P6L3_ogyA2q{|nx9I5FEuYp
zfG>S=1{6{@!+$=3|9k@f`2_y+Nd^Gl`2@c634HgHA^`sXNjXpj902sdA;26}-Uze+
z?Z64Z0eFERAxCjfc@+1PM{z%S6!(!w=g1yQfu#s21<H@6SPoe9mJCa_MQxe1v{<Sv
z?Uq9pv*m=vVQI8@kq0ejkj_~yST0%mQ8Hk;fpp6<41UaV_o#5R?x>M+^er;j9~yGY
z+}{(9`(K!gI5rnbHW%@1E>xI{eVCE`ks0|SX5`x>Q~4*DmuxmKJC#<Ym3$fVa)ab1
z4`O!SP5y`E|4rUY{tf1+@DDhWf>aS3o9iNy@cHwEc#&YXnLTFFeA;~0++)7jO5aXN
z;C{{h8q5l}LKIuex0Vx5c}RJPY=Q4tNSyK`c;MFLA0_{YD3f0&zb5g&R~tpzi!agl
z{Z)!Ff|%0=S=&;8bRdgxB7Cha7sv<Tvu!2PaT!nv@Y1m$p-!q}1P(`_XtNP65Jxz~
zMJ|xUC~0d3p9+2=3L_}X0J6iNWpGh0pbocZD3DMj)h`90X`l)?5UIZ|6JmecaR70@
z&BOYjGP5?)xTFpHK-+0nj|=Ffva`Pjuz$2&i~{zMwyVq@T(d~Hk55NH+OCJon(5d#
znQTlxc6|Gd2%jINl>3#j+d1y<w;i;5`~${b_ptwlt=696zh$d;kNJmfweGt(YemVJ
ztqJ*ETdRB0KWRH=bNL^zl409P_XGbUo6G&k|Jc^#evCS_{xh^4^Do$Z;9s!bmTmPI
zWx;k3{3>cbL(O{kf=G<lT$e?qEo4s-Rjee<*6n`bAF!QwFN>MBUiSm!efBowSL_}!
z$2j3$74vM@952K|Tc3xB#kN7z$*~PNTSbj+1bKyRoRv(t7R74Ylt(EZv`u?dD49W?
zX`Jw+p;ezJQ>?W;@#Kga<D4f?thdd13Pmf*(@^udr&w&Vy+qqq@M++mdo<!P<BX?5
zJZW3;REq@pLC6ogi7wlkr`A7g=REcPyY>W+)&C596LjqL3{owk@<GoK<B{9zSP9jx
z59K4!>od9C=ouHS_QReD=-cX<60abi_D>pDJTqdBz1i~w{k$Q!jeF+AYb>WVk=w4Y
zd=PE(>?xjB(Py)IjzQWs&vS9mejMdPHkUW)$e2Cdb5bPsERPF$g2#tC9?yz6Wfwhb
z;<Vj{nj^+g2j^e-UXv$;I=RdO;a2&cS}`P*?D2Gq-S!gX=j~;l^T;bby<)GOXMINJ
zf~ODG6FgT~PWjWWCHUK*b6lLUjo7Y<PwZzq5=51~rz1sdvR}j~L(phMoMZlWi|I9v
zxws~#g(bArdS1foX!(Twa!0!Vv2mp%OMGrG>Bxl*yWL~rOXCdezhb}IkuR<pr#tq*
zde<p$ze#!Hw0kVT+15Hr;GcBvX6&~+$^r@YQI95&V!s2)Z`tp`&j;Ar8Mfc=s0^gr
zM>}|&eb^!RhcQ!g0nR?#QHQoqJB(treZJ!`G#WB?Qy#uLW1Fz|fEQrTyQn`RW`3`i
z^7h4!X2`SD(S|iO*l`?ly4Jx#He-h^kY!J2{438?AQzUe4dffwIy~Z>{k3~Iu*VVC
zAqGkuNtlHJ@TUW1j#S#GBctQ2m>1@4;~hQl1lqRJk==0-ZNt2p^2Xkd%l=!Af{v?4
z9yp3RuKO3LKLqj}rBowFIeNGOshYu8b=-{b2Rd#eGEiIPK_l?+J@5`a<sF9@Z-z#X
z9gXORceJ1%HZK9*X6+aa2#)rSvnW5oq;+)M5px{gj!{TE*l`bh(48%IJI-|6_fOi(
zIv)Bb9cMgKjGtvZR)^!9z30dnVnQfT=eW@EG+=aG>X;84cJ!nFX4{}~DA48@KqSeb
z5iF3xY96Pf1Z=kHj>Q0vb-jS7HSH-5@b&~q^2|}n=7N{_B{9=+!=4gocHDBhj;OH;
z;O)jW&tSk~J4yWkHh(2r9mA5Hmmv9An5SznY=5>+a~yZ+xW;i$8nmA5cr8vl9(d!R
z`y<rp$DA%5Ib%1v(#2}WWalwa>3H0EQfvk9V!V&>>*bHUUd)uuQ+(u%&FW2s=gfFB
z;5kj+8_;N`<GPsVoOX@}2b?qR<=~C)HPKzb`J^)xyybY|9S9E77#ke3=XwYH!_K+R
zZp<U&jVm<vI-hr*58idY?Cce*oh!~za1z$@1s~AJAAE%9stP`~^WdMMZJ#*qTysAQ
zE}&c!eBl^I<X=Yl75Ko7j?v(<i|f25_BkHG+WoEsluJD1*NA=JYw~gM>S@nVVBY!M
zmw<8md@1mJ-j|Mu(M;o*Bh{A$y~ZhT?Db}26=N(KKfP*r<70TvnBy7trd#$NZ-M`j
z<C(VzUcKNgh2>v(%f*mm*<0nm;aK$^fR$>!<$-+IO%I<O^d3S~If$Bhbk_)o#%`~f
zl{5w_QL`m*+DW|a@Z@LS6M?f%rPmSYp^++Z(W!DZ`zM`gUazQfG<t({hjJ7^@*bx$
zaM_vZJ%gG}9i!r!Gsi;$S25x_%#_P}!T-#e=e>lTuGia-HFeM%gjd|~4g{{-LUdQf
z9+W3GISUcHRL)}iJUlth))%-5PkxLjbj{WjxJ~)MC`Q-A8t2^$fjdr(_g3JZz1e#s
za39tf4m?CZn!qd~=WyUDA_gL%ZJf&K^!ai_S<sh+a$RSA`JsGQk8e+CkL#kZBvj(M
z>?;eExvu&uLzS-UHdnCRb<@X(c-L*85E5LYzPeDIal-pT%yZrG8AC?bJ<s#dVXW)R
zxI(~w4PDH>!{D#b_3lc++zq?#`<lC~Hmh?mAW;7ayu$pp1q3?3;P3d3hnj8UJ{#i0
zgwGRdb3OElkZKlE9d|v2R5sVVFI|oy#zEg{agFW>ArIPSi6KN(?6w%|Ec~hm{5bpy
zJoPJa9CdocP}qMQj~LJPN9S|wR#zNnd?{kL%zMUzcOApj_Z*GBi!>Tg-Zlt65B*&B
zFS{0fS3{ybm-4o2zN@0fq4!;v{S$2?yzQi~Cv@5o=U5G$wOw(J2ijaqojIW%SANIM
z&_&lP-%a?yAR<h<>ou-?hMd>159ZOi3tgsj7rIJUM_7x%yyJy?xa&M!xy*Xu)$v5k
zbI18^!)Ml@X`eHv<9cA;*5@4#EIKPN7mK!oxPr=aR>KpP5PviQ&WWqRz$>gBFZFrv
z7|ZXXPOWz`@Y-4Joeaj&`pD~%C)rPX9~?<V{c_CCjQ0`EvHCKdRu2iL8ee)J`&X$i
zp`49H^I(Q;$o3r3qsi%u<gK=Y!R&Bu8?pBU)$sghfve7A-i2TR%ZpfE%GOI8lfNzO
zX&8}U#FO4-|6`}iyGr9T^-QO)lLV`rAzLkSpEK`Bw(Uu$GI)UHYMP50XSXRKV1#FC
zsHU)w$M(Ea71TqL`rskwd0SI+g|inc8f&UE?Z~RL&v`y*w#_-uA6bPZABj1pgw9OZ
z6V?cLob{bK=;38&p4jR<h?VYf3_wl~oiA~P=D|i=U&r-e3ys6U7Uy}?F*+LQ%tB5p
zB$>wO?cu93^j0XzbHaJ0vlyD<>NMzZUNiP$bd(=C<E-t}1ii);uQ$LuL&j?XqrDls
zWS(=dvmzL@r*u{a&p3xV4+hUUm7T?b9_L7BEvz@*Ss%QB+#0-OvvxKiO7wQN`j-(Q
zgW{ZX!W#_svpr(fIOSgMBCZr%y{{rlJixk6w|jyEt}OS1E+uw4PnU{bfp?{SkKPgV
zI~sAepI*;*Wm3K?hw*v#X3EnYv#Zd!f;#QAJdkc12j4;?TUW7d#=Y7_D35Z++wahO
z5LYg}+PCvwZ<hvZhXh+3sTgb7K1=1HYox2fc?~jDyYgwdYfq&7AS<tBd_CoZ?T&Nq
z$s=ldzYsiN&qe(OR|)D^9WStFEmK{vPl4}hVmu-+?ZGzWS`0pLm4R<{RT{gyj<NDq
zTHbY%@h(>9fQ$E@>GHXR&Ovd|#di)degwSGIo=g=)pbrF&&M9U%=Xi9>@wc2Zre2W
z+{cWsVEh>4&ojQ=p58fyJ$)Xr@oF^hcmx~fyNuM69sQ8G*LAp4+11DRgN(m|`qPN9
z&GxpgYsT)*>8?Ci8~8z5-ZkVp?o@VN`wTzA>W{lpI%hENCC2Hl3Hbj^*OaZ*IMp>B
z#m_MQ3FU)o$Xwnv2QMuSW-z{<@-!yPeAjb&eS&L9b*CyYi+#<DYXDoPs_Ug=lHLj8
z8s>@EY8-N1?pnb*deXIq>w(9#zFou|ZRFW?f?X8*;3s4Bn#V4(9c?Vcxj4HRN}zn#
z6ysBjD^4Y@YDzliLg_9~=X0^uC3e0<RD9C85}0?L?pzZ!ryQrer-E*;KUJ)DJN&7q
zs<5(x@J2)=_zb;^d`Tm=Xmw6uH3{L|o#Y!0U3aJY?u2f-Gdw3lx82#kd!bRc+IK&6
zCw!0WF7VWb?$JBV(0zIr8G7h0@;$^=LHG`r-P=lcwNZDh(!DCXOLdp}W;foehVMq*
z<-Vt(S$CChKJ?Umz_%Ehck6vip+)y0->cA)+w6OVR;(X)qwlqt>2C4Ig<jEdL$BGm
z#dO?Lanjv1y>~9A_s*x1s3fOSnIy&QlAOv2YY~o+^iKIyHoZqal^xcL-npY^c7ILd
z5^W1VpP)O)DK*_eP8HCSQ$=(Qp2~2y+uKf+O1<?lIZybLPL;cZ{*0hE{B$Dx#K3*V
zIezMZtsd7o!|rqb>{EK_Sp>W5cVF<UPaSe!@)w*kOV1pb-K3`q?Ec+-%YWchyY$o|
z{CptHho4uVRo6jxzrQFr<{t2uc3p9|`^!%?x^MWaPPNegXu*DUNTPmq$gy7?#wAoI
zn8{Z5OT%pTOT%64mxg%>js!3HAi<yT&xwZpicrgbMOc^omE>YlpZq*|ksM~feEgXG
z&hZnBQ9|;_*9jp!`3}h<%_Kld*>{NR$$9dxNCW97|Bjf*0Fs5=AUBDX{Dh2<kI4l2
zB{@Q#lD{E;O8zHVBu=tIenUDHoT8BYQ$@GpEV-%ZRs53t2gR=xbL7`??Q!2H{}6XI
z?uX<zaieiRRcwiyiu;8kF>WF5ZxvsRTiLor@kd+pw!W*_t?X8gD&AF&D}SmmD}Sc^
znZl~Pr~J9%BjtaKS1LY@PmJHL_}B3}<8u^O<9EgT6$6R>#M9i?#DA99!);6aH;I?H
z?TP<3@dmde@yCgi-1~{YNSx)qnfR;374BP!|B#f<eV6{FiR(&EN&W&CN={4O&Ye#F
zAIS^cS=9;EWv*BC_uDw`FShO2wuAe{ww>E{a}T!_ZY$*ea@$w8eU*Dm6tGJ>ds3Q1
zf3KzO1^R$1z%`Pk?bBY-UegY0hqNQwac~pbDebg&M*BoNr+uz{3A&<P<2gQoPvO(~
zEIyadr@#GUb4wDm30lG>2nhoFjbs|PhucF4SH_hQ1y|0M6OQ|k`;ctmsyQu*WAAe*
zIRj@P@mvGfKoU3;XC;Z;$K1zc8`sQzLQ>fKT&djmx$l!Na(2#+KAoJ0e2Kl!wH@}(
zBbmydD}PR(_mdg&glU#We-Wu|A<fzrZM*h_)}i%kgW5CNbJ`2qOWJ<zfcA#=mUdV>
zroF43)IQKY(mvKc(=LF%&@O9NdBQ7s6`#gu@;Q7SU&t5p8oq+B<`1GQjj!eFc`M(<
zxAMpMle~+^#k_WPJ+V48z2sL|N{gg5DRuMb`QC4Ed>=sgEBrN_8w5ARkMQICbAE!K
z!qGH8!$0BY=<n;||2?dkvBA&j|5GR>C*ZZ^#0{UXW`3@Nf8Qhqc=%69E&ThJ<Xh}Z
z;C0MK-;UoM|1LQc|6criq#^$O`1gr1{;Tmt#1vl~Ura3Vns^Pd#+S#JlaJyn;w#AE
z_;1D^B;SeuDE=_{IKDBy5$i=k`p5vYvYLM48$Ls{gTN3l0*sRi?NjZ%c2T>eeWiWP
z$7x^jNqj1w!DsVo?Gj(W7xAThIbX#e;PuE40cO6DZ{ged6G#r;%Ln;0{5k#t@=JU_
zKY+3uD7(cE^Iq+&_C7zRy^nMc{6pmTwKui5wWHcQ^tXr!f0|%tzcx+UU@bRNDfzER
zd&yrTX|U@5hWs&EKq`x`i?1X5;v3={NO^oqd<)r66sqNI>_6s^Li!E!`V=6YD2y*b
zvw&RiE5O>8lKKSW97BrnIRoX>jW6r7j4SLo*SJ=nZ{k?_9#cYni7ADZ)AO`Ut|zyX
z`7%>_q;FbI+fvZJWS;gd^OdG726~=SI*xo!25-t`=jC>kr{hs7L%?`~qz@gt4%7%4
zDUI!KGnt}gi^iMU#)9g4*p$!g(q`Hd(Os5{%0kam`^a_`HbFDSj0SBdgX$tfmW|qs
zwxfL>$C$Eh<gugYkoF<zSe0Zd*<i;yiPH0OeVNj8Hd9%|wsPN;dL$VrFPbVDJ#FG6
zc9*F<Ha({EDN|WSYA>o2)#0p3_)L3QH<|XB>gq3=jP;jIhd0O>txK$|LW~_`niVA*
z)kn5ptS>Z2@zFk;An736kCw@OTs1Xs7(=EhZ_r=XHzh`Y`I!0-<>hu5%cu;O#nMQf
z>!!B)o2KLSuS_@VUz={f(Jq@&&u@ADZS{VaahfIj%686*nXk{Ip7mp+Dy;XLXv8it
zwl=1P?esPpnRj6m8m&jP?^Al)WQ)X&QIn_sj!9&^Oy!s?ug`m?(~)!cO=sD-)bHhZ
z{m|4?KWn-ev01{pZ!r4QbUC6Utv_$NTEB=oOQ!3QbuP7Oh%=2gB$@88eq^5nZAdlU
zqkS}FnC?gXRi;v`YbY>1ZOAq~WHu}`4;zckV@8enuCc;AN#nY)+Wdf(9W*~O)|wx)
zdi98jG+xHeF>GkeME|knSpN<E=0nWyt?Oe%=Df-LjOq}v_1k=}Ic9Cg`c!PZij7N>
zji^k<R`Y@!LyX7FFN`P6%SM-ZHDW7y%-DD=`^{$iQ%re-+BD1TOi^T-Zzwe_M(<M%
z<))>ED$}cm1E$v-`jYnmk3ew0S)f60j%zq%PGY>-oXYlG8FD|3<_tz#%-Id?W_7~}
za{<%;4QOzfizsBjYw(&&84a4tqxaH=Gv=y>bLIoldr!j!v!2mQ=mU0S^BcXl=ZF2E
zVZdyr`Z67Fm>U~znOj)9VRL)5y&A^MCmQaW9SxIaZ^Hv~u;G#UOv7XIxrS%v3k?hA
zOARl~{SC|JfreG{jnBlw&GtTdPl=AhZ=$jO72BU&PWQ3qD1X~5?t3=rUyh0L^?|%j
z<TalV<x8|4^~D@%f394o#=P)+%QS+w>G_6OyTO;Esl0AdV%AaY{udQXnLbf8vhT;l
zqz&{<^47)JGiJQVUKzWu#?s?a@h+Omb0Wv<Jz;zQ!1e==yxxpH3!&@47_umh-4>OR
zm~Szhx1>=-uEXX2<i4qVy_U?#8m8k!UmwfsPrf#m+OznLeZtsh$%(9cT8?!`*I#sh
z#w(V*O|QMBevQ{Gg&WpkOpKCak#W#cyy2RNo})2w$fBuFuvFBiSgMU9mV=DnW2vpr
zx762HTCB!#OA|ZB`Fh^c%Fg4QamwOi<*37`)Cc2pEyq}$3Cl@(7oYGi68<G2$zNmt
zx$5td|BlfA+M54|eZI1F3;7Luj`A^kj&ejfuKbze0(*jTkv&1V!k(Z^vnMEj#h#%2
zb)r8hovUWgMP}G@k^jz~i~J>fF7j9Gxyawr=OSCO=yQ=R1@yVdmaov~B3p{-bCE4y
z!yWBaa((^jfrcXoHI*7(BWUV0M$KW(9!;~RO><mhL+%03X++Iw%~?&4=AtG+Q=++y
zGgmd&H7P9J1a50`k&hzX(cIJA*F4mulm<1knx~q1&7vk9X-V@+^ZJkDG+8X=YVt7(
zO(m>HdX&XbhPox1T>2y+!9w4kjJdb<U`9HLA9uBbNc-4*ZaMimnZ}*$L!_@O{)=Lg
zRK`7s`vs|@&#TZ<LCmBvO5a?fmLw>_cwx;;B;94rJyANJg~n9ho6z$*(rwgg){HV4
z>65?h>=%gy-jk2A0wfOi()9oH?m>zpMdSmrm3)nq;Qo3qDI<xHi6`600i+ai5a|m<
zKuRTgq%V?M@-0aEZE}dD!{^K-gZ;<NEaE`gL3~KrL`2$2rjT-=`!C3s6)B1ovWvYP
z>RZ>f=KhwFn)?+mYaZ6j);z5|S2JI;xOeiy@tVb&rJ7eYuWKF(aWzXql90OBETmS<
z2^m6mWxJq8SwW><lUMP)VofL#O852)<uunQHKbB6R0#+6_E)x7OlXLpubHp(e$XNu
zs<~ftUoh7^Wxb}N|40%}pdXe#tgl$xJ1ID5FLKh5aN4^dG@_-qQm@OTeStqCoco}q
zvK{<WhNnV{(2n+T(B~l?g{2EMOOP*>$_*)IKkTh+N2_x}ux4H;LhVaJf6c>6J;mMy
z^s^)k2sdhGHF-6&!YyIA=4r)q+Cv1dHG~26g(UQxB#g1%?+TN9%@q?uDzv7+Xc8U>
zk7&R0D6B^rkby#Y47!M30pWt+pov;ScqS}-I4+d$hm1lg>b($_A>pcy$h`?kIwc){
z!`OgMrAresV0~x{DZ$C0C9EZCF}Z#hJk|AWkbeVoeS0VO_Uo?buIUDKLlJ$Wuvs}R
z(~Yc4zCjC=>&EFUhe2haV?^YCSn*sa(B)Kmk>|lHy{y;Snuof=y$iZxou=}HuA*jM
zSFJm!tKB<Uv&j6E=;~`;>#V5Nq-(9bAOv;C*c`6vPU>9Lk~*I*1m7uz=VA_Z-MaJe
z4zsRTs4|=|I1FAx&~V0Z&TzqS$<V*I%1|YEYvv6DY&J21;fCQBg|6E$Ae4t&&~t|2
z4;q>8M64OMoN%FHn$Gz8oC<O9tqWLp3&1k<EyI}MF6(>J@SyTU#ml`Xn6F+EPS6(A
zHWkk`d3&pn%=+v4n=;Q*mHsw-rwDWuxC8yk-U~E+IQZe9{+|B6{-J(W|5THwpND;_
z^o#l>{i_crKAh0M7J|aFz5QQ5uIDQChPeF&h9pC(zRi$f$Yvv)(Bv7^Se;{r0{A+{
zFcf{AtE|$UWa|kd8%ho3y7RD|_3OtgrVLev0|vdYYB*H$+F+)$WM~vhsU{UKF@qDj
zDc!VgMleGwtU%opU`{uqgJ!~|{lSR6F6mzCR&;Ar`^s|_YqH&I7WG`13Z?o4eTqI^
zpG8-ZT(-X-eXr_sDfIcpKK&kjNzE&bG6J@?bs>Ej#;jhC3z2z=Lu{j9>t0_8@Oq)x
zM}0?Mr#CVml6dAtGYDXZ_077I`Zis){<z+z_vl6aY1z9GQT3>MHoP+RJ&3Me3ObwN
z*rDh27xkC*S1TrJ=HdNIdoO(afJR%xBh14i!(&5>q22III6&heWL(1B@6lCPUJ?or
zo!6if5vum~hodo_AH#xh$)Kh)FH~Xl0>cZ#vSC#yswK6`T2*bDQ1wBhuDdp~Hm5dE
zFxM8=7T0QOD{8B25ALn1t*x!EwbmBaELC2>Y^Bnei<ph*RNGYBN^L^ru62bYCG}v;
z>O}3a+LLTA{PZ8u6;vBJ!Je7Z|2t2L97F-67o9|MA^DJ+kU~h^MN>uRk$RB|k@}Fj
zk**+JL!xDaNJB^?NcBkLNE1a9^uJpse4Nn4{)^`~$Twlv8Z7KB#DJ><CHWIrFBx|J
z4*3#Ms1{U9Od|HTu6;v9QBeUp0$RPLWZ&SvHHIMxlpg`crQ-=;N-CcQW~B0Qscbx^
zoy<Rxs9ZiRVJeEBh~noYOi6Sm>YR+{()o#~zJ??!BU&Gs$}-0Gx0y`QvPI)fZDT=o
zeJPE(5~aH=*SK{4NsPR)h#h}C3PVwlWt)@QO-W<QwwaD9m*;RiO2@SrJIZuAs=iEd
zuAGaqZS*-=2GH^ZiKax!FcdTPkTjn%jh1~POv}?ZX+N@WgO24{KrWEKQO*sz#L60L
zhgcdrX6#%<Bid(W9c25>t@lyB=X1u|O#fIJV(F8pd637K%VMcir@RCx14Q8T=h)@{
z7b@FVrf-_Bx6p{*Z=ta^mhJR5x?z3A(&<g)m;0~W7g;;yJRnHC5LGYAm&@y->KMcQ
zGryPOb@^eSIm%|yzCq`|ZQq7<SbiL^0iGzk%f50J=-C%pi;*>XQIh5IzKyYFWM3rK
zmEQ!ehWY*HfD6DSpno5W>-z_!vKzoHX>1x3S2meLS%=tijJ3ZYtXuRL^7;M4VI4Na
zz_<BYY`p!vc(qwvnh49Ze{8*c|6O1bco1c)H^m`Y*Uk2)nDX-LlAS4T1Ec#k>{I1;
zfP29Gn7-!LQT`B^mH4OOxrspTXI`R<z!LCE(*F%8e;tPGcl+ZcnzV1jUb;UO$k?|b
zPVdi_s9LgPL6q(?V5|K_VSTBN`%8gxsa@4Zd+k2}=z&9k8E6DrfOg;n-~hZp@VD@f
z=so50;_`3nZ!(qF)|>X8H__<W7`@hGsXXUzqBC#6+s1!e?A^?MvFm5E{r_z=Di&=@
zf0y}*?)MFQ=BDFsMyI33kFvW-o+r8n9);Jz{>Q*G;EaTYFe2Bha(lUN>dP<ot*_yU
zeH*Wj<@G0D8%ynJ{HFWF{^fNU*xJhoufOR2_OI^S{MuXUw}R~3xDI1tlpKpHl>0Vb
z6HSNLafNDKzlyXdzmV!xWXAZ3WbcZ+bz4T}EqV>mIyw8+=zCA>D+}Zw&%LJNJpC{H
zM5RcfA2ul*2PS|iU>cYKo&a;ebLsdcumY@wkKgAcB!ug{p8}*uAuFm*E|3rG0ZOFy
zWk97=&i@_|fVwCcfy0s<%|KgtEQaHs0~@I-bQSsvLxtUi=il)Z_P%4IsqlPM>XW&5
zdJ3-;UVG=Pbp8r3_)cBnP~p&fc{Ha~;zq(psCk9eqix3vuN6)dPN2qARHENtNc-%Y
zzU<2)aqOESf5g5WlEJ<m@(%lINH+Uk$WHc-Z!UYs_p9t1Azx!(2PtLW1=-8K3!-6P
z1Sw-*1lh;F1@hq^d_09BO%Z-;c7(i7$b0am_f}%CCZRe^3*a{e@U4QBD8R1@;8z8?
z5;zI@65X@j7psf8FDMIxl~=EWo>Sj@E8c(a{CmAIsqZr-pI^R7qF)?jZ*mj%<(Tj{
zW45w4xs%wN+}qeUV=~!yVzSv6Vs^4`!{oAWz^K{RU-H=(Ukcb)UcSP<?^5{xtX~Bg
zBtz?e1KN3k7~Y-TIZV^LPj{N%oqu=n-KBRI+3_n%8D8%k&X3Dal4=d-r_yqo@-t{z
zz8WbtoZg+KN$xQ}8$Cw)WBKcFPjWvy&8&wc)ES`dXnA<#ox}9WAxCYj<odZkM_B(U
z_ZK9G`_J4w*{yUcokUBcC*hNSk~~hn!M<3N38YEig4q>mABXl=(SCp%AzQgo^p(Zh
zWTQp~%O!j4C?X09`qd|z)2|tkGEx~;Ba@_MPG?T<$Wb3x+jiva$lFo4qZTQ1M{(wp
z9U7owNA-?_tdEQI4P5Sr+z&DKkGLP9<j35P3CG>yZjmkApK*T%$^Ijx*$SQRkp!kq
z5+wU`lAQdXAYUq0+!@6^_U!Hu`64`7B;?D+Fx02l`Dm!mZe9*3)Jw#m)~kne=hS1l
z#p=71zSE+<wJSkA`Q-xj1Io+CS%vCHw2u0a`Y|n2o7K-~dF~VS0;`jveyna(zo3-X
zrBJu1msuOsRky2G)hBin8LVyb?rI9!Cxdyn5~z;=vdMsK7=L%3B;&IB-0s3Kzgwd&
z-(7+6?y|8iuzpmMT*Hz~FQVjm#QJXEZB<{|U7S_8J4fBW`=EMYcde}dJ9^0B*sW4~
zcc(GEg3wpy&+N`*^GLr5M%Wj$6p7Zv?+`BW<HRPSOl(PPAqk1CiAPCd;&&6jOOg|R
zkoW_lN<5kPPsp}JN1~H_;kWi-h2jSLCYg#jaNR<7AsX(wEnyV6BLPuu*L~n2FdN3s
zR?zt{jmECUbv|cy9XM8~GZ=5kEz2#-d61crlfCQKc5vJ4w%6q>BaQ7$2e-PtE|(qe
z(r0FTxiu$y`)#DUUBkJ2Zu71)I5U|uN$a52YG%gv+h}n+GdnXQGh^4-u5&0|$jr#)
zbDrfIx3}dy$UPhZ>ofN(Mb539Rf=4Gds!~O>p=u%>nSW_&Wr!U+u49uQC)j~&&=6#
zPLc@;Nz^m~2BZNYQi`YnDWn<^BO;}dM$OGdL_|y@0wR~|i>UZfL>mDSEg}WAsECM&
z)S@C<MYM_vsDR3s6afJtwH76L*ZS=-hTFGqKkj|*Gtc?Y|E#@c?X}lld(WPaoH!o~
z2g-31+Rjzpc6r-J3X9tIm(o_wKlLs4t3+xq7t7`9kywxY5!>odec`R*=PTmH8uh!)
zTGxKj8_{+~;n2bnQkxvxW|vqJ-u8a^HnvdzuS(?K>B}7L-|OFN()|1U`%HS;MQImF
z22`h2O9ot)c9~?r$h2D1BJG;AYfLWx^(-%=KBL|om+@uBmnNV8Yxcc=YL2Uv&p`Rq
zGRw8&Z%i)cU4~xq%*X%FH4}K_z%zZD>2)*1^}q8p>3K36{Yo<0m-&eLsjUsQ>HpkK
zk?Dp^Pmt*$nVuxm)5!F6GCe~weVoa}lD5In21^^hk3TbGzZX*XiO6okYtO7~CVMOP
zySd(Tc<q@{u}ptG>%S9wje7pyY;}kwgC!43K9&M3rLdv@joJRS57d%@{T6>S7C!&;
zOQYfXp*?LJAID-;lgu|&aW6L48y?4^Uo87qdg<@=``_|=rdi(1Zzp@sJk%`roUAtP
z#pbeUaW8%=`=9E=f20@wJH7n>iiKyEnKxM#6zH#y=M8H5oS#=NTS4Byyuo=@f<xt7
z6|xba<K<FXo>wc|(7dsHS6&`(lVmH;yEXolH?x?Mt(?!d_P({$+)gcfEtx?5Biwgw
zOWLJrmr9#QrAd~hU7MyE@DHDx%o2JqezKWcEuTls`ML9CI}=>cw4c|sEy!Ke#Fxq@
zn6o73uAEW2HPc*~vxLvL-<_pJ?7y?HsL7JEGcGrBlAKEUypl64=at;&a-UPoc}254
z?eeq{|7r_!_Q|K-oS$<b=gXXLa{b&?Zj0PJLERVR7UmYgi*vgP`ne^!y+u}-Tb5fc
zd|>WiwxOa~kUK)Ql6cc&QcV_XbH{=Oxf4W3t)*g1lVUqVbH0=^m83XYKV_6u(ZZo=
zWB-ZshcD~@4a}}?92V*C=X>3}60=Ia&pJxl(d)u2eA7|VHeP$?@|vThd0v5;B;N-g
zB~ACT`8&(fqol?QO@(~#)J*D|FIi{jHOanf+utAIIsQC<zGyt`KO!^RqyD3^vR>#f
zlp`1WizPP8{pBVpv3bd){8uDGY5v>(TA6>|_1`m@{(67C$?`w&KQP(;Mt`Ge>2LA3
zh|TT(cA0aY%Xm)aoabd#evGWjUzFU?wf)`To#1_c#qVy$r#0g<n(=#LE^YS;J!!A^
z?f)t5fGpo@Bi{}jCCm1-_RUAhQeF%Bw)!ZUM%a^{ZYDEdnFjeb_bAx`iMl7}qUN$)
zW*=vx=CUnjr+gcBl+68|>A<68Yssx%N6A(nbuMtSmFCrEecgd;H0@Wzoa`|t>ug-Z
z8Skj5rj7ios7`AppAH||j8`{{{GH7rfA@Dh*14zIvG+8y>G)Mg^_t__mUg_=sn?ON
zH66dx7XRHfsNxdAbYo?;m);~jYQBjYjLA%x^D|9mm>J0@Co?~@jiBz^%lA&1U1iJ9
zJWa4y=9wbzCtHrl2V_==hHf?SzCFh^-`a>?jvO0ijuso^Wv@C>T-r-M6EiQS%pI+t
zi`lwn4iOxZIlS4r;UB%8Y2j7S!qL)hlanr=>}DUi{>YKI<+PKJWJpezSgv|=|I6o*
zc)u{_OXSmCK0W1A>b_^sH0P&JZn>r9K*60Y_qIIP@+;Z)wrt2rq)*l@C%t8Z?z8)~
z+>(>sa%WC<&apY!qJ2=eob;S_IqlHPcAGouTZ=gz)uJHwa=NH3we+2xobF;b(Q->h
zx$ssQ#pyNavvXPrPEN1+A6xPKqi3^i%oY*v5}S;!@;OaDz2d!o&XiBTct0R6Q@x7#
zbFsv8Vn&g)V@gK1jFODr8D+ASXADfAl0H#4;pObhM5~)@MHz!Ls`xIO><5ajmRGqg
zKI%J*wax9Rt=Q7Hc1kjeM6*b3rH@aqORviqnlVE56VvPd#p{!QDRZ-AY-Y>+oGqD|
zEg70EnVT({n=P4}Et#7wnVa1`*6S}HSq)}a%12g7*)nftkCe}-xNMw!Bx|!v<Bxpr
zHqo4)T$5GQVy0kmR*7uAWh=?rl2x8uqg&R%tTOh!vx>3?i_YMzs;t3$*R3{dMAit(
z2fHmk>S&84qJ=%Rg{AN8l*qSo`Scb(G&MT8B)KhXsNkC9lK=J0)&IwPxY-IouXn<R
zQAc)syZxOrhU$Ctp3cFm8%w^OlY_}&o$#HZ_U+E(6tGg|$rZ*+OthP2X5JU-S?qAI
z&UkinC|Aje0+pwA33eGT*(%XrB-_GUB}b~Q(PJC+47XEJG-Uqw$~g<{Y#60nF_^59
zVDgcTs-u{gs1kn#nuiso4#C9b$oDBGYZ~T>e0-yv&4WRTUUB0{T`7dS6_X3Vwn!?#
zI#B)J9O}E52f~XKlWWnuMbA*-7QvaJUdeS!rjXa5qc|5dR<BQumWuvGNVtz+mm<+)
zO=26o1@g`4&jGOz*R=uEXdmXN!`dQqSX&`{ot8HA2D^h?{WeIyHRh`2hL5nHmUxjG
zmq}!vNR|oC2WN!R)Ba1~K5(PTgX5J4E0L^)zYTUm(mC<6U>SI>LzR0C@U@EGZum>!
zIIySF!A^voJ}Phg!v8`fg;rXZxGN#+Wm}w(vwGrl<%t)OU#Dm<NjxUJLAf_UF&IGU
zuaKN?e<#bf95g-f=U7;U%}Ov6$pq~D8l7}7S20<Rwe6s@KQEZ4Z3iC=B-?}ac8Nwt
zCH^40M);1ziGt6n#4AqdI`jvM!D;X!)em!G3y%xFtaAT`L~G&y7KlIIpAx4Ef1abJ
zE7~#e;oxlWDaEiFJKrdVTd1*C<%tmaZZrqM^Qo&4J{iq35*_hSN6Ht9!MlnHB>pw<
zNx=vDU(nI4MrRGUP|+Ok4c06I)4-G`Be8Lx{|vv((pAT-r`JK7gX7h6z0UEr_!@@>
z#o%}um!98MxjzSfOVCgA0lp9XL?!-waIH$b*2uq7lvzdpg6XjGU^v)`(zS^KZLhPV
z++P5{0^Eq^8u)y$671~BZ0QfcpUz-UyiLIMe~L+q5{nh#Tb)FGklIzJ4_Jx(0LRuN
z`9!htK%$#OwZE+uT&!b1xQZUR7H@~cr^DwdicT##FPe$B6#e-e)d@*~mIk}f85>KM
zlF3UoN@zw%ZdSBo;3eo!0|#SaC48ioCXYjB8h8>Grem#vnW_=rB1N+mn=j+@1eaID
z`HI9T!jt4!uKjn>Ih~_=xHf|Q6dSkc3g4f@Jb!HC7tG!|{}(FvT2ne-OZ`gtdiWA>
z3i>s%T=nabbOp=6oQA4zukNH&&nfSsUxbAsj@k`ghfV`{Ht4P$Uk3HRHOM&9JBrfN
zR|?*RoqY|<wC(6@=A1H0xuq7-SIx;fKg+eW^t+4%&Tfi{6X4f^6VM!y_?2j0qGthb
zu-`+uKS-kVj6Vr{2HZh3whHbbuQGj^g?x|EGzgqU)}5Ewqv!j?c9A@wmV-j<SHgEG
zdi6+FAejsfQcM&eFG8{md>gz<(RM`VUU(jSF?>8WS0U+-{w#2mqB)$<yPB)P+Z4Tg
z_*_NtO8Hjs1thJoR);64*n?01D1gKvV%k<S2Yz)8rfIo8CrE5}c>8GH;p5bG$;
z1gC<vij70y!@yo@C+Lo4`pbKWy4KRtb;O~sv)0HNTyk93Gr!enG%nV5f%_Cit~1F}
zMOlZaWErI^z%t}@;1<P1e{=?cg<u5kLVrFw+pxKsqw2wY@OpHnpg#(oZxS+Bd)*T6
z2tQ;;!W#q&ZMJ5<=8tEUdwI5vV1|v9Cv+XTHPm}4A6Q+J><%NMS1x$6o^j0ai601Z
zzB7H`X?k{R{1ly>#3b~KK$ZVlQFwna4=iDw(-l01715<&JMa}bYPsgNaQ&Yg$8ip=
zw1GxU*N|o_YpuE9R<{yUpZy}s3}9C)>gvuf()FrW9TtQ2ibz(ernd_DD&+M@>XFnV
zS!IVQ>RQ*UR}>$dP96HIkvnUvu~UbR)?SHDv9pg(9lQ?9bt*B3Y0GNxFe~lD&RdO-
z--ovJN&Hz+j}lZ1#ne^j?BmZvU`5Qev_duIY{DFiKlAZzzP3nsF>^j`(Yj>KFbY0f
z{gF9L^2hu{d!HvSyN&)OtTCs7>TQ~N9K2TNvBoFm*hiUHb7eLa3k{qdz&a#{6}@7|
zbv-Yu*=1U)+$(nIR_q*A%(XxrtM@ux_X_GvTcmQo5}kT%R-;p`*DRJhWHwLj2##bf
zy;zSzvz}U3!S4rGQRDq!JvOWH@BpRPIpnD8DBVO|oA7@tnp>SsU4eVm&QIcV*h%72
zUBj>DsMRW$(%$giv^3K3eq@14T<zE?)*d$pILdGjtS@L9Ul+`4d{?jyk~}1B8o!qH
z|4Nm}@ruh7eeSdQDZJvYr?gQeUE#&<>`5v2EBzVZT6d<V^m^AKcW#A0sVEVbvD|o;
zqSU3lKvAq|z2m@Zz*<F_uU^+4QO~7c(&3!_)pLIsI?KVjm^+_!#+VB31m6Ycf?KI^
z1bnmMDnrkh<~EP(*5puT8)C>zXxyyATRkVMq;a20WM<IOl%LeMR^BD@Mn{dzPRvMQ
zK9Mh=cgSjHhC1y#zmJYt8Jl%FSG()w_^Q^86phtvqE{BjaHg(T;hA6`a0k)e4WHsx
zAYi5H%X&jgFNk|{rit%$dk-PuEgtVer!O|7E=8{%tWz|H;g5o=&?y2JASs4tf_=ap
z$alk6fK$MIioqaCUjwfPFQAqK==4NV4n71v2sTi<9;`u9f#((Uc!g$&*B^cX`~uAl
zX|>CIJ+GZiRF97{P}{qNT>XZ1*O^#*8Z3?PFwbPXt7cE+Z-TFZ54sg5n#XWvKNHLV
z&jCwVC)^AVK~_-a1TZ^Zz35rE9L;AGkDS-YoY{CgI2#-Zb^)IOb=7KP`B+eAUDNP6
z{1tGbT4Swij(kO_@P_xToXwAH(zA2J{xC!Eb4AJU$G|tp+ct1_H{eU~20U~+`cA}^
zF~^_a5d1mve=q!H<e$Oc0)GwaOl7$GGYS0H2sG|h`zDWCzJQy=5RkScq>T;RlF}nb
zb|ibM=X&lI3HLz4QgA=`JUANE)tym&@rw0>J&gS4;Pv2H;EkMr=YlWESzvIgyWnez
ziS3GE4@DnL7;p^o#}t!yDthSqNK)t|Tc^0+g8m*#F9a7V2FUGJI2ID?k-U!O&G0^O
z3${d33ZH=fSg?V*BCwFsgJ}K?o(4M0m!MB+kb&fDO1mKW6v=1s$KjdiXMy8EAKXtX
z=76<Grh_$D`zeU$iQ%AYbsqQzlBwVjup?LjR)MrJaTT68-_D?Q&w`Yuoc~W#dNq6{
z^|peSgLocn!&(WF8x+&&qcr*}H4H=^%*Os**y)VCFY<{T`v`V6fVUzc{+7P*4{&Tg
zr4M7_TuPT>0lDoCp9`-9cY<A!lOc_lXdVhm9{M+c*OQr->OCUwH({3Mr)Iu=O);@u
zG3=q}g9+{pCdMFtOfh+<qKCeZL^6C989s~Lo~0S<EGSR5c9xYV_Bd<G{R`1qs2HGc
zSw#ofPpn7sI`wXb_kmlmC6ZG31oX#(4R{!Vg_Isd^Jnlh&{@6&eM*B2Bwth71<9vK
zK7&6F&qO~991r>+vxh$itVJ>%tijq(K|D_k2VJZ4z&DUg1&4qg!3wYnq>YKI@WlCc
z2CaJ*q%`IHf11*(;VY@P6}%k8^MF|{QG(<K*PHa(W^g@yWE4Hun~y1`gK6}5Y8c3|
z!E7wtMO~ed8>bKNi_S#qdW3p6fVU!{zbt*{b00gIPwB&0JD1X>SU_&O!@1XOE5V&$
zSLEbX<0X16+<0N5<auL%&Z>_$oG-W?yhbr-rRaga0ec|75PVh9pzk3`pcCAM-05FS
zX(w<nI0^XyI2Qbo;0}(hg+B=Q!4#6Y@E^f@f;S+!6|4jQ15AL<<`W=t?-uwGN{>Tw
z5J^3JJsfL(66^(f;C1L+0G^KIT<|okodlkM<P^}gHyykjNnfxlm<M(N3&B&-ycc``
zYnwn?ov`S9Na?fiEtIx^KMdYR=@)R<UoR?J*Dv%@q8r!;&9|`d8h8xy6OnV2{~mZ1
zcoz5^_&hq}Ich4BLGbqQjg&h1{qVKA@{zo%BCiw|f;~XiMaIbwDM}Vqk#8!Q1^*K|
z9UH&Y`>w1TmyxSqv3lffoJ0t+Vh?!l$3BMSc2<cyk#H3pRKaU0eFQAP+N<CSo@5Q@
zsD~RS=#>SMQC!<@!9s7ayGoK=qj~K>cU@44W(#U@Z<xHR{wKMnvu^#L@2=OlE-q8F
zy%ZB6xET3YJSSS}?#DLt(-pg-e~Wq*yzTB=Qr8Hu)YXfH4=CN`t_O)>g<`OfwyZ;Q
zkzNx8?wYYXabO0J5vQfw@q8)0un!L}MuI%Ve~F2d+SQ7Q{@`rA3ihW^<3rf*PQCNc
zkElJu<~V0T@8ZfyXeC$HD`~Z24!8oG1dao{#`1N<S#buE1@KA4tp?o3od=#R2E6f=
zi0^DSOy;V89`(+ncl6j-W6wu%wdgFY)2jopLQx{=YEj9XvHm({vvt^XSv?BPdGL9(
zNaZPFvPCr$KXTVESQrLA0Iqh~80Y^ya-)h~c0D+Xwi9!5X(|0Oo*vJJ-+_f!+|?!e
zJ>5MSEacPb(PZEVB0rj=`nqg!SrLzsG~u`EQ6cvk>{KlcH(+_F+DYui{zhuN6R)N!
zrjdK8gNjM+ON7f~3lr!O!6aABVYPB=kS`!^Yn)B$n&YmTxzDp(F)`WY4<4?hEv5SA
zw{`bGuJ$&Iyfr$rXiGl4iatF|OE)8#hh!#o?Lab!7)}I7f(2lIN|&Lzfae7pu$;gG
zF|mIj^C!944Br^(!HIZv4Np<0fb)5-S_ogylh$_dufcJY4g!-%7J%1;tp(fie6}r;
zB6toK>X4KvnmyoFrvu-Nwc+mBF4&bPzYl@ASg1$-2{<Eu?yPH!^*m)RRJ3{6oDr|w
zRA)uJ9xKzk5Ug+8#aO1%W{pu8+fh81XZIthF@d$N_%@k(YrvFB68(`c!^2|a3$R@8
zqJib!U?-4untvRn_k*W%)D+~UcE6r+kiVdJmVDMihV_qMiTqS*Jc(LHBdH;Jjqpim
zZUi&66~22i9?+I|T>tV;MQ7}=$_;*i{t9CCDtrK?{ot+OtXHL{)aD`hf4Zn*xdZ;>
zJMNyy<N3SPxC6|grAyGc9=;C?`>1!Ldn*9RXXr11f8yHZp6}a6^)t@8JJcV&S`T_C
z4<=)|7|pNHnGUa{uK6y`p}t8}O6dej2hewKQksc{Esi7q3wI~>z^h%(D-W&$lgL*&
zq!$jrH^FCuJ6+D(o78{h_A&5om(P0NFW7}nD`LBX+UFq2i{FK~o>3#{akV8j;qhA*
zmuWV}IjNfCh}$ic{*+o4Az2G@wc?k7*O32v7&}};1y!!EX~m5kdn0^5e81xy8&KnC
zcv276aa};RC-FZlS4@(hb~XMyp_qD#(tD8i0=Z%eM!A?^2TzhjBOF1rxkm7=cQXmK
z7vafJXBi7q@qenDAHwm(XFPdw2eXs%L1BqlxDTBkR%h&6^vRUWIs5cFPIb(7<<jrG
zT>$n4xm)FTp>#93v6YxCS3CZL@%*OGi4KL=sr`5+Y{Lwu@>y2j@c9#UZDS5u3{Iq7
z*Wv#f;?n{-^Q|@D$w=-+XC5(mTQO0fXb&U#7)b_Lg5-GcT9>8vknqou>;k_)v%lUM
zlRPZNpYhaN%3NAX?Se_p4t6}X??R`)qUb0uAhuJ$6{;_D#dwZY90s1O7|dhL?gHPo
zoNKHb-Q+X4LNU=EosZ!eis3_+Y*9=;ONJar+b_h2x3GV$b$;TNb)!sc?@Q@2Z2krQ
z7vjmRnDB|i6Uf)lidIga_I_jCm>ths4eH@q>`aGO64m1s6D4>%n9_9|HP@Zj-1#E*
zwv^r<4_Em))b)*<EuH;1l3L@<@SfQGhF<#|oz7Zg@|W1zOub{toDJB#nYvz}_K!iv
zU1BzP61{mk`ftKJx-4=WosXOya2lFK%-#ei+a;=b^u9_?L&6vg7s1D1a~M1we3{aV
zIBGC4KLy-IyB=V?v_f(c_z`#x^FS%wgSWskW6-|>Jc5Ol&W;;d@IsFI5<ZKg`nq$0
zV)7QOG0!J!oIh@sM)PhYd*RH+5^XKzjF9B4WM9DYJllbM?!ejVr}S3`vbO`--hpv9
zP481tBWKC50)93;pSs?{^B2Lr*q@8HuR8ARfGer%3N%X?aZ|vWIR8uO9Ua#<sD<>4
zm8VTpp6U+z)XOR&tqY@xcj_!FvE+O#3?x?z;E%=ggQ6E_zF_(o^m(r@ae18EilKEm
zR?6s5ejf63sHL0okl8yNN`x;X+gIxohu|Z-PH;3a8I$NOJR`9{GdwXxG+AwWuPK)i
zF-7mQfmMoLH9gWxk4lWuC`rba=>5HfUJHy?iML*BNid%$-1&vs{>Xs%dCHA~S@5j*
zNea)V8+p>vcr#c74&>g^9{5y!Dxvy~UpB5oGEL9sYRA7uF*p`J1zdro2YkJv*A9FW
zNhPI&z(OQ5;1PTuI1XHZ{B8Jr@IJ6BSOk_~In7Cqe5hwPS~niP6$`^D?V}R8ic$S)
zumSle@GV;EJ0I37Pn-f@OKCBfkLAP2Q#_&W0PjdGRqz@#>#^3;d5E?C=qy7*+s$UD
zi6j&KDcIb?ySqu}vunFb{6XLfo@?*K!poE{alN3=!M$p*4#^d9Ow?K-md|!i)j9S&
ztnCB)VDm}%dN9+)8U1!hCShkX$G-0(;ChqNC0vo^qJKT|uaLh)t4FzLJ3EvH-~c4&
zI_`QQj+olnq}k}bqbNPD{4eOYqVyv4EAXlxwe*9#D0N1s9B=oy7SYC;$bU#JIgRZV
zDeW28sPfKOUan~GR!n@YrNZ0m{D7pudE#<j-$w9@z*AlR!@Kg9LTjwG=Dmfjiovmz
zc7_)=-mNj`+BQHw2VCGH;}FRtu%{~pTY)pcC1}1u1bVx?b^d_+z@gwLXy&-gK~jTn
z?}K~X-5c!u%w;t?uYzkib|dX72dlyTwDD7Pp2hMnk#7M<xDgG%j669F&2i2e7!d!z
z5R<o^pWqc7)c{YyM^o1ZdUmHCmFliM$v6!wz*73|LM=@)M}-}D6MirK*oJmZz^k8<
z6~oYE)(V~d1o8vU^CROJS&FNi&u*l1zbN3Y)KqLvWfafRJdydZJ$c)n*{r?JJZ^>o
zdoUN>f}Nhs56n?!JCeTf=u@3EH;*X>!`=LbyaC(<zJtxOhE9S8-UZF^c>ACmQ^fEP
zcQ1Q@JYn}|(cV9~R`YJjki-u0XIx@{a<cs;u1AJ&clsrs?sYaMen%kP%<&#G_R`Up
z4mE|B{m1AUQ*p)U%daxkBQ760+KjBZbZo5|uXZmSR90x($eE*2|Hq$6Ga06($u*h!
zO=Kyff3+5wER$pMObhw0Z<=a}X=0xKHAc*R6PiHn_OIwysK3QRc_J>eCfgiy*=0W;
zWhQ~Qf>Xhn;2dzlu$n8cFiVDCS$nBj4!&?@?UiHAE8v<d$6Q@w)`A;E@lvxH+*Wh-
zWi@6OxbNqe54+OTgNH{;v1dT;g_}eZ1~9!jrQU@{NxoyA;hwd7_a(-RW|(S*nawZ>
zIy>pjFp9CIX>W>6chk$1nf|82RGDF>#*8)B^9unp%xv?J3H7xEbG!4|%W?bAy}#zA
z^f#UL7q;8S`x!?6^3zKnVE7A|^riUgrN0&LH@kgy8@JD0BFD-8xV~<mKf>9`UnzBo
z{rt7@_Z@L7^nD`UAmUd|eDV5xeUl;ClW+gazei<9YDRhO{O&>DM5SqEdYUs$xfx_G
zHbc!wGulirH%a8`%q%n4EHI1BGV_9Y)x2dkn2*g4v)9y{Z{*YwC6}2XxiI-e@>%wa
zlFQjYnfxvL#mVPn|AesLv40}Dg8ic8^X#8YzQBHQ@<rpzex-a{B-$^re<Hbx{i5W{
z?4L}&!hUh`_oBTh`Ko+-QnX)V|3q>%`$fsu**}?F!+vq{4bgrw`KEkZEZToy|3vaF
z_KT8lvwt$Vmi^-7JEFb#zeGfTnS8vN^bbYkU0S{{`Cc5Eb#Y|Y$C23(N9KL0XJPV#
zI6fc7aoHHhWm6oN&2dCNiX-xI9Fb4rh-`@?vXzKzizBi<j>wKUBA><)*%?RVk3?iw
z9FfoBi0qCdvL}wn-Z&zk|DA};G;_=&X3^h_$i6rt`{Rflh$Hev9Fc=@L=F*=`Zywg
ziZkZRI5K~ZBlA@pnZJ-RhvWEs9mnUJI4(!xxHQBOX><{hiH3-zJQtCa?;?^4G$N^l
zi%814h@?UnkyO$}B$aXzNu~W05qZVDY1Wy|5|Q2JpgHXMUV19sMI@EsB9e++L{gb9
zBB>TGBB?BmNGjV!B-PSIB$eYLlFD@vN#(hSq>j;uq>gnFNgd}RlFD}(llq>EOsc>|
zCUv}KOsbWOPpWkspEhxPPKe|3eQjx~Fpf*xI4<qtxSSZrrM-*GKOK?1O%W-IBhn#`
zNXIxLC&dxz6i1{p5h;!%a&jDzE^$PD5J#kI9FZRqk#2EBPKhJZJ&wq!aYRmwBXT+s
zDTyP}BaTSVI3j1n8Ph9{%#X;J-f?95#E~hDBlBYy8U1Zd^_w%$G&vJQrdD4ble0l9
zQ)IeIyvob~QyIM{GyPeqpCqP5>tZ%NS`Ql}vJEkt9=$K@Z1g^e+4Sf`nCfke*>rwi
zrM>B5N~Ayfo55y?%<5y!M00zzsi~&TO*MVgRMW>zHGR@l)0U>1wl>wY?c187J;KgO
zoh7opF`FKJ4m(?9`(pP0^JsgMM?0E4`n1WTolPG7vB{%dO&)#L<k9XXkM=itbfC$j
zFPc0$*yNFneXonxMHVmxeu1n9PxMdZTr4ZM%<ilL#v1*#sPL%XlGRUej`96J!}>L`
zbDLOs6FU!P`3<~$Su3{Z$b%eph$HIJ{u7#CYPR?X<p|A|8RmO`9V>IctZNpT<>p-(
zKVQkniM(UILa&q8-Rtf3^Yne^M8+zybsy~ICicp=%>RS1S+KX7*xOBPZ4-M3rmND(
ze^<G`U2rC4?=@+yYhvqRfkdx`$@4dGgnDwPuU_1P&il=Dj`h{2nf~uhAeuA%51K4}
z*u*wAu}w{Ea})chiGAF}K51fGU`bgw=bJ+5y)Nc7)618&f&YNi_d2yc;J+qX5BRcD
z@aM?)HAv_9t3^7;-`ZqrTO5m|f0th;`7+mkL^K!p3r)Je$X{f#{KfuLCfk47Uutsv
zW&U%r;tyCE=SkM+uej*CFFXE4#a~Oo<No7fbE)VD{`3CxvR0JL^Jnmv;`)1enul`M
z(f^SyD_@ys{rmm<O>2L)Kijks8!ws@_{;A;{N;BkzkC;jw}rR*dbJP)R3D@V>3Sb3
zh_ELb!DnIXKr`P9PYBzphhCL=JlGwa5OfSW2VH`0!D+!w!Q|kU;Pzlja98llU`B9H
za9=Pdm=`<}JQ_S6JP|w@JQX|>{5E(#cq#aOuqJpbcqdpNd>DKbYz;mYTYH1gg9E`q
z+tC)=AK5<k$M!5+X8YQ6ZMhv_2id{)Vq0y8+AHi;_G&xEUT6Q)PPCKkO?I-q)!uHW
z*s1m|TW6=)8Fr?fW$(AM?HoJTK4cfzg?6!BYM-^w+27e0?aTHx`-XknzGK(f59}uU
ziQR5@*}e9Ft+!v<uWe%(gvl@?%nDnEx#4kPLD)vxT^P0v+lB4JqOfDwDJ%}Vgk8f^
z!c)VN@Qkoe_~Y=buqqr9ULKCgSdy_cV_D=!VU!-Vh;pK1qk`y!=)|Zf>KJv3ilZM!
zr$#-ZUQuauc64rZel#$uh$^G1Xh<|H(qFCkO^^z7$#Ft(g7ia&po8%x%AKVjP7Y3%
z`QeAb4~-2@4Nf&-aAR<zNd`9uH=C41{}z)L+!ows(t}?Fzc3lWoxz<ZlHT~G$qc4T
z54MnAxyNKl&)jFSg9m~KOv~Wc!LLnD@Nn?3$(5dZ)Z|HTJ#LPX9(%$ZE4}ulIZk@+
zDU&a~_l)_T^x$tzf%M|@=6LDJmrN_^&EK2W!Rx{6rcLmN;1A{m>DhNoq4e&0(^h);
zL(@)r`6F|p^z>Gl%{~o2HQx_D3qCVN!Jc4`=^#D-x#<|}5B8grf-izEOee{Lj;6C@
zLa`~fy=*UYvh8hqn=ZE0mYN^fGwqqCt7J!+`JrS;U(?O@v;E8|k}2h;yJX7%bE;&_
zAaj~z&0ur7WX{E=M6##a^pFf1YI@q?cDOmij+85!UbfcOnjhKGcC_hj$JueFkDXvA
zm{Q5EiRQ<WVUx_6l4UoUvn10dn=;9^Tg};$akrbkl66zeIg)u(O+U%LyUe+gfpw-_
zvT&L?Pcm_a=`Yzh)0{6EIm--?ti0b`AelMa43zAgV}2qTI@b)6EPcrQR5EpesgP`4
zXfBkDU2FzR)-E+av&-x<Q>oc&E|UCx(Oe=q{IaQ%Jbummo8<Bvrdsm(ZS(Jv)9;ue
z_C5QaxzxUI-#0@gzc-o7B*#B7!z9nQo6GGV?H|o>yT|S^SJ?e_zZqc<*+b?^`)B)S
zGg9*ZYjc%tw2h`l#zA0yE@L5SYGpiRn5$(>WSLPiE?SyvWNhS`(K0@cGh@Q<h2Jw{
z!`5MIbFGY(c4nN6l=fzVjFuwvpE6=Pn(JlMbTa=Z)W2w(D5Ix~xj{xyS2HQ>7Irf?
zhTX&N<|Y|UCFbU^XV}wB4ts~a%`IVRSZZz!&kWBrx5)^rGPlbp8)AMD4hx5wDdFgF
zw7DbWsf?%0)QqPyo;G)8Jd^Q^xl2Z$Z+<BwFf?^C3e(NqG7?*uX)+pf%yb!%$C?>3
zDhtd#QJbiZnHjZ<+L?Q0gcg}uGD<s|`(&hcGWW}9EjGW3x<*~i?5KOx-8>MLL?vd9
zjNV@6K^eiNW^Pm#m6=~h{i1$mUerJ8Z|2L09%vqtQC(pkmXTd)9+A;qWqu<ge27^f
zqkNcoG^&xY{+NvRy=F|%HaIEh8gvg11$P8@2locQ3LXsR2fqoP4xSBu7re-f`DU;-
zSQmT{Yznpn+k-#Ke7V=26dbZA+0MZow!a-{D{Q5$vP0}JJHpo3QFg2yZ?CuW>?8Iu
zyT~q)T>7o;Y+tae?5lQ-eJfZe`we!Z{m5>$pW5AapFL>5w1;g&=!apL9%crAwEe?l
zf_CBYVNH0Ntq@in4h^rcTQinNi6|9CQFfFU<wvce!l-@JB|0TKJvt-m6P*>E6P*`b
z5Dkh3M;Axc(a>mkR4Z*A&75YK(>!J~pV=&6HcK#TS>~(o|F(DL;ZU#rAOFsZnIVj9
z*_D0Fcg9{M`@TzM$-ZW53{onlQVMN~C?QHkQ7R=lDo?g#DO!#t5uvD*E%p1(UCBA8
z=RD{6<N2NIxvuA$%XjYW`<Z+0`}2Oy{kiXJ1|Ss&AQdMd6&D~C0g#Fah(rQJ;s!+G
z0Yq8^h{Ows#0QAP4~RqtL=pf*S`3IJ2#6#Eh$IY%Bm#&e3W&4>ut*HBNF1<80<cID
za7GGnMjCKN25?3eaAqms%rd|kIlvitz!?R=8AZSuCBPYFz!??5844hPDj)zA5C8@Q
zpaBBV0Rhwi0n`BjGynm#00FcC0dxQXmIDH;00htl1keKnSP2NA4+yXd5WoNszz`6?
z2oS&+5WoZwz!VU`3=qH^5WoTuz!DI^3J|~=5Wofyz!ng|4iLZ|(Y_?)0JQ7~RO|#)
z><pB<8YtHVD0dA|?pmPMbwI7IK&@^-t?odr9zd<@fm%I*TD^c;y@6VNfHr-BHvND$
z{ed<EfHnhxHiLjRgMl_ffHpS(ZH5AEh5>Dc18qhCZAJoZZUow7EYKzuqxPrTY{Wdk
zv|~CEZT0|dB0ls@n@WFKn-w@ETotYcSBtC1-N&>e&yR3VzG?Fn?ltZ$?gMTbH-|^z
zad@tIZAv5B+yJyG4YV16X!F`nwMi#wl9rQJk_<_vBukR*|Fro(ZT=T)6NO^`M`|%>
zJ(P=lik3sSpgIuUzJ$iXJ_(4MAtfY^LgF{f2gH|Im`UWi6Elrm_h4p`YZ`tJgvRON
zHz8Mj{AT28fR9J6MnAFe5m=Z47G{8jPryPlun-3<YylRw0t-m|hu_9-0aDxNQwj5_
z#Q79!KDA>$wR1kT3#33qO?uBxlP12GAhhX)Alw}8GlWLCfW(Oi8}JYw;R6BUM06bh
zzV~Vgc97&Cbzp^;*o3GVxf3(|J%#uMJKDdG<lZCETNh}@gV~N~AF2ELj2Q+IB0Hjh
zKx}|Q?(AX5gy4*V6GXg&NHki^-UC7BevKTVN5n-$o<E#iu=~V5o05T4kkMWBZNH!q
z#IjBEaeqJi*AGEzU_SNodkP=RZvCgRF*vaSCvqfk(u52ZdK|L^8;A|U27f!bLw#)p
zL89#M^9q1cNVFPeiB{sdl#<MoCb>}@=v<a)F;Y~7Mxkg#m<x|n=E0yvaS-f{Cn)1l
zSQJYOjl$+S!49y>f)cTOi3Bl77d-4BA1DfnfI=aEh>85IWItf(1;(%fd^4YHua{?R
zJT1TNzJ!xV;0~cEtz6b(7R<ulfmxXIxfnDGjV7xhUEF7l-$;^D=~)3fybtD{?=1?4
z^eYy06SEqNC!<$8(a10#JHtsPtnrQt3JwosMugLNVIFoN2bsgs-!Cj8+>a&!i?K@x
z<i$3@zKn>dhybRPQ3NA0g5k{!M(&V?CE4{Dvha_3=io4ZiWAd2EK<te$N-iQ;-+cP
zU|5T$O;d;Iu1Ho3rp;&J#LNGYo^iuOc0G}dwXw5zq{+j}zGfxDje;YC{25ZlP9{<&
zPPWUn=-S2<BXtc^im8r?A#E8f`}G}({q!9<`7>gIef?n;N@n2*qi_(0g<1q5<peYf
zg@Q`<##ji>40Kf44@hi>b)+Ilq}5xgm1Yv+Ix}2eutLMtCl+IK;~nZJ*SfwpzS^wK
zd-CxK+4Zv_XL4rTpBh9bh)F#0<g|IbBm0`=_RhxDuD7UtuDvHSvuI^&nkS1VZWZwy
zFXn#s*|GIyH}2Q?XEXKiI-1Eo-wpa5X@mP)Z3l8jt3uPbelao--xNK(BY=Lpc3LFu
z%C99>m9SkH(|CGsyue*TOrXRT{uqL8$d%m}a+a-A!5!!3<G*#{p^f$M4EH*ddqXOp
zCI)Kgfi-5ryD>+bW{%sO(HGb$3G=@6SY9d7R~Tq)(476t=ZRkhc@J;aZp}$gwkckt
zQ{$_nk7obK@F^CG3mFj{ERGDRI1g43E09dPbKVG3_NnZmeT80+hl*n5^-|zN;<8v_
zSTI3AR(<M;qiH1J^{UyJ*~`kMcQh_9f}Pn7B(XNI6>O1fmTQt^6vSjkE~ip`8KJ6S
z-{(oyHzJH0xgnTcN{wVh_(l6Nqp0)K$eu+oiHKLKB2D16cuqu^I2;EGg|&t)Ve{{K
z7@eg1?FC|EV}JGn{)|7d%!JA8{w~9kV8VBk7|!1b62tbAlznu|km>a$Wb1pw1>9|i
z4~SMbihOAwtmQtGm9@l8YE99Eb!=tZF+;^MfuJyT4qp%b6(-J44_kB}JutQ3t6-?Y
zsyg}EySpW0c;biRoUxeG1wn4ymmYMAl!SCsAIkP0Y$Eqm>Q-ppHam+kOYS+RaD&dU
z9M3#pO`5LJyAdTs9@RKK6(ljVv}^CNb(HBZOvlXSx$2=)N0NCSXpRiW;C($?m}W}i
zGDWb-?yLNMi|=zEc<r_^<M1`IWOMjna?7@~X%34$IpIVXQ~scjT2%z)p({qADKA&N
zx~+Q9$Rz5PzqyxRL)@kPd-~VwzpJ@RnQOIaUYesJ6~CwFpbCPqQ3PWTe_#wnP>H+H
z8Moy$`$GfC{T^d+{}EWo!qQ(wFS@Ya&tJ+ZI4~Ts`U7OJCag}=Qm3nHutBB?>&$22
z#2x=1Ap6!F^G}-pOE4Y!6rox99CKZv!?ZQu`$nPZtgqlo)oHV~d!w)K9V?S_h^~6t
zv<NT2R}d<a^NXjAb8^3p{o~a8dESMw<Sel>ueq6%XI$cj6egY8$~OD<en`)_Hu}VD
zDpc?NvOSk?5^Aw$(&BfS$B22Go)fvd*=JwPZH;rYYa?oXiH9xV#3fsLw#VC*n}vC9
z61#YH@+jGH?0Tz>&OlVRxvj3ixeRWdrhN|2-CI|Trv(nfUT1AwPZ(7_lfCiFqE0K`
z6US}F9rAac>py-*ujAAhLD;-#rp&f5lgFxhT;M%aS8Cqj_F20#nU8|IVT87mfsQUK
z*{Tt~Gw8gOu(rZyv3%aL!tY@6LPo^>XF|m+q}68vZCLL9%dmR~4_;4C$rtw4QimPb
z)qGfl7X_xU@$ctH4W_ZPII@x&O+!aR8CHk2bksCp3QaS>o1*Tk2~&JDeF7*tesmw2
zuOF<TqpkiM05<XsHMErru0_>rsj3MIUboI7NWv~(0cZy!2+Tzgm}K&A0TA(J#FG&x
z_Jp-5YBUNR`2Yq8bX|ZzTNr`h0tBx7YY6-&wwXWU-RUBygFP!ypS?Zst|I~Cq{ov_
zcD6$fJiM(1twIXL-FaM1+usLpL%1R?8b~-vhBTbL);^PnG#DzW43UIJ6}pvF8g3~?
zlMH2utEi}ye%zEM&7>j~&60#d^Qht3$Gz^4_V;z4zn<5_A9tU1)?Ryk_C9;P*B)N(
zi4~UBg+T!$9#vv%A{;Clt&JKa26u%g-w8fdFxyWo-NEy1O>a=!8>@X6go~~zwGK$;
z#2$WH*V6x~;nmr=%Mbd>*t-T?J~WB8+BddwTewj=(q}_bTWl7FpIPiUl(gcUT~k%=
z+x645%-u37E<QC7FfUemHraJBFE?HLkF{Im-2J!Lq;K|mTim#NeebHUtf~<Iuv`iM
znq^PLB4Uyc9LJW0nzhDLZ?CA0Yn;5_W8UOm@ea+9*USI#xq3=1CStj1(mR>ZbJ5Pu
z2@Q3vIXdDhR{7V0eyN|8;ggxx=+Re_5u5+yWAxd5qF<I~r+Ox&Njb(oIptX){<-gB
zmq0heX2DqvrJ*f(Go=^%Do?()bi%aZs54e>do6_Z2Ud!Oj5K%}s|rk*ViMsw(bF~k
z(unTl(xU6Xs59}5N7GOKSz%$nN^3(vy37str!T6iGSA%`J-sh<ZfmXDzD&ywS%noV
za*T;B;VVP!RY_LHg8Fuz%9#fb%z1V`A6S<1uH71ahg1xzHcbC;DPMfOZO)_>ey2LF
zADACoo*l!D4X84c(0(7%6Xsjr)v)J$M{h~4d0TA#LDBNmD?`qSstG$Km#cJZN=g;J
zyR+4KG)#S!)B~w}y~Qt03gi!Eg)HaP2COS6&(_E)k6xVn%;(~1S7O_H`SFf*+YDl!
z?J-V|UQs;qaCM5i$k6ZMk7F5j+D~^VgvK9haJ$BK3r)Q<O+$I=hD@`HgYOk9)jOQ5
zA~^a*Q(ej8rw5N}1lx&MyOeZZ{Www1@N-zFe&71=uyXZ`Vflxwygk=0_O=Na1o``}
zO!L*6j0Afo`JEshmrZaUKEK3T|CB=Zfmt&(JqJ4Wyk37UZ@KJ@x#}UAU8WXE7gBay
zK9pKpFZB$k?+`9syrs3?MbksAdH&SviA$zVXcAD9dgDLKq<wx!hw#0$9Vw5^r^`yY
z>pE|-S@oaz%yCJhg=kvkboLWgA#3A{(N@_j31*D=V)b9IU+kK2`clp>=f$>aeto^`
zapEN7?uwR01K#S&t@b+(hvIbZ?8F#<H)#_uAMw2=1g~-4aJBQK$g*2r{V5$H-VU3S
zR@5tD<?0*yL6)vknUCM~`#ml^-FZ8CrKtat3ptZ(=3hJK<vD*^kcN(#Z|^a~FM50R
zlj{5>WE)EggdcYA`1I%Qt}niCIv%tY)TeDcyY1B-n<wJO&elb4do)|IHnc=?B>tI$
zjC8_&-#w1clMZjHam$|-8L+=9Lhr!aEekxfkWp=3+xzHkPmE4?9VJSRC9fTBDc>C$
zZ+-t1QDN8k(c4GXxj))ds%a#+A?x0N%!8KmPEJE*for~=S(~F!@Gjx%h{L^`rlzfS
zq2VD~8@%gUSpsWPXZRW1)H!_N)_lM8H-@r;PWA_rH*w7C-#Xv(Tb3v9?_(V+mSEu)
zRywG3KirJ$EWfN`l@t}-EKd|9Z&BikY}_YmnGv<syJauB-@j~eq5Z^NWr3l(6#~Q3
zj=c^Y4qvrY$tz0MV$LGR#0=_{VqZw5aH20VHA0NGzgRmw^vR)Y^9%*-p0GVB^5l)A
z1cid!HA`>Vd7qMgzT~T8SnA|XvnfT68bc_va#~FO&bK!?eG7;)Gc)WQM}r5;3#wj-
zCD#Q7y*Jt@SgkA=zAfMS_uQb6v^lk%IyMC_T8}UE624S=YgcAQ<*ip!4|Ne%hwAl=
zBv<F;Y`nvZJT@y`rb)x5w^XTqalNhF@sdDAXXdnL9_`0JDP~SOQ9eF1Z)ApB&hE$6
z6C{&PipS+NbA(JP_Br&nP3s#KN!M=n4(VBq*WMJ)RNmg?wEeYTwegJq^hy^SW+!Qg
zr5{{VW@(u171^)OUC3Ls%Q`qCxBa=d<}v**EAH>N(5SFL1reJTS~ok|o-?^o{i1GJ
zy_Ue~+b-28d4r4N^WJTH1LE{lTKu0(j1TD%GuSMYI8YX?GINT9%#dZvw#(({`o0($
zqcD>P)xqUKI+uiaHsfZz9(G&ca=^@BL;gF-kN$`B4mI_Ky$Z{gc&arcjP#K$p3vVU
zGVwGTtm!xvGEk=Yto_ndo5U?&T+*G(%(sOkuX!);E5>$py<_n5ey4oiK$F@jt;zQ*
zqdr{|QJnGK=|+ODU8w8jq6{y~kVM-N6p(6XaJv6${rliqye{+Zk%Nq(`OaPPH#xc!
zvQAwxdUM5b+!oDYo?Dm8w6xK;U!F7utejPJuSiZdF>tqHmhz6C(S-03Bf+}h#OC74
z;{^+h<EnaM(w6k@vI{#Q6r*_|rBu#eRq3^h)zcBd3ZJ2Y75kM$teyuX>6&~sHoUU4
z=g#f4WVXhq6t2weWZSY-SGgN2w;mZh{CQbJi*E7P{HvnJs;;grUwcu!b9s2P>7e53
zqhie!%8`*M_}-$mJx5kW6|Q+wD(0kD*{fPTdBWR`YS&6S)@{Bs@!8#=)Qo-2N-Kty
zX6Ovs&nZSHqoRu#FNHR=xK^n#`f8UtMh6_*@k>DEmzuq?;b|k=dL9?wQO%K?d3ElK
zBMm1DwMM=iUM}f5PP<0WqiE*CR})|EjoLaT`j|9}mpuKdujWj-mx*ruMEqb_yV;_=
z#35_7qb_l`w9~JtUR+`o=BORn=~4R5<74TaoqL6a36_kt6{oQCY2L3#7wEAcUKXEP
zo92>IzVz;wn{PtBxF5a3@@yWRksNg{eJC;Sqv(yDg$Xjtimq&1s1&tK|LKZL3O)7<
z9Ofz3xqed3>GX|Jby|9@P0Qc@C_7fYe!y~<?_S%z{>Pjg7Y^9>@b3LK;+vA*{aoXZ
z;}R{pCan`=w)nO+_o=k%$j8?hl`5>UiM*fnz~vnN67CV>`kV8?H&I=oc0uZXO1%45
zB;-^g#eO^a{$ZqSY{`O}Dn=c#c>m5ndh#QV&3?BiShIGw*q%P*(yytGwK}G0-?{SD
z_=>bskohg%U`*^JPWd_Q;!=Hm)$B@aAuOog)Z2F_)IH-+r|}#fr$hdDN^h}y3X^=1
zy}<J3#I?4n;xp6kIuOT+{$X`lA&=P`#I+)Kgzn7jt}6+t4^Dg#s`_QE=dft)`iPWA
zB`KSy3Z4-Xo<6XC`tk|Zwo}$=ZWOdV*?4{95;5QN$E9vFmdk6)@Bg?~B-U<^i7F#U
z5iJiKOr0qrJ8jbE-i*0QsiWd7H%%Amvb(mWQw(kW!koQ}?X?c{X1#G=vijG=2CszN
z5CeVP&qr&WqnWwa<IA{bw|hpd>Dgm-)W0n$UnMiiT=Dgv@^U?Eb~OffHorJ4($iOc
zN7mN-b4}3F+0GL3mhl^6>wowC{9Ytz^WIQ{1x*!kF84lvxEwHen`04NT)3idiCcI$
zXX_s`Hg;cbvN|F!^GsZNq1dM_nd2^0|M{}6q7LiEJu7d&sMy*NYbC7DkiYk#Vz_14
ztLk{oVS__QZz{gpk)^B~hx+33%p`S{Qd9$m=fy14;uty=MN}V%(-#{!J+IWVxl?sV
zYt4+SJ%y2r-q~w=Z_X<*Y;{O{W*jcEPt^Hi5HGt-|3lu!S&P~W?`IwoY44hIYLe2&
z@zWISH#G*G6A|ug>|$}N7uz_mi@sg<(X!fJ?$uhyrUH!#C#M%l4fr2E(O|gM<x>8>
zwzg+Vl^JKvFH8L~l!5iCR@r#{in9h9Qk;BT<HU7hVu!kBC0JZl(3_awqkU9L=KjiK
zB{wg;{~B-WV0ra}W~x<8?bgym56@d|mRC;vTAh-yaQW7KZ!H?^Q{-N5Jd>&+Ek%ai
zkMGQnm)otKw)&HHt*~2nmSeob`dvLPd)~ZJIUeWyK{$L_&HRMCxYdm>LKeBOR$Uvt
zx&A`EZh6lWhf8~FE5{Fasz`2Ges;Kd>cfO>6`lSUZ~IiZA@!1#N8@bk_gvHr5YMb`
zjIW#hr*Z47?dQ(j;T+GKXjJHM@KD>lLPvMobki$lFCTgMnp<>F-CpJ6vHxb;T&IVP
z1*cC2wxGRF{_{Y-{J4s^!kV?pqf;tX_09+hnjSW>v$r+ff9Tudkpq@Of+U;EU^2cf
z8Zej!3=WHF$Y5Xt2I6q31w<9GSZr#EMHn#HYz}{kO;tmPT4Hm*f6L+5V>0=5|E|yB
zf5+rE#-&>Os|_yad%HYpiNz%i8HS7>`@;KgU#RU<AO8I>e{26<<KMrf^8B+tmFS=4
zpV#_pD?g<Bp&q~V```ce2JZW}yg#Jn@K^k{=U1;**ydzos-UlHYNnuf(Aktha+nG`
z4HyK2WKcU=LHEcJd+Gy|p{uazu-Op?=I<M7S=C@QQ$<yo$yQb-nd&TpOLCPtEK-?J
z<*2E#IGAJ;O8<KeY6I#At?f;171XWw+ncJI8Xqw+RnRm&Y;JGC;E_m4Nl5Vf@5V8V
z^~&_T*mjA6H>1bYKCR2kUhl9eahjrz{;vkAADPN|f3B1L^k|0lbXi$pB2m;jaZX~Q
z&L_{KDKks_oJy`@yXK-RuitNpeBnrvH#=(MbT2(yt|_yvLNvx)%yM_QdH7Lg^m8-o
z@XsICr(RzFHOhNNg}3}1LBpB|Y$j@H_~~WD^K`xNzO6-a-SYx>d1s_E)1M!i+blBJ
z_b4~}_vX2}b1MaoUfeym>GKKko!bLm*Atw#qP3^52#&r+866&3STs&XSwu8D+rl$S
zU%+p_j%=XxUEz@27y6Tn@5+YcbXo+8+?~B6w@_as_WGn9IpX>YtoCc&S6vwwV0m9%
zs5(NZ`o@A)*QH)&IqQqH8c($L4Y*6Nv!W`eU%4?S=}}0fpn>sxYroHT*El>1temQN
z13P5JfAEaIrN*%@IQ+q*E^$h4?%vAzU-ub(^0`*ID4m*9sV?ePH|kb#pZ;jw^z3~S
zUXt9oJ`GzXPKI+Oa%=<wb9Lqh-q7(ewvh<T>)?);V$NQ6KzK{`6YiwwM$$k3BlS;=
zHpqqr*xAUJ<P0Q_Gq#bG&(2F02yH}r0$18d$7k#B39zz}i_hsv9?v--A?nX~d;7rZ
z?f%h)j<qLiew*?o*YM)+slUzqlINPdXimqtg}F;sdp&WS8`Pk)`eZ?eKvct~)!xw^
z)32$mi8>&DEvL(Iyh5XNn*YvrnW(p{v_MY#()#Qg$0==%Ytn*dwo95dNEPK4^b7pa
zh+P5|+NI6jtaJ%D(k_>g)7L-#UWb%hw#1OXQTs}_oVNan?u`mV0q*Sz%?B2W3+~#g
zGVC{^HoDj)IO~H^w?ekqJb(D(FGeQJL9sV})73{K+-GGx-sh!&<_`O{t4+S~&&O}y
z@|*gr^0$^F+)Fe0Ef0$Q`$zt|{fke0Z~2D}eeV_Z?f0hsuB?;B?}p{~Zj@&7x66+}
z^-Ai8M1Oz$;oBRI{7#ntH_`W&$8_?aL?4W(ycd59Nc}uZ?y0=1$-sk+!{@Wh3GahF
zUAJRee}8hJ(7ogFig0kqceFJ%6B1-nV+=w4=kGt5$6&Kqn8o0K_4glw8Xft67>EBU
zBM^&T#-&E4G4)tn+PXZFMUJh<V=?LVSZr!w{JA|I2T?SPDdVxnmJv(>k8O`&V$Rt9
z5lj}wW7;E_92PaI{<IB(V3K3|OmMlh{t;XrmsXF*B<N)Xn-c5iZSW8|HZQ>=xwQWA
zIJCS7^JvG2YK=WMFCq!Prax^Lk%%|8e~2WBv3)@#3ys~Uh?+Hw&4H*c)7Rw@w7#&I
z^lh*R=GbG2SOkyO9t)GS94wM{JP?aT(znZ|?PtW|@Wy7Nb`n*Ob}mru(aR7}hJi8?
zC}RO-Y@mz-lyQMF;QVC)Iaojr7LbDl<e<)9+CE?bIjC6~y&jN*I^XE^fE?8ErPZT;
z<Mc8h2OG%226C{09Mri=-v*F_nnKg-0Xe7%AFUoWHqpy~92_7A2gtzza&Ukg93TfZ
zM$r2L<lq81C<AEga)BJwxJIuB<lq81s4<7WE|7x@<lq81sPTo?9uLUD19I?y9MpJ9
zZx6`919DK~5Pe-B2aleEpoAW~E+rvQM$bX%2-KtJpu_~~(Q{CW0`=%Q_~SjT&y)-F
zG9U*5<RE|?l)v=$fE)yngL0L=E|7x&av&fF0&-ASoV0BqAO`|+Q1dYQx<C%fZhAc+
z2Q|N<)uTM8mjO92kb|0E(bolXP%}<?Js<})ucOxkascyK0`QywJSRvX2MOe$=7hAq
zkU$O+$Uy=*s5uS2J>Yl%o)duQ1mHOVcuoMG6M*Lg;5oqpasZwaEFcHqIRSW1uz}+N
zcuoMG6M*OZxi@V;Q+I&qWx(;E=H9e=fae6@Il%#P0G<<o=LFz60eDUTp7ZDY^gaVQ
z0M7})a{}<306eGeLeaMicuoMG6V!Dht<Qkx)D;)K9^g3vcuoMG6M*N`RT!`h;5-LB
zCjie09&kJW&k^7`0z5}d`tJ$>o+Bnb2Le1tO#1nO0M8MVo&y1%BfxV+T@Q{uz6kIf
z0iGkkb41XO2Le1tfaiz+asZwqz;gt6jsVXQ;5h<3M}X%D0mlRI93dbF;5h<3M;JIB
zfaeJC9AV&h1$d4ykOS}>QFpazzYV~1{{0eKJ-~D74hK+=?m2bW1E>f5t^m&w;5l_C
z1Za;wULnA9>aGWUUBGh$c#Z(isXHM+d%*DkJV$`%2=E*Mo+H3>1bB`B&k^7`0z5~6
z=Lql|0oUgU@Eif2BfxV6c#Z(i5#TujJV$`%2=JW$REhSx=K#Mez;gt6jsVXQ;5h<3
zM}X%D@Eif2BfxV6c#Z(i5#TujJV$`%{QI4>{l^2&55RK-c#Z(i5#TunJja0N81Nhe
zo@2ms40w(K&oSUR20X`r=NRxD1D<2Ra}0Qn0nahuIR-q(fae(S90Q(Xz;g_Ejsedx
z;5i07$AIS;@Eil4W59C^c#Z+jG2l4{Jja0N81Nheo@2ms40w(K&oSUR20X`r=NRxD
z1J~ym@Eilz=hV}2+IWBg&-u?B#*Q19q>sB8@Eil4W59C^c#Z+jG2l4{uFo;xIR-q(
zfae(S90Q(Xz;g_Ejsedx;5i07$AIS;@Eil4W59F%vq0Lh1U$!p=NRxD1D<2Ra}0Qn
z0nahuIR-q(fae(SociA-(DMSGW59C^c#Z+jG2l4{Jja0N)B{X<{{YW1;5i07$AIS;
z@Eil4W59C^cuoSIlYr+W;5i9+P6D2jfalbMUE01P0nbUma}w~J|EzcHxj+J*lYr+W
z;5i9+P6D2jfalZ$YM?LlahC)<Cjrk%z;hDtoCG{40nbUma}w~J1Ux4J&#8y+^t^!Q
zB;Yv-cuoSIlYr+W;5i9+P6D2jfafIOISF`9y;wlc3wTZfo|Ay*B;Yv-cuoSIlYr+W
z;5i9+P6D1&FG$d{0iN^Uf1#akfafIOISF`90-lqA=lu60XxjxmCjrl?w;{%?OU>>n
zyMKO{gqpQe=f_WF1a;@18vn+Wv8gv|Xl4BK_vd<SCU@+6BLthtqxSundIbMI@IPht
zw)+oSo7xHqQtwm#{T3zhfBx#^UoS`Qx3&N4wJPeN1vQTr5|ox!(^eM}{Qj;LQ%Lat
jrFRNjZA~2*e|75H>!o`3hp3PI{WiAa9O_>A_m2M$*Foxg

literal 0
HcmV?d00001

diff --git a/docs/diagrams/mmpb_algorithm_color.pdf b/docs/diagrams/mmpb_algorithm_color.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..31bc12812cbf47a42ad89f10847f5b8eca6cb95b
GIT binary patch
literal 162478
zcmd431y~%*);0<R4;}~(gS!nbf#B{C+!@^60>KFcC%6-W21sxXZb5^)21tUta~raA
z_TJx-|2tQn=kg5G)!nOVty*u%>gu<KN=ZzDiH(^P6`86U_(Mfz1F?eajjd3T1q4{s
zJRKk`qDC%8HumN$N=D`oXAl^;t->N_WM@tbv13wKr3103x;VL-xTrZnAS(9uE+8=P
zgNVG5leM&+nLSX<eG*~{0>mJKz#l3y#Lo0V#eHFa)AgW(MbzHa&IQEAB5P^ttOH`d
zFHR5i*L?Sv{wW_|pc4eh_sc>Rh_k(`lL^Eb&@W+c=W>5w193i>CIJR<|4Oifei<vl
z0b;waxQd`4i>9Kn6~yE|gS0JBWj1aQ@VT$_!zq{(1bqHH<pzNt?gAN=j9gqGPIf>>
zB{nV)@VS>#0<(jF&#xp02+Z+Ek_!ZW5V+6hWN)GhanWG`pkPshc)0x1B=YMZ`s*Nl
z?*SHhh^eKKh`omnPzo!Eor{|p4C3VCVdl{T0ba9n0qXAz`lUkB$==lgu;l^6hZ7c6
z7Bwd$J7<S`Kbd&4h^n%PLEJ1&AS#j~K-p?W#?FAqFEoGo(iz11SGV66{M+pkmNr0T
zStM+XTp(f)6MIt#iyXwx+{FUK!^X;SpUc_B31Vc6itLfTqo-32;dS&CQn?Gg@klK^
zV_a&c41j&~8Ve37F7|L_6`GhJ#Aq1mSs(_>#ZFPVkJjlS<f1A>N;yDm*wOaNdFG2c
z^W3+?ol1jw-}8=P-`k7N*T*$CTavX|*T+Y5SxW-8wL2}gX)#XD+>iym#WyGJeplx!
zgJXWT%cp~BG>%t#1LigIdt^x~HbV|AOSj8|V=PMo3%;jE!}hhgB$z)h`f%+t1+Qov
zkIv+mTF#k3NPE<FjdfQGi$NpqRCWeFkzRG5{Ctm&Z!dn<er|YqP;Q^;E7A2Ns9x4U
z^t{mw(Nlw8w8`)GOtmdR!zMv`%F|wa@jWz{%dl}z#-Dv<eBm<9VQyiY`@?Qyljlp^
zt5M9=)2V5g;)9Yo_1L>DwG!j%`6CrucNTk<!p0bGrd{pb^XSU@{t|&^J{P;xIm#RB
zw{1(uE6N3v?zT@dm*L0N?eJjy#3#l|n^WaWwFvxhy<YT<3CN<)Q#<^KZDzC#TT&>O
zKRjO6+pV7V%HNIR)7OcMnw#3<CD0is4w0y-dv;0d`w9#BL+&<({?^<UR<PhOl!#5o
zDI-i6^sBGqX*k{ReZ+0O(mL0CZwOkYRMgdbu)JqcjgIS-)vWNl-_*sPH4`xUH)Foh
zvmGZg<8TQM413GkxrhPbaH$`mmm;Yb{tOXRd!@uL3frCs(oAK7A$uwv#3Fn|Hs))k
zV6l|fYeeoZb5m1rOqeqQ`*CcV-JKLZFXuCMM8QPfi+%|Xayi;+l(8_0)eve^jdqLM
zVZ`^Ck-5rN#J~YjG%i~Xu5zKvffbO9l0f>121o{xNwccs1~!w)P~eNFB?=R?NtH^;
z6~oCa+ro~~o+wAWXSL!8k~Tg~B{a52VJB$M83)S|w5L1<gG`2Hm`M<$0}S_=G0?gn
zLFYzUVy0XdvE{f%lZQqH`4^T*&~Jxe1sBfU*nJCrQ!GJGK0x+EmNin1((Y5CWOC8{
zrErNN*JKIC`#VTx)Op32Z}OCt$CC^R+QL{v4aJgE5`IW9hjTFna;U^=_T`c&zXul+
zb_rUI0{#mkLz&2i_Fj{S3~>PJ9Upns2`BAV@#w?1RsVfj#HY@@Prz&bP|CH`riTUH
zL6gtIiTJSKbIan@_H)A-kwdAln4U-0X(1I+PEc?um#>w9-L61LQ6T+%c}QfwdWz80
zq08GM9nrX=3i|iek3W<`nya3wEHyXclT~yVB~5C6p}6{Twdr@>Gpe03P|zR{t8Cn0
zI4Lc6mtO2uj<~I%c6ypNH=2eOGB<Tie5$9Mq)bCOv$sQ;{NdW`tHl8BY-1zW_q^#w
z+QfpwD{=#OTd#VEaEHMN%;*TzJJ%!6FRN*9nI20&GJq0U5}yw~_u0p>2rib-aGkxA
z$UHB&tv;!1eLjb;+=qA(k2b{^<SHDUl8himf3B{obYevrtL_?kBmux6rFcZ!5IhNO
zNwL|(&?L1-W~<)D%W@fxTQW+DvwC6eb~Prw&6mx8q(83W&gaTqZsng6PI=nt5+n8E
zNJ@1|ggiJ!`dvl`b#)>oF)Rfx<$}CpK1UN$!UdfFi?OL|61T85e7i=5=E_v1sG#{u
zWHbgPaktoyub<bvI$JR}>BnAaQrdXV%JI?G#`-BNmQOIMP#fKcK5R2MJXKn~J0I+7
zNP7~Fr7Lc6X&9o+0{NJfQ9M@_-FtobS!PS?OTMQ!o-Y2aRjEh;@b#`mkHgir0;G5v
zvO!9?h_WV}oXRl-T)A8RW99YPOix-$sokFoXQPEN_E7gc9prDd)A*r}XZH>h-kU*q
zq1@Wbf5t3}Av8T{AWObkq~ZhZrkLiinu3v#z_4p)V^v1R_mcUA78%vYZlChkKFOFz
z1V7bnsIc1bY&b7wvmyxWM*h^oZw|+5k<iR#@#ByIK5`;*#ox?Q?iM$sht;6=<o#Tu
z>GXwd)|p*=K|~gJ_akd<B%3*eNko6@0^_NdiTM$k^7@}e;RfonHovdZd}HDcy)X^;
zi!F%woaH6C@7rDFeyP<E3P0!Yqs=#TBQt7lzLQ2CAr^#c|LrJr1k(R$LoTR^MjwOY
z#Weqxl(KKpGp}dwl^Sj@XuZZu#MK8$&X|z{rfNGs&Dee9Jf3>K)L?z2?OBS>dDD}T
z#I%=nS7EAevX+Fw7}dGKhr7uoGUZwDqg8KiO2OpVQfu1P`jm5~ilD%)YFr-fuFBNF
zyMr{l)NwHk?#k;iLH1VCWXmpxh*yMqUUyrVk37XXNwj3r=gw*)JCBrnD%5=qf(?>y
zTcS-?8KR{6sSPcjj|VIz+vDA0I3U}n$+3^SUdSyS4e0e%PF0pr{!A>5-gRs1-_}kt
zNYT^#*keJIC9ZRsw2n|Pq6wZJ@69&;NrIHYHaNNS*e`R;@>}-1z<E^lTOQfk@Ee&?
zOVPv}%90HDrW=ka1Uws*9M!C7&J2HBpML%uj$0c7cTQsbTA6+Pn-Y!%_$L}|?<XHo
zIj&D=GOKxR`b??HDH`wv63bCllAQxnWXMFNO4;F1Nz&R!@o@h{>yv4|?Jmcbvd6+-
zSH2$9WvLx-41X0@?8l)&XB7)@2+pN(w$235#3tqPspY*Cki}ilAF@l7ClQYypY$UB
zFzCGR4-?dyO8zEdiDl4v(&XxW;g!|%`jQQcF!%#<x9P_6<gH%qx%%v9F6}B5hR}1W
zKz}_qQQaiwPanNYz#W}J(P0k$IYU8ZqQJma!(S3tuoO_DZEZNV#lbN_c5#xHPW~v7
z1DDZYi;O@qdC$^@H%rB|RfU@BXc(OuFU47@-#Ep;*~|4EHW#nx_e*noCuv7>p$6S2
zk&?|xoJOy9>!~B!y;u4256dL%YriC2?SGsi;*u=8(utcH)+6|WL}BZKbxcn*63Lg=
z0{7w#jE)o7hFT&Zq+2}v_AN5W3#qV#?2zp!;!*OIHrUE<+X*TJLb7tiyyC04kEcGz
zEe=sR(lUNY-C~csrM1d^Ix;1F(Rcio?;u@u{+(m!UeXwq!Y8B3(KFQzb*}1_M;ewy
zTzpr?Uh7m4CG!mA4PRaz;}pa)CV|n7cq;mWk1F3XrhN6~T@wS58$EcuL7s>)vNf0r
z@tnvFSBa$uu{yN6c$U?w-!hI~*1y79G<hHQVt>kKkS=1Sx^X7DBQD3bY7Dt$h|gTd
zAXS2&NZ2zDYonioye;>{?5(?ga|-@d)qeaqE+sJ*;n4n4c1ot01HZ2I+Kh|tBR%Y`
z2DwZP;VVxOwZ!Lz2D;YqcXM;@r&<e(GFR7!rx{+ioHr#|pM9(|g-{xLJB+H^8-L`Q
zJR3jgk7VlRFbvUCI~`;-ueFbQgk{hAnZTI%j>PqG4Z=d<?iaZQt1oZAUWj7E;Q1h4
z(2=gTHlAY`&vn}z(7A~j>37oXs|4s)IGv|y5`5o3o;)t^wmUq4P%3=ImFS{0UlP?k
z<rgs-bJ(OPI6AvHU-{sDyPzXqnc#Jsczd-FT6=o2R=b9FV6$po@r9&_Xx*fe$aUCE
zgU>LzatYdN1bX}~D9G1BV-FAM^Ib^8%IRr_lK;LxJlx$uPwn2oQO{>o7P^WwZ|IjZ
z#HwB4kjwX;8so+$Cull_2W!zfbRA7MWqp}uP1C`CEz9zTq}NI0!W}g}K~-DtME$q+
zMIXOBJ?h@j>0GW-fOH;QP0vZ%nD23}5V>Jc;%^Z;Cn45-W<8A-@-B}>B{Ec@m@*t%
zw8p1)euAW$+^t6{Nyo`8%W96m)rzO;_yo~8x$+UA4Q>c;ot8f@e+49YK%R2IhybBR
zKD|yWz|x2HiQFnR<_2pd`o@Np-J~G_=>V*=!`f3IA*btBRIlYf!*cRwErR>WGxom8
z&v>6W>8U+}ap~d9X5K5&Ww_8P72=)y%!+h@m>%QA`(_m;ceiD!L%YP&s=tA(JC<{y
z-7<F6SgaGsQAi+3B+zeQZ7@od8q?b)8V(tE7PqR;!?qIvml!+BR?Xnvr=VcHp;aHC
z0;FSB!qcddyWjG*g{gGuwp9t<#;nApaVB@C{kqc5u{fZ_#>vgVYSop1d8^&!QhD-o
z{27D9Wjqdrx@|Rw<&riVj~c@Vy!we3Z^oT*#^w>FCgJhFRO5*D#_QX5(xxKS$6B*S
z;!!2vn05^qX{T8UIab1s&8oG{($>8W!k2TV0;M3q0U8EPV+>E@ao9sG?{gsl4j#;Z
z1zVX_n#?O_ty-T7m^Jy{$=q%pM$SZ4oZ<bl8JwB#29axz*_!Q6eYhoH+3_<Ris`8w
zCAN1sK`gE^vdiSvahO;s$V`<Nx-!_j)Ls(YpRiA$1^C2^K!`f>c1<;p6F`0SMG-tZ
z$l@92uY(?^Pu9#TUYD*^b*qmc!3p_0_>XC7D5pO5O>{{0BwI1{C0~Il(0d>DM97+F
zhfjR`nz9{E!P=-kHA_5wveU`jcOE9n)~2-32^q-InLpHa5!f=xyI^y}2g!4`AglVW
zvPml_N~<OpiV-nxU0}=L8nxN+N@R1L=t+hNVM_Ry^uYwao{+#0>7^Qm#ER#A@_L<?
zNsK=_6+yayQIo!_=BNeLHSH<|%b`dbJdyP{xO8wrvOI%Cu?})Y&diRwjdqwczLO5K
z!4ZWQuY(xUC)b@gg(7%941OEBuER<1-U??Z3do{Lf2(J-@+~}LtZ%ZsvX+d|L8G9`
zImo?QYYa;ywtuKC)(W!O%31o!**NO;kYk{p(Z`b6A?VXmdxW`mNmX1&_^Wm}p;*?Q
zgTTm@fGp1RwjqZ;z{X{inh;r_>hRtjz-<SA_Gu#KbFB{iZ8Xw*R}C3@*I~Q`q;DPi
zqIKHvZ7`Cq8COcTF-T)8Hry1_KIm*N&XI*gSlVv#IDc{`5TR9>;e9uv;1(5Mo$<t3
z&Sgk&nmWW9xQ%i8&i<@T@L+UYJiSfAu}zT@(Crrf7^phDP8ZcZa_rL#%okei_%=^*
zuf^_l*VZkYqncOf`RvCdsuI<so<=-Y25hszOS<<=Ta;tpFqXnMl;>HRZN$AFucmmO
zFR6F?s$2G|8a>vqVsd=xRM^)&bGg6qYSY0)2mQwK+?sSe)8nac5Z`${`cH+5w8)$3
zP+Py<j?3c56_do-r+zzuyPV|}W;9K1zPs%kW9Pl{B+`2Na|T!0)uLuXRS3JY*}ip;
z{6tLA>jvI-Y1sXca{OLG_ogi<lHV1~TUNRdhsK@dc%Dv-A4(?hp&hQ0eh+R<T8OA@
z#ERXi3x|(ipvP&J$B$t(UqLsgEqkFp$rZ&A4{^>niT}u6ZJH}Chm-dNOJrh{_BE^~
zVZX3&{x$t>0*@Pt0v+K(-&~fhmWl?>0h+UxGGcrDr_j3G*zeeoggK7W;KX1Si8b=w
zLJ?fGTv<ml9wO`~F|+MB3iBWNRP=)pp<<U!nkoXI><unM(pC~)V?sHF%T5Vx=CZ0p
zjobAX2OOQ`kvpISwQ}VXm0<_J2p<0;RWtY&5%Udm&#ARBYRt2rfj<*lV#sPc+DZpJ
z$%I28Zv`osrfbh396c~c)I0}pF!Jy8dxmNI6E+w7U`b8<v0;gcVD#!f%8dzGn*0=(
z$#J<kQXVe?SC@RE@4z}OM5KK6+P0%TMH@$@W7xx!Lei!wOWDJ(2ff~fDKw8POSd(t
z>>Fa+&|UuUMOWwt77_ZEcL6aNWN3DT7Ik$2Ua^f!&Y^Vth<xeLEGB5^&TiI2c49=z
z)5BJtspj813bfKD3K;sM+R>tqjv2P#d#fFmT8D6lSuo6nFZFtJKFmu%%d_g*;J+sX
zaqC{Qy$`~y<Khka4hR-agXOrNGhch3VN=kyO4TTaur5H=bK&TOe3$4&WgDvf6!Ds$
z`;9514oyy;@(@c$m4XPS*&2@~9Evk{(Dx9}T}~@Bc7iUQR0;BdjcAZEaq=iFT34cx
z)#t3hw-^>mJ1G9^d2$l$belsu5~h`wEt4;}gVRoVe3y*9uwT|i!W8qGlD~T%<QV2q
zfbrz94rIDX-$A$gBi;2efdY#Fvje4l>{}jk_!qlt$K>np-M(uqAB(KRgi^N6Vuj+3
zueD8g!#tbK|Hk0pCG3<!8jSR*6*_QjWxC-Odk3q$^}DziJeqXJi^#0jWsCycAN5OR
zyh9cOqHe_pl^>r8bmB*DD%lXzL~aJ4JBv}&yxuQF)QOTHim*}VSarxc87E5VIP4^%
zQy7*yb|qxcc%BtrA@iu_oA3}hzojuDBloJqM@V{i2F(glZkEG)^0~UBgq^SDt(Kx2
z@adw*6hpeEgzlNSddkOB4zt<yv004dnC8r%BKZ|(m=BQCxelFMdBucQ<iu-SH}Pw%
zqu2?CL$@T;>w9Z<1;iVV`4rqX>CVG)qYXRXZjlDjUP{J_9%C98V%>B~JiSR+lR18j
zo-?!~OZ57H$;G$L(93CZcyceQTNJT&qfd_QqBjBw`~y+&(NC<pUPmW0P#=8}K^wFz
zTp95X7bmy%<P+&((Xhmm!r^+C)vh2WiJ<M;%1ttCM-gBc^&%1B6nf3t^W7710+@D=
z1%lFxfO&{lVOW%Ql@b_8{}AZ<?)6)M-WO~XHFktie6H0AWXm{rWZ-#*p7R8l1x|cT
z37@<f#+NGal@VYSG?juRDd5fXI-q}dn%j={q6%dGI7R{Qx~PT9y_!Q%TXLfu9g3*n
z$)d@!yOYJ$gy~J>3-h&y_-XVSW;|lp!e;w0hcI?gMMowgVqhBq*OO^Um_<B?B$d*$
zXFdf!GQoqxhnsm8Zz~|;UJTZwJ`xE$@3CK=j%UXZ*v3t=O}Yt?)P}lMOPZb#uRAXK
zic5b__|{5dKS9)Jx0cmdF_VWJ--szXeK7oT`dwVhR7YV+oxrtr0J5;n>Np7!pHLgd
z>QZtH7@Y5b1+%o6;0yaL0@2w2Rl)-Aanw^undNuis7%ZJxN;fkUur!uHlO`OPp<j)
zh<{P%weBW1x0MsYw{k^hcVBbVJs7?tOJT`+RSY$wj?6Xjcbm@!-a{7aeGgn)@h6ZH
z0>fZ$&D5iw(eX*b+a?p4IF?D(d^oXA-!dE8%2afI_l@Y(^hEPg*UD&;R`<|rlr3m}
zL_O?zb41mQ{*O!tdfvW?q_s<_ZPrnWpch2%O1@&pqulV#x5t6nN6nYsrj2WuZ%riy
zGCp=YBorM6ALlsxK-y+4;hznS1P|@{Rjf#R4)B?#=b&diD)sia%X+SOFOw{lX;m<?
zhVpW28$B3Lt+FK(vL(7{xYgJhFZN_hIz-R~Ekw3*kZb4XecJcJe6unR|CmRVly8q}
zwbPyJ+zkJ6M6%ZzZaD1WvX?K4Yy`Y^rjA^f7A_V&PRTH<d5YIb@dqz=kFIrTeJ)$t
zc7{8`r>7W2Di9hS4D1hj0t}D8>@nJIx2&AYhsPL;b{d;dV3GKP24a`mabQRB9eK&$
zaeR;#W7fMJ*id2aN0&}e|DL;Raa5Q~AIV-i;3gIPBGQ&boyW2!Q<`;i8TpApT}S3=
zCgOIyFzHihSl3vm#n9{ne?w2OR{|^(1d*2|Hcl`oc8a$vza-09Sy>Ivx~y2nc_BK^
zu)esJ3*Y}uS0wW5wp20O!~XEq+EiXvTJ=f10NJN+9f;7;?IZiC;$>{D#EBeXKnSav
zEZJo;$5+%vr<$`dV*6<0a@V0Lm+Hx?IN?tZlGNXTPf98Z=q;v88mjC|@E|`lR({Z1
zEb~fTRD~P8_r+-`gOMC=*H>d37%w*#st!1gXte2~Rjm!*CP~Ai?Am0}Ve20+7fCOU
zq4fGyT8t}>M})Gz`hAX7BKa!4`B+aRDqJ6Sp&exuFa_pwLNZ*5axx-McL5j`vkdEs
z*Pm&Oqxo@kf3)|du1Ph-f{2Ay-e8l&3aNlSK1l?KlrBrgQh#`*$31<t9iH<MvvVke
ztPAop%UK#c9+7Ye5kKx*{h5GZ^eD~WvTd`iNS=%{)mC9K^O@f>E1FypZ#N{md04-L
z7tSDQ@ukxY2lYM0_JfX>&0;!eJ;U8mPTR!%UX^)Q37{)m4bGa<ANs<cp+(x**DKWH
z(H{v<^Mb=2^CycHE$9VP)Q;YhcwQLQ)9(f2O_#6?l|cDWJoan}d(U{Da!$V|ev@zV
z19M)}i1ck@a8Cqd7Do%l)>%;H(-wkqbVIg`#V^~Qm8U;C5}QjIbP4oEU<(4u3HA7e
zJTH-$7A|rYEmfnNfVBkhUf0w~Ql2$>aa8tM$<|=6>5o#}34<1kfUeh#mX)H8z-;<K
zHI5bR#u<><Sn@%YlsrBWST^}G#mpDY4Ohc%)HJ=0Kvg9j>j5UyC+N!4CScyuL<>Kr
zDZkyvH@?LDy4%}tB1YIzuuUg!U9MJjK4B8UnbdhD|EaX+$eUDLTx>ex`kGQ_l{>$V
zH(vdHlkd=%{UV`EIGH>jCLjJ6$+SIt*?P$gXa`q6n{Z7;THGjBT~6E0xgyM2qELz9
zl8P*##9@p-ljRUoVH;F9ZcAaQvdUJpkz=i3A|_uL`x45^wy?&ikZUOp4tqdv(bB7F
zy<Ms0h$uFUhnTIxh4;G3Bzr_!@H|(bSq$!dzR_`DzHcCeNFv*x0bkwSM{(6l@v1?p
z2)D`S$H9mw<+9HAjJ*t47?nmvO-Y%MM_;5`uk=^+PQ+TBCKt$`ioj_ZI;yjE(P$6I
zQSN)<9T6aXB*tXO7g1-~{`s}~Ddc46UA7G7quC?gnz!zef(O@!F4{IOtV5d@SAFvw
zJjWe56?>hfw>MvUgrcg-Owd<y*RM9+JDT{p>WZ8Emd}?ELnHB~qhpEeufaCzhayoC
z++jjjUFDyzb~}3P?O2vnNVJjst`1S7{5pwA#n<;e;|AyimF7ESt7Wn+*&ZQQZDV@N
zE~AR-VbdYL`M%6&CYs8?#BP!9%N3Lt$I`)l1IZB%DCpWLN&9?r?Wf9r*)fNT4gK=0
z+)i;HKhOYSP{^o~E8o=v``j)V=}cDZlN*H!@~X`_??<UuJfYm~<j{(TQn3|I$efNV
zG2QW0-n2yQF^8((FGvKZ6i=4zGyNWsvKc!W$326>x_T3<k`tbSR#<M*=P(e}oaZno
z%2N1h(HdEE!`QmF_n7&MkA~`^f*aLteE&!mbJk|6L*~_vd&Xh)YMXpm!|hPt?uGDE
z@bhq(J!$LC){E=mEUD;u-<#OG+s8LIs5f`q*@NSFjIizj#6@ghw^*MSH(6WB(e+TP
zYdMv5k8`P*RJ}m8LeUtbTIE>mpxCK;)g8f^?n`9pkrv_-mhPZXa2+;84bjRZ{fK+{
z?#->hh-B^A^(@6*qJhs{)`p|03wF}mPt{~I?8vhH=d9}F+de<>{NFL!SK`0mMnj(q
z!v2W}O|vLvNZi)L8vBZd2-4k8M@af69M)q3mS#pQo2_L=r!zO_wMnC<x?FRGw&RLt
z+Oy8quqFu+8_7<YBBcqjvBJx@BI3x%lQccgUQHS(bIQ1?Mb4fx@qPE8@_E5|s4`Kw
zTpyZ=n#oq{-qZtYdoXCv%o*8@P*nVRgkzfMqq;vt{t3xxvX;DN%r@ugh*di-hwCDz
zYajy=Vwh55lM?r3bxD@X^ko@u=(F-j#UHW=C&7Ww{Uy=QJZjMvT6&HtU>RBI-(!M^
zu99V(8;xbYI&s1>E^}N>gsKW_<y%16uIyGtgoeQMI6}~RbOy~u3Uj0w7U{*VoiG=z
zJ2RDx(^Xl-zqqV?@!cBQt9NJ6yY?&92%TNzn9ZbwE|xV~z}B^U;!o-uthsvQNoXvw
zwO?o=B_4r+Q0I?v!int-qWu9I`_U=9Wc91uD}20m2}&4OJTN?C%O#WD$Kqr!qi<+t
z<j}K%`~T={?6&s9ZdAkCR~cF2^V8!E<EIPFkgk=L5nS(|+(n5fvN)HEuSFLmzZ|Yr
zw}e7o*ASsFY9@9k!L`tQQ4yKYs4)5R_4$N`=M<ByAH8SJ5B|NfPhJ;|RyS3rW(6^6
zi>KZmFV%x~x}-dUY@^blLoyaoF4?Y;Sgt|X$HA_juz1^isdB1ReB>}bnuMd$puKp4
z6djP=u4QbhNJru$4*$WV%3ls+CC<HgP0HR!q{Gr7kVZ#33CaDPoI`o^_vu1O;TWTo
zHYq;7_yXV6aoN#$Ii6uK@Mxu@$a8lme3wb;!qc`h2rD^8UR51pk8Pi61|!t9yPN2e
z?zWO+FMN8GsUjeHpyfu{i7%7SVFo6E#-uB7@!{vj+IPvsHPU7C-^fLkiOfL(_gKj>
z;?1xg^ewcK7m)8^lD#*Ei<f3HSVFglA_pZ<NC3;N!q)MrPcx(=TPnD6Sn_SCIbZS?
zbxBf>W;G5sQW+r~EhHtW61eJICVae%pc|~as?mUJNuop5@oteIX4Ze#nDzuAA_BJ~
zDXci|2>(M^T#TaTvYB-1@IDnhUi)jq3U|rDY0gN_4$SfE4D2x3B8!Uea$yu+xUXB?
z^aEs&%EI!YEeRm>ut7+cu+hl5fsL?d#8GJ<6O-Ftmb6Mz3V_-N%qTmlVGEPPP6kYK
zk%Rlnjh5bQNX{mS=bp_%ni?bYEd~Xg=`ia|L--X!ZnzDYFhv}jKINT0_QxP74<MbX
z<T>9T&I(`*+qqz2M8bF^Fw2Jx7xY;bXITnsx$@(L>z)5LdGT$>{<!EtM!_d33g|o3
zml>0<9%vcAU)uj>XxvjBe!ud6U=rMODOglpja}}kA!-1l<Ke1;ku3z^SFnf}IYS;K
zgq<vnY#5~h=7^;Ui?|)Y53sZ|XVI{<6Si}<{Kxsds2IfA#L3dZ#oh_@i!mW#>E!Gp
zYGLFAVh006lRwgIY+xX}sfz``Z(#p}s_`e0<UZ#=nfYhQ4-Ak88~<h(KwdC+zSr{0
zPGLK98wiLAP$cYZa?d2;W#t4I62H<+U{)T03gaLJu{5`E0da$QScKioA1nfKaIgU!
z7mr^jY&_gt5BfBINpi5VvS|L13(Uz1uq2G`eF0);2dEYh+ZRCY2XT=HS_cy+KsLG8
z1!808<h@t(cjn7I*W{ln`tR@n{{;^bQ896G22lVHV<*f1H6lEKIu(|Gg9w=IPedL_
zH-F`d+~@o!L;ocrf4A&kV#4|x5pKYNe<A{$`~?x<>>r450=E5$2sa1&e?sIJ44mM<
z!NAH6z`)50Vgqxqf&jGu!REfT0<;$h7-)eHjGn*N_nt@ePxbwGz_9-XnEQ_Mzs7^@
zp7!(a0pa=+kUvOCe|C|Ftp8-;zeME0r2h=W-@69e-+=hHT?5R;#rdCr_&;<FAUA*s
zP!BLGD}c%W>>3XQu79fFpAY~87VE$DyR<3L;VoS}nWX+`U#3O?*Z|<pKnr;AxthJY
zo#j2x2|)jWVf9z^A7mbM>i`}1US1FM7uVlk`EA+Xhsb}JCn_y=ukL;{1^?n~-Jh}m
zm(}bgrN!ip99Vwa$RefzVgp=oe^b@d85l(Fd3GSSU%apTY=5E(V*5o_(*UgggKzd1
zyZ%b(QgkwfH~|9_?Qb;zvrh$LZVAi?p0vUMUk`Fm^>cM_uz}d#4~~ChhCOKcUBiFO
z6MvfVfcl>U@I7tppGV$*^z#3oG_JpO_y;%s4f21QbN=A}{pJDQy8$3EDj8Y+f)z-s
zS=s;|ez*aQ&42RvctPyEzyKp}<l<xraQoPp**Q4xu>ppkUr7LNW=?>J2<BksxR=xf
z#w-UQ6N}dG#9#T2|Bk_UPrd{O^M6O&W9PiD-|q;(Bb`BwwPu_h>Jc1<*f*-pAKn|(
z?+XZ08S2&<U}2!fN0G4~TGDU%-x0$DS@Lg9;SY2E(bOM!l)px*`zrh~Hv%b9dmDQv
zRR<#z$e+YS77<GqXC;V}sJ*R&z1@Ao<6)5oh>lJU_V?67keQK<GZ1?G+e)!<aI*is
zT7y$J_Sov!-Hx}~eyjwM5_zK*90Z&X2`Nf)M8p8l{U;m*cd78jPUliz+f7zhM+R4j
zy}%paOjS?m2@TEQK!*3;nZPiw`>fsU@#mJ^8poTnIth|H!3Hr)j>$=a5={3=y8`bO
zb-GZ$5FPWJ(lVwceH|;*wM7X6^hoA?ux`bRT}I|5EluqVd*CXy7t#%=Q=%R2=cLuk
z12?AV`YQ^;W3JIh8H9^ZYo+AspqNSr7IjlFj~K?fSXbYM!NS<urp=3S2!+)*pqL+B
z9-7*Eip*;g$YN6HQ#D-V@KJ8QxWnPb97Oj=GQX?OS7!6J7SsLA$+)VI7Y#dLlWYAt
z?bwczU3c)cb0UrIE3>C1>=8I@%{;WLf~2JqX#R=ju{y`~INIdeBz1;4a!e?4%*RRX
zhV1?*#Kodcv4l7CR*VeeqXS|C6Y2q+j+OS;X2a7y?sO<#ILFUJ*U6I-)#l^m=%|bl
zDckJY2r~AXUVIi%`K*Z?sGowdrF-&>IqO8dWLoN#99@bDb>!3jc|rfm3x{TtIya+2
zw-!8!PX}I-n8a~`B`x6-^js|hWMk>%Y%ALa&5mVGYugj`OW{VH9wVZz{%PMK+R3F3
z_0!%;(eQm7lEhjGVg8kO%kwGzGh?g$+eKj>9J@Pwd`U#}`zb+`i79<6`-w;LDQ`g`
z&$VHuguFlPT_TNo-*^Oh2a=EEq^>U)+b0tl?_?37S?unlTgCO^%ZgASAx-f+mzr-B
zZe<FRuPz$qrs2nlmY+V3Pcycrg|q0Ou%CElsvQ@GG{yN+(?x+8E2#jjJ2F-ZiqEmb
zTz`hXGfq^Eo}rNWuov21^gUIROs65VUMio1qmv(UH2H{lS+$W<*f)~U*Yv{bDUX0B
z&R#VNeppPjTob3I;MdK5a@*1?qlP0n(rEXRm7ml}0&*#7y`MR?>eL6z4dROt&hqBH
z*%K6M`8hv5giV6#1rc!&Ew7fN=Hf-^@4L!zM?$yQu-yqz!)I_NU|&7A3tQ@OODCSY
zo!hDb#Lc#-HR4oIKW=}Ggr1Nk6lKUxM@j)Jie26DHJl7ovnU$OVXZ7mWy>R$QFfTV
ztwyK)2}8iiM2ydQl1N4^JE5I2*B{~*at>LKP^CyyMD=SSp-79&7F`R`KY3dml%G+p
z+DwtHtU7W|jGJXp+-J^G*w=)bQX^8u-r2V@>`QW~mX3ExY4-+?#i{z~M|y@Gtnt_b
zN7|c2M-NvPZ~2YrYOn^FV27wQtC;k)$pre1fZ(y?NKyA&o#X*hW`DD0wY^?zZFPQw
zcWNUpj+%l$M^a2CL*z(i@nL@U^k30My%1zI7Fdh2NlZiGgbICRm1RMxJ+6R@(`*}V
z>+y}Tgf2fDT#JLggW|aU!#lLR<*H+uA9FV+&{`pkNtoD^Py+XDlYElM7a1=r0wu6#
zSn}GUAjkP+n?H9jVRRdW^P^b+m?oQ`FLos@;E>DI?szKOO~nhf{5>>PkVM!X!Xl5u
zx04$U{U>$;B5N|z#zd0}3H>iz(^cV4c*#{CfBQVnbv>pb*m)_~`(n0+JSYW2^Xk%I
zX722h!`f)JY1rH36r*w*_0U?hfTt?pg~M9Eb#{Z0Yk*i)oSH#aaeCZbS8xV*V7nw+
zcZ~HV^c5dxVsE0dY4Q8Lrnu^hy*DZSM)m7-)$Fj7+|P!0et1p}FEkIY`A*#xUj}^A
zpl}S4yU9-d>h_c<9^tYla+t6GOxRpShI%q;x8U=xSHGeK@2h@E`N7!CzT%<_;wPP_
zoQcd|-uBfb>~8y-c~8a9HDvCOy?p*QtybZw38v+GG2f5$&{iClf>)Z^A2OnCo3FPt
zGNPZ9v^1$ZiV@WRQ2R0AHv9fYb_JtW*9%rn9{-EFD<#H|(fk%=jlLgDe=7*}Ur#W9
zeYB8pwKH+Cw6}YhQ`uR>>}`SR_MRQj2F#Koz&?We9R#e*>_8013{2t7T=z`*hv!cS
zFwfr0{rL~v`CS~4iS1!$01#^e0xBTz!y_jM{16NN{gLcJ@b`rGyMRCD_`lmD%64xH
z4<{=xPz7dA9-jN%0Kk#;VfGhgy}!%N#maqumxqm)3((2T3+CmxuZseR?XS_tgIWKD
z(tlU%FQuHo9s(eoVdiE9jJj9O%L@J%I`4h(U+DZdrT%yIym$D2q4Pi2^WVAte>(TC
z@%kT8-5-FeT0A_x1N$oeWw7$c?LUM??d_fJ+aDV*2Y@38SQ!AeYy3Xp;J9yz9>A6j
zQ=NycF%MsVX@h|Lebc)S+WrjLANG&@_Q$=N-_hpp<Adt|Rn!SIFRuHR^*ich1(rUT
z**Q6Z2jAaEpnLH0Fa!Ht*g4pkS?{~fufUUw?T^3{Nc@Mu^B;R<{`<g_{htEQ6#8F*
z=a>sJ(!U%R9=7HD%{Bii@B}RXBk%-L{}Rjphk+-Ujg$B9f#<hWB#j9kypWr_&!Xmz
z-T3C%IP*9du#lyoFXS-okR|5qErGtIcQdm(7#Y0gJWDH8mZ^fCR~tWVzgkb-PWc&F
z-`&i;!+m}e?UIAC_61kuCL?Q8^ww9v2k8UcmCx<PlY^9Vmqa99G2*$1J+!O|u3Vg&
z$}z>x@2{gKwJ;i#=(qIKoy!#O^isM|p!jEDs$Jbgh_b4q6xE5oiIDP*He>jb-ZrxF
z2ctSXFKLTHSByeeh|=Az4HLaF2xfj4;39m^bUhJcu6w|%Hf{NH%&NDed9rQspsnon
zOJXy3#b6h)bAW;8cl%AJ_ekMXd=i=8P}Eu(XMJdNu_p3E2gs)0C8&}|qpcQOK%>6O
zNX2B?bTb^h9N_lTA!%A*envoPRPAH4k%${zLBMf9CC`l%gW{F$PpoOz6z5jB4=eLa
z+T0&69EZ{A0ArwiwX-=>zrAqMVG+~svgyePd&6rI3_V#TXzpxVUf<BTV%!z7_$|go
z_~7w%SzN|r$I;aFqO3VEZfhE}gK*=g%G<Wio^Q57y1YQeAANrLZr;~NyN79pP17SU
zwchd?)S6oy-EivbZEI<*ZtOS~w3wfs_ca3bQF$*L3a)LMj*m?ew~r9+KxTYKZ*V)w
ze(E<}vZq`^j2!9%KP{#sgQ-K`;le0VguMdAI4AknhvmzcIh$0B)CbLIiL7(BitsP#
z+SxD;+cvqgHQI^nIjJ>(_K!2n>S68tbL94=td_nq+n+CnrF`hcUzTv5abm{Yo5VF2
zlwp)BacGonc)ZVWkW{1eDsePj*7@g8Sv}3!4Zh7yD>Fu;SA@vZqgF%J7j%U2CDb;r
zj|0%0n28JIncHyMq~OYXN&KHRvEg~-z0tz1X+^7M`^d-LvTE#BI=z8`vy~c<HK+M;
zeI8O+{O(jTm&Vl)J%c+v>yg<THiNM3M}d^B4nt8dJ-Drm%}ljK+<@lhUogcHrc*}_
za~Y6S&(}-h9X#Ih%oGKl&pd_Zv0(>qlYl%*b5gQbN&7ZhNuHYOzN&SVXr(UV=ygJz
zYZ!}b%rlp81*V-QOfX?AafNYB6@jbe;o#Tx<*%K~xsQR?0K=)A$vj4|<LoEgZ<N-y
zCG_NFk?pBc^xIYvK6@UaD?Ul`h;}H^5)i`Ep4U^4O|Wjv0<qAPL;KSac~u;kU|GC7
z&jXctJ7qzy$LYlp5+_Y6h$|DPzDAu6mUv9y9R%nhe}|g@Xf{<LjJgD}YP|1o(7H2t
z{LfVoPDeaDF6s?W#UzXnN_o5qJDQz4o;iz(ot1`@RLhcGkFPTl<Y<*4>Y(xWk|{4f
z2^EtbPU5Ss8sqI?Y)w7g`J%Nt7~$?Kl@qX+wyBG~Oh13AVk4x-A8ONu#)j0{z!#@I
zCmJcdiJFBUyK>-<kTr}Vn1=5`&-TG2L<CJaQgw@GF_4TTn%N`?uW_6#?ifqM`&7wy
z2yDIVPT+BQ@&xB5y5KrHC~v&;2S=DLof{v?%{a~U4l!8kjacmLwCE=CK}b*Ox@*--
zbQ%Lpz4&ImC$NOuDhk8`ic9q`@K2?~cb-9r++syVLZ@L@BRi$6o*kzTwa=efAd864
z@s1uD3WN<Ip$QMw_pX{;&9J0Nd_71e;CBn#vtj_6#tv#$y(GK)5}~)W%bVscJgVuf
z7NMv^mnU?^<8H$}3$Sy;)66H|v*gg`)}L+0G>r#8BVzY}Vwi2r8{T^(a5dL?)R>oG
zLbj_-xcx@@(6W;-+FH;p_Kc`z$m8Sq>_BcM>m#HeW1mB^7D;@ayu%Cwv36nHixsHp
z+7{&}hN60=r`=dD%eYh`xcRz?*sG%NJY)qL@&izjdGq<~C8sT^i%Mz?LwR9s9K#%|
zSx$)aDxrj~Z{(-!1NFGo1sYC5tXwO`n)J-2Ha?10Ki+0W=TFd1=C?<4yXD(oPJ+I5
zF5D@fT&B~Ct^M+GjZBO5YM6|o+N|a7=%=^P$x`F!u-!2QtFPpKob<7SyJFw5V?1>$
zV+~4Q`<!cq{qa)s;*?`E%aM2Es&I-t_3nybQdIwHAG7Bi)$_dz3-D{gxS=_J=>*Y)
zSz_+XFH3g)e)83XZ+CW>qSJO~#%ZW6-C+YHPt9sC^}tti)%`S)^jQhWyP4Yx4)U2B
z&-;8WRKP0^IiW-OUkzV(JaV#i@=t0N7%yd4j2otb1h8x%m~A^O(f2t^k|coz{Cj@T
z??$VN5FZyMCl~Xw$`b2eajVY^Yj{FQAAdLapq3q*Q2(Lh3pgiB1*^o=_XS+)Tox@@
zqyv}e#_C9}c~%J_Ef)3DG!8qu|1z%kn6y592lv@(<fo~oyMF%ECOB0E;xmb{r~G$Y
z02eJTRp2bj%txPX$1u>ljW~zE-0?}}*oW$FGlrC${uF^szkRT$z@6cyXYOk{ZYpuo
zdM%FLoia0KvS{-+@pNIDN8!@N7T}RVgj77b_qTYn9)XlELhIiMWa=L+dXZ=Ku&1n)
zX0tc#H}Y!EWb0Db@?sM2o4(u#5)g9L<;zM@aA#RB*t*FUd%s%5_+(c1TkgQrS84kt
zrVnw_dvPal=7j8epADzd2;3$tbNa|GrCw^CEMpmz8-IJI4Y8<on6%4RWI652&D0^t
zyDifv$h(n!R7-{aLIZ-tA9SWczc8DngZ5(1+6*ZzHN8DD{$$MFJgWCu2j%{$_nQHy
znj7usiOYipFsJ<fFK>b$F^wA(w-D6}DNuNH*dy2+Q1uPPx`^3Mm&+7(WQZ7P$CSI5
zCGtLAIxWFFGkS9sCuvJsT-Uy*&x~NOue15iz%zq#$$eBc=Z38nXWyV%@$)>c_2Fo$
zO`RgN-VG^=Zoy-SpN3PtAJmVt(bj(*BK^O&W(mBU;{5BH<*(?GgZEeT_;3WGM>bAo
z4mJ=68yoX|r1>j&<ovyA2_*hg@c8G>>i>4tvihHc$3IprCoGWh|FviS>BZWA3LXK=
z{|Fv|)c+^JBP$sEKZ8dE=Wz+Vkc+fFQ)=6<A(V2n?^uzH;+|96f-qQvo_akd7H!4Q
z-r|A{bnIwe^{PRiOJ?_cH+AM2>%zBCZwfx}ai;iCRbp<?d?h`55&(DC^SR*l-7Q1<
z9rJ}8&t{5`oAM@s_pXI`h9@;%eEb-j(6l&)894q)N?(s+6JFN7@wQ{gNKHR4jRx*U
z=p;`tH(#do6&v_82#+KK?X829UUC?Dma29Rd8%W$!`5iLnwN2F^eraKWk>;+TCcLg
znva$8%a8%M9P)9CME18-u;uPsj?tv(s`@UdQ4CeGyfhB)d%v?-;r67*ynv4>j>A@4
z!#3J%^^m;tXe70LY4VhQCFZ<HM6UtGEoUJ3s}oL6uDX3<C57ztSN_R)_Q_+11dHRm
zoVmxgs=dkwsu4d5<<rVa%+YM3-c}Sb(ih99l-Z?|m;Uron`6V_RH*opb6F-CDaecS
zo#hM>bAZHI>`QP^xUTN(H`vmKXL=KfEmNs2bKu;<0XwB$GKFQiVx^=;(Lj#OB;MCg
zbhUZ8KsNEw;TF|sG#>7$l6KsvOIF9Zxkh|y?aFPB)AYv}71-^up&6GNW(jOVDN2=;
zan$JO6sG$LL+l0d8uW!+KJ2Ds2hrLm17wBTkQV2ueF^5}ro&lUcAE8Eam^xnHAfh?
zYA)A&F4uZ$*P_{oh%*G7*>4y$&lPZfe1+IO9oTx(N2JK`lb)uog;Se%GDw`Z$f|G^
zc28tt`J%YZ`3!z&F$2vF3C$5^eLT0)g~*KgU5|{D;S@^0Ay>by?V~Eh2;C5Rh3O$R
zw1kE4LN5#M#+UTdn|6Y0=+<f14OL?$EHLaHx}Y5NJ@wdo8wem}Pn<@bTt#tcf@thc
zh|Bj^H9gDW8uM{p66H%8QgEIj64zlsF*`eXLg2MDc-u}sj2b^;rZK9o%4yg7Axcs|
z<9#l%mqYjrq1@YXd3GG0iimI5L^a~6;ro|rGKfSh<)zkn0~1`&AHT{8cfOm*e5paQ
zdtx&Dt;jG<<or9@w3IymIlg17)G7j1s0W}V^@Bq?hV<i*oPgS^^4*N226x@<#a-A&
zY$t4+Vx^3<E~>gJv8c1aOzcD=r#=RDTAJQkW+rWEO*|C!be&;$Ti&Vai(v)hcv2gs
zPdg&G(})VBqbZ4_G%OVm_M^aju!erVZi?{a1T8z@tr$G|Ai`lcJD#zREu|(4-@j)E
zi5fZ!PFP}S)h$S)Y@4wkJ?ayFJ@s~qQ#3`We*np7c2JN$ovMI;GSkZ7Nb>4OynZR)
zXg3RBUxI>igm-5%5ZUp>kdoglH%zU+#ZPEl*GYQOp1ab(NFO7VWufEPbW&h4_VH6y
zq*H_FRN)ZQ+7LA~8Xiv)sqR_Bp)tG!z8#0Ho^~hpk7!99Kc3mko7luXG(`j|qFl-2
z*!Ok1PKNl?aEYV|gStY=-j5E!{rm6I4Xb*?l0|M#mA-rm99$x@xV=2P_wQwU-y2ms
zV-yFF_;grhkQw|Wf5$`&=|D4e`2c7swPu8T!-@2y<VIfhjPqrExn<v?1YBc6ha9&h
zFN*_@c#x#-bn9VY+#}x@Z>swo{YcDAg$CP}x=kf3m3pXIyT<{M!|vR<TxS8hxCIrj
zI9GKIWk17jU$)IN08sL_Sr`TG#}lKH+WYN~)*z5xw!sJ21xp~GIAAS<X>O45u4WuM
z(DAT5p%a^8K-pC|OhKeJAN=fPZ*rMV_Uy{m&Tf`|#AakZg(kvtseiSANn?mo+R0#8
zuki@g^CQFIDs&#)$xfuJy=@`YQ@g_aAbu4wk|+c;4<3|l0?pQQBg@H#-Ux&d)hhd2
z?W@a%)G3)J!!*L!YiZm^HkM2RTXD2Dt>@4`VcsO~*&J5AG<i8#LdD5aUgYa|a+Bm+
zv=MZU%f+ocoi~%|a=IJ_`$D^t@gv3hij*OpO>6H}zjpL#u=-h+ftGk&-&IJ8>3qE+
zpSMHMQAIxI7c8!=^_4E(SEEmtH(#hx`T4W-cKbWhELTK&TG4)B8a6}v#-ZDmO~IZV
zYSfK0&C_R|0gbzktg)_J7KS||j~(HYfqSVHz!t`c^jUOGPg&P#tWm6NojPNut&QtO
z<~C)|cd{!YeWWk;LLS|s>EpOxtRafX5KCiv%;8c{JUfl2Eqm<RnE0!iOfN6<CtM!~
z?z&c7!X?_Pqy=}|$$Ny}FQgOsveitTV;NXAHwi~&!V+RT!Qf;)<@8T^ed!Z*eMuW_
z?AE-}LNaGgOff3>a~cG_>Phd!;5<srA;HsCZx(fJ>}2r0gY?~{4}9i`d5d5oeti|b
zV%iiQG<zuU#shI!c}sM7gBC<nft+(eIw-Cb0dF#(RG30xPCqcbTlH4jb{aB-qEjT)
z(akc4ZRHgn9u^Z*1Il>hJBe2{Q*)jom*(>AtVw9JAL-leHmO!2XtrqpT4*!l)rSFs
zxr`u$VY6l4clBqzeG@arLqQws-gz7Awz+dpAyw<jDS8Pj`VOfZWs)}+r>hq6taAZ+
zmoxc6t+{a@{9YT#>S=!vRpFC*oBZv~48)f}ThA#;%=X#19u_1}Pe&S-)3@4Uw}K{J
z-3e7%hkWW&-HUFz9D$qd)N{sMYV(@Ut)Jzl?_?|ZS4ff5g>b9H4QVdYRL8X_vlNF?
zVur7^@1S&AAe#R=GPz#`{KvAErHc)OR>|HS;-q8`yjKP(IYFF(m!d}ZYv929d0|%<
z3wtM82}_8LDR3$Z`~u7U9WmfdDXkbEn3au}jg6I^jrTsq#L7j(%1Q%>$=jR$A0&XE
zs4;alfjH6L-a*MjSwfjWIYHS&IRpP@P%cnqP#`EJAngtX0j}StETQcFxDJM5g<^wZ
z{cQ|j-2eR|{<;v#{!h#_MPn;f7h7PVoP(9^7i9^=&dtg5AKndf@PhxxEAJgW`Dn`8
z*k;0gzh%d>aePcUU#Jjh6I&D9061ZaRs<7t+ae7D4-Td`I<Ic_xJfRo-merHyJ?IY
z;bMSLE48{1;g7l`H%d%X^O>~d8z~v2x#TC~64Fa)CQ=U*tL*8ndK&XfebrP>^JZ0t
zEG36C`)dkxRp<tVq$Zjw^Jj0;o>hO!e<`cSo^U+*wXCf7&e5No*=~4|>uxerjBK37
zIL)uzOXtR0FmJnZ&O7eqoNijJU>CWds%=qS(^pQMB#oC9ew6Yz-pu8kcbQ*)3g)Q9
z+pR8HFTS|TOry0kbR~5CQWRc$n^U6ZOjzvsQ(`=wqw#A4&B6i-X>sG6vYx&pfAN<R
zyHJ+bFKRpP#x~qI+GeZAb@knI2IFk)B<JE^&#kDk9~^w3JbL}^MYF5X2O4$*O%0<-
z=N~QSGdqVzhK*l-GMkrcstB1E7d1AmwqKXL&}&hgu{ztUZ`z1h3!R*}(zA0X8_D1<
zy$m^`YiUY5_n%`SRQR-#w~atdy5f`M*nj;dqWt?tMK2EN#dmlZQ)gD}yWB8YW<gG&
z@+jEegOj1;H(|2fHl5<8a+34w^Djrmi2MlGp-|G&D83Ri^6bIX)v+ocJQ8}cO5=70
zIeY{{+<?-;-zek_;qs}vf_}eZbsAvFBL{q)_crrk5F4Q}wn>(i;1~|Vx$Ql5)>{}(
zOy->iFC$hV2|86ge!ucl7rSNk4)-Y7rSKA(bvuS0zvPFY8u-P;Lcw-&2%(mn9L1wV
zP1Lh|m5;^yS>fuzLYy76N%MRX_3OKLJeS0!$st$j3$NT;(O-n8sli2j*H_1h_^w(M
z62B^|YRiLOQF`8+F|HUwT1RB?DX6-vOR=R+bpj?Ddbxz9rZJh+kaND=>I))zS#F2v
z8S9x-RuhZCT!o9^C)+4-M<J`^I`8-VukxN>`*65sK@Sn^Gwr{AbM!59(n&npyUlKE
z29sxG4d(GsRJ2VPBMf>G@iS&m5VGJDG}2k!B)=EY3E^t!aN{F7n7f#e%mrcuvfiCN
z>a8N*k&!jQcsBF1))tc?v~u8UH?Y}oh`Cu~V5Yf3V*sCI7TC&eNBZUKighx(S-oxY
z@%}qhzee1K#)~R|l}+C31q8#Aq)mOzbI~XYg<FKX(<2CbbMD7vn*3OpeGWl*mBNyP
zTY*s<@U*R;IK}9BMkm=Xmo4W)1jUw75744OuPR_70z3;_C2^FWlu#nRrtgU7zg(`v
zcY~rt6Wep9IiZ-&=}0pQ6_E@kMkdB_qjlk!W?Ow8O3dTeBDWQoW>?XnH%3hA2Bm|H
zLMtE&?a<Y5iLk{^6j_HFI#kgSBkwlt-}#<@yH)Ee4F7Vywqd9+P!c{L!FKP+s3C$v
zz1wswCOX%2>`|}c)*L3IVGkmfCe0v4Tq{~#ID*r$YY@gXw80Vc9w8gzsPn5)<n8AY
zRWSS*LjuWp=s+oWo|v&Rl4jc#CZtC2Zmn%c@cc2P>@XRLnBG89oOlIc1T%&Rho&`L
zpYX`BwqI>-61zct_6#M46E13vf&pp-s%yZ<NkNZAc5B6V3MIzji#Z&>na+E<{e0wj
zd6yUSwu*xO;uA%HZEqH;WN#~b$OW(Jnb}?9RS1_@;WmT6>q+Ydtr~2~OZU4Moz&5X
zWqHCn3o(9V^-tftsxbq6KRJp(+&nPbO_8yt%wPQaLmql8>WV6VF3J7}#16PaobP>W
z0+$$6v6wi%df-FB>!x$Hn?eeLhPxTA)!g!+wqe<O1Z#58`LxAJU0g($q8#dT%9jhX
z?Zp&C4LUGC0a;dk2(Rb!DCi%zVOj-ax_q>v&I79)Lx>`gVfp%_1h<}}5sX&k2C<#j
zBX}PRONJ8fJEJ%87vsR{O|ywG$UF+n;)eAcQ}$~^JNFE{ffure=a)zys(9qP2;P2`
zRZHyFI{fUtH|cc~&NQ@;Jq~b6D8nC^RSVC*yy>el8p>xMNK8Rsf1e>%1=V*E6CL>S
zmkZ2tzqZ}~zleJa;5eTjQ7~r5OffSvW6aDjW9FEdne8}^nVIb|Gdrf3DQ0G7X2x@V
z|J_%+@9o~z)z;P3r|y>2lDcQAt81n|Nqv@OFaUeC`1{Yss*f8yBH_7k=P~Vr2h&eQ
zN$-0n(~E08_@|s6C%hwe{M@bYZeE;1*~EL#L%7u)9{z;l4U&H4iC({ma@8QNIK`m$
zz%=UfOgS`m*seEV!~NH7g2kcY+%(v)WCA`AI(pg(H3!oTWbMF4a2o<MQ<Xlct)DF#
zF?}{F{nWh(4K+EK%nyacj?vt77x<z<^eI)joOhoB+avPAwCah|ADXLU{bI%t8(1^I
zzR+HUZGMONdQ_~I3qeD0sNoRz0{b;)W|UvgppfWvkb|3zES)&e>zmkIS5Vk<vl&4K
zj(c4<u2oaU<q&2fbCi-38Ky_hEsvX&#N)Q`7s-i95@e_t5<~Zi?|f1cl|(obJs~+D
ziL5KOucV2sP4nF$LTA-_mr*Z)hAehXX+nkL0&Ie`2XuqFM!ivaEZ{se(Wr=;))5EX
z-LL^s-%-H{pvWYA#Wu5{zAjPk<>p1APD~EK9vm1!I!<v)up76sfVN@DR$6^Zn89P2
z$kL*}lK1pf|I7ko5pmE>7zp{U!!2>bjZ2o37lAqfU;zhk&->_T5ev2iaiURl29*_U
zKH>p?C@xEI`V$eW5?{RRak=Rm^d_~pdpX_j6+Pd(t-o9s1-Ei368XN}9U`9-`nEkE
zJsWRdGfmwsufIL1J4tQER0%x4Vfjg4tsibzD~c*;wR=1aOzRW9JJ{wxbam9>zYhFb
ze}CEBJaaqGdOKa8=E1?mv#r_W;q;D1WUATZ;B4#rdB0%4_<ei6n4gdcG7B=6O87t;
z#S0BG8*-9L_(2-Q54SfP`m#v)#p3@&&4gbxQT#CF+y2$|W<y>68>?3zX^RQwGCA;J
zcyfp-qyzsSMs{DeJ}D#IYg_*cXzCt*%)THyFl2R{bdJdt3-erEfB6CG8Y=u1<XPYl
zc^o9sk<y+INXDJds-7~(;$U>BuktY~xP<0j3e5Ti!*7Qy@y3t+cwguH!wWlHS#569
z*I`H~?&rAXXVY-Uxib587nAGRD6lCmTPNn8goD)};xSp2eTtB4v=OXZ`kk&8CT8U-
z1?-Jmrf<+~Z!-I{G!Py5O~p>B*vGf+Bv@=@zUkFd9xh9rRYcuy8sksszU(z=*Gr*E
zPGejtP7^T4nnaLXEwvt}?y~92Ce}BJW3E2FwWlBj?Y&;t-&M<qj)w4hU=15zl;HP@
zT-UO#Ln~Z=3oe>iFXC!jQE45HVAb4)xxnppxwfwj4>!$L-4}VY*ak@I9k#7yU-fyy
zcjRgvwh6fH4Z9D?^4%Ug?fdemoepcvR*h#qBO0m-o*zHgCLPT`Kluh8_Y#g|zIvan
zB)h0~2)LakAD7maml?ONEe<2X(RJML2_BpLxH=v>HuDLbl4-27jnMx@Z|3uxED3L7
z*Y^+J7Zi%OM(XQd;w2pnL;M4O_Nkxo3$uG(oKpOe*d`3%zK#e7OCM0fGXQY4Kz#LL
zq2{5ilV}AXiXm~?goq?+JO;p5Es%G;*k_AS)+w}te2O7y*@T27YB~mhmKLb(L*t`k
z8m*v^Vn|*#AtQ;JmjU3Z1$z0=u;dw6>W~n^2nF<Ojwz2-p`wUyfn~mcg8|41V}3KC
zpokkLR3}m!(E*0k#+(|fQ0zO`+l4;l_h}a9VM9n+WQt`H0EyHEv;YNFP=2YHrYetO
zS^Hm$Fy4pokYcg!oyPwtz_jiBLuP4tNBUEvA#?X&{iDArozN3c9ZI#Y+4k4qrGJ)J
zz(@Z$oAuwp8h`&C42EyvU#67)MS4&F%M`0;r2mC7!{mV0m=n*(+<mHd`IxS8!6qfO
zLHOYkJeZM=^tihk>_asN9#K%s#Ma4&9Q?#;m^ZkD=IdkARtshZB{!CoXkI)zN>6D8
zVWe+-$$>UxNv2f>?X0b<{+x5yFUXB`8e>mK`q$>IU^eA|F{vI!#-BvG5rVDQk(ZlA
zSUu59KeP1_1Us>Pzq7o+8YQqzQJdv|$mM0kO7M_HI!fT~Mz9azSp5Ic@w*deH~z=9
zdMk|Bq@`2kqyL~s!ErkCyx@)LBedYkJT+GVz*Z)2B|)N@=69!jMK;<bgovff9%gjt
z$uTc04EY@JaF>_$`AU<FpSV3q#_q!YmxKpf<drGz?kM|Ex<vxlG__qJU`;NsE%m?b
z7y-4V>TZnAIS1>$BP8e+^6If2A28|xDn~&&>Eg|t;f^+#yG!VpI|l;Hy7##q$#
zjpS=5{tsGR9jpHCb@1at4ZGVNuKaeUuUFSQKAMY*T|GeAf@HeZ_VSYx&&tuK-Jwb*
zI$mDlg(Mj!byuH9>}G-(N7(|bY%)J6v~efT1t~9%V+c|^mgBZn|E>mVWOEq4jY*{W
z!pB9e3Ii9R2Zs40Xe<QGUHDUfCO*BgJBx~)PwMJk5S-@GRjTezGZ1>0_7!s36|=bN
zFpyi06}ACo7ZRl~FWZ+|iHcSYYoCUYLi%U5Rt;gO;q6*HJCzA5RoAlq5&yCCk2r+D
z^@56*=}}A0S7Ju>LcO(@!@M!B^EKal`#0+?JZ~P6v}O2$=0v45y!VaYetDCNBVgBO
z7HPk}POr33j@npGt*B6rA{tMvoKcLXyIQOn$X?rOrTuE3Y8y@5<ELr6vSU{sT0tcr
zT@{>KDTT0m#v4swb?fdddy`-*7>T63$m*Dsqj8ZbQI^RB9wo3=0jJyY<*w%)E#B%8
z?e70d`vH^o&O0tl{|a93t6I#xSYRHkWGAK<I#MayGNqX|Cz)C#eeK}+vPt1n_ocC)
zvu<@!tuc~e0RH0%uPwmO&A|Dg_`I8WSB+OSpU%)xHB<D`4y*!Z1CRkh>U*O}+puhB
z(Jo7MmTh=;jWd!l4pjxyZ<0m)p$gBCk@qX3NuD9ZBtM!IM=j1%E^%8I9q~zeUKRS`
zk4<Iyd&5zfMS5XL$a*7C*hOYw4aj;UQMg5RU~|cOqfq!nZlv<~x}`$MNMiB<gs2pm
zSFj9+36c@gMSS1oLpVuRez+@d$%k@fE@wsEPmKPZJ7!~|jghQJZ#J5r%nQwBQExWf
zE^07sf+Sz55}`whj7M(~T6SrIWCpE6X>SSbE^3u!g5=7UjbmK_-<Eo)K80`MgKEDl
zMN}fHa;O<aR-(I_ggixMqJwg%oym^MmJSYYqPx0;BE?x^lRC5##arTo`j$S9SfXzZ
zLY|DD(o-O79PO*fA=c2h(i`LgKk8S}Ln4`Xr6(A_1MjEPOZT2-Yo4sPoEwO@6PLCg
zn?`)e7rM);-VU^Png0U8yR_@=Kz)~Z`JW?2rT!0GTk8rY;Foqqo&kUH5I?k5s!EqO
zo{woLw&7jc_VVJiqaV6%b@jJSm<~JiHlWT+=-@fn?5eCP2|C#PFZ$n+QQoEPQ`5Qv
z{H^5>JF@o3%e4?Eva!F7+Zrvt9aB2PF3NhFs!NNPb4xDUn&8i*Uo!o81E#}<kxjd&
z0hhiti?u*9Jr8NVHA+tHC@KXH9PTxOznFl>A=?_pUyM=nEc*WY$2z<1ZnPVRo3j((
z<MV!fcyPYI-EIvQTzyZVbUklxdB3>u^-gP=K5TLq{PX;9n5-w{cX=^Rj`1p)eEW8J
zxOpb~avglX?(?**V%6dCaCdknNOacW^}rm>vgiDwf4=_P=lx<Y*lPrZpu_7N?JRpu
zpnZRxC~NI){5MfV$J@)prk$UU_sjC?IZ?;!qh$8<!Q(SBGO}XI&h%>bS;y<e!^!#k
zL)W{{-TnFcT8G!mo8|!h$BPuVqsuCv_sus+z2%8HCcn4KbHB?kENjtX_eE3L*iQvF
z(MfMk4;=|_PWS{VZ?gA3nup_YXLVEA1uL@!g0dnltZwxJr>w8#H%0B^GJZ!!aGSQB
zOw_MDQLxW#H~ZXMVY~@hu891nJC7eCFCQXHq6^-%uQcyQhvVn(z^A&`f}8A9rh<3t
zDu!2@;I|8p!*K^9S-+$Af*Z5v_un@~_*<{L1%ms+-q*pF@4+&DL(LwCezZLbF4zO;
z^E+{hH)o*^K59>ODy<q%bqXJq(@Wn}_BK<}TM_UQcsPFfuJx2{tM~q-{tgdqmiX<z
zJiPo@sOCSXm)akRvi}}s{?8sxPA=lVh3g+r-2dLz|5N|}uF3jg?{DGwh%93!=KhH9
z`|w`<z&`5#7|F%{*OB;Nr?DTO=bsGT|GOn-V)no8y~J$HfA#-99L!w*NBPfM|I+x+
zy#GkC|KslaA36U$&d&AMahd%i=QziQN94aOvvYl<KL6{D$@W)|o#S5^7thB#{0}b2
zM+Wo1?Z=$i|3)IQe8i^y<^JXTOP=jNe+U23J@&tkyy0eN<!1l4q~I(c&aBLA-2cRk
z{d>TWd&Y_PXB}nrr#qf|Xf!DiG-7Kov=D!|5LntKA>%KoUr69+;lOPx!kV;gDqIU1
z%5<%HD^{(|tZhQH0IdsimD7EP&Z(6~W=n^bzYa^kwC}HQ1%sv!Z_@32JKpx+9d`a)
zXWY}@uiR%{uW@}D<~ACMLMBKD0+j)4%@407a?Pg@{b+Xyn29rM?#xE~MsJ}JL40aj
z>@^RWa-0!0UdTqB;=2?DCg*~8i9-qL9Y%7m_1;4}X+aEmTr79~iI?eeyBuqzssuOu
z%12u`bY4D=@^jVa>n-o9{SVppv;BL0@-}2#^z9in!m$xFOq^9(n^&vu590ktM%eIS
z+4G!!mt5;Aso1guo+4X&Tnu(bmm`tK=Uwi{FKqK;R-R0UpN#69aAe1HCzfWeZ03WI
z4c;`(iE2JUU;lo(R$32x@`nmnFV}R*euS+|9MCvAHmh%BO8wo|5L=UuoI9>+#%N=^
zq+>t#9ATr=sj*exCvUYBCxO(2ZljBRt~FKP#PPOfvd`N1JBh_fczJ4<&J+Ixyj*hR
z+|tU<=J`(SdO8Aee$z2l?t$|FrR<OXO%MKXN|k1LK&a!z_U$;FuL;w#&(^~qSrLKB
z^=M02W$Z5z@TRK~L`bW?TMw6~B-!R{w=^w=1~mw#k@Qk*)FVT+8gxdDW%jxN72U5L
z;1xe}Y=n7ISYtK#yBU>I0%($m#R1NxAl{(_Lgdz>;L2eQ)yUyKz6h853EUelx)R2I
zyGhDbEDXvsjJn|0Cfqmh=l8;5#FMP-T9}Tf>ASZn{WmVGTNBRXbzH9G(FclG=jIRx
zpMlJ8i0@;!Z$)2jnepCapEloqNssCq?7uJ1x4tHLG`^~2Z(uf7oZr3~FnDyFq~6}0
zvn5}Rlw6lemp>(s<enHxzjMvM_|a&b&%O8Z9Wl^93bH8r9WuQZblfDruYpzuMg2}W
zUUUFSbyNCYh(%Z~+^swFdOh#`-_K84{awEaFk_g%Hq76wtIQB-ELz+ycFbu#$^N^A
zk7@sx#EZ5#BC)_lx|^S9q?HSS7c`%LbB?o;Y0}7)CMDije=Y4?uhyc-T)$dvgEepK
zyvga|WPkdk{>_j?Rp`O&$X*?rhF7*;+rCx5^QL9VlfS}<SD6l6_RMrS-`jk1Kw^{U
zJa$!APy@nYtE0<zPXfAc6e~m9?xfk>W*5-Dl=?&JSE}9+9Vz-lRU2Fvzv)x_K#>ny
zH|vaIx}RSdLEXOX7XtHq6$kyGT>k1}qoblDv!neZtfE_-`?|Vq#<#xPUDj%a7X511
zfenHCh_p7VqxT)A^m}#sBR|)#iXOumi->imW5re1b8~VPS*#ww+1{bgU$%#bWn{y`
z`#s>``=fC9X$9?eLN+x;S=pcq3lZ$8c#TMH)WR3BaSbP6U=CG+>LP${nt8*?O9#^h
zEd$Zx7%pz;2=YD%kpft9CPjCb&A1J=mC9b86HaRu{QNW1)64PNx<(5rBM9mx%e!8c
zy{AW8Y^BFJzY>~`b=!^7DyQlhRm)NU(lXt!>J}s4(UPONT4a1o$}pW1IFr7$D;SyA
zv^H_Dvjw{R#3hMTY7aPTR1m24w-Uk8_*$xL0A1Hqt|(7qV|giGQc1_)Ku$4h#i>?4
zx(m;Eci7OZ1K*dnNKrW!Axe<!s=2t*@2ni&T414;sc6%kgVa!;w=I&*u*x;4yM<SO
zwu-&tSjN<&3p|o_GpZdb`Hi*Fo`X@gJbTH}v$Z-Pz%x6AQ4Lw7(P1hQrhVdqzEohU
zfu|PoEGvU%)to=vvj^H**3gx7!Ad{AU{h4{{H<t#!4Pa5zDlRXIh3@v8F@y2SuBGG
zAy|gl`@lKoVq)6)yrc-{J8^bZQWEKujs<ILd1{}W5VLN1?45-MtEjx8C(9dbn(L=b
zW^0&hFY1!XG`zYO?u-S1!PeQrp)W6){t5Z>ku=<=yoQH)C|OmDbZ(dS)SNySDy(^B
z*5z4KVSI8-C)vmiY1rh}kf6QTkS}3oDyk*PL%=LA59-+|t86TG=X$2c=k}~+&hs*1
zMQuGNuSqA)Ron*S?6iKZwyw#^C#*}gh<nEKvi_TW;%1L7J2aviA7ce=miZ*MI)NMW
zBBJ`|8<e924<&$d*g!a?Dgp!Hxm2kl=!#9&5poVAVT<i1br??cJni7?yR*fNHobXF
zS$#wPIJjaeJ9Twur?ud31@Q+`dxHq(kO3Y&&Jr5#=4zT03AWH8?Cq->n#!+kWvy$G
z{5z>Ysx$cqEwA=Oj<ca79G9hOq?5QMWy_f?Ne&`ek-Bo~zJ)-OL!?KI2G3@-^NJ|J
zie?ec)~^U>Lup7X&AiMcPMeUMg@MX+X^em1$NMJfMV+0JIhu9V)HNlQ)yMmr2{H*4
zCABTKzXJ@B+{9&rTMKO?M757+S`edLwt$=Eaq4rJCZ@!97%nZ3;<K5AzJSq)mRGg5
zSEka6YdReoTl&&-eE-IlS4K#;(Nfql=CCkXm0mez0E_u6!83n^&5V{yns6DHn?vCu
z!c@PeQ?dwWSv^hl;o#n+HE4sRkKR8V)mB!QVe7KIN`GQfg9!;MygRQK`QE_|?3(86
zyFw*ox_9o`9BKBXF8N7Nx1QA9H-mX2qYG-J{g9NI$peXu?w_SbNDA73FUl<~o}QM+
zaam6IWs2Qu=};rZ*FpJYcQ6HVLLH6)5-Gzzc<JfAUsHZ5M9dbcu+JMU4}G32=*Iev
zBLdErBbmx6VJ(SZ*<aTMG1G4+*7vJsO+A~DQR`Q=-tw3l5#Bjr(13yybU_%r$*kLt
z)?f=WbV(BST@hgt4ZN}vvTz7ho${=Vd@h6CQCoHnu9giO0;|d{wIU7NBVt)P5fEd8
zs%SCU#9*&Y35|qPF_SSYWpDIEKgpI9biu*#vBjm<X}1uJ&hinPp?Hap4i5IG^PCcq
zawtJ8!xGRjnPJ)yBj@aI=vZ-jt#s*V(`=ps(C4n~emnz*{XbpMsALfVJV^1XOlK>Q
z9c45Py8Ld<^OUzVsW`TmjwpEXJa<M$bx%>#i0!F|sW?BUD@;dMCJmV1Z$4cmcdZO}
zAJFz`2eSBu93272meJ9AsD!MO4C68OWoiInPWjy0VTi!;<SlxdIq4d;Fb-gOl3^kS
zU4aHkpCe6@^dw~;kS0m0Mk9=^fLk<797vyF7=UqIkU}M1hQR=2mt|28qolztNFnR9
zroqimq3DyNF_i)^iDzO=7o<=IsfTINSmvkDiZ@^==cmy3k<*msr%;O*V4&w0#~8L@
z>;e;{0kq<w7=1ttX%+>zXpA5rh7^lh7z;2W$uI%q5x67G0t`b1A|#WlhvC!wl#XTS
zQ=+NO$0ijo#J~aO%d%*M2?G%lNae#+Xn^_H#6j9&#x!yH_R^aPq<}C;nymtRDO4))
zats!rm@FzqkZzb14QW1XayMm=DqJ{*0WhV2S=u0hR3!|B1_5|b0GmWA1(%6YiV=pP
z1BZ#yCK+Z+qabq`h(VRl&DeJj^rq_j1@tECn+JMR^sN^>0K=H_A9TXT^B+{gjDX&x
zeFH#m^1fi8H*w!H@I^lCKHptEtV8A!i)Jl$s~E#CY0HLYEpAJmW-VsRg+?!aOA**c
z+eZg%qwkXiz5v3o3LXM6B=g;c!}bf@6~fF49)4qtOI_AuC`w=EV<<{pHem!8xXXn}
z7PyOri30iP`#vh_J_sOw!j?bAbHPEJVK&CSY>j#tKky`}n@PL~LxAQi-Y^;CUdACB
zgSX%z6Qd(>%a-OW&M+9mO~xS@<Cm;MI>s-V8r?AM{3gw?bt#8<3@#amK#b-5CcUuZ
z{3f+9tNaDpzFD9SMPDLNhoo;8s6*L@NyC`9g+}u=zeyu3JikdP3_hP*A1?LI{{KB(
zxY8KE^_cvB3QIjE|ANLIlm7<7-QWYt7qc}s#Kuk(>jOCfr#gOH)Dse=nWD~(f|HO-
zLJhZKW~dcpTE~jmFk0J+)X)`_TH^I_OKM6sJOEyt<VW-{AsaE-&?KegJ~1=2%*|*f
zC<an3sd|5Nfk{3;@6W}FmJDK}VosrHA-JJ?D5HfAh$`vg?D6K%0Db_$q%+7uOb?_5
z!UdIx&59|9P@>C|mXfAPsYtVTofm&nMTxxUrSu;P9eRn9Q7TzyK~)h^ky8<~**4Dz
zE{^H{f)YuAE6Oj49tqlf6!Q=x3za}YEc)XZ>S&&1AhswbYEerMB4KBYrw*jfG-sY9
z1_gX%A8?@sz>V_w56UOVDN~^~n`gvR$5IDSXAKHt>_?|VhEarF4#-1`ql?E*Dqu+W
z!-DKd@ftKk>5UP{SkY69N2M@iFhp#iN%;ZQg067XgGAbhQ>Z1zD#iVwDH3hL#oEW+
ziojDC5W3hv$*EVVkK_%*B$t@}vyr8+{-cqlkYCYKvY0-KkAh7$;8Yo8B$lnTJRn9C
zdnpP+4q2mm!gKM6wmb)(R;fJ0Px6XMmYj}@*~MQnfVe`}h&?k~+9VnTQGXvrXCZm^
z5%DA5uujSvzmQFai}?|M717lr_RNYHN9C8j@dB9@4^N9J#$HN;s6q&-zRKNT-+Cul
zMxzjrdZsK9l6uBOAfphFe-*0FE)MrR%UgOC^9sE~IlYto^8MwlI5kIslxWgNtTp6}
zq9xdzYV!OA5HkPri+y-i%nh)N1mX=nBW}q&-~?%hHlnYJ#IMTLz5G0&v}}z%fCkx!
z%}@Iu7rQb{a)@b{oLZ*s(t_wpPT@eqp^RipBF;3oZ9gmt#IVqH#hghdp?tm254<~k
z7!UYBqrT646O<sQS63$Uy!EJWBjcCLlKx|qOF}*K(@k|DbC0JCISuUIh?ABeRWZ6y
zH57)Tl4XkrRuE^$Z=CuBN^KE-Ec8riXXr^%P!PK<*rd1CA|LLW%!~+sdgvY+2Py~J
zC`p<;*QyZR-?eD#2`#Uyh*e6KrZAi-fps-S-yp^zrW{)O8cR5V;g{dkZS0r7uonLk
zn=Y*SHYhhCb_D!bpG2W;{_09Ld;lJt;g?vmK$+KAtnV@}vHSjEud#T-;%^tM6WpZ8
z01`leApW_VK$4}=OQYw6ISAmJzJU9&Tz$29%=6dPU~o5ojebHbYSaPYcQ8P(M|he(
z+idm!e6rcHHbq6u-Ar{3LI63ZS%0kGG;3l2E`Sygwoci`{;`6cD5o=|*;B(!9ndJI
znjk^$q%Z({K>7O8r?5+Hvp~@698V54DKCH=014nmte*Tthr=LdAZAL6e#;IVS7QB%
zf)jEh)-}Tx;(0V!j7B4t5#r%V@01C?nq;~!EgOCfbAaCIu;I1P9(by~fwy6`LAUUZ
z<Q>|`r1Xyefbf994gE~jIbi+_dy5=^?0+B75j5)7j@8)t$atZ?!PLoHO#t<d<BsPJ
zf6LWn_o)h^3e1l+d((S1<J24KrS?&7Uv^JrpC|w;_x+oK8zGq9r!2{5r)qvHy^|OI
z8@35WQ9%>E9q*~OiN_xv0oM-K-ung-J_y(RaD3qIo=BV51{)+B!qcq#5QVP$!qblX
zVAF8>zB%v<Fwd`@diAHm%eD(ILR1?NXKa-&XRMV@YgRhvZ39hR&C~9U*KVD50j&Y8
z{sf(w{c`&R8_NO9%zJ7VuG=_c+BtSV-w^H5Zz5BUp+_1Joph0xT{;^&IXcxgEH`K)
zTG6g`QNRB2@HjBuH_5%u)D}5+*2DEozV_zm^T<luUm3a1NN>aYy??fHz(3I1<#jiP
zsJ9JmyRxyM)Vx`a$jjAM(}hlm-?G*^<~29wvevLRyT-w*?uAT^RqZkKn8mWD%B$|_
zLPw@vsZrwF5O=QgM^^1Bm#(>6O-sE+t+epVn(O&h=BeJCK!J}@Yn3k7(!6iIuE8l^
zecArRnX9BB&O%-Lsx^c5p6-#&a`Q~9@tWZ|DZRbHn)mtl%<JFRS6XcqyxMJ}9!|gI
zJtEh5JP_9wdGXrnu1wv{&N4PD#>=i1B;PGO8uZS`wCv>d8r+V?EYEk^q}QaY)pu1r
zsm{u86bA9mD;9Xxw6oOr82U!*+^yQ^^cupT<iZAIQUb9MUEJ{{39@;eM>d`C9g8=U
zxXm(H2hoPZsRphJYW92e2+On#^+EuAEP<>;^pi|t?>tX+=DI(qJSSpUF-;;J#Y<cr
z>;S3_Cb(EhC}zm>XAQJNnOdebPT{K->N1(GW7d*3*~===J#$#)VSU{UNrd5BH94&m
zBRGX2(OJy1=t=ywBZjVNUoIz&)bjXDHF_f~3TYZlF!QF1-QgqSBf|zXG<5b@OpzFj
zXx$tbeQO5#R3V<uHeJn9pNO$KTrqmNJHF2rs=MMW%*bRL49O973?>7~1o&s!yt}X-
zQ{4JTS`@~Z%FcIW94eTAFF_S*f_}r8NlGe*vc)CWWy`;)FlN=CIN?q&&08eRe}2UI
z)g@QyWM#D*atfF->Bm;S=JbF3YHVQ+%ig6^!tztIbV+)v3u+12qnFeqiKO%Ooq50M
z_B(}oxldN0^qu^s6YWT71>l58_jd}2?BwZO?1cWtZ;x4xqyb+JrUp^wpV$faji(yH
z93lx^*1xYawG*e4^czX_rxfH72(19j9Gnz*cCg4!{BN8SpGLq<!P)#hbC6Py<RIFF
z7?dF7ph^R1I*q@fDM86W(gu7Mf?_75h8gk)=0H+Ihxt4B!%#!Q`PXz}e`9Bc8-%&?
zkL`s0#vKE{^Qp(*`Wtl&6apA)00J=qHUa`TG`NKjZ45ZVrxhXA7|5niR57sRpDz6&
zLLdwSu!(U)z)yt8QNVscaCX9J_%nARq<kVnf&1|ZS_l~uLfGF{2U$Q>2#mW^13oy?
z4T36w$p4?cXoGj--?pNS^UmZ>tj_&Tv(BPUmJRd`qYbhR(G7?VpAEZCug=y^{eX^7
z=bt141R?Ao?7-~6|7`~X*Mrgn(}UIne}ip<eF5VJZ~MfD%m>2<|3c^v?T+gX=Z@wM
z?vBrmzz5CuvC-gz6G9V$6G9SxLIQ{Je-*;2Ms6ZnfIk6qh49TmzW}=jYxVaE@bb^>
zJojG*cY|;P<Mrna;PwC5PW-h4PCFSoM?3BO)BW)R@ceB$Z#)0PKC&UR0k`3}!TJr}
zp2q(31egwj4x|pO&ZmrVt$v#w?sxqC?Clq)48N%BafdgaF~Nby{r{f%3nByzpT(aH
z+P?Qw6m};n1h?TQNra96A@hHY=Mz;o?r_I5CNQwL&;5*Vmc4!Al<~oQw;I@dzXHBn
zF>Q3T^^T;Dro5T<w^Rx2CKne~i(B(7hX`U-Gn`o7F?$7$nxyA8)|^_yJTqL^NG?pD
z%)al~*}7FBeq>NS62KfG#Xja)b1L_OnfQV+H{i%&iTZUuI*HI~yr^hV;IzGEU}^OW
z{+LKZjp?qq#`~D!9sa=cm)k5<P^Av6&7_2PZb$QT`&p~PwJ=9|muc<CIV?vl<%%D?
zb|+>xn^`s6qG6+08H4b$xA&N+P1M~vffoC#QB^@XqPZE}f=nZv-%?I8j3vk>RZaho
z+V(hZV^W2YS<S7r=!R2+sb+MNi<S?fJtr@W;+{^Uv~t{^+SJT}u>s}Q7~NjRx5cHt
z-=%@N-Hd%s?Gd_finmG1mg*bhi6^ut!4)UCt_-b&&s!3%(-w2b?9YfNrmpaJ5$3JX
zN!sibT14LztAAT#wMMU2J=Wsrkwr^&j7)nkKogY@5ZIp88D6a)>DgsQdSTtqp*Br(
zRNH<0xmN9r2JM^P<ZB-AglKYR?1j?VBj@|Ik(KJ^t-eRkwd+0dl-&%-`|vsCsi7Y^
z9}7;;*RF3su7xRXs^;1TQo4i}J}=pwown+ht&g33bKBBfDWLgdya?ul)rx2K>}E~X
z3ghQH<MZW`(Y5Ms$GbfKanZIBM)AtGG1|rAF5Bvv=K0Kjb=_)l9H}~RfdsimG=x2R
zE4rQ!cDV=a*kDxRR&vp(I9yM~>pOoa>@C7GzgN=lPD063kJD?KnW$>4MWV&B6@*)u
zI}!oKS~B7DT4qip{U+qjaju%h2lOAs^*TPN05KEKr18=Pa?Wco)Ba<Du*fa1`z2W!
zgw&_t3tIIQ_&@$&`g7s@n3;xaSXk-@Yg~Np;g)3wTd7`fx7sm#m|5P>JJ*_2>jIcr
z*)9=gzi-Eu&ow&S*OIamug7H1jgfa7$<u!hGK*J5Be+r_1AZ12L=FDIQOiLXmv^BG
zrb%OP>m3B=Fw+2mBxGa*NRM=UE`xE7cH+B+7SFmZc!Wb8lBAsMW6+4uvv?@`wAk28
zMq(g;mm;eLB0L3<g~!p2)64tn+R0W+QvAR}*!&E*gG_(h!4ozZ=s<wu1(fq%_o{KY
z-;{TQ9o^<ai(}!hBx0lUx;q2=*kymHnnhGEfe@bT(=ERJDUZim*zUu>bmbj&sSR_u
z^n+KY3nbOV$Lq*m^b%OG*;T!(%o=yo2DhGjNMEIl6kM>uC||g%Mj2M|!csg(s5A*0
zfb*DhIyYykILW2iDvXa)4qgU#v-w5wSzqNyWQ2qYSrVC5jB=otQXKaI727#6HYx<g
zEX_8#D<2tdDjk8!7O|!;+;7@Nar|<_nQxWlgfRNUBeOz?1(^UhGx5t;a!Sg|@Ht#8
z4J(~*6%GLn{ZMnb!3~~hFvjU}xhw~SmJK1u&IpL&RTJ3GpZWdICCm3?s4%~9eiLh8
zp<K%N63<V*z*D_+h<Fd}p-Cd+?%d7*!}SFGx_Zd09o3+|cr{?UyR+a}8_bzM7;Niw
z&8%9?e%5%4>(u8ftpf5u8laqfLC>%aaqgF(630$B!=E)*45_D*z)d`J{NVQWH<8}p
zL_t;NLKuoX91;5lu#BbRwk~XiF!xDkAMRx@lp@cTef2SC=Im?KdyeXYkaK__5R_AQ
zBS(tj_$fxd;DBv<i?0knYo>v8&73`8LgR>O@};Cd=0FF%EspLbo=zv;szKwicZKn-
zf-0h*KogsbWmA|_rU_41@-tUDgf;mZS%1V45FtxS`(~SdKCLW5AYoXOaeS13)z%)%
z&7EE{&cm^maWjtuy}#X}bgG8ln#AD<H)^qWzjhIk+b_)txoF&WaGX>Ud3c=gE8ZJo
z|D>!ufdC(O)4lyJ!Z-ow#r+e!bDGg@w_$X|5DuqdzLt5UTQ5(KgubS3d)wE#Z+)xq
z^O7ctLqy9eu{NwJBF>(Yll30W@>sKlYc&P=1%sM`CM#1lER@Rj)qmLYs_6@?*P23Y
z(1AmaMF?)>P3Q=78nV_R8N4K<JzlP-n+GG^!X_Nz6brxDHguDp*zv{~&mzpW1hozK
z1ES#+SvE;(CO@3Vxf!YhyRzYu3!!vAzeyoT9Rni>oz$?A!+q8dKXXL8AsuWDWrch`
z8d%&Cl=zu4U}olgxiJ<)3t62!6>UMpR~B%$XRyh2ssw!>D%Rx+r+;Q#EG7=g%%Int
zK<smpGSpIDXr&z-t=(85@Vmv>T~4YR&9*T=@-j-xM<K4rK<=%++c|#0_Sd4J&E7)7
z5BqV=R`sm+i<Zf`rk#}PkdzB{v#HK}pVmUVN?D?qGfpw`5Wk*9*p8JS^sHZMp3tJ}
z?Af`#UiIk>W0|q6TYe$B5NluXe1v(tAH1#|w9*7KGl9xT{IcNp7jvP>&6)_wXEpLo
z6UE}xc<GZHfwb0HJ*zIw3FYoE^b_C5f_7DXk`0JYU%&6XTUF`-fxNbp)6!({Lisp=
zzoqTMqcJw?y_ZM!WQ&mTIq!#f;<>zTV#iW6)ZGW8wh$?TT*!=0o2UCJB5xB4`nLFv
zoVlaR$_iC&&N5oJHavC<`oaKqm;*EAf|^=vJ|io)S#k0eMm;cCX7F|#o!ZOx64RO=
zw?^Y+aDeQ4r%)|_z^{eacR`_|O5><C5sSLhi8ir_@>VRsub(qWcaZ5y2gl!m%seX@
zs~?>M_zPzvai<+-KfhB@nD8|N+=3K%^0y_Ob&A5rJ=t7DMkFH*Yz$;H1|=_-psipt
zW=u}6LXf=ZGCA=^wxA|DYntPptm&Rh(iLlpA}8=m)qfTAW!+FEHK=-N(bmd&8daAT
zdv1qV4D-pJTM}`YF5c~l&$FAJDa}_B@|F>60ut);hd8ExE^MDXRWH~({pd4}hzzCH
zm~XJ{dvlVNO<LE$s<AkZ39TUO>(C)s^J(y`dBUI*(qj%LxLoLA8}56f<?4jmWwe6K
z*2q#`n7z)lmp1rj77)G0lm>SjPr`31%bm@=m|KUS>U86|$z?4X{D23+CT1G7wmGqH
zeAIh+w-e^1OT`G-uQk+I-N1-O%`n6%RllZevn_q>w6z;i7Q|=stu8xgX#32j?<8-E
z@dVc}=yvj@M%F{kbIfV$Kw{r4H@|54Tm47Mi)`n|0B1-&S4r{3hSl^m%!qjWF9eyz
zOV8v4WT=#}ks*pp)6zX4cUwz44SFR~N9F0M?inHZ-`w4fmAt+hJ^j*T=>(gT+Y;V7
zE1WJ;XE?k`3~QWRJ!uK2Lu0r}Tlg`&$19<MLzyd<buMndI1w1e!)!BC`32}&Ocic}
zD}`A|_A5%Q8SIPcR>r`l>nJNq{z!$u5yc%4f_~6S8y!nx)qB(_^W8Q}LwWGj)S$EV
za~2k|W2jb?jUup+yZEtKwbNA@etYHHAH1Y8i>6gUs7VckUK;p9g}{K4+SQn~eRP*&
zVvVicTEv`;90mIW>|%e(Hwrc%3^kyh7!>ZeAF99}CS-(R)<>-n3E*Xxn8Y07GdFaX
z)IycLH&MV=!sG?O;|FaW+Ver#ginVbBorG=ukm#~2~@eK-bY{yUmCTNtWb~!l9)>M
zO?<zlD7LI}@qtVot}0G(nCs2g2%OBB-W07-8D+##+UNHOt%uyjkrPQppjt00$L2l%
zxJHs}?3`z;EU-2ulcGlU{!0s0YjVqMaVIYi?FK{zVQx*MM;<sYKkY_+ao`XuX7tW^
zHn|sAGU?*;*_%=k3|U_l$w_E~n)#c@u7)Qu5~QK!Q_0xp2U3wX`(D#t#vQ04USd+C
zn_}kxJ*?vW!^5Aq-Ur29fmdLUJM-Ir_G_*93cFFgl{m-meq7VG!9|-7k0AYMvA%p_
zw<r&4ISG`(l{GEwTulMT>%nN!Ro-xwH)QJ-H!)>($g8IT#YGB3&}|J{JBt3;Hxk|G
z3;XaxX81&yc)#kkZSg|Y2vKAQ<$iy}6CDPe$0Lo~#y`WBwZHc$^7=84+ies}MYSwC
zl5_^)9lD53)#zW!6@0<eI{Mzk;_Dt;1;-Iw44Xqzh;4NraE6i2zrm*WDSI*1OyA&U
z=b&Jc7JfzGUl6aD9_m+!YRNEt#(m-Kch&s1T^I2>PynTd>I`{aM{`0JO}0X-P{tit
zqLvq`C>M^JZKLd=nH|6EmETtx345{>BBYg{dC=QSJ=oi8kT7cH@jGeC3WgWMuG>eM
zk5iAMfS7^VCi~>*db*Zrl*9t&(G8YPjqs!@!(WVU!HAG6H;10Biqy%_I(wh#d*cYj
z6~sF1g*eN_tWYn^DU-e*qtr`Xu|ZKWD!~H1uq3s+7&a-J%+DPrCXUYWhsvDjA!@?7
z4iHxoWblBbB!$K;I#bnP8554>Xn7?)5wnkK(Dgv^)|ene=}VqC|M<6FOgI)?H7}|G
zlSsT2T~==JO;b=ri~IMU1-0m)tegi}$c^c2An{q&8~f!9eyBw3F#sRaP8M!hOHAUq
zU5FYpU;#ai$H4x?AMVieGV3I=FW?*f?^k&6+E;z39X%>N^;;(JE1Ib9fw<y)n5pl}
z7fO(_Ms8kP@GzGFo8B>zMQI+Tp+&gvrCG>okHkS#&}d7&{yk_A26QYjxfIa&Nz`a{
zz2!=TxHRn@5^!$fVff6f_`gYm#z~Q>bGM$41>nmpdyK-qba)u2ZuI~9F_uFs!P`Kx
z!TQ)B)LSaFS@bFi=d+QXAm=vwOOm1Em(O9Qti%pehZ&Fi4x?lj=+75;**N^5aLQ<X
zm94FFhDASNlgUM|r7rZ&V>-^!(ecc>63gzw!k3lB^o?|zRqfg2&Gas7`sF4pVT!}|
z>U)L@fU0_j*LW(A*;;C9+j?-DDDmfy3w=<a5oNmRAce(*DHSEIuqn}{h&M1$CxPie
zS%&P2!CRo!>&%morA5IIRGA2XrT~&*D;-*zA+khd@A=$kOpel@;nnwbb?w}ZZMb24
z?YaE-onKYM-F8OoAILmh3JDIGh_2az;;6O3U#ACroEsm|#_h@*jmR6V*)If&MDLHD
zID&K7T8~$a)v!(H`mE`)F<nSAP-0}@#U*$JGmznvhz;1&oq&tum*By9wY45?G~qFs
z&cre3Li>_=5aVbzo+{Y#)Sm8?6+zfKNRzzPso!|_G^HJeeo$0)zv`05V59RSWMR)R
z;~SQO3Mr;IFgUexu$Z1@nSL3$L=)RQAC0q6^X)QWNx`LI<|Rcicb;z_N3o7n1zo&2
zDHJx1R;0lIk}WZin{2T%h$5OfJr+`fdD>#IY%nY&X*h2Ye=8hoz5PnWEH#HG0n9H_
zT5L3$10-A(Ek70##e}aP=lhudCqs{wW|5VYY_a@Hu@XVNAs05nQ}N4UUvgHv<I)O8
zE<LZv1YRVx6)E23ODE4LYD7~8zC#+kseC$z-CSl}R$4)NC6axMWx(N4MCuq}IoWKj
z{O`1d#4N``Qw$S~RP;r-_(9BMvBnMZ+F9Dz(VSi^3nIT=Bjfb8#XnrW7<D;zX^%8@
z{Tc5HHJ6HaGS7~!1B@8U;ioLhEk#sEj&z0s01<doKC|o}*N?zu9=z;t9$i?XNNW7<
zn83&~#4)B_8-sT42WkOx?-3q=lkrJyq!YYd76995CXDh=0+-EXZ>))U=;GK=&~@fw
zv+8jvt7+1Xge>@-XP&uKx{@CAR4QKPcDYtlXR3S!8}8=$!b5Sy%phcX0yWjwDJo;r
z$YE*uVLDxQleRMNghf;vHp$+FlS@t-N&!mN-^?^#-ccExjzjVzSY+V)9P&%=<P<xN
z`;MXfKJ}K@lch`&W$oA%m4<n<#rUJ&C-ALsd>DO9Yj<@{Mp)dz-&6DHMS?XKB2}|n
zk%^S_WXxx|Oe{3J=DE94loe$T4_$XBe<X6qd!}ck6M2bNuz8675H0$3Fb@+=Kvd_(
z9TgHCZWl2s6*hH@@b$Ffj|FF$@JPn&S9POAHd6GHDTmKMtw0<cFA?LPeBX3DTf59B
zG`AO;h`!lT|L|GZY~)d{sY<QYgfyvx9h$3Ch7eRYUVZ@k{$>-+It^4U%P&>yYRDd!
zPckr-MpT$n;gE3fe@mZ6;c4Mp&JSlAU=ihdNc%oIzVU|?1E}sJ&*EOaLwEaQs|yPg
zSvqRxV8OVR+l+U8FrS(SNm-Rho4Kyqc@=?%_6ys*0qm)r73Nsxl)YU8B?C7vlWvOJ
z^KEK4mgUlSju8P1jUHZuYy{w~QylhzOVq%(FmwxB(#Y-LX6`hT%x|;Cc;?{gDiojD
zeMu|hw=FC^+E{8=xMg{xM}WwHY3Aa#0uKx?CBip666$T*sF9(eLe1?NBNPI>KSa7l
zAo9@#R|h%3ckB6(Oz3pxi@3F<*tj@MlL-LrP52!<4^0mv5}#e&@d!ZetMoB5+%NF~
zO_%Qi-HtT+PJC1OfkgyjqLBdto-)4>%(iq4ee{x3xd&V!zWsLk!LMHCidic%&Y1kW
z|Gl7BFOQ0coOf2Th>D_6^0?yXL&l5>-vjQJqq4Ho7Gga^(-u5a8i8r1d1HD<8apY|
zq_NBh9*d!K+aC-;T+U44?CBk=!Z=t%pD3w9C#~*wFQx2rN$P01lh<a-bps-ez0TmX
zDQ)JR4wIsMQ<H*_JIIsDb&MkpUT2%GOLEgy^H$Vt9vuvUtPN;CO?kQcJ;q7QWBUzZ
z@;Ph{Qchd3gaD)!w){$F*#&%jlvf*OE@o5n%+a5y{D0o=G8mI~;*m3PaA?@cQccMx
zZ^Y7Kh1j-e;NvvXhLI(ZaLOb}{L;~KQ|sDd&{Fw>c1n+BDqGHWO4;buGYy)LdWTHx
z?ki{<$^jXn1(dwJQn{Zi2eN_`TA$uH%6$cu1FWj1{V{n@X9y1$WQ+LI^%BF`it@el
z0PLqz2JNMF<C|>|9!8IOdTZ|nvYv#SfHL11Ccc>mY=dMmfX5~<qa<td^kw&`2Q*2l
z+ak}JE;jjXQu#PJwkAY=(m3ZM4V`2LgffsXTxu$@#zni9LhLKr3G`;~eT0^*OgaCU
zDW*PJy;f<W+?M(}XXj;gBi5X+6xa8JEfw28nL^h<b6eI?nOp_aISo@asz`(20qpce
z5wihiw!V|uh&47&bTP`3#;hCAo`NAT!pP9BPVU6mHk(^IiHhQ^;L)qUKjabKnp<}l
zk|F;j0P4jdC6f}VE;2g1wka}cYB*9EUz1kn4y31|k77w4>L+^>GlpW&#J+}0K=QdC
z_FA2*m5L{>Up;EO>X0HnfA-)bb)KCjkH_M#q`iTarsmyh({h3C{4#rTaa?k3895ah
z>+&F6yX85(%w(=2as~z}`E#kzep9o^F{x-gzvU&Ig5&nrg!`thu5kiHjq6`szTqWj
z``n%9Zj>zX47o-(%tP3&TG&0OoIe+BM)XGAiyY3rmp40Y+pGD93ObkylZXF~bpde>
z(`caIHg50){*p6hK%b3~|FN4*<r)7Z|LKWTjf1_yvR*rNn{*TU*!T|3ur{8CCPmKU
zpp@i<y>{r&-JiRZ&t4^eEa>%_kC<x9>GV+ox5k~r6WC<Bnx#2+Q{sXXIis6Dd&yj6
zbn7$K!l)0BYhQd8RMf>6dn=`M)|$*m_MJs5Z%#wcQ-(CqE;+wUoK|OG44nx7K5K~G
zkyCKevQ%5&21VdB*QFk3UW==yyU#eTfX(IS9ctwoiyHNX9VAd|Y!L4BCrMj!xImUt
ze;vq^=4ton>f_+-SUQWFh@TltZZ>3503RNP#XHT+4<g$p%V*zb=;PT%2TnUpxuAx!
zpw_rrk-ud4DKAGVuU=E7vQIq>C2EksU*0@5GQy1$J?_w%Nt+6c;>y=aPDWU##fr1V
zo2!w^@58H@;*UlYXtJ==rJ0IP(&I&~vx`cy)w46jUd;~WmaF9%ZN-Gf+8ix&>{cut
zVl8O@%75`*Uh5P$V<oq?ep=KJTUZF_?nlDlbPcP16)_s91F312)q6om23GB7O5FMC
zX=}t}ri>GmkK!5B;Yh0BU|oswnXWE4iHp0!m@utge_SU2OnYUYb(N@S53P6ik!q0E
zy&qX#{%ez3--Pp55qI998it40?SeYfbnzn8NHiZ=tW;K7lin1JWlpb~PO`KB?ZfC2
zGqE`#y9;Jv8xR8pBP)f#94Yt0Td-5w=2n;UXi>7vLPS|l#BybwVc%23beyhS9o2=T
zQ9zZ3R{~U-hHhfm;^dznN=&BfQ+u5^JjJ61*Q5hqtBMN_#z=FiQ&YMAf?|^~=VqM#
zOhg~fJG!aH9mUG^i;v!p-d)bf8j57U!B&K0nq;C*%;K-u7QprPPr1y3{e>Bt!Mw)p
zqX?ern-o?LjBVCqq_HHDV6VG3l1x9SO6#=~VqOs+0($F?uJ^irnn*7B5EK_&K3JLl
zSgsZ`z4O%UZ2k_X$(cS)X@`<OX(dmvd5l9WF&TP#3$?8#XsgYJ>awBy0%MkqRNN<_
zu0Gz5-*x7c0G`kFDo!GWl!ia7J4sVw*)I4qP1}wIG@hWRs81&QW5CgFM+PG>?#|Ok
zK!F$${N9Hb8n?#GTSFS1gq$~J&-|dtQlf>jUsU3f6w9$}8WE(fZ2S!K%kg=j_!D|F
zmP7WjL9t-p4Y0dA#Q~9Az`0a}a}E*@zLxXCy?t^;8*Xic0){Vj0y?}j`JZvrG7Os~
z$w)ET6RB<3XI4mLgdtSQ6KV`2A|RSCHlD-haaMrqD3qj%yB>-?>F^&ZS-d75DhdiJ
zw#av$70I_qXWy6u(Vm-g($Vq`(T<L|AErU|sP9+@TZ#GCPEX70WS6ZOQOI#O-+lK`
zzWTbbvO7FUk!+)@$I<L}h<m-Wq{atZq`Y+$9r>}WzB1*_zwgJG<Z&>NQi7J}Zr`b2
z0VH1C%!t$fcknt+{1#4*@xB^8BmF8S)|_w#n<3^&eZf{L{Dwaz8W2At!4YsRHBTll
z{Hh;I2=@}C26`h0YHxr~K@O@<VGOEF$)JTg`)oMn^MapGQs^wBRaT%@tO;Af<BO-E
zxPP&b*-)37hw%-lhmVVlITwBM-XL3|c83_B;;hV#w9**8F7xo^pX!+Mnk+>=jOxUa
zLMG|>@*7{$5<WvX4dvm}hXK+2M7|Dz3(31=`2b@&ez6E=I!~U?w<(Ja<@{jqRc9D>
z?}(ZHX~>y#|DSaqgL%fB!L}*7<;+$cl1Q}&&p_7)6F+{Q!l?iHqg`XvQ?IArCFD#<
zKiEAwc;e2cE4<yaUb*4qz0f#&68A2#<BChp%7fNkQPU%Im!p@~T>66P)D#wpqY5#8
zGM3xmdm2{H5k;+5O4;76G5Hy_2SjWTgymZCB2FAvtavw!<(iy?rR+?b_;n5^wGl<+
zR^NnvH$$9Sv4;$~MR3|YinJf+1D9Q>DACJs^Aa--0Sg`^2h-14S>sv~k<nx6`Wet3
zVy|hj(zS%vUz2=2XdHHwlRsN$+k(b8Ax+ASzsJ6t7*+^3)WxZvl<$1urSs?;lhMPY
z%v;d>6K-%DmYLXQXhu=>LD9LYUHvEW?Q^U&Dh;{Op!?3=g?{s89Oe{UC6&O8?zkU;
z)MF$umgUMIN4)n+ssgSs>qnwP2+aVmiZ+#CD}sD=vWlRcTl0C?RLhxWlFdCwMtgMU
zS;s}w1TDb)$T`jFst|FYZj6(tD2$9%%ylkP>N_QX<0>*Hk!O#yzH+z2W2UpAzF5mA
z#vJ%;jK<o((EVJ#-C@()4;&lNipIryA0=QS3L2?m6p)##VvWLG!Rh1Rx=fR(oP@%o
zx$V+R%PET09?1h)OUR?2t4{vuKZF<U^t`#Kf%J=)u4(+9c{>9RKlm~qg4^_E<>fwj
z;r#2PduZeW)>W_}TLD(8k)7JWY`2Z}C<P2^aCC@fMdmMMiQj0u-h{}TYa@G0X-z+o
zY=b#xuQ%^Z36=Zx2QKR_>3lD*%)hA@NbXtTeC5lXq{hZmNmWajD_!!z4%*6{C1GlI
zk7!a_W;fR9GOq+f!#eMfI2Fv6w1&9<((1H}Pu*-d6FxKOP{zVZs1+9L>y!WNp54HQ
zrpZJnDge8*XX#WW9BHK}l_p-fb!j>{f4Pu~7>XO$W0MA-s#SR|vDO%Ld>_DvEeU3h
zSmZ5G^z$yc%92T_Jy7x+gyd<!Pox>v@Y!qBIw^uu4h{WrztEB4ZHnS^OaW*LnXgT5
z+B_me(c5YQ=LvJHL=fo^E-QhyYmx-_1{`*H^>YZdDBD?;jkiBn0egxJzy54wXym<p
zpAV}#RHtEE#C5NZT(0KCFlu;d50;dY)X?#GkV>B&1Uk=NCX6Nw4~(ACv9ZEbq!eke
zs|0(t;Fj*uvFo=+LN~yui)-r6d*i!}#@y}>)xfC74?D;s)%{<*y#-Jl?Yg!bLV)1G
zg9mpD?!gJ}?jGFT2_7H`?lQQ$ySuv%?l8!};D_(~)?RY{b?RSx@2XRE)m``8GX>q#
zJw4NJzt7We@WWr{oWvuaVXRL^=#DIunzg$&wY%m<r0a@U7mq@)*;U2!c2?YX>Mj!M
zyq{~PW`nPphr|V}o-;z9Rj-Wf<uL8ptWl`x`~+^}Lu+3|8EFYv^)j~0FY0USXCHQs
zwOcc=D@OKhzl|praB>W*o`qnwt2Gsu*NwL{mvg*$ssmb<j}z9)s-<l$i^{GzM#{?f
z)NpUtz>aMtLu-p96%G$-gJm{z?fpDUE7S8j$F{Z>@!Fq`O00xO_OzPv`&M-JZN?Hu
zd>nn!M;7+Vtm4~T%EgyXMTptU06X)PI3H#k`37q!Wo@Da8sKPCS5l5ze_EGaIlz=v
z53IahMdGOC<ft$GrT(Vz#P+h$(!07v^QOw-4Q|R{)S21!oD!3I3x1<f9+|#w3u@FF
z-SMTVOVU!AV{A#2Ncp!OAl7O-)<K7D)QO8By&h*sR7g2x(^yY)KtJqkZS#Wg>Ovq?
znVE>=kd@9lWge5+UJ4c?lmDE@Rml<FGM?`GPdKSSU8X<>W&#gy-wOzd_^pR-or^5v
zxfI9S5VJ`y<jT5G)rDFE;Mi-;!{d?-J9qiKPE);H`_#(xmiFeAht%tBb4xX2H!!Ue
zQ7_)=hZf{@9bc9nP#0dzHLyKssVXK^xMe@Y%bX=7a3tl5*jkl|?7!)?9bt6O8eiGD
zxjVAlJwidT>hE5faab5Z1j*Z^ZiTbct+LZ~FK->$mVdJ@5mVMEonJfpCJHyIMpS|S
zmi=e3)y?imSG#t~<{*Jvxz2*h7#?!5B?^K*78;Iu_WOp~`_?l&3O(Vr8fN&Tlu@Xi
z#wkSMgfVy~#FOO&_OU32a*$lWQr@TFZ5ExyB-3T@3=ZPa_GMk=0TZ=Zc27q9$vQ}&
zS58)a`Fb#L!mZA8S!6<%{DTCC{wa=-LpzQ=Eaz`YQkb{yThoQ|_}}+{%XxelFX{3{
zuFXp_BqFzZC9QqkJM63#ssf~eBbkkA9;s17h;*-I`?mX+PYblfxbPl2@Lu=RjvhDH
z$-+{<5;lvZM9>SPOs3x{Nh%Mj@I6fAN)HM*$Ehfvkkke{Pjfgj>B<si9S`+f>s@lK
zN$F+j?wIl+6X#FF3IaVK(Aj~HnxNlMn-3~<X#lfLRy?M(0!?8iLb}Ifo;u}2f>UQt
z)=wJ(n5bn<Nb-1~AFvtWolel8QMe4#&YAaogq=>PAa7K6*S8b$0p_Tn*0+2T{`;t%
zLFTRoMQ^Z~?7cI2{HdVas{_mlKr(O{O`T43Ahs;O2?f5Ch8~bc02PLN|C<RFU@|ga
zWIzFwyBhQaCGd4V;>p{L{eb<2PN$FVa!?agolXG)6YJdyB?>~Z0p^$>OsEMHAPrnb
zYCr*syH>zHe&-bfa7h3&W}YyF>aDw@|Lecw3J7g0X+m~adpp4cY{GH(eZwaRmEqHQ
zMGGQ;&bSTO=L31e@WBGZVKb0`xNv+Folg896{r|QcfL+1D$pt_AFuzuA95dAN)Kq!
zp9<BT^6dl?Q1cxhd*=)SU#kCpOXn3e=yfhq1=PfOz&<pv=}QZB9&aI*`|kjAa!}lx
z33^~MDxb9fJ_=A1k?$620tM(n;C}OFLR#cPvLbY|3R+$OBm<R!0DN7_vo{$of2@SZ
zM|WRXzKS;!Ux9b;+!t3U;oWlrs0iKr-%dCKHDURPfSSmBzQ8+_Ki(qqfPHFED;(c`
zKtX1ih$q9`@#AoTdPRZfHeJP|owqLtT2Vr*NPVN-RJx@wFK4_*l+TlJPTEs#<S04J
zT)JapBid8Z`us@i?AH2lSJ?Murs6XVf1YPqCbe$mnP^9c)&6rjZ%Z8Sk;JU>RO9|@
zZ}gry+Ne1qkRd-b>d1|-R#whU?eqtrXpEm<_GxV{#qas0<KBXw4JOm5<&(!@g)7y0
z)k+EJ<CSvZnFyjK{0#3ayT;RbNeXZh&*PQ~GI98{uRT*wkg)hX`Jn@a4Gm3A)jP)e
z5ua9FwfT6Q$#cr+5qWYj9=^UPT;Ub2Ut7IYJhgm5^{@A5WUVPoTFL^zCUIMsyBd3_
zR~Wqr@jB7Gs3O};DnTQaX)S*#A~8Aio1Y!MlRxhmo%Li89`JC{jug@kb;CEa4l=Ut
z9n7jy{K!_Xq!1O={c7LiX-}hljcckiWjMz2k`}B9`$%Bk(Ittz_XOeF^6qVlno;|i
z3qU{4B6Qaz$_T#3j#11NZ&ND)+pN`on^DUoSNgiq{%uCAqXVOXyeM|5KPJVMF`UB-
z)!K;IG_82|Nt}@VbA`2G{rR#{?ktyuSBFW~?>o&)ShsfJRx(x9${ATHCd;RQw$i)=
zsbVtsjNoC%)YDoaX(XOGta0IT|Avht*6M)I=AHQVp}#DDbQ!$Upj9Y4t5uVZGQtty
zOHv6PT>Tj#kt8ilafT5Trlcb`KANn}cm#gT*uvby3O}Go9piX2FC`iW`ySU|@3$~^
zqXnBH!1U$vX#@?Hw}Z@2q8+ssJ{LuAWvH}*E1b8VT<M01vf>OE{tkIm=XpE|pSP*-
z<ygmfS`h;-=e}RGh-Q@Yg?69$;MBVM$g5(Bo3fY%CY`mHm-37dAZI)~LUIdw^F8mi
z)K~`zYsHY9oo`#1Xsb`|x?<IoaT>K*a+(!~r3JZp&~M5lJ3s8n;bGf#?_~5aWr;mj
z_Phn`cFcJMCY__x>!#KaO3~<Fj+14Rp0%C?Dm(Uwd)rcP=3tB@0fs44$T5X8mKTTe
zu+(jKh)wg~`RsB=&X%G&%~KKtjBT!89?PoOJ>5MA@wo@)N^pi`N$DkN^j9Oi0i)dF
zZv0Tmyqi;nV?|IL&96LgzHzGHrgCJ6=*E|l8TD3BPl$zv9<ENd9L=VX0H;NJjnlJC
zWUI$!S5}YYd&9an`CydhnzrN9_jurq<-5ibnL4Sch-)t8zai*9r(2(?d{B2GuF3O#
z0h)JO=b5b>=FWU=Q;v@2nR3s$m|L=Ibuc*2E~`k=XTc}8WOgRP>_dh_CDI&|x__N`
zuls}<nW6RLtw1cko?&Ll5G;Rg!7MQ!SEZjH7?YK5&ErGn_!GXpAGBTPa>o4u)~@lC
zLZs`9uduK^J$l_ojXfYfm8X|{>qWniD&>hWv$9BU(F`#T5rJ^2o3SK2lWfucoo6(K
z`#C^a22MLtE)B)9mnY9ou=2&(>$is$Z|cL=<?ho+{Df$^l<ErelJzH8=#^Bf_W<}T
zas`6IpBij<kz8Sz+c{dr7+x-<bq5;;#GhF3sp>Pslm(&mraV5lVv0HGY+iODI7Eak
z2c?@q)0T2U^=DPkE6Lk_g8SOZgf6b*3Ei_)B-!&-JV@4s_fd`l`l{uU+?2e9EiznT
zX48gT5qZ5iJ~wx2*NU>nA(XMOS+1Y@eFonWMw}!)xfFvUDH>h)9#|?0wqWM{rw9^Z
z57S+l--_&Z(J}~j#VHguv9yVX6V)8(Bhqg@D?@9o!o#^FTeYUqSTo!)B#oj9=0u2g
z54ZVzQVP7>n_~)ITX$b^#HxFEelHks_xxQ@;Bk*8@3mNheaS6dG&u7vWAEykCxdb8
zkWiK`Rc#uXZ*6qu!J|37Kyd5OHbVf{lk`b~rmbHEReGzjr(IDr!)NB^ykw>mHNo$G
zm^9XBziWNkj(la1@e;`+xm9fZ=fp*+5s+-I1aL}n@*oa%n!{{P?Ny#dG3+v8wtL_6
zT*(pEFyV?!qkPquZE7_?S-r+mOVDI&!R=&V!M$JoK!y>LFo1;~MA_<1a?&n&5TzWF
zU|ypS&rF){BuViQjcvoxtrDCf<yf$%ICefHw;9@vI>b^Y|M31U{mjMz@wL+Z2jjid
zHb0kXt7Uz%xkWkWhUN3G{2nE%CXp-Yk@SSJ6lk?5)e4W1`(QBQ81Z}XEz3&S0E1V#
zl!Oo!1M&AM&G=}&U`CA@;W%hosQ7?0;5Bg9gxhk>)Mvb2H!t1rnv0CLi5B-Xo}_u=
z`AgV6rX#s-@TJgpj_da+0Kx@tqT(#8VnxV-<n^~ni62I_>SPkWKgcMlZrYhW{bxqh
z^8i))ur!Rcj3lC_w5mGswA>6z6(U0w`7&pp3XDmVv^ww?@C84&b3992#IQ8$cs<6d
zlZkmfu0vR9bw85NOFt&IG6`<)EFDsiq@?R%<!Hs9y}6q?HxPVE+#DU9pqK7|{^q+k
zqFG@^yzfk64nG*13WFE#McF{?P3dN^Q{o|L6z4^0r@y|`S_7yBlmc?Z+urje=pz)5
z>MMbMN_F6E6Q{D@#5$}{{(z3h9>8LvvCV}1kb0mbN^>tYA-6CZ+8Zj!k4S6IZy!B0
zI{ZFOrLxQlCLNZKySY?4XB%UaJ;az1=RS8kOU)28W;s~HSC=_e%5^wZ%^j9b*10>0
zsEz23n2s1QGj%9VbAc<k(M&bx|6c1eZ<RGuvoL-$;p-L+SL<`;D5*Nm|M_;Xbm3=)
z5%*+r6TeeO+Wvm6*E)XMdinyiubZBJo~s`5*{up=5ud7u+3P{SOe$pc%)@6my}0h`
zzF45}-r8_CCnyq!$%S54D`hUrMq}DOXLAJT(nmeXI#$2OCf{NXFz?v^Q7|ymAk|WA
zp}JIjubQE1rEa5H=QD><a#PAvdBjjL&@%RrbbF;XePofoq_x&u0dBNQt608OSjnkp
zu<5XDsnpWpw*(`XWNJK3rBt))*HhJPOt!bKtvo9{#Gbxh)8nx{UYv2r*&eCet7EaE
zDsQv3IbB|}k}t(?Dz{#%c0n!o@?I-)@kt*auS<WJk33F4ZadD@^Rs{ILx#s!vtDe=
z-XH(XyK?S5t$)XEHJ3560LU(h@&kXWa_Q}G`Wc6{W3{K(x65jKz^6$1R9qrwzY7n@
zwhDa#eyW=5^0WRK=MqqsT|cmm(l6le0x|i<q$b-Adqp>|?L{%a09V#xy}}?NVnu0k
z>T_6ET4ovp?&_*ju0mJa|J?petMqMNSLnD(s~+2FDfUDlQJvv^n-@K?0mVizHdRwS
z;7bGHLTnnWTWC$<DT1ltLnlzKGIF!s2<-P5F}G1zes_BT&ZxN5LDkY8`RT{ph~7;c
zQ}o=uoLee5e4l@hW<{#@Wv^~|^629=^?YL^!VB)a(d59P>ND#z>tuf}BT&!uG5NZf
z&iL&yu&NRZOnYn%Y4+SrDK1}y^I)SXIXD5{56!#GW~uwhKN=SnMU<p5+n)XUSXLc#
zL3dNS$OMolD<<e#qiQVS`fXGj#QPIH-r8<^T8lEEo@sz<kx;ECSEG$=$dT=1kpp}*
z`H4Tue8z74a+eRAH<hUdq`c@X)sVLU8jo><iuhwSFOjNSf8i(3)fc71bg*E_=X+-z
zTo6)~Kn-3E3m+DTaS(@^8EH*EI?Z8Sr5nr@WsRtg=`70G`9P9gx0%b=$Jmcn_M!aw
z7U0|7yC;5xDxyX9wg`{l-8f0(*EKNn75~*ejFPC>>z2oC#G6*#a`AN0@zq-%89+w?
zk;4~1ipI~O)2>Xv$s}vX(Ps^IJ*mhZVd$f-jqyXiNPkKR&*Uio#n^x%2>3n~f3t0#
z$ZX_L;eMY9pkseAc3u#VNMBm&Yql!Z_{9Ov*(K)%+aGPQlky&dl+?xWc%h4Kzi<i3
zniNpxC6qnU4}@h#V_(H$@rr;GZZ5CT<Rk{$`064v<sP$@kpOE~k52MqS2Y+&Os8rS
z+)}=razf@B$h=dPu1>03R7Avr_(BOAYh%+8Dw4L;^!!RKlQI6j3Efj#)youm*Q2ex
z;iiJc=G=AY%^kI{8a&?YD%VsGl-`MNM7|k8^nyjf<5o4a1>YLQ7(Z)|-qIu_(OzD>
zsS0$Sx6<J}{>Hk{2S!N`SNCM&L#d9TqadAr3W)7a>l82~3IBkNQ3FJN)P!KsP~xp}
zM@)zEEs)V8d4HjvW(tEnwENx{0B{$6`35BA{?I6y7%O>1s#=@UJA?N*a2}mk_v^_|
z0=S`X!p1-f;p}~%oZwR2YS+OO604nif$y{Cy<ZZ4k*~@RJOwvq{&+Z?jEC1Ho4gnF
z-`6z~KEnmpA*Od$34q>$65gM~wKI8tK?p|u`U}J|gdu?3N|V+%!#LxMxjgZK!4c2!
zqEr3L)f=`p;pMOIiav4J!w*r!cGU%NekE)qTE&ATLEhp>;=8lEBR3MR3XPiB02!)L
zR^jzf*Wa`wo^l49!`i`aK(D`T$MGhHqyo!;nX$)%kwD)axK60w((p-vbVDXQ#5yo~
zZ`$EcD6bBLen+6S1dEwO*walRHU!}hVQ<R)CiM;)?A9e0d$0#GoXUa7fDMx6H#Q&^
z(0S)TR|j0?n-?f=^i^a57~jyHBkKT8mjY<J!)MXaW}=;m0552}a2_R+y)B-Eod^tj
z8>Chmj)2rI24H>Xz?LPmHCZ<YsWn+SOb^GFvPm_B4FUtgF{x(zoh#)5T=`mFOQ3_S
z2Xhn1+vyA9?5zAX3-gQuF$9tA@EjI2W<dMFp1l_JJ#s@3nXL;epcVp;J8j74z6_d(
zV1C`LdO+J+9(2m=KGy$fA$nR9wrq}OqZw9z7*-z@Ru2<aZz-zHoihf1D1pXw*PYY(
z)IAnZ7nXGqmU=GK<G86Fu-5$$R)ZS;$@}}q$GF!{D<{yB>D?@>TyzP)0-2v!v;Ekw
zC;Xp{B$)!v;IbZIYVi=)R<a)iv+Rhk8A#PExKSDQv-@BfmL{i>7+@xq-Y|Gf{zi^F
z7gk1Ln-i8sW8>?WxX0hgAa$6$KzD9^N63AB(-}Fb{D!|lWP<BJ^9}z@AumY#fHn&i
z_9kbSDN7#FrzD`1h=(lTgwf(5-=5MQ>Oc=epv(Fx?g#dGG3z`^8Cmuma#`Ob5409*
z77N_+y4i~nG^-toc+=0MS*$&DHT3%GNB>o<1CrEEw;;}-b0|Bg4a9cTM_G^>$ST11
zk_UneG6bQ4?7Aen>;gC5b|4GDJ`+OTLDIkYw<121y>1|W5H9E;U||Rjh}xA2*F2=p
zJi+Auf^Y`y2X!`Zbi=v&g8qmFA%&p3ztMObKqS=}R}pcJDE+~(0cIR=#9o((tHOtF
z65xaHjr7P1asZuoQFZ#g$$aM)Bp<K=<%fQ-;EN!D_>9pA=Z*4+3*l?Tc{B&@13Ubg
z`~$>z-=MtHTi||MAwB-3gAG~-Dg!11AEMC=zYq+hX!N6V$A#Q{iTQJ{3HK_o2M78L
zRzNib@hY74&cz&acBcO7fI||L+~}`ETsZgdh<t<wo(D7pb)Y_>0(@~UV@O|=cQS>a
zaSR}mfidKNNbcE}FFY_Ie|s=vaYY0{`Y7;Q0NGjEyvwg!!?Sb$twi2L=e|P)PV)=W
zGl4ghDMaq52aFiEN#=vxh^+(X_igZrJmOsEZ4+@-%LkTGS0+NKmvAK*sok(k=x)2O
zMA?O>WTs)GwxVX7q&4`zmZAnv#7$huKDVGHul*J03X11~$)@kozCDH2`ig1^l9mxG
z&IW}Yiu7KS7~Owh@j0*)yg$ZHKu-t0RVNFye0xr2*^I`%7M5ZwD$ALZkFPirRd^^q
z>-^6#-x8esO;@ZB$cVFb5qSdKWwi#<;%uFRlLvPxuYsgETc?mqZhMZNDm?{D*HK?>
zF@L2cD*ZBC<V{zOPo)uO=^`ARYfr@5tIe#U3V)|9Hy|@_eEvjnv%Vh;ShU*cW&FMt
zvT{o=4xto>5H;9QNjXqcbhYEZL^gQE#FQ}J^0$AErTq4JQ)#&F(|2i@9a6U@YR)o$
zB|B}gRqYzp$xojGcXJ9R(Z5Ie!=O=<2BwnY*eDt)_J+J=V4w-8m7Qi|kX?3`{G6Wq
zHHAI79(dkr?z)q;pSHmLl6EyP&{W2L?9sll@8R3Qp6m4x!agN7KZJK|66I3e&!cIa
zURotLuR&#18q5AyOqXt5#y6|wQJ7&fS5xL$uR38nTT_)01-33VH|EFJmZv&gs?Cr+
z{!=u4B{HjRGYaK7k7gKsniVr$DLT+(WqmOcB^op&P-~-!EC|z$PD`s?=QVh(O4Hfr
z$MLMpqv~q<mo<(5DfVP3!<qR*w846%`&wX~k_>6>ZOc*z_W1_9W9=xHGVzMsG8^{!
zdJ(YH^1T}1S$oc}<lav8rR>NLaKBObYz!vWexcNSp)7i-O|etGs!Ea9hPEp8W1sK9
zJJy%YUhjEdCUG+1i|0EUa5Z`rwReXkmo^YS;Nw|f`6B8VpRQsB(XM8=7oTc#i9T~y
z3B|f9*c7j5-&|E4*F9aUNqbP<XLLG)8Mkp+SI?=w5V)@<TOn9F8bpkEC=njxQ+&*q
z_V&%6*Z_I7OvvBLHauM4wjUB~%<iNL=<xa4IAk_vLpquYt4$UTI)6txaF=)9+GDlH
zlw3eR_DXel4Q&-JtJ>b2E<9u`M0f!t0Bx~MqEr<6TQB*q&CG6aPSB||n{I2Q`N)^d
zKVL_F+HPZ*`Z|AA(jz^-IFfM*>`pm*906RF8Q_%MRfz9Xb*QcSq!XW#R@0EC{yW(k
z9U5<)PL56YFufSHG*{!TSx%lCFLgDmV=YP7JpD6rDrSvw+EXznQs-mc9gSmE9+%<%
zXEO^)ri#w=$dPHsBRpLCB^HGr;q{%xvW)JL_lq}5<^T?eNinZY`|s@bcKa1D<bjCB
z?@l0c#$TrZQHoYWn}*6mXk6&T-HKdz13(S$j{}?TY5o`v=yLrukoR(Z-?#|PfdI(I
z@m@_X{C%K~H$i46n>YIYrnip2{N_yKhnWl9$G51Pi4ZKJE&zo686(D!h-knL^e&@U
z*&VN=@0^(!5rpI$BM8)F1IB1rG}JF{OJ~>Yn6?H|Xhf*Db<o$w(+^l#bR)D9tuub>
z$9JDW*)npY(??{;lRhQa>-+q-XQTHf)$U=}k!&zrQBx^p`ZE?0y^M)Z@Hyhgx`MKV
zmMXeeOgrGUM>0BN-4W!sW7hrp0BdGTN)y_`+Dn!LZtfX~sl%w9aQ2^2gk=}Ce3?<Y
zvbm6yMYQkW9NZ7+1>ag`>oCM!wLQ95$b*&+H%~8F`FsVe%Ot<cVttJ>dONHTjUXdR
zQT$Vw-L<QH5&5z#={x_Ya|93d!4~0ltjqGBZhgK<flnmQ2p8&M9m0=T>|7s1feHj6
zMMgXVt-20x<&bT#3O1e3(}TZ3$lK2v?c7^u%{N?E-4sGcn0w4w9ms6E=|?Y~jidd#
zFXThUBMMr5@yOOAd-B66ejfMo3$KWJ$gHxq@?U&nzXqTm#VHhg=b@OT$(Dn6kt$T)
z`teuvxzEq0jhE68_3!Ud<kN|!pg*+32TS|Et@VWv!5QPjdxCJ(O=^NDV|#jmAKUhP
za-HE^@KoEF-tP*w*wUg|L5kk5Q~K!BqMYm*1)}8*@C3rI>;)J*3qXg3pciRCu?der
zP<Myq3xPlk6Hp%{=HLgf-!BG1FbfIzjqiYAX2@YH&Trcl<3MNI<Jb8lKD1_7x<mAc
zbi1|Rh<dvXh55{XF9M<P4VC~=`$oO=8YhJgT;OX!N&nN%V=$kPR7c_0S!2;jT(wB=
z2hX&PMc<`F-{aUCLRs1=DF2Ho#E&zNsT$?P@-owqaB=QhuxD0J({d|5#XXNwuP9vo
z-`nO+{$W-K|FggOl)s=6$uj}0ROWy_=<Rcam_=`-#*akMXT2_THi@EXPLNq#N~M<y
zx1pm7yP^C^a9b7XO;qvl84|RqTd?&K!NmEkx9tIg*JMg>&=#B5thpnN*Q&V@BZgCQ
z(umAqwR!hm^~SblCVLk1ZidriX6tS1#7iJf>oZ-iCgu3nwf3G%?1Y-T_nOz3JG5O6
zg;ac*tIe&e>DlS<a$5uC4=gIvg2mN|crecbzqIDOQ_Jz^mgDL&`w6AJjJ#2P=_QUI
zq~db6#)eLJjD}g>>L(#<Rq`L{HgFyc5pD*c6ix5`*zLP#sa2na2%^dC({7`0-QSj0
z@2V}lq%<oQlcK4rk}F2QQ;TrZGNZ*83M0%d!n7E1o<x6LgpLIaNAOqS-1G}9{`vPp
zxn7r=WE{akZ5C)i5GdmCQQ#vit-ub^=^wjNYrS$I*Z5V3{1yhkAZ`JWj7ycMU~?uz
z|NU(fR;I(cdC%{NiOnmB)W;G?GZ&M~s-XP0+ZL=dH@fm38J5+sROXS3hL-wgCb`oG
z{&8>{>K2asdwEDZ)x|B%v-bH#C8_J#I5P(eLMm0lBAx2@6H!mO((A^S+uJauz9HM-
zl*WeJo33`|7^*XUh;H@d;r2suGEZ$;qlDL8+ik`5GjiXCBEmyimxb_FE2v@!v4_sJ
zAU=4b-kakCVLZfskX){ph6`gqI6nNUC!)8&K3n$=rw6tr66}g|Ovi*kN6XZ2L!x{~
z+(;<cMc9bp*43u7ORyT{1!7%|%-j}Oje%pv`E#K=dVBuqpr|*PD}eb~Q7N~nSL`gj
z!WGr08De}f5I!TU_2{sr6VnhH6`tAXc4z5ad*@P1rmo^kl1ujjqu<@Jm1=0bYuZSl
zzD4`^rXg6vwrbU=sn0QT7kh*uL7TqezZIzKe&A5^LCEY6PKurY)<AR?sc(})3%;`x
zc=9V=v(Pg=JGy7BX2#vdhF65ZgxhT*tP)|*7&vS`YoEyvKYeWyq`9TqZPz%9{D;t0
zpd!-Rb;dB`3!)(Q)@kTO_Wi2>YPb~!0(h#x_E~b?#<UhHFi{aT(&U7shmyV*c7UPC
zXKZcVVs59>xx5tKf#{8GCPiI9v6YH5Vb1Xf-OvLb(FKxbl<`@I{AgB}w4!YpLh~ao
z<$+|L|4BqWDHMQ$;H;1p^p+6wAnOYp3=fwwPP%lk7VYfi7W=Dg*Y$c#K~-lmJ{)d$
zG_14u3i^;^-Q;w`t1Lz$Q2luA#Eqrb)?c#V4#um{1N1vv$`t%V#2MHvixj1!v@B<p
z010??vxRy8qiCjxB^~eoC3T8TS|8&br220MAzHZdzc-^kw=ym#ic25VeCIf={Z`OJ
zjwf?@*mkK;`a<!S*d_Y4Jg)3+Fuu6nHADh*2~M{xa3or|miwj{qg@?pkK>y`y@F#c
zQwImV`=gDK3g+XwX8k@mi9K$SjSp>_oHL`1G^K_2ii0XooTI%_jqo+h_?!Tu2H-#=
zmYH$<l7=aVT5^=?|6v)`k|S03jN+R#O#f~hM?_U}hDYgSZe&<)G-3+C?piSghiZ-T
znMX6Ko}6WFGk+6?8;z5yMdPXgqnJLT*NS#8`u|DLJ=#oNns;-I>3i6gtU6do$D=2X
zEU>Y2#bVpq6Xn);5pCQ;puM?Bo32DNW9%%IXB~t4f_$Kg($<shX0r5Y1Xhz{uv#bq
zx!U@tX%IdB|04R3P9E#IPQO2PDXLp5=$9}8Oyx0XK^Yv{!A?Hme;yfK57sv`x%{%Z
zK)=qgzvlQiNmOksJu+2gD)H2u8!6UP16LRSZz6&A!o>mNUwx_DFJs?Q3eT~?HHz`x
zsk<xm3uYdtGzI)Vj+x#myEF8eW_N5O1>CgKqrl@jL$9r7kn0w<i-Y<{nN@X2-dYPU
zdc$`el`X#p2?;LJ#6~@_RsYvtSzGgkqz<Vy=SV)`4~Q=fDc$3}L5Ak>Pd2a2K|$W2
z^a>SrbwLu08N2cz9PL3oIw9v<jyeH!7yLTG=i8P#LBs~I<$W%VM4Z4-Lvxl)LekT(
zqeyLzOKA{~#T5Gj9Nn}=tvdZ6-1RoS1WsW7<nN^5rAFKaQ7=kc<O%XUuCuer^hP+r
zp0*_XjKqyA)o_rfDtca$|K~WsTMWf;+FXSw-OAT9#<SO-Zp^q|MocGBK9x*2J#oyE
zJwkXW)Am0@g4hUhH$Nf6mB=A7s1|G^d1DgyBy%B}Zxf_N#%yCGg~)I5rN!6{ZKLZ%
z*lqI`_LxE#83(W$(KmWjAp+(7><}n3pq5VPu>-X4$BY;$)p1zb^8Bs--X}v1hBb8Y
z*&&T<O~#$3pZlFCD44X?04yr=z9DW>iCw<<E48FFEK2j>Ax%TaKdr<QANXo^LRnD$
zziZqS{NTX<BB82`=J6w%yN+s2Y5%YUE?2W-%AE3ReE+$r*e_RI3)V~fI)krz%P_?_
zdzO7eX=xW#p5s6kg|F(FNtQU%96HY*PVC9gdd+Jkx@)fLtM-Q^pq^i5|3Z!O{wVjz
zMn3~(<=lx!SSb!yIwf6^id=D77$srDXSt(D7K7(8I|vXbV!tf>B@y*MjrjSFHWg80
zJl$t|07OJ!P@zPqiHoG#|6LLWqWElg9LQquJyr*|#fdmC8-ESh@Y!xV;K!`Eul8?q
z5H(yhM*Rnom<xf76Z2S?&7*H}q&D}$P!mHM{?6*BeOPYJ!57Sz%w4?%uccnX^_IvP
zr*3bQ8;8o6ARmI0ZQvWY(77Q}MpuWb$Vg2_v!<k7qlCZAo>5BVH^%&?r2n3o{*(yY
z#5R~G#E9(p?Wc5*4&KhciIftgH5pBt;&%UGa1xsvXZ!LP8C@KTBBgb8j<%Hk@6l9f
zQ{1*v`C`<|ZZ|bfNE7pD00A7rt!&RMVg4DjOK1_V%$niRUSTT0HoBSnziU`!(hZN0
zl+BZPdS-F+$C+J10A#WuZZaWm&VKWrddZC<*lxo)AKUhCjmyE$LdQ3lEzM63Zv_5!
z64bt%ByCUA9(CS{@_XL6mIwg;ST<n6FMiCT6*J}e<KIS8ZWe12#{y{PVrg?QxVk^5
z{t%j@ms^yqhZmJQS5SC26aLGC<@vS*vEV&qY9L0y_xb`5L+tZ-@SjBhgM>g$1T&hX
zy#F$D$hd1ir~dD9*KrVn6#AK9*q*v4`#(0A3y|zBzhJ&(bxl1}+0@c+vH3m;H!x8(
zY~#~h(%ba^J%h+?wwn}htQM{L|816vg&9UW>l#sxWdC>>U~O~@LhL1;x2AY<otB*P
zAP5d?d#Jl9Jd<(JmTHJUtdHWckxy5{mG!gB8Lh;d=#4rXv2!R}>G?TK<mjwthoYG-
zk?w*N?1V|=vr+y{(gn5XjAV{Kw>ebhcJig*8*A=;h|0##BYY(vkzzYZFmh3Q_Dexa
zPWPHrhSJD-iydy%@9p#5rdrl}QkztZmEI1hXXMFxd-Iv0aQN%fJUcsw=dp|X{WG+)
z8s(JZHaHO`U9ZDs=>@&!o9Vj5@-}Ci94idn)PrTw1<GcQsk-oT+5a?BQiis){3457
zSOmq+o|O+fJL{i<m*vH@=5@F(eXG}eJKdIe*5-Vk_a8-Nv^(6;3tK!oG5DaKN$pwp
z2Y0F+06#h+pbedA$)UZJQ6IhreWW`!+~x+?At`o|Nb5JH`K5(h#XyMpZ>o@MFTb+K
zJd1{%M8GU$1M?<tKA4k>W8#*vbA$&NKbfMDGo5IZY{hunNLni0x+&K%H@D1J4`513
zI#?*G;<p`_vq0kQw*QQ<Hsq*cpzroF5R$RuV5oihI+_$K%;@2{7*~wEG)s}q4c1LU
zD?0aRr7e~AAvO~}k{~koY(=}7Eb`P~QFaqj&)<VQz80SN6v4G-MC#v1dl-5h(QLdR
z`4ZRXy`HPLEdaq+M7wd87wb}`-FOwemE%|71J~6r2XZz&8VBVW1h6Hyy?XX;8mDKK
z_X_9w*eNBahb4WgKH4A4+UcxI+EG!rs+d3OGM8IYWnQZ2ip)V3qbPt0i*55Q;ZzqC
zT=4%8WOk=ng6zHb6cW_j!b51ZwgpBqeMu24FmTk1U%S{bbws-45X!R+;9z+F2%`Oz
zDq{noF$uDOZ~jW;5Kt8C{R6EEIlr_)9revd(r|vMN-|>UGm`3yi=biuQiLWIrC|A8
zi7XB;d-i>vI5oTQYgC4-UL30g{rU<lW!n5%NoV`5LlB_18{{R$Ufh+Ad5^4GPtNh(
zuA~Mx{23H>^sHnwE9d0}N#gNhPezjdZMs=7q|&HV*FZrPnz9oxl{eJWjjHn8HDuh)
zt>PsEmREK{r4YonmB}D%=XqYx)E#09m59X&9{KHJsJMBaXadGzJ)|h<rPIJ(ckFAp
ztLnm<DW5#o4)xxq+-Ou~N-s|oIEn^+i*CbI<0+YBs=)EeWxDsTQRlv5H@C&s*0NS@
z&#qfUr3z6qZ^zVgnqL|B9LM$IG6o#GU_C78x>SXt&K5{``B*OkOKlYy!%-A8S0c<A
z6*Z-+boxr|Z}5wY=NPNUjJ+01c{oa<$7qdfh8GJza}-BJ6&s}%FXq$M7lpM(43@mg
zVj=n^ElCABhm$Sqe!W8(^W8&^Zm%-srAPT~?eVi_`Gm2@qR;~PTBe{Jm3u>Q>p{Z%
zMe<0f!Vk-_-JG1(WxjB*!nHqNcx?q<&?bRCZiFtoujLq>ZtPcJ2)=NLEuGIf-bM>$
z^PuWJmuEu85slhNW{-?|=HaNJ)Jaw^%EqKf_?Pghi*~%cUW*(udQcQHZ4t#URsQ%S
z0>-areNnFiMJVtK<fwgyhSHbz0>lA;CW+gfWC6hzNW;iL_#Ri=g2<ZUQS?z85kia-
zmjc&88xieju5(r|fz)VrFTvA-$SL9My3nhj+*e4vfE9e;=G+vlD0d}jI9S;?$hG`&
zy4i~;i1Z~`*E_YGw-_>AJXQ~QIfAW<0U%58_sXnhq}{>`Pg+LSY_m}A*XdcQyv66J
zsl~*dI<Molp)iW^5d?}+sa!sRhgao)!nhzmb$EVs?dEdd(tN*W;pdClIy&VWb|W%Q
z%L)h?D>P)a)Hnk;630IjwubKTlb@3`p$xeUdgF5Fhm0i{)|f6@<Em}+n2l6wNc3@=
zs3E0d^_cb9f6b#CSA)8B@?*L6>wn6mz;>ubV+uA6r%>z5)4(QDhT+91Y27HU=jr~@
zy9`<orJ6V7U+)gF*UlN)ro4uXj}grri0QLw*Rhd*&ae|w+eLOK=$*HhHG1EZ5o4fT
z$eOApbA}SzA_C_(^uv>^YR3E=xK>Gh!5Py<s}uMR;i^QXP8QTUnpTWd^IJ8epc|{t
zqrPNdt@3rO)R9ZKE~7weLb+^TQ`=7V45wUL?qcbgi}_xvx_`Tit9O(aGc?#wRt|h0
z#I<@%W}mw+v3X!$j(c}bIpSSDwzq6uKBjhxlalC0A@4d$1Ceax_9z~hvG;7G@dz1+
z*)rbTaPIRMs^Jsl)hT0bXM4t}xpEnF8Hxn>{MHxIb)WHm(k*7J&1LLSH7sCUDiZO{
z7jZ5Ay;N*cPn(>ca(b0|@#lzB`jlQ#3rKm?;n8Qkg&5VH9|!e5BeeB7en6rdt4M#F
zWfZX?p9pn9bIewX{}gO@!8?jN<2=Vkw<=3(UyqPx<7Z!@dX_&++%r}b>^|-=;dKr(
z@HBTNW^w(DUXkeG*K}@kxsK3!)@IfkfW1+NUQ)SyVMVYKP$zKw{jjXr{CN3W5%OjQ
zk6ViBHRYOD##ws8(M1NHOi|bDib80Sz;$RJjZ2bJXc7IafhAoCv-R5_pY}Zj=%s81
zP$o8lQ>2Tyl?~PEnoHE&=7uT@n`gJbzuql0$xv#vupH!>bEpW<u91!qSyECJx&Jx>
zd4B#n1ljWtD-s&t-Xpc4*w%2qGQjd+Bq-DM%+*;1cmMclw(+@Zoz+klpgG4Gj7=!e
za$j#zVX3En)ts_0=LVEp-L3KJapCc+|H?j=n_{LU%L-$bsy0(he94a*`GB@@mvo-F
z58j3#XS`n!WzX-O%kL$Qpt~CB1A7;Joh;ejKREH5SweI<Wo^D;x3VoY)}O87Z+6VH
zwVn=+e7#kQ?k#*em_7aaM|6|)0R0?GTl0>9ZnYK;Tk-RN>WjI`i_x4uBoTqC1i}fh
zOH!8)-*HnbRSh>URYDy5F6j08RcM{=)n_g(+;X^NyQ^hwhOE(qtpyDyx?%^`vUtUJ
z4C7A8saSu<E=f#Ubj@(x`m-HJc<Vx|E!LeHYs=iIExUX|m~Xo#^9Fqq({OsuqqpIN
zF<2}9-Yxd#cjY8P{VT%=^$K<<2V+oli?mO7anA&knevAd1=st^YL%)*+G)4Os{y<s
zEk*0L#SO<}P$f&WuVA#VeYaakmDe1@J{l{yp7wST)K8yKpLv>(RXhG#(tO&GUt2uq
z>(VT$JFc8~HM`4Kw^%XHQB)^uX%oAAq^|vRWEtDMoLiot5vesdi%z#H%npvjvW<@i
z==zK*6F(gp#}e+4x=e;8o=heL3o3io%o*lsaSgEP+oe>fqtX>F%g$jJ*B6EhCa>aC
zx2)#s$uI53>6UZU7e+P6ZqMakSkAcD;c-hv{Yb7(&$lkn(kzjirnIUVTDGp{C`sm+
zaW11-HgqiKKvPc%qbq7pK1wZWTY9JS?QJ<5lzNIkoor`(+?w@K85>wF#hFgF7Dc&q
z2J@J$k4L7;Ou2OF-7$05e2RgUY)M_dYQf>`Hy)Q1x%7PHn!{OZ9`;l@E7?3b<sv6<
zTJ}`&^nAt8Z?!4BBeDr#$`MaShW$y$8PlM{>aKT723n<@4D+c-$Fd<{${tU-I{V^a
zWrqB2SPTRH0xhn&oT~*V%cgPtqHXmfNIZ{kUi4qqY;C=Ch1bQa!1jFhbga2exuIg;
zD3VdXVr1d?65Q<F=CvizvfbT&-wTp+{>@0YYVM;hF*c{@<T5+rosnl%e+B0BwgFlN
z*$#o~LMss6oBazAokjI<c05=jacZ2uWDwV=c<aSasO~zc-s$L#D(!lxW~l%9V6HV?
z5AmS(3%gyx+*>X1WU+**U4eRx)Vf?I>^<9LL4W{%u}6#Lx*Px&kA2cS;ApI5y@}SX
z<@E#?BGo9}7d>R~OMRu6D70X1FT_GH+2)Ftp~g2EJYp@WHn#Ug+-&T#74{m(6nkX3
zrt^yE0)4R!=`v-GFB&+qP@io@v__WgDd!MJwuTzvw)$x*pd7PhT1#E3MQJ(y-a?~M
z!p5OaQQgX+4qH7^x2$hYONYU=4qNNTq*1S`vDS}9$CKf2>T)fPDcEG?L@k~tN8i-t
zTRbC6aL!O?&d@lL5?8uYt9@tUC3GFyG;dnNTl#w$R*4^rxEw|Z*m-<W<8fa&Wd<mj
zBBGgS7fxfr0*8$4w5uzp^haEVkELi8)$JW?x2P+~Zi_Q!G_4{T@o{0yT^Y51ecckF
zh_oAvOeMcg+cn3c>E*(c1rtj9T$<lULCQP0woYHmLvR*S2fb+|U}$^vO~Bks-m%eS
z&Z}Prke%sQ_3g4Y!ahEehU6J((1zsIJ`Oyb(Ja3B+-|T(3#u^5fgzRK#20#-87-pY
zK~w8n_}DX*N%gNS;vh_1+F~0_<z-vTw9#c-i?ktS<@_hhJ+IQNt>YGXZ0pd*P~Om2
zSKmBdX=NRb0zPFLd{#|n^4o~B$-DAaAI2)MG`(OF^W{6+)mmD>dd}HDpONSv(FMb(
z`*f-5L-ge`x^s&hbY43^gEf*qqVnEkQuX(4(Ke3>7_@jDe{T0>(P4jHAhc_~RFCju
z@oa76<hqpCv2xX)WPHXcx*mO?lr%Iw$eQhGWh`y6Q$9Ad<fqTX@i``Jw}HY*JzZO-
zfiS}m@0qY-5znI$^5g^o#xA`S)tv_&f(SSV9iY`yPrH`?pkevto1?RRQ}W2zxokAv
zhJ%H>#u?MF7j$B3<cu-G+=8Ny?!`jbBIXj29vb+~?n1cxm)_tFd)UIZ=;dQk?oZuN
zv=x9$o_`2Jg;O3BLIvQA=cxQonNh7OYUw#D@Kn(t(EEX|)<x(lwF1ap8~j`m(smSO
z*^1|k8rvX<Tl<Pqv?k3!X`JzP2b$T5X2{9*m0IhMct+zq{F>>grHcJBS6r{_R-e8s
zevkDd4l@`0n&zk_xBb(WE46wSx|JlKfvk82<8u~^JlvY;ADX)TIpOn{r=)jzb+4Bd
zbr%=fPK$#Gynh76U=<9DmONQFR)2ML9vsfvP+Z#iI#o{j!z<%}YvC|V%4foCLm4v0
zu0Qytzi5l;eRB6jTvBPK9D>I=j&Hon_bU!DFHhA$UzHm`H8F#kZ&h3Og7C`2XX5xf
z3zlQ-7`iHi<4U6H*rmczaZ?UF!W~PyNzVLpy!&#Gm-Hqpb6pDAl|`)S@}IuvwvS6&
zOupZi`)hSYRv^S@lMC)^ijP>8?gery?ByBxrzy7^WfByhV*nzta~ZNIF^dey%YPDa
z$j!npQMMRev=LndI$nO0UcZQo><A=8c}#%ra3SU%@-f-0AVX})@O{OB-`s7+?(cBU
z*m2G<W$d4r+@Bi3_FbnDigo<Ony8-&V~SwoC5QTG%NB?F%9$mdj*Xaem^|lnnfec_
zzQ*}qO&1%LFQFn-OH1kH<rDcmt&`F{glwx98PXG;6<x>s{Otw>b)(x4y4u?=aL;G!
z+R=xukNK51#5${ymTM}<`C6QL%`|fiXXO_gxy>}wvcGICos|Kpcx$r<`Qn`H%`{W8
z;kLBS%Gs$1YbpoPH?jNLj^5JIc0&#&1lqfeTG1uDjmvU%3N9tPHOu9njn$KXjcB_j
z1S9A$B+fZ%+E69`8nE;zO&HL2beBdc*{#$<DT#cExcs@Z01(M(nri3{nxbgLT=*^l
zAP(`$A>QmTq_h>4Pr|nm$0LhgB07t9HK1DWYS}#d-6JCXzn<PPE9R9*kHVDDiutT^
z`Nt)3`o8MQLKAL15e&8OVtFy&y`+fyDyB}r`0f~v3{UvNjb7chn_cj+-K+4fOBg5B
zREwtai}())`p%(VlqQ6%{@M2OOK4oUiJ@>K<(=^mBD3!jBjqq_BYWQysuL;;M+xQc
zW-ALHgShw}iyq$-#9*`vK;wiq?dta}bg%8A8W1(Nt%u11H9h>?Yx#o81n|K;IA`UO
z-xDIprC6X$E0s%MNeYuvAqloT3Dm8^cO4)<2~h6IiI@SoOuX<q>_btC#g}g?yA+oc
zmz0F^i4J|$!O~Gv#~g6c?2-C_CHAEgM$8^|=-bB+7zmQ%pR#h%Fn_Y3^>(JjFMgR}
zevf3BglqY3i1oV3j(uBC*TLoX>`Ld}qwK6W_Gp&-<;CY@`sw390T<(LQqg*y{A;7_
zWrPF~VnO=hV%eP6)<U8%k|7b3Z}7U~C*PL`wq#S(2F_2yo(0s~Gp)j*c8W~uj+yR`
z|4H-*=-CmYPe_4X%<n>qWYnA}OSog_(uOR@+4>7*`!znoF?G2oy}>X<mkFU^#>($<
zPLgkf0Ywb&6ipzXD5bvjBEfGDeo7PUzM#gmf8z2UXo|Fq{T2u#WS@nu65^>qoCFu^
zVc=qtBFn8eaX}gF*4Z<O+x$gcQq@V5`|prxaFfZLE$sMe3MJpfKCgy-wTw(cQZ}HK
z2|BENmmo~p3OfRnh9I<u*co70hcHxP)pxmub)tj`M{CK-X#5F8*c-I~$d(pRW3N|6
zE?4?@*c|7uw?FO(31GfC>jh<fXb-}$N4KZ%rvV-J2^GbkG|XXFbsC+&-m)&T&9-Wy
zZ2k!G*f;t#3)+-N*t~AqGjIah;1f~^2{S=P=v1d~O%hoocl94)Bv@9fx7!%DEfsz2
z?@^fq7x{<WEL;8)I45;XrQX#3!bcffd{?gp%tg9nNQ&a`I^Q8$zYlylT_dr7sz~}y
zZbA9GPj*r_ucm)SbNjILBB1m_@1HTEW<XN@R(i!Y?}2yB7v+*cnSqwqTLU8CLj`QY
zxTE!VzHTckQI{%g$64l#gTGM8u%MJ@i=|2#!Wi2LH$!;A&z&ei%CLl$XAde!=~5n>
zG%F!oCP+fOG7zBmR|l$UypJ?sp%2LW3189mSp%h_2fy;IS~smbhMEX=Yo6;tlo(B0
zxa54Qqf7v=YO`<RQN;$b&E2YU)U;QT$`ib#YLP{Ki>U<rn>E*llWqt9dnFVV?OklN
zpW0#`O@wnT(CY2VT%RROhea%Y_2gvpbzc;|tbHN464NyHr#43VF2wvXq*LJ|>>)G3
zXM=Y?zTeSCx&IPxm1M0CSL**`gD=9jd_rg;f%P(AGJ(DtvYareGyh;U4gFeWp!URU
zB%}37pQ|^~c}tq-f`UPTd^ibRL)%P~8qNhM6>r9|wfN4L>32BIw>{)mdDX+-80Res
zo(r;c2+Pkun2Ll<jhHYb_xRVDj4SbuE`|FEb@Vj9Efzs8gqO{#8#XCGHvR)s4IeR@
z5@J3WfC2g_@X<TEBJAiSAIXaQ{8st-4Z*v40$y|4KcI?DkNrl7BQz2uJe;}3a~wk$
zXPAzacXDYNstffyn^YxmSOKP3nBAb1!?+H&WK82!O9B7ri*6y@nnejO9|Ep75fs^6
zbXLAA*Qu&s0klb>iaB|u9l=dhPrJh+C(1*=;Re@VpU{8@7s?*YB!n3Qo>Bi~W<;}|
z*X_#G!)M@|1_MH9ML;zCMU5pDy7?1|%z&J^fn<;4nys=!<mCLr18?s$3d{J1ZKQuI
z{U3~xSpG$vE70?P2LitgzU1*WC8z%uVY}S8+u&DHB$i=w!suW$2!!E)bQqj|dB57r
zx!5AgiUadO1gclz4GZQMq<>>c^|@b(`$8o336<h!DEgoX{7&|}ONF-{)B$||6-&gO
z2>l5Ok$T1fd>as6GMMcESzeg<0P!iS<LImCsmMRWW+U`3xP;}SOwE#{@Q5mhS{!|l
zCCn)jVf!QWaAW#Er~tjRlVRuunO2IcP+qE0zIg8F$2=bY^5eg(kaz|Ub`aeyko`+o
zdnWxSbkSR;PMd!oZ8{zPcCb+z`&7+D^3N2bQv4HQGZItRiflfZvc&&C*ts*bAua7j
zoU@<@!}^Gy;iM)x?~|#8DgPJNJ)FL^*YjTx#nvN5Wcn4HIM^+1@=YI6;sKVU%WU&!
zpH6j>G`17i?72g67h)dmR{YDPX8x6M>v=`2gSBj`rBew_S$Sm`llncAdcD8Y{Q|za
z!&9VOnRo=5^sBGv&B~9=jR^S-M%v%ImxGQ<XWbk6!_H8jb=P{`WcC~cGDlgzG@5WH
z((nJ?)u~RCPH-A>WYip|7-{;iFwU7siU*Y<J@ogu$k+Ys+$)7?59&j{e~ocj<IG3D
zYck{Sb?Ug|?sa+31&%Jg@^&LW_<XqA&GeTTVRfMhw##4+ismdyB#((=DE-mTe}p_U
zBIce4GN|c?*{uA|`Jv<2lgUK?5^mShGLLs^a>4CyzQq(`siYbl6&d;i{=reS&rbS}
z`j{^`eJ^4ceu{rtie7to79C+CMiOyYJ2b9bzBFN^f)8Y|x1VBf-!tdGF(w5xd;+vI
zhG={mH_zi!_J`$>-28p0OCW|Mf!3qgYmdqDxkGx-%>kPwO>DGOe2Fd#W&h}+N#KK(
z11?LP5Mo?_6~`Wr)dv>^lm#>qlG-mMja)Gw$H_vj!Z!Jg{<4k-0sn;|x`gaj;a39o
zhn_bd3q5lvU%AhEg;Wb@4xkQyFGXrJF6~XGT-}7QF6B+9Tzv;Wz(9)_kH+GW09h~e
zP$LI$ky1y$k01-&3Ihh$fVwr^H^Hw^|9tQ3&hN`L()^XneT&{ZAe0FNf+1?}?7=w^
z%cJkYafiG`t((TFn_f7|MJNr;uEC2C1eL0+*A$elgUGJ~+H03k8<|>r^jd>L-9R7|
zB7^~<O)mHMsHmHo#?*ow7iED&#RFaP?htrX!Ocx7K9xxYm7hTf#Xl-Q3c@0a!YJ$A
zJ=*T>h^Tg(n+$xd!J)!Ok^hAiVa{of{kt$Qb`{Ya8}uL4zI&yP4YO_*r|vJa0|W!y
zpQ!Z&BNk$|{xe%l7kCtr)l6Dg{{BC+iwi9I8$D69Bet-*7A^RhEj<9x$9+$GZvszY
zCU>RX>`7AU+V=<yMuKAL@M-!|OEPuqzEQU$z}g7dU`g>VvA|XB%CunwwGYuwnZpie
zp>SDkmQmGKr_NhXA5?)z!Ojq&!>Fw%CuXfiY5|<S{Tzr6W@hA;lcG0c#Dv&I)BiAh
zA;!vl!01yK`k~Hgl`ckCR?;Z%C5kllO!=JSg28FE))#c2p3}Cj%=c-fXvM?}Ox2{u
z@C#ps&O^Z`fP<bX-2O5wh1_Y*{4%MK_6nqZcl4u>q<?@jV%IVi(z5*j(Dv3*ZFOI}
z=nE8R(Nf%7C|-&cw-$GI_uv!_PJ!Z9+(~gKP^`E+1b24{9^7*B{=V<K_l`TxIRBk7
z=Gtq{XRfThla=hfEc20qHYWL!A79t`6IL2aLK|&`XGvY4rrup+o5M?`R##J5Mrgy=
zw%NYaJiVAZVMh)wE1&Oc9wZz!why0my+s&s%k!_+qSqKES_*ExuG&=>ToUy@HnP)a
zwXPI351rV?M61{pFNtf7N)><B25~khGu^*!a>XwMs0bK3Mh!4MV-%jz#I-BNbj_q?
z-#HH0kvgC)K4JX1HS}iT^irNxKVvOu$I5P1eO81kjLqCeh#nLV4em-Zc~$Ctn#WB2
z<i`p;rx*4MrA~GsH*yX0j`~FT`<f3(@Ik#e)xuVHOO_7kp+U7>b;$<;$J*gyh*#nv
zv3DVpBc9B!4PJ}COHzYM*jN|cglBG$*2@WDj(5S>$&7vwn_@+WIJ4*Q8{{`c*4vgB
zl?!LQLP7rbq5Y~W@a@msFG+!aRB(pH{~blBykQo1FG1qYC7KBpP$540=4buSZHav)
z>k8&=*Ykh+5{O>VhK(m$PRKqY0tz-;r$oJe=Bq?g422LIc1o*~Rd4fpIVTLJ39Yu`
zjuNv65<>Ij#_dvqYDlE6dAa91)QTFOiW)O7`W%+=KeP}g`J6aVGVzv=MLega^HWJ{
ziJC<gOiq&r`NE(_qmeY_50;?+-0S{Zl330dm8X);G6LGs`;WueDt6I<?61*!-jZPS
zyr&EmM~nUue_pm^{^_S=HAzaR+ThQ56Vgv%^PAW@NGzs=t#8UdV6Fu<ZVD$mR0P&n
z<IMFuRwIf0z}=+1BjM>@H_g<uv=4iI{3E*>Gq0N5tQ!4BtU-^EQO!=_O@%Sn@40kK
z{*+*WiPssOXR}l=wAgKp27XGb%k;-D#IT%4jwb5Fe-Ey7`&wPLgF!%3?EU#}0Ug}v
zRg2@j)K)k~``$Be^&6fZrG-L|OpYqb5MMqdhD{$n)Z)#G;f93Y1WBQsDzYaH%;JP4
z7_+}im&{lH@tbA-pI&E}_KcPt|3Iy6^6FR1W$jYo8*U16S9@#+Px!AT2jI3h(38Mu
z=ScxF9<<1go+2K<L`lFbs1r*2$THZDp%rLr8#k@kmu2(|qb(N~>z|vJ9)IA%8P4OX
zX@}r3aG#C@7vF@2-3Y*`5Ln8NUDSh>_NLZ^XEd<soO2ZAbW-;paA$e-TcT3U+>SDx
z14rvOmA|>`0)2k77L?zL^?X2)>Xh-skm{N~$NK+$@y_2xhrMCP`~UV;<=gg+3bp7^
zXMJRGe$M!Gzt^syx!fP3+(N|etx2N~E#;^AuLgxNhX%xMg3RysNnZFv18&a8sa_U)
zM1Af6xbWjmknmG&*#8A^KL9iJhH3jrfe;Z4=)3~ea}HU^4oSp~9wF_ng8z}0c9~uJ
z7kGT?i?KBNRqfF~i=X6(GHH3E$aY4aEvlV8|MLHjAPX5FQ9*8%IiA$9-~>HfUY_*0
z<2#$_LPCjQfXj0k*Yj_#=S;5WA6?JiB7}-6_aCQ?O%#lCBvdV0!rCf1Uupo-()#=*
zhnokM1Eo0Tz(gm<6*%nWdZL5sVO|{WePE3H49yJ?{y%`_s0XviS7l*RtkvjKH|nj%
zM*(&jmlK&(-nH*_a-L_5#w5%k66%i9>W)AzjVeK8sj`*tUheM=5Th2fnV+gz#w9qf
zvnwtIyeKId<|D~>1um+ezJ5ujj+7fcYq>N}nvt6P>iDiQ9XfaS@K59>QLCRWDc(HC
z*?JyDS-vZd3y<^{F}6A>wwctXEsv$=zdL7tz<I@SGcLo*vR6%!c5NzzP>FZ@?Hkip
za37|C7sF;f<#?C-vuR5q#RW><X*>DLO~dHTsdRdak=m@b82hGJ`}EYi*ezf9JR`q;
zjB?}07o{2+ODG2#yzzBxcsY|ZHi##R;Ivw3zRVT;(%4?Fy*`MPN4gDPu#PpF=7;fv
z+Vj3Ny;0N|EVdUZjARFWK0kDw{EA?N{_nv$%7^bCuw<_mlr(-(PIzX2NcPbS!+Cm}
zeILJMi}jYN|0~I>VAI!FA*R1@{_6)2AC#)xRa?BVsD6IH<atFP<c0Afh<cVCw5}xk
z^dd>(?h~2wrzwi|5+$bqZbwgD3RG#lUwQmz&w_Q^8>F_vdP_xQS<jjm+o1Wy%1VF>
zHz0+QaaPqWDI0&CV_vnWEbhsPE+Kg48^56e8z7wgU|?HzUVNy6-H~9P6w!cI<@qH_
zF-KR^NyYqMX9|k9{oxTg-E<vxm+8k6Ct*zM$=kFP=+)f~C9Bpu$X6Hly$_2__O`O7
z4;??n4lU&9)j4Hk<KBhP=<d9goK)FjZ!V=4z8`n{wG?~$d80IyX3J5rj(GFU>2D&3
zhe$pZEyC^1sEhMH&mA)%7Y?c>k~o9tTh^j+59WC|v2xu)c(xgx-g2%|M5O3pEj~v%
zOM$Z!ipo~z>>DZVK|jxmzZh438G4`zr*QxAXWNbs55fm<9?Z(*!u5YxbBg+q^@Jp&
zzUc|7x)Apr*Lq}Kcd%9C%eAvyZ4dvWnx64Z%3ckI#T`{jYW2xEV6H^Bw`vd%7%u*u
z1XBBl?JFKZk7S;>V3xektd0kbp3tY}!72*s&9u<l!vRmU(hwIus9EzJEp*e$$`@0Z
zM_wq?;b>RF39mj;Seq)<HP<4>05H*jhfYWKYB*A`0%rnGAPQ{`OXt<&y)S~WaJvqJ
z9~ik!+P@b7H~&B@I4%qVSj7B2gGE>1-5Q2G-q41QJqxTrRh#6Klq#C9XYSI-!b(&_
z{_7A>;AYeB`?2P-p_HcIFE%6WT|G_2I^Dt707~7VY<`bTzF#GtDl_z3gzwBLXv{5X
z%t^OaA9~r2?#9`qn1<5Da>NA@Gl9q02-=DS%K)uelgm}S{%rZUfuvL&r^ixYxHnGa
za$SWh9X)f@>CsP50;=E_p=S@Fah&LKE-VYnheB5!I_b2-{OCiH7tT1X%b~<E$-b{P
z?lda@49vdaBAoioEA$4%^z8|K@)NyEQi^#UMdEW)d0_8Tu+RX7*5n5dl`Y4Jo~F@B
zj%)YsC~XU0$!Ku+1!3|^8yq<NXl+Sdc~CbM&XH1-TwC|9m=K=1=b*>80jI*-W-<Hk
zvK;L+{o1{Ek-f8dE!Ip_Ap<69)*q%!`7AnLPX<xRPi~)u>mphGW#wS${M1Ni#cElk
z%{6^it0Zckz1An-z?51lyE3h95&n=WCBAkkofwtzSM`f|3=daEc?pMc9XnAcOP=KK
zI&0E}@tPmo&UZ?6?BJi!51MsfrvH6+FL(A*?nM@>2Xs&=Wg(UTw7Amzh>jSYuD_{C
zNphc=l1Kjv%NeEKk%ybx3bN30x8t#SQk*qXgMk!IjqBDVqN959F+N$WI|{jzDyw4U
zr5ApeSTF;L6Em|O$@-z!CAD|mu2M4Dc{}uSYdIy4c-oCFRxE%l2}5P0-yJkoyLn52
z1JMY+IyGLUQcBMuK05Vq`O*v8t9Jv1Ic659mqG(H-mF%ZMrFDRVCR)XCMg4x1qw<j
zysf9gl*p#w>566z-xJ;3Nx2V|y*84QVvc!E>ee%#3p}!xA~Txk%>Pb-iti-)zyDft
zFKG)~0_>oARER#9&0R=p8Bk>Zo{{W#`89Vmu|%aOX#Ps`qqFvhoLUyS?If7>J&n0{
z&v>3(ai426t$u%qQl;xvw!=@ab(Psm`CeZ2Eym}2F2ej?)uk%{lN9+nM+^L_)$@UF
zJYuQi()XPu{&>irj!U7x1MqFv_c<?_Fy*9swsFDRJ4{&iUzbHMB3UUfyf4h`?Q-25
zWfOXuM;hW>23?Ow2IXD#O9tg5``!lS<Mn7eC_OSoVPyuDLp4<6z1+jFH@K!61#spo
zmE@eKhBDW7w0%40_RsrJ=Nu1D41b%B{W3bbCxmO5^K*82Sd0Zk<ELDnz&4BgP*0t1
z_Qheea`BvLKK;DiHwVakp@J}EcHa*Lu03Q!ms#@wEFn)OwIjr1wCMw9kKH{B&cp~H
zuBBeS-u<*uNW<iWj%!b@PCvw_ba)ypWvqR7+jmW703NztG60V>c^iPouhCYhfnBww
zUt5N<DqknhDD}Suc;JxDdZuW9AD?AX59Z`<GQzY~FT$}rQY#X%B>T1aw7)~YSD(N*
z&i6zrIKVf|75jGkJHD%a=D0gPmuD<T5&-K1y!3VY-&<xnn6J&%%kfaJpK+P04yUbQ
z(u4B+tCl(Qmlqqu>SZwDW&qGZrM#xv-^TBIeGcR8x*l!ajo|5nr~$)_BjfOs@jOQ5
z%E9_%0HE#sW4nUT0Ni@)ql@_6x*=~w$(l+|1T}4f2i=wiZCQN+@m7dLeaGOIC)H%J
zplT((YGpcJI5lma2OXIv!p|9WA(ppf9IYZ0H#n1BqLo-OH_AvY+7oRs7Hu#TZD6CZ
z<!X5?X!S8dmD^P7h>PVq`X~$httlg^6t(`|Ak4*`NjuKvnrZ%rCVp>IlV=sdA-rl_
zN)@MGpQ=!wSo1=<uA8BxO)X-bNUz+Th}Xuj?_`PAtm1|@cJ=$|$&A7MKSn#>>WOwa
zcrWDLwO^vtKp*AqI|A!ioNvOE%U>>J2Zo#`L^R6Zojn;h3J$iY{oN@Kc{_Gq5Yr3t
z<R$_}JxbtT{P6r$i<`UaNE8?4yahIE)uGQJu<Jt;@f);DxKleONR&*P_l=Jc)O{a2
zwPmNLS&7&%s5xInv|OK}#}c1kKY2<VR*$2{5D#{rY<`Ux%|bt8)N52W-WidXj1d6p
z7F)}#UA;7C_b$qc+KN~aTOUPcc<i{Z`XAhv>b=#U(M|-~($^GPj5;KhCmko?6^#>)
zGfwkP^Nv$zGiP&-bNYvsb^6y04}MFaEA*=yJ<PWJHPsgJvpnGB=}ZeW&K7tmd_}di
z>aE<?IHz`2j5~EY8+MRikA6*jomxlr@XD__rEGi#V1sfc`H;EvURWM3;iCmtST7x$
z$oGzI&=4)!x6}2`eaXIw^;Z)uZz~4PN7Z3BCx|`D9I4ro7t-}CG5ksTRPM(*GIF^@
zl>VvQTHlgYSAUraI_#~d!ShSomg8Q~t3HJloIG#4K1_cp;Lu^u44%gswz{N0skZ;@
zG#NXrkBGO=;g#8{IFiU}fyrvzkl;{<?A2ipCLG53lj$K>H2j72^W#ug^RJ%W@2g3)
zJFiz0rgyPcQ&@H|R^y$98n1r&YPXlSxWufKgzeHd$%8-Q4*Rw>2+=0L2qLDp59i5u
zV?rM)rCjry)*vJ%(R0-7{UlD%w&>ev><71Pa`W1kKQ&`~6>RJ-*o7U0-;rZq+&i)V
zKJW%k1^#&`>b&W6^s_T@GU=eg-g)J6b=(=>iRWkhp7*-~oaYzyos#HVK2p9?YV0hW
ztXWr-eU$y{r?0BNW!PRDd_j3|p5{!v;<B=-t)t={$slyI)Z#3cx0&M%V55ZL?osa$
z?qTmy{%Kz9w`TxzQ}cjQ<}=4L#z&O)qV{wa`?5@weg$wbq$gVPx@Pbzir7Nf_lE~Y
zQTLG3Z9jh@9{%E5)1-65lbWe}DGQ@?ZKE*rH}U@@odL=K4FN{kMkz1(T~kSu6UWj>
z6V=E3_8MI>yk<rdxkmA=YO>7V$fwG)j+8N_>RwkCU7pRwXt_46HqSO|N>z&L0|!^<
zj4S6{K(lkUEUp!gm5;k0ph!?8uZ1HM_ju_FM=hYP$#tE#+_-l)84YHo>D+6tN$+7Y
z%Y!-CoOMrizm7TfbcV^;7kj$~?qx>1vHCr--3qTc-~874K|PC)`XiO@Qj&K0*CCS=
zOSUIps_Jia7-B?qiYqXsSjrdF`ky`gTG}Vq9Ibgdss4a#z}y0hiYG-<fJdp8^vS;V
zMMp;=vvu2bTesS?I=HP{qy7?W43&pnOU%VzKhP@Vz*5vg)S_T<^_WV^=~1=?Jh@I~
zh0BGuNa_*FvEV=KzeHjd*I@i*>ZIm|&-|xwSB{C0rE&uUdH(U?4aK&NPieMIs|j0U
zD!KVF;SGj)h2PZ68VLZ@piI7f;(EhxF7~Svpnyg8ttXA5E-03R?f{L2j4LQPI5{Sy
ztE}s&KhI>$M3Hv7Y5dTnb<fScnaoLGHT$&RcHg<UIDS`XQ^>USq=3II2ACzHM7=Ex
z_{BA-vyJ5%N@|zA36U|+=34Svz3aT6xL@HrKK_0~mQy><Z=dB12`eMHP|2R?{W><W
zGr<ygkqZ<fM}~=IiZh_d8;(>G+zq!4Kg+*gb~qEE;@cP5yx|CT6b1g!9j4sD;TUP|
zSuwRj1^#`^N<VJjh~>3w#$S!FLIM5}oV7nt0urZbH-VeM8S^bsEAwOIE6Jz&dzGgx
zrvfmKTfbXBnBy(VElO?SWBg;&;_&m*s#Sw4tz+-;&Z%4eUXSDG^W5<f)%OlAYS#|n
zczW;(reh8NN=#S?KW9alJM6F;k7o?f?ZIa+Lj9RiiDw{$C4?niy<4+;cC>W!XtQ}Q
z`up>DKUX793(o?6Q6c81tfu!*Vs~_|&Yo(XkxfEEtU{{>csYbQmQ@(zZ^k*+u{tRH
z`wS?JNQ`XTGYc5#(xHjN^qOBfWXdw!LYnvw?Wa&(GbbpoTHc;ISf1ixODbL%KjhDq
zQMgNd(Q0a4cU^qv%q?BHqgm2_en>SNF916K+_>XiHN4${E+>+Guj>c5%@OE>27q-_
z5BGi29?G{B!)^mytf+-QKDIq=QD+hFdX&?wwOCv#(TwhAfnqPhTACC<#^uk$>gyBT
z-%f81z%duRPc=5~-m=gzjlc&9%Fp+1mw>i?@rDO22kreCW657lb{}n@54@k|1=NeZ
zJc_M11Ee8O`8uBs4J(K+iC-?T;H5Xtd=+MOHF0rBZd!iMm}j;ht6mHp)C6@!D=2**
zcOQ+5j5J|<?f)<xk#VbFZ>+}cA+BWKuxs3QV7B;(k>x=Bns9rA|E~6SS`W_ncnwwv
z=`6;GPx9n#--(u9UVp0Q320@OgJf+%lsRvGW^NCj)v|1d)$gRwm_;vsUIs8e*FK;g
zk@$>0n)W|;kSbE#zWf~p$Fvj}95a9(UUsJUJ1VhXzQ`uA>Dt&5kB@7$_!sWiiY_fH
z_EHBFUO}0s*C&sr*M*8ebD~>doPe>H<KftKl}thPafY=v*!ZLF^9k<^Af}}8owFSX
zzVfmvdQ>m*#V^L|`JVcbw_c@Wjy`YC$$&W3W^>2MSTnWM@SW*w1H9y>tTzC7AHUu4
zvVYr=3#UzWNoO{r^n*LyZoG_?V{fcUu!&l<qNEzp&D$9iL6Tc-&zmn9@(t8nR(uby
z*^YRd?4J`BEt9k81+rfXk?=+1%JC(tpCg2K!AGtUF!gt-<>k1uXoMGB4{S4RFO!co
zd7KyUZ<GD&AL#b~3}Hk4siNSj#Rt-73<+Lx5iJthYhXtH)ShF9=iQ12ad-rwaMZ)M
zJmroAnNyH)r4~l%+H>CnycGkbq?jrzpmgV9|A=c3K9Zm;8n;gDf8~!=&l088tu4kx
zx4))sXT2F?d<>u19Ut&A^Y_(o(6pO1tY=~ph6Aoa^pUJuP9?0&m-*Apo;pIb{TGg{
zX&dlN(hC6=V2CHN`(GvdxASx_TASWAs>cMp;j3HKfAuM5zf2lI%iIZ{ov&^qk=j#<
zgMEqF(4cKsUpNRvsR&6QsD!1XCtR-$IP}J+VSSkGyu{{LTo1Y&N^Db6y`JltQEw1X
z;`!bO4#~M++`S5mvLc_XhN>>{I0ifiJf-idv?9C${{;TovV%Fm94=z++nc`GbKa);
z^bx72%h|((vlmW%1n}xlYnCGI*QX<#F5GY<pWBP^(fyPeToCEf7m*M-<S|F7_rlGb
zJ1{e|AbpO&u88PLI9qG@)=kf5ecHydB4m!HGVAQH4x_zz`e^%5$yR%NFKMnn#p5f~
zdbuIry}?jW)~ENgIpSZU%Y?^~dbll-hKf*2?M6FW8xP8soVkJaKBdWr?Io&Nv@M9y
zMCb;j-?qmWURd~;BSU`bL&Hj&M3qqzbD15>)5ChAH8s}+ZUNf`09{wk>Xd!Kxem8&
zp$*HoWJ^ZtcGl|;M&iq7KG>`S2!P{X;ND1FaMK6-B<YO1j%Hojy01NM;`4)u*nDD;
z?1xpkvR@jizyt`L;lwjs(Nx>4&pLuIzt*g1o6#zKvEq>53=cpA`_ucbCi+u-*0lxw
zT;h<o-guP{NOSk2R&r0-LlDe3GW>M}WviRkqLRk7V-2x;GAquX0T&(D`d(G%Aj*IQ
zZHJ3CK9-|!*Vdy**j8Q%!&b#0Joi+8!@gn;4g$74wjH<4=d^HWr#b_sNX)}YKWMnc
zyYY9E5N3?hm5xG=zr}n^b`zt!on{hnkz>A3-j?B&7E4%3)KB>M(F?wQeJnh!KoG3H
z4~GDReNOfb8{2NZJD`uPsb><G(F%K{rLgEEAzC`Cowt#1@6%Ef_ALqacC8`_^?QjT
ziS+wC%9uRDDG3<bdwmtR?lT5q+_C$1*g4=gLdN+av=otZ-{4GWwOXBK`DAG=M3;9L
zOh!O+s)2JMrCy;^aqR2%w6PfW%tu;K6#r`Yz8mM>mDQAizJYQ4_Y;>B`sw-s_aCiQ
zPKIHnDNyuzJPaVEd-m_8DiFsb`S~$;q*gQ?5WqS9Vv+Mal=hse8a=ysq+nDYH5-E1
zqeX#>78~Yx?$+4c<C^SFs5>`1H*29kzrj};wq1aJWE7U)HzpmWosIMX<c=Sk&Q8JA
zsA5<iH=EjB>8~YfYqGn+7`c<rm=H&UEj^iCf*_6&OK+*jJzsHHRqQ6SS`9Wjtv{6<
zx!sh3ZOODy*GuWpZm`wJsfheAA0HUVDS{OL9@T-GUI{|HU-?isQiTD6xRL5}s&l@k
zGgQi+D>TrXe^jqvI^Vea7$+by3Hf42W~<m()FZl_5#zC^<ErA4!;PDhT5(_U_~)9l
zJq&eD%$EzS>tvP{RoQo^wDxDMyX3{8Eo!o=TI6dcNGC0Qu9#@A;5xk?uhP%dp-^Cs
z2P*H&AXq7UU<a;`Q>y@1KHt%I)ZnQ2LG_{fSDq6BKRKuyaPoI>=WrK%dIgZ-s!&EL
zm(d5j6Y9p;{9RB5XyWn>^p&K(sJ>|aZQ`8ljQc0)=K#&+-xeYC)nD;-e!uxN@ruYA
z8S_gZYDOorqDdfq0g@tB5T#ulT`qfw0fxGadYag~G*0A(UUfM<3|$%YKNSpz&!a7Y
zzH8%cVT@wASce#vj8u#;I?0i27lyt;@Q(IQV$06w$mEt?$KzGnTa~O8mRq%i6^4#)
zFF%BST&@p0sqXz23g=uOIg1zl6!80%mju<f^!!VbVT#m&np+01pw`O*H%3UY7}Pq%
zxcV~)bM8Ey0F|J&v}pVUrh*Nc-<Om<gsHy!&PtX;V*H7Y(Jf!+w?49c9@`sB+k0E}
zXmBhTXQ>n_|E)a-Of@`zJ1b4KfG%2%o3rmbJu%|8jy02Um>H1*c-;Fv)^Y!$#7#h!
zP&M(3hJZ}%)mtd&!<VcdqS=#(ZF@@JUUWBpL-?$bQH{m?McR-a=}jAuuv2L|ut-X#
zTROi{^4&4LMKB&{?!#_*wh<+Df}Pn@oF{ttbP}D}V-ah6&g_BCDz=+?eD`&0dQJ-;
z^hE4)c?dkxc(6ROcw&9O4nbwu<(dN%8oQX2t~lkIN$Gk>@XdSuYd|!l==mh*`F-PC
z01UvZ=n2E-sJtJ3JeO3f;_=>2ZM<}cjpy0+7>^00kG9a~EC2KT^Hm27^h8p0J1EMP
zK2r@cIxrFEm5>5Y_1P;glIwRQ*JiW2`V=6_%2+;cLEUT_Ky+>8NTBm)69eE5Ve?_M
zBYnPpmV2c;s}RPj8`K->y<jwl%~;9~={q1AIoLCGlmDmrn!yIU8m)89b~W;{QNOVV
z=MP#Jb3rZXq0ol{ax3iMeX94_LnHwGJ}r);2UgMHR9Zu(%Wu)-4#GcL#RM|rl>h}W
zo$bntlIxOd5+RIhdnsX3v9^dgi95oZ#c01b(cS|2IV0XUCB<PXPNCc=_vv#A@0UK=
z5e>6Y%#fnvGjIOe9w_L<wA8)rbaNfQ;EnZUTBacE<K^&Hu{N@3@0w_DDyg=NXm96e
z?~7<}M2%AqYY$ZQ1UG%0H+>v8bUzWF2fGp=I<djZt^j><m}^B{!EyXK=h0MNh;HXP
z2|2ShM>R+FJjrW#aEy<x6~9e6(WEybe*GUinjPTfTS<;~3HCp||LDfj1Sq`7Y<4Ck
z@<9lTBp%#AN}FHtqq<3?^GKx4e!N#5WO)^FGkY~FK^j8I(QaYP`BX2ypi0cjcpPqa
z9By(PZgCur>>(7!Q~X$zKWD#gp9Dgg2{)U0`*8}lj0;=|G)$3QJ=j~{x`zkYL#O~?
zxm@6~5;3q8CgfWMZgx1+0Q}{<G#WnZU_47aIa0Z5GVTLWL^-;7@U@iZO!B*S>;ew5
z@{Z!pyB(E)5;;fLyyj=>w#VNJp4XZ{sioEcIC!0Zy$1UpkA<qAx21>whlaBiSnVU3
zdg+Z+($d$XR%yi@F#yR(K2+2irV(A_(vk4e-40d;dUP!UkDB=Rf7PzuQ}`$~VK`Um
zBRCk!&Soy~GiH+^#3skylKya~=V8d4`nau8$;p-WcZiN%ELWm2BJA4vt)|+3SQc^o
z<9m%FU#n8uJM{f+mw@_*#UWR>0k*SQU3P$L;k&rY`}giuzAXD5W?z!=n6I95PR>RE
zmB6efgKOT=I=K@8=`4Jif!U_d`}@qh{%6fI@P!+6Y8aq~Uwu5^C2qQ|F__F8XHFna
zD<Hs5A-WPnnN(f51&JK;K(=)cChV{U50@}g`v7#|g!>MFZwC)wqq$bSGTV&HX4@{0
z{FI(dQL3HEVPUJIiX>w*89Vkp&{maF6yLTJ@d6+se$`RBozx#sW>MZ23cD_J2!o9J
z=E+`#yNDUQ`F+#|%Thq;i;Obq{TlO`C${^G?oi_XsK;e?*sQKUz&fOn($}eE`*fx}
zwxFsh$D7p+R7(J|+zt^`8`%%m8~9<e+j@?IVF;>35UjZL)#JIx>|(LP_r+o^40wIz
zMg0B544p(Q&)5N8$$z@$$8=WPHMn*t9v(Huy!Z5<gi9&r8Xa<-u;-e3bo)APRi~6|
zL+=?+l6Y`8`0nW4+-1>Sl8oI2$X0w_pv!FYx#dRSPXMM!pNr-;`~Y;N%^dx7pPW7G
zfl1%M8mQYc_}u=4=~4M<3IB54-CI<1U%<Pb7huB8^6IRH1e$aKZH)d0T=sJX6Ov~V
zdz%d7YF~>mJ{of+S!IEce;u6{J4+FCm0hL2Y96LV;tw<%34FKv=}!Omne>OO*G8h4
zf`wzZS@T&yF^B#V;D-Q0x9?d;aVgMEMt20;y*5>gy>`>(U(Uf^(n%2&7_DcV%)uRh
zZ}`gmVb|H(2X}7p>qC(pdOr2)xm9A@!l^daJG3>Ir+xM%gW}%w-5uISEa~e{fVbN{
z`8qLHn?C*n=|@=oOAnRJXoDd88m{$pykq`4+mC!B{<C|I0O7phjrip<8se>NNL-iO
zO>~poa@f5m_%BPz5py5ZO>$>s6iB{)t8yP3wWoQebD!T(rtN%>Q{)5@diNqxyb^FN
zZLdj-nA<VK2?(Hn+A-b_l5Cz`WQO5VU^5RtV1YZF3tRGb+F{gSSQb~e*F^g0eH_O+
zpiIzG465>r?PcSWg!kjq!)D4<5b#fyHo37tOZB_Uhwybb;yqK;eVmsg$hYsY!Nul3
zW>29Me89pbYSaMDXvxiu0I{=%&{)t!2|R22-Yt`BqG?rk>l01anOv4n;M{X?bf?!U
ze<*M}yCq9L%e!+DYhPqP7IwfgNDa-J_Y_Yfh_+!l<;~J{KMPsO;wS!%&FwRfu1fiI
zlVl9KJ^McQ^algpL@otGjpY~FkKnJM>5G)18+5C=6)sr+t9>1}#=bVdOx?7h9R(k~
zc)AJAcSze(>Jl`}Zt04h+dRNL^$9y6hGarmAw7e_r}h{RkQ(qUB%pQn_~QHSW9tjB
z5hK>;;OVSNXtnK>*ah>C`1u*Q`pEDH7ZIRY`W^_)*^PEF+H-w02Kgu_H{4-P4eL5=
zi0zLVWHWs3m4Qr1zdr1A9Di(mLVZYv%#5U(G{ARhce@=-gu~btc=6q-w*S5zc%=3e
z-ZHSF^&F2i8JkF?+})Sdx^NHG?9gv|ui^3S`tdox(bWrWD~ZP+W;=K`vsrt6q@c?F
z4ku@_^e@rzecN5y_({LINc!wqDiqYCZUk{O67=^<sd&m-S?u>PYHodKdnwgFZqllF
zNoG(3a+g4-2;<veM9WS;#sRHl7aa|OxA61jaRm)#;Bkj&O6@WP=fk+Wrszqn8sIqU
z?ET&D;<t7)RgPMr;z|u8t?Y8SJpJ#Z&u{cKPpKrZC6oc3y6x{^3|VKc?JYkX9EvAM
zcc(?1XMTcQ)U+o=MaI^+>ML{dmyC>Rg$4k3jbTE&ehu60{@hh^XH*Y`W0y50`}Ins
z$==N6A<c5GpH*A9X1&&Q4K65j7>h~19c1kd5N=yL4|{^Z{sEdtpq^|o2w6Mxg=wEp
zJNj$iX(1?!AK>C@&FRW-36MCh0w4y}tFiE#BX#J<nzMyt<*aX~w&wj*f2{Q}f_=Fj
z-+c+IKk|}Sk`6js425LCMFf6|$g9mh#;|v^yNYanNwy72JjFItTp|)G@~Ya^3@8BH
zQFO7`C+E2rcesLJKmZvOO=D$x+YM+`YBe$fC+V-~zgvFhV^9Wa@_R~NXt$q1d<pjr
zc-y7ve~v;voJWXnNRwy9bi9wRr514?`y1m*+Yvy>Wzsf1@1@-aTyuUos!#!pdLiiU
zm(~nU!E@nN?w8rBN{thO5K}aqttS}79>BQQrBPkqr{kp=AqlaAbWfC3LLdWbWX7&k
zr&y-O=l;Dm8?)o3>*A#!HC9kHb`5^P%<re4X35T`AyMJ8Id9tZ`}~wGvyA)J1DhY3
zJNb{FqA*{9mT*^1-)8hxmaRiU_0QTl{UY_LC%fT<t*1`_hkci9jE*Z_jl&A@5V7vX
z3gO9u)Agntm~2wuLM_hxGe*TWi;?2H(n1(htjcxc;^LM;(6;4US%$w=VOts$*<ER6
zRtL{4xh#Z1%sFXRaQO}x|J@qKK%Dz$H?*n4gPr&8Ks2>Fe1qT-?k{q`n)yz$$g?IA
zu_2ET`gVOW%9_V&dI#J-@K^~4IBV?h%o%8v@A<ElK&`29g=dFOpNr;c#R1{)ikNUI
zc44iO0?L~DeXWmDm2)dY+@)MoZ_~7wW~Hf8r`OlriKD;Ft7vCPEij6H!Ct1`eYPE5
z9v5*EUh0kH2mGg$7G*S0K|r?Vw_)BFtLzqWhb+lx;8Oc-PSYrp`wff|DU#vbervIn
z;>@sB|K;<#YJE2oW<GJ<4VHjMo49~S{-K<rHTMCJRxvr`$jw|g+c9fSA1aSuCs-$V
z0Ira~P4bB2i7m3;t{?hhTsl2zR;VQH#N;lZmp5~G0Iv(Mp`I3`QLOu|Nh5ACWs7jy
z^%ZWZ=@lcq>du7+N}!0`EypFs&?vK|kM!XTHc@G`dlNVpKn_qlK~yjFs@_w*Rdzj`
zZp%U7eM%JYQ1Bw~?ngT@9IyPKT8C96m)5LOM}DB!yC%nfrtma>+xsV#Btud+1%Cy*
zbOk%$;QIO@qYuNCr+nD!T0vMHR#Rp`Qa&tMU1mT49g>8$UF+A8w1IN+`^o^jb$o>n
zMRbg_yp>H*_xWW#CxnTuBImSpB6p}rZY@C)zx1BAKYadF#oI|Qw`OReYPI}otqxD_
zOp!$u0ewAev!P91%jD_5>$;Qmk+EsDi9=8UJN%jQoGxNPD4_^tj}f8dAe1#NGN&a$
zxot&#ha2tzYY>w=Z!RKjsvS9Yt02<9IS(MMRx)kM9bZ*$UQvR@kT<@-ZB;_J5K2#$
zPMt<BfnRt)|5-KCF|V>|`AN@$Ns@O`m8m(RJUK7(AeUfOB1g>)zwhXt53f3eRie~k
zDTTzw#{o>@5VnlBg(TP?i)ZPPMakB`Fy!%v#BRP`y|+isSy3pDV7}M0@Gm`8qy7SG
zFTNa5Q9+~GZcDbXg+Awx-~wfz#lM<Xvz=Mllp+2&(<(QI;bB#T?#)-9Uul(Y^S$fG
zrZ?;tL2AbMY|Pi(9NMMe2(O!-n;x@v;Wa8YqXz0mhHz9lKgHI(nfc!Gf0a^%qJL9<
zvuqa8He<xH);PZmicq}Szach@YFk`mZCLg>IqUN;9Hx(D*5q*>^;pTcQ!V`yfiIV=
z_*9{>=r%@qR#o_zM3qT(qU6jwXTCDFGIlKiY!);GpE&M17S202dNfW$n`>9IuR>2D
zrx5S5mhwl8XBr?c5GXw6SO2Idyy}<uXw{b7(kZOrbW*xq3I-tBqb*+zI1SdYb@1jb
zoGT(`s^@9bDXl-uPtwbLscuZY)$+^nwp9+0M_}-j6s&n|=4_s4b+0Ij3Th1jhX>ih
z={*;wY@XCYw<QBAgmH=RYhxo6IW+-?Dhq^CmXC0a?NIQKCzl~KQn})>S*^pmrq7y)
z0yzI#stBtJQCk%i!g9}^vT?mmSl-3ab}Ia{ca*O&XW*ASoMHDvP1hA$i`lM95Mi;1
zD*Rlm3SuWE+G}R`#QdqB+kmOO=@eIx%wvY|b@BHxO*ye;q!~={UScJ@0@aT%Zjs#r
z$9a4et7ux+Xy?4WAjz0&jHM)uBsUb?<@i@rWpLO;4jd&^Y?NJU^!xk0m=bsV7n;CK
zzezLE=462hht7%E&WT(T9N;z>A3J|jBRKMcSl%Dw_rBVDdy2POYOQ<I-Hd^tw@NcK
zKbnY<T*<qpb?g*u^zbX0p2q5|28)rSwn?M(|A_*JoFGY@c6;Yu=oxRg2yFi=@*9`0
z@8s*k7{$rUV<a<iQA+UxXX4b0c)s~W>e(g!SYjoq*JrnFG?`ICvF!+dGB5#>RX-Y5
zptIBwBI5ceKZgIQOa2P8mvTk=GXBE>Re_gunwDf5)!h4)xi^;igJtv~F;`Kese&?c
zQHM-@p8-c|LZ8qDZW+rS%C0)ezXJVdH?{MHy1Jdn+|RH7!NTM@z(gDA0{YK|c8@>u
z3b^mDag=ih)puhxlvVUAr@@2uw8;*<3DMY$qYG~#?rHq1S4S0E$U8OfRPZZa#h8#o
z6;&{yNQGtB|I{xXIhZaVo>cSdkB2&d8>LAru&Z^*H&n;<j3J2BHLi`wwGp`nuuni$
z?!=JR5H~(_)(knMCFi?hp1eV#p*rKzb)6j<g5MN~h|W=^TNR+vYS#|smg|hX6ewij
zRvlJ>Vpx{ChZN{(o6ttTKC+e|i=Ppz{G1d!Ye{+7U)*1;F`(mAQT%<IYn#i3lstNT
zsI#p0P0m=qm1hHq{vc-qnf?$@1L<-}XwB+^l2JW<ymF;1W7EKbcznN+Ih&-Z<wM37
z^NtH_Zy^(3`N{ro+wW&J><GP|y6ytEqG|i_hIwMiq&XVQ`#GZbrMIPb;>ZW$;X`*3
zOXQ;DqWQatMZc}pc@1aQz9<BLsrXK0TEp&eneI3(zF9PXmEb6+u9kJk<(K3Tnr_M%
ztf}&>4dfzoHy<S{AqVv5`v#@Us9GhYN2l1O*UK>do(oarP@fPJu0>2x@j~-`QEEAc
zc{wsw6l+Tt30|zMxLndpt(Zg7iPOg>U;6g?N;0jAa_fn#Fq33?r&SvZ7ZVEfoF}#I
zYPp{rMjIFxA@*TJe@1aE(kRD1GcL{^%cXcVdGYuAR(es2Xe-0e>elCv<JMfqkPddW
zT;DH|TCB-Zj()KE;Ql3Q)b!wj3TAX<u?9dE9yOa*uGxFudnP$~&Glnk%{Ep-Fn?6l
z+O&?Jj8(`)<f{?v8|>SanpH~fD2XeUHxkNkiN1drv9UJZ46`$2;FlI|AratGA%boe
zr>Qe``7XV}NkgoutEKSIW<@&rgz}?!6!Aw*C8K>B4rcis;<YEVkg71M=Gx9r?QYCN
ztU?R}?TJoavh0;Fv<w>jYzAk7d*Go>By{VXslRsw_tcJ`Q=6hWyV$il;W_tI-00(X
z|Llr>5&gMO3!_aku<?&hC?A_88E4odOVYKg>U%w(4l)~sj$OOO->2V`*7NYDk*g3>
zpw>5}=}%m0xw6=v@Bs7ch4(~~;7d<ryB%BsU(;##@O;Uo$MQ**Ibg5)Sg6EVyUcwD
z!*t`5OeJ+EldSu6xS6ISYZFT9lkI17iA?xo7)NaFxpP@6i?q0G0fl;{=LQyr3X579
z$)QAgI;<+3t8&umDO&Q!W7Tg3gUAiVDk~ba^si<iwqcjLT0}Qi`ilqhHW=2CT&m4m
z3e_9`8%g@jhfl{0d4&m)AZCL66yn3TWP(Us$d#vLEK{DumZ0vv!cD^;{=wfB#IIUp
zrZht~`X>H|@F11M;)Zevfg|xEFc(XQ;3!D43zsM)f%YWc8jvK6aX)!ueb-gOPQ}ig
z6tLo-#d-YZ=G6=`I4@;GGjSu`-h6IR{TL7rZ+Qk@T?aiQ8<wUK37P2as48F*{f~Hn
zpD7J(ssG^Srk<UgP2%Vf7c$L-w3-VZ1voWDH2>vpl)8uvRHY$FL$zyhu-)yKvT=-C
z4sLY|>k4hDd1<Cy!Am-fb(8MGKZz~+1dJ98)oZr<Yp2p6IA@6e;t3S8{ENTx#w}iO
zGCSNiLGVr6$CkvJ#*`K!p$(70U<4iJhDY`PQFc^UVn3rcw&c(VwF$KiUaf32A>ES;
z^U?$)?$hr$-CPxs^gV5O4RVG3+;_@J{e=z}r+UJ*?1ZcANrR=jSz6z?rJDuL(3h<K
z)_MP?581Q}+0-e%QRUAu?V}3iqV`Rp%%bv5{t~D3o{I3I@l7#7I*Y4pD$35uAZ`+h
zs{b#FpPa$FCP_`|YY|Ux)0M9+13!ID7t{U8DZZ<6+@_C{uV<2S+a+p%6Gr{?&f_Nq
zl0FSe(ntASnKBmXp-IO3AHM(4Q72g*I9K?-%DK4YsKdgEG@>7ls8$~C$Q%90Pime+
zQ~W!-c$aHXlm#StU+ThqIX<*MvG;`^+oyQsa7G65i>oBUtZIg2)E$RAF(5G@A9Cq+
zPjl(IYmk)Auy1-ca%s7MdI?HmHjW3IoRzzLxM}~q{=4W>v56$7aYz6B$60sKS@(<l
z=X|uwbr=5ORu*j)@%tQKT)Oqtv;5TLOLI@H5pHXIdy*4yD%4aokzB<p{|VWLcSzA1
zP`eWJq}7zn*_qIoUr*_R&aGyUZX3uFSM+fmr5&BAPkXPY@zt78ikctp<GJB||6{+p
z!+L(Z^rlxgZ+ymm&XuriHX$e!-8Vyx5QJ!~Vp&EJY|MW}kt0%BT_DX6eQ(+@T^ZAA
z>f4$DB#lrM$K0PnDxk}lgln56jTC5ZMbS5U$86PKGa7Bj$rX#Rjb%FqOrEJSuYIuQ
z&Ny^B26`TZ(V({^73<sP&s4$x12bg(M|%VK4_0OIoO4>Sjz9JwtX=Cr&RwzY$_l-`
zGI`E<#qQ4LS*r}=kd7<X(hExY2d6U4o1FdHu9VgF0$NXA*U<_7>nZ5)o98XoBh?Aj
zf)BT-C+s!*<($cmITN|Tlj#oiB-J;szCr!qc;zCTjV<+3TAe$wh55SqlEelXA9No=
z%#H}%JIm2-K`zWQWTG4?f!gnF@8EM?9c<<=esp(0pw*_F>0lp*czgvr!4QTxw~i@e
z0{?^i=p-8+Zj|$k5!0hO_^rBjYiqivZmWV~FZssqby_^HiPV~m??(}2xRya{l@9G(
zC!FaHXsE?1M+Ax*Guc5*=Ia!iBn_82D)uQ|5DK;QIqDRiXhkK*97Q?xBJCu%tBPM_
z^Zjg9<G20f|Bm68ww`o-pX{aztCo=5B)8#vuv;A2QK;j4@JyVV&T~oo>1^Bi&){cX
zz#0hBDjMt+pe(Ga9AsPr-}4r(hDTk8n!v}>pNkI_uZsqYk4mdquTo~-#ecQiSw3($
zpm~1Ri!#PfWt|K7KxJKJTxJ}3qbW3zMH@)Y_wH}}e`n@Ds!ZUCj99G!WRe`)apwJ?
z!RU?{J3NvALH3v3q!aJ?`UC-vvoizP#32*KqVIDSkyeUDR1|OAwq<_PLb=@cJ%6)q
z1=d@*V40ADXwlgD(W?2-UYp-IaXiLxd5{TqT>?p4HdR~KE&7cUNdtJTo>WKgw$m;K
z$irfKsw1a?mP$?>#c^gz>&g6W?26yLqZV8H>_u{PeYE$WL|oU5R!0}3kQPq1#<bJ{
z4z@Fl&#`~!?Qx_WIf>)UKCf~2c5{tlC5?aKQZ;hEs2c4K&RRChao}id4{CDf-J4S#
ztpuAnQjV-N)4M|X_U0}|bGFAW_B_82OOHu+(G&VOjzsg1WLYQ$8FTDujiw1^Opn@Z
z#7D9KGHN@MYtvX5IJF@>Am06;fpHXhC5seaVc534Yi`rqdIEXKzoyP5dC4RFUl$tJ
zU7pe8+OdGlk;(3sy%t9BUl;n?9SN%;hcr3>m;9-H6_Px}EM+BL_g~!JYKSt84rrRP
zGQ48%*l*Mq7d-kmxoKVKb==l~LmG9XrN3{R*Obtk(6TgVBCc(F_U(*87NwOyerM_O
zs@Wd`99*ZPqj&UH{iz(dr-MRw^m|>|>F5l4m6dbVkw`#Ow3Wr?nPvh9X33iGbMLrx
z<UeI;_IB$^j+zj1;FgCQ_Ar~Fm4}mRDVH;NYwMYz*+qoZM(O~y@S>~;Yp5z{QaYK<
z-c-<7RY#_E{io=^ZZ|r@n(tb8u<A%1zVdLo|MG1jN@=L!ap_3@dkU=os^OK^{{Zip
zOT$SkXow)pX2Lp%(?7H4C-|pzJ(!Bj;bH=w*@Lfpyfvn9xpYK7WnK4R{XgLUt0C?t
z)u$EIL}L%8h`Y%GcLlWvD>Urd<hlCQ+`q4@5QK&q_B@yhGarVLR#2yOKA57JrCHlX
zg%={Z@7xt6t`0i>T=-qHN@CEFz8bx>P<kU7&YQ73wy9h@E#@p=l<nA9@R_2`VkzB&
zT!*m7tzmibWFj`wRAABYWW*F)-I8Bleq64)NA^f&YiS%ML%)1#dMoksd!iryO_^s=
zaJKori4e*$QVF4knFgK`K!VTfKwgs1^v3W;rOvg{kJon9d|f<s@pkdHOWiDW3qc7|
z%sRjmQL&OXgr}79oc2so5$W3eJuA^t)Dne`4>Gj#nqfJ|Q8^6IAWYs?Ze7eJEy%mb
zw&bcR-zT(qBYGV0dvxRmc|jZR-t&a@Cbio5!`_9<s4M?~Q1)@pjcHwlSx&=^@{RJH
z8}ol0!UIB-{Ch)N)(naH*$osY7R`R)-~YVblu$#D{Vw48)0I@3&+zSLFAYZ7VXXL&
zrv#q^UxVz9XA%BZy3|1-(-tY)Ryx*#0#JX!zL(u7*0eR;v?@e%GAcEnm`$23mDc-^
zFj7aO!i8(=*BV95)x}ldzSel<%v$39P%z~dDYE_s(+}25f-W2wR<Nd?TYQBi@sOSy
zw@y?Qn%$)2A0oq#(jo!_Jo8@oZA@+vjhVLswr6c)&|;$sjr}d_fsoh5q<OCid%kRw
z6h*XBawr&jg^i-9Ydgyu#_D8zFk_NvmstxL+%)oH6JpjXWjIT{?NO(Z$15zkD?l1x
z7~>PM3A(eO5Q=**XVa}JM|=O*mtt4bcS=Eoj=}WIqL(4&El4P4PQ!QV$9AHLg<2eK
z;+{)U)*C^6NT_DKESfaE@NQH%B3cKh*PEAs!TY5Ccv;L?1w9vwguuZaO;4iX`2(6J
zna^TFyt|p_<##(~O|LFwUGE!w$$e63Ic2xgJ_eg7<Xnp1<K0vxm2YUr%gQ%tRY}Sc
zCRz-R?@HZud{vlw$1!A8*MJ%?%NnaNwZqed{akkUF?%~PYB%*AMcgcsx^7%u=?cSb
zPks`z`VNxUTdzXmg<sdm>N;%f$^Xf!d7$xg38Et)(_{~mGPjQ;?^#~Yp>v?4SWtE?
zRVeC3&Av+1&8#D`!{D({?9!N{cR0d?h4g^0P{|+i{3g#Cle_vWICYkZ_vJf{3c1sU
z^@6c=?W#HKu;*?I@6@WbnQ(Y4`nm|TC@RiIV7sb@vAtyP-;wvllk5ebeOMO{pXjG~
zu<e}GmXB@mraAS-xR-5bO6o`0>s;-CU)sYV*t$*Z>B*Stx=qy_EW|t(yJEyP5n%~Z
zK~Hji6b^Jp>*S@PQ(p@{$cV>>YPZ@-x?Yfd^?qwZG4-lIRz^ME>O<R)FVr?4rpyWo
zWmtY`(Nun0`cW6sJBZ7#3eV2!9h{Yc#;=FDy*+$A^(N0C(1JGPL*IM4_p@(eKQwMS
zwHBc*4=du*X%=Iv`K(DdWyd$84ve^4p^BP2;ues{KfHR*)#h-)XQ;GBnnJ8%Yup9d
z;7$dq_zO>Ms=Roz$K<JS#(z{JP@_<*G+9)=ZW7wX`ASCdl;eWQ+zvBYEHa373Hxv#
zFGUmc#xL}u>qV+CL<A~Pz$7y*_PGD^1`w)7<a<ZyI=inMR9CT8`$`l>Q$Qy(@@q|6
zFQg8~hIWdlz%Y0IIoW7rgj!61$a|Hf3|KMsk|=><AVXE~x<IF(z}RT^jQ2fi>4k1b
zq-wVRedP>*d_w3-Uz?XsLs6(>0QGUxEBWoGfoClxQW=K;uS#p$g(?zv)F;Yc&7phL
zH`srVQs<F`;~hODZ<bTmD=pmN=$H|eLt5RjEYU&PDkSKFrPV~oGu;o>wHTQx6<YOj
zk5>V??3Hd`I}GoI<>qdeK>_ANMK3T+u0<=&9bzI6=EZ`W6r58aV~6rk>`N3Tm~jzr
zH0-$M35m|<fsM5_)PZe*Fw3~>J~^b#be`ZQY*mHtt%v;PuLR*TEgPg`lO5$f;v|ZL
zj@s{x$L-yl)##Wgbm&PPZaT3n0XjpQFVYy)KZRBKs%AjjL$|3E<Afpj3rc2!gmaJ+
zEOISAOrD~1)P<kj_towg#3>Zo^>WgCJf93G(lYSlMLB;sHGH@E;bI0Gc%@(|Kd|$0
zHx74j<*J*o+5tlwt>7{#7WYA%hc3YEJ#(@1OVU>Z$H#7N!UosP(63pSsY+Ojd-v1G
z^)4-hj;1a64e0w$pH|S9i_hO5|LCTz4#rg0SO}s~6`%v^#2X5*GSCJn{r0cwVEm*4
z(zheKjQ$kK)Htjzz)<Y)vhr2$G%fE{gZnUS*jRwKSo|gEtJUdUUTB~~GMnMA>0Cy_
z`>OjWWZfqGsHw$<(m`b=TU^1)60YN{JjZI+DDTeCjLq>dKV;o9$HTm=^2^j$0=rkE
z#zcQ<fU1)}Fj@+%zaA#6SV{(?Xz#3K!TmQfvi$GMJ+V;j!!&o+*)GjLHtvEkuZ9gZ
zzx?U0Vj;snNKQ=c;Fx%fE$pLRj5QUc@{qicSV-<uH<>3;4qjCeLidorSo!LtSw0#t
z0NFydq;*B?XF+|~U&-uXVLy-c>x(kMC1Nuzzm$$lMMW!Xvov>++Bw5b17<P$i<<03
zJUv@(2lD{jUUEBs3AK0<F1uHoX5nu8DqKh{{M>S=c8>&h@qAjNCdso+R{SX(`n2$r
z7?|r-+81Ur9TnHrz!WNdKT@<Kq_u&8HEwsN-B7!(&cnJ7M$P&99-JvEw%E#%6&G@{
zsT|Zc2|`{TSSRuL(ZPRg5+L>dw{7y+5X*TZYi1@)0TSC8?A7x`iHy59${7ASTHQnA
zIcp!&3<dx4>7}Zg_E{U1o>6%*=(nhaF|!PY!mwB2dpdS*r|1qq2<2G^=>X1?Wt7-W
zOi+KIt{49E^TL`|JZNh?ID3yRD0{nu*rO`o)iE;MdO`A~#!~(=<&K(#@)ykG8>|!4
zYvd__RlmP-4A*PzRurX0PIs5dM-<@oP0GY{O#qJ7FrK=f++nW?2}5;2iGuMw7-M|W
zy8q+uwfqE5av0S${9gd3Kv}=gA+R)W7ruE|{~Hzg{r<OM5BZnF9`k<)d(yu;u=Gxs
ze?6?nzbUX()h*kWv%a8fU`5WRf~x{6?~L&8_?3<I?+L8T*-_9luqJ0u!OejUcP9A{
zusvLnPw^iQY{)rK&@Zs%&c^psg>8Pn8Q79@xZt+H&fJYf<JnGx{eSFze^49Qo#$&I
z3yp`-Fbu=63_}=(FoY07h(kD*5JDJ22t!yxST7-1QU6qTOGvF&OEy9hk}MPAqZqH(
z@%kLc=k@t0$Dte-=Qy0>5TB3rvJ|0+*T=dzig+nPIEru%VSNte-1j|647QWZ{IUPt
z)wjN%ZeG8B{eFDj_xpX1ZdfGe%Z`(PYmPI34;<$KA381qK6YI8FX6iaH!XU{)xhlo
zkL;p<)r{;8|Au9+<2qm}!3=`gj+_1sd^Ip_HrO@)mf7IAjS-()@*L9vlR3ri^KV-U
z9d`)cd+NNzu?V=&u?$%0SOctaJn(Phy8?I2Y4%V+XU??u2aJ|_$3ws-$78@&ryg*>
zb8o<C&an>$OqMQ!j59TG&zx^R9!R%{&J2Rt1oHyv=6q)%oI9K)fSPk3pwC%}(;@$&
zCFHDulzwM@VA)(`KNZNb3_6<tk2_lfS>{svS^q4?z?$WhbARB0x!isM@}G5fK?&tf
z1|y^_#?s?hWxo{2HP_m&K;14lMZil0uQ(mhGdTUB<(g9i?@7+h4fbn+0?Q4jk6>sA
zZ#nw`C!K?UGtT3Hcb%sI=bdK(?>jF99$QwNmx6jI=SHB|oa4L#SZlup*leGKdagUK
z0d8U>DU0Sd`%IwB^2mAPNyO<vShhNE1qx_`b26A}?y%ntRM08TnLq_T4`!G<oRDHR
z+viEUVt7AsPNz9n05hHIfH}@hz<lQ;z#;_!mMR8lJHGd!nX}&yRGVe{N}!G|S5ltB
zDkUvYXIAX%fkv~}z8PqtYaPchDw#XjpyUKv%zpc$Ks(*6<O8-5>`;mT%}Qx78^<KU
zJepI=AwGfFAXrGtN>%R)tthpCUZo-C!Gk5(4+i(qex=!;Lr0W0e-8FU!Ag2S>A+q|
zF$03H3D(d<3g<7UPbf0@7nm=>diu1Y_?IwmgH7}~#p~auhZR5Ih!Tl$EZB<2H@Kf3
zRR#hD<_2X55c(n5MUN>baGxlr0b?y4SIz-WD8qpk;&GOxeLX4QeX!+UlPY`C{IXQj
zlL=VglM}bk53rRx{7`D@$@iZ-aIL2Z5Zjr@Qfp5swmCiJfM7|YdTD=86<Ck1o?8DL
zVg4dLrHlj_dR7??iu9Z^7Ie@H%6L$tmz0U#d3sfu>b*~ID6>Hy<e7s!TgpN(L~kog
z!G1=ktOf@eqp}e^&X|;~-t_}b%C^6jNq6ajr<g34F?jaCewWGL%;dV#gBK1Iy0U_o
z4wSfZ{W(m5s~~WjDRvbHuP|k<vf#B?yunnsDuOqdYFBmd42~#*x0pItU2u|VbT#5p
zaJ3{nJJaH7C)f$t4t1MhI$f6FUB==Pg7b{vvIp-oc9$AlVN}<l;5u{26%201V%l%$
zTv3SPVsR9SQkbCYD8#NLZe*gaV}M6pCn1W%+<5d2iwlrB<~rYpn3JxHeFo-?>vCTT
za~|hBpm$yE-DEDhuJ@(IA~*AC*UjD;=CbQH%yAN<GFM&Gy({0ya^2~D#9VjXgQp9w
zMd*i{u4Ta6t~Gx?Gwpf+G1x`dL#WRk*JHqYsy;Yq-ct8M`HO05UnaAxW&~$&5A@|Q
zYif3k+kN@W12qrId8iip%bCY&Nna7GSNHXm#v)91uUgqx&Zeq0eN}i~^wqK%YCS~h
z5Uck!#E{Kan_>}oU-K?(BiKQnVkXG3d1`C#3R|e|?_Fn0)UMu598vel>^_x=pDP5t
zcz*Ty<2?!Sa9@Pr0IpTv5L>B=eM7Nxf+yG-)!|QL>s1ZnVTg+RPP0v_FBSv$o!f=O
z5UHyEe702$0q)0$qt?EW7_wbze=Jh%8)X@F&|k`m>Ty6whZKi;Di*Z{Yu>}YF+A)0
z#tBZukk!<)5Q{=Y+Be1e)C&-KLR{K6%ZAiT{${pcy#nY^uR+W?t=@px5|7JV4B0{T
zR^NgpPo0F>jV-f@J+97x_smo8!uct6zHiB#qu$3JL0#!vWzVYXfd}jbb+d1Sy`(<s
z+hVV{QQtOu&20$j;<lYwZ*zw`C1hl8xYNKId)=8KQ_K>wx7<0Qbav96AIf59ob{nx
z_O81qR1k{+ETX#<d;#$?>^!#HF(1v|cbA8X*%fzHsEl35-j>~T*Miprs}-uS%(xpu
z73?F&olv#qxVt%!#UXcFC@b!lFSt7bCeGkC<5<GY`Iop9w~T$dTk&smX>M;Ioy&Cl
z;konf2=-rYuz_5TdkE6=-6sO+bgBDv!c%ZX?sK6!uGBpYb%R)<FO4gAj|69M>q3oO
zm3uU1X#;n-TK5=~)8HNlF91F_)WS8}uY}sMp9*zyZSIMXh3jxng#^y*o(<V?zlBs%
z&LNI-&xL}V>|TI14%qNeG#2e(40gpZ5!KkW&{2%Bl+ZDPCvhD@X9%9>6!%g<$9dhW
z0VC(f(Fl%Af?Zt1y%9SHe+LM6fiDPM#8x+SksEMt#o~(4<v2nsg00+;dmH=(IVbJ9
z$eqx1fn4siW`u|W;)>8!?wn?dMG;V|9SohcT+q@3#oVx#1vsMR#ymch0~or_jcNrk
zFCV(ejcLV!GHzTe1B7%)nb0a?UOset7fxe86}m(4B*A;!lvWMC8$5Vuk(<@(z*B=C
z4=r<ZT4SJ*ThLknm$Y{9V$E7-pq=#j8o>wgo>|o_feLOz69RSImSzvMaNC;dU*dJz
zq0mE#>1l}WmBgOrNVDYVX#>=HI{ZVDugC1?q)-p%H%R?Gax@k4C~$SK#~aO%j`#TC
z>ZzVcG+R2`GZ4*_F7yn+YUpgw3IBj}spoXG@WAn&bAG>crDr%=B3<hliSCnb^o&L;
z!DEj3&q=p>#>4&6WY0vjMw;oFiq^+g2L~p5X8mK*-JZE<lQe(mO0-qF-?I?iFRk<}
z!8&fGXLTnIU+>w7c1fE(TX6oUXFGgFMm}AXkqtg$RFqSErl><s^QA{MIn$RF^~pKD
z+-NAa0u>v51+eCdt#rtmLeBRUNBd)|Jh=!`6uHz_25T<x_0d7`yssiUC>8puqsQfP
zUtRQ+T;*$wiefR&d0^1j;`hq6zV_(Z*gDHP?(0mfXJczjxq+-J8^vY3;`3Q{o_^6M
z0Ft$m-0ZW%iYd0nc?@gJak;H$qpwP=_Nh>Qhwo7If^7B$VVwm%5WOUGzGysFIVQ`#
zqyBTU;ydP_lfAx^(JQjwcP4r*wjPutzVp!=@__GR^cIeDqLcEF?=q~ZVPzeiA$V6l
z;kyc616I)BF94(S7z1QY9lejA5?zU*eA;&%T67N9T#7vGyXp7h^>1`t9`W6dpKlU;
zgyW?nDBhE>dOc!@Bh>0hiahF@K9UkUCzvLW`R@2n$m70yuzH0x?2$})!nYV(u^!3U
zh56!b-!k;rly41i79(C`2KUEMp7TA3tt*cd$qT-R{!w|!_ZSe;A!W5!A6sJ{DcyzT
zIEp(`MX)x8@<#7oSUJM_@koQb)td_IL|8>0X_mKpGyJnQU2is^u{X~@XEXH{M!WF1
z9BCuiabU2wB)V=(@7?Dgvt{*G`loEUy)}@tpts)N!5g(;=rLJ45Pw8$7pz%$lNJr@
z`E>1Qc(2)@9qUWuv$T`gcW7tAseCR*I6oiG;0v^i;cURmJ6Nn;4d?M?+VyZDUjcU+
zV{1CTTDuu8;p?>9;eC9gHXW|yTeLgj8a@l#Pl6BmcI{r@2DX;rdV)=skhT~|=R39K
za4T=o*24SE<=TV5A}?qU!(F^xdmLs6ir7wu9r3jlU|8c-k3Q^+t+V(;p1t7^AM~V#
z`}wFRBRt3-^<;;S^T#}SfG0hL;ZxXVhR+hbz@PDygf9W^3+VXsp33kQ{-UQQd?n^n
z_{*O9@HPIbrzw1czwT)b-{Nn2_J=3=+n%nzDq=;A{IrMhbNn5T7@pzpc^u)p{Gvw-
z&zohBFHpxXdqRO~e$CVGAB}k^jN$ucKX^kp_Y8&(#UeL0)pI<&!awkw3a^`y=WKYB
zf9Saoe#AfaTnatri#=Dud4k?^EnFh(_1uUcA=Ps$Vh}PslaUl5+cV=I67sZ@k+j{V
z35A}!kxZe)Gatzj_Id6{@`XyzN~B1r@vKKmg?i6sq+Dq7Jc?8atzHzV74~}#kp`j5
zn-Xaj7;hS&=*^6z#nIu-@h=IQH$T!Q_`F5okPz~gMmmIkZ+XNl40@{~oN(M*8<B-m
z-iC-Gob@(Gym)*ge&K@I8;J;)yls&IA=?W!U%28mM}}f!CtUM#krToVuN)i{Zh4i+
zX<^dq1)TBvBj<#>aO<!z?;QxGn;X2a!V>O#PXMlXPe(?Cb?>>zsIciB27KhHjEq^4
zccgE@YVeNw4b~LzSY+Iq<{ghrSTns7e%YGior+9Z^S!f?S!<DZE;45=wa@fbS<Af(
z;Yw?jcPX*}IaeWPt#>1`6nl!b!Mhb%wKjXVgEQ8)gSyCu*&r19x2zopjgc*L)j?Ar
z*J?hP9@(~X2eS_Atn$I!!$zwj_zoM*Z3n?STfGO14?`?>uq@DS^&hM_oNkRAtUjD&
z9XMEbIM+IKu<>vKUv{viZv*4u;@GO+dg5UF;WF##gPn&faP)Gx8b>LI>+mkh;l|jE
zBRd#J<2w~k&W;=Ly#lf`aJa>K?w}=Hi2eKFcDx2Z+=<zE*b>_vBKF~-byzzI<&#Lv
zI&x45+~Erj+5>muyEkNC#X4$t92WQo2i3#&*s9+;cJR<)6_0hW73t8c)CTpx5u$!e
zZ6O`?i25DUQ@__y$e`P!Goa^m&+Agq^SV^sOXvmNE4mEyqApAK8hS~Wr+X8<tSi)g
z1ihmBj_y0?)ue``FQUvn^?Sa6vi2x@Tqt|bPxt&3<(NRwhfKMqcTldW#ng$KO$SVe
z(br7xntp;prrV}9^aInnX$#$j+WZC5lmC8TLNB7{&`YQdJ&)Rv1$`MY=v$~0{d+Ws
zB4`Nx97WMTp<kd$^e@z2^gpN+>IL+B>P6}milXp0PDcFRB=suQLGe^JWu>B25!Fu}
zry8h})OV<_QvZee8TGZK?<akq()6m{O?mV|eTX`!@7I5e^6B5zze@%6-`4*v)u;ci
z{_m)Wenfwn`lkM>{zueN{ZI5ip@#JTUH|vgyX1S!<N8_s&#7<g7xatN-|Cn3zoJg-
zf35#Dbyojd{l8NG5x+%8o!|5Fo|mcryyxe89#g}HJ%)Vhj^QoCThylEBZd;{w}wv}
zDyiS$FFdH<8$NHS)9DTMhIe!Z!<P-6I+NiW21b`{;0%f`*WfnzbssZ)(=e#}v|-3_
zTK5^lcMTVGjrhw6U8`Zl@Q=E$7^V%=I;UaAFsD-t^M-j{k73cUsPh?qX;{_u;`gz1
zeTLr{Hg!?Mmf^9kKN%&zpnEqtE%_DQcavXDepB~%$py)u(p^seT(YbiPj)3A)2$`{
zP4eF)r6hkl`E=3?$=^#JNqQOoU)7{nlYf|eEh#&BEcvHNIrx2`q=MvsN}fyFmwYdI
zIjJJ~SIPgH^tQ3UcqOUL_?ze6Ncx58cc$Oz@%M5v>IY0gIrwYi+RJe@f)=9!+0yV(
z7U~+lc(<-pw#%w~s9{kK%F#NxWk|j%Uzcyzjmfv=Y59(PPhOOloAx&4$w!+s`Pkcy
zZwvBC`HXyCz9?UA%6q#GSfvM!ZxD|E7E#pisox_VFgg__!L`@Po2W?lJ>B;ZrTf0_
z`*7{D?guDI_ha3U(H`<9szLWN-OrGbd<piP?&rEYXfJsOHHEx=`hxBkx?dm@dGGW^
z-LG}OhS`DNEKN(IlBk{k8oVdTfHKIJW0^@ANg3$Rk}{JrQ5Jbq^tGhIq(byM`6?_Y
zsVb=o<&tl}-bkuR`W(t5U&Z8;cQ!r@^-rPFNY7zJ(k<kZZb_5UjC5C;m+ng|(z>)M
zJ(7{MDI4SzIZe(anj`1SMbf%lDwoSua;@ATH_L5uhisNPS(X*qEBob$JRlFrC*;%e
zIikbzh&(Ef$>Z{bJSES{bGyaXE-%PS@~XVCBU|#eO=mOi5|b_6mSxL@)Pg6Z(^dd^
zi*04L3VAyrwrX3Qtq~AQi_Ic$*aXOHw^?ke?T{^Ki`tGtnFY3Ewv)Cqw)4<_>Nz=X
zHSjm<FOfdU1W7{KAbM1Q-a>m&F-S7{IEWEdf;@++K=z_)kmpeydYimm{w1(F_`Ay&
z(O;nsl#04Q(!lO8=w&2;q$3wn$rqQ0$hVdO@<w?WWr00<7rlnQ4e~nr4<I?{KcNfg
zL+E=TZ=ey7JoF=wH_?9u$wxm1`7ru#=oZv(5~Ps)H|&p~IrPt{2>nlxV)QQ{A49(c
zDM1g=Z=miE(f>i8K#xH_N$IF$w2#^gwy%=BKmO-n`%+OAd4If`%AwvspQYZU-b9}#
z-+R`9HEbbYbatS6>g$vleSxx27W5_ZE_oCAy7L`MrexGibyMBw%aohaPz%*dg-|Ql
z&M4Xs*7KX_uc&WPN72`)A!-PHoqXNd1(x(0IzWw4V~D24sh=W-nxOt3anwIh|9}MQ
zAE_y1CH$4Z3KSyQSY#|h-Q?{&hw&4}GUOz0<hhKM#!93btBuvjO}-@3$oqF5quKZk
zbkIl}Y19K_yNtF7_sj8LmJmll$3VwHCqSp-IvdwH&;|6D;+QxtPKZ<DtT-nwh)d$C
zxFK!<Zc94J2x5}br7S5|Dv*k$GO0qUmg=NNsYPm+IwgxFNOnn;4oN{NDjk)MNhhT<
z(s_`J(q-wYbX~eB-Ik`MJJLOAQCgPPqzBSN>9O1|>*c+2s+=Kb%XxC4Tq5t2E9DxH
zdbvq%mG^^m$&4(@4q20ZatLHlJ}#fa|3P$*Wsen{;MYxE<nJrI{cpzacR*ewEKemY
ze~GaCWy11w!tz%L%QFbeGYQMH2+Ln1EYBt^&mk=T5Mg;PVR;^5`J060`Gn;kCM+)?
zEPsoz{3B=zq=<0+qlD|lgzF`Q>mMgvFC|?61mXH83D-XbT-Tv;!tYNLe*Zh_bt(sV
zkC|OXnEe^T>}v9T#AgYcYY3Y^N7!6T*!+25b1%$+02P2afWMOX0%3ClVRIw(*VJFb
ze88M;BHu^6L)hF**xW)*0gqd$X=)mM#rRd@S5cd>!`OkoYP1@y_<QZtVCn#jQwsF`
z^N4bG0q@E{D?qD3>*8sRpe>;7pq)5P|B|@RSuV}~X(f5)lsjUjvr4RS)=CRcDozvY
zoek0w=&EuLtJvghmNx#h;yhxjvrXLZ?2xwJQ~a~o<ups%PP4327DdL%$;LmeB#%j1
z7DcBlr#ofQ;Z%rfPOqE=nyc8!d4aMf7b_1$pVKdfoDsR~JtaR^C=bPc=YU-OK3b<d
zmK$AqanLzL^tkhcc*=QNZiy*A$8D3_-&4}ov(9sJr}G>>620IYmM!nEIG=dQIU-(h
zj>^K5DqeGrCCc5c;tl7xY=2M1Th56OqK+=@lsM^}5@(#Ve^Bo_=dd~$?JQPt-nsCB
z^uBXRTyd^GqwCHMtPasNC~i8pK8QLT+6D2Ea~rFK6dh56V#Mmuv`hHdZFgUk%!)-y
zQA|>rlK!+heA*Q$Q^|Tib%eBQq>LOTmuS9HAQdUaPiU!9_KcP*6;EiDQvHFoR;iO3
zl*VVYS!t2ll=f$|L+N}%al1c&%BpLx#3_O#EA~IAa7{ksN|l4I43bY#)DNUy<&fl8
zg3l-}FA93pl`Tb-=m$~xm@7{jP>$~CkaBEC@%59SXIzEk{Jg6~zUbN~ols7a`aY%d
zWmhFRKdqd3Kb5b#YUJy#`X|abr<{L6hn0)ZD8Bcmt4SJBF3Y!Ft>ifE+W&{@^G9`5
zxhjn**PqdG<)$>D+<r!<l<D0n-*I&%Z2N9ooUoNUHtwE_*|9;3E^)UV+2P6VaZQX%
zqK}qcj-9?;b7{NlL~Kbv*c6!ij=`7>a2$`#Uox*wDOtpC0N)*FK`&@G5`1=Cf+IW!
zUzZ=aeDXtANPg_<x9MGj*uU8Jx{l*?+%{XP>y$0Sbr#!KTej;0wv)Cz*Ckt_>k9S(
zwi4GhY}-kjf$z3`t{VyaY^!wLvemdIal38xt{L1HwkFqIY&&t8L|a|+w*9XAwl3F-
zjd88pMAxRx;d*4#RD}DN_#oVu(1$i3_{q3GvxUI_#QWLS59JK123!WYc3e%dol?`V
z-`VlCyT)jzOssLAlJEt%PS`iv&Z?QDZWq*?#5j}u*!IKN6Tb{TkF@QQnvc_LSHOo?
z#uWQ&+`ms~Q_L@8e?|N;JSX9c@sTL-hqUb}A4{GEYP;t2W5qr;;a3xCyWt$dy5l3U
z&y4%V-M;b(9~t+P3ANn<KL}$?d>{Bj+oW?0E8#8nb2nnTW3xbQGinhz-c?I&^J+Os
z2m4{WuU6Ss)LPrR+F;vMn@KvJ!`PSLc?<Ih&nMd>wat#y4s46;2GvaFMPeSr?IPh8
zl!NV@Jq2u2e2$Ru0=tFpPuRd`=GIg5YRA4L>JMdLyJ=5TIXvd}OjX{wALobpX3tR-
zd%o($^|lwOetW4JAvPSh2ijmSR|o7>>JYi_{p?S?UiR9!9Z$^h=&n5RI+M0OWsi5-
zgY9ra|Jbwsh&{D8#E;F&W!$D6ecDd$ww2gECT!$xn+SPHIXDl#27O>}b2T}xXtx~K
zv`MTuw!z~~;+h-U3`ql<WuJAM9JjQ)iTK4asm<FX>Nv4u5O+9cwEHBc0GnnXQCIDL
zbu1ARIPPjI#HL~YkK+bx>*lp}?4up`wN1y0_Q*c2ZadaJ$g$}$I39UYoXC^rG<Y(d
zDV`h>lf~^NxfjP~@wkb^Jc$?umlZDqUvs8;^0B?d_1cN;5^>n(dp4M8B8G{_9=I$V
zk2y0vML6DZ=6FgI_mlk49`ZBv7nvu}HfO%4oTTBnF`@A|@d<r87L3P!PiVYfpN{Kx
z6vtu-eLAj7#AtsYHggum^nGJ99DD7Kzi`Zzh`V;hSh%j_UYsZ1C(q0+n1A*T^~5vt
z%B-GF#8`GtJ!hBIVQh;M`Rs~1f_ZKCs-uKY2|IxE;5i5UBgX-C!ak%<k@FMkto^h)
zhjsUSv!7EJu#K<}t4oBRqw2;paSW(^Ox+?1e8fuTA+}o+ZXG#Jxs8cAi06cT&Yf;w
zaA)nzKWu}R+_}Wothx*A8}4HJmb(o5o`mA|;x!hTf6!+8w!5NR=dLDYgZ=0>y6Z?g
z<N3Qy?#AwPcT0DcyS+Qt-HB~ccY)i|UF;UR%iQ+v3b%^=AMt_MK0!>-UF|;9UFQyB
zd)(dVj*>MC_`>cM_tEZl_p$Cy_etDe-4^$mZoz#XuM@ge_hsBq-G|&)asKY0`+9fO
zeY5+h`!<dNvHefj=I&$e>F$&6J4B&8>}N2yyU)1q;ePKv?_TV_=w8M=>%Qz>BQ^}i
zvHPm~0l9YF{g9M#)BPCR^6uN3zI$5R+kHn%?Y^gFbT4Yz-OE~D_nKDN{Xi?h{oDOe
z+ehpeZvSJg5+5CUtp@YKu~(~iq-sr$46W6Xt?l3855(X&{&3`JU5-MHag=BxNsrrH
z$3D&BsMItvelSlQHJZ;+uZ0{<TK^7L6YGG4kAibYt2T)9IQDDDaT?Mg$`myXA@u)$
zgYy5-Z&23IgZF+9z-%Fz&0=%e0=AefV=LI|Ha%O%HnJ^%?QAEr6%#fWD{El|W;-UV
zomJUGY><tzN7-ZSN%jnTp1sIkZX0B;ve(;gus7M;>@;hN%Q2#N*n4bxTo&17c8z_&
zK4c$rdTuY5%4KlbTpm}*m2msGO0I^hXLSjI98Fv+x1Z}`jWOXEPUIYrPa`$NHNrXZ
zHT*ny0^a9;;UaMRi~lD3pzpxk_%g^#WFIt*?1R2c_Ca4E`=A*}Kq7h-$sk!|H}o~K
z8~Qrg4gC<=4b3IHp>L4g&^O6$Xg=8u{V>@LeT(dd7Lwi2kC5HaBC;F$QL-Ca40CP@
zm7r;mPmtZvGO`=`NwORIDY6?{j(&-Lg+7h`7f2=96a90tC;Az(C;C~kCt5@HL_bIN
zL_bgEP&ufM?1{cj_C)K+p6D0Io@fKv6a6CD6Ky1WqF*9=qD@pU6+q2YAJvEcg6xU5
zl0DI{kUi0_lHJgDnCU-6e@Xp_`Vs0Nd!k<>d!k<_d!h&Q1Nv_vGx<w{7X3B-j}Qa%
zdIE9!75%S}0P}YX`jSFk<bRK=G_;t$OV88y=@oh%WRrdb9dBS#m^3Dn$sxiqGNUkF
z#t#x<2ACn{1aq1>#|$$gMDm#;rj#jXs)*DQY+#xJ+n5f<j0N*X_fg=@$Kt$6BhSZ-
z(ZLwK1?<6G(L?{20edi441_Dmz?8RvLwL{pIbcTza0v6`dBTqrvQPd3v`YdeVK$@!
z7v2RvVIHIb3oZbkFay%j4?tca49Fl1cooL~7VsADhyNLj`9A~0@Tk8=Mm?MCfxiwT
zzlw6m80S){Fvf3?(aj^{`X(9CeChyYMjs|4`WB3+0~JyljA#+r<^Cvj6h^U_jN-@0
zD3*{t?T=GGq^_Y-L>bGCRlCNcNdFRD^t3RCm>^wBmnVcNpsV6i`?N4ox`A$v33HTg
zquZEckn)r;Cz&&J2Z$Mj!@`^=N10X<!d#@i&kA#yl;EZPaf!r)xk?Yv1I%^6p=X7;
zNuQuk@5nivH!j0*8F^Z)Bi2!Rlpae6w?U7`W#VbEj?q(zXIaPTS&(y(@|3Xq=q-8<
z1V3*9i%`gz;J8HYAi}OcTRJ@<)RPh>5<N?=#>6^7Z_pdoVZg0t#X1G;+uo68Mz<rz
znEZjBy3I^8CMJDH?l4(I?!DI&31JqQT&5r)WW<-5;-_Q{^Z`>wqypq&JpD1N2dv&{
zQ8Vpl_cC?Q3Y*F_vKdSZ(~gDBW;$cS<}r3iu`m|4kg@D4XGho)dX*7&#LlP*iS<6c
z03!nYW-AGI8`v7Q9&?Os!ac#ZVt%mu;jD|awwPsDk<>^eZH<+LnKDa8Y05fhT_9{&
zBE7##dU}m@z}zW?{wt@eSdAWLebCD$HU#zSX9t1HX7)Jn-OHY0bnIE+@{pyUy}(q!
za|eMZ5y*Rqy~19jx7i!)Ep`%WjQKyr&X87`VZ?6J5%w<RXlCcx`%o61wWLKPXKb$E
z(T&h;><ZL>9ooDJ=>w3j9A@1B`v}fT*)9&zRh)rK;nKKFE{Dq}bGa=hToG4_=POsv
zRe{uU4KQ2f*o@_xxi<1NcoNq^M>sQ{lN^)`Ib}}ayquql&?mTo`252&aF`q7WZ;^K
z@GT3jo?sf83hp$M#hv4Zxe=z58-?+O(P9U=F>aijfSxO6>Yz?1m}zc`n}ytSAPY<x
zH%1tUTS>+W=K}%8;9laEKv$VMn8QQd7No1(Hm~Crcq0$b=F|BsK9?`xi=nSYz6|r7
zui&fsI=+!_!5oL4C1cLFvxUI63ciz*c?*cZ+nH)Ca82dQ_(RZwMy4IE1es1gig^lX
z;g9mi_>=q@E}uV7uksfefxirVwBxq%SNZGkObakn#qG0h(4*EZfdR`fZrz6QS%CJ`
z0bhnhooECrFf5wDQcQ3s_%$(|&ZNgIlYEPqCFbIJ4V;1A5(~s)v5XrQE5vGGD7GA1
zV9jvv3|q&nW3e3ckT4_GL0Q<|5eAc-F)ITMhO%bGM*4)<LXU{;^aP`Wd}C0b4Y3o>
zM)^+BLXWeRe3?)w3Zh+9i4}}nGVzd=SXREmI%}O3gJKly+)~WeiATj_<Z19E@g%(}
zp25}(a6>#Jo)<5QmxT-wEIQvAvs}b>8Lg}0b@3?EDc;02;cvn?VeWMDxA|$zpO~<N
zQ11!;4z@}BJs9IfrUgc48F*C;_Mn(wgT6k`KVYi)hx}u@S<nk$cjzS{RmiYRS|){T
z7{T+vn+@74<OzjBiLg(o6lxd?w}AVd9%U+odZCHl5L$)(+>+1*vu6MdjL6vG`3vA5
zmxUUb!6vX-7P#UNG{MJY2_d0h7!;1ve#R)A63z-2=vm>Ca7DPrn1mZps!6ydOww~u
z@`5k}buvL+?h5n5ePKmd7dD}eQ&6^%-hiBstjKDh2dpVzZPH*aH3M%l;htjPN`*BC
z_`7D!r$>ZBevMya2l+*Mft$4!Sxc??&<{FmxwXn#%S~Autj*RoYlqcr<*W@%BOGOJ
z%c@wtklQZ|^3&Fcb%0T=|ChaUfv&o^_W$qvUWo`t#1tV!8dF6CK@3O%xu6t*<UDv3
zDn%-hA{P;DiWCvK2#7XPMPk7s@+zemV?abi9s!Z^k|M7nAR<kHA_WWzDI$fFyZ2{)
zftbcsyYBk0^<V4Wb=Ll#&7M7bX7=paGxIwqhba%f^x(_~=M9<sV8H`}I88PCzZ>TN
zq8sKAE0=FI)Ybn@Ca1s(-EUhdIrHUJD6hrxS|+dJoZ^1+%2}7QZos|)`*JqRYg^7X
zWV_|HUtTIJk=IdqNxi<MeM@sr%IkEF-eK)<JltK{sLhSW;znb0mvK1ry4+<vZZSS*
z-Xv}@$=qVD;O25AH<y~6dBdD}uj0&Gn>)+Z+*Yo!LbV6geuOdcy~CVVRwySb@9pHh
zmwR=$1vy>54P|G`dy&wRZ=mIdvilqAw&q*XdZ8`V&<;b{BjkP0x1;QF@_rzoLjfH#
zly%U&XHOTJEqtj^**{Zup}e2}*Oa|Xs95Y;0r^5TzfO#;3T4%)LiPR)YAAF=HPoaU
zyGkvpQ0s3;aq)Xrd!bI>K-~<*_ZZ5Ge@p5u)UO)KF%*}1%NqRcC@yoGH7uav0gW;g
zKW5%zCGx&YXspo0f2O#^K-Q#xP4Tk8#*~0w5}H|s=2b%lRcO(_K}&^JRzqurHdaGh
ztI)1*N7+Z^eP0giqgs-(?2|&L1HB{Pl8y^~UJdOFY$oLW?6;$cEAJOBQ?`AXF6q<`
z-s=jbNm=scwklI3BWOSTuPM@4sQK6HXQx!5)M`lUwvzXBd2biI-&uWszKOCM3Eff+
zWeT;ahB|zmB3&xw@5=eMB3}l1EE3JRlwTsfa=xBFD&nN_yaxWh<fFcoe0$&5`*bO%
zRL%hvYdBb2x@T0xeD``)V*0Epj|>tTDwJz-gzhhqF-ERqicAp7Gx;?~=S%WrWU9~%
zvj;@x2KoKGiT8;3q#{2@WKWAMG2@La5BwlDHD9VtNbSmeMkFsr#CH+tLqzQ!3;Y}S
zQ|q!%R_tMiD*0LLL@i@O%c8PhL~929sya_q=IGL%QZY|?W^atvF@1~Ht5pBI84Lej
z66}5Q?`Xq7R#b_WS3_0z<sH?~o=W>`r9Ji%l~$rE`|G9s@_({VHq80@Uf3k(@*Y@e
zzq^#Pgrc2-ciFe1@nDTf{)oz+6qP+FD*I1#a3Gg8tosM+wu@O$QQ2pr!%Z%VjtbUA
zunsEnqQ;im(L6_fs#qg=CU0r|OF6LW+N!dyD)P&vwUrn2e_}A_lP=dsU)Ebx_NM4d
zrr%N7lcKr@9Sm}kjzRXI=%S$Bzsf;X^LTYS6@9KIA73g<zD(bk>;6?fP5(OmQ=Y7x
zD>EzQ$V&MT-Sj}Gjq!oz=<X6f-J2NFvyINTo@u(b2xJl~vChJ49t+Md-Q{c)JsadP
znbYnXTN6z_i>dF`U%{E6d$%B$>Hhyxkmof1WRLDH_PUF`nDn>1#7%dx*InYQyV{f(
z>pnKfpB4FXQE)b^I5!QyoJX-0*r1OSO}?aGRr7F_9DL~vQ;~;toN7zUB&NDgsZ@Vb
zFrL=NU!1vQY~5c9WbG?aw`!>BSum#>3jA92Tz84aR-&rsx{5Q}|Kx18H0SH*ueCXs
z&s~+zSUEx6m6d!_b#2M|i*C*Nr*#$Gm2>Hg71ecp(Bz^^`Ra&?>*(<yo+{+3b2f-S
zeV4ov3)Wps)>tfTsPcM?rI;KMOEvKsYgElSMksbmK$(WfgKED`@ZRC_8q_rrlf5q1
zqtf~lz1WY;dW*>(9UCY##Oys4L_R7se$p|=9uXQ5*p_j`MhD{z^kZ_Ti#;RsoY0Fo
z<f7PgA-TiF<_i^?{ZENJDQAM%VxeUwAIFMKoa$Z}TPL(xXj|Z~-9r1#Jt20~_$hYM
z=*ML5h@BHkm@}Z}|BBokmpvd}!;sozpOM&(*EW8S%NaDDX5uWKVf?7`7;hYuH4kjG
zGPyi1XT-Ri596A9<UAIab68x?RdG2_#RnPx#)k%ZOvWG24c<qZc!<kcC@yE9xa<+}
z$(Q3tbGM!c;&K*>%ULKsH;@N8H!kOac#*NM>m)8dipv=wzNS*FR@?(B_9)@w>y1u)
zi;?NKF>0-=wc-E1jq$(fS()3!U9X?0+FPtQEGsEpsFBbuLYYEsggR7u?<|xp)I;b;
zLj8pX3Jno@L}-N2XrXaJ&j>vy^rFyoq1i(7g$e^&EVN9hSZJNlW}$6DyM^`(l?WXT
z+C2FU@27>%1*G~3%Sv{QJh_HY?VxWbgLfH2a$TV`p$sFx^xjygx#??itIK0fZYOl7
zP#2-7v6I}(w3pmh=suxAfqbY?uFyzhH+hWE1fe{k$wE_wW(ds{S|C&uyf3NrUZKBS
zXie}wH>kI!Qu%tJEkZkl_6QviI&}G6{gZr5sPx}>ukb~g@#DWs%t2-9VMy&WUnPO=
zu_|=l{I~NJ?-lhc>|Wsu)ePoa<X6<W{C-6}p@!BQk#3QAq<5rWBquT$8WtHI85J2D
znHZTAnG$&^GBYwSQV>}bSsGaxSsU3H*&5j;uYHk&kt31gk<TM%BNw7}G&vfMrbJVr
z^k^ggeT%#@qiuvbL_0^bqdlTOicE?2j}$})Mu$Wnfks3}N5@5<i9Q#7F*-duJ32pF
z7+oy?UKTBmu8VGtZj0`Y?vIv4k48^MPe;#16EQbdBUU?BH<lL5h&7HikF|=mi`^OP
z5{qiN$a}^5#_o#^iVcnB#zw}*#3n#_vB|Ni)zXaE+}MK1uvk&>S|YFN_~r6i6I&nK
zBK(fvwI_A}IwY@S@+ys$#m>jAxDVBg*O5J;ezm{2U<JRpkjyVG)ZiBvYVwN<VSaI;
zHov$~hhJPs;TIR`@{0@C@>>h_!|CDMtOnth;m%gG@K3`-td8Mf;h$S~hew1Tw<6)O
z;U}%0;fdj=tsjQ-!>g=5;nm?a*0}KI@Cj=IzrL{G|Kpy8LUls9d|zdy{%`UbnE^>=
z$><PJ=YX;+@k+94GJX{Bs@Ff@2L?1GAj!HJl65mA>t;yS&3GozeJ;>f*^8C_Jw14z
z9nkzrZ59UqUTihZ-XgCZ*?Y1NWFL~(G4N1!sl3Xv&qu6?AE{|x*;^uYBK0B-BR7b&
ziO3s9T0~lBABwb(bdoj6s?08vbs3N3$Xs2;FKt--#)i#rY&hY}aHi#k?+$mdyl_@H
zW+m}^8#TiN!UJU<9t=NZT^Al6e$+}2j}HHrl@Xp0e#&YTE)Ex4jl=JT-?eTIpA4V0
zn*2Yu9!lKn{y?ywL`NubC43F|+u(15Z|QauY~_tWJ`nkD;GYxB^xlN;25;)#2H%6a
zw+c2#-pYLhJ6bmr`~BUPir&5Y-6wCf@F{L5Mei4aOVF94?OuexxSBppK>iSXOZb-X
ztxRtN91Z>)9kF@wFK$!8l3+wlDZQ6oF;cfJ<9d*u4^iIP2cJx-MXlG6JWSmPlB?i5
zQui^eT}4kif*v}p(HSh*0=@-&rrQqrQ25qZKn}K~G?UWryLTvh57Co$Xe!^*IgO6-
z!*f_G#+uH>t!@EwgKfZG=nTT<GHi-B?4H`9-OS4oeA<)MZI6WS2=4QS3yxKtkPS{l
zpMKhwqB9uHY2Hr-7hv;hm4x;xA38&6KKgf~{{nmp*3N(>=%080D017~AlS>@t*s~@
zI)!AgyH~Ki>R4wLL)qY0;A%z3K{E?GeN;bm7z+jP3n}f2emVM0;E#gqz_-yq;r>iC
zquw8cKc{hP``)|4CsZC9=xq@^<Vqyisj6e0@Wu=NT`_btkO(@N*hvM`sQUnz4_@n*
zQ+mJf+ZF8t@WtQ`aGKj+=g2<+E&~g}Jg`(Tsh+ge%FG~|Py6~CDg)s=g0%uH1oJ?x
zdo7ylDP4?YACj?1=9+m|UT_&$2<CyMpz3F#Ifh!6LGNXa3ixUCd;$DQa6up`1oOaB
zu(OP1t(Qj>4N;`^z2F(@<_EY8EClmFwUCcDUQiupI`{&8J_D9ur$qZ4I^k^<ECWw?
zv%p+%07xWwC&j|G%-1<IOTgm+jwc4zX>1@#U?G8p1QrrR#UV-$AvqLCRth?d<$7;D
zF|!rS1qXmT6^Vor6JyFtq^Y%?@F(O<uYC}S-7Y{Gi@iys)J{W^h9r&Eou-xrH-IaN
z%pw`%XcM1Uo&x3vb#+}lO3YuY{}Z>gH4DrI2Y?&Elc3r>K-~j)V+YoDz)ypp6QFu;
z3;pbwq}FN~G?50sz@WE^nayP$we$@93I9LT!v?ii781GBDyAae!OZSJvV=9#2uvpn
z-9~<U+HWlQB=S$mMP=|W>JAe6D><!}$%m>Ls^R^o;5B5kiQu((V-Y&Huogaq?+F$J
zIZ*bO5ccgPMSrKxzW*!rhDR=SbJQpPa^?M7ly{C1+Xci)Cegc<TFbp9TGw4fue5Zi
z$>+Xi&Zdg8Hg~h8cC(Ijb=(M_Pw8KSF+NIv22=NS^k2u%WEk(5`bSZ6wW67?X1?r?
z_DJ+=!LLVuY>+()$pd-1JFo@^BcBFd4c<sAd%^OgZaSMuMYOA>ck3+6%2hrkK(%>$
zkdtQ%UyqpFi6kA|?>Er-CEv~=p6?-VFV!sS?II?3sHWGL*h^N4%$>$?istjsR<nxb
zt}HtO^Ri1lEO}rhF}9LbX>bGUWhJ<Q9Yk|=s@4^MzDup~tf2A4p|<i1?Uj?IrB0T~
zDz2PWLf5gzJ5g)7|0VfB_1kD~C6ZNgR9Cu0_=mc)(DTe7|BOddTbU2$gX{d8l=q)!
z<#yLyhLwAO(&7Nk3RS+h-;~mmNKVp!^>Ax{r{+3(J4<)N&<E^^sr2?D@;CAMb@1to
z<&d6~>>jS#-{yt|kLl^ruCFNbHJ+@Zvo?b<&epZ&+{Qk5v!avkio|{&%den$2Frhj
z|1*47>~B@i3##R8`uvEZwHW?5c#l`Z(sPB{zk``-f`x5n|J8GpoD=RR`)Pzv;`DOd
z9m*&)-o8ia40^tkwpL>;RY&UnNxk9W6ZcEzcckv`PF=O;e5ky82YeUCTgoWDBv*XN
zN=&7nWP7_<d+4-c>{kT6)t+R)-{Q*IHMEIQzvIejFmxw<+e}@e-EQX&(=|@5-g*YM
zdpn;APId1V%N>}<rS2e&Px}?&cVq3-0M*(Y?A(scOxG1WtSC`;ou#KT@qm+rA6})_
zaBvDSaLiTjEy6-$@Hg0r!~cS@&%)207~@cCeF`1`A5@RYDN9E&K~Js1U!hUK9`rEQ
z2GY;o*zvuWGy??DsH0m&f6n_iD0*kW55d*QGr(-k3FgeHnsy7Yowt{LMENe}6pEdl
z;3n*B!p<h_e2o5J^arcN&PCE5N!*-TmAB4nPXs>%S0m4WSN&S%WJ@cn4RVgk!cLaf
zwfn&LF}&IlT#Y;fybehk@xyron}@Mh1{Qz|!6H|8z^>Trip{Rr?265<*zAgzx~gTT
ziE27c@W47Otb>0W{%swFcy%`YSA5HiS9$5%Z0yXY))_3Qqz(Q2GQcgYEbT)t?^af;
z&dzvOPn&Cl+kv*S!QI1}<#wQ&mO05E;cQ=@TR=DPeXP~T{#kGjSVY~vU>E8xNB$|e
z)s-mqGjv>jp`trfb=+;<3R&s2<q!9CpX!gVa`C`d=!ey-ej)4nHhR00I5`YX3GOL-
znVpTq<X&cNFP<Sjb(M(+8tF`V*9y+$PGqCm5Y3(N6KTD@i8torHI+CHGvy%9LjN%M
zC0GD%<Ro9<AEgzQl<U06{3?GmK;{~LxrrQ=NF-!4gW1fu+@r0D&Y$F78TYPK^yS`q
zal5X>i`zY!W#<)6N?X(?Zd=`V+y|6*-UNS*yg<)m?i{ir`f@hbbJ2IH8{u5^5Ss7l
zebe2fS=!<L==9P1vRj+dx2W}d#iRzv>tf*oScHBKaWYot!g*VL>l}pd3bs^q({+yA
z3}Pb>{db9|j}*Q6=(Hp%QdQG=mYMwweki3kQ(BK`>5K()S1kg&(N;%J1#{Q|yQs&U
z!|Wh=<i_>v8XNU|;>_hlI+;Ca9_Pa`NS@|YQ;MB6;BN3k@FVOT0k?sVu{V9hKD%7e
zdk=mrxPt6@mObc~dVX`8a*F#X;Fp1gU>;birF!%7j^qC<G;4xw{T~aO8yPo5XGMU8
zU>>Lz-b8a0wcb{Vph|kE#Ji8E=+9aAF)#w=axxsw`LhdmQckdLBjRTjZK3JpX)bk<
z2<j+~Vfn@Ymw|;~9w^qloAplWze~()^6E;hE3q~g{3*!&)J<XRkHaqua2Z$#=25ES
zdR%?wu2w$jI{#&`uiB@U8=(4!JAzZBesDT+>KhZ_EHD=w01``ja`S62?>|Aai{?h}
zMsJhwr@Z$BUq-$sz+7+uxB<LX<BigdNH!vQR?oW58@fV6|HWClA^0EQW1KD1xd*l7
z?0O5Q;>kqRV?^v^JqOxdu>2PM<J?}C(zj2Lf1-BW2>m~(61PEsv%p+%0I2KhChe8`
zUA@ElubP<G6=5I2+Lhp3P6*>zM=6{ieop+)r1YZzXMwrk0B{33;!pQ_ZN+;}pDPNg
z&I)R+fKR~#XYta0uaLFSP)iMRV?C-jR`-}nyw%zhcNzRCc$YYF(I4ZTU=|GO?RC4}
zUd0<472Di#^i3nyf?q+;S5RuQXb130FGbMO`SR-fJ%yjAa_@Q~`4jN*AeU<9DJGZZ
zqcaKp*I03%v5u}JqqZRf&wyX08Cc><YI$p!myOuu_UgQd<%hA>6n>fBf8~a&rMYMp
zD<-ej2v4@SZT$wzJTpl$+5V5*Zt_^6$5}ydvpQ}Exg9$TvG6L^Eca=tRg9f?v~MoY
zsa$TvG8ejsKEPf0QRT%#DY%<;T?+1IwU&ar>Fry>KTnIx@zMo7130gcoi>3keOsk>
zWcPRA2t|h{C(aiC8Qt$##rg1Wqd$kb&$H6=RN@cOY~VkH{5|wbz&_wy|EywCKi%oj
z`MpL?QUmz9U`woBKz}NIuA#ewTSuQ$xwm?$#E<f}_RTw~v+ven?Tj(^P2!=*+pKY-
za`$j>r`9Kyh1_x4vaY-89+A|Ix%-frx<h%thF42U&#`B%&>OKtujYi#TAI|2on@}B
zCdpST!42f4m7wmS**r&Tiu@q*gUC-IKZ5)t<o&Vs39Xo$*BExj-QWss*DKKx`8<zx
zmT=>q#~wFb_3c0C{aUiz06hF9eb&6xR`1$=EmnOgt9y;2cycRw=2m*5xQ)^)ITu~2
zCm+d&7g-Cl$QW8W7R(E>ht}Fl-gsB<-J&xC`3(9`zVe?Gei?nf+if8DTSYe=|1V~w
zA2T~6(YYtkR7pebLGSAr-K+JE?C@0CF}JpT+}bvilb^s3Y0P^w?qx3%E%iLfMSe21
zzT|}PhTg}$p|0F4+{xtFN2vP<b)SPT(6;RF)8atpWe&IgL5%2^+;DfZB3vZT>#f{v
zN<Z&K|8rWTSKdVUF?u(5eg<!J9>zjz@F#j+l5?Z3$`$Ulx-ZbfPl<DLo4$)k=*rk{
zb&6$N{nU;>0vxUA&UF`(Idw(+KzVPJH;P`VKYOENKJQpfHuJR9c~o~XFNB@i_<sdG
z{Jq}0o&E5q^tSHY2H#b0>(1+p<tpuo#|m=qaqEj@HvJ#xoDzJ}kw|cl>!&>a?>H0Q
zueWs>o%W}J-r1c=*nBiV^<gdW9=)e~E5IM?J;M2ij@_?|2eP??AEH){-qW4$z^|dS
zjQ;$M9xg%t1)pW4t7rViwB7+6z<7s&t%I|r?yB{vwN-EK_Mi0T?)(z})aUtO2|c_Y
z%w~3;Wga6sYZASJ_Ch3S^m7Kcn0&CK;klvYv)1yt>E!FJN=oG>;@!lFdM%nyfp2?H
zlMj^-gAs#Rh+rXtg|29pa@T9B61ORmy+~%bGUFlY+BbmDa9{o*_vIgQ3v0{$?q$6z
zNhFBAJp|2<Xt6Hm&t_O_rgh!U@Z$|48L!p^S0m2=uVZHy&v-i--B4F<h0aWDw#B{~
zT?g<9y?aXk83i_*VzVhDEoG#o>MQpiwd{BHc7ac66k%bpLHsiiz8ik%=tuFDVt4NE
z`}O`VqloIWFLx*HMp;20W8n<eqPkmYuO#y*zT(~}c%M5#<fF)_+?VBkt)<U`X-GP_
z6NS%pwQtRkkZ;}Vn2XQAFBF|EirzLww;4EI>w3*;<vZBS0^5P}v6g3ig@sdKV?6vg
z_^@WYq)qC9<T(C3$Gz_%p3*<8e3H+ucfXc;H{szeXm*BwK>4IiYArdgClfEheXuR#
zGO^KyS?<id%!E(DLVI*x*XILHj{4tCWfm3_8`tYD<{Z`0ISb)y!;eMhAlQsr8DI*t
zSy$_N^K{0&`N&(M`6{#gtn&6}NZR_X!KoT=Xx7)+lsI{s9H_Vhn?>+9(&rR>_=L(u
z^BlcB#|q_>`tsMTkmZ!W55G`9%LX&cUxmLJ%+pWFgVNW)dFYRZH&_qMM{epaf?tHS
zBJdL=pTO@!N91<Ta(yD&415|K33decfnyazwxU&rBonk0ox$+azy;vd0k5AL?4|A*
zB>CXo)OrCvh0+r6JQ7;5dm%XmZ|t|H^|Rndlzs)S#+rjp7T5>*;egjZ6d+lMq$_+m
zwVJ>m1=oRZgD238!k>fp;S*pz@KB<Oo($;^`qtl7;uz1QfvGA9G3wAg;CAE(;ETZ>
z=os(yOxz_FTA8^}-tGv_2IrbtfX@Uia2mKEp{LWe#{Vh_y#bz~^aCtUH=~2U)_5MB
z1UlE7u^=g-S4ZJ@24lI4aV1OL(bOHI5)1!?o<uV<5oZ)L==ltKJA=L{<|8=+o<MTF
zqGk9rFcmyPi;ci^@HX&M+A<ONBK%*$)I=x2-tgBjvlC5Bs@z@#e@jAd-yg!aLrzrK
zNs2y^=0B$BuE4@Y^??7d^4=EadV%pw`78QqBl^EGv4OQ0>BE=ke1)%i>Z+7ARgd^r
zoMLAA;%QxRjC5pxHkdAWE!YtpOX*mwja7;GQ{`QdXM$FM>AEk#PYX~r<9K^-;Q4fB
zVK%7z-70rpptOuyx6?xt;bn{n|A+RQ8DXSzOgzI6LT4#n+GVnb`pVsn$ISXofltFT
z>(F19xKp1>!H>kwFW_e>I;>(RiwHc2-|{JCHbd7TKcPMhU7*iC-b-af$#^NDC~;+Y
z9V5O9{R%X3cz=ND)SZnT)!&ZIU6j5C;!o>yMtvNd1%6O|jo?MbSOdwM@YlhoGiyEI
zx6$)sl-5^XGDd+&?xgfla0aEb@c_~8+y-w}da8QNz8Se$5$}VqVEF+oJb>jrftQpw
znQbfUdOfT4Q)Ypz5_0kU9oR9Ma2xoy@|NmJ_$?*}qB#l8@0lD(D?5?1`rL7>olWpx
zmUoqYntfy*+2%v_m3s$#7v!bb`4XN?XkP&~1vAW=MQ0QEPLQ2$7i6ahZANFQS<xUF
zz@CbiI-s)~G<k!#wb>WMt8%AsSo@(@!Qr6E6^+5)fZSc<cP8PJor!H*GqG*kwvCC+
ziEZ1q?c|MZd*ZyYvDx$65Bn$V*)M&%pVR%IPIY~#x^-`_03S#XDn}BOH{OeNzh2%A
z5Ap62s2T7imXC9Yz~T|jY+>Z<q0if%TLUg(M%a!?|LD51?{0C+@E<IJe282>BE*6F
zDUCO+l=`5#zYmIebLc3du@g)i5)I69t)(=nA8Nc`3i=%6MT9rLH}3t2akLM4lm9a6
z5Cpezn9S9pxD)Ff2qBj=+cv0vP#1mISmk7ozJ$mUL@vpoJ>Lzk(UeIM-~Hp|ZTc%V
zD#kc!uTfV(e)O)#tvx-{n9wyJ%S>XZH4{dod@+~@Yxe5$<Sc=tSJz5Tl_Q1+?Ry=U
zz*ILvhUU+TU=S%xh_vY*W;nu<1^FrI(<EE)of2Eb9LAIzIR0qzXv<IS0!`*;*Jxkv
zV)&i0_R+?p%m?k5{L2Ve;@mIW-K#OwidlW#53^}4TNCefBe%!e#pG#nj*X<&ok0G}
ztAm<xb3ISK*4MRYXTAIIpoJkH1#LIKQggVYOQk2+Xfe^_rp~_kW9b4*d<8i8d^L8_
z+BkC@zST|qLTgl5@R}^I>`#rhykcKFs|OJ6n~xwTgNDos<}03j0;lE5nA=B=4_HjE
zw1UKQyrB#WtxR-b>=KLw-H0nmQS9J-o?{Pua96H#WpJ;;dSOd_Q9BTfX@AIm$z2rj
z++!BSrbztmt<v-EZNAW=n3Y)Hv({0oilO!5E*AmC`{bRfS3R$bBIU~9!kr!KLz$H|
zw|{f3Bt9jw)xQk5$)Z(vw1ull1?8KX=o8kaEvDE;xxvbINF70~?-r%+|D@mMqr4uh
z5{uT#_&q>uxBN*CT2`po5wOMP8e%tl(Eh`FAsH}rUmmGz)0?Nc@>%@2`Pp4#eROGl
zVGcI3{(9O{{hIGxms)U##I<14b8aDz+_mJPb}jp?ouyt+>U{#2vDgeOxq`i3F}REp
zvpkt;^4;zcTWP3PoLi|8=eM?g-|!x{dn0L|=#g4!@JXyOT|yl0&xu&xLC1%L-Ktw2
zsj1~$*p87snpqEZS@}yZ#2gU}=F$u6hvrRRM4%@{7!%q<o-rl-8l-$%AVC*&ulz4E
zpJ@yB0wY8H(!txRwwK0>M5W+E5pK(LMK!!z$E{{o-(?1Tq3>UWk?brN96xv!Dp%OQ
z@8;}lNTEU(_Wpm5v)=vX`LQni^En#1Mti_@)VBcB)7IIo8T1dd=ia`R{f$%g0YAbQ
zT#2EWH*I~Jj*q_ST}jbntTu(Y%JBDqg3EvzQp>1md9;si3$~wyNoIYBLBPm=ie#f(
zk`a>ddv*o)x^-~S)f|37em{KiRzb(W?30Ybw%73~d<c7jKA%qx1A~)n|N5bRGS_iG
z8y`lEwoB+5|Dk?HdnS4&&ZyqSPQ63adgpxid~Jl}i8*xBz!j-xv#=HP<$Wi4)_gYM
zd^g_3yEe1>V3Twu?DbE_KiWz(Xiq0S>dug$>)WyX2Q?OHR+8lA3xWXVRPUObPH*)q
zT{Ccm{F9Q|`h*@VRw*MtZ4qg#^ZrteB^NXl=WF&A@nlcy05y3ur>yG+%h&bT|I!lK
zF0>~PO@4h?gXpZQd{^xLIH0o+YqU%k)w1#ASmWMq2!!ux;NJha9v9h5J<ks;du5Ni
zz|yD+6-!OVSNHt4U#KtFiSzIPw*ky8L{tZ3&RW1Hi<Bnmm95p5fcEopy45m1-=?&#
z_qiO1-N3Z=NmC!+XA`jCO6jnQX=dmQX`P9>K0loI!uv;T$p=ldT;84atO(S$Q%I&n
z$=3PJ63Or^ri5NTEox|Mpxa;G@=nm6Ul>gN^8K}O{KDG&!C%C#SgGRYde$uD#^zi+
zIjv)eSKjtytC08$`?&(B#FZ32P$qg=uT9U)NHgTb4l}d+7Kgt}>mhU-J2cz3RVB{;
znm~k>+1Ht6%807f^NZ(pTtQv4*8ON7i@u(ACCl7Joab^ls&l<qSR^^)Y0Q`mQ!7*k
z{YaOX?5Y*qhLr(7Z4{oqzpJzK6jf;>46LF|U4@!+`jWKb_5bv6S1aI#*bL3$bh++b
zA>`y7|7G1i6ez3a1{p{_M^9=7K<59gj~;MH66<ov6Fg)xt#p>bX}XPXHcp<B<t4bB
z!3<93nb<=48x@`4Emgef&7C{E{w=BFg!_=9ly+5DyOpLpu&r12q*V0(tPh{D<=Jv_
zxefG6Skk@wj3x>I_H#k(itjn?DV{;Cs*AT+JS{w@-M2gpyv~@;W-*hPNIKIxCVSY?
zH^V-4>@l%GWf7;MG99A9zrE4;X<*fa;8JX;5B-Et@Yv-!#v-Hr3QUPQef|(QD-W4n
zqu~xIlkbef6MuRob__vtfF1hDZKt>n!tD;|fqhXr6sa{gGJsl^3Ch&?_`%8b*eD^V
z@M;~mR(})7jOPt*ZAM@R3H;%u)bZ_T&Hv+Qfm7d|IY`|1n}z7LtQm>*%=iTQYgIIE
zd>kkjfiadh@_FeCs!4oc`;w)6w4ZQ8UXl`<%uO6P?PCxkwbskvj6mJsB81Tl$24(B
z>lV#}D)Bd``||UcANruMahX*hKpXJ}=ue_*RA>Zb+uRZihyC<4c2ccqIrw{X5=Q1u
z2oHJ&^r9JrHNebZ$tn8Xl`mWajE)(<$jk?M`2*PN@t!946_!1r<G;oJFFn7*D%(_v
zy0M@u1N&cGBF)xJl9XNPlx7S4Uwp{>o3JQ%(`a$4((7XH%qs*ho1|(!-f2T&-cq{v
z__u)zYICovpQH1oBpXHd5O;Wk=`;zpo1Kv73lQc6@+b`y+0}ly=SAmF3t6W;()FrF
zU+^y`$>*3fG#}Qm$@l8_u9O{9@u`aJsq@AWpZ3rF2JBVIII`=PxI9yB{qyE%p6prC
zho`&Y;!d9}a=8U2ckhQhGr~&lY)V1iJ<H8*MA$N?(${PlQvLA;KaFe4En){jyC0y#
zyW>kSs&}<Hffl+;_2V9T4zkzpBD1fNXYT2%r=r!<Jj9lz3A3zMGVv?$Inii0a7AIq
z!~x-$6AzrKvA2IRy!PJCwwAZ^y-#ePeUn^6^x=CErx>rtt}iJ*K$~Mo%Z;muIg@5H
zKWR@E^VMMOiHg18lrmuaa^_R(?LlvMNbTTvf|Lk9`S<)RU-P_ANEaZb#3^%6RDIP*
z@{tpMsrmGOTsJ1W)$G&h*PJ;2kSFCt<j0@lr}peb+d^{0r72+1WK@CBhiMVm5i)05
z1<R5+m3U=9_a*ZFNt+{J&hl<16>3-t@wnoD7nox$A3~xTQ-qZT#-vn=TWm;nfKvIa
zzq)jr5yS^AGjdBRNT}i~a6EdNXMvx0MPd)L?7nYEC^)a}vMC1rQKKa;1(fQdN~<d%
zsg0igLsieu?gO~xxeWBfPgoicEg=FXyDJ|kKWsfDUCk+BcTUU6(mV<2A!gFgenA`^
z7w4j$T<)B1_*NesF{Koa9;3JzH>;Jg{)rBm>642b0_PKK%liZ)ryRGGst1$A;S%3F
zr{LnxQ?E{Ij&5RwnFu^J3@-rgZvcJ^jlWqemdEId|9XAnP9WAawf%!1sgL+kIPRr8
zTOMAN8!p_HEk@0HBZL2-ae^(j#hyQ77CJWw@{AMi#R5QiZMwr@kUPh{0c>4^sxqsH
zRbyP6&uxYUQ(n&y?*y{9kk|zs!WF+rJQ~vyN{X|{Q%V60dt~tDzbK4fV0|cvQDn&E
zmlP*xcZu|jw)-6>Rj^;7X;@0DU6Wdfe5K-A6_eg;?h?>XC<urlTn#-!Z~S)QFFniw
zN&&TJB(>m=qD~sKSK!OT;6^}}cmtLtJNV?(RX$m=%4wfnG_t3>bSdp}3){kFF=w;C
z6O_@A=j_UW{e|P=s2k8j%iYd<^RPLSuGloi$`TP|`RZ^A{_8WI0vx+Cb0OincN7l;
z_-$>>reas7^zx41%8HdqTIaJeNSCy;I#MPj6hLzgb!5xRG>6MCnZ|-Z)`I0%W^wkv
zocGrzaW`j;D;Ct+fvnuRQka>|Rm)`BJtDPC<)*h4S~;W2`%2bG4y+PS_qRC%Utwk3
zF$f-gjOQnuXMjt7l$XjhS^NX7MBF)V6c0NdepSWWc`C<XGjwKBa08~|)2I&A170XO
z2zrBd<~tYiw4Y{>Tp33Ih`X}%%ash~Uj%CP8G65{=&mKNz>OY2ZoK{56+ZENaWOB9
z9^y~R%o*k%%`i!ilbE=NBP2x>Q97fX;HCtLys{qJ0r^?}{KSwszagzcPG$Kwkc(-<
z_V7hm+a*|z^UGiDhzLW>*q@<pU+%pTy^(L}!nX|QDuGQoW6jA!_qW26>h!wRla^5q
zrfEL3H;{a_Uonp<Ir0WHrxuS=_W$y2GnN!=N+6i2--m;()*7C(*(a8I7oc-ksI*w)
zRutMz%gG#SeJ8l)<a?oWcY%T^te=_Pz#lUOtFQdbjc?PLTR!qa5=22H-K_#)ES4bi
zMd2)zLMI_4U<ySP0ceWxzq=(-k&uzmz)xi}B!Kf?z_{ydUd?tgNB;H|uioRYcfwqc
zTW)1n^W%53e_82UKJ0k0`|fBXksUxwa<NX}ys|Imbne8>zpFRntIQWP{gImqyxtSQ
zmj2Cb`^0ALroC^|8ynFQb8PRFV`Q)0U18)E5bDR~PCI!>d+$F9I43{Egtk%4W8WjO
z-tP+_nw-Y%ApFB8w@tr&PJQduhg^42ox^vwT0i&b88n6*Upcg%f^aRMKKX46xAhP7
zNW%KXTU2cB^i9!sCu63aqp#~UqJv_p@g)qCo}mw<{i%`I)XU6WJo>X1CZ`|sie`|L
z6ce)~=TrMVE1xfgz5{ckb8bDMem4(cdhCo#EY0N203QAR#^{S1XHM+7<oL`OZTZX%
z51!r7!-dOSE$6iU?akZw$J5vY%|1Y1&`@jl9P6<E7^OG0^uTY=9S2~~SBcWv4%OQJ
z>*<zk`5q(Hal^Zjqm=W23y`Ukky01;8F2vHK8=n?2pI7m<Vd)~bn+%2v1_As?i~=U
zb{R2L^rfpsA#lj@aLO$r7yFgvw2|27v&ZLAnssh#IpCN&R+Kg58<mdzLQ_LbWk^dn
zcKo|;89I@}H{o1o@P-$!ce0qL0OiqX8oOr12NbdoCGg`|aJXlS6=7iA^*d*O|IPEg
z1C8<X@?Ec7o0xfadVh`-3KG&g?q`D_1zM1i0?4=^tHlr5@))jbgf7VXSpqI$H>^W^
z!XNP1#N0(8LP)~RNug)6NEvgg=R$TT@+pO&&i(`pX<s7-^PEt=4cMGSa4>^+3NTc0
zF1BD(f&<jEklUfl*AQLMyoOLkKUJGuFR2u&ZL3q6g(o~+D$PUcb)>aE%}$(5GM$v7
z6%}ywf>L))E#PVc1G*FtUk83zfI7^gZwKUi%ORPB=<IrSVCIo526<l-ABzh>-Nb>D
z5VJ37qeTc~mw~C~VRS*zmTkT4sh&WB=F>U{*<Q23-D$<a*o?xQ5+g1l!JFnYsRXaj
zf@uXngXi&e!4e8^pMc%WPJE<T+Qi$|SMV7#1K6UnvAh@_qyHdd*76zPO>^WpKpcPo
zfJWNR*41jvwkDf^w#b_w2lcC~F}jPEfJ{1^1jZmnRAXO2LiPT7D=QWAvO(2Gh4OKX
zm@vRBsv3)%!PW4hf3;I~1*2Zf4`3K&j8)1sZK^Zb7C>cdX!D;51$0WU@S}~RO0nsg
z^vt^^(oPC2hg@Q!nP~w?04RWr%IebObXsaXgE5@3B>*N$m8<h%(WGi*HSNabd(RG0
zy4-Gmmd3mz>*){v(%F700QSDuA!m{^&l+sY!+nTvPjkRNumVI!ZD2h)Tkn)ddxYI^
z9B2Y2qSZs%AV!;n!J##qL`0h>{x9WK;(N7Pdq@%0eKynx?P)liYo`&b9qUd#vv
zcmR%wx-vVG1317-R3x^aiH<bZ<;Fl;q66UpG9WSPxYNaMZOSeAhV4KYv)SaH*{JnO
zx8sxnAOV<+;>5mV#t0|>?cxAnibBV(WL!3@nmMTf{94PY^An`XI6fsF>_ta?-<$oy
zmu<lMt*HqA`K2+wtD>Y6w<I0zZ!RM0I{nX%?khQs=={SO&c`filQhA~McKP`<N@)D
z>L#6WiP;#yZ!pSiY-U?-M42O?9!P$7Ew30~Dg}!CWkf?eLiSQx^_4IQvG(;V9n|p?
zRw<7B!|B~g4$R}4e`MEufI{ofP~r+9II{s`Rj{r-iY<(AQ1}IjmI~T~2sJW2M3|yc
zhz8AagzkP(8zp?Y<Z@j1p#rDgbn?WZnTJ45>QiztiK?EelB#mFD&&$hyz0S{PDbEW
z^Sh+6+LBDXZ8`}rjp*wE`Mg+!Bcbe_Pu?)9B{i4FdV0>gcUN}TXB}W{VK1)h^STc0
zG>bB(Hi2Y|Oq45_Zx1q|bm34gz9yUGL+v}8otDwXa}YbWz<FCLf$2>mdq*$6hj#SA
zNjt_D^xHD6(luc+yQp2zm=1oeB1`WIL7w3HP*PscXMN72kXI9ssLG>B??HWwAX{VF
zlXiF-5Iz=5zsTUhMI}m)Mwvi@pH0$DD(Yz-aX*Z9Ix)5&Dx8aDiWUwn9Umj<`_1Qy
zcA9jwq*P>GB(Wm19r>#QnS>FQT1*(1F~0hbG>ibV3mO#jkj!uo37WT@st!=4zqt)X
zT{%Wx2PO5St#Y3V%0X8LlF~L@JFku>1rk($2y@J?<3O-NYW$L(QrMK^sI8YssFjyW
zz#OB<8D~A(IXJwIAg133S?tfXa^*l47g08qobEuyG<8soE3ru;^MY+Y!=;DRfu;ge
z(7JQXk!MkM3S8+uc*?m0qJ-2XsZJ~;6njF=6;yOkm3!2BM}m}}aXUe5qK@n-D$NSN
z!=WT_RCznTw3~FsUHo%<`)qR?XLCEXy}LJc@i0M4N9k=0Wpk2tbAWaeK>OZp0-1Vo
zn4qgeA}9^RGG%kx(~eIf9JXX{q{^9`of`I#+=k-4Sh6&f1hR4fJM5NrLeHMPliPGA
zKW2ao_aRa=3I)9VYQ!VLiTur=Fccp19Y#Zs6mCkwLJ@}*7}5BUjfXj~=xV~P8H6@5
zS`CZbi3s{A+${PmP80dCNbyhm%$bs9z>Q6D(jfq2_e{J|scR-|X+(FUJPe8$Db0on
zQK?&o!55TX15yp2-3H0unk)lWoB{}cn;qluz|IdTIANA2W)98k1gGP?3=3gq%RI^o
zR9VO>U@Ay>lvGt!iI>P&B`HX^(|*G-FGrz^PA;2SD=yktx1i6Nc{O%Zn7J}{hGR8$
zuIcjFJ+gR3?EkCn=`RR+h*%QRsfSn-BI6>K`}NXtm%*&*i*A8_LSzT_3LY}{Mlo0W
zo(A4nW-uADrULb^5047{-+&Gk+MNMGGR~bLA9@IQ-CvjyAq|P*eu^k1OGcbRRWN`-
zlvL<&eOts(f`(G@$h?M@X&;k@nCXzChMegDjD{d@D0+Rtc$8{g#CX6|UHbUhr?g=*
z{DuK;GX91kcQPoqk$5sHx3PIL?9LFos_jM0mMXzTIIpUnH|~zwOC8~(^2-gcw>)~E
zKZgSPP(OzT`anL13OXS6niM@acbB{ZDy9#)f+A)>kqT#I7fTgxWM2b4@W7$VlYo3*
zL=}(xz(f_KbYDdkrS!mA6~=rQP8G*|kLCenx}UQGZhE-Af@XT;r2>zx54?isyCJ6%
z<{93%P;bcYMq3p*bzeypx_1PqYWn`uPl0|9Mkl{wmrAF&VjoARuwsu(r?lb#Vl&Tb
z_xEOz9w7I|Totu-UriOZ^}tmX_gl?H1^*r0Pu0bN{YtHNS8ucOY|nSIR&O}aU14dY
z&|QVKPug9Qb*R!^m383UU6&P*>8{)z(s~nPB^1~SzCmv=@oUJ(A-8Ik^d_t-l+u#0
z?{4zY<KKZtiD^+3SnniNqfp!=ty5reMVw3HREn9j{iK>1HB{Q4Nw&Xc+>Q8PnYvRo
zNJfCno4E0B3BkV#lTqo!tY*ViEF+D^u23UcX{hrCwv+IdjP(D4p(bNri19Io0+7Ok
z>lyq?dnio$j5Oew_V1o~Oa@RiKyn6^G+>wxV4p!u`WrM-s*<h!8tXr%V(9mz<3KZw
zlt;tsv0&TlSX)9iS1~q(2%-i#MkBN9P!9*`zLk26c>uycFZ_A)h<bwU1c@&qeexN4
zB7s6^2z@^lz##XCC?bUIp+pWC@F0e-jo-11u9e@>az^4gP$CX2@Zf)4Grz-0?Q`Hk
z7Tbf!5qa@KllQ}NfR_qjXN4@}(fG#X3Ug+KGw1V61J)FN_V&^5ie`o0<&pS?>kEC8
zZ^7m>(G7@RqtF3{6~I$>QOY8A_S_WUQ}>w4VkR6o7Y*K?m^OMdOt@EsXdAN6^Vw|4
z&NU&~6Pv3ewMW-`0yak1mk04pO!bGeO-#=Z=o%W}?Gaz1LfT@z#YlMs_zD>b`bn>G
z{78GC_4=u2(Q5ZtEkf9@Ay`JV-6Mfszb<RRL>jB7gpDWAWJQ(7ZE3Xv8B%bh1z%CT
zq4VBN-%UpmzRbl4*ZO+JcXkSAaDC!=UT-@pY3H`xu7Qk?Pgrk=(Ts+!c@N%Uqf~{e
zriH{Ne-jjbRtV}^$h-w3H2DHWsVDLoKmdczX|Rz7*i|7@b*Kn^4jhOpmZpdU<XL&G
z0^lyN5s03drcZf#G+tocgAL{igb6oj=MzkPWrP#oz=Vbi<a>gN96?~>h1>|d0X88^
z-1p0f8jnyoLk;G$NC~_rZTU0@fqW)wycSp!1cHS17ufgEV>!;7_9-`lxeXTtECHsm
zdGRz}5DyTN#UId}knq31CuKC8>Fpb}SnkrE#fnXz;V}bvPDb6OVFMYN`;AQsMn=hE
z<0%P9m3hTVJfdWtuwFKeO-e>aX=3Bo2uPiH#Coo=(vKOLc8m|DW8=XINcBCUqDVeE
z!DY?F%os?WzDP**y`pGtv0ebiCiNqutpudfykb3ffbk~+<_R7%-|MuOIb*Z-kx7`?
z_*@<_HIJyOM=Z?)#)f5MlkAaEh}igO0@6$#F*dKL_Y1~`IBd-UQqy0|4WIN(4NJx*
znIogFvGMo>q`mG@?`Mn+6UHX3BclYd@!bTZZ@gj%?ooL6fN|p3^g#k<aUQc6&q=(y
zw3jL4!=RB-$=G;h0@6}mv1#`x8$&TyCek<eBD}B2sH{6IP9URWzwzPs`G3X6QxlM?
z^NQ7YMCm+XweK-HP8c7yjEv&P#&;5sKJkbVxJU8aV@Y0PNq#R?NM=SOVFW;toYY_n
z$Cmb(%rX_DV<u%!kAJ(8aQKoh+_3G*Tt>PL%XW;Y$hRtItEfK9+g8|8aa&UEu6`}+
zw+h3l;8Ph~6mNp%tC1^9JWJxLo-S)V%i^k~D~mi!=%}hGt31o-sIe`}JX0eAeDF&d
zg{}%>rd@n;sjI3l307oO{H9ovYK9PQR=(^n?Wm~PRllJk2v9*>3RDBJD{t*n>sGb1
zSJ8jq16u@R{i#%GYDFCc9swS4qZ=Yo7bOF55;G%>j9(fxGhuhc;*8d*F<UIq_IBg7
z>`v(KWRIVrEp$q_(uGBTdJfy%Mt<4yQl^TL>Vsn5r2l)w<{0=f<RQ5N&pZ&#BBn(;
zAYm@jvC~s%{N<nvy+*tW#j1o`vd7%C#~uJTQ3pQ)g=GPTOB~|WJ3+Mdsz|!!RCE)d
z2!U7FrAQDrZbT$BrR22e{gA9xxm^BW6}Vo;Rz7uR-YojZ2j}2wi-}cvstCutv)5Yf
zZjoo-%b$c>YO*^29A6!iV1+L+CPTb<ViEC7yjiDOfxkT2Dp{*~nWs9o{ZZ2jV~J0E
z%3)_z<*xj(7EROKYEP|VYiY2$n<$94dAanYKMVhf!)e9~FgJYMUg$r~%sn=I%?J0_
z|N19fKtDs!zlTtUsISfwat1uUVog32v3?6I(A<6vEO94a5SbuJ2JDhPl{7JoU8l64
z%!U|~A)MFXYk?jN-<2ZpYjm2R+A%I~(8xV)4G`%)?&lnvVZ>`vno#s>)T$s=6VBy6
zEe*(XLr|L_Src*J5ZXHQmVVdOpS^LaFX*oQwyU6ChJ+X7xK2n2{e~PM;(J){Vi?C@
zSwYJ7IMaQa9O941AUZ+P_SiLjs%>J+ClH%K+V;3teY$OsPkTRkg1{t##95+(t4Pns
z)lGO`DyrylM?)IoZD{lpWlfaVWY|(`Comd{qmrvltk=X^k}plX)&zTq4@|_@BwbT<
z4>KJrz9m*BKyUMS$Pgz;Z_9Xo`z4MXL3=3WB$OUud#L3k9v^{wC~Dp&bXD1Si0UP;
z9J+W2Zzr`L{_zmsPJTM{c@f}Cf@V_WBSAgv`BR&eFLR`<^`aVAQFvCNuEM({!m3(Z
ziBVDLsSK|Gx&&zzhf@wH54XyH)$%FCs01!PS!J%*xR&WJ$~{!B7rR#ST73f<A1gu6
z^0<l+DoM}ExJrDPrLc>)zBa5bm|StaqPhWm#=Q=G92R&?2)N==g#g9__JcNu0S?P7
zCOKU5s73(iL6<{+n}rS&9j<y*C4lvy!(p(^a+Apx*Db0Oz<bbh)8}qMh7}GVWi|*o
zyX^?@Ra9S0b~gP;s41vCv+C%m$=6?Ib)nVOUYT=wpzg@=Rb5$ucNX4~U|+61{oPS+
zTatOc;sU#+G@@i7n@=13*b(R}JaQV-k$6?rdBX0Zz9l<ye$^3rRlYNCcdm!#dCu2S
zdsVu#1U&tG_2CsjTqHaZ^sFN&2t7l5VEBmt)bJ|^J(qug;#G>BFMPoINGGTqIlb`=
z$SET~9rKLHsVF@?eRbj$o>{niNcHspl%ikMKNo$)?bh(COl|vmQ}wG&Jte%4^=#M6
z_nW7Fpz7ADowI)6>el<FJ$U%tt@%~edQ$C~xn1LSqJJ;<QMvt%t>^Wu-7bAP2Yt=s
zD?&UceJ$fF@mm;qfc8<!oiBaB_EF1SIDP>4QPiJLeL(k7)n8b7(DjkuSwXxP&dx)9
zAmx7!&M7xPUGt3j6q;FLznArq-dT35YOQLm)?4OQv#X3<S~)ZCw(d56Xx(mw&;YGQ
zyl}hNb|vhr-J0=T@twJ{>$O9y<FA8U3%n$LRCp8iO8bcasQ(D(-6XJ+uOnYiycl~#
z^Qz#T%CYscovAZlZ@gH0#CwDFs^OjaoYyn(Go^3pZNXgIS&h83yUe{vex&kh?gr>t
z*EX(fWM1Z8RzI?Nb#@Pa4sTn2Spgf|HoPu;9_=2X-;#U)eCGOfwW})^xR0`LDLxxL
z#yicxwWmwSM`7+c-^0J8c&8L2_A(>KS@v>VrCAHp*vC~))0{vZdpfSvtchv#;|iy#
z_Nz_%Z_cl*jcNSj8mF0dAdfu(7qnc#9mS8I$Z<c>89>bXFlT@I5m6X`U;IUd0%JhJ
zgMu!C34&09901w#XYp_H4-@DRFd_0G5+*VxQr>MB_$QDeuq@Cb@F)-?Fx?l_H`G_u
zci5NIx7gRzci(sHkL{oCuj#+$&-s3z4!peMee`vVAcGxun=UQ4I^1!qOdab5szxta
z*Xe%;)jcAwg~5u*GZ697gKkLiB`8)9xw3jInk<tU--wT8%SUbVRl97{UKcr9Md>{r
zudqll=)$5OJ)rcm6HOw9%tLU<ir%r^VL<jBO9wT~<+**P9lKsb_=evSmQzi>36|Be
z%etCDz>e0#x68)S4gFA}u)kG@ch>VT7T-L;=qA~xD31bhZQ;guod5U=q5WCn`nSI|
z#~;N8J|2G$1U{yWj9eA3Qe7{2l7KMH!4TQaFOYimX4(;wy|3}fv@P*TwQcnr=7jIr
zMyCO)krz5x9d)5~&Ve%)0Tj&$v;*)K<kZmr5|HR5ldH%Aq~{$Z$x<bt3iPODeAIP1
zzG%ynXuy@!Qj!J7DjKM;8d9pjOIYV|p!uxJo6(wFmO7*!B%Na3k9rS{6TMcydOyy;
zqC;JLP8wgQzDj1`ma3g`x8!<-2}U2Xl1IeZoHc@{>#BX&ZpbGLDVoy|v-Uj0+<
z^B}+Hpu-{L*WS<Ed%CY$kHL}cnD>2++Z}n&;oGKH`xOF7YDBj!#Y{DVr!JQfP%Rgh
z0`w6RJtvub^bsGN<_F6nHn))@V2`q^%oTe!5_vM}S&{2NmOXiN;&QnCJZHR|CwXAv
z+}I({v59K~hi0~ooGW=~;=<U`q3bP&NAB4lNWQHdBP1;g{=}8+FY(^Ueyo7<&s`t(
zRoqM2i@;W%C0O7u{;nN93&NJz&#PEQ_!84x@6%k*qagloDd|3EtQR|vW0RgW$!GrS
zY1Ux0y{6=7w02^dMT`d`Tz;n++Klk!IQgb;z<QYl{km|IyRdVRQ9<Jg_^_xQBMw!>
zjS&h9>Q2PvtdeW;&8+xioX@WAntTsr;L)2Sk~c-@XskioyGYF6x%YeTEIiV&^*aK;
z21gYN8AU@XIJ0sVD*8#a7ggeCMbYwzx&+=&y2s~EkEfQAG+Ud{J2NWg<S+)i%vMpZ
zyBZI5k&LIQH<Iqy6BKN3VQ<3ls2+K5Qc<-Xiye*l3HEJv$-_I@n<?~Q?>nDJQOX^Z
z9Z$u%*11;UuCtAU4bqI4$2WwCe?5aeg$2K=;Za1acSyBx?^hT*lAh&y^ClwDKY2d8
zjFS0#d*d_?VsFL6;XkqbqE=7n_vne)+Gl2l%=|e0IByzz19}5i_iHa|X<A+@Un?X2
zeBylK7<KmM^yaMY+3ng9`QETUvSV<6OnubvtNVHRdDYX0wR{-$F3m@vLsl)TaXXBj
z84O1nt07i~wh8#U2DC}|K9V~`zF+9ZTS&jLmS2{*(ytAq@8hL@fDnQMCRinbt+Tey
zKlvRjsn*>&rE^A+6;Qml*Gb8%Eep!Me}iLU&Q=)wmS%6>l%_3RG=#a*W_uCg!(jy%
z6?AkP7Imn&`*nlDnR-^hg=JKQHDR}A6NF(_l)gP_V8+6nK9in4MPsJMoSD8gY50%D
zDZPD)$4rkoH+_f30D}cKJ$Q<uF-%kCoys+Q%P4l0`Z4@t|1SpxLipf*7zb5M_|A}n
zgHRS)>5!#^SQh&6kcWey4oi|@*~VE+N99%d+nK*-8H!4Q=e_zZoYHWdcWM8wK6TN)
z7e6u@nn|7kl2g0H!uxJds#;h;t_H2-`OrKX4TW%jNQ443UYXp_{GY)F7GTkqKC4ke
z6io$jIO~c~fwoA9d)I9D0Zp^9LR=ZyC?ukFQ2zHU;-tidqhw5izkRv?ac0yu6Ejb(
z0OEpk&&-a}Gh~OtIYy^}6NgA!qKKoEuVH*FrJ-TOapK+-A%c30I&h)yEP*4!KQsDw
zgaqrAsse<Jx>tWF8bz*xh`#tLv&?j1=_#upbjnRV*DRdJ0)wj`u7c5tN|?qeutli>
zSl9=jY%*#^=*Fy?(P~jHrn*{;Y;o{?MK1Ez*K-la@LG%rEg=CrIA-vW8yTikAxP)2
z)_g%)URB+3fOR|k0I2gamw@H@?#rII;I5G$qgS2~qry3cIEPM`9B3rmJ&*;Xo`fMr
zCjGHOaBJ77*F0-C$_iPJYUX)hYqw4c2^TNm6ryY&C=c;(pX!urhxK%syt*(6bm<AF
zN<t}l#=6l-RkB{#fvilLq=eN#oItbg_a;v-)Bp|o$Oo7@z5FWmwB|l72^`vn6amj=
zM&|RhB`)Laz^rBPbC}a3E3G)zXd_u;jfMTgFSGei1GY=1@rLDOjq}t|`mK<&!(<c-
z^L;B7RoCmBo{nBen_(PuM;9~~kL|$&yj`z!r6$xPf@4WV`NL9`)q?{I2%g}``34q;
zJtiZ2!pzCYSdY2M$XpNT0v9{wx-|SuuPwtE5H-?DWTMN-Hz9`09x;WT*oNN%5bJ@o
z{uj$`%koIV-1XE7fDW^*;N_+eEHHe_W(GXZxibU4d-9zx+Nj_9y@bCVcKk+szIZ*Z
zcMPy&{}Q@`y^oR_aK`>c=K=BEN3KiYc|+F_-TuI-`i+?T!+JFL%>cLBfy4l@+Pa|+
zS!?1jc%g}^evhe|PN|>u_9ngwlt@~FyNLbgCO(;pJ&;KN|EZhSeb{%T9BktxE6ND3
z_o(c1j&wG`gLbm8<`I=w>6wrh<RNeJfCu{M`PAe4<YW}q1+0NJUGwV_g_k*(G5X_2
zKCVh$4ffJ7Uh1jN$)kQF4fE-HHlcqn;|#cqRuQmmLS<Tn`C5cCF5_}96_7TFBQN8o
zTt^MLgKR<r))0d0@L^23lQJ$9ur~7TTZApw5ZZgG^<;548X~mBWZu`DuqbJerK%^X
zZX-wvipq#8!oP)6Qwjo4NIYlPi+izX_%I4jDwKz581-pTcEZt(s#j@Tk=R4SLL2uR
z+9<CQZ=NK@L&l-fCCySj!nd&EQ=4XFb<hmf(z<e~Gb2!FGQ$&SG9#5}^f8MyTaf7M
zY$(U;qR>q>TME*T$CnFuCfEu{CLRlzCOB@(YA$H3>hbJKT#D<5q72#>Wsar1cUyHW
zOCGsv?x(IHHRzs5C(u6)QlLMPQo=t~Xb4|~btNlP7ilISV0+{s<3ni#8D2aj%in{d
z$2K~xAsZlzenkt;Ge%syC<fI9Q%J2U&2j}eQ;V%i4I#B)%%HX)qi?$L4@Cyqg-HcD
zH0i|^siF_i6;SPE7T_4FD+bzkxu@F~lu2^*!+}b^hM>~1ZgCSLzArKsQAS(MFA(Wm
zl|De6o;dB8-C7Tb^yXg2^FxuHid7Q&X^sjvRzSaOaNIU_8~%U`RAfLl>+{xt<4+*m
z0O{OCx_qaf-+atDo|A~ktIx>>C>bJ?N-bDakXR)I+sJsf64U-e(LM;z_n^s)c=@&>
zy(SwbtVR!AkI|YEjR`h&kJ*|KjnSHv_h3V_#$-b@e<8Kxh)fL`ZLV?qW^Vcsj336g
znW|_|Kh`w(Zq40(S!M;AYBHw3*}*(*s$8{OZaOWrAM+jaW%m29C3-nvJZ;)swR3Jd
z>9>9CAsPb)({ETN-L=ek0<h&rqQsdFPSbDTCfk0HVm(*e4!3}~mqZ#ok>gyYdRyOe
zKeXpVy%+)Ik|YL=X=!8Qp`O-lX-T#YG`QW!+FMdD#;^#eqpnbykcBW{L^v)j!L&)n
z9)}++kr_m~sBpr>3i8Y7*EQkC-dN;1=xI79dK~NUssWYsizMD*&}m_@qdjsBzqOUC
z%9mNJJ099u18pv;tecwwTGrK#%i6eA(e9AHs90u22(9k5pQ_%LLC^D(4Ss3G+Aw)N
zmOesx74eP}{E4mSZyZ@axcK#`^v3<><(2*s_)*xssb@%7hqxwwVf^UuR^+qJXPw(h
zzn*$o`RF3f?MpsdkGJ8F^*2ziE^%fIh-MGN6`VC7m!v#Ic_zX=oP~AlQQD?FKzW+v
zD8)_BVVcz-$64B?-0v}0exf1-2^xkKQO*bl+~@6zJ!J&#m?3hytwD4zHJ=~P=Xz;E
zV2bH-<+(h;TMKn{{^|RM?eT>%+1Y^9^OPawQ{5Ayu%q4+9Jk)O*orpb<L2#Vu_fk(
zE^&UPYT@HF#0BoLi4R+C29XG)lk|GD+Y?>?0}RD!I^ct%Zv@ti^>kvqU9Deqp4S0u
zCVJk`AGaP>-FnA+I{DtNATBy_*TWB+aXE<(%e5OyuLg5ob>Yb6Klo1D;c)n+V?Vuk
zpY=~mcf3Z8*!<Vvj0l9&NA4$$U6;FlSQ89ptQbz6-{2Gr&QH%Ae%*`TSnY1FRR2iN
z!TX{Vzu_m0l{5~uk{BihcQ2y2G9-K=cl3jB@(Z!|E7|{qJNlHF`oIQ!U{8OfhrBV<
zKAD3(nR8vuNSw7c#cf-6|Jg^a7T2=#WM0m`<B{_pDc^OV^2nkB>Wsko?r%bACG)g^
z^v*c9;PJq^lV$QGu*RDB&C{vrptcT!+Oft-mBgQ#vZafwvKmh<n`t+8uT5RpJ`T~O
z!8;kyO$NTy{lFNYhyq~I#QdsfN$Gdg_(^SqZWB^n2j4Onvx>blbhV1Q1Mqz&K=>|v
zpz#mL+k?dnFmhmw9EfB^DLZiecNzxrc;8#+ul@o1GcDaP@CH)HztSGQW)Qv&+7D3O
zhW}lJ_^v(-f4C#X?*HnFSh@MV{rOgX+q={Lj@a!)NgTBN{%3OW`LDO{nz?6bmRZ5c
zF8BQqZlCXyE;g__<Ld!GeFQkCfPV|Vc5#!#OIgaLnYZ)QrL-Bu-6Tf;J8}V;UP+Hu
zQ+zo!+5~ki!bTOd!P95cTuUDI%BhMRbSZ%c?aSCc*OM1})7(TJp1uSsd5hd)OsJY4
z;>mJ7(ck%n-2Yv$L4b=KF**lh)Q=2yNa=PqYLI`vS}CXnjq5#e_Q<mTGxu?tC#&M}
z>%jcdDaND1c${Y@#s@)2H%Mj-iRxFnIfBEu!WM^AP}wAvu%r?|Ds?i6MY6a9K0!r9
zM3l<+ji%KR*!tG}En=AS=Kb};`{v-iyncMq{kWa`@Y%)Z#qU+UwnMM6`1;Xsh!{Bu
zOx=mG>*=<K{qnQhm%ozzv589K`x!Ls2^-abzsEtLFV1fm6V!m18Zt4|kV~>Dsl%D_
zwYZnzkXi><2I)q~?MLCg{Fdu`_NkuR@-=mg2=trW!QUJBG-bLY$U<hu>@&(Hnj}b@
z;rO8dxU~R_e8f)6hHhZs#$_-E@<o{HL2*C=(n$LPS~Yr@ejfL^Bk~zMqCG<y|0AFL
zgfnCGLNJ^b2p|HRJPw0`HESf9R8;0fruh<U+w&&yhDtFejYRgH#1H^u)DGk1hs019
zWz>d?2iRbPO>81@{+lG!h*AaEpo@Svsv@gSstGUJPh!h1GG<0GnD37X-!i;0IAS%%
z5{WasQEcPI>|2Eo9;hQrPCD#|t=^!BhLudl!-xOl;F$|JnZ_&4Jc(WVIjQMiuYvEs
z7C(tx80Hn`#lW4j8f83fzZtMrZIi~-NUVVoUPoC6x&OlpiCfI`qJOiPqt7P-cfzxM
zqD`BjGr1GAk>0Qf#+{5|y3d$}6Ox%nCfNg7ulU6+YcE;Qo}jlo{ViMYhY8XTY%m)4
zKr{h}pK<^5*!RTupYi`SCIFFcM+_nfCV=uYPUHs}@{j*Nw=Z0Lqpfp7L+J;Nw}V<)
z?!U*8_(un7Q0$QThv7BiK|<W|)S6*`{v8N!Xhc%=xDJCt{-%boFKhS#VsJwAS7G;W
zjo=c-Vi>yQP+0#6C;y&}13{a;4%F|0Z4R!-5$`VJzf$z8SpGX!xD6GLCG!_)XL4Oh
z0v@_ghSitgaE9VZf)=9K&&uHLU_?-osMbaY4%<)a5gxCKtGi-JZ`#&Bk*(FiY<~OF
z`UeM9x#Y#oicW&yh8uT#wdL|pr4k4gUgz^nN~xh#cDJhALrq1J_(w2Wb4Y-ElMNaY
z?-BDkQzpjM&Bigfb?`sJTB;<pf1`sxd$#BJSGBkNuL$s{OwEmgq@Wy>QwluG5Jbhb
z)uru7)fROXM2suH;!_;*@g^U#$hzXIp+%9CbLk~#W!{B9x(18XxYDH3xmIZwmgaTJ
zYs!sVRJNi5tP~Fw*IqzT-u8ZSx`+O`d%73H7BJg`Hk;-g?h}EjJQl$Ev5Roh{lK!-
zM3p-0`RGtIIM!-9Jyl_@o!|<t(XPpXH)YkBI_q*B#m)a|1sA*A?R9pH?0!1Atu(_I
z%;VEsHN=}`&W97@eb{smR>;cq?2)@NW$*c1LT9!YCRxs*?|1e{jMpE!-css3HDrOM
zFBOnyK%~C5{HecDql~j%8M3(VU#)eCs)!9sMaw5w>RLh6uD$Q98ebh(EEKbO6P}2u
zo}=5+br@uti-NLBK4ZAi$Um5BqjF(oxmN1%(la;@-364x=sfx(-(gHefS&YwxF|(Z
zi<ZFMY7)wsdgqg3SjuqgZ?pBys+!tINAVxZ03gnV=r)IK*C@VH%U;q{fmtOr6v4UJ
z2-aVGZc-5yJnrIs9<1oQ$hkJ%h=8bAtov~GxG{R|#-@2!K}?j?Ur>bsjPDIv3y2p}
zLY!ji-s_L$V$BV*ogtVmCcc6R2GLsiDd7LWOKyJqI(wcl!Nbqg5jkG3oQpidK`%pn
zqtlnIf<l7b|2!0hT6=d!O;$qDll)6Exu7mXg~eT_4;NAU0aZDtS~k4G6fyL5zkxtP
zLgL#53as;MH06>qYEjSWbMD_^3palg_8VgE4*J*ft1LzFcSn$)lE&z@aO-cN_SK(z
zU7#>9Zel}SIzw{HijW-Ggap&QnC!B4UT2Y7*}kB=B2VzM6;U5{7Q2m1Hm4qHre~gq
z)pqS|y+u@KPJx!P3|N)4ah4rLu2cm`JfCR@<v@e9!O>%G%}vAP&(^~&RXksoz1hWu
zH-g;C6rD|IT-prrhBv!HO}36{(MNCaJSw8qW2W|CenNGtl;9n?G!<mue)A@Jy<v6)
zuF1+3)ytN`C5Q3Ze_Stfh7Pg4kNO0^qd~fGm06i6Mg?S4*pG&Qy3LDRIX}m;#78~w
zgcb<%E>n&6=J`otAXA8FyR{}v`ngu7Sx=yMss}P@{3tOJhE=KZp>W{hTDG-7sL*&D
zXat0pug-jhsZr#^+-2)I?s?MNQ9}CZ<Fu%j#7HVC!hlDlBUDJ|D=6B`WneY}MUy^b
zn}Dqk&-|KSsa8pKzyKXwYb#rsq&{=-))_-+D`sGf=`WLvC4$bgjIw%}L>dPoM>Fwq
zGSIBZ9x6xV9n;4SQu8xZL2o0(ku11s`vLV{VMTWZ$y;9Htx+eFJUJqdq@>CZ5%xt!
zlb2C!W!J@qVn+0-y^fB{aTWIvgQ4o8*ZT<@uOhKua#p=t^RG!Ujq59SCw=GJqiad&
z%(l?4@Vyc#UM8$A7d2AUz-oo=%V){Am2+)`p13sJH57Lj%ni0z<sMcHx9xo{S~WuB
zC4$Yq57UvoX-r(m$1dR1QSzqW{)=sX%65&ngZ)dSDD-KbSxOVQONVL=l={}mEV{U|
zYYhHE@vzKtY~1%RtXhb55p|K4itACb=?yzsxz^X36QNI}uI|O_y5qcnn{)SPYjHtS
zUu{EdT<kKv5)+w$mdiFKL#a)1`ejzAuBExL)FNNSEM7vX*s6dcxZ<ol(#pfatFzMj
z)Wf3OT9MUd$%QplX{p<}nUe2HrJ^Mz_83Su8}N_SLcD8=<5y83sT19^$%-1;!OXAg
zP#AO+90XM)C(OxMMtT5V+G%giB%+IoIQ~*F%1Ms^$VvfL6-}JLLhok3I{G_%XI`Ps
zWJqG#!XFZfE5t4AnZ5=B48gI`bL~qm(?5~RBHOsxpN`#ows#w`1?OV;Tpv^6ZX2~R
zJa1VCS$iE4>gDMcQl1&O$@X)l7lt3!Ce=38JvK#(_AmF^n}GW)AET93<_@P}wSwZ|
zfYkHJ?43%hUq_`}r%h{Ns-w;~y1g^I)Jp{Psi!i@+{l@Wr4)~w;yiPId0GkVwuB<7
z^LqHS3k_SRgNvnz+LbpO*|Mzj*tPuCe=Rfx;h4u+)@?Hhu{pFv0B?H)+5B1YD{v{N
zJ%6T?$C87YQ3ZGXp(nIuK=1fxi0U=1IOOf;>R`_-do$luU~*ihLTSFVft%HSIs+2(
zJc3TwjY-4Wd3>d_5nNS#(fImKyY^l@b+ZD%qBXTXk_np8ya<m1ey%zV*^hyK4y97Q
z_&P%SPDs;_tWtEDB<fwh99<Ltw4QQ0-?RZ%zlyqj;k`H{pF1S>QOC1ao`EVK1U!ah
z<IsI3eQ6Y_0~=S84M1D^@f}pn=?E$}e3_lLZY)YGBd2&hi$?gq7UlM`<{7@JJD?pJ
znX7cu&9wW{7oQs!LMM8>wg?HPY=JU8I7Zt$GIE5~uL|K1v0xfGc;lPPYkkXno$o##
zZ3xm29huTsi@ox|b!s+b)E_o8k${kuP1h7goquGTsn>wktp<^Uc<+SIe^<!W$&f?z
z1iR}11}y8|T0IGYM{RFBuc*x-OhduH=&&HhcY=Qhb!X*at$J4pp_kFeDIwnkgNKAs
zt@I4Si3sk*#!*yG+>$SLve2D`mprzT-fA)*iHXzPjcU$;2f$Ogrxd1jsV7+IyHat+
zVf-b*W(8RoRc%z$1gd0^$P9|c=U(BH-ub8ZfxL>IPz*>_SFnWCdv+e&lTl4lYqx&@
zV?$idZT40v+iE^x@nVc_v+naWVuN%#EAKiUFVu=_YP##C$B=jtwQWiQp-1<9*0IVA
zN3U3!t~+$)x2_M<fah!JOygkM1_WJ7<`VX`PIh%?bsW?Pz8om7LofN$_;g`Y@29*3
zY|R_zu1hVinq&87`Ja}S6n}t+5ci{#Gu@i3wMC7-IakE84NF{L8@a5q6RmyKOi!9c
zWfx9N=P!R!R#im_-BM|oj?K*vfdsm^oUo3=m#dGv&kBum>*Nh)50_^#FX$-oVo71v
z9zGKX2<RQ8$3yLveDX)Cfw)X+F;0Zxf2x(T^dGw6xJiFe0NIC$3*0YuKzP51b(xN|
zvr+~6-suUBj19kNe?$E+gGY#-%31lh%EH3-E9AQ|c<|3K*Kn1f$Gmy6FG6S({7fvS
zCxP47$W~%Mri)mey{h0%`qG{pP(#whZqb&HLZd(fkBWhkA%gw=P3QfO3s#GGNz&lo
z-8Ld3jXblHaqyplPyB{P{~XdL?H=rI8C+{#8XlYf4Aknpx}RL6hlR2iENcW(G5JI~
zF<_Qoh#fpG!X9!*MkWOE(p8DqtV_gSI1-50y0`T73{a|9>EjLM*2HE>oDpauZ1Y&l
z91*#eXz<{=A-UUVyI#LcB>nAVq@|xH*I;?v4Miv$9Sib(y?*g9K}h$Cc+fl%HOwS^
z1LYiGqC-2we^do(3^HHTk0~&4OO9V-S3*4;u#Q*!LGxK+n4aEK_-N<FE9y~0c=|Ya
zqJ!#gkz8NOukLu!SU8X5=8SILaezwn4Rl=+_q20_5W;)s;qARO@BKQ(BrL^oiB2Va
z8q(KpyjqH%z0X_ltOR<8S#kZsyUgcNODe4zMj_sJ5Y;EqNch&I)vW$3k(}BciO)WU
zh{-Lo)~`EVCF#^K#66aO9MfjYNWa#vDlcz(yqsjui40GO;iUTe+Qp>TrCD@W7UkSJ
z7I&ub!5wLB<#wfPtVUg?G1_j6r^oKPFF74wV@eEPAK{!LYS$vke#EGShCibYenO66
z>%=cjO_puJWy;sXaX7(erSM?0lQC_wM$HE=KJ-NQD0II>%~qu6PRHwB(=c&oQ$%_b
zgS>X->}l#?qA4r*Pc0r2@+qNZ+(JRVVGbrM@hB4p;OZv%)RR)zo$v;@Jw>}lkUZR6
zJKcE+VADx(2`u0lS=j43?P<2y`@oJa3^{BL@K=`@>B)mzwn!4Zf5w4vpZ7GN==)-X
z50iZsFG`CjxWiX^0XmNYv44WY;&!O0Xxs5w&;*OZjMIG1Fk?X6*4Bd>0uaC=@ilU(
zadV*$P)T#!<++D<8<#6GIe=u@b~F_z98Ggqi{Kc#$N{EXHEeX@BV7Ke7(2&!D}Sa8
z%P$r?VV@}ht*GsVU5}qxSHCe;92Re{<97b#^zejQAvCu{Ko(aV&0U~BF|j+hha-i0
zw{#b*Q`0QFYXZ#N8qG|rCq7)6e67=W4wrVvOfzA?chZ-5mmaue@8)B86nDoeeZ7YB
zPyNZOHr?1o(Y2|14{d3PL|IDCV|{QGwnoV<yXDF$xGqljO`B6Pfw3_ecwt_3-JV`u
zC!m29CaLP-EuEsjVOZIC<h5(`YEBmwtr?`dbpuywD{45(-kfIQv6`RQ&JiT6Ct6SE
zx?DLI@&EBQinu;4xw$+)CsD54*uIq<-gjeW7N6eM3`iZ<Qc|6co1a%}c{+(N^*b`I
zB`EzDWA6YY%F?tAj%^!fY+Gk$&e*nX<BV<Fwr$(CZQHYR@B4rIy&JI`u@T)}t?bHr
z@~NzjuCB^#M3+$?FOqShD#+;rEo%u4%$4SB4IA=c{BG3C3Of~Nf`verNVPwc%vIle
zW&yizHEKCU{0~#=<;iY=9fEa9=z(+<8+vaYg*4P}W-Ah{U8MCHQMJmokMR5Q!WA^L
zadn@bL{qP*+4w4N+!mu@o4jjW=8tpotpNp&d4|O{6XBkCu~my$?bnrj9i<ckVyBm;
zw(0U^>Ss{6>q%ccRLJYADlQuotg|^LlCwobg&7N?wAPrdU`=W5-j$LFZuSdDTE>S$
z^?o@QS;d`@quct&<Y-*MUI<@Z;M4d=BobYpKKO*TVzxUtwi<SL?8q6j*ZXJPAcujN
zGAVD20ddB6(0sE+-Yo}=he?)sXl-`9EezK=YiO$Is)HI|B5a;DUg3}G74=>c;+k?&
zDwL=}t&`Aw89e5}Yt#@9L*8@~pXrzLvnw9P`-Kp-YNnQ)SnfOPV!!rhSJ)SB&aV&%
zN?ujWl2k%W*qsf(dav~4vT}C~`P}H+_SV4=6eav9Nuo3&?}uLZHthJ>RW!*vN-v`C
z2ZZ!Ujx+$SZonMqpN2DNe5gsj^8ZY@{OV$VFEvPjczyD0tCqwwIm+{z_U9N%<#Nw!
znN#o38Z`%-x6*6&+c(CB)v<G=v0<d+VLknxsTUJ&XCEJQcUhYvW@>MIZiTx#L99he
z{}^Te`NVoBvEu_srN(g`*kyW~>9B`*>NXwJQB|=J9g&nte90Xv94+zJMG25W!EAF_
z#k(V6B$4fmZC_-HT$Vu*(y{I#!SK1*YGTR*@41S@X-@KS^!RWScFlnK^{y9caQ13A
zo*R`zTy1rU`z-_6GPE<!N^#d@ANPzcBRZ>bVZUWy*6mdCRrk?97zolrGSjhC14YPL
zsZ$}tE3Zj%jIZ5IFb|)mpde|h$#29SsDvDo9!6}5H)E1z>H1y4B{UG?f|+KOwMAg?
zT{S(tQ^X^^<#?{*rAvpbm<$Bk0rkBa-NdQe?v@$L)|jVb*@wjM5KUrHS?o*L<=J6B
znl=;(>pg<@p1kw#UL`}^W3H{jk*B6ju3CT4XobEr6?9hy#;A3My@--_ZLV2Bz*AvX
zUf#!vX)0zo%xIm=Q60l|v@{#&lDZL(dnb#$p`j`nnfQu%RkQGt&3SlO>a<ujhe6P1
zvWkbW2xR&5^P?5)6L*gH`3NWWdQr+FV_<A}$=s}gFpg4CwzM|a)tr1vO!LohKy2uF
z<9({X>lqyS)Ii;T4x5=lz`-?Z;mPbo*0qZ1Ari^`ZLt6WA}3#Tz^m}L;Xb}B3kaGr
zDR{d%7`U?f8gF``m%^?|q@m#3Y%2jM-#>#vph&Hh{gIB9IZ*<&A7vjnnOEWt-VeeF
z&Kh^fG!vXk41C2SkfyMlE9fi28L~HDbT`GIjzgT7jf?LG&o&vx;n9^!5Qv_OdDa;y
znFrV=coyiy&){oQdjQx!gNcVat2ReLP0q@wV68;MZ8jPiE(BJRHZwSFk4jV@ZIQq$
zy<2W2h{H)3-%s8mWllzF;jLTU$e1};6^|6_N0<5Pp&81MJNU=f2}~<n#-eEoGdIJk
z9A`-MiJ;GOJ%*PN)2DP)iQ+vU5?m_IEf|mO_epY8^qqt66l(nENF2EGrEXI+N}$m-
zh7z-ZpSldJ?76#|VRZ(qcyVf9n1^}23$-ApU$Gy?#QokuTa;`P>k{tQkm8|6nk$Oy
z>oV|I2Wc7HR^6zuD$`KY<svrmjhsI{jqkSl^UL<xz&uR1#*gKRNw3}2x-mAim-8gq
z9~jrk3&aL|Ek5)DsdT>4wgtfQ0(L_;o-GCh^ay}d7k{0ky?>j8zC`0U(A<cjE4t=N
z*n4^FLeyk>YX}zH^kEOHm!Ku)Mdqnve-9E(yJ7Lz^WxL;IAu~IA#W+RJVbu3;LDR1
zRljV;Kf{hUXW((r6Q>msJ~SQcy`3!i7Vy^>J7Ycr=?`gbIc8IvHfVe`&6~w?;8(rk
z)h*R^aju{BYFXeW{jupkIoicRBdpY#a){(OUO7Q(eEQpv0=uq+pnMydUuI7c(FV+<
zna3rAa!sIf)IshfZfgaJjt{@@7k&cJ*}fpMNG|39x15QLa68U}fPTtWNMD&mX_axo
z#?;8MFJVW<RlG<0fmnFNR;kHp6?&NeIDN=i+D29Qx^Pch-R5x?J5vhMWZ)tQj6q5_
z1vdCz6y|>*Fo~<OhjUPA9QJOO9<^0qKCS+aM;%dd;H|Za;mEGMnkbXX^dwPmd;~6M
zgU>y@`vsqi@1Ft-Wn^vW;An58XZ7!{jlMZ76f+Yu0X@OLyWHG#!e*9^M)q{VmU@mx
ze~b)l42|d{jjT-^O$iv8n0a|&q5enQHA5>#%6xzxCh(azFll~JcetK1pohLNM#$TA
z4Vb+t4hw6YeCk(o^#x{;tS>Blrwq54_FFOU3(Q?+9xi?Vfj^PvzA&2Jz$*AGx$V|b
zdRe!YBvRb2o(>bOXG)%_MhyOjCex)YdprA{e~isRj7Q4ujT5FSyoNYmi|$CypJm_v
zQ1GAF3Fc@ZYPAN}E!X#6hDX!4tz;ltP7bgJ`TZ=I;f8$?c}kkN4bz&LJv(2IkBL}h
z?^Y#kG8f9?<xxrwob_{)`BfK_N<|4q*N8t8QJkI$nO7i<fOp{23@Vn~c>xA%*{%M6
zpvLn53pHjYdKUWs10e$e0~-e`8{_{I#*0iyT@<0EVI3uxY00B8;u!al2g5i(M&AHi
zVqZ~GGTlE>AQR-VAR!y}AP6#hlzxbTMrvqwD%{SfGNgd3fR1{DPN+hvMz?4nijG1v
z)4&{a+vQo1q=5c^CLoroOTCp;I?uecw6qik>0uE7LWB`>i*gKSd}`s%@POZDAbnPz
z!B9}pHvfKH@&De!0{@+jPwyx?8cCF`3VN_f3m`KDM~)f$%&$F2p92jAu&1o4F0ana
z@;49cygOJVvb&<ea3>@rwHzQW3Mhp#IH90O!)*e(*Wk3KL*MkvsF2L&B0G9mruUPV
z2pTLIR*Ul#+b1g@pf4#hQ=Q}UTOb<~%#e4fc-P=~A`ZWy>y?jysveNJa9I+c?{V>*
zhsu|ldGD)npQ^lAkU3!Iw!)Qr{Dop!C%(yJ>=yw%H6R3C8ol+|1_|ieGyqf-!04h>
zNg2*_XmL5354HYrwh!hmk5s4aH|T~;2EEhL#_YC)k?R#CKom)Qxv@5M>6ng>rOr^5
z%~(TiqPSlIWWg7WWwlAM@>C`s@B9kpIsSnC>)KeU5)rs}YUI;7zyolxOnFuzalbu*
zO_uKAyos`u%o>j?Q;fSc5d8=6N@omXFuv#6ib1|$efR8Otuc_RsrRuVT6dSF_C+lV
z*|5g%;N|fu<PY77%WYeZ1C5=PGy^u(9Yrw!JH~N_L_tzn7Jolbc|GDJ?$Bc+h(3Wj
z5aJmzs=_Q+A9io*5AdqoUCvM}=3ho3?u2TqKo0jx@G*0}uZ%7HO>~J{rWnF*=I`-t
zYzRVfHz(3&$nEk7EP%<TOQk!7HhqgT!UN_j^Gj`3{P*k?rjimwyI<Mv+z;RKEMOR6
zr+N5|z^X3>Mh|u=-qOb&NLB3U?r^~l);l3Q#4EhE>^MIMT?FNck-bm5#lHr%*t_i6
zair-9b1WD1Y5?%rIDT&E-~IQ#pM(}q@=#r11Uc=fHH`_od;+YdX~My8XjdjN+Y8`w
z`u=7anxh@^SY?=FgT>2#8lQa(WF7QB56Ey91n{LN<|jZo=AWHA4x+JDEcStO6bBCY
z9b%94kR!t41GoqwtcaW<J6RUwhq(xpTcTG&Mu5{N%k`VxT;#+D%hwI(S1yq`%AXwJ
zd^~@QkUmH10bNcweYwgqXFv7@*BZ)ge<>wQ&fK-$pfmu+GT=vYVn19KNF>?Jl5Kty
z+Z<=#KMPemf25QbK{FF=2tnI&oD~|mi1Um5pjlwsa-V*Gg3aY}nahX@U4oj8Ezf8y
z^3DLW0K4TG3(y^Y6G$v}&0sfXm#xl#KO&9%zJyOL;fN{p0JS~|PlNuHy(hqC!GDw>
z_mPl=&gC;lJRld*1!$227oa^O7!mALT+|;C>4brj1s9C9$Dmblf`Ivr3k56}+)Ho>
zjZtL|sTrmYH8@6y0q8F$QsxG#8Kn*;m@wT7EGSp~hX+KZmtHd34{5=kT&Pohah6`-
z6<@JObV0lY$O-1mgdUB%WPe9|fwBea5euWx18QF>6X37}dDr}qK++FNQbyScN-<A5
zkRipJCPFB)=*$tL*aL_u2g->wLZnjJ2~AR#cc}35&jJTM<Rj-ov@Wfra3@|--WI@e
z-d4$(6g}*taY~dM^n@&254Vy<4%VY`O5ht;O65iHx-Q(=W!Y5ld4i`c_=J!az;ZtA
zPh_v_0PIBgTr_KePH5|)451SIzD--`Gah<)Cx(r%HV`Me4?M5nHWX{YPvDKJ3tywM
zFT|sK+F;NUZ@)m*HXtYX4QQ9VssQyv;1>8RflnrCfzR-==YF1&a$SZq;2Mxe{)OjV
z!WW*i{u-3W>KpKd?q0@29#@dBk`CzTKKq5m8?uFAMNb^%9A3cZJYLA>T;9U7`5Ti3
zmK$_Mx{YY{;5WF1e#(W$8-O!TSJA)NS-!!E&AOds-of2@Z#y<6*gJ>|k!99?izU;2
z24m?rINVWh@Jej`$R!#(Vhi4V3MJG1^CcR3k#*SG0CZ>FInu9m9VKVoxp<Exs^u4c
z$$GeFtgc{>sH)L#lpnZGTraXZ{;(xnikpzwa^Jwyd0lYR6gEBBCDr}3FW_h1dQ4~5
zD|sK_TZ)e~u6U1DD`9VN9X@YWW!JskvfIJva(q7CR9t$|W$wF1M&KS0>$1<iR3-09
zJOI^sULfmod_l{ye7>1u7PhjBGP`^w*}dmu&M){KzHhB%!9Vkgqt|HsMTP56FA-7v
zMA{PggxV7R<Z}gov{35yeNIr>(Qw6o<az;kBtK()!F$TV^Y1Jf-W6OxF3Sq|92=hj
z0GCC5fk=!#?fyvlIt1k{`o1QMKF$6bN%=Z8GkEL8$DHTmu#)_KA&WQ#mySGTN{>8!
z_coN=u8ul=m%*NYE;V@TA~ks9b&}jJ{yCKvaf+YCT6C>6{Lp!p()A34GrtWtm(mp;
zed;0Ey)*Ox?V07Z8+FRZDsybWi9bKhUDWM}7;)-DM4Lhh6U&JJLdp7lp0?EWxs;i!
z3Y_*k>mYi-I1rs8NSCX}4EUiqUAy7ing{|x(6B~p=RXe23_fC(u%!!0y`ow@T{Wmg
zq9W-j?RR32zPP2bYpgy@6L1ErE!=te{2NE{q^yDCe+f#~x*-Vhn`f=B#f>1i>bN_&
zS2n3l7M+)eJ13b#{Ub>5;W!`ObPNV)9tPV4{eRr%CBJD0;IfK*_@j~qvV^A2sm9Ui
zZ}bu*RUH<dEwl34@W7dW)jh81;;hVeSc8F$g#TY*Um@1IN2Ty>(Z(iX#?0Md5yBj3
zMM&y;FXVG7i5lFU2EeW^D5!;B;QzOieIlUl1&MpOUsfTUTvm~acA1@$@Ata|D25F&
zVKWQp?m772??>{x1p1Ftmv*1&H5+<NEE<Hb`UjD$pFG@Vz{^0A)TEYG&@ZVQ1^>y$
z*Fjb17Cj|{&lKF`zcBd~N0T$#psCZyS_=bA-#o$OM_3aE<)Kz+^@Ao7q9r)H)eg>*
z$_FUvF+AZfjuqPf(~LBL&zkM#ES{-<`1dT4CgD!q^b;Xt09#b`=$D;BGfHbjYEj>y
zOY4M){*TUte~Ll>tBh4LA~UCsG+soDYXgjDFnrulpbk0Wx(thsoyM<fa9?4e(El;z
z_a@7?@8+vu$T_kl`af;mPaC%8FYQTW=ZQL}?`dRKD7=8_<!b0V^TUF-veR4IH8u0%
z|FOs(Uk#UcNah>Z|7QK$Swv7YBzKUDyicPP(Ip5UFVL5yCamryzi7DK)TGq@U$~aO
zt}oCWKMp%yCJD!`blZFW(}2$7t=#Lcyp{b(Pt|pWS*D_c>G@Kkc|WA9>ii6l`7u&P
z|6ABUHX>ZR|1TTGIF_P5xCw@%t2eOLHUJJZOq@gD0x1E7mlmBj&MO<BLyPq~`^<wm
z{@-wsXkq<Vc@N|<BZM7(#Q!wP_XFx$rJ{o(`cnEDDbKrrWi?^7tM1TcbRn2seot8g
zdi##P3X0SIH-n$$;OJ8)l8#zyD;GrNyZg+|@KJvlY|Nrnt<_M!aP@EA5C4;Q?kO+j
ze_8r;KSP&QL;Yjttbwj1ZRYXESakxDIpRCxWkuI89Y2tr{wqJF=SlLdFa7Z}QxFaP
zzcqkAn?j3_nYrQzvOSH`Sy+NvBW>;h&@zy&3iCGYc<&_tZv7YU69+<l_I5EM$nFgE
zZjJ`eqW^vCc*?o1K&esXY;esD+z^jc^qHc$H#|Z4U;P!o@ZU5;Fs7~NApR};x{LCE
zQT*;&G#C!|mqrIgWn-vO&>O0<#CyO$9;57(3s~kKq8@;+14?7lXKxlDF4G+k+Q4aj
z;r2TF{^y;kn1@_D3lb4gj%JI0b}jvumZKj&I<6P9WbZ*lY8x}<?w<SOQ6A@?n$mMO
zFE#H)f$!xXfI|0QK7fCe=#!9=vgii^b$2H1+<#R#_)$UUmCci1cx~GLMnzKWU)t5w
zr!&vJC*SX}1$Q09{uk{c?ob6REiJ>@fSxQ1g=_zP#FZ~)=~u}q_6FVB^6!6Zo8vza
zetL1xc{yN+y#5#e=&uU(Yt$=gwjfeg#UtaGK5od7=YW-fz8z~$-7fPyVL`li9|A=O
zR;94){<R@}q&vm`asXL?K-IV8ecTpa!H^_`9(~0d&$1VXhxj)<28>`X5;|JE9_a=U
zyy?bMVHnT_s9&U!L3%R9yUhAf*3Fd~F8gxyZ)V#uA6cm18dy2wc?;at22|jZT>UQg
zpt8(8*=-y6k*4yht7-1XAu~<p{1%=rUm<gvF4Er;b?=XpV-qKv<tA>?4GUMB@|vSs
z?T*f*B%DYWbu?Y^EVQhooRo}t2EjM<5+$pK%qN15>XF1y3BSd2^GgJtIH@}F<{4~t
z*_y6P1|r5tF;TTK>%-F4T@RraC9SPbg~u(_Rogz3t`w36ewhe5H>0%95V|*`vm4^K
z+N)v~UDMjS8eS8!@Tf}B$<32Wu3xJ1(?(>;61a;6YM#)fG37SZ<N?5|n9Lk=-IYIV
z%I8Si^b`OpY>^K?a3U>cmq21Etu#mZO$A8*HNXW8+`<q6yzikRMpcfwhr?A?Rz)yr
zc5f?m75h>h#-L-r>&T^D1i4$BcV&6iu5`ot`>P$RMWRY#Rla@1#Fqh;cAtH?WlYXU
zJ9n;s3h}7#9?do1njPV@pB9gsyW5g~evUo2iQX-5!r8(CePjzfD3OJxk?GO(217NO
z6jf;fZMPI%j|eS!#FdqETj7Z0{x*TJ9m!eg>%80^r7j~(td4CU_BchBs}#sa%CZtY
zNXMo+I?~BT_DR4pS5rg%sK(u44<~C3$U;k`1lQ|x4Wk)nzN0`xJd<Q%yT<8g=69c|
zd0GjlQc;njLlPyPSD6vy{RjgDSAYs0RX&*!Tt|^4sZy#e7c2540(`IfmYu5zshi*(
zl{t;48gKb^VjMMG>l1bl-VXc2-}{WQ-%+Hacd#|Y=VZlgSV?R}owUhp9H~x-R=isZ
z54GKG+wB@-IQUNNMH+??HLpMMd#wtKZ75m0H>a#`w0PMfdU0(gYZ$+~U<~%odNa@N
zT+KkKR_9ca82l0!n3FQfw0C>5I;1^Yh&nirOZjq>M&JUG-`vCusRot4?)H#GZfmH<
z5Bk=^ey^h?v##GKn`BW!7j)p>jR&w}iO#a~(+o;;oFy=<Q!u#=5`W~&aXqqUD+hPV
zPZHY4F3U5_RO5ASej!u6CJ<F5VSUQd>=ZRum%n;-8^2F(FfBoy)SMIy7OE^Qouw!0
zsoZDV$`bV9!511ib8JHs-Hb~1@$ni-kV0p%S7WDBXJhJq-<yCsr(+I*&fx_-PFYh3
zvCk|nq5XtqTnk$h&z5HzBGkZX#6G!ycwIXUM2*B9F1GBTn$l<(@wZ~}eHaY(6VT4|
zr^Zn}CWV%$qF&eSl?e<iG<r;!`3U2d(fXB?Xcc2;tKvX0Zj@>%V$3BgdGqVMRJt*G
zeFv;WPC|%9a1;NWDCg)D>mDNFQO-N7kRe(>>tx!Y{4Wga&_Qw%%O%el(dj8z4WM`+
ztwrW+Q|2_3mL44#rDLUwQ=Jt8Cv@i7`|1X7Z!IhLFCOORbb@^j1*Jr}Vm(!VJti<j
zVd&cw8~)7TT(EvDm~@7V0Urk_MP*8ZlpsCSpw5avC&a4fM(HHY^$7k>1~*s&f#mEF
z{JjJ|uq>$k2r&5!)I&8;{=$t09P{)O{WX~Vh>Zg@e-`O3L%T#4w+n?W$fU$nT&O<V
z^T!r98&zCCZg=#)T6AN!E2_6EMCiPopSis^I0o^9$qJWiRBgn!<FG0I><J0y?a#|W
zdIAbS0%CI${P95tMrHQJX-4Q`olWgBuO|RPEctCKpr_CTQLu%QlvQ2~#RIdeo3Aq?
z_=V(VTFF7}sdQE7h+olLgX#bKdH0F%6X9sKvtm-tfxe>eM@u-EU<T6@?Tx3MC%{ht
zfz$VxFhcNtQ<!pk%OP6~|D>*H^W5v&TQQ}isovkG?j>k%wAxGUCAkl{lAVIv{L5P<
z(<9>BIi#AUz5z58F(F~l#C=5bo)3?2*X{bz_IsH_c#UGBK3_8mq9rnCdiRjCkE_U5
zylvi<7j--_>B26)=)mc&SA9&^K$!iqFsK+s1~a<mY(>t3ut6jkcs)CVKAkX&e*8u2
z^c`V(Yz2C@1{h)dA}&k&W*&|rolMh7*kU^Q-$CS-x0eqjEMAFSB}3IS-lezW7{dk~
zkzK0zHvXExNx)0Bx|Y4{sN*DW(~Y6KtR!p^)<aG8xkdUTz4RbI<qiKq!8lEpZwf66
zd=+Hf(I@HgrWA`02>!H(XY|PV$N)ScmHl1Yk0L?oj*`rYD|tJ-T{7dDdJqNq_%fV+
zaNuu$Ws3e@1|ztFWX3+JURdb;*zC^<9QiN`6#UNb>(ERqOhHm)4z>jC;7Ui_ra7k-
z@nv7pY6EN!I>zm*avA9%G>ubdC#j}IOJbJkxU}7ZBSLuvA#-a&mgcxLOL+&DQoC3z
zs?tz07Hj$XP(yu%1%&4|<94+xQ~CPH1#ThpGO@_`1>)GWKk`KJFwN0x=irZu`2sBJ
z>En2u5p<`~`7-2<2H|V9)9ZGUD?cBp{qwF`XGNuNu!jfg+4ZdKqrAJ_w;#+}flrky
zT2O8Jl~?@v2e2DZe?la2k0^8hCO3x*n#E|+{fGBC{O2H!9<^n;C8Zel^bte!@cnu)
zSc{qc803=BEiJ-M)GRv>2u&i*dr3a&nQNPw)XL%GjIyehB)PYSq8&&wOayVAY4qHZ
z=to3FkK3tHUM7Aq8whUjS}>V6!qkuh;pg4ghCj0uK2AwBZ9v?Sg`n3i$5PwI5l1lP
zs&8JyD&vN``zG@G+PQT)5i3Jix)iOjA~BUWZasBA`dliMN9*6^IK^K|t&+vLt$Q;Z
zhdwrGEpb`V6~8&A+*izt(65n~7hzYVHWfFO7B%blj=8PVP^;UtuX6UVu9l%GzCu@3
z4e_{?Shd`Xum9%O_H~Mw)QTL|v>C6}wi<uZcr<oeTA!N%P3B9Dm~w6`BXb%k2gixs
zB|_eySUX%}yhO0Vw`yY*vRGbg$ZW7%i)ypY6f#-DbaKm-E}jkLECbdO$*x<z)TdgC
z!h#NG{-q!%r9rOH)MDh1H!IhI(t#VGudW#<64t3PByfwd`fe54FY>^y5sUM809zYo
zu<-Aeq)M$!In1+hOIYVxu1ew}qJ1>DL#HF5mw0m0Ye-++(QvI(FBk!_<=3Vi^%di{
zbu@S$(ec5b<A7)QaY{;de?^dmN8s7Xxy+97aEY;wXqP+Jc{PF41B$0FUm6_NnCg+(
zK0nw43ADdstPr20DuMlKT{GQ1`xH;Hv1Pa;2NDhac=v_o#r^Qc#^80dTQ<#eHpu~x
zv$U~7rszmOVr5$Lq9NvFNX9BEnX0O?$tfjke@101*AzuYrL3Ara;4<s%#qcA*A>&P
zVp%4J)^%8F)n|#f%F<#~YsyG}eojt{_~<2O(X~IOi04{lf;<4nF5ez%Xr3KvK)2H5
zODy9~4zIX8#^eg4X`{pmRKlR_aG6X+xU9q^8h0|_@m)-**PmE{q^_#CbHh;r-KvcO
zYOUHXVM2-4AN<4tb)qk;X{fMHXfMm@%5K-m&9CdJckY?d#+;mTIxcm1#BvMrhcyB}
zADm(#*oPU!tZ)V3QUUvCSq69pc+|{!y3I9m;vBi(!O@9amiNbdw-CzMj7K+1-O{=2
z_`H~H9<Ul%9W$rji5m_XH$etdUZ82#prz_F2}4a?9pjWm@s!|m-w>|jlr`+Vpclbw
zF`lf-f}IUShW_v_L!3xsNoQ%%(OTpKOG4fI)tcjm_Pds?wO}RPzkZdo63u~Xh2-x$
z>k%eQAbaz(iSzJ87qtR{G-h3gWfD3P5<U0l*EHO~>jgzgZu%0E3wJ8$B3>`Hj;6RB
zIXNt~9_yZFm?yWQ`wdWeXt*L99??$IIOvzB?(znP@g!UUkQD<J!n=ij;@J=>2dD-0
zaw%GRAUKE_(SUA9B3>ZIEaWV}7#Rj!U5j3xeN19~idPVDCLIERa=(J%FK*EE-M{uA
z7L~n$J)F7<IY2K=8zAGlfL;|^p&TxKwj0C~I>P!1PBBpMd*JK-HMmyQJ{r12f8tCJ
z6;i<O1KDbpJm~p>4*FzMz+B?T6ISV=o<f@W!8E~^`yA!-6DXd^B@(;)wsrk);_Q)N
z5TFh0)~h@SI}1C&H*1FQcePgplY+Q`t+^#&)xSWE`^0lKiN{<IG2C>6&u739!TO!+
z0^4p*lx`qHN!cJ@T@!*)tM7o+)pBu2d&E~n-QockXd~spsMZOQrWw$q-RKtj-8MXK
za5@Ad<`567#}Gq&Q7{6Q>*2JCGy;O+n^_NZ!^Kv_wFsdlihB8Wwrk?qNBB;eN68B%
ztWUv=&>aKt=z%&tQ7pV9yYicNSY=%0sO9f^A};UTm)36RUSbB~6d3~|3bam{-MZ<K
z9bJd;S|a^JSGK5bPVjZv)MX+agKuhH0$ercOZBSthq(3v-goG3Wc2%}62(|vaaX0D
z<~=ymwnUyG#|MT7mKzgY7ghQdTd=cyu0`2LiTg*C41rohsR_Etp5rbJ$v8wE%qTJ?
z;h~WOF#8<z@vKSo8(q^KgB_EPbOqJ=p<%zxUZllTu6*=X@U4ZezagIY6Rmu3Z&<C}
zb@5|Ed{gY=Lo*`?*!G;EThqD!D8~<vaB`1J*F}4T;frm@?4$6Rl?Dy%9t2DI;y98{
z(I?OH&EvLQ7G%#~2a*%=+d_5*TJ{o23Ra^*NZ7gyzsN?Qx`aSAIkYuss*_YEWyNGA
zN0~;fZSR_Vl#}7{rh44@gPE$X)O6%_>1Ey#{@LLfkb&2<HOhZZY?11F@5FK6z;gA+
zsJ+w|V?33i?N7p#B8|l~q-{F5G$eUVKYO1)P_ry#8`XPQ{Zm;sy=i8XCK9~k%u<=h
zWXsr;P?g_)%D_*B5g<z_Ac_VCaVU^TFVUi{O6wY^68IeG+1K0$xq%`_w>AJWOVKs}
zxMynD<l6lb`Vz`4?k4mmn!hZ$xRHBn)-|9e<$Wo8NpIEaT<hHFoLmLueyr>Y_O<nJ
z9WtuDtDGzA3U-ytGl-$t*AevjmpK9%OyQ!`kws0^hV<Mtk6C3t`iUsHA8|I+>hqVD
zP<P>HKyN`xLJ#p;zc;g#$gN#O-C(7ys0Uj0B@6eRW@PM!0K7$nY_vlGHTQ9M0P`$K
z2hcTydx)|J!WQZyaTrcsG?ZX1Iyc!<@>mb;2WA$UaZLIyraD9pUzJ9{=ZpOvdlYmb
zaV8EbN(PutR42LM3Vd^~;TsxG-?1y<M`#4{a6ivX{02yN57CRFV`9}JvBj^EamPGW
z39cBY--E+9XLX{#5DTJgxhT+;&Ee*o^;j<u9&dbah~2vVd1+wKqDA{hK|Fx0gSd2q
z?5Q;A5<{!wZwX>zV~DQ?XD>+!wHge#;YAYV*o!KMi`|K$fLr@?Cqjp^95A245!nSQ
z2CP7%kT0-(Lti3veDhZ<YNQp!J?XU>k#U>=22`+u`V+^9Y^xdZs+5uh*l^&{$k}!g
zRr*D0FqGq8VcGf|HO0Fkr=N`E5jP1@DK<@MJkyhA4kxeanw*hBgQjiWv0UCH(sPtO
zki`i^)CxY(-CfXe^sWOL*!6gPR9-=|^m1xI-B>lH&l7&>acW=VVH9zft%Tr_k}K#!
za6@~~i_4Ze?eaXuE#JD?1W<BnrjjVKNDerwMpW($Eu}^7PiL}YD}!@~dV1!8)&#Hi
z$I03r!H^m1jYBqr4D+Kk*IWm6oLuUEB*YScSXa&0beonH6TF^aHS;G|&IUiA*0Xp*
zit*Uq0UTs}u#;6yl_hc3Cin^4xC%j>QW=G<#)LWxuof~B9-ww^*_MCyqO#w2ZrnNj
zebOB#90%kc031h;iDz1|zca{qtxB}j7oc8EvWzYkQ3ofiFi9bJxWc>@8_Lp#za~%!
zK#*UlIUR6;9dEKR>C&dDC|oY>!qR;L`v~D~V<jw2K2V5iL(b50$7FW1B-Z2-iyOKT
z3Xvv*Kxt^R^fVQff`IK!_2cS8S))1t{uQ#C&?2SRDH>rnls`Kmr1G>AznT<Zb$1cJ
z8WTU6!I>H<(AQUAr+N}AtXTy&!IN8DCNbCS;@&;{#H>dX%4uJ)tI%C{ID+T+)Y1U@
zP%}gWeZd_+_}7u4Q-zF|A`W3jR1qRJBsR{<P1pMLGD_R*_saIfZk$Qxpn!<(9+Qiz
zDS1(aFj3j6p}a$Mi-CY33sH|gGb2l?tU+WPcD1DbFk`)e*q<R87Ro^k);LQ>{1p;B
z^6XuEwJz8RA=_XrhOLMaYNVm6Dx5JbXOR@){L9VapYZQ)9$Ht}h2Cq)oDv<i-%?G*
z;Fj2vRX;Rk&vk{Te)x@R)M*5Kl3=q5c9cJ536)P&BJ-+`E0M#c)6!m(z>t9|kt<9H
z0<jo~F_8nE>*gP%RodHEHL;@*RcWFO2CaQC9BF0in7IbRdpN-P_@*%Xcv+TntkI<E
ze6@juhL?C~U)nqjYWGEZ(PU=i&(9=~UzSr=QdUqlRqS!-T!X?=hK9A2wlKjwFrrvp
zmh`tLO?CJO?ZOP#eM3*zYFQ=Xe1HK<fXpaao_1SjW|mw$Lu6dY-4$h`O;TAIkh9R7
z+nV1ijUJhAENxlwnemyI-6{6pi_EF+L#@Yvu!~LwcAR!mw{ZIsD+xpVL;Se{Dh#6u
z`n1SZNXIEhY3M1AV3enIlAaTSrnRy;PUm3hcN1@@588UI&aKvsHzmr*(t?vm%%RQ+
z#AQrrl4fym@tpfNGrI#)<cp^Q4`BYIhO8PZo@Ef#kj1;$r{?src#m-v_px}^oeY2I
z+9U5_25J!L<|JWM1w03K;2=o|f~A`g!lR=;+NqU9S>k*Iht3-HKDlSBFmT{(e}@be
zU&>lW93m7M$Xw!(2cKkQSU%obZ0OSj!_maFwWDo(^!Z)QQlG^Jg#!T70`1u$-r!&)
z?yrEm04q707#q9U-r5Y*(W*89N{-EP)@L1-K?BCuS#<J@XXsBezS5`08c9#f#!ox;
zZ&<5+kv>mk$iXp<iM0;TBL`28l<;B>;2lBqQy8p3eSEZ0(l8dWO>q}U!4fSqCx~H3
zdq2MW3SJ{i<4_^#aBOL8xN8p0o_T#}a)io+LGnePZBi)d#dFQ+OV|yUQ>!?kzz(@~
zfFq#d%0Pl@wI~#ud}^3LK1@|_+EwJA=pU;3dF$s()p2r<_#Xd+XZTs2eC>8qzA<4O
znKpFLPs~I%+#1Uhhaa#}Hw^)LZZGY}BkY^_UJ!VtTJW<H!_M%bZ^q{H_ssr?m^TOO
zwaHv-PfbY&e|oJso$gRb@aQmiY}CYTyj*-9CO@`x?n;~Gb99*hgq?*Yli{U^5xN)N
zMAOqZcq$y@pQU^np6-BIo6JPghZ2io7$kLJAt5s$iyR3#Pv<6Y-B8Tp9%W`|{Z+Z-
zD6*Uw`#i{N9Cos&09O{RU?SOKpn4XYGTdGfy}EE1n1ZXN9qrm3UPTZ$P(K7^u#AN_
zr6`ZTv2{@$C0vt0Glq64E_RhjVq6G=1YQ)e@K<yBgtYAl106$$)yw8>3pHtWT=I{E
zxqfwtqx<m5tX8p8p;l$-5zf8-J!QRql5*;D3Vb?feQaYZCb+SaBV#?Xh$?j&qA`($
zZhcVXxN*Hw$ldJlvdDLB{$d%ZF-rMXXy4SEW4Z2$x>+emq}zPlMJVr~!=o<sE^E!k
zpV&>|4L%?AYSJ!c4M61UR@e*2(6dma)szvqjFWto*2&Dxod$LzlQf4&g|R+(KPO`*
z+}#I?_T}<WJnCtyuwnS%3L3QsU+Os2L631GkqGfHX;G<-iq+g)R;j2U^N9;_7U834
z(zMrn21XNR^W(Xc)kE_Icmk>D2~vED;4a>Sx*am;qXhi7Han1U-qW`eq7jaS`|k@6
zSP#X)#pl#kE*F=2mwO@67YM^pgjW~@AfO@)2H`(lFc|gFLmno`0&$y`5W8~xf5K1_
z)WtO#P|U<*<p)qG=0fm=D3{()8z@bM<Oag>eo<1sf0<lu8aA#yEkFOw-hS`m!}xm6
z;BuKtWjfAC<?5$(w_G71>*fqfJ{=K#|A`kpo;d0RKmW$5HVs68f>CgeUSJY2IW#Vs
zSi36P?YQZ+OUz43#lb<+YBF7LkO0ldyondiEQJ?Of^Eiu3cs|qY$r6IP<yXaYHUEO
z(^}BDJ*E9Pz13vDUg2sY?j~X`zWNpPR}I?yBc17O(_ZS!#yZpPKB7R?CEM3u|4hVV
z8v6wGrB|u7P=N_O^KEbGI?|*XPC62w+w&kHu9c~K*^h#q>Lbb-33D^}b%IUT+W8`m
z9qm}Zm7Rr6f7(bw(rMMbaLu;u%*;jfc|DQpGRU!zN4ssV%Ht+k5WK)Zdg!{FRzm-w
zq^nhh9X2V+O)Yv{-uO8sQ+_U=_>k5sxv6&(ZF_0hw#9vT;SQ@(nHf9ss_=?t2zY9f
z0Ukq`Q5(v}o`&`FttF_1#7-SY5b3BX>X(@!<b=9J`iT=~pp~{}MtGZ*WZ)^V0WPKg
z{JXvfuvY&ZM*mR6;h1T?W-6g&SOn`iQF<+vf;*hY(A3;T@=7a@GHc!Nh_EvY;AQ^1
z@pUsAR=|p6^ts>Ca56knJpOtXOctIqW64mIqi6~}<5!!(3zY&vPZhMQT37@xi6L2(
zV2#2BCgX!~Ok>9TyJyXQqC+;A%*N9?k=gU3_MS#-rPbfd%ZsIiZNC1u+LX(PZI5jp
z&*SCK+WW!bq){bue1>=Ir9F_d;i^%q@Lkqt0+%MPX__I@NU6@G=}-#Mv`qC$sy~JG
z+@-{iu?>QIfq2zVUgg6YV)z9gozEJd+A`3^B*lfW=kf6w^T%bpk!BY27pWdOTugs|
z;h%Sf>0C|8v@t%er3kn*zSDfV-l;irvSvLl=xg2DEueQv=cHt%e02`dV<jN4k@J47
zf}mx`q|3(DF!v@UQ91mgQ85-1$kRkXTBsu}*7KufBF{~BBYmc!J-rQ3{aHt(iTJN%
zNXJiz1CYRZExm`n*xXmF;|bHumM=b)#!6EyuWgF*pwiV_F3dgap0Bcb#y6le>K)!j
ztJ}UW&)NXAXK!^5apD=qV?|?_RKOxen-6_z?VI%5(g6FFvr0{OD}6`I_Nkuf{^qtm
z&CrmXi+o*CO+Ot4cCWqZgkP!E+C8Foe}K*_Yk5*c^Q2EBXS?0a0#|c4>Rd6a?7iG!
z>L2YDwt70v%Xi8?b2PF)5AQ`p!Nxcy(4vcS)TBV{N6@Ev*p=Ykv8YSH$Sm;ifEn%|
zx}CX%q8sNgmTKjM3Mj$IqPC99<*!@z73myw^pS#{66uHOt2Z2_s5G!Sy`yDn<(RC1
zbC90wXC}9pv~{vpYa2du+B7tVWpluvOs{J%M4ysfWS?boxseVHKuFcwU6f=>gIn7Y
zV;U~UUpQ;kuY!eQShnfwXSDPl#aaq9bK%gU53W4bv}hLBls2oi32wpOl(ZHlX}V`O
zS|2&X{^AX(Z5*KOeT~`YaiDc+kVKJRH~Y)o!y-JCiL{vXTS{6UInxdMDj&G2B(AuA
zeiJY$xFLQ8IRn#q(~)|ctkNN7BZag;GRB>LJ@WRQ-yJp_)F9gBfZd+isj+GEsyu^w
zvQ=|`N}K%=Az@FKtXaF2VQTl2lpXi;?T|pXy74(3z`kTNf8-N*A~IR!vJJ@av%C_|
ztH84d1mrgM$xuw&B04)K{m&+5*v56(#ahbibZ2X5^;!xM1!gA+$zcg}!BoR@T;KKP
z_OfNvWaeuVAUX!C@X<L&5NsBY-XU$%<v^?Y(~ZUPmD;Z-uhm}XER7c>P#Fym>#B)y
zjoKWXp117Z;0|<ntNw%QE1*r5>qbqhO_UlVMO?@@(!ukaM7^h^lf9ayZJDPi`#2p-
zy*Q^?copC>G8h93y>wm^Ef)%Rz|`<O;ft$9aA2|dGS+@!#cS{}Rlg{<U+uTbt%T}0
z!O!IMGGko_ze;GbI1fAnogN(#=AYsuq)B(k%e9-3No7BJuZPPjADsAuCbv?u_UY|v
zXU0@2wwQL8CgaU%Q#H8F?heAk<KNa9uOKB;U(TVcTMCF@770%SEnXosa+f4W9i^<}
z#*HMd&c(%>T6y{-!eOm-t0~!ZYpYf)!=BmUF&)WA^DlRWT7!03-yZZAxJZ6_jafNv
zP3%n%9_DNAA4XsF$o8wbC?d}olq-ow&l<NBtUwFQmY1xj)w*8p#_3uf!1wKKC`Qz5
z?TxK{qF=`IpzGdArdnQPr&<=)!V5f^PLH25)iXavU4tfpm{;g9TxNZ0ff*TT7u+}A
zRP|oB(u>hmyuVt!ZW}Vf8{6=|3}-^*__W<?s_GhC3Ag0jR|%RDo%XD@&I$KEfHO3{
zqkcM&b!uX7?GgWI3u3Rk(Z+U8gIy9R2tRx-Gv-~d>V(W$V|G|{3m#Qod7W%%CvDU0
zzpxqNJe+EaBuIXA6C%stUqrGQ9b{)Ir^>*8*Y^p3m<V~Xh4U3@eN~591zQEHmL?0K
zzek<yVSM5q9^D^)*V<3N+IMBpeG?tRe~XMkACJG@dF#FQoH}aT{+#|f{O9WbOY-q8
zJMoPs>bc;J$c&QxM@)||l90wj)w78u_BwwAeqy1lyr@8bA@hJ^aIPKYH8ES?ZEVj<
z&I;sFUyw$-Se12k?^1DTafm-ZXY?SmA#AeFm2G=S`}B<`wYKRHZCZ2%ufbK?dHooz
zdRBD39nB`(oM;50lSZ4Y9mDX}8Ef-Lw9}HW#+6KqHb|o*U5g0lP9@wj?IFSI_bre|
z$5-B-lhSw7-euVr(I6MG&2gBEIxq9z`Cw9;x0wSe{Im|5vdB}xcs5VdyCQh0t%uyv
z^nb6%MTgdH7g|avb|5O@Q~h;s+uJX+dZHeD2FNy@R`QTne9toPAkD`43AZ|mM&4C@
zHomY*S0<&7!YA8EC-)&cxJ>00uxAs~K{VziqNpRR*olZa-g%Hq?P|zu<ex6+RR)qN
z4XX$rP?CtHG-=cV?U%ePzpz|_mg;Ixv6$;x0^qb511}ffRt{46=RB}QIp?r&$AW-r
zeW`@C@VDq``R{bl9R9TEHn6g{Xv<gQ@lc&}1$9NK$sHp;<WgC+WkWG{Q)=djBD)<h
zK?T^veKX)J2@3({1C4c)T*qeyT-NXWMu)Z*ZX+9L_2qsf_{o4H%EfpOPJz`GAIVC&
zH<J=FuLaTYbW`>-<I-EO#dxg4?$s97e9u6m%n#Hu$h+dCQRq?Q4T<h1Kh|<r(dc)Y
zZxsp+!LlmAWhkZXk~a&b+gLZA5=hE)!*P3wqPaFYr8F$1Yv>uuf-JF8#zG=Pq1q`d
z6VGJSqCW#sx{B+<hp$konpJP~pR;)vNqDbM3@OibRvc~?%6O%+m~T=tqM@9Fu)^<z
zIgIaP)a~6`E}!2+EA5Rcj)S}<M<T>Yu{d8%h=1I^McQej!eVhkIg(<|=oq;Qyc{Y}
zOU&emY-E|PvW)~zm-Dns^X7H6CgoB+n69uE7xOE$RuU(R@K)>KXuj0ojJD24u~<0Q
znTB;vQ@3Qd5zMVB9yFH_EG;3Dg3Phbn0S(x7XLz-mwo1v%J3U07}@F>2^|&orp<B6
zNW~W~AVVc7E7Fp~PCYX#dNwDC9{y3N%c*N?tzu@ctzldV7|yCq%^&^?CR0rT&&tfA
z7#QTZY+^(adg{=&Fcq8Jp=%}~<=JRrtfVCMx6mY@I48HFXp1SoXk%MeWsP}*shk<5
zf})z@ae~&Ws@jKh`UG;nz5{kL3$(B@>!?UvW(5Ad9P3OlOV+<Quc)*Z4qp76iaW9w
zOEGC*jKtI|zPOgLC5IHHtl}o2QJ`W@A(;2P5Q;m7)32<FVMTUU6Ys7k3XKmY^bD|}
zsH>x@6{zd}PlRbnwZ$V$73N;4rYzW|T;=(<*>&09KpfASI<Y8ux0ZHbimk$PeU`jl
zmJ<@i*55%ys{YzkSn6lzSmIP&R{`=4CyD&<!V5lCwpMT~tijx<iO_}VvqL&+%P&sw
zM*5YOvJSFUeUBr~tzk`!Y>X9X&twZ>IDdJuIZURoIJhVqVAnp^1lRg|{MWRt+!Lzg
z8_k8TqA52Mb<WH0EY=cuHN4kN*Oi*6EVuiefYg)ql}BEUyHGzY$=bm+SX-e+*hFM2
zx8#E#fdhA;&3HWD@gonUDSTDgEx$Mq9KUQivvtP_#yo{C20Mo(uST-zzh+2w9g@)`
zsW4c^-b(FTHJE-&*G}iaa*1}OVd(UG3uA5fu=#X{Z-ahWd$GM_-$(9v_saNw$9RK%
zIl!WshGISaQHQ6&|3F<++9@_;^VP|tA*`ZVl=lDr^N#ePvc_?v_sYj>WoXf-O=1Y`
zYtz)|r(>xB^VMz12jX3PDYCP?y5##^dpW$b98RsTA2C8I6@iNGi%&-jmYsi$I8V1c
zTg@w#4d)fai~WJ~il6%+@Tq<y<c8>4&5Qms=ozF-%?7Mn)&{U!v^%k-OW>RLCGpK_
zM`d+AWYyc~8`bQI<_Wy#+F4V4gJIQID{)^KT=-aZuLb*K8u!}&yT=aegG~Bi?}1Z8
z)PdGsZ}=?Cu2sxouno)~f3J_CH}Ud~<)yOfz31AI2kMJq+3O~{{R`&nXV^qYdzN!1
zz<1R-@!7VEC(KvFIn0ZUR<_^w1n6`-uS+KAH>h7R=EMt;v)y|npRae{3&h9pklEf4
z*_EsQ6^M(?+=k}{?<*0R@&pI0Y!;iRwcGj~(~znU;-p=VP1G-N;e#8Fu-;_OF}q58
z=DoI=*|2kcSHZVkW6flaq}(I4VVa#}E@hwZu6@><avVN@ZnUmA?YEIT$Y8c0U;Hbq
z(Ao(5NcsIwu9s)c>(W~8sB=KzcZGc!jAR@k#wv;{l!(IQcJ!Amd(9<ReN&!u0y!*d
z8X}p$7Bx|%O5jw-^Dy2<YB6Ew;)0JUp9}+46eIT|+a!aUJ2ye&d$vn*U4$&@2Y-ig
zou+M#LoVA)QvS{`TTP+BlW5Sie83~1(RpJtXCWP_fSr$4S3RC#$uAGy9sZG6bSiAC
zQd)jIYm?H-R$gX9o6w=po+>%#_-oYGHdJO&*Y-qq#<}TvKURrf7b;8Y4f*q9;@>p?
zP6mw5T<MsaB&1UCk2z!_@m%dQ@3H4X#WOPRBPg3wo8<Q<&cF8co{>YNHsfC4xPG{|
z+!eu2s*lz=hYe{<vzrUT>t^UqGhP|zaI=V<!q>BAZ*i+By>B(WmN?@E^D8*4(V(lD
zmcZ*1O|{NV$vi_@hSP%e{umm*TbQ*2ay5tz+4xC*ZX*T2QK=4uo~>fTnM|ib(`k+Z
zMPg)We|xLg4dy7SQy<2LCEXf>E$_(utG0yOK0>$7Z3u872gIXOKa*_|mJrGz_O{^K
zW-F>!4Cclp<CV$Eotx4o(neVVSsIr%C0MQ(<*nlWs=Vu8wl^Lhk)^rZCr|0Hd7pp4
zDRf<t#?zLtn0aQIADW(Q$1Ktp=C%+n)RgHd^jj;PTX(eEne2ZJC1Y(v%Mwt~v&SH(
zpq4;TLshU=!j5ep#x<-gpj9xMDO8CGwo9&kRIOGOuH-3EuB(n;4(wqi2AiKi%p*2a
zCyj@x5oru%?PZwU6LkAl@16TZc0@MclfAU3aMq@Qr<AdnLpPE&9N$Kr7*N%LT2Wc0
zTdfNvtrYMOW-aizRc@8o%to8mI&WryK%%@Gy2o~|XzOho^Btv}tX#40xGvc(rJhaL
zA90pvoM)Km*BPv7&1*hZEml2NNmW@@83^aanxJ5vN1$&WMA=2#%Q&;Ov9+vkTwV!Y
zG1W$`RIJEeWY)UtXGeHaJVml=Xu56Ow3LoK`R;O^<lX3vLE;<Z2VLuC1M37_@4(mw
z!QW#Lz)h#VEfbQNm33y50wDLd6C9J#Wxf0S?XljlKtaC}f%I9h4xXqY$Lx+v4Q|_X
zHCOoFLUFlb5(fa=3zb@B0!%So3&gBk`>n%GT(IT!+q-aEM$I(|oEJX@NYeiZXvX3<
zG=KPS&zN-ZmRyqQxj1l|_8g<+pH!0nli8FCAaPD!e$DLn8Js9?&;o7@CroL7%o=E^
zR!kgHFrgg=(j4@)flx>UCEn*q?4!To_l>vEW1@*#nn=AJA9w?VRR=&ZRUw&K%-sQQ
z+43f{R28#nD!@&Zn=pJ%Ws!$w$%l6e2tFe1?!J#%&*Cl|FWH9gD+0Q-za*Bks8xsq
z-d;v9`|&OBZsmsn<uBj0{t})Db<9`KkPqHD*>~6F%t)ia?Ss<Y<ky}X{w~fB(!rXE
zn{-QJdMph8;@jU)JAiY(FuEA(oU1oC(3T(#p*PSYw0sS3xr@{Bv-loRnRm$;v*;wb
zmsky2z?ZKx8Ss;@H6GX%xBbF=mJfo{SXXd12cbLe0WP=NIiA1Y{holicReC1BH$s9
zk%PC6XVx85ca6}96uxQP9aq(x8q_z@%!KSpJW=jkdWQCIdhAyI;yB}pjEn3B?%(v;
zZDET7bN>dT2f;w&{epLEce@Vjht>)Aw?14@JD6WJr9%*rWc`JPZRW$GpqLTsU^ddq
z{9uQnj2Dz>W{L7ss{Fg0ee5l67#qBxLo=K^ZVGPTtQ5fNCCch2;LB*IUVjtx0Fz!W
zmmB_)_886=-3DyGx7=M!*KJ|y$lz_g%EV6>4-v_3v%zJ9G?=gT{k5_^%KK{EEt}vW
zXH(1I&&!*)annV(=;nMSF7B-;9>9w?o9XgDrsn+L?Kzd8u`6JaG&Q-PbHRF_tfKfq
zJlMIr^voj@YhbW6HI<-P>be@xdCjn?(SL=myhn;`#tR;74C50xp-S^(IH4#o2lGum
zQ7WVJ9vwnfm}1zWLi0oBm|{4~JX{du!3mxIv+=|YIN%q>`)7kQK#7q0Y(LfBr9k^D
zUT-Wn*?*;ZAkfLMK>9gI!xVc4?`mMIkxMhLie{zpGH94g4zHWp!%s(==q16HtQ+Y=
zDaRf)9dE18M3<x;>!Y#cUJj}nSj&g?aYw@xWbiJuBo?cPd=xpx(*M9AhP&5fZ*{R{
zSRDeEtOdOU1g+<%clu%{e&&~o*~v~a6%Fpp93TzN%v|nUHYArMjYUqbKYG3vsw}wn
z+8h8#Pjirr?(!?mzOW3<NhxTjV_x$A1!X{*zcoA`_Duo33R<N`ANYAdG1^=Yl!31V
zs(@<1kGfjW1)$BK3xP#IEA;DpI(#uhjdU+Bg+KkcUjW~5)!%$r6=Qj*)ASIYPlvzf
z0Me(|BcJ@rD<{Zdc_HG<PI%#j^5xaw;Xk}-qP=bQy;uGQ67J@zo>RzaNt`hC1Jb7X
z=2Q6U1Y(y$E)}~-1Z2O{L1)4zW?!A~N2iJxJdAyP+_@XEvfPN3r4uVlC%n6+%d@#?
z)$QG=v8rt7w6dX1Y4xcXmSEglFzzk91$C`>TgG+SqcRe$BATXwpRO`tjrDHbtMW;X
zt^{7hh;Y(~?11hKZ-H%FtjNn8>)^R)X=?3+#x|8#rY&uV{1(`<22tO_D}YMWRVmv_
zcnzqG{CpMtHSok1-UKuQ3xP#IYj_Q1z&oelol{-jF|BBW6(=k&wOJnOP+2}h%?Z=b
za{-xu*P)H+nHFAx=cUT3HeL?C0<;qKRiM>?%*h(iT3|l-2GB+z2s8nU0GXc-%c~u#
z=V{P5y(_|dp|OMP%C5-FGws@IWn?F^c#4|)GM{~s%$6&NoH;nPVMU3`qYlf<dl9=%
zD+^@&##nh(Vyy#h$UYSp<bEXDGa+SWT?e9?B5hOiT+VC5isTk6SKFfI$Qnzx%&Imk
zdM9wQDphNQxn2b@k~l>BIo0NR)RX|FD!*lWHBaSudA-W_4z;3$fEHyzC&fb=GYpXH
zPqzlSqCKbVlpdSk)u*W`0PP`bZHkGkP0-Y7+0kif=!CSip;NWwkm*!&)M@qD8D(!r
z)XLW>?ZjB;M$PRIBIN3lyS~fHZVS~4ZmpW=3F{N$3mqfWT2U%}AGK!GS-Pb?T-8c%
zYN`jRS*|#aC>zc3L_FVMd0)18&U(7{n@`oKZ7S;is1+(EGN+aAWnL)1V~sUEW{n0d
z<I!>qT8^P@koQ?@VYftzk}#um9COB$ls;&&GR)rPv1scfwmG|^AMs~*v?IRk#0b9e
z6kfXCx?7;USBDu8P_t6SMtVCEf9vhq&}C1=qY=v^Pi0TMYrOWZH(vM3Q-YPJM7{By
z>ZI_mP6?dGs(*V@-$;<Zw?XpL;UOp*jYP507!-k4W3eZ`2*pklXd=!xIVdr7C5nr#
zK{3cnKAidHp~TSxI77wLew0KujE$fkY$O{=$vDrCp%nHZ`;dCGPuQo_hkedIr+(}U
z_67Cl^2g%RxZr{Ya66BobZ&4rW$-v2M}v6^PoW{aH}6eDd0(DN!+07WfXp4p2hu2>
z!82$yAHs)F79Ylk(HNe|Gbx*AbJ)%&@=NI=p2zd)5<Z1bp~-wIpGBAP*?1RE<%PV6
zrg0zge<rWsRdg-)BPu>#%NuDfZ{khZeJ<pSsGP6hE2xUE<SVI~ujZ@CPfV}Sx2f-S
zyhhnXjduf^i3N{<ZUuILe*<(Euorm5!U5nZ3(pe;{GcxZuK~v(^A6~{z$xGi@Fm)C
zzyZVpNkAXK17rZhA@e?H7BF5tW5LtvISZZzodo0oQ-N7PA+aWeJunX_BWm6PYzNwj
zHSYo42kd91jhllLgSMud8XpN3R22s+8s7~zDtsAS7;pqPf!`jO*JLPK)?@_U3*G>%
zY}^{mZQKFu3g%Xi3g!l$Zkp4m2lE^E2J^exyc8@#T`uZ!P5sqt18u?Lz_ws<<AGp-
zdj1sXYoOiz-VntXH=Y6o0vvL&XnO|uGPnu%n*yHT7G;;DF9o-wkA1j*wu_z#zNqkJ
z@QA{@!8d?+0vW+0f#JaT;2Xd@kbfgEt4UYL3!aVek+D}5H}#hYEN&XAu%c-k{8kWH
z9o!(t5ZDPjH^aW&O%&*8%8|Z^_#p6JlMg77W4C;a_Rufk37(DkF9P^$n)G4BZ%Wsp
zZhei<WBkXW{Psw&NX6{{Y<Q`9eOLWApGWx-K53Lde2b%ey(~KR-_d4aAPM7+4c-o{
z1HVqj1N;V6A3+~8J~B?xP%(3uaf40m7TsgTZx3wU|IPRXvJfX3uStl}Br8@DJDZj(
zc;NF{pc1HWTGgaWSZ&rSMAyssP6dV|Up8BLjTl3>ncsybMg0-fCr0vKQNN;Rao-E{
zcF@E~eV`usu@t!>5zU9XdF0SQ<j*LD)$r#_!P>x`@ZC=Vbzi+cun{@c9jldUzRZ&`
zXqp^>%83YMZoFdguLEyK=U!wUV&2JT?=@B7ej#Wb5Da`2d_VA6)1tsRi<)t0hyR~c
z<LaI#?*pf0-ou|$;I|p_xx(GSY&9ok9&jthPar0bVIEkt@sXyRRQ{`+RP!K`vqk7n
z(sXH~Sq~an8=`a8%0Zcrh*6`=Ih9k^+z)*wbpWR@$4>>{37x}ye+f8a-Z$GYj#=IJ
zW_~oMD@5f=WLzp29A@0ry+htZf8?GwB}yQs<C+I!>}pMDx+$=_Ia7|MIUDw2o;FVe
zeNB!_t{=g41s=72!FIe|T<VikQlFVR)TgCR^_ga@`ZP0MeU6!=KEdp(KD!*CKDE3+
zePWoQKJUwfmOt_jINm2(nwCaf8=ws!O}kJVLN;xfHjM1r2yFzkkJLtzQyZ;~rWkFE
zHilf<SZy2`+RfUn6svt-`vLXP)@XN7FYSlg52>&ABkjkOs{KU!38iU2(|$%C?Jn&u
z8le4L`vs+I_h`SQLE5jhU(;aiUhQ5Qru|0y4Gq_RtNoTnX!mLNQKojkc0Y~Oey9D8
zMrjXd5721s_uB6%OM6gzkj7|#(EdQ#+C$nyG*)|9dzi*)k7|$7McU)q<1}77s69s$
zv`*~>nymd*dzmiR4r_;LiuS5@l&;j?)ZU~S+FRONG*dgS9j95^3GF0Zt$m<<Ood{w
z7*2CVj>w}DkuUP8MobY?sa8xA(`bR1A!bm$m?dUWgD4OM)F|eNITR3uqL6~3NEA_%
z@ChG<#5^&NnnkfFriG$Jlu(N(6J@kWREP>{6;;Adi$$%dqZ>uNXr!egC_;3LSSS|J
z3b9z+K(~pT#7(qXEEP*>jaVj@(d}ZnSWb6{6=DUg6)VL``k`1QR?(eewOCC*61R)n
zX`NUr*3yr~o#IYfFV=~5^b@gOtf!v}6#ALiAU4njaksde?h+ftM!H*U5}WAfVzbyx
z8%3LFqhE+EVhe2&Tg6toM{E<@XtUTZw$m@g4zYvU#7?o3ekIyPJ8coW#4h@^*e(7*
zTg5}-VcH=c6@R4r#pB{nv`ZWmf2QAyzldjOuXs*8M-Pck(MkKn3*rTOSo~G|l^zi<
zi<jw9aabIt$Hc4RRr;g&oA?{;7e~cWdR)9I-lPNKE%6rpNgNl)=?QT{oS=i^@8a+D
zq&O)~(x1f#;sbh0d?-Gozle{;$Mm%LM0`Teh)>0*^sM+?d`{1aFT@x0y!c9lsZ(dV
zMlb3@x6{kIQ+Lr3-L1RnZ+e^_N3ZJ%dIBBQd+0sr4ZWw{lit)*^b|U#_tyK;TY5h|
zo!-$0>I3P0Jwwl+5A?zMU^=A_)rZoD`fz<XeWYjVne?$fN*_h1^(=iHeWH)o$J1GT
zqJA-bp-<8;qp$RN`aGuT#d<Nb=_Psz6Y}?Bn2udzHM8qAdJS{w^YuDr=s~@S#k!m>
zCyRI8@7lu>3~YZ{Z^LHTSRX?-bk^5!7!H<d#27KGpMgKDzY%N1vNR*!h-V%n(MV(i
zj3gt8r5njcGP}U&W%ObLjXp*nHpoadQdx%4-|(;tjdWum8){@28Ek|x*ci+*jiJU+
zHqsbw3}>T^Oe2$xHbxmaEX$Z=<g!bQ$;K=;*(fjy*ww}yV-71Y3XMWG+bA-M*c`)W
z_}DeZJYya!G>VO4cCArjl&~VB%qU}Cqr#|QKBLO;v$;mCQOk;rI-`zVZ`2$0tb`aJ
z&u{A9KUZQuT3JS{8vE!<?1?Mu!EXi)0gHj97FGbOE!;^|H5l}#z($}AGT3`pZUc4#
zyMYeiG2jW{Y2Xm>3h+AcHt-%~c7T2ae5RhU>RswNtH!>)@*MiG0Wm;4kc@il<Ev7E
zbfVfSpbiKUt6c<o6R@26(s;_JSvVQVZ|^LkWwe^s(<Yo%chNq2f}W=%^fsNMv&_a~
zSud8(vRS_TKCy52+;m^Z+`+!b=3?dct@6F*yW4jR+*04`zB_$ygInl((znd_EVw$~
zQ@%yM=fRcvcKiIkr@<BZ_V|i@PlB7_+w7a;+X^nv*XEn*!)e7g(RaHq-?t9jDBoIN
zj&D7<4BtlIaNnKaQhhi0(tWpsOY~Lwdim<WIiNSzR}U`PSKzbxiohAZIo`8AA2{xt
z<UQrfBkujom+O7omk;iJ-%#%n-zad$e8auZ`?A12hNn;XQo-%<<#_k``jgH3nr|vv
z%tDJo?@Q2O!*wcZXQ8&x`>t;uTCeiH=3VQ>`29P*uX|VccmKD&6NUP|5vgw)ncwMg
zs_)aJsBgpcCWfdi!^zk{yoM8%A&<(CM`g&PGUQPi@~8}XRQ9?B<Wkvt7LZHjD?wpp
z8S<wb`BRSkDM$X4Cj-cxa^y}qa<@DaK>n9c1ag6VU>Z;W6h+k+0~J6m&<HF9ZUB}M
zRpU%qjq_wR&Xd(RM^+!B;oiaCOduPWSRL!l_fGRBdsDsX-Y2~k-du03x4>KEZS*em
z7JF~Nb(wc1%I)5D-VNSOsM+G(j?(Vk1Ad=(fA!SrS=EJ-liwnP|1m;-ntwu^e~MVx
zR4jBA3%iPifmlpHL@w@%NCF~qHTBYGBQB{bF8y?$?xQrsWjlFXI}n{gt}k3)(S@!s
zVl?!BuqQ^j6jHIyB*#CG6V?kyQK)Ef(bA$7MXQVMEc$7c{C0|if6PAybB3QGCMJrB
z#PtHbfHdTumu&i6WT0^U*7aM`UFTfq$o?P6u*CO!$nX0bY#(9G@xxYsERY2BA-)v3
z=Jx;@0CLu!W!;YlCINZY{ZtFHtTu(fye=?*i1^Jm;#e2?%`u9SzYKgl_ybYci@IbW
z)r71=it?M&O?idE7Ba2&*#Jg*b1snI)xO&%to{Ca0PDVgvC<)JRx(z<<i~r!ze2U+
zH&3(bR{saU`@{d!DBvC9Z&Uu@WlM=)zoHA|-)7d8$-Z|g+mwCke)o66eBL1`f3<K`
z%OguRg{CwwUb3Zdee0ej+e0&&W0$mt=CtlxvL`g7bw8f^P_r*og6sZJRqK;Wo($E6
z7A<*J)$9ocTc2I>VrWt8i%X7#N?MPgjcor0WcMw3Cv+3|_m#HOp*iT~ozN8UXVLNv
zw4BrW&QdDex@hB4J+!<zcB!Fi5<{z6-(RvNw6^v1(q5tUt^bR>uK{XfyVAUBW03d@
z76kJ!CPKzM!gF|p0u9CrrclR$)=wBONP;*#ObAPO4k2FsONf>bUJv7V9hMNs%eq+B
zLl_rBTrA@vtiuq;!+I&oLwJstWjtQSVOWY`TrQTSTny#xxwpGTOAkN4WPWO^rfz-r
zp1ytWx#ygF?z!EKnx2Qe-`WiMpw-2u8)uw5Y^G({_JYl^^t%w7XBmYy>6USO6RWaJ
zLaw$<lbRXF23uyCb49Y{mIYTV)GR`tVw`cwp;x~vg*8~7xzbscamAI%R#{eDIczP|
z%c14EE03+WyoA0@K+Az%cd6J5#zmK!y=d8Xm9Yr&a*!YF#yTv!E<?wJm2y>eJg`b#
zwH?oat_L0aT%%YEQ9kM#Cp6^NN>YQh>xcSD(CcHl)##dLYpv&8GoWviYmOa+e4%63
zxb0eG)2$7zXTawTxn<h5!VZ%hw}jj>Nb*tWn`w=AHL+ext?L3v+w59rN3E?;KW=e&
zqR&rRlUx^BWKDHBAeXwl(8lH3X6LM|YnNTHTA<~mG2o^;);_Fv1)z<BSb*!5<ub4V
zzUG9ho9(vdLw?O#<hllViK~z8v(kj8IA>h_U_Hh)NODYHa%_UXjoPQ#Ma!gRn0;ow
z;+C?p)*g2}TW`G%ql|+_lk5udw=tsEG|a^?E9WG%7+f#G>u~*y^`<+i<Ee4moyx9T
z^W6%tVYhRNeQ8_-`)^xsxwF_^<AVDHSnoEbt#>eOTyReLD9f%pAN&)~-J<obyT~WC
z-gBvZ@z(nw`Iz+~`1uf7I}_GN?h;>;^`4uCvya_O#{|sOijT4`xhtXXvfId(Sy$cX
zK%;SEH>SZ?7cDc^9-tYp=L2Xz$)<eRhiU7Ey8-0cbT`AA8g;kAobI|Qkj?0}_)@J&
zgns3k^C`gc249wO*X?3gtgoFDz7w_xH|xu{MZ+u%0e#6=WJ|z2ZOQH{Y$iurrrkZ@
z3Ak^GEzNx$`f{`q)5bpc&5ki!w)@ukd0VdgcE=j_2Va)00Bd9`1`Z=2RRhqa?mI#H
zjQcJ`25hTL&<JSoJ)mtmOxw;9+6)>!wbcP1+SUks$h`PyORf8!kFmA5uRwhpk=EwE
z&!*cv?t38ZsQV$%erFonZM*D#)G=!<azE~vwOw}25qgQxusUp4tv%<bASMKSm9}2@
zvd?I{;a>Hfvkd_M2Fs{%+}CUyf=H5%BbYCqv~0zr_$-zM_lA##b-e~rYr&Q0qpebq
z<hiYY%mvNUn{0}0#2W8wu#MRr=asMuz}t<xu2G-MauNFj*!&e+Yn$Nhya|#|aWr0o
zoc+l<O}9P3;~J-3a?pC#{hD2{&3ht1_b1S10OoY_{AH`rk;InSX4@~Yk+!Gp7uhDD
z9fbB0dawS8#{)BEapj%AY^n7mfaffFlEHK8JtLseqWd<RX<x8U`-kj{&Mp7QhxK?D
zus>@L_{VH7JVX8o9Ao`cR)uG%W5T}D-VO6eXyZ1Hz4rC?YyJoJm+gIQnSI+H@Xvzv
zy#9F{`Tb8Iy2kpST4|u4L*IUO+P>?2?q7p?mH&lp0wVtw)DMCWjJWUlw;WXaFxzi?
z0@faINTHsmL4Fn6|6#qC0<T_hjr&&Z>s~30+wYAB&!@df5HT8X9J3{OQ$eq3OdI<=
zX|Rf6EF3>QO7O;~;5k#a=dd@8S$jO$9Zzh}J-Oi3Yn}qI{0mPp8?bG8N;^huJDxLO
zC4;Bfmj!myflrQl&O%fvhnAUm*YL5%ZjYJN)cH!FWuxzs9eG;7lb?Iqd{^v|9-FTR
zM=IZSd#t0OW7aPBc-UB5oyU)NC|fp2-eZsS-L$88E<?+D_dRykp6)`vTQK5Pm??*+
zx8u1z({lrMx<1bUtf_L3AG~73GvvE%3E*87_Ml9*-kt-oE7qQ8T?J2$u=M-xfG0nN
zC^T%T_ua*`?;eb<18ZD!uKDiURh}{5Lu-R)#P<lSG2wd*d{n+Ah@2C?Wr!FM2`$rD
zPP^Bu2&96(D4=j$@n!|G96jC>ffJ7F-uytm<EFPLP~^DfEeVu3Zd)AwV#ggX9iSa|
zy-a{{-1AljDvdLq7i^~EzSkHqIv%>#1Lt5}--Ig!*snnsv-cd(gLu6=;$iM49FM#W
zowb%)`>2n>{^NTE^V{rW@caUO-`g5!uuOX`5Fci|u0XTnv6ls@mO!dj$1+G|ajbfi
zgcxER^<H9k@s1F1LElt101*{-TNvvK_*D<k)8JP?W4~gjp-m4P;QYt-gwSMvw6DW%
zHE6r+jc2<B+BNNeV4J|cXRGsG$I$@OmQkQHfzQp3EysrUR)DoCFl`z3-eOfYo%goj
zpU^i*TP}Kg0+(zNww=Hg%b<PQ*X-DAPY?7svfOt9*B!6CcfbcmA;Kg%Uc;5oxP2J*
z!Av}Nftz^l0=Mw$;I#0q+Fm#(I<MiCORUGOj%RG9Gs1fpd}bFk?YF19Z~Im){hkTm
zhFuMFv0*8PE2vC+8F<1b#2=N9vcuJ&?-i^a5B7P_6v-bz8-r)o_u5|One|8D_K;UW
z9&NqknLnQZ?TcY{7CldJ4y!N4Uh6{s1mjE3(~ceNOHfb7!nr@$GHzLi=uvO?2J<FM
zxj&7|EtA$BzY;wEx$l<!f@jU2P4Zll7m)SROyqCoJPk%fFyckeR>xDj!?T0qGxkio
zw;lOQ?E#Aca<4t}e46E1d!+vi$(1-~RrYR^)Mo_GRAEiQJ}%38d#qmvl2rN6+OJva
z8`SnbSkbVi+U4hW?EUs@ezRr8e(n4YSn>&*Zj!dAfIYz)K9{|!Jsmi_Y|mty?B%f1
zUA7^R(}m}Y-Nw1U&eHF`?Qg_!*xzWs25pSCIy|!=XDvvw0He2XS7pF0hnJ_#KG>cI
zn!?qo-)0{+_QB{-fBv%F(5~`(jN2ZMkG2Pl!#<<60d~nu`)Iq`@3+Rcm-#Q-$J@*O
zSM8DQdA=U|WV->ZH{D+4?}fb9f5TGSUJp^Cuf3^b3nHYSU9r!2{Qd#5N9-8qoLim9
z5f4}II}j!2VO=L#UH&0Qs&l?G5_URQXDq$~@05Rt?+6BLb#Qh7U(a`@V7fD%(3#c-
zOyeE1Gsn0MZCY@>FUc|ubR&*zoq3i;=T0ZWG}IH?dLQ3|I28D5-%5Ktohn#6$lqv7
zfU&l$OIRMfMmp8@VUVHBk%j9WCxZ3mq~1X2Dop!ZY*(GL=au+=!GFf8fc9&Sd}vc^
zdjWga7S;v!DWE&+2@Me#bFhp%HvIFBBA}ZbCC2W~3#7gY*LPkdw1c!c<Dfm4JG~C3
zeUu$_(Cy=do&=g{pY9AeD%)ou&w@RAi|nV<u*-NlyDbZ_=RPI0n$S~(zDDR4Yf}3h
z?CGlz8*hne+Y_*1mcxiW*){+&_c_kBM|SoTx}4C1(0&18Y=gDAbJ*D3zR;QJXa;%|
z*LRLPTJ4dY!ylt3N&9I>eETBId%kg@a|Zl>v2)JSWSr|<5YdZ-euink5@as!Tmdi5
z^CuI!3ez|y3v}l?zCM9#NM(DhZwdA_4_pIS+G9Ik+Gg>cAY8*dW1EcQj+>p^u#TQ}
z?!xuJQ{3Ll!W@Ot<T}C1vi;yEQ}~+4%90&zDnL0XD;tnvx^s@u@y2a?BwW?xx32_}
z9Ip0tw#mV^zl5mxtbN<J>bTUt%c{C;U7jw#)6<c_mN{)530<YIvi#tU5Rt%V@Ll9f
z9I@G2`y8w$hRdDN-g|-D&IIrMz#V6@>tf)pGtK)jaL=jqJ__9D?vb6@E<@lUzS9gm
z!grB@$Ie{uW4J2d?r_PyEq_-lx?|<<Rmokdv%tF)daugejXI0H%Yh|lsdqK7>^$S$
z2&_7F-p#;<^Q`w(VAE;#zJgwakF(DEnoV&wc0>eT;c)}6$+&rV+^z`zZW`Y^=iz(j
zu4pVtR|1hFZ%>l0WKIh%LgG8+t~7j)+?B@Zh40*fGr7OUaS8VYKcB%nNS6}tAYIwG
zrYje(!LDRyi?z9{falgv<ZSDR?kaZrJCgk#?&$>g#K3vkKHYW3QU%vJ6V9t0X<a)0
zSp>Q3clLHDyUsdqbYyp#`DYHqZv0aPa{ule>p0WZ!aud(o)2&|_q+mnb(T8^I&%F}
z&Y_Ni&Ov8OM{!r3bEKoRs}cW43-Z<>61{auk+%*bq-9bwI!N9$Oe1d^W{@`xGo?1E
z2fZ)tkp3Z3k+%pn<SoL=n9s-Lp{khmm<@D}y!rS6dC&11Fh)MgL|;M((dg?a6*V9q
zDj?sltU}k&_fa((K>rDu&=8~=G=lD+TJ%#ii9SFx=vU}GT1J0^zKi}h+CX-+jedjL
zC6puw{aZ=5<O;eY>682l{TIn!NmkJRiD-%VHv0RBTM<7-zlpdP@pH+6h`ET5B(jLL
zh`*H_ir7AQK=RpxnFo(cjzxAy-jf`UoR0jt#2opH$X`fmBOgZoQu4LP{}B}_`DT<X
z>agSoQAeZFCAXq7qB<l)vJTlL>Y(foWIa@z?1!=&)M43wl#NhFWIvJ3Qt!z=k}XkR
zk^PlyoBFEk@1v8bZ{fc*QJpdIF`uCVG4hzh)TNmJ6|+WNiEWF$N%h74-#Ci;^SC2%
zN2rhDj>a9M9>?Xx<xqba_l3AGP*0Hr?9xJ>l&0hNwKRQze!w7L7^Q0ZHG`UA&8TKv
zGpU&dYDP1sS<ozMo@rJz>zbEPZfkaFik8yxbP}CPE9flz?jM<3B-KbYh>|i=hP)#w
zr%q5O5Tc5xA|#=TsbWM?r>WEE098h5Pz3osS0tsU^eBp|rmB&YGEuciMtwkifa0hI
z>KiDY{GKa;`Zo1#bcnK2R^Vx;Tqu$Jp6f8!I}@cu{xb5H_<28CM9+w3a{NZ5rV%x0
z8Z|ANHjPc=(fBo&HCHvgnj4w{&5&k9Gp3o)OlclyW;OGgCz_|4=bAMrUud>8J2avr
z=~!A$r_kwiCY?j)(JER^m(k@=C#MZ`6<tf$(@pdR`XcS1;bLC1vzJI4oL<t~B*}wm
zmrvdFHM;N16x|O%^dLP9=SG1Vrzh!YdYzu3=iq38UZkJVEBO7os6PU0CWrVr{(lMu
zs13Zf7&*b`%ZQ(A!N2byJ$U#}kpcYsSLmzc3$m5OM!z3*Eb2Hq8})9~yQn(qy{Pw)
zG3pCZxyTfi7nO%<qEt~TR2x+sRgAtCrH)dgb5UQ3Do0<B`dZXE^g&czR2{4r3F?QH
zgsrT^pN0jWL7Gv(IA9VmjntZD&8lWYv#EKdc}+)XUeVEX0-a2!(MruGolWP`1#~f8
zN}r*1ke>yZ={mZRZlT*C*=P^#r!Ui2>0Zch&;#@k)Qv#h7(GFIG)tOC^pxfiq=!I1
zhWwG{j^?iBp5{J&7g73Msg=BK8XaOSC!_-O7my0kuOX>m)&C9pJ+ua?D5^555}k~y
zj;cn*QH@cJ=oFH~ZpD%Rn1gcgw{NTB0ZB+=d<kVLKmqhNVD~_NmDIRG5N}*3z;u%F
zWmT$in;a{QyH!~xiqxMlNvrZr@uVJ~$8|zGp`Sn(nUaFMaXs#ffq4lu<}J`Arc?ra
z9+!9=;hX^4q#);oewfDN;Zgtt<1xHGc<f3jjUXc~!};welUOz}-qbf7Sl4r=EMk{t
z(}|$&f?QY@d>-3Juq(3<8emK@a6bW97XgB7*k-sN=Gh8k3bql(7SAE(!RuHWZORX^
zV<lhW^Fn)}#OExgqM&UB-new}GGLlDm5}n1i4NLbD21`{F`iGM6lBEq!a89cu9%pQ
z^%ry#${tf?)pe7x>Za*jh@4_w!ewQ`>>!k>BH6G$g8jmMp+Q88eOAirAlMJr2|R9@
z8bZbp%J?_vFX$T|roV8E{Rh)RKN!mx$<>9I!8W%|%~f|ytyQl~cdA~S?!M73`<1Rw
zdH!wfew%R`c>4-=P7Rx{Pbyt|$HrJr?>Ch}yM)==DCg|-wlX;H%s!=953%p#@~+7e
zj2rh%uB!Vcme4{e#AIQ8J~Uklo_l1vLdL~@FU0G|rk<)L)AgXuq<g+W%4O5dppLlx
zs_9nM2DI5U-43pEzE5?8>0WiT=|15j_#~9o38sgbM|HC4QP5w7l8<%O*{0>{G}B{Z
z!yNO3G0!|@RGA+b)#h0o*NtW7c~VzyequD3pOSV}5EF5{44-4Lp)m#chu6dUuO2X;
zC4OJKH%4&I>&?%x4nbSL&G#C@)^@m0g~zM#xWwBC%VcaauL&{4c)|R_c+tFNbeMO7
zwi3n+kH>=F?6*IK)mJM`OT^9?xu(_X0@H?gpQ<i4ZC00>UR9qly$<0e?6ZJsojIcV
ztT~#{W^)4Ba|H-|>deWcY&54;x0sdHZRTvE{~J(kGv{Imepl@=7m(6#E*9^l)tAks
z)mP1D#CuP5uUSXR8^8nXNak0(w`XyFP(5TeV||H^Bj&p5F>@p7H(_oO+pBuY+*bX-
zY^$C%d#dNn{^}>@%hgZKSF4|!d#l&XH>zKl2dcNsL)AOxk&nf~{q{a#PZ7uAH<jW3
z70yqn$NN}=$lvyh`>uWX3o%i+J`mQ4u;!&AU&8IMFQ)VRbID$5%;e@<D1&&Lo^Ob?
zA-*gwg>@4jwvNL0KT#|t`iRQlz8@BoLdrMETN!50u<?R>W%#}tUbc$jowyX{M2OcX
zID3D`@*S73-i+QFgx7&FP!nnFu8B1w^BBRk8aYPrI$Yo<@W%4>)uaU1Fdj#IeJrd$
z;o6w*PvST16UP3U^x(S3^|0>n`V;dr4%TGud+p8hH4fM0gsj7`7$w9a<7iD@$Tbl@
zhhyS+jjBpoqppgtDKk#iloR?yjiD;5rmCu>rq(!JQ%{cJd=*{OM9#xG<6Mn{)I%Fm
z@;nHwsJTGe%+y@Ock$BiN&g)}F<&JAx$5s?{tn^)+M4yx`+Vi#0rVU49OVP@9OZoE
zbmT82z2phXb@BvdkUT+IAWu;Knmj@IwX7pLi7F${MHb0(k^f1ai~J>dF7nspxyaw*
z=OPDE@pF*_+4#B0fzRXTA_wyDbCCmIgge?>==R>z0~LkJRV6A~#i%M(M%6jh2~~rt
zS=FktK<)yXQn9K_sw=7<)peCrm9M%9XKtx(tKvzz1GuYFKz<L>ebqzNBh_P7e1TuJ
zq*_+3sy0+fkTzAXRIh(8LX}FALX`!h;FM1|;-gf8B50ehQs5^6(i;5x$*_A{7tBaI
z>VUi2QAj7reQq)OC0c+x*~gH+B>C@>SyU1+AMp_?#m}ptrv#Z%ok(8=Qt}ey!+4zL
z`MmCe<}Q)WS3qN|?;X(dHl(}IszG&+$cUf(9VTxiBJiFpsLO^#!M!y8|GXz4MW9^t
zK01iLi1Ok7x)2p18OTJVICKV5JSvCu8N@(JKsrc=kO6%aB>jDK7A1ktnNc$NkDIB;
z2I&a$LP|p{q@!pKQab4V5&E1YUJ{Qo$ge}ad%9LUI*?!SNd2<nam7-_a>><-)ryV6
z+0)Y%8x@-suPR<wJZ2&)HkoK9q0r1Es8^U|Cat7}Q9@mIiB6TNURUojxlBRf08@-}
zRip})=$KOGOyNLDi+V<d7+u9`iRb-B=4{2IibssOVwrGF0RF+mv;iNIPFJaS3uhS{
z<{~5&;!@!nQwKdgB|2>i<^}X+=IZ;6B`rWN6D%{0Obhgn0DT_gQAp~o*aZ0!u-qWU
z(&@gE7U*@A@mH)exzPFsGf?rkM2Ath27ES|A!ejvNtIc##EdZ$70c>%%pr)^DntP7
znP}h~%}f#Q5183PvwDU}0Ie}#G#>NJ6U<i_g>V!AGGH)Iq1*tj0H&9*;e;*0JZIKU
zPcy}*Kt`qj+Pz@5K*Aj@61XwZ+DJTp$k+gFtX9q>gY`jMkP;{XxQ6tERz$Af2Fu$1
z5cxx(?Jt}y9MBGGhqa^H@u0pU>{pNLw3B<1hiC!y+G#w?9Iy;{jG+9F)$2^QHoe3H
zc_w(Jhj3l0c&yDST+`-hRV8g&b;YW-Ok1us6wX#`5I;rQs*2azT4+_TZ7S(y{Mrj-
z4tKN{wGM1atydcW-zfmkg*nuAYp;QKn6-UOslH8b(|h!O{bl`CeXst8exR^aU&?qY
zR`o+<Hem$)h<*%1+pQmBin$*6oPOf{I^sJ)YjT!jdesYf#`orwi2&c~g>|<E*uuW0
zpVB`dyl3_EC2i`Lg>A%FZ!m4R2eyrRU6ol_3dyXyt-B-8B$ev!g74%)c@J<O^hbrg
zIGr9nJ*s=Cd!&1;ThcA7GIgt9pHkh1Zd3Q_^vvlQ-D}3rJTDyhQmc+C(di>jW$UB$
z3A$!|vObNB(5A}ND`9m`>9fJtVGMolm#C6b?M1SlU}SxPzF2z=Y*+iGR`r~|RDVXV
zV|MgsD_-l(c$V~aOaazJ{SszyMmwin&@M7&&<a+d_8DMByQl@tm>Z}3L3`cMzSM4O
zcd_;*SJk_M-77Y96jw3@I;k#Rm!wO@t4OFjH2}PKbP5bzR-RXPLYH6hN);IdS=-ux
zt_a2~+lvdqd5M77hC$Z7t^`2qm^?4`9bKi)NPLK=i5D$G06V8^&|cIvYs+-4I*ZPw
zV|AAV?}mt~gSJ<=m8t82=<311v&qE{9i_XjyQ#aSo~c*`@82x!{a_wPTm2K5hbQ`{
z`bK?={yB37$3u{D6XyPewyfj^lMT^%7j!~QY2g4Djq&{G*O(i6C7yYv6h_b1ztC^#
zcbHrQGDI3;4RWUR{W@*8A;pkx$Yjih97CQ#Wl$T+4CRHT27{r>P;1Dk*evOV*-F4M
z7h*O<CquoV3EKqAZE$dr5_>Sr>Wtxn;Ud`!zxgld3aS~kk!R-k|IVXa8<GH$axX%1
zK=MMWhZKO+ojaF%4N@PZ97z3;x*-ih8is`HMj?$unuJsZX&TZ@?hO9#meLQT_2j>J
zei?lQ>{<Z}`vB6z)j=fsS75yuu=CeZB9g?e#cmRb$ltn7jw6X$4do=1We4(4j-K2l
z80UfMNx(FJJOh~H>lXlveEl?EHyzebpr7%jP`|)qPDIa$=oKDwe7Pt(Ct#gFKO^Eb
z&X)qj`Us^UV>rM4WD?6J#+&+v1MB*dA9GuzyCByzfBspRyx|BRe_Dib5d_&*_<nQz
zn1XE<MD@ZPPK$Kh4YQ+AE{NI-C7dg!M79;56J&t0Sjw02A{oZR#vbSAQz*r<&v0dN
z(mwqsPlo7NoC;6?vO?ty(Is5ga65#T;bVr+g&1O=6?72nx3b5h_{1lSwV(dsGK81U
zMDrkwFVuyXe4FBYKoNiiT>1pN{Qpu4_7%!E&DUGXpx$pO!)+|s>1}1m`U)=>_K{!U
zUve_Ic8X~L!_$nYoyeDqD@ARL9Dm~Xe7r6`2WSx4OzazY{+myRti$3~fCb<Z*<J9J
zD}bJp!L=A%lh=7!Zk`N{HIpZUv99<I;1)-px(et8+yD%mBys)J5MMU}7~{vrF>!mJ
zITUmVuZOWtWplcT#}Lk+n&5N@iGgqPweWcRN%3mGxHQAbbZTm^{?r4&EMQ(_t2f0V
zLD&8Er?C3s+q|7I?gH+e4B4lO?*kqJ9)<B**+cPTz!Fa{b8{1fz-N^&HvpS}SG@jj
zK=Er1g5RBr;LGTfA$#em1VHl1kT`uRjW3nF9kWHc3jkZ4%H{OMI-V*36!ZN`L+y3y
z3_u4s3orxf0F8haKpVgY@BsXu!au}&$|uF;PwQ_&DXgtG?LBWQ#j#Pm*27C-&fion
zz5#C=|822%Kl_ERpZ)g#x0Rw;v~T&_%#WDw8}`h7$KS795REUgyGfWQyat|d>)_N=
zz;nQ59%~$e*Q-K*fj9Q$7bo}D@XX23>tkX43D?Gae;mK@K5=SmPX@B~lDYLK=67o6
z<o?&*JYO|B8M+R`Vw4bz)R8AcuZb48b*zrv(@!lI`32ujof76JyuH<#d$tVDn|KZ2
zHt8pK@$a6<R~FE}oO?~lHT+-rQLH2$f7m2v8ZZNx11ta*0nY#{fOY=(C14w{%N@T*
z@sM(D-irq$iI6I4qX1+9P5|=x{zZTizMlRazyK;mFapl;ax?&%xv>aZKLHk0n&Zgv
z<^*!ObFRJP%ISN@f>X{lQR)|{cY1OLbB5o!!k-@mjJ{KuGoCa4ZYIug$y1Zu5wsj6
z?Qq}eoZ+0AoEc~_CrbDm4Dyfv(wBT$B!YZX<g?`4A<5+1A@7i{hNO}2g&ZZn@l}xD
z_<n(WBjk(Z>mUW>yC8++yC5p^MUW!$MUa!^TOg<Z*~gPe<Pz>zvy<pOgx&>TdUrbv
zyFALcG8_CR8+<D}UIg%~Z1AgW1rLfx7GIv&<3-xS+-DbYAoXQ?z~__?--`F%z4mTj
zSnB^+%_rCIlkgV@$uGGP`Em^R&6tDam)z0hm)vpWn=vWmJ27eG3o%E@w_z0I8!$@p
z^_ML2#g}aIm6y+x@4Mvulkt_HQ8d2yH=v`vNPm3k=mbv3myeo{uO8nxzIl9u9KXUP
z!Rw<FSrJ*$e5;A91YD0(Rx+;3QbJ1L((xsn1dds0z%j^=<gYnS0-vL1!XX;k4B>vb
zo*Vh-1b%WzVH-zM1JutTtpA+)bCgc~H)<6fi?m1Dkp@RkM92IzW*U8&e6c14Am_gY
zlM(D60sU`5{~>A;9i;97uT;_}4O%3VoVUl3TqME3Uwy(k{+bagLM5UWDM+5Okg{+j
zUD>L%97#Wtc_inE0aD75yp(50Q~>pnvLoe$$94P*T<XWvk74XTq5cGFenR~OQPdbU
zh7M4FO8qHF_FqAogP_wxBqiEJgJeHLF)@Dz@+H8EyDWJ~p52{Ahrp9rgg$5FpuDt4
zi=n)-e?35=+(de%PC21iQBEoHln-$EPNQ-xL#mwpT()u^)539Tj`9g^qdcp8itChS
z<#SxGc&1z<ZQ_+rm37J&xWsKSl#R+Q(g)fqTa-J>wqr;D>6>?~3<L8dFdvHqR0RRD
z2>{t({9~ECj9bd9$8tFOm`YiEObz2bAY=6sKC!%96TD0>MDjc#yjzadDsLRiOU*f!
zt{ga4t{ggM5cGdX2Xfes#VS3=<U}t&=qu2dkEM`##NPx%<O^C7S*`5rh?0FEt4EQt
zMp+}0%9>>V8p&kel6?!s$i5@{4vLjsl>HkNC$q`y=rf<%hb58`@=dZ>WP|G#lmXE&
z<1UYTfcrcks%1O^JO(UrINAi|Dp!h;v9U*|FYN)2CCX$%>lH<cqV)Nc<n*+RvBN+e
zt~^|sz6ELOXcAC6hbtB2I763`{JEy|w8M8HRc1^mXhlQDWjHgNK8xExtDThO!*`*_
z-ITPH<do!$sf?>oyOxrypwpi#jE9@k=N0FIAUqXUFw)1;cQ6$6;UWc{F&{+HUgBg-
zf5FM9sDys5i~&q%Ok@lxvNCjVHjR@H%f`!91fw&+nZXhPcU~Ugjpf-=Tmm^CfE2S>
z?(}(1e`wd7(W_`sG{b1nH+>B_0iAIJjw}lNU6oksdlbZI>W9=1k&OBg^&=E5tC7`!
z2h_>xzyrP^`v!PGi>wvJ%f2i7E=nX{Jxhw&irGSkW42?qQ40C4**m{gj57l&0j(s;
zDakt~wdA)9YCiIj@8=>H`Ne?*oe{OmM@{(aJi&Gnh(`YbJeq=dM8+qhrIXR{H+QAP
z(-HCX2;%9H#M7gQr^|?^M-xwv0Z+G~IKm0*fQSwUi4Mn!Jbk>}BV_Djq|Q&ImY``-
z-6LLZal1((wFI^Cb@;z${X=Yzt)B8mts@C1L^w$ZCyH=7KsZSW5BxRegCCcHj1oim
z$G_1Q6#3>Cwgxv3nM6kiZ;Q5I9fi*EvnX!Y#M`2rulrqQ>234=Cb|32gh+0#ZW7%R
z`5sr-%Fm*>u7{sRq3$1R4nNH-{6l7WKU+x9T{K6o3R3ZZA5S_Pgg%LZl$unTWK22-
zc>^4sBX!Lr=V%7bGD)qF8j>vJ7}E@&SV&CLB_523qZATDAn+47iTSjW5}$FA@te70
ztc&ct>^$h)CIc^%S!LJ*KK0&&#>fovf7wi|0}LU3;vl3lk`D*V!C)Fr91YUrkRX?j
z$*;=Wa1yngmX8q##~i0oQtojW4RYf6*b-y88o3$pK;9^SkT{t*iF5e_?B%j=%9?*y
z4dpKYTSzZ|Dc_dwCQ^yg#Q4M{$Z?sPs7TBr^ohiL$f?Al#1g1eB+`it(3Ocsk{Y08
zYGN~_B0k|h(jdpyL<`AN6J5{-bCdF%Se{RVd>hW;8Y$_GL6bA*h=vU^%WtuNP;mYK
zghW?v9cJPGpD)Rm6ruZYd_AN#Tartnus@_WU6M&+`Gt_$BuOe_;n)~b8!bs7?^%|H
z)FMeFs)FMjQ7whG$#r&8ux?Mcx36$HHAwwuXz@qX5Ja{=rv4bNtVgI3=y`{_1Gbr<
zCQuaE=I2OCJpc>IsK21*A^t2-AE7wvSJbc2XQ)ML5hYMd)Dk*GEmO<D^Vig`A?8fR
zOhU~08C;cr7Ou+w415D$+kc<r-y`{tK1P3EME8j3UJ?BRo(A0|3JK^d5x!IUE~!hA
zq{Gq6kh%m39{p)Zom3JJM-M~lu!SWs)1o@`3cZG-fsneFV08)XMdG?O^n&b0;<{zD
z3P+70b=-TVl_7QW#JAKTbq_=K0<LZv&4}juU2>0x=M|BdEFm!&N8<}*#|yEltmR`?
z?Ge@gdqOve=sJ<*zbCT%_xEVN%@0I<e<0$?(I1A)8b@PYl;l^P_&$=;CVQ_n$j{w_
z;(QG9Xmk*+_M(HeA@L^mH9~O_NFRsdBI9BK^0<_^bjWd;3CB5cd5}`#RFJFV%Amd+
zk{s#{aaGU)rwe?UN&1OXI<%8R-^jR&z{3G0ZWGI&RRg?nwd5Sv8>5z_ytsPE>*Jb4
z*A1WcekNX0MKrt!dLwx>AVCDU|B!>-<VpZ|h&-3C$L$8fFvOQ5JUs>|02BjCx#NUB
zq>m09T0T?>`RbvKLz{<QLE1R<S{@M{z)2o`=rt}A$`38e6ArD)6XePA1Zcg9lRR3k
zlq*TQ1TJyC1dj4-%n@?JPo9f;VNQE|<ORSt;?Qyo19V!<iRcT_{qi)(1JM`$(XMzt
z?QZs*M2lGRTVfQG2T%dje2Gy8DCbK9e-5{+;!z9s^u}a?9^End|Bt<MfwH2w_I{nN
zu0DsEh8Y1Bk;jN+Kp1hrK?g(_Wkf_o!%I{yLFDZS2#ACbM8roB$wUYt29baoqJW5q
zCczgPMSP(GDxkbk0RaVHqs-jBf7MRB-@V^`t@YitzP0X}#sB<g*X~`_yY}8))!lQ*
zF3#?lT`E7_vaiURku^nsgm>e!RJ7X2Pkwga?Ed^NKk|8n=xTd^|C1c`4~wIIY)@?^
zmVU5PoSiS4`D!a`Vpe%pdG?^};qo~ptNg#YKlyJ(ZjR(;uGHsTiOgJy&|Hb!T#4LV
ziQHU?++2y=+@eIUyZn>YU~WJ8C#$4fshhbY<=>d3ZG!xhsLd@&{>krcrkKmp>vQsJ
z%@r)jDVCp(@>86%Bd1$>z5e7}kyFZN$DI6}zM|7Nr+-dge%GH-Im2^?)85zrBuD*A
zi^ZaaJ++0Uf7mIO-@3`aj=~3J#->-McjXKcT%TU~KOVXI|M6D*Xa%7Aov3om@jXBM
z{0CPI)lV^dT7%bB75`9^?ewTz_`XQzc3*l1*iYr@H6|5KaoeOaAByxUR$(`ql-m}`
zE;+2P@*1t|Cnl9{5Ox>IuILVlkpa%=u~mA7+ovcRQvXxkxC-p69HU&(PFIPYete7S
zD27v160AY9LQ(n<42K{;q?j(NTqN>|RdO}AJrz?0RU>t!5bak?F9A<O(i<!Xb^O~R
zefRQ6bd_Ry1Ddz%6)M^xI5*OrT$^+Tc^Nv23qj*_e`>T<^shp~eFV1(i5_dhUGQ4S
zx1qlP#6r^7W>CFd!BG`@My5i~Lik2)t?UREf$V-OW!|dlsO8EJv0o#6jviNwXO^W)
z1s8*JBAMyn1@I8KMdkKX<@R|b8{lt&Es(SfUlc3_FY&1IR3&_aVroD91#kk`-s@l|
z#!e@dSA8CQE|NSaXBW;2WxebQLb<AkpD7QYLw>WOyC!@@c%|~xBt_eU);}P*%>7N4
zZMD%%fj3}bEjIgsHIYoh&L7dq0_!NIyJ2lN=<P4Ei}c*V2dqT9-Q-rPXH*jObnAuh
z2~QV%N+qd+P}iYnDcW=3`Klk)PAohs_@c^#Tf&CI|H;aDQeTE`g+IelvlZPq_)u^@
z__$&;06X6+Mmy+nl*+>h`F=Ee!t2pj9(+2Q=Y`D~p=QeGiuN7F5J_+&e5!q4{}*&L
z2cWYaT&ieJP4(3%0&9Solw@Mnq2NirnWa#3Hl_QZZT3_hx$bjPI|B7XrJ_Am@-h_^
zDi0RGZ?|1F9^i+-?J5ZtgBw(mYKZ(BMX4(K7fcn(?NG1<tsBDndcNL{@?Z&k7`O$^
z_3*`DKd@y=s%6lF@w5cnGujZY|EHLoQMf`8zSB!|4AQ&mbOQS!Kf<xckZf11Iuf=K
zuXcB%1XpP8+v}N;n;7j-_-yzhMbQ~W%!_9Dred&|qgo)TudVhIbjBx=RYdYC^%9yf
zlG_yBICwGo_kw+~@H~8^wx&-;=U(s(EX>AQC6%g*vBipJCpKSX%#(av;pbuD*}~Jr
zSRMB>(K(l++WB(?yDC=g(iMKNfI5Ft)#p@it^aw-Qw?cdtgS&m_$K&Da0dEiiCp!M
zAt?k)!P=Gmf7sngt6o#yK|dc0`5d($ycwNJ@FLLf9bW|Xe`}CD(mRSW(<20DVdqfg
zYCU&!wsB1vqukMp=&R;*t<SQTmU)*<;OeFrHiq8>PC|2d_y^IvMy~>?zCk<XK~M40
zlfhK*NpKI|*eSS&xT+aQ6$(;hQ%`UnQFm#0K(F`VZjs!tmTeyP`@ug^OdUhA2FY}=
zr(#$ic|MZW;9KDJimo|2zk}C>uYga)=2|4((Vqv7Q8X2y-ql<O-l>?X2VbZtqf)*T
zd=^Oqtd%p8OkzPLSPlx|1>o!8)5zz5W#Clg131<sy(?#&W*hhjcqGwLoD0qbM=4ef
zgbxNgs2y8`W#%jOAboA%OgG|(F5X%dS8$1OUC;bQy-~G7&kH=HC~~bNs}yA&qLS6L
zt^rGtmxDVL!|v$x1oOZc`~>~Q=<LGgI*vL9)&nP_GXwoG=zJeatxmNG-xhw<jf7VU
z=DA#re2t%!Q=Y2p8VP2*Sb3=H$eoeiOL^bvnq+?z<GpTzXX+KloEn}b$o0;2g4fWi
zTh(54YKK$N&j(fhwW9FuU|q17bxt98A}gY6!6x8Ka@5lr+rst#<T#mYXg_DwW4eYk
zJ6UTj1b6zCn2tHfcT@nkPEl8PLB6h6Qv;#`@R%Z!wW^s~i+nBeV@Qr6Ifi7d8?2~n
z-PAEf8H3j;M}Hl1Z*3iR%F)r@`=L|d?W0o;FUNAZN=yakvJR|ZrCs4itNsZ(aW0+0
zuNCzuLA6joU*+CD<9QJ5op5dKt(tN*p~f<v#f)vSo{{hZYCh+peaV_(41B(hM{1bF
zkNK(2K2Kiu8~saIW6l9}v^C75;7wY`sveVL|4d!2Bh^$aRC09y%aK$lrV2dQ^}MWR
zS8J>CRDnmoV&|v=_5$Us-ph5}E2xz=U*$nRbdF(j06GJ7&*Hd4=IW{)!I9L`tMw=}
zkI~Co_<i77db|%jhRp$t@CdCpd*rB_Y28X+TN(dOG<SNNx&luP@Z-ck6<(6G>Kc9>
zN3B!2w04Ad<V<7D_v1@c;(NzVfzG%&!cm5MU|m2{^{Qaqs&@n%A*qX`QPp>{{(oL2
za=hYd#en;4L55Lr*Hg|>C57+;fAyr5`<1~QaD%^6(>mFok-xUWA5#>MOD<P+R+PS!
z*H;v4+V2GLMsSp()T>u@Ms(ydFIjM|{yOqtFgj0z<q7v=)`~F`+y}k`E(CYd<8b&k
z!!ARwnC8wD`_}XzstrD*5*lAsc&q13l~f&4iBty7rg~|8Yvmn0Z!~M9I#H3LdU(D*
zvqMx<8On9;f=-&VlAGmPtNngC+12_?QD4o+d!>mF=jwVDUK8vD?!nvp;WPXS1nj5!
zvfj|v%ahrhYm$52sRxnp7Eh{=*B2YomtyJ|SgvR);C}|!qLU9UK~ey(33dYaAm0yP
z1I_>sDcYX2z5-4LFQ=Cy=(I=D4SW!M0IZ~S6Ih0%HzV)OjQ7?ENp*)`4!>MuL(bYq
zzFybP#H*(!5vb?8l34wob=QShdjc#;?l8|K-&M0c^4Gyvzz6&a6U`I3vR?>hgBOFv
ztP^g7M<6RG(-_Q6Rxf%L?uO=*ipMXlqGndz1<nUYf~~+OL0z@FL_QwWs%t7ggTDk$
zQERMq&GBz&6<+zCldJjht$KB?JRD^Uex@i9{s_h0bzBrt!!K|U0R;g8X{4oVcNP|;
z3__HUkdT({?otqy4ucLs36YYP?vQTj?(XKUPrUbi<b3Y?&%K`ug0qKn=FD$)cD^&S
zvonX3wj<ZpH?TY~$7`=ugj5p47^Z@nX#6fd9*SM_COk^pMAnG;JkPjMCPobR+Pl|(
zztDQQ=huP_G5%G-$fq57cB;>fW)Bk_+;5j^mr2x$gr#2<z9EVcg6ZgJjJiTv!*R_s
zx+?Uz8%|i(bH`;jJ?>{;@q{FNa#-J#k)${Upp&D{EZ&y7)qB92+C>m%&?Sv*v#G*7
z!zkECk7R#7YsEs1JH^bD^TL&?oOtvV3mw}l`3K&@YSKEeD5(`P6Yc9c=x)CTljJko
zCAV*9WreLuY_pvzxXA9>SNrRt(adv_zDx;g!}^swfNL;2^|UL|QEQWr395OeYoCi8
zqMfC#TcPl$9dV5tQ=-eF5IbivxsB(Y;6+tk_9|#}z4kdQLgQ9`R0?IAsuAzI(T=8b
z&ea0(am=GzA)e$yu{A<rlip35SEU4sXtD_Orih-a>Y<l%`0%_wz&Lp2*X$qm%^ZP`
zc?aKDqq;?WzR)d{buW!oYDS51q9U1mT_tFk4NJGVqUOKPgGaAB8EY2^M|Kb|Dlq4k
z-i-g)-h5e{D!l01p(qhpsZU~=M{D(9oBhnn*V#vC^D5JwP3gr@Yz_*&8}r@Am@%rg
z<}b&$`4)T4^20r(%sH6c!#!}kZ)p>sQQI((zf=y@MHj$nvB<q)UU5&71|RcGeGpUU
zA?iZ;*Z8sBeb4T9z{(TC5A))iev4rC#B9xY`wVn%E-V$Jms2%2b3Nncl(Y*>lr^%X
zD)ASD2e667hO!#H;)~j9(+$&K^Wbbt&iT|FJowi6zA=uch)mU=dBm#|Qi@v=8lNq-
z@=&ep^}sW=d+Py{qbp8l)t<WacGKUMDaUYC_c9^`a+W)u?RB^Qy6WtUWFN)~VB_W0
z6`J*|jwHc5W@Dni>qKgo<4v5!Rb7eVOlnf?nxv1rfYYs0bQd(EFwWPB1l9!OKC}3_
z)Te^vKI8N7o36`Fhqlk<CATpn^fzT~zGIr^VnkiBmRzvg)MSc$xuCmtBNrBtf8V=O
zeex!gR`r8i?jf8ujFIFcp-r#vmY9MoU7vT*v*c@>yPo5o-*jQgN~}Bc5J_7+E$5Ze
zqsk?Yv3@Z{vMD~B#<-9^skP~r3mvL%QaCW%^~o@aOvz4ZnCxwzOj0?_G1~SgmMChX
zZsMv=M!_TYja8v9(M736_%L!2k#qcbqwI&3rtEjGy5;a<+uCae4OXrjO(H(C+$#Pj
zVY#?PXH%%_pOAkKEsU(;6K<hwRUmjK*AtH!0b8|n*8R+=_9X)0UZ70ewSjy5Sv^qu
zl!|#{@vwJwDNg5NaOZ??zu{{@w(kq_R~%v9b@s{6&b=%QRkiXsu!>jGS2fNn?|9;2
zKk=wib4eVKKlQpvyRTC4L@+YggnLjVe9yg+;q)G6SWxf4Sda%vzRf#A?cl4f^@j~C
zJ<9`ZZ^mkle&XUAb<Ow=@gz@3EH>@Z=-c}gCpp{BG3*Kptt<$dHw;fw+OP|tbIQz8
zZRJl>Gd-D9+R~tKc_&05tWi+DMZ)Y{#F3W2q9a}v7<F4@f6ic5e~8Nsm&(x@ZC~Y<
z{IRz3wp(&lxLFZltmwC<q|$4Z?;u52ti;u)?VBD|-Zizln_f2Yp^Z%EWQ0^_(S~Ba
z`U1`#a-3N&w3*>ErJH4Wn-LRQ$rIxf?`s~2bhvR63Q%{3&ZTpUKKD>&5>$M6c-u|6
z-V+O{q><9DM)Sp~)^}*)LFfVRx99-AcNJ}90jBFu#N}B?nPH4RrcUkMXmcHOIzAHv
z?y<Xt)f+LTo%uSjxb>O0grc3L9~NULIwmkE;)j{H-z>JxXfD^BKxrg#OZ-fMVQpV?
zL>{?K*Rf4L$`u`oALuvXo^q`*;(V)C&9aWi;QcMfC@5H_I1Tzn<r;7oL$l4kU-*zY
zp@9FDH?~|-Axo6xkE`V{GgcSMPGk9p5d=BGes)Y>1Lo06dRMB8^(@o9i*QM|XAkb@
z;**BZPvCb*CkU-j>J!UH@7&VTvgnXjvHF(JtRDJtS?k(Bu`ChiGwb}u<|nJH+O_#7
zQ@i9ZvcmFY(<b1&DG|9s+x<bgXh%exv^(R~c2c>R&3Cy3q6bc@?FG%Swuzk8as-+a
z`-PsMo72}Fvn;+X=MqV{&!ZpyVXA`tP#IDDl5dam%7wwG9k=z#(Oi7Mry;$M=A93`
z@haUY#(f8}H?eF6GT};QJFKT<<C$t!jV{Zs>xs?Kw)3Jbfm$r2<=<g&NY6=~Ji&FI
zg~pVaTt!qlmKf<>x$(^*PP%U_Yo>ebr)AoxT$3Go%tR7IH6B_?>MkaA%&c%d>%GZZ
z&1i+3xh&`ECiPMAXuJR#?WDeTX$6f8qs~gWy1<nQoJ8F99oR<d(&5K4sp^SCmw^*6
z)NTmn<OttxC7Sfutk%00gJ9i&u}?#<S*+DH)4HSTznICPcA9#J1GPUf{zm(uas6=(
zmH!3CEWSJI$V4Zfu`Nf_IF8s2u~osQ=WeL`bx>pc`b~Co&9sr$QAvh4(dzb$dTNww
z+`t6G2HFq(ANu9;xpj|vTo@@sihnbHVLo~MVB;vrAgnmZFFMFOHHV;E{1gfoD2-kI
zQf{owN$M3UCzM8V;^Yax!?zH&KBL^Us=~d~F@mW}7FPD`KF9N7LMoxHmR>U!7rG4V
z<G4i=hx}oAc0ES+sMZBR&$@*t$HNkM>gaM)<yJ<gD_inhvwa%I)lm2;e!RrtELTHr
zWpd`dZNdnPT=JHTNm<dyV+vD=^hZ}usoU4xjAf>CUvJ5LnIJMVy<UNvT6(v{e;2hi
zJVqiR<?87ngrS*otjNJ^_kCJ-Q%0ifUEJAKk6*S~wxVVJ#x}t(hHk}&PW}$)_6;#n
zkP(x39oJTRpm)Ri#Rc<1Q%ibWCw#2}H!Cymi0f42RQlNsjMLm3M{XU-uLc)9s+>iQ
zS+-StLu@ddjOtFN`jtrPUPz8a4g^ydcIgbB?lDkUUe6aJ%-#-MT5;FmsyM5EKRjc(
zxj%H8pnh^A{?oAj@!-&J$*LLuxb`8`9sd%(9)#Z8=8hvssIf>0JYU(I7Yz`WT6k!U
zT}~z|JfB+Q@3dv6Wu$XwfsP(KWp2}Ui26q76`e18>><m6ot;?Dw%P~pdGNyG(7mo{
z%iLytl{Q(JD}#?!S3RTVGCr~ujoCSqo-?^4@GY>4lr`15iPBM2YrbhF6A@E&-|nUy
zAE~uDhw5vD>YW45Ph_P=_4ZJ<N<~zTk+?OKTtVULoHL%5ER`LhV0|dD>9)Ja%;=3T
z{O-&zXy352;`XNNG36U07zGl8^T~A8>)+kwl@eSAT3O|d6|Inj?!Bt-3}yuku`X#H
zc|=cm8f!`F%{&hor9Gd@+zIE2;=}hGU%`FLl$S*=NUMLx;O+L4a2**Q%MVh@8zwht
zDnm?k(&VXE-&&<is(HY<S6PKo*U2KJ>9{GY5}Tp=;fZ8h*NKI$cS9eV(HCDDk`N^x
z>3Y(YGt~9OujoIE82(c6x^JgTM<;#1$(^ZF$%>@y)dG{eq6@!P!~2kLBh~Otob}tc
zUv~DT5q;Cyd)6<>bg<m7xpoyr{`*O&n(xtY4U#Q2=4B4!l9ahTFF^~n%MG!sa%hos
z-nf<Iry^?C=6%1aWl!B{66?yOGDZ?s65~EJUHd}xyCMA?EtlxX=@8!%@06UMjiB?5
zby6YP_qTI6vlIzWk-o^Z$n#A&B2+J{f|h>ZXG%Wh^#!XUS6L>$?!;H^x68|7_x1M|
zvLljMO5Q9CpX6>bWO1BvF4}MQq2*GeJ}Av%#zpXp>?m!S+MR6x|5beM5xEDkTJH(P
z8S|sm7shv+Di=g1jo%M^Ty0)X5~DamHtS4&Z*sAA7b<w3iyiSq_#i*bMP|*S-$bS&
zixtW~)!3!#aw9pepm?V0wuEz_ZRRhBlyt97BwP?6b2WF%(CSwXAJ~@;Ww-X9OCuSx
zzE!tT@XSRii{R1nN%&#ZU@@VOM{ZWAw+p>B8>0UC!*PwF3={d}!aKq_%gAN>{UXw%
zD>nWIK~hXgHT#t$7HA#ls>w%`G>eNf1EJW$SMjeDr>Kue;YYw5_i0?JCn{<>K8k+p
zcBL4t&eIUcv6@8N<XtEsi%-ivHJ=r#o)S%&8YE;{ND{2kz~DN+AWbZvYr2j<lm6HR
zOY~DEKa_v6n=TW+jc}#h!+H``CORCGDk;(KLVQlQGxTmJras+`ki@t9i<!i@*Sphq
zv`1Ul4L(gj7c)v+8s+&h`Z318J=$RFN-%N8TUq~+ROS5C=dK=p=pzb&R!ws5%NTsE
ztCBr;T15LZrdFboZ!fP<Z9lf*IOOMl!l7}E(uD)T0w2cqkejMY8LhCu-hM3-)pX>w
z)EVRa1nZ5vfiQ6`O{@Pt*-+?>+4Wsq;W2YlneC8!pVY(L>PyU<w1t{ywGMXj3{wws
z^B!g+G2ReRAO?as%bZSGA%(U5CG{9%%9f8Qk1s;Sl<Ev(iDhmpDR~pCH!kXAH83~z
z@jJ?L9cfJ<`bm}A6^{qWuM@}^d>>vLFEdZ@BEwzN>zS9y7To%|Vnwc>>f`(pswthk
zV$(ADgRrix^)sWSmU#kJ;%%5&y~_?!iC29|x1kY6iTbS^zL9Dz!;~BjDDmZ6D{{EY
zYe6iPppWw(&`ZgzYRw9GD+Ir$1fKiN(sCM>7_UuuO7osbMI8|ioD!zqw`N(9B4=^&
zF*U=H318BEEbH7mvWxVUji#_>-oZn+Z}Zqm^4~4IY9C2!|E=gE==8)qBgiW)->c54
zch)|nyXsWNaF6#CQ)Ym@F}BqWIvSl9v!^=hyvs2sa3owBc<~)MI+AfvzI$hX#qyEY
z#QQkEO8I&!agR)Cx|B+p34=$)3&j^x8!2{-e94NkJ`5vwR*fm6_U9k;w(lZTWA7V%
zp)hBTHk~fDgE~18Pug(uhlS0x`K@Ex=Na>xs1Z$V2X;&EdNYG%d1QH>h&3#1@Ai6#
zemciZ<mmDK*lAdM-nj5%7xi(uw8W{Myxn|{>_=k8SB^Ziy)?Loc{WQqQ8$g4TOvF1
z;5;Mx(}ZkoZYgrExJmqkL^Sg2;8v;d8GB;r*TJ1qhO-zGu}@xFUK;en)~JSO%!$8x
zt+$TU+%QlLtpoQ1F6tPM+h>W}`z}66jZd{jrcTHT=fk$zQtP2u;?8oGGfvs6#KQ&O
zdL9j;Dht`uA6#3_Z~di29}E8w*=i1{XB66M&aP)HBMx1a7XE?UYObtjq@4+uCkpk5
zaWU|o4#)79!e^#AS{2)D*37RKw)k|RBVxz8)jU<tD21w&;4&MC8ro{kt7p_<Cfs>8
z9i2nm>G9rWUV|Z(VXM>G-@8)&)J}NoiQ#lO`vxH!!iDMmHciXbaYl+vnwHn7{}x0d
z{L7`_rW@fhjqXfJGgXt}k!>qRDZiO%ep^@al4Q_*{MdqOV&EQX@B&py7WKo0`U!Dk
zdcPtPAy|WIG=lm;t=N?Y-I?r@-I(kn4j#=Lpz4Uw!lISRjp2J0#_%+=dIL(A3j<2#
zbJGupX|~gmiHp%j+eB4ypANrDPe=9=S0VPij(FT>bql4oi9-=jgnNXynkoFHo-@&y
z`b*&ug~mm@e0eq<o=+5di<!nWSZbJwMo?($k-~I%k-t=jw6GmF4V#>B5BJuilX}L5
zJA}RTTWv?;GEMbRJW=Ndri9XJcgZRZrKWb0MeS0edZQ1O!py3^Sj-c*0;(R%67=$T
zsj_=6w8_`G&jeqOy>0EYckB23>-LZdVkbhk4HgThQ*@cVA|?y|iJYFk0(b+Qc1%_W
zdiw3y1!@>2vCdjI)o_%jRXNyIaFid(bRSk2>+M)ha60Q!^0+tD5Z%Eb6qWmX%Q!bs
z+=*51V*H(xYN(0)-n)I3P!pZKccR%e-7Ni##|3WUCz(<XC2rzU-=ub3_uf^e(}d4@
zeS7xpBd6{%_MV}Li<`~e!58YK{hTjIc*hruB_u9h6dCO4^YGu_<oj;USN0{wG1Dlr
zmU`lqs{BBJtl&0o$lX@Lrml0M_8i1>AD-E3!Jh=@KX|Wk*e8BB{}h|nE)*d)@o2t2
z<GrG525CmJa+C7lhsq6VqZ|^As<#zt?b^q$Fs-k}V!P+ZR6J|vSI#fi`^g^c>-+(`
z40EAp->Pt+(X8mYQG<J-%$P1DwfJcLy2Gt1iG>Q<rMO#*QS(^dNEO5a)`qIaaZ-$B
zhLJ|*?;^8(J-(Cl7|R?ZAzf2-<RIh<ch71QOnpda?mOwVwDHWXLd->?V4c9=>rKwE
zi9trKo8O6-d~}Tqv$XtqdTPEWX-*sJZRVPAU;h;IUDq{tboMHTPm`?Grg&~^AHE8<
z_eYW{Lqko~WXY-qi9yfO79{(*0woO|4M}XK_yktx)z0SU3Jg3^bvgkn2dFEcwmy@(
z2z5|DdK{0pj=K)w`5x7H^ggl>Hm2N(oQOaCwLw&Q)FJ6C>3sF4M((F|%TMh+Cr|ks
z6HJEWQ{JdBijnYz$n_GY9>+?pd5sM57js#1j5H?br~O{qcXE%6*E9aDGa|TOyno>I
zO3Q$@$h#<X*l-_qQsUWj-XN;4lj~i)vKG5<S=QpED^?)1rk#hnRI^d&5Raz4>STv=
z{3vF2K8K^$=$-7?l_|HMmFs56YCOBT(LJjIss$^jDjTIyWn{aovvWz==nZ6VyWQNn
zHnOSRac9)is#0T;^e6ThQE^|EP3EZ5dd|A_y5K>wdqc(ixSe%b@VaMlzom287|VK+
z&~!+t^{8_;`wxY?ts!>F#L0KNTNKXH!{eN_9iI0wnNiM3wWp9O-0x#&59$aRmTc&D
zRY+U(Nw@D4qa?@kZ#2wl)Bad-7k`?yBfz|&aOVB=vF*&^u7H>oP4PS2=>gl360K2s
zZhxKt9)F&|oOkX`XVd3f)h;&QC!TMy#OD2+HL;-jC5*^#m!Z?PV%xDv6;WD#Xg)G5
zFfxF;<3vUOxLMCWzk6)Uy8~+^-)j86+<QZB)c21yk$oECUlymseFgc3SmrI?PX}h3
z+fK)4hfmhd=C7Veok%H-h#Jts9Su^?EtXp?M9kzqP-qy*MX0;C#c@IhB^fOF2VVpw
zj5c{vE8C`_dFAL9G8v5M_Nv%7`_E3knHo&wT1Pd)^Q~irqsL_D5z$iwGQ`ny1no=k
zUZJlsYlZIZ@~dh4O7E=+hA>j2+fI#BOGPXQVxiTAef!=A$w{TAY~fFJaWt_9YvNEJ
zGz&@n$YFbDp40hrS=X({4j*C$i0HLss0RBT`HY`@aeyR<xI|<${G$@)sr_Rr=QaJ~
zGG+6`JJEXO7)-)UH{SgGJTHt?bVA%7Wkt5eZ8ZMOOHGNLWR8-viikCM%=9xg3UycY
z)EL}5t*Ox+M5ev68R8?i=NbG<%4Ep=e^Si9@Gp-}MZFd{?fu1dOP81xTyjd48EiwA
zxVr@gHJJ$oJ9op<%R4o>!v!Za>AOp-Pn8)QJ}<E;949T^3IAEY`^iMh?D$cAHm~O$
zGu3F0F|*>~-QGn$Rq0bv<Lq#9ukPJQ0k4MB_x0Q6Mlb7kH0F(odaYw$t!o`*mkv}c
zj6=RmmBleDOpV2{HAD+#?KcndPKItcVvnLjZMU7?w%a)O-R5H)9b7ku6sIqQ>?<<u
zo~6ocSI1B>k@XmTn0;78!6g{3-}p#7PS@(gXu5vmW3M=iMu{}>w5TnGy7bSb(sp0{
zqv@i1`eY(3T98#?L8eyBPX-EjtAciUhEpHy7r=`jVUfgSD8!}*m-4X(1hgSqQeW6>
z^og2Tr%K&U37OwdXJUsri<3!`#R%Q0JB}@5`h^vvBjKA;V<d^-r!-&h@De_;g`gAv
zRhBl9-||Lcrbg2HnDc`Xufo|PIose>;WFP1*FRc(w={YpnP4#gNM!(76-?C-+%{Sw
zJz9S%2S-e_z2~&F^5Tg`os-G(`?>D<O|nKmBQMQcWYH%x(<MWkVXJGpG+{TmVYo9r
zC4Hx`bxLQx!0B~c=X^EzG0sksfur?Kn1PenPOgEo_YO<T<|`wAgCpIYg%TONGs2dA
zU!ypY_5?j#%Wx61jiRc&s(kNUf}w_e`TgrBAI@SuLjsO|j_D$Y57t!5>eh>BpHv7h
zcH~NpQDlFu7;$t~Eg8>FT-m_-Sz}egVXr^xg6x)c3LLFlCoCS#`k;0kXug!-Xg>DK
zs=~e3>O*I{Rp3e2nuGh8mvgh{RHYqBHXJ9JwKc$OI8^IkQGA5yE+2bfhi!!)-a9Ob
zy3vl+v>D!xdfPav!Qhn!yy@~1(@}O&AyK3#&-yLr5S|d8V4hI-M$ehd8T73r7Zc&f
z9jM#Hw?FMwR7QPZ%_zf&{@E{u90vAhF^@lfR_sqwG*rm=k?{ydlEF-pK}y2*HsHe(
ziMy`?lw$*w-v%g01~@Q~EHSb7v?zaZQ~siq{%9}bQJP{P<<me))Nh4AN+sj%bSO>1
zIOeIbTv`X>Yscy9yYJ=WO(o+^<>Cc>@AmrK?TxsrbT3};Jx*5ujtwbEQW#EG3{KY@
z92<U;q<1)7pKxrrNs^*)x<YWe5^!v2NRlu}COz2iNwME!VZV2sWbz7owV(3K4&|4M
z?9~#=FS(Up24<jXWuQeqMgOUQ&i(a@L)sNhNzBFjm?d#IHh!~S`a4Q?r7Y^Bft>5I
z^ZWGnClRHp5v6C%1DU9QEd%Dw1Lp=SV&PdB%>#+3->8z7f%~)bj<fT+cJ|jw1wV{Z
zOIqQxAf>`$8C%*{TH4`uJ6Lx1w@L+LMyVfKjX$yqIa9DoFzh^cw%_49`Q$lR$x>Px
zKl*|Ev0Z+H_g!S76W1uTj(OmG@Xfuhk|Qy86kFQLYgyDjv5aS|c-`egfTnWFU8MZ`
z(ei{*;ownf_R^!is_0nDY5VDY+or?Xk|(2F>swW&N2q@nwpOW%$6|^|UW~Q}r2*OS
z&KSp4tAl99Q5L-ki|5UvkN2(jpZL$2?wi*v?Q7JOtvdUl&cTqG><DGXaSC9*u6<LM
zboJC)?GOidLih6(+zI_DRkcGwQm<EjIxp_;dw1XJe>tYuo~n|M+SVvAbJF08)o!<X
zo0k$2${reeMi*|DUq|jd`Yh(&vd=2<SAKEkh~qFrc*Nktqq0G((fh4P3AL7XFRdTy
zAq6Mix<q6@N4y_;39*+I6Le=9T8EED@?V%9<Uj1Y?WboQ7QtA&k`QKmi^8d#_Lr4I
z#yG}x(&PA)-yh=NYpv*rly<?E@-vuI@0DvrjPkeySFEn|pPY*vuf5H!=(m_4;QXmx
zcrjK1lechTAlvyOtw}<im-eQvc53R~Y>QT_IfatWMM5#RqS8fat?AwcX}^iB)Sysh
zcJyOx+_Zij_T)SF+57L7<x0*zANX<K(-<cw_{Vd}3|_vqQ92Ut)n|vjwLY}cl$>UW
z!dFYxopVNE6vmRVaHu{g();#a+GYVQ1@211(=E@?s_(~R!34<IZOIhvv>Rn>v$hM|
zaZb|(wWd>ne!Z(HxRzl!K6j0O-P~BG86#gHu@DPIpU_gvcPn6HnfV#5H!P~B%(B><
z=@?&2yc;PK|D7Y}HqxU$U2jGz)GS^@?TO#g-KB(wSQET%eUk&hI8^COnH~O-!6jC!
zi?%6mLpoQ4-BzQz^>c2EJqQuV%FQbpxx06cx_CrA-ouVnzVl(K##ycIBcEDHmO+cL
zB1gTc(yTxYGLoXw=UyVkxsH1|e#N3&7X}7=xcP-5WpXaR=Tvts#=dW|oDWK-+zlm%
zJer5fd$h^swE8loPV7f+MJD_%O}C7M$_mOBYX`*GgvSyUe7$#q-0&!b^y(cS;-r0z
zIUTBFGhQuF4k2_^;pO@W|79doGW#W<A~vz~g>{Ms&d6MF$c$)}@QO7V(rhR@xv{!-
zenr*Wre$hIil0oRYhyX(Yvu%Jhv($ki9GG*ShE;c`eN0bPi!i`$!<+2JhH-jgf?N>
z>6d%XUg`1gd?&4nnOCj{g<hwv$SQ5O)!%mt=XtV93x6z(oQT?uFk5H26mEVSARF@>
z+l%M#JU$$>J{T&SoP-SX&4ee8eBKaS4||Vv@ta?qud(GPj25l-cxTVI&3x885fjyK
zSF1JQ{M|yNzfRGpD8BD0_PBYRPD8&-yy2EY(dzFXk__hbv#d1c`%iD1i(ivgpDA5Z
zmXlu)?A&SE5z3j+fhJadu5Wl&Wp6B;R8|c-R%ko(=}t;M)hb)>>_OY9u|0vERL^SB
zXqF{9HEYJf{Sy3-Y|RoA*4yq@ls@k*IGo=$5W!N-sc{MxuJja<;86{qV<)ISY+QcC
zci@6Ax91|Youl!qik@7byV)tN;mtBhTv%ptq!QD}QL_!{^~V?;*X~qf%CV7QVEXvn
zxplUifaBU?Bpt5zwf5b|5);Q~TiI_)uCflHiK7L+>|4gwca$Fz`ihqr49}ZR8~Snf
za;XE221n!`cGu&<N2<6gwhT9gLzABFyQp3-&asWiJ+OJ`n&3EYD&aaULS4RgT|3=c
zZ`6O<>gN4tk6%7~`Cj>?y8g?TFK-^OIiOqK+sVWW!~AvOg`uM}kvLXOcwsF}jCR<u
zLzA3@p{t`6v2j9A-{P!H<b;v3_QUDt&Fm4xd)WzM;s%Am#>z3F{ztU&9~`=T4#dZQ
zWLrCSav64GlCmJ#%-lH|Qp{3$^b6k1q^K}j{z&6+Tz;U##dnMQX}1^WBaO|{_iG0C
zb%q72wHM(*%fwM7zwv1^KIMGyO16#EWOQKCnI*)^sfZ%~`lMJTXR|qwMD%S%{&Kwt
zmw?0BCqX;QSlu`mCaqRSKJE{nmE4>f-?onb3{0|(3KS!FTDBkdd#V~bPIo3<AbP<*
zDSUff>9bZO!$5lBgPu{-_P*$N(U!j2`c&HVk~bBC$NT>EvLt*tO(8M-_*wNm&0+q%
z0Z;buP$$j0-S#n6)4m7`kDaD#oWB>e=h;Mli&?H*?A~r!{!R~C7A;KjG_m>t+T=;Z
zF17o~ZE9E9INSB4!3{c$FX(IqLANAv$}r%j_ts;tU)4UULi@c@*K%QSsN=)t*8A|M
zTTuDqFFF-Mz55Cw$8SWYqv~TM<$FiVQZ?>8kQ^`M897kO<25Y%k#=U5WlVYZv#|jA
zIcFVz6Inui^hroUZiKmH9H}~NiLF#u>=?%X`Ng_l<G0M(w-4zJ`En%#Mz3Aa$<oSc
z?0M|J!F56|C+hAX<=ha%y^SX5udvS~xlgD*n3t$0qAuQ6LPUVnPG25)AG=FA&=+ak
zBA*m#UZsaH`t9ieyMn|R8toVQ_)YQT@nm<-3AO{dlF}||Fg#~o{>v=;t}$2j5MpQG
z$*w&Sg~Kwby6C-_K)w^MA_MFP(P1A#m78xI(r>-R;dwEZb9ZdU@IA@z={5t-mLnWy
zDjP{o?@#Ho?xuy94?C}kxX%nY>=-?>P-BkG3c96R(AzGKzRjR<`{@}>Gy=`7>C0={
zPvZJ>xi4PcqGM$7#|$%Bf{GroxtpSyvRFD}u`RsJdpIJ5rY5MKA%>P4Z2Ge>joOxI
zn1PYr|DN&^bBzmHJ`WlK?cUW?-)lEiPIAz$nNcXiSznmnBW3fKd{Ird$$dqB?S7Id
z+5;PcE1_q`*G`EiUMFs?%O$k5ho^LLX>fc~$~Pum!cx`NT=OG`@j4kh=Lg96Nw)^z
zeLM9%eRJ^LV0U}WcGUQXs-WgEyO|1hp>`-G9Vw3l_Z!~A2%LQ6`e@;Kly2|p;M0Y~
z)zgr-$`|MA^{%(eLNRvZFqqrUwi<DVp6|bN_!@OX4WB4{TRcA3dcS{LH{??R8`R4)
zYv}b3FE|S?tzD;|vxy|yKO!3UOQYT$r{yDz;9LXy)*0tAqMj?g&0PG2I`)sAl9HPx
z)eiXkj+SSBph`CWtv0ZAi=?1b8sA<<1<UODR5EpQaQ0gS^MGPr%DXq;D0o(P{r&RP
z>y?U_zx_ywFH&uy{r0BO2or;;l`_ovw~h;T{>NL>Ls<h|VlO(}B58PIOg?u--Lb`H
z74o5ub3q7IAHS81v9`U>h@QuzdGoXbQ=fp~CcbV`T&KXu2obews5^0-@Mg!=Y3??(
zdn!c1HA!<!&fnl|8_Y&cr$T}!s#iH0Vl{qX;5%wc^Ou*m7JJBOhTdL1FtK(Qvv=Y^
zuMxOw_u9rU<;us9=n)#P<nKJ2)C8`6BIvl56`h%d<%`NmX~YRiPx=HSzu!OG-uWD^
zyb;#n!PEZ><5iAmAGfq|vD&vY73FkZ_URAR;jz94aQ9A1p)CrkK>3Y>y!riSWkgJY
zi`zevJYGZiootSh6$WWz0nG10PLgEas-f-lC)kvAT5=66o7A!E9lj>&@+ULAHDw*q
zt@@~mRb1p(*jJn%eMm-s>ePDW#T(>9@_Te^<C$zSB&)t2ik**EimnmEqbtL5D!z=f
zG4jN!teZ#I#gL9S-ZEv@(YdEDJAU#}H?4+35OZ~c9)lWZEH&wBb4{Yi?lGIRlWs#{
zqR$m&UEXRLiJg<4S+oy6c7rgrb1!qok*)1gs;mhP2N5gI?s*fM;h7MRb~+;63;uV0
zK|Uc=Sg=<vzP~6)nzzJqgFe4BNHJiJ<8{4O>FRRBh{#wp9FN>fFPbz7J^gFu7g_?9
z(@ypMEGmM>?%4OsZb4%YVX|niw`l@e&XInja<%CmTgN%4489NA=0dbc6?tOzp0d{n
z+e91QD_t<nYJ2Wp)hiM}{gmy?n3;Q^xbgLZ6Dnq`O?491BnMi<`RK3Z``@!Ub?%6E
z)#gyg3VM|~28HyOa<TZ4mCBh9>F(b+dR~^+XwM!h`d+xnqG0=HhyPcBxeo`lr1Lu1
zJC9maWA1RpT)TR^gW0w*s7cYDD|jX6jbVXYoxRmEl6KX<zUj8;!l$;p)>4z6Q1gX}
zqFJ|36e=8=j|-4W?FhG`eg2g%zn{XRW7u2Wj^@rpi5fj2A$(ETim)S6>C`r#oY7am
zK1-Dr8jSWcp*GZ?(3$McUZ;U#*YDDH8vXW>P$OutF_oaBC3~D<*8$N?zEs20<2t-c
z6l*f?OI~|plzj3S=R7%v?QoqSJ8REAL7gR8UNZ<!UMyYO%j_$Ji_JLusOn*|TI|H6
zVN|5Z<5}A;d^0^Z(ytkjb$j%UdT$Lw@%JjV2a4;l0=CETwg?keOfhruN^%va>W7EM
zel7R+@l%HIKR#|~wLjiy^?2vY%SMUpKtps2_%3xBG&X+0(*OK$x`_TkZB<CbMy<=J
zt=>W!EZmlB%D2ViV2MkK6xzTR()w(FU*#mu&!&NloBmXkKKq)ApTo~t(<BAystVCG
zUbIq!wC*v&9>tcTj+X>Eor<gsN_M}U<U(kT+w?gjVWWc=%II_o3cjb>hppDH-}iA>
z30~lPVZe1Q(TS)yt~al3QdO8gz9KX~Q9OFV;Wp7>!$slxmBMrA;``MTwY9^QMh(d;
zZ~SgQv^e%TStzqFbHu%l#?#BWXWv_b@jMUvW6B20^D7T{=#5bZE<DIM$l-m;#Qmmd
zr$8D<`-#@`M%w`EV_83&c9|t(!#>=of;Tytjy(?X+{^4Lsm+>YGnBqmH;-h;z6g|<
zD~fA9D4`pgB-8(S-EIpnGM%;hY#7^~&hbpq`@wI7PVLuzn!~rAB+OQgb7l&~eDuSV
z#!;^&I%n_5>?&fK8E0#?8}{garXQ<3UXR?-=wjlo^Spmk^4^(NL+(pAM~t3bC&GiH
z8Iy_q+3*zA7jTOf8k_D&i)^OI2SSIAGR1?eiK0_q&mM2MPX^7N3%Ol5(4S(vLF-?U
zLs!tR5LZ<rd2z1YxDfP$R`V?{7PgsSM7V?$!%KpGKebsRGp2|_F09ipXt~B}N>Q8e
z=Y0{H&)gO*T|aMR;wOjG6o~uSbkqjpd*#gJ=@@(!r?oeViq)FC{)69ByJRZsGw$OV
zmT!S_%(fx6_kD4{b@GfY#t7lW(|S{1PqDxsq}D+Q@*3FixV>N>opZf+7CUr@UHRtG
zNbFn8BN3#irCeOqo%>Y}MAp^oQWGg{N#BHKwuL>Zp{EKaH(KQ+A@oPOkThio@|v%i
zSke~wkS7$mHY(`}UBjYvy^<fSMxAaD-(O{0O32}KH`A(Uo+0apPU7_hYZ{X*(G3$*
zbVI*9-raHo=~Gr6$9V$^BECY>TccJ6{PD{aE6?fDMh^VdB0HUG<Q(WBb>)E8$hvHa
zS0ll13m%OyX^&+n@|W^EZ`G`<3STKwtlEl0%MT;SDz_afRa$1}<TEsEa4VS~X&pfq
zuY6g6Q5|h+INwX%ZhGpW#ctg&uEJwp6nKF5;);VGzB$hG#2A=H3FN_1xc5%VGKu$9
zfa&<#9PVgk9*g7LdI@?&T~pDh2!>Z0w~l`J@0lIPAoHbxcMA4V_kKkWD`%Ux;TzyI
zGwd|CUaxGyZphl7TaaZt33)<JT2HzVSEc%iW}|(J;h1tXme#LjWFf(d&?%kUbIOcE
z=Yq5N{*#{afkm9YZpIJ$FY@hC_m+tseT`qR$p0|KZ!T%PCaA_yIVQQD=UqP6=J!q$
zBFw1TTcrk1S<zBQCRBMYJVs_unC;wj>*PxJ%e+jRyB*w}(`&E~t-6ybURG3KI|bC_
zkSQkJXH}3&YM`2<APJ1jb_(v`F6S}y-lw)1vcG<N_y(`67&YCMZeM|itMo4Kn%0>Q
zD6XN`;==rH(kCSOVT@STc+1^<@#g8v$wcN%w(eexx5ax749Q+D8BfEK!pw^#)~Uit
zST8WM^{>t?pV7`uX+2Q<=w`n6ss(9$PT19wT}a?EsgeCP82%xjT<cDnBjwMB(rLtW
zmhW0OvUl>T`bf_-tbdsLTG{5Etv$%hJ?in5(ui?$Ea*YraG$}@W0aKWNo!%Exo_gW
z;orEUeEn?NpqE~ym)Ltc?!_Tvf$>{4HR6LDlOOr%k?%)S%%XpgZMVLkH&MD-mv=zl
zfHl^Rw@sgq8F2M`|3aDM{x*Aw1^4-xbz;9nEq@HRNO!485X|yBW}J)kolnx(2u+u@
z9@D(=>qIJRd}-u}ayJi$M-noaF4oerde0RPPP|HcUo<4Xp3T>E^%PUR5ldkd)+~-j
z6g_!BuJBpf0K1wwRqL7LspYfkPI;z%lO7!15f16$4Hk~4oAVQHtlcNh@&vplY&Rqo
zNg3^zb#5JHe2}1!Ki3q{J1nukSzmQki0^pjcT<Q8ys!85L$Mo70ZY?HdZNEh?lvJ#
z@I3FBy1s2Kp0+5TD^g{7rlH;HuH34kqZP{9S)dB%ThgX^XY!QHJd0X|?29@H&duA|
z&9t<yCe=FHrv_U0*pzSXKAgUuz3wpIAL?^zpgp8Ukvfnoa*=*qYO5|YO#@?LzAJ8g
z)z`q&wlu4DK`l;QK6U4zwfFo8Lm`e&*SC+_=L)TUtPR^HWW=oTQ+|d)BGzl+R}=dy
z?_ztA^y=H)HQhN-t*s!Xby9D2a@8s&XZh?@R%Y}8_drtj(foF^L+#s<v_khU@0nAC
zwZp@|KEAd)HM$7*cnwQVWY;)Zc}(v5cs%xVOlqF>cO?DAReg$@?|3+dRtEMCc7}SE
z{}{A>VTOl;fcWWn>HaaOLC32>hvehcqN9T}=-@~J)Bqe+#K*^v8sdX#(DCylQA7L!
z|1LxR`77_A2FO2U{}>WPqJ{(zT69`;|GgLF|Lz4fX4HfK^Eax`|DneJxjkx1|Ht<K
z>;pB`f7QHP@^_7YKK@6_|DN$rtbfc&VPPI+R~thfHAO=s9wk#ZLplT`z@w@`2ctvK
z!L{gk6s@ftP!D+N6nUhrjI8P4f109(#WjQ_CGNv``S}nKFTVtw9|3=GA0hY<666;b
zfIk$F<mDIrpKG8xkTkV$Ftp>5w9s=f6gSkjHZbIoF|;yvFrgDb3W$p0;rx3!aEap^
zF#JIJ46SMEpbD!fdFa&-#JgPGB!&PS^D<`k?|x%LuWrBd8)G1S@tM_UEM3>Rn3a7f
zID2E<IG$i9PRMwE^u%y|LxsIRGlQ+)edY1jXu>Gv^d)VRK~g2$LSmdXY_hw3`bM8;
z3Ld1rB7Y<3*Fod7q_H^sbWt#`%N&iFZ2702%x7n+sG?Ac<97C4>u2PMqMrlH-^}f4
zI+`#AiZ?GfQZi}#<V`7_ncZyjD}@ovOK`NkJ<3BPt$(0K9;mCp`DQ?k`biSrr+P&-
z%3r$boNwpyuBg;MHVOQghco<{bgdixcK7x8ZcMFiys<oXH(kD6@!M{nk;l4t-$M#>
znFl0wz6UyX5v+U`8)(8Bx`~lX$lpkJLtKQvCUxjWtUj)Ve^wqz(S3}fx}qWU5q+uE
z*EK^FWxA|ybX5^AoLAq7<g&cc<5~5s$fMnFI2S;}p1Mcv<7-Qf-!MfU!f1Onq!uPX
z*)%N>_j1OTL2{BQ&gZ=?yIhm00EY22XHflbazFd7?Ua$L?EJ3+WRu{OWYY|qK~HSh
zxa;OhuBcA4YX+O!aAnt#Ylh~Ppd+W5M}stY@rz#wTh*<VU^q{6%T$k*U=!Q&tk*G(
zhPFicU~kgV)YiFN4V{Jwhp=oiCDiu1U@2{JCe-ajy;*WW=b64!9;CX-u3fj{f?++)
zRUYiT$vs|oa)?Gd&FB#%zsV+Aw{Unx=LJ?}?O>F@^(I$k9qI#j1cx+YQ&OivI-68p
z4I78V-jh71!Ged_7s(l6pD1y340w;p&H~RRE|R^ws}^;3dHydwyoMTcEg3}+YRn8C
z;Wht!I2A}KdC1^PSlO)mng>q)r(e{=|LFJMmH$k}rUX`s_n*ptrfMqihU8(gi$dK$
z+x~g@uW9}>s;R(f2|UJsjQQV{|0_`~Sp%xye~jvyPT#+#>R<i7=1KWxJWL-v$0=MO
z^*Jv)Iw)3~JkiLEQu2k<*KOK*-Ln+nD%i!&&<GEQ7qyNMME&=#FSsBbKOY~&NB8GX
z|N4TV)@gsf=&b&$3<l@BTqb~8v;0->AM3jREE7PY#{O450YP5)-(`ZZ%k=~yUc{w(
ze2B|sNZw0*2%;MQ)fUFf%lCI%RGGl#GWZ{^{v!vB7lK_LA8OWJswV(lDuW~VFO3h*
ze|dbUd)EIw?_qEO#O3h`@(KRkFT{(y)CUBI{(g>7a~XCiFQ35W9Q>D`4;U)i-_wVh
zNvQdJX&FNGd$|k_D1!iH2tXMhpo||-h6I!e0LlQ%Jpzye0my*><Ul<~m-3>P)5~Rm
z90))T1Rw_jkOKk8!3W5}2gtz($bni8E=@D)HFLQPkOQ?Xb-5lO2Ol5@YI(lYmLHIV
zACQ9|kb@tPgCCHCACLp}TD?3ifE@gQ9Q=SBNI(uGAO{kV0}0511mr*ha-h}+mvSHh
zIZ)flm+Jv?2mo@R47k)*0FXlfkOQ^ez1$X%LjaIN0FXlfkV62FLlBTd5RgL<kV6oV
zLlBTd5RgL<kV6oVLlBTd@Ny26T$kn_jQ4U57|Pne=QBz(K-uLSFqEs8+g{EA<Gq{%
z#(Oyjj2Dmt<=~}0V1OJjKn@rn2g<k0;{)V?0dk<WeJ-~J<bVNkzyUenfE;i@4mcnO
z9FPOG9d>E@;D8*c?TE|u0675Le=vaOFo5SU2#^D{?Q^ML2#^D{t#`Q|AO~uD<8nPf
z4hZmk06d2QJcj{1hXFi?0X&Bx0674j!vLPc0G`7Dp2Gm1!vLPc0G`7Dp2Gm1!}tI>
z0G`A60M7@&a~QyL7{GHFz;hVDa~QyL7{GHFz;hVDa~SHl&86iE;5iK7ISk-A4B$Bo
z;5iK7ISk-A4B$Bo;5iK7ISk-A3<-FB06d2QJcj{1N9_q*o__$(VFG~V3g9^m;5iK7
zISk-A4B$Bo;5iK7ISk-A4B$Bo;5iK7ISk-A4B$Bo;5i)NIUL|Q9N;+|;5i(%@BH^N
z4hMJ+=e@k%g#$c?13ZTVJck23hXXu^13ZTVJck23hr=!}S8#ymaDeA<*kwQ90MFq7
z&*1>i;Q-I!0MFq7&*1>i;Q-I!0MFs5qq&!!Ux4RufaiaX3tVao@Ei{C91idt4)FZX
zF_g=FTwd?OAwUj*=Wu}MaDeA<fah?4=Wu}MaDeBigEyCR06d2SJck23hXXu^13ZTV
zJck23hXXu^13ZTVJck23hXXu^13ZTVJVzbWx;(D{p2Gp2quzx8v<1A*0iMGFp2Gp2
z!vUVd0iMGFo}&)VU7kLG=Wu}MaDeA<fah?4=YQVEy7W2%cn$}64hMJ+2Y3z#c#b;o
zc)4GI=Wu}MaDeA<fah?4=Wu}MaDeA<fah?4=Wu}MaDeA<fah?4=Wu}MaDe9!z;g)T
zIRx+=0(k!CSoq(|0R-?I^^P8(-sSZ!1n?XJc>d?O_oaRTo<ji7A%N!)z;g)TIRx+=
z0(cGqJcj_DLjcbqfaegvbJT(K%ku!>IRx+=0(cGqJcj_DLjcbqfaegva|qx$1n?XJ
zcn$$PhX9`cc|YLNvH|cM0(cGqJcj_DLjcbqfaegva|qx$1n?XJcn$$PhX9^K0MAix
zvRs}o0M8+S=Mcbi2;eyc@Eiho4goxe0G>kt&mn;45WsT?;5h{F90Kg0Ljcbqfaegv
za|qx$1n?XJcn$$PhX9^K0M8+S=Mcbi2;eyc@Eiho4goxe0G>kt&mn;45WsT?;5h{F
z90GU_0X&BQo<ji7A%N!)z;g)TIRx+=0(g!(ndS0p5a2lk@EihojsSR$0C<i7c#Z&g
zjsSR$0C<i7c#Z(<pCbUCBLJQw0G=ZNo+ALBBLJQw0G=ZNo+ALBBLJQw0G=ZNo+ALB
zBLJQw0G=ZNo+ALBBLJQw0G=ZNo+ALBBLJSGPQkjgoTA=U|EmnOH;r=ZKhH@(?Kz{Y
z_^S-{RQ^?ldLssbdcFNeJye+hYT5nIGCtIyufN;!p$;Sc)i3Hm^Phd3|7z=Cr)O$m
zXorV`Iv?#{XQ2uHKR@;CpC^~;**X04v=-EXZj_ODICt(y$V=kk{QF!NUOb%tFH^_!
i)Xva}?w=F&kJIy%94t`}|ID|)C!8OS+8O!Rg#RxEyQYBv

literal 0
HcmV?d00001


From 3f1432abe75cc306ef90a04381d7e0d8739fded8 Mon Sep 17 00:00:00 2001
From: Lee Killough <15950023+leekillough@users.noreply.github.com>
Date: Mon, 3 Apr 2023 12:10:59 -0500
Subject: [PATCH 144/230] Add output.testsuite to .gitignore (#736)

Details:
- Added `output.testsuite` to .gitignore since it was previously not
  being matched by `output.testsuite.*`.
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 6d51f6f51..5255bcb73 100644
--- a/.gitignore
+++ b/.gitignore
@@ -44,6 +44,7 @@ include/*/*.h
 # -- misc. --
 
 # BLIS testsuite output file
+output.testsuite
 output.testsuite.*
 
 # BLAS test output files

From aea8e1d9243631635ca788d5e14f0f29328e637d Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 3 Apr 2023 12:17:51 -0500
Subject: [PATCH 145/230] Optionally disable thread-local storage. (#735)

Details:
- Implemented a new configure option, --disable-tls, which allows the
  user to optionally disable the use of thread-local storage qualifiers
  on static variables in BLIS. This option will rarely be needed, but
  in some situations may allow BLIS to compile when TLS is unavailable.
  Thanks to Nick Knight for suggesting this option.
- Unlike the --disable-system option, --disable-tls does not forcibly
  disable threading. Instead, warnings of the possible consequences of
  using threading with TLS disabled are added to:
  - the output of './configure --help';
  - the output of 'configure' when the --disable-tls option is parsed;
  - the informational header output by the testsuite.
  Thanks to Minh Quan Ho for suggesting these warnings.
- Modified frame/include/bli_lang_defs.h so that BLIS_THREAD_LOCAL is
  defined to nothing when BLIS_ENABLE_TLS is not defined.
- Defined bli_info_get_enable_tls(), which returns whether the cpp macro
  BLIS_ENABLE_TLS was defined.
- Edited --disable-system configure status output for clarity.
- Whitespace updates.
---
 build/bli_config.h.in         |  6 ++++++
 configure                     | 38 ++++++++++++++++++++++++++++++++++-
 frame/base/bli_info.c         | 12 +++++++++--
 frame/base/bli_info.h         |  1 +
 frame/include/bli_lang_defs.h |  5 ++++-
 testsuite/src/test_libblis.c  | 14 +++++++++++++
 6 files changed, 72 insertions(+), 4 deletions(-)

diff --git a/build/bli_config.h.in b/build/bli_config.h.in
index 9636278d9..8a6baee87 100644
--- a/build/bli_config.h.in
+++ b/build/bli_config.h.in
@@ -53,6 +53,12 @@
 #define BLIS_DISABLE_SYSTEM
 #endif
 
+#if @enable_tls@
+#define BLIS_ENABLE_TLS
+#else
+#define BLIS_DISABLE_TLS
+#endif
+
 #if @enable_openmp@
 #define BLIS_ENABLE_OPENMP
 #if @enable_openmp_as_def@
diff --git a/configure b/configure
index a953c25c5..cd296cabd 100755
--- a/configure
+++ b/configure
@@ -199,6 +199,19 @@ print_usage()
                  library. When disabled, this option also forces the use
                  of --disable-threading.
 
+   --enable-tls, --disable-tls
+
+                 Enable thread-local storage (TLS) for static variables
+                 in BLIS. The default state is enabled. However, like with
+                 --disable-system, there may be rare situations, such as
+                 when --disable-system is appropriate, where thread-local
+                 storage is unsupported by the compiler. In those cases,
+                 disabling TLS may be a suitable workaround.
+		         WARNING: DISABLING TLS IS DANGEROUS AND MAY CAUSE RACE
+                 CONDITIONS! Please try combining --disable-tls with
+                 --disable-threading if you suspect any correctness or
+                 deadlock issues.
+
    --disable-pba-pools, --enable-pba-pools
    --disable-sba-pools, --enable-sba-pools
 
@@ -2512,6 +2525,9 @@ main()
 	# The system flag.
 	enable_system='yes'
 
+	# The thread-local storage flag.
+	enable_tls='yes'
+
 	# The threading flag.
 	threading_model='off'
 
@@ -2692,6 +2708,13 @@ main()
 							enable_system='no'
 							;;
 
+						enable-tls)
+							enable_tls='yes'
+							;;
+						disable-tls)
+							enable_tls='no'
+							;;
+
 						enable-threading=*)
 							threading_model=${OPTARG#*=}
 							;;
@@ -3543,13 +3566,25 @@ main()
 		enable_system_01=1
 	else
 		echo "${script_name}: disabling operating system support."
-		echo "${script_name}: WARNING: all threading will be disabled!"
+		echo "${script_name}: WARNING: disabling OS support forcibly disables all threading!"
 		enable_system_01=0
 
 		# Force threading to be disabled.
 		threading_model='off'
 	fi
 
+	# Check if we are building with or without thread-local storage support.
+	if [[ ${enable_tls} = yes ]]; then
+		echo "${script_name}: enabling thread-local storage (TLS) support."
+		enable_tls_01=1
+	else
+		echo "${script_name}: disabling thread-local storage (TLS) support."
+		echo "${script_name}: WARNING: THIS IS DANGEROUS! Disabling TLS may cause race conditions!"
+		echo "${script_name}: WARNING: Please try --disable-threading if you suspect any correctness"
+		echo "${script_name}: WARNING: or deadlock issues."
+		enable_tls_01=0
+	fi
+
 	# Check the threading model flag and standardize its value, if needed.
 	# Note that single-threaded mode will always be enabled, but not necessarily
 	# by default.
@@ -4202,6 +4237,7 @@ main()
 	| sed >"${bli_config_h_out_path}"                                    \
 	-e "s/@version@/${version_esc}/g"                                    \
 	-e "s/@enable_system@/${enable_system_01}/g"                         \
+	-e "s/@enable_tls@/${enable_tls_01}/g"                               \
 	-e "s/@enable_openmp@/${enable_openmp_01}/g"                         \
 	-e "s/@enable_openmp_as_def@/${enable_openmp_as_def_01}/g"           \
 	-e "s/@enable_pthreads@/${enable_pthreads_01}/g"                     \
diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c
index 3fc76b978..53ec287cb 100644
--- a/frame/base/bli_info.c
+++ b/frame/base/bli_info.c
@@ -103,10 +103,10 @@ gint_t bli_info_get_enable_sba_pools( void )
 }
 gint_t bli_info_get_enable_threading( void )
 {
-	if ( bli_info_get_enable_openmp() ||
+	if ( bli_info_get_enable_openmp()   ||
 	     bli_info_get_enable_pthreads() ||
 	     bli_info_get_enable_hpx() ) return 1;
-	else                                  return 0;
+	else                             return 0;
 }
 gint_t bli_info_get_enable_openmp( void )
 {
@@ -180,6 +180,14 @@ gint_t bli_info_get_thread_jrir_tlb( void )
 	return 0;
 #endif
 }
+gint_t bli_info_get_enable_tls( void )
+{
+#ifdef BLIS_ENABLE_TLS
+	return 1;
+#else
+	return 0;
+#endif
+}
 gint_t bli_info_get_enable_memkind( void )
 {
 #ifdef BLIS_ENABLE_MEMKIND
diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h
index 300b3f584..50c337fea 100644
--- a/frame/base/bli_info.h
+++ b/frame/base/bli_info.h
@@ -77,6 +77,7 @@ BLIS_EXPORT_BLIS gint_t bli_info_get_enable_hpx_as_default( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_thread_jrir_slab( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_thread_jrir_rr( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_thread_jrir_tlb( void );
+BLIS_EXPORT_BLIS gint_t bli_info_get_enable_tls( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void );
 BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void );
 
diff --git a/frame/include/bli_lang_defs.h b/frame/include/bli_lang_defs.h
index 8cf3f9986..5eb69a766 100644
--- a/frame/include/bli_lang_defs.h
+++ b/frame/include/bli_lang_defs.h
@@ -73,7 +73,10 @@
 // doesn't support __thread, as __GNUC__ is not quite unique to GCC.
 // But the possibility of someone using such non-main-stream compiler
 // for building BLIS is low.
-#if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__)
+#if defined(BLIS_ENABLE_TLS) && ( defined(__GNUC__)  || \
+                                  defined(__clang__) || \
+                                  defined(__ICC)     || \
+                                  defined(__IBMC__) )
   #define BLIS_THREAD_LOCAL __thread
 #else
   #define BLIS_THREAD_LOCAL
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index 8656652b3..eee28bdaf 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -867,6 +867,9 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	bli_rntm_set_ways_for_op( BLIS_TRSM, BLIS_LEFT,  m, n, k, &trsm_l );
 	bli_rntm_set_ways_for_op( BLIS_TRSM, BLIS_RIGHT, m, n, k, &trsm_r );
 
+	const bool tls_enabled = bli_info_get_enable_tls();
+	const bool thr_enabled = bli_info_get_enable_threading();
+
 	// Output some system parameters.
 	libblis_test_fprintf_c( os, "\n" );
 	libblis_test_fprintf_c( os, "--- BLIS library info -------------------------------------\n" );
@@ -916,6 +919,17 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	libblis_test_fprintf_c( os, "\n" );
 	libblis_test_fprintf_c( os, "--- BLIS parallelization info ---\n" );
 	libblis_test_fprintf_c( os, "\n" );
+	libblis_test_fprintf_c( os, "thread-local storage (TLS)     %d\n", ( int )tls_enabled );
+	if ( !tls_enabled && thr_enabled )
+	{
+	libblis_test_fprintf_c( os, "\n" );
+	libblis_test_fprintf_c( os, "[WARNING] BLIS was compiled with TLS disabled. We assume you know what\n" );
+	libblis_test_fprintf_c( os, "[WARNING] you're doing! Multithreaded race conditions, correctness\n" );
+	libblis_test_fprintf_c( os, "[WARNING] issues, and deadlocks may occur. If any of these happen,\n" );
+	libblis_test_fprintf_c( os, "[WARNING] please consider reconfiguring with --disable-threading.\n" );
+
+	}
+	libblis_test_fprintf_c( os, "\n" );
 	libblis_test_fprintf_c( os, "multithreading modes           %s\n", impl_str );
 	libblis_test_fprintf_c( os, "  default mode                 %s\n", def_impl_unset_str );
 	libblis_test_fprintf_c( os, "  current mode                 %s\n", def_impl_set_str );

From 259f68479671bbaf9c5986759aaa0004f9b05a24 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 7 Apr 2023 16:11:34 -0500
Subject: [PATCH 146/230] CREDITS file update.

Details:
- Added attributions associated with commits:
  - 98d4678 9b1beec: @bartoldeman
  - 2b05948 059f151: @ct-clmsn
- Reordered attribution for @decandia50.
---
 CREDITS | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CREDITS b/CREDITS
index 689afd599..af4deec08 100644
--- a/CREDITS
+++ b/CREDITS
@@ -24,9 +24,10 @@ but many others have contributed code, ideas, and feedback, including
   Robin Christ             @robinchrist
   Dilyn Corner             @dilyn-corner
   Mat Cross                @matcross                  (NAG)
-                           @decandia50
+                           @ct-clmsn
   Harsh Dave               @HarshDave12               (AMD)
   Tim Davis                @DrTimothyAldenDavis       (Texas A&M University)
+                           @decandia50
   Daniël de Kok            @danieldk                  (Explosion)
   Kay Dewhurst             @jkd2016                   (Max Planck Institute, Halle, Germany)
   Jeff Diamond                                        (Oracle)
@@ -78,6 +79,7 @@ but many others have contributed code, ideas, and feedback, including
                            @nagsingh
   Bhaskar Nallani          @BhaskarNallani            (AMD)
   Stepan Nassyr            @stepannassyr              (Jülich Supercomputing Centre)
+  Bart Oldeman             @bartoldeman
   Nisanth M P              @nisanthmp
   Nisanth Padinharepatt                               (AMD)
   Ajay Panyala             @ajaypanyala

From 593d01761910af6a9a16ee0ac097142732f73c29 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Sat, 8 Apr 2023 16:44:16 -0500
Subject: [PATCH 147/230] CREDITS file update.

---
 CREDITS | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CREDITS b/CREDITS
index af4deec08..817bf700c 100644
--- a/CREDITS
+++ b/CREDITS
@@ -24,7 +24,6 @@ but many others have contributed code, ideas, and feedback, including
   Robin Christ             @robinchrist
   Dilyn Corner             @dilyn-corner
   Mat Cross                @matcross                  (NAG)
-                           @ct-clmsn
   Harsh Dave               @HarshDave12               (AMD)
   Tim Davis                @DrTimothyAldenDavis       (Texas A&M University)
                            @decandia50
@@ -55,7 +54,7 @@ but many others have contributed code, ideas, and feedback, including
   Francisco Igual          @figual                    (Universidad Complutense de Madrid)
   Madeesh Kannan           @shadeMe
   Tony Kelman              @tkelman
-  Lee Killough             @leekillough               (Cray)
+  Lee Killough             @leekillough               (Tactical Computing Labs)
   Mike Kistler             @mkistler                  (IBM, Austin Research Laboratory)
   Nick Knight              @nick-knight               (SiFive)
   Ivan Korostelev          @ivan23kor                 (University of Alberta)
@@ -113,6 +112,7 @@ but many others have contributed code, ideas, and feedback, including
   Vladimir Sukarev
   Harihara Sudhan S        @ihariharasudhan           (AMD)
   Chengguo Sun             @chengguosun
+  Christopher Taylor       @ct-clmsn                  (Tactical Computing Labs)
   Santanu Thangaraj                                   (AMD)
   Nicholai Tukanov         @nicholaiTukanov           (The University of Texas at Austin)
   Rhys Ulerich             @RhysU                     (The University of Texas at Austin)

From 6b38c5ac07a2a27738674784e58aa699bf895447 Mon Sep 17 00:00:00 2001
From: angsch <17718454+angsch@users.noreply.github.com>
Date: Tue, 11 Apr 2023 19:27:43 +0200
Subject: [PATCH 148/230] Add RISC-V target (#693)

Details:
- There are four RISC-V base configurations: 'rv32i', 'rv32iv', 'rv64i',
  and 'rv64iv', namely the 32-bit and 64-bit implementations with and
  without the 'V' vector extension. Additional extensions such as 'M'
  (multiplication), 'A' (atomics), 'F' ('float' hardware support), 'D'
  ('double' hardware support), and 'C' (compressed-length instructions),
  are automatically used when available. If they are not available, then
  software equivalents (e.g., softfloat and -latomic) are used.
- './configure auto' can be invoked on a RISC-V build platform, and will
  automatically detect RISC-V CPU extensions through the RISC-V C API:
  https://github.com/riscv-non-isa/riscv-c-api-doc/blob/master/riscv-c-api.md
- The assembly kernels assume the presence of the vector extension
  RVV 1.0.
- It is possible to build 'rv[32,64]iv' for any value of VLEN.
  However, if VLEN < 128, the targets will fall back to the generic
  kernels and blocksizes.
- The vector microkernels are vector-length agnostic and work with
  every VLEN >=128, but are expected to work best with smaller vector
  lengths, i.e., VLEN <= 512.
- The assembly kernels cover column major storage (rs_c == 1).
- The blocksizes aim at being a good generic choice for out-of-order
  cores. They are not tuned to a specific RISC-V HPC core.
- The vector kernels have been tested using vlen={128,256,512}.
- The single- and double-precision assembly code routines for 'sgemm'
  and 'dgemm', or for 'cgemm' and 'zgemm', are combined in their RISC-V
  vector assembly source code, and are differentiated only with macros.
- The XLEN=32 and XLEN=64 versions of the RISC-V assembly code are
  identical, except that callee-saved registers are saved and restored
  differently. There are RISC-V assembly code #include files for
  handling the saving and restoring of callee-saved registers, and they
  are future-proof if ever XLEN=128.
- Multiplications, such as computing array strides and offsets, are
  performed in C, and later passed to the RISC-V assembly kernels. This
  is so that the compiler can determine whether the 'M' (multiply)
  extension is available and use multiplication instructions, or call
  library helper functions instead.
- A new macro called bli_static_assert() has been added to perform
  static assertions at compile-time, regardless of the C/C++ dialect of
  the compiler. The original motivation of this was to ensure that
  calling RISC-V assembly kernels would not silently truncate arguments
  of type 'dim_t' or 'inc_t' (so-called "narrowing conversions").
- RISC-V CI tests have been added to Travis CI, using the
  riscv-gnu-toolchain cross-compiler, and qemu simulator.
- Thanks to Lee Killough for collaborating on this commit.
---
 .travis.yml                               |  37 +-
 CREDITS                                   |   1 +
 config/rv32i/bli_cntx_init_rv32i.c        |  44 ++
 config/rv32i/bli_kernel_defs_rv32i.h      |  43 ++
 config/rv32i/make_defs.mk                 |  94 +++
 config/rv32iv/bli_cntx_init_rv32iv.c      | 109 +++
 config/rv32iv/bli_kernel_defs_rv32iv.h    |  43 ++
 config/rv32iv/make_defs.mk                |  96 +++
 config/rv64i/bli_cntx_init_rv64i.c        |  44 ++
 config/rv64i/bli_kernel_defs_rv64i.h      |  43 ++
 config/rv64i/make_defs.mk                 |  92 +++
 config/rv64iv/bli_cntx_init_rv64iv.c      | 114 +++
 config/rv64iv/bli_kernel_defs_rv64iv.h    |  42 ++
 config/rv64iv/make_defs.mk                |  93 +++
 config_registry                           |   8 +-
 configure                                 |  21 +-
 frame/base/bli_arch.c                     |  19 +
 frame/base/bli_gks.c                      |  26 +
 frame/base/bli_riscv_cpuid.h              |  67 ++
 frame/base/bli_riscv_detect_arch.h        | 155 +++++
 frame/include/bli_arch_config.h           |  22 +
 frame/include/bli_misc_macro_defs.h       |   4 +-
 frame/include/bli_type_defs.h             |   6 +
 kernels/rvi/bli_kernels_rvi.h             |  33 +
 kernels/rviv/3/bli_cgemm_rviv_4vx4.c      |  79 +++
 kernels/rviv/3/bli_cgemm_rviv_asm_4vx4.S  |  45 ++
 kernels/rviv/3/bli_czgemm_rviv_asm_4vx4.h | 801 ++++++++++++++++++++++
 kernels/rviv/3/bli_dgemm_rviv_4vx4.c      |  79 +++
 kernels/rviv/3/bli_dgemm_rviv_asm_4vx4.S  |  45 ++
 kernels/rviv/3/bli_rviv_utils.h           |  46 ++
 kernels/rviv/3/bli_sdgemm_rviv_asm_4vx4.h | 627 +++++++++++++++++
 kernels/rviv/3/bli_sgemm_rviv_4vx4.c      |  80 +++
 kernels/rviv/3/bli_sgemm_rviv_asm_4vx4.S  |  45 ++
 kernels/rviv/3/bli_zgemm_rviv_4vx4.c      |  80 +++
 kernels/rviv/3/bli_zgemm_rviv_asm_4vx4.S  |  44 ++
 kernels/rviv/3/rviv_restore_registers.h   |  77 +++
 kernels/rviv/3/rviv_save_registers.h      |  77 +++
 kernels/rviv/bli_kernels_rviv.h           |  38 +
 travis/do_riscv.sh                        |  36 +
 39 files changed, 3444 insertions(+), 11 deletions(-)
 create mode 100644 config/rv32i/bli_cntx_init_rv32i.c
 create mode 100644 config/rv32i/bli_kernel_defs_rv32i.h
 create mode 100644 config/rv32i/make_defs.mk
 create mode 100644 config/rv32iv/bli_cntx_init_rv32iv.c
 create mode 100644 config/rv32iv/bli_kernel_defs_rv32iv.h
 create mode 100644 config/rv32iv/make_defs.mk
 create mode 100644 config/rv64i/bli_cntx_init_rv64i.c
 create mode 100644 config/rv64i/bli_kernel_defs_rv64i.h
 create mode 100644 config/rv64i/make_defs.mk
 create mode 100644 config/rv64iv/bli_cntx_init_rv64iv.c
 create mode 100644 config/rv64iv/bli_kernel_defs_rv64iv.h
 create mode 100644 config/rv64iv/make_defs.mk
 create mode 100644 frame/base/bli_riscv_cpuid.h
 create mode 100644 frame/base/bli_riscv_detect_arch.h
 create mode 100644 kernels/rvi/bli_kernels_rvi.h
 create mode 100644 kernels/rviv/3/bli_cgemm_rviv_4vx4.c
 create mode 100644 kernels/rviv/3/bli_cgemm_rviv_asm_4vx4.S
 create mode 100644 kernels/rviv/3/bli_czgemm_rviv_asm_4vx4.h
 create mode 100644 kernels/rviv/3/bli_dgemm_rviv_4vx4.c
 create mode 100644 kernels/rviv/3/bli_dgemm_rviv_asm_4vx4.S
 create mode 100644 kernels/rviv/3/bli_rviv_utils.h
 create mode 100644 kernels/rviv/3/bli_sdgemm_rviv_asm_4vx4.h
 create mode 100644 kernels/rviv/3/bli_sgemm_rviv_4vx4.c
 create mode 100644 kernels/rviv/3/bli_sgemm_rviv_asm_4vx4.S
 create mode 100644 kernels/rviv/3/bli_zgemm_rviv_4vx4.c
 create mode 100644 kernels/rviv/3/bli_zgemm_rviv_asm_4vx4.S
 create mode 100644 kernels/rviv/3/rviv_restore_registers.h
 create mode 100644 kernels/rviv/3/rviv_save_registers.h
 create mode 100644 kernels/rviv/bli_kernels_rviv.h
 create mode 100755 travis/do_riscv.sh

diff --git a/.travis.yml b/.travis.yml
index b177bb23a..848cb1843 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -27,7 +27,8 @@ matrix:
   - os: linux
     compiler: clang
     env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="auto"
-      # There seems to be some difficulty installing 2 Clang toolchains of different versions.
+      # There seems to be some difficulty installing two Clang toolchains of
+      # different versions.
       # Use the TravisCI default.
       # PACKAGES="clang-8 binutils"
   # macOS with system compiler (clang)
@@ -71,6 +72,20 @@ matrix:
       CC=aarch64-linux-gnu-gcc-10 CXX=aarch64-linux-gnu-g++-10 \
       PACKAGES="gcc-10-aarch64-linux-gnu g++-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \
       TESTSUITE_WRAPPER="qemu-aarch64 -L /usr/aarch64-linux-gnu/"
+  # The RISC-V targets require the qemu version available in jammy or newer.
+  # When CI is upgraded, the packages should be activated and do_riscv.sh
+  # cleaned up.
+  # PACKAGES="qemu-user qemu-user-binfmt"
+  - os: linux
+    compiler: riscv64-unknown-linux-gcc
+    env: OOT=0 TEST=FAST SDE=0 THR="none" BLD="--disable-shared" CONF="rv64iv" \
+      CC=riscv64-unknown-linux-gnu-gcc \
+      LDFLAGS=-static
+  - os: linux
+    compiler: riscv32-unknown-linux-gcc
+    env: OOT=0 TEST=FAST SDE=0 THR="none" BLD="--disable-shared" CONF="rv32iv" \
+      CC=riscv32-unknown-linux-gnu-gcc \
+      LDFLAGS=-static
 install:
 - if [ "$CC" = "gcc"  ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-9"; fi
 - if [ -n "$PACKAGES" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y $PACKAGES; fi
@@ -79,14 +94,28 @@ script:
 - pwd
 - if [ $OOT -eq 1 ]; then export DIST_PATH=`pwd`; mkdir ../oot; cd ../oot; chmod -R a-w $DIST_PATH; fi
 - pwd
-- $DIST_PATH/configure -p `pwd`/../install -t $THR CC=$CC $CONF
+- if [ "$CONF" = "rv64iv" ]; then
+    $DIST_PATH/travis/do_riscv.sh "$CONF";
+    export CC=$DIST_PATH/../toolchain/riscv/bin/riscv64-unknown-linux-gnu-gcc;
+    export CXX=$DIST_PATH/../toolchain/riscv/bin/riscv64-unknown-linux-gnu-g++;
+    export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv64 -cpu rv64,vext_spec=v1.0,v=true,vlen=128 -B 0x100000";
+  fi
+- if [ "$CONF" = "rv32iv" ]; then
+    $DIST_PATH/travis/do_riscv.sh "$CONF";
+    export CC=$DIST_PATH/../toolchain/riscv/bin/riscv32-unknown-linux-gnu-gcc;
+    export CXX=$DIST_PATH/../toolchain/riscv/bin/riscv32-unknown-linux-gnu-g++;
+    export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv32 -cpu rv32,vext_spec=v1.0,v=true,vlen=128 -B 0x100000";
+  fi
+- $DIST_PATH/configure -p `pwd`/../install -t $THR $BLD CC=$CC $CONF
 - pwd
 - ls -l
 - $CC --version
+- $CC -v
 - make -j 2
 - make install
-- $DIST_PATH/travis/cxx/cxx-test.sh $DIST_PATH $(ls -1 include)
-# Qemu SVE is failing sgemmt in some cases. Skip as this issue is not observed on real chip (A64fx).
+- if [ "$BLD" = "" ]; then $DIST_PATH/travis/cxx/cxx-test.sh $DIST_PATH $(ls -1 include); fi
+# Qemu SVE is failing sgemmt in some cases. Skip as this issue is not observed
+# on real chip (A64fx).
 - if [ "$CONF" = "armsve" ]; then sed -i 's/.*\<gemmt\>.*/0/' $DIST_PATH/testsuite/input.operations.fast; fi
 - if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/travis/do_testsuite.sh; fi
 - if [ "$SDE" = "1" ]; then travis_wait 30 $DIST_PATH/travis/do_sde.sh; fi
diff --git a/CREDITS b/CREDITS
index 817bf700c..9ffbee7e7 100644
--- a/CREDITS
+++ b/CREDITS
@@ -99,6 +99,7 @@ but many others have contributed code, ideas, and feedback, including
   Karl Rupp                @karlrupp
   Martin Schatz                                       (The University of Texas at Austin)
   Nico Schlömer            @nschloe
+  Angelika Schwarz         @angsch
   Rene Sitt
   Tony Skjellum            @tonyskjellum              (The University of Tennessee at Chattanooga)
   Mikhail Smelyanskiy                                 (Intel, Parallel Computing Lab)
diff --git a/config/rv32i/bli_cntx_init_rv32i.c b/config/rv32i/bli_cntx_init_rv32i.c
new file mode 100644
index 000000000..84fd2dca6
--- /dev/null
+++ b/config/rv32i/bli_cntx_init_rv32i.c
@@ -0,0 +1,44 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+void bli_cntx_init_rv32i( cntx_t* cntx )
+{
+	// Set default kernel blocksizes and functions.
+	bli_cntx_init_rv32i_ref( cntx );
+
+	// -------------------------------------------------------------------------
+}
diff --git a/config/rv32i/bli_kernel_defs_rv32i.h b/config/rv32i/bli_kernel_defs_rv32i.h
new file mode 100644
index 000000000..fe51f998d
--- /dev/null
+++ b/config/rv32i/bli_kernel_defs_rv32i.h
@@ -0,0 +1,43 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+// Fall through to generic sizes
+
+//#endif
diff --git a/config/rv32i/make_defs.mk b/config/rv32i/make_defs.mk
new file mode 100644
index 000000000..40849ce66
--- /dev/null
+++ b/config/rv32i/make_defs.mk
@@ -0,0 +1,94 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+
+# Declare the name of the current configuration and add it to the
+# running list of configurations included by common.mk.
+THIS_CONFIG    := rv32i
+#CONFIGS_INCL   += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    := -DRISCV_SIZE=32
+# Atomic instructions must be enabled either via hardware
+# (-march=rv32ia) or by linking against libatomic
+CMISCFLAGS     := -march=$(shell $(CC) -E frame/base/bli_riscv_detect_arch.h | grep '^[^\#]') -mabi=ilp32
+CPICFLAGS      :=
+CWARNFLAGS     := -Wall -Wno-unused-function -Wfatal-errors
+
+# In case the A extension is not available
+LDFLAGS        += -latomic
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O2
+endif
+
+# Flags specific to optimized kernels.
+CKOPTFLAGS     := $(COPTFLAGS) -O3
+ifeq ($(CC_VENDOR),gcc)
+CKVECFLAGS     :=
+else
+ifeq ($(CC_VENDOR),clang)
+CKVECFLAGS     :=
+else
+$(error gcc or clang is required for this configuration.)
+endif
+endif
+
+# Flags specific to reference kernels.
+CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
diff --git a/config/rv32iv/bli_cntx_init_rv32iv.c b/config/rv32iv/bli_cntx_init_rv32iv.c
new file mode 100644
index 000000000..dd10a3655
--- /dev/null
+++ b/config/rv32iv/bli_cntx_init_rv32iv.c
@@ -0,0 +1,109 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "../../kernels/rviv/3/bli_rviv_utils.h"
+
+void bli_cntx_init_rv32iv( cntx_t* cntx )
+{
+	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
+
+	// Set default kernel blocksizes and functions.
+	bli_cntx_init_rv32iv_ref( cntx );
+
+	// -------------------------------------------------------------------------
+
+	// A reasonable assumption for application cores is VLEN >= 128 bits, i.e.,
+	// v >= 4. Embedded cores, however, may implement the minimal configuration,
+	// which allows VLEN = 32 bits. Here, we assume VLEN >= 128 and otherwise
+	// fall back to the reference kernels.
+	const uint32_t v = get_vlenb() / sizeof(float);
+
+	if ( v >= 4 )
+	{
+		const uint32_t mr_s = 4 * v;
+		const uint32_t mr_d = 2 * v;
+		const uint32_t mr_c = 2 * v;
+		const uint32_t mr_z = v;
+
+		// Update the context with optimized native gemm micro-kernels.
+		bli_cntx_set_ukrs
+		(
+		  cntx,
+
+		  // level-3
+		  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_rviv_4vx4,
+		  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_rviv_4vx4,
+		  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_rviv_4vx4,
+		  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_rviv_4vx4,
+
+		  BLIS_VA_END
+		);
+
+		// Update the context with storage preferences.
+		bli_cntx_set_ukr_prefs
+		(
+		  cntx,
+
+		  // level-3
+		  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,    FALSE,
+		  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE,   FALSE,
+		  BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
+		  BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+
+		  BLIS_VA_END
+		);
+
+		// Initialize level-3 blocksize objects with architecture-specific values.
+		//                                              s        d        c        z
+		bli_blksz_init_easy( &blkszs[ BLIS_MR ],     mr_s,    mr_d,    mr_c,    mr_z );
+		bli_blksz_init_easy( &blkszs[ BLIS_NR ],        4,       4,       4,       4 );
+		bli_blksz_init_easy( &blkszs[ BLIS_MC ],  20*mr_s, 20*mr_d, 60*mr_c, 30*mr_z );
+		bli_blksz_init_easy( &blkszs[ BLIS_KC ],      640,     320,     320,     160 );
+		bli_blksz_init_easy( &blkszs[ BLIS_NC ],     3072,    3072,    3072,    3072 );
+
+		bli_cntx_set_blkszs
+		(
+		  cntx,
+
+		  // level-3
+		  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
+		  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
+		  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
+		  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
+		  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+
+		  BLIS_VA_END
+		);
+	}
+}
diff --git a/config/rv32iv/bli_kernel_defs_rv32iv.h b/config/rv32iv/bli_kernel_defs_rv32iv.h
new file mode 100644
index 000000000..b17989208
--- /dev/null
+++ b/config/rv32iv/bli_kernel_defs_rv32iv.h
@@ -0,0 +1,43 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+
+
+//#endif
diff --git a/config/rv32iv/make_defs.mk b/config/rv32iv/make_defs.mk
new file mode 100644
index 000000000..3cef697ac
--- /dev/null
+++ b/config/rv32iv/make_defs.mk
@@ -0,0 +1,96 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+
+# Declare the name of the current configuration and add it to the
+# running list of configurations included by common.mk.
+THIS_CONFIG    := rv32iv
+#CONFIGS_INCL   += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    := -DRISCV_SIZE=32
+# Atomic instructions must be enabled either via hardware
+# (-march=rv32iav) or by linking against libatomic
+CMISCFLAGS     := -march=$(shell $(CC) -DFORCE_RISCV_VECTOR -E frame/base/bli_riscv_detect_arch.h | grep '^[^\#]') -mabi=ilp32d
+CPICFLAGS      :=
+CWARNFLAGS     := -Wall -Wno-unused-function -Wfatal-errors
+
+# In case the A extension is not available
+LDFLAGS        += -latomic
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O0
+endif
+
+# Flags specific to optimized kernels.
+CKOPTFLAGS     := $(COPTFLAGS) -O3
+ifeq ($(CC_VENDOR),gcc)
+CKVECFLAGS     :=
+else
+ifeq ($(CC_VENDOR),clang)
+CKVECFLAGS     :=
+else
+$(error gcc or clang is required for this configuration.)
+endif
+endif
+
+# Flags specific to reference kernels.
+CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+# Lower compiler optimization to -O1. At -O3, gcc version 12.0.1 20220505
+# computes offsets for the matrix ab in the ref gemm kernel incorrectly.
+CRVECFLAGS     := $(CKVECFLAGS) -O1
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
diff --git a/config/rv64i/bli_cntx_init_rv64i.c b/config/rv64i/bli_cntx_init_rv64i.c
new file mode 100644
index 000000000..f670e4a57
--- /dev/null
+++ b/config/rv64i/bli_cntx_init_rv64i.c
@@ -0,0 +1,44 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+void bli_cntx_init_rv64i( cntx_t* cntx )
+{
+	// Set default kernel blocksizes and functions.
+	bli_cntx_init_rv64i_ref( cntx );
+
+	// -------------------------------------------------------------------------
+}
diff --git a/config/rv64i/bli_kernel_defs_rv64i.h b/config/rv64i/bli_kernel_defs_rv64i.h
new file mode 100644
index 000000000..fe51f998d
--- /dev/null
+++ b/config/rv64i/bli_kernel_defs_rv64i.h
@@ -0,0 +1,43 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+// Fall through to generic sizes
+
+//#endif
diff --git a/config/rv64i/make_defs.mk b/config/rv64i/make_defs.mk
new file mode 100644
index 000000000..6c69dd84e
--- /dev/null
+++ b/config/rv64i/make_defs.mk
@@ -0,0 +1,92 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+
+# Declare the name of the current configuration and add it to the
+# running list of configurations included by common.mk.
+THIS_CONFIG    := rv64i
+#CONFIGS_INCL   += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    := -DRISCV_SIZE=64
+CMISCFLAGS     := -march=$(shell $(CC) -E frame/base/bli_riscv_detect_arch.h | grep '^[^\#]') -mabi=lp64
+CPICFLAGS      :=
+CWARNFLAGS     := -Wall -Wno-unused-function -Wfatal-errors
+
+# In case the A extension is not available
+LDFLAGS        += -latomic
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O2
+endif
+
+# Flags specific to optimized kernels.
+CKOPTFLAGS     := $(COPTFLAGS) -O3
+ifeq ($(CC_VENDOR),gcc)
+CKVECFLAGS     :=
+else
+ifeq ($(CC_VENDOR),clang)
+CKVECFLAGS     :=
+else
+$(error gcc or clang is required for this configuration.)
+endif
+endif
+
+# Flags specific to reference kernels.
+CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
diff --git a/config/rv64iv/bli_cntx_init_rv64iv.c b/config/rv64iv/bli_cntx_init_rv64iv.c
new file mode 100644
index 000000000..eb1f79ebc
--- /dev/null
+++ b/config/rv64iv/bli_cntx_init_rv64iv.c
@@ -0,0 +1,114 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "../../kernels/rviv/3/bli_rviv_utils.h"
+
+void bli_cntx_init_rv64iv( cntx_t* cntx )
+{
+	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
+
+	// Set default kernel blocksizes and functions.
+	bli_cntx_init_rv64iv_ref( cntx );
+
+	// -------------------------------------------------------------------------
+
+	// A reasonable assumption for application cores is VLEN >= 128 bits, i.e.,
+	// v >= 4. Embedded cores, however, may implement the minimal configuration,
+	// which allows VLEN = 32 bits. Here, we assume VLEN >= 128 and otherwise
+	// fall back to the reference kernels.
+	const uint32_t v = get_vlenb() / sizeof(float);
+
+	if ( v >= 4 )
+	{
+		const uint32_t mr_s = 4 * v;
+		const uint32_t mr_d = 2 * v;
+		const uint32_t mr_c = 2 * v;
+		const uint32_t mr_z = v;
+
+		// TODO: Register different kernels based on the value
+		// of v to avoid MC becoming too big. (e.g. 2vx8)
+
+		// Update the context with optimized native gemm micro-kernels.
+		bli_cntx_set_ukrs
+		(
+		  cntx,
+
+		  // level-3
+		  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_rviv_4vx4,
+		  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_rviv_4vx4,
+		  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_rviv_4vx4,
+		  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_rviv_4vx4,
+
+		  BLIS_VA_END
+		);
+
+		// Update the context with storage preferences.
+		bli_cntx_set_ukr_prefs
+		(
+		  cntx,
+
+		  // level-3
+		  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,    FALSE,
+		  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE,   FALSE,
+		  BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
+		  BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
+
+		  BLIS_VA_END
+		);
+
+		// Initialize level-3 blocksize objects with architecture-specific values.
+		//                                              s        d        c        z
+		bli_blksz_init_easy( &blkszs[ BLIS_MR ],     mr_s,    mr_d,    mr_c,    mr_z );
+		bli_blksz_init_easy( &blkszs[ BLIS_NR ],        4,       4,       4,       4 );
+		bli_blksz_init_easy( &blkszs[ BLIS_MC ],  20*mr_s, 20*mr_d, 60*mr_c, 30*mr_z );
+		bli_blksz_init_easy( &blkszs[ BLIS_KC ],      640,     320,     320,     160 );
+		bli_blksz_init_easy( &blkszs[ BLIS_NC ],     3072,    3072,    3072,    3072 );
+
+		// Update the context with the current architecture's register and cache
+		// blocksizes (and multiples) for native execution.
+		bli_cntx_set_blkszs
+		(
+		  cntx,
+
+		  // level-3
+		  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
+		  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
+		  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
+		  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
+		  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+
+		  BLIS_VA_END
+		);
+	}
+}
diff --git a/config/rv64iv/bli_kernel_defs_rv64iv.h b/config/rv64iv/bli_kernel_defs_rv64iv.h
new file mode 100644
index 000000000..18ca4030e
--- /dev/null
+++ b/config/rv64iv/bli_kernel_defs_rv64iv.h
@@ -0,0 +1,42 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+
+//#endif
diff --git a/config/rv64iv/make_defs.mk b/config/rv64iv/make_defs.mk
new file mode 100644
index 000000000..06545d461
--- /dev/null
+++ b/config/rv64iv/make_defs.mk
@@ -0,0 +1,93 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+
+# Declare the name of the current configuration and add it to the
+# running list of configurations included by common.mk.
+THIS_CONFIG    := rv64iv
+#CONFIGS_INCL   += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    := -DRISCV_SIZE=64
+CMISCFLAGS     := -march=$(shell $(CC) -DFORCE_RISCV_VECTOR -E frame/base/bli_riscv_detect_arch.h | grep '^[^\#]') -mabi=lp64d
+CPICFLAGS      :=
+CWARNFLAGS     := -Wall -Wno-unused-function -Wfatal-errors
+
+# In case the A extension is not available
+LDFLAGS        += -latomic
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O2 -ftree-vectorize
+endif
+
+# Flags specific to optimized kernels.
+CKOPTFLAGS     := $(COPTFLAGS) -O3
+ifeq ($(CC_VENDOR),gcc)
+CKVECFLAGS     :=
+else
+ifeq ($(CC_VENDOR),clang)
+CKVECFLAGS     :=
+else
+$(error gcc or clang is required for this configuration.)
+endif
+endif
+
+# Flags specific to reference kernels.
+CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+# Lower compiler optimization. cinvscalv fails at -O1
+CRVECFLAGS     := $(CKVECFLAGS) -O0
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
diff --git a/config_registry b/config_registry
index f25d66e7f..2138ba515 100644
--- a/config_registry
+++ b/config_registry
@@ -14,7 +14,7 @@ amd64_legacy:   excavator steamroller piledriver bulldozer generic
 amd64:          zen3 zen2 zen generic
 arm64:          armsve firestorm thunderx2 cortexa57 cortexa53 generic
 arm32:          cortexa15 cortexa9 generic
-power:		power10 power9 generic
+power:          power10 power9 generic
 
 # Intel architectures.
 skx:         skx/skx/haswell/zen
@@ -47,5 +47,11 @@ power10:     power10
 power9:      power9
 bgq:         bgq
 
+# RISC-V architectures.
+rv32i:       rv32i/rvi
+rv64i:       rv64i/rvi
+rv32iv:      rv32iv/rviv
+rv64iv:      rv64iv/rviv
+
 # Generic architectures.
 generic:     generic
diff --git a/configure b/configure
index cd296cabd..9fe6bd6c7 100755
--- a/configure
+++ b/configure
@@ -1243,14 +1243,25 @@ auto_detect()
 	# NOTE: -D_GNU_SOURCE is needed to enable POSIX extensions to
 	# pthreads (i.e., barriers).
 
-	cmd="${cc} ${config_defines} \
+	cmd="${cc} \
 	      -DBLIS_CONFIGURETIME_CPUID \
 	      ${c_hdr_paths} \
 	      -std=c99 -D_GNU_SOURCE \
-	      ${cflags} \
-	      ${c_src_filepaths} \
-	      ${ldflags} \
-	      -o ${autodetect_x}"
+	      ${cflags}"
+
+	# Special case for RISC-V, whose architecture can be detected with
+	# preprocessor macros alone. This avoids having to run RISC-V binaries
+	# on a cross-compiler host. Returns "generic" if RISC-V not detected.
+	riscv_config=$(${cmd} -E "${dist_path}/frame/base/bli_riscv_cpuid.h" |
+	               grep '^[^#]')
+	if [[ $riscv_config != *generic* ]]; then
+		echo "${riscv_config}"
+		return
+	fi
+
+	# Finish command for building executable
+	cmd="${cmd} ${config_defines} ${c_src_filepaths} ${ldflags} \
+	     -o ${autodetect_x}"
 
 	if [ "${debug_auto_detect}" == "no" ]; then
 
diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c
index b697e35f9..5fef62ce1 100644
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -263,6 +263,20 @@ arch_t bli_arch_query_id_impl( void )
 		id = BLIS_ARCH_BGQ;
 		#endif
 
+		// RISC-V microarchitectures
+		#ifdef BLIS_FAMILY_RV32I
+		id = BLIS_ARCH_RV32I;
+		#endif
+		#ifdef BLIS_FAMILY_RV64I
+		id = BLIS_ARCH_RV64I;
+		#endif
+		#ifdef BLIS_FAMILY_RV32IV
+		id = BLIS_ARCH_RV32IV;
+		#endif
+		#ifdef BLIS_FAMILY_RV64IV
+		id = BLIS_ARCH_RV64IV;
+		#endif
+
 		// Generic microarchitecture.
 		#ifdef BLIS_FAMILY_GENERIC
 		id = BLIS_ARCH_GENERIC;
@@ -318,6 +332,11 @@ static const char* config_name[ BLIS_NUM_ARCHS ] =
     "power7",
     "bgq",
 
+    "rv32i",
+    "rv64i",
+    "rv32iv",
+    "rv64iv",
+
     "generic"
 };
 
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index df0abc8ed..c1fd4c866 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -215,6 +215,32 @@ void bli_gks_init( void )
 		                                              bli_cntx_init_bgq_ind );
 #endif
 
+		// -- RISC-V architectures --------------------------------------------
+
+#ifdef BLIS_CONFIG_RV32I
+		bli_gks_register_cntx( BLIS_ARCH_RV32I,       bli_cntx_init_rv32i,
+		                                              bli_cntx_init_rv32i_ref,
+		                                              bli_cntx_init_rv32i_ind );
+#endif
+
+#ifdef BLIS_CONFIG_RV64I
+		bli_gks_register_cntx( BLIS_ARCH_RV64I,       bli_cntx_init_rv64i,
+		                                              bli_cntx_init_rv64i_ref,
+		                                              bli_cntx_init_rv64i_ind );
+#endif
+
+#ifdef BLIS_CONFIG_RV32IV
+		bli_gks_register_cntx( BLIS_ARCH_RV32IV,      bli_cntx_init_rv32iv,
+		                                              bli_cntx_init_rv32iv_ref,
+		                                              bli_cntx_init_rv32iv_ind );
+#endif
+
+#ifdef BLIS_CONFIG_RV64IV
+		bli_gks_register_cntx( BLIS_ARCH_RV64IV,      bli_cntx_init_rv64iv,
+		                                              bli_cntx_init_rv64iv_ref,
+		                                              bli_cntx_init_rv64iv_ind );
+#endif
+
 		// -- Generic architectures --------------------------------------------
 
 #ifdef BLIS_CONFIG_GENERIC
diff --git a/frame/base/bli_riscv_cpuid.h b/frame/base/bli_riscv_cpuid.h
new file mode 100644
index 000000000..4f0c25a33
--- /dev/null
+++ b/frame/base/bli_riscv_cpuid.h
@@ -0,0 +1,67 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+/* RISC-V autodetection code which works with native or cross-compilers.
+   Compile with $CC -E and ignore all output lines starting with #.  On RISC-V
+   it may return rv32i (base 32-bit integer RISC-V), rv32iv (rv32i plus vector
+   extensions), rv64i (base 64-bit integer RISC-V), or rv64iv (rv64i plus
+   vector extensions). On 128-bit integer RISC-V, it falls back to generic
+   for now. For toolchains which do not yet support RISC-V feature-detection
+   macros, it will fall back on generic, so the BLIS configure script may need
+   the RISC-V configuration to be explicitly specified. */
+
+// false if !defined(__riscv) || !defined(__riscv_xlen)
+#if __riscv && __riscv_xlen == 64
+
+#if __riscv_vector // false if !defined(__riscv_vector)
+rv64iv
+#else
+rv64i
+#endif
+
+// false if !defined(__riscv) || !defined(__riscv_xlen) || __riscv_32e != 0
+#elif __riscv && __riscv_xlen == 32 && !__riscv_32e
+
+#if __riscv_vector // false if !defined(__riscv_vector)
+rv32iv
+#else
+rv32i
+#endif
+
+#else
+
+generic  // fall back on BLIS runtime CPUID autodetection algorithm
+
+#endif
diff --git a/frame/base/bli_riscv_detect_arch.h b/frame/base/bli_riscv_detect_arch.h
new file mode 100644
index 000000000..448b0f39d
--- /dev/null
+++ b/frame/base/bli_riscv_detect_arch.h
@@ -0,0 +1,155 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+/* Construct a RISC-V architecture string based on available features. */
+
+#if __riscv
+
+#if __riscv_arch_test
+
+#if __riscv_i
+#define RISCV_I i
+#else
+#define RISCV_I
+#endif
+
+#if __riscv_e
+#define RISCV_E e
+#else
+#define RISCV_E
+#endif
+
+#if __riscv_m
+#define RISCV_M m
+#else
+#define RISCV_M
+#endif
+
+#if __riscv_a
+#define RISCV_A a
+#else
+#define RISCV_A
+#endif
+
+#if __riscv_f
+#define RISCV_F f
+#else
+#define RISCV_F
+#endif
+
+#if __riscv_d
+#define RISCV_D d
+#else
+#define RISCV_D
+#endif
+
+#if __riscv_c
+#define RISCV_C c
+#else
+#define RISCV_C
+#endif
+
+#if __riscv_p
+#define RISCV_P p
+#else
+#define RISCV_P
+#endif
+
+/* FORCE_RISCV_VECTOR is a Clang workaround */
+#if __riscv_v || FORCE_RISCV_VECTOR
+#define RISCV_V v
+#else
+#define RISCV_V
+#endif
+
+#else /* __riscv_arch_test */
+
+/* We assume I and E are exclusive when __riscv_arch_test isn't defined */
+#if __riscv_32e
+#define RISCV_I
+#define RISCV_E e
+#else
+#define RISCV_I i
+#define RISCV_E
+#endif
+
+#if __riscv_mul
+#define RISCV_M m
+#else
+#define RISCV_M
+#endif
+
+#if __riscv_atomic
+#define RISCV_A a
+#else
+#define RISCV_A
+#endif
+
+#if __riscv_flen >= 32
+#define RISCV_F f
+#else
+#define RISCV_F
+#endif
+
+#if __riscv_flen >= 64
+#define RISCV_D d
+#else
+#define RISCV_D
+#endif
+
+#if __riscv_compressed
+#define RISCV_C c
+#else
+#define RISCV_C
+#endif
+
+#define RISCV_P
+
+/* FORCE_RISCV_VECTOR is a Clang workaround */
+#if __riscv_vector || FORCE_RISCV_VECTOR
+#define RISCV_V v
+#else
+#define RISCV_V
+#endif
+
+#endif /* __riscv_arch_test */
+
+#define CAT2(a,b) a##b
+#define CAT(a,b) CAT2(a,b)
+
+CAT(rv, CAT(__riscv_xlen, CAT(RISCV_I, CAT(RISCV_E, CAT(RISCV_M, CAT(RISCV_A,
+CAT(RISCV_F, CAT(RISCV_D, CAT(RISCV_C, CAT(RISCV_P, RISCV_V))))))))))
+
+#endif /* __riscv */
diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h
index 0485295df..c80e8e922 100644
--- a/frame/include/bli_arch_config.h
+++ b/frame/include/bli_arch_config.h
@@ -131,6 +131,22 @@ CNTX_INIT_PROTS( power7 )
 CNTX_INIT_PROTS( bgq )
 #endif
 
+// -- RISC-V --
+
+#ifdef BLIS_CONFIG_RV32I
+CNTX_INIT_PROTS( rv32i )
+#endif
+#ifdef BLIS_CONFIG_RV64I
+CNTX_INIT_PROTS( rv64i )
+#endif
+#ifdef BLIS_CONFIG_RV32IV
+CNTX_INIT_PROTS( rv32iv )
+#endif
+#ifdef BLIS_CONFIG_RV64IV
+CNTX_INIT_PROTS( rv64iv )
+#endif
+
+
 // -- Generic --
 
 #ifdef BLIS_CONFIG_GENERIC
@@ -343,6 +359,12 @@ CNTX_INIT_PROTS( generic )
 #endif
 
 
+#ifdef BLIS_KERNELS_RVI
+#include "bli_kernels_rvi.h"
+#endif
+#ifdef BLIS_KERNELS_RVIV
+#include "bli_kernels_rviv.h"
+#endif
 
 #endif
 
diff --git a/frame/include/bli_misc_macro_defs.h b/frame/include/bli_misc_macro_defs.h
index 903b4ece6..31e0150f6 100644
--- a/frame/include/bli_misc_macro_defs.h
+++ b/frame/include/bli_misc_macro_defs.h
@@ -170,5 +170,7 @@ BLIS_INLINE void bli_toggle_bool( bool* b )
 #define BLIS_VA_END  (-1)
 
 
-#endif
+// Static assertion compatible with any version of C/C++
+#define bli_static_assert(cond) while(0){struct s {int STATIC_ASSERT_FAILED : !!(cond);};}
 
+#endif
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index cb933bfa4..b246fda05 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -965,6 +965,12 @@ typedef enum
 	BLIS_ARCH_POWER7,
 	BLIS_ARCH_BGQ,
 
+	// RISC-V
+	BLIS_ARCH_RV32I,
+	BLIS_ARCH_RV64I,
+	BLIS_ARCH_RV32IV,
+	BLIS_ARCH_RV64IV,
+
 	// Generic architecture/configuration
 	BLIS_ARCH_GENERIC,
 
diff --git a/kernels/rvi/bli_kernels_rvi.h b/kernels/rvi/bli_kernels_rvi.h
new file mode 100644
index 000000000..d06afae62
--- /dev/null
+++ b/kernels/rvi/bli_kernels_rvi.h
@@ -0,0 +1,33 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
diff --git a/kernels/rviv/3/bli_cgemm_rviv_4vx4.c b/kernels/rviv/3/bli_cgemm_rviv_4vx4.c
new file mode 100644
index 000000000..9ef333a78
--- /dev/null
+++ b/kernels/rviv/3/bli_cgemm_rviv_4vx4.c
@@ -0,0 +1,79 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "bli_rviv_utils.h"
+
+void bli_cgemm_rviv_asm_4vx4 // assembly kernel; its label is emitted by bli_cgemm_rviv_asm_4vx4.S
+    (
+             intptr_t   k,     // number of columns of A / rows of B to accumulate
+       const void*      alpha, // scomplex scalar applied to A*B
+       const void*      a,     // A panel, read one column (two vector loads) per k step
+       const void*      b,     // B panel, read one row of 4 complex values per k step
+       const void*      beta,  // scomplex scalar applied to C
+             void*      c, intptr_t rs_c, intptr_t cs_c // both are byte offsets added directly to c
+    );
+
+void bli_cgemm_rviv_4vx4 // scomplex gemm microkernel: C wrapper around bli_cgemm_rviv_asm_4vx4
+     (
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
+     )
+{
+	// The assembly kernels always take native machine-sized (intptr_t) integer arguments.
+	// dim_t and inc_t are normally machine-sized; the static assertion rejects wider types.
+	bli_static_assert( sizeof(dim_t) <= sizeof(intptr_t) &&
+	                   sizeof(inc_t) <= sizeof(intptr_t) );
+
+	// mr depends on the vector length (fixed at configure time) and comes from the context; nr is 4.
+	const inc_t mr = bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_MR, cntx );
+	const inc_t nr = 4;
+
+	GEMM_UKR_SETUP_CT( c, mr, nr, false );
+
+	// The kernel assumes rs_c == 1 (rs_c itself is never passed to the assembly), and the
+	assert( rs_c == 1 ); // context should not deviate from it.
+
+	bli_cgemm_rviv_asm_4vx4( k, alpha, a, b, beta, c, // m, n and data are not forwarded
+	                         get_vlenb() * 2, cs_c * sizeof(scomplex) ); // byte offset to the second vector of rows; column stride in bytes
+
+	GEMM_UKR_FLUSH_CT( c );
+}
diff --git a/kernels/rviv/3/bli_cgemm_rviv_asm_4vx4.S b/kernels/rviv/3/bli_cgemm_rviv_asm_4vx4.S
new file mode 100644
index 000000000..98c73d23d
--- /dev/null
+++ b/kernels/rviv/3/bli_cgemm_rviv_asm_4vx4.S
@@ -0,0 +1,45 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+#define REALNAME bli_cgemm_rviv_asm_4vx4 // global symbol given to the kernel
+#define DATASIZE 8 // bytes per complex element; the header derives REALSIZE = DATASIZE/2
+#define VTYPE e32 // element width passed to vsetvli
+#define FLOAD flw // scalar load used for B, alpha and beta
+#define FZERO(fr) fcvt.s.w fr, x0 // fr = 0.0, converted from the integer zero register
+#define FEQ feq.s // scalar equality compare used for the zero tests on alpha and beta
+#define VLE vlseg2e32.v // 2-field segment load: real and imaginary parts land in two registers
+#define VSE vsseg2e32.v // 2-field segment store: the inverse of VLE
+
+#include "bli_czgemm_rviv_asm_4vx4.h" // kernel body, written in terms of the macros above
diff --git a/kernels/rviv/3/bli_czgemm_rviv_asm_4vx4.h b/kernels/rviv/3/bli_czgemm_rviv_asm_4vx4.h
new file mode 100644
index 000000000..8f7727c8d
--- /dev/null
+++ b/kernels/rviv/3/bli_czgemm_rviv_asm_4vx4.h
@@ -0,0 +1,801 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+	.text
+	.align      2
+	.global     REALNAME
+
+// void REALNAME(intptr_t k, const void* alpha, const void* a, const void* b,
+//               const void* beta, void* c, intptr_t rs_c, intptr_t cs_c)
+//
+// register arguments:
+// a0   k
+// a1   alpha
+// a2   a
+// a3   b
+// a4   beta
+// a5   c
+// a6   rs_c   (bytes from the first to the second vector of rows within a column of C)
+// a7   cs_c   (bytes between consecutive columns of C)
+//
+
+#define REALSIZE (DATASIZE/2) // bytes per real or imaginary part
+
+#define loop_counter a0 // k, counted down as iterations are consumed
+
+#define A00_ptr   a2 // A(:,l),   first vector of rows
+#define A10_ptr   t0 // A(:,l),   second vector of rows
+#define A01_ptr   t1 // A(:,l+1), first vector of rows
+#define A11_ptr   t2 // A(:,l+1), second vector of rows
+
+#define B_row_ptr a3 // current row of B (4 complex values)
+
+#define C00_ptr   a5 // C0j: column j of C, first vector of rows
+#define C01_ptr   t3
+#define C02_ptr   t4
+#define C03_ptr   t5
+#define C10_ptr   s1 // C1j: column j of C, second vector of rows
+#define C11_ptr   s2
+#define C12_ptr   s3
+#define C13_ptr   s4
+
+#define tmp       t6 // integer scratch for loop bounds and compare results
+
+#define ALPHA_re  fa0 // alpha and beta are loaded only after the k loop,
+#define ALPHA_im  fa1 // so fa0-fa3 can also serve as B02/B03 below
+#define BETA_re   fa2
+#define BETA_im   fa3
+
+#define B00_re    fa4 // B0j: column j of the even row of B in an unrolled pair
+#define B00_im    fa5
+#define B01_re    fa6
+#define B01_im    fa7
+#define B02_re    fa0 // shares fa0-fa3 with ALPHA/BETA
+#define B02_im    fa1
+#define B03_re    fa2
+#define B03_im    fa3
+
+#define B10_re    ft0 // B1j: column j of the odd row of B in an unrolled pair
+#define B10_im    ft1
+#define B11_re    ft2
+#define B11_im    ft3
+#define B12_re    ft4
+#define B12_im    ft5
+#define B13_re    ft6
+#define B13_im    ft7
+
+#define fzero     ft8 // holds 0.0 (set with FZERO) for the alpha/beta zero tests
+
+#define A00_re    v24 // A registers occupy v24-v31 during the k loop
+#define A00_im    v25
+#define A10_re    v26
+#define A10_im    v27
+#define A01_re    v28
+#define A01_im    v29
+#define A11_re    v30
+#define A11_im    v31
+
+#define C0_re     v24 // C columns reuse v24-v31 once the k loop is finished
+#define C0_im     v25
+#define C1_re     v26
+#define C1_im     v27
+#define C2_re     v28
+#define C2_im     v29
+#define C3_re     v30
+#define C3_im     v31
+
+#define AB00_re   v0 // accumulators: ABij = vector of rows i, column j of A*B
+#define AB00_im   v1
+#define AB01_re   v2
+#define AB01_im   v3
+#define AB02_re   v4
+#define AB02_im   v5
+#define AB03_re   v6
+#define AB03_im   v7
+#define AB10_re   v8
+#define AB10_im   v9
+#define AB11_re   v10
+#define AB11_im   v11
+#define AB12_re   v12
+#define AB12_im   v13
+#define AB13_re   v14
+#define AB13_im   v15
+
+#define tmp0_re   v16 // vector scratch for the complex multiply by alpha
+#define tmp0_im   v17
+#define tmp1_re   v18
+#define tmp1_im   v19
+#define tmp2_re   v20
+#define tmp2_im   v21
+#define tmp3_re   v22
+#define tmp3_im   v23
+
+#define rs_c  a6 // byte offset, see the argument list above
+#define cs_c  a7 // byte offset, see the argument list above
+REALNAME:
+	#include "rviv_save_registers.h"
+
+	vsetvli s0, zero, VTYPE, m1, ta, ma // rs1 = zero requests vl = VLMAX; the value written to s0 is overwritten next
+	csrr s0, vlenb // s0 = vector register length in bytes
+	slli s0, s0, 1 // s0 = 2*vlenb: bytes covered by one two-register (re, im) segment load
+	FZERO(fzero)
+
+	// Set up pointers to the four columns of C and to the second vector of rows in each
+	add C01_ptr, C00_ptr, cs_c
+	add C02_ptr, C01_ptr, cs_c
+	add C03_ptr, C02_ptr, cs_c
+	add C10_ptr, C00_ptr, rs_c
+	add C11_ptr, C01_ptr, rs_c
+	add C12_ptr, C02_ptr, rs_c
+	add C13_ptr, C03_ptr, rs_c
+
+	// Zero-initialize accumulators
+	vxor.vv AB00_re, AB00_re, AB00_re
+	vxor.vv AB00_im, AB00_im, AB00_im
+	vxor.vv AB01_re, AB01_re, AB01_re
+	vxor.vv AB01_im, AB01_im, AB01_im
+	vxor.vv AB02_re, AB02_re, AB02_re
+	vxor.vv AB02_im, AB02_im, AB02_im
+	vxor.vv AB03_re, AB03_re, AB03_re
+	vxor.vv AB03_im, AB03_im, AB03_im
+	vxor.vv AB10_re, AB10_re, AB10_re
+	vxor.vv AB10_im, AB10_im, AB10_im
+	vxor.vv AB11_re, AB11_re, AB11_re
+	vxor.vv AB11_im, AB11_im, AB11_im
+	vxor.vv AB12_re, AB12_re, AB12_re
+	vxor.vv AB12_im, AB12_im, AB12_im
+	vxor.vv AB13_re, AB13_re, AB13_re
+	vxor.vv AB13_im, AB13_im, AB13_im
+
+	// Handle k == 0: the accumulators are zero, so the k loop and the alpha scaling are skipped
+	beqz loop_counter, MULTIPLYBETA
+
+	add A10_ptr, A00_ptr, s0
+	slli s0, s0, 1      // length of a column of A in bytes
+	add A01_ptr, A00_ptr, s0
+	add A11_ptr, A10_ptr, s0
+
+	li tmp, 3
+	ble loop_counter, tmp, TAIL_UNROLL_2 // k <= 3: no 4-way pass, and the tails do their own loads
+
+	// Preload A and B
+	// Load and deinterleave A(:,l)
+	VLE A00_re, (A00_ptr)
+	VLE A10_re, (A10_ptr)
+
+	// Load B(l,0:3)
+	FLOAD B00_re, 0*REALSIZE(B_row_ptr)
+	FLOAD B00_im, 1*REALSIZE(B_row_ptr)
+	FLOAD B01_re, 2*REALSIZE(B_row_ptr)
+	FLOAD B01_im, 3*REALSIZE(B_row_ptr)
+	FLOAD B02_re, 4*REALSIZE(B_row_ptr)
+	FLOAD B02_im, 5*REALSIZE(B_row_ptr)
+	FLOAD B03_re, 6*REALSIZE(B_row_ptr)
+	FLOAD B03_im, 7*REALSIZE(B_row_ptr)
+
+	// Load and deinterleave A(:,l+1)
+	VLE A01_re, (A01_ptr)
+	VLE A11_re, (A11_ptr)
+
+LOOP_UNROLL_4: // loop_counter >= 4
+	addi loop_counter, loop_counter, -4 // one pass consumes four k iterations
+
+	vfmacc.vf  AB00_re, B00_re, A00_re   // AB(:,0) += A(:,l) * B(l,0)
+	vfnmsac.vf AB00_re, B00_im, A00_im
+	vfmacc.vf  AB00_im, B00_re, A00_im
+	vfmacc.vf  AB00_im, B00_im, A00_re
+	vfmacc.vf  AB10_re, B00_re, A10_re
+	vfnmsac.vf AB10_re, B00_im, A10_im
+	vfmacc.vf  AB10_im, B00_re, A10_im
+	vfmacc.vf  AB10_im, B00_im, A10_re
+
+	vfmacc.vf  AB01_re, B01_re, A00_re   // AB(:,1) += A(:,l) * B(l,1)
+	vfnmsac.vf AB01_re, B01_im, A00_im
+	vfmacc.vf  AB01_im, B01_re, A00_im
+	vfmacc.vf  AB01_im, B01_im, A00_re
+	vfmacc.vf  AB11_re, B01_re, A10_re
+	vfnmsac.vf AB11_re, B01_im, A10_im
+	vfmacc.vf  AB11_im, B01_re, A10_im
+	vfmacc.vf  AB11_im, B01_im, A10_re
+
+	// Point to A(:,l+2), A(:,l+3)
+	add A00_ptr, A01_ptr, s0
+	add A10_ptr, A11_ptr, s0
+	add A01_ptr, A00_ptr, s0
+	add A11_ptr, A10_ptr, s0
+
+	// Load B(l+1,0:3)
+	FLOAD B10_re,  8*REALSIZE(B_row_ptr)
+	FLOAD B10_im,  9*REALSIZE(B_row_ptr)
+	FLOAD B11_re, 10*REALSIZE(B_row_ptr)
+	FLOAD B11_im, 11*REALSIZE(B_row_ptr)
+	FLOAD B12_re, 12*REALSIZE(B_row_ptr)
+	FLOAD B12_im, 13*REALSIZE(B_row_ptr)
+	FLOAD B13_re, 14*REALSIZE(B_row_ptr)
+	FLOAD B13_im, 15*REALSIZE(B_row_ptr)
+	addi B_row_ptr, B_row_ptr, 16*REALSIZE // step over rows l and l+1 of B
+
+	vfmacc.vf  AB00_re, B10_re, A01_re   // AB(:,0) += A(:,l+1) * B(l+1,0)
+	vfnmsac.vf AB00_re, B10_im, A01_im
+	vfmacc.vf  AB00_im, B10_re, A01_im
+	vfmacc.vf  AB00_im, B10_im, A01_re
+	vfmacc.vf  AB10_re, B10_re, A11_re
+	vfnmsac.vf AB10_re, B10_im, A11_im
+	vfmacc.vf  AB10_im, B10_re, A11_im
+	vfmacc.vf  AB10_im, B10_im, A11_re
+
+	vfmacc.vf  AB02_re, B02_re, A00_re   // AB(:,2) += A(:,l) * B(l,2)
+	vfnmsac.vf AB02_re, B02_im, A00_im
+	vfmacc.vf  AB02_im, B02_re, A00_im
+	vfmacc.vf  AB02_im, B02_im, A00_re
+	vfmacc.vf  AB12_re, B02_re, A10_re
+	vfnmsac.vf AB12_re, B02_im, A10_im
+	vfmacc.vf  AB12_im, B02_re, A10_im
+	vfmacc.vf  AB12_im, B02_im, A10_re
+
+	vfmacc.vf  AB03_re, B03_re, A00_re   // AB(:,3) += A(:,l) * B(l,3)
+	vfnmsac.vf AB03_re, B03_im, A00_im
+	vfmacc.vf  AB03_im, B03_re, A00_im
+	vfmacc.vf  AB03_im, B03_im, A00_re
+	vfmacc.vf  AB13_re, B03_re, A10_re
+	vfnmsac.vf AB13_re, B03_im, A10_im
+	vfmacc.vf  AB13_im, B03_re, A10_im
+	vfmacc.vf  AB13_im, B03_im, A10_re
+
+	// Load and deinterleave A(:,l+2)
+	VLE A00_re, (A00_ptr)
+	VLE A10_re, (A10_ptr)
+
+	// Load B(l+2, 0:3)
+	FLOAD B00_re, 0*REALSIZE(B_row_ptr)
+	FLOAD B00_im, 1*REALSIZE(B_row_ptr)
+	FLOAD B01_re, 2*REALSIZE(B_row_ptr)
+	FLOAD B01_im, 3*REALSIZE(B_row_ptr)
+	FLOAD B02_re, 4*REALSIZE(B_row_ptr)
+	FLOAD B02_im, 5*REALSIZE(B_row_ptr)
+	FLOAD B03_re, 6*REALSIZE(B_row_ptr)
+	FLOAD B03_im, 7*REALSIZE(B_row_ptr)
+
+	vfmacc.vf  AB01_re, B11_re, A01_re   // AB(:,1) += A(:,l+1) * B(l+1,1)
+	vfnmsac.vf AB01_re, B11_im, A01_im
+	vfmacc.vf  AB01_im, B11_re, A01_im
+	vfmacc.vf  AB01_im, B11_im, A01_re
+	vfmacc.vf  AB11_re, B11_re, A11_re
+	vfnmsac.vf AB11_re, B11_im, A11_im
+	vfmacc.vf  AB11_im, B11_re, A11_im
+	vfmacc.vf  AB11_im, B11_im, A11_re
+
+	vfmacc.vf  AB02_re, B12_re, A01_re   // AB(:,2) += A(:,l+1) * B(l+1,2)
+	vfnmsac.vf AB02_re, B12_im, A01_im
+	vfmacc.vf  AB02_im, B12_re, A01_im
+	vfmacc.vf  AB02_im, B12_im, A01_re
+	vfmacc.vf  AB12_re, B12_re, A11_re
+	vfnmsac.vf AB12_re, B12_im, A11_im
+	vfmacc.vf  AB12_im, B12_re, A11_im
+	vfmacc.vf  AB12_im, B12_im, A11_re
+
+	vfmacc.vf  AB03_re, B13_re, A01_re   // AB(:,3) += A(:,l+1) * B(l+1,3)
+	vfnmsac.vf AB03_re, B13_im, A01_im
+	vfmacc.vf  AB03_im, B13_re, A01_im
+	vfmacc.vf  AB03_im, B13_im, A01_re
+	vfmacc.vf  AB13_re, B13_re, A11_re
+	vfnmsac.vf AB13_re, B13_im, A11_im
+	vfmacc.vf  AB13_im, B13_re, A11_im
+	vfmacc.vf  AB13_im, B13_im, A11_re
+
+	// Load and deinterleave A(:,l+3)
+	VLE A01_re, (A01_ptr)
+	VLE A11_re, (A11_ptr)
+
+	// Point to A(:,l+4), A(:,l+5), i.e. the columns used by the next pass or by the tail
+	add A00_ptr, A01_ptr, s0
+	add A10_ptr, A11_ptr, s0
+	add A01_ptr, A00_ptr, s0
+	add A11_ptr, A10_ptr, s0
+
+	// Load B(l+3, 0:3)
+	FLOAD B10_re,  8*REALSIZE(B_row_ptr)
+	FLOAD B10_im,  9*REALSIZE(B_row_ptr)
+	FLOAD B11_re, 10*REALSIZE(B_row_ptr)
+	FLOAD B11_im, 11*REALSIZE(B_row_ptr)
+	FLOAD B12_re, 12*REALSIZE(B_row_ptr)
+	FLOAD B12_im, 13*REALSIZE(B_row_ptr)
+	FLOAD B13_re, 14*REALSIZE(B_row_ptr)
+	FLOAD B13_im, 15*REALSIZE(B_row_ptr)
+	addi B_row_ptr, B_row_ptr, 16*REALSIZE // step over rows l+2 and l+3 of B
+
+	vfmacc.vf  AB00_re, B00_re, A00_re   // AB(:,0) += A(:,l+2) * B(l+2,0)
+	vfnmsac.vf AB00_re, B00_im, A00_im
+	vfmacc.vf  AB00_im, B00_re, A00_im
+	vfmacc.vf  AB00_im, B00_im, A00_re
+	vfmacc.vf  AB10_re, B00_re, A10_re
+	vfnmsac.vf AB10_re, B00_im, A10_im
+	vfmacc.vf  AB10_im, B00_re, A10_im
+	vfmacc.vf  AB10_im, B00_im, A10_re
+
+	vfmacc.vf  AB00_re, B10_re, A01_re   // AB(:,0) += A(:,l+3) * B(l+3,0)
+	vfnmsac.vf AB00_re, B10_im, A01_im
+	vfmacc.vf  AB00_im, B10_re, A01_im
+	vfmacc.vf  AB00_im, B10_im, A01_re
+	vfmacc.vf  AB10_re, B10_re, A11_re
+	vfnmsac.vf AB10_re, B10_im, A11_im
+	vfmacc.vf  AB10_im, B10_re, A11_im
+	vfmacc.vf  AB10_im, B10_im, A11_re
+
+	vfmacc.vf  AB01_re, B01_re, A00_re   // AB(:,1) += A(:,l+2) * B(l+2,1)
+	vfnmsac.vf AB01_re, B01_im, A00_im
+	vfmacc.vf  AB01_im, B01_re, A00_im
+	vfmacc.vf  AB01_im, B01_im, A00_re
+	vfmacc.vf  AB11_re, B01_re, A10_re
+	vfnmsac.vf AB11_re, B01_im, A10_im
+	vfmacc.vf  AB11_im, B01_re, A10_im
+	vfmacc.vf  AB11_im, B01_im, A10_re
+
+	vfmacc.vf  AB01_re, B11_re, A01_re   // AB(:,1) += A(:,l+3) * B(l+3,1)
+	vfnmsac.vf AB01_re, B11_im, A01_im
+	vfmacc.vf  AB01_im, B11_re, A01_im
+	vfmacc.vf  AB01_im, B11_im, A01_re
+	vfmacc.vf  AB11_re, B11_re, A11_re
+	vfnmsac.vf AB11_re, B11_im, A11_im
+	vfmacc.vf  AB11_im, B11_re, A11_im
+	vfmacc.vf  AB11_im, B11_im, A11_re
+
+	vfmacc.vf  AB02_re, B02_re, A00_re   // AB(:,2) += A(:,l+2) * B(l+2,2)
+	vfnmsac.vf AB02_re, B02_im, A00_im
+	vfmacc.vf  AB02_im, B02_re, A00_im
+	vfmacc.vf  AB02_im, B02_im, A00_re
+	vfmacc.vf  AB12_re, B02_re, A10_re
+	vfnmsac.vf AB12_re, B02_im, A10_im
+	vfmacc.vf  AB12_im, B02_re, A10_im
+	vfmacc.vf  AB12_im, B02_im, A10_re
+
+	vfmacc.vf  AB02_re, B12_re, A01_re   // AB(:,2) += A(:,l+3) * B(l+3,2)
+	vfnmsac.vf AB02_re, B12_im, A01_im
+	vfmacc.vf  AB02_im, B12_re, A01_im
+	vfmacc.vf  AB02_im, B12_im, A01_re
+	vfmacc.vf  AB12_re, B12_re, A11_re
+	vfnmsac.vf AB12_re, B12_im, A11_im
+	vfmacc.vf  AB12_im, B12_re, A11_im
+	vfmacc.vf  AB12_im, B12_im, A11_re
+
+	vfmacc.vf  AB03_re, B03_re, A00_re   // AB(:,3) += A(:,l+2) * B(l+2,3)
+	vfnmsac.vf AB03_re, B03_im, A00_im
+	vfmacc.vf  AB03_im, B03_re, A00_im
+	vfmacc.vf  AB03_im, B03_im, A00_re
+	vfmacc.vf  AB13_re, B03_re, A10_re
+	vfnmsac.vf AB13_re, B03_im, A10_im
+	vfmacc.vf  AB13_im, B03_re, A10_im
+	vfmacc.vf  AB13_im, B03_im, A10_re
+
+	vfmacc.vf  AB03_re, B13_re, A01_re   // AB(:,3) += A(:,l+3) * B(l+3,3)
+	vfnmsac.vf AB03_re, B13_im, A01_im
+	vfmacc.vf  AB03_im, B13_re, A01_im
+	vfmacc.vf  AB03_im, B13_im, A01_re
+	vfmacc.vf  AB13_re, B13_re, A11_re
+	vfnmsac.vf AB13_re, B13_im, A11_im
+	vfmacc.vf  AB13_im, B13_re, A11_im
+	vfmacc.vf  AB13_im, B13_im, A11_re
+
+	li tmp, 3
+	ble loop_counter, tmp, TAIL_UNROLL_2 // fewer than 4 iterations left: finish in the tails
+
+	// Load A and B for the next iteration
+	VLE A00_re, (A00_ptr)
+	VLE A10_re, (A10_ptr)
+	VLE A01_re, (A01_ptr)
+	VLE A11_re, (A11_ptr)
+
+	FLOAD B00_re, 0*REALSIZE(B_row_ptr)
+	FLOAD B00_im, 1*REALSIZE(B_row_ptr)
+	FLOAD B01_re, 2*REALSIZE(B_row_ptr)
+	FLOAD B01_im, 3*REALSIZE(B_row_ptr)
+	FLOAD B02_re, 4*REALSIZE(B_row_ptr)
+	FLOAD B02_im, 5*REALSIZE(B_row_ptr)
+	FLOAD B03_re, 6*REALSIZE(B_row_ptr)
+	FLOAD B03_im, 7*REALSIZE(B_row_ptr)
+
+	j LOOP_UNROLL_4
+
+TAIL_UNROLL_2: // loop_counter <= 3
+	li tmp, 1
+	ble loop_counter, tmp, TAIL_UNROLL_1 // 0 or 1 iterations left: skip the 2-way step
+
+	addi loop_counter, loop_counter, -2 // this step consumes two k iterations
+
+	// Load and deinterleave A(:,l)
+	VLE A00_re, (A00_ptr)
+	VLE A10_re, (A10_ptr)
+
+	// Load B(l, 0:3)
+	FLOAD B00_re, 0*REALSIZE(B_row_ptr)
+	FLOAD B00_im, 1*REALSIZE(B_row_ptr)
+	FLOAD B01_re, 2*REALSIZE(B_row_ptr)
+	FLOAD B01_im, 3*REALSIZE(B_row_ptr)
+	FLOAD B02_re, 4*REALSIZE(B_row_ptr)
+	FLOAD B02_im, 5*REALSIZE(B_row_ptr)
+	FLOAD B03_re, 6*REALSIZE(B_row_ptr)
+	FLOAD B03_im, 7*REALSIZE(B_row_ptr)
+
+	vfmacc.vf  AB00_re, B00_re, A00_re   // AB(:,0) += A(:,l) * B(l,0)
+	vfnmsac.vf AB00_re, B00_im, A00_im
+	vfmacc.vf  AB00_im, B00_re, A00_im
+	vfmacc.vf  AB00_im, B00_im, A00_re
+	vfmacc.vf  AB10_re, B00_re, A10_re
+	vfnmsac.vf AB10_re, B00_im, A10_im
+	vfmacc.vf  AB10_im, B00_re, A10_im
+	vfmacc.vf  AB10_im, B00_im, A10_re
+
+	vfmacc.vf  AB01_re, B01_re, A00_re   // AB(:,1) += A(:,l) * B(l,1)
+	vfnmsac.vf AB01_re, B01_im, A00_im
+	vfmacc.vf  AB01_im, B01_re, A00_im
+	vfmacc.vf  AB01_im, B01_im, A00_re
+	vfmacc.vf  AB11_re, B01_re, A10_re
+	vfnmsac.vf AB11_re, B01_im, A10_im
+	vfmacc.vf  AB11_im, B01_re, A10_im
+	vfmacc.vf  AB11_im, B01_im, A10_re
+
+	// Load and deinterleave A(:,l+1)
+	VLE A01_re, (A01_ptr)
+	VLE A11_re, (A11_ptr)
+
+	// Load B(l+1, 0:3)
+	FLOAD B10_re,  8*REALSIZE(B_row_ptr)
+	FLOAD B10_im,  9*REALSIZE(B_row_ptr)
+	FLOAD B11_re, 10*REALSIZE(B_row_ptr)
+	FLOAD B11_im, 11*REALSIZE(B_row_ptr)
+	FLOAD B12_re, 12*REALSIZE(B_row_ptr)
+	FLOAD B12_im, 13*REALSIZE(B_row_ptr)
+	FLOAD B13_re, 14*REALSIZE(B_row_ptr)
+	FLOAD B13_im, 15*REALSIZE(B_row_ptr)
+
+	vfmacc.vf  AB00_re, B10_re, A01_re   // AB(:,0) += A(:,l+1) * B(l+1,0)
+	vfnmsac.vf AB00_re, B10_im, A01_im
+	vfmacc.vf  AB00_im, B10_re, A01_im
+	vfmacc.vf  AB00_im, B10_im, A01_re
+	vfmacc.vf  AB10_re, B10_re, A11_re
+	vfnmsac.vf AB10_re, B10_im, A11_im
+	vfmacc.vf  AB10_im, B10_re, A11_im
+	vfmacc.vf  AB10_im, B10_im, A11_re
+
+	vfmacc.vf  AB01_re, B11_re, A01_re   // AB(:,1) += A(:,l+1) * B(l+1,1)
+	vfnmsac.vf AB01_re, B11_im, A01_im
+	vfmacc.vf  AB01_im, B11_re, A01_im
+	vfmacc.vf  AB01_im, B11_im, A01_re
+	vfmacc.vf  AB11_re, B11_re, A11_re
+	vfnmsac.vf AB11_re, B11_im, A11_im
+	vfmacc.vf  AB11_im, B11_re, A11_im
+	vfmacc.vf  AB11_im, B11_im, A11_re
+
+	vfmacc.vf  AB02_re, B02_re, A00_re   // AB(:,2) += A(:,l) * B(l,2)
+	vfnmsac.vf AB02_re, B02_im, A00_im
+	vfmacc.vf  AB02_im, B02_re, A00_im
+	vfmacc.vf  AB02_im, B02_im, A00_re
+	vfmacc.vf  AB12_re, B02_re, A10_re
+	vfnmsac.vf AB12_re, B02_im, A10_im
+	vfmacc.vf  AB12_im, B02_re, A10_im
+	vfmacc.vf  AB12_im, B02_im, A10_re
+
+	vfmacc.vf  AB03_re, B03_re, A00_re   // AB(:,3) += A(:,l) * B(l,3)
+	vfnmsac.vf AB03_re, B03_im, A00_im
+	vfmacc.vf  AB03_im, B03_re, A00_im
+	vfmacc.vf  AB03_im, B03_im, A00_re
+	vfmacc.vf  AB13_re, B03_re, A10_re
+	vfnmsac.vf AB13_re, B03_im, A10_im
+	vfmacc.vf  AB13_im, B03_re, A10_im
+	vfmacc.vf  AB13_im, B03_im, A10_re
+
+	vfmacc.vf  AB02_re, B12_re, A01_re   // AB(:,2) += A(:,l+1) * B(l+1,2)
+	vfnmsac.vf AB02_re, B12_im, A01_im
+	vfmacc.vf  AB02_im, B12_re, A01_im
+	vfmacc.vf  AB02_im, B12_im, A01_re
+	vfmacc.vf  AB12_re, B12_re, A11_re
+	vfnmsac.vf AB12_re, B12_im, A11_im
+	vfmacc.vf  AB12_im, B12_re, A11_im
+	vfmacc.vf  AB12_im, B12_im, A11_re
+
+	vfmacc.vf  AB03_re, B13_re, A01_re   // AB(:,3) += A(:,l+1) * B(l+1,3)
+	vfnmsac.vf AB03_re, B13_im, A01_im
+	vfmacc.vf  AB03_im, B13_re, A01_im
+	vfmacc.vf  AB03_im, B13_im, A01_re
+	vfmacc.vf  AB13_re, B13_re, A11_re
+	vfnmsac.vf AB13_re, B13_im, A11_im
+	vfmacc.vf  AB13_im, B13_re, A11_im
+	vfmacc.vf  AB13_im, B13_im, A11_re
+
+	beqz loop_counter, MULTIPLYALPHA // k fully consumed: nothing left for the 1-way tail
+
+	// Advance pointers past the two columns of A and two rows of B just used
+	add A00_ptr, A01_ptr, s0
+	add A10_ptr, A11_ptr, s0
+	addi B_row_ptr, B_row_ptr, 16*REALSIZE
+
+TAIL_UNROLL_1: // loop_counter <= 1
+	beqz loop_counter, MULTIPLYALPHA // no iteration left: go scale the accumulators
+
+	// Load and deinterleave A(:,l)
+	VLE A00_re, (A00_ptr)
+	VLE A10_re, (A10_ptr)
+
+	// Load B(l,0:3)
+	FLOAD B00_re, 0*REALSIZE(B_row_ptr)
+	FLOAD B00_im, 1*REALSIZE(B_row_ptr)
+	FLOAD B01_re, 2*REALSIZE(B_row_ptr)
+	FLOAD B01_im, 3*REALSIZE(B_row_ptr)
+	FLOAD B02_re, 4*REALSIZE(B_row_ptr)
+	FLOAD B02_im, 5*REALSIZE(B_row_ptr)
+	FLOAD B03_re, 6*REALSIZE(B_row_ptr)
+	FLOAD B03_im, 7*REALSIZE(B_row_ptr)
+
+	vfmacc.vf  AB00_re, B00_re, A00_re   // AB(:,0) += A(:,l) * B(l,0)
+	vfnmsac.vf AB00_re, B00_im, A00_im
+	vfmacc.vf  AB00_im, B00_re, A00_im
+	vfmacc.vf  AB00_im, B00_im, A00_re
+	vfmacc.vf  AB10_re, B00_re, A10_re
+	vfnmsac.vf AB10_re, B00_im, A10_im
+	vfmacc.vf  AB10_im, B00_re, A10_im
+	vfmacc.vf  AB10_im, B00_im, A10_re
+
+	vfmacc.vf  AB01_re, B01_re, A00_re   // AB(:,1) += A(:,l) * B(l,1)
+	vfnmsac.vf AB01_re, B01_im, A00_im
+	vfmacc.vf  AB01_im, B01_re, A00_im
+	vfmacc.vf  AB01_im, B01_im, A00_re
+	vfmacc.vf  AB11_re, B01_re, A10_re
+	vfnmsac.vf AB11_re, B01_im, A10_im
+	vfmacc.vf  AB11_im, B01_re, A10_im
+	vfmacc.vf  AB11_im, B01_im, A10_re
+
+	vfmacc.vf  AB02_re, B02_re, A00_re   // AB(:,2) += A(:,l) * B(l,2)
+	vfnmsac.vf AB02_re, B02_im, A00_im
+	vfmacc.vf  AB02_im, B02_re, A00_im
+	vfmacc.vf  AB02_im, B02_im, A00_re
+	vfmacc.vf  AB12_re, B02_re, A10_re
+	vfnmsac.vf AB12_re, B02_im, A10_im
+	vfmacc.vf  AB12_im, B02_re, A10_im
+	vfmacc.vf  AB12_im, B02_im, A10_re
+
+	vfmacc.vf  AB03_re, B03_re, A00_re   // AB(:,3) += A(:,l) * B(l,3)
+	vfnmsac.vf AB03_re, B03_im, A00_im
+	vfmacc.vf  AB03_im, B03_re, A00_im
+	vfmacc.vf  AB03_im, B03_im, A00_re
+	vfmacc.vf  AB13_re, B03_re, A10_re
+	vfnmsac.vf AB13_re, B03_im, A10_im
+	vfmacc.vf  AB13_im, B03_re, A10_im
+	vfmacc.vf  AB13_im, B03_im, A10_re
+
+MULTIPLYALPHA:
+	FLOAD ALPHA_re, 0*REALSIZE(a1) // fa0/fa1 are free again: the last use of B02 is behind us
+	FLOAD ALPHA_im, 1*REALSIZE(a1)
+
+	FEQ tmp, ALPHA_im, fzero // tmp = 1 when imag(alpha) == 0
+	bne tmp, zero, ALPHAREAL // real alpha takes the cheaper path
+
+	// [AB00, ..., AB03] * alpha: re' = re*alpha_re - im*alpha_im, im' = im*alpha_re + re*alpha_im
+	vfmul.vf  tmp0_re, AB00_im, ALPHA_im
+	vfmul.vf  tmp0_im, AB00_re, ALPHA_im
+	vfmul.vf  tmp1_re, AB01_im, ALPHA_im
+	vfmul.vf  tmp1_im, AB01_re, ALPHA_im
+	vfmul.vf  tmp2_re, AB02_im, ALPHA_im
+	vfmul.vf  tmp2_im, AB02_re, ALPHA_im
+	vfmul.vf  tmp3_re, AB03_im, ALPHA_im
+	vfmul.vf  tmp3_im, AB03_re, ALPHA_im
+	vfmsub.vf AB00_re, ALPHA_re, tmp0_re // the products above were formed before AB is overwritten here
+	vfmsub.vf AB01_re, ALPHA_re, tmp1_re
+	vfmsub.vf AB02_re, ALPHA_re, tmp2_re
+	vfmsub.vf AB03_re, ALPHA_re, tmp3_re
+	vfmadd.vf AB00_im, ALPHA_re, tmp0_im
+	vfmadd.vf AB01_im, ALPHA_re, tmp1_im
+	vfmadd.vf AB02_im, ALPHA_re, tmp2_im
+	vfmadd.vf AB03_im, ALPHA_re, tmp3_im
+
+	// [AB10, ..., AB13] * alpha, same scheme for the second vector of rows
+	vfmul.vf  tmp0_re, AB10_im, ALPHA_im
+	vfmul.vf  tmp0_im, AB10_re, ALPHA_im
+	vfmul.vf  tmp1_re, AB11_im, ALPHA_im
+	vfmul.vf  tmp1_im, AB11_re, ALPHA_im
+	vfmul.vf  tmp2_re, AB12_im, ALPHA_im
+	vfmul.vf  tmp2_im, AB12_re, ALPHA_im
+	vfmul.vf  tmp3_re, AB13_im, ALPHA_im
+	vfmul.vf  tmp3_im, AB13_re, ALPHA_im
+	vfmsub.vf AB10_re, ALPHA_re, tmp0_re
+	vfmsub.vf AB11_re, ALPHA_re, tmp1_re
+	vfmsub.vf AB12_re, ALPHA_re, tmp2_re
+	vfmsub.vf AB13_re, ALPHA_re, tmp3_re
+	vfmadd.vf AB10_im, ALPHA_re, tmp0_im
+	vfmadd.vf AB11_im, ALPHA_re, tmp1_im
+	vfmadd.vf AB12_im, ALPHA_re, tmp2_im
+	vfmadd.vf AB13_im, ALPHA_re, tmp3_im
+
+	j MULTIPLYBETA
+
+ALPHAREAL: // imag(alpha) == 0: scale real and imaginary parts by real(alpha) only
+	vfmul.vf AB00_re, AB00_re, ALPHA_re
+	vfmul.vf AB00_im, AB00_im, ALPHA_re
+	vfmul.vf AB01_re, AB01_re, ALPHA_re
+	vfmul.vf AB01_im, AB01_im, ALPHA_re
+	vfmul.vf AB02_re, AB02_re, ALPHA_re
+	vfmul.vf AB02_im, AB02_im, ALPHA_re
+	vfmul.vf AB03_re, AB03_re, ALPHA_re
+	vfmul.vf AB03_im, AB03_im, ALPHA_re
+
+	vfmul.vf AB10_re, AB10_re, ALPHA_re
+	vfmul.vf AB10_im, AB10_im, ALPHA_re
+	vfmul.vf AB11_re, AB11_re, ALPHA_re
+	vfmul.vf AB11_im, AB11_im, ALPHA_re
+	vfmul.vf AB12_re, AB12_re, ALPHA_re
+	vfmul.vf AB12_im, AB12_im, ALPHA_re
+	vfmul.vf AB13_re, AB13_re, ALPHA_re
+	vfmul.vf AB13_im, AB13_im, ALPHA_re // falls through into the beta stage
+
+MULTIPLYBETA: // also reached directly when k == 0, with all accumulators still zero
+	FLOAD BETA_re,  0*REALSIZE(a4)
+	FLOAD BETA_im,  1*REALSIZE(a4)
+	FEQ tmp, BETA_im, fzero // tmp = 1 when imag(beta) == 0
+	bne tmp, zero, BETAREAL // real (possibly zero) beta is handled separately
+
+	// Load and deinterleave C(0:VLEN-1, 0:1)
+	VLE C0_re, (C00_ptr)
+	VLE C1_re, (C01_ptr)
+
+	// Load and deinterleave C(0:VLEN-1, 2:3)
+	VLE C2_re, (C02_ptr)
+	VLE C3_re, (C03_ptr)
+
+	// C(0:VLEN-1,0:1) * beta + AB(0:VLEN-1,0:1), accumulated into AB and stored back to C
+	vfmacc.vf   AB00_re, BETA_re, C0_re
+	vfnmsac.vf  AB00_re, BETA_im, C0_im
+	vfmacc.vf   AB00_im, BETA_re, C0_im
+	vfmacc.vf   AB00_im, BETA_im, C0_re
+	VSE AB00_re, (C00_ptr)
+
+	vfmacc.vf   AB01_re, BETA_re, C1_re
+	vfnmsac.vf  AB01_re, BETA_im, C1_im
+	vfmacc.vf   AB01_im, BETA_re, C1_im
+	vfmacc.vf   AB01_im, BETA_im, C1_re
+	VSE AB01_re, (C01_ptr)
+
+	// C(0:VLEN-1,2:3) * beta + AB(0:VLEN-1,2:3)
+	vfmacc.vf   AB02_re, BETA_re, C2_re
+	vfnmsac.vf  AB02_re, BETA_im, C2_im
+	vfmacc.vf   AB02_im, BETA_re, C2_im
+	vfmacc.vf   AB02_im, BETA_im, C2_re
+	VSE AB02_re, (C02_ptr)
+
+	vfmacc.vf   AB03_re, BETA_re, C3_re
+	vfnmsac.vf  AB03_re, BETA_im, C3_im
+	vfmacc.vf   AB03_im, BETA_re, C3_im
+	vfmacc.vf   AB03_im, BETA_im, C3_re
+	VSE AB03_re, (C03_ptr)
+
+	// Load and deinterleave C(VLEN:2*VLEN-1, 0:1)
+	VLE C0_re, (C10_ptr)
+	VLE C1_re, (C11_ptr)
+
+	// Load and deinterleave C(VLEN:2*VLEN-1, 2:3)
+	VLE C2_re, (C12_ptr)
+	VLE C3_re, (C13_ptr)
+
+	// C(VLEN:2*VLEN-1,0:1) * beta + AB(VLEN:2*VLEN-1,0:1)
+	vfmacc.vf   AB10_re, BETA_re, C0_re
+	vfnmsac.vf  AB10_re, BETA_im, C0_im
+	vfmacc.vf   AB10_im, BETA_re, C0_im
+	vfmacc.vf   AB10_im, BETA_im, C0_re
+	VSE AB10_re, (C10_ptr)
+
+	vfmacc.vf   AB11_re, BETA_re, C1_re
+	vfnmsac.vf  AB11_re, BETA_im, C1_im
+	vfmacc.vf   AB11_im, BETA_re, C1_im
+	vfmacc.vf   AB11_im, BETA_im, C1_re
+	VSE AB11_re, (C11_ptr)
+
+	// C(VLEN:2*VLEN-1,2:3) * beta + AB(VLEN:2*VLEN-1,2:3)
+	vfmacc.vf   AB12_re, BETA_re, C2_re
+	vfnmsac.vf  AB12_re, BETA_im, C2_im
+	vfmacc.vf   AB12_im, BETA_re, C2_im
+	vfmacc.vf   AB12_im, BETA_im, C2_re
+	VSE AB12_re, (C12_ptr)
+
+	vfmacc.vf   AB13_re, BETA_re, C3_re
+	vfnmsac.vf  AB13_re, BETA_im, C3_im
+	vfmacc.vf   AB13_im, BETA_re, C3_im
+	vfmacc.vf   AB13_im, BETA_im, C3_re
+	VSE AB13_re, (C13_ptr)
+
+	j END
+
+BETAREAL: // imag(beta) == 0
+	FEQ tmp, BETA_re, fzero // tmp = 1 when real(beta) == 0 as well
+	bne tmp, zero, BETAZERO // beta == 0: C must not be read
+
+	// Load and deinterleave C(0:VLEN-1, 0:3)
+	VLE C0_re, (C00_ptr)
+	VLE C1_re, (C01_ptr)
+	VLE C2_re, (C02_ptr)
+	VLE C3_re, (C03_ptr)
+
+	// C(0:VLEN-1,0:3) * beta + AB(0:VLEN-1,0:3), real beta scales re and im independently
+	vfmacc.vf   AB00_re, BETA_re, C0_re
+	vfmacc.vf   AB00_im, BETA_re, C0_im
+	vfmacc.vf   AB01_re, BETA_re, C1_re
+	vfmacc.vf   AB01_im, BETA_re, C1_im
+
+	vfmacc.vf   AB02_re, BETA_re, C2_re
+	vfmacc.vf   AB02_im, BETA_re, C2_im
+	vfmacc.vf   AB03_re, BETA_re, C3_re
+	vfmacc.vf   AB03_im, BETA_re, C3_im
+
+	VSE AB00_re, (C00_ptr)
+	VSE AB01_re, (C01_ptr)
+	VSE AB02_re, (C02_ptr)
+	VSE AB03_re, (C03_ptr)
+
+	// Load and deinterleave C(VLEN:2*VLEN-1, 0:3)
+	VLE C0_re, (C10_ptr)
+	VLE C1_re, (C11_ptr)
+	VLE C2_re, (C12_ptr)
+	VLE C3_re, (C13_ptr)
+
+	// C(VLEN:2*VLEN-1,0:3) * beta + AB(VLEN:2*VLEN-1,0:3)
+	vfmacc.vf   AB10_re, BETA_re, C0_re
+	vfmacc.vf   AB10_im, BETA_re, C0_im
+	vfmacc.vf   AB11_re, BETA_re, C1_re
+	vfmacc.vf   AB11_im, BETA_re, C1_im
+
+	vfmacc.vf   AB12_re, BETA_re, C2_re
+	vfmacc.vf   AB12_im, BETA_re, C2_im
+	vfmacc.vf   AB13_re, BETA_re, C3_re
+	vfmacc.vf   AB13_im, BETA_re, C3_im
+
+	VSE AB10_re, (C10_ptr)
+	VSE AB11_re, (C11_ptr)
+	VSE AB12_re, (C12_ptr)
+	VSE AB13_re, (C13_ptr)
+
+	j END
+
+BETAZERO: // beta == 0: store the accumulators without loading C
+	VSE AB00_re, (C00_ptr)
+	VSE AB01_re, (C01_ptr)
+	VSE AB02_re, (C02_ptr)
+	VSE AB03_re, (C03_ptr)
+
+	VSE AB10_re, (C10_ptr)
+	VSE AB11_re, (C11_ptr)
+	VSE AB12_re, (C12_ptr)
+	VSE AB13_re, (C13_ptr)
+
+END: // common exit for all three beta paths
+	#include "rviv_restore_registers.h"
+	ret
diff --git a/kernels/rviv/3/bli_dgemm_rviv_4vx4.c b/kernels/rviv/3/bli_dgemm_rviv_4vx4.c
new file mode 100644
index 000000000..e03716a5a
--- /dev/null
+++ b/kernels/rviv/3/bli_dgemm_rviv_4vx4.c
@@ -0,0 +1,79 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+#include "bli_rviv_utils.h"
+
+void bli_dgemm_rviv_asm_4vx4   // assembly kernel; its symbol is emitted by bli_dgemm_rviv_asm_4vx4.S
+    (
+             intptr_t   k,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, intptr_t rs_c, intptr_t cs_c   // both strides are byte offsets (see the call below)
+    );
+
+void bli_dgemm_rviv_4vx4   // C-level double-precision microkernel: checks integer widths, then calls the assembly kernel
+     (
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
+     )
+{
+	// The assembly kernels always take native machine-sized integer arguments.
+	// dim_t and inc_t are normally defined as being machine-sized. If larger, assert.
+	bli_static_assert( sizeof(dim_t) <= sizeof(intptr_t) &&
+	                   sizeof(inc_t) <= sizeof(intptr_t) );
+
+	// Extract vector-length dependent mr, nr that are fixed at configure time.
+	const inc_t mr = bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_MR, cntx );
+	const inc_t nr = 4;
+	// NOTE(review): GEMM_UKR_SETUP_CT / GEMM_UKR_FLUSH_CT are defined outside this file; confirm their contract there.
+	GEMM_UKR_SETUP_CT( d, mr, nr, false );
+
+	// The kernel assumes rs_c == 1, and the context should not deviate from it.
+	assert( rs_c == 1 );
+	// The kernel receives VLENB (bytes per vector register) as its row-block step and the column stride in bytes.
+	bli_dgemm_rviv_asm_4vx4( k, alpha, a, b, beta, c,
+	                         get_vlenb(), cs_c * sizeof(double) );
+
+	GEMM_UKR_FLUSH_CT( d );
+}
diff --git a/kernels/rviv/3/bli_dgemm_rviv_asm_4vx4.S b/kernels/rviv/3/bli_dgemm_rviv_asm_4vx4.S
new file mode 100644
index 000000000..b29c6da5e
--- /dev/null
+++ b/kernels/rviv/3/bli_dgemm_rviv_asm_4vx4.S
@@ -0,0 +1,45 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+#define REALNAME bli_dgemm_rviv_asm_4vx4   // global symbol given to the kernel by the template
+#define DATASIZE 8                         // bytes per element; scales the scalar loads of B
+#define VTYPE e64                          // element width handed to vsetvli
+#define FLOAD fld                          // scalar floating-point load
+#define FZERO(fr) fcvt.d.w fr, x0          // fr = 0.0, converted from the integer zero register
+#define FEQ feq.d                          // scalar equality compare (used to test beta against zero)
+#define VLE vle64.v                        // unit-stride vector load
+#define VSE vse64.v                        // unit-stride vector store
+// The template shared by the s and d kernels expands into the kernel body using the macros above.
+#include "bli_sdgemm_rviv_asm_4vx4.h"
diff --git a/kernels/rviv/3/bli_rviv_utils.h b/kernels/rviv/3/bli_rviv_utils.h
new file mode 100644
index 000000000..e4570321d
--- /dev/null
+++ b/kernels/rviv/3/bli_rviv_utils.h
@@ -0,0 +1,46 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <assert.h>
+
+// Read the vlenb CSR: the size of one vector register, in bytes.
+static inline uintptr_t get_vlenb(void)
+{
+	uintptr_t reg_bytes = 0;
+
+	// "=r" lets the compiler choose the general-purpose register that receives the CSR value.
+	__asm__ volatile ( " csrr %0, vlenb" : "=r" (reg_bytes) );
+	return reg_bytes;
+}
diff --git a/kernels/rviv/3/bli_sdgemm_rviv_asm_4vx4.h b/kernels/rviv/3/bli_sdgemm_rviv_asm_4vx4.h
new file mode 100644
index 000000000..998a4e27d
--- /dev/null
+++ b/kernels/rviv/3/bli_sdgemm_rviv_asm_4vx4.h
@@ -0,0 +1,627 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+	.text
+	.align      2
+	.global     REALNAME
+
+// void REALNAME(intptr_t k, void* alpha, void* a, void* b,
+//               void* beta, void* c, intptr_t rs_c, intptr_t cs_c)
+//
+// register arguments:
+// a0   k
+// a1   alpha
+// a2   a
+// a3   b
+// a4   beta
+// a5   c
+// a6   rs_c
+// a7   cs_c
+//
+
+#define loop_counter a0   // remaining k iterations; counted down to zero
+// Pointers into A: Ai0_ptr / Ai1_ptr address row block i (one vector register) of alternating columns.
+#define A00_ptr   a2
+#define A10_ptr   t0
+#define A20_ptr   t1
+#define A30_ptr   t2
+#define A01_ptr   s5
+#define A11_ptr   s6
+#define A21_ptr   s7
+#define A31_ptr   t6   // shares t6 with tmp; it is always recomputed after tmp has been consumed
+
+#define B_row_ptr a3
+// Pointers into C: Cij_ptr addresses row block i of column j; they are re-pointed to blocks 2 and 3 while storing.
+#define C00_ptr   a5
+#define C01_ptr   t3
+#define C02_ptr   t4
+#define C03_ptr   t5
+#define C10_ptr   s1
+#define C11_ptr   s2
+#define C12_ptr   s3
+#define C13_ptr   s4
+
+#define tmp    t6      // scratch for loop bounds and the beta test; shares t6 with A31_ptr
+
+#define ALPHA  fa1     // shares fa1 with B11; loaded only after the k loop has finished
+#define BETA   fa2     // shares fa2 with B12; loaded only after the k loop has finished
+
+#define B00    fa4
+#define B01    fa5
+#define B02    fa6
+#define B03    fa7
+
+#define B10    fa0
+#define B11    fa1
+#define B12    fa2
+#define B13    fa3
+
+#define fzero  ft8
+
+#define A00    v24
+#define A10    v25
+#define A20    v26
+#define A30    v27
+
+#define A01    v28
+#define A11    v29
+#define A21    v30
+#define A31    v31
+
+#define C00    v16
+#define C01    v17
+#define C02    v18
+#define C03    v19
+#define C10    v20
+#define C11    v21
+#define C12    v22
+#define C13    v23
+#define C20    v0      // C20..C33 reuse v0..v7 (AB00..AB13); those accumulators are stored before these are loaded
+#define C21    v1
+#define C22    v2
+#define C23    v3
+#define C30    v4
+#define C31    v5
+#define C32    v6
+#define C33    v7
+// Accumulators: ABij holds row block i of column j of the product A*B.
+#define AB00   v0
+#define AB01   v1
+#define AB02   v2
+#define AB03   v3
+#define AB10   v4
+#define AB11   v5
+#define AB12   v6
+#define AB13   v7
+#define AB20   v8
+#define AB21   v9
+#define AB22   v10
+#define AB23   v11
+#define AB30   v12
+#define AB31   v13
+#define AB32   v14
+#define AB33   v15
+
+#define rs_c   a6      // byte step between consecutive row blocks of C
+#define cs_c   a7      // byte step between consecutive columns of C
+
+REALNAME:
+	#include "rviv_save_registers.h"
+
+	vsetvli s0, zero, VTYPE, m1, ta, ma   // rs1 = x0 with rd != x0 requests vl = VLMAX
+	csrr s0, vlenb                        // s0 = bytes per vector register
+	FZERO(fzero)                          // fzero = 0.0, compared with beta in MULTIPLYBETA
+
+	// Set up pointers to row blocks 0 and 1 of the four columns of C
+	add C01_ptr, C00_ptr, cs_c
+	add C02_ptr, C01_ptr, cs_c
+	add C03_ptr, C02_ptr, cs_c
+	add C10_ptr, C00_ptr, rs_c
+	add C11_ptr, C01_ptr, rs_c
+	add C12_ptr, C02_ptr, rs_c
+	add C13_ptr, C03_ptr, rs_c
+
+	// Zero-initialize accumulators (x XOR x yields all-zero bits, i.e. +0.0)
+	vxor.vv AB00, AB00, AB00
+	vxor.vv AB01, AB01, AB01
+	vxor.vv AB02, AB02, AB02
+	vxor.vv AB03, AB03, AB03
+	vxor.vv AB10, AB10, AB10
+	vxor.vv AB11, AB11, AB11
+	vxor.vv AB12, AB12, AB12
+	vxor.vv AB13, AB13, AB13
+	vxor.vv AB20, AB20, AB20
+	vxor.vv AB21, AB21, AB21
+	vxor.vv AB22, AB22, AB22
+	vxor.vv AB23, AB23, AB23
+	vxor.vv AB30, AB30, AB30
+	vxor.vv AB31, AB31, AB31
+	vxor.vv AB32, AB32, AB32
+	vxor.vv AB33, AB33, AB33
+
+	// Handle k == 0 (the accumulators stay zero, so the alpha scaling is skipped)
+	beqz loop_counter, MULTIPLYBETA
+
+	// Set up pointers to rows of A
+	add A10_ptr, A00_ptr, s0
+	add A20_ptr, A10_ptr, s0
+	add A30_ptr, A20_ptr, s0
+
+	slli s0, s0, 2 // length of a column of A in bytes
+
+	li tmp, 3
+	ble loop_counter, tmp, TAIL_UNROLL_2   // fewer than 4 iterations: skip the unrolled loop
+
+	// Preload A and B
+	// Load A(:,l)
+	VLE A00, (A00_ptr)
+	VLE A10, (A10_ptr)
+	VLE A20, (A20_ptr)
+	VLE A30, (A30_ptr)
+
+	// Load B(l,0:3)
+	FLOAD B00, 0*DATASIZE(B_row_ptr)
+	FLOAD B01, 1*DATASIZE(B_row_ptr)
+	FLOAD B02, 2*DATASIZE(B_row_ptr)
+	FLOAD B03, 3*DATASIZE(B_row_ptr)
+
+	// Set up pointers to A(:,l+1); A31_ptr reuses t6 now that tmp has been consumed
+	add A01_ptr, A00_ptr, s0
+	add A11_ptr, A10_ptr, s0
+	add A21_ptr, A20_ptr, s0
+	add A31_ptr, A30_ptr, s0
+
+LOOP_UNROLL_4: // four rank-1 updates per pass; A(:,l) and B(l,0:3) are already loaded on entry
+	addi loop_counter, loop_counter, -4
+
+	vfmacc.vf AB00, B00, A00   // AB(0,:) += A(0,0) * B(0,:)
+	vfmacc.vf AB01, B01, A00
+	vfmacc.vf AB02, B02, A00
+	vfmacc.vf AB03, B03, A00
+
+	vfmacc.vf AB10, B00, A10   // AB(1,:) += A(1,0) * B(0,:)
+	vfmacc.vf AB11, B01, A10
+	vfmacc.vf AB12, B02, A10
+	vfmacc.vf AB13, B03, A10
+
+	// Load B(l+1,0:3)
+	FLOAD B10, 4*DATASIZE(B_row_ptr)
+	FLOAD B11, 5*DATASIZE(B_row_ptr)
+	FLOAD B12, 6*DATASIZE(B_row_ptr)
+	FLOAD B13, 7*DATASIZE(B_row_ptr)
+	addi B_row_ptr, B_row_ptr, 8*DATASIZE
+
+	vfmacc.vf AB20, B00, A20   // AB(2,:) += A(2,0) * B(0,:)
+	vfmacc.vf AB21, B01, A20
+	vfmacc.vf AB22, B02, A20
+	vfmacc.vf AB23, B03, A20
+
+	// Load A(:,l+1)
+	VLE A01, (A01_ptr)
+	VLE A11, (A11_ptr)
+	VLE A21, (A21_ptr)
+	VLE A31, (A31_ptr)
+
+	// Point to A(:,l+2)
+	add A00_ptr, A01_ptr, s0
+	add A10_ptr, A11_ptr, s0
+	add A20_ptr, A21_ptr, s0
+	add A30_ptr, A31_ptr, s0
+
+	vfmacc.vf AB30, B00, A30   // AB(3,:) += A(3,0) * B(0,:)
+	vfmacc.vf AB31, B01, A30
+	vfmacc.vf AB32, B02, A30
+	vfmacc.vf AB33, B03, A30
+
+	vfmacc.vf AB00, B10, A01   // AB(0,:) += A(0,1) * B(1,:)
+	vfmacc.vf AB01, B11, A01
+	vfmacc.vf AB02, B12, A01
+	vfmacc.vf AB03, B13, A01
+
+	// Load B(l+2,0:3)
+	FLOAD B00, 0*DATASIZE(B_row_ptr)
+	FLOAD B01, 1*DATASIZE(B_row_ptr)
+	FLOAD B02, 2*DATASIZE(B_row_ptr)
+	FLOAD B03, 3*DATASIZE(B_row_ptr)
+
+	vfmacc.vf AB10, B10, A11   // AB(1,:) += A(1,1) * B(1,:)
+	vfmacc.vf AB11, B11, A11
+	vfmacc.vf AB12, B12, A11
+	vfmacc.vf AB13, B13, A11
+
+	// Load A(:,l+2)
+	VLE A00, (A00_ptr)
+	VLE A10, (A10_ptr)
+	VLE A20, (A20_ptr)
+	VLE A30, (A30_ptr)
+
+	// Point to A(:,l+3)
+	add A01_ptr, A00_ptr, s0
+	add A11_ptr, A10_ptr, s0
+	add A21_ptr, A20_ptr, s0
+	add A31_ptr, A30_ptr, s0
+
+	vfmacc.vf AB20, B10, A21   // AB(2,:) += A(2,1) * B(1,:)
+	vfmacc.vf AB21, B11, A21
+	vfmacc.vf AB22, B12, A21
+	vfmacc.vf AB23, B13, A21
+
+	vfmacc.vf AB30, B10, A31   // AB(3,:) += A(3,1) * B(1,:)
+	vfmacc.vf AB31, B11, A31
+	vfmacc.vf AB32, B12, A31
+	vfmacc.vf AB33, B13, A31
+
+	// Load A(:,l+3)
+	VLE A01, (A01_ptr)
+	VLE A11, (A11_ptr)
+	VLE A21, (A21_ptr)
+	VLE A31, (A31_ptr)
+
+	// Point to A(:,l+4)
+	add A00_ptr, A01_ptr, s0
+	add A10_ptr, A11_ptr, s0
+	add A20_ptr, A21_ptr, s0
+	add A30_ptr, A31_ptr, s0
+
+	vfmacc.vf AB00, B00, A00   // AB(0,:) += A(0,2) * B(2,:)
+	vfmacc.vf AB01, B01, A00
+	vfmacc.vf AB02, B02, A00
+	vfmacc.vf AB03, B03, A00
+
+	// Load B(l+3,0:3)
+	FLOAD B10, 4*DATASIZE(B_row_ptr)
+	FLOAD B11, 5*DATASIZE(B_row_ptr)
+	FLOAD B12, 6*DATASIZE(B_row_ptr)
+	FLOAD B13, 7*DATASIZE(B_row_ptr)
+	addi B_row_ptr, B_row_ptr, 8*DATASIZE
+
+	vfmacc.vf AB10, B00, A10   // AB(1,:) += A(1,2) * B(2,:)
+	vfmacc.vf AB11, B01, A10
+	vfmacc.vf AB12, B02, A10
+	vfmacc.vf AB13, B03, A10
+
+	vfmacc.vf AB20, B00, A20   // AB(2,:) += A(2,2) * B(2,:)
+	vfmacc.vf AB21, B01, A20
+	vfmacc.vf AB22, B02, A20
+	vfmacc.vf AB23, B03, A20
+
+	vfmacc.vf AB30, B00, A30   // AB(3,:) += A(3,2) * B(2,:)
+	vfmacc.vf AB31, B01, A30
+	vfmacc.vf AB32, B02, A30
+	vfmacc.vf AB33, B03, A30
+
+	vfmacc.vf AB00, B10, A01   // AB(0,:) += A(0,3) * B(3,:)
+	vfmacc.vf AB01, B11, A01
+	vfmacc.vf AB02, B12, A01
+	vfmacc.vf AB03, B13, A01
+
+	vfmacc.vf AB10, B10, A11   // AB(1,:) += A(1,3) * B(3,:)
+	vfmacc.vf AB11, B11, A11
+	vfmacc.vf AB12, B12, A11
+	vfmacc.vf AB13, B13, A11
+
+	vfmacc.vf AB20, B10, A21   // AB(2,:) += A(2,3) * B(3,:)
+	vfmacc.vf AB21, B11, A21
+	vfmacc.vf AB22, B12, A21
+	vfmacc.vf AB23, B13, A21
+
+	vfmacc.vf AB30, B10, A31   // AB(3,:) += A(3,3) * B(3,:)
+	vfmacc.vf AB31, B11, A31
+	vfmacc.vf AB32, B12, A31
+	vfmacc.vf AB33, B13, A31
+
+	li tmp, 3                  // overwrites t6 (A31_ptr), which is recomputed below before its next use
+	ble loop_counter, tmp, TAIL_UNROLL_2
+
+	// Load A and B for the next iteration
+	// Load B(l,0:3)
+	FLOAD B00, 0*DATASIZE(B_row_ptr)
+	FLOAD B01, 1*DATASIZE(B_row_ptr)
+	FLOAD B02, 2*DATASIZE(B_row_ptr)
+	FLOAD B03, 3*DATASIZE(B_row_ptr)
+
+	// Load A(:,l)
+	VLE A00, (A00_ptr)
+	VLE A10, (A10_ptr)
+	VLE A20, (A20_ptr)
+	VLE A30, (A30_ptr)
+
+	// Set up pointers to A(:,l+1)
+	add A01_ptr, A00_ptr, s0
+	add A11_ptr, A10_ptr, s0
+	add A21_ptr, A20_ptr, s0
+	add A31_ptr, A30_ptr, s0
+
+	j LOOP_UNROLL_4
+
+TAIL_UNROLL_2: // loop_counter <= 3
+	li tmp, 1
+	ble loop_counter, tmp, TAIL_UNROLL_1   // 0 or 1 iteration left: nothing to do here
+
+	addi loop_counter, loop_counter, -2    // two rank-1 updates follow, leaving 0 or 1
+
+	// Load B(l,0:3)
+	FLOAD B00, 0*DATASIZE(B_row_ptr)
+	FLOAD B01, 1*DATASIZE(B_row_ptr)
+	FLOAD B02, 2*DATASIZE(B_row_ptr)
+	FLOAD B03, 3*DATASIZE(B_row_ptr)
+
+	// Load A(0:1,l)
+	VLE A00, (A00_ptr)
+	VLE A10, (A10_ptr)
+
+	// Point to A(:,l+1); A31_ptr reuses t6 now that tmp has been consumed
+	add A01_ptr, A00_ptr, s0
+	add A11_ptr, A10_ptr, s0
+	add A21_ptr, A20_ptr, s0
+	add A31_ptr, A30_ptr, s0
+
+	vfmacc.vf AB00, B00, A00   // AB(0,:) += A(0,0) * B(0,:)
+	vfmacc.vf AB01, B01, A00
+	vfmacc.vf AB02, B02, A00
+	vfmacc.vf AB03, B03, A00
+
+	// Load A(2:3,l)
+	VLE A20, (A20_ptr)
+	VLE A30, (A30_ptr)
+
+	vfmacc.vf AB10, B00, A10   // AB(1,:) += A(1,0) * B(0,:)
+	vfmacc.vf AB11, B01, A10
+	vfmacc.vf AB12, B02, A10
+	vfmacc.vf AB13, B03, A10
+
+	// Load B(l+1,0:3)
+	FLOAD B10, 4*DATASIZE(B_row_ptr)
+	FLOAD B11, 5*DATASIZE(B_row_ptr)
+	FLOAD B12, 6*DATASIZE(B_row_ptr)
+	FLOAD B13, 7*DATASIZE(B_row_ptr)
+	addi B_row_ptr, B_row_ptr, 8*DATASIZE
+
+	// Load A(:,l+1)
+	VLE A01, (A01_ptr)
+	VLE A11, (A11_ptr)
+	VLE A21, (A21_ptr)
+	VLE A31, (A31_ptr)
+
+	vfmacc.vf AB20, B00, A20   // AB(2,:) += A(2,0) * B(0,:)
+	vfmacc.vf AB21, B01, A20
+	vfmacc.vf AB22, B02, A20
+	vfmacc.vf AB23, B03, A20
+
+	vfmacc.vf AB30, B00, A30   // AB(3,:) += A(3,0) * B(0,:)
+	vfmacc.vf AB31, B01, A30
+	vfmacc.vf AB32, B02, A30
+	vfmacc.vf AB33, B03, A30
+
+	// Point to A(:,l+2)
+	add A00_ptr, A01_ptr, s0
+	add A10_ptr, A11_ptr, s0
+	add A20_ptr, A21_ptr, s0
+	add A30_ptr, A31_ptr, s0
+
+	vfmacc.vf AB00, B10, A01   // AB(0,:) += A(0,1) * B(1,:)
+	vfmacc.vf AB01, B11, A01
+	vfmacc.vf AB02, B12, A01
+	vfmacc.vf AB03, B13, A01
+
+	vfmacc.vf AB10, B10, A11   // AB(1,:) += A(1,1) * B(1,:)
+	vfmacc.vf AB11, B11, A11
+	vfmacc.vf AB12, B12, A11
+	vfmacc.vf AB13, B13, A11
+
+	vfmacc.vf AB20, B10, A21   // AB(2,:) += A(2,1) * B(1,:)
+	vfmacc.vf AB21, B11, A21
+	vfmacc.vf AB22, B12, A21
+	vfmacc.vf AB23, B13, A21
+
+	vfmacc.vf AB30, B10, A31   // AB(3,:) += A(3,1) * B(1,:)
+	vfmacc.vf AB31, B11, A31
+	vfmacc.vf AB32, B12, A31
+	vfmacc.vf AB33, B13, A31
+
+	// loop_counter is now 0 or 1. Execution falls straight through into the
+	// next section, which handles both cases, so no compare-and-branch is needed.
+
+TAIL_UNROLL_1: // loop_counter <= 1
+	beqz loop_counter, MULTIPLYALPHA   // no leftover iteration: go straight to the scaling
+
+	// Load row of B (the single remaining row, B(l,0:3))
+	FLOAD B00, 0*DATASIZE(B_row_ptr)
+	FLOAD B01, 1*DATASIZE(B_row_ptr)
+	FLOAD B02, 2*DATASIZE(B_row_ptr)
+	FLOAD B03, 3*DATASIZE(B_row_ptr)
+
+	// Load A(:,l)
+	VLE A00, (A00_ptr)
+	VLE A10, (A10_ptr)
+	VLE A20, (A20_ptr)
+	VLE A30, (A30_ptr)
+
+	vfmacc.vf AB00, B00, A00   // AB(0,:) += A(0,0) * B(0,:)
+	vfmacc.vf AB01, B01, A00
+	vfmacc.vf AB02, B02, A00
+	vfmacc.vf AB03, B03, A00
+
+	vfmacc.vf AB10, B00, A10   // AB(1,:) += A(1,0) * B(0,:)
+	vfmacc.vf AB11, B01, A10
+	vfmacc.vf AB12, B02, A10
+	vfmacc.vf AB13, B03, A10
+
+	vfmacc.vf AB20, B00, A20   // AB(2,:) += A(2,0) * B(0,:)
+	vfmacc.vf AB21, B01, A20
+	vfmacc.vf AB22, B02, A20
+	vfmacc.vf AB23, B03, A20
+
+	vfmacc.vf AB30, B00, A30   // AB(3,:) += A(3,0) * B(0,:)
+	vfmacc.vf AB31, B01, A30
+	vfmacc.vf AB32, B02, A30
+	vfmacc.vf AB33, B03, A30
+
+MULTIPLYALPHA: // AB := alpha * AB for all sixteen accumulators
+	FLOAD ALPHA, (a1)   // a1 = pointer to alpha; fa1 is free because no B value is live past the k loop
+
+	// Multiply with alpha
+	vfmul.vf AB00, AB00, ALPHA
+	vfmul.vf AB01, AB01, ALPHA
+	vfmul.vf AB02, AB02, ALPHA
+	vfmul.vf AB03, AB03, ALPHA
+
+	vfmul.vf AB10, AB10, ALPHA
+	vfmul.vf AB11, AB11, ALPHA
+	vfmul.vf AB12, AB12, ALPHA
+	vfmul.vf AB13, AB13, ALPHA
+
+	vfmul.vf AB20, AB20, ALPHA
+	vfmul.vf AB21, AB21, ALPHA
+	vfmul.vf AB22, AB22, ALPHA
+	vfmul.vf AB23, AB23, ALPHA
+
+	vfmul.vf AB30, AB30, ALPHA
+	vfmul.vf AB31, AB31, ALPHA
+	vfmul.vf AB32, AB32, ALPHA
+	vfmul.vf AB33, AB33, ALPHA
+
+MULTIPLYBETA: // choose between overwriting C (beta == 0) and accumulating into it
+	FLOAD BETA,  (a4)            // a4 = pointer to beta
+	FEQ tmp, BETA, fzero         // tmp = 1 if beta == 0, else 0
+	beq tmp, zero, BETANOTZERO   // beta != 0: C has to be read
+
+BETAZERO: // beta == 0: store the accumulators without ever loading C
+	VSE AB00, (C00_ptr)
+	VSE AB01, (C01_ptr)
+	VSE AB02, (C02_ptr)
+	VSE AB03, (C03_ptr)
+
+	add C00_ptr, C10_ptr, rs_c  // advance pointers to row 2*VLEN
+	add C01_ptr, C11_ptr, rs_c
+	add C02_ptr, C12_ptr, rs_c
+	add C03_ptr, C13_ptr, rs_c
+
+	VSE AB10, (C10_ptr)
+	VSE AB11, (C11_ptr)
+	VSE AB12, (C12_ptr)
+	VSE AB13, (C13_ptr)
+
+	add C10_ptr, C00_ptr, rs_c  // advance pointers to row 3*VLEN
+	add C11_ptr, C01_ptr, rs_c
+	add C12_ptr, C02_ptr, rs_c
+	add C13_ptr, C03_ptr, rs_c
+
+	VSE AB20, (C00_ptr)
+	VSE AB21, (C01_ptr)
+	VSE AB22, (C02_ptr)
+	VSE AB23, (C03_ptr)
+
+	VSE AB30, (C10_ptr)
+	VSE AB31, (C11_ptr)
+	VSE AB32, (C12_ptr)
+	VSE AB33, (C13_ptr)
+
+	j END
+
+BETANOTZERO: // C := beta * C + AB, one row block at a time
+	VLE C00, (C00_ptr)  // Load C(0:VLEN-1, 0:3)
+	VLE C01, (C01_ptr)
+	VLE C02, (C02_ptr)
+	VLE C03, (C03_ptr)
+
+	vfmacc.vf AB00, BETA, C00   // AB += beta * C
+	vfmacc.vf AB01, BETA, C01
+	vfmacc.vf AB02, BETA, C02
+	vfmacc.vf AB03, BETA, C03
+
+	VSE AB00, (C00_ptr)  // Store C(0:VLEN-1, 0:3)
+	VSE AB01, (C01_ptr)
+	VSE AB02, (C02_ptr)
+	VSE AB03, (C03_ptr)
+
+	add C00_ptr, C10_ptr, rs_c  // advance pointers to row 2*VLEN
+	add C01_ptr, C11_ptr, rs_c
+	add C02_ptr, C12_ptr, rs_c
+	add C03_ptr, C13_ptr, rs_c
+
+	VLE C10, (C10_ptr)  // Load C(VLEN:2*VLEN-1, 0:3)
+	VLE C11, (C11_ptr)
+	VLE C12, (C12_ptr)
+	VLE C13, (C13_ptr)
+
+	vfmacc.vf AB10, BETA, C10
+	vfmacc.vf AB11, BETA, C11
+	vfmacc.vf AB12, BETA, C12
+	vfmacc.vf AB13, BETA, C13
+
+	VSE AB10, (C10_ptr)  // Store C(VLEN:2*VLEN-1, 0:3)
+	VSE AB11, (C11_ptr)
+	VSE AB12, (C12_ptr)
+	VSE AB13, (C13_ptr)
+
+	add C10_ptr, C00_ptr, rs_c  // advance pointers to row 3*VLEN
+	add C11_ptr, C01_ptr, rs_c
+	add C12_ptr, C02_ptr, rs_c
+	add C13_ptr, C03_ptr, rs_c
+
+	VLE C20, (C00_ptr)  // Load C(2*VLEN:3*VLEN-1, 0:3)
+	VLE C21, (C01_ptr)  // C20..C23 occupy v0..v3, free now that AB00..AB03 are stored
+	VLE C22, (C02_ptr)
+	VLE C23, (C03_ptr)
+
+	vfmacc.vf AB20, BETA, C20
+	vfmacc.vf AB21, BETA, C21
+	vfmacc.vf AB22, BETA, C22
+	vfmacc.vf AB23, BETA, C23
+
+	VSE AB20, (C00_ptr)  // Store C(2*VLEN:3*VLEN-1, 0:3)
+	VSE AB21, (C01_ptr)
+	VSE AB22, (C02_ptr)
+	VSE AB23, (C03_ptr)
+
+	VLE C30, (C10_ptr)  // Load C(3*VLEN:4*VLEN-1, 0:3)
+	VLE C31, (C11_ptr)  // C30..C33 occupy v4..v7, free now that AB10..AB13 are stored
+	VLE C32, (C12_ptr)
+	VLE C33, (C13_ptr)
+
+	vfmacc.vf AB30, BETA, C30
+	vfmacc.vf AB31, BETA, C31
+	vfmacc.vf AB32, BETA, C32
+	vfmacc.vf AB33, BETA, C33
+
+	VSE AB30, (C10_ptr)  // Store C(3*VLEN:4*VLEN-1, 0:3)
+	VSE AB31, (C11_ptr)
+	VSE AB32, (C12_ptr)
+	VSE AB33, (C13_ptr)
+
+END: // common exit for both beta paths
+	#include "rviv_restore_registers.h"
+	ret
diff --git a/kernels/rviv/3/bli_sgemm_rviv_4vx4.c b/kernels/rviv/3/bli_sgemm_rviv_4vx4.c
new file mode 100644
index 000000000..c240d0391
--- /dev/null
+++ b/kernels/rviv/3/bli_sgemm_rviv_4vx4.c
@@ -0,0 +1,80 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+#include "bli_rviv_utils.h"
+
+void bli_sgemm_rviv_asm_4vx4   // assembly kernel; its symbol is emitted by bli_sgemm_rviv_asm_4vx4.S
+    (
+             intptr_t   k,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, intptr_t rs_c, intptr_t cs_c   // byte offsets, declared machine-sized like the d and z prototypes
+    );
+
+void bli_sgemm_rviv_4vx4   // C-level single-precision microkernel: checks integer widths, then calls the assembly kernel
+     (
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
+     )
+{
+	// The assembly kernels always take native machine-sized integer arguments.
+	// dim_t and inc_t are normally defined as being machine-sized. If larger, assert.
+	bli_static_assert( sizeof(dim_t) <= sizeof(intptr_t) &&
+	                   sizeof(inc_t) <= sizeof(intptr_t) );
+
+	// Extract vector-length dependent mr, nr that are fixed at configure time.
+	const inc_t mr = bli_cntx_get_blksz_def_dt( BLIS_FLOAT, BLIS_MR, cntx );
+	const inc_t nr = 4;
+	// NOTE(review): GEMM_UKR_SETUP_CT / GEMM_UKR_FLUSH_CT are defined outside this file; confirm their contract there.
+	GEMM_UKR_SETUP_CT( s, mr, nr, false );
+
+	// The kernel assumes rs_c == 1, and the context should not deviate from it.
+	assert( rs_c == 1 );
+	// The kernel receives VLENB (bytes per vector register) as its row-block step and the column stride in bytes.
+	bli_sgemm_rviv_asm_4vx4( k, alpha, a, b, beta, c,
+	                         get_vlenb(), cs_c * sizeof(float) );
+
+	GEMM_UKR_FLUSH_CT( s );
+}
diff --git a/kernels/rviv/3/bli_sgemm_rviv_asm_4vx4.S b/kernels/rviv/3/bli_sgemm_rviv_asm_4vx4.S
new file mode 100644
index 000000000..2a917fc8e
--- /dev/null
+++ b/kernels/rviv/3/bli_sgemm_rviv_asm_4vx4.S
@@ -0,0 +1,45 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+#define REALNAME bli_sgemm_rviv_asm_4vx4   // global symbol given to the kernel by the template
+#define DATASIZE 4                         // bytes per element; scales the scalar loads of B
+#define VTYPE e32                          // element width handed to vsetvli
+#define FLOAD flw                          // scalar floating-point load
+#define FZERO(fr) fcvt.s.w fr, x0          // fr = 0.0, converted from the integer zero register
+#define FEQ feq.s                          // scalar equality compare (used to test beta against zero)
+#define VLE vle32.v                        // unit-stride vector load
+#define VSE vse32.v                        // unit-stride vector store
+// The template shared by the s and d kernels expands into the kernel body using the macros above.
+#include "bli_sdgemm_rviv_asm_4vx4.h"
diff --git a/kernels/rviv/3/bli_zgemm_rviv_4vx4.c b/kernels/rviv/3/bli_zgemm_rviv_4vx4.c
new file mode 100644
index 000000000..3d9940f9b
--- /dev/null
+++ b/kernels/rviv/3/bli_zgemm_rviv_4vx4.c
@@ -0,0 +1,80 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "bli_rviv_utils.h"
+
+void bli_zgemm_rviv_asm_4vx4   // assembly kernel; its symbol is emitted by bli_zgemm_rviv_asm_4vx4.S
+    (
+             intptr_t   k,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, intptr_t rs_c, intptr_t cs_c   // both strides are byte offsets (see the call below)
+    );
+
+
+void bli_zgemm_rviv_4vx4   // C-level double-complex microkernel: checks integer widths, then calls the assembly kernel
+     (
+             dim_t      m,
+             dim_t      n,
+             dim_t      k,
+       const void*      alpha,
+       const void*      a,
+       const void*      b,
+       const void*      beta,
+             void*      c, inc_t rs_c, inc_t cs_c,
+             auxinfo_t* data,
+       const cntx_t*    cntx
+     )
+{
+	// The assembly kernels always take native machine-sized integer arguments.
+	// dim_t and inc_t are normally defined as being machine-sized. If larger, assert.
+	bli_static_assert( sizeof(dim_t) <= sizeof(intptr_t) &&
+	                   sizeof(inc_t) <= sizeof(intptr_t) );
+
+	// Extract vector-length dependent mr, nr that are fixed at configure time.
+	const inc_t mr = bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_MR, cntx );
+	const inc_t nr = 4;
+	// NOTE(review): GEMM_UKR_SETUP_CT / GEMM_UKR_FLUSH_CT are defined outside this file; confirm their contract there.
+	GEMM_UKR_SETUP_CT( z, mr, nr, false );
+
+	// The kernel assumes rs_c == 1, and the context should not deviate from it.
+	assert( rs_c == 1 );
+	// Row-block step is 2 * VLENB, presumably because each two-field segment access covers a real and an imaginary register.
+	bli_zgemm_rviv_asm_4vx4( k, alpha, a, b, beta, c,
+	                         get_vlenb() * 2, cs_c * sizeof(dcomplex) );
+
+	GEMM_UKR_FLUSH_CT( z );
+}
diff --git a/kernels/rviv/3/bli_zgemm_rviv_asm_4vx4.S b/kernels/rviv/3/bli_zgemm_rviv_asm_4vx4.S
new file mode 100644
index 000000000..ae61a415d
--- /dev/null
+++ b/kernels/rviv/3/bli_zgemm_rviv_asm_4vx4.S
@@ -0,0 +1,44 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+/* dcomplex settings for the included kernel header: 16-byte elements, e64 vectors, double-precision scalar ops, 2-field segment load/store. */
+#define REALNAME bli_zgemm_rviv_asm_4vx4
+#define DATASIZE 16
+#define VTYPE e64
+#define FLOAD fld
+#define FZERO(fr) fcvt.d.w fr, x0
+#define FEQ feq.d
+#define VLE vlseg2e64.v
+#define VSE vsseg2e64.v
+
+#include "bli_czgemm_rviv_asm_4vx4.h"
diff --git a/kernels/rviv/3/rviv_restore_registers.h b/kernels/rviv/3/rviv_restore_registers.h
new file mode 100644
index 000000000..bcf7d17c8
--- /dev/null
+++ b/kernels/rviv/3/rviv_restore_registers.h
@@ -0,0 +1,77 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+// RV128 toolchains are assumed to define __riscv_xlen; each register slot is 16 bytes.
+#if __riscv_xlen == 128  // false if !defined(__riscv_xlen)
+
+	lq s0,   0(sp)
+	lq s1,  16(sp)
+	lq s2,  32(sp)
+	lq s3,  48(sp)
+	lq s4,  64(sp)
+	lq s5,  80(sp)
+	lq s6,  96(sp)
+	lq s7, 112(sp)
+	addi sp, sp, 128
+
+// RV64 is selected when either __riscv_xlen or RISCV_SIZE equals 64; the
+// latter covers toolchains that do not yet define __riscv_xlen. The
+// preprocessor evaluates an undefined macro as 0. Slots are 8 bytes.
+#elif __riscv_xlen == 64 || RISCV_SIZE == 64
+
+	ld s0,  0(sp)
+	ld s1,  8(sp)
+	ld s2, 16(sp)
+	ld s3, 24(sp)
+	ld s4, 32(sp)
+	ld s5, 40(sp)
+	ld s6, 48(sp)
+	ld s7, 56(sp)
+	addi sp, sp, 64
+
+#else
+// Anything else is treated as RV32; slots are 4 bytes.
+
+	lw s0,  0(sp)
+	lw s1,  4(sp)
+	lw s2,  8(sp)
+	lw s3, 12(sp)
+	lw s4, 16(sp)
+	lw s5, 20(sp)
+	lw s6, 24(sp)
+	lw s7, 28(sp)
+	addi sp, sp, 32
+
+#endif
diff --git a/kernels/rviv/3/rviv_save_registers.h b/kernels/rviv/3/rviv_save_registers.h
new file mode 100644
index 000000000..537c76ca6
--- /dev/null
+++ b/kernels/rviv/3/rviv_save_registers.h
@@ -0,0 +1,77 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+// RV128 toolchains are assumed to define __riscv_xlen; each register slot is 16 bytes.
+#if __riscv_xlen == 128  // false if !defined(__riscv_xlen)
+
+	addi sp, sp, -128
+	sq s0,   0(sp)
+	sq s1,  16(sp)
+	sq s2,  32(sp)
+	sq s3,  48(sp)
+	sq s4,  64(sp)
+	sq s5,  80(sp)
+	sq s6,  96(sp)
+	sq s7, 112(sp)
+
+// RV64 is selected when either __riscv_xlen or RISCV_SIZE equals 64; the
+// latter covers toolchains that do not yet define __riscv_xlen. The
+// preprocessor evaluates an undefined macro as 0. Slots are 8 bytes.
+#elif __riscv_xlen == 64 || RISCV_SIZE == 64
+
+	addi sp, sp, -64
+	sd s0,  0(sp)
+	sd s1,  8(sp)
+	sd s2, 16(sp)
+	sd s3, 24(sp)
+	sd s4, 32(sp)
+	sd s5, 40(sp)
+	sd s6, 48(sp)
+	sd s7, 56(sp)
+
+#else
+// Anything else is treated as RV32; slots are 4 bytes.
+
+	addi sp, sp, -32
+	sw s0,  0(sp)
+	sw s1,  4(sp)
+	sw s2,  8(sp)
+	sw s3, 12(sp)
+	sw s4, 16(sp)
+	sw s5, 20(sp)
+	sw s6, 24(sp)
+	sw s7, 28(sp)
+
+#endif
diff --git a/kernels/rviv/bli_kernels_rviv.h b/kernels/rviv/bli_kernels_rviv.h
new file mode 100644
index 000000000..82a652396
--- /dev/null
+++ b/kernels/rviv/bli_kernels_rviv.h
@@ -0,0 +1,38 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+// rviv gemm microkernels, one entry per datatype (s, d, c, z); GEMM_UKR_PROT presumably emits each prototype.
+GEMM_UKR_PROT( float,    s, gemm_rviv_4vx4 )
+GEMM_UKR_PROT( double,   d, gemm_rviv_4vx4 )
+GEMM_UKR_PROT( scomplex, c, gemm_rviv_4vx4 )
+GEMM_UKR_PROT( dcomplex, z, gemm_rviv_4vx4 )
diff --git a/travis/do_riscv.sh b/travis/do_riscv.sh
new file mode 100755
index 000000000..a51d33061
--- /dev/null
+++ b/travis/do_riscv.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Usage: do_riscv.sh {rv32iv|rv64iv} -- fetches the RISC-V toolchain and qemu into $DIST_PATH/../toolchain.
+set -e
+set -x
+
+TAG=2023.02.25
+
+# The prebuilt toolchains only support hardfloat, so we only
+# test these for now.
+case "$1" in
+	"rv32iv")
+	TARBALL=riscv32-glibc-ubuntu-20.04-nightly-${TAG}-nightly.tar.gz
+	;;
+	"rv64iv")
+	TARBALL=riscv64-glibc-ubuntu-20.04-nightly-${TAG}-nightly.tar.gz
+	;;
+	*)
+	echo "do_riscv.sh: unsupported configuration '$1' (expected rv32iv or rv64iv)" >&2; exit 1
+	;;
+esac
+
+TOOLCHAIN_PATH="${DIST_PATH:?DIST_PATH must be set}/../toolchain"
+TOOLCHAIN_URL=https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/${TAG}/${TARBALL}
+
+mkdir -p "$TOOLCHAIN_PATH"
+cd "$TOOLCHAIN_PATH"
+
+wget "$TOOLCHAIN_URL"
+tar -xf "$TARBALL"
+
+# Once CI upgrades to jammy, the next three lines can be removed.
+# The qemu version installed via packages (qemu-user qemu-user-binfmt)
+# is sufficient.
+TARBALL_QEMU=qemu-riscv-2023.02.25-ubuntu-20.04.tar.gz
+wget "https://github.com/flame/ci-utils/raw/master/riscv/${TARBALL_QEMU}"
+tar -xf "$TARBALL_QEMU"

From 8215b02f99aa77ecc7d813508c247565115319d7 Mon Sep 17 00:00:00 2001
From: Lee Killough <15950023+leekillough@users.noreply.github.com>
Date: Wed, 12 Apr 2023 12:59:27 -0500
Subject: [PATCH 149/230] Apply #738 to make_defs.mk of RISC-V subconfigs.
 (#740)

Details:
- PR #738 -- which moved -fPIC flag insertion responsibilities from
  common.mk to the subconfigs' individual make_defs.mk files -- was
  merged shortly before the introduction of new RISC-V subconfigs in
  #693. This commit brings those RISC-V subconfigs up to date with the
  new -fPIC conventions.
---
 config/rv32i/make_defs.mk  | 2 +-
 config/rv32iv/make_defs.mk | 2 +-
 config/rv64i/make_defs.mk  | 2 +-
 config/rv64iv/make_defs.mk | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/config/rv32i/make_defs.mk b/config/rv32i/make_defs.mk
index 40849ce66..86b7143dd 100644
--- a/config/rv32i/make_defs.mk
+++ b/config/rv32i/make_defs.mk
@@ -49,7 +49,7 @@ CPPROCFLAGS    := -DRISCV_SIZE=32
 # Atomic instructions must be enabled either via hardware
 # (-march=rv32ia) or by linking against libatomic
 CMISCFLAGS     := -march=$(shell $(CC) -E frame/base/bli_riscv_detect_arch.h | grep '^[^\#]') -mabi=ilp32
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall -Wno-unused-function -Wfatal-errors
 
 # In case the A extension is not available
diff --git a/config/rv32iv/make_defs.mk b/config/rv32iv/make_defs.mk
index 3cef697ac..e8d9cca57 100644
--- a/config/rv32iv/make_defs.mk
+++ b/config/rv32iv/make_defs.mk
@@ -49,7 +49,7 @@ CPPROCFLAGS    := -DRISCV_SIZE=32
 # Atomic instructions must be enabled either via hardware
 # (-march=rv32iav) or by linking against libatomic
 CMISCFLAGS     := -march=$(shell $(CC) -DFORCE_RISCV_VECTOR -E frame/base/bli_riscv_detect_arch.h | grep '^[^\#]') -mabi=ilp32d
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall -Wno-unused-function -Wfatal-errors
 
 # In case the A extension is not available
diff --git a/config/rv64i/make_defs.mk b/config/rv64i/make_defs.mk
index 6c69dd84e..bee21ed0d 100644
--- a/config/rv64i/make_defs.mk
+++ b/config/rv64i/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := rv64i
 # may specify additional flags here as needed.
 CPPROCFLAGS    := -DRISCV_SIZE=64
 CMISCFLAGS     := -march=$(shell $(CC) -E frame/base/bli_riscv_detect_arch.h | grep '^[^\#]') -mabi=lp64
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall -Wno-unused-function -Wfatal-errors
 
 # In case the A extension is not available
diff --git a/config/rv64iv/make_defs.mk b/config/rv64iv/make_defs.mk
index 06545d461..1c9849fbe 100644
--- a/config/rv64iv/make_defs.mk
+++ b/config/rv64iv/make_defs.mk
@@ -47,7 +47,7 @@ THIS_CONFIG    := rv64iv
 # may specify additional flags here as needed.
 CPPROCFLAGS    := -DRISCV_SIZE=64
 CMISCFLAGS     := -march=$(shell $(CC) -DFORCE_RISCV_VECTOR -E frame/base/bli_riscv_detect_arch.h | grep '^[^\#]') -mabi=lp64d
-CPICFLAGS      :=
+CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall -Wno-unused-function -Wfatal-errors
 
 # In case the A extension is not available

From 6fd9aabb03d172a792a7eeb106c7d965cf038421 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Fri, 5 May 2023 14:22:52 -0500
Subject: [PATCH 150/230] Fix bug in detecting Fortran compiler vendor (#745)

`FC` was used instead of `found_fc`.
---
 configure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure b/configure
index 9fe6bd6c7..7219a204b 100755
--- a/configure
+++ b/configure
@@ -4037,7 +4037,7 @@ main()
 			# clutter.
 			# NOTE: This maybe should use merged stdout/stderr rather than only
 			# stdout. But it works for now.
-			vendor_string="$(${FC} --version 2>/dev/null || :)"
+			vendor_string="$(${found_fc} --version 2>/dev/null || :)"
 
 			# Query the compiler "vendor" (ie: the compiler's simple name).
 			# The last part ({ read first rest ; echo $first ; }) is a workaround

From ef9d3e6675320a53e7cb477c16b01388e708b1da Mon Sep 17 00:00:00 2001
From: h-vetinari <h.vetinari@gmx.com>
Date: Sun, 7 May 2023 04:59:35 +1100
Subject: [PATCH 151/230] Added missing #include <io.h> for Windows. (#747)

Details:
- This commit fixes issue #746, in which the _access() function (called
  from within blastest/f2c/open.c) is undeclared when compiling on
  Windows with clang 16.
---
 blastest/f2c/open.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/blastest/f2c/open.c b/blastest/f2c/open.c
index 2834fd946..12e5f02b2 100644
--- a/blastest/f2c/open.c
+++ b/blastest/f2c/open.c
@@ -28,6 +28,7 @@ use or performance of this software.
 #include <unistd.h>
 #endif
 #ifdef _MSC_VER
+#include <io.h>
 #define access _access
 #endif
 #include "f2c.h"

From 0873c0f6ed03fea321d1631b3d1a385a306aa797 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Sun, 7 May 2023 14:03:19 -0500
Subject: [PATCH 152/230] Consolidate INSERT_ macro sets via variadic macros.
 (#744)

Details:
- Consolidated INSERT_GENTFUNC_* (and corresponding GENTPROT) macro sets
  using variadic macros (__VA_ARGS__), which means we no longer need a
  different INSERT_ macro for each possible number of arguments the
  macro might take. This change seems reasonable given that variadic
  macros are a standard C99 feature and widely supported. I took care
  not to use variadic macros where 0 variadic arguments are expected
  since that is a non-standard extension.
- Added pre-typecast parentheses to arithmetic expressions in printf()
  statements in bli_thread_range_tlb.c.
---
 frame/0/bli_l0_tapi.c                       |   16 +-
 frame/0/bli_l0_tapi.h                       |   26 +-
 frame/0/copysc/bli_copysc.c                 |    6 +-
 frame/0/copysc/bli_copysc.h                 |    6 +-
 frame/1/bli_l1v_tapi.h                      |   30 +-
 frame/1d/bli_l1d_tapi.c                     |   24 +-
 frame/1d/bli_l1d_tapi.h                     |   24 +-
 frame/1f/bli_l1f_tapi.h                     |   10 +-
 frame/1m/bli_l1m_tapi.c                     |   18 +-
 frame/1m/bli_l1m_tapi.h                     |   22 +-
 frame/1m/bli_l1m_unb_var1.c                 |   22 +-
 frame/1m/bli_l1m_unb_var1.h                 |   22 +-
 frame/1m/packm/bli_packm_struc_cxk.c        |    2 +-
 frame/1m/packm/bli_packm_struc_cxk.h        |    2 +-
 frame/1m/packm/bli_packm_struc_cxk_md.c     |   12 +-
 frame/1m/packm/bli_packm_struc_cxk_md.h     |   12 +-
 frame/1m/unpackm/bli_unpackm_blk_var1.c     |    2 +-
 frame/1m/unpackm/bli_unpackm_blk_var1.h     |    2 +-
 frame/2/bli_l2_tapi.c                       |   20 +-
 frame/2/bli_l2_tapi.h                       |   20 +-
 frame/2/gemv/amd/bli_gemv_unf_var2_amd.c    |    4 +-
 frame/2/gemv/bli_gemv_unb_var1.c            |    2 +-
 frame/2/gemv/bli_gemv_unb_var2.c            |    2 +-
 frame/2/gemv/bli_gemv_unf_var1.c            |    2 +-
 frame/2/gemv/bli_gemv_unf_var2.c            |    2 +-
 frame/2/gemv/bli_gemv_var.h                 |    8 +-
 frame/2/ger/bli_ger_unb_var1.c              |    2 +-
 frame/2/ger/bli_ger_unb_var2.c              |    2 +-
 frame/2/ger/bli_ger_var.h                   |    4 +-
 frame/2/hemv/bli_hemv_unb_var1.c            |    2 +-
 frame/2/hemv/bli_hemv_unb_var2.c            |    2 +-
 frame/2/hemv/bli_hemv_unb_var3.c            |    2 +-
 frame/2/hemv/bli_hemv_unb_var4.c            |    2 +-
 frame/2/hemv/bli_hemv_unf_var1.c            |    2 +-
 frame/2/hemv/bli_hemv_unf_var1a.c           |    2 +-
 frame/2/hemv/bli_hemv_unf_var3.c            |    2 +-
 frame/2/hemv/bli_hemv_unf_var3a.c           |    2 +-
 frame/2/hemv/bli_hemv_var.h                 |   18 +-
 frame/2/her/bli_her_unb_var1.c              |    2 +-
 frame/2/her/bli_her_unb_var2.c              |    2 +-
 frame/2/her/bli_her_var.h                   |    4 +-
 frame/2/her2/bli_her2_unb_var1.c            |    2 +-
 frame/2/her2/bli_her2_unb_var2.c            |    2 +-
 frame/2/her2/bli_her2_unb_var3.c            |    2 +-
 frame/2/her2/bli_her2_unb_var4.c            |    2 +-
 frame/2/her2/bli_her2_unf_var1.c            |    2 +-
 frame/2/her2/bli_her2_unf_var4.c            |    2 +-
 frame/2/her2/bli_her2_var.h                 |   12 +-
 frame/2/trmv/bli_trmv_unb_var1.c            |    2 +-
 frame/2/trmv/bli_trmv_unb_var2.c            |    2 +-
 frame/2/trmv/bli_trmv_unf_var1.c            |    2 +-
 frame/2/trmv/bli_trmv_unf_var2.c            |    2 +-
 frame/2/trmv/bli_trmv_var.h                 |    8 +-
 frame/2/trsv/bli_trsv_unb_var1.c            |    2 +-
 frame/2/trsv/bli_trsv_unb_var2.c            |    2 +-
 frame/2/trsv/bli_trsv_unf_var1.c            |    2 +-
 frame/2/trsv/bli_trsv_unf_var2.c            |    2 +-
 frame/2/trsv/bli_trsv_var.h                 |    8 +-
 frame/3/bli_l3_ind_ukr.h                    |   10 +-
 frame/3/bli_l3_sup_packm_var.h              |    4 +-
 frame/3/bli_l3_sup_var12.c                  |    4 +-
 frame/3/bli_l3_sup_vars.h                   |    4 +-
 frame/3/bli_l3_tapi.c                       |   18 +-
 frame/3/bli_l3_tapi.h                       |   22 +-
 frame/3/bli_l3_tapi_ex.c                    |   18 +-
 frame/3/bli_l3_tapi_ex.h                    |   22 +-
 frame/3/bli_l3_ukr_tapi.c                   |   10 +-
 frame/3/bli_l3_ukr_tapi.h                   |   10 +-
 frame/3/gemm/bli_gemm_ker_var2.c            |    4 +-
 frame/3/gemm/bli_gemm_md_c2r_ref.c          |    2 +-
 frame/3/gemmt/bli_gemmt_l_ker_var2.c        |    2 +-
 frame/3/gemmt/bli_gemmt_l_ker_var2b.c       |    2 +-
 frame/3/gemmt/bli_gemmt_u_ker_var2.c        |    2 +-
 frame/3/gemmt/bli_gemmt_u_ker_var2b.c       |    2 +-
 frame/base/bli_machval.h                    |    2 +-
 frame/base/bli_setgetijm.c                  |    4 +-
 frame/base/bli_setgetijm.h                  |    4 +-
 frame/base/bli_setgetijv.c                  |    4 +-
 frame/base/bli_setgetijv.h                  |    4 +-
 frame/base/cast/bli_castm.c                 |    4 +-
 frame/base/cast/bli_castm.h                 |    4 +-
 frame/base/cast/bli_castnzm.c               |    4 +-
 frame/base/cast/bli_castnzm.h               |    4 +-
 frame/base/cast/bli_castv.c                 |    4 +-
 frame/base/cast/bli_castv.h                 |    4 +-
 frame/include/bli_gentfunc_macro_defs.h     | 1145 ++++---------------
 frame/include/bli_gentprot_macro_defs.h     |  371 ++----
 frame/include/level0/bb/bli_bcastbbs_mxn.h  |    2 +-
 frame/include/level0/bb/bli_scal2bbs_mxn.h  |    4 +-
 frame/include/level0/bb/bli_set0bbs_mxn.h   |    2 +-
 frame/include/level0/bli_copys_mxn.h        |    2 +-
 frame/include/level0/bli_scal2s_mxn.h       |    2 +-
 frame/include/level0/bli_set0s_edge.h       |    2 +-
 frame/include/level0/bli_xpbys_mxn.h        |    2 +-
 frame/thread/bli_thread_range_tlb.c         |   54 +-
 frame/util/bli_util_tapi.c                  |   36 +-
 frame/util/bli_util_tapi.h                  |   48 +-
 frame/util/bli_util_unb_var1.c              |   22 +-
 frame/util/bli_util_unb_var1.h              |   38 +-
 ref_kernels/1/bli_addv_ref.c                |    2 +-
 ref_kernels/1/bli_amaxv_ref.c               |    2 +-
 ref_kernels/1/bli_axpbyv_ref.c              |    2 +-
 ref_kernels/1/bli_axpyv_ref.c               |    2 +-
 ref_kernels/1/bli_copyv_ref.c               |    2 +-
 ref_kernels/1/bli_dotv_ref.c                |    2 +-
 ref_kernels/1/bli_dotxv_ref.c               |    2 +-
 ref_kernels/1/bli_invertv_ref.c             |    2 +-
 ref_kernels/1/bli_invscalv_ref.c            |    2 +-
 ref_kernels/1/bli_scal2v_ref.c              |    2 +-
 ref_kernels/1/bli_scalv_ref.c               |    2 +-
 ref_kernels/1/bli_setv_ref.c                |    2 +-
 ref_kernels/1/bli_subv_ref.c                |    2 +-
 ref_kernels/1/bli_swapv_ref.c               |    2 +-
 ref_kernels/1/bli_xpbyv_ref.c               |    2 +-
 ref_kernels/1f/bli_axpy2v_ref.c             |    2 +-
 ref_kernels/1f/bli_axpyf_ref.c              |    2 +-
 ref_kernels/1f/bli_dotaxpyv_ref.c           |    2 +-
 ref_kernels/1f/bli_dotxaxpyf_ref.c          |    2 +-
 ref_kernels/1f/bli_dotxf_ref.c              |    2 +-
 ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c |    4 +-
 ref_kernels/1m/bli_packm_cxc_diag_ref.c     |    4 +-
 ref_kernels/1m/bli_packm_cxk_1er_ref.c      |    4 +-
 ref_kernels/1m/bli_packm_cxk_ref.c          |    4 +-
 ref_kernels/1m/bli_unpackm_cxk_ref.c        |    4 +-
 ref_kernels/3/bli_gemm_ref.c                |    4 +-
 ref_kernels/3/bli_gemmsup_ref.c             |    6 +-
 ref_kernels/3/bli_gemmtrsm_ref.c            |    4 +-
 ref_kernels/3/bli_trsm_ref.c                |    8 +-
 ref_kernels/bli_cntx_ref.c                  |   94 +-
 ref_kernels/ind/bli_gemm1m_ref.c            |    2 +-
 ref_kernels/ind/bli_gemmtrsm1m_ref.c        |    4 +-
 ref_kernels/ind/bli_trsm1m_ref.c            |    8 +-
 sandbox/gemmlike/bls_gemm.c                 |    2 +-
 sandbox/gemmlike/bls_gemm.h                 |    2 +-
 sandbox/gemmlike/bls_gemm_bp_var1.c         |    2 +-
 sandbox/gemmlike/bls_gemm_var.h             |    4 +-
 sandbox/gemmlike/bls_l3_packm_a.c           |    6 +-
 sandbox/gemmlike/bls_l3_packm_a.h           |    6 +-
 sandbox/gemmlike/bls_l3_packm_b.c           |    6 +-
 sandbox/gemmlike/bls_l3_packm_b.h           |    6 +-
 sandbox/gemmlike/bls_l3_packm_var.h         |    6 +-
 sandbox/gemmlike/bls_l3_packm_var1.c        |    2 +-
 sandbox/gemmlike/bls_l3_packm_var2.c        |    2 +-
 sandbox/gemmlike/bls_l3_packm_var3.c        |    2 +-
 sandbox/gemmlike/bls_packm_cxk.c            |    2 +-
 sandbox/gemmlike/bls_packm_cxk.h            |    2 +-
 testsuite/src/test_amaxv.c                  |    6 +-
 testsuite/src/test_randm.c                  |    6 +-
 testsuite/src/test_randm.h                  |    2 +-
 149 files changed, 849 insertions(+), 1737 deletions(-)

diff --git a/frame/0/bli_l0_tapi.c b/frame/0/bli_l0_tapi.c
index 7d6d33131..6da19e31b 100644
--- a/frame/0/bli_l0_tapi.c
+++ b/frame/0/bli_l0_tapi.c
@@ -138,7 +138,7 @@ void PASTEMAC(ch,opname) \
 	( void )chi_i; \
 }
 
-INSERT_GENTFUNCR_BASIC0( absqsc )
+INSERT_GENTFUNCR_BASIC( absqsc )
 
 
 #undef  GENTFUNCR
@@ -156,7 +156,7 @@ void PASTEMAC(ch,opname) \
 	PASTEMAC2(ch,chr,abval2s)( *chi, *norm ); \
 }
 
-INSERT_GENTFUNCR_BASIC0( normfsc )
+INSERT_GENTFUNCR_BASIC( normfsc )
 
 
 #undef  GENTFUNC
@@ -174,7 +174,7 @@ void PASTEMAC(ch,opname) \
 	PASTEMAC(ch,sqrt2s)( *chi, *psi ); \
 }
 
-INSERT_GENTFUNC_BASIC0( sqrtsc )
+INSERT_GENTFUNC_BASIC( sqrtsc )
 
 
 #undef  GENTFUNCR
@@ -193,7 +193,7 @@ void PASTEMAC(ch,opname) \
 	PASTEMAC2(chr,ch,sqrt2s)( chi_r, *psi ); \
 }
 
-INSERT_GENTFUNCR_BASIC0( sqrtrsc )
+INSERT_GENTFUNCR_BASIC( sqrtrsc )
 
 
 #undef  GENTFUNC
@@ -211,7 +211,7 @@ void PASTEMAC(ch,opname) \
 	PASTEMAC2(ch,d,gets)( *chi, *zeta_r, *zeta_i ); \
 }
 
-INSERT_GENTFUNC_BASIC0( getsc )
+INSERT_GENTFUNC_BASIC( getsc )
 
 
 #undef  GENTFUNC
@@ -229,7 +229,7 @@ void PASTEMAC(ch,opname) \
 	PASTEMAC2(d,ch,sets)( zeta_r, zeta_i, *chi ); \
 }
 
-INSERT_GENTFUNC_BASIC0( setsc )
+INSERT_GENTFUNC_BASIC( setsc )
 
 
 #undef  GENTFUNCR
@@ -247,7 +247,7 @@ void PASTEMAC(ch,opname) \
 	PASTEMAC2(ch,chr,gets)( *chi, *zeta_r, *zeta_i ); \
 }
 
-INSERT_GENTFUNCR_BASIC0( unzipsc )
+INSERT_GENTFUNCR_BASIC( unzipsc )
 
 
 #undef  GENTFUNCR
@@ -265,7 +265,7 @@ void PASTEMAC(ch,opname) \
 	PASTEMAC2(chr,ch,sets)( *zeta_r, *zeta_i, *chi ); \
 }
 
-INSERT_GENTFUNCR_BASIC0( zipsc )
+INSERT_GENTFUNCR_BASIC( zipsc )
 
 // -----------------------------------------------------------------------------
 
diff --git a/frame/0/bli_l0_tapi.h b/frame/0/bli_l0_tapi.h
index ead89c056..cf28b07d7 100644
--- a/frame/0/bli_l0_tapi.h
+++ b/frame/0/bli_l0_tapi.h
@@ -47,11 +47,11 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              ctype* psi  \
      );
 
-INSERT_GENTPROT_BASIC0( addsc )
-INSERT_GENTPROT_BASIC0( divsc )
-INSERT_GENTPROT_BASIC0( mulsc )
-INSERT_GENTPROT_BASIC0( subsc )
-INSERT_GENTPROT_BASIC0( invertsc )
+INSERT_GENTPROT_BASIC( addsc )
+INSERT_GENTPROT_BASIC( divsc )
+INSERT_GENTPROT_BASIC( mulsc )
+INSERT_GENTPROT_BASIC( subsc )
+INSERT_GENTPROT_BASIC( invertsc )
 
 
 #undef  GENTPROTR
@@ -63,8 +63,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              ctype_r* absq  \
      );
 
-INSERT_GENTPROTR_BASIC0( absqsc )
-INSERT_GENTPROTR_BASIC0( normfsc )
+INSERT_GENTPROTR_BASIC( absqsc )
+INSERT_GENTPROTR_BASIC( normfsc )
 
 
 #undef  GENTPROT
@@ -76,8 +76,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              ctype* psi  \
      );
 
-INSERT_GENTPROT_BASIC0( sqrtsc )
-INSERT_GENTPROT_BASIC0( sqrtrsc )
+INSERT_GENTPROT_BASIC( sqrtsc )
+INSERT_GENTPROT_BASIC( sqrtrsc )
 
 
 #undef  GENTPROT
@@ -90,7 +90,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              double* zeta_i  \
      );
 
-INSERT_GENTPROT_BASIC0( getsc )
+INSERT_GENTPROT_BASIC( getsc )
 
 
 #undef  GENTPROT
@@ -103,7 +103,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
        ctype* chi  \
      );
 
-INSERT_GENTPROT_BASIC0( setsc )
+INSERT_GENTPROT_BASIC( setsc )
 
 
 #undef  GENTPROTR
@@ -116,7 +116,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              ctype_r* zeta_i  \
      );
 
-INSERT_GENTPROTR_BASIC0( unzipsc )
+INSERT_GENTPROTR_BASIC( unzipsc )
 
 
 #undef  GENTPROTR
@@ -129,7 +129,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              ctype*   chi  \
      );
 
-INSERT_GENTPROTR_BASIC0( zipsc )
+INSERT_GENTPROTR_BASIC( zipsc )
 
 // -----------------------------------------------------------------------------
 
diff --git a/frame/0/copysc/bli_copysc.c b/frame/0/copysc/bli_copysc.c
index c2e01d07b..805bff6dc 100644
--- a/frame/0/copysc/bli_copysc.c
+++ b/frame/0/copysc/bli_copysc.c
@@ -125,7 +125,7 @@ void PASTEMAC2(chx,chy,varname) \
 	} \
 }
 
-INSERT_GENTFUNC2_BASIC0( copysc )
-INSERT_GENTFUNC2_MIX_D0( copysc )
-INSERT_GENTFUNC2_MIX_P0( copysc )
+INSERT_GENTFUNC2_BASIC( copysc )
+INSERT_GENTFUNC2_MIX_D( copysc )
+INSERT_GENTFUNC2_MIX_P( copysc )
 
diff --git a/frame/0/copysc/bli_copysc.h b/frame/0/copysc/bli_copysc.h
index cd5481e57..b04fabb30 100644
--- a/frame/0/copysc/bli_copysc.h
+++ b/frame/0/copysc/bli_copysc.h
@@ -62,7 +62,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \
              void*  psi \
      );
 
-INSERT_GENTPROT2_BASIC0( copysc )
-INSERT_GENTPROT2_MIX_D0( copysc )
-INSERT_GENTPROT2_MIX_P0( copysc )
+INSERT_GENTPROT2_BASIC( copysc )
+INSERT_GENTPROT2_MIX_D( copysc )
+INSERT_GENTPROT2_MIX_P( copysc )
 
diff --git a/frame/1/bli_l1v_tapi.h b/frame/1/bli_l1v_tapi.h
index 8eaf2b185..bda6fd28b 100644
--- a/frame/1/bli_l1v_tapi.h
+++ b/frame/1/bli_l1v_tapi.h
@@ -49,9 +49,9 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
         BLIS_TAPI_EX_PARAMS  \
       );
 
-INSERT_GENTPROT_BASIC0( addv )
-INSERT_GENTPROT_BASIC0( copyv )
-INSERT_GENTPROT_BASIC0( subv )
+INSERT_GENTPROT_BASIC( addv )
+INSERT_GENTPROT_BASIC( copyv )
+INSERT_GENTPROT_BASIC( subv )
 
 
 #undef  GENTPROT
@@ -65,7 +65,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
-INSERT_GENTPROT_BASIC0( amaxv )
+INSERT_GENTPROT_BASIC( amaxv )
 
 
 #undef  GENTPROT
@@ -82,7 +82,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
-INSERT_GENTPROT_BASIC0( axpbyv )
+INSERT_GENTPROT_BASIC( axpbyv )
 
 
 #undef  GENTPROT
@@ -98,8 +98,8 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
-INSERT_GENTPROT_BASIC0( axpyv )
-INSERT_GENTPROT_BASIC0( scal2v )
+INSERT_GENTPROT_BASIC( axpyv )
+INSERT_GENTPROT_BASIC( scal2v )
 
 
 #undef  GENTPROT
@@ -116,7 +116,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
-INSERT_GENTPROT_BASIC0( dotv )
+INSERT_GENTPROT_BASIC( dotv )
 
 
 #undef  GENTPROT
@@ -135,7 +135,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
-INSERT_GENTPROT_BASIC0( dotxv )
+INSERT_GENTPROT_BASIC( dotxv )
 
 
 #undef  GENTPROT
@@ -148,7 +148,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
-INSERT_GENTPROT_BASIC0( invertv )
+INSERT_GENTPROT_BASIC( invertv )
 
 
 #undef  GENTPROT
@@ -163,9 +163,9 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
-INSERT_GENTPROT_BASIC0( invscalv )
-INSERT_GENTPROT_BASIC0( scalv )
-INSERT_GENTPROT_BASIC0( setv )
+INSERT_GENTPROT_BASIC( invscalv )
+INSERT_GENTPROT_BASIC( scalv )
+INSERT_GENTPROT_BASIC( setv )
 
 
 #undef  GENTPROT
@@ -179,7 +179,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
-INSERT_GENTPROT_BASIC0( swapv )
+INSERT_GENTPROT_BASIC( swapv )
 
 
 #undef  GENTPROT
@@ -195,4 +195,4 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      ); \
 
-INSERT_GENTPROT_BASIC0( xpbyv )
+INSERT_GENTPROT_BASIC( xpbyv )
diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c
index 60d5cf1d6..75e3b997f 100644
--- a/frame/1d/bli_l1d_tapi.c
+++ b/frame/1d/bli_l1d_tapi.c
@@ -114,9 +114,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC2( addd,  addv,  BLIS_ADDV_KER )
-INSERT_GENTFUNC_BASIC2( copyd, copyv, BLIS_COPYV_KER )
-INSERT_GENTFUNC_BASIC2( subd,  subv,  BLIS_SUBV_KER )
+INSERT_GENTFUNC_BASIC( addd,  addv,  BLIS_ADDV_KER )
+INSERT_GENTFUNC_BASIC( copyd, copyv, BLIS_COPYV_KER )
+INSERT_GENTFUNC_BASIC( subd,  subv,  BLIS_SUBV_KER )
 
 
 #undef  GENTFUNC
@@ -195,8 +195,8 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC2( axpyd,  axpyv,  BLIS_AXPYV_KER )
-INSERT_GENTFUNC_BASIC2( scal2d, scal2v, BLIS_SCAL2V_KER )
+INSERT_GENTFUNC_BASIC( axpyd,  axpyv,  BLIS_AXPYV_KER )
+INSERT_GENTFUNC_BASIC( scal2d, scal2v, BLIS_SCAL2V_KER )
 
 
 #undef  GENTFUNC
@@ -252,7 +252,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC2( invertd, invertv, BLIS_INVERTV_KER )
+INSERT_GENTFUNC_BASIC( invertd, invertv, BLIS_INVERTV_KER )
 
 
 #undef  GENTFUNC
@@ -312,9 +312,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC2( invscald, invscalv, BLIS_INVSCALV_KER )
-INSERT_GENTFUNC_BASIC2( scald, scalv, BLIS_SCALV_KER )
-INSERT_GENTFUNC_BASIC2( setd,  setv,  BLIS_SETV_KER )
+INSERT_GENTFUNC_BASIC( invscald, invscalv, BLIS_INVSCALV_KER )
+INSERT_GENTFUNC_BASIC( scald, scalv, BLIS_SCALV_KER )
+INSERT_GENTFUNC_BASIC( setd,  setv,  BLIS_SETV_KER )
 
 
 #undef  GENTFUNCR
@@ -390,7 +390,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNCR_BASIC2( setid, setv, BLIS_SETV_KER )
+INSERT_GENTFUNCR_BASIC( setid, setv, BLIS_SETV_KER )
 
 
 #undef  GENTFUNC
@@ -449,7 +449,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC2( shiftd, addv, BLIS_ADDV_KER )
+INSERT_GENTFUNC_BASIC( shiftd, addv, BLIS_ADDV_KER )
 
 
 #undef  GENTFUNC
@@ -528,7 +528,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC2( xpbyd,  xpbyv,  BLIS_XPBYV_KER )
+INSERT_GENTFUNC_BASIC( xpbyd,  xpbyv,  BLIS_XPBYV_KER )
 
 
 #endif
diff --git a/frame/1d/bli_l1d_tapi.h b/frame/1d/bli_l1d_tapi.h
index 8fe882f0c..201bd9ae6 100644
--- a/frame/1d/bli_l1d_tapi.h
+++ b/frame/1d/bli_l1d_tapi.h
@@ -52,9 +52,9 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( addd )
-INSERT_GENTPROT_BASIC0( copyd )
-INSERT_GENTPROT_BASIC0( subd )
+INSERT_GENTPROT_BASIC( addd )
+INSERT_GENTPROT_BASIC( copyd )
+INSERT_GENTPROT_BASIC( subd )
 
 
 #undef  GENTPROT
@@ -73,8 +73,8 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( axpyd )
-INSERT_GENTPROT_BASIC0( scal2d )
+INSERT_GENTPROT_BASIC( axpyd )
+INSERT_GENTPROT_BASIC( scal2d )
 
 
 #undef  GENTPROT
@@ -89,7 +89,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( invertd )
+INSERT_GENTPROT_BASIC( invertd )
 
 
 #undef  GENTPROT
@@ -106,9 +106,9 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( invscald )
-INSERT_GENTPROT_BASIC0( scald )
-INSERT_GENTPROT_BASIC0( setd )
+INSERT_GENTPROT_BASIC( invscald )
+INSERT_GENTPROT_BASIC( scald )
+INSERT_GENTPROT_BASIC( setd )
 
 
 #undef  GENTPROTR
@@ -124,7 +124,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROTR_BASIC0( setid )
+INSERT_GENTPROTR_BASIC( setid )
 
 
 #undef  GENTPROT
@@ -140,7 +140,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( shiftd )
+INSERT_GENTPROT_BASIC( shiftd )
 
 
 #undef  GENTPROT
@@ -159,5 +159,5 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( xpbyd )
+INSERT_GENTPROT_BASIC( xpbyd )
 
diff --git a/frame/1f/bli_l1f_tapi.h b/frame/1f/bli_l1f_tapi.h
index 2ea54df4c..bccd08e5e 100644
--- a/frame/1f/bli_l1f_tapi.h
+++ b/frame/1f/bli_l1f_tapi.h
@@ -53,7 +53,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( axpy2v )
+INSERT_GENTPROT_BASIC( axpy2v )
 
 
 #undef  GENTPROT
@@ -72,7 +72,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( axpyf )
+INSERT_GENTPROT_BASIC( axpyf )
 
 
 #undef  GENTPROT
@@ -92,7 +92,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( dotaxpyv )
+INSERT_GENTPROT_BASIC( dotaxpyv )
 
 
 #undef  GENTPROT
@@ -116,7 +116,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( dotxaxpyf )
+INSERT_GENTPROT_BASIC( dotxaxpyf )
 
 
 #undef  GENTPROT
@@ -136,5 +136,5 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( dotxf )
+INSERT_GENTPROT_BASIC( dotxf )
 
diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c
index 83ccf6853..08dd5c915 100644
--- a/frame/1m/bli_l1m_tapi.c
+++ b/frame/1m/bli_l1m_tapi.c
@@ -181,7 +181,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 */ \
 }
 
-INSERT_GENTFUNC_BASIC0( copym )
+INSERT_GENTFUNC_BASIC( copym )
 
 
 #undef  GENTFUNC
@@ -256,7 +256,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 */ \
 }
 
-INSERT_GENTFUNC_BASIC0( axpym )
+INSERT_GENTFUNC_BASIC( axpym )
 
 
 #undef  GENTFUNC
@@ -353,7 +353,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 */ \
 }
 
-INSERT_GENTFUNC_BASIC0( scal2m )
+INSERT_GENTFUNC_BASIC( scal2m )
 
 
 #undef  GENTFUNC
@@ -397,9 +397,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( invscalm )
-INSERT_GENTFUNC_BASIC0( scalm )
-INSERT_GENTFUNC_BASIC0( setm )
+INSERT_GENTFUNC_BASIC( invscalm )
+INSERT_GENTFUNC_BASIC( scalm )
+INSERT_GENTFUNC_BASIC( setm )
 
 
 #undef  GENTFUNC
@@ -490,7 +490,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 */ \
 }
 
-INSERT_GENTFUNC_BASIC0( xpbym )
+INSERT_GENTFUNC_BASIC( xpbym )
 
 
 #undef  GENTFUNC2
@@ -551,8 +551,8 @@ void PASTEMAC3(chx,chy,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC2_BASIC0( xpbym_md )
-INSERT_GENTFUNC2_MIXDP0( xpbym_md )
+INSERT_GENTFUNC2_BASIC( xpbym_md )
+INSERT_GENTFUNC2_MIX_DP( xpbym_md )
 
 
 #endif
diff --git a/frame/1m/bli_l1m_tapi.h b/frame/1m/bli_l1m_tapi.h
index 531fae075..26d62e23f 100644
--- a/frame/1m/bli_l1m_tapi.h
+++ b/frame/1m/bli_l1m_tapi.h
@@ -53,9 +53,9 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( addm )
-INSERT_GENTPROT_BASIC0( copym )
-INSERT_GENTPROT_BASIC0( subm )
+INSERT_GENTPROT_BASIC( addm )
+INSERT_GENTPROT_BASIC( copym )
+INSERT_GENTPROT_BASIC( subm )
 
 
 #undef  GENTPROT
@@ -75,8 +75,8 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( axpym )
-INSERT_GENTPROT_BASIC0( scal2m )
+INSERT_GENTPROT_BASIC( axpym )
+INSERT_GENTPROT_BASIC( scal2m )
 
 
 #undef  GENTPROT
@@ -95,9 +95,9 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( invscalm )
-INSERT_GENTPROT_BASIC0( scalm )
-INSERT_GENTPROT_BASIC0( setm )
+INSERT_GENTPROT_BASIC( invscalm )
+INSERT_GENTPROT_BASIC( scalm )
+INSERT_GENTPROT_BASIC( setm )
 
 
 #undef  GENTPROT
@@ -117,7 +117,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( xpbym )
+INSERT_GENTPROT_BASIC( xpbym )
 
 
 #undef  GENTPROT2
@@ -137,6 +137,6 @@ BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT2_BASIC0( xpbym_md )
-INSERT_GENTPROT2_MIXDP0( xpbym_md )
+INSERT_GENTPROT2_BASIC( xpbym_md )
+INSERT_GENTPROT2_MIX_DP( xpbym_md )
 
diff --git a/frame/1m/bli_l1m_unb_var1.c b/frame/1m/bli_l1m_unb_var1.c
index 79e4d9efd..53ef4e792 100644
--- a/frame/1m/bli_l1m_unb_var1.c
+++ b/frame/1m/bli_l1m_unb_var1.c
@@ -148,9 +148,9 @@ void PASTEMAC(ch,opname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( addm_unb_var1,  addv,  BLIS_ADDV_KER )
-INSERT_GENTFUNC_BASIC2( copym_unb_var1, copyv, BLIS_COPYV_KER )
-INSERT_GENTFUNC_BASIC2( subm_unb_var1,  subv,  BLIS_SUBV_KER )
+INSERT_GENTFUNC_BASIC( addm_unb_var1,  addv,  BLIS_ADDV_KER )
+INSERT_GENTFUNC_BASIC( copym_unb_var1, copyv, BLIS_COPYV_KER )
+INSERT_GENTFUNC_BASIC( subm_unb_var1,  subv,  BLIS_SUBV_KER )
 
 
 #undef  GENTFUNC
@@ -267,8 +267,8 @@ void PASTEMAC(ch,opname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( axpym_unb_var1,  axpyv,  BLIS_AXPYV_KER )
-INSERT_GENTFUNC_BASIC2( scal2m_unb_var1, scal2v, BLIS_SCAL2V_KER )
+INSERT_GENTFUNC_BASIC( axpym_unb_var1,  axpyv,  BLIS_AXPYV_KER )
+INSERT_GENTFUNC_BASIC( scal2m_unb_var1, scal2v, BLIS_SCAL2V_KER )
 
 
 #undef  GENTFUNC
@@ -373,9 +373,9 @@ void PASTEMAC(ch,opname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( invscalm_unb_var1, invscalv, BLIS_INVSCALV_KER )
-INSERT_GENTFUNC_BASIC2( scalm_unb_var1, scalv, BLIS_SCALV_KER )
-INSERT_GENTFUNC_BASIC2( setm_unb_var1,  setv,  BLIS_SETV_KER )
+INSERT_GENTFUNC_BASIC( invscalm_unb_var1, invscalv, BLIS_INVSCALV_KER )
+INSERT_GENTFUNC_BASIC( scalm_unb_var1, scalv, BLIS_SCALV_KER )
+INSERT_GENTFUNC_BASIC( setm_unb_var1,  setv,  BLIS_SETV_KER )
 
 
 #undef  GENTFUNC
@@ -492,7 +492,7 @@ void PASTEMAC(ch,opname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( xpbym_unb_var1,  xpbyv,  BLIS_XPBYV_KER )
+INSERT_GENTFUNC_BASIC( xpbym_unb_var1,  xpbyv,  BLIS_XPBYV_KER )
 
 
 #undef  GENTFUNC2
@@ -612,6 +612,6 @@ void PASTEMAC2(chx,chy,opname) \
 	} \
 }
 
-INSERT_GENTFUNC2_BASIC0( xpbym_md_unb_var1 )
-INSERT_GENTFUNC2_MIXDP0( xpbym_md_unb_var1 )
+INSERT_GENTFUNC2_BASIC( xpbym_md_unb_var1 )
+INSERT_GENTFUNC2_MIX_DP( xpbym_md_unb_var1 )
 
diff --git a/frame/1m/bli_l1m_unb_var1.h b/frame/1m/bli_l1m_unb_var1.h
index 06aed2fe1..e752208c4 100644
--- a/frame/1m/bli_l1m_unb_var1.h
+++ b/frame/1m/bli_l1m_unb_var1.h
@@ -53,9 +53,9 @@ void PASTEMAC2(ch,opname,_unb_var1) \
        cntx_t* cntx \
      );
 
-INSERT_GENTPROT_BASIC0( addm )
-INSERT_GENTPROT_BASIC0( copym )
-INSERT_GENTPROT_BASIC0( subm )
+INSERT_GENTPROT_BASIC( addm )
+INSERT_GENTPROT_BASIC( copym )
+INSERT_GENTPROT_BASIC( subm )
 
 
 #undef  GENTPROT
@@ -75,8 +75,8 @@ void PASTEMAC2(ch,opname,_unb_var1) \
        cntx_t* cntx \
      );
 
-INSERT_GENTPROT_BASIC0( axpym )
-INSERT_GENTPROT_BASIC0( scal2m )
+INSERT_GENTPROT_BASIC( axpym )
+INSERT_GENTPROT_BASIC( scal2m )
 
 
 #undef  GENTPROT
@@ -95,9 +95,9 @@ void PASTEMAC2(ch,opname,_unb_var1) \
        cntx_t* cntx \
      );
 
-INSERT_GENTPROT_BASIC0( invscalm )
-INSERT_GENTPROT_BASIC0( scalm )
-INSERT_GENTPROT_BASIC0( setm )
+INSERT_GENTPROT_BASIC( invscalm )
+INSERT_GENTPROT_BASIC( scalm )
+INSERT_GENTPROT_BASIC( setm )
 
 
 #undef  GENTPROT
@@ -117,7 +117,7 @@ void PASTEMAC2(ch,opname,_unb_var1) \
        cntx_t* cntx \
      );
 
-INSERT_GENTPROT_BASIC0( xpbym )
+INSERT_GENTPROT_BASIC( xpbym )
 
 
 #undef  GENTPROT2
@@ -137,6 +137,6 @@ void PASTEMAC3(chx,chy,opname,_unb_var1) \
        cntx_t*  cntx \
      );
 
-INSERT_GENTPROT2_BASIC0( xpbym_md )
-INSERT_GENTPROT2_MIXDP0( xpbym_md )
+INSERT_GENTPROT2_BASIC( xpbym_md )
+INSERT_GENTPROT2_MIX_DP( xpbym_md )
 
diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c
index 93801ebe9..c8571d24e 100644
--- a/frame/1m/packm/bli_packm_struc_cxk.c
+++ b/frame/1m/packm/bli_packm_struc_cxk.c
@@ -303,5 +303,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNCR_BASIC2( packm_struc_cxk, packm_cxk, packm_cxc_diag )
+INSERT_GENTFUNCR_BASIC( packm_struc_cxk, packm_cxk, packm_cxc_diag )
 
diff --git a/frame/1m/packm/bli_packm_struc_cxk.h b/frame/1m/packm/bli_packm_struc_cxk.h
index 58bef1d76..129a4d018 100644
--- a/frame/1m/packm/bli_packm_struc_cxk.h
+++ b/frame/1m/packm/bli_packm_struc_cxk.h
@@ -57,5 +57,5 @@ void PASTEMAC(ch,varname) \
        cntx_t* cntx  \
      );
 
-INSERT_GENTPROT_BASIC0( packm_struc_cxk )
+INSERT_GENTPROT_BASIC( packm_struc_cxk )
 
diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.c b/frame/1m/packm/bli_packm_struc_cxk_md.c
index 2b8a07bd5..51212040e 100644
--- a/frame/1m/packm/bli_packm_struc_cxk_md.c
+++ b/frame/1m/packm/bli_packm_struc_cxk_md.c
@@ -277,8 +277,8 @@ void PASTEMAC2(chc,chp,varname) \
 */ \
 }
 
-INSERT_GENTFUNC2_BASIC0( packm_struc_cxk_md )
-INSERT_GENTFUNC2_MIXDP0( packm_struc_cxk_md )
+INSERT_GENTFUNC2_BASIC( packm_struc_cxk_md )
+INSERT_GENTFUNC2_MIX_DP( packm_struc_cxk_md )
 
 
 // -----------------------------------------------------------------------------
@@ -403,8 +403,8 @@ void PASTEMAC2(cha,chp,opname) \
 	} \
 }
 
-INSERT_GENTFUNC2_BASIC0( packm_cxk_1r_md )
-INSERT_GENTFUNC2_MIXDP0( packm_cxk_1r_md )
+INSERT_GENTFUNC2_BASIC( packm_cxk_1r_md )
+INSERT_GENTFUNC2_MIX_DP( packm_cxk_1r_md )
 
 // -----------------------------------------------------------------------------
 
@@ -517,7 +517,7 @@ void PASTEMAC2(cha,chp,opname) \
 	} \
 }
 
-INSERT_GENTFUNC2_BASIC0( packm_cxk_1e_md )
-INSERT_GENTFUNC2_MIXDP0( packm_cxk_1e_md )
+INSERT_GENTFUNC2_BASIC( packm_cxk_1e_md )
+INSERT_GENTFUNC2_MIX_DP( packm_cxk_1e_md )
 
 #endif
diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.h b/frame/1m/packm/bli_packm_struc_cxk_md.h
index 8c3fa0335..f92126eb4 100644
--- a/frame/1m/packm/bli_packm_struc_cxk_md.h
+++ b/frame/1m/packm/bli_packm_struc_cxk_md.h
@@ -57,8 +57,8 @@ void PASTEMAC2(chc,chp,varname) \
        cntx_t*  cntx  \
      );
 
-INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
-INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )
+INSERT_GENTPROT2_BASIC( packm_struc_cxk_md )
+INSERT_GENTPROT2_MIX_DP( packm_struc_cxk_md )
 
 
 #undef  GENTPROT2
@@ -74,9 +74,9 @@ void PASTEMAC2(cha,chp,opname) \
        ctype_p* p,             inc_t ldp  \
      );
 
-INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
-INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md )
+INSERT_GENTPROT2_BASIC( packm_cxk_1e_md )
+INSERT_GENTPROT2_MIX_DP( packm_cxk_1e_md )
 
-INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
-INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )
+INSERT_GENTPROT2_BASIC( packm_cxk_1r_md )
+INSERT_GENTPROT2_MIX_DP( packm_cxk_1r_md )
 
diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c
index 8659ad5e2..9cbf88b1a 100644
--- a/frame/1m/unpackm/bli_unpackm_blk_var1.c
+++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c
@@ -286,5 +286,5 @@ void PASTEMAC(ch,varname) \
 \
 }
 
-INSERT_GENTFUNC_BASIC0( unpackm_blk_var1 )
+INSERT_GENTFUNC_BASIC( unpackm_blk_var1 )
 
diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.h b/frame/1m/unpackm/bli_unpackm_blk_var1.h
index 4a92dc1b7..18dc2326b 100644
--- a/frame/1m/unpackm/bli_unpackm_blk_var1.h
+++ b/frame/1m/unpackm/bli_unpackm_blk_var1.h
@@ -62,5 +62,5 @@ void PASTEMAC(ch,varname) \
        cntx_t* cntx  \
      );
 
-INSERT_GENTPROT_BASIC0( unpackm_blk_var1 )
+INSERT_GENTPROT_BASIC( unpackm_blk_var1 )
 
diff --git a/frame/2/bli_l2_tapi.c b/frame/2/bli_l2_tapi.c
index 4bef7c81a..3366c26f9 100644
--- a/frame/2/bli_l2_tapi.c
+++ b/frame/2/bli_l2_tapi.c
@@ -120,7 +120,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC3( gemv, gemv, gemv_unf_var1, gemv_unf_var2 )
+INSERT_GENTFUNC_BASIC( gemv, gemv, gemv_unf_var1, gemv_unf_var2 )
 
 
 #undef  GENTFUNC
@@ -172,7 +172,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC3( ger, ger, ger_unb_var1, ger_unb_var2 )
+INSERT_GENTFUNC_BASIC( ger, ger, ger_unb_var1, ger_unb_var2 )
 
 
 #undef  GENTFUNC
@@ -248,8 +248,8 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC4( hemv, hemv, BLIS_CONJUGATE,    hemv_unf_var1, hemv_unf_var3 )
-INSERT_GENTFUNC_BASIC4( symv, hemv, BLIS_NO_CONJUGATE, hemv_unf_var1, hemv_unf_var3 )
+INSERT_GENTFUNC_BASIC( hemv, hemv, BLIS_CONJUGATE,    hemv_unf_var1, hemv_unf_var3 )
+INSERT_GENTFUNC_BASIC( symv, hemv, BLIS_NO_CONJUGATE, hemv_unf_var1, hemv_unf_var3 )
 
 
 #undef  GENTFUNCR
@@ -313,7 +313,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNCR_BASIC4( her, her, BLIS_CONJUGATE, her_unb_var1, her_unb_var2 )
+INSERT_GENTFUNCR_BASIC( her, her, BLIS_CONJUGATE, her_unb_var1, her_unb_var2 )
 
 
 #undef  GENTFUNC
@@ -370,7 +370,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC4( syr, her, BLIS_NO_CONJUGATE, her_unb_var1, her_unb_var2 )
+INSERT_GENTFUNC_BASIC( syr, her, BLIS_NO_CONJUGATE, her_unb_var1, her_unb_var2 )
 
 
 #undef  GENTFUNC
@@ -431,8 +431,8 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC4( her2, her2, BLIS_CONJUGATE,    her2_unf_var1, her2_unf_var4 )
-INSERT_GENTFUNC_BASIC4( syr2, her2, BLIS_NO_CONJUGATE, her2_unf_var1, her2_unf_var4 )
+INSERT_GENTFUNC_BASIC( her2, her2, BLIS_CONJUGATE,    her2_unf_var1, her2_unf_var4 )
+INSERT_GENTFUNC_BASIC( syr2, her2, BLIS_NO_CONJUGATE, her2_unf_var1, her2_unf_var4 )
 
 
 #undef  GENTFUNC
@@ -505,8 +505,8 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC3( trmv, trmv, trmv_unf_var1, trmv_unf_var2 )
-INSERT_GENTFUNC_BASIC3( trsv, trmv, trsv_unf_var1, trsv_unf_var2 )
+INSERT_GENTFUNC_BASIC( trmv, trmv, trmv_unf_var1, trmv_unf_var2 )
+INSERT_GENTFUNC_BASIC( trsv, trmv, trsv_unf_var1, trsv_unf_var2 )
 
 
 #endif
diff --git a/frame/2/bli_l2_tapi.h b/frame/2/bli_l2_tapi.h
index edd9607b6..207d0de84 100644
--- a/frame/2/bli_l2_tapi.h
+++ b/frame/2/bli_l2_tapi.h
@@ -54,7 +54,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( gemv )
+INSERT_GENTPROT_BASIC( gemv )
 
 
 #undef  GENTPROT
@@ -73,7 +73,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( ger )
+INSERT_GENTPROT_BASIC( ger )
 
 
 #undef  GENTPROT
@@ -93,8 +93,8 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( hemv )
-INSERT_GENTPROT_BASIC0( symv )
+INSERT_GENTPROT_BASIC( hemv )
+INSERT_GENTPROT_BASIC( symv )
 
 
 #undef  GENTPROTR
@@ -111,7 +111,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROTR_BASIC0( her )
+INSERT_GENTPROTR_BASIC( her )
 
 
 #undef  GENTPROT
@@ -128,7 +128,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( syr )
+INSERT_GENTPROT_BASIC( syr )
 
 
 #undef  GENTPROT
@@ -147,8 +147,8 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( her2 )
-INSERT_GENTPROT_BASIC0( syr2 )
+INSERT_GENTPROT_BASIC( her2 )
+INSERT_GENTPROT_BASIC( syr2 )
 
 
 #undef  GENTPROT
@@ -166,5 +166,5 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( trmv )
-INSERT_GENTPROT_BASIC0( trsv )
+INSERT_GENTPROT_BASIC( trmv )
+INSERT_GENTPROT_BASIC( trsv )
diff --git a/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c b/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c
index 397189971..6d2e2b852 100644
--- a/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c
+++ b/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c
@@ -114,7 +114,7 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-//INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
+//INSERT_GENTFUNC_BASIC( gemv_unf_var2 )
 GENTFUNC( float,    s, gemv_unf_var2, _zen_int10, _zen_int_5,    5 )
 GENTFUNC( double,   d, gemv_unf_var2, _zen_int10, _zen_int_16x4, 4 )
 GENTFUNC( scomplex, c, gemv_unf_var2, _zen_int10, _zen_int_4,    4 )
@@ -215,6 +215,6 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-//INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
+//INSERT_GENTFUNC_BASIC( gemv_unf_var2 )
 GENTFUNC( dcomplex, z, gemv_unf_var2 )
 
diff --git a/frame/2/gemv/bli_gemv_unb_var1.c b/frame/2/gemv/bli_gemv_unb_var1.c
index 606c867ba..f0c89f6b7 100644
--- a/frame/2/gemv/bli_gemv_unb_var1.c
+++ b/frame/2/gemv/bli_gemv_unb_var1.c
@@ -92,5 +92,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( gemv_unb_var1 )
+INSERT_GENTFUNC_BASIC( gemv_unb_var1 )
 
diff --git a/frame/2/gemv/bli_gemv_unb_var2.c b/frame/2/gemv/bli_gemv_unb_var2.c
index cfee006f9..16ec68278 100644
--- a/frame/2/gemv/bli_gemv_unb_var2.c
+++ b/frame/2/gemv/bli_gemv_unb_var2.c
@@ -122,5 +122,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( gemv_unb_var2 )
+INSERT_GENTFUNC_BASIC( gemv_unb_var2 )
 
diff --git a/frame/2/gemv/bli_gemv_unf_var1.c b/frame/2/gemv/bli_gemv_unf_var1.c
index 0f7eeaf1d..86cd41c46 100644
--- a/frame/2/gemv/bli_gemv_unf_var1.c
+++ b/frame/2/gemv/bli_gemv_unf_var1.c
@@ -98,5 +98,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( gemv_unf_var1 )
+INSERT_GENTFUNC_BASIC( gemv_unf_var1 )
 
diff --git a/frame/2/gemv/bli_gemv_unf_var2.c b/frame/2/gemv/bli_gemv_unf_var2.c
index c16511da2..e73f062e5 100644
--- a/frame/2/gemv/bli_gemv_unf_var2.c
+++ b/frame/2/gemv/bli_gemv_unf_var2.c
@@ -125,5 +125,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
+INSERT_GENTFUNC_BASIC( gemv_unf_var2 )
 
diff --git a/frame/2/gemv/bli_gemv_var.h b/frame/2/gemv/bli_gemv_var.h
index 2102c1d0b..f06cdac75 100644
--- a/frame/2/gemv/bli_gemv_var.h
+++ b/frame/2/gemv/bli_gemv_var.h
@@ -82,9 +82,9 @@ void PASTEMAC(ch,varname) \
        cntx_t* cntx  \
      );
 
-INSERT_GENTPROT_BASIC0( gemv_unb_var1 )
-INSERT_GENTPROT_BASIC0( gemv_unb_var2 )
+INSERT_GENTPROT_BASIC( gemv_unb_var1 )
+INSERT_GENTPROT_BASIC( gemv_unb_var2 )
 
-INSERT_GENTPROT_BASIC0( gemv_unf_var1 )
-INSERT_GENTPROT_BASIC0( gemv_unf_var2 )
+INSERT_GENTPROT_BASIC( gemv_unf_var1 )
+INSERT_GENTPROT_BASIC( gemv_unf_var2 )
 
diff --git a/frame/2/ger/bli_ger_unb_var1.c b/frame/2/ger/bli_ger_unb_var1.c
index ceab85012..24e96822b 100644
--- a/frame/2/ger/bli_ger_unb_var1.c
+++ b/frame/2/ger/bli_ger_unb_var1.c
@@ -83,5 +83,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( ger_unb_var1 )
+INSERT_GENTFUNC_BASIC( ger_unb_var1 )
 
diff --git a/frame/2/ger/bli_ger_unb_var2.c b/frame/2/ger/bli_ger_unb_var2.c
index 5bed4116b..fb38e683d 100644
--- a/frame/2/ger/bli_ger_unb_var2.c
+++ b/frame/2/ger/bli_ger_unb_var2.c
@@ -83,5 +83,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( ger_unb_var2 )
+INSERT_GENTFUNC_BASIC( ger_unb_var2 )
 
diff --git a/frame/2/ger/bli_ger_var.h b/frame/2/ger/bli_ger_var.h
index a19bac08c..d1648aa88 100644
--- a/frame/2/ger/bli_ger_var.h
+++ b/frame/2/ger/bli_ger_var.h
@@ -77,6 +77,6 @@ void PASTEMAC(ch,varname) \
        cntx_t* cntx  \
      );
 
-INSERT_GENTPROT_BASIC0( ger_unb_var1 )
-INSERT_GENTPROT_BASIC0( ger_unb_var2 )
+INSERT_GENTPROT_BASIC( ger_unb_var1 )
+INSERT_GENTPROT_BASIC( ger_unb_var2 )
 
diff --git a/frame/2/hemv/bli_hemv_unb_var1.c b/frame/2/hemv/bli_hemv_unb_var1.c
index f592bdf6c..7e622f09d 100644
--- a/frame/2/hemv/bli_hemv_unb_var1.c
+++ b/frame/2/hemv/bli_hemv_unb_var1.c
@@ -174,5 +174,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( hemv_unb_var1 )
+INSERT_GENTFUNC_BASIC( hemv_unb_var1 )
 
diff --git a/frame/2/hemv/bli_hemv_unb_var2.c b/frame/2/hemv/bli_hemv_unb_var2.c
index 9465659fa..91a5b6ef5 100644
--- a/frame/2/hemv/bli_hemv_unb_var2.c
+++ b/frame/2/hemv/bli_hemv_unb_var2.c
@@ -178,5 +178,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( hemv_unb_var2 )
+INSERT_GENTFUNC_BASIC( hemv_unb_var2 )
 
diff --git a/frame/2/hemv/bli_hemv_unb_var3.c b/frame/2/hemv/bli_hemv_unb_var3.c
index a93b78b81..0ea2605ea 100644
--- a/frame/2/hemv/bli_hemv_unb_var3.c
+++ b/frame/2/hemv/bli_hemv_unb_var3.c
@@ -173,5 +173,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( hemv_unb_var3 )
+INSERT_GENTFUNC_BASIC( hemv_unb_var3 )
 
diff --git a/frame/2/hemv/bli_hemv_unb_var4.c b/frame/2/hemv/bli_hemv_unb_var4.c
index 810900dc2..56b252ec2 100644
--- a/frame/2/hemv/bli_hemv_unb_var4.c
+++ b/frame/2/hemv/bli_hemv_unb_var4.c
@@ -171,5 +171,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( hemv_unb_var4 )
+INSERT_GENTFUNC_BASIC( hemv_unb_var4 )
 
diff --git a/frame/2/hemv/bli_hemv_unf_var1.c b/frame/2/hemv/bli_hemv_unf_var1.c
index 65ddeb3e6..ca9f72fe7 100644
--- a/frame/2/hemv/bli_hemv_unf_var1.c
+++ b/frame/2/hemv/bli_hemv_unf_var1.c
@@ -213,5 +213,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( hemv_unf_var1 )
+INSERT_GENTFUNC_BASIC( hemv_unf_var1 )
 
diff --git a/frame/2/hemv/bli_hemv_unf_var1a.c b/frame/2/hemv/bli_hemv_unf_var1a.c
index 5377f20a2..44d0e3794 100644
--- a/frame/2/hemv/bli_hemv_unf_var1a.c
+++ b/frame/2/hemv/bli_hemv_unf_var1a.c
@@ -164,5 +164,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( hemv_unf_var1a )
+INSERT_GENTFUNC_BASIC( hemv_unf_var1a )
 
diff --git a/frame/2/hemv/bli_hemv_unf_var3.c b/frame/2/hemv/bli_hemv_unf_var3.c
index 97a7a5a66..16f45f051 100644
--- a/frame/2/hemv/bli_hemv_unf_var3.c
+++ b/frame/2/hemv/bli_hemv_unf_var3.c
@@ -213,5 +213,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( hemv_unf_var3 )
+INSERT_GENTFUNC_BASIC( hemv_unf_var3 )
 
diff --git a/frame/2/hemv/bli_hemv_unf_var3a.c b/frame/2/hemv/bli_hemv_unf_var3a.c
index c7aa38550..c54485d65 100644
--- a/frame/2/hemv/bli_hemv_unf_var3a.c
+++ b/frame/2/hemv/bli_hemv_unf_var3a.c
@@ -163,5 +163,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( hemv_unf_var3a )
+INSERT_GENTFUNC_BASIC( hemv_unf_var3a )
 
diff --git a/frame/2/hemv/bli_hemv_var.h b/frame/2/hemv/bli_hemv_var.h
index a591f1afe..1cf50cadf 100644
--- a/frame/2/hemv/bli_hemv_var.h
+++ b/frame/2/hemv/bli_hemv_var.h
@@ -90,13 +90,13 @@ void PASTEMAC(ch,varname) \
        cntx_t* cntx  \
      );
 
-INSERT_GENTPROT_BASIC0( hemv_unb_var1 )
-INSERT_GENTPROT_BASIC0( hemv_unb_var2 )
-INSERT_GENTPROT_BASIC0( hemv_unb_var3 )
-INSERT_GENTPROT_BASIC0( hemv_unb_var4 )
-
-INSERT_GENTPROT_BASIC0( hemv_unf_var1 )
-INSERT_GENTPROT_BASIC0( hemv_unf_var3 )
-INSERT_GENTPROT_BASIC0( hemv_unf_var1a )
-INSERT_GENTPROT_BASIC0( hemv_unf_var3a )
+INSERT_GENTPROT_BASIC( hemv_unb_var1 )
+INSERT_GENTPROT_BASIC( hemv_unb_var2 )
+INSERT_GENTPROT_BASIC( hemv_unb_var3 )
+INSERT_GENTPROT_BASIC( hemv_unb_var4 )
+
+INSERT_GENTPROT_BASIC( hemv_unf_var1 )
+INSERT_GENTPROT_BASIC( hemv_unf_var3 )
+INSERT_GENTPROT_BASIC( hemv_unf_var1a )
+INSERT_GENTPROT_BASIC( hemv_unf_var3a )
 
diff --git a/frame/2/her/bli_her_unb_var1.c b/frame/2/her/bli_her_unb_var1.c
index b1d91269e..7f7215c5d 100644
--- a/frame/2/her/bli_her_unb_var1.c
+++ b/frame/2/her/bli_her_unb_var1.c
@@ -143,5 +143,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( her_unb_var1 )
+INSERT_GENTFUNC_BASIC( her_unb_var1 )
 
diff --git a/frame/2/her/bli_her_unb_var2.c b/frame/2/her/bli_her_unb_var2.c
index adf86635a..1f071ca42 100644
--- a/frame/2/her/bli_her_unb_var2.c
+++ b/frame/2/her/bli_her_unb_var2.c
@@ -143,5 +143,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( her_unb_var2 )
+INSERT_GENTFUNC_BASIC( her_unb_var2 )
 
diff --git a/frame/2/her/bli_her_var.h b/frame/2/her/bli_her_var.h
index f7f02baac..4e22cb324 100644
--- a/frame/2/her/bli_her_var.h
+++ b/frame/2/her/bli_her_var.h
@@ -76,6 +76,6 @@ void PASTEMAC(ch,varname) \
        cntx_t* cntx  \
      );
 
-INSERT_GENTPROTR_BASIC0( her_unb_var1 )
-INSERT_GENTPROTR_BASIC0( her_unb_var2 )
+INSERT_GENTPROTR_BASIC( her_unb_var1 )
+INSERT_GENTPROTR_BASIC( her_unb_var2 )
 
diff --git a/frame/2/her2/bli_her2_unb_var1.c b/frame/2/her2/bli_her2_unb_var1.c
index 5ae70e321..e0bfd7773 100644
--- a/frame/2/her2/bli_her2_unb_var1.c
+++ b/frame/2/her2/bli_her2_unb_var1.c
@@ -163,5 +163,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( her2_unb_var1 )
+INSERT_GENTFUNC_BASIC( her2_unb_var1 )
 
diff --git a/frame/2/her2/bli_her2_unb_var2.c b/frame/2/her2/bli_her2_unb_var2.c
index 14135e894..0ab92fb38 100644
--- a/frame/2/her2/bli_her2_unb_var2.c
+++ b/frame/2/her2/bli_her2_unb_var2.c
@@ -172,5 +172,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( her2_unb_var2 )
+INSERT_GENTFUNC_BASIC( her2_unb_var2 )
 
diff --git a/frame/2/her2/bli_her2_unb_var3.c b/frame/2/her2/bli_her2_unb_var3.c
index ffcaf434f..dc2630c46 100644
--- a/frame/2/her2/bli_her2_unb_var3.c
+++ b/frame/2/her2/bli_her2_unb_var3.c
@@ -172,5 +172,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( her2_unb_var3 )
+INSERT_GENTFUNC_BASIC( her2_unb_var3 )
 
diff --git a/frame/2/her2/bli_her2_unb_var4.c b/frame/2/her2/bli_her2_unb_var4.c
index 8b8be1c57..59902654d 100644
--- a/frame/2/her2/bli_her2_unb_var4.c
+++ b/frame/2/her2/bli_her2_unb_var4.c
@@ -171,5 +171,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( her2_unb_var4 )
+INSERT_GENTFUNC_BASIC( her2_unb_var4 )
 
diff --git a/frame/2/her2/bli_her2_unf_var1.c b/frame/2/her2/bli_her2_unf_var1.c
index c2250b87a..aa0de6a2f 100644
--- a/frame/2/her2/bli_her2_unf_var1.c
+++ b/frame/2/her2/bli_her2_unf_var1.c
@@ -156,5 +156,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( her2_unf_var1 )
+INSERT_GENTFUNC_BASIC( her2_unf_var1 )
 
diff --git a/frame/2/her2/bli_her2_unf_var4.c b/frame/2/her2/bli_her2_unf_var4.c
index 7ad36e951..4095e5e65 100644
--- a/frame/2/her2/bli_her2_unf_var4.c
+++ b/frame/2/her2/bli_her2_unf_var4.c
@@ -164,5 +164,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( her2_unf_var4 )
+INSERT_GENTFUNC_BASIC( her2_unf_var4 )
 
diff --git a/frame/2/her2/bli_her2_var.h b/frame/2/her2/bli_her2_var.h
index 99672f68c..f4a8b902e 100644
--- a/frame/2/her2/bli_her2_var.h
+++ b/frame/2/her2/bli_her2_var.h
@@ -87,11 +87,11 @@ void PASTEMAC(ch,varname) \
        cntx_t* cntx  \
      );
 
-INSERT_GENTPROT_BASIC0( her2_unb_var1 )
-INSERT_GENTPROT_BASIC0( her2_unb_var2 )
-INSERT_GENTPROT_BASIC0( her2_unb_var3 )
-INSERT_GENTPROT_BASIC0( her2_unb_var4 )
+INSERT_GENTPROT_BASIC( her2_unb_var1 )
+INSERT_GENTPROT_BASIC( her2_unb_var2 )
+INSERT_GENTPROT_BASIC( her2_unb_var3 )
+INSERT_GENTPROT_BASIC( her2_unb_var4 )
 
-INSERT_GENTPROT_BASIC0( her2_unf_var1 )
-INSERT_GENTPROT_BASIC0( her2_unf_var4 )
+INSERT_GENTPROT_BASIC( her2_unf_var1 )
+INSERT_GENTPROT_BASIC( her2_unf_var4 )
 
diff --git a/frame/2/trmv/bli_trmv_unb_var1.c b/frame/2/trmv/bli_trmv_unb_var1.c
index 9ed74b037..36ba911b8 100644
--- a/frame/2/trmv/bli_trmv_unb_var1.c
+++ b/frame/2/trmv/bli_trmv_unb_var1.c
@@ -148,5 +148,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( trmv_unb_var1 )
+INSERT_GENTFUNC_BASIC( trmv_unb_var1 )
 
diff --git a/frame/2/trmv/bli_trmv_unb_var2.c b/frame/2/trmv/bli_trmv_unb_var2.c
index 5674c4d74..91b85d685 100644
--- a/frame/2/trmv/bli_trmv_unb_var2.c
+++ b/frame/2/trmv/bli_trmv_unb_var2.c
@@ -146,5 +146,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( trmv_unb_var2 )
+INSERT_GENTFUNC_BASIC( trmv_unb_var2 )
 
diff --git a/frame/2/trmv/bli_trmv_unf_var1.c b/frame/2/trmv/bli_trmv_unf_var1.c
index 1a8199cae..70b4fa7f1 100644
--- a/frame/2/trmv/bli_trmv_unf_var1.c
+++ b/frame/2/trmv/bli_trmv_unf_var1.c
@@ -213,5 +213,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( trmv_unf_var1 )
+INSERT_GENTFUNC_BASIC( trmv_unf_var1 )
 
diff --git a/frame/2/trmv/bli_trmv_unf_var2.c b/frame/2/trmv/bli_trmv_unf_var2.c
index 5d9d37aa8..981a819de 100644
--- a/frame/2/trmv/bli_trmv_unf_var2.c
+++ b/frame/2/trmv/bli_trmv_unf_var2.c
@@ -208,5 +208,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( trmv_unf_var2 )
+INSERT_GENTFUNC_BASIC( trmv_unf_var2 )
 
diff --git a/frame/2/trmv/bli_trmv_var.h b/frame/2/trmv/bli_trmv_var.h
index 5a36a656d..2042f0f7b 100644
--- a/frame/2/trmv/bli_trmv_var.h
+++ b/frame/2/trmv/bli_trmv_var.h
@@ -80,9 +80,9 @@ void PASTEMAC(ch,varname) \
        cntx_t* cntx  \
      );
 
-INSERT_GENTPROT_BASIC0( trmv_unb_var1 )
-INSERT_GENTPROT_BASIC0( trmv_unb_var2 )
+INSERT_GENTPROT_BASIC( trmv_unb_var1 )
+INSERT_GENTPROT_BASIC( trmv_unb_var2 )
 
-INSERT_GENTPROT_BASIC0( trmv_unf_var1 )
-INSERT_GENTPROT_BASIC0( trmv_unf_var2 )
+INSERT_GENTPROT_BASIC( trmv_unf_var1 )
+INSERT_GENTPROT_BASIC( trmv_unf_var2 )
 
diff --git a/frame/2/trsv/bli_trsv_unb_var1.c b/frame/2/trsv/bli_trsv_unb_var1.c
index e50162956..126e3eb7c 100644
--- a/frame/2/trsv/bli_trsv_unb_var1.c
+++ b/frame/2/trsv/bli_trsv_unb_var1.c
@@ -161,5 +161,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( trsv_unb_var1 )
+INSERT_GENTFUNC_BASIC( trsv_unb_var1 )
 
diff --git a/frame/2/trsv/bli_trsv_unb_var2.c b/frame/2/trsv/bli_trsv_unb_var2.c
index 661489f1a..941cf43b9 100644
--- a/frame/2/trsv/bli_trsv_unb_var2.c
+++ b/frame/2/trsv/bli_trsv_unb_var2.c
@@ -159,5 +159,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( trsv_unb_var2 )
+INSERT_GENTFUNC_BASIC( trsv_unb_var2 )
 
diff --git a/frame/2/trsv/bli_trsv_unf_var1.c b/frame/2/trsv/bli_trsv_unf_var1.c
index 88e9101ce..d64bba63f 100644
--- a/frame/2/trsv/bli_trsv_unf_var1.c
+++ b/frame/2/trsv/bli_trsv_unf_var1.c
@@ -227,5 +227,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( trsv_unf_var1 )
+INSERT_GENTFUNC_BASIC( trsv_unf_var1 )
 
diff --git a/frame/2/trsv/bli_trsv_unf_var2.c b/frame/2/trsv/bli_trsv_unf_var2.c
index edde2bf5b..f73d32413 100644
--- a/frame/2/trsv/bli_trsv_unf_var2.c
+++ b/frame/2/trsv/bli_trsv_unf_var2.c
@@ -222,5 +222,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( trsv_unf_var2 )
+INSERT_GENTFUNC_BASIC( trsv_unf_var2 )
 
diff --git a/frame/2/trsv/bli_trsv_var.h b/frame/2/trsv/bli_trsv_var.h
index 064394355..35e8db301 100644
--- a/frame/2/trsv/bli_trsv_var.h
+++ b/frame/2/trsv/bli_trsv_var.h
@@ -80,9 +80,9 @@ void PASTEMAC(ch,varname) \
        cntx_t* cntx  \
      );
 
-INSERT_GENTPROT_BASIC0( trsv_unb_var1 )
-INSERT_GENTPROT_BASIC0( trsv_unb_var2 )
+INSERT_GENTPROT_BASIC( trsv_unb_var1 )
+INSERT_GENTPROT_BASIC( trsv_unb_var2 )
 
-INSERT_GENTPROT_BASIC0( trsv_unf_var1 )
-INSERT_GENTPROT_BASIC0( trsv_unf_var2 )
+INSERT_GENTPROT_BASIC( trsv_unf_var1 )
+INSERT_GENTPROT_BASIC( trsv_unf_var2 )
 
diff --git a/frame/3/bli_l3_ind_ukr.h b/frame/3/bli_l3_ind_ukr.h
index 16b2c1173..141c862df 100644
--- a/frame/3/bli_l3_ind_ukr.h
+++ b/frame/3/bli_l3_ind_ukr.h
@@ -55,7 +55,7 @@ void PASTEMAC(ch,opname) \
        const cntx_t*    cntx  \
      );
 
-INSERT_GENTPROT_BASIC0( gemm1m_ukr_name )
+INSERT_GENTPROT_BASIC( gemm1m_ukr_name )
 
 
 #undef  GENTPROT
@@ -76,8 +76,8 @@ void PASTEMAC(ch,opname) \
        const cntx_t*    cntx  \
      );
 
-INSERT_GENTPROT_BASIC0( gemmtrsm1m_l_ukr_name )
-INSERT_GENTPROT_BASIC0( gemmtrsm1m_u_ukr_name )
+INSERT_GENTPROT_BASIC( gemmtrsm1m_l_ukr_name )
+INSERT_GENTPROT_BASIC( gemmtrsm1m_u_ukr_name )
 
 
 #undef  GENTPROT
@@ -92,7 +92,7 @@ void PASTEMAC(ch,opname) \
        const cntx_t*    cntx  \
      );
 
-INSERT_GENTPROT_BASIC0( trsm1m_l_ukr_name )
-INSERT_GENTPROT_BASIC0( trsm1m_u_ukr_name )
+INSERT_GENTPROT_BASIC( trsm1m_l_ukr_name )
+INSERT_GENTPROT_BASIC( trsm1m_u_ukr_name )
 
 
diff --git a/frame/3/bli_l3_sup_packm_var.h b/frame/3/bli_l3_sup_packm_var.h
index 17cf9a482..a62320161 100644
--- a/frame/3/bli_l3_sup_packm_var.h
+++ b/frame/3/bli_l3_sup_packm_var.h
@@ -56,7 +56,7 @@ void PASTEMAC(ch,varname) \
        thrinfo_t* thread  \
      );
 
-INSERT_GENTPROT_BASIC0( packm_sup_var1 )
+INSERT_GENTPROT_BASIC( packm_sup_var1 )
 
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, varname ) \
@@ -74,5 +74,5 @@ void PASTEMAC(ch,varname) \
        thrinfo_t* thread  \
      );
 
-INSERT_GENTPROT_BASIC0( packm_sup_var2 )
+INSERT_GENTPROT_BASIC( packm_sup_var2 )
 
diff --git a/frame/3/bli_l3_sup_var12.c b/frame/3/bli_l3_sup_var12.c
index 4162c3d33..a1e20c132 100644
--- a/frame/3/bli_l3_sup_var12.c
+++ b/frame/3/bli_l3_sup_var12.c
@@ -398,7 +398,7 @@ PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c
 */ \
 }
 
-INSERT_GENTFUNC_BASIC0( gemmsup_ref_var2 )
+INSERT_GENTFUNC_BASIC( gemmsup_ref_var2 )
 
 
 //
@@ -729,7 +729,7 @@ PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c
 */ \
 }
 
-INSERT_GENTFUNC_BASIC0( gemmsup_ref_var1 )
+INSERT_GENTFUNC_BASIC( gemmsup_ref_var1 )
 #endif
 
 
diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h
index 8bbb73ca9..5b3d6f6a4 100644
--- a/frame/3/bli_l3_sup_vars.h
+++ b/frame/3/bli_l3_sup_vars.h
@@ -86,8 +86,8 @@ void PASTEMAC(ch,varname) \
        thrinfo_t* thread  \
      );
 
-INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 )
-INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 )
+INSERT_GENTPROT_BASIC( gemmsup_ref_var1 )
+INSERT_GENTPROT_BASIC( gemmsup_ref_var2 )
 
 // -----------------------------------------------------------------------------
 
diff --git a/frame/3/bli_l3_tapi.c b/frame/3/bli_l3_tapi.c
index 8f256a11a..7d5883311 100644
--- a/frame/3/bli_l3_tapi.c
+++ b/frame/3/bli_l3_tapi.c
@@ -72,7 +72,7 @@ void PASTEMAC(ch,opname) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( gemm )
+INSERT_GENTFUNC_BASIC( gemm )
 
 
 #undef  GENTFUNC
@@ -110,7 +110,7 @@ void PASTEMAC(ch,opname) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( gemmt )
+INSERT_GENTFUNC_BASIC( gemmt )
 
 
 #undef  GENTFUNC
@@ -185,7 +185,7 @@ void PASTEMAC(ch,opname) \
 	); \
 }
 
-INSERT_GENTFUNCR_BASIC0( herk )
+INSERT_GENTFUNCR_BASIC( herk )
 
 
 #undef  GENTFUNCR
@@ -223,7 +223,7 @@ void PASTEMAC(ch,opname) \
 	); \
 }
 
-INSERT_GENTFUNCR_BASIC0( her2k )
+INSERT_GENTFUNCR_BASIC( her2k )
 
 
 #undef  GENTFUNC
@@ -257,7 +257,7 @@ void PASTEMAC(ch,opname) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( syrk )
+INSERT_GENTFUNC_BASIC( syrk )
 
 
 #undef  GENTFUNC
@@ -295,7 +295,7 @@ void PASTEMAC(ch,opname) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( syr2k )
+INSERT_GENTFUNC_BASIC( syr2k )
 
 
 #undef  GENTFUNC
@@ -337,7 +337,7 @@ void PASTEMAC(ch,opname) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( trmm3 )
+INSERT_GENTFUNC_BASIC( trmm3 )
 
 
 #undef  GENTFUNC
@@ -373,6 +373,6 @@ void PASTEMAC(ch,opname) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( trmm )
-INSERT_GENTFUNC_BASIC0( trsm )
+INSERT_GENTFUNC_BASIC( trmm )
+INSERT_GENTFUNC_BASIC( trsm )
 
diff --git a/frame/3/bli_l3_tapi.h b/frame/3/bli_l3_tapi.h
index 9b7a9b077..81ddf6945 100644
--- a/frame/3/bli_l3_tapi.h
+++ b/frame/3/bli_l3_tapi.h
@@ -55,7 +55,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              ctype*  c, inc_t rs_c, inc_t cs_c  \
      );
 
-INSERT_GENTPROT_BASIC0( gemm )
+INSERT_GENTPROT_BASIC( gemm )
 
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
@@ -75,8 +75,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              ctype*  c, inc_t rs_c, inc_t cs_c  \
      );
 
-INSERT_GENTPROT_BASIC0( hemm )
-INSERT_GENTPROT_BASIC0( symm )
+INSERT_GENTPROT_BASIC( hemm )
+INSERT_GENTPROT_BASIC( symm )
 
 
 #undef  GENTPROTR
@@ -94,7 +94,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              ctype*   c, inc_t rs_c, inc_t cs_c  \
      );
 
-INSERT_GENTPROTR_BASIC0( herk )
+INSERT_GENTPROTR_BASIC( herk )
 
 
 #undef  GENTPROTR
@@ -114,7 +114,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              ctype*   c, inc_t rs_c, inc_t cs_c  \
      );
 
-INSERT_GENTPROTR_BASIC0( her2k )
+INSERT_GENTPROTR_BASIC( her2k )
 
 
 #undef  GENTPROT
@@ -132,7 +132,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              ctype*  c, inc_t rs_c, inc_t cs_c  \
      );
 
-INSERT_GENTPROT_BASIC0( syrk )
+INSERT_GENTPROT_BASIC( syrk )
 
 
 #undef  GENTPROT
@@ -152,8 +152,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              ctype*  c, inc_t rs_c, inc_t cs_c  \
      );
 
-INSERT_GENTPROT_BASIC0( gemmt )
-INSERT_GENTPROT_BASIC0( syr2k )
+INSERT_GENTPROT_BASIC( gemmt )
+INSERT_GENTPROT_BASIC( syr2k )
 
 
 #undef  GENTPROT
@@ -175,7 +175,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              ctype*  c, inc_t rs_c, inc_t cs_c  \
      );
 
-INSERT_GENTPROT_BASIC0( trmm3 )
+INSERT_GENTPROT_BASIC( trmm3 )
 
 
 #undef  GENTPROT
@@ -194,6 +194,6 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              ctype*  b, inc_t rs_b, inc_t cs_b  \
      );
 
-INSERT_GENTPROT_BASIC0( trmm )
-INSERT_GENTPROT_BASIC0( trsm )
+INSERT_GENTPROT_BASIC( trmm )
+INSERT_GENTPROT_BASIC( trsm )
 
diff --git a/frame/3/bli_l3_tapi_ex.c b/frame/3/bli_l3_tapi_ex.c
index 130237ee4..237d9d9a8 100644
--- a/frame/3/bli_l3_tapi_ex.c
+++ b/frame/3/bli_l3_tapi_ex.c
@@ -96,7 +96,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( gemm )
+INSERT_GENTFUNC_BASIC( gemm )
 
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, struca ) \
@@ -217,7 +217,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNCR_BASIC0( herk )
+INSERT_GENTFUNCR_BASIC( herk )
 
 
 #undef  GENTFUNCR
@@ -281,7 +281,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNCR_BASIC0( her2k )
+INSERT_GENTFUNCR_BASIC( her2k )
 
 
 #undef  GENTFUNC
@@ -336,7 +336,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( syrk )
+INSERT_GENTFUNC_BASIC( syrk )
 
 
 #undef  GENTFUNC
@@ -399,7 +399,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( syr2k )
+INSERT_GENTFUNC_BASIC( syr2k )
 
 
 #undef  GENTFUNC
@@ -460,7 +460,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( gemmt )
+INSERT_GENTFUNC_BASIC( gemmt )
 
 
 #undef  GENTFUNC
@@ -527,7 +527,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( trmm3 )
+INSERT_GENTFUNC_BASIC( trmm3 )
 
 
 #undef  GENTFUNC
@@ -582,6 +582,6 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( trmm )
-INSERT_GENTFUNC_BASIC0( trsm )
+INSERT_GENTFUNC_BASIC( trmm )
+INSERT_GENTFUNC_BASIC( trsm )
 
diff --git a/frame/3/bli_l3_tapi_ex.h b/frame/3/bli_l3_tapi_ex.h
index d8610dee8..b0288841f 100644
--- a/frame/3/bli_l3_tapi_ex.h
+++ b/frame/3/bli_l3_tapi_ex.h
@@ -57,7 +57,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
        const rntm_t* rntm  \
      );
 
-INSERT_GENTPROT_BASIC0( gemm )
+INSERT_GENTPROT_BASIC( gemm )
 
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
@@ -79,8 +79,8 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
        const rntm_t* rntm  \
      );
 
-INSERT_GENTPROT_BASIC0( hemm )
-INSERT_GENTPROT_BASIC0( symm )
+INSERT_GENTPROT_BASIC( hemm )
+INSERT_GENTPROT_BASIC( symm )
 
 
 #undef  GENTPROTR
@@ -100,7 +100,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
        const rntm_t*  rntm  \
      );
 
-INSERT_GENTPROTR_BASIC0( herk )
+INSERT_GENTPROTR_BASIC( herk )
 
 
 #undef  GENTPROTR
@@ -122,7 +122,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
        const rntm_t*  rntm  \
      );
 
-INSERT_GENTPROTR_BASIC0( her2k )
+INSERT_GENTPROTR_BASIC( her2k )
 
 
 #undef  GENTPROT
@@ -142,7 +142,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
        const rntm_t* rntm  \
      );
 
-INSERT_GENTPROT_BASIC0( syrk )
+INSERT_GENTPROT_BASIC( syrk )
 
 
 #undef  GENTPROT
@@ -164,8 +164,8 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
        const rntm_t* rntm  \
      );
 
-INSERT_GENTPROT_BASIC0( gemmt )
-INSERT_GENTPROT_BASIC0( syr2k )
+INSERT_GENTPROT_BASIC( gemmt )
+INSERT_GENTPROT_BASIC( syr2k )
 
 
 #undef  GENTPROT
@@ -189,7 +189,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
        const rntm_t* rntm  \
      );
 
-INSERT_GENTPROT_BASIC0( trmm3 )
+INSERT_GENTPROT_BASIC( trmm3 )
 
 
 #undef  GENTPROT
@@ -210,6 +210,6 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
        const rntm_t* rntm  \
      );
 
-INSERT_GENTPROT_BASIC0( trmm )
-INSERT_GENTPROT_BASIC0( trsm )
+INSERT_GENTPROT_BASIC( trmm )
+INSERT_GENTPROT_BASIC( trsm )
 
diff --git a/frame/3/bli_l3_ukr_tapi.c b/frame/3/bli_l3_ukr_tapi.c
index 7ea68a9d2..e3dffc1b7 100644
--- a/frame/3/bli_l3_ukr_tapi.c
+++ b/frame/3/bli_l3_ukr_tapi.c
@@ -75,7 +75,7 @@ void PASTEMAC(ch,opname) \
 	); \
 } \
 
-INSERT_GENTFUNC_BASIC2( gemm_ukernel, gemm, BLIS_GEMM_UKR )
+INSERT_GENTFUNC_BASIC( gemm_ukernel, gemm, BLIS_GEMM_UKR )
 
 
 #undef  GENTFUNC
@@ -121,8 +121,8 @@ void PASTEMAC(ch,opname) \
 	); \
 } \
 
-INSERT_GENTFUNC_BASIC2( gemmtrsm_l_ukernel, gemmtrsm, BLIS_GEMMTRSM_L_UKR )
-INSERT_GENTFUNC_BASIC2( gemmtrsm_u_ukernel, gemmtrsm, BLIS_GEMMTRSM_U_UKR )
+INSERT_GENTFUNC_BASIC( gemmtrsm_l_ukernel, gemmtrsm, BLIS_GEMMTRSM_L_UKR )
+INSERT_GENTFUNC_BASIC( gemmtrsm_u_ukernel, gemmtrsm, BLIS_GEMMTRSM_U_UKR )
 
 
 #undef  GENTFUNC
@@ -156,6 +156,6 @@ void PASTEMAC(ch,opname) \
 	); \
 } \
 
-INSERT_GENTFUNC_BASIC2( trsm_l_ukernel, trsm, BLIS_TRSM_L_UKR )
-INSERT_GENTFUNC_BASIC2( trsm_u_ukernel, trsm, BLIS_TRSM_U_UKR )
+INSERT_GENTFUNC_BASIC( trsm_l_ukernel, trsm, BLIS_TRSM_L_UKR )
+INSERT_GENTFUNC_BASIC( trsm_u_ukernel, trsm, BLIS_TRSM_U_UKR )
 
diff --git a/frame/3/bli_l3_ukr_tapi.h b/frame/3/bli_l3_ukr_tapi.h
index 68335c731..66dd2f0e3 100644
--- a/frame/3/bli_l3_ukr_tapi.h
+++ b/frame/3/bli_l3_ukr_tapi.h
@@ -54,7 +54,7 @@ void PASTEMAC(ch,opname) \
        const cntx_t*    cntx  \
      );
 
-INSERT_GENTPROT_BASIC0( gemm_ukernel )
+INSERT_GENTPROT_BASIC( gemm_ukernel )
 
 
 #undef  GENTPROT
@@ -75,8 +75,8 @@ void PASTEMAC(ch,opname) \
        const cntx_t*    cntx  \
      );
 
-INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukernel )
-INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukernel )
+INSERT_GENTPROT_BASIC( gemmtrsm_l_ukernel )
+INSERT_GENTPROT_BASIC( gemmtrsm_u_ukernel )
 
 
 #undef  GENTPROT
@@ -91,6 +91,6 @@ void PASTEMAC(ch,opname) \
        const cntx_t*    cntx  \
      );
 
-INSERT_GENTPROT_BASIC0( trsm_l_ukernel )
-INSERT_GENTPROT_BASIC0( trsm_u_ukernel )
+INSERT_GENTPROT_BASIC( trsm_l_ukernel )
+INSERT_GENTPROT_BASIC( trsm_u_ukernel )
 
diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c
index cc0a633e2..6b413d909 100644
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -69,8 +69,8 @@ BLIS_INLINE void PASTEMAC2(chx,chy,op) \
 	); \
 }
 
-INSERT_GENTFUNC2_BASIC0(xpbys_mxn_fn);
-INSERT_GENTFUNC2_MIXDP0(xpbys_mxn_fn);
+INSERT_GENTFUNC2_BASIC(xpbys_mxn_fn);
+INSERT_GENTFUNC2_MIX_DP(xpbys_mxn_fn);
 
 static xpbys_mxn_vft GENARRAY2_ALL(xpbys_mxn, xpbys_mxn_fn);
 
diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.c b/frame/3/gemm/bli_gemm_md_c2r_ref.c
index 4c77872f4..ada9ed858 100644
--- a/frame/3/gemm/bli_gemm_md_c2r_ref.c
+++ b/frame/3/gemm/bli_gemm_md_c2r_ref.c
@@ -237,6 +237,6 @@ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \
 	} \
 }
 
-INSERT_GENTFUNCCO_BASIC( gemm_md_c2r, BLIS_REF_SUFFIX )
+INSERT_GENTFUNCCO( gemm_md_c2r, BLIS_REF_SUFFIX )
 
 #endif
diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
index 3fcace7f8..7bb3a5e36 100644
--- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
@@ -73,7 +73,7 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0(xpbys_mxn_l_fn);
+INSERT_GENTFUNC_BASIC(xpbys_mxn_l_fn);
 
 static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn);
 
diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2b.c b/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
index b580bcaf8..a4d384629 100644
--- a/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
@@ -73,7 +73,7 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0(xpbys_mxn_l_fn);
+INSERT_GENTFUNC_BASIC(xpbys_mxn_l_fn);
 
 static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn);
 
diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
index 01843b28d..0ea845a20 100644
--- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
@@ -73,7 +73,7 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0(xpbys_mxn_u_fn);
+INSERT_GENTFUNC_BASIC(xpbys_mxn_u_fn);
 
 static xpbys_mxn_u_vft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn);
 
diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2b.c b/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
index 99139b309..e02a5f93b 100644
--- a/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
@@ -73,7 +73,7 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0(xpbys_mxn_u_fn);
+INSERT_GENTFUNC_BASIC(xpbys_mxn_u_fn);
 
 static xpbys_mxn_u_vft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn);
 
diff --git a/frame/base/bli_machval.h b/frame/base/bli_machval.h
index 25177a250..f7453e19f 100644
--- a/frame/base/bli_machval.h
+++ b/frame/base/bli_machval.h
@@ -54,5 +54,5 @@ BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \
        void*     v     \
      );
 
-INSERT_GENTPROTR_BASIC0( machval )
+INSERT_GENTPROTR_BASIC( machval )
 
diff --git a/frame/base/bli_setgetijm.c b/frame/base/bli_setgetijm.c
index d056a2e44..d864b3fff 100644
--- a/frame/base/bli_setgetijm.c
+++ b/frame/base/bli_setgetijm.c
@@ -104,7 +104,7 @@ void PASTEMAC(ch,opname) \
 	PASTEMAC2(z,ch,sets)( ar, ai, *b_ij ); \
 }
 
-INSERT_GENTFUNC_BASIC0( setijm )
+INSERT_GENTFUNC_BASIC( setijm )
 
 // -----------------------------------------------------------------------------
 
@@ -178,5 +178,5 @@ void PASTEMAC(ch,opname) \
 	PASTEMAC2(ch,z,gets)( *b_ij, *ar, *ai ); \
 }
 
-INSERT_GENTFUNC_BASIC0( getijm )
+INSERT_GENTFUNC_BASIC( getijm )
 
diff --git a/frame/base/bli_setgetijm.h b/frame/base/bli_setgetijm.h
index a2db16d11..384fbd69e 100644
--- a/frame/base/bli_setgetijm.h
+++ b/frame/base/bli_setgetijm.h
@@ -53,7 +53,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
        void*  b, inc_t rs, inc_t cs  \
      );
 
-INSERT_GENTPROT_BASIC0( setijm )
+INSERT_GENTPROT_BASIC( setijm )
 
 // -----------------------------------------------------------------------------
 
@@ -78,5 +78,5 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              double* ai  \
      );
 
-INSERT_GENTPROT_BASIC0( getijm )
+INSERT_GENTPROT_BASIC( getijm )
 
diff --git a/frame/base/bli_setgetijv.c b/frame/base/bli_setgetijv.c
index 6cee789c7..ff76647ff 100644
--- a/frame/base/bli_setgetijv.c
+++ b/frame/base/bli_setgetijv.c
@@ -97,7 +97,7 @@ void PASTEMAC(ch,opname) \
 	PASTEMAC2(z,ch,sets)( ar, ai, *x_i ); \
 }
 
-INSERT_GENTFUNC_BASIC0( setijv )
+INSERT_GENTFUNC_BASIC( setijv )
 
 // -----------------------------------------------------------------------------
 
@@ -164,5 +164,5 @@ void PASTEMAC(ch,opname) \
 	PASTEMAC2(ch,z,gets)( *x_i, *ar, *ai ); \
 }
 
-INSERT_GENTFUNC_BASIC0( getijv )
+INSERT_GENTFUNC_BASIC( getijv )
 
diff --git a/frame/base/bli_setgetijv.h b/frame/base/bli_setgetijv.h
index a9badce4d..00710f320 100644
--- a/frame/base/bli_setgetijv.h
+++ b/frame/base/bli_setgetijv.h
@@ -51,7 +51,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
        void*  x, inc_t incx  \
      );
 
-INSERT_GENTPROT_BASIC0( setijv )
+INSERT_GENTPROT_BASIC( setijv )
 
 // -----------------------------------------------------------------------------
 
@@ -74,5 +74,5 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              double* ai  \
      );
 
-INSERT_GENTPROT_BASIC0( getijv )
+INSERT_GENTPROT_BASIC( getijv )
 
diff --git a/frame/base/cast/bli_castm.c b/frame/base/cast/bli_castm.c
index 57dd48bbc..5e0792910 100644
--- a/frame/base/cast/bli_castm.c
+++ b/frame/base/cast/bli_castm.c
@@ -205,8 +205,8 @@ void PASTEMAC2(cha,chb,opname) \
 	} \
 }
 
-INSERT_GENTFUNC2_BASIC0( castm )
-INSERT_GENTFUNC2_MIXDP0( castm )
+INSERT_GENTFUNC2_BASIC( castm )
+INSERT_GENTFUNC2_MIX_DP( castm )
 
 // -----------------------------------------------------------------------------
 
diff --git a/frame/base/cast/bli_castm.h b/frame/base/cast/bli_castm.h
index c06d1241a..a36d15e99 100644
--- a/frame/base/cast/bli_castm.h
+++ b/frame/base/cast/bli_castm.h
@@ -58,8 +58,8 @@ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \
              void*   b, inc_t rs_b, inc_t cs_b  \
      );
 
-INSERT_GENTPROT2_BASIC0( castm )
-INSERT_GENTPROT2_MIXDP0( castm )
+INSERT_GENTPROT2_BASIC( castm )
+INSERT_GENTPROT2_MIX_DP( castm )
 
 //
 // Prototype object-based _check() function.
diff --git a/frame/base/cast/bli_castnzm.c b/frame/base/cast/bli_castnzm.c
index 071233169..e6e4a6cd9 100644
--- a/frame/base/cast/bli_castnzm.c
+++ b/frame/base/cast/bli_castnzm.c
@@ -205,8 +205,8 @@ void PASTEMAC2(cha,chb,opname) \
 	} \
 }
 
-INSERT_GENTFUNC2_BASIC0( castnzm )
-INSERT_GENTFUNC2_MIXDP0( castnzm )
+INSERT_GENTFUNC2_BASIC( castnzm )
+INSERT_GENTFUNC2_MIX_DP( castnzm )
 
 // -----------------------------------------------------------------------------
 
diff --git a/frame/base/cast/bli_castnzm.h b/frame/base/cast/bli_castnzm.h
index 03860fe40..a8a852720 100644
--- a/frame/base/cast/bli_castnzm.h
+++ b/frame/base/cast/bli_castnzm.h
@@ -58,8 +58,8 @@ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \
              void*   b, inc_t rs_b, inc_t cs_b  \
      );
 
-INSERT_GENTPROT2_BASIC0( castnzm )
-INSERT_GENTPROT2_MIXDP0( castnzm )
+INSERT_GENTPROT2_BASIC( castnzm )
+INSERT_GENTPROT2_MIX_DP( castnzm )
 
 //
 // Prototype object-based _check() function.
diff --git a/frame/base/cast/bli_castv.c b/frame/base/cast/bli_castv.c
index c46a2798c..c057df82a 100644
--- a/frame/base/cast/bli_castv.c
+++ b/frame/base/cast/bli_castv.c
@@ -159,8 +159,8 @@ void PASTEMAC2(chx,chy,opname) \
 	} \
 }
 
-INSERT_GENTFUNC2_BASIC0( castv )
-INSERT_GENTFUNC2_MIXDP0( castv )
+INSERT_GENTFUNC2_BASIC( castv )
+INSERT_GENTFUNC2_MIX_DP( castv )
 
 // -----------------------------------------------------------------------------
 
diff --git a/frame/base/cast/bli_castv.h b/frame/base/cast/bli_castv.h
index 85d87d911..99f89fc24 100644
--- a/frame/base/cast/bli_castv.h
+++ b/frame/base/cast/bli_castv.h
@@ -57,8 +57,8 @@ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \
              void*  y, inc_t incy  \
      );
 
-INSERT_GENTPROT2_BASIC0( castv )
-INSERT_GENTPROT2_MIXDP0( castv )
+INSERT_GENTPROT2_BASIC( castv )
+INSERT_GENTPROT2_MIX_DP( castv )
 
 //
 // Prototype object-based _check() function.
diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h
index e863f7dcf..a8ce2eb2c 100644
--- a/frame/include/bli_gentfunc_macro_defs.h
+++ b/frame/include/bli_gentfunc_macro_defs.h
@@ -145,215 +145,87 @@ GENTFUNCSCAL( dcomplex, double,   z, d, blasname, blisname )
 
 // -- Basic one-operand macro --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
-\
-GENTFUNC( float,    s, tfuncname ) \
-GENTFUNC( double,   d, tfuncname ) \
-GENTFUNC( scomplex, c, tfuncname ) \
-GENTFUNC( dcomplex, z, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
-\
-GENTFUNC( float,    s, tfuncname, varname ) \
-GENTFUNC( double,   d, tfuncname, varname ) \
-GENTFUNC( scomplex, c, tfuncname, varname ) \
-GENTFUNC( dcomplex, z, tfuncname, varname )
-
-// -- (two auxiliary arguments) --
-
-#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
+#define INSERT_GENTFUNC_BASIC( ... ) \
 \
-GENTFUNC( float,    s, tfuncname, varname1, varname2 ) \
-GENTFUNC( double,   d, tfuncname, varname1, varname2 ) \
-GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \
-GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 )
-
-// -- (three auxiliary arguments) --
-
-#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
-\
-GENTFUNC( float,    s, tfuncname, varname1, varname2, varname3 ) \
-GENTFUNC( double,   d, tfuncname, varname1, varname2, varname3 ) \
-GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
-GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 )
-
-// -- (four auxiliary arguments) --
-
-#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
-\
-GENTFUNC( float,    s, tfuncname, varname1, varname2, varname3, varname4 ) \
-GENTFUNC( double,   d, tfuncname, varname1, varname2, varname3, varname4 ) \
-GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
-GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )
+GENTFUNC( float,    s, __VA_ARGS__ ) \
+GENTFUNC( double,   d, __VA_ARGS__ ) \
+GENTFUNC( scomplex, c, __VA_ARGS__ ) \
+GENTFUNC( dcomplex, z, __VA_ARGS__ )
 
 
 // -- Basic one-operand with real projection --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
+#define INSERT_GENTFUNCR_BASIC( ... ) \
 \
-GENTFUNCR( float,    float,  s, s, tfuncname ) \
-GENTFUNCR( double,   double, d, d, tfuncname ) \
-GENTFUNCR( scomplex, float,  c, s, tfuncname ) \
-GENTFUNCR( dcomplex, double, z, d, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
-\
-GENTFUNCR( float,    float,  s, s, tfuncname, varname ) \
-GENTFUNCR( double,   double, d, d, tfuncname, varname ) \
-GENTFUNCR( scomplex, float,  c, s, tfuncname, varname ) \
-GENTFUNCR( dcomplex, double, z, d, tfuncname, varname )
-
-// -- (two auxiliary arguments) --
-
-#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
-\
-GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2 ) \
-GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2 ) \
-GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2 ) \
-GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 )
-
-// -- (three auxiliary arguments) --
-
-#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3  ) \
-\
-GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2, varname3 ) \
-GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2, varname3 ) \
-GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3 ) \
-GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )
-
-// -- (four auxiliary arguments) --
-
-#define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4  ) \
-\
-GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2, varname3, varname4 ) \
-GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \
-GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
-GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )
+GENTFUNCR( float,    float,  s, s, __VA_ARGS__ ) \
+GENTFUNCR( double,   double, d, d, __VA_ARGS__ ) \
+GENTFUNCR( scomplex, float,  c, s, __VA_ARGS__ ) \
+GENTFUNCR( dcomplex, double, z, d, __VA_ARGS__ )
 
 
 // -- Basic one-operand macro with real domain only --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \
-\
-GENTFUNCRO( float,  s, tfuncname ) \
-GENTFUNCRO( double, d, tfuncname ) \
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \
-\
-GENTFUNCRO( float,  s, tfuncname, varname ) \
-GENTFUNCRO( double, d, tfuncname, varname ) \
-
-
-
-// -- Basic one-operand macro with complex domain only and real projection --
-
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \
+#define INSERT_GENTFUNCRO_BASIC( ... ) \
 \
-GENTFUNCCO( scomplex, float,  c, s, tfuncname ) \
-GENTFUNCCO( dcomplex, double, z, d, tfuncname )
+GENTFUNCRO( float,  s, __VA_ARGS__ ) \
+GENTFUNCRO( double, d, __VA_ARGS__ )
 
-// -- (one auxiliary argument) --
+// -- Basic one-operand macro with complex domain only --
 
-#define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \
+#define INSERT_GENTFUNCCO_BASIC( ... ) \
 \
-GENTFUNCCO( scomplex, float,  c, s, tfuncname, varname ) \
-GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname )
+GENTFUNCCO( scomplex, c, __VA_ARGS__ ) \
+GENTFUNCCO( dcomplex, z, __VA_ARGS__ )
 
-// -- (two auxiliary arguments) --
+// -- Basic one-operand macro with real domain only and complex projection --
 
-#define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \
+#define INSERT_GENTFUNCRO( ... ) \
 \
-GENTFUNCCO( scomplex, float,  c, s, tfuncname, varname1, varname2 ) \
-GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )
+GENTFUNCRO( float,  scomplex, s, c, __VA_ARGS__ ) \
+GENTFUNCRO( double, dcomplex, d, z, __VA_ARGS__ )
 
-// -- (three auxiliary arguments) --
-
-#define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \
-\
-GENTFUNCCO( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3 ) \
-GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )
-
-// -- (four auxiliary arguments) --
+// -- Basic one-operand macro with complex domain only and real projection --
 
-#define INSERT_GENTFUNCCO_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
+#define INSERT_GENTFUNCCO( ... ) \
 \
-GENTFUNCCO( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
-GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )
+GENTFUNCCO( scomplex, float,  c, s, __VA_ARGS__ ) \
+GENTFUNCCO( dcomplex, double, z, d, __VA_ARGS__ )
 
 
 // -- Basic one-operand macro with integer instance --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \
+#define INSERT_GENTFUNC_BASIC_I( ... ) \
 \
-GENTFUNC( float,    s, tfuncname ) \
-GENTFUNC( double,   d, tfuncname ) \
-GENTFUNC( scomplex, c, tfuncname ) \
-GENTFUNC( dcomplex, z, tfuncname ) \
-GENTFUNC( gint_t,   i, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \
-\
-GENTFUNC( float,    s, tfuncname, varname ) \
-GENTFUNC( double,   d, tfuncname, varname ) \
-GENTFUNC( scomplex, c, tfuncname, varname ) \
-GENTFUNC( dcomplex, z, tfuncname, varname ) \
-GENTFUNC( gint_t,   i, tfuncname, varname )
+GENTFUNC( float,    s, __VA_ARGS__ ) \
+GENTFUNC( double,   d, __VA_ARGS__ ) \
+GENTFUNC( scomplex, c, __VA_ARGS__ ) \
+GENTFUNC( dcomplex, z, __VA_ARGS__ ) \
+GENTFUNC( gint_t,   i, __VA_ARGS__ )
 
 
 // -- Basic one-operand with integer projection --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNCI_BASIC0( tfuncname ) \
-\
-GENTFUNCI( float,    gint_t, s, i, tfuncname ) \
-GENTFUNCI( double,   gint_t, d, i, tfuncname ) \
-GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \
-GENTFUNCI( dcomplex, gint_t, z, i, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \
+#define INSERT_GENTFUNCI_BASIC( ... ) \
 \
-GENTFUNCI( float,    gint_t, s, i, tfuncname, varname ) \
-GENTFUNCI( double,   gint_t, d, i, tfuncname, varname ) \
-GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \
-GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname )
+GENTFUNCI( float,    gint_t, s, i, __VA_ARGS__ ) \
+GENTFUNCI( double,   gint_t, d, i, __VA_ARGS__ ) \
+GENTFUNCI( scomplex, gint_t, c, i, __VA_ARGS__ ) \
+GENTFUNCI( dcomplex, gint_t, z, i, __VA_ARGS__ )
 
 
 // -- Basic one-operand with real and integer projections --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \
+#define INSERT_GENTFUNCRI_BASIC( ... ) \
 \
-GENTFUNCRI( float,    float,  gint_t, s, s, i, tfuncname ) \
-GENTFUNCRI( double,   double, gint_t, d, d, i, tfuncname ) \
-GENTFUNCRI( scomplex, float,  gint_t, c, s, i, tfuncname ) \
-GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname )
+GENTFUNCRI( float,    float,  gint_t, s, s, i, __VA_ARGS__ ) \
+GENTFUNCRI( double,   double, gint_t, d, d, i, __VA_ARGS__ ) \
+GENTFUNCRI( scomplex, float,  gint_t, c, s, i, __VA_ARGS__ ) \
+GENTFUNCRI( dcomplex, double, gint_t, z, d, i, __VA_ARGS__ )
 
 
@@ -363,254 +235,127 @@ GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname )
 
 // -- Basic two-operand macro --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNC2_BASIC0( tfuncname ) \
+#define INSERT_GENTFUNC2_BASIC( ... ) \
 \
-GENTFUNC2( float,    float,    s, s, tfuncname ) \
-GENTFUNC2( double,   double,   d, d, tfuncname ) \
-GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \
-GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \
-\
-GENTFUNC2( float,    float,    s, s, tfuncname, varname ) \
-GENTFUNC2( double,   double,   d, d, tfuncname, varname ) \
-GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \
-GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname )
+GENTFUNC2( float,    float,    s, s, __VA_ARGS__ ) \
+GENTFUNC2( double,   double,   d, d, __VA_ARGS__ ) \
+GENTFUNC2( scomplex, scomplex, c, c, __VA_ARGS__ ) \
+GENTFUNC2( dcomplex, dcomplex, z, z, __VA_ARGS__ )
 
 
 // -- Mixed domain two-operand macro --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \
-\
-GENTFUNC2( float,    scomplex, s, c, tfuncname ) \
-GENTFUNC2( scomplex, float,    c, s, tfuncname ) \
-\
-GENTFUNC2( double,   dcomplex, d, z, tfuncname ) \
-GENTFUNC2( dcomplex, double,   z, d, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \
+#define INSERT_GENTFUNC2_MIX_D( ... ) \
 \
-GENTFUNC2( float,    scomplex, s, c, tfuncname, varname ) \
-GENTFUNC2( scomplex, float,    c, s, tfuncname, varname ) \
+GENTFUNC2( float,    scomplex, s, c, __VA_ARGS__ ) \
+GENTFUNC2( scomplex, float,    c, s, __VA_ARGS__ ) \
 \
-GENTFUNC2( double,   dcomplex, d, z, tfuncname, varname ) \
-GENTFUNC2( dcomplex, double,   z, d, tfuncname, varname )
+GENTFUNC2( double,   dcomplex, d, z, __VA_ARGS__ ) \
+GENTFUNC2( dcomplex, double,   z, d, __VA_ARGS__ )
 
 
 // -- Mixed precision two-operand macro --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \
-\
-GENTFUNC2( float,    double,   s, d, tfuncname ) \
-GENTFUNC2( float,    dcomplex, s, z, tfuncname ) \
-\
-GENTFUNC2( double,   float,    d, s, tfuncname ) \
-GENTFUNC2( double,   scomplex, d, c, tfuncname ) \
-\
-GENTFUNC2( scomplex, double,   c, d, tfuncname ) \
-GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
-\
-GENTFUNC2( dcomplex, float,    z, s, tfuncname ) \
-GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) \
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \
+#define INSERT_GENTFUNC2_MIX_P( ... ) \
 \
-GENTFUNC2( float,    double,   s, d, tfuncname, varname ) \
-GENTFUNC2( float,    dcomplex, s, z, tfuncname, varname ) \
+GENTFUNC2( float,    double,   s, d, __VA_ARGS__ ) \
+GENTFUNC2( float,    dcomplex, s, z, __VA_ARGS__ ) \
 \
-GENTFUNC2( double,   float,    d, s, tfuncname, varname ) \
-GENTFUNC2( double,   scomplex, d, c, tfuncname, varname ) \
+GENTFUNC2( double,   float,    d, s, __VA_ARGS__ ) \
+GENTFUNC2( double,   scomplex, d, c, __VA_ARGS__ ) \
 \
-GENTFUNC2( scomplex, double,   c, d, tfuncname, varname ) \
-GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
+GENTFUNC2( scomplex, double,   c, d, __VA_ARGS__ ) \
+GENTFUNC2( scomplex, dcomplex, c, z, __VA_ARGS__ ) \
 \
-GENTFUNC2( dcomplex, float,    z, s, tfuncname, varname ) \
-GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) \
+GENTFUNC2( dcomplex, float,    z, s, __VA_ARGS__ ) \
+GENTFUNC2( dcomplex, scomplex, z, c, __VA_ARGS__ )
 
 
 // -- Mixed domain/precision (all) two-operand macro --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNC2_MIXDP0( tfuncname ) \
-\
-GENTFUNC2( float,    double,   s, d, tfuncname ) \
-GENTFUNC2( float,    scomplex, s, c, tfuncname ) \
-GENTFUNC2( float,    dcomplex, s, z, tfuncname ) \
-\
-GENTFUNC2( double,   float,    d, s, tfuncname ) \
-GENTFUNC2( double,   scomplex, d, c, tfuncname ) \
-GENTFUNC2( double,   dcomplex, d, z, tfuncname ) \
-\
-GENTFUNC2( scomplex, float,    c, s, tfuncname ) \
-GENTFUNC2( scomplex, double,   c, d, tfuncname ) \
-GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
-\
-GENTFUNC2( dcomplex, float,    z, s, tfuncname ) \
-GENTFUNC2( dcomplex, double,   z, d, tfuncname ) \
-GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )
-
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \
+#define INSERT_GENTFUNC2_MIX_DP( ... ) \
 \
-GENTFUNC2( float,    double,   s, d, tfuncname, varname ) \
-GENTFUNC2( float,    scomplex, s, c, tfuncname, varname ) \
-GENTFUNC2( float,    dcomplex, s, z, tfuncname, varname ) \
+GENTFUNC2( float,    double,   s, d, __VA_ARGS__ ) \
+GENTFUNC2( float,    scomplex, s, c, __VA_ARGS__ ) \
+GENTFUNC2( float,    dcomplex, s, z, __VA_ARGS__ ) \
 \
-GENTFUNC2( double,   float,    d, s, tfuncname, varname ) \
-GENTFUNC2( double,   scomplex, d, c, tfuncname, varname ) \
-GENTFUNC2( double,   dcomplex, d, z, tfuncname, varname ) \
+GENTFUNC2( double,   float,    d, s, __VA_ARGS__ ) \
+GENTFUNC2( double,   scomplex, d, c, __VA_ARGS__ ) \
+GENTFUNC2( double,   dcomplex, d, z, __VA_ARGS__ ) \
 \
-GENTFUNC2( scomplex, float,    c, s, tfuncname, varname ) \
-GENTFUNC2( scomplex, double,   c, d, tfuncname, varname ) \
-GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
+GENTFUNC2( scomplex, float,    c, s, __VA_ARGS__ ) \
+GENTFUNC2( scomplex, double,   c, d, __VA_ARGS__ ) \
+GENTFUNC2( scomplex, dcomplex, c, z, __VA_ARGS__ ) \
 \
-GENTFUNC2( dcomplex, float,    z, s, tfuncname, varname ) \
-GENTFUNC2( dcomplex, double,   z, d, tfuncname, varname ) \
-GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )
+GENTFUNC2( dcomplex, float,    z, s, __VA_ARGS__ ) \
+GENTFUNC2( dcomplex, double,   z, d, __VA_ARGS__ ) \
+GENTFUNC2( dcomplex, scomplex, z, c, __VA_ARGS__ )
 
 
 // -- Basic two-operand with real projection of second operand --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \
-\
-GENTFUNC2R( float,    float,    float,    s, s, s, tfuncname ) \
-GENTFUNC2R( double,   double,   double,   d, d, d, tfuncname ) \
-GENTFUNC2R( scomplex, scomplex, float,    c, c, s, tfuncname ) \
-GENTFUNC2R( dcomplex, dcomplex, double,   z, z, d, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \
+#define INSERT_GENTFUNC2R_BASIC( ... ) \
 \
-GENTFUNC2R( float,    float,    float,    s, s, s, tfuncname, varname ) \
-GENTFUNC2R( double,   double,   double,   d, d, d, tfuncname, varname ) \
-GENTFUNC2R( scomplex, scomplex, float,    c, c, s, tfuncname, varname ) \
-GENTFUNC2R( dcomplex, dcomplex, double,   z, z, d, tfuncname, varname )
+GENTFUNC2R( float,    float,    float,    s, s, s, __VA_ARGS__ ) \
+GENTFUNC2R( double,   double,   double,   d, d, d, __VA_ARGS__ ) \
+GENTFUNC2R( scomplex, scomplex, float,    c, c, s, __VA_ARGS__ ) \
+GENTFUNC2R( dcomplex, dcomplex, double,   z, z, d, __VA_ARGS__ )
 
 
 // -- Mixed domain two-operand with real projection of second operand --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \
+#define INSERT_GENTFUNC2R_MIX_D( ... ) \
 \
-GENTFUNC2R( float,    scomplex, float,    s, c, s, tfuncname ) \
-GENTFUNC2R( scomplex, float,    float,    c, s, s, tfuncname ) \
+GENTFUNC2R( float,    scomplex, float,    s, c, s, __VA_ARGS__ ) \
+GENTFUNC2R( scomplex, float,    float,    c, s, s, __VA_ARGS__ ) \
 \
-GENTFUNC2R( double,   dcomplex, double,   d, z, d, tfuncname ) \
-GENTFUNC2R( dcomplex, double,   double,   z, d, d, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \
-\
-GENTFUNC2R( float,    scomplex, float,    s, c, s, tfuncname, varname ) \
-GENTFUNC2R( scomplex, float,    float,    c, s, s, tfuncname, varname ) \
-\
-GENTFUNC2R( double,   dcomplex, double,   d, z, d, tfuncname, varname ) \
-GENTFUNC2R( dcomplex, double,   double,   z, d, d, tfuncname, varname )
+GENTFUNC2R( double,   dcomplex, double,   d, z, d, __VA_ARGS__ ) \
+GENTFUNC2R( dcomplex, double,   double,   z, d, d, __VA_ARGS__ )
 
 
 // -- Mixed precision two-operand with real projection of second operand --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \
+#define INSERT_GENTFUNC2R_MIX_P( ... ) \
 \
-GENTFUNC2R( float,    double,   double,   s, d, d, tfuncname ) \
-GENTFUNC2R( float,    dcomplex, double,   s, z, d, tfuncname ) \
+GENTFUNC2R( float,    double,   double,   s, d, d, __VA_ARGS__ ) \
+GENTFUNC2R( float,    dcomplex, double,   s, z, d, __VA_ARGS__ ) \
 \
-GENTFUNC2R( double,   float,    float,    d, s, s, tfuncname ) \
-GENTFUNC2R( double,   scomplex, float,    d, c, s, tfuncname ) \
+GENTFUNC2R( double,   float,    float,    d, s, s, __VA_ARGS__ ) \
+GENTFUNC2R( double,   scomplex, float,    d, c, s, __VA_ARGS__ ) \
 \
-GENTFUNC2R( scomplex, double,   double,   c, d, d, tfuncname ) \
-GENTFUNC2R( scomplex, dcomplex, double,   c, z, d, tfuncname ) \
+GENTFUNC2R( scomplex, double,   double,   c, d, d, __VA_ARGS__ ) \
+GENTFUNC2R( scomplex, dcomplex, double,   c, z, d, __VA_ARGS__ ) \
 \
-GENTFUNC2R( dcomplex, float,    float,    z, s, s, tfuncname ) \
-GENTFUNC2R( dcomplex, scomplex, float,    z, c, s, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \
-\
-GENTFUNC2R( float,    double,   double,   s, d, d, tfuncname, varname ) \
-GENTFUNC2R( float,    dcomplex, double,   s, z, d, tfuncname, varname ) \
-\
-GENTFUNC2R( double,   float,    float,    d, s, s, tfuncname, varname ) \
-GENTFUNC2R( double,   scomplex, float,    d, c, s, tfuncname, varname ) \
-\
-GENTFUNC2R( scomplex, double,   double,   c, d, d, tfuncname, varname ) \
-GENTFUNC2R( scomplex, dcomplex, double,   c, z, d, tfuncname, varname ) \
-\
-GENTFUNC2R( dcomplex, float,    float,    z, s, s, tfuncname, varname ) \
-GENTFUNC2R( dcomplex, scomplex, float,    z, c, s, tfuncname, varname )
+GENTFUNC2R( dcomplex, float,    float,    z, s, s, __VA_ARGS__ ) \
+GENTFUNC2R( dcomplex, scomplex, float,    z, c, s, __VA_ARGS__ )
 
 
 // -- Mixed domain/precision (all) two-operand macro with real projection of second operand --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \
+#define INSERT_GENTFUNC2R_MIX_DP( ... ) \
 \
-GENTFUNC2R( float,    double,   double,   s, d, d, tfuncname ) \
-GENTFUNC2R( float,    scomplex, float,    s, c, s, tfuncname ) \
-GENTFUNC2R( float,    dcomplex, double,   s, z, d, tfuncname ) \
+GENTFUNC2R( float,    double,   double,   s, d, d, __VA_ARGS__ ) \
+GENTFUNC2R( float,    scomplex, float,    s, c, s, __VA_ARGS__ ) \
+GENTFUNC2R( float,    dcomplex, double,   s, z, d, __VA_ARGS__ ) \
 \
-GENTFUNC2R( double,   float,    float,    d, s, s, tfuncname ) \
-GENTFUNC2R( double,   scomplex, float,    d, c, s, tfuncname ) \
-GENTFUNC2R( double,   dcomplex, double,   d, z, d, tfuncname ) \
+GENTFUNC2R( double,   float,    float,    d, s, s, __VA_ARGS__ ) \
+GENTFUNC2R( double,   scomplex, float,    d, c, s, __VA_ARGS__ ) \
+GENTFUNC2R( double,   dcomplex, double,   d, z, d, __VA_ARGS__ ) \
 \
-GENTFUNC2R( scomplex, float,    float,    c, s, s, tfuncname ) \
-GENTFUNC2R( scomplex, double,   double,   c, d, d, tfuncname ) \
-GENTFUNC2R( scomplex, dcomplex, double,   c, z, d, tfuncname ) \
+GENTFUNC2R( scomplex, float,    float,    c, s, s, __VA_ARGS__ ) \
+GENTFUNC2R( scomplex, double,   double,   c, d, d, __VA_ARGS__ ) \
+GENTFUNC2R( scomplex, dcomplex, double,   c, z, d, __VA_ARGS__ ) \
 \
-GENTFUNC2R( dcomplex, float,    float,    z, s, s, tfuncname ) \
-GENTFUNC2R( dcomplex, double,   double,   z, d, d, tfuncname ) \
-GENTFUNC2R( dcomplex, scomplex, float,    z, c, s, tfuncname ) \
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \
-\
-GENTFUNC2R( float,    double,   double,   s, d, d, tfuncname, varname ) \
-GENTFUNC2R( float,    scomplex, float,    s, c, s, tfuncname, varname ) \
-GENTFUNC2R( float,    dcomplex, double,   s, z, d, tfuncname, varname ) \
-\
-GENTFUNC2R( double,   float,    float,    d, s, s, tfuncname, varname ) \
-GENTFUNC2R( double,   scomplex, float,    d, c, s, tfuncname, varname ) \
-GENTFUNC2R( double,   dcomplex, double,   d, z, d, tfuncname, varname ) \
-\
-GENTFUNC2R( scomplex, float,    float,    c, s, s, tfuncname, varname ) \
-GENTFUNC2R( scomplex, double,   double,   c, d, d, tfuncname, varname ) \
-GENTFUNC2R( scomplex, dcomplex, double,   c, z, d, tfuncname, varname ) \
-\
-GENTFUNC2R( dcomplex, float,    float,    z, s, s, tfuncname, varname ) \
-GENTFUNC2R( dcomplex, double,   double,   z, d, d, tfuncname, varname ) \
-GENTFUNC2R( dcomplex, scomplex, float,    z, c, s, tfuncname, varname ) \
+GENTFUNC2R( dcomplex, float,    float,    z, s, s, __VA_ARGS__ ) \
+GENTFUNC2R( dcomplex, double,   double,   z, d, d, __VA_ARGS__ ) \
+GENTFUNC2R( dcomplex, scomplex, float,    z, c, s, __VA_ARGS__ )
 
 
@@ -620,625 +365,213 @@ GENTFUNC2R( dcomplex, scomplex, float,    z, c, s, tfuncname, varname ) \
 
 // -- Basic three-operand macro --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNC3_BASIC0( tfuncname ) \
+#define INSERT_GENTFUNC3_BASIC( ... ) \
 \
-GENTFUNC3( float,    float,    float,    s, s, s, tfuncname ) \
-GENTFUNC3( double,   double,   double,   d, d, d, tfuncname ) \
-GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \
-GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \
-\
-GENTFUNC3( float,    float,    float,    s, s, s, tfuncname, varname ) \
-GENTFUNC3( double,   double,   double,   d, d, d, tfuncname, varname ) \
-GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \
-GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname )
-
-// -- (two auxiliary arguments) --
-
-#define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \
-\
-GENTFUNC3( float,    float,    float,    s, s, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3( double,   double,   double,   d, d, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 )
+GENTFUNC3( float,    float,    float,    s, s, s, __VA_ARGS__ ) \
+GENTFUNC3( double,   double,   double,   d, d, d, __VA_ARGS__ ) \
+GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, __VA_ARGS__ ) \
+GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, __VA_ARGS__ )
 
 
 // -- Mixed domain three-operand macro --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \
+#define INSERT_GENTFUNC3_MIX_D( ... ) \
 \
-GENTFUNC3( float,    float,    scomplex, s, s, c, tfuncname ) \
-GENTFUNC3( float,    scomplex, float,    s, c, s, tfuncname ) \
-GENTFUNC3( float,    scomplex, scomplex, s, c, c, tfuncname ) \
+GENTFUNC3( float,    float,    scomplex, s, s, c, __VA_ARGS__ ) \
+GENTFUNC3( float,    scomplex, float,    s, c, s, __VA_ARGS__ ) \
+GENTFUNC3( float,    scomplex, scomplex, s, c, c, __VA_ARGS__ ) \
 \
-GENTFUNC3( double,   double,   dcomplex, d, d, z, tfuncname ) \
-GENTFUNC3( double,   dcomplex, double,   d, z, d, tfuncname ) \
-GENTFUNC3( double,   dcomplex, dcomplex, d, z, z, tfuncname ) \
+GENTFUNC3( double,   double,   dcomplex, d, d, z, __VA_ARGS__ ) \
+GENTFUNC3( double,   dcomplex, double,   d, z, d, __VA_ARGS__ ) \
+GENTFUNC3( double,   dcomplex, dcomplex, d, z, z, __VA_ARGS__ ) \
 \
-GENTFUNC3( scomplex, float,    float,    c, s, s, tfuncname ) \
-GENTFUNC3( scomplex, float,    scomplex, c, s, c, tfuncname ) \
-GENTFUNC3( scomplex, scomplex, float,    c, c, s, tfuncname ) \
+GENTFUNC3( scomplex, float,    float,    c, s, s, __VA_ARGS__ ) \
+GENTFUNC3( scomplex, float,    scomplex, c, s, c, __VA_ARGS__ ) \
+GENTFUNC3( scomplex, scomplex, float,    c, c, s, __VA_ARGS__ ) \
 \
-GENTFUNC3( dcomplex, double,   double,   z, d, d, tfuncname ) \
-GENTFUNC3( dcomplex, double,   dcomplex, z, d, z, tfuncname ) \
-GENTFUNC3( dcomplex, dcomplex, double,   z, z, d, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \
-\
-GENTFUNC3( float,    float,    scomplex, s, s, c, tfuncname, varname ) \
-GENTFUNC3( float,    scomplex, float,    s, c, s, tfuncname, varname ) \
-GENTFUNC3( float,    scomplex, scomplex, s, c, c, tfuncname, varname ) \
-\
-GENTFUNC3( double,   double,   dcomplex, d, d, z, tfuncname, varname ) \
-GENTFUNC3( double,   dcomplex, double,   d, z, d, tfuncname, varname ) \
-GENTFUNC3( double,   dcomplex, dcomplex, d, z, z, tfuncname, varname ) \
-\
-GENTFUNC3( scomplex, float,    float,    c, s, s, tfuncname, varname ) \
-GENTFUNC3( scomplex, float,    scomplex, c, s, c, tfuncname, varname ) \
-GENTFUNC3( scomplex, scomplex, float,    c, c, s, tfuncname, varname ) \
-\
-GENTFUNC3( dcomplex, double,   double,   z, d, d, tfuncname, varname ) \
-GENTFUNC3( dcomplex, double,   dcomplex, z, d, z, tfuncname, varname ) \
-GENTFUNC3( dcomplex, dcomplex, double,   z, z, d, tfuncname, varname )
-
-// -- (two auxiliary arguments) --
-
-#define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \
-\
-GENTFUNC3( float,    float,    scomplex, s, s, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3( float,    scomplex, float,    s, c, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3( float,    scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \
-\
-GENTFUNC3( double,   double,   dcomplex, d, d, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3( double,   dcomplex, double,   d, z, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3( double,   dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \
-\
-GENTFUNC3( scomplex, float,    float,    c, s, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3( scomplex, float,    scomplex, c, s, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3( scomplex, scomplex, float,    c, c, s, tfuncname, varname1, varname2 ) \
-\
-GENTFUNC3( dcomplex, double,   double,   z, d, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3( dcomplex, double,   dcomplex, z, d, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3( dcomplex, dcomplex, double,   z, z, d, tfuncname, varname1, varname2 )
+GENTFUNC3( dcomplex, double,   double,   z, d, d, __VA_ARGS__ ) \
+GENTFUNC3( dcomplex, double,   dcomplex, z, d, z, __VA_ARGS__ ) \
+GENTFUNC3( dcomplex, dcomplex, double,   z, z, d, __VA_ARGS__ )
 
 
 // -- Mixed precision three-operand macro --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \
-\
-GENTFUNC3( float,    float,    double,   s, s, d, tfuncname ) \
-GENTFUNC3( float,    float,    dcomplex, s, s, z, tfuncname ) \
-\
-GENTFUNC3( float,    double,   float,    s, d, s, tfuncname ) \
-GENTFUNC3( float,    double,   double,   s, d, d, tfuncname ) \
-GENTFUNC3( float,    double,   scomplex, s, d, c, tfuncname ) \
-GENTFUNC3( float,    double,   dcomplex, s, d, z, tfuncname ) \
-\
-GENTFUNC3( float,    scomplex, double,   s, c, d, tfuncname ) \
-GENTFUNC3( float,    scomplex, dcomplex, s, c, z, tfuncname ) \
-\
-GENTFUNC3( float,    dcomplex, float,    s, z, s, tfuncname ) \
-GENTFUNC3( float,    dcomplex, double,   s, z, d, tfuncname ) \
-GENTFUNC3( float,    dcomplex, scomplex, s, z, c, tfuncname ) \
-GENTFUNC3( float,    dcomplex, dcomplex, s, z, z, tfuncname ) \
-\
-\
-GENTFUNC3( double,   float,    float,    d, s, s, tfuncname ) \
-GENTFUNC3( double,   float,    double,   d, s, d, tfuncname ) \
-GENTFUNC3( double,   float,    scomplex, d, s, c, tfuncname ) \
-GENTFUNC3( double,   float,    dcomplex, d, s, z, tfuncname ) \
-\
-GENTFUNC3( double,   double,   float,    d, d, s, tfuncname ) \
-GENTFUNC3( double,   double,   scomplex, d, d, c, tfuncname ) \
-\
-GENTFUNC3( double,   scomplex, float,    d, c, s, tfuncname ) \
-GENTFUNC3( double,   scomplex, double,   d, c, d, tfuncname ) \
-GENTFUNC3( double,   scomplex, scomplex, d, c, c, tfuncname ) \
-GENTFUNC3( double,   scomplex, dcomplex, d, c, z, tfuncname ) \
-\
-GENTFUNC3( double,   dcomplex, float,    d, z, s, tfuncname ) \
-GENTFUNC3( double,   dcomplex, scomplex, d, z, c, tfuncname ) \
-\
-\
-GENTFUNC3( scomplex, float,    double,   c, s, d, tfuncname ) \
-GENTFUNC3( scomplex, float,    dcomplex, c, s, z, tfuncname ) \
-\
-GENTFUNC3( scomplex, double,   float,    c, d, s, tfuncname ) \
-GENTFUNC3( scomplex, double,   double,   c, d, d, tfuncname ) \
-GENTFUNC3( scomplex, double,   scomplex, c, d, c, tfuncname ) \
-GENTFUNC3( scomplex, double,   dcomplex, c, d, z, tfuncname ) \
-\
-GENTFUNC3( scomplex, scomplex, double,   c, c, d, tfuncname ) \
-GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \
-\
-GENTFUNC3( scomplex, dcomplex, float,    c, z, s, tfuncname ) \
-GENTFUNC3( scomplex, dcomplex, double,   c, z, d, tfuncname ) \
-GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \
-GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \
-\
-\
-GENTFUNC3( dcomplex, float,    float,    z, s, s, tfuncname ) \
-GENTFUNC3( dcomplex, float,    double,   z, s, d, tfuncname ) \
-GENTFUNC3( dcomplex, float,    scomplex, z, s, c, tfuncname ) \
-GENTFUNC3( dcomplex, float,    dcomplex, z, s, z, tfuncname ) \
-\
-GENTFUNC3( dcomplex, double,   float,    z, d, s, tfuncname ) \
-GENTFUNC3( dcomplex, double,   scomplex, z, d, c, tfuncname ) \
-\
-GENTFUNC3( dcomplex, scomplex, float,    z, c, s, tfuncname ) \
-GENTFUNC3( dcomplex, scomplex, double,   z, c, d, tfuncname ) \
-GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \
-GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \
-\
-GENTFUNC3( dcomplex, dcomplex, float,    z, z, s, tfuncname ) \
-GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \
-\
-GENTFUNC3( float,    float,    double,   s, s, d, tfuncname, varname ) \
-GENTFUNC3( float,    float,    dcomplex, s, s, z, tfuncname, varname ) \
-\
-GENTFUNC3( float,    double,   float,    s, d, s, tfuncname, varname ) \
-GENTFUNC3( float,    double,   double,   s, d, d, tfuncname, varname ) \
-GENTFUNC3( float,    double,   scomplex, s, d, c, tfuncname, varname ) \
-GENTFUNC3( float,    double,   dcomplex, s, d, z, tfuncname, varname ) \
-\
-GENTFUNC3( float,    scomplex, double,   s, c, d, tfuncname, varname ) \
-GENTFUNC3( float,    scomplex, dcomplex, s, c, z, tfuncname, varname ) \
-\
-GENTFUNC3( float,    dcomplex, float,    s, z, s, tfuncname, varname ) \
-GENTFUNC3( float,    dcomplex, double,   s, z, d, tfuncname, varname ) \
-GENTFUNC3( float,    dcomplex, scomplex, s, z, c, tfuncname, varname ) \
-GENTFUNC3( float,    dcomplex, dcomplex, s, z, z, tfuncname, varname ) \
-\
-\
-GENTFUNC3( double,   float,    float,    d, s, s, tfuncname, varname ) \
-GENTFUNC3( double,   float,    double,   d, s, d, tfuncname, varname ) \
-GENTFUNC3( double,   float,    scomplex, d, s, c, tfuncname, varname ) \
-GENTFUNC3( double,   float,    dcomplex, d, s, z, tfuncname, varname ) \
-\
-GENTFUNC3( double,   double,   float,    d, d, s, tfuncname, varname ) \
-GENTFUNC3( double,   double,   scomplex, d, d, c, tfuncname, varname ) \
-\
-GENTFUNC3( double,   scomplex, float,    d, c, s, tfuncname, varname ) \
-GENTFUNC3( double,   scomplex, double,   d, c, d, tfuncname, varname ) \
-GENTFUNC3( double,   scomplex, scomplex, d, c, c, tfuncname, varname ) \
-GENTFUNC3( double,   scomplex, dcomplex, d, c, z, tfuncname, varname ) \
-\
-GENTFUNC3( double,   dcomplex, float,    d, z, s, tfuncname, varname ) \
-GENTFUNC3( double,   dcomplex, scomplex, d, z, c, tfuncname, varname ) \
-\
-\
-GENTFUNC3( scomplex, float,    double,   c, s, d, tfuncname, varname ) \
-GENTFUNC3( scomplex, float,    dcomplex, c, s, z, tfuncname, varname ) \
-\
-GENTFUNC3( scomplex, double,   float,    c, d, s, tfuncname, varname ) \
-GENTFUNC3( scomplex, double,   double,   c, d, d, tfuncname, varname ) \
-GENTFUNC3( scomplex, double,   scomplex, c, d, c, tfuncname, varname ) \
-GENTFUNC3( scomplex, double,   dcomplex, c, d, z, tfuncname, varname ) \
-\
-GENTFUNC3( scomplex, scomplex, double,   c, c, d, tfuncname, varname ) \
-GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \
-\
-GENTFUNC3( scomplex, dcomplex, float,    c, z, s, tfuncname, varname ) \
-GENTFUNC3( scomplex, dcomplex, double,   c, z, d, tfuncname, varname ) \
-GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \
-GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \
-\
-\
-GENTFUNC3( dcomplex, float,    float,    z, s, s, tfuncname, varname ) \
-GENTFUNC3( dcomplex, float,    double,   z, s, d, tfuncname, varname ) \
-GENTFUNC3( dcomplex, float,    scomplex, z, s, c, tfuncname, varname ) \
-GENTFUNC3( dcomplex, float,    dcomplex, z, s, z, tfuncname, varname ) \
-\
-GENTFUNC3( dcomplex, double,   float,    z, d, s, tfuncname, varname ) \
-GENTFUNC3( dcomplex, double,   scomplex, z, d, c, tfuncname, varname ) \
-\
-GENTFUNC3( dcomplex, scomplex, float,    z, c, s, tfuncname, varname ) \
-GENTFUNC3( dcomplex, scomplex, double,   z, c, d, tfuncname, varname ) \
-GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \
-GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \
-\
-GENTFUNC3( dcomplex, dcomplex, float,    z, z, s, tfuncname, varname ) \
-GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname )
-
-// -- (two auxiliary arguments) --
-
-#define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \
+#define INSERT_GENTFUNC3_MIX_P( ... ) \
 \
-GENTFUNC3( float,    float,    double,   s, s, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3( float,    float,    dcomplex, s, s, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3( float,    float,    double,   s, s, d, __VA_ARGS__ ) \
+GENTFUNC3( float,    float,    dcomplex, s, s, z, __VA_ARGS__ ) \
 \
-GENTFUNC3( float,    double,   float,    s, d, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3( float,    double,   double,   s, d, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3( float,    double,   scomplex, s, d, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3( float,    double,   dcomplex, s, d, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3( float,    double,   float,    s, d, s, __VA_ARGS__ ) \
+GENTFUNC3( float,    double,   double,   s, d, d, __VA_ARGS__ ) \
+GENTFUNC3( float,    double,   scomplex, s, d, c, __VA_ARGS__ ) \
+GENTFUNC3( float,    double,   dcomplex, s, d, z, __VA_ARGS__ ) \
 \
-GENTFUNC3( float,    scomplex, double,   s, c, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3( float,    scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3( float,    scomplex, double,   s, c, d, __VA_ARGS__ ) \
+GENTFUNC3( float,    scomplex, dcomplex, s, c, z, __VA_ARGS__ ) \
 \
-GENTFUNC3( float,    dcomplex, float,    s, z, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3( float,    dcomplex, double,   s, z, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3( float,    dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3( float,    dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3( float,    dcomplex, float,    s, z, s, __VA_ARGS__ ) \
+GENTFUNC3( float,    dcomplex, double,   s, z, d, __VA_ARGS__ ) \
+GENTFUNC3( float,    dcomplex, scomplex, s, z, c, __VA_ARGS__ ) \
+GENTFUNC3( float,    dcomplex, dcomplex, s, z, z, __VA_ARGS__ ) \
 \
 \
-GENTFUNC3( double,   float,    float,    d, s, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3( double,   float,    double,   d, s, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3( double,   float,    scomplex, d, s, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3( double,   float,    dcomplex, d, s, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3( double,   float,    float,    d, s, s, __VA_ARGS__ ) \
+GENTFUNC3( double,   float,    double,   d, s, d, __VA_ARGS__ ) \
+GENTFUNC3( double,   float,    scomplex, d, s, c, __VA_ARGS__ ) \
+GENTFUNC3( double,   float,    dcomplex, d, s, z, __VA_ARGS__ ) \
 \
-GENTFUNC3( double,   double,   float,    d, d, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3( double,   double,   scomplex, d, d, c, tfuncname, varname1, varname2 ) \
+GENTFUNC3( double,   double,   float,    d, d, s, __VA_ARGS__ ) \
+GENTFUNC3( double,   double,   scomplex, d, d, c, __VA_ARGS__ ) \
 \
-GENTFUNC3( double,   scomplex, float,    d, c, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3( double,   scomplex, double,   d, c, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3( double,   scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3( double,   scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3( double,   scomplex, float,    d, c, s, __VA_ARGS__ ) \
+GENTFUNC3( double,   scomplex, double,   d, c, d, __VA_ARGS__ ) \
+GENTFUNC3( double,   scomplex, scomplex, d, c, c, __VA_ARGS__ ) \
+GENTFUNC3( double,   scomplex, dcomplex, d, c, z, __VA_ARGS__ ) \
 \
-GENTFUNC3( double,   dcomplex, float,    d, z, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3( double,   dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \
+GENTFUNC3( double,   dcomplex, float,    d, z, s, __VA_ARGS__ ) \
+GENTFUNC3( double,   dcomplex, scomplex, d, z, c, __VA_ARGS__ ) \
 \
 \
-GENTFUNC3( scomplex, float,    double,   c, s, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3( scomplex, float,    dcomplex, c, s, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3( scomplex, float,    double,   c, s, d, __VA_ARGS__ ) \
+GENTFUNC3( scomplex, float,    dcomplex, c, s, z, __VA_ARGS__ ) \
 \
-GENTFUNC3( scomplex, double,   float,    c, d, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3( scomplex, double,   double,   c, d, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3( scomplex, double,   scomplex, c, d, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3( scomplex, double,   dcomplex, c, d, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3( scomplex, double,   float,    c, d, s, __VA_ARGS__ ) \
+GENTFUNC3( scomplex, double,   double,   c, d, d, __VA_ARGS__ ) \
+GENTFUNC3( scomplex, double,   scomplex, c, d, c, __VA_ARGS__ ) \
+GENTFUNC3( scomplex, double,   dcomplex, c, d, z, __VA_ARGS__ ) \
 \
-GENTFUNC3( scomplex, scomplex, double,   c, c, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3( scomplex, scomplex, double,   c, c, d, __VA_ARGS__ ) \
+GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, __VA_ARGS__ ) \
 \
-GENTFUNC3( scomplex, dcomplex, float,    c, z, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3( scomplex, dcomplex, double,   c, z, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3( scomplex, dcomplex, float,    c, z, s, __VA_ARGS__ ) \
+GENTFUNC3( scomplex, dcomplex, double,   c, z, d, __VA_ARGS__ ) \
+GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, __VA_ARGS__ ) \
+GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, __VA_ARGS__ ) \
 \
 \
-GENTFUNC3( dcomplex, float,    float,    z, s, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3( dcomplex, float,    double,   z, s, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3( dcomplex, float,    scomplex, z, s, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3( dcomplex, float,    dcomplex, z, s, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3( dcomplex, float,    float,    z, s, s, __VA_ARGS__ ) \
+GENTFUNC3( dcomplex, float,    double,   z, s, d, __VA_ARGS__ ) \
+GENTFUNC3( dcomplex, float,    scomplex, z, s, c, __VA_ARGS__ ) \
+GENTFUNC3( dcomplex, float,    dcomplex, z, s, z, __VA_ARGS__ ) \
 \
-GENTFUNC3( dcomplex, double,   float,    z, d, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3( dcomplex, double,   scomplex, z, d, c, tfuncname, varname1, varname2 ) \
+GENTFUNC3( dcomplex, double,   float,    z, d, s, __VA_ARGS__ ) \
+GENTFUNC3( dcomplex, double,   scomplex, z, d, c, __VA_ARGS__ ) \
 \
-GENTFUNC3( dcomplex, scomplex, float,    z, c, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3( dcomplex, scomplex, double,   z, c, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3( dcomplex, scomplex, float,    z, c, s, __VA_ARGS__ ) \
+GENTFUNC3( dcomplex, scomplex, double,   z, c, d, __VA_ARGS__ ) \
+GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, __VA_ARGS__ ) \
+GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, __VA_ARGS__ ) \
 \
-GENTFUNC3( dcomplex, dcomplex, float,    z, z, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 )
+GENTFUNC3( dcomplex, dcomplex, float,    z, z, s, __VA_ARGS__ ) \
+GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, __VA_ARGS__ )
 
 
 // -- Basic three-operand with union of operands 1 and 2 --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \
+#define INSERT_GENTFUNC3U12_BASIC( ... ) \
 \
-GENTFUNC3U12( float,    float,    float,    float,    s, s, s, s, tfuncname ) \
-GENTFUNC3U12( double,   double,   double,   double,   d, d, d, d, tfuncname ) \
-GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \
-GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \
-\
-GENTFUNC3U12( float,    float,    float,    float,    s, s, s, s, tfuncname, varname ) \
-GENTFUNC3U12( double,   double,   double,   double,   d, d, d, d, tfuncname, varname ) \
-GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \
-GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname )
-
-// -- (two auxiliary arguments) --
-
-#define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \
-\
-GENTFUNC3U12( float,    float,    float,    float,    s, s, s, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( double,   double,   double,   double,   d, d, d, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname1, varname2 )
+GENTFUNC3U12( float,    float,    float,    float,    s, s, s, s, __VA_ARGS__ ) \
+GENTFUNC3U12( double,   double,   double,   double,   d, d, d, d, __VA_ARGS__ ) \
+GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, __VA_ARGS__ ) \
+GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, __VA_ARGS__ )
 
 
 // -- Mixed domain three-operand with union of operands 1 and 2 --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \
+#define INSERT_GENTFUNC3U12_MIX_D( ... ) \
 \
-GENTFUNC3U12( float,    float,    scomplex, float,    s, s, c, s, tfuncname ) \
-GENTFUNC3U12( float,    scomplex, float,    scomplex, s, c, s, c, tfuncname ) \
-GENTFUNC3U12( float,    scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \
+GENTFUNC3U12( float,    float,    scomplex, float,    s, s, c, s, __VA_ARGS__ ) \
+GENTFUNC3U12( float,    scomplex, float,    scomplex, s, c, s, c, __VA_ARGS__ ) \
+GENTFUNC3U12( float,    scomplex, scomplex, scomplex, s, c, c, c, __VA_ARGS__ ) \
 \
-GENTFUNC3U12( double,   double,   dcomplex, double,   d, d, z, d, tfuncname ) \
-GENTFUNC3U12( double,   dcomplex, double,   dcomplex, d, z, d, z, tfuncname ) \
-GENTFUNC3U12( double,   dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \
+GENTFUNC3U12( double,   double,   dcomplex, double,   d, d, z, d, __VA_ARGS__ ) \
+GENTFUNC3U12( double,   dcomplex, double,   dcomplex, d, z, d, z, __VA_ARGS__ ) \
+GENTFUNC3U12( double,   dcomplex, dcomplex, dcomplex, d, z, z, z, __VA_ARGS__ ) \
 \
-GENTFUNC3U12( scomplex, float,    float,    scomplex, c, s, s, c, tfuncname ) \
-GENTFUNC3U12( scomplex, float,    scomplex, scomplex, c, s, c, c, tfuncname ) \
-GENTFUNC3U12( scomplex, scomplex, float,    scomplex, c, c, s, c, tfuncname ) \
+GENTFUNC3U12( scomplex, float,    float,    scomplex, c, s, s, c, __VA_ARGS__ ) \
+GENTFUNC3U12( scomplex, float,    scomplex, scomplex, c, s, c, c, __VA_ARGS__ ) \
+GENTFUNC3U12( scomplex, scomplex, float,    scomplex, c, c, s, c, __VA_ARGS__ ) \
 \
-GENTFUNC3U12( dcomplex, double,   double,   dcomplex, z, d, d, z, tfuncname ) \
-GENTFUNC3U12( dcomplex, double,   dcomplex, dcomplex, z, d, z, z, tfuncname ) \
-GENTFUNC3U12( dcomplex, dcomplex, double,   dcomplex, z, z, d, z, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \
-\
-GENTFUNC3U12( float,    float,    scomplex, float,    s, s, c, s, tfuncname, varname ) \
-GENTFUNC3U12( float,    scomplex, float,    scomplex, s, c, s, c, tfuncname, varname ) \
-GENTFUNC3U12( float,    scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \
-\
-GENTFUNC3U12( double,   double,   dcomplex, double,   d, d, z, d, tfuncname, varname ) \
-GENTFUNC3U12( double,   dcomplex, double,   dcomplex, d, z, d, z, tfuncname, varname ) \
-GENTFUNC3U12( double,   dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \
-\
-GENTFUNC3U12( scomplex, float,    float,    scomplex, c, s, s, c, tfuncname, varname ) \
-GENTFUNC3U12( scomplex, float,    scomplex, scomplex, c, s, c, c, tfuncname, varname ) \
-GENTFUNC3U12( scomplex, scomplex, float,    scomplex, c, c, s, c, tfuncname, varname ) \
-\
-GENTFUNC3U12( dcomplex, double,   double,   dcomplex, z, d, d, z, tfuncname, varname ) \
-GENTFUNC3U12( dcomplex, double,   dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \
-GENTFUNC3U12( dcomplex, dcomplex, double,   dcomplex, z, z, d, z, tfuncname, varname )
-
-// -- (two auxiliary arguments) --
-
-#define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \
-\
-GENTFUNC3U12( float,    float,    scomplex, float,    s, s, c, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( float,    scomplex, float,    scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( float,    scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \
-\
-GENTFUNC3U12( double,   double,   dcomplex, double,   d, d, z, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( double,   dcomplex, double,   dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( double,   dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \
-\
-GENTFUNC3U12( scomplex, float,    float,    scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( scomplex, float,    scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( scomplex, scomplex, float,    scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \
-\
-GENTFUNC3U12( dcomplex, double,   double,   dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( dcomplex, double,   dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( dcomplex, dcomplex, double,   dcomplex, z, z, d, z, tfuncname, varname1, varname2 )
+GENTFUNC3U12( dcomplex, double,   double,   dcomplex, z, d, d, z, __VA_ARGS__ ) \
+GENTFUNC3U12( dcomplex, double,   dcomplex, dcomplex, z, d, z, z, __VA_ARGS__ ) \
+GENTFUNC3U12( dcomplex, dcomplex, double,   dcomplex, z, z, d, z, __VA_ARGS__ )
 
 
 // -- Mixed precision three-operand with union of operands 1 and 2 --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \
-\
-GENTFUNC3U12( float,    float,    double,   float,    s, s, d, s, tfuncname ) \
-GENTFUNC3U12( float,    float,    dcomplex, float,    s, s, z, s, tfuncname ) \
-\
-GENTFUNC3U12( float,    double,   float,    double,   s, d, s, d, tfuncname ) \
-GENTFUNC3U12( float,    double,   double,   double,   s, d, d, d, tfuncname ) \
-GENTFUNC3U12( float,    double,   scomplex, double,   s, d, c, d, tfuncname ) \
-GENTFUNC3U12( float,    double,   dcomplex, double,   s, d, z, d, tfuncname ) \
-\
-GENTFUNC3U12( float,    scomplex, double,   scomplex, s, c, d, c, tfuncname ) \
-GENTFUNC3U12( float,    scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \
-\
-GENTFUNC3U12( float,    dcomplex, float,    dcomplex, s, z, s, z, tfuncname ) \
-GENTFUNC3U12( float,    dcomplex, double,   dcomplex, s, z, d, z, tfuncname ) \
-GENTFUNC3U12( float,    dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \
-GENTFUNC3U12( float,    dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \
-\
-\
-GENTFUNC3U12( double,   float,    float,    double,   d, s, s, d, tfuncname ) \
-GENTFUNC3U12( double,   float,    double,   double,   d, s, d, d, tfuncname ) \
-GENTFUNC3U12( double,   float,    scomplex, double,   d, s, c, d, tfuncname ) \
-GENTFUNC3U12( double,   float,    dcomplex, double,   d, s, z, d, tfuncname ) \
-\
-GENTFUNC3U12( double,   double,   float,    double,   d, d, s, d, tfuncname ) \
-GENTFUNC3U12( double,   double,   scomplex, double,   d, d, c, d, tfuncname ) \
-\
-GENTFUNC3U12( double,   scomplex, float,    dcomplex, d, c, s, z, tfuncname ) \
-GENTFUNC3U12( double,   scomplex, double,   dcomplex, d, c, d, z, tfuncname ) \
-GENTFUNC3U12( double,   scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \
-GENTFUNC3U12( double,   scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \
-\
-GENTFUNC3U12( double,   dcomplex, float,    dcomplex, d, z, s, z, tfuncname ) \
-GENTFUNC3U12( double,   dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \
-\
-\
-GENTFUNC3U12( scomplex, float,    double,   scomplex, c, s, d, c, tfuncname ) \
-GENTFUNC3U12( scomplex, float,    dcomplex, scomplex, c, s, z, c, tfuncname ) \
-\
-GENTFUNC3U12( scomplex, double,   float,    dcomplex, c, d, s, z, tfuncname ) \
-GENTFUNC3U12( scomplex, double,   double,   dcomplex, c, d, d, z, tfuncname ) \
-GENTFUNC3U12( scomplex, double,   scomplex, dcomplex, c, d, c, z, tfuncname ) \
-GENTFUNC3U12( scomplex, double,   dcomplex, dcomplex, c, d, z, z, tfuncname ) \
-\
-GENTFUNC3U12( scomplex, scomplex, double,   scomplex, c, c, d, c, tfuncname ) \
-GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname ) \
-\
-GENTFUNC3U12( scomplex, dcomplex, float,    dcomplex, c, z, s, z, tfuncname ) \
-GENTFUNC3U12( scomplex, dcomplex, double,   dcomplex, c, z, d, z, tfuncname ) \
-GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \
-GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \
-\
-\
-GENTFUNC3U12( dcomplex, float,    float,    dcomplex, z, s, s, z, tfuncname ) \
-GENTFUNC3U12( dcomplex, float,    double,   dcomplex, z, s, d, z, tfuncname ) \
-GENTFUNC3U12( dcomplex, float,    scomplex, dcomplex, z, s, c, z, tfuncname ) \
-GENTFUNC3U12( dcomplex, float,    dcomplex, dcomplex, z, s, z, z, tfuncname ) \
-\
-GENTFUNC3U12( dcomplex, double,   float,    dcomplex, z, d, s, z, tfuncname ) \
-GENTFUNC3U12( dcomplex, double,   scomplex, dcomplex, z, d, c, z, tfuncname ) \
-\
-GENTFUNC3U12( dcomplex, scomplex, float,    dcomplex, z, c, s, z, tfuncname ) \
-GENTFUNC3U12( dcomplex, scomplex, double,   dcomplex, z, c, d, z, tfuncname ) \
-GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \
-GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \
-\
-GENTFUNC3U12( dcomplex, dcomplex, float,    dcomplex, z, z, s, z, tfuncname ) \
-GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \
-\
-GENTFUNC3U12( float,    float,    double,   float,    s, s, d, s, tfuncname, varname ) \
-GENTFUNC3U12( float,    float,    dcomplex, float,    s, s, z, s, tfuncname, varname ) \
-\
-GENTFUNC3U12( float,    double,   float,    double,   s, d, s, d, tfuncname, varname ) \
-GENTFUNC3U12( float,    double,   double,   double,   s, d, d, d, tfuncname, varname ) \
-GENTFUNC3U12( float,    double,   scomplex, double,   s, d, c, d, tfuncname, varname ) \
-GENTFUNC3U12( float,    double,   dcomplex, double,   s, d, z, d, tfuncname, varname ) \
-\
-GENTFUNC3U12( float,    scomplex, double,   scomplex, s, c, d, c, tfuncname, varname ) \
-GENTFUNC3U12( float,    scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \
-\
-GENTFUNC3U12( float,    dcomplex, float,    dcomplex, s, z, s, z, tfuncname, varname ) \
-GENTFUNC3U12( float,    dcomplex, double,   dcomplex, s, z, d, z, tfuncname, varname ) \
-GENTFUNC3U12( float,    dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \
-GENTFUNC3U12( float,    dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \
-\
-\
-GENTFUNC3U12( double,   float,    float,    double,   d, s, s, d, tfuncname, varname ) \
-GENTFUNC3U12( double,   float,    double,   double,   d, s, d, d, tfuncname, varname ) \
-GENTFUNC3U12( double,   float,    scomplex, double,   d, s, c, d, tfuncname, varname ) \
-GENTFUNC3U12( double,   float,    dcomplex, double,   d, s, z, d, tfuncname, varname ) \
-\
-GENTFUNC3U12( double,   double,   float,    double,   d, d, s, d, tfuncname, varname ) \
-GENTFUNC3U12( double,   double,   scomplex, double,   d, d, c, d, tfuncname, varname ) \
-\
-GENTFUNC3U12( double,   scomplex, float,    dcomplex, d, c, s, z, tfuncname, varname ) \
-GENTFUNC3U12( double,   scomplex, double,   dcomplex, d, c, d, z, tfuncname, varname ) \
-GENTFUNC3U12( double,   scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \
-GENTFUNC3U12( double,   scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \
-\
-GENTFUNC3U12( double,   dcomplex, float,    dcomplex, d, z, s, z, tfuncname, varname ) \
-GENTFUNC3U12( double,   dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \
-\
-\
-GENTFUNC3U12( scomplex, float,    double,   scomplex, c, s, d, c, tfuncname, varname ) \
-GENTFUNC3U12( scomplex, float,    dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \
-\
-GENTFUNC3U12( scomplex, double,   float,    dcomplex, c, d, s, z, tfuncname, varname ) \
-GENTFUNC3U12( scomplex, double,   double,   dcomplex, c, d, d, z, tfuncname, varname ) \
-GENTFUNC3U12( scomplex, double,   scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \
-GENTFUNC3U12( scomplex, double,   dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \
-\
-GENTFUNC3U12( scomplex, scomplex, double,   scomplex, c, c, d, c, tfuncname, varname ) \
-GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \
-\
-GENTFUNC3U12( scomplex, dcomplex, float,    dcomplex, c, z, s, z, tfuncname, varname ) \
-GENTFUNC3U12( scomplex, dcomplex, double,   dcomplex, c, z, d, z, tfuncname, varname ) \
-GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \
-GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \
-\
-\
-GENTFUNC3U12( dcomplex, float,    float,    dcomplex, z, s, s, z, tfuncname, varname ) \
-GENTFUNC3U12( dcomplex, float,    double,   dcomplex, z, s, d, z, tfuncname, varname ) \
-GENTFUNC3U12( dcomplex, float,    scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \
-GENTFUNC3U12( dcomplex, float,    dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \
-\
-GENTFUNC3U12( dcomplex, double,   float,    dcomplex, z, d, s, z, tfuncname, varname ) \
-GENTFUNC3U12( dcomplex, double,   scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \
-\
-GENTFUNC3U12( dcomplex, scomplex, float,    dcomplex, z, c, s, z, tfuncname, varname ) \
-GENTFUNC3U12( dcomplex, scomplex, double,   dcomplex, z, c, d, z, tfuncname, varname ) \
-GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \
-GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \
-\
-GENTFUNC3U12( dcomplex, dcomplex, float,    dcomplex, z, z, s, z, tfuncname, varname ) \
-GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname )
-
-// -- (two auxiliary arguments) --
-
-#define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \
+#define INSERT_GENTFUNC3U12_MIX_P( ... ) \
 \
-GENTFUNC3U12( float,    float,    double,   float,    s, s, d, s, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( float,    float,    dcomplex, float,    s, s, z, s, tfuncname, varname1, varname2 ) \
+GENTFUNC3U12( float,    float,    double,   float,    s, s, d, s, __VA_ARGS__ ) \
+GENTFUNC3U12( float,    float,    dcomplex, float,    s, s, z, s, __VA_ARGS__ ) \
 \
-GENTFUNC3U12( float,    double,   float,    double,   s, d, s, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( float,    double,   double,   double,   s, d, d, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( float,    double,   scomplex, double,   s, d, c, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( float,    double,   dcomplex, double,   s, d, z, d, tfuncname, varname1, varname2 ) \
+GENTFUNC3U12( float,    double,   float,    double,   s, d, s, d, __VA_ARGS__ ) \
+GENTFUNC3U12( float,    double,   double,   double,   s, d, d, d, __VA_ARGS__ ) \
+GENTFUNC3U12( float,    double,   scomplex, double,   s, d, c, d, __VA_ARGS__ ) \
+GENTFUNC3U12( float,    double,   dcomplex, double,   s, d, z, d, __VA_ARGS__ ) \
 \
-GENTFUNC3U12( float,    scomplex, double,   scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( float,    scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \
+GENTFUNC3U12( float,    scomplex, double,   scomplex, s, c, d, c, __VA_ARGS__ ) \
+GENTFUNC3U12( float,    scomplex, dcomplex, scomplex, s, c, z, c, __VA_ARGS__ ) \
 \
-GENTFUNC3U12( float,    dcomplex, float,    dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( float,    dcomplex, double,   dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( float,    dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( float,    dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3U12( float,    dcomplex, float,    dcomplex, s, z, s, z, __VA_ARGS__ ) \
+GENTFUNC3U12( float,    dcomplex, double,   dcomplex, s, z, d, z, __VA_ARGS__ ) \
+GENTFUNC3U12( float,    dcomplex, scomplex, dcomplex, s, z, c, z, __VA_ARGS__ ) \
+GENTFUNC3U12( float,    dcomplex, dcomplex, dcomplex, s, z, z, z, __VA_ARGS__ ) \
 \
 \
-GENTFUNC3U12( double,   float,    float,    double,   d, s, s, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( double,   float,    double,   double,   d, s, d, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( double,   float,    scomplex, double,   d, s, c, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( double,   float,    dcomplex, double,   d, s, z, d, tfuncname, varname1, varname2 ) \
+GENTFUNC3U12( double,   float,    float,    double,   d, s, s, d, __VA_ARGS__ ) \
+GENTFUNC3U12( double,   float,    double,   double,   d, s, d, d, __VA_ARGS__ ) \
+GENTFUNC3U12( double,   float,    scomplex, double,   d, s, c, d, __VA_ARGS__ ) \
+GENTFUNC3U12( double,   float,    dcomplex, double,   d, s, z, d, __VA_ARGS__ ) \
 \
-GENTFUNC3U12( double,   double,   float,    double,   d, d, s, d, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( double,   double,   scomplex, double,   d, d, c, d, tfuncname, varname1, varname2 ) \
+GENTFUNC3U12( double,   double,   float,    double,   d, d, s, d, __VA_ARGS__ ) \
+GENTFUNC3U12( double,   double,   scomplex, double,   d, d, c, d, __VA_ARGS__ ) \
 \
-GENTFUNC3U12( double,   scomplex, float,    dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( double,   scomplex, double,   dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( double,   scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( double,   scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3U12( double,   scomplex, float,    dcomplex, d, c, s, z, __VA_ARGS__ ) \
+GENTFUNC3U12( double,   scomplex, double,   dcomplex, d, c, d, z, __VA_ARGS__ ) \
+GENTFUNC3U12( double,   scomplex, scomplex, dcomplex, d, c, c, z, __VA_ARGS__ ) \
+GENTFUNC3U12( double,   scomplex, dcomplex, dcomplex, d, c, z, z, __VA_ARGS__ ) \
 \
-GENTFUNC3U12( double,   dcomplex, float,    dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( double,   dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3U12( double,   dcomplex, float,    dcomplex, d, z, s, z, __VA_ARGS__ ) \
+GENTFUNC3U12( double,   dcomplex, scomplex, dcomplex, d, z, c, z, __VA_ARGS__ ) \
 \
 \
-GENTFUNC3U12( scomplex, float,    double,   scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( scomplex, float,    dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \
+GENTFUNC3U12( scomplex, float,    double,   scomplex, c, s, d, c, __VA_ARGS__ ) \
+GENTFUNC3U12( scomplex, float,    dcomplex, scomplex, c, s, z, c, __VA_ARGS__ ) \
 \
-GENTFUNC3U12( scomplex, double,   float,    dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( scomplex, double,   double,   dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( scomplex, double,   scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( scomplex, double,   dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3U12( scomplex, double,   float,    dcomplex, c, d, s, z, __VA_ARGS__ ) \
+GENTFUNC3U12( scomplex, double,   double,   dcomplex, c, d, d, z, __VA_ARGS__ ) \
+GENTFUNC3U12( scomplex, double,   scomplex, dcomplex, c, d, c, z, __VA_ARGS__ ) \
+GENTFUNC3U12( scomplex, double,   dcomplex, dcomplex, c, d, z, z, __VA_ARGS__ ) \
 \
-GENTFUNC3U12( scomplex, scomplex, double,   scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \
+GENTFUNC3U12( scomplex, scomplex, double,   scomplex, c, c, d, c, __VA_ARGS__ ) \
+GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, __VA_ARGS__ ) \
 \
-GENTFUNC3U12( scomplex, dcomplex, float,    dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( scomplex, dcomplex, double,   dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3U12( scomplex, dcomplex, float,    dcomplex, c, z, s, z, __VA_ARGS__ ) \
+GENTFUNC3U12( scomplex, dcomplex, double,   dcomplex, c, z, d, z, __VA_ARGS__ ) \
+GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, __VA_ARGS__ ) \
+GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, __VA_ARGS__ ) \
 \
 \
-GENTFUNC3U12( dcomplex, float,    float,    dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( dcomplex, float,    double,   dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( dcomplex, float,    scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( dcomplex, float,    dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3U12( dcomplex, float,    float,    dcomplex, z, s, s, z, __VA_ARGS__ ) \
+GENTFUNC3U12( dcomplex, float,    double,   dcomplex, z, s, d, z, __VA_ARGS__ ) \
+GENTFUNC3U12( dcomplex, float,    scomplex, dcomplex, z, s, c, z, __VA_ARGS__ ) \
+GENTFUNC3U12( dcomplex, float,    dcomplex, dcomplex, z, s, z, z, __VA_ARGS__ ) \
 \
-GENTFUNC3U12( dcomplex, double,   float,    dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( dcomplex, double,   scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3U12( dcomplex, double,   float,    dcomplex, z, d, s, z, __VA_ARGS__ ) \
+GENTFUNC3U12( dcomplex, double,   scomplex, dcomplex, z, d, c, z, __VA_ARGS__ ) \
 \
-GENTFUNC3U12( dcomplex, scomplex, float,    dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( dcomplex, scomplex, double,   dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \
+GENTFUNC3U12( dcomplex, scomplex, float,    dcomplex, z, c, s, z, __VA_ARGS__ ) \
+GENTFUNC3U12( dcomplex, scomplex, double,   dcomplex, z, c, d, z, __VA_ARGS__ ) \
+GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, __VA_ARGS__ ) \
+GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, __VA_ARGS__ ) \
 \
-GENTFUNC3U12( dcomplex, dcomplex, float,    dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \
-GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 )
+GENTFUNC3U12( dcomplex, dcomplex, float,    dcomplex, z, z, s, z, __VA_ARGS__ ) \
+GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, __VA_ARGS__ )
 
 
 #endif
diff --git a/frame/include/bli_gentprot_macro_defs.h b/frame/include/bli_gentprot_macro_defs.h
index 3db9cdc48..e733e4800 100644
--- a/frame/include/bli_gentprot_macro_defs.h
+++ b/frame/include/bli_gentprot_macro_defs.h
@@ -145,177 +145,60 @@ GENTPROTSCAL( double,   dcomplex, d, z, blasname )
 
 // -- Basic one-operand macro --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTPROT_BASIC0( tfuncname ) \
-\
-GENTPROT( float,    s, tfuncname ) \
-GENTPROT( double,   d, tfuncname ) \
-GENTPROT( scomplex, c, tfuncname ) \
-GENTPROT( dcomplex, z, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTPROT_BASIC( tfuncname, varname ) \
+#define INSERT_GENTPROT_BASIC( ... ) \
 \
-GENTPROT( float,    s, tfuncname, varname ) \
-GENTPROT( double,   d, tfuncname, varname ) \
-GENTPROT( scomplex, c, tfuncname, varname ) \
-GENTPROT( dcomplex, z, tfuncname, varname )
-
-// -- (two auxiliary arguments) --
-
-#define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \
-\
-GENTPROT( float,    s, tfuncname, varname1, varname2 ) \
-GENTPROT( double,   d, tfuncname, varname1, varname2 ) \
-GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \
-GENTPROT( dcomplex, z, tfuncname, varname1, varname2 )
-
-// -- (three auxiliary arguments) --
-
-#define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \
-\
-GENTPROT( float,    s, tfuncname, varname1, varname2, varname3 ) \
-GENTPROT( double,   d, tfuncname, varname1, varname2, varname3 ) \
-GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
-GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 )
-
-// -- (four auxiliary arguments) --
-
-#define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
-\
-GENTPROT( float,    s, tfuncname, varname1, varname2, varname3, varname4 ) \
-GENTPROT( double,   d, tfuncname, varname1, varname2, varname3, varname4 ) \
-GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
-GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )
+GENTPROT( float,    s, __VA_ARGS__ ) \
+GENTPROT( double,   d, __VA_ARGS__ ) \
+GENTPROT( scomplex, c, __VA_ARGS__ ) \
+GENTPROT( dcomplex, z, __VA_ARGS__ )
 
 
 // -- Basic one-operand with real projection --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTPROTR_BASIC0( tfuncname ) \
-\
-GENTPROTR( float,    float,  s, s, tfuncname ) \
-GENTPROTR( double,   double, d, d, tfuncname ) \
-GENTPROTR( scomplex, float,  c, s, tfuncname ) \
-GENTPROTR( dcomplex, double, z, d, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \
+#define INSERT_GENTPROTR_BASIC( ... ) \
 \
-GENTPROTR( float,    float,  s, s, tfuncname, varname ) \
-GENTPROTR( double,   double, d, d, tfuncname, varname ) \
-GENTPROTR( scomplex, float,  c, s, tfuncname, varname ) \
-GENTPROTR( dcomplex, double, z, d, tfuncname, varname )
-
-// -- (two auxiliary arguments) --
-
-#define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \
-\
-GENTPROTR( float,    float,  s, s, tfuncname, varname1, varname2 ) \
-GENTPROTR( double,   double, d, d, tfuncname, varname1, varname2 ) \
-GENTPROTR( scomplex, float,  c, s, tfuncname, varname1, varname2 ) \
-GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 )
-
-// -- (three auxiliary arguments) --
-
-#define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3  ) \
-\
-GENTPROTR( float,    float,  s, s, tfuncname, varname1, varname2, varname3 ) \
-GENTPROTR( double,   double, d, d, tfuncname, varname1, varname2, varname3 ) \
-GENTPROTR( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3 ) \
-GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )
-
-// -- (four auxiliary arguments) --
-
-#define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4  ) \
-\
-GENTPROTR( float,    float,  s, s, tfuncname, varname1, varname2, varname3, varname4 ) \
-GENTPROTR( double,   double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \
-GENTPROTR( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
-GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )
+GENTPROTR( float,    float,  s, s, __VA_ARGS__ ) \
+GENTPROTR( double,   double, d, d, __VA_ARGS__ ) \
+GENTPROTR( scomplex, float,  c, s, __VA_ARGS__ ) \
+GENTPROTR( dcomplex, double, z, d, __VA_ARGS__ )
 
 
 // -- Basic one-operand macro with complex domain only and real projection --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTPROTCO_BASIC0( tfuncname ) \
-\
-GENTPROTCO( scomplex, float,  c, s, tfuncname ) \
-GENTPROTCO( dcomplex, double, z, d, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \
-\
-GENTPROTCO( scomplex, float,  c, s, tfuncname, varname ) \
-GENTPROTCO( dcomplex, double, z, d, tfuncname, varname )
-
-// -- (two auxiliary arguments) --
-
-#define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \
+#define INSERT_GENTPROTCO_BASIC( ... ) \
 \
-GENTPROTCO( scomplex, float,  c, s, tfuncname, varname1, varname2 ) \
-GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )
+GENTPROTCO( scomplex, float,  c, s, __VA_ARGS__ ) \
+GENTPROTCO( dcomplex, double, z, d, __VA_ARGS__ )
 
 
 // -- Basic one-operand macro with integer instance --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTPROT_BASIC0_I( funcname ) \
-\
-GENTPROT( float,    s, funcname ) \
-GENTPROT( double,   d, funcname ) \
-GENTPROT( scomplex, c, funcname ) \
-GENTPROT( dcomplex, z, funcname ) \
-GENTPROT( gint_t,   i, funcname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \
+#define INSERT_GENTPROT_BASIC_I( ... ) \
 \
-GENTPROT( float,    s, tfuncname, varname ) \
-GENTPROT( double,   d, tfuncname, varname ) \
-GENTPROT( scomplex, c, tfuncname, varname ) \
-GENTPROT( dcomplex, z, tfuncname, varname ) \
-GENTPROT( gint_t,   i, tfuncname, varname )
+GENTPROT( float,    s, __VA_ARGS__ ) \
+GENTPROT( double,   d, __VA_ARGS__ ) \
+GENTPROT( scomplex, c, __VA_ARGS__ ) \
+GENTPROT( dcomplex, z, __VA_ARGS__ ) \
+GENTPROT( gint_t,   i, __VA_ARGS__ )
 
 
 // -- Basic one-operand with integer projection --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTPROTI_BASIC0( funcname ) \
-\
-GENTPROTI( float,    gint_t, s, i, funcname ) \
-GENTPROTI( double,   gint_t, d, i, funcname ) \
-GENTPROTI( scomplex, gint_t, c, i, funcname ) \
-GENTPROTI( dcomplex, gint_t, z, i, funcname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \
+#define INSERT_GENTPROTI_BASIC( ... ) \
 \
-GENTPROTI( float,    gint_t, s, i, tfuncname, varname ) \
-GENTPROTI( double,   gint_t, d, i, tfuncname, varname ) \
-GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \
-GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname )
+GENTPROTI( float,    gint_t, s, i, __VA_ARGS__ ) \
+GENTPROTI( double,   gint_t, d, i, __VA_ARGS__ ) \
+GENTPROTI( scomplex, gint_t, c, i, __VA_ARGS__ ) \
+GENTPROTI( dcomplex, gint_t, z, i, __VA_ARGS__ )
 
 
 // -- Basic one-operand with real and integer projections --
 
-// -- (no auxiliary arguments) --
-
 #define INSERT_GENTPROTRI_BASIC( funcname ) \
 \
 GENTPROTRI( float,    float,  gint_t, s, s, i, funcname ) \
@@ -331,209 +214,105 @@ GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname )
 
 // -- Basic two-operand macro --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTPROT2_BASIC0( funcname ) \
-\
-GENTPROT2( float,    float,    s, s, funcname ) \
-GENTPROT2( double,   double,   d, d, funcname ) \
-GENTPROT2( scomplex, scomplex, c, c, funcname ) \
-GENTPROT2( dcomplex, dcomplex, z, z, funcname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \
+#define INSERT_GENTPROT2_BASIC( ... ) \
 \
-GENTPROT2( float,    float,    s, s, tfuncname, varname ) \
-GENTPROT2( double,   double,   d, d, tfuncname, varname ) \
-GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \
-GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname )
+GENTPROT2( float,    float,    s, s, __VA_ARGS__ ) \
+GENTPROT2( double,   double,   d, d, __VA_ARGS__ ) \
+GENTPROT2( scomplex, scomplex, c, c, __VA_ARGS__ ) \
+GENTPROT2( dcomplex, dcomplex, z, z, __VA_ARGS__ )
 
 
 // -- Mixed domain two-operand macro --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTPROT2_MIX_D0( funcname ) \
-\
-GENTPROT2( float,    scomplex, s, c, funcname ) \
-GENTPROT2( scomplex, float,    c, s, funcname ) \
-\
-GENTPROT2( double,   dcomplex, d, z, funcname ) \
-GENTPROT2( dcomplex, double,   z, d, funcname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTPROT2_MIX_D( tfuncname, varname ) \
+#define INSERT_GENTPROT2_MIX_D( ... ) \
 \
-GENTPROT2( float,    scomplex, s, c, tfuncname, varname ) \
-GENTPROT2( scomplex, float,    c, s, tfuncname, varname ) \
+GENTPROT2( float,    scomplex, s, c, __VA_ARGS__ ) \
+GENTPROT2( scomplex, float,    c, s, __VA_ARGS__ ) \
 \
-GENTPROT2( double,   dcomplex, d, z, tfuncname, varname ) \
-GENTPROT2( dcomplex, double,   z, d, tfuncname, varname )
+GENTPROT2( double,   dcomplex, d, z, __VA_ARGS__ ) \
+GENTPROT2( dcomplex, double,   z, d, __VA_ARGS__ )
 
 
 // -- Mixed precision two-operand macro --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTPROT2_MIX_P0( funcname ) \
-\
-GENTPROT2( float,    double,   s, d, funcname ) \
-GENTPROT2( float,    dcomplex, s, z, funcname ) \
-\
-GENTPROT2( double,   float,    d, s, funcname ) \
-GENTPROT2( double,   scomplex, d, c, funcname ) \
-\
-GENTPROT2( scomplex, double,   c, d, funcname ) \
-GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
-\
-GENTPROT2( dcomplex, float,    z, s, funcname ) \
-GENTPROT2( dcomplex, scomplex, z, c, funcname ) \
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \
+#define INSERT_GENTPROT2_MIX_P( ... ) \
 \
-GENTPROT2( float,    double,   s, d, tfuncname, varname ) \
-GENTPROT2( float,    dcomplex, s, z, tfuncname, varname ) \
+GENTPROT2( float,    double,   s, d, __VA_ARGS__ ) \
+GENTPROT2( float,    dcomplex, s, z, __VA_ARGS__ ) \
 \
-GENTPROT2( double,   float,    d, s, tfuncname, varname ) \
-GENTPROT2( double,   scomplex, d, c, tfuncname, varname ) \
+GENTPROT2( double,   float,    d, s, __VA_ARGS__ ) \
+GENTPROT2( double,   scomplex, d, c, __VA_ARGS__ ) \
 \
-GENTPROT2( scomplex, double,   c, d, tfuncname, varname ) \
-GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
+GENTPROT2( scomplex, double,   c, d, __VA_ARGS__ ) \
+GENTPROT2( scomplex, dcomplex, c, z, __VA_ARGS__ ) \
 \
-GENTPROT2( dcomplex, float,    z, s, tfuncname, varname ) \
-GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) \
+GENTPROT2( dcomplex, float,    z, s, __VA_ARGS__ ) \
+GENTPROT2( dcomplex, scomplex, z, c, __VA_ARGS__ ) \
 
 
 // -- Mixed domain/precision (all) two-operand macro --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTPROT2_MIXDP0( funcname ) \
-\
-GENTPROT2( float,    double,   s, d, funcname ) \
-GENTPROT2( float,    scomplex, s, c, funcname ) \
-GENTPROT2( float,    dcomplex, s, z, funcname ) \
-\
-GENTPROT2( double,   float,    d, s, funcname ) \
-GENTPROT2( double,   scomplex, d, c, funcname ) \
-GENTPROT2( double,   dcomplex, d, z, funcname ) \
-\
-GENTPROT2( scomplex, float,    c, s, funcname ) \
-GENTPROT2( scomplex, double,   c, d, funcname ) \
-GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
-\
-GENTPROT2( dcomplex, float,    z, s, funcname ) \
-GENTPROT2( dcomplex, double,   z, d, funcname ) \
-GENTPROT2( dcomplex, scomplex, z, c, funcname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \
+#define INSERT_GENTPROT2_MIX_DP( ... ) \
 \
-GENTPROT2( float,    double,   s, d, tfuncname, varname ) \
-GENTPROT2( float,    scomplex, s, c, tfuncname, varname ) \
-GENTPROT2( float,    dcomplex, s, z, tfuncname, varname ) \
+GENTPROT2( float,    double,   s, d, __VA_ARGS__ ) \
+GENTPROT2( float,    scomplex, s, c, __VA_ARGS__ ) \
+GENTPROT2( float,    dcomplex, s, z, __VA_ARGS__ ) \
 \
-GENTPROT2( double,   float,    d, s, tfuncname, varname ) \
-GENTPROT2( double,   scomplex, d, c, tfuncname, varname ) \
-GENTPROT2( double,   dcomplex, d, z, tfuncname, varname ) \
+GENTPROT2( double,   float,    d, s, __VA_ARGS__ ) \
+GENTPROT2( double,   scomplex, d, c, __VA_ARGS__ ) \
+GENTPROT2( double,   dcomplex, d, z, __VA_ARGS__ ) \
 \
-GENTPROT2( scomplex, float,    c, s, tfuncname, varname ) \
-GENTPROT2( scomplex, double,   c, d, tfuncname, varname ) \
-GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
+GENTPROT2( scomplex, float,    c, s, __VA_ARGS__ ) \
+GENTPROT2( scomplex, double,   c, d, __VA_ARGS__ ) \
+GENTPROT2( scomplex, dcomplex, c, z, __VA_ARGS__ ) \
 \
-GENTPROT2( dcomplex, float,    z, s, tfuncname, varname ) \
-GENTPROT2( dcomplex, double,   z, d, tfuncname, varname ) \
-GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )
+GENTPROT2( dcomplex, float,    z, s, __VA_ARGS__ ) \
+GENTPROT2( dcomplex, double,   z, d, __VA_ARGS__ ) \
+GENTPROT2( dcomplex, scomplex, z, c, __VA_ARGS__ )
 
 
 // -- Basic two-operand with real projection of first operand --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTPROT2R_BASIC0( funcname ) \
+#define INSERT_GENTPROT2R_BASIC( ... ) \
 \
-GENTPROT2R( float,    float,    float,    s, s, s, funcname ) \
-GENTPROT2R( double,   double,   double,   d, d, d, funcname ) \
-GENTPROT2R( scomplex, scomplex, float,    c, c, s, funcname ) \
-GENTPROT2R( dcomplex, dcomplex, double,   z, z, d, funcname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \
-\
-GENTPROT2R( float,    float,    float,    s, s, s, tfuncname, varname ) \
-GENTPROT2R( double,   double,   double,   d, d, d, tfuncname, varname ) \
-GENTPROT2R( scomplex, scomplex, float,    c, c, s, tfuncname, varname ) \
-GENTPROT2R( dcomplex, dcomplex, double,   z, z, d, tfuncname, varname )
+GENTPROT2R( float,    float,    float,    s, s, s, __VA_ARGS__ ) \
+GENTPROT2R( double,   double,   double,   d, d, d, __VA_ARGS__ ) \
+GENTPROT2R( scomplex, scomplex, float,    c, c, s, __VA_ARGS__ ) \
+GENTPROT2R( dcomplex, dcomplex, double,   z, z, d, __VA_ARGS__ )
 
 
 // -- Mixed domain two-operand with real projection of first operand --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \
-\
-GENTPROT2R( float,    scomplex, float,    s, c, s, tfuncname ) \
-GENTPROT2R( scomplex, float,    float,    c, s, s, tfuncname ) \
-\
-GENTPROT2R( double,   dcomplex, double,   d, z, d, tfuncname ) \
-GENTPROT2R( dcomplex, double,   double,   z, d, d, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \
+#define INSERT_GENTPROT2R_MIX_D( ... ) \
 \
-GENTPROT2R( float,    scomplex, float,    s, c, s, tfuncname, varname ) \
-GENTPROT2R( scomplex, float,    float,    c, s, s, tfuncname, varname ) \
+GENTPROT2R( float,    scomplex, float,    s, c, s, __VA_ARGS__ ) \
+GENTPROT2R( scomplex, float,    float,    c, s, s, __VA_ARGS__ ) \
 \
-GENTPROT2R( double,   dcomplex, double,   d, z, d, tfuncname, varname ) \
-GENTPROT2R( dcomplex, double,   double,   z, d, d, tfuncname, varname )
+GENTPROT2R( double,   dcomplex, double,   d, z, d, __VA_ARGS__ ) \
+GENTPROT2R( dcomplex, double,   double,   z, d, d, __VA_ARGS__ )
 
 
 // -- Mixed precision two-operand with real projection of first operand --
 
-// -- (no auxiliary arguments) --
-
-#define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \
-\
-GENTPROT2R( float,    double,   float,    s, d, s, tfuncname ) \
-GENTPROT2R( float,    dcomplex, float,    s, z, s, tfuncname ) \
-\
-GENTPROT2R( double,   float,    double,   d, s, d, tfuncname ) \
-GENTPROT2R( double,   scomplex, double,   d, c, d, tfuncname ) \
-\
-GENTPROT2R( scomplex, double,   float,    c, d, s, tfuncname ) \
-GENTPROT2R( scomplex, dcomplex, float,    c, z, s, tfuncname ) \
-\
-GENTPROT2R( dcomplex, float,    double,   z, s, d, tfuncname ) \
-GENTPROT2R( dcomplex, scomplex, double,   z, c, d, tfuncname )
-
-// -- (one auxiliary argument) --
-
-#define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \
+#define INSERT_GENTPROT2R_MIX_P( ... ) \
 \
-GENTPROT2R( float,    double,   float,    s, d, s, tfuncname, varname ) \
-GENTPROT2R( float,    dcomplex, float,    s, z, s, tfuncname, varname ) \
+GENTPROT2R( float,    double,   float,    s, d, s, __VA_ARGS__ ) \
+GENTPROT2R( float,    dcomplex, float,    s, z, s, __VA_ARGS__ ) \
 \
-GENTPROT2R( double,   float,    double,   d, s, d, tfuncname, varname ) \
-GENTPROT2R( double,   scomplex, double,   d, c, d, tfuncname, varname ) \
+GENTPROT2R( double,   float,    double,   d, s, d, __VA_ARGS__ ) \
+GENTPROT2R( double,   scomplex, double,   d, c, d, __VA_ARGS__ ) \
 \
-GENTPROT2R( scomplex, double,   float,    c, d, s, tfuncname, varname ) \
-GENTPROT2R( scomplex, dcomplex, float,    c, z, s, tfuncname, varname ) \
+GENTPROT2R( scomplex, double,   float,    c, d, s, __VA_ARGS__ ) \
+GENTPROT2R( scomplex, dcomplex, float,    c, z, s, __VA_ARGS__ ) \
 \
-GENTPROT2R( dcomplex, float,    double,   z, s, d, tfuncname, varname ) \
-GENTPROT2R( dcomplex, scomplex, double,   z, c, d, tfuncname, varname )
+GENTPROT2R( dcomplex, float,    double,   z, s, d, __VA_ARGS__ ) \
+GENTPROT2R( dcomplex, scomplex, double,   z, c, d, __VA_ARGS__ )
 
 
diff --git a/frame/include/level0/bb/bli_bcastbbs_mxn.h b/frame/include/level0/bb/bli_bcastbbs_mxn.h
index 84ca4fdc1..d060b767b 100644
--- a/frame/include/level0/bb/bli_bcastbbs_mxn.h
+++ b/frame/include/level0/bb/bli_bcastbbs_mxn.h
@@ -69,6 +69,6 @@ BLIS_INLINE void PASTEMAC(ch,opname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( bcastbbs_mxn )
+INSERT_GENTFUNC_BASIC( bcastbbs_mxn )
 
 #endif
diff --git a/frame/include/level0/bb/bli_scal2bbs_mxn.h b/frame/include/level0/bb/bli_scal2bbs_mxn.h
index 9d0325b5e..c4156713f 100644
--- a/frame/include/level0/bb/bli_scal2bbs_mxn.h
+++ b/frame/include/level0/bb/bli_scal2bbs_mxn.h
@@ -102,7 +102,7 @@ BLIS_INLINE void PASTEMAC(ch,opname) \
 	} \
 }
 
-INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn )
+INSERT_GENTFUNCRO_BASIC( scal2bbs_mxn )
 
 
 #undef  GENTFUNCCO
@@ -199,6 +199,6 @@ BLIS_INLINE void PASTEMAC(ch,opname) \
 	} \
 }
 
-INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn )
+INSERT_GENTFUNCCO( scal2bbs_mxn )
 
 #endif
diff --git a/frame/include/level0/bb/bli_set0bbs_mxn.h b/frame/include/level0/bb/bli_set0bbs_mxn.h
index 3a44883f4..f05121819 100644
--- a/frame/include/level0/bb/bli_set0bbs_mxn.h
+++ b/frame/include/level0/bb/bli_set0bbs_mxn.h
@@ -69,6 +69,6 @@ BLIS_INLINE void PASTEMAC(ch,opname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( set0bbs_mxn )
+INSERT_GENTFUNC_BASIC( set0bbs_mxn )
 
 #endif
diff --git a/frame/include/level0/bli_copys_mxn.h b/frame/include/level0/bli_copys_mxn.h
index 9dc688ac0..cb2513466 100644
--- a/frame/include/level0/bli_copys_mxn.h
+++ b/frame/include/level0/bli_copys_mxn.h
@@ -100,7 +100,7 @@ BLIS_INLINE void PASTEMAC(ch,opname) \
 	PASTEMAC2(ch,ch,opname)( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
 }
 
-INSERT_GENTFUNC_BASIC0( copys_mxn )
+INSERT_GENTFUNC_BASIC( copys_mxn )
 
 
diff --git a/frame/include/level0/bli_scal2s_mxn.h b/frame/include/level0/bli_scal2s_mxn.h
index db17eee4c..d58a37cd0 100644
--- a/frame/include/level0/bli_scal2s_mxn.h
+++ b/frame/include/level0/bli_scal2s_mxn.h
@@ -84,6 +84,6 @@ BLIS_INLINE void PASTEMAC(ch,opname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( scal2s_mxn )
+INSERT_GENTFUNC_BASIC( scal2s_mxn )
 
 #endif
diff --git a/frame/include/level0/bli_set0s_edge.h b/frame/include/level0/bli_set0s_edge.h
index 2c436812e..ca57685fc 100644
--- a/frame/include/level0/bli_set0s_edge.h
+++ b/frame/include/level0/bli_set0s_edge.h
@@ -74,6 +74,6 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0(set0s_edge)
+INSERT_GENTFUNC_BASIC(set0s_edge)
 
 #endif
diff --git a/frame/include/level0/bli_xpbys_mxn.h b/frame/include/level0/bli_xpbys_mxn.h
index 446e45d7f..3fae708cd 100644
--- a/frame/include/level0/bli_xpbys_mxn.h
+++ b/frame/include/level0/bli_xpbys_mxn.h
@@ -120,7 +120,7 @@ BLIS_INLINE void PASTEMAC(ch,opname) \
     PASTEMAC3(ch,ch,ch,opname)( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
 }
 
-INSERT_GENTFUNC_BASIC0( xpbys_mxn )
+INSERT_GENTFUNC_BASIC( xpbys_mxn )
 
 
diff --git a/frame/thread/bli_thread_range_tlb.c b/frame/thread/bli_thread_range_tlb.c
index d0c767373..cdc2950f8 100644
--- a/frame/thread/bli_thread_range_tlb.c
+++ b/frame/thread/bli_thread_range_tlb.c
@@ -179,7 +179,7 @@ dim_t bli_thread_range_tlb_l
 	const dim_t tri_ref_area   = nonrect_area - tri_unref_area;
 	const dim_t total_ref_area = rect_area + tri_ref_area;
 
-	PGUARD printf( "gross area:         %7ld\n", (long) m * n );
+	PGUARD printf( "gross area:         %7ld\n", (long) ( m * n ) );
 	PGUARD printf( "rect_area:          %7ld\n", (long) rect_area );
 	PGUARD printf( "nonrect_area:       %7ld\n", (long) nonrect_area );
 	PGUARD printf( "tri_unref_area:     %7ld\n", (long) tri_unref_area );
@@ -218,7 +218,7 @@ dim_t bli_thread_range_tlb_l
 	const dim_t n_ut_for_me = n_ut_per_thr + ( tid < n_ut_pt_left ? 1 : 0 );
 
 	PGUARD printf( "n_ut_for_me:        %7ld (%ld+%ld)\n", (long) n_ut_for_me,
-	               (long) n_ut_per_thr, (long) n_ut_for_me - n_ut_per_thr );
+	               (long) n_ut_per_thr, (long) ( n_ut_for_me - n_ut_per_thr ) );
 
 	// Compute the number of utiles prior to the current thread's starting
 	// point. This is the sum of all n_ut_for_me for all thread ids less
@@ -446,7 +446,7 @@ dim_t bli_thread_range_tlb_u
 		num_unref_ut += num_unref_ut_j;
 
 		PGUARD printf( "j                   %7ld\n", (long) j );
-		PGUARD printf( "diagoff_j - nr      %7ld\n", (long) diagoff_j - nr );
+		PGUARD printf( "diagoff_j - nr      %7ld\n", (long) ( diagoff_j - nr ) );
 		PGUARD printf( "num_unref_ut_j      %7ld\n", (long) num_unref_ut_j );
 		PGUARD printf( "num_unref_ut        %7ld\n", (long) num_unref_ut );
 		PGUARD printf( "\n" );
@@ -457,7 +457,7 @@ dim_t bli_thread_range_tlb_u
 	const dim_t tri_ref_area   = nonrect_area - tri_unref_area;
 	const dim_t total_ref_area = rect_area + tri_ref_area;
 
-	PGUARD printf( "gross area:         %7ld\n", (long) m * n );
+	PGUARD printf( "gross area:         %7ld\n", (long) ( m * n ) );
 	PGUARD printf( "rect_area:          %7ld\n", (long) rect_area );
 	PGUARD printf( "nonrect_area:       %7ld\n", (long) nonrect_area );
 	PGUARD printf( "tri_unref_area:     %7ld\n", (long) tri_unref_area );
@@ -496,7 +496,7 @@ dim_t bli_thread_range_tlb_u
 	const dim_t n_ut_for_me = n_ut_per_thr + ( tid < n_ut_pt_left ? 1 : 0 );
 
 	PGUARD printf( "n_ut_for_me:        %7ld (%ld+%ld)\n", (long) n_ut_for_me,
-	               (long) n_ut_per_thr, (long) n_ut_for_me - n_ut_per_thr );
+	               (long) n_ut_per_thr, (long) ( n_ut_for_me - n_ut_per_thr ) );
 
 	// Compute the number of utiles prior to the current thread's starting
 	// point. This is the sum of all n_ut_for_me for all thread ids less
@@ -701,7 +701,7 @@ dim_t bli_thread_range_tlb_d
 	const dim_t n_ut_for_me = n_ut_per_thr + ( tid < n_ut_pt_left ? 1 : 0 );
 
 	PGUARD printf( "n_ut_for_me:        %7ld (%ld+%ld)\n", (long) n_ut_for_me,
-	               (long) n_ut_per_thr, (long) n_ut_for_me - n_ut_per_thr );
+	               (long) n_ut_per_thr, (long) ( n_ut_for_me - n_ut_per_thr ) );
 
 	// Compute the number of utiles prior to the current thread's starting
 	// point. This is the sum of all n_ut_for_me for all thread ids less
@@ -903,7 +903,7 @@ dim_t bli_thread_range_tlb_trmm_lx_impl
 	//
 
 	PGUARD printf( "---------------------------\n" );
-	PGUARD printf( "total_utiles:       %7ld\n", (long) m_iter * n_iter );
+	PGUARD printf( "total_utiles:       %7ld\n", (long) ( m_iter * n_iter ) );
 	PGUARD printf( "---------------------------\n" );
 
 	dim_t j_st_cur = 0; dim_t j_en_cur = 0;
@@ -949,7 +949,7 @@ dim_t bli_thread_range_tlb_trmm_lx_impl
 
 				PGUARD printf( "tid_i: %ld  i: %2ld  (1 n_ut_cur: %ld) (uops_alloc: %ld)\n",
 				               (long) tid_i, (long) i, (long) n_ut_for_me,
-				               (long) uops_ta - uops_tba );
+				               (long) ( uops_ta - uops_tba ) );
 
 				if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i; done_e = TRUE;
 				                           break; }
@@ -985,10 +985,10 @@ dim_t bli_thread_range_tlb_trmm_lx_impl
 			n_ut_for_me += j_inc * m_iter;
 
 			PGUARD printf( "tid_i: %ld  advanced to col: %2ld  (uops traversed: %ld)\n",
-			               (long) tid_i, (long) j, (long) uops_per_col * j_inc );
+			               (long) tid_i, (long) j, (long) ( uops_per_col * j_inc ) );
 			PGUARD printf( "tid_i: %ld  j: %2ld  (  n_ut_cur: %ld) (uops_alloc: %ld)\n",
 			               (long) tid_i, (long) j, (long) n_ut_for_me,
-			               (long) uops_ta - uops_tba );
+			               (long) ( uops_ta - uops_tba ) );
 			PGUARD printf( "tid_i: %ld  uops left to alloc: %2ld \n",
 			               (long) tid_i, (long) j_left );
 
@@ -1028,7 +1028,7 @@ dim_t bli_thread_range_tlb_trmm_lx_impl
 
 					PGUARD printf( "tid_i: %ld  i: %2ld  (4 n_ut_cur: %ld) (uops_alloc: %ld)\n",
 					               (long) tid_i, (long) i,
-					               (long) n_ut_for_me, (long) uops_ta - uops_tba );
+					               (long) n_ut_for_me, (long) ( uops_ta - uops_tba ) );
 
 					if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i;
 					                           break; }
@@ -1340,7 +1340,7 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 	//
 
 	PGUARD printf( "---------------------------\n" );
-	PGUARD printf( "total_utiles:       %7ld\n", (long) m_iter * n_iter );
+	PGUARD printf( "total_utiles:       %7ld\n", (long) ( m_iter * n_iter ) );
 	PGUARD printf( "---------------------------\n" );
 
 	dim_t j_st_cur = 0; dim_t j_en_cur = 0;
@@ -1364,7 +1364,7 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 		               (long) tid_i, (long) uops_tba );
 		PGUARD printf( "tid_i: %ld  j: %2ld  (  n_ut_cur: %ld) (uops_alloc: %ld)\n",
 		               (long) tid_i, (long) j, (long) n_ut_for_me,
-		               (long) uops_ta - uops_tba );
+		               (long) ( uops_ta - uops_tba ) );
 
 		// This code begins allocating uops when the starting point is somewhere
 		// after the first microtile. Typically this will not be enough to
@@ -1443,7 +1443,7 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 
 			PGUARD printf( "tid_i: %ld  i: %2ld  (* n_ut_cur: %ld) (uops_alloc: %ld)\n",
 			               (long) tid_i, (long) i-1, (long) n_ut_for_me,
-			               (long) uops_ta - uops_tba );
+			               (long) ( uops_ta - uops_tba ) );
 
 			// If we allocated all utiles in the column (regardless of whether we finished
 			// allocating utiles for the current thread), increment j to the next column,
@@ -1458,7 +1458,7 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 
 				PGUARD printf( "tid_i: %ld  j: %2ld  (! n_ut_cur: %ld) (uops_alloc: %ld)\n",
 				               (long) tid_i, (long) j, (long) n_ut_for_me,
-				               (long) uops_ta - uops_tba );
+				               (long) ( uops_ta - uops_tba ) );
 			}
 
 			#endif
@@ -1500,10 +1500,10 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 				n_ut_for_me += j_inc * m_iter;
 
 				PGUARD printf( "tid_i: %ld  advanced to col: %2ld  (uops traversed: %ld)\n",
-				               (long) tid_i, (long) j, (long) uops_per_col_rect * j_inc );
+				               (long) tid_i, (long) j, (long) ( uops_per_col_rect * j_inc ) );
 				PGUARD printf( "tid_i: %ld  j: %2ld  (1 n_ut_cur: %ld) (uops_alloc: %ld)\n",
 				               (long) tid_i, (long) j, (long) n_ut_for_me,
-				               (long) uops_ta - uops_tba );
+				               (long) ( uops_ta - uops_tba ) );
 				PGUARD printf( "tid_i: %ld  uops left to alloc: %2ld \n",
 				               (long) tid_i, (long) j_left );
 
@@ -1518,7 +1518,7 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 
 					PGUARD printf( "tid_i: %ld  j: %2ld  (2 n_ut_cur: %ld) (uops_alloc: %ld)\n",
 					               (long) tid_i, (long) j, (long) n_ut_for_me,
-					               (long) uops_ta - uops_tba );
+					               (long) ( uops_ta - uops_tba ) );
 				}
 				else if ( j >  n_iter ) bli_abort(); // Safety check; should never execute.
 				else if ( j == n_iter )
@@ -1532,7 +1532,7 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 					search_tri = FALSE;
 					PGUARD printf( "tid_i: %ld  j: %2ld  (3 n_ut_cur: %ld) (uops_alloc: %ld)\n",
 					               (long) tid_i, (long) j, (long) n_ut_for_me,
-					               (long) uops_ta - uops_tba );
+					               (long) ( uops_ta - uops_tba ) );
 				}
 				else if ( j < diagoff_iter )
 				{
@@ -1545,7 +1545,7 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 					search_tri = FALSE;
 					PGUARD printf( "tid_i: %ld  j: %2ld  (4 n_ut_cur: %ld) (uops_alloc: %ld)\n",
 					               (long) tid_i, (long) j, (long)  n_ut_for_me,
-					               (long) uops_ta - uops_tba );
+					               (long) ( uops_ta - uops_tba ) );
 				}
 				else // if ( 0 < uops_tba && j == diagoff_iter && j < n_iter )
 				{
@@ -1556,7 +1556,7 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 					search_tri = TRUE;
 					PGUARD printf( "tid_i: %ld  j: %2ld  (5 n_ut_cur: %ld) (uops_alloc: %ld)\n",
 					               (long) tid_i, (long) j, (long) n_ut_for_me,
-					               (long) uops_ta - uops_tba );
+					               (long) ( uops_ta - uops_tba ) );
 				}
 			}
 			else /* if ( diagoff_iter <= j ) */
@@ -1581,12 +1581,12 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 
 					PGUARD printf( "tid_i: %ld  j: %2ld  (6 n_ut_cur: %ld) (uops_alloc: %ld) (n_uops_j: %ld)\n",
 					               (long) tid_i, (long) j, (long) n_ut_for_me,
-					               (long) uops_ta - uops_tba, (long) n_uops_j );
+					               (long) ( uops_ta - uops_tba ), (long) n_uops_j );
 
 					if ( uops_tba == 0 )
 					{
 						PGUARD printf( "tid_i: %ld  j: %2ld  (7 n_ut_cur: %ld) (uops_alloc: %ld)\n",
-						               (long) tid_i, (long) j, (long) n_ut_for_me, (long) uops_ta - uops_tba );
+						               (long) tid_i, (long) j, (long) n_ut_for_me, (long) ( uops_ta - uops_tba ) );
 						// If advancing over the previous column allocated all of
 						// our uops, then designate the last iteration of the
 						// previous column as the end point.
@@ -1605,13 +1605,13 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 
 						PGUARD printf( "tid_i: %ld  j: %2ld  (8 n_ut_cur: %ld) (uops_alloc: %ld)\n",
 						               (long) tid_i, (long) j, (long) n_ut_for_me,
-						               (long) uops_ta - uops_tba );
+						               (long) ( uops_ta - uops_tba ) );
 					}
 					else // if ( uops_tba < n_uops_j )
 					{
 						PGUARD printf( "tid_i: %ld  j: %2ld  (9 n_ut_cur: %ld) (uops_alloc: %ld)\n",
 						               (long) tid_i, (long) j, (long) n_ut_for_me,
-						               (long) uops_ta - uops_tba );
+						               (long) ( uops_ta - uops_tba ) );
 						// If we can finish allocating all the remaining uops
 						// with the utiles in the current column, then we break
 						// out of the loop without updating j, n_ut_for_me, or
@@ -1631,7 +1631,7 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 
 				PGUARD printf( "tid_i: %ld  j: %2ld  (A n_ut_cur: %ld) (uops_alloc: %ld) (k_iter_j: %ld)\n",
 				               (long) tid_i, (long) j, (long) n_ut_for_me,
-				               (long) uops_ta - uops_tba, (long) k_iter_j );
+				               (long) ( uops_ta - uops_tba ), (long) k_iter_j );
 
 				#if 0
 
@@ -1664,7 +1664,7 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
 
 				PGUARD printf( "tid_i: %ld  i: %2ld  (b n_ut_cur: %ld) (uops_alloc: %ld)\n",
 				               (long) tid_i, (long) i, (long) n_ut_for_me,
-				               (long) uops_ta - uops_tba );
+				               (long) ( uops_ta - uops_tba ) );
 
 				#endif
 			}
diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c
index 8611b9164..faa35e039 100644
--- a/frame/util/bli_util_tapi.c
+++ b/frame/util/bli_util_tapi.c
@@ -78,7 +78,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNCR_BASIC0( asumv )
+INSERT_GENTFUNCR_BASIC( asumv )
 
 
 #undef  GENTFUNC
@@ -114,9 +114,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( mkherm )
-INSERT_GENTFUNC_BASIC0( mksymm )
-INSERT_GENTFUNC_BASIC0( mktrim )
+INSERT_GENTFUNC_BASIC( mkherm )
+INSERT_GENTFUNC_BASIC( mksymm )
+INSERT_GENTFUNC_BASIC( mktrim )
 
 
 #undef  GENTFUNCR
@@ -157,9 +157,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNCR_BASIC0( norm1v )
-INSERT_GENTFUNCR_BASIC0( normfv )
-INSERT_GENTFUNCR_BASIC0( normiv )
+INSERT_GENTFUNCR_BASIC( norm1v )
+INSERT_GENTFUNCR_BASIC( normfv )
+INSERT_GENTFUNCR_BASIC( normiv )
 
 
 #undef  GENTFUNCR
@@ -208,9 +208,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNCR_BASIC0( norm1m )
-INSERT_GENTFUNCR_BASIC0( normfm )
-INSERT_GENTFUNCR_BASIC0( normim )
+INSERT_GENTFUNCR_BASIC( norm1m )
+INSERT_GENTFUNCR_BASIC( normfm )
+INSERT_GENTFUNCR_BASIC( normim )
 
 
 #undef  GENTFUNCR
@@ -265,8 +265,8 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	} \
 }
 
-INSERT_GENTFUNCR_BASIC0( randv )
-INSERT_GENTFUNCR_BASIC0( randnv )
+INSERT_GENTFUNCR_BASIC( randv )
+INSERT_GENTFUNCR_BASIC( randnv )
 
 
 #undef  GENTFUNCR
@@ -331,8 +331,8 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	} \
 }
 
-INSERT_GENTFUNCR_BASIC0( randm )
-INSERT_GENTFUNCR_BASIC0( randnm )
+INSERT_GENTFUNCR_BASIC( randm )
+INSERT_GENTFUNCR_BASIC( randnm )
 
 
 #undef  GENTFUNCR
@@ -370,7 +370,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNCR_BASIC0( sumsqv )
+INSERT_GENTFUNCR_BASIC( sumsqv )
 
 // -----------------------------------------------------------------------------
 
@@ -398,7 +398,7 @@ void PASTEMAC(ch,opname) \
 	*is_eq = PASTEMAC(ch,eq)( chi_conj, *psi ); \
 }
 
-INSERT_GENTFUNC_BASIC0( eqsc )
+INSERT_GENTFUNC_BASIC( eqsc )
 
 
 #undef  GENTFUNC
@@ -430,7 +430,7 @@ void PASTEMAC(ch,opname) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( eqv )
+INSERT_GENTFUNC_BASIC( eqv )
 
 
 #undef  GENTFUNC
@@ -472,7 +472,7 @@ void PASTEMAC(ch,opname) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( eqm )
+INSERT_GENTFUNC_BASIC( eqm )
 
 
 #undef  GENTFUNC
diff --git a/frame/util/bli_util_tapi.h b/frame/util/bli_util_tapi.h
index b720877b5..2ff83aaba 100644
--- a/frame/util/bli_util_tapi.h
+++ b/frame/util/bli_util_tapi.h
@@ -48,7 +48,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROTR_BASIC0( asumv )
+INSERT_GENTPROTR_BASIC( asumv )
 
 
 #undef  GENTPROT
@@ -62,9 +62,9 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( mkherm )
-INSERT_GENTPROT_BASIC0( mksymm )
-INSERT_GENTPROT_BASIC0( mktrim )
+INSERT_GENTPROT_BASIC( mkherm )
+INSERT_GENTPROT_BASIC( mksymm )
+INSERT_GENTPROT_BASIC( mktrim )
 
 
 #undef  GENTPROTR
@@ -78,9 +78,9 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROTR_BASIC0( norm1v )
-INSERT_GENTPROTR_BASIC0( normfv )
-INSERT_GENTPROTR_BASIC0( normiv )
+INSERT_GENTPROTR_BASIC( norm1v )
+INSERT_GENTPROTR_BASIC( normfv )
+INSERT_GENTPROTR_BASIC( normiv )
 
 
 #undef  GENTPROTR
@@ -98,9 +98,9 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROTR_BASIC0( norm1m )
-INSERT_GENTPROTR_BASIC0( normfm )
-INSERT_GENTPROTR_BASIC0( normim )
+INSERT_GENTPROTR_BASIC( norm1m )
+INSERT_GENTPROTR_BASIC( normfm )
+INSERT_GENTPROTR_BASIC( normim )
 
 
 #undef  GENTPROT
@@ -113,8 +113,8 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( randv )
-INSERT_GENTPROT_BASIC0( randnv )
+INSERT_GENTPROT_BASIC( randv )
+INSERT_GENTPROT_BASIC( randnv )
 
 
 #undef  GENTPROT
@@ -130,8 +130,8 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROT_BASIC0( randm )
-INSERT_GENTPROT_BASIC0( randnm )
+INSERT_GENTPROT_BASIC( randm )
+INSERT_GENTPROT_BASIC( randnm )
 
 
 #undef  GENTPROTR
@@ -146,7 +146,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
        BLIS_TAPI_EX_PARAMS  \
      );
 
-INSERT_GENTPROTR_BASIC0( sumsqv )
+INSERT_GENTPROTR_BASIC( sumsqv )
 
 // -----------------------------------------------------------------------------
 
@@ -165,7 +165,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              bool*  is_eq  \
      );
 
-INSERT_GENTPROT_BASIC0( eqsc )
+INSERT_GENTPROT_BASIC( eqsc )
 
 
 #undef  GENTPROT
@@ -180,7 +180,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
               bool*  is_eq  \
       );
 
-INSERT_GENTPROT_BASIC0( eqv )
+INSERT_GENTPROT_BASIC( eqv )
 
 
 #undef  GENTPROT
@@ -199,7 +199,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              bool*   is_eq  \
      );
 
-INSERT_GENTPROT_BASIC0( eqm )
+INSERT_GENTPROT_BASIC( eqm )
 
 
 #undef  GENTPROT
@@ -212,10 +212,10 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              bool*  is  \
      );
 
-INSERT_GENTPROT_BASIC0( ltsc )
-INSERT_GENTPROT_BASIC0( ltesc )
-INSERT_GENTPROT_BASIC0( gtsc )
-INSERT_GENTPROT_BASIC0( gtesc )
+INSERT_GENTPROT_BASIC( ltsc )
+INSERT_GENTPROT_BASIC( ltesc )
+INSERT_GENTPROT_BASIC( gtsc )
+INSERT_GENTPROT_BASIC( gtesc )
 
 
 #undef  GENTPROT
@@ -230,7 +230,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
        const char* s2  \
      );
 
-INSERT_GENTPROT_BASIC0_I( printv )
+INSERT_GENTPROT_BASIC_I( printv )
 
 
 #undef  GENTPROT
@@ -246,7 +246,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
        const char* s2  \
      );
 
-INSERT_GENTPROT_BASIC0_I( printm )
+INSERT_GENTPROT_BASIC_I( printm )
 
 #endif // #ifdef BLIS_TAPI_BASIC
 
diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c
index 3c501d107..227d30c6f 100644
--- a/frame/util/bli_util_unb_var1.c
+++ b/frame/util/bli_util_unb_var1.c
@@ -81,7 +81,7 @@ void PASTEMAC(ch,varname) \
 	PASTEMAC(chr,copys)( absum, *asum ); \
 }
 
-INSERT_GENTFUNCR_BASIC0( asumv_unb_var1 )
+INSERT_GENTFUNCR_BASIC( asumv_unb_var1 )
 
 
 #undef  GENTFUNCR
@@ -137,7 +137,7 @@ void PASTEMAC(ch,varname) \
 	); \
 }
 
-INSERT_GENTFUNCR_BASIC0( mkherm_unb_var1 )
+INSERT_GENTFUNCR_BASIC( mkherm_unb_var1 )
 
 
 #undef  GENTFUNC
@@ -179,7 +179,7 @@ void PASTEMAC(ch,varname) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( mksymm_unb_var1 )
+INSERT_GENTFUNC_BASIC( mksymm_unb_var1 )
 
 
 #undef  GENTFUNC
@@ -224,7 +224,7 @@ void PASTEMAC(ch,varname) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( mktrim_unb_var1 )
+INSERT_GENTFUNC_BASIC( mktrim_unb_var1 )
 
 
 #undef  GENTFUNCR
@@ -262,7 +262,7 @@ void PASTEMAC(ch,varname) \
 	PASTEMAC(chr,copys)( absum, *norm ); \
 }
 
-INSERT_GENTFUNCR_BASIC0( norm1v_unb_var1 )
+INSERT_GENTFUNCR_BASIC( norm1v_unb_var1 )
 
 
 #undef  GENTFUNCR
@@ -485,7 +485,7 @@ void PASTEMAC(ch,varname) \
 	PASTEMAC(chr,copys)( abs_chi1_max, *norm ); \
 }
 
-INSERT_GENTFUNCR_BASIC0( normiv_unb_var1 )
+INSERT_GENTFUNCR_BASIC( normiv_unb_var1 )
 
 
@@ -1170,7 +1170,7 @@ void PASTEMAC(ch,varname) \
 	PASTEMAC(chr,copys)( sumsq_r, *sumsq ); \
 }
 
-INSERT_GENTFUNCR_BASIC0( sumsqv_unb_var1 )
+INSERT_GENTFUNCR_BASIC( sumsqv_unb_var1 )
 
 // -----------------------------------------------------------------------------
 
@@ -1202,7 +1202,7 @@ bool PASTEMAC(ch,opname) \
 	return TRUE; \
 }
 
-INSERT_GENTFUNC_BASIC0( eqv_unb_var1 )
+INSERT_GENTFUNC_BASIC( eqv_unb_var1 )
 
 
 #undef  GENTFUNC
@@ -1326,7 +1326,7 @@ bool PASTEMAC(ch,opname) \
 	return TRUE; \
 }
 
-INSERT_GENTFUNC_BASIC0( eqm_unb_var1 )
+INSERT_GENTFUNC_BASIC( eqm_unb_var1 )
 
 
 #undef  GENTFUNC
@@ -1361,7 +1361,7 @@ void PASTEMAC(ch,opname) \
 	fprintf( file, "%s\n", s2 ); \
 }
 
-INSERT_GENTFUNC_BASIC0_I( fprintv )
+INSERT_GENTFUNC_BASIC_I( fprintv )
 
 
 #undef  GENTFUNC
@@ -1401,5 +1401,5 @@ void PASTEMAC(ch,opname) \
 	fflush( file ); \
 }
 
-INSERT_GENTFUNC_BASIC0_I( fprintm )
+INSERT_GENTFUNC_BASIC_I( fprintm )
 
diff --git a/frame/util/bli_util_unb_var1.h b/frame/util/bli_util_unb_var1.h
index 435efa4ac..978183f96 100644
--- a/frame/util/bli_util_unb_var1.h
+++ b/frame/util/bli_util_unb_var1.h
@@ -49,7 +49,7 @@ void PASTEMAC(ch,varname) \
        rntm_t*  rntm  \
      );
 
-INSERT_GENTPROTR_BASIC0( asumv_unb_var1 )
+INSERT_GENTPROTR_BASIC( asumv_unb_var1 )
 
 
 #undef  GENTPROT
@@ -64,9 +64,9 @@ void PASTEMAC(ch,varname) \
        rntm_t* rntm  \
      );
 
-INSERT_GENTPROT_BASIC0( mkherm_unb_var1 )
-INSERT_GENTPROT_BASIC0( mksymm_unb_var1 )
-INSERT_GENTPROT_BASIC0( mktrim_unb_var1 )
+INSERT_GENTPROT_BASIC( mkherm_unb_var1 )
+INSERT_GENTPROT_BASIC( mksymm_unb_var1 )
+INSERT_GENTPROT_BASIC( mktrim_unb_var1 )
 
 
 #undef  GENTPROTR
@@ -81,9 +81,9 @@ void PASTEMAC(ch,varname) \
        rntm_t*  rntm  \
      );
 
-INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 )
-INSERT_GENTPROTR_BASIC0( normfv_unb_var1 )
-INSERT_GENTPROTR_BASIC0( normiv_unb_var1 )
+INSERT_GENTPROTR_BASIC( norm1v_unb_var1 )
+INSERT_GENTPROTR_BASIC( normfv_unb_var1 )
+INSERT_GENTPROTR_BASIC( normiv_unb_var1 )
 
 
 #undef  GENTPROTR
@@ -102,9 +102,9 @@ void PASTEMAC(ch,varname) \
        rntm_t*  rntm  \
      );
 
-INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 )
-INSERT_GENTPROTR_BASIC0( normfm_unb_var1 )
-INSERT_GENTPROTR_BASIC0( normim_unb_var1 )
+INSERT_GENTPROTR_BASIC( norm1m_unb_var1 )
+INSERT_GENTPROTR_BASIC( normfm_unb_var1 )
+INSERT_GENTPROTR_BASIC( normim_unb_var1 )
 
 
 #undef  GENTPROT
@@ -118,8 +118,8 @@ void PASTEMAC(ch,varname) \
        rntm_t* rntm  \
      );
 
-INSERT_GENTPROT_BASIC0( randv_unb_var1 )
-INSERT_GENTPROT_BASIC0( randnv_unb_var1 )
+INSERT_GENTPROT_BASIC( randv_unb_var1 )
+INSERT_GENTPROT_BASIC( randnv_unb_var1 )
 
 
 #undef  GENTPROT
@@ -136,8 +136,8 @@ void PASTEMAC(ch,varname) \
        rntm_t* rntm  \
      );
 
-INSERT_GENTPROT_BASIC0( randm_unb_var1 )
-INSERT_GENTPROT_BASIC0( randnm_unb_var1 )
+INSERT_GENTPROT_BASIC( randm_unb_var1 )
+INSERT_GENTPROT_BASIC( randnm_unb_var1 )
 
 
 #undef  GENTPROTR
@@ -153,7 +153,7 @@ void PASTEMAC(ch,varname) \
        rntm_t*  rntm  \
      );
 
-INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 )
+INSERT_GENTPROTR_BASIC( sumsqv_unb_var1 )
 
 // -----------------------------------------------------------------------------
 
@@ -168,7 +168,7 @@ bool PASTEMAC(ch,varname) \
        ctype* y, inc_t incy  \
      );
 
-INSERT_GENTPROT_BASIC0( eqv_unb_var1 )
+INSERT_GENTPROT_BASIC( eqv_unb_var1 )
 
 
 #undef  GENTPROT
@@ -186,7 +186,7 @@ bool PASTEMAC(ch,varname) \
        ctype*  y, inc_t rs_y, inc_t cs_y  \
      );
 
-INSERT_GENTPROT_BASIC0( eqm_unb_var1 )
+INSERT_GENTPROT_BASIC( eqm_unb_var1 )
 
 
 #undef  GENTPROT
@@ -202,7 +202,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
        const char*  s2  \
      );
 
-INSERT_GENTPROT_BASIC0_I( fprintv )
+INSERT_GENTPROT_BASIC_I( fprintv )
 
 
 #undef  GENTPROT
@@ -219,6 +219,6 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
        const char*  s2  \
      );
 
-INSERT_GENTPROT_BASIC0_I( fprintm )
+INSERT_GENTPROT_BASIC_I( fprintm )
 
 
diff --git a/ref_kernels/1/bli_addv_ref.c b/ref_kernels/1/bli_addv_ref.c
index 7195db512..e6f9c8a65 100644
--- a/ref_kernels/1/bli_addv_ref.c
+++ b/ref_kernels/1/bli_addv_ref.c
@@ -95,5 +95,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( addv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( addv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1/bli_amaxv_ref.c b/ref_kernels/1/bli_amaxv_ref.c
index 0fef14c73..93591e202 100644
--- a/ref_kernels/1/bli_amaxv_ref.c
+++ b/ref_kernels/1/bli_amaxv_ref.c
@@ -143,5 +143,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	PASTEMAC(i,copys)( i_max_l, *index ); \
 }
 
-INSERT_GENTFUNCR_BASIC2( amaxv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNCR_BASIC( amaxv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1/bli_axpbyv_ref.c b/ref_kernels/1/bli_axpbyv_ref.c
index 8c4340161..92cee8bd9 100644
--- a/ref_kernels/1/bli_axpbyv_ref.c
+++ b/ref_kernels/1/bli_axpbyv_ref.c
@@ -248,5 +248,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( axpbyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( axpbyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1/bli_axpyv_ref.c b/ref_kernels/1/bli_axpyv_ref.c
index 52b6fd44e..663b7dc43 100644
--- a/ref_kernels/1/bli_axpyv_ref.c
+++ b/ref_kernels/1/bli_axpyv_ref.c
@@ -118,5 +118,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( axpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( axpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1/bli_copyv_ref.c b/ref_kernels/1/bli_copyv_ref.c
index 8285d9956..78af7451c 100644
--- a/ref_kernels/1/bli_copyv_ref.c
+++ b/ref_kernels/1/bli_copyv_ref.c
@@ -95,5 +95,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( copyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( copyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1/bli_dotv_ref.c b/ref_kernels/1/bli_dotv_ref.c
index f7436f88e..b972e1cae 100644
--- a/ref_kernels/1/bli_dotv_ref.c
+++ b/ref_kernels/1/bli_dotv_ref.c
@@ -119,5 +119,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	PASTEMAC(ch,copys)( dotxy, *rho ); \
 }
 
-INSERT_GENTFUNC_BASIC2( dotv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( dotv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1/bli_dotxv_ref.c b/ref_kernels/1/bli_dotxv_ref.c
index 56d3f457d..cb9d2a521 100644
--- a/ref_kernels/1/bli_dotxv_ref.c
+++ b/ref_kernels/1/bli_dotxv_ref.c
@@ -130,5 +130,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	PASTEMAC(ch,axpys)( *alpha, dotxy, *rho ); \
 }
 
-INSERT_GENTFUNC_BASIC2( dotxv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( dotxv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1/bli_invertv_ref.c b/ref_kernels/1/bli_invertv_ref.c
index fa914e653..7710de638 100644
--- a/ref_kernels/1/bli_invertv_ref.c
+++ b/ref_kernels/1/bli_invertv_ref.c
@@ -67,5 +67,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( invertv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( invertv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1/bli_invscalv_ref.c b/ref_kernels/1/bli_invscalv_ref.c
index 6096ff20b..d48473b3a 100644
--- a/ref_kernels/1/bli_invscalv_ref.c
+++ b/ref_kernels/1/bli_invscalv_ref.c
@@ -80,5 +80,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( invscalv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( invscalv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1/bli_scal2v_ref.c b/ref_kernels/1/bli_scal2v_ref.c
index 1ac66be07..f35504dd0 100644
--- a/ref_kernels/1/bli_scal2v_ref.c
+++ b/ref_kernels/1/bli_scal2v_ref.c
@@ -136,5 +136,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( scal2v, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( scal2v, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1/bli_scalv_ref.c b/ref_kernels/1/bli_scalv_ref.c
index 84a7ec83f..559189a62 100644
--- a/ref_kernels/1/bli_scalv_ref.c
+++ b/ref_kernels/1/bli_scalv_ref.c
@@ -97,5 +97,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( scalv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( scalv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1/bli_setv_ref.c b/ref_kernels/1/bli_setv_ref.c
index 5e39faff4..f3669aac9 100644
--- a/ref_kernels/1/bli_setv_ref.c
+++ b/ref_kernels/1/bli_setv_ref.c
@@ -97,5 +97,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( setv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( setv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1/bli_subv_ref.c b/ref_kernels/1/bli_subv_ref.c
index 3e5ddcf1b..3820b8051 100644
--- a/ref_kernels/1/bli_subv_ref.c
+++ b/ref_kernels/1/bli_subv_ref.c
@@ -95,5 +95,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( subv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( subv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1/bli_swapv_ref.c b/ref_kernels/1/bli_swapv_ref.c
index 4e6cedd8a..b586ebca2 100644
--- a/ref_kernels/1/bli_swapv_ref.c
+++ b/ref_kernels/1/bli_swapv_ref.c
@@ -70,5 +70,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( swapv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( swapv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1/bli_xpbyv_ref.c b/ref_kernels/1/bli_xpbyv_ref.c
index 46342c182..51ef6c4ff 100644
--- a/ref_kernels/1/bli_xpbyv_ref.c
+++ b/ref_kernels/1/bli_xpbyv_ref.c
@@ -132,5 +132,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( xpbyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( xpbyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1f/bli_axpy2v_ref.c b/ref_kernels/1f/bli_axpy2v_ref.c
index 9fb7a839f..3932a453f 100644
--- a/ref_kernels/1f/bli_axpy2v_ref.c
+++ b/ref_kernels/1f/bli_axpy2v_ref.c
@@ -138,5 +138,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( axpy2v, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( axpy2v, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1f/bli_axpyf_ref.c b/ref_kernels/1f/bli_axpyf_ref.c
index ff8dd6bb4..f4f137bb3 100644
--- a/ref_kernels/1f/bli_axpyf_ref.c
+++ b/ref_kernels/1f/bli_axpyf_ref.c
@@ -126,7 +126,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-//INSERT_GENTFUNC_BASIC2( axpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+//INSERT_GENTFUNC_BASIC( axpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 GENTFUNC( float,    s, axpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 8 )
 GENTFUNC( double,   d, axpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 8 )
 GENTFUNC( scomplex, c, axpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 8 )
diff --git a/ref_kernels/1f/bli_dotaxpyv_ref.c b/ref_kernels/1f/bli_dotaxpyv_ref.c
index 105463ac4..b48cf616d 100644
--- a/ref_kernels/1f/bli_dotaxpyv_ref.c
+++ b/ref_kernels/1f/bli_dotaxpyv_ref.c
@@ -162,5 +162,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( dotaxpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( dotaxpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1f/bli_dotxaxpyf_ref.c b/ref_kernels/1f/bli_dotxaxpyf_ref.c
index 73d2c036b..2c69ffb88 100644
--- a/ref_kernels/1f/bli_dotxaxpyf_ref.c
+++ b/ref_kernels/1f/bli_dotxaxpyf_ref.c
@@ -203,7 +203,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-//INSERT_GENTFUNC_BASIC2( dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+//INSERT_GENTFUNC_BASIC( dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 GENTFUNC( float,    s, dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4 )
 GENTFUNC( double,   d, dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4 )
 GENTFUNC( scomplex, c, dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4 )
diff --git a/ref_kernels/1f/bli_dotxf_ref.c b/ref_kernels/1f/bli_dotxf_ref.c
index 54fcd3ed6..b80b7154e 100644
--- a/ref_kernels/1f/bli_dotxf_ref.c
+++ b/ref_kernels/1f/bli_dotxf_ref.c
@@ -141,7 +141,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-//INSERT_GENTFUNC_BASIC2( dotxf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+//INSERT_GENTFUNC_BASIC( dotxf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 GENTFUNC( float,    s, dotxf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 6 )
 GENTFUNC( double,   d, dotxf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 6 )
 GENTFUNC( scomplex, c, dotxf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 6 )
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
index c3385032e..ae3ae685c 100644
--- a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
+++ b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
@@ -340,6 +340,6 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNCCO_BASIC4( packm_mrxmr_diag_1er, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-INSERT_GENTFUNCCO_BASIC4( packm_nrxnr_diag_1er, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNCCO( packm_mrxmr_diag_1er, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNCCO( packm_nrxnr_diag_1er, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ref.c
index 1285f82da..f21f18861 100644
--- a/ref_kernels/1m/bli_packm_cxc_diag_ref.c
+++ b/ref_kernels/1m/bli_packm_cxc_diag_ref.c
@@ -171,6 +171,6 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		PASTEMAC(ch,set1s)( *(pi1 + mnk*(dfac + ldp) + d) ); \
 }
 
-INSERT_GENTFUNC_BASIC4( packm_mrxmr_diag, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-INSERT_GENTFUNC_BASIC4( packm_nrxnr_diag, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( packm_mrxmr_diag, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( packm_nrxnr_diag, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1m/bli_packm_cxk_1er_ref.c b/ref_kernels/1m/bli_packm_cxk_1er_ref.c
index b77ef6965..e4581a097 100644
--- a/ref_kernels/1m/bli_packm_cxk_1er_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_1er_ref.c
@@ -181,6 +181,6 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNCCO_BASIC4( packm_mrxk_1er, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-INSERT_GENTFUNCCO_BASIC4( packm_nrxk_1er, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNCCO( packm_mrxk_1er, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNCCO( packm_nrxk_1er, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c
index b49856a21..ce7f8740b 100644
--- a/ref_kernels/1m/bli_packm_cxk_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_ref.c
@@ -104,6 +104,6 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC4( packm_mrxk, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-INSERT_GENTFUNC_BASIC4( packm_nrxk, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( packm_mrxk, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( packm_nrxk, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1m/bli_unpackm_cxk_ref.c b/ref_kernels/1m/bli_unpackm_cxk_ref.c
index 5c7c9c430..9d86467ba 100644
--- a/ref_kernels/1m/bli_unpackm_cxk_ref.c
+++ b/ref_kernels/1m/bli_unpackm_cxk_ref.c
@@ -94,6 +94,6 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC4( unpackm_mrxk, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-INSERT_GENTFUNC_BASIC4( unpackm_nrxk, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( unpackm_mrxk, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( unpackm_nrxk, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c
index 6bab6c812..74c596d39 100644
--- a/ref_kernels/3/bli_gemm_ref.c
+++ b/ref_kernels/3/bli_gemm_ref.c
@@ -139,7 +139,7 @@ static void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( gemm_gen, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( gemm_gen, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
 // An implementation that attempts to facilitate emission of vectorized
 // instructions via constant loop bounds + #pragma omp simd directives.
@@ -297,6 +297,6 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
 
diff --git a/ref_kernels/3/bli_gemmsup_ref.c b/ref_kernels/3/bli_gemmsup_ref.c
index 934fb10ad..75c506167 100644
--- a/ref_kernels/3/bli_gemmsup_ref.c
+++ b/ref_kernels/3/bli_gemmsup_ref.c
@@ -243,7 +243,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( gemmsup_r, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( gemmsup_r, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
 //
 // -- Column storage case ------------------------------------------------------
@@ -454,11 +454,11 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC2( gemmsup_c, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( gemmsup_c, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
 //
 // -- General storage case -----------------------------------------------------
 //
 
-INSERT_GENTFUNC_BASIC2( gemmsup_g, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC_BASIC( gemmsup_g, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c
index c57ea5ae8..7fa56457b 100644
--- a/ref_kernels/3/bli_gemmtrsm_ref.c
+++ b/ref_kernels/3/bli_gemmtrsm_ref.c
@@ -178,6 +178,6 @@ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_i after", k+3, 8, \
 */ \
 }
 
-INSERT_GENTFUNC_BASIC3( gemmtrsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR )
-INSERT_GENTFUNC_BASIC3( gemmtrsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR )
+INSERT_GENTFUNC_BASIC( gemmtrsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR )
+INSERT_GENTFUNC_BASIC( gemmtrsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR )
 
diff --git a/ref_kernels/3/bli_trsm_ref.c b/ref_kernels/3/bli_trsm_ref.c
index 547582190..b8459dd35 100644
--- a/ref_kernels/3/bli_trsm_ref.c
+++ b/ref_kernels/3/bli_trsm_ref.c
@@ -119,9 +119,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
 }
 
 #ifdef BLIS_ENABLE_TRSM_PREINVERSION
-INSERT_GENTFUNC_BASIC3( trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals )
+INSERT_GENTFUNC_BASIC( trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals )
 #else
-INSERT_GENTFUNC_BASIC3( trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals )
+INSERT_GENTFUNC_BASIC( trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals )
 #endif
 
 
@@ -207,8 +207,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 }
 
 #ifdef BLIS_ENABLE_TRSM_PREINVERSION
-INSERT_GENTFUNC_BASIC3( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals )
+INSERT_GENTFUNC_BASIC( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals )
 #else
-INSERT_GENTFUNC_BASIC3( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals )
+INSERT_GENTFUNC_BASIC( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals )
 #endif
 
diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c
index 41135f5ea..ff1cfd9a6 100644
--- a/ref_kernels/bli_cntx_ref.c
+++ b/ref_kernels/bli_cntx_ref.c
@@ -50,8 +50,8 @@
 // Define a prototype-inserting template that uses an arbitrary prototype-
 // generating macro.
 
-#undef  INSERT_PROTMAC_BASIC0
-#define INSERT_PROTMAC_BASIC0( protmac, kername ) \
+#undef  INSERT_PROTMAC_BASIC
+#define INSERT_PROTMAC_BASIC( protmac, kername ) \
 \
 protmac( float,    s, kername ) \
 protmac( double,   d, kername ) \
@@ -72,11 +72,11 @@ protmac( dcomplex, z, kername )
 // Instantiate prototypes for above functions using the pre-defined level-3
 // microkernel prototype-generating macros.
 
-INSERT_PROTMAC_BASIC0( GEMM_UKR_PROT,     gemm_ukr_name )
-INSERT_PROTMAC_BASIC0( GEMMTRSM_UKR_PROT, gemmtrsm_l_ukr_name )
-INSERT_PROTMAC_BASIC0( GEMMTRSM_UKR_PROT, gemmtrsm_u_ukr_name )
-INSERT_PROTMAC_BASIC0( TRSM_UKR_PROT,     trsm_l_ukr_name )
-INSERT_PROTMAC_BASIC0( TRSM_UKR_PROT,     trsm_u_ukr_name )
+INSERT_PROTMAC_BASIC( GEMM_UKR_PROT,     gemm_ukr_name )
+INSERT_PROTMAC_BASIC( GEMMTRSM_UKR_PROT, gemmtrsm_l_ukr_name )
+INSERT_PROTMAC_BASIC( GEMMTRSM_UKR_PROT, gemmtrsm_u_ukr_name )
+INSERT_PROTMAC_BASIC( TRSM_UKR_PROT,     trsm_l_ukr_name )
+INSERT_PROTMAC_BASIC( TRSM_UKR_PROT,     trsm_u_ukr_name )
 
 
 // -- Level-3 virtual micro-kernel prototype redefinitions ---------------------
@@ -96,11 +96,11 @@ INSERT_PROTMAC_BASIC0( TRSM_UKR_PROT,     trsm_u_ukr_name )
 
 // -- 1m --
 
-INSERT_PROTMAC_BASIC0( GEMM_UKR_PROT,     gemm1m_ukr_name )
-INSERT_PROTMAC_BASIC0( GEMMTRSM_UKR_PROT, gemmtrsm1m_l_ukr_name )
-INSERT_PROTMAC_BASIC0( GEMMTRSM_UKR_PROT, gemmtrsm1m_u_ukr_name )
-INSERT_PROTMAC_BASIC0( TRSM_UKR_PROT,     trsm1m_l_ukr_name )
-INSERT_PROTMAC_BASIC0( TRSM_UKR_PROT,     trsm1m_u_ukr_name )
+INSERT_PROTMAC_BASIC( GEMM_UKR_PROT,     gemm1m_ukr_name )
+INSERT_PROTMAC_BASIC( GEMMTRSM_UKR_PROT, gemmtrsm1m_l_ukr_name )
+INSERT_PROTMAC_BASIC( GEMMTRSM_UKR_PROT, gemmtrsm1m_u_ukr_name )
+INSERT_PROTMAC_BASIC( TRSM_UKR_PROT,     trsm1m_l_ukr_name )
+INSERT_PROTMAC_BASIC( TRSM_UKR_PROT,     trsm1m_u_ukr_name )
 
 
 // -- Level-3 small/unpacked micro-kernel prototype definitions ----------------
@@ -116,11 +116,11 @@ INSERT_PROTMAC_BASIC0( TRSM_UKR_PROT,     trsm1m_u_ukr_name )
 // Instantiate prototypes for above functions using the pre-defined gemmsup
 // kernel prototype-generating macros.
 
-INSERT_PROTMAC_BASIC0( GEMMSUP_KER_PROT, gemmsup_rv_ukr_name )
-INSERT_PROTMAC_BASIC0( GEMMSUP_KER_PROT, gemmsup_rg_ukr_name )
-INSERT_PROTMAC_BASIC0( GEMMSUP_KER_PROT, gemmsup_cv_ukr_name )
-INSERT_PROTMAC_BASIC0( GEMMSUP_KER_PROT, gemmsup_cg_ukr_name )
-INSERT_PROTMAC_BASIC0( GEMMSUP_KER_PROT, gemmsup_gx_ukr_name )
+INSERT_PROTMAC_BASIC( GEMMSUP_KER_PROT, gemmsup_rv_ukr_name )
+INSERT_PROTMAC_BASIC( GEMMSUP_KER_PROT, gemmsup_rg_ukr_name )
+INSERT_PROTMAC_BASIC( GEMMSUP_KER_PROT, gemmsup_cv_ukr_name )
+INSERT_PROTMAC_BASIC( GEMMSUP_KER_PROT, gemmsup_cg_ukr_name )
+INSERT_PROTMAC_BASIC( GEMMSUP_KER_PROT, gemmsup_gx_ukr_name )
 
 
 // -- Level-1m (packm/unpackm) kernel prototype redefinitions ------------------
@@ -141,16 +141,16 @@ INSERT_PROTMAC_BASIC0( GEMMSUP_KER_PROT, gemmsup_gx_ukr_name )
 // Instantiate prototypes for above functions using the pre-defined packm
 // kernel prototype-generating macros.
 
-INSERT_PROTMAC_BASIC0( PACKM_KER_PROT,      packm_mrxk_ker_name )
-INSERT_PROTMAC_BASIC0( PACKM_KER_PROT,      packm_nrxk_ker_name )
-INSERT_PROTMAC_BASIC0( PACKM_KER_PROT,      packm_mrxk_1er_ker_name )
-INSERT_PROTMAC_BASIC0( PACKM_KER_PROT,      packm_nrxk_1er_ker_name )
-INSERT_PROTMAC_BASIC0( PACKM_DIAG_KER_PROT, packm_mrxmr_diag_ker_name )
-INSERT_PROTMAC_BASIC0( PACKM_DIAG_KER_PROT, packm_nrxnr_diag_ker_name )
-INSERT_PROTMAC_BASIC0( PACKM_DIAG_KER_PROT, packm_mrxmr_diag_1er_ker_name )
-INSERT_PROTMAC_BASIC0( PACKM_DIAG_KER_PROT, packm_nrxnr_diag_1er_ker_name )
-INSERT_PROTMAC_BASIC0( UNPACKM_KER_PROT,    unpackm_mrxk_ker_name )
-INSERT_PROTMAC_BASIC0( UNPACKM_KER_PROT,    unpackm_nrxk_ker_name )
+INSERT_PROTMAC_BASIC( PACKM_KER_PROT,      packm_mrxk_ker_name )
+INSERT_PROTMAC_BASIC( PACKM_KER_PROT,      packm_nrxk_ker_name )
+INSERT_PROTMAC_BASIC( PACKM_KER_PROT,      packm_mrxk_1er_ker_name )
+INSERT_PROTMAC_BASIC( PACKM_KER_PROT,      packm_nrxk_1er_ker_name )
+INSERT_PROTMAC_BASIC( PACKM_DIAG_KER_PROT, packm_mrxmr_diag_ker_name )
+INSERT_PROTMAC_BASIC( PACKM_DIAG_KER_PROT, packm_nrxnr_diag_ker_name )
+INSERT_PROTMAC_BASIC( PACKM_DIAG_KER_PROT, packm_mrxmr_diag_1er_ker_name )
+INSERT_PROTMAC_BASIC( PACKM_DIAG_KER_PROT, packm_nrxnr_diag_1er_ker_name )
+INSERT_PROTMAC_BASIC( UNPACKM_KER_PROT,    unpackm_mrxk_ker_name )
+INSERT_PROTMAC_BASIC( UNPACKM_KER_PROT,    unpackm_nrxk_ker_name )
 
 
 // -- Level-1f kernel prototype redefinitions ----------------------------------
@@ -166,11 +166,11 @@ INSERT_PROTMAC_BASIC0( UNPACKM_KER_PROT,    unpackm_nrxk_ker_name )
 // Instantiate prototypes for above functions using the pre-defined level-1f
 // kernel prototype-generating macros.
 
-INSERT_PROTMAC_BASIC0( AXPY2V_KER_PROT,     axpy2v_ker_name )
-INSERT_PROTMAC_BASIC0( AXPYF_KER_PROT,      axpyf_ker_name )
-INSERT_PROTMAC_BASIC0( DOTAXPYV_KER_PROT,   dotaxpyv_ker_name )
-INSERT_PROTMAC_BASIC0( DOTXAXPYF_KER_PROT,  dotxaxpyf_ker_name )
-INSERT_PROTMAC_BASIC0( DOTXF_KER_PROT,      dotxf_ker_name )
+INSERT_PROTMAC_BASIC( AXPY2V_KER_PROT,     axpy2v_ker_name )
+INSERT_PROTMAC_BASIC( AXPYF_KER_PROT,      axpyf_ker_name )
+INSERT_PROTMAC_BASIC( DOTAXPYV_KER_PROT,   dotaxpyv_ker_name )
+INSERT_PROTMAC_BASIC( DOTXAXPYF_KER_PROT,  dotxaxpyf_ker_name )
+INSERT_PROTMAC_BASIC( DOTXF_KER_PROT,      dotxf_ker_name )
 
 
 // -- Level-1v kernel prototype redefinitions ----------------------------------
@@ -196,21 +196,21 @@ INSERT_PROTMAC_BASIC0( DOTXF_KER_PROT,      dotxf_ker_name )
 // Instantiate prototypes for above functions using the pre-defined level-1v
 // kernel prototype-generating macros.
 
-INSERT_PROTMAC_BASIC0( ADDV_KER_PROT,     addv_ker_name )
-INSERT_PROTMAC_BASIC0( AMAXV_KER_PROT,    amaxv_ker_name )
-INSERT_PROTMAC_BASIC0( AXPBYV_KER_PROT,   axpbyv_ker_name )
-INSERT_PROTMAC_BASIC0( AXPYV_KER_PROT,    axpyv_ker_name )
-INSERT_PROTMAC_BASIC0( COPYV_KER_PROT,    copyv_ker_name )
-INSERT_PROTMAC_BASIC0( DOTV_KER_PROT,     dotv_ker_name )
-INSERT_PROTMAC_BASIC0( DOTXV_KER_PROT,    dotxv_ker_name )
-INSERT_PROTMAC_BASIC0( INVERTV_KER_PROT,  invertv_ker_name )
-INSERT_PROTMAC_BASIC0( INVSCALV_KER_PROT, invscalv_ker_name )
-INSERT_PROTMAC_BASIC0( SCALV_KER_PROT,    scalv_ker_name )
-INSERT_PROTMAC_BASIC0( SCAL2V_KER_PROT,   scal2v_ker_name )
-INSERT_PROTMAC_BASIC0( SETV_KER_PROT,     setv_ker_name )
-INSERT_PROTMAC_BASIC0( SUBV_KER_PROT,     subv_ker_name )
-INSERT_PROTMAC_BASIC0( SWAPV_KER_PROT,    swapv_ker_name )
-INSERT_PROTMAC_BASIC0( XPBYV_KER_PROT,    xpbyv_ker_name )
+INSERT_PROTMAC_BASIC( ADDV_KER_PROT,     addv_ker_name )
+INSERT_PROTMAC_BASIC( AMAXV_KER_PROT,    amaxv_ker_name )
+INSERT_PROTMAC_BASIC( AXPBYV_KER_PROT,   axpbyv_ker_name )
+INSERT_PROTMAC_BASIC( AXPYV_KER_PROT,    axpyv_ker_name )
+INSERT_PROTMAC_BASIC( COPYV_KER_PROT,    copyv_ker_name )
+INSERT_PROTMAC_BASIC( DOTV_KER_PROT,     dotv_ker_name )
+INSERT_PROTMAC_BASIC( DOTXV_KER_PROT,    dotxv_ker_name )
+INSERT_PROTMAC_BASIC( INVERTV_KER_PROT,  invertv_ker_name )
+INSERT_PROTMAC_BASIC( INVSCALV_KER_PROT, invscalv_ker_name )
+INSERT_PROTMAC_BASIC( SCALV_KER_PROT,    scalv_ker_name )
+INSERT_PROTMAC_BASIC( SCAL2V_KER_PROT,   scal2v_ker_name )
+INSERT_PROTMAC_BASIC( SETV_KER_PROT,     setv_ker_name )
+INSERT_PROTMAC_BASIC( SUBV_KER_PROT,     subv_ker_name )
+INSERT_PROTMAC_BASIC( SWAPV_KER_PROT,    swapv_ker_name )
+INSERT_PROTMAC_BASIC( XPBYV_KER_PROT,    xpbyv_ker_name )
 
 
diff --git a/ref_kernels/ind/bli_gemm1m_ref.c b/ref_kernels/ind/bli_gemm1m_ref.c
index 424b9c44f..99dea36d4 100644
--- a/ref_kernels/ind/bli_gemm1m_ref.c
+++ b/ref_kernels/ind/bli_gemm1m_ref.c
@@ -245,5 +245,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNCCO_BASIC2( gemm1m, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNCCO( gemm1m, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
index 903173e9d..a0aa9597b 100644
--- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c
+++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
@@ -290,5 +290,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNCCO_BASIC3( gemmtrsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR )
-INSERT_GENTFUNCCO_BASIC3( gemmtrsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR )
+INSERT_GENTFUNCCO( gemmtrsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR )
+INSERT_GENTFUNCCO( gemmtrsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR )
diff --git a/ref_kernels/ind/bli_trsm1m_ref.c b/ref_kernels/ind/bli_trsm1m_ref.c
index caff4688d..05e193a09 100644
--- a/ref_kernels/ind/bli_trsm1m_ref.c
+++ b/ref_kernels/ind/bli_trsm1m_ref.c
@@ -247,9 +247,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
 }
 
 #ifdef BLIS_ENABLE_TRSM_PREINVERSION
-INSERT_GENTFUNCCO_BASIC3( trsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scalris )
+INSERT_GENTFUNCCO( trsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scalris )
 #else
-INSERT_GENTFUNCCO_BASIC3( trsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalris )
+INSERT_GENTFUNCCO( trsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalris )
 #endif
 
 
@@ -465,7 +465,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 }
 
 #ifdef BLIS_ENABLE_TRSM_PREINVERSION
-INSERT_GENTFUNCCO_BASIC3( trsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scalris )
+INSERT_GENTFUNCCO( trsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scalris )
 #else
-INSERT_GENTFUNCCO_BASIC3( trsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalris )
+INSERT_GENTFUNCCO( trsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalris )
 #endif
diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c
index e0fb5bb8a..2b4bffde1 100644
--- a/sandbox/gemmlike/bls_gemm.c
+++ b/sandbox/gemmlike/bls_gemm.c
@@ -284,7 +284,7 @@ void PASTECH2(bls_,ch,opname) \
 	); \
 }
 
-//INSERT_GENTFUNC_BASIC0( gemm )
+//INSERT_GENTFUNC_BASIC( gemm )
 GENTFUNC( float,    s, gemm )
 GENTFUNC( double,   d, gemm )
 GENTFUNC( scomplex, c, gemm )
diff --git a/sandbox/gemmlike/bls_gemm.h b/sandbox/gemmlike/bls_gemm.h
index b8dba9cfd..3dadacfd0 100644
--- a/sandbox/gemmlike/bls_gemm.h
+++ b/sandbox/gemmlike/bls_gemm.h
@@ -93,7 +93,7 @@ void PASTECH2(bls_,ch,opname) \
        ctype*  c, inc_t rs_c, inc_t cs_c  \
      );
 
-//INSERT_GENTPROT_BASIC0( gemm )
+//INSERT_GENTPROT_BASIC( gemm )
 GENTPROT( float,    s, gemm )
 GENTPROT( double,   d, gemm )
 GENTPROT( scomplex, c, gemm )
diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c
index 189ada459..785f0836e 100644
--- a/sandbox/gemmlike/bls_gemm_bp_var1.c
+++ b/sandbox/gemmlike/bls_gemm_bp_var1.c
@@ -388,7 +388,7 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: c ", mr_cur, nr_cur, c_ir, rs_c, cs
 */ \
 }
 
-//INSERT_GENTFUNC_BASIC0( gemm_bp_var1 )
+//INSERT_GENTFUNC_BASIC( gemm_bp_var1 )
 GENTFUNC( float,    s, gemm_bp_var1 )
 GENTFUNC( double,   d, gemm_bp_var1 )
 GENTFUNC( scomplex, c, gemm_bp_var1 )
diff --git a/sandbox/gemmlike/bls_gemm_var.h b/sandbox/gemmlike/bls_gemm_var.h
index 0a41afde4..f7b072558 100644
--- a/sandbox/gemmlike/bls_gemm_var.h
+++ b/sandbox/gemmlike/bls_gemm_var.h
@@ -77,7 +77,7 @@ void PASTECH2(bls_,ch,varname) \
        thrinfo_t* restrict thread  \
      );
 
-//INSERT_GENTPROT_BASIC0( gemm_bp_var1 )
+//INSERT_GENTPROT_BASIC( gemm_bp_var1 )
 GENTPROT( float,    s, gemm_bp_var1 )
 GENTPROT( double,   d, gemm_bp_var1 )
 GENTPROT( scomplex, c, gemm_bp_var1 )
@@ -107,7 +107,7 @@ void PASTECH2(bls_,ch,varname) \
        cntx_t*    restrict cntx  \
      );
 
-//INSERT_GENTPROT_BASIC0( gemm_kernel )
+//INSERT_GENTPROT_BASIC( gemm_kernel )
 GENTPROT( float,    s, gemm_kernel )
 GENTPROT( double,   d, gemm_kernel )
 GENTPROT( scomplex, c, gemm_kernel )
diff --git a/sandbox/gemmlike/bls_l3_packm_a.c b/sandbox/gemmlike/bls_l3_packm_a.c
index 742c78bfb..e6115e340 100644
--- a/sandbox/gemmlike/bls_l3_packm_a.c
+++ b/sandbox/gemmlike/bls_l3_packm_a.c
@@ -158,7 +158,7 @@ void PASTECH2(bls_,ch,opname) \
 	} \
 }
 
-//INSERT_GENTFUNC_BASIC0( packm_init_mem_a )
+//INSERT_GENTFUNC_BASIC( packm_init_mem_a )
 GENTFUNC( float,    s, packm_init_mem_a )
 GENTFUNC( double,   d, packm_init_mem_a )
 GENTFUNC( scomplex, c, packm_init_mem_a )
@@ -207,7 +207,7 @@ void PASTECH2(bls_,ch,opname) \
 	*p = bli_mem_buffer( mem ); \
 }
 
-//INSERT_GENTFUNC_BASIC0( packm_init_a )
+//INSERT_GENTFUNC_BASIC( packm_init_a )
 GENTFUNC( float,    s, packm_init_a )
 GENTFUNC( double,   d, packm_init_a )
 GENTFUNC( scomplex, c, packm_init_a )
@@ -283,7 +283,7 @@ void PASTECH2(bls_,ch,opname) \
 	bli_thrinfo_barrier( thread ); \
 }
 
-//INSERT_GENTFUNC_BASIC0( packm_a )
+//INSERT_GENTFUNC_BASIC( packm_a )
 GENTFUNC( float,    s, packm_a )
 GENTFUNC( double,   d, packm_a )
 GENTFUNC( scomplex, c, packm_a )
diff --git a/sandbox/gemmlike/bls_l3_packm_a.h b/sandbox/gemmlike/bls_l3_packm_a.h
index 2ab53dcbf..e5ecf9eab 100644
--- a/sandbox/gemmlike/bls_l3_packm_a.h
+++ b/sandbox/gemmlike/bls_l3_packm_a.h
@@ -44,7 +44,7 @@ void PASTECH2(bls_,ch,opname) \
        thrinfo_t* restrict thread  \
      ); \
 
-//INSERT_GENTPROT_BASIC0( packm_init_mem_a )
+//INSERT_GENTPROT_BASIC( packm_init_mem_a )
 GENTPROT( float,    s, packm_init_mem_a )
 GENTPROT( double,   d, packm_init_mem_a )
 GENTPROT( scomplex, c, packm_init_mem_a )
@@ -67,7 +67,7 @@ void PASTECH2(bls_,ch,opname) \
        mem_t*  restrict mem  \
      ); \
 
-//INSERT_GENTPROT_BASIC0( packm_init_a )
+//INSERT_GENTPROT_BASIC( packm_init_a )
 GENTPROT( float,    s, packm_init_a )
 GENTPROT( double,   d, packm_init_a )
 GENTPROT( scomplex, c, packm_init_a )
@@ -93,7 +93,7 @@ void PASTECH2(bls_,ch,opname) \
        thrinfo_t* restrict thread  \
      ); \
 
-//INSERT_GENTPROT_BASIC0( packm_a )
+//INSERT_GENTPROT_BASIC( packm_a )
 GENTPROT( float,    s, packm_a )
 GENTPROT( double,   d, packm_a )
 GENTPROT( scomplex, c, packm_a )
diff --git a/sandbox/gemmlike/bls_l3_packm_b.c b/sandbox/gemmlike/bls_l3_packm_b.c
index db6bca8fc..7be3482f9 100644
--- a/sandbox/gemmlike/bls_l3_packm_b.c
+++ b/sandbox/gemmlike/bls_l3_packm_b.c
@@ -158,7 +158,7 @@ void PASTECH2(bls_,ch,opname) \
 	} \
 }
 
-//INSERT_GENTFUNC_BASIC0( packm_init_mem_b )
+//INSERT_GENTFUNC_BASIC( packm_init_mem_b )
 GENTFUNC( float,    s, packm_init_mem_b )
 GENTFUNC( double,   d, packm_init_mem_b )
 GENTFUNC( scomplex, c, packm_init_mem_b )
@@ -207,7 +207,7 @@ void PASTECH2(bls_,ch,opname) \
 	*p = bli_mem_buffer( mem ); \
 }
 
-//INSERT_GENTFUNC_BASIC0( packm_init_b )
+//INSERT_GENTFUNC_BASIC( packm_init_b )
 GENTFUNC( float,    s, packm_init_b )
 GENTFUNC( double,   d, packm_init_b )
 GENTFUNC( scomplex, c, packm_init_b )
@@ -283,7 +283,7 @@ void PASTECH2(bls_,ch,opname) \
 	bli_thrinfo_barrier( thread ); \
 }
 
-//INSERT_GENTFUNC_BASIC0( packm_b )
+//INSERT_GENTFUNC_BASIC( packm_b )
 GENTFUNC( float,    s, packm_b )
 GENTFUNC( double,   d, packm_b )
 GENTFUNC( scomplex, c, packm_b )
diff --git a/sandbox/gemmlike/bls_l3_packm_b.h b/sandbox/gemmlike/bls_l3_packm_b.h
index 791cf9b71..c92f63f1b 100644
--- a/sandbox/gemmlike/bls_l3_packm_b.h
+++ b/sandbox/gemmlike/bls_l3_packm_b.h
@@ -44,7 +44,7 @@ void PASTECH2(bls_,ch,opname) \
        thrinfo_t* restrict thread  \
      ); \
 
-//INSERT_GENTPROT_BASIC0( packm_init_mem_b )
+//INSERT_GENTPROT_BASIC( packm_init_mem_b )
 GENTPROT( float,    s, packm_init_mem_b )
 GENTPROT( double,   d, packm_init_mem_b )
 GENTPROT( scomplex, c, packm_init_mem_b )
@@ -67,7 +67,7 @@ void PASTECH2(bls_,ch,opname) \
        mem_t*  restrict mem  \
      ); \
 
-//INSERT_GENTPROT_BASIC0( packm_init_b )
+//INSERT_GENTPROT_BASIC( packm_init_b )
 GENTPROT( float,    s, packm_init_b )
 GENTPROT( double,   d, packm_init_b )
 GENTPROT( scomplex, c, packm_init_b )
@@ -93,7 +93,7 @@ void PASTECH2(bls_,ch,opname) \
        thrinfo_t* restrict thread  \
      ); \
 
-//INSERT_GENTPROT_BASIC0( packm_b )
+//INSERT_GENTPROT_BASIC( packm_b )
 GENTPROT( float,    s, packm_b )
 GENTPROT( double,   d, packm_b )
 GENTPROT( scomplex, c, packm_b )
diff --git a/sandbox/gemmlike/bls_l3_packm_var.h b/sandbox/gemmlike/bls_l3_packm_var.h
index 4c6db2cac..e686e64fc 100644
--- a/sandbox/gemmlike/bls_l3_packm_var.h
+++ b/sandbox/gemmlike/bls_l3_packm_var.h
@@ -55,19 +55,19 @@ void PASTECH2(bls_,ch,varname) \
        thrinfo_t* restrict thread  \
      );
 
-//INSERT_GENTPROT_BASIC0( packm_var1 )
+//INSERT_GENTPROT_BASIC( packm_var1 )
 GENTPROT( float,    s, packm_var1 )
 GENTPROT( double,   d, packm_var1 )
 GENTPROT( scomplex, c, packm_var1 )
 GENTPROT( dcomplex, z, packm_var1 )
 
-//INSERT_GENTPROT_BASIC0( packm_var2 )
+//INSERT_GENTPROT_BASIC( packm_var2 )
 GENTPROT( float,    s, packm_var2 )
 GENTPROT( double,   d, packm_var2 )
 GENTPROT( scomplex, c, packm_var2 )
 GENTPROT( dcomplex, z, packm_var2 )
 
-//INSERT_GENTPROT_BASIC0( packm_var3 )
+//INSERT_GENTPROT_BASIC( packm_var3 )
 GENTPROT( float,    s, packm_var3 )
 GENTPROT( double,   d, packm_var3 )
 GENTPROT( scomplex, c, packm_var3 )
diff --git a/sandbox/gemmlike/bls_l3_packm_var1.c b/sandbox/gemmlike/bls_l3_packm_var1.c
index b37d34cce..9cfab59c5 100644
--- a/sandbox/gemmlike/bls_l3_packm_var1.c
+++ b/sandbox/gemmlike/bls_l3_packm_var1.c
@@ -180,7 +180,7 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_m
 	} \
 }
 
-//INSERT_GENTFUNC_BASIC0( packm_var1 )
+//INSERT_GENTFUNC_BASIC( packm_var1 )
 GENTFUNC( float,    s, packm_var1 )
 GENTFUNC( double,   d, packm_var1 )
 GENTFUNC( scomplex, c, packm_var1 )
diff --git a/sandbox/gemmlike/bls_l3_packm_var2.c b/sandbox/gemmlike/bls_l3_packm_var2.c
index b3efbbc28..96d041a1a 100644
--- a/sandbox/gemmlike/bls_l3_packm_var2.c
+++ b/sandbox/gemmlike/bls_l3_packm_var2.c
@@ -231,7 +231,7 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_m
 	} \
 }
 
-//INSERT_GENTFUNC_BASIC0( packm_var1 )
+//INSERT_GENTFUNC_BASIC( packm_var1 )
 GENTFUNC( float,    s, packm_var2 )
 GENTFUNC( double,   d, packm_var2 )
 GENTFUNC( scomplex, c, packm_var2 )
diff --git a/sandbox/gemmlike/bls_l3_packm_var3.c b/sandbox/gemmlike/bls_l3_packm_var3.c
index 48cd6dd60..6ee209291 100644
--- a/sandbox/gemmlike/bls_l3_packm_var3.c
+++ b/sandbox/gemmlike/bls_l3_packm_var3.c
@@ -178,7 +178,7 @@ void PASTECH2(bls_,ch,varname) \
 	} \
 }
 
-//INSERT_GENTFUNC_BASIC0( packm_var3 )
+//INSERT_GENTFUNC_BASIC( packm_var3 )
 GENTFUNC( float,    s, packm_var3 )
 GENTFUNC( double,   d, packm_var3 )
 GENTFUNC( scomplex, c, packm_var3 )
diff --git a/sandbox/gemmlike/bls_packm_cxk.c b/sandbox/gemmlike/bls_packm_cxk.c
index 75dc66649..f987a6a7d 100644
--- a/sandbox/gemmlike/bls_packm_cxk.c
+++ b/sandbox/gemmlike/bls_packm_cxk.c
@@ -152,7 +152,7 @@ void PASTECH2(bls_,ch,opname) \
 	} \
 }
 
-//INSERT_GENTFUNC_BASIC0( packm_cxk )
+//INSERT_GENTFUNC_BASIC( packm_cxk )
 GENTFUNC( float,    s, packm_cxk )
 GENTFUNC( double,   d, packm_cxk )
 GENTFUNC( scomplex, c, packm_cxk )
diff --git a/sandbox/gemmlike/bls_packm_cxk.h b/sandbox/gemmlike/bls_packm_cxk.h
index f6582d64a..56e6e5d7a 100644
--- a/sandbox/gemmlike/bls_packm_cxk.h
+++ b/sandbox/gemmlike/bls_packm_cxk.h
@@ -50,7 +50,7 @@ void PASTECH2(bls_,ch,varname) \
        cntx_t* cntx  \
      );
 
-//INSERT_GENTPROT_BASIC0( packm_cxk )
+//INSERT_GENTPROT_BASIC( packm_cxk )
 GENTPROT( float,    s, packm_cxk )
 GENTPROT( double,   d, packm_cxk )
 GENTPROT( scomplex, c, packm_cxk )
diff --git a/testsuite/src/test_amaxv.c b/testsuite/src/test_amaxv.c
index fd6bad5f7..c8e740b24 100644
--- a/testsuite/src/test_amaxv.c
+++ b/testsuite/src/test_amaxv.c
@@ -182,7 +182,7 @@ void libblis_test_amaxv_experiment
 	// Randomize x.
 	libblis_test_vobj_randomize( params, FALSE, &x );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		time = bli_clock();
@@ -300,7 +300,7 @@ void PASTEMAC(ch,opname) \
        dim_t* restrict index  \
      ); \
 
-INSERT_GENTPROT_BASIC0( amaxv_test )
+INSERT_GENTPROT_BASIC( amaxv_test )
 
 
 //
@@ -459,5 +459,5 @@ void PASTEMAC(ch,varname) \
 	PASTEMAC(i,copys)( index_l, *index ); \
 }
 
-INSERT_GENTFUNCR_BASIC0( amaxv_test )
+INSERT_GENTFUNCR_BASIC( amaxv_test )
 
diff --git a/testsuite/src/test_randm.c b/testsuite/src/test_randm.c
index 223007dba..9b6268ca3 100644
--- a/testsuite/src/test_randm.c
+++ b/testsuite/src/test_randm.c
@@ -172,7 +172,7 @@ void libblis_test_randm_experiment
 	// Create the test objects.
 	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, x_store, m, n, &x );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		time = bli_clock();
@@ -245,7 +245,7 @@ void libblis_test_randm_check
 	bli_obj_scalar_init_detached( dt_real, &sum );
 
 	bli_absumm( x, &sum );
-	
+
 	if ( bli_is_float( dt_real ) )
 	{
 		float*  sum_x = bli_obj_buffer_at_off( &sum );
@@ -341,5 +341,5 @@ void PASTEMAC(ch,varname)( \
 	PASTEMAC2(chr,chr,copys)( sum, *sum_x_cast ); \
 }
 
-INSERT_GENTFUNCR_BASIC0( absumm )
+INSERT_GENTFUNCR_BASIC( absumm )
 
diff --git a/testsuite/src/test_randm.h b/testsuite/src/test_randm.h
index e44464962..856566648 100644
--- a/testsuite/src/test_randm.h
+++ b/testsuite/src/test_randm.h
@@ -57,4 +57,4 @@ void PASTEMAC(ch,varname)( \
                            void*  sum_x  \
                          );
 
-INSERT_GENTPROTR_BASIC0( absumm )
+INSERT_GENTPROTR_BASIC( absumm )

From 138de3b3e88c5bf7d8718c45c88811771cf42db8 Mon Sep 17 00:00:00 2001
From: Ajay Panyala <ajay.panyala@gmail.com>
Date: Sun, 7 May 2023 13:01:38 -0700
Subject: [PATCH 153/230] add nvhpc compiler support (#719)

Add detection of the NVIDIA nvhpc compiler (`nvc`) in `configure`, and adjust some warning options in `common.mk`. Currently, no specific options for `nvc` have been added in the relevant configurations so it may not be usable without further tweaks.
---
 common.mk                   | 4 ++++
 config/generic/make_defs.mk | 6 +++++-
 configure                   | 2 +-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/common.mk b/common.mk
index 33f39d529..25d9f8d2c 100644
--- a/common.mk
+++ b/common.mk
@@ -719,7 +719,11 @@ CWARNFLAGS :=
 # Disable unused function warnings and stop compiling on first error for
 # all compilers that accept such options: gcc, clang, and icc.
 ifneq ($(CC_VENDOR),ibm)
+ifneq ($(CC_VENDOR),nvc)
 CWARNFLAGS += -Wall -Wno-unused-function -Wfatal-errors
+else
+CWARNFLAGS += -Wall -Wno-unused-function
+endif
 endif
 
 # Disable tautological comparision warnings in clang.
diff --git a/config/generic/make_defs.mk b/config/generic/make_defs.mk
index b0dcec044..cbe4fb86f 100644
--- a/config/generic/make_defs.mk
+++ b/config/generic/make_defs.mk
@@ -71,7 +71,11 @@ else
 ifeq ($(CC_VENDOR),clang)
 CKVECFLAGS     :=
 else
-$(error gcc, icc, or clang is required for this configuration.)
+ifeq ($(CC_VENDOR),nvc)
+CKVECFLAGS     :=
+else
+$(error gcc, icc, nvc, or clang is required for this configuration.)
+endif
 endif
 endif
 endif
diff --git a/configure b/configure
index 7219a204b..6938d47cd 100755
--- a/configure
+++ b/configure
@@ -1545,7 +1545,7 @@ get_compiler_version()
 	# The last part ({ read first rest ; echo $first ; }) is a workaround
 	# to OS X's egrep only returning the first match.
 	cc_vendor=$(echo "${vendor_string}" |
-	            grep -oE 'icc|gcc|clang|emcc|pnacl|IBM|oneAPI|crosstool-NG|GCC' |
+	            grep -oE 'icc|gcc|clang|nvc|emcc|pnacl|IBM|oneAPI|crosstool-NG|GCC' |
 	            { read -r first rest ; echo "${first}"; })
 
 	# AOCC version strings contain both "clang" and "AOCC" substrings, and

From 89b7863fc9a88903917deedc6a5ad9fd17f83713 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Mon, 8 May 2023 16:51:18 -0500
Subject: [PATCH 154/230] Fix 1m enablement for herk/her2k/syrk/syr2k. (#743)

Details:
- Ever since 28b0982, herk, her2k, syrk, and syr2k have been implemented
  in terms of the gemmt expert API. And since the decision of which
  induced method to use (1m or native) is made *below* the level of the
  expert API, executing any of {herk,her2k,syrk,syr2k} results in BLIS
  checking the enablement status for gemmt.
- This commit applies a band-aid of sorts to this issue by modifying
  bli_l3_ind_oper_get_enable() and bli_l3_ind_oper_set_enable() so that
  any attempt to query or modify the internal enablement status for
  herk, her2k, syrk, or syr2k instead does so for gemmt.
- This solution isn't perfect since, in theory, the user could enable 1m
  for, say, herk but then disable it for syrk, and then be confused when
  herk runs via native execution. But we don't anticipate that users
  modify 1m enablement at the operation level, and so in practice this
  solution is likely fine for now.
---
 frame/3/bli_l3_ind.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/frame/3/bli_l3_ind.c b/frame/3/bli_l3_ind.c
index fbf73be60..9ccaf3515 100644
--- a/frame/3/bli_l3_ind.c
+++ b/frame/3/bli_l3_ind.c
@@ -201,6 +201,18 @@ void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool statu
 	if ( !bli_is_complex( dt ) ) return;
 	if ( !bli_opid_is_level3( oper ) ) return;
 
+	// BLIS currently implements herk/her2k/syrk/syr2k in terms of the user-
+	// level gemmt (expert) API, and so those operations choose to execute
+	// 1m (or not) based on the induced method enablement status of gemmt.
+	// In other words, changing the enablement status of those operations
+	// would have no effect. Therefore, we redirect queries/accesses to those
+	// operations' induced method enablement statuses to that of gemmt.
+	if ( method != BLIS_NAT && ( oper == BLIS_HERK  ||
+	                             oper == BLIS_HER2K ||
+	                             oper == BLIS_SYRK  ||
+	                             oper == BLIS_SYR2K ) )
+		oper = BLIS_GEMMT;
+
 	// Disallow changing status of native execution.
 	if ( method == BLIS_NAT ) return;
 
@@ -224,6 +236,18 @@ bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt )
 	num_t idt = bli_ind_map_cdt_to_index( dt );
 	bool  r_val;
 
+	// BLIS currently implements herk/her2k/syrk/syr2k in terms of the user-
+	// level gemmt (expert) API, and so those operations choose to execute
+	// 1m (or not) based on the induced method enablement status of gemmt.
+	// In other words, changing the enablement status of those operations
+	// would have no effect. Therefore, we redirect queries/accesses to those
+	// operations' induced method enablement statuses to that of gemmt.
+	if ( method != BLIS_NAT && ( oper == BLIS_HERK  ||
+	                             oper == BLIS_HER2K ||
+	                             oper == BLIS_SYRK  ||
+	                             oper == BLIS_SYR2K ) )
+		oper = BLIS_GEMMT;
+
 	{
 		r_val = bli_l3_ind_oper_st[ method ][ oper ][ idt ];
 	}

From d639554894b6252a86bd3164921bce6fbb9e3b5e Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 7 Jun 2023 16:11:14 -0500
Subject: [PATCH 155/230] Pad thrcomm_t fields to avoid false sharing.

Details:
- Inserted a cache line of padding between various fields of the
  thrcomm_t and, in the case of the (presently defunct) tree barrier,
  fields of the barrier_t. This additional padding ensures that these
  fields, which serve different purposes when performing a thread
  barrier, are only accessed when needed (and not just due to their
  spatial locality with their cache line neighbors).
- Added a new cpp macro constant, BLIS_CACHE_LINE_SIZE, to
  bli_config_macro_defs.h. This new constant defines the size of a
  cache line (in bytes) and defaults to 64.
- Special thanks to Leick Robinson for discovering this false sharing
  issue and developing/submitting the patch.
---
 frame/include/bli_config_macro_defs.h |  8 ++++++++
 frame/thread/bli_thrcomm.h            | 29 ++++++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h
index e7b77acbb..4bdbb7b78 100644
--- a/frame/include/bli_config_macro_defs.h
+++ b/frame/include/bli_config_macro_defs.h
@@ -77,6 +77,14 @@
 #endif
 
 
+// -- MEMORY SUBSYSTEM PROPERTIES ----------------------------------------------
+
+// Size of a cache line (in bytes).
+#ifndef BLIS_CACHE_LINE_SIZE
+#define BLIS_CACHE_LINE_SIZE 64
+#endif
+
+
 // -- MULTITHREADING -----------------------------------------------------------
 
 // Enable caching of queried cntx_t pointers in the gks?
diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h
index b55922acd..04cb23a38 100644
--- a/frame/thread/bli_thrcomm.h
+++ b/frame/thread/bli_thrcomm.h
@@ -45,9 +45,23 @@
 struct barrier_s
 {
 	int               arity;
-	int               count;
 	struct barrier_s* dad;
+
+	// We insert a cache line of padding here to eliminate false sharing between
+	// the fields above and fields below.
+	char   padding1[ BLIS_CACHE_LINE_SIZE ];
+
+	int               count;
+
+	// We insert a cache line of padding here to eliminate false sharing between
+	// the fields above and fields below.
+	char   padding2[ BLIS_CACHE_LINE_SIZE ];
+
 	volatile int      signal;
+
+	// We insert a cache line of padding here to eliminate false sharing between
+	// this struct and the next one.
+	char   padding2[ BLIS_CACHE_LINE_SIZE ];
 };
 typedef struct barrier_s barrier_t;
 #endif
@@ -64,6 +78,10 @@ typedef struct thrcomm_s
 	dim_t       n_threads;
 	timpl_t     ti;
 
+	// We insert a cache line of padding here to eliminate false sharing between
+	// the fields above and fields below.
+	char   padding1[ BLIS_CACHE_LINE_SIZE ];
+
 	// NOTE: barrier_sense was originally a gint_t-based bool_t, but upon
 	// redefining bool_t as bool we discovered that some gcc __atomic built-ins
 	// don't allow the use of bool for the variables being operated upon.
@@ -72,8 +90,17 @@ typedef struct thrcomm_s
 	// redefining barrier_sense as a gint_t.
 	//volatile gint_t  barrier_sense;
 	gint_t barrier_sense;
+
+	// We insert a cache line of padding here to eliminate false sharing between
+	// the fields above and fields below.
+	char   padding2[ BLIS_CACHE_LINE_SIZE ];
+
 	dim_t  barrier_threads_arrived;
 
+	// We insert a cache line of padding here to eliminate false sharing between
+	// the fields above and whatever data structures follow.
+	char   padding3[ BLIS_CACHE_LINE_SIZE ];
+
 	// -- Fields specific to OpenMP --
 
 	#ifdef BLIS_ENABLE_OPENMP

From 6b894c30b9bb2c2518848d74e4c8d96844f77f24 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 12 Jun 2023 17:22:44 -0500
Subject: [PATCH 156/230] Rewrote/fixed broken tree barrier implementation.

Details:
- Rewrote the definition of bli_thrcomm_tree_barrier() so that it (a)
  actually worked again, and (b) used atomics instead of a basic C99
  spin loop. (Note that the conventional barrier implementation is
  still enabled by default; the tree barrier must be toggled on
  manually within the configuration.)
- Added an early return to bli_thrcomm_barrier_openmp() for the cases
  where comm == NULL or comm->n_threads == 1.
- Reordered thread-related and thread-dependent header #include
  directives in blis.h so that the BLIS_TREE_BARRIER and
  BLIS_TREE_BARRIER_ARITY macros, which would be defined in the target
  configuration's bli_family_*.h file, are defined prior to the
  inclusion of the thrcomm_t header that uses them.
- Changed the type of barrier_t.count from 'int' to 'dim_t'.
- Changed the type of barrier_t.signal from 'volatile int' to 'gint_t'.
- Special thanks to Leick Robinson for contributing these changes.
- Whitespace changes.
---
 frame/include/blis.h              | 30 +++++++++++++-------------
 frame/thread/bli_thrcomm.c        | 16 +++++++-------
 frame/thread/bli_thrcomm.h        |  6 +++---
 frame/thread/bli_thrcomm_openmp.c | 35 ++++++++++++++++++++++++-------
 4 files changed, 53 insertions(+), 34 deletions(-)

diff --git a/frame/include/blis.h b/frame/include/blis.h
index d87018d00..6292f4745 100644
--- a/frame/include/blis.h
+++ b/frame/include/blis.h
@@ -80,21 +80,6 @@ extern "C" {
 #include "bli_pragma_macro_defs.h"
 
 
-// -- Threading definitions --
-
-#include "bli_thread.h"
-#include "bli_thread_range.h"
-#include "bli_thread_range_slab_rr.h"
-#include "bli_thread_range_tlb.h"
-
-#include "bli_pthread.h"
-
-
-// -- Constant definitions --
-
-#include "bli_extern_defs.h"
-
-
 // -- BLIS architecture/kernel definitions --
 
 #include "bli_pre_ker_params.h"
@@ -116,6 +101,21 @@ extern "C" {
 #include "bli_kernel_macro_defs.h"
 
 
+// -- Threading definitions --
+
+#include "bli_thread.h"
+#include "bli_thread_range.h"
+#include "bli_thread_range_slab_rr.h"
+#include "bli_thread_range_tlb.h"
+
+#include "bli_pthread.h"
+
+
+// -- Constant definitions --
+
+#include "bli_extern_defs.h"
+
+
 // -- Base operation prototypes --
 
 #include "bli_init.h"
diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c
index e9f9d9dc7..79618f1a8 100644
--- a/frame/thread/bli_thrcomm.c
+++ b/frame/thread/bli_thrcomm.c
@@ -206,6 +206,8 @@ void* bli_thrcomm_bcast
 	return object;
 }
 
+#ifndef BLIS_TREE_BARRIER
+
 // Use __sync_* builtins (assumed available) if __atomic_* ones are not present.
 #ifndef __ATOMIC_RELAXED
 
@@ -214,14 +216,10 @@ void* bli_thrcomm_bcast
 #define __ATOMIC_RELEASE
 #define __ATOMIC_ACQ_REL
 
-#define __atomic_load_n(ptr, constraint) \
-    __sync_fetch_and_add(ptr, 0)
-#define __atomic_add_fetch(ptr, value, constraint) \
-    __sync_add_and_fetch(ptr, value)
-#define __atomic_fetch_add(ptr, value, constraint) \
-    __sync_fetch_and_add(ptr, value)
-#define __atomic_fetch_xor(ptr, value, constraint) \
-    __sync_fetch_and_xor(ptr, value)
+#define __atomic_load_n(    ptr,        constraint ) __sync_fetch_and_add( ptr, 0     )
+#define __atomic_add_fetch( ptr, value, constraint ) __sync_add_and_fetch( ptr, value )
+#define __atomic_fetch_add( ptr, value, constraint ) __sync_fetch_and_add( ptr, value )
+#define __atomic_fetch_xor( ptr, value, constraint ) __sync_fetch_and_xor( ptr, value )
 
 #endif
 
@@ -269,3 +267,5 @@ void bli_thrcomm_barrier_atomic( dim_t t_id, thrcomm_t* comm )
 	}
 }
 
+#endif
+
diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h
index 04cb23a38..436b05711 100644
--- a/frame/thread/bli_thrcomm.h
+++ b/frame/thread/bli_thrcomm.h
@@ -51,17 +51,17 @@ struct barrier_s
 	// the fields above and fields below.
 	char   padding1[ BLIS_CACHE_LINE_SIZE ];
 
-	int               count;
+	dim_t             count;
 
 	// We insert a cache line of padding here to eliminate false sharing between
 	// the fields above and fields below.
 	char   padding2[ BLIS_CACHE_LINE_SIZE ];
 
-	volatile int      signal;
+	gint_t            signal;
 
 	// We insert a cache line of padding here to eliminate false sharing between
 	// this struct and the next one.
-	char   padding2[ BLIS_CACHE_LINE_SIZE ];
+	char   padding3[ BLIS_CACHE_LINE_SIZE ];
 };
 typedef struct barrier_s barrier_t;
 #endif
diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c
index a42dabe18..487832cd1 100644
--- a/frame/thread/bli_thrcomm_openmp.c
+++ b/frame/thread/bli_thrcomm_openmp.c
@@ -114,6 +114,10 @@ void bli_thrcomm_cleanup_openmp( thrcomm_t* comm )
 
 void bli_thrcomm_barrier_openmp( dim_t t_id, thrcomm_t* comm )
 {
+	// Return early if the comm is NULL or if there is only one
+	// thread participating.
+	if ( comm == NULL || comm->n_threads == 1 ) return;
+
 	bli_thrcomm_tree_barrier( comm->barriers[t_id] );
 }
 
@@ -176,27 +180,42 @@ void bli_thrcomm_tree_barrier_free( barrier_t* barrier )
 	return;
 }
 
+// Use __sync_* builtins (assumed available) if __atomic_* ones are not present.
+#ifndef __ATOMIC_RELAXED
+
+#define __ATOMIC_RELAXED
+#define __ATOMIC_ACQUIRE
+#define __ATOMIC_RELEASE
+#define __ATOMIC_ACQ_REL
+
+//#define __atomic_add_fetch( ptr, value, constraint ) __sync_add_and_fetch( ptr, value )
+//#define __atomic_fetch_add( ptr, value, constraint ) __sync_fetch_and_add( ptr, value )
+
+#define __atomic_load_n(    ptr,        constraint ) __sync_fetch_and_add( ptr, 0     )
+#define __atomic_sub_fetch( ptr, value, constraint ) __sync_sub_and_fetch( ptr, value )
+#define __atomic_fetch_xor( ptr, value, constraint ) __sync_fetch_and_xor( ptr, value )
+
+#endif
+
 void bli_thrcomm_tree_barrier( barrier_t* barack )
 {
-	int my_signal = barack->signal;
-	int my_count;
+	gint_t my_signal = __atomic_load_n( &barack->signal, __ATOMIC_RELAXED );
 
-	_Pragma( "omp atomic capture" )
-		my_count = barack->count--;
+	dim_t my_count =
+	__atomic_sub_fetch( &barack->count, 1, __ATOMIC_ACQ_REL );
 
-	if ( my_count == 1 )
+	if ( my_count == 0 )
 	{
 		if ( barack->dad != NULL )
 		{
 			bli_thrcomm_tree_barrier( barack->dad );
 		}
 		barack->count = barack->arity;
-		barack->signal = !barack->signal;
+		__atomic_fetch_xor( &barack->signal, 1, __ATOMIC_RELEASE );
 	}
 	else
 	{
-		volatile int* listener = &barack->signal;
-		while ( *listener == my_signal ) {}
+		while ( __atomic_load_n( &barack->signal, __ATOMIC_ACQUIRE ) == my_signal ) {}
 	}
 }
 

From a0b04e3c007f1207e5678bf20c07752906742fb7 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 26 Jun 2023 17:59:21 -0500
Subject: [PATCH 157/230] Rewrote regen-symbols.sh (gen-libblis-symbols.sh).
 (#751)

Details:
- Wrote an alternative to regen-symbols.sh, gen-libblis-symbols.sh,
  that generates a list of exported symbols from the monolithic blis.h
  file rather than peeking inside of the shared object via nm. (This new
  script lives in the 'build' directory and the older script has been
  retired to build/old.) Special thanks to Devin Matthews for authoring
  gen-libblis-symbols.sh.
- Added a 'symbols' target to the top-level Makefile which will refresh
  build/libblis-symbols.def, with supporting changes to common.mk.
- Updates to build/libblis-symbols.def using the new symbol-generating
  script.
---
 Makefile                         |  14 +
 build/gen-libblis-symbols.sh     |  60 +++++
 build/libblis-symbols.def        | 421 ++++++++++++++++---------------
 build/{ => old}/regen-symbols.sh |   0
 common.mk                        |   6 +
 5 files changed, 302 insertions(+), 199 deletions(-)
 create mode 100755 build/gen-libblis-symbols.sh
 rename build/{ => old}/regen-symbols.sh (100%)

diff --git a/Makefile b/Makefile
index 33641f8c8..d930b3f19 100644
--- a/Makefile
+++ b/Makefile
@@ -62,6 +62,7 @@
         clean cleanmk cleanh cleanlib distclean \
         cleantest cleanblastest cleanblistest \
         changelog \
+        symbols \
         install uninstall uninstall-old \
         uninstall-libs uninstall-lib-symlinks uninstall-headers \
         uninstall-old-libs uninstall-lib-symlinks uninstall-old-headers
@@ -497,6 +498,19 @@ ifeq ($(ALL_MAKE_DEFS_MK_PRESENT),no)
 endif
 
 
+# --- Shared/dynamic libblis symbol file creation/refresh ---
+
+symbols: check-env $(SYM_FILE)
+
+$(SYM_FILE): $(HEADERS_TO_INSTALL)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(GEN_SYMS) > $(SYM_FILE)
+else
+	@echo "Updating $(SYM_FILE)"
+	@$(GEN_SYMS) > $(SYM_FILE)
+endif
+
+
 # --- Consolidated blis.h header creation ---
 
 flat-header: check-env $(BLIS_H_FLAT)
diff --git a/build/gen-libblis-symbols.sh b/build/gen-libblis-symbols.sh
new file mode 100755
index 000000000..0ffa3458f
--- /dev/null
+++ b/build/gen-libblis-symbols.sh
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+
+get_config_var()
+{
+	# Parse the compiler assigned to the CC variable within the config.mk file.
+	echo "$(grep "^ *$1 *:=" config.mk | sed 's/'$1' *:= *//')"
+}
+
+main()
+{
+	if [ ! -e config.mk ]; then
+		echo "No config.mk file detected; have you configured BLIS?"
+		exit 1
+	fi
+
+	CC=$(get_config_var CC)
+	CONFIG_NAME=$(get_config_var CONFIG_NAME)
+	BLIS_H_FLAT="include/${CONFIG_NAME}/blis.h"
+
+	if [ ! -e ${BLIS_H_FLAT} ]; then
+		echo "No monolithic blis.h file detected at ${BLIS_H_FLAT}; have you run 'make'?"
+		exit 1
+	fi
+
+	#
+	# Header line
+	#
+	echo "EXPORTS"
+
+	#
+	# Breakdown of commands:
+	# $(CC) ...		# Pre-process blis.h, making sure to include all BLAS and CBLAS symbols
+	#	| tr ...	# Make sure to split lines at ';' so that each declaration is on its own line
+	#	| grep ...	# Find exported symbols
+	#	| sed -E
+	#	    -e ...	# 1. Remove all __attribute__ clauses
+	#	    -e ...	# 2. Select only the portion before an opening '(' (if any)
+	#	    -e ...	# 3. Pull out the last word, which is the function name.
+	#	| grep ...	# Remove constants
+	#	| grep ...	# Remove blank lines
+	#	| sed  ...  # Remove trailing spaces
+	#	| sort
+	#	| uniq
+	#
+	${CC} -DBLIS_ENABLE_CBLAS=1 -DBLIS_ENABLE_BLAS=1 -E ${BLIS_H_FLAT} \
+		| tr ';' '\n' \
+		| grep visibility \
+		| sed -E \
+		    -e 's/__attribute__ *\( *\([^\)]+(\([^\)]+\) *)\) *\)//g' \
+		    -e 's/(.*) *\(.*/\1/' \
+		    -e 's/.* ([^ ].*)/\1/' \
+		| grep -v BLIS \
+		| grep -E '[^ ]' \
+		| sed -e 's/[[:space:]]*$//g' \
+		| sort \
+		| uniq
+}
+
+main "$@"
+
diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def
index 4bc91784c..c3cfbcdcb 100644
--- a/build/libblis-symbols.def
+++ b/build/libblis-symbols.def
@@ -121,6 +121,8 @@ bli_cger_ex
 bli_cgetijm
 bli_cgetijv
 bli_cgetsc
+bli_cgtesc
+bli_cgtsc
 bli_check_error_code_helper
 bli_chemm
 bli_chemm_ex
@@ -147,6 +149,8 @@ bli_cinvscalv
 bli_cinvscalv_ex
 bli_clock
 bli_clock_min_diff
+bli_cltesc
+bli_cltsc
 bli_cmachval
 bli_cmkherm
 bli_cmkherm_ex
@@ -227,6 +231,7 @@ bli_csetv
 bli_csetv_ex
 bli_cshiftd
 bli_cshiftd_ex
+bli_csqrtrsc
 bli_csqrtsc
 bli_csubd
 bli_csubd_ex
@@ -346,6 +351,8 @@ bli_dger_ex
 bli_dgetijm
 bli_dgetijv
 bli_dgetsc
+bli_dgtesc
+bli_dgtsc
 bli_dhemm
 bli_dhemm_ex
 bli_dhemv
@@ -370,6 +377,8 @@ bli_dinvscalm_ex
 bli_dinvscalv
 bli_dinvscalv_ex
 bli_divsc
+bli_dltesc
+bli_dltsc
 bli_dmachval
 bli_dmkherm
 bli_dmkherm_ex
@@ -440,6 +449,7 @@ bli_dsetv
 bli_dsetv_ex
 bli_dshiftd
 bli_dshiftd_ex
+bli_dsqrtrsc
 bli_dsqrtsc
 bli_dsubd
 bli_dsubd_ex
@@ -466,6 +476,8 @@ bli_dsyr2k_ex
 bli_dsyr_ex
 bli_dsyrk
 bli_dsyrk_ex
+bli_dt_size
+bli_dt_string
 bli_dtrmm
 bli_dtrmm3
 bli_dtrmm3_ex
@@ -476,8 +488,6 @@ bli_dtrsm
 bli_dtrsm_ex
 bli_dtrsv
 bli_dtrsv_ex
-bli_dt_size
-bli_dt_string
 bli_dunzipsc
 bli_dxpbyd
 bli_dxpbyd_ex
@@ -495,7 +505,6 @@ bli_dzxpbym_md_ex
 bli_eqm
 bli_eqsc
 bli_eqv
-bli_error_checking_is_enabled
 bli_error_checking_level
 bli_error_checking_level_set
 bli_finalize
@@ -504,10 +513,10 @@ bli_fprintv
 bli_free_user
 bli_gemm
 bli_gemm_ex
+bli_gemm_ukernel
 bli_gemmt
 bli_gemmt_ex
 bli_gemmtrsm_ukernel
-bli_gemm_ukernel
 bli_gemv
 bli_gemv_ex
 bli_ger
@@ -523,6 +532,8 @@ bli_gks_l3_ukr_impl_type
 bli_gks_query_cntx
 bli_gks_query_ind_cntx
 bli_gks_query_nat_cntx
+bli_gtesc
+bli_gtsc
 bli_hemm
 bli_hemm_ex
 bli_hemv
@@ -547,26 +558,27 @@ bli_ind_enable_dt
 bli_ind_oper_enable_only
 bli_ind_oper_find_avail
 bli_ind_oper_get_avail_impl_string
-bli_ind_oper_is_impl
 bli_info_get_blas_int_type_size
 bli_info_get_enable_blas
 bli_info_get_enable_cblas
+bli_info_get_enable_hpx
+bli_info_get_enable_hpx_as_default
 bli_info_get_enable_memkind
 bli_info_get_enable_openmp
 bli_info_get_enable_openmp_as_default
 bli_info_get_enable_pba_pools
 bli_info_get_enable_pthreads
 bli_info_get_enable_pthreads_as_default
-bli_info_get_enable_hpx
-bli_info_get_enable_hpx_as_default
 bli_info_get_enable_sandbox
 bli_info_get_enable_sba_pools
+bli_info_get_enable_stay_auto_init
 bli_info_get_enable_threading
+bli_info_get_enable_tls
 bli_info_get_gemm_impl_string
+bli_info_get_gemm_ukr_impl_string
 bli_info_get_gemmt_impl_string
 bli_info_get_gemmtrsm_l_ukr_impl_string
 bli_info_get_gemmtrsm_u_ukr_impl_string
-bli_info_get_gemm_ukr_impl_string
 bli_info_get_heap_addr_align_size
 bli_info_get_heap_stride_align_size
 bli_info_get_hemm_impl_string
@@ -593,8 +605,9 @@ bli_info_get_stack_buf_max_size
 bli_info_get_symm_impl_string
 bli_info_get_syr2k_impl_string
 bli_info_get_syrk_impl_string
-bli_info_get_thread_part_jrir_rr
-bli_info_get_thread_part_jrir_slab
+bli_info_get_thread_jrir_rr
+bli_info_get_thread_jrir_slab
+bli_info_get_thread_jrir_tlb
 bli_info_get_trmm3_impl_string
 bli_info_get_trmm_impl_string
 bli_info_get_trsm_impl_string
@@ -618,6 +631,8 @@ bli_iprintv
 bli_isetsc
 bli_l3_cntl_free
 bli_l3_thrinfo_create
+bli_ltesc
+bli_ltsc
 bli_machval
 bli_malloc_user
 bli_mkherm
@@ -648,30 +663,23 @@ bli_obj_create_1x1_with_attached_buffer
 bli_obj_create_conf_to
 bli_obj_create_with_attached_buffer
 bli_obj_create_without_buffer
-bli_obj_equals
 bli_obj_free
-bli_obj_imag_equals
-bli_obj_imag_is_zero
 bli_obj_print
 bli_obj_scalar_apply_scalar
 bli_obj_scalar_attach
 bli_obj_scalar_cast_to
 bli_obj_scalar_detach
-bli_obj_scalar_equals
-bli_obj_scalar_has_nonzero_imag
 bli_obj_scalar_init_detached
 bli_obj_scalar_init_detached_copy_of
 bli_obj_scalar_reset
 bli_pack_get_pack_a
 bli_pack_get_pack_b
+bli_pack_set_pack_a
+bli_pack_set_pack_b
 bli_packm_alloc
 bli_packm_alloc_ex
 bli_packm_blk_var1
-bli_packm_cntl_create_node
-bli_packm_init
 bli_packm_scalar
-bli_pack_set_pack_a
-bli_pack_set_pack_b
 bli_param_map_blis_to_char_conj
 bli_param_map_blis_to_char_diag
 bli_param_map_blis_to_char_dt
@@ -817,6 +825,8 @@ bli_sger_ex
 bli_sgetijm
 bli_sgetijv
 bli_sgetsc
+bli_sgtesc
+bli_sgtsc
 bli_shemm
 bli_shemm_ex
 bli_shemv
@@ -843,6 +853,8 @@ bli_sinvscalm_ex
 bli_sinvscalv
 bli_sinvscalv_ex
 bli_sleep
+bli_sltesc
+bli_sltsc
 bli_smachval
 bli_smkherm
 bli_smkherm_ex
@@ -866,6 +878,7 @@ bli_snormiv
 bli_snormiv_ex
 bli_sprintm
 bli_sprintv
+bli_sqrtrsc
 bli_sqrtsc
 bli_srandm
 bli_srandm_ex
@@ -904,6 +917,7 @@ bli_ssetv
 bli_ssetv_ex
 bli_sshiftd
 bli_sshiftd_ex
+bli_ssqrtrsc
 bli_ssqrtsc
 bli_ssubd
 bli_ssubd_ex
@@ -987,6 +1001,7 @@ bli_thread_get_num_threads
 bli_thread_get_pc_nt
 bli_thread_get_thread_impl
 bli_thread_get_thread_impl_str
+bli_thread_launch
 bli_thread_range_sub
 bli_thread_set_num_threads
 bli_thread_set_num_threads_
@@ -1083,6 +1098,8 @@ bli_zger_ex
 bli_zgetijm
 bli_zgetijv
 bli_zgetsc
+bli_zgtesc
+bli_zgtsc
 bli_zhemm
 bli_zhemm_ex
 bli_zhemv
@@ -1107,6 +1124,8 @@ bli_zinvscalm_ex
 bli_zinvscalv
 bli_zinvscalv_ex
 bli_zipsc
+bli_zltesc
+bli_zltsc
 bli_zmachval
 bli_zmkherm
 bli_zmkherm_ex
@@ -1167,6 +1186,7 @@ bli_zsetv
 bli_zsetv_ex
 bli_zshiftd
 bli_zshiftd_ex
+bli_zsqrtrsc
 bli_zsqrtsc
 bli_zsubd
 bli_zsubd_ex
@@ -1217,188 +1237,8 @@ bli_zzcopysc
 bli_zzipsc
 bli_zzxpbym_md
 bli_zzxpbym_md_ex
-sasum_
-sasumsub_
-saxpby_
-saxpy_
-scabs1_
-scasum_
-scasumsub_
-scnrm2_
-scnrm2sub_
-scopy_
-sdot_
-sdotsub_
-sdsdot_
-sdsdotsub_
-sgbmv_
-sgemm_
-sgemm_batch_
-sgemmt_
-sgemv_
-sger_
-snrm2_
-snrm2sub_
-srot_
-srotg_
-srotm_
-srotmg_
-ssbmv_
-sscal_
-sspmv_
-sspr_
-sspr2_
-sswap_
-ssymm_
-ssymv_
-ssyr_
-ssyr2_
-ssyr2k_
-ssyrk_
-stbmv_
-stbsv_
-stpmv_
-stpsv_
-strmm_
-strmv_
-strsm_
-strsv_
-dasum_
-dasumsub_
-daxpby_
-daxpy_
-dcabs1_
-dcopy_
-ddot_
-ddotsub_
-dgbmv_
-dgemm_
-dgemm_batch_
-dgemmt_
-dgemv_
-dger_
-dnrm2_
-dnrm2sub_
-drot_
-drotg_
-drotm_
-drotmg_
-dsbmv_
-dscal_
-dsdot_
-dsdotsub_
-dspmv_
-dspr_
-dspr2_
-dswap_
-dsymm_
-dsymv_
-dsyr_
-dsyr2_
-dsyr2k_
-dsyrk_
-dtbmv_
-dtbsv_
-dtpmv_
-dtpsv_
-dtrmm_
-dtrmv_
-dtrsm_
-dtrsv_
-dzasum_
-dzasumsub_
-dznrm2_
-dznrm2sub_
 caxpby_
 caxpy_
-ccopy_
-cdotc_
-cdotcsub_
-cdotu_
-cdotusub_
-cgbmv_
-cgemm_
-cgemm3m_
-cgemm_batch_
-cgemmt_
-cgemv_
-cgerc_
-cgeru_
-chbmv_
-chemm_
-chemv_
-cher_
-cher2_
-cher2k_
-cherk_
-chpmv_
-chpr_
-chpr2_
-crotg_
-cscal_
-csrot_
-csscal_
-cswap_
-csymm_
-csyr2k_
-csyrk_
-ctbmv_
-ctbsv_
-ctpmv_
-ctpsv_
-ctrmm_
-ctrmv_
-ctrsm_
-ctrsv_
-zaxpby_
-zaxpy_
-zcopy_
-zdotc_
-zdotcsub_
-zdotu_
-zdotusub_
-zdrot_
-zdscal_
-zgbmv_
-zgemm_
-zgemm3m_
-zgemm_batch_
-zgemmt_
-zgemv_
-zgerc_
-zgeru_
-zhbmv_
-zhemm_
-zhemv_
-zher_
-zher2_
-zher2k_
-zherk_
-zhpmv_
-zhpr_
-zhpr2_
-zrotg_
-zscal_
-zswap_
-zsymm_
-zsyr2k_
-zsyrk_
-ztbmv_
-ztbsv_
-ztpmv_
-ztpsv_
-ztrmm_
-ztrmv_
-ztrsm_
-ztrsv_
-icamax_
-icamaxsub_
-idamax_
-idamaxsub_
-isamax_
-isamaxsub_
-izamax_
-izamaxsub_
 cblas_caxpby
 cblas_caxpy
 cblas_ccopy
@@ -1556,3 +1396,186 @@ cblas_ztrmm
 cblas_ztrmv
 cblas_ztrsm
 cblas_ztrsv
+ccopy_
+cdotc_
+cdotcsub_
+cdotu_
+cdotusub_
+cgbmv_
+cgemm3m_
+cgemm_
+cgemm_batch_
+cgemmt_
+cgemv_
+cgerc_
+cgeru_
+chbmv_
+chemm_
+chemv_
+cher2_
+cher2k_
+cher_
+cherk_
+chpmv_
+chpr2_
+chpr_
+crotg_
+cscal_
+csrot_
+csscal_
+cswap_
+csymm_
+csyr2k_
+csyrk_
+ctbmv_
+ctbsv_
+ctpmv_
+ctpsv_
+ctrmm_
+ctrmv_
+ctrsm_
+ctrsv_
+dasum_
+dasumsub_
+daxpby_
+daxpy_
+dcabs1_
+dcopy_
+ddot_
+ddotsub_
+dgbmv_
+dgemm_
+dgemm_batch_
+dgemmt_
+dgemv_
+dger_
+dnrm2_
+dnrm2sub_
+drot_
+drotg_
+drotm_
+drotmg_
+dsbmv_
+dscal_
+dsdot_
+dsdotsub_
+dspmv_
+dspr2_
+dspr_
+dswap_
+dsymm_
+dsymv_
+dsyr2_
+dsyr2k_
+dsyr_
+dsyrk_
+dtbmv_
+dtbsv_
+dtpmv_
+dtpsv_
+dtrmm_
+dtrmv_
+dtrsm_
+dtrsv_
+dzasum_
+dzasumsub_
+dznrm2_
+dznrm2sub_
+icamax_
+icamaxsub_
+idamax_
+idamaxsub_
+isamax_
+isamaxsub_
+izamax_
+izamaxsub_
+lsame_
+sasum_
+sasumsub_
+saxpby_
+saxpy_
+scabs1_
+scasum_
+scasumsub_
+scnrm2_
+scnrm2sub_
+scopy_
+sdot_
+sdotsub_
+sdsdot_
+sdsdotsub_
+sgbmv_
+sgemm_
+sgemm_batch_
+sgemmt_
+sgemv_
+sger_
+snrm2_
+snrm2sub_
+srot_
+srotg_
+srotm_
+srotmg_
+ssbmv_
+sscal_
+sspmv_
+sspr2_
+sspr_
+sswap_
+ssymm_
+ssymv_
+ssyr2_
+ssyr2k_
+ssyr_
+ssyrk_
+stbmv_
+stbsv_
+stpmv_
+stpsv_
+strmm_
+strmv_
+strsm_
+strsv_
+xerbla_
+xerbla_array_
+zaxpby_
+zaxpy_
+zcopy_
+zdotc_
+zdotcsub_
+zdotu_
+zdotusub_
+zdrot_
+zdscal_
+zgbmv_
+zgemm3m_
+zgemm_
+zgemm_batch_
+zgemmt_
+zgemv_
+zgerc_
+zgeru_
+zhbmv_
+zhemm_
+zhemv_
+zher2_
+zher2k_
+zher_
+zherk_
+zhpmv_
+zhpr2_
+zhpr_
+zrotg_
+zscal_
+zswap_
+zsymm_
+zsyr2k_
+zsyrk_
+ztbmv_
+ztbsv_
+ztpmv_
+ztpsv_
+ztrmm_
+ztrmv_
+ztrsm_
+ztrsv_
diff --git a/build/regen-symbols.sh b/build/old/regen-symbols.sh
similarity index 100%
rename from build/regen-symbols.sh
rename to build/old/regen-symbols.sh
diff --git a/common.mk b/common.mk
index 25d9f8d2c..e58eca8a8 100644
--- a/common.mk
+++ b/common.mk
@@ -425,6 +425,7 @@ REFKERN_PATH       := $(DIST_PATH)/$(REFKERN_DIR)
 KERNELS_PATH       := $(DIST_PATH)/$(KERNELS_DIR)
 ADDON_PATH         := $(DIST_PATH)/$(ADDON_DIR)
 SANDBOX_PATH       := $(DIST_PATH)/$(SANDBOX_DIR)
+BUILD_PATH         := $(DIST_PATH)/$(BUILD_DIR)
 
 # Construct paths to some optional C++ template headers contributed by AMD.
 VEND_CPP_PATH      := $(DIST_PATH)/$(VEND_CPP_DIR)
@@ -547,6 +548,11 @@ ARFLAGS    := cr
 GIT        := git
 GIT_LOG    := $(GIT) log --decorate
 
+# Define the locations of a script to generate a list of shared library symbols
+# within BLIS as well as the symbol file itself.
+GEN_SYMS   := $(BUILD_PATH)/gen-libblis-symbols.sh
+SYM_FILE   := $(BUILD_PATH)/libblis-symbols.def
+
 
 
 #

From c91b41d022e33da82b3b06c82be047a29873d9b6 Mon Sep 17 00:00:00 2001
From: Lee Killough <15950023+leekillough@users.noreply.github.com>
Date: Wed, 26 Jul 2023 14:37:08 -0500
Subject: [PATCH 158/230] Auto-detect the RISC-V ABI of the compiler and use
 -mabi= during RISC-V Builds (#750)

Details:
- Generate a build error if there is a 32/64-bit mismatch between the
  RISC-V ABI or architecture and the BLIS configuration selected.
- Handle Q, Zicsr, ZiFencei, Zba, Zbb, Zbc, Zbs and Zfh extensions in
  the RISC-V architecture auto-detection. ZiFencei and Zicsr are not
  detectable with built-in RISC-V macros right now.
- ZiFencei is not important for BLIS because it doesn't have
  Just-In-Time compilation or self-modifying code, and Zicsr is implied
  by the floating-point extensions, which are required for good
  performance in BLIS.
- Move RISC-V autodetect header files to build/detect/riscv/.
---
 .../detect/riscv}/bli_riscv_cpuid.h           |  0
 build/detect/riscv/bli_riscv_detect_abi.h     | 63 ++++++++++++++++
 .../detect/riscv}/bli_riscv_detect_arch.h     | 72 ++++++++++++++++++-
 config/rv32i/make_defs.mk                     | 14 +++-
 config/rv32iv/make_defs.mk                    | 14 +++-
 config/rv64i/make_defs.mk                     | 12 +++-
 config/rv64iv/make_defs.mk                    | 12 +++-
 configure                                     |  2 +-
 8 files changed, 179 insertions(+), 10 deletions(-)
 rename {frame/base => build/detect/riscv}/bli_riscv_cpuid.h (100%)
 create mode 100644 build/detect/riscv/bli_riscv_detect_abi.h
 rename {frame/base => build/detect/riscv}/bli_riscv_detect_arch.h (72%)

diff --git a/frame/base/bli_riscv_cpuid.h b/build/detect/riscv/bli_riscv_cpuid.h
similarity index 100%
rename from frame/base/bli_riscv_cpuid.h
rename to build/detect/riscv/bli_riscv_cpuid.h
diff --git a/build/detect/riscv/bli_riscv_detect_abi.h b/build/detect/riscv/bli_riscv_detect_abi.h
new file mode 100644
index 000000000..a5a373926
--- /dev/null
+++ b/build/detect/riscv/bli_riscv_detect_abi.h
@@ -0,0 +1,63 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+/* Construct a RISC-V ABI string based on available features. */
+
+#if __riscv
+
+#define CAT2(a,b) a##b
+#define CAT(a,b) CAT2(a,b)
+
+#if __riscv_xlen == 32
+#define RISCV_INT_ABI ilp32
+#else
+#define RISCV_INT_ABI lp64
+#endif
+
+#if __riscv_abi_rve
+CAT(RISCV_INT_ABI, e)
+#elif __riscv_float_abi_soft
+RISCV_INT_ABI
+#elif __riscv_float_abi_single
+CAT(RISCV_INT_ABI, f)
+#elif __riscv_float_abi_double
+CAT(RISCV_INT_ABI, d)
+#elif __riscv_float_abi_quad
+CAT(RISCV_INT_ABI, q)
+#else
+#error "Unknown RISC-V ABI"
+#endif
+
+#endif /* __riscv */
diff --git a/frame/base/bli_riscv_detect_arch.h b/build/detect/riscv/bli_riscv_detect_arch.h
similarity index 72%
rename from frame/base/bli_riscv_detect_arch.h
rename to build/detect/riscv/bli_riscv_detect_arch.h
index 448b0f39d..55542f508 100644
--- a/frame/base/bli_riscv_detect_arch.h
+++ b/build/detect/riscv/bli_riscv_detect_arch.h
@@ -75,6 +75,12 @@
 #define RISCV_D
 #endif
 
+#if __riscv_flen >= 128
+#define RISCV_Q q
+#else
+#define RISCV_Q
+#endif
+
 #if __riscv_c
 #define RISCV_C c
 #else
@@ -94,6 +100,47 @@
 #define RISCV_V
 #endif
 
+/* No test currently for Zicsr, which was removed from the base ISA,
+   but F implies Zicsr */
+#if __riscv_f
+#define RISCV_ZICSR _zicsr
+#else
+#define RISCV_ZICSR
+#endif
+
+/* No test currently for Zifencei, which was removed from the base ISA */
+#define RISCV_ZIFENCEI
+
+#if __riscv_zba
+#define RISCV_ZBA _zba
+#else
+#define RISCV_ZBA
+#endif
+
+#if __riscv_zbb
+#define RISCV_ZBB _zbb
+#else
+#define RISCV_ZBB
+#endif
+
+#if __riscv_zbc
+#define RISCV_ZBC _zbc
+#else
+#define RISCV_ZBC
+#endif
+
+#if __riscv_zbs
+#define RISCV_ZBS _zbs
+#else
+#define RISCV_ZBS
+#endif
+
+#if __riscv_zfh
+#define RISCV_ZFH _zfh
+#else
+#define RISCV_ZFH
+#endif
+
 #else /* __riscv_arch_test */
 
 /* We assume I and E are exclusive when __riscv_arch_test isn't defined */
@@ -129,6 +176,12 @@
 #define RISCV_D
 #endif
 
+#if __riscv_flen >= 128
+#define RISCV_Q q
+#else
+#define RISCV_Q
+#endif
+
 #if __riscv_compressed
 #define RISCV_C c
 #else
@@ -144,12 +197,29 @@
 #define RISCV_V
 #endif
 
+/* No test currently for Zicsr, which was removed from the base ISA, but
+   F implies Zicsr */
+#if __riscv_flen >= 32
+#define RISCV_ZICSR _zicsr
+#else
+#define RISCV_ZICSR
+#endif
+
+#define RISCV_ZIFENCEI
+#define RISCV_ZBA
+#define RISCV_ZBB
+#define RISCV_ZBC
+#define RISCV_ZBS
+#define RISCV_ZFH
+
 #endif /* __riscv_arch_test */
 
 #define CAT2(a,b) a##b
 #define CAT(a,b) CAT2(a,b)
 
 CAT(rv, CAT(__riscv_xlen, CAT(RISCV_I, CAT(RISCV_E, CAT(RISCV_M, CAT(RISCV_A,
-CAT(RISCV_F, CAT(RISCV_D, CAT(RISCV_C, CAT(RISCV_P, RISCV_V))))))))))
+CAT(RISCV_F, CAT(RISCV_D, CAT(RISCV_Q, CAT(RISCV_C, CAT(RISCV_P, CAT(RISCV_V,
+CAT(RISCV_ZICSR, CAT(RISCV_ZIFENCEI, CAT(RISCV_ZBA, CAT(RISCV_ZBB,
+CAT(RISCV_ZBC, CAT(RISCV_ZBS, RISCV_ZFH))))))))))))))))))
 
 #endif /* __riscv */
diff --git a/config/rv32i/make_defs.mk b/config/rv32i/make_defs.mk
index 86b7143dd..21128717f 100644
--- a/config/rv32i/make_defs.mk
+++ b/config/rv32i/make_defs.mk
@@ -46,9 +46,17 @@ THIS_CONFIG    := rv32i
 # general-purpose/configuration-agnostic flags in common.mk. You
 # may specify additional flags here as needed.
 CPPROCFLAGS    := -DRISCV_SIZE=32
-# Atomic instructions must be enabled either via hardware
-# (-march=rv32ia) or by linking against libatomic
-CMISCFLAGS     := -march=$(shell $(CC) -E frame/base/bli_riscv_detect_arch.h | grep '^[^\#]') -mabi=ilp32
+
+RISCV_ARCH     := $(shell $(CC) -E build/detect/riscv/bli_riscv_detect_arch.h | grep '^[^\#]')
+RISCV_ABI      := $(shell $(CC) -E build/detect/riscv/bli_riscv_detect_abi.h | grep '^[^\#]')
+
+ifeq (,$(findstring 32,$(RISCV_ARCH)))
+$(error The RISC-V compiler architecture $(RISCV_ARCH) is not compatible with $(THIS_CONFIG))
+else ifeq (,$(findstring 32,$(RISCV_ABI)))
+$(error The RISC-V compiler ABI $(RISCV_ABI) is not compatible with $(THIS_CONFIG))
+endif
+
+CMISCFLAGS     := -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI)
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall -Wno-unused-function -Wfatal-errors
 
diff --git a/config/rv32iv/make_defs.mk b/config/rv32iv/make_defs.mk
index e8d9cca57..9daaee3d6 100644
--- a/config/rv32iv/make_defs.mk
+++ b/config/rv32iv/make_defs.mk
@@ -46,9 +46,17 @@ THIS_CONFIG    := rv32iv
 # general-purpose/configuration-agnostic flags in common.mk. You
 # may specify additional flags here as needed.
 CPPROCFLAGS    := -DRISCV_SIZE=32
-# Atomic instructions must be enabled either via hardware
-# (-march=rv32iav) or by linking against libatomic
-CMISCFLAGS     := -march=$(shell $(CC) -DFORCE_RISCV_VECTOR -E frame/base/bli_riscv_detect_arch.h | grep '^[^\#]') -mabi=ilp32d
+
+RISCV_ARCH     := $(shell $(CC) -DFORCE_RISCV_VECTOR -E build/detect/riscv/bli_riscv_detect_arch.h | grep '^[^\#]')
+RISCV_ABI      := $(shell $(CC) -DFORCE_RISCV_VECTOR -E build/detect/riscv/bli_riscv_detect_abi.h | grep '^[^\#]')
+
+ifeq (,$(findstring 32,$(RISCV_ARCH)))
+$(error The RISC-V compiler architecture $(RISCV_ARCH) is not compatible with $(THIS_CONFIG))
+else ifeq (,$(findstring 32,$(RISCV_ABI)))
+$(error The RISC-V compiler ABI $(RISCV_ABI) is not compatible with $(THIS_CONFIG))
+endif
+
+CMISCFLAGS     := -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI)
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall -Wno-unused-function -Wfatal-errors
 
diff --git a/config/rv64i/make_defs.mk b/config/rv64i/make_defs.mk
index bee21ed0d..7c055f012 100644
--- a/config/rv64i/make_defs.mk
+++ b/config/rv64i/make_defs.mk
@@ -46,7 +46,17 @@ THIS_CONFIG    := rv64i
 # general-purpose/configuration-agnostic flags in common.mk. You
 # may specify additional flags here as needed.
 CPPROCFLAGS    := -DRISCV_SIZE=64
-CMISCFLAGS     := -march=$(shell $(CC) -E frame/base/bli_riscv_detect_arch.h | grep '^[^\#]') -mabi=lp64
+
+RISCV_ARCH     := $(shell $(CC) -E build/detect/riscv/bli_riscv_detect_arch.h | grep '^[^\#]')
+RISCV_ABI      := $(shell $(CC) -E build/detect/riscv/bli_riscv_detect_abi.h | grep '^[^\#]')
+
+ifeq (,$(findstring 64,$(RISCV_ARCH)))
+$(error The RISC-V compiler architecture $(RISCV_ARCH) is not compatible with $(THIS_CONFIG))
+else ifeq (,$(findstring 64,$(RISCV_ABI)))
+$(error The RISC-V compiler ABI $(RISCV_ABI) is not compatible with $(THIS_CONFIG))
+endif
+
+CMISCFLAGS     := -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI)
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall -Wno-unused-function -Wfatal-errors
 
diff --git a/config/rv64iv/make_defs.mk b/config/rv64iv/make_defs.mk
index 1c9849fbe..9ec5a889a 100644
--- a/config/rv64iv/make_defs.mk
+++ b/config/rv64iv/make_defs.mk
@@ -46,7 +46,17 @@ THIS_CONFIG    := rv64iv
 # general-purpose/configuration-agnostic flags in common.mk. You
 # may specify additional flags here as needed.
 CPPROCFLAGS    := -DRISCV_SIZE=64
-CMISCFLAGS     := -march=$(shell $(CC) -DFORCE_RISCV_VECTOR -E frame/base/bli_riscv_detect_arch.h | grep '^[^\#]') -mabi=lp64d
+
+RISCV_ARCH     := $(shell $(CC) -DFORCE_RISCV_VECTOR -E build/detect/riscv/bli_riscv_detect_arch.h | grep '^[^\#]')
+RISCV_ABI      := $(shell $(CC) -DFORCE_RISCV_VECTOR -E build/detect/riscv/bli_riscv_detect_abi.h | grep '^[^\#]')
+
+ifeq (,$(findstring 64,$(RISCV_ARCH)))
+$(error The RISC-V compiler architecture $(RISCV_ARCH) is not compatible with $(THIS_CONFIG))
+else ifeq (,$(findstring 64,$(RISCV_ABI)))
+$(error The RISC-V compiler ABI $(RISCV_ABI) is not compatible with $(THIS_CONFIG))
+endif
+
+CMISCFLAGS     := -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI)
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall -Wno-unused-function -Wfatal-errors
 
diff --git a/configure b/configure
index 6938d47cd..f87093cad 100755
--- a/configure
+++ b/configure
@@ -1252,7 +1252,7 @@ auto_detect()
 	# Special case for RISC-V, whose architecture can be detected with
 	# preprocessor macros alone. This avoids having to run RISC-V binaries
 	# on a cross-compiler host. Returns "generic" if RISC-V not detected.
-	riscv_config=$(${cmd} -E "${dist_path}/frame/base/bli_riscv_cpuid.h" |
+	riscv_config=$(${cmd} -E "${dist_path}/build/detect/riscv/bli_riscv_cpuid.h" |
 	               grep '^[^#]')
 	if [[ $riscv_config != *generic* ]]; then
 		echo "${riscv_config}"

From 22ad8c1b752364784f320168b31995945ad84a59 Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Thu, 27 Jul 2023 16:23:29 -0400
Subject: [PATCH 159/230] Small fixes to support hpx in the testsuite (#759)

Details:
- Minor changes to test_libblis.c to support hpx.
---
 testsuite/src/test_libblis.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index eee28bdaf..b5948d64c 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -86,7 +86,6 @@ int main( int argc, char** argv )
 	libblis_test_read_ops_file( libblis_test_operations_filename, &ops );
 
 	// Walk through all test modules.
-	//libblis_test_all_ops( &params, &ops );
 	libblis_test_thread_decorator( &params, &ops );
 
 	// Finalize libblis.
@@ -110,7 +109,9 @@ typedef struct thread_data
 	unsigned int       id;
 	unsigned int       xc;
 	//pthread_mutex_t*   mutex;
+#ifdef BLIS_ENABLE_HPX
 	pthread_barrier_t* barrier;
+#endif
 } thread_data_t;
 #endif
 
@@ -133,6 +134,25 @@ void libblis_test_thread_decorator( test_params_t* params, test_ops_t* ops )
 {
 	err_t r_val;
 
+#ifdef BLIS_ENABLE_HPX
+
+	size_t tdata_size = ( size_t )params->n_app_threads *
+	                    ( size_t )sizeof( thread_data_t );
+	thread_data_t* tdata = bli_malloc_user( tdata_size, &r_val );
+
+	tdata->params  = params;
+	tdata->ops     = ops;
+	tdata->nt      = nt;
+	tdata->id      = 1;
+	tdata->xc      = 0;
+
+	// Walk through all test modules.
+	libblis_test_all_ops( tdata, params, ops );
+
+	bli_free_user( tdata );
+
+#else
+
 	// Query the total number of threads to simulate.
 	size_t nt = ( size_t )params->n_app_threads;
 
@@ -215,6 +235,8 @@ void libblis_test_thread_decorator( test_params_t* params, test_ops_t* ops )
 	#endif
 	//bli_free_user( mutex );
 	bli_free_user( barrier );
+
+#endif
 }
 
 
@@ -2396,8 +2418,10 @@ void libblis_test_op_driver
 				}
 			}
 
+#ifndef BLIS_ENABLE_HPX
 			// Wait for all other threads so that the output stays organized.
 			bli_pthread_barrier_wait( tdata->barrier );
+#endif
 
 			// These statements should only be executed by one thread.
 			if ( tdata->id == 0 )
@@ -2447,8 +2471,10 @@ void libblis_test_op_driver
 	if ( tdata->id == 0 )
 		op->test_done = TRUE;
 
+#ifndef BLIS_ENABLE_HPX
 	// Wait here so that all threads know we are done
 	bli_pthread_barrier_wait( tdata->barrier );
+#endif
 }
 
 
From 2db31e057e7e9c97fc60021b5ae72a01a48d7588 Mon Sep 17 00:00:00 2001
From: Lee Killough <15950023+leekillough@users.noreply.github.com>
Date: Thu, 27 Jul 2023 15:27:21 -0500
Subject: [PATCH 160/230] Exclude -lrt on Android with Bionic libraries. (#755)

Details:
- Added build/detect/android/bionic.h header to test whether the
  __BIONIC__ cpp macro is defined.
- In common.mk, only add -lrt to LDFLAGS when Bionic is not present.
- CREDITS file update.
---
 CREDITS                       |  1 +
 build/detect/android/bionic.h | 36 +++++++++++++++++++++++++++++++++++
 common.mk                     |  6 +++++-
 3 files changed, 42 insertions(+), 1 deletion(-)
 create mode 100644 build/detect/android/bionic.h

diff --git a/CREDITS b/CREDITS
index 9ffbee7e7..348988831 100644
--- a/CREDITS
+++ b/CREDITS
@@ -60,6 +60,7 @@ but many others have contributed code, ideas, and feedback, including
   Ivan Korostelev          @ivan23kor                 (University of Alberta)
   Kyungmin Lee             @kyungminlee               (Ohio State University)
   Michael Lehn             @michael-lehn
+                           @leo4678
   Shmuel Levine            @ShmuelLevine
                            @lschork2
   Dave Love                @loveshack
diff --git a/build/detect/android/bionic.h b/build/detect/android/bionic.h
new file mode 100644
index 000000000..e9a49610b
--- /dev/null
+++ b/build/detect/android/bionic.h
@@ -0,0 +1,36 @@
+/*
+  BLIS
+  An object-based framework for developing high-performance BLAS-like
+  libraries.
+
+  Copyright (C) 2023, The University of Texas at Austin
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+   - Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+   - Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.
+   - Neither the name(s) of the copyright holder(s) nor the names of its
+     contributors may be used to endorse or promote products derived
+     from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Detect Bionic on Android */
+#if __BIONIC__
+bionic
+#endif
diff --git a/common.mk b/common.mk
index e58eca8a8..853b11143 100644
--- a/common.mk
+++ b/common.mk
@@ -967,8 +967,13 @@ endif
 #
 
 ifeq ($(OS_NAME),Linux)
+# Exclude -lrt on Android by detecting Bionic.
+# $(CC) -E bionic.h returns a "bionic" substring iff Bionic is detected.
+BIONIC := $(findstring bionic,$(shell $(CC) -E build/detect/android/bionic.h))
+ifeq (,$(BIONIC))
 LDFLAGS += -lrt
 endif
+endif
 
 
@@ -1270,4 +1275,3 @@ BUILD_CPPFLAGS := -DBLIS_IS_BUILDING_LIBRARY
 
 # end of ifndef COMMON_MK_INCLUDED conditional block
 endif
-

From 915daaa43cd189c86d93d72cd249714f126e9425 Mon Sep 17 00:00:00 2001
From: Igor Zhuravlov <zhuravlov.ip@ya.ru>
Date: Thu, 27 Jul 2023 20:33:59 +0000
Subject: [PATCH 161/230] Fix typos in docs + example code comments. (#753)

Details:
- Fixed various typos in API documentation in docs/BLIS*API.md and
  comments in the source code examples within examples/?api/*.c.
---
 docs/BLISObjectAPI.md     | 104 +++++++++++++++++++-------------------
 docs/BLISTypedAPI.md      |  12 ++---
 examples/oapi/06level1m.c |   2 +-
 examples/oapi/10util.c    |   2 +-
 examples/tapi/01level1m.c |   2 +-
 5 files changed, 61 insertions(+), 61 deletions(-)

diff --git a/docs/BLISObjectAPI.md b/docs/BLISObjectAPI.md
index 51f5753a0..57c96ccdd 100644
--- a/docs/BLISObjectAPI.md
+++ b/docs/BLISObjectAPI.md
@@ -85,57 +85,57 @@ The following tables list various types used throughout the BLIS object API.
 
 ### Floating-point types
 
-| BLIS fp type      | Type definition                        | Used to represent...                 |
-|:------------------|:---------------------------------------|:-------------------------------------|
-| `float`           | _N/A_                                  | single-precision real numbers        |
-| `double`          | _N/A_                                  | double-precision real numbers        |
-| `scomplex`        | `struct { float real; float imag; }`   | single-precision complex numbers     |
-| `dcomplex`        | `struct { double real; double imag; }` | double-precision complex numbers     |
+| BLIS fp type      | Type definition                        | Used to represent...              |
+|:------------------|:---------------------------------------|:----------------------------------|
+| `float`           | _N/A_                                  | single-precision real numbers.    |
+| `double`          | _N/A_                                  | double-precision real numbers.    |
+| `scomplex`        | `struct { float real; float imag; }`   | single-precision complex numbers. |
+| `dcomplex`        | `struct { double real; double imag; }` | double-precision complex numbers. |
 
 ### Enumerated parameter types
 
-| `num_t`         | Semantic meaning: Matrix/vector operand...              |
-|:----------------|:--------------------------------------------------------|
-| `BLIS_FLOAT`    | contains single-precision real elements.                |
-| `BLIS_DOUBLE`   | contains double-precision real elements.                |
-| `BLIS_SCOMPLEX` | contains single-precision complex elements.             |
-| `BLIS_DCOMPLEX` | contains double-precision complex elements.             |
-| `BLIS_INT`      | contains integer elements of type `gint_t`.             |
-| `BLIS_CONSTANT` | contains polymorphic representation of a constant value |
-
-| `dom_t`         | Semantic meaning: Matrix/vector operand...  |
-|:----------------|:--------------------------------------------|
-| `BLIS_REAL`     | contains real domain elements.              |
-| `BLIS_COMPLEX`  | contains complex domain elements.           |
-
-| `prec_t`           | Semantic meaning: Matrix/vector operand...  |
-|:-------------------|:--------------------------------------------|
-| `BLIS_SINGLE_PREC` | contains single-precision elements.         |
-| `BLIS_DOUBLE_PREC` | contains double-precision elements.         |
-
-| `trans_t`                | Semantic meaning: Matrix operand ...              |
-|:-------------------------|:--------------------------------------------------|
-| `BLIS_NO_TRANSPOSE`      | will be used as given.                            |
-| `BLIS_TRANSPOSE`         | will be implicitly transposed.                    |
-| `BLIS_CONJ_NO_TRANSPOSE` | will be implicitly conjugated.                    |
-| `BLIS_CONJ_TRANSPOSE`    | will be implicitly transposed _and_ conjugated.   |
-
-| `conj_t`             | Semantic meaning: Matrix/vector operand...               |
-|:---------------------|:---------------------------------------------------------|
-| `BLIS_NO_CONJUGATE`  | will be used as given.                                   |
-| `BLIS_CONJUGATE`     | will be implicitly conjugated.                           |
-
-| `side_t`     | Semantic meaning: Matrix operand...                |
-|:-------------|:---------------------------------------------------|
-| `BLIS_LEFT`  | appears on the left.                               |
-| `BLIS_RIGHT` | appears on the right.                              |
-
-| `struc_t`         | Semantic meaning: Matrix operand...                               |
-|:------------------|:------------------------------------------------------------------|
-| `BLIS_GENERAL`    | has no structure.                                                 |
-| `BLIS_HERMITIAN`  | has Hermitian structure.                                          |
-| `BLIS_SYMMETRIC`  | has symmetric structure.                                          |
-| `BLIS_TRIANGULAR` | has triangular structure.                                         |
+| `num_t`         | Semantic meaning: Matrix/vector operand...               |
+|:----------------|:---------------------------------------------------------|
+| `BLIS_FLOAT`    | contains single-precision real elements.                 |
+| `BLIS_DOUBLE`   | contains double-precision real elements.                 |
+| `BLIS_SCOMPLEX` | contains single-precision complex elements.              |
+| `BLIS_DCOMPLEX` | contains double-precision complex elements.              |
+| `BLIS_INT`      | contains integer elements of type `gint_t`.              |
+| `BLIS_CONSTANT` | contains polymorphic representation of a constant value. |
+
+| `dom_t`         | Semantic meaning: Matrix/vector operand... |
+|:----------------|:-------------------------------------------|
+| `BLIS_REAL`     | contains real domain elements.             |
+| `BLIS_COMPLEX`  | contains complex domain elements.          |
+
+| `prec_t`           | Semantic meaning: Matrix/vector operand... |
+|:-------------------|:-------------------------------------------|
+| `BLIS_SINGLE_PREC` | contains single-precision elements.        |
+| `BLIS_DOUBLE_PREC` | contains double-precision elements.        |
+
+| `trans_t`                | Semantic meaning: Matrix operand ...            |
+|:-------------------------|:------------------------------------------------|
+| `BLIS_NO_TRANSPOSE`      | will be used as given.                          |
+| `BLIS_TRANSPOSE`         | will be implicitly transposed.                  |
+| `BLIS_CONJ_NO_TRANSPOSE` | will be implicitly conjugated.                  |
+| `BLIS_CONJ_TRANSPOSE`    | will be implicitly transposed _and_ conjugated. |
+
+| `conj_t`             | Semantic meaning: Matrix/vector operand... |
+|:---------------------|:-------------------------------------------|
+| `BLIS_NO_CONJUGATE`  | will be used as given.                     |
+| `BLIS_CONJUGATE`     | will be implicitly conjugated.             |
+
+| `side_t`     | Semantic meaning: Matrix operand... |
+|:-------------|:------------------------------------|
+| `BLIS_LEFT`  | appears on the left.                |
+| `BLIS_RIGHT` | appears on the right.               |
+
+| `struc_t`         | Semantic meaning: Matrix operand... |
+|:------------------|:------------------------------------|
+| `BLIS_GENERAL`    | has no structure.                   |
+| `BLIS_HERMITIAN`  | has Hermitian structure.            |
+| `BLIS_SYMMETRIC`  | has symmetric structure.            |
+| `BLIS_TRIANGULAR` | has triangular structure.           |
 
 | `uplo_t`     | Semantic meaning: Matrix operand...                               |
 |:-------------|:------------------------------------------------------------------|
@@ -433,7 +433,7 @@ Return the storage datatype property of `obj`.
 ---
 
 ```c
-dom_t bli_obj_dom( obj_t* obj );
+dom_t bli_obj_domain( obj_t* obj );
 ```
 Return the domain component of the storage datatype property of `obj`.
 
@@ -764,7 +764,7 @@ void bli_axpbyv
        obj_t*  x,
        obj_t*  beta,
        obj_t*  y
-     )
+     );
 ```
 Perform
 ```
@@ -991,7 +991,7 @@ void bli_xpbyv
        obj_t*  x,
        obj_t*  beta,
        obj_t*  y
-     )
+     );
 ```
 Perform
 ```
@@ -2187,7 +2187,7 @@ void bli_getsc
        obj_t*  chi,
        double* zeta_r,
        double* zeta_i
-     )
+     );
 ```
 Copy the real and imaginary values from the scalar object `chi` to `zeta_r` and `zeta_i`. If `chi` is stored as a real type, then `zeta_i` is set to zero. (If `chi` is stored in single precision, the corresponding elements are typecast/promoted during the copy.)
 
diff --git a/docs/BLISTypedAPI.md b/docs/BLISTypedAPI.md
index 497776a15..5a7b90f29 100644
--- a/docs/BLISTypedAPI.md
+++ b/docs/BLISTypedAPI.md
@@ -287,7 +287,7 @@ void bli_?axpbyv
        ctype*  x, inc_t incx,
        ctype*  beta,
        ctype*  y, inc_t incy
-     )
+     );
 ```
 Perform
 ```
@@ -482,7 +482,7 @@ void bli_?xpbyv
        ctype*  x, inc_t incx,
        ctype*  beta,
        ctype*  y, inc_t incy
-     )
+     );
 ```
 Perform
 ```
@@ -1756,7 +1756,7 @@ void bli_getsc
        ctype*  chi,
        double* zeta_r,
        double* zeta_i
-     )
+     );
 ```
 Copy the real and imaginary values from the scalar object `chi` to `zeta_r` and `zeta_i`. If `chi` is stored as a real type, then `zeta_i` is set to zero. (If `chi` is stored in single precision, the corresponding elements are typecast/promoted during the copy.)
 
@@ -1770,7 +1770,7 @@ err_t bli_?getijv
        ctype*  x, incx,
        double* ar,
        double* ai
-     )
+     );
 ```
 Copy the real and imaginary values at the `i`th element of vector `x` to `ar` and `ai`. For real domain invocations, only `ar` is overwritten and `ai` is left unchanged. (If `x` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.)
 Note that the object-based analogue of [getijv](BLISObjectAPI.md#getijv) does bounds checking of the vector element offset `i` against the vector length while the typed functions specified above do not (since the vector length is not given).
@@ -1786,7 +1786,7 @@ err_t bli_?getijm
        ctype*  b, inc_t rs_b, inc_t cs_b,
        double* ar,
        double* ai
-     )
+     );
 ```
 Copy the real and imaginary values at the (`i`,`j`) element of object `b` to `ar` and `ai`. For real domain invocations, only `ar` is overwritten and `ai` is left unchanged. (If `b` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.)
 Note that the object-based analogue of [getijm](BLISObjectAPI.md#getijm) does bounds checking of the matrix element offsets (`i`,`j`) against the matrix dimensions while the typed functions specified above do not (since the matrix dimensions are not given).
@@ -1881,7 +1881,7 @@ void bli_?eqm
        ctype*  a, inc_t rs_a, inc_t cs_a,
        ctype*  b, inc_t rs_b, inc_t cs_b,
        bool*   is_eq
-     )
+     );
 ```
 Perform an element-wise comparison between matrices `A` and `B` and store the boolean result in the `bool` pointed to by `is_eq`.
 Here, `B` is an _m x n_ matrix, `A` is stored as a dense matrix, or lower- or upper-triangular/trapezoidal matrix with arbitrary diagonal offset and unit or non-unit diagonal.
diff --git a/examples/oapi/06level1m.c b/examples/oapi/06level1m.c
index 15e35aa09..9d57d3149 100644
--- a/examples/oapi/06level1m.c
+++ b/examples/oapi/06level1m.c
@@ -169,7 +169,7 @@ int main( int argc, char** argv )
 	//bli_obj_apply_trans( trans, &e );
 
 	// Copy 'e' to 'f', transposing 'e' in the process. Notice that we haven't
-	// modified any properties of 'd'. It's the source operand that matters
+	// modified any properties of 'f'. It's the source operand that matters
 	// when marking an operand for transposition, not the destination.
 	bli_copym( &e, &f );
 
diff --git a/examples/oapi/10util.c b/examples/oapi/10util.c
index 6fd42e22f..2b9226f7b 100644
--- a/examples/oapi/10util.c
+++ b/examples/oapi/10util.c
@@ -115,7 +115,7 @@ int main( int argc, char** argv )
 
 	bli_printm( "a:", &a, "%4.1f", "" );
 
-	// Compute the one-norm of 'a'.
+	// Compute the one, infinity, and frobenius norms of 'a'.
 	bli_norm1m( &a, &norm1 );
 	bli_normim( &a, &normi );
 	bli_normfm( &a, &normf );
diff --git a/examples/tapi/01level1m.c b/examples/tapi/01level1m.c
index 4b81a57a1..d3a5d8287 100644
--- a/examples/tapi/01level1m.c
+++ b/examples/tapi/01level1m.c
@@ -171,7 +171,7 @@ int main( int argc, char** argv )
 
 
 	// Copy 'e' to 'f', transposing 'e' in the process. Notice that we haven't
-	// modified any properties of 'd'. It's the source operand that matters
+	// modified any properties of 'f'. It's the source operand that matters
 	// when marking an operand for transposition, not the destination.
 	bli_dcopym( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, BLIS_TRANSPOSE,
 	            n, m, e, rs, cs, f, rsf, csf );

From dbc79812c390f812c7bf030bfcf87e947a1443c4 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Fri, 28 Jul 2023 18:16:38 -0500
Subject: [PATCH 162/230] CREDITS file update.

Details:
- Thanks to Igor Zhuravlov for PR #753 (commit 915daaa).
---
 CREDITS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CREDITS b/CREDITS
index 348988831..4daf19fb3 100644
--- a/CREDITS
+++ b/CREDITS
@@ -133,6 +133,7 @@ but many others have contributed code, ideas, and feedback, including
   Roman Yurchak            @rth                       (Symerio)
   Stefano Zampini          @stefanozampini
   M. Zhou                  @cdluminate
+  Igor Zhuravlov           @jip                       (Far Eastern Federal University)
 
 BLIS's development was partially funded by grants from industry
 partners, including

From 3cf17b4a91232709bc6a205b0e4d7ecc96579aa9 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 7 Aug 2023 13:46:20 -0500
Subject: [PATCH 163/230] Small fixes/improvements to docs/Multithreading.md.
 (#764)

Details:
- Added reminders that #include "blis.h" must be added to source files
  in order to access BLIS API function prototypes. Thanks to Barry Smith
  for suggesting this improvement.
- Fixed pre-existing typos.
- CREDITS file update.
---
 CREDITS                |  1 +
 docs/Multithreading.md | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/CREDITS b/CREDITS
index 4daf19fb3..18af35928 100644
--- a/CREDITS
+++ b/CREDITS
@@ -104,6 +104,7 @@ but many others have contributed code, ideas, and feedback, including
   Rene Sitt
   Tony Skjellum            @tonyskjellum              (The University of Tennessee at Chattanooga)
   Mikhail Smelyanskiy                                 (Intel, Parallel Computing Lab)
+  Barry Smith              @BarrySmith                (Argonne National Laboratory)
   Nathaniel Smith          @njsmith
   Shaden Smith             @ShadenSmith
   Tyler Smith              @tlrmchlsmth               (The University of Texas at Austin)
diff --git a/docs/Multithreading.md b/docs/Multithreading.md
index 1a46f6556..59b268878 100644
--- a/docs/Multithreading.md
+++ b/docs/Multithreading.md
@@ -197,6 +197,8 @@ If `BLIS_THREAD_IMPL` is not set, BLIS will attempt to query its shorthand alter
 
 ## Globally at runtime
 
+***Note:** If you want to gain access to BLIS API function prototypes, be sure to #include "blis.h" from the relevant source files in your application.*
+
 If you still wish to set the parallelization scheme globally, but you want to do so at runtime, BLIS provides a thread-safe API for specifying multithreading. Think of these functions as a way to modify the same internal data structure into which the environment variables are read. (Recall that the environment variables are only read once, when BLIS is initialized).
 
 **Note**: If you set parallelization globally via environment variables and *then* your application *also* uses the global runtime API to set the ways of parallelism, the global runtime API will prevail.
@@ -246,7 +248,7 @@ This will result in both OpenMP and pthreads implementations being compiled and
 ```c
 void bli_thread_set_thread_impl( timpl_t ti );
 ```
-The function takes a `timpl_t`, which is an enumerated type that has three valid values corresponding to the four possible threading implementations: `BLIS_OPENMP`, `BLIS_POSIX`, `BLIS_HPX`, and `BLIS_SINGLE`. Forcing use of pthreads is as simple as calling:
+The function takes a `timpl_t`, which is an enumerated type that has four valid values corresponding to the four possible threading implementations: `BLIS_OPENMP`, `BLIS_POSIX`, `BLIS_HPX`, and `BLIS_SINGLE`. Forcing use of pthreads is as simple as calling:
 ```c
 bli_thread_set_thread_impl( BLIS_POSIX )
 ```
@@ -258,7 +260,9 @@ Note that if `BLIS_SINGLE` is specified, any other-related parameters previously
 
 ## Locally at runtime
 
-In addition to the global methods based on environment variables and runtime function calls, BLIS also offers a local, *per-call* method of requesting parallelism at runtime. This method has the benefit of being thread-safe and flexible; your application can spawn two threads at the application level, with each thread requesting different degrees of parallelism from their respective calls to level-3 BLIS operations.
+***Note:** If you want to gain access to BLIS API function prototypes, be sure to #include "blis.h" from the relevant source files in your application.*
+
+In addition to the global methods based on environment variables and runtime function calls, BLIS also offers a local, *per-call* method of requesting parallelism at runtime. This method has the benefit of being thread-safe and flexible; your application can spawn two or more threads at the application level, with each thread requesting different degrees of parallelism from their respective calls to level-3 BLIS operations.
 
 As with environment variables and the global runtime API, there are two ways to specify parallelism: the automatic way and the manual way. Both ways involve allocating a BLIS-specific object, initializing the object and encoding the desired parallelization, and then passing a pointer to the object into one of the expert interfaces of either the [typed](docs/BLISTypedAPI.md) or [object](docs/BLISObjectAPI) APIs. We provide examples of utilizing this threading object below.
 
@@ -321,7 +325,7 @@ This will result in both OpenMP and pthreads implementations being compiled and
 ```c
 void bli_rntm_set_thread_impl( timpl_t ti, rntm_t* rntm );
 ```
-The function takes a `timpl_t`, which is an enumerated type that has three valid values corresponding to the four possible threading implementations: `BLIS_OPENMP`, `BLIS_POSIX`, `BLIS_HPX`, and `BLIS_SINGLE`. Forcing use of pthreads is as simple as calling:
+The function takes a `timpl_t`, which is an enumerated type that has four valid values corresponding to the four possible threading implementations: `BLIS_OPENMP`, `BLIS_POSIX`, `BLIS_HPX`, and `BLIS_SINGLE`. Forcing use of pthreads is as simple as calling:
 ```c
 bli_rntm_set_thread_impl( BLIS_POSIX, &rntm );
 ```

From 634e532c8dcce7383d96ba33276df65c656b2198 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 9 Aug 2023 21:54:49 -0500
Subject: [PATCH 164/230] Set thrcomm timpl_t id inside init functions. (#766)

Details:
- Previously, the timpl_t id being used when a thrcomm_t is being
  initialized was set within the bli_thrcomm_init() dispatch function
  after the timpl_t-specific bli_thrcomm_init_*() function returned. But
  it just occurred to me that each bli_thrcomm_init_*() function already
  intrinsically knows its own timpl_t value. This commit shifts the
  setting of the thrcomm_t.ti field into the corresponding
  bli_thrcomm_init_*() function for each timpl_t type (e.g. single,
  openmp, pthreads, hpx).
- Removed long-deprecated code dating back nearly 10 years.
- Whitespace changes.
- Comment updates.
---
 frame/thread/bli_thrcomm.c          |  7 ++---
 frame/thread/bli_thrcomm_hpx.cpp    | 16 +++++++++--
 frame/thread/bli_thrcomm_openmp.c   | 43 ++++++++++-------------------
 frame/thread/bli_thrcomm_pthreads.c | 37 ++++++++++---------------
 frame/thread/bli_thrcomm_single.c   |  1 +
 5 files changed, 46 insertions(+), 58 deletions(-)

diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c
index 79618f1a8..30257db39 100644
--- a/frame/thread/bli_thrcomm.c
+++ b/frame/thread/bli_thrcomm.c
@@ -149,10 +149,9 @@ void bli_thrcomm_init( timpl_t ti, dim_t nt, thrcomm_t* comm )
 	// Call the threading-specific init function.
 	fp( nt, comm );
 
-	// Embed the type of threading implementation within the thrcomm_t struct.
-	// Note that we wait until after the init function has returned in case
-	// that function zeros out the entire struct before setting the fields.
-	comm->ti = ti;
+	// NOTE: The init function that just returned intrinsically knows its
+	// timpl_t value, thus is able to set that value without us explicitly
+	// passing it in.
 }
 
 void bli_thrcomm_cleanup( thrcomm_t* comm )
diff --git a/frame/thread/bli_thrcomm_hpx.cpp b/frame/thread/bli_thrcomm_hpx.cpp
index d9fb258c2..323871ef8 100644
--- a/frame/thread/bli_thrcomm_hpx.cpp
+++ b/frame/thread/bli_thrcomm_hpx.cpp
@@ -46,12 +46,20 @@ extern "C" {
 void bli_thrcomm_init_hpx( dim_t n_threads, thrcomm_t* comm )
 {
 	if ( comm == nullptr ) return;
+
+	//comm->sent_object             = nullptr;
+	//comm->n_threads               = n_threads;
+	comm->ti                      = BLIS_HPX;
+	//comm->barrier_sense           = 0;
+	//comm->barrier_threads_arrived = 0;
+
 	comm->barrier = new hpx:barrier<>();
 }
 
 void bli_thrcomm_cleanup_hpx( thrcomm_t* comm )
 {
 	if ( comm == nullptr ) return;
+
 	delete comm->barrier;
 }
 
@@ -69,9 +77,11 @@ void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
 void bli_thrcomm_init_hpx( dim_t n_threads, thrcomm_t* comm )
 {
 	if ( comm == nullptr ) return;
-	comm->sent_object = nullptr;
-	comm->n_threads = n_threads;
-	comm->barrier_sense = 0;
+
+	comm->sent_object             = nullptr;
+	comm->n_threads               = n_threads;
+	comm->ti                      = BLIS_HPX;
+	comm->barrier_sense           = 0;
 	comm->barrier_threads_arrived = 0;
 }
 
diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c
index 487832cd1..382d6c9a7 100644
--- a/frame/thread/bli_thrcomm_openmp.c
+++ b/frame/thread/bli_thrcomm_openmp.c
@@ -46,43 +46,22 @@
 void bli_thrcomm_init_openmp( dim_t n_threads, thrcomm_t* comm )
 {
 	if ( comm == NULL ) return;
-	comm->sent_object = NULL;
-	comm->n_threads = n_threads;
-	comm->barrier_sense = 0;
+
+	comm->sent_object             = NULL;
+	comm->n_threads               = n_threads;
+	comm->ti                      = BLIS_OPENMP;
+	comm->barrier_sense           = 0;
 	comm->barrier_threads_arrived = 0;
 }
 
 
 void bli_thrcomm_cleanup_openmp( thrcomm_t* comm )
 {
-	//if ( comm == NULL ) return;
 	return;
 }
 
-//'Normal' barrier for openmp
-//barrier routine taken from art of multicore programming
 void bli_thrcomm_barrier_openmp( dim_t t_id, thrcomm_t* comm )
 {
-#if 0
-	if ( comm == NULL || comm->n_threads == 1 )
-		return;
-	gint_t my_sense = comm->barrier_sense;
-	dim_t my_threads_arrived;
-
-	_Pragma( "omp atomic capture" )
-		my_threads_arrived = ++(comm->barrier_threads_arrived);
-
-	if ( my_threads_arrived == comm->n_threads )
-	{
-		comm->barrier_threads_arrived = 0;
-		comm->barrier_sense = !comm->barrier_sense;
-	}
-	else
-	{
-		volatile gint_t* listener = &comm->barrier_sense;
-		while ( *listener == my_sense ) {}
-	}
-#endif
 	bli_thrcomm_barrier_atomic( t_id, comm );
 }
 
@@ -96,19 +75,27 @@ void bli_thrcomm_init_openmp( dim_t n_threads, thrcomm_t* comm )
 	err_t r_val;
 
 	if ( comm == NULL ) return;
-	comm->sent_object = NULL;
-	comm->n_threads = n_threads;
+
+	comm->sent_object             = NULL;
+	comm->n_threads               = n_threads;
+	comm->ti                      = BLIS_OPENMP;
+	//comm->barrier_sense           = 0;
+	//comm->barrier_threads_arrived = 0;
+
 	comm->barriers = bli_malloc_intl( sizeof( barrier_t* ) * n_threads, &r_val );
+
 	bli_thrcomm_tree_barrier_create( n_threads, BLIS_TREE_BARRIER_ARITY, comm->barriers, 0 );
 }
 
 void bli_thrcomm_cleanup_openmp( thrcomm_t* comm )
 {
 	if ( comm == NULL ) return;
+
 	for ( dim_t i = 0; i < comm->n_threads; i++ )
 	{
 	   bli_thrcomm_tree_barrier_free( comm->barriers[i] );
 	}
+
 	bli_free_intl( comm->barriers );
 }
 
diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c
index 39b15d590..8e45a2782 100644
--- a/frame/thread/bli_thrcomm_pthreads.c
+++ b/frame/thread/bli_thrcomm_pthreads.c
@@ -45,14 +45,20 @@
 void bli_thrcomm_init_pthreads( dim_t n_threads, thrcomm_t* comm )
 {
 	if ( comm == NULL ) return;
-	comm->sent_object = NULL;
-	comm->n_threads = n_threads;
+
+	comm->sent_object             = NULL;
+	comm->n_threads               = n_threads;
+	comm->ti                      = BLIS_POSIX;
+	//comm->barrier_sense           = 0;
+	//comm->barrier_threads_arrived = 0;
+
 	bli_pthread_barrier_init( &comm->barrier, NULL, n_threads );
 }
 
 void bli_thrcomm_cleanup_pthreads( thrcomm_t* comm )
 {
 	if ( comm == NULL ) return;
+
 	bli_pthread_barrier_destroy( &comm->barrier );
 }
 
@@ -70,36 +76,21 @@ void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
 void bli_thrcomm_init_pthreads( dim_t n_threads, thrcomm_t* comm )
 {
 	if ( comm == NULL ) return;
-	comm->sent_object = NULL;
-	comm->n_threads = n_threads;
-	comm->barrier_sense = 0;
+
+	comm->sent_object             = NULL;
+	comm->n_threads               = n_threads;
+	comm->ti                      = BLIS_POSIX;
+	comm->barrier_sense           = 0;
 	comm->barrier_threads_arrived = 0;
 }
 
 void bli_thrcomm_cleanup_pthreads( thrcomm_t* comm )
 {
+	return;
 }
 
 void bli_thrcomm_barrier_pthreads( dim_t t_id, thrcomm_t* comm )
 {
-#if 0
-	if ( comm == NULL || comm->n_threads == 1 ) return;
-	bool  my_sense = comm->sense;
-	dim_t my_threads_arrived;
-
-	my_threads_arrived = __sync_add_and_fetch(&(comm->threads_arrived), 1);
-
-	if ( my_threads_arrived == comm->n_threads )
-	{
-		comm->threads_arrived = 0;
-		comm->sense = !comm->sense;
-	}
-	else
-	{
-		volatile bool* listener = &comm->sense;
-		while( *listener == my_sense ) {}
-	}
-#endif
 	bli_thrcomm_barrier_atomic( t_id, comm );
 }
 
diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c
index cb12e37f3..3116a5d06 100644
--- a/frame/thread/bli_thrcomm_single.c
+++ b/frame/thread/bli_thrcomm_single.c
@@ -41,6 +41,7 @@ void bli_thrcomm_init_single( dim_t n_threads, thrcomm_t* comm )
 
 	comm->sent_object             = NULL;
 	comm->n_threads               = n_threads;
+	comm->ti                      = BLIS_SINGLE;
 	comm->barrier_sense           = 0;
 	comm->barrier_threads_arrived = 0;
 }

From fa6a9b24ae2ddbd5f30f657d46004843581c768c Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Sat, 19 Aug 2023 12:44:34 -0500
Subject: [PATCH 165/230] Fixed error when using common.mk from testsuite.
 (#768)

Details:
- Commit 2db31e0 (#755) inserted logic into common.mk that attempts to
  preprocess build/detect/android/bionic.h to determine whether the
  __BIONIC__ macro is defined (in which case -lrt should not be included
  in LDFLAGS). However, the path to bionic.h was encoded without regard
  to DIST_PATH, and so utilizing common.mk anywhere that isn't the top-
  level directory (such as in the testsuite directory) resulted in a
  compiler error:

    gcc: error: build/detect/android/bionic.h: No such file or directory
    gcc: fatal error: no input files
    compilation terminated.

  This commit adds a $(DIST_PATH) prefix to the path to bionic.h so that
  it can be located from other applications' Makefiles that use BLIS's
  makefile fragments.
---
 common.mk | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/common.mk b/common.mk
index 853b11143..de19e7cc0 100644
--- a/common.mk
+++ b/common.mk
@@ -969,7 +969,8 @@ endif
 ifeq ($(OS_NAME),Linux)
 # Exclude -lrt on Android by detecting Bionic.
 # $(CC) -E bionic.h returns a "bionic" substring iff Bionic is detected.
-BIONIC := $(findstring bionic,$(shell $(CC) -E build/detect/android/bionic.h))
+BIONIC_H_PATH := $(DIST_PATH)/build/detect/android/bionic.h
+BIONIC := $(findstring bionic,$(shell $(CC) -E $(BIONIC_H_PATH)))
 ifeq (,$(BIONIC))
 LDFLAGS += -lrt
 endif

From 6dcf7666eff14348e82fbc2750be4b199321e1b9 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Sun, 27 Aug 2023 14:18:57 -0500
Subject: [PATCH 166/230] Revamped bli_init() to use TLS where feasible. (#767)

Details:
- Revamped bli_init_apis() and bli_finalize_apis() to use separate
  bli_pthread_switch_t objects for each of the five sub-API init
  functions, with the objects for the 'ind' and 'rntm' sub-APIs being
  declared with BLIS_THREAD_LOCAL. This allows some APIs to be treated
  as thread-local and the rest as thread-shared. Thanks to Edward Smyth
  for requesting application thread-specific rntm_t structs, which
  inspired these changes.
- Combined bli_thread_init_from_env() and bli_pack_init_from_env() into
  a new function, bli_rntm_init_rntm_from_env(), and placed the combined
  code in bli_rntm.c inside of a new bli_rntm_init() function. Then
  removed the (now empty) bli_pack_init() and _finalize() function defs.
- Deprecated bli_rntm_init() for the purposes of initializing a rntm_t
  (temporarily preserving it as bli_rntm_clear() in a cpp-undefined code
  block) so that the function name could be used for the aforementioned
  bli_rntm_init() function.
- Updated libblis_test_pobj_create() in test_libblis.c to use a static
  rntm_t initializer instead of the deprecated bli_rntm_init()
  function-based option.
- Minor updates to docs/Multithreading.md, including removal of
  bli_rntm_init() in the example of how to initialize rntm_t structs.
- Changed the return value of bli_gks_init(), bli_ind_init(),
  bli_memsys_init(), bli_thread_init(), and bli_rntm_init() (and their
  finalize() counterparts) from 'void' to 'int' so that those functions
  match the function type expected by bli_pthread_switch_on()/_off().
  Those init/finalize functions now return 0 to indicate success, which
  is needed so that the switch actually changes state from off to on
  and vice versa.
- Defined bli_thread_reset(), which copies the contents of the
  global_rntm_at_init struct into the global_rntm struct (for the
  current application thread).
- Guard calls to bli_pthread_mutex_lock()/_unlock() in
  - bli_pack_set_pack_a() and _pack_b()
  - bli_rntm_init_from_global()
  - bli_thread_set_ways()
  - bli_thread_set_num_threads()
  - bli_thread_set_thread_impl()
  - bli_thread_reset()
  - bli_l3_ind_oper_set_enable()
  with #ifdef BLIS_DISABLE_TLS (since TLS precludes the possibility of
  race conditions).
- In frame/base/bli_rntm.c, declare global_rntm, global_rntm_at_init,
  and global_rntm_mutex as BLIS_THREAD_LOCAL so that separate
  application threads can change the number of ways of BLIS parallelism
  independently from one another.
- Access global_rntm only via a new private (not exported) function,
  bli_global_rntm(). Defined a similar function for a rntm_t new to
  this commit, global_rntm_at_init, which preserves the state of the
  global rntm at initialization-time.
- In frame/3/bli_l3_ind.c, added a guard to the declaration of the
  static variable oper_st_mutex with #ifdef BLIS_DISABLE_TLS so that the
  mutex is omitted altogether when TLS is enabled (which prevents the
  compiler from warning about an unused variable).
- Removed redundant code from bli_thread.c:
    #ifdef BLIS_ENABLE_HPX
    #include "bli_thread_hpx.h"
    #endif
  since this code is already present in bli_thread.h.
- Thanks to Minh Quan Ho for his review of and feedback on this commit.
- Comment updates.
---
 docs/Multithreading.md       |   6 +-
 frame/3/bli_l3_ind.c         |  20 ++--
 frame/base/bli_gks.c         |  20 +++-
 frame/base/bli_gks.h         |   4 +-
 frame/base/bli_ind.c         |  13 ++-
 frame/base/bli_ind.h         |   4 +-
 frame/base/bli_init.c        |  34 +++---
 frame/base/bli_memsys.c      |  12 +-
 frame/base/bli_memsys.h      |   6 +-
 frame/base/bli_pack.c        | 100 ++++------------
 frame/base/bli_pack.h        |   5 -
 frame/base/bli_rntm.c        | 205 +++++++++++++++++++++++++++++++--
 frame/base/bli_rntm.h        |  13 ++-
 frame/thread/bli_thread.c    | 214 ++++++++++++-----------------------
 frame/thread/bli_thread.h    |   7 +-
 testsuite/src/test_libblis.c |   3 +-
 16 files changed, 383 insertions(+), 283 deletions(-)

diff --git a/docs/Multithreading.md b/docs/Multithreading.md
index 59b268878..6f2ef49c5 100644
--- a/docs/Multithreading.md
+++ b/docs/Multithreading.md
@@ -205,6 +205,8 @@ If you still wish to set the parallelization scheme globally, but you want to do
 
 **Note**: Regardless of which way ([automatic](Multithreading.md#globally-at-runtime-the-automatic-way) or [manual](Multithreading.md#globally-at-runtime-the-manual-way)) the global runtime API is used to specify multithreading, that specification will affect operation of BLIS through **both** the BLAS compatibility layer as well as the native ([typed](docs/BLISTypedAPI.md) and [object](docs/BLISObjectAPI.md)) APIs that are unique to BLIS.
 
+If BLIS is being used by two or more application-level threads, each of those application threads will track their own global state for the purpose of specifying parallelism. We felt this makes sense because each application thread may wish to specify a different parallelization scheme without affecting the scheme for the other application thread(s).
+
 ### Globally at runtime: the automatic way
 
 If you simply want to specify an overall number of threads and let BLIS choose a thread factorization automatically, use the following function:
@@ -281,10 +283,6 @@ If you want to initialize it as part of the declaration, you may do so via the d
 ```c
 rntm_t rntm = BLIS_RNTM_INITIALIZER;
 ```
-Alternatively, you can perform the same initialization by passing the address of the `rntm_t` to an initialization function:
-```c
-bli_rntm_init( &rntm );
-```
 As of this writing, BLIS treats a default-initialized `rntm_t` as a request for single-threaded execution.
 
 **Note**: If you choose to **not** initialize the `rntm_t` object and then pass it into a level-3 operation, **you will almost surely observe undefined behavior!** Please don't do this!
diff --git a/frame/3/bli_l3_ind.c b/frame/3/bli_l3_ind.c
index 9ccaf3515..73e366137 100644
--- a/frame/3/bli_l3_ind.c
+++ b/frame/3/bli_l3_ind.c
@@ -36,8 +36,8 @@
 #include "blis.h"
 
 // This array tracks whether a particular operation is implemented for each of
-// the induced methods.
-static bool bli_l3_ind_oper_impl[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] =
+// the induced methods. This array is meant to be read-only.
+static const bool bli_l3_ind_oper_impl[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] =
 {
         /*   gemm  gemmt  hemm  herk  her2k  symm  syrk  syr2k  trmm3  trmm  trsm  */
 /* 1m   */ { TRUE, TRUE,  TRUE, TRUE, TRUE,  TRUE, TRUE, TRUE,  TRUE,  TRUE, TRUE  },
@@ -64,6 +64,11 @@ bool bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] =
              {TRUE,TRUE},   {TRUE,TRUE},   {TRUE,TRUE},   {TRUE,TRUE},   {TRUE,TRUE}    },
 };
 
+// A mutex to allow synchronous access to the bli_l3_ind_oper_st array.
+#ifdef BLIS_DISABLE_TLS
+static bli_pthread_mutex_t oper_st_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
+#endif
+
 // -----------------------------------------------------------------------------
 
 #undef  GENFUNC
@@ -191,9 +196,6 @@ void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status )
 
 // -----------------------------------------------------------------------------
 
-// A mutex to allow synchronous access to the bli_l3_ind_oper_st array.
-static bli_pthread_mutex_t oper_st_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
-
 void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status )
 {
 	num_t idt;
@@ -218,8 +220,11 @@ void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool statu
 
 	idt = bli_ind_map_cdt_to_index( dt );
 
-	// Acquire the mutex protecting bli_l3_ind_oper_st.
+	// If TLS is disabled, we need to use a mutex to protect the status array
+	// since it will be shared with all application threads.
+	#ifdef BLIS_DISABLE_TLS
 	bli_pthread_mutex_lock( &oper_st_mutex );
+	#endif
 
 	// BEGIN CRITICAL SECTION
 	{
@@ -227,8 +232,9 @@ void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool statu
 	}
 	// END CRITICAL SECTION
 
-	// Release the mutex protecting bli_l3_ind_oper_st.
+	#ifdef BLIS_DISABLE_TLS
 	bli_pthread_mutex_unlock( &oper_st_mutex );
+	#endif
 }
 
 bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt )
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index c1fd4c866..eba602aaa 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -59,10 +59,18 @@ typedef void (*ind_cntx_init_ft)( ind_t method, cntx_t* cntx );
 static cntx_t* cached_cntx_nat = NULL;
 static cntx_t* cached_cntx_ind = NULL;
 
+// A mutex to allow synchronous access to the gks when it needs to be updated
+// with a new entry corresponding to a context for an ind_t value.
+static bli_pthread_mutex_t gks_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
+
 // -----------------------------------------------------------------------------
 
-void bli_gks_init( void )
+int bli_gks_init( void )
 {
+	// NOTE: This function is called once by ONLY ONE application thread per
+	// library init/finalize cycle (see bli_init.c). Thus, a mutex is not
+	// needed to protect the data initialization.
+
 	{
 		// Initialize the internal data structure we use to track registered
 		// contexts.
@@ -261,11 +269,13 @@ void bli_gks_init( void )
 	cached_cntx_nat = ( cntx_t* )bli_gks_query_nat_cntx_noinit();
 	cached_cntx_ind = ( cntx_t* )bli_gks_query_ind_cntx_noinit( BLIS_1M );
 #endif
+
+	return 0;
 }
 
 // -----------------------------------------------------------------------------
 
-void bli_gks_finalize( void )
+int bli_gks_finalize( void )
 {
 	arch_t id;
 	ind_t  ind;
@@ -318,6 +328,8 @@ void bli_gks_finalize( void )
 	cached_cntx_nat = NULL;
 	cached_cntx_ind = NULL;
 #endif
+
+	return 0;
 }
 
 // -----------------------------------------------------------------------------
@@ -613,10 +625,6 @@ const cntx_t* bli_gks_query_ind_cntx_noinit
 
 // -----------------------------------------------------------------------------
 
-// A mutex to allow synchronous access to the gks when it needs to be updated
-// with a new entry corresponding to a context for an ind_t value.
-static bli_pthread_mutex_t gks_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
-
 const cntx_t* bli_gks_query_ind_cntx_impl
      (
        ind_t ind
diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h
index d1c715be1..da2ead083 100644
--- a/frame/base/bli_gks.h
+++ b/frame/base/bli_gks.h
@@ -35,8 +35,8 @@
 #ifndef BLIS_GKS_H
 #define BLIS_GKS_H
 
-void                           bli_gks_init( void );
-void                           bli_gks_finalize( void );
+int                            bli_gks_init( void );
+int                            bli_gks_finalize( void );
 
 void                           bli_gks_init_index( void );
 
diff --git a/frame/base/bli_ind.c b/frame/base/bli_ind.c
index cc2810d51..3436ddbc9 100644
--- a/frame/base/bli_ind.c
+++ b/frame/base/bli_ind.c
@@ -42,8 +42,14 @@ static const char* bli_ind_impl_str[BLIS_NUM_IND_METHODS] =
 
 // -----------------------------------------------------------------------------
 
-void bli_ind_init( void )
+int bli_ind_init( void )
 {
+	// NOTE: If TLS is enabled, this function is called once by EACH application
+	// thread per library init/finalize cycle (see bli_init.c). In this case,
+	// the threads will initialize thread-local data (see bli_l3_ind.c). If TLS
+	// is disabled, this function is called once by ONLY ONE application thread.
+	// In neither case is a mutex needed to protect the data initialization.
+
 	// NOTE: We intentionally call bli_gks_query_nat_cntx_noinit() in order
 	// to avoid the internal call to bli_init_once().
 	const cntx_t* cntx = bli_gks_query_nat_cntx_noinit();
@@ -62,10 +68,13 @@ void bli_ind_init( void )
 
 	if ( c_is_ref && !s_is_ref ) bli_ind_enable_dt( BLIS_1M, BLIS_SCOMPLEX );
 	if ( z_is_ref && !d_is_ref ) bli_ind_enable_dt( BLIS_1M, BLIS_DCOMPLEX );
+
+	return 0;
 }
 
-void bli_ind_finalize( void )
+int bli_ind_finalize( void )
 {
+	return 0;
 }
 
 // -----------------------------------------------------------------------------
diff --git a/frame/base/bli_ind.h b/frame/base/bli_ind.h
index e162c5809..1b40d59ba 100644
--- a/frame/base/bli_ind.h
+++ b/frame/base/bli_ind.h
@@ -38,8 +38,8 @@
 // level-3 induced method management
 #include "bli_l3_ind.h"
 
-void                         bli_ind_init( void );
-void                         bli_ind_finalize( void );
+int                          bli_ind_init( void );
+int                          bli_ind_finalize( void );
 
 BLIS_EXPORT_BLIS void        bli_ind_enable( ind_t method );
 BLIS_EXPORT_BLIS void        bli_ind_disable( ind_t method );
diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c
index f1baa2c21..bf1befb6b 100644
--- a/frame/base/bli_init.c
+++ b/frame/base/bli_init.c
@@ -64,28 +64,34 @@ void bli_finalize_auto( void )
 
 // -----------------------------------------------------------------------------
 
-static bli_pthread_switch_t lib_state = BLIS_PTHREAD_SWITCH_INIT;
-
 void bli_init_once( void )
 {
-	bli_pthread_switch_on( &lib_state, bli_init_apis );
+	bli_init_apis();
 }
 
 void bli_finalize_once( void )
 {
-	bli_pthread_switch_off( &lib_state, bli_finalize_apis );
+	bli_finalize_apis();
 }
 
 // -----------------------------------------------------------------------------
 
+static bli_pthread_switch_t gks_g_state    = BLIS_PTHREAD_SWITCH_INIT;
+static BLIS_THREAD_LOCAL
+       bli_pthread_switch_t ind_l_state    = BLIS_PTHREAD_SWITCH_INIT;
+static bli_pthread_switch_t thread_g_state = BLIS_PTHREAD_SWITCH_INIT;
+static BLIS_THREAD_LOCAL
+       bli_pthread_switch_t rntm_l_state   = BLIS_PTHREAD_SWITCH_INIT;
+static bli_pthread_switch_t memsys_g_state = BLIS_PTHREAD_SWITCH_INIT;
+
 int bli_init_apis( void )
 {
 	// Initialize various sub-APIs.
-	bli_gks_init();
-	bli_ind_init();
-	bli_thread_init();
-	bli_pack_init();
-	bli_memsys_init();
+	bli_pthread_switch_on( &gks_g_state,    bli_gks_init );
+	bli_pthread_switch_on( &ind_l_state,    bli_ind_init );
+	bli_pthread_switch_on( &thread_g_state, bli_thread_init );
+	bli_pthread_switch_on( &rntm_l_state,   bli_rntm_init );
+	bli_pthread_switch_on( &memsys_g_state, bli_memsys_init );
 
 	return 0;
 }
@@ -93,11 +99,11 @@ int bli_init_apis( void )
 int bli_finalize_apis( void )
 {
 	// Finalize various sub-APIs.
-	bli_memsys_finalize();
-	bli_pack_finalize();
-	bli_thread_finalize();
-	bli_ind_finalize();
-	bli_gks_finalize();
+	bli_pthread_switch_off( &memsys_g_state, bli_memsys_finalize );
+	bli_pthread_switch_off( &rntm_l_state,   bli_rntm_finalize );
+	bli_pthread_switch_off( &thread_g_state, bli_thread_finalize );
+	bli_pthread_switch_off( &ind_l_state,    bli_ind_finalize );
+	bli_pthread_switch_off( &gks_g_state,    bli_gks_finalize );
 
 	return 0;
 }
diff --git a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c
index a226b7b85..f6bcfe569 100644
--- a/frame/base/bli_memsys.c
+++ b/frame/base/bli_memsys.c
@@ -36,8 +36,12 @@
 
 #include "blis.h"
 
-void bli_memsys_init( void )
+int bli_memsys_init( void )
 {
+	// NOTE: This function is called once by ONLY ONE application thread per
+	// library init/finalize cycle (see bli_init.c). Thus, a mutex is not
+	// needed to protect the data initialization.
+
 	// Query a native context so we have something to pass into
 	// bli_pba_init_pools().
 	// NOTE: We intentionally call bli_gks_query_nat_cntx_noinit() in order
@@ -49,14 +53,18 @@ void bli_memsys_init( void )
 
 	// Initialize the small block allocator and its data structures.
 	bli_sba_init();
+
+	return 0;
 }
 
-void bli_memsys_finalize( void )
+int bli_memsys_finalize( void )
 {
 	// Finalize the small block allocator and its data structures.
 	bli_sba_finalize();
 
 	// Finalize the packing block allocator and its data structures.
 	bli_pba_finalize();
+
+	return 0;
 }
 
diff --git a/frame/base/bli_memsys.h b/frame/base/bli_memsys.h
index be0d48e35..0f561b318 100644
--- a/frame/base/bli_memsys.h
+++ b/frame/base/bli_memsys.h
@@ -37,10 +37,8 @@
 #ifndef BLIS_MEMSYS_H
 #define BLIS_MEMSYS_H
 
-// -----------------------------------------------------------------------------
-
-void bli_memsys_init( void );
-void bli_memsys_finalize( void );
+int bli_memsys_init( void );
+int bli_memsys_finalize( void );
 
 
 #endif
diff --git a/frame/base/bli_pack.c b/frame/base/bli_pack.c
index c5ce9cc6c..271354bf2 100644
--- a/frame/base/bli_pack.c
+++ b/frame/base/bli_pack.c
@@ -35,26 +35,6 @@
 
 #include "blis.h"
 
-// The global rntm_t structure. (The definition resides in bli_rntm.c.)
-extern rntm_t global_rntm;
-
-// A mutex to allow synchronous access to global_rntm. (The definition
-// resides in bli_rntm.c.)
-extern bli_pthread_mutex_t global_rntm_mutex;
-
-// -----------------------------------------------------------------------------
-
-void bli_pack_init( void )
-{
-	// Read the environment variables and use them to initialize the
-	// global runtime object.
-	bli_pack_init_rntm_from_env( &global_rntm );
-}
-
-void bli_pack_finalize( void )
-{
-}
-
 // -----------------------------------------------------------------------------
 
 void bli_pack_get_pack_a( bool* pack_a )
@@ -62,7 +42,7 @@ void bli_pack_get_pack_a( bool* pack_a )
 	// We must ensure that global_rntm has been initialized.
 	bli_init_once();
 
-	*pack_a = bli_rntm_pack_a( &global_rntm );
+	*pack_a = bli_rntm_pack_a( bli_global_rntm() );
 }
 
 // -----------------------------------------------------------------------------
@@ -72,7 +52,7 @@ void bli_pack_get_pack_b( bool* pack_b )
 	// We must ensure that global_rntm has been initialized.
 	bli_init_once();
 
-	*pack_b = bli_rntm_pack_b( &global_rntm );
+	*pack_b = bli_rntm_pack_b( bli_global_rntm() );
 }
 
 // ----------------------------------------------------------------------------
@@ -82,13 +62,17 @@ void bli_pack_set_pack_a( bool pack_a )
 	// We must ensure that global_rntm has been initialized.
 	bli_init_once();
 
-	// Acquire the mutex protecting global_rntm.
-	bli_pthread_mutex_lock( &global_rntm_mutex );
+	// If TLS is disabled, we need to use a mutex to protect the global rntm_t
+	// since it will be shared with all application threads.
+	#ifdef BLIS_DISABLE_TLS
+	bli_pthread_mutex_lock( bli_global_rntm_mutex() );
+	#endif
 
-	bli_rntm_set_pack_a( pack_a, &global_rntm );
+	bli_rntm_set_pack_a( pack_a, bli_global_rntm() );
 
-	// Release the mutex protecting global_rntm.
-	bli_pthread_mutex_unlock( &global_rntm_mutex );
+	#ifdef BLIS_DISABLE_TLS
+	bli_pthread_mutex_unlock( bli_global_rntm_mutex() );
+	#endif
 }
 
 // ----------------------------------------------------------------------------
@@ -98,60 +82,16 @@ void bli_pack_set_pack_b( bool pack_b )
 	// We must ensure that global_rntm has been initialized.
 	bli_init_once();
 
-	// Acquire the mutex protecting global_rntm.
-	bli_pthread_mutex_lock( &global_rntm_mutex );
-
-	bli_rntm_set_pack_b( pack_b, &global_rntm );
-
-	// Release the mutex protecting global_rntm.
-	bli_pthread_mutex_unlock( &global_rntm_mutex );
-}
-
-// ----------------------------------------------------------------------------
-
-void bli_pack_init_rntm_from_env
-     (
-       rntm_t* rntm
-     )
-{
-	// NOTE: We don't need to acquire the global_rntm_mutex here because this
-	// function is only called from bli_pack_init(), which is only called
-	// by bli_init_once().
-
-	bool pack_a;
-	bool pack_b;
-
-#if 1 //def BLIS_ENABLE_SELECTIVE_PACKING
-
-	// Try to read BLIS_PACK_A and BLIS_PACK_B. For each variable, default to
-	// -1 if it is unset.
-	gint_t pack_a_env = bli_env_get_var( "BLIS_PACK_A", -1 );
-	gint_t pack_b_env = bli_env_get_var( "BLIS_PACK_B", -1 );
-
-	// Enforce the default behavior first, then check for affirmative FALSE, and
-	// finally assume anything else is TRUE.
-	if      ( pack_a_env == -1 ) pack_a = FALSE; // default behavior
-	else if ( pack_a_env ==  0 ) pack_a = FALSE; // zero is FALSE
-	else                         pack_a = TRUE;  // anything else is TRUE
-
-	if      ( pack_b_env == -1 ) pack_b = FALSE; // default behavior
-	else if ( pack_b_env ==  0 ) pack_b = FALSE; // zero is FALSE
-	else                         pack_b = TRUE;  // anything else is TRUE
-
-#else
-
-	pack_a = TRUE;
-	pack_b = TRUE;
-
-#endif
+	// If TLS is disabled, we need to use a mutex to protect the global rntm_t
+	// since it will be shared with all application threads.
+	#ifdef BLIS_DISABLE_TLS
+	bli_pthread_mutex_lock( bli_global_rntm_mutex() );
+	#endif
 
-	// Save the results back in the runtime object.
-	bli_rntm_set_pack_a( pack_a, rntm );
-	bli_rntm_set_pack_b( pack_b, rntm );
+	bli_rntm_set_pack_b( pack_b, bli_global_rntm() );
 
-#if 0
-	printf( "bli_pack_init_rntm_from_env()\n" );
-	bli_rntm_print( rntm );
-#endif
+	#ifdef BLIS_DISABLE_TLS
+	bli_pthread_mutex_unlock( bli_global_rntm_mutex() );
+	#endif
 }
 
diff --git a/frame/base/bli_pack.h b/frame/base/bli_pack.h
index c12740148..9071581d9 100644
--- a/frame/base/bli_pack.h
+++ b/frame/base/bli_pack.h
@@ -35,15 +35,10 @@
 #ifndef BLIS_PACK_H
 #define BLIS_PACK_H
 
-void  bli_pack_init( void );
-void  bli_pack_finalize( void );
-
 BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a );
 BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b );
 BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a );
 BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b );
 
-void  bli_pack_init_rntm_from_env( rntm_t* rntm );
-
 #endif
 
diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c
index 64124c682..abc3caaa5 100644
--- a/frame/base/bli_rntm.c
+++ b/frame/base/bli_rntm.c
@@ -34,27 +34,214 @@
 
 #include "blis.h"
 
-// The global rntm_t structure, which holds the global thread settings
-// along with a few other key parameters.
-rntm_t global_rntm = BLIS_RNTM_INITIALIZER;
+// The global rntm_t structure, which holds the global thread settings along
+// with a few other key parameters, along with a spare copy to capture a
+// snapshot at init-time and a mutex to control access to both structs.
+static BLIS_THREAD_LOCAL
+rntm_t              global_rntm         = BLIS_RNTM_INITIALIZER;
+static BLIS_THREAD_LOCAL
+rntm_t              global_rntm_at_init = BLIS_RNTM_INITIALIZER;
+static BLIS_THREAD_LOCAL
+bli_pthread_mutex_t global_rntm_mutex   = BLIS_PTHREAD_MUTEX_INITIALIZER;
+
+// Private functions to access the above static variables.
+rntm_t* bli_global_rntm( void )         { return &global_rntm; }
+rntm_t* bli_global_rntm_at_init( void ) { return &global_rntm_at_init; }
+bli_pthread_mutex_t*
+        bli_global_rntm_mutex( void )   { return &global_rntm_mutex; }
 
-// A mutex to allow synchronous access to global_rntm.
-bli_pthread_mutex_t global_rntm_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
+// -----------------------------------------------------------------------------
+
+int bli_rntm_init( void )
+{
+	// NOTE: If TLS is enabled, this function is called once by EACH application
+	// thread per library init/finalize cycle (see bli_init.c). In this case,
+	// the threads will initialize thread-local data (see vars above). If TLS
+	// is disabled, this function is called once by ONLY ONE application thread.
+	// In neither case is a mutex needed to protect the data initialization.
+
+	rntm_t* gr   = bli_global_rntm();
+	rntm_t* grai = bli_global_rntm_at_init();
+
+	// Read the threading-related and sup packing-related environment variables
+	// and use them to initialize the global_rntm object.
+	bli_rntm_init_from_env( gr );
+
+	// Copy the contents of the global_rntm object into the global_rntm_at_init
+	// object, which is intended to remain unchanged for the duration of the
+	// current init/finalize cycle.
+	*grai = *gr;
+
+	return 0;
+}
+
+int bli_rntm_finalize( void )
+{
+	return 0;
+}
 
 // -----------------------------------------------------------------------------
 
+void bli_rntm_init_from_env
+     (
+       rntm_t* rntm
+     )
+{
+
+#ifdef BLIS_ENABLE_MULTITHREADING
+
+	timpl_t ti = BLIS_SINGLE;
+
+	// Try to read BLIS_THREAD_IMPL.
+	char* ti_env = bli_env_get_str( "BLIS_THREAD_IMPL" );
+
+	// If BLIS_THREAD_IMPL was not set, try to read BLIS_TI.
+	if ( ti_env == NULL ) ti_env = bli_env_get_str( "BLIS_TI" );
+
+	if ( ti_env != NULL )
+	{
+		// If BLIS_THREAD_IMPL (or BLIS_TI) was set, parse the value. If the
+		// value was anything other than "openmp", "pthreads", or "hpx" (or
+		// reasonable variations thereof), interpret it as a request for
+		// single-threaded execution.
+		if      ( !strncmp( ti_env, "openmp",   6 ) ) ti = BLIS_OPENMP;
+		else if ( !strncmp( ti_env, "omp",      3 ) ) ti = BLIS_OPENMP;
+		else if ( !strncmp( ti_env, "pthreads", 8 ) ) ti = BLIS_POSIX;
+		else if ( !strncmp( ti_env, "pthread",  7 ) ) ti = BLIS_POSIX;
+		else if ( !strncmp( ti_env, "posix",    5 ) ) ti = BLIS_POSIX;
+		else if ( !strncmp( ti_env, "hpx",      3 ) ) ti = BLIS_HPX;
+		else                                          ti = BLIS_SINGLE;
+
+		#ifdef PRINT_IMPL
+		printf( "detected BLIS_THREAD_IMPL=%s.\n",
+		        bli_thread_get_thread_impl_str( ti );
+		#endif
+	}
+	else
+	{
+		// If BLIS_THREAD_IMPL was unset, default to the implementation that
+		// was determined at configure-time.
+		ti = BLIS_SINGLE;
+
+		#ifdef BLIS_ENABLE_OPENMP_AS_DEFAULT
+		ti = BLIS_OPENMP;
+		#endif
+		#ifdef BLIS_ENABLE_PTHREADS_AS_DEFAULT
+		ti = BLIS_POSIX;
+		#endif
+		#ifdef BLIS_ENABLE_HPX_AS_DEFAULT
+		ti = BLIS_HPX;
+		#endif
+
+		#ifdef PRINT_IMPL
+		printf( "BLIS_THREAD_IMPL unset; defaulting to BLIS_THREAD_IMPL=%s.\n",
+		        bli_thread_get_thread_impl_str( ti );
+		#endif
+	}
+
+	// ------------------------------------------------------------------------
+
+	// Try to read BLIS_NUM_THREADS first.
+	dim_t nt = bli_env_get_var( "BLIS_NUM_THREADS", -1 );
+
+	// If BLIS_NUM_THREADS was not set, try to read BLIS_NT.
+	if ( nt == -1 ) nt = bli_env_get_var( "BLIS_NT", -1 );
+
+	// If neither BLIS_NUM_THREADS nor BLIS_NT were set, try OMP_NUM_THREADS.
+	if ( nt == -1 ) nt = bli_env_get_var( "OMP_NUM_THREADS", -1 );
+
+	// ------------------------------------------------------------------------
+
+	// Read the environment variables for the number of threads (ways of
+	// parallelism) for each individual loop.
+	dim_t jc = bli_env_get_var( "BLIS_JC_NT", -1 );
+	dim_t pc = bli_env_get_var( "BLIS_PC_NT", -1 );
+	dim_t ic = bli_env_get_var( "BLIS_IC_NT", -1 );
+	dim_t jr = bli_env_get_var( "BLIS_JR_NT", -1 );
+	dim_t ir = bli_env_get_var( "BLIS_IR_NT", -1 );
+
+	// ------------------------------------------------------------------------
+
+	// Save the results back in the runtime object.
+	bli_rntm_set_thread_impl_only( ti, rntm );
+	bli_rntm_set_num_threads_only( nt, rntm );
+	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
+
+	// ------------------------------------------------------------------------
+
+	// This function, bli_rntm_init_from_env(), is only called when BLIS
+	// is initialized, and so we need to go one step further and process the
+	// rntm's contents into a standard form to ensure, for example, that none of
+	// the ways of parallelism are negative or zero (in case the user queries
+	// them later).
+	bli_rntm_sanitize( rntm );
+
+#else
+
+	// When multithreading is disabled, the global rntm can keep the values it
+	// was assigned at (static) initialization time.
+
+#endif
+
+	//printf( "bli_thread_init_rntm_from_env()\n" ); bli_rntm_print( rntm );
+
+	// ------------------------------------------------------------------------
+
+	bool pack_a;
+	bool pack_b;
+
+#if 1
+
+	// Try to read BLIS_PACK_A and BLIS_PACK_B. For each variable, default to
+	// -1 if it is unset.
+	gint_t pack_a_env = bli_env_get_var( "BLIS_PACK_A", -1 );
+	gint_t pack_b_env = bli_env_get_var( "BLIS_PACK_B", -1 );
+
+	// Enforce the default behavior first, then check for affirmative FALSE, and
+	// finally assume anything else is TRUE.
+	if      ( pack_a_env == -1 ) pack_a = FALSE; // default behavior
+	else if ( pack_a_env ==  0 ) pack_a = FALSE; // zero is FALSE
+	else                         pack_a = TRUE;  // anything else is TRUE
+
+	if      ( pack_b_env == -1 ) pack_b = FALSE; // default behavior
+	else if ( pack_b_env ==  0 ) pack_b = FALSE; // zero is FALSE
+	else                         pack_b = TRUE;  // anything else is TRUE
+
+#else
+
+	pack_a = TRUE;
+	pack_b = TRUE;
+
+#endif
+
+	// Save the results back in the runtime object.
+	bli_rntm_set_pack_a( pack_a, rntm );
+	bli_rntm_set_pack_b( pack_b, rntm );
+
+#if 0
+	printf( "bli_pack_init_rntm_from_env()\n" );
+	bli_rntm_print( rntm );
+#endif
+}
+
+// ----------------------------------------------------------------------------
+
 void bli_rntm_init_from_global( rntm_t* rntm )
 {
 	// We must ensure that global_rntm has been initialized.
 	bli_init_once();
 
-	// Acquire the mutex protecting global_rntm.
-	bli_pthread_mutex_lock( &global_rntm_mutex );
+	// If TLS is disabled, we need to use a mutex to protect the global rntm_t
+	// since it will be shared with all application threads.
+	#ifdef BLIS_DISABLE_TLS
+	bli_pthread_mutex_lock( bli_global_rntm_mutex() );
+	#endif
 
 	*rntm = global_rntm;
 
-	// Release the mutex protecting global_rntm.
-	bli_pthread_mutex_unlock( &global_rntm_mutex );
+	#ifdef BLIS_DISABLE_TLS
+	bli_pthread_mutex_unlock( bli_global_rntm_mutex() );
+	#endif
 }
 
 // -----------------------------------------------------------------------------
diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h
index 882ad1cc3..43e91d505 100644
--- a/frame/base/bli_rntm.h
+++ b/frame/base/bli_rntm.h
@@ -267,7 +267,8 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
           .l3_sup      = TRUE, \
         }  \
 
-BLIS_INLINE void bli_rntm_init( rntm_t* rntm )
+#if 0
+//BLIS_INLINE void bli_rntm_clear( rntm_t* rntm )
 {
 	bli_rntm_clear_thread_impl( rntm );
 
@@ -279,6 +280,7 @@ BLIS_INLINE void bli_rntm_init( rntm_t* rntm )
 	bli_rntm_clear_pack_b( rntm );
 	bli_rntm_clear_l3_sup( rntm );
 }
+#endif
 
 //
 // -- rntm_t total thread calculation ------------------------------------------
@@ -304,6 +306,15 @@ BLIS_INLINE dim_t bli_rntm_calc_num_threads
 // -- Function prototypes ------------------------------------------------------
 //
 
+rntm_t*              bli_global_rntm( void );
+rntm_t*              bli_global_rntm_at_init( void );
+bli_pthread_mutex_t* bli_global_rntm_mutex( void );
+
+int bli_rntm_init( void );
+int bli_rntm_finalize( void );
+
+void bli_rntm_init_from_env( rntm_t* rntm );
+
 BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm );
 
 BLIS_EXPORT_BLIS void bli_rntm_set_num_threads
diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c
index d41f37053..8556cd798 100644
--- a/frame/thread/bli_thread.c
+++ b/frame/thread/bli_thread.c
@@ -35,18 +35,10 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_HPX
-#include "bli_thread_hpx.h"
-#endif
-
+// A global communicator that is hard-coded for single-threaded execution.
 thrcomm_t BLIS_SINGLE_COMM = {};
 
-// The global rntm_t structure. (The definition resides in bli_rntm.c.)
-extern rntm_t global_rntm;
-
-// A mutex to allow synchronous access to global_rntm. (The definition
-// resides in bli_rntm.c.)
-extern bli_pthread_mutex_t global_rntm_mutex;
+// -----------------------------------------------------------------------------
 
 typedef void (*thread_launch_t)
      (
@@ -80,17 +72,22 @@ static thread_launch_t thread_launch_fpa[ BLIS_NUM_THREAD_IMPLS ] =
 
 // -----------------------------------------------------------------------------
 
-void bli_thread_init( void )
+int bli_thread_init( void )
 {
+	// NOTE: This function is called once by ONLY ONE application thread per
+	// library init/finalize cycle (see bli_init.c). Thus, a mutex is not
+	// needed to protect the data initialization.
+
 	bli_thrcomm_init( BLIS_SINGLE, 1, &BLIS_SINGLE_COMM );
 
-	// Read the environment variables and use them to initialize the
-	// global runtime object.
-	bli_thread_init_rntm_from_env( &global_rntm );
+	return 0;
 }
 
-void bli_thread_finalize( void )
+int bli_thread_finalize( void )
 {
+	bli_thrcomm_cleanup( &BLIS_SINGLE_COMM );
+
+	return 0;
 }
 
 // -----------------------------------------------------------------------------
@@ -653,7 +650,7 @@ dim_t bli_thread_get_jc_nt( void )
 	// We must ensure that global_rntm has been initialized.
 	bli_init_once();
 
-	return bli_rntm_jc_ways( &global_rntm );
+	return bli_rntm_jc_ways( bli_global_rntm() );
 }
 
 dim_t bli_thread_get_pc_nt( void )
@@ -661,7 +658,7 @@ dim_t bli_thread_get_pc_nt( void )
 	// We must ensure that global_rntm has been initialized.
 	bli_init_once();
 
-	return bli_rntm_pc_ways( &global_rntm );
+	return bli_rntm_pc_ways( bli_global_rntm() );
 }
 
 dim_t bli_thread_get_ic_nt( void )
@@ -669,7 +666,7 @@ dim_t bli_thread_get_ic_nt( void )
 	// We must ensure that global_rntm has been initialized.
 	bli_init_once();
 
-	return bli_rntm_ic_ways( &global_rntm );
+	return bli_rntm_ic_ways( bli_global_rntm() );
 }
 
 dim_t bli_thread_get_jr_nt( void )
@@ -677,7 +674,7 @@ dim_t bli_thread_get_jr_nt( void )
 	// We must ensure that global_rntm has been initialized.
 	bli_init_once();
 
-	return bli_rntm_jr_ways( &global_rntm );
+	return bli_rntm_jr_ways( bli_global_rntm() );
 }
 
 dim_t bli_thread_get_ir_nt( void )
@@ -685,7 +682,7 @@ dim_t bli_thread_get_ir_nt( void )
 	// We must ensure that global_rntm has been initialized.
 	bli_init_once();
 
-	return bli_rntm_ir_ways( &global_rntm );
+	return bli_rntm_ir_ways( bli_global_rntm() );
 }
 
 dim_t bli_thread_get_num_threads( void )
@@ -693,7 +690,7 @@ dim_t bli_thread_get_num_threads( void )
 	// We must ensure that global_rntm has been initialized.
 	bli_init_once();
 
-	return bli_rntm_num_threads( &global_rntm );
+	return bli_rntm_num_threads( bli_global_rntm() );
 }
 
 timpl_t bli_thread_get_thread_impl( void )
@@ -701,7 +698,7 @@ timpl_t bli_thread_get_thread_impl( void )
 	// We must ensure that global_rntm has been initialized.
 	bli_init_once();
 
-	return bli_rntm_thread_impl( &global_rntm );
+	return bli_rntm_thread_impl( bli_global_rntm() );
 }
 
 static const char* bli_timpl_string[BLIS_NUM_THREAD_IMPLS] =
@@ -726,16 +723,20 @@ void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir )
 
 #ifdef BLIS_ENABLE_MULTITHREADING
 
-	// Acquire the mutex protecting global_rntm.
-	bli_pthread_mutex_lock( &global_rntm_mutex );
+	// If TLS is disabled, we need to use a mutex to protect the global rntm_t
+	// since it will be shared with all application threads.
+	#ifdef BLIS_DISABLE_TLS
+	bli_pthread_mutex_lock( bli_global_rntm_mutex() );
+	#endif
 
-	bli_rntm_set_ways_only( jc, 1, ic, jr, ir, &global_rntm );
+	bli_rntm_set_ways_only( jc, 1, ic, jr, ir, bli_global_rntm() );
 
 	// Ensure that the rntm_t is in a consistent state.
-	bli_rntm_sanitize( &global_rntm );
+	bli_rntm_sanitize( bli_global_rntm() );
 
-	// Release the mutex protecting global_rntm.
-	bli_pthread_mutex_unlock( &global_rntm_mutex );
+	#ifdef BLIS_DISABLE_TLS
+	bli_pthread_mutex_unlock( bli_global_rntm_mutex() );
+	#endif
 
 #else
 
@@ -752,16 +753,20 @@ void bli_thread_set_num_threads( dim_t n_threads )
 
 #ifdef BLIS_ENABLE_MULTITHREADING
 
-	// Acquire the mutex protecting global_rntm.
-	bli_pthread_mutex_lock( &global_rntm_mutex );
+	// If TLS is disabled, we need to use a mutex to protect the global rntm_t
+	// since it will be shared with all application threads.
+	#ifdef BLIS_DISABLE_TLS
+	bli_pthread_mutex_lock( bli_global_rntm_mutex() );
+	#endif
 
-	bli_rntm_set_num_threads_only( n_threads, &global_rntm );
+	bli_rntm_set_num_threads_only( n_threads, bli_global_rntm() );
 
 	// Ensure that the rntm_t is in a consistent state.
-	bli_rntm_sanitize( &global_rntm );
+	bli_rntm_sanitize( bli_global_rntm() );
 
-	// Release the mutex protecting global_rntm.
-	bli_pthread_mutex_unlock( &global_rntm_mutex );
+	#ifdef BLIS_DISABLE_TLS
+	bli_pthread_mutex_unlock( bli_global_rntm_mutex() );
+	#endif
 
 #else
 
@@ -776,123 +781,54 @@ void bli_thread_set_thread_impl( timpl_t ti )
 	// We must ensure that global_rntm has been initialized.
 	bli_init_once();
 
-	// Acquire the mutex protecting global_rntm.
-	bli_pthread_mutex_lock( &global_rntm_mutex );
+	// If TLS is disabled, we need to use a mutex to protect the global rntm_t
+	// since it will be shared with all application threads.
+	#ifdef BLIS_DISABLE_TLS
+	bli_pthread_mutex_lock( bli_global_rntm_mutex() );
+	#endif
 
-	bli_rntm_set_thread_impl_only( ti, &global_rntm );
+	bli_rntm_set_thread_impl_only( ti, bli_global_rntm() );
 
-	// Release the mutex protecting global_rntm.
-	bli_pthread_mutex_unlock( &global_rntm_mutex );
+	#ifdef BLIS_DISABLE_TLS
+	bli_pthread_mutex_unlock( bli_global_rntm_mutex() );
+	#endif
 }
 
-// ----------------------------------------------------------------------------
-
-//#define PRINT_IMPL
-
-void bli_thread_init_rntm_from_env
-     (
-       rntm_t* rntm
-     )
+void bli_thread_reset( void )
 {
-	// NOTE: We don't need to acquire the global_rntm_mutex here because this
-	// function is only called from bli_thread_init(), which is only called
-	// by bli_init_once().
-
-#ifdef BLIS_ENABLE_MULTITHREADING
-
-	timpl_t ti = BLIS_SINGLE;
-
-	// Try to read BLIS_THREAD_IMPL.
-	char* ti_env = bli_env_get_str( "BLIS_THREAD_IMPL" );
-
-	// If BLIS_THREAD_IMPL was not set, try to read BLIS_TI.
-	if ( ti_env == NULL ) ti_env = bli_env_get_str( "BLIS_TI" );
-
-	if ( ti_env != NULL )
-	{
-		// If BLIS_THREAD_IMPL was set, parse the value. If the value was
-		// anything other than a "openmp" or "pthreads" (or reasonable
-		// variations thereof), interpret it as a request for single-threaded
-		// execution.
-		if      ( !strncmp( ti_env, "openmp",   6 ) ) ti = BLIS_OPENMP;
-		else if ( !strncmp( ti_env, "omp",      3 ) ) ti = BLIS_OPENMP;
-		else if ( !strncmp( ti_env, "pthreads", 8 ) ) ti = BLIS_POSIX;
-		else if ( !strncmp( ti_env, "pthread",  7 ) ) ti = BLIS_POSIX;
-		else if ( !strncmp( ti_env, "posix",    5 ) ) ti = BLIS_POSIX;
-		else if ( !strncmp( ti_env, "hpx",      3 ) ) ti = BLIS_HPX;
-		else                                          ti = BLIS_SINGLE;
-
-		#ifdef PRINT_IMPL
-		printf( "detected BLIS_THREAD_IMPL=%s.\n",
-		        bli_thread_get_thread_impl_str( ti );
-		#endif
-	}
-	else
-	{
-		// If BLIS_THREAD_IMPL was unset, default to the implementation that
-		// was determined at configure-time.
-		ti = BLIS_SINGLE;
-
-		#ifdef BLIS_ENABLE_OPENMP_AS_DEFAULT
-		ti = BLIS_OPENMP;
-		#endif
-		#ifdef BLIS_ENABLE_PTHREADS_AS_DEFAULT
-		ti = BLIS_POSIX;
-		#endif
-		#ifdef BLIS_ENABLE_HPX_AS_DEFAULT
-		ti = BLIS_HPX;
-		#endif
-
-		#ifdef PRINT_IMPL
-		printf( "BLIS_THREAD_IMPL unset; defaulting to BLIS_THREAD_IMPL=%s.\n",
-		        bli_thread_get_thread_impl_str( ti );
-		#endif
-	}
-
-	// ------------------------------------------------------------------------
-
-	// Try to read BLIS_NUM_THREADS first.
-	dim_t nt = bli_env_get_var( "BLIS_NUM_THREADS", -1 );
-
-	// If BLIS_NUM_THREADS was not set, try to read BLIS_NT.
-	if ( nt == -1 ) nt = bli_env_get_var( "BLIS_NT", -1 );
-
-	// If neither BLIS_NUM_THREADS nor BLIS_NT were set, try OMP_NUM_THREADS.
-	if ( nt == -1 ) nt = bli_env_get_var( "OMP_NUM_THREADS", -1 );
-
-	// ------------------------------------------------------------------------
-
-	// Read the environment variables for the number of threads (ways of
-	// parallelism) for each individual loop.
-	dim_t jc = bli_env_get_var( "BLIS_JC_NT", -1 );
-	dim_t pc = bli_env_get_var( "BLIS_PC_NT", -1 );
-	dim_t ic = bli_env_get_var( "BLIS_IC_NT", -1 );
-	dim_t jr = bli_env_get_var( "BLIS_JR_NT", -1 );
-	dim_t ir = bli_env_get_var( "BLIS_IR_NT", -1 );
+	// We must ensure that global_rntm_at_init has been initialized.
+	bli_init_once();
 
-	// ------------------------------------------------------------------------
+	// If TLS is disabled, we need to use a mutex to protect the global rntm_t
+	// since it will be shared with all application threads.
+	#ifdef BLIS_DISABLE_TLS
+	bli_pthread_mutex_lock( bli_global_rntm_mutex() );
+	#endif
 
-	// Save the results back in the runtime object.
-	bli_rntm_set_thread_impl_only( ti, rntm );
-	bli_rntm_set_num_threads_only( nt, rntm );
-	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
+	// Overwrite the global rntm_t with the contents of the snapshot we took
+	// at initialization.
 
-	// ------------------------------------------------------------------------
+	rntm_t* src = bli_global_rntm_at_init();
+	rntm_t* dst = bli_global_rntm();
 
-	// This function, bli_thread_init_rntm_from_env(), is only called when BLIS
-	// is initialized, and so we need to go one step further and process the
-	// rntm's contents into a standard form to ensure, for example, that none of
-	// the ways of parallelism are negative or zero (in case the user queries
-	// them later).
-	bli_rntm_sanitize( rntm );
+	timpl_t ti = bli_rntm_thread_impl( src );
+	bool    af = bli_rntm_auto_factor( src );
+	dim_t   nt = bli_rntm_num_threads( src );
 
-#else
+	bli_rntm_set_thread_impl_only( ti, dst );
+	bli_rntm_set_auto_factor_only( af, dst );
+	bli_rntm_set_num_threads_only( nt, dst );
 
-	// When multithreading is disabled, the global rntm can keep the values it
-	// was assigned at (static) initialization time.
+	dim_t   jc = bli_rntm_jc_ways( src );
+	dim_t   pc = bli_rntm_pc_ways( src );
+	dim_t   ic = bli_rntm_ic_ways( src );
+	dim_t   jr = bli_rntm_jr_ways( src );
+	dim_t   ir = bli_rntm_ir_ways( src );
 
-#endif
+	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, dst );
 
-	//printf( "bli_thread_init_rntm_from_env()\n" ); bli_rntm_print( rntm );
+	#ifdef BLIS_DISABLE_TLS
+	bli_pthread_mutex_unlock( bli_global_rntm_mutex() );
+	#endif
 }
 
diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h
index 5002672dc..9a6c3d1b5 100644
--- a/frame/thread/bli_thread.h
+++ b/frame/thread/bli_thread.h
@@ -53,8 +53,8 @@ typedef void (*thread_func_t)( thrcomm_t* gl_comm, dim_t tid, const void* params
 #include "bli_thread_single.h"
 
 // Initialization-related prototypes.
-void bli_thread_init( void );
-void bli_thread_finalize( void );
+int bli_thread_init( void );
+int bli_thread_finalize( void );
 
 // -----------------------------------------------------------------------------
 
@@ -126,8 +126,7 @@ BLIS_EXPORT_BLIS const char* bli_thread_get_thread_impl_str( timpl_t ti );
 BLIS_EXPORT_BLIS void    bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir );
 BLIS_EXPORT_BLIS void    bli_thread_set_num_threads( dim_t value );
 BLIS_EXPORT_BLIS void    bli_thread_set_thread_impl( timpl_t ti );
-
-void                     bli_thread_init_rntm_from_env( rntm_t* rntm );
+BLIS_EXPORT_BLIS void    bli_thread_reset( void );
 
 
 #endif
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index b5948d64c..4caa46bf9 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -2716,8 +2716,7 @@ thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, inv
 	if ( inv_diag == BLIS_NO_INVERT_DIAG ) does_inv_diag = FALSE;
 	else                                   does_inv_diag = TRUE;
 
-	rntm_t rntm;
-	bli_rntm_init( &rntm );
+	rntm_t rntm = BLIS_RNTM_INITIALIZER;
 
 	// Create a control tree node for the packing operation.
 	cntl_t* cntl = bli_packm_cntl_create_node

From c6546c1131b1ddd45ef13f9f2b620ce2e955dbf8 Mon Sep 17 00:00:00 2001
From: John Mather <54645798+jmather-sesi@users.noreply.github.com>
Date: Wed, 20 Sep 2023 13:41:07 -0400
Subject: [PATCH 167/230] Fixed broken link in Multithreading.md. (#774)

Details:
- Replaced 404'd link in docs/Multithreading.md with an archive from
   The Wayback Machine.
- CREDITS file update.
---
 CREDITS                | 3 ++-
 docs/Multithreading.md | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/CREDITS b/CREDITS
index 18af35928..94438e400 100644
--- a/CREDITS
+++ b/CREDITS
@@ -42,6 +42,7 @@ but many others have contributed code, ideas, and feedback, including
   Alexander Grund          @Flamefire
   John Gunnels             @jagunnels                 (IBM, T.J. Watson Research Center)
   Ali Emre Gülcü           @Lephar
+                           @h-vetinari
   Jeff Hammond             @jeffhammond               (Intel)
   Jacob Gorm Hansen        @jacobgorm
   Shivaprashanth H                                    (Global Edge)
@@ -52,6 +53,7 @@ but many others have contributed code, ideas, and feedback, including
   Matthew Honnibal         @honnibal
   Stefan Husmann           @stefanhusmann
   Francisco Igual          @figual                    (Universidad Complutense de Madrid)
+  John Mather              @jmather-sesi              (SideFX Software)
   Madeesh Kannan           @shadeMe
   Tony Kelman              @tkelman
   Lee Killough             @leekillough               (Tactical Computing Labs)
@@ -123,7 +125,6 @@ but many others have contributed code, ideas, and feedback, including
   Meghana Vankadari        @Meghana-vankadari         (AMD)
   Kiran Varaganti          @kvaragan                  (AMD)
   Natalia Vassilieva                                  (Hewlett Packard Enterprise)
-                           @h-vetinari
   Andrew Wildman           @awild82                   (University of Washington)
   Zhang Xianyi             @xianyi                    (Chinese Academy of Sciences)
   Benda Xu                 @heroxbd
diff --git a/docs/Multithreading.md b/docs/Multithreading.md
index 6f2ef49c5..d8f8b13f4 100644
--- a/docs/Multithreading.md
+++ b/docs/Multithreading.md
@@ -101,7 +101,7 @@ The `cores` value is most appropriate for BLIS since we usually want to ignore h
 
 Setting these two variables is often enough. However, it obviously does not offer the level of control that `GOMP_CPU_AFFINITY` does. Sometimes, it takes some experimentation to determine whether a particular mapping is performing as expected. If multithreaded performance on eight cores is only twice what it is observed of single-threaded performance, the affinity mapping may be to blame. But if performance is six or seven times higher than sequential execution, then the mapping you chose is probably working fine.
 
-Unfortunately, the topic of thread-to-core affinity is well beyond the scope of this document. (A web search will uncover many [great resources](http://www.nersc.gov/users/software/programming-models/openmp/process-and-thread-affinity/) discussing the use of [GOMP_CPU_AFFINITY](https://gcc.gnu.org/onlinedocs/libgomp/GOMP_005fCPU_005fAFFINITY.html) and [OMP_PROC_BIND](https://gcc.gnu.org/onlinedocs/libgomp/OMP_005fPROC_005fBIND.html#OMP_005fPROC_005fBIND).) It's up to the user to determine an appropriate affinity mapping, and then choose your preferred method of expressing that mapping to the OpenMP implementation.
+Unfortunately, the topic of thread-to-core affinity is well beyond the scope of this document. (A web search will uncover many [great resources](https://web.archive.org/web/20190130102805/http://www.nersc.gov/users/software/programming-models/openmp/process-and-thread-affinity) discussing the use of [GOMP_CPU_AFFINITY](https://gcc.gnu.org/onlinedocs/libgomp/GOMP_005fCPU_005fAFFINITY.html) and [OMP_PROC_BIND](https://gcc.gnu.org/onlinedocs/libgomp/OMP_005fPROC_005fBIND.html#OMP_005fPROC_005fBIND).) It's up to the user to determine an appropriate affinity mapping, and then choose your preferred method of expressing that mapping to the OpenMP implementation.
 
 
 # Specifying multithreading

From a4a63295b96ed5b32f4df6477d24db07bf431202 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com>
Date: Tue, 26 Sep 2023 17:58:38 -0500
Subject: [PATCH 168/230] Fixes to HPC runtime code path. (#773)

Details:
- Fixed hpx::for_each invocation and replace with hpx::for_loop. The HPX
  runtime was initialized using hpx::start, but the hpx::for_each
  function was being called on a non-hpx runtime (i.e standard BLIS
  runtime - single main thread). To run hpx::for_each on HPX runtime
  correctly, the code now uses hpx::run_as_hpx_thread(func, args...).
- Replaced hpx::for_each with hpx::for_loop, which eliminates use of
  hpx::util::counting_iterator.
- Employ hpx::execution::chunk_size(1) to make sure that a thread
  resides on a particular core.
- Replaced hpx::apply() with updated version hpx::post().
- Initialize tdata->id = 0 in libblis.c to 0, as it is the main thread
  and is needed for writing results to output file.
- By default, if not specified, the HPX runtime uses all N threads/cores
  available in the system. But, if we want to only specify n_threads out
  N threads, we use hpx::execution::experimental::num_cores(n_threads).
---
 CREDITS                         |  1 +
 frame/thread/bli_thread_hpx.cpp | 22 +++++++++++++---------
 testsuite/src/test_libblis.c    |  5 +++--
 3 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/CREDITS b/CREDITS
index 94438e400..f72698f99 100644
--- a/CREDITS
+++ b/CREDITS
@@ -130,6 +130,7 @@ but many others have contributed code, ideas, and feedback, including
   Benda Xu                 @heroxbd
   Guodong Xu               @docularxu                 (Linaro.org)
   RuQing Xu                @xrq-phys                  (The University of Tokyo)
+  Srinivas Yadav           @srinivasyadav18
   Costas Yamin             @cosstas
   Chenhan Yu               @ChenhanYu                 (The University of Texas at Austin)
   Roman Yurchak            @rth                       (Symerio)
diff --git a/frame/thread/bli_thread_hpx.cpp b/frame/thread/bli_thread_hpx.cpp
index f69a0f5d7..847b519dc 100644
--- a/frame/thread/bli_thread_hpx.cpp
+++ b/frame/thread/bli_thread_hpx.cpp
@@ -36,9 +36,10 @@
 
 #ifdef BLIS_ENABLE_HPX
 
-#include <hpx/local/execution.hpp>
-#include <hpx/parallel/algorithms/for_each.hpp>
+#include <hpx/execution.hpp>
 #include <hpx/hpx_start.hpp>
+#include <hpx/parallel/algorithms/for_loop.hpp>
+#include <hpx/runtime_local/run_as_hpx_thread.hpp>
 
 extern "C"
 {
@@ -55,13 +56,16 @@ void bli_thread_launch_hpx
 	// Allocate a global communicator for the root thrinfo_t structures.
 	pool_t*    gl_comm_pool = nullptr;
 	thrcomm_t* gl_comm      = bli_thrcomm_create( ti, gl_comm_pool, n_threads );
-
-	auto irange = hpx::util::counting_shape(n_threads);
-
-	hpx::for_each(hpx::execution::par, hpx::util::begin(irange), hpx::util::end(irange),
-	[&gl_comm, &func, &params](const dim_t tid)
+	hpx::threads::run_as_hpx_thread([&]()
 	{
-		func( gl_comm, tid, params );
+		hpx::execution::experimental::num_cores num_cores_(n_threads);
+		hpx::execution::static_chunk_size chunk_size_(1);
+		hpx::experimental::for_loop(
+		hpx::execution::par.with(num_cores_).with(chunk_size_), 0, n_threads,
+		[&gl_comm, &func, &params](const dim_t tid)
+		{
+			func( gl_comm, tid, params );
+		});
 	});
 
 	// Free the global communicator, because the root thrinfo_t node
@@ -76,7 +80,7 @@ void bli_thread_initialize_hpx( int argc, char** argv )
 
 int bli_thread_finalize_hpx()
 {
-	hpx::apply([]() { hpx::finalize(); });
+	hpx::post([]() { hpx::finalize(); });
 	return hpx::stop();
 }
 
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index 4caa46bf9..4f09d3932 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -135,15 +135,16 @@ void libblis_test_thread_decorator( test_params_t* params, test_ops_t* ops )
 	err_t r_val;
 
 #ifdef BLIS_ENABLE_HPX
+	size_t nt = ( size_t )params->n_app_threads;
 
-	size_t tdata_size = ( size_t )params->n_app_threads *
+	size_t tdata_size = ( size_t )nt *
 	                    ( size_t )sizeof( thread_data_t );
 	thread_data_t* tdata = bli_malloc_user( tdata_size, &r_val );
 
 	tdata->params  = params;
 	tdata->ops     = ops;
 	tdata->nt      = nt;
-	tdata->id      = 1;
+	tdata->id      = 0;
 	tdata->xc      = 0;
 
 	// Walk through all test modules.

From 6f412204004666abac266409a203cb635efbabf3 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 26 Sep 2023 18:00:54 -0500
Subject: [PATCH 169/230] Added 'altra', 'altramax' subconfigs. (#775)

Details:
- Forward-ported 'altra' and 'altramax' subconfigurations from the
  older 'stable' branch lineage [1]. These subconfigs primarily target
  the Ampere Altra and AltraMax (ARM) processors. They also contain
  "QuickStart" directories with information and scripts to help
  use BLIS on these microarchitectures. Thanks to Jeff Diamond and
  Leick Robinson for developing these subconfigs and resources.
- Updated kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c according to
  changes in the 'stable' lineage, mostly related to re-enabling of
  assembly code branches that target general stride IO.

[1] Note that the 'stable' branch is being used to make sure that more
    recent commits do not introduce unreasonable performance
    regressions. As such, the name should be interpreted as shorthand
    for "performance stable," not "API stable."
---
 config/altra/QuickStart/TimeDGEMM.cfile       |  143 +++
 config/altra/QuickStart/blis_build_altra.sh   |   20 +
 .../QuickStart/blis_build_altra_pthreads.sh   |   20 +
 .../QuickStart/blis_build_both_libraries.sh   |   58 +
 .../altra/QuickStart/blis_configure_altra.sh  |   20 +
 .../blis_configure_altra_pthreads.sh          |   20 +
 .../QuickStart/blis_quick_start_altra.txt     |  202 ++++
 .../blis_quick_start_uninstall_altra.sh       |   30 +
 config/altra/QuickStart/blis_setenv.sh        |  195 +++
 config/altra/QuickStart/blis_test.sh          |   23 +
 config/altra/QuickStart/blis_unset_par.sh     |   22 +
 config/altra/bli_cntx_init_altra.c            |   95 ++
 config/altra/bli_family_altra.h               |   55 +
 config/altra/bli_kernel_defs_altra.h          |   48 +
 config/altra/make_defs.mk                     |   90 ++
 config/altramax/QuickStart/TimeDGEMM.cfile    |  143 +++
 .../QuickStart/blis_build_altramax.sh         |   20 +
 .../blis_build_altramax_pthreads.sh           |   20 +
 .../QuickStart/blis_build_both_libraries.sh   |   58 +
 .../QuickStart/blis_configure_altramax.sh     |   20 +
 .../blis_configure_altramax_pthreads.sh       |   20 +
 .../QuickStart/blis_quick_start_altramax.txt  |  200 ++++
 .../blis_quick_start_uninstall_altramax.sh    |   30 +
 config/altramax/QuickStart/blis_setenv.sh     |  183 +++
 config/altramax/QuickStart/blis_test.sh       |   22 +
 config/altramax/QuickStart/blis_unset_par.sh  |   22 +
 config/altramax/bli_cntx_init_altramax.c      |   95 ++
 config/altramax/bli_family_altramax.h         |   48 +
 config/altramax/bli_kernel_defs_altramax.h    |   48 +
 config/altramax/make_defs.mk                  |   90 ++
 config_registry                               |    8 +
 frame/base/bli_arch.c                         |   14 +
 frame/base/bli_gks.c                          |   10 +
 frame/include/bli_arch_config.h               |   23 +-
 frame/include/bli_type_defs.h                 |    4 +
 kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c   | 1045 +++++++++++++----
 36 files changed, 2939 insertions(+), 225 deletions(-)
 create mode 100755 config/altra/QuickStart/TimeDGEMM.cfile
 create mode 100755 config/altra/QuickStart/blis_build_altra.sh
 create mode 100755 config/altra/QuickStart/blis_build_altra_pthreads.sh
 create mode 100755 config/altra/QuickStart/blis_build_both_libraries.sh
 create mode 100755 config/altra/QuickStart/blis_configure_altra.sh
 create mode 100755 config/altra/QuickStart/blis_configure_altra_pthreads.sh
 create mode 100755 config/altra/QuickStart/blis_quick_start_altra.txt
 create mode 100755 config/altra/QuickStart/blis_quick_start_uninstall_altra.sh
 create mode 100755 config/altra/QuickStart/blis_setenv.sh
 create mode 100755 config/altra/QuickStart/blis_test.sh
 create mode 100755 config/altra/QuickStart/blis_unset_par.sh
 create mode 100644 config/altra/bli_cntx_init_altra.c
 create mode 100644 config/altra/bli_family_altra.h
 create mode 100644 config/altra/bli_kernel_defs_altra.h
 create mode 100644 config/altra/make_defs.mk
 create mode 100755 config/altramax/QuickStart/TimeDGEMM.cfile
 create mode 100755 config/altramax/QuickStart/blis_build_altramax.sh
 create mode 100755 config/altramax/QuickStart/blis_build_altramax_pthreads.sh
 create mode 100755 config/altramax/QuickStart/blis_build_both_libraries.sh
 create mode 100755 config/altramax/QuickStart/blis_configure_altramax.sh
 create mode 100755 config/altramax/QuickStart/blis_configure_altramax_pthreads.sh
 create mode 100755 config/altramax/QuickStart/blis_quick_start_altramax.txt
 create mode 100755 config/altramax/QuickStart/blis_quick_start_uninstall_altramax.sh
 create mode 100755 config/altramax/QuickStart/blis_setenv.sh
 create mode 100755 config/altramax/QuickStart/blis_test.sh
 create mode 100755 config/altramax/QuickStart/blis_unset_par.sh
 create mode 100644 config/altramax/bli_cntx_init_altramax.c
 create mode 100644 config/altramax/bli_family_altramax.h
 create mode 100644 config/altramax/bli_kernel_defs_altramax.h
 create mode 100644 config/altramax/make_defs.mk

diff --git a/config/altra/QuickStart/TimeDGEMM.cfile b/config/altra/QuickStart/TimeDGEMM.cfile
new file mode 100755
index 000000000..172edc659
--- /dev/null
+++ b/config/altra/QuickStart/TimeDGEMM.cfile
@@ -0,0 +1,143 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <float.h>
+#include <limits.h>
+#include "blis.h"
+
+/*###################################################
+// To build with openmp:
+// Note: Don't need the -lomp on Linux
+gcc -fopenmp -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH TimeDGEMM.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o time_gemm.x
+
+// To build with pThreads
+source ./enable_blis.sh
+gcc -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH TimeDGEMM.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o time_gemm.x
+
+// To run with QuickStart Macros...
+for N_CORES, S_SOCKETS
+
+blis_set_cores_and_sockets N S; $BLIS_NUMA time_gemm.x
+
+###################################################*/
+
+#include <stdarg.h>  // for Linux stdarg
+
+//###################################################
+// Handy blis functions
+//###################################################
+
+// Returns the real part of m(row, col); 0.0 if m is NULL (presumably also if out of matrix -- confirm bli_getijm)
+double GetReal(obj_t *m, int row, int col)
+  {
+  double im = 0, re = 0; // Imaginary and real components, zero-initialized
+  if (!m) return 0.0;
+    
+  bli_getijm(row, col, m, &re, &im);
+  return re;
+  }
+  
+bool SetReal(obj_t *m, int row, int col, double dVal)
+  { // Writes dVal (imaginary part 0.0) to m(row, col); false if m is NULL, true otherwise.
+  if (!m) return false;
+  bli_setijm(dVal, 0.0, row, col, m);
+  // Whatever bli_setijm reports is not inspected, so true only means m was non-NULL.
+  return true;
+  }
+
+//###################################################
+// The basic meat - a one shot
+//###################################################
+
+bool TimeBlis(long size)
+  {
+  int repeat = 3; // Best Of!
+  double dAlpha = 1.0, dBeta = 0.0; // simplest case!
+  // Times C := alpha*A*B + beta*C on size x size doubles (best of 'repeat'), prints GFLOPS, returns true.
+  //============== Allocate matrices =============
+  // The obj_t headers live on the stack; bli_obj_create() allocates the
+  // buffers, which are released with bli_obj_free() before returning.
+  obj_t alpha, beta, a, b, c;
+  bli_obj_create(BLIS_DOUBLE, 1, 1, 0, 0, &alpha);
+  bli_obj_create(BLIS_DOUBLE, 1, 1, 0, 0, &beta);
+
+  // Full gemm is alpha * A * B + beta * C
+  bli_setsc(dAlpha, 0.0, &alpha); // alpha is one
+  bli_setsc(dBeta, 0.0, &beta); // beta is zero
+  //==============================================
+  printf("Initializing %g GB of Matrices...\n", 8.0 * size * size * 3.0 / 1024.0 / 1024.0 / 1024.0);
+
+  // Matrices are created with row stride 'size' and column stride 1 (row-major).
+  bli_obj_create(BLIS_DOUBLE, size, size, size, 1, &c);
+  bli_obj_create(BLIS_DOUBLE, size, size, size, 1, &a);
+  bli_obj_create(BLIS_DOUBLE, size, size, size, 1, &b);
+
+  // Fill the matrices with random values
+  // (Note: this can be slow)
+  //
+  bli_randm(&c);
+  bli_randm(&a);
+  bli_randm(&b);
+
+  //==============================================
+  // DO the timing, blis style...
+  //==============================================
+
+  double dBestTime = DBL_MAX;
+
+  for (int i = 0; i < repeat; i++)
+    {
+    printf("Performing DGEMM %d of %d\n", i + 1, repeat); fflush(stdout);
+    double dStartTime = bli_clock();
+
+    bli_gemm(&alpha, &a, &b, &beta, &c);
+
+    // Always look at best of N for timing!
+    dBestTime = bli_clock_min_diff( dBestTime, dStartTime );
+    }
+
+  double gflops = ( 2.0 * size * size * size ) / ( dBestTime * 1.0e9 );
+
+  printf("Best DGEMM run completed in %g seconds @ size= \t %ld \t %g \t gigaflops\n",
+         dBestTime, size, gflops); fflush(stdout);
+
+  // Release the buffers so that a size sweep does not accumulate memory.
+  bli_obj_free(&alpha); bli_obj_free(&beta);
+  bli_obj_free(&a); bli_obj_free(&b); bli_obj_free(&c);
+
+  return true;
+  }
+
+
+int main( int argc, char** argv )
+  {
+  // Usage: time_gemm.x <size> [sweep_step]; always returns 0.
+  long size = 0;
+  int sweep_inc = 0; // 0 => time a single size, no sweep
+  printf("Details of parallelism are set by environment variables.\n");
+  printf("Arg1 = size=M=N=K for DGEMM\n"
+         "optional arg2 = size step for sweep.\n");
+  if (argc < 2) return 0; // nothing to time without a size argument
+  // argv[1] is the problem size (M = N = K); argc >= 2 is guaranteed here.
+  size = atol(argv[1]);
+  printf("User set size to %ld\n", size);
+
+  // argv[2] is the optional sweep step. (It must not be read from argv[3],
+  // which is the NULL terminator when exactly two arguments are given.)
+  if (argc > 2) {
+      sweep_inc = atoi(argv[2]);
+      printf("User set sweep size inc to %d\n", sweep_inc);
+      }
+
+  // A non-positive step would never end the countdown, so treat it as "no sweep".
+  if (sweep_inc <= 0) TimeBlis(size);
+  else
+    {
+    for (long i = size; i >= sweep_inc; i -= sweep_inc)
+      TimeBlis(i);
+    }
+
+  return 0;
+  }
diff --git a/config/altra/QuickStart/blis_build_altra.sh b/config/altra/QuickStart/blis_build_altra.sh
new file mode 100755
index 000000000..9208aac37
--- /dev/null
+++ b/config/altra/QuickStart/blis_build_altra.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+echo "#######################################################"
+echo "Building standard OpenMP BLIS..."
+echo "#######################################################"
+. ./blis_setenv.sh quiet              # sourced so the variables it sets (e.g. BLIS_HOME) persist here
+echo "##########################################################"
+echo "Configuring BLIS for Altra using OpenMP for parallelism..."
+echo "##########################################################"
+. ./blis_configure_altra.sh quiet
+echo "Switching to directory $BLIS_HOME"
+pushd "$BLIS_HOME" > /dev/null        # quoted: the path may contain spaces
+make -j
+popd > /dev/null
+if [ "$1" != "notest" ]; then         # pass "notest" as the first argument to skip the tests
+    . ./blis_test.sh quiet
+fi
+. ./blis_setenv.sh                    # sourced again, this time without "quiet"
+echo "##########################################################"
+echo "...done"
+echo "##########################################################"
diff --git a/config/altra/QuickStart/blis_build_altra_pthreads.sh b/config/altra/QuickStart/blis_build_altra_pthreads.sh
new file mode 100755
index 000000000..328525830
--- /dev/null
+++ b/config/altra/QuickStart/blis_build_altra_pthreads.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+echo "#######################################################"
+echo "Building pThreads version of BLIS..."
+echo "#######################################################"
+. ./blis_setenv.sh quiet              # sourced so the variables it sets (e.g. BLIS_HOME) persist here
+echo "##########################################################"
+echo "Configuring BLIS for Altra using pThreads for parallelism..."
+echo "##########################################################"
+. ./blis_configure_altra_pthreads.sh quiet
+echo "Switching to directory $BLIS_HOME"
+pushd "$BLIS_HOME" > /dev/null        # quoted: the path may contain spaces
+make -j
+popd > /dev/null
+if [ "$1" != "notest" ]; then         # pass "notest" as the first argument to skip the tests
+    . ./blis_test.sh quiet
+fi
+. ./blis_setenv.sh                    # sourced again, this time without "quiet"
+echo "##########################################################"
+echo "...done"
+echo "##########################################################"
diff --git a/config/altra/QuickStart/blis_build_both_libraries.sh b/config/altra/QuickStart/blis_build_both_libraries.sh
new file mode 100755
index 000000000..2bcf186f8
--- /dev/null
+++ b/config/altra/QuickStart/blis_build_both_libraries.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+echo "##########################################################"
+echo "Creating both OpenMP and pThread BLIS libraries..."
+echo "##########################################################"
+echo "First, Creating pThread library..."
+echo "##########################################################"
+. ./blis_build_altra_pthreads.sh notest
+
+echo "##########################################################"
+echo "Saving the pThreads build..."
+echo "##########################################################"
+# Temporarily move the pthreads build
+mkdir $BLIS_HOME/.tempinc
+mkdir $BLIS_HOME/.templib
+mv $BLIS_INC/* $BLIS_HOME/.tempinc/
+mv $BLIS_LIB/* $BLIS_HOME/.templib/
+# And rename the pthread versions of the include and library files
+#echo "##########################################################"
+pushd $BLIS_HOME/.tempinc/ > /dev/null
+echo "Renaming pThread-enabled blis.h -> blisP.h"
+mv blis.h blisP.h
+popd > /dev/null
+pushd $BLIS_HOME/.templib/ > /dev/null
+for f in $(ls -1); do
+    destf=${f/blis/blisP}
+    echo "Renaming pThread library $f -> $destf"
+    mv "$f" "$destf"
+
+    # Fix the symbolic links
+    if [[ -L "$destf" ]]; then
+        target=$(readlink $destf)
+        target=${target/blis/blisP}
+        \rm "$destf"
+        ln -s "$target" "$destf"
+    fi
+done
+popd > /dev/null
+echo "##########################################################"
+
+echo "##########################################################"
+echo "Second, Creating OpenMP library..."
+echo "##########################################################"
+. ./blis_build_altra.sh notest
+
+echo "##########################################################"
+echo "Restoring the pThreads build..."
+echo "##########################################################"
+# And move the pthread versions back
+mv $BLIS_HOME/.tempinc/*  $BLIS_INC/
+mv $BLIS_HOME/.templib/* $BLIS_LIB/
+rmdir $BLIS_HOME/.tempinc
+rmdir $BLIS_HOME/.templib
+
+. ./blis_test.sh quiet
+. ./blis_setenv.sh
+echo "##########################################################"
+echo "Done creating BLIS libraries..."
+echo "##########################################################"
diff --git a/config/altra/QuickStart/blis_configure_altra.sh b/config/altra/QuickStart/blis_configure_altra.sh
new file mode 100755
index 000000000..206384eca
--- /dev/null
+++ b/config/altra/QuickStart/blis_configure_altra.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Configures BLIS for Altra with OpenMP threading; pass "quiet" as $1 to suppress the banner.
+if [ "$1" = "quiet" ]; then
+    quiet_confopenmp="quiet"
+else
+    quiet_confopenmp=""
+fi
+
+if [ "$quiet_confopenmp" = "" ]; then
+    echo "##########################################################"
+    echo "Configuring BLIS for Altra using OpenMP for parallelism..."
+    echo "##########################################################"
+fi
+
+. ./blis_setenv.sh $quiet_confopenmp   # deliberately unquoted: empty means "no argument"
+if [ -n "$BLIS_HOME" ] && pushd "$BLIS_HOME" > /dev/null; then   # never run distclean outside BLIS_HOME
+    make distclean
+    ./configure -t openmp --disable-pba-pools altra
+    popd > /dev/null
+fi
diff --git a/config/altra/QuickStart/blis_configure_altra_pthreads.sh b/config/altra/QuickStart/blis_configure_altra_pthreads.sh
new file mode 100755
index 000000000..7293fb664
--- /dev/null
+++ b/config/altra/QuickStart/blis_configure_altra_pthreads.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Configures BLIS for Altra with pthreads threading; pass "quiet" as $1 to suppress the banner.
+if [ "$1" = "quiet" ]; then
+    quiet_confpthreads="quiet"
+else
+    quiet_confpthreads=""
+fi
+
+if [ "$quiet_confpthreads" = "" ]; then
+    echo "##########################################################"
+    echo "Configuring BLIS for Altra using pThreads for parallelism..."
+    echo "##########################################################"
+fi
+
+. ./blis_setenv.sh $quiet_confpthreads   # deliberately unquoted: empty means "no argument"
+if [ -n "$BLIS_HOME" ] && pushd "$BLIS_HOME" > /dev/null; then   # never run distclean outside BLIS_HOME
+    make distclean
+    ./configure -t pthreads --disable-pba-pools altra
+    popd > /dev/null
+fi
diff --git a/config/altra/QuickStart/blis_quick_start_altra.txt b/config/altra/QuickStart/blis_quick_start_altra.txt
new file mode 100755
index 000000000..bf7988144
--- /dev/null
+++ b/config/altra/QuickStart/blis_quick_start_altra.txt
@@ -0,0 +1,202 @@
+Welcome to the Altra Platform!  We've made some scripts to help you build and use blis,
+but feel free to look at them for your own inspiration.
+Note that all the provided scripts must be SOURCED, NOT executed!  This is because they
+set up environment variables needed for the steps below.
+ 
+Using BLIS requires a few steps:
+
+1) Configuring the library
+2) Building the library & validating it
+3) Linking your program with BLIS
+4) Setting the environment parameters for an optimized blis to run your program
+
+Let's briefly touch on these points, and how the scripts provided can help
+But first, let's make sure your configuration is correct...
+
+Open blis_setenv.sh
+In the Platform Specific: section, around line 50 or so, you will see:
+firmware=107
+or
+firmware=108
+
+If your firmware is version 1.08 or greater, make sure this is set to 108, else make sure
+it's set to 107.  Ampere entirely changed the CoreID mappings between these versions.
+
+The Altra Platform updated their firmware to 1.08 in May 2021, so if your firmware was
+updated later than that, odds are good that you have 2.04 or later.
+
+Note: the scripts referenced here modify environment variables, so they must be sourced.
+E.g., with
+    source <script_name>
+or
+    . <script_name>
+
+===================================================
+1) Configuring the library
+2) Building the library & validating it
+===================================================
+
+There are custom configuration options for Altra, but, as a user, your main decision is
+whether you want BLIS to use OpenMP or pthreads for parallelism.  OpenMP is the default
+option, since OpenMP allows thread pinning and thus results in better performance.
+To build with OpenMP use:
+
+. ./blis_build_altra.sh
+
+However, some platforms (like MacOS) cannot use OpenMP at all.  In this case, you want
+to build the pThreads version of BLIS:
+
+. ./blis_build_altra_pthreads.sh
+
+In both cases, it will create libblis.a in $BLIS_HOME/lib/$BLIS_ARCH
+
+Try doing that in the root blis directory, depending on your OS.
+
+LINUX:
+. ./blis_build_altra.sh
+
+MacOS Apple Silicon:
+. ./blis_build_altra_pthreads.sh
+
+----------------------------------------------------------------------------
+HOWEVER, there is a tricky case: If you link BLIS with a program that uses pThreads, you
+MUST use the pthreads version of BLIS, even though it will be slower.  This is because
+there is a bug in which attempting to use both pthreads AND OpenMP will pin all threads to
+a single core and essentially freeze your program.
+
+If this is a possibility, you may want to have both libraries available and switch between
+them for each application.  The script:
+
+. ./blis_build_both_libraries.sh
+
+will build both versions, with the pThreads version being called libblisP.a, and a second
+header blisP.h
+This is a little inconvenient, and we're working on improving the situation in the near
+future.
+----------------------------------------------------------------------------
+
+The build will additionally check the library, but if you would like to check a la carte, do
+
+. ./blis_test.sh
+
+You should see near the bottom:
+check-blastest.sh: All BLAS tests passed!
+check-blistest.sh: All BLIS tests passed!
+
+--------------------------------
+Finally, here's a script that will be important when you are doing testing.
+This performs the important step of unsetting any parameters affecting blis parallelism.
+
+. ./blis_unset_par.sh
+
+===================================================
+3) Building and Linking your program with BLIS
+===================================================
+
+This depends whether you are using the pThreads version of BLIS or the OpenMP version...
+Note this uses the BLIS locations automatically defined when sourcing blis_setenv.sh
+
+. ./blis_setenv.sh
+
+(This will display your environment variable settings, your blis libraries and headers (if
+built), and also unset blis parallelism parameters for safety.)
+
+// BUILDING your app with the OpenMP version of BLIS:
+// Note: Don't need -lomp on Linux
+
+gcc -fopenmp -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH MyFiles.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o MyExe
+
+// To build with pThreads
+gcc -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH MyFiles.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o MyExe
+
+// NOTE: If you used the scripts to build BOTH versions of blis, then use the renamed blis lib:
+gcc -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH MyFiles.c $BLIS_HOME/lib/$BLIS_ARCH/libblisP.a -lpthread -lm -o MyExe
+
+Let's try building a sample program that we've included to test BLIS:
+TimeDGEMM.c
+
+If this is a new terminal session, make sure to:
+. ./blis_setenv.sh
+(there's no harm in running it again.)
+
+Linux:
+gcc -fopenmp -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH TimeDGEMM.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o time_gemm.x
+
+Apple Silicon:
+gcc -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH TimeDGEMM.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o time_gemm.x
+
+But don't try a timed run, yet - there's some runtime setup that needs to be done...
+
+===================================================
+4) Setting the environment parameters for an optimized blis to run your program
+===================================================
+
+The performance of some BLAS libraries is very sensitive to the compiler or the page size.
+BLIS is not sensitive to either of these things, but it IS extremely dependent on pinning
+the right threads to the right cores.  We have scripts to help...
+
+. ./blis_setenv.sh
+
+This not only tells you where blis is, but it also creates shell functions to set
+affinity, threading, and NUMA control for each run.  There is a shell function created
+that you can call to set up how your threads will be pinned and used:
+
+blis_set_cores_and_sockets <ncores> <nsockets>
+
+Specifying the number of sockets is important because BLIS is configured very differently
+for one vs two sockets.
+
+Example:
+# Set up for a run with 128 total cores, half on each of 2 sockets.
+blis_set_cores_and_sockets 128 2
+
+You can also use the following aliases:
+blis_set_cores_1S 80  # Run 80 cores on 1 socket
+blis_set_cores_2S 160 # Run 160 cores across 2 sockets, 80 on each
+
+NOTE that at the moment, for multi-threaded BLIS, we only support an active number of
+threads that is a multiple of 8.
+If you want to test single threaded performance, you can set
+
+export BLIS_NUM_THREADS=1
+
+Launching your executable:
+
+If your application is MyExe, your commands to perform an optimized BLIS run might look
+like this:
+
+blis_set_cores_2S 160
+$BLIS_NUMA MyExe
+
+This will set cpu affinity correctly, set BLIS parallelism correctly, set the NUMA
+mode correctly, and launch your EXE.
+
+---------------------------------------------------
+
+Let's try an example using the executable that you created in section 3, remembering that
+if you're on an Apple Silicon Mac, make sure that you don't use more cores than you have.
+(For example, 8 on an M1 Max.)
+
+Apple Silicon: (No NUMA is needed for Apple Mac)
+
+blis_set_cores_1S 8; ./time_gemm.x 8000
+(in tests, we obtained about 95% of peak with Neon64 - about 366 Gigaflops)
+
+Altra Dual Socket:
+blis_set_cores_2S 160; $BLIS_NUMA ./time_gemm.x 12000
+(in tests, we obtained about 3.2 TF, or 82% of peak)
+
+CONGRATULATIONS!  You're ready to use BLIS!
+
+===================================================
+Performance Note:
+===================================================
+We continue to enhance BLIS performance on the Altra.
+One current issue is that not all variants of triangular operations obtain full
+performance.
+
+For TRSM, best performance is with left triangular operations.
+For TRMM, DUAL SOCKET, best performance is with left triangular operations.
+For TRMM, SINGLE SOCKET, best performance is with right triangular operations.
+
+
diff --git a/config/altra/QuickStart/blis_quick_start_uninstall_altra.sh b/config/altra/QuickStart/blis_quick_start_uninstall_altra.sh
new file mode 100755
index 000000000..8ad437888
--- /dev/null
+++ b/config/altra/QuickStart/blis_quick_start_uninstall_altra.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# This utility will remove all the configuration
+# Specific QuickStart files from the blis directory.
+# This is very useful when switching configurations!
+#
+if [[ -n "$BLIS_HOME" ]]; then
+  echo "REMOVING ALL ALTRA QUICKSTART FILES FROM $BLIS_HOME"
+    
+  rm $BLIS_HOME/blis_build_altra_pthreads.sh
+	rm $BLIS_HOME/blis_build_altra.sh
+	rm $BLIS_HOME/blis_build_both_libraries.sh
+	
+	rm $BLIS_HOME/blis_configure_altra_pthreads.sh
+	rm $BLIS_HOME/blis_configure_altra.sh
+	
+	rm $BLIS_HOME/blis_quick_start_altra.txt
+	rm $BLIS_HOME/blis_setenv.sh
+	
+  rm $BLIS_HOME/blis_unset_par.sh
+  rm $BLIS_HOME/blis_test.sh
+
+	rm $BLIS_HOME/TimeDGEMM.c
+	rm $BLIS_HOME/time_gemm.x
+	
+	rm $BLIS_HOME/blis_quick_start_uninstall_altra.sh
+
+else
+  echo "ONLY USE THIS SCRIPT FROM THE BLIS HOME DIRECTORY!"
+  echo "BLIS_HOME is not set!"
+fi
diff --git a/config/altra/QuickStart/blis_setenv.sh b/config/altra/QuickStart/blis_setenv.sh
new file mode 100755
index 000000000..1f2b28150
--- /dev/null
+++ b/config/altra/QuickStart/blis_setenv.sh
@@ -0,0 +1,195 @@
+#!/bin/bash
+#######################################################################
+# Brought to you by Oracle Labs
+#######################################################################
+# Tested in bash and zsh
+#######################################################################
+# Sets up all the environment variables needed for running blis.
+# For this reason, the script MUST be sourced, NOT executed!
+# Needs to be run from BLIS directory to have a portable definition of
+# BLIS_HOME.  If this setup doesn't work for you, you may hard code
+# the path to BLIS_HOME, but then be careful if you copy or move it!
+#######################################################################
+# This is the top level blis directory - it is recommended to set it to an absolute path
+# Can be overridden by user to be called anywhere, but then less portable
+# export BLIS_HOME=.
+# PORTABLE - Set BLIS_HOME to the blis directory containing this script
+# We need to get the full path to the file in case this is called from another directory
+
+if [ "$1" = "quiet" ]; then
+    quiet_setenv="quiet"
+else
+    quiet_setenv=""
+fi
+
+if [[ -n "$BASH_VERSION" ]] ; then
+    file_path_and_name="$( dirname "${BASH_SOURCE[0]}" )/blis_set_home_dir.sh"
+else
+    file_path_and_name="$( dirname "$0" )/blis_set_home_dir.sh"
+fi
+
+if [ -f "$file_path_and_name" ] ; then
+  . "$file_path_and_name" quiet   # quoted so a BLIS path containing spaces still works
+else
+  echo "ERROR - this file is not being executed from a blis home directory."
+  echo "If you cannot use this script in a home directory, you can hardcode"
+  echo "the absolute location of BLIS_HOME in blis_setenv.sh, but this"
+  echo "is then less portable and more error prone with multiple blis"
+  echo "directories."
+  return 1   # sourced script: report failure without closing the caller's shell
+fi
+
+#######################################################################
+# Platform Specific:
+#######################################################################
+# Important!  Set the firmware number to 107 for firmware version 1.07 or earlier,
+# and 108 for 1.08 or later.  We were unable to test 1.08 at this time.
+#
+firmware=108
+
+qualifier="or later"
+if (( firmware == 107 )); then
+  qualifier="or earlier"
+fi
+
+# Use altra for both single and double socket - this might change
+export BLIS_ARCH="altra"
+export BLIS_LIB=$BLIS_HOME/lib/$BLIS_ARCH
+export BLIS_INC=$BLIS_HOME/include/$BLIS_ARCH
+
+# Verify:
+if [ "$quiet_setenv" = "" ]; then
+  echo "#################################################################"
+  echo "CoreID affinity assumes firmware version on this machine is $firmware $qualifier"
+  echo "BLIS_HOME set to $BLIS_HOME"
+  echo "BLIS_INC set to $BLIS_INC"
+  echo "================================================================="
+  ls -l $BLIS_INC
+  echo "-----------------------------------------------------------------"
+  echo "BLIS_LIB set to $BLIS_LIB"
+  echo "-----------------------------------------------------------------"
+  ls -l $BLIS_LIB
+  echo "#################################################################"
+fi
+
+# Affinity Macros, etc
+export BLIS_NUMA="numactl --localalloc"
+
+# Use with firmware versions 1.07 and earlier.
+
+export BLIS_AFFINITY_2S_1_07="0 40 20 60 4 44 24 64 8 48 28 68 12 52 32 72 2 42 22 62 6 46 26 66 10 50 30 70 14 54 34 74 1 41 21 61 5 45 25 65 9 49 29 69 13 53 33 73 3 43 23 63 7 47 27 67 11 51 31 71 15 55 35 75 16 56 36 76 18 58 38 78 17 57 37 77 19 59 39 79 80 120 100 140 84 124 104 144 88 128 108 148 92 132 112 152 82 122 102 142 86 126 106 146 90 130 110 150 94 134 114 154 81 121 101 141 85 125 105 145 89 129 109 149 93 133 113 153 83 123 103 143 87 127 107 147 91 131 111 151 95 135 115 155 96 136 116 156 98 138 118 158 97 137 117 157 99 139 119 159"
+
+export BLIS_AFFINITY_1S_1_07="0 40 20 60 4 44 24 64 8 48 28 68 12 52 32 72 2 42 22 62 6 46 26 66 10 50 30 70 14 54 34 74 1 41 21 61 5 45 25 65 9 49 29 69 13 53 33 73 3 43 23 63 7 47 27 67 11 51 31 71 15 55 35 75 16 56 36 76 18 58 38 78 17 57 37 77 19 59 39 79"
+
+# Use with firmware versions 1.08+
+# Warning - this has not been tested.
+#
+export BLIS_AFFINITY_2S_1_08="28, 29, 38, 39, 2, 3, 12, 13, 6, 7, 16, 17, 0, 1, 10, 11, 68, 69, 78, 79, 42, 43, 52, 53, 46, 47, 56, 57, 40, 41, 50, 51, 24, 25, 34, 35, 20, 21, 30, 31, 26, 27, 36, 37, 22, 23, 32, 33, 64, 65, 74, 75, 60, 61, 70, 71, 66, 67, 76, 77, 62, 63, 72, 73, 8, 9, 18, 19, 4, 5, 14, 15, 48, 49, 58, 59, 44, 45, 54, 55, 108, 109, 118, 119, 82, 83, 92, 93, 86, 87, 96, 97, 80, 81, 90, 91, 148, 149, 158, 159, 122, 123, 132, 133, 126, 127, 136, 137, 120, 121, 130, 131, 104, 105, 114, 115, 100, 101, 110, 111, 106, 107, 116, 117, 102, 103, 112, 113, 144, 145, 154, 155, 140, 141, 150, 151, 146, 147, 156, 157, 142, 143, 152, 153, 88, 89, 98, 99, 84, 85, 94, 95, 128, 129, 138, 139, 124, 125, 134, 135"
+
+export BLIS_AFFINITY_1S_1_08="28, 29, 38, 39, 2, 3, 12, 13, 6, 7, 16, 17, 0, 1, 10, 11, 68, 69, 78, 79, 42, 43, 52, 53, 46, 47, 56, 57, 40, 41, 50, 51, 24, 25, 34, 35, 20, 21, 30, 31, 26, 27, 36, 37, 22, 23, 32, 33, 64, 65, 74, 75, 60, 61, 70, 71, 66, 67, 76, 77, 62, 63, 72, 73, 8, 9, 18, 19, 4, 5, 14, 15, 48, 49, 58, 59, 44, 45, 54, 55"
+
+# Parallelism on the Altra is very flat:
+
+# Set JC to number of sockets:
+export BLIS_JC_NT=2
+
+# Set JR to groups of 8 (BLIS reads BLIS_JR_NT; "BLIS_HR_NT" was a typo):
+export BLIS_JR_NT=8
+
+# Set IC to the number of cores per socket / 8:
+export BLIS_IC_NT=10
+
+# Experimental:  Allow you to set threading and
+# Core affinity on single or dual sockets for
+# N threads.  Currently, we only support N as
+# a multiple of 8
+
+# Max Altra cores per socket
+CPS=80
+
+# Use Bash Arrays:
+
+# Choose which CoreID mapping to go with based on the firmware ID
+if (($firmware == 107)); then
+arrayCoreIDs=(0 40 20 60 4 44 24 64 8 48 28 68 12 52 32 72 2 42 22 62 6 46 26 66 10 50 30 70 14 54 34 74 1 41 21 61 5 45 25 65 9 49 29 69 13 53 33 73 3 43 23 63 7 47 27 67 11 51 31 71 15 55 35 75 16 56 36 76 18 58 38 78 17 57 37 77 19 59 39 79 80 120 100 140 84 124 104 144 88 128 108 148 92 132 112 152 82 122 102 142 86 126 106 146 90 130 110 150 94 134 114 154 81 121 101 141 85 125 105 145 89 129 109 149 93 133 113 153 83 123 103 143 87 127 107 147 91 131 111 151 95 135 115 155 96 136 116 156 98 138 118 158 97 137 117 157 99 139 119 159)
+elif (($firmware == 108)); then
+arrayCoreIDs=(28 29 38 39 2 3 12 13 6 7 16 17 0 1 10 11 68 69 78 79 42 43 52 53 46 47 56 57 40 41 50 51 24 25 34 35 20 21 30 31 26 27 36 37 22 23 32 33 64 65 74 75 60 61 70 71 66 67 76 77 62 63 72 73 8 9 18 19 4 5 14 15 48 49 58 59 44 45 54 55 108 109 118 119 82 83 92 93 86 87 96 97 80 81 90 91 148 149 158 159 122 123 132 133 126 127 136 137 120 121 130 131 104 105 114 115 100 101 110 111 106 107 116 117 102 103 112 113 144 145 154 155 140 141 150 151 146 147 156 157 142 143 152 153 88 89 98 99 84 85 94 95 128 129 138 139 124 125 134 135)
+else
+  echo "ERROR - UNSUPPORTED FIRMWARE $firmware"
+  return 1   # this file is sourced: "exit" would terminate the user's shell
+fi
+
+# Brief check: @ = list all numbers, loop for i in ${}; do ... done
+# for Array Size, do ${#arr[@]}
+# echo "CoreID array has ${#arrayCoreIDs[@]} elements"
+# echo "CoreID array set to: ${arrayCoreIDs[@]}"
+
+# Give the TOTAL core count:
+# Single socket runs
+blis_set_cores_and_sockets() {
+  # Usage: blis_set_cores_and_sockets <total_cores> <sockets (1 or 2)>
+  # Exports BLIS_JC_NT, BLIS_IC_NT, BLIS_JR_NT and GOMP_CPU_AFFINITY.
+  # Scratch variables are local so they do not leak into the shell that sourced this file.
+  local cores=$1 sockets=$2
+  local cores_per_group core_round_inc groups_per_socket rounded_cores half_cores
+  # echo "Cores = $cores, sockets=$sockets"
+
+  # Round up to a whole number of 8-core groups per socket
+  # (so the total is a multiple of 16 when two sockets are used):
+  cores_per_group=8
+  if (( $sockets == 2 )); then
+    cores_per_group=16
+  fi
+  core_round_inc=$(($cores_per_group-1))
+  groups_per_socket=$(( ($cores + $core_round_inc) / $cores_per_group ))
+  rounded_cores=$(( $groups_per_socket * $cores_per_group ))
+
+  # echo "Rounded Cores = $rounded_cores"
+  # echo "Groups Per Socket = $groups_per_socket"
+
+  # Set the parallelism:
+  # Set JC to number of sockets:
+  export BLIS_JC_NT=$sockets
+
+  # Set JR to groups of 8:
+  export BLIS_JR_NT=8
+
+  # Set IC to the number of cores per socket / 8:
+  export BLIS_IC_NT=$groups_per_socket
+
+  # Using an old version of zsh syntax that's compatible with bash
+  if (( $sockets == 1 )); then
+
+    # Simple single socket case
+    # quotes
+    # export GOMP_CPU_AFFINITY="\"${arrayCoreIDs[@]:0:$rounded_cores}\""
+    # No quotes...
+    export GOMP_CPU_AFFINITY="${arrayCoreIDs[@]:0:$rounded_cores}"
+
+  else
+
+    # Dual socket case: first half from index 0, second half from index $CPS onward
+    half_cores=$(( $rounded_cores / 2 ))
+    # echo "Half cores are $half_cores"
+    # quotes
+    # export GOMP_CPU_AFFINITY="\"${arrayCoreIDs[@]:0:$half_cores} ${arrayCoreIDs[@]:$CPS:$half_cores}\""
+    # No quotes
+    export GOMP_CPU_AFFINITY="${arrayCoreIDs[@]:0:$half_cores} ${arrayCoreIDs[@]:$CPS:$half_cores}"
+  fi
+
+  echo "Activating $rounded_cores cores across $sockets sockets..."
+  echo "GOMP_CPU_AFFINITY set to $GOMP_CPU_AFFINITY"
+  echo "JC/IC/JR = $BLIS_JC_NT/$BLIS_IC_NT/$BLIS_JR_NT"
+}
+	
+# Convenience functions:
+blis_set_cores_1S() { blis_set_cores_and_sockets $1 1 ; }
+blis_set_cores_2S() { blis_set_cores_and_sockets $1 2 ; }
+
+# For safety:
+. ./blis_unset_par.sh
+
+
+
+
+
diff --git a/config/altra/QuickStart/blis_test.sh b/config/altra/QuickStart/blis_test.sh
new file mode 100755
index 000000000..c3a25e1e1
--- /dev/null
+++ b/config/altra/QuickStart/blis_test.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+if [ "$1" = "quiet" ]; then
+    quiet_blistest="quiet"
+else
+    quiet_blistest=""
+fi
+
+# We don't want to quiet this part:
+echo "#################################################################"
+echo "Simple testing of BLIS - use testsuite for more extensive tests."
+echo "#################################################################"
+
+. ./blis_setenv.sh $quiet_blistest
+# It's critical to unset parallelism parameters before
+# running the test code!
+. ./blis_unset_par.sh quiet
+echo "Switching to directory $BLIS_HOME"
+pushd $BLIS_HOME > /dev/null
+make check -j
+popd > /dev/null
+
+
diff --git a/config/altra/QuickStart/blis_unset_par.sh b/config/altra/QuickStart/blis_unset_par.sh
new file mode 100755
index 000000000..6310a6f8e
--- /dev/null
+++ b/config/altra/QuickStart/blis_unset_par.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Meant to be sourced: clears the BLIS/OpenMP threading and affinity variables.
+if [ "$1" = "quiet" ]; then
+    quiet_unsetpar="quiet"
+else
+    quiet_unsetpar=""
+fi
+
+if [ "$quiet_unsetpar" = "" ]; then
+    echo "#########################################################"
+    echo " UNSETTING BLIS ENVIRONMENT VARIABLES THAT SET THREADING"
+    echo " AND AFFINITY."
+    echo "#########################################################"
+fi
+
+unset BLIS_JC_NT
+unset BLIS_JR_NT
+unset BLIS_IC_NT
+unset BLIS_NUM_THREADS
+unset OMP_NUM_THREADS
+unset GOMP_CPU_AFFINITY
+
diff --git a/config/altra/bli_cntx_init_altra.c b/config/altra/bli_cntx_init_altra.c
new file mode 100644
index 000000000..53facbd47
--- /dev/null
+++ b/config/altra/bli_cntx_init_altra.c
@@ -0,0 +1,95 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2022, Oracle Labs, Oracle Corporation
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_cntx_init_altra( cntx_t* cntx )
+{
+	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
+
+	// Set default kernel blocksizes and functions.
+	bli_cntx_init_altra_ref( cntx );
+
+	// -------------------------------------------------------------------------
+
+	// Update the context with optimized native gemm micro-kernels.
+	bli_cntx_set_ukrs
+	(
+	  cntx,
+
+	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_armv8a_asm_8x12,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_armv8a_asm_6x8,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,  FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
+
+	  BLIS_VA_END
+	);
+
+	// Initialize level-3 blocksize objects with architecture-specific values.
+	//                                           s      d      c      z
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     8,     6,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    12,     8,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   192,   120,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   640,   480,    -1,    -1 ); // Changed d to 480 - LDR
+//      bli_blksz_init_easy( &blkszs[ BLIS_NC ],  3072,  6144,    -1,    -1 ); // Doubled NC
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  12288,  8192,    -1,    -1 ); // Increased NC slightly more
+
+	// Update the context with the current architecture's register and cache
+	// blocksizes (and multiples) for native execution.
+	bli_cntx_set_blkszs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
+	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
+	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
+	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
+	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+
+	  BLIS_VA_END
+	);
+}
+
diff --git a/config/altra/bli_family_altra.h b/config/altra/bli_family_altra.h
new file mode 100644
index 000000000..9c7844bd8
--- /dev/null
+++ b/config/altra/bli_family_altra.h
@@ -0,0 +1,55 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_FAMILY_H
+//#define BLIS_FAMILY_H
+
+// Version with 16 byte alignment and jr=8
+
+#define BLIS_THREAD_MAX_JR      8
+
+// -- MEMORY ALLOCATION --------------------------------------------------------
+
+#define BLIS_SIMD_ALIGN_SIZE           16
+
+#define BLIS_FORCE_ROLL_PACKM_REF_KERNEL
+
+// Temporary microtile size for each supported datatype:
+// - s: 8 * 12 * sizeof(float)
+// - d: 6 * 8  * sizeof(double)
+// Thus, 384 bytes should be sufficient.
+#define BLIS_STACK_BUF_MAX_SIZE        384
+
+// Empirical best choices for TRMM
+#define BLIS_DISABLE_TRMM_RIGHT_IF_JC_GT_1_ELSE_DISABLE_LEFT_IF_DP
diff --git a/config/altra/bli_kernel_defs_altra.h b/config/altra/bli_kernel_defs_altra.h
new file mode 100644
index 000000000..815c59399
--- /dev/null
+++ b/config/altra/bli_kernel_defs_altra.h
@@ -0,0 +1,48 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   8
+#define BLIS_MR_d   6
+
+#define BLIS_NR_s   12
+#define BLIS_NR_d   8
+
+//#endif
+
diff --git a/config/altra/make_defs.mk b/config/altra/make_defs.mk
new file mode 100644
index 000000000..ef1e337db
--- /dev/null
+++ b/config/altra/make_defs.mk
@@ -0,0 +1,90 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+
+# Declare the name of the current configuration and add it to the
+# running list of configurations included by common.mk.
+THIS_CONFIG    := altra
+#CONFIGS_INCL   += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    := -D_GNU_SOURCE
+CMISCFLAGS     :=
+CPICFLAGS      :=
+CWARNFLAGS     :=
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O2 -mcpu=neoverse-n1
+endif
+
+# Flags specific to optimized kernels.
+CKOPTFLAGS     := $(COPTFLAGS) -O3 -ftree-vectorize
+ifeq ($(CC_VENDOR),gcc)
+CKVECFLAGS     := -mcpu=neoverse-n1
+else
+ifeq ($(CC_VENDOR),clang)
+CKVECFLAGS     := -mcpu=neoverse-n1
+else
+$(error gcc or clang is required for this configuration.)
+endif
+endif
+
+# Flags specific to reference kernels.
+CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS)
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS)
+else
+CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
diff --git a/config/altramax/QuickStart/TimeDGEMM.cfile b/config/altramax/QuickStart/TimeDGEMM.cfile
new file mode 100755
index 000000000..172edc659
--- /dev/null
+++ b/config/altramax/QuickStart/TimeDGEMM.cfile
@@ -0,0 +1,143 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <float.h>
+#include <limits.h>
+#include "blis.h"
+
+/*###################################################
+// To build with openmp:
+// Note: Don't need the -lomp on Linux
+gcc -fopenmp -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH TimeDGEMM.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o time_gemm.x
+
+// To build with pThreads
+source ./enable_blis.sh
+gcc -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH TimeDGEMM.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o time_gemm.x
+
+// To run with QuickStart Macros...
+for N_CORES, S_SOCKETS
+
+blis_set_cores_and_sockets N S; $BLIS_NUMA time_gemm.x
+
+###################################################*/
+
+#include <stdarg.h>  // for Linux stdarg
+
+//###################################################
+// Handy blis functions
+//###################################################
+
+// Returns 0.0 if out of matrix
+double GetReal(obj_t *m, int row, int col)
+  {
+  double im = 0, re = 0; // Imaginary component
+  if (!m) return 0.0;
+    
+  bli_getijm(row, col, m, &re, &im);
+  return re;
+  }
+  
+bool SetReal(obj_t *m, int row, int col, double dVal)
+  {
+  // Stores dVal (imaginary part 0.0) at (row, col) of m; returns false when m is NULL.
+  if (!m) return false;
+  bli_setijm(dVal, 0.0, row, col, m);
+  return true;
+  }
+
+//###################################################
+// The basic meat - a one shot
+//###################################################
+
+bool TimeBlis(long size)
+  {
+  // Times C := alpha*A*B + beta*C for square size x size double matrices and
+  // prints the best of `repeat` runs. Always returns true.
+  int repeat = 3; // Best Of!
+  double dAlpha = 1.0, dBeta = 0.0; // simplest case!
+
+  //============== Allocate matrices =============
+  // The obj_t headers live on the stack; bli_obj_create() allocates the
+  // element buffers, which are released with bli_obj_free() before returning
+  // so that a sweep over many sizes does not accumulate memory.
+  obj_t alpha, beta, a, b, c;
+
+  bli_obj_create(BLIS_DOUBLE, 1, 1, 0, 0, &alpha);
+  bli_obj_create(BLIS_DOUBLE, 1, 1, 0, 0, &beta);
+
+  // Full gemm is alpha * A * B + beta * C
+  bli_setsc(dAlpha, 0.0, &alpha); // alpha is one
+  bli_setsc(dBeta, 0.0, &beta); // beta is zero
+  //==============================================
+  printf("Initializing %g GB of Matrices...\n", 8.0 * size * size * 3.0 / 1024.0 / 1024.0 / 1024.0);
+
+  // Strides passed below: row stride = size, column stride = 1.
+  bli_obj_create(BLIS_DOUBLE, size, size, size, 1, &c);
+  bli_obj_create(BLIS_DOUBLE, size, size, size, 1, &a);
+  bli_obj_create(BLIS_DOUBLE, size, size, size, 1, &b);
+
+  // Create Random matrices (Note: this can be slow)
+  bli_randm(&c);
+  bli_randm(&a);
+  bli_randm(&b);
+
+  //==============================================
+  // DO the timing, blis style...
+  //==============================================
+  double dBestTime = DBL_MAX;
+
+  for (int i = 0; i < repeat; i++)
+    {
+    printf("Performing DGEMM %d of %d\n", i + 1, repeat); fflush(stdout);
+    double dStartTime = bli_clock();
+
+    bli_gemm(&alpha, &a, &b, &beta, &c);
+
+    // Always look at best of N for timing!
+    dBestTime = bli_clock_min_diff( dBestTime, dStartTime );
+    }
+
+  double gflops = ( 2.0 * size * size * size ) / ( dBestTime * 1.0e9 );
+
+  printf("Best DGEMM run completed in %g seconds @ size= \t %ld \t %g \t gigaflops\n",
+         dBestTime, size, gflops); fflush(stdout);
+
+  bli_obj_free(&a); bli_obj_free(&b); bli_obj_free(&c);
+  bli_obj_free(&alpha); bli_obj_free(&beta);
+
+  return true;
+  }
+
+
+int main( int argc, char** argv )
+  {
+  // Usage: time_gemm.x <size> [sweep_step]
+  // With a positive sweep_step, sizes size, size - step, ... are timed while >= step.
+  long size = 0;
+  int sweep_inc = 0;
+
+  printf("Details of parallelism are set by environment variables.\n");
+  printf("Arg1 = size=M=N=K for DGEMM\n"
+         "optional arg2 = size step for sweep.\n");
+
+  if (argc < 2) return 0;
+  size = atol(argv[1]);
+  printf("User set size to %ld\n", size);
+
+  if (argc > 2) {
+    // The optional step is the second argument (argv[2]); argv[3] may not exist.
+    sweep_inc = atoi(argv[2]);
+    printf("User set sweep size inc to %d\n", sweep_inc);
+    }
+
+  // A non-positive step cannot drive the countdown loop below, so run once.
+  if (sweep_inc <= 0) TimeBlis(size);
+  else
+    {
+    for (long i = size; i >= sweep_inc; i -= sweep_inc)
+      TimeBlis(i);
+    }
+  return 0;
+  }
diff --git a/config/altramax/QuickStart/blis_build_altramax.sh b/config/altramax/QuickStart/blis_build_altramax.sh
new file mode 100755
index 000000000..99a1c3948
--- /dev/null
+++ b/config/altramax/QuickStart/blis_build_altramax.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+echo "#######################################################"
+echo "Building standard OpenMP BLIS..."
+echo "#######################################################"
+. ./blis_setenv.sh quiet
+echo "#############################################################"
+echo "Configuring BLIS for Altramax using OpenMP for parallelism..."
+echo "#############################################################"
+. ./blis_configure_altramax.sh quiet
+echo "Switching to directory $BLIS_HOME"
+pushd $BLIS_HOME > /dev/null
+make -j
+popd > /dev/null
+if [ "$1" != "notest" ]; then
+    . ./blis_test.sh quiet
+fi
+. ./blis_setenv.sh
+echo "##########################################################"
+echo "...done"
+echo "##########################################################"
diff --git a/config/altramax/QuickStart/blis_build_altramax_pthreads.sh b/config/altramax/QuickStart/blis_build_altramax_pthreads.sh
new file mode 100755
index 000000000..052a682f4
--- /dev/null
+++ b/config/altramax/QuickStart/blis_build_altramax_pthreads.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+echo "#######################################################"
+echo "Building pThreads version of BLIS..."
+echo "#######################################################"
+. ./blis_setenv.sh quiet
+echo "###############################################################"
+echo "Configuring BLIS for Altramax using pThreads for parallelism..."
+echo "###############################################################"
+. ./blis_configure_altramax_pthreads.sh quiet
+echo "Switching to directory $BLIS_HOME"
+pushd $BLIS_HOME > /dev/null
+make -j
+popd > /dev/null
+if [ "$1" != "notest" ]; then
+    . ./blis_test.sh quiet
+fi
+. ./blis_setenv.sh
+echo "##########################################################"
+echo "...done"
+echo "##########################################################"
diff --git a/config/altramax/QuickStart/blis_build_both_libraries.sh b/config/altramax/QuickStart/blis_build_both_libraries.sh
new file mode 100755
index 000000000..73f2b9679
--- /dev/null
+++ b/config/altramax/QuickStart/blis_build_both_libraries.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+echo "##########################################################"
+echo "Creating both OpenMP and pThread BLIS libraries..."
+echo "##########################################################"
+echo "First, Creating pThread library..."
+echo "##########################################################"
+. ./blis_build_altramax_pthreads.sh notest
+
+echo "##########################################################"
+echo "Saving the pThreads build..."
+echo "##########################################################"
+# Temporarily move the pthreads build
+mkdir $BLIS_HOME/.tempinc
+mkdir $BLIS_HOME/.templib
+mv $BLIS_INC/* $BLIS_HOME/.tempinc/
+mv $BLIS_LIB/* $BLIS_HOME/.templib/
+# And rename the pthread versions of the include and library files
+#echo "##########################################################"
+pushd $BLIS_HOME/.tempinc/ > /dev/null
+echo "Renaming pThread-enabled blis.h -> blisP.h"
+mv blis.h blisP.h
+popd > /dev/null
+pushd $BLIS_HOME/.templib/ > /dev/null
+for f in $(ls -1); do
+    destf=${f/blis/blisP}
+    echo "Renaming pThread library $f -> $destf"
+    mv "$f" "$destf"
+
+    # Fix the symbolic links
+    if [[ -L "$destf" ]]; then
+        target=$(readlink $destf)
+        target=${target/blis/blisP}
+        \rm "$destf"
+        ln -s "$target" "$destf"
+    fi
+done
+popd > /dev/null
+echo "##########################################################"
+
+echo "##########################################################"
+echo "Second, Creating OpenMP library..."
+echo "##########################################################"
+. ./blis_build_altramax.sh notest
+
+echo "##########################################################"
+echo "Restoring the pThreads build..."
+echo "##########################################################"
+# And move the pthread versions back
+mv $BLIS_HOME/.tempinc/*  $BLIS_INC/
+mv $BLIS_HOME/.templib/* $BLIS_LIB/
+rmdir $BLIS_HOME/.tempinc
+rmdir $BLIS_HOME/.templib
+
+. ./blis_test.sh quiet
+. ./blis_setenv.sh
+echo "##########################################################"
+echo "Done creating BLIS libraries..."
+echo "##########################################################"
diff --git a/config/altramax/QuickStart/blis_configure_altramax.sh b/config/altramax/QuickStart/blis_configure_altramax.sh
new file mode 100755
index 000000000..4cd02c684
--- /dev/null
+++ b/config/altramax/QuickStart/blis_configure_altramax.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Configure BLIS for altramax with OpenMP threading.  Source this file from
+# the BLIS directory; pass "quiet" as the first argument to reduce output.
+quiet_confopenmp=""
+[ "$1" = "quiet" ] && quiet_confopenmp="quiet"
+
+if [ -z "$quiet_confopenmp" ]; then
+    echo "#############################################################"
+    echo "Configuring BLIS for Altramax using OpenMP for parallelism..."
+    echo "#############################################################"
+fi
+
+. ./blis_setenv.sh $quiet_confopenmp
+# Stop here if BLIS_HOME is unusable, so make/configure never run elsewhere.
+{ [ -n "$BLIS_HOME" ] && pushd "$BLIS_HOME" > /dev/null; } || return 1 2>/dev/null || exit 1
+make distclean
+./configure -t openmp --disable-pba-pools altramax
+popd > /dev/null
+
diff --git a/config/altramax/QuickStart/blis_configure_altramax_pthreads.sh b/config/altramax/QuickStart/blis_configure_altramax_pthreads.sh
new file mode 100755
index 000000000..69d9ecc2f
--- /dev/null
+++ b/config/altramax/QuickStart/blis_configure_altramax_pthreads.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Configure BLIS for altramax with pthreads threading.  Source this file from
+# the BLIS directory; pass "quiet" as the first argument to reduce output.
+quiet_confpthreads=""
+[ "$1" = "quiet" ] && quiet_confpthreads="quiet"
+
+if [ -z "$quiet_confpthreads" ]; then
+    echo "###############################################################"
+    echo "Configuring BLIS for Altramax using pThreads for parallelism..."
+    echo "###############################################################"
+fi
+
+. ./blis_setenv.sh $quiet_confpthreads
+# Stop here if BLIS_HOME is unusable, so make/configure never run elsewhere.
+{ [ -n "$BLIS_HOME" ] && pushd "$BLIS_HOME" > /dev/null; } || return 1 2>/dev/null || exit 1
+make distclean
+./configure -t pthreads --disable-pba-pools altramax
+popd > /dev/null
+
diff --git a/config/altramax/QuickStart/blis_quick_start_altramax.txt b/config/altramax/QuickStart/blis_quick_start_altramax.txt
new file mode 100755
index 000000000..efccf28a1
--- /dev/null
+++ b/config/altramax/QuickStart/blis_quick_start_altramax.txt
@@ -0,0 +1,200 @@
+Welcome to the Altramax Platform!  We've made some scripts to help you build and use blis,
+but feel free to look at them for your own inspiration.
+Note that all the provided scripts must be SOURCED, NOT executed!  This is because they
+set up environment variables needed for the steps below.
+ 
+Using BLIS requires a few steps:
+
+1) Configuring the library
+2) Building the library & validating it
+3) Linking your program with BLIS
+4) Setting the environment parameters for an optimized blis to run your program
+
+Let's briefly touch on these points, and how the scripts provided can help.
+But first, let's make sure your configuration is correct...
+
+Open blis_setenv.sh
+In the Platform Specific: section, around line 50 or so, you will see:
+firmware=205
+or
+firmware=204
+
+If your firmware is version 2.05 or greater (most likely), make sure this is set to 205,
+else make sure it's set to 204.  Ampere changed the CoreID mappings between these
+versions around May 2022.
+
+Note: the scripts referenced here modify environment variables, so they must be sourced.
+E.g., with
+    source <script_name>
+or
+    . <script_name>
+
+===================================================
+1) Configuring the library
+2) Building the library & validating it
+===================================================
+
+There are custom configuration options for Altramax, but, as a user, your main decision is
+whether you want BLIS to use OpenMP or pthreads for parallelism.  OpenMP is the default
+option, since OpenMP allows thread pinning and thus results in better performance.
+To build with OpenMP use:
+
+. ./blis_build_altramax.sh
+
+However, some platforms (like MacOS) cannot use OpenMP at all.  In this case, you want
+to build the pThreads version of BLIS:
+
+. ./blis_build_altramax_pthreads.sh
+
+In both cases, it will create libblis.a in $BLIS_HOME/lib/$BLIS_ARCH
+
+Try doing that in the root blis directory, depending on your OS.
+
+LINUX:
+. ./blis_build_altramax.sh
+
+MacOS Apple Silicon:
+. ./blis_build_altramax_pthreads.sh
+
+----------------------------------------------------------------------------
+HOWEVER, there is a tricky case: If you link BLIS with a program that uses pThreads, you
+MUST use the pthreads version of BLIS, even though it will be slower.  This is because
+there is a bug in which attempting to use both pthreads AND OpenMP will pin all threads to
+a single core and essentially freeze your program.
+
+If this is a possibility, you may want to have both libraries available and switch between
+them for each application.  The script:
+
+. ./blis_build_both_libraries.sh
+
+will build both versions, with the pThreads version being called libblisP.a, and a second
+header blisP.h
+This is a little inconvenient, and we're working on improving the situation in the near
+future.
+----------------------------------------------------------------------------
+
+The build will additionally check the library, but if you would like to check a la carte, do
+
+. ./blis_test.sh
+
+You should see near the bottom:
+check-blastest.sh: All BLAS tests passed!
+check-blistest.sh: All BLIS tests passed!
+
+--------------------------------
+Finally, here's a script that will be important when you are doing testing.
+This performs the important step of unsetting any parameters affecting blis parallelism.
+
+. ./blis_unset_par.sh
+
+===================================================
+3) Building and Linking your program with BLIS
+===================================================
+
+This depends whether you are using the pThreads version of BLIS or the OpenMP version...
+Note this uses the BLIS locations automatically defined when sourcing blis_setenv.sh
+
+. ./blis_setenv.sh
+
+(This will display your environment variable settings, your blis libraries and headers (if
+built), and also unset blis parallelism parameters for safety.)
+
+// BUILDING your app with the OpenMP version of BLIS:
+// Note: Don't need -lomp on Linux
+
+gcc -fopenmp -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH MyFiles.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o MyExe
+
+// To build with pThreads
+gcc -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH MyFiles.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o MyExe
+
+// NOTE: If you used the scripts to build BOTH versions of blis, then use the renamed blis lib:
+gcc -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH MyFiles.c $BLIS_HOME/lib/$BLIS_ARCH/libblisP.a -lpthread -lm -o MyExe
+
+Let's try building a sample program that we've included to test BLIS:
+TimeDGEMM.c
+
+If this is a new terminal session, make sure to:
+. ./blis_setenv.sh
+(there's no harm in running it again.)
+
+Linux:
+gcc -fopenmp -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH TimeDGEMM.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o time_gemm.x
+
+Apple Silicon:
+gcc -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH TimeDGEMM.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o time_gemm.x
+
+But don't try a timed run, yet - there's some runtime setup that needs to be done...
+
+===================================================
+4) Setting the environment parameters for an optimized blis to run your program
+===================================================
+
+The performance of some BLAS libraries is very sensitive to the compiler or the page size.
+BLIS is not sensitive to either of these things, but it IS extremely dependent on pinning
+the right threads to the right cores.  We have scripts to help...
+
+. ./blis_setenv.sh
+
+This not only tells you where blis is, but it also creates shell functions to set
+affinity, threading, and NUMA control for each run.  There is a shell function created
+that you can call to set up how your threads will be pinned and used:
+
+blis_set_cores_and_sockets <ncores> <nsockets>
+
+Specifying the number of sockets is important because BLIS is configured very differently
+for one vs two sockets.
+
+Example:
+# Set up for a run with 128 total cores, half on each of 2 sockets.
+blis_set_cores_and_sockets 128 2
+
+You can also use the following aliases:
+blis_set_cores_1S 128  # Run 128 cores on 1 socket
+blis_set_cores_2S 256 # Run 256 cores across 2 sockets, 128 on each
+
+NOTE that at the moment, for multi-threaded BLIS, we only support thread counts that are
+a multiple of 8.
+If you want to test single threaded performance, you can set
+
+export BLIS_NUM_THREADS=1
+
+Launching your executable:
+
+If your application is MyExe, your commands to perform an optimized BLIS run might look
+like this:
+
+blis_set_cores_1S 128
+$BLIS_NUMA MyExe
+
+This will set cpu affinity correctly, set BLIS parallelism correctly, set the NUMA
+mode correctly, and launch your EXE.
+
+---------------------------------------------------
+
+Let's try an example using the executable that you created in section 3, remembering that
+if you're on an Apple Silicon Mac, make sure that you don't use more cores than you have.
+(For example, 8 on an M1 Max.)
+
+Apple Silicon: (No NUMA is needed for Apple platforms.)
+
+blis_set_cores_1S 8; ./time_gemm.x 8000
+(in tests, we obtained about 95% of peak with Neon64 - about 366 Gigaflops)
+
+AltraMax Single Socket:
+blis_set_cores_1S 128; $BLIS_NUMA ./time_gemm.x 12000
+(in tests, we obtained about 2.6 TF, or 85% of peak)
+
+CONGRATULATIONS!  You're ready to use BLIS!
+
+===================================================
+Performance Note:
+===================================================
+We continue to enhance BLIS performance on the Altramax.
+One current issue is that not all variants of triangular operations obtain full
+performance.
+
+For TRSM, best performance is with left triangular operations.
+For TRMM, DUAL SOCKET, best performance is with left triangular operations.
+For TRMM, SINGLE SOCKET, best performance is with right triangular operations.
+
+
diff --git a/config/altramax/QuickStart/blis_quick_start_uninstall_altramax.sh b/config/altramax/QuickStart/blis_quick_start_uninstall_altramax.sh
new file mode 100755
index 000000000..a36be40fd
--- /dev/null
+++ b/config/altramax/QuickStart/blis_quick_start_uninstall_altramax.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# This utility removes all the configuration-specific
+# QuickStart files from the blis directory.
+# This is very useful when switching configurations!
+#
+if [[ -n "$BLIS_HOME" ]]; then
+  echo "REMOVING ALL ALTRAMAX QUICKSTART FILES FROM $BLIS_HOME"
+
+  rm -f "$BLIS_HOME/blis_build_altramax_pthreads.sh"
+  rm -f "$BLIS_HOME/blis_build_altramax.sh"
+  rm -f "$BLIS_HOME/blis_build_both_libraries.sh"
+
+  rm -f "$BLIS_HOME/blis_configure_altramax_pthreads.sh"
+  rm -f "$BLIS_HOME/blis_configure_altramax.sh"
+
+  rm -f "$BLIS_HOME/blis_quick_start_altramax.txt"
+  rm -f "$BLIS_HOME/blis_setenv.sh"
+
+  rm -f "$BLIS_HOME/blis_unset_par.sh"
+  rm -f "$BLIS_HOME/blis_test.sh"
+  # time_gemm.x only exists if the sample was built; -f keeps a missing file quiet.
+  rm -f "$BLIS_HOME/TimeDGEMM.c"
+  rm -f "$BLIS_HOME/time_gemm.x"
+
+  rm -f "$BLIS_HOME/blis_quick_start_uninstall_altramax.sh"
+
+else
+  echo "ONLY USE THIS SCRIPT FROM THE BLIS HOME DIRECTORY!"
+  echo "BLIS_HOME is not set!"
+fi
diff --git a/config/altramax/QuickStart/blis_setenv.sh b/config/altramax/QuickStart/blis_setenv.sh
new file mode 100755
index 000000000..0b2cbbe06
--- /dev/null
+++ b/config/altramax/QuickStart/blis_setenv.sh
@@ -0,0 +1,183 @@
+#!/bin/bash
+#######################################################################
+# Brought to you by Oracle Labs
+#######################################################################
+# Tested in bash and zsh
+#######################################################################
+# Sets up all the environment variables needed for running blis.
+# For this reason, the script MUST be sourced, NOT executed!
+# Needs to be run from BLIS directory to have a portable definition of
+# BLIS_HOME.  If this setup doesn't work for you, you may hard code
+# the path to BLIS_HOME, but then be careful if you copy or move it!
+#######################################################################
+# This is the top level blis directory - it is recommended to set to an absolute path
+# Can be overridden by user to be called anywhere, but then less portable
+# export BLIS_HOME=.
+# PORTABLE - Set BLIS_HOME to the blis directory containing this script
+# We need to get the full path to the file in case this is called from another directory
+
+if [ "$1" = "quiet" ]; then
+    quiet_setenv="quiet"
+else
+    quiet_setenv=""
+fi
+
+if [[ -n "$BASH_VERSION" ]] ; then
+    file_path_and_name="$( dirname "${BASH_SOURCE[0]}" )/blis_set_home_dir.sh"
+else
+    file_path_and_name="$( dirname "$0" )/blis_set_home_dir.sh"
+fi
+
+if [ -f "$file_path_and_name" ] ; then
+  . "$file_path_and_name" quiet   # quoted so a path containing spaces still works
+else
+  echo "ERROR - this file is not being executed from a blis home directory."
+  echo "If you cannot use this script in a home directory, you can hardcode"
+  echo "the absolute location of BLIS_HOME in blis_setenv.sh, but this"
+  echo "is then less portable and more error prone with multiple blis"
+  echo "directories."
+  return 1   # report failure to whoever sourced this file
+fi
+
+#######################################################################
+# Platform Specific:
+# Important!  Set the firmware flag to 204 for 2.04 or earlier,
+# and 205 for 2.05 or later.
+firmware=205
+# Use altramax for both single and double socket - this might change
+export BLIS_ARCH="altramax"
+export BLIS_LIB=$BLIS_HOME/lib/$BLIS_ARCH
+export BLIS_INC=$BLIS_HOME/include/$BLIS_ARCH
+
+
+# Verify:
+if [ "$quiet_setenv" = "" ]; then
+  echo "BLIS_HOME set to $BLIS_HOME"
+  echo "BLIS_INC set to $BLIS_INC"
+  echo "-----------------------------------------------------------------"
+  ls -l $BLIS_INC
+  echo "-----------------------------------------------------------------"
+  echo "BLIS_LIB set to $BLIS_LIB"
+  echo "-----------------------------------------------------------------"
+  ls -l $BLIS_LIB
+  echo "-----------------------------------------------------------------"
+fi
+
+# Affinity Macros, etc
+export BLIS_NUMA="numactl --localalloc"
+
+# Use with firmware versions 2.04 and earlier.
+# You can check the firmware version using dmidecode
+
+export BLIS_AFFINITY_2S_2_04="0 64 32 96 4 68 36 100 1 65 33 97 5 69 37 101 2 66 34 98 6 70 38 102 3 67 35 99 7 71 39 103 8 72 40 104 12 76 44 108 9 73 41 105 13 77 45 109 10 74 42 106 14 78 46 110 11 75 43 107 15 79 47 111 16 80 48 112 20 84 52 116 17 81 49 113 21 85 53 117 18 82 50 114 22 86 54 118 19 83 51 115 23 87 55 119 24 88 56 120 26 90 58 122 25 89 57 121 27 91 59 123 28 92 60 124 30 94 62 126 29 93 61 125 31 95 63 127 128 192 160 224 132 196 164 228 129 193 161 225 133 197 165 229 130 194 162 226 134 198 166 230 131 195 163 227 135 199 167 231 136 200 168 232 140 204 172 236 137 201 169 233 141 205 173 237 138 202 170 234 142 206 174 238 139 203 171 235 143 207 175 239 144 208 176 240 148 212 180 244 145 209 177 241 149 213 181 245 146 210 178 242 150 214 182 246 147 211 179 243 151 215 183 247 152 216 184 248 154 218 186 250 153 217 185 249 155 219 187 251 156 220 188 252 158 222 190 254 157 221 189 253 159 223 191 255"
+
+export BLIS_AFFINITY_1S_2_04="0 64 32 96 4 68 36 100 1 65 33 97 5 69 37 101 2 66 34 98 6 70 38 102 3 67 35 99 7 71 39 103 8 72 40 104 12 76 44 108 9 73 41 105 13 77 45 109 10 74 42 106 14 78 46 110 11 75 43 107 15 79 47 111 16 80 48 112 20 84 52 116 17 81 49 113 21 85 53 117 18 82 50 114 22 86 54 118 19 83 51 115 23 87 55 119 24 88 56 120 26 90 58 122 25 89 57 121 27 91 59 123 28 92 60 124 30 94 62 126 29 93 61 125 31 95 63 127"
+
+# Use with firmware versions 2.05 and later
+# You can check the firmware version using dmidecode
+
+export BLIS_AFFINITY_2S_2_05="0 1 64 65 8 9 72 73 2 3 66 67 10 11 74 75 4 5 68 69 12 13 76 77 6 7 70 71 14 15 78 79 16 17 80 81 24 25 88 89 18 19 82 83 26 27 90 91 20 21 84 85 28 29 92 93 22 23 86 87 30 31 94 95 32 33 96 97 40 41 104 105 34 35 98 99 42 43 106 107 36 37 100 101 44 45 108 109 38 39 102 103 46 47 110 111 48 49 112 113 52 53 116 117 50 51 114 115 54 55 118 119 56 57 120 121 60 61 124 125 58 59 122 123 62 63 126 127 128 129 192 193 136 137 200 201 130 131 194 195 138 139 202 203 132 133 196 197 140 141 204 205 134 135 198 199 142 143 206 207 144 145 208 209 152 153 216 217 146 147 210 211 154 155 218 219 148 149 212 213 156 157 220 221 150 151 214 215 158 159 222 223 160 161 224 225 168 169 232 233 162 163 226 227 170 171 234 235 164 165 228 229 172 173 236 237 166 167 230 231 174 175 238 239 176 177 240 241 180 181 244 245 178 179 242 243 182 183 246 247 184 185 248 249 188 189 252 253 186 187 250 251 190 191 254 255"
+
+export BLIS_AFFINITY_1S_2_05="0 1 64 65 8 9 72 73 2 3 66 67 10 11 74 75 4 5 68 69 12 13 76 77 6 7 70 71 14 15 78 79 16 17 80 81 24 25 88 89 18 19 82 83 26 27 90 91 20 21 84 85 28 29 92 93 22 23 86 87 30 31 94 95 32 33 96 97 40 41 104 105 34 35 98 99 42 43 106 107 36 37 100 101 44 45 108 109 38 39 102 103 46 47 110 111 48 49 112 113 52 53 116 117 50 51 114 115 54 55 118 119 56 57 120 121 60 61 124 125 58 59 122 123 62 63 126 127"
+
+# Parallelism on the Altramax is very flat:
+
+# Set JC to number of sockets:
+export BLIS_JC_NT=2
+
+# Set JR to groups of 8:
+export BLIS_JR_NT=8
+
+# Set IC to the number of cores per socket / 8:
+export BLIS_IC_NT=16
+
+# Experimental:  Allow you to set threading and
+# Core affinity on single or dual sockets for
+# N threads.  Currently, we only support N as
+# a multiple of 8
+
+# Maximum Altramax cores per socket
+CPS=128
+
+# Use Bash Arrays:
+
+
+if (($firmware == 204)); then
+    arrayCoreIDs=(0 64 32 96 4 68 36 100 1 65 33 97 5 69 37 101 2 66 34 98 6 70 38 102 3 67 35 99 7 71 39 103 8 72 40 104 12 76 44 108 9 73 41 105 13 77 45 109 10 74 42 106 14 78 46 110 11 75 43 107 15 79 47 111 16 80 48 112 20 84 52 116 17 81 49 113 21 85 53 117 18 82 50 114 22 86 54 118 19 83 51 115 23 87 55 119 24 88 56 120 26 90 58 122 25 89 57 121 27 91 59 123 28 92 60 124 30 94 62 126 29 93 61 125 31 95 63 127 128 192 160 224 132 196 164 228 129 193 161 225 133 197 165 229 130 194 162 226 134 198 166 230 131 195 163 227 135 199 167 231 136 200 168 232 140 204 172 236 137 201 169 233 141 205 173 237 138 202 170 234 142 206 174 238 139 203 171 235 143 207 175 239 144 208 176 240 148 212 180 244 145 209 177 241 149 213 181 245 146 210 178 242 150 214 182 246 147 211 179 243 151 215 183 247 152 216 184 248 154 218 186 250 153 217 185 249 155 219 187 251 156 220 188 252 158 222 190 254 157 221 189 253 159 223 191 255)
+elif (($firmware == 205)); then
+    arrayCoreIDs=(0 1 64 65 8 9 72 73 2 3 66 67 10 11 74 75 4 5 68 69 12 13 76 77 6 7 70 71 14 15 78 79 16 17 80 81 24 25 88 89 18 19 82 83 26 27 90 91 20 21 84 85 28 29 92 93 22 23 86 87 30 31 94 95 32 33 96 97 40 41 104 105 34 35 98 99 42 43 106 107 36 37 100 101 44 45 108 109 38 39 102 103 46 47 110 111 48 49 112 113 52 53 116 117 50 51 114 115 54 55 118 119 56 57 120 121 60 61 124 125 58 59 122 123 62 63 126 127 128 129 192 193 136 137 200 201 130 131 194 195 138 139 202 203 132 133 196 197 140 141 204 205 134 135 198 199 142 143 206 207 144 145 208 209 152 153 216 217 146 147 210 211 154 155 218 219 148 149 212 213 156 157 220 221 150 151 214 215 158 159 222 223 160 161 224 225 168 169 232 233 162 163 226 227 170 171 234 235 164 165 228 229 172 173 236 237 166 167 230 231 174 175 238 239 176 177 240 241 180 181 244 245 178 179 242 243 182 183 246 247 184 185 248 249 188 189 252 253 186 187 250 251 190 191 254 255)
+else
+  echo "ERROR - UNSUPPORTED FIRMWARE $firmware"
+  return 1  # this file is sourced: 'exit' would terminate the caller's shell
+fi
+
+# Brief check: @ = list all numbers, loop for i in ${}; do ... done
+# for Array Size, do ${#arr[@]}
+# echo "CoreID array has ${#arrayCoreIDs[@]} elements"
+# echo "CoreID array set to: ${arrayCoreIDs[@]}"
+
+# Give the TOTAL core count:
+# Single socket runs
+blis_set_cores_and_sockets() {
+  cores=$1
+  sockets=$2
+  # echo "Cores = $cores, sockets=$sockets"
+  
+	# Round up to nearest 8 cores per socket:
+	cores_per_group=8
+	if (( $sockets == 2 )); then
+	  cores_per_group=16;
+  fi
+  core_round_inc=$(($cores_per_group-1))
+	
+	cores_per_socket=$(($cores))
+	cores=$(($cores + $core_round_inc))
+	groups_per_socket=$(($cores / $cores_per_group))
+	rounded_cores=$(( $groups_per_socket * $cores_per_group ))
+
+	# echo "Rounded Cores = $rounded_cores"
+	# echo "Groups Per Socket = $groups_per_socket"
+	
+	# set the parallelism for one socket with N cores:
+  # Set JC to number of sockets:
+  export BLIS_JC_NT=$sockets
+
+  # Set JR to groups of 8:
+  export BLIS_JR_NT=8
+
+  # Set IC to the number of cores per socket / 8:
+  export BLIS_IC_NT=$groups_per_socket
+
+  # Using an old version of zsh syntax that's compatible with bash
+  
+  if (( $sockets == 1 )); then
+  
+    # Simple single socket case
+    # quotes
+    # export GOMP_CPU_AFFINITY="\"${arrayCoreIDs[@]:0:$rounded_cores}\""
+    # No quotes...
+    export GOMP_CPU_AFFINITY="${arrayCoreIDs[@]:0:$rounded_cores}"
+    
+	else
+	
+    # Dual socket case
+	  half_cores=$(( $rounded_cores / 2 ))
+    # echo "Half cores are $half_cores"
+    # quotes
+    # export GOMP_CPU_AFFINITY="\"${arrayCoreIDs[@]:0:$half_cores} ${arrayCoreIDs[@]:$CPS:$half_cores}\""
+    # No quotes
+    export GOMP_CPU_AFFINITY="${arrayCoreIDs[@]:0:$half_cores} ${arrayCoreIDs[@]:$CPS:$half_cores}"
+	fi
+
+  echo "Activating $rounded_cores cores across $sockets sockets..."
+  echo "GOMP_CPU_AFFINITY set to $GOMP_CPU_AFFINITY"
+  echo "JC/IC/JR = $BLIS_JC_NT/$BLIS_IC_NT/$BLIS_JR_NT"
+	}
+	
+# Convenience wrappers: $1 is the total core count; the socket count is fixed at 1 or 2.
+blis_set_cores_1S() { blis_set_cores_and_sockets $1 1 ; }
+blis_set_cores_2S() { blis_set_cores_and_sockets $1 2 ; }
+
+# For safety:
+. ./blis_unset_par.sh
+
diff --git a/config/altramax/QuickStart/blis_test.sh b/config/altramax/QuickStart/blis_test.sh
new file mode 100755
index 000000000..b6153ea60
--- /dev/null
+++ b/config/altramax/QuickStart/blis_test.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+if [ "$1" = "quiet" ]; then
+    quiet_blistest="quiet"
+else
+    quiet_blistest=""
+fi
+
+# We don't want to quiet this part:
+echo "#################################################################"
+echo "Simple testing of BLIS - use testsuite for more extensive tests."
+echo "#################################################################"
+
+. ./blis_setenv.sh $quiet_blistest
+# It's critical to unset parallelism parameters before
+# running the test code!
+. ./blis_unset_par.sh quiet
+echo "Switching to directory $BLIS_HOME"
+pushd $BLIS_HOME > /dev/null
+make check -j
+popd > /dev/null
+
diff --git a/config/altramax/QuickStart/blis_unset_par.sh b/config/altramax/QuickStart/blis_unset_par.sh
new file mode 100755
index 000000000..6310a6f8e
--- /dev/null
+++ b/config/altramax/QuickStart/blis_unset_par.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Clear the environment variables that control BLIS/OpenMP threading and
+# affinity.  Must be sourced so the unset takes effect in the calling shell;
+# pass "quiet" as the first argument to skip the banner.
+quiet_unsetpar=""
+[ "$1" = "quiet" ] && quiet_unsetpar="quiet"
+
+if [ -z "$quiet_unsetpar" ]; then
+    echo "#########################################################"
+    echo " UNSETTING BLIS ENVIRONMENT VARIABLES THAT SET THREADING"
+    echo " AND AFFINITY."
+    echo "#########################################################"
+fi
+
+unset BLIS_JC_NT
+unset BLIS_JR_NT
+unset BLIS_IC_NT
+unset BLIS_NUM_THREADS
+unset OMP_NUM_THREADS
+unset GOMP_CPU_AFFINITY
+
diff --git a/config/altramax/bli_cntx_init_altramax.c b/config/altramax/bli_cntx_init_altramax.c
new file mode 100644
index 000000000..121946874
--- /dev/null
+++ b/config/altramax/bli_cntx_init_altramax.c
@@ -0,0 +1,95 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2022, Oracle Labs, Oracle Corporation
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_cntx_init_altramax( cntx_t* cntx )
+{
+	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
+
+	// Start from the defaults installed by bli_cntx_init_altramax_ref(); the calls below update them.
+	bli_cntx_init_altramax_ref( cntx );
+
+	// -------------------------------------------------------------------------
+
+	// Register the armv8a assembly gemm micro-kernels: 8x12 for float, 6x8 for double.
+	bli_cntx_set_ukrs
+	(
+	  cntx,
+
+	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_armv8a_asm_8x12,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_armv8a_asm_6x8,
+
+	  BLIS_VA_END
+	);
+
+	// Set the row-storage preference of both gemm micro-kernels to FALSE.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT,  FALSE,
+	  BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
+
+	  BLIS_VA_END
+	);
+
+	// Initialize level-3 blocksize objects; c/z are passed as -1 (presumably "keep default" - TODO confirm).
+	//                                           s      d      c      z
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     8,     6,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    12,     8,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   192,   120,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   640,   480,    -1,    -1 ); // Changed d to 480 - LDR
+//      bli_blksz_init_easy( &blkszs[ BLIS_NC ],  3072,  6144,    -1,    -1 ); // Doubled NC
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  12288,  8192,    -1,    -1 ); // Increased NC slightly more
+
+	// Update the context with the current architecture's register and cache
+	// blocksizes (and multiples) for native execution.
+	bli_cntx_set_blkszs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
+	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
+	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
+	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
+	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+
+	  BLIS_VA_END
+	);
+}
+
diff --git a/config/altramax/bli_family_altramax.h b/config/altramax/bli_family_altramax.h
new file mode 100644
index 000000000..2594ed73a
--- /dev/null
+++ b/config/altramax/bli_family_altramax.h
@@ -0,0 +1,48 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_FAMILY_H
+//#define BLIS_FAMILY_H
+
+// Version with 16 byte alignment and jr=8
+
+#define BLIS_THREAD_MAX_JR      8
+
+// -- MEMORY ALLOCATION --------------------------------------------------------
+
+#define BLIS_SIMD_ALIGN_SIZE           16
+
+#define BLIS_FORCE_ROLL_PACKM_REF_KERNEL
+#define BLIS_DISABLE_TRMM_RIGHT_IF_JC_GT_1_ELSE_DISABLE_LEFT_IF_DP
+
diff --git a/config/altramax/bli_kernel_defs_altramax.h b/config/altramax/bli_kernel_defs_altramax.h
new file mode 100644
index 000000000..815c59399
--- /dev/null
+++ b/config/altramax/bli_kernel_defs_altramax.h
@@ -0,0 +1,48 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+
+#define BLIS_MR_s   8
+#define BLIS_MR_d   6
+
+#define BLIS_NR_s   12
+#define BLIS_NR_d   8
+
+//#endif
+
diff --git a/config/altramax/make_defs.mk b/config/altramax/make_defs.mk
new file mode 100644
index 000000000..35bd7de48
--- /dev/null
+++ b/config/altramax/make_defs.mk
@@ -0,0 +1,90 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+
+# Declare the name of the current configuration. The line that would add it
+# to the running list of configurations (CONFIGS_INCL) is left commented out.
+THIS_CONFIG    := altramax
+#CONFIGS_INCL   += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# NOTE: The build system will append various general-purpose,
+# configuration-agnostic flags to these variables in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    := -D_GNU_SOURCE
+CMISCFLAGS     :=
+CPICFLAGS      :=
+CWARNFLAGS     :=
+# Request debug symbols (-g) for every DEBUG_TYPE other than "off".
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+# "noopt" builds use -O0; all other builds use -O2 with -mcpu=neoverse-n1.
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O2 -mcpu=neoverse-n1
+endif
+
+# Flags specific to optimized kernels (only gcc and clang are accepted).
+CKOPTFLAGS     := $(COPTFLAGS) -O3 -ftree-vectorize
+ifeq ($(CC_VENDOR),gcc)
+CKVECFLAGS     := -mcpu=neoverse-n1
+else
+ifeq ($(CC_VENDOR),clang)
+CKVECFLAGS     := -mcpu=neoverse-n1
+else
+$(error gcc or clang is required for this configuration.)
+endif
+endif
+
+# Flags specific to reference kernels (same as the optimized-kernel flags).
+CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS)
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS)
+else
+CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
+
+# Store all of the variables defined here in new variables whose names
+# contain the configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
diff --git a/config_registry b/config_registry
index 2138ba515..44bb069c9 100644
--- a/config_registry
+++ b/config_registry
@@ -35,10 +35,18 @@ bulldozer:   bulldozer
 # ARM architectures.
 armsve:      armsve/armsve
 a64fx:       a64fx/armsve
+
+# ARM Neon64 (4 pipes x 128b) architectures.
+altramax:    altramax/armv8a
+altra:       altra/armv8a
 firestorm:   firestorm/armv8a
+
+# ARM (2 pipes x 128b) architectures.
 thunderx2:   thunderx2/armv8a
 cortexa57:   cortexa57/armv8a
 cortexa53:   cortexa53/armv8a
+
+# ARM 32-bit (vintage) architectures.
 cortexa15:   cortexa15/armv7a
 cortexa9:    cortexa9/armv7a
 
diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c
index 5fef62ce1..111b27e20 100644
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -230,9 +230,17 @@ arch_t bli_arch_query_id_impl( void )
 		#ifdef BLIS_FAMILY_A64FX
 		id = BLIS_ARCH_A64FX;
 		#endif
+
+		#ifdef BLIS_FAMILY_ALTRAMAX
+		id = BLIS_ARCH_ALTRAMAX;
+		#endif
+		#ifdef BLIS_FAMILY_ALTRA
+		id = BLIS_ARCH_ALTRA;
+		#endif
 		#ifdef BLIS_FAMILY_FIRESTORM
 		id = BLIS_ARCH_FIRESTORM;
 		#endif
+
 		#ifdef BLIS_FAMILY_THUNDERX2
 		id = BLIS_ARCH_THUNDERX2;
 		#endif
@@ -242,6 +250,7 @@ arch_t bli_arch_query_id_impl( void )
 		#ifdef BLIS_FAMILY_CORTEXA53
 		id = BLIS_ARCH_CORTEXA53;
 		#endif
+
 		#ifdef BLIS_FAMILY_CORTEXA15
 		id = BLIS_ARCH_CORTEXA15;
 		#endif
@@ -320,10 +329,15 @@ static const char* config_name[ BLIS_NUM_ARCHS ] =
 
     "armsve",
     "a64fx",
+
+	"altramax",
+	"altra",
     "firestorm",
+
     "thunderx2",
     "cortexa57",
     "cortexa53",
+
     "cortexa15",
     "cortexa9",
 
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index eba602aaa..7b9ab3d7c 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -165,6 +165,16 @@ int bli_gks_init( void )
 #endif
 
 		// -- ARM-NEON (4 pipes x 128-bit vectors) --
+#ifdef BLIS_CONFIG_ALTRAMAX
+		bli_gks_register_cntx( BLIS_ARCH_ALTRAMAX,    bli_cntx_init_altramax,
+		                                              bli_cntx_init_altramax_ref,
+		                                              bli_cntx_init_altramax_ind );
+#endif
+#ifdef BLIS_CONFIG_ALTRA
+		bli_gks_register_cntx( BLIS_ARCH_ALTRA,       bli_cntx_init_altra,
+		                                              bli_cntx_init_altra_ref,
+		                                              bli_cntx_init_altra_ind );
+#endif
 #ifdef BLIS_CONFIG_FIRESTORM
 		bli_gks_register_cntx( BLIS_ARCH_FIRESTORM,   bli_cntx_init_firestorm,
 		                                              bli_cntx_init_firestorm_ref,
diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h
index c80e8e922..f8e18c5c1 100644
--- a/frame/include/bli_arch_config.h
+++ b/frame/include/bli_arch_config.h
@@ -88,15 +88,24 @@ CNTX_INIT_PROTS( bulldozer )
 
 // -- ARM architectures --
 
+// ARM-SVE
 #ifdef BLIS_CONFIG_ARMSVE
 CNTX_INIT_PROTS( armsve )
 #endif
 #ifdef BLIS_CONFIG_A64FX
 CNTX_INIT_PROTS( a64fx )
 #endif
+// ARM-NEON (4x128)
+#ifdef BLIS_CONFIG_ALTRAMAX
+CNTX_INIT_PROTS( altramax )
+#endif
+#ifdef BLIS_CONFIG_ALTRA
+CNTX_INIT_PROTS( altra )
+#endif
 #ifdef BLIS_CONFIG_FIRESTORM
 CNTX_INIT_PROTS( firestorm )
 #endif
+// ARM-NEON (2x128)
 #ifdef BLIS_CONFIG_THUNDERX2
 CNTX_INIT_PROTS( thunderx2 )
 #endif
@@ -106,6 +115,7 @@ CNTX_INIT_PROTS( cortexa57 )
 #ifdef BLIS_CONFIG_CORTEXA53
 CNTX_INIT_PROTS( cortexa53 )
 #endif
+// ARM 32-bit (vintage)
 #ifdef BLIS_CONFIG_CORTEXA15
 CNTX_INIT_PROTS( cortexa15 )
 #endif
@@ -146,7 +156,6 @@ CNTX_INIT_PROTS( rv32iv )
 CNTX_INIT_PROTS( rv64iv )
 #endif
 
-
 // -- Generic --
 
 #ifdef BLIS_CONFIG_GENERIC
@@ -228,15 +237,24 @@ CNTX_INIT_PROTS( generic )
 
 // -- ARM architectures --
 
+// ARM-SVE
 #ifdef BLIS_FAMILY_ARMSVE
 #include "bli_family_armsve.h"
 #endif
 #ifdef BLIS_FAMILY_A64FX
 #include "bli_family_a64fx.h"
 #endif
+// ARM-NEON (4x128)
+#ifdef BLIS_FAMILY_ALTRAMAX
+#include "bli_family_altramax.h"
+#endif
+#ifdef BLIS_FAMILY_ALTRA
+#include "bli_family_altra.h"
+#endif
 #ifdef BLIS_FAMILY_FIRESTORM
 #include "bli_family_firestorm.h"
 #endif
+// ARM-NEON (2x128)
 #ifdef BLIS_FAMILY_THUNDERX2
 #include "bli_family_thunderx2.h"
 #endif
@@ -246,6 +264,7 @@ CNTX_INIT_PROTS( generic )
 #ifdef BLIS_FAMILY_CORTEXA53
 #include "bli_family_cortexa53.h"
 #endif
+// ARM 32-bit (vintage)
 #ifdef BLIS_FAMILY_CORTEXA15
 #include "bli_family_cortexa15.h"
 #endif
@@ -358,6 +377,7 @@ CNTX_INIT_PROTS( generic )
 #include "bli_kernels_bgq.h"
 #endif
 
+// -- RISC-V --
 
 #ifdef BLIS_KERNELS_RVI
 #include "bli_kernels_rvi.h"
@@ -366,5 +386,6 @@ CNTX_INIT_PROTS( generic )
 #include "bli_kernels_rviv.h"
 #endif
 
+
 #endif
 
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index b246fda05..60c55a5ed 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -950,12 +950,16 @@ typedef enum
 	BLIS_ARCH_A64FX,
 
 	// ARM-NEON (4 pipes x 128-bit vectors)
+	BLIS_ARCH_ALTRAMAX,
+	BLIS_ARCH_ALTRA,
 	BLIS_ARCH_FIRESTORM,
 
 	// ARM (2 pipes x 128-bit vectors)
 	BLIS_ARCH_THUNDERX2,
 	BLIS_ARCH_CORTEXA57,
 	BLIS_ARCH_CORTEXA53,
+
+	// ARM 32-bit (vintage)
 	BLIS_ARCH_CORTEXA15,
 	BLIS_ARCH_CORTEXA9,
 
diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
index 3e00df345..665fe9b70 100644
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2021, The University of Tokyo
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,7 +40,8 @@
 
 // Added prefetch fix for non-cacheline aligned C columns
 // (with the prefetches interleaved with other instructions)
-// to both sgemm and dgemm versions.
+// to both sgemm and dgemm versions.  Previously, only the
+// first cacheline in each column was prefetched.
 
 // Added sgemm prefetch fix for non-cacheline aligned C columns
 // (with the prefetches interleaved with other instructions)
@@ -79,8 +81,10 @@ void bli_sgemm_armv8a_asm_8x12
        const cntx_t*    cntx
      )
 {
-	const void* a_next = bli_auxinfo_next_a( data );
-	const void* b_next = bli_auxinfo_next_b( data );
+
+
+	void* a_next = bli_auxinfo_next_a( data );
+	void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
@@ -89,7 +93,7 @@ void bli_sgemm_armv8a_asm_8x12
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
-	GEMM_UKR_SETUP_CT( s, 8, 12, false );
+	GEMM_UKR_SETUP_CT_ANY( s, 8, 12, false );
 
 
 	__asm__ volatile
@@ -105,108 +109,84 @@ void bli_sgemm_armv8a_asm_8x12
 	"                                            \n\t"
 	" ldr x5,%[k_iter]                           \n\t" // Number of unrolled iterations (k_iter).
 	" ldr x6,%[k_left]                           \n\t" // Number of remaining iterations (k_left).
-	" add x16,x2,x10                             \n\t" // Load address Column 1 of C
+	" add x16,x2,x10                             \n\t" //Load address Column 1 of C
 	"                                            \n\t"
-	// " ldr x14,%[rs_c]                            \n\t" // Load rs_c.
-	// " lsl x14,x14,#2                             \n\t" // rs_c * sizeof(float).
+	" ldr x14,%[rs_c]                            \n\t" // Load rs_c.
+	" lsl x14,x14,#2                             \n\t" // rs_c * sizeof(float).
 	"                                            \n\t"
 	"                                            \n\t"
 	" dup  v8.4s, wzr                            \n\t" // Vector for accummulating column 0
 	" prfm    PLDL1KEEP, [x1, #192]              \n\t"
 	" prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
-	" add x17,x16,x10                            \n\t" // Load address Column 2 of C
-
+	" add x17,x16,x10                            \n\t" //Load address Column 2 of C
 	" dup  v9.4s, wzr                            \n\t" // Vector for accummulating column 0
 	" prfm    PLDL1KEEP, [x1, #256]              \n\t"
-	"                                            \n\t" // Since columns of C can cross a cache
-	                                                   // line boundary, we also need to prefetch
-	                                                   // the "ends."
+	"                                            \n\t" // Since the columns can cross a cache line boundary,
+	                                                   // we also need to prefetch the "ends"
 	" prfm pldl1keep,[x2, #16]                   \n\t" // Prefetch c.
-	" add x19,x17,x10                            \n\t" // Load address Column 3 of C
-
+	" add x19,x17,x10                            \n\t" //Load address Column 3 of C
 	" dup  v10.4s, wzr                           \n\t" // Vector for accummulating column 1
 	" prfm    PLDL1KEEP, [x1, #320]              \n\t"
 	" prfm pldl1keep,[x16]                       \n\t" // Prefetch c.
-	" add x20,x19,x10                            \n\t" // Load address Column 4 of C
-
+	" add x20,x19,x10                            \n\t" //Load address Column 4 of C
 	" dup  v11.4s, wzr                           \n\t" // Vector for accummulating column 1
-	" prfm pldl1keep,[x16]                       \n\t" // Prefetch c.
 	" prfm pldl1keep,[x16, #16]                  \n\t" // Prefetch c.
-
 	" dup  v12.4s, wzr                           \n\t" // Vector for accummulating column 2
 	" prfm pldl1keep,[x17]                       \n\t" // Prefetch c.
-	" add x21,x20,x10                            \n\t" // Load address Column 5 of C
-
+	" add x21,x20,x10                            \n\t" //Load address Column 5 of C
 	" dup  v13.4s, wzr                           \n\t" // Vector for accummulating column 2
 	" prfm pldl1keep,[x17, #16]                  \n\t" // Prefetch c.
 	"                                            \n\t"
 	" dup  v14.4s, wzr                           \n\t" // Vector for accummulating column 3
 	" prfm    PLDL1KEEP, [x0, #128]              \n\t"
 	" prfm pldl1keep,[x19]                       \n\t" // Prefetch c.
-	" add x22,x21,x10                            \n\t" // Load address Column 6 of C
-
+	" add x22,x21,x10                            \n\t" //Load address Column 6 of C
 	" dup  v15.4s, wzr                           \n\t" // Vector for accummulating column 3
 	" prfm    PLDL1KEEP, [x0, #192]              \n\t"
 	" prfm pldl1keep,[x19, #16]                  \n\t" // Prefetch c.
-
 	" dup  v16.4s, wzr                           \n\t" // Vector for accummulating column 4
 	" prfm pldl1keep,[x20]                       \n\t" // Prefetch c.
-	" add x23,x22,x10                            \n\t" // Load address Column 7 of C
-
+	" add x23,x22,x10                            \n\t" //Load address Column 7 of C
 	" dup  v17.4s, wzr                           \n\t" // Vector for accummulating column 4
 	" prfm pldl1keep,[x20, #16]                  \n\t" // Prefetch c.
-
 	" dup  v18.4s, wzr                           \n\t" // Vector for accummulating column 5
 	" prfm pldl1keep,[x21]                       \n\t" // Prefetch c.
-	" add x24,x23,x10                            \n\t" // Load address Column 8 of C
-
+	" add x24,x23,x10                            \n\t" //Load address Column 8 of C
 	" dup  v19.4s, wzr                           \n\t" // Vector for accummulating column 5
 	" prfm pldl1keep,[x21, #16]                  \n\t" // Prefetch c.
-
 	"                                            \n\t"
 	" dup  v20.4s, wzr                           \n\t" // Vector for accummulating column 6
 	" prfm pldl1keep,[x22]                       \n\t" // Prefetch c.
-	" add x25,x24,x10                            \n\t" // Load address Column 9 of C
-
+	" add x25,x24,x10                            \n\t" //Load address Column 9 of C
 	" dup  v21.4s, wzr                           \n\t" // Vector for accummulating column 6
 	" prfm pldl1keep,[x22, #16]                  \n\t" // Prefetch c.
-
 	" dup  v22.4s, wzr                           \n\t" // Vector for accummulating column 7
 	" prfm pldl1keep,[x23]                       \n\t" // Prefetch c.
-	" add x26,x25,x10                            \n\t" // Load address Column 10 of C
-
+	" add x26,x25,x10                            \n\t" //Load address Column 10 of C
 	" dup  v23.4s, wzr                           \n\t" // Vector for accummulating column 7
 	" prfm pldl1keep,[x23, #16]                  \n\t" // Prefetch c.
-
 	" dup  v24.4s, wzr                           \n\t" // Vector for accummulating column 8
 	" prfm pldl1keep,[x24]                       \n\t" // Prefetch c.
-	" add x27,x26,x10                            \n\t" // Load address Column 11 of C
-
+	" add x27,x26,x10                            \n\t" //Load address Column 11 of C
 	" dup  v25.4s, wzr                           \n\t" // Vector for accummulating column 8
 	" prfm pldl1keep,[x24, #16]                  \n\t" // Prefetch c.
 	"                                            \n\t"
 	" dup  v26.4s, wzr                           \n\t" // Vector for accummulating column 9
 	" prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
-
 	" dup  v27.4s, wzr                           \n\t" // Vector for accummulating column 9
 	" prfm pldl1keep,[x25, #16]                  \n\t" // Prefetch c.
-
 	" dup  v28.4s, wzr                           \n\t" // Vector for accummulating column 10
 	" prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
-
 	" dup  v29.4s, wzr                           \n\t" // Vector for accummulating column 10
 	" prfm pldl1keep,[x26, #16]                  \n\t" // Prefetch c.
-
 	" dup  v30.4s, wzr                           \n\t" // Vector for accummulating column 11
 	" prfm pldl1keep,[x27]                       \n\t" // Prefetch c.
-
 	" dup  v31.4s, wzr                           \n\t" // Vector for accummulating column 11
 	" prfm pldl1keep,[x27, #16]                  \n\t" // Prefetch c.
 	"                                            \n\t"
 	"                                            \n\t"
-
 	" cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
-	BEQ(SCONSIDERKLEFT)
+	BEQ (SCONSIDERKLEFT)
 	"                                            \n\t"
 	" ldr q0, [x0]                               \n\t"
 	" ldr q1, [x0, #16]                          \n\t" // Load a
@@ -215,13 +195,13 @@ void bli_sgemm_armv8a_asm_8x12
 	" ldr q3, [x1, #16]                          \n\t"
 	" ldr q4, [x1, #32]                          \n\t"
 	"                                            \n\t"
-	" add x0, x0, #32                            \n\t" // Update address of A
-	" add x1, x1, #48                            \n\t" // Update address of B
+	" add x0, x0, #32                            \n\t" //update address of A
+	" add x1, x1, #48                            \n\t" //update address of B
 	"                                            \n\t"
-	" cmp x5,1                                   \n\t" // If there's only one k_iter, jump to it
-	BEQ(SLASTITER)                                     // (as loop is do-while-like).
+	" cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one.
+	BEQ (SLASTITER)                                    // (as loop is do-while-like).
 	"                                            \n\t"
-	LABEL(SLOOPKITER)                                  // Body of the k_iter loop.
+	LABEL (SLOOPKITER)                                 // Body of the k_iter loop.
 	"                                            \n\t"
 	" ldr q5, [x0]                               \n\t"
 	" fmla v8.4s, v0.4s,v2.s[0]                  \n\t" // Accummulate.
@@ -258,7 +238,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
 	" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
 	" ldr q4, [x1, #32]                          \n\t"
-	"                                            \n\t"                  // End It 1
+	"                                            \n\t" //End It 1
 	"                                            \n\t"
 	" ldr q0, [x0, #32]                          \n\t"
 	" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
@@ -294,7 +274,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v29.4s,v6.4s,v4.s[2]                  \n\t" // Accummulate.
 	" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
 	" ldr q4, [x1, #80]                          \n\t"
-	"                                            \n\t"                  // End It 2
+	"                                            \n\t" //End It 2
 	"                                            \n\t"
 	" ldr q5, [x0, #64]                          \n\t"
 	" fmla v8.4s,v0.4s,v2.s[0]                   \n\t" // Accummulate.
@@ -328,7 +308,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
 	" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
 	" ldr q4, [x1, #128]                         \n\t"
-	"                                            \n\t"                  // End It 3
+	"                                            \n\t" //End It 3
 	"                                            \n\t"
 	" ldr q0, [x0, #96]                          \n\t"
 	" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
@@ -364,12 +344,12 @@ void bli_sgemm_armv8a_asm_8x12
 	" ldr q4, [x1, #176]                         \n\t"
 	" add x1, x1, #192                           \n\t"
 	" add x0, x0, #128                           \n\t"
-	"                                            \n\t"                  // End It 4
+	"                                            \n\t" //End It 4
 	" sub x5,x5,1                                \n\t" // i-=1.
 	" cmp x5,1                                   \n\t" // Iterate again if we are not in k_iter == 1.
-	BNE(SLOOPKITER)
+	BNE (SLOOPKITER)
 	"                                            \n\t"
-	LABEL(SLASTITER)                                   // Last iteration of k_iter loop.
+	LABEL (SLASTITER)                                  // Last iteration of k_iter loop.
 	"                                            \n\t"
 	"                                            \n\t"
 	" ldr q5, [x0]                               \n\t"
@@ -404,7 +384,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
 	" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
 	" ldr q4, [x1, #32]                          \n\t"
-	"                                            \n\t"                  // End It 1
+	"                                            \n\t" //End It 1
 	"                                            \n\t"
 	" ldr q0, [x0, #32]                          \n\t"
 	" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
@@ -438,7 +418,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v29.4s,v6.4s,v4.s[2]                  \n\t" // Accummulate.
 	" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
 	" ldr q4, [x1, #80]                          \n\t"
-	"                                            \n\t"                  // End It 2
+	"                                            \n\t" //End It 2
 	"                                            \n\t"
 	" ldr q5, [x0, #64]                          \n\t"
 	" fmla v8.4s,v0.4s,v2.s[0]                   \n\t" // Accummulate.
@@ -472,7 +452,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
 	" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
 	" ldr q4, [x1, #128]                         \n\t"
-	"                                            \n\t"                  // End It 3
+	"                                            \n\t" //End It 3
 	"                                            \n\t"
 	" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
 	" fmla v9.4s,v6.4s,v2.s[0]                   \n\t" // Accummulate.
@@ -503,13 +483,13 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
 	" add x1, x1, #144                           \n\t"
 	" add x0, x0, #96                            \n\t"
-	"                                            \n\t"                  // End It 4
+	"                                            \n\t" //End It 4
 	"                                            \n\t"
-	LABEL(SCONSIDERKLEFT)
+	LABEL (SCONSIDERKLEFT)
 	" cmp x6,0                                   \n\t" // If k_left == 0, we are done.
-	BEQ(SPOSTACCUM)                                    // else, we enter the k_left loop.
+	BEQ (SPOSTACCUM)                                   // else, we enter the k_left loop.
 	"                                            \n\t"
-	LABEL(SLOOPKLEFT)                                  // Body of the left iterations
+	LABEL (SLOOPKLEFT)                                 // Body of the left iterations
 	"                                            \n\t"
 	" ldr q0, [x0],#16                           \n\t"
 	" ldr q1, [x0],#16                           \n\t" // Load a
@@ -548,9 +528,9 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
 	"                                            \n\t"
 	" cmp x6,0                                   \n\t" // Iterate again.
-	BNE(SLOOPKLEFT)                                    // if i!=0.
+	BNE (SLOOPKLEFT)                                   // if i!=0.
 	"                                            \n\t"
-	LABEL(SPOSTACCUM)
+	LABEL (SPOSTACCUM)
 	"                                            \n\t"
 	" ldr x0,%[alpha]                            \n\t" // Alpha address.
 	" ldr x1,%[beta]                             \n\t" // Beta address.
@@ -561,7 +541,10 @@ void bli_sgemm_armv8a_asm_8x12
 	" ldr x0,%[a_next]                           \n\t" // Pointer to next block of A.
 	" ldr x1,%[b_next]                           \n\t" // Pointer to next pointer of B.
 	"                                            \n\t"
-	LABEL(SCOLSTORED)                                  // C is column-major.
+	" cmp x14,#4                                 \n\t" // If rs_c != 1 (column-major)
+	BNE (SGENSTORED)
+	"                                            \n\t"
+	LABEL (SCOLSTORED)                                 // C is column-major.
 	"                                            \n\t"
 	" dup  v0.4s, wzr                            \n\t"
 	" dup  v1.4s, wzr                            \n\t"
@@ -571,13 +554,13 @@ void bli_sgemm_armv8a_asm_8x12
 	" dup  v5.4s, wzr                            \n\t"
 	"                                            \n\t"
 	" fcmp s7,#0.0                               \n\t"
-	BEQ(SBETAZEROCOLSTOREDS1)                          // Taking care of the beta==0 case.
+	BEQ (SBETAZEROCOLSTOREDS1)                         // Taking care of the beta==0 case.
 	"                                            \n\t"
-	" ldr q0, [x2]                               \n\t" // Load column 0 of C
+	" ldr q0, [x2]                               \n\t" //Load column 0 of C
 	" ldr q1, [x2, #16]                          \n\t"
-	" ldr q2, [x16]                              \n\t" // Load column 1 of C
+	" ldr q2, [x16]                              \n\t" //Load column 1 of C
 	" ldr q3, [x16, #16]                         \n\t"
-	" ldr q4, [x17]                              \n\t" // Load column 2 of C
+	" ldr q4, [x17]                              \n\t" //Load column 2 of C
 	" ldr q5, [x17, #16]                         \n\t"
 	"                                            \n\t"
 	" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
@@ -587,7 +570,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
 	" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
 	"                                            \n\t"
-	LABEL(SBETAZEROCOLSTOREDS1)
+	LABEL (SBETAZEROCOLSTOREDS1)
 	"                                            \n\t"
 	" fmla v0.4s,v8.4s,v6.s[0]                   \n\t" // Scale by alpha
 	" fmla v1.4s,v9.4s,v6.s[0]                   \n\t" // Scale by alpha
@@ -596,11 +579,11 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v4.4s,v12.4s,v6.s[0]                  \n\t" // Scale by alpha
 	" fmla v5.4s,v13.4s,v6.s[0]                  \n\t" // Scale by alpha
 	"                                            \n\t"
-	" str q0, [x2]                               \n\t" // Store column 0 of C
+	" str q0, [x2]                               \n\t" //Store column 0 of C
 	" str q1, [x2, #16]                          \n\t"
-	" str q2, [x16]                              \n\t" // Store column 1 of C
+	" str q2, [x16]                              \n\t" //Store column 1 of C
 	" str q3, [x16, #16]                         \n\t"
-	" str q4, [x17]                              \n\t" // Store column 2 of C
+	" str q4, [x17]                              \n\t" //Store column 2 of C
 	" str q5, [x17, #16]                         \n\t"
 	"                                            \n\t"
 	" dup  v8.4s, wzr                            \n\t"
@@ -611,13 +594,13 @@ void bli_sgemm_armv8a_asm_8x12
 	" dup  v13.4s, wzr                           \n\t"
 	"                                            \n\t"
 	" fcmp s7,#0.0                               \n\t"
-	BEQ(SBETAZEROCOLSTOREDS2)                          // Taking care of the beta==0 case.
+	BEQ (SBETAZEROCOLSTOREDS2)                         // Taking care of the beta==0 case.
 	"                                            \n\t"
-	" ldr q8, [x19]                              \n\t" // Load column 3 of C
+	" ldr q8, [x19]                              \n\t" //Load column 3 of C
 	" ldr q9, [x19, #16]                         \n\t"
-	" ldr q10, [x20]                             \n\t" // Load column 4 of C
+	" ldr q10, [x20]                             \n\t" //Load column 4 of C
 	" ldr q11, [x20, #16]                        \n\t"
-	" ldr q12, [x21]                             \n\t" // Load column 5 of C
+	" ldr q12, [x21]                             \n\t" //Load column 5 of C
 	" ldr q13, [x21, #16]                        \n\t"
 	"                                            \n\t"
 	" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
@@ -627,7 +610,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
 	" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
 	"                                            \n\t"
-	LABEL(SBETAZEROCOLSTOREDS2)
+	LABEL (SBETAZEROCOLSTOREDS2)
 	"                                            \n\t"
 	" fmla v8.4s, v14.4s,v6.s[0]                 \n\t" // Scale by alpha
 	" fmla v9.4s, v15.4s,v6.s[0]                 \n\t" // Scale by alpha
@@ -636,11 +619,11 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v12.4s,v18.4s,v6.s[0]                 \n\t" // Scale by alpha
 	" fmla v13.4s,v19.4s,v6.s[0]                 \n\t" // Scale by alpha
 	"                                            \n\t"
-	" str q8, [x19]                              \n\t" // Store column 3 of C
+	" str q8, [x19]                              \n\t" //Store column 3 of C
 	" str q9, [x19, #16]                         \n\t"
-	" str q10, [x20]                             \n\t" // Store column 4 of C
+	" str q10, [x20]                             \n\t" //Store column 4 of C
 	" str q11, [x20, #16]                        \n\t"
-	" str q12, [x21]                             \n\t" // Store column 5 of C
+	" str q12, [x21]                             \n\t" //Store column 5 of C
 	" str q13, [x21, #16]                        \n\t"
 	"                                            \n\t"
 	" dup  v0.4s, wzr                            \n\t"
@@ -651,13 +634,13 @@ void bli_sgemm_armv8a_asm_8x12
 	" dup  v5.4s, wzr                            \n\t"
 	"                                            \n\t"
 	" fcmp s7,#0.0                               \n\t"
-	BEQ(SBETAZEROCOLSTOREDS3)                          // Taking care of the beta==0 case.
+	BEQ (SBETAZEROCOLSTOREDS3)                         // Taking care of the beta==0 case.
 	"                                            \n\t"
-	" ldr q0, [x22]                              \n\t" // Load column 6 of C
+	" ldr q0, [x22]                              \n\t" //Load column 6 of C
 	" ldr q1, [x22, #16]                         \n\t"
-	" ldr q2, [x23]                              \n\t" // Load column 7 of C
+	" ldr q2, [x23]                              \n\t" //Load column 7 of C
 	" ldr q3, [x23, #16]                         \n\t"
-	" ldr q4, [x24]                              \n\t" // Load column 8 of C
+	" ldr q4, [x24]                              \n\t" //Load column 8 of C
 	" ldr q5, [x24, #16]                         \n\t"
 	"                                            \n\t"
 	" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
@@ -667,7 +650,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
 	" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
 	"                                            \n\t"
-	LABEL(SBETAZEROCOLSTOREDS3)
+	LABEL (SBETAZEROCOLSTOREDS3)
 	"                                            \n\t"
 	" fmla v0.4s,v20.4s,v6.s[0]                  \n\t" // Scale by alpha
 	" fmla v1.4s,v21.4s,v6.s[0]                  \n\t" // Scale by alpha
@@ -676,11 +659,11 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v4.4s,v24.4s,v6.s[0]                  \n\t" // Scale by alpha
 	" fmla v5.4s,v25.4s,v6.s[0]                  \n\t" // Scale by alpha
 	"                                            \n\t"
-	" str q0, [x22]                              \n\t" // Store column 6 of C
+	" str q0, [x22]                              \n\t" //Store column 6 of C
 	" str q1, [x22, #16]                         \n\t"
-	" str q2, [x23]                              \n\t" // Store column 7 of C
+	" str q2, [x23]                              \n\t" //Store column 7 of C
 	" str q3, [x23, #16]                         \n\t"
-	" str q4, [x24]                              \n\t" // Store column 8 of C
+	" str q4, [x24]                              \n\t" //Store column 8 of C
 	" str q5, [x24, #16]                         \n\t"
 	"                                            \n\t"
 	" dup  v8.4s, wzr                            \n\t"
@@ -691,13 +674,13 @@ void bli_sgemm_armv8a_asm_8x12
 	" dup  v13.4s, wzr                            \n\t"
 	"                                            \n\t"
 	" fcmp s7,#0.0                               \n\t"
-	BEQ(SBETAZEROCOLSTOREDS4)                          // Taking care of the beta==0 case.
+	BEQ (SBETAZEROCOLSTOREDS4)                         // Taking care of the beta==0 case.
 	"                                            \n\t"
-	" ldr q8, [x25]                              \n\t" // Load column 9 of C
+	" ldr q8, [x25]                              \n\t" //Load column 9 of C
 	" ldr q9, [x25, #16]                         \n\t"
-	" ldr q10, [x26]                             \n\t" // Load column 10 of C
+	" ldr q10, [x26]                             \n\t" //Load column 10 of C
 	" ldr q11, [x26, #16]                        \n\t"
-	" ldr q12, [x27]                             \n\t" // Load column 11 of C
+	" ldr q12, [x27]                             \n\t" //Load column 11 of C
 	" ldr q13, [x27, #16]                        \n\t"
 	"                                            \n\t"
 	" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
@@ -707,7 +690,7 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
 	" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
 	"                                            \n\t"
-	LABEL(SBETAZEROCOLSTOREDS4)
+	LABEL (SBETAZEROCOLSTOREDS4)
 	"                                            \n\t"
 	" prfm pldl2keep,[x0]                        \n\t"
 	" prfm pldl2keep,[x1]                        \n\t"
@@ -719,44 +702,420 @@ void bli_sgemm_armv8a_asm_8x12
 	" fmla v12.4s,v30.4s,v6.s[0]                 \n\t" // Scale by alpha
 	" fmla v13.4s,v31.4s,v6.s[0]                 \n\t" // Scale by alpha
 	"                                            \n\t"
-	" str q8, [x25]                              \n\t" // Store column 9 of C
+	" str q8, [x25]                              \n\t" //Store column 9 of C
 	" str q9, [x25, #16]                         \n\t"
-	" str q10, [x26]                             \n\t" // Store column 10 of C
+	" str q10, [x26]                             \n\t" //Store column 10 of C
 	" str q11, [x26, #16]                        \n\t"
-	" str q12, [x27]                             \n\t" // Store column 11 of C
+	" str q12, [x27]                             \n\t" //Store column 11 of C
 	" str q13, [x27, #16]                        \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	// BRANCH(SEND)                                       // Done.
-	// LABEL(SEND)                                        // Done!
+	BRANCH (SEND)                                      // Done.
+	"                                            \n\t"
+	"                                            \n\t"
+	LABEL (SGENSTORED)                                 // C is general-stride stored.
+	"                                            \n\t"
+	"                                            \n\t"
+	" dup  v0.4s, wzr                            \n\t"
+	" dup  v1.4s, wzr                            \n\t"
+	" dup  v2.4s, wzr                            \n\t"
+	" dup  v3.4s, wzr                            \n\t"
+	" dup  v4.4s, wzr                            \n\t"
+	" dup  v5.4s, wzr                            \n\t"
+	"                                            \n\t"
+	" fcmp s7,#0.0                               \n\t"
+	BEQ (SBETAZEROGENSTOREDS1)                         // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" mov x5, x2                                 \n\t"
+	"                                            \n\t"
+	" ld1 {v0.s}[0],[x5],x14                     \n\t" // Load c00  into quad and increment by rs_c.
+	" ld1 {v0.s}[1],[x5],x14                     \n\t" // Load c01  into quad and increment by rs_c.
+	" ld1 {v0.s}[2],[x5],x14                     \n\t" // Load c02  into quad and increment by rs_c.
+	" ld1 {v0.s}[3],[x5],x14                     \n\t" // Load c03  into quad and increment by rs_c.
+	" ld1 {v1.s}[0],[x5],x14                     \n\t" // Load c04  into quad and increment by rs_c.
+	" ld1 {v1.s}[1],[x5],x14                     \n\t" // Load c05  into quad and increment by rs_c.
+	" ld1 {v1.s}[2],[x5],x14                     \n\t" // Load c06  into quad and increment by rs_c.
+	" ld1 {v1.s}[3],[x5],x14                     \n\t" // Load c07  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x16                                \n\t"
+	"                                            \n\t"
+	" ld1 {v2.s}[0],[x5],x14                     \n\t" // Load c10  into quad and increment by rs_c.
+	" ld1 {v2.s}[1],[x5],x14                     \n\t" // Load c11  into quad and increment by rs_c.
+	" ld1 {v2.s}[2],[x5],x14                     \n\t" // Load c12  into quad and increment by rs_c.
+	" ld1 {v2.s}[3],[x5],x14                     \n\t" // Load c13  into quad and increment by rs_c.
+	" ld1 {v3.s}[0],[x5],x14                     \n\t" // Load c14  into quad and increment by rs_c.
+	" ld1 {v3.s}[1],[x5],x14                     \n\t" // Load c15  into quad and increment by rs_c.
+	" ld1 {v3.s}[2],[x5],x14                     \n\t" // Load c16  into quad and increment by rs_c.
+	" ld1 {v3.s}[3],[x5],x14                     \n\t" // Load c17  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x17                                \n\t"
+	"                                            \n\t"
+	" ld1 {v4.s}[0],[x5],x14                     \n\t" // Load c20  into quad and increment by rs_c.
+	" ld1 {v4.s}[1],[x5],x14                     \n\t" // Load c21  into quad and increment by rs_c.
+	" ld1 {v4.s}[2],[x5],x14                     \n\t" // Load c22  into quad and increment by rs_c.
+	" ld1 {v4.s}[3],[x5],x14                     \n\t" // Load c23  into quad and increment by rs_c.
+	" ld1 {v5.s}[0],[x5],x14                     \n\t" // Load c24  into quad and increment by rs_c.
+	" ld1 {v5.s}[1],[x5],x14                     \n\t" // Load c25  into quad and increment by rs_c.
+	" ld1 {v5.s}[2],[x5],x14                     \n\t" // Load c26  into quad and increment by rs_c.
+	" ld1 {v5.s}[3],[x5],x14                     \n\t" // Load c27  into quad and increment by rs_c.
+	"                                            \n\t"
+	" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v2.4s,v2.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v3.4s,v3.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL (SBETAZEROGENSTOREDS1)
+	"                                            \n\t"
+	" fmla v0.4s, v8.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v1.4s, v9.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v2.4s,v10.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v3.4s,v11.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v4.4s,v12.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v5.4s,v13.4s,v6.s[0]                  \n\t" // Scale by alpha
+	"                                            \n\t"
+	" mov x5, x2                                 \n\t"
+	"                                            \n\t"
+	" st1 {v0.s}[0],[x5],x14                     \n\t" // Store c00  into quad and increment by rs_c.
+	" st1 {v0.s}[1],[x5],x14                     \n\t" // Store c01  into quad and increment by rs_c.
+	" st1 {v0.s}[2],[x5],x14                     \n\t" // Store c02  into quad and increment by rs_c.
+	" st1 {v0.s}[3],[x5],x14                     \n\t" // Store c03  into quad and increment by rs_c.
+	" st1 {v1.s}[0],[x5],x14                     \n\t" // Store c04  into quad and increment by rs_c.
+	" st1 {v1.s}[1],[x5],x14                     \n\t" // Store c05  into quad and increment by rs_c.
+	" st1 {v1.s}[2],[x5],x14                     \n\t" // Store c06  into quad and increment by rs_c.
+	" st1 {v1.s}[3],[x5],x14                     \n\t" // Store c07  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x16                                \n\t"
+	"                                            \n\t"
+	" st1 {v2.s}[0],[x5],x14                     \n\t" // Store c10  into quad and increment by rs_c.
+	" st1 {v2.s}[1],[x5],x14                     \n\t" // Store c11  into quad and increment by rs_c.
+	" st1 {v2.s}[2],[x5],x14                     \n\t" // Store c12  into quad and increment by rs_c.
+	" st1 {v2.s}[3],[x5],x14                     \n\t" // Store c13  into quad and increment by rs_c.
+	" st1 {v3.s}[0],[x5],x14                     \n\t" // Store c14  into quad and increment by rs_c.
+	" st1 {v3.s}[1],[x5],x14                     \n\t" // Store c15  into quad and increment by rs_c.
+	" st1 {v3.s}[2],[x5],x14                     \n\t" // Store c16  into quad and increment by rs_c.
+	" st1 {v3.s}[3],[x5],x14                     \n\t" // Store c17  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x17                                \n\t"
+	"                                            \n\t"
+	" st1 {v4.s}[0],[x5],x14                     \n\t" // Store c20  into quad and increment by rs_c.
+	" st1 {v4.s}[1],[x5],x14                     \n\t" // Store c21  into quad and increment by rs_c.
+	" st1 {v4.s}[2],[x5],x14                     \n\t" // Store c22  into quad and increment by rs_c.
+	" st1 {v4.s}[3],[x5],x14                     \n\t" // Store c23  into quad and increment by rs_c.
+	" st1 {v5.s}[0],[x5],x14                     \n\t" // Store c24  into quad and increment by rs_c.
+	" st1 {v5.s}[1],[x5],x14                     \n\t" // Store c25  into quad and increment by rs_c.
+	" st1 {v5.s}[2],[x5],x14                     \n\t" // Store c26  into quad and increment by rs_c.
+	" st1 {v5.s}[3],[x5],x14                     \n\t" // Store c27  into quad and increment by rs_c.
+	"                                            \n\t"
+	" dup  v8.4s, wzr                            \n\t"
+	" dup  v9.4s, wzr                            \n\t"
+	" dup  v10.4s, wzr                           \n\t"
+	" dup  v11.4s, wzr                           \n\t"
+	" dup  v12.4s, wzr                           \n\t"
+	" dup  v13.4s, wzr                           \n\t"
+	"                                            \n\t"
+	" fcmp s7,#0.0                               \n\t"
+	BEQ (SBETAZEROGENSTOREDS2)                         // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" mov x5, x19                                \n\t"
+	"                                            \n\t"
+	" ld1 {v8.s}[0],[x5],x14                     \n\t" // Load c30  into quad and increment by rs_c.
+	" ld1 {v8.s}[1],[x5],x14                     \n\t" // Load c31  into quad and increment by rs_c.
+	" ld1 {v8.s}[2],[x5],x14                     \n\t" // Load c32  into quad and increment by rs_c.
+	" ld1 {v8.s}[3],[x5],x14                     \n\t" // Load c33  into quad and increment by rs_c.
+	" ld1 {v9.s}[0],[x5],x14                     \n\t" // Load c34  into quad and increment by rs_c.
+	" ld1 {v9.s}[1],[x5],x14                     \n\t" // Load c35  into quad and increment by rs_c.
+	" ld1 {v9.s}[2],[x5],x14                     \n\t" // Load c36  into quad and increment by rs_c.
+	" ld1 {v9.s}[3],[x5],x14                     \n\t" // Load c37  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x20                                \n\t"
+	"                                            \n\t"
+	" ld1 {v10.s}[0],[x5],x14                    \n\t" // Load c40  into quad and increment by rs_c.
+	" ld1 {v10.s}[1],[x5],x14                    \n\t" // Load c41  into quad and increment by rs_c.
+	" ld1 {v10.s}[2],[x5],x14                    \n\t" // Load c42  into quad and increment by rs_c.
+	" ld1 {v10.s}[3],[x5],x14                    \n\t" // Load c43  into quad and increment by rs_c.
+	" ld1 {v11.s}[0],[x5],x14                    \n\t" // Load c44  into quad and increment by rs_c.
+	" ld1 {v11.s}[1],[x5],x14                    \n\t" // Load c45  into quad and increment by rs_c.
+	" ld1 {v11.s}[2],[x5],x14                    \n\t" // Load c46  into quad and increment by rs_c.
+	" ld1 {v11.s}[3],[x5],x14                    \n\t" // Load c47  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x21                                \n\t"
+	"                                            \n\t"
+	" ld1 {v12.s}[0],[x5],x14                    \n\t" // Load c50  into quad and increment by rs_c.
+	" ld1 {v12.s}[1],[x5],x14                    \n\t" // Load c51  into quad and increment by rs_c.
+	" ld1 {v12.s}[2],[x5],x14                    \n\t" // Load c52  into quad and increment by rs_c.
+	" ld1 {v12.s}[3],[x5],x14                    \n\t" // Load c53  into quad and increment by rs_c.
+	" ld1 {v13.s}[0],[x5],x14                    \n\t" // Load c54  into quad and increment by rs_c.
+	" ld1 {v13.s}[1],[x5],x14                    \n\t" // Load c55  into quad and increment by rs_c.
+	" ld1 {v13.s}[2],[x5],x14                    \n\t" // Load c56  into quad and increment by rs_c.
+	" ld1 {v13.s}[3],[x5],x14                    \n\t" // Load c57  into quad and increment by rs_c.
+	"                                            \n\t"
+	" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
+	" fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
+	" fmul v10.4s,v10.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v11.4s,v11.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL (SBETAZEROGENSTOREDS2)
+	"                                            \n\t"
+	" fmla v8.4s, v14.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v9.4s, v15.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v10.4s,v16.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v11.4s,v17.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v12.4s,v18.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v13.4s,v19.4s,v6.s[0]                 \n\t" // Scale by alpha
+	"                                            \n\t"
+	" mov x5, x19                                \n\t"
+	"                                            \n\t"
+	" st1 {v8.s}[0],[x5],x14                     \n\t" // Store c30  into quad and increment by rs_c.
+	" st1 {v8.s}[1],[x5],x14                     \n\t" // Store c31  into quad and increment by rs_c.
+	" st1 {v8.s}[2],[x5],x14                     \n\t" // Store c32  into quad and increment by rs_c.
+	" st1 {v8.s}[3],[x5],x14                     \n\t" // Store c33  into quad and increment by rs_c.
+	" st1 {v9.s}[0],[x5],x14                     \n\t" // Store c34  into quad and increment by rs_c.
+	" st1 {v9.s}[1],[x5],x14                     \n\t" // Store c35  into quad and increment by rs_c.
+	" st1 {v9.s}[2],[x5],x14                     \n\t" // Store c36  into quad and increment by rs_c.
+	" st1 {v9.s}[3],[x5],x14                     \n\t" // Store c37  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x20                                \n\t"
+	"                                            \n\t"
+	" st1 {v10.s}[0],[x5],x14                    \n\t" // Store c40  into quad and increment by rs_c.
+	" st1 {v10.s}[1],[x5],x14                    \n\t" // Store c41  into quad and increment by rs_c.
+	" st1 {v10.s}[2],[x5],x14                    \n\t" // Store c42  into quad and increment by rs_c.
+	" st1 {v10.s}[3],[x5],x14                    \n\t" // Store c43  into quad and increment by rs_c.
+	" st1 {v11.s}[0],[x5],x14                    \n\t" // Store c44  into quad and increment by rs_c.
+	" st1 {v11.s}[1],[x5],x14                    \n\t" // Store c45  into quad and increment by rs_c.
+	" st1 {v11.s}[2],[x5],x14                    \n\t" // Store c46  into quad and increment by rs_c.
+	" st1 {v11.s}[3],[x5],x14                    \n\t" // Store c47  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x21                                \n\t"
+	"                                            \n\t"
+	" st1 {v12.s}[0],[x5],x14                    \n\t" // Store c50  into quad and increment by rs_c.
+	" st1 {v12.s}[1],[x5],x14                    \n\t" // Store c51  into quad and increment by rs_c.
+	" st1 {v12.s}[2],[x5],x14                    \n\t" // Store c52  into quad and increment by rs_c.
+	" st1 {v12.s}[3],[x5],x14                    \n\t" // Store c53  into quad and increment by rs_c.
+	" st1 {v13.s}[0],[x5],x14                    \n\t" // Store c54  into quad and increment by rs_c.
+	" st1 {v13.s}[1],[x5],x14                    \n\t" // Store c55  into quad and increment by rs_c.
+	" st1 {v13.s}[2],[x5],x14                    \n\t" // Store c56  into quad and increment by rs_c.
+	" st1 {v13.s}[3],[x5],x14                    \n\t" // Store c57  into quad and increment by rs_c.
+	"                                            \n\t"
+	" dup  v0.4s, wzr                            \n\t"
+	" dup  v1.4s, wzr                            \n\t"
+	" dup  v2.4s, wzr                            \n\t"
+	" dup  v3.4s, wzr                            \n\t"
+	" dup  v4.4s, wzr                            \n\t"
+	" dup  v5.4s, wzr                            \n\t"
+	"                                            \n\t"
+	" fcmp s7,#0.0                               \n\t"
+	BEQ (SBETAZEROGENSTOREDS3)                         // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" mov x5, x22                                \n\t"
+	"                                            \n\t"
+	" ld1 {v0.s}[0],[x5],x14                     \n\t" // Load c60  into quad and increment by rs_c.
+	" ld1 {v0.s}[1],[x5],x14                     \n\t" // Load c61  into quad and increment by rs_c.
+	" ld1 {v0.s}[2],[x5],x14                     \n\t" // Load c62  into quad and increment by rs_c.
+	" ld1 {v0.s}[3],[x5],x14                     \n\t" // Load c63  into quad and increment by rs_c.
+	" ld1 {v1.s}[0],[x5],x14                     \n\t" // Load c64  into quad and increment by rs_c.
+	" ld1 {v1.s}[1],[x5],x14                     \n\t" // Load c65  into quad and increment by rs_c.
+	" ld1 {v1.s}[2],[x5],x14                     \n\t" // Load c66  into quad and increment by rs_c.
+	" ld1 {v1.s}[3],[x5],x14                     \n\t" // Load c67  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x23                                \n\t"
+	"                                            \n\t"
+	" ld1 {v2.s}[0],[x5],x14                     \n\t" // Load c70  into quad and increment by rs_c.
+	" ld1 {v2.s}[1],[x5],x14                     \n\t" // Load c71  into quad and increment by rs_c.
+	" ld1 {v2.s}[2],[x5],x14                     \n\t" // Load c72  into quad and increment by rs_c.
+	" ld1 {v2.s}[3],[x5],x14                     \n\t" // Load c73  into quad and increment by rs_c.
+	" ld1 {v3.s}[0],[x5],x14                     \n\t" // Load c74  into quad and increment by rs_c.
+	" ld1 {v3.s}[1],[x5],x14                     \n\t" // Load c75  into quad and increment by rs_c.
+	" ld1 {v3.s}[2],[x5],x14                     \n\t" // Load c76  into quad and increment by rs_c.
+	" ld1 {v3.s}[3],[x5],x14                     \n\t" // Load c77  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x24                                \n\t"
+	"                                            \n\t"
+	" ld1 {v4.s}[0],[x5],x14                     \n\t" // Load c80  into quad and increment by rs_c.
+	" ld1 {v4.s}[1],[x5],x14                     \n\t" // Load c81  into quad and increment by rs_c.
+	" ld1 {v4.s}[2],[x5],x14                     \n\t" // Load c82  into quad and increment by rs_c.
+	" ld1 {v4.s}[3],[x5],x14                     \n\t" // Load c83  into quad and increment by rs_c.
+	" ld1 {v5.s}[0],[x5],x14                     \n\t" // Load c84  into quad and increment by rs_c.
+	" ld1 {v5.s}[1],[x5],x14                     \n\t" // Load c85  into quad and increment by rs_c.
+	" ld1 {v5.s}[2],[x5],x14                     \n\t" // Load c86  into quad and increment by rs_c.
+	" ld1 {v5.s}[3],[x5],x14                     \n\t" // Load c87  into quad and increment by rs_c.
+	"                                            \n\t"
+	" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v2.4s,v2.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v3.4s,v3.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
+	" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL (SBETAZEROGENSTOREDS3)
+	"                                            \n\t"
+	" fmla v0.4s,v20.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v1.4s,v21.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v2.4s,v22.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v3.4s,v23.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v4.4s,v24.4s,v6.s[0]                  \n\t" // Scale by alpha
+	" fmla v5.4s,v25.4s,v6.s[0]                  \n\t" // Scale by alpha
+	"                                            \n\t"
+	" mov x5, x22                                \n\t"
+	"                                            \n\t"
+	" st1 {v0.s}[0],[x5],x14                     \n\t" // Store c60  into quad and increment by rs_c.
+	" st1 {v0.s}[1],[x5],x14                     \n\t" // Store c61  into quad and increment by rs_c.
+	" st1 {v0.s}[2],[x5],x14                     \n\t" // Store c62  into quad and increment by rs_c.
+	" st1 {v0.s}[3],[x5],x14                     \n\t" // Store c63  into quad and increment by rs_c.
+	" st1 {v1.s}[0],[x5],x14                     \n\t" // Store c64  into quad and increment by rs_c.
+	" st1 {v1.s}[1],[x5],x14                     \n\t" // Store c65  into quad and increment by rs_c.
+	" st1 {v1.s}[2],[x5],x14                     \n\t" // Store c66  into quad and increment by rs_c.
+	" st1 {v1.s}[3],[x5],x14                     \n\t" // Store c67  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x23                                \n\t"
+	"                                            \n\t"
+	" st1 {v2.s}[0],[x5],x14                     \n\t" // Store c70  into quad and increment by rs_c.
+	" st1 {v2.s}[1],[x5],x14                     \n\t" // Store c71  into quad and increment by rs_c.
+	" st1 {v2.s}[2],[x5],x14                     \n\t" // Store c72  into quad and increment by rs_c.
+	" st1 {v2.s}[3],[x5],x14                     \n\t" // Store c73  into quad and increment by rs_c.
+	" st1 {v3.s}[0],[x5],x14                     \n\t" // Store c74  into quad and increment by rs_c.
+	" st1 {v3.s}[1],[x5],x14                     \n\t" // Store c75  into quad and increment by rs_c.
+	" st1 {v3.s}[2],[x5],x14                     \n\t" // Store c76  into quad and increment by rs_c.
+	" st1 {v3.s}[3],[x5],x14                     \n\t" // Store c77  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x24                                \n\t"
+	"                                            \n\t"
+	" st1 {v4.s}[0],[x5],x14                     \n\t" // Store c80  into quad and increment by rs_c.
+	" st1 {v4.s}[1],[x5],x14                     \n\t" // Store c81  into quad and increment by rs_c.
+	" st1 {v4.s}[2],[x5],x14                     \n\t" // Store c82  into quad and increment by rs_c.
+	" st1 {v4.s}[3],[x5],x14                     \n\t" // Store c83  into quad and increment by rs_c.
+	" st1 {v5.s}[0],[x5],x14                     \n\t" // Store c84  into quad and increment by rs_c.
+	" st1 {v5.s}[1],[x5],x14                     \n\t" // Store c85  into quad and increment by rs_c.
+	" st1 {v5.s}[2],[x5],x14                     \n\t" // Store c86  into quad and increment by rs_c.
+	" st1 {v5.s}[3],[x5],x14                     \n\t" // Store c87  into quad and increment by rs_c.
+	"                                            \n\t"
+	" dup  v8.4s, wzr                            \n\t"
+	" dup  v9.4s, wzr                            \n\t"
+	" dup  v10.4s, wzr                           \n\t"
+	" dup  v11.4s, wzr                           \n\t"
+	" dup  v12.4s, wzr                           \n\t"
+	" dup  v13.4s, wzr                           \n\t"
+	"                                            \n\t"
+	" fcmp s7,#0.0                               \n\t"
+	BEQ (SBETAZEROGENSTOREDS4)                         // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" mov x5, x25                                \n\t"
+	"                                            \n\t"
+	" ld1 {v8.s}[0],[x5],x14                     \n\t" // Load c90  into quad and increment by rs_c.
+	" ld1 {v8.s}[1],[x5],x14                     \n\t" // Load c91  into quad and increment by rs_c.
+	" ld1 {v8.s}[2],[x5],x14                     \n\t" // Load c92  into quad and increment by rs_c.
+	" ld1 {v8.s}[3],[x5],x14                     \n\t" // Load c93  into quad and increment by rs_c.
+	" ld1 {v9.s}[0],[x5],x14                     \n\t" // Load c94  into quad and increment by rs_c.
+	" ld1 {v9.s}[1],[x5],x14                     \n\t" // Load c95  into quad and increment by rs_c.
+	" ld1 {v9.s}[2],[x5],x14                     \n\t" // Load c96  into quad and increment by rs_c.
+	" ld1 {v9.s}[3],[x5],x14                     \n\t" // Load c97  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x26                                \n\t"
+	"                                            \n\t"
+	" ld1 {v10.s}[0],[x5],x14                    \n\t" // Load c100  into quad and increment by rs_c.
+	" ld1 {v10.s}[1],[x5],x14                    \n\t" // Load c101  into quad and increment by rs_c.
+	" ld1 {v10.s}[2],[x5],x14                    \n\t" // Load c102  into quad and increment by rs_c.
+	" ld1 {v10.s}[3],[x5],x14                    \n\t" // Load c103  into quad and increment by rs_c.
+	" ld1 {v11.s}[0],[x5],x14                    \n\t" // Load c104  into quad and increment by rs_c.
+	" ld1 {v11.s}[1],[x5],x14                    \n\t" // Load c105  into quad and increment by rs_c.
+	" ld1 {v11.s}[2],[x5],x14                    \n\t" // Load c106  into quad and increment by rs_c.
+	" ld1 {v11.s}[3],[x5],x14                    \n\t" // Load c107  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x27                                \n\t"
+	"                                            \n\t"
+	" ld1 {v12.s}[0],[x5],x14                    \n\t" // Load c110  into quad and increment by rs_c.
+	" ld1 {v12.s}[1],[x5],x14                    \n\t" // Load c111  into quad and increment by rs_c.
+	" ld1 {v12.s}[2],[x5],x14                    \n\t" // Load c112  into quad and increment by rs_c.
+	" ld1 {v12.s}[3],[x5],x14                    \n\t" // Load c113  into quad and increment by rs_c.
+	" ld1 {v13.s}[0],[x5],x14                    \n\t" // Load c114  into quad and increment by rs_c.
+	" ld1 {v13.s}[1],[x5],x14                    \n\t" // Load c115  into quad and increment by rs_c.
+	" ld1 {v13.s}[2],[x5],x14                    \n\t" // Load c116  into quad and increment by rs_c.
+	" ld1 {v13.s}[3],[x5],x14                    \n\t" // Load c117  into quad and increment by rs_c.
+	"                                            \n\t"
+	" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
+	" fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
+	" fmul v10.4s,v10.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v11.4s,v11.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
+	" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL (SBETAZEROGENSTOREDS4)
+	"                                            \n\t"
+	" prfm pldl2keep,[x0]                        \n\t"
+	" prfm pldl2keep,[x1]                        \n\t"
+	"                                            \n\t"
+	" fmla v8.4s, v26.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v9.4s, v27.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v10.4s,v28.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v11.4s,v29.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v12.4s,v30.4s,v6.s[0]                 \n\t" // Scale by alpha
+	" fmla v13.4s,v31.4s,v6.s[0]                 \n\t" // Scale by alpha
+	"                                            \n\t"
+	" mov x5, x25                                \n\t"
+	"                                            \n\t"
+	" st1 {v8.s}[0],[x5],x14                     \n\t" // Store c90  into quad and increment by rs_c.
+	" st1 {v8.s}[1],[x5],x14                     \n\t" // Store c91  into quad and increment by rs_c.
+	" st1 {v8.s}[2],[x5],x14                     \n\t" // Store c92  into quad and increment by rs_c.
+	" st1 {v8.s}[3],[x5],x14                     \n\t" // Store c93  into quad and increment by rs_c.
+	" st1 {v9.s}[0],[x5],x14                     \n\t" // Store c94  into quad and increment by rs_c.
+	" st1 {v9.s}[1],[x5],x14                     \n\t" // Store c95  into quad and increment by rs_c.
+	" st1 {v9.s}[2],[x5],x14                     \n\t" // Store c96  into quad and increment by rs_c.
+	" st1 {v9.s}[3],[x5],x14                     \n\t" // Store c97  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x26                                \n\t"
+	"                                            \n\t"
+	" st1 {v10.s}[0],[x5],x14                    \n\t" // Store c100  into quad and increment by rs_c.
+	" st1 {v10.s}[1],[x5],x14                    \n\t" // Store c101  into quad and increment by rs_c.
+	" st1 {v10.s}[2],[x5],x14                    \n\t" // Store c102  into quad and increment by rs_c.
+	" st1 {v10.s}[3],[x5],x14                    \n\t" // Store c103  into quad and increment by rs_c.
+	" st1 {v11.s}[0],[x5],x14                    \n\t" // Store c104  into quad and increment by rs_c.
+	" st1 {v11.s}[1],[x5],x14                    \n\t" // Store c105  into quad and increment by rs_c.
+	" st1 {v11.s}[2],[x5],x14                    \n\t" // Store c106  into quad and increment by rs_c.
+	" st1 {v11.s}[3],[x5],x14                    \n\t" // Store c107  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x5, x27                                \n\t"
+	"                                            \n\t"
+	" st1 {v12.s}[0],[x5],x14                    \n\t" // Store c110  into quad and increment by rs_c.
+	" st1 {v12.s}[1],[x5],x14                    \n\t" // Store c111  into quad and increment by rs_c.
+	" st1 {v12.s}[2],[x5],x14                    \n\t" // Store c112  into quad and increment by rs_c.
+	" st1 {v12.s}[3],[x5],x14                    \n\t" // Store c113  into quad and increment by rs_c.
+	" st1 {v13.s}[0],[x5],x14                    \n\t" // Store c114  into quad and increment by rs_c.
+	" st1 {v13.s}[1],[x5],x14                    \n\t" // Store c115  into quad and increment by rs_c.
+	" st1 {v13.s}[2],[x5],x14                    \n\t" // Store c116  into quad and increment by rs_c.
+	" st1 {v13.s}[3],[x5],x14                     \n\t" // Store c117  into quad and increment by rs_c.
+	"                                            \n\t"
+	LABEL (SEND)                                       // Done!
 	"                                            \n\t"
 	:// output operands (none)
 	:// input operands
-	 [aaddr]  "m" (a),      // 0
-	 [baddr]  "m" (b),      // 1
-	 [caddr]  "m" (c),      // 2
+	 [aaddr] "m" (a),       // 0
+	 [baddr] "m" (b),       // 1
+	 [caddr] "m" (c),       // 2
 	 [k_iter] "m" (k_iter), // 3
 	 [k_left] "m" (k_left), // 4
-	 [alpha]  "m" (alpha),  // 5
-	 [beta]   "m" (beta),   // 6
-	 [rs_c]   "m" (rs_c),   // 7
-	 [cs_c]   "m" (cs_c),   // 8
+	 [alpha] "m" (alpha),   // 5
+	 [beta] "m" (beta),     // 6
+	 [rs_c] "m" (rs_c),     // 7
+	 [cs_c] "m" (cs_c),     // 8
 	 [a_next] "m" (a_next), // 9
-	 [b_next] "m" (b_next) // 10
-	:// Register clobber list
+	 [b_next] "m" (b_next)  // 10
+	 :// Register clobber list
 	 "x0", "x1", "x2",
-	 "x5", "x6", "x10",
-	 "x16","x17","x19","x20",
-	 "x21","x22","x23","x24",
-	 "x25","x26","x27",
+	 "x5", "x6", "x10", "x14",
+	 "x16", "x17", "x19", "x20",
+	 "x21", "x22", "x23", "x24",
+	 "x25", "x26", "x27",
 	 "v0", "v1", "v2", "v3",
 	 "v4", "v5", "v6", "v7",
-	 "v8", "v9", "v10","v11",
-	 "v12","v13","v14","v15",
-	 "v16","v17","v18","v19",
-	 "v20","v21","v22","v23",
-	 "v24","v25","v26","v27",
-	 "v28","v29","v30","v31"
+	 "v8", "v9", "v10", "v11",
+	 "v12", "v13", "v14", "v15",
+	 "v16", "v17", "v18", "v19",
+	 "v20", "v21", "v22", "v23",
+	 "v24", "v25", "v26", "v27",
+	 "v28", "v29", "v30", "v31"
 	);
 
 	GEMM_UKR_FLUSH_CT( s );
@@ -787,7 +1146,7 @@ void bli_sgemm_armv8a_asm_8x12
  * Tested on 2s Altra. Around 3,200 GFLOPS, 160 x N2 cores @ 3.0 GHz
  * Tested on 1s Altra, Around 1,700 GFLOPS,  80 x N2 cores @ 3.0 GHz
  * Tested on 1s Altra Max,  ~ 2,600 GFLOPS, 128 x N2 cores @ 3.0 GHz
- */
+*/
 void bli_dgemm_armv8a_asm_6x8
      (
              dim_t      m,
@@ -816,8 +1175,8 @@ void bli_dgemm_armv8a_asm_6x8
 
 #endif
 
-	const void* a_next = bli_auxinfo_next_a( data );
-	const void* b_next = bli_auxinfo_next_b( data );
+	void* a_next = bli_auxinfo_next_a( data );
+	void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
@@ -826,7 +1185,8 @@ void bli_dgemm_armv8a_asm_6x8
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
-	GEMM_UKR_SETUP_CT( d, 6, 8, false );
+	GEMM_UKR_SETUP_CT_ANY( d, 6, 8, false );
+
 
 	__asm__ volatile
 	(
@@ -840,10 +1200,10 @@ void bli_dgemm_armv8a_asm_6x8
 	"                                            \n\t"
 	" ldr x5,%[k_iter]                           \n\t" // Init guard (k_iter)
 	" ldr x6,%[k_left]                           \n\t" // Init guard (k_iter)
-	" add x20,x2,x10                             \n\t" // Load address Column 1 of C
+	" add x20,x2,x10                             \n\t" //Load address Column 1 of C
 	"                                            \n\t"
-	// " ldr x14,%[rs_c]                            \n\t" // Load rs_c.
-	// " lsl x14,x14,#3                             \n\t" // rs_c * sizeof(double).
+	" ldr x14,%[rs_c]                            \n\t" // Load rs_c.
+	" lsl x14,x14,#3                             \n\t" // rs_c * sizeof(double).
 	"                                            \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
@@ -857,76 +1217,58 @@ void bli_dgemm_armv8a_asm_6x8
 	" prfm    PLDL1KEEP, [x1, #448]              \n\t"
 	" dup  v12.2d, xzr                           \n\t" // Vector for accummulating column 1
 	" prfm    PLDL1KEEP, [x0, #192]              \n\t"
-	" add x21,x20,x10                            \n\t" // Load address Column 2 of C
-
+	" add x21,x20,x10                            \n\t" //Load address Column 2 of C
 	" dup  v13.2d, xzr                           \n\t" // Vector for accummulating column 1
 	" prfm    PLDL1KEEP, [x0, #256]              \n\t"
 	"                                            \n\t"
 	" dup  v14.2d, xzr                           \n\t" // Vector for accummulating column 2
 	" prfm    PLDL1KEEP, [x0, #320]              \n\t"
-	" add x22,x21,x10                            \n\t" // Load address Column 3 of C
-
+	" add x22,x21,x10                            \n\t" //Load address Column 3 of C
 	" dup  v15.2d, xzr                           \n\t" // Vector for accummulating column 2
 	" prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
 	" dup  v16.2d, xzr                           \n\t" // Vector for accummulating column 2
-	"                                            \n\t" // Since columns of C can cross a cache
-	                                                   // line boundary, we also need to prefetch
-	                                                   // the "ends."
+	"                                            \n\t" // Since the columns can cross a cache line boundary,
+	                                                   // we also need to prefetch the "ends"
 	" prfm pldl1keep,[x2, #32]                   \n\t" // Prefetch c.
-	" add x23,x22,x10                            \n\t" // Load address Column 4 of C
-
+	" add x23,x22,x10                            \n\t" //Load address Column 4 of C
 	" dup  v17.2d, xzr                           \n\t" // Vector for accummulating column 3
 	" prfm pldl1keep,[x20]                       \n\t" // Prefetch c.
-
 	" dup  v18.2d, xzr                           \n\t" // Vector for accummulating column 3
 	" prfm pldl1keep,[x20, #32]                  \n\t" // Prefetch c.
-	" add x24,x23,x10                            \n\t" // Load address Column 5 of C
-
+	" add x24,x23,x10                            \n\t" //Load address Column 5 of C
 	" dup  v19.2d, xzr                           \n\t" // Vector for accummulating column 3
 	" prfm pldl1keep,[x21]                       \n\t" // Prefetch c.
 	"                                            \n\t"
-
 	" dup  v20.2d, xzr                           \n\t" // Vector for accummulating column 4
 	" prfm pldl1keep,[x21, #32]                  \n\t" // Prefetch c.
-	" add x25,x24,x10                            \n\t" // Load address Column 6 of C
-
+	" add x25,x24,x10                            \n\t" //Load address Column 6 of C
 	" dup  v21.2d, xzr                           \n\t" // Vector for accummulating column 4
 	" prfm pldl1keep,[x22]                       \n\t" // Prefetch c.
-
 	" dup  v22.2d, xzr                           \n\t" // Vector for accummulating column 4
 	" prfm pldl1keep,[x22, #32]                  \n\t" // Prefetch c.
-	" add x26,x25,x10                            \n\t" // Load address Column 7 of C
-
+	" add x26,x25,x10                            \n\t" //Load address Column 7 of C
 	" dup  v23.2d, xzr                           \n\t" // Vector for accummulating column 5
 	" prfm pldl1keep,[x23]                       \n\t" // Prefetch c.
-
 	" dup  v24.2d, xzr                           \n\t" // Vector for accummulating column 5
 	" prfm pldl1keep,[x23, #32]                  \n\t" // Prefetch c.
-
 	" dup  v25.2d, xzr                           \n\t" // Vector for accummulating column 5
 	" prfm pldl1keep,[x24]                       \n\t" // Prefetch c.
 	"                                            \n\t"
 	" dup  v26.2d, xzr                           \n\t" // Vector for accummulating column 6
 	" prfm pldl1keep,[x24, #32]                  \n\t" // Prefetch c.
-
 	" dup  v27.2d, xzr                           \n\t" // Vector for accummulating column 6
 	" prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
-
 	" dup  v28.2d, xzr                           \n\t" // Vector for accummulating column 6
 	" prfm pldl1keep,[x25, #32]                  \n\t" // Prefetch c.
-
 	" dup  v29.2d, xzr                           \n\t" // Vector for accummulating column 7
 	" prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
-
 	" dup  v30.2d, xzr                           \n\t" // Vector for accummulating column 7
 	" prfm pldl1keep,[x26, #32]                  \n\t" // Prefetch c.
-
 	" dup  v31.2d, xzr                           \n\t" // Vector for accummulating column 7
 	"                                            \n\t"
 	"                                            \n\t"
-
 	" cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
-	BEQ(DCONSIDERKLEFT)
+	BEQ (DCONSIDERKLEFT)
 	"                                            \n\t"
 	" ldr q0, [x0]                               \n\t" // Load a
 	" ldr q1, [x0, #16]                          \n\t"
@@ -937,13 +1279,13 @@ void bli_dgemm_armv8a_asm_6x8
 	" ldr q5, [x1, #32]                          \n\t"
 	" ldr q6, [x1, #48]                          \n\t"
 	"                                            \n\t"
-	" add x0, x0, #48                            \n\t" // Update address of A
-	" add x1, x1, #64                            \n\t" // Update address of B
+	" add x0, x0, #48                            \n\t" //update address of A
+	" add x1, x1, #64                            \n\t" //update address of B
 	"                                            \n\t"
-	" cmp x5,1                                   \n\t" // If there's only one k_iter, jump to it
-	BEQ(DLASTITER)                                     // (as loop is do-while-like).
+	" cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one.
+	BEQ (DLASTITER)                                    // (as loop is do-while-like).
 	"                                            \n\t"
-	LABEL(DLOOP)                                       // Body
+	LABEL (DLOOP)                                      // Body
 	"                                            \n\t"
 	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
 	" prfm    PLDL1KEEP, [x1, #448]              \n\t" //512-64=448
@@ -987,7 +1329,7 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
 	" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
 	" ldr q6, [x1, #48]                          \n\t"
-	"                                            \n\t"                  // End it 1
+	"                                            \n\t" // End it 1
 	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
 	" prfm    PLDL1KEEP, [x1, #640]              \n\t"
 	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
@@ -1030,7 +1372,7 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
 	" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
 	" ldr q6, [x1, #112]                         \n\t"
-	"                                            \n\t"                  // End it 2
+	"                                            \n\t" //End it 2
 	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
 	" prfm    PLDL1KEEP, [x0, #464]              \n\t"
 	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
@@ -1071,7 +1413,7 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
 	" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
 	" ldr q6, [x1, #176]                         \n\t"
-	"                                            \n\t"                  // End it 3
+	"                                            \n\t" // End it 3
 	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
 	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
 	" fmla v10.2d,v7.2d,v3.d[0]                  \n\t" // Accummulate
@@ -1111,15 +1453,15 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
 	" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
 	" ldr q6, [x1, #240]                         \n\t"
-	"                                            \n\t"                  // End it 4
+	"                                            \n\t" //End it 4
 	" add x0, x0, #192                           \n\t"
 	" add x1, x1, #256                           \n\t"
 	"                                            \n\t"
 	" sub x5,x5,1                                \n\t" // i-=1
 	" cmp x5,1                                   \n\t" // Iterate again if we are not in k_iter == 1.
-	BNE(DLOOP)
+	BNE (DLOOP)
 	"                                            \n\t"
-	LABEL(DLASTITER)
+	LABEL (DLASTITER)
 	"                                            \n\t"
 	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
 	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
@@ -1160,7 +1502,7 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
 	" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
 	" ldr q6, [x1, #48]                          \n\t"
-	"                                            \n\t"                  // End it 1
+	"                                            \n\t" // End it 1
 	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
 	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
 	" fmla v10.2d,v7.2d,v3.d[0]                  \n\t" // Accummulate
@@ -1200,7 +1542,7 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
 	" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
 	" ldr q6, [x1, #112]                         \n\t"
-	"                                            \n\t"                  // End it 2
+	"                                            \n\t" //End it 2
 	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
 	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
 	" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
@@ -1240,7 +1582,7 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
 	" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
 	" ldr q6, [x1, #176]                         \n\t"
-	"                                            \n\t"                  // End it 3
+	"                                            \n\t" // End it 3
 	" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
 	" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
 	" fmla v10.2d,v7.2d,v3.d[0]                  \n\t" // Accummulate
@@ -1274,14 +1616,14 @@ void bli_dgemm_armv8a_asm_6x8
 	"                                            \n\t"
 	" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
 	" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
-	"                                            \n\t"                  // End it 4
+	"                                            \n\t" //End it 4
 	" add x0, x0, #144                           \n\t"
 	"                                            \n\t"
-	LABEL(DCONSIDERKLEFT)
+	LABEL (DCONSIDERKLEFT)
 	" cmp x6,0                                   \n\t" // If k_left == 0, we are done.
-	BEQ(DPOSTACCUM)                                    // else, we enter the k_left loop.
+	BEQ (DPOSTACCUM)                                   // else, we enter the k_left loop.
 	"                                            \n\t"
-	LABEL(DLOOPKLEFT)
+	LABEL (DLOOPKLEFT)
 	"                                            \n\t"
 	" ldr q0, [x0],#16                           \n\t"
 	" ldr q1, [x0],#16                           \n\t" // Load a
@@ -1328,9 +1670,9 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
 	"                                            \n\t"
 	" cmp x6,0                                   \n\t" // Iterate again.
-	BNE(DLOOPKLEFT)                                    // if i!=0.
+	BNE (DLOOPKLEFT)                                   // if i!=0.
 	"                                            \n\t"
-	LABEL(DPOSTACCUM)
+	LABEL (DPOSTACCUM)
 	"                                            \n\t"
 	" ldr x0,%[alpha]                            \n\t" // Alpha address
 	" ldr x1,%[beta]                             \n\t" // Beta address
@@ -1341,7 +1683,10 @@ void bli_dgemm_armv8a_asm_6x8
 	" ldr x0,%[a_next]                           \n\t" // Next A address for later use.
 	" ldr x1,%[b_next]                           \n\t" // Next B address for later use.
 	"                                            \n\t"
-	LABEL(DCOLSTORED)                                  // C is column-major.
+	" cmp x14,#8                                 \n\t" // If rs_c != 1, C is not column-major: use general stride.
+	BNE (DGENSTORED)
+	"                                            \n\t"
+	LABEL (DCOLSTORED)                                 // C is column-major.
 	"                                            \n\t"
 	" dup  v0.2d, xzr                            \n\t"
 	" dup  v1.2d, xzr                            \n\t"
@@ -1351,13 +1696,13 @@ void bli_dgemm_armv8a_asm_6x8
 	" dup  v5.2d, xzr                            \n\t"
 	"                                            \n\t"
 	" fcmp d7,#0.0                               \n\t"
-	BEQ(DBETAZEROCOLSTOREDS1)                          // Taking care of the beta==0 case.
+	BEQ (DBETAZEROCOLSTOREDS1)                         // Taking care of the beta==0 case.
 	"                                            \n\t"
-	" ldr q0, [x2]                               \n\t" // Load column 0 of C
+	" ldr q0, [x2]                               \n\t" //Load column 0 of C
 	" ldr q1, [x2, #16]                          \n\t"
 	" ldr q2, [x2, #32]                          \n\t"
 	"                                            \n\t"
-	" ldr q3, [x20]                              \n\t" // Load column 1 of C
+	" ldr q3, [x20]                              \n\t" //Load column 1 of C
 	" ldr q4, [x20, #16]                         \n\t"
 	" ldr q5, [x20, #32]                         \n\t"
 	"                                            \n\t"
@@ -1368,7 +1713,7 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
 	" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
 	"                                            \n\t"
-	LABEL(DBETAZEROCOLSTOREDS1)
+	LABEL (DBETAZEROCOLSTOREDS1)
 	"                                            \n\t"
 	" fmla v0.2d,v8.2d,v6.d[0]                   \n\t" // Scale by alpha
 	" fmla v1.2d,v9.2d,v6.d[0]                   \n\t" // Scale by alpha
@@ -1377,11 +1722,11 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v4.2d,v12.2d,v6.d[0]                  \n\t" // Scale by alpha
 	" fmla v5.2d,v13.2d,v6.d[0]                  \n\t" // Scale by alpha
 	"                                            \n\t"
-	" str q0, [x2]                               \n\t" // Store column 0 of C
+	" str q0, [x2]                               \n\t" //Store column 0 of C
 	" str q1, [x2, #16]                          \n\t"
 	" str q2, [x2, #32]                          \n\t"
 	"                                            \n\t"
-	" str q3, [x20]                              \n\t" // Store column 1 of C
+	" str q3, [x20]                              \n\t" //Store column 1 of C
 	" str q4, [x20, #16]                         \n\t"
 	" str q5, [x20, #32]                         \n\t"
 	"                                            \n\t"
@@ -1393,13 +1738,13 @@ void bli_dgemm_armv8a_asm_6x8
 	" dup  v13.2d, xzr                           \n\t"
 	"                                            \n\t"
 	" fcmp d7,#0.0                               \n\t"
-	BEQ(DBETAZEROCOLSTOREDS2)                          // Taking care of the beta==0 case.
+	BEQ (DBETAZEROCOLSTOREDS2)                         // Taking care of the beta==0 case.
 	"                                            \n\t"
-	" ldr q8, [x21]                              \n\t" // Load column 2 of C
+	" ldr q8, [x21]                              \n\t" //Load column 2 of C
 	" ldr q9, [x21, #16]                         \n\t"
 	" ldr q10, [x21, #32]                        \n\t"
 	"                                            \n\t"
-	" ldr q11, [x22]                             \n\t" // Load column 3 of C
+	" ldr q11, [x22]                             \n\t" //Load column 3 of C
 	" ldr q12, [x22, #16]                        \n\t"
 	" ldr q13, [x22, #32]                        \n\t"
 	"                                            \n\t"
@@ -1410,7 +1755,7 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
 	" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
 	"                                            \n\t"
-	LABEL(DBETAZEROCOLSTOREDS2)
+	LABEL (DBETAZEROCOLSTOREDS2)
 	"                                            \n\t"
 	" fmla v8.2d, v14.2d,v6.d[0]                 \n\t" // Scale by alpha
 	" fmla v9.2d, v15.2d,v6.d[0]                 \n\t" // Scale by alpha
@@ -1419,11 +1764,11 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v12.2d,v18.2d,v6.d[0]                 \n\t" // Scale by alpha
 	" fmla v13.2d,v19.2d,v6.d[0]                 \n\t" // Scale by alpha
 	"                                            \n\t"
-	" str q8, [x21]                              \n\t" // Store column 2 of C
+	" str q8, [x21]                              \n\t" //Store column 2 of C
 	" str q9, [x21, #16]                         \n\t"
 	" str q10, [x21, #32]                        \n\t"
 	"                                            \n\t"
-	" str q11, [x22]                             \n\t" // Store column 3 of C
+	" str q11, [x22]                             \n\t" //Store column 3 of C
 	" str q12, [x22, #16]                        \n\t"
 	" str q13, [x22, #32]                        \n\t"
 	"                                            \n\t"
@@ -1435,13 +1780,13 @@ void bli_dgemm_armv8a_asm_6x8
 	" dup  v5.2d, xzr                            \n\t"
 	"                                            \n\t"
 	" fcmp d7,#0.0                               \n\t"
-	BEQ(DBETAZEROCOLSTOREDS3)                          // Taking care of the beta==0 case.
+	BEQ (DBETAZEROCOLSTOREDS3)                         // Taking care of the beta==0 case.
 	"                                            \n\t"
-	" ldr q0, [x23]                              \n\t" // Load column 4 of C
+	" ldr q0, [x23]                              \n\t" //Load column 4 of C
 	" ldr q1, [x23, #16]                         \n\t"
 	" ldr q2, [x23, #32]                         \n\t"
 	"                                            \n\t"
-	" ldr q3, [x24]                              \n\t" // Load column 5 of C
+	" ldr q3, [x24]                              \n\t" //Load column 5 of C
 	" ldr q4, [x24, #16]                         \n\t"
 	" ldr q5, [x24, #32]                         \n\t"
 	"                                            \n\t"
@@ -1452,7 +1797,7 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
 	" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
 	"                                            \n\t"
-	LABEL(DBETAZEROCOLSTOREDS3)
+	LABEL (DBETAZEROCOLSTOREDS3)
 	"                                            \n\t"
 	" fmla v0.2d,v20.2d,v6.d[0]                  \n\t" // Scale by alpha
 	" fmla v1.2d,v21.2d,v6.d[0]                  \n\t" // Scale by alpha
@@ -1461,11 +1806,11 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v4.2d,v24.2d,v6.d[0]                  \n\t" // Scale by alpha
 	" fmla v5.2d,v25.2d,v6.d[0]                  \n\t" // Scale by alpha
 	"                                            \n\t"
-	" str q0, [x23]                              \n\t" // Store column 4 of C
+	" str q0, [x23]                              \n\t" //Store column 4 of C
 	" str q1, [x23, #16]                         \n\t"
 	" str q2, [x23, #32]                         \n\t"
 	"                                            \n\t"
-	" str q3, [x24]                              \n\t" // Store column 5 of C
+	" str q3, [x24]                              \n\t" //Store column 5 of C
 	" str q4, [x24, #16]                         \n\t"
 	" str q5, [x24, #32]                         \n\t"
 	"                                            \n\t"
@@ -1477,13 +1822,13 @@ void bli_dgemm_armv8a_asm_6x8
 	" dup  v13.2d, xzr                           \n\t"
 	"                                            \n\t"
 	" fcmp d7,#0.0                               \n\t"
-	BEQ(DBETAZEROCOLSTOREDS4)                          // Taking care of the beta==0 case.
+	BEQ (DBETAZEROCOLSTOREDS4)                         // Taking care of the beta==0 case.
 	"                                            \n\t"
-	" ldr q8, [x25]                              \n\t" // Load column 6 of C
+	" ldr q8, [x25]                              \n\t" //Load column 6 of C
 	" ldr q9, [x25, #16]                         \n\t"
 	" ldr q10, [x25, #32]                        \n\t"
 	"                                            \n\t"
-	" ldr q11, [x26]                             \n\t" // Load column 7 of C
+	" ldr q11, [x26]                             \n\t" //Load column 7 of C
 	" ldr q12, [x26, #16]                        \n\t"
 	" ldr q13, [x26, #32]                        \n\t"
 	"                                            \n\t"
@@ -1494,7 +1839,7 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
 	" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
 	"                                            \n\t"
-	LABEL(DBETAZEROCOLSTOREDS4)
+	LABEL (DBETAZEROCOLSTOREDS4)
 	"                                            \n\t"
 	" prfm pldl2keep,[x0]                        \n\t"
 	" prfm pldl2keep,[x1]                        \n\t"
@@ -1506,48 +1851,300 @@ void bli_dgemm_armv8a_asm_6x8
 	" fmla v12.2d,v30.2d,v6.d[0]                 \n\t" // Scale by alpha
 	" fmla v13.2d,v31.2d,v6.d[0]                 \n\t" // Scale by alpha
 	"                                            \n\t"
-	" str q8, [x25]                              \n\t" // Store column 6 of C
+	" str q8, [x25]                              \n\t" //Store column 6 of C
 	" str q9, [x25, #16]                         \n\t"
 	" str q10, [x25, #32]                        \n\t"
 	"                                            \n\t"
-	" str q11, [x26]                             \n\t" // Store column 7 of C
+	" str q11, [x26]                             \n\t" //Store column 7 of C
 	" str q12, [x26, #16]                        \n\t"
 	" str q13, [x26, #32]                        \n\t"
 	"                                            \n\t"
-	// BRANCH(DEND)
-	// LABEL(DEND)                                        // Done!
+	BRANCH (DEND)
+	"                                            \n\t"
+	LABEL (DGENSTORED)                                 // C is general-stride stored.
+	"                                            \n\t"
+	" dup  v0.2d, xzr                            \n\t"
+	" dup  v1.2d, xzr                            \n\t"
+	" dup  v2.2d, xzr                            \n\t"
+	" dup  v3.2d, xzr                            \n\t"
+	" dup  v4.2d, xzr                            \n\t"
+	" dup  v5.2d, xzr                            \n\t"
+	"                                            \n\t"
+	" fcmp d7,#0.0                               \n\t"
+	BEQ (DBETAZEROGENSTOREDS1)                         // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" mov x27, x2                                \n\t"
+	"                                            \n\t" // Load address of C.
+	" ld1 {v0.d}[0],[x27],x14                    \n\t" // Load c00  into quad and increment by rs_c.
+	" ld1 {v0.d}[1],[x27],x14                    \n\t" // Load c01  into quad and increment by rs_c.
+	" ld1 {v1.d}[0],[x27],x14                    \n\t" // Load c02  into quad and increment by rs_c.
+	" ld1 {v1.d}[1],[x27],x14                    \n\t" // Load c03  into quad and increment by rs_c.
+	" ld1 {v2.d}[0],[x27],x14                    \n\t" // Load c04  into quad and increment by rs_c.
+	" ld1 {v2.d}[1],[x27],x14                    \n\t" // Load c05  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x27, x20                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" ld1 {v3.d}[0],[x27],x14                    \n\t" // Load c10  into quad and increment by rs_c.
+	" ld1 {v3.d}[1],[x27],x14                    \n\t" // Load c11  into quad and increment by rs_c.
+	" ld1 {v4.d}[0],[x27],x14                    \n\t" // Load c12  into quad and increment by rs_c.
+	" ld1 {v4.d}[1],[x27],x14                    \n\t" // Load c13  into quad and increment by rs_c.
+	" ld1 {v5.d}[0],[x27],x14                    \n\t" // Load c14  into quad and increment by rs_c.
+	" ld1 {v5.d}[1],[x27],x14                    \n\t" // Load c15  into quad and increment by rs_c.
+	"                                            \n\t"
+	" fmul v0.2d,v0.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v1.2d,v1.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v2.2d,v2.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v3.2d,v3.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL (DBETAZEROGENSTOREDS1)
+	"                                            \n\t"
+	" fmla v0.2d,v8.2d,v6.d[0]                   \n\t" // Scale by alpha
+	" fmla v1.2d,v9.2d,v6.d[0]                   \n\t" // Scale by alpha
+	" fmla v2.2d,v10.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v3.2d,v11.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v4.2d,v12.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v5.2d,v13.2d,v6.d[0]                  \n\t" // Scale by alpha
+	"                                            \n\t"
+	" mov x27, x2                                \n\t" // Load address of C.
+	"                                            \n\t"
+	" st1 {v0.d}[0],[x27],x14                    \n\t" // Store c00  into quad and increment by rs_c.
+	" st1 {v0.d}[1],[x27],x14                    \n\t" // Store c01  into quad and increment by rs_c.
+	" st1 {v1.d}[0],[x27],x14                    \n\t" // Store c02  into quad and increment by rs_c.
+	" st1 {v1.d}[1],[x27],x14                    \n\t" // Store c03  into quad and increment by rs_c.
+	" st1 {v2.d}[0],[x27],x14                    \n\t" // Store c04  into quad and increment by rs_c.
+	" st1 {v2.d}[1],[x27],x14                    \n\t" // Store c05  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x27, x20                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" st1 {v3.d}[0],[x27],x14                    \n\t" // Store c10  into quad and increment by rs_c.
+	" st1 {v3.d}[1],[x27],x14                    \n\t" // Store c11  into quad and increment by rs_c.
+	" st1 {v4.d}[0],[x27],x14                    \n\t" // Store c12  into quad and increment by rs_c.
+	" st1 {v4.d}[1],[x27],x14                    \n\t" // Store c13  into quad and increment by rs_c.
+	" st1 {v5.d}[0],[x27],x14                    \n\t" // Store c14  into quad and increment by rs_c.
+	" st1 {v5.d}[1],[x27],x14                    \n\t" // Store c15  into quad and increment by rs_c.
+	"                                            \n\t"
+	" dup  v8.2d, xzr                            \n\t"
+	" dup  v9.2d, xzr                            \n\t"
+	" dup  v10.2d, xzr                           \n\t"
+	" dup  v11.2d, xzr                           \n\t"
+	" dup  v12.2d, xzr                           \n\t"
+	" dup  v13.2d, xzr                           \n\t"
+	"                                            \n\t"
+	" fcmp d7,#0.0                               \n\t"
+	BEQ (DBETAZEROGENSTOREDS2)                         // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" mov x27, x21                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" ld1 {v8.d}[0], [x27],x14                   \n\t" // Load c20  into quad and increment by rs_c.
+	" ld1 {v8.d}[1], [x27],x14                   \n\t" // Load c21  into quad and increment by rs_c.
+	" ld1 {v9.d}[0], [x27],x14                   \n\t" // Load c22  into quad and increment by rs_c.
+	" ld1 {v9.d}[1], [x27],x14                   \n\t" // Load c23  into quad and increment by rs_c.
+	" ld1 {v10.d}[0],[x27],x14                   \n\t" // Load c24  into quad and increment by rs_c.
+	" ld1 {v10.d}[1],[x27],x14                   \n\t" // Load c25  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x27, x22                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" ld1 {v11.d}[0],[x27],x14                   \n\t" // Load c30  into quad and increment by rs_c.
+	" ld1 {v11.d}[1],[x27],x14                   \n\t" // Load c31  into quad and increment by rs_c.
+	" ld1 {v12.d}[0],[x27],x14                   \n\t" // Load c32  into quad and increment by rs_c.
+	" ld1 {v12.d}[1],[x27],x14                   \n\t" // Load c33  into quad and increment by rs_c.
+	" ld1 {v13.d}[0],[x27],x14                   \n\t" // Load c34  into quad and increment by rs_c.
+	" ld1 {v13.d}[1],[x27],x14                   \n\t" // Load c35  into quad and increment by rs_c.
+	"                                            \n\t"
+	" fmul v8.2d, v8.2d, v7.d[0]                 \n\t" // Scale by beta
+	" fmul v9.2d, v9.2d, v7.d[0]                 \n\t" // Scale by beta
+	" fmul v10.2d,v10.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v11.2d,v11.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL (DBETAZEROGENSTOREDS2)
+	"                                            \n\t"
+	" fmla v8.2d, v14.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v9.2d, v15.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v10.2d,v16.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v11.2d,v17.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v12.2d,v18.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v13.2d,v19.2d,v6.d[0]                 \n\t" // Scale by alpha
+	"                                            \n\t"
+	" mov x27, x21                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" st1 {v8.d}[0], [x27],x14                   \n\t" // Store c20  into quad and increment by rs_c.
+	" st1 {v8.d}[1], [x27],x14                   \n\t" // Store c21  into quad and increment by rs_c.
+	" st1 {v9.d}[0], [x27],x14                   \n\t" // Store c22  into quad and increment by rs_c.
+	" st1 {v9.d}[1], [x27],x14                   \n\t" // Store c23  into quad and increment by rs_c.
+	" st1 {v10.d}[0],[x27],x14                   \n\t" // Store c24  into quad and increment by rs_c.
+	" st1 {v10.d}[1],[x27],x14                   \n\t" // Store c25  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x27, x22                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" st1 {v11.d}[0],[x27],x14                   \n\t" // Store c30  into quad and increment by rs_c.
+	" st1 {v11.d}[1],[x27],x14                   \n\t" // Store c31  into quad and increment by rs_c.
+	" st1 {v12.d}[0],[x27],x14                   \n\t" // Store c32  into quad and increment by rs_c.
+	" st1 {v12.d}[1],[x27],x14                   \n\t" // Store c33  into quad and increment by rs_c.
+	" st1 {v13.d}[0],[x27],x14                   \n\t" // Store c34  into quad and increment by rs_c.
+	" st1 {v13.d}[1],[x27],x14                   \n\t" // Store c35  into quad and increment by rs_c.
+	"                                            \n\t"
+	" dup  v0.2d, xzr                            \n\t"
+	" dup  v1.2d, xzr                            \n\t"
+	" dup  v2.2d, xzr                            \n\t"
+	" dup  v3.2d, xzr                            \n\t"
+	" dup  v4.2d, xzr                            \n\t"
+	" dup  v5.2d, xzr                            \n\t"
+	"                                            \n\t"
+	" fcmp d7,#0.0                               \n\t"
+	BEQ (DBETAZEROGENSTOREDS3)                         // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" mov x27, x23                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" ld1 {v0.d}[0],[x27],x14                    \n\t" // Load c40  into quad and increment by rs_c.
+	" ld1 {v0.d}[1],[x27],x14                    \n\t" // Load c41  into quad and increment by rs_c.
+	" ld1 {v1.d}[0],[x27],x14                    \n\t" // Load c42  into quad and increment by rs_c.
+	" ld1 {v1.d}[1],[x27],x14                    \n\t" // Load c43  into quad and increment by rs_c.
+	" ld1 {v2.d}[0],[x27],x14                    \n\t" // Load c44  into quad and increment by rs_c.
+	" ld1 {v2.d}[1],[x27],x14                    \n\t" // Load c45  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x27, x24                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" ld1 {v3.d}[0],[x27],x14                    \n\t" // Load c50  into quad and increment by rs_c.
+	" ld1 {v3.d}[1],[x27],x14                    \n\t" // Load c51  into quad and increment by rs_c.
+	" ld1 {v4.d}[0],[x27],x14                    \n\t" // Load c52  into quad and increment by rs_c.
+	" ld1 {v4.d}[1],[x27],x14                    \n\t" // Load c53  into quad and increment by rs_c.
+	" ld1 {v5.d}[0],[x27],x14                    \n\t" // Load c54  into quad and increment by rs_c.
+	" ld1 {v5.d}[1],[x27],x14                    \n\t" // Load c55  into quad and increment by rs_c.
+	"                                            \n\t"
+	" fmul v0.2d,v0.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v1.2d,v1.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v2.2d,v2.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v3.2d,v3.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
+	" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL (DBETAZEROGENSTOREDS3)
+	"                                            \n\t"
+	" fmla v0.2d,v20.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v1.2d,v21.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v2.2d,v22.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v3.2d,v23.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v4.2d,v24.2d,v6.d[0]                  \n\t" // Scale by alpha
+	" fmla v5.2d,v25.2d,v6.d[0]                  \n\t" // Scale by alpha
+	"                                            \n\t"
+	" mov x27, x23                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" st1 {v0.d}[0],[x27],x14                    \n\t" // Store c40  into quad and increment by rs_c.
+	" st1 {v0.d}[1],[x27],x14                    \n\t" // Store c41  into quad and increment by rs_c.
+	" st1 {v1.d}[0],[x27],x14                    \n\t" // Store c42  into quad and increment by rs_c.
+	" st1 {v1.d}[1],[x27],x14                    \n\t" // Store c43  into quad and increment by rs_c.
+	" st1 {v2.d}[0],[x27],x14                    \n\t" // Store c44  into quad and increment by rs_c.
+	" st1 {v2.d}[1],[x27],x14                    \n\t" // Store c45  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x27, x24                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" st1 {v3.d}[0],[x27],x14                    \n\t" // Store c50  into quad and increment by rs_c.
+	" st1 {v3.d}[1],[x27],x14                    \n\t" // Store c51  into quad and increment by rs_c.
+	" st1 {v4.d}[0],[x27],x14                    \n\t" // Store c52  into quad and increment by rs_c.
+	" st1 {v4.d}[1],[x27],x14                    \n\t" // Store c53  into quad and increment by rs_c.
+	" st1 {v5.d}[0],[x27],x14                    \n\t" // Store c54  into quad and increment by rs_c.
+	" st1 {v5.d}[1],[x27],x14                    \n\t" // Store c55  into quad and increment by rs_c.
+	"                                            \n\t"
+	" dup  v8.2d, xzr                            \n\t"
+	" dup  v9.2d, xzr                            \n\t"
+	" dup  v10.2d, xzr                           \n\t"
+	" dup  v11.2d, xzr                           \n\t"
+	" dup  v12.2d, xzr                           \n\t"
+	" dup  v13.2d, xzr                           \n\t"
+	"                                            \n\t"
+	" fcmp d7,#0.0                               \n\t"
+	BEQ (DBETAZEROGENSTOREDS4)                         // Taking care of the beta==0 case.
+	"                                            \n\t"
+	" mov x27, x25                               \n\t"
+	"                                            \n\t"
+	" ld1 {v8.d}[0], [x27],x14                   \n\t" // Load c60  into quad and increment by rs_c.
+	" ld1 {v8.d}[1], [x27],x14                   \n\t" // Load c61  into quad and increment by rs_c.
+	" ld1 {v9.d}[0], [x27],x14                   \n\t" // Load c62  into quad and increment by rs_c.
+	" ld1 {v9.d}[1], [x27],x14                   \n\t" // Load c63  into quad and increment by rs_c.
+	" ld1 {v10.d}[0],[x27],x14                   \n\t" // Load c64  into quad and increment by rs_c.
+	" ld1 {v10.d}[1],[x27],x14                   \n\t" // Load c65  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x27, x26                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" ld1 {v11.d}[0],[x27],x14                   \n\t" // Load c70  into quad and increment by rs_c.
+	" ld1 {v11.d}[1],[x27],x14                   \n\t" // Load c71  into quad and increment by rs_c.
+	" ld1 {v12.d}[0],[x27],x14                   \n\t" // Load c72  into quad and increment by rs_c.
+	" ld1 {v12.d}[1],[x27],x14                   \n\t" // Load c73  into quad and increment by rs_c.
+	" ld1 {v13.d}[0],[x27],x14                   \n\t" // Load c74  into quad and increment by rs_c.
+	" ld1 {v13.d}[1],[x27],x14                   \n\t" // Load c75  into quad and increment by rs_c.
+	"                                            \n\t"
+	" fmul v8.2d, v8.2d, v7.d[0]                 \n\t" // Scale by beta
+	" fmul v9.2d, v9.2d, v7.d[0]                 \n\t" // Scale by beta
+	" fmul v10.2d,v10.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v11.2d,v11.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
+	" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
+	"                                            \n\t"
+	LABEL (DBETAZEROGENSTOREDS4)
+	"                                            \n\t"
+	" prfm pldl2keep,[x0]                        \n\t"
+	" prfm pldl2keep,[x1]                        \n\t"
+	"                                            \n\t"
+	" fmla v8.2d, v26.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v9.2d, v27.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v10.2d,v28.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v11.2d,v29.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v12.2d,v30.2d,v6.d[0]                 \n\t" // Scale by alpha
+	" fmla v13.2d,v31.2d,v6.d[0]                 \n\t" // Scale by alpha
+	"                                            \n\t"
+	" mov x27, x25                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" st1 {v8.d}[0], [x27],x14                   \n\t" // Store c60  into quad and increment by rs_c.
+	" st1 {v8.d}[1], [x27],x14                   \n\t" // Store c61  into quad and increment by rs_c.
+	" st1 {v9.d}[0], [x27],x14                   \n\t" // Store c62  into quad and increment by rs_c.
+	" st1 {v9.d}[1], [x27],x14                   \n\t" // Store c63  into quad and increment by rs_c.
+	" st1 {v10.d}[0],[x27],x14                   \n\t" // Store c64  into quad and increment by rs_c.
+	" st1 {v10.d}[1],[x27],x14                   \n\t" // Store c65  into quad and increment by rs_c.
+	"                                            \n\t"
+	" mov x27, x26                               \n\t" // Load address of C.
+	"                                            \n\t"
+	" st1 {v11.d}[0],[x27],x14                   \n\t" // Store c70  into quad and increment by rs_c.
+	" st1 {v11.d}[1],[x27],x14                   \n\t" // Store c71  into quad and increment by rs_c.
+	" st1 {v12.d}[0],[x27],x14                   \n\t" // Store c72  into quad and increment by rs_c.
+	" st1 {v12.d}[1],[x27],x14                   \n\t" // Store c73  into quad and increment by rs_c.
+	" st1 {v13.d}[0],[x27],x14                   \n\t" // Store c74  into quad and increment by rs_c.
+	" st1 {v13.d}[1],[x27],x14                   \n\t" // Store c75  into quad and increment by rs_c.
+	"                                            \n\t"
+	LABEL (DEND)                                       // Done!
 	"                                            \n\t"
 	:// output operands (none)
 	:// input operands
-	 [aaddr]  "m" (a),      // 0
-	 [baddr]  "m" (b),      // 1
-	 [caddr]  "m" (c),      // 2
+	 [aaddr] "m" (a),       // 0
+	 [baddr] "m" (b),       // 1
+	 [caddr] "m" (c),       // 2
 	 [k_iter] "m" (k_iter), // 3
 	 [k_left] "m" (k_left), // 4
-	 [alpha]  "m" (alpha),  // 5
-	 [beta]   "m" (beta),   // 6
-	 [rs_c]   "m" (rs_c),   // 6
-	 [cs_c]   "m" (cs_c),   // 7
-	 [a_next] "m" (a_next), // 8
-	 [b_next] "m" (b_next)  // 9
-	:// Register clobber list
-	 "x0","x1","x2",
-	 "x5","x6","x10",
-	 "x16","x17","x20",
-	 "x21","x22","x23",
-	 "x24","x25","x26","x27",
-	 "v0","v1","v2",
-	 "v3","v4","v5",
-	 "v6","v7","v8",
-	 "v9","v10","v11",
-	 "v12","v13","v14",
-	 "v15","v16","v17","v18","v19",
-	 "v20","v21","v22","v23",
-	 "v24","v25","v26","v27",
-	 "v28","v29","v30","v31"
+	 [alpha] "m" (alpha),   // 5
+	 [beta] "m" (beta),     // 6
+	 [rs_c] "m" (rs_c),     // 7
+	 [cs_c] "m" (cs_c),     // 8
+	 [a_next] "m" (a_next), // 9
+	 [b_next] "m" (b_next)  // 10
+	 :// Register clobber list
+	 "x0", "x1", "x2",
+	 "x5", "x6", "x10", "x14",
+	 "x16", "x17", "x19", "x20",
+	 "x21", "x22", "x23", "x24",
+	 "x25", "x26", "x27",
+	 "v0", "v1", "v2", "v3",
+	 "v4", "v5", "v6", "v7",
+	 "v8", "v9", "v10", "v11",
+	 "v12", "v13", "v14", "v15",
+	 "v16", "v17", "v18", "v19",
+	 "v20", "v21", "v22", "v23",
+	 "v24", "v25", "v26", "v27",
+	 "v28", "v29", "v30", "v31"
 	);
 
 	GEMM_UKR_FLUSH_CT( d );
 }
 
-// June 2022, removed unused stubs for ancient 4x4 kernels

From 37ca4fd168525a71937d16aaf6a13c0de5b4daef Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Thu, 28 Sep 2023 16:37:57 -0500
Subject: [PATCH 170/230] Implemented [cz]symv_(), [cz]syr_(), [cz]rot_().
 (#778)

Details:
- Expanded existing BLAS compatibility APIs to provide interfaces to
  [cz]symv_(), [cz]syr_(). This was easy since those operations were
  already implemented natively in BLIS; the APIs were previously
  omitted only because they were not formally part of the BLAS.
- Implemented [cz]rot_() by feeding code from LAPACK 3.11 through
  f2c.
- Thanks to James Foster for pointing out that LAPACK contains these
  additional symbols, which prompted these additions, as well as for
  testing the [cz]rot_() functions from Julia's test infrastructure.
- CREDITS file update.
---
 CREDITS                       |   1 +
 frame/compat/bla_symv.c       |   6 +-
 frame/compat/bla_symv.h       |   6 +-
 frame/compat/bla_syr.c        |   6 +-
 frame/compat/bla_syr.h        |   6 +-
 frame/compat/f2c/bla_rot.c    | 480 ++++++++++++++++++++++++++++++++++
 frame/compat/f2c/bla_rot.h    |   2 +
 frame/compat/f2c/other/crot.c | 227 ++++++++++++++++
 frame/compat/f2c/other/crot.f | 159 +++++++++++
 frame/compat/f2c/other/zrot.c | 227 ++++++++++++++++
 frame/compat/f2c/other/zrot.f | 159 +++++++++++
 11 files changed, 1267 insertions(+), 12 deletions(-)
 create mode 100644 frame/compat/f2c/other/crot.c
 create mode 100644 frame/compat/f2c/other/crot.f
 create mode 100644 frame/compat/f2c/other/zrot.c
 create mode 100644 frame/compat/f2c/other/zrot.f

diff --git a/CREDITS b/CREDITS
index f72698f99..9d35931bc 100644
--- a/CREDITS
+++ b/CREDITS
@@ -36,6 +36,7 @@ but many others have contributed code, ideas, and feedback, including
   Victor Eijkhout          @VictorEijkhout            (Texas Advanced Computing Center)
   Evgeny Epifanovsky       @epifanovsky               (Q-Chem)
   Isuru Fernando           @isuruf
+  James Foster             @jd-foster                 (CSIRO)
   Roman Gareev             @gareevroman
   Richard Goldschmidt      @SuperFluffy
   Chris Goodyer
diff --git a/frame/compat/bla_symv.c b/frame/compat/bla_symv.c
index c5b5ebda3..8923acdc4 100644
--- a/frame/compat/bla_symv.c
+++ b/frame/compat/bla_symv.c
@@ -38,8 +38,8 @@
 //
 // Define BLAS-to-BLIS interfaces.
 //
-#undef  GENTFUNCRO
-#define GENTFUNCRO( ftype, ch, blasname, blisname ) \
+#undef  GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
 void PASTEF77(ch,blasname) \
      ( \
@@ -110,6 +110,6 @@ void PASTEF77(ch,blasname) \
 }
 
 #ifdef BLIS_ENABLE_BLAS
-INSERT_GENTFUNCRO_BLAS( symv, symv )
+INSERT_GENTFUNC_BLAS( symv, symv )
 #endif
 
diff --git a/frame/compat/bla_symv.h b/frame/compat/bla_symv.h
index 2f493a9d9..4f453a7a3 100644
--- a/frame/compat/bla_symv.h
+++ b/frame/compat/bla_symv.h
@@ -37,8 +37,8 @@
 //
 // Prototype BLAS-to-BLIS interfaces.
 //
-#undef  GENTPROTRO
-#define GENTPROTRO( ftype, ch, blasname ) \
+#undef  GENTPROT
+#define GENTPROT( ftype, ch, blasname ) \
 \
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
      ( \
@@ -52,7 +52,7 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
      );
 
 #ifdef BLIS_ENABLE_BLAS
-INSERT_GENTPROTRO_BLAS( symv )
+INSERT_GENTPROT_BLAS( symv )
 #endif
 
 #endif
diff --git a/frame/compat/bla_syr.c b/frame/compat/bla_syr.c
index 6732a75cf..91dc99b59 100644
--- a/frame/compat/bla_syr.c
+++ b/frame/compat/bla_syr.c
@@ -38,8 +38,8 @@
 //
 // Define BLAS-to-BLIS interfaces.
 //
-#undef  GENTFUNCRO
-#define GENTFUNCRO( ftype, ch, blasname, blisname ) \
+#undef  GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
 void PASTEF77(ch,blasname) \
      ( \
@@ -101,6 +101,6 @@ void PASTEF77(ch,blasname) \
 }
 
 #ifdef BLIS_ENABLE_BLAS
-INSERT_GENTFUNCRO_BLAS( syr, syr )
+INSERT_GENTFUNC_BLAS( syr, syr )
 #endif
 
diff --git a/frame/compat/bla_syr.h b/frame/compat/bla_syr.h
index 662d07328..7f3eeb367 100644
--- a/frame/compat/bla_syr.h
+++ b/frame/compat/bla_syr.h
@@ -37,8 +37,8 @@
 //
 // Prototype BLAS-to-BLIS interfaces.
 //
-#undef  GENTPROTRO
-#define GENTPROTRO( ftype, ch, blasname ) \
+#undef  GENTPROT
+#define GENTPROT( ftype, ch, blasname ) \
 \
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
      ( \
@@ -50,7 +50,7 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
      );
 
 #ifdef BLIS_ENABLE_BLAS
-INSERT_GENTPROTRO_BLAS( syr )
+INSERT_GENTPROT_BLAS( syr )
 #endif
 
 #endif
diff --git a/frame/compat/f2c/bla_rot.c b/frame/compat/f2c/bla_rot.c
index c79769bc0..0dbd720d2 100644
--- a/frame/compat/f2c/bla_rot.c
+++ b/frame/compat/f2c/bla_rot.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Field G. Van Zee
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -358,5 +359,484 @@
     return 0;
 } /* zdrot_ */
 
+
+/* crot.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+/* Subroutine: for n element pairs, in place: cx <- c*cx + s*cy, then cy <- c*cy - q*(old cx), where q comes from bla_r_cnjg(s) (presumably conj(s)); returns 0 */ int PASTEF77(c,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_scomplex *s)
+{
+    /* System generated locals (f2c scratch: i__1 is the loop bound, i__2..i__4 are index copies, q__1..q__4 are complex temporaries) */
+    bla_integer i__1, i__2, i__3, i__4;
+    bla_scomplex q__1, q__2, q__3, q__4;
+
+    /* Local variables */
+    bla_integer i__, ix, iy;
+    bla_scomplex stemp;
+
+
+    /* Parameter adjustments: step both pointers back one element so that index 1 addresses the first entry */
+    --cy;
+    --cx;
+
+    /* Function Body */
+    if (*n <= 0) { /* no elements: leave cx and cy untouched */
+	return 0;
+    }
+    if (*incx == 1 && *incy == 1) { /* both strides are 1: use the simpler loop at L20 */
+	goto L20;
+    }
+
+/*     Code for unequal increments or equal increments not equal to 1 */
+
+    ix = 1;
+    iy = 1;
+    if (*incx < 0) { /* negative stride: start at index (n-1)*|incx| + 1 so the loop walks down toward index 1 */
+	ix = (-(*n) + 1) * *incx + 1;
+    }
+    if (*incy < 0) { /* same starting-index adjustment for cy */
+	iy = (-(*n) + 1) * *incy + 1;
+    }
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	i__2 = ix;
+#if 0 /* Not compiled: .r/.i member-access form of the update below. NOTE(review): several statements here lack a terminating semicolon and this copy calls r_cnjg where the L20 copy calls bla_r_cnjg; repair before re-enabling. */
+	q__2.r = *c__ * cx[i__2].r;
+	q__2.i = *c__ * cx[i__2].i;
+	i__3 = iy;
+	q__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i;
+	q__3.i = s->r * cy[i__3].i + s->i * cy[i__3].r;
+	q__1.r = q__2.r + q__3.r
+	q__1.i = q__2.i + q__3.i;
+	stemp.r = q__1.r
+	stemp.i = q__1.i;
+	i__2 = iy;
+	i__3 = iy;
+	q__2.r = *c__ * cy[i__3].r
+	q__2.i = *c__ * cy[i__3].i;
+	r_cnjg(&q__4, s);
+	i__4 = ix;
+	q__3.r = q__4.r * cx[i__4].r - q__4.i * cx[i__4].i
+	q__3.i = q__4.r * cx[i__4].i + q__4.i * cx[i__4].r;
+	q__1.r = q__2.r - q__3.r;
+	q__1.i = q__2.i - q__3.i;
+	cy[i__2].r = q__1.r;
+	cy[i__2].i = q__1.i;
+	i__2 = ix;
+	cx[i__2].r = stemp.r;
+	cx[i__2].i = stemp.i;
+#else /* Compiled path: bli_csets( re, im, dst ) presumably stores (re, im) into dst, mirroring the .r/.i assignments above; confirm against the macro definition. */
+	bli_csets
+	(
+	  *c__ * bli_creal(cx[i__2]),
+	  *c__ * bli_cimag(cx[i__2]),
+	  q__2 /* c * cx[ix] */
+	);
+	i__3 = iy;
+	bli_csets
+	(
+	  bli_creal(*s) * bli_creal(cy[i__3]) - bli_cimag(*s) * bli_cimag(cy[i__3]),
+	  bli_creal(*s) * bli_cimag(cy[i__3]) + bli_cimag(*s) * bli_creal(cy[i__3]),
+	  q__3 /* s * cy[iy] (complex product) */
+	);
+	bli_csets
+	(
+	  bli_creal(q__2) + bli_creal(q__3),
+	  bli_cimag(q__2) + bli_cimag(q__3),
+	  q__1 /* c*cx[ix] + s*cy[iy] */
+	);
+	bli_csets
+	(
+	  bli_creal(q__1),
+	  bli_cimag(q__1),
+	  stemp /* new cx value, held back because cy is still computed from the old cx */
+	);
+	i__2 = iy;
+	i__3 = iy;
+	bli_csets
+	(
+	  *c__ * bli_creal(cy[i__3]),
+	  *c__ * bli_cimag(cy[i__3]),
+	  q__2 /* c * cy[iy] */
+	);
+	bla_r_cnjg(&q__4, s); /* q__4 presumably receives conj(s); helper is defined elsewhere */
+	i__4 = ix;
+	bli_csets
+	(
+	  bli_creal(q__4) * bli_creal(cx[i__4]) - bli_cimag(q__4) * bli_cimag(cx[i__4]),
+	  bli_creal(q__4) * bli_cimag(cx[i__4]) + bli_cimag(q__4) * bli_creal(cx[i__4]),
+	  q__3 /* q__4 * cx[ix], using the not-yet-overwritten cx */
+	);
+	bli_csets
+	(
+	  bli_creal(q__2) - bli_creal(q__3),
+	  bli_cimag(q__2) - bli_cimag(q__3),
+	  q__1 /* c*cy[iy] - q__4*cx[ix] */
+	);
+	bli_csets
+	(
+	  bli_creal(q__1),
+	  bli_cimag(q__1),
+	  cy[i__2] /* i__2 == iy here */
+	);
+	i__2 = ix;
+	bli_csets
+	(
+	  bli_creal(stemp),
+	  bli_cimag(stemp),
+	  cx[i__2] /* commit the saved cx value last */
+	);
+#endif
+	ix += *incx;
+	iy += *incy;
+/* L10: */
+    }
+    return 0;
+
+/*     Code for both increments equal to 1 */
+
+L20: /* same per-element update as the strided loop, but both vectors are indexed directly by i__ */
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	i__2 = i__;
+#if 0 /* Not compiled: .r/.i member-access form of the update below. */
+	q__2.r = *c__ * cx[i__2].r;
+	q__2.i = *c__ * cx[i__2].i;
+	i__3 = i__;
+	q__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i;
+	q__3.i = s->r * cy[i__3].i + s->i * cy[i__3].r;
+	q__1.r = q__2.r + q__3.r;
+	q__1.i = q__2.i + q__3.i;
+	stemp.r = q__1.r;
+	stemp.i = q__1.i;
+	i__2 = i__;
+	i__3 = i__;
+	q__2.r = *c__ * cy[i__3].r;
+	q__2.i = *c__ * cy[i__3].i;
+	bla_r_cnjg(&q__4, s);
+	i__4 = i__;
+	q__3.r = q__4.r * cx[i__4].r - q__4.i * cx[i__4].i;
+	q__3.i = q__4.r * cx[i__4].i + q__4.i * cx[i__4].r;
+	q__1.r = q__2.r - q__3.r;
+	q__1.i = q__2.i - q__3.i;
+	cy[i__2].r = q__1.r;
+	cy[i__2].i = q__1.i;
+	i__2 = i__;
+	cx[i__2].r = stemp.r;
+	cx[i__2].i = stemp.i;
+#else /* Compiled path, same sequence of temporaries as in the strided loop. */
+	bli_csets
+	(
+	  *c__ * bli_creal(cx[i__2]),
+	  *c__ * bli_cimag(cx[i__2]),
+	  q__2 /* c * cx[i] */
+	);
+	i__3 = i__;
+	bli_csets
+	(
+	  bli_creal(*s) * bli_creal(cy[i__3]) - bli_cimag(*s) * bli_cimag(cy[i__3]),
+	  bli_creal(*s) * bli_cimag(cy[i__3]) + bli_cimag(*s) * bli_creal(cy[i__3]),
+	  q__3 /* s * cy[i] (complex product) */
+	);
+	bli_csets
+	(
+	  bli_creal(q__2) + bli_creal(q__3),
+	  bli_cimag(q__2) + bli_cimag(q__3),
+	  q__1 /* c*cx[i] + s*cy[i] */
+	);
+	bli_csets
+	(
+	  bli_creal(q__1),
+	  bli_cimag(q__1),
+	  stemp /* new cx value, held back because cy is still computed from the old cx */
+	);
+	i__2 = i__;
+	i__3 = i__;
+	bli_csets
+	(
+	  *c__ * bli_creal(cy[i__3]),
+	  *c__ * bli_cimag(cy[i__3]),
+	  q__2 /* c * cy[i] */
+	);
+	bla_r_cnjg(&q__4, s); /* q__4 presumably receives conj(s); helper is defined elsewhere */
+	i__4 = i__;
+	bli_csets
+	(
+	  bli_creal(q__4) * bli_creal(cx[i__4]) - bli_cimag(q__4) * bli_cimag(cx[i__4]),
+	  bli_creal(q__4) * bli_cimag(cx[i__4]) + bli_cimag(q__4) * bli_creal(cx[i__4]),
+	  q__3 /* q__4 * cx[i], using the not-yet-overwritten cx */
+	);
+	bli_csets
+	(
+	  bli_creal(q__2) - bli_creal(q__3),
+	  bli_cimag(q__2) - bli_cimag(q__3),
+	  q__1 /* c*cy[i] - q__4*cx[i] */
+	);
+	bli_csets
+	(
+	  bli_creal(q__1),
+	  bli_cimag(q__1),
+	  cy[i__2]
+	);
+	i__2 = i__;
+	bli_csets
+	(
+	  bli_creal(stemp),
+	  bli_cimag(stemp),
+	  cx[i__2] /* commit the saved cx value last */
+	);
+#endif
+/* L30: */
+    }
+    return 0;
+} /* crot_ */
+
+
+/* zrot.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+/* Subroutine */ int PASTEF77(z,rot)(const bla_integer *n, bla_dcomplex *cx, const bla_integer *incx, bla_dcomplex *cy, const bla_integer *incy, const bla_double *c__, const bla_dcomplex *s)
+{
+    /* System generated locals */
+    bla_integer i__1, i__2, i__3, i__4;
+    bla_dcomplex z__1, z__2, z__3, z__4;
+
+    /* Local variables */
+    bla_integer i__, ix, iy;
+    bla_dcomplex stemp;
+    /* In place: cx := c*cx + s*cy; cy := c*cy - conj(s)*cx (old cx); c real, s complex. Returns 0. */
+    /* n: number of elements; incx/incy: element strides of cx/cy (negative walks backwards). */
+    /* Parameter adjustments: bias the pointers so the 1-based indices below are valid */
+    --cy;
+    --cx;
+
+    /* Function Body */
+    if (*n <= 0) {
+	return 0;  /* nothing to rotate */
+    }
+    if (*incx == 1 && *incy == 1) {
+	goto L20;  /* unit strides are handled by the second loop */
+    }
+
+/*     Code for unequal increments or equal increments not equal to 1 */
+
+    ix = 1;
+    iy = 1;
+    if (*incx < 0) {
+	ix = (-(*n) + 1) * *incx + 1;  /* negative stride: start at the far end */
+    }
+    if (*incy < 0) {
+	iy = (-(*n) + 1) * *incy + 1;  /* negative stride: start at the far end */
+    }
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	i__2 = ix;
+#if 0 /* struct-member form of the macro code in the #else branch; disabled, kept for reference */
+	z__2.r = *c__ * cx[i__2].r;
+	z__2.i = *c__ * cx[i__2].i;
+	i__3 = iy;
+	z__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i;
+	z__3.i = s->r * cy[i__3].i + s->i * cy[i__3].r;
+	z__1.r = z__2.r + z__3.r;
+	z__1.i = z__2.i + z__3.i;
+	stemp.r = z__1.r;
+	stemp.i = z__1.i;
+	i__2 = iy;
+	i__3 = iy;
+	z__2.r = *c__ * cy[i__3].r;
+	z__2.i = *c__ * cy[i__3].i;
+	bla_d_cnjg(&z__4, s);
+	i__4 = ix;
+	z__3.r = z__4.r * cx[i__4].r - z__4.i * cx[i__4].i;
+	z__3.i = z__4.r * cx[i__4].i + z__4.i * cx[i__4].r;
+	z__1.r = z__2.r - z__3.r;
+	z__1.i = z__2.i - z__3.i;
+	cy[i__2].r = z__1.r;
+	cy[i__2].i = z__1.i;
+	i__2 = ix;
+	cx[i__2].r = stemp.r;
+	cx[i__2].i = stemp.i;
+#else
+	bli_zsets  /* z__2 = c * cx[ix] */
+	(
+	  *c__ * bli_zreal(cx[i__2]),
+	  *c__ * bli_zimag(cx[i__2]),
+	  z__2
+	);
+	i__3 = iy;
+	bli_zsets  /* z__3 = s * cy[iy] */
+	(
+	  bli_zreal(*s) * bli_zreal(cy[i__3]) - bli_zimag(*s) * bli_zimag(cy[i__3]),
+	  bli_zreal(*s) * bli_zimag(cy[i__3]) + bli_zimag(*s) * bli_zreal(cy[i__3]),
+	  z__3
+	);
+	bli_zsets  /* z__1 = z__2 + z__3 */
+	(
+	  bli_zreal(z__2) + bli_zreal(z__3),
+	  bli_zimag(z__2) + bli_zimag(z__3),
+	  z__1
+	);
+	bli_zsets  /* stemp = new cx value; stored only after cy has used the old cx */
+	(
+	  bli_zreal(z__1),
+	  bli_zimag(z__1),
+	  stemp
+	);
+	i__2 = iy;
+	i__3 = iy;
+	bli_zsets  /* z__2 = c * cy[iy] */
+	(
+	  *c__ * bli_zreal(cy[i__3]),
+	  *c__ * bli_zimag(cy[i__3]),
+	  z__2
+	);
+	bla_d_cnjg(&z__4, s);  /* presumably z__4 = conj(*s); helper is defined elsewhere */
+	i__4 = ix;
+	bli_zsets  /* z__3 = z__4 * cx[ix] (cx still holds its old value) */
+	(
+	  bli_zreal(z__4) * bli_zreal(cx[i__4]) - bli_zimag(z__4) * bli_zimag(cx[i__4]),
+	  bli_zreal(z__4) * bli_zimag(cx[i__4]) + bli_zimag(z__4) * bli_zreal(cx[i__4]),
+	  z__3
+	);
+	bli_zsets  /* z__1 = z__2 - z__3 */
+	(
+	  bli_zreal(z__2) - bli_zreal(z__3),
+	  bli_zimag(z__2) - bli_zimag(z__3),
+	  z__1
+	);
+	bli_zsets  /* cy[iy] = z__1 */
+	(
+	  bli_zreal(z__1),
+	  bli_zimag(z__1),
+	  cy[i__2]
+	);
+	i__2 = ix;
+	bli_zsets  /* cx[ix] = stemp */
+	(
+	  bli_zreal(stemp),
+	  bli_zimag(stemp),
+	  cx[i__2]
+	);
+#endif
+	ix += *incx;
+	iy += *incy;
+/* L10: */
+    }
+    return 0;
+
+/*     Code for both increments equal to 1 */
+
+L20:
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	i__2 = i__;
+#if 0 /* struct-member form of the macro code in the #else branch; disabled, kept for reference */
+	z__2.r = *c__ * cx[i__2].r;
+	z__2.i = *c__ * cx[i__2].i;
+	i__3 = i__;
+	z__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i;
+	z__3.i = s->r * cy[i__3].i + s->i * cy[i__3].r;
+	z__1.r = z__2.r + z__3.r;
+	z__1.i = z__2.i + z__3.i;
+	stemp.r = z__1.r;
+	stemp.i = z__1.i;
+	i__2 = i__;
+	i__3 = i__;
+	z__2.r = *c__ * cy[i__3].r;
+	z__2.i = *c__ * cy[i__3].i;
+	bla_d_cnjg(&z__4, s);
+	i__4 = i__;
+	z__3.r = z__4.r * cx[i__4].r - z__4.i * cx[i__4].i;
+	z__3.i = z__4.r * cx[i__4].i + z__4.i * cx[i__4].r;
+	z__1.r = z__2.r - z__3.r;
+	z__1.i = z__2.i - z__3.i;
+	cy[i__2].r = z__1.r;
+	cy[i__2].i = z__1.i;
+	i__2 = i__;
+	cx[i__2].r = stemp.r;
+	cx[i__2].i = stemp.i;
+#else
+	bli_zsets  /* z__2 = c * cx[i] */
+	(
+	  *c__ * bli_zreal(cx[i__2]),
+	  *c__ * bli_zimag(cx[i__2]),
+	  z__2
+	);
+	i__3 = i__;
+	bli_zsets  /* z__3 = s * cy[i] */
+	(
+	  bli_zreal(*s) * bli_zreal(cy[i__3]) - bli_zimag(*s) * bli_zimag(cy[i__3]),
+	  bli_zreal(*s) * bli_zimag(cy[i__3]) + bli_zimag(*s) * bli_zreal(cy[i__3]),
+	  z__3
+	);
+	bli_zsets  /* z__1 = z__2 + z__3 */
+	(
+	  bli_zreal(z__2) + bli_zreal(z__3),
+	  bli_zimag(z__2) + bli_zimag(z__3),
+	  z__1
+	);
+	bli_zsets  /* stemp = new cx value; stored only after cy has used the old cx */
+	(
+	  bli_zreal(z__1),
+	  bli_zimag(z__1),
+	  stemp
+	);
+	i__2 = i__;
+	i__3 = i__;
+	bli_zsets  /* z__2 = c * cy[i] */
+	(
+	  *c__ * bli_zreal(cy[i__3]),
+	  *c__ * bli_zimag(cy[i__3]),
+	  z__2
+	);
+	bla_d_cnjg(&z__4, s);  /* presumably z__4 = conj(*s); helper is defined elsewhere */
+	i__4 = i__;
+	bli_zsets  /* z__3 = z__4 * cx[i] (cx still holds its old value) */
+	(
+	  bli_zreal(z__4) * bli_zreal(cx[i__4]) - bli_zimag(z__4) * bli_zimag(cx[i__4]),
+	  bli_zreal(z__4) * bli_zimag(cx[i__4]) + bli_zimag(z__4) * bli_zreal(cx[i__4]),
+	  z__3
+	);
+	bli_zsets  /* z__1 = z__2 - z__3 */
+	(
+	  bli_zreal(z__2) - bli_zreal(z__3),
+	  bli_zimag(z__2) - bli_zimag(z__3),
+	  z__1
+	);
+	bli_zsets  /* cy[i] = z__1 */
+	(
+	  bli_zreal(z__1),
+	  bli_zimag(z__1),
+	  cy[i__2]
+	);
+	i__2 = i__;
+	bli_zsets  /* cx[i] = stemp */
+	(
+	  bli_zreal(stemp),
+	  bli_zimag(stemp),
+	  cx[i__2]
+	);
+#endif
+/* L30: */
+    }
+    return 0;
+} /* zrot_ */
+
+
 #endif
 
diff --git a/frame/compat/f2c/bla_rot.h b/frame/compat/f2c/bla_rot.h
index ca4a4f9ac..4e6aead4a 100644
--- a/frame/compat/f2c/bla_rot.h
+++ b/frame/compat/f2c/bla_rot.h
@@ -38,5 +38,7 @@ BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const b
 BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s);
 BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s);
 BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s);
+BLIS_EXPORT_BLAS int PASTEF77(c,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_scomplex *s);
+BLIS_EXPORT_BLAS int PASTEF77(z,rot)(const bla_integer *n, bla_dcomplex *cx, const bla_integer *incx, bla_dcomplex *cy, const bla_integer *incy, const bla_double *c__, const bla_dcomplex *s);
 
 #endif
diff --git a/frame/compat/f2c/other/crot.c b/frame/compat/f2c/other/crot.c
new file mode 100644
index 000000000..e3e1282f4
--- /dev/null
+++ b/frame/compat/f2c/other/crot.c
@@ -0,0 +1,227 @@
+/* crot.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* > \brief \b CROT applies a plane rotation with real cosine and complex sine to a pair of complex vectors. 
+*/
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/* > \htmlonly */
+/* > Download CROT + dependencies */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/crot.f"
+> */
+/* > [TGZ]</a> */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/crot.f"
+> */
+/* > [ZIP]</a> */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/crot.f"
+> */
+/* > [TXT]</a> */
+/* > \endhtmlonly */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE CROT( N, CX, INCX, CY, INCY, C, S ) */
+
+/*       .. Scalar Arguments .. */
+/*       INTEGER            INCX, INCY, N */
+/*       REAL               C */
+/*       COMPLEX            S */
+/*       .. */
+/*       .. Array Arguments .. */
+/*       COMPLEX            CX( * ), CY( * ) */
+/*       .. */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > CROT   applies a plane rotation, where the cos (C) is real and the */
+/* > sin (S) is complex, and the vectors CX and CY are complex. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of elements in the vectors CX and CY. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] CX */
+/* > \verbatim */
+/* >          CX is COMPLEX array, dimension (N) */
+/* >          On input, the vector X. */
+/* >          On output, CX is overwritten with C*X + S*Y. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] INCX */
+/* > \verbatim */
+/* >          INCX is INTEGER */
+/* >          The increment between successive values of CX.  INCX <> 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] CY */
+/* > \verbatim */
+/* >          CY is COMPLEX array, dimension (N) */
+/* >          On input, the vector Y. */
+/* >          On output, CY is overwritten with -CONJG(S)*X + C*Y. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] INCY */
+/* > \verbatim */
+/* >          INCY is INTEGER */
+/* >          The increment between successive values of CY.  INCY <> 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] C */
+/* > \verbatim */
+/* >          C is REAL */
+/* > \endverbatim */
+/* > */
+/* > \param[in] S */
+/* > \verbatim */
+/* >          S is COMPLEX */
+/* >          C and S define a rotation */
+/* >             [  C          S  ] */
+/* >             [ -conjg(S)   C  ] */
+/* >          where C*C + S*CONJG(S) = 1.0. */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complexOTHERauxiliary */
+
+/*  ===================================================================== */
+/* Subroutine */ int crot_(integer *n, complex *cx, integer *incx, complex *
+	cy, integer *incy, real *c__, complex *s)
+{
+    /* System generated locals */
+    integer i__1, i__2, i__3, i__4;
+    complex q__1, q__2, q__3, q__4;
+
+    /* Builtin functions (libf2c); r_cnjg stands in for the Fortran CONJG( S ) */
+    void r_cnjg(complex *, complex *);
+
+    /* Local variables */
+    integer i__, ix, iy;
+    complex stemp;
+
+/*  In place: CX := C*CX + S*CY and CY := C*CY - CONJG(S)*CX (old CX); always returns 0. */
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/* ===================================================================== */
+
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+    /* Parameter adjustments: bias the pointers so the 1-based indices below are valid */
+    --cy;
+    --cx;
+
+    /* Function Body: return at once when there are no elements to rotate */
+    if (*n <= 0) {
+	return 0;
+    }
+    if (*incx == 1 && *incy == 1) {
+	goto L20;
+    }
+
+/*     Code for unequal increments or equal increments not equal to 1 */
+/*     (a negative increment starts at the far end and steps backwards) */
+    ix = 1;
+    iy = 1;
+    if (*incx < 0) {
+	ix = (-(*n) + 1) * *incx + 1;
+    }
+    if (*incy < 0) {
+	iy = (-(*n) + 1) * *incy + 1;
+    }
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	i__2 = ix;
+	q__2.r = *c__ * cx[i__2].r, q__2.i = *c__ * cx[i__2].i;
+	i__3 = iy;
+	q__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i, q__3.i = s->r * cy[
+		i__3].i + s->i * cy[i__3].r;
+	q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+	stemp.r = q__1.r, stemp.i = q__1.i;  /* STEMP = C*CX + S*CY */
+	i__2 = iy;
+	i__3 = iy;
+	q__2.r = *c__ * cy[i__3].r, q__2.i = *c__ * cy[i__3].i;
+	r_cnjg(&q__4, s);  /* presumably q__4 = conjugate of *s (libf2c helper) */
+	i__4 = ix;
+	q__3.r = q__4.r * cx[i__4].r - q__4.i * cx[i__4].i, q__3.i = q__4.r * 
+		cx[i__4].i + q__4.i * cx[i__4].r;
+	q__1.r = q__2.r - q__3.r, q__1.i = q__2.i - q__3.i;
+	cy[i__2].r = q__1.r, cy[i__2].i = q__1.i;  /* CY = C*CY - q__4*CX (old CX) */
+	i__2 = ix;
+	cx[i__2].r = stemp.r, cx[i__2].i = stemp.i;  /* CX = STEMP */
+	ix += *incx;
+	iy += *incy;
+/* L10: */
+    }
+    return 0;
+
+/*     Code for both increments equal to 1 */
+
+L20:
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	i__2 = i__;
+	q__2.r = *c__ * cx[i__2].r, q__2.i = *c__ * cx[i__2].i;
+	i__3 = i__;
+	q__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i, q__3.i = s->r * cy[
+		i__3].i + s->i * cy[i__3].r;
+	q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+	stemp.r = q__1.r, stemp.i = q__1.i;  /* STEMP = C*CX + S*CY */
+	i__2 = i__;
+	i__3 = i__;
+	q__2.r = *c__ * cy[i__3].r, q__2.i = *c__ * cy[i__3].i;
+	r_cnjg(&q__4, s);  /* presumably q__4 = conjugate of *s (libf2c helper) */
+	i__4 = i__;
+	q__3.r = q__4.r * cx[i__4].r - q__4.i * cx[i__4].i, q__3.i = q__4.r * 
+		cx[i__4].i + q__4.i * cx[i__4].r;
+	q__1.r = q__2.r - q__3.r, q__1.i = q__2.i - q__3.i;
+	cy[i__2].r = q__1.r, cy[i__2].i = q__1.i;  /* CY = C*CY - q__4*CX (old CX) */
+	i__2 = i__;
+	cx[i__2].r = stemp.r, cx[i__2].i = stemp.i;  /* CX = STEMP */
+/* L30: */
+    }
+    return 0;
+} /* crot_ */
+
diff --git a/frame/compat/f2c/other/crot.f b/frame/compat/f2c/other/crot.f
new file mode 100644
index 000000000..6dc771506
--- /dev/null
+++ b/frame/compat/f2c/other/crot.f
@@ -0,0 +1,159 @@
+*> \brief \b CROT applies a plane rotation with real cosine and complex sine to a pair of complex vectors.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CROT + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/crot.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/crot.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/crot.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CROT( N, CX, INCX, CY, INCY, C, S )
+*
+*       .. Scalar Arguments ..
+*       INTEGER            INCX, INCY, N
+*       REAL               C
+*       COMPLEX            S
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            CX( * ), CY( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CROT   applies a plane rotation, where the cos (C) is real and the
+*> sin (S) is complex, and the vectors CX and CY are complex.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of elements in the vectors CX and CY.
+*> \endverbatim
+*>
+*> \param[in,out] CX
+*> \verbatim
+*>          CX is COMPLEX array, dimension (N)
+*>          On input, the vector X.
+*>          On output, CX is overwritten with C*X + S*Y.
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*>          INCX is INTEGER
+*>          The increment between successive values of CX.  INCX <> 0.
+*> \endverbatim
+*>
+*> \param[in,out] CY
+*> \verbatim
+*>          CY is COMPLEX array, dimension (N)
+*>          On input, the vector Y.
+*>          On output, CY is overwritten with -CONJG(S)*X + C*Y.
+*> \endverbatim
+*>
+*> \param[in] INCY
+*> \verbatim
+*>          INCY is INTEGER
+*>          The increment between successive values of CY.  INCY <> 0.
+*> \endverbatim
+*>
+*> \param[in] C
+*> \verbatim
+*>          C is REAL
+*> \endverbatim
+*>
+*> \param[in] S
+*> \verbatim
+*>          S is COMPLEX
+*>          C and S define a rotation
+*>             [  C          S  ]
+*>             [ -conjg(S)   C  ]
+*>          where C*C + S*CONJG(S) = 1.0.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complexOTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE CROT( N, CX, INCX, CY, INCY, C, S )
+*     In place: CX := C*CX + S*CY and CY := C*CY - CONJG(S)*CX (old CX).
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INCX, INCY, N
+      REAL               C
+      COMPLEX            S
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            CX( * ), CY( * )
+*     ..
+*
+* =====================================================================
+*
+*     .. Local Scalars ..
+      INTEGER            I, IX, IY
+      COMPLEX            STEMP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG
+*     ..
+*     .. Executable Statements ..
+*     Quick return when there are no elements to rotate.
+      IF( N.LE.0 )
+     $   RETURN
+      IF( INCX.EQ.1 .AND. INCY.EQ.1 )
+     $   GO TO 20
+*
+*     Code for unequal increments or equal increments not equal to 1
+*     (a negative increment starts at the far end and steps backwards)
+      IX = 1
+      IY = 1
+      IF( INCX.LT.0 )
+     $   IX = ( -N+1 )*INCX + 1
+      IF( INCY.LT.0 )
+     $   IY = ( -N+1 )*INCY + 1
+      DO 10 I = 1, N
+         STEMP = C*CX( IX ) + S*CY( IY )
+         CY( IY ) = C*CY( IY ) - CONJG( S )*CX( IX )
+         CX( IX ) = STEMP
+         IX = IX + INCX
+         IY = IY + INCY
+   10 CONTINUE
+      RETURN
+*
+*     Code for both increments equal to 1
+*     (STEMP holds the new CX( I ) until CY( I ) has used the old one)
+   20 CONTINUE
+      DO 30 I = 1, N
+         STEMP = C*CX( I ) + S*CY( I )
+         CY( I ) = C*CY( I ) - CONJG( S )*CX( I )
+         CX( I ) = STEMP
+   30 CONTINUE
+      RETURN
+      END
diff --git a/frame/compat/f2c/other/zrot.c b/frame/compat/f2c/other/zrot.c
new file mode 100644
index 000000000..0706f8b25
--- /dev/null
+++ b/frame/compat/f2c/other/zrot.c
@@ -0,0 +1,227 @@
+/* zrot.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* > \brief \b ZROT applies a plane rotation with real cosine and complex sine to a pair of complex vectors. 
+*/
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/* > \htmlonly */
+/* > Download ZROT + dependencies */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zrot.f"
+> */
+/* > [TGZ]</a> */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zrot.f"
+> */
+/* > [ZIP]</a> */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zrot.f"
+> */
+/* > [TXT]</a> */
+/* > \endhtmlonly */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE ZROT( N, CX, INCX, CY, INCY, C, S ) */
+
+/*       .. Scalar Arguments .. */
+/*       INTEGER            INCX, INCY, N */
+/*       DOUBLE PRECISION   C */
+/*       COMPLEX*16         S */
+/*       .. */
+/*       .. Array Arguments .. */
+/*       COMPLEX*16         CX( * ), CY( * ) */
+/*       .. */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > ZROT   applies a plane rotation, where the cos (C) is real and the */
+/* > sin (S) is complex, and the vectors CX and CY are complex. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of elements in the vectors CX and CY. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] CX */
+/* > \verbatim */
+/* >          CX is COMPLEX*16 array, dimension (N) */
+/* >          On input, the vector X. */
+/* >          On output, CX is overwritten with C*X + S*Y. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] INCX */
+/* > \verbatim */
+/* >          INCX is INTEGER */
+/* >          The increment between successive values of CX.  INCX <> 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] CY */
+/* > \verbatim */
+/* >          CY is COMPLEX*16 array, dimension (N) */
+/* >          On input, the vector Y. */
+/* >          On output, CY is overwritten with -CONJG(S)*X + C*Y. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] INCY */
+/* > \verbatim */
+/* >          INCY is INTEGER */
+/* >          The increment between successive values of CY.  INCY <> 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] C */
+/* > \verbatim */
+/* >          C is DOUBLE PRECISION */
+/* > \endverbatim */
+/* > */
+/* > \param[in] S */
+/* > \verbatim */
+/* >          S is COMPLEX*16 */
+/* >          C and S define a rotation */
+/* >             [  C          S  ] */
+/* >             [ -conjg(S)   C  ] */
+/* >          where C*C + S*CONJG(S) = 1.0. */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complex16OTHERauxiliary */
+
+/*  ===================================================================== */
+/* Subroutine */ int zrot_(integer *n, doublecomplex *cx, integer *incx, 
+	doublecomplex *cy, integer *incy, doublereal *c__, doublecomplex *s)
+{
+    /* System generated locals */
+    integer i__1, i__2, i__3, i__4;
+    doublecomplex z__1, z__2, z__3, z__4;
+
+    /* Builtin functions (libf2c); d_cnjg stands in for the Fortran DCONJG( S ) */
+    void d_cnjg(doublecomplex *, doublecomplex *);
+
+    /* Local variables */
+    integer i__, ix, iy;
+    doublecomplex stemp;
+
+/*  In place: CX := C*CX + S*CY and CY := C*CY - DCONJG(S)*CX (old CX); always returns 0. */
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/* ===================================================================== */
+
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+    /* Parameter adjustments: bias the pointers so the 1-based indices below are valid */
+    --cy;
+    --cx;
+
+    /* Function Body: return at once when there are no elements to rotate */
+    if (*n <= 0) {
+	return 0;
+    }
+    if (*incx == 1 && *incy == 1) {
+	goto L20;
+    }
+
+/*     Code for unequal increments or equal increments not equal to 1 */
+/*     (a negative increment starts at the far end and steps backwards) */
+    ix = 1;
+    iy = 1;
+    if (*incx < 0) {
+	ix = (-(*n) + 1) * *incx + 1;
+    }
+    if (*incy < 0) {
+	iy = (-(*n) + 1) * *incy + 1;
+    }
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	i__2 = ix;
+	z__2.r = *c__ * cx[i__2].r, z__2.i = *c__ * cx[i__2].i;
+	i__3 = iy;
+	z__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i, z__3.i = s->r * cy[
+		i__3].i + s->i * cy[i__3].r;
+	z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+	stemp.r = z__1.r, stemp.i = z__1.i;  /* STEMP = C*CX + S*CY */
+	i__2 = iy;
+	i__3 = iy;
+	z__2.r = *c__ * cy[i__3].r, z__2.i = *c__ * cy[i__3].i;
+	d_cnjg(&z__4, s);  /* presumably z__4 = conjugate of *s (libf2c helper) */
+	i__4 = ix;
+	z__3.r = z__4.r * cx[i__4].r - z__4.i * cx[i__4].i, z__3.i = z__4.r * 
+		cx[i__4].i + z__4.i * cx[i__4].r;
+	z__1.r = z__2.r - z__3.r, z__1.i = z__2.i - z__3.i;
+	cy[i__2].r = z__1.r, cy[i__2].i = z__1.i;  /* CY = C*CY - z__4*CX (old CX) */
+	i__2 = ix;
+	cx[i__2].r = stemp.r, cx[i__2].i = stemp.i;  /* CX = STEMP */
+	ix += *incx;
+	iy += *incy;
+/* L10: */
+    }
+    return 0;
+
+/*     Code for both increments equal to 1 */
+
+L20:
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	i__2 = i__;
+	z__2.r = *c__ * cx[i__2].r, z__2.i = *c__ * cx[i__2].i;
+	i__3 = i__;
+	z__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i, z__3.i = s->r * cy[
+		i__3].i + s->i * cy[i__3].r;
+	z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+	stemp.r = z__1.r, stemp.i = z__1.i;  /* STEMP = C*CX + S*CY */
+	i__2 = i__;
+	i__3 = i__;
+	z__2.r = *c__ * cy[i__3].r, z__2.i = *c__ * cy[i__3].i;
+	d_cnjg(&z__4, s);  /* presumably z__4 = conjugate of *s (libf2c helper) */
+	i__4 = i__;
+	z__3.r = z__4.r * cx[i__4].r - z__4.i * cx[i__4].i, z__3.i = z__4.r * 
+		cx[i__4].i + z__4.i * cx[i__4].r;
+	z__1.r = z__2.r - z__3.r, z__1.i = z__2.i - z__3.i;
+	cy[i__2].r = z__1.r, cy[i__2].i = z__1.i;  /* CY = C*CY - z__4*CX (old CX) */
+	i__2 = i__;
+	cx[i__2].r = stemp.r, cx[i__2].i = stemp.i;  /* CX = STEMP */
+/* L30: */
+    }
+    return 0;
+} /* zrot_ */
+
diff --git a/frame/compat/f2c/other/zrot.f b/frame/compat/f2c/other/zrot.f
new file mode 100644
index 000000000..28fc8ec1d
--- /dev/null
+++ b/frame/compat/f2c/other/zrot.f
@@ -0,0 +1,159 @@
+*> \brief \b ZROT applies a plane rotation with real cosine and complex sine to a pair of complex vectors.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZROT + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zrot.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zrot.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zrot.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZROT( N, CX, INCX, CY, INCY, C, S )
+*
+*       .. Scalar Arguments ..
+*       INTEGER            INCX, INCY, N
+*       DOUBLE PRECISION   C
+*       COMPLEX*16         S
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16         CX( * ), CY( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZROT   applies a plane rotation, where the cos (C) is real and the
+*> sin (S) is complex, and the vectors CX and CY are complex.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of elements in the vectors CX and CY.
+*> \endverbatim
+*>
+*> \param[in,out] CX
+*> \verbatim
+*>          CX is COMPLEX*16 array, dimension (N)
+*>          On input, the vector X.
+*>          On output, CX is overwritten with C*X + S*Y.
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*>          INCX is INTEGER
+*>          The increment between successive values of CX.  INCX <> 0.
+*> \endverbatim
+*>
+*> \param[in,out] CY
+*> \verbatim
+*>          CY is COMPLEX*16 array, dimension (N)
+*>          On input, the vector Y.
+*>          On output, CY is overwritten with -CONJG(S)*X + C*Y.
+*> \endverbatim
+*>
+*> \param[in] INCY
+*> \verbatim
+*>          INCY is INTEGER
+*>          The increment between successive values of CY.  INCY <> 0.
+*> \endverbatim
+*>
+*> \param[in] C
+*> \verbatim
+*>          C is DOUBLE PRECISION
+*> \endverbatim
+*>
+*> \param[in] S
+*> \verbatim
+*>          S is COMPLEX*16
+*>          C and S define a rotation
+*>             [  C          S  ]
+*>             [ -conjg(S)   C  ]
+*>          where C*C + S*CONJG(S) = 1.0.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complex16OTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE ZROT( N, CX, INCX, CY, INCY, C, S )
+*     In place: CX := C*CX + S*CY and CY := C*CY - DCONJG(S)*CX (old CX).
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INCX, INCY, N
+      DOUBLE PRECISION   C
+      COMPLEX*16         S
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         CX( * ), CY( * )
+*     ..
+*
+* =====================================================================
+*
+*     .. Local Scalars ..
+      INTEGER            I, IX, IY
+      COMPLEX*16         STEMP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG
+*     ..
+*     .. Executable Statements ..
+*     Quick return when there are no elements to rotate.
+      IF( N.LE.0 )
+     $   RETURN
+      IF( INCX.EQ.1 .AND. INCY.EQ.1 )
+     $   GO TO 20
+*
+*     Code for unequal increments or equal increments not equal to 1
+*     (a negative increment starts at the far end and steps backwards)
+      IX = 1
+      IY = 1
+      IF( INCX.LT.0 )
+     $   IX = ( -N+1 )*INCX + 1
+      IF( INCY.LT.0 )
+     $   IY = ( -N+1 )*INCY + 1
+      DO 10 I = 1, N
+         STEMP = C*CX( IX ) + S*CY( IY )
+         CY( IY ) = C*CY( IY ) - DCONJG( S )*CX( IX )
+         CX( IX ) = STEMP
+         IX = IX + INCX
+         IY = IY + INCY
+   10 CONTINUE
+      RETURN
+*
+*     Code for both increments equal to 1
+*     (STEMP holds the new CX( I ) until CY( I ) has used the old one)
+   20 CONTINUE
+      DO 30 I = 1, N
+         STEMP = C*CX( I ) + S*CY( I )
+         CY( I ) = C*CY( I ) - DCONJG( S )*CX( I )
+         CX( I ) = STEMP
+   30 CONTINUE
+      RETURN
+      END

From c2099ed2519dcac8ee421faf999b36e1c2260be7 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Mon, 2 Oct 2023 14:56:48 -0500
Subject: [PATCH 171/230] Fixed brokenness when sba is disabled. (#777)

Details:
- Previously, disabling the sba via --disable-sba-pools resulted in a
  segfault due to a sanity-check-triggering abort(). The problem was
  that the sba, as currently used in the l3 thread decorators, did not
  yet (fully) support pools being disabled. The solution entailed
  creating a wrapper function, bli_sba_array_elem(), which either calls
  bli_apool_array_elem() (when sba pools are enabled at configure time)
  or returns a NULL sba_pool pointer (when sba pools are disabled), and
  calling bli_sba_array_elem() in place of bli_apool_array_elem(). Note
  that the NULL pointer returned by bli_sba_array_elem() when the sba
  pools are disabled does no harm since in that situation the pointer
  goes unreferenced when acquiring and releasing small blocks. Thanks to
  John Mather for reporting this bug.
- Guarded the bodies of bli_sba_init() and bli_sba_finalize() with
  #ifdef BLIS_ENABLE_SBA_POOLS. I don't think this was actually necessary
  to fix the aforementioned bug, but it seems like good practice.
- Moved the code in bli_l3_thrinfo_create() that checked that the array*
  pointer is non-NULL before calling bli_sba_array_elem() (previously
  bli_apool_array_elem()) into the definition of bli_sba_array_elem().
- Renamed various instances of 'pool' variables and function parameters
  to 'sba_pool' to emphasize what kind of pool it represents.
- Whitespace changes.
---
 frame/1m/packm/bli_packm_cntl.c |  6 +++---
 frame/1m/packm/bli_packm_cntl.h |  2 +-
 frame/3/bli_l3_decor.c          |  2 +-
 frame/3/bli_l3_sup_decor.c      |  2 +-
 frame/3/bli_l3_thrinfo.c        | 20 ++++++++---------
 frame/base/bli_apool.h          |  2 +-
 frame/base/bli_cntl.c           | 28 ++++++++++++------------
 frame/base/bli_sba.c            | 38 ++++++++++++++++++++++++---------
 frame/base/bli_sba.h            | 24 +++++++++++++--------
 9 files changed, 73 insertions(+), 51 deletions(-)

diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c
index 7f7401045..f38710ea8 100644
--- a/frame/1m/packm/bli_packm_cntl.c
+++ b/frame/1m/packm/bli_packm_cntl.c
@@ -37,7 +37,7 @@
 
 BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
      (
-       pool_t*   pool,
+       pool_t*   sba_pool,
        void_fp   var_func,
        bszid_t   bmid_m,
        bszid_t   bmid_n,
@@ -57,7 +57,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
 	#endif
 
 	// Allocate a packm_params_t struct.
-	params = bli_sba_acquire( pool, sizeof( packm_params_t ) );
+	params = bli_sba_acquire( sba_pool, sizeof( packm_params_t ) );
 
 	// Initialize the packm_params_t struct.
 	params->size              = sizeof( packm_params_t );
@@ -79,7 +79,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
 	// sync with the cntl_t tree.
 	cntl = bli_cntl_create_node
 	(
-	  pool,
+	  sba_pool,
 	  BLIS_NOID,
 	  BLIS_NO_PART,
 	  var_func,
diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h
index 8a43f711d..a94a465b2 100644
--- a/frame/1m/packm/bli_packm_cntl.h
+++ b/frame/1m/packm/bli_packm_cntl.h
@@ -85,7 +85,7 @@ BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( const cntl_t* cntl )
 
 cntl_t* bli_packm_cntl_create_node
      (
-       pool_t*   pool,
+       pool_t*   sba_pool,
        void_fp   var_func,
        bszid_t   bmid_m,
        bszid_t   bmid_n,
diff --git a/frame/3/bli_l3_decor.c b/frame/3/bli_l3_decor.c
index 88ec5def9..dc1d3bb1b 100644
--- a/frame/3/bli_l3_decor.c
+++ b/frame/3/bli_l3_decor.c
@@ -89,7 +89,7 @@ static void bli_l3_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, const
 
 	// Create a default control tree for the operation, if needed.
 	cntl_t* cntl_use;
-	pool_t* sba_pool = bli_apool_array_elem( tid, array );
+	pool_t* sba_pool = bli_sba_array_elem( tid, array );
 	bli_l3_cntl_create_if( family, schema_a, schema_b,
 	                       &a_t, &b_t, &c_t, sba_pool, NULL, &cntl_use );
 
diff --git a/frame/3/bli_l3_sup_decor.c b/frame/3/bli_l3_sup_decor.c
index 7cda8bdca..d420559b5 100644
--- a/frame/3/bli_l3_sup_decor.c
+++ b/frame/3/bli_l3_sup_decor.c
@@ -69,7 +69,7 @@ static void bli_l3_sup_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, co
 	bli_l3_thread_decorator_thread_check( gl_comm, rntm );
 
 	// Create the root node of the thread's thrinfo_t structure.
-	pool_t*    pool   = bli_apool_array_elem( tid, array );
+	pool_t*    pool   = bli_sba_array_elem( tid, array );
 	thrinfo_t* thread = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, rntm );
 
 	func
diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c
index 95d2a5439..5f3d39d39 100644
--- a/frame/3/bli_l3_thrinfo.c
+++ b/frame/3/bli_l3_thrinfo.c
@@ -44,16 +44,14 @@ thrinfo_t* bli_l3_thrinfo_create
        const cntl_t*     cntl
      )
 {
-	pool_t* pool = NULL;
-	if ( array != NULL )
-		pool = bli_apool_array_elem( id, array );
+	pool_t* sba_pool = bli_sba_array_elem( id, array );
 
 	// Create the root thrinfo_t node.
 	thrinfo_t* root = bli_thrinfo_create_root
 	(
 	  gl_comm,
 	  id,
-	  pool,
+	  sba_pool,
 	  bli_pba_query()
 	);
 
@@ -123,7 +121,7 @@ thrinfo_t* bli_l3_sup_thrinfo_create
      (
              dim_t      id,
              thrcomm_t* gl_comm,
-             pool_t*    pool,
+             pool_t*    sba_pool,
        const rntm_t*    rntm
      )
 {
@@ -132,7 +130,7 @@ thrinfo_t* bli_l3_sup_thrinfo_create
 	(
 	  gl_comm,
 	  id,
-	  pool,
+	  sba_pool,
 	  bli_pba_query()
 	);
 
@@ -176,10 +174,10 @@ void bli_l3_sup_thrinfo_update
              thrinfo_t** root
      )
 {
-	thrcomm_t* gl_comm = bli_thrinfo_comm( *root );
-	dim_t      tid     = bli_thrinfo_thread_id( *root );
-	pool_t*    pool    = bli_thrinfo_sba_pool( *root );
-	dim_t      nt      = bli_thrinfo_num_threads( *root );
+	thrcomm_t* gl_comm  = bli_thrinfo_comm( *root );
+	dim_t      tid      = bli_thrinfo_thread_id( *root );
+	pool_t*    sba_pool = bli_thrinfo_sba_pool( *root );
+	dim_t      nt       = bli_thrinfo_num_threads( *root );
 
 	// Return early in single-threaded execution
 	// since the thread control tree may not have been
@@ -187,7 +185,7 @@ void bli_l3_sup_thrinfo_update
 	if ( nt == 1 ) return;
 
 	bli_thrinfo_free( *root );
-	*root = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, rntm );
+	*root = bli_l3_sup_thrinfo_create( tid, gl_comm, sba_pool, rntm );
 }
 
 // -----------------------------------------------------------------------------
diff --git a/frame/base/bli_apool.h b/frame/base/bli_apool.h
index d06f79207..c11171a27 100644
--- a/frame/base/bli_apool.h
+++ b/frame/base/bli_apool.h
@@ -56,7 +56,7 @@ BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool )
 	return &(apool->pool);
 }
 
-BLIS_INLINE  bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool )
+BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool )
 {
 	return &(apool->mutex);
 }
diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c
index daa092ba7..bd688f85a 100644
--- a/frame/base/bli_cntl.c
+++ b/frame/base/bli_cntl.c
@@ -37,7 +37,7 @@
 
 cntl_t* bli_cntl_create_node
      (
-       pool_t* pool,
+       pool_t* sba_pool,
        opid_t  family,
        bszid_t bszid,
        void_fp var_func,
@@ -52,7 +52,7 @@ cntl_t* bli_cntl_create_node
 	#endif
 
 	// Allocate the cntl_t struct.
-	cntl = bli_sba_acquire( pool, sizeof( cntl_t ) );
+	cntl = bli_sba_acquire( sba_pool, sizeof( cntl_t ) );
 
 	bli_cntl_set_family( family, cntl );
 	bli_cntl_set_bszid( bszid, cntl );
@@ -66,7 +66,7 @@ cntl_t* bli_cntl_create_node
 
 void bli_cntl_free_node
      (
-       pool_t* pool,
+       pool_t* sba_pool,
        cntl_t* cntl
      )
 {
@@ -74,7 +74,7 @@ void bli_cntl_free_node
 	printf( "bli_cntl_free_node(): " );
 	#endif
 
-	bli_sba_release( pool, cntl );
+	bli_sba_release( sba_pool, cntl );
 }
 
 void bli_cntl_clear_node
@@ -94,7 +94,7 @@ void bli_cntl_clear_node
 
 void bli_cntl_free
      (
-       pool_t* pool,
+       pool_t* sba_pool,
        cntl_t* cntl
      )
 {
@@ -110,7 +110,7 @@ void bli_cntl_free
 	{
 		// Recursively free all memory associated with the sub-prenode and its
 		// children.
-		bli_cntl_free( pool, cntl_sub_prenode );
+		bli_cntl_free( sba_pool, cntl_sub_prenode );
 	}
 
 	// Only recurse into the child node if it exists.
@@ -118,7 +118,7 @@ void bli_cntl_free
 	{
 		// Recursively free all memory associated with the sub-node and its
 		// children.
-		bli_cntl_free( pool, cntl_sub_node );
+		bli_cntl_free( sba_pool, cntl_sub_node );
 	}
 
 	// Free the current node's params field, if it is non-NULL.
@@ -128,18 +128,18 @@ void bli_cntl_free
 		printf( "bli_cntl_free_w_thrinfo(): " );
 		#endif
 
-		bli_sba_release( pool, cntl_params );
+		bli_sba_release( sba_pool, cntl_params );
 	}
 
 	// Free the current node.
-	bli_cntl_free_node( pool, cntl );
+	bli_cntl_free_node( sba_pool, cntl );
 }
 
 // -----------------------------------------------------------------------------
 
 cntl_t* bli_cntl_copy
      (
-             pool_t* pool,
+             pool_t* sba_pool,
        const cntl_t* cntl
      )
 {
@@ -149,7 +149,7 @@ cntl_t* bli_cntl_copy
 	// field.
 	cntl_t* cntl_copy = bli_cntl_create_node
 	(
-	  pool,
+	  sba_pool,
 	  bli_cntl_family( cntl ),
 	  bli_cntl_bszid( cntl ),
 	  bli_cntl_var_func( cntl ),
@@ -165,7 +165,7 @@ cntl_t* bli_cntl_copy
 		// struct.
 		uint64_t params_size = bli_cntl_params_size( cntl );
 		void*    params_orig = bli_cntl_params( cntl );
-		void*    params_copy = bli_sba_acquire( pool, ( size_t )params_size );
+		void*    params_copy = bli_sba_acquire( sba_pool, ( size_t )params_size );
 
 		// Copy the original params struct to the new memory region.
 		memcpy( params_copy, params_orig, params_size );
@@ -180,7 +180,7 @@ cntl_t* bli_cntl_copy
 	{
 		cntl_t* sub_prenode_copy = bli_cntl_copy
 		(
-		  pool,
+		  sba_pool,
 		  bli_cntl_sub_prenode( cntl )
 		);
 
@@ -194,7 +194,7 @@ cntl_t* bli_cntl_copy
 	{
 		cntl_t* sub_node_copy = bli_cntl_copy
 		(
-		  pool,
+		  sba_pool,
 		  bli_cntl_sub_node( cntl )
 		);
 
diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c
index 5123c5b4b..54da4c7d9 100644
--- a/frame/base/bli_sba.c
+++ b/frame/base/bli_sba.c
@@ -47,17 +47,21 @@ apool_t* bli_sba_query( void )
 
 void bli_sba_init( void )
 {
+#ifdef BLIS_ENABLE_SBA_POOLS
 	bli_apool_init( &sba );
+#endif
 }
 
 void bli_sba_finalize( void )
 {
+#ifdef BLIS_ENABLE_SBA_POOLS
 	bli_apool_finalize( &sba );
+#endif
 }
 
 void* bli_sba_acquire
      (
-       pool_t* pool,
+       pool_t* sba_pool,
        siz_t   req_size
      )
 {
@@ -74,7 +78,7 @@ void* bli_sba_acquire
 	// is convenient to not have to checkout an array_t from the sba, and it
 	// does no harm since the malloc() happens outside of the region that
 	// would be timed.)
-	if ( pool == NULL )
+	if ( sba_pool == NULL )
 	{
 		block = bli_malloc_intl( req_size, &r_val );
 	}
@@ -84,10 +88,10 @@ void* bli_sba_acquire
 
 		// Query the block_size of the pool_t so that we can request the exact
 		// size present.
-		const siz_t block_size = bli_pool_block_size( pool );
+		const siz_t block_size = bli_pool_block_size( sba_pool );
 
 		// Sanity check: Make sure the requested size is no larger than the
-		// block_size field of the pool.
+		// block_size field of the sba pool.
 		if ( block_size < req_size )
 		{
 			printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n",
@@ -96,7 +100,7 @@ void* bli_sba_acquire
 		}
 
 		// Check out a block using the block_size queried above.
-		bli_pool_checkout_block( block_size, &pblk, pool );
+		bli_pool_checkout_block( block_size, &pblk, sba_pool );
 
 		// The block address is stored within the pblk_t.
 		block = bli_pblk_buf( &pblk );
@@ -114,13 +118,13 @@ void* bli_sba_acquire
 
 void bli_sba_release
      (
-       pool_t* pool,
+       pool_t* sba_pool,
        void*   block
      )
 {
 #ifdef BLIS_ENABLE_SBA_POOLS
 
-	if ( pool == NULL )
+	if ( sba_pool == NULL )
 	{
 		bli_free_intl( block );
 	}
@@ -132,17 +136,17 @@ void bli_sba_release
 		// for this particular application of the pool_t (that is, the "leaf"
 		// component of the sba), but it seems like good housekeeping to maintain
 		// the block_size field of the pblk_t in case its ever needed/read.
-		const siz_t block_size = bli_pool_block_size( pool );
+		const siz_t block_size = bli_pool_block_size( sba_pool );
 
 		// Embed the block's memory address into a pblk_t, along with the
-		// block_size queried from the pool.
+		// block_size queried from the sba pool.
 		bli_pblk_set_buf( block, &pblk );
 		bli_pblk_set_block_size( block_size, &pblk );
 
 		// Check the pblk_t back into the pool_t. (It's okay that the pblk_t is
 		// a local variable since its contents are copied into the pool's internal
 		// data structure--an array of pblk_t.)
-		bli_pool_checkin_block( &pblk, pool );
+		bli_pool_checkin_block( &pblk, sba_pool );
 	}
 
 #else
@@ -176,3 +180,17 @@ void bli_sba_checkin_array
 #endif
 }
 
+pool_t* bli_sba_array_elem
+     (
+       siz_t    index,
+       array_t* array
+     )
+{
+#ifdef BLIS_ENABLE_SBA_POOLS
+	if ( array != NULL ) return bli_apool_array_elem( index, array );
+	else                 return NULL;
+#else
+	return NULL;
+#endif
+}
+
diff --git a/frame/base/bli_sba.h b/frame/base/bli_sba.h
index 8d9db844f..92e53e7b3 100644
--- a/frame/base/bli_sba.h
+++ b/frame/base/bli_sba.h
@@ -42,6 +42,18 @@ apool_t* bli_sba_query( void );
 void bli_sba_init( void );
 void bli_sba_finalize( void );
 
+void* bli_sba_acquire
+     (
+       pool_t* sba_pool,
+       siz_t   req_size
+     );
+
+void bli_sba_release
+     (
+       pool_t* sba_pool,
+       void*   block
+     );
+
 array_t* bli_sba_checkout_array
      (
        siz_t n_threads
@@ -52,16 +64,10 @@ void bli_sba_checkin_array
        array_t* array
      );
 
-void* bli_sba_acquire
+pool_t* bli_sba_array_elem
      (
-       pool_t* pool,
-       siz_t   req_size
-     );
-
-void bli_sba_release
-     (
-       pool_t* pool,
-       void*   block
+       siz_t    index,
+       array_t* array
      );
 
 #endif

From 1e264a42474b535431768ef925bbd518412d392e Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 2 Oct 2023 18:29:46 -0500
Subject: [PATCH 172/230] Update zen3 subconfig to support NVHPC compilers.
 (#779)

Details:
- Parse $(CC_VENDOR) values of "nvc" in 'zen3' make_defs.mk file.
- Minor refactor to accommodate above edit.
- CREDITS file update.
---
 CREDITS                  |  1 +
 config/zen3/make_defs.mk | 21 +++++++++++++++------
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/CREDITS b/CREDITS
index 9d35931bc..99ae43bd6 100644
--- a/CREDITS
+++ b/CREDITS
@@ -16,6 +16,7 @@ but many others have contributed code, ideas, and feedback, including
   Alex Arslan              @ararslan
   Vernon Austel                                       (IBM, T.J. Watson Research Center)
   Mohsen Aznaveh           @Aznaveh                   (Texas A&M University)
+  Abhishek Bagusetty       @abagusetty                (Argonne National Laboratory)
   Satish Balay             @balay                     (Argonne National Laboratory)
   Kihiro Bando             @bandokihiro
   Matthew Brett            @matthew-brett             (University of Birmingham)
diff --git a/config/zen3/make_defs.mk b/config/zen3/make_defs.mk
index 88f39c3d1..0bd4ed344 100644
--- a/config/zen3/make_defs.mk
+++ b/config/zen3/make_defs.mk
@@ -1,6 +1,6 @@
 #
 #
-#  BLIS    
+#  BLIS
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
 #
@@ -35,7 +35,7 @@
 
 # Declare the name of the current configuration and add it to the
 # running list of configurations included by common.mk.
-THIS_CONFIG    := zen3 
+THIS_CONFIG    := zen3
 #CONFIGS_INCL   += $(THIS_CONFIG)
 
 #
@@ -65,8 +65,8 @@ endif
 # they make explicit use of the rbp register.
 CKOPTFLAGS         := $(COPTFLAGS) -fomit-frame-pointer
 CROPTFLAGS         := $(CKOPTFLAGS)
-CKVECFLAGS         := -mavx2 -mfma -mfpmath=sse
-CRVECFLAGS         := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+CKVECFLAGS         := -mavx2 -mfma
+CRVECFLAGS         := $(CKVECFLAGS)
 ifeq ($(CC_VENDOR),gcc)
   ifeq ($(GCC_OT_9_1_0),yes)  # gcc versions older than 9.1.
     CVECFLAGS_VER  := -march=znver1 -mno-avx256-split-unaligned-store
@@ -77,6 +77,8 @@ ifeq ($(CC_VENDOR),gcc)
     CVECFLAGS_VER  := -march=znver3
   endif
   endif
+  CKVECFLAGS       += -mfpmath=sse
+  CRVECFLAGS       += -funsafe-math-optimizations -ffp-contract=fast
 else
 ifeq ($(CC_VENDOR),clang)
   ifeq ($(CLANG_OT_9_0_0),yes)  # clang versions older than 9.0.
@@ -92,6 +94,8 @@ ifeq ($(CC_VENDOR),clang)
   endif
   endif
   endif
+  CKVECFLAGS       += -mfpmath=sse
+  CRVECFLAGS       += -funsafe-math-optimizations -ffp-contract=fast
 else
 ifeq ($(CC_VENDOR),aocc)
   ifeq ($(AOCC_OT_2_0_0),yes)   # aocc versions older than 2.0.
@@ -103,8 +107,14 @@ ifeq ($(CC_VENDOR),aocc)
     CVECFLAGS_VER  := -march=znver3
   endif
   endif
+  CKVECFLAGS       += -mfpmath=sse
+  CRVECFLAGS       += -funsafe-math-optimizations -ffp-contract=fast
+ifeq ($(CC_VENDOR),nvc)
+  CVECFLAGS_VER    := -march=znver3
+  CRVECFLAGS       += -fast
 else
-  $(error gcc, clang, or aocc is required for this configuration.)
+  $(error gcc, clang, nvc or aocc is required for this configuration.)
+endif
 endif
 endif
 endif
@@ -114,4 +124,3 @@ CRVECFLAGS         += $(CVECFLAGS_VER)
 # Store all of the variables here to new variables containing the
 # configuration name.
 $(eval $(call store-make-defs,$(THIS_CONFIG)))
-

From 8fff1e31da1c87e46cacec112b0ac280ab47cd8b Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Thu, 12 Oct 2023 15:51:41 -0500
Subject: [PATCH 173/230] Fixed bug in sup threshold registration. (#782)

Details:
- Fixed a bug that resulted in BLIS non-deterministically calling the
  gemmsup handler, irrespective of the thresholds that are registered
  via bli_cntx_set_blkszs().
- Deep dive: In bli_cntx_init_ref.c, the default values for the gemmsup
  thresholds (BLIS_[MNK]T blocksizes) were being set to zero so that no
  operation ever matched the criteria for gemmsup (unless specific sup
  thresholds are registered). HOWEVER, these thresholds are set via
  bli_cntx_set_blkszs() which calls bli_blksz_copy_if_pos(), which was
  only copying the thresholds into the gks' cntx_t if the values were
  strictly positive. Thus, the zero values passed into
  bli_cntx_set_blkszs() were being ignored and those threshold slots
  within the gks were left uninitialized. The upshot of this is that the
  reference gemmsup handler was being called for gemm problems
  essentially at random (and as it turns out, very rarely the reference
  gemmsup implementation would encounter a divide-by-zero error).
- The problem was fixed by changing bli_blksz_copy_if_pos() so that it
  copies values that are non-negative (values >= 0 instead of > 0). The
  function was also renamed to bli_blksz_copy_if_nonneg().
- Also needed to standardize use of -1 as the sole value to embed into
  blksz_t structs as a signal to bli_cntx_set_blkszs() to *not* register
  a value for that slot (and instead let whatever existing values
  remain). This required updates to the bli_cntx_init_*() functions for
  bgq, cortexa9, knc, penryn, power7, and template subconfigs, as some
  of these codes were using 0 instead of -1.
- Fixes #781. Thanks to Devin Matthews for identifying, diagnosing, and
  proposing a fix for this issue.
---
 config/bgq/bli_cntx_init_bgq.c           | 10 +++++-----
 config/cortexa9/bli_cntx_init_cortexa9.c | 10 +++++-----
 config/knc/bli_cntx_init_knc.c           | 14 +++++++-------
 config/penryn/bli_cntx_init_penryn.c     | 10 +++++-----
 config/power7/bli_cntx_init_power7.c     | 10 +++++-----
 config/template/bli_cntx_init_template.c | 10 +++++-----
 frame/base/bli_blksz.h                   | 23 ++++++++++++-----------
 frame/base/bli_cntx.c                    |  2 +-
 8 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/config/bgq/bli_cntx_init_bgq.c b/config/bgq/bli_cntx_init_bgq.c
index d3871d8f7..a61d1b95d 100644
--- a/config/bgq/bli_cntx_init_bgq.c
+++ b/config/bgq/bli_cntx_init_bgq.c
@@ -69,11 +69,11 @@ void bli_cntx_init_bgq( cntx_t* cntx )
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
 	//                                           s      d      c      z
-	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     0,     8,     0,     4 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     0,     8,     0,     4 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],     0,  1024,     0,   768 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],     0,  2048,     0,  1536 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],     0, 10240,     0, 10240 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    -1,     8,    -1,     4 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    -1,     8,    -1,     4 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],    -1,  1024,    -1,   768 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],    -1,  2048,    -1,  1536 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1, 10240,    -1, 10240 );
 
 	// Update the context with the current architecture's register and cache
 	// blocksizes (and multiples) for native execution.
diff --git a/config/cortexa9/bli_cntx_init_cortexa9.c b/config/cortexa9/bli_cntx_init_cortexa9.c
index 6af3ff91c..55a8000e7 100644
--- a/config/cortexa9/bli_cntx_init_cortexa9.c
+++ b/config/cortexa9/bli_cntx_init_cortexa9.c
@@ -69,11 +69,11 @@ void bli_cntx_init_cortexa9( cntx_t* cntx )
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
 	//                                           s      d      c      z
-	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     4,     4,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     4,     4,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   432,   176,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   352,   368,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4096,  4096,     0,     0 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     4,     4,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     4,     4,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   432,   176,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   352,   368,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4096,  4096,    -1,    -1 );
 
 	// Update the context with the current architecture's register and cache
 	// blocksizes (and multiples) for native execution.
diff --git a/config/knc/bli_cntx_init_knc.c b/config/knc/bli_cntx_init_knc.c
index 8f615588c..bbaf37541 100644
--- a/config/knc/bli_cntx_init_knc.c
+++ b/config/knc/bli_cntx_init_knc.c
@@ -67,13 +67,13 @@ void bli_cntx_init_knc( cntx_t* cntx )
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
 	//                                           s      d      c      z
-	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     0,    30,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     0,     8,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],     0,   120,     0,     0,
-	                                             0,   160,     0,     0 );
-	bli_blksz_init     ( &blkszs[ BLIS_KC ],     0,   240,     0,     0,
-	                                             0,   300,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],     0, 14400,     0,     0 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    -1,    30,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    -1,     8,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],    -1,   120,    -1,    -1,
+	                                            -1,   160,    -1,    -1 );
+	bli_blksz_init     ( &blkszs[ BLIS_KC ],    -1,   240,    -1,    -1,
+	                                            -1,   300,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1, 14400,    -1,    -1 );
 
 	// Update the context with the current architecture's register and cache
 	// blocksizes (and multiples) for native execution.
diff --git a/config/penryn/bli_cntx_init_penryn.c b/config/penryn/bli_cntx_init_penryn.c
index 964438e83..30b3ac9fa 100644
--- a/config/penryn/bli_cntx_init_penryn.c
+++ b/config/penryn/bli_cntx_init_penryn.c
@@ -77,11 +77,11 @@ void bli_cntx_init_penryn( cntx_t* cntx )
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
 	//                                           s      d      c      z
-	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     8,     4,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     4,     4,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   768,   384,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   384,   384,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4096,  4096,     0,     0 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     8,     4,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     4,     4,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   768,   384,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   384,   384,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4096,  4096,    -1,    -1 );
 
 	// Update the context with the current architecture's register and cache
 	// blocksizes (and multiples) for native execution.
diff --git a/config/power7/bli_cntx_init_power7.c b/config/power7/bli_cntx_init_power7.c
index d5ffe7dcf..9d1de3da5 100644
--- a/config/power7/bli_cntx_init_power7.c
+++ b/config/power7/bli_cntx_init_power7.c
@@ -67,11 +67,11 @@ void bli_cntx_init_power7( cntx_t* cntx )
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
 	//                                           s      d      c      z
-	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     0,     8,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     0,     4,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],     0,    64,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],     0,   256,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],     0,  4096,     0,     0 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    -1,     8,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    -1,     4,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],    -1,    64,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],    -1,   256,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1,  4096,    -1,    -1 );
 
 	// Update the context with the current architecture's register and cache
 	// blocksizes (and multiples) for native execution.
diff --git a/config/template/bli_cntx_init_template.c b/config/template/bli_cntx_init_template.c
index 4bacc5d63..8e5a57d6c 100644
--- a/config/template/bli_cntx_init_template.c
+++ b/config/template/bli_cntx_init_template.c
@@ -87,11 +87,11 @@ void bli_cntx_init_template( cntx_t* cntx )
 
 	// Initialize level-3 blocksize objects with architecture-specific values.
 	//                                           s      d      c      z
-	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     0,     0,     0,     4 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     0,     0,     0,     4 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],     0,     0,     0,   128 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],     0,     0,     0,   256 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],     0,     0,     0,  4096 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    -1,    -1,    -1,     4 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    -1,    -1,    -1,     4 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],    -1,    -1,    -1,   128 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],    -1,    -1,    -1,   256 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1,    -1,    -1,  4096 );
 
 	// Update the context with the current architecture's register and cache
 	// blocksizes (and multiples) for native execution.
diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h
index d91c0542d..7f1db2706 100644
--- a/frame/base/bli_blksz.h
+++ b/frame/base/bli_blksz.h
@@ -84,14 +84,15 @@ BLIS_INLINE void bli_blksz_copy
 	*b_dst = *b_src;
 }
 
-BLIS_INLINE void bli_blksz_copy_if_pos
+BLIS_INLINE void bli_blksz_copy_if_nonneg
      (
        const blksz_t* b_src,
              blksz_t* b_dst
      )
 {
-	// Copy the blocksize values over to b_dst one-by-one so that
-	// we can skip the ones that are non-positive.
+	// Copy the blocksize values over to b_dst one-by-one. Note that we
+	// only copy valuse that are zero or positive (and skip copying any
+	// values that are negative).
 
 	const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT,    b_src );
 	const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE,   b_src );
@@ -103,15 +104,15 @@ BLIS_INLINE void bli_blksz_copy_if_pos
 	const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src );
 	const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src );
 
-	if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT,    b_dst );
-	if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE,   b_dst );
-	if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst );
-	if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst );
+	if ( v_s >= 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT,    b_dst );
+	if ( v_d >= 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE,   b_dst );
+	if ( v_c >= 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst );
+	if ( v_z >= 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst );
 
-	if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT,    b_dst );
-	if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE,   b_dst );
-	if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst );
-	if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst );
+	if ( e_s >= 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT,    b_dst );
+	if ( e_d >= 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE,   b_dst );
+	if ( e_c >= 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst );
+	if ( e_z >= 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst );
 }
 
 BLIS_INLINE void bli_blksz_copy_def_dt
diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c
index 8c6cafc13..4635c11f4 100644
--- a/frame/base/bli_cntx.c
+++ b/frame/base/bli_cntx.c
@@ -100,7 +100,7 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... )
 		//cntx_blkszs[ bs_id ] = *blksz;
 		//bli_blksz_copy( blksz, cntx_blksz );
 		blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
-		bli_blksz_copy_if_pos( blksz, cntx_blksz );
+		bli_blksz_copy_if_nonneg( blksz, cntx_blksz );
 
 		// Copy the blocksize multiple id into the context.
 		cntx_bmults[ bs_id ] = bm_id;

From 7a87e57b69d697a9b06231a5c0423c00fa375dc1 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com>
Date: Sat, 14 Oct 2023 02:05:41 -0500
Subject: [PATCH 174/230] Fixed HPX barrier synchronization (#783)

Details:
- Fixed hpx barrier synchronization. HPX was hanging when run on larger
  numbers of cores because blis was using non-hpx synchronization
  primitives. When running on the hpx runtime, only hpx synchronization
  primitives should be used. Hence, a C-style wrapper, hpx_barrier_t, is
  introduced to perform hpx barrier operations.
- Replaced hpx::for_loop with hpx::futures. Using hpx::for_loop with
  hpx::barrier when n_threads is greater than the actual hardware thread
  count causes synchronization issues that make hpx hang. This can be
  avoided by using hpx::futures, which are lightweight, robust, and
  scalable.
---
 frame/thread/bli_thrcomm.h       | 15 +++++++--
 frame/thread/bli_thrcomm_hpx.cpp | 53 +++++++++++++++-----------------
 frame/thread/bli_thread_hpx.cpp  | 20 +++++++-----
 3 files changed, 49 insertions(+), 39 deletions(-)

diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h
index 436b05711..22b02d97e 100644
--- a/frame/thread/bli_thrcomm.h
+++ b/frame/thread/bli_thrcomm.h
@@ -67,6 +67,17 @@ typedef struct barrier_s barrier_t;
 #endif
 #endif
 
+// Define hpx_barrier_t, which is specific to the barrier used in HPX
+// implementation. This needs to be done first since it is (potentially)
+// used within the definition of thrcomm_t below.
+
+#ifdef BLIS_ENABLE_HPX
+typedef struct hpx_barrier_t
+{
+	void* handle;
+} hpx_barrier_t;
+#endif
+
 // Define the thrcomm_t structure, which will be common to all threading
 // implementations.
 
@@ -124,9 +135,7 @@ typedef struct thrcomm_s
 	// -- Fields specific to HPX --
 
 	#ifdef BLIS_ENABLE_HPX
-	#ifdef BLIS_USE_HPX_BARRIER
-	hpx::barrier<> * barrier;
-	#endif
+	hpx_barrier_t barrier;
 	#endif
 
 } thrcomm_t;
diff --git a/frame/thread/bli_thrcomm_hpx.cpp b/frame/thread/bli_thrcomm_hpx.cpp
index 323871ef8..0947dc81d 100644
--- a/frame/thread/bli_thrcomm_hpx.cpp
+++ b/frame/thread/bli_thrcomm_hpx.cpp
@@ -36,43 +36,36 @@
 
 #ifdef BLIS_ENABLE_HPX
 
+#include <hpx/synchronization/barrier.hpp>
 extern "C" {
 
-#ifdef BLIS_USE_HPX_BARRIER
-
 // Define the pthread_barrier_t implementations of the init, cleanup, and
 // barrier functions.
 
-void bli_thrcomm_init_hpx( dim_t n_threads, thrcomm_t* comm )
+void hpx_barrier_init( hpx_barrier_t* barrier, dim_t n_threads )
 {
-	if ( comm == nullptr ) return;
-
-	//comm->sent_object             = nullptr;
-	//comm->n_threads               = n_threads;
-	comm->ti                      = BLIS_HPX;
-	//comm->barrier_sense           = 0;
-	//comm->barrier_threads_arrived = 0;
-
-	comm->barrier = new hpx:barrier<>();
+	if ( barrier == nullptr ) return;
+	barrier->handle = new hpx::barrier<>( n_threads );
 }
 
-void bli_thrcomm_cleanup_hpx( thrcomm_t* comm )
+void hpx_barrier_destroy( hpx_barrier_t* barrier )
 {
-	if ( comm == nullptr ) return;
+	if ( barrier == nullptr ) return;
 
-	delete comm->barrier;
-}
+	auto* barrier_ = reinterpret_cast<hpx::barrier<>*>( barrier->handle );
+	barrier->handle = nullptr;
 
-void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
-{
-	comm->barrier->arrive_and_wait();
+	delete barrier_; 
 }
 
-#else
+void hpx_barrier_arrive_and_wait( hpx_barrier_t* barrier )
+{
+	if ( barrier == nullptr ) return;
+	auto* barrier_ = reinterpret_cast<hpx::barrier<>*>( barrier->handle );
 
-// Define the non-hpx::barrier implementations of the init, cleanup,
-// and barrier functions. These are the default unless the hpx::barrier
-// versions are requested at compile-time.
+	if ( barrier_ == nullptr ) return;
+	barrier_->arrive_and_wait();
+}
 
 void bli_thrcomm_init_hpx( dim_t n_threads, thrcomm_t* comm )
 {
@@ -81,22 +74,24 @@ void bli_thrcomm_init_hpx( dim_t n_threads, thrcomm_t* comm )
 	comm->sent_object             = nullptr;
 	comm->n_threads               = n_threads;
 	comm->ti                      = BLIS_HPX;
-	comm->barrier_sense           = 0;
-	comm->barrier_threads_arrived = 0;
+	// comm->barrier_sense           = 0;
+	// comm->barrier_threads_arrived = 0;
+
+	hpx_barrier_init( &comm->barrier, n_threads );
 }
 
 void bli_thrcomm_cleanup_hpx( thrcomm_t* comm )
 {
+	if ( comm == nullptr ) return;
+	hpx_barrier_destroy( &comm->barrier );
 }
 
 void bli_thrcomm_barrier_hpx( dim_t t_id, thrcomm_t* comm )
 {
-	bli_thrcomm_barrier_atomic( t_id, comm );
+	hpx_barrier_arrive_and_wait( &comm->barrier );
 }
 
-} // extern "C"
-
-#endif
+}
 
 #endif
 
diff --git a/frame/thread/bli_thread_hpx.cpp b/frame/thread/bli_thread_hpx.cpp
index 847b519dc..baf2eb3f2 100644
--- a/frame/thread/bli_thread_hpx.cpp
+++ b/frame/thread/bli_thread_hpx.cpp
@@ -56,16 +56,22 @@ void bli_thread_launch_hpx
 	// Allocate a global communicator for the root thrinfo_t structures.
 	pool_t*    gl_comm_pool = nullptr;
 	thrcomm_t* gl_comm      = bli_thrcomm_create( ti, gl_comm_pool, n_threads );
+
+	// Execute func on hpx-runtime with n_threads.
 	hpx::threads::run_as_hpx_thread([&]()
 	{
-		hpx::execution::experimental::num_cores num_cores_(n_threads);
-		hpx::execution::static_chunk_size chunk_size_(1);
-		hpx::experimental::for_loop(
-		hpx::execution::par.with(num_cores_).with(chunk_size_), 0, n_threads,
-		[&gl_comm, &func, &params](const dim_t tid)
+		std::vector<hpx::future<void>> futures;
+		futures.reserve(n_threads);
+
+		for (dim_t tid = 0; tid < n_threads; ++tid)
 		{
-			func( gl_comm, tid, params );
-		});
+			futures.push_back(hpx::async([tid, &gl_comm, &func, &params]()
+			{
+			  func( gl_comm, tid, params );
+			}));
+		}
+
+		hpx::wait_all(futures);
 	});
 
 	// Free the global communicator, because the root thrinfo_t node

From 05388ddb66f8bf2d62009b162d64bf2d99226b83 Mon Sep 17 00:00:00 2001
From: Aaron Hutchinson <113382047+Aaron-Hutchinson@users.noreply.github.com>
Date: Fri, 3 Nov 2023 13:30:31 -0700
Subject: [PATCH 175/230] Added 'sifive_x280' subconfig, kernel set. (#737)

Details:
- Added a new 'sifive_x280' subconfiguration for SiFive's x280 RISC-V
  instruction set architecture. The subconfig registers kernels from a
  correspondingly new kernel set, also named 'sifive_x280'.
- Added the aforementioned kernel set, which includes intrinsics- and
  assembly-based implementations of most level-1v kernels along with
  level-1f kernels axpy2v and dotaxpyv, packm kernels, and level-3 gemm,
  gemmtrsm_l, and gemmtrsm_u microkernels (plus supporting files).
- Registered the 'sifive_x280' subconfig as belonging to a singleton
  family by the same name.
- Added an entry to '.travis.yml' to test the new subconfig via qemu.
- Updated the 'travis/do_riscv.sh' script to support the 'sifive_x280'
  subconfig and to reflect updated tarball names.
- Special thanks to Lee Killough, Devin Matthews, and Angelika Schwarz
  for their engagement on this commit.
---
 .travis.yml                                   |   11 +
 CREDITS                                       |    2 +
 .../sifive_x280/bli_cntx_init_sifive_x280.c   |  226 ++
 config/sifive_x280/bli_family_sifive_x280.h   |   34 +
 .../sifive_x280/bli_kernel_defs_sifive_x280.h |   55 +
 config/sifive_x280/make_defs.mk               |   78 +
 config_registry                               |    3 +
 frame/base/bli_arch.c                         |    7 +
 frame/base/bli_gks.c                          |    8 +
 frame/include/bli_arch_config.h               |   18 +
 frame/include/bli_type_defs.h                 |    3 +
 .../bli_addv_sifive_x280_intr.c               |  118 +
 .../bli_addv_sifive_x280_intr_complex.c       |   89 +
 .../bli_addv_sifive_x280_intr_real.c          |   78 +
 .../sifive_x280/1/bli_amaxv_sifive_x280_asm.c |  293 ++
 .../bli_axpbyv_sifive_x280_intr.c             |  129 +
 .../bli_axpbyv_sifive_x280_intr_complex.c     |  121 +
 .../bli_axpbyv_sifive_x280_intr_real.c        |   98 +
 .../bli_axpyv_sifive_x280_intr.c              |  119 +
 .../bli_axpyv_sifive_x280_intr_complex.c      |   94 +
 .../bli_axpyv_sifive_x280_intr_real.c         |   79 +
 .../sifive_x280/1/bli_copyv_sifive_x280_asm.c |  272 ++
 .../bli_dotv_sifive_x280_intr.c               |  120 +
 .../bli_dotv_sifive_x280_intr_complex.c       |  116 +
 .../bli_dotv_sifive_x280_intr_real.c          |   87 +
 .../bli_dotxv_sifive_x280_intr.c              |  130 +
 .../bli_dotxv_sifive_x280_intr_complex.c      |  130 +
 .../bli_dotxv_sifive_x280_intr_real.c         |   94 +
 .../1/bli_invertv_sifive_x280_asm.c           |  221 ++
 .../1/bli_invscalv_sifive_x280_asm.c          |  266 ++
 .../bli_scal2v_sifive_x280_intr.c             |  124 +
 .../bli_scal2v_sifive_x280_intr_complex.c     |  100 +
 .../bli_scal2v_sifive_x280_intr_real.c        |   82 +
 .../bli_scalv_sifive_x280_intr.c              |  120 +
 .../bli_scalv_sifive_x280_intr_complex.c      |   89 +
 .../bli_scalv_sifive_x280_intr_real.c         |   76 +
 .../sifive_x280/1/bli_setv_sifive_x280_asm.c  |  204 ++
 .../bli_subv_sifive_x280_intr.c               |  118 +
 .../bli_subv_sifive_x280_intr_complex.c       |   89 +
 .../bli_subv_sifive_x280_intr_real.c          |   77 +
 .../sifive_x280/1/bli_swapv_sifive_x280_asm.c |  245 ++
 .../bli_xpbyv_sifive_x280_intr.c              |  122 +
 .../bli_xpbyv_sifive_x280_intr_complex.c      |  101 +
 .../bli_xpbyv_sifive_x280_intr_real.c         |   84 +
 .../bli_axpy2v_sifive_x280_intr.c             |  122 +
 .../bli_axpy2v_sifive_x280_intr_complex.c     |  117 +
 .../bli_axpy2v_sifive_x280_intr_real.c        |   91 +
 .../1f/bli_axpyf_sifive_x280_asm.c            |  430 +++
 .../bli_dotaxpyv_sifive_x280_intr.c           |  122 +
 .../bli_dotaxpyv_sifive_x280_intr_complex.c   |  151 +
 .../bli_dotaxpyv_sifive_x280_intr_real.c      |  111 +
 .../1f/bli_dotxaxpyf_sifive_x280_asm.c        | 3120 +++++++++++++++++
 .../1f/bli_dotxf_sifive_x280_asm.c            | 2645 ++++++++++++++
 .../1m/bli_packm_sifive_x280_asm_mrxk.c       |  678 ++++
 .../1m/bli_packm_sifive_x280_asm_nrxk.c       |  838 +++++
 .../sifive_x280/3/bli_gemm_sifive_x280_asm.c  | 2405 +++++++++++++
 .../bli_gemmtrsm_l_sifive_x280_asm_complex.c  |  327 ++
 .../bli_gemmtrsm_l_sifive_x280_asm_real.c     |  253 ++
 .../bli_gemmtrsm_sifive_x280_asm.c            |  182 +
 .../bli_gemmtrsm_u_sifive_x280_asm_complex.c  |  331 ++
 .../bli_gemmtrsm_u_sifive_x280_asm_real.c     |  260 ++
 kernels/sifive_x280/bli_kernels_sifive_x280.h |  160 +
 kernels/sifive_x280/riscv_cmul_macros_asm.h   |  137 +
 .../sifive_x280/riscv_overloaded_intrinsics.h |  116 +
 travis/do_riscv.sh                            |    9 +-
 65 files changed, 17332 insertions(+), 3 deletions(-)
 create mode 100644 config/sifive_x280/bli_cntx_init_sifive_x280.c
 create mode 100644 config/sifive_x280/bli_family_sifive_x280.h
 create mode 100644 config/sifive_x280/bli_kernel_defs_sifive_x280.h
 create mode 100644 config/sifive_x280/make_defs.mk
 create mode 100644 kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c
 create mode 100644 kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c
 create mode 100644 kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c
 create mode 100644 kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c
 create mode 100644 kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c
 create mode 100644 kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c
 create mode 100644 kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c
 create mode 100644 kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c
 create mode 100644 kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c
 create mode 100644 kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c
 create mode 100644 kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c
 create mode 100644 kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c
 create mode 100644 kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c
 create mode 100644 kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c
 create mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c
 create mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c
 create mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c
 create mode 100644 kernels/sifive_x280/bli_kernels_sifive_x280.h
 create mode 100644 kernels/sifive_x280/riscv_cmul_macros_asm.h
 create mode 100644 kernels/sifive_x280/riscv_overloaded_intrinsics.h

diff --git a/.travis.yml b/.travis.yml
index 848cb1843..bdfafb6b0 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -86,6 +86,11 @@ matrix:
     env: OOT=0 TEST=FAST SDE=0 THR="none" BLD="--disable-shared" CONF="rv32iv" \
       CC=riscv32-unknown-linux-gnu-gcc \
       LDFLAGS=-static
+  - os: linux
+    compiler: clang
+    env: OOT=0 TEST=FAST SDE=0 THR="none" BLD="--disable-shared" CONF="sifive_x280" \
+      CC=clang \
+      LDFLAGS=-static
 install:
 - if [ "$CC" = "gcc"  ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-9"; fi
 - if [ -n "$PACKAGES" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y $PACKAGES; fi
@@ -106,6 +111,12 @@ script:
     export CXX=$DIST_PATH/../toolchain/riscv/bin/riscv32-unknown-linux-gnu-g++;
     export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv32 -cpu rv32,vext_spec=v1.0,v=true,vlen=128 -B 0x100000";
   fi
+- if [ "$CONF" = "sifive_x280" ]; then
+    $DIST_PATH/travis/do_riscv.sh "$CONF";
+    export CC=$DIST_PATH/../toolchain/riscv/bin/clang;
+    export CXX=$DIST_PATH/../toolchain/riscv/bin/clang++;
+    export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv64 -cpu rv64,vext_spec=v1.0,v=true,vlen=512 -B 0x100000";
+  fi
 - $DIST_PATH/configure -p `pwd`/../install -t $THR $BLD CC=$CC $CONF
 - pwd
 - ls -l
diff --git a/CREDITS b/CREDITS
index 99ae43bd6..fa99b0572 100644
--- a/CREDITS
+++ b/CREDITS
@@ -22,6 +22,7 @@ but many others have contributed code, ideas, and feedback, including
   Matthew Brett            @matthew-brett             (University of Birmingham)
   Jérémie du Boisberranger @jeremiedbb
   Jed Brown                @jedbrown                  (Argonne National Laboratory)
+  Alex Chiang              @alexsifivetw              (SiFive)
   Robin Christ             @robinchrist
   Dilyn Corner             @dilyn-corner
   Mat Cross                @matcross                  (NAG)
@@ -54,6 +55,7 @@ but many others have contributed code, ideas, and feedback, including
   Minh Quan Ho             @hominhquan
   Matthew Honnibal         @honnibal
   Stefan Husmann           @stefanhusmann
+  Aaron Hutchinson         @Aaron-Hutchinson          (SiFive)
   Francisco Igual          @figual                    (Universidad Complutense de Madrid)
   John Mather              @jmather-sesi              (SideFX Software)
   Madeesh Kannan           @shadeMe
diff --git a/config/sifive_x280/bli_cntx_init_sifive_x280.c b/config/sifive_x280/bli_cntx_init_sifive_x280.c
new file mode 100644
index 000000000..197394c82
--- /dev/null
+++ b/config/sifive_x280/bli_cntx_init_sifive_x280.c
@@ -0,0 +1,226 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_cntx_init_sifive_x280( cntx_t* cntx )
+{
+	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
+
+	// Set default kernel blocksizes and functions.
+	bli_cntx_init_sifive_x280_ref( cntx );
+
+	// -------------------------------------------------------------------------
+
+	// Update the context with optimized native kernels.
+	bli_cntx_set_ukrs
+	(
+	  cntx,
+
+	  // Level 1
+	  BLIS_ADDV_KER,       BLIS_FLOAT,    bli_saddv_sifive_x280_intr,
+	  BLIS_ADDV_KER,       BLIS_DOUBLE,   bli_daddv_sifive_x280_intr,
+	  BLIS_ADDV_KER,       BLIS_SCOMPLEX, bli_caddv_sifive_x280_intr,
+	  BLIS_ADDV_KER,       BLIS_DCOMPLEX, bli_zaddv_sifive_x280_intr,
+
+	  BLIS_AMAXV_KER,      BLIS_FLOAT,    bli_samaxv_sifive_x280_asm,
+	  BLIS_AMAXV_KER,      BLIS_DOUBLE,   bli_damaxv_sifive_x280_asm,
+	  BLIS_AMAXV_KER,      BLIS_SCOMPLEX, bli_camaxv_sifive_x280_asm,
+	  BLIS_AMAXV_KER,      BLIS_DCOMPLEX, bli_zamaxv_sifive_x280_asm,
+
+	  BLIS_AXPBYV_KER,     BLIS_FLOAT,    bli_saxpbyv_sifive_x280_intr,
+	  BLIS_AXPBYV_KER,     BLIS_DOUBLE,   bli_daxpbyv_sifive_x280_intr,
+	  BLIS_AXPBYV_KER,     BLIS_SCOMPLEX, bli_caxpbyv_sifive_x280_intr,
+	  BLIS_AXPBYV_KER,     BLIS_DCOMPLEX, bli_zaxpbyv_sifive_x280_intr,
+
+	  BLIS_AXPYV_KER,      BLIS_FLOAT,    bli_saxpyv_sifive_x280_intr,
+	  BLIS_AXPYV_KER,      BLIS_DOUBLE,   bli_daxpyv_sifive_x280_intr,
+	  BLIS_AXPYV_KER,      BLIS_SCOMPLEX, bli_caxpyv_sifive_x280_intr,
+	  BLIS_AXPYV_KER,      BLIS_DCOMPLEX, bli_zaxpyv_sifive_x280_intr,
+
+	  BLIS_COPYV_KER,      BLIS_FLOAT,    bli_scopyv_sifive_x280_asm,
+	  BLIS_COPYV_KER,      BLIS_DOUBLE,   bli_dcopyv_sifive_x280_asm,
+	  BLIS_COPYV_KER,      BLIS_SCOMPLEX, bli_ccopyv_sifive_x280_asm,
+	  BLIS_COPYV_KER,      BLIS_DCOMPLEX, bli_zcopyv_sifive_x280_asm,
+
+	  BLIS_DOTV_KER,       BLIS_FLOAT,    bli_sdotv_sifive_x280_intr,
+	  BLIS_DOTV_KER,       BLIS_DOUBLE,   bli_ddotv_sifive_x280_intr,
+	  BLIS_DOTV_KER,       BLIS_SCOMPLEX, bli_cdotv_sifive_x280_intr,
+	  BLIS_DOTV_KER,       BLIS_DCOMPLEX, bli_zdotv_sifive_x280_intr,
+
+	  BLIS_DOTXV_KER,      BLIS_FLOAT,    bli_sdotxv_sifive_x280_intr,
+	  BLIS_DOTXV_KER,      BLIS_DOUBLE,   bli_ddotxv_sifive_x280_intr,
+	  BLIS_DOTXV_KER,      BLIS_SCOMPLEX, bli_cdotxv_sifive_x280_intr,
+	  BLIS_DOTXV_KER,      BLIS_DCOMPLEX, bli_zdotxv_sifive_x280_intr,
+
+	  BLIS_INVERTV_KER,    BLIS_FLOAT,    bli_sinvertv_sifive_x280_asm,
+	  BLIS_INVERTV_KER,    BLIS_DOUBLE,   bli_dinvertv_sifive_x280_asm,
+	  BLIS_INVERTV_KER,    BLIS_SCOMPLEX, bli_cinvertv_sifive_x280_asm,
+	  BLIS_INVERTV_KER,    BLIS_DCOMPLEX, bli_zinvertv_sifive_x280_asm,
+
+	  BLIS_INVSCALV_KER,   BLIS_FLOAT,    bli_sinvscalv_sifive_x280_asm,
+	  BLIS_INVSCALV_KER,   BLIS_DOUBLE,   bli_dinvscalv_sifive_x280_asm,
+	  BLIS_INVSCALV_KER,   BLIS_SCOMPLEX, bli_cinvscalv_sifive_x280_asm,
+	  BLIS_INVSCALV_KER,   BLIS_DCOMPLEX, bli_zinvscalv_sifive_x280_asm,
+
+	  BLIS_SCAL2V_KER,     BLIS_FLOAT,    bli_sscal2v_sifive_x280_intr,
+	  BLIS_SCAL2V_KER,     BLIS_DOUBLE,   bli_dscal2v_sifive_x280_intr,
+	  BLIS_SCAL2V_KER,     BLIS_SCOMPLEX, bli_cscal2v_sifive_x280_intr,
+	  BLIS_SCAL2V_KER,     BLIS_DCOMPLEX, bli_zscal2v_sifive_x280_intr,
+
+	  BLIS_SCALV_KER,      BLIS_FLOAT,    bli_sscalv_sifive_x280_intr,
+	  BLIS_SCALV_KER,      BLIS_DOUBLE,   bli_dscalv_sifive_x280_intr,
+	  BLIS_SCALV_KER,      BLIS_SCOMPLEX, bli_cscalv_sifive_x280_intr,
+	  BLIS_SCALV_KER,      BLIS_DCOMPLEX, bli_zscalv_sifive_x280_intr,
+
+	  BLIS_SETV_KER,       BLIS_FLOAT,    bli_ssetv_sifive_x280_asm,
+	  BLIS_SETV_KER,       BLIS_DOUBLE,   bli_dsetv_sifive_x280_asm,
+	  BLIS_SETV_KER,       BLIS_SCOMPLEX, bli_csetv_sifive_x280_asm,
+	  BLIS_SETV_KER,       BLIS_DCOMPLEX, bli_zsetv_sifive_x280_asm,
+
+	  BLIS_SUBV_KER,       BLIS_FLOAT,    bli_ssubv_sifive_x280_intr,
+	  BLIS_SUBV_KER,       BLIS_DOUBLE,   bli_dsubv_sifive_x280_intr,
+	  BLIS_SUBV_KER,       BLIS_SCOMPLEX, bli_csubv_sifive_x280_intr,
+	  BLIS_SUBV_KER,       BLIS_DCOMPLEX, bli_zsubv_sifive_x280_intr,
+
+	  BLIS_SWAPV_KER,      BLIS_FLOAT,    bli_sswapv_sifive_x280_asm,
+	  BLIS_SWAPV_KER,      BLIS_DOUBLE,   bli_dswapv_sifive_x280_asm,
+	  BLIS_SWAPV_KER,      BLIS_SCOMPLEX, bli_cswapv_sifive_x280_asm,
+	  BLIS_SWAPV_KER,      BLIS_DCOMPLEX, bli_zswapv_sifive_x280_asm,
+
+	  BLIS_XPBYV_KER,      BLIS_FLOAT,    bli_sxpbyv_sifive_x280_intr,
+	  BLIS_XPBYV_KER,      BLIS_DOUBLE,   bli_dxpbyv_sifive_x280_intr,
+	  BLIS_XPBYV_KER,      BLIS_SCOMPLEX, bli_cxpbyv_sifive_x280_intr,
+	  BLIS_XPBYV_KER,      BLIS_DCOMPLEX, bli_zxpbyv_sifive_x280_intr,
+
+	  // Level 1f
+	  BLIS_AXPY2V_KER,     BLIS_FLOAT,    bli_saxpy2v_sifive_x280_intr,
+	  BLIS_AXPY2V_KER,     BLIS_DOUBLE,   bli_daxpy2v_sifive_x280_intr,
+	  BLIS_AXPY2V_KER,     BLIS_SCOMPLEX, bli_caxpy2v_sifive_x280_intr,
+	  BLIS_AXPY2V_KER,     BLIS_DCOMPLEX, bli_zaxpy2v_sifive_x280_intr,
+
+	  BLIS_AXPYF_KER,      BLIS_FLOAT,    bli_saxpyf_sifive_x280_asm,
+	  BLIS_AXPYF_KER,      BLIS_DOUBLE,   bli_daxpyf_sifive_x280_asm,
+	  BLIS_AXPYF_KER,      BLIS_SCOMPLEX, bli_caxpyf_sifive_x280_asm,
+	  BLIS_AXPYF_KER,      BLIS_DCOMPLEX, bli_zaxpyf_sifive_x280_asm,
+
+	  BLIS_DOTXF_KER,      BLIS_FLOAT,    bli_sdotxf_sifive_x280_asm,
+	  BLIS_DOTXF_KER,      BLIS_DOUBLE,   bli_ddotxf_sifive_x280_asm,
+	  BLIS_DOTXF_KER,      BLIS_SCOMPLEX, bli_cdotxf_sifive_x280_asm,
+	  BLIS_DOTXF_KER,      BLIS_DCOMPLEX, bli_zdotxf_sifive_x280_asm,
+
+	  BLIS_DOTAXPYV_KER,   BLIS_FLOAT,    bli_sdotaxpyv_sifive_x280_intr,
+	  BLIS_DOTAXPYV_KER,   BLIS_DOUBLE,   bli_ddotaxpyv_sifive_x280_intr,
+	  BLIS_DOTAXPYV_KER,   BLIS_SCOMPLEX, bli_cdotaxpyv_sifive_x280_intr,
+	  BLIS_DOTAXPYV_KER,   BLIS_DCOMPLEX, bli_zdotaxpyv_sifive_x280_intr,
+
+	  BLIS_DOTXAXPYF_KER,  BLIS_FLOAT,    bli_sdotxaxpyf_sifive_x280_asm,
+	  BLIS_DOTXAXPYF_KER,  BLIS_DOUBLE,   bli_ddotxaxpyf_sifive_x280_asm,
+	  BLIS_DOTXAXPYF_KER,  BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_x280_asm,
+	  BLIS_DOTXAXPYF_KER,  BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_x280_asm,
+
+	  // Level 1m
+	  BLIS_PACKM_MRXK_KER, BLIS_FLOAT,    bli_spackm_sifive_x280_asm_7xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE,   bli_dpackm_sifive_x280_asm_7xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_x280_asm_6xk,
+	  BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_x280_asm_6xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_FLOAT,    bli_spackm_sifive_x280_asm_64xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE,   bli_dpackm_sifive_x280_asm_32xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_x280_asm_32xk,
+	  BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_x280_asm_16xk,
+
+	  // Level 3
+	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_sifive_x280_asm_7m4,
+	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_sifive_x280_asm_7m4,
+	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_sifive_x280_asm_6m2,
+	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_sifive_x280_asm_6m2,
+
+	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_sifive_x280_asm,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_sifive_x280_asm,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_x280_asm,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_x280_asm,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_sifive_x280_asm,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_sifive_x280_asm,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_x280_asm,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_x280_asm,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  BLIS_GEMM_UKR_ROW_PREF,             BLIS_FLOAT,    TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,             BLIS_DOUBLE,   TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,             BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,             BLIS_DCOMPLEX, TRUE,
+
+	  BLIS_VA_END
+	);
+
+	// Initialize level-3 blocksize objects with architecture-specific values.
+	//                                           s      d      c      z
+	bli_blksz_init     ( &blkszs[ BLIS_MR ],     7,     7,     6,     6,
+	                                             8,     8,     8,     8 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    64,    32,    32,    16 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],    28,    28,    24,    24 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  1024,  1024,  1024,  1024 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   128,   256,   128 );
+	// Default BLIS_BBM_s = 1, but set here to ensure it's correct
+	bli_blksz_init_easy( &blkszs[ BLIS_BBM ],    1,     1,     1,     1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_BBN ],    1,     1,     1,     1 );
+
+	// Update the context with the current architecture's register and cache
+	// blocksizes (and multiples) for native execution.
+	bli_cntx_set_blkszs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
+	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
+	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
+	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
+	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+
+	  // level-1m
+	  BLIS_BBM, &blkszs[ BLIS_BBM ], BLIS_BBM,
+	  BLIS_BBN, &blkszs[ BLIS_BBN ], BLIS_BBN,
+
+	  BLIS_VA_END
+	);
+}
+
diff --git a/config/sifive_x280/bli_family_sifive_x280.h b/config/sifive_x280/bli_family_sifive_x280.h
new file mode 100644
index 000000000..4f02c048f
--- /dev/null
+++ b/config/sifive_x280/bli_family_sifive_x280.h
@@ -0,0 +1,34 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
diff --git a/config/sifive_x280/bli_kernel_defs_sifive_x280.h b/config/sifive_x280/bli_kernel_defs_sifive_x280.h
new file mode 100644
index 000000000..bb6865a66
--- /dev/null
+++ b/config/sifive_x280/bli_kernel_defs_sifive_x280.h
@@ -0,0 +1,55 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+#define BLIS_MR_s   7
+#define BLIS_MR_d   7
+#define BLIS_MR_c   6
+#define BLIS_MR_z   6
+
+#define BLIS_PACKMR_s   8
+#define BLIS_PACKMR_d   8
+#define BLIS_PACKMR_c   8
+#define BLIS_PACKMR_z   8
+
+#define BLIS_NR_s   64
+#define BLIS_NR_d   32
+#define BLIS_NR_c   32
+#define BLIS_NR_z   16
+//#endif
+
diff --git a/config/sifive_x280/make_defs.mk b/config/sifive_x280/make_defs.mk
new file mode 100644
index 000000000..acdf5a361
--- /dev/null
+++ b/config/sifive_x280/make_defs.mk
@@ -0,0 +1,78 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2023, SiFive, Inc.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+
+# Declare the name of the current configuration. (Adding it to the running
+# list of configurations included by common.mk is commented out below.)
+THIS_CONFIG    := sifive_x280
+#CONFIGS_INCL   += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CMISCFLAGS_SIFIVE := -mcmodel=medany -march=rv64gcv_zba_zbb_zvl512b -mabi=lp64d
+CPPROCFLAGS    :=
+CMISCFLAGS     := $(CMISCFLAGS_SIFIVE) -fdata-sections -ffunction-sections \
+                  -fdiagnostics-color=always -fno-rtti -fno-exceptions
+CPICFLAGS      := -fPIC
+CWARNFLAGS     := -Wall -Wextra -Wno-unused-function -Wno-unused-parameter \
+                  -Wno-sign-compare -Wno-unused-variable
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -Ofast
+endif
+
+# Flags specific to optimized kernels.
+CKOPTFLAGS     := $(COPTFLAGS)
+CKVECFLAGS     :=
+
+# Flags specific to reference kernels.
+CROPTFLAGS     := $(CKOPTFLAGS)
+CRVECFLAGS     := $(CKVECFLAGS)
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
diff --git a/config_registry b/config_registry
index 44bb069c9..8c1f6f254 100644
--- a/config_registry
+++ b/config_registry
@@ -61,5 +61,8 @@ rv64i:       rv64i/rvi
 rv32iv:      rv32iv/rviv
 rv64iv:      rv64iv/rviv
 
+# SiFive architectures.
+sifive_x280: sifive_x280
+
 # Generic architectures.
 generic:     generic
diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c
index 111b27e20..a8061f933 100644
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -286,6 +286,11 @@ arch_t bli_arch_query_id_impl( void )
 		id = BLIS_ARCH_RV64IV;
 		#endif
 
+		// SiFive microarchitectures.
+		#ifdef BLIS_FAMILY_SIFIVE_X280
+		id = BLIS_ARCH_SIFIVE_X280;
+		#endif
+
 		// Generic microarchitecture.
 		#ifdef BLIS_FAMILY_GENERIC
 		id = BLIS_ARCH_GENERIC;
@@ -351,6 +356,8 @@ static const char* config_name[ BLIS_NUM_ARCHS ] =
     "rv32iv",
     "rv64iv",
 
+    "sifive_x280",
+
     "generic"
 };
 
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index 7b9ab3d7c..a21aa1244 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -259,6 +259,14 @@ int bli_gks_init( void )
 		                                              bli_cntx_init_rv64iv_ind );
 #endif
 
+		// -- SiFive architectures ----------------------------------------------
+
+#ifdef BLIS_CONFIG_SIFIVE_X280
+		bli_gks_register_cntx( BLIS_ARCH_SIFIVE_X280, bli_cntx_init_sifive_x280,
+		                                              bli_cntx_init_sifive_x280_ref,
+		                                              bli_cntx_init_sifive_x280_ind );
+#endif
+
 		// -- Generic architectures --------------------------------------------
 
 #ifdef BLIS_CONFIG_GENERIC
diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h
index f8e18c5c1..361e9663d 100644
--- a/frame/include/bli_arch_config.h
+++ b/frame/include/bli_arch_config.h
@@ -156,6 +156,12 @@ CNTX_INIT_PROTS( rv32iv )
 CNTX_INIT_PROTS( rv64iv )
 #endif
 
+// -- SiFive architectures --
+
+#ifdef BLIS_CONFIG_SIFIVE_X280
+CNTX_INIT_PROTS( sifive_x280 )
+#endif
+
 // -- Generic --
 
 #ifdef BLIS_CONFIG_GENERIC
@@ -295,6 +301,12 @@ CNTX_INIT_PROTS( generic )
 #include "bli_family_bgq.h"
 #endif
 
+// -- SiFive families --
+
+#ifdef BLIS_FAMILY_SIFIVE_X280
+#include "bli_family_sifive_x280.h"
+#endif
+
 // -- Generic --
 
 #ifdef BLIS_FAMILY_GENERIC
@@ -386,6 +398,12 @@ CNTX_INIT_PROTS( generic )
 #include "bli_kernels_rviv.h"
 #endif
 
+// -- SiFive RISC-V architectures --
+
+#ifdef BLIS_KERNELS_SIFIVE_X280
+#include "bli_kernels_sifive_x280.h"
+#endif
+
 
 #endif
 
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index 60c55a5ed..2f81a4749 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -975,6 +975,9 @@ typedef enum
 	BLIS_ARCH_RV32IV,
 	BLIS_ARCH_RV64IV,
 
+	// SiFive
+	BLIS_ARCH_SIFIVE_X280,
+
 	// Generic architecture/configuration
 	BLIS_ARCH_GENERIC,
 
diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c
new file mode 100644
index 000000000..2b7ad6fe7
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c
@@ -0,0 +1,118 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <stdint.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define ADDV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##addv_sifive_x280_intr(\
+          conj_t           conjx,         \
+          dim_t            n,             \
+    const T*      restrict x_, inc_t incx, \
+          T*      restrict y_, inc_t incy, \
+    const cntx_t*          cntx           \
+) // Expands to the kernel's function header only; each included *_real.c / *_complex.c file supplies the body.
+
+#define ADDV(...)  ADDV_(__VA_ARGS__) // Extra expansion level so PRECISION_CHAR is replaced by s/d/c/z before ADDV_ pastes it with ##.
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_addv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_addv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_addv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_addv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef ADDV
+#undef ADDV_
diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..d5343befe
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c
@@ -0,0 +1,89 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef ADDV
+
+ADDV(PRECISION_CHAR, void)
+{
+    // Computes y := y + conjx(x) for n complex elements; x_/y_ arrive as void* and are viewed as DATATYPE.
+    (void) cntx; // context is not consulted by this kernel
+    const DATATYPE* restrict x = x_;
+    DATATYPE* restrict y = y_;
+
+    if (n <= 0) return; // nothing to do for empty (or negative) lengths
+
+    size_t avl = n; // elements still to be processed
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl); // elements handled this pass (macro comes from riscv_overloaded_intrinsics.h)
+        RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec; // two-field tuples; field 0 is used as real, field 1 as imaginary below
+        RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+        if (incx == 1) // contiguous x: segment load without a stride
+            xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, vl);
+        else // strided x: byte stride is 2*FLT_SIZE (one complex element) times incx
+            xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+        
+        if (incy == 1) // same contiguous/strided split for y
+            yvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, vl);
+        else
+            yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+        xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+        xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+        yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+        yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+        yvec_real = VFADD_VV(PREC, LMUL)(yvec_real, xvec_real, vl); // real parts always add
+        if (conjx == BLIS_NO_CONJUGATE)
+            yvec_imag = VFADD_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl); // y.imag += x.imag
+        else
+            yvec_imag = VFSUB_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl); // conjugated x: y.imag -= x.imag
+
+        yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real); // repack the updated parts into the tuple
+        yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag);
+
+        if (incy == 1) // store back with the same addressing mode used for the y load
+            VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl);
+        else
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl);
+        
+        x += vl*incx; // advance both pointers by the elements just processed
+        y += vl*incy;
+        avl -= vl;
+    }
+
+}
+
+#endif // ADDV
diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c
new file mode 100644
index 000000000..d4e7d4a45
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c
@@ -0,0 +1,78 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef ADDV
+
+ADDV(PRECISION_CHAR, void)
+{
+    // Computes y = y + conjx(x)
+    //           == y +   x       (real case: conjugation has no effect)
+    
+    (void) cntx; // context is not consulted by this kernel
+    (void) conjx; // Suppress unused parameter warnings
+    const DATATYPE* restrict x = x_;
+    DATATYPE* restrict y = y_;
+
+    if (n <= 0) return; // nothing to do for empty (or negative) lengths
+
+    size_t avl = n; // elements still to be processed
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl); // elements handled this pass (macro comes from riscv_overloaded_intrinsics.h)
+        RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+        if (incx == 1) // contiguous x
+            xvec = VLE_V_F(PREC, LMUL) (x, vl);
+        else // strided x: stride is passed in bytes
+            xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+        if (incy == 1) // same contiguous/strided split for y
+            yvec = VLE_V_F(PREC, LMUL) (y, vl);
+        else
+            yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+        yvec = VFADD_VV(PREC, LMUL)(yvec, xvec, vl); // y += x for this chunk
+
+        if (incy == 1) // store back with the same addressing mode used for the y load
+            VSE_V_F(PREC, LMUL) (y, yvec, vl);
+        else
+            VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl);
+
+        x += vl * incx; // advance both pointers by the elements just processed
+        y += vl * incy;
+        avl -= vl;
+    }
+}
+
+#endif // ADDV
diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c
new file mode 100644
index 000000000..c423dd131
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c
@@ -0,0 +1,293 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+void bli_samaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
+                     dim_t *index, const cntx_t *cntx) {
+    // Stores in *index the position of the largest |x[i]| (lowest index on ties; the first NaN, if any, wins). Assumes dim_t is 64 bits.
+    (void)cntx;
+    const float* restrict x = x_;
+
+    if (n <= 1) { // n <= 0 and n == 1 both report index 0
+        *index = 0;
+        return;
+    }
+    incx *= 4; // element stride -> byte stride (sizeof(float))
+    size_t avl = n; // elements still to be processed
+    size_t offset = 0; // index of the first element of the current chunk
+    bool first = true;
+    while (avl) {
+        size_t vl;
+        __asm__ volatile("vsetvli %0, %1, e32, m4, tu, ma"
+                         : "=r"(vl)
+                         : "r"(avl));
+        if (incx == 4) // the loads read x[] through a bare register, so they carry a "memory" clobber
+            __asm__("vle32.v v24, (%0)" : : "r"(x) : "memory");
+        else
+            __asm__("vlse32.v v24, (%0), %1" : : "r"(x), "r"(incx) : "memory");
+        // check for NaN (x != x only for NaN); vfirst.m yields -1 when no lane is set
+        __asm__ volatile("vmfne.vv v0, v24, v24");
+        dim_t nan_index;
+        __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index));
+        if (nan_index != -1) {
+            *index = nan_index + offset;
+            return;
+        }
+        if (first) {
+            __asm__("vfabs.v v8, v24"); // v8 = running per-lane max of |x|
+            // keep vl same, change SEW and LMUL
+            __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma");
+            __asm__("vid.v v16"); // v16 = per-lane index of that max (64-bit lanes)
+            first = false;
+        } else {
+            __asm__("vfabs.v v24, v24");
+            __asm__("vmflt.vv v0, v8, v24"); // strict <, so the earlier index survives ties
+            __asm__("vmerge.vvm v8, v8, v24, v0");
+            // keep vl same, change SEW and LMUL
+            __asm__ volatile("vsetvli zero, zero, e64, m8, tu, ma");
+            __asm__("vid.v v24");
+            __asm__("vadd.vx v24, v24, %0" : : "r"(offset)); // lane id -> global element index
+            __asm__("vmerge.vvm v16, v16, v24, v0");
+        }
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); // advance x by vl elements (byte stride)
+        offset += vl;
+        avl -= vl;
+    }
+    __asm__ volatile("vsetvli zero, %0, e32, m4, ta, ma" : : "r"(n));
+    __asm__("vmv.s.x v0, zero");
+    __asm__("vfredmax.vs v0, v8, v0"); // v0[0] = overall max |x|
+    __asm__("vrgather.vi v24, v0, 0"); // broadcast it to every lane
+    __asm__("vmfeq.vv v0, v8, v24"); // mask of lanes that hold the max
+    __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma");
+    uint64_t imax = -1; // UINT64_MAX: identity for the unsigned-min reduction
+    __asm__("vmv.s.x v24, %0" : : "r"(imax));
+    __asm__("vredminu.vs v24, v16, v24, v0.t"); // smallest index among the max lanes
+    __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma");
+    __asm__("vse64.v v24, (%0)" : : "r"(index) : "memory"); // writes *index; clobber makes the store visible to the compiler
+    return;
+}
+
+void bli_damaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
+                     dim_t *index, const cntx_t *cntx) {
+    // Stores in *index the position of the largest |x[i]| (lowest index on ties; the first NaN, if any, wins). Assumes dim_t is 64 bits.
+    (void)cntx;
+    const double* restrict x = x_;
+
+    if (n <= 1) { // n <= 0 and n == 1 both report index 0
+        *index = 0;
+        return;
+    }
+    incx *= 8; // element stride -> byte stride (sizeof(double))
+    size_t avl = n; // elements still to be processed
+    size_t offset = 0; // index of the first element of the current chunk
+    bool first = true;
+    while (avl) {
+        size_t vl;
+        __asm__ volatile("vsetvli %0, %1, e64, m8, tu, ma"
+                         : "=r"(vl)
+                         : "r"(avl));
+        if (incx == 8) // the loads read x[] through a bare register, so they carry a "memory" clobber
+            __asm__("vle64.v v24, (%0)" : : "r"(x) : "memory");
+        else
+            __asm__("vlse64.v v24, (%0), %1" : : "r"(x), "r"(incx) : "memory");
+        // check for NaN (x != x only for NaN); vfirst.m yields -1 when no lane is set
+        __asm__ volatile("vmfne.vv v0, v24, v24");
+        dim_t nan_index;
+        __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index));
+        if (nan_index != -1) {
+            *index = nan_index + offset;
+            return;
+        }
+        if (first) {
+            __asm__("vfabs.v v8, v24"); // v8 = running per-lane max of |x|
+            __asm__("vid.v v16"); // v16 = per-lane index of that max
+            first = false;
+        } else {
+            __asm__("vfabs.v v24, v24");
+            __asm__("vmflt.vv v0, v8, v24"); // strict <, so the earlier index survives ties
+            __asm__("vmerge.vvm v8, v8, v24, v0");
+            __asm__("vid.v v24");
+            __asm__("vadd.vx v24, v24, %0" : : "r"(offset)); // lane id -> global element index
+            __asm__("vmerge.vvm v16, v16, v24, v0");
+        }
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); // advance x by vl elements (byte stride)
+        offset += vl;
+        avl -= vl;
+    }
+    __asm__ volatile("vsetvli zero, %0, e64, m8, ta, ma" : : "r"(n));
+    __asm__("vmv.s.x v0, zero");
+    __asm__("vfredmax.vs v0, v8, v0"); // v0[0] = overall max |x|
+    __asm__("vrgather.vi v24, v0, 0"); // broadcast it to every lane
+    __asm__("vmfeq.vv v0, v8, v24"); // mask of lanes that hold the max
+    uint64_t imax = -1; // UINT64_MAX: identity for the unsigned-min reduction
+    __asm__("vmv.s.x v24, %0" : : "r"(imax));
+    __asm__("vredminu.vs v24, v16, v24, v0.t"); // smallest index among the max lanes
+    __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma");
+    __asm__("vse64.v v24, (%0)" : : "r"(index) : "memory"); // writes *index; clobber makes the store visible to the compiler
+    return;
+}
+
+void bli_camaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
+                     dim_t *index, const cntx_t *cntx) {
+    // Stores in *index the position of the largest |re|+|im| (lowest index on ties; the first NaN, if any, wins). Assumes dim_t is 64 bits.
+    (void)cntx;
+    const scomplex* restrict x = x_;
+
+    if (n <= 1) { // n <= 0 and n == 1 both report index 0
+        *index = 0;
+        return;
+    }
+    incx *= 8; // element stride -> byte stride (two floats per element)
+    size_t avl = n; // elements still to be processed
+    size_t offset = 0; // index of the first element of the current chunk
+    bool first = true;
+    while (avl) {
+        size_t vl;
+        __asm__ volatile("vsetvli %0, %1, e32, m4, tu, ma"
+                         : "=r"(vl)
+                         : "r"(avl));
+        if (incx == 8) // the loads read x[] through a bare register, so they carry a "memory" clobber
+            __asm__("vlseg2e32.v v24, (%0)" : : "r"(x) : "memory");
+        else
+            __asm__("vlsseg2e32.v v24, (%0), %1" : : "r"(x), "r"(incx) : "memory");
+        __asm__("vfabs.v v24, v24"); // segment load put real parts in v24.., imaginary parts in v28..
+        __asm__("vfabs.v v28, v28");
+        __asm__("vfadd.vv v24, v24, v28"); // v24 = |re| + |im|
+        // check for NaN (a sum != itself only for NaN); vfirst.m yields -1 when no lane is set
+        __asm__ volatile("vmfne.vv v0, v24, v24");
+        dim_t nan_index;
+        __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index));
+        if (nan_index != -1) {
+            *index = nan_index + offset;
+            return;
+        }
+        if (first) {
+            __asm__("vmv4r.v v8, v24"); // v8 = running per-lane max of |re|+|im|
+            // keep vl same, change SEW and LMUL
+            __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma");
+            __asm__("vid.v v16"); // v16 = per-lane index of that max (64-bit lanes)
+            first = false;
+        } else {
+            __asm__("vmflt.vv v0, v8, v24"); // strict <, so the earlier index survives ties
+            __asm__("vmerge.vvm v8, v8, v24, v0");
+            // keep vl same, change SEW and LMUL
+            __asm__ volatile("vsetvli zero, zero, e64, m8, tu, ma");
+            __asm__("vid.v v24");
+            __asm__("vadd.vx v24, v24, %0" : : "r"(offset)); // lane id -> global element index
+            __asm__("vmerge.vvm v16, v16, v24, v0");
+        }
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); // advance x by vl elements (byte stride)
+        offset += vl;
+        avl -= vl;
+    }
+    __asm__ volatile("vsetvli zero, %0, e32, m4, ta, ma" : : "r"(n));
+    __asm__("vmv.s.x v0, zero");
+    __asm__("vfredmax.vs v0, v8, v0"); // v0[0] = overall max
+    __asm__("vrgather.vi v24, v0, 0"); // broadcast it to every lane
+    __asm__("vmfeq.vv v0, v8, v24"); // mask of lanes that hold the max
+    __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma");
+    uint64_t imax = -1; // UINT64_MAX: identity for the unsigned-min reduction
+    __asm__("vmv.s.x v24, %0" : : "r"(imax));
+    __asm__("vredminu.vs v24, v16, v24, v0.t"); // smallest index among the max lanes
+    __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma");
+    __asm__("vse64.v v24, (%0)" : : "r"(index) : "memory"); // writes *index; clobber makes the store visible to the compiler
+    return;
+}
+
+void bli_zamaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
+                     dim_t *index, const cntx_t *cntx) {
+    // Stores in *index the position of the largest |re|+|im| (lowest index on ties; the first NaN, if any, wins). Assumes dim_t is 64 bits.
+    (void)cntx;
+    const dcomplex* restrict x = x_;
+
+    if (n <= 1) { // n <= 0 and n == 1 both report index 0
+        *index = 0;
+        return;
+    }
+    incx *= 16; // element stride -> byte stride (two doubles per element)
+    size_t avl = n; // elements still to be processed
+    size_t offset = 0; // index of the first element of the current chunk
+    bool first = true;
+    while (avl) {
+        size_t vl;
+        __asm__ volatile("vsetvli %0, %1, e64, m4, tu, ma"
+                         : "=r"(vl)
+                         : "r"(avl));
+        if (incx == 16) // the loads read x[] through a bare register, so they carry a "memory" clobber
+            __asm__("vlseg2e64.v v24, (%0)" : : "r"(x) : "memory");
+        else
+            __asm__("vlsseg2e64.v v24, (%0), %1" : : "r"(x), "r"(incx) : "memory");
+        __asm__("vfabs.v v24, v24"); // segment load put real parts in v24.., imaginary parts in v28..
+        __asm__("vfabs.v v28, v28");
+        __asm__("vfadd.vv v24, v24, v28"); // v24 = |re| + |im|
+        // check for NaN (a sum != itself only for NaN); vfirst.m yields -1 when no lane is set
+        __asm__ volatile("vmfne.vv v0, v24, v24");
+        dim_t nan_index;
+        __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index));
+        if (nan_index != -1) {
+            *index = nan_index + offset;
+            return;
+        }
+        if (first) {
+            __asm__("vmv4r.v v8, v24"); // v8 = running per-lane max of |re|+|im|
+            __asm__("vid.v v16"); // v16 = per-lane index of that max
+            first = false;
+        } else {
+            __asm__("vmflt.vv v0, v8, v24"); // strict <, so the earlier index survives ties
+            __asm__("vmerge.vvm v8, v8, v24, v0");
+            __asm__("vid.v v24");
+            __asm__("vadd.vx v24, v24, %0" : : "r"(offset)); // lane id -> global element index
+            __asm__("vmerge.vvm v16, v16, v24, v0");
+        }
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); // advance x by vl elements (byte stride)
+        offset += vl;
+        avl -= vl;
+    }
+    __asm__ volatile("vsetvli zero, %0, e64, m4, ta, ma" : : "r"(n));
+    __asm__("vmv.s.x v0, zero");
+    __asm__("vfredmax.vs v0, v8, v0"); // v0[0] = overall max
+    __asm__("vrgather.vi v24, v0, 0"); // broadcast it to every lane
+    __asm__("vmfeq.vv v0, v8, v24"); // mask of lanes that hold the max
+    uint64_t imax = -1; // UINT64_MAX: identity for the unsigned-min reduction
+    __asm__("vmv.s.x v24, %0" : : "r"(imax));
+    __asm__("vredminu.vs v24, v16, v24, v0.t"); // smallest index among the max lanes
+    __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma");
+    __asm__("vse64.v v24, (%0)" : : "r"(index) : "memory"); // writes *index; clobber makes the store visible to the compiler
+    return;
+}
diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c
new file mode 100644
index 000000000..3b29f898d
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c
@@ -0,0 +1,129 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <stdint.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define AXPBYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpbyv_sifive_x280_intr(\
+          conj_t           conjx,          \
+          dim_t            n,              \
+    const T*      restrict alpha_,         \
+    const T*      restrict x_, inc_t incx, \
+    const T*      restrict beta_,          \
+          T*      restrict y_, inc_t incy, \
+    const cntx_t*          cntx            \
+) // Expands to the kernel's function header only; each included *_real.c / *_complex.c file supplies the body.
+
+#define AXPBYV(...)  AXPBYV_(__VA_ARGS__) // Extra expansion level so PRECISION_CHAR is replaced by s/d/c/z before ## pasting.
+
+#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm // Name builders for sibling kernels the axpbyv bodies may delegate to.
+#define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR)
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm
+#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
+#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr
+#define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR)
+#define SCAL2V_(PRECISION_CHAR) bli_##PRECISION_CHAR##scal2v_sifive_x280_intr
+#define SCAL2V(PRECISION_CHAR) SCAL2V_(PRECISION_CHAR) // NOTE(review): these eight helper macros are not #undef'd at the end of this file, unlike AXPBYV/AXPBYV_.
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpbyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpbyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpbyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpbyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef AXPBYV
+#undef AXPBYV_
diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..31fc584b9
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c
@@ -0,0 +1,121 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPBYV
+
+AXPBYV(PRECISION_CHAR, void)
+{
+    // Computes y := beta * y + alpha * conjx(x) for n complex elements
+    
+    if (n <= 0) return; // nothing to do for empty (or negative) lengths
+
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict beta = beta_;
+    const DATATYPE* restrict x = x_;
+    DATATYPE* restrict y = y_;
+
+    if (alpha->real == 0 && alpha->imag == 0 && beta->real == 0 && beta->imag == 0){ // alpha == beta == 0: hand off to setv with alpha (zero) as the value
+        SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, y, incy, cntx);
+        return;
+    }
+    if (alpha->real == 0 && alpha->imag == 0){ // alpha == 0: x does not contribute, y := beta * y
+        SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, beta, y, incy, cntx);
+        return;
+    }
+    if (beta->real == 0 && beta->imag == 0){ // beta == 0: old y is not read here, y := alpha * conjx(x)
+        SCAL2V(PRECISION_CHAR)(conjx, n, alpha, x, incx, y, incy, cntx);
+        return;
+    }
+
+    // Note: in the cases alpha = 0 && beta = 1, or alpha = 1 && beta = 0, we
+    // will canonicalize NaNs whereas the reference code will propagate NaN payloads.
+
+    // TODO (optimization): special cases for alpha = +-1, +-i, beta = +-1, +-i
+
+    // alpha and beta are both nonzero
+    size_t avl = n; // elements still to be processed
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl); // elements handled this pass (macro comes from riscv_overloaded_intrinsics.h)
+        RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec; // two-field tuples; field 0 is used as real, field 1 as imaginary below
+        RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag, temp_real, temp_imag;
+
+        if (incx == 1) // contiguous x: segment load without a stride
+            xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+        else // strided x: byte stride is 2*FLT_SIZE (one complex element) times incx
+            xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+        
+        if (incy == 1) // same contiguous/strided split for y
+            yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+        else
+            yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+        xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+        xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+        yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+        yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+        // Computed as:
+        // y.real = beta.real * y.real - beta.imag * y.imag + alpha.real * x.real - alpha.imag * conj(x.imag)
+        // y.imag = beta.real * y.imag + beta.imag * y.real + alpha.imag * x.real + alpha.real * conj(x.imag)
+        temp_real = VFMUL_VF(PREC, LMUL)  (yvec_real, beta->real, vl); // beta.real * y.real
+        temp_imag = VFMUL_VF(PREC, LMUL)  (yvec_imag, beta->real, vl); // beta.real * y.imag
+        temp_real = VFNMSAC_VF(PREC, LMUL)(temp_real, beta->imag, yvec_imag, vl); // ... - beta.imag * y.imag
+        temp_imag = VFMACC_VF(PREC, LMUL) (temp_imag, beta->imag, yvec_real, vl); // ... + beta.imag * y.real
+        yvec_real = VFMACC_VF(PREC, LMUL) (temp_real, alpha->real, xvec_real, vl); // ... + alpha.real * x.real
+        yvec_imag = VFMACC_VF(PREC, LMUL) (temp_imag, alpha->imag, xvec_real, vl); // ... + alpha.imag * x.real
+        if (conjx == BLIS_NO_CONJUGATE) { // x.imag terms enter with their natural signs
+            yvec_real = VFNMSAC_VF(PREC, LMUL)(yvec_real, alpha->imag, xvec_imag, vl);
+            yvec_imag = VFMACC_VF(PREC, LMUL) (yvec_imag, alpha->real, xvec_imag, vl);
+        } else { // conjugated x: both x.imag terms flip sign
+            yvec_real = VFMACC_VF(PREC, LMUL) (yvec_real, alpha->imag, xvec_imag, vl);
+            yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl);
+        }
+
+        yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real); // repack the updated parts into the tuple
+        yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag);
+
+        if (incy == 1) // store back with the same addressing mode used for the y load
+            VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl);
+        else
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl);
+
+        x += vl*incx; // advance both pointers by the elements just processed
+        y += vl*incy;
+        avl -= vl;
+    }
+
+}
+
+#endif // AXPBYV
diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c
new file mode 100644
index 000000000..33eafc5d1
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c
@@ -0,0 +1,98 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPBYV
+
+AXPBYV(PRECISION_CHAR, void)
+{
+    // Real-domain axpbyv kernel: computes y := beta * y + alpha * conjx(x)
+    //            == beta * y + alpha *    x        (real case, conjx is a no-op)
+    (void) conjx; // Unused for real data; suppress unused parameter warnings
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict beta = beta_;
+    const DATATYPE* restrict x = x_;
+    DATATYPE* restrict y = y_;
+
+    if (n <= 0) return; // Nothing to do for an empty vector
+    
+    if (*alpha == 0 && *beta == 0){ // Both terms vanish: hand off to SETV with alpha (== 0) as the scalar
+        SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, y, incy, cntx);
+        return;
+    }
+    if (*alpha == 0){ // Only the beta * y term remains: hand off to SCALV
+        SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, beta, y, incy, cntx);
+        return;
+    }
+    if (*beta == 0){ // Only the alpha * x term remains: hand off to SCAL2V
+        SCAL2V(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, x, incx, y, incy, cntx);
+        return;
+    }
+
+    // Note: in the cases alpha = 0 && beta = 1, or alpha = 1 && beta = 0, we 
+    // will canonicalize NaNs whereas the reference code will propagate NaN payloads.
+
+    // TODO (optimization): special cases for alpha = +-1, beta = +-1
+
+    // General case: alpha and beta are both nonzero; process vl elements per pass
+    size_t avl = n; // Elements still to process
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl); // Elements handled in this pass
+        RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+        if (incx == 1) // Unit stride: contiguous load
+            xvec = VLE_V_F(PREC, LMUL)(x, vl);
+        else // Strided load; incx is scaled by FLT_SIZE (expected from the including file)
+            xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+        
+        if (incy == 1)
+            yvec = VLE_V_F(PREC, LMUL)(y, vl);
+        else
+            yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+        
+        yvec = VFMUL_VF(PREC, LMUL) (yvec, *beta, vl); // yvec  = beta * y
+        yvec = VFMACC_VF(PREC, LMUL)(yvec, *alpha, xvec, vl); // yvec += alpha * x
+        
+        if (incy == 1) // Write the result back with the same stride it was read with
+            VSE_V_F(PREC, LMUL)(y, yvec, vl);
+        else
+            VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl);
+        
+        x += vl*incx; // Step both vectors past the elements just processed
+        y += vl*incy;
+        avl -= vl;
+    }
+}
+
+#endif // AXPBYV
diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c
new file mode 100644
index 000000000..3f9ebd3b0
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c
@@ -0,0 +1,119 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <stdint.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define AXPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpyv_sifive_x280_intr(\
+          conj_t           conjx,         \
+          dim_t            n,             \
+    const T*      restrict alpha_,         \
+    const T*      restrict x_, inc_t incx, \
+          T*      restrict y_, inc_t incy, \
+    const cntx_t*          cntx           \
+)
+
+#define AXPYV(...)  AXPYV_(__VA_ARGS__) // Extra expansion step so PRECISION_CHAR is expanded before ## pasting
+
+// Single precision real: instantiates bli_saxpyv_sifive_x280_intr
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real: instantiates bli_daxpyv_sifive_x280_intr
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex: instantiates bli_caxpyv_sifive_x280_intr
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4 // The complex kernel holds 2-field (real, imag) tuples of this LMUL
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex: instantiates bli_zaxpyv_sifive_x280_intr
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4 // The complex kernel holds 2-field (real, imag) tuples of this LMUL
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef AXPYV // The template macros are local to this file
+#undef AXPYV_
diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..dc520d212
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c
@@ -0,0 +1,94 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPYV
+
+AXPYV(PRECISION_CHAR, void)
+{
+    // Complex-domain axpyv kernel: computes y := y + alpha * conjx(x)
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict x = x_;
+    DATATYPE* restrict y = y_;
+    
+    if (n <= 0) return; // Nothing to do for an empty vector
+    if (alpha->real == 0 && alpha->imag == 0) return; // alpha == 0 leaves y unchanged
+
+    size_t avl = n; // Complex elements still to process
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl); // Complex elements handled in this pass
+        RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec; // 2-field tuples: field 0 = real parts, field 1 = imaginary parts
+        RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+        if (incx == 1) // Segment loads split interleaved (real, imag) pairs into the two fields
+            xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+        else // Strided variant; the byte stride of one complex element is 2*FLT_SIZE
+            xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+        
+        if (incy == 1)
+            yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+        else
+            yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+        xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+        xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+        yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+        yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+        yvec_real = VFMACC_VF(PREC, LMUL)( yvec_real, alpha->real, xvec_real, vl); // y.real += alpha.real * x.real
+        yvec_imag = VFMACC_VF(PREC, LMUL)( yvec_imag, alpha->imag, xvec_real, vl); // y.imag += alpha.imag * x.real
+        if (conjx == BLIS_NO_CONJUGATE){
+            yvec_real = VFNMSAC_VF(PREC, LMUL)(yvec_real, alpha->imag, xvec_imag, vl); // y.real -= alpha.imag * x.imag
+            yvec_imag = VFMACC_VF(PREC, LMUL) (yvec_imag, alpha->real, xvec_imag, vl); // y.imag += alpha.real * x.imag
+        } else { // Conjugated x: the x.imag contributions change sign
+            yvec_real = VFMACC_VF(PREC, LMUL) (yvec_real, alpha->imag, xvec_imag, vl); // y.real += alpha.imag * x.imag
+            yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl); // y.imag -= alpha.real * x.imag
+        }
+
+        yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real); // Repack the updated parts into the tuple
+        yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag);
+
+        if (incy == 1) // Segment stores re-interleave (real, imag) on the way out
+            VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl);
+        else
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl);
+        
+        x += vl*incx; // Step both vectors past the elements just processed
+        y += vl*incy;
+        avl -= vl;
+    }
+
+}
+
+#endif // AXPYV
diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c
new file mode 100644
index 000000000..0c2cda842
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c
@@ -0,0 +1,79 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPYV
+
+AXPYV(PRECISION_CHAR, void)
+{
+    // Real-domain axpyv kernel: computes y := y + alpha * conjx(x)
+    //           == y + alpha * x       (real case, conjx is a no-op)
+    
+    (void) conjx; // Unused for real data; suppress unused parameter warnings
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict x = x_;
+    DATATYPE* restrict y = y_;
+
+    if (n <= 0) return; // Nothing to do for an empty vector
+    if (*alpha == 0) return; // alpha == 0 leaves y unchanged
+
+    size_t avl = n; // Elements still to process
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl); // Elements handled in this pass
+        RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+        if (incx == 1) // Unit stride: contiguous load
+            xvec = VLE_V_F(PREC, LMUL) (x, vl);
+        else // Strided load with a byte stride of FLT_SIZE * incx
+            xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+        if (incy == 1)
+            yvec = VLE_V_F(PREC, LMUL) (y, vl);
+        else
+            yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+        yvec = VFMACC_VF(PREC, LMUL)(yvec, *alpha, xvec, vl); // yvec += alpha * x
+
+        if (incy == 1) // Write the result back with the same stride it was read with
+            VSE_V_F(PREC, LMUL) (y, yvec, vl);
+        else
+            VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl);
+
+        x += vl * incx; // Step both vectors past the elements just processed
+        y += vl * incy;
+        avl -= vl;
+    }
+}
+
+#endif // AXPYV
diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c
new file mode 100644
index 000000000..357187775
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c
@@ -0,0 +1,272 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_scopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
+                     void * restrict y_, inc_t incy, const cntx_t *cntx) { // copies n floats: y := x
+    (void)conjx; // conjugation does not apply to real data
+    (void)cntx;
+    const float* restrict x = x_;
+    float* restrict y = y_;
+    if (n <= 0)
+        return;
+
+    incx *= FLT_SIZE; // from here on incx/incy are strides in bytes
+    incy *= FLT_SIZE;
+    size_t avl = n; // elements still to copy
+    while (avl) { // NOTE(review): the asm below names v0 without listing it as a clobber - confirm this is safe
+        size_t vl;
+        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" // vl = elements handled this pass; SEW = 8 * FLT_SIZE bits
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        if (incx == FLT_SIZE) // unit stride: contiguous load into v0
+            __asm__(VLE "v0, (%0)" : : "r"(x));
+        else // otherwise a strided load using the byte stride
+            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+
+        if (incy == FLT_SIZE) // store v0 to y, contiguous or strided
+            __asm__(VSE "v0, (%0)" : : "r"(y));
+        else
+            __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); // advance by a byte count, bypassing C pointer scaling
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+        avl -= vl;
+    }
+    return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_dcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
+                     void * restrict y_, inc_t incy, const cntx_t *cntx) { // copies n doubles: y := x
+    (void)conjx; // conjugation does not apply to real data
+    (void)cntx;
+    const double* restrict x = x_;
+    double* restrict y = y_;
+    if (n <= 0)
+        return;
+
+    incx *= FLT_SIZE; // from here on incx/incy are strides in bytes
+    incy *= FLT_SIZE;
+    size_t avl = n; // elements still to copy
+    while (avl) { // NOTE(review): the asm below names v0 without listing it as a clobber - confirm this is safe
+        size_t vl;
+        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" // vl = elements handled this pass; SEW = 8 * FLT_SIZE bits
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        if (incx == FLT_SIZE) // unit stride: contiguous load into v0
+            __asm__(VLE "v0, (%0)" : : "r"(x));
+        else // otherwise a strided load using the byte stride
+            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+
+        if (incy == FLT_SIZE) // store v0 to y, contiguous or strided
+            __asm__(VSE "v0, (%0)" : : "r"(y));
+        else
+            __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); // advance by a byte count, bypassing C pointer scaling
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+        avl -= vl;
+    }
+    return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+
+void bli_ccopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
+                     void * restrict y_, inc_t incy, const cntx_t *cntx) { // y := conjx(x) for n scomplex elements
+    (void)cntx;
+    const scomplex* restrict x = x_;
+    scomplex* restrict y = y_;
+    if (n <= 0)
+        return;
+
+    incx *= 2 * FLT_SIZE; // byte strides; one complex element spans 2 * FLT_SIZE bytes
+    incy *= 2 * FLT_SIZE;
+    if (conjx == BLIS_NO_CONJUGATE) { // plain copy: each (real, imag) pair is moved as a single 64-bit element
+        size_t avl = n; // complex elements still to copy
+        while (avl) {
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" // SEW = 8 * 2 * FLT_SIZE = 64 bits, matching the 64-bit VLE/VSE mnemonics
+                             : "=r"(vl)
+                             : "r"(avl), "i"(8 * 2 * FLT_SIZE));
+            if (incx == 2 * FLT_SIZE) // unit stride: contiguous load into v0
+                __asm__(VLE "v0, (%0)" : : "r"(x));
+            else
+                __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+
+            if (incy == 2 * FLT_SIZE)
+                __asm__(VSE "v0, (%0)" : : "r"(y));
+            else
+                __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+            __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); // advance by a byte count, bypassing C pointer scaling
+            __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+            avl -= vl;
+        }
+    } else { // conjugating copy: split real/imag with segment accesses and negate the imaginary parts
+        size_t avl = n;
+        while (avl) {
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" // SEW = 8 * FLT_SIZE = 32 bits; m4 groups for the two segment fields
+                             : "=r"(vl)
+                             : "r"(avl), "i"(8 * FLT_SIZE));
+            if (incx == 2 * FLT_SIZE) // 2-field segment load starting at v0
+                __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
+            else
+                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+
+            __asm__("vfneg.v v4, v4"); // negate the imaginary field (second segment field, starting at v4 with m4)
+
+            if (incy == 2 * FLT_SIZE) // segment store re-interleaves (real, -imag)
+                __asm__(VSSEG2 "v0, (%0)" : : "r"(y));
+            else
+                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+            __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+            __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+            avl -= vl;
+        }
+    }
+    return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+
+#define FLT_SIZE 8
+#define SH_ADD "sh3add "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+
+void bli_zcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
+                     void * restrict y_, inc_t incy, const cntx_t *cntx) { // y := conjx(x) for n dcomplex elements
+    (void)cntx;
+    const dcomplex* restrict x = x_;
+    dcomplex* restrict y = y_;
+    if (n <= 0)
+        return;
+
+    incx *= 2 * FLT_SIZE; // byte strides; one complex element spans 2 * FLT_SIZE bytes
+    incy *= 2 * FLT_SIZE;
+    if (conjx == BLIS_NO_CONJUGATE && incx == 2 * FLT_SIZE &&
+        incy == 2 * FLT_SIZE) { // fast path: contiguous, no conjugation - copy as a flat run of doubles
+        size_t avl = 2 * n; // counted in doubles, two per complex element
+        while (avl) {
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" // SEW = 8 * FLT_SIZE = 64 bits
+                             : "=r"(vl)
+                             : "r"(avl), "i"(8 * FLT_SIZE));
+            __asm__(VLE "v0, (%0)" : : "r"(x));
+            __asm__(VSE "v0, (%0)" : : "r"(y));
+            __asm__(SH_ADD "%0, %1, %0" : "+r"(x) : "r"(vl)); // sh3add: x += vl * 8 bytes
+            __asm__(SH_ADD "%0, %1, %0" : "+r"(y) : "r"(vl)); // sh3add: y += vl * 8 bytes
+            avl -= vl;
+        }
+    } else { // general path: segment accesses handle strides and optional conjugation
+        size_t avl = n; // complex elements still to copy
+        while (avl) {
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" // SEW = 64 bits; m4 groups for the two segment fields
+                             : "=r"(vl)
+                             : "r"(avl), "i"(8 * FLT_SIZE));
+            if (incx == 2 * FLT_SIZE) // 2-field segment load starting at v0
+                __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
+            else
+                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+
+            if (conjx == BLIS_CONJUGATE) // negate the imaginary field (second segment field, starting at v4 with m4)
+                __asm__("vfneg.v v4, v4");
+
+            if (incy == 2 * FLT_SIZE) // segment store re-interleaves real and imaginary parts
+                __asm__(VSSEG2 "v0, (%0)" : : "r"(y));
+            else
+                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+            __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); // advance by a byte count, bypassing C pointer scaling
+            __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+            avl -= vl;
+        }
+    }
+    return;
+}
diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c
new file mode 100644
index 000000000..0dc856540
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c
@@ -0,0 +1,120 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <stdint.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define DOTV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotv_sifive_x280_intr(\
+          conj_t           conjxt,        \
+          conj_t           conjy,         \
+          dim_t            n,             \
+    const T*      restrict x_, inc_t incx, \
+    const T*      restrict y_, inc_t incy, \
+          T*      restrict rho_,           \
+    const cntx_t*          cntx           \
+)
+
+#define DOTV(...)  DOTV_(__VA_ARGS__) // Extra expansion step so PRECISION_CHAR is expanded before ## pasting
+
+// Single precision real: instantiates bli_sdotv_sifive_x280_intr
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_dotv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real: instantiates bli_ddotv_sifive_x280_intr
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_dotv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex: instantiates bli_cdotv_sifive_x280_intr
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4 // The complex kernel holds 2-field (real, imag) tuples of this LMUL
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_dotv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex: instantiates bli_zdotv_sifive_x280_intr
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4 // The complex kernel holds 2-field (real, imag) tuples of this LMUL
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_dotv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef DOTV // The template macros are local to this file
+#undef DOTV_
diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..250fab46e
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c
@@ -0,0 +1,116 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTV
+
+DOTV(PRECISION_CHAR, void)
+{
+    // Complex-domain dotv kernel: computes rho = conjxt(x)^T * conjy(y)
+    (void) cntx; // Unused; suppress unused parameter warnings
+    DATATYPE* restrict rho = rho_;
+    const DATATYPE* restrict x = x_;
+    const DATATYPE* restrict y = y_;
+    
+    if (n <= 0) { // Empty vectors: rho := 0
+        rho->real = 0;
+        rho->imag = 0;
+        return;
+    }
+
+    // Instead of conjugating x, toggle the conjugation of y and conjugate rho
+    //  at the end: conj(x) * conjy(y) == conj(x * conj(conjy(y)))
+    conj_t conjrho = conjxt;
+    if (conjxt == BLIS_CONJUGATE)
+        bli_toggle_conj(&conjy); // Switch conjugation of y
+
+    RVV_TYPE_F(PREC, LMUL) acc_real, acc_imag; // Per-lane partial sums of the real and imaginary parts
+    size_t avl = n; // Complex elements still to process
+    bool first = true; // The first pass initialises the accumulators instead of adding to them
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl); // Complex elements handled in this pass
+        RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec; // 2-field tuples: field 0 = real parts, field 1 = imaginary parts
+        RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+        if (incx == 1) // Segment loads split interleaved (real, imag) pairs into the two fields
+            xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+        else // Strided variant; the byte stride of one complex element is 2*FLT_SIZE
+            xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+        
+        if (incy == 1)
+            yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+        else
+            yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+        xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+        xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+        yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+        yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+        if (first) {
+            acc_real = VFMUL_VV(PREC, LMUL)(xvec_real, yvec_real, vl); // acc.real  = x.real * y.real
+            acc_imag = VFMUL_VV(PREC, LMUL)(xvec_imag, yvec_real, vl); // acc.imag  = x.imag * y.real
+            first = false;
+        } else { // _TU variants: presumably tail-undisturbed, so lanes past a shorter final vl keep their sums - confirm
+            acc_real = VFMACC_VV_TU(PREC, LMUL)(acc_real, xvec_real, yvec_real, vl); // acc.real += x.real * y.real
+            acc_imag = VFMACC_VV_TU(PREC, LMUL)(acc_imag, xvec_imag, yvec_real, vl); // acc.imag += x.imag * y.real
+        }
+        if (conjy == BLIS_NO_CONJUGATE) {
+            acc_real = VFNMSAC_VV_TU(PREC, LMUL)(acc_real, xvec_imag, yvec_imag, vl); // acc.real -= x.imag * y.imag
+            acc_imag = VFMACC_VV_TU(PREC, LMUL)( acc_imag, xvec_real, yvec_imag, vl); // acc.imag += x.real * y.imag
+        } else { // Conjugated y: the y.imag contributions change sign
+            acc_real = VFMACC_VV_TU(PREC, LMUL)( acc_real, xvec_imag, yvec_imag, vl); // acc.real += x.imag * y.imag
+            acc_imag = VFNMSAC_VV_TU(PREC, LMUL)(acc_imag, xvec_real, yvec_imag, vl); // acc.imag -= x.real * y.imag
+        }
+
+        x += vl*incx; // Step both vectors past the elements just processed
+        y += vl*incy;
+        avl -= vl;
+    }
+
+
+    RVV_TYPE_F(PREC, m1) sum_real = VFMV_S_F(PREC, m1)(0.f, 1); // Zero seeds for the two reductions
+    RVV_TYPE_F(PREC, m1) sum_imag = VFMV_S_F(PREC, m1)(0.f, 1);
+    sum_real = VF_REDUSUM_VS(PREC, LMUL)(acc_real, sum_real, n); // NOTE(review): n is passed as vl; presumably capped at VLMAX when n is larger - confirm
+    sum_imag = VF_REDUSUM_VS(PREC, LMUL)(acc_imag, sum_imag, n);
+
+    if (conjrho == BLIS_CONJUGATE) { // Deferred conjugation of x: negate imag(rho)
+        sum_imag = VFNEG_VF(PREC, m1)(sum_imag, 1);
+    }
+    rho->real = VFMV_F_S(PREC)(sum_real); // Extract element 0 of each reduction result
+    rho->imag = VFMV_F_S(PREC)(sum_imag);
+
+}
+
+#endif // DOTV
diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c
new file mode 100644
index 000000000..0ec8e6328
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c
@@ -0,0 +1,87 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTV
+
+DOTV(PRECISION_CHAR, void)
+{
+    // rho := conjxt(x)^T * conjy(y). Conjugation is a no-op on real data,
+    // so this reduces to the plain dot product x^T * y.
+    (void) conjy;
+    (void) conjxt; // conjugation flags are irrelevant in the real case
+    (void) cntx;
+    const DATATYPE* restrict xp = x_;
+    const DATATYPE* restrict yp = y_;
+    DATATYPE* restrict result = rho_;
+
+    if (n <= 0) {
+        *result = 0;
+        return;
+    }
+
+    // Lane-wise partial products, seeded by the first strip.
+    RVV_TYPE_F(PREC, LMUL) partial;
+    const size_t total = n;
+    size_t remaining = total;
+    do {
+        const size_t vl = VSETVL(PREC, LMUL)(remaining);
+        RVV_TYPE_F(PREC, LMUL) xs, ys;
+        // Unit stride uses the contiguous load; otherwise the strided one.
+        if (incx != 1)
+            xs = VLSE_V_F(PREC, LMUL)(xp, FLT_SIZE * incx, vl);
+        else
+            xs = VLE_V_F(PREC, LMUL) (xp, vl);
+        if (incy != 1)
+            ys = VLSE_V_F(PREC, LMUL)(yp, FLT_SIZE * incy, vl);
+        else
+            ys = VLE_V_F(PREC, LMUL) (yp, vl);
+        // Only the first strip sees the full count; it has nothing to add to.
+        if (remaining != total)
+            partial = VFMACC_VV_TU(PREC, LMUL)(partial, xs, ys, vl);
+        else
+            partial = VFMUL_VV(PREC, LMUL)(xs, ys, vl);
+
+        remaining -= vl;
+        xp += vl * incx;
+        yp += vl * incy;
+    } while (remaining != 0);
+
+    // Reduce the partial products onto a zero seed and store the scalar.
+    RVV_TYPE_F(PREC, m1) folded = VFMV_S_F(PREC, m1)(0.f, 1);
+    folded = VF_REDUSUM_VS(PREC, LMUL)(partial, folded, n);
+    *result = VFMV_F_S(PREC)(folded);
+}
+
+#endif // DOTV
diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c
new file mode 100644
index 000000000..048f8d298
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c
@@ -0,0 +1,130 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <stdint.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define DOTXV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxv_sifive_x280_intr(\
+          conj_t           conjxt,        \
+          conj_t           conjy,         \
+          dim_t            n,             \
+    const T*      restrict alpha_,         \
+    const T*      restrict x_, inc_t incx, \
+    const T*      restrict y_, inc_t incy, \
+    const T*      restrict beta_,          \
+          T*      restrict rho_,           \
+    const cntx_t*          cntx           \
+)
+// DOTXV forwards to DOTXV_ so its arguments are macro-expanded before ## pastes them.
+#define DOTXV(...)  DOTXV_(__VA_ARGS__)
+
+// Single precision real: the real template below becomes bli_sdotxv_sifive_x280_intr
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+#define FMA fmaf
+
+#include "./bli_dotxv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+#undef FMA
+
+// Double precision real: the real template below becomes bli_ddotxv_sifive_x280_intr
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+#define FMA fma
+
+#include "./bli_dotxv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+#undef FMA
+
+// Single precision complex: the complex template below becomes bli_cdotxv_sifive_x280_intr
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+#define FMA fmaf
+
+#include "./bli_dotxv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+#undef FMA
+
+// Double precision complex: the complex template below becomes bli_zdotxv_sifive_x280_intr
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+#define FMA fma
+
+#include "./bli_dotxv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+#undef FMA
+// The signature-generating macros are only needed while instantiating the templates.
+#undef DOTXV
+#undef DOTXV_
diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..8245e8e05
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c
@@ -0,0 +1,130 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTXV
+
+DOTXV(PRECISION_CHAR, void)
+{
+    // Computes rho = beta * rho + alpha * conjxt(x)^T * conjy(y)
+    (void) cntx;
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict beta = beta_;
+    DATATYPE* restrict rho = rho_;
+    const DATATYPE* restrict x = x_;
+    const DATATYPE* restrict y = y_;
+    // Apply beta first: beta == 0 overwrites rho, beta == 1 leaves it untouched.
+    if (beta->real == 0 && beta->imag == 0){
+        rho->real = 0;
+        rho->imag = 0;
+    } else if (!(beta->real == 1 && beta->imag == 0)) {
+        DATATYPE temp = *rho; // snapshot so both components use the old rho
+        rho->real =  temp.real * beta->real - temp.imag * beta->imag;
+        rho->imag =  temp.real * beta->imag + temp.imag * beta->real;
+    }
+    // With no elements or alpha == 0 the dot-product term is skipped entirely.
+    if (n <= 0 || (alpha->real == 0 && alpha->imag == 0))
+        return;
+
+    // Instead of conjugating x, switch conjugation on y
+    //  and conjugate dot product at the end
+    conj_t conjsum = conjxt;
+    if (conjxt == BLIS_CONJUGATE)
+        bli_toggle_conj(&conjy); // Switch conjugation of y
+
+    // Compute dot product
+    RVV_TYPE_F(PREC, LMUL) acc_real, acc_imag;
+    size_t avl = n;
+    bool first = true;
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl);
+        RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+        RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+        // Two-field segment loads; field 0 is the real part, field 1 the imaginary.
+        if (incx == 1)
+            xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+        else
+            xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+        if (incy == 1)
+            yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+        else
+            yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+        xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+        xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+        yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+        yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+        // The y_real products seed the accumulators on the first strip.
+        if (first) {
+            acc_real = VFMUL_VV(PREC, LMUL)(xvec_real, yvec_real, vl);
+            acc_imag = VFMUL_VV(PREC, LMUL)(xvec_imag, yvec_real, vl);
+            first = false;
+        } else {
+            acc_real = VFMACC_VV_TU(PREC, LMUL)(acc_real, xvec_real, yvec_real, vl);
+            acc_imag = VFMACC_VV_TU(PREC, LMUL)(acc_imag, xvec_imag, yvec_real, vl);
+        }
+        if (conjy == BLIS_NO_CONJUGATE) { // x * y: real -= xi*yi, imag += xr*yi
+            acc_real = VFNMSAC_VV_TU(PREC, LMUL)(acc_real, xvec_imag, yvec_imag, vl);
+            acc_imag = VFMACC_VV_TU(PREC, LMUL)( acc_imag, xvec_real, yvec_imag, vl);
+        } else { // x * conj(y): real += xi*yi, imag -= xr*yi
+            acc_real = VFMACC_VV_TU(PREC, LMUL)( acc_real, xvec_imag, yvec_imag, vl);
+            acc_imag = VFNMSAC_VV_TU(PREC, LMUL)(acc_imag, xvec_real, yvec_imag, vl);
+        }
+
+        x += vl*incx;
+        y += vl*incy;
+        avl -= vl;
+    }
+
+    // Reduce each accumulator onto a zero seed to get the scalar dot product.
+    RVV_TYPE_F(PREC, m1) sum_real = VFMV_S_F(PREC, m1)(0.f, 1);
+    RVV_TYPE_F(PREC, m1) sum_imag = VFMV_S_F(PREC, m1)(0.f, 1);
+    sum_real = VF_REDUSUM_VS(PREC, LMUL)(acc_real, sum_real, n);
+    sum_imag = VF_REDUSUM_VS(PREC, LMUL)(acc_imag, sum_imag, n);
+
+    if (conjsum == BLIS_CONJUGATE) {
+        sum_imag = VFNEG_VF(PREC, m1)(sum_imag, 1);
+    }
+    DATATYPE dot = {VFMV_F_S(PREC)(sum_real), VFMV_F_S(PREC)(sum_imag)};
+    // FMA is fmaf for scomplex and fma for dcomplex, keeping the update in working precision.
+    // Accumulate alpha * dot
+    rho->real = FMA( alpha->real, dot.real, rho->real);
+    rho->real = FMA(-alpha->imag, dot.imag, rho->real);
+    rho->imag = FMA( alpha->imag, dot.real, rho->imag);
+    rho->imag = FMA( alpha->real, dot.imag, rho->imag);
+
+}
+
+#endif // DOTXV
diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c
new file mode 100644
index 000000000..f9d934697
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c
@@ -0,0 +1,94 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTXV
+
+DOTXV(PRECISION_CHAR, void)
+{
+    // Computes rho = beta * rho + alpha * conjxt(x)^T * conjy(y)
+    //             == beta * rho + alpha *     x^T     *    y       (real case)
+    // alpha_, beta_ and rho_ each point to a single DATATYPE scalar.
+    (void) cntx;
+    (void) conjxt; // Suppress unused parameter warnings
+    (void) conjy;
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict beta = beta_;
+    DATATYPE* restrict rho = rho_;
+    const DATATYPE* restrict x = x_;
+    const DATATYPE* restrict y = y_;
+    // Apply beta first: beta == 0 overwrites rho, beta == 1 leaves it untouched.
+    if (*beta == 0)
+        *rho = 0;
+    else if (*beta != 1.0f)
+        *rho *= *beta;
+    // With no elements or alpha == 0 the dot-product term is skipped entirely.
+    if (n <= 0 || *alpha == 0)
+        return;
+
+    // Compute dot product
+    RVV_TYPE_F(PREC, LMUL) acc;
+    size_t avl = n;
+    bool first = true;
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl);
+        RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+        // Unit stride uses the contiguous load; otherwise the strided one.
+        if (incx == 1)
+            xvec = VLE_V_F(PREC, LMUL) (x, vl);
+        else
+            xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+        if (incy == 1)
+            yvec = VLE_V_F(PREC, LMUL) (y, vl);
+        else
+            yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+        // The first strip seeds the accumulator; later strips add into it.
+        if (first) {
+            acc = VFMUL_VV(PREC, LMUL)(xvec, yvec, vl);
+            first = false;
+        } else
+            acc = VFMACC_VV_TU(PREC, LMUL)(acc, xvec, yvec, vl);
+
+        x += vl * incx;
+        y += vl * incy;
+        avl -= vl;
+    }
+    // FMA is fmaf for float and fma for double, keeping the update in working precision.
+    RVV_TYPE_F(PREC, m1) sum = VFMV_S_F(PREC, m1)(0.f, 1);
+    sum = VF_REDUSUM_VS(PREC, LMUL)(acc, sum, n);
+    *rho = FMA(*alpha, VFMV_F_S(PREC)(sum), *rho);
+}
+
+#endif // DOTXV
diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c
new file mode 100644
index 000000000..cbca88592
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c
@@ -0,0 +1,221 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+// Single precision real: replaces each of the n elements of x (stride incx) with 1 / x[i].
+void bli_sinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+                           const cntx_t *cntx) {
+    (void)cntx;
+    float* restrict x = x_;
+    if (n <= 0)
+        return;
+    // NOTE(review): f0 and v0 are hard-coded with no clobber lists; confirm the compiler leaves them alone.
+    float one = 1.f;
+    __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one)); // f0 = 1.0, the dividend for vfrdiv.vf
+    incx *= FLT_SIZE; // convert the element stride to a byte stride
+    size_t avl = n;
+    while (avl) {
+        size_t vl; // elements handled this pass, chosen by vsetvli (SEW = 8 * FLT_SIZE bits, LMUL = 8)
+        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        if (incx == FLT_SIZE) {
+            __asm__(VLE "v0, (%0)" : : "r"(x));
+            __asm__("vfrdiv.vf v0, v0, f0"); // v0 = f0 / v0
+            __asm__(VSE "v0, (%0)" : : "r"(x));
+        } else {
+            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+            __asm__("vfrdiv.vf v0, v0, f0"); // v0 = f0 / v0
+            __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+        }
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); // step past the vl elements just processed
+        avl -= vl;
+    }
+    return;
+}
+// Drop the single-precision definitions so they can be redefined below.
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+// Double precision real: replaces each of the n elements of x (stride incx) with 1 / x[i].
+void bli_dinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+                           const cntx_t *cntx) {
+    (void)cntx;
+    double* restrict x = x_;
+    if (n <= 0)
+        return;
+    // NOTE(review): f0 and v0 are hard-coded with no clobber lists; confirm the compiler leaves them alone.
+    double one = 1.;
+    __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one)); // f0 = 1.0, the dividend for vfrdiv.vf
+    incx *= FLT_SIZE; // convert the element stride to a byte stride
+    size_t avl = n;
+    while (avl) {
+        size_t vl; // elements handled this pass, chosen by vsetvli (SEW = 8 * FLT_SIZE bits, LMUL = 8)
+        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        if (incx == FLT_SIZE) {
+            __asm__(VLE "v0, (%0)" : : "r"(x));
+            __asm__("vfrdiv.vf v0, v0, f0"); // v0 = f0 / v0
+            __asm__(VSE "v0, (%0)" : : "r"(x));
+        } else {
+            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+            __asm__("vfrdiv.vf v0, v0, f0"); // v0 = f0 / v0
+            __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+        }
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); // step past the vl elements just processed
+        avl -= vl;
+    }
+    return;
+}
+// Drop the double-precision definitions so they can be redefined below.
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+// Single precision complex: replaces each element of x with conj(x[i]) / |x[i]|^2, i.e. 1 / x[i].
+void bli_cinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+                           const cntx_t *cntx) {
+    (void)cntx;
+    scomplex* restrict x = x_;
+    if (n <= 0)
+        return;
+    // NOTE(review): v0-v11 are hard-coded with no clobber lists; confirm the compiler leaves them alone.
+    incx *= 2 * FLT_SIZE; // element stride -> byte stride (two floats per element)
+    size_t avl = n;
+    while (avl) {
+        size_t vl; // elements handled this pass, chosen by vsetvli (SEW = 8 * FLT_SIZE bits, LMUL = 4)
+        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        if (incx == 2 * FLT_SIZE) {
+            __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); // v0 = real parts, v4 = imaginary parts
+            __asm__("vfneg.v v4, v4"); // v4 = -imag
+            __asm__("vfmul.vv v8, v0, v0"); // v8 = real^2
+            __asm__("vfmacc.vv v8, v4, v4"); // v8 = real^2 + imag^2
+            __asm__("vfdiv.vv v0, v0, v8"); // v0 = real / |x|^2
+            __asm__("vfdiv.vv v4, v4, v8"); // v4 = -imag / |x|^2
+            __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
+        } else {
+            __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); // v0 = real parts, v4 = imaginary parts
+            __asm__("vfneg.v v4, v4"); // v4 = -imag
+            __asm__("vfmul.vv v8, v0, v0"); // v8 = real^2
+            __asm__("vfmacc.vv v8, v4, v4"); // v8 = real^2 + imag^2
+            __asm__("vfdiv.vv v0, v0, v8"); // v0 = real / |x|^2
+            __asm__("vfdiv.vv v4, v4, v8"); // v4 = -imag / |x|^2
+            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+        }
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); // step past the vl elements just processed
+        avl -= vl;
+    }
+    return;
+}
+// Drop the single-precision definitions so they can be redefined below.
+#undef FLT_SIZE
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+
+#define FLT_SIZE 8
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+// Double precision complex: replaces each element of x with conj(x[i]) / |x[i]|^2, i.e. 1 / x[i].
+void bli_zinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+                           const cntx_t *cntx) {
+    (void)cntx;
+    dcomplex* restrict x = x_;
+    if (n <= 0)
+        return;
+    // NOTE(review): v0-v11 are hard-coded with no clobber lists; confirm the compiler leaves them alone.
+    incx *= 2 * FLT_SIZE; // element stride -> byte stride (two doubles per element)
+    size_t avl = n;
+    while (avl) {
+        size_t vl; // elements handled this pass, chosen by vsetvli (SEW = 8 * FLT_SIZE bits, LMUL = 4)
+        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        if (incx == 2 * FLT_SIZE) {
+            __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); // v0 = real parts, v4 = imaginary parts
+            __asm__("vfneg.v v4, v4"); // v4 = -imag
+            __asm__("vfmul.vv v8, v0, v0"); // v8 = real^2
+            __asm__("vfmacc.vv v8, v4, v4"); // v8 = real^2 + imag^2
+            __asm__("vfdiv.vv v0, v0, v8"); // v0 = real / |x|^2
+            __asm__("vfdiv.vv v4, v4, v8"); // v4 = -imag / |x|^2
+            __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
+        } else {
+            __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); // v0 = real parts, v4 = imaginary parts
+            __asm__("vfneg.v v4, v4"); // v4 = -imag
+            __asm__("vfmul.vv v8, v0, v0"); // v8 = real^2
+            __asm__("vfmacc.vv v8, v4, v4"); // v8 = real^2 + imag^2
+            __asm__("vfdiv.vv v0, v0, v8"); // v0 = real / |x|^2
+            __asm__("vfdiv.vv v4, v4, v8"); // v4 = -imag / |x|^2
+            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+        }
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); // step past the vl elements just processed
+        avl -= vl;
+    }
+    return;
+}
diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c
new file mode 100644
index 000000000..51edc9221
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c
@@ -0,0 +1,266 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define FDIV "fdiv.s "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+// Single precision real: scales the n elements of x (stride incx) in place by 1 / alpha.
+void bli_sinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+                            void * restrict x_, inc_t incx,
+                            const cntx_t *cntx) {
+    (void)conjalpha;
+    (void)cntx;
+    const float* restrict alpha = alpha_;
+    float* restrict x = x_;
+    if (n <= 0 || *alpha == 0.f || *alpha == 1.f) // x is left unchanged in all three cases
+        return;
+    // NOTE(review): f0, f1 and v0 are hard-coded with no clobber lists; confirm the compiler leaves them alone.
+    float one = 1.f;
+    __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
+    __asm__(FLT_LOAD "f1, (%0)" : : "r"(alpha));
+    __asm__(FDIV "f0, f0, f1"); // f0 = 1 / alpha, computed once and reused for every element
+    incx *= FLT_SIZE; // convert the element stride to a byte stride
+    size_t avl = n;
+    while (avl) {
+        size_t vl; // elements handled this pass, chosen by vsetvli (SEW = 8 * FLT_SIZE bits, LMUL = 8)
+        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        if (incx == FLT_SIZE) {
+            __asm__(VLE "v0, (%0)" : : "r"(x));
+            __asm__("vfmul.vf v0, v0, f0"); // v0 = v0 * (1 / alpha)
+            __asm__(VSE "v0, (%0)" : : "r"(x));
+        } else {
+            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+            __asm__("vfmul.vf v0, v0, f0"); // v0 = v0 * (1 / alpha)
+            __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+        }
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); // step past the vl elements just processed
+        avl -= vl;
+    }
+    return;
+}
+// Drop the single-precision definitions so they can be redefined below.
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FDIV
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define FDIV "fdiv.d "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+// Double precision real: scales the n elements of x (stride incx) in place by 1 / alpha.
+void bli_dinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+                            void * restrict x_, inc_t incx,
+                            const cntx_t *cntx) {
+    (void)conjalpha;
+    (void)cntx;
+    const double* restrict alpha = alpha_;
+    double* restrict x = x_;
+    if (n <= 0 || *alpha == 0. || *alpha == 1.) // x is left unchanged in all three cases
+        return;
+    // NOTE(review): f0, f1 and v0 are hard-coded with no clobber lists; confirm the compiler leaves them alone.
+    double one = 1.;
+    __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
+    __asm__(FLT_LOAD "f1, (%0)" : : "r"(alpha));
+    __asm__(FDIV "f0, f0, f1"); // f0 = 1 / alpha, computed once and reused for every element
+    incx *= FLT_SIZE; // convert the element stride to a byte stride
+    size_t avl = n;
+    while (avl) {
+        size_t vl; // elements handled this pass, chosen by vsetvli (SEW = 8 * FLT_SIZE bits, LMUL = 8)
+        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        if (incx == FLT_SIZE) {
+            __asm__(VLE "v0, (%0)" : : "r"(x));
+            __asm__("vfmul.vf v0, v0, f0"); // v0 = v0 * (1 / alpha)
+            __asm__(VSE "v0, (%0)" : : "r"(x));
+        } else {
+            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+            __asm__("vfmul.vf v0, v0, f0"); // v0 = v0 * (1 / alpha)
+            __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+        }
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); // step past the vl elements just processed
+        avl -= vl;
+    }
+    return;
+}
+// Drop the double-precision definitions so they can be redefined below.
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FDIV
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define FMUL "fmul.s "
+#define FMADD "fmadd.s "
+#define FDIV "fdiv.s "
+#define FNEG "fneg.s "
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+// Single precision complex: multiplies each element of x in place by 1 / conjalpha(alpha).
+void bli_cinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+                            void * restrict x_, inc_t incx,
+                            const cntx_t *cntx) {
+    (void)cntx;
+    const scomplex* restrict alpha = alpha_;
+    scomplex* restrict x = x_;
+    if (n <= 0 || (alpha->real == 0.f && alpha->imag == 0.f) || (alpha->real == 1.f && alpha->imag == 0.f))
+        return;
+    // Build the reciprocal (f0 + i*f1) once; NOTE(review): registers are hard-coded with no clobber lists.
+    __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha)); // f0 = alpha.real
+    __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); // f1 = alpha.imag
+    __asm__(FMUL "f2, f0, f0"); // f2 = real^2
+    __asm__(FMADD "f2, f1, f1, f2"); // f2 = real^2 + imag^2
+    __asm__(FDIV "f0, f0, f2"); // f0 = real / |alpha|^2
+    __asm__(FDIV "f1, f1, f2"); // f1 = imag / |alpha|^2
+    if (conjalpha == BLIS_NO_CONJUGATE)
+        __asm__(FNEG "f1, f1"); // 1 / alpha has the negated imaginary part; 1 / conj(alpha) does not
+    incx *= 2 * FLT_SIZE; // element stride -> byte stride (two floats per element)
+    size_t avl = n;
+    while (avl) {
+        size_t vl; // elements handled this pass, chosen by vsetvli (SEW = 8 * FLT_SIZE bits, LMUL = 4)
+        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        if (incx == 2 * FLT_SIZE) {
+            __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); // v0 = real parts, v4 = imaginary parts
+            __asm__("vfmul.vf v8, v0, f0"); // v8 = x.real * f0
+            __asm__("vfmul.vf v12, v4, f0"); // v12 = x.imag * f0
+            __asm__("vfnmsac.vf v8, f1, v4"); // v8 -= f1 * x.imag
+            __asm__("vfmacc.vf v12, f1, v0"); // v12 += f1 * x.real
+            __asm__(VSSEG2 "v8, (%0)" : : "r"(x));
+        } else {
+            __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); // v0 = real parts, v4 = imaginary parts
+            __asm__("vfmul.vf v8, v0, f0"); // v8 = x.real * f0
+            __asm__("vfmul.vf v12, v4, f0"); // v12 = x.imag * f0
+            __asm__("vfnmsac.vf v8, f1, v4"); // v8 -= f1 * x.imag
+            __asm__("vfmacc.vf v12, f1, v0"); // v12 += f1 * x.real
+            __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
+        }
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); // step past the vl elements just processed
+        avl -= vl;
+    }
+    return;
+}
+// Drop the single-precision definitions so they can be redefined below.
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FMUL
+#undef FMADD
+#undef FDIV
+#undef FNEG
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define FMUL "fmul.d "
+#define FMADD "fmadd.d "
+#define FDIV "fdiv.d "
+#define FNEG "fneg.d "
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+// Double precision complex: multiplies each element of x in place by 1 / conjalpha(alpha).
+void bli_zinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+                            void * restrict x_, inc_t incx,
+                            const cntx_t *cntx) {
+    (void)cntx;
+    const dcomplex* restrict alpha = alpha_;
+    dcomplex* restrict x = x_;
+    if (n <= 0 || (alpha->real == 0. && alpha->imag == 0.) || (alpha->real == 1. && alpha->imag == 0.))
+        return;
+    // Build the reciprocal (f0 + i*f1) once; NOTE(review): registers are hard-coded with no clobber lists.
+    __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha)); // f0 = alpha.real
+    __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); // f1 = alpha.imag
+    __asm__(FMUL "f2, f0, f0"); // f2 = real^2
+    __asm__(FMADD "f2, f1, f1, f2"); // f2 = real^2 + imag^2
+    __asm__(FDIV "f0, f0, f2"); // f0 = real / |alpha|^2
+    __asm__(FDIV "f1, f1, f2"); // f1 = imag / |alpha|^2
+    if (conjalpha == BLIS_NO_CONJUGATE)
+        __asm__(FNEG "f1, f1"); // 1 / alpha has the negated imaginary part; 1 / conj(alpha) does not
+    incx *= 2 * FLT_SIZE; // element stride -> byte stride (two doubles per element)
+    size_t avl = n;
+    while (avl) {
+        size_t vl; // elements handled this pass, chosen by vsetvli (SEW = 8 * FLT_SIZE bits, LMUL = 4)
+        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        if (incx == 2 * FLT_SIZE) {
+            __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); // v0 = real parts, v4 = imaginary parts
+            __asm__("vfmul.vf v8, v0, f0"); // v8 = x.real * f0
+            __asm__("vfmul.vf v12, v4, f0"); // v12 = x.imag * f0
+            __asm__("vfnmsac.vf v8, f1, v4"); // v8 -= f1 * x.imag
+            __asm__("vfmacc.vf v12, f1, v0"); // v12 += f1 * x.real
+            __asm__(VSSEG2 "v8, (%0)" : : "r"(x));
+        } else {
+            __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); // v0 = real parts, v4 = imaginary parts
+            __asm__("vfmul.vf v8, v0, f0"); // v8 = x.real * f0
+            __asm__("vfmul.vf v12, v4, f0"); // v12 = x.imag * f0
+            __asm__("vfnmsac.vf v8, f1, v4"); // v8 -= f1 * x.imag
+            __asm__("vfmacc.vf v12, f1, v0"); // v12 += f1 * x.real
+            __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
+        }
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); // step past the vl elements just processed
+        avl -= vl;
+    }
+    return;
+}
diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c
new file mode 100644
index 000000000..cd2dd2c18
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c
@@ -0,0 +1,124 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <stdint.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+// Expands to the prototype of bli_<PRECISION_CHAR>scal2v_sifive_x280_intr; T is the element type of the pointer arguments.
+#define SCAL2V_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##scal2v_sifive_x280_intr(\
+          conj_t           conjx,         \
+          dim_t            n,             \
+    const T*      restrict alpha_,         \
+    const T*      restrict x_, inc_t incx, \
+          T*      restrict y_, inc_t incy, \
+    const cntx_t*          cntx           \
+)
+// Extra level of indirection so that arguments such as PRECISION_CHAR are macro-expanded before ## pasting.
+#define SCAL2V(...)  SCAL2V_(__VA_ARGS__)
+// Names of the copyv/setv kernels that the included templates call for alpha == 1 and alpha == 0, respectively.
+#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm
+#define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR)
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm
+#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
+
+// Single precision real: instantiates bli_sscal2v_sifive_x280_intr from the real template (32-bit elements, LMUL = 8).
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_scal2v_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real: instantiates bli_dscal2v_sifive_x280_intr from the real template (64-bit elements, LMUL = 8).
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_scal2v_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex: instantiates bli_cscal2v_sifive_x280_intr from the complex template; BASE_DT is the type of each real/imaginary component. LMUL = 4 here, presumably because the two-field segment accesses use two register groups (TODO confirm).
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_scal2v_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex: instantiates bli_zscal2v_sifive_x280_intr from the complex template; BASE_DT is the type of each real/imaginary component (64-bit components, LMUL = 4).
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_scal2v_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef SCAL2V
+#undef SCAL2V_
diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..4a25ce3e3
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c
@@ -0,0 +1,100 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SCAL2V
+
+SCAL2V(PRECISION_CHAR, void)
+{
+    // y := alpha * conjx(x) for n complex elements; x is read with element stride
+    // incx, y is written with element stride incy, conjx optionally conjugates x.
+    const DATATYPE* restrict scale = alpha_;
+    const DATATYPE* restrict src = x_;
+    DATATYPE* restrict dst = y_;
+
+    if (n <= 0) return;
+    // alpha == 0: fill y through the setv kernel; x is not read.
+    if (scale->real == 0 && scale->imag == 0) {
+        SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, scale, dst, incy, cntx);
+        return;
+    }
+    // alpha == 1: the product reduces to a (possibly conjugating) copy.
+    if (scale->real == 1 && scale->imag == 0) {
+        COPYV(PRECISION_CHAR)(conjx, n, src, incx, dst, incy, cntx);
+        return;
+    }
+
+    const BASE_DT alpha_re = scale->real, alpha_im = scale->imag;
+    size_t remaining = n;
+    while (remaining) {
+        const size_t count = VSETVL(PREC, LMUL)(remaining);
+        RVV_TYPE_FX(PREC, LMUL, 2) x_seg, y_seg;
+        RVV_TYPE_F(PREC, LMUL) x_re, x_im, y_re, y_im;
+
+        // Segment loads de-interleave (real, imag) pairs; strides are in bytes.
+        if (incx != 1)
+            x_seg = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) src, 2*FLT_SIZE*incx, count);
+        else
+            x_seg = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) src, count);
+        x_re = VGET_V_F(PREC, LMUL, 2)(x_seg, 0);
+        x_im = VGET_V_F(PREC, LMUL, 2)(x_seg, 1);
+        // Re(x) terms are common; the Im(x) terms change sign under conjugation.
+        y_re = VFMUL_VF(PREC, LMUL)(x_re, alpha_re, count);
+        y_im = VFMUL_VF(PREC, LMUL)(x_re, alpha_im, count);
+        if (conjx != BLIS_NO_CONJUGATE) {
+            y_re = VFMACC_VF( PREC, LMUL)(y_re, alpha_im, x_im, count);
+            y_im = VFNMSAC_VF(PREC, LMUL)(y_im, alpha_re, x_im, count);
+        } else {
+            y_re = VFNMSAC_VF(PREC, LMUL)(y_re, alpha_im, x_im, count);
+            y_im = VFMACC_VF( PREC, LMUL)(y_im, alpha_re, x_im, count);
+        }
+
+        // FIXME: remove the #pragmas and change the __riscv_vset_v_f intrinsics to use
+        // __riscv_vcreate_v_f once they become available in LLVM.
+        #pragma GCC diagnostic push
+        #pragma GCC diagnostic ignored "-Wuninitialized"
+        y_seg = VSET_V_F(PREC, LMUL, 2)(y_seg, 0, y_re);
+        y_seg = VSET_V_F(PREC, LMUL, 2)(y_seg, 1, y_im);
+        #pragma GCC diagnostic pop
+        if (incy != 1)
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) dst, 2*FLT_SIZE*incy, y_seg, count);
+        else
+            VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) dst, y_seg, count);
+        src += count*incx;
+        dst += count*incy;
+        remaining -= count;
+    }
+}
+
+#endif // SCAL2V
diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c
new file mode 100644
index 000000000..7084e15cf
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c
@@ -0,0 +1,82 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SCAL2V
+
+SCAL2V(PRECISION_CHAR, void)
+{
+    // y := alpha * x for n real elements (conjx has no effect on real data).
+    // x is read with element stride incx; y is written with element stride incy.
+    const DATATYPE* restrict scale = alpha_;
+    const DATATYPE* restrict src = x_;
+    DATATYPE* restrict dst = y_;
+    (void) conjx; // Unused for real types; silences the compiler warning.
+
+    if (n <= 0) return;
+    // alpha == 0: fill y through the setv kernel; x is not read.
+    if (*scale == 0) {
+        SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, scale, dst, incy, cntx);
+        return;
+    }
+    // alpha == 1: plain copy of x into y.
+    if (*scale == 1) {
+        COPYV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, src, incx, dst, incy, cntx);
+        return;
+    }
+
+    const DATATYPE factor = *scale;
+    size_t remaining = n;
+    while (remaining) {
+        const size_t count = VSETVL(PREC, LMUL)(remaining);
+        RVV_TYPE_F(PREC, LMUL) chunk;
+
+        // Strided accesses take their stride in bytes.
+        if (incx != 1)
+            chunk = VLSE_V_F(PREC, LMUL)(src, FLT_SIZE * incx, count);
+        else
+            chunk = VLE_V_F(PREC, LMUL) (src, count);
+        chunk = VFMUL_VF(PREC, LMUL)(chunk, factor, count);
+        if (incy != 1)
+            VSSE_V_F(PREC, LMUL)(dst, FLT_SIZE * incy, chunk, count);
+        else
+            VSE_V_F(PREC, LMUL) (dst, chunk, count);
+
+        src += count * incx;
+        dst += count * incy;
+        remaining -= count;
+    }
+}
+
+#endif // SCAL2V
diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c
new file mode 100644
index 000000000..b5788d632
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c
@@ -0,0 +1,120 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <stdint.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+// Expands to the prototype of bli_<PRECISION_CHAR>scalv_sifive_x280_intr; T is the element type of the pointer arguments.
+#define SCALV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##scalv_sifive_x280_intr(\
+          conj_t  conjalpha,               \
+          dim_t   n,                       \
+    const T*      restrict alpha_,         \
+          T*      restrict x_, inc_t incx, \
+    const cntx_t* cntx                     \
+)
+// Extra level of indirection so that arguments such as PRECISION_CHAR are macro-expanded before ## pasting.
+#define SCALV(...)  SCALV_(__VA_ARGS__)
+// Name of the setv kernel that the included templates call for the alpha == 0 case.
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm
+#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
+
+// Single precision real: instantiates bli_sscalv_sifive_x280_intr from the real template (32-bit elements, LMUL = 8).
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_scalv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real: instantiates bli_dscalv_sifive_x280_intr from the real template (64-bit elements, LMUL = 8).
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_scalv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex: instantiates bli_cscalv_sifive_x280_intr from the complex template; BASE_DT is the type of each real/imaginary component. LMUL = 4 here, presumably because the two-field segment accesses use two register groups (TODO confirm).
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_scalv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex: instantiates bli_zscalv_sifive_x280_intr from the complex template; BASE_DT is the type of each real/imaginary component (64-bit components, LMUL = 4).
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_scalv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef SCALV
+#undef SCALV_
diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..c6803c967
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c
@@ -0,0 +1,89 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SCALV
+
+SCALV(PRECISION_CHAR, void)
+{
+    // x := conjalpha(alpha) * x, in place, for n complex elements read and
+    // written with element stride incx.
+    const DATATYPE* restrict scale = alpha_;
+    DATATYPE* restrict vec = x_;
+
+    if (n <= 0) return;
+    if (scale->real == 1 && scale->imag == 0) return; // alpha == 1: x is unchanged
+    // alpha == 0: fill x through the setv kernel instead of multiplying.
+    if (scale->real == 0 && scale->imag == 0) {
+        SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, scale, vec, incx, cntx);
+        return;
+    }
+
+    const BASE_DT alpha_re = scale->real, alpha_im = scale->imag;
+    size_t remaining = n;
+    while (remaining) {
+        const size_t count = VSETVL(PREC, LMUL)(remaining);
+        RVV_TYPE_FX(PREC, LMUL, 2) seg;
+        RVV_TYPE_F(PREC, LMUL) x_re, x_im, out_re, out_im;
+
+        // Segment loads de-interleave (real, imag) pairs; strides are in bytes.
+        if (incx != 1)
+            seg = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) vec, 2*FLT_SIZE*incx, count);
+        else
+            seg = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) vec, count);
+        x_re = VGET_V_F(PREC, LMUL, 2)(seg, 0);
+        x_im = VGET_V_F(PREC, LMUL, 2)(seg, 1);
+
+        // Re(alpha) terms are common; the Im(alpha) terms change sign when alpha is conjugated.
+        out_re = VFMUL_VF(PREC, LMUL)(x_re, alpha_re, count);
+        out_im = VFMUL_VF(PREC, LMUL)(x_im, alpha_re, count);
+        if (conjalpha != BLIS_NO_CONJUGATE) {
+            out_re = VFMACC_VF( PREC, LMUL)(out_re, alpha_im, x_im, count);
+            out_im = VFNMSAC_VF(PREC, LMUL)(out_im, alpha_im, x_re, count);
+        } else {
+            out_re = VFNMSAC_VF(PREC, LMUL)(out_re, alpha_im, x_im, count);
+            out_im = VFMACC_VF( PREC, LMUL)(out_im, alpha_im, x_re, count);
+        }
+        seg = VSET_V_F(PREC, LMUL, 2)(seg, 0, out_re);
+        seg = VSET_V_F(PREC, LMUL, 2)(seg, 1, out_im);
+        if (incx != 1)
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) vec, 2*FLT_SIZE*incx, seg, count);
+        else
+            VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) vec, seg, count);
+        vec += count*incx;
+        remaining -= count;
+    }
+}
+
+#endif // SCALV
diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c
new file mode 100644
index 000000000..2b4e31d35
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c
@@ -0,0 +1,76 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SCALV
+
+SCALV(PRECISION_CHAR, void)
+{
+    // x := alpha * x, in place, for n real elements with element stride incx
+    // (conjalpha has no effect on real data).
+    const DATATYPE* restrict scale = alpha_;
+    DATATYPE* restrict vec = x_;
+    (void) conjalpha; // Unused for real types; silences the compiler warning.
+
+    if (n <= 0) return;
+    if (*scale == 1) return; // alpha == 1: x is unchanged
+    // alpha == 0: fill x through the setv kernel instead of multiplying.
+    if (*scale == 0) {
+        SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, scale, vec, incx, cntx);
+        return;
+    }
+
+    const DATATYPE factor = *scale;
+    size_t remaining = n;
+    while (remaining) {
+        const size_t count = VSETVL(PREC, LMUL)(remaining);
+        RVV_TYPE_F(PREC, LMUL) chunk;
+
+        // Strided accesses take their stride in bytes.
+        if (incx != 1)
+            chunk = VLSE_V_F(PREC, LMUL)(vec, FLT_SIZE * incx, count);
+        else
+            chunk = VLE_V_F(PREC, LMUL) (vec, count);
+        chunk = VFMUL_VF(PREC, LMUL)(chunk, factor, count);
+        if (incx != 1)
+            VSSE_V_F(PREC, LMUL)(vec, FLT_SIZE * incx, chunk, count);
+        else
+            VSE_V_F(PREC, LMUL) (vec, chunk, count);
+
+        vec += count * incx;
+        remaining -= count;
+    }
+}
+
+#endif // SCALV
diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c
new file mode 100644
index 000000000..ef9091f16
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c
@@ -0,0 +1,204 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+// Sets n single-precision elements of x, spaced incx elements apart, to
+// *alpha. conjalpha and cntx are unused.
+void bli_ssetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+                    void * restrict x_, inc_t incx, const cntx_t *cntx) {
+    (void)conjalpha;
+    (void)cntx;
+    const float* restrict alpha = alpha_;
+    float* restrict x = x_;
+    if (n <= 0) return;
+
+    // Broadcast *alpha into v0 with a zero-stride load. "memory" tells the
+    // compiler that the asm touches memory not named in its operands.
+    __asm__ volatile("vsetvli zero, %0, e%1, m8, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+    __asm__ volatile(VLSE "v0, (%0), zero" : : "r"(alpha) : "memory");
+    incx *= FLT_SIZE; // from here on incx is a byte stride
+
+    size_t avl = n;
+    while (avl) {
+        size_t vl;
+        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        if (incx == FLT_SIZE)
+            __asm__ volatile(VSE "v0, (%0)" : : "r"(x) : "memory");
+        else
+            __asm__ volatile(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx) : "memory");
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+        avl -= vl;
+    }
+}
+
+#undef FLT_SIZE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+// Sets n double-precision elements of x, spaced incx elements apart, to
+// *alpha. conjalpha and cntx are unused.
+void bli_dsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+                    void * restrict x_, inc_t incx, const cntx_t *cntx) {
+    (void)conjalpha;
+    (void)cntx;
+    const double* restrict alpha = alpha_;
+    double* restrict x = x_;
+    if (n <= 0) return;
+
+    // Broadcast *alpha into v0 with a zero-stride load. "memory" tells the
+    // compiler that the asm touches memory not named in its operands.
+    __asm__ volatile("vsetvli zero, %0, e%1, m8, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+    __asm__ volatile(VLSE "v0, (%0), zero" : : "r"(alpha) : "memory");
+    incx *= FLT_SIZE; // from here on incx is a byte stride
+
+    size_t avl = n;
+    while (avl) {
+        size_t vl;
+        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        if (incx == FLT_SIZE)
+            __asm__ volatile(VSE "v0, (%0)" : : "r"(x) : "memory");
+        else
+            __asm__ volatile(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx) : "memory");
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+        avl -= vl;
+    }
+}
+
+#undef FLT_SIZE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define VLSE "vlse32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+
+// Sets n single-precision complex elements of x, spaced incx elements apart,
+// to *alpha, or to conj(*alpha) when conjalpha == BLIS_CONJUGATE. cntx is unused.
+void bli_csetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+                    void * restrict x_, inc_t incx, const cntx_t *cntx) {
+    (void)cntx;
+    const scomplex* restrict alpha = alpha_;
+    scomplex* restrict x = x_;
+    if (n <= 0) return;
+
+    // Broadcast Re(alpha) into v0 and Im(alpha) into v4 with zero-stride loads.
+    // Both addresses are C operands, so no scratch register is written behind the
+    // compiler's back; "memory" covers the accesses not named in the operands.
+    __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+    __asm__ volatile(VLSE "v0, (%0), zero" : : "r"(&alpha->real) : "memory");
+    __asm__ volatile(VLSE "v4, (%0), zero" : : "r"(&alpha->imag) : "memory");
+    if (conjalpha == BLIS_CONJUGATE)
+        __asm__("vfneg.v v4, v4");
+    incx *= 2 * FLT_SIZE; // from here on incx is a byte stride
+
+    size_t avl = n;
+    while (avl) {
+        size_t vl;
+        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        if (incx == 2 * FLT_SIZE)
+            __asm__ volatile(VSSEG2 "v0, (%0)" : : "r"(x) : "memory");
+        else
+            __asm__ volatile(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx) : "memory");
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+        avl -= vl;
+    }
+}
+
+#undef FLT_SIZE
+#undef VLSE
+#undef VSSEG2
+#undef VSSSEG2
+
+#define FLT_SIZE 8
+#define VLSE "vlse64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+
+// Sets n double-precision complex elements of x, spaced incx elements apart,
+// to *alpha, or to conj(*alpha) when conjalpha == BLIS_CONJUGATE. cntx is unused.
+void bli_zsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+                    void * restrict x_, inc_t incx, const cntx_t *cntx) {
+    (void)cntx;
+    const dcomplex* restrict alpha = alpha_;
+    dcomplex* restrict x = x_;
+    if (n <= 0) return;
+
+    // Broadcast Re(alpha) into v0 and Im(alpha) into v4 with zero-stride loads.
+    // Both addresses are C operands, so no scratch register is written behind the
+    // compiler's back; "memory" covers the accesses not named in the operands.
+    __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+    __asm__ volatile(VLSE "v0, (%0), zero" : : "r"(&alpha->real) : "memory");
+    __asm__ volatile(VLSE "v4, (%0), zero" : : "r"(&alpha->imag) : "memory");
+    if (conjalpha == BLIS_CONJUGATE)
+        __asm__("vfneg.v v4, v4");
+    incx *= 2 * FLT_SIZE; // from here on incx is a byte stride
+
+    size_t avl = n;
+    while (avl) {
+        size_t vl;
+        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        if (incx == 2 * FLT_SIZE)
+            __asm__ volatile(VSSEG2 "v0, (%0)" : : "r"(x) : "memory");
+        else
+            __asm__ volatile(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx) : "memory");
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+        avl -= vl;
+    }
+}
diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c
new file mode 100644
index 000000000..e6b483a3f
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c
@@ -0,0 +1,118 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <stdint.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define SUBV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##subv_sifive_x280_intr(\
+          conj_t           conjx,          \
+          dim_t            n,              \
+    const T*      restrict x_, inc_t incx, \
+          T*      restrict y_, inc_t incy, \
+    const cntx_t* cntx                     \
+)
+// SUBV adds one expansion level so PRECISION_CHAR is macro-expanded before SUBV_ pastes it with ##.
+#define SUBV(...)  SUBV_(__VA_ARGS__)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float) // bytes per real scalar; the templates scale strides by it
+
+#include "./bli_subv_sifive_x280_intr_real.c" // defines bli_ssubv_sifive_x280_intr
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_subv_sifive_x280_intr_real.c" // defines bli_dsubv_sifive_x280_intr
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float // real type the complex pointers are cast to for segment loads/stores
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_subv_sifive_x280_intr_complex.c" // defines bli_csubv_sifive_x280_intr
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_subv_sifive_x280_intr_complex.c" // defines bli_zsubv_sifive_x280_intr
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef SUBV
+#undef SUBV_
diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..2d4a1a017
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c
@@ -0,0 +1,89 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SUBV
+
+// Complex subv kernel template: y := y - conjx(x) over n elements, with element strides
+// incx and incy. x is conjugated unless conjx == BLIS_NO_CONJUGATE; cntx is unused.
+SUBV(PRECISION_CHAR, void)
+{
+    (void) cntx;
+    if (n <= 0) return;
+
+    const DATATYPE* restrict x_ptr = x_;
+    DATATYPE* restrict y_ptr = y_;
+    const int x_unit = (incx == 1);
+    const int y_unit = (incy == 1);
+
+    for (size_t remaining = n; remaining != 0; ) {
+        const size_t vl = VSETVL(PREC, LMUL)(remaining);
+        RVV_TYPE_FX(PREC, LMUL, 2) x_seg, y_seg;
+
+        // Two-field segment loads; field 0 is used as the real part, field 1 as the imaginary part.
+        if (x_unit)
+            x_seg = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x_ptr, vl);
+        else
+            x_seg = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x_ptr, 2*FLT_SIZE*incx, vl);
+        if (y_unit)
+            y_seg = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y_ptr, vl);
+        else
+            y_seg = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y_ptr, 2*FLT_SIZE*incy, vl);
+
+        RVV_TYPE_F(PREC, LMUL) x_re = VGET_V_F(PREC, LMUL, 2)(x_seg, 0);
+        RVV_TYPE_F(PREC, LMUL) x_im = VGET_V_F(PREC, LMUL, 2)(x_seg, 1);
+        RVV_TYPE_F(PREC, LMUL) y_re = VGET_V_F(PREC, LMUL, 2)(y_seg, 0);
+        RVV_TYPE_F(PREC, LMUL) y_im = VGET_V_F(PREC, LMUL, 2)(y_seg, 1);
+
+        // Real parts always subtract; the imaginary part adds when x is conjugated.
+        y_re = VFSUB_VV(PREC, LMUL)(y_re, x_re, vl);
+        if (conjx != BLIS_NO_CONJUGATE)
+            y_im = VFADD_VV(PREC, LMUL)(y_im, x_im, vl);
+        else
+            y_im = VFSUB_VV(PREC, LMUL)(y_im, x_im, vl);
+
+        y_seg = VSET_V_F(PREC, LMUL, 2)(y_seg, 0, y_re);
+        y_seg = VSET_V_F(PREC, LMUL, 2)(y_seg, 1, y_im);
+        if (y_unit)
+            VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y_ptr, y_seg, vl);
+        else
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y_ptr, 2*FLT_SIZE*incy, y_seg, vl);
+
+        x_ptr += vl*incx;
+        y_ptr += vl*incy;
+        remaining -= vl;
+    }
+}
+
+#endif // SUBV
diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c
new file mode 100644
index 000000000..b15859431
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c
@@ -0,0 +1,77 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SUBV
+
+// Real subv kernel template: y := y - x over n elements with element strides incx, incy.
+// conjx has no effect on real data and cntx is unused.
+SUBV(PRECISION_CHAR, void)
+{
+    (void) cntx;
+    (void) conjx;
+    if (n <= 0) return;
+
+    const DATATYPE* restrict x_ptr = x_;
+    DATATYPE* restrict y_ptr = y_;
+    const int x_unit = (incx == 1);
+    const int y_unit = (incy == 1);
+
+    for (size_t remaining = n; remaining != 0; ) {
+        const size_t vl = VSETVL(PREC, LMUL)(remaining);
+        RVV_TYPE_F(PREC, LMUL) x_vals, y_vals;
+
+        // Unit-stride operands use the plain load/store; others pass a byte stride.
+        if (x_unit)
+            x_vals = VLE_V_F(PREC, LMUL)(x_ptr, vl);
+        else
+            x_vals = VLSE_V_F(PREC, LMUL)(x_ptr, FLT_SIZE * incx, vl);
+        if (y_unit)
+            y_vals = VLE_V_F(PREC, LMUL)(y_ptr, vl);
+        else
+            y_vals = VLSE_V_F(PREC, LMUL)(y_ptr, FLT_SIZE * incy, vl);
+
+        y_vals = VFSUB_VV(PREC, LMUL)(y_vals, x_vals, vl);
+
+        if (y_unit)
+            VSE_V_F(PREC, LMUL)(y_ptr, y_vals, vl);
+        else
+            VSSE_V_F(PREC, LMUL)(y_ptr, FLT_SIZE * incy, y_vals, vl);
+        x_ptr += vl * incx;
+        y_ptr += vl * incy;
+        remaining -= vl;
+    }
+}
+
+#endif // SUBV
diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c
new file mode 100644
index 000000000..2342e254a
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c
@@ -0,0 +1,245 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+// Exchanges the n single-precision elements of x and y; incx and incy are element
+// strides and cntx is unused. Returns without touching memory when n <= 0.
+void bli_sswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, void * restrict y_,
+                     inc_t incy, const cntx_t *cntx) {
+    (void)cntx;
+    if (n <= 0) return;
+    float* restrict x_ptr = x_;
+    float* restrict y_ptr = y_;
+    incx *= FLT_SIZE; // strides are in bytes from here on
+    incy *= FLT_SIZE;
+    const bool x_unit = (incx == FLT_SIZE);
+    const bool y_unit = (incy == FLT_SIZE);
+    for (size_t remaining = n; remaining != 0; ) {
+        size_t vl;
+        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+                         : "=r"(vl)
+                         : "r"(remaining), "i"(8 * FLT_SIZE));
+        // Load x into v0 and y into v8.
+        if (x_unit)
+            __asm__(VLE "v0, (%0)" : : "r"(x_ptr));
+        else
+            __asm__(VLSE "v0, (%0), %1" : : "r"(x_ptr), "r"(incx));
+        if (y_unit)
+            __asm__(VLE "v8, (%0)" : : "r"(y_ptr));
+        else
+            __asm__(VLSE "v8, (%0), %1" : : "r"(y_ptr), "r"(incy));
+        // Store crosswise: v8 (old y) goes to x and v0 (old x) goes to y.
+        if (x_unit)
+            __asm__(VSE "v8, (%0)" : : "r"(x_ptr));
+        else
+            __asm__(VSSE "v8, (%0), %1" : : "r"(x_ptr), "r"(incx));
+        if (y_unit)
+            __asm__(VSE "v0, (%0)" : : "r"(y_ptr));
+        else
+            __asm__(VSSE "v0, (%0), %1" : : "r"(y_ptr), "r"(incy));
+        __asm__("add %0, %0, %1" : "+r"(x_ptr) : "r"(vl * incx));
+        __asm__("add %0, %0, %1" : "+r"(y_ptr) : "r"(vl * incy));
+        remaining -= vl;
+    }
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+// Exchanges the n double-precision elements of x and y; incx and incy are element
+// strides and cntx is unused. Returns without touching memory when n <= 0.
+void bli_dswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+                     void * restrict y_, inc_t incy, const cntx_t *cntx) {
+    (void)cntx;
+    if (n <= 0) return;
+    double* restrict x_ptr = x_;
+    double* restrict y_ptr = y_;
+    incx *= FLT_SIZE; // strides are in bytes from here on
+    incy *= FLT_SIZE;
+    const bool x_unit = (incx == FLT_SIZE);
+    const bool y_unit = (incy == FLT_SIZE);
+    for (size_t remaining = n; remaining != 0; ) {
+        size_t vl;
+        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+                         : "=r"(vl)
+                         : "r"(remaining), "i"(8 * FLT_SIZE));
+        // Load x into v0 and y into v8.
+        if (x_unit)
+            __asm__(VLE "v0, (%0)" : : "r"(x_ptr));
+        else
+            __asm__(VLSE "v0, (%0), %1" : : "r"(x_ptr), "r"(incx));
+        if (y_unit)
+            __asm__(VLE "v8, (%0)" : : "r"(y_ptr));
+        else
+            __asm__(VLSE "v8, (%0), %1" : : "r"(y_ptr), "r"(incy));
+        // Store crosswise: v8 (old y) goes to x and v0 (old x) goes to y.
+        if (x_unit)
+            __asm__(VSE "v8, (%0)" : : "r"(x_ptr));
+        else
+            __asm__(VSSE "v8, (%0), %1" : : "r"(x_ptr), "r"(incx));
+        if (y_unit)
+            __asm__(VSE "v0, (%0)" : : "r"(y_ptr));
+        else
+            __asm__(VSSE "v0, (%0), %1" : : "r"(y_ptr), "r"(incy));
+        __asm__("add %0, %0, %1" : "+r"(x_ptr) : "r"(vl * incx));
+        __asm__("add %0, %0, %1" : "+r"(y_ptr) : "r"(vl * incy));
+        remaining -= vl;
+    }
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+// Exchanges the n single-precision complex elements of x and y. Each element is moved
+// as one 64-bit unit (e64 loads/stores), so no segment accesses are needed; cntx is unused.
+void bli_cswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+                     void * restrict y_, inc_t incy, const cntx_t *cntx) {
+    (void)cntx;
+    if (n <= 0) return;
+    scomplex* restrict x_ptr = x_;
+    scomplex* restrict y_ptr = y_;
+    incx *= 2 * FLT_SIZE; // strides are in bytes from here on
+    incy *= 2 * FLT_SIZE;
+    const bool x_unit = (incx == 2 * FLT_SIZE);
+    const bool y_unit = (incy == 2 * FLT_SIZE);
+    for (size_t remaining = n; remaining != 0; ) {
+        size_t vl;
+        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+                         : "=r"(vl)
+                         : "r"(remaining), "i"(8 * 2 * FLT_SIZE));
+        // Load x into v0 and y into v8.
+        if (x_unit)
+            __asm__(VLE "v0, (%0)" : : "r"(x_ptr));
+        else
+            __asm__(VLSE "v0, (%0), %1" : : "r"(x_ptr), "r"(incx));
+        if (y_unit)
+            __asm__(VLE "v8, (%0)" : : "r"(y_ptr));
+        else
+            __asm__(VLSE "v8, (%0), %1" : : "r"(y_ptr), "r"(incy));
+        // Store crosswise: v8 (old y) goes to x and v0 (old x) goes to y.
+        if (x_unit)
+            __asm__(VSE "v8, (%0)" : : "r"(x_ptr));
+        else
+            __asm__(VSSE "v8, (%0), %1" : : "r"(x_ptr), "r"(incx));
+        if (y_unit)
+            __asm__(VSE "v0, (%0)" : : "r"(y_ptr));
+        else
+            __asm__(VSSE "v0, (%0), %1" : : "r"(y_ptr), "r"(incy));
+        __asm__("add %0, %0, %1" : "+r"(x_ptr) : "r"(vl * incx));
+        __asm__("add %0, %0, %1" : "+r"(y_ptr) : "r"(vl * incy));
+        remaining -= vl;
+    }
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+
+// Exchanges the n double-precision complex elements of x and y using two-field segment
+// loads/stores; incx and incy are element strides and cntx is unused.
+void bli_zswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+                     void * restrict y_, inc_t incy, const cntx_t *cntx) {
+    (void)cntx;
+    if (n <= 0) return;
+    dcomplex* restrict x_ptr = x_;
+    dcomplex* restrict y_ptr = y_;
+    incx *= 2 * FLT_SIZE; // strides are in bytes from here on
+    incy *= 2 * FLT_SIZE;
+    const bool x_unit = (incx == 2 * FLT_SIZE);
+    const bool y_unit = (incy == 2 * FLT_SIZE);
+    for (size_t remaining = n; remaining != 0; ) {
+        size_t vl;
+        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+                         : "=r"(vl)
+                         : "r"(remaining), "i"(8 * FLT_SIZE));
+        // Load x into the segment group starting at v0 and y into the one starting at v8.
+        if (x_unit)
+            __asm__(VLSEG2 "v0, (%0)" : : "r"(x_ptr));
+        else
+            __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x_ptr), "r"(incx));
+        if (y_unit)
+            __asm__(VLSEG2 "v8, (%0)" : : "r"(y_ptr));
+        else
+            __asm__(VLSSEG2 "v8, (%0), %1" : : "r"(y_ptr), "r"(incy));
+        // Store crosswise: v8 (old y) goes to x and v0 (old x) goes to y.
+        if (x_unit)
+            __asm__(VSSEG2 "v8, (%0)" : : "r"(x_ptr));
+        else
+            __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x_ptr), "r"(incx));
+        if (y_unit)
+            __asm__(VSSEG2 "v0, (%0)" : : "r"(y_ptr));
+        else
+            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y_ptr), "r"(incy));
+        __asm__("add %0, %0, %1" : "+r"(x_ptr) : "r"(vl * incx));
+        __asm__("add %0, %0, %1" : "+r"(y_ptr) : "r"(vl * incy));
+        remaining -= vl;
+    }
+}
diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c
new file mode 100644
index 000000000..dce4085bf
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c
@@ -0,0 +1,122 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <stdint.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define XPBYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##xpbyv_sifive_x280_intr(\
+          conj_t           conjx,          \
+          dim_t            n,              \
+    const T*      restrict x_, inc_t incx, \
+    const T*      restrict beta_,          \
+          T*      restrict y_, inc_t incy, \
+    const cntx_t* restrict cntx            \
+)
+// XPBYV adds one expansion level so PRECISION_CHAR is macro-expanded before XPBYV_ pastes it with ##.
+#define XPBYV(...)  XPBYV_(__VA_ARGS__)
+// COPYV(ch) names the bli_<ch>copyv_sifive_x280_asm kernel that the templates call when beta is zero.
+#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm
+#define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float) // bytes per real scalar; the templates scale strides by it
+
+#include "./bli_xpbyv_sifive_x280_intr_real.c" // defines bli_sxpbyv_sifive_x280_intr
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_xpbyv_sifive_x280_intr_real.c" // defines bli_dxpbyv_sifive_x280_intr
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float // real type the complex pointers are cast to for segment loads/stores
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_xpbyv_sifive_x280_intr_complex.c" // defines bli_cxpbyv_sifive_x280_intr
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_xpbyv_sifive_x280_intr_complex.c" // defines bli_zxpbyv_sifive_x280_intr
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef XPBYV
+#undef XPBYV_
diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..4c86e8b36
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c
@@ -0,0 +1,101 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef XPBYV
+
+// Complex xpbyv kernel template: y := beta * y + conjx(x) over n elements with element
+// strides incx and incy. x is conjugated unless conjx == BLIS_NO_CONJUGATE.
+XPBYV(PRECISION_CHAR, void)
+{
+    if (n <= 0) return;
+
+    const DATATYPE* restrict beta = beta_;
+    const DATATYPE* restrict x_ptr = x_;
+    DATATYPE* restrict y_ptr = y_;
+
+    // beta == 0: y is not read at all; hand the whole operation to the copyv kernel.
+    if (beta->real == 0 && beta->imag == 0) {
+        COPYV(PRECISION_CHAR)(conjx, n, x_ptr, incx, y_ptr, incy, cntx);
+        return;
+    }
+
+    // TO DO (optimization): beta = +-1, +-i special cases
+
+    const int x_unit = (incx == 1);
+    const int y_unit = (incy == 1);
+    for (size_t remaining = n; remaining != 0; ) {
+        const size_t vl = VSETVL(PREC, LMUL)(remaining);
+        RVV_TYPE_FX(PREC, LMUL, 2) x_seg, y_seg;
+
+        if (x_unit)
+            x_seg = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x_ptr, vl);
+        else
+            x_seg = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x_ptr, 2*FLT_SIZE*incx, vl);
+        if (y_unit)
+            y_seg = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y_ptr, vl);
+        else
+            y_seg = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y_ptr, 2*FLT_SIZE*incy, vl);
+
+        RVV_TYPE_F(PREC, LMUL) x_re = VGET_V_F(PREC, LMUL, 2)(x_seg, 0);
+        RVV_TYPE_F(PREC, LMUL) x_im = VGET_V_F(PREC, LMUL, 2)(x_seg, 1);
+        RVV_TYPE_F(PREC, LMUL) y_re = VGET_V_F(PREC, LMUL, 2)(y_seg, 0);
+        RVV_TYPE_F(PREC, LMUL) y_im = VGET_V_F(PREC, LMUL, 2)(y_seg, 1);
+
+        // The result is accumulated into x's vectors with FMAs as follows:
+        // y[i].real = (      x[i].real  + beta.real * y[i].real) - beta.imag * y[i].imag
+        // y[i].imag = (conjx(x[i].imag) + beta.imag * y[i].real) + beta.real * y[i].imag
+        x_re = VFMACC_VF(PREC, LMUL)(x_re, beta->real, y_re, vl);
+        x_re = VFNMSAC_VF(PREC, LMUL)(x_re, beta->imag, y_im, vl);
+        if (conjx != BLIS_NO_CONJUGATE)
+            x_im = VFMSAC_VF(PREC, LMUL)(x_im, beta->imag, y_re, vl);
+        else
+            x_im = VFMACC_VF(PREC, LMUL)(x_im, beta->imag, y_re, vl);
+        x_im = VFMACC_VF(PREC, LMUL)(x_im, beta->real, y_im, vl);
+
+        x_seg = VSET_V_F(PREC, LMUL, 2)(x_seg, 0, x_re);
+        x_seg = VSET_V_F(PREC, LMUL, 2)(x_seg, 1, x_im);
+
+        if (y_unit)
+            VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y_ptr, x_seg, vl);
+        else
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y_ptr, 2*FLT_SIZE*incy, x_seg, vl);
+
+        x_ptr += vl*incx;
+        y_ptr += vl*incy;
+        remaining -= vl;
+    }
+}
+
+#endif // XPBYV
diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c
new file mode 100644
index 000000000..b23272fea
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c
@@ -0,0 +1,84 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef XPBYV
+
+// Real xpbyv kernel template: y := beta * y + x over n elements with element strides
+// incx and incy. conjx has no effect on real data.
+XPBYV(PRECISION_CHAR, void)
+{
+    (void) conjx;
+    if (n <= 0) return;
+
+    const DATATYPE* restrict beta = beta_;
+    const DATATYPE* restrict x_ptr = x_;
+    DATATYPE* restrict y_ptr = y_;
+
+    // beta == 0: y is not read at all; hand the whole operation to the copyv kernel.
+    if (*beta == 0) {
+        COPYV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, x_ptr, incx, y_ptr, incy, cntx);
+        return;
+    }
+
+    // TO DO (optimization): beta = +-1 special cases
+    const int x_unit = (incx == 1);
+    const int y_unit = (incy == 1);
+    for (size_t remaining = n; remaining != 0; ) {
+        const size_t vl = VSETVL(PREC, LMUL)(remaining);
+        RVV_TYPE_F(PREC, LMUL) x_vals, y_vals;
+
+        if (x_unit)
+            x_vals = VLE_V_F(PREC, LMUL)(x_ptr, vl);
+        else
+            x_vals = VLSE_V_F(PREC, LMUL)(x_ptr, FLT_SIZE * incx, vl);
+        if (y_unit)
+            y_vals = VLE_V_F(PREC, LMUL)(y_ptr, vl);
+        else
+            y_vals = VLSE_V_F(PREC, LMUL)(y_ptr, FLT_SIZE * incy, vl);
+
+        // y_vals := beta * y_vals + x_vals
+        y_vals = VFMADD_VF(PREC, LMUL)(y_vals, *beta, x_vals, vl);
+
+        if (y_unit)
+            VSE_V_F(PREC, LMUL)(y_ptr, y_vals, vl);
+        else
+            VSSE_V_F(PREC, LMUL)(y_ptr, FLT_SIZE * incy, y_vals, vl);
+        x_ptr += vl * incx;
+        y_ptr += vl * incy;
+        remaining -= vl;
+    }
+}
+
+#endif // XPBYV
diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c
new file mode 100644
index 000000000..1b5ce3b96
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c
@@ -0,0 +1,122 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <stdint.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define AXPY2V_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpy2v_sifive_x280_intr(\
+          conj_t            conjx,              \
+          conj_t            conjy,              \
+          dim_t                 n,              \
+    const T*      restrict alphax_,             \
+    const T*      restrict alphay_,             \
+    const T*      restrict      x_, inc_t incx, \
+    const T*      restrict      y_, inc_t incy, \
+          T*      restrict      z_, inc_t incz, \
+    const cntx_t* restrict   cntx               \
+)
+// AXPY2V_ expands to the signature of bli_<PRECISION_CHAR>axpy2v_sifive_x280_intr with operand pointer type T.
+#define AXPY2V(...)  AXPY2V_(__VA_ARGS__) // indirection so PRECISION_CHAR is macro-expanded before the ## pasting
+
+// Single precision real: instantiates bli_saxpy2v_sifive_x280_intr
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+// The included file defines the kernel body in terms of the macros set above.
+#include "./bli_axpy2v_sifive_x280_intr_real.c"
+// Drop the per-type macros so the next instantiation can redefine them.
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real: instantiates bli_daxpy2v_sifive_x280_intr
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+// Same real kernel body as above, now expanded for double.
+#include "./bli_axpy2v_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex: instantiates bli_caxpy2v_sifive_x280_intr
+#define DATATYPE scomplex
+#define BASE_DT float // real component type; the complex body casts operand pointers to BASE_DT*
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4 // NOTE(review): half the real kernels' m8, presumably because each value is a two-field segment tuple - confirm
+#define FLT_SIZE sizeof(float)
+// The complex body multiplies strides by 2*FLT_SIZE, i.e. FLT_SIZE is the size of one component.
+#include "./bli_axpy2v_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex: instantiates bli_zaxpy2v_sifive_x280_intr
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+// Same complex kernel body as above, now expanded for dcomplex.
+#include "./bli_axpy2v_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+// All four kernels have been emitted; remove the signature helper macros.
+#undef AXPY2V
+#undef AXPY2V_
diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..9b5719827
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c
@@ -0,0 +1,117 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPY2V
+
+AXPY2V(PRECISION_CHAR, void)
+{
+    // Complex kernel: z := z + alphax * conjx(x) + alphay * conjy(y)
+    const DATATYPE* restrict ax = alphax_;
+    const DATATYPE* restrict ay = alphay_;
+    const DATATYPE* restrict px = x_;
+    const DATATYPE* restrict py = y_;
+    DATATYPE* restrict pz = z_;
+
+    if (n <= 0)
+        return;
+
+    size_t vl;
+
+    for (size_t remaining = n; remaining != 0; remaining -= vl) {
+        vl = VSETVL(PREC, LMUL)(remaining);
+        RVV_TYPE_FX(PREC, LMUL, 2) xseg, yseg, zseg;
+        RVV_TYPE_F(PREC, LMUL) x_re, x_im, y_re, y_im, z_re, z_im;
+
+        if (incx != 1)
+            xseg = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) px, 2*FLT_SIZE*incx, vl);
+        else
+            xseg = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) px, vl);
+
+        if (incy != 1)
+            yseg = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) py, 2*FLT_SIZE*incy, vl);
+        else
+            yseg = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) py, vl);
+
+        if (incz != 1)
+            zseg = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) pz, 2*FLT_SIZE*incz, vl);
+        else
+            zseg = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) pz, vl);
+
+        x_re = VGET_V_F(PREC, LMUL, 2)(xseg, 0);
+        x_im = VGET_V_F(PREC, LMUL, 2)(xseg, 1);
+        y_re = VGET_V_F(PREC, LMUL, 2)(yseg, 0);
+        y_im = VGET_V_F(PREC, LMUL, 2)(yseg, 1);
+        z_re = VGET_V_F(PREC, LMUL, 2)(zseg, 0);
+        z_im = VGET_V_F(PREC, LMUL, 2)(zseg, 1);
+
+        // Add alphax * conjx(x); conjugation only flips the sign of the x_im terms
+        z_re = VFMACC_VF(PREC, LMUL)( z_re, ax->real, x_re, vl);
+        z_im = VFMACC_VF(PREC, LMUL)( z_im, ax->imag, x_re, vl);
+        if (conjx != BLIS_NO_CONJUGATE){
+            z_re = VFMACC_VF(PREC, LMUL)( z_re, ax->imag, x_im, vl);
+            z_im = VFNMSAC_VF(PREC, LMUL)(z_im, ax->real, x_im, vl);
+        } else {
+            z_re = VFNMSAC_VF(PREC, LMUL)(z_re, ax->imag, x_im, vl);
+            z_im = VFMACC_VF(PREC, LMUL)( z_im, ax->real, x_im, vl);
+        }
+
+        // Add alphay * conjy(y); conjugation only flips the sign of the y_im terms
+        z_re = VFMACC_VF(PREC, LMUL)( z_re, ay->real, y_re, vl);
+        z_im = VFMACC_VF(PREC, LMUL)( z_im, ay->imag, y_re, vl);
+        if (conjy != BLIS_NO_CONJUGATE){
+            z_re = VFMACC_VF(PREC, LMUL)( z_re, ay->imag, y_im, vl);
+            z_im = VFNMSAC_VF(PREC, LMUL)(z_im, ay->real, y_im, vl);
+        } else {
+            z_re = VFNMSAC_VF(PREC, LMUL)(z_re, ay->imag, y_im, vl);
+            z_im = VFMACC_VF(PREC, LMUL)( z_im, ay->real, y_im, vl);
+        }
+
+        zseg = VSET_V_F(PREC, LMUL, 2)(zseg, 0, z_re);
+        zseg = VSET_V_F(PREC, LMUL, 2)(zseg, 1, z_im);
+
+        if (incz != 1)
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) pz, 2*FLT_SIZE*incz, zseg, vl);
+        else
+            VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) pz, zseg, vl);
+
+        // Move all three operands past the vl elements handled in this pass.
+        px += vl*incx;
+        py += vl*incy;
+        pz += vl*incz;
+    }
+
+}
+
+#endif // AXPY2V
diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c
new file mode 100644
index 000000000..cebb15997
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c
@@ -0,0 +1,91 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPY2V
+
+AXPY2V(PRECISION_CHAR, void)
+{
+    // Real kernel: z := z + alphax * conjx(x) + alphay * conjy(y).
+    // Conjugation does nothing to real data: z := z + alphax * x + alphay * y.
+
+    (void) conjx; // Neither flag matters for real data; the casts silence warnings
+    (void) conjy;
+    const DATATYPE* restrict ax = alphax_;
+    const DATATYPE* restrict ay = alphay_;
+    const DATATYPE* restrict px = x_;
+    const DATATYPE* restrict py = y_;
+    DATATYPE* restrict pz = z_;
+
+    if (n <= 0)
+        return;
+
+    size_t vl;
+
+    for (size_t remaining = n; remaining != 0; remaining -= vl) {
+        vl = VSETVL(PREC, LMUL)(remaining);
+        RVV_TYPE_F(PREC, LMUL) xs, ys, zs;
+
+        if (incx != 1)
+            xs = VLSE_V_F(PREC, LMUL)(px, FLT_SIZE * incx, vl);
+        else
+            xs = VLE_V_F(PREC, LMUL)(px, vl);
+
+        if (incy != 1)
+            ys = VLSE_V_F(PREC, LMUL)(py, FLT_SIZE * incy, vl);
+        else
+            ys = VLE_V_F(PREC, LMUL)(py, vl);
+
+        if (incz != 1)
+            zs = VLSE_V_F(PREC, LMUL)(pz, FLT_SIZE * incz, vl);
+        else
+            zs = VLE_V_F(PREC, LMUL)(pz, vl);
+
+        zs = VFMACC_VF(PREC, LMUL)(zs, *ax, xs, vl);
+        zs = VFMACC_VF(PREC, LMUL)(zs, *ay, ys, vl);
+
+        if (incz != 1)
+            VSSE_V_F(PREC, LMUL)(pz, FLT_SIZE * incz, zs, vl);
+        else
+            VSE_V_F(PREC, LMUL)(pz, zs, vl);
+
+        // Move all three operands past the vl elements handled in this pass.
+        px += vl*incx;
+        py += vl*incy;
+        pz += vl*incz;
+    }
+
+}
+
+#endif // AXPY2V
diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c
new file mode 100644
index 000000000..43c2ba44e
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c
@@ -0,0 +1,430 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4 // element size in bytes; 8 * FLT_SIZE is passed to vsetvli as the element width
+#define FLT_LOAD "flw " // scalar floating-point load mnemonic
+#define VLE "vle32.v " // unit-stride vector load
+#define VLSE "vlse32.v " // strided vector load
+#define VSE "vse32.v " // unit-stride vector store
+#define VSSE "vsse32.v " // strided vector store
+// Single-precision real axpyf: y := y + alpha * A * x, with A read as m rows by b columns.
+void bli_saxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b, // conja, conjx: ignored; m: rows of A / length of y; b: columns of A / length of x
+                         const void *restrict alpha_, const void *restrict a_, inc_t inca, // alpha_: scalar; a_: matrix A; inca: element stride between rows
+                         inc_t lda, const void *restrict x_, inc_t incx, // lda: element stride between columns; x_, incx: vector x and its stride
+                         void *restrict y_, inc_t incy, const cntx_t *restrict cntx) { // y_, incy: vector updated in place and its stride; cntx: unused
+    (void)conja; // the conjugation flags and the context are not consulted by this kernel
+    (void)conjx;
+    (void)cntx;
+    const float *restrict alpha = alpha_;
+    const float *restrict a = a_;
+    const float *restrict x = x_;
+    float *restrict y = y_;
+    // Nothing to accumulate when either dimension is zero.
+    if (m == 0 || b == 0) // NOTE(review): only zero is rejected; if dim_t is signed, a negative m would wrap in size_t avl below
+        return;
+    __asm__(FLT_LOAD "ft11, (%0)" : : "r"(alpha)); // ft11 := *alpha, read again in the y update; NOTE(review): no clobber list is declared
+    inca *= FLT_SIZE; // from here on all four strides are in bytes, as the address arithmetic below expects
+    lda *= FLT_SIZE;
+    incx *= FLT_SIZE;
+    incy *= FLT_SIZE;
+    size_t avl = m; // rows of y still to be processed
+    while (avl) {
+        // process vl elements of y at a time
+        size_t vl;
+        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        // x_tmp traverses x
+        // a points to the vl x b block of a needed this iteration
+        // a_tmp traverses the columns of this block
+        const float* restrict x_tmp = x;
+        const float* restrict a_tmp = a;
+        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); // ft0 := x[0]
+        if (inca == FLT_SIZE) // rows are contiguous: unit-stride load of the first column
+            __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+        else // otherwise load the column with byte stride inca
+            __asm__(VLSE "v0, (%0), %1" : : "r"(a_tmp), "r"(inca));
+        __asm__("vfmul.vf v0, v0, ft0"); // v0 := A[:,0] * x[0]; the first column initializes the accumulator
+        __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); // step to the next element of x
+        __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); // step to the next column of A
+        // Remaining columns are accumulated into v0 with fused multiply-adds.
+        for (dim_t i = 1; i < b; ++i) {
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); // ft0 := x[i]
+            if (inca == FLT_SIZE)
+                __asm__(VLE "v24, (%0)" : : "r"(a_tmp)); // v24 := A[:,i]
+            else
+                __asm__(VLSE "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+            __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+            __asm__("vfmacc.vf v0, ft0, v24"); // v0 += x[i] * A[:,i]
+        }
+        // Update this slice of y: y += alpha * v0, with v24 reused to hold the loaded y values.
+        if (incy == FLT_SIZE) {
+            __asm__(VLE "v24, (%0)" : : "r"(y));
+            __asm__("vfmacc.vf v24, ft11, v0");
+            __asm__(VSE "v24, (%0)" : : "r"(y));
+        } else {
+            __asm__(VLSE "v24, (%0), %1" : : "r"(y), "r"(incy));
+            __asm__("vfmacc.vf v24, ft11, v0");
+            __asm__(VSSE "v24, (%0), %1" : : "r"(y), "r"(incy));
+        }
+        // Advance a and y past the vl rows just handled; x is re-read from its start for every row block.
+        __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+        avl -= vl;
+    }
+    return;
+}
+// Remove the single-precision macros so they can be redefined below.
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8 // element size in bytes; 8 * FLT_SIZE is passed to vsetvli as the element width
+#define FLT_LOAD "fld " // scalar floating-point load mnemonic
+#define VLE "vle64.v " // unit-stride vector load
+#define VLSE "vlse64.v " // strided vector load
+#define VSE "vse64.v " // unit-stride vector store
+#define VSSE "vsse64.v " // strided vector store
+// Double-precision real axpyf: y := y + alpha * A * x, with A read as m rows by b columns.
+void bli_daxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b, // conja, conjx: ignored; m: rows of A / length of y; b: columns of A / length of x
+                         const void *restrict alpha_, const void *restrict a_, inc_t inca, // alpha_: scalar; a_: matrix A; inca: element stride between rows
+                         inc_t lda, const void *restrict x_, inc_t incx, // lda: element stride between columns; x_, incx: vector x and its stride
+                         void *restrict y_, inc_t incy, const cntx_t *restrict cntx) { // y_, incy: vector updated in place and its stride; cntx: unused
+    (void)conja; // the conjugation flags and the context are not consulted by this kernel
+    (void)conjx;
+    (void)cntx;
+    const double *restrict alpha = alpha_;
+    const double *restrict a = a_;
+    const double *restrict x = x_;
+    double *restrict y = y_;
+    // Nothing to accumulate when either dimension is zero.
+    if (m == 0 || b == 0) // NOTE(review): only zero is rejected; if dim_t is signed, a negative m would wrap in size_t avl below
+        return;
+    __asm__(FLT_LOAD "ft11, (%0)" : : "r"(alpha)); // ft11 := *alpha, read again in the y update; NOTE(review): no clobber list is declared
+    inca *= FLT_SIZE; // from here on all four strides are in bytes, as the address arithmetic below expects
+    lda *= FLT_SIZE;
+    incx *= FLT_SIZE;
+    incy *= FLT_SIZE;
+    size_t avl = m; // rows of y still to be processed
+    while (avl) {
+        // process vl elements of y at a time
+        size_t vl;
+        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        // x_tmp traverses x
+        // a points to the vl x b block of a needed this iteration
+        // a_tmp traverses the columns of this block
+        const double* restrict x_tmp = x;
+        const double* restrict a_tmp = a;
+        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); // ft0 := x[0]
+        if (inca == FLT_SIZE) // rows are contiguous: unit-stride load of the first column
+            __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+        else // otherwise load the column with byte stride inca
+            __asm__(VLSE "v0, (%0), %1" : : "r"(a_tmp), "r"(inca));
+        __asm__("vfmul.vf v0, v0, ft0"); // v0 := A[:,0] * x[0]; the first column initializes the accumulator
+        __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); // step to the next element of x
+        __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); // step to the next column of A
+        // Remaining columns are accumulated into v0 with fused multiply-adds.
+        for (dim_t i = 1; i < b; ++i) {
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); // ft0 := x[i]
+            if (inca == FLT_SIZE)
+                __asm__(VLE "v24, (%0)" : : "r"(a_tmp)); // v24 := A[:,i]
+            else
+                __asm__(VLSE "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+            __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+            __asm__("vfmacc.vf v0, ft0, v24"); // v0 += x[i] * A[:,i]
+        }
+        // Update this slice of y: y += alpha * v0, with v24 reused to hold the loaded y values.
+        if (incy == FLT_SIZE) {
+            __asm__(VLE "v24, (%0)" : : "r"(y));
+            __asm__("vfmacc.vf v24, ft11, v0");
+            __asm__(VSE "v24, (%0)" : : "r"(y));
+        } else {
+            __asm__(VLSE "v24, (%0), %1" : : "r"(y), "r"(incy));
+            __asm__("vfmacc.vf v24, ft11, v0");
+            __asm__(VSSE "v24, (%0), %1" : : "r"(y), "r"(incy));
+        }
+        // Advance a and y past the vl rows just handled; x is re-read from its start for every row block.
+        __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+        avl -= vl;
+    }
+    return;
+}
+// Remove the double-precision macros so they can be redefined below.
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4 // bytes per real component; strides below are scaled by 2 * FLT_SIZE per complex element
+#define FLT_LOAD "flw " // scalar floating-point load mnemonic
+#define VLSEG "vlseg2e32.v " // unit-stride two-field segment load
+#define VLSSEG "vlsseg2e32.v " // strided two-field segment load
+#define VSSEG "vsseg2e32.v " // unit-stride two-field segment store
+#define VSSSEG "vssseg2e32.v " // strided two-field segment store
+// Single-precision complex axpyf: y := y + alpha * conja(A) * conjx(x), with A read as m rows by b columns.
+void bli_caxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b, // conja/conjx: conjugate A / x; m: rows of A / length of y; b: columns of A / length of x
+                         const void *restrict alpha_, const void *restrict a_, // alpha_: complex scalar; a_: matrix A
+                         inc_t inca, inc_t lda, const void *restrict x_, // inca/lda: element strides between rows/columns of A; x_: vector x
+                         inc_t incx, void *restrict y_, inc_t incy, // incx: stride of x; y_, incy: vector updated in place and its stride
+                         const cntx_t *restrict cntx) { // cntx: unused
+    (void)cntx;
+    const scomplex *restrict alpha = alpha_;
+    const scomplex *restrict a = a_;
+    const scomplex *restrict x = x_;
+    scomplex *restrict y = y_;
+    // Nothing to accumulate when either dimension is zero.
+    if (m == 0 || b == 0) // NOTE(review): only zero is rejected; if dim_t is signed, a negative m would wrap in size_t avl below
+        return;
+    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); // ft10 := first component of alpha (used as the real part below)
+    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); // ft11 := second component (used as the imaginary part)
+    inca *= 2 * FLT_SIZE; // from here on all four strides are in bytes
+    lda *= 2 * FLT_SIZE;
+    incx *= 2 * FLT_SIZE;
+    incy *= 2 * FLT_SIZE;
+    size_t avl = m; // rows of y still to be processed
+    while (avl) {
+        size_t vl; // rows handled in this pass, as returned by vsetvli
+        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        const scomplex* restrict x_tmp = x; // walks x, one element per column
+        const scomplex* restrict a_tmp = a; // walks the columns of the current row block
+        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); // ft0 := re(x[0])
+        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE)); // ft1 := im(x[0])
+        if (inca == 2 * FLT_SIZE)
+            __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp)); // segment load: v24 := re(A[:,0]), v28 := im(A[:,0])
+        else
+            __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+        __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); // step to the next element of x
+        __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); // step to the next column of A
+        __asm__("vfmul.vf v0, v24, ft0"); // v0 := re(a) * re(x); v0 is the real accumulator
+        __asm__("vfmul.vf v4, v24, ft1"); // v4 := re(a) * im(x); v4 is the imaginary accumulator
+        if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { // a * x
+            __asm__("vfnmsac.vf v0, ft1, v28"); // v0 -= im(x) * im(a)
+            __asm__("vfmacc.vf v4, ft0, v28"); // v4 += re(x) * im(a)
+        } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) { // a * conj(x)
+            __asm__("vfmacc.vf v0, ft1, v28"); // v0 += im(x) * im(a)
+            __asm__("vfmsac.vf v4, ft0, v28"); // v4 := re(x) * im(a) - v4, negating the vfmul term above
+        } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { // conj(a) * x
+            __asm__("vfmacc.vf v0, ft1, v28"); // v0 += im(x) * im(a)
+            __asm__("vfnmsac.vf v4, ft0, v28"); // v4 -= re(x) * im(a)
+        } else { // conj(a) * conj(x)
+            __asm__("vfnmsac.vf v0, ft1, v28"); // v0 -= im(x) * im(a)
+            __asm__("vfnmacc.vf v4, ft0, v28"); // v4 := -(re(x) * im(a)) - v4
+        }
+        // Remaining columns: the same four products, accumulated with the signs each conjugation case needs.
+        for (dim_t i = 1; i < b; ++i) {
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); // ft0 := re(x[i])
+            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE)); // ft1 := im(x[i])
+            if (inca == 2 * FLT_SIZE)
+                __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp));
+            else
+                __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+            __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+            __asm__("vfmacc.vf v0, ft0, v24"); // v0 += re(x) * re(a), common to all four cases
+            if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { // a * x
+                __asm__("vfmacc.vf v4, ft1, v24");
+                __asm__("vfnmsac.vf v0, ft1, v28");
+                __asm__("vfmacc.vf v4, ft0, v28");
+            } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) { // a * conj(x)
+                __asm__("vfnmsac.vf v4, ft1, v24");
+                __asm__("vfmacc.vf v0, ft1, v28");
+                __asm__("vfmacc.vf v4, ft0, v28");
+            } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { // conj(a) * x
+                __asm__("vfmacc.vf v4, ft1, v24");
+                __asm__("vfmacc.vf v0, ft1, v28");
+                __asm__("vfnmsac.vf v4, ft0, v28");
+            } else { // conja == BLIS_CONJUGATE && conjx == BLIS_CONJUGATE
+                __asm__("vfnmsac.vf v4, ft1, v24");
+                __asm__("vfnmsac.vf v0, ft1, v28");
+                __asm__("vfnmsac.vf v4, ft0, v28");
+            }
+        }
+        // Update this slice of y: y += alpha * (v0 + i*v4); v24/v28 are reused for the loaded parts of y.
+        if (incy == 2 * FLT_SIZE) {
+            __asm__(VLSEG "v24, (%0)" : : "r"(y));
+            __asm__("vfmacc.vf v24, ft10, v0"); // re(y) += re(alpha) * v0
+            __asm__("vfmacc.vf v28, ft10, v4"); // im(y) += re(alpha) * v4
+            __asm__("vfnmsac.vf v24, ft11, v4"); // re(y) -= im(alpha) * v4
+            __asm__("vfmacc.vf v28, ft11, v0"); // im(y) += im(alpha) * v0
+            __asm__(VSSEG "v24, (%0)" : : "r"(y));
+        } else {
+            __asm__(VLSSEG "v24, (%0), %1" : : "r"(y), "r"(incy));
+            __asm__("vfmacc.vf v24, ft10, v0");
+            __asm__("vfmacc.vf v28, ft10, v4");
+            __asm__("vfnmsac.vf v24, ft11, v4");
+            __asm__("vfmacc.vf v28, ft11, v0");
+            __asm__(VSSSEG "v24, (%0), %1" : : "r"(y), "r"(incy));
+        }
+        // Advance a and y past the vl rows just handled; x is re-read from its start for every row block.
+        __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+        avl -= vl;
+    }
+    return;
+}
+// Remove the single-precision complex macros so they can be redefined below.
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLSEG
+#undef VLSSEG
+#undef VSSEG
+#undef VSSSEG
+
+#define FLT_SIZE 8 // bytes per real component; strides below are scaled by 2 * FLT_SIZE per complex element
+#define FLT_LOAD "fld " // scalar floating-point load mnemonic
+#define VLSEG "vlseg2e64.v " // unit-stride two-field segment load
+#define VLSSEG "vlsseg2e64.v " // strided two-field segment load
+#define VSSEG "vsseg2e64.v " // unit-stride two-field segment store
+#define VSSSEG "vssseg2e64.v " // strided two-field segment store
+// Double-precision complex axpyf: y := y + alpha * conja(A) * conjx(x), with A read as m rows by b columns.
+void bli_zaxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b, // conja/conjx: conjugate A / x; m: rows of A / length of y; b: columns of A / length of x
+                         const void *restrict alpha_, const void *restrict a_, // alpha_: complex scalar; a_: matrix A
+                         inc_t inca, inc_t lda, const void *restrict x_, // inca/lda: element strides between rows/columns of A; x_: vector x
+                         inc_t incx, void *restrict y_, inc_t incy, // incx: stride of x; y_, incy: vector updated in place and its stride
+                         const cntx_t *restrict cntx) { // cntx: unused
+    (void)cntx;
+    const dcomplex *restrict alpha = alpha_;
+    const dcomplex *restrict a = a_;
+    const dcomplex *restrict x = x_;
+    dcomplex *restrict y = y_;
+    // Nothing to accumulate when either dimension is zero.
+    if (m == 0 || b == 0) // NOTE(review): only zero is rejected; if dim_t is signed, a negative m would wrap in size_t avl below
+        return;
+    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); // ft10 := first component of alpha (used as the real part below)
+    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); // ft11 := second component (used as the imaginary part)
+    inca *= 2 * FLT_SIZE; // from here on all four strides are in bytes
+    lda *= 2 * FLT_SIZE;
+    incx *= 2 * FLT_SIZE;
+    incy *= 2 * FLT_SIZE;
+    size_t avl = m; // rows of y still to be processed
+    while (avl) {
+        size_t vl; // rows handled in this pass, as returned by vsetvli
+        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+                         : "=r"(vl)
+                         : "r"(avl), "i"(8 * FLT_SIZE));
+        const dcomplex* restrict x_tmp = x; // walks x, one element per column
+        const dcomplex* restrict a_tmp = a; // walks the columns of the current row block
+        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); // ft0 := re(x[0])
+        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE)); // ft1 := im(x[0])
+        if (inca == 2 * FLT_SIZE)
+            __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp)); // segment load: v24 := re(A[:,0]), v28 := im(A[:,0])
+        else
+            __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+        __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); // step to the next element of x
+        __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); // step to the next column of A
+        __asm__("vfmul.vf v0, v24, ft0"); // v0 := re(a) * re(x); v0 is the real accumulator
+        __asm__("vfmul.vf v4, v24, ft1"); // v4 := re(a) * im(x); v4 is the imaginary accumulator
+        if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { // a * x
+            __asm__("vfnmsac.vf v0, ft1, v28"); // v0 -= im(x) * im(a)
+            __asm__("vfmacc.vf v4, ft0, v28"); // v4 += re(x) * im(a)
+        } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) { // a * conj(x)
+            __asm__("vfmacc.vf v0, ft1, v28"); // v0 += im(x) * im(a)
+            __asm__("vfmsac.vf v4, ft0, v28"); // v4 := re(x) * im(a) - v4, negating the vfmul term above
+        } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { // conj(a) * x
+            __asm__("vfmacc.vf v0, ft1, v28"); // v0 += im(x) * im(a)
+            __asm__("vfnmsac.vf v4, ft0, v28"); // v4 -= re(x) * im(a)
+        } else { // conj(a) * conj(x)
+            __asm__("vfnmsac.vf v0, ft1, v28"); // v0 -= im(x) * im(a)
+            __asm__("vfnmacc.vf v4, ft0, v28"); // v4 := -(re(x) * im(a)) - v4
+        }
+        // Remaining columns: the same four products, accumulated with the signs each conjugation case needs.
+        for (dim_t i = 1; i < b; ++i) {
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); // ft0 := re(x[i])
+            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE)); // ft1 := im(x[i])
+            if (inca == 2 * FLT_SIZE)
+                __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp));
+            else
+                __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+            __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+            __asm__("vfmacc.vf v0, ft0, v24"); // v0 += re(x) * re(a), common to all four cases
+            if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { // a * x
+                __asm__("vfmacc.vf v4, ft1, v24");
+                __asm__("vfnmsac.vf v0, ft1, v28");
+                __asm__("vfmacc.vf v4, ft0, v28");
+            } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) { // a * conj(x)
+                __asm__("vfnmsac.vf v4, ft1, v24");
+                __asm__("vfmacc.vf v0, ft1, v28");
+                __asm__("vfmacc.vf v4, ft0, v28");
+            } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { // conj(a) * x
+                __asm__("vfmacc.vf v4, ft1, v24");
+                __asm__("vfmacc.vf v0, ft1, v28");
+                __asm__("vfnmsac.vf v4, ft0, v28");
+            } else { // conja == BLIS_CONJUGATE && conjx == BLIS_CONJUGATE
+                __asm__("vfnmsac.vf v4, ft1, v24");
+                __asm__("vfnmsac.vf v0, ft1, v28");
+                __asm__("vfnmsac.vf v4, ft0, v28");
+            }
+        }
+        // Update this slice of y: y += alpha * (v0 + i*v4); v24/v28 are reused for the loaded parts of y.
+        if (incy == 2 * FLT_SIZE) {
+            __asm__(VLSEG "v24, (%0)" : : "r"(y));
+            __asm__("vfmacc.vf v24, ft10, v0"); // re(y) += re(alpha) * v0
+            __asm__("vfmacc.vf v28, ft10, v4"); // im(y) += re(alpha) * v4
+            __asm__("vfnmsac.vf v24, ft11, v4"); // re(y) -= im(alpha) * v4
+            __asm__("vfmacc.vf v28, ft11, v0"); // im(y) += im(alpha) * v0
+            __asm__(VSSEG "v24, (%0)" : : "r"(y));
+        } else {
+            __asm__(VLSSEG "v24, (%0), %1" : : "r"(y), "r"(incy));
+            __asm__("vfmacc.vf v24, ft10, v0");
+            __asm__("vfmacc.vf v28, ft10, v4");
+            __asm__("vfnmsac.vf v24, ft11, v4");
+            __asm__("vfmacc.vf v28, ft11, v0");
+            __asm__(VSSSEG "v24, (%0), %1" : : "r"(y), "r"(incy));
+        }
+        // Advance a and y past the vl rows just handled; x is re-read from its start for every row block.
+        __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+        avl -= vl;
+    }
+    return;
+}
diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c
new file mode 100644
index 000000000..9cd1071d7
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c
@@ -0,0 +1,122 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <stdint.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+#define DOTAXPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotaxpyv_sifive_x280_intr(\
+          conj_t            conjxt,             \
+          conj_t             conjx,             \
+          conj_t             conjy,             \
+          dim_t                  n,             \
+    const T*      restrict  alpha_,             \
+    const T*      restrict      x_, inc_t incx, \
+    const T*      restrict      y_, inc_t incy, \
+          T*      restrict    rho_,             \
+          T*      restrict      z_, inc_t incz, \
+    const cntx_t* restrict   cntx               \
+)
+// DOTAXPYV_ above expands to the kernel prototype bli_<PRECISION_CHAR>dotaxpyv_sifive_x280_intr; T is the pointee type of the data arguments.
+#define DOTAXPYV(...)  DOTAXPYV_(__VA_ARGS__) // extra expansion pass: arguments such as PRECISION_CHAR are macro-expanded before ## pastes them
+
+// Single precision real: the include below defines bli_sdotaxpyv_sifive_x280_intr (32-bit elements, LMUL m8)
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_dotaxpyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real: the include below defines bli_ddotaxpyv_sifive_x280_intr (64-bit elements, LMUL m8)
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_dotaxpyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex: defines bli_cdotaxpyv_sifive_x280_intr; BASE_DT and FLT_SIZE describe one float component, LMUL is m4
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_dotaxpyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex: defines bli_zdotaxpyv_sifive_x280_intr; BASE_DT and FLT_SIZE describe one double component, LMUL is m4
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_dotaxpyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef DOTAXPYV // the prototype helper macros are only needed by the includes above
+#undef DOTAXPYV_
diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..c3cd06c52
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c
@@ -0,0 +1,151 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTAXPYV
+
+DOTAXPYV(PRECISION_CHAR, void)
+{
+    // Fused kernel over n complex elements: z := z + alpha * conjx(x)
+    //   and  rho := conjxt(x)^T * conjy(y); incx/incy/incz are strides in complex elements.
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict x = x_;
+    const DATATYPE* restrict y = y_;
+    DATATYPE* restrict rho = rho_;
+    DATATYPE* restrict z = z_;
+    (void) cntx; // Never consulted here; suppress unused parameter warnings
+    if (n <= 0)
+        return; // NOTE(review): rho is left unwritten on this path -- confirm callers expect that
+
+    size_t avl = n;
+    bool first = true;
+    RVV_TYPE_F(PREC, LMUL) acc_real, acc_imag; // per-lane partial sums of rho
+
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl);
+        RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec, zvec;
+        RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag, zvec_real, zvec_imag;
+
+        // Loads: two-field segment loads split interleaved (real, imag) pairs; strides are in bytes
+        if (incx == 1)
+            xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+        else
+            xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+        if (incy == 1)
+            yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+        else
+            yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+        if (incz == 1)
+            zvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) z, vl);
+        else
+            zvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z, 2*FLT_SIZE*incz, vl);
+
+        xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+        xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+        yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+        yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+        zvec_real = VGET_V_F(PREC, LMUL, 2)(zvec, 0);
+        zvec_imag = VGET_V_F(PREC, LMUL, 2)(zvec, 1);
+
+        // z := z + alpha * conjx(x); the x_real terms are common, conjx only flips the sign of the x_imag terms
+        zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alpha->real, xvec_real, vl);
+        zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alpha->imag, xvec_real, vl);
+        if (conjx == BLIS_NO_CONJUGATE){
+            zvec_real = VFNMSAC_VF(PREC, LMUL)(zvec_real, alpha->imag, xvec_imag, vl);
+            zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alpha->real, xvec_imag, vl);
+        } else {
+            zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alpha->imag, xvec_imag, vl);
+            zvec_imag = VFNMSAC_VF(PREC, LMUL)(zvec_imag, alpha->real, xvec_imag, vl);
+        }
+
+        // rho := conjxt(x)^T * conjy(y), with x = a + b*i and y = c + d*i.
+        // We accumulate the current term of the dot product as (a*c-b*d) + (a*d+b*c)*i,
+        // conjugating when necessary (conjxt negates b, conjy negates d)
+        if (first) {
+            // Initialize real part:      a*c
+            acc_real = VFMUL_VV(PREC, LMUL)( xvec_real, yvec_real, vl);
+            // Initialize imaginary part: a*d
+            acc_imag = VFMUL_VV(PREC, LMUL)( xvec_real, yvec_imag, vl);
+            if (conjy == BLIS_CONJUGATE)
+                acc_imag = VFNEG_VF(PREC, LMUL)(acc_imag, vl); // TO DO: eliminate this negation
+            first = false;
+        } else { // _TU: presumably tail-undisturbed, so lanes past a short final vl keep their sums -- confirm in riscv_overloaded_intrinsics.h
+            // Accumulate real part:      a*c
+            acc_real = VFMACC_VV_TU(PREC, LMUL)( acc_real, xvec_real, yvec_real, vl);
+            // Accumulate imaginary part: a*d
+            if (conjy == BLIS_NO_CONJUGATE)
+                acc_imag = VFMACC_VV_TU(PREC, LMUL)(acc_imag, xvec_real, yvec_imag, vl);
+            else
+                acc_imag = VFNMSAC_VV_TU(PREC, LMUL)(acc_imag, xvec_real, yvec_imag, vl);
+        }
+        // Finish real part:      b*d (comparisons parenthesized explicitly; '==' binds tighter than '^')
+        if ((conjxt == BLIS_NO_CONJUGATE) ^ (conjy == BLIS_NO_CONJUGATE))
+            // Exactly one is conjugated => add
+            acc_real = VFMACC_VV_TU(PREC, LMUL)(acc_real, xvec_imag, yvec_imag, vl);
+        else
+            acc_real = VFNMSAC_VV_TU(PREC,LMUL)(acc_real, xvec_imag, yvec_imag, vl);
+        // Finish imaginary part: b*c
+        if (conjxt == BLIS_NO_CONJUGATE)
+            acc_imag = VFMACC_VV_TU(PREC, LMUL)( acc_imag, xvec_imag, yvec_real, vl);
+        else
+            acc_imag = VFNMSAC_VV_TU(PREC, LMUL)( acc_imag, xvec_imag, yvec_real, vl);
+
+        // Stores: repack the updated real/imag halves and write z back with the same addressing as the load
+        zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 0, zvec_real);
+        zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 1, zvec_imag);
+
+        if (incz == 1)
+            VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) z, zvec, vl);
+        else
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z, 2*FLT_SIZE*incz, zvec, vl);
+
+        x += vl*incx;
+        y += vl*incy;
+        z += vl*incz;
+        avl -= vl;
+    }
+
+    // Compute rho: sum each accumulator across lanes. NOTE(review): vl is passed as n, not a loop vl -- confirm it is clamped like the first pass
+    RVV_TYPE_F(PREC, m1) sum_real = VFMV_S_F(PREC, m1)( 0.f, 1);
+    RVV_TYPE_F(PREC, m1) sum_imag = VFMV_S_F(PREC, m1)( 0.f, 1);
+    sum_real = VF_REDUSUM_VS(PREC, LMUL)(acc_real, sum_real, n);
+    sum_imag = VF_REDUSUM_VS(PREC, LMUL)(acc_imag, sum_imag, n);
+    rho->real = VFMV_F_S(PREC)(sum_real);
+    rho->imag = VFMV_F_S(PREC)(sum_imag);
+
+}
+
+#endif // ifdef DOTAXPYV
diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c
new file mode 100644
index 000000000..adaf3610b
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c
@@ -0,0 +1,111 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTAXPYV
+
+DOTAXPYV(PRECISION_CHAR, void)
+{
+    // Computes z := z + alpha * conjx(x)
+    //            == z + alpha *    x       (real case)
+    //   and  rho := conjxt(x)^T * conjy(y)
+    //            == x^T * y                (real case)
+    (void) conjx; // Suppress unused parameter warnings: the conj flags are
+    (void) conjxt; // not used for real data and cntx is never consulted.
+    (void) conjy;
+    (void) cntx;
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict x = x_;
+    const DATATYPE* restrict y = y_;
+    DATATYPE* restrict rho = rho_;
+    DATATYPE* restrict z = z_;
+
+    if (n <= 0)
+        return; // NOTE(review): rho is left unwritten on this path -- confirm callers expect that
+
+    size_t avl = n;
+    bool first = true;
+    RVV_TYPE_F(PREC, LMUL) acc; // per-lane partial sums of rho
+
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl);
+        RVV_TYPE_F(PREC, LMUL) xvec, yvec, zvec;
+
+        // Loads: unit-stride when inc == 1, otherwise strided with the stride given in bytes
+        if (incx == 1)
+            xvec = VLE_V_F(PREC, LMUL)(x, vl);
+        else
+            xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+        if (incy == 1)
+            yvec = VLE_V_F(PREC, LMUL)(y, vl);
+        else
+            yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+        if (incz == 1)
+            zvec = VLE_V_F(PREC, LMUL)(z, vl);
+        else
+            zvec = VLSE_V_F(PREC, LMUL)(z, FLT_SIZE * incz, vl);
+
+        // z := z + alpha * x
+        zvec = VFMACC_VF(PREC, LMUL)(zvec, *alpha, xvec, vl);
+
+        // rho := x^T * y (first pass initializes acc, later passes accumulate into it)
+        if (first){
+            acc = VFMUL_VV(PREC, LMUL)( xvec, yvec, vl);
+            first = false;
+        } else { // _TU: presumably tail-undisturbed, so lanes past a short final vl keep their sums -- confirm in riscv_overloaded_intrinsics.h
+            acc  = VFMACC_VV_TU(PREC, LMUL)( acc,   xvec, yvec, vl);
+        }
+
+        // Store the updated z with the same addressing mode as the load
+        if (incz == 1)
+            VSE_V_F(PREC, LMUL)(z, zvec, vl);
+        else
+            VSSE_V_F(PREC, LMUL)(z, FLT_SIZE * incz, zvec, vl);
+
+        x += vl*incx;
+        y += vl*incy;
+        z += vl*incz;
+        avl -= vl;
+    }
+
+    // Compute rho: sum acc across lanes. NOTE(review): vl is passed as n, not a loop vl -- confirm it is clamped like the first pass
+    RVV_TYPE_F(PREC, m1) sum = VFMV_S_F(PREC, m1)( 0.f, 1);
+    sum = VF_REDUSUM_VS(PREC, LMUL)(acc, sum, n);
+    *rho = VFMV_F_S(PREC)(sum);
+
+}
+
+#endif // ifdef DOTAXPYV
diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c
new file mode 100644
index 000000000..ecb340707
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c
@@ -0,0 +1,3120 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "../riscv_cmul_macros_asm.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4 // bytes per float: scales the element strides and gives the vsetvli element width as e(8 * FLT_SIZE)
+#define FLT_LOAD "flw " // scalar load mnemonic used for alpha, beta and the x/y scalars
+#define FMUL "fmul.s " // scalar multiply mnemonic used for beta * y
+#define VLE "vle32.v " // vector load mnemonic used on the unit-stride paths
+#define VLSE "vlse32.v " // vector load mnemonic used when a byte stride operand is passed
+#define VSE "vse32.v " // vector store mnemonic used on the unit-stride paths
+#define VSSE "vsse32.v " // vector store mnemonic used when a byte stride operand is passed
+
+void bli_sdotxaxpyf_sifive_x280_asm(
+             conj_t           conjat,
+             conj_t           conja,
+             conj_t           conjw,
+             conj_t           conjx,
+             dim_t            m,
+             dim_t            b,
+       const void*   restrict alpha_,
+       const void*   restrict a_, inc_t inca, inc_t lda,
+       const void*   restrict w_, inc_t incw,
+       const void*   restrict x_, inc_t incx,
+       const void*   restrict beta_,
+             void*   restrict y_, inc_t incy,
+             void*   restrict z_, inc_t incz,
+       const cntx_t* restrict cntx
+                             ) {
+  (void)conjat;
+  (void)conja;
+  (void)conjw;
+  (void)conjx;
+  (void)cntx;
+  const float *restrict alpha = alpha_;
+  const float *restrict beta = beta_;
+  const float *restrict a = a_;
+  const float *restrict w = w_;
+  const float *restrict x = x_;
+  float *restrict y = y_;
+  float *restrict z = z_;
+
+  if (b == 0)
+    return;
+  else if (m == 0 || *alpha == 0.f) {
+    // scale y by beta
+    if (*beta == 0.f)
+        bli_ssetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+    else
+        bli_sscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+    return;
+  }
+
+  __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+  __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+  inca *= FLT_SIZE;
+  lda *= FLT_SIZE;
+  incw *= FLT_SIZE;
+  incx *= FLT_SIZE;
+  incy *= FLT_SIZE;
+  incz *= FLT_SIZE;
+  inc_t a_bump = 5 * lda;
+  while (b >= 5) {
+    // compute dot product of w with 5 rows of a
+    const float* restrict w_tmp = w;
+    const float* restrict z_tmp = z;
+    const float* restrict a_col = a;
+    size_t avl = m;
+    bool first = true;
+    while (avl) {
+      const float* restrict a_row = a_col;
+      size_t vl;
+      __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+      if (incw == FLT_SIZE)
+        __asm__(VLE "v28, (%0)" : : "r"(w_tmp));
+      else
+        __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+      if (inca == FLT_SIZE) {
+        // a unit stride
+        if (first) {
+          __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmul.vf v20, v24, ft0");
+          __asm__("vfmul.vv v0, v24, v28");
+          __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft1, v24");
+          __asm__("vfmul.vv v4, v24, v28");
+          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft2, v24");
+          __asm__("vfmul.vv v8, v24, v28");
+          __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft3, v24");
+          __asm__("vfmul.vv v12, v24, v28");
+          __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+          __asm__("vfmacc.vf v20, ft4, v24");
+          __asm__("vfmul.vv v16, v24, v28");
+          first = false;
+        }
+        else {
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmul.vf v20, v24, ft0");
+          __asm__("vfmacc.vv v0, v24, v28");
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft1, v24");
+          __asm__("vfmacc.vv v4, v24, v28");
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft2, v24");
+          __asm__("vfmacc.vv v8, v24, v28");
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft3, v24");
+          __asm__("vfmacc.vv v12, v24, v28");
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("vfmacc.vf v20, ft4, v24");
+          __asm__("vfmacc.vv v16, v24, v28");
+        }
+      } // end a unit stride
+      else {
+        // a non-unit stride
+        if (first) {
+          __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmul.vf v20, v24, ft0");
+          __asm__("vfmul.vv v0, v24, v28");
+          __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft1, v24");
+          __asm__("vfmul.vv v4, v24, v28");
+          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft2, v24");
+          __asm__("vfmul.vv v8, v24, v28");
+          __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft3, v24");
+          __asm__("vfmul.vv v12, v24, v28");
+          __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+          __asm__("vfmacc.vf v20, ft4, v24");
+          __asm__("vfmul.vv v16, v24, v28");
+          first = false;
+        }
+        else {
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmul.vf v20, v24, ft0");
+          __asm__("vfmacc.vv v0, v24, v28");
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft1, v24");
+          __asm__("vfmacc.vv v4, v24, v28");
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft2, v24");
+          __asm__("vfmacc.vv v8, v24, v28");
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft3, v24");
+          __asm__("vfmacc.vv v12, v24, v28");
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("vfmacc.vf v20, ft4, v24");
+          __asm__("vfmacc.vv v16, v24, v28");
+        }
+      } // end a non-unit stride
+
+      if (incz == FLT_SIZE) {
+        __asm__(VLE "v24, (%0)" : : "r"(z_tmp));
+        __asm__("vfmacc.vf v24, ft10, v20");
+        __asm__(VSE "v24, (%0)" : : "r"(z_tmp));
+      } else {
+        __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+        __asm__("vfmacc.vf v24, ft10, v20");
+        __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+      }
+
+      __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incx));
+      __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+      __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+      avl -= vl;
+    }
+
+    __asm__("vmv.s.x v31, x0");
+
+    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+    __asm__("vfredusum.vs v0, v0, v31");
+    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+    if (*beta == 0.f) {
+      __asm__("vfmul.vf v0, v0, ft10");
+      __asm__(VSE "v0, (%0)" : : "r"(y));
+    }
+    else {
+      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+      __asm__(FMUL "ft0, ft11, ft0");
+      __asm__("vfmv.s.f v30, ft0");
+      __asm__("vfmacc.vf v30, ft10, v0");
+      __asm__(VSE "v30, (%0)" : : "r"(y));
+    }
+    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+    __asm__("vfredusum.vs v4, v4, v31");
+    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+    if (*beta == 0.f) {
+      __asm__("vfmul.vf v4, v4, ft10");
+      __asm__(VSE "v4, (%0)" : : "r"(y));
+    }
+    else {
+      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+      __asm__(FMUL "ft0, ft11, ft0");
+      __asm__("vfmv.s.f v30, ft0");
+      __asm__("vfmacc.vf v30, ft10, v4");
+      __asm__(VSE "v30, (%0)" : : "r"(y));
+    }
+    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+    __asm__("vfredusum.vs v8, v8, v31");
+    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+    if (*beta == 0.f) {
+      __asm__("vfmul.vf v8, v8, ft10");
+      __asm__(VSE "v8, (%0)" : : "r"(y));
+    }
+    else {
+      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+      __asm__(FMUL "ft0, ft11, ft0");
+      __asm__("vfmv.s.f v30, ft0");
+      __asm__("vfmacc.vf v30, ft10, v8");
+      __asm__(VSE "v30, (%0)" : : "r"(y));
+    }
+    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+    __asm__("vfredusum.vs v12, v12, v31");
+    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+    if (*beta == 0.f) {
+      __asm__("vfmul.vf v12, v12, ft10");
+      __asm__(VSE "v12, (%0)" : : "r"(y));
+    }
+    else {
+      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+      __asm__(FMUL "ft0, ft11, ft0");
+      __asm__("vfmv.s.f v30, ft0");
+      __asm__("vfmacc.vf v30, ft10, v12");
+      __asm__(VSE "v30, (%0)" : : "r"(y));
+    }
+    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+    __asm__("vfredusum.vs v16, v16, v31");
+    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+    if (*beta == 0.f) {
+      __asm__("vfmul.vf v16, v16, ft10");
+      __asm__(VSE "v16, (%0)" : : "r"(y));
+    }
+    else {
+      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+      __asm__(FMUL "ft0, ft11, ft0");
+      __asm__("vfmv.s.f v30, ft0");
+      __asm__("vfmacc.vf v30, ft10, v16");
+      __asm__(VSE "v30, (%0)" : : "r"(y));
+    }
+    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+    __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+    b -= 5;
+  }
+
+  if (b > 0) {
+    const float* restrict w_tmp = w;
+    const float* restrict z_tmp = z;
+    const float* restrict a_col;
+    __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+    __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx));
+    size_t avl = m;
+    bool first = true;
+    while (avl) {
+      const float* restrict a_row = a_col;
+      size_t vl;
+      __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+      if (incw == FLT_SIZE)
+        __asm__(VLE "v28, (%0)" : : "r"(w_tmp));
+      else
+        __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+      __asm__("vmv.v.i v20, 0");
+      if (inca == FLT_SIZE) {
+        // a unit stride
+        if (first) {
+          switch (b) {
+          case 4:
+            __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+            __asm__(VLE "v24, (%0)" : : "r"(a_row));
+            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft3, v24");
+            __asm__("vfmul.vv v12, v24, v28");
+          case 3:
+            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+            __asm__(VLE "v24, (%0)" : : "r"(a_row));
+            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft2, v24");
+            __asm__("vfmul.vv v8, v24, v28");
+          case 2:
+            __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+            __asm__(VLE "v24, (%0)" : : "r"(a_row));
+            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft1, v24");
+            __asm__("vfmul.vv v4, v24, v28");
+          case 1:
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+            __asm__(VLE "v24, (%0)" : : "r"(a_row));
+            __asm__("vfmacc.vf v20, ft0, v24");
+            __asm__("vfmul.vv v0, v24, v28");
+          }
+          first = false;
+        }
+        else {
+          switch (b) {
+          case 4:
+            __asm__(VLE "v24, (%0)" : : "r"(a_row));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft3, v24");
+            __asm__("vfmacc.vv v12, v24, v28");
+          case 3:
+            __asm__(VLE "v24, (%0)" : : "r"(a_row));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft2, v24");
+            __asm__("vfmacc.vv v8, v24, v28");
+          case 2:
+            __asm__(VLE "v24, (%0)" : : "r"(a_row));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft1, v24");
+            __asm__("vfmacc.vv v4, v24, v28");
+          case 1:
+            __asm__(VLE "v24, (%0)" : : "r"(a_row));
+            __asm__("vfmacc.vf v20, ft0, v24");
+            __asm__("vfmacc.vv v0, v24, v28");
+          }
+        }
+      } // end a unit stride
+      else {
+        // a non-unit stride
+        if (first) {
+          switch (b) {
+          case 4:
+            __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft3, v24");
+            __asm__("vfmul.vv v12, v24, v28");
+          case 3:
+            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft2, v24");
+            __asm__("vfmul.vv v8, v24, v28");
+          case 2:
+            __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft1, v24");
+            __asm__("vfmul.vv v4, v24, v28");
+          case 1:
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+            __asm__("vfmacc.vf v20, ft0, v24");
+            __asm__("vfmul.vv v0, v24, v28");
+          }
+          first = false;
+        }
+        else {
+          switch (b) {
+          case 4:
+            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft3, v24");
+            __asm__("vfmacc.vv v12, v24, v28");
+          case 3:
+            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft2, v24");
+            __asm__("vfmacc.vv v8, v24, v28");
+          case 2:
+            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft1, v24");
+            __asm__("vfmacc.vv v4, v24, v28");
+          case 1:
+            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+            __asm__("vfmacc.vf v20, ft0, v24");
+            __asm__("vfmacc.vv v0, v24, v28");
+          }
+        }
+      } // end a non-unit stride
+
+      if (incz == FLT_SIZE) {
+        __asm__(VLE "v24, (%0)" : : "r"(z_tmp));
+        __asm__("vfmacc.vf v24, ft10, v20");
+        __asm__(VSE "v24, (%0)" : : "r"(z_tmp));
+      } else {
+        __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+        __asm__("vfmacc.vf v24, ft10, v20");
+        __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+      }
+
+      __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+      __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+      __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+      avl -= vl;
+    }
+
+    __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+    __asm__("vmv.s.x v31, x0");
+
+    switch (b) {
+    case 4:
+      __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+      __asm__("vfredusum.vs v12, v12, v31");
+      __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+      if (*beta == 0.f) {
+        __asm__("vfmul.vf v12, v12, ft10");
+        __asm__(VSE "v12, (%0)" : : "r"(y));
+      }
+      else {
+        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+        __asm__(FMUL "ft0, ft11, ft0");
+        __asm__("vfmv.s.f v30, ft0");
+        __asm__("vfmacc.vf v30, ft10, v12");
+        __asm__(VSE "v30, (%0)" : : "r"(y));
+      }
+      __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+    case 3:
+      __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+      __asm__("vfredusum.vs v8, v8, v31");
+      __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+      if (*beta == 0.f) {
+        __asm__("vfmul.vf v8, v8, ft10");
+        __asm__(VSE "v8, (%0)" : : "r"(y));
+      }
+      else {
+        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+        __asm__(FMUL "ft0, ft11, ft0");
+        __asm__("vfmv.s.f v30, ft0");
+        __asm__("vfmacc.vf v30, ft10, v8");
+        __asm__(VSE "v30, (%0)" : : "r"(y));
+      }
+      __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+    case 2:
+      __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+      __asm__("vfredusum.vs v4, v4, v31");
+      __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+      if (*beta == 0.f) {
+        __asm__("vfmul.vf v4, v4, ft10");
+        __asm__(VSE "v4, (%0)" : : "r"(y));
+      }
+      else {
+        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+        __asm__(FMUL "ft0, ft11, ft0");
+        __asm__("vfmv.s.f v30, ft0");
+        __asm__("vfmacc.vf v30, ft10, v4");
+        __asm__(VSE "v30, (%0)" : : "r"(y));
+      }
+      __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+    case 1:
+      __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+      __asm__("vfredusum.vs v0, v0, v31");
+      __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+      if (*beta == 0.f) {
+        __asm__("vfmul.vf v0, v0, ft10");
+        __asm__(VSE "v0, (%0)" : : "r"(y));
+      }
+      else {
+        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+        __asm__(FMUL "ft0, ft11, ft0");
+        __asm__("vfmv.s.f v30, ft0");
+        __asm__("vfmacc.vf v30, ft10, v0");
+        __asm__(VSE "v30, (%0)" : : "r"(y));
+      }
+    }
+  } // end cleanup
+  return;
+}
+
+#undef FLT_SIZE // drop the previous kernel's per-type macros
+#undef FLT_LOAD
+#undef FMUL
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+// Per-type macros for the double-precision kernel that follows.
+#define FLT_SIZE 8 // element size in bytes; 8 * FLT_SIZE is used as the vector element width
+#define FLT_LOAD "fld " // scalar load into an f register
+#define FMUL "fmul.d " // scalar multiply
+#define VLE "vle64.v " // vector load used when the stride equals FLT_SIZE
+#define VLSE "vlse64.v " // vector load taking an explicit byte stride operand
+#define VSE "vse64.v " // vector store used when the stride equals FLT_SIZE
+#define VSSE "vsse64.v " // vector store taking an explicit byte stride operand
+
+void bli_ddotxaxpyf_sifive_x280_asm(
+             conj_t           conjat,
+             conj_t           conja,
+             conj_t           conjw,
+             conj_t           conjx,
+             dim_t            m,
+             dim_t            b,
+       const void*   restrict alpha_,
+       const void*   restrict a_, inc_t inca, inc_t lda,
+       const void*   restrict w_, inc_t incw,
+       const void*   restrict x_, inc_t incx,
+       const void*   restrict beta_,
+             void*   restrict y_, inc_t incy,
+             void*   restrict z_, inc_t incz,
+       const cntx_t* restrict cntx
+                             ) {
+  // Fused double-precision dotxaxpyf: every one of the b vectors of a (lda
+  // apart, m elements at stride inca) is dotted with w to update an entry of
+  // y (beta * y + alpha * dot) and scaled by an entry of x to update z
+  // (z + alpha * sum). The conj arguments and cntx are ignored.
+  (void)conjat; (void)conja; (void)conjw; (void)conjx; (void)cntx;
+  const double *restrict alpha = alpha_;
+  const double *restrict beta = beta_;
+  const double *restrict a = a_;
+  const double *restrict w = w_;
+  const double *restrict x = x_;
+  double *restrict y = y_;
+  double *restrict z = z_;
+
+  if (b == 0)
+    return;
+  else if (m == 0 || *alpha == 0.) {
+    // a contributes nothing: z is untouched and y only sees beta (setv when beta == 0, scalv otherwise)
+    if (*beta == 0.)
+        bli_dsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+    else
+        bli_dscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+    return;
+  }
+  // ft10 = alpha and ft11 = beta from here on; all strides become byte offsets.
+  __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+  __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+  inca *= FLT_SIZE;
+  lda *= FLT_SIZE;
+  incw *= FLT_SIZE;
+  incx *= FLT_SIZE;
+  incy *= FLT_SIZE;
+  incz *= FLT_SIZE;
+  inc_t a_bump = 5 * lda; // bytes from one 5-vector panel of a to the next
+  while (b >= 5) {
+    // full panel: 5 vectors of a at once; v0/v4/v8/v12/v16 hold the dot sums
+    const double* restrict w_tmp = w;
+    double* restrict z_tmp = z; // stored through below, so not pointer-to-const
+    const double* restrict a_col = a;
+    size_t avl = m;
+    bool first = true; // first chunk loads x into ft0-ft4 and initializes the sums
+    while (avl) {
+      const double* restrict a_row = a_col;
+      size_t vl;
+      __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+      if (incw == FLT_SIZE)
+        __asm__(VLE "v28, (%0)" : : "r"(w_tmp));
+      else
+        __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+      if (inca == FLT_SIZE) {
+        // a unit stride; x is advanced by the asm, hence the "+r" operand
+        if (first) {
+          __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmul.vf v20, v24, ft0");
+          __asm__("vfmul.vv v0, v24, v28");
+          __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft1, v24");
+          __asm__("vfmul.vv v4, v24, v28");
+          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft2, v24");
+          __asm__("vfmul.vv v8, v24, v28");
+          __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft3, v24");
+          __asm__("vfmul.vv v12, v24, v28");
+          __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+          __asm__("vfmacc.vf v20, ft4, v24");
+          __asm__("vfmul.vv v16, v24, v28");
+          first = false;
+        }
+        else {
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmul.vf v20, v24, ft0");
+          __asm__("vfmacc.vv v0, v24, v28");
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft1, v24");
+          __asm__("vfmacc.vv v4, v24, v28");
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft2, v24");
+          __asm__("vfmacc.vv v8, v24, v28");
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft3, v24");
+          __asm__("vfmacc.vv v12, v24, v28");
+          __asm__(VLE "v24, (%0)" : : "r"(a_row));
+          __asm__("vfmacc.vf v20, ft4, v24");
+          __asm__("vfmacc.vv v16, v24, v28");
+        }
+      } // end a unit stride
+      else {
+        // a non-unit stride: same sequence with strided loads of a
+        if (first) {
+          __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmul.vf v20, v24, ft0");
+          __asm__("vfmul.vv v0, v24, v28");
+          __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft1, v24");
+          __asm__("vfmul.vv v4, v24, v28");
+          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft2, v24");
+          __asm__("vfmul.vv v8, v24, v28");
+          __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft3, v24");
+          __asm__("vfmul.vv v12, v24, v28");
+          __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+          __asm__("vfmacc.vf v20, ft4, v24");
+          __asm__("vfmul.vv v16, v24, v28");
+          first = false;
+        }
+        else {
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmul.vf v20, v24, ft0");
+          __asm__("vfmacc.vv v0, v24, v28");
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft1, v24");
+          __asm__("vfmacc.vv v4, v24, v28");
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft2, v24");
+          __asm__("vfmacc.vv v8, v24, v28");
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+          __asm__("vfmacc.vf v20, ft3, v24");
+          __asm__("vfmacc.vv v12, v24, v28");
+          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+          __asm__("vfmacc.vf v20, ft4, v24");
+          __asm__("vfmacc.vv v16, v24, v28");
+        }
+      } // end a non-unit stride
+      // z chunk += alpha * v20; v24 is reused to hold the z elements
+      if (incz == FLT_SIZE) {
+        __asm__(VLE "v24, (%0)" : : "r"(z_tmp));
+        __asm__("vfmacc.vf v24, ft10, v20");
+        __asm__(VSE "v24, (%0)" : : "r"(z_tmp));
+      } else {
+        __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+        __asm__("vfmacc.vf v24, ft10, v20");
+        __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+      }
+      // step w, z and a past the vl elements just processed, each by its own stride
+      __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+      __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+      __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+      avl -= vl;
+    }
+
+    __asm__("vmv.s.x v31, x0"); // v31[0] = 0 is the initial value of every reduction
+    // y entry 0: reduce v0, then store alpha * sum (beta == 0) or beta * y + alpha * sum
+    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+    __asm__("vfredusum.vs v0, v0, v31");
+    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+    if (*beta == 0.) {
+      __asm__("vfmul.vf v0, v0, ft10");
+      __asm__(VSE "v0, (%0)" : : "r"(y));
+    }
+    else {
+      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+      __asm__(FMUL "ft0, ft11, ft0");
+      __asm__("vfmv.s.f v30, ft0");
+      __asm__("vfmacc.vf v30, ft10, v0");
+      __asm__(VSE "v30, (%0)" : : "r"(y));
+    }
+    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+    // y entry 1 from v4
+    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+    __asm__("vfredusum.vs v4, v4, v31");
+    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+    if (*beta == 0.) {
+      __asm__("vfmul.vf v4, v4, ft10");
+      __asm__(VSE "v4, (%0)" : : "r"(y));
+    }
+    else {
+      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+      __asm__(FMUL "ft0, ft11, ft0");
+      __asm__("vfmv.s.f v30, ft0");
+      __asm__("vfmacc.vf v30, ft10, v4");
+      __asm__(VSE "v30, (%0)" : : "r"(y));
+    }
+    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+    // y entry 2 from v8
+    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+    __asm__("vfredusum.vs v8, v8, v31");
+    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+    if (*beta == 0.) {
+      __asm__("vfmul.vf v8, v8, ft10");
+      __asm__(VSE "v8, (%0)" : : "r"(y));
+    }
+    else {
+      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+      __asm__(FMUL "ft0, ft11, ft0");
+      __asm__("vfmv.s.f v30, ft0");
+      __asm__("vfmacc.vf v30, ft10, v8");
+      __asm__(VSE "v30, (%0)" : : "r"(y));
+    }
+    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+    // y entry 3 from v12
+    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+    __asm__("vfredusum.vs v12, v12, v31");
+    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+    if (*beta == 0.) {
+      __asm__("vfmul.vf v12, v12, ft10");
+      __asm__(VSE "v12, (%0)" : : "r"(y));
+    }
+    else {
+      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+      __asm__(FMUL "ft0, ft11, ft0");
+      __asm__("vfmv.s.f v30, ft0");
+      __asm__("vfmacc.vf v30, ft10, v12");
+      __asm__(VSE "v30, (%0)" : : "r"(y));
+    }
+    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+    // y entry 4 from v16
+    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+    __asm__("vfredusum.vs v16, v16, v31");
+    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+    if (*beta == 0.) {
+      __asm__("vfmul.vf v16, v16, ft10");
+      __asm__(VSE "v16, (%0)" : : "r"(y));
+    }
+    else {
+      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+      __asm__(FMUL "ft0, ft11, ft0");
+      __asm__("vfmv.s.f v30, ft0");
+      __asm__("vfmacc.vf v30, ft10, v16");
+      __asm__(VSE "v30, (%0)" : : "r"(y));
+    }
+    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+    __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+    b -= 5;
+  }
+
+  if (b > 0) { // cleanup: 1 to 4 vectors of a remain
+    const double* restrict w_tmp = w;
+    double* restrict z_tmp = z; // stored through below, so not pointer-to-const
+    const double* restrict a_col;
+    __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); // start at the last vector
+    __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx)); // and at the matching x entry
+    size_t avl = m;
+    bool first = true; // first chunk loads x and initializes the sums
+    while (avl) {
+      const double* restrict a_row = a_col;
+      size_t vl;
+      __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+      if (incw == FLT_SIZE)
+        __asm__(VLE "v28, (%0)" : : "r"(w_tmp));
+      else
+        __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+      __asm__("vmv.v.i v20, 0");
+      if (inca == FLT_SIZE) {
+        // a unit stride; vectors are visited last to first
+        if (first) {
+          switch (b) { // cases fall through on purpose: b, b - 1, ..., 1
+          case 4:
+            __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+            __asm__(VLE "v24, (%0)" : : "r"(a_row));
+            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft3, v24");
+            __asm__("vfmul.vv v12, v24, v28");
+          case 3:
+            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+            __asm__(VLE "v24, (%0)" : : "r"(a_row));
+            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft2, v24");
+            __asm__("vfmul.vv v8, v24, v28");
+          case 2:
+            __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+            __asm__(VLE "v24, (%0)" : : "r"(a_row));
+            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft1, v24");
+            __asm__("vfmul.vv v4, v24, v28");
+          case 1:
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+            __asm__(VLE "v24, (%0)" : : "r"(a_row));
+            __asm__("vfmacc.vf v20, ft0, v24");
+            __asm__("vfmul.vv v0, v24, v28");
+          }
+          first = false;
+        }
+        else {
+          switch (b) { // cases fall through on purpose: b, b - 1, ..., 1
+          case 4:
+            __asm__(VLE "v24, (%0)" : : "r"(a_row));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft3, v24");
+            __asm__("vfmacc.vv v12, v24, v28");
+          case 3:
+            __asm__(VLE "v24, (%0)" : : "r"(a_row));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft2, v24");
+            __asm__("vfmacc.vv v8, v24, v28");
+          case 2:
+            __asm__(VLE "v24, (%0)" : : "r"(a_row));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft1, v24");
+            __asm__("vfmacc.vv v4, v24, v28");
+          case 1:
+            __asm__(VLE "v24, (%0)" : : "r"(a_row));
+            __asm__("vfmacc.vf v20, ft0, v24");
+            __asm__("vfmacc.vv v0, v24, v28");
+          }
+        }
+      } // end a unit stride
+      else {
+        // a non-unit stride: same sequence with strided loads of a
+        if (first) {
+          switch (b) { // cases fall through on purpose: b, b - 1, ..., 1
+          case 4:
+            __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft3, v24");
+            __asm__("vfmul.vv v12, v24, v28");
+          case 3:
+            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft2, v24");
+            __asm__("vfmul.vv v8, v24, v28");
+          case 2:
+            __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft1, v24");
+            __asm__("vfmul.vv v4, v24, v28");
+          case 1:
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+            __asm__("vfmacc.vf v20, ft0, v24");
+            __asm__("vfmul.vv v0, v24, v28");
+          }
+          first = false;
+        }
+        else {
+          switch (b) { // cases fall through on purpose: b, b - 1, ..., 1
+          case 4:
+            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft3, v24");
+            __asm__("vfmacc.vv v12, v24, v28");
+          case 3:
+            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft2, v24");
+            __asm__("vfmacc.vv v8, v24, v28");
+          case 2:
+            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+            __asm__("vfmacc.vf v20, ft1, v24");
+            __asm__("vfmacc.vv v4, v24, v28");
+          case 1:
+            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+            __asm__("vfmacc.vf v20, ft0, v24");
+            __asm__("vfmacc.vv v0, v24, v28");
+          }
+        }
+      } // end a non-unit stride
+      // z chunk += alpha * v20; v24 is reused to hold the z elements
+      if (incz == FLT_SIZE) {
+        __asm__(VLE "v24, (%0)" : : "r"(z_tmp));
+        __asm__("vfmacc.vf v24, ft10, v20");
+        __asm__(VSE "v24, (%0)" : : "r"(z_tmp));
+      } else {
+        __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+        __asm__("vfmacc.vf v24, ft10, v20");
+        __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+      }
+      // step w, z and a past the vl elements just processed, each by its own stride
+      __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+      __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+      __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+      avl -= vl;
+    }
+
+    __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); // y is written last entry first
+    __asm__("vmv.s.x v31, x0"); // v31[0] = 0 is the initial value of every reduction
+    // each case reduces one accumulator into its y entry, then falls through
+    switch (b) {
+    case 4:
+      __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+      __asm__("vfredusum.vs v12, v12, v31");
+      __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+      if (*beta == 0.) {
+        __asm__("vfmul.vf v12, v12, ft10");
+        __asm__(VSE "v12, (%0)" : : "r"(y));
+      }
+      else {
+        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+        __asm__(FMUL "ft0, ft11, ft0");
+        __asm__("vfmv.s.f v30, ft0");
+        __asm__("vfmacc.vf v30, ft10, v12");
+        __asm__(VSE "v30, (%0)" : : "r"(y));
+      }
+      __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+    case 3:
+      __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+      __asm__("vfredusum.vs v8, v8, v31");
+      __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+      if (*beta == 0.) {
+        __asm__("vfmul.vf v8, v8, ft10");
+        __asm__(VSE "v8, (%0)" : : "r"(y));
+      }
+      else {
+        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+        __asm__(FMUL "ft0, ft11, ft0");
+        __asm__("vfmv.s.f v30, ft0");
+        __asm__("vfmacc.vf v30, ft10, v8");
+        __asm__(VSE "v30, (%0)" : : "r"(y));
+      }
+      __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+    case 2:
+      __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+      __asm__("vfredusum.vs v4, v4, v31");
+      __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+      if (*beta == 0.) {
+        __asm__("vfmul.vf v4, v4, ft10");
+        __asm__(VSE "v4, (%0)" : : "r"(y));
+      }
+      else {
+        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+        __asm__(FMUL "ft0, ft11, ft0");
+        __asm__("vfmv.s.f v30, ft0");
+        __asm__("vfmacc.vf v30, ft10, v4");
+        __asm__(VSE "v30, (%0)" : : "r"(y));
+      }
+      __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+    case 1:
+      __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+      __asm__("vfredusum.vs v0, v0, v31");
+      __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+      if (*beta == 0.) {
+        __asm__("vfmul.vf v0, v0, ft10");
+        __asm__(VSE "v0, (%0)" : : "r"(y));
+      }
+      else {
+        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+        __asm__(FMUL "ft0, ft11, ft0");
+        __asm__("vfmv.s.f v30, ft0");
+        __asm__("vfmacc.vf v30, ft10, v0");
+        __asm__(VSE "v30, (%0)" : : "r"(y));
+      }
+    }
+  } // end cleanup
+  return;
+}
+
+#undef FLT_SIZE // drop the double-precision kernel's per-type macros
+#undef FLT_LOAD
+#undef FMUL
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+// Per-type macros for the single-precision complex kernel that follows.
+#define FLT_SIZE 4 // size in bytes of one real component; the kernel scales strides by 2 * FLT_SIZE
+#define FLT_LOAD "flw " // scalar load into an f register
+#define FMUL "fmul.s " // scalar multiply
+#define FMADD "fmadd.s " // scalar fused multiply-add
+#define FNMSUB "fnmsub.s " // scalar fused negated multiply-subtract
+#define FNEG "fneg.s " // scalar negate; the kernel applies it to the imaginary part of x when conjugating
+#define VLSEG2 "vlseg2e32.v " // 2-field segment load used when the stride equals 2 * FLT_SIZE
+#define VLSSEG2 "vlsseg2e32.v " // 2-field segment load taking an explicit byte stride operand
+#define VSSEG2 "vsseg2e32.v " // 2-field segment store, presumably the unit-stride counterpart of VLSEG2
+#define VSSSEG2 "vssseg2e32.v " // 2-field segment store, presumably the strided counterpart of VLSSEG2
+#define VSE "vse32.v " // plain 32-bit element vector store
+
+void bli_cdotxaxpyf_sifive_x280_asm
+     (
+             conj_t           conjat,
+             conj_t           conja,
+             conj_t           conjw,
+             conj_t           conjx,
+             dim_t            m,
+             dim_t            b,
+       const void*   restrict alpha_,
+       const void*   restrict a_, inc_t inca, inc_t lda,
+       const void*   restrict w_, inc_t incw,
+       const void*   restrict x_, inc_t incx,
+       const void*   restrict beta_,
+             void*   restrict y_, inc_t incy,
+             void*   restrict z_, inc_t incz,
+       const cntx_t* restrict cntx
+     )
+{
+    (void)cntx;
+    const scomplex *restrict alpha = alpha_;
+    const scomplex *restrict beta = beta_;
+    const scomplex *restrict a = a_;
+    const scomplex *restrict w = w_;
+    const scomplex *restrict x = x_;
+    scomplex *restrict y = y_;
+    scomplex *restrict z = z_;
+    
+    if (b == 0)
+        return;
+    else if (m == 0 || (alpha->real == 0.f && alpha->imag == 0.f)) {
+        // scale y by beta
+        if (beta->real == 0.f && beta->imag == 0.f)
+            bli_csetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        else
+            bli_cscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        return;
+    }
+
+    // use ft0-ft9 to store 5 entries of x, ft10-ft11 to store alpha,
+    // and fa6-fa7 to store beta
+    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+    __asm__(FLT_LOAD "fa6, (%0)" : : "r"(beta));
+    __asm__(FLT_LOAD "fa7, %1(%0)" : : "r"(beta), "I"(FLT_SIZE));
+    // Reduce to case when A^T is not conjugated, then conjugate
+    // computed product A^T * w if needed.
+    conj_t conjatw = BLIS_NO_CONJUGATE;
+    if (conjat == BLIS_CONJUGATE) {
+        bli_toggle_conj(&conjat);
+        bli_toggle_conj(&conjw);
+        bli_toggle_conj(&conjatw);
+    }
+    conj_t conjax = BLIS_NO_CONJUGATE;
+    if (conja == BLIS_CONJUGATE) {
+        bli_toggle_conj(&conja);
+        bli_toggle_conj(&conjx);
+        bli_toggle_conj(&conjax);
+    }
+    inca *= 2 * FLT_SIZE;
+    lda *= 2 * FLT_SIZE;
+    incw *= 2 * FLT_SIZE;
+    incx *= 2 * FLT_SIZE;
+    incy *= 2 * FLT_SIZE;
+    incz *= 2 * FLT_SIZE;
+    // these are used to bump a and y, resp.
+    inc_t a_bump = 5 * lda;
+    inc_t y_bump = incy - FLT_SIZE;
+    while (b >= 5) {
+        // compute dot product of w with 6 rows of a
+        const scomplex* restrict w_tmp = w;
+        const scomplex* restrict z_tmp = z;
+        const scomplex* restrict a_col = a;
+        size_t avl = m;
+        bool first = true;
+        while (avl) {
+            const scomplex* restrict a_row = a_col;
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+            if (incw == 2 * FLT_SIZE)
+                __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp));
+            else
+                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+            if (inca == 2 * FLT_SIZE) {
+                if (conjw == BLIS_NO_CONJUGATE) {
+                    // a unit stride, conjw = no conj
+                    if (first) {
+                        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+                        vcmul_vv(v0, v2, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                        vcmul_vv(v4, v6, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                        vcmul_vv(v8, v10, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                        vcmul_vv(v12, v14, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+                        vcmul_vv(v16, v18, v24, v26, v28, v30);
+                        first = false;
+                    }
+                    else {
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+                        vcmacc_vv(v0, v2, v24, v26, v28, v30);
+
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                        vcmacc_vv(v4, v6, v24, v26, v28, v30);
+
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                        vcmacc_vv(v8, v10, v24, v26, v28, v30);
+
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                        vcmacc_vv(v12, v14, v24, v26, v28, v30);
+
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+                        vcmacc_vv(v16, v18, v24, v26, v28, v30);
+                    }
+                } // end conjw == BLIS_NO_CONJUGATE
+                else { // conjw == BLIS_CONJUGATE
+                    // a unit stride, conjw = conj
+                    if (first) {
+                        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+                        vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                        vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                        vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                        vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+                        vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+                        first = false;
+                    }
+                    else {
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+                        vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                        vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                        vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                        vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+                        vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+                    }
+                } // end conjw == BLIS_CONJUGATE
+            } // end a unit stride
+            else { // a non-unit stride
+                if (conjw == BLIS_NO_CONJUGATE) {
+                    // a non-unit stride, conjw = no conj
+                    if (first) {
+                        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+                        vcmul_vv(v0, v2, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                        vcmul_vv(v4, v6, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                        vcmul_vv(v8, v10, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                        vcmul_vv(v12, v14, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+                        vcmul_vv(v16, v18, v24, v26, v28, v30);
+                        first = false;
+                    }
+                    else {
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+                        vcmacc_vv(v0, v2, v24, v26, v28, v30);
+
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                        vcmacc_vv(v4, v6, v24, v26, v28, v30);
+
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                        vcmacc_vv(v8, v10, v24, v26, v28, v30);
+
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                        vcmacc_vv(v12, v14, v24, v26, v28, v30);
+
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+                        vcmacc_vv(v16, v18, v24, v26, v28, v30);
+                    }
+                } // end conjw == BLIS_NO_CONJUGATE
+                else { // conjw == BLIS_CONJUGATE
+                    // a non-unit stride, conjw = conj
+                    if (first) {
+                        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+                        vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                        vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                        vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                        vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+                        vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+                        first = false;
+                    }
+                    else {
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+                        vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                        vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                        vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                        vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+                        vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+                    }
+                } // end conjw == BLIS_CONJUGATE
+            } // end a non-unit stride
+
+            if (incz == 2 * FLT_SIZE) {
+                __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp));
+                if (conjax == BLIS_NO_CONJUGATE) {
+                    vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+                }
+                else {
+                    vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+                }
+                __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp));
+            }
+            else {
+                __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+                if (conjax == BLIS_NO_CONJUGATE) {
+                    vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+                }
+                else {
+                    vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+                }
+                __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+            }
+
+            __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+            __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+            avl -= vl;
+        }
+
+        __asm__("vmv.s.x v31, x0");
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v0, v0, v31");
+        __asm__("vfredusum.vs v2, v2, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0.f && beta->imag == 0.f) {
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmul_vf(v28, v29, v0, v2, ft10, ft11);
+          }
+          else {
+            vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11);
+          }
+        }
+        else {
+          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+          __asm__("vfmv.s.f v28, ft0");
+          __asm__("vfmv.s.f v29, ft1");
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmacc_vf(v28, v29, ft10, ft11, v0, v2);
+          }
+          else {
+            vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2);
+          }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v4, v4, v31");
+        __asm__("vfredusum.vs v6, v6, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0.f && beta->imag == 0.f) {
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmul_vf(v28, v29, v4, v6, ft10, ft11);
+          }
+          else {
+            vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11);
+          }
+        }
+        else {
+          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+          __asm__("vfmv.s.f v28, ft0");
+          __asm__("vfmv.s.f v29, ft1");
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmacc_vf(v28, v29, ft10, ft11, v4, v6);
+          }
+          else {
+            vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6);
+          }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v8, v8, v31");
+        __asm__("vfredusum.vs v10, v10, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0.f && beta->imag == 0.f) {
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmul_vf(v28, v29, v8, v10, ft10, ft11);
+          }
+          else {
+            vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11);
+          }
+        }
+        else {
+          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+          __asm__("vfmv.s.f v28, ft0");
+          __asm__("vfmv.s.f v29, ft1");
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmacc_vf(v28, v29, ft10, ft11, v8, v10);
+          }
+          else {
+            vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10);
+          }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v12, v12, v31");
+        __asm__("vfredusum.vs v14, v14, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0.f && beta->imag == 0.f) {
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmul_vf(v28, v29, v12, v14, ft10, ft11);
+          }
+          else {
+            vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11);
+          }
+        }
+        else {
+          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+          __asm__("vfmv.s.f v28, ft0");
+          __asm__("vfmv.s.f v29, ft1");
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmacc_vf(v28, v29, ft10, ft11, v12, v14);
+          }
+          else {
+            vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14);
+          }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v16, v16, v31");
+        __asm__("vfredusum.vs v18, v18, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0.f && beta->imag == 0.f) {
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmul_vf(v28, v29, v16, v18, ft10, ft11);
+          }
+          else {
+            vcmul_vf_conj(v28, v29, v16, v18, ft10, ft11);
+          }
+        }
+        else {
+          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+          __asm__("vfmv.s.f v28, ft0");
+          __asm__("vfmv.s.f v29, ft1");
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmacc_vf(v28, v29, ft10, ft11, v16, v18);
+          }
+          else {
+            vcmacc_vf_conj(v28, v29, ft10, ft11, v16, v18);
+          }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        // a += 5 * lda;
+        __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+        b -= 5;
+    }
+
+    if (b > 0) {
+        // cleanup loop, 0 < b < 5
+        const scomplex* restrict w_tmp = w;
+        const scomplex* restrict z_tmp = z;
+        const scomplex* restrict a_col;
+        __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx));
+        size_t avl = m;
+        bool first = true;
+        while (avl) {
+            const scomplex* restrict a_row = a_col;
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+            if (incw == 2 * FLT_SIZE)
+                __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp));
+            else
+                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+            __asm__("vmv.v.i v20, 0");
+            __asm__("vmv.v.i v22, 0");
+            if (inca == 2 * FLT_SIZE) {
+                if (conjw == BLIS_NO_CONJUGATE) {
+                    // a unit stride, conjw = no conj
+                    if (first) {
+                        switch (b) {
+                        case 4:
+                            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                            vcmul_vv(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                            vcmul_vv(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                            vcmul_vv(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+                            vcmul_vv(v0, v2, v24, v26, v28, v30);
+                        }
+                        first = false;
+                    }
+                    else {
+                        switch (b) {
+                        case 4:
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                            vcmacc_vv(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                            vcmacc_vv(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                            vcmacc_vv(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+                            vcmacc_vv(v0, v2, v24, v26, v28, v30);
+                        }
+                    }
+                } // end conjw == BLIS_NO_CONJUGATE
+                else { // conjw == BLIS_CONJUGATE
+                    // a unit stride, conjw = conj
+                    if (first) {
+                        switch (b) {
+                        case 4:
+                            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                            vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                            vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                            vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+                            vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+                        }
+                        first = false;
+                    }
+                    else {
+                        switch (b) {
+                        case 4:
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                            vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                            vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                            vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+                            vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+                        }
+                    }
+                } // end conjw == BLIS_CONJUGATE
+            } // end a unit stride
+            else { // a non-unit stride
+                if (conjw == BLIS_NO_CONJUGATE) {
+                    // a non-unit stride, conjw = no conj
+                    if (first) {
+                        switch (b) {
+                        case 4:
+                            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                            vcmul_vv(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                            vcmul_vv(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                            vcmul_vv(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+                            vcmul_vv(v0, v2, v24, v26, v28, v30);
+                        }
+                        first = false;
+                    }
+                    else {
+                        switch (b) {
+                        case 4:
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                            vcmacc_vv(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                            vcmacc_vv(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                            vcmacc_vv(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+                            vcmacc_vv(v0, v2, v24, v26, v28, v30);
+                        }
+                    }
+                } // end conjw == BLIS_NO_CONJUGATE
+                else { // conjw == BLIS_CONJUGATE
+                    // a non-unit stride, conjw = conj
+                    if (first) {
+                        switch (b) {
+                        case 4:
+                            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                            vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                            vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                            vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+                            vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+                        }
+                        first = false;
+                    }
+                    else {
+                        switch (b) {
+                        case 4:
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                            vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                            vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                            vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+                            vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+                        }
+                    }
+                } // end conjw == BLIS_CONJUGATE
+            } // end a non-unit stride
+
+            if (incz == 2 * FLT_SIZE) {
+                __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp));
+                if (conjax == BLIS_NO_CONJUGATE) {
+                    vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+                }
+                else {
+                    vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+                }
+                __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp));
+            }
+            else {
+                __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+                if (conjax == BLIS_NO_CONJUGATE) {
+                    vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+                }
+                else {
+                    vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+                }
+                __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+            }
+
+            __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+            __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+            avl -= vl;
+        }
+
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+        y_bump = incy + FLT_SIZE;
+        __asm__("vmv.s.x v31, x0");
+
+        switch (b) {
+        case 4:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v12, v12, v31");
+            __asm__("vfredusum.vs v14, v14, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0.f && beta->imag == 0.f) {
+              if (conjatw == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v12, v14, ft10, ft11);
+              }
+              else {
+                vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11);
+              }
+            }
+            else {
+              __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+              __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+              cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+              __asm__("vfmv.s.f v28, ft0");
+              __asm__("vfmv.s.f v29, ft1");
+              if (conjatw == BLIS_NO_CONJUGATE) {
+                vcmacc_vf(v28, v29, ft10, ft11, v12, v14);
+              }
+              else {
+                vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14);
+              }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+        case 3:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v8, v8, v31");
+            __asm__("vfredusum.vs v10, v10, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0.f && beta->imag == 0.f) {
+              if (conjatw == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v8, v10, ft10, ft11);
+              }
+              else {
+                vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11);
+              }
+            }
+            else {
+              __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+              __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+              cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+              __asm__("vfmv.s.f v28, ft0");
+              __asm__("vfmv.s.f v29, ft1");
+              if (conjatw == BLIS_NO_CONJUGATE) {
+                vcmacc_vf(v28, v29, ft10, ft11, v8, v10);
+              }
+              else {
+                vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10);
+              }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+        case 2:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v4, v4, v31");
+            __asm__("vfredusum.vs v6, v6, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0.f && beta->imag == 0.f) {
+              if (conjatw == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v4, v6, ft10, ft11);
+              }
+              else {
+                vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11);
+              }
+            }
+            else {
+              __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+              __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+              cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+              __asm__("vfmv.s.f v28, ft0");
+              __asm__("vfmv.s.f v29, ft1");
+              if (conjatw == BLIS_NO_CONJUGATE) {
+                vcmacc_vf(v28, v29, ft10, ft11, v4, v6);
+              }
+              else {
+                vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6);
+              }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+        case 1:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v0, v0, v31");
+            __asm__("vfredusum.vs v2, v2, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0.f && beta->imag == 0.f) {
+              if (conjatw == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v0, v2, ft10, ft11);
+              }
+              else {
+                vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11);
+              }
+            }
+            else {
+              __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+              __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+              cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+              __asm__("vfmv.s.f v28, ft0");
+              __asm__("vfmv.s.f v29, ft1");
+              if (conjatw == BLIS_NO_CONJUGATE) {
+                vcmacc_vf(v28, v29, ft10, ft11, v0, v2);
+              }
+              else {
+                vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2);
+              }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+        }
+    }
+    return;
+}
+
+#undef FLT_SIZE // clear the earlier definitions of these helper macros so they can be redefined below
+#undef FLT_LOAD
+#undef FMUL
+#undef FMADD
+#undef FNMSUB
+#undef FNEG
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef VSE
+
+#define FLT_SIZE 8 // bytes per real component; also the byte offset of the second (imaginary) component, and 8 * FLT_SIZE is the element width passed to vsetvli
+#define FLT_LOAD "fld " // scalar double-precision floating-point load
+#define FMUL "fmul.d " // scalar double-precision multiply
+#define FMADD "fmadd.d " // scalar double-precision fused multiply-add
+#define FNMSUB "fnmsub.d " // scalar double-precision fused negated multiply-subtract
+#define FNEG "fneg.d " // scalar double-precision negate; the kernel applies it to the imaginary part of x when conjx is set
+#define VLSEG2 "vlseg2e64.v " // unit-stride two-field segment load of 64-bit elements (real/imag split into separate register groups)
+#define VLSSEG2 "vlsseg2e64.v " // strided two-field segment load of 64-bit elements
+#define VSSEG2 "vsseg2e64.v " // unit-stride two-field segment store of 64-bit elements
+#define VSSSEG2 "vssseg2e64.v " // strided two-field segment store of 64-bit elements
+#define VSE "vse64.v " // unit-stride vector store of 64-bit elements
+
+void bli_zdotxaxpyf_sifive_x280_asm
+     (
+             conj_t           conjat,
+             conj_t           conja,
+             conj_t           conjw,
+             conj_t           conjx,
+             dim_t            m,
+             dim_t            b,
+       const void*   restrict alpha_,
+       const void*   restrict a_, inc_t inca, inc_t lda,
+       const void*   restrict w_, inc_t incw,
+       const void*   restrict x_, inc_t incx,
+       const void*   restrict beta_,
+             void*   restrict y_, inc_t incy,
+             void*   restrict z_, inc_t incz,
+       const cntx_t* restrict cntx
+     )
+{
+    (void)cntx;
+    const dcomplex *restrict alpha = alpha_;
+    const dcomplex *restrict beta = beta_;
+    const dcomplex *restrict a = a_;
+    const dcomplex *restrict w = w_;
+    const dcomplex *restrict x = x_;
+    dcomplex *restrict y = y_;
+    dcomplex *restrict z = z_;
+
+    if (b == 0)
+        return;
+    else if (m == 0 || (alpha->real == 0. && alpha->imag == 0.)) {
+        // scale y by beta
+        if (beta->real == 0. && beta->imag == 0.)
+            bli_zsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        else
+            bli_zscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        return;
+    }
+
+    // use ft0-ft9 to store 5 entries of x, ft10-ft11 to store alpha,
+    // and fa6-fa7 to store beta
+    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+    __asm__(FLT_LOAD "fa6, (%0)" : : "r"(beta));
+    __asm__(FLT_LOAD "fa7, %1(%0)" : : "r"(beta), "I"(FLT_SIZE));
+    // Reduce to case when A^T is not conjugated, then conjugate
+    // computed product A^T * w if needed.
+    conj_t conjatw = BLIS_NO_CONJUGATE;
+    if (conjat == BLIS_CONJUGATE) {
+        bli_toggle_conj(&conjat);
+        bli_toggle_conj(&conjw);
+        bli_toggle_conj(&conjatw);
+    }
+    conj_t conjax = BLIS_NO_CONJUGATE;
+    if (conja == BLIS_CONJUGATE) {
+        bli_toggle_conj(&conja);
+        bli_toggle_conj(&conjx);
+        bli_toggle_conj(&conjax);
+    }
+    inca *= 2 * FLT_SIZE;
+    lda *= 2 * FLT_SIZE;
+    incw *= 2 * FLT_SIZE;
+    incx *= 2 * FLT_SIZE;
+    incy *= 2 * FLT_SIZE;
+    incz *= 2 * FLT_SIZE;
+    // these are used to bump a and y, resp.
+    inc_t a_bump = 5 * lda;
+    inc_t y_bump = incy - FLT_SIZE;
+    while (b >= 5) {
+        // compute dot product of w with 5 rows of a
+        const dcomplex* restrict w_tmp = w;
+        const dcomplex* restrict z_tmp = z;
+        const dcomplex* restrict a_col = a;
+        size_t avl = m;
+        bool first = true;
+        while (avl) {
+            const dcomplex* restrict a_row = a_col;
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+            if (incw == 2 * FLT_SIZE)
+                __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp));
+            else
+                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+            if (inca == 2 * FLT_SIZE) {
+                if (conjw == BLIS_NO_CONJUGATE) {
+                    // a unit stride, conjw = no conj
+                    if (first) {
+                        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+                        vcmul_vv(v0, v2, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                        vcmul_vv(v4, v6, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                        vcmul_vv(v8, v10, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                        vcmul_vv(v12, v14, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+                        vcmul_vv(v16, v18, v24, v26, v28, v30);
+                        first = false;
+                    }
+                    else {
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+                        vcmacc_vv(v0, v2, v24, v26, v28, v30);
+
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                        vcmacc_vv(v4, v6, v24, v26, v28, v30);
+
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                        vcmacc_vv(v8, v10, v24, v26, v28, v30);
+
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                        vcmacc_vv(v12, v14, v24, v26, v28, v30);
+
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+                        vcmacc_vv(v16, v18, v24, v26, v28, v30);
+                    }
+                } // end conjw == BLIS_NO_CONJUGATE
+                else { // conjw == BLIS_CONJUGATE
+                    // a unit stride, conjw = conj
+                    if (first) {
+                        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+                        vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                        vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                        vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                        vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+                        vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+                        first = false;
+                    }
+                    else {
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+                        vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                        vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                        vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                        vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+                        vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+                    }
+                } // end conjw == BLIS_CONJUGATE
+            } // end a unit stride
+            else { // a non-unit stride
+                if (conjw == BLIS_NO_CONJUGATE) {
+                    // a non-unit stride, conjw = no conj
+                    if (first) {
+                        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+                        vcmul_vv(v0, v2, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                        vcmul_vv(v4, v6, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                        vcmul_vv(v8, v10, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                        vcmul_vv(v12, v14, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+                        vcmul_vv(v16, v18, v24, v26, v28, v30);
+                        first = false;
+                    }
+                    else {
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+                        vcmacc_vv(v0, v2, v24, v26, v28, v30);
+
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                        vcmacc_vv(v4, v6, v24, v26, v28, v30);
+
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                        vcmacc_vv(v8, v10, v24, v26, v28, v30);
+
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                        vcmacc_vv(v12, v14, v24, v26, v28, v30);
+
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+                        vcmacc_vv(v16, v18, v24, v26, v28, v30);
+                    }
+                } // end conjw == BLIS_NO_CONJUGATE
+                else { // conjw == BLIS_CONJUGATE
+                    // a non-unit stride, conjw = conj
+                    if (first) {
+                        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+                        vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                        vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                        vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                        vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+
+                        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                        __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+                        vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+                        first = false;
+                    }
+                    else {
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+                        vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                        vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                        vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                        vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+                        vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+                    }
+                } // end conjw == BLIS_CONJUGATE
+            } // end a non-unit stride
+
+            if (incz == 2 * FLT_SIZE) {
+                __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp));
+                if (conjax == BLIS_NO_CONJUGATE) {
+                    vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+                }
+                else {
+                    vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+                }
+                __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp));
+            }
+            else {
+                __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+                if (conjax == BLIS_NO_CONJUGATE) {
+                    vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+                }
+                else {
+                    vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+                }
+                __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+            }
+
+            __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+            __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+            avl -= vl;
+        }
+
+        __asm__("vmv.s.x v31, x0");
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v0, v0, v31");
+        __asm__("vfredusum.vs v2, v2, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0. && beta->imag == 0.) {
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmul_vf(v28, v29, v0, v2, ft10, ft11);
+          }
+          else {
+            vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11);
+          }
+        }
+        else {
+          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+          __asm__("vfmv.s.f v28, ft0");
+          __asm__("vfmv.s.f v29, ft1");
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmacc_vf(v28, v29, ft10, ft11, v0, v2);
+          }
+          else {
+            vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2);
+          }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v4, v4, v31");
+        __asm__("vfredusum.vs v6, v6, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0. && beta->imag == 0.) {
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmul_vf(v28, v29, v4, v6, ft10, ft11);
+          }
+          else {
+            vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11);
+          }
+        }
+        else {
+          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+          __asm__("vfmv.s.f v28, ft0");
+          __asm__("vfmv.s.f v29, ft1");
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmacc_vf(v28, v29, ft10, ft11, v4, v6);
+          }
+          else {
+            vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6);
+          }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v8, v8, v31");
+        __asm__("vfredusum.vs v10, v10, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0. && beta->imag == 0.) {
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmul_vf(v28, v29, v8, v10, ft10, ft11);
+          }
+          else {
+            vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11);
+          }
+        }
+        else {
+          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+          __asm__("vfmv.s.f v28, ft0");
+          __asm__("vfmv.s.f v29, ft1");
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmacc_vf(v28, v29, ft10, ft11, v8, v10);
+          }
+          else {
+            vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10);
+          }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v12, v12, v31");
+        __asm__("vfredusum.vs v14, v14, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0. && beta->imag == 0.) {
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmul_vf(v28, v29, v12, v14, ft10, ft11);
+          }
+          else {
+            vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11);
+          }
+        }
+        else {
+          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+          __asm__("vfmv.s.f v28, ft0");
+          __asm__("vfmv.s.f v29, ft1");
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmacc_vf(v28, v29, ft10, ft11, v12, v14);
+          }
+          else {
+            vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14);
+          }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v16, v16, v31");
+        __asm__("vfredusum.vs v18, v18, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0. && beta->imag == 0.) {
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmul_vf(v28, v29, v16, v18, ft10, ft11);
+          }
+          else {
+            vcmul_vf_conj(v28, v29, v16, v18, ft10, ft11);
+          }
+        }
+        else {
+          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+          __asm__("vfmv.s.f v28, ft0");
+          __asm__("vfmv.s.f v29, ft1");
+          if (conjatw == BLIS_NO_CONJUGATE) {
+            vcmacc_vf(v28, v29, ft10, ft11, v16, v18);
+          }
+          else {
+            vcmacc_vf_conj(v28, v29, ft10, ft11, v16, v18);
+          }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        // a += 5 * lda;
+        __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+        b -= 5;
+    }
+
+    if (b > 0) {
+        // cleanup loop, 0 < b < 5
+        const dcomplex* restrict w_tmp = w;
+        const dcomplex* restrict z_tmp = z;
+        const dcomplex* restrict a_col;
+        __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx));
+        size_t avl = m;
+        bool first = true;
+        while (avl) {
+            const dcomplex* restrict a_row = a_col;
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+            if (incw == 2 * FLT_SIZE)
+                __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp));
+            else
+                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+            __asm__("vmv.v.i v20, 0");
+            __asm__("vmv.v.i v22, 0");
+            if (inca == 2 * FLT_SIZE) {
+                if (conjw == BLIS_NO_CONJUGATE) {
+                    // a unit stride, conjw = no conj
+                    if (first) {
+                        switch (b) {
+                        case 4:
+                            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                            vcmul_vv(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                            vcmul_vv(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                            vcmul_vv(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+                            vcmul_vv(v0, v2, v24, v26, v28, v30);
+                        }
+                        first = false;
+                    }
+                    else {
+                        switch (b) {
+                        case 4:
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                            vcmacc_vv(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                            vcmacc_vv(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                            vcmacc_vv(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+                            vcmacc_vv(v0, v2, v24, v26, v28, v30);
+                        }
+                    }
+                } // end conjw == BLIS_NO_CONJUGATE
+                else { // conjw == BLIS_CONJUGATE
+                    // a unit stride, conjw = conj
+                    if (first) {
+                        switch (b) {
+                        case 4:
+                            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                            vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                            vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                            vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+                            vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+                        }
+                        first = false;
+                    }
+                    else {
+                        switch (b) {
+                        case 4:
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                            vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                            vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                            vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+                            vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+                        }
+                    }
+                } // end conjw == BLIS_CONJUGATE
+            } // end a unit stride
+            else { // a non-unit stride
+                if (conjw == BLIS_NO_CONJUGATE) {
+                    // a non-unit stride, conjw = no conj
+                    if (first) {
+                        switch (b) {
+                        case 4:
+                            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                            vcmul_vv(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                            vcmul_vv(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                            vcmul_vv(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+                            vcmul_vv(v0, v2, v24, v26, v28, v30);
+                        }
+                        first = false;
+                    }
+                    else {
+                        switch (b) {
+                        case 4:
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                            vcmacc_vv(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                            vcmacc_vv(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                            vcmacc_vv(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+                            vcmacc_vv(v0, v2, v24, v26, v28, v30);
+                        }
+                    }
+                } // end conjw == BLIS_NO_CONJUGATE
+                else { // conjw == BLIS_CONJUGATE
+                    // a non-unit stride, conjw = conj
+                    if (first) {
+                        switch (b) {
+                        case 4:
+                            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                            vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                            vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                            vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+                            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+                            vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+                        }
+                        first = false;
+                    }
+                    else {
+                        switch (b) {
+                        case 4:
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+                            vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+                            vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+                            vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+                            vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+                        }
+                    }
+                } // end conjw == BLIS_CONJUGATE
+            } // end a non-unit stride
+
+            if (incz == 2 * FLT_SIZE) {
+                __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp));
+                if (conjax == BLIS_NO_CONJUGATE) {
+                    vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+                }
+                else {
+                    vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+                }
+                __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp));
+            }
+            else {
+                __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+                if (conjax == BLIS_NO_CONJUGATE) {
+                    vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+                }
+                else {
+                    vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+                }
+                __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+            }
+
+            __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+            __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+            avl -= vl;
+        }
+
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+        y_bump = incy + FLT_SIZE;
+        __asm__("vmv.s.x v31, x0");
+
+        switch (b) {
+        case 4:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v12, v12, v31");
+            __asm__("vfredusum.vs v14, v14, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0. && beta->imag == 0.) {
+              if (conjatw == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v12, v14, ft10, ft11);
+              }
+              else {
+                vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11);
+              }
+            }
+            else {
+              __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+              __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+              cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+              __asm__("vfmv.s.f v28, ft0");
+              __asm__("vfmv.s.f v29, ft1");
+              if (conjatw == BLIS_NO_CONJUGATE) {
+                vcmacc_vf(v28, v29, ft10, ft11, v12, v14);
+              }
+              else {
+                vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14);
+              }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+        case 3:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v8, v8, v31");
+            __asm__("vfredusum.vs v10, v10, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0. && beta->imag == 0.) {
+              if (conjatw == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v8, v10, ft10, ft11);
+              }
+              else {
+                vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11);
+              }
+            }
+            else {
+              __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+              __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+              cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+              __asm__("vfmv.s.f v28, ft0");
+              __asm__("vfmv.s.f v29, ft1");
+              if (conjatw == BLIS_NO_CONJUGATE) {
+                vcmacc_vf(v28, v29, ft10, ft11, v8, v10);
+              }
+              else {
+                vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10);
+              }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+        case 2:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v4, v4, v31");
+            __asm__("vfredusum.vs v6, v6, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0. && beta->imag == 0.) {
+              if (conjatw == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v4, v6, ft10, ft11);
+              }
+              else {
+                vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11);
+              }
+            }
+            else {
+              __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+              __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+              cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+              __asm__("vfmv.s.f v28, ft0");
+              __asm__("vfmv.s.f v29, ft1");
+              if (conjatw == BLIS_NO_CONJUGATE) {
+                vcmacc_vf(v28, v29, ft10, ft11, v4, v6);
+              }
+              else {
+                vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6);
+              }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+        case 1:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v0, v0, v31");
+            __asm__("vfredusum.vs v2, v2, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0. && beta->imag == 0.) {
+              if (conjatw == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v0, v2, ft10, ft11);
+              }
+              else {
+                vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11);
+              }
+            }
+            else {
+              __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+              __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+              cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+              __asm__("vfmv.s.f v28, ft0");
+              __asm__("vfmv.s.f v29, ft1");
+              if (conjatw == BLIS_NO_CONJUGATE) {
+                vcmacc_vf(v28, v29, ft10, ft11, v0, v2);
+              }
+              else {
+                vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2);
+              }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+        }
+    }
+    return;
+}
diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c
new file mode 100644
index 000000000..5ac2d4166
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c
@@ -0,0 +1,2645 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "../riscv_cmul_macros_asm.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define FMUL "fmul.s "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_sdotxf_sifive_x280_asm(
+              conj_t           conjat, // unused
+              conj_t           conjx, // unused
+              dim_t            m, // length of x / row length of a
+              dim_t            b, // number of rows of a / elements of y
+        const void*   restrict alpha_,
+        const void*   restrict a_, inc_t inca, inc_t lda,
+        const void*   restrict x_, inc_t incx,
+        const void*   restrict beta_,
+              void*   restrict y_, inc_t incy,
+        const cntx_t* restrict cntx // unused
+        ) {
+    // y := beta * y + alpha * a * x, with a viewed as a b x m row-major
+    // matrix (i.e. rsa = lda, csa = inca). We process 6 elements of y per
+    // iteration, loading/storing directly through y. a points to the 6 x m
+    // block of a needed this iteration; each 6 x m block is broken into
+    // 6 x vl blocks. a_col points to the current 6 x vl block and x_tmp to the
+    // matching part of x; a_row steps through the rows of this 6 x vl block.
+    (void)conjat;
+    (void)conjx;
+    (void)cntx;
+    const float* restrict alpha = alpha_;
+    const float* restrict a = a_;
+    const float* restrict x = x_;
+    const float* restrict beta = beta_;
+    float* restrict y = y_;
+
+    if (b == 0)
+        return;
+    else if (m == 0 || *alpha == 0.f) {
+        // scale y by beta; beta == 0 uses setv instead of scalv
+        if (*beta == 0.f)
+            bli_ssetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        else
+            bli_sscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        return;
+    }
+
+    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); // ft10 = alpha
+    __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); // ft11 = beta
+    inca *= FLT_SIZE; // all strides are in bytes from here on
+    lda *= FLT_SIZE;
+    incx *= FLT_SIZE;
+    incy *= FLT_SIZE;
+    inc_t a_bump = 6 * lda; // to bump a down 6 rows
+
+    while (b >= 6) {
+        // compute dot product of x with 6 rows of a
+        const float* restrict x_tmp = x;
+        const float* restrict a_col = a;
+        size_t avl = m; // elements of x still to be consumed
+        bool first = true; // accumulators v0..v20 not initialized yet
+        while (avl) {
+            const float* restrict a_row = a_col;
+            size_t vl; // strip length; "tu" below keeps accumulator lanes >= vl intact
+            __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+            if (incx == FLT_SIZE)
+                __asm__(VLE "v28, (%0)" : : "r"(x_tmp)); // v28 = current strip of x
+            else
+                __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+            if (inca == FLT_SIZE) {
+                // a unit stride
+                if (first) { // first strip: accumulator k = row k * x
+                    __asm__(VLE "v0, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v0, v0, v28");
+                    __asm__(VLE "v4, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v4, v4, v28");
+                    __asm__(VLE "v8, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v8, v8, v28");
+                    __asm__(VLE "v12, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v12, v12, v28");
+                    __asm__(VLE "v16, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v16, v16, v28");
+                    __asm__(VLE "v20, (%0)" : : "r"(a_row));
+                    __asm__("vfmul.vv v20, v20, v28");
+                    first = false;
+                }
+                else { // later strips: accumulator k += row k * x, v24 is scratch
+                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v0, v24, v28");
+                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v4, v24, v28");
+                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v8, v24, v28");
+                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v12, v24, v28");
+                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v16, v24, v28");
+                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                    __asm__("vfmacc.vv v20, v24, v28");
+                }
+            } // end a unit stride
+            else {
+                // a non-unit stride
+                if (first) { // first strip: accumulator k = row k * x
+                    __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v0, v0, v28");
+                    __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v4, v4, v28");
+                    __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v8, v8, v28");
+                    __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v12, v12, v28");
+                    __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v16, v16, v28");
+                    __asm__(VLSE "v20, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("vfmul.vv v20, v20, v28");
+                    first = false;
+                }
+                else { // later strips: accumulator k += row k * x, v24 is scratch
+                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v0, v24, v28");
+                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v4, v24, v28");
+                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v8, v24, v28");
+                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v12, v24, v28");
+                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v16, v24, v28");
+                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("vfmacc.vv v20, v24, v28");
+                }
+            } // end a non-unit stride
+            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); // advance x by vl elements
+            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); // advance a by vl columns
+            avl -= vl;
+        }
+
+        __asm__("vmv.s.x v31, x0"); // v31[0] = 0: scalar seed for the reductions
+
+        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v0, v0, v31"); // v0[0] = dot(row 0, x)
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (*beta == 0.f) { // y = alpha * dot; y is never loaded
+            __asm__("vfmul.vf v0, v0, ft10");
+            __asm__(VSE "v0, (%0)" : : "r"(y));
+        }
+        else { // y = beta * y + alpha * dot
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+            __asm__(FMUL "ft0, ft11, ft0");
+            __asm__("vfmv.s.f v30, ft0");
+            __asm__("vfmacc.vf v30, ft10, v0");
+            __asm__(VSE "v30, (%0)" : : "r"(y));
+        }
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); // next element of y
+
+        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v4, v4, v31"); // v4[0] = dot(row 1, x)
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (*beta == 0.f) {
+            __asm__("vfmul.vf v4, v4, ft10");
+            __asm__(VSE "v4, (%0)" : : "r"(y));
+        }
+        else {
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+            __asm__(FMUL "ft0, ft11, ft0");
+            __asm__("vfmv.s.f v30, ft0");
+            __asm__("vfmacc.vf v30, ft10, v4");
+            __asm__(VSE "v30, (%0)" : : "r"(y));
+        }
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v8, v8, v31"); // v8[0] = dot(row 2, x)
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (*beta == 0.f) {
+            __asm__("vfmul.vf v8, v8, ft10");
+            __asm__(VSE "v8, (%0)" : : "r"(y));
+        }
+        else {
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+            __asm__(FMUL "ft0, ft11, ft0");
+            __asm__("vfmv.s.f v30, ft0");
+            __asm__("vfmacc.vf v30, ft10, v8");
+            __asm__(VSE "v30, (%0)" : : "r"(y));
+        }
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v12, v12, v31"); // v12[0] = dot(row 3, x)
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (*beta == 0.f) {
+            __asm__("vfmul.vf v12, v12, ft10");
+            __asm__(VSE "v12, (%0)" : : "r"(y));
+        }
+        else {
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+            __asm__(FMUL "ft0, ft11, ft0");
+            __asm__("vfmv.s.f v30, ft0");
+            __asm__("vfmacc.vf v30, ft10, v12");
+            __asm__(VSE "v30, (%0)" : : "r"(y));
+        }
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v16, v16, v31"); // v16[0] = dot(row 4, x)
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (*beta == 0.f) {
+            __asm__("vfmul.vf v16, v16, ft10");
+            __asm__(VSE "v16, (%0)" : : "r"(y));
+        }
+        else {
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+            __asm__(FMUL "ft0, ft11, ft0");
+            __asm__("vfmv.s.f v30, ft0");
+            __asm__("vfmacc.vf v30, ft10, v16");
+            __asm__(VSE "v30, (%0)" : : "r"(y));
+        }
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v20, v20, v31"); // v20[0] = dot(row 5, x)
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (*beta == 0.f) {
+            __asm__("vfmul.vf v20, v20, ft10");
+            __asm__(VSE "v20, (%0)" : : "r"(y));
+        }
+        else {
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+            __asm__(FMUL "ft0, ft11, ft0");
+            __asm__("vfmv.s.f v30, ft0");
+            __asm__("vfmacc.vf v30, ft10, v20");
+            __asm__(VSE "v30, (%0)" : : "r"(y));
+        }
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+        // a += 6 * lda;
+        __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+        b -= 6;
+    }
+
+    if (b > 0) {
+        // compute dot product of x with remaining < 6 rows of a
+        const float* restrict x_tmp = x;
+        // a_col will move along the last row of a!
+        const float* restrict a_col;
+        __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+        size_t avl = m; // elements of x still to be consumed
+        bool first = true; // accumulators not initialized yet
+        while (avl) {
+            const float* restrict a_row = a_col;
+            size_t vl; // strip length; "tu" below keeps accumulator lanes >= vl intact
+            __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+            if (incx == FLT_SIZE)
+                __asm__(VLE "v28, (%0)" : : "r"(x_tmp)); // v28 = current strip of x
+            else
+                __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+            if (inca == FLT_SIZE) {
+                // a unit stride
+                if (first) {
+                    switch (b) { // no breaks: entry at case b falls through to case 1
+                    case 5:
+                        __asm__(VLE "v16, (%0)" : : "r"(a_row));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); // up one row
+                        __asm__("vfmul.vv v16, v16, v28");
+                    case 4:
+                        __asm__(VLE "v12, (%0)" : : "r"(a_row));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmul.vv v12, v12, v28");
+                    case 3:
+                        __asm__(VLE "v8, (%0)" : : "r"(a_row));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmul.vv v8, v8, v28");
+                    case 2:
+                        __asm__(VLE "v4, (%0)" : : "r"(a_row));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmul.vv v4, v4, v28");
+                    case 1:
+                        __asm__(VLE "v0, (%0)" : : "r"(a_row));
+                        __asm__("vfmul.vv v0, v0, v28");
+                    }
+                    first = false;
+                }
+                else {
+                    switch (b) { // no breaks: entry at case b falls through to case 1
+                    case 5:
+                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmacc.vv v16, v24, v28");
+                    case 4:
+                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmacc.vv v12, v24, v28");
+                    case 3:
+                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmacc.vv v8, v24, v28");
+                    case 2:
+                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmacc.vv v4, v24, v28");
+                    case 1:
+                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                        __asm__("vfmacc.vv v0, v24, v28");
+                    }
+                }
+            } // end a unit stride
+            else {
+                // a non-unit stride
+                if (first) {
+                    switch (b) { // no breaks: entry at case b falls through to case 1
+                    case 5:
+                        __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmul.vv v16, v16, v28");
+                    case 4:
+                        __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmul.vv v12, v12, v28");
+                    case 3:
+                        __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmul.vv v8, v8, v28");
+                    case 2:
+                        __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmul.vv v4, v4, v28");
+                    case 1:
+                        __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("vfmul.vv v0, v0, v28");
+                    }
+                    first = false;
+                }
+                else {
+                    switch (b) { // no breaks: entry at case b falls through to case 1
+                    case 5:
+                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmacc.vv v16, v24, v28");
+                    case 4:
+                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmacc.vv v12, v24, v28");
+                    case 3:
+                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmacc.vv v8, v24, v28");
+                    case 2:
+                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmacc.vv v4, v24, v28");
+                    case 1:
+                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("vfmacc.vv v0, v24, v28");
+                    }
+                }
+            } // end a non-unit stride
+            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); // advance x by vl elements
+            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); // advance a by vl columns
+            avl -= vl;
+        }
+
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); // last remaining element of y
+        __asm__("vmv.s.x v31, x0"); // v31[0] = 0: scalar seed for the reductions
+        switch (b) { // no breaks: stores y[b-1] first, then walks back to y[0]
+        case 5:
+            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v16, v16, v31"); // v16[0] = dot(row 4, x)
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (*beta == 0.f) { // y = alpha * dot; y is never loaded
+                __asm__("vfmul.vf v16, v16, ft10");
+                __asm__(VSE "v16, (%0)" : : "r"(y));
+            }
+            else { // y = beta * y + alpha * dot
+                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+                __asm__(FMUL "ft0, ft11, ft0");
+                __asm__("vfmv.s.f v30, ft0");
+                __asm__("vfmacc.vf v30, ft10, v16");
+                __asm__(VSE "v30, (%0)" : : "r"(y));
+            }
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); // previous element of y
+        case 4:
+            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v12, v12, v31"); // v12[0] = dot(row 3, x)
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (*beta == 0.f) {
+                __asm__("vfmul.vf v12, v12, ft10");
+                __asm__(VSE "v12, (%0)" : : "r"(y));
+            }
+            else {
+                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+                __asm__(FMUL "ft0, ft11, ft0");
+                __asm__("vfmv.s.f v30, ft0");
+                __asm__("vfmacc.vf v30, ft10, v12");
+                __asm__(VSE "v30, (%0)" : : "r"(y));
+            }
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+        case 3:
+            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v8, v8, v31"); // v8[0] = dot(row 2, x)
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (*beta == 0.f) {
+                __asm__("vfmul.vf v8, v8, ft10");
+                __asm__(VSE "v8, (%0)" : : "r"(y));
+            }
+            else {
+                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+                __asm__(FMUL "ft0, ft11, ft0");
+                __asm__("vfmv.s.f v30, ft0");
+                __asm__("vfmacc.vf v30, ft10, v8");
+                __asm__(VSE "v30, (%0)" : : "r"(y));
+            }
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+        case 2:
+            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v4, v4, v31"); // v4[0] = dot(row 1, x)
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (*beta == 0.f) {
+                __asm__("vfmul.vf v4, v4, ft10");
+                __asm__(VSE "v4, (%0)" : : "r"(y));
+            }
+            else {
+                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+                __asm__(FMUL "ft0, ft11, ft0");
+                __asm__("vfmv.s.f v30, ft0");
+                __asm__("vfmacc.vf v30, ft10, v4");
+                __asm__(VSE "v30, (%0)" : : "r"(y));
+            }
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+        case 1:
+            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v0, v0, v31"); // v0[0] = dot(row 0, x)
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (*beta == 0.f) {
+                __asm__("vfmul.vf v0, v0, ft10");
+                __asm__(VSE "v0, (%0)" : : "r"(y));
+            }
+            else {
+                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+                __asm__(FMUL "ft0, ft11, ft0");
+                __asm__("vfmv.s.f v30, ft0");
+                __asm__("vfmacc.vf v30, ft10, v0");
+                __asm__(VSE "v30, (%0)" : : "r"(y));
+            }
+        }
+    } // end cleanup
+    return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FMUL
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define FMUL "fmul.d "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_ddotxf_sifive_x280_asm(
+              conj_t           conjat,
+              conj_t           conjx,
+              dim_t            m,
+              dim_t            b,
+        const void*   restrict alpha_,
+        const void*   restrict a_, inc_t inca, inc_t lda,
+        const void*   restrict x_, inc_t incx,
+        const void*   restrict beta_,
+              void*   restrict y_, inc_t incy,
+        const cntx_t* restrict cntx
+        ) {
+    // think of a as b x m row major matrix (i.e. rsa = lda, csa = inca)
+    // we process 6 elements of y per iteration, using y_tmp to load/store from
+    // y a points to the 6 x m block of a needed this iteration each 6 x m block
+    // is broken into 6 x vl blocks a_col points to the current 6 x vl block, we
+    // use x_tmp to load from x a_row is used to load each of the 6 rows of this
+    // 6 x vl block
+    (void)conjat;
+    (void)conjx;
+    (void)cntx;
+    const double* restrict alpha = alpha_;
+    const double* restrict a = a_;
+    const double* restrict x = x_;
+    const double* restrict beta = beta_;
+    double* restrict y = y_;
+
+    if (b == 0)
+        return;
+    else if (m == 0 || *alpha == 0.) {
+        // scale y by beta
+        if (*beta == 0.)
+            bli_dsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        else
+            bli_dscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        return;
+    }
+
+    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+    __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+    inca *= FLT_SIZE;
+    lda *= FLT_SIZE;
+    incx *= FLT_SIZE;
+    incy *= FLT_SIZE;
+    inc_t a_bump = 6 * lda; // to bump a down 6 rows
+
+    while (b >= 6) {
+        // compute dot product of x with 6 rows of a
+        const double* restrict x_tmp = x;
+        const double* restrict a_col = a;
+        size_t avl = m;
+        bool first = true;
+        while (avl) {
+            const double* restrict a_row = a_col;
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+            if (incx == FLT_SIZE)
+                __asm__(VLE "v28, (%0)" : : "r"(x_tmp));
+            else
+                __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+            if (inca == FLT_SIZE) {
+                // a unit stride
+                if (first) {
+                    __asm__(VLE "v0, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v0, v0, v28");
+                    __asm__(VLE "v4, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v4, v4, v28");
+                    __asm__(VLE "v8, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v8, v8, v28");
+                    __asm__(VLE "v12, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v12, v12, v28");
+                    __asm__(VLE "v16, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v16, v16, v28");
+                    __asm__(VLE "v20, (%0)" : : "r"(a_row));
+                    __asm__("vfmul.vv v20, v20, v28");
+                    first = false;
+                }
+                else {
+                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v0, v24, v28");
+                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v4, v24, v28");
+                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v8, v24, v28");
+                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v12, v24, v28");
+                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v16, v24, v28");
+                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                    __asm__("vfmacc.vv v20, v24, v28");
+                }
+            } // end a unit stride
+            else {
+                // a non-unit stride
+                if (first) {
+                    __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v0, v0, v28");
+                    __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v4, v4, v28");
+                    __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v8, v8, v28");
+                    __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v12, v12, v28");
+                    __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmul.vv v16, v16, v28");
+                    __asm__(VLSE "v20, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("vfmul.vv v20, v20, v28");
+                    first = false;
+                }
+                else {
+                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v0, v24, v28");
+                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v4, v24, v28");
+                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v8, v24, v28");
+                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v12, v24, v28");
+                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                    __asm__("vfmacc.vv v16, v24, v28");
+                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                    __asm__("vfmacc.vv v20, v24, v28");
+                }
+            } // end a non-unit stride
+            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+            avl -= vl;
+        }
+
+        __asm__("vmv.s.x v31, x0");
+
+        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v0, v0, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (*beta == 0.) {
+            __asm__("vfmul.vf v0, v0, ft10");
+            __asm__(VSE "v0, (%0)" : : "r"(y));
+        }
+        else {
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+            __asm__(FMUL "ft0, ft11, ft0");
+            __asm__("vfmv.s.f v30, ft0");
+            __asm__("vfmacc.vf v30, ft10, v0");
+            __asm__(VSE "v30, (%0)" : : "r"(y));
+        }
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v4, v4, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (*beta == 0.) {
+            __asm__("vfmul.vf v4, v4, ft10");
+            __asm__(VSE "v4, (%0)" : : "r"(y));
+        }
+        else {
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+            __asm__(FMUL "ft0, ft11, ft0");
+            __asm__("vfmv.s.f v30, ft0");
+            __asm__("vfmacc.vf v30, ft10, v4");
+            __asm__(VSE "v30, (%0)" : : "r"(y));
+        }
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v8, v8, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (*beta == 0.) {
+            __asm__("vfmul.vf v8, v8, ft10");
+            __asm__(VSE "v8, (%0)" : : "r"(y));
+        }
+        else {
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+            __asm__(FMUL "ft0, ft11, ft0");
+            __asm__("vfmv.s.f v30, ft0");
+            __asm__("vfmacc.vf v30, ft10, v8");
+            __asm__(VSE "v30, (%0)" : : "r"(y));
+        }
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v12, v12, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (*beta == 0.) {
+            __asm__("vfmul.vf v12, v12, ft10");
+            __asm__(VSE "v12, (%0)" : : "r"(y));
+        }
+        else {
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+            __asm__(FMUL "ft0, ft11, ft0");
+            __asm__("vfmv.s.f v30, ft0");
+            __asm__("vfmacc.vf v30, ft10, v12");
+            __asm__(VSE "v30, (%0)" : : "r"(y));
+        }
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v16, v16, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (*beta == 0.) {
+            __asm__("vfmul.vf v16, v16, ft10");
+            __asm__(VSE "v16, (%0)" : : "r"(y));
+        }
+        else {
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+            __asm__(FMUL "ft0, ft11, ft0");
+            __asm__("vfmv.s.f v30, ft0");
+            __asm__("vfmacc.vf v30, ft10, v16");
+            __asm__(VSE "v30, (%0)" : : "r"(y));
+        }
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v20, v20, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (*beta == 0.) {
+            __asm__("vfmul.vf v20, v20, ft10");
+            __asm__(VSE "v20, (%0)" : : "r"(y));
+        }
+        else {
+            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+            __asm__(FMUL "ft0, ft11, ft0");
+            __asm__("vfmv.s.f v30, ft0");
+            __asm__("vfmacc.vf v30, ft10, v20");
+            __asm__(VSE "v30, (%0)" : : "r"(y));
+        }
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+        // a += 6 * lda;
+        __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+        b -= 6;
+    }
+
+    if (b > 0) {
+        // compute dot product of x with remaining < 6 rows of a
+        const double* restrict x_tmp = x;
+        // a_col will move along the last row of a!
+        const double* restrict a_col;
+        __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+        size_t avl = m;
+        bool first = true;
+        while (avl) {
+            const double* restrict a_row = a_col;
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+            if (incx == FLT_SIZE)
+                __asm__(VLE "v28, (%0)" : : "r"(x_tmp));
+            else
+                __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+            if (inca == FLT_SIZE) {
+                // a unit stride
+                if (first) {
+                    switch (b) {
+                    case 5:
+                        __asm__(VLE "v16, (%0)" : : "r"(a_row));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmul.vv v16, v16, v28");
+                    case 4:
+                        __asm__(VLE "v12, (%0)" : : "r"(a_row));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmul.vv v12, v12, v28");
+                    case 3:
+                        __asm__(VLE "v8, (%0)" : : "r"(a_row));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmul.vv v8, v8, v28");
+                    case 2:
+                        __asm__(VLE "v4, (%0)" : : "r"(a_row));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmul.vv v4, v4, v28");
+                    case 1:
+                        __asm__(VLE "v0, (%0)" : : "r"(a_row));
+                        __asm__("vfmul.vv v0, v0, v28");
+                    }
+                    first = false;
+                }
+                else {
+                    switch (b) {
+                    case 5:
+                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmacc.vv v16, v24, v28");
+                    case 4:
+                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmacc.vv v12, v24, v28");
+                    case 3:
+                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmacc.vv v8, v24, v28");
+                    case 2:
+                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmacc.vv v4, v24, v28");
+                    case 1:
+                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
+                        __asm__("vfmacc.vv v0, v24, v28");
+                    }
+                }
+            } // end a unit stride
+            else {
+                // a non-unit stride
+                if (first) {
+                    switch (b) {
+                    case 5:
+                        __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmul.vv v16, v16, v28");
+                    case 4:
+                        __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmul.vv v12, v12, v28");
+                    case 3:
+                        __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmul.vv v8, v8, v28");
+                    case 2:
+                        __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmul.vv v4, v4, v28");
+                    case 1:
+                        __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("vfmul.vv v0, v0, v28");
+                    }
+                    first = false;
+                }
+                else {
+                    switch (b) {
+                    case 5:
+                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmacc.vv v16, v24, v28");
+                    case 4:
+                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmacc.vv v12, v24, v28");
+                    case 3:
+                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmacc.vv v8, v24, v28");
+                    case 2:
+                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        __asm__("vfmacc.vv v4, v24, v28");
+                    case 1:
+                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("vfmacc.vv v0, v24, v28");
+                    }
+                }
+            } // end a non-unit stride
+            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+            avl -= vl;
+        }
+
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+        __asm__("vmv.s.x v31, x0");
+        switch (b) {
+        case 5:
+            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v16, v16, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (*beta == 0.) {
+                __asm__("vfmul.vf v16, v16, ft10");
+                __asm__(VSE "v16, (%0)" : : "r"(y));
+            }
+            else {
+                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+                __asm__(FMUL "ft0, ft11, ft0");
+                __asm__("vfmv.s.f v30, ft0");
+                __asm__("vfmacc.vf v30, ft10, v16");
+                __asm__(VSE "v30, (%0)" : : "r"(y));
+            }
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+        case 4:
+            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v12, v12, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (*beta == 0.) {
+                __asm__("vfmul.vf v12, v12, ft10");
+                __asm__(VSE "v12, (%0)" : : "r"(y));
+            }
+            else {
+                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+                __asm__(FMUL "ft0, ft11, ft0");
+                __asm__("vfmv.s.f v30, ft0");
+                __asm__("vfmacc.vf v30, ft10, v12");
+                __asm__(VSE "v30, (%0)" : : "r"(y));
+            }
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+        case 3:
+            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v8, v8, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (*beta == 0.) {
+                __asm__("vfmul.vf v8, v8, ft10");
+                __asm__(VSE "v8, (%0)" : : "r"(y));
+            }
+            else {
+                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+                __asm__(FMUL "ft0, ft11, ft0");
+                __asm__("vfmv.s.f v30, ft0");
+                __asm__("vfmacc.vf v30, ft10, v8");
+                __asm__(VSE "v30, (%0)" : : "r"(y));
+            }
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+        case 2:
+            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v4, v4, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (*beta == 0.) {
+                __asm__("vfmul.vf v4, v4, ft10");
+                __asm__(VSE "v4, (%0)" : : "r"(y));
+            }
+            else {
+                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+                __asm__(FMUL "ft0, ft11, ft0");
+                __asm__("vfmv.s.f v30, ft0");
+                __asm__("vfmacc.vf v30, ft10, v4");
+                __asm__(VSE "v30, (%0)" : : "r"(y));
+            }
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+        case 1:
+            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v0, v0, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (*beta == 0.) {
+                __asm__("vfmul.vf v0, v0, ft10");
+                __asm__(VSE "v0, (%0)" : : "r"(y));
+            }
+            else {
+                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+                __asm__(FMUL "ft0, ft11, ft0");
+                __asm__("vfmv.s.f v30, ft0");
+                __asm__("vfmacc.vf v30, ft10, v0");
+                __asm__(VSE "v30, (%0)" : : "r"(y));
+            }
+        }
+    } // end cleanup
+    return;
+}
+
+#undef FLT_SIZE // drop the preceding kernel's macros; some are redefined below
+#undef FLT_LOAD
+#undef FMUL
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4 // bytes per real component; scomplex strides use 2 * FLT_SIZE
+#define FLT_LOAD "flw " // scalar load of one 32-bit float (alpha, beta, y parts)
+#define FMUL "fmul.s " // scalar single-precision multiply mnemonic
+#define FMADD "fmadd.s " // scalar single-precision fused multiply-add mnemonic
+#define FNMSUB "fnmsub.s " // scalar single-precision fnmsub mnemonic
+#define VLSEG2 "vlseg2e32.v " // unit-stride 2-field segment load (inc == 2 * FLT_SIZE)
+#define VLSSEG2 "vlsseg2e32.v " // strided 2-field segment load (general inc)
+#define VSSEG2 "vsseg2e32.v " // unit-stride 2-field segment store mnemonic
+#define VSSSEG2 "vssseg2e32.v " // strided 2-field segment store mnemonic
+#define VSE "vse32.v " // unit-stride vector store of 32-bit elements
+
+void bli_cdotxf_sifive_x280_asm(
+              conj_t           conjat,
+              conj_t           conjx,
+              dim_t            m,
+              dim_t            b,
+        const void*   restrict alpha_,
+        const void*   restrict a_, inc_t inca, inc_t lda,
+        const void*   restrict x_, inc_t incx,
+        const void*   restrict beta_,
+              void*   restrict y_, inc_t incy,
+        const cntx_t* restrict cntx
+        ) {
+    (void)cntx;
+    const scomplex* restrict alpha = alpha_;
+    const scomplex* restrict a = a_;
+    const scomplex* restrict x = x_;
+    const scomplex* restrict beta = beta_;
+    scomplex* restrict y = y_;
+
+    if (b == 0)
+        return;
+    else if (m == 0 || (alpha->real == 0.f && alpha->imag == 0.f)) {
+        // scale y by beta
+        if (beta->real == 0.f && beta->imag == 0.f)
+            bli_csetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        else
+            bli_cscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        return;
+    }
+
+    __asm__(FLT_LOAD "ft8, (%0)" : : "r"(alpha));
+    __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(beta));
+    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(beta), "I"(FLT_SIZE));
+    // Reduce to case when A^T is not conjugated, then conjugate
+    // computed product A^T * x if needed.
+    conj_t conjatx = BLIS_NO_CONJUGATE;
+    if (conjat == BLIS_CONJUGATE) {
+        bli_toggle_conj(&conjat);
+        bli_toggle_conj(&conjx);
+        bli_toggle_conj(&conjatx);
+    }
+    inca *= 2 * FLT_SIZE;
+    lda *= 2 * FLT_SIZE;
+    incx *= 2 * FLT_SIZE;
+    incy *= 2 * FLT_SIZE;
+    // these are used to bump a and y, resp.
+    inc_t a_bump = 6 * lda;
+    inc_t y_bump = incy - FLT_SIZE;
+    while (b >= 6) {
+        // compute dot product of x with 6 rows of a
+        const scomplex* restrict x_tmp = x;
+        const scomplex* restrict a_col = a;
+        size_t avl = m;
+        bool first = true;
+        while (avl) {
+            const scomplex* restrict a_row = a_col;
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+            if (incx == 2 * FLT_SIZE)
+                __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp));
+            else
+                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+            if (inca == 2 * FLT_SIZE) {
+                if (conjx == BLIS_NO_CONJUGATE) {
+                    // a unit stride, conjx = no conj
+                    if (first) {
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v0, v2, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v4, v6, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v8, v10, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v12, v14, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v16, v18, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        vcmul_vv(v20, v22, v24, v26, v28, v30);
+                        first = false;
+                    }
+                    else {
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v0, v2, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v4, v6, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v8, v10, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v12, v14, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v16, v18, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        vcmacc_vv(v20, v22, v24, v26, v28, v30);
+                    }
+                } // end conjx == BLIS_NO_CONJUGATE
+                else { // conjx == BLIS_CONJUGATE
+                    // a unit stride, conjx = conj
+                    if (first) {
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        vcmul_vv_conj(v20, v22, v24, v26, v28, v30);
+                        first = false;
+                    }
+                    else {
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        vcmacc_vv_conj(v20, v22, v24, v26, v28, v30);
+                    }
+                } // end conjx == BLIS_CONJUGATE
+            } // end a unit stride
+            else { // a non-unit stride
+                if (conjx == BLIS_NO_CONJUGATE) {
+                    // a non-unit stride, conjx = no conj
+                    if (first) {
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v0, v2, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v4, v6, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v8, v10, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v12, v14, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v16, v18, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        vcmul_vv(v20, v22, v24, v26, v28, v30);
+                        first = false;
+                    }
+                    else {
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v0, v2, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v4, v6, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v8, v10, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v12, v14, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v16, v18, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        vcmacc_vv(v20, v22, v24, v26, v28, v30);
+                    }
+                } // end conjx == BLIS_NO_CONJUGATE
+                else { // conjx = BLIS_CONJUGATE
+                    // a non-unit stride, conjx = conj
+                    if (first) {
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        vcmul_vv_conj(v20, v22, v24, v26, v28, v30);
+                        first = false;
+                    }
+                    else {
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        vcmacc_vv_conj(v20, v22, v24, v26, v28, v30);
+                    }
+                } // end conjx == BLIS_CONJUGATE
+            } // end a non-unit stride
+            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+            avl -= vl;
+        }
+
+        __asm__("vmv.s.x v31, x0");
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v0, v0, v31");
+        __asm__("vfredusum.vs v2, v2, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0.f && beta->imag == 0.f) {
+            if (conjatx == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v0, v2, ft8, ft9);
+            }
+            else {
+                vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9);
+            }
+        }
+        else {
+            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+            __asm__("vfmv.s.f v28, ft0");
+            __asm__("vfmv.s.f v29, ft1");
+            if (conjatx == BLIS_NO_CONJUGATE) {
+              vcmacc_vf(v28, v29, ft8, ft9, v0, v2);
+            }
+            else {
+              vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2);
+            }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v4, v4, v31");
+        __asm__("vfredusum.vs v6, v6, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0.f && beta->imag == 0.f) {
+            if (conjatx == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v4, v6, ft8, ft9);
+            }
+            else {
+                vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9);
+            }
+        }
+        else {
+            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+            __asm__("vfmv.s.f v28, ft0");
+            __asm__("vfmv.s.f v29, ft1");
+            if (conjatx == BLIS_NO_CONJUGATE) {
+              vcmacc_vf(v28, v29, ft8, ft9, v4, v6);
+            }
+            else {
+              vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6);
+            }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v8, v8, v31");
+        __asm__("vfredusum.vs v10, v10, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0.f && beta->imag == 0.f) {
+            if (conjatx == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v8, v10, ft8, ft9);
+            }
+            else {
+                vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9);
+            }
+        }
+        else {
+            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+            __asm__("vfmv.s.f v28, ft0");
+            __asm__("vfmv.s.f v29, ft1");
+            if (conjatx == BLIS_NO_CONJUGATE) {
+              vcmacc_vf(v28, v29, ft8, ft9, v8, v10);
+            }
+            else {
+              vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10);
+            }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v12, v12, v31");
+        __asm__("vfredusum.vs v14, v14, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0.f && beta->imag == 0.f) {
+            if (conjatx == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v12, v14, ft8, ft9);
+            }
+            else {
+                vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9);
+            }
+        }
+        else {
+            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+            __asm__("vfmv.s.f v28, ft0");
+            __asm__("vfmv.s.f v29, ft1");
+            if (conjatx == BLIS_NO_CONJUGATE) {
+              vcmacc_vf(v28, v29, ft8, ft9, v12, v14);
+            }
+            else {
+              vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14);
+            }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v16, v16, v31");
+        __asm__("vfredusum.vs v18, v18, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0.f && beta->imag == 0.f) {
+            if (conjatx == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v16, v18, ft8, ft9);
+            }
+            else {
+                vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9);
+            }
+        }
+        else {
+            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+            __asm__("vfmv.s.f v28, ft0");
+            __asm__("vfmv.s.f v29, ft1");
+            if (conjatx == BLIS_NO_CONJUGATE) {
+              vcmacc_vf(v28, v29, ft8, ft9, v16, v18);
+            }
+            else {
+              vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18);
+            }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v20, v20, v31");
+        __asm__("vfredusum.vs v22, v22, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0.f && beta->imag == 0.f) {
+            if (conjatx == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v20, v22, ft8, ft9);
+            }
+            else {
+                vcmul_vf_conj(v28, v29, v20, v22, ft8, ft9);
+            }
+        }
+        else {
+            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+            __asm__("vfmv.s.f v28, ft0");
+            __asm__("vfmv.s.f v29, ft1");
+            if (conjatx == BLIS_NO_CONJUGATE) {
+              vcmacc_vf(v28, v29, ft8, ft9, v20, v22);
+            }
+            else {
+              vcmacc_vf_conj(v28, v29, ft8, ft9, v20, v22);
+            }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        // a += 6 * lda;
+        __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+        b -= 6;
+    }
+
+    if (b > 0) {
+        // cleanup loop, 0 < b < 6
+        const scomplex* restrict x_tmp = x;
+        const scomplex* restrict a_col;
+        __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+        size_t avl = m;
+        bool first = true;
+        while (avl) {
+            const scomplex* restrict a_row = a_col;
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+            if (incx == 2 * FLT_SIZE)
+                __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp));
+            else
+                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+            if (inca == 2 * FLT_SIZE) {
+                if (conjx == BLIS_NO_CONJUGATE) {
+                    // a unit stride, conjx = no conj
+                    if (first) {
+                        switch (b) {
+                        case 5:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv(v16, v18, v24, v26, v28, v30);
+                        case 4:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          vcmul_vv(v0, v2, v24, v26, v28, v30);
+                        }
+                        first = false;
+                    }
+                    else {
+                        switch (b) {
+                        case 5:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv(v16, v18, v24, v26, v28, v30);
+                        case 4:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          vcmacc_vv(v0, v2, v24, v26, v28, v30);
+                        }
+                    }
+                } // end conjx == BLIS_NO_CONJUGATE
+                else { // conjx == BLIS_CONJUGATE
+                    // a unit stride, conjx = conj
+                    if (first) {
+                        switch (b) {
+                        case 5:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+                        case 4:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+                        }
+                        first = false;
+                    }
+                    else {
+                        switch (b) {
+                        case 5:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+                        case 4:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+                        }
+                    }
+                } // end conjx == BLIS_CONJUGATE
+            } // end a unit stride
+            else { // a non-unit stride
+                if (conjx == BLIS_NO_CONJUGATE) {
+                    // a non-unit stride, conjx = no conj
+                    if (first) {
+                        switch (b) {
+                        case 5:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv(v16, v18, v24, v26, v28, v30);
+                        case 4:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          vcmul_vv(v0, v2, v24, v26, v28, v30);
+                        }
+                        first = false;
+                    }
+                    else {
+                        switch (b) {
+                        case 5:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv(v16, v18, v24, v26, v28, v30);
+                        case 4:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          vcmacc_vv(v0, v2, v24, v26, v28, v30);
+                        }
+                    }
+                } // end conjx == BLIS_NO_CONJUGATE
+                else { // conjx == BLIS_CONJUGATE
+                    // a non-unit stride, conjx = conj
+                    if (first) {
+                        switch (b) {
+                        case 5:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+                        case 4:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+                        }
+                        first = false;
+                    }
+                    else {
+                        switch (b) {
+                        case 5:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+                        case 4:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+                        }
+                    }
+                } // end conjx == BLIS_CONJUGATE
+            } // end a non-unit stride
+            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+            avl -= vl;
+        }
+
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+        y_bump = incy + FLT_SIZE;
+        __asm__("vmv.s.x v31, x0");
+        
+        switch (b) {
+        case 5:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v16, v16, v31");
+            __asm__("vfredusum.vs v18, v18, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0.f && beta->imag == 0.f) {
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                    vcmul_vf(v28, v29, v16, v18, ft8, ft9);
+                }
+                else {
+                    vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9);
+                }
+            }
+            else {
+                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+                __asm__("vfmv.s.f v28, ft0");
+                __asm__("vfmv.s.f v29, ft1");
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                  vcmacc_vf(v28, v29, ft8, ft9, v16, v18);
+                }
+                else {
+                  vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18);
+                }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+        case 4:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v12, v12, v31");
+            __asm__("vfredusum.vs v14, v14, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0.f && beta->imag == 0.f) {
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                    vcmul_vf(v28, v29, v12, v14, ft8, ft9);
+                }
+                else {
+                    vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9);
+                }
+            }
+            else {
+                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+                __asm__("vfmv.s.f v28, ft0");
+                __asm__("vfmv.s.f v29, ft1");
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                  vcmacc_vf(v28, v29, ft8, ft9, v12, v14);
+                }
+                else {
+                  vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14);
+                }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+        case 3:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v8, v8, v31");
+            __asm__("vfredusum.vs v10, v10, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0.f && beta->imag == 0.f) {
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                    vcmul_vf(v28, v29, v8, v10, ft8, ft9);
+                }
+                else {
+                    vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9);
+                }
+            }
+            else {
+                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+                __asm__("vfmv.s.f v28, ft0");
+                __asm__("vfmv.s.f v29, ft1");
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                  vcmacc_vf(v28, v29, ft8, ft9, v8, v10);
+                }
+                else {
+                  vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10);
+                }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+        case 2:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v4, v4, v31");
+            __asm__("vfredusum.vs v6, v6, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0.f && beta->imag == 0.f) {
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                    vcmul_vf(v28, v29, v4, v6, ft8, ft9);
+                }
+                else {
+                    vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9);
+                }
+            }
+            else {
+                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+                __asm__("vfmv.s.f v28, ft0");
+                __asm__("vfmv.s.f v29, ft1");
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                  vcmacc_vf(v28, v29, ft8, ft9, v4, v6);
+                }
+                else {
+                  vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6);
+                }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+        case 1:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v0, v0, v31");
+            __asm__("vfredusum.vs v2, v2, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0.f && beta->imag == 0.f) {
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                    vcmul_vf(v28, v29, v0, v2, ft8, ft9);
+                }
+                else {
+                    vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9);
+                }
+            }
+            else {
+                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+                __asm__("vfmv.s.f v28, ft0");
+                __asm__("vfmv.s.f v29, ft1");
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                  vcmacc_vf(v28, v29, ft8, ft9, v0, v2);
+                }
+                else {
+                  vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2);
+                }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+        }
+    } // end cleanup
+    return;
+}
+
+#undef FLT_SIZE // discard the definitions used by the kernel above; all ten macros are redefined below
+#undef FLT_LOAD
+#undef FMUL
+#undef FMADD
+#undef FNMSUB
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef VSE
+
+#define FLT_SIZE 8 // bytes per real component: imaginary part sits at offset FLT_SIZE, strides scale by 2 * FLT_SIZE, SEW is 8 * FLT_SIZE bits
+#define FLT_LOAD "fld " // double-precision scalar load; the trailing space lets the operand string be concatenated directly after it
+#define FMUL "fmul.d " // double-precision scalar multiply; NOTE(review): presumably consumed by the complex-arithmetic helper macros - confirm
+#define FMADD "fmadd.d " // double-precision fused multiply-add; NOTE(review): presumably consumed by the helper macros - confirm
+#define FNMSUB "fnmsub.d " // double-precision negated fused multiply-subtract; NOTE(review): presumably consumed by the helper macros - confirm
+#define VLSEG2 "vlseg2e64.v " // unit-stride two-field segment load of 64-bit elements (splits real/imag into separate register groups)
+#define VLSSEG2 "vlsseg2e64.v " // strided two-field segment load of 64-bit elements; takes the byte stride as an extra register operand
+#define VSSEG2 "vsseg2e64.v " // unit-stride two-field segment store of 64-bit elements
+#define VSSSEG2 "vssseg2e64.v " // strided two-field segment store of 64-bit elements
+#define VSE "vse64.v " // unit-stride vector store of 64-bit elements
+
+void bli_zdotxf_sifive_x280_asm(
+              conj_t           conjat,
+              conj_t           conjx,
+              dim_t            m,
+              dim_t            b,
+        const void*   restrict alpha_,
+        const void*   restrict a_, inc_t inca, inc_t lda,
+        const void*   restrict x_, inc_t incx,
+        const void*   restrict beta_,
+              void*   restrict y_, inc_t incy,
+        const cntx_t* restrict cntx
+        ) {
+    (void)cntx;
+    const dcomplex* restrict alpha = alpha_;
+    const dcomplex* restrict a = a_;
+    const dcomplex* restrict x = x_;
+    const dcomplex* restrict beta = beta_;
+    dcomplex* restrict y = y_;
+
+    if (b == 0)
+        return;
+    else if (m == 0 || (alpha->real == 0. && alpha->imag == 0.)) {
+        // scale y by beta
+        if (beta->real == 0. && beta->imag == 0.)
+            bli_zsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        else
+            bli_zscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        return;
+    }
+
+    __asm__(FLT_LOAD "ft8, (%0)" : : "r"(alpha));
+    __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(beta));
+    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(beta), "I"(FLT_SIZE));
+    // Reduce to case when A^T is not conjugated, then conjugate
+    // computed product A^T * x if needed.
+    conj_t conjatx = BLIS_NO_CONJUGATE;
+    if (conjat == BLIS_CONJUGATE) {
+        bli_toggle_conj(&conjat);
+        bli_toggle_conj(&conjx);
+        bli_toggle_conj(&conjatx);
+    }
+    inca *= 2 * FLT_SIZE;
+    lda *= 2 * FLT_SIZE;
+    incx *= 2 * FLT_SIZE;
+    incy *= 2 * FLT_SIZE;
+    // these are used to bump a and y, resp.
+    inc_t a_bump = 6 * lda;
+    inc_t y_bump = incy - FLT_SIZE;
+    while (b >= 6) {
+        // compute dot product of x with 6 rows of a
+        const dcomplex* restrict x_tmp = x;
+        const dcomplex* restrict a_col = a;
+        size_t avl = m;
+        bool first = true;
+        while (avl) {
+            const dcomplex* restrict a_row = a_col;
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+            if (incx == 2 * FLT_SIZE)
+                __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp));
+            else
+                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+            if (inca == 2 * FLT_SIZE) {
+                if (conjx == BLIS_NO_CONJUGATE) {
+                    // a unit stride, conjx = no conj
+                    if (first) {
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v0, v2, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v4, v6, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v8, v10, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v12, v14, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v16, v18, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        vcmul_vv(v20, v22, v24, v26, v28, v30);
+                        first = false;
+                    }
+                    else {
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v0, v2, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v4, v6, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v8, v10, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v12, v14, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v16, v18, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        vcmacc_vv(v20, v22, v24, v26, v28, v30);
+                    }
+                } // end conjx == BLIS_NO_CONJUGATE
+                else { // conjx == BLIS_CONJUGATE
+                    // a unit stride, conjx = conj
+                    if (first) {
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        vcmul_vv_conj(v20, v22, v24, v26, v28, v30);
+                        first = false;
+                    }
+                    else {
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                        vcmacc_vv_conj(v20, v22, v24, v26, v28, v30);
+                    }
+                } // end conjx == BLIS_CONJUGATE
+            } // end a unit stride
+            else { // a non-unit stride
+                if (conjx == BLIS_NO_CONJUGATE) {
+                    // a non-unit stride, conjx = no conj
+                    if (first) {
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v0, v2, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v4, v6, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v8, v10, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v12, v14, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv(v16, v18, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        vcmul_vv(v20, v22, v24, v26, v28, v30);
+                        first = false;
+                    }
+                    else {
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v0, v2, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v4, v6, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v8, v10, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v12, v14, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv(v16, v18, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        vcmacc_vv(v20, v22, v24, v26, v28, v30);
+                    }
+                } // end conjx == BLIS_NO_CONJUGATE
+                else { // conjx = BLIS_CONJUGATE
+                    // a non-unit stride, conjx = conj
+                    if (first) {
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        vcmul_vv_conj(v20, v22, v24, v26, v28, v30);
+                        first = false;
+                    }
+                    else {
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                        vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                        vcmacc_vv_conj(v20, v22, v24, v26, v28, v30);
+                    }
+                } // end conjx == BLIS_CONJUGATE
+            } // end a non-unit stride
+            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+            avl -= vl;
+        }
+
+        __asm__("vmv.s.x v31, x0");
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v0, v0, v31");
+        __asm__("vfredusum.vs v2, v2, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0. && beta->imag == 0.) {
+            if (conjatx == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v0, v2, ft8, ft9);
+            }
+            else {
+                vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9);
+            }
+        }
+        else {
+            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+            __asm__("vfmv.s.f v28, ft0");
+            __asm__("vfmv.s.f v29, ft1");
+            if (conjatx == BLIS_NO_CONJUGATE) {
+              vcmacc_vf(v28, v29, ft8, ft9, v0, v2);
+            }
+            else {
+              vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2);
+            }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v4, v4, v31");
+        __asm__("vfredusum.vs v6, v6, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0. && beta->imag == 0.) {
+            if (conjatx == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v4, v6, ft8, ft9);
+            }
+            else {
+                vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9);
+            }
+        }
+        else {
+            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+            __asm__("vfmv.s.f v28, ft0");
+            __asm__("vfmv.s.f v29, ft1");
+            if (conjatx == BLIS_NO_CONJUGATE) {
+              vcmacc_vf(v28, v29, ft8, ft9, v4, v6);
+            }
+            else {
+              vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6);
+            }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v8, v8, v31");
+        __asm__("vfredusum.vs v10, v10, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0. && beta->imag == 0.) {
+            if (conjatx == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v8, v10, ft8, ft9);
+            }
+            else {
+                vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9);
+            }
+        }
+        else {
+            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+            __asm__("vfmv.s.f v28, ft0");
+            __asm__("vfmv.s.f v29, ft1");
+            if (conjatx == BLIS_NO_CONJUGATE) {
+              vcmacc_vf(v28, v29, ft8, ft9, v8, v10);
+            }
+            else {
+              vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10);
+            }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v12, v12, v31");
+        __asm__("vfredusum.vs v14, v14, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0. && beta->imag == 0.) {
+            if (conjatx == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v12, v14, ft8, ft9);
+            }
+            else {
+                vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9);
+            }
+        }
+        else {
+            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+            __asm__("vfmv.s.f v28, ft0");
+            __asm__("vfmv.s.f v29, ft1");
+            if (conjatx == BLIS_NO_CONJUGATE) {
+              vcmacc_vf(v28, v29, ft8, ft9, v12, v14);
+            }
+            else {
+              vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14);
+            }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v16, v16, v31");
+        __asm__("vfredusum.vs v18, v18, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0. && beta->imag == 0.) {
+            if (conjatx == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v16, v18, ft8, ft9);
+            }
+            else {
+                vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9);
+            }
+        }
+        else {
+            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+            __asm__("vfmv.s.f v28, ft0");
+            __asm__("vfmv.s.f v29, ft1");
+            if (conjatx == BLIS_NO_CONJUGATE) {
+              vcmacc_vf(v28, v29, ft8, ft9, v16, v18);
+            }
+            else {
+              vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18);
+            }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+        __asm__("vfredusum.vs v20, v20, v31");
+        __asm__("vfredusum.vs v22, v22, v31");
+        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        if (beta->real == 0. && beta->imag == 0.) {
+            if (conjatx == BLIS_NO_CONJUGATE) {
+                vcmul_vf(v28, v29, v20, v22, ft8, ft9);
+            }
+            else {
+                vcmul_vf_conj(v28, v29, v20, v22, ft8, ft9);
+            }
+        }
+        else {
+            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+            __asm__("vfmv.s.f v28, ft0");
+            __asm__("vfmv.s.f v29, ft1");
+            if (conjatx == BLIS_NO_CONJUGATE) {
+              vcmacc_vf(v28, v29, ft8, ft9, v20, v22);
+            }
+            else {
+              vcmacc_vf_conj(v28, v29, ft8, ft9, v20, v22);
+            }
+        }
+        __asm__(VSE "v28, (%0)" : : "r"(y));
+        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+        __asm__(VSE "v29, (%0)" : : "r"(y));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+        // a += 6 * lda;
+        __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+        b -= 6;
+    }
+
+    if (b > 0) {
+        // cleanup loop, 0 < b < 6
+        const dcomplex* restrict x_tmp = x;
+        const dcomplex* restrict a_col;
+        __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+        size_t avl = m;
+        bool first = true;
+        while (avl) {
+            const dcomplex* restrict a_row = a_col;
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+            if (incx == 2 * FLT_SIZE)
+                __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp));
+            else
+                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+            if (inca == 2 * FLT_SIZE) {
+                if (conjx == BLIS_NO_CONJUGATE) {
+                    // a unit stride, conjx = no conj
+                    if (first) {
+                        switch (b) {
+                        case 5:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv(v16, v18, v24, v26, v28, v30);
+                        case 4:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          vcmul_vv(v0, v2, v24, v26, v28, v30);
+                        }
+                        first = false;
+                    }
+                    else {
+                        switch (b) {
+                        case 5:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv(v16, v18, v24, v26, v28, v30);
+                        case 4:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          vcmacc_vv(v0, v2, v24, v26, v28, v30);
+                        }
+                    }
+                } // end conjx == BLIS_NO_CONJUGATE
+                else { // conjx == BLIS_CONJUGATE
+                    // a unit stride, conjx = conj
+                    if (first) {
+                        switch (b) {
+                        case 5:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+                        case 4:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+                        }
+                        first = false;
+                    }
+                    else {
+                        switch (b) {
+                        case 5:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+                        case 4:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+                          vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+                        }
+                    }
+                } // end conjx == BLIS_CONJUGATE
+            } // end a unit stride
+            else { // a non-unit stride
+                if (conjx == BLIS_NO_CONJUGATE) {
+                    // a non-unit stride, conjx = no conj
+                    if (first) {
+                        switch (b) {
+                        case 5:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv(v16, v18, v24, v26, v28, v30);
+                        case 4:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          vcmul_vv(v0, v2, v24, v26, v28, v30);
+                        }
+                        first = false;
+                    }
+                    else {
+                        switch (b) {
+                        case 5:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv(v16, v18, v24, v26, v28, v30);
+                        case 4:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          vcmacc_vv(v0, v2, v24, v26, v28, v30);
+                        }
+                    }
+                } // end conjx == BLIS_NO_CONJUGATE
+                else { // conjx == BLIS_CONJUGATE
+                    // a non-unit stride, conjx = conj
+                    if (first) {
+                        switch (b) {
+                        case 5:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+                        case 4:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+                        }
+                        first = false;
+                    }
+                    else {
+                        switch (b) {
+                        case 5:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+                        case 4:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+                        case 3:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+                        case 2:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+                          vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+                        case 1:
+                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+                          vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+                        }
+                    }
+                } // end conjx == BLIS_CONJUGATE
+            } // end a non-unit stride
+            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+            avl -= vl;
+        }
+
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+        y_bump = incy + FLT_SIZE;
+        __asm__("vmv.s.x v31, x0");
+        
+        switch (b) {
+        case 5:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v16, v16, v31");
+            __asm__("vfredusum.vs v18, v18, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0. && beta->imag == 0.) {
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                    vcmul_vf(v28, v29, v16, v18, ft8, ft9);
+                }
+                else {
+                    vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9);
+                }
+            }
+            else {
+                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+                __asm__("vfmv.s.f v28, ft0");
+                __asm__("vfmv.s.f v29, ft1");
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                  vcmacc_vf(v28, v29, ft8, ft9, v16, v18);
+                }
+                else {
+                  vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18);
+                }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+        case 4:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v12, v12, v31");
+            __asm__("vfredusum.vs v14, v14, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0. && beta->imag == 0.) {
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                    vcmul_vf(v28, v29, v12, v14, ft8, ft9);
+                }
+                else {
+                    vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9);
+                }
+            }
+            else {
+                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+                __asm__("vfmv.s.f v28, ft0");
+                __asm__("vfmv.s.f v29, ft1");
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                  vcmacc_vf(v28, v29, ft8, ft9, v12, v14);
+                }
+                else {
+                  vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14);
+                }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+        case 3:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v8, v8, v31");
+            __asm__("vfredusum.vs v10, v10, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0. && beta->imag == 0.) {
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                    vcmul_vf(v28, v29, v8, v10, ft8, ft9);
+                }
+                else {
+                    vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9);
+                }
+            }
+            else {
+                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+                __asm__("vfmv.s.f v28, ft0");
+                __asm__("vfmv.s.f v29, ft1");
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                  vcmacc_vf(v28, v29, ft8, ft9, v8, v10);
+                }
+                else {
+                  vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10);
+                }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+        case 2:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v4, v4, v31");
+            __asm__("vfredusum.vs v6, v6, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0. && beta->imag == 0.) {
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                    vcmul_vf(v28, v29, v4, v6, ft8, ft9);
+                }
+                else {
+                    vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9);
+                }
+            }
+            else {
+                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+                __asm__("vfmv.s.f v28, ft0");
+                __asm__("vfmv.s.f v29, ft1");
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                  vcmacc_vf(v28, v29, ft8, ft9, v4, v6);
+                }
+                else {
+                  vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6);
+                }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+        case 1:
+            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+            __asm__("vfredusum.vs v0, v0, v31");
+            __asm__("vfredusum.vs v2, v2, v31");
+            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            if (beta->real == 0. && beta->imag == 0.) {
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                    vcmul_vf(v28, v29, v0, v2, ft8, ft9);
+                }
+                else {
+                    vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9);
+                }
+            }
+            else {
+                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+                __asm__("vfmv.s.f v28, ft0");
+                __asm__("vfmv.s.f v29, ft1");
+                if (conjatx == BLIS_NO_CONJUGATE) {
+                  vcmacc_vf(v28, v29, ft8, ft9, v0, v2);
+                }
+                else {
+                  vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2);
+                }
+            }
+            __asm__(VSE "v28, (%0)" : : "r"(y));
+            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+            __asm__(VSE "v29, (%0)" : : "r"(y));
+        }
+    } // end cleanup
+    return;
+}
diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c
new file mode 100644
index 000000000..35ca23677
--- /dev/null
+++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c
@@ -0,0 +1,678 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "../riscv_cmul_macros_asm.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSSEG7 "vssseg7e32.v "
+
+void bli_spackm_sifive_x280_asm_7xk // packs a cdim x n float block of a into p (7 slots per column), scaled by kappa
+     (
+             conj_t           conja, // unused (cast to void below)
+             pack_t           schema, // not referenced in the body
+             dim_t            cdim, // rows read from a; the switches below cover 0..7
+             dim_t            n, // columns read from a
+             dim_t            n_max, // columns n..n_max-1 of p are written as zeros
+       const void*   restrict kappa_, // read as a single float scale factor
+       const void*   restrict a_, inc_t inca, inc_t lda, // source; inca/lda = row/column stride in elements
+             void*   restrict p_,             inc_t ldp, // destination; ldp = elements between packed columns
+       const cntx_t*          cntx // unused (cast to void below)
+     )
+{
+    (void) conja;
+    (void) cntx;
+    const float* kappa = kappa_;
+    const float* a = a_;
+    float* p = p_;
+
+    float kappa_cast = *kappa;
+    if (lda == 1) { // each row of a is contiguous: vectorize along the n dimension
+        __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); // vl from n, used by the zeroing below
+        switch (cdim) { // zero v<cdim>..v6, the rows that are never loaded; cases fall through on purpose
+            case 0: __asm__("vmv.v.i v0, 0");
+            case 1: __asm__("vmv.v.i v1, 0");
+            case 2: __asm__("vmv.v.i v2, 0");
+            case 3: __asm__("vmv.v.i v3, 0");
+            case 4: __asm__("vmv.v.i v4, 0");
+            case 5: __asm__("vmv.v.i v5, 0");
+            case 6: __asm__("vmv.v.i v6, 0");
+        }
+        a += (cdim - 1) * inca; // start at the last row; rows are loaded last-to-first
+        size_t avl = n; // columns still to pack
+        while (avl) {
+            const float* a_tmp = a;
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); // vl = columns handled this pass
+            switch (cdim) { // row k goes into register vk; cases fall through on purpose
+                case 7:
+                    __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 6:
+                    __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 5:
+                    __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 4:
+                    __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 3:
+                    __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 2:
+                    __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 1:
+                    __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+            }
+            if (kappa_cast != 1.f) { // skip the multiply when kappa is exactly one
+                switch (cdim) { // scale only the rows that were loaded; fall-through intended
+                    case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+                    case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+                    case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+                    case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+                    case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+                    case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+                    case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+                }
+            }
+            __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp)); // lane j of v0..v6 -> 7 consecutive floats at p + j*ldp (stride in bytes)
+            a += vl; // lda == 1, so vl columns are vl elements
+            p += vl * ldp;
+            avl -= vl;
+        }
+        __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // vl = 7: one packed column per store
+        __asm__("vmv.v.i v0, 0");
+        for (size_t i = n; i < n_max; ++i) { // zero-fill columns n..n_max-1
+            __asm__(VSE "v0, (%0)" : : "r"(p));
+            p += ldp;  
+        }
+    }
+    else { // general lda: pack one column (up to 7 elements) per iteration
+        inca *= FLT_SIZE; // row stride converted from elements to bytes
+        __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v0, 0"); // zero all 7 slots; slots >= cdim stay zero since the loop uses tu
+        for (size_t i = 0; i < n; ++i) {
+            __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); // vl = cdim, tail undisturbed
+            if (inca == FLT_SIZE) { // unit row stride: contiguous load
+                __asm__(VLE "v0, (%0)" : : "r"(a));
+            }
+            else { // otherwise a strided load with byte stride inca
+                __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
+            }
+            if (kappa_cast != 1.f) {
+                __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+            }
+            __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // back to vl = 7 so the padding is stored too
+            __asm__(VSE "v0, (%0)" : : "r"(p));
+            a += lda;
+            p += ldp;
+        }
+        __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v0, 0");
+        for (size_t i = n; i < n_max; ++i) { // zero-fill columns n..n_max-1
+            __asm__(VSE "v0, (%0)" : : "r"(p));
+            p += ldp;  
+        }
+    }
+    return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSSEG7
+
+#define FLT_SIZE 8
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSSEG7 "vssseg7e64.v "
+
+void bli_dpackm_sifive_x280_asm_7xk // packs a cdim x n double block of a into p (7 slots per column), scaled by kappa
+     (
+             conj_t           conja, // unused (cast to void below)
+             pack_t           schema, // not referenced in the body
+             dim_t            cdim, // rows read from a; the switches below cover 0..7
+             dim_t            n, // columns read from a
+             dim_t            n_max, // columns n..n_max-1 of p are written as zeros
+       const void*   restrict kappa_, // read as a single double scale factor
+       const void*   restrict a_, inc_t inca, inc_t lda, // source; inca/lda = row/column stride in elements
+             void*   restrict p_,             inc_t ldp, // destination; ldp = elements between packed columns
+       const cntx_t*          cntx // unused (cast to void below)
+     )
+{
+    (void) conja;
+    (void) cntx;
+    const double* kappa = kappa_;
+    const double* a = a_;
+    double* p = p_;
+
+    double kappa_cast = *kappa;
+    if (lda == 1) { // each row of a is contiguous: vectorize along the n dimension
+        __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); // vl from n, used by the zeroing below
+        switch (cdim) { // zero v<cdim>..v6, the rows that are never loaded; cases fall through on purpose
+            case 0: __asm__("vmv.v.i v0, 0");
+            case 1: __asm__("vmv.v.i v1, 0");
+            case 2: __asm__("vmv.v.i v2, 0");
+            case 3: __asm__("vmv.v.i v3, 0");
+            case 4: __asm__("vmv.v.i v4, 0");
+            case 5: __asm__("vmv.v.i v5, 0");
+            case 6: __asm__("vmv.v.i v6, 0");
+        }
+        a += (cdim - 1) * inca; // start at the last row; rows are loaded last-to-first
+        size_t avl = n; // columns still to pack
+        while (avl) {
+            const double* a_tmp = a;
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); // vl = columns handled this pass
+            switch (cdim) { // row k goes into register vk; cases fall through on purpose
+                case 7:
+                    __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 6:
+                    __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 5:
+                    __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 4:
+                    __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 3:
+                    __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 2:
+                    __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 1:
+                    __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+            }
+            if (kappa_cast != 1.) { // skip the multiply when kappa is exactly one
+                switch (cdim) { // scale only the rows that were loaded; fall-through intended
+                    case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+                    case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+                    case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+                    case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+                    case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+                    case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+                    case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+                }
+            }
+            __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp)); // lane j of v0..v6 -> 7 consecutive doubles at p + j*ldp (stride in bytes)
+            a += vl; // lda == 1, so vl columns are vl elements
+            p += vl * ldp;
+            avl -= vl;
+        }
+        __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // vl = 7: one packed column per store
+        __asm__("vmv.v.i v0, 0");
+        for (size_t i = n; i < n_max; ++i) { // zero-fill columns n..n_max-1
+            __asm__(VSE "v0, (%0)" : : "r"(p));
+            p += ldp;  
+        }
+    }
+    else { // general lda: pack one column (up to 7 elements) per iteration
+        inca *= FLT_SIZE; // row stride converted from elements to bytes
+        __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v0, 0"); // zero all 7 slots; slots >= cdim stay zero since the loop uses tu
+        for (size_t i = 0; i < n; ++i) {
+            __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); // vl = cdim, tail undisturbed
+            if (inca == FLT_SIZE) { // unit row stride: contiguous load
+                __asm__(VLE "v0, (%0)" : : "r"(a));
+            }
+            else { // otherwise a strided load with byte stride inca
+                __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
+            }
+            if (kappa_cast != 1.) {
+                __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+            }
+            __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // back to vl = 7 so the padding is stored too
+            __asm__(VSE "v0, (%0)" : : "r"(p));
+            a += lda;
+            p += ldp;
+        }
+        __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v0, 0");
+        for (size_t i = n; i < n_max; ++i) { // zero-fill columns n..n_max-1
+            __asm__(VSE "v0, (%0)" : : "r"(p));
+            p += ldp;  
+        }
+    }
+    return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSSEG7
+
+#define FLT_SIZE 4
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG6 "vssseg6e32.v "
+
+void bli_cpackm_sifive_x280_asm_6xk // packs a cdim x n scomplex block of a into p (6 slots per column), scaled by kappa
+     (
+             conj_t           conja, // selects the conjugating code paths below
+             pack_t           schema, // not referenced in the body
+             dim_t            cdim, // rows read from a; the switches below cover 0..6
+             dim_t            n, // columns read from a
+             dim_t            n_max, // columns n..n_max-1 of p are written as zeros
+       const void*   restrict kappa_, // read as a single scomplex scale factor
+       const void*   restrict a_, inc_t inca, inc_t lda, // source; inca/lda = row/column stride in complex elements
+             void*   restrict p_,             inc_t ldp, // destination; ldp = complex elements between packed columns
+       const cntx_t*          cntx // unused (cast to void below)
+     )
+{
+    (void) cntx;
+    const scomplex* kappa = kappa_;
+    const scomplex* a = a_;
+    scomplex* p = p_;
+
+    scomplex kappa_cast = *kappa;
+    if (lda == 1) { // each row of a is contiguous: vectorize along the n dimension
+        __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); // vl from n, used by the zeroing below
+        if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { // kappa == 1: v0..v11 are stored directly, so pad those
+            switch (cdim) { // zero the register pairs of rows >= cdim; cases fall through on purpose
+                case 0:
+                    __asm__("vmv.v.i v0, 0");
+                    __asm__("vmv.v.i v1, 0");
+                case 1:
+                    __asm__("vmv.v.i v2, 0");
+                    __asm__("vmv.v.i v3, 0");
+                case 2:
+                    __asm__("vmv.v.i v4, 0");
+                    __asm__("vmv.v.i v5, 0");
+                case 3:
+                    __asm__("vmv.v.i v6, 0");
+                    __asm__("vmv.v.i v7, 0");
+                case 4:
+                    __asm__("vmv.v.i v8, 0");
+                    __asm__("vmv.v.i v9, 0");
+                case 5:
+                    __asm__("vmv.v.i v10, 0");
+                    __asm__("vmv.v.i v11, 0");
+            }
+        }
+        else { // kappa != 1: v12..v23 are the registers stored below, so pad those instead
+            switch (cdim) { // same fall-through scheme as above
+                case 0:
+                    __asm__("vmv.v.i v12, 0");
+                    __asm__("vmv.v.i v13, 0");
+                case 1:
+                    __asm__("vmv.v.i v14, 0");
+                    __asm__("vmv.v.i v15, 0");
+                case 2:
+                    __asm__("vmv.v.i v16, 0");
+                    __asm__("vmv.v.i v17, 0");
+                case 3:
+                    __asm__("vmv.v.i v18, 0");
+                    __asm__("vmv.v.i v19, 0");
+                case 4:
+                    __asm__("vmv.v.i v20, 0");
+                    __asm__("vmv.v.i v21, 0");
+                case 5:
+                    __asm__("vmv.v.i v22, 0");
+                    __asm__("vmv.v.i v23, 0");
+            }
+        }
+        a += (cdim - 1) * inca; // start at the last row; rows are loaded last-to-first
+        size_t avl = n; // columns still to pack
+        while (avl) {
+            const scomplex* a_tmp = a;
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); // vl = columns handled this pass
+            switch (cdim) { // row k -> (v2k, v2k+1): segment load splits the two floats of each element; fall-through intended
+                case 6:
+                    __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 5:
+                    __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 4:
+                    __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 3:
+                    __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 2:
+                    __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 1:
+                    __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+            }
+            if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { // no scaling needed
+                if (conja == BLIS_CONJUGATE) {
+                    switch (cdim) { // conjugate by negating the odd (second-field) register of each loaded row
+                        case 6: __asm__("vfneg.v v11, v11");
+                        case 5: __asm__("vfneg.v v9, v9");
+                        case 4: __asm__("vfneg.v v7, v7");
+                        case 3: __asm__("vfneg.v v5, v5");
+                        case 2: __asm__("vfneg.v v3, v3");
+                        case 1: __asm__("vfneg.v v1, v1");
+                    }
+                }
+                __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); // rows 0..2 (6 floats) of every column, byte stride between columns
+                __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); // rows 3..5, three complex elements further in
+            }
+            else {
+                if (conja == BLIS_NO_CONJUGATE) {
+                    switch (cdim) { // presumably (v12+2k, v13+2k) = kappa * (v2k, v2k+1); macro comes from riscv_cmul_macros_asm.h - confirm
+                        case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
+                        case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
+                        case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
+                        case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
+                        case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
+                        case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
+                    }
+                }
+                else {
+                    switch (cdim) { // presumably the same product with the loaded element conjugated - confirm against the macro
+                        case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
+                        case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
+                        case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
+                        case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
+                        case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
+                        case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
+                    }
+                }
+                __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); // scaled rows 0..2 of every column
+                __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); // scaled rows 3..5
+            }
+            a += vl; // lda == 1, so vl columns are vl elements
+            p += vl * ldp;
+            avl -= vl;
+        }
+        __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // vl = 6: one packed column per store
+        __asm__("vmv.v.i v0, 0");
+        __asm__("vmv.v.i v1, 0");
+        for (size_t i = n; i < n_max; ++i) { // zero-fill columns n..n_max-1
+            __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+            p += ldp;  
+        }
+    }
+    else { // general lda: pack one column (up to 6 elements) per iteration
+        inca *= 2 * FLT_SIZE; // row stride converted from complex elements to bytes
+        __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v0, 0"); // v0/v1: load targets, stored as-is when kappa == 1
+        __asm__("vmv.v.i v1, 0");
+        __asm__("vmv.v.i v2, 0"); // v2/v3: scaled results stored when kappa != 1
+        __asm__("vmv.v.i v3, 0");
+        for (size_t i = 0; i < n; ++i) {
+            __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); // vl = cdim, tail undisturbed keeps the zero padding
+            if (inca == 2 * FLT_SIZE) { // unit row stride: contiguous segment load
+                __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
+            }
+            else { // otherwise a strided segment load with byte stride inca
+                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
+            }
+            if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+                if (conja == BLIS_CONJUGATE) {
+                    __asm__("vfneg.v v1, v1"); // conjugate: negate the second-field register
+                }
+                __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // back to vl = 6 so the padding is stored too
+                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+            }
+            else {
+                if (conja == BLIS_NO_CONJUGATE) {
+                    vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); // presumably (v2, v3) = kappa * (v0, v1) - confirm against the macro
+                }
+                else {
+                    vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); // presumably the conjugating variant - confirm
+                }
+                __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // back to vl = 6 so the padding is stored too
+                __asm__(VSSEG2 "v2, (%0)" : : "r"(p));
+            }
+            a += lda;
+            p += ldp;
+        }
+        __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v0, 0");
+        __asm__("vmv.v.i v1, 0");
+        for (size_t i = n; i < n_max; ++i) { // zero-fill columns n..n_max-1
+            __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+            p += ldp;  
+        }
+    }
+    return;
+}
+
+#undef FLT_SIZE
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG6
+
+#define FLT_SIZE 8
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG6 "vssseg6e64.v "
+
+void bli_zpackm_sifive_x280_asm_6xk // packs a cdim x n dcomplex block of a into p (6 slots per column), scaled by kappa
+     (
+             conj_t           conja, // selects the conjugating code paths below
+             pack_t           schema, // not referenced in the body
+             dim_t            cdim, // rows read from a; the switches below cover 0..6
+             dim_t            n, // columns read from a
+             dim_t            n_max, // columns n..n_max-1 of p are written as zeros
+       const void*   restrict kappa_, // read as a single dcomplex scale factor
+       const void*   restrict a_, inc_t inca, inc_t lda, // source; inca/lda = row/column stride in complex elements
+             void*   restrict p_,             inc_t ldp, // destination; ldp = complex elements between packed columns
+       const cntx_t*          cntx // unused (cast to void below)
+     )
+{
+    (void) cntx;
+    const dcomplex* kappa = kappa_;
+    const dcomplex* a = a_;
+    dcomplex* p = p_;
+
+    dcomplex kappa_cast = *kappa;
+    if (lda == 1) { // each row of a is contiguous: vectorize along the n dimension
+        __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); // vl from n, used by the zeroing below
+        if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { // kappa == 1: v0..v11 are stored directly, so pad those
+            switch (cdim) { // zero the register pairs of rows >= cdim; cases fall through on purpose
+                case 0:
+                    __asm__("vmv.v.i v0, 0");
+                    __asm__("vmv.v.i v1, 0");
+                case 1:
+                    __asm__("vmv.v.i v2, 0");
+                    __asm__("vmv.v.i v3, 0");
+                case 2:
+                    __asm__("vmv.v.i v4, 0");
+                    __asm__("vmv.v.i v5, 0");
+                case 3:
+                    __asm__("vmv.v.i v6, 0");
+                    __asm__("vmv.v.i v7, 0");
+                case 4:
+                    __asm__("vmv.v.i v8, 0");
+                    __asm__("vmv.v.i v9, 0");
+                case 5:
+                    __asm__("vmv.v.i v10, 0");
+                    __asm__("vmv.v.i v11, 0");
+            }
+        }
+        else { // kappa != 1: v12..v23 are the registers stored below, so pad those instead
+            switch (cdim) { // same fall-through scheme as above
+                case 0:
+                    __asm__("vmv.v.i v12, 0");
+                    __asm__("vmv.v.i v13, 0");
+                case 1:
+                    __asm__("vmv.v.i v14, 0");
+                    __asm__("vmv.v.i v15, 0");
+                case 2:
+                    __asm__("vmv.v.i v16, 0");
+                    __asm__("vmv.v.i v17, 0");
+                case 3:
+                    __asm__("vmv.v.i v18, 0");
+                    __asm__("vmv.v.i v19, 0");
+                case 4:
+                    __asm__("vmv.v.i v20, 0");
+                    __asm__("vmv.v.i v21, 0");
+                case 5:
+                    __asm__("vmv.v.i v22, 0");
+                    __asm__("vmv.v.i v23, 0");
+            }
+        }
+        a += (cdim - 1) * inca; // start at the last row; rows are loaded last-to-first
+        size_t avl = n; // columns still to pack
+        while (avl) {
+            const dcomplex* a_tmp = a;
+            size_t vl;
+            __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); // vl = columns handled this pass
+            switch (cdim) { // row k -> (v2k, v2k+1): segment load splits the two doubles of each element; fall-through intended
+                case 6:
+                    __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 5:
+                    __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 4:
+                    __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 3:
+                    __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 2:
+                    __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+                    a_tmp -= inca;
+                case 1:
+                    __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+            }
+            if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { // no scaling needed
+                if (conja == BLIS_CONJUGATE) {
+                    switch (cdim) { // conjugate by negating the odd (second-field) register of each loaded row
+                        case 6: __asm__("vfneg.v v11, v11");
+                        case 5: __asm__("vfneg.v v9, v9");
+                        case 4: __asm__("vfneg.v v7, v7");
+                        case 3: __asm__("vfneg.v v5, v5");
+                        case 2: __asm__("vfneg.v v3, v3");
+                        case 1: __asm__("vfneg.v v1, v1");
+                    }
+                }
+                __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); // rows 0..2 (6 doubles) of every column, byte stride between columns
+                __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); // rows 3..5, three complex elements further in
+            }
+            else {
+                if (conja == BLIS_NO_CONJUGATE) {
+                    switch (cdim) { // presumably (v12+2k, v13+2k) = kappa * (v2k, v2k+1); macro comes from riscv_cmul_macros_asm.h - confirm
+                        case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
+                        case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
+                        case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
+                        case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
+                        case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
+                        case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
+                    }
+                }
+                else {
+                    switch (cdim) { // presumably the same product with the loaded element conjugated - confirm against the macro
+                        case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
+                        case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
+                        case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
+                        case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
+                        case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
+                        case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
+                    }
+                }
+                __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); // scaled rows 0..2 of every column
+                __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); // scaled rows 3..5
+            }
+            a += vl; // lda == 1, so vl columns are vl elements
+            p += vl * ldp;
+            avl -= vl;
+        }
+        __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // vl = 6: one packed column per store
+        __asm__("vmv.v.i v0, 0");
+        __asm__("vmv.v.i v1, 0");
+        for (size_t i = n; i < n_max; ++i) { // zero-fill columns n..n_max-1
+            __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+            p += ldp;  
+        }
+    }
+    else { // general lda: pack one column (up to 6 elements) per iteration
+        inca *= 2 * FLT_SIZE; // row stride converted from complex elements to bytes
+        __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v0, 0"); // v0/v1: load targets, stored as-is when kappa == 1
+        __asm__("vmv.v.i v1, 0");
+        __asm__("vmv.v.i v2, 0"); // v2/v3: scaled results stored when kappa != 1
+        __asm__("vmv.v.i v3, 0");
+        for (size_t i = 0; i < n; ++i) {
+            __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); // vl = cdim, tail undisturbed keeps the zero padding
+            if (inca == 2 * FLT_SIZE) { // unit row stride: contiguous segment load
+                __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
+            }
+            else { // otherwise a strided segment load with byte stride inca
+                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
+            }
+            if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+                if (conja == BLIS_CONJUGATE) {
+                    __asm__("vfneg.v v1, v1"); // conjugate: negate the second-field register
+                }
+                __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // back to vl = 6 so the padding is stored too
+                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+            }
+            else {
+                if (conja == BLIS_NO_CONJUGATE) {
+                    vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); // presumably (v2, v3) = kappa * (v0, v1) - confirm against the macro
+                }
+                else {
+                    vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); // presumably the conjugating variant - confirm
+                }
+                __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // back to vl = 6 so the padding is stored too
+                __asm__(VSSEG2 "v2, (%0)" : : "r"(p));
+            }
+            a += lda;
+            p += ldp;
+        }
+        __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v0, 0");
+        __asm__("vmv.v.i v1, 0");
+        for (size_t i = n; i < n_max; ++i) { // zero-fill columns n..n_max-1
+            __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+            p += ldp;  
+        }
+    }
+    return;
+}
diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c
new file mode 100644
index 000000000..89e05ecae
--- /dev/null
+++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c
@@ -0,0 +1,838 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "../riscv_cmul_macros_asm.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4 // byte size of one float; 8 * FLT_SIZE is the SEW given to vsetvli
+#define FLT_LOAD "flw " // not referenced by the kernel in this section
+#define VLE "vle32.v " // unit-stride vector load
+#define VLSE "vlse32.v " // strided vector load
+#define VSE "vse32.v " // unit-stride vector store
+#define VSSE "vsse32.v " // strided vector store
+#define VSSSEG8 "vssseg8e32.v " // strided segment stores, 8 down to 2 fields per segment
+#define VSSSEG7 "vssseg7e32.v "
+#define VSSSEG6 "vssseg6e32.v "
+#define VSSSEG5 "vssseg5e32.v "
+#define VSSSEG4 "vssseg4e32.v "
+#define VSSSEG3 "vssseg3e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+#define NR 64 // elements stored per packed column of p, zero padding included
+
+void bli_spackm_sifive_x280_asm_64xk // copy a cdim x n block of floats from a into p, scaled by kappa, zero-padded
+     (
+             conj_t           conja, // ignored: cast to void below
+             pack_t           schema, // not referenced in the body
+             dim_t            cdim, // elements read from a per column; entries cdim..NR-1 of each packed column are zeroed
+             dim_t            n, // number of columns copied from a
+             dim_t            n_max, // columns written to p; columns n..n_max-1 are all zero
+       const void*   restrict kappa_, // points to the float scale factor
+       const void*   restrict a_, inc_t inca, inc_t lda, // source; inca steps along cdim, lda steps along n (in elements)
+             void*   restrict p_,             inc_t ldp, // destination; ldp is the distance between packed columns (in elements)
+       const cntx_t*          cntx // ignored: cast to void below
+     )
+{
+    (void) conja;
+    (void) cntx;
+    const float* kappa = kappa_;
+    const float* a = a_;
+    float* p = p_;
+
+    float kappa_cast = *kappa;
+    if (lda == 1) { // consecutive columns are adjacent in memory: load along n and transpose on store
+        __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v8, 0"); // v8 (LMUL=4 group) holds the zeros used for all padding in this branch
+        size_t avl = n; // columns still to be packed
+        while (avl) {
+            size_t vl; // columns handled in this pass, as granted by vsetvli
+            __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+            dim_t cdim_tmp = cdim; // rows still to be packed for these vl columns
+            const float* a_tmp = a;
+            float* p_tmp = p;
+            while (cdim_tmp >= 8) { // eight rows per pass: vk receives vl consecutive elements of row k
+                __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLE "v7, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                if (kappa_cast != 1.f) { // the multiply is skipped when kappa is exactly one
+                    __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+                    __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+                    __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+                    __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+                    __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+                    __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+                    __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+                    __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast));
+                }
+                __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); // element j of v0..v7 goes to p_tmp[j*ldp + 0..7]; stride is in bytes
+                p_tmp += 8; // next eight rows inside each packed column
+                cdim_tmp -= 8;
+            }
+            if (cdim_tmp > 0) { // 1..7 leftover rows
+                a_tmp += (cdim_tmp - 1) * inca; // start at the last leftover row; the switch walks back to the first
+                switch (cdim_tmp) { // cases fall through on purpose so that row k ends up in vk
+                    case 7:
+                        __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca; // fall through
+                    case 6:
+                        __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca; // fall through
+                    case 5:
+                        __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca; // fall through
+                    case 4:
+                        __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca; // fall through
+                    case 3:
+                        __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca; // fall through
+                    case 2:
+                        __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca; // fall through
+                    case 1:
+                        __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+                }
+                if (kappa_cast != 1.f) {
+                    switch (cdim_tmp) { // intentional fall-through: scale exactly the registers loaded above
+                        case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+                        case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+                        case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+                        case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+                        case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+                        case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+                        case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+                    }
+                }
+                switch (cdim_tmp) { // one store whose field count equals the number of leftover rows
+                    case 7:
+                        __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                        break;
+                    case 6:
+                        __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                        break;
+                    case 5:
+                        __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                        break;
+                    case 4:
+                        __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                        break;
+                    case 3:
+                        __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                        break;
+                    case 2:
+                        __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                        break;
+                    case 1:
+                        __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); // a single row needs only a plain strided store
+                        break;
+                }
+                p_tmp += cdim_tmp;
+            }
+            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); // padding length; assumes cdim <= NR - TODO confirm with callers
+            for (size_t i = 0; i < vl; ++i) { // zero entries cdim..NR-1 of each column packed in this pass
+                __asm__(VSE "v8, (%0)" : : "r"(p_tmp));
+                p_tmp += ldp;
+            }
+            a += vl; // lda == 1, so the next batch of columns starts vl elements further on
+            p += vl * ldp;
+            avl -= vl;
+        }
+        __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); // back to full-width stores
+        for (size_t i = n; i < n_max; ++i) { // columns n..n_max-1 are entirely zero
+            __asm__(VSE "v8, (%0)" : : "r"(p));
+            p += ldp;
+        }
+    }
+    else { // general strides: handle one column at a time
+        inca *= FLT_SIZE; // element stride -> byte stride, as used by the strided load below
+        __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v0, 0"); // zero all NR lanes once; lanes >= cdim are never overwritten afterwards
+        for (size_t i = 0; i < n; ++i) {
+            __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); // tail-undisturbed keeps the zeros beyond cdim
+            if (inca == FLT_SIZE) { // unit stride: a plain load is enough
+                __asm__(VLE "v0, (%0)" : : "r"(a));
+            }
+            else {
+                __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
+            }
+            if (kappa_cast != 1.f) {
+                __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+            }
+            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); // widen to NR so the zero tail is stored too
+            __asm__(VSE "v0, (%0)" : : "r"(p));
+            a += lda;
+            p += ldp;
+        }
+        __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v0, 0"); // clear the data lanes before writing the zero columns
+        for (size_t i = n; i < n_max; ++i) { // columns n..n_max-1 are entirely zero
+            __asm__(VSE "v0, (%0)" : : "r"(p));
+            p += ldp;  
+        }
+    }
+    return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef VSSSEG8
+#undef VSSSEG7
+#undef VSSSEG6
+#undef VSSSEG5
+#undef VSSSEG4
+#undef VSSSEG3
+#undef VSSSEG2
+#undef NR
+
+#define FLT_SIZE 8 // byte size of one double; 8 * FLT_SIZE is the SEW given to vsetvli
+#define FLT_LOAD "fld " // not referenced by the kernel in this section
+#define VLE "vle64.v " // unit-stride vector load
+#define VLSE "vlse64.v " // strided vector load
+#define VSE "vse64.v " // unit-stride vector store
+#define VSSE "vsse64.v " // strided vector store
+#define VSSSEG8 "vssseg8e64.v " // strided segment stores, 8 down to 2 fields per segment
+#define VSSSEG7 "vssseg7e64.v "
+#define VSSSEG6 "vssseg6e64.v "
+#define VSSSEG5 "vssseg5e64.v "
+#define VSSSEG4 "vssseg4e64.v "
+#define VSSSEG3 "vssseg3e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+#define NR 32 // elements stored per packed column of p, zero padding included
+
+void bli_dpackm_sifive_x280_asm_32xk // copy a cdim x n block of doubles from a into p, scaled by kappa, zero-padded
+     (
+             conj_t           conja, // ignored: cast to void below
+             pack_t           schema, // not referenced in the body
+             dim_t            cdim, // elements read from a per column; entries cdim..NR-1 of each packed column are zeroed
+             dim_t            n, // number of columns copied from a
+             dim_t            n_max, // columns written to p; columns n..n_max-1 are all zero
+       const void*   restrict kappa_, // points to the double scale factor
+       const void*   restrict a_, inc_t inca, inc_t lda, // source; inca steps along cdim, lda steps along n (in elements)
+             void*   restrict p_,             inc_t ldp, // destination; ldp is the distance between packed columns (in elements)
+       const cntx_t*          cntx // ignored: cast to void below
+     )
+{
+    (void) conja;
+    (void) cntx;
+    const double* kappa = kappa_;
+    const double* a = a_;
+    double* p = p_;
+
+    double kappa_cast = *kappa;
+    if (lda == 1) { // consecutive columns are adjacent in memory: load along n and transpose on store
+        __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v8, 0"); // v8 (LMUL=4 group) holds the zeros used for all padding in this branch
+        size_t avl = n; // columns still to be packed
+        while (avl) {
+            size_t vl; // columns handled in this pass, as granted by vsetvli
+            __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+            dim_t cdim_tmp = cdim; // rows still to be packed for these vl columns
+            const double* a_tmp = a;
+            double* p_tmp = p;
+            while (cdim_tmp >= 8) { // eight rows per pass: vk receives vl consecutive elements of row k
+                __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLE "v7, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                if (kappa_cast != 1.) { // the multiply is skipped when kappa is exactly one
+                    __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+                    __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+                    __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+                    __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+                    __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+                    __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+                    __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+                    __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast));
+                }
+                __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); // element j of v0..v7 goes to p_tmp[j*ldp + 0..7]; stride is in bytes
+                p_tmp += 8; // next eight rows inside each packed column
+                cdim_tmp -= 8;
+            }
+            if (cdim_tmp > 0) { // 1..7 leftover rows
+                a_tmp += (cdim_tmp - 1) * inca; // start at the last leftover row; the switch walks back to the first
+                switch (cdim_tmp) { // cases fall through on purpose so that row k ends up in vk
+                    case 7:
+                        __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca; // fall through
+                    case 6:
+                        __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca; // fall through
+                    case 5:
+                        __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca; // fall through
+                    case 4:
+                        __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca; // fall through
+                    case 3:
+                        __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca; // fall through
+                    case 2:
+                        __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca; // fall through
+                    case 1:
+                        __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+                }
+                if (kappa_cast != 1.) {
+                    switch (cdim_tmp) { // intentional fall-through: scale exactly the registers loaded above
+                        case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+                        case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+                        case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+                        case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+                        case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+                        case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+                        case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+                    }
+                }
+                switch (cdim_tmp) { // one store whose field count equals the number of leftover rows
+                    case 7:
+                        __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                        break;
+                    case 6:
+                        __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                        break;
+                    case 5:
+                        __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                        break;
+                    case 4:
+                        __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                        break;
+                    case 3:
+                        __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                        break;
+                    case 2:
+                        __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                        break;
+                    case 1:
+                        __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); // a single row needs only a plain strided store
+                        break;
+                }
+                p_tmp += cdim_tmp;
+            }
+            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); // padding length; assumes cdim <= NR - TODO confirm with callers
+            for (size_t i = 0; i < vl; ++i) { // zero entries cdim..NR-1 of each column packed in this pass
+                __asm__(VSE "v8, (%0)" : : "r"(p_tmp));
+                p_tmp += ldp;
+            }
+            a += vl; // lda == 1, so the next batch of columns starts vl elements further on
+            p += vl * ldp;
+            avl -= vl;
+        }
+        __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); // back to full-width stores
+        for (size_t i = n; i < n_max; ++i) { // columns n..n_max-1 are entirely zero
+            __asm__(VSE "v8, (%0)" : : "r"(p));
+            p += ldp;
+        }
+    }
+    else { // general strides: handle one column at a time
+        inca *= FLT_SIZE; // element stride -> byte stride, as used by the strided load below
+        __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v0, 0"); // zero all NR lanes once; lanes >= cdim are never overwritten afterwards
+        for (size_t i = 0; i < n; ++i) {
+            __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); // tail-undisturbed keeps the zeros beyond cdim
+            if (inca == FLT_SIZE) { // unit stride: a plain load is enough
+                __asm__(VLE "v0, (%0)" : : "r"(a));
+            }
+            else {
+                __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
+            }
+            if (kappa_cast != 1.) {
+                __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+            }
+            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); // widen to NR so the zero tail is stored too
+            __asm__(VSE "v0, (%0)" : : "r"(p));
+            a += lda;
+            p += ldp;
+        }
+        __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v0, 0"); // clear the data lanes before writing the zero columns
+        for (size_t i = n; i < n_max; ++i) { // columns n..n_max-1 are entirely zero
+            __asm__(VSE "v0, (%0)" : : "r"(p));
+            p += ldp;  
+        }
+    }
+    return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef VSSSEG8
+#undef VSSSEG7
+#undef VSSSEG6
+#undef VSSSEG5
+#undef VSSSEG4
+#undef VSSSEG3
+#undef VSSSEG2
+#undef NR
+
+#define FLT_SIZE 4 // byte size of one real component; a complex element occupies 2 * FLT_SIZE bytes
+#define VLSEG2 "vlseg2e32.v " // unit-stride 2-field segment load (splits real and imaginary parts)
+#define VLSSEG2 "vlsseg2e32.v " // strided 2-field segment load
+#define VSSEG2 "vsseg2e32.v " // unit-stride 2-field segment store (re-interleaves real and imaginary parts)
+#define VSSSEG2 "vssseg2e32.v " // strided segment stores with 2, 4, 6 or 8 fields (1 to 4 complex values)
+#define VSSSEG4 "vssseg4e32.v "
+#define VSSSEG6 "vssseg6e32.v "
+#define VSSSEG8 "vssseg8e32.v "
+#define NR 32 // complex elements stored per packed column of p, zero padding included
+
+void bli_cpackm_sifive_x280_asm_32xk // copy a cdim x n block of scomplex from a into p, optionally conjugated, scaled by kappa, zero-padded
+     (
+             conj_t           conja, // compared against BLIS_CONJUGATE / BLIS_NO_CONJUGATE below
+             pack_t           schema, // not referenced in the body
+             dim_t            cdim, // elements read from a per column; entries cdim..NR-1 of each packed column are zeroed
+             dim_t            n, // number of columns copied from a
+             dim_t            n_max, // columns written to p; columns n..n_max-1 are all zero
+       const void*   restrict kappa_, // points to the scomplex scale factor
+       const void*   restrict a_, inc_t inca, inc_t lda, // source; inca steps along cdim, lda steps along n (in complex elements)
+             void*   restrict p_,             inc_t ldp, // destination; ldp is the distance between packed columns (in complex elements)
+       const cntx_t*          cntx // ignored: cast to void below
+     )
+{
+    (void) cntx;
+    const scomplex* kappa = kappa_;
+    const scomplex* a = a_;
+    scomplex* p = p_;
+
+    scomplex kappa_cast = *kappa;
+    if (lda == 1) { // consecutive columns are adjacent in memory: load along n and transpose on store
+        __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v16, 0"); // v16 and v18 (LMUL=2 groups) are the zero real/imag fields used for padding
+        __asm__("vmv.v.i v18, 0");
+        size_t avl = n; // columns still to be packed
+        while (avl) {
+            size_t vl; // columns handled in this pass, as granted by vsetvli
+            __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+            dim_t cdim_tmp = cdim; // rows still to be packed for these vl columns
+            const scomplex* a_tmp = a;
+            scomplex* p_tmp = p;
+            while (cdim_tmp >= 4) { // four rows per pass: row k has its real parts in v(2k) and imaginary parts in v(2k+1)
+                __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { // kappa is exactly one: no multiply needed
+                    if (conja == BLIS_CONJUGATE) { // conjugation only flips the sign of the imaginary registers
+                        __asm__("vfneg.v v1, v1");
+                        __asm__("vfneg.v v3, v3");
+                        __asm__("vfneg.v v5, v5");
+                        __asm__("vfneg.v v7, v7");
+                    }
+                    __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); // element j of v0..v7 becomes 4 complex values at p_tmp + j*ldp
+                }
+                else {
+                    if (conja == BLIS_NO_CONJUGATE) { // vcmul_vf2 comes from riscv_cmul_macros_asm.h; presumably (dst_re, dst_im) = (src_re, src_im) * kappa - verify
+                        vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+                        vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+                        vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+                        vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
+                    }
+                    else { // presumably the same product with the source conjugated first - verify against the macro header
+                        vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+                        vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+                        vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+                        vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
+                    }
+                    __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); // scaled values are stored from v8..v15
+                }
+                p_tmp += 4; // next four rows inside each packed column
+                cdim_tmp -= 4;
+            }
+            if (cdim_tmp > 0) { // 1..3 leftover rows
+                a_tmp += (cdim_tmp - 1) * inca; // start at the last leftover row; the switch walks back to the first
+                switch (cdim_tmp) { // cases fall through on purpose so that row k ends up in v(2k), v(2k+1)
+                    case 3:
+                        __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca; // fall through
+                    case 2:
+                        __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca; // fall through
+                    case 1:
+                        __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+                }
+                if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+                    if (conja == BLIS_CONJUGATE) {
+                        switch (cdim_tmp) { // intentional fall-through: negate the imaginary part of every loaded row
+                            case 3: __asm__("vfneg.v v5, v5");
+                            case 2: __asm__("vfneg.v v3, v3");
+                            case 1: __asm__("vfneg.v v1, v1");
+                        }
+                    }
+                    switch (cdim_tmp) { // two fields per leftover row, stored from v0 upward
+                        case 3:
+                            __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                            break;
+                        case 2:
+                            __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                            break;
+                        case 1:
+                            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                            break;
+                    }
+                }
+                else {
+                    if (conja == BLIS_NO_CONJUGATE) {
+                        switch (cdim_tmp) { // intentional fall-through: scale every loaded row into v8..
+                            case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+                            case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+                            case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+                        }
+                    }
+                    else {
+                        switch (cdim_tmp) { // intentional fall-through, conjugating variant
+                            case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+                            case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+                            case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+                        }
+                    }
+                    switch (cdim_tmp) { // two fields per leftover row, stored from v8 upward
+                        case 3:
+                            __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                            break;
+                        case 2:
+                            __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                            break;
+                        case 1:
+                            __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                            break;
+                    }
+                }
+                p_tmp += cdim_tmp;
+            }
+            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); // padding length; assumes cdim <= NR - TODO confirm with callers
+            for (size_t i = 0; i < vl; ++i) { // zero entries cdim..NR-1 of each column packed in this pass
+                __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp));
+                p_tmp += ldp;
+            }
+            a += vl; // lda == 1, so the next batch of columns starts vl elements further on
+            p += vl * ldp;
+            avl -= vl;
+        }
+        __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); // back to full-width stores
+        for (size_t i = n; i < n_max; ++i) { // columns n..n_max-1 are entirely zero
+            __asm__(VSSEG2 "v16, (%0)" : : "r"(p));
+            p += ldp;
+        }
+    }
+    else { // general strides: handle one column at a time
+        inca *= 2 * FLT_SIZE; // complex-element stride -> byte stride, as used by the strided load below
+        __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v0, 0"); // zero source (v0, v2) and result (v4, v6) groups; lanes >= cdim are never overwritten afterwards
+        __asm__("vmv.v.i v2, 0");
+        __asm__("vmv.v.i v4, 0");
+        __asm__("vmv.v.i v6, 0");
+        for (size_t i = 0; i < n; ++i) {
+            __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); // tail-undisturbed keeps the zeros beyond cdim
+            if (inca == 2 * FLT_SIZE) { // unit stride: a plain segment load is enough
+                __asm__(VLSEG2 "v0, (%0)" : : "r"(a)); // real parts land in the v0 group, imaginary parts in the v2 group
+            }
+            else {
+                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
+            }
+            if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { // kappa is exactly one: no multiply needed
+                if (conja == BLIS_CONJUGATE) {
+                    __asm__("vfneg.v v2, v2"); // negate the imaginary parts
+                }
+                __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); // widen to NR so the zero tail is stored too
+                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+            }
+            else {
+                if (conja == BLIS_NO_CONJUGATE) { // presumably (v4, v6) = (v0, v2) * kappa - verify against riscv_cmul_macros_asm.h
+                    vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
+                }
+                else { // presumably the conjugating variant of the same product - verify
+                    vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
+                }
+                __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); // widen to NR so the zero tail is stored too
+                __asm__(VSSEG2 "v4, (%0)" : : "r"(p));
+            }
+            a += lda;
+            p += ldp;
+        }
+        __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v0, 0"); // clear the data lanes before writing the zero columns
+        __asm__("vmv.v.i v2, 0");
+        for (size_t i = n; i < n_max; ++i) { // columns n..n_max-1 are entirely zero
+            __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+            p += ldp;  
+        }
+    }
+    return;
+}
+
+#undef FLT_SIZE
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef VSSSEG4
+#undef VSSSEG6
+#undef VSSSEG8
+#undef NR
+
+#define FLT_SIZE 8 // byte size of one real component; a complex element occupies 2 * FLT_SIZE bytes
+#define VLSEG2 "vlseg2e64.v " // unit-stride 2-field segment load (splits real and imaginary parts)
+#define VLSSEG2 "vlsseg2e64.v " // strided 2-field segment load
+#define VSSEG2 "vsseg2e64.v " // unit-stride 2-field segment store (re-interleaves real and imaginary parts)
+#define VSSSEG2 "vssseg2e64.v " // strided segment stores with 2, 4, 6 or 8 fields (1 to 4 complex values)
+#define VSSSEG4 "vssseg4e64.v "
+#define VSSSEG6 "vssseg6e64.v "
+#define VSSSEG8 "vssseg8e64.v "
+#define NR 16 // complex elements stored per packed column of p, zero padding included
+
+void bli_zpackm_sifive_x280_asm_16xk // copy a cdim x n block of dcomplex from a into p, optionally conjugated, scaled by kappa, zero-padded
+     (
+             conj_t           conja, // compared against BLIS_CONJUGATE / BLIS_NO_CONJUGATE below
+             pack_t           schema, // not referenced in the body
+             dim_t            cdim, // elements read from a per column; entries cdim..NR-1 of each packed column are zeroed
+             dim_t            n, // number of columns copied from a
+             dim_t            n_max, // columns written to p; columns n..n_max-1 are all zero
+       const void*   restrict kappa_, // points to the dcomplex scale factor
+       const void*   restrict a_, inc_t inca, inc_t lda, // source; inca steps along cdim, lda steps along n (in complex elements)
+             void*   restrict p_,             inc_t ldp, // destination; ldp is the distance between packed columns (in complex elements)
+       const cntx_t*          cntx // ignored: cast to void below
+     )
+{
+    (void) cntx;
+    const dcomplex* kappa = kappa_;
+    const dcomplex* a = a_;
+    dcomplex* p = p_;
+
+    dcomplex kappa_cast = *kappa;
+    if (lda == 1) { // consecutive columns are adjacent in memory: load along n and transpose on store
+        __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v16, 0"); // v16 and v18 (LMUL=2 groups) are the zero real/imag fields used for padding
+        __asm__("vmv.v.i v18, 0");
+        size_t avl = n; // columns still to be packed
+        while (avl) {
+            size_t vl; // columns handled in this pass, as granted by vsetvli
+            __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+            dim_t cdim_tmp = cdim; // rows still to be packed for these vl columns
+            const dcomplex* a_tmp = a;
+            dcomplex* p_tmp = p;
+            while (cdim_tmp >= 4) { // four rows per pass: row k has its real parts in v(2k) and imaginary parts in v(2k+1)
+                __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
+                a_tmp += inca;
+                if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { // kappa is exactly one: no multiply needed
+                    if (conja == BLIS_CONJUGATE) { // conjugation only flips the sign of the imaginary registers
+                        __asm__("vfneg.v v1, v1");
+                        __asm__("vfneg.v v3, v3");
+                        __asm__("vfneg.v v5, v5");
+                        __asm__("vfneg.v v7, v7");
+                    }
+                    __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); // element j of v0..v7 becomes 4 complex values at p_tmp + j*ldp
+                }
+                else {
+                    if (conja == BLIS_NO_CONJUGATE) { // vcmul_vf2 comes from riscv_cmul_macros_asm.h; presumably (dst_re, dst_im) = (src_re, src_im) * kappa - verify
+                        vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+                        vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+                        vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+                        vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
+                    }
+                    else { // presumably the same product with the source conjugated first - verify against the macro header
+                        vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+                        vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+                        vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+                        vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
+                    }
+                    __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); // scaled values are stored from v8..v15
+                }
+                p_tmp += 4; // next four rows inside each packed column
+                cdim_tmp -= 4;
+            }
+            if (cdim_tmp > 0) { // 1..3 leftover rows
+                a_tmp += (cdim_tmp - 1) * inca; // start at the last leftover row; the switch walks back to the first
+                switch (cdim_tmp) { // cases fall through on purpose so that row k ends up in v(2k), v(2k+1)
+                    case 3:
+                        __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca; // fall through
+                    case 2:
+                        __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca; // fall through
+                    case 1:
+                        __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+                }
+                if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+                    if (conja == BLIS_CONJUGATE) {
+                        switch (cdim_tmp) { // intentional fall-through: negate the imaginary part of every loaded row
+                            case 3: __asm__("vfneg.v v5, v5");
+                            case 2: __asm__("vfneg.v v3, v3");
+                            case 1: __asm__("vfneg.v v1, v1");
+                        }
+                    }
+                    switch (cdim_tmp) { // two fields per leftover row, stored from v0 upward
+                        case 3:
+                            __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                            break;
+                        case 2:
+                            __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                            break;
+                        case 1:
+                            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                            break;
+                    }
+                }
+                else {
+                    if (conja == BLIS_NO_CONJUGATE) {
+                        switch (cdim_tmp) { // intentional fall-through: scale every loaded row into v8..
+                            case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+                            case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+                            case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+                        }
+                    }
+                    else {
+                        switch (cdim_tmp) { // intentional fall-through, conjugating variant
+                            case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+                            case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+                            case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+                        }
+                    }
+                    switch (cdim_tmp) { // two fields per leftover row, stored from v8 upward
+                        case 3:
+                            __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                            break;
+                        case 2:
+                            __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                            break;
+                        case 1:
+                            __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                            break;
+                    }
+                }
+                p_tmp += cdim_tmp;
+            }
+            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); // padding length; assumes cdim <= NR - TODO confirm with callers
+            for (size_t i = 0; i < vl; ++i) { // zero entries cdim..NR-1 of each column packed in this pass
+                __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp));
+                p_tmp += ldp;
+            }
+            a += vl; // lda == 1, so the next batch of columns starts vl elements further on
+            p += vl * ldp;
+            avl -= vl;
+        }
+        __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); // back to full-width stores
+        for (size_t i = n; i < n_max; ++i) { // columns n..n_max-1 are entirely zero
+            __asm__(VSSEG2 "v16, (%0)" : : "r"(p));
+            p += ldp;
+        }
+    }
+    else { // general strides: handle one column at a time
+        inca *= 2 * FLT_SIZE; // complex-element stride -> byte stride, as used by the strided load below
+        __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v0, 0"); // zero source (v0, v2) and result (v4, v6) groups; lanes >= cdim are never overwritten afterwards
+        __asm__("vmv.v.i v2, 0");
+        __asm__("vmv.v.i v4, 0");
+        __asm__("vmv.v.i v6, 0");
+        for (size_t i = 0; i < n; ++i) {
+            __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); // tail-undisturbed keeps the zeros beyond cdim
+            if (inca == 2 * FLT_SIZE) { // unit stride: a plain segment load is enough
+                __asm__(VLSEG2 "v0, (%0)" : : "r"(a)); // real parts land in the v0 group, imaginary parts in the v2 group
+            }
+            else {
+                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
+            }
+            if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { // kappa is exactly one: no multiply needed
+                if (conja == BLIS_CONJUGATE) {
+                    __asm__("vfneg.v v2, v2"); // negate the imaginary parts
+                }
+                __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); // widen to NR so the zero tail is stored too
+                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+            }
+            else {
+                if (conja == BLIS_NO_CONJUGATE) { // presumably (v4, v6) = (v0, v2) * kappa - verify against riscv_cmul_macros_asm.h
+                    vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
+                }
+                else { // presumably the conjugating variant of the same product - verify
+                    vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
+                }
+                __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); // widen to NR so the zero tail is stored too
+                __asm__(VSSEG2 "v4, (%0)" : : "r"(p));
+            }
+            a += lda;
+            p += ldp;
+        }
+        __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+        __asm__("vmv.v.i v0, 0"); // clear the data lanes before writing the zero columns
+        __asm__("vmv.v.i v2, 0");
+        for (size_t i = n; i < n_max; ++i) { // columns n..n_max-1 are entirely zero
+            __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+            p += ldp;  
+        }
+    }
+    return;
+}
diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c
new file mode 100644
index 000000000..b9715988d
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c
@@ -0,0 +1,2405 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#include "blis.h"
+#include "../riscv_cmul_macros_asm.h"
+#include <math.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <riscv_vector.h>
+
+// byte-size of the floating point type
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "   // scalar FP load mnemonic (used for elements of a, alpha and beta)
+#define VLE "vle32.v "    // unit-stride vector load of 32-bit elements
+#define VLSE "vlse32.v "  // strided vector load of 32-bit elements
+#define VSE "vse32.v "    // unit-stride vector store of 32-bit elements
+#define VSSE "vsse32.v "  // strided vector store of 32-bit elements
+#define PACKMR 8          // elements by which the kernels advance a per k iteration
+#define PACKNR 64         // elements by which the kernels advance b per k iteration
+
+void bli_sgemm_7m4
+     (
+             dim_t           N,     // number of elements per row of the tile (requested vector length)
+             dim_t           K,     // number of rank-1 updates (trip count of the k loop)
+       const float* restrict alpha, // scalar applied to a*b
+       const float* restrict a,     // 7 floats are read per k; advanced by PACKMR floats per k
+       const float* restrict b,     // vl floats are read per k; advanced by PACKNR floats per k
+       const float* restrict beta,  // scalar applied to c; c is not read when *beta == 0
+             float* restrict c, inc_t rsc, inc_t csc // output tile with row/column strides in elements
+     )
+{
+    // c := alpha*a*b + beta*c for a 7 x N tile: 7 x N x K sgemm, 0 < N <= 64 = vlmax, K > 0
+    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); // request vl = N, 32-bit elements, LMUL = 4
+    bool first = true; // the first k iteration initializes the accumulators (vfmul) instead of accumulating (vfmacc)
+    // compute a*b: v0, v4, ..., v24 accumulate rows 0..6 of the tile, v28 holds the current vector of b
+    for (dim_t k = 0; k < K; ++k) {
+        __asm__(VLE "v28, (%0)" : : "r"(b));
+        if (first) {
+            __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); // ft0 = a[0]
+            __asm__("vfmul.vf v0, v28, ft0");                              // row 0 = a[0] * b
+
+            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+            __asm__("vfmul.vf v4, v28, ft1");
+
+            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+            __asm__("vfmul.vf v8, v28, ft2");
+
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+            __asm__("vfmul.vf v12, v28, ft3");
+
+            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+            __asm__("vfmul.vf v16, v28, ft4");
+
+            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+            __asm__("vfmul.vf v20, v28, ft5");
+
+            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+            __asm__("vfmul.vf v24, v28, ft6");
+
+            first = false;
+        }
+        else {
+            __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); // ft0 = a[0]
+            __asm__("vfmacc.vf v0, ft0, v28");                             // row 0 += a[0] * b
+
+            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+            __asm__("vfmacc.vf v4, ft1, v28");
+
+            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+            __asm__("vfmacc.vf v8, ft2, v28");
+
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+            __asm__("vfmacc.vf v12, ft3, v28");
+
+            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+            __asm__("vfmacc.vf v16, ft4, v28");
+
+            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+            __asm__("vfmacc.vf v20, ft5, v28");
+
+            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+            __asm__("vfmacc.vf v24, ft6, v28");
+        }
+
+        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); // step a by PACKMR floats
+        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); // step b by PACKNR floats
+    }
+    // NOTE(review): no asm statement here lists clobbers, yet ft0-ft6, ft10, ft11 and v0-v31 carry state between statements -- confirm this is safe for the toolchain in use
+    rsc *= FLT_SIZE; // from here on rsc and csc are byte strides, as needed by the pointer arithmetic
+    csc *= FLT_SIZE; // and by the strided vector loads/stores below
+    
+    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); // ft10 = alpha
+    
+    // compute alpha*a*b + beta*c
+    if (*beta == 0.f) { // c is never loaded on this path; the accumulators are only scaled by alpha
+        __asm__("vfmul.vf v0, v0, ft10");
+        __asm__("vfmul.vf v4, v4, ft10");
+        __asm__("vfmul.vf v8, v8, ft10");
+        __asm__("vfmul.vf v12, v12, ft10");
+        __asm__("vfmul.vf v16, v16, ft10");
+        __asm__("vfmul.vf v20, v20, ft10");
+        __asm__("vfmul.vf v24, v24, ft10");
+    }
+    else { // beta != 0.f
+        __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); // ft11 = beta
+        float *c_tmp = c; // separate cursor so that c still points at row 0 for the stores below
+        if (csc == FLT_SIZE) { // c unit column stride
+            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));             // v28 = current row of c
+            __asm__("vfmul.vf v0, v0, ft10");                    // accumulator *= alpha
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));  // move the cursor to the next row
+            __asm__("vfmacc.vf v0, ft11, v28");                  // accumulator += beta * row of c
+
+            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+            __asm__("vfmul.vf v4, v4, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v4, ft11, v28");
+
+            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+            __asm__("vfmul.vf v8, v8, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v8, ft11, v28");
+
+            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+            __asm__("vfmul.vf v12, v12, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v12, ft11, v28");
+
+            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+            __asm__("vfmul.vf v16, v16, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v16, ft11, v28");
+
+            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+            __asm__("vfmul.vf v20, v20, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v20, ft11, v28");
+
+            __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); // last row: the cursor is not advanced afterwards
+            __asm__("vfmul.vf v24, v24, ft10");
+            __asm__("vfmacc.vf v24, ft11, v28");
+        } // end c unit column stride
+        else { // c non-unit column stride
+            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); // same sequence as above, with strided loads
+            __asm__("vfmul.vf v0, v0, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v0, ft11, v28");
+
+            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("vfmul.vf v4, v4, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v4, ft11, v28");
+
+            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("vfmul.vf v8, v8, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v8, ft11, v28");
+
+            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("vfmul.vf v12, v12, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v12, ft11, v28");
+
+            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("vfmul.vf v16, v16, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v16, ft11, v28");
+
+            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("vfmul.vf v20, v20, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v20, ft11, v28");
+
+            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("vfmul.vf v24, v24, ft10");
+            __asm__("vfmacc.vf v24, ft11, v28");
+        } // end c non-unit column stride
+    } // end beta != 0.f
+
+    // store c: rows 0..6 are written top-down from v0, v4, ..., v24
+    if (csc == FLT_SIZE) {
+        __asm__(VSE "v0, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSE "v4, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSE "v8, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSE "v12, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSE "v16, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSE "v20, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSE "v24, (%0)" : : "r"(c));
+    }
+    else {
+        __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+    }
+
+    return;
+}
+
+void bli_sgemm_7m4_cleanup
+     (
+             dim_t           M,     // number of rows of the tile; selects the entry case of every switch below
+             dim_t           N,     // number of elements per row of the tile (requested vector length)
+             dim_t           K,     // number of rank-1 updates (trip count of the k loop)
+       const float* restrict alpha, // scalar applied to a*b
+       const float* restrict a,     // M floats are read per k; advanced by PACKMR floats per k
+       const float* restrict b,     // vl floats are read per k; advanced by PACKNR floats per k
+       const float* restrict beta,  // scalar applied to c; c is not read when *beta == 0
+             float* restrict c, inc_t rsc, inc_t csc // output tile with row/column strides in elements
+     )
+{
+    // M x N x K sgemm, 0 < M <= 6, 0 < N <= 64 = vlmax, K > 0
+    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); // request vl = N, 32-bit elements, LMUL = 4
+    bool first = true; // the first k iteration initializes the accumulators (vfmul) instead of accumulating (vfmacc)
+    // compute a*b: v0, v4, ..., v20 accumulate rows 0..5, v28 holds the current vector of b
+    for (dim_t k = 0; k < K; ++k) {
+        __asm__(VLE "v28, (%0)" : : "r"(b));
+        if (first) {
+            switch (M) { // cases fall through on purpose: entering at case M handles rows M-1 down to 0
+            case 6:
+                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+                __asm__("vfmul.vf v20, v28, ft5");
+            case 5:
+                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+                __asm__("vfmul.vf v16, v28, ft4");
+            case 4:
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+                __asm__("vfmul.vf v12, v28, ft3");
+            case 3:
+                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+                __asm__("vfmul.vf v8, v28, ft2");
+            case 2:
+                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+                __asm__("vfmul.vf v4, v28, ft1");
+            case 1:
+                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+                __asm__("vfmul.vf v0, v28, ft0");
+            }
+            first = false;
+        }
+        else {
+            switch (M) { // intentional fall-through, as above
+            case 6:
+                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+                __asm__("vfmacc.vf v20, ft5, v28");
+            case 5:
+                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+                __asm__("vfmacc.vf v16, ft4, v28");
+            case 4:
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+                __asm__("vfmacc.vf v12, ft3, v28");
+            case 3:
+                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+                __asm__("vfmacc.vf v8, ft2, v28");
+            case 2:
+                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+                __asm__("vfmacc.vf v4, ft1, v28");
+            case 1:
+                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+                __asm__("vfmacc.vf v0, ft0, v28");
+            }
+        }
+        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); // step a by PACKMR floats
+        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); // step b by PACKNR floats
+    }
+
+    c += (M - 1) * rsc; // point c at the last row (M-1) while rsc is still in elements; rows are then visited bottom-up
+    rsc *= FLT_SIZE;    // from here on rsc and csc are byte strides
+    csc *= FLT_SIZE;
+     
+    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); // ft10 = alpha
+    
+    // compute alpha*a*b + beta*c
+    if (*beta == 0.f) { // c is never loaded on this path; the accumulators are only scaled by alpha
+        switch (M) { // intentional fall-through
+        case 6:
+            __asm__("vfmul.vf v20, v20, ft10");
+        case 5:
+            __asm__("vfmul.vf v16, v16, ft10");
+        case 4:
+            __asm__("vfmul.vf v12, v12, ft10");
+        case 3:
+            __asm__("vfmul.vf v8, v8, ft10");
+        case 2:
+            __asm__("vfmul.vf v4, v4, ft10");
+        case 1:
+            __asm__("vfmul.vf v0, v0, ft10");
+        }
+    }
+    else { // beta != 0.f
+        __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); // ft11 = beta
+        float *c_tmp = c; // separate cursor so that c still points at row M-1 for the stores below
+        if (csc == FLT_SIZE) {
+            switch (M) { // intentional fall-through; the cursor moves up one row per case
+            case 6:
+                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));             // v28 = current row of c
+                __asm__("vfmul.vf v20, v20, ft10");                  // accumulator *= alpha
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));  // move the cursor to the previous row
+                __asm__("vfmacc.vf v20, ft11, v28");                 // accumulator += beta * row of c
+            case 5:
+                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+                __asm__("vfmul.vf v16, v16, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v16, ft11, v28");
+            case 4:
+                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+                __asm__("vfmul.vf v12, v12, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v12, ft11, v28");
+            case 3:
+                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+                __asm__("vfmul.vf v8, v8, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v8, ft11, v28");
+            case 2:
+                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+                __asm__("vfmul.vf v4, v4, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v4, ft11, v28");
+            case 1:
+                __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); // row 0: the cursor is not moved afterwards
+                __asm__("vfmul.vf v0, v0, ft10");
+                __asm__("vfmacc.vf v0, ft11, v28");
+            }
+        } // end c unit column stride
+        else { // c non-unit column stride
+            switch (M) { // same sequence as above, with strided loads
+            case 6:
+                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                __asm__("vfmul.vf v20, v20, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v20, ft11, v28");
+            case 5:
+                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                __asm__("vfmul.vf v16, v16, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v16, ft11, v28");
+            case 4:
+                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                __asm__("vfmul.vf v12, v12, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v12, ft11, v28");
+            case 3:
+                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                __asm__("vfmul.vf v8, v8, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v8, ft11, v28");
+            case 2:
+                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                __asm__("vfmul.vf v4, v4, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v4, ft11, v28");
+            case 1:
+                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                __asm__("vfmul.vf v0, v0, ft10");
+                __asm__("vfmacc.vf v0, ft11, v28");
+            }
+        } // end c non-unit column stride
+    } // end beta != 0.f
+
+    // store c: rows are written bottom-up, starting at row M-1
+    if (csc == FLT_SIZE) {
+        switch (M) { // intentional fall-through
+        case 6:
+            __asm__(VSE "v20, (%0)" : : "r"(c));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 5:
+            __asm__(VSE "v16, (%0)" : : "r"(c));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 4:
+            __asm__(VSE "v12, (%0)" : : "r"(c));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 3:
+            __asm__(VSE "v8, (%0)" : : "r"(c));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 2:
+            __asm__(VSE "v4, (%0)" : : "r"(c));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 1:
+            __asm__(VSE "v0, (%0)" : : "r"(c));
+        }
+    }
+    else {
+        switch (M) { // intentional fall-through
+        case 6:
+            __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 5:
+            __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 4:
+            __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 3:
+            __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 2:
+            __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 1:
+            __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+        }
+    }
+    return;
+}
+
+void bli_sgemm_7m4_k0
+     (
+             dim_t           M,    // number of rows of the tile; selects the entry case of every switch below
+             dim_t           N,    // number of elements per row of the tile (requested vector length)
+       const float* restrict beta, // scalar applied to c; c is overwritten with zeros when *beta == 0
+             float* restrict c, inc_t rsc, inc_t csc // tile updated in place; row/column strides in elements
+     )
+{
+    // c := beta*c for an M x N tile: 0 < M <= 7, 0 < N <= 64 = vlmax, K = 0
+    // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
+    __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); // request vl = N, 32-bit elements, LMUL = 4
+    c += (M - 1) * rsc; // point c at the last row (M-1) while rsc is still in elements; rows are then visited bottom-up
+    rsc *= FLT_SIZE;    // from here on rsc and csc are byte strides
+    csc *= FLT_SIZE;
+    if (*beta == 0.f) {
+        // set c to 0 (c is not read on this path)
+        __asm__("vmv.v.i v0, 0");
+        if (csc == FLT_SIZE) { // c unit column stride
+            switch (M) { // cases fall through on purpose: entering at case M handles rows M-1 down to 0
+            case 7:
+                __asm__(VSE "v0, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 6:
+                __asm__(VSE "v0, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 5:
+                __asm__(VSE "v0, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 4:
+                __asm__(VSE "v0, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 3:
+                __asm__(VSE "v0, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 2:
+                __asm__(VSE "v0, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 1:
+                __asm__(VSE "v0, (%0)" : : "r"(c));
+            }
+        } // end c unit column stride
+        else { // c non-unit column stride
+            switch (M) { // intentional fall-through
+            case 7:
+                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 6:
+                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 5:
+                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 4:
+                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 3:
+                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 2:
+                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 1:
+                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+            }        
+        } // end c non-unit column stride
+    } // end beta == 0.f
+    else { // beta != 0.f
+        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(beta)); // ft0 = beta
+        if (csc == FLT_SIZE) { // c unit column stride
+            switch (M) { // intentional fall-through; each row is loaded, scaled by beta and stored back
+            case 7:
+                __asm__(VLE "v24, (%0)" : : "r"(c));
+                __asm__("vfmul.vf v24, v24, ft0");
+                __asm__(VSE "v24, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 6:
+                __asm__(VLE "v20, (%0)" : : "r"(c));
+                __asm__("vfmul.vf v20, v20, ft0");
+                __asm__(VSE "v20, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 5:
+                __asm__(VLE "v16, (%0)" : : "r"(c));
+                __asm__("vfmul.vf v16, v16, ft0");
+                __asm__(VSE "v16, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 4:
+                __asm__(VLE "v12, (%0)" : : "r"(c));
+                __asm__("vfmul.vf v12, v12, ft0");
+                __asm__(VSE "v12, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 3:
+                __asm__(VLE "v8, (%0)" : : "r"(c));
+                __asm__("vfmul.vf v8, v8, ft0");
+                __asm__(VSE "v8, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 2:
+                __asm__(VLE "v4, (%0)" : : "r"(c));
+                __asm__("vfmul.vf v4, v4, ft0");
+                __asm__(VSE "v4, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 1:
+                __asm__(VLE "v0, (%0)" : : "r"(c));
+                __asm__("vfmul.vf v0, v0, ft0");
+                __asm__(VSE "v0, (%0)" : : "r"(c));
+                
+            }
+        } // end c unit column stride
+        else { // c non-unit column stride
+            switch (M) { // same sequence as above, with strided loads and stores
+            case 7:
+                __asm__(VLSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("vfmul.vf v24, v24, ft0");
+                __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 6:
+                __asm__(VLSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("vfmul.vf v20, v20, ft0");
+                __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 5:
+                __asm__(VLSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("vfmul.vf v16, v16, ft0");
+                __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 4:
+                __asm__(VLSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("vfmul.vf v12, v12, ft0");
+                __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 3:
+                __asm__(VLSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("vfmul.vf v8, v8, ft0");
+                __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 2:
+                __asm__(VLSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("vfmul.vf v4, v4, ft0");
+                __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 1:
+                __asm__(VLSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("vfmul.vf v0, v0, ft0");
+                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+            }
+        } // end c non-unit column stride
+    } // end beta != 0.f
+    return;
+}
+
+void bli_sgemm_sifive_x280_asm_7m4
+     (
+             dim_t               M,
+             dim_t               N,
+             dim_t               K,
+       const void*      restrict alpha_,
+       const void*      restrict a_,
+       const void*      restrict b_,
+       const void*      restrict beta_,
+             void*      restrict c_, inc_t rsc, inc_t csc,
+             auxinfo_t* restrict data,
+       const cntx_t*    restrict cntx
+     )
+{
+    // Single-precision entry point: give the untyped arguments float types.
+    const float* restrict alpha_f = alpha_;
+    const float* restrict beta_f  = beta_;
+    const float* restrict a_f     = a_;
+    const float* restrict b_f     = b_;
+    float* restrict       c_f     = c_;
+    (void) data; (void) cntx; // neither is consulted here
+
+    // Nothing to do for an empty tile or a negative inner dimension.
+    if (M <= 0 || N <= 0 || K < 0) return;
+
+    if (K == 0) {        // no a*b term: c is only scaled by beta
+        bli_sgemm_7m4_k0(M, N, beta_f, c_f, rsc, csc);
+    } else if (M == 7) { // full 7-row tile
+        bli_sgemm_7m4(N, K, alpha_f, a_f, b_f, beta_f, c_f, rsc, csc);
+    } else {             // every other M goes to the row-count-dispatching kernel
+        bli_sgemm_7m4_cleanup(M, N, K, alpha_f, a_f, b_f, beta_f, c_f, rsc, csc);
+    }
+}
+
+#undef FLT_SIZE // drop the single-precision settings before redefining them for double precision
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef PACKMR
+#undef PACKNR
+
+// byte-size of the floating point type
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "   // scalar FP load mnemonic (used for elements of a, alpha and beta)
+#define VLE "vle64.v "    // unit-stride vector load of 64-bit elements
+#define VLSE "vlse64.v "  // strided vector load of 64-bit elements
+#define VSE "vse64.v "    // unit-stride vector store of 64-bit elements
+#define VSSE "vsse64.v "  // strided vector store of 64-bit elements
+#define PACKMR 8          // elements by which the kernels advance a per k iteration
+#define PACKNR 32 
+
+void bli_dgemm_7m4
+     (
+             dim_t            N,
+             dim_t            K,
+       const double* restrict alpha,
+       const double* restrict a,
+       const double* restrict b,
+       const double* restrict beta,
+             double* restrict c, inc_t rsc, inc_t csc
+     )
+{
+    // 7 x N x K dgemm, 0 < N <= 32 = vlmax, K > 0
+    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+    bool first = true;
+    // compute a*b
+    for (dim_t k = 0; k < K; ++k) {
+        __asm__(VLE "v28, (%0)" : : "r"(b));
+        if (first) {
+            __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+            __asm__("vfmul.vf v0, v28, ft0");
+
+            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+            __asm__("vfmul.vf v4, v28, ft1");
+
+            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+            __asm__("vfmul.vf v8, v28, ft2");
+
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+            __asm__("vfmul.vf v12, v28, ft3");
+
+            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+            __asm__("vfmul.vf v16, v28, ft4");
+
+            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+            __asm__("vfmul.vf v20, v28, ft5");
+
+            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+            __asm__("vfmul.vf v24, v28, ft6");
+
+            first = false;
+        }
+        else {
+            __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+            __asm__("vfmacc.vf v0, ft0, v28");
+
+            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+            __asm__("vfmacc.vf v4, ft1, v28");
+
+            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+            __asm__("vfmacc.vf v8, ft2, v28");
+
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+            __asm__("vfmacc.vf v12, ft3, v28");
+
+            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+            __asm__("vfmacc.vf v16, ft4, v28");
+
+            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+            __asm__("vfmacc.vf v20, ft5, v28");
+
+            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+            __asm__("vfmacc.vf v24, ft6, v28");
+        }
+
+        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE));
+        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE));
+    }
+
+    rsc *= FLT_SIZE;
+    csc *= FLT_SIZE;
+    
+    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+    
+    // compute alpha*a*b + beta*c
+    if (*beta == 0.) {
+        __asm__("vfmul.vf v0, v0, ft10");
+        __asm__("vfmul.vf v4, v4, ft10");
+        __asm__("vfmul.vf v8, v8, ft10");
+        __asm__("vfmul.vf v12, v12, ft10");
+        __asm__("vfmul.vf v16, v16, ft10");
+        __asm__("vfmul.vf v20, v20, ft10");
+        __asm__("vfmul.vf v24, v24, ft10");
+    }
+    else { // beta != 0.
+        __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+        double *c_tmp = c;
+        if (csc == FLT_SIZE) { // c unit column stride
+            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+            __asm__("vfmul.vf v0, v0, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v0, ft11, v28");
+
+            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+            __asm__("vfmul.vf v4, v4, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v4, ft11, v28");
+
+            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+            __asm__("vfmul.vf v8, v8, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v8, ft11, v28");
+
+            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+            __asm__("vfmul.vf v12, v12, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v12, ft11, v28");
+
+            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+            __asm__("vfmul.vf v16, v16, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v16, ft11, v28");
+
+            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+            __asm__("vfmul.vf v20, v20, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v20, ft11, v28");
+
+            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+            __asm__("vfmul.vf v24, v24, ft10");
+            __asm__("vfmacc.vf v24, ft11, v28");
+        } // end c unit column stride
+        else { // c non-unit column stride
+            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("vfmul.vf v0, v0, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v0, ft11, v28");
+
+            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("vfmul.vf v4, v4, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v4, ft11, v28");
+
+            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("vfmul.vf v8, v8, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v8, ft11, v28");
+
+            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("vfmul.vf v12, v12, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v12, ft11, v28");
+
+            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("vfmul.vf v16, v16, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v16, ft11, v28");
+
+            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("vfmul.vf v20, v20, ft10");
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__("vfmacc.vf v20, ft11, v28");
+
+            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("vfmul.vf v24, v24, ft10");
+            __asm__("vfmacc.vf v24, ft11, v28");
+        } // end c non-unit column stride
+    } // end beta != 0.
+
+    // store c
+    if (csc == FLT_SIZE) {
+        __asm__(VSE "v0, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSE "v4, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSE "v8, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSE "v12, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSE "v16, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSE "v20, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSE "v24, (%0)" : : "r"(c));
+    }
+    else {
+        __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+    }
+
+    return;
+}
+
+void bli_dgemm_7m4_cleanup
+     (
+             dim_t            M, // rows of c to compute; the fall-through switches below handle M = 1..6
+             dim_t            N, // columns of c to compute; passed to vsetvli as the requested vector length
+             dim_t            K, // inner dimension; the k loop below runs K times
+       const double* restrict alpha, // scalar applied to the product a*b
+       const double* restrict a, // a panel; advanced by PACKMR elements per k step
+       const double* restrict b, // b panel; advanced by PACKNR elements per k step
+       const double* restrict beta, // scalar applied to c; when it is zero c is never read
+             double* restrict c, inc_t rsc, inc_t csc // output and its row/column strides, given in elements
+     )
+{
+    // M x N x K dgemm edge case (c = alpha*a*b + beta*c), 0 < M < 7, 0 < N <= 64 = vlmax, K > 0. NOTE(review): vlmax depends on VLEN and FLT_SIZE, neither visible here -- confirm the bound on N for doubles.
+    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); // request vl = N with SEW = 8 * FLT_SIZE bits, LMUL = 4, tail/mask agnostic
+    bool first = true; // true only for k == 0: accumulators are initialized with vfmul rather than zeroed and accumulated
+    // compute a*b; row i of the product is kept in the register group starting at v(4*i)
+    for (dim_t k = 0; k < K; ++k) {
+        __asm__(VLE "v28, (%0)" : : "r"(b)); // v28 = current row of b (VLE is defined outside this block; presumably a unit-stride vector load)
+        if (first) {
+            switch (M) { // intentional fall-through: entering at case M handles rows M-1 down to 0
+            case 6:
+                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+                __asm__("vfmul.vf v20, v28, ft5"); // row 5 = a[5] * b
+            case 5:
+                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+                __asm__("vfmul.vf v16, v28, ft4"); // row 4 = a[4] * b
+            case 4:
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+                __asm__("vfmul.vf v12, v28, ft3"); // row 3 = a[3] * b
+            case 3:
+                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+                __asm__("vfmul.vf v8, v28, ft2"); // row 2 = a[2] * b
+            case 2:
+                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+                __asm__("vfmul.vf v4, v28, ft1"); // row 1 = a[1] * b
+            case 1:
+                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+                __asm__("vfmul.vf v0, v28, ft0"); // row 0 = a[0] * b
+            }
+            first = false;
+        }
+        else {
+            switch (M) { // same fall-through layout as above, but accumulating: row i += a[i] * b
+            case 6:
+                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+                __asm__("vfmacc.vf v20, ft5, v28");
+            case 5:
+                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+                __asm__("vfmacc.vf v16, ft4, v28");
+            case 4:
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+                __asm__("vfmacc.vf v12, ft3, v28");
+            case 3:
+                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+                __asm__("vfmacc.vf v8, ft2, v28");
+            case 2:
+                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+                __asm__("vfmacc.vf v4, ft1, v28");
+            case 1:
+                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+                __asm__("vfmacc.vf v0, ft0, v28");
+            }
+        }
+        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); // step a forward by PACKMR elements
+        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); // step b forward by PACKNR elements
+    }
+
+    c += (M - 1) * rsc; // point c at row M-1: the switches below walk the rows last-to-first using "sub"
+    rsc *= FLT_SIZE; // strides are scaled by FLT_SIZE so the asm below can apply them to the raw address
+    csc *= FLT_SIZE;
+    // ft10 holds alpha for the rest of the function
+    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+    // ft11 (loaded below) holds beta when beta != 0
+    // compute alpha*a*b + beta*c
+    if (*beta == 0.) { // c is not read in this branch; the accumulators are only scaled by alpha
+        switch (M) { // intentional fall-through
+        case 6:
+            __asm__("vfmul.vf v20, v20, ft10");
+        case 5:
+            __asm__("vfmul.vf v16, v16, ft10");
+        case 4:
+            __asm__("vfmul.vf v12, v12, ft10");
+        case 3:
+            __asm__("vfmul.vf v8, v8, ft10");
+        case 2:
+            __asm__("vfmul.vf v4, v4, ft10");
+        case 1:
+            __asm__("vfmul.vf v0, v0, ft10");
+        }
+    }
+    else { // beta != 0.
+        __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+        double *c_tmp = c; // separate read cursor so that c still points at row M-1 for the store pass
+        if (csc == FLT_SIZE) {
+            switch (M) { // intentional fall-through; per row: load c row, acc *= alpha, step to previous row, acc += beta * c row
+            case 6:
+                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+                __asm__("vfmul.vf v20, v20, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v20, ft11, v28");
+            case 5:
+                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+                __asm__("vfmul.vf v16, v16, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v16, ft11, v28");
+            case 4:
+                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+                __asm__("vfmul.vf v12, v12, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v12, ft11, v28");
+            case 3:
+                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+                __asm__("vfmul.vf v8, v8, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v8, ft11, v28");
+            case 2:
+                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+                __asm__("vfmul.vf v4, v4, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v4, ft11, v28");
+            case 1:
+                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+                __asm__("vfmul.vf v0, v0, ft10");
+                __asm__("vfmacc.vf v0, ft11, v28");
+            }
+        } // end c unit column stride
+        else { // c non-unit column stride
+            switch (M) { // same sequence as the unit-stride case, with the c row read through VLSE using csc as the stride operand
+            case 6:
+                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                __asm__("vfmul.vf v20, v20, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v20, ft11, v28");
+            case 5:
+                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                __asm__("vfmul.vf v16, v16, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v16, ft11, v28");
+            case 4:
+                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                __asm__("vfmul.vf v12, v12, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v12, ft11, v28");
+            case 3:
+                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                __asm__("vfmul.vf v8, v8, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v8, ft11, v28");
+            case 2:
+                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                __asm__("vfmul.vf v4, v4, ft10");
+                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                __asm__("vfmacc.vf v4, ft11, v28");
+            case 1:
+                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                __asm__("vfmul.vf v0, v0, ft10");
+                __asm__("vfmacc.vf v0, ft11, v28");
+            }
+        } // end c non-unit column stride
+    } // end beta != 0.
+
+    // store c, again from row M-1 down to row 0
+    if (csc == FLT_SIZE) {
+        switch (M) { // intentional fall-through
+        case 6:
+            __asm__(VSE "v20, (%0)" : : "r"(c));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 5:
+            __asm__(VSE "v16, (%0)" : : "r"(c));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 4:
+            __asm__(VSE "v12, (%0)" : : "r"(c));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 3:
+            __asm__(VSE "v8, (%0)" : : "r"(c));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 2:
+            __asm__(VSE "v4, (%0)" : : "r"(c));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 1:
+            __asm__(VSE "v0, (%0)" : : "r"(c));
+        }
+    }
+    else {
+        switch (M) { // intentional fall-through; rows written through VSSE with csc as the stride operand
+        case 6:
+            __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 5:
+            __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 4:
+            __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 3:
+            __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 2:
+            __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+        case 1:
+            __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+        }
+    }
+    return;
+}
+
+void bli_dgemm_7m4_k0
+     (
+             dim_t            M, // rows of c to update; the fall-through switches below handle M = 1..7
+             dim_t            N, // columns of c to update; passed to vsetvli as the requested vector length
+       const double* restrict beta, // scalar applied to c; when it is zero c is overwritten with zeros without being read
+             double* restrict c, inc_t rsc, inc_t csc // matrix updated in place and its row/column strides, given in elements
+     )
+{
+    // K = 0 case of the dgemm kernel: there is no a*b term, so c = beta*c. 0 < M <= 7, 0 < N <= 64 = vlmax. NOTE(review): vlmax depends on VLEN and FLT_SIZE, neither visible here -- confirm the bound on N for doubles.
+    // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
+    __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); // request vl = N with SEW = 8 * FLT_SIZE bits, LMUL = 4
+    c += (M - 1) * rsc; // point c at row M-1: the switches below walk the rows last-to-first using "sub"
+    rsc *= FLT_SIZE; // strides are scaled by FLT_SIZE so the asm below can apply them to the raw address
+    csc *= FLT_SIZE;
+    if (*beta == 0.) {
+        // set c to 0
+        __asm__("vmv.v.i v0, 0"); // v0 group = all-zero bits; the same group is stored to every row
+        if (csc == FLT_SIZE) { // c unit column stride
+            switch (M) { // intentional fall-through: entering at case M writes rows M-1 down to 0
+            case 7:
+                __asm__(VSE "v0, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 6:
+                __asm__(VSE "v0, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 5:
+                __asm__(VSE "v0, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 4:
+                __asm__(VSE "v0, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 3:
+                __asm__(VSE "v0, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 2:
+                __asm__(VSE "v0, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 1:
+                __asm__(VSE "v0, (%0)" : : "r"(c));
+            }
+        } // end c unit column stride
+        else { // c non-unit column stride
+            switch (M) { // intentional fall-through; rows written through VSSE with csc as the stride operand
+            case 7:
+                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 6:
+                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 5:
+                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 4:
+                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 3:
+                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 2:
+                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 1:
+                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+            }        
+        } // end c non-unit column stride
+    } // end beta == 0.
+    else { // beta != 0.
+        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(beta)); // ft0 = *beta
+        if (csc == FLT_SIZE) { // c unit column stride
+            switch (M) { // intentional fall-through; per row: load, multiply by beta, store back to the same address, step to previous row
+            case 7:
+                __asm__(VLE "v24, (%0)" : : "r"(c));
+                __asm__("vfmul.vf v24, v24, ft0");
+                __asm__(VSE "v24, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 6:
+                __asm__(VLE "v20, (%0)" : : "r"(c));
+                __asm__("vfmul.vf v20, v20, ft0");
+                __asm__(VSE "v20, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 5:
+                __asm__(VLE "v16, (%0)" : : "r"(c));
+                __asm__("vfmul.vf v16, v16, ft0");
+                __asm__(VSE "v16, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 4:
+                __asm__(VLE "v12, (%0)" : : "r"(c));
+                __asm__("vfmul.vf v12, v12, ft0");
+                __asm__(VSE "v12, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 3:
+                __asm__(VLE "v8, (%0)" : : "r"(c));
+                __asm__("vfmul.vf v8, v8, ft0");
+                __asm__(VSE "v8, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 2:
+                __asm__(VLE "v4, (%0)" : : "r"(c));
+                __asm__("vfmul.vf v4, v4, ft0");
+                __asm__(VSE "v4, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 1:
+                __asm__(VLE "v0, (%0)" : : "r"(c));
+                __asm__("vfmul.vf v0, v0, ft0");
+                __asm__(VSE "v0, (%0)" : : "r"(c));
+                
+            }
+        } // end c unit column stride
+        else { // c non-unit column stride
+            switch (M) { // same sequence as the unit-stride case, using VLSE/VSSE with csc as the stride operand
+            case 7:
+                __asm__(VLSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("vfmul.vf v24, v24, ft0");
+                __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 6:
+                __asm__(VLSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("vfmul.vf v20, v20, ft0");
+                __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 5:
+                __asm__(VLSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("vfmul.vf v16, v16, ft0");
+                __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 4:
+                __asm__(VLSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("vfmul.vf v12, v12, ft0");
+                __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 3:
+                __asm__(VLSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("vfmul.vf v8, v8, ft0");
+                __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 2:
+                __asm__(VLSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("vfmul.vf v4, v4, ft0");
+                __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 1:
+                __asm__(VLSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("vfmul.vf v0, v0, ft0");
+                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+            }
+        } // end c non-unit column stride
+    } // end beta != 0.
+    return;
+}
+
+void bli_dgemm_sifive_x280_asm_7m4
+     (
+             dim_t               M, // rows of c to compute
+             dim_t               N, // columns of c to compute
+             dim_t               K, // inner dimension; K == 0 is routed to the scale-only helper
+       const void*      restrict alpha_, // read as const double* below
+       const void*      restrict a_, // read as const double* below
+       const void*      restrict b_, // read as const double* below
+       const void*      restrict beta_, // read as const double* below
+             void*      restrict c_, inc_t rsc, inc_t csc, // c is written as double*; strides are forwarded unchanged
+             auxinfo_t* restrict data, // unused
+       const cntx_t*    restrict cntx // unused
+     )
+{
+    (void) data; // explicitly discarded: this kernel does not consult the auxinfo or the context
+    (void) cntx;
+    const double* restrict alpha = alpha_;
+    const double* restrict beta = beta_;
+    const double* restrict a = a_;
+    const double* restrict b = b_;
+    double* restrict c = c_;
+
+    // M x N x K dgemm entry point: dispatches to the K == 0, full (M == 7) or partial-row (any other M) helper
+    if (M <= 0 || N <= 0 || K < 0) // nothing to do for empty or invalid dimensions; c is left untouched
+        return;
+    else if (K == 0) // no a*b term: c is only scaled by beta
+        bli_dgemm_7m4_k0(M, N, beta, c, rsc, csc);
+    else if (M == 7) // full-height tile
+        bli_dgemm_7m4(N, K, alpha, a, b, beta, c, rsc, csc);
+    else // NOTE(review): no upper bound on M or N is checked here; the helpers' switches only handle M <= 7 -- callers presumably guarantee it
+        bli_dgemm_7m4_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc);
+    return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef PACKMR
+#undef PACKNR
+
+// byte-size of underlying floating point type
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+#define PACKMR 8
+#define PACKNR 32 
+
+void bli_cgemm_6m2
+     (
+             dim_t              N, // columns of c to compute; passed to vsetvli as the requested vector length
+             dim_t              K, // inner dimension; decremented to zero while stepping through a and b
+       const scomplex* restrict alpha, // complex scalar applied to a*b
+       const scomplex* restrict a, // a panel; advanced by PACKMR complex elements per k step
+       const scomplex* restrict b, // b panel; advanced by PACKNR complex elements per k step
+       const scomplex* restrict beta, // complex scalar applied to c; when both parts are zero c is never read
+             scomplex* restrict c, inc_t rsc, inc_t csc // output and its row/column strides, given in complex elements
+     )
+{
+    // 6 x N x K cgemm (c = alpha*a*b + beta*c), N <= 32 = vlmax, K > 0
+    // pairs of register groups hold the real and imag. parts of rows of c and b: row i of the product is (v(4*i), v(4*i+2)); rows of b alternate between (v24, v26) and (v28, v30)
+    __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); // request vl = N with 32-bit elements, LMUL = 2
+    __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); // two-field segment load: real parts of b row 0 -> v24 group, imag parts -> v26 group
+    __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+    if (K >= 2) { // load b row 1 early into the second buffer; skipped when there is no second row
+        __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+    }
+    // k = 0: initialize the six accumulator rows. vcmul_vf / vcmacc_vf / vcmacc_vf2 are defined outside this block; from their use here they appear to be complex vector-by-scalar multiply and multiply-accumulate -- confirm against their definitions.
+    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+    vcmul_vf(v0, v2, v24, v26, ft0, ft1);
+
+    __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+    vcmul_vf(v4, v6, v24, v26, ft2, ft3);
+
+    __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+    vcmul_vf(v8, v10, v24, v26, ft4, ft5);
+
+    __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+    vcmul_vf(v12, v14, v24, v26, ft6, ft7);
+
+    __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+    vcmul_vf(v16, v18, v24, v26, ft8, ft9);
+
+    __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+    vcmul_vf(v20, v22, v24, v26, ft10, ft11);
+    K -= 1;
+
+    if (K >= 2) { // (v24, v26) has been consumed; refill it with the b row two steps ahead if one exists
+        __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+    }
+    __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); // step a forward by PACKMR complex elements
+
+    while (K > 0) { // each pass handles up to two k steps: first with the (v28, v30) buffer, then with (v24, v26)
+        __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+        vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
+
+        __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+        vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
+
+        __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+        vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
+
+        __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+        vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
+
+        __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+        vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
+
+        __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+        vcmacc_vf(v20, v22, ft10, ft11, v28, v30);
+        K -= 1;
+
+        if (K == 0) { break; } // the remaining step count was odd: nothing is left for the (v24, v26) half
+
+        if (K >= 2) { // (v28, v30) has been consumed; refill it with the b row two steps ahead if one exists
+            __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+        }
+        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+        __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+        vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
+
+        __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+        vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
+
+        __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+        vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
+
+        __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+        vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
+
+        __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+        vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
+
+        __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+        vcmacc_vf(v20, v22, ft10, ft11, v24, v26);
+        K -= 1;
+
+        if (K >= 2) { // refill (v24, v26) for the pass after next, if such a row exists
+            __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+        }
+        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+    }
+
+    rsc *= 2 * FLT_SIZE; // strides from complex elements to bytes (two floats per element)
+    csc *= 2 * FLT_SIZE;
+
+    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); // ft0 = first float of *alpha (used as its real part below)
+    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); // ft1 = second float of *alpha (used as its imaginary part below)
+    // scale by alpha, two rows at a time, with v24..v30 as scratch: re' = ft0*re - ft1*im, im' = ft0*im + ft1*re
+    __asm__("vfmul.vf v24, v2, ft1");
+    __asm__("vfmul.vf v26, v0, ft1");
+    __asm__("vfmul.vf v28, v6, ft1");
+    __asm__("vfmul.vf v30, v4, ft1");
+
+    __asm__("vfmsub.vf v0, ft0, v24"); // v0 = ft0*v0 - v24
+    __asm__("vfmadd.vf v2, ft0, v26"); // v2 = ft0*v2 + v26
+    __asm__("vfmsub.vf v4, ft0, v28");
+    __asm__("vfmadd.vf v6, ft0, v30"); 
+
+    __asm__("vfmul.vf v24, v10, ft1");
+    __asm__("vfmul.vf v26, v8, ft1");
+    __asm__("vfmul.vf v28, v14, ft1");
+    __asm__("vfmul.vf v30, v12, ft1");
+
+    __asm__("vfmsub.vf v8, ft0, v24");
+    __asm__("vfmadd.vf v10, ft0, v26"); 
+    __asm__("vfmsub.vf v12, ft0, v28");
+    __asm__("vfmadd.vf v14, ft0, v30"); 
+
+    __asm__("vfmul.vf v24, v18, ft1");
+    __asm__("vfmul.vf v26, v16, ft1");
+    __asm__("vfmul.vf v28, v22, ft1");
+    __asm__("vfmul.vf v30, v20, ft1");
+
+    __asm__("vfmsub.vf v16, ft0, v24");
+    __asm__("vfmadd.vf v18, ft0, v26"); 
+    __asm__("vfmsub.vf v20, ft0, v28");
+    __asm__("vfmadd.vf v22, ft0, v30"); 
+
+    scomplex beta_cast = *beta;
+    if (beta_cast.real != 0.f || beta_cast.imag != 0.f) { // only read c when beta is nonzero
+        if (csc == 2 * FLT_SIZE) { // c unit column stride
+            scomplex *c_tmp = c; // separate read cursor so that c still points at row 0 for the store pass
+            __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); // rows of c alternate between the (v24, v26) and (v28, v30) buffers, loaded one row ahead of use
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); // row 0 += beta * c row 0 (presumed semantics of vcmacc_vf2, defined outside this block)
+
+            __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+
+            __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+
+            __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+
+            __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); // last row of c: no further pointer advance needed
+            vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+
+            vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30);
+        }
+        else { // c non-unit column stride: same pipeline using the strided segment load with csc as the stride operand
+            scomplex *c_tmp = c;
+            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+
+            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+
+            __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+
+            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+
+            __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+
+            vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30);
+        }
+    }
+    // store c, rows 0 through 5; each two-field segment store re-interleaves a (real, imag) register-group pair
+    if (csc == 2 * FLT_SIZE) { // c unit column stride
+        __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSEG2 "v20, (%0)" : : "r"(c));
+    }
+    else { // c non-unit column stride
+        __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSSEG2 "v20, (%0), %1" : : "r"(c), "r"(csc));
+    }
+
+    return;
+}
+
+void bli_cgemm_6m2_cleanup
+     (
+             dim_t              M,
+             dim_t              N,
+             dim_t              K,
+       const scomplex* restrict alpha,
+       const scomplex* restrict a,
+       const scomplex* restrict b,
+       const scomplex* restrict beta,
+             scomplex* restrict c, inc_t rsc, inc_t csc
+     )
+{
+    // M x N x K cgemm, 0 < M < 6, N <= 32 = vlmax, K > 0
+    // pairs of register groups hold the real and imag. parts of rows of c and b
+
+    __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+    __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+    __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+    if (K >= 2) {
+        __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+    }
+
+    switch (M) {
+        case 5:
+            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+            vcmul_vf(v16, v18, v24, v26, ft8, ft9);
+        case 4:
+            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+            vcmul_vf(v12, v14, v24, v26, ft6, ft7);
+        case 3:
+            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+            vcmul_vf(v8, v10, v24, v26, ft4, ft5);
+        case 2:
+            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+            vcmul_vf(v4, v6, v24, v26, ft2, ft3);
+        case 1:
+            __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+            vcmul_vf(v0, v2, v24, v26, ft0, ft1);
+    }
+    K -= 1;
+
+    if (K >= 2) {
+        __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+    }
+    __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+    while (K > 0) {
+        switch (M) {
+            case 5:
+                __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+                vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
+            case 4:
+                __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+                vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
+            case 3:
+                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+                vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
+            case 2:
+                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+                vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
+            case 1:
+                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+                vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
+        }
+        K -= 1;
+
+        if (K == 0) { break; }
+
+        if (K >= 2) {
+            __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+        }
+        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+        switch (M) {
+            case 5:
+                __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+                vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
+            case 4:
+                __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+                vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
+            case 3:
+                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+                vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
+            case 2:
+                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+                vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
+            case 1:
+                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+                vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
+        }
+        K -= 1;
+
+        if (K >= 2) {
+            __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+        }
+        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+    }
+
+    c += (M - 1) * rsc;
+    rsc *= 2 * FLT_SIZE;
+    csc *= 2 * FLT_SIZE;
+
+    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE));
+
+    switch (M) {
+        case 5:
+            __asm__("vfmul.vf v24, v18, ft1");
+            __asm__("vfmul.vf v26, v16, ft1");
+            __asm__("vfmsub.vf v16, ft0, v24");
+            __asm__("vfmadd.vf v18, ft0, v26"); 
+        case 4:
+            __asm__("vfmul.vf v28, v14, ft1");
+            __asm__("vfmul.vf v30, v12, ft1");
+            __asm__("vfmsub.vf v12, ft0, v28");
+            __asm__("vfmadd.vf v14, ft0, v30"); 
+        case 3:
+            __asm__("vfmul.vf v24, v10, ft1");
+            __asm__("vfmul.vf v26, v8, ft1");
+            __asm__("vfmsub.vf v8, ft0, v24");
+            __asm__("vfmadd.vf v10, ft0, v26"); 
+        case 2:
+            __asm__("vfmul.vf v28, v6, ft1");
+            __asm__("vfmul.vf v30, v4, ft1");
+            __asm__("vfmsub.vf v4, ft0, v28");
+            __asm__("vfmadd.vf v6, ft0, v30"); 
+        case 1:
+            __asm__("vfmul.vf v24, v2, ft1");
+            __asm__("vfmul.vf v26, v0, ft1");
+            __asm__("vfmsub.vf v0, ft0, v24");
+            __asm__("vfmadd.vf v2, ft0, v26"); 
+    } 
+
+    scomplex beta_cast = *beta;
+    if (beta_cast.real != 0.f || beta_cast.imag != 0.f) {
+        if (csc == 2 * FLT_SIZE) {
+            scomplex *c_tmp = c;
+            switch (M) {
+                case 5:
+                    __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                    vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+                case 4:
+                    __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                    vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+                case 3:
+                    __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                    vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+                case 2:
+                    __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                    vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+                case 1:
+                    __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+                    vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+            }
+        }
+        else {
+            scomplex *c_tmp = c;
+            switch (M) {
+                case 5:
+                    __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                    vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+                case 4:
+                    __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                    vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+                case 3:
+                    __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                    vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+                case 2:
+                    __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                    vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+                case 1:
+                    __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                    vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+            }
+        }
+    }
+
+    if (csc == 2 * FLT_SIZE) {
+        switch (M) {
+            case 5:
+                __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 4:
+                __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 3:
+                __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 2:
+                __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 1:
+                __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+        }
+    }
+    else {
+        switch (M) {
+            case 5:
+                __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 4:
+                __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 3:
+                __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 2:
+                __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 1:
+                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+        }
+    }
+
+    return;
+}
+
+void bli_cgemm_6m2_k0
+     (
+             dim_t              M, // number of rows of c visited by the loops below
+             dim_t              N, // number of columns; passed to vsetvli as the requested vector length
+       const scomplex* restrict beta, // scalar applied to c; copied once into beta_cast
+             scomplex* restrict c, inc_t rsc, inc_t csc // output tile and its row/column strides, counted in complex elements
+     )
+{
+    // K = 0 case of the cgemm kernel: c := beta * c, or c := 0 when beta == 0. Requires 0 < M <= 6, 0 < N <= 32 = vlmax.
+    // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
+    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); // element width = 8 * FLT_SIZE bits, LMUL = 2
+    csc *= 2 * FLT_SIZE; // column stride in bytes; rsc stays in elements because it only feeds pointer arithmetic on c
+
+    scomplex beta_cast = *beta;
+    if (beta_cast.real == 0.f && beta_cast.imag == 0.f) {
+        // beta == 0: overwrite c with zeros; c is never loaded in this branch
+        __asm__("vmv.v.i v0, 0"); // zero written as the first segment field
+        __asm__("vmv.v.i v2, 0"); // zero written as the second segment field
+        for (size_t i = 0; i < M; ++i) {
+            if (csc == 2 * FLT_SIZE) // columns are adjacent in memory: store without a stride operand
+                __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+            else // otherwise pass the byte stride between columns
+                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+            c += rsc; // next row of c
+        }
+    }
+    else {
+        // scale c by beta, one row per iteration: load into (v0, v2), multiply into (v4, v6), store back in place
+        for (size_t i = 0; i < M; ++i) {
+            if (csc == 2 * FLT_SIZE) { // columns are adjacent in memory
+                __asm__(VLSEG2 "v0, (%0)" : : "r"(c));
+                vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag); // presumably (v4, v6) = beta * (v0, v2); macro defined outside this chunk
+                __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+            }
+            else { // general column stride, given in bytes
+                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+                vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag);
+                __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+            }
+            c += rsc; // next row of c
+        }
+    }
+    return;
+}
+
+void bli_cgemm_sifive_x280_asm_6m2
+     (
+             dim_t               M,
+             dim_t               N,
+             dim_t               K,
+       const void*      restrict alpha_,
+       const void*      restrict a_,
+       const void*      restrict b_,
+       const void*      restrict beta_,
+             void*      restrict c_, inc_t rsc, inc_t csc,
+             auxinfo_t* restrict data,
+       const cntx_t*    restrict cntx
+     )
+{
+    // M x N x K cgemm entry point: casts the untyped operands, rejects empty problems, then dispatches on K and M.
+    (void) cntx;
+    (void) data;
+    scomplex* restrict c_tile = c_;
+    const scomplex* restrict a_panel = a_;
+    const scomplex* restrict b_panel = b_;
+    const scomplex* restrict alpha_ptr = alpha_;
+    const scomplex* restrict beta_ptr = beta_;
+
+    if (M <= 0 || N <= 0 || K < 0) { return; } // nothing to compute
+    if (K == 0) {
+        bli_cgemm_6m2_k0(M, N, beta_ptr, c_tile, rsc, csc); // no a * b term: only beta is applied to c
+        return;
+    }
+    if (M == 6) // full-height tile
+        bli_cgemm_6m2(N, K, alpha_ptr, a_panel, b_panel, beta_ptr, c_tile, rsc, csc);
+    else // partial tile
+        bli_cgemm_6m2_cleanup(M, N, K, alpha_ptr, a_panel, b_panel, beta_ptr, c_tile, rsc, csc);
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef PACKMR
+#undef PACKNR
+
+// Parameters for the dcomplex (z) kernels below. FLT_SIZE is the byte-size of underlying floating point type
+#define FLT_SIZE 8
+#define FLT_LOAD "fld " // scalar load used for the components of a and alpha
+#define VLSEG2 "vlseg2e64.v " // 2-field segment load, no stride operand
+#define VLSSEG2 "vlsseg2e64.v " // 2-field segment load with a byte-stride operand
+#define VSSEG2 "vsseg2e64.v " // 2-field segment store, no stride operand
+#define VSSSEG2 "vssseg2e64.v " // 2-field segment store with a byte-stride operand
+#define PACKMR 8 // a advances by PACKMR complex elements per k step
+#define PACKNR 16 // b advances by PACKNR complex elements per k step
+
+void bli_zgemm_6m2
+     (
+             dim_t              N, // number of columns of c; requested vector length
+             dim_t              K, // inner dimension; must be > 0
+       const dcomplex* restrict alpha, // scalar applied to the accumulated product
+       const dcomplex* restrict a, // packed a: 6 values read per k step, steps are PACKMR elements apart
+       const dcomplex* restrict b, // packed b: one row of N values per k step, steps are PACKNR elements apart
+       const dcomplex* restrict beta, // scalar applied to the existing c; c is not read when beta == 0
+             dcomplex* restrict c, inc_t rsc, inc_t csc // output tile and its row/column strides in complex elements
+     )
+{
+    // 6 x N x K zgemm, K > 0. N is bounded by the packed row length PACKNR = 16; the earlier "N <= 32 = vlmax" note presumably came from the single-precision kernel (e64/m2 here) - TODO confirm vlmax.
+    // pairs of register groups hold the real and imag. parts of rows of c and b
+    __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); // element width = 8 * FLT_SIZE bits, LMUL = 2
+    __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); // b row for k = 0 -> v24 / v26
+    __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); // step b to the next packed row
+    if (K >= 2) { // second buffer: b row for k = 1 -> v28 / v30
+        __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+    }
+
+    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); // ft0 / ft1 = real / imag of a[0]
+    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+    vcmul_vf(v0, v2, v24, v26, ft0, ft1); // row 0 accumulator (v0, v2) initialised; presumably the complex product a[0] * b (macro defined outside this chunk)
+
+    __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+    vcmul_vf(v4, v6, v24, v26, ft2, ft3); // row 1 accumulator
+
+    __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+    vcmul_vf(v8, v10, v24, v26, ft4, ft5); // row 2 accumulator
+
+    __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+    vcmul_vf(v12, v14, v24, v26, ft6, ft7); // row 3 accumulator
+
+    __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+    vcmul_vf(v16, v18, v24, v26, ft8, ft9); // row 4 accumulator
+
+    __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+    vcmul_vf(v20, v22, v24, v26, ft10, ft11); // row 5 accumulator
+    K -= 1; // K now counts the k steps still to be accumulated
+
+    if (K >= 2) { // refill v24 / v26 only if a step after the pending v28 / v30 one exists
+        __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+    }
+    __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); // step a to the next k step
+
+    while (K > 0) { // two k steps per iteration, alternating between the v28 / v30 and v24 / v26 copies of b
+        __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+        vcmacc_vf(v0, v2, ft0, ft1, v28, v30); // presumably (v0, v2) += a[0] * (v28, v30); macro defined outside this chunk
+
+        __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+        vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
+
+        __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+        vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
+
+        __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+        vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
+
+        __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+        vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
+
+        __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+        vcmacc_vf(v20, v22, ft10, ft11, v28, v30);
+        K -= 1;
+
+        if (K == 0) { break; } // the step just done was the last one
+
+        if (K >= 2) { // refill v28 / v30 only if a later step will consume it
+            __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+        }
+        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+        __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+        vcmacc_vf(v0, v2, ft0, ft1, v24, v26); // same update, now using the b row held in v24 / v26
+
+        __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+        vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
+
+        __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+        vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
+
+        __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+        vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
+
+        __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+        vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
+
+        __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+        __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+        vcmacc_vf(v20, v22, ft10, ft11, v24, v26);
+        K -= 1;
+
+        if (K >= 2) { // refill v24 / v26 only if a later step will consume it
+            __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+        }
+        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+    }
+
+    rsc *= 2 * FLT_SIZE; // from here on both strides are in bytes (one complex element = 2 * FLT_SIZE bytes)
+    csc *= 2 * FLT_SIZE;
+
+    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); // ft0 = alpha.real
+    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); // ft1 = alpha.imag
+
+    // scale every accumulator by alpha; v24..v30 are free to serve as temporaries now that the k loop is done
+    __asm__("vfmul.vf v24, v2, ft1"); // v24 = imag * alpha.imag
+    __asm__("vfmul.vf v26, v0, ft1"); // v26 = real * alpha.imag
+    __asm__("vfmul.vf v28, v6, ft1");
+    __asm__("vfmul.vf v30, v4, ft1");
+
+    __asm__("vfmsub.vf v0, ft0, v24"); // real = alpha.real * real - v24
+    __asm__("vfmadd.vf v2, ft0, v26"); // imag = alpha.real * imag + v26
+    __asm__("vfmsub.vf v4, ft0, v28");
+    __asm__("vfmadd.vf v6, ft0, v30"); 
+
+    __asm__("vfmul.vf v24, v10, ft1");
+    __asm__("vfmul.vf v26, v8, ft1");
+    __asm__("vfmul.vf v28, v14, ft1");
+    __asm__("vfmul.vf v30, v12, ft1");
+
+    __asm__("vfmsub.vf v8, ft0, v24");
+    __asm__("vfmadd.vf v10, ft0, v26"); 
+    __asm__("vfmsub.vf v12, ft0, v28");
+    __asm__("vfmadd.vf v14, ft0, v30"); 
+
+    __asm__("vfmul.vf v24, v18, ft1");
+    __asm__("vfmul.vf v26, v16, ft1");
+    __asm__("vfmul.vf v28, v22, ft1");
+    __asm__("vfmul.vf v30, v20, ft1");
+
+    __asm__("vfmsub.vf v16, ft0, v24");
+    __asm__("vfmadd.vf v18, ft0, v26"); 
+    __asm__("vfmsub.vf v20, ft0, v28");
+    __asm__("vfmadd.vf v22, ft0, v30"); 
+
+    dcomplex beta_cast = *beta;
+    if (beta_cast.real != 0. || beta_cast.imag != 0.) { // c is only read when beta is non-zero
+        if (csc == 2 * FLT_SIZE) { // columns are adjacent in memory: loads need no stride operand
+            dcomplex *c_tmp = c; // read cursor; c itself is kept for the stores below
+            __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); // rows of c are loaded alternately into v24 / v26 and v28 / v30
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); // presumably (v0, v2) += beta * c row 0; macro defined outside this chunk
+
+            __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+
+            __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+
+            __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+
+            __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); // last row: no pointer advance afterwards
+            vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+
+            vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30);
+        }
+        else { // same sequence with the byte stride csc passed to each load
+            dcomplex *c_tmp = c;
+            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+
+            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+
+            __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+
+            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+            vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+
+            __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+            vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+
+            vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30);
+        }
+    }
+
+    if (csc == 2 * FLT_SIZE) { // write the six result rows back, top to bottom
+        __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSEG2 "v20, (%0)" : : "r"(c));
+    }
+    else { // same stores with the byte stride csc between columns
+        __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
+        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+        __asm__(VSSSEG2 "v20, (%0), %1" : : "r"(c), "r"(csc));
+    }
+
+    return;
+}
+
+void bli_zgemm_6m2_cleanup
+     (
+             dim_t              M, // number of active rows; only values 1..5 match a switch case below
+             dim_t              N, // number of columns of c; requested vector length
+             dim_t              K, // inner dimension; must be > 0
+       const dcomplex* restrict alpha, // scalar applied to the accumulated product
+       const dcomplex* restrict a, // packed a: M values read per k step, steps are PACKMR elements apart
+       const dcomplex* restrict b, // packed b: one row of N values per k step, steps are PACKNR elements apart
+       const dcomplex* restrict beta, // scalar applied to the existing c; c is not read when beta == 0
+             dcomplex* restrict c, inc_t rsc, inc_t csc // output tile and its row/column strides in complex elements
+     )
+{
+    // M x N x K zgemm, 0 < M < 6, K > 0. N is bounded by the packed row length PACKNR = 16; the earlier "N <= 32 = vlmax" note presumably came from the single-precision kernel (e64/m2 here) - TODO confirm vlmax.
+    // pairs of register groups hold the real and imag. parts of rows of c and b
+
+    __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); // element width = 8 * FLT_SIZE bits, LMUL = 2
+    __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); // b row for k = 0 -> v24 / v26
+    __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); // step b to the next packed row
+    if (K >= 2) { // second buffer: b row for k = 1 -> v28 / v30
+        __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+    }
+
+    switch (M) { // cases fall through on purpose: entering at case M handles rows M-1 down to 0
+        case 5:
+            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+            vcmul_vf(v16, v18, v24, v26, ft8, ft9); // row 4 accumulator initialised; presumably the complex product a[4] * b (macro defined outside this chunk)
+        case 4:
+            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+            vcmul_vf(v12, v14, v24, v26, ft6, ft7); // row 3 accumulator
+        case 3:
+            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+            vcmul_vf(v8, v10, v24, v26, ft4, ft5); // row 2 accumulator
+        case 2:
+            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+            vcmul_vf(v4, v6, v24, v26, ft2, ft3); // row 1 accumulator
+        case 1:
+            __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+            vcmul_vf(v0, v2, v24, v26, ft0, ft1); // row 0 accumulator
+    }
+    K -= 1; // K now counts the k steps still to be accumulated
+
+    if (K >= 2) { // refill v24 / v26 only if a step after the pending v28 / v30 one exists
+        __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+    }
+    __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); // step a to the next k step
+
+    while (K > 0) { // two k steps per iteration, alternating between the v28 / v30 and v24 / v26 copies of b
+        switch (M) { // intentional fall-through, as above
+            case 5:
+                __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+                vcmacc_vf(v16, v18, ft8, ft9, v28, v30); // presumably (v16, v18) += a[4] * (v28, v30); macro defined outside this chunk
+            case 4:
+                __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+                vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
+            case 3:
+                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+                vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
+            case 2:
+                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+                vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
+            case 1:
+                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+                vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
+        }
+        K -= 1;
+
+        if (K == 0) { break; } // the step just done was the last one
+
+        if (K >= 2) { // refill v28 / v30 only if a later step will consume it
+            __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+        }
+        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+        switch (M) { // same update, now using the b row held in v24 / v26
+            case 5:
+                __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+                vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
+            case 4:
+                __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+                vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
+            case 3:
+                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+                vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
+            case 2:
+                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+                vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
+            case 1:
+                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+                vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
+        }
+        K -= 1;
+
+        if (K >= 2) { // refill v24 / v26 only if a later step will consume it
+            __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+        }
+        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+    }
+
+    c += (M - 1) * rsc; // point at the last active row; the switches below walk back towards row 0
+    rsc *= 2 * FLT_SIZE; // from here on both strides are in bytes (one complex element = 2 * FLT_SIZE bytes)
+    csc *= 2 * FLT_SIZE;
+
+    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); // ft0 = alpha.real
+    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); // ft1 = alpha.imag
+
+    switch (M) { // scale each active accumulator by alpha (intentional fall-through); v24..v30 serve as temporaries
+        case 5:
+            __asm__("vfmul.vf v24, v18, ft1"); // v24 = imag * alpha.imag
+            __asm__("vfmul.vf v26, v16, ft1"); // v26 = real * alpha.imag
+            __asm__("vfmsub.vf v16, ft0, v24"); // real = alpha.real * real - v24
+            __asm__("vfmadd.vf v18, ft0, v26"); // imag = alpha.real * imag + v26
+        case 4:
+            __asm__("vfmul.vf v28, v14, ft1");
+            __asm__("vfmul.vf v30, v12, ft1");
+            __asm__("vfmsub.vf v12, ft0, v28");
+            __asm__("vfmadd.vf v14, ft0, v30"); 
+        case 3:
+            __asm__("vfmul.vf v24, v10, ft1");
+            __asm__("vfmul.vf v26, v8, ft1");
+            __asm__("vfmsub.vf v8, ft0, v24");
+            __asm__("vfmadd.vf v10, ft0, v26"); 
+        case 2:
+            __asm__("vfmul.vf v28, v6, ft1");
+            __asm__("vfmul.vf v30, v4, ft1");
+            __asm__("vfmsub.vf v4, ft0, v28");
+            __asm__("vfmadd.vf v6, ft0, v30"); 
+        case 1:
+            __asm__("vfmul.vf v24, v2, ft1");
+            __asm__("vfmul.vf v26, v0, ft1");
+            __asm__("vfmsub.vf v0, ft0, v24");
+            __asm__("vfmadd.vf v2, ft0, v26"); 
+    } 
+
+    dcomplex beta_cast = *beta;
+    if (beta_cast.real != 0. || beta_cast.imag != 0.) { // c is only read when beta is non-zero
+        if (csc == 2 * FLT_SIZE) { // columns are adjacent in memory: loads need no stride operand
+            dcomplex *c_tmp = c; // read cursor; c itself is kept for the stores below
+            switch (M) { // intentional fall-through: rows are visited from M-1 down to 0
+                case 5:
+                    __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); // step back one row
+                    vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); // presumably (v16, v18) += beta * c row 4; macro defined outside this chunk
+                case 4:
+                    __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                    vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+                case 3:
+                    __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                    vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+                case 2:
+                    __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                    vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+                case 1:
+                    __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+                    vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+            }
+        }
+        else { // same sequence with the byte stride csc passed to each load
+            dcomplex *c_tmp = c;
+            switch (M) {
+                case 5:
+                    __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                    vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+                case 4:
+                    __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                    vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+                case 3:
+                    __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                    vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+                case 2:
+                    __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+                    vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+                case 1:
+                    __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+                    vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+            }
+        }
+    }
+
+    if (csc == 2 * FLT_SIZE) { // write the active rows back, last row first
+        switch (M) { // intentional fall-through
+            case 5:
+                __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 4:
+                __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 3:
+                __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 2:
+                __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 1:
+                __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+        }
+    }
+    else { // same stores with the byte stride csc between columns
+        switch (M) {
+            case 5:
+                __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 4:
+                __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 3:
+                __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 2:
+                __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+            case 1:
+                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+        }
+    }
+
+    return;
+}
+
+void bli_zgemm_6m2_k0
+     (
+             dim_t              M, // number of rows of c visited by the loops below
+             dim_t              N, // number of columns; passed to vsetvli as the requested vector length
+       const dcomplex* restrict beta, // scalar applied to c; copied once into beta_cast
+             dcomplex* restrict c, inc_t rsc, inc_t csc // output tile and its row/column strides, counted in complex elements
+     )
+{
+    // K = 0 case of the zgemm kernel: c := beta * c, or c := 0 when beta == 0. Requires 0 < M <= 6 and N > 0; the earlier "N <= 32 = vlmax" bound presumably came from the single-precision kernel (PACKNR is 16 and the element width is e64 here) - TODO confirm vlmax.
+    // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
+    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); // element width = 8 * FLT_SIZE bits, LMUL = 2
+    csc *= 2 * FLT_SIZE; // column stride in bytes; rsc stays in elements because it only feeds pointer arithmetic on c
+
+    dcomplex beta_cast = *beta;
+    if (beta_cast.real == 0. && beta_cast.imag == 0.) {
+        // beta == 0: overwrite c with zeros; c is never loaded in this branch
+        __asm__("vmv.v.i v0, 0"); // zero written as the first segment field
+        __asm__("vmv.v.i v2, 0"); // zero written as the second segment field
+        for (size_t i = 0; i < M; ++i) {
+            if (csc == 2 * FLT_SIZE) // columns are adjacent in memory: store without a stride operand
+                __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+            else // otherwise pass the byte stride between columns
+                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+            c += rsc; // next row of c
+        }
+    }
+    else {
+        // scale c by beta, one row per iteration: load into (v0, v2), multiply into (v4, v6), store back in place
+        for (size_t i = 0; i < M; ++i) {
+            if (csc == 2 * FLT_SIZE) { // columns are adjacent in memory
+                __asm__(VLSEG2 "v0, (%0)" : : "r"(c));
+                vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag); // presumably (v4, v6) = beta * (v0, v2); macro defined outside this chunk
+                __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+            }
+            else { // general column stride, given in bytes
+                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+                vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag);
+                __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+            }
+            c += rsc; // next row of c
+        }
+    }
+    return;
+}
+
+void bli_zgemm_sifive_x280_asm_6m2
+     (
+             dim_t               M,
+             dim_t               N,
+             dim_t               K,
+       const void*      restrict alpha_,
+       const void*      restrict a_,
+       const void*      restrict b_,
+       const void*      restrict beta_,
+             void*      restrict c_, inc_t rsc, inc_t csc,
+             auxinfo_t* restrict data,
+       const cntx_t*    restrict cntx
+     )
+{
+    // M x N x K zgemm entry point: casts the untyped operands, rejects empty problems, then dispatches on K and M.
+    (void) cntx;
+    (void) data;
+    dcomplex* restrict c_tile = c_;
+    const dcomplex* restrict a_panel = a_;
+    const dcomplex* restrict b_panel = b_;
+    const dcomplex* restrict alpha_ptr = alpha_;
+    const dcomplex* restrict beta_ptr = beta_;
+
+    if (M <= 0 || N <= 0 || K < 0) { return; } // nothing to compute
+    if (K == 0) {
+        bli_zgemm_6m2_k0(M, N, beta_ptr, c_tile, rsc, csc); // no a * b term: only beta is applied to c
+        return;
+    }
+    if (M == 6) // full-height tile
+        bli_zgemm_6m2(N, K, alpha_ptr, a_panel, b_panel, beta_ptr, c_tile, rsc, csc);
+    else // partial tile; the cleanup kernel only has switch cases for M = 1..5
+        bli_zgemm_6m2_cleanup(M, N, K, alpha_ptr, a_panel, b_panel, beta_ptr, c_tile, rsc, csc);
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef PACKMR
+#undef PACKNR
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c
new file mode 100644
index 000000000..18df010d0
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c
@@ -0,0 +1,327 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef GEMMTRSM
+
+GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void)
+{   // Complex gemmtrsm "l" micro-kernel body; the including file supplies DATATYPE, PACKMR/PACKNR and the mnemonics.
+    (void) data; // unused
+    (void) cntx; // unused
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict a10 = a10_;
+    const DATATYPE* restrict a11 = a11_;
+    const DATATYPE* restrict b01 = b01_;
+    DATATYPE* restrict b11 = b11_; // not const: every solved row is stored back through b11 below
+    DATATYPE* restrict c11 = c11_;
+
+    if (m <= 0 || n <= 0) // NOTE(review): no upper bounds are checked; the switches below handle m = 1..6 only
+        return;
+    // Request vl = n elements of 8*FLT_SIZE bits with LMUL = 2 (each named vector register is a group of two).
+    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+    // Accumulator for tile row i is the pair v(4i) / v(4i+2), i.e. v0/v2 .. v20/v22; v24..v30 are scratch.
+    DATATYPE alpha_cast = *alpha;
+    if (alpha_cast.real == 0 && alpha_cast.imag == 0) { // alpha == 0: zero the accumulators, b11 is not read
+        switch (m) { // cases fall through on purpose: rows m-1 down to 0
+            case 6:
+                __asm__("vmv.v.i v20, 0");
+                __asm__("vmv.v.i v22, 0");
+            case 5:
+                __asm__("vmv.v.i v16, 0");
+                __asm__("vmv.v.i v18, 0");
+            case 4:
+                __asm__("vmv.v.i v12, 0");
+                __asm__("vmv.v.i v14, 0");
+            case 3:
+                __asm__("vmv.v.i v8, 0");
+                __asm__("vmv.v.i v10, 0");
+            case 2:
+                __asm__("vmv.v.i v4, 0");
+                __asm__("vmv.v.i v6, 0");
+            case 1:
+                __asm__("vmv.v.i v0, 0");
+                __asm__("vmv.v.i v2, 0");
+        }
+    }
+    else { // accumulators := vcmul_vf2(b11 row, alpha); macro not defined here, presumably a complex product
+        const DATATYPE* b11_tmp = b11 + (m - 1) * PACKNR; // start at row m-1 and walk back one packed row per case
+        switch (m) { // fall-through; loads alternate between v24/v26 and v28/v30
+            case 6:
+                __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+                vcmul_vf2(v20, v22, v24, v26, alpha_cast.real, alpha_cast.imag);
+                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
+            case 5:
+                __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+                vcmul_vf2(v16, v18, v28, v30, alpha_cast.real, alpha_cast.imag);
+                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
+            case 4:
+                __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+                vcmul_vf2(v12, v14, v24, v26, alpha_cast.real, alpha_cast.imag);
+                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
+            case 3:
+                __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+                vcmul_vf2(v8, v10, v28, v30, alpha_cast.real, alpha_cast.imag);
+                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
+            case 2:
+                __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+                vcmul_vf2(v4, v6, v24, v26, alpha_cast.real, alpha_cast.imag);
+                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
+            case 1:
+                __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+                vcmul_vf2(v0, v2, v28, v30, alpha_cast.real, alpha_cast.imag);
+        }
+    }
+    // Preload up to two rows of b01 (into v24/v26 and v28/v30) for the two-way unrolled k loop below.
+    if (k >= 1) {
+        __asm__(VLSEG2 "v24, (%0)" : : "r"(b01));
+        __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE));
+    }
+    if (k >= 2) {
+        __asm__(VLSEG2 "v28, (%0)" : : "r"(b01));
+        __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE));
+    }
+    // Each half of the loop body applies vcnmsac_vf (presumably acc -= a * b; confirm in the macro header) to every row.
+    while (k > 0) {
+        switch (m) { // first half: b01 row held in v24/v26
+            case 6:
+                __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a10), "I"(10 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a10), "I"(11 * FLT_SIZE));
+                vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+            case 5:
+                __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a10), "I"(8 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a10), "I"(9 * FLT_SIZE));
+                vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+            case 4:
+                __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a10), "I"(6 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a10), "I"(7 * FLT_SIZE));
+                vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+            case 3:
+                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a10), "I"(4 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a10), "I"(5 * FLT_SIZE));
+                vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+            case 2:
+                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a10), "I"(2 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a10), "I"(3 * FLT_SIZE));
+                vcnmsac_vf(v4, v6, ft2, ft3, v24, v26);
+            case 1:
+                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a10), "I"(0 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a10), "I"(1 * FLT_SIZE));
+                vcnmsac_vf(v0, v2, ft0, ft1, v24, v26);
+        }
+        k -= 1;
+
+        if (k == 0) { break; }
+
+        if (k >= 2) { // refill v24/v26 only if a row remains beyond the one already waiting in v28/v30
+            __asm__(VLSEG2 "v24, (%0)" : : "r"(b01));
+            __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE));
+        }
+        __asm__("addi %0, %0, %1" : "+r"(a10) : "I"(PACKMR * 2 * FLT_SIZE));
+
+        switch (m) { // second half: b01 row held in v28/v30
+            case 6:
+                __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a10), "I"(10 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a10), "I"(11 * FLT_SIZE));
+                vcnmsac_vf(v20, v22, ft10, ft11, v28, v30);
+            case 5:
+                __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a10), "I"(8 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a10), "I"(9 * FLT_SIZE));
+                vcnmsac_vf(v16, v18, ft8, ft9, v28, v30);
+            case 4:
+                __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a10), "I"(6 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a10), "I"(7 * FLT_SIZE));
+                vcnmsac_vf(v12, v14, ft6, ft7, v28, v30);
+            case 3:
+                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a10), "I"(4 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a10), "I"(5 * FLT_SIZE));
+                vcnmsac_vf(v8, v10, ft4, ft5, v28, v30);
+            case 2:
+                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a10), "I"(2 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a10), "I"(3 * FLT_SIZE));
+                vcnmsac_vf(v4, v6, ft2, ft3, v28, v30);
+            case 1:
+                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a10), "I"(0 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a10), "I"(1 * FLT_SIZE));
+                vcnmsac_vf(v0, v2, ft0, ft1, v28, v30);
+        }
+        k -= 1;
+
+        if (k >= 2) { // same refill rule for v28/v30
+            __asm__(VLSEG2 "v28, (%0)" : : "r"(b01));
+            __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE));
+        }
+        __asm__("addi %0, %0, %1" : "+r"(a10) : "I"(PACKMR * 2 * FLT_SIZE));
+    }
+    // Convert the c11 strides from complex elements to bytes.
+    rsc *= 2 * FLT_SIZE;
+    csc *= 2 * FLT_SIZE;
+    // Row 0: vcmul_vf with entry 0 of the current a11 column (presumably the pre-inverted diagonal; confirm).
+    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a11), "I"(0 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a11), "I"(1 * FLT_SIZE));
+    vcmul_vf(v24, v26, v0, v2, ft0, ft1);
+    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));                  // unit-stride store of the solved row to b11
+    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));   // same row to c11 with byte stride csc
+
+    if (m == 1) return;
+    // Update the remaining rows with the solved row 0 (in v24/v26), using entries 1..m-1 of this a11 column.
+    switch (m) {
+        case 6:
+            __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+            vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+        case 5:
+            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
+            vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+        case 4:
+            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE));
+            vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+        case 3:
+            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE));
+            vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+        case 2:
+            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(2 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(3 * FLT_SIZE));
+            vcnmsac_vf(v4, v6, ft2, ft3, v24, v26);
+    }
+    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE)); // next a11 column
+    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE)); // next packed b11 row
+    __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));                    // next c11 row
+    // Row 1: same scale / store / update sequence, one row shorter.
+    __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(2 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(3 * FLT_SIZE));
+    vcmul_vf(v24, v26, v4, v6, ft2, ft3);
+    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+    if (m == 2) return;
+
+    switch (m) {
+        case 6:
+            __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+            vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+        case 5:
+            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
+            vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+        case 4:
+            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE));
+            vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+        case 3:
+            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE));
+            vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+    }
+    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
+    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
+    __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
+    // Row 2.
+    __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE));
+    vcmul_vf(v24, v26, v8, v10, ft4, ft5);
+    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+    if (m == 3) return;
+
+    switch (m) {
+        case 6:
+            __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+            vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+        case 5:
+            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
+            vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+        case 4:
+            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE));
+            vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+    }
+    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
+    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
+    __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
+    // Row 3.
+    __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE));
+    vcmul_vf(v24, v26, v12, v14, ft6, ft7);
+    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+    if (m == 4) return;
+
+    switch (m) {
+        case 6:
+            __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+            vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+        case 5:
+            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
+            vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+    }
+    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
+    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
+    __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
+    // Row 4.
+    __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
+    vcmul_vf(v24, v26, v16, v18, ft8, ft9);
+    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+    if (m == 5) return;
+    // Only row 5 is left, so the update needs no switch.
+    __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+    vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+
+    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
+    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
+    __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
+    // Row 5.
+    __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+    vcmul_vf(v24, v26, v20, v22, ft10, ft11);
+    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+    return;
+}
+
+#endif
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c
new file mode 100644
index 000000000..a0f913473
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c
@@ -0,0 +1,253 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef GEMMTRSM
+
+GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void)
+{   // Real gemmtrsm "l" micro-kernel body: B11 := inv(A11) * (alpha * B11 - A10 * B01), each row also stored to C11.
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict a10 = a10_;
+    const DATATYPE* restrict a11 = a11_;
+    const DATATYPE* restrict b01 = b01_;
+    DATATYPE* restrict b11 = b11_; // not const: the solved rows are stored back through b11 below
+    DATATYPE* restrict c11 = c11_;
+    (void) data; (void) cntx; // unused; silenced as in the complex kernels
+    if (!(1 <= m && m <= 7 && 1 <= n && n <= PACKNR)) // only 7 rows (v0..v24) are implemented below
+        return;
+    // NOTE(review): the asm statements name f0-f6 / v0-v28 directly and declare no clobbers; confirm this is safe.
+    dim_t b11_offset, temp;
+    size_t vl; // written by vsetvli, not read afterwards
+    __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma": "=r"(vl) : "r"(n), "i"(8*FLT_SIZE));
+
+    // Multiply step sizes by data size
+    __asm__("slli %0, %0, %1": "+r"(rsc) : "I"(LOG_FLT_SIZE));
+    __asm__("slli %0, %0, %1": "+r"(csc) : "I"(LOG_FLT_SIZE));
+
+    __asm__("addi %0, %1, %2": "=r"(b11_offset): "r"(m), "I"(-1));
+    __asm__("li %0, %1": "=r"(temp): "I"(PACKNR * FLT_SIZE));
+    __asm__("mul %0, %0, %1": "+r"(b11_offset): "r"(temp));
+    // b11_offset = (m-1)*PACKNR*FLT_SIZE, i.e. the byte offset of the last row of b11
+
+    __asm__("add %0, %0, %1": "+r"(b11): "r"(b11_offset));
+    __asm__(FLT_LOAD " f0, (%0)" : : "r"(alpha));  // TO DO: optimize alpha = 1 case
+    switch (m){ // Vector loads from b11 with Duff device, multiplying by alpha
+        case 7: __asm__(VLE "  v0, (%0)": : "r"(b11));
+                __asm__("vfmul.vf  v0,  v0, f0");
+                __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+        case 6: __asm__(VLE "  v4, (%0)": : "r"(b11));
+                __asm__("vfmul.vf  v4,  v4, f0");
+                __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+        case 5: __asm__(VLE "  v8, (%0)": : "r"(b11));
+                __asm__("vfmul.vf  v8,  v8, f0");
+                __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+        case 4: __asm__(VLE " v12, (%0)": : "r"(b11));
+                __asm__("vfmul.vf v12, v12, f0");
+                __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+        case 3: __asm__(VLE " v16, (%0)": : "r"(b11));
+                __asm__("vfmul.vf v16, v16, f0");
+                __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+        case 2: __asm__(VLE " v20, (%0)": : "r"(b11));
+                __asm__("vfmul.vf v20, v20, f0");
+                __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+        case 1: __asm__(VLE " v24, (%0)": : "r"(b11));
+                __asm__("vfmul.vf v24, v24, f0");
+                // no sub of b11 on final entry
+    }
+    // b11 now reset to original value (it was decremented m-1 times from the last row)
+    //  v0 = row 6 of b11
+    //  v4 = row 5 of b11
+    //  v8 = row 4 of b11
+    // v12 = row 3 of b11
+    // v16 = row 2 of b11
+    // v20 = row 1 of b11
+    // v24 = row 0 of b11
+
+    // GEMM: B11 := alpha * B11 - A10 * B01
+    for (dim_t i = 0; i < k; i++){
+        __asm__(VLE " v28, (%0)": : "r"(b01)); // i-th row of b01
+        switch (m){ // fall-through: row r is updated with element r of the current a10 column
+            case 7: __asm__(FLT_LOAD " f6, %0(%1)" : : "I"(6*FLT_SIZE), "r"(a10));
+                    __asm__("vfnmsac.vf  v0, f6, v28");
+            case 6: __asm__(FLT_LOAD " f5, %0(%1)" : : "I"(5*FLT_SIZE), "r"(a10));
+                    __asm__("vfnmsac.vf  v4, f5, v28");
+            case 5: __asm__(FLT_LOAD " f4, %0(%1)" : : "I"(4*FLT_SIZE), "r"(a10));
+                    __asm__("vfnmsac.vf  v8, f4, v28");
+            case 4: __asm__(FLT_LOAD " f3, %0(%1)" : : "I"(3*FLT_SIZE), "r"(a10));
+                    __asm__("vfnmsac.vf v12, f3, v28");
+            case 3: __asm__(FLT_LOAD " f2, %0(%1)" : : "I"(2*FLT_SIZE), "r"(a10));
+                    __asm__("vfnmsac.vf v16, f2, v28");
+            case 2: __asm__(FLT_LOAD " f1, %0(%1)" : : "I"(1*FLT_SIZE), "r"(a10));
+                    __asm__("vfnmsac.vf v20, f1, v28");
+            case 1: __asm__(FLT_LOAD " f0, %0(%1)" : : "I"(0*FLT_SIZE), "r"(a10));
+                    __asm__("vfnmsac.vf v24, f0, v28");
+        }
+        __asm__("addi %0, %0, %1": "+r"(a10): "I"(PACKMR * FLT_SIZE));
+        __asm__("addi %0, %0, %1": "+r"(b01): "I"(PACKNR * FLT_SIZE));
+    }
+    // TRSM: B11 := inv(A11) * B11
+    // TO DO: Investigate code size reduction (loop rerolling)
+
+    // Row 0: multiply by entry 0 of the current a11 column, then store to b11 (unit stride) and c11 (stride csc)
+    __asm__(FLT_LOAD " f0,  %0(%1)": : "I"(0*FLT_SIZE), "r"(a11));
+    __asm__("vfmul.vf v24, v24, f0");
+    __asm__(VSE " v24, (%0)": : "r"(b11));
+    __asm__(VSSE " v24, (%0), %1": : "r"(c11), "r"(csc));
+    if (m == 1) return;
+    // Subtract (a11 entry r) * (solved row 0) from every remaining row r
+    switch (m){
+        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v0, f6, v24");
+        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v4, f5, v24");
+        case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v8, f4, v24");
+        case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf v12, f3, v24");
+        case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf v16, f2, v24");
+        case 2: __asm__(FLT_LOAD " f1, %0(%1)": : "I"(1*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf v20, f1, v24");
+    }
+    // Pointer bumps: next a11 column, next packed b11 row, next c11 row
+    __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+    __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+    __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+    // Row 1
+    __asm__(FLT_LOAD " f1,  %0(%1)": : "I"(1*FLT_SIZE), "r"(a11));
+    __asm__("vfmul.vf v20, v20, f1");
+    __asm__(VSE " v20, (%0)": : "r"(b11));
+    __asm__(VSSE " v20, (%0), %1": : "r"(c11), "r"(csc));
+    if (m == 2) return;
+
+    switch (m){
+        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v0, f6, v20");
+        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v4, f5, v20");
+        case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v8, f4, v20");
+        case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf v12, f3, v20");
+        case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf v16, f2, v20");
+    }
+    // Pointer bumps
+    __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+    __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+    __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+    // Row 2
+    __asm__(FLT_LOAD " f2,  %0(%1)": : "I"(2*FLT_SIZE), "r"(a11));
+    __asm__("vfmul.vf v16, v16, f2");
+    __asm__(VSE " v16, (%0)": : "r"(b11));
+    __asm__(VSSE " v16, (%0), %1": : "r"(c11), "r"(csc));
+    if (m == 3) return;
+
+    switch (m){
+        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v0, f6, v16");
+        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v4, f5, v16");
+        case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v8, f4, v16");
+        case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf v12, f3, v16");
+    }
+    // Pointer bumps
+    __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+    __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+    __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+    // Row 3
+    __asm__(FLT_LOAD " f3,  %0(%1)": : "I"(3*FLT_SIZE), "r"(a11));
+    __asm__("vfmul.vf v12, v12, f3");
+    __asm__(VSE " v12, (%0)": : "r"(b11));
+    __asm__(VSSE " v12, (%0), %1": : "r"(c11), "r"(csc));
+    if (m == 4) return;
+
+    switch (m){
+        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v0, f6, v12");
+        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v4, f5, v12");
+        case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v8, f4, v12");
+    }
+    // Pointer bumps
+    __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+    __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+    __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+    // Row 4
+    __asm__(FLT_LOAD " f4,  %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
+    __asm__("vfmul.vf v8, v8, f4");
+    __asm__(VSE " v8, (%0)": : "r"(b11));
+    __asm__(VSSE " v8, (%0), %1": : "r"(c11), "r"(csc));
+    if (m == 5) return;
+
+    switch (m){
+        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v0, f6, v8");
+        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v4, f5, v8");
+    }
+    // Pointer bumps
+    __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+    __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+    __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+    // Row 5
+    __asm__(FLT_LOAD " f5,  %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+    __asm__("vfmul.vf v4, v4, f5");
+    __asm__(VSE " v4, (%0)": : "r"(b11));
+    __asm__(VSSE " v4, (%0), %1": : "r"(c11), "r"(csc));
+    if (m == 6) return;
+    // Only row 6 is left, so the update needs no switch
+    __asm__(FLT_LOAD " f6,  %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+    __asm__("vfnmsac.vf v0, f6, v4");
+
+    // Pointer bumps
+    __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+    __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+    __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+    // Row 6
+    __asm__(FLT_LOAD " f6,  %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+    __asm__("vfmul.vf v0, v0, f6");
+    __asm__(VSE " v0, (%0)": : "r"(b11));
+    __asm__(VSSE " v0, (%0), %1": : "r"(c11), "r"(csc));
+}
+#endif
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c
new file mode 100644
index 000000000..4323f8fbf
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c
@@ -0,0 +1,182 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#include "blis.h"
+#include "../../riscv_cmul_macros_asm.h"
+#include <stdint.h>
+#include <riscv_vector.h>
+
+#define GEMMTRSM_L(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_l_sifive_x280_asm(\
+          dim_t               m,      \
+          dim_t               n,      \
+          dim_t               k,      \
+    const T*         restrict alpha_, \
+    const T*         restrict a10_,   \
+    const T*         restrict a11_,   \
+    const T*         restrict b01_,   \
+          T*         restrict b11_,   \
+          T*         restrict c11_,   \
+          inc_t               rsc,    \
+          inc_t               csc,    \
+          auxinfo_t* restrict data,   \
+    const cntx_t*    restrict cntx    \
+    )
+// GEMMTRSM_L (above) and GEMMTRSM_U (below) expand to a kernel signature; they differ in the _l_/_u_ name and in a10_/b01_ vs a12_/b21_.
+#define GEMMTRSM_U(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_u_sifive_x280_asm(\
+          dim_t               m,      \
+          dim_t               n,      \
+          dim_t               k,      \
+    const T*         restrict alpha_, \
+    const T*         restrict a12_,   \
+    const T*         restrict a11_,   \
+    const T*         restrict b21_,   \
+          T*         restrict b11_,   \
+          T*         restrict c11_,   \
+          inc_t               rsc,    \
+          inc_t               csc,    \
+          auxinfo_t* restrict data,   \
+    const cntx_t*    restrict cntx    \
+    )
+// Extra expansion level: arguments such as PRECISION_CHAR are macro-expanded here, before GEMMTRSM_L/_U paste them with ##.
+#define GEMMTRSM(macro, ...)  macro(__VA_ARGS__)
+
+// Single precision real: configure and include the *_real.c kernel sources for the "s" function names
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PACKMR 8   // per-step pointer increment, in elements, for the A panels in the included sources
+#define PACKNR 64  // per-row pointer increment, in elements, for the B panels in the included sources
+#define VLE "vle32.v"
+#define VSE "vse32.v"
+#define VSSE "vsse32.v"
+#define FLT_LOAD "flw"
+#define FLT_SIZE sizeof(float)
+#define LOG_FLT_SIZE 2  // shift amount used to turn rsc/csc into byte strides
+
+
+#include "./bli_gemmtrsm_l_sifive_x280_asm_real.c"
+#include "./bli_gemmtrsm_u_sifive_x280_asm_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PACKMR
+#undef PACKNR
+#undef VLE
+#undef VSE
+#undef VSSE
+#undef FLT_LOAD
+#undef FLT_SIZE
+#undef LOG_FLT_SIZE
+
+// Double precision real: same *_real.c sources, re-included with 64-bit mnemonics for the "d" function names
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PACKMR 8
+#define PACKNR 32
+#define VLE "vle64.v"
+#define VSE "vse64.v"
+#define VSSE "vsse64.v"
+#define FLT_LOAD "fld"
+#define FLT_SIZE sizeof(double)
+#define LOG_FLT_SIZE 3  // shift amount used to turn rsc/csc into byte strides
+
+#include "./bli_gemmtrsm_l_sifive_x280_asm_real.c"
+#include "./bli_gemmtrsm_u_sifive_x280_asm_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PACKMR
+#undef PACKNR
+#undef VLE
+#undef VSE
+#undef VSSE
+#undef FLT_LOAD
+#undef FLT_SIZE
+#undef LOG_FLT_SIZE
+
+// Single precision complex: configure and include the *_complex.c kernel sources for the "c" function names
+#define DATATYPE scomplex
+#define PRECISION_CHAR c
+#define PACKMR 8
+#define PACKNR 32
+#define VLSEG2 "vlseg2e32.v "    // the mnemonics carry a trailing space because the sources append operands directly
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+#define FLT_LOAD "flw "
+#define FLT_SIZE sizeof(float)   // size of one scalar field; the sources use 2 * FLT_SIZE per complex element
+
+#include "./bli_gemmtrsm_l_sifive_x280_asm_complex.c"
+#include "./bli_gemmtrsm_u_sifive_x280_asm_complex.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PACKMR
+#undef PACKNR
+#undef VLSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef FLT_LOAD
+#undef FLT_SIZE
+
+// Double precision complex: same *_complex.c sources, re-included with 64-bit mnemonics for the "z" function names
+#define DATATYPE dcomplex
+#define PRECISION_CHAR z
+#define PACKMR 8
+#define PACKNR 16
+#define VLSEG2 "vlseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+#define FLT_LOAD "fld "
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_gemmtrsm_l_sifive_x280_asm_complex.c"
+#include "./bli_gemmtrsm_u_sifive_x280_asm_complex.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PACKMR
+#undef PACKNR
+#undef VLSEG2   // must match the names defined above (the "2" suffix), as in the single-complex section
+#undef VSSEG2
+#undef VSSSEG2
+#undef FLT_LOAD
+#undef FLT_SIZE
+
+
+
+#undef GEMMTRSM    // retire the signature-generator macros defined at the top of this file
+#undef GEMMTRSM_L
+#undef GEMMTRSM_U
+
+
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c
new file mode 100644
index 000000000..9332fd096
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c
@@ -0,0 +1,331 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef GEMMTRSM
+
+GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void) // complex-domain GEMMTRSM_U kernel template; DATATYPE, PACKMR, etc. come from the including file
+{
+    (void) data;
+    (void) cntx;
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict a12 = a12_;
+    const DATATYPE* restrict a11 = a11_;
+    const DATATYPE* restrict b21 = b21_;
+    const DATATYPE* restrict b11 = b11_;
+    DATATYPE* restrict c11 = c11_;
+    // Empty tiles are a no-op. NOTE(review): m has no upper bound here although the switches below only handle m = 1..6 - confirm callers.
+    if (m <= 0 || n <= 0)
+        return;
+    // Request vl = n with SEW = 8 * FLT_SIZE bits and LMUL = 2; each complex row then uses two register groups (e.g. v0 and v2).
+    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+    // Build alpha * B11 in registers. The switches fall through, so v0/v2 always end up as the last row, v4/v6 the row above it, and so on.
+    DATATYPE alpha_cast = *alpha;
+    if (alpha_cast.real == 0 && alpha_cast.imag == 0) { // alpha == 0: start from zero without reading b11
+        switch (m) {
+            case 6:
+                __asm__("vmv.v.i v20, 0");
+                __asm__("vmv.v.i v22, 0");
+            case 5:
+                __asm__("vmv.v.i v16, 0");
+                __asm__("vmv.v.i v18, 0");
+            case 4:
+                __asm__("vmv.v.i v12, 0");
+                __asm__("vmv.v.i v14, 0");
+            case 3:
+                __asm__("vmv.v.i v8, 0");
+                __asm__("vmv.v.i v10, 0");
+            case 2:
+                __asm__("vmv.v.i v4, 0");
+                __asm__("vmv.v.i v6, 0");
+            case 1:
+                __asm__("vmv.v.i v0, 0");
+                __asm__("vmv.v.i v2, 0");
+        }
+    }
+    else {
+        const DATATYPE* b11_tmp = b11;
+        switch (m) {
+            case 6:
+                __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+                vcmul_vf2(v20, v22, v24, v26, alpha_cast.real, alpha_cast.imag);
+                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
+            case 5:
+                __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+                vcmul_vf2(v16, v18, v28, v30, alpha_cast.real, alpha_cast.imag);
+                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
+            case 4:
+                __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+                vcmul_vf2(v12, v14, v24, v26, alpha_cast.real, alpha_cast.imag);
+                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
+            case 3:
+                __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+                vcmul_vf2(v8, v10, v28, v30, alpha_cast.real, alpha_cast.imag);
+                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
+            case 2:
+                __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+                vcmul_vf2(v4, v6, v24, v26, alpha_cast.real, alpha_cast.imag);
+                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
+            case 1:
+                __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+                vcmul_vf2(v0, v2, v28, v30, alpha_cast.real, alpha_cast.imag);
+        }
+    }
+    // Preload up to two rows of b21 (into v24/v26 and v28/v30) so the k loop below can alternate between them.
+    if (k >= 1) {
+        __asm__(VLSEG2 "v24, (%0)" : : "r"(b21));
+        __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE));
+    }
+    if (k >= 2) {
+        __asm__(VLSEG2 "v28, (%0)" : : "r"(b21));
+        __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE));
+    }
+    // Point a12 at entry m-1 of the current k-slice so the scalar loads below can use fixed offsets (0 = last row).
+    a12 += m - 1;
+    // Rank-1 updates over k, unrolled by two; vcnmsac_vf is defined elsewhere (presumably acc -= scalar * vector - confirm).
+    while (k > 0) {
+        switch (m) {
+            case 6:
+                __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a12), "I"(-10 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a12), "I"(-9 * FLT_SIZE));
+                vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+            case 5:
+                __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a12), "I"(-8 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a12), "I"(-7 * FLT_SIZE));
+                vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+            case 4:
+                __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a12), "I"(-6 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a12), "I"(-5 * FLT_SIZE));
+                vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+            case 3:
+                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a12), "I"(-4 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a12), "I"(-3 * FLT_SIZE));
+                vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+            case 2:
+                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a12), "I"(-2 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a12), "I"(-1 * FLT_SIZE));
+                vcnmsac_vf(v4, v6, ft2, ft3, v24, v26);
+            case 1:
+                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a12), "I"(0 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a12), "I"(1 * FLT_SIZE));
+                vcnmsac_vf(v0, v2, ft0, ft1, v24, v26);
+        }
+        k -= 1;
+
+        if (k == 0) { break; }
+        // Refill v24/v26 only when a row beyond the one already waiting in v28/v30 remains.
+        if (k >= 2) {
+            __asm__(VLSEG2 "v24, (%0)" : : "r"(b21));
+            __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE));
+        }
+        __asm__("addi %0, %0, %1" : "+r"(a12) : "I"(PACKMR * 2 * FLT_SIZE));
+
+        switch (m) {
+            case 6:
+                __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a12), "I"(-10 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a12), "I"(-9 * FLT_SIZE));
+                vcnmsac_vf(v20, v22, ft10, ft11, v28, v30);
+            case 5:
+                __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a12), "I"(-8 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a12), "I"(-7 * FLT_SIZE));
+                vcnmsac_vf(v16, v18, ft8, ft9, v28, v30);
+            case 4:
+                __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a12), "I"(-6 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a12), "I"(-5 * FLT_SIZE));
+                vcnmsac_vf(v12, v14, ft6, ft7, v28, v30);
+            case 3:
+                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a12), "I"(-4 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a12), "I"(-3 * FLT_SIZE));
+                vcnmsac_vf(v8, v10, ft4, ft5, v28, v30);
+            case 2:
+                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a12), "I"(-2 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a12), "I"(-1 * FLT_SIZE));
+                vcnmsac_vf(v4, v6, ft2, ft3, v28, v30);
+            case 1:
+                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a12), "I"(0 * FLT_SIZE));
+                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a12), "I"(1 * FLT_SIZE));
+                vcnmsac_vf(v0, v2, ft0, ft1, v28, v30);
+        }
+        k -= 1;
+        // Refill v28/v30 under the same condition as above.
+        if (k >= 2) {
+            __asm__(VLSEG2 "v28, (%0)" : : "r"(b21));
+            __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE));
+        }
+        __asm__("addi %0, %0, %1" : "+r"(a12) : "I"(PACKMR * 2 * FLT_SIZE));
+    }
+    // Triangular solve, last row first: move a11 to the last diagonal entry and b11/c11 to the last row; rsc/csc become byte strides.
+    a11 += (m - 1) * (PACKMR + 1); // (m - 1) + (m - 1) * PACKMR
+    b11 += (m - 1) * PACKNR;
+    c11 += (m - 1) * rsc;
+    rsc *= 2 * FLT_SIZE;
+    csc *= 2 * FLT_SIZE;
+    // Last row: multiply by the a11 diagonal entry (presumably stored pre-inverted - confirm), then store to b11 (unit stride) and c11 (stride csc).
+    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a11), "I"(0 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a11), "I"(1 * FLT_SIZE));
+    vcmul_vf(v24, v26, v0, v2, ft0, ft1);
+    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+    if (m == 1) return;
+    // Update every remaining row above with the row just stored (v24/v26), then step a11, b11 and c11 up one row. Repeated per row below.
+    switch (m) {
+        case 6:
+            __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+            vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+        case 5:
+            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
+            vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+        case 4:
+            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE));
+            vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+        case 3:
+            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE));
+            vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+        case 2:
+            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(-2 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(-1 * FLT_SIZE));
+            vcnmsac_vf(v4, v6, ft2, ft3, v24, v26);
+    }
+    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
+    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
+    __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+    __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(-2 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(-1 * FLT_SIZE));
+    vcmul_vf(v24, v26, v4, v6, ft2, ft3);
+    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+    if (m == 2) return;
+
+    switch (m) {
+        case 6:
+            __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+            vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+        case 5:
+            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
+            vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+        case 4:
+            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE));
+            vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+        case 3:
+            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE));
+            vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+    }
+    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
+    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
+    __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+    __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE));
+    vcmul_vf(v24, v26, v8, v10, ft4, ft5);
+    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+    if (m == 3) return;
+
+    switch (m) {
+        case 6:
+            __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+            vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+        case 5:
+            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
+            vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+        case 4:
+            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE));
+            vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+    }
+    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
+    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
+    __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+    __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE));
+    vcmul_vf(v24, v26, v12, v14, ft6, ft7);
+    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+    if (m == 4) return;
+
+    switch (m) {
+        case 6:
+            __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+            vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+        case 5:
+            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
+            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
+            vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+    }
+    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
+    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
+    __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+    __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
+    vcmul_vf(v24, v26, v16, v18, ft8, ft9);
+    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+    if (m == 5) return;
+
+    __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+    vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+
+    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
+    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
+    __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+    __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+    vcmul_vf(v24, v26, v20, v22, ft10, ft11);
+    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+    return;
+}
+#endif
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c
new file mode 100644
index 000000000..2d511a8ba
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c
@@ -0,0 +1,260 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef GEMMTRSM
+
+GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void) // real-domain GEMMTRSM_U kernel template; DATATYPE, PACKMR, etc. come from the including file
+{
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict a12 = a12_;
+    const DATATYPE* restrict a11 = a11_;
+    const DATATYPE* restrict b21 = b21_;
+    const DATATYPE* restrict b11 = b11_;
+    DATATYPE* restrict c11 = c11_;
+    // Do nothing unless 1 <= m <= PACKMR and 1 <= n <= PACKNR.
+    if (!(1 <= m && m <= PACKMR && 1 <= n && n <= PACKNR))
+        return;
+    // NOTE(review): the switches below only cover m = 1..7; if PACKMR exceeds 7, m == PACKMR passes the guard unhandled - confirm.
+    dim_t m_sz, a11_offset, c11_offset, temp;
+    size_t vl;
+    __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma": "=r"(vl) : "r"(n), "i"(8*FLT_SIZE)); // SEW = 8*FLT_SIZE bits, LMUL = 4; vl is not read afterwards
+
+    // Multiply step sizes by data size
+    __asm__("slli %0, %0, %1": "+r"(rsc) : "I"(LOG_FLT_SIZE));
+    __asm__("slli %0, %0, %1": "+r"(csc) : "I"(LOG_FLT_SIZE));
+    __asm__("slli %0, %1, %2": "=r"(m_sz) : "r"(m), "I"(LOG_FLT_SIZE));
+    // Byte offsets used by the TRSM phase to jump a11 and c11 to the last row (formulas in the comments below).
+    __asm__("li %0, %1": "=r"(temp): "I"((PACKMR+1)*FLT_SIZE)); 
+    __asm__("mul %0, %1, %2": "=r"(a11_offset) : "r"(m), "r"(temp));
+    __asm__("addi %0, %0, %1": "+r"(a11_offset) : "I"(-PACKMR * FLT_SIZE));
+    __asm__("mul %0, %1, %2": "=r"(c11_offset) : "r"(m), "r"(rsc));
+    __asm__("sub %0, %0, %1": "+r"(c11_offset) : "r"(rsc));
+    // a11_offset = (PACKMR*(m-1)+m)*sz = m*(PACKMR+1)*FLT_SIZE - PACKMR*FLT_SIZE
+    // c11_offset = rsc*(m-1)*sz
+    // f0 holds alpha while b11 is loaded; it is reused for a12 entries inside the k loop further down.
+    __asm__(FLT_LOAD " f0, (%0)" : : "r"(alpha));
+    switch (m){ // Vector loads from b11 with Duff device, multiplying by alpha
+        case 7: __asm__(VLE "  v0, (%0)": : "r"(b11));
+                __asm__("vfmul.vf  v0,  v0, f0");
+                __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+        case 6: __asm__(VLE "  v4, (%0)": : "r"(b11));
+                __asm__("vfmul.vf  v4,  v4, f0");
+                __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+        case 5: __asm__(VLE "  v8, (%0)": : "r"(b11));
+                __asm__("vfmul.vf  v8,  v8, f0");
+                __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+        case 4: __asm__(VLE " v12, (%0)": : "r"(b11));
+                __asm__("vfmul.vf v12, v12, f0");
+                __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+        case 3: __asm__(VLE " v16, (%0)": : "r"(b11));
+                __asm__("vfmul.vf v16, v16, f0");
+                __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+        case 2: __asm__(VLE " v20, (%0)": : "r"(b11));
+                __asm__("vfmul.vf v20, v20, f0");
+                __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+        case 1: __asm__(VLE " v24, (%0)": : "r"(b11));
+                __asm__("vfmul.vf v24, v24, f0");
+                // no add of b11 on final entry
+    }
+    // b11 now positioned at start of last row
+    // v24 = row 0 from bottom (bottom row)
+    // v20 = row 1 from bottom
+    // v16 = row 2 from bottom
+    // v12 = row 3 from bottom
+    //  v8 = row 4 from bottom
+    //  v4 = row 5 from bottom
+    //  v0 = row 6 from bottom
+
+    // GEMM: B11 := alpha * B11 - A12 * B21
+    __asm__("add %0, %0, %1": "+r"(a12): "r"(m_sz)); // a12 now points one past entry m-1, so offset -1 addresses the bottom row
+    for (dim_t i = 0; i < k; i++){
+        __asm__(VLE " v28, (%0)": : "r"(b21)); // kth row of b21
+        switch (m){
+            case 7: __asm__(FLT_LOAD " f6, %0(%1)" : : "I"(-7*FLT_SIZE), "r"(a12));
+                    __asm__("vfnmsac.vf  v0, f6, v28");
+            case 6: __asm__(FLT_LOAD " f5, %0(%1)" : : "I"(-6*FLT_SIZE), "r"(a12));
+                    __asm__("vfnmsac.vf  v4, f5, v28");
+            case 5: __asm__(FLT_LOAD " f4, %0(%1)" : : "I"(-5*FLT_SIZE), "r"(a12));
+                    __asm__("vfnmsac.vf  v8, f4, v28");
+            case 4: __asm__(FLT_LOAD " f3, %0(%1)" : : "I"(-4*FLT_SIZE), "r"(a12));
+                    __asm__("vfnmsac.vf v12, f3, v28");
+            case 3: __asm__(FLT_LOAD " f2, %0(%1)" : : "I"(-3*FLT_SIZE), "r"(a12));
+                    __asm__("vfnmsac.vf v16, f2, v28");
+            case 2: __asm__(FLT_LOAD " f1, %0(%1)" : : "I"(-2*FLT_SIZE), "r"(a12));
+                    __asm__("vfnmsac.vf v20, f1, v28");
+            case 1: __asm__(FLT_LOAD " f0, %0(%1)" : : "I"(-1*FLT_SIZE), "r"(a12));
+                    __asm__("vfnmsac.vf v24, f0, v28");
+    } // end switch (m)
+    __asm__("addi %0, %0, %1": "+r"(a12): "I"(PACKMR * FLT_SIZE));
+    __asm__("addi %0, %0, %1": "+r"(b21): "I"(PACKNR * FLT_SIZE));
+    } // end loop over k
+    // TRSM: B11 := inv(A11) * B11
+    // Move a11 to end of array and c11 to first entry in last row
+    __asm__("add %0, %0, %1": "+r"(a11): "r"(a11_offset));
+    __asm__("add %0, %0, %1": "+r"(c11): "r"(c11_offset));
+
+    // Row 0 from bottom (bottom row)
+    __asm__(FLT_LOAD " f0,  %0(%1)": : "I"(-1*FLT_SIZE), "r"(a11)); // diagonal entry; rows are multiplied by it (presumably pre-inverted - confirm)
+    __asm__("vfmul.vf v24, v24, f0");
+    __asm__(VSE " v24, (%0)": : "r"(b11));
+    __asm__(VSSE " v24, (%0), %1": : "r"(c11), "r"(csc));
+    if (m == 1) return;
+    // Subtract the row just stored, scaled by its a11 column entries, from every row above it. Repeated per row below.
+    switch (m){
+        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v0, f6, v24");
+        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v4, f5, v24");
+        case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v8, f4, v24");
+        case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf v12, f3, v24");
+        case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf v16, f2, v24");
+        case 2: __asm__(FLT_LOAD " f1, %0(%1)": : "I"(-2*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf v20, f1, v24");
+    }
+    // Pointer bumps
+    __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+    __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+    __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+
+    // Row 1 from bottom
+    __asm__(FLT_LOAD " f1,  %0(%1)": : "I"(-2*FLT_SIZE), "r"(a11));
+    __asm__("vfmul.vf v20, v20, f1");
+    __asm__(VSE " v20, (%0)": : "r"(b11));
+    __asm__(VSSE " v20, (%0), %1": : "r"(c11), "r"(csc));
+    if (m == 2) return;
+    
+    switch (m){
+        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v0, f6, v20");
+        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v4, f5, v20");
+        case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v8, f4, v20");
+        case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf v12, f3, v20");
+        case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf v16, f2, v20");
+    }
+    // Pointer bumps
+    __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+    __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+    __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+
+    // Row 2 from bottom
+    __asm__(FLT_LOAD " f2,  %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11));
+    __asm__("vfmul.vf v16, v16, f2");
+    __asm__(VSE " v16, (%0)": : "r"(b11));
+    __asm__(VSSE " v16, (%0), %1": : "r"(c11), "r"(csc));
+    if (m == 3) return;
+    
+    switch (m){
+        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v0, f6, v16");
+        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v4, f5, v16");
+        case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v8, f4, v16");
+        case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf v12, f3, v16");
+    }
+    // Pointer bumps
+    __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+    __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+    __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+    
+    // Row 3 from bottom
+    __asm__(FLT_LOAD " f3,  %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11));
+    __asm__("vfmul.vf v12, v12, f3");
+    __asm__(VSE " v12, (%0)": : "r"(b11));
+    __asm__(VSSE " v12, (%0), %1": : "r"(c11), "r"(csc));
+    if (m == 4) return;
+    
+    switch (m){
+        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v0, f6, v12");
+        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v4, f5, v12");
+        case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v8, f4, v12");
+    }
+    // Pointer bumps
+    __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+    __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+    __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+    
+    // Row 4 from bottom
+    __asm__(FLT_LOAD " f4,  %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
+    __asm__("vfmul.vf v8, v8, f4");
+    __asm__(VSE " v8, (%0)": : "r"(b11));
+    __asm__(VSSE " v8, (%0), %1": : "r"(c11), "r"(csc));
+    if (m == 5) return;
+    
+    switch (m){
+        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v0, f6, v8");
+        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+                __asm__("vfnmsac.vf  v4, f5, v8");
+    }
+    // Pointer bumps
+    __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+    __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+    __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+    
+    // Row 5 from bottom
+    __asm__(FLT_LOAD " f5,  %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+    __asm__("vfmul.vf v4, v4, f5");
+    __asm__(VSE " v4, (%0)": : "r"(b11));
+    __asm__(VSSE " v4, (%0), %1": : "r"(c11), "r"(csc));
+    if (m == 6) return;
+    
+    __asm__(FLT_LOAD " f6,  %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+    __asm__("vfnmsac.vf v0, f6, v4");
+    
+    // Pointer bumps
+    __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+    __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+    __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+    
+    // Row 6 from bottom
+    __asm__(FLT_LOAD " f6,  %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+    __asm__("vfmul.vf v0, v0, f6");
+    __asm__(VSE " v0, (%0)": : "r"(b11));
+    __asm__(VSSE " v0, (%0), %1": : "r"(c11), "r"(csc));
+    
+}
+#endif
diff --git a/kernels/sifive_x280/bli_kernels_sifive_x280.h b/kernels/sifive_x280/bli_kernels_sifive_x280.h
new file mode 100644
index 000000000..425c7dad9
--- /dev/null
+++ b/kernels/sifive_x280/bli_kernels_sifive_x280.h
@@ -0,0 +1,160 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// Level 1: each kernel is declared once per datatype (s, d, c, z); the _intr / _asm suffix presumably marks intrinsics vs. inline-asm implementations.
+ADDV_KER_PROT(float,        s, addv_sifive_x280_intr)
+ADDV_KER_PROT(double,       d, addv_sifive_x280_intr)
+ADDV_KER_PROT(scomplex,     c, addv_sifive_x280_intr)
+ADDV_KER_PROT(dcomplex,     z, addv_sifive_x280_intr)
+
+AMAXV_KER_PROT(float,       s, amaxv_sifive_x280_asm)
+AMAXV_KER_PROT(double,      d, amaxv_sifive_x280_asm)
+AMAXV_KER_PROT(scomplex,    c, amaxv_sifive_x280_asm)
+AMAXV_KER_PROT(dcomplex,    z, amaxv_sifive_x280_asm)
+
+AXPBYV_KER_PROT(float,      s, axpbyv_sifive_x280_intr)
+AXPBYV_KER_PROT(double,     d, axpbyv_sifive_x280_intr)
+AXPBYV_KER_PROT(scomplex,   c, axpbyv_sifive_x280_intr)
+AXPBYV_KER_PROT(dcomplex,   z, axpbyv_sifive_x280_intr)
+
+AXPYV_KER_PROT(float,       s, axpyv_sifive_x280_intr)
+AXPYV_KER_PROT(double,      d, axpyv_sifive_x280_intr)
+AXPYV_KER_PROT(scomplex,    c, axpyv_sifive_x280_intr)
+AXPYV_KER_PROT(dcomplex,    z, axpyv_sifive_x280_intr)
+
+COPYV_KER_PROT(float,       s, copyv_sifive_x280_asm)
+COPYV_KER_PROT(double,      d, copyv_sifive_x280_asm)
+COPYV_KER_PROT(scomplex,    c, copyv_sifive_x280_asm)
+COPYV_KER_PROT(dcomplex,    z, copyv_sifive_x280_asm)
+
+DOTV_KER_PROT(float,        s, dotv_sifive_x280_intr)
+DOTV_KER_PROT(double,       d, dotv_sifive_x280_intr)
+DOTV_KER_PROT(scomplex,     c, dotv_sifive_x280_intr)
+DOTV_KER_PROT(dcomplex,     z, dotv_sifive_x280_intr)
+
+DOTXV_KER_PROT(float,       s, dotxv_sifive_x280_intr)
+DOTXV_KER_PROT(double,      d, dotxv_sifive_x280_intr)
+DOTXV_KER_PROT(scomplex,    c, dotxv_sifive_x280_intr)
+DOTXV_KER_PROT(dcomplex,    z, dotxv_sifive_x280_intr)
+
+INVERTV_KER_PROT(float,     s, invertv_sifive_x280_asm)
+INVERTV_KER_PROT(double,    d, invertv_sifive_x280_asm)
+INVERTV_KER_PROT(scomplex,  c, invertv_sifive_x280_asm)
+INVERTV_KER_PROT(dcomplex,  z, invertv_sifive_x280_asm)
+
+INVSCALV_KER_PROT(float,    s, invscalv_sifive_x280_asm)
+INVSCALV_KER_PROT(double,   d, invscalv_sifive_x280_asm)
+INVSCALV_KER_PROT(scomplex, c, invscalv_sifive_x280_asm)
+INVSCALV_KER_PROT(dcomplex, z, invscalv_sifive_x280_asm)
+
+SCAL2V_KER_PROT(float,      s, scal2v_sifive_x280_intr)
+SCAL2V_KER_PROT(double,     d, scal2v_sifive_x280_intr)
+SCAL2V_KER_PROT(scomplex,   c, scal2v_sifive_x280_intr)
+SCAL2V_KER_PROT(dcomplex,   z, scal2v_sifive_x280_intr)
+
+SCALV_KER_PROT(float,       s, scalv_sifive_x280_intr)
+SCALV_KER_PROT(double,      d, scalv_sifive_x280_intr)
+SCALV_KER_PROT(scomplex,    c, scalv_sifive_x280_intr)
+SCALV_KER_PROT(dcomplex,    z, scalv_sifive_x280_intr)
+
+SETV_KER_PROT(float,        s, setv_sifive_x280_asm)
+SETV_KER_PROT(double,       d, setv_sifive_x280_asm)
+SETV_KER_PROT(scomplex,     c, setv_sifive_x280_asm)
+SETV_KER_PROT(dcomplex,     z, setv_sifive_x280_asm)
+
+SUBV_KER_PROT(float,        s, subv_sifive_x280_intr)
+SUBV_KER_PROT(double,       d, subv_sifive_x280_intr)
+SUBV_KER_PROT(scomplex,     c, subv_sifive_x280_intr)
+SUBV_KER_PROT(dcomplex,     z, subv_sifive_x280_intr)
+
+SWAPV_KER_PROT(float,       s, swapv_sifive_x280_asm)
+SWAPV_KER_PROT(double,      d, swapv_sifive_x280_asm)
+SWAPV_KER_PROT(scomplex,    c, swapv_sifive_x280_asm)
+SWAPV_KER_PROT(dcomplex,    z, swapv_sifive_x280_asm)
+
+XPBYV_KER_PROT(float,       s, xpbyv_sifive_x280_intr)
+XPBYV_KER_PROT(double,      d, xpbyv_sifive_x280_intr)
+XPBYV_KER_PROT(scomplex,    c, xpbyv_sifive_x280_intr)
+XPBYV_KER_PROT(dcomplex,    z, xpbyv_sifive_x280_intr)
+
+// Level 1f: same per-datatype pattern as the Level 1 prototypes above.
+AXPY2V_KER_PROT(float,      s, axpy2v_sifive_x280_intr)
+AXPY2V_KER_PROT(double,     d, axpy2v_sifive_x280_intr)
+AXPY2V_KER_PROT(scomplex,   c, axpy2v_sifive_x280_intr)
+AXPY2V_KER_PROT(dcomplex,   z, axpy2v_sifive_x280_intr)
+
+AXPYF_KER_PROT(float,       s, axpyf_sifive_x280_asm)
+AXPYF_KER_PROT(double,      d, axpyf_sifive_x280_asm)
+AXPYF_KER_PROT(scomplex,    c, axpyf_sifive_x280_asm)
+AXPYF_KER_PROT(dcomplex,    z, axpyf_sifive_x280_asm)
+
+DOTXF_KER_PROT(float,       s, dotxf_sifive_x280_asm)
+DOTXF_KER_PROT(double,      d, dotxf_sifive_x280_asm)
+DOTXF_KER_PROT(scomplex,    c, dotxf_sifive_x280_asm)
+DOTXF_KER_PROT(dcomplex,    z, dotxf_sifive_x280_asm)
+
+DOTAXPYV_KER_PROT(float,    s, dotaxpyv_sifive_x280_intr)
+DOTAXPYV_KER_PROT(double,   d, dotaxpyv_sifive_x280_intr)
+DOTAXPYV_KER_PROT(scomplex, c, dotaxpyv_sifive_x280_intr)
+DOTAXPYV_KER_PROT(dcomplex, z, dotaxpyv_sifive_x280_intr)
+
+DOTXAXPYF_KER_PROT(float,   s, dotxaxpyf_sifive_x280_asm)
+DOTXAXPYF_KER_PROT(double,  d, dotxaxpyf_sifive_x280_asm)
+DOTXAXPYF_KER_PROT(scomplex,c, dotxaxpyf_sifive_x280_asm)
+DOTXAXPYF_KER_PROT(dcomplex,z, dotxaxpyf_sifive_x280_asm)
+
+// Level 1m: packing kernels; the NxK suffix differs per datatype (presumably the packed panel width - confirm against the blocksizes).
+PACKM_KER_PROT(float,       s, packm_sifive_x280_asm_7xk)
+PACKM_KER_PROT(double,      d, packm_sifive_x280_asm_7xk)
+PACKM_KER_PROT(scomplex,    c, packm_sifive_x280_asm_6xk)
+PACKM_KER_PROT(dcomplex,    z, packm_sifive_x280_asm_6xk)
+PACKM_KER_PROT(float,       s, packm_sifive_x280_asm_64xk)
+PACKM_KER_PROT(double,      d, packm_sifive_x280_asm_32xk)
+PACKM_KER_PROT(scomplex,    c, packm_sifive_x280_asm_32xk)
+PACKM_KER_PROT(dcomplex,    z, packm_sifive_x280_asm_16xk)
+
+// Level 3: gemm micro-kernels plus the _l_ and _u_ gemmtrsm micro-kernels, one per datatype.
+GEMM_UKR_PROT(float,        s, gemm_sifive_x280_asm_7m4)
+GEMM_UKR_PROT(double,       d, gemm_sifive_x280_asm_7m4)
+GEMM_UKR_PROT(scomplex,     c, gemm_sifive_x280_asm_6m2)
+GEMM_UKR_PROT(dcomplex,     z, gemm_sifive_x280_asm_6m2)
+
+GEMMTRSM_UKR_PROT(float,    s, gemmtrsm_l_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(double,   d, gemmtrsm_l_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_l_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_l_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(float,    s, gemmtrsm_u_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(double,   d, gemmtrsm_u_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_u_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_u_sifive_x280_asm)
diff --git a/kernels/sifive_x280/riscv_cmul_macros_asm.h b/kernels/sifive_x280/riscv_cmul_macros_asm.h
new file mode 100644
index 000000000..9c33fd7bc
--- /dev/null
+++ b/kernels/sifive_x280/riscv_cmul_macros_asm.h
@@ -0,0 +1,137 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// macros to emit complex multiplication
+// caveat: the destination registers cannot overlap the source registers!
+// rd = rs1 * rs2
+#define cmul(rd_r, rd_i, rs1_r, rs1_i, rs2_r, rs2_i) \
+  \
+  __asm__(FMUL#rd_r", "#rs1_r", "#rs2_r);\
+  __asm__(FMUL#rd_i", "#rs1_r", "#rs2_i);\
+  __asm__(FNMSUB#rd_r", "#rs1_i", "#rs2_i", "#rd_r);\
+  __asm__(FMADD#rd_i", "#rs1_i", "#rs2_r", "#rd_i)
+
+// vd = vs2 * f[rs1]
+#define vcmul_vf(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \
+  \
+  __asm__("vfmul.vf "#vd_r", "#vs2_r", "#rs1_r);\
+  __asm__("vfmul.vf "#vd_i", "#vs2_r", "#rs1_i);\
+  __asm__("vfnmsac.vf "#vd_r", "#rs1_i", "#vs2_i);\
+  __asm__("vfmacc.vf "#vd_i", "#rs1_r", "#vs2_i)
+
+#define vcmul_vf2(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \
+  \
+  __asm__("vfmul.vf "#vd_r", "#vs2_r", %0" : : "f"(rs1_r));\
+  __asm__("vfmul.vf "#vd_i", "#vs2_r", %0" : : "f"(rs1_i));\
+  __asm__("vfnmsac.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\
+  __asm__("vfmacc.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r))
+
+// vd = conj(vs2) * f[rs1]
+#define vcmul_vf_conj(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \
+  \
+  __asm__("vfmul.vf "#vd_r", "#vs2_r", "#rs1_r);\
+  __asm__("vfmul.vf "#vd_i", "#vs2_r", "#rs1_i);\
+  __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\
+  __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i)
+
+#define vcmul_vf_conj2(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \
+  \
+  __asm__("vfmul.vf "#vd_r", "#vs2_r", %0" : : "f"(rs1_r));\
+  __asm__("vfmul.vf "#vd_i", "#vs2_r", %0" : : "f"(rs1_i));\
+  __asm__("vfmacc.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\
+  __asm__("vfnmsac.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r))
+
+// vd += vs2 * f[rs1]
+#define vcmacc_vf(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \
+  \
+  __asm__("vfmacc.vf "#vd_r", "#rs1_r", "#vs2_r);\
+  __asm__("vfmacc.vf "#vd_i", "#rs1_i", "#vs2_r);\
+  __asm__("vfnmsac.vf "#vd_r", "#rs1_i", "#vs2_i);\
+  __asm__("vfmacc.vf "#vd_i", "#rs1_r", "#vs2_i)
+
+#define vcmacc_vf2(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \
+  \
+  __asm__("vfmacc.vf "#vd_r", %0, "#vs2_r : : "f"(rs1_r));\
+  __asm__("vfmacc.vf "#vd_i", %0, "#vs2_r : : "f"(rs1_i));\
+  __asm__("vfnmsac.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\
+  __asm__("vfmacc.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r))
+
+// vd += conj(vs2) * f[rs1]
+#define vcmacc_vf_conj(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \
+  \
+  __asm__("vfmacc.vf "#vd_r", "#rs1_r", "#vs2_r);\
+  __asm__("vfmacc.vf "#vd_i", "#rs1_i", "#vs2_r);\
+  __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\
+  __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i)
+
+// vd -= vs2 * f[rs1]
+#define vcnmsac_vf(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \
+  \
+  __asm__("vfnmsac.vf "#vd_r", "#rs1_r", "#vs2_r);\
+  __asm__("vfnmsac.vf "#vd_i", "#rs1_i", "#vs2_r);\
+  __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\
+  __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i)
+
+// vd = vs2 * vs1
+#define vcmul_vv(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \
+  \
+  __asm__("vfmul.vv "#vd_r", "#vs2_r", "#vs1_r);\
+  __asm__("vfmul.vv "#vd_i", "#vs2_r", "#vs1_i);\
+  __asm__("vfnmsac.vv "#vd_r", "#vs2_i", "#vs1_i);\
+  __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r)
+
+// vd = vs2 * conj(vs1)
+#define vcmul_vv_conj(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \
+  \
+  __asm__("vfmul.vv "#vd_r", "#vs2_r", "#vs1_r);\
+  __asm__("vfmul.vv "#vd_i", "#vs2_r", "#vs1_i);\
+  __asm__("vfmacc.vv "#vd_r", "#vs2_i", "#vs1_i);\
+  __asm__("vfmsac.vv "#vd_i", "#vs2_i", "#vs1_r)
+
+// vd += vs2 * vs1
+#define vcmacc_vv(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \
+  \
+  __asm__("vfmacc.vv "#vd_r", "#vs2_r", "#vs1_r);\
+  __asm__("vfmacc.vv "#vd_i", "#vs2_r", "#vs1_i);\
+  __asm__("vfnmsac.vv "#vd_r", "#vs2_i", "#vs1_i);\
+  __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r)
+
+// vd += vs2 * conj(vs1)
+#define vcmacc_vv_conj(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \
+  \
+  __asm__("vfmacc.vv "#vd_r", "#vs2_r", "#vs1_r);\
+  __asm__("vfnmsac.vv "#vd_i", "#vs2_r", "#vs1_i);\
+  __asm__("vfmacc.vv "#vd_r", "#vs2_i", "#vs1_i);\
+  __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r)
+
diff --git a/kernels/sifive_x280/riscv_overloaded_intrinsics.h b/kernels/sifive_x280/riscv_overloaded_intrinsics.h
new file mode 100644
index 000000000..6a1d11b13
--- /dev/null
+++ b/kernels/sifive_x280/riscv_overloaded_intrinsics.h
@@ -0,0 +1,116 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// 6. Configuration-Setting and Utility Functions
+#define RVV_TYPE_F_(PRECISION, LMUL) vfloat##PRECISION##LMUL##_t
+#define RVV_TYPE_F(PRECISION, LMUL) RVV_TYPE_F_(PRECISION, LMUL)
+#define RVV_TYPE_FX_(PRECISION, LMUL, NFIELDS) vfloat##PRECISION##LMUL##x##NFIELDS##_t
+#define RVV_TYPE_FX(PRECISION, LMUL, NFIELDS) RVV_TYPE_FX_(PRECISION, LMUL, NFIELDS)
+#define VSETVL_(PRECISION, LMUL) __riscv_vsetvl_e##PRECISION##LMUL
+#define VSETVL(PRECISION, LMUL) VSETVL_(PRECISION, LMUL)
+
+// 7. Vector Loads and Stores
+// Loads
+#define VLE_V_F_(PRECISION, LMUL)   __riscv_vle##PRECISION##_v_f##PRECISION##LMUL
+#define VLE_V_F(PRECISION, LMUL)   VLE_V_F_(PRECISION, LMUL)
+#define VLSE_V_F_(PRECISION, LMUL) __riscv_vlse##PRECISION##_v_f##PRECISION##LMUL
+#define VLSE_V_F(PRECISION, LMUL) VLSE_V_F_(PRECISION, LMUL)
+#define VLSEG2_V_F_(PRECISION, LMUL, NFIELDS)   __riscv_vlseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VLSEG2_V_F(PRECISION, LMUL, NFIELDS)   VLSEG2_V_F_(PRECISION, LMUL, NFIELDS)
+#define VLSSEG2_V_F_(PRECISION, LMUL, NFIELDS)   __riscv_vlsseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VLSSEG2_V_F(PRECISION, LMUL, NFIELDS)   VLSSEG2_V_F_(PRECISION, LMUL, NFIELDS)
+// Stores
+#define VSE_V_F_(PRECISION, LMUL)   __riscv_vse##PRECISION##_v_f##PRECISION##LMUL
+#define VSE_V_F(PRECISION, LMUL) VSE_V_F_(PRECISION, LMUL)
+#define VSSE_V_F_(PRECISION, LMUL) __riscv_vsse##PRECISION##_v_f##PRECISION##LMUL
+#define VSSE_V_F(PRECISION, LMUL) VSSE_V_F_(PRECISION, LMUL)
+#define VSSEG2_V_F_(PRECISION, LMUL, NFIELDS)   __riscv_vsseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSEG2_V_F(PRECISION, LMUL, NFIELDS) VSSEG2_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSSSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSSEG2_V_F(PRECISION, LMUL, NFIELDS) VSSSEG2_V_F_(PRECISION, LMUL, NFIELDS)
+
+// 13. Vector Floating-Point Operations
+#define VFADD_VV_(PRECISION, LMUL) __riscv_vfadd_vv_f##PRECISION##LMUL
+#define VFADD_VV(PRECISION, LMUL) VFADD_VV_(PRECISION, LMUL)
+#define VFSUB_VV_(PRECISION, LMUL) __riscv_vfsub_vv_f##PRECISION##LMUL
+#define VFSUB_VV(PRECISION, LMUL) VFSUB_VV_(PRECISION, LMUL)
+#define VFMUL_VF_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL
+#define VFMUL_VF(PRECISION, LMUL) VFMUL_VF_(PRECISION, LMUL)
+#define VFMUL_VV_(PRECISION, LMUL) __riscv_vfmul_vv_f##PRECISION##LMUL
+#define VFMUL_VV(PRECISION, LMUL) VFMUL_VV_(PRECISION, LMUL)
+#define VFMUL_VF_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL
+#define VFMUL_VF(PRECISION, LMUL) VFMUL_VF_(PRECISION, LMUL)
+#define VFMACC_VF_(PRECISION, LMUL) __riscv_vfmacc_vf_f##PRECISION##LMUL
+#define VFMACC_VF(PRECISION, LMUL) VFMACC_VF_(PRECISION, LMUL)
+#define VFMACC_VV_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL
+#define VFMACC_VV(PRECISION, LMUL) VFMACC_VV_(PRECISION, LMUL)
+#define VFMACC_VV_TU_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL##_tu
+#define VFMACC_VV_TU(PRECISION, LMUL) VFMACC_VV_TU_(PRECISION, LMUL)
+#define VFMSAC_VF_(PRECISION, LMUL) __riscv_vfmsac_vf_f##PRECISION##LMUL
+#define VFMSAC_VF(PRECISION, LMUL) VFMSAC_VF_(PRECISION, LMUL)
+#define VFNMSAC_VF_(PRECISION, LMUL) __riscv_vfnmsac_vf_f##PRECISION##LMUL
+#define VFNMSAC_VF(PRECISION, LMUL) VFNMSAC_VF_(PRECISION, LMUL)
+#define VFNMSAC_VV_TU_(PRECISION, LMUL) __riscv_vfnmsac_vv_f##PRECISION##LMUL##_tu
+#define VFNMSAC_VV_TU(PRECISION, LMUL) VFNMSAC_VV_TU_(PRECISION, LMUL)
+#define VFMADD_VF_(PRECISION, LMUL) __riscv_vfmadd_vf_f##PRECISION##LMUL
+#define VFMADD_VF(PRECISION, LMUL)  VFMADD_VF_(PRECISION, LMUL)
+#define VFMSUB_VF_(PRECISION, LMUL) __riscv_vfmsub_vf_f##PRECISION##LMUL
+#define VFMSUB_VF(PRECISION, LMUL) VFMSUB_VF_(PRECISION, LMUL)
+#define VFNEG_VF_(PRECISION, LMUL) __riscv_vfneg_v_f##PRECISION##LMUL
+#define VFNEG_VF(PRECISION, LMUL)  VFNEG_VF_(PRECISION, LMUL)
+#define VFMV_V_V_(PRECISION, LMUL) VREINTERPRET_V_I_F(PRECISION, LMUL)(  __riscv_vmv_v_v_i##PRECISION##LMUL( VREINTERPRET_V_F_I(PRECISION, LMUL) CURRY_1ARG
+#define VFMV_V_V(PRECISION, LMUL) VFMV_V_V_(PRECISION, LMUL)
+
+// 14. Vector Reduction Operations
+#define VF_REDUSUM_VS_(PRECISION, LMUL) __riscv_vfredusum_vs_f##PRECISION##LMUL##_f##PRECISION##m1
+#define VF_REDUSUM_VS(PRECISION, LMUL) VF_REDUSUM_VS_(PRECISION, LMUL)
+
+// 16. Vector Permutation Operations
+#define VFMV_S_F_(PRECISION, LMUL) __riscv_vfmv_s_f_f##PRECISION##LMUL
+#define VFMV_S_F(PRECISION, LMUL) VFMV_S_F_(PRECISION, LMUL)
+#define VFMV_F_S_(PRECISION) __riscv_vfmv_f_s_f##PRECISION##m1_f##PRECISION
+#define VFMV_F_S(PRECISION) VFMV_F_S_(PRECISION)
+
+// Miscellaneous Vector Function
+#define VREINTERPRET_V_I_F_(PRECISION, LMUL) __riscv_vreinterpret_v_i##PRECISION##LMUL##_f##PRECISION##LMUL
+#define VREINTERPRET_V_I_F(PRECISION, LMUL) VREINTERPRET_V_I_F_(PRECISION, LMUL)
+#define VREINTERPRET_V_F_I_(PRECISION, LMUL) __riscv_vreinterpret_v_f##PRECISION##LMUL##_i##PRECISION##LMUL
+#define VREINTERPRET_V_F_I(PRECISION, LMUL) VREINTERPRET_V_F_I_(PRECISION, LMUL)
+#define VGET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vget_v_f##PRECISION##LMUL##x##NFIELDS##_f##PRECISION##LMUL
+#define VGET_V_F(PRECISION, LMUL, NFIELDS) VGET_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vset_v_f##PRECISION##LMUL##_f##PRECISION##LMUL##x##NFIELDS
+#define VSET_V_F(PRECISION, LMUL, NFIELDS) VSET_V_F_(PRECISION, LMUL, NFIELDS)
+
+// Non-vector functions
+#define CURRY_1ARG(arg1, ...) (arg1), __VA_ARGS__))
diff --git a/travis/do_riscv.sh b/travis/do_riscv.sh
index a51d33061..56c2b85c2 100755
--- a/travis/do_riscv.sh
+++ b/travis/do_riscv.sh
@@ -3,16 +3,19 @@
 set -e
 set -x
 
-TAG=2023.02.25
+TAG=2023.10.18
 
 # The prebuilt toolchains only support hardfloat, so we only
 # test these for now.
 case $1 in
 	"rv32iv")
-	TARBALL=riscv32-glibc-ubuntu-20.04-nightly-${TAG}-nightly.tar.gz
+	TARBALL=riscv32-glibc-ubuntu-20.04-gcc-nightly-${TAG}-nightly.tar.gz
 	;;
 	"rv64iv")
-	TARBALL=riscv64-glibc-ubuntu-20.04-nightly-${TAG}-nightly.tar.gz
+	TARBALL=riscv64-glibc-ubuntu-20.04-gcc-nightly-${TAG}-nightly.tar.gz
+	;;
+	"sifive_x280")
+	TARBALL=riscv64-glibc-ubuntu-20.04-llvm-nightly-${TAG}-nightly.tar.gz
 	;;
 	*)
 	exit 1

From f7ce54a252028483e4c6af619015eb22063d5541 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Fri, 3 Nov 2023 15:52:57 -0500
Subject: [PATCH 176/230] CREDITS file update.

---
 CREDITS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CREDITS b/CREDITS
index fa99b0572..bca98dadd 100644
--- a/CREDITS
+++ b/CREDITS
@@ -136,6 +136,7 @@ but many others have contributed code, ideas, and feedback, including
   RuQing Xu                @xrq-phys                  (The University of Tokyo)
   Srinivas Yadav           @srinivasyadav18
   Costas Yamin             @cosstas
+  Michael Yeh              @myeh01                    (SiFive)
   Chenhan Yu               @ChenhanYu                 (The University of Texas at Austin)
   Roman Yurchak            @rth                       (Symerio)
   Stefano Zampini          @stefanozampini

From 2d9439298b336aa6d0ee000a5285a3adb4e6d462 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Tue, 21 Nov 2023 12:18:07 -0600
Subject: [PATCH 177/230] Allow users to define [sd]complex using std::complex
 (#784)

Details:
- In C++ applications, it makes a lot of sense to interface to BLIS
  using C++'s standard complex number library, which uses a template
  class std::complex. Obviously BLIS doesn't know anything about this
  and defaults to a custom struct to represent complex numbers. This PR
  updates the bli_[cz]{real,imag}() functions to accept std::complex
  numbers when a C++ compiler is being used. Note that this has no
  effect on the compilation of the BLIS library (or testsuite), and only
  comes into play when including blis.h into a C++ project and forcing
  the use of std::complex for scomplex and dcomplex.
- The application can explicitly request std::complex-based types via:

    #define BLIS_ENABLE_STD_COMPLEX
    #include <blis.h>
    // Call BLIS functions using std::complex<double> here.

- Fixed a bug in the definition of some scalar level-0 macros, since
  bli_creal()/bli_cimag() and bli_zreal()/bli_zimag() are no longer
  interchangeable.
---
 frame/include/bli_complex_macro_defs.h  | 85 ++++++++++++++++++++++++-
 frame/include/bli_macro_defs.h          |  2 +-
 frame/include/bli_obj_macro_defs.h      |  8 +--
 frame/include/bli_type_defs.h           | 43 ++++++++++++-
 frame/include/level0/1e/bli_scal21es.h  | 48 +++++++-------
 frame/include/level0/1e/bli_scal2j1es.h | 48 +++++++-------
 6 files changed, 177 insertions(+), 57 deletions(-)

diff --git a/frame/include/bli_complex_macro_defs.h b/frame/include/bli_complex_macro_defs.h
index fca6ccd86..f9e22ef0a 100644
--- a/frame/include/bli_complex_macro_defs.h
+++ b/frame/include/bli_complex_macro_defs.h
@@ -45,7 +45,88 @@
 #define bli_dimag( x )  ( 0.0 )
 
 
-#ifndef BLIS_ENABLE_C99_COMPLEX
+#if defined(__cplusplus) && defined(BLIS_ENABLE_STD_COMPLEX)
+
+} // extern "C"
+
+// Create functions bli_[cz]{real,imag} for std::complex<T> which mimic those
+// for the simple struct version. Since normally x.real/x.imag are
+// lvalues, we have to create a wrapper since x.real()/x.imag() in std::complex
+// are rvalues. These will only be used if the user has typedef'd scomplex as
+// std::complex<float> and dcomplex as std::complex<double> themselves.
+
+#include <complex>
+
+template <typename T, bool Imag>
+struct bli_complex_wrapper
+{
+	std::complex<T>& ref;
+
+	bli_complex_wrapper(std::complex<T>& ref) : ref(ref) {}
+
+	operator T() const { return Imag ? ref.imag() : ref.real(); }
+
+	bli_complex_wrapper& operator=(const bli_complex_wrapper& other)
+	{
+		return *this = static_cast<T>( other );
+	}
+
+	bli_complex_wrapper& operator=(T other)
+	{
+		if (Imag)
+			ref.imag(other);
+		else
+			ref.real(other);
+		return *this;
+	}
+};
+
+inline bli_complex_wrapper<float,false> bli_creal( std::complex<float>& x )
+{
+	return x;
+}
+
+inline float bli_creal( const std::complex<float>& x )
+{
+	return x.real();
+}
+
+inline bli_complex_wrapper<float,true> bli_cimag( std::complex<float>& x )
+{
+	return x;
+}
+
+inline float bli_cimag( const std::complex<float>& x )
+{
+	return x.imag();
+}
+
+inline bli_complex_wrapper<double,false> bli_zreal( std::complex<double>& x )
+{
+	return x;
+}
+
+inline double bli_zreal( const std::complex<double>& x )
+{
+	return x.real();
+}
+
+inline bli_complex_wrapper<double,true> bli_zimag( std::complex<double>& x )
+{
+	return x;
+}
+
+inline double bli_zimag( const std::complex<double>& x )
+{
+	return x.imag();
+}
+
+#define __typeof__(x) auto
+
+extern "C"
+{
+
+#elif !defined(BLIS_ENABLE_C99_COMPLEX)
 
 
 #define bli_creal( x )  ( (x).real )
@@ -56,6 +137,8 @@
 
 #else // ifdef BLIS_ENABLE_C99_COMPLEX
 
+// Note that these definitions probably don't work because of constructs
+// like `bli_zreal( x ) = yr`.
 
 #define bli_creal( x )  ( crealf(x) )
 #define bli_cimag( x )  ( cimagf(x) )
diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h
index be45a12e3..927c46644 100644
--- a/frame/include/bli_macro_defs.h
+++ b/frame/include/bli_macro_defs.h
@@ -100,8 +100,8 @@
 #include "bli_misc_macro_defs.h"
 #include "bli_edge_case_macro_defs.h"
 #include "bli_param_macro_defs.h"
-#include "bli_obj_macro_defs.h"
 #include "bli_complex_macro_defs.h"
+#include "bli_obj_macro_defs.h"
 #include "bli_scalar_macro_defs.h"
 #include "bli_error_macro_defs.h"
 #include "bli_blas_macro_defs.h"
diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h
index 9adaef211..59ea87a1e 100644
--- a/frame/include/bli_obj_macro_defs.h
+++ b/frame/include/bli_obj_macro_defs.h
@@ -1263,10 +1263,10 @@ BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t
 	bli_obj_set_scalar_dt( dt, obj );
 	void* s = bli_obj_internal_scalar_buffer( obj );
 
-	if      ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F;
-	                                          (( scomplex* )s)->imag = 0.0F; }
-	else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0;
-	                                          (( dcomplex* )s)->imag = 0.0; }
+	if      ( bli_dt_prec_is_single( dt ) ) { bli_creal( *( scomplex* )s ) = 1.0F;
+	                                          bli_cimag( *( scomplex* )s ) = 0.0F; }
+	else if ( bli_dt_prec_is_double( dt ) ) { bli_zreal( *( dcomplex* )s ) = 1.0;
+	                                          bli_zimag( *( dcomplex* )s ) = 0.0; }
 }
 
 // Finish the initialization started by the 1x1-specific static initializer
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index 2f81a4749..b63cdb2c1 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -133,14 +133,51 @@ typedef uint32_t objbits_t;  // object information bit field
 
 // -- Complex types --
 
-#ifdef BLIS_ENABLE_C99_COMPLEX
+#if defined(__cplusplus) && defined(BLIS_ENABLE_STD_COMPLEX)
+
+	} //extern "C"
+
+	#include <complex>
+
+	// Typedef official C++ complex types to BLIS complex type names.
+
+	// This cpp guard provides a temporary hack to allow libflame
+	// interoperability with BLIS.
+	#ifndef _DEFINED_SCOMPLEX
+	#define _DEFINED_SCOMPLEX
+	typedef std::complex<float> scomplex;
+	#endif
+
+	// This cpp guard provides a temporary hack to allow libflame
+	// interoperability with BLIS.
+	#ifndef _DEFINED_DCOMPLEX
+	#define _DEFINED_DCOMPLEX
+	typedef std::complex<double> dcomplex;
+	#endif
+
+	extern "C"
+	{
+
+#elif defined(BLIS_ENABLE_C99_COMPLEX)
 
 	#if __STDC_VERSION__ >= 199901L
 		#include <complex.h>
 
-		// Typedef official complex types to BLIS complex type names.
-		typedef  float complex scomplex;
+		// Typedef official C99 complex types to BLIS complex type names.
+
+		// This cpp guard provides a temporary hack to allow libflame
+		// interoperability with BLIS.
+		#ifndef _DEFINED_SCOMPLEX
+		#define _DEFINED_SCOMPLEX
+		typedef float complex scomplex;
+		#endif
+
+		// This cpp guard provides a temporary hack to allow libflame
+		// interoperability with BLIS.
+		#ifndef _DEFINED_DCOMPLEX
+		#define _DEFINED_DCOMPLEX
 		typedef double complex dcomplex;
+		#endif
 	#else
 		#error "Configuration requested C99 complex types, but C99 does not appear to be supported."
 	#endif
diff --git a/frame/include/level0/1e/bli_scal21es.h b/frame/include/level0/1e/bli_scal21es.h
index cee7745e8..1cce97399 100644
--- a/frame/include/level0/1e/bli_scal21es.h
+++ b/frame/include/level0/1e/bli_scal21es.h
@@ -92,68 +92,68 @@
 #define bli_sdcscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 #define bli_sccscal21es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_creal(x), bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 #define bli_szcscal21es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_zreal(x), bli_zimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 
 #define bli_dscscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 #define bli_ddcscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 #define bli_dccscal21es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_creal(x), bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 #define bli_dzcscal21es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_zreal(x), bli_zimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 
 #define bli_cscscal21es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_sreal(x), bli_simag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 #define bli_cdcscal21es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_dreal(x), bli_dimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 #define bli_cccscal21es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_creal(x), bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 #define bli_czcscal21es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_zreal(x), bli_zimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 
 #define bli_zscscal21es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_sreal(x), bli_simag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 #define bli_zdcscal21es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_dreal(x), bli_dimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 #define bli_zccscal21es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_creal(x), bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 #define bli_zzcscal21es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_zreal(x), bli_zimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 
 // -- (axy) = (??z) ------------------------------------------------------------
diff --git a/frame/include/level0/1e/bli_scal2j1es.h b/frame/include/level0/1e/bli_scal2j1es.h
index a32c4f2e4..d868f6fb7 100644
--- a/frame/include/level0/1e/bli_scal2j1es.h
+++ b/frame/include/level0/1e/bli_scal2j1es.h
@@ -92,68 +92,68 @@
 #define bli_sdcscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 #define bli_sccscal2j1es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x),  bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x),  bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 #define bli_szcscal2j1es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x),  bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x),  bli_zreal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 
 #define bli_dscscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 #define bli_ddcscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
 #define bli_dccscal2j1es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x),  bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x),  bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 #define bli_dzcscal2j1es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x),  bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x),  bli_zreal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 
 #define bli_cscscal2j1es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x),  bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x),  bli_sreal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 #define bli_cdcscal2j1es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x),  bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x),  bli_dreal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 #define bli_cccscal2j1es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x),  bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x),  bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 #define bli_czcscal2j1es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x),  bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x),  bli_zreal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 
 #define bli_zscscal2j1es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x),  bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x),  bli_sreal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 #define bli_zdcscal2j1es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x),  bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x),  bli_dreal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 #define bli_zccscal2j1es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x),  bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x),  bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 #define bli_zzcscal2j1es( a, x, yri, yir ) \
 { \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x),  bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
+	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_creal(yri), bli_cimag(yri) ); \
+	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x),  bli_zreal(x), bli_creal(yir), bli_cimag(yir) ); \
 }
 
 // -- (axy) = (??z) ------------------------------------------------------------

From 141a6c9a8e7557d9c7d28aecedec9dc5377dba13 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Tue, 21 Nov 2023 12:26:43 -0600
Subject: [PATCH 178/230] Install helper headers to INCDIR prefix. (#787)

Details:
- Install one-line headers to INCDIR whose entire purpose is to
  #include the actual headers within the local 'blis' header directory
  so that applications can #include "blis.h" instead of #include
  <blis/blis.h> (and/or "cblas.h" instead of <blis/cblas.h> if CBLAS is
  enabled) when headers are installed to global paths. (Note that
  INCDIR is the installation prefix for headers as specified by
  '--includedir=INCDIR', which defaults to 'PREFIX/include' if not
  specified.) Not sure how this problem went unreported for so long,
  since presumably any user trying to #include "blis.h" from a global
  installation would have encountered a compiler error.
- The one-line blis.h and cblas.h headers now reside in the 'build'
  directory, ready to install as is.
- Thanks to Jed Brown for reporting this via Issue #786, and to
  Devin Matthews and Mo Zhou for their engagement.
- Harmonized the rule in the top-level Makefile for installing blis.pc
  into SHAREDIR/pkgconfig with conventions for others vis-a-vis
  verbosity/non-verbosity.
---
 Makefile      | 70 +++++++++++++++++++++++++++++++++++++++++++--------
 build/blis.h  |  1 +
 build/cblas.h |  1 +
 common.mk     | 10 +++++++-
 4 files changed, 70 insertions(+), 12 deletions(-)
 create mode 100644 build/blis.h
 create mode 100644 build/cblas.h

diff --git a/Makefile b/Makefile
index d930b3f19..813554f57 100644
--- a/Makefile
+++ b/Makefile
@@ -57,7 +57,7 @@
         testblis testblis-fast testblis-md testblis-salt \
         check checkblas \
         checkblis checkblis-fast checkblis-md checkblis-salt \
-        install-headers install-libs install-lib-symlinks \
+        install-headers install-helper-headers install-libs install-lib-symlinks \
         showconfig \
         clean cleanmk cleanh cleanlib distclean \
         cleantest cleanblastest cleanblistest \
@@ -282,11 +282,14 @@ endif
 #
 
 # Define a list of headers to install. The default is to only install blis.h.
-HEADERS_TO_INSTALL := $(BLIS_H_FLAT)
-
-# If CBLAS is enabled, we also install cblas.h so the user does not need to
-# change their source code to #include "blis.h" in order to access the CBLAS
-# function prototypes and enums.
+HEADERS_TO_INSTALL      := $(BLIS_H_FLAT)
+
+# If CBLAS is enabled, we also install cblas.h. This allows the user to continue
+# using #include "cblas.h" in their application, if they wish. (NOTE: Even if we
+# didn't install cblas.h, the user could *still* access CBLAS definitions and
+# function prototypes, but they would have to update their source code to use
+# #include "blis.h" instead of #include "cblas.h" since the latter header file
+# would not exist.)
 ifeq ($(MK_ENABLE_CBLAS),yes)
 HEADERS_TO_INSTALL += $(CBLAS_H_FLAT)
 endif
@@ -297,6 +300,19 @@ ifeq ($(INSTALL_HH),yes)
 HEADERS_TO_INSTALL += $(wildcard $(VEND_CPP_PATH)/*.hh)
 endif
 
+# Define a list of so-called helper headers to install. These helper headers
+# are very simple headers that go one directory up from INCDIR/blis (which
+# by default is PREFIX/include/blis, where PREFIX is the install prefix). The
+# default is to only install the blis.h helper header.
+HELP_HEADERS_TO_INSTALL := $(HELP_BLIS_H_PATH)
+HELP_HEADERS_INSTALLED  := $(INSTALL_INCDIR)/$(BLIS_H)
+
+# If CBLAS is enabled, we also install the cblas.h helper header.
+ifeq ($(MK_ENABLE_CBLAS),yes)
+HELP_HEADERS_TO_INSTALL += $(HELP_CBLAS_H_PATH)
+HELP_HEADERS_INSTALLED  += $(INSTALL_INCDIR)/$(CBLAS_H)
+endif
+
 
 
 #
@@ -1034,8 +1050,9 @@ endif
 
 # --- Install header rules ---
 
-install-headers: check-env $(MK_INCL_DIR_INST)
+install-headers: check-env $(MK_INCL_DIR_INST) install-helper-headers
 
+# Rule for installing main headers.
 $(MK_INCL_DIR_INST): $(HEADERS_TO_INSTALL) $(CONFIG_MK_FILE)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(MKDIR) $(@)
@@ -1046,6 +1063,23 @@ else
 	@$(INSTALL) -m 0644 $(HEADERS_TO_INSTALL) $(@)
 endif
 
+install-helper-headers: check-env $(HELP_HEADERS_INSTALLED)
+
+# A rule to install a helper header file.
+define make-helper-header-rule
+$(INSTALL_INCDIR)/$(notdir $(1)): $(BUILD_PATH)/$(notdir $(1)) $(CONFIG_MK_FILE)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(MKDIR) $(INSTALL_INCDIR)
+	$(INSTALL) -m 0644 $$(<) $$(@)
+else
+	@$(MKDIR) $(INSTALL_INCDIR)
+	@echo "Installing $$(@F) helper header into $(INSTALL_INCDIR)/"
+	@$(INSTALL) -m 0644 $$(<) $$(@)
+endif
+endef
+
+# Instantiate the rule above for each helper header file to install.
+$(foreach h, $(HELP_HEADERS_TO_INSTALL), $(eval $(call make-helper-header-rule,$(h))))
 
 # --- Install share rules ---
 
@@ -1068,11 +1102,9 @@ else
 	               $(@)/$(CONFIG_DIR)/$(CONFIG_NAME)/
 endif
 
-$(PC_SHARE_DIR_INST):  $(PC_IN_FILE)
+$(PC_SHARE_DIR_INST): $(PC_IN_FILE)
+ifeq ($(ENABLE_VERBOSE),yes)
 	$(MKDIR) $(@)
-ifeq ($(ENABLE_VERBOSE),no)
-	@echo "Installing $(PC_OUT_FILE) into $(@)/"
-endif
 	$(shell cat "$(PC_IN_FILE)" \
 	| sed -e "s#@PACKAGE_VERSION@#$(VERSION)#g" \
 	| sed -e "s#@prefix@#$(prefix)#g" \
@@ -1082,6 +1114,19 @@ endif
 	| sed -e "s#@LDFLAGS@#$(LDFLAGS)#g" \
 	> "$(PC_OUT_FILE)" )
 	$(INSTALL) -m 0644 $(PC_OUT_FILE) $(@)
+else
+	@$(MKDIR) $(@)
+	@echo "Installing $(PC_OUT_FILE) into $(@)/"
+	@$(shell cat "$(PC_IN_FILE)" \
+	| sed -e "s#@PACKAGE_VERSION@#$(VERSION)#g" \
+	| sed -e "s#@prefix@#$(prefix)#g" \
+	| sed -e "s#@exec_prefix@#$(exec_prefix)#g" \
+	| sed -e "s#@libdir@#$(libdir)#g" \
+	| sed -e "s#@includedir@#$(includedir)#g" \
+	| sed -e "s#@LDFLAGS@#$(LDFLAGS)#g" \
+	> "$(PC_OUT_FILE)" )
+	@$(INSTALL) -m 0644 $(PC_OUT_FILE) $(@)
+endif
 
 # --- Install library rules ---
 
@@ -1401,9 +1446,12 @@ endif
 uninstall-headers: check-env
 ifeq ($(ENABLE_VERBOSE),yes)
 	- $(RM_RF) $(MK_INCL_DIR_INST)
+	- $(RM_RF) $(HELP_HEADERS_INSTALLED)
 else
 	@echo "Uninstalling directory '$(notdir $(MK_INCL_DIR_INST))' from $(dir $(MK_INCL_DIR_INST))"
 	@- $(RM_RF) $(MK_INCL_DIR_INST)
+	@echo "Uninstalling $(notdir $(HELP_HEADERS_INSTALLED)) from $(INSTALL_INCDIR)/"
+	@- $(RM_RF) $(HELP_HEADERS_INSTALLED)
 endif
 
 uninstall-share: check-env
diff --git a/build/blis.h b/build/blis.h
new file mode 100644
index 000000000..999edb6a5
--- /dev/null
+++ b/build/blis.h
@@ -0,0 +1 @@
+#include <blis/blis.h>
diff --git a/build/cblas.h b/build/cblas.h
new file mode 100644
index 000000000..f9ab36872
--- /dev/null
+++ b/build/cblas.h
@@ -0,0 +1 @@
+#include <blis/cblas.h>
diff --git a/common.mk b/common.mk
index de19e7cc0..c43578f19 100644
--- a/common.mk
+++ b/common.mk
@@ -1179,6 +1179,10 @@ BLIS_H_SRC_PATH := $(filter %/$(BLIS_H), $(FRAME_H99_FILES))
 # blis.h file.
 BLIS_H_FLAT     := $(BASE_INC_PATH)/$(BLIS_H)
 
+# Construct the path to the helper blis.h file that will reside one directory
+# up from the installed copy of blis.h.
+HELP_BLIS_H_PATH := $(BUILD_DIR)/$(BLIS_H)
+
 
 #
 # --- cblas.h header definitions -----------------------------------------------
@@ -1193,7 +1197,11 @@ CBLAS_H_DIRPATH  := $(dir $(CBLAS_H_SRC_PATH))
 
 # Construct the path to what will be the intermediate flattened/monolithic
 # cblas.h file.
-CBLAS_H_FLAT    := $(BASE_INC_PATH)/$(CBLAS_H)
+CBLAS_H_FLAT      := $(BASE_INC_PATH)/$(CBLAS_H)
+
+# Construct the path to the helper cblas.h file that will reside one directory
+# up from the installed copy of cblas.h.
+HELP_CBLAS_H_PATH := $(BUILD_DIR)/$(CBLAS_H)
 
 
 #

From 1236ddab455ef3a6293ab394ff06b3a19c2913d9 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Sun, 3 Dec 2023 16:42:34 -0600
Subject: [PATCH 179/230] Fixed random segfault in test/3 drivers. (#788)

Details:
- Fixed a segfault in the non-gemm test drivers in test/3 that was the
  result of sometimes leaving either .n_str or .k_str fields of the
  params_t struct uninitialized, depending on the operation in question.
  For example, in test_hemm.c, init_def_params() would only initialize
  the .m_str and .n_str fields, but not the .k_str field. Even though
  hemm doesn't use a 'k' dimension, the proc_params() function (called
  via parse_cl_params()) universally attempts to convert all three into
  integers via sscanf(), which was understandably failing when one of
  those strings was a NULL pointer. I'm not sure how this code ever
  worked to begin with. Special thanks to Leick Robinson for finding and
  reporting this bug.
---
 test/3/test_hemm.c | 1 +
 test/3/test_herk.c | 1 +
 test/3/test_trmm.c | 1 +
 test/3/test_trsm.c | 1 +
 4 files changed, 4 insertions(+)

diff --git a/test/3/test_hemm.c b/test/3/test_hemm.c
index d04d8cab2..9f40fbc00 100644
--- a/test/3/test_hemm.c
+++ b/test/3/test_hemm.c
@@ -358,6 +358,7 @@ void init_def_params( params_t* params )
 	params->ps_str    = GLOB_DEF_PS_STR;
 	params->m_str     = GLOB_DEF_M_STR;
 	params->n_str     = GLOB_DEF_N_STR;
+	params->k_str     = GLOB_DEF_K_STR;
 
 	params->nr_str    = GLOB_DEF_NR_STR;
 
diff --git a/test/3/test_herk.c b/test/3/test_herk.c
index a713b6766..9e94bef52 100644
--- a/test/3/test_herk.c
+++ b/test/3/test_herk.c
@@ -336,6 +336,7 @@ void init_def_params( params_t* params )
 
 	params->ps_str    = GLOB_DEF_PS_STR;
 	params->m_str     = GLOB_DEF_M_STR;
+	params->n_str     = GLOB_DEF_N_STR;
 	params->k_str     = GLOB_DEF_K_STR;
 
 	params->nr_str    = GLOB_DEF_NR_STR;
diff --git a/test/3/test_trmm.c b/test/3/test_trmm.c
index 2ecbb19b1..955a7b878 100644
--- a/test/3/test_trmm.c
+++ b/test/3/test_trmm.c
@@ -340,6 +340,7 @@ void init_def_params( params_t* params )
 	params->ps_str    = GLOB_DEF_PS_STR;
 	params->m_str     = GLOB_DEF_M_STR;
 	params->n_str     = GLOB_DEF_N_STR;
+	params->k_str     = GLOB_DEF_K_STR;
 
 	params->nr_str    = GLOB_DEF_NR_STR;
 
diff --git a/test/3/test_trsm.c b/test/3/test_trsm.c
index 4b92f6128..5fdfa8580 100644
--- a/test/3/test_trsm.c
+++ b/test/3/test_trsm.c
@@ -343,6 +343,7 @@ void init_def_params( params_t* params )
 	params->ps_str    = GLOB_DEF_PS_STR;
 	params->m_str     = GLOB_DEF_M_STR;
 	params->n_str     = GLOB_DEF_N_STR;
+	params->k_str     = GLOB_DEF_K_STR;
 
 	params->nr_str    = GLOB_DEF_NR_STR;
 

From a72e4569f2a03cc3578c019bf7ce25491a44137d Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Wed, 6 Dec 2023 18:21:47 -0600
Subject: [PATCH 180/230] Include bli_config.h before bli_system.h in cblas.h.
 (#789)

Details:
- Previously, in cblas.h, bli_config.h was being #included *after*
  bli_system.h, which meant that the BLIS_ENABLE_SYSTEM macro was
  never defined in time for proper OS detection. This bug only
  affected cblas.h -- blis.h had been correctly #including
  bli_config.h before bli_system.h since fb93d24. Thanks to
  Edward Smyth for reporting this bug and suggesting the fix.
---
 frame/compat/cblas/src/cblas.h | 13 +++++++++----
 frame/include/blis.h           |  9 ++++-----
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/frame/compat/cblas/src/cblas.h b/frame/compat/cblas/src/cblas.h
index 22399ac8d..97aa38036 100644
--- a/frame/compat/cblas/src/cblas.h
+++ b/frame/compat/cblas/src/cblas.h
@@ -3,11 +3,16 @@
 #define CBLAS_H
 #include <stddef.h>
 
-// We need to #include "bli_type_defs.h" in order to pull in the
-// definition of f77_int. But in order to #include that header, we
-// also need to pull in the headers that precede it in blis.h.
-#include "bli_system.h"
+// We need to #include "bli_type_defs.h" in order to pull in the definition of
+// f77_int. But in order to #include that header, we also need to pull in the
+// headers that precede it in blis.h.
+
+// NOTE: bli_config.h must be included before any other BLIS header. It is
+// bootstrapped by ./configure and does not depend on later headers. Moreover
+// these configuration variables are necessary to change some default behaviors
+// (e.g. disable OS detection in bli_system.h in case of --disable-system).
 #include "bli_config.h"
+#include "bli_system.h"
 #include "bli_config_macro_defs.h"
 #include "bli_type_defs.h"
 
diff --git a/frame/include/blis.h b/frame/include/blis.h
index 6292f4745..9286fc3ce 100644
--- a/frame/include/blis.h
+++ b/frame/include/blis.h
@@ -50,11 +50,10 @@ extern "C" {
 
 // -- configure definitions --
 
-// NOTE: bli_config.h header must be included before any BLIS header.
-// It is bootstrapped by ./configure and does not depend on later
-// headers. Moreover, these configuration variables are necessary to change
-// some default behaviors (e.g. disable OS-detection in bli_system.h in case
-// of --disable-system).
+// NOTE: bli_config.h must be included before any other BLIS header. It is
+// bootstrapped by ./configure and does not depend on later headers. Moreover
+// these configuration variables are necessary to change some default behaviors
+// (e.g. disable OS detection in bli_system.h in case of --disable-system).
 #include "bli_config.h"
 
 // -- System and language-related headers --

From c382d8bdccc07e22a341fe04960f0cbf4eec083b Mon Sep 17 00:00:00 2001
From: Igor Zhuravlov <zhuravlov.ip@ya.ru>
Date: Sun, 14 Jan 2024 04:03:31 +0000
Subject: [PATCH 181/230] Fix errors and typos in docs/BLIS*API.md (#791)

Details:
- Fixed errors and unified formatting in docs/BLIS*API.md docs.
---
 docs/BLISObjectAPI.md | 151 +++++-----
 docs/BLISTypedAPI.md  | 632 +++++++++++++++++++++---------------------
 2 files changed, 392 insertions(+), 391 deletions(-)

diff --git a/docs/BLISObjectAPI.md b/docs/BLISObjectAPI.md
index 57c96ccdd..877cdb91d 100644
--- a/docs/BLISObjectAPI.md
+++ b/docs/BLISObjectAPI.md
@@ -177,24 +177,24 @@ The functions listed in this document belong to the "basic" interface subset of
 ```c
 void bli_gemm
      (
-       obj_t* alpha,
-       obj_t* a,
-       obj_t* b,
-       obj_t* beta,
-       obj_t* c,
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
      );
 ```
 while the expert interface is:
 ```c
 void bli_gemm_ex
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm
+       obj_t*   alpha,
+       obj_t*   a,
+       obj_t*   b,
+       obj_t*   beta,
+       obj_t*   c,
+       cntx_t*  cntx,
+       rntm_t*  rntm
      );
 ```
 The expert interface contains two additional parameters: a `cntx_t*` and `rntm_t*`. Note that calling a function from the expert interface with the `cntx_t*` and `rntm_t*` arguments each set to `NULL` is equivalent to calling the corresponding basic interface. Specifically, a `NULL` value passed in for the `cntx_t*` results in a valid context being queried from BLIS, and a `NULL` value passed in for the `rntm_t*` results in the current global settings for multithreading to be used.
@@ -254,12 +254,12 @@ Only objects that were created with automatic allocation must be freed via BLIS
 ```c
 void bli_obj_create
      (
-       num_t  dt,
-       dim_t  m,
-       dim_t  n,
-       inc_t  rs,
-       inc_t  cs,
-       obj_t* obj
+       num_t   dt,
+       dim_t   m,
+       dim_t   n,
+       inc_t   rs,
+       inc_t   cs,
+       obj_t*  obj
      );
 ```
 Initialize an _m x n_ object `obj` and allocate sufficient storage to hold _mn_ elements whose storage type is specified by `dt` and with row and column strides `rs` and `cs`, respectively. This function allocates enough space to enforce alignment of leading dimensions, where the alignment factor is specific to the configuration being used, though the alignment factor is almost always equal to the size of the hardware's SIMD registers.
@@ -271,7 +271,7 @@ After an object created via `bli_obj_create()` is no longer needed, it should be
 ```c
 void bli_obj_free
      (
-       obj_t* obj
+       obj_t*  obj
      );
 ```
 Deallocate (release) an object `obj` that was previously created, typically via `bli_obj_create()`.
@@ -281,10 +281,10 @@ Deallocate (release) an object `obj` that was previously created, typically via
 ```c
 void bli_obj_create_without_buffer
      (
-       num_t  dt,
-       dim_t  m,
-       dim_t  n,
-       obj_t* obj
+       num_t   dt,
+       dim_t   m,
+       dim_t   n,
+       obj_t*  obj
      );
 ```
 Partially initialize an _m x n_ object `obj` that will eventually contain elements whose storage type is specified by `dt`. This function does not result in any memory allocation. Before `obj` can be used, the object must be fully initialized by attaching a buffer via `bli_obj_attach_buffer()`. This function is useful when the user wishes to encapsulate existing buffers into one or more `obj_t` objects.
@@ -295,11 +295,11 @@ An object (partially) initialized via this function should generally not be pass
 ```c
 void bli_obj_attach_buffer
      (
-       void*  p,
-       inc_t  rs,
-       inc_t  cs,
-       inc_t  is,
-       obj_t* obj
+       void*   p,
+       inc_t   rs,
+       inc_t   cs,
+       inc_t   is,
+       obj_t*  obj
      );
 ```
 Given a partially initialized object (i.e., one that has already been passed to `bli_obj_create_without_buffer()`), attach the buffer pointed to by `p` to the object referenced by `obj` and initialize `obj` as containing elements with row and column strides `rs` and `cs`, respectively. The function also initializes the imaginary stride as `is`, which is experimental and not consistently used by all parts of BLIS.
@@ -309,13 +309,13 @@ Given a partially initialized object (i.e., one that has already been passed to
 ```c
 void bli_obj_create_with_attached_buffer
      (
-       num_t  dt,
-       dim_t  m,
-       dim_t  n,
-       void*  p,
-       inc_t  rs,
-       inc_t  cs,
-       obj_t* obj
+       num_t   dt,
+       dim_t   m,
+       dim_t   n,
+       void*   p,
+       inc_t   rs,
+       inc_t   cs,
+       obj_t*  obj
      );
 ```
 Initialize an _m x n_ object `obj` as containing _mn_ elements whose storage type is specified by `dt` and with row and column strides `rs` and `cs`, respectively. The function does not allocate any memory and instead attaches the buffer pointed to by `p`. Note that calling this function is effectively equivalent to calling
@@ -330,10 +330,10 @@ Objects initialized via this function should generally not be passed to `bli_obj
 ```c
 void bli_obj_alloc_buffer
      (
-       inc_t  rs,
-       inc_t  cs,
-       inc_t  is,
-       obj_t* obj
+       inc_t   rs,
+       inc_t   cs,
+       inc_t   is,
+       obj_t*  obj
      );
 ```
 Given a partially initialized _m x n_ object, allocate and attach a buffer large enough to contain _mn_ elements with the row and column strides `rs` and `cs`, respectively. This function allocates enough space to enforce alignment of leading dimensions, where the alignment factor is specific to the configuration being used, though the alignment factor is almost always equal to the size of the hardware's SIMD registers.
@@ -349,8 +349,8 @@ Very few users will likely have a need to call this function. We provide documen
 ```c
 void bli_obj_create_1x1
      (
-       num_t  dt,
-       obj_t* obj
+       num_t   dt,
+       obj_t*  obj
      );
 ```
 Initialize a _1 x 1_ object `obj` and allocate sufficient storage to hold one element whose storage type is specified by `dt`.
@@ -368,9 +368,9 @@ After an object created via `bli_obj_create_1x1()` is no longer needed, it shoul
 ```c
 void bli_obj_create_1x1_with_attached_buffer
      (
-       num_t  dt,
-       void*  p,
-       obj_t* obj
+       num_t   dt,
+       void*   p,
+       obj_t*  obj
      );
 ```
 Initialize a _1 x 1_ object `obj` as containing one element whose storage type is specified by `dt`. The function does not allocate any memory and instead attaches the buffer pointed to by `p`. Note that calling this function is effectively equivalent to calling
@@ -385,8 +385,8 @@ Objects initialized via this function should generally not be passed to `bli_obj
 ```c
 void bli_obj_create_conf_to
      (
-       obj_t* s,
-       obj_t* d
+       obj_t*  s,
+       obj_t*  d
      );
 ```
 Initialize an object `d` with dimensions conformal to those of an existing object `s`. Object `d` is initialized with the same row and column strides as those of `s`. However, the structure, uplo, conjugation, and transposition properties of `s` are **not** inherited by `d`.
@@ -408,8 +408,8 @@ After an object created via `bli_obj_create_conf_to()` is no longer needed, it s
 ```c
 void bli_obj_scalar_init_detached
      (
-       num_t  dt,
-       obj_t* obj
+       num_t   dt,
+       obj_t*  obj
      );
 ```
 Initialize a _1 x 1_ object `obj` using internal storage sufficient to hold one element whose storage type is specified by `dt`. (Internal storage is present within every `obj_t` and is capable of holding on element of any supported type.) This function is similar to `bli_obj_create_1x1()`, except that the object does not trigger any dynamic memory allocation.
@@ -1284,6 +1284,7 @@ Observed object properties: `conj?(alpha)`, `diagoff(A)`, `uplo(A)`.
 ```c
 void bli_scal2m
      (
+       obj_t*  alpha,
        obj_t*  a,
        obj_t*  b
      );
@@ -1403,7 +1404,7 @@ void bli_axpy2v
 ```
 Perform
 ```
-  y := y + conj?(alphax) * conj?(x) + conj?(alphay) * conj?(y)
+  z := z + conj?(alphax) * conj?(x) + conj?(alphay) * conj?(y)
 ```
 where `x`, `y`, and `z` are vectors of length _m_. The kernel, if optimized, is implemented as a fused pair of calls to [axpyv](BLISObjectAPI.md#axpyv).
 
@@ -1425,7 +1426,7 @@ void bli_dotaxpyv
 Perform
 ```
   rho := conj?(x)^T * conj?(y)
-  y   := y + conj?(alpha) * conj?(x)
+  z   := z + conj?(alpha) * conj?(x)
 ```
 where `x`, `y`, and `z` are vectors of length _m_ and `alpha` and `rho` are scalars. The kernel, if optimized, is implemented as a fusion of calls to [dotv](BLISObjectAPI.md#dotv) and [axpyv](BLISObjectAPI.md#axpyv).
 
@@ -2184,9 +2185,9 @@ where, on entry, `scale` and `sumsq` contain `scale_old` and `sumsq_old`, respec
 ```c
 void bli_getsc
      (
-       obj_t*  chi,
-       double* zeta_r,
-       double* zeta_i
+       obj_t*   chi,
+       double*  zeta_r,
+       double*  zeta_i
      );
 ```
 Copy the real and imaginary values from the scalar object `chi` to `zeta_r` and `zeta_i`. If `chi` is stored as a real type, then `zeta_i` is set to zero. (If `chi` is stored in single precision, the corresponding elements are typecast/promoted during the copy.)
@@ -2197,10 +2198,10 @@ Copy the real and imaginary values from the scalar object `chi` to `zeta_r` and
 ```c
 err_t bli_getijv
       (
-        dim_t   i,
-        obj_t*  b,
-        double* ar,
-        double* ai
+        dim_t    i,
+        obj_t*   x,
+        double*  ar,
+        double*  ai
       )
 ```
 Copy the real and imaginary values at the `i`th element of vector object `x` to `ar` and `ai`. If elements of `x` are stored as real types, then only `ar` is overwritten and `ai` is left unchanged. (If `x` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.)
@@ -2212,11 +2213,11 @@ If either the element offset `i` is beyond the vector dimension of `x` or less t
 ```c
 err_t bli_getijm
       (
-        dim_t   i,
-        dim_t   j,
-        obj_t*  b,
-        double* ar,
-        double* ai
+        dim_t    i,
+        dim_t    j,
+        obj_t*   b,
+        double*  ar,
+        double*  ai
       )
 ```
 Copy the real and imaginary values at the (`i`,`j`) element of object `b` to `ar` and `ai`. If elements of `b` are stored as real types, then only `ar` is overwritten and `ai` is left unchanged. (If `b` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.)
@@ -2228,8 +2229,8 @@ If either the row offset `i` is beyond the _m_ dimension of `b` or less than zer
 ```c
 void bli_setsc
      (
-       double* zeta_r,
-       double* zeta_i,
+       double  zeta_r,
+       double  zeta_i,
        obj_t*  chi
      );
 ```
@@ -2272,9 +2273,9 @@ If either the row offset `i` is beyond the _m_ dimension of `b` or less than zer
 ```c
 void bli_eqsc
      (
-       obj_t  chi,
-       obj_t  psi,
-       bool*  is_eq
+       obj_t*  chi,
+       obj_t*  psi,
+       bool*   is_eq
      );
 ```
 Perform an element-wise comparison between scalars `chi` and `psi` and store the boolean result in the `bool` pointed to by `is_eq`.
@@ -2288,9 +2289,9 @@ Observed object properties: `conj?(chi)`, `conj?(psi)`.
 ```c
 void bli_eqv
      (
-       obj_t  x,
-       obj_t  y,
-       bool*  is_eq
+       obj_t*  x,
+       obj_t*  y,
+       bool*   is_eq
      );
 ```
 Perform an element-wise comparison between vectors `x` and `y` and store the boolean result in the `bool` pointed to by `is_eq`.
@@ -2304,9 +2305,9 @@ Observed object properties: `conj?(x)`, `conj?(y)`.
 ```c
 void bli_eqm
      (
-       obj_t  a,
-       obj_t  b,
-       bool*  is_eq
+       obj_t*  a,
+       obj_t*  b,
+       bool*   is_eq
      );
 ```
 Perform an element-wise comparison between matrices `A` and `B` and store the boolean result in the `bool` pointed to by `is_eq`.
@@ -2417,8 +2418,8 @@ Return the amount of time that has elapsed since some fixed time in the past. Th
 ```c
 double bli_clock_min_diff
      (
-       double time_prev_min,
-       double time_start
+       double  time_prev_min,
+       double  time_start
      );
 ```
 This function computes an intermediate value, `time_diff`, equal to `bli_clock() - time_start`, and then tentatively prepares to return the minimum value of `time_diff` and `time_min`. If that minimum value is extremely small (close to zero), the function returns `time_min` instead.
diff --git a/docs/BLISTypedAPI.md b/docs/BLISTypedAPI.md
index 5a7b90f29..28b46ea3e 100644
--- a/docs/BLISTypedAPI.md
+++ b/docs/BLISTypedAPI.md
@@ -120,34 +120,34 @@ The functions listed in this document belong to the "basic" interface subset of
 ```c
 void bli_?gemm
      (
-       trans_t transa,
-       trans_t transb,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb,
-       ctype*  beta,
-       ctype*  c, inc_t rsc, inc_t csc
+       trans_t  transa,
+       trans_t  transb,
+       dim_t    m,
+       dim_t    n,
+       dim_t    k,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb,
+       ctype*   beta,
+       ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 while the expert interface is:
 ```c
 void bli_?gemm_ex
      (
-       trans_t transa,
-       trans_t transb,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb,
-       ctype*  beta,
-       ctype*  c, inc_t rsc, inc_t csc,
-       cntx_t* cntx,
-       rntm_t* rntm
+       trans_t  transa,
+       trans_t  transb,
+       dim_t    m,
+       dim_t    n,
+       dim_t    k,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb,
+       ctype*   beta,
+       ctype*   c, inc_t rsc, inc_t csc,
+       cntx_t*  cntx,
+       rntm_t*  rntm
      );
 ```
 The expert interface contains two additional parameters: a `cntx_t*` and `rntm_t*`. Note that calling a function from the expert interface with the `cntx_t*` and `rntm_t*` arguments each set to `NULL` is equivalent to calling the corresponding basic interface. Specifically, a `NULL` value passed in for the `cntx_t*` results in a valid context being queried from BLIS, and a `NULL` value passed in for the `rntm_t*` results in the current global settings for multithreading to be used.
@@ -508,13 +508,13 @@ Most of these operations are similar to level-1m counterparts, except they only
 ```c
 void bli_?addd
      (
-       doff_t  diagoffa,
-       diag_t  diaga,
-       trans_t transa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb
+       doff_t   diagoffa,
+       diag_t   diaga,
+       trans_t  transa,
+       dim_t    m,
+       dim_t    n,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 
@@ -524,14 +524,14 @@ void bli_?addd
 ```c
 void bli_?axpyd
      (
-       doff_t  diagoffa,
-       diag_t  diaga,
-       trans_t transa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb
+       doff_t   diagoffa,
+       diag_t   diaga,
+       trans_t  transa,
+       dim_t    m,
+       dim_t    n,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 
@@ -541,13 +541,13 @@ void bli_?axpyd
 ```c
 void bli_?copyd
      (
-       doff_t  diagoffa,
-       diag_t  diaga,
-       trans_t transa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb
+       doff_t   diagoffa,
+       diag_t   diaga,
+       trans_t  transa,
+       dim_t    m,
+       dim_t    n,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 
@@ -600,14 +600,14 @@ void bli_?scald
 ```c
 void bli_?scal2d
      (
-       doff_t  diagoffa,
-       diag_t  diaga,
-       trans_t transa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb
+       doff_t   diagoffa,
+       diag_t   diaga,
+       trans_t  transa,
+       dim_t    m,
+       dim_t    n,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 
@@ -632,11 +632,11 @@ void bli_?setd
 ```c
 void bli_?setid
      (
-       doff_t   diagoffa,
-       dim_t    m,
-       dim_t    n,
-       ctype_r* alpha,
-       ctype*   a, inc_t rsa, inc_t csa
+       doff_t    diagoffa,
+       dim_t     m,
+       dim_t     n,
+       ctype_r*  alpha,
+       ctype*    a, inc_t rsa, inc_t csa
      );
 ```
 Set the imaginary components of every element along the diagonal of `a`, as
@@ -666,13 +666,13 @@ specified by `diagoffa`.
 ```c
 void bli_?subd
      (
-       doff_t  diagoffa,
-       diag_t  diaga,
-       trans_t transa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb
+       doff_t   diagoffa,
+       diag_t   diaga,
+       trans_t  transa,
+       dim_t    m,
+       dim_t    n,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 
@@ -682,14 +682,14 @@ void bli_?subd
 ```c
 void bli_?xpbyd
      (
-       doff_t  diagoffa,
-       diag_t  diaga,
-       trans_t transa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  beta,
-       ctype*  b, inc_t rsb, inc_t csb
+       doff_t   diagoffa,
+       diag_t   diaga,
+       trans_t  transa,
+       dim_t    m,
+       dim_t    n,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   beta,
+       ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 
@@ -707,14 +707,14 @@ Level-1m operations perform various level-1 BLAS-like operations on matrices (he
 ```c
 void bli_?addm
      (
-       doff_t  diagoffa,
-       diag_t  diaga,
-       uplo_t  uploa,
-       trans_t transa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb
+       doff_t   diagoffa,
+       diag_t   diaga,
+       uplo_t   uploa,
+       trans_t  transa,
+       dim_t    m,
+       dim_t    n,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 Perform
@@ -729,15 +729,15 @@ where `B` is an _m x n_ matrix, `A` is stored as a dense matrix, or lower- or up
 ```c
 void bli_?axpym
      (
-       doff_t  diagoffa,
-       diag_t  diaga,
-       uplo_t  uploa,
-       trans_t transa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb
+       doff_t   diagoffa,
+       diag_t   diaga,
+       uplo_t   uploa,
+       trans_t  transa,
+       dim_t    m,
+       dim_t    n,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 Perform
@@ -752,14 +752,14 @@ where `B` is an _m x n_ matrix, `A` is stored as a dense matrix, or lower- or up
 ```c
 void bli_?copym
      (
-       doff_t  diagoffa,
-       diag_t  diaga,
-       uplo_t  uploa,
-       trans_t transa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb
+       doff_t   diagoffa,
+       diag_t   diaga,
+       uplo_t   uploa,
+       trans_t  transa,
+       dim_t    m,
+       dim_t    n,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 Perform
@@ -816,15 +816,15 @@ where `A` is an _m x n_ matrix stored as a dense matrix, or lower- or upper-tria
 ```c
 void bli_?scal2m
      (
-       doff_t  diagoffa,
-       diag_t  diaga,
-       uplo_t  uploa,
-       trans_t transa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb
+       doff_t   diagoffa,
+       diag_t   diaga,
+       uplo_t   uploa,
+       trans_t  transa,
+       dim_t    m,
+       dim_t    n,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 Perform
@@ -857,14 +857,14 @@ Set all elements of an _m x n_ matrix `A` to `conjalpha(alpha)`, where `A` is st
 ```c
 void bli_?subm
      (
-       doff_t  diagoffa,
-       diag_t  diaga,
-       uplo_t  uploa,
-       trans_t transa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb
+       doff_t   diagoffa,
+       diag_t   diaga,
+       uplo_t   uploa,
+       trans_t  transa,
+       dim_t    m,
+       dim_t    n,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 Perform
@@ -905,7 +905,7 @@ void bli_?axpy2v
 ```
 Perform
 ```
-  z := y + alphax * conjx(x) + alphay * conjy(y)
+  z := z + alphax * conjx(x) + alphay * conjy(y)
 ```
 where `x`, `y`, and `z` are vectors of length _m_. The kernel, if optimized, is implemented as a fused pair of calls to [axpyv](BLISTypedAPI.md#axpyv).
 
@@ -929,7 +929,7 @@ void bli_?dotaxpyv
 Perform
 ```
   rho := conjxt(x^T) * conjy(y)
-  y   := y + alpha * conjx(x)
+  z   := z + alpha * conjx(x)
 ```
 where `x`, `y`, and `z` are vectors of length _m_ and `alpha` and `rho` are scalars. The kernel, if optimized, is implemented as a fusion of calls to [dotv](BLISTypedAPI.md#dotv) and [axpyv](BLISTypedAPI.md#axpyv).
 
@@ -974,7 +974,7 @@ void bli_?dotxf
 ```
 Perform
 ```
-  y := y + alpha * conjat(A^T) * conjx(x)
+  y := beta * y + alpha * conjat(A^T) * conjx(x)
 ```
 where `A` is an _m x b_ matrix, and `y` and `x` are vectors. The kernel, if optimized, is implemented as a fused series of calls to [dotxv](BLISTypedAPI.md#dotxv) where _b_ is less than or equal to an implementation-dependent fusing factor specific to `dotxf`.
 
@@ -1019,15 +1019,15 @@ Level-2 operations perform various level-2 BLAS-like operations.
 ```c
 void bli_?gemv
      (
-       trans_t transa,
-       conj_t  conjx,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  x, inc_t incx,
-       ctype*  beta,
-       ctype*  y, inc_t incy
+       trans_t  transa,
+       conj_t   conjx,
+       dim_t    m,
+       dim_t    n,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   x, inc_t incx,
+       ctype*   beta,
+       ctype*   y, inc_t incy
      );
 ```
 Perform
@@ -1196,13 +1196,13 @@ where `A` is an _m x m_ symmetric matrix stored in the lower or upper triangle a
 ```c
 void bli_?trmv
      (
-       uplo_t  uploa,
-       trans_t transa,
-       diag_t  diaga,
-       dim_t   m,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  x, inc_t incx
+       uplo_t   uploa,
+       trans_t  transa,
+       diag_t   diaga,
+       dim_t    m,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   x, inc_t incx
      );
 ```
 Perform
@@ -1217,13 +1217,13 @@ where `A` is an _m x m_ triangular matrix stored in the lower or upper triangle
 ```c
 void bli_?trsv
      (
-       uplo_t  uploa,
-       trans_t transa,
-       diag_t  diaga,
-       dim_t   m,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  y, inc_t incy
+       uplo_t   uploa,
+       trans_t  transa,
+       diag_t   diaga,
+       dim_t    m,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   y, inc_t incy
      );
 ```
 Solve the linear system
@@ -1248,16 +1248,16 @@ Level-3 operations perform various level-3 BLAS-like operations.
 ```c
 void bli_?gemm
      (
-       trans_t transa,
-       trans_t transb,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb,
-       ctype*  beta,
-       ctype*  c, inc_t rsc, inc_t csc
+       trans_t  transa,
+       trans_t  transb,
+       dim_t    m,
+       dim_t    n,
+       dim_t    k,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb,
+       ctype*   beta,
+       ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1272,16 +1272,16 @@ where C is an _m x n_ matrix, `transa(A)` is an _m x k_ matrix, and `transb(B)`
 ```c
 void bli_?gemmt
      (
-       uplo_t  uploc,
-       trans_t transa,
-       trans_t transb,
-       dim_t   m,
-       dim_t   k,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb,
-       ctype*  beta,
-       ctype*  c, inc_t rsc, inc_t csc
+       uplo_t   uploc,
+       trans_t  transa,
+       trans_t  transb,
+       dim_t    m,
+       dim_t    k,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb,
+       ctype*   beta,
+       ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1296,17 +1296,17 @@ where C is an _m x m_ matrix, `transa(A)` is an _m x k_ matrix, and `transb(B)`
 ```c
 void bli_?hemm
      (
-       side_t  sidea,
-       uplo_t  uploa,
-       conj_t  conja,
-       trans_t transb,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb,
-       ctype*  beta,
-       ctype*  c, inc_t rsc, inc_t csc
+       side_t   sidea,
+       uplo_t   uploa,
+       conj_t   conja,
+       trans_t  transb,
+       dim_t    m,
+       dim_t    n,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb,
+       ctype*   beta,
+       ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1325,14 +1325,14 @@ if `sidea` is `BLIS_RIGHT`, where `C` and `B` are _m x n_ matrices and `A` is a
 ```c
 void bli_?herk
      (
-       uplo_t  uploc,
-       trans_t transa,
-       dim_t   m,
-       dim_t   k,
-       rtype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       rtype*  beta,
-       ctype*  c, inc_t rsc, inc_t csc
+       uplo_t   uploc,
+       trans_t  transa,
+       dim_t    m,
+       dim_t    k,
+       rtype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       rtype*   beta,
+       ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1349,16 +1349,16 @@ where C is an _m x m_ Hermitian matrix stored in the lower or upper triangle as
 ```c
 void bli_?her2k
      (
-       uplo_t  uploc,
-       trans_t transa,
-       trans_t transb,
-       dim_t   m,
-       dim_t   k,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb,
-       rtype*  beta,
-       ctype*  c, inc_t rsc, inc_t csc
+       uplo_t   uploc,
+       trans_t  transa,
+       trans_t  transb,
+       dim_t    m,
+       dim_t    k,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb,
+       rtype*   beta,
+       ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1375,17 +1375,17 @@ where C is an _m x m_ Hermitian matrix stored in the lower or upper triangle as
 ```c
 void bli_?symm
      (
-       side_t  sidea,
-       uplo_t  uploa,
-       conj_t  conja,
-       trans_t transb,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb,
-       ctype*  beta,
-       ctype*  c, inc_t rsc, inc_t csc
+       side_t   sidea,
+       uplo_t   uploa,
+       conj_t   conja,
+       trans_t  transb,
+       dim_t    m,
+       dim_t    n,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb,
+       ctype*   beta,
+       ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1404,14 +1404,14 @@ if `sidea` is `BLIS_RIGHT`, where `C` and `B` are _m x n_ matrices and `A` is a
 ```c
 void bli_?syrk
      (
-       uplo_t  uploc,
-       trans_t transa,
-       dim_t   m,
-       dim_t   k,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  beta,
-       ctype*  c, inc_t rsc, inc_t csc
+       uplo_t   uploc,
+       trans_t  transa,
+       dim_t    m,
+       dim_t    k,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   beta,
+       ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1426,16 +1426,16 @@ where C is an _m x m_ symmetric matrix stored in the lower or upper triangle as
 ```c
 void bli_?syr2k
      (
-       uplo_t  uploc,
-       trans_t transa,
-       trans_t transb,
-       dim_t   m,
-       dim_t   k,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb,
-       ctype*  beta,
-       ctype*  c, inc_t rsc, inc_t csc
+       uplo_t   uploc,
+       trans_t  transa,
+       trans_t  transb,
+       dim_t    m,
+       dim_t    k,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb,
+       ctype*   beta,
+       ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1450,15 +1450,15 @@ where C is an _m x m_ symmetric matrix stored in the lower or upper triangle as
 ```c
 void bli_?trmm
      (
-       side_t  sidea,
-       uplo_t  uploa,
-       trans_t transa,
-       diag_t  diaga,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb
+       side_t   sidea,
+       uplo_t   uploa,
+       trans_t  transa,
+       diag_t   diaga,
+       dim_t    m,
+       dim_t    n,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 Perform
@@ -1477,18 +1477,18 @@ if `sidea` is `BLIS_RIGHT`, where `B` is an _m x n_ matrix and `A` is a triangul
 ```c
 void bli_?trmm3
      (
-       side_t  sidea,
-       uplo_t  uploa,
-       trans_t transa,
-       diag_t  diaga,
-       trans_t transb,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb,
-       ctype*  beta,
-       ctype*  c, inc_t rsc, inc_t csc
+       side_t   sidea,
+       uplo_t   uploa,
+       trans_t  transa,
+       diag_t   diaga,
+       trans_t  transb,
+       dim_t    m,
+       dim_t    n,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb,
+       ctype*   beta,
+       ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1507,15 +1507,15 @@ if `sidea` is `BLIS_RIGHT`, where `C` and `transb(B)` are _m x n_ matrices and `
 ```c
 void bli_?trsm
      (
-       side_t  sidea,
-       uplo_t  uploa,
-       trans_t transa,
-       diag_t  diaga,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  b, inc_t rsb, inc_t csb
+       side_t   sidea,
+       uplo_t   uploa,
+       trans_t  transa,
+       diag_t   diaga,
+       dim_t    m,
+       dim_t    n,
+       ctype*   alpha,
+       ctype*   a, inc_t rsa, inc_t csa,
+       ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 Solve the linear system with multiple right-hand sides
@@ -1751,11 +1751,11 @@ where, on entry, `scale` and `sumsq` contain `scale_old` and `sumsq_old`, respec
 
 #### getsc
 ```c
-void bli_getsc
+void bli_?getsc
      (
-       ctype*  chi,
-       double* zeta_r,
-       double* zeta_i
+       ctype*   chi,
+       double*  zeta_r,
+       double*  zeta_i
      );
 ```
 Copy the real and imaginary values from the scalar object `chi` to `zeta_r` and `zeta_i`. If `chi` is stored as a real type, then `zeta_i` is set to zero. (If `chi` is stored in single precision, the corresponding elements are typecast/promoted during the copy.)
@@ -1766,10 +1766,10 @@ Copy the real and imaginary values from the scalar object `chi` to `zeta_r` and
 ```c
 err_t bli_?getijv
      (
-       dim_t   i,
-       ctype*  x, incx,
-       double* ar,
-       double* ai
+       dim_t    i,
+       ctype*   x, inc_t incx,
+       double*  ar,
+       double*  ai
      );
 ```
 Copy the real and imaginary values at the `i`th element of vector `x` to `ar` and `ai`. For real domain invocations, only `ar` is overwritten and `ai` is left unchanged. (If `x` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.)
@@ -1781,11 +1781,11 @@ Note that the object-based analogue of [getijv](BLISObjectAPI.md#getijv) does bo
 ```c
 err_t bli_?getijm
      (
-       dim_t   i,
-       dim_t   j,
-       ctype*  b, inc_t rs_b, inc_t cs_b,
-       double* ar,
-       double* ai
+       dim_t    i,
+       dim_t    j,
+       ctype*   b, inc_t rs_b, inc_t cs_b,
+       double*  ar,
+       double*  ai
      );
 ```
 Copy the real and imaginary values at the (`i`,`j`) element of object `b` to `ar` and `ai`. For real domain invocations, only `ar` is overwritten and `ai` is left unchanged. (If `b` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.)
@@ -1795,10 +1795,10 @@ Note that the object-based analogue of [getijm](BLISObjectAPI.md#getijm) does bo
 
 #### setsc
 ```c
-void bli_setsc
+void bli_?setsc
      (
-       double* zeta_r,
-       double* zeta_i,
+       double  zeta_r,
+       double  zeta_i,
        ctype*  chi
      );
 ```
@@ -1813,7 +1813,7 @@ err_t bli_?setijv
        double  ar,
        double  ai,
        dim_t   i,
-       ctype*  x, incx
+       ctype*  x, inc_t incx
      );
 ```
 Copy real and imaginary values `ar` and `ai` to the `i`th element of vector object `x`. For real domain invocations, only `ar` is copied and `ai` is ignored. (If `x` contains elements stored in single precision, the corresponding elements are typecast/demoted during the copy.)
@@ -1841,10 +1841,10 @@ Note that the object-based analogue of [setijm](BLISObjectAPI.md#setijm) does bo
 ```c
 void bli_?eqsc
      (
-       conj_t conjchi,
-       ctype* chi,
-       ctype* psi,
-       bool*  is_eq
+       conj_t  conjchi,
+       ctype*  chi,
+       ctype*  psi,
+       bool*   is_eq
      );
 ```
 Perform an element-wise comparison between scalars `chi` and `psi` and store the boolean result in the `bool` pointed to by `is_eq`.
@@ -1872,15 +1872,15 @@ If `conjx` indicates a conjugation, `x` will be implicitly conjugated for purpos
 ```c
 void bli_?eqm
      (
-       doff_t  diagoffa,
-       diag_t  diaga,
-       uplo_t  uploa,
-       trans_t transa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  a, inc_t rs_a, inc_t cs_a,
-       ctype*  b, inc_t rs_b, inc_t cs_b,
-       bool*   is_eq
+       doff_t   diagoffa,
+       diag_t   diaga,
+       uplo_t   uploa,
+       trans_t  transa,
+       dim_t    m,
+       dim_t    n,
+       ctype*   a, inc_t rs_a, inc_t cs_a,
+       ctype*   b, inc_t rs_b, inc_t cs_b,
+       bool*    is_eq
      );
 ```
 Perform an element-wise comparison between matrices `A` and `B` and store the boolean result in the `bool` pointed to by `is_eq`.
@@ -1903,14 +1903,14 @@ If `transa` indicates a conjugation and/or transposition, then `A` will be conju
 ```c
 void bli_?gemm_*
      (
-       dim_t               k,
-       ctype*     restrict alpha,
-       ctype*     restrict a1,
-       ctype*     restrict b1,
-       ctype*     restrict beta,
-       ctype*     restrict c11, inc_t rsc, inc_t csc,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       dim_t                k,
+       ctype*      restrict alpha,
+       ctype*      restrict a1,
+       ctype*      restrict b1,
+       ctype*      restrict beta,
+       ctype*      restrict c11, inc_t rsc, inc_t csc,
+       auxinfo_t*  restrict data,
+       cntx_t*     restrict cntx
      );
 ```
 Perform
@@ -1928,20 +1928,20 @@ Please see the [Kernel Guide](KernelsHowTo.md) for more information on the `gemm
 ```c
 void bli_?trsm_l_*
      (
-       ctype*     restrict a11,
-       ctype*     restrict b11,
-       ctype*     restrict c11, inc_t rsc, inc_t csc
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       ctype*      restrict a11,
+       ctype*      restrict b11,
+       ctype*      restrict c11, inc_t rsc, inc_t csc,
+       auxinfo_t*  restrict data,
+       cntx_t*     restrict cntx
      );
 
 void bli_?trsm_u_*
      (
-       ctype*     restrict a11,
-       ctype*     restrict b11,
-       ctype*     restrict c11, inc_t rsc, inc_t csc
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       ctype*      restrict a11,
+       ctype*      restrict b11,
+       ctype*      restrict c11, inc_t rsc, inc_t csc,
+       auxinfo_t*  restrict data,
+       cntx_t*     restrict cntx
      );
 ```
 Perform
@@ -1959,28 +1959,28 @@ Please see the [Kernel Guide](KernelsHowTo.md) for more information on the `trsm
 ```c
 void bli_?gemmtrsm_l_*
      (
-       dim_t               k,
-       ctype*     restrict alpha,
-       ctype*     restrict a10,
-       ctype*     restrict a11,
-       ctype*     restrict b01,
-       ctype*     restrict b11,
-       ctype*     restrict c11, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       dim_t                k,
+       ctype*      restrict alpha,
+       ctype*      restrict a10,
+       ctype*      restrict a11,
+       ctype*      restrict b01,
+       ctype*      restrict b11,
+       ctype*      restrict c11, inc_t rs_c, inc_t cs_c,
+       auxinfo_t*  restrict data,
+       cntx_t*     restrict cntx
      );
 
 void bli_?gemmtrsm_u_*
      (
-       dim_t               k,
-       ctype*     restrict alpha,
-       ctype*     restrict a12,
-       ctype*     restrict a11,
-       ctype*     restrict b21,
-       ctype*     restrict b11,
-       ctype*     restrict c11, inc_t rs_c, inc_t cs_c,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
+       dim_t                k,
+       ctype*      restrict alpha,
+       ctype*      restrict a12,
+       ctype*      restrict a11,
+       ctype*      restrict b21,
+       ctype*      restrict b11,
+       ctype*      restrict c11, inc_t rs_c, inc_t cs_c,
+       auxinfo_t*  restrict data,
+       cntx_t*     restrict cntx
      );
 ```
 Perform
@@ -2118,8 +2118,8 @@ Return the amount of time that has elapsed since some fixed time in the past. Th
 ```c
 double bli_clock_min_diff
      (
-       double time_prev_min,
-       double time_start
+       double  time_prev_min,
+       double  time_start
      );
 ```
 This function computes an intermediate value, `time_diff`, equal to `bli_clock() - time_start`, and then tentatively prepares to return the minimum value of `time_diff` and `time_min`. If that minimum value is extremely small (close to zero), the function returns `time_min` instead.

From 1a8c8180b32cf5988bf9eb5d2f0f8111a729993a Mon Sep 17 00:00:00 2001
From: John <50754967+j-bm@users.noreply.github.com>
Date: Thu, 15 Feb 2024 12:35:10 -0400
Subject: [PATCH 182/230] Add cpu part codes for various manufacturers and use
 in the code (#794)

* Add cpu_id symbols for arm v8.

* Add symbols for arm v7.

* Always assume firestorm on Apple aarch64.

* Fixes incorrect usage of model vs. part in some places.

* Fixes #793

---------

Co-authored-by: J <jal@o75snap.localdomain>
---
 frame/base/bli_cpuid.c | 152 +++++++++++++++++++++++------------------
 1 file changed, 84 insertions(+), 68 deletions(-)

diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c
index d967cc05d..a9b081e2b 100644
--- a/frame/base/bli_cpuid.c
+++ b/frame/base/bli_cpuid.c
@@ -487,6 +487,16 @@ bool bli_cpuid_is_bulldozer
 
 #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_ARCH_PPC)
 
+	// courtesy OpenBSD
+#define ARM_CPU_PART_CORTEX_A5  0xc05
+#define ARM_CPU_PART_CORTEX_A7  0xc07
+#define ARM_CPU_PART_CORTEX_A8  0xc08
+#define ARM_CPU_PART_CORTEX_A9  0xc09
+#define ARM_CPU_PART_CORTEX_A12 0xc0d
+#define ARM_CPU_PART_CORTEX_A15 0xc0f
+#define ARM_CPU_PART_CORTEX_A17 0xc0e
+#define ARM_CPU_PART_CORTEX_A32 0xd01
+
 arch_t bli_cpuid_query_id( void )
 {
 	uint32_t vendor, model, part, features;
@@ -545,28 +555,28 @@ arch_t bli_cpuid_query_id( void )
 
 bool bli_cpuid_is_cortexa15
      (
-       uint32_t family,
        uint32_t model,
+       uint32_t part,
        uint32_t features
      )
 {
 	// Check for expected CPU features.
 	const uint32_t expected = FEATURE_NEON;
 
-	return bli_cpuid_has_features( features, expected ) && model == 0xc0f;
+	return bli_cpuid_has_features( features, expected ) && part == ARM_CPU_PART_CORTEX_A15;
 }
 
 bool bli_cpuid_is_cortexa9
      (
-       uint32_t family,
        uint32_t model,
+       uint32_t part,
        uint32_t features
      )
 {
 	// Check for expected CPU features.
 	const uint32_t expected = FEATURE_NEON;
 
-	return bli_cpuid_has_features( features, expected ) && model == 0xc09;
+	return bli_cpuid_has_features( features, expected ) && part == ARM_CPU_PART_CORTEX_A9;
 }
 
 #endif
@@ -1056,127 +1066,133 @@ static uint32_t get_coretype
 
 #ifdef __APPLE__
 	// Better values could be obtained from sysctlbyname()
+	// FIXME: compute actual part number
 	implementer = 0x61; //Apple
 	part        = 0x023; //Firestorm
 #endif //__APPLE__
 
 	// From Linux arch/arm64/include/asm/cputype.h
-	// ARM_CPU_IMP_ARM 0x41
-	// ARM_CPU_IMP_APM 0x50
-	// ARM_CPU_IMP_CAVIUM 0x43
-	// ARM_CPU_IMP_BRCM 0x42
-	// ARM_CPU_IMP_QCOM 0x51
-	// ARM_CPU_IMP_NVIDIA 0x4E
-	// ARM_CPU_IMP_FUJITSU 0x46
-	// ARM_CPU_IMP_HISI 0x48
-	// ARM_CPU_IMP_APPLE 0x61
+#define ARM_CPU_IMP_ARM 0x41
+#define ARM_CPU_IMP_APM 0x50
+#define ARM_CPU_IMP_CAVIUM 0x43
+#define ARM_CPU_IMP_BRCM 0x42
+#define ARM_CPU_IMP_QCOM 0x51
+#define ARM_CPU_IMP_NVIDIA 0x4E
+#define ARM_CPU_IMP_FUJITSU 0x46
+#define ARM_CPU_IMP_HISI 0x48
+#define ARM_CPU_IMP_APPLE 0x61
 	//
-	// ARM_CPU_PART_AEM_V8 0xD0F
-	// ARM_CPU_PART_FOUNDATION 0xD00
-	// ARM_CPU_PART_CORTEX_A57 0xD07
-	// ARM_CPU_PART_CORTEX_A72 0xD08
-	// ARM_CPU_PART_CORTEX_A53 0xD03
-	// ARM_CPU_PART_CORTEX_A73 0xD09
-	// ARM_CPU_PART_CORTEX_A75 0xD0A
-	// ARM_CPU_PART_CORTEX_A35 0xD04
-	// ARM_CPU_PART_CORTEX_A55 0xD05
-	// ARM_CPU_PART_CORTEX_A76 0xD0B
-	// ARM_CPU_PART_NEOVERSE_N1 0xD0C
-	// ARM_CPU_PART_CORTEX_A77 0xD0D
+#define ARM_CPU_PART_AEM_V8 0xD0F
+#define ARM_CPU_PART_FOUNDATION 0xD00
+#define ARM_CPU_PART_CORTEX_A57 0xD07
+#define ARM_CPU_PART_CORTEX_A72 0xD08
+#define ARM_CPU_PART_CORTEX_A53 0xD03
+#define ARM_CPU_PART_CORTEX_A73 0xD09
+#define ARM_CPU_PART_CORTEX_A75 0xD0A
+#define ARM_CPU_PART_CORTEX_A35 0xD04
+#define ARM_CPU_PART_CORTEX_A55 0xD05
+#define ARM_CPU_PART_CORTEX_A76 0xD0B
+#define ARM_CPU_PART_NEOVERSE_N1 0xD0C
+#define ARM_CPU_PART_CORTEX_A77 0xD0D
 	//   from GCC:
-	// ARM_CPU_PART_CORTEX_A78 0xd41
-	// ARM_CPU_PART_CORTEX_X1 0xd44
-	// ARM_CPU_PART_CORTEX_V1 0xd40
-	// ARM_CPU_PART_CORTEX_N2 0xd49
-	// ARM_CPU_PART_CORTEX_R82 0xd15
+#define ARM_CPU_PART_CORTEX_A78 0xd41
+#define ARM_CPU_PART_CORTEX_X1 0xd44
+#define ARM_CPU_PART_CORTEX_V1 0xd40
+#define ARM_CPU_PART_CORTEX_N2 0xd49
+#define ARM_CPU_PART_CORTEX_R82 0xd15
 	//
 	// APM_CPU_PART_POTENZA 0x000
 	//
-	// CAVIUM_CPU_PART_THUNDERX 0x0A1
-	// CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2
-	// CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3
-	// CAVIUM_CPU_PART_THUNDERX2 0x0AF
-	// CAVIUM_CPU_PART_THUNDERX3 0x0B8  // taken from OpenBLAS
+#define CAVIUM_CPU_PART_THUNDERX 0x0A1
+#define CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2
+#define CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3
+#define CAVIUM_CPU_PART_THUNDERX2 0x0AF
+#define CAVIUM_CPU_PART_THUNDERX3 0x0B8  // taken from OpenBLAS
 	//
-	// BRCM_CPU_PART_BRAHMA_B53 0x100
-	// BRCM_CPU_PART_VULCAN 0x516
+#define BRCM_CPU_PART_BRAHMA_B53 0x100
+#define BRCM_CPU_PART_VULCAN 0x516
 	//
-	// QCOM_CPU_PART_FALKOR_V1 0x800
-	// QCOM_CPU_PART_FALKOR 0xC00
-	// QCOM_CPU_PART_KRYO 0x200
-	// QCOM_CPU_PART_KRYO_3XX_SILVER 0x803
-	// QCOM_CPU_PART_KRYO_4XX_GOLD 0x804
-	// QCOM_CPU_PART_KRYO_4XX_SILVER 0x805
+#define QCOM_CPU_PART_FALKOR_V1 0x800
+#define QCOM_CPU_PART_FALKOR 0xC00
+#define QCOM_CPU_PART_KRYO 0x200
+#define QCOM_CPU_PART_KRYO_3XX_SILVER 0x803
+#define QCOM_CPU_PART_KRYO_4XX_GOLD 0x804
+#define QCOM_CPU_PART_KRYO_4XX_SILVER 0x805
 	//
-	// NVIDIA_CPU_PART_DENVER 0x003
-	// NVIDIA_CPU_PART_CARMEL 0x004
+#define NVIDIA_CPU_PART_DENVER 0x003
+#define NVIDIA_CPU_PART_CARMEL 0x004
 	//
-	// FUJITSU_CPU_PART_A64FX 0x001
+#define FUJITSU_CPU_PART_A64FX 0x001
 	//
-	// HISI_CPU_PART_TSV110 0xD01
+#define HISI_CPU_PART_TSV110 0xD01
+	//  from OpenBSD
+#define APPLE_CPU_PART_ICESTORM      0x022
+#define APPLE_CPU_PART_FIRESTORM     0x023
+#define APPLE_CPU_PART_ICESTORM_PRO  0x024
+#define APPLE_CPU_PART_FIRESTORM_PRO 0x025
+#define APPLE_CPU_PART_ICESTORM_MAX  0x028
+#define APPLE_CPU_PART_FIRESTORM_MAX 0x029
+#define APPLE_CPU_PART_BLIZZARD      0x032
+#define APPLE_CPU_PART_AVALANCHE     0x033
+#define APPLE_CPU_PART_BLIZZARD_PRO  0x034
+#define APPLE_CPU_PART_AVALANCHE_PRO 0x035
+#define APPLE_CPU_PART_BLIZZARD_MAX  0x038
+#define APPLE_CPU_PART_AVALANCHE_MAX 0x039
 
-	// APPLE_CPU_PART_M1_ICESTORM 0x022
-	// APPLE_CPU_PART_M1_FIRESTORM 0x023
 
 	// Fixme:  After merging the vpu_count branch we could report the
 	// part here with bli_dolog.
 	switch(implementer)
 	{
-		case 0x41:		// ARM
+		case ARM_CPU_IMP_ARM:		// ARM
 			switch (part)
 			{
 #ifdef BLIS_CONFIG_CORTEXA57
-				case 0xd07: // Cortex A57
+				case ARM_CPU_PART_CORTEX_A57:
 					return BLIS_ARCH_CORTEXA57;
 #endif
 #ifdef BLIS_CONFIG_CORTEXA53
-				case 0xd03: // Cortex A53
+				case ARM_CPU_PART_CORTEX_A53:
 					return BLIS_ARCH_CORTEXA53;
 #endif
 #ifdef BLIS_CONFIG_THUNDERX2
-				case 0xd0c: // Neoverse N1 (and Graviton G2?)
+				case ARM_CPU_PART_NEOVERSE_N1: // and Graviton G2
 					return BLIS_ARCH_THUNDERX2; //placeholder for N1
 #endif
 			}
 			break;
-		case 0x42:		// Broadcom
+		case ARM_CPU_IMP_BRCM:		// Broadcom
 			switch (part)
 			{
 #ifdef BLIS_CONFIG_THUNDERX2
-				case 0x516: // Vulcan
+				case BRCM_CPU_PART_VULCAN:
 					return BLIS_ARCH_THUNDERX2;
 #endif
 			}
 			break;
-		case 0x43:		// Cavium
+		case ARM_CPU_IMP_CAVIUM:		// Cavium
 			switch (part)
 			{
 #ifdef BLIS_CONFIG_THUNDERX2
-				case 0x0af: // ThunderX2
-				case 0x0b8: // ThunderX3
+				case CAVIUM_CPU_PART_THUNDERX2:
+				case CAVIUM_CPU_PART_THUNDERX3:
 					return BLIS_ARCH_THUNDERX2;
 #endif
 			}
 			break;
-		case 0x46:      	// Fujitsu
+		case ARM_CPU_IMP_FUJITSU:      	// Fujitsu
 			switch (part)
 			{
 #ifdef BLIS_CONFIG_A64FX
-				case 0x001: // A64FX
+				case FUJITSU_CPU_PART_A64FX:
 					return BLIS_ARCH_A64FX;
 #endif
 			}
 			break;
-		case 0x61:		// Apple
-			switch (part)
-			{
 #ifdef BLIS_CONFIG_FIRESTORM
-				case 0x022: // Icestorm (M1.LITTLE)
-				case 0x023: // Firestorm (M1.big)
-					return BLIS_ARCH_FIRESTORM;
+		case ARM_CPU_IMP_APPLE:		// assume FIRESTORM good for all
+			return BLIS_ARCH_FIRESTORM;
 #endif
-			}
-			break;
 	}
 
 #ifdef BLIS_CONFIG_ARMSVE

From 664cc6bc3ea610b4ecea63d78c6024c48f045635 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Tue, 26 Mar 2024 16:25:17 -0500
Subject: [PATCH 183/230] Update BLIS_*_INITIALIZER macros for C++
 compatibility. (#802)

Details:
- Remove designated initializer syntax. This isn't officially supported
  until C++20.
- Arrange initializers in the order in which they are defined in the
  struct. Even with standard or extension support for designated
  initializers, initializing non-static members out-of-order is an
  error in C++.
- Remove the conditional code which uses '-1' as the default value of
  the 'pack_buf' member of 'mem_t' in C, but 'BLIS_BUFFER_FOR_GEN_USE'
  in C++. Simply use the latter as a common-sense default.
---
 frame/base/bli_mem.h          | 29 ++---------
 frame/base/bli_pool.h         |  4 +-
 frame/base/bli_rntm.h         | 16 +++---
 frame/include/bli_type_defs.h | 92 +++++++++++++++++------------------
 4 files changed, 62 insertions(+), 79 deletions(-)

diff --git a/frame/base/bli_mem.h b/frame/base/bli_mem.h
index b46c0509d..8d2bd9a73 100644
--- a/frame/base/bli_mem.h
+++ b/frame/base/bli_mem.h
@@ -136,38 +136,19 @@ BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem )
 // removed from the mem_t type definition. An alternative to the initializer is
 // calling bli_mem_clear() at runtime.
 
-#ifdef __cplusplus
 #define BLIS_MEM_INITIALIZER \
         { \
-          .pblk        = BLIS_PBLK_INITIALIZER, \
-          /* When using C++, which is strongly typed, we avoid use of -1 as a
-             packbuf_t value since it will result in a compile-time error. */ \
-          .buf_type    = BLIS_BUFFER_FOR_GEN_USE, \
-          .pool        = NULL, \
-          .size        = 0, \
+          /* .pblk     = */ BLIS_PBLK_INITIALIZER, \
+          /* .buf_type = */ BLIS_BUFFER_FOR_GEN_USE, \
+          /* .pool     = */ NULL, \
+          /* .size     = */ 0, \
         }
-#else // C99
-#define BLIS_MEM_INITIALIZER \
-        { \
-          .pblk        = BLIS_PBLK_INITIALIZER, \
-          .buf_type    = -1, \
-          .pool        = NULL, \
-          .size        = 0, \
-        }
-#endif
 
 
 BLIS_INLINE void bli_mem_clear( mem_t* mem )
 {
 	bli_mem_set_buffer( NULL, mem );
-#ifdef __cplusplus
-	const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE;
-	// When using C++, which is strongly typed, we avoid use of -1 as a
-	// packbuf_t value since it will result in a compile-time error.
-	bli_mem_set_buf_type( pb, mem );
-#else
-	bli_mem_set_buf_type( ( packbuf_t )-1, mem );
-#endif
+	bli_mem_set_buf_type( BLIS_BUFFER_FOR_GEN_USE, mem );
 	bli_mem_set_pool( NULL, mem );
 	bli_mem_set_size( 0, mem );
 }
diff --git a/frame/base/bli_pool.h b/frame/base/bli_pool.h
index 6f199f7a4..454f068a5 100644
--- a/frame/base/bli_pool.h
+++ b/frame/base/bli_pool.h
@@ -102,8 +102,8 @@ BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk )
 
 #define BLIS_PBLK_INITIALIZER \
         { \
-          .buf        = NULL, \
-          .block_size = 0, \
+          /* .buf        = */ NULL, \
+          /* .block_size = */ 0, \
         }  \
 
 BLIS_INLINE void bli_pblk_clear( pblk_t* pblk )
diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h
index 43e91d505..df3cb3b38 100644
--- a/frame/base/bli_rntm.h
+++ b/frame/base/bli_rntm.h
@@ -258,13 +258,15 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
 
 #define BLIS_RNTM_INITIALIZER \
         { \
-          .thread_impl = BLIS_SINGLE, \
-          .num_threads = 1, \
-          .thrloop     = { 1, 1, 1, 1, 1, 1 }, \
-          .auto_factor = FALSE, \
-          .pack_a      = FALSE, \
-          .pack_b      = FALSE, \
-          .l3_sup      = TRUE, \
+          /* .thread_impl */ = BLIS_SINGLE, \
+\
+          /* .auto_factor */ = FALSE, \
+\
+          /* .num_threads */ = 1, \
+          /* .thrloop     */ = { 1, 1, 1, 1, 1, 1 }, \
+          /* .pack_a      */ = FALSE, \
+          /* .pack_b      */ = FALSE, \
+          /* .l3_sup      */ = TRUE, \
         }  \
 
 #if 0
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index b63cdb2c1..b88aa445d 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -1292,68 +1292,68 @@ typedef struct obj_s
 
 #define BLIS_OBJECT_INITIALIZER \
 { \
-	.root        = NULL, \
+	/* .root        */ = NULL, \
 \
-	.off         = { 0, 0 }, \
-	.dim         = { 0, 0 }, \
-	.diag_off    = 0, \
+	/* .off         */ = { 0, 0 }, \
+	/* .dim         */ = { 0, 0 }, \
+	/* .diag_off    */ = 0, \
 \
-	.info        = 0x0 | BLIS_BITVAL_DENSE      | \
-	                     BLIS_BITVAL_GENERAL, \
-	.info2       = 0x0, \
-	.elem_size   = sizeof( float ), /* this is changed later. */ \
+	/* .info        */ = 0x0 | BLIS_BITVAL_DENSE      | \
+	/*              */         BLIS_BITVAL_GENERAL, \
+	/* .info2       */ = 0x0, \
+	/* .elem_size   */ = sizeof( float ), /* this is changed later. */ \
 \
-	.buffer      = NULL, \
-	.rs          = 0, \
-	.cs          = 0, \
-	.is          = 1,  \
+	/* .buffer      */ = NULL, \
+	/* .rs          */ = 0, \
+	/* .cs          */ = 0, \
+	/* .is          */ = 1,  \
 \
-	.scalar      = { 0.0, 0.0 }, \
+	/* .scalar      */ = { 0.0, 0.0 }, \
 \
-	.m_padded    = 0, \
-	.n_padded    = 0, \
-	.ps          = 0, \
-	.pd          = 0, \
-	.m_panel     = 0, \
-	.n_panel     = 0, \
+	/* .m_padded    */ = 0, \
+	/* .n_padded    */ = 0, \
+	/* .ps          */ = 0, \
+	/* .pd          */ = 0, \
+	/* .m_panel     */ = 0, \
+	/* .n_panel     */ = 0, \
 \
-	.pack_fn     = NULL, \
-	.pack_params = NULL, \
-	.ker_fn      = NULL, \
-	.ker_params  = NULL  \
+	/* .pack_fn     */ = NULL, \
+	/* .pack_params */ = NULL, \
+	/* .ker_fn      */ = NULL, \
+	/* .ker_params  */ = NULL  \
 }
 
 #define BLIS_OBJECT_INITIALIZER_1X1 \
 { \
-	.root        = NULL, \
+	/* .root        */ = NULL, \
 \
-	.off         = { 0, 0 }, \
-	.dim         = { 1, 1 }, \
-	.diag_off    = 0, \
+	/* .off         */ = { 0, 0 }, \
+	/* .dim         */ = { 1, 1 }, \
+	/* .diag_off    */ = 0, \
 \
-	.info        = 0x0 | BLIS_BITVAL_DENSE      | \
-	                     BLIS_BITVAL_GENERAL, \
-	.info2       = 0x0, \
-	.elem_size   = sizeof( float ), /* this is changed later. */ \
+	/* .info        */ = 0x0 | BLIS_BITVAL_DENSE      | \
+	/*              */         BLIS_BITVAL_GENERAL, \
+	/* .info2       */ = 0x0, \
+	/* .elem_size   */ = sizeof( float ), /* this is changed later. */ \
 \
-	.buffer      = NULL, \
-	.rs          = 0, \
-	.cs          = 0, \
-	.is          = 1,  \
+	/* .buffer      */ = NULL, \
+	/* .rs          */ = 0, \
+	/* .cs          */ = 0, \
+	/* .is          */ = 1,  \
 \
-	.scalar      = { 0.0, 0.0 }, \
+	/* .scalar      */ = { 0.0, 0.0 }, \
 \
-	.m_padded    = 0, \
-	.n_padded    = 0, \
-	.ps          = 0, \
-	.pd          = 0, \
-	.m_panel     = 0, \
-	.n_panel     = 0, \
+	/* .m_padded    */ = 0, \
+	/* .n_padded    */ = 0, \
+	/* .ps          */ = 0, \
+	/* .pd          */ = 0, \
+	/* .m_panel     */ = 0, \
+	/* .n_panel     */ = 0, \
 \
-	.pack_fn     = NULL, \
-	.pack_params = NULL, \
-	.ker_fn      = NULL, \
-	.ker_params  = NULL  \
+	/* .pack_fn     */ = NULL, \
+	/* .pack_params */ = NULL, \
+	/* .ker_fn      */ = NULL, \
+	/* .ker_params  */ = NULL  \
 }
 
 // Define these macros here since they must be updated if contents of

From a316d2c6c33fc1f8f7c58c4210ab203f48349041 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Thu, 28 Mar 2024 12:52:00 -0500
Subject: [PATCH 184/230] Fix incorrect commenting of `BLIS_RNTM_INITIALIZER`
 and `BLIS_OBJECT_INITIALIZER`.

---
 frame/base/bli_rntm.h         | 14 +++---
 frame/include/bli_type_defs.h | 88 +++++++++++++++++------------------
 2 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h
index df3cb3b38..1cc7ad002 100644
--- a/frame/base/bli_rntm.h
+++ b/frame/base/bli_rntm.h
@@ -258,15 +258,15 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
 
 #define BLIS_RNTM_INITIALIZER \
         { \
-          /* .thread_impl */ = BLIS_SINGLE, \
+          /* .thread_impl = */ BLIS_SINGLE, \
 \
-          /* .auto_factor */ = FALSE, \
+          /* .auto_factor = */ FALSE, \
 \
-          /* .num_threads */ = 1, \
-          /* .thrloop     */ = { 1, 1, 1, 1, 1, 1 }, \
-          /* .pack_a      */ = FALSE, \
-          /* .pack_b      */ = FALSE, \
-          /* .l3_sup      */ = TRUE, \
+          /* .num_threads = */ 1, \
+          /* .thrloop     = */ { 1, 1, 1, 1, 1, 1 }, \
+          /* .pack_a      = */ FALSE, \
+          /* .pack_b      = */ FALSE, \
+          /* .l3_sup      = */ TRUE, \
         }  \
 
 #if 0
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index b88aa445d..3c74502b7 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -1292,68 +1292,68 @@ typedef struct obj_s
 
 #define BLIS_OBJECT_INITIALIZER \
 { \
-	/* .root        */ = NULL, \
+	/* .root        = */ NULL, \
 \
-	/* .off         */ = { 0, 0 }, \
-	/* .dim         */ = { 0, 0 }, \
-	/* .diag_off    */ = 0, \
+	/* .off         = */ { 0, 0 }, \
+	/* .dim         = */ { 0, 0 }, \
+	/* .diag_off    = */ 0, \
 \
-	/* .info        */ = 0x0 | BLIS_BITVAL_DENSE      | \
+	/* .info        = */ 0x0 | BLIS_BITVAL_DENSE      | \
 	/*              */         BLIS_BITVAL_GENERAL, \
-	/* .info2       */ = 0x0, \
-	/* .elem_size   */ = sizeof( float ), /* this is changed later. */ \
+	/* .info2       = */ 0x0, \
+	/* .elem_size   = */ sizeof( float ), /* this is changed later. */ \
 \
-	/* .buffer      */ = NULL, \
-	/* .rs          */ = 0, \
-	/* .cs          */ = 0, \
-	/* .is          */ = 1,  \
+	/* .buffer      = */ NULL, \
+	/* .rs          = */ 0, \
+	/* .cs          = */ 0, \
+	/* .is          = */ 1,  \
 \
-	/* .scalar      */ = { 0.0, 0.0 }, \
+	/* .scalar      = */ { 0.0, 0.0 }, \
 \
-	/* .m_padded    */ = 0, \
-	/* .n_padded    */ = 0, \
-	/* .ps          */ = 0, \
-	/* .pd          */ = 0, \
-	/* .m_panel     */ = 0, \
-	/* .n_panel     */ = 0, \
+	/* .m_padded    = */ 0, \
+	/* .n_padded    = */ 0, \
+	/* .ps          = */ 0, \
+	/* .pd          = */ 0, \
+	/* .m_panel     = */ 0, \
+	/* .n_panel     = */ 0, \
 \
-	/* .pack_fn     */ = NULL, \
-	/* .pack_params */ = NULL, \
-	/* .ker_fn      */ = NULL, \
-	/* .ker_params  */ = NULL  \
+	/* .pack_fn     = */ NULL, \
+	/* .pack_params = */ NULL, \
+	/* .ker_fn      = */ NULL, \
+	/* .ker_params  = */ NULL  \
 }
 
 #define BLIS_OBJECT_INITIALIZER_1X1 \
 { \
-	/* .root        */ = NULL, \
+	/* .root        = */ NULL, \
 \
-	/* .off         */ = { 0, 0 }, \
-	/* .dim         */ = { 1, 1 }, \
-	/* .diag_off    */ = 0, \
+	/* .off         = */ { 0, 0 }, \
+	/* .dim         = */ { 1, 1 }, \
+	/* .diag_off    = */ 0, \
 \
-	/* .info        */ = 0x0 | BLIS_BITVAL_DENSE      | \
+	/* .info        = */ 0x0 | BLIS_BITVAL_DENSE      | \
 	/*              */         BLIS_BITVAL_GENERAL, \
-	/* .info2       */ = 0x0, \
-	/* .elem_size   */ = sizeof( float ), /* this is changed later. */ \
+	/* .info2       = */ 0x0, \
+	/* .elem_size   = */ sizeof( float ), /* this is changed later. */ \
 \
-	/* .buffer      */ = NULL, \
-	/* .rs          */ = 0, \
-	/* .cs          */ = 0, \
-	/* .is          */ = 1,  \
+	/* .buffer      = */ NULL, \
+	/* .rs          = */ 0, \
+	/* .cs          = */ 0, \
+	/* .is          = */ 1,  \
 \
-	/* .scalar      */ = { 0.0, 0.0 }, \
+	/* .scalar      = */ { 0.0, 0.0 }, \
 \
-	/* .m_padded    */ = 0, \
-	/* .n_padded    */ = 0, \
-	/* .ps          */ = 0, \
-	/* .pd          */ = 0, \
-	/* .m_panel     */ = 0, \
-	/* .n_panel     */ = 0, \
+	/* .m_padded    = */ 0, \
+	/* .n_padded    = */ 0, \
+	/* .ps          = */ 0, \
+	/* .pd          = */ 0, \
+	/* .m_panel     = */ 0, \
+	/* .n_panel     = */ 0, \
 \
-	/* .pack_fn     */ = NULL, \
-	/* .pack_params */ = NULL, \
-	/* .ker_fn      */ = NULL, \
-	/* .ker_params  */ = NULL  \
+	/* .pack_fn     = */ NULL, \
+	/* .pack_params = */ NULL, \
+	/* .ker_fn      = */ NULL, \
+	/* .ker_params  = */ NULL  \
 }
 
 // Define these macros here since they must be updated if contents of

From a49238e6141c96a41aa3c2a4adb0b0663d0b4968 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Wed, 24 Apr 2024 15:07:18 -0500
Subject: [PATCH 185/230] Refactor the control tree and other infrastructure
 (#710)

Details:
1. A "plugin" architecture.
- Users are now able to register new kernels, kernel preferences, and
  blocksizes at runtime, directly from user applications.
- Plugins can be created, configured, and built using only an installed
  version of BLIS -- no source or source code changes required.
- Plugins support both reference and optimized kernels, as well as
  custom configuration-to-kernel-set mappings.
- Building plugins (including reference and relevant optimized kernels)
  for enabled architectures or architecture families is automated, as is
  linking into the final library.
- The configure script is now installed as 'configure-plugin'. In this
  mode, it can be used to initialize a plugin from a template including
  optional example code, and prepare a build system for compiling the
  plugin into a shared or static library.
- Additional configuration files, templates, and build system components
  are also installed to '%prefix%/share/blis'.
- The cntx_t struct now has extensible data structures for holding
  kernels, preferences, and blocksizes. These are based on a "stack"
  structure which contains a list of fixed-size data blocks. Adding a
  new entry (which may require allocating a new block or reallocating
  the block pointer array) requires locking, but looking up entries is
  lock-free and takes O(1) time.
- Kernels can depend on either 1 or 2 type parameters (e.g.
  mixed-precision packing requires 2). The func2_t struct supports
  the latter, but can be implicitly cast to func_t if only "diagonal"
  entries are needed. The number of type parameters can be inferred from
  the kernel ID for type safety.
- Functions have been added to register new kernels, preferences, and
  blocksizes with the global kernel structure (gks). This creates
  corresponding entries in each allocated context and returns the next
  available ID. Plugins use this API to register user kernels, although
  the user is responsible for tracking the returned IDs for later
  lookup. Setting newly-registered reference kernels, as well as
  overriding these with optimized kernels is done in exactly the same
  manner as in bli_cntx_init_ref() and bli_cntx_init_<subconfig>().

2. Restructuring of the control and thread control trees.
- The control tree has been substantially restructured to support more
  flexibility.
- The "default" control trees for gemm (also used for
  hemm/symm/herk/her2k/syrk/syr2k/trmm/trmm3) and trsm are now
  represented as a single structure containing all necessary control
  tree nodes and parameters.
- An API has been added to modify the default gemm/trsm control trees.
- This same API is used by the framework and packm/gemm/trsm variants
  to access specific control tree nodes.
- Users can alternatively create a custom control tree from scratch.
- The blocksizes are now encoded directly in the control tree, rather
  than via loop IDs. The logic for adjusting blocksizes for certain
  operations has been moved to the control tree initialization.
- Type information is encoded in the control tree to drive proper
  selection of packing and computational kernels provided by the user.
- The packing microkernel now receives an opaque "params" struct which
  is user-definable and can be used to pass additional information
  through the call stack.
- The auxinfo_t struct has been updated with a .params field for
  opaque user data as well as the global offsets of the current
  microtile.
- The packm and gemm variants can be overridden by the user, and also
  receive an opaque params struct via the associated control tree
  node.
- The structure-aware packing kernel bli_packm_struc_cxk() is no longer
  hard-coded to be called from the default packm variant, but can be
  overridden by the user. It also supports mixed-precision/mixed-domain
  natively now.
- The thread control tree (thrinfo_t) is now created entirely up-front
  by inspecting the control tree. The required number of threads at each
  level is encoded in the control tree via loop IDs (actually a bitfield
  of loop IDs), although the ordering and number of such IDs is
  arbitrary. The logic for adjusting the number of threads at each level
  based on operation type (e.g. trmm) is now in the control tree
  initialization and expressed by combining loop IDs from multiple
  levels into a single level.
- The mem_t object containing the pack buffer pointer has been moved
  from the control tree to the thread control tree. NOTE: **The control
  tree is now strictly const throughout the operation, and only a
  single copy is shared by all threads.**
- The thread control tree node for packing has been changed so that
  there is no longer a "fake" node indicating a team of single threads.
  Instead, the number of threads and thread IDs in the "normal" thread
  control tree node are used. This change has also been made to the
  gemmsup thread control tree and packing variants, as well as to the
  gemmlike sandbox.
- Parameters controlling packing (e.g. inversion of the diagonal,
  direction, schema) are not stored directly in the control tree but in
  the opaque params struct. The packing control tree node and its
  default params struct are stored together in the "combined"
  gemm/trsm control tree structure and initialized as a unit. Users can
  update these parameters individually or substitute a custom packm
  variant and params struct.
- The "target" and "execution" datatypes have been removed from the obj_t
  struct and replaced by type information in the control tree.
- The "sub-node" and "sub-prenode" of a control tree node have been
  replaced by an arbitrary number of sub-nodes accessed by index. There
  is a hard cap on the number of sub-nodes (currently 2). Sub-nodes are
  added during control tree initialization, *after*
  creation/initialization of the parent node through an updated API.
- The level-3 thread decorator has been significantly simplified and
  directly calls bli_l3_int(). The control tree is created externally,
  and it is no longer necessary to alias matrices or set object pack
  schemas. Also, the rntm_t passed in may be NULL. Finally, family
  and scalar information is no longer needed here.
- bli_l3_int() is now a simple inline function which extracts the next
  control tree node and variant and calls it.
- bli_*_front() have been removed and inlined into the expert object
  API with significant simplification.
- 1m (or other induced method) no longer uses an alternative cntx_t.
- The .pack_fn/.ker_fn pointers and associated params fields on the
  obj_t were removed in favor of the present solution.

3. Overhaul of variable substitution in configure script.
- The configure script has been somewhat re-written to use a
  centralized mechanism for substituting variables into build system and
  other configuration files.
- All substitution variables go through the same pathway now, which
  necessitated some variable naming changes for variables which were
  named the same in e.g. Makefile and bli_config.h but with
  different definitions.
- CC and CXX variables can now contain spaces, e.g. 'g++ -std=c++17'.
  This provides better support for integration with build tooling such
  as autotools.

4. Overhaul of packing kernels.
- Previously there were two packing kernels referenced in the cntx_t
  structure for MRxk and NRxk shaped micropanels, respectively. These
  have now been merged into one kernel which is responsible for packing
  any dense rectangular portion of either A or B.
- The packing kernel now receives information about the register
  blocksize (cdim_max) and duplication factor (the "broadcast-B"
  format, although this can also apply to the A matrix).
- The structure-aware packing kernel (bli_packm_struc_cxk(), which is
  now user-overridable) also receives global offsets of the current
  micropanel within A or B.
- Explicit kernels for packing the diagonal blocks of
  triangular/symmetric/Hermitian matrices have been added to the
  cntx_t. This means that the bli_packm_struc_cxk() "kernel" no longer
  needs to directly touch data (except to zero out some regions).
- bli_packm_struc_cxk() has also been updated to work only in terms of
  fundamental elements (i.e., real datatypes) when computing offsets and
  when zeroing data, which greatly simplifies mixed-domain/1m packing.
- bli_packm_scalar() has been updated to better support complex scalars
  in mixed-domain operations.
- Pack schemas for PACKED_ROW_PANELS* and PACKED_COL_PANELS* have
  been merged into simply PACKED_PANELS*. This reflects the merging of
  the packing kernels into a single generic kernel. Only a few places
  needed the row/column information, and this is now supplied by
  alternative means.
- Packing variants always behave "as if" the A matrix were being packed
  (i.e. the code assumes packing column-stored row panels). Packing of B
  is handled by applying an implicit or explicit transpose before
  packing. This change also applies to gemmsup.

5. Improved MD/MP support.
- All level-3 operations (except trsm) now support full
  mixed-domain/mixed-precision operation.
- Explicit 1m packing kernels have been added in the cntx_t.
- An explicit 1m microkernel wrapper has been added to the cntx_t.
- An extra packing kernel for the "ro" format has been added, along with
  the pack_t enumeration value. This supports the packing for
  real*complex -> real, including potential scaling by a complex alpha,
  support for structured matrices, etc.
- Extra microkernel wrappers for mixed-domain operations have been added
  to support the 'ccr' (and by extension, 'crc'), 'rcc', and 'crr'
  cases. Notably this includes full support for general stride storage
  and complex alpha/beta.
- Packing kernels and gemm microkernels are now "templated" based on two
  type parameters rather than one. For packing this allows direct
  optimization of mixed-precision kernels, and for gemm microkernels
  this allows direct optimization of mixed-precision without writing to
  a temporary buffer. Reference packing kernels are directly
  instantiated for all mixes of precisions, while by default
  mixed-precision gemm microkernels are supported via a microkernel
  wrapper. The "old" way of specifying optimized kernels using a single
  type parameter works unchanged.
- alpha and beta are typecast appropriately to the computational or
  output datatype, respectively, and **always** to the complex domain.
  Scalar typecasting has also been added to gemmsup for safety.
- The gemm macrokernel doesn't have to do any typecasting anymore, as a
  microkernel wrapper or optimized mixed-precision/mixed-domain kernel
  now handles this.
- 1m and mixed-domain operations now always use a microkernel wrapper,
  rather than adjusting parameters in the gemm macrokernel.
- The gemmt macrokernel **does** still have to handle explicit
  write-back of microtiles which intersect the diagonal, although
  typecasting has already been performed.
- The gemmt_x_ker_var2(), trmm_xx_ker_var2(), and trsm_xx_ker_var2()
  functions have been removed. The appropriate macrokernel pointer is
  selected during control tree initialization.
- Real domain MR/NR are checked for even-ness based on the gemm
  microkernel's row preference in order to guarantee proper 1m and
  mixed-domain operation.
- Full range of mixed-domain/mixed-precision functionality tested in the
  testsuite ('input.*.mixed').

6. Other changes:
- The build system has been updated to support C++ source files
  throughout the framework. While the intent is not to add such files to
  BLIS itself, this supports plugins written in C++.
- Many instances of configuration-specific code have been simplified by
  introducing an INSERT_GENTCONF macro which instantiates a block of
  code for each enabled sub-configuration. The ConfigurationHowTo.md
  document has been updated accordingly.
- PASTEMAC?/PASTECH?/PASTEF77? have been removed in favor of
  variadic macros which accept any number of arguments (up to a
  reasonable limit).
- The INSERT_GENTFUNC* macros have been updated to clean up
  mixed-precision and mixed-domain instantiations.
- bli_align_dim_to_mult() has been updated to support rounding either up
  or down based on a flag.
- Checking for empty matrices and other early exits (level-3 only) has
  been consolidated into a single utility function.
- The auxinfo_t struct is always passed as const.
- The new function bli_obj_alias_submatrix() aliases a matrix while also
  resetting the root to NULL, offsets to zero (while adjusting the
  buffer), and applying any implicit transpose.
- Level-3 pruning functions now only check matrix structure to see what
  to do, not the operation family.
- gemmsup packing has been updated to use the "normal" pack buffer
  allocation routines.
- Remove duplicate checks for early return from gemmsup handler.
- bli_determine_blocksize() has been significantly simplified.
- Partitioning packed panels is no longer allowed.
- Added bli_xxsame macros.
- Automated the calculation of info bit shifts and masks based on
  predefined bit sizes for various flags. This greatly simplifies
  reordering, adding, or removing flags from the info/info2 bitfields.
- Moved more BLIS_NUM_* macros into the corresponding enums as the
  last entry so that the value is automatically computed.
- Better const-correctness in some level0 scalar macros.
- Better mixed-precision support in some level0 scalar macros.
- Added a bli_axpbys_mxn() macro.
- bli_thread_range_sub() takes explicit thread ID and number of threads
  rather than a thrinfo_t node.
- "De-templated" BLIS gemmlike sandbox (specifically, bls_gemm_bp_var1()
  and bls_packm_var1()).
- Combined bls_l3_packm_[ab]() into one function with thin wrappers.
- Deleted bls_packm_var[23]().
- Added a "termination tag" to the testsuite output so that
  'make check-blis' can accurately check for successful completion.
- Added a new function to centrally compute FLOPs for level-3 operations
  in the testsuite.
---
 Makefile                                      |   72 +-
 build/bli_config.h.in                         |   20 -
 build/config.mk.in                            |   29 +-
 build/gen-make-frags/gen-make-frag.sh         |  216 +-
 build/libblis-symbols.def                     |    1 -
 build/plugin/Makefile                         |  524 ++
 .../plugin/bli_kernel_defs_zen3.h             |   40 +-
 build/plugin/bli_plugin.h.in                  |  146 +
 build/plugin/bli_plugin_init_ref.c            |  108 +
 .../plugin/bli_plugin_init_zen3.c             |   94 +-
 .../plugin/bli_plugin_register.c              |   88 +-
 build/plugin/config.mk.in                     |  145 +
 .../plugin/my_kernel_1_ref.c                  |   24 +-
 .../plugin/my_kernel_1_zen3.c                 |   40 +-
 .../plugin/my_kernel_2_ref.c                  |   60 +-
 common.mk                                     |   79 +-
 config/a64fx/bli_cntx_init_a64fx.c            |    3 +-
 config/armsve/bli_cntx_init_armsve.c          |    5 +-
 config/firestorm/bli_cntx_init_firestorm.c    |    6 +-
 config/haswell/bli_cntx_init_haswell.c        |   12 +-
 config/knl/bli_cntx_init_knl.c                |    3 +-
 .../sifive_x280/bli_cntx_init_sifive_x280.c   |   12 +-
 config/zen/bli_cntx_init_zen.c                |   14 +-
 config/zen2/bli_cntx_init_zen2.c              |   14 +-
 config/zen3/bli_cntx_init_zen3.c              |   14 +-
 configure                                     | 4363 ++++++++++-------
 docs/ConfigurationHowTo.md                    |   34 +-
 frame/0/bli_l0_ft.h                           |   20 +-
 frame/0/bli_l0_oapi.c                         |   14 +-
 frame/0/bli_l0_oapi.h                         |   12 +-
 frame/0/bli_l0_tapi.c                         |   18 +-
 frame/0/copysc/bli_copysc.c                   |   11 +-
 frame/0/copysc/bli_copysc.h                   |    4 +-
 frame/1/bli_l1v_fpa.c                         |    8 +-
 frame/1/bli_l1v_fpa.h                         |    4 +-
 frame/1/bli_l1v_ft.h                          |   20 +-
 frame/1/bli_l1v_oapi.c                        |   40 +-
 frame/1/bli_l1v_tapi.c                        |   20 +-
 frame/1/bli_l1v_tapi.h                        |   20 +-
 frame/1d/bli_l1d_fpa.c                        |    8 +-
 frame/1d/bli_l1d_fpa.h                        |    4 +-
 frame/1d/bli_l1d_ft.h                         |   14 +-
 frame/1d/bli_l1d_oapi.c                       |   28 +-
 frame/1d/bli_l1d_tapi.c                       |   14 +-
 frame/1d/bli_l1d_tapi.h                       |   14 +-
 frame/1f/bli_l1f_fpa.c                        |    8 +-
 frame/1f/bli_l1f_fpa.h                        |    4 +-
 frame/1f/bli_l1f_ft.h                         |   10 +-
 frame/1f/bli_l1f_oapi.c                       |   20 +-
 frame/1f/bli_l1f_tapi.c                       |   10 +-
 frame/1f/bli_l1f_tapi.h                       |   10 +-
 frame/1m/bli_l1m_fpa.c                        |   16 +-
 frame/1m/bli_l1m_fpa.h                        |    8 +-
 frame/1m/bli_l1m_ft.h                         |   12 +-
 frame/1m/bli_l1m_ker_params.h                 |   16 +-
 frame/1m/bli_l1m_ker_prot.h                   |   13 +
 frame/1m/bli_l1m_oapi.c                       |   24 +-
 frame/1m/bli_l1m_tapi.c                       |   44 +-
 frame/1m/bli_l1m_tapi.h                       |   10 +-
 frame/1m/bli_l1m_unb_var1.c                   |   10 +-
 frame/1m/bli_l1m_unb_var1.h                   |   10 +-
 frame/1m/packm/bli_packm.h                    |    7 -
 frame/1m/packm/bli_packm_alloc.c              |    2 +-
 frame/1m/packm/bli_packm_blk_var1.c           |   90 +-
 frame/1m/packm/bli_packm_check.c              |   10 +-
 frame/1m/packm/bli_packm_check.h              |   10 +-
 frame/1m/packm/bli_packm_cntl.c               |  101 +-
 frame/1m/packm/bli_packm_cntl.h               |  231 +-
 frame/1m/packm/bli_packm_init.c               |   50 +-
 frame/1m/packm/bli_packm_init.h               |    7 +-
 frame/1m/packm/bli_packm_int.c                |   19 +-
 frame/1m/packm/bli_packm_part.c               |  253 -
 frame/1m/packm/bli_packm_part.h               |   56 -
 frame/1m/packm/bli_packm_scalar.c             |   19 +-
 frame/1m/packm/bli_packm_struc_cxk.c          |  288 +-
 frame/1m/packm/bli_packm_struc_cxk.h          |   45 +-
 frame/1m/packm/bli_packm_struc_cxk_md.c       |  523 --
 frame/1m/packm/bli_packm_struc_cxk_md.h       |   82 -
 frame/1m/unpackm/bli_unpackm_blk_var1.c       |   17 +-
 frame/1m/unpackm/bli_unpackm_cntl.c           |   31 +-
 frame/1m/unpackm/bli_unpackm_cntl.h           |   17 +-
 frame/1m/unpackm/bli_unpackm_int.c            |   35 +-
 frame/1m/unpackm/bli_unpackm_int.h            |   11 +-
 frame/2/bli_l2_fpa.c                          |   12 +-
 frame/2/bli_l2_fpa.h                          |    6 +-
 frame/2/bli_l2_ft.h                           |   14 +-
 frame/2/bli_l2_ft_unb.h                       |   12 +-
 frame/2/bli_l2_oapi.c                         |   24 +-
 frame/2/bli_l2_tapi.c                         |   36 +-
 frame/2/bli_l2_tapi.h                         |   14 +-
 frame/2/gemv/amd/bli_gemv_unf_var2_amd.c      |    8 +-
 frame/2/gemv/bli_gemv_unb_var2.c              |    4 +-
 frame/2/gemv/bli_gemv_unf_var2.c              |    4 +-
 frame/2/gemv/bli_gemv_var.h                   |    2 +-
 frame/2/gemv/bli_gemv_var_oapi.c              |    4 +-
 frame/2/ger/bli_ger_var.h                     |    2 +-
 frame/2/ger/bli_ger_var_oapi.c                |    4 +-
 frame/2/hemv/bli_hemv_unb_var1.c              |    4 +-
 frame/2/hemv/bli_hemv_unb_var2.c              |    4 +-
 frame/2/hemv/bli_hemv_unb_var3.c              |    4 +-
 frame/2/hemv/bli_hemv_unb_var4.c              |    4 +-
 frame/2/hemv/bli_hemv_unf_var1.c              |    4 +-
 frame/2/hemv/bli_hemv_unf_var1a.c             |    4 +-
 frame/2/hemv/bli_hemv_unf_var3.c              |    4 +-
 frame/2/hemv/bli_hemv_unf_var3a.c             |    4 +-
 frame/2/hemv/bli_hemv_var.h                   |    2 +-
 frame/2/hemv/bli_hemv_var_oapi.c              |    4 +-
 frame/2/her/bli_her_var.h                     |    2 +-
 frame/2/her/bli_her_var_oapi.c                |    4 +-
 frame/2/her2/bli_her2_var.h                   |    2 +-
 frame/2/her2/bli_her2_var_oapi.c              |    4 +-
 frame/2/trmv/bli_trmv_var.h                   |    2 +-
 frame/2/trmv/bli_trmv_var_oapi.c              |    4 +-
 frame/2/trsv/bli_trsv_unb_var1.c              |    2 +-
 frame/2/trsv/bli_trsv_unb_var2.c              |    2 +-
 frame/2/trsv/bli_trsv_unf_var1.c              |    2 +-
 frame/2/trsv/bli_trsv_unf_var2.c              |    2 +-
 frame/2/trsv/bli_trsv_var.h                   |    2 +-
 frame/2/trsv/bli_trsv_var_oapi.c              |    4 +-
 frame/3/bli_l3.h                              |   10 +-
 frame/3/bli_l3_blocksize.c                    |   96 -
 frame/3/bli_l3_blocksize.h                    |   55 -
 frame/3/bli_l3_check.c                        |   90 +-
 frame/3/bli_l3_check.h                        |   11 +
 frame/3/bli_l3_cntl.c                         |  129 -
 frame/3/bli_l3_cntl.h                         |   59 -
 frame/3/bli_l3_decor.c                        |   78 +-
 frame/3/bli_l3_decor.h                        |   18 +-
 frame/3/bli_l3_direct.c                       |  140 -
 frame/3/bli_l3_ind_ukr.h                      |   21 +-
 frame/3/bli_l3_int.c                          |  146 -
 frame/3/bli_l3_int.h                          |   22 +-
 frame/3/bli_l3_oapi.c                         |    8 +-
 frame/3/bli_l3_oapi.h                         |    8 +-
 frame/3/bli_l3_oapi_ex.c                      |  645 ++-
 frame/3/bli_l3_oapi_ex.h                      |    3 +
 frame/3/bli_l3_packab.c                       |   40 +-
 frame/3/bli_l3_packab.h                       |   24 +-
 frame/3/bli_l3_prune.c                        |   71 +-
 frame/3/bli_l3_prune.h                        |    9 +-
 frame/3/bli_l3_schema.c                       |   80 -
 frame/3/bli_l3_sup.c                          |   50 +-
 frame/3/bli_l3_sup_packm.c                    |  392 +-
 frame/3/bli_l3_sup_packm.h                    |   31 -
 frame/3/bli_l3_sup_packm_var.c                |   93 +-
 frame/3/bli_l3_sup_ref.c                      |   26 +-
 frame/3/bli_l3_sup_var12.c                    |    4 +-
 frame/3/bli_l3_sup_var1n2m.c                  |   60 +-
 frame/3/bli_l3_sup_vars.h                     |    2 +-
 frame/3/bli_l3_tapi.c                         |   18 +-
 frame/3/bli_l3_tapi_ex.c                      |   18 +-
 frame/3/bli_l3_tapi_ex.h                      |   16 +-
 frame/3/bli_l3_thrinfo.c                      |  406 +-
 frame/3/bli_l3_thrinfo.h                      |    6 +-
 frame/3/bli_l3_ukr_oapi.c                     |    6 +-
 frame/3/bli_l3_ukr_oapi.h                     |    6 +-
 frame/3/bli_l3_ukr_prot.h                     |    2 +
 frame/3/bli_l3_ukr_tapi.c                     |   12 +-
 frame/3/bli_l3_ukr_tapi.h                     |    6 +-
 .../{gemm/bli_gemm_front.h => bli_l3_util.c}  |   47 +-
 .../bli_gemmt_front.h => bli_l3_util.h}       |   16 +-
 frame/3/gemm/bli_gemm.h                       |    8 -
 frame/3/gemm/bli_gemm_blk_var1.c              |   20 +-
 frame/3/gemm/bli_gemm_blk_var2.c              |   20 +-
 frame/3/gemm/bli_gemm_blk_var3.c              |   20 +-
 frame/3/gemm/bli_gemm_cntl.c                  |  614 ++-
 frame/3/gemm/bli_gemm_cntl.h                  |  448 +-
 frame/3/gemm/bli_gemm_front.c                 |  267 -
 frame/3/gemm/bli_gemm_ker_var2.c              |  217 +-
 frame/3/gemm/bli_gemm_md.c                    |  647 ---
 frame/3/gemm/bli_gemm_md.h                    |  278 --
 frame/3/gemm/bli_gemm_md_c2r_ref.c            |  242 -
 frame/3/gemm/bli_gemm_md_c2r_ref.h            |   71 -
 frame/3/gemm/bli_gemm_var.h                   |    4 +-
 frame/3/gemm/ind/bli_gemm_ind_opt.h           |  115 -
 frame/3/gemmt/bli_gemmt.h                     |    2 -
 frame/3/gemmt/bli_gemmt_front.c               |  111 -
 frame/3/gemmt/bli_gemmt_l_ker_var2.c          |   52 +-
 frame/3/gemmt/bli_gemmt_l_ker_var2b.c         |   50 +-
 frame/3/gemmt/bli_gemmt_u_ker_var2.c          |   56 +-
 frame/3/gemmt/bli_gemmt_u_ker_var2b.c         |   48 +-
 frame/3/gemmt/bli_gemmt_var.h                 |    4 +-
 frame/3/gemmt/bli_gemmt_x_ker_var2.c          |   73 -
 frame/3/hemm/bli_hemm_front.c                 |  166 -
 frame/3/symm/bli_symm.h                       |   36 -
 frame/3/symm/bli_symm_front.c                 |  165 -
 frame/3/symm/bli_symm_front.h                 |   45 -
 frame/3/trmm/bli_trmm.h                       |    2 -
 frame/3/trmm/bli_trmm_front.c                 |  184 -
 frame/3/trmm/bli_trmm_front.h                 |   43 -
 frame/3/trmm/bli_trmm_ll_ker_var2.c           |   78 +-
 frame/3/trmm/bli_trmm_ll_ker_var2b.c          |   73 +-
 frame/3/trmm/bli_trmm_lu_ker_var2.c           |   82 +-
 frame/3/trmm/bli_trmm_lu_ker_var2b.c          |   73 +-
 frame/3/trmm/bli_trmm_rl_ker_var2.c           |   75 +-
 frame/3/trmm/bli_trmm_rl_ker_var2b.c          |   73 +-
 frame/3/trmm/bli_trmm_ru_ker_var2.c           |   85 +-
 frame/3/trmm/bli_trmm_ru_ker_var2b.c          |   81 +-
 frame/3/trmm/bli_trmm_var.h                   |    4 +-
 frame/3/trmm/bli_trmm_xx_ker_var2.c           |   87 -
 frame/3/trmm3/bli_trmm3.h                     |   36 -
 frame/3/trmm3/bli_trmm3_front.c               |  176 -
 frame/3/trmm3/bli_trmm3_front.h               |   45 -
 frame/3/trsm/bli_trsm.h                       |    1 -
 frame/3/trsm/bli_trsm_blk_var1.c              |   36 +-
 frame/3/trsm/bli_trsm_blk_var2.c              |   20 +-
 frame/3/trsm/bli_trsm_blk_var3.c              |   15 +-
 frame/3/trsm/bli_trsm_cntl.c                  |  634 ++-
 frame/3/trsm/bli_trsm_cntl.h                  |  560 ++-
 frame/3/trsm/bli_trsm_front.c                 |  158 -
 frame/3/trsm/bli_trsm_ll_ker_var2.c           |   79 +-
 frame/3/trsm/bli_trsm_lu_ker_var2.c           |   81 +-
 frame/3/trsm/bli_trsm_rl_ker_var2.c           |   75 +-
 frame/3/trsm/bli_trsm_ru_ker_var2.c           |   75 +-
 frame/3/trsm/bli_trsm_var.h                   |    4 +-
 frame/base/bli_apool.c                        |   11 +-
 frame/base/bli_arch.c                         |    2 +-
 frame/base/bli_auxinfo.h                      |   18 +
 frame/base/bli_blksz.c                        |  170 +-
 frame/base/bli_blksz.h                        |   45 +-
 frame/base/bli_check.c                        |   54 +-
 frame/base/bli_check.h                        |    5 +-
 frame/base/bli_cntl.c                         |  225 +-
 frame/base/bli_cntl.h                         |  120 +-
 frame/base/bli_cntx.c                         |  318 +-
 frame/base/bli_cntx.h                         |  221 +-
 frame/base/bli_cpuid.c                        |    2 +-
 frame/base/bli_error.c                        |    7 +
 frame/base/bli_func.c                         |  108 +
 frame/base/bli_func.h                         |   72 +-
 frame/base/bli_gks.c                          |  729 +--
 frame/base/bli_gks.h                          |   27 +-
 frame/base/bli_ind.c                          |   12 +-
 frame/base/bli_machval.c                      |    2 +-
 frame/base/bli_mbool.h                        |    6 +-
 frame/base/bli_memsys.c                       |    4 +-
 frame/base/bli_obj.c                          |   24 +-
 frame/base/bli_obj.h                          |    3 +-
 frame/base/bli_obj_scalar.c                   |    6 +-
 frame/base/bli_part.c                         |   15 +-
 frame/base/bli_part.h                         |    6 +-
 .../bli_part_cntl.c}                          |   51 +-
 frame/base/bli_part_cntl.h                    |  132 +
 frame/base/bli_pba.c                          |    5 -
 frame/base/bli_rntm.c                         |  126 -
 frame/base/bli_rntm.h                         |   21 +-
 frame/base/bli_setgetijm.c                    |    4 +-
 frame/base/bli_setgetijv.c                    |    4 +-
 frame/base/bli_stack.c                        |  200 +
 .../bli_trsm_front.h => base/bli_stack.h}     |   59 +-
 frame/base/cast/bli_castm.c                   |   10 +-
 frame/base/cast/bli_castm.h                   |    2 +-
 frame/base/cast/bli_castnzm.c                 |   10 +-
 frame/base/cast/bli_castnzm.h                 |    2 +-
 frame/base/cast/bli_castv.c                   |   10 +-
 frame/base/cast/bli_castv.h                   |    2 +-
 frame/compat/amd/bla_copy_amd.c               |    4 +-
 frame/compat/amd/bla_gemv_amd.c               |    4 +-
 frame/compat/bla_amax.c                       |    4 +-
 frame/compat/bla_amax.h                       |    2 +-
 frame/compat/bla_asum.c                       |    4 +-
 frame/compat/bla_asum.h                       |    2 +-
 frame/compat/bla_axpy.c                       |    2 +-
 frame/compat/bla_copy.c                       |    2 +-
 frame/compat/bla_dot.c                        |    8 +-
 frame/compat/bla_dot.h                        |    4 +-
 frame/compat/bla_gemm.c                       |    6 +-
 frame/compat/bla_gemv.c                       |    2 +-
 frame/compat/bla_ger.c                        |    4 +-
 frame/compat/bla_ger.h                        |    2 +-
 frame/compat/bla_hemm.c                       |    2 +-
 frame/compat/bla_hemv.c                       |    2 +-
 frame/compat/bla_her.c                        |    2 +-
 frame/compat/bla_her2.c                       |    2 +-
 frame/compat/bla_her2k.c                      |    2 +-
 frame/compat/bla_herk.c                       |    2 +-
 frame/compat/bla_nrm2.c                       |    4 +-
 frame/compat/bla_nrm2.h                       |    2 +-
 frame/compat/bla_scal.c                       |    6 +-
 frame/compat/bla_scal.h                       |    2 +-
 frame/compat/bla_swap.c                       |    2 +-
 frame/compat/bla_symm.c                       |    2 +-
 frame/compat/bla_symv.c                       |    2 +-
 frame/compat/bla_syr.c                        |    2 +-
 frame/compat/bla_syr2.c                       |    2 +-
 frame/compat/bla_syr2k.c                      |    2 +-
 frame/compat/bla_syrk.c                       |    2 +-
 frame/compat/bla_trmm.c                       |    2 +-
 frame/compat/bla_trmv.c                       |    2 +-
 frame/compat/bla_trsm.c                       |    2 +-
 frame/compat/bla_trsv.c                       |    2 +-
 frame/compat/blis/thread/b77_thread.c         |    4 +-
 frame/compat/blis/thread/b77_thread.h         |    4 +-
 frame/compat/cblas/f77_sub/f77_amax_sub.c     |    4 +-
 frame/compat/cblas/f77_sub/f77_amax_sub.h     |    2 +-
 frame/compat/cblas/f77_sub/f77_asum_sub.c     |    4 +-
 frame/compat/cblas/f77_sub/f77_asum_sub.h     |    2 +-
 frame/compat/cblas/f77_sub/f77_dot_sub.c      |   12 +-
 frame/compat/cblas/f77_sub/f77_dot_sub.h      |    6 +-
 frame/compat/cblas/f77_sub/f77_nrm2_sub.c     |    4 +-
 frame/compat/cblas/f77_sub/f77_nrm2_sub.h     |    2 +-
 frame/compat/check/bla_gemm3m_check.h         |   14 +-
 frame/compat/check/bla_gemm_check.h           |   14 +-
 frame/compat/check/bla_gemmt_check.h          |   18 +-
 frame/compat/check/bla_gemv_check.h           |    8 +-
 frame/compat/check/bla_ger_check.h            |    2 +-
 frame/compat/check/bla_hemm_check.h           |   10 +-
 frame/compat/check/bla_hemv_check.h           |    6 +-
 frame/compat/check/bla_her2_check.h           |    6 +-
 frame/compat/check/bla_her2k_check.h          |   10 +-
 frame/compat/check/bla_her_check.h            |    6 +-
 frame/compat/check/bla_herk_check.h           |   10 +-
 frame/compat/check/bla_syr2k_check.h          |   12 +-
 frame/compat/check/bla_syrk_check.h           |   12 +-
 frame/compat/check/bla_trmm_check.h           |   20 +-
 frame/compat/check/bla_trmv_check.h           |   16 +-
 frame/compat/extra/bla_axpby.c                |    4 +-
 frame/compat/extra/bla_gemm3m.c               |   41 +-
 frame/compat/extra/bla_gemm_batch.c           |    2 +-
 frame/compat/extra/bla_gemmt.c                |    2 +-
 frame/compat/f2c/bla_gbmv.c                   |   64 +-
 frame/compat/f2c/bla_hbmv.c                   |   24 +-
 frame/compat/f2c/bla_hpmv.c                   |   24 +-
 frame/compat/f2c/bla_hpr.c                    |   20 +-
 frame/compat/f2c/bla_hpr2.c                   |   36 +-
 frame/compat/f2c/bla_lsame.c                  |    4 +-
 frame/compat/f2c/bla_lsame.h                  |    4 +-
 frame/compat/f2c/bla_sbmv.c                   |   24 +-
 frame/compat/f2c/bla_spmv.c                   |   20 +-
 frame/compat/f2c/bla_spr.c                    |   20 +-
 frame/compat/f2c/bla_spr2.c                   |   20 +-
 frame/compat/f2c/bla_tbmv.c                   |   92 +-
 frame/compat/f2c/bla_tbsv.c                   |   92 +-
 frame/compat/f2c/bla_tpmv.c                   |   92 +-
 frame/compat/f2c/bla_tpsv.c                   |   92 +-
 frame/compat/f2c/bla_xerbla.c                 |    2 +-
 frame/compat/f2c/bla_xerbla.h                 |    2 +-
 frame/compat/f2c/bla_xerbla_array.c           |    4 +-
 frame/compat/f2c/bla_xerbla_array.h           |    2 +-
 frame/include/bli_arch_config.h               |  129 +-
 frame/include/bli_arch_config_pre.h           |   16 +-
 frame/include/bli_config_macro_defs.h         |   19 -
 frame/include/bli_genarray_macro_defs.h       |  123 +-
 frame/include/bli_gentconf_macro_defs.h       |  288 ++
 frame/include/bli_gentfunc_macro_defs.h       |   30 +-
 frame/include/bli_macro_defs.h                |   67 +-
 frame/include/bli_misc_macro_defs.h           |   22 +
 frame/include/bli_obj_macro_defs.h            |  216 +-
 frame/include/bli_param_macro_defs.h          |   26 +-
 frame/include/bli_pre_ker_params.h            |    2 +-
 frame/include/bli_scalar_macro_defs.h         |    2 +
 frame/include/bli_type_defs.h                 |  574 +--
 frame/include/blis.h                          |    4 +-
 frame/include/level0/bb/bli_scal2bbs_mxn.h    |   40 +-
 frame/include/level0/bli_axpbys_mxn.h         |  129 +
 frame/include/level0/bli_copys_mxn.h          |   10 +-
 frame/include/level0/bli_scal2s_mxn.h         |   71 +-
 frame/include/level0/bli_xpbys_mxn.h          |   12 +-
 frame/thread/bli_thread_range.c               |  432 +-
 frame/thread/bli_thread_range.h               |   52 +-
 frame/thread/bli_thread_range_slab_rr.c       |   12 +-
 frame/thread/bli_thread_range_slab_rr.h       |   53 +-
 frame/thread/bli_thread_range_tlb.c           |    2 +
 frame/thread/bli_thrinfo.c                    |   43 +-
 frame/thread/bli_thrinfo.h                    |   22 +-
 frame/util/bli_util_fpa.c                     |   10 +-
 frame/util/bli_util_fpa.h                     |    4 +-
 frame/util/bli_util_ft.h                      |   26 +-
 frame/util/bli_util_oapi.c                    |   44 +-
 frame/util/bli_util_oapi.h                    |    6 +-
 frame/util/bli_util_tapi.c                    |   36 +-
 frame/util/bli_util_tapi.h                    |   14 +-
 frame/util/bli_util_unb_var1.c                |   32 +-
 ..._8xk.c => bli_dpackm_armsve256_int_8x10.c} |   76 +-
 .../armsve/1m/bli_dpackm_armsve512_asm_10xk.c |  365 --
 ...6xk.c => bli_dpackm_armsve512_asm_16x10.c} |  351 +-
 .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c  |    2 +-
 .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c  |    2 +-
 .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c  |    2 +-
 .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c  |    2 +-
 kernels/armsve/bli_kernels_armsve.h           |    6 +-
 kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c   |   12 +-
 kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c   |    4 +-
 ...int_d8xk.c => bli_packm_armv8a_int_d6x8.c} |  257 +-
 kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c |  324 --
 ...t_s12xk.c => bli_packm_armv8a_int_s8x12.c} |  310 +-
 kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c |  374 --
 kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c   |   12 +-
 kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c  |   22 +-
 .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c   |    6 +-
 .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c   |    8 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c   |    2 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c   |    2 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c   |    2 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c   |    2 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c   |    2 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c   |    2 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c   |    4 +-
 .../3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c   |    2 +-
 .../sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c |    2 +-
 .../sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c |    2 +-
 .../sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c |    2 +-
 .../sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c |    2 +-
 .../d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c   |    2 +-
 .../d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c   |    2 +-
 kernels/armv8a/bli_kernels_armv8a.h           |    6 +-
 kernels/bgq/1f/bli_axpyf_bgq_int.c            |   16 +-
 kernels/bgq/3/bli_gemm_bgq_int_8x8.c          |    4 +-
 .../3/bli_gemm_bulldozer_asm_d4x6_fma4.c      |    8 +-
 ...sm_c8xk.c => bli_packm_haswell_asm_c3x8.c} |  314 +-
 .../haswell/1m/bli_packm_haswell_asm_c3xk.c   |  397 --
 ...sm_d8xk.c => bli_packm_haswell_asm_d6x8.c} |  318 +-
 .../haswell/1m/bli_packm_haswell_asm_d6xk.c   |  401 --
 ..._s16xk.c => bli_packm_haswell_asm_s6x16.c} |  358 +-
 .../haswell/1m/bli_packm_haswell_asm_s6xk.c   |  441 --
 ...sm_z4xk.c => bli_packm_haswell_asm_z3x4.c} |  318 +-
 .../haswell/1m/bli_packm_haswell_asm_z3xk.c   |  401 --
 kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c |    8 +-
 kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c |    8 +-
 .../3/bli_gemmtrsm_l_haswell_asm_d6x8.c       |    4 +-
 .../3/bli_gemmtrsm_u_haswell_asm_d6x8.c       |    4 +-
 .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c  |    6 +-
 .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c  |    8 +-
 .../3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c |   10 +-
 .../3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c |    8 +-
 .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c  |    8 +-
 .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c  |   12 +-
 .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c |   12 +-
 .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c |   12 +-
 .../sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c |    2 +-
 .../d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c    |    8 +-
 .../d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c    |    8 +-
 .../d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c    |    6 +-
 .../d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c    |    6 +-
 .../d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c    |   12 +-
 .../d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c    |   12 +-
 .../d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c    |   12 +-
 .../d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c    |   12 +-
 .../s6x16/bli_gemmsup_r_haswell_ref_sMx1.c    |    2 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c   |    8 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c  |    6 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c  |    6 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c   |    8 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c   |    6 +-
 .../s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c   |    6 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c  |   12 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c  |   12 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c   |   12 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c   |   12 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c   |   14 +-
 .../s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c   |   12 +-
 kernels/haswell/bli_kernels_haswell.h         |   15 +-
 kernels/knc/3/bli_dgemm_knc_asm_30x8.c        |    2 +-
 kernels/knc/3/bli_sgemm_knc_asm_30x16.c       |    4 +-
 kernels/knl/1m/bli_dpackm_knl_asm_24x8.c      |  494 +-
 kernels/knl/1m/bli_spackm_knl_asm_24x16.c     |  414 +-
 kernels/knl/3/bli_dgemm_knl_asm_24x8.c        |    2 +-
 kernels/knl/3/bli_sgemm_knl_asm_24x16.c       |    2 +-
 kernels/knl/bli_kernels_knl.h                 |    8 +-
 kernels/penryn/1f/bli_axpyf_penryn_int.c      |    8 +-
 kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c  |    8 +-
 kernels/penryn/1f/bli_dotxf_penryn_int.c      |   18 +-
 kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c   |    4 +-
 .../penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c |    2 +-
 .../penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c |    2 +-
 kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c |    2 +-
 kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c |    2 +-
 .../3/bli_gemm_piledriver_asm_d8x3.c          |    8 +-
 kernels/power10/3/bli_dgemm_power10_mma.c     |    2 +-
 kernels/power10/3/bli_i16gemm_power10_mma.c   |    2 +-
 kernels/power10/3/bli_i16sgemm_power10_mma.c  |    2 +-
 kernels/power10/3/bli_i4gemm_power10_mma.c    |    2 +-
 kernels/power10/3/bli_i8gemm_power10_mma.c    |    2 +-
 kernels/power10/3/bli_sbgemm_power10_mma.c    |    2 +-
 kernels/power10/3/bli_sgemm_power10_mma.c     |    2 +-
 kernels/power10/3/bli_shgemm_power10_mma.c    |    2 +-
 kernels/power7/3/bli_gemm_power7_int_8x4.c    |    8 +-
 kernels/power9/3/bli_gemm_power9_asm_d12x6.c  |    2 +-
 kernels/rviv/3/bli_cgemm_rviv_4vx4.c          |    2 +-
 kernels/rviv/3/bli_dgemm_rviv_4vx4.c          |    2 +-
 kernels/rviv/3/bli_sgemm_rviv_4vx4.c          |    2 +-
 kernels/rviv/3/bli_zgemm_rviv_4vx4.c          |    2 +-
 .../3/bli_gemm_sandybridge_asm_d8x4.c         |    8 +-
 .../3/bli_gemm_sandybridge_int_d8x4.c         |    2 +-
 .../1m/bli_packm_sifive_x280_asm.c            | 1465 ++++++
 .../1m/bli_packm_sifive_x280_asm_mrxk.c       |  678 ---
 .../1m/bli_packm_sifive_x280_asm_nrxk.c       |  838 ----
 .../sifive_x280/3/bli_gemm_sifive_x280_asm.c  |   91 +-
 .../bli_gemmtrsm_sifive_x280_asm.c            |    5 +-
 kernels/sifive_x280/bli_kernels_sifive_x280.h |   18 +-
 kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c    |    2 +-
 kernels/skx/3/bli_dgemm_skx_asm_16x14.c       |    2 +-
 kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c    |    2 +-
 kernels/zen/1/bli_amaxv_zen_int.c             |    4 +-
 .../sup/broken/bli_gemmsup_rv_zen_asm_c3x8.c  |   26 +-
 ref_kernels/1/bli_addv_ref.c                  |    2 +-
 ref_kernels/1/bli_amaxv_ref.c                 |    6 +-
 ref_kernels/1/bli_axpbyv_ref.c                |    2 +-
 ref_kernels/1/bli_axpyv_ref.c                 |    2 +-
 ref_kernels/1/bli_copyv_ref.c                 |    2 +-
 ref_kernels/1/bli_dotv_ref.c                  |    2 +-
 ref_kernels/1/bli_dotxv_ref.c                 |    2 +-
 ref_kernels/1/bli_invertv_ref.c               |    2 +-
 ref_kernels/1/bli_invscalv_ref.c              |    2 +-
 ref_kernels/1/bli_scal2v_ref.c                |    2 +-
 ref_kernels/1/bli_scalv_ref.c                 |    2 +-
 ref_kernels/1/bli_setv_ref.c                  |    2 +-
 ref_kernels/1/bli_subv_ref.c                  |    2 +-
 ref_kernels/1/bli_swapv_ref.c                 |    2 +-
 ref_kernels/1/bli_xpbyv_ref.c                 |    2 +-
 ref_kernels/1f/bli_axpy2v_ref.c               |    2 +-
 ref_kernels/1f/bli_axpyf_ref.c                |    2 +-
 ref_kernels/1f/bli_dotaxpyv_ref.c             |    2 +-
 ref_kernels/1f/bli_dotxaxpyf_ref.c            |    2 +-
 ref_kernels/1f/bli_dotxf_ref.c                |    2 +-
 ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c   |  232 +-
 ref_kernels/1m/bli_packm_cxc_diag_ref.c       |   96 +-
 ref_kernels/1m/bli_packm_cxc_diag_ro_ref.c    |  198 +
 ref_kernels/1m/bli_packm_cxk_1er_ref.c        |  154 +-
 ref_kernels/1m/bli_packm_cxk_ref.c            |   71 +-
 ref_kernels/1m/bli_packm_cxk_ro_ref.c         |  137 +
 ref_kernels/1m/bli_unpackm_cxk_ref.c          |   61 +-
 ref_kernels/3/bli_gemm_ref.c                  |   77 +-
 ref_kernels/3/bli_gemmsup_ref.c               |    8 +-
 ref_kernels/3/bli_gemmtrsm_ref.c              |    6 +-
 ref_kernels/3/bli_trsm_ref.c                  |    8 +-
 ref_kernels/bli_cntx_ref.c                    |  493 +-
 ref_kernels/ind/bli_gemm1m_ref.c              |  206 +-
 ref_kernels/ind/bli_gemm_ccr_ref.c            |  174 +
 ref_kernels/ind/bli_gemm_crr_ref.c            |  135 +
 ref_kernels/ind/bli_gemm_rcc_ref.c            |  132 +
 ref_kernels/ind/bli_gemmtrsm1m_ref.c          |  573 ++-
 ref_kernels/ind/bli_trsm1m_ref.c              |  471 --
 sandbox/gemmlike/bli_gemm_ex.c                |   29 +-
 sandbox/gemmlike/bli_sandbox.h                |    5 +-
 sandbox/gemmlike/bls_gemm.c                   |   45 +-
 sandbox/gemmlike/bls_gemm.h                   |    2 +-
 sandbox/gemmlike/bls_gemm_bp_var1.c           |  579 +--
 sandbox/gemmlike/bls_gemm_var.h               |   83 +-
 sandbox/gemmlike/bls_l3_packm.c               |  192 +
 .../gemmlike/bls_l3_packm.h                   |   54 +-
 sandbox/gemmlike/bls_l3_packm_a.c             |  291 --
 sandbox/gemmlike/bls_l3_packm_a.h             |  101 -
 sandbox/gemmlike/bls_l3_packm_b.c             |  291 --
 sandbox/gemmlike/bls_l3_packm_b.h             |  101 -
 sandbox/gemmlike/bls_l3_packm_var.h           |   49 +-
 sandbox/gemmlike/bls_l3_packm_var1.c          |  277 +-
 sandbox/gemmlike/bls_l3_packm_var2.c          |  239 -
 sandbox/gemmlike/bls_l3_packm_var3.c          |  195 -
 sandbox/gemmlike/bls_packm_cxk.c              |  160 -
 test/thread_ranges/test_ranges.c              |   13 +-
 testsuite/check-blistest.sh                   |   26 +-
 testsuite/input.general.mixed                 |    2 +-
 testsuite/input.operations.mixed              |   34 +-
 testsuite/src/test_amaxv.c                    |    4 +-
 testsuite/src/test_gemm.c                     |  378 +-
 testsuite/src/test_gemm_ukr.c                 |    4 +-
 testsuite/src/test_gemmt.c                    |   98 +-
 testsuite/src/test_gemmtrsm_ukr.c             |    4 +-
 testsuite/src/test_hemm.c                     |  108 +-
 testsuite/src/test_her2k.c                    |  106 +-
 testsuite/src/test_herk.c                     |   90 +-
 testsuite/src/test_libblis.c                  |  859 ++--
 testsuite/src/test_libblis.h                  |    4 +-
 testsuite/src/test_randm.c                    |    6 +-
 testsuite/src/test_symm.c                     |  108 +-
 testsuite/src/test_syr2k.c                    |  109 +-
 testsuite/src/test_syrk.c                     |   94 +-
 testsuite/src/test_trmm.c                     |   94 +-
 testsuite/src/test_trmm3.c                    |  110 +-
 testsuite/src/test_trsm_ukr.c                 |    4 +-
 571 files changed, 19427 insertions(+), 23297 deletions(-)
 create mode 100644 build/plugin/Makefile
 rename frame/3/bli_l3_direct.h => build/plugin/bli_kernel_defs_zen3.h (75%)
 create mode 100644 build/plugin/bli_plugin.h.in
 create mode 100644 build/plugin/bli_plugin_init_ref.c
 rename frame/3/trmm/bli_trmm_xx_ker_var2b.c => build/plugin/bli_plugin_init_zen3.c (52%)
 rename frame/3/trsm/bli_trsm_xx_ker_var2.c => build/plugin/bli_plugin_register.c (55%)
 create mode 100644 build/plugin/config.mk.in
 rename frame/3/hemm/bli_hemm.h => build/plugin/my_kernel_1_ref.c (77%)
 rename frame/3/hemm/bli_hemm_front.h => build/plugin/my_kernel_1_zen3.c (78%)
 rename sandbox/gemmlike/bls_packm_cxk.h => build/plugin/my_kernel_2_ref.c (66%)
 delete mode 100644 frame/1m/packm/bli_packm_part.c
 delete mode 100644 frame/1m/packm/bli_packm_part.h
 delete mode 100644 frame/1m/packm/bli_packm_struc_cxk_md.c
 delete mode 100644 frame/1m/packm/bli_packm_struc_cxk_md.h
 delete mode 100644 frame/3/bli_l3_blocksize.c
 delete mode 100644 frame/3/bli_l3_blocksize.h
 delete mode 100644 frame/3/bli_l3_cntl.c
 delete mode 100644 frame/3/bli_l3_cntl.h
 delete mode 100644 frame/3/bli_l3_direct.c
 delete mode 100644 frame/3/bli_l3_int.c
 delete mode 100644 frame/3/bli_l3_schema.c
 rename frame/3/{gemm/bli_gemm_front.h => bli_l3_util.c} (74%)
 rename frame/3/{gemmt/bli_gemmt_front.h => bli_l3_util.h} (92%)
 delete mode 100644 frame/3/gemm/bli_gemm_front.c
 delete mode 100644 frame/3/gemm/bli_gemm_md.c
 delete mode 100644 frame/3/gemm/bli_gemm_md.h
 delete mode 100644 frame/3/gemm/bli_gemm_md_c2r_ref.c
 delete mode 100644 frame/3/gemm/bli_gemm_md_c2r_ref.h
 delete mode 100644 frame/3/gemm/ind/bli_gemm_ind_opt.h
 delete mode 100644 frame/3/gemmt/bli_gemmt_front.c
 delete mode 100644 frame/3/gemmt/bli_gemmt_x_ker_var2.c
 delete mode 100644 frame/3/hemm/bli_hemm_front.c
 delete mode 100644 frame/3/symm/bli_symm.h
 delete mode 100644 frame/3/symm/bli_symm_front.c
 delete mode 100644 frame/3/symm/bli_symm_front.h
 delete mode 100644 frame/3/trmm/bli_trmm_front.c
 delete mode 100644 frame/3/trmm/bli_trmm_front.h
 delete mode 100644 frame/3/trmm/bli_trmm_xx_ker_var2.c
 delete mode 100644 frame/3/trmm3/bli_trmm3.h
 delete mode 100644 frame/3/trmm3/bli_trmm3_front.c
 delete mode 100644 frame/3/trmm3/bli_trmm3_front.h
 delete mode 100644 frame/3/trsm/bli_trsm_front.c
 rename frame/{3/gemmt/bli_gemmt_x_ker_var2b.c => base/bli_part_cntl.c} (73%)
 create mode 100644 frame/base/bli_part_cntl.h
 create mode 100644 frame/base/bli_stack.c
 rename frame/{3/trsm/bli_trsm_front.h => base/bli_stack.h} (67%)
 create mode 100644 frame/include/bli_gentconf_macro_defs.h
 create mode 100644 frame/include/level0/bli_axpbys_mxn.h
 rename kernels/armsve/1m/{bli_dpackm_armsve256_int_8xk.c => bli_dpackm_armsve256_int_8x10.c} (86%)
 delete mode 100644 kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
 rename kernels/armsve/1m/{bli_dpackm_armsve512_asm_16xk.c => bli_dpackm_armsve512_asm_16x10.c} (52%)
 rename kernels/armv8a/1m/{bli_packm_armv8a_int_d8xk.c => bli_packm_armv8a_int_d6x8.c} (63%)
 delete mode 100644 kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c
 rename kernels/armv8a/1m/{bli_packm_armv8a_int_s12xk.c => bli_packm_armv8a_int_s8x12.c} (63%)
 delete mode 100644 kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c
 rename kernels/haswell/1m/{bli_packm_haswell_asm_c8xk.c => bli_packm_haswell_asm_c3x8.c} (60%)
 delete mode 100644 kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c
 rename kernels/haswell/1m/{bli_packm_haswell_asm_d8xk.c => bli_packm_haswell_asm_d6x8.c} (59%)
 delete mode 100644 kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c
 rename kernels/haswell/1m/{bli_packm_haswell_asm_s16xk.c => bli_packm_haswell_asm_s6x16.c} (64%)
 delete mode 100644 kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c
 rename kernels/haswell/1m/{bli_packm_haswell_asm_z4xk.c => bli_packm_haswell_asm_z3x4.c} (59%)
 delete mode 100644 kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c
 create mode 100644 kernels/sifive_x280/1m/bli_packm_sifive_x280_asm.c
 delete mode 100644 kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c
 delete mode 100644 kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c
 create mode 100644 ref_kernels/1m/bli_packm_cxc_diag_ro_ref.c
 create mode 100644 ref_kernels/1m/bli_packm_cxk_ro_ref.c
 create mode 100644 ref_kernels/ind/bli_gemm_ccr_ref.c
 create mode 100644 ref_kernels/ind/bli_gemm_crr_ref.c
 create mode 100644 ref_kernels/ind/bli_gemm_rcc_ref.c
 delete mode 100644 ref_kernels/ind/bli_trsm1m_ref.c
 create mode 100644 sandbox/gemmlike/bls_l3_packm.c
 rename frame/3/bli_l3_schema.h => sandbox/gemmlike/bls_l3_packm.h (54%)
 delete mode 100644 sandbox/gemmlike/bls_l3_packm_a.c
 delete mode 100644 sandbox/gemmlike/bls_l3_packm_a.h
 delete mode 100644 sandbox/gemmlike/bls_l3_packm_b.c
 delete mode 100644 sandbox/gemmlike/bls_l3_packm_b.h
 delete mode 100644 sandbox/gemmlike/bls_l3_packm_var2.c
 delete mode 100644 sandbox/gemmlike/bls_l3_packm_var3.c
 delete mode 100644 sandbox/gemmlike/bls_packm_cxk.c

diff --git a/Makefile b/Makefile
index 813554f57..37f6766a0 100644
--- a/Makefile
+++ b/Makefile
@@ -321,7 +321,31 @@ endif
 
 # Define a list of makefile fragments to install.
 FRAGS_TO_INSTALL := $(CONFIG_MK_FILE) \
-                    $(COMMON_MK_FILE)
+                    $(COMMON_MK_FILE) \
+                    $(DIST_PATH)/build/gen-make-frags/gen-make-frag.sh \
+                    $(DIST_PATH)/build/gen-make-frags/fragment.mk \
+                    $(DIST_PATH)/build/gen-make-frags/ignore_list \
+                    $(DIST_PATH)/build/gen-make-frags/special_list \
+                    $(DIST_PATH)/build/gen-make-frags/suffix_list \
+                    $(DIST_PATH)/build/flatten-headers.py \
+                    $(DIST_PATH)/build/mirror-tree.sh \
+                    $(DIST_PATH)/config_registry \
+                    $(DIST_PATH)/build/detect/iset/avx.s \
+                    $(DIST_PATH)/build/detect/iset/avx512dq.s \
+                    $(DIST_PATH)/build/detect/iset/avx512f.s \
+                    $(DIST_PATH)/build/detect/iset/fma3.s \
+                    $(DIST_PATH)/build/detect/iset/fma4.s
+
+# Define a list of plugin template files (sources, header/config templates, Makefile) to install.
+PLUGIN_FRAGS_TO_INSTALL := $(DIST_PATH)/build/plugin/bli_plugin_init_ref.c \
+                           $(DIST_PATH)/build/plugin/bli_plugin_init_zen3.c \
+                           $(DIST_PATH)/build/plugin/bli_plugin_register.c \
+                           $(DIST_PATH)/build/plugin/my_kernel_1_ref.c \
+                           $(DIST_PATH)/build/plugin/my_kernel_2_ref.c \
+                           $(DIST_PATH)/build/plugin/my_kernel_1_zen3.c \
+                           $(DIST_PATH)/build/plugin/bli_plugin.h.in \
+                           $(DIST_PATH)/build/plugin/config.mk.in \
+                           $(DIST_PATH)/build/plugin/Makefile
 
 PC_IN_FILE  := blis.pc.in
 PC_OUT_FILE := blis.pc
@@ -1085,21 +1109,47 @@ $(foreach h, $(HELP_HEADERS_TO_INSTALL), $(eval $(call make-helper-header-rule,$
 
 install-share: check-env $(MK_SHARE_DIR_INST) $(PC_SHARE_DIR_INST)
 
-$(MK_SHARE_DIR_INST): $(FRAGS_TO_INSTALL) $(CONFIG_MK_FILE)
+$(MK_SHARE_DIR_INST): $(CONFIGURE_FILE) $(FRAGS_TO_INSTALL) $(PLUGIN_FRAGS_TO_INSTALL) $(CONFIG_DIR)/$(CONFIG_NAME)/$(MAKE_DEFS_FILE)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(MKDIR) $(@)
-	$(INSTALL) -m 0644 $(FRAGS_TO_INSTALL) $(@)
-	$(MKDIR) -p $(@)/$(CONFIG_DIR)/$(CONFIG_NAME)
-	$(INSTALL) -m 0644 $(CONFIG_DIR)/$(CONFIG_NAME)/$(MAKE_DEFS_FILE) \
-	              $(@)/$(CONFIG_DIR)/$(CONFIG_NAME)
+	$(MKDIR) $(@)/plugin
+	$(INSTALL) -m 0755 $(filter %.sh,$(FRAGS_TO_INSTALL)) $(@)
+	$(INSTALL) -m 0644 $(filter-out %.sh,$(FRAGS_TO_INSTALL)) $(@)
+	$(INSTALL) -m 0644 $(PLUGIN_FRAGS_TO_INSTALL) $(@)/plugin
+	$(INSTALL) -m 0755 $(CONFIGURE_FILE) $(@)/configure-plugin
+#	$(MKDIR) -p $(@)/$(CONFIG_DIR)/$(CONFIG_NAME)
+#	$(INSTALL) -m 0644 $(CONFIG_DIR)/$(CONFIG_NAME)/$(MAKE_DEFS_FILE) \
+#	              $(@)/$(CONFIG_DIR)/$(CONFIG_NAME)
+	for THIS_CONFIG in $(FULL_CONFIG_LIST); do \
+		$(MKDIR) -p $(@)/$(CONFIG_DIR)/$$THIS_CONFIG; \
+		$(INSTALL) -m 0644 $(CONFIG_DIR)/$$THIS_CONFIG/$(MAKE_DEFS_FILE) \
+		              $(@)/$(CONFIG_DIR)/$$THIS_CONFIG; \
+		$(INSTALL) -m 0644 $(CONFIG_DIR)/$$THIS_CONFIG/bli_kernel_defs_$$THIS_CONFIG.h \
+		              $(@)/$(CONFIG_DIR)/$$THIS_CONFIG; \
+	done
 else
 	@$(MKDIR) $(@)
+	@$(MKDIR) $(@)/plugin
 	@echo "Installing $(notdir $(FRAGS_TO_INSTALL)) into $(@)/"
-	@$(INSTALL) -m 0644 $(FRAGS_TO_INSTALL) $(@)
-	@$(MKDIR) -p $(@)/$(CONFIG_DIR)/$(CONFIG_NAME)
-	@echo "Installing $(CONFIG_DIR)/$(CONFIG_NAME)/$(MAKE_DEFS_FILE) into $(@)/$(CONFIG_DIR)/$(CONFIG_NAME)"
-	@$(INSTALL) -m 0644 $(CONFIG_DIR)/$(CONFIG_NAME)/$(MAKE_DEFS_FILE) \
-	               $(@)/$(CONFIG_DIR)/$(CONFIG_NAME)/
+	@$(INSTALL) -m 0755 $(filter %.sh,$(FRAGS_TO_INSTALL)) $(@)
+	@$(INSTALL) -m 0644 $(filter-out %.sh,$(FRAGS_TO_INSTALL)) $(@)
+	@echo "Installing $(notdir $(PLUGIN_FRAGS_TO_INSTALL)) into $(@)/plugin/"
+	@$(INSTALL) -m 0644 $(PLUGIN_FRAGS_TO_INSTALL) $(@)/plugin
+	@echo "Installing $(CONFIGURE_FILE) into $(@)/configure-plugin"
+	@$(INSTALL) -m 0755 $(CONFIGURE_FILE) $(@)/configure-plugin
+#	@$(MKDIR) -p $(@)/$(CONFIG_DIR)/$(CONFIG_NAME)
+#	@echo "Installing $(CONFIG_DIR)/$(CONFIG_NAME)/$(MAKE_DEFS_FILE) into $(@)/$(CONFIG_DIR)/$(CONFIG_NAME)"
+#	@$(INSTALL) -m 0644 $(CONFIG_DIR)/$(CONFIG_NAME)/$(MAKE_DEFS_FILE) \
+#	               $(@)/$(CONFIG_DIR)/$(CONFIG_NAME)/
+	@for THIS_CONFIG in $(FULL_CONFIG_LIST); do \
+		$(MKDIR) -p $(@)/$(CONFIG_DIR)/$$THIS_CONFIG; \
+		echo "Installing $(CONFIG_DIR)/$$THIS_CONFIG/$(MAKE_DEFS_FILE) into $(@)/$(CONFIG_DIR)/$$THIS_CONFIG"; \
+		$(INSTALL) -m 0644 $(CONFIG_DIR)/$$THIS_CONFIG/$(MAKE_DEFS_FILE) \
+		              $(@)/$(CONFIG_DIR)/$$THIS_CONFIG; \
+		echo "Installing $(CONFIG_DIR)/$$THIS_CONFIG/bli_kernel_defs_$$THIS_CONFIG.h into $(@)/$(CONFIG_DIR)/$$THIS_CONFIG"; \
+		$(INSTALL) -m 0644 $(CONFIG_DIR)/$$THIS_CONFIG/bli_kernel_defs_$$THIS_CONFIG.h \
+		              $(@)/$(CONFIG_DIR)/$$THIS_CONFIG; \
+	done
 endif
 
 $(PC_SHARE_DIR_INST): $(PC_IN_FILE)
diff --git a/build/bli_config.h.in b/build/bli_config.h.in
index 8a6baee87..634ad0a42 100644
--- a/build/bli_config.h.in
+++ b/build/bli_config.h.in
@@ -146,26 +146,6 @@
 #endif
 #endif
 
-#ifndef BLIS_ENABLE_MIXED_DT
-#ifndef BLIS_DISABLE_MIXED_DT
-#if @enable_mixed_dt@
-#define BLIS_ENABLE_MIXED_DT
-#else
-#define BLIS_DISABLE_MIXED_DT
-#endif
-#endif
-#endif
-
-#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM
-#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
-#if @enable_mixed_dt_extra_mem@
-#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM
-#else
-#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM
-#endif
-#endif
-#endif
-
 #if @enable_sup_handling@
 #define BLIS_ENABLE_SUP_HANDLING
 #else
diff --git a/build/config.mk.in b/build/config.mk.in
index 4624220cf..dcc6f1b55 100644
--- a/build/config.mk.in
+++ b/build/config.mk.in
@@ -53,7 +53,9 @@ CONFIG_NAME       := @config_name@
 # sub-configuration in CONFIG_LIST corresponds to a configuration
 # sub-directory in the 'config' directory. See the 'config_registry'
 # file for the full list of registered configurations.
-CONFIG_LIST       := @config_list@
+CONFIG_LIST         := @config_list@
+FULL_CONFIG_LIST    := @full_config_list@
+FULL_SUBCONFIG_LIST := @full_subconfig_list@
 
 # This list of kernels needed for the configurations in CONFIG_LIST.
 # Each item in this list corresponds to a sub-directory in the top-level
@@ -62,6 +64,7 @@ CONFIG_LIST       := @config_list@
 # kernel set X, and configuration W uses kernel set Q, and the CONFIG_LIST
 # might contained "X Y Z W", then the KERNEL_LIST would contain "X Z Q".
 KERNEL_LIST       := @kernel_list@
+FULL_KERNEL_LIST  := @full_kernel_list@
 
 # This list contains some number of "kernel:config" pairs, where "config"
 # specifies which configuration's compilation flags (CFLAGS) should be
@@ -101,9 +104,12 @@ CLANG_OT_12_0_0   := @clang_older_than_12_0_0@
 AOCC_OT_2_0_0     := @aocc_older_than_2_0_0@
 AOCC_OT_3_0_0     := @aocc_older_than_3_0_0@
 
-# The C++ compiler. NOTE: A C++ is typically not needed.
+# The C++ compiler. NOTE: A C++ compiler is typically not needed.
 CXX               := @CXX@
 
+# The Fortran compiler. NOTE: A Fortran compiler is typically not needed.
+FC                := @FC@
+
 # Static library indexer.
 RANLIB            := @RANLIB@
 
@@ -113,12 +119,13 @@ AR                := @AR@
 # Python Interpreter
 PYTHON            := @PYTHON@
 
-# Preset (required) CFLAGS and LDFLAGS. These variables capture the value
-# of the CFLAGS and LDFLAGS environment variables at configure-time (and/or
-# the value of CFLAGS/LDFLAGS if either was specified on the command line).
+# Preset (required) CFLAGS, CXXFLAGS, and LDFLAGS. These variables capture the value
+# of the CFLAGS, CXXFLAGS, and LDFLAGS environment variables at configure-time (and/or
+# the value of CFLAGS/CXXFLAGS/LDFLAGS if any was specified on the command line).
 # These flags are used in addition to the flags automatically determined
 # by the build system.
 CFLAGS_PRESET     := @cflags_preset@
+CXXFLAGS_PRESET   := @cxxflags_preset@
 LDFLAGS_PRESET    := @ldflags_preset@
 
 # The level of debugging info to generate.
@@ -129,7 +136,7 @@ ENABLE_DEBUG      := @enable_debug@
 MK_ENABLE_ASAN    := @enable_asan@
 
 # Whether operating system support was requested via --enable-system.
-ENABLE_SYSTEM     := @enable_system@
+ENABLE_SYSTEM     := @mk_enable_system@
 
 # The requested threading model(s).
 THREADING_MODEL   := @threading_model@
@@ -179,8 +186,8 @@ ARG_MAX_HACK      := @enable_arg_max_hack@
 # Whether to build the static and shared libraries.
 # NOTE: The "MK_" prefix, which helps differentiate these variables from
 # their corresonding cpp macros that use the BLIS_ prefix.
-MK_ENABLE_STATIC  := @enable_static@
-MK_ENABLE_SHARED  := @enable_shared@
+MK_ENABLE_STATIC  := @mk_enable_static@
+MK_ENABLE_SHARED  := @mk_enable_shared@
 
 # Whether to use an install_name based on @rpath.
 MK_ENABLE_RPATH   := @enable_rpath@
@@ -190,11 +197,11 @@ MK_ENABLE_RPATH   := @enable_rpath@
 EXPORT_SHARED     := @export_shared@
 
 # Whether to enable either the BLAS or CBLAS compatibility layers.
-MK_ENABLE_BLAS    := @enable_blas@
-MK_ENABLE_CBLAS   := @enable_cblas@
+MK_ENABLE_BLAS    := @mk_enable_blas@
+MK_ENABLE_CBLAS   := @mk_enable_cblas@
 
 # Whether libblis will depend on libmemkind for certain memory allocations.
-MK_ENABLE_MEMKIND := @enable_memkind@
+MK_ENABLE_MEMKIND := @mk_enable_memkind@
 
 # The names of the addons to include when building BLIS. If empty, no addons
 # will be included.
diff --git a/build/gen-make-frags/gen-make-frag.sh b/build/gen-make-frags/gen-make-frag.sh
index e411fa8d9..e826db068 100755
--- a/build/gen-make-frags/gen-make-frag.sh
+++ b/build/gen-make-frags/gen-make-frag.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 #
-#  BLIS    
+#  BLIS
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
 #
@@ -42,10 +42,10 @@
 print_usage()
 {
 	#local script_name
-	
+
 	# Get the script name
 	#script_name=${0##*/}
-	
+
 	# Echo usage info
 	echo " "
 	echo " "$script_name
@@ -100,7 +100,7 @@ print_usage()
 	echo "                 level 1: default (one line per directory)"
 	echo "                 level 2: verbose (several lines per directory)."
 	echo " "
-	
+
 	# Exit with non-zero exit status
 	exit 1
 }
@@ -123,24 +123,28 @@ gen_mkfile()
 	local mkfile_frag_var_name
 	local this_dir
 	local this_frag_dir
-	local mkfile_frag_tmpl_name 
-	local mkfile_name 
+	local mkfile_frag_tmpl_name
+	local mkfile_name
 	local mkfile_frag_path
-	local cur_frag_dir 
+	local cur_frag_dir
 	local cur_frag_path
 	local local_src_files
 	local sub_items
 	local item_path
 	local item_suffix
 	local cur_frag_sub_dirs
-	
-	
+
+
 	# Extract our arguments to local variables
 	mkfile_frag_var_name=$1
 	this_dir=$2
 	this_frag_dir=$3
-	
-	
+
+
+	# Make sure the target directory exists
+	mkdir -p $this_frag_dir
+
+
 	# Strip the leading path from the template makefile path to get its
 	# simple filename. Hide the output makefile fragment filename, if
 	# requested.
@@ -150,62 +154,62 @@ gen_mkfile()
 	else
 		mkfile_frag_path=$this_frag_dir/$mkfile_frag_tmpl_name
 	fi
-	
-	
+
+
 	# Determine the directory in which the fragment will reside.
 	cur_frag_path=$this_dir
 	cur_frag_dir=${this_dir##*/}
-	
-	
+
+
 	# Initialize the local source list to empty
 	local_src_files=""
-	
+
 	# Get a listing of the items in $this_dir
 	sub_items=$(ls $this_dir)
-	
+
 	# Generate a list of the source files we've chosen
 	for item in $sub_items; do
-		
+
 		# Prepend the directory to the item to get a relative path
 		item_path=$this_dir/$item
-		
+
 		# Acquire the item's suffix, if it has one
 		item_suffix=${item_path##*.}
-		
+
 		# If the suffix matches, then add it to our list
 		if is_in_list $item_suffix "$src_file_suffixes"
 		then
 			local_src_files="$local_src_files $item"
 		fi
 	done
-	
+
 	# Delete the leading " " space character in the local source files list.
 	local_src_files=${local_src_files##" "}
-	
-	
+
+
 	# Initialize the fragment subdirectory list to empty
 	cur_frag_sub_dirs=""
-	
+
 	# Capture the relative path listing of items in $this_dir.
 	sub_items=$(ls $this_dir)
-	
+
 	# Determine the fragment's subdirectory names, if any exist
 	for item in $sub_items; do
-		
+
 		# Prepend the directory to the item to get a relative path
 		item_path=$this_dir/$item
-		
+
 		# If item is a directory, and it's not in the ignore list, descend into it.
 		#if [ -d $item_path ] && ! should_ignore $item; then
-		if [ -d $item_path ] && ! is_in_list $item "$ignore_dirs" ; then
+		if [ "$recursive_flag" = "1" ] && [ -d $item_path ] && ! is_in_list $item "$ignore_dirs" ; then
 			cur_frag_sub_dirs=$cur_frag_sub_dirs" "$item
 		fi
 	done
-	
+
 	# Delete the leading " " space character in fragment's subdirectory list.
 	cur_frag_sub_dirs=${cur_frag_sub_dirs##" "}
-	
-	
+
+
 	# Be verbose, if level 2 was requested.
 	if [ "$verbose_flag" = "2" ]; then
 		echo "mkf frag tmpl path: $mkfile_frag_tmpl_path"
@@ -218,8 +222,8 @@ gen_mkfile()
 		echo "mkf frag var name:  $mkfile_frag_var_name"
 		echo "--------------------------------------------------"
 	fi
-	
-	
+
+
 	# Copy the template makefile to the directory given, using the new
 	# makefile name we just created above.
 	if [ -z "$dry_run_flag" ]; then
@@ -229,8 +233,8 @@ gen_mkfile()
 		                           | sed -e s/"$mkfile_fragment_src_var_name_anchor"/"$mkfile_frag_var_name"/g \
 		                           > $mkfile_frag_path
 	fi
-	
-	
+
+
 	# Return peacefully.
 	return 0
 }
@@ -239,59 +243,59 @@ gen_mkfile()
 #
 # gen_mkfiles
 #
-# Recursively generates makefile fragments for a directory and all 
+# Recursively generates makefile fragments for a directory and all
 # subdirectories. All of the actual work happens in gen_mkfile().
 #
 gen_mkfiles()
 {
 	# Local variable declarations
 	local item sub_items cur_dir this_frag_dir this_dir
-	
-	
+
+
 	# Extract our argument
 	cur_dir=$1
 	this_frag_dir=$2
-	
-	
+
+
 	# Append a relevant suffix to the makefile variable name, if necesary
 	# NOTE: This step is disabled because special directories are presently
 	# ignored when generating makefile variable names.
 	#all_add_src_var_name "$cur_dir"
-	
-	
+
+
 	# Be verbose if level 2 was requested
 	if   [ "$verbose_flag" = "2" ]; then
 		echo ">>>" $script_name ${src_var_name}_$SRC $cur_dir $this_frag_dir
 	elif [ "$verbose_flag" = "1" ]; then
 		echo "$script_name: creating makefile fragment in $this_frag_dir from $cur_dir"
 	fi
-	
-	
+
+
 	# Call our function to generate a makefile in the directory given.
 	gen_mkfile "${src_var_name}_$SRC" $cur_dir $this_frag_dir
-	
-	
+
+
 	# Get a listing of the directories in $directory
 	sub_items=$(ls $cur_dir)
-	
+
 	# Descend into the contents of root_dir to generate the subdirectories'
 	# makefile fragments.
 	for item in $sub_items; do
-		
+
 		# If item is a directory, and it's not in the ignore list, descend into it.
 		#if [ -d "$cur_dir/$item" ] && ! should_ignore $item; then
 		if [ -d "$cur_dir/$item" ] && ! is_in_list $item "$ignore_dirs" ; then
 			gen_mkfiles $cur_dir/$item $this_frag_dir/$item
 		fi
 	done
-	
-	
+
+
 	# Remove a relevant suffix from the makefile variable name, if necesary
 	# NOTE: This step is disabled because special directories are presently
 	# ignored when generating makefile variable names.
 	#all_del_src_var_name "$cur_dir"
-	
-	
+
+
 	# Return peacefully
 	return 0
 }
@@ -301,28 +305,28 @@ gen_mkfiles()
 #update_src_var_name_special()
 #{
 #	local dir act i name var_suffix
-#	
+#
 #	# Extract arguments.
 #	act="$1"
 #	dir="$2"
-#	
+#
 #	# Strip / from end of directory path, if there is one, and then strip
 #	# path from directory name.
 #	dir=${dir%/}
 #	dir=${dir##*/}
-#	
+#
 #	# Run through our list.
 #	# NOTE: CURRENTLY, SPECIAL DIRECTORY NAMES ARE IGNORED. In order to
 #	#       re-enable them, remove the quotes from "${special_dirs}".
 #	for specdir in "${special_dirs}"; do
-#		
+#
 #		# If the current item matches sdir, then we'll have
 #		# to make a modification of some form.
 #		if [ "$dir" = "$specdir" ]; then
-#			
+#
 #			# Convert the directory name to uppercase.
 #			var_suffix=$(echo "$dir" | tr '[:lower:]' '[:upper:]')
-#			
+#
 #			# Either add or remove the suffix, and also update the
 #			# source file suffix variable.
 #			if [ "$act" == "+" ]; then
@@ -330,7 +334,7 @@ gen_mkfiles()
 #			else
 #				src_var_name=${src_var_name%_$var_suffix}
 #			fi
-#			
+#
 #			# No need to continue iterating.
 #			break;
 #		fi
@@ -340,17 +344,17 @@ gen_mkfiles()
 #init_src_var_name()
 #{
 #	local dir="$1"
-#	
+#
 #	# Strip off the leading / if there is one
 #	dir=${dir%%/}
-#	
-#	# Convert the / directory separators into spaces to make a list of 
+#
+#	# Convert the / directory separators into spaces to make a list of
 #	# directories.
 #	list=${dir//\// }
-#	
+#
 #	# Inspect each item in $list
 #	for item in $list; do
-#		
+#
 #		# Try to initialize the source variable name
 #		all_add_src_var_name $item
 #	done
@@ -359,7 +363,7 @@ gen_mkfiles()
 #all_add_src_var_name()
 #{
 #	local dir="$1"
-#	
+#
 #	update_src_var_name_special "+" "$dir"
 #
 #}
@@ -367,7 +371,7 @@ gen_mkfiles()
 #all_del_src_var_name()
 #{
 #	local dir="$1"
-#	
+#
 #	update_src_var_name_special "-" "$dir"
 #}
 
@@ -384,7 +388,7 @@ read_mkfile_config()
 	src_file_suffixes=$(echo ${src_file_suffixes} | sed "s/\n/ /g")
 	ignore_dirs=$(echo ${ignore_dirs} | sed "s/\n/ /g")
 
-}	
+}
 
 main()
 {
@@ -395,26 +399,26 @@ main()
 	mkfile_fragment_sub_dir_names_anchor="_mkfile_fragment_sub_dir_names_"
 	mkfile_fragment_local_src_files_anchor="_mkfile_fragment_local_src_files_"
 	mkfile_fragment_src_var_name_anchor="_mkfile_fragment_src_var_name_"
-	
+
 	# The name of the script, stripped of any preceeding path.
 	script_name=${0##*/}
-	
+
 	# The prefix for all makefile variables.
 	src_var_name_prefix='MK'
 
 	# The variable that always holds the string that will be passed to
 	# gen_mkfile() as the source variable to insert into the fragment.mk.
 	src_var_name=''
-	
+
 	# The suffix appended to all makefile fragment source variables.
 	SRC='SRC'
-	
+
 	# The list of source file suffixes to add to the makefile variables.
 	src_file_suffixes=''
 
 	# The lists of directories to ignore.
 	ignore_dirs=''
-	
+
 	# The arguments to this function. They'll get assigned meaningful
 	# values after getopts.
 	root_dir=""
@@ -422,22 +426,22 @@ main()
 	mkfile_frag_tmpl_path=""
 	suffix_file=""
 	ignore_file=""
-	
+
 	# Flags set by getopts.
-	dry_run_flag=""	
+	dry_run_flag=""
 	hide_flag=""
 	recursive_flag=""
 	output_name=""
 	prefix_flag=""
 	verbose_flag=""
-	
+
 	# -- END GLOBAL VARIABLE DECLARATIONS --
 
 
 	# Local variable declarations.
 	local item sub_items this_dir
-	
-	
+
+
 	# Process our command line options.
 	while getopts ":dho:p:rv:" opt; do
 		case $opt in
@@ -451,15 +455,15 @@ main()
 		esac
 	done
 	shift $(($OPTIND - 1))
-	
-	
+
+
 	# Make sure that verboseness level is valid.
-	if [ "$verbose_flag" != "0" ] && 
-	   [ "$verbose_flag" != "1" ] && 
+	if [ "$verbose_flag" != "0" ] &&
+	   [ "$verbose_flag" != "1" ] &&
 	   [ "$verbose_flag" != "2" ]; then
 		verbose_flag="1"
 	fi
-	
+
 	# Check the number of arguments after command line option processing.
 	if [ $# != "5" ]; then
 		print_usage
@@ -469,25 +473,25 @@ main()
 	if [ -n "${output_name}" ]; then
 		script_name="${output_name}"
 	fi
-	
-	
+
+
 	# Extract our arguments.
 	root_dir=$1
 	frag_dir=$2
 	mkfile_frag_tmpl_path=$3
 	suffix_file=$4
 	ignore_file=$5
-	
-	
+
+
 	# Read the makefile config files to be used in the makefile fragment
 	# generation.
 	read_mkfile_config
-	
-	
+
+
 	# Strip / from end of directory path, if there is one.
 	root_dir=${root_dir%/}
 	frag_dir=${frag_dir%/}
-	
+
 
 	# Initialize the name of the makefile source variable.
 	if [ -n "$prefix_flag" ]; then
@@ -509,41 +513,41 @@ main()
 		root_dir_upper=$(echo "$root_dir_upper" | tr '/' '_')
 		src_var_name="${src_var_name_prefix}_${root_dir_upper}"
 	fi
-	
-	
+
+
 	# Be verbose if level 2 was requested.
 	if   [ "$verbose_flag" = "2" ]; then
 		echo ">>>" $script_name ${src_var_name}_$SRC $root_dir $frag_dir
 	elif [ "$verbose_flag" = "1" ]; then
 		echo "$script_name: creating makefile fragment in $frag_dir from $root_dir"
 	fi
-	
-	
+
+
 	# Call our function to generate a makefile in the root directory given.
 	gen_mkfile "${src_var_name}_$SRC" $root_dir $frag_dir
-	
-	
+
+
 	# If we were asked to act recursively, then continue processing
 	# root_dir's contents.
 	if [ -n "$recursive_flag" ]; then
-		
+
 		# Get a listing of the directories in $directory.
 		sub_items=$(ls $root_dir)
-		
+
 		# Descend into the contents of root_dir to generate the makefile
 		# fragments.
 		for item in $sub_items; do
-			
+
 			# If item is a directory, and it's not in the ignore list, descend into it.
 			#if [ -d "$root_dir/$item" ] && ! should_ignore $item ; then
 			if [ -d "$root_dir/$item" ] && ! is_in_list $item "$ignore_dirs" ; then
-				
+
 				gen_mkfiles $root_dir/$item $frag_dir/$item
 			fi
 		done
 	fi
-	
-	
+
+
 	# Exit peacefully.
 	return 0
 }
@@ -551,22 +555,22 @@ main()
 is_in_list()
 {
 	local cur_item the_item item_list
-	
+
 	# Extract argument.
 	the_item="$1"
 	item_list="$2"
-	
+
 	# Check each item in the list against the item of interest.
 	for cur_item in ${item_list}; do
-		
+
 		# If the current item in the list matches the one of interest.
 		if [ "${cur_item}" = "${the_item}" ]; then
-			
+
 			# Return success (ie: item was found).
 			return 0
 		fi
 	done
-	
+
 	# If we made it this far, return failure (ie: item not found).
 	return 1
 }
diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def
index c3cfbcdcb..190dea77c 100644
--- a/build/libblis-symbols.def
+++ b/build/libblis-symbols.def
@@ -629,7 +629,6 @@ bli_invscalv_ex
 bli_iprintm
 bli_iprintv
 bli_isetsc
-bli_l3_cntl_free
 bli_l3_thrinfo_create
 bli_ltesc
 bli_ltsc
diff --git a/build/plugin/Makefile b/build/plugin/Makefile
new file mode 100644
index 000000000..92554e3e8
--- /dev/null
+++ b/build/plugin/Makefile
@@ -0,0 +1,524 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#  Copyright (C) 2022, Advanced Micro Devices, Inc.
+#  Copyright (C) 2023, Southern Methodist University
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+#
+# --- Makefile PHONY target definitions ----------------------------------------
+#
+
+.PHONY: all \
+        plugin \
+        showconfig \
+        clean cleanmk cleanlib distclean \
+        check-env check-env-make-defs check-env-fragments check-env-mk
+
+
+#
+# --- Include config makefile definitions --------------------------------------
+#
+
+# Define the name of the config makefile.
+CONFIG_MK_FILE := config.mk
+
+# Include the configuration file.
+-include $(CONFIG_MK_FILE)
+
+
+#
+# --- Include common makefile definitions --------------------------------------
+#
+
+INC_PATH           := $(includedir)/blis
+
+# Define the name of the common makefile.
+COMMON_MK_FILE := $(sharedir)/blis/common.mk
+
+# Include the common makefile.
+include $(COMMON_MK_FILE)
+
+# Detect whether we actually got the configuration file. If we didn't, then
+# it is likely that the user has not yet generated it (via configure).
+ifeq ($(strip $(COMMON_MK_INCLUDED)),yes)
+COMMON_MK_PRESENT := yes
+else
+COMMON_MK_PRESENT := no
+endif
+
+# Source suffixes.
+CONFIG_SRC_SUFS   := c cxx cpp
+KERNELS_SRC_SUFS  := c cxx cpp s S
+REFKERN_SRC_SUFS  := c cxx cpp
+FRAME_SRC_SUFS    := c cxx cpp
+
+# Make sure the plugin path is included when searching for headers (e.g. bli_plugin_<name>.h).
+CINCFLAGS         += -I$(DIST_PATH)
+
+PLUGIN_A_PATH     := $(BASE_LIB_PATH)/libblis_$(PLUGIN_NAME).a
+PLUGIN_SO_PATH    := $(BASE_LIB_PATH)/libblis_$(PLUGIN_NAME).$(SHLIB_EXT)
+
+# Specify the flags used to link the plugin as a shared library.
+# NOTE: The flag for creating shared objects is different for Linux and OS X.
+LDFLAGS    += -L$(libdir) -lblis
+ifeq ($(OS_NAME),Darwin)
+# OS X shared library link flags.
+SOFLAGS    := -dynamiclib
+else
+SOFLAGS    := -shared
+endif
+
+#
+# --- Main target variable definitions -----------------------------------------
+#
+
+# --- Object file paths ---
+
+# Construct the base object file path for the current configuration.
+BASE_OBJ_PATH          := ./$(OBJ_DIR)/$(CONFIG_NAME)
+
+# Construct base object file paths corresponding to the four locations
+# of source code.
+BASE_OBJ_CONFIG_PATH   := $(BASE_OBJ_PATH)/$(CONFIG_DIR)
+BASE_OBJ_FRAME_PATH    := $(BASE_OBJ_PATH)/$(FRAME_DIR)
+BASE_OBJ_REFKERN_PATH  := $(BASE_OBJ_PATH)/$(REFKERN_DIR)
+BASE_OBJ_KERNELS_PATH  := $(BASE_OBJ_PATH)/$(KERNELS_DIR)
+
+# --- Determine which libraries to build ---
+
+MK_LIBS                   :=
+
+ifeq ($(MK_ENABLE_STATIC),yes)
+MK_LIBS                   += $(PLUGIN_A_PATH)
+endif
+ifeq ($(MK_ENABLE_SHARED),yes)
+MK_LIBS                   += $(PLUGIN_SO_PATH)
+endif
+
+#
+# --- Library object definitions -----------------------------------------------
+#
+
+# In this section, we will isolate the relevant source code filepaths and
+# convert them to lists of object filepaths. Relevant source code falls into
+# four categories: configuration source; architecture-specific kernel source;
+# reference kernel source; and general framework source.
+
+# $(call gen-obj-paths-from-src file_exts, src_files, base_src_path, base_obj_path)
+gen-obj-paths-from-src = $(foreach ch, $(1), \
+                             $(patsubst $(3)/%.$(ch), \
+                                        $(4)/%.o, \
+                                        $(filter %.$(ch), $(2)) ) )
+
+# Generate object file paths for source code found in the sub-configuration
+# directories.
+MK_CONFIG_OBJS      := $(call gen-obj-paths-from-src,$(CONFIG_SRC_SUFS),$(MK_CONFIG_SRC),$(CONFIG_PATH),$(BASE_OBJ_CONFIG_PATH))
+
+# Generate object file paths for architecture-specific kernel source code.
+# We target .c, .cxx, .cpp, .s, and .S files (see KERNELS_SRC_SUFS). Note that
+# MK_KERNELS_SRC is already limited to the kernel source corresponding to the
+# kernel sets in KERNEL_LIST. This is because the configure script only
+# propagated makefile fragments into those specific kernel subdirectories.
+MK_KERNELS_OBJS     := $(call gen-obj-paths-from-src,$(KERNELS_SRC_SUFS),$(MK_KERNELS_SRC),$(KERNELS_PATH),$(BASE_OBJ_KERNELS_PATH))
+
+# Generate object file paths for reference kernels, with one set of object
+# files for each sub-configuration in CONFIG_LIST. Note that due to the
+# nuances of naming the reference kernel files, we can't use the function
+# gen-obj-paths-from-src as we do above and below.
+MK_REFKERN_OBJS     := $(foreach suf, $(REFKERN_SRC_SUFS), \
+                           $(foreach arch, $(CONFIG_LIST), \
+                               $(patsubst $(REFKERN_PATH)/%_$(REFNM).$(suf), \
+                                     $(BASE_OBJ_REFKERN_PATH)/$(arch)/%_$(arch)_$(REFNM).o, \
+                                     $(filter %.$(suf), $(MK_REFKERN_SRC)) \
+                                ) \
+                            ) \
+                        )
+
+# Generate object file paths for all of the portable framework source code.
+MK_FRAME_OBJS       := $(call gen-obj-paths-from-src,$(FRAME_SRC_SUFS),$(MK_FRAME_SRC),$(FRAME_PATH),$(BASE_OBJ_FRAME_PATH))
+
+# Combine all of the object files into some readily-accessible variables.
+MK_PLUGIN_OBJS      := $(MK_CONFIG_OBJS) \
+                       $(MK_KERNELS_OBJS) \
+                       $(MK_REFKERN_OBJS) \
+                       $(MK_FRAME_OBJS)
+
+
+#
+# --- Targets/rules ------------------------------------------------------------
+#
+
+# --- Primary targets ---
+
+all: libs
+
+libs: plugin
+
+clean: cleanlib
+
+
+# --- Environment check rules ---
+
+check-env: check-env-make-defs check-env-fragments check-env-mk
+
+check-env-mk:
+ifeq ($(CONFIG_MK_PRESENT),no)
+	$(error Cannot proceed: config.mk not detected! Run configure first)
+endif
+
+check-env-fragments: check-env-mk
+ifeq ($(MAKEFILE_FRAGMENTS_PRESENT),no)
+	$(error Cannot proceed: makefile fragments not detected! Run configure first)
+endif
+
+check-env-make-defs: check-env-fragments
+ifeq ($(ALL_MAKE_DEFS_MK_PRESENT),no)
+	$(error Cannot proceed: Some make_defs.mk files not found or mislabeled!)
+endif
+
+
+# --- General source code / object code rules ---
+
+# FGVZ: Add support for compiling .s and .S files in 'config'/'kernels'
+# directories.
+#  - May want to add an extra foreach loop around function eval/call.
+
+# first argument: a configuration name from config_list, used to look up the
+# CFLAGS to use during compilation.
+define make-config-rule
+$(BASE_OBJ_CONFIG_PATH)/$(1)/%.o: $(CONFIG_PATH)/$(1)/%.c $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+	@mkdir -p $$(dir $$@)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CC) $(call get-config-cflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-config-text-for,$(1))
+	@$(CC) $(call get-config-cflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+endif
+
+$(BASE_OBJ_CONFIG_PATH)/$(1)/%.o: $(CONFIG_PATH)/$(1)/%.cxx $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+	@mkdir -p $$(dir $$@)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CXX) $(call get-config-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-config-cxxtext-for,$(1))
+	@$(CXX) $(call get-config-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+endif
+
+$(BASE_OBJ_CONFIG_PATH)/$(1)/%.o: $(CONFIG_PATH)/$(1)/%.cpp $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+	@mkdir -p $$(dir $$@)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CXX) $(call get-config-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-config-cxxtext-for,$(1))
+	@$(CXX) $(call get-config-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+endif
+endef
+
+# first argument: a kernel set (name) being targeted (e.g. haswell).
+# The 'trailing' % is important so that these are technically pattern rules and the appropriate one can be
+# selected based on the suffix of bli_cntx_ref.
+define make-refinit-rule
+$(BASE_OBJ_REFKERN_PATH)/$(1)/bli_cntx_$(1)_ref%.o: $(REFKERN_PATH)/bli_cntx_ref%.c $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+	@mkdir -p $$(dir $$@)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CC) $(call get-refinit-cflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-refinit-text-for,$(1))
+	@$(CC) $(call get-refinit-cflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+endif
+
+$(BASE_OBJ_REFKERN_PATH)/$(1)/bli_cntx_$(1)_ref%.o: $(REFKERN_PATH)/bli_cntx_ref%.cpp $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+	@mkdir -p $$(dir $$@)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CXX) $(call get-refinit-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-refinit-cxxtext-for,$(1))
+	@$(CXX) $(call get-refinit-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+endif
+
+$(BASE_OBJ_REFKERN_PATH)/$(1)/bli_cntx_$(1)_ref%.o: $(REFKERN_PATH)/bli_cntx_ref%.cxx $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+	@mkdir -p $$(dir $$@)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CXX) $(call get-refinit-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-refinit-cxxtext-for,$(1))
+	@$(CXX) $(call get-refinit-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+endif
+endef
+
+# first argument: a kernel set (name) being targeted (e.g. haswell).
+define make-refkern-rule
+$(BASE_OBJ_REFKERN_PATH)/$(1)/%_$(1)_ref.o: $(REFKERN_PATH)/%_ref.c $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+	@mkdir -p $$(dir $$@)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CC) $(call get-refkern-cflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-refkern-text-for,$(1))
+	@$(CC) $(call get-refkern-cflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+endif
+
+$(BASE_OBJ_REFKERN_PATH)/$(1)/%_$(1)_ref.o: $(REFKERN_PATH)/%_ref.cpp $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+	@mkdir -p $$(dir $$@)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CXX) $(call get-refkern-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-refkern-cxxtext-for,$(1))
+	@$(CXX) $(call get-refkern-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+endif
+
+$(BASE_OBJ_REFKERN_PATH)/$(1)/%_$(1)_ref.o: $(REFKERN_PATH)/%_ref.cxx $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+	@mkdir -p $$(dir $$@)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CXX) $(call get-refkern-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-refkern-cxxtext-for,$(1))
+	@$(CXX) $(call get-refkern-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+endif
+endef
+
+# first argument: a configuration name from the union of config_list and
+# config_name, used to look up the CFLAGS to use during compilation.
+define make-frame-rule
+$(BASE_OBJ_FRAME_PATH)/%.o: $(FRAME_PATH)/%.c $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+	@mkdir -p $$(dir $$@)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CC) $(call get-frame-cflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-frame-text-for,$(1))
+	@$(CC) $(call get-frame-cflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+endif
+
+$(BASE_OBJ_FRAME_PATH)/%.o: $(FRAME_PATH)/%.cxx $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+	@mkdir -p $$(dir $$@)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CXX) $(call get-frame-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-frame-cxxtext-for,$(1))
+	@$(CXX) $(call get-frame-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+endif
+
+$(BASE_OBJ_FRAME_PATH)/%.o: $(FRAME_PATH)/%.cpp $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+	@mkdir -p $$(dir $$@)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CXX) $(call get-frame-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-frame-cxxtext-for,$(1))
+	@$(CXX) $(call get-frame-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+endif
+endef
+
+# first argument: a kernel set (name) being targeted (e.g. haswell).
+# second argument: the configuration whose CFLAGS we should use in compilation.
+# third argument: the kernel file suffix being considered.
+define make-kernels-rule
+$(BASE_OBJ_KERNELS_PATH)/$(1)/%.o: $(KERNELS_PATH)/$(1)/%.$(3) $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+	@mkdir -p $$(dir $$@)
+ifeq ($(3),$(filter cxx cpp,$(3)))
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CXX) $(call get-kernel-cxxflags-for,$(2)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-kernel-cxxtext-for,$(2))
+	@$(CXX) $(call get-kernel-cxxflags-for,$(2)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+endif
+else
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CC) $(call get-kernel-cflags-for,$(2)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+else
+	@echo "Compiling $$@" $(call get-kernel-text-for,$(2))
+	@$(CC) $(call get-kernel-cflags-for,$(2)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@
+endif
+endif
+endef
+
+# Define a function to choose the correct sub-configuration name for the
+# given kernel set. This function is called when instantiating the
+# make-kernels-rule.
+get-config-for-kset = $(lastword $(subst :, ,$(filter $(1):%,$(KCONFIG_MAP))))
+
+# Instantiate the build rule for files in the configuration directory for
+# each of the sub-configurations in CONFIG_LIST with the CFLAGS designated
+# for that sub-configuration.
+$(foreach conf, $(CONFIG_LIST), $(eval $(call make-config-rule,$(conf))))
+
+# Instantiate the build rule for reference kernel initialization and
+# reference kernels for each of the sub-configurations in CONFIG_LIST with
+# the CFLAGS designated for that sub-configuration.
+$(foreach conf, $(CONFIG_LIST), $(eval $(call make-refinit-rule,$(conf))))
+$(foreach conf, $(CONFIG_LIST), $(eval $(call make-refkern-rule,$(conf))))
+
+# Instantiate the build rule for framework files. Use the CFLAGS for the
+# configuration family, which exists in the directory whose name is equal to
+# CONFIG_NAME. Note that this doesn't need to be in a loop since we expect
+# CONFIG_NAME to only ever contain a single name. (BTW: If CONFIG_NAME refers
+# to a singleton family, then CONFIG_LIST contains CONFIG_NAME as its only
+# item.)
+$(foreach conf, $(CONFIG_NAME), $(eval $(call make-frame-rule,$(conf))))
+
+# Instantiate the build rule for optimized kernels for each of the kernel
+# sets in KERNEL_LIST with the CFLAGS designated for the sub-configuration
+# specified by the KCONFIG_MAP.
+$(foreach suf, $(KERNELS_SRC_SUFS), \
+$(foreach kset, $(KERNEL_LIST), $(eval $(call make-kernels-rule,$(kset),$(call get-config-for-kset,$(kset)),$(suf)))))
+
+
+# --- All-purpose library rule (static and shared) ---
+
+plugin: check-env $(MK_LIBS)
+
+
+# --- Static library archiver rules ---
+
+$(PLUGIN_A_PATH): $(MK_PLUGIN_OBJS)
+	@mkdir -p $(dir $@)
+ifeq ($(ENABLE_VERBOSE),yes)
+ifeq ($(ARG_MAX_HACK),yes)
+	$(file > $@.in,$^)
+	$(AR) $(ARFLAGS) $@ @$@.in
+	$(RM_F) $@.in
+	$(RANLIB) $@
+else
+	$(AR) $(ARFLAGS) $@ $?
+	$(RANLIB) $@
+endif
+else # ifeq ($(ENABLE_VERBOSE),no)
+ifeq ($(ARG_MAX_HACK),yes)
+	@echo "Archiving $@"
+	@$(file > $@.in,$^)
+	@$(AR) $(ARFLAGS) $@ @$@.in
+	@$(RM_F) $@.in
+	@$(RANLIB) $@
+else
+	@echo "Archiving $@"
+	@$(AR) $(ARFLAGS) $@ $?
+	@$(RANLIB) $@
+endif
+endif
+
+
+# --- Shared library linker rules ---
+
+$(PLUGIN_SO_PATH): $(MK_PLUGIN_OBJS)
+	@mkdir -p $(dir $@)
+ifeq ($(ENABLE_VERBOSE),yes)
+ifeq ($(ARG_MAX_HACK),yes)
+	$(file > $@.in,$^)
+	$(LINKER) $(SOFLAGS) -o $(LIBBLIS_SO_OUTPUT_NAME) @$@.in $(LDFLAGS)
+	$(RM_F) $@.in
+else
+	$(LINKER) $(SOFLAGS) -o $(LIBBLIS_SO_OUTPUT_NAME) $^ $(LDFLAGS)
+endif
+else # ifeq ($(ENABLE_VERBOSE),no)
+ifeq ($(ARG_MAX_HACK),yes)
+	@echo "Dynamically linking $@"
+	@$(file > $@.in,$^)
+	@$(LINKER) $(SOFLAGS) -o $(LIBBLIS_SO_OUTPUT_NAME) @$@.in $(LDFLAGS)
+	@$(RM_F) $@.in
+else
+	@echo "Dynamically linking $@"
+	@$(LINKER) $(SOFLAGS) -o $(LIBBLIS_SO_OUTPUT_NAME) $^ $(LDFLAGS)
+endif
+endif
+
+# --- Query current configuration ---
+
+showconfig: check-env
+	@echo "configuration family:       $(CONFIG_NAME)"
+	@echo "sub-configurations:         $(CONFIG_LIST)"
+	@echo "requisite kernels sets:     $(KERNEL_LIST)"
+	@echo "kernel-to-config map:       $(KCONFIG_MAP)"
+	@echo "-------------------------"
+	@echo "BLIS version string:        $(VERSION)"
+	@echo ".so major version:          $(SO_MAJOR)"
+	@echo ".so minor.build vers:       $(SO_MINORB)"
+	@echo "install libdir:             $(INSTALL_LIBDIR)"
+	@echo "install includedir:         $(INSTALL_INCDIR)"
+	@echo "install sharedir:           $(INSTALL_SHAREDIR)"
+	@echo "debugging status:           $(DEBUG_TYPE)"
+	@echo "enable AddressSanitizer?    $(MK_ENABLE_ASAN)"
+	@echo "enabled threading model(s): $(THREADING_MODEL)"
+	@echo "enable BLAS API?            $(MK_ENABLE_BLAS)"
+	@echo "enable CBLAS API?           $(MK_ENABLE_CBLAS)"
+	@echo "build static library?       $(MK_ENABLE_STATIC)"
+	@echo "build shared library?       $(MK_ENABLE_SHARED)"
+	@echo "ARG_MAX hack enabled?       $(ARG_MAX_HACK)"
+
+
+# --- Clean rules ---
+
+cleanmk:
+ifeq ($(IS_CONFIGURED),yes)
+ifeq ($(ENABLE_VERBOSE),yes)
+	- $(FIND) $(CONFIG_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
+	- $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
+	- $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
+else
+	@echo "Removing makefile fragments from $(CONFIG_FRAG_PATH)"
+	@- $(FIND) $(CONFIG_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
+	@echo "Removing makefile fragments from $(REFKERN_FRAG_PATH)"
+	@- $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
+	@echo "Removing makefile fragments from $(KERNELS_FRAG_PATH)"
+	@- $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
+endif
+endif
+
+cleanlib:
+ifeq ($(IS_CONFIGURED),yes)
+ifeq ($(ENABLE_VERBOSE),yes)
+	- $(FIND) $(BASE_OBJ_PATH) -name "*.o" | $(XARGS) $(RM_F)
+	- $(RM_F) $(LIBBLIS_A_PATH)
+	- $(RM_F) $(LIBBLIS_SO_PATH)
+else
+	@echo "Removing object files from $(BASE_OBJ_PATH)"
+	@- $(FIND) $(BASE_OBJ_PATH) -name "*.o" | $(XARGS) $(RM_F)
+	@echo "Removing libraries from $(BASE_LIB_PATH)"
+	@- $(RM_F) $(LIBBLIS_A_PATH)
+	@- $(RM_F) $(LIBBLIS_SO_PATH)
+endif
+endif
+
+distclean: cleanmk cleanlib
+ifeq ($(IS_CONFIGURED),yes)
+ifeq ($(ENABLE_VERBOSE),yes)
+	- $(RM_F) $(CONFIG_MK_FILE)
+	- $(RM_RF) $(OBJ_DIR)
+	- $(RM_RF) $(LIB_DIR)
+else
+	@echo "Removing $(CONFIG_MK_FILE)"
+	@- $(RM_F) $(CONFIG_MK_FILE)
+	@echo "Removing $(OBJ_DIR)"
+	@- $(RM_RF) $(OBJ_DIR)
+	@echo "Removing $(LIB_DIR)"
+	@- $(RM_RF) $(LIB_DIR)
+endif
+endif
+
diff --git a/frame/3/bli_l3_direct.h b/build/plugin/bli_kernel_defs_zen3.h
similarity index 75%
rename from frame/3/bli_l3_direct.h
rename to build/plugin/bli_kernel_defs_zen3.h
index 8f624098e..adda5af9a 100644
--- a/frame/3/bli_l3_direct.h
+++ b/build/plugin/bli_kernel_defs_zen3.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Southern Methodist University
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,28 +32,18 @@
 
 */
 
-dir_t bli_l3_direct
-     (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntl_t* cntl
-     );
-
-// -----------------------------------------------------------------------------
-
-#undef  GENPROT
-#define GENPROT( opname ) \
-\
-dir_t PASTEMAC0(opname) \
-      ( \
-         const obj_t* a, \
-         const obj_t* b, \
-         const obj_t* c  \
-      );
-
-GENPROT( gemm_direct )
-GENPROT( gemmt_direct )
-GENPROT( trmm_direct )
-GENPROT( trsm_direct )
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+// ---------------------------------------------------------------------------->
+// -- Example macros to be used in reference kernels compiled for zen3 -------->
+// ---------------------------------------------------------------------------->
+
+#define MY_KERNEL_2_ROW_MAJOR 1
+
+// <----------------------------------------------------------------------------
+// <----------------------------------------------------------------------------
+// <----------------------------------------------------------------------------
+
+//#endif
 
diff --git a/build/plugin/bli_plugin.h.in b/build/plugin/bli_plugin.h.in
new file mode 100644
index 000000000..b2cff2293
--- /dev/null
+++ b/build/plugin/bli_plugin.h.in
@@ -0,0 +1,146 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Southern Methodist University
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+//
+// Parameters passed to the plugin registration and initialization
+// functions.
+//
+
+#define plugin_@plugin_name@_params \
+\
+       siz_t* bszids, /* <----- Example arguments       */ \
+       siz_t* kerids, /* <----- for plugin registration */ \
+       siz_t* prefids /* <----- and initialization.     */
+
+#define plugin_@plugin_name@_params_only \
+\
+       bszids, /* <----- We also sometimes need   */ \
+       kerids, /* <----- the names of the         */ \
+       prefids /* <----- arguments without types. */
+
+// ---------------------------------------------------------------------------->
+// -- Example blocksize, micro-kernel, and preference enumerations. ----------->
+// ---------------------------------------------------------------------------->
+
+enum
+{
+	MY_BLKSZ_1,
+	MY_BLKSZ_2,
+
+	MY_NUM_BLOCK_SIZES
+};
+
+enum
+{
+	MY_KERNEL_1,
+	MY_KERNEL_2,
+
+	MY_NUM_KERNELS
+};
+
+enum
+{
+	MY_PREF_1,
+	MY_PREF_2,
+
+	MY_NUM_KERNEL_PREFS
+};
+
+// <----------------------------------------------------------------------------
+// <----------------------------------------------------------------------------
+// <----------------------------------------------------------------------------
+
+// ---------------------------------------------------------------------------->
+// -- Example prototypes for kernel functions. -------------------------------->
+// ---------------------------------------------------------------------------->
+
+// Reference kernels for all data types
+#undef GENTPROT
+#define GENTPROT( ctype, ch, config_infix ) \
+\
+void PASTEMAC(ch,my_kernel_1,config_infix,BLIS_REF_SUFFIX) \
+     ( \
+             int    n, \
+       const ctype* a, \
+             ctype* x  \
+     );
+
+// Reference kernels for only complex data types
+#undef GENTPROTCO
+#define GENTPROTCO( ctype, ctyper, ch, chr, config_infix ) \
+\
+void PASTEMAC(ch,my_kernel_2,config_infix,BLIS_REF_SUFFIX) \
+     ( \
+       int    m, \
+       int    n, \
+       ctype* a  \
+     );
+
+// Optimized kernels
+void bli_dmy_kernel_1_zen3
+     (
+             int    n,
+       const double* a,
+             double* x
+     );
+
+// Generate reference kernel prototypes for each configuration AND data type
+#undef GENTCONF
+#define GENTCONF( CONFIG, config ) \
+\
+INSERT_GENTPROT_BASIC( PASTECH(_,config) ) \
+INSERT_GENTPROTCO_BASIC( PASTECH(_,config) )
+
+INSERT_GENTCONF
+
+// <----------------------------------------------------------------------------
+// <----------------------------------------------------------------------------
+// <----------------------------------------------------------------------------
+
+//
+// Registration and initialization function prototypes.
+//
+
+#undef GENTCONF
+#define GENTCONF( CONFIG, config ) \
+\
+void PASTEMAC(plugin_init,BLIS_PNAME_INFIX,_,config)( PASTECH(plugin,BLIS_PNAME_INFIX,_params) ); \
+void PASTEMAC(plugin_init,BLIS_PNAME_INFIX,_,config,BLIS_REF_SUFFIX)( PASTECH(plugin,BLIS_PNAME_INFIX,_params) );
+
+INSERT_GENTCONF
+
+BLIS_EXPORT_BLIS err_t PASTEMAC(plugin_register,BLIS_PNAME_INFIX)( PASTECH(plugin,BLIS_PNAME_INFIX,_params) );
+
diff --git a/build/plugin/bli_plugin_init_ref.c b/build/plugin/bli_plugin_init_ref.c
new file mode 100644
index 000000000..649eb311a
--- /dev/null
+++ b/build/plugin/bli_plugin_init_ref.c
@@ -0,0 +1,108 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Southern Methodist University
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include @PLUGIN_HEADER@
+
+// -- Macros to help concisely instantiate bli_func_init() ---------------------
+
+#define gen_func_init_ro( func_p, opname ) \
+do { \
+	bli_func_init( func_p, PASTEMAC(s,opname), PASTEMAC(d,opname), \
+	                       NULL,               NULL ); \
+} while (0)
+
+#define gen_func_init_co( func_p, opname ) \
+do { \
+	bli_func_init( func_p, NULL,               NULL, \
+	                       PASTEMAC(c,opname), PASTEMAC(z,opname) ); \
+} while (0)
+
+#define gen_func_init( func_p, opname ) \
+do { \
+	bli_func_init( func_p, PASTEMAC(s,opname), PASTEMAC(d,opname), \
+	                       PASTEMAC(c,opname), PASTEMAC(z,opname) ); \
+} while (0)
+
+// -----------------------------------------------------------------------------
+
+void PASTEMAC(plugin_init,BLIS_PNAME_INFIX,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX)
+     (
+       PASTECH(plugin,BLIS_PNAME_INFIX,_params)
+     )
+{
+	cntx_t* cntx = ( cntx_t* )bli_gks_lookup_id( PASTECH(BLIS_ARCH,BLIS_CNAME_UPPER_INFIX) );
+	( void )cntx;
+
+    // ------------------------------------------------------------------------>
+	// -- Example Initialization ---------------------------------------------->
+	// ------------------------------------------------------------------------>
+
+	blksz_t blkszs[ MY_NUM_BLOCK_SIZES ];
+	siz_t   bmults[ MY_NUM_BLOCK_SIZES ];
+	func_t  funcs[ MY_NUM_KERNELS ];
+	mbool_t mbools[ MY_NUM_KERNEL_PREFS ];
+
+	// -- Set blocksizes -------------------------------------------------------
+    //                                             s     d     c     z
+	bli_blksz_init_easy( &blkszs[ MY_BLKSZ_1 ],  256,  128,  128,   64 );
+	bli_blksz_init_easy( &blkszs[ MY_BLKSZ_2 ],  256,  256,  256,  256 );
+	bmults[ MY_BLKSZ_1 ] = bszids[ MY_BLKSZ_1 ];
+	bmults[ MY_BLKSZ_2 ] = bszids[ MY_BLKSZ_2 ];
+
+	// -- Set micro-kernels ----------------------------------------------------
+
+	gen_func_init   ( &funcs[ MY_KERNEL_1 ], PASTECH(my_kernel_1,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX) );
+	gen_func_init_co( &funcs[ MY_KERNEL_2 ], PASTECH(my_kernel_2,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX) );
+
+	// -- Set preferences ------------------------------------------------------
+	//                                        s      d      c      z
+	bli_mbool_init( &mbools[ MY_PREF_1 ],  TRUE,  TRUE,  TRUE,  TRUE );
+	bli_mbool_init( &mbools[ MY_PREF_2 ], FALSE, FALSE, FALSE, FALSE );
+
+	// -- Put block sizes, kernels, and preferences into the context -----------
+
+	for ( dim_t i = 0; i < MY_NUM_BLOCK_SIZES; i++ )
+		bli_cntx_set_blksz( bszids[ i ], &blkszs[ i ], bmults[ i ], cntx );
+
+	for ( dim_t i = 0; i < MY_NUM_KERNELS; i++ )
+		bli_cntx_set_ukr( kerids[ i ], &funcs[ i ], cntx );
+
+	for ( dim_t i = 0; i < MY_NUM_KERNEL_PREFS; i++ )
+		bli_cntx_set_ukr_pref( prefids[ i ], &mbools[ i ], cntx );
+
+	// <------------------------------------------------------------------------
+	// <------------------------------------------------------------------------
+	// <------------------------------------------------------------------------
+}
+
diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2b.c b/build/plugin/bli_plugin_init_zen3.c
similarity index 52%
rename from frame/3/trmm/bli_trmm_xx_ker_var2b.c
rename to build/plugin/bli_plugin_init_zen3.c
index 57894165c..43d2e86a1 100644
--- a/frame/3/trmm/bli_trmm_xx_ker_var2b.c
+++ b/build/plugin/bli_plugin_init_zen3.c
@@ -4,8 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2023, Southern Methodist University
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -33,55 +32,60 @@
 
 */
 
-#include "blis.h"
+#include @PLUGIN_HEADER@
 
-static l3_var_oft vars[2][2] =
-{
-	{ bli_trmm_ll_ker_var2b, bli_trmm_lu_ker_var2b },
-	{ bli_trmm_rl_ker_var2b, bli_trmm_ru_ker_var2b }
-};
-
-void bli_trmm_xx_ker_var2b
+void PASTEMAC(plugin_init,BLIS_PNAME_INFIX,BLIS_CNAME_INFIX)
      (
-       const obj_t*     a,
-       const obj_t*     b,
-       const obj_t*     c,
-       const cntx_t*    cntx,
-       const cntl_t*    cntl,
-             thrinfo_t* thread_par
+       PASTECH(plugin,BLIS_PNAME_INFIX,_params)
      )
 {
-	dim_t side;
-	dim_t uplo;
-
-	// Set two bools: one based on the implied side parameter (the structure
-	// of the root object) and one based on the uplo field of the triangular
-	// matrix's root object (whether that is matrix A or matrix B).
-	if ( bli_obj_root_is_triangular( a ) )
-	{
-		side = 0;
-		if ( bli_obj_root_is_lower( a ) ) uplo = 0;
-		else                              uplo = 1;
-	}
-	else // if ( bli_obj_root_is_triangular( b ) )
-	{
-		side = 1;
-		if ( bli_obj_root_is_lower( b ) ) uplo = 0;
-		else                              uplo = 1;
-	}
-
-	// Index into the variant array to extract the correct function pointer.
-	l3_var_oft f = vars[side][uplo];
-
-	// Call the macrokernel.
-	f
+	cntx_t* cntx = ( cntx_t* )bli_gks_lookup_id( PASTECH(BLIS_ARCH,BLIS_CNAME_UPPER_INFIX) );
+	( void )cntx;
+
+    // ------------------------------------------------------------------------>
+	// -- Example Initialization ---------------------------------------------->
+	// ------------------------------------------------------------------------>
+
+	// Update the context with optimized native micro-kernels.
+	bli_cntx_set_ukrs
 	(
-	  a,
-	  b,
-	  c,
 	  cntx,
-	  cntl,
-	  thread_par
+
+	  kerids[ MY_KERNEL_1 ], BLIS_DOUBLE, bli_dmy_kernel_1_zen3,
+
+	  BLIS_VA_END
 	);
+
+	// Update the context with preferences.
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  prefids[ MY_PREF_1 ], BLIS_DOUBLE, TRUE,
+	  prefids[ MY_PREF_2 ], BLIS_DOUBLE, TRUE,
+
+	  BLIS_VA_END
+	);
+
+	blksz_t blkszs[ MY_NUM_BLOCK_SIZES ];
+	bszid_t bmults[ MY_NUM_BLOCK_SIZES ];
+
+	// Update block sizes
+	//                                             s     d     c     z
+	bli_blksz_init_easy( &blkszs[ MY_BLKSZ_1 ],  320,  240,  182,   96 );
+	bmults[ MY_BLKSZ_1 ] = bszids[ MY_BLKSZ_1 ];
+
+	bli_cntx_set_blkszs
+	(
+	  cntx,
+
+	  bszids[ MY_BLKSZ_1 ], &blkszs[ MY_BLKSZ_1 ], bmults[ MY_BLKSZ_1 ],
+
+	  BLIS_VA_END
+	);
+
+	// <------------------------------------------------------------------------
+	// <------------------------------------------------------------------------
+	// <------------------------------------------------------------------------
 }
 
diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/build/plugin/bli_plugin_register.c
similarity index 55%
rename from frame/3/trsm/bli_trsm_xx_ker_var2.c
rename to build/plugin/bli_plugin_register.c
index dfeefcd9d..4512b554e 100644
--- a/frame/3/trsm/bli_trsm_xx_ker_var2.c
+++ b/build/plugin/bli_plugin_register.c
@@ -4,8 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2023, Southern Methodist University
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -33,55 +32,50 @@
 
 */
 
-#include "blis.h"
+#include @PLUGIN_HEADER@
 
-static l3_var_oft vars[2][2] =
-{
-	{ bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 },
-	{ bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 }
-};
-
-void bli_trsm_xx_ker_var2
+err_t PASTEMAC(plugin_register,BLIS_PNAME_INFIX)
      (
-       const obj_t*     a,
-       const obj_t*     b,
-       const obj_t*     c,
-       const cntx_t*    cntx,
-       const cntl_t*    cntl,
-             thrinfo_t* thread_par
+       PASTECH(plugin,BLIS_PNAME_INFIX,_params)
      )
 {
-	dim_t side;
-	dim_t uplo;
-
-	// Set two bools: one based on the implied side parameter (the structure
-	// of the root object) and one based on the uplo field of the triangular
-	// matrix's root object (whether that is matrix A or matrix B).
-	if ( bli_obj_root_is_triangular( a ) )
-	{
-		side = 0;
-		if ( bli_obj_root_is_lower( a ) ) uplo = 0;
-		else                              uplo = 1;
-	}
-	else // if ( bli_obj_root_is_triangular( b ) )
-	{
-		side = 1;
-		if ( bli_obj_root_is_lower( b ) ) uplo = 0;
-		else                              uplo = 1;
-	}
-
-	// Index into the variant array to extract the correct function pointer.
-	l3_var_oft f = vars[side][uplo];
-
-	// Call the macrokernel.
-	f
-	(
-	  a,
-	  b,
-	  c,
-	  cntx,
-	  cntl,
-	  thread_par
+	// ------------------------------------------------------------------------>
+	// -- Example Plugin Registration  ---------------------------------------->
+	// ------------------------------------------------------------------------>
+
+	//
+	// Register slots for new microkernels, preferences, and block sizes.
+	//
+
+	err_t err;
+	// Register each slot once; stop at the first failure so its code is returned.
+	err = bli_gks_register_blksz( &bszids[ MY_BLKSZ_1 ] );
+	if ( err == BLIS_SUCCESS ) err = bli_gks_register_blksz( &bszids[ MY_BLKSZ_2 ] );
+	if ( err == BLIS_SUCCESS ) err = bli_gks_register_ukr( &kerids[ MY_KERNEL_1 ] );
+	if ( err == BLIS_SUCCESS ) err = bli_gks_register_ukr( &kerids[ MY_KERNEL_2 ] );
+	if ( err == BLIS_SUCCESS ) err = bli_gks_register_ukr_pref( &prefids[ MY_PREF_1 ] );
+	if ( err == BLIS_SUCCESS ) err = bli_gks_register_ukr_pref( &prefids[ MY_PREF_2 ] );
+
+	if ( err != BLIS_SUCCESS )
+		return err;
+
+	// <------------------------------------------------------------------------
+	// <------------------------------------------------------------------------
+	// <------------------------------------------------------------------------
+
+	//
+	// Initialize the context for each enabled sub-configuration.
+	//
+
+	#undef GENTCONF
+	#define GENTCONF( CONFIG, config ) \
+	PASTEMAC(plugin_init,BLIS_PNAME_INFIX,_,config,BLIS_REF_SUFFIX) \
+	( \
+	  PASTECH(plugin,BLIS_PNAME_INFIX,_params_only) \
 	);
+
+	INSERT_GENTCONF
+
+	return BLIS_SUCCESS;
 }
 
diff --git a/build/plugin/config.mk.in b/build/plugin/config.mk.in
new file mode 100644
index 000000000..0d5989cbf
--- /dev/null
+++ b/build/plugin/config.mk.in
@@ -0,0 +1,145 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#  Copyright (C) 2022, Advanced Micro Devices, Inc.
+#  Copyright (C) 2023, Southern Methodist University
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+ifndef CONFIG_MK_PLUGIN_INCLUDED
+CONFIG_MK_PLUGIN_INCLUDED := yes
+
+# The installation prefix, exec_prefix, libdir, includedir, and sharedir
+# values from configure tell us where to install the libraries, header files,
+# and public makefile fragments. We must first assign each substituted
+# @anchor@ to its own variable. Why? Because the substitutions may contain
+# unevaluated variable expressions. For example, '@libdir@' may be replaced
+# with '${exec_prefix}/lib'. By assigning the anchors to variables first, and
+# then assigning them to their final INSTALL_* variables, we allow prefix and
+# exec_prefix to be used in the definitions of exec_prefix, libdir,
+# includedir, and sharedir.
+prefix            := @prefix@
+exec_prefix       := @exec_prefix@
+libdir            := @libdir@
+includedir        := @includedir@
+sharedir          := @sharedir@
+
+# Override SHARE_PATH from common.mk so that e.g. make_defs.mk files from
+# configurations are loaded from the installed share directory.
+SHARE_PATH        := @sharedir@/blis
+
+# Override the source path locations to point to the plugin source, rather
+# than the default values which assume the BLIS builtin source tree.
+FRAME_DIR         := .
+DIST_PATH         := @plugin_dir@
+
+# Define the name of the global config.mk makefile.
+GLOB_CONFIG_MK_FILE := $(sharedir)/blis/config.mk
+
+# Include the configuration file.
+include $(GLOB_CONFIG_MK_FILE)
+
+# The name of the plugin.
+PLUGIN_NAME       := @plugin_name@
+
+# This list contains some number of "kernel:config" pairs, where "config"
+# specifies which configuration's compilation flags (CFLAGS) should be
+# used to compile the source code for the kernel set named "kernel".
+KCONFIG_MAP       := @kconfig_map@
+
+# The C compiler.
+CC_VENDOR         := @CC_VENDOR@
+CC                := @CC@
+
+# Important C compiler ranges.
+GCC_OT_4_9_0      := @gcc_older_than_4_9_0@
+GCC_OT_6_1_0      := @gcc_older_than_6_1_0@
+GCC_OT_9_1_0      := @gcc_older_than_9_1_0@
+GCC_OT_10_3_0     := @gcc_older_than_10_3_0@
+CLANG_OT_9_0_0    := @clang_older_than_9_0_0@
+CLANG_OT_12_0_0   := @clang_older_than_12_0_0@
+AOCC_OT_2_0_0     := @aocc_older_than_2_0_0@
+AOCC_OT_3_0_0     := @aocc_older_than_3_0_0@
+
+# The C++ compiler.
+CXX               := @CXX@
+
+# The Fortran compiler.
+FC                := @FC@
+
+# Static library indexer.
+RANLIB            := @RANLIB@
+
+# Archiver.
+AR                := @AR@
+
+# Preset (required) CFLAGS, CXXFLAGS, and LDFLAGS. These variables capture the value
+# of the CFLAGS, CXXFLAGS, and LDFLAGS environment variables at configure-time (and/or
+# the value of CFLAGS/CXXFLAGS/LDFLAGS if any was specified on the command line).
+# These flags are used in addition to the flags automatically determined
+# by the build system.
+CFLAGS_PRESET     := @cflags_preset@
+CXXFLAGS_PRESET   := @cxxflags_preset@
+LDFLAGS_PRESET    := @ldflags_preset@
+
+# The level of debugging info to generate.
+DEBUG_TYPE        := @debug_type@
+ENABLE_DEBUG      := @enable_debug@
+
+# Whether to compile and link the AddressSanitizer library.
+MK_ENABLE_ASAN    := @enable_asan@
+
+# Whether the compiler supports "#pragma omp simd" via the -fopenmp-simd option.
+PRAGMA_OMP_SIMD   := @pragma_omp_simd@
+
+# Whether to output verbose command-line feedback as the Makefile is
+# processed.
+ENABLE_VERBOSE    := @enable_verbose@
+
+# Whether we need to employ an alternate method for passing object files to
+# ar and/or the linker to work around a small value of ARG_MAX.
+ARG_MAX_HACK      := @enable_arg_max_hack@
+
+# Whether to build the static and shared libraries.
+# NOTE: The "MK_" prefix helps differentiate these variables from their
+# corresponding cpp macros that use the BLIS_ prefix.
+MK_ENABLE_STATIC  := @mk_enable_static@
+MK_ENABLE_SHARED  := @mk_enable_shared@
+
+# Whether to use an install_name based on @rpath.
+MK_ENABLE_RPATH   := @enable_rpath@
+
+# Whether to export all symbols within the shared library, even those symbols
+# that are considered to be for internal use only.
+EXPORT_SHARED     := @export_shared@
+
+# end of ifndef CONFIG_MK_PLUGIN_INCLUDED conditional block
+endif
diff --git a/frame/3/hemm/bli_hemm.h b/build/plugin/my_kernel_1_ref.c
similarity index 77%
rename from frame/3/hemm/bli_hemm.h
rename to build/plugin/my_kernel_1_ref.c
index e73186736..dc3433814 100644
--- a/frame/3/hemm/bli_hemm.h
+++ b/build/plugin/my_kernel_1_ref.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Southern Methodist University
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,5 +32,25 @@
 
 */
 
-#include "bli_hemm_front.h"
+#include @PLUGIN_HEADER@
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname, arch, suf ) \
+\
+void PASTEMAC(ch,opname,arch,suf) \
+     ( \
+             int    n, \
+       const ctype* a, \
+             ctype* x  \
+     ) \
+{ \
+	if ( bli_zero_dim1( n ) ) return; \
+\
+	for ( dim_t i = 0; i < n; ++i ) \
+	{ \
+		PASTEMAC(ch,copys)( *a, x[ i ] ); \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC( my_kernel_1, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/frame/3/hemm/bli_hemm_front.h b/build/plugin/my_kernel_1_zen3.c
similarity index 78%
rename from frame/3/hemm/bli_hemm_front.h
rename to build/plugin/my_kernel_1_zen3.c
index 2ccd8e0c8..00c816338 100644
--- a/frame/3/hemm/bli_hemm_front.h
+++ b/build/plugin/my_kernel_1_zen3.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Southern Methodist University
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,14 +32,32 @@
 
 */
 
-void bli_hemm_front
+#include @PLUGIN_HEADER@
+
+void bli_dmy_kernel_1_zen3
      (
-             side_t  side,
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm
-     );
+             int     n,
+       const double* a,
+             double* x
+     )
+{
+	if ( bli_zero_dim1( n ) ) return;
+
+	double a_local = *a;
+	dim_t i = 0;
+
+	while ( i <= n-4 )
+	{
+		x[ i+0 ] = a_local;
+		x[ i+1 ] = a_local;
+		x[ i+2 ] = a_local;
+		x[ i+3 ] = a_local;
+		i += 4;
+	}
+
+	while ( i < n )
+	{
+		x[ i ] = a_local;
+		i++;
+	}
+}
diff --git a/sandbox/gemmlike/bls_packm_cxk.h b/build/plugin/my_kernel_2_ref.c
similarity index 66%
rename from sandbox/gemmlike/bls_packm_cxk.h
rename to build/plugin/my_kernel_2_ref.c
index 56e6e5d7a..27aa1e96b 100644
--- a/sandbox/gemmlike/bls_packm_cxk.h
+++ b/build/plugin/my_kernel_2_ref.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Southern Methodist University
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,27 +32,45 @@
 
 */
 
+#include @PLUGIN_HEADER@
 
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
+#ifndef MY_KERNEL_2_ROW_MAJOR
+#define MY_KERNEL_2_ROW_MAJOR 0
+#endif
+
+#undef  GENTFUNCCO
+#define GENTFUNCCO( ctype, ch, opname, arch, suf ) \
 \
-void PASTECH2(bls_,ch,varname) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
-       conj_t  conja, \
-       pack_t  schema, \
-       dim_t   panel_dim, \
-       dim_t   panel_dim_max, \
-       dim_t   panel_len, \
-       dim_t   panel_len_max, \
-       ctype*  kappa, \
-       ctype*  a, inc_t inca, inc_t lda, \
-       ctype*  p,             inc_t ldp, \
-       cntx_t* cntx  \
-     );
-
-//INSERT_GENTPROT_BASIC( packm_cxk )
-GENTPROT( float,    s, packm_cxk )
-GENTPROT( double,   d, packm_cxk )
-GENTPROT( scomplex, c, packm_cxk )
-GENTPROT( dcomplex, z, packm_cxk )
+       int    m, \
+       int    n, \
+       ctype* a  \
+     ) \
+{ \
+	if ( bli_zero_dim1( m ) || bli_zero_dim1( n ) ) return; \
+\
+	if ( MY_KERNEL_2_ROW_MAJOR ) \
+	{ /* Row-major (element (i,j) at a[ i*n + j ]): keep unit-stride j innermost. */ \
+		for ( dim_t i = 0; i < m; ++i ) \
+		{ \
+			for ( dim_t j = 0; j < n; ++j ) \
+			{ \
+				PASTEMAC(ch,seti0s)( a[ i*n + j ] ); \
+			} \
+		} \
+	} \
+	else \
+	{ /* Column-major (element (i,j) at a[ i + j*m ]): keep unit-stride i innermost. */ \
+		for ( dim_t j = 0; j < n; ++j ) \
+		{ \
+			for ( dim_t i = 0; i < m; ++i ) \
+			{ \
+				PASTEMAC(ch,seti0s)( a[ i + j*m ] ); \
+			} \
+		} \
+	} \
+}
+
+INSERT_GENTFUNCCO_BASIC( my_kernel_2, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/common.mk b/common.mk
index c43578f19..5da49eef2 100644
--- a/common.mk
+++ b/common.mk
@@ -104,7 +104,7 @@ get-noopt-cflags-for     = $(strip $(CFLAGS_PRESET) \
                                    $(CINCFLAGS) \
                             )
 
-get-noopt-cxxflags-for   = $(strip $(CFLAGS_PRESET) \
+get-noopt-cxxflags-for   = $(strip $(CXXFLAGS_PRESET) \
                                    $(call load-var-for,CDBGFLAGS,$(1)) \
                                    $(call load-var-for,CWARNFLAGS,$(1)) \
                                    $(call load-var-for,CPICFLAGS,$(1)) \
@@ -119,6 +119,18 @@ get-noopt-cxxflags-for   = $(strip $(CFLAGS_PRESET) \
 get-refinit-cflags-for   = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
                                    $(call get-noopt-cflags-for,$(1)) \
                                    -DBLIS_CNAME=$(1) \
+                                   -DBLIS_CNAME_UPPER=$(shell echo $(1) | tr a-z A-Z) \
+                                   $(BUILD_ASANFLAGS) \
+                                   $(BUILD_CPPFLAGS) \
+                                   $(BUILD_SYMFLAGS) \
+                                   -DBLIS_IN_REF_KERNEL=1 \
+                                   -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \
+                            )
+
+get-refinit-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
+                                   $(call get-noopt-cxxflags-for,$(1)) \
+                                   -DBLIS_CNAME=$(1) \
+                                   -DBLIS_CNAME_UPPER=$(shell echo $(1) | tr a-z A-Z) \
                                    $(BUILD_ASANFLAGS) \
                                    $(BUILD_CPPFLAGS) \
                                    $(BUILD_SYMFLAGS) \
@@ -131,6 +143,20 @@ get-refkern-cflags-for   = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
                                    $(call get-noopt-cflags-for,$(1)) \
                                    $(COMPSIMDFLAGS) \
                                    -DBLIS_CNAME=$(1) \
+                                   -DBLIS_CNAME_UPPER=$(shell echo $(1) | tr a-z A-Z) \
+                                   $(BUILD_ASANFLAGS) \
+                                   $(BUILD_CPPFLAGS) \
+                                   $(BUILD_SYMFLAGS) \
+                                   -DBLIS_IN_REF_KERNEL=1 \
+                                   -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \
+                            )
+
+get-refkern-cxxflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
+                                   $(call load-var-for,CRVECFLAGS,$(1)) \
+                                   $(call get-noopt-cxxflags-for,$(1)) \
+                                   $(COMPSIMDFLAGS) \
+                                   -DBLIS_CNAME=$(1) \
+                                   -DBLIS_CNAME_UPPER=$(shell echo $(1) | tr a-z A-Z) \
                                    $(BUILD_ASANFLAGS) \
                                    $(BUILD_CPPFLAGS) \
                                    $(BUILD_SYMFLAGS) \
@@ -140,6 +166,17 @@ get-refkern-cflags-for   = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
 
 get-config-cflags-for    = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
                                    $(call get-noopt-cflags-for,$(1)) \
+                                   -DBLIS_CNAME=$(1) \
+                                   -DBLIS_CNAME_UPPER=$(shell echo $(1) | tr a-z A-Z) \
+                                   $(BUILD_ASANFLAGS) \
+                                   $(BUILD_CPPFLAGS) \
+                                   $(BUILD_SYMFLAGS) \
+                            )
+
+get-config-cxxflags-for  = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
+                                   $(call get-noopt-cxxflags-for,$(1)) \
+                                   -DBLIS_CNAME=$(1) \
+                                   -DBLIS_CNAME_UPPER=$(shell echo $(1) | tr a-z A-Z) \
                                    $(BUILD_ASANFLAGS) \
                                    $(BUILD_CPPFLAGS) \
                                    $(BUILD_SYMFLAGS) \
@@ -162,6 +199,17 @@ get-frame-cxxflags-for   = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
 get-kernel-cflags-for    = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
                                    $(call load-var-for,CKVECFLAGS,$(1)) \
                                    $(call get-noopt-cflags-for,$(1)) \
+                                   -DBLIS_CNAME=$(1) \
+                                   -DBLIS_CNAME_UPPER=$(shell echo $(1) | tr a-z A-Z) \
+                                   $(BUILD_CPPFLAGS) \
+                                   $(BUILD_SYMFLAGS) \
+                            )
+
+get-kernel-cxxflags-for  = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
+                                   $(call load-var-for,CKVECFLAGS,$(1)) \
+                                   $(call get-noopt-cxxflags-for,$(1)) \
+                                   -DBLIS_CNAME=$(1) \
+                                   -DBLIS_CNAME_UPPER=$(shell echo $(1) | tr a-z A-Z) \
                                    $(BUILD_CPPFLAGS) \
                                    $(BUILD_SYMFLAGS) \
                             )
@@ -229,11 +277,15 @@ get-user-cflags-for      = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
 # of compilation output.
 get-noopt-text            = "(CFLAGS for no optimization)"
 get-refinit-text-for      = "('$(1)' CFLAGS for ref. kernel init)"
+get-refinit-cxxtext-for   = "('$(1)' CXXFLAGS for ref. kernel init)"
 get-refkern-text-for      = "('$(1)' CFLAGS for ref. kernels)"
+get-refkern-cxxtext-for   = "('$(1)' CXXFLAGS for ref. kernels)"
 get-config-text-for       = "('$(1)' CFLAGS for config code)"
+get-config-cxxtext-for    = "('$(1)' CXXFLAGS for config code)"
 get-frame-text-for        = "('$(1)' CFLAGS for framework code)"
 get-frame-cxxtext-for     = "('$(1)' CXXFLAGS for framework code)"
 get-kernel-text-for       = "('$(1)' CFLAGS for kernels)"
+get-kernel-cxxtext-for    = "('$(1)' CXXFLAGS for kernels)"
 get-addon-c99text-for     = "('$(1)' CFLAGS for addons)"
 get-addon-cxxtext-for     = "('$(1)' CXXFLAGS for addons)"
 get-addon-kernel-text-for = "('$(1)' CFLAGS for addon kernels)"
@@ -337,7 +389,9 @@ FRAGMENT_MK        := .fragment.mk
 # Locations of important files.
 BUILD_DIR          := build
 CONFIG_DIR         := config
+ifeq ($(FRAME_DIR),)
 FRAME_DIR          := frame
+endif
 REFKERN_DIR        := ref_kernels
 KERNELS_DIR        := kernels
 ADDON_DIR          := addon
@@ -737,6 +791,18 @@ ifeq ($(CC_VENDOR),clang)
 CWARNFLAGS += -Wno-tautological-compare -Wno-pass-failed
 endif
 
+# Disable other annoying warnings.
+ifeq ($(CC_VENDOR),clang)
+CWARNFLAGS +=
+else
+ifeq ($(CC_VENDOR),gcc)
+# The '-Wno-maybe-uninitialized' option makes me nervous. Let's temporarily
+# disable for now. -FGVZ
+#CWARNFLAGS += -Wno-maybe-uninitialized -Wno-comment
+CWARNFLAGS += -Wno-comment
+endif
+endif
+
 $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CWARNFLAGS,$(c))))
 
 # --- Position-independent code flags (shared libraries only) ---
@@ -838,11 +904,16 @@ CLANGFLAGS := -std=c99
 $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CLANGFLAGS,$(c))))
 
 # Enable C++11, or C++17 if HPX threading is enabled.
+# If building a plugin, do not set any default C++ standard.
+ifeq ($(PLUGIN_NAME),)
 ifneq ($(findstring hpx,$(THREADING_MODEL)),)
 CXXLANGFLAGS := -std=c++17
 else
 CXXLANGFLAGS := -std=c++11
 endif
+else
+CXXLANGFLAGS :=
+endif
 $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CXXLANGFLAGS,$(c))))
 
 # --- C Preprocessor flags ---
@@ -1281,6 +1352,12 @@ BLIS_CONFIG_H   := ./bli_config.h
 BUILD_CPPFLAGS := -DBLIS_IS_BUILDING_LIBRARY
 
 
+#
+# --- configure file location --------------------------------------------------
+#
+
+CONFIGURE_FILE := $(DIST_PATH)/configure
+
 
 # end of ifndef COMMON_MK_INCLUDED conditional block
 endif
diff --git a/config/a64fx/bli_cntx_init_a64fx.c b/config/a64fx/bli_cntx_init_a64fx.c
index dd920bcec..81b4eb8f9 100644
--- a/config/a64fx/bli_cntx_init_a64fx.c
+++ b/config/a64fx/bli_cntx_init_a64fx.c
@@ -56,8 +56,7 @@ void bli_cntx_init_a64fx( cntx_t* cntx )
 	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed,
 
 	  // packm
-	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
+	  BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16x10,
 
 	  BLIS_VA_END
 	);
diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c
index 6339ba381..179a886ab 100644
--- a/config/armsve/bli_cntx_init_armsve.c
+++ b/config/armsve/bli_cntx_init_armsve.c
@@ -96,8 +96,7 @@ void bli_cntx_init_armsve( cntx_t* cntx )
 	  bli_cntx_set_ukrs
 	  (
 		cntx,
-		BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
-		BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
+		BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16x10,
 		BLIS_VA_END
 	  );
 	}
@@ -106,7 +105,7 @@ void bli_cntx_init_armsve( cntx_t* cntx )
 	  bli_cntx_set_ukrs
 	  (
 		cntx,
-		BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk,
+		BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8x10,
 		BLIS_VA_END
 	  );
 	}
diff --git a/config/firestorm/bli_cntx_init_firestorm.c b/config/firestorm/bli_cntx_init_firestorm.c
index bfc7f24b9..f1e59e27e 100644
--- a/config/firestorm/bli_cntx_init_firestorm.c
+++ b/config/firestorm/bli_cntx_init_firestorm.c
@@ -53,10 +53,8 @@ void bli_cntx_init_firestorm( cntx_t* cntx )
 	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_8x6r,
 
 	  // packm
-	  BLIS_PACKM_MRXK_KER, BLIS_FLOAT,  bli_spackm_armv8a_int_12xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_FLOAT,  bli_spackm_armv8a_int_8xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk,
+	  BLIS_PACKM_KER, BLIS_FLOAT,  bli_spackm_armv8a_int_8x12,
+	  BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6x8,
 
 	  // gemmsup
 	  BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m,
diff --git a/config/haswell/bli_cntx_init_haswell.c b/config/haswell/bli_cntx_init_haswell.c
index fe3b45147..e211513b2 100644
--- a/config/haswell/bli_cntx_init_haswell.c
+++ b/config/haswell/bli_cntx_init_haswell.c
@@ -71,14 +71,10 @@ void bli_cntx_init_haswell( cntx_t* cntx )
 
 #if 1
 	  // packm
-	  BLIS_PACKM_MRXK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_6xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_16xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_6xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_8xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
+	  BLIS_PACKM_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_6x16,
+	  BLIS_PACKM_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_6x8,
+	  BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3x8,
+	  BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3x4,
 #endif
 
 	  // axpyf
diff --git a/config/knl/bli_cntx_init_knl.c b/config/knl/bli_cntx_init_knl.c
index 87fa3176a..548aba177 100644
--- a/config/knl/bli_cntx_init_knl.c
+++ b/config/knl/bli_cntx_init_knl.c
@@ -53,8 +53,7 @@ void bli_cntx_init_knl( cntx_t* cntx )
 	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8,
 
 	  // packm
-	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_8xk,
+	  BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24x8,
 
 	  // axpyf
 	  BLIS_AXPYF_KER, BLIS_FLOAT,  bli_saxpyf_zen_int_8,
diff --git a/config/sifive_x280/bli_cntx_init_sifive_x280.c b/config/sifive_x280/bli_cntx_init_sifive_x280.c
index 197394c82..56a1a66d5 100644
--- a/config/sifive_x280/bli_cntx_init_sifive_x280.c
+++ b/config/sifive_x280/bli_cntx_init_sifive_x280.c
@@ -151,14 +151,10 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx )
 	  BLIS_DOTXAXPYF_KER,  BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_x280_asm,
 
 	  // Level 1m
-	  BLIS_PACKM_MRXK_KER, BLIS_FLOAT,    bli_spackm_sifive_x280_asm_7xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE,   bli_dpackm_sifive_x280_asm_7xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_x280_asm_6xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_x280_asm_6xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_FLOAT,    bli_spackm_sifive_x280_asm_64xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE,   bli_dpackm_sifive_x280_asm_32xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_x280_asm_32xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_x280_asm_16xk,
+	  BLIS_PACKM_KER,      BLIS_FLOAT,    bli_spackm_sifive_x280_asm_7m4,
+	  BLIS_PACKM_KER,      BLIS_DOUBLE,   bli_dpackm_sifive_x280_asm_7m4,
+	  BLIS_PACKM_KER,      BLIS_SCOMPLEX, bli_cpackm_sifive_x280_asm_6m2,
+	  BLIS_PACKM_KER,      BLIS_DCOMPLEX, bli_zpackm_sifive_x280_asm_6m2,
 
 	  // Level 3
 	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_sifive_x280_asm_7m4,
diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c
index a10986b23..99bd794d9 100644
--- a/config/zen/bli_cntx_init_zen.c
+++ b/config/zen/bli_cntx_init_zen.c
@@ -113,14 +113,10 @@ void bli_cntx_init_zen( cntx_t* cntx )
 #endif
 
 	  // packm
-	  BLIS_PACKM_MRXK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_6xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_16xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_6xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_8xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
+	  BLIS_PACKM_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_6x16,
+	  BLIS_PACKM_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_6x8,
+	  BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3x8,
+	  BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3x4,
 
 	  // axpyf
 	  BLIS_AXPYF_KER,  BLIS_FLOAT,  bli_saxpyf_zen_int_8,
@@ -324,7 +320,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	bli_cntx_set_l3_sup_handlers
 	(
 	  cntx,
-	  
+
 	  BLIS_GEMM, bli_gemmsup_ref,
 	  //BLIS_GEMMT, bli_gemmtsup_ref,
 
diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c
index c7e40b4d0..7f507d073 100644
--- a/config/zen2/bli_cntx_init_zen2.c
+++ b/config/zen2/bli_cntx_init_zen2.c
@@ -110,14 +110,10 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 #endif
 
 	  // packm
-	  BLIS_PACKM_MRXK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_6xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_16xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_6xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_8xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
+	  BLIS_PACKM_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_6x16,
+	  BLIS_PACKM_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_6x8,
+	  BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3x8,
+	  BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3x4,
 
 	  // axpyf
 	  BLIS_AXPYF_KER,  BLIS_FLOAT,  bli_saxpyf_zen_int_5,
@@ -281,7 +277,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	bli_cntx_set_l3_sup_handlers
 	(
 	  cntx,
-	  
+
 	  BLIS_GEMM, bli_gemmsup_ref,
 	  //BLIS_GEMMT, bli_gemmtsup_ref,
 
diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c
index 3ee385ed6..cf0516a89 100644
--- a/config/zen3/bli_cntx_init_zen3.c
+++ b/config/zen3/bli_cntx_init_zen3.c
@@ -120,14 +120,10 @@ void bli_cntx_init_zen3( cntx_t* cntx )
 	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen,
 	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen,
 #else
-	  BLIS_PACKM_MRXK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_6xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_16xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_6xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_8xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
-	  BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
-	  BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
+	  BLIS_PACKM_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_6x16,
+	  BLIS_PACKM_KER, BLIS_DOUBLE,   bli_dpackm_haswell_asm_6x8,
+	  BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3x8,
+	  BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3x4,
 #endif
 
 	  // axpyf
@@ -298,7 +294,7 @@ void bli_cntx_init_zen3( cntx_t* cntx )
 	bli_cntx_set_l3_sup_handlers
 	(
 	  cntx,
-	  
+
 	  BLIS_GEMM, bli_gemmsup_ref,
 	  //BLIS_GEMMT, bli_gemmtsup_ref,
 
diff --git a/configure b/configure
index f87093cad..dc6817f7d 100755
--- a/configure
+++ b/configure
@@ -278,24 +278,6 @@ print_usage()
                  compatibility layer. This automatically enables the
                  BLAS compatibility layer as well.
 
-   --disable-mixed-dt, --enable-mixed-dt
-
-                 Disable (enabled by default) support for mixing the
-                 storage domain and/or storage precision of matrix
-                 operands for the gemm operation, as well as support
-                 for computing in a precision different from one or
-                 both of matrices A and B.
-
-   --disable-mixed-dt-extra-mem, --enable-mixed-dt-extra-mem
-
-                 Disable (enabled by default) support for additional
-                 mixed datatype optimizations that require temporarily
-                 allocating extra memory--specifically, a single m x n
-                 matrix (per application thread) whose storage datatype
-                 is equal to the computation datatype. This option may
-                 only be enabled when mixed domain/precision support is
-                 enabled.
-
    --disable-sup-handling, --enable-sup-handling
 
                  Disable (enabled by default) handling of small/skinny
@@ -478,6 +460,57 @@ assign_key_value()
 	printf -v "${arr}_${key}" %s "${val}"
 }
 
+fully_eval()
+{
+	local old new
+
+	old=""
+	new="$1"
+
+	while [ "x${old}" != "x${new}" ]; do
+		old="${new}"
+		new=$(eval echo "${old}")
+	done
+
+	echo "${new}"
+}
+
+add_config_var()
+{
+	local sub_var var
+
+	sub_var="$1"
+	var="${2:-${sub_var}}"
+
+	#
+	# Use the | character in the substitution command to avoid mangling variables
+	# which contain forward slashes (e.g. paths). There *shouldn't* be any variables
+	# that have | in them...
+	#
+	config_substitutions="${config_substitutions} -e \\\"s|\@${sub_var}\@|\${${var}}|g;\\\""
+}
+
+generate_config_file()
+{
+	local in_file out_file sub
+
+	in_file="$1"
+	out_file="$2"
+
+	echo "${script_name}: creating ${out_file} from ${in_file}"
+
+	#
+	# Use 'eval' to expand the variable references.
+	#
+	eval "sub=\"${config_substitutions}\""
+
+	#
+	# 'eval' also has to be used here to get the proper quoting.
+	# This 'eval' CAN NOT be combined with the one above.
+	#
+	eval "perl -p ${sub} <\"${in_file}\" >\"${out_file}\""
+}
+
 #
 # FGVZ: This commented-out function is being kept as an example how how
 # to effectively "pass by reference" in bash. That is, pass the name of
@@ -629,6 +662,10 @@ pass_config_kernel_registries()
 		# - pass 1: compare cname to the blacklists and commit clist/klist
 		#   to their respective registries, as appropriate.
 
+		# Add cname to full_config_list. Duplicates will
+		# be filtered out later.
+		full_config_list="${full_config_list} ${cname}"
+
 		# Handle singleton and umbrella configuration entries separately.
 		if [[ $(is_singleton_family "${cname}" "${clist}") == "true" ]]; then
 
@@ -637,6 +674,11 @@ pass_config_kernel_registries()
 			# always equals cname, but klist could contain more than one
 			# item.
 
+			# Add cname to full_subconfig_list and the kernels in klist to
+			# full_kernel_list. Duplicates will be filtered out later.
+			full_subconfig_list="${full_subconfig_list} ${cname}"
+			full_kernel_list="${full_kernel_list} ${klist}"
+
 			# Only consider updating the indirect blacklist (pass 0) or
 			# committing clist and klist to the registries (pass 1) if the
 			# configuration name (cname) is not blacklisted.
@@ -712,6 +754,12 @@ pass_config_kernel_registries()
 		# Assign the final indirect blacklist (with whitespace removed).
 		indirect_blist=$(canonicalize_ws "${indirect_blist}")
 	fi
+
+	# Remove duplicates and excess whitespace from the full config and
+	# kernel lists.
+	full_config_list=$(canonicalize_ws "$(rm_duplicate_words_simple "${full_config_list}")")
+	full_subconfig_list=$(canonicalize_ws "$(rm_duplicate_words_simple "${full_subconfig_list}")")
+	full_kernel_list=$(canonicalize_ws "$(rm_duplicate_words_simple "${full_kernel_list}")")
 }
 
 read_registry_file()
@@ -1325,9 +1373,11 @@ has_pragma_omp_simd()
 {
 	local main_c main_c_filepath binname rval
 
+	omp_simd_path="${omp_simd_path-${dist_path}/build}"
+
 	# Path to omp-simd detection source file.
 	main_c="omp_simd_detect.c"
-	main_c_filepath=$(find "${dist_path}/build" -name "${main_c}")
+	main_c_filepath=$(find "${omp_simd_path}" -name "${main_c}")
 
 	# Binary executable filename.
 	binname="omp_simd-detect.x"
@@ -1966,12 +2016,12 @@ check_compiler_version_ranges()
 
 check_assembler()
 {
-	local cc asm_dir cflags asm_fp
+	local cc cflags asm_fp
 
 	cc="${found_cc}"
 
 	# The directory where the assembly files will be.
-	asm_dir="${dist_path}/build"
+	asm_dir=${asm_dir-${dist_path}/build}
 
 	# Most of the time, we won't need any additional compiler flags.
 	cflags=""
@@ -2172,8 +2222,18 @@ select_tool_w_env()
 
 		echo "${script_name}: user specified a ${tool_str} via ${env_str} (${env_var})."
 
-		# See if the binary specified by env_var exists.
-		_the_tool=$(select_tool "${env_var}" "${env_str}")
+		# Map the tool (via its canonical environment variable form) to the set
+		# of options we should use to check that it is working and available.
+		the_flags=$(get_tool_checkflags "${env_str}")
+
+		# Check that the tool works with at least one of the flags in
+		# the_flags (or, if the_flags is empty, check that the tool exists).
+		rval=$(check_tool "${env_var}" "${the_flags}")
+
+		# If check_tool() returns 0, we're done.
+		if [ "${rval}" == "0" ]; then
+			_the_tool="${env_var}"
+		fi
 
 		# Copy the result into the variable specified by found_var.
 		eval "${found_var}=\"${_the_tool}\""
@@ -2338,7 +2398,7 @@ check_tool()
 		for opt in ${the_flags}; do
 
 			# See if the tool responds to the current flag.
-			"${tool}" "${opt}" > /dev/null 2>&1
+			${tool} ${opt} > /dev/null 2>&1
 
 			# If the tool responded to the flag with a nominal error code of
 			# 0, we found one that works and set rval accoringly.
@@ -2365,2179 +2425,2949 @@ check_tool()
 	echo "${rval}"
 }
 
+build_and_check_configurations()
+{
+	# Use the selected config name to look up the list of configurations
+	# and kernels associated with that name.
+	#config_list=${config_registry[${config_name}]}
+	#kernel_list=${kernel_registry[${config_name}]}
+	config_list=$(query_array "config_registry" "${config_name}")
+	kernel_list=$(query_array "kernel_registry" "${config_name}")
 
+	# Use the config_registry and kernel_registry to build a kconfig_registry
+	# for the selected config_name.
+	build_kconfig_registry "${config_name}"
 
-#
-# -- main function -------------------------------------------------------------
-#
+	# Print the configuration list and kernel list, if requested.
+	if [ "${show_config_list}" == "1" ]; then
 
-main()
-{
-	#declare -A config_registry
-	#declare -A kernel_registry
-	#declare -A kconfig_registry
+		echo "${script_name}: configuration list:"
+		#for k in "${!config_registry[@]}"; do
+		for cr_var in ${!config_registry_*}; do
 
-	# -- Basic names and paths --
+			#v=${config_registry[$k]}
+			k=${cr_var##config_registry_}; v=${!cr_var}
 
-	# The name of the script, stripped of any preceeding path.
-	script_name=${0##*/}
+			echo "${script_name}:   $k: ${v}"
+		done
 
-	# The path to the script. We need this to find the top-level directory
-	# of the source distribution in the event that the user has chosen to
-	# build elsewhere.
-	dist_path=${0%"/${script_name}"}
+		echo "${script_name}: kernel list:"
+		#for k in "${!kernel_registry[@]}"; do
+		for kr_var in ${!kernel_registry_*}; do
 
-	# The path to the directory in which we are building. We do this to
-	# make explicit that we distinguish between the top-level directory
-	# of the distribution and the directory in which we are building.
-	cur_dirpath="."
+			#v=${kernel_registry[$k]}
+			k=${kr_var##kernel_registry_}; v=${!kr_var}
 
-	# The name of and path to the directory named "build" in the top-level
-	# directory of the source distribution.
-	build_dir='build'
-	build_dirpath="${dist_path}/${build_dir}"
+			echo "${script_name}:   $k: ${v}"
+		done
 
-	# The name/path to the registry (master list) of supported configurations.
-	registry_file="config_registry"
-	registry_filepath=${dist_path}/${registry_file}
+		echo "${script_name}: kernel-to-config map for '${config_name}':"
+		#for k in "${!kconfig_registry[@]}"; do
+		for kc_var in ${!kconfig_registry_*}; do
 
-	# The names/paths for the template config.mk.in and its instantiated
-	# counterpart.
-	config_mk_in='config.mk.in'
-	config_mk_out='config.mk'
-	config_mk_in_path="${build_dirpath}/${config_mk_in}"
-	config_mk_out_path="${cur_dirpath}/${config_mk_out}"
+			#v=${kconfig_registry[$k]}
+			k=${kc_var##kconfig_registry_}; v=${!kc_var}
 
-	# The names/paths for the template bli_config.h.in and its instantiated
-	# counterpart.
-	bli_config_h_in='bli_config.h.in'
-	bli_config_h_out='bli_config.h'
-	bli_config_h_in_path="${build_dirpath}/${bli_config_h_in}"
-	bli_config_h_out_path="${cur_dirpath}/${bli_config_h_out}"
+			echo "${script_name}:   $k: ${v}"
+		done
+	fi
 
-	# The names/paths for the template bli_addon.h.in and its instantiated
-	# counterpart.
-	bli_addon_h_in='bli_addon.h.in'
-	bli_addon_h_out='bli_addon.h'
-	bli_addon_h_in_path="${build_dirpath}/${bli_addon_h_in}"
-	bli_addon_h_out_path="${cur_dirpath}/${bli_addon_h_out}"
+	# For each kernel in the kernel list, reduce the list of associated
+	# sub-configurations (in the kconfig_registry) to a singleton using
+	# the following rules:
+	# 1. If the list is a singleton, use that name.
+	# 2. If the list contains a sub-configuration name that matches the
+	#    kernel name, use that name.
+	# 3. Otherwise, use the last name in the list.
+	# We use the chosen singleton to create a "kernel:subconfig" pair, which
+	# we accumulate into a list. This list is the kernel-to-config map, or
+	# kconfig_map.
 
-	# Path to 'mirror-tree.sh' script.
-	mirror_tree_sh="${build_dirpath}/mirror-tree.sh"
+	# We use a sorted version of kernel_list so that it ends up matching the
+	# display order of the kconfig_registry above.
+	# shellcheck disable=2086
+	kernel_list_sort=$(echo ${kernel_list} | xargs -n1 | sort -u)
 
-	# Path to 'gen-make-frags.sh' script and directory.
-	gen_make_frags_dirpath="${build_dirpath}/gen-make-frags"
-	gen_make_frags_sh="${gen_make_frags_dirpath}/gen-make-frag.sh"
+	kconfig_map=""
+	for kernel in ${kernel_list_sort}; do
 
-	# The name of the (top-level) configuration directory.
-	config_dir='config'
-	config_dirpath="${dist_path}/${config_dir}"
+		#configs="${kconfig_registry[$kernel]}"
+		configs=$(query_array "kconfig_registry" "${kernel}")
 
-	# The name of the (top-level) kernels directory.
-	kernels_dir='kernels'
-	kernels_dirpath="${dist_path}/${kernels_dir}"
+		has_one_kernel=$(is_singleton "${configs}")
+		contains_kernel=$(is_in_list "${kernel}" "${configs}")
 
-	# The name of the (top-level) reference kernels directory.
-	refkern_dir='ref_kernels'
-	refkern_dirpath="${dist_path}/${refkern_dir}"
+		# Check if the list is a singleton.
+		if [ "${has_one_kernel}" == "true" ]; then
 
-	# The root directory of the BLIS framework.
-	frame_dir='frame'
-	frame_dirpath="${dist_path}/${frame_dir}"
+			reducedclist="${configs}"
 
-	# The names of the addons.
-	addon_dir='addon'
-	addon_dirpath="${dist_path}/${addon_dir}"
+		# Check if the list contains a sub-config name that matches the kernel.
+		elif [ "${contains_kernel}" == "true" ]; then
 
-	# The name of the sandbox directory.
-	sandbox_dir='sandbox'
-	sandbox_dirpath="${dist_path}/${sandbox_dir}"
+			reducedclist="${kernel}"
 
-	# The name of the directory in which object files will be kept.
-	obj_dir='obj'
-	obj_dirpath="${cur_dirpath}/${obj_dir}"
+		# Otherwise, use the last name.
+		else
 
-	# The name of the directory in which libraries will be kept.
-	lib_dir='lib'
-	lib_dirpath="${cur_dirpath}/${lib_dir}"
+			last_config=${configs##* }
+			reducedclist="${last_config}"
+		fi
 
-	# The name of the directory in which headers will be kept.
-	include_dir='include'
-	include_dirpath="${cur_dirpath}/${include_dir}"
+		# Create a new "kernel:subconfig" pair and add it to the kconfig_map
+		# list, removing whitespace.
+		new_pair="${kernel}:${reducedclist}"
+		kconfig_map=$(canonicalize_ws "${kconfig_map} ${new_pair}")
+	done
 
-	# The name of the directory in which the BLAS test suite is kept.
-	blastest_dir='blastest'
+	if [ "${show_config_list}" == "1" ]; then
 
-	# The name of the directory in which the BLIS test suite is kept.
-	testsuite_dir='testsuite'
+		echo "${script_name}: kernel-to-config map for '${config_name}' (chosen pairs):"
+		for k in ${kconfig_map}; do
+			echo "${script_name}:   $k"
+		done
+	fi
 
-	# -- Version-related --
 
-	# The file in which the version string is kept.
-	version_file="version"
-	version_filepath="${build_dirpath}/${version_file}"
+	echo "${script_name}: checking configuration against contents of '${registry_file}'."
 
-	# The shared library (.so) version file.
-	so_version_file='so_version'
-	so_version_filepath="${build_dirpath}/${so_version_file}"
+	# First, ensure that the config name is registered (ie: it is present
+	# in the config_registry file).
+	if [ -z "${config_list}" ]; then
 
-	# The major and minor/build .so version numbers.
-	so_version_major=''
-	so_version_minorbuild=''
+		# NOTE: This branch should never execute when using auto-detection,
+		# but we have it here just in case.
+		if [[ $1 = auto ]]; then
 
-	# -- configure options --
+			echo "${script_name}: 'auto-detected configuration '${config_name}' is NOT registered!"
+			echo "${script_name}: "
+			echo "${script_name}: *** Cannot continue with unregistered configuration '${config_name}'. ***"
+			echo "${script_name}: "
+			exit 1;
 
-	# Define the default prefix so that the print_usage() function can
-	# output it in the --help text.
-	prefix_def='/usr/local'
+		else
 
-	# The installation prefix, assigned its default value, and a flag to
-	# track whether or not it was given by the user.
-	prefix=${prefix_def}
-	prefix_flag=''
+			# At this point, we know: (a) config_list is empty; and (b) the user
+			# requested manual configuration. If the config_name given by the
+			# user is present in the configuration blacklist (config_blist),
+			# then we can deduce why the config_list is empty: because the only
+			# subconfig implied by config_name is blacklisted. Thus, we cannot
+			# proceed.
 
-	# The installation exec_prefix, assigned its default value, and a flag to
-	# track whether or not it was given by the user.
-	# shellcheck disable=2016
-	exec_prefix='${prefix}'
-	exec_prefix_flag=''
+			if [[ $(is_in_list "${config_name}" "${config_blist}") = true ]]; then
 
-	# The installation libdir, assigned its default value, and a flag to
-	# track whether or not it was given by the user.
-	# shellcheck disable=2016
-	libdir='${exec_prefix}/lib'
-	libdir_flag=''
+				echo "${script_name}: 'user-specified configuration '${config_name}' is blacklisted!"
+				echo "${script_name}: "
+				echo "${script_name}: *** Cannot continue with blacklisted configuration '${config_name}'. ***"
+				echo "${script_name}: *** Try updating your compiler and/or assembler (binutils) versions. ***"
+				echo "${script_name}: "
+				exit 1;
+			else
 
-	# The installation includedir, assigned its default value, and a flag to
-	# track whether or not it was given by the user.
-	# shellcheck disable=2016
-	includedir='${prefix}/include'
-	includedir_flag=''
+				# If config_name is NOT present in config_blist, then we know
+				# that config_list is empty simply because config_name is
+				# unregistered.
 
-	# The installation sharedir, assigned its default value, and a flag to
-	# track whether or not it was given by the user.
-	# shellcheck disable=2016
-	sharedir='${prefix}/share'
-	sharedir_flag=''
+				echo "${script_name}: 'user-specified configuration '${config_name}' is NOT registered!"
+				echo "${script_name}: "
+				echo "${script_name}: *** Cannot continue with unregistered configuration '${config_name}'. ***"
+				echo "${script_name}: "
+				exit 1;
+			fi
+		fi
+	else
 
-	# The preset value of CFLAGS and LDFLAGS (ie: compiler and linker flags
-	# to use in addition to those determined by the build system).
-	cflags_preset=''
-	ldflags_preset=''
+		# This branch executes when the configuration is found to be present
+		# (i.e. registered) in the config_registry file.
 
-	# The user-given debug type and a flag indicating it was given.
-	debug_type=''
-	debug_flag=''
-	enable_debug='no'
+		echo "${script_name}: configuration '${config_name}' is registered."
+		echo "${script_name}: '${config_name}' is defined as having the following sub-configurations:"
+		echo "${script_name}:    ${config_list}"
+		echo "${script_name}: which collectively require the following kernels:"
+		echo "${script_name}:    ${kernel_list}"
 
-	# A flag indicating whether AddressSanitizer should be used.
-	enable_asan='no'
+	fi
 
-	# The system flag.
-	enable_system='yes'
 
-	# The thread-local storage flag.
-	enable_tls='yes'
+	echo "${script_name}: checking sub-configurations:"
 
-	# The threading flag.
-	threading_model='off'
+	# Now, verify that the constituent configurations associated with the
+	# config name are all valid.
+	for conf in ${config_list}; do
 
-	# The method of assigning micropanels to threads in the JR and JR loops.
-	thread_part_jrir='slab'
+		# First confirm that the current configuration is registered.
+		#this_clist=${config_registry[${conf}]}
+		this_clist=$(query_array "config_registry" "${conf}")
 
-	# Option variables.
-	quiet_flag=''
-	show_config_list=''
+		# If the config_list associated with conf is empty, then it was
+		# never entered into the config_registry to begin with. Thus,
+		# conf must be unregistered.
+		if [ -z "${this_clist}" ]; then
+			echo "${script_name}: '${conf}' is NOT registered!"
+			echo "${script_name}: "
+			echo "${script_name}: *** Cannot continue with unregistered configuration '${conf}'. ***"
+			echo "${script_name}: "
+			exit 1;
+		else
+			echo -n "${script_name}:   '${conf}' is registered."
+		fi
 
-	# Additional flags.
-	enable_verbose='no'
-	enable_arg_max_hack='no'
-	enable_static='yes'
-	enable_shared='yes'
-	enable_rpath='no'
-	export_shared='public'
-	enable_pba_pools='yes'
-	enable_sba_pools='yes'
-	enable_mem_tracing='no'
-	int_type_size=0
-	blas_int_type_size=32
-	enable_blas='yes'
-	enable_cblas='no'
-	enable_mixed_dt='yes'
-	enable_mixed_dt_extra_mem='yes'
-	enable_sup_handling='yes'
-	enable_amd_frame_tweaks='no'
-	enable_memkind='' # The default memkind value is determined later on.
-	enable_trsm_preinversion='yes'
-	force_version='no'
-	complex_return='default'
+		# Then confirm that the current sub-configuration directory exists.
+		if [ ! -d "${config_dirpath}/${conf}" ]; then
+			echo "..but does NOT exist!"
+			echo "${script_name}: "
+			echo "${script_name}: *** Cannot continue with nonexistent configuration '${conf}'. ***"
+			echo "${script_name}: "
+			exit 1;
+		else
+			echo "..and exists."
+		fi
+	done
 
-	# The addon flag and names.
-	addon_flag=''
-	addon_list=''
 
-	# The sandbox flag and name.
-	sandbox_flag=''
-	sandbox=''
+	echo "${script_name}: checking sub-configurations' requisite kernels:"
 
-	# -- Configuration registry --
+	# Also, let's verify that the requisite kernel sets associated with
+	# the config name all correspond to directories that exist.
+	for kernel in ${kernel_list}; do
 
-	# The name of the chosen configuration (the configuration "family").
-	config_name=''
+		echo -n "${script_name}:   '${kernel}' kernels..."
 
-	# The list of sub-configurations associated with config_name.
-	config_list=''
+		# Confirm that the current kernel sub-directory exists.
+		if [ ! -d "${kernels_dirpath}/${kernel}" ]; then
+			echo "do NOT exist!"
+			echo "${script_name}: "
+			echo "${script_name}: *** Cannot continue with nonexistent kernel '${kernel}'. ***"
+			echo "${script_name}: "
+			exit 1;
+		else
+			echo "exist."
+		fi
+	done
+}
 
-	# The list of kernel sets that will be needed by the sub-configurations
-	# in config_list..
-	kernel_list=''
+check_build_tools()
+{
+	# -- Check the operating system --------------------------------------------
 
-	# The list of kernel:sub-configuration pairs for all kernels contained
-	# in kernel_list.
-	kconfig_map=''
+	os_name=$(uname -s)
+	os_vers=$(uname -r)
+	echo "${script_name}: detected ${os_name} kernel version ${os_vers}."
 
-	# -- Out-of-tree --
+	# Define a single variable off of which we can branch to tell if we are
+	# building for Windows.
+	is_win=no
+	if [[ $os_name == MSYS* ]] || \
+	   [[ $os_name == MINGW* ]]  || \
+	   [[ $os_name == CYGWIN* ]] ; then
+		is_win=yes
+	fi
 
-	# Whether we are building out-of-tree.
-	configured_oot="no"
 
-	# Dummy file. Used to check whether the cwd is the same as the top-level
-	# source distribution directory.
-	dummy_file='_blis_dir_detect.tmp'
+	# -- Find a python interpreter ---------------------------------------------
 
-	# -- Debugging --
+	# Acquire the default python search order.
+	python_search_list=$(get_python_search_list)
 
-	# A global flag to help debug the compilation command for the executable
-	# that configure builds on-the-fly to perform hardware auto-detection.
-	debug_auto_detect="no"
+	# Select a python interpreter from the default list, or from PYTHON if it
+	# refers to a valid binary.
+	# shellcheck disable=2153
+	select_tool_w_env "${python_search_list}" "${PYTHON}" "PYTHON" \
+	                  "python interpreter" "yes" found_python
 
+	# -- Check the python version ----------------------------------------------
 
+	# Check the python interpreter's version.
+	get_python_version
+	check_python
 
-	# -- Command line option/argument parsing ----------------------------------
 
-	found=true
-	while [[ $found = true ]]; do
+	# -- Find a C compiler -----------------------------------------------------
 
-		# Process our command line options.
-		unset OPTIND
-		while getopts ":hp:d:e:a:s:t:r:qci:b:-:" opt; do
-			case $opt in
-				-)
-					case "$OPTARG" in
+	# Acquire the default compiler search order. This will vary based on os_name.
+	cc_search_list=$(get_cc_search_list)
 
-						help)
-							print_usage
-							;;
+	# Select a C compiler from the default list, or from CC if it refers to a
+	# valid binary.
+	select_tool_w_env "${cc_search_list}" "${CC}" "CC" \
+	                  "C compiler" "yes" found_cc
 
-						quiet)
-							quiet_flag=1
-							;;
+	# Also check the compiler to see if we are (cross-)compiling for Windows
+	if "${found_cc}" -dM -E - < /dev/null 2> /dev/null | grep -q _WIN32; then
+		is_win=yes
+	fi
+	is_msvc=no
+	if "${found_cc}" -dM -E - < /dev/null 2> /dev/null | grep -q _MSC_VER; then
+		is_msvc=yes
+	fi
 
-						prefix=*)
-							prefix_flag=1
-							prefix=${OPTARG#*=}
-							;;
-						exec-prefix=*)
-							exec_prefix_flag=1
-							exec_prefix=${OPTARG#*=}
-							;;
-						libdir=*)
-							libdir_flag=1
-							libdir=${OPTARG#*=}
-							;;
-						includedir=*)
-							includedir_flag=1
-							includedir=${OPTARG#*=}
-							;;
-						sharedir=*)
-							sharedir_flag=1
-							sharedir=${OPTARG#*=}
-							;;
 
-						enable-debug)
-							debug_flag=1
-							debug_type=noopt
-							;;
-						enable-debug=*)
-							debug_flag=1
-							debug_type=${OPTARG#*=}
-							;;
-						disable-debug)
-							debug_flag=0
-							;;
+	# -- Check the compiler version --------------------------------------------
 
-						enable-asan)
-							enable_asan='yes'
-							;;
-						disable-asan)
-							enable_asan='no'
-							;;
+	# Initialize the blacklist to empty.
+	blacklist_init
 
-						enable-verbose-make)
-							enable_verbose='yes'
-							;;
-						disable-verbose-make)
-							enable_verbose='no'
-							;;
+	# Check the compiler's version. Certain versions of certain compilers
+	# will preclude building certain sub-configurations, which are added
+	# to a blacklist. We also make note of certain version ranges that
+	# will be useful to know about later.
+	get_compiler_version
+	check_compiler
+	check_compiler_version_ranges
 
-						enable-arg-max-hack)
-							enable_arg_max_hack='yes'
-							;;
-						disable-arg-max-hack)
-							enable_arg_max_hack='no'
-							;;
+	# Now check the assembler's ability to assemble code. Older versions
+	# of binutils may not be aware of certain instruction sets. Those
+	# sub-configurations employing kernels that use such instruction sets
+	# will also be blacklisted.
+	get_binutils_version
+	check_assembler
 
-						enable-static)
-							enable_static='yes'
-							;;
-						disable-static)
-							enable_static='no'
-							;;
+	# Check if there is any incompatibility due to the operating system.
+	check_os
 
-						enable-shared)
-							enable_shared='yes'
-							;;
-						disable-shared)
-							enable_shared='no'
-							;;
+	# Remove duplicates and whitespace from the blacklist.
+	blacklist_cleanup
 
-						enable-rpath)
-							enable_rpath='yes'
-							;;
-						disable-rpath)
-							enable_rpath='no'
-							;;
+	if [ -n "${config_blist}" ]; then
 
-						export-shared=*)
-							export_shared=${OPTARG#*=}
-							;;
+		echo "${script_name}: configuration blacklist:"
+		echo "${script_name}:   ${config_blist}"
+	fi
 
-						enable-system)
-							enable_system='yes'
-							;;
-						disable-system)
-							enable_system='no'
-							;;
 
-						enable-tls)
-							enable_tls='yes'
-							;;
-						disable-tls)
-							enable_tls='no'
-							;;
+	# -- Find a C++ compiler ---------------------------------------------------
 
-						enable-threading=*)
-							threading_model=${OPTARG#*=}
-							;;
-						disable-threading)
-							threading_model='single'
-							;;
+	# Acquire the default C++ compiler search order. This will vary based on
+	# os_name.
+	cxx_search_list=$(get_cxx_search_list)
 
-						thread-part-jrir=*)
-							thread_part_jrir=${OPTARG#*=}
-							;;
+	# Select a C++ compiler from the default list, or from CXX if it refers to
+	# a valid binary.
+	select_tool_w_env "${cxx_search_list}" "${CXX}" "CXX" \
+	                  "C++ compiler" "no" found_cxx
 
-						enable-pba-pools)
-							enable_pba_pools='yes'
-							;;
-						disable-pba-pools)
-							enable_pba_pools='no'
-							;;
 
-						enable-sba-pools)
-							enable_sba_pools='yes'
-							;;
-						disable-sba-pools)
-							enable_sba_pools='no'
-							;;
+	# -- Find a Fortran compiler -----------------------------------------------
 
-						enable-mem-tracing)
-							enable_mem_tracing='yes'
-							;;
-						disable-mem-tracing)
-							enable_mem_tracing='no'
-							;;
+	# Acquire the default Fortran compiler search order.
+	fc_search_list=$(get_fc_search_list)
 
-						enable-addon=*)
-							addon_flag=1
-							addon_name=${OPTARG#*=}
-							# Append the addon name to the list.
-							addon_list="${addon_list} ${addon_name}"
-							;;
-						disable-addon)
-							addon_flag=''
-							;;
+	# Select a Fortran compiler from the default list, or from FC if it refers
+	# to a valid binary.
+	# NOTE: A Fortran compiler is not necessary for building BLIS. The only
+	# reason we might want to query it is to detect the style of returning
+	# complex values from functions. The 'gnu' style returns complex values
+	# from functions normally, via the C language return statement, while the
+	# 'intel' style returns them in a "hidden" parameter (inserted by the
+	# compiler) that precedes all other function parameters.
+	select_tool_w_env "${fc_search_list}" "${FC}" "FC" \
+	                  "Fortran compiler" "no" found_fc
 
-						enable-sandbox=*)
-							sandbox_flag=1
-							sandbox=${OPTARG#*=}
-							;;
-						disable-sandbox)
-							sandbox_flag=''
-							;;
 
-						int-size=*)
-							int_type_size=${OPTARG#*=}
-							;;
+	# -- Find a static library archiver ----------------------------------------
 
-						blas-int-size=*)
-							blas_int_type_size=${OPTARG#*=}
-							;;
+	# Acquire the default archiver search order.
+	ar_search_list=$(get_ar_search_list)
 
-						enable-blas)
-							enable_blas='yes'
-							;;
-						disable-blas)
-							enable_blas='no'
-							;;
+	# Select an archiver from the default list, or from AR if it refers
+	# to a valid binary.
+	select_tool_w_env "${ar_search_list}" "${AR}" "AR" \
+	                  "library archiver" "yes" found_ar
 
-						enable-cblas)
-							enable_cblas='yes'
-							;;
-						disable-cblas)
-							enable_cblas='no'
-							;;
 
-						enable-mixed-dt)
-							enable_mixed_dt='yes'
-							;;
-						disable-mixed-dt)
-							enable_mixed_dt='no'
-							;;
+	# -- Find an archive indexer -----------------------------------------------
 
-						enable-mixed-dt-extra-mem)
-							enable_mixed_dt_extra_mem='yes'
-							;;
-						disable-mixed-dt-extra-mem)
-							enable_mixed_dt_extra_mem='no'
-							;;
+	# Acquire the default archive indexer search order.
+	ranlib_search_list=$(get_ranlib_search_list)
 
-						sup)
-							enable_sup_handling='yes'
-							;;
-						enable-sup-handling)
-							enable_sup_handling='yes'
-							;;
-						nosup)
-							enable_sup_handling='no'
-							;;
-						disable-sup-handling)
-							enable_sup_handling='no'
-							;;
+	# Select an archive indexer from the default list, or from RANLIB if it
+	# refers to a valid binary.
+	select_tool_w_env "${ranlib_search_list}" "${RANLIB}" "RANLIB" \
+	                  "archive indexer" "yes" found_ranlib
+}
 
-						enable-amd-frame-tweaks)
-							enable_amd_frame_tweaks='yes'
-							;;
-						disable-amd-frame-tweaks)
-							enable_amd_frame_tweaks='no'
-							;;
+create_makefile_fragment()
+{
+	# Invoke the gen-make-frag.sh helper to generate makefile fragments.
+	# $1 is handed to -p; $2 and $3 follow it as positional arguments.
+	# The -r flag is passed unless $4 is exactly "false".
+	local recurse_opt="-r"
+	[ "$4" != "false" ] || recurse_opt=""
+	echo "${script_name}: creating makefile fragments in $3"
+	# recurse_opt stays unquoted so that an empty value adds no argument.
+	"${gen_make_frags_sh}" -h ${recurse_opt} -v0 \
+		-o "${script_name}" \
+		-p "$1" "$2" "$3" \
+		"${gen_make_frags_dirpath}/fragment.mk" \
+		"${gen_make_frags_dirpath}/suffix_list" \
+		"${gen_make_frags_dirpath}/ignore_list"
+}
 
-						with-memkind)
-							enable_memkind='yes'
-							;;
-						without-memkind)
-							enable_memkind='no'
-							;;
 
-						enable-trsm-preinversion)
-							enable_trsm_preinversion='yes'
-							;;
-						disable-trsm-preinversion)
-							enable_trsm_preinversion='no'
-							;;
+#
+# -- blis_main function -------------------------------------------------------------
+#
 
-						force-version=*)
-							force_version=${OPTARG#*=}
-							;;
+blis_main()
+{
+	#declare -A config_registry
+	#declare -A kernel_registry
+	#declare -A kconfig_registry
 
-						show-config-list)
-							show_config_list=1
-							;;
+	# -- Basic names and paths --
 
-						complex-return=*)
-							complex_return=${OPTARG#*=}
-							;;
+	# The name of the script, stripped of any preceding path.
+	script_name=${0##*/}
 
-						*)
-							print_usage
-							;;
-					esac;;
-				h)
-					print_usage
-					;;
-				p)
-					prefix_flag=1
-					prefix=$OPTARG
-					;;
-				d)
-					debug_flag=1
-					debug_type=$OPTARG
-					;;
-				e)
-					export_shared=$OPTARG
-					;;
-				a)
-					addon_flag=1
-					addon_name=$OPTARG
-					# Append the addon name to the list.
-					addon_list="${addon_list} ${addon_name}"
-					;;
-				s)
-					sandbox_flag=1
-					sandbox=$OPTARG
-					;;
-				q)
-					quiet_flag=1
-					;;
-				t)
-					threading_model=$OPTARG
-					;;
-				r)
-					thread_part_jrir=$OPTARG
-					;;
-				i)
-					int_type_size=$OPTARG
-					;;
-				b)
-					blas_int_type_size=$OPTARG
-					;;
-				c)
-					show_config_list=1
-					;;
-				\?)
-					print_usage
-					;;
-			esac
-		done
-		shift $((OPTIND - 1))
+	# The path to the script. We need this to find the top-level directory
+	# of the source distribution in the event that the user has chosen to
+	# build elsewhere.
+	dist_path=${0%"/${script_name}"}
 
-		# Parse environment variables
-		found=false
-		while [ $# -gt 0 ]; do
-			case $1 in
-				*=*)
-					var=$(expr "$1" : '\([^=]*\)=')
-					value=$(expr "$1" : '[^=]*=\(.*\)')
-					eval "export $var=\$value"
-					shift
-					found=true
-					;;
-				*)
-					break
-					;;
-			esac
-		done
-	done
+	# The path to the directory in which we are building. We do this to
+	# make explicit that we distinguish between the top-level directory
+	# of the distribution and the directory in which we are building.
+	cur_dirpath="."
 
+	# The name of and path to the directory named "build" in the top-level
+	# directory of the source distribution.
+	build_dir='build'
+	build_dirpath="${dist_path}/${build_dir}"
 
-	# -- Check the operating system --------------------------------------------
+	# The name/path to the registry (master list) of supported configurations.
+	registry_file="config_registry"
+	registry_filepath=${dist_path}/${registry_file}
 
-	os_name=$(uname -s)
-	os_vers=$(uname -r)
-	echo "${script_name}: detected ${os_name} kernel version ${os_vers}."
+	# The names/paths for the template config.mk.in and its instantiated
+	# counterpart.
+	config_mk_in='config.mk.in'
+	config_mk_out='config.mk'
+	config_mk_in_path="${build_dirpath}/${config_mk_in}"
+	config_mk_out_path="${cur_dirpath}/${config_mk_out}"
 
-	# Define a single variable off of which we can branch to tell if we are
-	# building for Windows.
-	is_win=no
-	if [[ $os_name == MSYS* ]] || \
-	   [[ $os_name == MINGW* ]]  || \
-	   [[ $os_name == CYGWIN* ]] ; then
-		is_win=yes
-	fi
+	# The names/paths for the template bli_config.h.in and its instantiated
+	# counterpart.
+	bli_config_h_in='bli_config.h.in'
+	bli_config_h_out='bli_config.h'
+	bli_config_h_in_path="${build_dirpath}/${bli_config_h_in}"
+	bli_config_h_out_path="${cur_dirpath}/${bli_config_h_out}"
 
+	# The names/paths for the template bli_addon.h.in and its instantiated
+	# counterpart.
+	bli_addon_h_in='bli_addon.h.in'
+	bli_addon_h_out='bli_addon.h'
+	bli_addon_h_in_path="${build_dirpath}/${bli_addon_h_in}"
+	bli_addon_h_out_path="${cur_dirpath}/${bli_addon_h_out}"
 
-	# -- Find a python interpreter ---------------------------------------------
+	# Path to 'mirror-tree.sh' script.
+	mirror_tree_sh="${build_dirpath}/mirror-tree.sh"
 
-	# Acquire the default python search order.
-	python_search_list=$(get_python_search_list)
+	# Path to 'gen-make-frags.sh' script and directory.
+	gen_make_frags_dirpath="${build_dirpath}/gen-make-frags"
+	gen_make_frags_sh="${gen_make_frags_dirpath}/gen-make-frag.sh"
 
-	# Select a python interpreter from the default list, or from PYTHON if it
-	# refers to a valid binary.
-	# shellcheck disable=2153
-	select_tool_w_env "${python_search_list}" "${PYTHON}" "PYTHON" \
-	                  "python interpreter" "yes" found_python
+	# The name of the (top-level) configuration directory.
+	config_dir='config'
+	config_dirpath="${dist_path}/${config_dir}"
 
-	# -- Check the python version ----------------------------------------------
+	# The name of the (top-level) kernels directory.
+	kernels_dir='kernels'
+	kernels_dirpath="${dist_path}/${kernels_dir}"
 
-	# Check the python interpreter's version.
-	get_python_version
-	check_python
+	# The name of the (top-level) reference kernels directory.
+	refkern_dir='ref_kernels'
+	refkern_dirpath="${dist_path}/${refkern_dir}"
 
+	# The root directory of the BLIS framework.
+	frame_dir='frame'
+	frame_dirpath="${dist_path}/${frame_dir}"
 
-	# -- Find a C compiler -----------------------------------------------------
+	# The name of the (top-level) addon directory.
+	addon_dir='addon'
+	addon_dirpath="${dist_path}/${addon_dir}"
 
-	# Acquire the default compiler search order. This will vary based on os_name.
-	cc_search_list=$(get_cc_search_list)
+	# The name of the sandbox directory.
+	sandbox_dir='sandbox'
+	sandbox_dirpath="${dist_path}/${sandbox_dir}"
 
-	# Select a C compiler from the default list, or from CC if it refers to a
-	# valid binary.
-	select_tool_w_env "${cc_search_list}" "${CC}" "CC" \
-	                  "C compiler" "yes" found_cc
+	# The name of the directory in which object files will be kept.
+	obj_dir='obj'
+	obj_dirpath="${cur_dirpath}/${obj_dir}"
 
-	# Also check the compiler to see if we are (cross-)compiling for Windows
-	if "${found_cc}" -dM -E - < /dev/null 2> /dev/null | grep -q _WIN32; then
-		is_win=yes
-	fi
-	is_msvc=no
-	if "${found_cc}" -dM -E - < /dev/null 2> /dev/null | grep -q _MSC_VER; then
-		is_msvc=yes
-	fi
+	# The name of the directory in which libraries will be kept.
+	lib_dir='lib'
+	lib_dirpath="${cur_dirpath}/${lib_dir}"
 
+	# The name of the directory in which headers will be kept.
+	include_dir='include'
+	include_dirpath="${cur_dirpath}/${include_dir}"
 
-	# -- Check the compiler version --------------------------------------------
+	# The name of the directory in which the BLAS test suite is kept.
+	blastest_dir='blastest'
 
-	# Initialize the blacklist to empty.
-	blacklist_init
+	# The name of the directory in which the BLIS test suite is kept.
+	testsuite_dir='testsuite'
 
-	# Check the compiler's version. Certain versions of certain compilers
-	# will preclude building certain sub-configurations, which are added
-	# to a blacklist. We also make note of certain version ranges that
-	# will be useful to know about later.
-	get_compiler_version
-	check_compiler
-	check_compiler_version_ranges
+	# -- Version-related --
 
-	# Now check the assembler's ability to assemble code. Older versions
-	# of binutils may not be aware of certain instruction sets. Those
-	# sub-configurations employing kernels that use such instruction sets
-	# will also be blacklisted.
-	get_binutils_version
-	check_assembler
+	# The file in which the version string is kept.
+	version_file="version"
+	version_filepath="${build_dirpath}/${version_file}"
 
-	# Check if there is any incompatibility due to the operating system.
-	check_os
+	# The shared library (.so) version file.
+	so_version_file='so_version'
+	so_version_filepath="${build_dirpath}/${so_version_file}"
 
-	# Remove duplicates and whitespace from the blacklist.
-	blacklist_cleanup
+	# The major and minor/build .so version numbers.
+	so_version_major=''
+	so_version_minorbuild=''
 
-	if [ -n "${config_blist}" ]; then
+	# -- configure options --
 
-		echo "${script_name}: configuration blacklist:"
-		echo "${script_name}:   ${config_blist}"
-	fi
+	# Define the default prefix so that the print_usage() function can
+	# output it in the --help text.
+	prefix_def='/usr/local'
 
+	# The installation prefix, assigned its default value, and a flag to
+	# track whether or not it was given by the user.
+	prefix=${prefix_def}
+	prefix_flag=''
 
-	# -- Find a C++ compiler ---------------------------------------------------
+	# The installation exec_prefix, assigned its default value, and a flag to
+	# track whether or not it was given by the user. Double-escaping the
+	# variable is necessary because it will pass through 'eval' twice.
+	# shellcheck disable=2016
+	exec_prefix='\\\${prefix}'
+	exec_prefix_flag=''
 
-	# Acquire the default C++ compiler search order. This will vary based on
-	# os_name.
-	cxx_search_list=$(get_cxx_search_list)
+	# The installation libdir, assigned its default value, and a flag to
+	# track whether or not it was given by the user. Double-escaping the
+	# variable is necessary because it will pass through 'eval' twice.
+	# shellcheck disable=2016
+	libdir='\\\${exec_prefix}/lib'
+	libdir_flag=''
 
-	# Select a C compiler from the default list, or from CC if it refers to a
-	# valid binary.
-	select_tool_w_env "${cxx_search_list}" "${CXX}" "CXX" \
-	                  "C++ compiler" "no" found_cxx
+	# The installation includedir, assigned its default value, and a flag to
+	# track whether or not it was given by the user. Double-escaping the
+	# variable is necessary because it will pass through 'eval' twice.
+	# shellcheck disable=2016
+	includedir='\\\${prefix}/include'
+	includedir_flag=''
 
+	# The installation sharedir, assigned its default value, and a flag to
+	# track whether or not it was given by the user. Double-escaping the
+	# variable is necessary because it will pass through 'eval' twice.
+	# shellcheck disable=2016
+	sharedir='\\\${prefix}/share'
+	sharedir_flag=''
 
-	# -- Find a Fortran compiler -----------------------------------------------
+	# The preset value of CFLAGS, CXXFLAGS, and LDFLAGS (ie: compiler and linker flags
+	# to use in addition to those determined by the build system).
+	cflags_preset=''
+	cxxflags_preset=''
+	ldflags_preset=''
 
-	# Acquire the default Fortran compiler search order.
-	fc_search_list=$(get_fc_search_list)
+	# The user-given debug type and a flag indicating it was given.
+	debug_type=''
+	debug_flag=''
+	enable_debug='no'
 
-	# Select a Fortran compiler from the default list, or from FC if it refers
-	# to a valid binary.
-	# NOTE: A Fortran compiler is not necessary for building BLIS. The only
-	# reason we might want to query it is to detect the style of returning
-	# complex values from functions. The 'gnu' style returns complex values
-	# from functions normally, via the C language return statement, while the
-	# 'intel' style returns them in a "hidden" parameter (inserted by the
-	# compiler) that precedes all other function parameters.
-	select_tool_w_env "${fc_search_list}" "${FC}" "FC" \
-	                  "Fortran compiler" "no" found_fc
+	# A flag indicating whether AddressSanitizer should be used.
+	enable_asan='no'
 
+	# The system flag.
+	enable_system='yes'
 
-	# -- Find a static library archiver ----------------------------------------
+	# The thread-local storage flag.
+	enable_tls='yes'
 
-	# Acquire the default archiver search order.
-	ar_search_list=$(get_ar_search_list)
+	# The threading flag.
+	threading_model='off'
 
-	# Select an archiver from the default list, or from AR if it refers
-	# to a valid binary.
-	select_tool_w_env "${ar_search_list}" "${AR}" "AR" \
-	                  "library archiver" "yes" found_ar
+	# The method of assigning micropanels to threads in the JR and IR loops.
+	thread_part_jrir='slab'
 
+	# Option variables.
+	quiet_flag=''
+	show_config_list=''
 
-	# -- Find an archive indexer -----------------------------------------------
+	# Additional flags.
+	enable_verbose='no'
+	enable_arg_max_hack='no'
+	enable_static='yes'
+	enable_shared='yes'
+	enable_rpath='no'
+	export_shared='public'
+	enable_pba_pools='yes'
+	enable_sba_pools='yes'
+	enable_mem_tracing='no'
+	int_type_size=0
+	blas_int_type_size=32
+	enable_blas='yes'
+	enable_cblas='no'
+	enable_sup_handling='yes'
+	enable_amd_frame_tweaks='no'
+	enable_memkind='' # The default memkind value is determined later on.
+	enable_trsm_preinversion='yes'
+	force_version='no'
+	complex_return='default'
 
-	# Acquire the default archive indexer search order.
-	ranlib_search_list=$(get_ranlib_search_list)
+	# The addon flag and names.
+	addon_flag=''
+	addon_list=''
 
-	# Select an archive indexer from the default list, or from RANLIB if it
-	# refers to a valid binary.
-	select_tool_w_env "${ranlib_search_list}" "${RANLIB}" "RANLIB" \
-	                  "archive indexer" "yes" found_ranlib
+	# The sandbox flag and name.
+	sandbox_flag=''
+	sandbox=''
 
+	# -- Configuration registry --
 
-	# -- Read the configuration registry ---------------------------------------
+	# The name of the chosen configuration (the configuration "family").
+	config_name=''
 
-	# Make sure the config registry file exists and can be opened.
-	if [ ! -f "${registry_filepath}" ]; then
+	# The list of sub-configurations associated with config_name.
+	config_list=''
 
-		echo "${script_name}: could not open '${registry_file}' file; cannot continue."
-		echo "${script_name}: BLIS distribution appears to be incomplete."
-		echo "${script_name}: *** Please verify source distribution."
+	# The list of all sub-configurations and configuration families.
+	full_config_list=''
+	full_subconfig_list=''
 
-		exit 1
-	fi
+	# The list of kernel sets that will be needed by the sub-configurations
+	# in config_list.
+	kernel_list=''
 
-	# Read the registered configuration names and lists into associative
-	# arrays.
-	echo -n "${script_name}: reading configuration registry..."
-	read_registry_file "${registry_filepath}"
-	echo "done."
+	# The list of all kernel sets.
+	full_kernel_list=''
 
-	# Report if additional configurations needed to be blacklisted.
-	# NOTE: This branch should never execute so long as indirect blacklisting
-	# is disabled. See comment regarding issue #214 in the definition of
-	# pass_config_kernel_registries().
-	if [ -n "${indirect_blist}" ]; then
-		echo "${script_name}: needed to indirectly blacklist additional configurations:"
-		echo "${script_name}:   ${indirect_blist}"
-	fi
+	# The list of kernel:sub-configuration pairs for all kernels contained
+	# in kernel_list.
+	kconfig_map=''
 
+	# -- Out-of-tree --
 
-	# -- Acquire the BLIS version ----------------------------------------------
+	# Whether we are building out-of-tree.
+	configured_oot="no"
 
-	# Set the 'version' variable to the default value (the 'git describe'
-	# augmented instance of whatever is in the 'version' file if this is a git
-	# clone, or whatever is in the 'version' file unmodified if it is a bare
-	# source release).
-	set_default_version "${version_filepath}"
+	# Dummy file. Used to check whether the cwd is the same as the top-level
+	# source distribution directory.
+	dummy_file='_blis_dir_detect.tmp'
 
-	# Initial message.
-	echo "${script_name}: starting configuration of BLIS ${version}."
+	# -- Debugging --
 
-	# Check if the user requested a custom version string.
-	if [[ ${force_version} = no ]]; then
-		echo "${script_name}: configuring with official version string."
-	else
-		echo "${script_name}: configuring with custom version string '${force_version}'."
-		version="${force_version}"
-	fi
+	# A global flag to help debug the compilation command for the executable
+	# that configure builds on-the-fly to perform hardware auto-detection.
+	debug_auto_detect="no"
 
 
-	# -- Acquire the shared library (.so) versions -----------------------------
 
-	# The first line of the 'so_version' file contains the .so major version.
-	so_version_major=$(sed -n "1p" < "${so_version_filepath}")
+	# -- Command line option/argument parsing ----------------------------------
 
-	# The second line contains the minor and build .so version numbers
-	# (separated by a '.').
-	so_version_minorbuild=$(sed -n "2p" < "${so_version_filepath}")
+	found=true
+	while [[ $found = true ]]; do
 
-	echo "${script_name}: found shared library .so version '${so_version_major}.${so_version_minorbuild}'."
-	echo "${script_name}:   .so major version: ${so_version_major}"
-	echo "${script_name}:   .so minor.build version: ${so_version_minorbuild}"
+		# Process our command line options.
+		unset OPTIND
+		while getopts ":hp:d:e:a:s:t:r:qci:b:-:" opt; do
+			case $opt in
+				-)
+					case "$OPTARG" in
 
+						help)
+							print_usage
+							;;
 
-	# -- Various pre-configuration checks --------------------------------------
+						quiet)
+							quiet_flag=1
+							;;
 
-	# Set config_name based on the number of arguments leftover (after command
-	# line option processing).
-	if [ $# = "0" ]; then
+						prefix=*)
+							prefix_flag=1
+							prefix=${OPTARG#*=}
+							;;
+						exec-prefix=*)
+							exec_prefix_flag=1
+							exec_prefix=${OPTARG#*=}
+							;;
+						libdir=*)
+							libdir_flag=1
+							libdir=${OPTARG#*=}
+							;;
+						includedir=*)
+							includedir_flag=1
+							includedir=${OPTARG#*=}
+							;;
+						sharedir=*)
+							sharedir_flag=1
+							sharedir=${OPTARG#*=}
+							;;
 
-		#configs_avail="auto "$(ls ${config_dirpath})
+						enable-debug)
+							debug_flag=1
+							debug_type=noopt
+							;;
+						enable-debug=*)
+							debug_flag=1
+							debug_type=${OPTARG#*=}
+							;;
+						disable-debug)
+							debug_flag=0
+							;;
 
-		echo "${script_name}: "
-		echo "${script_name}: *** No configuration given! ***"
-		echo "${script_name}: "
-		echo "${script_name}: Default configuration behavior is not implemented (for your"
-		echo "${script_name}: own safety). Please re-run '${script_name}' and specify one"
-		echo "${script_name}: of the existing configurations in the source distribution's"
-		echo "${script_name}  '${registry_file}' file:"
-		echo "${script_name}: "
-		#for k in "${!config_registry[@]}"; do
-		for cr_var in ${!config_registry_*}; do
+						enable-asan)
+							enable_asan='yes'
+							;;
+						disable-asan)
+							enable_asan='no'
+							;;
 
-			#v=${config_registry[$k]}
-			k=${cr_var##config_registry_}; v=${!cr_var}
+						enable-verbose-make)
+							enable_verbose='yes'
+							;;
+						disable-verbose-make)
+							enable_verbose='no'
+							;;
 
-			echo "${script_name}:   $k (${v})"
-		done
-		echo "${script_name}: "
+						enable-arg-max-hack)
+							enable_arg_max_hack='yes'
+							;;
+						disable-arg-max-hack)
+							enable_arg_max_hack='no'
+							;;
 
-		exit 1
+						enable-static)
+							enable_static='yes'
+							;;
+						disable-static)
+							enable_static='no'
+							;;
 
-	elif [ $# != "1" ]; then   # more than one configuration argument given.
+						enable-shared)
+							enable_shared='yes'
+							;;
+						disable-shared)
+							enable_shared='no'
+							;;
 
-		print_usage
+						enable-rpath)
+							enable_rpath='yes'
+							;;
+						disable-rpath)
+							enable_rpath='no'
+							;;
 
-	fi
+						export-shared=*)
+							export_shared=${OPTARG#*=}
+							;;
 
-	if [[ $1 = auto ]]; then
+						enable-system)
+							enable_system='yes'
+							;;
+						disable-system)
+							enable_system='no'
+							;;
 
-		echo "${script_name}: automatic configuration requested."
+						enable-tls)
+							enable_tls='yes'
+							;;
+						disable-tls)
+							enable_tls='no'
+							;;
 
-		# Call the auto_detect() function and save the returned string in
-		# config_name.
-		config_name=$(auto_detect)
+						enable-threading=*)
+							threading_model=${OPTARG#*=}
+							;;
+						disable-threading)
+							threading_model='single'
+							;;
 
-		# Debugging stuff. When confirming the behavior of auto_detect(),
-		# it is useful to output ${config_name}, which in theory could be
-		# set temoprarily to something other than the config_name, such as
-		# the compilation command.
-		if [ "${debug_auto_detect}" = "yes" ]; then
-			echo "auto-detect program compilation command: ${config_name}"
-			exit 1
-		fi
+						thread-part-jrir=*)
+							thread_part_jrir=${OPTARG#*=}
+							;;
 
-		echo "${script_name}: hardware detection driver returned '${config_name}'."
+						enable-pba-pools)
+							enable_pba_pools='yes'
+							;;
+						disable-pba-pools)
+							enable_pba_pools='no'
+							;;
 
-		# If the auto-detect code returned the "generic" string, it means we
-		# were unable to automatically detect the user's hardware type. While
-		# this is going to be a rare event, it will likely lead the user to
-		# experience much lower performance than expected, and thus we will
-		# warn them about it at the end of the configure output (to increase
-		# the chances that they see it).
-		if [ "${config_name}" = "generic" ]; then
+						enable-sba-pools)
+							enable_sba_pools='yes'
+							;;
+						disable-sba-pools)
+							enable_sba_pools='no'
+							;;
 
-			warn_user_generic=1
-		else
-			warn_user_generic=0
-		fi
-	else
+						enable-mem-tracing)
+							enable_mem_tracing='yes'
+							;;
+						disable-mem-tracing)
+							enable_mem_tracing='no'
+							;;
 
-		# Use the command line argument as the configuration name.
-		config_name=$1
+						enable-addon=*)
+							addon_flag=1
+							addon_name=${OPTARG#*=}
+							# Append the addon name to the list.
+							addon_list="${addon_list} ${addon_name}"
+							;;
+						disable-addon)
+							addon_flag=''
+							;;
+
+						enable-sandbox=*)
+							sandbox_flag=1
+							sandbox=${OPTARG#*=}
+							;;
+						disable-sandbox)
+							sandbox_flag=''
+							;;
+
+						int-size=*)
+							int_type_size=${OPTARG#*=}
+							;;
+
+						blas-int-size=*)
+							blas_int_type_size=${OPTARG#*=}
+							;;
+
+						enable-blas)
+							enable_blas='yes'
+							;;
+						disable-blas)
+							enable_blas='no'
+							;;
+
+						enable-cblas)
+							enable_cblas='yes'
+							;;
+						disable-cblas)
+							enable_cblas='no'
+							;;
+
+						sup)
+							enable_sup_handling='yes'
+							;;
+						enable-sup-handling)
+							enable_sup_handling='yes'
+							;;
+						nosup)
+							enable_sup_handling='no'
+							;;
+						disable-sup-handling)
+							enable_sup_handling='no'
+							;;
+
+						enable-amd-frame-tweaks)
+							enable_amd_frame_tweaks='yes'
+							;;
+						disable-amd-frame-tweaks)
+							enable_amd_frame_tweaks='no'
+							;;
+
+						with-memkind)
+							enable_memkind='yes'
+							;;
+						without-memkind)
+							enable_memkind='no'
+							;;
+
+						enable-trsm-preinversion)
+							enable_trsm_preinversion='yes'
+							;;
+						disable-trsm-preinversion)
+							enable_trsm_preinversion='no'
+							;;
+
+						force-version=*)
+							force_version=${OPTARG#*=}
+							;;
+
+						show-config-list)
+							show_config_list=1
+							;;
+
+						complex-return=*)
+							complex_return=${OPTARG#*=}
+							;;
+
+						*)
+							print_usage
+							;;
+					esac;;
+				h)
+					print_usage
+					;;
+				p)
+					prefix_flag=1
+					prefix=$OPTARG
+					;;
+				d)
+					debug_flag=1
+					debug_type=$OPTARG
+					;;
+				e)
+					export_shared=$OPTARG
+					;;
+				a)
+					addon_flag=1
+					addon_name=$OPTARG
+					# Append the addon name to the list.
+					addon_list="${addon_list} ${addon_name}"
+					;;
+				s)
+					sandbox_flag=1
+					sandbox=$OPTARG
+					;;
+				q)
+					quiet_flag=1
+					;;
+				t)
+					threading_model=$OPTARG
+					;;
+				r)
+					thread_part_jrir=$OPTARG
+					;;
+				i)
+					int_type_size=$OPTARG
+					;;
+				b)
+					blas_int_type_size=$OPTARG
+					;;
+				c)
+					show_config_list=1
+					;;
+				\?)
+					print_usage
+					;;
+			esac
+		done
+		shift $((OPTIND - 1))
+
+		# Parse environment variables
+		found=false
+		while [ $# -gt 0 ]; do
+			case $1 in
+				*=*)
+					var=$(expr "$1" : '\([^=]*\)=')
+					value=$(expr "$1" : '[^=]*=\(.*\)')
+					eval "export $var=\$value"
+					shift
+					found=true
+					;;
+				*)
+					break
+					;;
+			esac
+		done
+	done
+
+	check_build_tools
+
+	# -- Read the configuration registry ---------------------------------------
+
+	# Make sure the config registry file exists and can be opened.
+	if [ ! -f "${registry_filepath}" ]; then
+
+		echo "${script_name}: could not open '${registry_file}' file; cannot continue."
+		echo "${script_name}: BLIS distribution appears to be incomplete."
+		echo "${script_name}: *** Please verify source distribution."
+
+		exit 1
+	fi
+
+	# Read the registered configuration names and lists into associative
+	# arrays.
+	echo -n "${script_name}: reading configuration registry..."
+	read_registry_file "${registry_filepath}"
+	echo "done."
+
+	# Report if additional configurations needed to be blacklisted.
+	# NOTE: This branch should never execute so long as indirect blacklisting
+	# is disabled. See comment regarding issue #214 in the definition of
+	# pass_config_kernel_registries().
+	if [ -n "${indirect_blist}" ]; then
+		echo "${script_name}: needed to indirectly blacklist additional configurations:"
+		echo "${script_name}:   ${indirect_blist}"
+	fi
+
+
+	# -- Acquire the BLIS version ----------------------------------------------
+
+	# Set the 'version' variable to the default value (the 'git describe'
+	# augmented instance of whatever is in the 'version' file if this is a git
+	# clone, or whatever is in the 'version' file unmodified if it is a bare
+	# source release).
+	set_default_version "${version_filepath}"
+
+	# Initial message.
+	echo "${script_name}: starting configuration of BLIS ${version}."
+
+	# Check if the user requested a custom version string.
+	if [[ ${force_version} = no ]]; then
+		echo "${script_name}: configuring with official version string."
+	else
+		echo "${script_name}: configuring with custom version string '${force_version}'."
+		version="${force_version}"
+	fi
+
+
+	# -- Acquire the shared library (.so) versions -----------------------------
+
+	# The first line of the 'so_version' file contains the .so major version.
+	so_version_major=$(sed -n "1p" < "${so_version_filepath}")
+
+	# The second line contains the minor and build .so version numbers
+	# (separated by a '.').
+	so_version_minorbuild=$(sed -n "2p" < "${so_version_filepath}")
+
+	echo "${script_name}: found shared library .so version '${so_version_major}.${so_version_minorbuild}'."
+	echo "${script_name}:   .so major version: ${so_version_major}"
+	echo "${script_name}:   .so minor.build version: ${so_version_minorbuild}"
+
+
+	# -- Various pre-configuration checks --------------------------------------
+
+	# Set config_name based on the number of arguments leftover (after command
+	# line option processing).
+	if [ $# = "0" ]; then
+
+		#configs_avail="auto "$(ls ${config_dirpath})
+
+		echo "${script_name}: "
+		echo "${script_name}: *** No configuration given! ***"
+		echo "${script_name}: "
+		echo "${script_name}: Default configuration behavior is not implemented (for your"
+		echo "${script_name}: own safety). Please re-run '${script_name}' and specify one"
+		echo "${script_name}: of the existing configurations in the source distribution's"
+		echo "${script_name}  '${registry_file}' file:"
+		echo "${script_name}: "
+		#for k in "${!config_registry[@]}"; do
+		for cr_var in ${!config_registry_*}; do
+
+			#v=${config_registry[$k]}
+			k=${cr_var##config_registry_}; v=${!cr_var}
+
+			echo "${script_name}:   $k (${v})"
+		done
+		echo "${script_name}: "
+
+		exit 1
+
+	elif [ $# != "1" ]; then   # more than one configuration argument given.
+
+		print_usage
+
+	fi
+
+	if [[ $1 = auto ]]; then
+
+		echo "${script_name}: automatic configuration requested."
+
+		# Call the auto_detect() function and save the returned string in
+		# config_name.
+		config_name=$(auto_detect)
+
+		# Debugging stuff. When confirming the behavior of auto_detect(),
+		# it is useful to output ${config_name}, which in theory could be
+		# set temporarily to something other than the config_name, such as
+		# the compilation command.
+		if [ "${debug_auto_detect}" = "yes" ]; then
+			echo "auto-detect program compilation command: ${config_name}"
+			exit 1
+		fi
+
+		echo "${script_name}: hardware detection driver returned '${config_name}'."
+
+		# If the auto-detect code returned the "generic" string, it means we
+		# were unable to automatically detect the user's hardware type. While
+		# this is going to be a rare event, it will likely lead the user to
+		# experience much lower performance than expected, and thus we will
+		# warn them about it at the end of the configure output (to increase
+		# the chances that they see it).
+		if [ "${config_name}" = "generic" ]; then
+
+			warn_user_generic=1
+		else
+			warn_user_generic=0
+		fi
+	else
+
+		# Use the command line argument as the configuration name.
+		config_name=$1
+
+		echo "${script_name}: manual configuration requested; configuring with '${config_name}'."
+
+	fi
+
+	build_and_check_configurations
+
+	# In order to determine the default behavior of the --with[out]-memkind
+	# option, we try to detect whether libmemkind is available. If it is,
+	# the default implied option will be --with-memkind; otherwise, will be
+	# --without-memkind.
+	has_memkind=$(has_libmemkind)
+
+	# Try to determine whether the chosen compiler supports #pragma omp simd.
+	pragma_omp_simd=$(has_pragma_omp_simd)
+
+
+	# -- Prepare variables for substitution into template files ----------------
+
+	# Parse the status of the prefix option and echo feedback.
+	if [ -n "${prefix_flag}" ]; then
+		echo "${script_name}: detected --prefix='${prefix}'."
+	else
+		echo "${script_name}: no install prefix option given; defaulting to '${prefix}'."
+	fi
+
+	# Parse the status of the exec_prefix option and echo feedback.
+	if [ -n "${exec_prefix_flag}" ]; then
+		echo "${script_name}: detected --exec-prefix='${exec_prefix}'."
+	else
+		echo "${script_name}: no install exec_prefix option given; defaulting to PREFIX."
+	fi
+
+	# Parse the status of the libdir option and echo feedback.
+	if [ -n "${libdir_flag}" ]; then
+		echo "${script_name}: detected --libdir='${libdir}'."
+	else
+		echo "${script_name}: no install libdir option given; defaulting to EXECPREFIX/lib."
+	fi
+
+	# Parse the status of the includedir option and echo feedback.
+	if [ -n "${includedir_flag}" ]; then
+		echo "${script_name}: detected --includedir='${includedir}'."
+	else
+		echo "${script_name}: no install includedir option given; defaulting to PREFIX/include."
+	fi
+
+	# Parse the status of the sharedir option and echo feedback.
+	if [ -n "${sharedir_flag}" ]; then
+		echo "${script_name}: detected --sharedir='${sharedir}'."
+	else
+		echo "${script_name}: no install sharedir option given; defaulting to PREFIX/share."
+	fi
+
+	# Echo the installation directories that we settled on.
+	echo "${script_name}: final installation directories:"
+	echo "${script_name}:   prefix:      $(fully_eval "${prefix}")"
+	echo "${script_name}:   exec_prefix: $(fully_eval "${exec_prefix}")"
+	echo "${script_name}:   libdir:      $(fully_eval "${libdir}")"
+	echo "${script_name}:   includedir:  $(fully_eval "${includedir}")"
+	echo "${script_name}:   sharedir:    $(fully_eval "${sharedir}")"
+	echo "${script_name}: NOTE: the variables above can be overridden when running make."
+
+	# Check if CFLAGS is non-empty.
+	if [ -n "${CFLAGS}" ]; then
+		cflags_preset="${CFLAGS}"
+		echo "${script_name}: detected preset CFLAGS; prepending:"
+		echo "${script_name}:   ${cflags_preset}"
+	else
+		cflags_preset=''
+		echo "${script_name}: no preset CFLAGS detected."
+	fi
+
+	# Check if CXXFLAGS is non-empty.
+	if [ -n "${CXXFLAGS}" ]; then
+		cxxflags_preset="${CXXFLAGS}"
+		echo "${script_name}: detected preset CXXFLAGS; prepending:"
+		echo "${script_name}:   ${cxxflags_preset}"
+	else
+		cxxflags_preset=''
+		echo "${script_name}: no preset CXXFLAGS detected."
+	fi
+
+	# Check if LDFLAGS is non-empty.
+	if [ -n "${LDFLAGS}" ]; then
+		ldflags_preset="${LDFLAGS}"
+		echo "${script_name}: detected preset LDFLAGS; prepending:"
+		echo "${script_name}:   ${ldflags_preset}"
+	else
+		ldflags_preset=''
+		echo "${script_name}: no preset LDFLAGS detected."
+	fi
+
+	# Check if the verbose make flag was specified.
+	if [[ ${enable_verbose} = yes ]]; then
+		echo "${script_name}: enabling verbose make output. (disable with 'make V=0'.)"
+	else
+		echo "${script_name}: disabling verbose make output. (enable with 'make V=1'.)"
+	fi
+
+	# Check if the ARG_MAX hack was requested.
+	if [[ ${enable_arg_max_hack} = yes ]]; then
+		echo "${script_name}: enabling ARG_MAX hack."
+	else
+		echo "${script_name}: disabling ARG_MAX hack."
+	fi
+
+	# Check if the debug flag was specified.
+	if [[ -n ${debug_flag} ]]; then
+		if [[ ${debug_type} = opt ]]; then
+			echo "${script_name}: enabling debug symbols with optimizations."
+		elif [[ ${debug_type} = sde ]]; then
+			debug_type='sde'
+			echo "${script_name}: enabling SDE processor emulation."
+		else
+			debug_type='noopt'
+			echo "${script_name}: enabling debug symbols; optimizations disabled."
+		fi
+		enable_debug='yes'
+	else
+		debug_type='off'
+		enable_debug='no'
+		echo "${script_name}: debug symbols disabled."
+	fi
+
+	# Check if the AddressSanitizer flag was specified.
+	if [[ ${enable_asan} = yes ]]; then
+		echo "${script_name}: enabling AddressSanitizer support (except for optimized kernels)."
+	else
+		enable_asan='no'
+		echo "${script_name}: AddressSanitizer support disabled."
+	fi
+
+	# Check if the static lib flag was specified.
+	if   [[ ${enable_static} = yes && ${enable_shared} = yes ]]; then
+		echo "${script_name}: building BLIS as both static and shared libraries."
+		enable_shared_01=1
+	elif [[ ${enable_static} = no && ${enable_shared} = yes ]]; then
+		echo "${script_name}: building BLIS as a shared library (static library disabled)."
+		enable_shared_01=1
+	elif [[ ${enable_static} = yes && ${enable_shared} = no ]]; then
+		echo "${script_name}: building BLIS as a static library (shared library disabled)."
+		enable_shared_01=0
+	else
+		echo "${script_name}: Both static and shared libraries were disabled."
+		echo "${script_name}: *** Please enable one (or both) to continue."
+		exit 1
+	fi
+
+	# Check if the "export shared" flag was specified.
+	if [[ ${export_shared} = all ]]; then
+		if [[ ${enable_shared} = yes ]]; then
+			echo "${script_name}: exporting all symbols within shared library."
+		else
+			echo "${script_name}: ignoring request to export all symbols within shared library."
+		fi
+	elif [[ ${export_shared} = public ]]; then
+		if [[ ${enable_shared} = yes ]]; then
+			echo "${script_name}: exporting only public symbols within shared library."
+		fi
+	else
+		echo "${script_name}: *** Invalid argument '${export_shared}' to --export-shared option given."
+		echo "${script_name}: *** Please use 'public' or 'all'."
+		exit 1
+	fi
+
+	# Check if we are building with or without operating system support.
+	if [[ ${enable_system} = yes ]]; then
+		echo "${script_name}: enabling operating system support."
+		enable_system_01=1
+	else
+		echo "${script_name}: disabling operating system support."
+		echo "${script_name}: WARNING: disabling OS support forcibly disables all threading!"
+		enable_system_01=0
+
+		# Force threading to be disabled.
+		threading_model='off'
+	fi
+
+	# Check if we are building with or without thread-local storage support.
+	if [[ ${enable_tls} = yes ]]; then
+		echo "${script_name}: enabling thread-local storage (TLS) support."
+		enable_tls_01=1
+	else
+		echo "${script_name}: disabling thread-local storage (TLS) support."
+		echo "${script_name}: WARNING: THIS IS DANGEROUS! Disabling TLS may cause race conditions!"
+		echo "${script_name}: WARNING: Please try --disable-threading if you suspect any correctness"
+		echo "${script_name}: WARNING: or deadlock issues."
+		enable_tls_01=0
+	fi
+
+	# Check the threading model flag and standardize its value, if needed.
+	# Note that single-threaded mode will always be enabled, but not necessarily
+	# by default.
+	enable_single='yes'
+	enable_openmp='no'
+	enable_pthreads='no'
+	enable_hpx='no'
+	enable_single_01=1
+	enable_openmp_01=0
+	enable_pthreads_01=0
+	enable_hpx_01=0
+	parsed_tm=''
+	first_tm=''
+	enable_single_as_def_01=0
+	enable_openmp_as_def_01=0
+	enable_pthreads_as_def_01=0
+	enable_hpx_as_def_01=0
+
+	# Convert whatever reasonable separator the user may have used into a space.
+	threading_model_list=$(echo "${threading_model}" | sed -e "s/[,+]/ /g")
+
+	# Search for all recognized values and standardize them to one of five
+	# strings: 'single', 'openmp', 'pthreads', 'hpx', 'auto'. Notice that we
+	# keep the strings in the same order as they originally appeared.
+	for word in ${threading_model_list}; do
+
+		if [[ ${word} = single ]] ||
+		   [[ ${word} = none   ]] ||
+		   [[ ${word} = off    ]] ||
+		   [[ ${word} = no     ]]; then
+
+			parsed_tm="${parsed_tm} single"
+
+		elif [[ ${word} = openmp ]] ||
+			 [[ ${word} = omp    ]]; then
+
+			parsed_tm="${parsed_tm} openmp"
+
+		elif [[ ${word} = pthreads ]] ||
+			 [[ ${word} = pthread  ]] ||
+			 [[ ${word} = posix    ]]; then
+
+			parsed_tm="${parsed_tm} pthreads"
+
+		elif [[ ${word} = hpx ]]; then
+
+			parsed_tm="${parsed_tm} hpx"
+
+		elif [[ ${word} = auto ]]; then
+
+			parsed_tm="${parsed_tm} auto"
+
+		else
+
+			echo "${script_name}: *** Unsupported threading model: ${word}."
+			exit 1
+		fi
+	done
+
+	# Always enable single-threaded behavior. If the user explicitly
+	# requested 'single' as well as other modes, the first occurrence will
+	# be kept when duplicates are removed, which will preserve the order
+	# for purposes of determining which mode will be the default (absent
+	# any explicit choice at runtime).
+	parsed_tm="${parsed_tm} single"
+
+	# Remove duplicates, if they exist.
+	parsed_tm=$(rm_duplicate_words_simple "${parsed_tm}")
+
+	#echo "parsed_tm0: _${parsed_tm}_"
+
+	# If parsed_tm contains 'auto', substitute in the automatic choice
+	# based on which compiler family is being used.
+	if [ "$(is_in_list "auto" "${parsed_tm}")" = "true" ]; then
+
+		# If 'auto' was found in the threading model string, we pick a
+		# concrete threading model based on the compiler vendor and then
+		# substitute it for 'auto' in parsed_tm (see below). Note that no
+		# model is selected here for vendors other than gcc, icc, and clang.
+		echo "${script_name}: determining the threading model automatically."
+
+		# Use OpenMP for gcc and icc, but pthreads for clang.
+		if   [ "${cc_vendor}" = "gcc" ]; then
+
+			selected_tm="openmp"
+			echo "${script_name}:   automatically selected OpenMP."
+
+		elif [ "${cc_vendor}" = "icc" ]; then
+
+			selected_tm="openmp"
+			echo "${script_name}:   automatically selected OpenMP."
+
+		elif [ "${cc_vendor}" = "clang" ]; then
+
+			selected_tm="pthreads"
+			echo "${script_name}:   automatically selected pthreads."
+		fi
+
+		# Substitute the selected threading model for 'auto' in parsed_tm.
+		parsed_tm=$(substitute_words "auto" "${selected_tm}" "${parsed_tm}")
+	fi
+
+	#echo "parsed_tm1: _${parsed_tm}_"
+
+	# Remove any extra whitespace.
+	parsed_tm=$(canonicalize_ws "${parsed_tm}")
+
+	#echo "parsed_tm2: _${parsed_tm}_"
+
+	# Find the first word. This will be the default threading model.
+	first_tm=${parsed_tm%% *}
+
+	#echo "first_tm0:  _${first_tm}_"
+
+	# Now that we've standardized the list, removed duplicates, and handled
+	# the possibility of 'auto' being among the listed threading models, we can
+	# proceed to formally processing each threading model to enable. Since
+	# 'auto' has been converted to 'openmp' or 'pthreads', we only need to
+	# handle the remaining four options (openmp, pthreads, hpx, and single)
+	# going forward.
+	for word in ${parsed_tm}; do
+
+		if [[ ${word} = single ]]; then
+
+			echo "${script_name}: enabling support for single-threading."
+			enable_single='yes'
+			enable_single_01=1
+
+		elif [[ ${word} = openmp ]]; then
+
+			echo "${script_name}: enabling support for threading via OpenMP."
+			enable_openmp='yes'
+			enable_openmp_01=1
+
+		elif [[ ${word} = pthreads ]]; then
+
+			echo "${script_name}: enabling support for threading via pthreads."
+			enable_pthreads='yes'
+			enable_pthreads_01=1
+
+		elif [[ ${word} = hpx ]]; then
+
+			echo "${script_name}: enabling support for threading via HPX."
+			enable_hpx='yes'
+			enable_hpx_01=1
+
+		fi
+
+	done
+
+	# Define boolean variables that can easily be interpreted with #ifdef
+	# directives.
+	if [[ ${first_tm} = single ]]; then
+
+		enable_single_as_def_01=1
+		enable_openmp_as_def_01=0
+		enable_pthreads_as_def_01=0
+		enable_hpx_as_def_01=0
+
+	elif [[ ${first_tm} = openmp ]]; then
+
+		enable_single_as_def_01=0
+		enable_openmp_as_def_01=1
+		enable_pthreads_as_def_01=0
+		enable_hpx_as_def_01=0
+
+	elif [[ ${first_tm} = pthreads ]]; then
+
+		enable_single_as_def_01=0
+		enable_openmp_as_def_01=0
+		enable_pthreads_as_def_01=1
+		enable_hpx_as_def_01=0
+
+	elif [[ ${first_tm} = hpx ]]; then
+
+		enable_single_as_def_01=0
+		enable_openmp_as_def_01=0
+		enable_pthreads_as_def_01=0
+		enable_hpx_as_def_01=1
+
+	fi
+
+	# If OpenMP, pthreads, or HPX was enabled, given that single-threaded mode is
+	# also always enabled, remind the user which one will serve as the default
+	# (that is, absent any explicit choice at runtime).
+	if [[ ${enable_openmp}   = yes ]] ||
+	   [[ ${enable_pthreads} = yes ]] ||
+	   [[ ${enable_hpx}      = yes ]]; then
+
+		if   [[ ${first_tm}   = single ]]; then
+			echo "${script_name}: threading will default to single-threaded."
+		elif [[ ${first_tm}   = openmp ]]; then
+			echo "${script_name}: threading will default to OpenMP."
+		elif [[ ${first_tm}   = pthreads ]]; then
+			echo "${script_name}: threading will default to pthreads."
+		elif [[ ${first_tm}   = hpx ]]; then
+			echo "${script_name}: threading will default to HPX."
+		fi
+	fi
+
+	# Copy the final parsed threading model list back to the original variable.
+	threading_model="${parsed_tm}"
+
+	#echo "parsed_tm: _${parsed_tm}_"
+	#echo "first_tm:  _${first_tm}_"
+
+	# Check the method of assigning micropanels to threads in the JR and IR
+	# loops.
+	enable_jrir_rr_01=0
+	enable_jrir_slab_01=0
+	enable_jrir_tlb_01=0
+	if   [[ ${thread_part_jrir} = rr ]]; then
+		echo "${script_name}: requesting round-robin (rr) work partitioning in jr and/or ir loops."
+		enable_jrir_rr_01=1
+	elif [[ ${thread_part_jrir} = slab ]]; then
+		echo "${script_name}: requesting slab work partitioning in jr and/or ir loops."
+		enable_jrir_slab_01=1
+	elif [[ ${thread_part_jrir} = tlb ]]; then
+		echo "${script_name}: requesting tile-level load balancing (tlb) in unified jr+ir loop."
+		enable_jrir_tlb_01=1
+	else
+		echo "${script_name}: *** Unsupported method of work partitioning in jr/ir loops: ${thread_part_jrir}."
+		exit 1
+	fi
+
+	# Convert 'yes' and 'no' flags to booleans.
+	if [[ ${enable_pba_pools} = yes ]]; then
+		echo "${script_name}: internal memory pools for packing blocks are enabled."
+		enable_pba_pools_01=1
+	else
+		echo "${script_name}: internal memory pools for packing blocks are disabled."
+		enable_pba_pools_01=0
+	fi
+	if [[ ${enable_sba_pools} = yes ]]; then
+		echo "${script_name}: internal memory pools for small blocks are enabled."
+		enable_sba_pools_01=1
+	else
+		echo "${script_name}: internal memory pools for small blocks are disabled."
+		enable_sba_pools_01=0
+	fi
+	if [[ ${enable_mem_tracing} = yes ]]; then
+		echo "${script_name}: memory tracing output is enabled."
+		enable_mem_tracing_01=1
+	else
+		echo "${script_name}: memory tracing output is disabled."
+		enable_mem_tracing_01=0
+	fi
+	if [[ ${has_memkind} = yes ]]; then
+		if [[ -z ${enable_memkind} ]]; then
+			# If no explicit option was given for libmemkind one way or the other,
+			# we use the value returned previously by has_libmemkind(), in this
+			# case "yes", to determine the default.
+			echo "${script_name}: libmemkind found; default is to enable use."
+			enable_memkind="yes"
+			enable_memkind_01=1
+		else
+			if [[ ${enable_memkind} = yes ]]; then
+				echo "${script_name}: received explicit request to enable libmemkind."
+				enable_memkind="yes"
+				enable_memkind_01=1
+			else
+				echo "${script_name}: received explicit request to disable libmemkind."
+				enable_memkind="no"
+				enable_memkind_01=0
+			fi
+		fi
+	else
+		echo "${script_name}: libmemkind not found; disabling."
+		if [[ ${enable_memkind} = yes ]]; then
+			echo "${script_name}: cannot honor explicit request to enable libmemkind."
+		fi
+		enable_memkind="no"
+		enable_memkind_01=0
+	fi
+	if [[ ${pragma_omp_simd} = yes ]]; then
+		echo "${script_name}: compiler appears to support #pragma omp simd."
+		enable_pragma_omp_simd_01=1
+	else
+		echo "${script_name}: compiler appears to not support #pragma omp simd."
+		enable_pragma_omp_simd_01=0
+	fi
+	if [[ ${enable_blas} = yes ]]; then
+		echo "${script_name}: the BLAS compatibility layer is enabled."
+		enable_blas_01=1
+	else
+		echo "${script_name}: the BLAS compatibility layer is disabled."
+		enable_blas_01=0
+	fi
+	if [[ ${enable_cblas} = yes ]]; then
+		echo "${script_name}: the CBLAS compatibility layer is enabled."
+		enable_cblas_01=1
+		# Force BLAS layer when CBLAS is enabled
+		enable_blas='yes'
+	else
+		echo "${script_name}: the CBLAS compatibility layer is disabled."
+		enable_cblas_01=0
+	fi
+	if [[ ${enable_sup_handling} = yes ]]; then
+		echo "${script_name}: sup (skinny/unpacked) matrix handling is enabled."
+		enable_sup_handling_01=1
+	else
+		echo "${script_name}: sup (skinny/unpacked) matrix handling is disabled."
+		enable_sup_handling_01=0
+	fi
+	if [[ ${enable_trsm_preinversion} = yes ]]; then
+		echo "${script_name}: trsm diagonal element pre-inversion is enabled."
+		enable_trsm_preinversion_01=1
+	else
+		echo "${script_name}: trsm diagonal element pre-inversion is disabled."
+		enable_trsm_preinversion_01=0
+	fi
+
+	# Report integer sizes.
+	if [[ ${int_type_size} = 32 ]]; then
+		echo "${script_name}: the BLIS API integer size is 32-bit."
+	elif [[ ${int_type_size} = 64 ]]; then
+		echo "${script_name}: the BLIS API integer size is 64-bit."
+	else
+		echo "${script_name}: the BLIS API integer size is automatically determined."
+	fi
+	if [[ ${blas_int_type_size} = 32 ]]; then
+		echo "${script_name}: the BLAS/CBLAS API integer size is 32-bit."
+	elif [[ ${blas_int_type_size} = 64 ]]; then
+		echo "${script_name}: the BLAS/CBLAS API integer size is 64-bit."
+	else
+		echo "${script_name}: the BLAS/CBLAS API integer size is automatically determined."
+	fi
+
+	# Disallow the simultaneous use of 64-bit integers in the BLAS and
+	# 32-bit integers in BLIS.
+	if [[ ${blas_int_type_size} = 64 && ${int_type_size} = 32 ]]; then
+		echo "${script_name}: *** To avoid the possibility of truncation, we do not allow use of 64-bit integers in the BLAS API with 32-bit integers in BLIS. Please use a different configuration of integers."
+		exit 1
+	fi
+
+	# Check whether we should use AMD-customized versions of certain framework
+	# files.
+	if [[ ${enable_amd_frame_tweaks} = yes ]]; then
+
+		echo "${script_name}: AMD-specific framework files will be considered."
+		echo "${script_name}:   checking eligibility of target configuration."
+
+		# Make sure we are targeting either one of the zen subconfigs or the
+		# amd64 umbrella family.
+		if [[ ${config_name} != *zen* && ${config_name} != *amd64* ]]; then
+			echo "${script_name}:   target configuration '${config_name}' is not eligible."
+			echo "${script_name}:   disabling AMD-specific framework files."
+			enable_amd_frame_tweaks='no'
+		else
+			echo "${script_name}:   target configuration '${config_name}' is eligible."
+			echo "${script_name}:   enabling AMD-specific framework files."
+		fi
+	else
+		echo "${script_name}: AMD-specific framework files will not be considered."
+	fi
+
+	# Check if addons were given.
+	if [ -n "${addon_flag}" ]; then
+
+		# Remove duplicates in the addon list, if they exist.
+		addon_list=$(rm_duplicate_words_simple "${addon_list}")
+
+		echo "${script_name}: configuring with addons:"
+
+		for addon in ${addon_list}; do
+
+			echo "${script_name}:   ${addon_dir}/${addon}"
+
+			addon_fullpath="${addon_dirpath}/${addon}"
+
+			if [ ! -d "${addon_fullpath}" ]; then
+				echo "${script_name}: requested addon sub-directory does not exist! Cannot continue."
+				echo "${script_name}: *** Please verify addon existence and name."
+				exit 1
+			fi
+		done
 
-		echo "${script_name}: manual configuration requested; configuring with '${config_name}'."
+		enable_addons_01=1
+	else
+		echo "${script_name}: configuring with no addons."
 
+		enable_addons_01=0
 	fi
 
-	# Use the selected config name to look up the list of configurations
-	# and kernels associated with that name.
-	#config_list=${config_registry[${config_name}]}
-	#kernel_list=${kernel_registry[${config_name}]}
-	config_list=$(query_array "config_registry" "${config_name}")
-	kernel_list=$(query_array "kernel_registry" "${config_name}")
+	# Check if a sandbox was given.
+	if [ -n "${sandbox_flag}" ]; then
 
-	# Use the config_registry and kernel_registry to build a kconfig_registry
-	# for the selected config_name.
-	build_kconfig_registry "${config_name}"
+		#sandbox_relpath="${sandbox_dir}/${sandbox}"
 
-	# Print the configuration list and kernel list, if requested.
-	if [ "${show_config_list}" == "1" ]; then
+		echo "${script_name}: configuring for alternate gemm implementation:"
+		echo "${script_name}:   ${sandbox_dir}/${sandbox}"
 
-		echo "${script_name}: configuration list:"
-		#for k in "${!config_registry[@]}"; do
-		for cr_var in ${!config_registry_*}; do
+		sandbox_fullpath="${sandbox_dirpath}/${sandbox}"
 
-			#v=${config_registry[$k]}
-			k=${cr_var##config_registry_}; v=${!cr_var}
+		if [ ! -d "${sandbox_fullpath}" ]; then
+			echo "${script_name}: requested sandbox sub-directory does not exist! Cannot continue."
+			echo "${script_name}: *** Please verify sandbox existence and name."
+			exit 1
+		fi
 
-			echo "${script_name}:   $k: ${v}"
-		done
+		enable_sandbox_01=1
+	else
+		echo "${script_name}: configuring for conventional gemm implementation."
 
-		echo "${script_name}: kernel list:"
-		#for k in "${!kernel_registry[@]}"; do
-		for kr_var in ${!kernel_registry_*}; do
+		enable_sandbox_01=0
+	fi
 
-			#v=${kernel_registry[$k]}
-			k=${kr_var##kernel_registry_}; v=${!kr_var}
+	# Check the method used for returning complex numbers.
+	if [[ ${complex_return} = default ]]; then
 
-			echo "${script_name}:   $k: ${v}"
-		done
+		# If we previously found a Fortran compiler, let's query it to see what
+		# kind of complex return type it uses (gnu or intel). The 'gnu' style
+		# returns complex values from functions normally, via the C language
+		# return statement, while the 'intel' style returns them in a "hidden"
+		# parameter (inserted by the compiler) that precedes all other function
+		# parameters.
+		if [ -n "${found_fc}" ]; then
 
-		echo "${script_name}: kernel-to-config map for '${config_name}':"
-		#for k in "${!kconfig_registry[@]}"; do
-		for kc_var in ${!kconfig_registry_*}; do
+			# Query the full vendor version string output. This includes the
+			# version number along with (potentially) a bunch of other textual
+			# clutter.
+			# NOTE: This maybe should use merged stdout/stderr rather than only
+			# stdout. But it works for now.
+			vendor_string="$(${found_fc} --version 2>/dev/null || :)"
 
-			#v=${kconfig_registry[$k]}
-			k=${kc_var##kconfig_registry_}; v=${!kc_var}
+			# Query the compiler "vendor" (ie: the compiler's simple name).
+			# The last part ({ read first rest ; echo $first ; }) is a workaround
+			# to OS X's egrep only returning the first match.
+			fc_vendor=$(echo "${vendor_string}" | grep -oE 'IFORT|GNU' |
+			            { read -r first rest ; echo "${first}"; })
 
-			echo "${script_name}:   $k: ${v}"
-		done
+			if [[ ${fc_vendor} = IFORT ]]; then
+				complex_return='intel'
+			elif [[ ${fc_vendor} = GNU ]]; then
+				complex_return='gnu'
+			else
+				echo "${script_name}: unable to determine Fortran compiler vendor!"
+				complex_return='gnu'
+			fi
+		else
+			complex_return='gnu'
+		fi
 	fi
 
-	# For each kernel in the kernel list, reduce the list of associated
-	# sub-configurations (in the kconfig_registry) to a singleton using
-	# the following rules:
-	# 1. If the list is a singleton, use that name.
-	# 2. If the list contains a sub-configuration name that matches the
-	#    kernel name, use that name.
-	# 3. Otherwise, use the first name in the list.
-	# We use the chosen singleton to ceate a "kernel:subconfig" pair, which
-	# we accumulate into a list. This list is the kernel-to-config map, or
-	# kconfig_map.
+	if [[ ${complex_return} = gnu ]]; then
+		complex_return_intel01='0'
+	elif [[ ${complex_return} = intel ]]; then
+		complex_return_intel01='1'
+	else
+		echo "${script_name}: unknown complex return type \"${complex_return}\"! Cannot continue."
+		echo "${script_name}: *** Acceptable values are \"gnu\" and \"intel\"."
+		exit 1
+	fi
 
-	# We use a sorted version of kernel_list so that it ends up matching the
-	# display order of the kconfig_registry above.
-	# shellcheck disable=2086
-	kernel_list_sort=$(echo ${kernel_list} | xargs -n1 | sort -u)
+	echo "${script_name}: configuring complex return type as \"${complex_return}\"."
 
-	kconfig_map=""
-	for kernel in ${kernel_list_sort}; do
+	# Set a default value and friendlier name for LIBPTHREAD
+	libpthread="${LIBPTHREAD--lpthread}"
 
-		#configs="${kconfig_registry[$kernel]}"
-		configs=$(query_array "kconfig_registry" "${kernel}")
+	# For Windows builds, clear the libpthread variable so that
+	# no pthreads library is substituted into config.mk. (Windows builds
+	# employ an implementation of pthreads that is internal to BLIS.)
+	if [[ "$is_win" == "yes" && "$cc_vendor" == "clang" ]]; then
+		libpthread=
+	fi
 
-		has_one_kernel=$(is_singleton "${configs}")
-		contains_kernel=$(is_in_list "${kernel}" "${configs}")
+	# We also clear the libpthread variable for systemless builds
+	# (--disable-system).
+	if [[ "$enable_system" == "no" ]]; then
+		libpthread=
+	fi
 
-		# Check if the list is a singleton.
-		if [ "${has_one_kernel}" == "true" ]; then
+	# Create a #define for the configuration family (config_name).
+	uconf=$(echo "${config_name}" | tr '[:lower:]' '[:upper:]')
+	config_name_define="#define BLIS_FAMILY_${uconf}\n"
 
-			reducedclist="${configs}"
+	# Create a list of #defines, one for each configuration in config_list.
+	config_list_defines=""
+	for conf in ${config_list}; do
 
-		# Check if the list contains a sub-config name that matches the kernel.
-		elif [ "${contains_kernel}" == "true" ]; then
+		# Convert the current config name to uppercase.
+		uconf=$(echo "${conf}" | tr '[:lower:]' '[:upper:]')
 
-			reducedclist="${kernel}"
+		# Create a #define and add it to the running list.
+		config_define="BLIS_CONFIG_${uconf}"
+		config_list_defines="${config_list_defines}#define ${config_define}\n"
+	done
 
-		# Otherwise, use the last name.
-		else
+	# Create a list of #defines, one for each kernel set in kernel_list.
+	kernel_list_defines=""
+	for kern in ${kernel_list}; do
 
-			last_config=${configs##* }
-			reducedclist="${last_config}"
-		fi
+		# Convert the current kernel name to uppercase.
+		uconf=$(echo "${kern}" | tr '[:lower:]' '[:upper:]')
 
-		# Create a new "kernel:subconfig" pair and add it to the kconfig_map
-		# list, removing whitespace.
-		new_pair="${kernel}:${reducedclist}"
-		kconfig_map=$(canonicalize_ws "${kconfig_map} ${new_pair}")
+		# Create a #define and add it to the running list.
+		kernel_define="BLIS_KERNELS_${uconf}"
+		kernel_list_defines="${kernel_list_defines}#define ${kernel_define}\n"
 	done
 
-	if [ "${show_config_list}" == "1" ]; then
-
-		echo "${script_name}: kernel-to-config map for '${config_name}' (chosen pairs):"
-		for k in ${kconfig_map}; do
-			echo "${script_name}:   $k"
-		done
-	fi
+	# Create a list of #includes, one for each addon in addon_list.
+	addon_list_includes=""
+	for addon in ${addon_list}; do
 
+		# Create a #include directive and add it to the running list.
+		addon_header="\"${addon}.h\""
+		addon_list_includes="${addon_list_includes}#include ${addon_header}\n"
+	done
 
-	echo "${script_name}: checking configuration against contents of '${registry_file}'."
 
-	# First, ensure that the config name is registered (ie: it is present
-	# in the config_registry file).
-	if [ -z "${config_list}" ]; then
+	# -- Determine whether we are performing an out-of-tree build --------------
 
-		# NOTE: This branch should never execute when using auto-detection,
-		# but we have it here just in case.
-		if [[ $1 = auto ]]; then
+	if [ "${dist_path}" != "./" ]; then
 
-			echo "${script_name}: 'auto-detected configuration '${config_name}' is NOT registered!"
-			echo "${script_name}: "
-			echo "${script_name}: *** Cannot continue with unregistered configuration '${config_name}'. ***"
-			echo "${script_name}: "
-			exit 1;
+		# At this point, we know the user did not run "./configure". But we
+		# have not yet ruled out "<fullpath>/configure" or some equivalent
+		# that uses relative paths. To further rule out these possibilities,
+		# we create a dummy file in the current build directory.
+		touch "./${dummy_file}"
 
+		# If the dummy file we just created in the current directory does not
+		# appear in the source distribution path, then we are in a different
+		# directory and thus we must create a symbolic link.
+		if [ ! -f "${dist_path}/${dummy_file}" ]; then
+			configured_oot="yes"
+			#echo "${script_name}: detected out-of-tree build directory."
 		else
+			configured_oot="no"
+			#echo "${script_name}: detected in-tree build directory."
+		fi
 
-			# At this point, we know: (a) config_list is empty; and (b) the user
-			# requested manual configuration. If the config_name given by the
-			# user is present in the configuration blacklist (config_blist),
-			# then we can deduce why the config_list is empty: because the only
-			# subconfig implied by config_name is blacklisted. Thus, we cannot
-			# proceed.
+		# Remove the dummy file.
+		rm -f "./${dummy_file}"
+	fi
 
-			if [[ $(is_in_list "${config_name}" "${config_blist}") = true ]]; then
 
-				echo "${script_name}: 'user-specified configuration '${config_name}' is blacklisted!"
-				echo "${script_name}: "
-				echo "${script_name}: *** Cannot continue with blacklisted configuration '${config_name}'. ***"
-				echo "${script_name}: *** Try updating your compiler and/or assembler (binutils) versions. ***"
-				echo "${script_name}: "
-				exit 1;
-			else
+	# -- Instantiate configuration files from templates ------------------------
+
+	add_config_var version
+	add_config_var so_version_major
+	add_config_var so_version_minorbuild
+	add_config_var config_name
+	add_config_var config_list
+	add_config_var kernel_list
+	add_config_var full_config_list
+	add_config_var full_subconfig_list
+	add_config_var full_kernel_list
+	add_config_var kconfig_map
+	add_config_var os_name
+	add_config_var is_win
+	add_config_var is_msvc
+	add_config_var dist_path
+	add_config_var CC_VENDOR                 cc_vendor
+	add_config_var gcc_older_than_4_9_0
+	add_config_var gcc_older_than_6_1_0
+	add_config_var gcc_older_than_9_1_0
+	add_config_var gcc_older_than_10_3_0
+	add_config_var clang_older_than_9_0_0
+	add_config_var clang_older_than_12_0_0
+	add_config_var aocc_older_than_2_0_0
+	add_config_var aocc_older_than_3_0_0
+	add_config_var CC                        found_cc
+	add_config_var CXX                       found_cxx
+	add_config_var FC                        found_fc
+	add_config_var AR                        found_ar
+	add_config_var RANLIB                    found_ranlib
+	add_config_var PYTHON                    found_python
+	add_config_var libpthread
+	add_config_var cflags_preset
+	add_config_var cxxflags_preset
+	add_config_var ldflags_preset
+	add_config_var enable_asan
+	add_config_var debug_type
+	add_config_var enable_debug
+	add_config_var mk_enable_system          enable_system
+	add_config_var enable_system             enable_system_01
+	add_config_var threading_model
+	add_config_var prefix
+	add_config_var exec_prefix
+	add_config_var libdir
+	add_config_var includedir
+	add_config_var sharedir
+	add_config_var enable_verbose
+	add_config_var configured_oot
+	add_config_var enable_arg_max_hack
+	add_config_var mk_enable_static          enable_static
+	add_config_var mk_enable_shared          enable_shared
+	add_config_var enable_shared             enable_shared_01
+	add_config_var enable_rpath
+	add_config_var export_shared
+	add_config_var mk_enable_blas            enable_blas
+	add_config_var mk_enable_cblas           enable_cblas
+	add_config_var enable_blas               enable_blas_01
+	add_config_var enable_cblas              enable_cblas_01
+	add_config_var enable_amd_frame_tweaks
+	add_config_var mk_enable_memkind         enable_memkind
+	add_config_var enable_memkind            enable_memkind_01
+	add_config_var pragma_omp_simd
+	add_config_var addon_list
+	add_config_var sandbox
+	add_config_var config_name_define
+	add_config_var config_list_defines
+	add_config_var kernel_list_defines
+	add_config_var enable_tls                enable_tls_01
+	add_config_var enable_openmp             enable_openmp_01
+	add_config_var enable_openmp_as_def      enable_openmp_as_def_01
+	add_config_var enable_pthreads           enable_pthreads_01
+	add_config_var enable_pthreads_as_def    enable_pthreads_as_def_01
+	add_config_var enable_hpx                enable_hpx_01
+	add_config_var enable_hpx_as_def         enable_hpx_as_def_01
+	add_config_var enable_jrir_rr            enable_jrir_rr_01
+	add_config_var enable_jrir_slab          enable_jrir_slab_01
+	add_config_var enable_jrir_tlb           enable_jrir_tlb_01
+	add_config_var enable_pba_pools          enable_pba_pools_01
+	add_config_var enable_sba_pools          enable_sba_pools_01
+	add_config_var enable_mem_tracing        enable_mem_tracing_01
+	add_config_var int_type_size
+	add_config_var blas_int_type_size
+	add_config_var enable_sup_handling       enable_sup_handling_01
+	add_config_var enable_trsm_preinversion  enable_trsm_preinversion_01
+	add_config_var enable_pragma_omp_simd    enable_pragma_omp_simd_01
+	add_config_var enable_sandbox            enable_sandbox_01
+	add_config_var complex_return_intel      complex_return_intel01
+	add_config_var addon_list_includes
+	add_config_var enable_addons             enable_addons_01
+
+	generate_config_file "${config_mk_in_path}"    "${config_mk_out_path}"
+	generate_config_file "${bli_config_h_in_path}" "${bli_config_h_out_path}"
+	generate_config_file "${bli_addon_h_in_path}"  "${bli_addon_h_out_path}"
 
-				# If config_name is NOT present in config_blist, then we know
-				# that config_list is empty simply because config_name is
-				# unregistered.
+	# -- Create top-level object directories -----------------------------------
 
-				echo "${script_name}: 'user-specified configuration '${config_name}' is NOT registered!"
-				echo "${script_name}: "
-				echo "${script_name}: *** Cannot continue with unregistered configuration '${config_name}'. ***"
-				echo "${script_name}: "
-				exit 1;
-			fi
-		fi
-	else
+	# Create obj sub-directories (if they do not already exist).
+	base_obj_dirpath="${obj_dirpath}/${config_name}"
 
-		# This branch executes when the configuration is found to be present
-		# (i.e. registered) in the config_registry file.
+	echo "${script_name}: creating ${base_obj_dirpath}"
+	mkdir -p "${base_obj_dirpath}"
 
-		echo "${script_name}: configuration '${config_name}' is registered."
-		echo "${script_name}: '${config_name}' is defined as having the following sub-configurations:"
-		echo "${script_name}:    ${config_list}"
-		echo "${script_name}: which collectively require the following kernels:"
-		echo "${script_name}:    ${kernel_list}"
 
-	fi
+	obj_config_dirpath="${base_obj_dirpath}/${config_dir}"
+
+	mkdir -p "${obj_config_dirpath}"
+	for conf in ${config_list}; do
+		echo "${script_name}: creating ${obj_config_dirpath}/${conf}"
+		mkdir -p "${obj_config_dirpath}/${conf}"
+	done
+
+
+	obj_kernels_dirpath="${base_obj_dirpath}/${kernels_dir}"
+
+	mkdir -p "${obj_kernels_dirpath}"
+	for kern in ${kernel_list}; do
+		echo "${script_name}: creating ${obj_kernels_dirpath}/${kern}"
+		mkdir -p "${obj_kernels_dirpath}/${kern}"
+	done
 
 
-	echo "${script_name}: checking sub-configurations:"
+	obj_refkern_dirpath="${base_obj_dirpath}/${refkern_dir}"
 
-	# Now, verify that the constituent configurations associated with the
-	# config name are all valid.
+	mkdir -p "${obj_refkern_dirpath}"
 	for conf in ${config_list}; do
+		echo "${script_name}: creating ${obj_refkern_dirpath}/${conf}"
+		mkdir -p "${obj_refkern_dirpath}/${conf}"
+	done
 
-		# First confirm that the current configuration is registered.
-		#this_clist=${config_registry[${conf}]}
-		this_clist=$(query_array "config_registry" "${conf}")
 
-		# If the config_list associated with conf is empty, then it was
-		# never entered into the config_registry to begin with. Thus,
-		# conf must be unregistered.
-		if [ -z "${this_clist}" ]; then
-			echo "${script_name}: '${conf}' is NOT registered!"
-			echo "${script_name}: "
-			echo "${script_name}: *** Cannot continue with unregistered configuration '${conf}'. ***"
-			echo "${script_name}: "
-			exit 1;
-		else
-			echo -n "${script_name}:   '${conf}' is registered."
-		fi
+	obj_frame_dirpath="${base_obj_dirpath}/${frame_dir}"
 
-		# Then confirm that the current sub-configuration directory exists.
-		if [ ! -d "${config_dirpath}/${conf}" ]; then
-			echo "..but does NOT exist!"
-			echo "${script_name}: "
-			echo "${script_name}: *** Cannot continue with nonexistent configuration '${conf}'. ***"
-			echo "${script_name}: "
-			exit 1;
-		else
-			echo "..and exists."
-		fi
-	done
+	echo "${script_name}: creating ${obj_frame_dirpath}"
+	mkdir -p "${obj_frame_dirpath}"
 
 
-	echo "${script_name}: checking sub-configurations' requisite kernels:"
+	if [ -n "${addon_flag}" ]; then
 
-	# Also, let's verify that the requisite kernel sets associated with
-	# the config name all correspond to directories that exist.
-	for kernel in ${kernel_list}; do
+		obj_addon_dirpath="${base_obj_dirpath}/${addon_dir}"
 
-		echo -n "${script_name}:   '${kernel}' kernels..."
+		for addon in ${addon_list}; do
+			echo "${script_name}: creating ${obj_addon_dirpath}/${addon}"
+			mkdir -p "${obj_addon_dirpath}/${addon}"
+		done
+	fi
 
-		# Confirm that the current kernel sub-directory exists.
-		if [ ! -d "${kernels_dirpath}/${kernel}" ]; then
-			echo "do NOT exist!"
-			echo "${script_name}: "
-			echo "${script_name}: *** Cannot continue with nonexistent kernel '${kernel}'. ***"
-			echo "${script_name}: "
-			exit 1;
-		else
-			echo "exist."
-		fi
-	done
 
-	# In order to determine the default behavior of the --with[out]-memkind
-	# option, we try to detect whether libmemkind is available. If it is,
-	# the default implied option will be --with-memkind; otherwise, will be
-	# --without-memkind.
-	has_memkind=$(has_libmemkind)
+	if [ -n "${sandbox_flag}" ]; then
 
-	# Try to determine whether the chosen compiler supports #pragma omp simd.
-	pragma_omp_simd=$(has_pragma_omp_simd)
+		obj_sandbox_dirpath="${base_obj_dirpath}/${sandbox_dir}"
 
+		echo "${script_name}: creating ${obj_sandbox_dirpath}/${sandbox}"
+		mkdir -p "${obj_sandbox_dirpath}/${sandbox}"
+	fi
 
-	# -- Prepare variables for subsitution into template files -----------------
 
-	# Parse the status of the prefix option and echo feedback.
-	if [ -n "${prefix_flag}" ]; then
-		echo "${script_name}: detected --prefix='${prefix}'."
-	else
-		echo "${script_name}: no install prefix option given; defaulting to '${prefix}'."
-	fi
+	obj_blastest_dirpath="${base_obj_dirpath}/${blastest_dir}"
 
-	# Parse the status of the exec_prefix option and echo feedback.
-	if [ -n "${exec_prefix_flag}" ]; then
-		echo "${script_name}: detected --exec-prefix='${exec_prefix}'."
-	else
-		echo "${script_name}: no install exec_prefix option given; defaulting to PREFIX."
-	fi
+	echo "${script_name}: creating ${obj_blastest_dirpath}"
+	mkdir -p "${obj_blastest_dirpath}"
 
-	# Parse the status of the libdir option and echo feedback.
-	if [ -n "${libdir_flag}" ]; then
-		echo "${script_name}: detected --libdir='${libdir}'."
-	else
-		echo "${script_name}: no install libdir option given; defaulting to EXECPREFIX/lib."
-	fi
 
-	# Parse the status of the includedir option and echo feedback.
-	if [ -n "${includedir_flag}" ]; then
-		echo "${script_name}: detected --includedir='${includedir}'."
-	else
-		echo "${script_name}: no install includedir option given; defaulting to PREFIX/include."
-	fi
+	obj_testsuite_dirpath="${base_obj_dirpath}/${testsuite_dir}"
 
-	# Parse the status of the sharedir option and echo feedback.
-	if [ -n "${sharedir_flag}" ]; then
-		echo "${script_name}: detected --sharedir='${sharedir}'."
-	else
-		echo "${script_name}: no install sharedir option given; defaulting to PREFIX/share."
-	fi
+	echo "${script_name}: creating ${obj_testsuite_dirpath}"
+	mkdir -p "${obj_testsuite_dirpath}"
 
-	# Echo the installation directories that we settled on.
-	echo "${script_name}: final installation directories:"
-	echo "${script_name}:   prefix:      $(eval echo "${prefix}")"
-	echo "${script_name}:   exec_prefix: $(eval echo "${exec_prefix}")"
-	echo "${script_name}:   libdir:      $(eval echo "$(eval echo "${libdir}")")"
-	echo "${script_name}:   includedir:  $(eval echo "${includedir}")"
-	echo "${script_name}:   sharedir:    $(eval echo "${sharedir}")"
-	echo "${script_name}: NOTE: the variables above can be overridden when running make."
 
-	# Check if CFLAGS is non-empty.
-	if [ -n "${CFLAGS}" ]; then
-		cflags_preset="${CFLAGS}"
-		echo "${script_name}: detected preset CFLAGS; prepending:"
-		echo "${script_name}:   ${cflags_preset}"
-	else
-		cflags_preset=''
-		echo "${script_name}: no preset CFLAGS detected."
-	fi
+	# Create lib directory (if it does not already exist).
+	base_lib_dirpath="${lib_dirpath}/${config_name}"
 
-	# Check if LDFLAGS is non-empty.
-	if [ -n "${LDFLAGS}" ]; then
-		ldflags_preset="${LDFLAGS}"
-		echo "${script_name}: detected preset LDFLAGS; prepending:"
-		echo "${script_name}:   ${ldflags_preset}"
-	else
-		ldflags_preset=''
-		echo "${script_name}: no preset LDFLAGS detected."
-	fi
+	echo "${script_name}: creating ${base_lib_dirpath}"
+	mkdir -p "${base_lib_dirpath}"
 
-	# Check if the verbose make flag was specified.
-	if [[ ${enable_verbose} = yes ]]; then
-		echo "${script_name}: enabling verbose make output. (disable with 'make V=0'.)"
-	else
-		echo "${script_name}: disabling verbose make output. (enable with 'make V=1'.)"
-	fi
 
-	# Check if the ARG_MAX hack was requested.
-	if [[ ${enable_arg_max_hack} = yes ]]; then
-		echo "${script_name}: enabling ARG_MAX hack."
-	else
-		echo "${script_name}: disabling ARG_MAX hack."
-	fi
+	# Create include directory (if it does not already exist).
+	base_include_dirpath="${include_dirpath}/${config_name}"
 
-	# Check if the debug flag was specified.
-	if [[ -n ${debug_flag} ]]; then
-		if [[ ${debug_type} = opt ]]; then
-			echo "${script_name}: enabling debug symbols with optimizations."
-		elif [[ ${debug_type} = sde ]]; then
-			debug_type='sde'
-			echo "${script_name}: enabling SDE processor emulation."
-		else
-			debug_type='noopt'
-			echo "${script_name}: enabling debug symbols; optimizations disabled."
-		fi
-		enable_debug='yes'
-	else
-		debug_type='off'
-		enable_debug='no'
-		echo "${script_name}: debug symbols disabled."
-	fi
+	echo "${script_name}: creating ${base_include_dirpath}"
+	mkdir -p "${base_include_dirpath}"
 
-	# Check if the AddressSanitizer flag was specified.
-	if [[ ${enable_asan} = yes ]]; then
-		echo "${script_name}: enabling AddressSanitizer support (except for optimized kernels)."
-	else
-		enable_asan='no'
-		echo "${script_name}: AddressSanitizer support disabled."
-	fi
 
-	# Check if the static lib flag was specified.
-	if   [[ ${enable_static} = yes && ${enable_shared} = yes ]]; then
-		echo "${script_name}: building BLIS as both static and shared libraries."
-		enable_shared_01=1
-	elif [[ ${enable_static} = no && ${enable_shared} = yes ]]; then
-		echo "${script_name}: building BLIS as a shared library (static library disabled)."
-		enable_shared_01=1
-	elif [[ ${enable_static} = yes && ${enable_shared} = no ]]; then
-		echo "${script_name}: building BLIS as a static library (shared library disabled)."
-		enable_shared_01=0
-	else
-		echo "${script_name}: Both static and shared libraries were disabled."
-		echo "${script_name}: *** Please enable one (or both) to continue."
-		exit 1
-	fi
+	# -- Mirror source directory hierarchies to object directories -------------
 
-	# Check if the "export shared" flag was specified.
-	if [[ ${export_shared} = all ]]; then
-		if [[ ${enable_shared} = yes ]]; then
-			echo "${script_name}: exporting all symbols within shared library."
-		else
-			echo "${script_name}: ignoring request to export all symbols within shared library."
-		fi
-	elif [[ ${export_shared} = public ]]; then
-		if [[ ${enable_shared} = yes ]]; then
-			echo "${script_name}: exporting only public symbols within shared library."
-		fi
-	else
-		echo "${script_name}: *** Invalid argument '${export_shared}' to --export-shared option given."
-		echo "${script_name}: *** Please use 'public' or 'all'."
-		exit 1
-	fi
+	# Combine the config_list with the config_name and then remove duplicates.
+	config_list_plus_name=$(rm_duplicate_words "${config_list} ${config_name}")
 
-	# Check if we are building with or without operating system support.
-	if [[ ${enable_system} = yes ]]; then
-		echo "${script_name}: enabling operating system support."
-		enable_system_01=1
-	else
-		echo "${script_name}: disabling operating system support."
-		echo "${script_name}: WARNING: disabling OS support forcibly disables all threading!"
-		enable_system_01=0
+	# Mirror each of the sub-configuration directories to the object directory.
+	for conf in ${config_list_plus_name}; do
 
-		# Force threading to be disabled.
-		threading_model='off'
-	fi
+		echo "${script_name}: mirroring ${config_dirpath}/${conf} to ${obj_config_dirpath}/${conf}"
+		"${mirror_tree_sh}" "${config_dirpath}/${conf}" "${obj_config_dirpath}/${conf}"
+	done
 
-	# Check if we are building with or without thread-local storage support.
-	if [[ ${enable_tls} = yes ]]; then
-		echo "${script_name}: enabling thread-local storage (TLS) support."
-		enable_tls_01=1
-	else
-		echo "${script_name}: disabling thread-local storage (TLS) support."
-		echo "${script_name}: WARNING: THIS IS DANGEROUS! Disabling TLS may cause race conditions!"
-		echo "${script_name}: WARNING: Please try --disable-threading if you suspect any correctness"
-		echo "${script_name}: WARNING: or deadlock issues."
-		enable_tls_01=0
-	fi
+	# Mirror optimized kernels source tree to its object sub-directory.
+	# We perform the mirroring on each configuration/kernel sub-directory
+	# within 'kernels'.
+	for kern in ${kernel_list}; do
+
+		# Only mirror the optimized kernels source directory if it exists.
+		# There are occasions where one of the sub-configurations in the
+		# config_list does not correspond to a kernels sub-directory, such
+		# as when architecture B is so close to architecture A that B can
+		# use A's kernel source code unmodified (though perhaps with
+		# different blocksizes).
+		#if [ -d "${kernels_dirpath}/${conf}" ]; then
+
+		echo "${script_name}: mirroring ${kernels_dirpath}/${kern} to ${obj_kernels_dirpath}/${kern}"
+		${mirror_tree_sh} "${kernels_dirpath}/${kern}" "${obj_kernels_dirpath}/${kern}"
+		#else
+		#	echo "${script_name}: mirroring ${kernels_dirpath}/${conf} skipped... directory does not exist"
+		#fi
+	done
 
-	# Check the threading model flag and standardize its value, if needed.
-	# Note that single-threaded mode will always be enabled, but not necessarily
-	# by default.
-	enable_single='yes'
-	enable_openmp='no'
-	enable_pthreads='no'
-	enable_hpx='no'
-	enable_single_01=1
-	enable_openmp_01=0
-	enable_pthreads_01=0
-	enable_hpx_01=0
-	parsed_tm=''
-	first_tm=''
-	enable_single_as_def_01=0
-	enable_openmp_as_def_01=0
-	enable_pthreads_as_def_01=0
-	enable_hpx_as_def_01=0
+	# Mirror reference kernel source tree to its object sub-directory.
+	echo "${script_name}: mirroring ${refkern_dirpath} to ${obj_refkern_dirpath}"
+	"${mirror_tree_sh}" "${refkern_dirpath}" "${obj_refkern_dirpath}"
 
-	# Convert whatever reasonable separator the user may have used into a space.
-	threading_model_list=$(echo "${threading_model}" | sed -e "s/[,+]/ /g")
+	# Mirror reference kernels source tree to each sub-configuration's object dir.
+	for conf in ${config_list}; do
 
-	# Search for all recognized values and standardize them to one of four
-	# strings: 'single', 'openmp', 'pthreads', 'auto'. Notice that we keep
-	# the strings in the same order as they originally appeared.
-	for word in ${threading_model_list}; do
+		echo "${script_name}: mirroring ${refkern_dirpath} to ${obj_refkern_dirpath}/${conf}"
+		"${mirror_tree_sh}" "${refkern_dirpath}" "${obj_refkern_dirpath}/${conf}"
+	done
 
-		if [[ ${word} = single ]] ||
-		   [[ ${word} = none   ]] ||
-		   [[ ${word} = off    ]] ||
-		   [[ ${word} = no     ]]; then
+	# Mirror framework source tree to its object sub-directory.
+	echo "${script_name}: mirroring ${frame_dirpath} to ${obj_frame_dirpath}"
+	"${mirror_tree_sh}" "${frame_dirpath}" "${obj_frame_dirpath}"
 
-			parsed_tm="${parsed_tm} single"
+	# Mirror the chosen addon source tree to its object sub-directory.
+	if [[ -n ${addon_flag} ]]; then
 
-		elif [[ ${word} = openmp ]] ||
-			 [[ ${word} = omp    ]]; then
+		for addon in ${addon_list}; do
 
-			parsed_tm="${parsed_tm} openmp"
+			echo "${script_name}: mirroring ${addon_dirpath}/${addon} to ${obj_addon_dirpath}/${addon}"
+			"${mirror_tree_sh}" "${addon_dirpath}/${addon}" "${obj_addon_dirpath}/${addon}"
+		done
+	fi
 
-		elif [[ ${word} = pthreads ]] ||
-			 [[ ${word} = pthread  ]] ||
-			 [[ ${word} = posix    ]]; then
+	# Mirror the chosen sandbox source tree to its object sub-directory.
+	if [[ -n ${sandbox_flag} ]]; then
 
-			parsed_tm="${parsed_tm} pthreads"
+		echo "${script_name}: mirroring ${sandbox_dirpath}/${sandbox} to ${obj_sandbox_dirpath}/${sandbox}"
+		"${mirror_tree_sh}" "${sandbox_dirpath}/${sandbox}" "${obj_sandbox_dirpath}/${sandbox}"
+	fi
 
-		elif [[ ${word} = hpx ]]; then
 
-			parsed_tm="${parsed_tm} hpx"
+	# -- Generate makefile fragments -------------------------------------------
 
-		elif [[ ${word} = auto ]]; then
+	clist_contains_cname=$(is_in_list "${config_name}" "${config_list}")
 
-			parsed_tm="${parsed_tm} auto"
+	# If the config_list does not already contain the config_name (i.e.,
+	# if config_name is an umbrella family), generate makefiles in that
+	# directory. (In the next step, we will loop over the actual sub-
+	# configurations and create fragments there as well.)
+	if [[ ${clist_contains_cname} = false ]]; then
+		create_makefile_fragment CONFIG "${config_dirpath}/${config_name}" \
+		                         "${obj_config_dirpath}/${config_name}"
+	fi
 
-		else
+	# Generate makefile fragments for each of the sub-configurations present
+	# in the configuration list.
+	for conf in ${config_list}; do
+		create_makefile_fragment CONFIG "${config_dirpath}/${conf}" \
+		                         "${obj_config_dirpath}/${conf}"
+	done
 
-			echo "${script_name}: *** Unsupported threading model: ${word}."
-			exit 1
-		fi
+	# Generate makefile fragments for each of the kernel sets required by
+	# the configuration list (in the kernel list).
+	for kern in ${kernel_list}; do
+		create_makefile_fragment KERNELS "${kernels_dirpath}/${kern}" \
+		                         "${obj_kernels_dirpath}/${kern}"
 	done
 
-	# Always enable single-threaded behavior. If the user explicitly
-	# requested 'single' as well as other modes, the first occurrence will
-	# be kept when duplicates are removed, which will preserve the order
-	# for purposes of determining which mode will be the default (absent
-	# any explicit choice at runtime).
-	parsed_tm="${parsed_tm} single"
+	# Generate makefile fragments in the reference kernels directory.
+	create_makefile_fragment REFKERN "${refkern_dirpath}" \
+	                         "${obj_refkern_dirpath}"
 
-	# Remove duplicates, if they exist.
-	parsed_tm=$(rm_duplicate_words_simple "${parsed_tm}")
+	# Generate makefile fragments in the framework directory.
+	create_makefile_fragment FRAME "${frame_dirpath}" \
+	                         "${obj_frame_dirpath}"
 
-	#echo "parsed_tm0: _${parsed_tm}_"
+	# Generate makefile fragments in the addon sub-directory.
+	if [[ -n ${addon_flag} ]]; then
+		for addon in ${addon_list}; do
+			create_makefile_fragment ADDON "${addon_dirpath}/${addon}" \
+			                         "${obj_addon_dirpath}/${addon}"
+		done
+	fi
 
-	# If parsed_tm contains 'auto', substitute in the automatic choice
-	# based on which compiler family is being used.
-	if [ "$(is_in_list "auto" "${parsed_tm}")" = "true" ]; then
+	# Generate makefile fragments in the sandbox sub-directory.
+	if [[ -n ${sandbox_flag} ]]; then
+		create_makefile_fragment SANDBOX "${sandbox_dirpath}/${sandbox}" \
+		                         "${obj_sandbox_dirpath}/${sandbox}"
+	fi
 
-		# If 'auto' was found in the threading model string, we ignore any
-		# other choice that may have been expressed and leave everything
-		# disabled. (The Makefile will automatically choose a model based
-		# on information such as the compiler.)
-		echo "${script_name}: determining the threading model automatically."
 
-		# Use OpenMP for gcc and icc, but pthreads for clang.
-		if   [ "${cc_vendor}" = "gcc" ]; then
+	# -- Handle out-of-tree builds ---------------------------------------------
 
-			selected_tm="openmp"
-			echo "${script_name}:   automatically selected OpenMP."
+	# Under some circumstances, we need to create some symbolic links to
+	# properly handle out-of-tree builds.
+	if [[ ${configured_oot} = yes ]]; then
+		for file in Makefile blis.pc.in common.mk config; do
+			# If ${file} does not already exist in the current directory,
+			# create a symbolic link to it. If a symlink already exists, we
+			# use -f to force creation of a new link; anything else is an error.
+			if [[ ! -e ${file} ]]; then
+				echo "${script_name}: creating symbolic link to ${file}."
+				ln -s "${dist_path}/${file}" .
+			elif [[ -h ${file} ]]; then
+				echo "${script_name}: symbolic link to ${file} already exists; forcing creation of new link."
+				ln -sf "${dist_path}/${file}" .
+			else
+				echo "${script_name}: Non-symbolic link file or directory '${file}' blocks creation of symlink."
+				echo "${script_name}: *** Please remove this entity and re-run configure."
+				exit 1
+			fi
+		done
 
-		elif [ "${cc_vendor}" = "icc" ]; then
+		echo "${script_name}: configured to build outside of source distribution."
+	else
 
-			selected_tm="openmp"
-			echo "${script_name}:   automatically selected OpenMP."
+		echo "${script_name}: configured to build within top-level directory of source distribution."
+	fi
 
-		elif [ "${cc_vendor}" = "clang" ]; then
+	if [ "${warn_user_generic}" = "1" ]; then
 
-			selected_tm="pthreads"
-			echo "${script_name}:   automatically selected pthreads."
-		fi
+		echo "${script_name}: "
+		echo "${script_name}: *** Unable to automatically detect hardware type! ***"
+		echo "${script_name}: "
+		echo "${script_name}: NOTE: configure was unable to identify a subconfiguration"
+		echo "${script_name}: optimized for your hardware. As a result, the 'generic'"
+		echo "${script_name}: subconfiguration (with low-performance reference kernels)"
+		echo "${script_name}: will be used. For support, please open an issue on GitHub"
+		echo "${script_name}: at https://github.com/flame/blis/issues."
+		echo "${script_name}: "
+	fi
 
-		# Substitute the selected threading model for 'auto' in parsed_tm.
-		parsed_tm=$(substitute_words "auto" "${selected_tm}" "${parsed_tm}")
+	# Exit peacefully.
+	return 0
+}
+
+
+#
+# -- plugin functions ----------------------------------------------------------
+#
+
+print_usage_plugin()
+{
+	# Use the version string in the 'version' file since we don't have
+	# the patched version string yet.
+	if [ -z "${version}" ]; then
+		version=$(<"${version_filepath}")
 	fi
 
-	#echo "parsed_tm1: _${parsed_tm}_"
+	# Echo usage info.
+	cat <<EOF
 
-	# Remove any extra whitespace.
-	parsed_tm=$(canonicalize_ws "${parsed_tm}")
+ ${script_name} (BLIS ${version})
 
-	#echo "parsed_tm2: _${parsed_tm}_"
+ Configure a BLIS plugin for compilation.
 
-	# Find the first word. This will be the default threading model.
-	first_tm=${parsed_tm%% *}
+ Usage:
 
-	#echo "first_tm0:  _${first_tm}_"
+   ${script_name} [options] [env. vars.] plugin_name
 
-	# Now that we've standardized the list, removed duplicates, and handled
-	# the possibility of 'auto' being among the listed threading models, we can
-	# proceed to formally processing each threading model to enable. Since
-	# 'auto' has been converted to 'openmp' or 'pthreads', we only need to
-	# handle the remaining three options (openmp, pthreads, and single) going
-	# forward.
-	for word in ${parsed_tm}; do
+ Arguments:
 
-		if [[ ${word} = single ]]; then
+   plugin_name   The name of the plugin which is being configured. This
+                 name will form part of the symbol name for the kernel
+                 registration function. This argument is optional if it
+                 can be determined from the name of an existing
+                 bli_plugin_<plugin_name>.h file.
 
-			echo "${script_name}: enabling support for single-threading."
-			enable_single='yes'
-			enable_single_01=1
+ Options:
 
-		elif [[ ${word} = openmp ]]; then
+   --init
 
-			echo "${script_name}: enabling support for threading via OpenMP."
-			enable_openmp='yes'
-			enable_openmp_01=1
+                 A synonym for '--disable-examples --enable-templates --disable-build'.
 
-		elif [[ ${word} = pthreads ]]; then
+   --build
 
-			echo "${script_name}: enabling support for threading via pthreads."
-			enable_pthreads='yes'
-			enable_pthreads_01=1
+                 A synonym for '--disable-examples --disable-templates --enable-build'.
 
-		elif [[ ${word} = hpx ]]; then
+   --disable-examples, --enable-examples
 
-			echo "${script_name}: enabling support for threading via HPX."
-			enable_hpx='yes'
-			enable_hpx_01=1
+                 Do not include (created by default) example code for plugin
+                 registration, kernels, etc.
 
-		fi
+   --disable-templates, --enable-templates
 
-	done
+                 Do not create (created by default) files which make up the
+                 basic plug-in file structure, for example if the plugin has
+                 already been created and only build files need to be generated.
 
-	# Define boolean variables that can easily be interpreted with #ifdef
-	# directives.
-	if [[ ${first_tm} = single ]]; then
+   --disable-build, --enable-build
 
-		enable_single_as_def_01=1
-		enable_openmp_as_def_01=0
-		enable_pthreads_as_def_01=0
-		enable_hpx_as_def_01=0
+                 Do not create (created by default) files necessary for
+                 actually building the plugin. ${script_name} can be re-run
+                 later to generate these files if desired.
 
-	elif [[ ${first_tm} = openmp ]]; then
+   --enable-verbose-make, --disable-verbose-make
 
-		enable_single_as_def_01=0
-		enable_openmp_as_def_01=1
-		enable_pthreads_as_def_01=0
-		enable_hpx_as_def_01=0
+                 Enable (disabled by default) verbose compilation output
+                 during make.
 
-	elif [[ ${first_tm} = pthreads ]]; then
+   --enable-arg-max-hack --disable-arg-max-hack
 
-		enable_single_as_def_01=0
-		enable_openmp_as_def_01=0
-		enable_pthreads_as_def_01=1
-		enable_hpx_as_def_01=0
+                 Enable (disabled by default) build system logic that
+                 will allow archiving/linking the static/shared library
+                 even if the command plus command line arguments exceeds
+                 the operating system limit (ARG_MAX).
 
-	elif [[ ${first_tm} = hpx ]]; then
+   -d DEBUG, --enable-debug[=DEBUG]
 
-		enable_single_as_def_01=0
-		enable_openmp_as_def_01=0
-		enable_pthreads_as_def_01=0
-		enable_hpx_as_def_01=1
+                 Enable debugging symbols in the library. If argument
+                 DEBUG is given as 'opt', then optimization flags are
+                 kept in the framework, otherwise optimization is
+                 turned off.
 
-	fi
+   --disable-static, --enable-static
 
-	# If OpenMP, pthreads, or HPX was enabled, given that single-threaded mode is
-	# also always enabled, remind the user which one will serve as the default
-	# (that is, absent any explicit choice at runtime).
-	if [[ ${enable_openmp}   = yes ]] ||
-	   [[ ${enable_pthreads} = yes ]] ||
-	   [[ ${enable_hpx}      = yes ]]; then
+                 Disable (enabled by default) building BLIS as a static
+                 library. If the static library build is disabled, the
+                 shared library build must remain enabled.
 
-		if   [[ ${first_tm}   = single ]]; then
-			echo "${script_name}: threading will default to single-threaded."
-		elif [[ ${first_tm}   = openmp ]]; then
-			echo "${script_name}: threading will default to OpenMP."
-		elif [[ ${first_tm}   = pthreads ]]; then
-			echo "${script_name}: threading will default to pthreads."
-		elif [[ ${first_tm}   = hpx ]]; then
-			echo "${script_name}: threading will default to HPX."
-		fi
-	fi
+   --disable-shared, --enable-shared
 
-	# Copy the final parsed threading model list back to the original variable.
-	threading_model="${parsed_tm}"
+                 Disable (enabled by default) building BLIS as a shared
+                 library. If the shared library build is disabled, the
+                 static library build must remain enabled.
 
-	#echo "parsed_tm: _${parsed_tm}_"
-	#echo "first_tm:  _${first_tm}_"
+   --enable-rpath, --disable-rpath
 
-	# Check the method of assigning micropanels to threads in the JR and IR
-	# loops.
-	enable_jrir_rr_01=0
-	enable_jrir_slab_01=0
-	enable_jrir_tlb_01=0
-	if   [[ ${thread_part_jrir} = rr ]]; then
-		echo "${script_name}: requesting round-robin (rr) work partitioning in jr and/or ir loops."
-		enable_jrir_rr_01=1
-	elif [[ ${thread_part_jrir} = slab ]]; then
-		echo "${script_name}: requesting slab work partitioning in jr and/or ir loops."
-		enable_jrir_slab_01=1
-	elif [[ ${thread_part_jrir} = tlb ]]; then
-		echo "${script_name}: requesting tile-level load balancing (tlb) in unified jr+ir loop."
-		enable_jrir_tlb_01=1
-	else
-		echo "${script_name}: *** Unsupported method of work partitioning in jr/ir loops: ${thread_part_jrir}."
-		exit 1
-	fi
+                 Enable (disabled by default) setting an install_name for
+                 dynamic libraries on macOS which starts with @rpath rather
+                 than the absolute install path.
 
-	# Convert 'yes' and 'no' flags to booleans.
-	if [[ ${enable_pba_pools} = yes ]]; then
-		echo "${script_name}: internal memory pools for packing blocks are enabled."
-		enable_pba_pools_01=1
-	else
-		echo "${script_name}: internal memory pools for packing blocks are disabled."
-		enable_pba_pools_01=0
-	fi
-	if [[ ${enable_sba_pools} = yes ]]; then
-		echo "${script_name}: internal memory pools for small blocks are enabled."
-		enable_sba_pools_01=1
-	else
-		echo "${script_name}: internal memory pools for small blocks are disabled."
-		enable_sba_pools_01=0
-	fi
-	if [[ ${enable_mem_tracing} = yes ]]; then
-		echo "${script_name}: memory tracing output is enabled."
-		enable_mem_tracing_01=1
-	else
-		echo "${script_name}: memory tracing output is disabled."
-		enable_mem_tracing_01=0
-	fi
-	if [[ ${has_memkind} = yes ]]; then
-		if [[ -z ${enable_memkind} ]]; then
-			# If no explicit option was given for libmemkind one way or the other,
-			# we use the value returned previously by has_libmemkind(), in this
-			# case "yes", to determine the default.
-			echo "${script_name}: libmemkind found; default is to enable use."
-			enable_memkind="yes"
-			enable_memkind_01=1
-		else
-			if [[ ${enable_memkind} = yes ]]; then
-				echo "${script_name}: received explicit request to enable libmemkind."
-				enable_memkind="yes"
-				enable_memkind_01=1
-			else
-				echo "${script_name}: received explicit request to disable libmemkind."
-				enable_memkind="no"
-				enable_memkind_01=0
-			fi
-		fi
-	else
-		echo "${script_name}: libmemkind not found; disabling."
-		if [[ ${enable_memkind} = yes ]]; then
-			echo "${script_name}: cannot honor explicit request to enable libmemkind."
-		fi
-		enable_memkind="no"
-		enable_memkind_01=0
-	fi
-	if [[ ${pragma_omp_simd} = yes ]]; then
-		echo "${script_name}: compiler appears to support #pragma omp simd."
-		enable_pragma_omp_simd_01=1
-	else
-		echo "${script_name}: compiler appears to not support #pragma omp simd."
-		enable_pragma_omp_simd_01=0
-	fi
-	if [[ ${enable_blas} = yes ]]; then
-		echo "${script_name}: the BLAS compatibility layer is enabled."
-		enable_blas_01=1
-	else
-		echo "${script_name}: the BLAS compatibility layer is disabled."
-		enable_blas_01=0
-	fi
-	if [[ ${enable_cblas} = yes ]]; then
-		echo "${script_name}: the CBLAS compatibility layer is enabled."
-		enable_cblas_01=1
-		# Force BLAS layer when CBLAS is enabled
-		enable_blas='yes'
-	else
-		echo "${script_name}: the CBLAS compatibility layer is disabled."
-		enable_cblas_01=0
-	fi
-	if [[ ${enable_mixed_dt} = yes ]]; then
-		echo "${script_name}: mixed datatype support is enabled."
+   -e SYMBOLS, --export-shared[=SYMBOLS]
 
-		if [[ ${enable_mixed_dt_extra_mem} = yes ]]; then
-			echo "${script_name}: mixed datatype optimizations requiring extra memory are enabled."
-			enable_mixed_dt_extra_mem_01=1
-		else
-			echo "${script_name}: mixed datatype optimizations requiring extra memory are disabled."
-			enable_mixed_dt_extra_mem_01=0
-		fi
+                 Specify the subset of library symbols that are exported
+                 within a shared library. Valid values for SYMBOLS are:
+                 'public' (the default) and 'all'. By default, only
+                 functions and variables that belong to public APIs are
+                 exported in shared libraries. However, the user may
+                 instead export all symbols in BLIS, even those that were
+                 intended for internal use only. Note that the public APIs
+                 encompass all functions that almost any user would ever
+                 want to call, including the BLAS/CBLAS compatibility APIs
+                 as well as the basic and expert interfaces to the typed
+                 and object APIs that are unique to BLIS. Also note that
+                 changing this option to 'all' will have no effect in some
+                 environments, such as when compiling with clang on
+                 Windows.
 
-		enable_mixed_dt_01=1
-	else
-		echo "${script_name}: mixed datatype support is disabled."
+   --enable-asan, --disable-asan
 
-		enable_mixed_dt_extra_mem_01=0
-		enable_mixed_dt_01=0
-	fi
-	if [[ ${enable_sup_handling} = yes ]]; then
-		echo "${script_name}: sup (skinny/unpacked) matrix handling is enabled."
-		enable_sup_handling_01=1
-	else
-		echo "${script_name}: sup (skinny/unpacked) matrix handling is disabled."
-		enable_sup_handling_01=0
-	fi
-	if [[ ${enable_trsm_preinversion} = yes ]]; then
-		echo "${script_name}: trsm diagonal element pre-inversion is enabled."
-		enable_trsm_preinversion_01=1
-	else
-		echo "${script_name}: trsm diagonal element pre-inversion is disabled."
-		enable_trsm_preinversion_01=0
-	fi
+                 Enable (disabled by default) compiling and linking BLIS
+                 framework code with the AddressSanitizer (ASan) library.
+                 Optimized kernels are NOT compiled with ASan support due
+                 to limitations of register assignment in inline assembly.
+                 WARNING: ENABLING THIS OPTION WILL NEGATIVELY IMPACT
+                 PERFORMANCE. Please use only for informational/debugging
+                 purposes.
 
-	# Report integer sizes.
-	if [[ ${int_type_size} = 32 ]]; then
-		echo "${script_name}: the BLIS API integer size is 32-bit."
-	elif [[ ${int_type_size} = 64 ]]; then
-		echo "${script_name}: the BLIS API integer size is 64-bit."
-	else
-		echo "${script_name}: the BLIS API integer size is automatically determined."
-	fi
-	if [[ ${blas_int_type_size} = 32 ]]; then
-		echo "${script_name}: the BLAS/CBLAS API integer size is 32-bit."
-	elif [[ ${blas_int_type_size} = 64 ]]; then
-		echo "${script_name}: the BLAS/CBLAS API integer size is 64-bit."
-	else
-		echo "${script_name}: the BLAS/CBLAS API integer size is automatically determined."
-	fi
+   -p PATH, --path=PATH
 
-	# Disallow the simultaneous use of 64-bit integers in the BLAS and
-	# 32-bit integers in BLIS.
-	if [[ ${blas_int_type_size} = 64 && ${int_type_size} = 32 ]]; then
-		echo "${script_name}: *** To avoid the possibility of truncation, we do not allow use of 64-bit integers in the BLAS API with 32-bit integers in BLIS. Please use a different configuration of integers."
-		exit 1
-	fi
+                 Look for the plugin source in PATH instead of the current
+                 directory. This option is used to build the plugin
+                 out-of-tree. In this case, only '--enable-build' (== '--build')
+                 may be specified.
 
-	# Check whether we should use AMD-customized versions of certain framework
-	# files.
-	if [[ ${enable_amd_frame_tweaks} = yes ]]; then
+   -q, --quiet   Suppress informational output.
 
-		echo "${script_name}: AMD-specific framework files will be considered."
-		echo "${script_name}:   checking eligibility of target configuration."
+   -f, --force   Overwrite any files in the current directory which are
+                 normally copied by configure-plugin, for example 'Makefile'
+                 and 'config_registry'.
 
-		# Make sure we are targeting either one of the zen subconfigs or the
-		# amd64 umbrella family.
-		if [[ ${config_name} != *zen* && ${config_name} != *amd64* ]]; then
-			echo "${script_name}:   target configuration '${config_name}' is not eligible."
-			echo "${script_name}:   disabling AMD-specific framework files."
-			enable_amd_frame_tweaks='no'
-		else
-			echo "${script_name}:   target configuration '${config_name}' is eligible."
-			echo "${script_name}:   enabling AMD-specific framework files."
-		fi
-	else
-		echo "${script_name}: AMD-specific framework files will not be considered."
-	fi
+   -h, --help    Output this information and quit.
 
-	# Check if addons were given.
-	if [ -n "${addon_flag}" ]; then
+ Environment Variables:
 
-		# Remove duplicates in the addon list, if they exist.
-		addon_list=$(rm_duplicate_words_simple "${addon_list}")
+   CC            Specifies the C compiler to use.
+   CXX           Specifies the C++ compiler to use.
+   FC            Specifies the Fortran compiler to use.
+   AR            Specifies the static library archiver to use.
+   RANLIB        Specifies the ranlib (library indexer) executable to use.
+   CFLAGS        Specifies additional compiler flags to use (prepended).
+   LDFLAGS       Specifies additional linker flags to use (prepended).
 
-		echo "${script_name}: configuring with addons:"
+   Environment variables are traditionally set prior to running configure-plugin:
 
-		for addon in ${addon_list}; do
+     CC=gcc ./configure-plugin [options] plugin-name
 
-			echo "${script_name}:   ${addon_dir}/${addon}"
+   However, they may also be specified as command line options, e.g.:
 
-			addon_fullpath="${addon_dirpath}/${addon}"
+     ./configure-plugin [options] CC=gcc plugin-name
 
-			if [ ! -d "${addon_fullpath}" ]; then
-				echo "${script_name}: requested addon sub-directory does not exist! Cannot continue."
-				echo "${script_name}: *** Please verify addon existence and name."
-				exit 1
-			fi
-		done
+   Note that the compiler used must be compatible with the compiler used
+   to compile the BLIS library.
 
-		enable_addons_01=1
-	else
-		echo "${script_name}: configuring with no addons."
+EOF
 
-		enable_addons_01=0
-	fi
+	# Exit with non-zero exit status
+	exit 1
+}
 
-	# Check if a sandbox was given.
-	if [ -n "${sandbox_flag}" ]; then
+get_config_var()  # Print the text following "$1 :=" on matching line(s) of ${sharedir}/blis/config.mk.
+{
+	grep "^ *$1 *:=" "${sharedir}/blis/config.mk" | sed "s/$1 *:= *//"
+}
 
-		#sandbox_relpath="${sandbox_dir}/${sandbox}"
+maybe_echo()  # Echo all arguments only when quiet_flag is '0'; always returns success.
+{
+	if [ "${quiet_flag}" = '0' ]; then echo "$@"; fi
+}
 
-		echo "${script_name}: configuring for alternate gemm implementation:"
-		echo "${script_name}:   ${sandbox_dir}/${sandbox}"
+strip_examples()
+{
+	# Copy $1 to $2 minus example text: the span from the first '----->' line through the last '<-----' line, and any other line containing '<-----'.
 
-		sandbox_fullpath="${sandbox_dirpath}/${sandbox}"
+	local input="$1"
+	local output="$2"
 
-		if [ ! -d "${sandbox_fullpath}" ]; then
-			echo "${script_name}: requested sandbox sub-directory does not exist! Cannot continue."
-			echo "${script_name}: *** Please verify sandbox existence and name."
-			exit 1
-		fi
+	perl -0 -p -e 's/[^\n]*----->.*<-----[^\n]*//gms;' -e 's/[^\n]*<-----[^\n]*\n//gms;' "${input}" > "${output}"
+}
 
-		enable_sandbox_01=1
-	else
-		echo "${script_name}: configuring for conventional gemm implementation."
+plugin_main()
+{
+	# -- Basic names and paths --
 
-		enable_sandbox_01=0
-	fi
+	# The name of the script, stripped of any preceding path.
+	script_name=${0##*/}
 
-	# Check the method used for returning complex numbers.
-	if [[ ${complex_return} = default ]]; then
+	# The path to the script. We need this to find the top-level directory
+	# of the source distribution in the event that the user has chosen to
+	# build elsewhere.
+	sharedir=${0%"/${script_name}"}/..
+	add_config_var sharedir
+
+	# Other paths which we'll need to build the plugin
+	prefix=$(get_config_var prefix | sed 's/\$/\\\\\\$/g')
+	exec_prefix=$(get_config_var exec_prefix | sed 's/\$/\\\\\\$/g')
+	libdir=$(get_config_var libdir | sed 's/\$/\\\\\\$/g')
+	includedir=$(get_config_var includedir | sed 's/\$/\\\\\\$/g')
+	add_config_var prefix
+	add_config_var exec_prefix
+	add_config_var libdir
+	add_config_var includedir
+
+	# Compiler information which might be overridden for this plugin.
+	CC="${CC-$(get_config_var CC)}"
+	CXX="${CXX-$(get_config_var CXX)}"
+	FC="${FC-$(get_config_var FC)}"
+	PYTHON="${PYTHON-$(get_config_var PYTHON)}"
+	AR="${AR-$(get_config_var AR)}"
+	RANLIB="${RANLIB-$(get_config_var RANLIB)}"
+	add_config_var CC        found_cc
+	add_config_var CC_VENDOR cc_vendor
+	add_config_var CXX       found_cxx
+	add_config_var FC        found_fc
+	add_config_var AR        found_ar
+	add_config_var RANLIB    found_ranlib
+	add_config_var PYTHON    found_python
+
+	add_config_var gcc_older_than_4_9_0
+	add_config_var gcc_older_than_6_1_0
+	add_config_var gcc_older_than_9_1_0
+	add_config_var gcc_older_than_10_3_0
+	add_config_var clang_older_than_9_0_0
+	add_config_var clang_older_than_12_0_0
+	add_config_var aocc_older_than_2_0_0
+	add_config_var aocc_older_than_3_0_0
+
+	asm_dir="${sharedir}/blis"
+	omp_simd_path="${sharedir}/blis"
 
-		# If we prevoiusly found a Fortran compiler, let's query it to see what
-		# kind of complex return type it uses (gnu or intel). The 'gnu' style
-		# returns complex values from functions normally, via the C language
-		# return statement, while the 'intel' style returns them in a "hidden"
-		# parameter (inserted by the compiler) that precedes all other function
-		# parameters.
-		if [ -n "${found_fc}" ]; then
+	# Path to 'mirror-tree.sh' script.
+	mirror_tree_sh="${sharedir}/blis/mirror-tree.sh"
 
-			# Query the full vendor version string output. This includes the
-			# version number along with (potentially) a bunch of other textual
-			# clutter.
-			# NOTE: This maybe should use merged stdout/stderr rather than only
-			# stdout. But it works for now.
-			vendor_string="$(${found_fc} --version 2>/dev/null || :)"
+	# Path to 'gen-make-frag.sh' script.
+	gen_make_frags_sh="${sharedir}/blis/gen-make-frag.sh"
+	gen_make_frags_dirpath="${sharedir}/blis"
 
-			# Query the compiler "vendor" (ie: the compiler's simple name).
-			# The last part ({ read first rest ; echo $first ; }) is a workaround
-			# to OS X's egrep only returning the first match.
-			fc_vendor=$(echo "${vendor_string}" | grep -oE 'IFORT|GNU' |
-			            { read -r first rest ; echo "${first}"; })
+	# The major and minor/build .so version numbers.
+	so_version_major=$(get_config_var SO_MAJOR)
+	so_version_minorbuild=$(get_config_var SO_MINORB)
 
-			if [[ ${fc_vendor} = IFORT ]]; then
-				complex_return='intel'
-			elif [[ ${fc_vendor} = GNU ]]; then
-				complex_return='gnu'
-			else
-				echo "${script_name}: unable to determine Fortran compiler vendor!"
-				complex_return='gnu'
-			fi
-		else
-			complex_return='gnu'
-		fi
-	fi
+	# The preset value of CFLAGS and LDFLAGS (ie: compiler and linker flags
+	# to use in addition to those determined by the build system).
+	cflags_preset=$(get_config_var CFLAGS_PRESET)
+	cxxflags_preset=$(get_config_var CXXFLAGS_PRESET)
+	ldflags_preset=$(get_config_var LDFLAGS_PRESET)
+	add_config_var cflags_preset
+	add_config_var cxxflags_preset
+	add_config_var ldflags_preset
 
-	if [[ ${complex_return} = gnu ]]; then
-		complex_return_intel01='0'
-	elif [[ ${complex_return} = intel ]]; then
-		complex_return_intel01='1'
-	else
-		echo "${script_name}: unknown complex return type \"${complex_return}\"! Cannot continue."
-		echo "${script_name}: *** Acceptable values are \"gnu\" and \"intel\"."
-		exit 1
-	fi
+	# Option variables.
+	quiet_flag='0'
+	force_flag='0'
+	show_config_list='0'
+	examples_flag='1'
+	buildfiles_flag='1'
+	templates_flag='1'
 
-	echo "${script_name}: configuring complex return type as \"${complex_return}\"."
+	# Additional flags.
+	enable_verbose=$(get_config_var ENABLE_VERBOSE)
+	enable_static=$(get_config_var MK_ENABLE_STATIC)
+	enable_shared=$(get_config_var MK_ENABLE_SHARED)
+	enable_rpath=$(get_config_var MK_ENABLE_RPATH)
+	export_shared=$(get_config_var EXPORT_SHARED)
+	enable_asan=$(get_config_var MK_ENABLE_ASAN)
+	enable_debug=$(get_config_var ENABLE_DEBUG)
+	debug_type=$(get_config_var DEBUG_TYPE)
+	enable_arg_max_hack=$(get_config_var ARG_MAX_HACK)
+	add_config_var enable_verbose
+	add_config_var mk_enable_static enable_static
+	add_config_var mk_enable_shared enable_shared
+	add_config_var enable_rpath
+	add_config_var export_shared
+	add_config_var enable_asan
+	add_config_var enable_debug
+	add_config_var debug_type
+	add_config_var enable_arg_max_hack
 
-	# Variables that may contain forward slashes, such as paths, need extra
-	# escaping when used in sed commands. We insert those extra escape
-	# characters here so that the sed commands below do the right thing.
-	os_name_esc=$(echo     "${os_name}"      | sed 's/\//\\\//g')
-	prefix_esc=$(echo      "${prefix}"       | sed 's/\//\\\//g')
-	exec_prefix_esc=$(echo "${exec_prefix}"  | sed 's/\//\\\//g')
-	libdir_esc=$(echo      "${libdir}"       | sed 's/\//\\\//g')
-	includedir_esc=$(echo  "${includedir}"   | sed 's/\//\\\//g')
-	sharedir_esc=$(echo    "${sharedir}"     | sed 's/\//\\\//g')
-	dist_path_esc=$(echo   "${dist_path}"    | sed 's/\//\\\//g')
-	cc_esc=$(echo          "${found_cc}"     | sed 's/\//\\\//g')
-	cxx_esc=$(echo         "${found_cxx}"    | sed 's/\//\\\//g')
-	ar_esc=$(echo          "${found_ar}"     | sed 's/\//\\\//g')
-	ranlib_esc=$(echo      "${found_ranlib}" | sed 's/\//\\\//g')
-	python_esc=$(echo      "${found_python}" | sed 's/\//\\\//g')
-
-	libpthread_esc=$(echo "${LIBPTHREAD--lpthread}" | sed 's/\//\\\//g')
-	cflags_preset_esc=$(echo "${cflags_preset}" | sed 's/\//\\\//g')
-	ldflags_preset_esc=$(echo "${ldflags_preset}" | sed 's/\//\\\//g')
-
-	# For Windows builds, clear the libpthread_esc variable so that
-	# no pthreads library is substituted into config.mk. (Windows builds
-	# employ an implementation of pthreads that is internal to BLIS.)
-	if [[ "$is_win" == "yes" && "$cc_vendor" == "clang" ]]; then
-		libpthread_esc=
-	fi
+	# -- Configuration registry --
 
-	# We also clear the libpthread_esc variable for systemless builds
-	# (--disable-system).
-	if [[ "$enable_system" == "no" ]]; then
-		libpthread_esc=
-	fi
+	# The name of the chosen configuration (the configuration "family").
+	config_name=$(get_config_var CONFIG_NAME)
 
-	# Typically, there are no slashes in the version variable. However,
-	# downstream maintainers (such as those for Debian) may create custom
-	# tags in their local clones such as "upstream/0.4.1", which obviously
-	# contain slashes. This line, and subsequent use of the escaped variable
-	# for the version string, accommodates those use cases.
-	version_esc=$(echo "${version}" | sed 's/\//\\\//g')
+	# The list of sub-configurations associated with config_name.
+	config_list=''
 
-	# Create a #define for the configuration family (config_name).
-	uconf=$(echo "${config_name}" | tr '[:lower:]' '[:upper:]')
-	config_name_define="#define BLIS_FAMILY_${uconf}\n"
+	# The list of kernel sets that will be needed by the sub-configurations
+	# in config_list.
+	kernel_list=''
 
-	# Create a list of #defines, one for each configuration in config_list.
-	config_list_defines=""
-	for conf in ${config_list}; do
+	# The list of kernel:sub-configuration pairs for all kernels contained
+	# in kernel_list.
+	kconfig_map=''
+	add_config_var kconfig_map
 
-		# Convert the current config name to uppercase.
-		uconf=$(echo "${conf}" | tr '[:lower:]' '[:upper:]')
+	# BLIS version.
+	version=$(get_config_var VERSION)
 
-		# Create a #define and add it to the running list.
-		config_define="BLIS_CONFIG_${uconf}"
-		config_list_defines="${config_list_defines}#define ${config_define}\n"
-	done
+	# The list of all sub-configurations and configuration families.
+	full_config_list=$(get_config_var FULL_CONFIG_LIST)
+	full_subconfig_list=$(get_config_var FULL_SUBCONFIG_LIST)
 
-	# Create a list of #defines, one for each kernel set in kernel_list.
-	kernel_list_defines=""
-	for kern in ${kernel_list}; do
+	# The list of all kernel sets.
+	full_kernel_list=$(get_config_var FULL_KERNEL_LIST)
 
-		# Convert the current config name to uppercase.
-		uconf=$(echo "${kern}" | tr '[:lower:]' '[:upper:]')
+	# -- Command line option/argument parsing ----------------------------------
 
-		# Create a #define and add it to the running list.
-		kernel_define="BLIS_KERNELS_${uconf}"
-		kernel_list_defines="${kernel_list_defines}#define ${kernel_define}\n"
-	done
+	# By default build in-tree
+	plugin_dir="."
 
-	# Create a list of #includes, one for each addon in addon_list.
-	addon_list_includes=""
-	for addon in ${addon_list}; do
+	found=true
+	while [[ $found = true ]]; do
 
-		# Create a #define and add it to the running list.
-		addon_header="\"${addon}.h\""
-		addon_list_includes="${addon_list_includes}#include ${addon_header}\n"
-	done
+		# Process our command line options.
+		unset OPTIND
+		while getopts ":hcd:e:p:qf-:" opt; do
+			case $opt in
+				-)
+					case "$OPTARG" in
 
+						help)
+							print_usage_plugin
+							;;
 
-	# -- Determine whether we are performing an out-of-tree build --------------
+						init)
+							examples_flag=0
+							templates_flag=1
+							buildfiles_flag=0
+							;;
 
-	if [ "${dist_path}" != "./" ]; then
+						build)
+							examples_flag=0
+							templates_flag=0
+							buildfiles_flag=1
+							;;
 
-		# At this point, we know the user did not run "./configure". But we
-		# have not yet ruled out "<fullpath>/configure" or some # equivalent
-		# that uses relative paths. To further rule out these possibilities,
-		# we create a dummy file in the current build directory.
-		touch "./${dummy_file}"
+						quiet)
+							quiet_flag=1
+							;;
 
-		# If the dummy file we just created in the current directory does not
-		# appear in the source distribution path, then we are in a different
-		# directory and thus we must create a symbolic link.
-		if [ ! -f "${dist_path}/${dummy_file}" ]; then
-			configured_oot="yes"
-			#echo "${script_name}: detected out-of-tree build directory."
-		else
-			configured_oot="no"
-			#echo "${script_name}: detected in-tree build directory."
-		fi
+						force)
+							force_flag=1
+							;;
 
-		# Remove the dummy file.
-		rm -f "./${dummy_file}"
-	fi
+						enable-examples)
+							examples_flag=1
+							;;
+						disable-examples)
+							examples_flag=0
+							;;
 
+						enable-templates)
+							templates_flag=1
+							;;
+						disable-templates)
+							templates_flag=0
+							;;
 
-	# -- Instantiate config.mk file from template ------------------------------
-
-	# Begin substituting information into the config_mk_in file, outputting
-	# to config_mk_out.
-	echo "${script_name}: creating ${config_mk_out_path} from ${config_mk_in_path}"
-	sed <"${config_mk_in_path}" >"${config_mk_out_path}"          \
-	-e "s/@version@/${version_esc}/g"                             \
-	-e "s/@so_version_major@/${so_version_major}/g"               \
-	-e "s/@so_version_minorbuild@/${so_version_minorbuild}/g"     \
-	-e "s/@config_name@/${config_name}/g"                         \
-	-e "s/@config_list@/${config_list}/g"                         \
-	-e "s/@kernel_list@/${kernel_list}/g"                         \
-	-e "s/@kconfig_map@/${kconfig_map}/g"                         \
-	-e "s/@os_name@/${os_name_esc}/g"                             \
-	-e "s/@is_win@/${is_win}/g"                                   \
-	-e "s/@is_msvc@/${is_msvc}/g"                                 \
-	-e "s/@dist_path@/${dist_path_esc}/g"                         \
-	-e "s/@CC_VENDOR@/${cc_vendor}/g"                             \
-	-e "s/@gcc_older_than_4_9_0@/${gcc_older_than_4_9_0}/g"       \
-	-e "s/@gcc_older_than_6_1_0@/${gcc_older_than_6_1_0}/g"       \
-	-e "s/@gcc_older_than_9_1_0@/${gcc_older_than_9_1_0}/g"       \
-	-e "s/@gcc_older_than_10_3_0@/${gcc_older_than_10_3_0}/g"     \
-	-e "s/@clang_older_than_9_0_0@/${clang_older_than_9_0_0}/g"   \
-	-e "s/@clang_older_than_12_0_0@/${clang_older_than_12_0_0}/g" \
-	-e "s/@aocc_older_than_2_0_0@/${aocc_older_than_2_0_0}/g"     \
-	-e "s/@aocc_older_than_3_0_0@/${aocc_older_than_3_0_0}/g"     \
-	-e "s/@CC@/${cc_esc}/g"                                       \
-	-e "s/@CXX@/${cxx_esc}/g"                                     \
-	-e "s/@AR@/${ar_esc}/g"                                       \
-	-e "s/@RANLIB@/${ranlib_esc}/g"                               \
-	-e "s/@PYTHON@/${python_esc}/g"                               \
-	-e "s/@libpthread@/${libpthread_esc}/g"                       \
-	-e "s/@cflags_preset@/${cflags_preset_esc}/g"                 \
-	-e "s/@ldflags_preset@/${ldflags_preset_esc}/g"               \
-	-e "s/@enable_asan@/${enable_asan}/g"                         \
-	-e "s/@debug_type@/${debug_type}/g"                           \
-	-e "s/@enable_debug@/${enable_debug}/g"                       \
-	-e "s/@enable_system@/${enable_system}/g"                     \
-	-e "s/@threading_model@/${threading_model}/g"                 \
-	-e "s/@prefix@/${prefix_esc}/g"                               \
-	-e "s/@exec_prefix@/${exec_prefix_esc}/g"                     \
-	-e "s/@libdir@/${libdir_esc}/g"                               \
-	-e "s/@includedir@/${includedir_esc}/g"                       \
-	-e "s/@sharedir@/${sharedir_esc}/g"                           \
-	-e "s/@enable_verbose@/${enable_verbose}/g"                   \
-	-e "s/@configured_oot@/${configured_oot}/g"                   \
-	-e "s/@enable_arg_max_hack@/${enable_arg_max_hack}/g"         \
-	-e "s/@enable_static@/${enable_static}/g"                     \
-	-e "s/@enable_shared@/${enable_shared}/g"                     \
-	-e "s/@enable_rpath@/${enable_rpath}/g"                       \
-	-e "s/@export_shared@/${export_shared}/g"                     \
-	-e "s/@enable_blas@/${enable_blas}/g"                         \
-	-e "s/@enable_cblas@/${enable_cblas}/g"                       \
-	-e "s/@enable_amd_frame_tweaks@/${enable_amd_frame_tweaks}/g" \
-	-e "s/@enable_memkind@/${enable_memkind}/g"                   \
-	-e "s/@pragma_omp_simd@/${pragma_omp_simd}/g"                 \
-	-e "s/@addon_list@/${addon_list}/g"                           \
-	-e "s/@sandbox@/${sandbox}/g"
-
-	# -- Instantiate bli_config.h file from template ---------------------------
-
-	# Begin substituting information into the bli_config_h_in file, outputting
-	# to bli_config_h_out. NOTE: We use perl instead of sed because the version
-	# of sed used on OS X is old and does not handle the '\n' character
-	# intuitively, which was used when constructing ${config_name_define},
-	# ${config_list_defines}, and ${kernel_list_defines}.
-	echo "${script_name}: creating ${bli_config_h_out_path} from ${bli_config_h_in_path}"
-	<"${bli_config_h_in_path}" perl -p                                   \
-	-e "s/\@config_name_define\@/${config_name_define}/g;"               \
-	-e "s/\@config_list_defines\@/${config_list_defines}/g;"             \
-	-e "s/\@kernel_list_defines\@/${kernel_list_defines}/g;"             \
-	| sed >"${bli_config_h_out_path}"                                    \
-	-e "s/@version@/${version_esc}/g"                                    \
-	-e "s/@enable_system@/${enable_system_01}/g"                         \
-	-e "s/@enable_tls@/${enable_tls_01}/g"                               \
-	-e "s/@enable_openmp@/${enable_openmp_01}/g"                         \
-	-e "s/@enable_openmp_as_def@/${enable_openmp_as_def_01}/g"           \
-	-e "s/@enable_pthreads@/${enable_pthreads_01}/g"                     \
-	-e "s/@enable_pthreads_as_def@/${enable_pthreads_as_def_01}/g"       \
-	-e "s/@enable_hpx@/${enable_hpx_01}/g"                               \
-	-e "s/@enable_hpx_as_def@/${enable_hpx_as_def_01}/g"                 \
-	-e "s/@enable_jrir_rr@/${enable_jrir_rr_01}/g"                       \
-	-e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g"                   \
-	-e "s/@enable_jrir_tlb@/${enable_jrir_tlb_01}/g"                     \
-	-e "s/@enable_pba_pools@/${enable_pba_pools_01}/g"                   \
-	-e "s/@enable_sba_pools@/${enable_sba_pools_01}/g"                   \
-	-e "s/@enable_mem_tracing@/${enable_mem_tracing_01}/g"               \
-	-e "s/@int_type_size@/${int_type_size}/g"                            \
-	-e "s/@blas_int_type_size@/${blas_int_type_size}/g"                  \
-	-e "s/@enable_blas@/${enable_blas_01}/g"                             \
-	-e "s/@enable_cblas@/${enable_cblas_01}/g"                           \
-	-e "s/@enable_mixed_dt@/${enable_mixed_dt_01}/g"                     \
-	-e "s/@enable_mixed_dt_extra_mem@/${enable_mixed_dt_extra_mem_01}/g" \
-	-e "s/@enable_sup_handling@/${enable_sup_handling_01}/g"             \
-	-e "s/@enable_memkind@/${enable_memkind_01}/g"                       \
-	-e "s/@enable_trsm_preinversion@/${enable_trsm_preinversion_01}/g"   \
-	-e "s/@enable_pragma_omp_simd@/${enable_pragma_omp_simd_01}/g"       \
-	-e "s/@enable_sandbox@/${enable_sandbox_01}/g"                       \
-	-e "s/@enable_shared@/${enable_shared_01}/g"                         \
-	-e "s/@complex_return_intel@/${complex_return_intel01}/g"
-
-	# -- Instantiate bli_addon.h file from template ----------------------------
-
-	# Begin substituting information into the bli_addon_h_in file, outputting
-	# to bli_addon_h_out. NOTE: We use perl instead of sed because the version
-	# of sed used on OS X is old and does not handle the '\n' character
-	# intuitively, which was used when constructing ${addon_list_includes}.
-	echo "${script_name}: creating ${bli_addon_h_out_path} from ${bli_addon_h_in_path}"
-	perl -pe "s/\@addon_list_includes\@/${addon_list_includes}/g" "${bli_addon_h_in_path}" \
-	| sed -e "s/@enable_addons@/${enable_addons_01}/g" > "${bli_addon_h_out_path}"
+						enable-build)
+							buildfiles_flag=1
+							;;
+						disable-build)
+							buildfiles_flag=0
+							;;
 
-	# -- Create top-level object directories -----------------------------------
+						path=*)
+							plugin_dir=${OPTARG#*=}
+							;;
 
-	# Create obj sub-directories (if they do not already exist).
-	base_obj_dirpath="${obj_dirpath}/${config_name}"
+						enable-debug)
+							debug_type=noopt
+							;;
+						enable-debug=*)
+							debug_type=${OPTARG#*=}
+							;;
+						disable-debug)
+							debug_type='off'
+							;;
 
-	echo "${script_name}: creating ${base_obj_dirpath}"
-	mkdir -p "${base_obj_dirpath}"
+						enable-asan)
+							enable_asan='yes'
+							;;
+						disable-asan)
+							enable_asan='no'
+							;;
 
+						enable-verbose-make)
+							enable_verbose='yes'
+							;;
+						disable-verbose-make)
+							enable_verbose='no'
+							;;
 
-	obj_config_dirpath="${base_obj_dirpath}/${config_dir}"
+						enable-arg-max-hack)
+							enable_arg_max_hack='yes'
+							;;
+						disable-arg-max-hack)
+							enable_arg_max_hack='no'
+							;;
 
-	mkdir -p "${obj_config_dirpath}"
-	for conf in ${config_list}; do
-		echo "${script_name}: creating ${obj_config_dirpath}/${conf}"
-		mkdir -p "${obj_config_dirpath}/${conf}"
-	done
+						enable-static)
+							enable_static='yes'
+							;;
+						disable-static)
+							enable_static='no'
+							;;
 
+						enable-shared)
+							enable_shared='yes'
+							;;
+						disable-shared)
+							enable_shared='no'
+							;;
 
-	obj_kernels_dirpath="${base_obj_dirpath}/${kernels_dir}"
+						enable-rpath)
+							enable_rpath='yes'
+							;;
+						disable-rpath)
+							enable_rpath='no'
+							;;
 
-	mkdir -p "${obj_kernels_dirpath}"
-	for kern in ${kernel_list}; do
-		echo "${script_name}: creating ${obj_kernels_dirpath}/${kern}"
-		mkdir -p "${obj_kernels_dirpath}/${kern}"
-	done
+						export-shared=*)
+							export_shared=${OPTARG#*=}
+							;;
 
+						show-config-list)
+							show_config_list=1
+							;;
 
-	obj_refkern_dirpath="${base_obj_dirpath}/${refkern_dir}"
+						*)
+							print_usage_plugin
+							;;
+					esac;;
+				h)
+					print_usage_plugin
+					;;
+				d)
+					debug_type=$OPTARG
+					;;
+				e)
+					export_shared=$OPTARG
+					;;
+				p)
+					plugin_dir=$OPTARG
+					;;
+				q)
+					quiet_flag=1
+					;;
+				f)
+					force_flag=1
+					;;
+				c)
+					show_config_list=1
+					;;
+				\?)
+					print_usage_plugin
+					;;
+			esac
+		done
+		shift $((OPTIND - 1))
 
-	mkdir -p "${obj_refkern_dirpath}"
-	for conf in ${config_list}; do
-		echo "${script_name}: creating ${obj_refkern_dirpath}/${conf}"
-		mkdir -p "${obj_refkern_dirpath}/${conf}"
+		# Parse environment variables
+		found=false
+		while [ $# -gt 0 ]; do
+			case $1 in
+				*=*)
+					var=$(expr "$1" : '\([^=]*\)=')
+					value=$(expr "$1" : '[^=]*=\(.*\)')
+					eval "export $var=\$value"
+					shift
+					found=true
+					;;
+				*)
+					break
+					;;
+			esac
+		done
 	done
 
+	# The path to the directory in which we are building. We do this to
+	# make explicit that we distinguish between the top-level directory
+	# of the distribution and the directory in which we are building.
+	cur_dirpath="."
 
-	obj_frame_dirpath="${base_obj_dirpath}/${frame_dir}"
-
-	echo "${script_name}: creating ${obj_frame_dirpath}"
-	mkdir -p "${obj_frame_dirpath}"
+	# The name of the (top-level) configuration directory.
+	config_dir='config'
+	config_dirpath="${plugin_dir}/${config_dir}"
 
+	# The name of the (top-level) kernels directory.
+	kernels_dir='kernels'
+	kernels_dirpath="${plugin_dir}/${kernels_dir}"
 
-	if [ -n "${addon_flag}" ]; then
+	# The name of the (top-level) reference kernels directory.
+	refkern_dir='ref_kernels'
+	refkern_dirpath="${plugin_dir}/${refkern_dir}"
 
-		obj_addon_dirpath="${base_obj_dirpath}/${addon_dir}"
+	add_config_var plugin_dir
 
-		for addon in ${addon_list}; do
-			echo "${script_name}: creating ${obj_addon_dirpath}/${addon}"
-			mkdir -p "${obj_addon_dirpath}/${addon}"
-		done
+	# Get the name of the plugin to build.
+	if [ $# -gt "1" ]; then   # more than one configuration argument given.
+		print_usage_plugin
+	elif [ $# == "0" ]; then   # try to guess the plugin name.
+		plugin_h=$(ls ${plugin_dir}/bli_plugin_*.h 2>/dev/null)
+		if [ -z ${plugin_h} ]; then
+			print_usage_plugin
+		else
+			plugin_name=$(echo ${plugin_h} | sed -e 's/.*bli_plugin_//' -e 's/\.h//')
+		fi
+	else
+		plugin_name="${1}"
 	fi
+	add_config_var plugin_name
 
+	maybe_echo "${script_name}: configuring BLIS plugin '${plugin_name}'"
 
-	if [ -n "${sandbox_flag}" ]; then
+	if [[ ${plugin_dir} != . ]]; then
+		if [ ${templates_flag} == '1' ] ||
+		   [ ${examples_flag}  == '1' ]; then
+			echo "${script_name}: *** Only --enable-build may be specified when configuring out-of-tree."
+			echo "${script_name}: *** Please use '--build' with '-p' or '--path'."
+			exit 1
+		fi
+	fi
 
-		obj_sandbox_dirpath="${base_obj_dirpath}/${sandbox_dir}"
+	# Check if CFLAGS is non-empty.
+	if [ -n "${CFLAGS}" ]; then
+		cflags_preset="${CFLAGS}"
+		maybe_echo "${script_name}: detected preset CFLAGS; prepending:"
+		maybe_echo "${script_name}:   ${cflags_preset}"
+	else
+		cflags_preset=''
+		maybe_echo "${script_name}: no preset CFLAGS detected."
+	fi
 
-		echo "${script_name}: creating ${obj_sandbox_dirpath}/${sandbox}"
-		mkdir -p "${obj_sandbox_dirpath}/${sandbox}"
+	# Check if CXXFLAGS is non-empty.
+	if [ -n "${CXXFLAGS}" ]; then
+		cxxflags_preset="${CXXFLAGS}"
+		maybe_echo "${script_name}: detected preset CXXFLAGS; prepending:"
+		maybe_echo "${script_name}:   ${cxxflags_preset}"
+	else
+		cxxflags_preset=''
+		maybe_echo "${script_name}: no preset CXXFLAGS detected."
 	fi
 
+	# Check if LDFLAGS is non-empty.
+	if [ -n "${LDFLAGS}" ]; then
+		ldflags_preset="${LDFLAGS}"
+		maybe_echo "${script_name}: detected preset LDFLAGS; prepending:"
+		maybe_echo "${script_name}:   ${ldflags_preset}"
+	else
+		ldflags_preset=''
+		maybe_echo "${script_name}: no preset LDFLAGS detected."
+	fi
 
-	obj_blastest_dirpath="${base_obj_dirpath}/${blastest_dir}"
+	# Check if the verbose make flag was specified.
+	if [[ ${enable_verbose} = yes ]]; then
+		maybe_echo "${script_name}: enabling verbose make output. (disable with 'make V=0'.)"
+	else
+		maybe_echo "${script_name}: disabling verbose make output. (enable with 'make V=1'.)"
+	fi
 
-	echo "${script_name}: creating ${obj_blastest_dirpath}"
-	mkdir -p "${obj_blastest_dirpath}"
+	# Check if the ARG_MAX hack was requested; both outcomes are reported via maybe_echo for consistency.
+	if [[ ${enable_arg_max_hack} = yes ]]; then
+		maybe_echo "${script_name}: enabling ARG_MAX hack."
+	else
+		maybe_echo "${script_name}: disabling ARG_MAX hack."
+	fi
 
+	# Check if the debug flag was specified.
+	if [[ ${debug_type} = opt ]]; then
+		enable_debug='yes'
+		maybe_echo "${script_name}: enabling debug symbols with optimizations."
+	elif [[ ${debug_type} = sde ]]; then
+		enable_debug='yes'
+		maybe_echo "${script_name}: enabling SDE processor emulation."
+	elif [[ ${debug_type} = noopt ]]; then
+		enable_debug='yes'
+		maybe_echo "${script_name}: enabling debug symbols; optimizations disabled."
+	else
+		debug_type='off'
+		enable_debug='no'
+		maybe_echo "${script_name}: debug symbols disabled."
+	fi
 
-	obj_testsuite_dirpath="${base_obj_dirpath}/${testsuite_dir}"
+	# Check if the AddressSanitizer flag was specified.
+	if [[ ${enable_asan} = yes ]]; then
+		maybe_echo "${script_name}: enabling AddressSanitizer support (except for optimized kernels)."
+	else
+		enable_asan='no'
+		maybe_echo "${script_name}: AddressSanitizer support disabled."
+	fi
 
-	echo "${script_name}: creating ${obj_testsuite_dirpath}"
-	mkdir -p "${obj_testsuite_dirpath}"
+	# Check the static/shared flags and set enable_shared_01 (1 if a shared library is built, else 0); the fatal branch uses plain echo like the other error exits here.
+	if   [[ ${enable_static} = yes && ${enable_shared} = yes ]]; then
+		maybe_echo "${script_name}: building BLIS plugin '${plugin_name}' as both static and shared libraries."
+		enable_shared_01=1
+	elif [[ ${enable_static} = no && ${enable_shared} = yes ]]; then
+		maybe_echo "${script_name}: building BLIS plugin '${plugin_name}' as a shared library (static library disabled)."
+		enable_shared_01=1
+	elif [[ ${enable_static} = yes && ${enable_shared} = no ]]; then
+		maybe_echo "${script_name}: building BLIS plugin '${plugin_name}' as a static library (shared library disabled)."
+		enable_shared_01=0
+	else
+		echo "${script_name}: Both static and shared libraries were disabled."
+		echo "${script_name}: *** Please enable one (or both) to continue."
+		exit 1
+	fi
 
+	# Check if the "export shared" flag was specified.
+	if [[ ${export_shared} = all ]]; then
+		if [[ ${enable_shared} = yes ]]; then
+			echo "${script_name}: exporting all symbols within shared library."
+		else
+			echo "${script_name}: ignoring request to export all symbols within shared library."
+		fi
+	elif [[ ${export_shared} = public ]]; then
+		if [[ ${enable_shared} = yes ]]; then
+			echo "${script_name}: exporting only public symbols within shared library."
+		fi
+	else
+		echo "${script_name}: *** Invalid argument '${export_shared}' to --export-shared option given."
+		echo "${script_name}: *** Please use 'public' or 'all'."
+		exit 1
+	fi
 
-	# Create lib directory (if it does not already exist).
-	base_lib_dirpath="${lib_dirpath}/${config_name}"
+	if [ ${templates_flag} == '1' ]; then
 
-	echo "${script_name}: creating ${base_lib_dirpath}"
-	mkdir -p "${base_lib_dirpath}"
+		plugin_h="\"bli_plugin_${plugin_name}.h\""
 
+		# -- config_registry --
 
-	# Create include directory (if it does not already exist).
-	base_include_dirpath="${include_dirpath}/${config_name}"
+		maybe_echo -n "${script_name}: copying the configuration registry..."
 
-	echo "${script_name}: creating ${base_include_dirpath}"
-	mkdir -p "${base_include_dirpath}"
+		if [ -e config_registry ] && [ ${force_flag} == '0' ]; then
+			maybe_echo "already done"
+		else
+			cp ${sharedir}/blis/config_registry config_registry
+			maybe_echo "done"
+		fi
 
+		# -- bli_plugin_register.c --
 
-	# -- Mirror source directory hierarchies to object directories -------------
+		maybe_echo -n "${script_name}: copying bli_plugin_register.c..."
 
-	# Combine the config_list with the config_name and then remove duplicates.
-	config_list_plus_name=$(rm_duplicate_words "${config_list} ${config_name}")
+		if [ -e bli_plugin_register.c ] && [ ${force_flag} == '0' ]; then
+			maybe_echo "already done"
+		else
+			if [ ${examples_flag} == '1' ]; then
+				cp ${sharedir}/blis/plugin/bli_plugin_register.c bli_plugin_register.c
+			else
+				strip_examples ${sharedir}/blis/plugin/bli_plugin_register.c bli_plugin_register.c
+			fi
+			perl -pi -e "s|\@PLUGIN_HEADER\@|${plugin_h}|" bli_plugin_register.c
+			maybe_echo "done"
+		fi
 
-	# Mirror each of the sub-configuration directories to the object directory.
-	for conf in ${config_list_plus_name}; do
+		# -- bli_plugin_<name>.h --
 
-		echo "${script_name}: mirroring ${config_dirpath}/${conf} to ${obj_config_dirpath}/${conf}"
-		"${mirror_tree_sh}" "${config_dirpath}/${conf}" "${obj_config_dirpath}/${conf}"
-	done
+		if [ -e bli_plugin_${plugin_name}.h ] && [ ${force_flag} == '0' ]; then
+			maybe_echo "${script_name}: creating bli_plugin_${plugin_name}.h from ${sharedir}/blis/plugin/bli_plugin.h.in...already done"
+		else
+			if [ ${examples_flag} == '1' ]; then
+				generate_config_file ${sharedir}/blis/plugin/bli_plugin.h.in bli_plugin_${plugin_name}.h
+			else
+				generate_config_file ${sharedir}/blis/plugin/bli_plugin.h.in bli_plugin_${plugin_name}_.h
+				strip_examples bli_plugin_${plugin_name}_.h bli_plugin_${plugin_name}.h
+				rm bli_plugin_${plugin_name}_.h
+			fi
+		fi
 
-	# Mirror optimized kernels source tree to its object sub-directory.
-	# We perform the mirroring on each configuration/kernel sub-directory
-	# within 'kernels'.
-	for kern in ${kernel_list}; do
+		# -- ref_kernels directory --
 
-		# Only mirror the optimized kernels source directory if it exists.
-		# There are occasions where one of the sub-configurations in the
-		# config_list does not correspond to a kernels sub-directory, such
-		# as when architecture B is so close to architecture A that B can
-		# use A's kernel source code unmodified (though perhaps with
-		# different blocksizes).
-		#if [ -d "${kernels_dirpath}/${conf}" ]; then
+		maybe_echo -n "${script_name}: creating ref_kernels directory..."
 
-		echo "${script_name}: mirroring ${kernels_dirpath}/${kern} to ${obj_kernels_dirpath}/${kern}"
-		${mirror_tree_sh} "${kernels_dirpath}/${kern}" "${obj_kernels_dirpath}/${kern}"
-		#else
-		#	echo "${script_name}: mirroring ${kernels_dirpath}/${conf} skipped... directory does not exist"
-		#fi
-	done
+		done="true"
+		if [ ${examples_flag} == '1' ]; then
+			files="bli_plugin_init_ref.c my_kernel_1_ref.c my_kernel_2_ref.c"
+		else
+			files="bli_plugin_init_ref.c"
+		fi
+		for file in ${files}; do
+			if [ ! -e ref_kernels/${file} ] || [ ${force_flag} == '1' ]; then
+				mkdir -p ref_kernels
+				if [ ${examples_flag} == '1' ]; then
+					cp ${sharedir}/blis/plugin/${file} ref_kernels/${file}
+				else
+					strip_examples ${sharedir}/blis/plugin/${file} ref_kernels/${file}
+				fi
+				perl -pi -e "s|\@PLUGIN_HEADER\@|${plugin_h}|" ref_kernels/${file}
+				done="false"
+			fi
+		done
 
-	# Mirror reference kernel source tree to its object sub-directory.
-	echo "${script_name}: mirroring ${refkern_dirpath} to ${obj_refkern_dirpath}"
-	"${mirror_tree_sh}" "${refkern_dirpath}" "${obj_refkern_dirpath}"
+		if [ "${done}" == "true" ]; then
+			maybe_echo "already done"
+		else
+			maybe_echo "done"
+		fi
 
-	# Mirror reference kernels source tree to its object sub-directory.
-	for conf in ${config_list}; do
+		# -- config directory --
 
-		echo "${script_name}: mirroring ${refkern_dirpath} to ${obj_refkern_dirpath}/${conf}"
-		"${mirror_tree_sh}" "${refkern_dirpath}" "${obj_refkern_dirpath}/${conf}"
-	done
+		maybe_echo "${script_name}: creating config directories:"
 
-	# Mirror framework source tree to its object sub-directory.
-	echo "${script_name}: mirroring ${frame_dirpath} to ${obj_frame_dirpath}"
-	"${mirror_tree_sh}" "${frame_dirpath}" "${obj_frame_dirpath}"
+		for config in ${full_config_list}; do
+			maybe_echo -n "${script_name}:   config/${config}..."
 
-	# Mirror the chosen addon source tree to its object sub-directory.
-	if [[ -n ${addon_flag} ]]; then
+			if [ -e config/${config}/make_defs.mk ] && [ ${force_flag} == '0' ]; then
+				maybe_echo "already done"
+			else
+				mkdir -p config/${config}
+				cp ${sharedir}/blis/config/${config}/make_defs.mk config/${config}
+				maybe_echo "done"
+			fi
+		done
 
-		for addon in ${addon_list}; do
+		for config in ${full_subconfig_list}; do
+			if [ ! -e config/${config}/bli_plugin_init_${config}.c ] || [ ${force_flag} == '1' ]; then
+				if [ ${config} != zen3 ] || [ ${examples_flag} == '0' ]; then
+					strip_examples ${sharedir}/blis/plugin/bli_plugin_init_zen3.c config/${config}/bli_plugin_init_${config}.c
+					perl -pi -e "s/zen3/${config}/g" config/${config}/bli_plugin_init_${config}.c
+				else
+					cp ${sharedir}/blis/plugin/bli_plugin_init_zen3.c config/${config}/bli_plugin_init_${config}.c
+				fi
+				perl -pi -e "s|\@PLUGIN_HEADER\@|${plugin_h}|" config/${config}/bli_plugin_init_${config}.c
+			fi
 
-			echo "${script_name}: mirroring ${addon_dirpath}/${addon} to ${obj_addon_dirpath}/${addon}"
-			"${mirror_tree_sh}" "${addon_dirpath}/${addon}" "${obj_addon_dirpath}/${addon}"
+			if [ ! -e config/${config}/bli_kernel_defs_${config}.h ] || [ ${force_flag} == '1' ]; then
+				cp ${sharedir}/blis/config/${config}/bli_kernel_defs_${config}.h config/${config}/bli_kernel_defs_${config}.h
+			fi
 		done
-	fi
-
-	# Mirror the chosen sandbox source tree to its object sub-directory.
-	if [[ -n ${sandbox_flag} ]]; then
 
-		echo "${script_name}: mirroring ${sandbox_dirpath}/${sandbox} to ${obj_sandbox_dirpath}/${sandbox}"
-		"${mirror_tree_sh}" "${sandbox_dirpath}/${sandbox}" "${obj_sandbox_dirpath}/${sandbox}"
-	fi
+		# -- kernels directory --
 
+		maybe_echo "${script_name}: creating kernels directories:"
 
-	# -- Generate makefile fragements ------------------------------------------
+		for kernels in ${full_kernel_list}; do
+			maybe_echo -n "${script_name}:   kernels/${kernels}..."
 
-	create_makefile_fragment() {
-		echo "${script_name}: creating makefile fragments in $3"
-		"${gen_make_frags_sh}"                           \
-			-h -r -v0                                \
-			-o "${script_name}"                      \
-			-p "$1" "$2" "$3"                        \
-			"${gen_make_frags_dirpath}/fragment.mk"  \
-			"${gen_make_frags_dirpath}/suffix_list"  \
-			"${gen_make_frags_dirpath}/ignore_list"
-	}
+			if [ -e kernels/${kernels} ] && [ ${force_flag} == '0' ]; then
+				maybe_echo "already done"
+			else
+				mkdir -p kernels/${kernels}
+				maybe_echo "done"
+			fi
+		done
 
-	clist_contains_cname=$(is_in_list "${config_name}" "${config_list}")
+		if [ ${examples_flag} == '1' ]; then
+			if [ ! -e kernels/zen3/my_kernel_1_zen3.c ] || [ ${force_flag} == '1' ]; then
+				cp ${sharedir}/blis/plugin/my_kernel_1_zen3.c kernels/zen3
+				perl -pi -e "s|\@PLUGIN_HEADER\@|${plugin_h}|" kernels/zen3/my_kernel_1_zen3.c
+			fi
+		fi
 
-	# If the config_list does not already contain the config_name (i.e.,
-	# if config_name is an umbrella family), generate makefiles in that
-	# directory. (In the next step, we will loop over the actual sub-
-	# configurations and create fragments there as well.)
-	if [[ ${clist_contains_cname} = false ]]; then
-		create_makefile_fragment CONFIG "${config_dirpath}/${config_name}" \
-		                         "${obj_config_dirpath}/${config_name}"
 	fi
 
-	# Generate makefile fragments for each of the sub-configurations present
-	# in the configuration list.
-	for conf in ${config_list}; do
-		create_makefile_fragment CONFIG "${config_dirpath}/${conf}" \
-		                         "${obj_config_dirpath}/${conf}"
-	done
+	if [ ${buildfiles_flag} == '1' ]; then
 
-	# Generate makefile fragments for each of the kernel sets required by
-	# the configuration list (in the kernel list).
-	for kern in ${kernel_list}; do
-		create_makefile_fragment KERNELS "${kernels_dirpath}/${kern}" \
-		                         "${obj_kernels_dirpath}/${kern}"
-	done
+		check_build_tools
 
-	# Generate makefile fragments in the reference kernels directory.
-	create_makefile_fragment REFKERN "${refkern_dirpath}" \
-	                         "${obj_refkern_dirpath}"
+		# Try to determine whether the chosen compiler supports #pragma omp simd.
+		pragma_omp_simd=$(has_pragma_omp_simd)
+		add_config_var pragma_omp_simd
 
-	# Generate makefile fragments in the framework directory.
-	create_makefile_fragment FRAME "${frame_dirpath}" \
-	                         "${obj_frame_dirpath}"
+		# -- Makefile --
 
-	# Generate makefile fragments in the addon sub-directory.
-	if [[ -n ${addon_flag} ]]; then
-		for addon in ${addon_list}; do
-			create_makefile_fragment ADDON "${addon_dirpath}/${addon}" \
-			                         "${obj_addon_dirpath}/${addon}"
-		done
-	fi
+		maybe_echo -n "${script_name}: copying Makefile..."
 
-	# Generate makefile fragments in the sandbox sub-directory.
-	if [[ -n ${sandbox_flag} ]]; then
-		create_makefile_fragment SANDBOX "${sandbox_dirpath}/${sandbox}" \
-		                         "${obj_sandbox_dirpath}/${sandbox}"
-	fi
+		if [ -e Makefile ] && [ ${force_flag} == '0' ]; then
+			maybe_echo "already done"
+		else
+			cp ${sharedir}/blis/plugin/Makefile Makefile
+			maybe_echo "done"
+		fi
 
+		# -- config.mk --
 
-	# -- Handle out-of-tree builds ---------------------------------------------
+		# The name/path to the registry (master list) of supported configurations.
+		registry_file="config_registry"
+		registry_filepath=${plugin_dir}/${registry_file}
 
-	# Under some circumstances, we need to create some symbolic links to
-	# properly handle out-of-tree builds.
-	if [[ ${configured_oot} = yes ]]; then
-		for file in Makefile blis.pc.in common.mk config; do
-			# If symlink does not already exist in the current
-			# directory, create a symbolic link to it. If one does exist, we
-			# use -f to force creation of a new link.
-			if [[ ! -e ${file} ]]; then
-				echo "${script_name}: creating symbolic link to ${file}."
-				ln -s "${dist_path}/${file}" .
-			elif [[ -h ${file} ]]; then
-				echo "${script_name}: symbolic link to ${file} already exists; forcing creation of new link."
-				ln -sf "${dist_path}/${file}" .
-			else
-				echo "${script_name}: Non-symbolic link file or directory '${file}' blocks creation of symlink."
-				echo "${script_name}: *** Please remove this entity and re-run configure."
-				exit 1
-			fi
-		done
+		# Read the registered configuration names and lists into associative
+		# arrays.
+		echo -n "${script_name}: reading configuration registry..."
+		read_registry_file "${registry_filepath}"
+		echo "done."
 
-		echo "${script_name}: configured to build outside of source distribution."
-	else
+		build_and_check_configurations
 
-		echo "${script_name}: configured to build within top-level directory of source distribution."
-	fi
+		generate_config_file ${sharedir}/blis/plugin/config.mk.in config.mk
 
-	if [ "${warn_user_generic}" = "1" ]; then
+		# -- Makefile fragments --
+
+		# The extra '/.' on the FRAME call is necessary to trick create_makefile_fragment
+		# into not taking the last entry in ${plugin_dir} as the 'current directory name'.
+		create_makefile_fragment FRAME ${plugin_dir}/. obj/${config_name} false
+		create_makefile_fragment CONFIG ${config_dirpath} obj/${config_name}/config
+		create_makefile_fragment KERNELS ${kernels_dirpath} obj/${config_name}/kernels
+		create_makefile_fragment REFKERN ${refkern_dirpath} obj/${config_name}/ref_kernels
 
-		echo "${script_name}: "
-		echo "${script_name}: *** Unable to automatically detect hardware type! ***"
-		echo "${script_name}: "
-		echo "${script_name}: NOTE: configure was unable to identify a subconfiguration"
-		echo "${script_name}: optimized for your hardware. As a result, the 'generic'"
-		echo "${script_name}: subconfiguration (with low-performance reference kernels)"
-		echo "${script_name}: will be used. For support, please open an issue on GitHub"
-		echo "${script_name}: at https://github.com/flame/blis/issues."
-		echo "${script_name}: "
 	fi
 
 	# Exit peacefully.
@@ -4546,4 +5376,11 @@ main()
 
 
 # The script's main entry point, passing all parameters given.
-main "$@"
+case ${0##*/} in # dispatch on the basename of the path this script was invoked as
+	configure) # invoked as 'configure': run blis_main with all arguments
+		blis_main "$@"
+		;;
+	configure-plugin) # invoked as 'configure-plugin': run plugin_main with all arguments
+		plugin_main "$@"
+		;;
+esac # NOTE(review): no default branch, so any other script name runs neither main
diff --git a/docs/ConfigurationHowTo.md b/docs/ConfigurationHowTo.md
index cc1224182..9217ae9fd 100644
--- a/docs/ConfigurationHowTo.md
+++ b/docs/ConfigurationHowTo.md
@@ -690,25 +690,32 @@ Adding support for a new-subconfiguration to BLIS is similar to adding support f
 
 
-   * **`frame/base/bli_gks.c`**. We must also update the global kernel structure, or gks, to register the new sub-configuration during library initialization. Sub-configuration registration occurs in `bli_gks_init()`. For `knl`, updating this function amounts to inserting the following lines
+   * **`frame/include/bli_gentconf_macro_defs.h`**. We must also update the macro which automatically generates code which
+   should be executed for each enabled sub-configuration. This macro update requires changes in two places: first we must conditionally define a
+   macro for our new sub-configuration, and then we can invoke (call) that macro from the generic `INSERT_GENTCONF` macro. For `knl`, the
+   first, sub-configuration-specific macro takes the form,
       ```c
+      // -- KNL microarchitecture --
       #ifdef BLIS_CONFIG_KNL
-              bli_gks_register_cntx( BLIS_ARCH_KNL, bli_cntx_init_knl,
-                                                    bli_cntx_init_knl_ref,
-                                                    bli_cntx_init_knl_ind );
+      #define INSERT_GENTCONF_KNL GENTCONF( KNL, knl )
+      #else
+      #define INSERT_GENTCONF_KNL
       #endif
       ```
-      This function submits pointers to various context initialization functions to the global kernel structure, which are then stored and called at the appropriate time. The functions **must** be named strictly according to the format shown in the example above, with `knl` replaced with the sub-configuration name. Also, note the call to `bli_gks_register_cntx` is guarded by `BLIS_CONFIG_KNL`. This macro is automatically `#defined` by the build system if and when the `knl` sub-configuration is enabled at configure-time, either directly as a singleton family or indirectly via an umbrella family.
-
-
-
-   * **`frame/include/bli_arch_config.h`**. This file must be updated in two places. First, we must modify it to generate prototypes for the `bli_cntx_init_*()` functions, including the developer-provided function `bli_cntx_init_knl()` (defined in `config/knl/bli_cntx_init_knl.c`), by inserting:
+      Note the upper-case `KNL` tag which is used in various pre-defined macros such as `BLIS_CONFIG_KNL`, and the lower-case
+      tag `knl` which is used in generating function names such as `bli_cntx_init_knl_ref`. The second modification to make is
+      to add a call to this macro from `INSERT_GENTCONF`,
       ```c
-      #ifdef BLIS_CONFIG_KNL
-      CNTX_INIT_PROTS( knl )
-      #endif
+      #define INSERT_GENTCONF \
+      ...
+      INSERT_GENTCONF_KNL \
+      ...
       ```
-      Here, the `CNTX_INIT_PROTS` macro generates the appropriate prototypes based on the name of the sub-configuration. Next, we must `#include` the `bli_family_knl.h` header file, just as we would if we were adding support for an umbrella family:
+      This will automatically handle most code fragments which depend on a specific sub-configuration, such as creating
+      reference contexts in the global kernel structure.
+
+   * **`frame/include/bli_arch_config.h`**. This file must be modified by adding an `#include` of the `bli_family_knl.h`
+   header file, just as we would if we were adding support for an umbrella family:
       ```c
       #ifdef BLIS_FAMILY_KNL
       #include "bli_family_knl.h"
@@ -759,7 +766,6 @@ Adding support for a new-subconfiguration to BLIS is similar to adding support f
 
           "generic"
       };
-
       ```
       This array is used by `bli_arch_string()` when mapping `arch_t` values to the strings associated with that architecture ID. Because the `arch_t` value is used as the index of each string, **the relative order of the strings in this array is important**. Be sure to insert the new string (in our case, `"knl"`) at the **same relative location** as the `arch_t` value inserted in `bli_type_defs.h`. This will ensure that each `arch_t` value will map to its corresponding string in the `config_name` array.
 
diff --git a/frame/0/bli_l0_ft.h b/frame/0/bli_l0_ft.h
index 9ca69d534..4c9933d84 100644
--- a/frame/0/bli_l0_ft.h
+++ b/frame/0/bli_l0_ft.h
@@ -42,7 +42,7 @@
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH2(ch,opname,tsuf)) \
+typedef void (*PASTECH(ch,opname,tsuf)) \
      ( \
              conj_t conjchi, \
        const ctype* chi, \
@@ -59,7 +59,7 @@ INSERT_GENTDEF( invertsc )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH2(ch,opname,tsuf)) \
+typedef void (*PASTECH(ch,opname,tsuf)) \
      ( \
              conj_t conjchi, \
        const ctype* chi, \
@@ -73,7 +73,7 @@ INSERT_GENTDEF( mulsc )
 #undef  GENTDEFR
 #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
 \
-typedef void (*PASTECH2(ch,opname,tsuf)) \
+typedef void (*PASTECH(ch,opname,tsuf)) \
      ( \
        const ctype*   chi, \
              ctype_r* absq  \
@@ -86,7 +86,7 @@ INSERT_GENTDEFR( absqsc )
 #undef  GENTDEFR
 #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
 \
-typedef void (*PASTECH2(ch,opname,tsuf)) \
+typedef void (*PASTECH(ch,opname,tsuf)) \
      ( \
        const ctype*   chi, \
              ctype_r* norm  \
@@ -99,7 +99,7 @@ INSERT_GENTDEFR( normfsc )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH2(ch,opname,tsuf)) \
+typedef void (*PASTECH(ch,opname,tsuf)) \
      ( \
        const ctype* chi, \
              ctype* psi  \
@@ -112,7 +112,7 @@ INSERT_GENTDEF( sqrtsc )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH2(ch,opname,tsuf)) \
+typedef void (*PASTECH(ch,opname,tsuf)) \
      ( \
        const ctype* chi, \
              ctype* psi  \
@@ -125,7 +125,7 @@ INSERT_GENTDEF( sqrtrsc )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH2(ch,opname,tsuf)) \
+typedef void (*PASTECH(ch,opname,tsuf)) \
      ( \
        const ctype*  chi, \
              double* zeta_r, \
@@ -139,7 +139,7 @@ INSERT_GENTDEF( getsc )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH2(ch,opname,tsuf)) \
+typedef void (*PASTECH(ch,opname,tsuf)) \
      ( \
        double  zeta_r, \
        double  zeta_i, \
@@ -153,7 +153,7 @@ INSERT_GENTDEF( setsc )
 #undef  GENTDEFR
 #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
 \
-typedef void (*PASTECH2(ch,opname,tsuf)) \
+typedef void (*PASTECH(ch,opname,tsuf)) \
      ( \
        const ctype*   chi, \
              ctype_r* zeta_r, \
@@ -167,7 +167,7 @@ INSERT_GENTDEFR( unzipsc )
 #undef  GENTDEFR
 #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
 \
-typedef void (*PASTECH2(ch,opname,tsuf)) \
+typedef void (*PASTECH(ch,opname,tsuf)) \
      ( \
        const ctype_r* zeta_r, \
        const ctype_r* zeta_i, \
diff --git a/frame/0/bli_l0_oapi.c b/frame/0/bli_l0_oapi.c
index 612babe56..d30540ff9 100644
--- a/frame/0/bli_l0_oapi.c
+++ b/frame/0/bli_l0_oapi.c
@@ -41,7 +41,7 @@
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t* chi, \
        const obj_t* absq  \
@@ -81,7 +81,7 @@ GENFRONT( normfsc )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t* chi, \
        const obj_t* psi  \
@@ -121,7 +121,7 @@ GENFRONT( invertsc )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t* chi, \
        const obj_t* psi  \
@@ -155,7 +155,7 @@ GENFRONT( sqrtrsc )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t*  chi, \
              double* zeta_r, \
@@ -199,7 +199,7 @@ GENFRONT( getsc )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
              double zeta_r, \
              double zeta_i, \
@@ -233,7 +233,7 @@ GENFRONT( setsc )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t* chi, \
        const obj_t* zeta_r, \
@@ -276,7 +276,7 @@ GENFRONT( unzipsc )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t* zeta_r, \
        const obj_t* zeta_i, \
diff --git a/frame/0/bli_l0_oapi.h b/frame/0/bli_l0_oapi.h
index 713da0d70..ad342b232 100644
--- a/frame/0/bli_l0_oapi.h
+++ b/frame/0/bli_l0_oapi.h
@@ -40,7 +40,7 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
        const obj_t* chi, \
        const obj_t* absq  \
@@ -53,7 +53,7 @@ GENPROT( normfsc )
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
        const obj_t* chi, \
        const obj_t* psi  \
@@ -71,7 +71,7 @@ GENPROT( invertsc )
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
        const obj_t*  chi, \
              double* zeta_r, \
@@ -84,7 +84,7 @@ GENPROT( getsc )
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
              double zeta_r, \
              double zeta_i, \
@@ -97,7 +97,7 @@ GENPROT( setsc )
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
        const obj_t* chi, \
        const obj_t* zeta_r, \
@@ -110,7 +110,7 @@ GENPROT( unzipsc )
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
        const obj_t* zeta_r, \
        const obj_t* zeta_i, \
diff --git a/frame/0/bli_l0_tapi.c b/frame/0/bli_l0_tapi.c
index 6da19e31b..ef2a942f3 100644
--- a/frame/0/bli_l0_tapi.c
+++ b/frame/0/bli_l0_tapi.c
@@ -129,7 +129,7 @@ void PASTEMAC(ch,opname) \
 \
 	( void )absq_i; \
 \
-	PASTEMAC2(ch,chr,gets)( *chi, chi_r, chi_i ); \
+	PASTEMAC(ch,chr,gets)( *chi, chi_r, chi_i ); \
 \
 	/* absq   = chi_r * chi_r + chi_i * chi_i; \
 	   absq_r = 0.0; (thrown away) */ \
@@ -153,7 +153,7 @@ void PASTEMAC(ch,opname) \
 	bli_init_once(); \
 \
 	/* norm = sqrt( chi_r * chi_r + chi_i * chi_i ); */ \
-	PASTEMAC2(ch,chr,abval2s)( *chi, *norm ); \
+	PASTEMAC(ch,chr,abval2s)( *chi, *norm ); \
 }
 
 INSERT_GENTFUNCR_BASIC( normfsc )
@@ -190,7 +190,7 @@ void PASTEMAC(ch,opname) \
 \
 	const ctype_r chi_r = PASTEMAC(ch,real)( *chi ); \
 \
-	PASTEMAC2(chr,ch,sqrt2s)( chi_r, *psi ); \
+	PASTEMAC(chr,ch,sqrt2s)( chi_r, *psi ); \
 }
 
 INSERT_GENTFUNCR_BASIC( sqrtrsc )
@@ -208,7 +208,7 @@ void PASTEMAC(ch,opname) \
 { \
 	bli_init_once(); \
 \
-	PASTEMAC2(ch,d,gets)( *chi, *zeta_r, *zeta_i ); \
+	PASTEMAC(ch,d,gets)( *chi, *zeta_r, *zeta_i ); \
 }
 
 INSERT_GENTFUNC_BASIC( getsc )
@@ -226,7 +226,7 @@ void PASTEMAC(ch,opname) \
 { \
 	bli_init_once(); \
 \
-	PASTEMAC2(d,ch,sets)( zeta_r, zeta_i, *chi ); \
+	PASTEMAC(d,ch,sets)( zeta_r, zeta_i, *chi ); \
 }
 
 INSERT_GENTFUNC_BASIC( setsc )
@@ -244,7 +244,7 @@ void PASTEMAC(ch,opname) \
 { \
 	bli_init_once(); \
 \
-	PASTEMAC2(ch,chr,gets)( *chi, *zeta_r, *zeta_i ); \
+	PASTEMAC(ch,chr,gets)( *chi, *zeta_r, *zeta_i ); \
 }
 
 INSERT_GENTFUNCR_BASIC( unzipsc )
@@ -262,7 +262,7 @@ void PASTEMAC(ch,opname) \
 { \
 	bli_init_once(); \
 \
-	PASTEMAC2(chr,ch,sets)( *zeta_r, *zeta_i, *chi ); \
+	PASTEMAC(chr,ch,sets)( *zeta_r, *zeta_i, *chi ); \
 }
 
 INSERT_GENTFUNCR_BASIC( zipsc )
@@ -278,7 +278,7 @@ void bli_igetsc
 {
 	bli_init_once();
 
-	PASTEMAC2(i,d,gets)( *chi, *zeta_r, *zeta_i );
+	PASTEMAC(i,d,gets)( *chi, *zeta_r, *zeta_i );
 }
 
 void bli_isetsc
@@ -290,6 +290,6 @@ void bli_isetsc
 {
 	bli_init_once();
 
-	PASTEMAC2(d,i,sets)( zeta_r, zeta_i, *chi );
+	PASTEMAC(d,i,sets)( zeta_r, zeta_i, *chi );
 }
 
diff --git a/frame/0/copysc/bli_copysc.c b/frame/0/copysc/bli_copysc.c
index 805bff6dc..11e111544 100644
--- a/frame/0/copysc/bli_copysc.c
+++ b/frame/0/copysc/bli_copysc.c
@@ -55,7 +55,7 @@ static FUNCPTR_T GENARRAY2_ALL(ftypes,copysc);
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t* chi, \
        const obj_t* psi  \
@@ -103,7 +103,7 @@ GENFRONT( copysc )
 #undef  GENTFUNC2
 #define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname ) \
 \
-void PASTEMAC2(chx,chy,varname) \
+void PASTEMAC(chx,chy,varname) \
      ( \
              conj_t conjchi, \
        const void*  chi, \
@@ -117,15 +117,14 @@ void PASTEMAC2(chx,chy,varname) \
 \
 	if ( bli_is_conj( conjchi ) ) \
 	{ \
-		PASTEMAC2(chx,chy,copyjs)( *chi_cast, *psi_cast ); \
+		PASTEMAC(chx,chy,copyjs)( *chi_cast, *psi_cast ); \
 	} \
 	else \
 	{ \
-		PASTEMAC2(chx,chy,copys)( *chi_cast, *psi_cast ); \
+		PASTEMAC(chx,chy,copys)( *chi_cast, *psi_cast ); \
 	} \
 }
 
 INSERT_GENTFUNC2_BASIC( copysc )
-INSERT_GENTFUNC2_MIX_D( copysc )
-INSERT_GENTFUNC2_MIX_P( copysc )
+INSERT_GENTFUNC2_MIX_DP( copysc )
 
diff --git a/frame/0/copysc/bli_copysc.h b/frame/0/copysc/bli_copysc.h
index b04fabb30..fa6e028cf 100644
--- a/frame/0/copysc/bli_copysc.h
+++ b/frame/0/copysc/bli_copysc.h
@@ -40,7 +40,7 @@
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
        const obj_t* chi, \
        const obj_t* psi  \
@@ -55,7 +55,7 @@ GENFRONT( copysc )
 #undef  GENTPROT2
 #define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \
+BLIS_EXPORT_BLIS void PASTEMAC(chx,chy,varname) \
      ( \
              conj_t conjchi, \
        const void*  chi, \
diff --git a/frame/1/bli_l1v_fpa.c b/frame/1/bli_l1v_fpa.c
index a88aba93d..104a75019 100644
--- a/frame/1/bli_l1v_fpa.c
+++ b/frame/1/bli_l1v_fpa.c
@@ -41,13 +41,13 @@
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \
+GENARRAY_FPA( PASTECH(opname,BLIS_TAPI_EX_SUF,_vft), \
               PASTECH(opname,BLIS_TAPI_EX_SUF) ); \
 \
-PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
-PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
+PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) \
+PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
 { \
-	return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
+	return PASTECH(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
 }
 
 GENFRONT( addv )
diff --git a/frame/1/bli_l1v_fpa.h b/frame/1/bli_l1v_fpa.h
index 52d477d30..35db658b3 100644
--- a/frame/1/bli_l1v_fpa.h
+++ b/frame/1/bli_l1v_fpa.h
@@ -39,8 +39,8 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
-PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );
+PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) \
+PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );
 
 GENPROT( addv )
 GENPROT( copyv )
diff --git a/frame/1/bli_l1v_ft.h b/frame/1/bli_l1v_ft.h
index 244b926ca..7a7cf22dc 100644
--- a/frame/1/bli_l1v_ft.h
+++ b/frame/1/bli_l1v_ft.h
@@ -42,7 +42,7 @@
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              conj_t conjx, \
              dim_t  n, \
@@ -60,7 +60,7 @@ INSERT_GENTDEF( subv )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              dim_t  n, \
        const ctype* x, inc_t incx, \
@@ -75,7 +75,7 @@ INSERT_GENTDEF( amaxv )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              conj_t conjx, \
              dim_t  n, \
@@ -93,7 +93,7 @@ INSERT_GENTDEF( axpbyv )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              conj_t conjx, \
              dim_t  n, \
@@ -111,7 +111,7 @@ INSERT_GENTDEF( scal2v )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              conj_t conjx, \
              conj_t conjy, \
@@ -129,7 +129,7 @@ INSERT_GENTDEF( dotv )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              conj_t conjx, \
              conj_t conjy, \
@@ -149,7 +149,7 @@ INSERT_GENTDEF( dotxv )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
        dim_t  n, \
        ctype* x, inc_t incx  \
@@ -163,7 +163,7 @@ INSERT_GENTDEF( invertv )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              conj_t conjalpha, \
              dim_t  n, \
@@ -181,7 +181,7 @@ INSERT_GENTDEF( setv )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
        dim_t  n, \
        ctype* x, inc_t incx, \
@@ -196,7 +196,7 @@ INSERT_GENTDEF( swapv )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              conj_t conjx, \
              dim_t  n, \
diff --git a/frame/1/bli_l1v_oapi.c b/frame/1/bli_l1v_oapi.c
index ae12250e7..e498d8e1a 100644
--- a/frame/1/bli_l1v_oapi.c
+++ b/frame/1/bli_l1v_oapi.c
@@ -68,8 +68,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -114,8 +114,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -175,8 +175,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -233,8 +233,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -283,8 +283,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -350,8 +350,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -395,8 +395,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -446,8 +446,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -492,8 +492,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -547,8 +547,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c
index 9c3f4a30c..b49cdc823 100644
--- a/frame/1/bli_l1v_tapi.c
+++ b/frame/1/bli_l1v_tapi.c
@@ -43,7 +43,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjx, \
              dim_t  n, \
@@ -81,7 +81,7 @@ INSERT_GENTFUNC_BASIC( subv,  BLIS_SUBV_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              dim_t  n, \
        const ctype* x, inc_t incx, \
@@ -115,7 +115,7 @@ INSERT_GENTFUNC_BASIC( amaxv, BLIS_AMAXV_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjx, \
              dim_t  n, \
@@ -155,7 +155,7 @@ INSERT_GENTFUNC_BASIC( axpbyv, BLIS_AXPBYV_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjx, \
              dim_t  n, \
@@ -195,7 +195,7 @@ INSERT_GENTFUNC_BASIC( scal2v, BLIS_SCAL2V_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjx, \
              conj_t conjy, \
@@ -235,7 +235,7 @@ INSERT_GENTFUNC_BASIC( dotv, BLIS_DOTV_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjx, \
              conj_t conjy, \
@@ -279,7 +279,7 @@ INSERT_GENTFUNC_BASIC( dotxv, BLIS_DOTXV_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
        dim_t  n, \
        ctype* x, inc_t incx  \
@@ -311,7 +311,7 @@ INSERT_GENTFUNC_BASIC( invertv, BLIS_INVERTV_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjalpha, \
              dim_t  n, \
@@ -349,7 +349,7 @@ INSERT_GENTFUNC_BASIC( setv,  BLIS_SETV_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
        dim_t  n, \
        ctype* x, inc_t incx, \
@@ -382,7 +382,7 @@ INSERT_GENTFUNC_BASIC( swapv, BLIS_SWAPV_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjx, \
              dim_t  n, \
diff --git a/frame/1/bli_l1v_tapi.h b/frame/1/bli_l1v_tapi.h
index bda6fd28b..63a9bb1e2 100644
--- a/frame/1/bli_l1v_tapi.h
+++ b/frame/1/bli_l1v_tapi.h
@@ -40,7 +40,7 @@
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
       ( \
               conj_t conjx, \
               dim_t  n, \
@@ -57,7 +57,7 @@ INSERT_GENTPROT_BASIC( subv )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              dim_t  n, \
        const ctype* x, inc_t incx, \
@@ -71,7 +71,7 @@ INSERT_GENTPROT_BASIC( amaxv )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjx, \
              dim_t  n, \
@@ -88,7 +88,7 @@ INSERT_GENTPROT_BASIC( axpbyv )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjx, \
              dim_t  n, \
@@ -105,7 +105,7 @@ INSERT_GENTPROT_BASIC( scal2v )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjx, \
              conj_t conjy, \
@@ -122,7 +122,7 @@ INSERT_GENTPROT_BASIC( dotv )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjx, \
              conj_t conjy, \
@@ -141,7 +141,7 @@ INSERT_GENTPROT_BASIC( dotxv )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
        dim_t  n, \
        ctype* x, inc_t incx  \
@@ -154,7 +154,7 @@ INSERT_GENTPROT_BASIC( invertv )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjalpha, \
              dim_t  n, \
@@ -171,7 +171,7 @@ INSERT_GENTPROT_BASIC( setv )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
        dim_t  n, \
        ctype* x, inc_t incx, \
@@ -185,7 +185,7 @@ INSERT_GENTPROT_BASIC( swapv )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjx, \
              dim_t  n, \
diff --git a/frame/1d/bli_l1d_fpa.c b/frame/1d/bli_l1d_fpa.c
index 371f9289b..1eaa24131 100644
--- a/frame/1d/bli_l1d_fpa.c
+++ b/frame/1d/bli_l1d_fpa.c
@@ -41,13 +41,13 @@
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \
+GENARRAY_FPA( PASTECH(opname,BLIS_TAPI_EX_SUF,_vft), \
               PASTECH(opname,BLIS_TAPI_EX_SUF) ); \
 \
-PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
-PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
+PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) \
+PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
 { \
-	return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
+	return PASTECH(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
 }
 
 GENFRONT( addd )
diff --git a/frame/1d/bli_l1d_fpa.h b/frame/1d/bli_l1d_fpa.h
index 11fb36192..92775b3b2 100644
--- a/frame/1d/bli_l1d_fpa.h
+++ b/frame/1d/bli_l1d_fpa.h
@@ -39,8 +39,8 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
-PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );
+PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) \
+PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );
 
 GENPROT( addd )
 GENPROT( copyd )
diff --git a/frame/1d/bli_l1d_ft.h b/frame/1d/bli_l1d_ft.h
index b14e17b6a..c80a4bb5c 100644
--- a/frame/1d/bli_l1d_ft.h
+++ b/frame/1d/bli_l1d_ft.h
@@ -42,7 +42,7 @@
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -63,7 +63,7 @@ INSERT_GENTDEF( subd )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -84,7 +84,7 @@ INSERT_GENTDEF( scal2d )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
        doff_t diagoffx, \
        dim_t  m, \
@@ -100,7 +100,7 @@ INSERT_GENTDEF( invertd )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              conj_t conjalpha, \
              doff_t diagoffx, \
@@ -120,7 +120,7 @@ INSERT_GENTDEF( setd )
 #undef  GENTDEFR
 #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              doff_t   diagoffx, \
              dim_t    m, \
@@ -137,7 +137,7 @@ INSERT_GENTDEFR( setid )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              doff_t diagoffx, \
              dim_t  m, \
@@ -154,7 +154,7 @@ INSERT_GENTDEF( shiftd )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
diff --git a/frame/1d/bli_l1d_oapi.c b/frame/1d/bli_l1d_oapi.c
index 8dfd9cad0..d4caabba9 100644
--- a/frame/1d/bli_l1d_oapi.c
+++ b/frame/1d/bli_l1d_oapi.c
@@ -73,8 +73,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -139,8 +139,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -188,8 +188,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -244,8 +244,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -295,8 +295,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -351,8 +351,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -413,8 +413,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c
index 75e3b997f..17e7fcd3b 100644
--- a/frame/1d/bli_l1d_tapi.c
+++ b/frame/1d/bli_l1d_tapi.c
@@ -43,7 +43,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kername, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -122,7 +122,7 @@ INSERT_GENTFUNC_BASIC( subd,  subv,  BLIS_SUBV_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kername, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -202,7 +202,7 @@ INSERT_GENTFUNC_BASIC( scal2d, scal2v, BLIS_SCAL2V_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kername, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
        doff_t diagoffx, \
        dim_t  m, \
@@ -258,7 +258,7 @@ INSERT_GENTFUNC_BASIC( invertd, invertv, BLIS_INVERTV_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kername, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjalpha, \
              doff_t diagoffx, \
@@ -320,7 +320,7 @@ INSERT_GENTFUNC_BASIC( setd,  setv,  BLIS_SETV_KER )
 #undef  GENTFUNCR
 #define GENTFUNCR( ctype, ctype_r, ch, chr, opname, kername, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t   diagoffx, \
              dim_t    m, \
@@ -396,7 +396,7 @@ INSERT_GENTFUNCR_BASIC( setid, setv, BLIS_SETV_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kername, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t diagoffx, \
              dim_t  m, \
@@ -455,7 +455,7 @@ INSERT_GENTFUNC_BASIC( shiftd, addv, BLIS_ADDV_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kername, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
diff --git a/frame/1d/bli_l1d_tapi.h b/frame/1d/bli_l1d_tapi.h
index 201bd9ae6..71c8b7334 100644
--- a/frame/1d/bli_l1d_tapi.h
+++ b/frame/1d/bli_l1d_tapi.h
@@ -40,7 +40,7 @@
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -60,7 +60,7 @@ INSERT_GENTPROT_BASIC( subd )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -80,7 +80,7 @@ INSERT_GENTPROT_BASIC( scal2d )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
        doff_t diagoffx, \
        dim_t  m, \
@@ -95,7 +95,7 @@ INSERT_GENTPROT_BASIC( invertd )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjalpha, \
              doff_t diagoffx, \
@@ -114,7 +114,7 @@ INSERT_GENTPROT_BASIC( setd )
 #undef  GENTPROTR
 #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t   diagoffx, \
              dim_t    m, \
@@ -130,7 +130,7 @@ INSERT_GENTPROTR_BASIC( setid )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t diagoffx, \
              dim_t  m, \
@@ -146,7 +146,7 @@ INSERT_GENTPROT_BASIC( shiftd )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
diff --git a/frame/1f/bli_l1f_fpa.c b/frame/1f/bli_l1f_fpa.c
index e0fbc6f4e..a629c5909 100644
--- a/frame/1f/bli_l1f_fpa.c
+++ b/frame/1f/bli_l1f_fpa.c
@@ -41,13 +41,13 @@
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \
+GENARRAY_FPA( PASTECH(opname,BLIS_TAPI_EX_SUF,_vft), \
               PASTECH(opname,BLIS_TAPI_EX_SUF) ); \
 \
-PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
-PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
+PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) \
+PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
 { \
-	return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
+	return PASTECH(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
 }
 
 GENFRONT( axpy2v )
diff --git a/frame/1f/bli_l1f_fpa.h b/frame/1f/bli_l1f_fpa.h
index df11439a4..5ebe24a03 100644
--- a/frame/1f/bli_l1f_fpa.h
+++ b/frame/1f/bli_l1f_fpa.h
@@ -39,8 +39,8 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
-PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );
+PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) \
+PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );
 
 GENPROT( axpy2v )
 GENPROT( axpyf )
diff --git a/frame/1f/bli_l1f_ft.h b/frame/1f/bli_l1f_ft.h
index 8e143bf54..ba74ecb8e 100644
--- a/frame/1f/bli_l1f_ft.h
+++ b/frame/1f/bli_l1f_ft.h
@@ -42,7 +42,7 @@
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              conj_t conjx, \
              conj_t conjy, \
@@ -62,7 +62,7 @@ INSERT_GENTDEF( axpy2v )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              conj_t conja, \
              conj_t conjx, \
@@ -82,7 +82,7 @@ INSERT_GENTDEF( axpyf )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              conj_t conjxt, \
              conj_t conjx, \
@@ -103,7 +103,7 @@ INSERT_GENTDEF( dotaxpyv )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              conj_t conjat, \
              conj_t conjx, \
@@ -124,7 +124,7 @@ INSERT_GENTDEF( dotxf )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              conj_t conjat, \
              conj_t conja, \
diff --git a/frame/1f/bli_l1f_oapi.c b/frame/1f/bli_l1f_oapi.c
index f1e65a252..7022c4306 100644
--- a/frame/1f/bli_l1f_oapi.c
+++ b/frame/1f/bli_l1f_oapi.c
@@ -89,8 +89,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -158,8 +158,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -226,8 +226,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -311,8 +311,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -391,8 +391,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
diff --git a/frame/1f/bli_l1f_tapi.c b/frame/1f/bli_l1f_tapi.c
index b6811dbc2..32f1599ea 100644
--- a/frame/1f/bli_l1f_tapi.c
+++ b/frame/1f/bli_l1f_tapi.c
@@ -43,7 +43,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjx, \
              conj_t conjy, \
@@ -87,7 +87,7 @@ INSERT_GENTFUNC_BASIC( axpy2v, BLIS_AXPY2V_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conja, \
              conj_t conjx, \
@@ -131,7 +131,7 @@ INSERT_GENTFUNC_BASIC( axpyf, BLIS_AXPYF_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjxt, \
              conj_t conjx, \
@@ -177,7 +177,7 @@ INSERT_GENTFUNC_BASIC( dotaxpyv, BLIS_DOTAXPYV_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjat, \
              conj_t conja, \
@@ -231,7 +231,7 @@ INSERT_GENTFUNC_BASIC( dotxaxpyf, BLIS_DOTXAXPYF_KER )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, kerid ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjat, \
              conj_t conjx, \
diff --git a/frame/1f/bli_l1f_tapi.h b/frame/1f/bli_l1f_tapi.h
index bccd08e5e..986b39448 100644
--- a/frame/1f/bli_l1f_tapi.h
+++ b/frame/1f/bli_l1f_tapi.h
@@ -40,7 +40,7 @@
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjx, \
              conj_t conjy, \
@@ -59,7 +59,7 @@ INSERT_GENTPROT_BASIC( axpy2v )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conja, \
              conj_t conjx, \
@@ -78,7 +78,7 @@ INSERT_GENTPROT_BASIC( axpyf )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjxt, \
              conj_t conjx, \
@@ -98,7 +98,7 @@ INSERT_GENTPROT_BASIC( dotaxpyv )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjat, \
              conj_t conja, \
@@ -122,7 +122,7 @@ INSERT_GENTPROT_BASIC( dotxaxpyf )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjat, \
              conj_t conjx, \
diff --git a/frame/1m/bli_l1m_fpa.c b/frame/1m/bli_l1m_fpa.c
index 7299dd7c8..08f4bc62e 100644
--- a/frame/1m/bli_l1m_fpa.c
+++ b/frame/1m/bli_l1m_fpa.c
@@ -41,13 +41,13 @@
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \
+GENARRAY_FPA( PASTECH(opname,BLIS_TAPI_EX_SUF,_vft), \
               PASTECH(opname,BLIS_TAPI_EX_SUF) ); \
 \
-PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
-PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
+PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) \
+PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
 { \
-	return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
+	return PASTECH(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
 }
 
 GENFRONT( addm )
@@ -67,13 +67,13 @@ GENFRONT( xpbym )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-GENARRAY_FPA2( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \
+GENARRAY_FPA2( PASTECH(opname,BLIS_TAPI_EX_SUF,_vft), \
                PASTECH(opname,BLIS_TAPI_EX_SUF) ); \
 \
-PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
-PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty ) \
+PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) \
+PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty ) \
 { \
-	return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa2)[ dtx ][ dty ]; \
+	return PASTECH(opname,BLIS_TAPI_EX_SUF,_fpa2)[ dtx ][ dty ]; \
 }
 
 GENFRONT( xpbym_md )
diff --git a/frame/1m/bli_l1m_fpa.h b/frame/1m/bli_l1m_fpa.h
index 9de988559..401a6c744 100644
--- a/frame/1m/bli_l1m_fpa.h
+++ b/frame/1m/bli_l1m_fpa.h
@@ -39,8 +39,8 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
-PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );
+PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) \
+PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );
 
 GENPROT( addm )
 GENPROT( copym )
@@ -55,8 +55,8 @@ GENPROT( xpbym )
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
-PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );
+PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) \
+PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );
 
 GENPROT( xpbym_md )
 
diff --git a/frame/1m/bli_l1m_ft.h b/frame/1m/bli_l1m_ft.h
index 0851470dd..0c4a48420 100644
--- a/frame/1m/bli_l1m_ft.h
+++ b/frame/1m/bli_l1m_ft.h
@@ -34,7 +34,7 @@
 
 
 //
-// -- Level-1v function types --------------------------------------------------
+// -- Level-1m function types --------------------------------------------------
 //
 
 // addm, subm
@@ -42,7 +42,7 @@
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -64,7 +64,7 @@ INSERT_GENTDEF( copym )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -85,7 +85,7 @@ INSERT_GENTDEF( axpym )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -106,7 +106,7 @@ INSERT_GENTDEF( scal2m )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              conj_t conjalpha, \
              doff_t diagoffx, \
@@ -128,7 +128,7 @@ INSERT_GENTDEF( setm )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
diff --git a/frame/1m/bli_l1m_ker_params.h b/frame/1m/bli_l1m_ker_params.h
index b0b383b21..ca89bc9db 100644
--- a/frame/1m/bli_l1m_ker_params.h
+++ b/frame/1m/bli_l1m_ker_params.h
@@ -58,10 +58,10 @@
              dim_t   panel_len_max, \
              dim_t   panel_dim_off, \
              dim_t   panel_len_off, \
+             dim_t   panel_bcast, \
        const void*   kappa, \
        const void*   c, inc_t incc, inc_t ldc, \
              void*   p,             inc_t ldp, \
-                        inc_t is_p, \
        const void*   params  \
 
 
@@ -72,11 +72,14 @@
              conj_t  conja, \
              pack_t  schema, \
              dim_t   cdim, \
+             dim_t   cdim_max, \
+             dim_t   cdim_bcast, \
              dim_t   n, \
              dim_t   n_max, \
        const void*   kappa, \
        const void*   a, inc_t inca, inc_t lda, \
-             void*   p,             inc_t ldp  \
+             void*   p,             inc_t ldp, \
+       const void*   params  \
 
 
 // unpackm_cxk kernel
@@ -86,10 +89,12 @@
              conj_t  conja, \
              pack_t  schema, \
              dim_t   cdim, \
+             dim_t   cdim_bcast, \
              dim_t   n, \
        const void*   kappa, \
        const void*   p,             inc_t ldp, \
-             void*   a, inc_t inca, inc_t lda  \
+             void*   a, inc_t inca, inc_t lda, \
+       const void*   params  \
 
 
 // packm_cxc_diag kernel
@@ -103,10 +108,13 @@
              pack_t  schema, \
              bool    invdiag, \
              dim_t   cdim, \
+             dim_t   cdim_max, \
+             dim_t   cdim_bcast, \
              dim_t   n_max, \
        const void*   kappa, \
        const void*   a, inc_t inca, inc_t lda, \
-             void*   p,             inc_t ldp  \
+             void*   p,             inc_t ldp, \
+       const void*   params  \
 
 
 #endif
diff --git a/frame/1m/bli_l1m_ker_prot.h b/frame/1m/bli_l1m_ker_prot.h
index 1889370fa..8cc9a6934 100644
--- a/frame/1m/bli_l1m_ker_prot.h
+++ b/frame/1m/bli_l1m_ker_prot.h
@@ -48,10 +48,23 @@ void PASTEMAC(ch,funcname) \
        BLIS_CNTX_PARAM  \
      );
 
+#undef  L1MTPROT2
+#define L1MTPROT2( ctypex, ctypey, chx, chy, funcname, opname ) \
+\
+void PASTEMAC(chx,chy,funcname) \
+     ( \
+       PASTECH(opname,_params), \
+       BLIS_CNTX_PARAM  \
+     );
+
 #define PACKM_KER_PROT(      ctype, ch, fn )  L1MTPROT( ctype, ch, fn, packm_cxk );
 #define UNPACKM_KER_PROT(    ctype, ch, fn )  L1MTPROT( ctype, ch, fn, unpackm_cxk );
 #define PACKM_DIAG_KER_PROT( ctype, ch, fn )  L1MTPROT( ctype, ch, fn, packm_cxc_diag );
 
+#define PACKM_KER_PROT2(      ctypex, ctypey, chx, chy, fn )  L1MTPROT2( ctypex, ctypey, chx, chy, fn, packm_cxk );
+#define UNPACKM_KER_PROT2(    ctypex, ctypey, chx, chy, fn )  L1MTPROT2( ctypex, ctypey, chx, chy, fn, unpackm_cxk );
+#define PACKM_DIAG_KER_PROT2( ctypex, ctypey, chx, chy, fn )  L1MTPROT2( ctypex, ctypey, chx, chy, fn, packm_cxc_diag );
+
 
 #endif
 
diff --git a/frame/1m/bli_l1m_oapi.c b/frame/1m/bli_l1m_oapi.c
index 775d69018..5cd128e16 100644
--- a/frame/1m/bli_l1m_oapi.c
+++ b/frame/1m/bli_l1m_oapi.c
@@ -74,8 +74,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -142,8 +142,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -219,8 +219,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -282,8 +282,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -351,8 +351,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -416,8 +416,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a (multi) type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( dtx, dty ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp2)( dtx, dty ); \
 \
 	f \
 	( \
diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c
index 08dd5c915..d17df0eb7 100644
--- a/frame/1m/bli_l1m_tapi.c
+++ b/frame/1m/bli_l1m_tapi.c
@@ -43,7 +43,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, auxker ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -67,7 +67,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	/* Invoke the helper variant, which loops over the appropriate kernel
 	   to implement the current operation. */ \
-	PASTEMAC2(ch,opname,_unb_var1) \
+	PASTEMAC(ch,opname,_unb_var1) \
 	( \
 	  diagoffx, \
 	  diagx, \
@@ -90,7 +90,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	if ( bli_is_upper_or_lower( uplox ) && \
 	     bli_is_unit_diag( diagx ) ) \
 	{ \
-		PASTEMAC2(ch,auxker,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,auxker,BLIS_TAPI_EX_SUF) \
 		( \
 		  diagoffx, \
 		  diagx, \
@@ -113,7 +113,7 @@ INSERT_GENTFUNC_BASIC( subm, subd )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -137,7 +137,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	/* Invoke the helper variant, which loops over the appropriate kernel
 	   to implement the current operation. */ \
-	PASTEMAC2(ch,opname,_unb_var1) \
+	PASTEMAC(ch,opname,_unb_var1) \
 	( \
 	  diagoffx, \
 	  diagx, \
@@ -166,7 +166,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 		if ( bli_does_trans( transx ) ) \
 			bli_negate_diag_offset( &diagoffy ); \
 \
-		PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,setd,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  diagoffy, \
@@ -187,7 +187,7 @@ INSERT_GENTFUNC_BASIC( copym )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -215,7 +215,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	/* Invoke the helper variant, which loops over the appropriate kernel
 	   to implement the current operation. */ \
-	PASTEMAC2(ch,opname,_unb_var1) \
+	PASTEMAC(ch,opname,_unb_var1) \
 	( \
 	  diagoffx, \
 	  diagx, \
@@ -239,7 +239,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	if ( bli_is_upper_or_lower( uplox ) && \
 	     bli_is_unit_diag( diagx ) ) \
 	{ \
-		PASTEMAC2(ch,axpyd,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,axpyd,BLIS_TAPI_EX_SUF) \
 		( \
 		  diagoffx, \
 		  diagx, \
@@ -262,7 +262,7 @@ INSERT_GENTFUNC_BASIC( axpym )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -291,7 +291,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
 	{ \
 \
-		PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,setm,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  diagoffx, \
@@ -309,7 +309,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	/* Invoke the helper variant, which loops over the appropriate kernel
 	   to implement the current operation. */ \
-	PASTEMAC2(ch,opname,_unb_var1) \
+	PASTEMAC(ch,opname,_unb_var1) \
 	( \
 	  diagoffx, \
 	  diagx, \
@@ -338,7 +338,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 		if ( bli_does_trans( transx ) ) \
 			bli_negate_diag_offset( &diagoffy ); \
 \
-		PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,setd,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  diagoffy, \
@@ -359,7 +359,7 @@ INSERT_GENTFUNC_BASIC( scal2m )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjalpha, \
              doff_t diagoffx, \
@@ -383,7 +383,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	/* Invoke the helper variant, which loops over the appropriate kernel
 	   to implement the current operation. */ \
-	PASTEMAC2(ch,opname,_unb_var1) \
+	PASTEMAC(ch,opname,_unb_var1) \
 	( \
 	  conjalpha, \
 	  diagoffx, \
@@ -405,7 +405,7 @@ INSERT_GENTFUNC_BASIC( setm )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -431,7 +431,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* If beta is zero, then the operation reduces to copym. */ \
 	if ( PASTEMAC(ch,eq0)( *beta ) ) \
 	{ \
-		PASTEMAC2(ch,copym,_unb_var1) \
+		PASTEMAC(ch,copym,_unb_var1) \
 		( \
 		  diagoffx, \
 		  diagx, \
@@ -449,7 +449,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	/* Invoke the helper variant, which loops over the appropriate kernel
 	   to implement the current operation. */ \
-	PASTEMAC2(ch,opname,_unb_var1) \
+	PASTEMAC(ch,opname,_unb_var1) \
 	( \
 	  diagoffx, \
 	  diagx, \
@@ -473,7 +473,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	if ( bli_is_upper_or_lower( uplox ) && \
 	     bli_is_unit_diag( diagx ) ) \
 	{ \
-		PASTEMAC2(ch,xpbyd,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,xpbyd,BLIS_TAPI_EX_SUF) \
 		( \
 		  diagoffx, \
 		  diagx, \
@@ -496,7 +496,7 @@ INSERT_GENTFUNC_BASIC( xpbym )
 #undef  GENTFUNC2
 #define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname ) \
 \
-void PASTEMAC3(chx,chy,opname,EX_SUF) \
+void PASTEMAC(chx,chy,opname,EX_SUF) \
      ( \
              doff_t   diagoffx, \
              diag_t   diagx, \
@@ -522,7 +522,7 @@ void PASTEMAC3(chx,chy,opname,EX_SUF) \
 	/* If beta is zero, then the operation reduces to copym. */ \
 	if ( PASTEMAC(chy,eq0)( *beta ) ) \
 	{ \
-		PASTEMAC2(chx,chy,castm) \
+		PASTEMAC(chx,chy,castm) \
 		( \
 		  transx, \
 		  m, \
@@ -536,7 +536,7 @@ void PASTEMAC3(chx,chy,opname,EX_SUF) \
 \
 	/* Invoke the helper variant, which loops over the appropriate kernel
 	   to implement the current operation. */ \
-	PASTEMAC3(chx,chy,opname,_unb_var1) \
+	PASTEMAC(chx,chy,opname,_unb_var1) \
 	( \
 	  diagoffx, \
 	  diagx, \
diff --git a/frame/1m/bli_l1m_tapi.h b/frame/1m/bli_l1m_tapi.h
index 26d62e23f..036724774 100644
--- a/frame/1m/bli_l1m_tapi.h
+++ b/frame/1m/bli_l1m_tapi.h
@@ -40,7 +40,7 @@
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -61,7 +61,7 @@ INSERT_GENTPROT_BASIC( subm )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -82,7 +82,7 @@ INSERT_GENTPROT_BASIC( scal2m )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjalpha, \
              doff_t diagoffx, \
@@ -103,7 +103,7 @@ INSERT_GENTPROT_BASIC( setm )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -123,7 +123,7 @@ INSERT_GENTPROT_BASIC( xpbym )
 #undef  GENTPROT2
 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(chx,chy,opname,EX_SUF) \
      ( \
              doff_t   diagoffx, \
              diag_t   diagx, \
diff --git a/frame/1m/bli_l1m_unb_var1.c b/frame/1m/bli_l1m_unb_var1.c
index 53ef4e792..749e372bb 100644
--- a/frame/1m/bli_l1m_unb_var1.c
+++ b/frame/1m/bli_l1m_unb_var1.c
@@ -498,7 +498,7 @@ INSERT_GENTFUNC_BASIC( xpbym_unb_var1,  xpbyv,  BLIS_XPBYV_KER )
 #undef  GENTFUNC2
 #define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname ) \
 \
-void PASTEMAC2(chx,chy,opname) \
+void PASTEMAC(chx,chy,opname) \
      ( \
        doff_t   diagoffx, \
        diag_t   diagx, \
@@ -545,7 +545,7 @@ void PASTEMAC2(chx,chy,opname) \
 \
 				for ( dim_t i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC2(chx,chy,adds)( x1[i], y1[i] ); \
+					PASTEMAC(chx,chy,adds)( x1[i], y1[i] ); \
 				} \
 			} \
 		} \
@@ -563,7 +563,7 @@ void PASTEMAC2(chx,chy,opname) \
 \
 				for ( dim_t i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC2(chx,chy,adds)( *chi1, *psi1 ); \
+					PASTEMAC(chx,chy,adds)( *chi1, *psi1 ); \
 \
 					chi1 += incx; \
 					psi1 += incy; \
@@ -584,7 +584,7 @@ void PASTEMAC2(chx,chy,opname) \
 \
 				for ( dim_t i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC3(chx,chy,chy,xpbys)( x1[i], *beta, y1[i] ); \
+					PASTEMAC(chx,chy,chy,xpbys)( x1[i], *beta, y1[i] ); \
 				} \
 			} \
 		} \
@@ -602,7 +602,7 @@ void PASTEMAC2(chx,chy,opname) \
 \
 				for ( dim_t i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC3(chx,chy,chy,xpbys)( *chi1, *beta, *psi1 ); \
+					PASTEMAC(chx,chy,chy,xpbys)( *chi1, *beta, *psi1 ); \
 \
 					chi1 += incx; \
 					psi1 += incy; \
diff --git a/frame/1m/bli_l1m_unb_var1.h b/frame/1m/bli_l1m_unb_var1.h
index e752208c4..632b7a5a0 100644
--- a/frame/1m/bli_l1m_unb_var1.h
+++ b/frame/1m/bli_l1m_unb_var1.h
@@ -40,7 +40,7 @@
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-void PASTEMAC2(ch,opname,_unb_var1) \
+void PASTEMAC(ch,opname,_unb_var1) \
      ( \
        doff_t  diagoffx, \
        diag_t  diagx, \
@@ -61,7 +61,7 @@ INSERT_GENTPROT_BASIC( subm )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-void PASTEMAC2(ch,opname,_unb_var1) \
+void PASTEMAC(ch,opname,_unb_var1) \
      ( \
        doff_t  diagoffx, \
        diag_t  diagx, \
@@ -82,7 +82,7 @@ INSERT_GENTPROT_BASIC( scal2m )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-void PASTEMAC2(ch,opname,_unb_var1) \
+void PASTEMAC(ch,opname,_unb_var1) \
      ( \
        conj_t  conjalpha, \
        doff_t  diagoffx, \
@@ -103,7 +103,7 @@ INSERT_GENTPROT_BASIC( setm )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-void PASTEMAC2(ch,opname,_unb_var1) \
+void PASTEMAC(ch,opname,_unb_var1) \
      ( \
        doff_t  diagoffx, \
        diag_t  diagx, \
@@ -123,7 +123,7 @@ INSERT_GENTPROT_BASIC( xpbym )
 #undef  GENTPROT2
 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
 \
-void PASTEMAC3(chx,chy,opname,_unb_var1) \
+void PASTEMAC(chx,chy,opname,_unb_var1) \
      ( \
        doff_t   diagoffx, \
        diag_t   diagx, \
diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h
index 7d73bf903..87e38d5de 100644
--- a/frame/1m/packm/bli_packm.h
+++ b/frame/1m/packm/bli_packm.h
@@ -40,14 +40,7 @@
 #include "bli_packm_int.h"
 #include "bli_packm_scalar.h"
 
-#include "bli_packm_part.h"
-
 #include "bli_packm_struc_cxk.h"
 
-// Mixed datatype support.
-#ifdef BLIS_ENABLE_GEMM_MD
-#include "bli_packm_struc_cxk_md.h"
-#endif
-
 #include "bli_packm_blk_var1.h"
 
diff --git a/frame/1m/packm/bli_packm_alloc.c b/frame/1m/packm/bli_packm_alloc.c
index 18cc6f627..acfac66f1 100644
--- a/frame/1m/packm/bli_packm_alloc.c
+++ b/frame/1m/packm/bli_packm_alloc.c
@@ -43,7 +43,7 @@ void* bli_packm_alloc
      )
 {
 	// Query the pack buffer type from the control tree node.
-	packbuf_t pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl );
+	packbuf_t pack_buf_type = bli_packm_def_cntl_pack_buf_type( cntl );
 
 	return bli_packm_alloc_ex
 	(
diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c
index c0b6869f7..a221d0e96 100644
--- a/frame/1m/packm/bli_packm_blk_var1.c
+++ b/frame/1m/packm/bli_packm_blk_var1.c
@@ -36,57 +36,44 @@
 #include "blis.h"
 
 
-static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
-{
-    /* float (0)  scomplex (1)  double (2)  dcomplex (3) */
-// 0000 row/col panels
-    { { bli_spackm_struc_cxk,      bli_cpackm_struc_cxk,
-        bli_dpackm_struc_cxk,      bli_zpackm_struc_cxk,      } },
-// 0001 row/col panels: 1m-expanded (1e)
-    { { NULL,                      bli_cpackm_struc_cxk,
-        NULL,                      bli_zpackm_struc_cxk,  } },
-// 0010 row/col panels: 1m-reordered (1r)
-    { { NULL,                      bli_cpackm_struc_cxk,
-        NULL,                      bli_zpackm_struc_cxk,  } },
-};
-
-static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md);
-
 void bli_packm_blk_var1
      (
        const obj_t*     c,
              obj_t*     p,
        const cntx_t*    cntx,
        const cntl_t*    cntl,
-             thrinfo_t* thread_par
+             thrinfo_t* thread
      )
 {
 	// Extract various fields from the control tree.
-	pack_t schema  = bli_cntl_packm_params_pack_schema( cntl );
-	bool   invdiag = bli_cntl_packm_params_does_invert_diag( cntl );
-	bool   revifup = bli_cntl_packm_params_rev_iter_if_upper( cntl );
-	bool   reviflo = bli_cntl_packm_params_rev_iter_if_lower( cntl );
+	pack_t schema    = bli_packm_def_cntl_pack_schema( cntl );
+	bool   invdiag   = bli_packm_def_cntl_does_invert_diag( cntl );
+	bool   revifup   = bli_packm_def_cntl_rev_iter_if_upper( cntl );
+	bool   reviflo   = bli_packm_def_cntl_rev_iter_if_lower( cntl );
+	num_t  dt_p      = bli_packm_def_cntl_target_dt( cntl );
 
 	// Every thread initializes p and determines the size of memory block
 	// needed (which gets embedded into the otherwise "blank" mem_t entry
 	// in the control tree node). Return early if no packing is required.
-	if ( !bli_packm_init( c, p, cntx, cntl, bli_thrinfo_sub_node( thread_par ) ) )
+	// If the requested size is zero, then we don't need to do any allocation.
+	siz_t size_p = bli_packm_init( dt_p, c, p, cntl );
+	if ( size_p == 0 )
 		return;
 
-	// Use the sub-prenode. In bli_l3_thrinfo_grow(), this node was created to
-	// represent the team of threads as a group of single-member thread teams.
-	// This is necessary since the all of the work distribution function depend
-	// on the work_id and n_way fields.
-	thrinfo_t* thread = bli_thrinfo_sub_prenode( thread_par );
+	// Update the buffer address in p to point to the buffer associated
+	// with the mem_t entry acquired from the memory broker (now cached in
+	// the control tree node).
+	void* buffer = bli_packm_alloc( size_p, cntl, thread );
+	bli_obj_set_buffer( buffer, p );
 
 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
-		bli_packm_int_check( c, p, cntx );
+		bli_packm_int_check( c, p );
 
+	// dt_p is updated by bli_packm_init for real-only packing
 	num_t   dt_c           = bli_obj_dt( c );
+	        dt_p           = bli_obj_dt( p );
 	dim_t   dt_c_size      = bli_dt_size( dt_c );
-
-	num_t   dt_p           = bli_obj_dt( p );
 	dim_t   dt_p_size      = bli_dt_size( dt_p );
 
 	struc_t strucc         = bli_obj_struc( c );
@@ -107,38 +94,18 @@ void bli_packm_blk_var1
 
 	char*   p_cast         = bli_obj_buffer( p );
 	inc_t   ldp            = bli_obj_col_stride( p );
-	inc_t   is_p           = bli_obj_imag_stride( p );
 	dim_t   panel_dim_max  = bli_obj_panel_dim( p );
 	inc_t   ps_p           = bli_obj_panel_stride( p );
+	dim_t   bcast_p        = bli_packm_def_cntl_bmult_m_bcast( cntl );
 
 	doff_t  diagoffc_inc   = ( doff_t )panel_dim_max;
 
 	obj_t   kappa_local;
 	char*   kappa_cast     = bli_packm_scalar( &kappa_local, p );
 
-	// we use the default lookup table to determine the right func_t
-	// for the current schema.
-	func_t* packm_kers = &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ];
-
-	// Query the datatype-specific function pointer from the func_t object.
-	packm_ker_ft packm_ker_cast = bli_func_get_dt( dt_p, packm_kers );
-
-	// For mixed-precision gemm, select the proper kernel (only dense panels).
-	if ( dt_c != dt_p )
-	{
-		packm_ker_cast = packm_struc_cxk_md[ dt_c ][ dt_p ];
-	}
-
-	// Query the address of the packm params field of the obj_t. The user might
-	// have set this field in order to specify a custom packm kernel.
-	packm_blk_var1_params_t* params = bli_obj_pack_params( c );
-
-	if ( params && params->ukr_fn[ dt_c ][ dt_p ] )
-	{
-		// Query the user-provided packing kernel from the obj_t. If provided,
-		// this overrides the kernel determined above.
-		packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ];
-	}
+	// Query the datatype-specific function pointer from the control tree.
+	packm_ker_ft packm_ker_cast = bli_packm_def_cntl_ukr( cntl );
+	const void*  params         = bli_packm_def_cntl_ukr_params( cntl );
 
 	// Compute the total number of iterations we'll need.
 	dim_t n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 );
@@ -166,15 +133,15 @@ void bli_packm_blk_var1
 
 	// Query the number of threads (single-member thread teams) and the thread
 	// team ids from the current thread's packm thrinfo_t node.
-	const dim_t nt  = bli_thrinfo_n_way( thread );
-	const dim_t tid = bli_thrinfo_work_id( thread );
+	const dim_t nt  = bli_thrinfo_num_threads( thread );
+	const dim_t tid = bli_thrinfo_thread_id( thread );
 
 	// Determine the thread range and increment using the current thread's
 	// packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr()
 	// will depend on whether slab or round-robin partitioning was requested
 	// at configure-time.
 	dim_t it_start, it_end, it_inc;
-	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc );
+	bli_thread_range_slrr( tid, nt, n_iter, 1, FALSE, &it_start, &it_end, &it_inc );
 
 	char* p_begin = p_cast;
 
@@ -214,11 +181,12 @@ void bli_packm_blk_var1
 				  panel_len_max,
 				  panel_dim_off_i,
 				  panel_len_off,
+				  bcast_p,
 				  kappa_cast,
 				  c_begin, incc, ldc,
-				  p_begin,       ldp, is_p,
+				  p_begin,       ldp,
 				  params,
-				  ( cntx_t* )cntx
+				  cntx
 				);
 			}
 
@@ -307,12 +275,12 @@ void bli_packm_blk_var1
 				  panel_len_max_i,
 				  panel_dim_off_i,
 				  panel_len_off_i,
+				  bcast_p,
 				  kappa_cast,
 				  c_use, incc, ldc,
 				  p_use,       ldp,
-				         is_p_use,
 				  params,
-				  ( cntx_t* )cntx
+				  cntx
 				);
 			}
 
diff --git a/frame/1m/packm/bli_packm_check.c b/frame/1m/packm/bli_packm_check.c
index 15bd032ca..a033c074c 100644
--- a/frame/1m/packm/bli_packm_check.c
+++ b/frame/1m/packm/bli_packm_check.c
@@ -37,9 +37,8 @@
 
 void bli_packm_init_check
      (
-       const obj_t*  a,
-       const obj_t*  p,
-       const cntx_t* cntx
+       const obj_t* a,
+       const obj_t* p
      )
 {
 	err_t e_val;
@@ -59,9 +58,8 @@ void bli_packm_init_check
 
 void bli_packm_int_check
      (
-       const obj_t*  a,
-       const obj_t*  p,
-       const cntx_t* cntx
+       const obj_t* a,
+       const obj_t* p
      )
 {
 	err_t e_val;
diff --git a/frame/1m/packm/bli_packm_check.h b/frame/1m/packm/bli_packm_check.h
index da9399b31..e329898f8 100644
--- a/frame/1m/packm/bli_packm_check.h
+++ b/frame/1m/packm/bli_packm_check.h
@@ -34,15 +34,13 @@
 
 void bli_packm_init_check
      (
-       const obj_t*  a,
-       const obj_t*  p,
-       const cntx_t* cntx
+       const obj_t* a,
+       const obj_t* p
      );
 
 void bli_packm_int_check
      (
-       const obj_t*  a,
-       const obj_t*  p,
-       const cntx_t* cntx
+       const obj_t* a,
+       const obj_t* p
      );
 
diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c
index f38710ea8..fd41d7f7c 100644
--- a/frame/1m/packm/bli_packm_cntl.c
+++ b/frame/1m/packm/bli_packm_cntl.c
@@ -35,58 +35,71 @@
 
 #include "blis.h"
 
-BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
+
+void bli_packm_cntl_init_node
      (
-       pool_t*   sba_pool,
-       void_fp   var_func,
-       bszid_t   bmid_m,
-       bszid_t   bmid_n,
-       bool      does_invert_diag,
-       bool      rev_iter_if_upper,
-       bool      rev_iter_if_lower,
-       pack_t    pack_schema,
-       packbuf_t pack_buf_type,
-       cntl_t*   sub_node
+       void_fp       var_func,
+       packm_var_oft var,
+       const void*   params,
+       packm_cntl_t* cntl
      )
 {
-	cntl_t*         cntl;
-	packm_params_t* params;
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_packm_cntl_create_node(): " );
-	#endif
-
-	// Allocate a packm_params_t struct.
-	params = bli_sba_acquire( sba_pool, sizeof( packm_params_t ) );
+	// Initialize the packm_cntl_t struct.
+	cntl->var    = var;
+	cntl->params = params;
 
-	// Initialize the packm_params_t struct.
-	params->size              = sizeof( packm_params_t );
-	params->bmid_m            = bmid_m;
-	params->bmid_n            = bmid_n;
-	params->does_invert_diag  = does_invert_diag;
-	params->rev_iter_if_upper = rev_iter_if_upper;
-	params->rev_iter_if_lower = rev_iter_if_lower;
-	params->pack_schema       = pack_schema;
-	params->pack_buf_type     = pack_buf_type;
+	bli_cntl_init_node
+	(
+	  var_func,
+	  &cntl->cntl
+	);
+}
 
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_packm_cntl_create_node(): " );
-	#endif
+void bli_packm_def_cntl_init_node
+     (
+       void_fp           var_func,
+       num_t             dt_orig,
+       num_t             dt_pack,
+       num_t             dt_bmult,
+       packm_ker_ft      ukr,
+       dim_t             bmult_m_def,
+       dim_t             bmult_m_pack,
+       dim_t             bmult_m_bcast,
+       dim_t             bmult_m_scale,
+       dim_t             bmult_m_pack_scale,
+       dim_t             bmult_n_def,
+       bool              does_invert_diag,
+       bool              rev_iter_if_upper,
+       bool              rev_iter_if_lower,
+       pack_t            pack_schema,
+       packbuf_t         pack_buf_type,
+       packm_def_cntl_t* cntl
+     )
+{
+	// Initialize the packm_def_cntl_t struct.
+	cntl->ukr                = ukr;
+	cntl->dt_orig            = dt_orig;
+	cntl->dt_pack            = dt_pack;
+	cntl->dt_bmult           = dt_bmult;
+	cntl->bmult_m_def        = bmult_m_def;
+	cntl->bmult_m_pack       = bmult_m_pack;
+	cntl->bmult_m_bcast      = bmult_m_bcast;
+	cntl->bmult_m_scale      = bmult_m_scale;
+	cntl->bmult_m_pack_scale = bmult_m_pack_scale;
+	cntl->bmult_n_def        = bmult_n_def;
+	cntl->does_invert_diag   = does_invert_diag;
+	cntl->rev_iter_if_upper  = rev_iter_if_upper;
+	cntl->rev_iter_if_lower  = rev_iter_if_lower;
+	cntl->pack_schema        = pack_schema;
+	cntl->pack_buf_type      = pack_buf_type;
+	cntl->params             = cntl;
 
-	// It's important that we set the bszid field to BLIS_NO_PART to indicate
-	// that no blocksize partitioning is performed. bli_cntl_free() will rely
-	// on this information to know how to step through the thrinfo_t tree in
-	// sync with the cntl_t tree.
-	cntl = bli_cntl_create_node
+	bli_packm_cntl_init_node
 	(
-	  sba_pool,
-	  BLIS_NOID,
-	  BLIS_NO_PART,
 	  var_func,
-	  params,
-	  sub_node
+	  bli_packm_blk_var1,
+	  NULL,
+	  &cntl->cntl
 	);
-
-	return cntl;
 }
 
diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h
index a94a465b2..2c53732c1 100644
--- a/frame/1m/packm/bli_packm_cntl.h
+++ b/frame/1m/packm/bli_packm_cntl.h
@@ -33,67 +33,226 @@
 
 */
 
-struct packm_params_s
-{
-	uint64_t  size; // size field must be present and come first.
-	bszid_t   bmid_m;
-	bszid_t   bmid_n;
-	bool      does_invert_diag;
-	bool      rev_iter_if_upper;
-	bool      rev_iter_if_lower;
-	pack_t    pack_schema;
-	packbuf_t pack_buf_type;
+
+struct packm_cntl_s
+{
+	cntl_t        cntl; // must come first: accessors cast cntl_t* to packm_cntl_t*.
+	packm_var_oft var; // packm variant; read via bli_packm_cntl_variant().
+	const void*   params; // opaque variant data; read via bli_packm_cntl_variant_params().
 };
-typedef struct packm_params_s packm_params_t;
+typedef struct packm_cntl_s packm_cntl_t;
+
+// -----------------------------------------------------------------------------
+
+BLIS_INLINE packm_var_oft bli_packm_cntl_variant( const cntl_t* cntl )
+{
+	return ( ( const packm_cntl_t* ) cntl )->var;
+}
+
+BLIS_INLINE const void* bli_packm_cntl_variant_params( const cntl_t* cntl )
+{
+	return ( ( const packm_cntl_t* ) cntl )->params;
+}
+
+// -----------------------------------------------------------------------------
+
+BLIS_INLINE void bli_packm_cntl_set_variant( packm_var_oft var, cntl_t* cntl )
+{
+	( ( packm_cntl_t* ) cntl )->var = var;
+}
+
+BLIS_INLINE void bli_packm_cntl_set_variant_params( const void* params, cntl_t* cntl )
+{
+	( ( packm_cntl_t* ) cntl )->params = params;
+}
+
+// -----------------------------------------------------------------------------
+
+struct packm_def_cntl_s
+{
+	packm_cntl_t cntl; // must come first: accessors cast cntl_t* to packm_def_cntl_t*.
+	num_t        dt_orig; // with dt_pack, selects the kernel in the set_ukr*() helpers.
+	num_t        dt_pack; // returned by bli_packm_def_cntl_target_dt().
+	num_t        dt_bmult; // datatype used for blksz_t lookups in the set_bmult_*() helpers.
+	packm_ker_ft ukr; // kernel pointer; read via bli_packm_def_cntl_ukr().
+	dim_t        bmult_m_def; // set_bmult_m(): bli_blksz_get_def() / bmult_m_scale.
+	dim_t        bmult_m_pack; // set_bmult_m(): bli_blksz_get_max() / bmult_m_pack_scale.
+	dim_t        bmult_m_bcast; // set_bmult_m_bcast(): bli_blksz_get_def(), unscaled.
+	dim_t        bmult_m_scale; // divisor for bmult_m_def in set_bmult_m().
+	dim_t        bmult_m_pack_scale; // divisor for bmult_m_pack in set_bmult_m().
+	dim_t        bmult_n_def; // set_bmult_n(): bli_blksz_get_def(), unscaled.
+	bool         does_invert_diag;
+	bool         rev_iter_if_upper;
+	bool         rev_iter_if_lower;
+	pack_t       pack_schema;
+	packbuf_t    pack_buf_type;
+	const void*  params; // opaque kernel data; read via bli_packm_def_cntl_ukr_params().
+};
+typedef struct packm_def_cntl_s packm_def_cntl_t;
+
+// -----------------------------------------------------------------------------
+
+BLIS_INLINE dim_t bli_packm_def_cntl_bmult_m_def( const cntl_t* cntl )
+{
+	return ( ( const packm_def_cntl_t* ) cntl )->bmult_m_def;
+}
+
+BLIS_INLINE dim_t bli_packm_def_cntl_bmult_m_pack( const cntl_t* cntl )
+{
+	return ( ( const packm_def_cntl_t* ) cntl )->bmult_m_pack;
+}
+
+BLIS_INLINE dim_t bli_packm_def_cntl_bmult_m_bcast( const cntl_t* cntl )
+{
+	return ( ( const packm_def_cntl_t* ) cntl )->bmult_m_bcast;
+}
+
+BLIS_INLINE dim_t bli_packm_def_cntl_bmult_n_def( const cntl_t* cntl )
+{
+	return ( ( const packm_def_cntl_t* ) cntl )->bmult_n_def;
+}
+
+BLIS_INLINE bool bli_packm_def_cntl_does_invert_diag( const cntl_t* cntl )
+{
+	return ( ( const packm_def_cntl_t* ) cntl )->does_invert_diag;
+}
 
-BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( const cntl_t* cntl )
+BLIS_INLINE bool bli_packm_def_cntl_rev_iter_if_upper( const cntl_t* cntl )
 {
-	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m;
+	return ( ( const packm_def_cntl_t* ) cntl )->rev_iter_if_upper;
 }
 
-BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( const cntl_t* cntl )
+BLIS_INLINE bool bli_packm_def_cntl_rev_iter_if_lower( const cntl_t* cntl )
 {
-	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n;
+	return ( ( const packm_def_cntl_t* ) cntl )->rev_iter_if_lower;
 }
 
-BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( const cntl_t* cntl )
+BLIS_INLINE pack_t bli_packm_def_cntl_pack_schema( const cntl_t* cntl )
 {
-	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag;
+	return ( ( const packm_def_cntl_t* ) cntl )->pack_schema;
 }
 
-BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( const cntl_t* cntl )
+BLIS_INLINE num_t bli_packm_def_cntl_target_dt( const cntl_t* cntl )
 {
-	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper;
+	return ( ( const packm_def_cntl_t* ) cntl )->dt_pack;
 }
 
-BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( const cntl_t* cntl )
+BLIS_INLINE packbuf_t bli_packm_def_cntl_pack_buf_type( const cntl_t* cntl )
 {
-	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower;
+	return ( ( const packm_def_cntl_t* ) cntl )->pack_buf_type;
 }
 
-BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( const cntl_t* cntl )
+BLIS_INLINE packm_ker_ft bli_packm_def_cntl_ukr( const cntl_t* cntl )
 {
-	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema;
+	return ( ( const packm_def_cntl_t* ) cntl )->ukr;
 }
 
-BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( const cntl_t* cntl )
+BLIS_INLINE const void* bli_packm_def_cntl_ukr_params( const cntl_t* cntl )
 {
-	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type;
+	return ( ( const packm_def_cntl_t* ) cntl )->params;
 }
 
 // -----------------------------------------------------------------------------
 
-cntl_t* bli_packm_cntl_create_node
+BLIS_INLINE void bli_packm_def_cntl_set_bmult_m( const blksz_t* bmult_m, cntl_t* cntl_ )
+{
+	packm_def_cntl_t* cntl = ( packm_def_cntl_t* )cntl_;
+	num_t dt = cntl->dt_bmult; // blocksizes are queried for dt_bmult, not dt_orig/dt_pack.
+	cntl->bmult_m_def = bli_blksz_get_def( dt, bmult_m ) / cntl->bmult_m_scale; // NOTE(review): divides by bmult_m_scale, so it must be set (nonzero) first.
+	cntl->bmult_m_pack = bli_blksz_get_max( dt, bmult_m ) / cntl->bmult_m_pack_scale; // NOTE(review): likewise requires bmult_m_pack_scale to be set (nonzero).
+}
+
+BLIS_INLINE void bli_packm_def_cntl_set_bmult_m_bcast( const blksz_t* bmult_m_bcast, cntl_t* cntl_ )
+{
+	packm_def_cntl_t* cntl = ( packm_def_cntl_t* )cntl_;
+	num_t dt = cntl->dt_bmult;
+	cntl->bmult_m_bcast = bli_blksz_get_def( dt, bmult_m_bcast );
+}
+
+BLIS_INLINE void bli_packm_def_cntl_set_bmult_n( const blksz_t* bmult_n, cntl_t* cntl_ )
+{
+	packm_def_cntl_t* cntl = ( packm_def_cntl_t* )cntl_;
+	num_t dt = cntl->dt_bmult;
+	cntl->bmult_n_def = bli_blksz_get_def( dt, bmult_n );
+}
+
+BLIS_INLINE void bli_packm_def_cntl_set_does_invert_diag( bool does_invert_diag, cntl_t* cntl )
+{
+	( ( packm_def_cntl_t* ) cntl )->does_invert_diag = does_invert_diag; // cntl must point at a packm_def_cntl_t node.
+}
+
+BLIS_INLINE void bli_packm_def_cntl_set_rev_iter_if_upper( bool rev_iter_if_upper, cntl_t* cntl )
+{
+	( ( packm_def_cntl_t* ) cntl )->rev_iter_if_upper = rev_iter_if_upper;
+}
+
+BLIS_INLINE void bli_packm_def_cntl_set_rev_iter_if_lower( bool rev_iter_if_lower, cntl_t* cntl )
+{
+	( ( packm_def_cntl_t* ) cntl )->rev_iter_if_lower = rev_iter_if_lower;
+}
+
+BLIS_INLINE void bli_packm_def_cntl_set_pack_schema( pack_t pack_schema, cntl_t* cntl )
+{
+	( ( packm_def_cntl_t* ) cntl )->pack_schema = pack_schema;
+}
+
+BLIS_INLINE void bli_packm_def_cntl_set_pack_buf_type( packbuf_t pack_buf_type, cntl_t* cntl )
+{
+	( ( packm_def_cntl_t* ) cntl )->pack_buf_type = pack_buf_type;
+}
+
+BLIS_INLINE void bli_packm_def_cntl_set_ukr( const func2_t* ukr, cntl_t* cntl_ )
+{
+	packm_def_cntl_t* cntl = ( packm_def_cntl_t* )cntl_;
+	num_t dt_orig = cntl->dt_orig;
+	num_t dt_pack = cntl->dt_pack;
+	cntl->ukr = ( packm_ker_ft )bli_func2_get_dt( dt_orig, dt_pack, ukr );
+}
+
+BLIS_INLINE err_t bli_packm_def_cntl_set_ukr_simple( const func_t* ukr, cntl_t* cntl_ )
+{
+	packm_def_cntl_t* cntl = ( packm_def_cntl_t* )cntl_;
+	num_t dt_orig = cntl->dt_orig;
+	num_t dt_pack = cntl->dt_pack;
+	if ( dt_orig != dt_pack ) // a func_t is queried with one datatype, so both must agree.
+		return BLIS_INCONSISTENT_DATATYPES; // cntl->ukr is left unchanged on this path.
+	cntl->ukr = ( packm_ker_ft )bli_func_get_dt( dt_orig, ukr );
+	return BLIS_SUCCESS;
+}
+
+BLIS_INLINE void bli_packm_def_cntl_set_ukr_params( const void* params, cntl_t* cntl )
+{
+	( ( packm_def_cntl_t* ) cntl )->params = params;
+}
+
+// -----------------------------------------------------------------------------
+
+BLIS_EXPORT_BLIS void bli_packm_cntl_init_node
+     (
+       void_fp       var_func,
+       packm_var_oft var,
+       const void*   params,
+       packm_cntl_t* cntl
+     );
+
+BLIS_EXPORT_BLIS void bli_packm_def_cntl_init_node
      (
-       pool_t*   sba_pool,
-       void_fp   var_func,
-       bszid_t   bmid_m,
-       bszid_t   bmid_n,
-       bool      does_invert_diag,
-       bool      rev_iter_if_upper,
-       bool      rev_iter_if_lower,
-       pack_t    pack_schema,
-       packbuf_t pack_buf_type,
-       cntl_t*   sub_node
+       void_fp           var_func,
+       num_t             dt_orig,
+       num_t             dt_pack,
+       num_t             dt_bmult,
+       packm_ker_ft      ukr,
+       dim_t             bmult_m_def,
+       dim_t             bmult_m_pack,
+       dim_t             bmult_m_bcast,
+       dim_t             bmult_m_scale,
+       dim_t             bmult_m_pack_scale,
+       dim_t             bmult_n_def,
+       bool              does_invert_diag,
+       bool              rev_iter_if_upper,
+       bool              rev_iter_if_lower,
+       pack_t            pack_schema,
+       packbuf_t         pack_buf_type,
+       packm_def_cntl_t* cntl // caller-provided node; its fields are filled in place.
      );
 
diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c
index d4480f2c1..fa19a6df5 100644
--- a/frame/1m/packm/bli_packm_init.c
+++ b/frame/1m/packm/bli_packm_init.c
@@ -35,13 +35,12 @@
 
 #include "blis.h"
 
-bool bli_packm_init
+siz_t bli_packm_init
      (
+             num_t   dt_p,
        const obj_t*  c,
              obj_t*  p,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
-             thrinfo_t* thread
+       const cntl_t* cntl
      )
 {
 	bli_init_once();
@@ -54,7 +53,7 @@ bool bli_packm_init
 
 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
-		bli_packm_init_check( c, p, cntx );
+		bli_packm_init_check( c, p );
 
 	// We begin by copying the fields of A.
 	bli_obj_alias_to( c, p );
@@ -65,26 +64,28 @@ bool bli_packm_init
 		return false;
 
 	// Extract various fields from the control tree.
-	bszid_t bmult_id_m   = bli_cntl_packm_params_bmid_m( cntl );
-	bszid_t bmult_id_n   = bli_cntl_packm_params_bmid_n( cntl );
-	pack_t  schema       = bli_cntl_packm_params_pack_schema( cntl );
-	num_t   dt_tar       = bli_obj_target_dt( c );
+	pack_t  schema       = bli_packm_def_cntl_pack_schema( cntl );
 	num_t   dt_scalar    = bli_obj_scalar_dt( c );
-	dim_t   bmult_m_def  = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx );
-	dim_t   bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx );
-	dim_t   bmult_n_def  = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx );
+	dim_t   bmult_m_def  = bli_packm_def_cntl_bmult_m_def( cntl );
+	dim_t   bmult_m_pack = bli_packm_def_cntl_bmult_m_pack( cntl );
+	dim_t   bmult_n_def  = bli_packm_def_cntl_bmult_n_def( cntl );
 
 	// Typecast the internal scalar value to the target datatype.
 	// Note that if the typecasting is needed, this must happen BEFORE we
 	// change the datatype of P to reflect the target_dt.
-	if ( dt_scalar != dt_tar )
+	if ( dt_scalar != dt_p )
 	{
-		bli_obj_scalar_cast_to( dt_tar, p );
+		bli_obj_scalar_cast_to( bli_dt_domain( dt_scalar ) | bli_dt_prec( dt_p ), p );
 	}
 
+	// If we are only packing the real part of a complex matrix, use the
+	// real datatype for the packed matrix.
+	if ( schema == BLIS_PACKED_PANELS_RO )
+		dt_p = bli_dt_proj_to_real( dt_p );
+
 	// Update the storage datatype of P to be the target datatype of A.
-	bli_obj_set_dt( dt_tar, p );
-	bli_obj_set_elem_size( bli_dt_size( dt_tar ), p );
+	bli_obj_set_dt( dt_p, p );
+	bli_obj_set_elem_size( bli_dt_size( dt_p ), p );
 
 	// Store the pack schema to the object.
 	bli_obj_set_pack_schema( schema, p );
@@ -108,8 +109,8 @@ bool bli_packm_init
 	// level-2 operations, but that's okay with us.
 	dim_t m_p     = bli_obj_length( p );
 	dim_t n_p     = bli_obj_width( p );
-	dim_t m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def );
-	dim_t n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def );
+	dim_t m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def, true );
+	dim_t n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def, true );
 
 	// Save the padded dimensions into the packed object. It is important
 	// to save these dimensions since they represent the actual dimensions
@@ -170,17 +171,6 @@ bool bli_packm_init
 
 	// Compute the size of the packed buffer.
 	siz_t size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p;
-
-	// If the requested size is zero, then we don't need to do any allocation.
-	if ( size_p == 0 )
-		return false;
-
-	// Update the buffer address in p to point to the buffer associated
-	// with the mem_t entry acquired from the memory broker (now cached in
-	// the control tree node).
-	void* buffer = bli_packm_alloc( size_p, cntl, thread );
-	bli_obj_set_buffer( buffer, p );
-
-	return true;
+	return size_p;
 }
 
diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h
index b34bd5379..b3258bf57 100644
--- a/frame/1m/packm/bli_packm_init.h
+++ b/frame/1m/packm/bli_packm_init.h
@@ -32,12 +32,11 @@
 
 */
 
-BLIS_EXPORT_BLIS bool bli_packm_init
+BLIS_EXPORT_BLIS siz_t bli_packm_init
      (
+             num_t   dt_p,
        const obj_t*  a,
              obj_t*  p,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
-             thrinfo_t* thread
+       const cntl_t* cntl
      );
 
diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c
index 49d5a49a3..8dac1174f 100644
--- a/frame/1m/packm/bli_packm_int.c
+++ b/frame/1m/packm/bli_packm_int.c
@@ -45,31 +45,18 @@ void bli_packm_int
 {
 	bli_init_once();
 
-	// Extract the function pointer from the object.
-	packm_var_oft f = bli_obj_pack_fn( a );
-
 	// Barrier so that we know threads are done with previous computation
 	// with the same packing buffer before starting to pack.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
 	bli_thrinfo_barrier( thread );
 
-	// Invoke the packm variant.
-	// NOTE: The packing kernel uses two communicators: one which represents a
-	// single workgroup of many threads, and one which represents a group of
-	// many single-member workgroups. The former communicator is used for
-	// barriers and thread communication (i.e. broadcasting the pack buffer
-	// pointer), while the latter communicator is used for partitioning work.
-	// This is because all of the thread range functions rely on the work_id
-	// and number of workgroups (n_way). Thus, we pass along the parent
-	// thrinfo_t node which has these two communicators as the sub-node and
-	// sub-prenode, respectively.
-	f
+	bli_packm_cntl_variant( cntl )
 	(
 	  a,
 	  p,
 	  cntx,
 	  cntl,
-	  thread_par
+	  thread
 	);
 
 	// Barrier so that packing is done before computation.
diff --git a/frame/1m/packm/bli_packm_part.c b/frame/1m/packm/bli_packm_part.c
deleted file mode 100644
index feaaaeea8..000000000
--- a/frame/1m/packm/bli_packm_part.c
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-
-// -- Matrix partitioning ------------------------------------------------------
-
-
-void bli_packm_acquire_mpart_t2b( subpart_t    requested_part,
-                                  dim_t        i,
-                                  dim_t        b,
-                                  const obj_t* obj,
-                                        obj_t* sub_obj )
-{
-	dim_t m, n;
-
-	// For now, we only support acquiring the middle subpartition.
-	if ( requested_part != BLIS_SUBPART1 )
-	{
-		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
-	}
-
-	// Partitioning top-to-bottom through packed column panels (which are
-	// row-stored) is not yet supported.
-	if ( bli_obj_is_col_packed( obj ) )
-	{
-		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
-	}
-
-	// Query the dimensions of the parent object.
-	m = bli_obj_length( obj );
-	n = bli_obj_width( obj );
-
-	// Foolproofing: do not let b exceed what's left of the m dimension at
-	// row offset i.
-	if ( b > m - i ) b = m - i;
-
-	// Begin by copying the info, elem size, buffer, row stride, and column
-	// stride fields of the parent object. Note that this omits copying view
-	// information because the new partition will have its own dimensions
-	// and offsets.
-	bli_obj_init_subpart_from( obj, sub_obj );
-
-	// Modify offsets and dimensions of requested partition.
-	bli_obj_set_dims( b, n, sub_obj );
-
-	// Tweak the padded length of the subpartition to trick the underlying
-	// implementation into only zero-padding for the narrow submatrix of
-	// interest. Usually, the value we want is b (for non-edge cases), but
-	// at the edges, we want the remainder of the mem_t region in the m
-	// dimension. Edge cases are defined as occurring when i + b is exactly
-	// equal to the inherited sub-object's length (which happens since the
-	// determine_blocksize function would have returned a smaller value of
-	// b for the edge iteration). In these cases, we arrive at the new
-	// packed length by simply subtracting off i.
-	{
-		dim_t  m_pack_max = bli_obj_padded_length( sub_obj );
-		dim_t  m_pack_cur;
-
-		if ( i + b == m ) m_pack_cur = m_pack_max - i;
-		else              m_pack_cur = b;
-
-		bli_obj_set_padded_length( m_pack_cur, sub_obj );
-	}
-
-	// Translate the desired offsets to a panel offset and adjust the
-	// buffer pointer of the subpartition object.
-	{
-		char* buf_p        = bli_obj_buffer( sub_obj );
-		siz_t elem_size    = bli_obj_elem_size( sub_obj );
-		dim_t off_to_panel = bli_packm_offset_to_panel_for( i, sub_obj );
-
-		buf_p = buf_p + elem_size * off_to_panel;
-
-		bli_obj_set_buffer( buf_p, sub_obj );
-	}
-}
-
-
-
-void bli_packm_acquire_mpart_l2r( subpart_t    requested_part,
-                                  dim_t        j,
-                                  dim_t        b,
-                                  const obj_t* obj,
-                                        obj_t* sub_obj )
-{
-	dim_t m, n;
-
-	// Check parameters.
-	//if ( bli_error_checking_is_enabled() )
-	//	bli_packm_acquire_mpart_l2r_check( requested_part, j, b, obj, sub_obj );
-
-	// For now, we only support acquiring the middle subpartition.
-	if ( requested_part != BLIS_SUBPART1 )
-	{
-		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
-	}
-
-	// Partitioning left-to-right through packed row panels (which are
-	// column-stored) is not yet supported.
-	if ( bli_obj_is_row_packed( obj ) )
-	{
-		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
-	}
-
-	// Query the dimensions of the parent object.
-	m = bli_obj_length( obj );
-	n = bli_obj_width( obj );
-
-	// Foolproofing: do not let b exceed what's left of the n dimension at
-	// column offset j.
-	if ( b > n - j ) b = n - j;
-
-	// Begin by copying the info, elem size, buffer, row stride, and column
-	// stride fields of the parent object. Note that this omits copying view
-	// information because the new partition will have its own dimensions
-	// and offsets.
-	bli_obj_init_subpart_from( obj, sub_obj );
-
-	// Modify offsets and dimensions of requested partition.
-	bli_obj_set_dims( m, b, sub_obj );
-
-	// Tweak the padded width of the subpartition to trick the underlying
-	// implementation into only zero-padding for the narrow submatrix of
-	// interest. Usually, the value we want is b (for non-edge cases), but
-	// at the edges, we want the remainder of the mem_t region in the n
-	// dimension. Edge cases are defined as occurring when j + b is exactly
-	// equal to the inherited sub-object's width (which happens since the
-	// determine_blocksize function would have returned a smaller value of
-	// b for the edge iteration). In these cases, we arrive at the new
-	// packed width by simply subtracting off j.
-	{
-		dim_t  n_pack_max = bli_obj_padded_width( sub_obj );
-		dim_t  n_pack_cur;
-
-		if ( j + b == n ) n_pack_cur = n_pack_max - j;
-		else              n_pack_cur = b;
-
-		bli_obj_set_padded_width( n_pack_cur, sub_obj );
-	}
-
-	// Translate the desired offsets to a panel offset and adjust the
-	// buffer pointer of the subpartition object.
-	{
-		char* buf_p        = bli_obj_buffer( sub_obj );
-		siz_t elem_size    = bli_obj_elem_size( sub_obj );
-		dim_t off_to_panel = bli_packm_offset_to_panel_for( j, sub_obj );
-
-		buf_p = buf_p + elem_size * off_to_panel;
-
-		bli_obj_set_buffer( buf_p, sub_obj );
-	}
-}
-
-
-
-void bli_packm_acquire_mpart_tl2br( subpart_t    requested_part,
-                                    dim_t        ij,
-                                    dim_t        b,
-                                    const obj_t* obj,
-                                          obj_t* sub_obj )
-{
-	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
-}
-
-
-
-dim_t bli_packm_offset_to_panel_for( dim_t offmn, const obj_t* p )
-{
-	dim_t panel_off;
-
-	if      ( bli_obj_pack_schema( p ) == BLIS_PACKED_ROWS )
-	{
-		// For the "packed rows" schema, a single row is effectively one
-		// row panel, and so we use the row offset as the panel offset.
-		// Then we multiply this offset by the effective panel stride
-		// (ie: the row stride) to arrive at the desired offset.
-		panel_off = offmn * bli_obj_row_stride( p );
-	}
-	else if ( bli_obj_pack_schema( p ) == BLIS_PACKED_COLUMNS )
-	{
-		// For the "packed columns" schema, a single column is effectively one
-		// column panel, and so we use the column offset as the panel offset.
-		// Then we multiply this offset by the effective panel stride
-		// (ie: the column stride) to arrive at the desired offset.
-		panel_off = offmn * bli_obj_col_stride( p );
-	}
-	else if ( bli_obj_pack_schema( p ) == BLIS_PACKED_ROW_PANELS )
-	{
-		// For the "packed row panels" schema, the column stride is equal to
-		// the panel dimension (length). So we can divide it into offmn
-		// (interpreted as a row offset) to arrive at a panel offset. Then
-		// we multiply this offset by the panel stride to arrive at the total
-		// offset to the panel (in units of elements).
-		panel_off = offmn / bli_obj_col_stride( p );
-		panel_off = panel_off * bli_obj_panel_stride( p );
-
-		// Sanity check.
-		if ( offmn % bli_obj_col_stride( p ) > 0 ) bli_abort();
-	}
-	else if ( bli_obj_pack_schema( p ) == BLIS_PACKED_COL_PANELS )
-	{
-		// For the "packed column panels" schema, the row stride is equal to
-		// the panel dimension (width). So we can divide it into offmn
-		// (interpreted as a column offset) to arrive at a panel offset. Then
-		// we multiply this offset by the panel stride to arrive at the total
-		// offset to the panel (in units of elements).
-		panel_off = offmn / bli_obj_row_stride( p );
-		panel_off = panel_off * bli_obj_panel_stride( p );
-
-		// Sanity check.
-		if ( offmn % bli_obj_row_stride( p ) > 0 ) bli_abort();
-	}
-	else
-	{
-		panel_off = 0;
-		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
-	}
-
-	return panel_off;
-}
diff --git a/frame/1m/packm/bli_packm_part.h b/frame/1m/packm/bli_packm_part.h
deleted file mode 100644
index 39ee69a2c..000000000
--- a/frame/1m/packm/bli_packm_part.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-// -- Matrix partitioning ------------------------------------------------------
-
-void bli_packm_acquire_mpart_t2b( subpart_t    requested_part,
-                                  dim_t        i,
-                                  dim_t        b,
-                                  const obj_t* obj,
-                                        obj_t* sub_obj );
-
-void bli_packm_acquire_mpart_l2r( subpart_t    requested_part,
-                                  dim_t        j,
-                                  dim_t        b,
-                                  const obj_t* obj,
-                                        obj_t* sub_obj );
-
-void bli_packm_acquire_mpart_tl2br( subpart_t    requested_part,
-                                    dim_t        ij,
-                                    dim_t        b,
-                                    const obj_t* obj,
-                                          obj_t* sub_obj );
-
-dim_t bli_packm_offset_to_panel_for( dim_t offmn, const obj_t* p );
-
diff --git a/frame/1m/packm/bli_packm_scalar.c b/frame/1m/packm/bli_packm_scalar.c
index f613028c9..484319385 100644
--- a/frame/1m/packm/bli_packm_scalar.c
+++ b/frame/1m/packm/bli_packm_scalar.c
@@ -37,18 +37,16 @@
 
 void* bli_packm_scalar( obj_t* kappa, obj_t* p )
 {
-	num_t  dt_p   = bli_obj_dt( p );
-	pack_t schema = bli_obj_pack_schema( p );
+	num_t dt_p = bli_obj_dt( p );
 
 	// The value for kappa we use will depends on whether the scalar
 	// attached to A has a nonzero imaginary component. If it does,
-	// then we will apply the scalar during packing to facilitate
-	// implementing induced complex domain algorithms in terms of
-	// real domain micro-kernels. (In the aforementioned situation,
-	// applying a real scalar is easy, but applying a complex one is
-	// harder, so we avoid the need altogether with the code below.)
+	// and the matrix to pack is complex, then we apply the scalar now
+	// because we may not have a chance later due to using real-domain
+	// microkernels.
 	if ( bli_obj_scalar_has_nonzero_imag( p ) &&
-	     !bli_is_nat_packed( schema ) )
+	     ( bli_obj_is_complex( p ) ||
+	       bli_obj_pack_schema( p ) == BLIS_PACKED_PANELS_RO ) )
 	{
 		//printf( "applying non-zero imag kappa\n_p" );
 
@@ -60,11 +58,6 @@ void* bli_packm_scalar( obj_t* kappa, obj_t* p )
 
 		return bli_obj_buffer_for_1x1( dt_p, kappa );
 	}
-	// This branch is also for native execution, where we assume that
-	// the micro-kernel will always apply the alpha scalar of the
-	// higher-level operation. Thus, we use BLIS_ONE for kappa so
-	// that the underlying packm implementation does not perform
-	// any scaling during packing.
 	else
 	{
 		// If the internal scalar of A has only a real component, then
diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c
index c8571d24e..74f9de8f8 100644
--- a/frame/1m/packm/bli_packm_struc_cxk.c
+++ b/frame/1m/packm/bli_packm_struc_cxk.c
@@ -34,54 +34,92 @@
 
 #include "blis.h"
 
-#undef  GENTFUNCR
-#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, cxk_kername, cxc_kername ) \
+#undef  GENTFUNC2RO
+#define GENTFUNC2RO( ctypec_r, ctype_c, ctypep_r, ctypep, chc_r, chc, chp_r, chp, varname ) \
+GENTFUNC2RO_( ctypec_r, ctypec_r, ctypep_r, ctypep_r, chc_r, chc_r, chp_r, chp_r, varname ) \
+GENTFUNC2RO_( ctypec_r, ctypec,   ctypep_r, ctypep,   chc_r, chc,   chp_r, chp,   varname )
+
+#undef  GENTFUNC2RO_
+#define GENTFUNC2RO_( ctypec_r, ctype_c, ctypep_r, ctypep, chc_r, chc, chp_r, chp, varname ) \
 \
-void PASTEMAC(ch,varname) \
+void PASTEMAC(chc,chp,varname) \
      ( \
-       struc_t strucc, \
-       diag_t  diagc, \
-       uplo_t  uploc, \
-       conj_t  conjc, \
-       pack_t  schema, \
-       bool    invdiag, \
-       dim_t   panel_dim, \
-       dim_t   panel_len, \
-       dim_t   panel_dim_max, \
-       dim_t   panel_len_max, \
-       dim_t   panel_dim_off, \
-       dim_t   panel_len_off, \
-       ctype*  kappa, \
-       ctype*  c, inc_t incc, inc_t ldc, \
-       ctype*  p,             inc_t ldp, \
-                  inc_t is_p, \
-       void*   params, \
-       cntx_t* cntx  \
+             struc_t strucc, \
+             diag_t  diagc, \
+             uplo_t  uploc, \
+             conj_t  conjc, \
+             pack_t  schema, \
+             bool    invdiag, \
+             dim_t   panel_dim, \
+             dim_t   panel_len, \
+             dim_t   panel_dim_max, \
+             dim_t   panel_len_max, \
+             dim_t   panel_dim_off, \
+             dim_t   panel_len_off, \
+             dim_t   panel_bcast, \
+       const void*   kappa, \
+       const void*   c, inc_t incc, inc_t ldc, \
+             void*   p,             inc_t ldp, \
+       const void*   params, \
+       const cntx_t* cntx \
      ) \
 { \
-	num_t   dt            = PASTEMAC(ch,type); \
-	num_t   dt_r          = PASTEMAC(chr,type); \
-	dim_t   panel_len_pad = panel_len_max - panel_len; \
+	num_t dt_c          = PASTEMAC(chc,type); \
+	num_t dt_p          = PASTEMAC(chp,type); \
+	num_t dt_p0         = dt_p; \
+\
+	/* Always do pointer arithmetic in the real domain so that we
+	   can cleanly handle the real-only packing case. */ \
+	inc_t incc_r        = incc; \
+	inc_t ldc_r         = ldc; \
+	inc_t ldp_r         = ldp; \
+\
+	if ( bli_is_complex( dt_c ) ) \
+	{ \
+		incc_r *= 2; \
+		ldc_r *= 2; \
+		ldp_r *= 2; \
+	} \
 \
-	bszid_t bsz_id        = bli_is_col_packed( schema ) ? BLIS_NR : BLIS_MR; \
-	dim_t   packmrnr      = bli_cntx_get_blksz_max_dt( dt, bsz_id, cntx ); \
-	dim_t   packmrnr_r    = bli_cntx_get_blksz_max_dt( dt_r, bsz_id, cntx ); \
+	dim_t panel_len_pad = panel_len_max - panel_len; \
 \
-	ukr_t   cxk_ker_id    = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \
-	                                                    : BLIS_PACKM_MRXK_KER; \
-	ukr_t   cxc_ker_id    = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_KER \
-	                                                    : BLIS_PACKM_MRXMR_DIAG_KER; \
+	ukr_t cxk_ker_id    = BLIS_PACKM_KER; \
+	ukr_t cxc_ker_id    = BLIS_PACKM_DIAG_KER; \
 \
 	if ( bli_is_1m_packed( schema ) ) \
 	{ \
-		cxk_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_1ER_KER \
-		                                         : BLIS_PACKM_MRXK_1ER_KER; \
-		cxc_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_1ER_KER \
-		                                         : BLIS_PACKM_MRXMR_DIAG_1ER_KER; \
+		cxk_ker_id = BLIS_PACKM_1ER_KER; \
+		cxc_ker_id = BLIS_PACKM_DIAG_1ER_KER; \
+	} \
+	else if ( bli_is_ro_packed( schema ) ) \
+	{ \
+		ctypep_r kappa_r, kappa_i; \
+		( void )kappa_r; \
+		PASTEMAC(chp,gets)( *( ctypep* )kappa, kappa_r, kappa_i ); \
+		if ( PASTEMAC(chp_r,eq0)( kappa_i ) ) \
+		{ \
+			/* Treat the matrix as real with doubled strides. */ \
+			dt_c = bli_dt_proj_to_real( dt_c ); \
+			dt_p = bli_dt_proj_to_real( dt_p ); \
+			incc *= 2; \
+			ldc *= 2; \
+			schema = BLIS_PACKED_PANELS; \
+		} \
+		else \
+		{ \
+			cxk_ker_id = BLIS_PACKM_RO_KER; \
+			cxc_ker_id = BLIS_PACKM_DIAG_RO_KER; \
+		} \
+\
+		/* Make sure that P is treated as a real matrix. */ \
+		ldp_r /= 2; \
+		dt_p0 = bli_dt_proj_to_real( dt_p ); \
 	} \
 \
-	PASTECH(cxk_kername,_ker_ft) f_cxk = bli_cntx_get_ukr_dt( dt, cxk_ker_id, cntx ); \
-	PASTECH(cxc_kername,_ker_ft) f_cxc = bli_cntx_get_ukr_dt( dt, cxc_ker_id, cntx ); \
+	const void*           zero   = bli_obj_buffer_for_const( dt_p0, &BLIS_ZERO ); \
+	setv_ker_ft           f_setv = bli_cntx_get_ukr_dt( dt_p0, BLIS_SETV_KER, cntx ); \
+	packm_cxk_ker_ft      f_cxk  = bli_cntx_get_ukr2_dt( dt_c, dt_p, cxk_ker_id, cntx ); \
+	packm_cxc_diag_ker_ft f_cxc  = bli_cntx_get_ukr2_dt( dt_c, dt_p, cxc_ker_id, cntx ); \
 \
 	/* For general matrices, pack and return early */ \
 	if ( bli_is_general( strucc ) ) \
@@ -91,12 +129,15 @@ void PASTEMAC(ch,varname) \
 		  conjc, \
 		  schema, \
 		  panel_dim, \
+		  panel_dim_max, \
+		  panel_bcast, \
 		  panel_len, \
 		  panel_len_max, \
 		  kappa, \
 		  c, incc, ldc, \
 		  p,       ldp, \
-		  cntx  \
+		  params, \
+		  cntx \
 		); \
 		return; \
 	} \
@@ -116,18 +157,20 @@ void PASTEMAC(ch,varname) \
 	/* Pack to p10. */ \
 	if ( 0 < diagoffc ) \
 	{ \
-		dim_t  p10_dim     = panel_dim; \
-		dim_t  p10_len     = bli_min( diagoffc, panel_len ); \
-		dim_t  p10_len_max = p10_len == panel_len ? panel_len_max : p10_len; \
-		ctype* p10         = p; \
-		conj_t conjc10     = conjc; \
-		ctype* c10         = c; \
-		inc_t  incc10      = incc; \
-		inc_t  ldc10       = ldc; \
+		dim_t     p10_len     = bli_min( diagoffc, panel_len ); \
+		dim_t     p10_len_max = p10_len == panel_len ? panel_len_max : p10_len; \
+		ctypep_r* p10         = ( ctypep_r* )p; \
+		conj_t    conjc10     = conjc; \
+		ctypec_r* c10         = ( ctypec_r* )c; \
+		inc_t     incc10_r    = incc_r; \
+		inc_t     ldc10_r     = ldc_r; \
+		inc_t     incc10      = incc; \
+		inc_t     ldc10       = ldc; \
 \
 		if ( bli_is_upper( uploc ) ) \
 		{ \
-			bli_reflect_to_stored_part( diagoffc, c10, incc10, ldc10 ); \
+			bli_reflect_to_stored_part( diagoffc, c10, incc10_r, ldc10_r ); \
+			bli_swap_incs(&incc10, &ldc10); \
 \
 			if ( bli_is_hermitian( strucc ) ) \
 				bli_toggle_conj( &conjc10 ); \
@@ -137,42 +180,14 @@ void PASTEMAC(ch,varname) \
 		   explicitly store zeros */ \
 		if ( bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) \
 		{ \
-			if ( bli_is_1m_packed( schema ) ) \
-			{ \
-				ctype_r* restrict zero = PASTEMAC(chr,0); \
-\
-				PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \
-				( \
-				  BLIS_NO_CONJUGATE, \
-				  0, \
-				  BLIS_NONUNIT_DIAG, \
-				  BLIS_DENSE, \
-				  packmrnr_r, \
-				  p10_len_max * 2, \
-				  zero, \
-				  ( ctype_r* )p10, 1, ldp, \
-				  cntx, \
-				  NULL  \
-				); \
-			} \
-			else \
-			{ \
-				ctype* restrict zero = PASTEMAC(ch,0); \
-\
-				PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
-				( \
-				  BLIS_NO_CONJUGATE, \
-				  0, \
-				  BLIS_NONUNIT_DIAG, \
-				  BLIS_DENSE, \
-				  packmrnr, \
-				  p10_len_max, \
-				  zero, \
-				  p10, 1, ldp, \
-				  cntx, \
-				  NULL  \
-				); \
-			} \
+			f_setv \
+			( \
+			  BLIS_NO_CONJUGATE, \
+			  ldp * p10_len_max, \
+			  zero, \
+			  p10, 1, \
+			  cntx \
+			); \
 		} \
 		else \
 		{ \
@@ -180,13 +195,16 @@ void PASTEMAC(ch,varname) \
 			( \
 			  conjc10, \
 			  schema, \
-			  p10_dim, \
+			  panel_dim, \
+			  panel_dim_max, \
+			  panel_bcast, \
 			  p10_len, \
 			  p10_len_max, \
 			  kappa, \
 			  c10, incc10, ldc10, \
 			  p10,         ldp, \
-			  cntx  \
+			  params, \
+			  cntx \
 			); \
 		} \
 	} \
@@ -194,15 +212,14 @@ void PASTEMAC(ch,varname) \
 	/* Pack to p11. */ \
 	if ( 0 <= diagoffc && diagoffc + panel_dim <= panel_len ) \
 	{ \
-		dim_t  i           = diagoffc; \
-		dim_t  p11_dim     = panel_dim; \
-		dim_t  p11_len_max = panel_dim + ( diagoffc + panel_dim == panel_len \
+		dim_t     i           = diagoffc; \
+		dim_t     p11_len_max = panel_dim + ( diagoffc + panel_dim == panel_len \
 		                                   ? panel_len_pad : 0 ); \
-		ctype* p11         = p + i * ldp; \
-		conj_t conjc11     = conjc; \
-		ctype* c11         = c + i * ldc; \
-		inc_t  incc11      = incc; \
-		inc_t  ldc11       = ldc; \
+		ctypep_r* p11         = ( ctypep_r* )p + i * ldp_r; \
+		conj_t    conjc11     = conjc; \
+		ctypec_r* c11         = ( ctypec_r* )c + i * ldc_r; \
+		inc_t     incc11      = incc; \
+		inc_t     ldc11       = ldc; \
 \
 		f_cxc \
 		( \
@@ -212,33 +229,38 @@ void PASTEMAC(ch,varname) \
 		  conjc11, \
 		  schema, \
 		  invdiag, \
-		  p11_dim, \
+		  panel_dim, \
+		  panel_dim_max, \
+		  panel_bcast, \
 		  p11_len_max, \
 		  kappa, \
 		  c11, incc11, ldc11, \
 		  p11,         ldp, \
-		  cntx  \
+		  params, \
+		  cntx \
 		); \
 	} \
 \
 	/* Pack to p12. */ \
 	if ( diagoffc + panel_dim < panel_len ) \
 	{ \
-		dim_t  i           = bli_max( 0, diagoffc + panel_dim ); \
-		dim_t  p12_dim     = panel_dim; \
-		dim_t  p12_len     = panel_len - i; \
-		/* If we are packing p12, then it is always the last partial block \
+		dim_t     i           = bli_max( 0, diagoffc + panel_dim ); \
+		dim_t     p12_len     = panel_len - i; \
+		/* If we are packing p12, then it is always the last partial block
 		   and so we should make sure to pad with zeros if necessary. */ \
-		dim_t  p12_len_max = p12_len + panel_len_pad; \
-		ctype* p12         = p + i * ldp; \
-		conj_t conjc12     = conjc; \
-		ctype* c12         = c + i * ldc; \
-		inc_t  incc12      = incc; \
-		inc_t  ldc12       = ldc; \
+		dim_t     p12_len_max = p12_len + panel_len_pad; \
+		ctypep_r* p12         = ( ctypep_r* )p + i * ldp_r; \
+		conj_t    conjc12     = conjc; \
+		ctypec_r* c12         = ( ctypec_r* )c + i * ldc_r; \
+		inc_t     incc12_r    = incc_r; \
+		inc_t     ldc12_r     = ldc_r; \
+		inc_t     incc12      = incc; \
+		inc_t     ldc12       = ldc; \
 \
 		if ( bli_is_lower( uploc ) ) \
 		{ \
-			bli_reflect_to_stored_part( diagoffc - i, c12, incc12, ldc12 ); \
+			bli_reflect_to_stored_part( diagoffc - i, c12, incc12_r, ldc12_r ); \
+			bli_swap_incs(&incc12, &ldc12); \
 \
 			if ( bli_is_hermitian( strucc ) ) \
 				bli_toggle_conj( &conjc12 ); \
@@ -248,42 +270,14 @@ void PASTEMAC(ch,varname) \
 		   explicitly store zeros */ \
 		if ( bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) \
 		{ \
-			if ( bli_is_1m_packed( schema ) ) \
-			{ \
-			    ctype_r* restrict zero = PASTEMAC(chr,0); \
-\
-				PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \
-				( \
-				  BLIS_NO_CONJUGATE, \
-				  0, \
-				  BLIS_NONUNIT_DIAG, \
-				  BLIS_DENSE, \
-				  packmrnr_r, \
-				  p12_len_max * 2, \
-				  zero, \
-				  ( ctype_r* )p12, 1, ldp, \
-				  cntx, \
-				  NULL  \
-				); \
-			} \
-			else \
-			{ \
-				ctype* restrict zero = PASTEMAC(ch,0); \
-\
-				PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
-				( \
-				  BLIS_NO_CONJUGATE, \
-				  0, \
-				  BLIS_NONUNIT_DIAG, \
-				  BLIS_DENSE, \
-				  packmrnr, \
-				  p12_len_max, \
-				  zero, \
-				  p12, 1, ldp, \
-				  cntx, \
-				  NULL  \
-				); \
-			} \
+			f_setv \
+			( \
+			  BLIS_NO_CONJUGATE, \
+			  ldp * p12_len_max, \
+			  zero, \
+			  p12, 1, \
+			  cntx \
+			); \
 		} \
 		else \
 		{ \
@@ -291,17 +285,21 @@ void PASTEMAC(ch,varname) \
 			( \
 			  conjc12, \
 			  schema, \
-			  p12_dim, \
+			  panel_dim, \
+			  panel_dim_max, \
+			  panel_bcast, \
 			  p12_len, \
 			  p12_len_max, \
 			  kappa, \
 			  c12, incc12, ldc12, \
 			  p12,         ldp, \
-			  cntx  \
+			  params, \
+			  cntx \
 			); \
 		} \
 	} \
 }
 
-INSERT_GENTFUNCR_BASIC( packm_struc_cxk, packm_cxk, packm_cxc_diag )
+INSERT_GENTFUNC2RO( packm_struc_cxk )
+INSERT_GENTFUNC2RO_MIX_P( packm_struc_cxk )
 
diff --git a/frame/1m/packm/bli_packm_struc_cxk.h b/frame/1m/packm/bli_packm_struc_cxk.h
index 129a4d018..2e66d87a6 100644
--- a/frame/1m/packm/bli_packm_struc_cxk.h
+++ b/frame/1m/packm/bli_packm_struc_cxk.h
@@ -32,30 +32,31 @@
 
 */
 
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
+#undef  GENTPROT2
+#define GENTPROT2( ctypec, ctypep, chc, chp, varname ) \
 \
-void PASTEMAC(ch,varname) \
+BLIS_EXPORT_BLIS void PASTEMAC(chc,chp,varname) \
      ( \
-       struc_t strucc, \
-       diag_t  diagc, \
-       uplo_t  uploc, \
-       conj_t  conjc, \
-       pack_t  schema, \
-       bool    invdiag, \
-       dim_t   panel_dim, \
-       dim_t   panel_len, \
-       dim_t   panel_dim_max, \
-       dim_t   panel_len_max, \
-       dim_t   panel_dim_off, \
-       dim_t   panel_len_off, \
-       ctype*  kappa, \
-       ctype*  c, inc_t incc, inc_t ldc, \
-       ctype*  p,             inc_t ldp, \
-                  inc_t is_p, \
-       void*   params, \
-       cntx_t* cntx  \
+             struc_t strucc, \
+             diag_t  diagc, \
+             uplo_t  uploc, \
+             conj_t  conjc, \
+             pack_t  schema, \
+             bool    invdiag, \
+             dim_t   panel_dim, \
+             dim_t   panel_len, \
+             dim_t   panel_dim_max, \
+             dim_t   panel_len_max, \
+             dim_t   panel_dim_off, \
+             dim_t   panel_len_off, \
+             dim_t   panel_bcast, \
+       const void*   kappa, \
+       const void*   c, inc_t incc, inc_t ldc, \
+             void*   p,             inc_t ldp, \
+       const void*   params, \
+       const cntx_t* cntx \
      );
 
-INSERT_GENTPROT_BASIC( packm_struc_cxk )
+INSERT_GENTPROT2_BASIC( packm_struc_cxk )
+INSERT_GENTPROT2_MIX_P( packm_struc_cxk )
 
diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.c b/frame/1m/packm/bli_packm_struc_cxk_md.c
deleted file mode 100644
index 51212040e..000000000
--- a/frame/1m/packm/bli_packm_struc_cxk_md.c
+++ /dev/null
@@ -1,523 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#ifdef BLIS_ENABLE_GEMM_MD
-
-#undef  GENTFUNC2
-#define GENTFUNC2( ctype_c, ctype_p, chc, chp, varname ) \
-\
-void PASTEMAC2(chc,chp,varname) \
-     ( \
-       struc_t  strucc, \
-       diag_t   diagc, \
-       uplo_t   uploc, \
-       conj_t   conjc, \
-       pack_t   schema, \
-       bool     invdiag, \
-       dim_t    panel_dim, \
-       dim_t    panel_len, \
-       dim_t    panel_dim_max, \
-       dim_t    panel_len_max, \
-       dim_t    panel_dim_off, \
-       dim_t    panel_len_off, \
-       ctype_p* kappa, \
-       ctype_c* c, inc_t incc, inc_t ldc, \
-       ctype_p* p,             inc_t ldp, \
-                   inc_t is_p, \
-       void*    params, \
-       cntx_t*  cntx  \
-     ) \
-{ \
-	if ( bli_is_nat_packed( schema ) ) \
-	{ \
-		/* Sanity check: Make sure that kappa is 1.0. Mixed-datatype alpha
-		   values are never handled when packing for native execution;
-		   instead, they are passed along to the micro-kernel. */ \
-		if ( !PASTEMAC(chp,eq1)( *kappa ) ) \
-			bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
-\
-		/* Treat the micro-panel as panel_dim x panel_len and column-stored
-		   (unit row stride). */ \
-\
-		/* NOTE: We ignore kappa for now, since it should be 1.0. */ \
-		PASTEMAC2(chc,chp,castm) \
-		( \
-		  ( trans_t )conjc, \
-		  panel_dim, \
-		  panel_len, \
-		  c, incc, ldc, \
-		  p,    1, ldp  \
-		); \
-\
-		/* If panel_dim < panel_dim_max, then we zero those unused rows. */ \
-		if ( panel_dim < panel_dim_max ) \
-		{ \
-			ctype_p* restrict zero   = PASTEMAC(chp,0); \
-			const dim_t       i      = panel_dim; \
-			const dim_t       m_edge = panel_dim_max - i; \
-			const dim_t       n_edge = panel_len_max; \
-			ctype_p*          p_edge = p + (i  )*1; \
-\
-			PASTEMAC2(chp,setm,BLIS_TAPI_EX_SUF) \
-			( \
-			  BLIS_NO_CONJUGATE, \
-			  0, \
-			  BLIS_NONUNIT_DIAG, \
-			  BLIS_DENSE, \
-			  m_edge, \
-			  n_edge, \
-			  zero, \
-			  p_edge, 1, ldp, \
-			  cntx, \
-			  NULL  \
-			); \
-		} \
-\
-		/* If panel_len < panel_len_max, then we zero those unused columns. */ \
-		if ( panel_len < panel_len_max ) \
-		{ \
-			ctype_p* restrict zero   = PASTEMAC(chp,0); \
-			const dim_t       j      = panel_len; \
-			const dim_t       m_edge = panel_dim_max; \
-			const dim_t       n_edge = panel_len_max - j; \
-			ctype_p*          p_edge = p + (j  )*ldp; \
-\
-			PASTEMAC2(chp,setm,BLIS_TAPI_EX_SUF) \
-			( \
-			  BLIS_NO_CONJUGATE, \
-			  0, \
-			  BLIS_NONUNIT_DIAG, \
-			  BLIS_DENSE, \
-			  m_edge, \
-			  n_edge, \
-			  zero, \
-			  p_edge, 1, ldp, \
-			  cntx, \
-			  NULL  \
-			); \
-		} \
-	} \
-	else if ( bli_is_1r_packed( schema ) ) \
-	{ \
-		/* Treat the micro-panel as panel_dim x panel_len and column-stored
-		   (unit row stride). */ \
-\
-		PASTEMAC2(chc,chp,packm_cxk_1r_md) \
-		( \
-		  conjc, \
-		  panel_dim, \
-		  panel_len, \
-		  kappa, \
-		  c, incc, ldc, \
-		  p,       ldp  \
-		); \
-\
-		/* If panel_dim < panel_dim_max, then we zero those unused rows. */ \
-		if ( panel_dim < panel_dim_max ) \
-		{ \
-			ctype_p* restrict zero   = PASTEMAC(chp,0); \
-			const dim_t       offm   = panel_dim; \
-			const dim_t       offn   = 0; \
-			const dim_t       m_edge = panel_dim_max - panel_dim; \
-			const dim_t       n_edge = panel_len_max; \
-\
-			( void ) zero; \
-			( void ) m_edge; ( void )offm; \
-			( void ) n_edge; ( void )offn; \
-\
-			PASTEMAC(chp,set1ms_mxn) \
-			( \
-			  schema, \
-			  offm, \
-			  offn, \
-			  m_edge, \
-			  n_edge, \
-			  zero, \
-			  p, 1, ldp, ldp  \
-			); \
-		} \
-\
-		/* If panel_len < panel_len_max, then we zero those unused columns. */ \
-		if ( panel_len < panel_len_max ) \
-		{ \
-			ctype_p* restrict zero   = PASTEMAC(chp,0); \
-			const dim_t       offm   = 0; \
-			const dim_t       offn   = panel_len; \
-			const dim_t       m_edge = panel_dim_max; \
-			const dim_t       n_edge = panel_len_max - panel_len; \
-\
-			( void ) zero; \
-			( void ) m_edge; ( void )offm; \
-			( void ) n_edge; ( void )offn; \
-\
-			PASTEMAC(chp,set1ms_mxn) \
-			( \
-			  schema, \
-			  offm, \
-			  offn, \
-			  m_edge, \
-			  n_edge, \
-			  zero, \
-			  p, 1, ldp, ldp  \
-			); \
-		} \
-	} \
-	else if ( bli_is_1e_packed( schema ) ) \
-	{ \
-		/* Treat the micro-panel as panel_dim x panel_len and column-stored
-		   (unit row stride). */ \
-\
-		PASTEMAC2(chc,chp,packm_cxk_1e_md) \
-		( \
-		  conjc, \
-		  panel_dim, \
-		  panel_len, \
-		  kappa, \
-		  c, incc, ldc, \
-		  p,       ldp  \
-		); \
-\
-		/* If panel_dim < panel_dim_max, then we zero those unused rows. */ \
-		if ( panel_dim < panel_dim_max ) \
-		{ \
-			ctype_p* restrict zero   = PASTEMAC(chp,0); \
-			const dim_t       offm   = panel_dim; \
-			const dim_t       offn   = 0; \
-			const dim_t       m_edge = panel_dim_max - panel_dim; \
-			const dim_t       n_edge = panel_len_max; \
-\
-			( void ) zero; \
-			( void ) m_edge; ( void )offm; \
-			( void ) n_edge; ( void )offn; \
-\
-			PASTEMAC(chp,set1ms_mxn) \
-			( \
-			  schema, \
-			  offm, \
-			  offn, \
-			  m_edge, \
-			  n_edge, \
-			  zero, \
-			  p, 1, ldp, ldp  \
-			); \
-		} \
-\
-		/* If panel_len < panel_len_max, then we zero those unused columns. */ \
-		if ( panel_len < panel_len_max ) \
-		{ \
-			ctype_p* restrict zero   = PASTEMAC(chp,0); \
-			const dim_t       offm   = 0; \
-			const dim_t       offn   = panel_len; \
-			const dim_t       m_edge = panel_dim_max; \
-			const dim_t       n_edge = panel_len_max - panel_len; \
-\
-			( void ) zero; \
-			( void ) m_edge; ( void )offm; \
-			( void ) n_edge; ( void )offn; \
-\
-			PASTEMAC(chp,set1ms_mxn) \
-			( \
-			  schema, \
-			  offm, \
-			  offn, \
-			  m_edge, \
-			  n_edge, \
-			  zero, \
-			  p, 1, ldp, ldp  \
-			); \
-		} \
-	} \
-	else \
-	{ \
-		/* Mixed-datatype packing should not occur for any other schemas. */ \
-		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
-	} \
-\
-\
-/*
-	if ( bli_is_col_packed( schema ) ) \
-	PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: bp copied", m_panel_max, n_panel_max, \
-	                      p, rs_p, cs_p, "%4.1f", "" ); \
-	else if ( bli_is_row_packed( schema ) ) \
-	PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: ap copied", m_panel_max, n_panel_max, \
-	                      p, rs_p, cs_p, "%4.1f", "" ); \
-*/ \
-}
-
-INSERT_GENTFUNC2_BASIC( packm_struc_cxk_md )
-INSERT_GENTFUNC2_MIX_DP( packm_struc_cxk_md )
-
-
-// -----------------------------------------------------------------------------
-
-#undef  GENTFUNC2
-#define GENTFUNC2( ctype_a, ctype_p, cha, chp, opname ) \
-\
-void PASTEMAC2(cha,chp,opname) \
-     ( \
-       conj_t   conja, \
-       dim_t    m, \
-       dim_t    n, \
-       ctype_p* kappa, \
-       ctype_a* a, inc_t inca, inc_t lda, \
-       ctype_p* p,             inc_t ldp  \
-     ) \
-{ \
-	const inc_t                    inca2    = 2 * inca; \
-	const inc_t                    lda2     = 2 * lda; \
-	const inc_t                    ldp2     = 2 * ldp; \
-\
-	PASTEMAC(chp,ctyper)* restrict kappa_r  = ( PASTEMAC(chp,ctyper)* )kappa; \
-	PASTEMAC(chp,ctyper)* restrict kappa_i  = ( PASTEMAC(chp,ctyper)* )kappa + 1; \
-	PASTEMAC(cha,ctyper)* restrict alpha1_r = ( PASTEMAC(cha,ctyper)* )a; \
-	PASTEMAC(cha,ctyper)* restrict alpha1_i = ( PASTEMAC(cha,ctyper)* )a + 1; \
-	PASTEMAC(chp,ctyper)* restrict pi1_r    = ( PASTEMAC(chp,ctyper)* )p; \
-	PASTEMAC(chp,ctyper)* restrict pi1_i    = ( PASTEMAC(chp,ctyper)* )p + ldp; \
-\
-	if ( PASTEMAC(chp,eq1)( *kappa ) ) \
-	{ \
-		if ( bli_is_conj( conja ) ) \
-		{ \
-			for ( dim_t k = n; k != 0; --k ) \
-			{ \
-				for ( dim_t i = 0; i < m; ++i ) \
-				{ \
-					PASTEMAC2(cha,chp,copyjris) \
-					( \
-					  *(alpha1_r + i*inca2), \
-					  *(alpha1_i + i*inca2), \
-					  *(pi1_r    + i*    1), \
-					  *(pi1_i    + i*    1)  \
-					); \
-				} \
-\
-				alpha1_r += lda2; \
-				alpha1_i += lda2; \
-				pi1_r    += ldp2; \
-				pi1_i    += ldp2; \
-			} \
-		} \
-		else \
-		{ \
-			for ( dim_t k = n; k != 0; --k ) \
-			{ \
-				for ( dim_t i = 0; i < m; ++i ) \
-				{ \
-					PASTEMAC2(cha,chp,copyris) \
-					( \
-					  *(alpha1_r + i*inca2), \
-					  *(alpha1_i + i*inca2), \
-					  *(pi1_r    + i*    1), \
-					  *(pi1_i    + i*    1)  \
-					); \
-				} \
-\
-				alpha1_r += lda2; \
-				alpha1_i += lda2; \
-				pi1_r    += ldp2; \
-				pi1_i    += ldp2; \
-			} \
-		} \
-	} \
-	else \
-	{ \
-		if ( bli_is_conj( conja ) ) \
-		{ \
-			for ( dim_t k = n; k != 0; --k ) \
-			{ \
-				for ( dim_t i = 0; i < m; ++i ) \
-				{ \
-					PASTEMAC3(chp,cha,chp,scal2jris) \
-					( \
-					  *kappa_r, \
-					  *kappa_i, \
-					  *(alpha1_r + i*inca2), \
-					  *(alpha1_i + i*inca2), \
-					  *(pi1_r    + i*    1), \
-					  *(pi1_i    + i*    1)  \
-					); \
-				} \
-\
-				alpha1_r += lda2; \
-				alpha1_i += lda2; \
-				pi1_r    += ldp2; \
-				pi1_i    += ldp2; \
-			} \
-		} \
-		else \
-		{ \
-			for ( dim_t k = n; k != 0; --k ) \
-			{ \
-				for ( dim_t i = 0; i < m; ++i ) \
-				{ \
-					PASTEMAC3(chp,cha,chp,scal2ris) \
-					( \
-					  *kappa_r, \
-					  *kappa_i, \
-					  *(alpha1_r + i*inca2), \
-					  *(alpha1_i + i*inca2), \
-					  *(pi1_r    + i*    1), \
-					  *(pi1_i    + i*    1)  \
-					); \
-				} \
-\
-				alpha1_r += lda2; \
-				alpha1_i += lda2; \
-				pi1_r    += ldp2; \
-				pi1_i    += ldp2; \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC2_BASIC( packm_cxk_1r_md )
-INSERT_GENTFUNC2_MIX_DP( packm_cxk_1r_md )
-
-// -----------------------------------------------------------------------------
-
-#undef  GENTFUNC2
-#define GENTFUNC2( ctype_a, ctype_p, cha, chp, opname ) \
-\
-void PASTEMAC2(cha,chp,opname) \
-     ( \
-       conj_t            conja, \
-       dim_t             m, \
-       dim_t             n, \
-       ctype_p* restrict kappa, \
-       ctype_a* restrict a, inc_t inca, inc_t lda, \
-       ctype_p* restrict p,             inc_t ldp  \
-     ) \
-{ \
-	const inc_t       inca1     = inca; \
-	const inc_t       lda1      = lda; \
-	const inc_t       ldp1      = ldp; \
-\
-	ctype_a* restrict alpha1_ri = ( ctype_a* )a; \
-	ctype_p* restrict pi1_ri    = ( ctype_p* )p; \
-	ctype_p* restrict pi1_ir    = ( ctype_p* )p + ldp1/2; \
-\
-	( void )inca1; \
-\
-	if ( PASTEMAC(chp,eq1)( *kappa ) ) \
-	{ \
-		if ( bli_is_conj( conja ) ) \
-		{ \
-			for ( dim_t k = n; k != 0; --k ) \
-			{ \
-				for ( dim_t i = 0; i < m; ++i ) \
-				{ \
-					PASTEMAC2(cha,chp,copyj1es) \
-					( \
-					  *(alpha1_ri + i*inca1), \
-					  *(pi1_ri    + i*    1), \
-					  *(pi1_ir    + i*    1)  \
-					); \
-				} \
-\
-				alpha1_ri += lda1; \
-				pi1_ri    += ldp1; \
-				pi1_ir    += ldp1; \
-			} \
-		} \
-		else \
-		{ \
-			for ( dim_t k = n; k != 0; --k ) \
-			{ \
-				for ( dim_t i = 0; i < m; ++i ) \
-				{ \
-					PASTEMAC2(cha,chp,copy1es) \
-					( \
-					  *(alpha1_ri + i*inca1), \
-					  *(pi1_ri    + i*    1), \
-					  *(pi1_ir    + i*    1)  \
-					); \
-				} \
-\
-				alpha1_ri += lda1; \
-				pi1_ri    += ldp1; \
-				pi1_ir    += ldp1; \
-			} \
-		} \
-	} \
-	else \
-	{ \
-		if ( bli_is_conj( conja ) ) \
-		{ \
-			for ( dim_t k = n; k != 0; --k ) \
-			{ \
-				for ( dim_t i = 0; i < m; ++i ) \
-				{ \
-					PASTEMAC3(chp,cha,chp,scal2j1es) \
-					( \
-					  *kappa, \
-					  *(alpha1_ri + i*inca1), \
-					  *(pi1_ri    + i*    1), \
-					  *(pi1_ir    + i*    1)  \
-					); \
-				} \
-\
-				alpha1_ri += lda1; \
-				pi1_ri    += ldp1; \
-				pi1_ir    += ldp1; \
-			} \
-		} \
-		else \
-		{ \
-			for ( dim_t k = n; k != 0; --k ) \
-			{ \
-				for ( dim_t i = 0; i < m; ++i ) \
-				{ \
-					PASTEMAC3(chp,cha,chp,scal21es) \
-					( \
-					  *kappa, \
-					  *(alpha1_ri + i*inca1), \
-					  *(pi1_ri    + i*    1), \
-					  *(pi1_ir    + i*    1)  \
-					); \
-				} \
-\
-				alpha1_ri += lda1; \
-				pi1_ri    += ldp1; \
-				pi1_ir    += ldp1; \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC2_BASIC( packm_cxk_1e_md )
-INSERT_GENTFUNC2_MIX_DP( packm_cxk_1e_md )
-
-#endif
diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.h b/frame/1m/packm/bli_packm_struc_cxk_md.h
deleted file mode 100644
index f92126eb4..000000000
--- a/frame/1m/packm/bli_packm_struc_cxk_md.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#undef  GENTPROT2
-#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
-\
-void PASTEMAC2(chc,chp,varname) \
-     ( \
-       struc_t  strucc, \
-       diag_t   diagc, \
-       uplo_t   uploc, \
-       conj_t   conjc, \
-       pack_t   schema, \
-       bool     invdiag, \
-       dim_t    panel_dim, \
-       dim_t    panel_len, \
-       dim_t    panel_dim_max, \
-       dim_t    panel_len_max, \
-       dim_t    panel_dim_off, \
-       dim_t    panel_len_off, \
-       ctype_p* kappa, \
-       ctype_c* c, inc_t incc, inc_t ldc, \
-       ctype_p* p,             inc_t ldp, \
-                   inc_t is_p, \
-       void*    params, \
-       cntx_t*  cntx  \
-     );
-
-INSERT_GENTPROT2_BASIC( packm_struc_cxk_md )
-INSERT_GENTPROT2_MIX_DP( packm_struc_cxk_md )
-
-
-#undef  GENTPROT2
-#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
-\
-void PASTEMAC2(cha,chp,opname) \
-     ( \
-       conj_t   conja, \
-       dim_t    m, \
-       dim_t    n, \
-       ctype_p* kappa, \
-       ctype_a* a, inc_t inca, inc_t lda, \
-       ctype_p* p,             inc_t ldp  \
-     );
-
-INSERT_GENTPROT2_BASIC( packm_cxk_1e_md )
-INSERT_GENTPROT2_MIX_DP( packm_cxk_1e_md )
-
-INSERT_GENTPROT2_BASIC( packm_cxk_1r_md )
-INSERT_GENTPROT2_MIX_DP( packm_cxk_1r_md )
-
diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c
index 9cbf88b1a..30e94023c 100644
--- a/frame/1m/unpackm/bli_unpackm_blk_var1.c
+++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c
@@ -186,7 +186,7 @@ void PASTEMAC(ch,varname) \
 	if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
 	{ \
 		/* Prepare to unpack from column panels. */ \
-		schema        = BLIS_PACKED_COL_PANELS; \
+		schema        = BLIS_PACKED_PANELS; \
 		iter_dim      = n; \
 		panel_len     = m; \
 		panel_dim_max = pd_p; \
@@ -201,7 +201,7 @@ void PASTEMAC(ch,varname) \
 	else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
 	{ \
 		/* Prepare to unpack from row panels. */ \
-		schema        = BLIS_PACKED_ROW_PANELS; \
+		schema        = BLIS_PACKED_PANELS; \
 		iter_dim      = m; \
 		panel_len     = n; \
 		panel_dim_max = pd_p; \
@@ -214,13 +214,12 @@ void PASTEMAC(ch,varname) \
 		n_panel_full  = &n; \
 	} \
 \
-	num_t dt     = PASTEMAC(ch,type); \
-	ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER \
-	                                           : BLIS_UNPACKM_MRXK_KER; \
+	num_t  dt    = PASTEMAC(ch,type); \
+	ukr_t ker_id = BLIS_UNPACKM_KER; \
 \
 	/* Query the context for the unpackm kernel corresponding to the current
 	   panel dimension, or kernel id. */ \
-	PASTECH(unpackm_cxk,_ker_ft) f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
+	unpackm_cxk_ker_ft f = bli_cntx_get_ukr2_dt( dt, dt, ker_id, cntx ); \
 \
 	/* Compute the total number of iterations we'll need. */ \
 	num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
@@ -249,7 +248,7 @@ void PASTEMAC(ch,varname) \
 		if ( bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) && \
 		     bli_is_upper_or_lower( uploc ) ) \
 		{ \
-			PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
+			PASTEMAC(ch,scal2m,BLIS_TAPI_EX_SUF) \
 			( \
 			  diagoffc_i, \
 			  diagc, \
@@ -272,11 +271,13 @@ void PASTEMAC(ch,varname) \
 			  BLIS_NO_CONJUGATE, \
 			  schema, \
 			  panel_dim_i, \
+			  1, \
 			  panel_len, \
 			  one, \
 			  p_begin,       ldp, \
 			  c_begin, incc, ldc, \
-			  ( cntx_t* )cntx  \
+			  NULL, \
+			  cntx \
 			); \
 		} \
 \
diff --git a/frame/1m/unpackm/bli_unpackm_cntl.c b/frame/1m/unpackm/bli_unpackm_cntl.c
index e33e3b151..11badea11 100644
--- a/frame/1m/unpackm/bli_unpackm_cntl.c
+++ b/frame/1m/unpackm/bli_unpackm_cntl.c
@@ -35,43 +35,28 @@
 
 #include "blis.h"
 
-cntl_t* bli_unpackm_cntl_create_node
+void bli_unpackm_cntl_init_node
      (
-       pool_t* pool,
-       void_fp var_func,
-       void_fp unpackm_var_func,
-       cntl_t* sub_node
+       void_fp         var_func,
+       void_fp         unpackm_var_func,
+       unpackm_cntl_t* cntl
      )
 {
-	cntl_t*           cntl;
-	unpackm_params_t* params;
-	err_t             r_val;
-
 	// NOTE: If this function is ever called, figure out whether the
 	// bli_malloc_intl() below needs to be changed to bli_sba_acquire().
 	bli_abort();
 
-	// Allocate an unpackm_params_t struct.
-	params = bli_malloc_intl( sizeof( unpackm_params_t ), &r_val );
-
-	// Initialize the unpackm_params_t struct.
-	params->size      = sizeof( unpackm_params_t );
-	params->var_func  = unpackm_var_func;
+	// Initialize the unpackm_cntl_t struct.
+	cntl->var_func = unpackm_var_func;
 
 	// It's important that we set the bszid field to BLIS_NO_PART to indicate
 	// that no blocksize partitioning is performed. bli_cntl_free() will rely
 	// on this information to know how to step through the thrinfo_t tree in
 	// sync with the cntl_t tree.
-	cntl = bli_cntl_create_node
+	bli_cntl_init_node
 	(
-	  pool,
-	  BLIS_NOID,
-	  BLIS_NO_PART,
 	  var_func,
-	  params,
-	  sub_node
+	  &cntl->cntl
 	);
-
-	return cntl;
 }
 
diff --git a/frame/1m/unpackm/bli_unpackm_cntl.h b/frame/1m/unpackm/bli_unpackm_cntl.h
index 075800d0a..ff1a4e1b4 100644
--- a/frame/1m/unpackm/bli_unpackm_cntl.h
+++ b/frame/1m/unpackm/bli_unpackm_cntl.h
@@ -33,24 +33,23 @@
 
 */
 
-struct unpackm_params_s
+struct unpackm_cntl_s
 {
-	uint64_t        size; // size field must be present and come first.
+	cntl_t          cntl; // cntl field must be present and come first.
 	unpackm_var_oft var_func;
 };
-typedef struct unpackm_params_s unpackm_params_t;
+typedef struct unpackm_cntl_s unpackm_cntl_t;
 
 #define bli_cntl_unpackm_params_var_func( cntl ) \
 \
-	( ( (unpackm_params_t*)(cntl)->params )->var_func )
+	( ( (const unpackm_cntl_t*) cntl )->var_func )
 
 // -----------------------------------------------------------------------------
 
-cntl_t* bli_unpackm_cntl_create_node
+void bli_unpackm_cntl_init_node
      (
-       pool_t* pool,
-       void_fp var_func,
-       void_fp unpackm_var_func,
-       cntl_t* sub_node
+       void_fp         var_func,
+       void_fp         unpackm_var_func,
+       unpackm_cntl_t* cntl
      );
 
diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c
index 2ced9a1a2..a4b484267 100644
--- a/frame/1m/unpackm/bli_unpackm_int.c
+++ b/frame/1m/unpackm/bli_unpackm_int.c
@@ -36,12 +36,11 @@
 
 void bli_unpackm_int
      (
-       const obj_t*  p,
-       const obj_t*  a,
-       const cntx_t* cntx,
-       const rntm_t* rntm,
-       const cntl_t* cntl,
-       const thrinfo_t* thread
+       const obj_t*     a,
+             obj_t*     p,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
      )
 {
 	bli_init_once();
@@ -57,21 +56,23 @@ void bli_unpackm_int
 	// necessary, so we return.
 	if ( bli_obj_is_alias_of( p, a ) ) return;
 
+	// Barrier so that we know threads are done with previous computation
+	// with the same packing buffer before starting to unpack.
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
+	bli_thrinfo_barrier( thread );
+
 	// Extract the function pointer from the current control tree node.
 	f = bli_cntl_unpackm_params_var_func( cntl );
 
 	// Invoke the variant.
-	if ( bli_thrinfo_am_chief( thread ) )
-	{
-		f
-		(
-		  p,
-		  a,
-		  cntx,
-		  cntl,
-		  thread
-		);
-	}
+	f
+	(
+	  p,
+	  a,
+	  cntx,
+	  cntl,
+	  thread
+	);
 
 	// Barrier so that unpacking is done before computation.
 	bli_thrinfo_barrier( thread );
diff --git a/frame/1m/unpackm/bli_unpackm_int.h b/frame/1m/unpackm/bli_unpackm_int.h
index fc2c3e66d..0b7797323 100644
--- a/frame/1m/unpackm/bli_unpackm_int.h
+++ b/frame/1m/unpackm/bli_unpackm_int.h
@@ -34,11 +34,10 @@
 
 void bli_unpackm_int
      (
-       const obj_t*  p,
-       const obj_t*  a,
-       const cntx_t* cntx,
-       const rntm_t* rntm,
-       const cntl_t* cntl,
-       const thrinfo_t* thread
+       const obj_t*     a,
+             obj_t*     p,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
      );
 
diff --git a/frame/2/bli_l2_fpa.c b/frame/2/bli_l2_fpa.c
index 76ebccddc..223f912eb 100644
--- a/frame/2/bli_l2_fpa.c
+++ b/frame/2/bli_l2_fpa.c
@@ -41,13 +41,13 @@
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \
+GENARRAY_FPA( PASTECH(opname,BLIS_TAPI_EX_SUF,_vft), \
               PASTECH(opname,BLIS_TAPI_EX_SUF) ); \
 \
-PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
-PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
+PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) \
+PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
 { \
-	return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
+	return PASTECH(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
 }
 
 GENFRONT( gemv )
@@ -68,10 +68,10 @@ GENFRONT( trsv )
 #undef  GENFRONT
 #define GENFRONT( opname, varname ) \
 \
-GENARRAY_FPA( PASTECH2(opname,_unb,_vft), \
+GENARRAY_FPA( PASTECH(opname,_unb,_vft), \
               varname ); \
 \
-PASTECH2(opname,_unb,_vft) \
+PASTECH(opname,_unb,_vft) \
 PASTEMAC(varname,_qfp)( num_t dt ) \
 { \
 	return PASTECH(varname,_fpa)[ dt ]; \
diff --git a/frame/2/bli_l2_fpa.h b/frame/2/bli_l2_fpa.h
index 414bd82c2..9ca9bb89f 100644
--- a/frame/2/bli_l2_fpa.h
+++ b/frame/2/bli_l2_fpa.h
@@ -39,8 +39,8 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
-PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );
+PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) \
+PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );
 
 GENPROT( gemv )
 GENPROT( ger )
@@ -60,7 +60,7 @@ GENPROT( trsv )
 #undef  GENPROT
 #define GENPROT( opname, varname ) \
 \
-PASTECH2(opname,_unb,_vft) \
+PASTECH(opname,_unb,_vft) \
 PASTEMAC(varname,_qfp)( num_t dt );
 
 GENPROT( gemv, gemv_unb_var1 )
diff --git a/frame/2/bli_l2_ft.h b/frame/2/bli_l2_ft.h
index 8c48e2bed..94ca133fb 100644
--- a/frame/2/bli_l2_ft.h
+++ b/frame/2/bli_l2_ft.h
@@ -42,7 +42,7 @@
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              trans_t transa, \
              conj_t  conjx, \
@@ -63,7 +63,7 @@ INSERT_GENTDEF( gemv )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              conj_t conjx, \
              conj_t conjy, \
@@ -83,7 +83,7 @@ INSERT_GENTDEF( ger )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              uplo_t uploa, \
              conj_t conja, \
@@ -105,7 +105,7 @@ INSERT_GENTDEF( symv )
 #undef  GENTDEFR
 #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              uplo_t   uploa, \
              conj_t   conjx, \
@@ -123,7 +123,7 @@ INSERT_GENTDEFR( her )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              uplo_t uploa, \
              conj_t conjx, \
@@ -141,7 +141,7 @@ INSERT_GENTDEF( syr )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              uplo_t uploa, \
              conj_t conjx, \
@@ -162,7 +162,7 @@ INSERT_GENTDEF( syr2 )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              uplo_t  uploa, \
              trans_t transa, \
diff --git a/frame/2/bli_l2_ft_unb.h b/frame/2/bli_l2_ft_unb.h
index 0cbd7ee29..39ef25ec8 100644
--- a/frame/2/bli_l2_ft_unb.h
+++ b/frame/2/bli_l2_ft_unb.h
@@ -45,7 +45,7 @@
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \
+typedef void (*PASTECH(ch,opname,_unb,tsuf)) \
      ( \
        trans_t transa, \
        conj_t  conjx, \
@@ -66,7 +66,7 @@ INSERT_GENTDEF( gemv )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \
+typedef void (*PASTECH(ch,opname,_unb,tsuf)) \
      ( \
        conj_t  conjx, \
        conj_t  conjy, \
@@ -86,7 +86,7 @@ INSERT_GENTDEF( ger )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \
+typedef void (*PASTECH(ch,opname,_unb,tsuf)) \
      ( \
        uplo_t  uploa, \
        conj_t  conja, \
@@ -108,7 +108,7 @@ INSERT_GENTDEF( hemv )
 #undef  GENTDEFR
 #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \
+typedef void (*PASTECH(ch,opname,_unb,tsuf)) \
      ( \
        uplo_t   uploa, \
        conj_t   conjx, \
@@ -127,7 +127,7 @@ INSERT_GENTDEFR( her )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \
+typedef void (*PASTECH(ch,opname,_unb,tsuf)) \
      ( \
        uplo_t  uploa, \
        conj_t  conjx, \
@@ -148,7 +148,7 @@ INSERT_GENTDEF( her2 )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \
+typedef void (*PASTECH(ch,opname,_unb,tsuf)) \
      ( \
        uplo_t  uploa, \
        trans_t transa, \
diff --git a/frame/2/bli_l2_oapi.c b/frame/2/bli_l2_oapi.c
index 2eac6394c..3860d06ad 100644
--- a/frame/2/bli_l2_oapi.c
+++ b/frame/2/bli_l2_oapi.c
@@ -91,8 +91,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -158,8 +158,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -230,8 +230,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -294,8 +294,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -359,8 +359,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -423,8 +423,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
diff --git a/frame/2/bli_l2_tapi.c b/frame/2/bli_l2_tapi.c
index 3366c26f9..f6f2a035d 100644
--- a/frame/2/bli_l2_tapi.c
+++ b/frame/2/bli_l2_tapi.c
@@ -43,7 +43,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, ftname, rvarname, cvarname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              trans_t transa, \
              conj_t  conjx, \
@@ -76,7 +76,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	   return early. */ \
 	if ( bli_zero_dim1( n_x ) || PASTEMAC(ch,eq0)( *alpha ) ) \
 	{ \
-		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m_y, \
@@ -89,7 +89,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	} \
 \
 	/* Declare a void function pointer for the current operation. */ \
-	PASTECH2(ch,ftname,_unb_ft) f; \
+	PASTECH(ch,ftname,_unb_ft) f; \
 \
 	/* Choose the underlying implementation. */ \
 	if ( bli_does_notrans( transa ) ) \
@@ -126,7 +126,7 @@ INSERT_GENTFUNC_BASIC( gemv, gemv, gemv_unf_var1, gemv_unf_var2 )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, ftname, rvarname, cvarname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjx, \
              conj_t conjy, \
@@ -150,7 +150,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Declare a void function pointer for the current operation. */ \
-	PASTECH2(ch,ftname,_unb_ft) f; \
+	PASTECH(ch,ftname,_unb_ft) f; \
 \
 	/* Choose the underlying implementation. */ \
 	if ( bli_is_row_stored( rs_a, cs_a ) ) f = PASTEMAC(ch,rvarname); \
@@ -178,7 +178,7 @@ INSERT_GENTFUNC_BASIC( ger, ger, ger_unb_var1, ger_unb_var2 )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, ftname, conjh, rvarname, cvarname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              uplo_t uploa, \
              conj_t conja, \
@@ -203,7 +203,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	   return early. */ \
 	if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) \
 	{ \
-		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
@@ -216,7 +216,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	} \
 \
 	/* Declare a void function pointer for the current operation. */ \
-	PASTECH2(ch,ftname,_unb_ft) f; \
+	PASTECH(ch,ftname,_unb_ft) f; \
 \
 	/* Choose the underlying implementation. */ \
 	if ( bli_is_lower( uploa ) ) \
@@ -255,7 +255,7 @@ INSERT_GENTFUNC_BASIC( symv, hemv, BLIS_NO_CONJUGATE, hemv_unf_var1, hemv_unf_va
 #undef  GENTFUNCR
 #define GENTFUNCR( ctype, ctype_r, ch, chr, opname, ftname, conjh, rvarname, cvarname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              uplo_t   uploa, \
              conj_t   conjx, \
@@ -278,13 +278,13 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* Make a local copy of alpha, cast into the complex domain. This
 	   allows us to use the same underlying her variants to implement
 	   both her and syr operations. */ \
-	PASTEMAC2(chr,ch,copys)( *alpha, alpha_local ); \
+	PASTEMAC(chr,ch,copys)( *alpha, alpha_local ); \
 \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Declare a void function pointer for the current operation. */ \
-	PASTECH2(ch,ftname,_unb_ft) f; \
+	PASTECH(ch,ftname,_unb_ft) f; \
 \
 	/* Choose the underlying implementation. */ \
 	if ( bli_is_lower( uploa ) ) \
@@ -319,7 +319,7 @@ INSERT_GENTFUNCR_BASIC( her, her, BLIS_CONJUGATE, her_unb_var1, her_unb_var2 )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, ftname, conjh, rvarname, cvarname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              uplo_t uploa, \
              conj_t conjx, \
@@ -341,7 +341,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Declare a void function pointer for the current operation. */ \
-	PASTECH2(ch,ftname,_unb_ft) f; \
+	PASTECH(ch,ftname,_unb_ft) f; \
 \
 	/* Choose the underlying implementation. */ \
 	if ( bli_is_lower( uploa ) ) \
@@ -376,7 +376,7 @@ INSERT_GENTFUNC_BASIC( syr, her, BLIS_NO_CONJUGATE, her_unb_var1, her_unb_var2 )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, ftname, conjh, rvarname, cvarname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              uplo_t uploa, \
              conj_t conjx, \
@@ -400,7 +400,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* Declare a void function pointer for the current operation. */ \
-	PASTECH2(ch,ftname,_unb_ft) f; \
+	PASTECH(ch,ftname,_unb_ft) f; \
 \
 	/* Choose the underlying implementation. */ \
 	if ( bli_is_lower( uploa ) ) \
@@ -438,7 +438,7 @@ INSERT_GENTFUNC_BASIC( syr2, her2, BLIS_NO_CONJUGATE, her2_unf_var1, her2_unf_va
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, ftname, rvarname, cvarname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              uplo_t  uploa, \
              trans_t transa, \
@@ -463,7 +463,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	/* If alpha is zero, set x to zero and return early. */ \
 	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
 	{ \
-		PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
@@ -476,7 +476,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	} \
 \
 	/* Declare a void function pointer for the current operation. */ \
-	PASTECH2(ch,ftname,_unb_ft) f; \
+	PASTECH(ch,ftname,_unb_ft) f; \
 \
 	/* Choose the underlying implementation. */ \
 	if ( bli_does_notrans( transa ) ) \
diff --git a/frame/2/bli_l2_tapi.h b/frame/2/bli_l2_tapi.h
index 207d0de84..a263360e7 100644
--- a/frame/2/bli_l2_tapi.h
+++ b/frame/2/bli_l2_tapi.h
@@ -40,7 +40,7 @@
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              trans_t transa, \
              conj_t  conjx, \
@@ -60,7 +60,7 @@ INSERT_GENTPROT_BASIC( gemv )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              conj_t conjx, \
              conj_t conjy, \
@@ -79,7 +79,7 @@ INSERT_GENTPROT_BASIC( ger )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              uplo_t uploa, \
              conj_t conja, \
@@ -100,7 +100,7 @@ INSERT_GENTPROT_BASIC( symv )
 #undef  GENTPROTR
 #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              uplo_t   uploa, \
              conj_t   conjx, \
@@ -117,7 +117,7 @@ INSERT_GENTPROTR_BASIC( her )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              uplo_t uploa, \
              conj_t conjx, \
@@ -134,7 +134,7 @@ INSERT_GENTPROT_BASIC( syr )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              uplo_t uploa, \
              conj_t conjx, \
@@ -154,7 +154,7 @@ INSERT_GENTPROT_BASIC( syr2 )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              uplo_t  uploa, \
              trans_t transa, \
diff --git a/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c b/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c
index 6d2e2b852..b80916adc 100644
--- a/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c
+++ b/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c
@@ -72,7 +72,7 @@ void PASTEMAC(ch,varname) \
 	/* y = beta * y; */ \
 	/* NOTE: We don't explicitly handle the case where beta == 0 here
 	   since that behavior is handled within the scalv kernel itself. */ \
-	PASTEMAC2(ch,scalv,scalvsuf) \
+	PASTEMAC(ch,scalv,scalvsuf) \
 	( \
 	  BLIS_NO_CONJUGATE, \
 	  n_elem, \
@@ -99,7 +99,7 @@ void PASTEMAC(ch,varname) \
 \
 		/* y = y + alpha * A1 * x1; */ \
 		/*kfp_af*/ \
-		PASTEMAC2(ch,axpyf,axpyfsuf) \
+		PASTEMAC(ch,axpyf,axpyfsuf) \
 		( \
 		  conja, \
 		  conjx, \
@@ -161,7 +161,7 @@ void PASTEMAC(ch,varname) \
 	if ( PASTEMAC(ch,eq0)( *beta ) ) \
 	{ \
 		/* y = 0; */ \
-		PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  n_elem, \
@@ -174,7 +174,7 @@ void PASTEMAC(ch,varname) \
 	else \
 	{ \
 		/* y = beta * y; */ \
-		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  n_elem, \
diff --git a/frame/2/gemv/bli_gemv_unb_var2.c b/frame/2/gemv/bli_gemv_unb_var2.c
index 16ec68278..f40bb2dab 100644
--- a/frame/2/gemv/bli_gemv_unb_var2.c
+++ b/frame/2/gemv/bli_gemv_unb_var2.c
@@ -73,7 +73,7 @@ void PASTEMAC(ch,varname) \
 	if ( PASTEMAC(ch,eq0)( *beta ) ) \
 	{ \
 		/* y = 0; */ \
-		PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  n_elem, \
@@ -86,7 +86,7 @@ void PASTEMAC(ch,varname) \
 	else \
 	{ \
 		/* y = beta * y; */ \
-		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  n_elem, \
diff --git a/frame/2/gemv/bli_gemv_unf_var2.c b/frame/2/gemv/bli_gemv_unf_var2.c
index e73f062e5..a89017116 100644
--- a/frame/2/gemv/bli_gemv_unf_var2.c
+++ b/frame/2/gemv/bli_gemv_unf_var2.c
@@ -73,7 +73,7 @@ void PASTEMAC(ch,varname) \
 	if ( PASTEMAC(ch,eq0)( *beta ) ) \
 	{ \
 		/* y = 0; */ \
-		PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  n_elem, \
@@ -86,7 +86,7 @@ void PASTEMAC(ch,varname) \
 	else \
 	{ \
 		/* y = beta * y; */ \
-		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  n_elem, \
diff --git a/frame/2/gemv/bli_gemv_var.h b/frame/2/gemv/bli_gemv_var.h
index f06cdac75..2006c50a4 100644
--- a/frame/2/gemv/bli_gemv_var.h
+++ b/frame/2/gemv/bli_gemv_var.h
@@ -40,7 +40,7 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        obj_t*  alpha, \
        obj_t*  a, \
diff --git a/frame/2/gemv/bli_gemv_var_oapi.c b/frame/2/gemv/bli_gemv_var_oapi.c
index 865773534..319aec246 100644
--- a/frame/2/gemv/bli_gemv_var_oapi.c
+++ b/frame/2/gemv/bli_gemv_var_oapi.c
@@ -37,7 +37,7 @@
 #undef  GENFRONT
 #define GENFRONT( opname, varname ) \
 \
-void PASTEMAC0(varname) \
+void PASTEMAC(varname) \
      ( \
        obj_t*  alpha, \
        obj_t*  a, \
@@ -73,7 +73,7 @@ void PASTEMAC0(varname) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,_unb,_vft) f = \
+	PASTECH(opname,_unb,_vft) f = \
 	PASTEMAC(varname,_qfp)( dt ); \
 \
 	f \
diff --git a/frame/2/ger/bli_ger_var.h b/frame/2/ger/bli_ger_var.h
index d1648aa88..7311e98d0 100644
--- a/frame/2/ger/bli_ger_var.h
+++ b/frame/2/ger/bli_ger_var.h
@@ -40,7 +40,7 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        obj_t*  alpha, \
        obj_t*  x, \
diff --git a/frame/2/ger/bli_ger_var_oapi.c b/frame/2/ger/bli_ger_var_oapi.c
index f125efdf8..e30804fe1 100644
--- a/frame/2/ger/bli_ger_var_oapi.c
+++ b/frame/2/ger/bli_ger_var_oapi.c
@@ -37,7 +37,7 @@
 #undef  GENFRONT
 #define GENFRONT( opname, varname ) \
 \
-void PASTEMAC0(varname) \
+void PASTEMAC(varname) \
      ( \
        obj_t*  alpha, \
        obj_t*  x, \
@@ -71,7 +71,7 @@ void PASTEMAC0(varname) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,_unb,_vft) f = \
+	PASTECH(opname,_unb,_vft) f = \
 	PASTEMAC(varname,_qfp)( dt ); \
 \
 	f \
diff --git a/frame/2/hemv/bli_hemv_unb_var1.c b/frame/2/hemv/bli_hemv_unb_var1.c
index 7e622f09d..eeffc4292 100644
--- a/frame/2/hemv/bli_hemv_unb_var1.c
+++ b/frame/2/hemv/bli_hemv_unb_var1.c
@@ -94,7 +94,7 @@ void PASTEMAC(ch,varname) \
 	if ( PASTEMAC(ch,eq0)( *beta ) ) \
 	{ \
 		/* y = 0; */ \
-		PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
@@ -107,7 +107,7 @@ void PASTEMAC(ch,varname) \
 	else \
 	{ \
 		/* y = beta * y; */ \
-		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
diff --git a/frame/2/hemv/bli_hemv_unb_var2.c b/frame/2/hemv/bli_hemv_unb_var2.c
index 91a5b6ef5..07de60dcc 100644
--- a/frame/2/hemv/bli_hemv_unb_var2.c
+++ b/frame/2/hemv/bli_hemv_unb_var2.c
@@ -96,7 +96,7 @@ void PASTEMAC(ch,varname) \
 	if ( PASTEMAC(ch,eq0)( *beta ) ) \
 	{ \
 		/* y = 0; */ \
-		PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
@@ -109,7 +109,7 @@ void PASTEMAC(ch,varname) \
 	else \
 	{ \
 		/* y = beta * y; */ \
-		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
diff --git a/frame/2/hemv/bli_hemv_unb_var3.c b/frame/2/hemv/bli_hemv_unb_var3.c
index 0ea2605ea..1edd78f82 100644
--- a/frame/2/hemv/bli_hemv_unb_var3.c
+++ b/frame/2/hemv/bli_hemv_unb_var3.c
@@ -94,7 +94,7 @@ void PASTEMAC(ch,varname) \
 	if ( PASTEMAC(ch,eq0)( *beta ) ) \
 	{ \
 		/* y = 0; */ \
-		PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
@@ -107,7 +107,7 @@ void PASTEMAC(ch,varname) \
 	else \
 	{ \
 		/* y = beta * y; */ \
-		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
diff --git a/frame/2/hemv/bli_hemv_unb_var4.c b/frame/2/hemv/bli_hemv_unb_var4.c
index 56b252ec2..704299ab1 100644
--- a/frame/2/hemv/bli_hemv_unb_var4.c
+++ b/frame/2/hemv/bli_hemv_unb_var4.c
@@ -95,7 +95,7 @@ void PASTEMAC(ch,varname) \
 	if ( PASTEMAC(ch,eq0)( *beta ) ) \
 	{ \
 		/* y = 0; */ \
-		PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
@@ -108,7 +108,7 @@ void PASTEMAC(ch,varname) \
 	else \
 	{ \
 		/* y = beta * y; */ \
-		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
diff --git a/frame/2/hemv/bli_hemv_unf_var1.c b/frame/2/hemv/bli_hemv_unf_var1.c
index ca9f72fe7..bb96d9ae5 100644
--- a/frame/2/hemv/bli_hemv_unf_var1.c
+++ b/frame/2/hemv/bli_hemv_unf_var1.c
@@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \
 	if ( PASTEMAC(ch,eq0)( *beta ) ) \
 	{ \
 		/* y = 0; */ \
-		PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
@@ -116,7 +116,7 @@ void PASTEMAC(ch,varname) \
 	else \
 	{ \
 		/* y = beta * y; */ \
-		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
diff --git a/frame/2/hemv/bli_hemv_unf_var1a.c b/frame/2/hemv/bli_hemv_unf_var1a.c
index 44d0e3794..f20a6de84 100644
--- a/frame/2/hemv/bli_hemv_unf_var1a.c
+++ b/frame/2/hemv/bli_hemv_unf_var1a.c
@@ -94,7 +94,7 @@ void PASTEMAC(ch,varname) \
 	if ( PASTEMAC(ch,eq0)( *beta ) ) \
 	{ \
 		/* y = 0; */ \
-		PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
@@ -107,7 +107,7 @@ void PASTEMAC(ch,varname) \
 	else \
 	{ \
 		/* y = beta * y; */ \
-		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
diff --git a/frame/2/hemv/bli_hemv_unf_var3.c b/frame/2/hemv/bli_hemv_unf_var3.c
index 16f45f051..ef25a3562 100644
--- a/frame/2/hemv/bli_hemv_unf_var3.c
+++ b/frame/2/hemv/bli_hemv_unf_var3.c
@@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \
 	if ( PASTEMAC(ch,eq0)( *beta ) ) \
 	{ \
 		/* y = 0; */ \
-		PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
@@ -116,7 +116,7 @@ void PASTEMAC(ch,varname) \
 	else \
 	{ \
 		/* y = beta * y; */ \
-		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
diff --git a/frame/2/hemv/bli_hemv_unf_var3a.c b/frame/2/hemv/bli_hemv_unf_var3a.c
index c54485d65..3501a9ac7 100644
--- a/frame/2/hemv/bli_hemv_unf_var3a.c
+++ b/frame/2/hemv/bli_hemv_unf_var3a.c
@@ -94,7 +94,7 @@ void PASTEMAC(ch,varname) \
 	if ( PASTEMAC(ch,eq0)( *beta ) ) \
 	{ \
 		/* y = 0; */ \
-		PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
@@ -107,7 +107,7 @@ void PASTEMAC(ch,varname) \
 	else \
 	{ \
 		/* y = beta * y; */ \
-		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m, \
diff --git a/frame/2/hemv/bli_hemv_var.h b/frame/2/hemv/bli_hemv_var.h
index 1cf50cadf..8054c8354 100644
--- a/frame/2/hemv/bli_hemv_var.h
+++ b/frame/2/hemv/bli_hemv_var.h
@@ -40,7 +40,7 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        conj_t  conjh, \
        obj_t*  alpha, \
diff --git a/frame/2/hemv/bli_hemv_var_oapi.c b/frame/2/hemv/bli_hemv_var_oapi.c
index bf0e4b202..b35aec5f4 100644
--- a/frame/2/hemv/bli_hemv_var_oapi.c
+++ b/frame/2/hemv/bli_hemv_var_oapi.c
@@ -37,7 +37,7 @@
 #undef  GENFRONT
 #define GENFRONT( opname, varname ) \
 \
-void PASTEMAC0(varname) \
+void PASTEMAC(varname) \
      ( \
        conj_t  conjh, \
        obj_t*  alpha, \
@@ -74,7 +74,7 @@ void PASTEMAC0(varname) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,_unb,_vft) f = \
+	PASTECH(opname,_unb,_vft) f = \
 	PASTEMAC(varname,_qfp)( dt ); \
 \
 	f \
diff --git a/frame/2/her/bli_her_var.h b/frame/2/her/bli_her_var.h
index 4e22cb324..b9b810e94 100644
--- a/frame/2/her/bli_her_var.h
+++ b/frame/2/her/bli_her_var.h
@@ -40,7 +40,7 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        conj_t  conjh, \
        obj_t*  alpha, \
diff --git a/frame/2/her/bli_her_var_oapi.c b/frame/2/her/bli_her_var_oapi.c
index 44c6d090d..366069408 100644
--- a/frame/2/her/bli_her_var_oapi.c
+++ b/frame/2/her/bli_her_var_oapi.c
@@ -37,7 +37,7 @@
 #undef  GENFRONT
 #define GENFRONT( opname, varname ) \
 \
-void PASTEMAC0(varname) \
+void PASTEMAC(varname) \
      ( \
        conj_t  conjh, \
        obj_t*  alpha, \
@@ -67,7 +67,7 @@ void PASTEMAC0(varname) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,_unb,_vft) f = \
+	PASTECH(opname,_unb,_vft) f = \
 	PASTEMAC(varname,_qfp)( dt ); \
 \
 	f \
diff --git a/frame/2/her2/bli_her2_var.h b/frame/2/her2/bli_her2_var.h
index f4a8b902e..52862940b 100644
--- a/frame/2/her2/bli_her2_var.h
+++ b/frame/2/her2/bli_her2_var.h
@@ -40,7 +40,7 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        conj_t  conjh, \
        obj_t*  alpha, \
diff --git a/frame/2/her2/bli_her2_var_oapi.c b/frame/2/her2/bli_her2_var_oapi.c
index dce87a1cd..75217c770 100644
--- a/frame/2/her2/bli_her2_var_oapi.c
+++ b/frame/2/her2/bli_her2_var_oapi.c
@@ -37,7 +37,7 @@
 #undef  GENFRONT
 #define GENFRONT( opname, varname ) \
 \
-void PASTEMAC0(varname) \
+void PASTEMAC(varname) \
      ( \
        conj_t  conjh, \
        obj_t*  alpha, \
@@ -73,7 +73,7 @@ void PASTEMAC0(varname) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,_unb,_vft) f = \
+	PASTECH(opname,_unb,_vft) f = \
 	PASTEMAC(varname,_qfp)( dt ); \
 \
 	f \
diff --git a/frame/2/trmv/bli_trmv_var.h b/frame/2/trmv/bli_trmv_var.h
index 2042f0f7b..fa44d6066 100644
--- a/frame/2/trmv/bli_trmv_var.h
+++ b/frame/2/trmv/bli_trmv_var.h
@@ -40,7 +40,7 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        obj_t*  alpha, \
        obj_t*  a, \
diff --git a/frame/2/trmv/bli_trmv_var_oapi.c b/frame/2/trmv/bli_trmv_var_oapi.c
index c74d31223..092e7216c 100644
--- a/frame/2/trmv/bli_trmv_var_oapi.c
+++ b/frame/2/trmv/bli_trmv_var_oapi.c
@@ -37,7 +37,7 @@
 #undef  GENFRONT
 #define GENFRONT( opname, varname ) \
 \
-void PASTEMAC0(varname) \
+void PASTEMAC(varname) \
      ( \
        obj_t*  alpha, \
        obj_t*  a, \
@@ -67,7 +67,7 @@ void PASTEMAC0(varname) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,_unb,_vft) f = \
+	PASTECH(opname,_unb,_vft) f = \
 	PASTEMAC(varname,_qfp)( dt ); \
 \
 	f \
diff --git a/frame/2/trsv/bli_trsv_unb_var1.c b/frame/2/trsv/bli_trsv_unb_var1.c
index 126e3eb7c..99ddce861 100644
--- a/frame/2/trsv/bli_trsv_unb_var1.c
+++ b/frame/2/trsv/bli_trsv_unb_var1.c
@@ -81,7 +81,7 @@ void PASTEMAC(ch,varname) \
 	conja = bli_extract_conj( transa ); \
 \
 	/* x = alpha * x; */ \
-	PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 	( \
 	  BLIS_NO_CONJUGATE, \
 	  m, \
diff --git a/frame/2/trsv/bli_trsv_unb_var2.c b/frame/2/trsv/bli_trsv_unb_var2.c
index 941cf43b9..aed530c2d 100644
--- a/frame/2/trsv/bli_trsv_unb_var2.c
+++ b/frame/2/trsv/bli_trsv_unb_var2.c
@@ -81,7 +81,7 @@ void PASTEMAC(ch,varname) \
 	conja = bli_extract_conj( transa ); \
 \
 	/* x = alpha * x; */ \
-	PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 	( \
 	  BLIS_NO_CONJUGATE, \
 	  m, \
diff --git a/frame/2/trsv/bli_trsv_unf_var1.c b/frame/2/trsv/bli_trsv_unf_var1.c
index d64bba63f..109184a7c 100644
--- a/frame/2/trsv/bli_trsv_unf_var1.c
+++ b/frame/2/trsv/bli_trsv_unf_var1.c
@@ -75,7 +75,7 @@ void PASTEMAC(ch,varname) \
 	conj_t  conja; \
 \
 	/* x = alpha * x; */ \
-	PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 	( \
 	  BLIS_NO_CONJUGATE, \
 	  m, \
diff --git a/frame/2/trsv/bli_trsv_unf_var2.c b/frame/2/trsv/bli_trsv_unf_var2.c
index f73d32413..5055b9a62 100644
--- a/frame/2/trsv/bli_trsv_unf_var2.c
+++ b/frame/2/trsv/bli_trsv_unf_var2.c
@@ -74,7 +74,7 @@ void PASTEMAC(ch,varname) \
 	conj_t  conja; \
 \
 	/* x = alpha * x; */ \
-	PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 	( \
 	  BLIS_NO_CONJUGATE, \
 	  m, \
diff --git a/frame/2/trsv/bli_trsv_var.h b/frame/2/trsv/bli_trsv_var.h
index 35e8db301..398a17352 100644
--- a/frame/2/trsv/bli_trsv_var.h
+++ b/frame/2/trsv/bli_trsv_var.h
@@ -40,7 +40,7 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        obj_t*  alpha, \
        obj_t*  a, \
diff --git a/frame/2/trsv/bli_trsv_var_oapi.c b/frame/2/trsv/bli_trsv_var_oapi.c
index 62ac33e45..a3fb25155 100644
--- a/frame/2/trsv/bli_trsv_var_oapi.c
+++ b/frame/2/trsv/bli_trsv_var_oapi.c
@@ -37,7 +37,7 @@
 #undef  GENFRONT
 #define GENFRONT( opname, varname ) \
 \
-void PASTEMAC0(varname) \
+void PASTEMAC(varname) \
      ( \
        obj_t*  alpha, \
        obj_t*  a, \
@@ -67,7 +67,7 @@ void PASTEMAC0(varname) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,_unb,_vft) f = \
+	PASTECH(opname,_unb,_vft) f = \
 	PASTEMAC(varname,_qfp)( dt ); \
 \
 	f \
diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h
index 3e50275b3..3cf0be21d 100644
--- a/frame/3/bli_l3.h
+++ b/frame/3/bli_l3.h
@@ -33,13 +33,12 @@
 
 */
 
+#include "bli_l3_util.h"
 #include "bli_l3_thrinfo.h"
 #include "bli_l3_decor.h"
 #include "bli_l3_sup_decor.h"
 
-#include "bli_l3_cntl.h"
 #include "bli_l3_check.h"
-#include "bli_l3_int.h"
 #include "bli_l3_packab.h"
 
 // Define function types.
@@ -47,10 +46,8 @@
 #include "bli_l3_oft.h"
 #include "bli_l3_oft_var.h"
 
-#include "bli_l3_blocksize.h"
-#include "bli_l3_direct.h"
+#include "bli_l3_int.h"
 #include "bli_l3_prune.h"
-#include "bli_l3_schema.h"
 
 // Prototype object APIs (basic and expert).
 #include "bli_l3_oapi.h"
@@ -86,9 +83,6 @@
 
 // Operation-specific headers.
 #include "bli_gemm.h"
-#include "bli_hemm.h"
-#include "bli_symm.h"
 #include "bli_trmm.h"
-#include "bli_trmm3.h"
 #include "bli_trsm.h"
 #include "bli_gemmt.h"
diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c
deleted file mode 100644
index 586aeb6ea..000000000
--- a/frame/3/bli_l3_blocksize.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-
-void bli_l3_adjust_kc
-      (
-        const obj_t*  a,
-        const obj_t*  b,
-              dim_t*  b_alg,
-              dim_t*  b_max,
-        const cntx_t* cntx,
-        const cntl_t* cntl
-      )
-{
-	const opid_t family = bli_cntl_family( cntl );
-	const num_t  dt     = bli_obj_exec_dt( a );
-	      dim_t  mnr    = 1;
-
-	// Nudge the default and maximum kc blocksizes up to the nearest
-	// multiple of MR if A is Hermitian, symmetric, or triangular or
-	// NR if B is Hermitian, symmetric, or triangular. If neither case
-	// applies, then we leave the blocksizes unchanged. For trsm we
-	// always use MR (rather than sometimes using NR) because even
-	// when the triangle is on the right, packing of that matrix uses
-	// MR, since only left-side trsm micro-kernels are supported.
-	if ( !bli_obj_root_is_general( a ) || family == BLIS_TRSM )
-	{
-		mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
-	}
-	else if ( !bli_obj_root_is_general( b ) )
-	{
-		mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
-	}
-
-	*b_alg = bli_align_dim_to_mult( *b_alg, mnr );
-	*b_max = bli_align_dim_to_mult( *b_max, mnr );
-}
-
-dim_t bli_l3_determine_kc
-      (
-              dir_t   direct,
-              dim_t   i,
-              dim_t   dim,
-        const obj_t*  a,
-        const obj_t*  b,
-              bszid_t bszid,
-        const cntx_t* cntx,
-        const cntl_t* cntl
-      )
-{
-	const num_t    dt    = bli_obj_exec_dt( a );
-	const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx );
-	      dim_t    b_alg = bli_blksz_get_def( dt, bsize );
-	      dim_t    b_max = bli_blksz_get_max( dt, bsize );
-
-	bli_l3_adjust_kc( a, b, &b_alg, &b_max, cntx, cntl );
-
-	if ( direct == BLIS_FWD )
-		return bli_determine_blocksize_f_sub( i, dim, b_alg, b_max );
-	else
-		return bli_determine_blocksize_b_sub( i, dim, b_alg, b_max );
-}
-
diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h
deleted file mode 100644
index 843d5f241..000000000
--- a/frame/3/bli_l3_blocksize.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_l3_adjust_kc
-      (
-        const obj_t*  a,
-        const obj_t*  b,
-              dim_t*  b_alg,
-              dim_t*  b_max,
-        const cntx_t* cntx,
-        const cntl_t* cntl
-      );
-
-dim_t bli_l3_determine_kc
-      (
-              dir_t   direct,
-              dim_t   i,
-              dim_t   dim,
-        const obj_t*  a,
-        const obj_t*  b,
-              bszid_t bszid,
-        const cntx_t* cntx,
-        const cntl_t* cntl
-      );
diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c
index 9ac0a7fbb..93146bc18 100644
--- a/frame/3/bli_l3_check.c
+++ b/frame/3/bli_l3_check.c
@@ -391,7 +391,7 @@ void bli_trsm_check
 
 	// Perform checks common to hemm/symm/trmm/trsm.
 
-	bli_hemm_basic_check( side, alpha, a, b, &BLIS_ZERO, b, cntx );
+	bli_trsm_basic_check( side, alpha, a, b, &BLIS_ZERO, b, cntx );
 
 	// Check matrix squareness.
 
@@ -430,33 +430,8 @@ void bli_gemm_basic_check
 	e_val = bli_check_level3_dims( a, b, c );
 	bli_check_error_code( e_val );
 
-#ifdef BLIS_ENABLE_GEMM_MD
 	// Skip checking for consistent datatypes between A, B, and C since
 	// that is totally valid for mixed-datatype gemm.
-
-	// When mixing datatypes, make sure that alpha does not have a non-zero
-	// imaginary component.
-	if ( bli_obj_dt( c ) != bli_obj_dt( a ) ||
-	     bli_obj_dt( c ) != bli_obj_dt( b ) ||
-	     bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
-	if ( !bli_obj_imag_is_zero( alpha ) )
-	{
-		bli_print_msg( "Mixed-datatype gemm does not yet support alpha with a non-zero imaginary component. Please contact BLIS developers for further support.", __FILE__, __LINE__ );
-		bli_abort();
-	}
-
-#else // BLIS_DISABLE_GEMM_MD
-
-	// Check for consistent datatypes.
-	// NOTE: We only perform these tests when mixed datatype support is
-	// disabled.
-
-	e_val = bli_check_consistent_object_datatypes( c, a );
-	bli_check_error_code( e_val );
-
-	e_val = bli_check_consistent_object_datatypes( c, b );
-	bli_check_error_code( e_val );
-#endif
 }
 
 void bli_gemmt_basic_check
@@ -482,11 +457,8 @@ void bli_gemmt_basic_check
 
 	// Check for consistent datatypes.
 
-	e_val = bli_check_consistent_object_datatypes( c, a );
-	bli_check_error_code( e_val );
-
-	e_val = bli_check_consistent_object_datatypes( c, b );
-	bli_check_error_code( e_val );
+	// Skip checking for consistent datatypes between A, B, and C since
+	// that is totally valid for mixed-datatype gemmt.
 }
 
 void bli_hemm_basic_check
@@ -519,6 +491,40 @@ void bli_hemm_basic_check
 		bli_check_error_code( e_val );
 	}
 
+	// Skip checking for consistent datatypes between A, B, and C since
+	// that is totally valid for mixed-datatype hemm.
+}
+
+void bli_trsm_basic_check
+     (
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
+     )
+{
+	err_t e_val;
+
+	// Perform standard checks.
+
+	bli_l3_basic_check( alpha, a, b, beta, c, cntx );
+
+	// Check object dimensions.
+
+	if ( bli_is_left( side ) )
+	{
+		e_val = bli_check_level3_dims( a, b, c );
+		bli_check_error_code( e_val );
+	}
+	else // if ( bli_is_right( side ) )
+	{
+		e_val = bli_check_level3_dims( b, a, c );
+		bli_check_error_code( e_val );
+	}
+
 	// Check for consistent datatypes.
 
 	e_val = bli_check_consistent_object_datatypes( c, a );
@@ -551,11 +557,8 @@ void bli_herk_basic_check
 
 	// Check for consistent datatypes.
 
-	e_val = bli_check_consistent_object_datatypes( c, a );
-	bli_check_error_code( e_val );
-
-	e_val = bli_check_consistent_object_datatypes( c, ah );
-	bli_check_error_code( e_val );
+	// Skip checking for consistent datatypes between A, B, and C since
+	// that is totally valid for mixed-datatype herk.
 }
 
 void bli_her2k_basic_check
@@ -585,19 +588,8 @@ void bli_her2k_basic_check
 	e_val = bli_check_level3_dims( b, ah, c );
 	bli_check_error_code( e_val );
 
-	// Check for consistent datatypes.
-
-	e_val = bli_check_consistent_object_datatypes( c, a );
-	bli_check_error_code( e_val );
-
-	e_val = bli_check_consistent_object_datatypes( c, ah );
-	bli_check_error_code( e_val );
-
-	e_val = bli_check_consistent_object_datatypes( c, b );
-	bli_check_error_code( e_val );
-
-	e_val = bli_check_consistent_object_datatypes( c, bh );
-	bli_check_error_code( e_val );
+	// Skip checking for consistent datatypes between A, B, and C since
+	// that is totally valid for mixed-datatype her2k.
 }
 
 void bli_l3_basic_check
diff --git a/frame/3/bli_l3_check.h b/frame/3/bli_l3_check.h
index 8551b6b61..ef59cb60f 100644
--- a/frame/3/bli_l3_check.h
+++ b/frame/3/bli_l3_check.h
@@ -140,6 +140,17 @@ void bli_hemm_basic_check
        const cntx_t* cntx
      );
 
+void bli_trsm_basic_check
+     (
+             side_t  side,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx
+     );
+
 void bli_herk_basic_check
      (
        const obj_t*  alpha,
diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c
deleted file mode 100644
index 27d140143..000000000
--- a/frame/3/bli_l3_cntl.c
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-
-void bli_l3_cntl_create_if
-     (
-             opid_t   family,
-             pack_t   schema_a,
-             pack_t   schema_b,
-       const obj_t*   a,
-       const obj_t*   b,
-       const obj_t*   c,
-             pool_t*  pool,
-       const cntl_t*  cntl_orig,
-             cntl_t** cntl_use
-     )
-{
-	// If the control tree pointer is NULL, we construct a default
-	// tree as a function of the operation family.
-	if ( cntl_orig == NULL )
-	{
-		if ( family == BLIS_GEMM ||
-		     family == BLIS_GEMMT ||
-		     family == BLIS_TRMM )
-		{
-			*cntl_use = bli_gemm_cntl_create
-			(
-			  pool,
-			  family,
-			  schema_a,
-			  schema_b,
-			  bli_obj_ker_fn( c )
-			);
-		}
-		else // if ( family == BLIS_TRSM )
-		{
-			side_t side;
-
-			// NOTE: We no longer ever use right-sided trsm, and therefore this
-			// function will only ever get called with side = BLIS_LEFT, which
-			// means that in the future, we can remove the a, b, and c operands
-			// from the function signature. (This assumes that the call to
-			// bli_obj_ker_fn( c ) is replaced in some future reorganization
-			// that moves the .ker_fn argument from obj_t to, say, the rntm_t.)
-			if ( bli_obj_is_triangular( a ) ) side = BLIS_LEFT;
-			else                              side = BLIS_RIGHT;
-
-			*cntl_use = bli_trsm_cntl_create
-			(
-			  pool,
-			  side,
-			  schema_a,
-			  schema_b,
-			  bli_obj_ker_fn( c )
-			);
-		}
-	}
-	else
-	{
-		// If the user provided a control tree, create a copy and use it
-		// instead (so that threads can use its local tree as a place to
-		// cache things like pack mem_t entries).
-		*cntl_use = bli_cntl_copy( pool, cntl_orig );
-
-		// Recursively set the family fields of the newly copied control tree
-		// nodes.
-		bli_cntl_mark_family( family, *cntl_use );
-	}
-}
-
-void bli_l3_cntl_free
-     (
-       pool_t* pool,
-       cntl_t* cntl_use
-     )
-{
-	// NOTE: We don't actually need to call separate _cntl_free() functions
-	// for gemm and trsm; it is merely an unnecessary mirroring of behavior
-	// from the _create() side (which must call different functions based
-	// on the family).
-
-	opid_t family = bli_cntl_family( cntl_use );
-
-	if ( family == BLIS_GEMM ||
-	     family == BLIS_GEMMT ||
-	     family == BLIS_TRMM )
-	{
-		bli_gemm_cntl_free( pool, cntl_use );
-	}
-	else // if ( family == BLIS_TRSM )
-	{
-		bli_trsm_cntl_free( pool, cntl_use );
-	}
-}
-
diff --git a/frame/3/bli_l3_cntl.h b/frame/3/bli_l3_cntl.h
deleted file mode 100644
index 68e837663..000000000
--- a/frame/3/bli_l3_cntl.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-
-//
-// Prototype conditional control tree creation functions.
-//
-
-void bli_l3_cntl_create_if
-     (
-             opid_t   family,
-             pack_t   schema_a,
-             pack_t   schema_b,
-       const obj_t*   a,
-       const obj_t*   b,
-       const obj_t*   c,
-             pool_t*  pool,
-       const cntl_t*  cntl_orig,
-             cntl_t** cntl_use
-     );
-
-BLIS_EXPORT_BLIS void bli_l3_cntl_free
-     (
-       pool_t* pool,
-       cntl_t* cntl_use
-     );
-
diff --git a/frame/3/bli_l3_decor.c b/frame/3/bli_l3_decor.c
index dc1d3bb1b..3fb2f0ba1 100644
--- a/frame/3/bli_l3_decor.c
+++ b/frame/3/bli_l3_decor.c
@@ -36,14 +36,11 @@
 
 struct l3_decor_params_s
 {
-	      l3int_ft func;
-	      opid_t   family;
-	const obj_t*   alpha;
 	const obj_t*   a;
 	const obj_t*   b;
-	const obj_t*   beta;
 	const obj_t*   c;
 	const cntx_t*  cntx;
+	const cntl_t*  cntl;
 	      rntm_t*  rntm;
 	      array_t* array;
 };
@@ -53,66 +50,31 @@ static void bli_l3_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, const
 {
 	const l3_decor_params_t* data    = data_void;
 
-	const l3int_ft           func    = data->func;
-	const opid_t             family  = data->family;
-	const obj_t*             alpha   = data->alpha;
 	const obj_t*             a       = data->a;
 	const obj_t*             b       = data->b;
-	const obj_t*             beta    = data->beta;
 	const obj_t*             c       = data->c;
 	const cntx_t*            cntx    = data->cntx;
+	const cntl_t*            cntl    = data->cntl;
 	      rntm_t*            rntm    = data->rntm;
 	      array_t*           array   = data->array;
 
 	bli_l3_thread_decorator_thread_check( gl_comm, rntm );
 
-	// Alias thread-local copies of A, B, and C. These will be the objects
-	// we pass down the algorithmic function stack. Making thread-local
-	// aliases is highly recommended in case a thread needs to change any
-	// of the properties of an object without affecting other threads'
-	// objects.
-	obj_t a_t, b_t, c_t;
-	bli_obj_alias_to( a, &a_t );
-	bli_obj_alias_to( b, &b_t );
-	bli_obj_alias_to( c, &c_t );
-
-	// This is part of a hack to support mixed domain in bli_gemm_front().
-	// Sometimes we need to specify a non-standard schema for A and B, and
-	// we decided to transmit them via the schema field in the obj_t's
-	// rather than pass them in as function parameters. Once the values
-	// have been read, we immediately reset them back to their expected
-	// values for unpacked objects.
-	pack_t schema_a = bli_obj_pack_schema( &a_t );
-	pack_t schema_b = bli_obj_pack_schema( &b_t );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t );
-
-	// Create a default control tree for the operation, if needed.
-	cntl_t* cntl_use;
-	pool_t* sba_pool = bli_sba_array_elem( tid, array );
-	bli_l3_cntl_create_if( family, schema_a, schema_b,
-	                       &a_t, &b_t, &c_t, sba_pool, NULL, &cntl_use );
-
 	// Create the root node of the current thread's thrinfo_t structure.
 	// The root node is the *parent* of the node corresponding to the first
 	// control tree node.
-	thrinfo_t* thread = bli_l3_thrinfo_create( tid, gl_comm, array, rntm, cntl_use );
+	thrinfo_t* thread = bli_l3_thrinfo_create( tid, gl_comm, array, rntm, cntl );
 
-	func
+	bli_l3_int
 	(
-	  alpha,
-	  &a_t,
-	  &b_t,
-	  beta,
-	  &c_t,
+	  a,
+	  b,
+	  c,
 	  cntx,
-	  cntl_use,
+	  cntl,
 	  thread
 	);
 
-	// Free the thread's local control tree.
-	bli_l3_cntl_free( sba_pool, cntl_use );
-
 	// Free the current thread's thrinfo_t structure.
 	// NOTE: The barrier here is very important as it prevents memory being
 	// released by the chief of some thread sub-group before its peers are done
@@ -124,18 +86,27 @@ static void bli_l3_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, const
 
 void bli_l3_thread_decorator
      (
-             l3int_ft func,
-             opid_t   family,
-       const obj_t*   alpha,
        const obj_t*   a,
        const obj_t*   b,
-       const obj_t*   beta,
        const obj_t*   c,
        const cntx_t*  cntx,
+       const cntl_t*  cntl,
        const rntm_t*  rntm
      )
 {
-	rntm_t rntm_l = *rntm;
+	rntm_t rntm_l;
+	if ( rntm != NULL ) rntm_l = *rntm;
+	else bli_rntm_init_from_global( &rntm_l );
+
+	// Set the number of ways for each loop, if needed, depending on what
+	// kind of information is already stored in the rntm_t object.
+	bli_rntm_factorize
+	(
+	  bli_obj_length( c ),
+	  bli_obj_width( c ),
+	  bli_obj_width( a ),
+	  &rntm_l
+	);
 
 	// Query the threading implementation and the number of threads requested.
 	timpl_t ti = bli_rntm_thread_impl( &rntm_l );
@@ -189,14 +160,11 @@ void bli_l3_thread_decorator
 	array_t* array = bli_sba_checkout_array( nt );
 
 	l3_decor_params_t params;
-	params.func     = func;
-	params.family   = family;
-	params.alpha    = alpha;
 	params.a        = a;
 	params.b        = b;
-	params.beta     = beta;
 	params.c        = c;
 	params.cntx     = cntx;
+	params.cntl     = cntl;
 	params.rntm     = &rntm_l;
 	params.array    = array;
 
diff --git a/frame/3/bli_l3_decor.h b/frame/3/bli_l3_decor.h
index e00b8ed49..b3cc1ed2e 100644
--- a/frame/3/bli_l3_decor.h
+++ b/frame/3/bli_l3_decor.h
@@ -36,30 +36,14 @@
 #ifndef BLIS_L3_DECOR_H
 #define BLIS_L3_DECOR_H
 
-// Level-3 internal function type.
-typedef void (*l3int_ft)
-     (
-       const obj_t*     alpha,
-       const obj_t*     a,
-       const obj_t*     b,
-       const obj_t*     beta,
-       const obj_t*     c,
-       const cntx_t*    cntx,
-       const cntl_t*    cntl,
-             thrinfo_t* thread
-     );
-
 // Level-3 thread decorator prototype.
 void bli_l3_thread_decorator
      (
-             l3int_ft func,
-             opid_t   family,
-       const obj_t*   alpha,
        const obj_t*   a,
        const obj_t*   b,
-       const obj_t*   beta,
        const obj_t*   c,
        const cntx_t*  cntx,
+       const cntl_t*  cntl,
        const rntm_t*  rntm
      );
 
diff --git a/frame/3/bli_l3_direct.c b/frame/3/bli_l3_direct.c
deleted file mode 100644
index bbc4af7a0..000000000
--- a/frame/3/bli_l3_direct.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-dir_t bli_l3_direct
-     (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntl_t* cntl
-     )
-{
-	// Query the operation family.
-	opid_t family = bli_cntl_family( cntl );
-
-	if      ( family == BLIS_GEMM ) return bli_gemm_direct( a, b, c );
-	else if ( family == BLIS_GEMMT ) return bli_gemmt_direct( a, b, c );
-	else if ( family == BLIS_TRMM ) return bli_trmm_direct( a, b, c );
-	else if ( family == BLIS_TRSM ) return bli_trsm_direct( a, b, c );
-
-	// This should never execute.
-	return BLIS_FWD;
-}
-
-// -----------------------------------------------------------------------------
-
-dir_t bli_gemm_direct
-     (
-       const obj_t* a,
-       const obj_t* b,
-       const obj_t* c
-     )
-{
-	// For gemm, movement may be forwards (or backwards).
-
-	return BLIS_FWD;
-}
-
-dir_t bli_gemmt_direct
-     (
-       const obj_t* a,
-       const obj_t* b,
-       const obj_t* c
-     )
-{
-	// For gemmt, movement may be forwards (or backwards).
-
-	return BLIS_FWD;
-}
-
-dir_t bli_trmm_direct
-     (
-       const obj_t* a,
-       const obj_t* b,
-       const obj_t* c
-     )
-{
-	dir_t direct;
-
-	// For trmm, movement for the parameter cases is as follows:
-	// - left,lower:  backwards
-	// - left,upper:  forwards
-	// - right,lower: forwards
-	// - right,upper: backwards
-
-	if ( bli_obj_root_is_triangular( a ) )
-	{
-		if ( bli_obj_root_is_lower( a ) ) direct = BLIS_BWD;
-		else                              direct = BLIS_FWD;
-	}
-	else // if ( bli_obj_root_is_triangular( b ) )
-	{
-		if ( bli_obj_root_is_lower( b ) ) direct = BLIS_FWD;
-		else                              direct = BLIS_BWD;
-	}
-
-	return direct;
-}
-
-dir_t bli_trsm_direct
-     (
-       const obj_t* a,
-       const obj_t* b,
-       const obj_t* c
-     )
-{
-	dir_t direct;
-
-	// For trsm, movement for the parameter cases is as follows:
-	// - left,lower:  forwards
-	// - left,upper:  backwards
-	// - right,lower: backwards
-	// - right,upper: forwards
-
-	if ( bli_obj_root_is_triangular( a ) )
-	{
-		if ( bli_obj_root_is_lower( a ) ) direct = BLIS_FWD;
-		else                              direct = BLIS_BWD;
-	}
-	else // if ( bli_obj_root_is_triangular( b ) )
-	{
-		if ( bli_obj_root_is_lower( b ) ) direct = BLIS_BWD;
-		else                              direct = BLIS_FWD;
-	}
-
-	return direct;
-}
-
diff --git a/frame/3/bli_l3_ind_ukr.h b/frame/3/bli_l3_ind_ukr.h
index 141c862df..5dc53b759 100644
--- a/frame/3/bli_l3_ind_ukr.h
+++ b/frame/3/bli_l3_ind_ukr.h
@@ -51,7 +51,7 @@ void PASTEMAC(ch,opname) \
        const ctype*     b, \
        const ctype*     beta, \
              ctype*     c, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      );
 
@@ -72,27 +72,10 @@ void PASTEMAC(ch,opname) \
        const ctype*     bx1, \
              ctype*     b11, \
              ctype*     c11, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      );
 
 INSERT_GENTPROT_BASIC( gemmtrsm1m_l_ukr_name )
 INSERT_GENTPROT_BASIC( gemmtrsm1m_u_ukr_name )
 
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname) \
-     ( \
-       const ctype*     a, \
-             ctype*     b, \
-             ctype*     c, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
-       const cntx_t*    cntx  \
-     );
-
-INSERT_GENTPROT_BASIC( trsm1m_l_ukr_name )
-INSERT_GENTPROT_BASIC( trsm1m_u_ukr_name )
-
-
diff --git a/frame/3/bli_l3_int.c b/frame/3/bli_l3_int.c
deleted file mode 100644
index 70e6be3a9..000000000
--- a/frame/3/bli_l3_int.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_l3_int
-     (
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
-             thrinfo_t* thread
-     )
-{
-	obj_t a_local;
-	obj_t b_local;
-	obj_t c_local;
-
-	// Return early if the current control tree node is NULL.
-	if ( bli_cntl_is_null( cntl ) ) return;
-
-	// Check parameters.
-	if ( bli_error_checking_is_enabled() )
-		bli_gemm_basic_check( alpha, a, b, beta, c, cntx );
-
-	// If C has a zero dimension, return early.
-	if ( bli_obj_has_zero_dim( c ) )
-	{
-		return;
-	}
-
-	// If A or B has a zero dimension, scale C by beta and return early.
-	if ( bli_obj_has_zero_dim( a ) ||
-	     bli_obj_has_zero_dim( b ) )
-	{
-		if ( bli_thrinfo_am_chief( thread ) )
-			bli_scalm( beta, c );
-		bli_thrinfo_barrier( thread );
-		return;
-	}
-
-	// If A or B is marked as being filled with zeros, scale C by beta and
-	// return early.
-	if ( bli_obj_is_zeros( a ) ||
-	     bli_obj_is_zeros( b ) )
-	{
-		// This should never execute.
-		bli_abort();
-
-		if ( bli_thrinfo_am_chief( thread ) )
-			bli_scalm( beta, c );
-		bli_thrinfo_barrier( thread );
-		return;
-	}
-
-	// Alias A, B, and C in case we need to update attached scalars.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( b, &b_local );
-	bli_obj_alias_to( c, &c_local );
-
-	// Ensure that a valid packing function is set on A and B.
-	if ( !bli_obj_pack_fn( &a_local ) )
-		bli_obj_set_pack_fn( bli_packm_blk_var1, &a_local );
-
-	if ( !bli_obj_pack_fn( &b_local ) )
-		bli_obj_set_pack_fn( bli_packm_blk_var1, &b_local );
-
-	// If we are about to call a leaf-level implementation, and matrix C
-	// still needs a transposition, then we must induce one by swapping the
-	// strides and dimensions. Note that this transposition would normally
-	// be handled explicitly in the packing of C, but if C is not being
-	// packed, this is our last chance to handle the transposition.
-	//if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) )
-	if ( bli_obj_has_trans( c ) )
-	{
-		bli_obj_induce_trans( &c_local );
-		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local );
-	}
-
-	// If alpha is non-unit, typecast and apply it to the scalar attached
-	// to B, unless it happens to be triangular.
-	if ( bli_obj_root_is_triangular( b ) )
-	{
-		if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
-			bli_obj_scalar_apply_scalar( alpha, &a_local );
-	}
-	else // if ( bli_obj_root_is_triangular( b ) )
-	{
-		if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
-			bli_obj_scalar_apply_scalar( alpha, &b_local );
-	}
-
-	// If beta is non-unit, typecast and apply it to the scalar attached
-	// to C.
-	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
-		bli_obj_scalar_apply_scalar( beta, &c_local );
-
-	// Extract the function pointer from the current control tree node.
-	l3_var_oft f = bli_cntl_var_func( cntl );
-
-	// Invoke the variant.
-	f
-	(
-	  &a_local,
-	  &b_local,
-	  &c_local,
-	  cntx,
-	  cntl,
-	  thread
-	);
-}
-
diff --git a/frame/3/bli_l3_int.h b/frame/3/bli_l3_int.h
index 8364d91e4..d75042dda 100644
--- a/frame/3/bli_l3_int.h
+++ b/frame/3/bli_l3_int.h
@@ -32,15 +32,29 @@
 
 */
 
-void bli_l3_int
+
+BLIS_INLINE void bli_l3_int
      (
-       const obj_t*  alpha,
        const obj_t*  a,
        const obj_t*  b,
-       const obj_t*  beta,
        const obj_t*  c,
        const cntx_t* cntx,
        const cntl_t* cntl,
              thrinfo_t* thread
-     );
+     )
+{
+	// Extract the function pointer from the current control tree node.
+	l3_var_oft f = ( l3_var_oft )bli_cntl_var_func( cntl );
+
+	// Invoke the variant.
+	f
+	(
+	  a,//&a_local,
+	  b,//&b_local,
+	  c,//&c_local,
+	  cntx,
+	  cntl,
+	  thread
+	);
+}
 
diff --git a/frame/3/bli_l3_oapi.c b/frame/3/bli_l3_oapi.c
index 0365a198c..2344b9eb8 100644
--- a/frame/3/bli_l3_oapi.c
+++ b/frame/3/bli_l3_oapi.c
@@ -41,7 +41,7 @@
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t* alpha, \
        const obj_t* a, \
@@ -64,7 +64,7 @@ GENFRONT( syr2k )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
              side_t side, \
        const obj_t* alpha, \
@@ -87,7 +87,7 @@ GENFRONT( trmm3 )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t* alpha, \
        const obj_t* a, \
@@ -107,7 +107,7 @@ GENFRONT( syrk )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
              side_t side, \
        const obj_t* alpha, \
diff --git a/frame/3/bli_l3_oapi.h b/frame/3/bli_l3_oapi.h
index 7161a3bf3..12fb68e97 100644
--- a/frame/3/bli_l3_oapi.h
+++ b/frame/3/bli_l3_oapi.h
@@ -41,7 +41,7 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
        const obj_t* alpha, \
        const obj_t* a, \
@@ -59,7 +59,7 @@ GENPROT( syr2k )
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
              side_t side, \
        const obj_t* alpha, \
@@ -77,7 +77,7 @@ GENPROT( trmm3 )
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
        const obj_t* alpha, \
        const obj_t* a, \
@@ -92,7 +92,7 @@ GENPROT( syrk )
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
              side_t side, \
        const obj_t* alpha, \
diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c
index 76234525d..fcb51bb45 100644
--- a/frame/3/bli_l3_oapi_ex.c
+++ b/frame/3/bli_l3_oapi_ex.c
@@ -34,15 +34,18 @@
 
 #include "blis.h"
 
+
 //
 // Define object-based interfaces (expert).
 //
 
 // If a sandbox was enabled, we forgo defining bli_gemm_ex() since it will be
 // defined in the sandbox environment.
-#ifndef BLIS_ENABLE_SANDBOX
-
+#ifdef BLIS_ENABLE_SANDBOX
+void PASTEMAC(gemm_def,BLIS_OAPI_EX_SUF)
+#else
 void PASTEMAC(gemm,BLIS_OAPI_EX_SUF)
+#endif
      (
        const obj_t*  alpha,
        const obj_t*  a,
@@ -55,43 +58,22 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF)
 {
 	bli_init_once();
 
-	// If C has a zero dimension, return early.
-	if ( bli_obj_has_zero_dim( c ) ) return;
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_gemm_check( alpha, a, b, beta, c, cntx );
 
-	// If alpha is zero, or if A or B has a zero dimension, scale C by beta
-	// and return early.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
-	     bli_obj_has_zero_dim( a ) ||
-	     bli_obj_has_zero_dim( b ) )
-	{
-		bli_scalm( beta, c );
+	// Check for zero dimensions, alpha == 0, or other conditions which
+	// mean that we don't actually have to perform a full l3 operation.
+	if ( bli_l3_return_early_if_trivial( alpha, a, b, beta, c ) == BLIS_SUCCESS )
 		return;
-	}
-
-	// If the rntm is non-NULL, it may indicate that we should forgo sup
-	// handling altogether.
-	bool enable_sup = TRUE;
-	if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm );
 
-	if ( enable_sup )
-	{
-		// Execute the small/unpacked oapi handler. If it finds that the problem
-		// does not fall within the thresholds that define "small", or for some
-		// other reason decides not to use the small/unpacked implementation,
-		// the function returns with BLIS_FAILURE, which causes execution to
-		// proceed towards the conventional implementation.
-		err_t result = bli_gemmsup( alpha, a, b, beta, c, cntx, rntm );
-		if ( result == BLIS_SUCCESS )
-		{
-			return;
-		}
-	}
-
-	// Initialize a local runtime with global settings if necessary. Note
-	// that in the case that a runtime is passed in, we make a local copy.
-	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
-	else                { rntm_l = *rntm;                       }
+	// Execute the small/unpacked oapi handler. If it finds that the problem
+	// does not fall within the thresholds that define "small", or for some
+	// other reason decides not to use the small/unpacked implementation,
+	// the function returns with BLIS_FAILURE, which causes execution to
+	// proceed towards the conventional implementation.
+	if ( bli_gemmsup( alpha, a, b, beta, c, cntx, rntm ) == BLIS_SUCCESS )
+		return;
 
 	// Default to using native execution.
 	num_t dt = bli_obj_dt( c );
@@ -115,18 +97,55 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF)
 
 	// If necessary, obtain a valid context from the gks using the induced
 	// method id determined above.
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im );
+	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
 
-	// Check the operands.
-	if ( bli_error_checking_is_enabled() )
-		bli_gemm_check( alpha, a, b, beta, c, cntx );
+#if 0
+#ifdef BLIS_ENABLE_SMALL_MATRIX
+	// Only handle small problems separately for homogeneous datatypes.
+	if ( bli_obj_dt( a ) == bli_obj_dt( b ) &&
+	     bli_obj_dt( a ) == bli_obj_dt( c ) &&
+	     bli_obj_comp_prec( c ) == bli_obj_prec( c ) )
+	{
+		err_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl );
+		if ( status == BLIS_SUCCESS ) return;
+	}
+#endif
+#endif
 
-	// Invoke the operation's front-end and request the default control tree.
-	bli_gemm_front( alpha, a, b, beta, c, cntx, &rntm_l );
+	// Alias A, B, and C in case we need to apply transformations.
+	obj_t a_local;
+	obj_t b_local;
+	obj_t c_local;
+	bli_obj_alias_submatrix( a, &a_local );
+	bli_obj_alias_submatrix( b, &b_local );
+	bli_obj_alias_submatrix( c, &c_local );
+
+	gemm_cntl_t cntl;
+	bli_gemm_cntl_init
+	(
+	  im,
+	  BLIS_GEMM,
+	  alpha,
+	  &a_local,
+	  &b_local,
+	  beta,
+	  &c_local,
+	  cntx,
+	  &cntl
+	);
+
+	// Invoke the internal back-end via the thread handler.
+	bli_l3_thread_decorator
+	(
+	  &a_local,
+	  &b_local,
+	  &c_local,
+	  cntx,
+	  ( cntl_t* )&cntl,
+	  rntm
+	);
 }
 
-#endif
-
 
 void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)
      (
@@ -141,24 +160,14 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)
 {
 	bli_init_once();
 
-	// If C has a zero dimension, return early.
-	if ( bli_obj_has_zero_dim( c ) ) return;
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_gemmt_check( alpha, a, b, beta, c, cntx );
 
-	// If alpha is zero, or if A or B has a zero dimension, scale C by beta
-	// and return early.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
-	     bli_obj_has_zero_dim( a ) ||
-	     bli_obj_has_zero_dim( b ) )
-	{
-		bli_scalm( beta, c );
+	// Check for zero dimensions, alpha == 0, or other conditions which
+	// mean that we don't actually have to perform a full l3 operation.
+	if ( bli_l3_return_early_if_trivial( alpha, a, b, beta, c ) == BLIS_SUCCESS )
 		return;
-	}
-
-	// Initialize a local runtime with global settings if necessary. Note
-	// that in the case that a runtime is passed in, we make a local copy.
-	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
-	else                { rntm_l = *rntm;                       }
 
 	// Default to using native execution.
 	num_t dt = bli_obj_dt( c );
@@ -179,14 +188,40 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)
 
 	// If necessary, obtain a valid context from the gks using the induced
 	// method id determined above.
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im );
-
-	// Check the operands.
-	if ( bli_error_checking_is_enabled() )
-		bli_gemmt_check( alpha, a, b, beta, c, cntx );
-
-	// Invoke the operation's front-end and request the default control tree.
-	bli_gemmt_front( alpha, a, b, beta, c, cntx, &rntm_l );
+	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+
+	// Alias A, B, and C in case we need to apply transformations.
+	obj_t a_local;
+	obj_t b_local;
+	obj_t c_local;
+	bli_obj_alias_submatrix( a, &a_local );
+	bli_obj_alias_submatrix( b, &b_local );
+	bli_obj_alias_submatrix( c, &c_local );
+
+	gemm_cntl_t cntl;
+	bli_gemm_cntl_init
+	(
+	  im,
+	  BLIS_GEMMT,
+	  alpha,
+	  &a_local,
+	  &b_local,
+	  beta,
+	  &c_local,
+	  cntx,
+	  &cntl
+	);
+
+	// Invoke the internal back-end via the thread handler.
+	bli_l3_thread_decorator
+	(
+	  &a_local,
+	  &b_local,
+	  &c_local,
+	  cntx,
+	  ( cntl_t* )&cntl,
+	  rntm
+	);
 }
 
 
@@ -203,24 +238,16 @@ void PASTEMAC(her2k,BLIS_OAPI_EX_SUF)
 {
 	bli_init_once();
 
-	obj_t ah;
-	obj_t bh;
-	obj_t alphah;
-
 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
 		bli_her2k_check( alpha, a, b, beta, c, cntx );
 
-	bli_obj_alias_to( alpha, &alphah );
-	bli_obj_toggle_conj( &alphah );
-
-	bli_obj_alias_to( a, &ah );
-	bli_obj_toggle_trans( &ah );
-	bli_obj_toggle_conj( &ah );
-
-	bli_obj_alias_to( b, &bh );
-	bli_obj_toggle_trans( &bh );
-	bli_obj_toggle_conj( &bh );
+	obj_t alphah;
+	obj_t ah;
+	obj_t bh;
+	bli_obj_alias_with_conj( BLIS_CONJUGATE, alpha, &alphah );
+	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &ah );
+	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, b, &bh );
 
 	// Invoke gemmt twice, using beta only the first time.
 	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)(   alpha, a, &bh,      beta, c, cntx, rntm );
@@ -249,18 +276,14 @@ void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF)
 {
 	bli_init_once();
 
-	obj_t at;
-	obj_t bt;
-
 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
 		bli_syr2k_check( alpha, a, b, beta, c, cntx );
 
-	bli_obj_alias_to( b, &bt );
-	bli_obj_toggle_trans( &bt );
-
-	bli_obj_alias_to( a, &at );
-	bli_obj_toggle_trans( &at );
+	obj_t at;
+	obj_t bt;
+	bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &at );
+	bli_obj_alias_with_trans( BLIS_TRANSPOSE, b, &bt );
 
 	// Invoke gemmt twice, using beta only the first time.
 	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &bt,      beta, c, cntx, rntm );
@@ -268,6 +291,60 @@ void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF)
 }
 
 
+void PASTEMAC(herk,BLIS_OAPI_EX_SUF)
+     (
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+       const rntm_t* rntm
+     )
+{
+	bli_init_once();
+
+	// Check parameters.
+	if ( bli_error_checking_is_enabled() )
+		bli_herk_check( alpha, a, beta, c, cntx );
+
+	obj_t ah;
+	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &ah );
+
+	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &ah, beta, c, cntx, rntm );
+
+	// The Hermitian rank-k product was computed as Re(alpha)*A*A', even for the
+	// diagonal elements. Mathematically, the imaginary components of
+	// diagonal elements of a Hermitian rank-k product should always be
+	// zero. However, in practice, they sometimes accumulate meaningless
+	// non-zero values. To prevent this, we explicitly set those values
+	// to zero before returning.
+	bli_setid( &BLIS_ZERO, c );
+}
+
+
+void PASTEMAC(syrk,BLIS_OAPI_EX_SUF)
+     (
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+       const rntm_t* rntm
+     )
+{
+	bli_init_once();
+
+	// Check parameters.
+	if ( bli_error_checking_is_enabled() )
+		bli_syrk_check( alpha, a, beta, c, cntx );
+
+	obj_t at;
+	bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &at );
+
+	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &at, beta, c, cntx, rntm );
+}
+
+
 void PASTEMAC(hemm,BLIS_OAPI_EX_SUF)
      (
              side_t  side,
@@ -282,11 +359,14 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF)
 {
 	bli_init_once();
 
-	// Initialize a local runtime with global settings if necessary. Note
-	// that in the case that a runtime is passed in, we make a local copy.
-	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
-	else                { rntm_l = *rntm;                       }
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_hemm_check( side, alpha, a, b, beta, c, cntx );
+
+	// Check for zero dimensions, alpha == 0, or other conditions which
+	// mean that we don't actually have to perform a full l3 operation.
+	if ( bli_l3_return_early_if_trivial( alpha, a, b, beta, c ) == BLIS_SUCCESS )
+		return;
 
 	// Default to using native execution.
 	num_t dt = bli_obj_dt( c );
@@ -307,14 +387,48 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF)
 
 	// If necessary, obtain a valid context from the gks using the induced
 	// method id determined above.
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im );
-
-	// Check the operands.
-	if ( bli_error_checking_is_enabled() )
-		bli_hemm_check( side, alpha, a, b, beta, c, cntx );
+	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+
+	// Alias A, B, and C in case we need to apply transformations.
+	obj_t a_local;
+	obj_t b_local;
+	obj_t c_local;
+	bli_obj_alias_submatrix( a, &a_local );
+	bli_obj_alias_submatrix( b, &b_local );
+	bli_obj_alias_submatrix( c, &c_local );
+
+	// If the Hermitian/symmetric matrix A is being multiplied from the right,
+	// swap A and B so that the Hermitian/symmetric matrix will actually be on
+	// the right.
+	if ( bli_is_right( side ) )
+	{
+		bli_obj_swap( &a_local, &b_local );
+	}
 
-	// Invoke the operation's front-end and request the default control tree.
-	bli_hemm_front( side, alpha, a, b, beta, c, cntx, &rntm_l );
+	gemm_cntl_t cntl;
+	bli_gemm_cntl_init
+	(
+	  im,
+	  BLIS_HEMM,
+	  alpha,
+	  &a_local,
+	  &b_local,
+	  beta,
+	  &c_local,
+	  cntx,
+	  &cntl
+	);
+
+	// Invoke the internal back-end.
+	bli_l3_thread_decorator
+	(
+	  &a_local,
+	  &b_local,
+	  &c_local,
+	  cntx,
+	  ( cntl_t* )&cntl,
+	  rntm
+	);
 }
 
 
@@ -332,11 +446,14 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF)
 {
 	bli_init_once();
 
-	// Initialize a local runtime with global settings if necessary. Note
-	// that in the case that a runtime is passed in, we make a local copy.
-	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
-	else                { rntm_l = *rntm;                       }
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_symm_check( side, alpha, a, b, beta, c, cntx );
+
+	// Check for zero dimensions, alpha == 0, or other conditions which
+	// mean that we don't actually have to perform a full l3 operation.
+	if ( bli_l3_return_early_if_trivial( alpha, a, b, beta, c ) == BLIS_SUCCESS )
+		return;
 
 	// Default to using native execution.
 	num_t dt = bli_obj_dt( c );
@@ -357,14 +474,48 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF)
 
 	// If necessary, obtain a valid context from the gks using the induced
 	// method id determined above.
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im );
-
-	// Check the operands.
-	if ( bli_error_checking_is_enabled() )
-		bli_symm_check( side, alpha, a, b, beta, c, cntx );
+	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+
+	// Alias A, B, and C in case we need to apply transformations.
+	obj_t a_local;
+	obj_t b_local;
+	obj_t c_local;
+	bli_obj_alias_submatrix( a, &a_local );
+	bli_obj_alias_submatrix( b, &b_local );
+	bli_obj_alias_submatrix( c, &c_local );
+
+	// If the Hermitian/symmetric matrix A is being multiplied from the right,
+	// swap A and B so that the Hermitian/symmetric matrix will actually be on
+	// the right.
+	if ( bli_is_right( side ) )
+	{
+		bli_obj_swap( &a_local, &b_local );
+	}
 
-	// Invoke the operation's front-end and request the default control tree.
-	bli_symm_front( side, alpha, a, b, beta, c, cntx, &rntm_l );
+	gemm_cntl_t cntl;
+	bli_gemm_cntl_init
+	(
+	  im,
+	  BLIS_SYMM,
+	  alpha,
+	  &a_local,
+	  &b_local,
+	  beta,
+	  &c_local,
+	  cntx,
+	  &cntl
+	);
+
+	// Invoke the internal back-end.
+	bli_l3_thread_decorator
+	(
+	  &a_local,
+	  &b_local,
+	  &c_local,
+	  cntx,
+	  ( cntl_t* )&cntl,
+	  rntm
+	);
 }
 
 
@@ -382,11 +533,14 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF)
 {
 	bli_init_once();
 
-	// Initialize a local runtime with global settings if necessary. Note
-	// that in the case that a runtime is passed in, we make a local copy.
-	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
-	else                { rntm_l = *rntm;                       }
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_trmm3_check( side, alpha, a, b, beta, c, cntx );
+
+	// Check for zero dimensions, alpha == 0, or other conditions which
+	// mean that we don't actually have to perform a full l3 operation.
+	if ( bli_l3_return_early_if_trivial( alpha, a, b, beta, c ) == BLIS_SUCCESS )
+		return;
 
 	// Default to using native execution.
 	num_t dt = bli_obj_dt( c );
@@ -407,73 +561,47 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF)
 
 	// If necessary, obtain a valid context from the gks using the induced
 	// method id determined above.
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im );
-
-	// Check the operands.
-	if ( bli_error_checking_is_enabled() )
-		bli_trmm3_check( side, alpha, a, b, beta, c, cntx );
-
-	// Invoke the operation's front-end and request the default control tree.
-	bli_trmm3_front( side, alpha, a, b, beta, c, cntx, &rntm_l );
-}
-
-
-void PASTEMAC(herk,BLIS_OAPI_EX_SUF)
-     (
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const rntm_t* rntm
-     )
-{
-	bli_init_once();
-
-	obj_t ah;
-
-	// Check parameters.
-	if ( bli_error_checking_is_enabled() )
-		bli_herk_check( alpha, a, beta, c, cntx );
-
-	bli_obj_alias_to( a, &ah );
-	bli_obj_toggle_trans( &ah );
-	bli_obj_toggle_conj( &ah );
-
-	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &ah, beta, c, cntx, rntm );
-
-	// The Hermitian rank-k product was computed as Re(alpha)*A*A', even for the
-	// diagonal elements. Mathematically, the imaginary components of
-	// diagonal elements of a Hermitian rank-k product should always be
-	// zero. However, in practice, they sometimes accumulate meaningless
-	// non-zero values. To prevent this, we explicitly set those values
-	// to zero before returning.
-	bli_setid( &BLIS_ZERO, c );
-}
-
-
-void PASTEMAC(syrk,BLIS_OAPI_EX_SUF)
-     (
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const rntm_t* rntm
-     )
-{
-	bli_init_once();
-
-	obj_t at;
-
-	// Check parameters.
-	if ( bli_error_checking_is_enabled() )
-		bli_syrk_check( alpha, a, beta, c, cntx );
-
-	bli_obj_alias_to( a, &at );
-	bli_obj_toggle_trans( &at );
+	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+
+	// Alias A, B, and C so we can tweak the objects if necessary.
+	obj_t a_local;
+	obj_t b_local;
+	obj_t c_local;
+	bli_obj_alias_submatrix( a, &a_local );
+	bli_obj_alias_submatrix( b, &b_local );
+	bli_obj_alias_submatrix( c, &c_local );
+
+	// If A is being multiplied from the right, swap A and B so that
+	// the triangular matrix will actually be on the right.
+	if ( bli_is_right( side ) )
+	{
+		bli_obj_swap( &a_local, &b_local );
+	}
 
-	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &at, beta, c, cntx, rntm );
+	gemm_cntl_t cntl;
+	bli_gemm_cntl_init
+	(
+	  im,
+	  BLIS_TRMM3,
+	  alpha,
+	  &a_local,
+	  &b_local,
+	  beta,
+	  &c_local,
+	  cntx,
+	  &cntl
+	);
+
+	// Invoke the internal back-end.
+	bli_l3_thread_decorator
+	(
+	  &a_local,
+	  &b_local,
+	  &c_local,
+	  cntx,
+	  ( cntl_t* )&cntl,
+	  rntm
+	);
 }
 
 
@@ -489,11 +617,14 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF)
 {
 	bli_init_once();
 
-	// Initialize a local runtime with global settings if necessary. Note
-	// that in the case that a runtime is passed in, we make a local copy.
-	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
-	else                { rntm_l = *rntm;                       }
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_trmm_check( side, alpha, a, b, cntx );
+
+	// Check for zero dimensions, alpha == 0, or other conditions which
+	// mean that we don't actually have to perform a full l3 operation.
+	if ( bli_l3_return_early_if_trivial( alpha, a, b, &BLIS_ZERO, b ) == BLIS_SUCCESS )
+		return;
 
 	// Default to using native execution.
 	num_t dt = bli_obj_dt( b );
@@ -513,14 +644,47 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF)
 
 	// If necessary, obtain a valid context from the gks using the induced
 	// method id determined above.
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im );
-
-	// Check the operands.
-	if ( bli_error_checking_is_enabled() )
-		bli_trmm_check( side, alpha, a, b, cntx );
+	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+
+	// Alias A and B so we can tweak the objects if necessary.
+	obj_t a_local;
+	obj_t b_local;
+	obj_t c_local;
+	bli_obj_alias_submatrix( a, &a_local );
+	bli_obj_alias_submatrix( b, &b_local );
+	bli_obj_alias_submatrix( b, &c_local );
+
+	// If A is being multiplied from the right, swap A and B so that
+	// the triangular matrix will actually be on the right.
+	if ( bli_is_right( side ) )
+	{
+		bli_obj_swap( &a_local, &b_local );
+	}
 
-	// Invoke the operation's front-end and request the default control tree.
-	bli_trmm_front( side, alpha, a, b, cntx, &rntm_l );
+	gemm_cntl_t cntl;
+	bli_gemm_cntl_init
+	(
+	  im,
+	  BLIS_TRMM,
+	  alpha,
+	  &a_local,
+	  &b_local,
+	  &BLIS_ZERO,
+	  &c_local,
+	  cntx,
+	  &cntl
+	);
+
+	// Invoke the internal back-end.
+	bli_l3_thread_decorator
+	(
+	  &a_local,
+	  &b_local,
+	  &c_local,
+	  cntx,
+	  ( cntl_t* )&cntl,
+	  rntm
+	);
 }
 
 
@@ -536,11 +700,14 @@ void PASTEMAC(trsm,BLIS_OAPI_EX_SUF)
 {
 	bli_init_once();
 
-	// Initialize a local runtime with global settings if necessary. Note
-	// that in the case that a runtime is passed in, we make a local copy.
-	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
-	else                { rntm_l = *rntm;                       }
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_trsm_check( side, alpha, a, b, cntx );
+
+	// Check for zero dimensions, alpha == 0, or other conditions which
+	// mean that we don't actually have to perform a full l3 operation.
+	if ( bli_l3_return_early_if_trivial( alpha, a, b, &BLIS_ZERO, b ) == BLIS_SUCCESS )
+		return;
 
 	// Default to using native execution.
 	num_t dt = bli_obj_dt( b );
@@ -560,12 +727,74 @@ void PASTEMAC(trsm,BLIS_OAPI_EX_SUF)
 
 	// If necessary, obtain a valid context from the gks using the induced
 	// method id determined above.
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im );
+	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
 
-	// Check the operands.
-	if ( bli_error_checking_is_enabled() )
-		bli_trsm_check( side, alpha, a, b, cntx );
+#if 0
+#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
+	gint_t status = bli_trsm_small( side, alpha, a, b, cntx, cntl );
+	if ( status == BLIS_SUCCESS ) return;
+#endif
+#endif
+
+	// Alias A and B so we can tweak the objects if necessary.
+	obj_t a_local;
+	obj_t b_local;
+	obj_t c_local;
+	bli_obj_alias_submatrix( a, &a_local );
+	bli_obj_alias_submatrix( b, &b_local );
+	bli_obj_alias_submatrix( b, &c_local );
+
+#if 1
+
+	// If A is being solved against from the right, transpose all operands
+	// so that we can perform the computation as if A were being solved
+	// from the left.
+	if ( bli_is_right( side ) )
+	{
+		bli_toggle_side( &side );
+		bli_obj_induce_trans( &a_local );
+		bli_obj_induce_trans( &b_local );
+		bli_obj_induce_trans( &c_local );
+	}
+
+#else
+
+	// NOTE: Enabling this code requires that BLIS NOT be configured with
+	// BLIS_RELAX_MCNR_NCMR_CONSTRAINTS defined.
+#ifdef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
+	#error "BLIS_RELAX_MCNR_NCMR_CONSTRAINTS must not be defined for current trsm_r implementation."
+#endif
+
+	// If A is being solved against from the right, swap A and B so that
+	// the triangular matrix will actually be on the right.
+	if ( bli_is_right( side ) )
+	{
+		bli_obj_swap( &a_local, &b_local );
+	}
+
+#endif
 
-	// Invoke the operation's front-end and request the default control tree.
-	bli_trsm_front( side, alpha, a, b, cntx, &rntm_l );
+	trsm_cntl_t cntl;
+	bli_trsm_cntl_init
+	(
+	  im,
+	  alpha,
+	  &a_local,
+	  &b_local,
+	  alpha,
+	  &c_local,
+	  cntx,
+	  &cntl
+	);
+
+	// Invoke the internal back-end.
+	bli_l3_thread_decorator
+	(
+	  &a_local,
+	  &b_local,
+	  &c_local,
+	  cntx,
+	  ( cntl_t* )&cntl,
+	  rntm
+	);
 }
diff --git a/frame/3/bli_l3_oapi_ex.h b/frame/3/bli_l3_oapi_ex.h
index dd7624d92..09f7f4a5d 100644
--- a/frame/3/bli_l3_oapi_ex.h
+++ b/frame/3/bli_l3_oapi_ex.h
@@ -56,6 +56,9 @@ GENPROT( gemm )
 GENPROT( gemmt )
 GENPROT( her2k )
 GENPROT( syr2k )
+#ifdef BLIS_ENABLE_SANDBOX
+GENPROT( gemm_def )
+#endif
 
 
 #undef  GENPROT
diff --git a/frame/3/bli_l3_packab.c b/frame/3/bli_l3_packab.c
index 65776d49f..838dd2e2d 100644
--- a/frame/3/bli_l3_packab.c
+++ b/frame/3/bli_l3_packab.c
@@ -36,12 +36,12 @@
 
 void bli_l3_packa
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
-             thrinfo_t* thread
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
      )
 {
 	obj_t a_local, a_pack;
@@ -60,20 +60,18 @@ void bli_l3_packa
 	  &a_pack,
 	  cntx,
 	  cntl,
-	  thread
+	  thread_par
 	);
 
 	// Proceed with execution using packed matrix A.
 	bli_l3_int
 	(
-	  &BLIS_ONE,
 	  &a_pack,
 	  b,
-	  &BLIS_ONE,
 	  c,
 	  cntx,
-	  bli_cntl_sub_node( cntl ),
-	  bli_thrinfo_sub_node( thread )
+	  bli_cntl_sub_node( 0, cntl ),
+	  bli_thrinfo_sub_node( 0, thread_par )
 	);
 }
 
@@ -81,12 +79,12 @@ void bli_l3_packa
 
 void bli_l3_packb
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
-             thrinfo_t* thread
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
      )
 {
 	obj_t bt_local, bt_pack;
@@ -109,7 +107,7 @@ void bli_l3_packb
 	  &bt_pack,
 	  cntx,
 	  cntl,
-	  thread
+	  thread_par
 	);
 
 	// Transpose packed object back to B.
@@ -118,14 +116,12 @@ void bli_l3_packb
 	// Proceed with execution using packed matrix B.
 	bli_l3_int
 	(
-	  &BLIS_ONE,
 	  a,
 	  &bt_pack,
-	  &BLIS_ONE,
 	  c,
 	  cntx,
-	  bli_cntl_sub_node( cntl ),
-	  bli_thrinfo_sub_node( thread )
+	  bli_cntl_sub_node( 0, cntl ),
+	  bli_thrinfo_sub_node( 0, thread_par )
 	);
 }
 
diff --git a/frame/3/bli_l3_packab.h b/frame/3/bli_l3_packab.h
index e58a08e4b..d0dd9f496 100644
--- a/frame/3/bli_l3_packab.h
+++ b/frame/3/bli_l3_packab.h
@@ -34,21 +34,21 @@
 
 void bli_l3_packa
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
-             thrinfo_t* thread
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
      );
 
 void bli_l3_packb
      (
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx,
-       const cntl_t* cntl,
-             thrinfo_t* thread
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+       const cntl_t*    cntl,
+             thrinfo_t* thread_par
      );
 
diff --git a/frame/3/bli_l3_prune.c b/frame/3/bli_l3_prune.c
index 6531b74a8..195e4107b 100644
--- a/frame/3/bli_l3_prune.c
+++ b/frame/3/bli_l3_prune.c
@@ -39,101 +39,80 @@ void bli_l3_prune_unref_mparts_m
      (
              obj_t*  a,
        const obj_t*  b,
-             obj_t*  c,
-       const cntl_t* cntl
+             obj_t*  c
      )
 {
-	/* Query the operation family. */
-	opid_t family = bli_cntl_family( cntl );
-
-	if      ( family == BLIS_GEMM )
-	{
-		/* No pruning is necessary for gemm. */
-		return;
-	}
-	else if ( family == BLIS_GEMMT )
+	if ( bli_obj_is_upper_or_lower( c ) )
 	{
 		/* Prune any unreferenced part from the subpartition of C (that would
 		   be encountered from partitioning in the m dimension) and adjust the
 		   subpartition of A accordingly. */
 		bli_prune_unref_mparts( c, BLIS_M, a, BLIS_M );
 	}
-	else if ( family == BLIS_TRMM ||
-	          family == BLIS_TRSM )
+	else if ( bli_obj_is_triangular( a ) )
 	{
 		/* Prune any unreferenced part from the subpartition of A (that would
 		   be encountered from partitioning in the m dimension) and adjust the
 		   subpartition of C accordingly. */
 		bli_prune_unref_mparts( a, BLIS_M, c, BLIS_M );
 	}
+	else
+	{
+		/* No pruning is necessary. */
+	}
 }
 
 void bli_l3_prune_unref_mparts_n
      (
        const obj_t*  a,
              obj_t*  b,
-             obj_t*  c,
-       const cntl_t* cntl
+             obj_t*  c
      )
 {
-	/* Query the operation family. */
-	opid_t family = bli_cntl_family( cntl );
-
-	if      ( family == BLIS_GEMM )
-	{
-		/* No pruning is necessary for gemm. */
-		return;
-	}
-	else if ( family == BLIS_GEMMT )
+	if ( bli_obj_is_upper_or_lower( c ) )
 	{
 		/* Prune any unreferenced part from the subpartition of C (that would
-		   be encountered from partitioning in the m dimension) and adjust the
+		   be encountered from partitioning in the n dimension) and adjust the
 		   subpartition of B accordingly. */
 		bli_prune_unref_mparts( c, BLIS_N, b, BLIS_N );
 	}
-	else if ( family == BLIS_TRMM ||
-	          family == BLIS_TRSM )
+	else if ( bli_obj_is_triangular( b ) )
 	{
 		/* Prune any unreferenced part from the subpartition of B (that would
-		   be encountered from partitioning in the m dimension) and adjust the
+		   be encountered from partitioning in the n dimension) and adjust the
 		   subpartition of C accordingly. */
 		bli_prune_unref_mparts( b, BLIS_N, c, BLIS_N );
 	}
+	else
+	{
+		/* No pruning is necessary. */
+	}
 }
 
 void bli_l3_prune_unref_mparts_k
      (
              obj_t*  a,
              obj_t*  b,
-       const obj_t*  c,
-       const cntl_t* cntl
+       const obj_t*  c
      )
 {
-	/* Query the operation family. */
-	opid_t family = bli_cntl_family( cntl );
-
-	if      ( family == BLIS_GEMM )
-	{
-		/* No pruning is necessary for gemm. */
-		return;
-	}
-	else if ( family == BLIS_GEMMT )
-	{
-		/* No pruning is necessary for gemmt. */
-		return;
-	}
-	else if ( family == BLIS_TRMM ||
-	          family == BLIS_TRSM )
+	if ( bli_obj_is_triangular( a ) )
 	{
 		/* Prune any unreferenced part from the subpartition of A (that would
 		   be encountered from partitioning in the k dimension) and adjust the
 		   subpartition of B accordingly. */
 		bli_prune_unref_mparts( a, BLIS_N, b, BLIS_M );
-
+	}
+	else if ( bli_obj_is_triangular( b ) )
+	{
 		/* Prune any unreferenced part from the subpartition of B (that would
 		   be encountered from partitioning in the k dimension) and adjust the
 		   subpartition of A accordingly. */
 		bli_prune_unref_mparts( b, BLIS_M, a, BLIS_N );
 	}
+	else
+	{
+		/* No pruning is necessary. */
+	}
 }
 
diff --git a/frame/3/bli_l3_prune.h b/frame/3/bli_l3_prune.h
index 84c0cbbcd..2f76b3a32 100644
--- a/frame/3/bli_l3_prune.h
+++ b/frame/3/bli_l3_prune.h
@@ -37,23 +37,20 @@ void bli_l3_prune_unref_mparts_m
      (
              obj_t*  a,
        const obj_t*  b,
-             obj_t*  c,
-       const cntl_t* cntl
+             obj_t*  c
      );
 
 void bli_l3_prune_unref_mparts_n
      (
        const obj_t*  a,
              obj_t*  b,
-             obj_t*  c,
-       const cntl_t* cntl
+             obj_t*  c
      );
 
 void bli_l3_prune_unref_mparts_k
      (
              obj_t*  a,
              obj_t*  b,
-       const obj_t*  c,
-       const cntl_t* cntl
+       const obj_t*  c
      );
 
diff --git a/frame/3/bli_l3_schema.c b/frame/3/bli_l3_schema.c
deleted file mode 100644
index 1de381f37..000000000
--- a/frame/3/bli_l3_schema.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2021, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_l3_set_schemas
-     (
-             obj_t*  a,
-             obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx
-     )
-{
-	// Begin with pack schemas for native execution.
-	pack_t schema_a = BLIS_PACKED_ROW_PANELS;
-	pack_t schema_b = BLIS_PACKED_COL_PANELS;
-
-	// When executing the 1m method, choose the appropriate pack schemas based
-	// on the microkernel preference encoded within the current cntx_t (which
-	// was presumably returned by the gks).
-	if ( bli_cntx_method( cntx ) == BLIS_1M )
-	{
-		num_t dt = bli_obj_domain( c ) | bli_obj_comp_prec( c );
-
-		// Note that bli_cntx_l3_vir_ukr_prefers_cols_dt() will use the real
-		// projection of dt to query the preference of the corresponding native
-		// real-domain microkernel. This is what ultimately determines which
-		// variant of 1m is applicable.
-		if ( bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ) )
-		{
-			schema_a = BLIS_PACKED_ROW_PANELS_1E;
-			schema_b = BLIS_PACKED_COL_PANELS_1R;
-		}
-		else
-		{
-			schema_a = BLIS_PACKED_ROW_PANELS_1R;
-			schema_b = BLIS_PACKED_COL_PANELS_1E;
-		}
-	}
-
-	// Embed the schemas into the objects for A and B. This is a sort of hack
-	// for communicating the desired pack schemas to bli_gemm_cntl_create()
-	// (via bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows
-	// us to subsequently access the schemas from the control tree, which
-	// hopefully reduces some confusion, particularly in bli_packm_init().
-	bli_obj_set_pack_schema( schema_a, a );
-	bli_obj_set_pack_schema( schema_b, b );
-}
-
diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c
index 57513ab5b..b0fe8a59f 100644
--- a/frame/3/bli_l3_sup.c
+++ b/frame/3/bli_l3_sup.c
@@ -60,16 +60,16 @@ err_t bli_gemmsup
 	// that function assumes the context pointer is valid.
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
 
+	const num_t dt = bli_obj_dt( c );
+	const dim_t m  = bli_obj_length( c );
+	const dim_t n  = bli_obj_width( c );
+	const dim_t k  = bli_obj_width_after_trans( a );
+
 	// Return early if a microkernel preference-induced transposition would
 	// have been performed and shifted the dimensions outside of the space
 	// of sup-handled problems.
-	if ( bli_cntx_dislikes_storage_of( c, BLIS_GEMM_VIR_UKR, cntx ) )
+	if ( bli_cntx_dislikes_storage_of( c, BLIS_GEMM_UKR, cntx ) )
 	{
-		const num_t dt = bli_obj_dt( c );
-		const dim_t m  = bli_obj_length( c );
-		const dim_t n  = bli_obj_width( c );
-		const dim_t k  = bli_obj_width_after_trans( a );
-
 		// Pass in m and n reversed, which simulates a transposition of the
 		// entire operation pursuant to the microkernel storage preference.
 		if ( !bli_cntx_l3_sup_thresh_is_met( dt, n, m, k, cntx ) )
@@ -77,11 +77,6 @@ err_t bli_gemmsup
 	}
 	else // ukr_prefers_storage_of( c, ... )
 	{
-		const num_t dt = bli_obj_dt( c );
-		const dim_t m  = bli_obj_length( c );
-		const dim_t n  = bli_obj_width( c );
-		const dim_t k  = bli_obj_width_after_trans( a );
-
 		if ( !bli_cntx_l3_sup_thresh_is_met( dt, m, n, k, cntx ) )
 			return BLIS_FAILURE;
 	}
@@ -92,6 +87,9 @@ err_t bli_gemmsup
 	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); }
 	else                { rntm_l = *rntm;                       }
 
+	if ( !bli_rntm_l3_sup( &rntm_l ) )
+		return BLIS_FAILURE;
+
 #if 0
 const num_t dt = bli_obj_dt( c );
 const dim_t m  = bli_obj_length( c );
@@ -118,13 +116,18 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n",
 	// Query the small/unpacked handler from the context and invoke it.
 	gemmsup_oft gemmsup_fp = bli_cntx_get_l3_sup_handler( BLIS_GEMM, cntx );
 
+	// Typecast alpha and beta to the correct type
+	obj_t alpha_cast, beta_cast;
+	bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, alpha, &alpha_cast );
+	bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, beta, &beta_cast );
+
 	return
 	gemmsup_fp
 	(
-	  alpha,
+	  &alpha_cast,
 	  a,
 	  b,
-	  beta,
+	  &beta_cast,
 	  c,
 	  cntx,
 	  &rntm_l
@@ -162,14 +165,12 @@ err_t bli_gemmtsup
 	// Notice that we do not bother to check whether the microkernel
 	// prefers or dislikes the storage of C, since the same check is called
 	// for either way.
-	{
-		const num_t dt = bli_obj_dt( c );
-		const dim_t m  = bli_obj_length( c );
-		const dim_t k  = bli_obj_width_after_trans( a );
+	const num_t dt = bli_obj_dt( c );
+	const dim_t m  = bli_obj_length( c );
+	const dim_t k  = bli_obj_width_after_trans( a );
 
-		if ( !bli_cntx_l3_sup_thresh_is_met( dt, m, m, k, cntx ) )
-			return BLIS_FAILURE;
-	}
+	if ( !bli_cntx_l3_sup_thresh_is_met( dt, m, m, k, cntx ) )
+		return BLIS_FAILURE;
 
 	// Initialize a local runtime with global settings if necessary. Note
 	// that in the case that a runtime is passed in, we make a local copy.
@@ -187,13 +188,18 @@ err_t bli_gemmtsup
 	// Query the small/unpacked handler from the context and invoke it.
 	gemmtsup_oft gemmtsup_fp = bli_cntx_get_l3_sup_handler( BLIS_GEMMT, cntx );
 
+	// Typecast alpha and beta to the correct type
+	obj_t alpha_cast, beta_cast;
+	bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, alpha, &alpha_cast );
+	bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, beta, &beta_cast );
+
 	return
 	gemmtsup_fp
 	(
-	  alpha,
+	  &alpha_cast,
 	  a,
 	  b,
-	  beta,
+	  &beta_cast,
 	  c,
 	  cntx,
 	  &rntm_l
diff --git a/frame/3/bli_l3_sup_packm.c b/frame/3/bli_l3_sup_packm.c
index 890980da3..63cd65b65 100644
--- a/frame/3/bli_l3_sup_packm.c
+++ b/frame/3/bli_l3_sup_packm.c
@@ -35,135 +35,6 @@
 
 #include "blis.h"
 
-void bli_packm_sup_init_mem
-     (
-       bool       will_pack,
-       packbuf_t  pack_buf_type,
-       num_t      dt,
-       dim_t      m,
-       dim_t      k,
-       dim_t      mr,
-       thrinfo_t* thread
-     )
-{
-	// Inspect whether we are going to be packing matrix A.
-	if ( will_pack == FALSE )
-	{
-	}
-	else // if ( will_pack == TRUE )
-	{
-		mem_t* mem = bli_thrinfo_mem( thread );
-		pba_t* pba = bli_thrinfo_pba( thread );
-
-		// NOTE: This "rounding up" of the last upanel is actually optional
-		// for the rrc/crc cases, but absolutely necessary for the other cases
-		// since we NEED that last micropanel to have the same ldim (cs_p) as
-		// the other micropanels. Why? So that millikernels can use the same
-		// upanel ldim for all iterations of the ir loop.
-		const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr;
-		const dim_t k_pack = k;
-
-		// Barrier to make sure all threads are caught up and ready to begin
-		// the packm stage.
-		bli_thrinfo_barrier( thread );
-
-		// Compute the size of the memory block eneded.
-		siz_t size_needed = bli_dt_size( dt ) * m_pack * k_pack;
-
-		// Check the mem_t entry provided by the caller. If it is unallocated,
-		// then we need to acquire a block from the pba.
-		if ( bli_mem_is_unalloc( mem ) )
-		{
-			if ( bli_thrinfo_am_chief( thread ) )
-			{
-				// Acquire directly to the chief thread's mem_t that was
-				// passed in. It needs to be that mem_t struct, and not a
-				// local (temporary) mem_t, since there is no barrier until
-				// after packing is finished, which could allow a race
-				// condition whereby the chief thread exits the current
-				// function before the other threads have a chance to copy
-				// from it. (A barrier would fix that race condition, but
-				// then again, I prefer to keep barriers to a minimum.)
-				bli_pba_acquire_m
-				(
-				  pba,
-				  size_needed,
-				  pack_buf_type,
-				  mem
-				);
-			}
-
-			// Broadcast the address of the chief thread's passed-in mem_t
-			// to all threads.
-			mem_t* mem_p = bli_thrinfo_broadcast( thread, mem );
-
-			// Non-chief threads: Copy the contents of the chief thread's
-			// passed-in mem_t to the passed-in mem_t for this thread. (The
-			// chief thread already has the mem_t, so it does not need to
-			// perform any copy.)
-			if ( !bli_thrinfo_am_chief( thread ) )
-			{
-				*mem = *mem_p;
-			}
-		}
-		else // if ( bli_mem_is_alloc( mem ) )
-		{
-			// If the mem_t entry provided by the caller does NOT contain a NULL
-			// buffer, then a block has already been acquired from the pba and
-			// cached by the caller.
-
-			// As a sanity check, we should make sure that the mem_t object isn't
-			// associated with a block that is too small compared to the size of
-			// the packed matrix buffer that is needed, according to the value
-			// computed above.
-			siz_t mem_size = bli_mem_size( mem );
-
-			if ( mem_size < size_needed )
-			{
-				if ( bli_thrinfo_am_chief( thread ) )
-				{
-					// The chief thread releases the existing block associated
-					// with the mem_t, and then re-acquires a new block, saving
-					// the associated mem_t to its passed-in mem_t. (See coment
-					// above for why the acquisition needs to be directly to
-					// the chief thread's passed-in mem_t and not a local
-					// (temporary) mem_t.
-					bli_pba_release
-					(
-					  pba,
-					  mem
-					);
-					bli_pba_acquire_m
-					(
-					  pba,
-					  size_needed,
-					  pack_buf_type,
-					  mem
-					);
-				}
-
-				// Broadcast the address of the chief thread's passed-in mem_t
-				// to all threads.
-				mem_t* mem_p = bli_thrinfo_broadcast( thread, mem );
-
-				// Non-chief threads: Copy the contents of the chief thread's
-				// passed-in mem_t to the passed-in mem_t for this thread. (The
-				// chief thread already has the mem_t, so it does not need to
-				// perform any copy.)
-				if ( !bli_thrinfo_am_chief( thread ) )
-				{
-					*mem = *mem_p;
-				}
-			}
-			else
-			{
-				// If the mem_t entry is already allocated and sufficiently large,
-				// then we use it as-is. No action is needed.
-			}
-		}
-	}
-}
-
 void bli_packm_sup_finalize_mem
      (
        bool       did_pack,
@@ -197,92 +68,6 @@ void bli_packm_sup_finalize_mem
 	}
 }
 
-void bli_packm_sup_init
-     (
-             bool       will_pack,
-             stor3_t    stor_id,
-             pack_t*    schema,
-             dim_t      m,
-             dim_t      k,
-             dim_t      mr,
-             dim_t*     m_max,
-             dim_t*     k_max,
-       const void*      x, inc_t  rs_x, inc_t  cs_x,
-             void**     p, inc_t* rs_p, inc_t* cs_p,
-                           dim_t* pd_p, inc_t* ps_p,
-             thrinfo_t* thread
-     )
-{
-	// Inspect whether we are going to be packing matrix A.
-	if ( will_pack == FALSE )
-	{
-		*m_max = m;
-		*k_max = k;
-
-		// Set the parameters for use with no packing of A (ie: using the
-		// source matrix A directly).
-		{
-			// Use the strides of the source matrix as the final values.
-			*rs_p = rs_x;
-			*cs_p = cs_x;
-
-			*pd_p = mr;
-			*ps_p = mr * rs_x;
-
-			// Set the schema to "not packed" to indicate that packing will be
-			// skipped.
-			*schema = BLIS_NOT_PACKED;
-		}
-
-		// Since we won't be packing, simply update the buffer address provided
-		// by the caller to point to source matrix.
-		*p = ( void* )x;
-	}
-	else // if ( will_pack == TRUE )
-	{
-		// NOTE: This is "rounding up" of the last upanel is actually optional
-		// for the rrc/crc cases, but absolutely necessary for the other cases
-		// since we NEED that last micropanel to have the same ldim (cs_p) as
-		// the other micropanels. Why? So that millikernels can use the same
-		// upanel ldim for all iterations of the ir loop.
-		*m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr;
-		*k_max = k;
-
-		// Determine the dimensions and strides for the packed matrix A.
-		if ( stor_id == BLIS_RRC ||
-			 stor_id == BLIS_CRC )
-		{
-			// stor3_t id values _RRC and _CRC: pack A to plain row storage.
-			*rs_p = k;
-			*cs_p = 1;
-
-			*pd_p = mr;
-			*ps_p = mr * k;
-
-			// Set the schema to "row packed" to indicate packing to plain
-			// row storage.
-			*schema = BLIS_PACKED_ROWS;
-		}
-		else
-		{
-			// All other stor3_t ids: pack A to column-stored row-panels.
-			*rs_p = 1;
-			*cs_p = mr;
-
-			*pd_p = mr;
-			*ps_p = mr * k;
-
-			// Set the schema to "packed row panels" to indicate packing to
-			// conventional column-stored row panels.
-			*schema = BLIS_PACKED_ROW_PANELS;
-		}
-
-		// Set the buffer address provided by the caller to point to the
-		// memory associated with the mem_t entry acquired from the pba.
-		*p = bli_mem_buffer( bli_thrinfo_mem( thread ) );
-	}
-}
-
 typedef void (*packm_sup_var1_fp)
      (
        trans_t    transc,
@@ -324,10 +109,7 @@ void bli_packm_sup
              bool       will_pack,
              packbuf_t  pack_buf_type,
              stor3_t    stor_id,
-             trans_t    transc,
              num_t      dt,
-             dim_t      m_alloc,
-             dim_t      k_alloc,
              dim_t      m,
              dim_t      k,
              dim_t      mr,
@@ -339,88 +121,118 @@ void bli_packm_sup
              thrinfo_t* thread
      )
 {
-	pack_t schema;
-	dim_t  m_max;
-	dim_t  k_max;
-	dim_t  pd_p;
+	if ( will_pack == FALSE )
+	{
+		// Set the parameters for use with no packing of A (ie: using the
+		// source matrix A directly).
 
-	// Prepare the packing destination buffer. If packing is not requested,
-	// this function will reduce to a no-op.
-	bli_packm_sup_init_mem
-	(
-	  will_pack,
-	  pack_buf_type,
-	  dt, m_alloc, k_alloc, mr,
-	  thread
-	);
+		// Use the strides of the source matrix as the final values.
+		*rs_p = rs_a;
+		*cs_p = cs_a;
+		*ps_p = mr * rs_a;
 
-	// Determine the packing buffer and related parameters for matrix A. If A
-	// will not be packed, then a_use will be set to point to a and the _a_use
-	// strides will be set accordingly.
-	bli_packm_sup_init
-	(
-	  will_pack,
-	  stor_id,
-	  &schema,
-	  m, k, mr,
-	  &m_max, &k_max,
-	  a, rs_a,  cs_a,
-	  p, rs_p,  cs_p,
-	     &pd_p, ps_p,
-	  thread
-	);
+		// Since we won't be packing, simply update the buffer address provided
+		// by the caller to point to source matrix.
+		*p = ( void* )a;
 
-	// Inspect whether we are going to be packing matrix A.
-	if ( will_pack == FALSE )
+		return;
+	}
+
+	// Barrier so that computation is done before packing.
+	bli_thrinfo_barrier( thread );
+
+	// NOTE: This "rounding up" of the last upanel is actually optional
+	// for the rrc/crc cases, but absolutely necessary for the other cases
+	// since we NEED that last micropanel to have the same ldim (cs_p) as
+	// the other micropanels. Why? So that millikernels can use the same
+	// upanel ldim for all iterations of the ir loop.
+	dim_t  m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr;
+	dim_t  k_max = k;
+
+	dim_t  pd_p = mr;
+	*ps_p = mr * k;
+
+	pack_t schema;
+
+	// Determine the dimensions and strides for the packed matrix A.
+	if ( stor_id == BLIS_RRC ||
+		 stor_id == BLIS_CRC )
 	{
-		// If we aren't going to pack matrix A, then there's nothing to do.
+		// stor3_t id values _RRC and _CRC: pack A to plain row storage.
+		*rs_p = k;
+		*cs_p = 1;
 
-		// printf( "blis_ packm_sup_a: not packing A.\n" );
+		// Set the schema to "row packed" to indicate packing to plain
+		// row storage.
+		schema = BLIS_PACKED_MATRIX;
 	}
-	else // if ( will_pack == TRUE )
+	else
 	{
-		if ( schema == BLIS_PACKED_ROWS )
-		{
-			// printf( "blis_ packm_sup_a: packing A to rows.\n" );
+		// All other stor3_t ids: pack A to column-stored row-panels.
+		*rs_p = 1;
+		*cs_p = mr;
 
-			// For plain packing by rows, use var2.
-			packm_sup_var2[ dt ]
-			(
-			  transc,
-			  schema,
-			  m,
-			  k,
-			  ( void* )kappa,
-			  ( void* )a,  rs_a,  cs_a,
-			          *p, *rs_p, *cs_p,
-			  ( cntx_t* )cntx,
-			  bli_thrinfo_sub_prenode( thread )
-			);
-		}
-		else // if ( schema == BLIS_PACKED_ROW_PANELS )
-		{
-			// printf( "blis_ packm_sup_a: packing A to row panels.\n" );
+		// Set the schema to "packed row panels" to indicate packing to
+		// conventional column-stored row panels.
+		schema = BLIS_PACKED_PANELS;
+	}
 
-			// For packing to column-stored row panels, use var1.
-			packm_sup_var1[ dt ]
-			(
-			  transc,
-			  schema,
-			  m,
-			  k,
-			  m_max,
-			  k_max,
-			  ( void* )kappa,
-			  ( void* )a,  rs_a,  cs_a,
-			          *p, *rs_p, *cs_p,
-			               pd_p, *ps_p,
-			  ( cntx_t* )cntx,
-			  bli_thrinfo_sub_prenode( thread )
-			);
-		}
+	// NOTE: This "rounding up" of the last upanel is actually optional
+	// for the rrc/crc cases, but absolutely necessary for the other cases
+	// since we NEED that last micropanel to have the same ldim (cs_p) as
+	// the other micropanels. Why? So that millikernels can use the same
+	// upanel ldim for all iterations of the ir loop.
+	const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr;
+	const dim_t k_pack = k;
 
-		// Barrier so that packing is done before computation.
-		bli_thrinfo_barrier( thread );
+	// Compute the size of the memory block needed.
+	siz_t size_needed = bli_dt_size( dt ) * m_pack * k_pack;
+
+	// Set the buffer address provided by the caller to point to the
+	// memory associated with the mem_t entry acquired from the pba.
+	*p = bli_packm_alloc_ex( size_needed, pack_buf_type, thread );
+
+	if ( schema == BLIS_PACKED_MATRIX )
+	{
+		// printf( "blis_ packm_sup_a: packing A to rows.\n" );
+
+		// For plain packing by rows, use var2.
+		packm_sup_var2[ dt ]
+		(
+		  BLIS_NO_TRANSPOSE,
+		  schema,
+		  m,
+		  k,
+		  ( void* )kappa,
+		  ( void* )a,  rs_a,  cs_a,
+		          *p, *rs_p, *cs_p,
+		  ( cntx_t* )cntx,
+		  thread
+		);
 	}
+	else // if ( schema == BLIS_PACKED_PANELS )
+	{
+		// printf( "blis_ packm_sup_a: packing A to row panels.\n" );
+
+		// For packing to column-stored row panels, use var1.
+		packm_sup_var1[ dt ]
+		(
+		  BLIS_NO_TRANSPOSE,
+		  schema,
+		  m,
+		  k,
+		  m_max,
+		  k_max,
+		  ( void* )kappa,
+		  ( void* )a,  rs_a,  cs_a,
+		          *p, *rs_p, *cs_p,
+		               pd_p, *ps_p,
+		  ( cntx_t* )cntx,
+		  thread
+		);
+	}
+
+	// Barrier so that packing is done before computation.
+	bli_thrinfo_barrier( thread );
 }
 
diff --git a/frame/3/bli_l3_sup_packm.h b/frame/3/bli_l3_sup_packm.h
index 032ba0afe..c28d1e79a 100644
--- a/frame/3/bli_l3_sup_packm.h
+++ b/frame/3/bli_l3_sup_packm.h
@@ -33,49 +33,18 @@
 
 */
 
-
-void bli_packm_sup_init_mem
-     (
-       bool       will_pack,
-       packbuf_t  pack_buf_type,
-       num_t      dt,
-       dim_t      m,
-       dim_t      k,
-       dim_t      mr,
-       thrinfo_t* thread
-     );
-
 void bli_packm_sup_finalize_mem
      (
        bool       did_pack,
        thrinfo_t* thread
      );
 
-void bli_packm_sup_init
-     (
-             bool       will_pack,
-             stor3_t    stor_id,
-             pack_t*    schema,
-             dim_t      m,
-             dim_t      k,
-             dim_t      mr,
-             dim_t*     m_max,
-             dim_t*     k_max,
-       const void*      x, inc_t  rs_x, inc_t  cs_x,
-             void**     p, inc_t* rs_p, inc_t* cs_p,
-                           dim_t* pd_p, inc_t* ps_p,
-             thrinfo_t* thread
-     );
-
 void bli_packm_sup
      (
              bool       will_pack,
              packbuf_t  pack_buf_type,
              stor3_t    stor_id,
-             trans_t    transc,
              num_t      dt,
-             dim_t      m_alloc,
-             dim_t      k_alloc,
              dim_t      m,
              dim_t      k,
              dim_t      mr,
diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c
index fa31468cb..5ef9e71f5 100644
--- a/frame/3/bli_l3_sup_packm_var.c
+++ b/frame/3/bli_l3_sup_packm_var.c
@@ -77,7 +77,6 @@ void PASTEMAC(ch,varname) \
 	inc_t  ldc; \
 	inc_t  ldp, p_inc; \
 	conj_t conjc; \
-\
 \
 	/* Extract the conjugation bit from the transposition argument. */ \
 	conjc = bli_extract_conj( transc ); \
@@ -90,46 +89,21 @@ void PASTEMAC(ch,varname) \
 		bli_toggle_trans( &transc ); \
 	} \
 \
-	/* Create flags to incidate row or column storage. Note that the
-	   schema bit that encodes row or column is describing the form of
-	   micro-panel, not the storage in the micro-panel. Hence the
-	   mismatch in "row" and "column" semantics. */ \
-	bool row_stored = bli_is_col_packed( schema ); \
-	/*bool col_stored = bli_is_row_packed( schema );*/ \
-\
-	/* If the row storage flag indicates row storage, then we are packing
-	   to column panels; otherwise, if the strides indicate column storage,
-	   we are packing to row panels. */ \
-	if ( row_stored ) \
-	{ \
-		/* Prepare to pack to row-stored column panels. */ \
-		iter_dim       = n; \
-		panel_len_full = m; \
-		panel_len_max  = m_max; \
-		panel_dim_max  = pd_p; \
-		vs_c           = cs_c; \
-		ldc            = rs_c; \
-		ldp            = rs_p; \
-	} \
-	else /* if ( col_stored ) */ \
-	{ \
-		/* Prepare to pack to column-stored row panels. */ \
-		iter_dim       = m; \
-		panel_len_full = n; \
-		panel_len_max  = n_max; \
-		panel_dim_max  = pd_p; \
-		vs_c           = rs_c; \
-		ldc            = cs_c; \
-		ldp            = cs_p; \
-	} \
+	/* Prepare to pack to row-stored column panels. */ \
+	iter_dim       = n; \
+	panel_len_full = m; \
+	panel_len_max  = m_max; \
+	panel_dim_max  = pd_p; \
+	vs_c           = cs_c; \
+	ldc            = rs_c; \
+	ldp            = rs_p; \
 \
-	num_t dt     = PASTEMAC(ch,type); \
-	ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \
-	                                           : BLIS_PACKM_MRXK_KER; \
+	num_t  dt      = PASTEMAC(ch,type); \
+	ukr_t ker_id   = BLIS_PACKM_KER; \
 \
 	/* Query the context for the unpackm kernel corresponding to the current
 	   panel dimension, or kernel id. */ \
-	PASTECH(packm_cxk,_ker_ft) f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
+	packm_cxk_ker_ft f = bli_cntx_get_ukr2_dt( dt, dt, ker_id, cntx ); \
 \
 	/* Compute the total number of iterations we'll need. */ \
 	n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
@@ -158,7 +132,7 @@ void PASTEMAC(ch,varname) \
 	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr()
 	   will depend on whether slab or round-robin partitioning was requested
 	   at configure-time. */ \
-	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
+	bli_thread_range_slrr( tid, nt, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
 \
 	/* Iterate over every logical micropanel in the source matrix. */ \
 	for ( ic  = ic0,    it  = 0; it < n_iter; \
@@ -184,11 +158,14 @@ void PASTEMAC(ch,varname) \
 				  conjc, \
 				  schema, \
 				  panel_dim_i, \
+				  panel_dim_max, \
+				  1, /* this shouldn't be hard-coded */ \
 				  panel_len_i, \
 				  panel_len_max_i, \
 				  kappa_cast, \
 				  c_use, vs_c, ldc, \
 				  p_use,       ldp, \
+				  NULL, \
 				  cntx  \
 				); \
 			} \
@@ -339,7 +316,6 @@ void PASTEMAC(ch,varname) \
 	inc_t  incc, ldc; \
 	inc_t  incp, ldp; \
 	conj_t conjc; \
-\
 \
 	/* Extract the conjugation bit from the transposition argument. */ \
 	conjc = bli_extract_conj( transc ); \
@@ -352,37 +328,16 @@ void PASTEMAC(ch,varname) \
 		bli_toggle_trans( &transc ); \
 	} \
 \
-	/* Create flags to incidate row or column storage. Note that the
-	   schema bit that encodes row or column is describing the form of
-	   micro-panel, not the storage in the micro-panel. Hence the
-	   mismatch in "row" and "column" semantics. */ \
-	bool col_stored = bli_is_col_packed( schema ); \
-	/*bool row_stored = bli_is_row_packed( schema );*/ \
-\
-	if ( col_stored ) \
-	{ \
-		/* Prepare to pack to a column-stored matrix. */ \
-		iter_dim       = n; \
-		vector_len     = m; \
-		incc           = rs_c; \
-		ldc            = cs_c; \
-		incp           = 1; \
-		ldp            = cs_p; \
-	} \
-	else /* if ( row_stored ) */ \
-	{ \
-		/* Prepare to pack to a row-stored matrix. */ \
-		iter_dim       = m; \
-		vector_len     = n; \
-		incc           = cs_c; \
-		ldc            = rs_c; \
-		incp           = 1; \
-		ldp            = rs_p; \
-	} \
+	/* Prepare to pack to a column-stored matrix. */ \
+	iter_dim       = n; \
+	vector_len     = m; \
+	incc           = rs_c; \
+	ldc            = cs_c; \
+	incp           = 1; \
+	ldp            = cs_p; \
 \
 	/* Compute the total number of iterations we'll need. */ \
 	n_iter = iter_dim; \
-\
 \
 	ctype* p_begin = p_cast; \
 \
@@ -401,7 +356,7 @@ void PASTEMAC(ch,varname) \
 	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr()
 	   will depend on whether slab or round-robin partitioning was requested
 	   at configure-time. */ \
-	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
+	bli_thread_range_slrr( tid, nt, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
 \
 	/* Iterate over every logical micropanel in the source matrix. */ \
 	for ( it = 0; it < n_iter; it += 1 ) \
@@ -416,7 +371,7 @@ void PASTEMAC(ch,varname) \
 			   or round-robin partitioning was requested at configure-time. */ \
 			if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \
 			{ \
-				PASTEMAC2(ch,scal2v,BLIS_TAPI_EX_SUF) \
+				PASTEMAC(ch,scal2v,BLIS_TAPI_EX_SUF) \
 				( \
 				  conjc, \
 				  vector_len, \
diff --git a/frame/3/bli_l3_sup_ref.c b/frame/3/bli_l3_sup_ref.c
index 76314aba7..5fcffe518 100644
--- a/frame/3/bli_l3_sup_ref.c
+++ b/frame/3/bli_l3_sup_ref.c
@@ -50,31 +50,6 @@ err_t bli_gemmsup_ref
 	// register a different function pointer in the context in your
 	// sub-configuration's bli_cntx_init_*() function.
 
-	// Check parameters.
-	if ( bli_error_checking_is_enabled() )
-		bli_gemm_check( alpha, a, b, beta, c, cntx );
-
-#if 0
-	// NOTE: This special case handling is done within the variants.
-
-	// If alpha is zero, scale by beta and return.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
-	{
-		bli_scalm( beta, c );
-		return;
-	}
-
-	// If A or B has a zero dimension, scale C by beta and return early.
-	if ( bli_obj_has_zero_dim( a ) ||
-	     bli_obj_has_zero_dim( b ) )
-	{
-		bli_scalm( beta, c );
-		return BLIS_SUCCESS;
-	}
-#endif
-
-	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
-
 	// Don't use the small/unpacked implementation if one of the matrices
 	// uses general stride. NOTE: We check for this here, in bli_gemmsup_ref()
 	// (and not in the calling function, bli_gemmsup()), because we consider
@@ -85,6 +60,7 @@ err_t bli_gemmsup_ref
 	// want to have to manage the multiple return values from the threads,
 	// which we would have to process into a single return value and then
 	// return from the parallel/threaded region.
+	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
 	if ( stor_id == BLIS_XXX ) return BLIS_FAILURE;
 
 	// Parse and interpret the contents of the rntm_t object to properly
diff --git a/frame/3/bli_l3_sup_var12.c b/frame/3/bli_l3_sup_var12.c
index a1e20c132..694142416 100644
--- a/frame/3/bli_l3_sup_var12.c
+++ b/frame/3/bli_l3_sup_var12.c
@@ -586,8 +586,8 @@ void PASTEMAC(ch,varname) \
 	const dim_t MR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
 \
 	/* Nudge NC up to a multiple of MR and MC up to a multiple of NR. */ \
-	const dim_t NC  = bli_align_dim_to_mult( NC0, MR ); \
-	const dim_t MC  = bli_align_dim_to_mult( MC0, NR ); \
+	const dim_t NC  = bli_align_dim_to_mult( NC0, MR, true ); \
+	const dim_t MC  = bli_align_dim_to_mult( MC0, NR, true ); \
 \
 	/* Compute partitioning step values for each matrix of each loop. */ \
 	const inc_t jcstep_c = rs_c * NC; \
diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c
index 0fc4a8e82..d444d53ca 100644
--- a/frame/3/bli_l3_sup_var1n2m.c
+++ b/frame/3/bli_l3_sup_var1n2m.c
@@ -194,8 +194,8 @@ void bli_gemmsup_ref_var1n
 	// Nudge NC up to a multiple of MR and MC up to a multiple of NR.
 	// NOTE: This is unique to variant 1 (ie: not performed in variant 2)
 	// because MC % MR == 0 and NC % NR == 0 is already enforced at runtime.
-	const dim_t NC  = bli_align_dim_to_mult( NC0, MR );
-	const dim_t MC  = bli_align_dim_to_mult( MC0, NR );
+	const dim_t NC  = bli_align_dim_to_mult( NC0, MR, true );
+	const dim_t MC  = bli_align_dim_to_mult( MC0, NR, true );
 
 	// Query the maximum blocksize for MR, which implies a maximum blocksize
 	// extension for the final iteration.
@@ -234,16 +234,18 @@ void bli_gemmsup_ref_var1n
 	// Determine whether we are using more than one thread.
 	const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 );
 
-	thrinfo_t* thread_jc = bli_thrinfo_sub_node( thread );
-	thrinfo_t* thread_pc = bli_thrinfo_sub_node( thread_jc );
-	thrinfo_t* thread_pa = bli_thrinfo_sub_node( thread_pc );
-	thrinfo_t* thread_ic = bli_thrinfo_sub_node( thread_pa );
-	thrinfo_t* thread_pb = bli_thrinfo_sub_node( thread_ic );
-	thrinfo_t* thread_jr = bli_thrinfo_sub_node( thread_pb );
+	thrinfo_t* thread_jc = bli_thrinfo_sub_node( 0, thread );
+	thrinfo_t* thread_pc = bli_thrinfo_sub_node( 0, thread_jc );
+	thrinfo_t* thread_pa = bli_thrinfo_sub_node( 0, thread_pc );
+	thrinfo_t* thread_ic = bli_thrinfo_sub_node( 0, thread_pa );
+	thrinfo_t* thread_pb = bli_thrinfo_sub_node( 0, thread_ic );
+	thrinfo_t* thread_jr = bli_thrinfo_sub_node( 0, thread_pb );
 
 	// Compute the JC loop thread range for the current thread.
 	dim_t jc_start, jc_end;
-	bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end );
+	dim_t jc_tid = bli_thrinfo_work_id( thread_jc );
+	dim_t jc_nt  = bli_thrinfo_n_way( thread_jc );
+	bli_thread_range_sub( jc_tid, jc_nt, m, MR, FALSE, &jc_start, &jc_end );
 	const dim_t m_local = jc_end - jc_start;
 
 	// Compute number of primary and leftover components of the JC loop.
@@ -296,9 +298,7 @@ void bli_gemmsup_ref_var1n
 			  packa,
 			  BLIS_BUFFER_FOR_B_PANEL, // This algorithm packs matrix A to
 			  stor_id,                 // a "panel of B".
-			  BLIS_NO_TRANSPOSE,
 			  dt,
-			  NC,     KC,       // This "panel of B" is (at most) NC x KC.
 			  nc_cur, kc_cur, MR,
 			  one,
 			  a_pc,   rs_a,      cs_a,
@@ -320,7 +320,9 @@ void bli_gemmsup_ref_var1n
 
 			// Compute the IC loop thread range for the current thread.
 			dim_t ic_start, ic_end;
-			bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end );
+			dim_t ic_tid = bli_thrinfo_work_id( thread_ic );
+			dim_t ic_nt  = bli_thrinfo_n_way( thread_ic );
+			bli_thread_range_sub( ic_tid, ic_nt, n, NR, FALSE, &ic_start, &ic_end );
 			const dim_t n_local = ic_end - ic_start;
 
 			// Compute number of primary and leftover components of the IC loop.
@@ -352,9 +354,7 @@ void bli_gemmsup_ref_var1n
 				  packb,
 				  BLIS_BUFFER_FOR_A_BLOCK, // This algorithm packs matrix B to
 				  stor_id,                 // a "block of A".
-				  BLIS_NO_TRANSPOSE,
 				  dt,
-				  MC,     KC,       // This "block of A" is (at most) KC x MC.
 				  mc_cur, kc_cur, NR,
 				  one,
 				  b_ic,   cs_b,      rs_b,
@@ -390,7 +390,9 @@ void bli_gemmsup_ref_var1n
 
 				// Compute the JR loop thread range for the current thread.
 				dim_t jr_start, jr_end;
-				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end );
+				dim_t jr_tid = bli_thrinfo_work_id( thread_jr );
+				dim_t jr_nt  = bli_thrinfo_n_way( thread_jr );
+				bli_thread_range_sub( jr_tid, jr_nt, jr_iter, 1, FALSE, &jr_start, &jr_end );
 
 				// Loop over the m dimension (NR columns at a time).
 				//for ( dim_t j = 0; j < jr_iter; j += 1 )
@@ -636,16 +638,18 @@ void bli_gemmsup_ref_var2m
 	// Determine whether we are using more than one thread.
 	const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 );
 
-	thrinfo_t* thread_jc = bli_thrinfo_sub_node( thread );
-	thrinfo_t* thread_pc = bli_thrinfo_sub_node( thread_jc );
-	thrinfo_t* thread_pb = bli_thrinfo_sub_node( thread_pc );
-	thrinfo_t* thread_ic = bli_thrinfo_sub_node( thread_pb );
-	thrinfo_t* thread_pa = bli_thrinfo_sub_node( thread_ic );
-	thrinfo_t* thread_jr = bli_thrinfo_sub_node( thread_pa );
+	thrinfo_t* thread_jc = bli_thrinfo_sub_node( 0, thread );
+	thrinfo_t* thread_pc = bli_thrinfo_sub_node( 0, thread_jc );
+	thrinfo_t* thread_pb = bli_thrinfo_sub_node( 0, thread_pc );
+	thrinfo_t* thread_ic = bli_thrinfo_sub_node( 0, thread_pb );
+	thrinfo_t* thread_pa = bli_thrinfo_sub_node( 0, thread_ic );
+	thrinfo_t* thread_jr = bli_thrinfo_sub_node( 0, thread_pa );
 
 	// Compute the JC loop thread range for the current thread.
 	dim_t jc_start, jc_end;
-	bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end );
+	dim_t jc_tid = bli_thrinfo_work_id( thread_jc );
+	dim_t jc_nt  = bli_thrinfo_n_way( thread_jc );
+	bli_thread_range_sub( jc_tid, jc_nt, n, NR, FALSE, &jc_start, &jc_end );
 	const dim_t n_local = jc_end - jc_start;
 
 	// Compute number of primary and leftover components of the JC loop.
@@ -696,9 +700,7 @@ void bli_gemmsup_ref_var2m
 			  packb,
 			  BLIS_BUFFER_FOR_B_PANEL, // This algorithm packs matrix B to
 			  stor_id,                 // a "panel of B."
-			  BLIS_NO_TRANSPOSE,
 			  dt,
-			  NC,     KC,       // This "panel of B" is (at most) KC x NC.
 			  nc_cur, kc_cur, NR,
 			  one,
 			  b_pc,   cs_b,      rs_b,
@@ -720,7 +722,9 @@ void bli_gemmsup_ref_var2m
 
 			// Compute the IC loop thread range for the current thread.
 			dim_t ic_start, ic_end;
-			bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end );
+			dim_t ic_tid = bli_thrinfo_work_id( thread_ic );
+			dim_t ic_nt  = bli_thrinfo_n_way( thread_ic );
+			bli_thread_range_sub( ic_tid, ic_nt, m, MR, FALSE, &ic_start, &ic_end );
 			const dim_t m_local = ic_end - ic_start;
 
 			// Compute number of primary and leftover components of the IC loop.
@@ -750,9 +754,7 @@ void bli_gemmsup_ref_var2m
 				  packa,
 				  BLIS_BUFFER_FOR_A_BLOCK, // This algorithm packs matrix A to
 				  stor_id,                 // a "block of A."
-				  BLIS_NO_TRANSPOSE,
 				  dt,
-				  MC,     KC,       // This "block of A" is (at most) MC x KC.
 				  mc_cur, kc_cur, MR,
 				  one,
 				  a_ic,   rs_a,      cs_a,
@@ -788,7 +790,9 @@ void bli_gemmsup_ref_var2m
 
 				// Compute the JR loop thread range for the current thread.
 				dim_t jr_start, jr_end;
-				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end );
+				dim_t jr_tid = bli_thrinfo_work_id( thread_jr );
+				dim_t jr_nt  = bli_thrinfo_n_way( thread_jr );
+				bli_thread_range_sub( jr_tid, jr_nt, jr_iter, 1, FALSE, &jr_start, &jr_end );
 
 				// Loop over the n dimension (NR columns at a time).
 				//for ( dim_t j = 0; j < jr_iter; j += 1 )
diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h
index 5b3d6f6a4..647b58a66 100644
--- a/frame/3/bli_l3_sup_vars.h
+++ b/frame/3/bli_l3_sup_vars.h
@@ -40,7 +40,7 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
              trans_t    trans, \
        const obj_t*     alpha, \
diff --git a/frame/3/bli_l3_tapi.c b/frame/3/bli_l3_tapi.c
index 7d5883311..164f4401e 100644
--- a/frame/3/bli_l3_tapi.c
+++ b/frame/3/bli_l3_tapi.c
@@ -57,7 +57,7 @@ void PASTEMAC(ch,opname) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
 	   objects. */ \
-	PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
 	( \
 	  transa, \
 	  transb, \
@@ -94,7 +94,7 @@ void PASTEMAC(ch,opname) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
 	   objects. */ \
-	PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
 	( \
 	  uploc, \
 	  transa, \
@@ -133,7 +133,7 @@ void PASTEMAC(ch,opname) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
 	   objects. */ \
-	PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
 	( \
 	  side, \
 	  uploa, \
@@ -171,7 +171,7 @@ void PASTEMAC(ch,opname) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
 	   objects. */ \
-	PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
 	( \
 	  uploc, \
 	  transa, \
@@ -207,7 +207,7 @@ void PASTEMAC(ch,opname) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
 	   objects. */ \
-	PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
 	( \
 	  uploc, \
 	  transa, \
@@ -243,7 +243,7 @@ void PASTEMAC(ch,opname) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
 	   objects. */ \
-	PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
 	( \
 	  uploc, \
 	  transa, \
@@ -279,7 +279,7 @@ void PASTEMAC(ch,opname) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
 	   objects. */ \
-	PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
 	( \
 	  uploc, \
 	  transa, \
@@ -319,7 +319,7 @@ void PASTEMAC(ch,opname) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
 	   objects. */ \
-	PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
 	( \
 	  side, \
 	  uploa, \
@@ -358,7 +358,7 @@ void PASTEMAC(ch,opname) \
 { \
 	/* Invoke the expert interface and request default cntx_t and rntm_t
 	   objects. */ \
-	PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
 	( \
 	  side, \
 	  uploa, \
diff --git a/frame/3/bli_l3_tapi_ex.c b/frame/3/bli_l3_tapi_ex.c
index 237d9d9a8..04560c1ca 100644
--- a/frame/3/bli_l3_tapi_ex.c
+++ b/frame/3/bli_l3_tapi_ex.c
@@ -42,7 +42,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
-void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
+void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
              trans_t transa, \
              trans_t transb, \
@@ -101,7 +101,7 @@ INSERT_GENTFUNC_BASIC( gemm )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, struca ) \
 \
-void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
+void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
              side_t  side, \
              uplo_t  uploa, \
@@ -167,7 +167,7 @@ INSERT_GENTFUNC_BASIC( symm, BLIS_SYMMETRIC )
 #undef  GENTFUNCR
 #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
 \
-void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
+void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
              uplo_t   uploc, \
              trans_t  transa, \
@@ -223,7 +223,7 @@ INSERT_GENTFUNCR_BASIC( herk )
 #undef  GENTFUNCR
 #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
 \
-void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
+void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
              uplo_t   uploc, \
              trans_t  transa, \
@@ -287,7 +287,7 @@ INSERT_GENTFUNCR_BASIC( her2k )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
-void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
+void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
              uplo_t  uploc, \
              trans_t transa, \
@@ -342,7 +342,7 @@ INSERT_GENTFUNC_BASIC( syrk )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
-void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
+void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
              uplo_t  uploc, \
              trans_t transa, \
@@ -405,7 +405,7 @@ INSERT_GENTFUNC_BASIC( syr2k )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
-void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
+void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
              uplo_t  uploc, \
              trans_t transa, \
@@ -466,7 +466,7 @@ INSERT_GENTFUNC_BASIC( gemmt )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
-void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
+void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
              side_t  side, \
              uplo_t  uploa, \
@@ -533,7 +533,7 @@ INSERT_GENTFUNC_BASIC( trmm3 )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
-void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
+void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \
      ( \
              side_t  side, \
              uplo_t  uploa, \
diff --git a/frame/3/bli_l3_tapi_ex.h b/frame/3/bli_l3_tapi_ex.h
index b0288841f..872cecfa7 100644
--- a/frame/3/bli_l3_tapi_ex.h
+++ b/frame/3/bli_l3_tapi_ex.h
@@ -41,7 +41,7 @@
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
      ( \
              trans_t transa, \
              trans_t transb, \
@@ -62,7 +62,7 @@ INSERT_GENTPROT_BASIC( gemm )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
      ( \
              side_t  side, \
              uplo_t  uploa, \
@@ -86,7 +86,7 @@ INSERT_GENTPROT_BASIC( symm )
 #undef  GENTPROTR
 #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
      ( \
              uplo_t   uploc, \
              trans_t  transa, \
@@ -106,7 +106,7 @@ INSERT_GENTPROTR_BASIC( herk )
 #undef  GENTPROTR
 #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
      ( \
              uplo_t   uploc, \
              trans_t  transa, \
@@ -128,7 +128,7 @@ INSERT_GENTPROTR_BASIC( her2k )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
      ( \
              uplo_t  uploc, \
              trans_t transa, \
@@ -148,7 +148,7 @@ INSERT_GENTPROT_BASIC( syrk )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
      ( \
              uplo_t  uploc, \
              trans_t transa, \
@@ -171,7 +171,7 @@ INSERT_GENTPROT_BASIC( syr2k )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
      ( \
              side_t  side, \
              uplo_t  uploa, \
@@ -195,7 +195,7 @@ INSERT_GENTPROT_BASIC( trmm3 )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
      ( \
              side_t  side, \
              uplo_t  uploa, \
diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c
index 5f3d39d39..622f91652 100644
--- a/frame/3/bli_l3_thrinfo.c
+++ b/frame/3/bli_l3_thrinfo.c
@@ -62,56 +62,40 @@ thrinfo_t* bli_l3_thrinfo_create
 
 void bli_l3_thrinfo_grow
      (
-             thrinfo_t*  thread_par,
+             thrinfo_t*  thread,
        const rntm_t*     rntm,
        const cntl_t*     cntl
      )
 {
-	const cntl_t* sub_prenode = bli_cntl_sub_prenode( cntl );
-	const cntl_t* sub_node    = bli_cntl_sub_node( cntl );
-	const bszid_t bszid       = bli_cntl_bszid( cntl );
-	const dim_t   n_way       = bli_rntm_ways_for( bszid, rntm );
+	// For leaf nodes, create one more node in the thread control tree
+	// which splits the remaining threads into single-thread teams.
+	if ( bli_cntl_is_leaf( cntl ) )
+	{
+		dim_t      n_way      = bli_thrinfo_num_threads( thread );
+		thrinfo_t* thread_sub = bli_thrinfo_split( n_way, thread );
 
-	thrinfo_t* thread_cur = bli_thrinfo_split( n_way, thread_par );
-	bli_thrinfo_set_sub_node( thread_cur, thread_par );
+		bli_thrinfo_attach_sub_node( thread_sub, thread );
 
-	if ( bszid == BLIS_NO_PART )
-	{
-		// A hack: the packing code needs a thread communicator which represents
-		// a group of single-member thread teams working cooperatively However,
-		// the "normal" packm thrinfo_t node has a single team of multiple
-		// threads. Our solution (for now) is to create a sub-prenode on the
-		// thrinfo_t tree which splits this single team into multiple
-		// single-member thread teams.
-		const dim_t n_threads = bli_thrinfo_num_threads( thread_par );
-		thrinfo_t* thread_pre = bli_thrinfo_split( n_threads, thread_par );
-		bli_thrinfo_set_sub_prenode( thread_pre, thread_par );
-	}
-	else if ( sub_prenode != NULL )
-	{
-		// A pre-node is only used in the IC loop of trsm. In this case,
-		// we cannot actually thread in the m dimension due to data dependencies
-		// and so all parallelism must be moved down to the JR loop.
-		rntm_t rntm_l = *rntm;
-		const dim_t ic_nway = bli_rntm_ic_ways( &rntm_l );
-		const dim_t jr_nway = bli_rntm_jr_ways( &rntm_l );
-		bli_rntm_set_ic_ways_only(               1, &rntm_l );
-		bli_rntm_set_jr_ways_only( ic_nway*jr_nway, &rntm_l );
-
-		// Use thread_pre instead of thread_cur since we *don't* want to do any
-		// parallelism at this level. So the thread_pre node gets attached to
-		// thread_par and not thread_cur! This results in a split "one level
-		// higher" than in the corresponding cntl_t tree. This is intentional
-		// since two different thrinfo_t nodes will be used at the cntl_t node
-		// for trsm blocked variant 1 (one for trsm, one for gemm).
-		thrinfo_t* thread_pre = bli_thrinfo_split( 1, thread_par );
-		bli_thrinfo_set_sub_prenode( thread_pre, thread_par );
-		bli_l3_thrinfo_grow( thread_pre, &rntm_l, sub_prenode );
+		return;
 	}
 
-	if ( sub_node != NULL )
+	// Create a thread control tree sub-node for each non-NULL control tree
+	// sub-node. Note that the "ways" encoded in the control tree for each
+	// sub-node control parallelism for the parent node, not the sub-node. This
+	// is necessary because some nodes need to parallelize differently when
+	// calling different sub-nodes (e.g. the ic loop of trsm_l).
+	for ( dim_t i = 0; i < BLIS_MAX_SUB_NODES; i++ )
 	{
-		bli_l3_thrinfo_grow( thread_cur, rntm, sub_node );
+		const cntl_t* sub_node = bli_cntl_sub_node( i, cntl );
+		if ( sub_node == NULL )
+			return;
+
+		dim_t      ways       = bli_cntl_ways( i, cntl );
+		dim_t      n_way      = bli_rntm_total_ways_for( ways, rntm );
+		thrinfo_t* thread_sub = bli_thrinfo_split( n_way, thread );
+
+		bli_thrinfo_attach_sub_node( thread_sub, thread );
+		bli_l3_thrinfo_grow( thread_sub, rntm, sub_node );
 	}
 }
 
@@ -148,22 +132,13 @@ thrinfo_t* bli_l3_sup_thrinfo_create
 	thrinfo_t* thread_jr = bli_thrinfo_split( n_way_jr, thread_pa );
 	thrinfo_t* thread_ir = bli_thrinfo_split( n_way_ir, thread_jr );
 
-	const dim_t n_way_pb = bli_thrinfo_num_threads( thread_pb );
-	const dim_t n_way_pa = bli_thrinfo_num_threads( thread_pa );
-
-	// Create and set the prenodes for the packb and packa thrinfo_t nodes.
-	thrinfo_t* thread_pb_single = bli_thrinfo_split( n_way_pb, thread_pb );
-	thrinfo_t* thread_pa_single = bli_thrinfo_split( n_way_pa, thread_pa );
-	bli_thrinfo_set_sub_prenode( thread_pb_single, thread_pb );
-	bli_thrinfo_set_sub_prenode( thread_pa_single, thread_pa );
-
-	bli_thrinfo_set_sub_node( thread_jc,      root );
-	bli_thrinfo_set_sub_node( thread_pc, thread_jc );
-	bli_thrinfo_set_sub_node( thread_pb, thread_pc );
-	bli_thrinfo_set_sub_node( thread_ic, thread_pb );
-	bli_thrinfo_set_sub_node( thread_pa, thread_ic );
-	bli_thrinfo_set_sub_node( thread_jr, thread_pa );
-	bli_thrinfo_set_sub_node( thread_ir, thread_jr );
+	bli_thrinfo_set_sub_node( 0, thread_jc,      root );
+	bli_thrinfo_set_sub_node( 0, thread_pc, thread_jc );
+	bli_thrinfo_set_sub_node( 0, thread_pb, thread_pc );
+	bli_thrinfo_set_sub_node( 0, thread_ic, thread_pb );
+	bli_thrinfo_set_sub_node( 0, thread_pa, thread_ic );
+	bli_thrinfo_set_sub_node( 0, thread_jr, thread_pa );
+	bli_thrinfo_set_sub_node( 0, thread_ir, thread_jr );
 
 	return root;
 }
@@ -232,37 +207,37 @@ void bli_l3_thrinfo_print_gemm_paths
 
 	jc_way  = bli_thrinfo_n_way( jc_info );
 	jc_nt   = bli_thrinfo_num_threads( jc_info );
-	pc_info = bli_thrinfo_sub_node( jc_info );
+	pc_info = bli_thrinfo_sub_node( 0, jc_info );
 
 	if ( !pc_info ) goto print_header;
 
 	pc_way  = bli_thrinfo_n_way( pc_info );
 	pc_nt   = bli_thrinfo_num_threads( pc_info );
-	pb_info = bli_thrinfo_sub_node( pc_info );
+	pb_info = bli_thrinfo_sub_node( 0, pc_info );
 
 	if ( !pb_info ) goto print_header;
 
 	pb_way  = bli_thrinfo_n_way( pb_info );
 	pb_nt   = bli_thrinfo_num_threads( pb_info );
-	ic_info = bli_thrinfo_sub_node( pb_info );
+	ic_info = bli_thrinfo_sub_node( 0, pb_info );
 
 	if ( !ic_info ) goto print_header;
 
 	ic_way  = bli_thrinfo_n_way( ic_info );
 	ic_nt   = bli_thrinfo_num_threads( ic_info );
-	pa_info = bli_thrinfo_sub_node( ic_info );
+	pa_info = bli_thrinfo_sub_node( 0, ic_info );
 
 	if ( !pa_info ) goto print_header;
 
 	pa_way  = bli_thrinfo_n_way( pa_info );
 	pa_nt   = bli_thrinfo_num_threads( pa_info );
-	jr_info = bli_thrinfo_sub_node( pa_info );
+	jr_info = bli_thrinfo_sub_node( 0, pa_info );
 
 	if ( !jr_info ) goto print_header;
 
 	jr_way  = bli_thrinfo_n_way( jr_info );
 	jr_nt   = bli_thrinfo_num_threads( jr_info );
-	ir_info = bli_thrinfo_sub_node( jr_info );
+	ir_info = bli_thrinfo_sub_node( 0, jr_info );
 
 	if ( !ir_info ) goto print_header;
 
@@ -273,21 +248,21 @@ void bli_l3_thrinfo_print_gemm_paths
 
 	printf( "            jc   kc   pb   ic   pa   jr   ir\n" );
 	printf( "xx_nt:    %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n",
-	( unsigned long )jc_nt,
-	( unsigned long )pc_nt,
-	( unsigned long )pb_nt,
-	( unsigned long )ic_nt,
-	( unsigned long )pa_nt,
-	( unsigned long )jr_nt,
-	( unsigned long )ir_nt );
+	        ( unsigned long )jc_nt,
+	        ( unsigned long )pc_nt,
+	        ( unsigned long )pb_nt,
+	        ( unsigned long )ic_nt,
+	        ( unsigned long )pa_nt,
+	        ( unsigned long )jr_nt,
+	        ( unsigned long )ir_nt );
 	printf( "xx_way:   %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n",
-	( unsigned long )jc_way,
-	( unsigned long )pc_way,
-	( unsigned long )pb_way,
-	( unsigned long )ic_way,
-	( unsigned long )pa_way,
-	( unsigned long )jr_way,
-	( unsigned long )ir_way );
+	        ( unsigned long )jc_way,
+	        ( unsigned long )pc_way,
+	        ( unsigned long )pb_way,
+	        ( unsigned long )ic_way,
+	        ( unsigned long )pa_way,
+	        ( unsigned long )jr_way,
+	        ( unsigned long )ir_way );
 	printf( "============================================\n" );
 
 	for ( dim_t gl_id = 0; gl_id < n_threads; ++gl_id )
@@ -304,37 +279,37 @@ void bli_l3_thrinfo_print_gemm_paths
 
 		jc_comm_id = bli_thrinfo_thread_id( jc_info );
 		jc_work_id = bli_thrinfo_work_id( jc_info );
-		pc_info    = bli_thrinfo_sub_node( jc_info );
+		pc_info    = bli_thrinfo_sub_node( 0, jc_info );
 
 		if ( !pc_info ) goto print_thrinfo;
 
 		pc_comm_id = bli_thrinfo_thread_id( pc_info );
 		pc_work_id = bli_thrinfo_work_id( pc_info );
-		pb_info    = bli_thrinfo_sub_node( pc_info );
+		pb_info    = bli_thrinfo_sub_node( 0, pc_info );
 
 		if ( !pb_info ) goto print_thrinfo;
 
 		pb_comm_id = bli_thrinfo_thread_id( pb_info );
 		pb_work_id = bli_thrinfo_work_id( pb_info );
-		ic_info    = bli_thrinfo_sub_node( pb_info );
+		ic_info    = bli_thrinfo_sub_node( 0, pb_info );
 
 		if ( !ic_info ) goto print_thrinfo;
 
 		ic_comm_id = bli_thrinfo_thread_id( ic_info );
 		ic_work_id = bli_thrinfo_work_id( ic_info );
-		pa_info    = bli_thrinfo_sub_node( ic_info );
+		pa_info    = bli_thrinfo_sub_node( 0, ic_info );
 
 		if ( !pa_info ) goto print_thrinfo;
 
 		pa_comm_id = bli_thrinfo_thread_id( pa_info );
 		pa_work_id = bli_thrinfo_work_id( pa_info );
-		jr_info    = bli_thrinfo_sub_node( pa_info );
+		jr_info    = bli_thrinfo_sub_node( 0, pa_info );
 
 		if ( !jr_info ) goto print_thrinfo;
 
 		jr_comm_id = bli_thrinfo_thread_id( jr_info );
 		jr_work_id = bli_thrinfo_work_id( jr_info );
-		ir_info    = bli_thrinfo_sub_node( jr_info );
+		ir_info    = bli_thrinfo_sub_node( 0, jr_info );
 
 		if ( !ir_info ) goto print_thrinfo;
 
@@ -344,21 +319,21 @@ void bli_l3_thrinfo_print_gemm_paths
 		print_thrinfo:
 
 		printf( "comm ids: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n",
-		( long )jc_comm_id,
-		( long )pc_comm_id,
-		( long )pb_comm_id,
-		( long )ic_comm_id,
-		( long )pa_comm_id,
-		( long )jr_comm_id,
-		( long )ir_comm_id );
+		        ( long )jc_comm_id,
+		        ( long )pc_comm_id,
+		        ( long )pb_comm_id,
+		        ( long )ic_comm_id,
+		        ( long )pa_comm_id,
+		        ( long )jr_comm_id,
+		        ( long )ir_comm_id );
 		printf( "work ids: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n",
-		( long )jc_work_id,
-		( long )pc_work_id,
-		( long )pb_work_id,
-		( long )ic_work_id,
-		( long )pa_work_id,
-		( long )jr_work_id,
-		( long )ir_work_id );
+		        ( long )jc_work_id,
+		        ( long )pc_work_id,
+		        ( long )pb_work_id,
+		        ( long )ic_work_id,
+		        ( long )pa_work_id,
+		        ( long )jr_work_id,
+		        ( long )ir_work_id );
 		printf( "--------------------------------------------\n" );
 	}
 
@@ -412,26 +387,26 @@ void bli_l3_thrinfo_print_trsm_paths
 
 	jc_way   = bli_thrinfo_n_way( jc_info );
 	jc_nt    = bli_thrinfo_num_threads( jc_info );
-	pc_info  = bli_thrinfo_sub_node( jc_info );
+	pc_info  = bli_thrinfo_sub_node( 0, jc_info );
 
 	if ( !pc_info ) goto print_header;
 
 	pc_way   = bli_thrinfo_n_way( pc_info );
 	pc_nt    = bli_thrinfo_num_threads( pc_info );
-	pb_info  = bli_thrinfo_sub_node( pc_info );
+	pb_info  = bli_thrinfo_sub_node( 0, pc_info );
 
 	if ( !pb_info ) goto print_header;
 
 	pb_way   = bli_thrinfo_n_way( pb_info );
 	pb_nt    = bli_thrinfo_num_threads( pb_info );
-	ic_info  = bli_thrinfo_sub_node( pb_info );
+	ic_info  = bli_thrinfo_sub_node( 0, pb_info );
 
 	if ( !ic_info ) goto print_header;
 
 	ic_way   = bli_thrinfo_n_way( ic_info );
 	ic_nt    = bli_thrinfo_num_threads( ic_info );
-	pa_info  = bli_thrinfo_sub_node( ic_info );
-	pa_info0 = bli_thrinfo_sub_prenode( ic_info );
+	pa_info  = bli_thrinfo_sub_node( 1, ic_info );
+	pa_info0 = bli_thrinfo_sub_node( 0, ic_info );
 
 	// check_header_prenode:
 
@@ -439,13 +414,13 @@ void bli_l3_thrinfo_print_trsm_paths
 
 	pa_way0  = bli_thrinfo_n_way( pa_info0 );
 	pa_nt0   = bli_thrinfo_num_threads( pa_info0 );
-	jr_info0 = bli_thrinfo_sub_node( pa_info0 );
+	jr_info0 = bli_thrinfo_sub_node( 0, pa_info0 );
 
 	if ( !jr_info0 ) goto check_header_node;
 
 	jr_way0  = bli_thrinfo_n_way( jr_info0 );
 	jr_nt0   = bli_thrinfo_num_threads( jr_info0 );
-	ir_info0 = bli_thrinfo_sub_node( jr_info0 );
+	ir_info0 = bli_thrinfo_sub_node( 0, jr_info0 );
 
 	if ( !ir_info0 ) goto check_header_node;
 
@@ -458,13 +433,13 @@ void bli_l3_thrinfo_print_trsm_paths
 
 	pa_way  = bli_thrinfo_n_way( pa_info );
 	pa_nt   = bli_thrinfo_num_threads( pa_info );
-	jr_info = bli_thrinfo_sub_node( pa_info );
+	jr_info = bli_thrinfo_sub_node( 0, pa_info );
 
 	if ( !jr_info ) goto print_header;
 
 	jr_way  = bli_thrinfo_n_way( jr_info );
 	jr_nt   = bli_thrinfo_num_threads( jr_info );
-	ir_info = bli_thrinfo_sub_node( jr_info );
+	ir_info = bli_thrinfo_sub_node( 0, jr_info );
 
 	if ( !ir_info ) goto print_header;
 
@@ -475,21 +450,21 @@ void bli_l3_thrinfo_print_trsm_paths
 
 	printf( "            jc   kc   pb   ic     pa     jr     ir\n" );
 	printf( "xx_nt:    %4ld %4ld %4ld %4ld  %2ld|%2ld  %2ld|%2ld  %2ld|%2ld\n",
-	( long )jc_nt,
-	( long )pc_nt,
-	( long )pb_nt,
-	( long )ic_nt,
-	( long )pa_nt0, ( long )pa_nt,
-	( long )jr_nt0, ( long )jr_nt,
-	( long )ir_nt0, ( long )ir_nt );
+	        ( long )jc_nt,
+	        ( long )pc_nt,
+	        ( long )pb_nt,
+	        ( long )ic_nt,
+	        ( long )pa_nt0, ( long )pa_nt,
+	        ( long )jr_nt0, ( long )jr_nt,
+	        ( long )ir_nt0, ( long )ir_nt );
 	printf( "xx_way:   %4ld %4ld %4ld %4ld  %2ld|%2ld  %2ld|%2ld  %2ld|%2ld\n",
-    ( long )jc_way,
-	( long )pc_way,
-	( long )pb_way,
-	( long )ic_way,
-	( long )pa_way0, ( long )pa_way,
-	( long )jr_way0, ( long )jr_way,
-	( long )ir_way0, ( long )ir_way );
+	        ( long )jc_way,
+	        ( long )pc_way,
+	        ( long )pb_way,
+	        ( long )ic_way,
+	        ( long )pa_way0, ( long )pa_way,
+	        ( long )jr_way0, ( long )jr_way,
+	        ( long )ir_way0, ( long )ir_way );
 	printf( "==================================================\n" );
 
 
@@ -497,7 +472,6 @@ void bli_l3_thrinfo_print_trsm_paths
 	{
 		jc_info = threads[gl_id];
 
-#if 1
 		// NOTE: This cpp branch contains code that is safe to execute
 		// for small problems that are parallelized enough that one or
 		// more threads gets no work.
@@ -514,26 +488,26 @@ void bli_l3_thrinfo_print_trsm_paths
 
 		jc_comm_id = bli_thrinfo_thread_id( jc_info );
 		jc_work_id = bli_thrinfo_work_id( jc_info );
-		pc_info    = bli_thrinfo_sub_node( jc_info );
+		pc_info    = bli_thrinfo_sub_node( 0, jc_info );
 
 		if ( !pc_info ) goto print_thrinfo;
 
 		pc_comm_id = bli_thrinfo_thread_id( pc_info );
 		pc_work_id = bli_thrinfo_work_id( pc_info );
-		pb_info    = bli_thrinfo_sub_node( pc_info );
+		pb_info    = bli_thrinfo_sub_node( 0, pc_info );
 
 		if ( !pb_info ) goto print_thrinfo;
 
 		pb_comm_id = bli_thrinfo_thread_id( pb_info );
 		pb_work_id = bli_thrinfo_work_id( pb_info );
-		ic_info    = bli_thrinfo_sub_node( pb_info );
+		ic_info    = bli_thrinfo_sub_node( 0, pb_info );
 
 		if ( !ic_info ) goto print_thrinfo;
 
 		ic_comm_id = bli_thrinfo_thread_id( ic_info );
 		ic_work_id = bli_thrinfo_work_id( ic_info );
-		pa_info    = bli_thrinfo_sub_node( ic_info );
-		pa_info0   = bli_thrinfo_sub_prenode( ic_info );
+		pa_info    = bli_thrinfo_sub_node( 1, ic_info );
+		pa_info0   = bli_thrinfo_sub_node( 0, ic_info );
 
 		// check_thrinfo_prenode:
 
@@ -541,13 +515,13 @@ void bli_l3_thrinfo_print_trsm_paths
 
 		pa_comm_id0 = bli_thrinfo_thread_id( pa_info0 );
 		pa_work_id0 = bli_thrinfo_work_id( pa_info0 );
-		jr_info0    = bli_thrinfo_sub_node( pa_info0 );
+		jr_info0    = bli_thrinfo_sub_node( 0, pa_info0 );
 
 		if ( !jr_info0 ) goto check_thrinfo_node;
 
 		jr_comm_id0 = bli_thrinfo_thread_id( jr_info0 );
 		jr_work_id0 = bli_thrinfo_work_id( jr_info0 );
-		ir_info0    = bli_thrinfo_sub_node( jr_info0 );
+		ir_info0    = bli_thrinfo_sub_node( 0, jr_info0 );
 
 		if ( !ir_info0 ) goto check_thrinfo_node;
 
@@ -560,13 +534,13 @@ void bli_l3_thrinfo_print_trsm_paths
 
 		pa_comm_id = bli_thrinfo_thread_id( pa_info );
 		pa_work_id = bli_thrinfo_work_id( pa_info );
-		jr_info    = bli_thrinfo_sub_node( pa_info );
+		jr_info    = bli_thrinfo_sub_node( 0, pa_info );
 
 		if ( !jr_info ) goto print_thrinfo;
 
 		jr_comm_id = bli_thrinfo_thread_id( jr_info );
 		jr_work_id = bli_thrinfo_work_id( jr_info );
-		ir_info    = bli_thrinfo_sub_node( jr_info );
+		ir_info    = bli_thrinfo_sub_node( 0, jr_info );
 
 		if ( !ir_info ) goto print_thrinfo;
 
@@ -574,169 +548,25 @@ void bli_l3_thrinfo_print_trsm_paths
 		ir_work_id = bli_thrinfo_work_id( ir_info );
 
 		print_thrinfo:
-#else
-		dim_t jc_comm_id;
-		dim_t pc_comm_id;
-		dim_t pb_comm_id;
-		dim_t ic_comm_id;
-		dim_t pa_comm_id0, pa_comm_id;
-		dim_t jr_comm_id0, jr_comm_id;
-		dim_t ir_comm_id0, ir_comm_id;
-
-		dim_t jc_work_id;
-		dim_t pc_work_id;
-		dim_t pb_work_id;
-		dim_t ic_work_id;
-		dim_t pa_work_id0, pa_work_id;
-		dim_t jr_work_id0, jr_work_id;
-		dim_t ir_work_id0, ir_work_id;
-
-		// NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads
-		// may not fully build their thrinfo_t structures--specifically when the
-		// dimension being parallelized is not large enough for each thread to have
-		// even one unit of work (where as unit is usually a single micropanel's
-		// width, MR or NR).
-		if ( !jc_info )
-		{
-			jc_comm_id = pc_comm_id = pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1;
-			jc_work_id = pc_work_id = pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1;
-		}
-		else
-		{
-			jc_comm_id = bli_thrinfo_thread_id( jc_info );
-			jc_work_id = bli_thrinfo_work_id( jc_info );
-			pc_info = bli_thrinfo_sub_node( jc_info );
-
-			if ( !pc_info )
-			{
-				pc_comm_id = pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1;
-				pc_work_id = pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1;
-			}
-			else
-			{
-				pc_comm_id = bli_thrinfo_thread_id( pc_info );
-				pc_work_id = bli_thrinfo_work_id( pc_info );
-				pb_info = bli_thrinfo_sub_node( pc_info );
-
-				if ( !pb_info )
-				{
-					pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1;
-					pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1;
-				}
-				else
-				{
-					pb_comm_id = bli_thrinfo_thread_id( pb_info );
-					pb_work_id = bli_thrinfo_work_id( pb_info );
-					ic_info = bli_thrinfo_sub_node( pb_info );
-
-					if ( !ic_info )
-					{
-						ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1;
-						ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1;
-					}
-					else
-					{
-						ic_comm_id = bli_thrinfo_thread_id( ic_info );
-						ic_work_id = bli_thrinfo_work_id( ic_info );
-						pa_info0 = bli_thrinfo_sub_prenode( ic_info );
-						pa_info = bli_thrinfo_sub_node( ic_info );
-
-						// Prenode
-						if ( !pa_info0 )
-						{
-							pa_comm_id0 = jr_comm_id0 = ir_comm_id0 = -1;
-							pa_work_id0 = jr_work_id0 = ir_work_id0 = -1;
-						}
-						else
-						{
-							pa_comm_id0 = bli_thrinfo_thread_id( pa_info0 );
-							pa_work_id0 = bli_thrinfo_work_id( pa_info0 );
-							jr_info0 = bli_thrinfo_sub_node( pa_info0 );
-
-							if ( !jr_info0 )
-							{
-								jr_comm_id0 = ir_comm_id0 = -1;
-								jr_work_id0 = ir_work_id0 = -1;
-							}
-							else
-							{
-								jr_comm_id0 = bli_thrinfo_thread_id( jr_info0 );
-								jr_work_id0 = bli_thrinfo_work_id( jr_info0 );
-								ir_info0 = bli_thrinfo_sub_node( jr_info0 );
-
-								if ( !ir_info0 )
-								{
-									ir_comm_id0 = -1;
-									ir_work_id0 = -1;
-								}
-								else
-								{
-									ir_comm_id0 = bli_thrinfo_thread_id( ir_info0 );
-									ir_work_id0 = bli_thrinfo_work_id( ir_info0 );
-								}
-							}
-						}
-
-						// Main node
-						if ( !pa_info )
-						{
-							pa_comm_id = jr_comm_id = ir_comm_id = -1;
-							pa_work_id = jr_work_id = ir_work_id = -1;
-						}
-						else
-						{
-							pa_comm_id = bli_thrinfo_thread_id( pa_info );
-							pa_work_id = bli_thrinfo_work_id( pa_info );
-							jr_info = bli_thrinfo_sub_node( pa_info );
-
-							if ( !jr_info )
-							{
-								jr_comm_id = ir_comm_id = -1;
-								jr_work_id = ir_work_id = -1;
-							}
-							else
-							{
-								jr_comm_id = bli_thrinfo_thread_id( jr_info );
-								jr_work_id = bli_thrinfo_work_id( jr_info );
-								ir_info = bli_thrinfo_sub_node( jr_info );
-
-								if ( !ir_info )
-								{
-									ir_comm_id = -1;
-									ir_work_id = -1;
-								}
-								else
-								{
-									ir_comm_id = bli_thrinfo_thread_id( ir_info );
-									ir_work_id = bli_thrinfo_work_id( ir_info );
-								}
-							}
-						}
-					}
-				}
-			}
-		}
-#endif
 
 		printf( "comm ids: %4ld %4ld %4ld %4ld  %2ld|%2ld  %2ld|%2ld  %2ld|%2ld\n",
-		( long )jc_comm_id,
-		( long )pc_comm_id,
-		( long )pb_comm_id,
-		( long )ic_comm_id,
-		( long )pa_comm_id0, ( long )pa_comm_id,
-		( long )jr_comm_id0, ( long )jr_comm_id,
-		( long )ir_comm_id0, ( long )ir_comm_id );
+		        ( long )jc_comm_id,
+		        ( long )pc_comm_id,
+		        ( long )pb_comm_id,
+		        ( long )ic_comm_id,
+		        ( long )pa_comm_id0, ( long )pa_comm_id,
+		        ( long )jr_comm_id0, ( long )jr_comm_id,
+		        ( long )ir_comm_id0, ( long )ir_comm_id );
 		printf( "work ids: %4ld %4ld %4ld %4ld  %2ld|%2ld  %2ld|%2ld  %2ld|%2ld\n",
-		( long )jc_work_id,
-		( long )pc_work_id,
-		( long )pb_work_id,
-		( long )ic_work_id,
-		( long )pa_work_id0, ( long )pa_work_id,
-		( long )jr_work_id0, ( long )jr_work_id,
-		( long )ir_work_id0, ( long )ir_work_id );
+		        ( long )jc_work_id,
+		        ( long )pc_work_id,
+		        ( long )pb_work_id,
+		        ( long )ic_work_id,
+		        ( long )pa_work_id0, ( long )pa_work_id,
+		        ( long )jr_work_id0, ( long )jr_work_id,
+		        ( long )ir_work_id0, ( long )ir_work_id );
 		printf( "--------------------------------------------------\n" );
 	}
-
 }
 
 // -----------------------------------------------------------------------------
diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h
index 2ea7a3fc2..b041ac993 100644
--- a/frame/3/bli_l3_thrinfo.h
+++ b/frame/3/bli_l3_thrinfo.h
@@ -81,9 +81,9 @@ BLIS_EXPORT_BLIS thrinfo_t* bli_l3_thrinfo_create
 
 void bli_l3_thrinfo_grow
      (
-             thrinfo_t*  thread_par,
-       const rntm_t*     rntm,
-       const cntl_t*     cntl
+             thrinfo_t* thread,
+       const rntm_t*    rntm,
+       const cntl_t*    cntl
      );
 
 thrinfo_t* bli_l3_sup_thrinfo_create
diff --git a/frame/3/bli_l3_ukr_oapi.c b/frame/3/bli_l3_ukr_oapi.c
index 8494100fa..2e369faeb 100644
--- a/frame/3/bli_l3_ukr_oapi.c
+++ b/frame/3/bli_l3_ukr_oapi.c
@@ -37,7 +37,7 @@
 #undef  GENFRONT
 #define GENFRONT( tname, opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        obj_t*  alpha, \
        obj_t*  a, \
@@ -96,7 +96,7 @@ GENFRONT( gemm, gemm_ukernel )
 #undef  GENFRONT
 #define GENFRONT( tname, opname, opnamel, opnameu ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        obj_t*  alpha, \
        obj_t*  a1x, \
@@ -185,7 +185,7 @@ GENFRONT( gemmtrsm, gemmtrsm_ukernel, gemmtrsm_l_ukernel, gemmtrsm_u_ukernel )
 #undef  GENFRONT
 #define GENFRONT( tname, opname, opnamel, opnameu ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        obj_t*  a, \
        obj_t*  b, \
diff --git a/frame/3/bli_l3_ukr_oapi.h b/frame/3/bli_l3_ukr_oapi.h
index 5fed11ede..64e3f5a2e 100644
--- a/frame/3/bli_l3_ukr_oapi.h
+++ b/frame/3/bli_l3_ukr_oapi.h
@@ -40,7 +40,7 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
        obj_t*  alpha, \
        obj_t*  a, \
@@ -56,7 +56,7 @@ GENPROT( gemm_ukernel )
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
        obj_t*  alpha, \
        obj_t*  a1x, \
@@ -73,7 +73,7 @@ GENPROT( gemmtrsm_ukernel )
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
        obj_t*  a, \
        obj_t*  b, \
diff --git a/frame/3/bli_l3_ukr_prot.h b/frame/3/bli_l3_ukr_prot.h
index ea26ee7f4..00bc23404 100644
--- a/frame/3/bli_l3_ukr_prot.h
+++ b/frame/3/bli_l3_ukr_prot.h
@@ -49,6 +49,8 @@ void PASTEMAC(ch,funcname) \
        BLIS_CNTX_PARAM  \
      );
 
+#define GEMM_UKR2_PROT( ctypeab, ctypec, chab, chc, fn ) L3TPROT( /* not used */, PASTECH(chab,chc), fn, gemm );
+
 #define GEMM_UKR_PROT(     ctype, ch, fn )  L3TPROT( ctype, ch, fn, gemm );
 #define GEMMTRSM_UKR_PROT( ctype, ch, fn )  L3TPROT( ctype, ch, fn, gemmtrsm );
 #define TRSM_UKR_PROT(     ctype, ch, fn )  L3TPROT( ctype, ch, fn, trsm );
diff --git a/frame/3/bli_l3_ukr_tapi.c b/frame/3/bli_l3_ukr_tapi.c
index e3dffc1b7..450fc1cd4 100644
--- a/frame/3/bli_l3_ukr_tapi.c
+++ b/frame/3/bli_l3_ukr_tapi.c
@@ -47,7 +47,7 @@ void PASTEMAC(ch,opname) \
        const ctype*     b, \
        const ctype*     beta, \
              ctype*     c, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
@@ -57,7 +57,7 @@ void PASTEMAC(ch,opname) \
 \
 	/* Query the context for the function address of the current
 	   datatype's micro-kernel. */ \
-	PASTECH(tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(tname,_ukr_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the typed function for the given datatype. */ \
 	f \
@@ -92,7 +92,7 @@ void PASTEMAC(ch,opname) \
        const ctype*     bx1, \
              ctype*     b11, \
              ctype*     c11, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
@@ -102,7 +102,7 @@ void PASTEMAC(ch,opname) \
 \
 	/* Query the context for the function address of the current
 	   datatype's micro-kernel. */ \
-	PASTECH(tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(tname,_ukr_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the typed function for the given datatype. */ \
 	f \
@@ -133,7 +133,7 @@ void PASTEMAC(ch,opname) \
        const ctype*     a, \
              ctype*     b, \
              ctype*     c, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
@@ -143,7 +143,7 @@ void PASTEMAC(ch,opname) \
 \
 	/* Query the context for the function address of the current
 	   datatype's micro-kernel. */ \
-	PASTECH(tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
+	PASTECH(tname,_ukr_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
 \
 	/* Invoke the typed function for the given datatype. */ \
 	f \
diff --git a/frame/3/bli_l3_ukr_tapi.h b/frame/3/bli_l3_ukr_tapi.h
index 66dd2f0e3..8ee4c0ec5 100644
--- a/frame/3/bli_l3_ukr_tapi.h
+++ b/frame/3/bli_l3_ukr_tapi.h
@@ -50,7 +50,7 @@ void PASTEMAC(ch,opname) \
        const ctype*     b, \
        const ctype*     beta, \
              ctype*     c, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      );
 
@@ -71,7 +71,7 @@ void PASTEMAC(ch,opname) \
        const ctype*     bx1, \
              ctype*     b11, \
              ctype*     c11, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      );
 
@@ -87,7 +87,7 @@ void PASTEMAC(ch,opname) \
        const ctype*     a, \
              ctype*     b, \
              ctype*     c, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      );
 
diff --git a/frame/3/gemm/bli_gemm_front.h b/frame/3/bli_l3_util.c
similarity index 74%
rename from frame/3/gemm/bli_gemm_front.h
rename to frame/3/bli_l3_util.c
index 3acf29cfb..e5b822be7 100644
--- a/frame/3/gemm/bli_gemm_front.h
+++ b/frame/3/bli_l3_util.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Southern Methodist University
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,27 +32,34 @@
 
 */
 
-void bli_gemm_front
-     (
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm
-     );
-
-#ifdef BLIS_ENABLE_SMALL_MATRIX
-err_t bli_gemm_small
-     (
+#include "blis.h"
+
+//
+// Utility functions for level 3 BLAS
+//
+
+err_t bli_l3_return_early_if_trivial
+      (
        const obj_t*  alpha,
        const obj_t*  a,
        const obj_t*  b,
        const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             cntl_t* cntl
-     );
-#endif
+       const obj_t*  c
+      )
+{
+	// If C has a zero dimension, return early.
+	if ( bli_obj_has_zero_dim( c ) )
+		return BLIS_SUCCESS;
+
+	// If alpha is zero, or if A or B has a zero dimension, scale C by beta
+	// and return early.
+	if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
+	     bli_obj_has_zero_dim( a ) ||
+	     bli_obj_has_zero_dim( b ) )
+	{
+		bli_scalm( beta, c );
+		return BLIS_SUCCESS;
+	}
 
+	return BLIS_FAILURE;
+}
diff --git a/frame/3/gemmt/bli_gemmt_front.h b/frame/3/bli_l3_util.h
similarity index 92%
rename from frame/3/gemmt/bli_gemmt_front.h
rename to frame/3/bli_l3_util.h
index 4a7cd7abe..d49f013f0 100644
--- a/frame/3/gemmt/bli_gemmt_front.h
+++ b/frame/3/bli_l3_util.h
@@ -33,13 +33,17 @@
 
 */
 
-void bli_gemmt_front
-     (
+
+//
+// Prototypes for level 3 BLAS utility functions
+//
+
+BLIS_EXPORT_BLIS err_t bli_l3_return_early_if_trivial
+      (
        const obj_t*  alpha,
        const obj_t*  a,
        const obj_t*  b,
        const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm
-     );
+       const obj_t*  c
+      );
+
diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h
index ddd88e163..afc3a9914 100644
--- a/frame/3/gemm/bli_gemm.h
+++ b/frame/3/gemm/bli_gemm.h
@@ -33,13 +33,5 @@
 */
 
 #include "bli_gemm_cntl.h"
-#include "bli_gemm_front.h"
 
 #include "bli_gemm_var.h"
-
-#include "bli_gemm_ind_opt.h"
-
-// Mixed datatype support.
-#ifdef BLIS_ENABLE_GEMM_MD
-#include "bli_gemm_md.h"
-#endif
diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c
index f841e5eb2..863f50a05 100644
--- a/frame/3/gemm/bli_gemm_blk_var1.c
+++ b/frame/3/gemm/bli_gemm_blk_var1.c
@@ -50,17 +50,20 @@ void bli_gemm_blk_var1
 	bli_obj_alias_to( c, &cp );
 
 	// Determine the direction in which to partition (forwards or backwards).
-	const dir_t direct = bli_l3_direct( &ap, b, &cp, cntl );
+	const dir_t direct = bli_part_cntl_direct( cntl );
 
 	// Prune any zero region that exists along the partitioning dimension.
-	bli_l3_prune_unref_mparts_m( &ap, b, &cp, cntl );
+	bli_l3_prune_unref_mparts_m( &ap, b, &cp );
 
 	// Determine the current thread's subpartition range.
 	dim_t my_start, my_end;
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
 	bli_thread_range_mdim
 	(
-	  direct, thread, &ap, b, &cp, cntl, cntx,
+	  direct,
+	  bli_part_cntl_blksz_mult( cntl ),
+	  bli_part_cntl_use_weighted( cntl ),
+	  thread, &ap, b, &cp,
 	  &my_start, &my_end
 	);
 
@@ -69,8 +72,9 @@ void bli_gemm_blk_var1
 	for ( dim_t i = my_start; i < my_end; i += b_alg )
 	{
 		// Determine the current algorithmic blocksize.
-		b_alg = bli_determine_blocksize( direct, i, my_end, &ap,
-		                                 bli_cntl_bszid( cntl ), cntx );
+		b_alg = bli_determine_blocksize( direct, i, my_end,
+		                                 bli_part_cntl_blksz_alg( cntl ),
+		                                 bli_part_cntl_blksz_max( cntl ) );
 
 		// Acquire partitions for A1 and C1.
 		obj_t a1, c1;
@@ -82,13 +86,11 @@ void bli_gemm_blk_var1
 		// Perform gemm subproblem.
 		bli_l3_int
 		(
-		  &BLIS_ONE,
 		  &a1,
 		  b,
-		  &BLIS_ONE,
 		  &c1,
 		  cntx,
-		  bli_cntl_sub_node( cntl ),
+		  bli_cntl_sub_node( 0, cntl ),
 		  thread
 		);
 	}
diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c
index ceadce7d7..619038133 100644
--- a/frame/3/gemm/bli_gemm_blk_var2.c
+++ b/frame/3/gemm/bli_gemm_blk_var2.c
@@ -50,17 +50,20 @@ void bli_gemm_blk_var2
 	bli_obj_alias_to( c, &cp );
 
 	// Determine the direction in which to partition (forwards or backwards).
-	dir_t direct = bli_l3_direct( a, &bp, &cp, cntl );
+	const dir_t direct = bli_part_cntl_direct( cntl );
 
 	// Prune any zero region that exists along the partitioning dimension.
-	bli_l3_prune_unref_mparts_n( a, &bp, &cp, cntl );
+	bli_l3_prune_unref_mparts_n( a, &bp, &cp );
 
 	// Determine the current thread's subpartition range.
 	dim_t my_start, my_end;
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
 	bli_thread_range_ndim
 	(
-	  direct, thread, a, &bp, &cp, cntl, cntx,
+	  direct,
+	  bli_part_cntl_blksz_mult( cntl ),
+	  bli_part_cntl_use_weighted( cntl ),
+	  thread, a, &bp, &cp,
 	  &my_start, &my_end
 	);
 
@@ -69,8 +72,9 @@ void bli_gemm_blk_var2
 	for ( dim_t i = my_start; i < my_end; i += b_alg )
 	{
 		// Determine the current algorithmic blocksize.
-		b_alg = bli_determine_blocksize( direct, i, my_end, &bp,
-		                                 bli_cntl_bszid( cntl ), cntx );
+		b_alg = bli_determine_blocksize( direct, i, my_end,
+		                                 bli_part_cntl_blksz_alg( cntl ),
+		                                 bli_part_cntl_blksz_max( cntl ) );
 
 		// Acquire partitions for B1 and C1.
 		obj_t b1, c1;
@@ -82,13 +86,11 @@ void bli_gemm_blk_var2
 		// Perform gemm subproblem.
 		bli_l3_int
 		(
-		  &BLIS_ONE,
 		  a,
 		  &b1,
-		  &BLIS_ONE,
 		  &c1,
 		  cntx,
-		  bli_cntl_sub_node( cntl ),
+		  bli_cntl_sub_node( 0, cntl ),
 		  thread
 		);
 	}
diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c
index d683cfc88..803a8954b 100644
--- a/frame/3/gemm/bli_gemm_blk_var3.c
+++ b/frame/3/gemm/bli_gemm_blk_var3.c
@@ -49,13 +49,13 @@ void bli_gemm_blk_var3
 	bli_obj_alias_to( b, &bp );
 	bli_obj_alias_to( c, &cs );
 
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
 
 	// Determine the direction in which to partition (forwards or backwards).
-	dir_t direct = bli_l3_direct( &ap, &bp, &cs, cntl );
+	const dir_t direct = bli_part_cntl_direct( cntl );
 
 	// Prune any zero region that exists along the partitioning dimension.
-	bli_l3_prune_unref_mparts_k( &ap, &bp, &cs, cntl );
+	bli_l3_prune_unref_mparts_k( &ap, &bp, &cs );
 
 	// Query dimension in partitioning direction.
 	dim_t k_trans = bli_obj_width_after_trans( &ap );
@@ -65,8 +65,9 @@ void bli_gemm_blk_var3
 	for ( dim_t i = 0; i < k_trans; i += b_alg )
 	{
 		// Determine the current algorithmic blocksize.
-		b_alg = bli_l3_determine_kc( direct, i, k_trans, &ap, &bp,
-		                             bli_cntl_bszid( cntl ), cntx, cntl );
+		b_alg = bli_determine_blocksize( direct, i, k_trans,
+		                                 bli_part_cntl_blksz_alg( cntl ),
+		                                 bli_part_cntl_blksz_max( cntl ) );
 
 		// Acquire partitions for A1 and B1.
 		obj_t a1, b1;
@@ -78,13 +79,11 @@ void bli_gemm_blk_var3
 		// Perform gemm subproblem.
 		bli_l3_int
 		(
-		  &BLIS_ONE,
 		  &a1,
 		  &b1,
-		  &BLIS_ONE,
 		  &cs,
 		  cntx,
-		  bli_cntl_sub_node( cntl ),
+		  bli_cntl_sub_node( 0, cntl ),
 		  thread
 		);
 
@@ -108,8 +107,9 @@ void bli_gemm_blk_var3
 		// row-panel of C, and thus beta is applied to all of C exactly once.
 		// Thus, for neither trmm nor trmm3 should we reset the scalar on C
 		// after the first iteration.
-		if ( bli_cntl_family( cntl ) != BLIS_TRMM )
-		if ( i == 0 ) bli_obj_scalar_reset( &cs );
+		if ( i == 0 && !bli_obj_is_triangular( a ) &&
+		               !bli_obj_is_triangular( b ) )
+		    bli_obj_scalar_reset( &cs );
 	}
 }
 
diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c
index 10484adf3..4972502d3 100644
--- a/frame/3/gemm/bli_gemm_cntl.c
+++ b/frame/3/gemm/bli_gemm_cntl.c
@@ -35,155 +35,573 @@
 
 #include "blis.h"
 
-cntl_t* bli_gemm_cntl_create
+
+static packm_ker_ft GENARRAY2_MIXP(packm_struc_cxk,packm_struc_cxk);
+
+void bli_gemm_var_cntl_init_node
      (
-       pool_t* pool,
-       opid_t  family,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       void_fp ker
+       void_fp          var_func,
+       num_t            dt_comp,
+       num_t            dt_out,
+       gemm_ukr_ft      ukr,
+       gemm_ukr_ft      real_ukr,
+       bool             row_pref,
+       dim_t            mr,
+       dim_t            nr,
+       dim_t            mr_scale,
+       dim_t            nr_scale,
+       gemm_var_cntl_t* cntl
      )
 {
-	return bli_gemmbp_cntl_create( pool, family, schema_a, schema_b, ker );
-}
+	// Initialize the gemm_var_cntl_t struct.
+	cntl->dt_comp  = dt_comp;
+	cntl->dt_out   = dt_out;
+	cntl->ukr      = ukr;
+	cntl->real_ukr = real_ukr;
+	cntl->row_pref = row_pref;
+	cntl->mr       = mr;
+	cntl->nr       = nr;
+	cntl->mr_scale = mr_scale;
+	cntl->nr_scale = nr_scale;
 
-// -----------------------------------------------------------------------------
+	bli_cntl_init_node
+	(
+	  var_func,
+	  &cntl->cntl
+	);
+}
 
-cntl_t* bli_gemmbp_cntl_create
+void bli_gemm_cntl_init
      (
-       pool_t* pool,
-       opid_t  family,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       void_fp ker
+             ind_t        im,
+             opid_t       family,
+       const obj_t*       alpha,
+             obj_t*       a,
+             obj_t*       b,
+       const obj_t*       beta,
+             obj_t*       c,
+       const cntx_t*      cntx,
+             gemm_cntl_t* cntl
      )
 {
-	void_fp macro_kernel_fp;
-
-	// Choose the default macrokernel based on the operation family...
-	if      ( family == BLIS_GEMM )  macro_kernel_fp = bli_gemm_ker_var2;
-	else if ( family == BLIS_GEMMT ) macro_kernel_fp =
-	                                   #ifdef BLIS_ENABLE_JRIR_TLB
-	                                   bli_gemmt_x_ker_var2b;
-	                                   #else // ifdef ( _SLAB || _RR )
-	                                   bli_gemmt_x_ker_var2;
-	                                   #endif
-	else if ( family == BLIS_TRMM )  macro_kernel_fp =
-	                                   #ifdef BLIS_ENABLE_JRIR_TLB
-	                                   bli_trmm_xx_ker_var2b;
-	                                   #else // ifdef ( _SLAB || _RR )
-	                                   bli_trmm_xx_ker_var2;
-	                                   #endif
-	else /* should never execute */  macro_kernel_fp = NULL;
-
-	// ...unless a non-NULL kernel function pointer is passed in, in which
-	// case we use that instead.
-	if ( ker ) macro_kernel_fp = ker;
+	      bool   a_is_real = bli_obj_is_real( a );
+	      bool   b_is_real = bli_obj_is_real( b );
+	      bool   c_is_real = bli_obj_is_real( c );
+	const bool   induced   = im != BLIS_NAT ||
+	                         a_is_real != b_is_real ||
+	                         a_is_real != c_is_real ||
+	                         b_is_real != c_is_real;
+	const prec_t comp_prec = bli_obj_comp_prec( c );
+	const num_t  dt_c      = bli_obj_dt( c );
+	const num_t  dt_comp   = ( induced ? BLIS_REAL : bli_dt_domain( dt_c ) ) | comp_prec;
+	const bool   row_pref  = bli_cntx_get_ukr_prefs_dt( dt_comp, BLIS_GEMM_UKR_ROW_PREF, cntx );
+
+	// An optimization: If C is stored by rows and the micro-kernel prefers
+	// contiguous columns, or if C is stored by columns and the micro-kernel
+	// prefers contiguous rows, transpose the entire operation to allow the
+	// micro-kernel to access elements of C in its preferred manner.
+	bool needs_swap = (  row_pref && bli_obj_is_col_tilted( c ) ) ||
+	                  ( !row_pref && bli_obj_is_row_tilted( c ) );
+
+	// NOTE: This case casts right-side symm/hemm/trmm/trmm3 in terms of left side.
+	// This may be necessary when the current subconfiguration uses a gemm microkernel
+	// that assumes that the packing kernel will have already duplicated
+	// (broadcast) element of B in the packed copy of B. Supporting
+	// duplication within the logic that packs micropanels from symmetric
+	// matrices is ugly, but technically supported. This can
+	// lead to the microkernel being executed on an output matrix with the
+	// microkernel's general stride IO case (unless the microkernel supports
+	// both row and column IO cases as well). As a
+	// consequence, those subconfigurations need a way to force the symmetric
+	// matrix to be on the left (and thus the general matrix to be on the
+	// right). So our solution is that in those cases, the subconfigurations
+	// simply #define BLIS_DISABLE_{SYMM,HEMM,TRMM,TRMM3}_RIGHT.
+
+	// If A is being multiplied from the right, transpose all operands
+	// so that we can perform the computation as if A were being multiplied
+	// from the left.
+#ifdef BLIS_DISABLE_SYMM_RIGHT
+	if ( family == BLIS_SYMM ) needs_swap = bli_obj_is_symmetric( b );
+#endif
+#ifdef BLIS_DISABLE_HEMM_RIGHT
+	if ( family == BLIS_HEMM ) needs_swap = bli_obj_is_hermitian( b );
+#endif
+#ifdef BLIS_DISABLE_TRMM_RIGHT
+	if ( family == BLIS_TRMM ) needs_swap = bli_obj_is_triangular( b );
+#endif
+#ifdef BLIS_DISABLE_TRMM3_RIGHT
+	if ( family == BLIS_TRMM3 ) needs_swap = bli_obj_is_triangular( b );
+#endif
+
+	if ( a_is_real && !b_is_real && !c_is_real )
+	{
+		// C := R * C *must* be swapped for column-preferring kernels
+		needs_swap = !row_pref;
+	}
+	else if ( !a_is_real && b_is_real && !c_is_real )
+	{
+		// C := C * R *must* be swapped for row-preferring kernels
+		needs_swap = row_pref;
+	}
+
+	// Swap the A and B operands if required. This transforms the operation
+	// C = alpha A B + beta C into C^T = alpha B^T A^T + beta C^T.
+	if ( needs_swap )
+	{
+		bli_obj_swap( a, b );
+
+		bli_obj_induce_trans( a );
+		bli_obj_induce_trans( b );
+		bli_obj_induce_trans( c );
+
+		bool tmp = a_is_real;
+		a_is_real = b_is_real;
+		b_is_real = tmp;
+	}
+
+	const num_t dt_a  = bli_obj_dt( a );
+	const num_t dt_b  = bli_obj_dt( b );
+	const num_t dt_ap = bli_dt_domain( dt_a ) | comp_prec;
+	const num_t dt_bp = bli_dt_domain( dt_b ) | comp_prec;
+
+	// Cast alpha and beta to the computational precision.
+	// Alpha should be complex if any of A, B, or C are.
+	obj_t alpha_cast, beta_cast;
+	dom_t alpha_dom = bli_obj_is_complex( a ) ||
+	                  bli_obj_is_complex( b ) ||
+	                  bli_obj_is_complex( c ) ? BLIS_COMPLEX : BLIS_REAL;
+	bli_obj_scalar_init_detached_copy_of( alpha_dom | comp_prec,
+	                                      BLIS_NO_CONJUGATE,
+	                                      alpha,
+	                                      &alpha_cast );
+	// Cast beta to the type of C, since we will need to
+	// ignore the imaginary part of beta for real C.
+	bli_obj_scalar_init_detached_copy_of( dt_c,
+	                                      BLIS_NO_CONJUGATE,
+	                                      beta,
+	                                      &beta_cast );
+
+	// Cast the scalars of A and B to the computational precision
+	bli_obj_scalar_cast_to( BLIS_COMPLEX | comp_prec, a );
+	bli_obj_scalar_cast_to( BLIS_COMPLEX | comp_prec, b );
+
+	// If alpha is non-unit, typecast and apply it to the scalar attached
+	// to B, unless alpha is complex and A is complex while B is not.
+	if ( bli_obj_is_complex( &alpha_cast ) &&
+	     bli_obj_is_complex( a ) &&
+	     bli_obj_is_real( b ) )
+	{
+		if ( !bli_obj_equals( &alpha_cast, &BLIS_ONE ) )
+			bli_obj_scalar_apply_scalar( &alpha_cast, a );
+	}
+	else
+	{
+		if ( !bli_obj_equals( &alpha_cast, &BLIS_ONE ) )
+			bli_obj_scalar_apply_scalar( &alpha_cast, b );
+	}
+
+	// If beta is non-unit, typecast and apply it to the scalar attached
+	// to C.
+	if ( !bli_obj_equals( &beta_cast, &BLIS_ONE ) )
+		bli_obj_scalar_apply_scalar( &beta_cast, c );
+
+	void_fp     macro_kernel_fp = bli_gemm_ker_var2;
+	gemm_ukr_ft gemm_ukr        = bli_cntx_get_ukr2_dt( dt_comp, dt_c, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft real_gemm_ukr   = bli_cntx_get_ukr_dt( dt_comp, BLIS_GEMM_UKR, cntx );
+
+	// Set the macrokernel function pointer based on the operation family
+	// and struc/uplo properties.
+#ifdef BLIS_ENABLE_JRIR_TLB
+	if ( family == BLIS_GEMMT )
+	{
+		macro_kernel_fp = bli_obj_is_lower( c ) ? bli_gemmt_l_ker_var2b
+		                                        : bli_gemmt_u_ker_var2b;
+	}
+	else if ( family == BLIS_TRMM || family == BLIS_TRMM3 )
+	{
+		if ( bli_obj_is_triangular( a ) )
+			macro_kernel_fp = bli_obj_is_lower( a ) ? bli_trmm_ll_ker_var2b
+			                                        : bli_trmm_lu_ker_var2b;
+		else /* if ( bli_obj_is_triangular( b ) ) */
+			macro_kernel_fp = bli_obj_is_lower( b ) ? bli_trmm_rl_ker_var2b
+			                                        : bli_trmm_ru_ker_var2b;
+	}
+#else
+	if ( family == BLIS_GEMMT )
+	{
+		macro_kernel_fp = bli_obj_is_lower( c ) ? bli_gemmt_l_ker_var2
+		                                        : bli_gemmt_u_ker_var2;
+	}
+	else if ( family == BLIS_TRMM || family == BLIS_TRMM3 )
+	{
+		if ( bli_obj_is_triangular( a ) )
+			macro_kernel_fp = bli_obj_is_lower( a ) ? bli_trmm_ll_ker_var2
+			                                        : bli_trmm_lu_ker_var2;
+		else /* if ( bli_obj_is_triangular( b ) ) */
+			macro_kernel_fp = bli_obj_is_lower( b ) ? bli_trmm_rl_ker_var2
+			                                        : bli_trmm_ru_ker_var2;
+	}
+#endif
+
+	const bool         trmm_r        = family == BLIS_TRMM && bli_obj_is_triangular( b );
+	const bool         a_lo_tri      = bli_obj_is_triangular( a ) && bli_obj_is_lower( a );
+	const bool         b_up_tri      = bli_obj_is_triangular( b ) && bli_obj_is_upper( b );
+	      pack_t       schema_a      = BLIS_PACKED_PANELS;
+	      pack_t       schema_b      = BLIS_PACKED_PANELS;
+	const packm_ker_ft packm_a_ukr   = packm_struc_cxk[ dt_a ][ dt_ap ];
+	const packm_ker_ft packm_b_ukr   = packm_struc_cxk[ dt_b ][ dt_bp ];
+	const dim_t        mr_def        = bli_cntx_get_blksz_def_dt( dt_comp, BLIS_MR, cntx );
+	const dim_t        mr_pack       = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_MR, cntx );
+	const dim_t        mr_bcast      = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_BBM, cntx );
+	      dim_t        mr_scale      = 1;
+	      dim_t        mr_pack_scale = 1;
+	const dim_t        nr_def        = bli_cntx_get_blksz_def_dt( dt_comp, BLIS_NR, cntx );
+	const dim_t        nr_pack       = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_NR, cntx );
+	const dim_t        nr_bcast      = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_BBN, cntx );
+	      dim_t        nr_scale      = 1;
+	      dim_t        nr_pack_scale = 1;
+	const dim_t        kr_def        = bli_cntx_get_blksz_def_dt( dt_comp, BLIS_KR, cntx );
+	const dim_t        mc_def        = bli_cntx_get_blksz_def_dt( dt_comp, BLIS_MC, cntx );
+	const dim_t        mc_max        = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_MC, cntx );
+	      dim_t        mc_scale      = 1;
+	const dim_t        nc_def        = bli_cntx_get_blksz_def_dt( dt_comp, BLIS_NC, cntx );
+	const dim_t        nc_max        = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_NC, cntx );
+	      dim_t        nc_scale      = 1;
+	const dim_t        kc_def        = bli_cntx_get_blksz_def_dt( dt_comp, BLIS_KC, cntx );
+	const dim_t        kc_max        = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_KC, cntx );
+	      dim_t        kc_scale      = 1;
+
+	if ( im == BLIS_1M )
+	{
+		if ( !row_pref )
+		{
+			schema_a = BLIS_PACKED_PANELS_1E;
+			schema_b = BLIS_PACKED_PANELS_1R;
+			mr_scale = 2;
+			mc_scale = 2;
+			mr_pack_scale = 1; //don't divide PACKMR by 2 since we are also doubling k
+		}
+		else
+		{
+			schema_a = BLIS_PACKED_PANELS_1R;
+			schema_b = BLIS_PACKED_PANELS_1E;
+			nr_scale = 2;
+			nc_scale = 2;
+			nr_pack_scale = 1; //don't divide PACKNR by 2 since we are also doubling k
+		}
+
+		kc_scale = 2;
+		gemm_ukr = bli_cntx_get_ukr2_dt( dt_comp, bli_dt_proj_to_real( dt_c ), BLIS_GEMM1M_UKR, cntx );
+	}
+	else if ( (  c_is_real &&  a_is_real &&  b_is_real ) ||
+	          ( !c_is_real && !a_is_real && !b_is_real ) )
+	{
+		// C_real += A_real * B_real
+		// C_complex += A_complex * B_complex
+		// Nothing to do.
+	}
+	else if ( ( !c_is_real && !a_is_real &&  b_is_real ) ||
+	          ( !c_is_real &&  a_is_real && !b_is_real ) )
+	{
+		// C_complex += A_complex * B_real
+		// C_complex += A_real * B_complex
+
+		// Pack the complex input operand as normal, except that
+		// the (rescaled) real-domain block sizes are used.
+
+		if ( !row_pref )
+		{
+			// We transpose the operation above to make sure that
+			// the complex matrix is on the right side for the storage
+			// preference of the microkernel, but things sometimes go
+			// wrong.
+			if ( a_is_real ) bli_abort();
+			mc_scale = 2;
+			mr_scale = 2;
+			mr_pack_scale = 2;
+		}
+		else
+		{
+			// We transpose the operation above to make sure that
+			// the complex matrix is on the right side for the storage
+			// preference of the microkernel, but things sometimes go
+			// wrong.
+			if ( b_is_real ) bli_abort();
+			nc_scale = 2;
+			nr_scale = 2;
+			nr_pack_scale = 2;
+		}
+
+		// A microkernel wrapper is necessary for cases where C is general-stored
+		// or does not match the storage preference of the real-domain
+		// gemm microkernel, or when beta is complex.
+
+		gemm_ukr = bli_cntx_get_ukr2_dt( dt_comp, bli_dt_proj_to_real( dt_c ), BLIS_GEMM_CCR_UKR, cntx );
+	}
+	else if (  c_is_real && !a_is_real && !b_is_real )
+	{
+		// C_real += A_complex * B_complex
+
+		// Pack both A and B in the 1r format and use 1/2
+		// of the real-domain KC block size since twice as
+		// many values will be packed. One of the matrices
+		// needs to be conjugated to get the right sign
+		// on the imaginary components.
+
+		schema_a = BLIS_PACKED_PANELS_1R;
+		schema_b = BLIS_PACKED_PANELS_1R;
+		kc_scale = 2;
+		bli_obj_toggle_conj( a );
+
+		// A microkernel wrapper is necessary only to scale k by 2
+		// due to the 1r packing schema (or if type conversion is required).
+		// Any complex values of alpha will be applied during packing,
+		// so the real-domain microkernel can do everything directly.
+
+		gemm_ukr = bli_cntx_get_ukr2_dt( dt_comp, bli_dt_proj_to_real( dt_c ), BLIS_GEMM_RCC_UKR, cntx );
+	}
+	else if ( !c_is_real &&  a_is_real &&  b_is_real )
+	{
+		// C_complex += A_real * B_real
+
+		// A microkernel wrapper is always needed to store
+		// only the real part of the AB product, but also deal
+		// with potentially complex alpha and beta scalars.
+
+		gemm_ukr = bli_cntx_get_ukr2_dt( dt_comp, bli_dt_proj_to_real( dt_c ), BLIS_GEMM_CRR_UKR, cntx );
+	}
+	else if ( (  c_is_real && !a_is_real &&  b_is_real ) ||
+	          (  c_is_real &&  a_is_real && !b_is_real ) )
+	{
+		// C_real += A_complex * B_real
+		// C_real += A_real * B_complex
+
+		// Pack only the real part of the complex operand.
+		// If alpha is also complex then it will be applied
+		// during packing.
+
+		if ( a_is_real )
+		{
+			schema_b = BLIS_PACKED_PANELS_RO;
+		}
+		else
+		{
+			schema_a = BLIS_PACKED_PANELS_RO;
+		}
+	}
+
+	//printf("MR: %lld/%lld,  %lld/%lld\n", mr_def, mr_scale, mr_pack, mr_pack_scale);
+	//printf("NR: %lld/%lld,  %lld/%lld\n", nr_def, nr_scale, nr_pack, nr_pack_scale);
 
 	// Create two nodes for the macro-kernel.
-	cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node
+	bli_cntl_init_node
 	(
-	  pool,         // the thread's sba pool
-	  family,       // the operation family
-	  BLIS_MR,
 	  NULL,         // variant function pointer not used
-	  NULL          // no sub-node; this is the leaf of the tree.
+	  &cntl->ir_loop
 	);
 
-	cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_create_node
+	bli_gemm_var_cntl_init_node
 	(
-	  pool,         // the thread's sba pool
-	  family,
-	  BLIS_NR,
 	  macro_kernel_fp,
-	  gemm_cntl_bu_ke
+	  dt_comp,
+	  dt_c,
+	  gemm_ukr,
+	  real_gemm_ukr,
+	  row_pref,
+	  mr_def / mr_scale,
+	  nr_def / nr_scale,
+	  mr_scale,
+	  nr_scale,
+	  &cntl->ker
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_NR,
+	  ( cntl_t* )&cntl->ir_loop,
+	  ( cntl_t* )&cntl->ker
 	);
 
+	// Give the gemm kernel control tree node to the
+	// virtual microkernel as the parameters, so that e.g.
+	// the 1m virtual microkernel can look up the real-domain
+	// micro-kernel and its parameters.
+	bli_gemm_var_cntl_set_params( &cntl->ker, ( cntl_t* )&cntl->ker );
+
 	// Create a node for packing matrix A.
-	cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
+	bli_packm_def_cntl_init_node
 	(
-	  pool,
 	  bli_l3_packa, // pack the left-hand operand
-	  BLIS_MR,
-	  BLIS_KR,
-	  FALSE,        // do NOT invert diagonal
-	  FALSE,        // reverse iteration if upper?
-	  FALSE,        // reverse iteration if lower?
-	  schema_a,     // normally BLIS_PACKED_ROW_PANELS
+	  dt_a,
+	  dt_ap,
+	  dt_comp,
+	  packm_a_ukr,
+	  mr_def / mr_scale,
+	  mr_pack / mr_pack_scale,
+	  mr_bcast,
+	  mr_scale,
+	  mr_pack_scale,
+	  kr_def,
+	  FALSE,
+	  FALSE,
+	  FALSE,
+	  schema_a,
 	  BLIS_BUFFER_FOR_A_BLOCK,
-	  gemm_cntl_bp_bu
+	  &cntl->pack_a
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_NONE,
+	  ( cntl_t* )&cntl->ker,
+	  ( cntl_t* )&cntl->pack_a
 	);
 
 	// Create a node for partitioning the m dimension by MC.
-	cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_create_node
+	bli_part_cntl_init_node
 	(
-	  pool,
-	  family,
-	  BLIS_MC,
 	  bli_gemm_blk_var1,
-	  gemm_cntl_packa
+	  dt_comp,
+	  mc_def / mc_scale,
+	  mc_max / mc_scale,
+	  mc_scale,
+	  mr_def / mr_scale,
+	  mr_scale,
+	  a_lo_tri ? BLIS_BWD
+	           : BLIS_FWD,
+	  bli_obj_is_triangular( a ) || bli_obj_is_upper_or_lower( c ),
+	  &cntl->part_ic
+	);
+	bli_cntl_attach_sub_node
+	(
+	  trmm_r ? BLIS_THREAD_MC | BLIS_THREAD_NC
+	         : BLIS_THREAD_MC,
+	  ( cntl_t* )&cntl->pack_a,
+	  ( cntl_t* )&cntl->part_ic
 	);
 
 	// Create a node for packing matrix B.
-	cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node
+	bli_packm_def_cntl_init_node
 	(
-	  pool,
 	  bli_l3_packb, // pack the right-hand operand
-	  BLIS_NR,
-	  BLIS_KR,
-	  FALSE,        // do NOT invert diagonal
-	  FALSE,        // reverse iteration if upper?
-	  FALSE,        // reverse iteration if lower?
-	  schema_b,     // normally BLIS_PACKED_COL_PANELS
+	  dt_b,
+	  dt_bp,
+	  dt_comp,
+	  packm_b_ukr,
+	  nr_def / nr_scale,
+	  nr_pack / nr_pack_scale,
+	  nr_bcast,
+	  nr_scale,
+	  nr_pack_scale,
+	  kr_def,
+	  FALSE,
+	  FALSE,
+	  FALSE,
+	  schema_b,
 	  BLIS_BUFFER_FOR_B_PANEL,
-	  gemm_cntl_op_bp
+	  &cntl->pack_b
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_NONE,
+	  ( cntl_t* )&cntl->part_ic,
+	  ( cntl_t* )&cntl->pack_b
 	);
 
 	// Create a node for partitioning the k dimension by KC.
-	cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node
+	bli_part_cntl_init_node
 	(
-	  pool,
-	  family,
-	  BLIS_KC,
 	  bli_gemm_blk_var3,
-	  gemm_cntl_packb
+	  dt_comp,
+	  kc_def / kc_scale,
+	  kc_max / kc_scale,
+	  kc_scale,
+	  kr_def,
+	  1,
+	  ( a_lo_tri || b_up_tri ) ? BLIS_BWD
+	                           : BLIS_FWD,
+	  FALSE,
+	  &cntl->part_pc
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_KC,
+	  ( cntl_t* )&cntl->pack_b,
+	  ( cntl_t* )&cntl->part_pc
 	);
 
 	// Create a node for partitioning the n dimension by NC.
-	cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node
+	bli_part_cntl_init_node
 	(
-	  pool,
-	  family,
-	  BLIS_NC,
 	  bli_gemm_blk_var2,
-	  gemm_cntl_mm_op
+	  dt_comp,
+	  nc_def / nc_scale,
+	  nc_max / nc_scale,
+	  nc_scale,
+	  nr_def / nr_scale,
+	  nr_scale,
+	  b_up_tri ? BLIS_BWD
+	           : BLIS_FWD,
+	  bli_obj_is_triangular( b ) || bli_obj_is_upper_or_lower( c ),
+	  &cntl->part_jc
+	);
+	bli_cntl_attach_sub_node
+	(
+	  trmm_r ? BLIS_THREAD_NONE
+	         : BLIS_THREAD_NC,
+	  ( cntl_t* )&cntl->part_pc,
+	  ( cntl_t* )&cntl->part_jc
 	);
 
-	return gemm_cntl_vl_mm;
+	bli_gemm_cntl_finalize
+	(
+	  family,
+	  a,
+	  b,
+	  c,
+	  cntl
+	);
 }
 
-// -----------------------------------------------------------------------------
-
-void bli_gemm_cntl_free
+void bli_gemm_cntl_finalize
      (
-       pool_t* pool,
-       cntl_t* cntl
+             opid_t       family,
+       const obj_t*       a,
+       const obj_t*       b,
+       const obj_t*       c,
+             gemm_cntl_t* cntl
      )
 {
-	bli_cntl_free( pool, cntl );
-}
+	( void )c; // c is not otherwise referenced in this function.
 
-// -----------------------------------------------------------------------------
+	const dim_t ic_mult = bli_part_cntl_blksz_mult( ( cntl_t* )&cntl->part_ic );
+	const dim_t jc_mult = bli_part_cntl_blksz_mult( ( cntl_t* )&cntl->part_jc );
 
-cntl_t* bli_gemm_cntl_create_node
-     (
-       pool_t* pool,
-       opid_t  family,
-       bszid_t bszid,
-       void_fp var_func,
-       cntl_t* sub_node
-     )
-{
-	return bli_cntl_create_node( pool, family, bszid, var_func, NULL, sub_node );
+	//
+	// Ensure that:
+	//
+	// 1. KC is a multiple of MR (NR) if A (B) is triangular, hermitian, or symmetric.
+	//    KC is always rounded up.
+	//
+	// 2. MC and NC are multiples of MR and NR, respectively. MC and NC are always
+	//    rounded down.
+	//
+
+	// Nudge the default and maximum kc blocksizes up to the nearest
+	// multiple of MR if A is Hermitian, symmetric, or triangular or
+	// NR if B is Hermitian, symmetric, or triangular. If neither case
+	// applies, then we leave the blocksizes unchanged. For trsm we
+	// always use MR (rather than sometimes using NR) because even
+	// when the triangle is on the right, packing of that matrix uses
+	// MR, since only left-side trsm micro-kernels are supported.
+	if ( !bli_obj_root_is_general( a ) || family == BLIS_TRSM )
+	{
+		bli_part_cntl_align_blksz_to_mult( ic_mult, true, ( cntl_t* )&cntl->part_pc );
+	}
+	else if ( !bli_obj_root_is_general( b ) )
+	{
+		bli_part_cntl_align_blksz_to_mult( jc_mult, true, ( cntl_t* )&cntl->part_pc );
+	}
+
+	bli_part_cntl_align_blksz( false, ( cntl_t* )&cntl->part_ic );
+	bli_part_cntl_align_blksz( false, ( cntl_t* )&cntl->part_jc );
 }
 
diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h
index 48e0652ca..08fb62696 100644
--- a/frame/3/gemm/bli_gemm_cntl.h
+++ b/frame/3/gemm/bli_gemm_cntl.h
@@ -33,42 +33,440 @@
 
 */
 
-cntl_t* bli_gemm_cntl_create
-     (
-       pool_t* pool,
-       opid_t  family,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       void_fp ker
-     );
 
 // -----------------------------------------------------------------------------
 
-cntl_t* bli_gemmbp_cntl_create
-     (
-       pool_t* pool,
-       opid_t  family,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       void_fp ker
-     );
+struct gemm_var_cntl_s
+{
+	cntl_t      cntl; //this field must be present and come first
+	num_t       dt_comp;
+	num_t       dt_out;
+	gemm_ukr_ft ukr;
+	gemm_ukr_ft real_ukr;
+	const void* params;
+	const void* real_params;
+	dim_t       mr;
+	dim_t       nr;
+	dim_t       mr_scale;
+	dim_t       nr_scale;
+	bool        row_pref;
+};
+typedef struct gemm_var_cntl_s gemm_var_cntl_t;
+
+// -----------------------------------------------------------------------------
+
+BLIS_INLINE gemm_ukr_ft bli_gemm_var_cntl_ukr( const cntl_t* cntl )
+{
+	return ( ( const gemm_var_cntl_t* ) cntl )->ukr;
+}
+
+BLIS_INLINE gemm_ukr_ft bli_gemm_var_cntl_real_ukr( const cntl_t* cntl )
+{
+	return ( ( const gemm_var_cntl_t* ) cntl )->real_ukr;
+}
+
+BLIS_INLINE bool bli_gemm_var_cntl_row_pref( const cntl_t* cntl )
+{
+	return ( ( const gemm_var_cntl_t* ) cntl )->row_pref;
+}
+
+BLIS_INLINE const void* bli_gemm_var_cntl_params( const cntl_t* cntl )
+{
+	return ( ( const gemm_var_cntl_t* ) cntl )->params;
+}
+
+BLIS_INLINE const void* bli_gemm_var_cntl_real_params( const cntl_t* cntl )
+{
+	return ( ( const gemm_var_cntl_t* ) cntl )->real_params;
+}
+
+BLIS_INLINE dim_t bli_gemm_var_cntl_mr( const cntl_t* cntl )
+{
+	return ( ( const gemm_var_cntl_t* ) cntl )->mr;
+}
+
+BLIS_INLINE dim_t bli_gemm_var_cntl_nr( const cntl_t* cntl )
+{
+	return ( ( const gemm_var_cntl_t* ) cntl )->nr;
+}
+
+BLIS_INLINE num_t bli_gemm_var_cntl_comp_dt( const cntl_t* cntl )
+{
+	return ( ( const gemm_var_cntl_t* ) cntl )->dt_comp;
+}
 
 // -----------------------------------------------------------------------------
 
-void bli_gemm_cntl_free
+BLIS_INLINE void bli_gemm_var_cntl_set_ukr( const func2_t* ukr, cntl_t* cntl_ )
+{
+	gemm_var_cntl_t* cntl = ( gemm_var_cntl_t* )cntl_;
+	num_t dt_comp = cntl->dt_comp;
+	num_t dt_out = cntl->dt_out;
+	cntl->ukr = ( gemm_ukr_ft )bli_func2_get_dt( dt_comp, dt_out, ukr );
+}
+
+BLIS_INLINE void bli_gemm_var_cntl_set_real_ukr( const func2_t* ukr, cntl_t* cntl_ )
+{
+	gemm_var_cntl_t* cntl = ( gemm_var_cntl_t* )cntl_;
+	num_t dt_comp = cntl->dt_comp;
+	num_t dt_out = cntl->dt_out;
+	cntl->real_ukr = ( gemm_ukr_ft )bli_func2_get_dt( dt_comp, dt_out, ukr );
+}
+
+BLIS_INLINE err_t bli_gemm_var_cntl_set_ukr_simple( const func_t* ukr, cntl_t* cntl_ )
+{
+	gemm_var_cntl_t* cntl = ( gemm_var_cntl_t* )cntl_;
+	num_t dt_comp = cntl->dt_comp;
+	num_t dt_out = cntl->dt_out;
+	if ( dt_comp != dt_out )
+		return BLIS_INCONSISTENT_DATATYPES;
+	cntl->ukr = ( gemm_ukr_ft )bli_func_get_dt( dt_comp, ukr );
+	return BLIS_SUCCESS;
+}
+
+BLIS_INLINE err_t bli_gemm_var_cntl_set_real_ukr_simple( const func_t* ukr, cntl_t* cntl_ )
+{
+	gemm_var_cntl_t* cntl = ( gemm_var_cntl_t* )cntl_;
+	num_t dt_comp = cntl->dt_comp;
+	num_t dt_out = cntl->dt_out;
+	if ( dt_comp != dt_out )
+		return BLIS_INCONSISTENT_DATATYPES;
+	cntl->real_ukr = ( gemm_ukr_ft )bli_func_get_dt( dt_comp, ukr );
+	return BLIS_SUCCESS;
+}
+
+BLIS_INLINE void bli_gemm_var_cntl_set_row_pref( const mbool_t* row_pref, cntl_t* cntl_ )
+{
+	gemm_var_cntl_t* cntl = ( gemm_var_cntl_t* )cntl_;
+	num_t dt_comp = cntl->dt_comp;
+	cntl->row_pref = bli_mbool_get_dt( dt_comp, row_pref );
+}
+
+BLIS_INLINE void bli_gemm_var_cntl_set_params( const void* params, cntl_t* cntl )
+{
+	( ( gemm_var_cntl_t* ) cntl )->params = params;
+}
+
+BLIS_INLINE void bli_gemm_var_cntl_set_real_params( const void* params, cntl_t* cntl )
+{
+	( ( gemm_var_cntl_t* ) cntl )->real_params = params;
+}
+
+BLIS_INLINE void bli_gemm_var_cntl_set_mr( dim_t mr, cntl_t* cntl )
+{
+	( ( gemm_var_cntl_t* ) cntl )->mr = mr / ( ( gemm_var_cntl_t* ) cntl )->mr_scale;
+}
+
+BLIS_INLINE void bli_gemm_var_cntl_set_nr( dim_t nr, cntl_t* cntl )
+{
+	( ( gemm_var_cntl_t* ) cntl )->nr = nr / ( ( gemm_var_cntl_t* ) cntl )->nr_scale;
+}
+
+BLIS_INLINE void bli_gemm_var_cntl_set_comp_dt( num_t dt, const cntl_t* cntl )
+{
+	( ( gemm_var_cntl_t* ) cntl )->dt_comp = dt;
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_gemm_var_cntl_init_node
      (
-       pool_t* pool,
-       cntl_t* cntl
+       void_fp          var_func,
+       num_t            dt_comp,
+       num_t            dt_out,
+       gemm_ukr_ft      ukr,
+       gemm_ukr_ft      real_ukr,
+       bool             row_pref,
+       dim_t            mr,
+       dim_t            nr,
+       dim_t            mr_scale,
+       dim_t            nr_scale,
+       gemm_var_cntl_t* cntl
      );
 
 // -----------------------------------------------------------------------------
 
-cntl_t* bli_gemm_cntl_create_node
+struct gemm_cntl_s
+{
+         part_cntl_t part_jc;
+         part_cntl_t part_pc;
+    packm_def_cntl_t pack_b;
+         part_cntl_t part_ic;
+    packm_def_cntl_t pack_a;
+     gemm_var_cntl_t ker;
+              cntl_t ir_loop;
+};
+typedef struct gemm_cntl_s gemm_cntl_t;
+
+// -----------------------------------------------------------------------------
+
+BLIS_EXPORT_BLIS void bli_gemm_cntl_init
      (
-       pool_t* pool,
-       opid_t  family,
-       bszid_t bszid,
-       void_fp var_func,
-       cntl_t* sub_node
+             ind_t        im,
+             opid_t       family,
+       const obj_t*       alpha,
+             obj_t*       a,
+             obj_t*       b,
+       const obj_t*       beta,
+             obj_t*       c,
+       const cntx_t*      cntx,
+             gemm_cntl_t* cntl
      );
 
+BLIS_EXPORT_BLIS void bli_gemm_cntl_finalize
+     (
+             opid_t       family,
+       const obj_t*       a,
+       const obj_t*       b,
+       const obj_t*       c,
+             gemm_cntl_t* cntl
+     );
+
+// -----------------------------------------------------------------------------
+
+BLIS_INLINE gemm_ukr_ft bli_gemm_cntl_ukr( gemm_cntl_t* cntl )
+{
+	gemm_ukr_ft real_ukr = bli_gemm_var_cntl_real_ukr( ( cntl_t* )&cntl->ker );
+	return real_ukr ? real_ukr : bli_gemm_var_cntl_ukr( ( cntl_t* )&cntl->ker );
+}
+
+BLIS_INLINE bool bli_gemm_cntl_row_pref( gemm_cntl_t* cntl )
+{
+	return bli_gemm_var_cntl_row_pref( ( cntl_t* )&cntl->ker );
+}
+
+BLIS_INLINE const void* bli_gemm_cntl_params( gemm_cntl_t* cntl )
+{
+	gemm_ukr_ft real_ukr = bli_gemm_var_cntl_real_ukr( ( cntl_t* )&cntl->ker );
+	return real_ukr ? bli_gemm_var_cntl_real_params( ( cntl_t* )&cntl->ker )
+	                : bli_gemm_var_cntl_params( ( cntl_t* )&cntl->ker );
+}
+
+BLIS_INLINE l3_var_oft bli_gemm_cntl_var( gemm_cntl_t* cntl )
+{
+	return ( l3_var_oft )bli_cntl_var_func( ( cntl_t* )&cntl->ker );
+}
+
+BLIS_INLINE packm_ker_ft bli_gemm_cntl_packa_ukr( gemm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_ukr( ( cntl_t* )&cntl->pack_a );
+}
+
+BLIS_INLINE pack_t bli_gemm_cntl_packa_schema( gemm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_pack_schema( ( cntl_t* )&cntl->pack_a );
+}
+
+BLIS_INLINE const void* bli_gemm_cntl_packa_params( gemm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_ukr_params( ( cntl_t* )&cntl->pack_a );
+}
+
+BLIS_INLINE packm_var_oft bli_gemm_cntl_packa_var( gemm_cntl_t* cntl )
+{
+	return bli_packm_cntl_variant( ( cntl_t* )&cntl->pack_a );
+}
+
+BLIS_INLINE packm_ker_ft bli_gemm_cntl_packb_ukr( gemm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_ukr( ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE pack_t bli_gemm_cntl_packb_schema( gemm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_pack_schema( ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE const void* bli_gemm_cntl_packb_params( gemm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_ukr_params( ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE packm_var_oft bli_gemm_cntl_packb_var( gemm_cntl_t* cntl )
+{
+	return bli_packm_cntl_variant( ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE dim_t bli_gemm_cntl_mr_def( gemm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_bmult_m_def( ( cntl_t* )&cntl->pack_a );
+}
+
+BLIS_INLINE dim_t bli_gemm_cntl_mr_pack( gemm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_bmult_m_pack( ( cntl_t* )&cntl->pack_a );
+}
+
+BLIS_INLINE dim_t bli_gemm_cntl_nr_def( gemm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_bmult_m_def( ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE dim_t bli_gemm_cntl_nr_pack( gemm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_bmult_m_pack( ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE dim_t bli_gemm_cntl_mc_def( gemm_cntl_t* cntl )
+{
+	return bli_part_cntl_blksz_alg( ( cntl_t* )&cntl->part_ic );
+}
+
+BLIS_INLINE dim_t bli_gemm_cntl_mc_max( gemm_cntl_t* cntl )
+{
+	return bli_part_cntl_blksz_max( ( cntl_t* )&cntl->part_ic );
+}
+
+BLIS_INLINE dim_t bli_gemm_cntl_nc_def( gemm_cntl_t* cntl )
+{
+	return bli_part_cntl_blksz_alg( ( cntl_t* )&cntl->part_jc );
+}
+
+BLIS_INLINE dim_t bli_gemm_cntl_nc_max( gemm_cntl_t* cntl )
+{
+	return bli_part_cntl_blksz_max( ( cntl_t* )&cntl->part_jc );
+}
+
+BLIS_INLINE dim_t bli_gemm_cntl_kc_def( gemm_cntl_t* cntl )
+{
+	return bli_part_cntl_blksz_alg( ( cntl_t* )&cntl->part_pc );
+}
+
+BLIS_INLINE dim_t bli_gemm_cntl_kc_max( gemm_cntl_t* cntl )
+{
+	return bli_part_cntl_blksz_max( ( cntl_t* )&cntl->part_pc );
+}
+
+// -----------------------------------------------------------------------------
+
+BLIS_INLINE void bli_gemm_cntl_set_ukr( const func2_t* ukr, gemm_cntl_t* cntl )
+{
+	if ( bli_gemm_var_cntl_real_ukr( ( cntl_t* )&cntl->ker ) )
+	{
+		bli_gemm_var_cntl_set_real_ukr( ukr, ( cntl_t* )&cntl->ker );
+	}
+	else
+	{
+		bli_gemm_var_cntl_set_ukr( ukr, ( cntl_t* )&cntl->ker );
+	}
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_ukr_simple( const func_t* ukr, gemm_cntl_t* cntl )
+{
+	if ( bli_gemm_var_cntl_real_ukr( ( cntl_t* )&cntl->ker ) )
+	{
+		bli_gemm_var_cntl_set_real_ukr_simple( ukr, ( cntl_t* )&cntl->ker );
+	}
+	else
+	{
+		bli_gemm_var_cntl_set_ukr_simple( ukr, ( cntl_t* )&cntl->ker );
+	}
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_row_pref( const mbool_t* row_pref, gemm_cntl_t* cntl )
+{
+	bli_gemm_var_cntl_set_row_pref( row_pref, ( cntl_t* )&cntl->ker );
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_params( const void* params, gemm_cntl_t* cntl )
+{
+	if ( bli_gemm_var_cntl_real_ukr( ( cntl_t* )&cntl->ker ) )
+	{
+		bli_gemm_var_cntl_set_real_params( params, ( cntl_t* )&cntl->ker );
+	}
+	else
+	{
+		bli_gemm_var_cntl_set_params( params, ( cntl_t* )&cntl->ker );
+	}
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_var( l3_var_oft var, gemm_cntl_t* cntl )
+{
+	bli_cntl_set_var_func( ( void_fp )var, ( cntl_t* )&cntl->ker );
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_packa_ukr( const func2_t* ukr, gemm_cntl_t* cntl )
+{
+	bli_packm_def_cntl_set_ukr( ukr, ( cntl_t* )&cntl->pack_a );
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_packa_ukr_simple( const func_t* ukr, gemm_cntl_t* cntl )
+{
+	bli_packm_def_cntl_set_ukr_simple( ukr, ( cntl_t* )&cntl->pack_a );
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_packa_schema( pack_t schema, gemm_cntl_t* cntl )
+{
+	bli_packm_def_cntl_set_pack_schema( schema, ( cntl_t* )&cntl->pack_a );
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_packa_params( const void* params, gemm_cntl_t* cntl )
+{
+	bli_packm_def_cntl_set_ukr_params( params, ( cntl_t* )&cntl->pack_a );
+	bli_packm_cntl_set_variant_params( params, ( cntl_t* )&cntl->pack_a );
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_packa_var( packm_var_oft var, gemm_cntl_t* cntl )
+{
+	bli_packm_cntl_set_variant( var, ( cntl_t* )&cntl->pack_a );
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_packb_ukr( const func2_t* ukr, gemm_cntl_t* cntl )
+{
+	bli_packm_def_cntl_set_ukr( ukr, ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_packb_ukr_simple( const func_t* ukr, gemm_cntl_t* cntl )
+{
+	bli_packm_def_cntl_set_ukr_simple( ukr, ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_packb_schema( pack_t schema, gemm_cntl_t* cntl )
+{
+	bli_packm_def_cntl_set_pack_schema( schema, ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_packb_params( const void* params, gemm_cntl_t* cntl )
+{
+	bli_packm_def_cntl_set_ukr_params( params, ( cntl_t* )&cntl->pack_b );
+	bli_packm_cntl_set_variant_params( params, ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_packb_var( packm_var_oft var, gemm_cntl_t* cntl )
+{
+	bli_packm_cntl_set_variant( var, ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_mr( const blksz_t* mr, gemm_cntl_t* cntl )
+{
+	num_t dt = cntl->ker.dt_comp;
+	dim_t mr_dt = bli_blksz_get_def( dt, mr );
+	bli_packm_def_cntl_set_bmult_m( mr, ( cntl_t* )&cntl->pack_a );
+	bli_part_cntl_set_blksz_mult( mr, ( cntl_t* )&cntl->part_ic );
+	bli_gemm_var_cntl_set_mr( mr_dt, ( cntl_t* )&cntl->ker );
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_nr( const blksz_t* nr, gemm_cntl_t* cntl )
+{
+	num_t dt = cntl->ker.dt_comp;
+	dim_t nr_dt = bli_blksz_get_def( dt, nr );
+	bli_packm_def_cntl_set_bmult_m( nr, ( cntl_t* )&cntl->pack_b );
+	bli_part_cntl_set_blksz_mult( nr, ( cntl_t* )&cntl->part_jc );
+	bli_gemm_var_cntl_set_nr( nr_dt, ( cntl_t* )&cntl->ker );
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_mc( const blksz_t* mc, gemm_cntl_t* cntl )
+{
+	bli_part_cntl_set_blksz( mc, ( cntl_t* )&cntl->part_ic );
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_nc( const blksz_t* nc, gemm_cntl_t* cntl )
+{
+	bli_part_cntl_set_blksz( nc, ( cntl_t* )&cntl->part_jc );
+}
+
+BLIS_INLINE void bli_gemm_cntl_set_kc( const blksz_t* kc, gemm_cntl_t* cntl )
+{
+	bli_part_cntl_set_blksz( kc, ( cntl_t* )&cntl->part_pc );
+}
+
diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c
deleted file mode 100644
index fe0dc61a8..000000000
--- a/frame/3/gemm/bli_gemm_front.c
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_gemm_front
-     (
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm
-     )
-{
-	bli_init_once();
-
-	obj_t   a_local;
-	obj_t   b_local;
-	obj_t   c_local;
-
-#if 0
-#ifdef BLIS_ENABLE_SMALL_MATRIX
-	// Only handle small problems separately for homogeneous datatypes.
-	if ( bli_obj_dt( a ) == bli_obj_dt( b ) &&
-	     bli_obj_dt( a ) == bli_obj_dt( c ) &&
-	     bli_obj_comp_prec( c ) == bli_obj_prec( c ) )
-	{
-		err_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl );
-		if ( status == BLIS_SUCCESS ) return;
-	}
-#endif
-#endif
-
-	// Alias A, B, and C in case we need to apply transformations.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( b, &b_local );
-	bli_obj_alias_to( c, &c_local );
-
-	// Set the obj_t buffer field to the location currently implied by the row
-	// and column offsets and then zero the offsets. If any of the original
-	// obj_t's were views into larger matrices, this step effectively makes
-	// those obj_t's "forget" their lineage.
-	bli_obj_reset_origin( &a_local );
-	bli_obj_reset_origin( &b_local );
-	bli_obj_reset_origin( &c_local );
-
-	// An optimization: If C is stored by rows and the micro-kernel prefers
-	// contiguous columns, or if C is stored by columns and the micro-kernel
-	// prefers contiguous rows, transpose the entire operation to allow the
-	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
-	{
-		bli_obj_swap( &a_local, &b_local );
-
-		bli_obj_induce_trans( &a_local );
-		bli_obj_induce_trans( &b_local );
-		bli_obj_induce_trans( &c_local );
-	}
-
-	// Set the pack schemas within the objects.
-	bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
-
-#ifdef BLIS_ENABLE_GEMM_MD
-	cntx_t cntx_local;
-
-	// If any of the storage datatypes differ, or if the computation precision
-	// differs from the storage precision of C, utilize the mixed datatype
-	// code path.
-	// NOTE: If we ever want to support the caller setting the computation
-	// domain explicitly, we will need to check the computation dt against the
-	// storage dt of C (instead of the computation precision against the
-	// storage precision of C).
-	if ( bli_obj_dt( &c_local ) != bli_obj_dt( &a_local ) ||
-	     bli_obj_dt( &c_local ) != bli_obj_dt( &b_local ) ||
-	     bli_obj_comp_prec( &c_local ) != bli_obj_prec( &c_local ) )
-	{
-		// Handle mixed datatype cases in bli_gemm_md(), which may modify
-		// the objects or the context. (If the context is modified, cntx
-		// is adjusted to point to cntx_local.)
-		bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx );
-	}
-#endif
-
-	// Next, we handle the possibility of needing to typecast alpha to the
-	// computation datatype and/or beta to the storage datatype of C.
-
-	// Attach alpha to B, and in the process typecast alpha to the target
-	// datatype of the matrix (which in this case is equal to the computation
-	// datatype).
-	bli_obj_scalar_attach( BLIS_NO_CONJUGATE, alpha, &b_local );
-
-	// Attach beta to C, and in the process typecast beta to the target
-	// datatype of the matrix (which in this case is equal to the storage
-	// datatype of C).
-	bli_obj_scalar_attach( BLIS_NO_CONJUGATE, beta,  &c_local );
-
-	// Change the alpha and beta pointers to BLIS_ONE since the values have
-	// now been typecast and attached to the matrices above.
-	alpha = &BLIS_ONE;
-	beta  = &BLIS_ONE;
-
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_GEMM,
-	  BLIS_LEFT, // ignored for gemm/hemm/symm
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
-	      obj_t* cp    = &c_local;
-	const obj_t* betap = beta;
-
-#ifdef BLIS_ENABLE_GEMM_MD
-#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
-	// If any of the following conditions are met, create a temporary matrix
-	// conformal to C into which we will accumulate the matrix product:
-	// - the storage precision of C differs from the computation precision;
-	// - the domains are mixed as crr;
-	// - the storage format of C does not match the preferred orientation
-	//   of the ccr or crc cases.
-	// Then, after the computation is complete, this matrix will be copied
-	// or accumulated back to C.
-	const bool is_ccr_mismatch =
-	             ( bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
-                   !bli_obj_is_col_stored( &c_local ) );
-	const bool is_crc_mismatch =
-	             ( bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) &&
-                   !bli_obj_is_row_stored( &c_local ) );
-
-	obj_t ct;
-	bool  use_ct = FALSE;
-
-	// FGVZ: Consider adding another guard here that only creates and uses a
-	// temporary matrix for accumulation if k < c * kc, where c is some small
-	// constant like 2. And don't forget to use the same conditional for the
-	// castm() and free() at the end.
-	if (
-	     bli_obj_prec( &c_local ) != bli_obj_comp_prec( &c_local ) ||
-	     bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) ||
-	     is_ccr_mismatch ||
-	     is_crc_mismatch
-	   )
-	{
-		use_ct = TRUE;
-	}
-
-	// If we need a temporary matrix conformal to C for whatever reason,
-	// we create it and prepare to use it now.
-	if ( use_ct )
-	{
-		const dim_t m     = bli_obj_length( &c_local );
-		const dim_t n     = bli_obj_width( &c_local );
-		      inc_t rs    = bli_obj_row_stride( &c_local );
-		      inc_t cs    = bli_obj_col_stride( &c_local );
-
-		      num_t dt_ct = bli_obj_domain( &c_local ) |
-		                    bli_obj_comp_prec( &c_local );
-
-		// When performing the crr case, accumulate to a contiguously-stored
-		// real matrix so we do not have to repeatedly update C with general
-		// stride.
-		if ( bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) )
-			dt_ct = BLIS_REAL | bli_obj_comp_prec( &c_local );
-
-		// When performing the mismatched ccr or crc cases, now is the time
-		// to specify the appropriate storage so the gemm_md_c2r_ref() virtual
-		// microkernel can output directly to C (instead of using a temporary
-		// microtile).
-		if      ( is_ccr_mismatch ) { rs = 1; cs = m; }
-		else if ( is_crc_mismatch ) { rs = n; cs = 1; }
-
-		bli_obj_create( dt_ct, m, n, rs, cs, &ct );
-
-		const num_t dt_exec = bli_obj_exec_dt( &c_local );
-		const num_t dt_comp = bli_obj_comp_dt( &c_local );
-
-		bli_obj_set_target_dt( dt_ct, &ct );
-		bli_obj_set_exec_dt( dt_exec, &ct );
-		bli_obj_set_comp_dt( dt_comp, &ct );
-
-		// A naive approach would cast C to the comptuation datatype,
-		// compute with beta, and then cast the result back to the
-		// user-provided output matrix. However, we employ a different
-		// approach that halves the number of memops on C (or its
-		// typecast temporary) by writing the A*B product directly to
-		// temporary storage, and then using xpbym to scale the
-		// output matrix by beta and accumulate/cast the A*B product.
-		//bli_castm( &c_local, &ct );
-		betap = &BLIS_ZERO;
-
-		cp = &ct;
-	}
-#endif
-#endif
-
-	// Invoke the internal back-end via the thread handler.
-	bli_l3_thread_decorator
-	(
-	  bli_l3_int,
-	  BLIS_GEMM, // operation family id
-	  alpha,
-	  &a_local,
-	  &b_local,
-	  betap,
-	  cp,
-	  cntx,
-	  rntm
-	);
-
-#ifdef BLIS_ENABLE_GEMM_MD
-#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
-	// If we created a temporary matrix conformal to C for whatever reason,
-	// we copy/accumulate the result back to C and then release the object.
-	if ( use_ct )
-	{
-		obj_t beta_local;
-
-		bli_obj_scalar_detach( &c_local, &beta_local );
-
-		//bli_castnzm( &ct, &c_local );
-		bli_xpbym( &ct, &beta_local, &c_local );
-
-		bli_obj_free( &ct );
-	}
-#endif
-#endif
-}
-
diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c
index 6b413d909..298a53840 100644
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -35,46 +35,6 @@
 
 #include "blis.h"
 
-typedef void (*xpbys_mxn_vft)
-    (
-            dim_t m,
-            dim_t n,
-      const void* x, inc_t rs_x, inc_t cs_x,
-      const void* b,
-            void* y, inc_t rs_y, inc_t cs_y
-    );
-
-#undef  GENTFUNC2
-#define GENTFUNC2(ctypex,ctypey,chx,chy,op) \
-\
-BLIS_INLINE void PASTEMAC2(chx,chy,op) \
-    ( \
-            dim_t m, \
-            dim_t n, \
-      const void* x, inc_t rs_x, inc_t cs_x, \
-      const void* b, \
-            void* y, inc_t rs_y, inc_t cs_y \
-    ) \
-{ \
-	const ctypex* restrict x_cast = x; \
-	const ctypey* restrict b_cast = b; \
-	      ctypey* restrict y_cast = y; \
-\
-	PASTEMAC3(chx,chy,chy,xpbys_mxn) \
-	( \
-	  m, n, \
-	  x_cast, rs_x, cs_x, \
-	  b_cast, \
-	  y_cast, rs_y,  cs_y \
-	); \
-}
-
-INSERT_GENTFUNC2_BASIC(xpbys_mxn_fn);
-INSERT_GENTFUNC2_MIX_DP(xpbys_mxn_fn);
-
-static xpbys_mxn_vft GENARRAY2_ALL(xpbys_mxn, xpbys_mxn_fn);
-
-
 void bli_gemm_ker_var2
      (
        const obj_t*     a,
@@ -85,29 +45,32 @@ void bli_gemm_ker_var2
              thrinfo_t* thread_par
      )
 {
-	      num_t  dt_exec   = bli_obj_exec_dt( c );
-	      num_t  dt_c      = bli_obj_dt( c );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
+	const num_t  dt_c      = bli_obj_dt( c );
 
 	const pack_t schema_a  = bli_obj_pack_schema( a );
 	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	      dim_t  m         = bli_obj_length( c );
-	      dim_t  n         = bli_obj_width( c );
-	      dim_t  k         = bli_obj_width( a );
+	const dim_t  m         = bli_obj_length( c );
+	const dim_t  n         = bli_obj_width( c );
+	const dim_t  k         = bli_obj_width( a );
 
 	const char*  a_cast    = bli_obj_buffer_at_off( a );
 	const inc_t  is_a      = bli_obj_imag_stride( a );
-	      dim_t  pd_a      = bli_obj_panel_dim( a );
-	      inc_t  ps_a      = bli_obj_panel_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
 	const char*  b_cast    = bli_obj_buffer_at_off( b );
 	const inc_t  is_b      = bli_obj_imag_stride( b );
-	      dim_t  pd_b      = bli_obj_panel_dim( b );
-	      inc_t  ps_b      = bli_obj_panel_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
 
 	      char*  c_cast    = bli_obj_buffer_at_off( c );
-	      inc_t  rs_c      = bli_obj_row_stride( c );
-	      inc_t  cs_c      = bli_obj_col_stride( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
+	const inc_t  off_m     = bli_obj_row_off( c );
+	const inc_t  off_n     = bli_obj_col_off( c );
 
 	// If any dimension is zero, return immediately.
 	if ( bli_zero_dim3( m, n, k ) ) return;
@@ -123,54 +86,15 @@ void bli_gemm_ker_var2
 
 	// Grab the addresses of the internal scalar buffers for the scalar
 	// merged above and the scalar attached to C.
-	// NOTE: We know that scalar_b is of type dt_exec due to the above code
-	// that casts the scalars of A and B to dt_exec via scalar_a and scalar_b,
+	// NOTE: We know that scalar_b is of type dt_comp due to the above code
+	// that casts the scalars of A and B to dt_comp via scalar_a and scalar_b,
 	// and we know that the internal scalar in C is already of the type dt_c
 	// due to the casting in the implementation of bli_obj_scalar_attach().
 	const char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b );
 	const char* beta_cast  = bli_obj_internal_scalar_buffer( c );
 
-#if 1
-	// Under certain conditions, we can avoid the overhead of calling the 1m
-	// virtual microkernel by having the real-domain macrokernel execute with
-	// the real-domain microkernel. (See the function definition for details.)
-	if ( bli_cntx_method( cntx ) == BLIS_1M )
-	{
-		bli_gemm_ind_recast_1m_params
-		(
-		  &dt_exec,
-		  &dt_c,
-		  schema_a,
-		  c,
-		  &m, &n, &k,
-		  &pd_a, &ps_a,
-		  &pd_b, &ps_b,
-		  &rs_c, &cs_c,
-		  cntx
-		);
-	}
-#endif
-
-#ifdef BLIS_ENABLE_GEMM_MD
-	// Tweak parameters in select mixed domain cases (rcc, crc, ccr).
-	if ( bli_cntx_method( cntx ) == BLIS_NAT )
-	{
-		bli_gemm_md_ker_var2_recast
-		(
-		  &dt_exec,
-		  bli_obj_dt( a ),
-		  bli_obj_dt( b ),
-		  &dt_c,
-		  &m, &n, &k,
-		  &pd_a, &ps_a,
-		  &pd_b, &ps_b,
-		  c,
-		  &rs_c, &cs_c
-		);
-	}
-#endif
-
-	const siz_t dt_size   = bli_dt_size( dt_exec );
+	const siz_t dt_a_size = bli_dt_size( dt_a );
+	const siz_t dt_b_size = bli_dt_size( dt_b );
 	const siz_t dt_c_size = bli_dt_size( dt_c );
 
 	// Alias some constants to simpler names.
@@ -179,26 +103,8 @@ void bli_gemm_ker_var2
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_ft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
-
-	// Query the params field from the obj_t. If it is non-NULL, grab the ukr
-	// field of the params struct. If that function pointer is non-NULL, use it
-	// as our microkernel instead of the default microkernel queried from the
-	// cntx above.
-	const gemm_ker_params_t* params = bli_obj_ker_params( c );
-	gemm_ukr_ft user_ukr = params ? params->ukr : NULL;
-	if ( user_ukr ) gemm_ukr = user_ukr;
-
-	// Temporary C buffer for edge cases. Note that the strides of this
-	// temporary buffer are set so that they match the storage of the
-	// original C matrix. For example, if C is column-stored, ct will be
-	// column-stored as well.
-	char        ct[ BLIS_STACK_BUF_MAX_SIZE ]
-	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
-	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx );
-	const inc_t rs_ct       = ( col_pref ? 1 : NR );
-	const inc_t cs_ct       = ( col_pref ? MR : 1 );
-	const char* zero        = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO );
+	gemm_ukr_ft gemm_ukr = bli_gemm_var_cntl_ukr( cntl );
+	const void* params   = bli_gemm_var_cntl_params( cntl );
 
 	//
 	// Assumptions/assertions:
@@ -223,9 +129,9 @@ void bli_gemm_ker_var2
 	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	const inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_a_size;
 
-	const inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_b_size;
 
 	const inc_t rstep_c = rs_c * MR * dt_c_size;
 	const inc_t cstep_c = cs_c * NR * dt_c_size;
@@ -251,7 +157,7 @@ void bli_gemm_ker_var2
 
 	// Query the number of threads and thread ids for the jr loop around
 	// the microkernel.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
 	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
 	const dim_t jr_tid = bli_thrinfo_work_id( thread );
 
@@ -279,16 +185,18 @@ void bli_gemm_ker_var2
 
 	// Query the number of threads and thread ids for the ir loop around
 	// the microkernel.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
+	thrinfo_t* caucus = bli_thrinfo_sub_node( 0, thread );
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t jr_tid = bli_thrinfo_work_id( thread );
 	const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
 	const dim_t ir_tid = bli_thrinfo_work_id( caucus );
 
 	// Determine the thread range and increment for the 2nd and 1st loops.
 	// NOTE: The definition of bli_thread_range_slrr() will depend on whether
 	// slab or round-robin partitioning was requested at configure-time.
-	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
-	bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
+	bli_thread_range_slrr( jr_tid, jr_nt, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_slrr( ir_tid, ir_nt, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
 
 	// Calculate the total number of microtiles assigned to this thread.
 	dim_t n_ut_for_me = ( ( ir_end + ir_inc - 1 - ir_start ) / ir_inc ) *
@@ -340,53 +248,26 @@ void bli_gemm_ker_var2
 			bli_auxinfo_set_next_a( a2, &aux );
 			bli_auxinfo_set_next_b( b2, &aux );
 
-			// Edge case handling now occurs within the microkernel itself, but
-			// we must still explicitly accumulate to a temporary microtile in
-			// situations where a virtual microkernel is being used, such as
-			// during the 1m method or some cases of mixed datatypes.
-			if ( dt_exec == dt_c )
-			{
-				// Invoke the gemm micro-kernel.
-				gemm_ukr
-				(
-				  m_cur,
-				  n_cur,
-				  k,
-				  ( void* )alpha_cast,
-				  ( void* )a1,
-				  ( void* )b1,
-				  ( void* )beta_cast,
-				           c11, rs_c, cs_c,
-				  &aux,
-				  ( cntx_t* )cntx
-				);
-			}
-			else
-			{
-				// Invoke the gemm micro-kernel.
-				gemm_ukr
-				(
-				  MR,
-				  NR,
-				  k,
-				  ( void* )alpha_cast,
-				  ( void* )a1,
-				  ( void* )b1,
-				  ( void* )zero,
-				           &ct, rs_ct, cs_ct,
-				  &aux,
-				  ( cntx_t* )cntx
-				);
-
-				// Accumulate to C with typecasting.
-				xpbys_mxn[ dt_exec ][ dt_c ]
-				(
-				  m_cur, n_cur,
-				  &ct, rs_ct, cs_ct,
-				  ( void* )beta_cast,
-				  c11, rs_c, cs_c
-				);
-			}
+			// Record the current row (off_m) and column (off_n) offsets into
+			// the C matrix in the auxinfo_t object.
+			bli_auxinfo_set_off_m( off_m + i, &aux );
+			bli_auxinfo_set_off_n( off_n + j, &aux );
+
+			// Edge case handling now occurs within the microkernel itself.
+			// Invoke the gemm micro-kernel.
+			gemm_ukr
+			(
+			  m_cur,
+			  n_cur,
+			  k,
+			  ( void* )alpha_cast,
+			  ( void* )a1,
+			  ( void* )b1,
+			  ( void* )beta_cast,
+			           c11, rs_c, cs_c,
+			  &aux,
+			  ( cntx_t* )cntx
+			);
 
 			// Decrement the number of microtiles assigned to the thread; once
 			// it reaches zero, return immediately.
diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c
deleted file mode 100644
index 1e23d058e..000000000
--- a/frame/3/gemm/bli_gemm_md.c
+++ /dev/null
@@ -1,647 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#ifdef BLIS_ENABLE_GEMM_MD
-
-void bli_gemm_md
-     (
-             obj_t*   a,
-             obj_t*   b,
-       const obj_t*   beta,
-             obj_t*   c,
-             cntx_t*  cntx_local,
-       const cntx_t** cntx
-     )
-{
-	mddm_t doms;
-
-	const bool a_is_real = bli_obj_is_real( a );
-	const bool a_is_comp = bli_obj_is_complex( a );
-	const bool b_is_real = bli_obj_is_real( b );
-	const bool b_is_comp = bli_obj_is_complex( b );
-	const bool c_is_real = bli_obj_is_real( c );
-	const bool c_is_comp = bli_obj_is_complex( c );
-
-	if      ( c_is_real && a_is_real && b_is_real )
-	{
-		// C_real += A_real * B_real
-		doms = bli_gemm_md_rrr( a, b, beta, c, cntx_local, cntx );
-	}
-	else if ( c_is_comp && a_is_comp && b_is_comp )
-	{
-		// C_complex += A_complex * B_complex
-		doms = bli_gemm_md_ccc( a, b, beta, c, cntx_local, cntx );
-	}
-	else if ( c_is_comp && a_is_comp && b_is_real )
-	{
-		// C_complex += A_complex * B_real
-		doms = bli_gemm_md_ccr( a, b, beta, c, cntx_local, cntx );
-	}
-	else if ( c_is_comp && a_is_real && b_is_comp )
-	{
-		// C_complex += A_real * B_complex
-		doms = bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx );
-	}
-	else if ( c_is_real && a_is_comp && b_is_comp )
-	{
-		// C_real += A_complex * B_complex
-		doms = bli_gemm_md_rcc( a, b, beta, c, cntx_local, cntx );
-	}
-	else if ( c_is_comp && a_is_real && b_is_real )
-	{
-		// C_complex += A_real * B_real
-		doms = bli_gemm_md_crr( a, b, beta, c, cntx_local, cntx );
-	}
-	else if ( c_is_real && a_is_comp && b_is_real )
-	{
-		// C_real += A_complex * B_real
-		doms = bli_gemm_md_rcr( a, b, beta, c, cntx_local, cntx );
-	}
-	else if ( c_is_real && a_is_real && b_is_comp )
-	{
-		// C_real += A_real * B_complex
-		doms = bli_gemm_md_rrc( a, b, beta, c, cntx_local, cntx );
-	}
-	else
-	{
-		doms.comp = BLIS_REAL;
-		doms.exec = BLIS_REAL;
-
-		// This should never execute.
-		bli_abort();
-	}
-
-	// Extract the computation and execution domains from the struct
-	// returned above.
-	dom_t dom_comp = doms.comp;
-	dom_t dom_exec = doms.exec;
-
-	// Inspect the computation precision of C. (The user may have set
-	// this explicitly to request the precision in which the computation
-	// should take place.)
-	prec_t prec_comp = bli_obj_comp_prec( c );
-
-	// The computation precision tells us the target precision of A and B.
-	// NOTE: We don't set the target domain here. The target domain would
-	// either be unchanged, or would have been changed in one of the eight
-	// domain cases above.
-	bli_obj_set_target_prec( prec_comp, a );
-	bli_obj_set_target_prec( prec_comp, b );
-
-	// Combine the execution domain with the computation precision to form
-	// the execution datatype. (The computation precision and execution
-	// precision are always equal.)
-	num_t dt_exec = dom_exec | prec_comp;
-
-	// Set the execution datatypes of A, B, and C.
-	bli_obj_set_exec_dt( dt_exec, a );
-	bli_obj_set_exec_dt( dt_exec, b );
-	bli_obj_set_exec_dt( dt_exec, c );
-
-	// Combine the computation precision and computation domain to form the
-	// computation datatype.
-	num_t dt_comp = dom_comp | prec_comp;
-
-	// Set the computation datatypes of A, B, and C.
-	bli_obj_set_comp_dt( dt_comp, a );
-	bli_obj_set_comp_dt( dt_comp, b );
-	bli_obj_set_comp_dt( dt_comp, c );
-
-}
-
-// -----------------------------------------------------------------------------
-
-//                 cab
-mddm_t bli_gemm_md_ccr
-     (
-             obj_t*   a,
-             obj_t*   b,
-       const obj_t*   beta,
-             obj_t*   c,
-             cntx_t*  cntx_local,
-       const cntx_t** cntx
-     )
-{
-	mddm_t doms;
-
-	// We assume that the requested computation domain is complex.
-	//dom_t dom_comp_in = bli_obj_comp_domain( c );
-	//dom_t dom_comp_in = BLIS_COMPLEX;
-
-	// For ccr, the computation (ukernel) will be real, but the execution
-	// will appear complex to other parts of the implementation.
-	doms.comp = BLIS_REAL;
-	doms.exec = BLIS_COMPLEX;
-
-	// Here we construct the computation datatype, which for the ccr case
-	// is equal to the real projection of the execution datatype, and use
-	// that computation datatype to query the corresponding ukernel output
-	// preference.
-	const num_t dt = BLIS_REAL | bli_obj_comp_prec( c );
-	const bool  row_pref
-	      = bli_cntx_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, *cntx );
-
-	// We can only perform this case of mixed-domain gemm, C += A*B where
-	// B is real, if the microkernel prefers column output. If it prefers
-	// row output, we must induce a transposition and perform C += A*B
-	// where A (formerly B) is real.
-	if ( row_pref )
-	{
-		bli_obj_swap( a, b );
-
-		bli_obj_induce_trans( a );
-		bli_obj_induce_trans( b );
-		bli_obj_induce_trans( c );
-
-		// We must swap the pack schemas because the schemas were set before
-		// the objects were swapped.
-		bli_obj_swap_pack_schemas( a, b );
-
-		return bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx );
-	}
-
-	// Create a local copy of the context and then prepare to use this
-	// context instead of the one passed in.
-	*cntx_local = **cntx;
-	*cntx = cntx_local;
-
-	// Copy the real domain blocksizes into the slots of their complex
-	// counterparts.
-	blksz_t blksz_mr = *bli_cntx_get_blksz( BLIS_MR, cntx_local );
-	blksz_t blksz_nr = *bli_cntx_get_blksz( BLIS_NR, cntx_local );
-	blksz_t blksz_mc = *bli_cntx_get_blksz( BLIS_MC, cntx_local );
-	blksz_t blksz_nc = *bli_cntx_get_blksz( BLIS_NC, cntx_local );
-	blksz_t blksz_kc = *bli_cntx_get_blksz( BLIS_KC, cntx_local );
-
-	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_mr, BLIS_SCOMPLEX, &blksz_mr );
-	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mr, BLIS_DCOMPLEX, &blksz_mr );
-
-	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_nr, BLIS_SCOMPLEX, &blksz_nr );
-	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nr, BLIS_DCOMPLEX, &blksz_nr );
-
-	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_mc, BLIS_SCOMPLEX, &blksz_mc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mc, BLIS_DCOMPLEX, &blksz_mc );
-
-	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_nc, BLIS_SCOMPLEX, &blksz_nc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nc, BLIS_DCOMPLEX, &blksz_nc );
-
-	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_kc, BLIS_SCOMPLEX, &blksz_kc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_kc, BLIS_DCOMPLEX, &blksz_kc );
-
-	// Halve both the real and complex MR's (which are both real MR's).
-	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    &blksz_mr );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   &blksz_mr );
-	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_mr );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_mr );
-
-	// Halve both the real and complex MC's (which are both real MC's).
-	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    &blksz_mc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   &blksz_mc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_mc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_mc );
-
-    bli_cntx_set_blksz( BLIS_MR, &blksz_mr, BLIS_MR, cntx_local );
-    bli_cntx_set_blksz( BLIS_NR, &blksz_nr, BLIS_NR, cntx_local );
-    bli_cntx_set_blksz( BLIS_MC, &blksz_mc, BLIS_MR, cntx_local );
-    bli_cntx_set_blksz( BLIS_NC, &blksz_nc, BLIS_NR, cntx_local );
-    bli_cntx_set_blksz( BLIS_KC, &blksz_kc, BLIS_KC, cntx_local );
-
-	// Use the default pack schemas in the objects.
-
-	// Rather than check which complex datatype dt_comp refers to, we set
-	// the mixed-domain virtual microkernel for both types.
-    bli_cntx_set_ukr_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, BLIS_GEMM_VIR_UKR, cntx_local );
-    bli_cntx_set_ukr_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, BLIS_GEMM_VIR_UKR, cntx_local );
-
-	// Return the computation and execution domains.
-	return doms;
-}
-
-// -----------------------------------------------------------------------------
-
-//                 cab
-mddm_t bli_gemm_md_crc
-     (
-             obj_t*   a,
-             obj_t*   b,
-       const obj_t*   beta,
-             obj_t*   c,
-             cntx_t*  cntx_local,
-       const cntx_t** cntx
-     )
-{
-	mddm_t doms;
-
-	// We assume that the requested computation domain is complex.
-	//dom_t dom_comp_in = bli_obj_comp_domain( c );
-	//dom_t dom_comp_in = BLIS_COMPLEX;
-
-	// For crc, the computation (ukernel) will be real, but the execution
-	// will appear complex to other parts of the implementation.
-	doms.comp = BLIS_REAL;
-	doms.exec = BLIS_COMPLEX;
-
-	// Here we construct the computation datatype, which for the crc case
-	// is equal to the real projection of the execution datatype, and use
-	// that computation datatype to query the corresponding ukernel output
-	// preference.
-	const num_t dt = BLIS_REAL | bli_obj_comp_prec( c );
-	const bool  col_pref
-	      = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, *cntx );
-
-	// We can only perform this case of mixed-domain gemm, C += A*B where
-	// A is real, if the microkernel prefers row output. If it prefers
-	// column output, we must induce a transposition and perform C += A*B
-	// where B (formerly A) is real.
-	if ( col_pref )
-	{
-		bli_obj_swap( a, b );
-
-		bli_obj_induce_trans( a );
-		bli_obj_induce_trans( b );
-		bli_obj_induce_trans( c );
-
-		// We must swap the pack schemas because the schemas were set before
-		// the objects were swapped.
-		bli_obj_swap_pack_schemas( a, b );
-
-		return bli_gemm_md_ccr( a, b, beta, c, cntx_local, cntx );
-	}
-
-	// Create a local copy of the context and then prepare to use this
-	// context instead of the one passed in.
-	*cntx_local = **cntx;
-	*cntx = cntx_local;
-
-	// Copy the real domain blocksizes into the slots of their complex
-	// counterparts.
-	blksz_t blksz_mr = *bli_cntx_get_blksz( BLIS_MR, cntx_local );
-	blksz_t blksz_nr = *bli_cntx_get_blksz( BLIS_NR, cntx_local );
-	blksz_t blksz_mc = *bli_cntx_get_blksz( BLIS_MC, cntx_local );
-	blksz_t blksz_nc = *bli_cntx_get_blksz( BLIS_NC, cntx_local );
-	blksz_t blksz_kc = *bli_cntx_get_blksz( BLIS_KC, cntx_local );
-
-	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_mr, BLIS_SCOMPLEX, &blksz_mr );
-	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mr, BLIS_DCOMPLEX, &blksz_mr );
-
-	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_nr, BLIS_SCOMPLEX, &blksz_nr );
-	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nr, BLIS_DCOMPLEX, &blksz_nr );
-
-	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_mc, BLIS_SCOMPLEX, &blksz_mc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mc, BLIS_DCOMPLEX, &blksz_mc );
-
-	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_nc, BLIS_SCOMPLEX, &blksz_nc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nc, BLIS_DCOMPLEX, &blksz_nc );
-
-	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_kc, BLIS_SCOMPLEX, &blksz_kc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_kc, BLIS_DCOMPLEX, &blksz_kc );
-
-	// Halve both the real and complex NR's (which are both real NR's).
-	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    &blksz_nr );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   &blksz_nr );
-	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_nr );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_nr );
-
-	// Halve both the real and complex NC's (which are both real NC's).
-	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    &blksz_nc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   &blksz_nc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_nc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_nc );
-
-    bli_cntx_set_blksz( BLIS_MR, &blksz_mr, BLIS_MR, cntx_local );
-    bli_cntx_set_blksz( BLIS_NR, &blksz_nr, BLIS_NR, cntx_local );
-    bli_cntx_set_blksz( BLIS_MC, &blksz_mc, BLIS_MR, cntx_local );
-    bli_cntx_set_blksz( BLIS_NC, &blksz_nc, BLIS_NR, cntx_local );
-    bli_cntx_set_blksz( BLIS_KC, &blksz_kc, BLIS_KC, cntx_local );
-
-	// Use the default pack schemas in the objects.
-
-	// Rather than check which complex datatype dt_comp refers to, we set
-	// the mixed-domain virtual microkernel for both types.
-    bli_cntx_set_ukr_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, BLIS_GEMM_VIR_UKR, cntx_local );
-    bli_cntx_set_ukr_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, BLIS_GEMM_VIR_UKR, cntx_local );
-
-	// Return the computation and execution domains.
-	return doms;
-}
-
-// -----------------------------------------------------------------------------
-
-//                 cab
-mddm_t bli_gemm_md_rcc
-     (
-             obj_t*   a,
-             obj_t*   b,
-       const obj_t*   beta,
-             obj_t*   c,
-             cntx_t*  cntx_local,
-       const cntx_t** cntx
-     )
-{
-	mddm_t doms;
-
-	// We assume that the requested computation domain is complex.
-	//dom_t dom_comp_in = bli_obj_comp_domain( c );
-	//dom_t dom_comp_in = BLIS_COMPLEX;
-
-	// For rcc, the computation (ukernel) will be real, and since the output
-	// matrix C is also real, so must be the execution domain.
-	doms.comp = BLIS_REAL;
-	doms.exec = BLIS_REAL;
-
-	// Create a local copy of the context and then prepare to use this
-	// context instead of the one passed in.
-	*cntx_local = **cntx;
-	*cntx = cntx_local;
-
-	// Copy the real domain blocksizes into the slots of their complex
-	// counterparts.
-	blksz_t blksz_mr = *bli_cntx_get_blksz( BLIS_MR, cntx_local );
-	blksz_t blksz_nr = *bli_cntx_get_blksz( BLIS_NR, cntx_local );
-	blksz_t blksz_mc = *bli_cntx_get_blksz( BLIS_MC, cntx_local );
-	blksz_t blksz_nc = *bli_cntx_get_blksz( BLIS_NC, cntx_local );
-	blksz_t blksz_kc = *bli_cntx_get_blksz( BLIS_KC, cntx_local );
-
-	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_mr, BLIS_SCOMPLEX, &blksz_mr );
-	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mr, BLIS_DCOMPLEX, &blksz_mr );
-
-	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_nr, BLIS_SCOMPLEX, &blksz_nr );
-	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nr, BLIS_DCOMPLEX, &blksz_nr );
-
-	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_mc, BLIS_SCOMPLEX, &blksz_mc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mc, BLIS_DCOMPLEX, &blksz_mc );
-
-	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_nc, BLIS_SCOMPLEX, &blksz_nc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nc, BLIS_DCOMPLEX, &blksz_nc );
-
-	bli_blksz_copy_dt( BLIS_FLOAT,  &blksz_kc, BLIS_SCOMPLEX, &blksz_kc );
-	bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_kc, BLIS_DCOMPLEX, &blksz_kc );
-
-	// Halve both the real and complex KC's (which are both real KC's).
-	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    &blksz_kc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   &blksz_kc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_kc );
-	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_kc );
-
-    bli_cntx_set_blksz( BLIS_MR, &blksz_mr, BLIS_MR, cntx_local );
-    bli_cntx_set_blksz( BLIS_NR, &blksz_nr, BLIS_NR, cntx_local );
-    bli_cntx_set_blksz( BLIS_MC, &blksz_mc, BLIS_MR, cntx_local );
-    bli_cntx_set_blksz( BLIS_NC, &blksz_nc, BLIS_NR, cntx_local );
-    bli_cntx_set_blksz( BLIS_KC, &blksz_kc, BLIS_KC, cntx_local );
-
-	// Use the 1r pack schema for both A and B with the conjugation
-	// of A or B toggled (to produce ar * br - ai * bi).
-	bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS_1R, a );
-	bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS_1R, b );
-
-	bli_obj_toggle_conj( b );
-
-	// We also need to copy over the packm kernels from the 1m
-	// context. We query the address of that context here.
-	// NOTE: This is needed for situations where the rcc case does not
-	// involve any casting to different precisions, since currently
-	// bli_packm_blk_var1() is coded to hand off control to
-	// bli_packm_blk_var1_md() only when the storage datatype differs from
-	// the target datatype. (The packm_blk_var1_md() function has "built-in"
-	// support for packing to 1r (and 1e) schemas, whereas the
-	// packm_blk_var1() function relies on packm kernels for packing to 1r.
-	const cntx_t* cntx_1m     = bli_gks_query_ind_cntx( BLIS_1M );
-
-	const func_t* packm_1m_mr = bli_cntx_get_ukrs( BLIS_PACKM_MRXK_KER, cntx_1m );
-	const func_t* packm_1m_nr = bli_cntx_get_ukrs( BLIS_PACKM_NRXK_KER, cntx_1m );
-
-    bli_cntx_set_ukr( BLIS_PACKM_MRXK_KER, packm_1m_mr, cntx_local );
-    bli_cntx_set_ukr( BLIS_PACKM_NRXK_KER, packm_1m_nr, cntx_local );
-
-	// Return the computation and execution domains.
-	return doms;
-}
-
-// -----------------------------------------------------------------------------
-
-//                 cab
-mddm_t bli_gemm_md_crr
-     (
-             obj_t*   a,
-             obj_t*   b,
-       const obj_t*   beta,
-             obj_t*   c,
-             cntx_t*  cntx_local,
-       const cntx_t** cntx
-     )
-{
-	mddm_t doms;
-#ifndef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
-	obj_t  c_real;
-#endif
-
-	// We assume that the requested computation domain is real.
-	//dom_t dom_comp_in = bli_obj_comp_domain( c );
-	//dom_t dom_comp_in = BLIS_REAL;
-
-	// For crr, the computation (ukernel) will be real, and since we will
-	// be updating only the real part of the output matrix C, the exectuion
-	// domain is also real.
-	doms.comp = BLIS_REAL;
-	doms.exec = BLIS_REAL;
-
-	// Since the A*B product is real, we can update only the real part of
-	// C. Thus, we convert the obj_t for the complex matrix to one that
-	// represents only the real part. HOWEVER, there are two situations in
-	// which we forgo this trick:
-	// - If extra memory optimizations are enabled, we should leave C alone
-	//   since we'll be computing A*B to a temporary matrix and accumulating
-	//   that result back to C, and in order for that to work, we need to
-	//   allow that code to continue accessing C as a complex matrix.
-	// - Even if extra memory optimizations are diabled, logically projecting
-	//   C as a real matrix can still cause problems if beta is non-unit. In
-	//   that situation, the implementation won't get a chance to scale the
-	//   imaginary components of C by beta, and thus it would compute the
-	//   wrong answer. Thus, if beta is non-unit, we must leave C alone.
-#ifndef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
-	if ( bli_obj_equals( beta, &BLIS_ONE ) )
-	{
-		bli_obj_real_part( c, &c_real );
-
-		// Overwrite the complex obj_t with its real-only alias.
-		*c = c_real;
-	}
-#endif
-
-	// Use the default pack schemas in the objects.
-
-	// Return the computation and execution domains.
-	return doms;
-}
-
-// -----------------------------------------------------------------------------
-
-//                 cab
-mddm_t bli_gemm_md_rcr
-     (
-             obj_t*   a,
-             obj_t*   b,
-       const obj_t*   beta,
-             obj_t*   c,
-             cntx_t*  cntx_local,
-       const cntx_t** cntx
-     )
-{
-	mddm_t doms;
-	obj_t  a_real;
-
-	// We assume that the requested computation domain is real.
-	//dom_t dom_comp_in = bli_obj_comp_domain( c );
-	//dom_t dom_comp_in = BLIS_REAL;
-
-	// For rcr, the computation (ukernel) will be real, and since the output
-	// matrix C is also real, so must be the execution domain.
-	doms.comp = BLIS_REAL;
-	doms.exec = BLIS_REAL;
-
-	// Convert the obj_t for the complex matrix to one that represents only
-	// the real part.
-	bli_obj_real_part( a, &a_real );
-
-	// Overwrite the complex obj_t with its real-only alias.
-	*a = a_real;
-
-	// Use the default pack schemas in the objects.
-
-	// Return the computation and execution domains.
-	return doms;
-}
-
-// -----------------------------------------------------------------------------
-
-//                 cab
-mddm_t bli_gemm_md_rrc
-     (
-             obj_t*   a,
-             obj_t*   b,
-       const obj_t*   beta,
-             obj_t*   c,
-             cntx_t*  cntx_local,
-       const cntx_t** cntx
-     )
-{
-	mddm_t doms;
-	obj_t  b_real;
-
-	// We assume that the requested computation domain is real.
-	//dom_t dom_comp_in = bli_obj_comp_domain( c );
-	//dom_t dom_comp_in = BLIS_REAL;
-
-	// For rcr, the computation (ukernel) will be real, and since the output
-	// matrix C is also real, so must be the execution domain.
-	doms.comp = BLIS_REAL;
-	doms.exec = BLIS_REAL;
-
-	// Convert the obj_t for the complex matrix to one that represents only
-	// the real part.
-	bli_obj_real_part( b, &b_real );
-
-	// Overwrite the complex obj_t with its real-only alias.
-	*b = b_real;
-
-	// Use the default pack schemas in the objects.
-
-	// Return the computation and execution domains.
-	return doms;
-}
-
-// -----------------------------------------------------------------------------
-
-//                 cab
-mddm_t bli_gemm_md_rrr
-     (
-             obj_t*   a,
-             obj_t*   b,
-       const obj_t*   beta,
-             obj_t*   c,
-             cntx_t*  cntx_local,
-       const cntx_t** cntx
-     )
-{
-	mddm_t doms;
-
-	// We assume that the requested computation domain is real.
-	//dom_t dom_comp_in = bli_obj_comp_domain( c );
-	//dom_t dom_comp_in = BLIS_REAL;
-
-	// For rrr, the computation (ukernel) and execution domains are both
-	// real.
-	doms.comp = BLIS_REAL;
-	doms.exec = BLIS_REAL;
-
-	// Use the default pack schemas in the objects.
-
-	// Return the computation and execution domains.
-	return doms;
-}
-
-// -----------------------------------------------------------------------------
-
-//                 cab
-mddm_t bli_gemm_md_ccc
-     (
-             obj_t*   a,
-             obj_t*   b,
-       const obj_t*   beta,
-             obj_t*   c,
-             cntx_t*  cntx_local,
-       const cntx_t** cntx
-     )
-{
-	mddm_t doms;
-
-	// We assume that the requested computation domain is complex.
-	//dom_t dom_comp_in = bli_obj_comp_domain( c );
-	//dom_t dom_comp_in = BLIS_COMPLEX;
-
-	// For ccc, the computation (ukernel) and execution domains are both
-	// complex.
-	doms.comp = BLIS_COMPLEX;
-	doms.exec = BLIS_COMPLEX;
-
-	// Use the default pack schemas in the objects.
-
-	// Return the computation and execution domains.
-	return doms;
-}
-
-#endif
diff --git a/frame/3/gemm/bli_gemm_md.h b/frame/3/gemm/bli_gemm_md.h
deleted file mode 100644
index d71d97987..000000000
--- a/frame/3/gemm/bli_gemm_md.h
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "bli_gemm_md_c2r_ref.h"
-
-// Define a local struct type that makes returning two values easier.
-typedef struct mddm_s
-{
-	dom_t comp;
-	dom_t exec;
-} mddm_t;
-
-void bli_gemm_md
-     (
-             obj_t*   a,
-             obj_t*   b,
-       const obj_t*   beta,
-             obj_t*   c,
-             cntx_t*  cntx_local,
-       const cntx_t** cntx
-     );
-mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx );
-mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx );
-mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx );
-mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx );
-mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx );
-mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx );
-mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx );
-mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx );
-
-// -----------------------------------------------------------------------------
-
-void bli_gemm_md_front
-     (
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
-     );
-
-void bli_gemm_md_zgemm
-     (
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm,
-             cntl_t* cntl
-     );
-
-// -----------------------------------------------------------------------------
-
-BLIS_INLINE bool bli_gemm_md_is_crr( const obj_t* a, const obj_t* b, const obj_t* c )
-{
-	bool r_val = FALSE;
-
-	// NOTE: The last conditional subexpression is necessary if/when we
-	// allow the user to specify the computation domain. (The computation
-	// domain is currently ignored, but once it is honored as a user-
-	// settable value, it will affect the execution domain, which is what
-	// is checked below. Until then, the last expression is not actually
-	// necessary since crr is already unconditionally associated with an
-	// execution domain of BLIS_REAL.)
-	if ( bli_obj_is_complex( c ) &&
-	     bli_obj_is_real( a )    &&
-	     bli_obj_is_real( b )    &&
-	     bli_obj_exec_domain( c ) == BLIS_REAL )
-		r_val = TRUE;
-
-	return r_val;
-}
-
-BLIS_INLINE bool bli_gemm_md_is_ccr( const obj_t* a, const obj_t* b, const obj_t* c )
-{
-	bool r_val = FALSE;
-
-	// NOTE: The last conditional subexpression is necessary if/when we
-	// allow the user to specify the computation domain. (The computation
-	// domain is currently ignored, but once it is honored as a user-
-	// settable value, it will affect the execution domain, which is what
-	// is checked below. Until then, the last expression is not actually
-	// necessary since ccr is already unconditionally associated with an
-	// execution domain of BLIS_COMPLEX.)
-	if ( bli_obj_is_complex( c ) &&
-	     bli_obj_is_complex( a ) &&
-	     bli_obj_is_real( b )    &&
-	     bli_obj_exec_domain( c ) == BLIS_COMPLEX )
-		r_val = TRUE;
-
-	return r_val;
-}
-
-BLIS_INLINE bool bli_gemm_md_is_crc( const obj_t* a, const obj_t* b, const obj_t* c )
-{
-	bool r_val = FALSE;
-
-	// NOTE: The last conditional subexpression is necessary if/when we
-	// allow the user to specify the computation domain. (The computation
-	// domain is currently ignored, but once it is honored as a user-
-	// settable value, it will affect the execution domain, which is what
-	// is checked below. Until then, the last expression is not actually
-	// necessary since crc is already unconditionally associated with an
-	// execution domain of BLIS_COMPLEX.)
-	if ( bli_obj_is_complex( c ) &&
-	     bli_obj_is_real( a )    &&
-	     bli_obj_is_complex( b ) &&
-	     bli_obj_exec_domain( c ) == BLIS_COMPLEX )
-		r_val = TRUE;
-
-	return r_val;
-}
-
-// -----------------------------------------------------------------------------
-
-BLIS_INLINE void bli_gemm_md_ker_var2_recast
-     (
-             num_t* dt_comp,
-             num_t  dt_a,
-             num_t  dt_b,
-             num_t* dt_c,
-             dim_t* m,
-             dim_t* n,
-             dim_t* k,
-             inc_t* pd_a, inc_t* ps_a,
-             inc_t* pd_b, inc_t* ps_b,
-       const obj_t* c,
-             inc_t* rs_c, inc_t* cs_c
-     )
-{
-	if      ( bli_is_real( *dt_c )    &&
-	          bli_is_complex( dt_a ) &&
-	          bli_is_complex( dt_b ) )
-	{
-		// The rcc case is executed with a real macrokernel, so we need to
-		// double the k dimension (because both A and B are packed to the 1r
-		// schema), and also the panel strides of A and B since they were
-		// packed as complex matrices and we now need to convert them to
-		// units of real elements.
-		*k *= 2;
-		*ps_a *= 2;
-		*ps_b *= 2;
-	}
-	else if ( bli_is_complex( *dt_c ) &&
-	          bli_is_real( dt_a )    &&
-	          bli_is_complex( dt_b ) )
-	{
-#if 1
-		obj_t beta;
-
-		bli_obj_scalar_detach( c, &beta );
-
-		if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) &&
-		     bli_obj_imag_is_zero( &beta ) &&
-		     bli_is_row_stored( *rs_c, *cs_c ) &&
-		     bli_obj_prec( c ) == bli_obj_comp_prec( c ) )
-		{
-			// If beta is real, and C is not general-stored, and the computation
-			// precision is equal to the storage precision of C, we can use the
-			// real macrokernel (and real microkernel, which is already stored
-			// to the real virtual microkernel slots of the context) instead of
-			// the complex macrokernel and c2r virtual microkernel.
-			*dt_comp = bli_dt_proj_to_real( *dt_comp );
-			*dt_c = bli_dt_proj_to_real( *dt_c );
-			*n *= 2;
-			*pd_b *= 2; *ps_b *= 2;
-			*rs_c *= 2;
-		}
-		else
-#endif
-		{
-			// Generally speaking, the crc case is executed with a complex
-			// macrokernel, so we need to halve the panel stride of A (which
-			// is real) since the macrokernel will perform the pointer
-			// arithmetic in units of complex elements.
-			*ps_a /= 2;
-		}
-	}
-	else if ( bli_is_complex( *dt_c ) &&
-	          bli_is_complex( dt_a ) &&
-	          bli_is_real( dt_b ) )
-	{
-#if 1
-		obj_t beta;
-
-		bli_obj_scalar_detach( c, &beta );
-
-		if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) &&
-		     bli_obj_imag_is_zero( &beta ) &&
-		     bli_is_col_stored( *rs_c, *cs_c ) &&
-		     bli_obj_prec( c ) == bli_obj_comp_prec( c ) )
-		{
-			// If beta is real, and C is not general-stored, and the computation
-			// precision is equal to the storage precision of C, we can use the
-			// real macrokernel (and real microkernel, which is already stored
-			// to the real virtual microkernel slots of the context) instead of
-			// the complex macrokernel and c2r virtual microkernel.
-			*dt_comp = bli_dt_proj_to_real( *dt_comp );
-			*dt_c = bli_dt_proj_to_real( *dt_c );
-			*m *= 2;
-			*pd_a *= 2; *ps_a *= 2;
-			*cs_c *= 2;
-		}
-		else
-#endif
-		{
-			// Generally speaking, the ccr case is executed with a complex
-			// macrokernel, so we need to halve the panel stride of B (which
-			// is real) since the macrokernel will perform the pointer
-			// arithmetic in units of complex elements.
-			*ps_b /= 2;
-		}
-	}
-#if 0
-	else if ( bli_is_real( dt_c ) &&
-	          bli_is_real( dt_a ) &&
-	          bli_is_real( dt_b ) )
-	{
-		// No action needed.
-//printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k );
-	}
-	else if ( bli_is_complex( dt_c ) &&
-	          bli_is_real( dt_a ) &&
-	          bli_is_real( dt_b ) )
-	{
-		// No action needed.
-	}
-	else if ( bli_is_real( dt_c ) &&
-	          bli_is_complex( dt_a ) &&
-	          bli_is_real( dt_b ) )
-	{
-		// No action needed.
-	}
-	else if ( bli_is_real( dt_c ) &&
-	          bli_is_real( dt_a ) &&
-	          bli_is_complex( dt_b ) )
-	{
-		// No action needed.
-	}
-#endif
-}
-
diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.c b/frame/3/gemm/bli_gemm_md_c2r_ref.c
deleted file mode 100644
index ada9ed858..000000000
--- a/frame/3/gemm/bli_gemm_md_c2r_ref.c
+++ /dev/null
@@ -1,242 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#ifdef BLIS_ENABLE_GEMM_MD
-
-#undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, suf ) \
-\
-void PASTEMAC2(ch,opname,suf) \
-     ( \
-             dim_t      m, \
-             dim_t      n, \
-             dim_t      k, \
-       const void*      alpha, \
-       const void*      a, \
-       const void*      b, \
-       const void*      beta, \
-             void*      c, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
-       const cntx_t*    cntx  \
-     ) \
-{ \
-	const num_t       dt        = PASTEMAC(ch,type); \
-	const num_t       dt_r      = PASTEMAC(chr,type); \
-\
-	      gemm_ukr_ft rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
-	const bool        col_pref  = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
-	const bool        row_pref  = !col_pref; \
-\
-	const dim_t       mr        = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	const dim_t       nr        = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
-\
-	      dim_t       mr_r      = mr; \
-	      dim_t       nr_r      = nr; \
-\
-	      ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                      / sizeof( ctype_r ) ] \
-	                      __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	      inc_t       rs_ct; \
-	      inc_t       cs_ct; \
-\
-	const ctype_r*    a_r       = ( ctype_r* )a; \
-\
-	const ctype_r*    b_r       = ( ctype_r* )b; \
-\
-	const ctype_r*    zero_r    = PASTEMAC(chr,0); \
-\
-	const ctype_r*    alpha_r   = &PASTEMAC(ch,real)( *(( ctype* )alpha) ); \
-	   /* ctype_r*    alpha_i   = &PASTEMAC(ch,imag)( *(( ctype* )alpha) ); */ \
-\
-	const ctype_r*    beta_r    = &PASTEMAC(ch,real)( *(( ctype* )beta) ); \
-	const ctype_r*    beta_i    = &PASTEMAC(ch,imag)( *(( ctype* )beta) ); \
-\
-	      bool        using_ct; \
-\
-	/* This virtual microkernel is used by ccr and crc mixed-domain cases
-	   when any of the following conditions are met:
-	   - beta is complex (ie: has a non-zero imaginary component)
-	   - C is general-stored
-	   - the computation precision differs from the storage of C
-	   If, however, none of the above conditions are met, then the real
-	   domain macrokernel can be (and will be) called instead of calling
-	   the complex macrokernel (and this virtual microkernel). */ \
-\
-/*
-PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: a", mr, k, \
-                       a_r, 1, mr, "%5.2f", "" ); \
-PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: b", k, nr, \
-                       b_r, nr, 1, "%5.2f", "" ); \
-PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \
-                       c_use, rs_c_use, cs_c_use, "%5.2f", "" ); \
-*/ \
-\
-	/* SAFETY CHECK: The higher level implementation should never
-	   allow an alpha with non-zero imaginary component to be passed
-	   in, because it can't be applied properly using the 1m method.
-	   If alpha is not real, then something is very wrong. */ \
-/*
-	if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
-		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
-*/ \
-\
-	/* If beta has a non-zero imaginary component OR if c is stored with
-	   general stride, then we compute the alpha*a*b product into temporary
-	   storage and then accumulate that result into c afterwards. Note that
-	   the other two cases concerning disagreement between the storage of C
-	   and the output preference of the micro-kernel, should ONLY occur in
-	   the context of trsm, whereby this virtual micro-kernel is called
-	   directly from the trsm macro-kernel to update the micro-tile b11
-	   that exists within the packed row-panel of B. Indeed that is the
-	   reason those cases MUST be explicitly handled. */ \
-	if      ( !PASTEMAC(chr,eq0)( *beta_i ) )               using_ct = TRUE; \
-	else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
-	else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \
-	else if ( bli_is_gen_stored( rs_c, cs_c ) )             using_ct = TRUE; \
-	else                                                    using_ct = FALSE; \
-\
-\
-	if ( using_ct ) \
-	{ \
-		/* In the atypical cases, we compute the result into temporary
-		   workspace ct and then accumulate it back to c at the end. */ \
-\
-		/* Set the strides of ct based on the preference of the underlying
-		   native real domain gemm micro-kernel. Note that we set the ct
-		   strides in units of complex elements. */ \
-		if ( col_pref ) { rs_ct = 1;  cs_ct = mr; } \
-		else            { rs_ct = nr; cs_ct = 1; } \
-\
-		ctype_r* c_use    = ( ctype_r* )ct; \
-		inc_t    rs_c_use = rs_ct; \
-		inc_t    cs_c_use = cs_ct; \
-\
-		/* Convert the strides and corresponding microtile dimension from being
-		   in units of complex elements to be in units of real elements. */ \
-		if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) { cs_c_use *= 2; mr_r *= 2; } \
-		else                                           { rs_c_use *= 2; nr_r *= 2; }\
-\
-		/* c = beta * c + alpha_r * a * b; */ \
-		rgemm_ukr \
-		( \
-		  mr_r, \
-		  nr_r, \
-		  k, \
-		  alpha_r, \
-		  a_r, \
-		  b_r, \
-		  zero_r, \
-		  c_use, rs_c_use, cs_c_use, \
-		  data, \
-		  cntx  \
-		); \
-\
-		/* Accumulate the final result in ct back to c. */ \
-		if ( PASTEMAC(ch,eq1)( *(( ctype* )beta) ) ) \
-		{ \
-			for ( dim_t j = 0; j < n; ++j ) \
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				PASTEMAC(ch,adds) \
-				( \
-				  *(          ct + i*rs_ct + j*cs_ct), \
-				  *(( ctype* )c  + i*rs_c  + j*cs_c )  \
-				); \
-			} \
-		} \
-		else if ( PASTEMAC(ch,eq0)( *(( ctype* )beta )) ) \
-		{ \
-			for ( dim_t j = 0; j < n; ++j ) \
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				PASTEMAC(ch,copys) \
-				( \
-				  *(          ct + i*rs_ct + j*cs_ct), \
-				  *(( ctype* )c  + i*rs_c  + j*cs_c )  \
-				); \
-			} \
-		} \
-		else \
-		{ \
-			for ( dim_t j = 0; j < n; ++j ) \
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				PASTEMAC(ch,xpbys) \
-				( \
-				  *(          ct + i*rs_ct + j*cs_ct), \
-				  *(( ctype* )beta                  ), \
-				  *(( ctype* )c  + i*rs_c  + j*cs_c )  \
-				); \
-			} \
-		} \
-	} \
-	else \
-	{ \
-		/* In the typical cases, we use the real part of beta and
-		   accumulate directly into the output matrix c. */ \
-\
-		ctype_r* c_use    = ( ctype_r* )c; \
-		inc_t    rs_c_use = rs_c; \
-		inc_t    cs_c_use = cs_c; \
-\
-		dim_t    m_use    = m; \
-		dim_t    n_use    = n; \
-\
-		/* Convert the strides and corresponding microtile dimension from being
-		   in units of complex elements to be in units of real elements. */ \
-		if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) { cs_c_use *= 2; m_use *= 2; } \
-		else                                           { rs_c_use *= 2; n_use *= 2; } \
-\
-		/* c = beta * c + alpha_r * a * b; */ \
-		rgemm_ukr \
-		( \
-		  m_use, \
-		  n_use, \
-		  k, \
-		  alpha_r, \
-		  a_r, \
-		  b_r, \
-		  beta_r, \
-		  c_use, rs_c_use, cs_c_use, \
-		  data, \
-		  cntx  \
-		); \
-	} \
-}
-
-INSERT_GENTFUNCCO( gemm_md_c2r, BLIS_REF_SUFFIX )
-
-#endif
diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.h b/frame/3/gemm/bli_gemm_md_c2r_ref.h
deleted file mode 100644
index c3a1dc8b1..000000000
--- a/frame/3/gemm/bli_gemm_md_c2r_ref.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-// -- Level-3 native micro-kernel prototype redefinitions ----------------------
-
-#ifdef BLIS_ENABLE_GEMM_MD
-
-#if 0
-#undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, suf ) \
-\
-void PASTEMAC2(ch,opname,suf) \
-     ( \
-             dim_t      m, \
-             dim_t      n, \
-             dim_t      k, \
-       const void*      alpha, \
-       const void*      a, \
-       const void*      b, \
-       const void*      beta, \
-             void*      c, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
-       const cntx_t*    cntx  \
-     )
-#endif
-
-#undef  GENTPROTCO
-#define GENTPROTCO( ctype, ctype_r, ch, chr, funcname, opname ) \
-\
-void PASTEMAC(ch,funcname) \
-     ( \
-       PASTECH(opname,_params), \
-       BLIS_AUXINFO_PARAM, \
-       BLIS_CNTX_PARAM  \
-     );
-
-INSERT_GENTPROTCO_BASIC( gemm_md_c2r_ref, gemm )
-
-
-#endif
diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h
index 36500cb6a..e2510c69c 100644
--- a/frame/3/gemm/bli_gemm_var.h
+++ b/frame/3/gemm/bli_gemm_var.h
@@ -51,7 +51,7 @@ typedef struct
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t*     a, \
        const obj_t*     b, \
@@ -67,5 +67,3 @@ GENPROT( gemm_blk_var3 )
 
 GENPROT( gemm_ker_var2 )
 
-GENPROT( gemm_ker_var2b )
-
diff --git a/frame/3/gemm/ind/bli_gemm_ind_opt.h b/frame/3/gemm/ind/bli_gemm_ind_opt.h
deleted file mode 100644
index a57325580..000000000
--- a/frame/3/gemm/ind/bli_gemm_ind_opt.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-BLIS_INLINE void bli_gemm_ind_recast_1m_params
-     (
-             num_t*  dt_exec,
-             num_t*  dt_c,
-             pack_t  schema_a,
-       const obj_t*  c,
-             dim_t*  m,
-             dim_t*  n,
-             dim_t*  k,
-             inc_t*  pd_a, inc_t* ps_a,
-             inc_t*  pd_b, inc_t* ps_b,
-             inc_t*  rs_c, inc_t* cs_c,
-       const cntx_t* cntx
-     )
-{
-	obj_t beta;
-
-	// Detach the beta scalar from c so that we can test its imaginary
-	// component.
-	bli_obj_scalar_detach( c, &beta );
-
-#if 1
-	// Determine whether the storage of C matches the IO preference of the
-	// microkernel. (We cannot utilize the optimization below if there is a
-	// mismatch.)
-	const ukr_t ukr_id     = BLIS_GEMM_VIR_UKR;
-
-	const bool  row_stored = bli_is_row_stored( *rs_c, *cs_c );
-	const bool  col_stored = !row_stored;
-	const bool  row_pref   = bli_cntx_ukr_prefers_rows_dt( *dt_c, ukr_id, cntx );
-	const bool  col_pref   = !row_pref;
-
-	const bool  is_match   = ( row_stored && row_pref ) ||
-	                         ( col_stored && col_pref );
-#else
-	// This was the previous behavior, which resulted in buggy behavior
-	// when executing right-sided hemm, and:
-	// - the 1m method is enabled,
-	// - BLIS_DISABLE_HEMM_RIGHT is #defined, and
-	// - the storage of C matches the microkernel IO preference PRIOR to
-	//   detecting the right-sidedness of the operation.
-	// See Issue #621 for details.
-	const bool is_match = TRUE;
-#endif
-
-	// If (a) the storage of C matches the IO pref of the ukernel, (b) beta is
-	// in the real domain, and (c) C is row- or column-stored, then we may
-	// proceed with the optimization below, which allows 1m to be induced by
-	// executing the real-domain macrokernel with the real-domain microkernel
-	// plus a few tweaked parameters. Otherwise, we must skip the optimization
-	// and allow 1m to execute via the complex-domain macrokernel calling the
-	// 1m virtual microkernel function, which will incur a little extra
-	// overhead.
-	if ( is_match &&
-	     bli_obj_imag_is_zero( &beta ) &&
-	     !bli_is_gen_stored( *rs_c, *cs_c ) )
-	{
-		*dt_exec = bli_dt_proj_to_real( *dt_exec );
-		*dt_c    = bli_dt_proj_to_real( *dt_c );
-
-		if ( bli_is_1e_packed( schema_a ) )
-		{
-			*m    *= 2;
-			*n    *= 1;
-			*k    *= 2;
-			*pd_a *= 2; *ps_a *= 2;
-			*pd_b *= 1; *ps_b *= 2;
-			*rs_c *= 1; *cs_c *= 2;
-		}
-		else // if ( bli_is_1r_packed( schema_a ) )
-		{
-			*m    *= 1;
-			*n    *= 2;
-			*k    *= 2;
-			*pd_a *= 1; *ps_a *= 2;
-			*pd_b *= 2; *ps_b *= 2;
-			*rs_c *= 2; *cs_c *= 1;
-		}
-	}
-}
-
diff --git a/frame/3/gemmt/bli_gemmt.h b/frame/3/gemmt/bli_gemmt.h
index 32ab3865e..3d26751b1 100644
--- a/frame/3/gemmt/bli_gemmt.h
+++ b/frame/3/gemmt/bli_gemmt.h
@@ -32,7 +32,5 @@
 
 */
 
-#include "bli_gemmt_front.h"
-
 #include "bli_gemmt_var.h"
 
diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c
deleted file mode 100644
index d75738a94..000000000
--- a/frame/3/gemmt/bli_gemmt_front.c
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_gemmt_front
-     (
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm
-     )
-{
-	bli_init_once();
-
-	obj_t   a_local;
-	obj_t   b_local;
-	obj_t   c_local;
-
-	// Alias A, B, and C in case we need to apply transformations.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( b, &b_local );
-	bli_obj_alias_to( c, &c_local );
-
-	// Set the obj_t buffer field to the location currently implied by the row
-	// and column offsets and then zero the offsets. If any of the original
-	// obj_t's were views into larger matrices, this step effectively makes
-	// those obj_t's "forget" their lineage.
-	bli_obj_reset_origin( &a_local );
-	bli_obj_reset_origin( &b_local );
-	bli_obj_reset_origin( &c_local );
-
-	// An optimization: If C is stored by rows and the micro-kernel prefers
-	// contiguous columns, or if C is stored by columns and the micro-kernel
-	// prefers contiguous rows, transpose the entire operation to allow the
-	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
-	{
-		bli_obj_swap( &a_local, &b_local );
-
-		bli_obj_induce_trans( &a_local );
-		bli_obj_induce_trans( &b_local );
-		bli_obj_induce_trans( &c_local );
-	}
-
-	// Set the pack schemas within the objects, as appropriate.
-	bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
-
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_GEMM,
-	  BLIS_LEFT, // ignored for gemm/hemm/symm/gemmt
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
-	// Invoke the internal back-end via the thread handler.
-	bli_l3_thread_decorator
-	(
-	  bli_l3_int,
-	  BLIS_GEMMT, // operation family id
-	  alpha,
-	  &a_local,
-	  &b_local,
-	  beta,
-	  &c_local,
-	  cntx,
-	  rntm
-	);
-}
-
diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
index 7bb3a5e36..02ff808e2 100644
--- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
@@ -35,7 +35,7 @@
 
 #include "blis.h"
 
-typedef void (*xpbys_mxn_l_vft)
+typedef void (*xpbys_mxn_l_ft)
     (
             doff_t diagoff,
             dim_t  m,
@@ -62,7 +62,7 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 	const ctype* restrict b_cast = b; \
 	      ctype* restrict y_cast = y; \
 \
-	PASTEMAC3(ch,ch,ch,xpbys_mxn_l) \
+	PASTEMAC(ch,ch,ch,xpbys_mxn_l) \
 	( \
 	  diagoff, \
 	  m, \
@@ -75,7 +75,7 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 
 INSERT_GENTFUNC_BASIC(xpbys_mxn_l_fn);
 
-static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn);
+static xpbys_mxn_l_ft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn);
 
 // -----------------------------------------------------------------------------
 
@@ -89,9 +89,15 @@ void bli_gemmt_l_ker_var2
              thrinfo_t* thread_par
      )
 {
-	const num_t  dt_exec   = bli_obj_exec_dt( c );
+	const num_t  dt_comp   = bli_gemm_var_cntl_comp_dt( cntl );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
 	const num_t  dt_c      = bli_obj_dt( c );
 
+	const siz_t  dt_a_size = bli_dt_size( dt_a );
+	const siz_t  dt_b_size = bli_dt_size( dt_b );
+	const siz_t  dt_c_size = bli_dt_size( dt_c );
+
 	      doff_t diagoffc  = bli_obj_diag_offset( c );
 
 	const pack_t schema_a  = bli_obj_pack_schema( a );
@@ -102,12 +108,10 @@ void bli_gemmt_l_ker_var2
 	      dim_t  k         = bli_obj_width( a );
 
 	const void*  buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t  is_a      = bli_obj_imag_stride( a );
 	const dim_t  pd_a      = bli_obj_panel_dim( a );
 	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
 	const void*  buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t  is_b      = bli_obj_imag_stride( b );
 	const dim_t  pd_b      = bli_obj_panel_dim( b );
 	const inc_t  ps_b      = bli_obj_panel_stride( b );
 
@@ -126,17 +130,15 @@ void bli_gemmt_l_ker_var2
 	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
 	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
-	const siz_t dt_size   = bli_dt_size( dt_exec );
-	const siz_t dt_c_size = bli_dt_size( dt_c );
-
 	// Alias some constants to simpler names.
 	const dim_t MR = pd_a;
 	const dim_t NR = pd_b;
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_ft     gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
-	xpbys_mxn_l_vft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt_exec ];
+	gemm_ukr_ft    gemm_ukr        = bli_gemm_var_cntl_ukr( cntl );
+	const void*    params          = bli_gemm_var_cntl_params( cntl );
+	xpbys_mxn_l_ft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt_c ];
 
 	// Temporary C buffer for edge cases. Note that the strides of this
 	// temporary buffer are set so that they match the storage of the
@@ -144,11 +146,11 @@ void bli_gemmt_l_ker_var2
 	// column-stored as well.
 	      char  ct[ BLIS_STACK_BUF_MAX_SIZE ]
 	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
-	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx );
-	const inc_t rs_ct       = ( col_pref ? 1 : NR );
-	const inc_t cs_ct       = ( col_pref ? MR : 1 );
+	const bool  row_pref    = bli_gemm_var_cntl_row_pref( cntl );
+	const inc_t rs_ct       = ( row_pref ? NR: 1 );
+	const inc_t cs_ct       = ( row_pref ? 1: MR );
 
-	const void* zero       = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO );
+	const void* zero       = bli_obj_buffer_for_const( BLIS_COMPLEX | dt_comp, &BLIS_ZERO );
 	const char* a_cast     = buf_a;
 	const char* b_cast     = buf_b;
 	      char* c_cast     = buf_c;
@@ -187,7 +189,7 @@ void bli_gemmt_l_ker_var2
 		m        = m - i;
 		diagoffc = diagoffc % MR;
 		c_cast   = c_cast + (i  )*rs_c*dt_c_size;
-		a_cast   = a_cast + (ip )*ps_a*dt_size;
+		a_cast   = a_cast + (ip )*ps_a*dt_a_size;
 	}
 
 	// If there is a zero region to the right of where the diagonal
@@ -207,9 +209,9 @@ void bli_gemmt_l_ker_var2
 	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	const inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_a_size;
 
-	const inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_b_size;
 
 	const inc_t rstep_c = rs_c * MR * dt_c_size;
 	const inc_t cstep_c = cs_c * NR * dt_c_size;
@@ -220,15 +222,15 @@ void bli_gemmt_l_ker_var2
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
-	// Save the imaginary stride of A and B to the auxinfo_t object.
-	bli_auxinfo_set_is_a( is_a, &aux );
-	bli_auxinfo_set_is_b( is_b, &aux );
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
 
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel. Here we query the thrinfo_t node for the
 	// 1st (ir) loop around the microkernel.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
+	thrinfo_t* caucus = bli_thrinfo_sub_node( 0, thread );
 
 	// Query the number of threads and thread ids for each loop.
 	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
@@ -244,8 +246,8 @@ void bli_gemmt_l_ker_var2
 	// slab or round-robin partitioning was requested at configure-time.
 	bli_thread_range_quad( thread, diagoffc, BLIS_LOWER, m, n, NR,
 	                       FALSE, &jr_start, &jr_end, &jr_inc );
-	//bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
-	bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
+	//bli_thread_range_slrr( jr_tid, jr_nt, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_slrr( ir_tid, ir_nt, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
 
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2b.c b/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
index a4d384629..a11f599d3 100644
--- a/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
@@ -35,7 +35,7 @@
 
 #include "blis.h"
 
-typedef void (*xpbys_mxn_l_vft)
+typedef void (*xpbys_mxn_l_ft)
     (
             doff_t diagoff,
             dim_t  m,
@@ -62,7 +62,7 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 	const ctype* restrict b_cast = b; \
 	      ctype* restrict y_cast = y; \
 \
-	PASTEMAC3(ch,ch,ch,xpbys_mxn_l) \
+	PASTEMAC(ch,ch,ch,xpbys_mxn_l) \
 	( \
 	  diagoff, \
 	  m, \
@@ -75,7 +75,7 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 
 INSERT_GENTFUNC_BASIC(xpbys_mxn_l_fn);
 
-static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn);
+static xpbys_mxn_l_ft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn);
 
 // -----------------------------------------------------------------------------
 
@@ -89,9 +89,15 @@ void bli_gemmt_l_ker_var2b
              thrinfo_t* thread_par
      )
 {
-	const num_t  dt_exec   = bli_obj_exec_dt( c );
+	const num_t  dt_comp   = bli_gemm_var_cntl_comp_dt( cntl );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
 	const num_t  dt_c      = bli_obj_dt( c );
 
+	const siz_t  dt_a_size = bli_dt_size( dt_a );
+	const siz_t  dt_b_size = bli_dt_size( dt_b );
+	const siz_t  dt_c_size = bli_dt_size( dt_c );
+
 	      doff_t diagoffc  = bli_obj_diag_offset( c );
 
 	const pack_t schema_a  = bli_obj_pack_schema( a );
@@ -102,12 +108,10 @@ void bli_gemmt_l_ker_var2b
 	      dim_t  k         = bli_obj_width( a );
 
 	const void*  buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t  is_a      = bli_obj_imag_stride( a );
 	const dim_t  pd_a      = bli_obj_panel_dim( a );
 	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
 	const void*  buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t  is_b      = bli_obj_imag_stride( b );
 	const dim_t  pd_b      = bli_obj_panel_dim( b );
 	const inc_t  ps_b      = bli_obj_panel_stride( b );
 
@@ -126,17 +130,15 @@ void bli_gemmt_l_ker_var2b
 	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
 	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
-	const siz_t dt_size   = bli_dt_size( dt_exec );
-	const siz_t dt_c_size = bli_dt_size( dt_c );
-
 	// Alias some constants to simpler names.
 	const dim_t MR = pd_a;
 	const dim_t NR = pd_b;
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_ft     gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
-	xpbys_mxn_l_vft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt_exec ];
+	gemm_ukr_ft    gemm_ukr        = bli_gemm_var_cntl_ukr( cntl );
+	const void*    params          = bli_gemm_var_cntl_params( cntl );
+	xpbys_mxn_l_ft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt_c ];
 
 	// Temporary C buffer for edge cases. Note that the strides of this
 	// temporary buffer are set so that they match the storage of the
@@ -144,11 +146,11 @@ void bli_gemmt_l_ker_var2b
 	// column-stored as well.
 	      char  ct[ BLIS_STACK_BUF_MAX_SIZE ]
 	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
-	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx );
-	const inc_t rs_ct       = ( col_pref ? 1 : NR );
-	const inc_t cs_ct       = ( col_pref ? MR : 1 );
+	const bool  row_pref    = bli_gemm_var_cntl_row_pref( cntl );
+	const inc_t rs_ct       = ( row_pref ? NR : 1 );
+	const inc_t cs_ct       = ( row_pref ? 1 : MR );
 
-	const void* zero       = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO );
+	const void* zero       = bli_obj_buffer_for_const( BLIS_COMPLEX | dt_comp, &BLIS_ZERO );
 	const char* a_cast     = buf_a;
 	const char* b_cast     = buf_b;
 	      char* c_cast     = buf_c;
@@ -190,7 +192,7 @@ void bli_gemmt_l_ker_var2b
 		m        = m - i;
 		diagoffc = diagoffc % MR;
 		c_cast   = c_cast + (i  )*rs_c*dt_c_size;
-		a_cast   = a_cast + (ip )*ps_a*dt_size;
+		a_cast   = a_cast + (ip )*ps_a*dt_a_size;
 	}
 
 	// If there is a zero region to the right of where the diagonal
@@ -210,9 +212,9 @@ void bli_gemmt_l_ker_var2b
 	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	const inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_a_size;
 
-	const inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_b_size;
 
 	const inc_t rstep_c = rs_c * MR * dt_c_size;
 	const inc_t cstep_c = cs_c * NR * dt_c_size;
@@ -223,15 +225,15 @@ void bli_gemmt_l_ker_var2b
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
-	// Save the imaginary stride of A and B to the auxinfo_t object.
-	bli_auxinfo_set_is_a( is_a, &aux );
-	bli_auxinfo_set_is_b( is_b, &aux );
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
 
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel. Here we query the thrinfo_t node for the
 	// 1st (ir) loop around the microkernel.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( 0, thread );
 
 	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
 	const dim_t jr_tid = bli_thrinfo_work_id( thread );
@@ -258,7 +260,7 @@ void bli_gemmt_l_ker_var2b
 	dim_t j = jr_st;
 
 	// Initialize a counter to track the number of microtiles computed by the
-    // current thread.
+	// current thread.
 	dim_t ut = 0;
 
 	// Loop over the n dimension (NR columns at a time).
diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
index 0ea845a20..34a10914f 100644
--- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
@@ -35,7 +35,7 @@
 
 #include "blis.h"
 
-typedef void (*xpbys_mxn_u_vft)
+typedef void (*xpbys_mxn_u_ft)
     (
             doff_t diagoff,
             dim_t  m,
@@ -62,7 +62,7 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 	const ctype* restrict b_cast = b; \
 	      ctype* restrict y_cast = y; \
 \
-	PASTEMAC3(ch,ch,ch,xpbys_mxn_u) \
+	PASTEMAC(ch,ch,ch,xpbys_mxn_u) \
 	( \
 	  diagoff, \
 	  m, \
@@ -75,7 +75,7 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 
 INSERT_GENTFUNC_BASIC(xpbys_mxn_u_fn);
 
-static xpbys_mxn_u_vft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn);
+static xpbys_mxn_u_ft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn);
 
 // -----------------------------------------------------------------------------
 
@@ -89,9 +89,15 @@ void bli_gemmt_u_ker_var2
              thrinfo_t* thread_par
      )
 {
-	const num_t  dt_exec   = bli_obj_exec_dt( c );
+	const num_t  dt_comp   = bli_gemm_var_cntl_comp_dt( cntl );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
 	const num_t  dt_c      = bli_obj_dt( c );
 
+	const siz_t  dt_a_size = bli_dt_size( dt_a );
+	const siz_t  dt_b_size = bli_dt_size( dt_b );
+	const siz_t  dt_c_size = bli_dt_size( dt_c );
+
 	      doff_t diagoffc  = bli_obj_diag_offset( c );
 
 	const pack_t schema_a  = bli_obj_pack_schema( a );
@@ -102,12 +108,10 @@ void bli_gemmt_u_ker_var2
 	      dim_t  k         = bli_obj_width( a );
 
 	const void*  buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t  is_a      = bli_obj_imag_stride( a );
 	const dim_t  pd_a      = bli_obj_panel_dim( a );
 	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
 	const void*  buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t  is_b      = bli_obj_imag_stride( b );
 	const dim_t  pd_b      = bli_obj_panel_dim( b );
 	const inc_t  ps_b      = bli_obj_panel_stride( b );
 
@@ -126,17 +130,15 @@ void bli_gemmt_u_ker_var2
 	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
 	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
-	const siz_t dt_size   = bli_dt_size( dt_exec );
-	const siz_t dt_c_size = bli_dt_size( dt_c );
-
 	// Alias some constants to simpler names.
 	const dim_t MR = pd_a;
 	const dim_t NR = pd_b;
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_ft     gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
-	xpbys_mxn_u_vft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt_exec ];
+	gemm_ukr_ft    gemm_ukr        = bli_gemm_var_cntl_ukr( cntl );
+	const void*    params          = bli_gemm_var_cntl_params( cntl );
+	xpbys_mxn_u_ft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt_c ];
 
 	// Temporary C buffer for edge cases. Note that the strides of this
 	// temporary buffer are set so that they match the storage of the
@@ -144,11 +146,11 @@ void bli_gemmt_u_ker_var2
 	// column-stored as well.
 	      char  ct[ BLIS_STACK_BUF_MAX_SIZE ]
 	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
-	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx );
-	const inc_t rs_ct       = ( col_pref ? 1 : NR );
-	const inc_t cs_ct       = ( col_pref ? MR : 1 );
+	const bool  row_pref    = bli_gemm_var_cntl_row_pref( cntl );
+	const inc_t rs_ct       = ( row_pref ? NR : 1 );
+	const inc_t cs_ct       = ( row_pref ? 1 : MR );
 
-	const void* zero       = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO );
+	const void* zero       = bli_obj_buffer_for_const( BLIS_COMPLEX | dt_comp, &BLIS_ZERO );
 	const char* a_cast     = buf_a;
 	const char* b_cast     = buf_b;
 	      char* c_cast     = buf_c;
@@ -189,7 +191,7 @@ void bli_gemmt_u_ker_var2
 		n        = n - j;
 		diagoffc = diagoffc % NR;
 		c_cast   = c_cast + (j  )*cs_c*dt_c_size;
-		b_cast   = b_cast + (jp )*ps_b*dt_size;
+		b_cast   = b_cast + (jp )*ps_b*dt_b_size;
 	}
 
 	// If there is a zero region below where the diagonal of C intersects
@@ -209,9 +211,9 @@ void bli_gemmt_u_ker_var2
 	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	const inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_a_size;
 
-	const inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_b_size;
 
 	const inc_t rstep_c = rs_c * MR * dt_c_size;
 	const inc_t cstep_c = cs_c * NR * dt_c_size;
@@ -222,21 +224,21 @@ void bli_gemmt_u_ker_var2
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
-	// Save the imaginary stride of A and B to the auxinfo_t object.
-	bli_auxinfo_set_is_a( is_a, &aux );
-	bli_auxinfo_set_is_b( is_b, &aux );
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
 
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel. Here we query the thrinfo_t node for the
 	// 1st (ir) loop around the microkernel.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
+	thrinfo_t* caucus = bli_thrinfo_sub_node( 0, thread );
 
 	// Query the number of threads and thread ids for each loop.
 	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
 	const dim_t jr_tid = bli_thrinfo_work_id( thread );
-	//const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
-	//const dim_t ir_tid = bli_thrinfo_work_id( caucus );
+	const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	const dim_t ir_tid = bli_thrinfo_work_id( caucus );
 
 	dim_t jr_start, jr_end, jr_inc;
 	dim_t ir_start, ir_end, ir_inc;
@@ -246,8 +248,8 @@ void bli_gemmt_u_ker_var2
 	// slab or round-robin partitioning was requested at configure-time.
 	bli_thread_range_quad( thread, diagoffc, BLIS_UPPER, m, n, NR,
 	                       FALSE, &jr_start, &jr_end, &jr_inc );
-	//bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
-	bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
+	//bli_thread_range_slrr( jr_tid, jr_nt, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_slrr( ir_tid, ir_nt, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
 
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2b.c b/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
index e02a5f93b..aa9f3bc5e 100644
--- a/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
@@ -35,7 +35,7 @@
 
 #include "blis.h"
 
-typedef void (*xpbys_mxn_u_vft)
+typedef void (*xpbys_mxn_u_ft)
     (
             doff_t diagoff,
             dim_t  m,
@@ -62,7 +62,7 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 	const ctype* restrict b_cast = b; \
 	      ctype* restrict y_cast = y; \
 \
-	PASTEMAC3(ch,ch,ch,xpbys_mxn_u) \
+	PASTEMAC(ch,ch,ch,xpbys_mxn_u) \
 	( \
 	  diagoff, \
 	  m, \
@@ -75,7 +75,7 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 
 INSERT_GENTFUNC_BASIC(xpbys_mxn_u_fn);
 
-static xpbys_mxn_u_vft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn);
+static xpbys_mxn_u_ft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn);
 
 // -----------------------------------------------------------------------------
 
@@ -89,9 +89,15 @@ void bli_gemmt_u_ker_var2b
              thrinfo_t* thread_par
      )
 {
-	const num_t  dt_exec   = bli_obj_exec_dt( c );
+	const num_t  dt_comp   = bli_gemm_var_cntl_comp_dt( cntl );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
 	const num_t  dt_c      = bli_obj_dt( c );
 
+	const siz_t  dt_a_size = bli_dt_size( dt_a );
+	const siz_t  dt_b_size = bli_dt_size( dt_b );
+	const siz_t  dt_c_size = bli_dt_size( dt_c );
+
 	      doff_t diagoffc  = bli_obj_diag_offset( c );
 
 	const pack_t schema_a  = bli_obj_pack_schema( a );
@@ -102,12 +108,10 @@ void bli_gemmt_u_ker_var2b
 	      dim_t  k         = bli_obj_width( a );
 
 	const void*  buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t  is_a      = bli_obj_imag_stride( a );
 	const dim_t  pd_a      = bli_obj_panel_dim( a );
 	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
 	const void*  buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t  is_b      = bli_obj_imag_stride( b );
 	const dim_t  pd_b      = bli_obj_panel_dim( b );
 	const inc_t  ps_b      = bli_obj_panel_stride( b );
 
@@ -126,17 +130,15 @@ void bli_gemmt_u_ker_var2b
 	const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
 	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
-	const siz_t dt_size   = bli_dt_size( dt_exec );
-	const siz_t dt_c_size = bli_dt_size( dt_c );
-
 	// Alias some constants to simpler names.
 	const dim_t MR = pd_a;
 	const dim_t NR = pd_b;
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_ft     gemm_ukr        = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
-	xpbys_mxn_u_vft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt_exec ];
+	gemm_ukr_ft    gemm_ukr        = bli_gemm_var_cntl_ukr( cntl );
+	const void*    params          = bli_gemm_var_cntl_params( cntl );
+	xpbys_mxn_u_ft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt_c ];
 
 	// Temporary C buffer for edge cases. Note that the strides of this
 	// temporary buffer are set so that they match the storage of the
@@ -144,11 +146,11 @@ void bli_gemmt_u_ker_var2b
 	// column-stored as well.
 	      char  ct[ BLIS_STACK_BUF_MAX_SIZE ]
 	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
-	const bool  col_pref    = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx );
-	const inc_t rs_ct       = ( col_pref ? 1 : NR );
-	const inc_t cs_ct       = ( col_pref ? MR : 1 );
+	const bool  row_pref    = bli_gemm_var_cntl_row_pref( cntl );
+	const inc_t rs_ct       = ( row_pref ? NR : 1 );
+	const inc_t cs_ct       = ( row_pref ? 1 : MR );
 
-	const void* zero       = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO );
+	const void* zero       = bli_obj_buffer_for_const( BLIS_COMPLEX | dt_comp, &BLIS_ZERO );
 	const char* a_cast     = buf_a;
 	const char* b_cast     = buf_b;
 	      char* c_cast     = buf_c;
@@ -189,7 +191,7 @@ void bli_gemmt_u_ker_var2b
 		n        = n - j;
 		diagoffc = diagoffc % NR;
 		c_cast   = c_cast + (j  )*cs_c*dt_c_size;
-		b_cast   = b_cast + (jp )*ps_b*dt_size;
+		b_cast   = b_cast + (jp )*ps_b*dt_b_size;
 	}
 
 	// If there is a zero region below where the diagonal of C intersects
@@ -209,9 +211,9 @@ void bli_gemmt_u_ker_var2b
 	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	const inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_a_size;
 
-	const inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_b_size;
 
 	const inc_t rstep_c = rs_c * MR * dt_c_size;
 	const inc_t cstep_c = cs_c * NR * dt_c_size;
@@ -222,15 +224,15 @@ void bli_gemmt_u_ker_var2b
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
-	// Save the imaginary stride of A and B to the auxinfo_t object.
-	bli_auxinfo_set_is_a( is_a, &aux );
-	bli_auxinfo_set_is_b( is_b, &aux );
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
 
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel. Here we query the thrinfo_t node for the
 	// 1st (ir) loop around the microkernel.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( 0, thread );
 
 	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
 	const dim_t jr_tid = bli_thrinfo_work_id( thread );
diff --git a/frame/3/gemmt/bli_gemmt_var.h b/frame/3/gemmt/bli_gemmt_var.h
index 339b93755..62466b3df 100644
--- a/frame/3/gemmt/bli_gemmt_var.h
+++ b/frame/3/gemmt/bli_gemmt_var.h
@@ -41,7 +41,7 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t*     a, \
        const obj_t*     ah, \
@@ -51,11 +51,9 @@ void PASTEMAC0(opname) \
              thrinfo_t* thread_par  \
      );
 
-GENPROT( gemmt_x_ker_var2 )
 GENPROT( gemmt_l_ker_var2 )
 GENPROT( gemmt_u_ker_var2 )
 
-GENPROT( gemmt_x_ker_var2b )
 GENPROT( gemmt_l_ker_var2b )
 GENPROT( gemmt_u_ker_var2b )
 
diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c
deleted file mode 100644
index 8081537b9..000000000
--- a/frame/3/gemmt/bli_gemmt_x_ker_var2.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-static l3_var_oft vars[2] =
-{
-	bli_gemmt_l_ker_var2, bli_gemmt_u_ker_var2,
-};
-
-void bli_gemmt_x_ker_var2
-     (
-       const obj_t*     a,
-       const obj_t*     ah,
-       const obj_t*     c,
-       const cntx_t*    cntx,
-       const cntl_t*    cntl,
-             thrinfo_t* thread_par
-     )
-{
-	dim_t uplo;
-
-	// Set a bool based on the uplo field of C's root object.
-	if ( bli_obj_root_is_lower( c ) ) uplo = 0;
-	else                              uplo = 1;
-
-	// Index into the variant array to extract the correct function pointer.
-	l3_var_oft f = vars[uplo];
-
-	// Call the macrokernel.
-	f
-	(
-	  a,
-	  ah,
-	  c,
-	  cntx,
-	  cntl,
-	  thread_par
-	);
-}
-
diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c
deleted file mode 100644
index a281ddade..000000000
--- a/frame/3/hemm/bli_hemm_front.c
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_hemm_front
-     (
-             side_t  side,
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm
-     )
-{
-	bli_init_once();
-
-	obj_t   a_local;
-	obj_t   b_local;
-	obj_t   c_local;
-
-	// If alpha is zero, scale by beta and return.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
-	{
-		bli_scalm( beta, c );
-		return;
-	}
-
-	// Alias A, B, and C in case we need to apply transformations.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( b, &b_local );
-	bli_obj_alias_to( c, &c_local );
-
-	// Set the obj_t buffer field to the location currently implied by the row
-	// and column offsets and then zero the offsets. If any of the original
-	// obj_t's were views into larger matrices, this step effectively makes
-	// those obj_t's "forget" their lineage.
-	bli_obj_reset_origin( &a_local );
-	bli_obj_reset_origin( &b_local );
-	bli_obj_reset_origin( &c_local );
-
-#ifdef BLIS_DISABLE_HEMM_RIGHT
-	// NOTE: This case casts right-side hemm in terms of left side. This is
-	// necessary when the current subconfiguration uses a gemm microkernel
-	// that assumes that the packing kernel will have already duplicated
-	// (broadcast) element of B in the packed copy of B. Supporting
-	// duplication within the logic that packs micropanels from Hermitian/
-	// matrices would be ugly, and so we simply don't support it. As a
-	// consequence, those subconfigurations need a way to force the Hermitian
-	// matrix to be on the left (and thus the general matrix to the on the
-	// right). So our solution is that in those cases, the subconfigurations
-	// simply #define BLIS_DISABLE_HEMM_RIGHT.
-
-	// NOTE: This case casts right-side hemm in terms of left side. This can
-	// lead to the microkernel being executed on an output matrix with the
-	// microkernel's general stride IO case (unless the microkernel supports
-	// both both row and column IO cases as well).
-
-	// If A is being multiplied from the right, transpose all operands
-	// so that we can perform the computation as if A were being multiplied
-	// from the left.
-	if ( bli_is_right( side ) )
-	{
-		bli_toggle_side( &side );
-		bli_obj_induce_trans( &a_local );
-		bli_obj_induce_trans( &b_local );
-		bli_obj_induce_trans( &c_local );
-	}
-
-#else
-	// NOTE: This case computes right-side hemm/symm natively by packing
-	// elements of the Hermitian/symmetric matrix A to micropanels of the
-	// right-hand packed matrix operand "B", and elements of the general
-	// matrix B to micropanels of the left-hand packed matrix operand "A".
-	// This code path always gives us the opportunity to transpose the
-	// entire operation so that the effective storage format of the output
-	// matrix matches the microkernel's output preference. Thus, from a
-	// performance perspective, this case is preferred.
-
-	// An optimization: If C is stored by rows and the micro-kernel prefers
-	// contiguous columns, or if C is stored by columns and the micro-kernel
-	// prefers contiguous rows, transpose the entire operation to allow the
-	// micro-kernel to access elements of C in its preferred manner.
-	//if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT
-	                                     // be enabled. See issue #342 comments.
-	if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
-	{
-		bli_toggle_side( &side );
-		bli_obj_toggle_conj( &a_local );
-		bli_obj_induce_trans( &b_local );
-		bli_obj_induce_trans( &c_local );
-	}
-
-	// If the Hermitian/symmetric matrix A is being multiplied from the right,
-	// swap A and B so that the Hermitian/symmetric matrix will actually be on
-	// the right.
-	if ( bli_is_right( side ) )
-	{
-		bli_obj_swap( &a_local, &b_local );
-	}
-#endif
-
-	// Set the pack schemas within the objects.
-	bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
-
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_HEMM,
-	  BLIS_LEFT, // ignored for gemm/hemm/symm
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
-	// Invoke the internal back-end.
-	bli_l3_thread_decorator
-	(
-	  bli_l3_int,
-	  BLIS_GEMM, // operation family id
-	  alpha,
-	  &a_local,
-	  &b_local,
-	  beta,
-	  &c_local,
-	  cntx,
-	  rntm
-	);
-}
-
diff --git a/frame/3/symm/bli_symm.h b/frame/3/symm/bli_symm.h
deleted file mode 100644
index e384cc5c0..000000000
--- a/frame/3/symm/bli_symm.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "bli_symm_front.h"
-
diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c
deleted file mode 100644
index 1ee5e0a7f..000000000
--- a/frame/3/symm/bli_symm_front.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_symm_front
-     (
-             side_t  side,
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm
-     )
-{
-	bli_init_once();
-
-	obj_t   a_local;
-	obj_t   b_local;
-	obj_t   c_local;
-
-	// If alpha is zero, scale by beta and return.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
-	{
-		bli_scalm( beta, c );
-		return;
-	}
-
-	// Alias A, B, and C in case we need to apply transformations.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( b, &b_local );
-	bli_obj_alias_to( c, &c_local );
-
-	// Set the obj_t buffer field to the location currently implied by the row
-	// and column offsets and then zero the offsets. If any of the original
-	// obj_t's were views into larger matrices, this step effectively makes
-	// those obj_t's "forget" their lineage.
-	bli_obj_reset_origin( &a_local );
-	bli_obj_reset_origin( &b_local );
-	bli_obj_reset_origin( &c_local );
-
-#ifdef BLIS_DISABLE_SYMM_RIGHT
-	// NOTE: This case casts right-side symm in terms of left side. This is
-	// necessary when the current subconfiguration uses a gemm microkernel
-	// that assumes that the packing kernel will have already duplicated
-	// (broadcast) element of B in the packed copy of B. Supporting
-	// duplication within the logic that packs micropanels from symmetric
-	// matrices would be ugly, and so we simply don't support it. As a
-	// consequence, those subconfigurations need a way to force the symmetric
-	// matrix to be on the left (and thus the general matrix to the on the
-	// right). So our solution is that in those cases, the subconfigurations
-	// simply #define BLIS_DISABLE_SYMM_RIGHT.
-
-	// NOTE: This case casts right-side symm in terms of left side. This can
-	// lead to the microkernel being executed on an output matrix with the
-	// microkernel's general stride IO case (unless the microkernel supports
-	// both both row and column IO cases as well).
-
-	// If A is being multiplied from the right, transpose all operands
-	// so that we can perform the computation as if A were being multiplied
-	// from the left.
-	if ( bli_is_right( side ) )
-	{
-		bli_toggle_side( &side );
-		bli_obj_induce_trans( &a_local );
-		bli_obj_induce_trans( &b_local );
-		bli_obj_induce_trans( &c_local );
-	}
-
-#else
-	// NOTE: This case computes right-side hemm/symm natively by packing
-	// elements of the Hermitian/symmetric matrix A to micropanels of the
-	// right-hand packed matrix operand "B", and elements of the general
-	// matrix B to micropanels of the left-hand packed matrix operand "A".
-	// This code path always gives us the opportunity to transpose the
-	// entire operation so that the effective storage format of the output
-	// matrix matches the microkernel's output preference. Thus, from a
-	// performance perspective, this case is preferred.
-
-	// An optimization: If C is stored by rows and the micro-kernel prefers
-	// contiguous columns, or if C is stored by columns and the micro-kernel
-	// prefers contiguous rows, transpose the entire operation to allow the
-	// micro-kernel to access elements of C in its preferred manner.
-	//if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT
-	                                     // be enabled. See issue #342 comments.
-	if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
-	{
-		bli_toggle_side( &side );
-		bli_obj_induce_trans( &b_local );
-		bli_obj_induce_trans( &c_local );
-	}
-
-	// If the Hermitian/symmetric matrix A is being multiplied from the right,
-	// swap A and B so that the Hermitian/symmetric matrix will actually be on
-	// the right.
-	if ( bli_is_right( side ) )
-	{
-		bli_obj_swap( &a_local, &b_local );
-	}
-#endif
-
-	// Set the pack schemas within the objects.
-	bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
-
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_SYMM,
-	  BLIS_LEFT, // ignored for gemm/hemm/symm
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
-	// Invoke the internal back-end.
-	bli_l3_thread_decorator
-	(
-	  bli_l3_int,
-	  BLIS_GEMM, // operation family id
-	  alpha,
-	  &a_local,
-	  &b_local,
-	  beta,
-	  &c_local,
-	  cntx,
-	  rntm
-	);
-}
-
diff --git a/frame/3/symm/bli_symm_front.h b/frame/3/symm/bli_symm_front.h
deleted file mode 100644
index 585ec1025..000000000
--- a/frame/3/symm/bli_symm_front.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_symm_front
-     (
-             side_t  side,
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm
-     );
diff --git a/frame/3/trmm/bli_trmm.h b/frame/3/trmm/bli_trmm.h
index 85382eec9..ad854b326 100644
--- a/frame/3/trmm/bli_trmm.h
+++ b/frame/3/trmm/bli_trmm.h
@@ -32,7 +32,5 @@
 
 */
 
-#include "bli_trmm_front.h"
-
 #include "bli_trmm_var.h"
 
diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c
deleted file mode 100644
index d351e78e1..000000000
--- a/frame/3/trmm/bli_trmm_front.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_trmm_front
-     (
-             side_t  side,
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const cntx_t* cntx,
-             rntm_t* rntm
-     )
-{
-	bli_init_once();
-
-	obj_t   a_local;
-	obj_t   b_local;
-	obj_t   c_local;
-
-	// If alpha is zero, scale by beta and return.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
-	{
-		bli_scalm( alpha, b );
-		return;
-	}
-
-	// Alias A and B so we can tweak the objects if necessary.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( b, &b_local );
-	bli_obj_alias_to( b, &c_local );
-
-	// Set the obj_t buffer field to the location currently implied by the row
-	// and column offsets and then zero the offsets. If any of the original
-	// obj_t's were views into larger matrices, this step effectively makes
-	// those obj_t's "forget" their lineage.
-	bli_obj_reset_origin( &a_local );
-	bli_obj_reset_origin( &b_local );
-	bli_obj_reset_origin( &c_local );
-
-	// We do not explicitly implement the cases where A is transposed.
-	// However, we can still handle them. Specifically, if A is marked as
-	// needing a transposition, we simply induce a transposition. This
-	// allows us to only explicitly implement the no-transpose cases. Once
-	// the transposition is induced, the correct algorithm will be called,
-	// since, for example, an algorithm over a transposed lower triangular
-	// matrix A moves in the same direction (forwards) as a non-transposed
-	// upper triangular matrix. And with the transposition induced, the
-	// matrix now appears to be upper triangular, so the upper triangular
-	// algorithm will grab the correct partitions, as if it were upper
-	// triangular (with no transpose) all along.
-	if ( bli_obj_has_trans( &a_local ) )
-	{
-		bli_obj_induce_trans( &a_local );
-		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
-	}
-
-#ifdef BLIS_DISABLE_TRMM_RIGHT
-	// NOTE: This case casts right-side trmm in terms of left side. This is
-	// necessary when the current subconfiguration uses a gemm microkernel
-	// that assumes that the packing kernel will have already duplicated
-	// (broadcast) element of B in the packed copy of B. Supporting
-	// duplication within the logic that packs micropanels from triangular
-	// matrices would be ugly, and so we simply don't support it. As a
-	// consequence, those subconfigurations need a way to force the triangular
-	// matrix to be on the left (and thus the general matrix to the on the
-	// right). So our solution is that in those cases, the subconfigurations
-	// simply #define BLIS_DISABLE_TRMM_RIGHT.
-
-	// NOTE: This case casts right-side trmm in terms of left side. This can
-	// lead to the microkernel being executed on an output matrix with the
-	// microkernel's general stride IO case (unless the microkernel supports
-	// both both row and column IO cases as well).
-
-	// NOTE: Casting right-side trmm in terms of left side reduces the number
-	// of macrokernels exercised to two (trmm_ll and trmm_lu).
-
-	// If A is being multiplied from the right, transpose all operands
-	// so that we can perform the computation as if A were being multiplied
-	// from the left.
-	if ( bli_is_right( side ) )
-	{
-		bli_toggle_side( &side );
-		bli_obj_induce_trans( &a_local );
-		bli_obj_induce_trans( &b_local );
-		bli_obj_induce_trans( &c_local );
-	}
-
-#else
-	// NOTE: This case computes right-side trmm natively with trmm_rl and
-	// trmm_ru macrokernels. This code path always gives us the opportunity
-	// to transpose the entire operation so that the effective storage format
-	// of the output matrix matches the microkernel's output preference.
-	// Thus, from a performance perspective, this case is preferred.
-
-	// An optimization: If C is stored by rows and the micro-kernel prefers
-	// contiguous columns, or if C is stored by columns and the micro-kernel
-	// prefers contiguous rows, transpose the entire operation to allow the
-	// micro-kernel to access elements of C in its preferred manner.
-	// NOTE: We disable the optimization for 1x1 matrices since the concept
-	// of row- vs. column storage breaks down.
-	//if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT
-	                                     // be enabled. See issue #342 comments.
-	if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
-	{
-		bli_toggle_side( &side );
-		bli_obj_induce_trans( &a_local );
-		bli_obj_induce_trans( &b_local );
-		bli_obj_induce_trans( &c_local );
-	}
-
-	// If A is being multiplied from the right, swap A and B so that
-	// the matrix will actually be on the right.
-	if ( bli_is_right( side ) )
-	{
-		bli_obj_swap( &a_local, &b_local );
-	}
-
-#endif
-
-	// Set the pack schemas within the objects.
-	bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
-
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_TRMM,
-	  side,
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
-	// Invoke the internal back-end.
-	bli_l3_thread_decorator
-	(
-	  bli_l3_int,
-	  BLIS_TRMM, // operation family id
-	  alpha,
-	  &a_local,
-	  &b_local,
-	  &BLIS_ZERO,
-	  &c_local,
-	  cntx,
-	  rntm
-	);
-}
-
diff --git a/frame/3/trmm/bli_trmm_front.h b/frame/3/trmm/bli_trmm_front.h
deleted file mode 100644
index f13d4c34b..000000000
--- a/frame/3/trmm/bli_trmm_front.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_trmm_front
-     (
-             side_t  side,
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const cntx_t* cntx,
-             rntm_t* rntm
-     );
diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c
index 43998f8be..83d1b85c5 100644
--- a/frame/3/trmm/bli_trmm_ll_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c
@@ -45,31 +45,37 @@ void bli_trmm_ll_ker_var2
              thrinfo_t* thread_par
      )
 {
-	const num_t     dt        = bli_obj_exec_dt( c );
-	const dim_t     dt_size   = bli_dt_size( dt );
+	const num_t  dt_comp   = bli_gemm_var_cntl_comp_dt( cntl );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
+	const num_t  dt_c      = bli_obj_dt( c );
 
-	      doff_t    diagoffa  = bli_obj_diag_offset( a );
+	const siz_t  dt_a_size = bli_dt_size( dt_a );
+	const siz_t  dt_b_size = bli_dt_size( dt_b );
+	const siz_t  dt_c_size = bli_dt_size( dt_c );
 
-	const pack_t    schema_a  = bli_obj_pack_schema( a );
-	const pack_t    schema_b  = bli_obj_pack_schema( b );
+	      doff_t diagoffa  = bli_obj_diag_offset( a );
 
-	      dim_t     m         = bli_obj_length( c );
-	      dim_t     n         = bli_obj_width( c );
-	      dim_t     k         = bli_obj_width( a );
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	const void*     buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t     cs_a      = bli_obj_col_stride( a );
-	const dim_t     pd_a      = bli_obj_panel_dim( a );
-	const inc_t     ps_a      = bli_obj_panel_stride( a );
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
 
-	const void*     buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t     rs_b      = bli_obj_row_stride( b );
-	const dim_t     pd_b      = bli_obj_panel_dim( b );
-	const inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
-	      void*     buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t     rs_c      = bli_obj_row_stride( c );
-	const inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// Detach and multiply the scalars attached to A and B.
 	obj_t scalar_a, scalar_b;
@@ -90,9 +96,10 @@ void bli_trmm_ll_ker_var2
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_ft gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr   = bli_gemm_var_cntl_ukr( cntl );
+	const void* params     = bli_gemm_var_cntl_params( cntl );
 
-	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const void* one        = bli_obj_buffer_for_const( dt_comp, &BLIS_ONE );
 	const char* a_cast     = buf_a;
 	const char* b_cast     = buf_b;
 	      char* c_cast     = buf_c;
@@ -134,7 +141,7 @@ void bli_trmm_ll_ker_var2
 	if ( diagoffa < 0 )
 	{
 		m        += diagoffa;
-		c_cast   -= diagoffa * rs_c * dt_size;
+		c_cast   -= diagoffa * rs_c * dt_c_size;
 		diagoffa  = 0;
 	}
 
@@ -147,12 +154,12 @@ void bli_trmm_ll_ker_var2
 	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	const inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_a_size;
 
-	const inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_b_size;
 
-	const inc_t rstep_c = rs_c * MR * dt_size;
-	const inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_c_size;
+	const inc_t cstep_c = cs_c * NR * dt_c_size;
 
 	auxinfo_t aux;
 
@@ -160,15 +167,19 @@ void bli_trmm_ll_ker_var2
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
+
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel. Here we query the thrinfo_t node for the
 	// 1st (ir) loop around the microkernel.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( 0, thread );
 
 	// Query the number of threads and thread ids for each loop.
-	//const dim_t jr_nt  = bli_thrinfo_n_way( thread );
-	//const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t jr_tid = bli_thrinfo_work_id( thread );
 	//const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
 	//const dim_t ir_tid = bli_thrinfo_work_id( caucus );
 
@@ -178,7 +189,8 @@ void bli_trmm_ll_ker_var2
 	// NOTE: The definition of bli_thread_range_slrr() will depend on whether
 	// slab or round-robin partitioning was requested at configure-time.
 	// NOTE: Parallelism in the 1st loop is disabled for now.
-	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_slrr( jr_tid, jr_nt, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	//bli_thread_range_rr( ir_tid, ir_nt, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
 
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
@@ -221,12 +233,12 @@ void bli_trmm_ll_ker_var2
 				// intersecting micro-panel.
 				inc_t ps_a_cur  = k_a1011 * PACKMR;
 				      ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
-				      ps_a_cur *= dt_size;
+				      ps_a_cur *= dt_a_size;
 
 				// NOTE: ir loop parallelism disabled for now.
 				//if ( bli_trmm_my_iter( i, ir_thread ) ) {
 
-				const char* b1_i = b1 + off_a1011 * PACKNR * dt_size;
+				const char* b1_i = b1 + off_a1011 * PACKNR * dt_b_size;
 
 				// Compute the addresses of the next panels of A and B.
 				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 );
diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2b.c b/frame/3/trmm/bli_trmm_ll_ker_var2b.c
index 4bc7d2fa0..8b6377c96 100644
--- a/frame/3/trmm/bli_trmm_ll_ker_var2b.c
+++ b/frame/3/trmm/bli_trmm_ll_ker_var2b.c
@@ -45,31 +45,37 @@ void bli_trmm_ll_ker_var2b
              thrinfo_t* thread_par
      )
 {
-	const num_t     dt        = bli_obj_exec_dt( c );
-	const dim_t     dt_size   = bli_dt_size( dt );
+	const num_t  dt_comp   = bli_gemm_var_cntl_comp_dt( cntl );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
+	const num_t  dt_c      = bli_obj_dt( c );
 
-	      doff_t    diagoffa  = bli_obj_diag_offset( a );
+	const siz_t  dt_a_size = bli_dt_size( dt_a );
+	const siz_t  dt_b_size = bli_dt_size( dt_b );
+	const siz_t  dt_c_size = bli_dt_size( dt_c );
 
-	const pack_t    schema_a  = bli_obj_pack_schema( a );
-	const pack_t    schema_b  = bli_obj_pack_schema( b );
+	      doff_t diagoffa  = bli_obj_diag_offset( a );
 
-	      dim_t     m         = bli_obj_length( c );
-	      dim_t     n         = bli_obj_width( c );
-	      dim_t     k         = bli_obj_width( a );
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	const void*     buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t     cs_a      = bli_obj_col_stride( a );
-	const dim_t     pd_a      = bli_obj_panel_dim( a );
-	const inc_t     ps_a      = bli_obj_panel_stride( a );
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
 
-	const void*     buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t     rs_b      = bli_obj_row_stride( b );
-	const dim_t     pd_b      = bli_obj_panel_dim( b );
-	const inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
-	      void*     buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t     rs_c      = bli_obj_row_stride( c );
-	const inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// Detach and multiply the scalars attached to A and B.
 	obj_t scalar_a, scalar_b;
@@ -90,9 +96,10 @@ void bli_trmm_ll_ker_var2b
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_ft gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr   = bli_gemm_var_cntl_ukr( cntl );
+	const void* params     = bli_gemm_var_cntl_params( cntl );
 
-	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const void* one        = bli_obj_buffer_for_const( dt_comp, &BLIS_ONE );
 	const char* a_cast     = buf_a;
 	const char* b_cast     = buf_b;
 	      char* c_cast     = buf_c;
@@ -134,7 +141,7 @@ void bli_trmm_ll_ker_var2b
 	if ( diagoffa < 0 )
 	{
 		m        += diagoffa;
-		c_cast   -= diagoffa * rs_c * dt_size;
+		c_cast   -= diagoffa * rs_c * dt_c_size;
 		diagoffa  = 0;
 	}
 
@@ -151,12 +158,12 @@ void bli_trmm_ll_ker_var2b
 	const dim_t k_iter = k / MR + ( k % MR ? 1 : 0 );
 
 	// Determine some increments used to step through A, B, and C.
-	const inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_a_size;
 
-	const inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_b_size;
 
-	const inc_t rstep_c = rs_c * MR * dt_size;
-	const inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_c_size;
+	const inc_t cstep_c = cs_c * NR * dt_c_size;
 
 	auxinfo_t aux;
 
@@ -164,11 +171,15 @@ void bli_trmm_ll_ker_var2b
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
+
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel. Here we query the thrinfo_t node for the
 	// 1st (ir) loop around the microkernel.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( 0, thread );
 
 	// Query the number of threads and thread ids for the JR loop.
 	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
@@ -219,7 +230,7 @@ void bli_trmm_ll_ker_var2b
 			// intersecting micro-panel.
 			inc_t ps_a_cur  = k_a1011 * PACKMR;
 			      ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
-			      ps_a_cur *= dt_size;
+			      ps_a_cur *= dt_a_size;
 
 			a1 += ps_a_cur;
 		}
@@ -268,9 +279,9 @@ void bli_trmm_ll_ker_var2b
 				// intersecting micro-panel.
 				inc_t ps_a_cur  = k_a1011 * PACKMR;
 				      ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
-				      ps_a_cur *= dt_size;
+				      ps_a_cur *= dt_a_size;
 
-				const char* b1_i = b1 + off_a1011 * PACKNR * dt_size;
+				const char* b1_i = b1 + off_a1011 * PACKNR * dt_b_size;
 
 				// Compute the addresses of the next panels of A and B.
 				const char* a2 = bli_trmm_get_next_a_upanel( a1, ps_a_cur, 1 );
diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c
index 969f06941..3df5024e6 100644
--- a/frame/3/trmm/bli_trmm_lu_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c
@@ -45,31 +45,37 @@ void bli_trmm_lu_ker_var2
              thrinfo_t* thread_par
      )
 {
-	const num_t     dt        = bli_obj_exec_dt( c );
-	const dim_t     dt_size   = bli_dt_size( dt );
+	const num_t  dt_comp   = bli_gemm_var_cntl_comp_dt( cntl );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
+	const num_t  dt_c      = bli_obj_dt( c );
 
-	      doff_t    diagoffa  = bli_obj_diag_offset( a );
+	const siz_t  dt_a_size = bli_dt_size( dt_a );
+	const siz_t  dt_b_size = bli_dt_size( dt_b );
+	const siz_t  dt_c_size = bli_dt_size( dt_c );
 
-	const pack_t    schema_a  = bli_obj_pack_schema( a );
-	const pack_t    schema_b  = bli_obj_pack_schema( b );
+	      doff_t diagoffa  = bli_obj_diag_offset( a );
 
-	      dim_t     m         = bli_obj_length( c );
-	      dim_t     n         = bli_obj_width( c );
-	      dim_t     k         = bli_obj_width( a );
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	const void*     buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t     cs_a      = bli_obj_col_stride( a );
-	const dim_t     pd_a      = bli_obj_panel_dim( a );
-	const inc_t     ps_a      = bli_obj_panel_stride( a );
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
 
-	const void*     buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t     rs_b      = bli_obj_row_stride( b );
-	const dim_t     pd_b      = bli_obj_panel_dim( b );
-	const inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
-	      void*     buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t     rs_c      = bli_obj_row_stride( c );
-	const inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// Detach and multiply the scalars attached to A and B.
 	obj_t scalar_a, scalar_b;
@@ -90,9 +96,10 @@ void bli_trmm_lu_ker_var2
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_ft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr   = bli_gemm_var_cntl_ukr( cntl );
+	const void* params     = bli_gemm_var_cntl_params( cntl );
 
-	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const void* one        = bli_obj_buffer_for_const( dt_comp, &BLIS_ONE );
 	const char* a_cast     = buf_a;
 	const char* b_cast     = buf_b;
 	      char* c_cast     = buf_c;
@@ -133,7 +140,7 @@ void bli_trmm_lu_ker_var2
 	if ( diagoffa > 0 )
 	{
 		k        -= diagoffa;
-		b_cast   += diagoffa * PACKNR * dt_size;
+		b_cast   += diagoffa * PACKNR * dt_b_size;
 		diagoffa  = 0;
 	}
 
@@ -154,12 +161,12 @@ void bli_trmm_lu_ker_var2
 	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	const inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_a_size;
 
-	const inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_b_size;
 
-	const inc_t rstep_c = rs_c * MR * dt_size;
-	const inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_c_size;
+	const inc_t cstep_c = cs_c * NR * dt_c_size;
 
 	auxinfo_t aux;
 
@@ -167,17 +174,21 @@ void bli_trmm_lu_ker_var2
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
+
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel. Here we query the thrinfo_t node for the
 	// 1st (ir) loop around the microkernel.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( 0, thread );
 
 	// Query the number of threads and thread ids for each loop.
-	//const dim_t jr_nt  = bli_thrinfo_n_way( thread );
-	//const dim_t jr_tid = bli_thrinfo_work_id( thread );
-	//const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
-	//const dim_t ir_tid = bli_thrinfo_work_id( caucus );
+	dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	dim_t jr_tid = bli_thrinfo_work_id( thread );
+	//dim_t ir_nt  = bli_thrinfo_n_way( caucus );
+	//dim_t ir_tid = bli_thrinfo_work_id( caucus );
 
 	dim_t jr_start, jr_end, jr_inc;
 
@@ -185,7 +196,8 @@ void bli_trmm_lu_ker_var2
 	// NOTE: The definition of bli_thread_range_slrr() will depend on whether
 	// slab or round-robin partitioning was requested at configure-time.
 	// NOTE: Parallelism in the 1st loop is disabled for now.
-	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_slrr( jr_tid, jr_nt, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	//bli_thread_range_rr( ir_tid, ir_nt, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
 
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
@@ -228,12 +240,12 @@ void bli_trmm_lu_ker_var2
 				// intersecting micro-panel.
 				inc_t ps_a_cur  = k_a1112 * PACKMR;
 				      ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
-				      ps_a_cur *= dt_size;
+				      ps_a_cur *= dt_a_size;
 
 				// NOTE: ir loop parallelism disabled for now.
 				//if ( bli_trmm_my_iter( i, ir_thread ) ) {
 
-				const char* b1_i = b1 + off_a1112 * PACKNR * dt_size;
+				const char* b1_i = b1 + off_a1112 * PACKNR * dt_b_size;
 
 				// Compute the addresses of the next panels of A and B.
 				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 );
diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2b.c b/frame/3/trmm/bli_trmm_lu_ker_var2b.c
index 1c1714c8b..e201f08ff 100644
--- a/frame/3/trmm/bli_trmm_lu_ker_var2b.c
+++ b/frame/3/trmm/bli_trmm_lu_ker_var2b.c
@@ -45,31 +45,37 @@ void bli_trmm_lu_ker_var2b
              thrinfo_t* thread_par
      )
 {
-	const num_t     dt        = bli_obj_exec_dt( c );
-	const dim_t     dt_size   = bli_dt_size( dt );
+	const num_t  dt_comp   = bli_gemm_var_cntl_comp_dt( cntl );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
+	const num_t  dt_c      = bli_obj_dt( c );
 
-	      doff_t    diagoffa  = bli_obj_diag_offset( a );
+	const siz_t  dt_a_size = bli_dt_size( dt_a );
+	const siz_t  dt_b_size = bli_dt_size( dt_b );
+	const siz_t  dt_c_size = bli_dt_size( dt_c );
 
-	const pack_t    schema_a  = bli_obj_pack_schema( a );
-	const pack_t    schema_b  = bli_obj_pack_schema( b );
+	      doff_t diagoffa  = bli_obj_diag_offset( a );
 
-	      dim_t     m         = bli_obj_length( c );
-	      dim_t     n         = bli_obj_width( c );
-	      dim_t     k         = bli_obj_width( a );
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	const void*     buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t     cs_a      = bli_obj_col_stride( a );
-	const dim_t     pd_a      = bli_obj_panel_dim( a );
-	const inc_t     ps_a      = bli_obj_panel_stride( a );
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
 
-	const void*     buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t     rs_b      = bli_obj_row_stride( b );
-	const dim_t     pd_b      = bli_obj_panel_dim( b );
-	const inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
-	      void*     buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t     rs_c      = bli_obj_row_stride( c );
-	const inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// Detach and multiply the scalars attached to A and B.
 	obj_t scalar_a, scalar_b;
@@ -90,9 +96,10 @@ void bli_trmm_lu_ker_var2b
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_ft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr   = bli_gemm_var_cntl_ukr( cntl );
+	const void* params     = bli_gemm_var_cntl_params( cntl );
 
-	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const void* one        = bli_obj_buffer_for_const( dt_comp, &BLIS_ONE );
 	const char* a_cast     = buf_a;
 	const char* b_cast     = buf_b;
 	      char* c_cast     = buf_c;
@@ -135,7 +142,7 @@ void bli_trmm_lu_ker_var2b
 	if ( diagoffa > 0 )
 	{
 		k        -= diagoffa;
-		b_cast   += diagoffa * PACKNR * dt_size;
+		b_cast   += diagoffa * PACKNR * dt_b_size;
 		diagoffa  = 0;
 	}
 
@@ -160,12 +167,12 @@ void bli_trmm_lu_ker_var2b
 	const dim_t k_iter = k / MR + ( k % MR ? 1 : 0 );
 
 	// Determine some increments used to step through A, B, and C.
-	const inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_a_size;
 
-	const inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_b_size;
 
-	const inc_t rstep_c = rs_c * MR * dt_size;
-	const inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_c_size;
+	const inc_t cstep_c = cs_c * NR * dt_c_size;
 
 	auxinfo_t aux;
 
@@ -173,11 +180,15 @@ void bli_trmm_lu_ker_var2b
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
+
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel. Here we query the thrinfo_t node for the
 	// 1st (ir) loop around the microkernel.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( 0, thread );
 
 	// Query the number of threads and thread ids for each loop.
 	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
@@ -222,7 +233,7 @@ void bli_trmm_lu_ker_var2b
 			// intersecting micro-panel.
 			inc_t ps_a_cur  = k_a1112 * PACKMR;
 			      ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
-			      ps_a_cur *= dt_size;
+			      ps_a_cur *= dt_a_size;
 
 			a1 += ps_a_cur;
 		}
@@ -271,9 +282,9 @@ void bli_trmm_lu_ker_var2b
 				// intersecting micro-panel.
 				inc_t ps_a_cur  = k_a1112 * PACKMR;
 				      ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
-				      ps_a_cur *= dt_size;
+				      ps_a_cur *= dt_a_size;
 
-				const char* b1_i = b1 + off_a1112 * PACKNR * dt_size;
+				const char* b1_i = b1 + off_a1112 * PACKNR * dt_b_size;
 
 				// Compute the addresses of the next panels of A and B.
 				const char* a2 = bli_trmm_get_next_a_upanel( a1, ps_a_cur, 1 );
diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c
index a49c1949b..47b635e8b 100644
--- a/frame/3/trmm/bli_trmm_rl_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c
@@ -45,31 +45,37 @@ void bli_trmm_rl_ker_var2
              thrinfo_t* thread_par
      )
 {
-	const num_t     dt        = bli_obj_exec_dt( c );
-	const dim_t     dt_size   = bli_dt_size( dt );
+	const num_t  dt_comp   = bli_gemm_var_cntl_comp_dt( cntl );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
+	const num_t  dt_c      = bli_obj_dt( c );
 
-	      doff_t    diagoffb  = bli_obj_diag_offset( b );
+	const siz_t  dt_a_size = bli_dt_size( dt_a );
+	const siz_t  dt_b_size = bli_dt_size( dt_b );
+	const siz_t  dt_c_size = bli_dt_size( dt_c );
 
-	const pack_t    schema_a  = bli_obj_pack_schema( a );
-	const pack_t    schema_b  = bli_obj_pack_schema( b );
+	      doff_t diagoffb  = bli_obj_diag_offset( b );
 
-	      dim_t     m         = bli_obj_length( c );
-	      dim_t     n         = bli_obj_width( c );
-	      dim_t     k         = bli_obj_width( a );
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	const void*     buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t     cs_a      = bli_obj_col_stride( a );
-	const dim_t     pd_a      = bli_obj_panel_dim( a );
-	const inc_t     ps_a      = bli_obj_panel_stride( a );
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
 
-	const void*     buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t     rs_b      = bli_obj_row_stride( b );
-	const dim_t     pd_b      = bli_obj_panel_dim( b );
-	const inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
-	      void*     buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t     rs_c      = bli_obj_row_stride( c );
-	const inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// Detach and multiply the scalars attached to A and B.
 	obj_t scalar_a, scalar_b;
@@ -90,9 +96,10 @@ void bli_trmm_rl_ker_var2
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_ft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr   = bli_gemm_var_cntl_ukr( cntl );
+	const void* params     = bli_gemm_var_cntl_params( cntl );
 
-	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const void* one        = bli_obj_buffer_for_const( dt_comp, &BLIS_ONE );
 	const char* a_cast     = buf_a;
 	const char* b_cast     = buf_b;
 	      char* c_cast     = buf_c;
@@ -133,7 +140,7 @@ void bli_trmm_rl_ker_var2
 	if ( diagoffb < 0 )
 	{
 		k        += diagoffb;
-		a_cast   -= diagoffb * PACKMR * dt_size;
+		a_cast   -= diagoffb * PACKMR * dt_a_size;
 		diagoffb  = 0;
 	}
 
@@ -154,12 +161,12 @@ void bli_trmm_rl_ker_var2
 	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	const inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_a_size;
 
-	const inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_b_size;
 
-	const inc_t rstep_c = rs_c * MR * dt_size;
-	const inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_c_size;
+	const inc_t cstep_c = cs_c * NR * dt_c_size;
 
 	auxinfo_t aux;
 
@@ -167,11 +174,15 @@ void bli_trmm_rl_ker_var2
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
+
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel. Here we query the thrinfo_t node for the
 	// 1st (ir) loop around the microkernel.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
+	thrinfo_t* caucus = bli_thrinfo_sub_node( 0, thread );
 
 	// Query the number of threads and thread ids for each loop.
 	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
@@ -211,8 +222,8 @@ void bli_trmm_rl_ker_var2
 	// NOTE: The definition of bli_thread_range_slrr() will depend on whether
 	// slab or round-robin partitioning was requested at configure-time.
 	// NOTE: Parallelism in the 1st loop is disabled for now.
-	bli_thread_range_slrr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc );
-	bli_thread_range_slrr( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc );
+	bli_thread_range_slrr( jr_tid, jr_nt, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_slrr( ir_tid, ir_nt, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc );
 
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
@@ -311,7 +322,7 @@ void bli_trmm_rl_ker_var2
 			// intersecting micro-panel.
 			inc_t ps_b_cur  = k_b1121 * PACKNR;
 			      ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
-			      ps_b_cur *= dt_size;
+			      ps_b_cur *= dt_b_size;
 
 			if ( bli_trmm_my_iter_rr( j, thread ) ) {
 
@@ -322,7 +333,7 @@ void bli_trmm_rl_ker_var2
 
 				const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
 
-				const char* a1_i = a1 + off_b1121 * PACKMR * dt_size;
+				const char* a1_i = a1 + off_b1121 * PACKMR * dt_a_size;
 
 				// Compute the addresses of the next panels of A and B.
 				const char* a2 = a1;
diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2b.c b/frame/3/trmm/bli_trmm_rl_ker_var2b.c
index ab0a126bd..337c2c3e0 100644
--- a/frame/3/trmm/bli_trmm_rl_ker_var2b.c
+++ b/frame/3/trmm/bli_trmm_rl_ker_var2b.c
@@ -45,31 +45,37 @@ void bli_trmm_rl_ker_var2b
              thrinfo_t* thread_par
      )
 {
-	const num_t     dt        = bli_obj_exec_dt( c );
-	const dim_t     dt_size   = bli_dt_size( dt );
+	const num_t  dt_comp   = bli_gemm_var_cntl_comp_dt( cntl );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
+	const num_t  dt_c      = bli_obj_dt( c );
 
-	      doff_t    diagoffb  = bli_obj_diag_offset( b );
+	const siz_t  dt_a_size = bli_dt_size( dt_a );
+	const siz_t  dt_b_size = bli_dt_size( dt_b );
+	const siz_t  dt_c_size = bli_dt_size( dt_c );
 
-	const pack_t    schema_a  = bli_obj_pack_schema( a );
-	const pack_t    schema_b  = bli_obj_pack_schema( b );
+	      doff_t diagoffb  = bli_obj_diag_offset( b );
 
-	      dim_t     m         = bli_obj_length( c );
-	      dim_t     n         = bli_obj_width( c );
-	      dim_t     k         = bli_obj_width( a );
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	const void*     buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t     cs_a      = bli_obj_col_stride( a );
-	const dim_t     pd_a      = bli_obj_panel_dim( a );
-	const inc_t     ps_a      = bli_obj_panel_stride( a );
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
 
-	const void*     buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t     rs_b      = bli_obj_row_stride( b );
-	const dim_t     pd_b      = bli_obj_panel_dim( b );
-	const inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
-	      void*     buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t     rs_c      = bli_obj_row_stride( c );
-	const inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// Detach and multiply the scalars attached to A and B.
 	obj_t scalar_a, scalar_b;
@@ -90,9 +96,10 @@ void bli_trmm_rl_ker_var2b
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_ft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr   = bli_gemm_var_cntl_ukr( cntl );
+	const void* params     = bli_gemm_var_cntl_params( cntl );
 
-	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const void* one        = bli_obj_buffer_for_const( dt_comp, &BLIS_ONE );
 	const char* a_cast     = buf_a;
 	const char* b_cast     = buf_b;
 	      char* c_cast     = buf_c;
@@ -135,7 +142,7 @@ void bli_trmm_rl_ker_var2b
 	if ( diagoffb < 0 )
 	{
 		k        += diagoffb;
-		a_cast   -= diagoffb * PACKMR * dt_size;
+		a_cast   -= diagoffb * PACKMR * dt_a_size;
 		diagoffb  = 0;
 	}
 
@@ -160,12 +167,12 @@ void bli_trmm_rl_ker_var2b
 	const dim_t k_iter = k / NR + ( k % NR ? 1 : 0 );
 
 	// Determine some increments used to step through A, B, and C.
-	const inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_a_size;
 
-	const inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_b_size;
 
-	const inc_t rstep_c = rs_c * MR * dt_size;
-	const inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_c_size;
+	const inc_t cstep_c = cs_c * NR * dt_c_size;
 
 	auxinfo_t aux;
 
@@ -173,11 +180,15 @@ void bli_trmm_rl_ker_var2b
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
+
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel while the 'caucus' points to the thrinfo_t
 	// node for the 1st loop (ir).
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( 0, thread );
 
 	// Query the number of threads and thread ids for each loop.
 #if 0
@@ -242,7 +253,7 @@ const dim_t n_ut_for_me = -1; dim_t jr_st, ir_st;
 			// intersecting micro-panel.
 			inc_t ps_b_cur  = k_b1121 * PACKNR;
 			      ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
-			      ps_b_cur *= dt_size;
+			      ps_b_cur *= dt_b_size;
 
 			b1 += ps_b_cur;
 		}
@@ -282,7 +293,7 @@ const dim_t n_ut_for_me = -1; dim_t jr_st, ir_st;
 			// intersecting micro-panel.
 			inc_t ps_b_cur  = k_b1121 * PACKNR;
 			      ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
-			      ps_b_cur *= dt_size;
+			      ps_b_cur *= dt_b_size;
 
 			// Loop over the m dimension (MR rows at a time).
 			for ( ; i < m_iter; ++i )
@@ -293,7 +304,7 @@ const dim_t n_ut_for_me = -1; dim_t jr_st, ir_st;
 				const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
 				                      ? MR : m_left );
 
-				const char* a1_i = a1 + off_b1121 * PACKMR * dt_size;
+				const char* a1_i = a1 + off_b1121 * PACKMR * dt_a_size;
 
 				// Compute the addresses of the next panels of A and B.
 				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 );
diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c
index f8db83db2..2ae2cbfbe 100644
--- a/frame/3/trmm/bli_trmm_ru_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c
@@ -45,31 +45,37 @@ void bli_trmm_ru_ker_var2
              thrinfo_t* thread_par
      )
 {
-	const num_t     dt        = bli_obj_exec_dt( c );
-	const dim_t     dt_size   = bli_dt_size( dt );
+	const num_t  dt_comp   = bli_gemm_var_cntl_comp_dt( cntl );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
+	const num_t  dt_c      = bli_obj_dt( c );
 
-	      doff_t    diagoffb  = bli_obj_diag_offset( b );
+	const siz_t  dt_a_size = bli_dt_size( dt_a );
+	const siz_t  dt_b_size = bli_dt_size( dt_b );
+	const siz_t  dt_c_size = bli_dt_size( dt_c );
 
-	const pack_t    schema_a  = bli_obj_pack_schema( a );
-	const pack_t    schema_b  = bli_obj_pack_schema( b );
+	      doff_t diagoffb  = bli_obj_diag_offset( b );
 
-	      dim_t     m         = bli_obj_length( c );
-	      dim_t     n         = bli_obj_width( c );
-	      dim_t     k         = bli_obj_width( a );
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	const void*     buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t     cs_a      = bli_obj_col_stride( a );
-	const dim_t     pd_a      = bli_obj_panel_dim( a );
-	const inc_t     ps_a      = bli_obj_panel_stride( a );
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
 
-	const void*     buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t     rs_b      = bli_obj_row_stride( b );
-	const dim_t     pd_b      = bli_obj_panel_dim( b );
-	const inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
-	      void*     buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t     rs_c      = bli_obj_row_stride( c );
-	const inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// Detach and multiply the scalars attached to A and B.
 	obj_t scalar_a, scalar_b;
@@ -90,9 +96,10 @@ void bli_trmm_ru_ker_var2
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_ft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr   = bli_gemm_var_cntl_ukr( cntl );
+	const void* params     = bli_gemm_var_cntl_params( cntl );
 
-	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const void* one        = bli_obj_buffer_for_const( dt_comp, &BLIS_ONE );
 	const char* a_cast     = buf_a;
 	const char* b_cast     = buf_b;
 	      char* c_cast     = buf_c;
@@ -134,7 +141,7 @@ void bli_trmm_ru_ker_var2
 	if ( diagoffb > 0 )
 	{
 		n        -= diagoffb;
-		c_cast   += diagoffb * cs_c * dt_size;
+		c_cast   += diagoffb * cs_c * dt_c_size;
 		diagoffb  = 0;
 	}
 
@@ -149,18 +156,18 @@ void bli_trmm_ru_ker_var2
 	// Compute number of primary and leftover components of the m and n
 	// dimensions.
 	const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
-    const dim_t n_left = n % NR;
+	const dim_t n_left = n % NR;
 
-    const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
-    const dim_t m_left = m % MR;
+	const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	const inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_a_size;
 
-	const inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_b_size;
 
-	const inc_t rstep_c = rs_c * MR * dt_size;
-	const inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_c_size;
+	const inc_t cstep_c = cs_c * NR * dt_c_size;
 
 	auxinfo_t aux;
 
@@ -168,15 +175,19 @@ void bli_trmm_ru_ker_var2
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
+
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel. Here we query the thrinfo_t node for the
 	// 1st (ir) loop around the microkernel.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
+	thrinfo_t* caucus = bli_thrinfo_sub_node( 0, thread );
 
 	// Query the number of threads and thread ids for each loop.
-	//const dim_t jr_nt  = bli_thrinfo_n_way( thread );
-	//const dim_t jr_tid = bli_thrinfo_work_id( thread );
+	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
+	const dim_t jr_tid = bli_thrinfo_work_id( thread );
 	const dim_t ir_nt  = bli_thrinfo_n_way( caucus );
 	const dim_t ir_tid = bli_thrinfo_work_id( caucus );
 
@@ -245,7 +256,7 @@ void bli_trmm_ru_ker_var2
 			// intersecting micro-panel.
 			inc_t ps_b_cur  = k_b0111 * PACKNR;
 			      ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
-			      ps_b_cur *= dt_size;
+			      ps_b_cur *= dt_b_size;
 
 			if ( bli_trmm_my_iter_rr( j, thread ) ) {
 
@@ -257,7 +268,7 @@ void bli_trmm_ru_ker_var2
 				const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
 				                      ? MR : m_left );
 
-				const char* a1_i = a1 + off_b0111 * PACKMR * dt_size;
+				const char* a1_i = a1 + off_b0111 * PACKMR * dt_a_size;
 
 				// Compute the addresses of the next panels of A and B.
 				const char* a2 = a1;
@@ -307,8 +318,8 @@ void bli_trmm_ru_ker_var2
 	// NOTE: The definition of bli_thread_range_slrr() will depend on whether
 	// slab or round-robin partitioning was requested at configure-time.
 	// NOTE: Parallelism in the 1st loop is disabled for now.
-	bli_thread_range_slrr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc );
-	bli_thread_range_slrr( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc );
+	bli_thread_range_slrr( jr_tid, jr_nt, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_slrr( ir_tid, ir_nt, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc );
 
 	// Advance the start and end iteration offsets for the rectangular region
 	// by the number of iterations used for the triangular region.
diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2b.c b/frame/3/trmm/bli_trmm_ru_ker_var2b.c
index 8d8d3eea2..7a1cda858 100644
--- a/frame/3/trmm/bli_trmm_ru_ker_var2b.c
+++ b/frame/3/trmm/bli_trmm_ru_ker_var2b.c
@@ -45,31 +45,37 @@ void bli_trmm_ru_ker_var2b
              thrinfo_t* thread_par
      )
 {
-	const num_t     dt        = bli_obj_exec_dt( c );
-	const dim_t     dt_size   = bli_dt_size( dt );
+	const num_t  dt_comp   = bli_gemm_var_cntl_comp_dt( cntl );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
+	const num_t  dt_c      = bli_obj_dt( c );
 
-	      doff_t    diagoffb  = bli_obj_diag_offset( b );
+	const siz_t  dt_a_size = bli_dt_size( dt_a );
+	const siz_t  dt_b_size = bli_dt_size( dt_b );
+	const siz_t  dt_c_size = bli_dt_size( dt_c );
 
-	const pack_t    schema_a  = bli_obj_pack_schema( a );
-	const pack_t    schema_b  = bli_obj_pack_schema( b );
+	      doff_t diagoffb  = bli_obj_diag_offset( b );
 
-	      dim_t     m         = bli_obj_length( c );
-	      dim_t     n         = bli_obj_width( c );
-	      dim_t     k         = bli_obj_width( a );
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	const void*     buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t     cs_a      = bli_obj_col_stride( a );
-	const dim_t     pd_a      = bli_obj_panel_dim( a );
-	const inc_t     ps_a      = bli_obj_panel_stride( a );
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
 
-	const void*     buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t     rs_b      = bli_obj_row_stride( b );
-	const dim_t     pd_b      = bli_obj_panel_dim( b );
-	const inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
-	      void*     buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t     rs_c      = bli_obj_row_stride( c );
-	const inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// Detach and multiply the scalars attached to A and B.
 	obj_t scalar_a, scalar_b;
@@ -83,16 +89,17 @@ void bli_trmm_ru_ker_var2b
 	const void* buf_beta  = bli_obj_internal_scalar_buffer( c );
 
 	// Alias some constants to simpler names.
-	const dim_t     MR         = pd_a;
-	const dim_t     NR         = pd_b;
-	const dim_t     PACKMR     = cs_a;
-	const dim_t     PACKNR     = rs_b;
+	const dim_t MR         = pd_a;
+	const dim_t NR         = pd_b;
+	const dim_t PACKMR     = cs_a;
+	const dim_t PACKNR     = rs_b;
 
 	// Query the context for the micro-kernel address and cast it to its
 	// function pointer type.
-	gemm_ukr_ft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemm_ukr_ft gemm_ukr   = bli_gemm_var_cntl_ukr( cntl );
+	const void* params     = bli_gemm_var_cntl_params( cntl );
 
-	const void* one        = bli_obj_buffer_for_const( dt, &BLIS_ONE );
+	const void* one        = bli_obj_buffer_for_const( dt_comp, &BLIS_ONE );
 	const char* a_cast     = buf_a;
 	const char* b_cast     = buf_b;
 	      char* c_cast     = buf_c;
@@ -134,7 +141,7 @@ void bli_trmm_ru_ker_var2b
 	if ( diagoffb > 0 )
 	{
 		n        -= diagoffb;
-		c_cast   += diagoffb * cs_c * dt_size;
+		c_cast   += diagoffb * cs_c * dt_c_size;
 		diagoffb  = 0;
 	}
 
@@ -159,12 +166,12 @@ void bli_trmm_ru_ker_var2b
 	const dim_t k_iter = k / NR + ( k % NR ? 1 : 0 );
 
 	// Determine some increments used to step through A, B, and C.
-	const inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_a_size;
 
-	const inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_b_size;
 
-	const inc_t rstep_c = rs_c * MR * dt_size;
-	const inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_c_size;
+	const inc_t cstep_c = cs_c * NR * dt_c_size;
 
 	auxinfo_t aux;
 
@@ -172,11 +179,15 @@ void bli_trmm_ru_ker_var2b
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
+
 	// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
 	// loop around the microkernel. Here we query the thrinfo_t node for the
 	// 1st (ir) loop around the microkernel.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( 0, thread );
 
 	// Query the number of threads and thread ids for each loop.
 #if 0
@@ -240,7 +251,7 @@ const dim_t n_ut_for_me = -1; dim_t jr_st, ir_st;
 			// intersecting micro-panel.
 			inc_t ps_b_cur  = k_b0111 * PACKNR;
 			      ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
-			      ps_b_cur *= dt_size;
+			      ps_b_cur *= dt_b_size;
 
 			b1 += ps_b_cur;
 		}
@@ -280,7 +291,7 @@ const dim_t n_ut_for_me = -1; dim_t jr_st, ir_st;
 			// intersecting micro-panel.
 			inc_t ps_b_cur  = k_b0111 * PACKNR;
 			      ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
-			      ps_b_cur *= dt_size;
+			      ps_b_cur *= dt_b_size;
 
 			// Loop over the m dimension (MR rows at a time).
 			for ( ; i < m_iter; ++i )
@@ -291,7 +302,7 @@ const dim_t n_ut_for_me = -1; dim_t jr_st, ir_st;
 				const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left )
 				                      ? MR : m_left );
 
-				const char* a1_i = a1 + off_b0111 * PACKMR * dt_size;
+				const char* a1_i = a1 + off_b0111 * PACKMR * dt_a_size;
 
 				// Compute the addresses of the next panels of A and B.
 				const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 );
diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h
index 0a605ba86..b347cc941 100644
--- a/frame/3/trmm/bli_trmm_var.h
+++ b/frame/3/trmm/bli_trmm_var.h
@@ -41,7 +41,7 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t*     a, \
        const obj_t*     b, \
@@ -51,13 +51,11 @@ void PASTEMAC0(opname) \
              thrinfo_t* thread_par  \
      );
 
-GENPROT( trmm_xx_ker_var2 )
 GENPROT( trmm_ll_ker_var2 )
 GENPROT( trmm_lu_ker_var2 )
 GENPROT( trmm_rl_ker_var2 )
 GENPROT( trmm_ru_ker_var2 )
 
-GENPROT( trmm_xx_ker_var2b )
 GENPROT( trmm_ll_ker_var2b )
 GENPROT( trmm_lu_ker_var2b )
 GENPROT( trmm_rl_ker_var2b )
diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c
deleted file mode 100644
index 918b8f973..000000000
--- a/frame/3/trmm/bli_trmm_xx_ker_var2.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-static l3_var_oft vars[2][2] =
-{
-	{ bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 },
-	{ bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 }
-};
-
-void bli_trmm_xx_ker_var2
-     (
-       const obj_t*     a,
-       const obj_t*     b,
-       const obj_t*     c,
-       const cntx_t*    cntx,
-       const cntl_t*    cntl,
-             thrinfo_t* thread_par
-     )
-{
-	dim_t side;
-	dim_t uplo;
-
-	// Set two bools: one based on the implied side parameter (the structure
-	// of the root object) and one based on the uplo field of the triangular
-	// matrix's root object (whether that is matrix A or matrix B).
-	if ( bli_obj_root_is_triangular( a ) )
-	{
-		side = 0;
-		if ( bli_obj_root_is_lower( a ) ) uplo = 0;
-		else                              uplo = 1;
-	}
-	else // if ( bli_obj_root_is_triangular( b ) )
-	{
-		side = 1;
-		if ( bli_obj_root_is_lower( b ) ) uplo = 0;
-		else                              uplo = 1;
-	}
-
-	// Index into the variant array to extract the correct function pointer.
-	l3_var_oft f = vars[side][uplo];
-
-	// Call the macrokernel.
-	f
-	(
-	  a,
-	  b,
-	  c,
-	  cntx,
-	  cntl,
-	  thread_par
-	);
-}
-
diff --git a/frame/3/trmm3/bli_trmm3.h b/frame/3/trmm3/bli_trmm3.h
deleted file mode 100644
index efcaf0995..000000000
--- a/frame/3/trmm3/bli_trmm3.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "bli_trmm3_front.h"
-
diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c
deleted file mode 100644
index 88478713f..000000000
--- a/frame/3/trmm3/bli_trmm3_front.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_trmm3_front
-     (
-             side_t  side,
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm
-     )
-{
-	bli_init_once();
-
-	obj_t   a_local;
-	obj_t   b_local;
-	obj_t   c_local;
-
-	// If alpha is zero, scale by beta and return.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
-	{
-		bli_scalm( beta, c );
-		return;
-	}
-
-	// Alias A, B, and C so we can tweak the objects if necessary.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( b, &b_local );
-	bli_obj_alias_to( c, &c_local );
-
-	// Set the obj_t buffer field to the location currently implied by the row
-	// and column offsets and then zero the offsets. If any of the original
-	// obj_t's were views into larger matrices, this step effectively makes
-	// those obj_t's "forget" their lineage.
-	bli_obj_reset_origin( &a_local );
-	bli_obj_reset_origin( &b_local );
-	bli_obj_reset_origin( &c_local );
-
-	// We do not explicitly implement the cases where A is transposed.
-	// However, we can still handle them. Specifically, if A is marked as
-	// needing a transposition, we simply induce a transposition. This
-	// allows us to only explicitly implement the no-transpose cases. Once
-	// the transposition is induced, the correct algorithm will be called,
-	// since, for example, an algorithm over a transposed lower triangular
-	// matrix A moves in the same direction (forwards) as a non-transposed
-	// upper triangular matrix. And with the transposition induced, the
-	// matrix now appears to be upper triangular, so the upper triangular
-	// algorithm will grab the correct partitions, as if it were upper
-	// triangular (with no transpose) all along.
-	if ( bli_obj_has_trans( &a_local ) )
-	{
-		bli_obj_induce_trans( &a_local );
-		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
-	}
-
-#ifdef BLIS_DISABLE_TRMM3_RIGHT
-	// NOTE: This case casts right-side trmm3 in terms of left side. This is
-	// necessary when the current subconfiguration uses a gemm microkernel
-	// that assumes that the packing kernel will have already duplicated
-	// (broadcast) element of B in the packed copy of B. Supporting
-	// duplication within the logic that packs micropanels from triangular
-	// matrices would be ugly, and so we simply don't support it. As a
-	// consequence, those subconfigurations need a way to force the triangular
-	// matrix to be on the left (and thus the general matrix to the on the
-	// right). So our solution is that in those cases, the subconfigurations
-	// simply #define BLIS_DISABLE_TRMM3_RIGHT.
-
-	// NOTE: This case casts right-side trmm3 in terms of left side. This can
-	// lead to the microkernel being executed on an output matrix with the
-	// microkernel's general stride IO case (unless the microkernel supports
-	// both both row and column IO cases as well).
-
-	// NOTE: Casting right-side trmm3 in terms of left side reduces the number
-	// of macrokernels exercised to two (trmm_ll and trmm_lu).
-
-	// If A is being multiplied from the right, transpose all operands
-	// so that we can perform the computation as if A were being multiplied
-	// from the left.
-	if ( bli_is_right( side ) )
-	{
-		bli_toggle_side( &side );
-		bli_obj_induce_trans( &a_local );
-		bli_obj_induce_trans( &b_local );
-		bli_obj_induce_trans( &c_local );
-	}
-
-#else
-
-	// An optimization: If C is stored by rows and the micro-kernel prefers
-	// contiguous columns, or if C is stored by columns and the micro-kernel
-	// prefers contiguous rows, transpose the entire operation to allow the
-	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
-	{
-		bli_toggle_side( &side );
-		bli_obj_induce_trans( &a_local );
-		bli_obj_induce_trans( &b_local );
-		bli_obj_induce_trans( &c_local );
-	}
-
-	// If A is being multiplied from the right, swap A and B so that
-	// the matrix will actually be on the right.
-	if ( bli_is_right( side ) )
-	{
-		bli_obj_swap( &a_local, &b_local );
-	}
-
-#endif
-
-	// Set the pack schemas within the objects.
-	bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
-
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_TRMM3,
-	  side,
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
-	// Invoke the internal back-end.
-	bli_l3_thread_decorator
-	(
-	  bli_l3_int,
-	  BLIS_TRMM, // operation family id
-	  alpha,
-	  &a_local,
-	  &b_local,
-	  beta,
-	  &c_local,
-	  cntx,
-	  rntm
-	);
-}
-
diff --git a/frame/3/trmm3/bli_trmm3_front.h b/frame/3/trmm3/bli_trmm3_front.h
deleted file mode 100644
index dcaa4d0ee..000000000
--- a/frame/3/trmm3/bli_trmm3_front.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_trmm3_front
-     (
-             side_t  side,
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const obj_t*  beta,
-       const obj_t*  c,
-       const cntx_t* cntx,
-             rntm_t* rntm
-     );
diff --git a/frame/3/trsm/bli_trsm.h b/frame/3/trsm/bli_trsm.h
index 964422d01..c71229f4a 100644
--- a/frame/3/trsm/bli_trsm.h
+++ b/frame/3/trsm/bli_trsm.h
@@ -33,6 +33,5 @@
 */
 
 #include "bli_trsm_cntl.h"
-#include "bli_trsm_front.h"
 #include "bli_trsm_var.h"
 
diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c
index cfd1b4d7d..7defc5942 100644
--- a/frame/3/trsm/bli_trsm_blk_var1.c
+++ b/frame/3/trsm/bli_trsm_blk_var1.c
@@ -52,10 +52,10 @@ void bli_trsm_blk_var1
 	bli_obj_alias_to( c, &cp );
 
 	// Determine the direction in which to partition (forwards or backwards).
-	dir_t direct = bli_l3_direct( &ap, b, &cp, cntl );
+	const dir_t direct = bli_part_cntl_direct( cntl );
 
 	// Prune any zero region that exists along the partitioning dimension.
-	bli_l3_prune_unref_mparts_m( &ap, b, &cp, cntl );
+	bli_l3_prune_unref_mparts_m( &ap, b, &cp );
 
 	// Isolate the diagonal block A11 and its corresponding row panel C1.
 	const dim_t kc = bli_obj_width_after_trans( &ap );
@@ -66,13 +66,8 @@ void bli_trsm_blk_var1
 	                        0, kc, &cp, &c1 );
 
 	// All threads iterate over the entire diagonal block A11.
-	thrinfo_t* thread_pre = bli_thrinfo_sub_prenode( thread_par );
+	thrinfo_t* thread_pre = bli_thrinfo_sub_node( 0, thread_par );
 	dim_t my_start = 0, my_end = kc;
-	//bli_thread_range_mdim
-	//(
-	//  direct, thread_pre, &a11, b, &c1, cntl, cntx,
-	//  &my_start, &my_end
-	//);
 
 #ifdef PRINT
 	printf( "bli_trsm_blk_var1(): a11 is %d x %d at offsets (%3d, %3d)\n",
@@ -85,8 +80,9 @@ void bli_trsm_blk_var1
 	dim_t b_alg;
 	for ( dim_t i = my_start; i < my_end; i += b_alg )
 	{
-		b_alg = bli_determine_blocksize( direct, i, my_end, &a11,
-		                                 bli_cntl_bszid( cntl ), cntx );
+		b_alg = bli_determine_blocksize( direct, i, my_end,
+		                                 bli_part_cntl_blksz_alg( cntl ),
+		                                 bli_part_cntl_blksz_max( cntl ) );
 
 		// Acquire partitions for A1 and C1.
 		obj_t a11_1, c1_1;
@@ -104,13 +100,11 @@ void bli_trsm_blk_var1
 		// Perform trsm subproblem.
 		bli_l3_int
 		(
-		  &BLIS_ONE,
 		  &a11_1,
 		  b,
-		  &BLIS_ONE,
 		  &c1_1,
 		  cntx,
-		  bli_cntl_sub_prenode( cntl ),
+		  bli_cntl_sub_node( 0, cntl ),
 		  thread_pre
 		);
 	}
@@ -141,10 +135,13 @@ void bli_trsm_blk_var1
 
 	// Determine the current thread's subpartition range for the gemm
 	// subproblem over Ax1.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 1, thread_par );
 	bli_thread_range_mdim
 	(
-	  direct, thread, &ax1, b, &cx1, cntl, cntx,
+	  direct,
+	  bli_part_cntl_blksz_mult( cntl ),
+	  bli_part_cntl_use_weighted( cntl ),
+	  thread, &ax1, b, &cx1,
 	  &my_start, &my_end
 	);
 
@@ -156,8 +153,9 @@ void bli_trsm_blk_var1
 	for ( dim_t i = my_start; i < my_end; i += b_alg )
 	{
 		// Determine the current algorithmic blocksize.
-		b_alg = bli_determine_blocksize( direct, i, my_end, &ax1,
-		                                 bli_cntl_bszid( cntl ), cntx );
+		b_alg = bli_determine_blocksize( direct, i, my_end,
+		                                 bli_part_cntl_blksz_alg( cntl ),
+		                                 bli_part_cntl_blksz_max( cntl ) );
 
 		// Acquire partitions for A1 and C1.
 		obj_t a11, c1;
@@ -176,13 +174,11 @@ void bli_trsm_blk_var1
 		// function as before, since we're calling the same macrokernel.)
 		bli_l3_int
 		(
-		  &BLIS_ONE,
 		  &a11,
 		  b,
-		  &BLIS_ONE,
 		  &c1,
 		  cntx,
-		  bli_cntl_sub_node( cntl ),
+		  bli_cntl_sub_node( 1, cntl ),
 		  thread
 		);
 	}
diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c
index e86eb988a..8b65a5bbe 100644
--- a/frame/3/trsm/bli_trsm_blk_var2.c
+++ b/frame/3/trsm/bli_trsm_blk_var2.c
@@ -50,17 +50,20 @@ void bli_trsm_blk_var2
 	bli_obj_alias_to( c, &cp );
 
 	// Determine the direction in which to partition (forwards or backwards).
-	dir_t direct = bli_l3_direct( a, &bp, &cp, cntl );
+	const dir_t direct = bli_part_cntl_direct( cntl );
 
 	// Prune any zero region that exists along the partitioning dimension.
-	bli_l3_prune_unref_mparts_n( a, &bp, &cp, cntl );
+	bli_l3_prune_unref_mparts_n( a, &bp, &cp );
 
 	// Determine the current thread's subpartition range.
 	dim_t my_start, my_end;
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
 	bli_thread_range_ndim
 	(
-	  direct, thread, a, &bp, &cp, cntl, cntx,
+	  direct,
+	  bli_part_cntl_blksz_mult( cntl ),
+	  bli_part_cntl_use_weighted( cntl ),
+	  thread, a, &bp, &cp,
 	  &my_start, &my_end
 	);
 
@@ -69,8 +72,9 @@ void bli_trsm_blk_var2
 	for ( dim_t i = my_start; i < my_end; i += b_alg )
 	{
 		// Determine the current algorithmic blocksize.
-		b_alg = bli_determine_blocksize( direct, i, my_end, &bp,
-		                                 bli_cntl_bszid( cntl ), cntx );
+		b_alg = bli_determine_blocksize( direct, i, my_end,
+		                                 bli_part_cntl_blksz_alg( cntl ),
+		                                 bli_part_cntl_blksz_max( cntl ) );
 
 		// Acquire partitions for B1 and C1.
 		obj_t b1, c1;
@@ -82,13 +86,11 @@ void bli_trsm_blk_var2
 		// Perform trsm subproblem.
 		bli_l3_int
 		(
-		  &BLIS_ONE,
 		  a,
 		  &b1,
-		  &BLIS_ONE,
 		  &c1,
 		  cntx,
-		  bli_cntl_sub_node( cntl ),
+		  bli_cntl_sub_node( 0, cntl ),
 		  thread
 		);
 	}
diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c
index 77a3b77d1..3f4e8a1e9 100644
--- a/frame/3/trsm/bli_trsm_blk_var3.c
+++ b/frame/3/trsm/bli_trsm_blk_var3.c
@@ -49,13 +49,13 @@ void bli_trsm_blk_var3
 	bli_obj_alias_to( b, &bp );
 	bli_obj_alias_to( c, &cs );
 
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
 
 	// Determine the direction in which to partition (forwards or backwards).
-	dir_t direct = bli_l3_direct( &ap, &bp, &cs, cntl );
+	const dir_t direct = bli_part_cntl_direct( cntl );
 
 	// Prune any zero region that exists along the partitioning dimension.
-	bli_l3_prune_unref_mparts_k( &ap, &bp, &cs, cntl );
+	bli_l3_prune_unref_mparts_k( &ap, &bp, &cs );
 
 	// Query dimension in partitioning direction.
 	dim_t k_trans = bli_obj_width_after_trans( &ap );
@@ -65,8 +65,9 @@ void bli_trsm_blk_var3
 	for ( dim_t i = 0; i < k_trans; i += b_alg )
 	{
 		// Determine the current algorithmic blocksize.
-		b_alg = bli_l3_determine_kc( direct, i, k_trans, &ap, &bp,
-		                             bli_cntl_bszid( cntl ), cntx, cntl );
+		b_alg = bli_determine_blocksize( direct, i, k_trans,
+		                                 bli_part_cntl_blksz_alg( cntl ),
+		                                 bli_part_cntl_blksz_max( cntl ) );
 
 		// Acquire partitions for A1 and B1.
 		obj_t a1, b1;
@@ -78,13 +79,11 @@ void bli_trsm_blk_var3
 		// Perform trsm subproblem.
 		bli_l3_int
 		(
-		  &BLIS_ONE,
 		  &a1,
 		  &b1,
-		  &BLIS_ONE,
 		  &cs,
 		  cntx,
-		  bli_cntl_sub_node( cntl ),
+		  bli_cntl_sub_node( 0, cntl ),
 		  thread
 		);
 
diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c
index d036e94c7..bc0ebb8f7 100644
--- a/frame/3/trsm/bli_trsm_cntl.c
+++ b/frame/3/trsm/bli_trsm_cntl.c
@@ -35,104 +35,300 @@
 
 #include "blis.h"
 
-cntl_t* bli_trsm_cntl_create
+
+static packm_ker_ft GENARRAY2_MIXP(packm_struc_cxk,packm_struc_cxk);
+
+void bli_trsm_var_cntl_init_node
      (
-       pool_t* pool,
-       side_t  side,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       void_fp ker
+       void_fp          var_func,
+       num_t            dt_comp,
+       num_t            dt_out,
+       gemmtrsm_ukr_ft  gemmtrsm_ukr,
+       gemm_ukr_ft      gemm_ukr,
+       gemm_ukr_ft      real_gemm_ukr,
+       bool             row_pref,
+       dim_t            mr,
+       dim_t            nr,
+       dim_t            mr_pack,
+       dim_t            nr_pack,
+       dim_t            mr_bcast,
+       dim_t            nr_bcast,
+       dim_t            mr_scale,
+       dim_t            nr_scale,
+       trsm_var_cntl_t* cntl
      )
 {
-	if ( bli_is_left( side ) )
-		return bli_trsm_l_cntl_create( pool, schema_a, schema_b, ker );
-	else
-		return bli_trsm_r_cntl_create( pool, schema_a, schema_b, ker );
+	// Initialize the embedded gemm_var_cntl_t struct.
+	bli_gemm_var_cntl_init_node
+	(
+	  var_func,
+	  dt_comp,
+	  dt_out,
+	  gemm_ukr,
+	  real_gemm_ukr,
+	  row_pref,
+	  mr,
+	  nr,
+	  mr_scale,
+	  nr_scale,
+	  ( gemm_var_cntl_t* )cntl
+	);
+
+	// Initialize the trsm_var_cntl_t struct.
+	cntl->gemmtrsm_ukr  = gemmtrsm_ukr;
+	cntl->mr_pack       = mr_pack;
+	cntl->nr_pack       = nr_pack;
+	cntl->mr_bcast      = mr_bcast;
+	cntl->nr_bcast      = nr_bcast;
 }
 
-cntl_t* bli_trsm_l_cntl_create
+void bli_trsm_cntl_init
      (
-       pool_t* pool,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       void_fp ker
+             ind_t        im,
+       const obj_t*       alpha,
+             obj_t*       a,
+             obj_t*       b,
+       const obj_t*       beta,
+             obj_t*       c,
+       const cntx_t*      cntx,
+             trsm_cntl_t* cntl
      )
 {
-	void_fp macro_kernel_p;
-
-	// Set the default macrokernel. If a non-NULL kernel function pointer is
-	// passed in, we use that instead.
-	macro_kernel_p = bli_trsm_xx_ker_var2;
-	if ( ker ) macro_kernel_p = ker;
+	if ( bli_obj_is_triangular( a ) )
+		bli_trsm_l_cntl_init( im, alpha, a, b, beta, c, cntx, cntl );
+	else
+		bli_check_error_code(BLIS_NOT_YET_IMPLEMENTED);
+		//bli_trsm_r_cntl_init( im, alpha, a, b, beta, c, cntx, cntl );
+}
 
-	const opid_t family = BLIS_TRSM;
+void bli_trsm_l_cntl_init
+     (
+             ind_t        im,
+       const obj_t*       alpha,
+             obj_t*       a,
+             obj_t*       b,
+       const obj_t*       beta,
+             obj_t*       c,
+       const cntx_t*      cntx,
+             trsm_cntl_t* cntl
+     )
+{
+	const prec_t           comp_prec      = bli_obj_comp_prec( c );
+	const num_t            dt_a           = bli_obj_dt( a );
+	const num_t            dt_b           = bli_obj_dt( b );
+	const num_t            dt_c           = bli_obj_dt( c );
+	const num_t            dt_ap          = bli_dt_domain( dt_a ) | comp_prec;
+	const num_t            dt_bp          = bli_dt_domain( dt_b ) | comp_prec;
+	const num_t            dt_comp        = ( im == BLIS_1M ? BLIS_REAL
+	                                                        : bli_dt_domain( dt_c )
+	                                        ) | comp_prec;
+
+	const void_fp          macro_kernel_p = bli_obj_is_lower( a ) ? bli_trsm_ll_ker_var2
+	                                                              : bli_trsm_lu_ker_var2;
+	      gemmtrsm_ukr_ft  gemmtrsm_ukr   = bli_obj_is_lower( a )
+	                                        ? bli_cntx_get_ukr_dt( dt_comp, BLIS_GEMMTRSM_L_UKR, cntx )
+	                                        : bli_cntx_get_ukr_dt( dt_comp, BLIS_GEMMTRSM_U_UKR, cntx );
+	      gemm_ukr_ft      gemm_ukr       = bli_cntx_get_ukr_dt( dt_comp, BLIS_GEMM_UKR, cntx );
+	      gemm_ukr_ft      real_gemm_ukr  = NULL;
+	const dir_t            direct         = bli_obj_is_lower( a ) ? BLIS_FWD
+	                                                              : BLIS_BWD;
+	const bool             row_pref       = bli_cntx_get_ukr_prefs_dt( dt_comp, BLIS_GEMM_UKR_ROW_PREF, cntx );
+	      pack_t           schema_a       = BLIS_PACKED_PANELS;
+	      pack_t           schema_b       = BLIS_PACKED_PANELS;
+	const packm_ker_ft     packm_a_ukr    = packm_struc_cxk[ dt_a ][ dt_ap ];
+	const packm_ker_ft     packm_b_ukr    = packm_struc_cxk[ dt_b ][ dt_bp ];
+	const dim_t            mr_def         = bli_cntx_get_blksz_def_dt( dt_comp, BLIS_MR, cntx );
+	const dim_t            mr_pack        = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_MR, cntx );
+	const dim_t            mr_bcast       = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_BBM, cntx );
+	      dim_t            mr_scale       = 1;
+	      dim_t            mr_pack_scale  = 1;
+	const dim_t            nr_def         = bli_cntx_get_blksz_def_dt( dt_comp, BLIS_NR, cntx );
+	const dim_t            nr_pack        = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_NR, cntx );
+	const dim_t            nr_bcast       = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_BBN, cntx );
+	      dim_t            nr_scale       = 1;
+	      dim_t            nr_pack_scale  = 1;
+	const dim_t            kr_def         = bli_cntx_get_blksz_def_dt( dt_comp, BLIS_KR, cntx );
+	const dim_t            mc_def         = bli_cntx_get_blksz_def_dt( dt_comp, BLIS_MC, cntx );
+	const dim_t            mc_max         = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_MC, cntx );
+	      dim_t            mc_scale       = 1;
+	const dim_t            nc_def         = bli_cntx_get_blksz_def_dt( dt_comp, BLIS_NC, cntx );
+	const dim_t            nc_max         = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_NC, cntx );
+	      dim_t            nc_scale       = 1;
+	const dim_t            kc_def         = bli_cntx_get_blksz_def_dt( dt_comp, BLIS_KC, cntx );
+	const dim_t            kc_max         = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_KC, cntx );
+	      dim_t            kc_scale       = 1;
+
+	if ( im == BLIS_1M )
+	{
+		if ( ! row_pref )
+		{
+			schema_a = BLIS_PACKED_PANELS_1E;
+			schema_b = BLIS_PACKED_PANELS_1R;
+			mr_scale = 2;
+			mc_scale = 2;
+			mr_pack_scale = 1; //don't divide PACKMR by 2 since we are also doubling k
+		}
+		else
+		{
+			schema_a = BLIS_PACKED_PANELS_1R;
+			schema_b = BLIS_PACKED_PANELS_1E;
+			nr_scale = 2;
+			nc_scale = 2;
+			nr_pack_scale = 1; //don't divide PACKNR by 2 since we are also doubling k
+		}
+
+		kc_scale = 2;
+		real_gemm_ukr = gemm_ukr;
+		gemm_ukr = bli_cntx_get_ukr_dt( dt_comp, BLIS_GEMM1M_UKR, cntx );
+		gemmtrsm_ukr = bli_obj_is_lower( a )
+		               ? bli_cntx_get_ukr_dt( dt_comp, BLIS_GEMMTRSM1M_L_UKR, cntx )
+		               : bli_cntx_get_ukr_dt( dt_comp, BLIS_GEMMTRSM1M_U_UKR, cntx );
+	}
+
+	// If alpha is non-unit, typecast and apply it to the scalar attached
+	// to B, unless B is triangular, in which case it is applied to A.
+	if ( bli_obj_root_is_triangular( b ) )
+	{
+		if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
+			bli_obj_scalar_apply_scalar( alpha, a );
+	}
+	else // if ( !bli_obj_root_is_triangular( b ) )
+	{
+		if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
+			bli_obj_scalar_apply_scalar( alpha, b );
+	}
+
+	// If beta is non-unit, typecast and apply it to the scalar attached
+	// to C.
+	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
+		bli_obj_scalar_apply_scalar( beta, c );
 
 	//
 	// Create nodes for packing A and the macro-kernel (gemm branch).
 	//
 
-	cntl_t* gemm_cntl_bu_ke = bli_trsm_cntl_create_node
+	bli_cntl_init_node
 	(
-	  pool,         // the thread's sba pool
-	  family,       // the operation family
-	  BLIS_MR,
 	  NULL,         // variant function pointer not used
-	  NULL          // no sub-node; this is the leaf of the tree.
+	  &cntl->ir_loop_gemm
 	);
 
-	cntl_t* gemm_cntl_bp_bu = bli_trsm_cntl_create_node
+	bli_trsm_var_cntl_init_node
 	(
-	  pool,
-	  family,
-	  BLIS_NR,
 	  macro_kernel_p,
-	  gemm_cntl_bu_ke
+	  dt_comp,
+	  dt_c,
+	  gemmtrsm_ukr,
+	  gemm_ukr,
+	  real_gemm_ukr,
+	  row_pref,
+	  mr_def / mr_scale,
+	  nr_def / nr_scale,
+	  mr_pack,
+	  nr_pack,
+	  mr_bcast,
+	  nr_bcast,
+	  mr_scale,
+	  nr_scale,
+	  &cntl->gemm_ker
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_MR | BLIS_THREAD_NR,
+	  ( cntl_t* )&cntl->ir_loop_gemm,
+	  ( cntl_t* )&cntl->gemm_ker
 	);
 
+	// Give the trsm kernel control tree node to the
+	// virtual microkernel as the parameters, so that e.g.
+	// the 1m virtual microkernel can look up the real-domain
+	// micro-kernel and its parameters.
+	bli_trsm_var_cntl_set_params( &cntl->gemm_ker, ( cntl_t* )&cntl->gemm_ker );
+
 	// Create a node for packing matrix A.
-	cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
+	bli_packm_def_cntl_init_node
 	(
-	  pool,
 	  bli_l3_packa, // trsm operation's packm function for A.
-	  BLIS_MR,
-	  BLIS_MR,
+	  dt_a,
+	  dt_ap,
+	  dt_comp,
+	  packm_a_ukr,
+	  mr_def / mr_scale,
+	  mr_pack,
+	  mr_bcast,
+	  mr_scale,
+	  mr_pack_scale,
+	  mr_def / mr_scale,
 	  FALSE,        // do NOT invert diagonal
 	  TRUE,         // reverse iteration if upper?
 	  FALSE,        // reverse iteration if lower?
-	  schema_a,     // normally BLIS_PACKED_ROW_PANELS
+	  schema_a,
 	  BLIS_BUFFER_FOR_A_BLOCK,
-	  gemm_cntl_bp_bu
+	  &cntl->pack_a_gemm
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_NONE,
+	  ( cntl_t* )&cntl->gemm_ker,
+	  ( cntl_t* )&cntl->pack_a_gemm
 	);
 
 	//
 	// Create nodes for packing A and the macro-kernel (trsm branch).
 	//
 
-	cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node
+	bli_cntl_init_node
 	(
-	  pool,         // the thread's sba pool
-	  family,       // the operation family
-	  BLIS_MR,
-	  NULL,         // variant function pointer not used
-	  NULL          // no sub-node; this is the leaf of the tree.
+	  NULL,
+	  &cntl->ir_loop_trsm
 	);
 
-	cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node
+	bli_trsm_var_cntl_init_node
 	(
-	  pool,
-	  family,
-	  BLIS_NR,
 	  macro_kernel_p,
-	  trsm_cntl_bu_ke
+	  dt_comp,
+	  dt_c,
+	  gemmtrsm_ukr,
+	  gemm_ukr,
+	  real_gemm_ukr,
+	  row_pref,
+	  mr_def / mr_scale,
+	  nr_def / nr_scale,
+	  mr_pack,
+	  nr_pack,
+	  mr_bcast,
+	  nr_bcast,
+	  mr_scale,
+	  nr_scale,
+	  &cntl->trsm_ker
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_MC | BLIS_THREAD_KC | BLIS_THREAD_NR,
+	  ( cntl_t* )&cntl->ir_loop_trsm,
+	  ( cntl_t* )&cntl->trsm_ker
 	);
 
+	// Give the trsm kernel control tree node to the
+	// virtual microkernel as the parameters, so that e.g.
+	// the 1m virtual microkernel can look up the real-domain
+	// micro-kernel and its parameters.
+	bli_trsm_var_cntl_set_params( &cntl->trsm_ker, ( cntl_t* )&cntl->trsm_ker );
+
 	// Create a node for packing matrix A.
-	cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
+	bli_packm_def_cntl_init_node
 	(
-	  pool,
 	  bli_l3_packa, // trsm operation's packm function for A.
-	  BLIS_MR,
-	  BLIS_MR,
+	  dt_a,
+	  dt_ap,
+	  dt_comp,
+	  packm_a_ukr,
+	  mr_def / mr_scale,
+	  mr_pack,
+	  mr_bcast,
+	  mr_scale,
+	  mr_pack_scale,
+	  mr_def / mr_scale,
 #ifdef BLIS_ENABLE_TRSM_PREINVERSION
 	  TRUE,         // invert diagonal
 #else
@@ -140,185 +336,327 @@ cntl_t* bli_trsm_l_cntl_create
 #endif
 	  TRUE,         // reverse iteration if upper?
 	  FALSE,        // reverse iteration if lower?
-	  schema_a,     // normally BLIS_PACKED_ROW_PANELS
+	  schema_a,
 	  BLIS_BUFFER_FOR_A_BLOCK,
-	  trsm_cntl_bp_bu
+	  &cntl->pack_a_trsm
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_NONE,
+	  ( cntl_t* )&cntl->trsm_ker,
+	  ( cntl_t* )&cntl->pack_a_trsm
 	);
 
 	// -------------------------------------------------------------------------
 
 	// Create a node for partitioning the m dimension by MC.
 	// NOTE: We attach the gemm sub-tree as the main branch.
-	cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node
+	bli_part_cntl_init_node
 	(
-	  pool,
-	  family,
-	  BLIS_MC,
 	  bli_trsm_blk_var1,
-	  gemm_cntl_packa
+	  dt_comp,
+	  mc_def / mc_scale,
+	  mc_max / mc_scale,
+	  mc_scale,
+	  mr_def / mr_scale,
+	  mr_scale,
+	  direct,
+	  FALSE,
+	  &cntl->part_ic
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_NONE,
+	  ( cntl_t* )&cntl->pack_a_trsm,
+	  ( cntl_t* )&cntl->part_ic
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_MC | BLIS_THREAD_KC,
+	  ( cntl_t* )&cntl->pack_a_gemm,
+	  ( cntl_t* )&cntl->part_ic
 	);
-
-	// Attach the trsm sub-tree as the auxiliary "prenode" branch.
-	bli_cntl_set_sub_prenode( trsm_cntl_packa, trsm_cntl_op_bp );
 
 	// -------------------------------------------------------------------------
 
 	// Create a node for packing matrix B.
-	cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
+	bli_packm_def_cntl_init_node
 	(
-	  pool,
 	  bli_l3_packb,
-	  BLIS_NR,
-	  BLIS_MR,
+	  dt_b,
+	  dt_bp,
+	  dt_comp,
+	  packm_b_ukr,
+	  nr_def / nr_scale,
+	  nr_pack,
+	  nr_bcast,
+	  nr_scale,
+	  nr_pack_scale,
+	  mr_def / mr_scale,
 	  FALSE,        // do NOT invert diagonal
 	  FALSE,        // reverse iteration if upper?
 	  FALSE,        // reverse iteration if lower?
-	  schema_b,     // normally BLIS_PACKED_COL_PANELS
+	  schema_b,
 	  BLIS_BUFFER_FOR_B_PANEL,
-	  trsm_cntl_op_bp
+	  &cntl->pack_b
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_NONE,
+	  ( cntl_t* )&cntl->part_ic,
+	  ( cntl_t* )&cntl->pack_b
 	);
 
 	// Create a node for partitioning the k dimension by KC.
-	cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node
+	bli_part_cntl_init_node
 	(
-	  pool,
-	  family,
-	  BLIS_KC,
 	  bli_trsm_blk_var3,
-	  trsm_cntl_packb
+	  dt_comp,
+	  kc_def / kc_scale,
+	  kc_max / kc_scale,
+	  kc_scale,
+	  kr_def,
+	  1,
+	  direct,
+	  FALSE,
+	  &cntl->part_pc
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_NONE,
+	  ( cntl_t* )&cntl->pack_b,
+	  ( cntl_t* )&cntl->part_pc
 	);
 
 	// Create a node for partitioning the n dimension by NC.
-	cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node
+	bli_part_cntl_init_node
 	(
-	  pool,
-	  family,
-	  BLIS_NC,
 	  bli_trsm_blk_var2,
-	  trsm_cntl_mm_op
+	  dt_comp,
+	  nc_def / nc_scale,
+	  nc_max / nc_scale,
+	  nc_scale,
+	  nr_def / nr_scale,
+	  nr_scale,
+	  BLIS_FWD,
+	  FALSE,
+	  &cntl->part_jc
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_NC,
+	  ( cntl_t* )&cntl->part_pc,
+	  ( cntl_t* )&cntl->part_jc
 	);
 
-	return trsm_cntl_vl_mm;
+	bli_trsm_cntl_finalize( cntl );
 }
 
-cntl_t* bli_trsm_r_cntl_create
+#if 0
+
+void bli_trsm_r_cntl_init
      (
-       pool_t* pool,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       void_fp ker
+             ind_t        im,
+       const obj_t*       alpha,
+             obj_t*       a,
+             obj_t*       b,
+       const obj_t*       beta,
+             obj_t*       c,
+       const cntx_t*      cntx,
+             trsm_cntl_t* cntl
      )
 {
-	// NOTE: trsm macrokernels are presently disabled for right-side execution.
-	// Set the default macrokernel. If a non-NULL kernel function pointer is
-	// passed in, we use that instead.
-	void_fp macro_kernel_p = bli_trsm_xx_ker_var2;
-	if ( ker ) macro_kernel_p = ker;
-
-	const opid_t family = BLIS_TRSM;
+	const num_t            dt_a           = bli_obj_dt( a );
+	const num_t            dt_b           = bli_obj_dt( b );
+	const num_t            dt_ap          = bli_obj_target_dt( a );
+	const num_t            dt_bp          = bli_obj_target_dt( b );
+	const num_t            dt_exec        = bli_obj_exec_dt( c );
+
+	const void_fp          macro_kernel_p = bli_obj_is_lower( b ) ? bli_trsm_rl_ker_var2 : bli_trsm_ru_ker_var2;
+	const gemmtrsm_ukr_vft gemmtrsm_ukr   = bli_obj_is_lower( b )
+	    ? bli_cntx_get_ukr_dt( dt_exec, BLIS_GEMMTRSM_L_UKR, cntx )
+	    : bli_cntx_get_ukr_dt( dt_exec, BLIS_GEMMTRSM_U_UKR, cntx );
+	const gemm_ukr_vft     gemm_ukr       = bli_cntx_get_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
+
+	const dir_t            direct         = bli_obj_is_lower( b ) ? BLIS_BWD : BLIS_FWD;
+	const dim_t            ic_alg         = bli_cntx_get_blksz_def_dt( dt_exec, BLIS_MC, cntx );
+	const dim_t            ic_max         = bli_cntx_get_blksz_max_dt( dt_exec, BLIS_MC, cntx );
+	const dim_t            ic_mult        = bli_cntx_get_blksz_def_dt( dt_exec, BLIS_NR, cntx ); //note: different!
+	      dim_t            pc_alg         = bli_cntx_get_blksz_def_dt( dt_exec, BLIS_KC, cntx );
+	      dim_t            pc_max         = bli_cntx_get_blksz_max_dt( dt_exec, BLIS_KC, cntx );
+	const dim_t            pc_mult        = bli_cntx_get_blksz_def_dt( dt_exec, BLIS_KR, cntx );
+	const dim_t            jc_alg         = bli_cntx_get_blksz_def_dt( dt_exec, BLIS_NC, cntx );
+	const dim_t            jc_max         = bli_cntx_get_blksz_max_dt( dt_exec, BLIS_NC, cntx );
+	const dim_t            jc_mult        = bli_cntx_get_blksz_def_dt( dt_exec, BLIS_MR, cntx ); //note: different!
+
+	const dim_t            bmult_m_def    = bli_cntx_get_blksz_def_dt(   dt_ap, BLIS_NR, cntx );
+	const dim_t            bmult_m_pack   = bli_cntx_get_blksz_max_dt(   dt_ap, BLIS_NR, cntx );
+	const dim_t            bmult_n_def    = bli_cntx_get_blksz_def_dt(   dt_bp, BLIS_MR, cntx );
+	const dim_t            bmult_n_pack   = bli_cntx_get_blksz_max_dt(   dt_bp, BLIS_MR, cntx );
+	const dim_t            bmult_k_def    = bmult_n_def;
+
+	bli_l3_adjust_kc
+	(
+	  BLIS_TRSM,
+	  a,
+	  b,
+	  &pc_alg,
+	  &pc_max,
+	  ic_mult,
+	  jc_mult
+	);
 
 	// Create two nodes for the macro-kernel.
-	cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node
+	bli_cntl_init_node
 	(
-	  pool,
-	  family,
-	  BLIS_MR, // needed for bli_thrinfo_rgrow()
-	  NULL,    // variant function pointer not used
-	  NULL     // no sub-node; this is the leaf of the tree.
+	  NULL,         // variant function pointer not used
+	  &cntl->ir_loop_trsm
 	);
 
-	cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node
+	bli_trsm_var_cntl_init_node
 	(
-	  pool,
-	  family,
-	  BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
 	  macro_kernel_p,
-	  trsm_cntl_bu_ke
+	  gemmtrsm_ukr,
+	  gemm_ukr,
+	  &cntl->trsm_ker
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_NONE,
+	  ( cntl_t* )&cntl->ir_loop_trsm,
+	  ( cntl_t* )&cntl->trsm_ker
 	);
 
 	// Create a node for packing matrix A.
-	cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
+	bli_packm_def_cntl_init_node
 	(
-	  pool,
 	  bli_l3_packa,
-	  BLIS_NR,
-	  BLIS_MR,
+	  dt_a,
+	  dt_ap,
+	  bmult_m_def,
+	  bmult_m_pack,
+	  bmult_k_def,
 	  FALSE,   // do NOT invert diagonal
 	  FALSE,   // reverse iteration if upper?
 	  FALSE,   // reverse iteration if lower?
-	  schema_a, // normally BLIS_PACKED_ROW_PANELS
+	  schema_a, // normally BLIS_PACKED_PANELS
 	  BLIS_BUFFER_FOR_A_BLOCK,
-	  trsm_cntl_bp_bu
+	  &cntl->pack_a_trsm
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_NONE,
+	  ( cntl_t* )&cntl->trsm_ker,
+	  ( cntl_t* )&cntl->pack_a_trsm
 	);
 
 	// Create a node for partitioning the m dimension by MC.
-	cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node
+	bli_part_cntl_init_node
 	(
-	  pool,
-	  family,
-	  BLIS_MC,
 	  bli_trsm_blk_var1,
-	  trsm_cntl_packa
+	  ic_alg,
+	  ic_max,
+	  ic_mult,
+	  BLIS_FWD,
+	  FALSE,
+	  &cntl->part_ic
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_MC | BLIS_THREAD_KC | BLIS_THREAD_NC | BLIS_THREAD_MR | BLIS_THREAD_NR,
+	  ( cntl_t* )&cntl->pack_a_trsm,
+	  ( cntl_t* )&cntl->part_ic
 	);
 
 	// Create a node for packing matrix B.
-	cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
+	bli_packm_def_cntl_init_node
 	(
-	  pool,
 	  bli_l3_packb,
-	  BLIS_MR,
-	  BLIS_MR,
+	  dt_b,
+	  dt_bp,
+	  bmult_n_def,
+	  bmult_n_pack,
+	  bmult_k_def,
 	  TRUE,    // do NOT invert diagonal
 	  FALSE,   // reverse iteration if upper?
 	  TRUE,    // reverse iteration if lower?
-	  schema_b, // normally BLIS_PACKED_COL_PANELS
+	  schema_b, // normally BLIS_PACKED_PANELS
 	  BLIS_BUFFER_FOR_B_PANEL,
-	  trsm_cntl_op_bp
+	  &cntl->pack_b
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_NONE,
+	  ( cntl_t* )&cntl->part_ic,
+	  ( cntl_t* )&cntl->pack_b
 	);
 
 	// Create a node for partitioning the k dimension by KC.
-	cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node
+	bli_part_cntl_init_node
 	(
-	  pool,
-	  family,
-	  BLIS_KC,
 	  bli_trsm_blk_var3,
-	  trsm_cntl_packb
+	  pc_alg,
+	  pc_max,
+	  pc_mult,
+	  direct,
+	  FALSE,
+	  &cntl->part_pc
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_NONE,
+	  ( cntl_t* )&cntl->pack_b,
+	  ( cntl_t* )&cntl->part_pc
 	);
 
 	// Create a node for partitioning the n dimension by NC.
-	cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node
+	bli_part_cntl_init_node
 	(
-	  pool,
-	  family,
-	  BLIS_NC,
 	  bli_trsm_blk_var2,
-	  trsm_cntl_mm_op
+	  jc_alg,
+	  jc_max,
+	  jc_mult,
+	  direct,
+	  FALSE,
+	  &cntl->part_jc
+	);
+	bli_cntl_attach_sub_node
+	(
+	  BLIS_THREAD_NONE,
+	  ( cntl_t* )&cntl->part_pc,
+	  ( cntl_t* )&cntl->part_jc
 	);
-
-	return trsm_cntl_vl_mm;
 }
 
-void bli_trsm_cntl_free
+#endif
+
+void bli_trsm_cntl_finalize
      (
-       pool_t* pool,
-       cntl_t* cntl
+       trsm_cntl_t* cntl
      )
 {
-	bli_cntl_free( pool, cntl );
-}
+	const dim_t ic_mult = bli_part_cntl_blksz_mult( ( cntl_t* )&cntl->part_ic );
 
-// -----------------------------------------------------------------------------
+	//
+	// Ensure that:
+	//
+	// 1. KC is a multiple of MR (NR) if A (B) is triangular, hermitian, or symmetric.
+	//    KC is always rounded up.
+	//
+	// 2. MC and NC are multiples of MR and NR, respectively. MC and NC are always
+	//    rounded down.
+	//
 
-cntl_t* bli_trsm_cntl_create_node
-     (
-       pool_t* pool,
-       opid_t  family,
-       bszid_t bszid,
-       void_fp var_func,
-       cntl_t* sub_node
-     )
-{
-	return bli_cntl_create_node( pool, family, bszid, var_func, NULL, sub_node );
+	// Nudge the default and maximum kc blocksizes up to the nearest
+	// multiple of MR if A is Hermitian, symmetric, or triangular or
+	// NR if B is Hermitian, symmetric, or triangular. If neither case
+	// applies, then we leave the blocksizes unchanged. For trsm we
+	// always use MR (rather than sometimes using NR) because even
+	// when the triangle is on the right, packing of that matrix uses
+	// MR, since only left-side trsm micro-kernels are supported.
+	bli_part_cntl_align_blksz_to_mult( ic_mult, true, ( cntl_t* )&cntl->part_pc );
+
+	bli_part_cntl_align_blksz( false, ( cntl_t* )&cntl->part_ic );
+	bli_part_cntl_align_blksz( false, ( cntl_t* )&cntl->part_jc );
 }
 
diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h
index a23120ff8..94d9df5b4 100644
--- a/frame/3/trsm/bli_trsm_cntl.h
+++ b/frame/3/trsm/bli_trsm_cntl.h
@@ -33,45 +33,551 @@
 
 */
 
-cntl_t* bli_trsm_cntl_create
+struct trsm_var_cntl_s
+{
+	gemm_var_cntl_t gemm; //this field must be present and come first
+	gemmtrsm_ukr_ft gemmtrsm_ukr;
+	dim_t           mr_pack;
+	dim_t           nr_pack;
+	dim_t           mr_bcast;
+	dim_t           nr_bcast;
+};
+typedef struct trsm_var_cntl_s trsm_var_cntl_t;
+
+// -----------------------------------------------------------------------------
+
+BLIS_INLINE gemmtrsm_ukr_ft bli_trsm_var_cntl_gemmtrsm_ukr( const cntl_t* cntl )
+{
+	return ( ( const trsm_var_cntl_t* ) cntl )->gemmtrsm_ukr;
+}
+
+BLIS_INLINE gemm_ukr_ft bli_trsm_var_cntl_gemm_ukr( const cntl_t* cntl )
+{
+	return bli_gemm_var_cntl_ukr( cntl );
+}
+
+BLIS_INLINE gemm_ukr_ft bli_trsm_var_cntl_real_gemm_ukr( const cntl_t* cntl )
+{
+	return bli_gemm_var_cntl_real_ukr( cntl );
+}
+
+BLIS_INLINE bool bli_trsm_var_cntl_row_pref( const cntl_t* cntl )
+{
+	return bli_gemm_var_cntl_row_pref( cntl );
+}
+
+BLIS_INLINE const void* bli_trsm_var_cntl_params( const cntl_t* cntl )
+{
+	return bli_gemm_var_cntl_params( cntl );
+}
+
+BLIS_INLINE const void* bli_trsm_var_cntl_real_params( const cntl_t* cntl )
+{
+	return bli_gemm_var_cntl_real_params( cntl );
+}
+
+BLIS_INLINE dim_t bli_trsm_var_cntl_mr( const cntl_t* cntl )
+{
+	return bli_gemm_var_cntl_mr( cntl );
+}
+
+BLIS_INLINE dim_t bli_trsm_var_cntl_nr( const cntl_t* cntl )
+{
+	return bli_gemm_var_cntl_nr( cntl );
+}
+
+BLIS_INLINE dim_t bli_trsm_var_cntl_mr_pack( const cntl_t* cntl )
+{
+	return ( ( const trsm_var_cntl_t* ) cntl )->mr_pack;
+}
+
+BLIS_INLINE dim_t bli_trsm_var_cntl_nr_pack( const cntl_t* cntl )
+{
+	return ( ( const trsm_var_cntl_t* ) cntl )->nr_pack;
+}
+
+BLIS_INLINE dim_t bli_trsm_var_cntl_mr_bcast( const cntl_t* cntl )
+{
+	return ( ( const trsm_var_cntl_t* ) cntl )->mr_bcast;
+}
+
+BLIS_INLINE dim_t bli_trsm_var_cntl_nr_bcast( const cntl_t* cntl )
+{
+	return ( ( const trsm_var_cntl_t* ) cntl )->nr_bcast;
+}
+
+// -----------------------------------------------------------------------------
+
+BLIS_INLINE void bli_trsm_var_cntl_set_gemmtrsm_ukr( const func2_t* ukr, cntl_t* cntl_ )
+{
+	trsm_var_cntl_t* cntl = ( trsm_var_cntl_t* )cntl_;
+	num_t dt_comp = cntl->gemm.dt_comp;
+	num_t dt_out = cntl->gemm.dt_out;
+	cntl->gemmtrsm_ukr = ( gemmtrsm_ukr_ft )bli_func2_get_dt( dt_comp, dt_out, ukr );
+}
+
+BLIS_INLINE err_t bli_trsm_var_cntl_set_gemmtrsm_ukr_simple( const func_t* ukr, cntl_t* cntl_ )
+{
+	trsm_var_cntl_t* cntl = ( trsm_var_cntl_t* )cntl_;
+	num_t dt_comp = cntl->gemm.dt_comp;
+	num_t dt_out = cntl->gemm.dt_out;
+	if ( dt_comp != dt_out )
+		return BLIS_INCONSISTENT_DATATYPES;
+	cntl->gemmtrsm_ukr = ( gemmtrsm_ukr_ft )bli_func_get_dt( dt_comp, ukr );
+	return BLIS_SUCCESS;
+}
+
+BLIS_INLINE void bli_trsm_var_cntl_set_gemm_ukr( const func2_t* ukr, cntl_t* cntl )
+{
+	bli_gemm_var_cntl_set_ukr( ukr, cntl );
+}
+
+BLIS_INLINE void bli_trsm_var_cntl_set_real_gemm_ukr( const func2_t* ukr, cntl_t* cntl )
+{
+	bli_gemm_var_cntl_set_real_ukr( ukr, cntl );
+}
+
+BLIS_INLINE err_t bli_trsm_var_cntl_set_gemm_ukr_simple( const func_t* ukr, cntl_t* cntl )
+{
+	return bli_gemm_var_cntl_set_ukr_simple( ukr, cntl );
+}
+
+BLIS_INLINE err_t bli_trsm_var_cntl_set_real_gemm_ukr_simple( const func_t* ukr, cntl_t* cntl )
+{
+	return bli_gemm_var_cntl_set_real_ukr_simple( ukr, cntl );
+}
+
+BLIS_INLINE void bli_trsm_var_cntl_set_row_pref( const mbool_t* row_pref, cntl_t* cntl )
+{
+	bli_gemm_var_cntl_set_row_pref( row_pref, cntl );
+}
+
+BLIS_INLINE void bli_trsm_var_cntl_set_params( const void* params, cntl_t* cntl )
+{
+	bli_gemm_var_cntl_set_params( params, cntl );
+}
+
+BLIS_INLINE void bli_trsm_var_cntl_set_real_params( const void* params, cntl_t* cntl )
+{
+	bli_gemm_var_cntl_set_real_params( params, cntl );
+}
+
+BLIS_INLINE void bli_trsm_var_cntl_set_mr( dim_t mr, cntl_t* cntl )
+{
+	bli_gemm_var_cntl_set_mr( mr, cntl );
+}
+
+BLIS_INLINE void bli_trsm_var_cntl_set_nr( dim_t nr, cntl_t* cntl )
+{
+	bli_gemm_var_cntl_set_nr( nr, cntl );
+}
+
+BLIS_INLINE void bli_trsm_var_cntl_set_mr_pack( dim_t mr_pack, cntl_t* cntl )
+{
+	( ( trsm_var_cntl_t* ) cntl )->mr_pack = mr_pack;
+}
+
+BLIS_INLINE void bli_trsm_var_cntl_set_nr_pack( dim_t nr_pack, cntl_t* cntl )
+{
+	( ( trsm_var_cntl_t* ) cntl )->nr_pack = nr_pack;
+}
+
+BLIS_INLINE void bli_trsm_var_cntl_set_mr_bcast( dim_t mr_bcast, cntl_t* cntl )
+{
+	( ( trsm_var_cntl_t* ) cntl )->mr_bcast = mr_bcast;
+}
+
+BLIS_INLINE void bli_trsm_var_cntl_set_nr_bcast( dim_t nr_bcast, cntl_t* cntl )
+{
+	( ( trsm_var_cntl_t* ) cntl )->nr_bcast = nr_bcast;
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_trsm_var_cntl_init_node
      (
-       pool_t* pool,
-       side_t  side,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       void_fp ker
+       void_fp          var_func,
+       num_t            dt_comp,
+       num_t            dt_out,
+       gemmtrsm_ukr_ft  gemmtrsm_ukr,
+       gemm_ukr_ft      gemm_ukr,
+       gemm_ukr_ft      real_gemm_ukr,
+       bool             row_pref,
+       dim_t            mr,
+       dim_t            nr,
+       dim_t            mr_pack,
+       dim_t            nr_pack,
+       dim_t            mr_bcast,
+       dim_t            nr_bcast,
+       dim_t            mr_scale,
+       dim_t            nr_scale,
+       trsm_var_cntl_t* cntl
      );
 
-cntl_t* bli_trsm_l_cntl_create
+// -----------------------------------------------------------------------------
+
+struct trsm_cntl_s
+{
+         part_cntl_t part_jc;
+         part_cntl_t part_pc;
+    packm_def_cntl_t pack_b;
+         part_cntl_t part_ic;
+    packm_def_cntl_t pack_a_trsm;
+     trsm_var_cntl_t trsm_ker;
+    packm_def_cntl_t pack_a_gemm;
+     trsm_var_cntl_t gemm_ker;
+              cntl_t ir_loop_gemm;
+              cntl_t ir_loop_trsm;
+};
+typedef struct trsm_cntl_s trsm_cntl_t;
+
+// -----------------------------------------------------------------------------
+
+BLIS_EXPORT_BLIS void bli_trsm_cntl_init
      (
-       pool_t* pool,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       void_fp ker
+             ind_t        im,
+       const obj_t*       alpha,
+             obj_t*       a,
+             obj_t*       b,
+       const obj_t*       beta,
+             obj_t*       c,
+       const cntx_t*      cntx,
+             trsm_cntl_t* cntl
      );
 
-cntl_t* bli_trsm_r_cntl_create
+void bli_trsm_l_cntl_init
      (
-       pool_t* pool,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       void_fp ker
+             ind_t        im,
+       const obj_t*       alpha,
+             obj_t*       a,
+             obj_t*       b,
+       const obj_t*       beta,
+             obj_t*       c,
+       const cntx_t*      cntx,
+             trsm_cntl_t* cntl
      );
 
-void bli_trsm_cntl_free
+void bli_trsm_r_cntl_init
      (
-       pool_t* pool,
-       cntl_t* cntl
+             ind_t        im,
+       const obj_t*       alpha,
+             obj_t*       a,
+             obj_t*       b,
+       const obj_t*       beta,
+             obj_t*       c,
+       const cntx_t*      cntx,
+             trsm_cntl_t* cntl
      );
 
-// -----------------------------------------------------------------------------
-
-cntl_t* bli_trsm_cntl_create_node
+BLIS_EXPORT_BLIS void bli_trsm_cntl_finalize
      (
-       pool_t* pool,
-       opid_t  family,
-       bszid_t bszid,
-       void_fp var_func,
-       cntl_t* sub_node
+       trsm_cntl_t* cntl
      );
 
+// -----------------------------------------------------------------------------
+
+BLIS_INLINE gemmtrsm_ukr_ft bli_trsm_cntl_gemmtrsm_ukr( trsm_cntl_t* cntl )
+{
+	return bli_trsm_var_cntl_gemmtrsm_ukr( ( cntl_t* )&cntl->trsm_ker );
+}
+
+BLIS_INLINE gemm_ukr_ft bli_trsm_cntl_gemm_ukr( trsm_cntl_t* cntl )
+{
+	gemm_ukr_ft real_ukr = bli_trsm_var_cntl_real_gemm_ukr( ( cntl_t* )&cntl->trsm_ker );
+	return real_ukr ? real_ukr : bli_trsm_var_cntl_gemm_ukr( ( cntl_t* )&cntl->trsm_ker );
+}
+
+BLIS_INLINE bool bli_trsm_cntl_row_pref( trsm_cntl_t* cntl )
+{
+	return bli_trsm_var_cntl_row_pref( ( cntl_t* )&cntl->trsm_ker );
+}
+
+BLIS_INLINE const void* bli_trsm_cntl_params( trsm_cntl_t* cntl )
+{
+	gemm_ukr_ft real_ukr = bli_trsm_var_cntl_real_gemm_ukr( ( cntl_t* )&cntl->trsm_ker );
+	return real_ukr ? bli_trsm_var_cntl_real_params( ( cntl_t* )&cntl->trsm_ker )
+	                : bli_trsm_var_cntl_params( ( cntl_t* )&cntl->trsm_ker );
+}
+
+BLIS_INLINE l3_var_oft bli_trsm_cntl_var( trsm_cntl_t* cntl )
+{
+	return ( l3_var_oft )bli_cntl_var_func( ( cntl_t* )&cntl->trsm_ker );
+}
+
+BLIS_INLINE packm_ker_ft bli_trsm_cntl_packa_ukr( trsm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_ukr( ( cntl_t* )&cntl->pack_a_trsm );
+}
+
+BLIS_INLINE pack_t bli_trsm_cntl_packa_schema( trsm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_pack_schema( ( cntl_t* )&cntl->pack_a_trsm );
+}
+
+BLIS_INLINE const void* bli_trsm_cntl_packa_params( trsm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_ukr_params( ( cntl_t* )&cntl->pack_a_trsm );
+}
+
+BLIS_INLINE packm_var_oft bli_trsm_cntl_packa_var( trsm_cntl_t* cntl )
+{
+	return bli_packm_cntl_variant( ( cntl_t* )&cntl->pack_a_trsm );
+}
+
+BLIS_INLINE packm_ker_ft bli_trsm_cntl_packb_ukr( trsm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_ukr( ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE pack_t bli_trsm_cntl_packb_schema( trsm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_pack_schema( ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE const void* bli_trsm_cntl_packb_params( trsm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_ukr_params( ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE packm_var_oft bli_trsm_cntl_packb_var( trsm_cntl_t* cntl )
+{
+	return bli_packm_cntl_variant( ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE dim_t bli_trsm_cntl_mr_def( trsm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_bmult_m_def( ( cntl_t* )&cntl->pack_a_trsm );
+}
+
+BLIS_INLINE dim_t bli_trsm_cntl_mr_pack( trsm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_bmult_m_pack( ( cntl_t* )&cntl->pack_a_trsm );
+}
+
+BLIS_INLINE dim_t bli_trsm_cntl_nr_def( trsm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_bmult_m_def( ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE dim_t bli_trsm_cntl_nr_pack( trsm_cntl_t* cntl )
+{
+	return bli_packm_def_cntl_bmult_m_pack( ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE dim_t bli_trsm_cntl_mc_def( trsm_cntl_t* cntl )
+{
+	return bli_part_cntl_blksz_alg( ( cntl_t* )&cntl->part_ic );
+}
+
+BLIS_INLINE dim_t bli_trsm_cntl_mc_max( trsm_cntl_t* cntl )
+{
+	return bli_part_cntl_blksz_max( ( cntl_t* )&cntl->part_ic );
+}
+
+BLIS_INLINE dim_t bli_trsm_cntl_nc_def( trsm_cntl_t* cntl )
+{
+	return bli_part_cntl_blksz_alg( ( cntl_t* )&cntl->part_jc );
+}
+
+BLIS_INLINE dim_t bli_trsm_cntl_nc_max( trsm_cntl_t* cntl )
+{
+	return bli_part_cntl_blksz_max( ( cntl_t* )&cntl->part_jc );
+}
+
+BLIS_INLINE dim_t bli_trsm_cntl_kc_def( trsm_cntl_t* cntl )
+{
+	return bli_part_cntl_blksz_alg( ( cntl_t* )&cntl->part_pc );
+}
+
+BLIS_INLINE dim_t bli_trsm_cntl_kc_max( trsm_cntl_t* cntl )
+{
+	return bli_part_cntl_blksz_max( ( cntl_t* )&cntl->part_pc );
+}
+
+// -----------------------------------------------------------------------------
+
+BLIS_INLINE void bli_trsm_cntl_set_gemmtrsm_ukr( const func2_t* ukr, trsm_cntl_t* cntl )
+{
+	bli_trsm_var_cntl_set_gemmtrsm_ukr( ukr, ( cntl_t* )&cntl->trsm_ker );
+	bli_trsm_var_cntl_set_gemmtrsm_ukr( ukr, ( cntl_t* )&cntl->gemm_ker );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_gemmtrsm_ukr_simple( const func_t* ukr, trsm_cntl_t* cntl )
+{
+	bli_trsm_var_cntl_set_gemmtrsm_ukr_simple( ukr, ( cntl_t* )&cntl->trsm_ker );
+	bli_trsm_var_cntl_set_gemmtrsm_ukr_simple( ukr, ( cntl_t* )&cntl->gemm_ker );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_gemm_ukr( const func2_t* ukr, trsm_cntl_t* cntl )
+{
+	if ( bli_trsm_var_cntl_real_gemm_ukr( ( cntl_t* )&cntl->trsm_ker ) )
+	{
+		bli_trsm_var_cntl_set_real_gemm_ukr( ukr, ( cntl_t* )&cntl->trsm_ker );
+		bli_trsm_var_cntl_set_real_gemm_ukr( ukr, ( cntl_t* )&cntl->gemm_ker );
+	}
+	else
+	{
+		bli_trsm_var_cntl_set_gemm_ukr( ukr, ( cntl_t* )&cntl->trsm_ker );
+		bli_trsm_var_cntl_set_gemm_ukr( ukr, ( cntl_t* )&cntl->gemm_ker );
+	}
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_gemm_ukr_simple( const func_t* ukr, trsm_cntl_t* cntl )
+{
+	if ( bli_trsm_var_cntl_real_gemm_ukr( ( cntl_t* )&cntl->trsm_ker ) )
+	{
+		bli_trsm_var_cntl_set_real_gemm_ukr_simple( ukr, ( cntl_t* )&cntl->trsm_ker );
+		bli_trsm_var_cntl_set_real_gemm_ukr_simple( ukr, ( cntl_t* )&cntl->gemm_ker );
+	}
+	else
+	{
+		bli_trsm_var_cntl_set_gemm_ukr_simple( ukr, ( cntl_t* )&cntl->trsm_ker );
+		bli_trsm_var_cntl_set_gemm_ukr_simple( ukr, ( cntl_t* )&cntl->gemm_ker );
+	}
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_row_pref( const mbool_t* row_pref, trsm_cntl_t* cntl )
+{
+	bli_trsm_var_cntl_set_row_pref( row_pref, ( cntl_t* )&cntl->trsm_ker );
+	bli_trsm_var_cntl_set_row_pref( row_pref, ( cntl_t* )&cntl->gemm_ker );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_params( const void* params, trsm_cntl_t* cntl )
+{
+	if ( bli_trsm_var_cntl_real_gemm_ukr( ( cntl_t* )&cntl->trsm_ker ) )
+	{
+		bli_trsm_var_cntl_set_real_params( params, ( cntl_t* )&cntl->trsm_ker );
+		bli_trsm_var_cntl_set_real_params( params, ( cntl_t* )&cntl->gemm_ker );
+	}
+	else
+	{
+		bli_trsm_var_cntl_set_params( params, ( cntl_t* )&cntl->trsm_ker );
+		bli_trsm_var_cntl_set_params( params, ( cntl_t* )&cntl->gemm_ker );
+	}
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_var( l3_var_oft var, trsm_cntl_t* cntl )
+{
+	bli_cntl_set_var_func( ( void_fp )var, ( cntl_t* )&cntl->trsm_ker );
+	bli_cntl_set_var_func( ( void_fp )var, ( cntl_t* )&cntl->gemm_ker );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_packa_ukr( const func2_t* ukr, trsm_cntl_t* cntl )
+{
+	bli_packm_def_cntl_set_ukr( ukr, ( cntl_t* )&cntl->pack_a_gemm );
+	bli_packm_def_cntl_set_ukr( ukr, ( cntl_t* )&cntl->pack_a_trsm );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_packa_ukr_simple( const func_t* ukr, trsm_cntl_t* cntl )
+{
+	bli_packm_def_cntl_set_ukr_simple( ukr, ( cntl_t* )&cntl->pack_a_gemm );
+	bli_packm_def_cntl_set_ukr_simple( ukr, ( cntl_t* )&cntl->pack_a_trsm );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_packa_schema( pack_t schema, trsm_cntl_t* cntl )
+{
+	bli_packm_def_cntl_set_pack_schema( schema, ( cntl_t* )&cntl->pack_a_gemm );
+	bli_packm_def_cntl_set_pack_schema( schema, ( cntl_t* )&cntl->pack_a_trsm );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_packa_params( const void* params, trsm_cntl_t* cntl )
+{
+	bli_packm_def_cntl_set_ukr_params( params, ( cntl_t* )&cntl->pack_a_gemm );
+	bli_packm_def_cntl_set_ukr_params( params, ( cntl_t* )&cntl->pack_a_trsm );
+	bli_packm_cntl_set_variant_params( params, ( cntl_t* )&cntl->pack_a_gemm );
+	bli_packm_cntl_set_variant_params( params, ( cntl_t* )&cntl->pack_a_trsm );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_packa_var( packm_var_oft var, trsm_cntl_t* cntl )
+{
+	bli_packm_cntl_set_variant( var, ( cntl_t* )&cntl->pack_a_gemm );
+	bli_packm_cntl_set_variant( var, ( cntl_t* )&cntl->pack_a_trsm );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_packb_ukr( const func2_t* ukr, trsm_cntl_t* cntl )
+{
+	bli_packm_def_cntl_set_ukr( ukr, ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_packb_ukr_simple( const func_t* ukr, trsm_cntl_t* cntl )
+{
+	bli_packm_def_cntl_set_ukr_simple( ukr, ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_packb_schema( pack_t schema, trsm_cntl_t* cntl )
+{
+	bli_packm_def_cntl_set_pack_schema( schema, ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_packb_params( const void* params, trsm_cntl_t* cntl )
+{
+	bli_packm_def_cntl_set_ukr_params( params, ( cntl_t* )&cntl->pack_b );
+	bli_packm_cntl_set_variant_params( params, ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_packb_var( packm_var_oft var, trsm_cntl_t* cntl )
+{
+	bli_packm_cntl_set_variant( var, ( cntl_t* )&cntl->pack_b );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_mr( const blksz_t* mr, trsm_cntl_t* cntl )
+{
+	num_t dt = cntl->gemm_ker.gemm.dt_comp;
+	dim_t mr_dt = bli_blksz_get_def( dt, mr );
+	dim_t mr_pack_dt = bli_blksz_get_max( dt, mr );
+	bli_packm_def_cntl_set_bmult_m( mr, ( cntl_t* )&cntl->pack_a_trsm );
+	bli_packm_def_cntl_set_bmult_m( mr, ( cntl_t* )&cntl->pack_a_gemm );
+	bli_part_cntl_set_blksz_mult( mr, ( cntl_t* )&cntl->part_ic );
+	bli_trsm_var_cntl_set_mr( mr_dt, ( cntl_t* )&cntl->gemm_ker );
+	bli_trsm_var_cntl_set_mr( mr_dt, ( cntl_t* )&cntl->trsm_ker );
+	bli_trsm_var_cntl_set_mr_pack( mr_pack_dt, ( cntl_t* )&cntl->gemm_ker );
+	bli_trsm_var_cntl_set_mr_pack( mr_pack_dt, ( cntl_t* )&cntl->trsm_ker );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_mr_bcast( const blksz_t* mr_bcast, trsm_cntl_t* cntl )
+{
+	num_t dt = cntl->gemm_ker.gemm.dt_comp;
+	dim_t mr_bcast_dt = bli_blksz_get_def( dt, mr_bcast );
+	bli_packm_def_cntl_set_bmult_m_bcast( mr_bcast, ( cntl_t* )&cntl->pack_a_trsm );
+	bli_packm_def_cntl_set_bmult_m_bcast( mr_bcast, ( cntl_t* )&cntl->pack_a_gemm );
+	bli_trsm_var_cntl_set_mr_bcast( mr_bcast_dt, ( cntl_t* )&cntl->gemm_ker );
+	bli_trsm_var_cntl_set_mr_bcast( mr_bcast_dt, ( cntl_t* )&cntl->trsm_ker );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_nr( const blksz_t* nr, trsm_cntl_t* cntl )
+{
+	num_t dt = cntl->gemm_ker.gemm.dt_comp;
+	dim_t nr_dt = bli_blksz_get_def( dt, nr );
+	dim_t nr_pack_dt = bli_blksz_get_max( dt, nr );
+	bli_packm_def_cntl_set_bmult_m( nr, ( cntl_t* )&cntl->pack_b );
+	bli_part_cntl_set_blksz_mult( nr, ( cntl_t* )&cntl->part_jc );
+	bli_trsm_var_cntl_set_nr( nr_dt, ( cntl_t* )&cntl->gemm_ker );
+	bli_trsm_var_cntl_set_nr( nr_dt, ( cntl_t* )&cntl->trsm_ker );
+	bli_trsm_var_cntl_set_nr_pack( nr_pack_dt, ( cntl_t* )&cntl->gemm_ker );
+	bli_trsm_var_cntl_set_nr_pack( nr_pack_dt, ( cntl_t* )&cntl->trsm_ker );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_nr_bcast( const blksz_t* nr_bcast, trsm_cntl_t* cntl )
+{
+	num_t dt = cntl->gemm_ker.gemm.dt_comp;
+	dim_t nr_bcast_dt = bli_blksz_get_def( dt, nr_bcast );
+	bli_packm_def_cntl_set_bmult_m_bcast( nr_bcast, ( cntl_t* )&cntl->pack_b );
+	bli_trsm_var_cntl_set_nr_bcast( nr_bcast_dt, ( cntl_t* )&cntl->gemm_ker );
+	bli_trsm_var_cntl_set_nr_bcast( nr_bcast_dt, ( cntl_t* )&cntl->trsm_ker );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_mc( const blksz_t* mc, trsm_cntl_t* cntl )
+{
+	bli_part_cntl_set_blksz( mc, ( cntl_t* )&cntl->part_ic );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_nc( const blksz_t* nc, trsm_cntl_t* cntl )
+{
+	bli_part_cntl_set_blksz( nc, ( cntl_t* )&cntl->part_jc );
+}
+
+BLIS_INLINE void bli_trsm_cntl_set_kc( const blksz_t* kc, trsm_cntl_t* cntl )
+{
+	bli_part_cntl_set_blksz( kc, ( cntl_t* )&cntl->part_pc );
+}
+
diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c
deleted file mode 100644
index 4672366e5..000000000
--- a/frame/3/trsm/bli_trsm_front.c
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_trsm_front
-     (
-             side_t  side,
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const cntx_t* cntx,
-             rntm_t* rntm
-     )
-{
-	bli_init_once();
-
-	obj_t   a_local;
-	obj_t   b_local;
-	obj_t   c_local;
-
-#if 0
-#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
-	gint_t status = bli_trsm_small( side, alpha, a, b, cntx, cntl );
-	if ( status == BLIS_SUCCESS ) return;
-#endif
-#endif
-
-	// If alpha is zero, scale by beta and return.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
-	{
-		bli_scalm( alpha, b );
-		return;
-	}
-
-	// Alias A and B so we can tweak the objects if necessary.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( b, &b_local );
-	bli_obj_alias_to( b, &c_local );
-
-	// Set the obj_t buffer field to the location currently implied by the row
-	// and column offsets and then zero the offsets. If any of the original
-	// obj_t's were views into larger matrices, this step effectively makes
-	// those obj_t's "forget" their lineage.
-	bli_obj_reset_origin( &a_local );
-	bli_obj_reset_origin( &b_local );
-	bli_obj_reset_origin( &c_local );
-
-	// We do not explicitly implement the cases where A is transposed.
-	// However, we can still handle them. Specifically, if A is marked as
-	// needing a transposition, we simply induce a transposition. This
-	// allows us to only explicitly implement the no-transpose cases. Once
-	// the transposition is induced, the correct algorithm will be called,
-	// since, for example, an algorithm over a transposed lower triangular
-	// matrix A moves in the same direction (forwards) as a non-transposed
-	// upper triangular matrix. And with the transposition induced, the
-	// matrix now appears to be upper triangular, so the upper triangular
-	// algorithm will grab the correct partitions, as if it were upper
-	// triangular (with no transpose) all along.
-	if ( bli_obj_has_trans( &a_local ) )
-	{
-		bli_obj_induce_trans( &a_local );
-		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
-	}
-
-#if 1
-
-	// If A is being solved against from the right, transpose all operands
-	// so that we can perform the computation as if A were being solved
-	// from the left.
-	if ( bli_is_right( side ) )
-	{
-		bli_toggle_side( &side );
-		bli_obj_induce_trans( &a_local );
-		bli_obj_induce_trans( &b_local );
-		bli_obj_induce_trans( &c_local );
-	}
-
-#else
-
-	// NOTE: Enabling this code requires that BLIS NOT be configured with
-	// BLIS_RELAX_MCNR_NCMR_CONSTRAINTS defined.
-#ifdef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
-	#error "BLIS_RELAX_MCNR_NCMR_CONSTRAINTS must not be defined for current trsm_r implementation."
-#endif
-
-	// If A is being solved against from the right, swap A and B so that
-	// the triangular matrix will actually be on the right.
-	if ( bli_is_right( side ) )
-	{
-		bli_obj_swap( &a_local, &b_local );
-	}
-
-#endif
-
-	// Set the pack schemas within the objects.
-	bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
-
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_TRSM,
-	  side,
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
-	// Invoke the internal back-end.
-	bli_l3_thread_decorator
-	(
-	  bli_l3_int,
-	  BLIS_TRSM, // operation family id
-	  alpha,
-	  &a_local,
-	  &b_local,
-	  alpha,
-	  &c_local,
-	  cntx,
-	  rntm
-	);
-}
-
diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c
index 028f02139..0efac1b6b 100644
--- a/frame/3/trsm/bli_trsm_ll_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c
@@ -45,31 +45,37 @@ void bli_trsm_ll_ker_var2
              thrinfo_t* thread_par
      )
 {
-	const num_t     dt        = bli_obj_exec_dt( c );
-	const dim_t     dt_size   = bli_dt_size( dt );
+	const num_t  dt_comp   = bli_gemm_var_cntl_comp_dt( cntl );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
+	const num_t  dt_c      = bli_obj_dt( c );
 
-	      doff_t    diagoffa  = bli_obj_diag_offset( a );
+	const siz_t  dt_a_size = bli_dt_size( dt_a );
+	const siz_t  dt_b_size = bli_dt_size( dt_b );
+	const siz_t  dt_c_size = bli_dt_size( dt_c );
 
-	const pack_t    schema_a  = bli_obj_pack_schema( a );
-	const pack_t    schema_b  = bli_obj_pack_schema( b );
+	      doff_t diagoffa  = bli_obj_diag_offset( a );
 
-	      dim_t     m         = bli_obj_length( c );
-	      dim_t     n         = bli_obj_width( c );
-	      dim_t     k         = bli_obj_width( a );
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	const void*     buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t     cs_a      = bli_obj_col_stride( a );
-	const dim_t     pd_a      = bli_obj_panel_dim( a );
-	const inc_t     ps_a      = bli_obj_panel_stride( a );
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
 
-	const void*     buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t     rs_b      = bli_obj_row_stride( b );
-	const dim_t     pd_b      = bli_obj_panel_dim( b );
-	const inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
-	      void*     buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t     rs_c      = bli_obj_row_stride( c );
-	const inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// Grab the address of the internal scalar buffer for the scalar
 	// attached to B (the non-triangular matrix). This will be the alpha
@@ -94,10 +100,11 @@ void bli_trsm_ll_ker_var2
 	const dim_t PACKNR = rs_b;
 
 	// Cast the micro-kernel address to its function pointer type.
-	gemmtrsm_ukr_ft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx );
-	gemm_ukr_ft     gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemmtrsm_ukr_ft gemmtrsm_ukr = bli_trsm_var_cntl_gemmtrsm_ukr( cntl );
+	gemm_ukr_ft     gemm_ukr     = bli_trsm_var_cntl_gemm_ukr( cntl );
+	const void*     params       = bli_trsm_var_cntl_params( cntl );
 
-	const void* minus_one   = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE );
+	const void* minus_one   = bli_obj_buffer_for_const( dt_comp, &BLIS_MINUS_ONE );
 	const char* a_cast      = buf_a;
 	const char* b_cast      = buf_b;
 	      char* c_cast      = buf_c;
@@ -147,7 +154,7 @@ void bli_trsm_ll_ker_var2
 	if ( diagoffa < 0 )
 	{
 		m        += diagoffa;
-		c_cast   -= diagoffa * rs_c * dt_size;
+		c_cast   -= diagoffa * rs_c * dt_c_size;
 		diagoffa  = 0;
 	}
 
@@ -165,12 +172,12 @@ void bli_trsm_ll_ker_var2
 	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	const inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_a_size;
 
-	const inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_b_size;
 
-	const inc_t rstep_c = rs_c * MR * dt_size;
-	const inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_c_size;
+	const inc_t cstep_c = cs_c * NR * dt_c_size;
 
 	auxinfo_t aux;
 
@@ -178,11 +185,15 @@ void bli_trsm_ll_ker_var2
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
+
 	// We don't bother querying the thrinfo_t node for the 1st loop because
 	// we can't parallelize that loop in trsm due to the inter-iteration
 	// dependencies that exist.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( 0, thread );
 
 	// Query the number of threads and thread ids for each loop.
 	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
@@ -195,7 +206,7 @@ void bli_trsm_ll_ker_var2
 	// slab or round-robin partitioning was requested at configure-time.
 	// NOTE: Parallelism in the 1st loop is unattainable due to the
 	// inter-iteration dependencies present in trsm.
-	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_slrr( jr_tid, jr_nt, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
 
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
@@ -238,18 +249,18 @@ void bli_trsm_ll_ker_var2
 				// intersecting micro-panel.
 				inc_t ps_a_cur  = k_a1011 * PACKMR;
 				      ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
-				      ps_a_cur *= dt_size;
+				      ps_a_cur *= dt_a_size;
 
 				// Compute the addresses of the panel A10 and the triangular
 				// block A11.
 				const char* a10 = a1;
-				const char* a11 = a1 + k_a10 * PACKMR * dt_size;
+				const char* a11 = a1 + k_a10 * PACKMR * dt_a_size;
 				//a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, 1 );
 
 				// Compute the addresses of the panel B01 and the block
 				// B11.
-				const char* b01 = b1 + off_a10 * PACKNR * dt_size;
-				const char* b11 = b1 + off_a11 * PACKNR * dt_size;
+				const char* b01 = b1 + off_a10 * PACKNR * dt_b_size;
+				const char* b11 = b1 + off_a11 * PACKNR * dt_b_size;
 
 				// Compute the addresses of the next panels of A and B.
 				const char* a2 = a1 + ps_a_cur;
diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c
index 72f97b11e..a1593dfd5 100644
--- a/frame/3/trsm/bli_trsm_lu_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c
@@ -45,31 +45,37 @@ void bli_trsm_lu_ker_var2
              thrinfo_t* thread_par
      )
 {
-	const num_t     dt        = bli_obj_exec_dt( c );
-	const dim_t     dt_size   = bli_dt_size( dt );
+	const num_t  dt_comp   = bli_gemm_var_cntl_comp_dt( cntl );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
+	const num_t  dt_c      = bli_obj_dt( c );
 
-	      doff_t    diagoffa  = bli_obj_diag_offset( a );
+	const siz_t  dt_a_size = bli_dt_size( dt_a );
+	const siz_t  dt_b_size = bli_dt_size( dt_b );
+	const siz_t  dt_c_size = bli_dt_size( dt_c );
 
-	const pack_t    schema_a  = bli_obj_pack_schema( a );
-	const pack_t    schema_b  = bli_obj_pack_schema( b );
+	      doff_t diagoffa  = bli_obj_diag_offset( a );
 
-	      dim_t     m         = bli_obj_length( c );
-	      dim_t     n         = bli_obj_width( c );
-	      dim_t     k         = bli_obj_width( a );
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	const void*     buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t     cs_a      = bli_obj_col_stride( a );
-	const dim_t     pd_a      = bli_obj_panel_dim( a );
-	const inc_t     ps_a      = bli_obj_panel_stride( a );
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
 
-	const void*     buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t     rs_b      = bli_obj_row_stride( b );
-	const dim_t     pd_b      = bli_obj_panel_dim( b );
-	const inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
-	      void*     buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t     rs_c      = bli_obj_row_stride( c );
-	const inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// Grab the address of the internal scalar buffer for the scalar
 	// attached to B (the non-triangular matrix). This will be the alpha
@@ -94,10 +100,11 @@ void bli_trsm_lu_ker_var2
 	const dim_t     PACKNR      = rs_b;
 
 	// Cast the micro-kernel address to its function pointer type.
-	gemmtrsm_ukr_ft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx );
-	gemm_ukr_ft     gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemmtrsm_ukr_ft gemmtrsm_ukr = bli_trsm_var_cntl_gemmtrsm_ukr( cntl );
+	gemm_ukr_ft     gemm_ukr     = bli_trsm_var_cntl_gemm_ukr( cntl );
+	const void*     params       = bli_trsm_var_cntl_params( cntl );
 
-	const void* minus_one   = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE );
+	const void* minus_one   = bli_obj_buffer_for_const( dt_comp, &BLIS_MINUS_ONE );
 	const char* a_cast      = buf_a;
 	const char* b_cast      = buf_b;
 	      char* c_cast      = buf_c;
@@ -138,7 +145,7 @@ void bli_trsm_lu_ker_var2
 	if ( diagoffa > 0 )
 	{
 		k        -= diagoffa;
-		b_cast   += diagoffa * PACKNR * dt_size;
+		b_cast   += diagoffa * PACKNR * dt_b_size;
 		diagoffa  = 0;
 	}
 
@@ -176,12 +183,12 @@ void bli_trsm_lu_ker_var2
 	const dim_t m_left = m % MR;
 
 	// Determine some increments used to step through A, B, and C.
-	const inc_t rstep_a = ps_a * dt_size;
+	const inc_t rstep_a = ps_a * dt_a_size;
 
-	const inc_t cstep_b = ps_b * dt_size;
+	const inc_t cstep_b = ps_b * dt_b_size;
 
-	const inc_t rstep_c = rs_c * MR * dt_size;
-	const inc_t cstep_c = cs_c * NR * dt_size;
+	const inc_t rstep_c = rs_c * MR * dt_c_size;
+	const inc_t cstep_c = cs_c * NR * dt_c_size;
 
 	auxinfo_t aux;
 
@@ -189,11 +196,15 @@ void bli_trsm_lu_ker_var2
 	bli_auxinfo_set_schema_a( schema_a, &aux );
 	bli_auxinfo_set_schema_b( schema_b, &aux );
 
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
+
 	// We don't bother querying the thrinfo_t node for the 1st loop because
 	// we can't parallelize that loop in trsm due to the inter-iteration
 	// dependencies that exist.
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
-	//thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
+	//thrinfo_t* caucus = bli_thrinfo_sub_node( 0, thread );
 
 	// Query the number of threads and thread ids for each loop.
 	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
@@ -206,7 +217,7 @@ void bli_trsm_lu_ker_var2
 	// slab or round-robin partitioning was requested at configure-time.
 	// NOTE: Parallelism in the 1st loop is unattainable due to the
 	// inter-iteration dependencies present in trsm.
-	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
+	bli_thread_range_slrr( jr_tid, jr_nt, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
 
 	// Loop over the n dimension (NR columns at a time).
 	for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
@@ -242,7 +253,7 @@ void bli_trsm_lu_ker_var2
 			{
 				// Compute various offsets into and lengths of parts of A.
 				const dim_t off_a11 = diagoffa_i;
-				const dim_t k_a1112 = k - off_a11;;
+				const dim_t k_a1112 = k - off_a11;
 				const dim_t k_a11   = MR;
 				const dim_t k_a12   = k_a1112 - MR;
 				const dim_t off_a12 = off_a11 + k_a11;
@@ -251,18 +262,18 @@ void bli_trsm_lu_ker_var2
 				// intersecting micro-panel.
 				inc_t ps_a_cur  = k_a1112 * PACKMR;
 				      ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
-				      ps_a_cur *= dt_size;
+				      ps_a_cur *= dt_a_size;
 
 				// Compute the addresses of the triangular block A11 and the
 				// panel A12.
 				const char* a11 = a1;
-				const char* a12 = a1 + k_a11 * PACKMR * dt_size;
+				const char* a12 = a1 + k_a11 * PACKMR * dt_a_size;
 				//a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, 1 );
 
 				// Compute the addresses of the panel B01 and the block
 				// B11.
-				const char* b11 = b1 + off_a11 * PACKNR * dt_size;
-				const char* b21 = b1 + off_a12 * PACKNR * dt_size;
+				const char* b11 = b1 + off_a11 * PACKNR * dt_b_size;
+				const char* b21 = b1 + off_a12 * PACKNR * dt_b_size;
 
 				// Compute the addresses of the next panels of A and B.
 				const char* a2 = a1 + ps_a_cur;
diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c
index d4b93c7c4..5d67a20b6 100644
--- a/frame/3/trsm/bli_trsm_rl_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c
@@ -45,31 +45,37 @@ void bli_trsm_rl_ker_var2
              thrinfo_t* thread_par
      )
 {
-	const num_t     dt        = bli_obj_exec_dt( c );
-	const dim_t     dt_size   = bli_dt_size ( dt );
+	const num_t  dt_comp   = bli_gemm_var_cntl_comp_dt( cntl );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
+	const num_t  dt_c      = bli_obj_dt( c );
 
-	      doff_t    diagoffb  = bli_obj_diag_offset( b );
+	const siz_t  dt_a_size = bli_dt_size( dt_a );
+	const siz_t  dt_b_size = bli_dt_size( dt_b );
+	const siz_t  dt_c_size = bli_dt_size( dt_c );
 
-	const pack_t    schema_a  = bli_obj_pack_schema( a );
-	const pack_t    schema_b  = bli_obj_pack_schema( b );
+	      doff_t diagoffb  = bli_obj_diag_offset( b );
 
-	      dim_t     m         = bli_obj_length( c );
-	      dim_t     n         = bli_obj_width( c );
-	      dim_t     k         = bli_obj_width( a );
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	const void*     buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t     cs_a      = bli_obj_col_stride( a );
-	const dim_t     pd_a      = bli_obj_panel_dim( a );
-	const inc_t     ps_a      = bli_obj_panel_stride( a );
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
 
-	const void*     buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t     rs_b      = bli_obj_row_stride( b );
-	const dim_t     pd_b      = bli_obj_panel_dim( b );
-	const inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
-	      void*     buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t     rs_c      = bli_obj_row_stride( c );
-	const inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// Grab the address of the internal scalar buffer for the scalar
 	// attached to A (the non-triangular matrix). This will be the alpha
@@ -99,10 +105,11 @@ void bli_trsm_rl_ker_var2
 	// triangular), it becomes upper-triangular after the kernel operation
 	// is transposed so that all kernel instances are of the "left"
 	// variety (since those are the only trsm ukernels that exist).
-	gemmtrsm_ukr_ft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx );
-	gemm_ukr_ft     gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemmtrsm_ukr_ft gemmtrsm_ukr = bli_trsm_var_cntl_gemmtrsm_ukr( cntl );
+	gemm_ukr_ft     gemm_ukr     = bli_trsm_var_cntl_gemm_ukr( cntl );
+	const void*     params       = bli_trsm_var_cntl_params( cntl );
 
-	const void* minus_one   = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE );
+	const void* minus_one   = bli_obj_buffer_for_const( dt_comp, &BLIS_MINUS_ONE );
 	const char* a_cast      = buf_a;
 	const char* b_cast      = buf_b;
 	      char* c_cast      = buf_c;
@@ -151,7 +158,7 @@ void bli_trsm_rl_ker_var2
 	if ( diagoffb < 0 )
 	{
 		k        += diagoffb;
-		a_cast   -= diagoffb * PACKMR * dt_size;
+		a_cast   -= diagoffb * PACKMR * dt_a_size;
 		diagoffb  = 0;
 	}
 
@@ -185,7 +192,7 @@ void bli_trsm_rl_ker_var2
 	// dimension that is a multiple of PACKNR, with the region between the
 	// last column and the next multiple of NR zero-padded accordingly.
 
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
 
 	// Compute number of primary and leftover components of the m and n
 	// dimensions.
@@ -199,12 +206,12 @@ void bli_trsm_rl_ker_var2
 	if ( m_left ) ++m_iter;
 
 	// Determine some increments used to step through A, B, and C.
-	inc_t rstep_a = ps_a * dt_size;
+	inc_t rstep_a = ps_a * dt_a_size;
 
-	inc_t cstep_b = ps_b * dt_size;
+	inc_t cstep_b = ps_b * dt_b_size;
 
-	inc_t rstep_c = rs_c * MR * dt_size;
-	inc_t cstep_c = cs_c * NR * dt_size;
+	inc_t rstep_c = rs_c * MR * dt_c_size;
+	inc_t cstep_c = cs_c * NR * dt_c_size;
 
 	auxinfo_t aux;
 
@@ -214,6 +221,10 @@ void bli_trsm_rl_ker_var2
 	bli_auxinfo_set_schema_a( schema_b, &aux );
 	bli_auxinfo_set_schema_b( schema_a, &aux );
 
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
+
 	const char* b1 = b_cast;
 	      char* c1 = c_cast;
 
@@ -250,13 +261,13 @@ void bli_trsm_rl_ker_var2
 			// Compute the addresses of the triangular block B11 and the
 			// panel B21.
 			const char* b11 = b1;
-			const char* b21 = b1 + k_b11 * PACKNR * dt_size;
+			const char* b21 = b1 + k_b11 * PACKNR * dt_b_size;
 			//b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, 1 );
 
 			// Compute the panel stride for the current micro-panel.
 			inc_t ps_b_cur  = k_b1121 * PACKNR;
 				  ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
-				  ps_b_cur *= dt_size;
+				  ps_b_cur *= dt_b_size;
 
 			// Loop over the m dimension (MR rows at a time).
 			for ( dim_t i = 0; i < m_iter; ++i )
@@ -266,8 +277,8 @@ void bli_trsm_rl_ker_var2
 				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
 
 				// Compute the addresses of the A11 block and A12 panel.
-				const char* a11  = a1 + off_b11 * PACKMR * dt_size;
-				const char* a12  = a1 + off_b21 * PACKMR * dt_size;
+				const char* a11  = a1 + off_b11 * PACKMR * dt_a_size;
+				const char* a12  = a1 + off_b21 * PACKMR * dt_a_size;
 
 				// Compute the addresses of the next panels of A and B.
 				const char* a2 = a1;
diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c
index ae82b1ee0..442df5798 100644
--- a/frame/3/trsm/bli_trsm_ru_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c
@@ -45,31 +45,37 @@ void bli_trsm_ru_ker_var2
              thrinfo_t* thread_par
      )
 {
-	const num_t     dt        = bli_obj_exec_dt( c );
-	const dim_t     dt_size   = bli_dt_size( dt );
+	const num_t  dt_comp   = bli_gemm_var_cntl_comp_dt( cntl );
+	const num_t  dt_a      = bli_obj_dt( a );
+	const num_t  dt_b      = bli_obj_dt( b );
+	const num_t  dt_c      = bli_obj_dt( c );
 
-	      doff_t    diagoffb  = bli_obj_diag_offset( b );
+	const siz_t  dt_a_size = bli_dt_size( dt_a );
+	const siz_t  dt_b_size = bli_dt_size( dt_b );
+	const siz_t  dt_c_size = bli_dt_size( dt_c );
 
-	const pack_t    schema_a  = bli_obj_pack_schema( a );
-	const pack_t    schema_b  = bli_obj_pack_schema( b );
+	      doff_t diagoffb  = bli_obj_diag_offset( b );
 
-	      dim_t     m         = bli_obj_length( c );
-	      dim_t     n         = bli_obj_width( c );
-	      dim_t     k         = bli_obj_width( a );
+	const pack_t schema_a  = bli_obj_pack_schema( a );
+	const pack_t schema_b  = bli_obj_pack_schema( b );
 
-	const void*     buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t     cs_a      = bli_obj_col_stride( a );
-	const dim_t     pd_a      = bli_obj_panel_dim( a );
-	const inc_t     ps_a      = bli_obj_panel_stride( a );
+	      dim_t  m         = bli_obj_length( c );
+	      dim_t  n         = bli_obj_width( c );
+	      dim_t  k         = bli_obj_width( a );
 
-	const void*     buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t     rs_b      = bli_obj_row_stride( b );
-	const dim_t     pd_b      = bli_obj_panel_dim( b );
-	const inc_t     ps_b      = bli_obj_panel_stride( b );
+	const void*  buf_a     = bli_obj_buffer_at_off( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+	const dim_t  pd_a      = bli_obj_panel_dim( a );
+	const inc_t  ps_a      = bli_obj_panel_stride( a );
 
-	      void*     buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t     rs_c      = bli_obj_row_stride( c );
-	const inc_t     cs_c      = bli_obj_col_stride( c );
+	const void*  buf_b     = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const dim_t  pd_b      = bli_obj_panel_dim( b );
+	const inc_t  ps_b      = bli_obj_panel_stride( b );
+
+	      void*  buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
 
 	// Grab the address of the internal scalar buffer for the scalar
 	// attached to A (the non-triangular matrix). This will be the alpha
@@ -99,10 +105,11 @@ void bli_trsm_ru_ker_var2
 	// triangular), it becomes lower-triangular after the kernel operation
 	// is transposed so that all kernel instances are of the "left"
 	// variety (since those are the only trsm ukernels that exist).
-	gemmtrsm_ukr_ft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx );
-	gemm_ukr_ft     gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+	gemmtrsm_ukr_ft gemmtrsm_ukr = bli_trsm_var_cntl_gemmtrsm_ukr( cntl );
+	gemm_ukr_ft     gemm_ukr     = bli_trsm_var_cntl_gemm_ukr( cntl );
+	const void*     params       = bli_trsm_var_cntl_params( cntl );
 
-	const void* minus_one   = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE );
+	const void* minus_one   = bli_obj_buffer_for_const( dt_comp, &BLIS_MINUS_ONE );
 	const char* a_cast      = buf_a;
 	const char* b_cast      = buf_b;
 	      char* c_cast      = buf_c;
@@ -152,7 +159,7 @@ void bli_trsm_ru_ker_var2
 	if ( diagoffb > 0 )
 	{
 		n        -= diagoffb;
-		c_cast   += diagoffb * cs_c * dt_size;
+		c_cast   += diagoffb * cs_c * dt_c_size;
 		diagoffb  = 0;
 	}
 
@@ -181,7 +188,7 @@ void bli_trsm_ru_ker_var2
 	// dimension that is a multiple of PACKNR, with the region between the
 	// last column and the next multiple of NR zero-padded accordingly.
 
-	thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+	thrinfo_t* thread = bli_thrinfo_sub_node( 0, thread_par );
 
 	// Compute number of primary and leftover components of the m and n
 	// dimensions.
@@ -195,12 +202,12 @@ void bli_trsm_ru_ker_var2
 	if ( m_left ) ++m_iter;
 
 	// Determine some increments used to step through A, B, and C.
-	inc_t rstep_a = ps_a * dt_size;
+	inc_t rstep_a = ps_a * dt_a_size;
 
-	inc_t cstep_b = ps_b * dt_size;
+	inc_t cstep_b = ps_b * dt_b_size;
 
-	inc_t rstep_c = rs_c * MR * dt_size;
-	inc_t cstep_c = cs_c * NR * dt_size;
+	inc_t rstep_c = rs_c * MR * dt_c_size;
+	inc_t cstep_c = cs_c * NR * dt_c_size;
 
 	// Save the pack schemas of A and B to the auxinfo_t object.
 	// NOTE: We swap the values for A and B since the triangular
@@ -209,6 +216,10 @@ void bli_trsm_ru_ker_var2
 	bli_auxinfo_set_schema_a( schema_b, &aux );
 	bli_auxinfo_set_schema_b( schema_a, &aux );
 
+	// Save the virtual microkernel address and the params.
+	bli_auxinfo_set_ukr( gemm_ukr, &aux );
+	bli_auxinfo_set_params( params, &aux );
+
 	const char* b1 = b_cast;
 	      char* c1 = c_cast;
 
@@ -243,13 +254,13 @@ void bli_trsm_ru_ker_var2
 			// Compute the addresses of the panel B10 and the triangular
 			// block B11.
 			const char* b01 = b1;
-			const char* b11 = b1 + k_b01 * PACKNR * dt_size;
+			const char* b11 = b1 + k_b01 * PACKNR * dt_b_size;
 			//b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, 1 );
 
 			// Compute the panel stride for the current micro-panel.
 			inc_t ps_b_cur  = k_b0111 * PACKNR;
 				  ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 );
-				  ps_b_cur *= dt_size;
+				  ps_b_cur *= dt_b_size;
 
 			// Loop over the m dimension (MR rows at a time).
 			for ( dim_t i = 0; i < m_iter; ++i )
@@ -259,8 +270,8 @@ void bli_trsm_ru_ker_var2
 				dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
 
 				// Compute the addresses of the A10 panel and A11 block.
-				const char* a10  = a1 + off_b01 * PACKMR * dt_size;
-				const char* a11  = a1 + off_b11 * PACKMR * dt_size;
+				const char* a10  = a1 + off_b01 * PACKMR * dt_a_size;
+				const char* a11  = a1 + off_b11 * PACKMR * dt_a_size;
 
 				// Compute the addresses of the next panels of A and B.
 				const char* a2 = a1;
diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h
index 4d7e72b43..e5759bc9f 100644
--- a/frame/3/trsm/bli_trsm_var.h
+++ b/frame/3/trsm/bli_trsm_var.h
@@ -41,7 +41,7 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t*     a, \
        const obj_t*     b, \
@@ -55,8 +55,6 @@ GENPROT( trsm_blk_var1 )
 GENPROT( trsm_blk_var2 )
 GENPROT( trsm_blk_var3 )
 
-GENPROT( trsm_xx_ker_var2 )
-
 GENPROT( trsm_ll_ker_var2 )
 GENPROT( trsm_lu_ker_var2 )
 GENPROT( trsm_rl_ker_var2 )
diff --git a/frame/base/bli_apool.c b/frame/base/bli_apool.c
index 693e91bf9..cd7589888 100644
--- a/frame/base/bli_apool.c
+++ b/frame/base/bli_apool.c
@@ -408,12 +408,11 @@ pool_t* bli_apool_array_elem
 		// Each small block pool should contain blocks large enough to
 		// accommodate any of the data structures for which they will be
 		// used.
-		const siz_t n_sizes        = 4;
-		siz_t       sizes[4]       = { sizeof( cntl_t ),
-		                               sizeof( packm_params_t ),
-		                               sizeof( thrcomm_t ),
-		                               sizeof( thrinfo_t ) };
-		siz_t       block_size     = 0;
+		const siz_t sizes[]    = { sizeof( cntl_t ),
+		                           sizeof( thrcomm_t ),
+		                           sizeof( thrinfo_t ) };
+		const siz_t n_sizes    = sizeof( sizes ) / sizeof( sizes[0] );
+		      siz_t block_size = 0;
 
 		// Find the largest of the sizes above and use that as the block_size
 		// for the pool.
diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c
index a8061f933..135d41063 100644
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -150,7 +150,7 @@ arch_t bli_arch_query_id_impl( void )
 		// initialized. Query the address of an internal context data structure
 		// corresponding to req_id. This pointer will be NULL if the associated
 		// subconfig is not available.
-		const cntx_t* const * req_cntx = bli_gks_lookup_id( req_id );
+		const cntx_t* req_cntx = bli_gks_lookup_id( req_id );
 
 		// This function checks the context pointer and aborts with a useful
 		// error message if the pointer is found to be NULL.
diff --git a/frame/base/bli_auxinfo.h b/frame/base/bli_auxinfo.h
index 166480b30..dec29d875 100644
--- a/frame/base/bli_auxinfo.h
+++ b/frame/base/bli_auxinfo.h
@@ -74,6 +74,15 @@ BLIS_INLINE inc_t bli_auxinfo_ps_b( const auxinfo_t* ai )
 	return ai->ps_b;
 }
 
+BLIS_INLINE inc_t bli_auxinfo_off_m( const auxinfo_t* ai )
+{
+	return ai->off_m;
+}
+BLIS_INLINE inc_t bli_auxinfo_off_n( const auxinfo_t* ai )
+{
+	return ai->off_n;
+}
+
 BLIS_INLINE void_fp bli_auxinfo_ukr( const auxinfo_t* ai )
 {
 	return ai->ukr;
@@ -127,6 +136,15 @@ BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai )
 	ai->ps_b = ps;
 }
 
+BLIS_INLINE void bli_auxinfo_set_off_m( dim_t off, auxinfo_t* ai )
+{
+	ai->off_m = off;
+}
+BLIS_INLINE void bli_auxinfo_set_off_n( dim_t off, auxinfo_t* ai )
+{
+	ai->off_n = off;
+}
+
 BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai )
 {
 	ai->ukr = ukr;
diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c
index 38b4b7956..dea7a8bb1 100644
--- a/frame/base/bli_blksz.c
+++ b/frame/base/bli_blksz.c
@@ -235,161 +235,49 @@ void bli_blksz_reduce_max_to
 
 dim_t bli_determine_blocksize
      (
-             dir_t   direct,
-             dim_t   i,
-             dim_t   dim,
-       const obj_t*  obj,
-             bszid_t bszid,
-       const cntx_t* cntx
+       dir_t direct,
+       dim_t i,
+       dim_t dim,
+       dim_t b_alg,
+       dim_t b_max
      )
 {
-	if ( direct == BLIS_FWD )
-		return bli_determine_blocksize_f( i, dim, obj, bszid, cntx );
-	else
-		return bli_determine_blocksize_b( i, dim, obj, bszid, cntx );
-}
-
-dim_t bli_determine_blocksize_f
-     (
-             dim_t   i,
-             dim_t   dim,
-       const obj_t*  obj,
-             bszid_t bszid,
-       const cntx_t* cntx
-     )
-{
-	num_t          dt;
-	const blksz_t* bsize;
-	dim_t          b_alg, b_max;
-	dim_t          b_use;
-
-	// Extract the execution datatype and use it to query the corresponding
-	// blocksize and blocksize maximum values from the blksz_t object.
-	dt    = bli_obj_exec_dt( obj );
-	bsize = bli_cntx_get_blksz( bszid, cntx );
-	b_alg = bli_blksz_get_def( dt, bsize );
-	b_max = bli_blksz_get_max( dt, bsize );
-
-	b_use = bli_determine_blocksize_f_sub( i, dim, b_alg, b_max );
-
-	return b_use;
-}
-
-dim_t bli_determine_blocksize_b
-     (
-             dim_t   i,
-             dim_t   dim,
-       const obj_t*  obj,
-             bszid_t bszid,
-       const cntx_t* cntx
-     )
-{
-	num_t          dt;
-	const blksz_t* bsize;
-	dim_t          b_alg, b_max;
-	dim_t          b_use;
-
-	// Extract the execution datatype and use it to query the corresponding
-	// blocksize and blocksize maximum values from the blksz_t object.
-	dt    = bli_obj_exec_dt( obj );
-	bsize = bli_cntx_get_blksz( bszid, cntx );
-	b_alg = bli_blksz_get_def( dt, bsize );
-	b_max = bli_blksz_get_max( dt, bsize );
-
-	b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max );
-
-	return b_use;
-}
-
-dim_t bli_determine_blocksize_f_sub
-     (
-       dim_t  i,
-       dim_t  dim,
-       dim_t  b_alg,
-       dim_t  b_max
-     )
-{
-	dim_t b_now;
-	dim_t dim_left_now;
-
-	// We assume that this function is being called from an algorithm that
-	// is moving "forward" (ie: top to bottom, left to right, top-left
-	// to bottom-right).
+    const bool handle_edge_low = ( direct == BLIS_BWD );
 
 	// Compute how much of the matrix dimension is left, including the
 	// chunk that will correspond to the blocksize we are computing now.
-	dim_left_now = dim - i;
+	dim_t dim_left_now = dim - i;
 
-	// If the dimension currently remaining is less than the maximum
-	// blocksize, use it instead of the default blocksize b_alg.
-	// Otherwise, use b_alg.
-	if ( dim_left_now <= b_max )
-	{
-		b_now = dim_left_now;
-	}
-	else
+	if ( handle_edge_low )
 	{
-		b_now = b_alg;
-	}
-
-	return b_now;
-}
-
-dim_t bli_determine_blocksize_b_sub
-     (
-       dim_t  i,
-       dim_t  dim,
-       dim_t  b_alg,
-       dim_t  b_max
-     )
-{
-	dim_t b_now;
-	dim_t dim_left_now;
-	dim_t dim_at_edge;
+		dim_t dim_at_edge = dim_left_now % b_alg;
 
-	// We assume that this function is being called from an algorithm that
-	// is moving "backward" (ie: bottom to top, right to left, bottom-right
-	// to top-left).
-
-	// Compute how much of the matrix dimension is left, including the
-	// chunk that will correspond to the blocksize we are computing now.
-	dim_left_now = dim - i;
-
-	// Sanity check: if dim_left_now is zero, then we can return zero
-	// without going any further.
-	if ( dim_left_now == 0 )
-		return 0;
-
-	dim_at_edge = dim_left_now % b_alg;
-
-	// If dim_left_now is a multiple of b_alg, we can safely return b_alg
-	// without going any further.
-	if ( dim_at_edge == 0 )
-		return b_alg;
-
-	// If the dimension currently remaining is less than the maximum
-	// blocksize, use it as the chosen blocksize. If this is not the case,
-	// then we know dim_left_now is greater than the maximum blocksize.
-	// To determine how much of it we should use for the current blocksize,
-	// we inspect dim_at_edge; if it is smaller than (or equal to) b_max -
-	// b_alg, then we use b_alg + dim_at_edge. Otherwise, dim_at_edge is
-	// greater than b_max - b_alg, in which case we use dim_at_edge.
-	if ( dim_left_now <= b_max )
-	{
-		b_now = dim_left_now;
+		// With less than one full b_alg left (possibly nothing), use exactly
+		// what remains. Otherwise fold the low edge into this block when
+		// b_alg + dim_at_edge fits within b_max, else peel the edge off alone.
+		if      ( dim_left_now < b_alg ) return dim_left_now;
+		else if ( b_alg + dim_at_edge <= b_max )
+		{
+			return b_alg + dim_at_edge;
+		}
+		else
+		{
+			return dim_at_edge;
+		}
 	}
-	else // if ( dim_left_now > b_max )
+	else
 	{
-		if ( dim_at_edge <= b_max - b_alg )
+		// If the dimension currently remaining is less than the maximum
+		// blocksize, use it instead of the default blocksize b_alg.
+		// Otherwise, use b_alg.
+		if ( dim_left_now <= b_max )
 		{
-			b_now = b_alg + dim_at_edge;
+			return dim_left_now;
 		}
-		else // if ( dim_at_edge > b_max - b_alg )
+		else
 		{
-			b_now = dim_at_edge;
+			return b_alg;
 		}
 	}
-
-	return b_now;
 }
 
diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h
index 7f1db2706..150cd43e8 100644
--- a/frame/base/bli_blksz.h
+++ b/frame/base/bli_blksz.h
@@ -253,45 +253,10 @@ void bli_blksz_reduce_max_to
 
 dim_t bli_determine_blocksize
      (
-             dir_t   direct,
-             dim_t   i,
-             dim_t   dim,
-       const obj_t*  obj,
-             bszid_t bszid,
-       const cntx_t* cntx
-     );
-
-dim_t bli_determine_blocksize_f
-     (
-             dim_t   i,
-             dim_t   dim,
-       const obj_t*  obj,
-             bszid_t bszid,
-       const cntx_t* cntx
-     );
-
-dim_t bli_determine_blocksize_b
-     (
-             dim_t   i,
-             dim_t   dim,
-       const obj_t*  obj,
-             bszid_t bszid,
-       const cntx_t* cntx
-     );
-
-dim_t bli_determine_blocksize_f_sub
-     (
-       dim_t  i,
-       dim_t  dim,
-       dim_t  b_alg,
-       dim_t  b_max
-     );
-
-dim_t bli_determine_blocksize_b_sub
-     (
-       dim_t  i,
-       dim_t  dim,
-       dim_t  b_alg,
-       dim_t  b_max
+       dir_t direct,
+       dim_t i,
+       dim_t dim,
+       dim_t b_alg,
+       dim_t b_max
      );
 
diff --git a/frame/base/bli_check.c b/frame/base/bli_check.c
index 16c418b49..e949b6361 100644
--- a/frame/base/bli_check.c
+++ b/frame/base/bli_check.c
@@ -747,10 +747,8 @@ err_t bli_check_packm_schema_on_unpack( const obj_t* a )
 {
 	err_t e_val = BLIS_SUCCESS;
 
-	if ( bli_obj_pack_schema( a ) != BLIS_PACKED_ROWS &&
-	     bli_obj_pack_schema( a ) != BLIS_PACKED_COLUMNS &&
-	     bli_obj_pack_schema( a ) != BLIS_PACKED_ROW_PANELS &&
-	     bli_obj_pack_schema( a ) != BLIS_PACKED_COL_PANELS )
+	if ( bli_obj_pack_schema( a ) != BLIS_PACKED_MATRIX &&
+	     bli_obj_pack_schema( a ) != BLIS_PACKED_PANELS )
 		e_val = BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK;
 
 	return e_val;
@@ -895,7 +893,7 @@ err_t bli_check_valid_arch_id( arch_t id )
 	return e_val;
 }
 
-err_t bli_check_initialized_gks_cntx( const cntx_t* const * cntx )
+err_t bli_check_initialized_gks_cntx( const cntx_t* cntx )
 {
 	err_t e_val = BLIS_SUCCESS;
 
@@ -958,3 +956,49 @@ err_t bli_check_valid_kc_mod_mult( const blksz_t* kc, const blksz_t* kr )
 	return BLIS_SUCCESS;
 }
 
+err_t bli_check_valid_mr_even( const blksz_t* mr, const mbool_t* row_pref )
+{
+	num_t dt;
+
+	for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
+	{
+		if ( bli_is_complex( dt ) )
+			continue;
+
+		dim_t mr_dt        = bli_blksz_get_def( dt, mr );
+		dim_t packmr_dt    = bli_blksz_get_max( dt, mr );
+		bool  prefers_rows = bli_mbool_get_dt( dt, row_pref );
+
+		if ( prefers_rows )
+			continue;
+
+		if      ( mr_dt % 2 != 0 ) return BLIS_MR_NOT_EVEN_FOR_REAL_TYPE;
+		else if ( packmr_dt % 2 != 0 ) return BLIS_PACKMR_NOT_EVEN_FOR_REAL_TYPE;
+	}
+
+	return BLIS_SUCCESS;
+}
+
+err_t bli_check_valid_nr_even( const blksz_t* nr, const mbool_t* row_pref )
+{
+	num_t dt;
+
+	for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
+	{
+		if ( bli_is_complex( dt ) )
+			continue;
+
+		dim_t nr_dt = bli_blksz_get_def( dt, nr );
+		dim_t packnr_dt = bli_blksz_get_max( dt, nr );
+		bool  prefers_rows = bli_mbool_get_dt( dt, row_pref );
+
+		if ( !prefers_rows )
+			continue;
+
+		if      ( nr_dt % 2 != 0 ) return BLIS_NR_NOT_EVEN_FOR_REAL_TYPE;
+		else if ( packnr_dt % 2 != 0 ) return BLIS_PACKNR_NOT_EVEN_FOR_REAL_TYPE;
+	}
+
+	return BLIS_SUCCESS;
+}
+
diff --git a/frame/base/bli_check.h b/frame/base/bli_check.h
index f1e2201a7..d4690b0ad 100644
--- a/frame/base/bli_check.h
+++ b/frame/base/bli_check.h
@@ -110,9 +110,12 @@ err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size );
 err_t bli_check_object_alias_of( const obj_t* a, const obj_t* b );
 
 err_t bli_check_valid_arch_id( arch_t id );
-err_t bli_check_initialized_gks_cntx( const cntx_t* const * cntx );
+err_t bli_check_initialized_gks_cntx( const cntx_t* cntx );
 
 err_t bli_check_valid_mc_mod_mult( const blksz_t* mc, const blksz_t* mr );
 err_t bli_check_valid_nc_mod_mult( const blksz_t* nc, const blksz_t* nr );
 err_t bli_check_valid_kc_mod_mult( const blksz_t* kc, const blksz_t* kr );
 
+err_t bli_check_valid_mr_even( const blksz_t* mr, const mbool_t* row_pref );
+err_t bli_check_valid_nr_even( const blksz_t* nr, const mbool_t* row_pref );
+
diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c
index bd688f85a..4ec38a18f 100644
--- a/frame/base/bli_cntl.c
+++ b/frame/base/bli_cntl.c
@@ -35,233 +35,46 @@
 
 #include "blis.h"
 
-cntl_t* bli_cntl_create_node
+void bli_cntl_init_node
      (
-       pool_t* sba_pool,
-       opid_t  family,
-       bszid_t bszid,
        void_fp var_func,
-       void*   params,
-       cntl_t* sub_node
-     )
-{
-	cntl_t* cntl;
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntl_create_node(): " );
-	#endif
-
-	// Allocate the cntl_t struct.
-	cntl = bli_sba_acquire( sba_pool, sizeof( cntl_t ) );
-
-	bli_cntl_set_family( family, cntl );
-	bli_cntl_set_bszid( bszid, cntl );
-	bli_cntl_set_var_func( var_func, cntl );
-	bli_cntl_set_params( params, cntl );
-	bli_cntl_set_sub_prenode( NULL, cntl );
-	bli_cntl_set_sub_node( sub_node, cntl );
-
-	return cntl;
-}
-
-void bli_cntl_free_node
-     (
-       pool_t* sba_pool,
-       cntl_t* cntl
-     )
-{
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_cntl_free_node(): " );
-	#endif
-
-	bli_sba_release( sba_pool, cntl );
-}
-
-void bli_cntl_clear_node
-     (
        cntl_t* cntl
      )
 {
-	// Clear various fields in the control tree. Clearing these fields
-	// actually is not needed, but we do it for debugging/completeness.
-	bli_cntl_set_var_func( NULL, cntl );
-	bli_cntl_set_params( NULL, cntl );
-	bli_cntl_set_sub_prenode( NULL, cntl );
-	bli_cntl_set_sub_node( NULL, cntl );
-}
-
-// -----------------------------------------------------------------------------
-
-void bli_cntl_free
-     (
-       pool_t* sba_pool,
-       cntl_t* cntl
-     )
-{
-	// Base case: simply return when asked to free NULL nodes.
-	if ( cntl == NULL ) return;
-
-	cntl_t* cntl_sub_prenode = bli_cntl_sub_prenode( cntl );
-	cntl_t* cntl_sub_node    = bli_cntl_sub_node( cntl );
-	void*   cntl_params      = bli_cntl_params( cntl );
-
-	// Only recurse into prenode branch if it exists.
-	if ( cntl_sub_prenode != NULL )
-	{
-		// Recursively free all memory associated with the sub-prenode and its
-		// children.
-		bli_cntl_free( sba_pool, cntl_sub_prenode );
-	}
-
-	// Only recurse into the child node if it exists.
-	if ( cntl_sub_node != NULL )
-	{
-		// Recursively free all memory associated with the sub-node and its
-		// children.
-		bli_cntl_free( sba_pool, cntl_sub_node );
-	}
-
-	// Free the current node's params field, if it is non-NULL.
-	if ( cntl_params != NULL )
+	bli_cntl_set_var_func( var_func, cntl );
+	for ( dim_t i = 0; i < BLIS_MAX_SUB_NODES; i++ )
 	{
-		#ifdef BLIS_ENABLE_MEM_TRACING
-		printf( "bli_cntl_free_w_thrinfo(): " );
-		#endif
-
-		bli_sba_release( sba_pool, cntl_params );
+		bli_cntl_set_ways( i, 0, cntl );
+		bli_cntl_set_sub_node( i, NULL, cntl );
 	}
-
-	// Free the current node.
-	bli_cntl_free_node( sba_pool, cntl );
 }
 
-// -----------------------------------------------------------------------------
-
-cntl_t* bli_cntl_copy
+void bli_cntl_attach_sub_node
      (
-             pool_t* sba_pool,
-       const cntl_t* cntl
+       dim_t   ways,
+       cntl_t* sub_node,
+       cntl_t* cntl
      )
 {
-	// Make a copy of the current node. Notice that the source node
-	// should NOT have any allocated/cached mem_t entries, and that
-	// bli_cntl_create_node() creates a node with a cleared mem_t
-	// field.
-	cntl_t* cntl_copy = bli_cntl_create_node
-	(
-	  sba_pool,
-	  bli_cntl_family( cntl ),
-	  bli_cntl_bszid( cntl ),
-	  bli_cntl_var_func( cntl ),
-	  NULL, NULL
-	);
-
-	// Check the params field of the existing control tree; if it's non-NULL,
-	// copy it.
-	if ( bli_cntl_params( cntl ) != NULL )
+	dim_t next = 0;
+	for ( ; next < BLIS_MAX_SUB_NODES; next++ )
 	{
-		// Detect the size of the params struct by reading the first field
-		// as a uint64_t, and then allocate this many bytes for a new params
-		// struct.
-		uint64_t params_size = bli_cntl_params_size( cntl );
-		void*    params_orig = bli_cntl_params( cntl );
-		void*    params_copy = bli_sba_acquire( sba_pool, ( size_t )params_size );
-
-		// Copy the original params struct to the new memory region.
-		memcpy( params_copy, params_orig, params_size );
-
-		// Save the address of the new params struct into the new control
-		// tree node.
-		bli_cntl_set_params( params_copy, cntl_copy );
+		if ( bli_cntl_sub_node( next, cntl ) == NULL )
+			break;
 	}
 
-	// If the sub-prenode exists, copy it recursively.
-	if ( bli_cntl_sub_prenode( cntl ) != NULL )
-	{
-		cntl_t* sub_prenode_copy = bli_cntl_copy
-		(
-		  sba_pool,
-		  bli_cntl_sub_prenode( cntl )
-		);
+	if ( next == BLIS_MAX_SUB_NODES )
+		bli_abort();
 
-		// Save the address of the new sub-node (sub-tree) to the existing
-		// node.
-		bli_cntl_set_sub_prenode( sub_prenode_copy, cntl_copy );
-	}
-
-	// If the sub-node exists, copy it recursively.
-	if ( bli_cntl_sub_node( cntl ) != NULL )
-	{
-		cntl_t* sub_node_copy = bli_cntl_copy
-		(
-		  sba_pool,
-		  bli_cntl_sub_node( cntl )
-		);
-
-		// Save the address of the new sub-node (sub-tree) to the existing
-		// node.
-		bli_cntl_set_sub_node( sub_node_copy, cntl_copy );
-	}
-
-	// Return the address of the newly created node.
-	return cntl_copy;
+	bli_cntl_set_ways( next, ways, cntl );
+	bli_cntl_set_sub_node( next, sub_node, cntl );
 }
 
-void bli_cntl_mark_family
+void bli_cntl_clear_node
      (
-       opid_t  family,
        cntl_t* cntl
      )
 {
-	// This function sets the family field of all cntl tree nodes that are
-	// children of cntl. It's used by bli_l3_cntl_create_if() after making
-	// a copy of a user-given cntl tree, if the user provided one, to mark
-	// the operation family, which is used to determine appropriate behavior
-	// by various functions when executing the blocked variants.
-
-	// Set the family of the root node.
-	bli_cntl_set_family( family, cntl );
-
-	// Recursively set the family field of the sub-tree rooted at the sub-node,
-	// if it exists.
-	if ( bli_cntl_sub_prenode( cntl ) != NULL )
-	{
-		bli_cntl_mark_family( family, bli_cntl_sub_prenode( cntl ) );
-	}
-
-	// Recursively set the family field of the sub-tree rooted at the prenode,
-	// if it exists.
-	if ( bli_cntl_sub_node( cntl ) != NULL )
-	{
-		bli_cntl_mark_family( family, bli_cntl_sub_node( cntl ) );
-	}
-}
-
-// -----------------------------------------------------------------------------
-
-dim_t bli_cntl_calc_num_threads_in
-     (
-       const rntm_t* rntm,
-       const cntl_t* cntl
-     )
-{
-	dim_t n_threads_in = 1;
-
-	for ( ; cntl != NULL; cntl = bli_cntl_sub_node( cntl ) )
-	{
-		bszid_t bszid = bli_cntl_bszid( cntl );
-		dim_t   cur_way;
-
-		// We assume bszid is in {NC,KC,MC,NR,MR,KR} if it is not
-		// BLIS_NO_PART.
-		if ( bszid != BLIS_NO_PART )
-			cur_way = bli_rntm_ways_for( bszid, rntm );
-		else
-			cur_way = 1;
-
-		n_threads_in *= cur_way;
-	}
-
-	return n_threads_in;
+	bli_cntl_init_node( NULL, cntl );
 }
 
diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h
index 2c1aeb603..2218386af 100644
--- a/frame/base/bli_cntl.h
+++ b/frame/base/bli_cntl.h
@@ -39,13 +39,12 @@
 
 struct cntl_s
 {
-	// Basic fields (usually required).
-	opid_t         family;
-	bszid_t        bszid;
-	void_fp        var_func;
-	struct cntl_s* sub_prenode;
-	struct cntl_s* sub_node;
-	void*          params;
+	l3_var_oft var_func;
+	struct
+	{
+		dim_t          ways;
+		struct cntl_s* sub_node;
+	} sub_nodes[ BLIS_MAX_SUB_NODES ];
 };
 typedef struct cntl_s cntl_t;
 */
@@ -53,144 +52,65 @@ typedef struct cntl_s cntl_t;
 
 // -- Control tree prototypes --
 
-BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node
+BLIS_EXPORT_BLIS void bli_cntl_init_node
      (
-       pool_t* pool,
-       opid_t  family,
-       bszid_t bszid,
        void_fp var_func,
-       void*   params,
-       cntl_t* sub_node
-     );
-
-BLIS_EXPORT_BLIS void bli_cntl_free_node
-     (
-       pool_t* pool,
-       cntl_t* cntl
-     );
-
-BLIS_EXPORT_BLIS void bli_cntl_clear_node
-     (
        cntl_t* cntl
      );
 
-// -----------------------------------------------------------------------------
-
-BLIS_EXPORT_BLIS void bli_cntl_free
+BLIS_EXPORT_BLIS void bli_cntl_attach_sub_node
      (
-       pool_t* pool,
+       dim_t   ways,
+       cntl_t* sub_node,
        cntl_t* cntl
      );
 
-BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy
-     (
-             pool_t* pool,
-       const cntl_t* cntl
-     );
-
-BLIS_EXPORT_BLIS void bli_cntl_mark_family
+BLIS_EXPORT_BLIS void bli_cntl_clear_node
      (
-       opid_t  family,
        cntl_t* cntl
      );
 
 // -----------------------------------------------------------------------------
 
-dim_t bli_cntl_calc_num_threads_in
-     (
-       const rntm_t* rntm,
-       const cntl_t* cntl
-     );
-
-// -----------------------------------------------------------------------------
-
 // cntl_t query (fields only)
 
-BLIS_INLINE opid_t bli_cntl_family( const cntl_t* cntl )
-{
-	return cntl->family;
-}
-
-BLIS_INLINE bszid_t bli_cntl_bszid( const cntl_t* cntl )
-{
-	return cntl->bszid;
-}
-
 BLIS_INLINE void_fp bli_cntl_var_func( const cntl_t* cntl )
 {
 	return cntl->var_func;
 }
 
-BLIS_INLINE cntl_t* bli_cntl_sub_prenode( const cntl_t* cntl )
-{
-	return cntl->sub_prenode;
-}
-
-BLIS_INLINE cntl_t* bli_cntl_sub_node( const cntl_t* cntl )
-{
-	return cntl->sub_node;
-}
-
-BLIS_INLINE void* bli_cntl_params( const cntl_t* cntl )
+BLIS_INLINE dim_t bli_cntl_ways( dim_t which, const cntl_t* cntl )
 {
-	return cntl->params;
+	return cntl->sub_nodes[ which ].ways;
 }
 
-BLIS_INLINE uint64_t bli_cntl_params_size( const cntl_t* cntl )
+BLIS_INLINE cntl_t* bli_cntl_sub_node( dim_t which, const cntl_t* cntl )
 {
-	// The first 64 bytes is always the size of the params structure.
-	return *( ( uint64_t* )(cntl->params) );
+	return cntl->sub_nodes[ which ].sub_node;
 }
 
 // cntl_t query (complex)
 
-BLIS_INLINE bool bli_cntl_is_null( const cntl_t* cntl )
-{
-	return ( bool )
-	       ( cntl == NULL );
-}
-
 BLIS_INLINE bool bli_cntl_is_leaf( const cntl_t* cntl )
 {
 	return ( bool )
-	       ( bli_cntl_sub_node( cntl ) == NULL );
-}
-
-BLIS_INLINE bool bli_cntl_does_part( const cntl_t* cntl )
-{
-	return ( bool )
-	       ( bli_cntl_bszid( cntl ) != BLIS_NO_PART );
+	       ( bli_cntl_sub_node( 0, cntl ) == NULL );
 }
 
 // cntl_t modification
 
-BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl )
-{
-	cntl->family = family;
-}
-
-BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl )
-{
-	cntl->bszid = bszid;
-}
-
 BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl )
 {
 	cntl->var_func = var_func;
 }
 
-BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl )
-{
-	cntl->sub_prenode = sub_prenode;
-}
-
-BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl )
+BLIS_INLINE void bli_cntl_set_ways( dim_t which, dim_t ways, cntl_t* cntl )
 {
-	cntl->sub_node = sub_node;
+	cntl->sub_nodes[ which ].ways = ways;
 }
 
-BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl )
+BLIS_INLINE void bli_cntl_set_sub_node( dim_t which, cntl_t* sub_node, cntl_t* cntl )
 {
-	cntl->params = params;
+	cntl->sub_nodes[ which ].sub_node = sub_node;
 }
 
diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c
index 4635c11f4..36c481bbc 100644
--- a/frame/base/bli_cntx.c
+++ b/frame/base/bli_cntx.c
@@ -35,10 +35,72 @@
 
 #include "blis.h"
 
-void bli_cntx_clear( cntx_t* cntx )
+BLIS_EXPORT_BLIS err_t bli_cntx_init( cntx_t* cntx )
 {
-	// Fill the entire cntx_t structure with zeros.
-	memset( ( void* )cntx, 0, sizeof( cntx_t ) );
+	if ( cntx == NULL )
+		return BLIS_NULL_POINTER;
+
+	// Initialize the stacks in order. If one fails, unwind by finalizing the
+	// stacks that were already initialized (in reverse order) so that a
+	// failed init does not leave the context holding un-finalized stacks.
+	err_t error;
+
+	error = bli_stack_init( sizeof( blksz_t ), 32, 32, BLIS_NUM_BLKSZS, &cntx->blkszs );
+	if ( error != BLIS_SUCCESS ) goto fail_blkszs;
+	error = bli_stack_init( sizeof( bszid_t ), 32, 32, BLIS_NUM_BLKSZS, &cntx->bmults );
+	if ( error != BLIS_SUCCESS ) goto fail_bmults;
+	error = bli_stack_init( sizeof( func_t ), 32, 32, BLIS_NUM_UKRS, &cntx->ukrs );
+	if ( error != BLIS_SUCCESS ) goto fail_ukrs;
+	error = bli_stack_init( sizeof( func2_t ), 32, 32, BLIS_NUM_UKR2S, &cntx->ukr2s );
+	if ( error != BLIS_SUCCESS ) goto fail_ukr2s;
+	error = bli_stack_init( sizeof( mbool_t ), 32, 32, BLIS_NUM_UKR_PREFS, &cntx->ukr_prefs );
+	if ( error != BLIS_SUCCESS ) goto fail_ukr_prefs;
+	error = bli_stack_init( sizeof( void_fp ), 32, 32, BLIS_NUM_LEVEL3_OPS, &cntx->l3_sup_handlers );
+	if ( error != BLIS_SUCCESS ) goto fail_l3_sup_handlers;
+
+	return BLIS_SUCCESS;
+
+	// Unwind path: error still holds the code of the failed initialization.
+fail_l3_sup_handlers: bli_stack_finalize( &cntx->ukr_prefs );
+fail_ukr_prefs:       bli_stack_finalize( &cntx->ukr2s );
+fail_ukr2s:           bli_stack_finalize( &cntx->ukrs );
+fail_ukrs:            bli_stack_finalize( &cntx->bmults );
+fail_bmults:          bli_stack_finalize( &cntx->blkszs );
+fail_blkszs:          return error;
+}
+
+// Release all of the stacks owned by cntx. Returns BLIS_NULL_POINTER if cntx
+// is NULL; otherwise the first bli_stack_finalize() error, or BLIS_SUCCESS.
+BLIS_EXPORT_BLIS err_t bli_cntx_free( cntx_t* cntx )
+{
+	if ( cntx == NULL )
+		return BLIS_NULL_POINTER;
+
+	// Finalize every stack even when an earlier one fails, so that a single
+	// failure does not leave the remaining stacks unreleased. Only the first
+	// error encountered is reported.
+	err_t first_error = BLIS_SUCCESS;
+	err_t error;
+
+	error = bli_stack_finalize( &cntx->blkszs );
+	if ( first_error == BLIS_SUCCESS ) first_error = error;
+
+	error = bli_stack_finalize( &cntx->bmults );
+	if ( first_error == BLIS_SUCCESS ) first_error = error;
+
+	error = bli_stack_finalize( &cntx->ukrs );
+	if ( first_error == BLIS_SUCCESS ) first_error = error;
+
+	error = bli_stack_finalize( &cntx->ukr2s );
+	if ( first_error == BLIS_SUCCESS ) first_error = error;
+
+	error = bli_stack_finalize( &cntx->ukr_prefs );
+	if ( first_error == BLIS_SUCCESS ) first_error = error;
+
+	error = bli_stack_finalize( &cntx->l3_sup_handlers );
+	if ( first_error == BLIS_SUCCESS ) first_error = error;
+
+	return first_error;
 }
 
 // -----------------------------------------------------------------------------
@@ -64,15 +126,6 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... )
 	   );
 	*/
 
-	// Save the execution type into the context.
-	bli_cntx_set_method( BLIS_NAT, cntx );
-
-	// Query the context for the addresses of:
-	// - the blocksize object array
-	// - the blocksize multiple array
-	blksz_t* cntx_blkszs = cntx->blkszs;
-	bszid_t* cntx_bmults = cntx->bmults;
-
 	// Initialize variable argument environment.
 	va_list args;
 	va_start( args, cntx );
@@ -97,13 +150,7 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... )
 		// Copy the blksz_t object contents into the appropriate
 		// location within the context's blksz_t array. Do the same
 		// for the blocksize multiple id.
-		//cntx_blkszs[ bs_id ] = *blksz;
-		//bli_blksz_copy( blksz, cntx_blksz );
-		blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
-		bli_blksz_copy_if_nonneg( blksz, cntx_blksz );
-
-		// Copy the blocksize multiple id into the context.
-		cntx_bmults[ bs_id ] = bm_id;
+		bli_cntx_set_blksz( bs_id, blksz, bm_id, cntx );
 	}
 
 	// Shutdown variable argument environment and clean up stack.
@@ -112,82 +159,49 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... )
 
 // -----------------------------------------------------------------------------
 
-void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* cntx, ... )
+void bli_cntx_set_ukrs( cntx_t* cntx , ... )
 {
+	// This function can be called from the bli_cntx_init_*() function for
+	// a particular architecture if the kernel developer wishes to use
+	// non-default microkernels. It should be called after
+	// bli_cntx_init_<subconfig>_ref() so that the context begins with
+	// default microkernels across all datatypes.
+
 	/* Example prototypes:
 
-	   void bli_gks_cntx_set_ind_blkszs
+	   void bli_cntx_set_ukrs
 	   (
-	     ind_t   method != BLIS_NAT,
-	     num_t   dt,
 	     cntx_t* cntx,
-	     bszid_t bs0_id, dim_t def_scalr0, dim_t max_scalr0,
-	     bszid_t bs1_id, dim_t def_scalr1, dim_t max_scalr1,
-	     bszid_t bs2_id, dim_t def_scalr2, dim_t max_scalr2,
+	     ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp,
+	     ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp,
+	     ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp,
 	     ...,
 	     BLIS_VA_END
 	   );
-
-		NOTE: This function modifies an existing context that is presumed
-		to have been initialized for native execution.
 	*/
 
-	// Project the given datatype to the real domain. This will be used later on.
-	num_t dt_real = bli_dt_proj_to_real( dt );
-
-	// Return early if called with BLIS_NAT.
-	if ( method == BLIS_NAT ) return;
-
-	// Save the execution type into the context.
-	bli_cntx_set_method( method, cntx );
-
 	// Initialize variable argument environment.
-	va_list args;
+	va_list   args;
 	va_start( args, cntx );
 
-	// Process blocksizes until we get a BLIS_VA_END.
+	// Process ukernels until BLIS_VA_END is reached.
 	while ( true )
 	{
-		int bs_id0 = va_arg( args, int );
+		const int ukr_id0 = va_arg( args, int );
 
-		// If we find a bszid_t id of BLIS_VA_END, then we are done.
-		if ( bs_id0 == BLIS_VA_END ) break;
+		// If we find a ukernel id of BLIS_VA_END, then we are done.
+		if ( ukr_id0 == BLIS_VA_END ) break;
 
 		// Here, we query the variable argument list for:
-		// - the bszid_t of the blocksize we're about to process (already done),
-		// - the scalars we wish to apply to the real blocksizes to
-		//   come up with the induced complex blocksizes (for default
-		//   and maximum blocksizes).
-		bszid_t bs_id = ( bszid_t )bs_id0;
-		double  dsclr = ( double  )va_arg( args, double );
-		double  msclr = ( double  )va_arg( args, double );
-
-		// Query the context for the blksz_t object assoicated with the
-		// current blocksize id, and also query the object corresponding
-		// to the blocksize multiple.
-		blksz_t* cntx_blksz = ( blksz_t* )bli_cntx_get_blksz( bs_id, cntx );
-
-		// Copy the real domain value of the blksz_t object into the
-		// corresponding complex domain slot of the same object.
-		bli_blksz_copy_dt( dt_real, cntx_blksz, dt, cntx_blksz );
-
-		// If the default blocksize scalar is non-unit, we need to scale
-		// the complex domain default blocksizes.
-		if ( dsclr != 1.0 )
-		{
-			// Scale the default blocksize value corresponding to the given
-			// datatype.
-			bli_blksz_scale_def( 1, ( dim_t )dsclr, dt, cntx_blksz );
-		}
-
-		// Similarly, if the maximum blocksize scalar is non-unit, we need
-		// to scale the complex domain maximum blocksizes.
-		if ( msclr != 1.0 )
-		{
-			// Scale the maximum blocksize value corresponding to the given
-			// datatype.
-			bli_blksz_scale_max( 1, ( dim_t )msclr, dt, cntx_blksz );
-		}
+		// - the ukr_t of the kernel we're about to process (already done),
+		// - the datatype of the kernel, and
+		// - the kernel function pointer
+		const ukr_t   ukr_id = ( ukr_t   )ukr_id0;
+		const num_t   ukr_dt = ( num_t   )va_arg( args, num_t   );
+		      void_fp ukr_fp = ( void_fp )va_arg( args, void_fp );
+
+		// Store the ukernel function pointer into the context.
+		bli_cntx_set_ukr_dt( ukr_fp, ukr_dt, ukr_id, cntx );
 	}
 
 	// Shutdown variable argument environment and clean up stack.
@@ -196,7 +210,7 @@ void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* cntx, ... )
 
 // -----------------------------------------------------------------------------
 
-void bli_cntx_set_ukrs( cntx_t* cntx , ... )
+void bli_cntx_set_ukr2s( cntx_t* cntx , ... )
 {
 	// This function can be called from the bli_cntx_init_*() function for
 	// a particular architecture if the kernel developer wishes to use
@@ -206,20 +220,17 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... )
 
 	/* Example prototypes:
 
-	   void bli_cntx_set_ukrs
+	   void bli_cntx_set_ukr2s
 	   (
 	     cntx_t* cntx,
-	     ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp,
-	     ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp,
-	     ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp,
+	     ukr_t ukr0_id, num_t dt1_0, num_t dt2_0, void_fp ukr0_fp,
+	     ukr_t ukr1_id, num_t dt1_1, num_t dt2_1, void_fp ukr1_fp,
+	     ukr_t ukr2_id, num_t dt1_2, num_t dt2_2, void_fp ukr2_fp,
 	     ...,
 	     BLIS_VA_END
 	   );
 	*/
 
-	// Query the context for the address of the ukernel func_t array
-	func_t*  cntx_ukrs = cntx->ukrs;
-
 	// Initialize variable argument environment.
 	va_list   args;
 	va_start( args, cntx );
@@ -236,37 +247,13 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... )
 		// - the ukr_t of the kernel we're about to process (already done),
 		// - the datatype of the kernel, and
 		// - the kernel function pointer
-		const ukr_t   ukr_id = ( ukr_t   )ukr_id0;
-		const num_t   ukr_dt = ( num_t   )va_arg( args, num_t   );
-		      void_fp ukr_fp = ( void_fp )va_arg( args, void_fp );
-
-		// Index into the func_t and mbool_t for the current kernel id
-		// being processed.
-		func_t* ukrs = &cntx_ukrs[ ukr_id ];
+		const ukr_t   ukr_id  = ( ukr_t  )ukr_id0;
+		const num_t   ukr_dt1 = ( num_t   )va_arg( args, num_t   );
+		const num_t   ukr_dt2 = ( num_t   )va_arg( args, num_t   );
+		      void_fp ukr_fp  = ( void_fp )va_arg( args, void_fp );
 
 		// Store the ukernel function pointer into the context.
-		// Notice that we redundantly store the native
-		// ukernel address in both the native and virtual ukernel slots
-		// in the context. This is standard practice when creating a
-		// native context. (Induced method contexts will overwrite the
-		// virtual function pointer with the address of the appropriate
-		// virtual ukernel.)
-		bli_func_set_dt( ukr_fp, ukr_dt, ukrs );
-
-		// Locate the virtual ukernel func_t pointer that corresponds to the
-		// ukernel id provided by the caller.
-		switch ( ukr_id )
-		{
-			case BLIS_GEMM_UKR:       ukrs = &cntx_ukrs[ BLIS_GEMM_VIR_UKR ]; break;
-			case BLIS_GEMMTRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_L_VIR_UKR ]; break;
-			case BLIS_GEMMTRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_U_VIR_UKR ]; break;
-			case BLIS_TRSM_L_UKR:     ukrs = &cntx_ukrs[ BLIS_TRSM_L_VIR_UKR ]; break;
-			case BLIS_TRSM_U_UKR:     ukrs = &cntx_ukrs[ BLIS_TRSM_U_VIR_UKR ]; break;
-			default:                  ukrs = NULL; break;
-		};
-
-		if ( ukrs )
-			bli_func_set_dt( ukr_fp, ukr_dt, ukrs );
+		bli_cntx_set_ukr2_dt( ukr_fp, ukr_dt1, ukr_dt2, ukr_id, cntx );
 	}
 
 	// Shutdown variable argument environment and clean up stack.
@@ -296,9 +283,6 @@ void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... )
 	   );
 	*/
 
-	// Query the context for the address of the ukernel preference mbool_t array
-	mbool_t* cntx_ukr_prefs = cntx->ukr_prefs;
-
 	// Initialize variable argument environment.
 	va_list   args;
 	va_start( args, cntx );
@@ -319,12 +303,8 @@ void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... )
 		const num_t      ukr_pref_dt = ( num_t      )va_arg( args, num_t );
 		const bool       ukr_pref    = ( bool       )va_arg( args, int );
 
-		// Index into the func_t and mbool_t for the current kernel id
-		// being processed.
-		mbool_t* ukr_prefs = &cntx_ukr_prefs[ ukr_pref_id ];
-
 		// Store the ukernel preference value into the context.
-		bli_mbool_set_dt( ukr_pref, ukr_pref_dt, ukr_prefs );
+		bli_cntx_set_ukr_pref_dt( ukr_pref, ukr_pref_dt, ukr_pref_id, cntx );
 	}
 
 	// Shutdown variable argument environment and clean up stack.
@@ -354,9 +334,6 @@ void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... )
 	   );
 	*/
 
-	// Query the context for the address of the l3 sup handlers array.
-	void_fp* cntx_l3_sup_handlers = cntx->l3_sup_handlers;
-
 	// Initialize variable argument environment.
 	va_list   args;
 	va_start( args, cntx );
@@ -375,15 +352,98 @@ void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... )
 		const opid_t  op_id = ( opid_t  )op_id0;
 		      void_fp op_fp = ( void_fp )va_arg( args, void_fp );
 
+		if ( op_id >= BLIS_NUM_LEVEL3_OPS )
+			bli_abort();
+
 		// Store the sup handler function pointer into the slot for the
 		// specified operation id.
-		cntx_l3_sup_handlers[ op_id ] = op_fp;
+		void_fp* l3_sup_handler;
+		bli_stack_get( op_id, ( void** )&l3_sup_handler, &cntx->l3_sup_handlers );
+		*l3_sup_handler = op_fp;
 	}
 
 	// Shutdown variable argument environment and clean up stack.
 	va_end( args );
 }
 
+// -----------------------------------------------------------------------------
+
+// Register a new blocksize: push one entry onto each of the blkszs and
+// bmults stacks, write the shared entry id to bs_id, and, when blksz is
+// non-NULL, fill the entry in through bli_cntx_set_blksz().
+err_t bli_cntx_register_blksz( siz_t* bs_id, const blksz_t* blksz, bszid_t bmult_id, cntx_t* cntx )
+{
+	siz_t blksz_slot;
+	siz_t bmult_slot;
+	err_t status;
+
+	status = bli_stack_push( &blksz_slot, &cntx->blkszs );
+	if ( status != BLIS_SUCCESS ) return status;
+
+	status = bli_stack_push( &bmult_slot, &cntx->bmults );
+	if ( status != BLIS_SUCCESS ) return status;
+
+	// Both stacks are looked up with the same id elsewhere, so the two
+	// pushes must have produced the same id.
+	if ( blksz_slot != bmult_slot ) return BLIS_INVALID_UKR_ID;
+
+	*bs_id = blksz_slot;
+
+	// A NULL blksz reserves the entry without filling it in.
+	if ( blksz == NULL ) return BLIS_SUCCESS;
+
+	return bli_cntx_set_blksz( blksz_slot, blksz, bmult_id, cntx );
+}
+
+// Push a new entry onto cntx->ukrs; bli_stack_push() writes its id to ukr_id.
+// When ukr is non-NULL, it is then copied in through bli_cntx_set_ukr().
+err_t bli_cntx_register_ukr( siz_t* ukr_id, const func_t* ukr, cntx_t* cntx )
+{
+	const err_t status = bli_stack_push( ukr_id, &cntx->ukrs );
+
+	if ( status != BLIS_SUCCESS )
+		return status;
+
+	// A NULL ukr reserves the entry without filling it in.
+	if ( ukr == NULL )
+		return BLIS_SUCCESS;
+
+	return bli_cntx_set_ukr( *ukr_id, ukr, cntx );
+}
+
+// Push a new entry onto cntx->ukr2s; bli_stack_push() writes its id to ukr_id.
+// When ukr is non-NULL, it is then copied in through bli_cntx_set_ukr2().
+err_t bli_cntx_register_ukr2( siz_t* ukr_id, const func2_t* ukr, cntx_t* cntx )
+{
+	const err_t status = bli_stack_push( ukr_id, &cntx->ukr2s );
+
+	if ( status != BLIS_SUCCESS )
+		return status;
+
+	// A NULL ukr reserves the entry without filling it in.
+	if ( ukr == NULL )
+		return BLIS_SUCCESS;
+
+	return bli_cntx_set_ukr2( *ukr_id, ukr, cntx );
+}
+
+// Push a new entry onto cntx->ukr_prefs; its id is written to ukr_pref_id.
+// When ukr_pref is non-NULL, it is copied in via bli_cntx_set_ukr_pref().
+err_t bli_cntx_register_ukr_pref( siz_t* ukr_pref_id, const mbool_t* ukr_pref, cntx_t* cntx )
+{
+	const err_t status = bli_stack_push( ukr_pref_id, &cntx->ukr_prefs );
+
+	if ( status != BLIS_SUCCESS )
+		return status;
+
+	// A NULL ukr_pref reserves the entry without filling it in.
+	if ( ukr_pref == NULL )
+		return BLIS_SUCCESS;
+
+	return bli_cntx_set_ukr_pref( *ukr_pref_id, ukr_pref, cntx );
+}
+
+
 // -----------------------------------------------------------------------------
 
 void bli_cntx_print( const cntx_t* cntx )
@@ -433,11 +493,5 @@ void bli_cntx_print( const cntx_t* cntx )
 		        bli_mbool_get_dt( BLIS_DCOMPLEX, ukr_pref )
 		      );
 	}
-
-	{
-		ind_t method = bli_cntx_method( cntx );
-
-		printf( "ind method   : %lu\n", ( unsigned long )method );
-	}
 }
 
diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h
index 90050a5ed..59c8efbf7 100644
--- a/frame/base/bli_cntx.h
+++ b/frame/base/bli_cntx.h
@@ -43,51 +43,30 @@
 /*
 typedef struct cntx_s
 {
-	blksz_t   blkszs[ BLIS_NUM_BLKSZS ];
-	bszid_t   bmults[ BLIS_NUM_BLKSZS ];
+	stck_t blkszs; // blksz_t
+	stck_t bmults; // bszid_t
 
-	func_t    ukrs[ BLIS_NUM_UKRS ];
-	mbool_t   ukr_prefs[ BLIS_NUM_UKR_PREFS ];
-
-	void_fp   l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ];
-
-	ind_t     method;
+	stck_t ukrs; // func_t
+	stck_t ukr2s; // func2_t
+	stck_t ukr_prefs; // mbool_t
 
+	stck_t l3_sup_handlers; // void_fp
 } cntx_t;
 */
 
 // -----------------------------------------------------------------------------
 
-//
-// -- cntx_t query (fields only) -----------------------------------------------
-//
-
-BLIS_INLINE ind_t bli_cntx_method( const cntx_t* cntx )
-{
-	return cntx->method;
-}
-
-// -----------------------------------------------------------------------------
-
-//
-// -- cntx_t modification (fields only) ----------------------------------------
-//
-
-BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx )
-{
-	cntx->method = method;
-}
-
-// -----------------------------------------------------------------------------
-
 //
 // -- cntx_t query (complex) ---------------------------------------------------
 //
 
 BLIS_INLINE const blksz_t* bli_cntx_get_blksz( bszid_t bs_id, const cntx_t* cntx )
 {
-	// Return the address of the blksz_t identified by bs_id.
-	return &cntx->blkszs[ bs_id ];
+	const blksz_t* blksz;
+	err_t error = bli_stack_get( bs_id, ( void** )&blksz, &cntx->blkszs );
+	if ( error != BLIS_SUCCESS )
+		bli_check_error_code( error );
+	return blksz;
 }
 
 BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx )
@@ -110,7 +89,11 @@ BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, const cntx
 
 BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, const cntx_t* cntx )
 {
-	return cntx->bmults[ bs_id ];
+	const bszid_t* bsz;
+	err_t error = bli_stack_get( bs_id, ( void** )&bsz, &cntx->bmults );
+	if ( error != BLIS_SUCCESS )
+		bli_check_error_code( error );
+	return *bsz;
 }
 
 BLIS_INLINE const blksz_t* bli_cntx_get_bmult( bszid_t bs_id, const cntx_t* cntx )
@@ -131,38 +114,63 @@ BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, const cntx_t*
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE const func_t* bli_cntx_get_ukrs( ukr_t ukr_id, const cntx_t* cntx )
+BLIS_INLINE const func2_t* bli_cntx_get_ukr2s( ukr_t ukr_id, const cntx_t* cntx )
 {
-	return &cntx->ukrs[ ukr_id ];
+	const func2_t* ukr;
+	err_t error = bli_stack_get( bli_ker_idx( ukr_id ), ( void** )&ukr, &cntx->ukr2s );
+	if ( error != BLIS_SUCCESS )
+		bli_check_error_code( error );
+	return ukr;
 }
 
-BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx )
+BLIS_INLINE void_fp bli_cntx_get_ukr2_dt( num_t dt1, num_t dt2, ukr_t ukr_id, const cntx_t* cntx )
 {
-	const func_t* func = bli_cntx_get_ukrs( ukr_id, cntx );
+	const func2_t* func = bli_cntx_get_ukr2s( ukr_id, cntx );
 
-	return bli_func_get_dt( dt, func );
+	return bli_func2_get_dt( dt1, dt2, func );
 }
 
-BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx )
+// -----------------------------------------------------------------------------
+
+BLIS_INLINE const func_t* bli_cntx_get_ukrs( ukr_t ukr_id, const cntx_t* cntx )
 {
-	switch ( ukr_id )
+	if ( bli_ker_ntype( ukr_id ) == 2 )
 	{
-		case BLIS_GEMM_UKR:       ukr_id = BLIS_GEMM_VIR_UKR; break;
-		case BLIS_TRSM_L_UKR:     ukr_id = BLIS_TRSM_L_VIR_UKR; break;
-		case BLIS_TRSM_U_UKR:     ukr_id = BLIS_TRSM_U_VIR_UKR; break;
-		case BLIS_GEMMTRSM_L_UKR: ukr_id = BLIS_GEMMTRSM_L_VIR_UKR; break;
-		case BLIS_GEMMTRSM_U_UKR: ukr_id = BLIS_GEMMTRSM_U_VIR_UKR; break;
-		default: break;
-	};
+		return ( const func_t* )bli_cntx_get_ukr2s( ( ukr_t )ukr_id, cntx );
+	}
+	else
+	{
+		const func_t* ukr;
+		err_t error = bli_stack_get( bli_ker_idx( ukr_id ), ( void** )&ukr, &cntx->ukrs );
+		if ( error != BLIS_SUCCESS )
+			bli_check_error_code( error );
+		return ukr;
+	}
+}
 
-	return bli_cntx_get_ukr_dt( dt, ukr_id, cntx );
+// Return the kernel function pointer stored for ukr_id at datatype dt. Ids
+// for which bli_ker_ntype() reports 2 are looked up in the two-datatype
+// table, with dt supplied for both datatypes.
+BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx )
+{
+	if ( bli_ker_ntype( ukr_id ) == 2 )
+		return bli_cntx_get_ukr2_dt( dt, dt, ukr_id, cntx );
+
+	// Otherwise the id refers to a single-datatype func_t entry.
+	const func_t* ukr = bli_cntx_get_ukrs( ukr_id, cntx );
+
+	return bli_func_get_dt( dt, ukr );
 }
 
 // -----------------------------------------------------------------------------
 
 BLIS_INLINE const mbool_t* bli_cntx_get_ukr_prefs( ukr_pref_t pref_id, const cntx_t* cntx )
 {
-	return &cntx->ukr_prefs[ pref_id ];
+	const mbool_t* ukr_prefs;
+	err_t error = bli_stack_get( pref_id, ( void** )&ukr_prefs, &cntx->ukr_prefs );
+	if ( error != BLIS_SUCCESS )
+		bli_check_error_code( error );
+	return ukr_prefs;
 }
 
 BLIS_INLINE bool bli_cntx_get_ukr_prefs_dt( num_t dt, ukr_pref_t ukr_id, const cntx_t* cntx )
@@ -187,7 +195,11 @@ BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_
 
 BLIS_INLINE void_fp bli_cntx_get_l3_sup_handler( opid_t op, const cntx_t* cntx )
 {
-	return cntx->l3_sup_handlers[ op ];
+	const void_fp* l3_handler;
+	err_t error = bli_stack_get( op, ( void** )&l3_handler, &cntx->l3_sup_handlers );
+	if ( error != BLIS_SUCCESS )
+		bli_check_error_code( error );
+	return *l3_handler;
 }
 
 // -----------------------------------------------------------------------------
@@ -200,15 +212,10 @@ BLIS_INLINE bool bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, const cnt
 	// Get the correct preference from the kernel ID.
 	switch ( ukr_id )
 	{
-		case BLIS_GEMM_VIR_UKR: // fallthrough
 		case BLIS_GEMM_UKR: ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF; break;
-		case BLIS_TRSM_L_VIR_UKR: // fallthrough
 		case BLIS_TRSM_L_UKR: ukr_pref_id = BLIS_TRSM_L_UKR_ROW_PREF; break;
-		case BLIS_TRSM_U_VIR_UKR: // fallthrough
 		case BLIS_TRSM_U_UKR: ukr_pref_id = BLIS_TRSM_U_UKR_ROW_PREF; break;
-		case BLIS_GEMMTRSM_L_VIR_UKR: // fallthrough
 		case BLIS_GEMMTRSM_L_UKR: ukr_pref_id = BLIS_GEMMTRSM_L_UKR_ROW_PREF; break;
-		case BLIS_GEMMTRSM_U_VIR_UKR: // fallthrough
 		case BLIS_GEMMTRSM_U_UKR: ukr_pref_id = BLIS_GEMMTRSM_U_UKR_ROW_PREF; break;
 		case BLIS_GEMMSUP_RRR_UKR: ukr_pref_id = BLIS_GEMMSUP_RRR_UKR_ROW_PREF; break;
 		case BLIS_GEMMSUP_RRC_UKR: ukr_pref_id = BLIS_GEMMSUP_RRC_UKR_ROW_PREF; break;
@@ -222,21 +229,6 @@ BLIS_INLINE bool bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, const cnt
 		default: break; // TODO: should be an error condition
 	}
 
-	// For virtual ukernels during non-native execution, use the real projection of
-	// the datatype.
-	if ( bli_cntx_method( cntx ) != BLIS_NAT )
-	{
-		switch ( ukr_id )
-		{
-			case BLIS_GEMM_VIR_UKR: // fallthrough
-			case BLIS_TRSM_L_VIR_UKR: // fallthrough
-			case BLIS_TRSM_U_VIR_UKR: // fallthrough
-			case BLIS_GEMMTRSM_L_VIR_UKR: // fallthrough
-			case BLIS_GEMMTRSM_U_VIR_UKR: dt = bli_dt_proj_to_real( dt ); break;
-			default: break;
-		}
-	}
-
 	return bli_cntx_get_ukr_prefs_dt( dt, ukr_pref_id, cntx );
 }
 
@@ -270,35 +262,85 @@ BLIS_INLINE bool bli_cntx_dislikes_storage_of( const obj_t* obj, ukr_t ukr_id, c
 // NOTE: The framework does not use any of the following functions. We provide
 // them in order to facilitate creating/modifying custom contexts.
 
-BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx )
+BLIS_INLINE err_t bli_cntx_set_blksz( bszid_t bs_id, const blksz_t* blksz, bszid_t mult_id, cntx_t* cntx )
 {
-	cntx->blkszs[ bs_id ] = *blksz;
-	cntx->bmults[ bs_id ] = mult_id;
+	blksz_t* cntx_blksz;
+	err_t error = bli_stack_get( bs_id, ( void** )&cntx_blksz, &cntx->blkszs );
+	if ( error != BLIS_SUCCESS )
+		return error;
+
+	bszid_t* cntx_mult_id;
+	error = bli_stack_get( bs_id, ( void** )&cntx_mult_id, &cntx->bmults );
+	if ( error != BLIS_SUCCESS )
+		return error;
+
+	bli_blksz_copy_if_nonneg( blksz, cntx_blksz );
+	*cntx_mult_id = mult_id;
+
+	return BLIS_SUCCESS;
 }
 
 BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx )
 {
-	bli_blksz_set_def( bs, dt, &cntx->blkszs[ bs_id ] );
+	bli_blksz_set_def( bs, dt, ( blksz_t* )bli_cntx_get_blksz( bs_id, cntx ) );
 }
 
 BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx )
 {
-	bli_blksz_set_max( bs, dt, &cntx->blkszs[ bs_id ]);
+	bli_blksz_set_max( bs, dt, ( blksz_t* )bli_cntx_get_blksz( bs_id, cntx ) );
 }
 
-BLIS_INLINE void bli_cntx_set_ukr( ukr_t ukr_id, const func_t* func, cntx_t* cntx )
+BLIS_INLINE err_t bli_cntx_set_ukr2( ukr_t ukr_id, const func2_t* func, cntx_t* cntx )
 {
-	cntx->ukrs[ ukr_id ] = *func;
+	*( func2_t* )bli_cntx_get_ukr2s( ukr_id, cntx ) = *func;
+	return BLIS_SUCCESS;
+}
+
+BLIS_INLINE void bli_cntx_set_ukr2_dt( void_fp fp, num_t dt1, num_t dt2, ukr_t ker_id, cntx_t* cntx )
+{
+	bli_func2_set_dt( fp, dt1, dt2, ( func2_t* )bli_cntx_get_ukr2s( ker_id, cntx ) );
+}
+
+BLIS_INLINE err_t bli_cntx_set_ukr( ukr_t ukr_id, const func_t* func, cntx_t* cntx )
+{
+	*( func_t* )bli_cntx_get_ukrs( ukr_id, cntx ) = *func;
+	return BLIS_SUCCESS;
 }
 
 BLIS_INLINE void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, ukr_t ker_id, cntx_t* cntx )
 {
-	bli_func_set_dt( fp, dt, &cntx->ukrs[ ker_id ] );
+	if ( bli_ker_ntype( ker_id ) == 2 )
+	{
+		bli_cntx_set_ukr2_dt( fp, dt, dt, (ukr_t)ker_id, cntx );
+	}
+	else
+	{
+		bli_func_set_dt( fp, dt, ( func_t* )bli_cntx_get_ukrs( ker_id, cntx ) );
+	}
+}
+
+BLIS_INLINE err_t bli_cntx_set_ukr_pref( ukr_pref_t ukr_id, const mbool_t* prefs, cntx_t* cntx )
+{
+	*( mbool_t* )bli_cntx_get_ukr_prefs( ukr_id, cntx ) = *prefs;
+	return BLIS_SUCCESS;
 }
 
-BLIS_INLINE void bli_cntx_set_ukr_pref( ukr_pref_t ukr_id, mbool_t* prefs, cntx_t* cntx )
+BLIS_INLINE err_t bli_cntx_set_ukr_pref_dt( bool pref, num_t dt, ukr_pref_t ukr_id, cntx_t* cntx )
 {
-	cntx->ukr_prefs[ ukr_id ] = *prefs;
+	bli_mbool_set_dt( pref, dt, ( mbool_t* )bli_cntx_get_ukr_prefs( ukr_id, cntx ));
+	return BLIS_SUCCESS;
+}
+
+// Store handler in the l3_sup_handlers entry for opid; a bli_stack_get() error is returned as-is.
+BLIS_INLINE err_t bli_cntx_set_l3_sup_handler( opid_t opid, void_fp handler, cntx_t* cntx )
+{
+	void_fp*    slot;
+	const err_t status = bli_stack_get( opid, ( void** )&slot, &cntx->l3_sup_handlers );
+
+	// Only write through slot when the lookup succeeded.
+	if ( status == BLIS_SUCCESS )
+		*slot = handler;
+
+	return status;
 }
 
 BLIS_INLINE void_fp bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, const cntx_t* cntx )
@@ -319,7 +361,7 @@ BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, con
 		case BLIS_NC: bs_id = BLIS_NC_SUP; break;
 		case BLIS_KC: bs_id = BLIS_KC_SUP; break;
 		default: break;
-	};
+	}
 
 	return bli_cntx_get_blksz_def_dt( dt, bs_id, cntx );
 }
@@ -335,7 +377,7 @@ BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, con
 		case BLIS_NC: bs_id = BLIS_NC_SUP; break;
 		case BLIS_KC: bs_id = BLIS_KC_SUP; break;
 		default: break;
-	};
+	}
 
 	return bli_cntx_get_blksz_max_dt( dt, bs_id, cntx );
 }
@@ -344,19 +386,28 @@ BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, con
 
 // Function prototypes
 
-BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx );
+BLIS_EXPORT_BLIS err_t bli_cntx_init( cntx_t* cntx );
 
-BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( cntx_t* cntx, ... );
+BLIS_EXPORT_BLIS err_t bli_cntx_free( cntx_t* cntx );
 
-BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* cntx, ... );
+BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( cntx_t* cntx, ... );
 
 BLIS_EXPORT_BLIS void bli_cntx_set_ukrs( cntx_t* cntx, ... );
+BLIS_EXPORT_BLIS void bli_cntx_set_ukr2s( cntx_t* cntx, ... );
 BLIS_EXPORT_BLIS void bli_cntx_set_ukr_prefs( cntx_t* cntx, ... );
 
 BLIS_EXPORT_BLIS void bli_cntx_print( const cntx_t* cntx );
 
 BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... );
 
+BLIS_EXPORT_BLIS err_t bli_cntx_register_blksz( siz_t* bs_id, const blksz_t* blksz, bszid_t bmult_id, cntx_t* cntx );
+
+BLIS_EXPORT_BLIS err_t bli_cntx_register_ukr( siz_t* ukr_id, const func_t* ukr, cntx_t* cntx );
+
+BLIS_EXPORT_BLIS err_t bli_cntx_register_ukr2( siz_t* ukr_id, const func2_t* ukr, cntx_t* cntx );
+
+BLIS_EXPORT_BLIS err_t bli_cntx_register_ukr_pref( siz_t* ukr_pref_id, const mbool_t* ukr_pref, cntx_t* cntx );
+
 
 #endif
 
diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c
index a9b081e2b..08555b194 100644
--- a/frame/base/bli_cpuid.c
+++ b/frame/base/bli_cpuid.c
@@ -1035,7 +1035,7 @@ static uint32_t get_coretype
 {
 	int implementer = 0x00, part = 0x000;
 	*features = FEATURE_NEON;
-    bool has_sve = FALSE;
+	bool has_sve = FALSE; ( void )has_sve;
 
 #ifdef __linux__
 	if ( getauxval( AT_HWCAP ) & HWCAP_CPUID )
diff --git a/frame/base/bli_error.c b/frame/base/bli_error.c
index f4933d962..415e9ae05 100644
--- a/frame/base/bli_error.c
+++ b/frame/base/bli_error.c
@@ -42,6 +42,8 @@ static const char *bli_error_string[-BLIS_ERROR_CODE_MAX] =
 	[-BLIS_UNDEFINED_ERROR_CODE]                 = "Undefined error code.",
 	[-BLIS_NULL_POINTER]                         = "Encountered unexpected null pointer.",
 	[-BLIS_NOT_YET_IMPLEMENTED]                  = "Requested functionality not yet implemented.",
+	[-BLIS_OUT_OF_BOUNDS]                        = "Requested index is out of bounds.",
+	[-BLIS_LOCK_FAILURE]                         = "Failed to obtain lock.",
 
 	[-BLIS_INVALID_SIDE]                         = "Invalid side parameter value.",
 	[-BLIS_INVALID_UPLO]                         = "Invalid uplo_t parameter value.",
@@ -90,6 +92,7 @@ static const char *bli_error_string[-BLIS_ERROR_CODE_MAX] =
 	[-BLIS_UNEXPECTED_NULL_CONTROL_TREE]         = "Encountered unexpected null control tree node.",
 
 	[-BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK] = "Pack schema not yet supported/implemented for use with unpacking.",
+	[-BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_PART]   = "Pack schema not yet supported/implemented for use with partitioning.",
 
 	[-BLIS_EXPECTED_NONNULL_OBJECT_BUFFER]       = "Encountered object with non-zero dimensions containing null buffer.",
 
@@ -112,6 +115,10 @@ static const char *bli_error_string[-BLIS_ERROR_CODE_MAX] =
 	[-BLIS_NC_MAX_NONMULTIPLE_OF_NR]             = "Maximum NC is non-multiple of NR for one or more datatypes.",
 	[-BLIS_KC_DEF_NONMULTIPLE_OF_KR]             = "Default KC is non-multiple of KR for one or more datatypes.",
 	[-BLIS_KC_MAX_NONMULTIPLE_OF_KR]             = "Maximum KC is non-multiple of KR for one or more datatypes.",
+	[-BLIS_MR_NOT_EVEN_FOR_REAL_TYPE]            = "MR is not an even number for one or more real-domain datatypes.",
+	[-BLIS_PACKMR_NOT_EVEN_FOR_REAL_TYPE]        = "PACKMR is not an even number for one or more real-domain datatypes.",
+	[-BLIS_NR_NOT_EVEN_FOR_REAL_TYPE]            = "NR is not an even number for one or more real-domain datatypes.",
+	[-BLIS_PACKNR_NOT_EVEN_FOR_REAL_TYPE]        = "PACKNR is not an even number for one or more real-domain datatypes.",
 };
 
 // -----------------------------------------------------------------------------
diff --git a/frame/base/bli_func.c b/frame/base/bli_func.c
index 7cb7aac6d..979fa241c 100644
--- a/frame/base/bli_func.c
+++ b/frame/base/bli_func.c
@@ -91,6 +91,86 @@ void bli_func_free( func_t* f )
 	bli_free_intl( f );
 }
 
+func2_t* bli_func2_create
+     (
+       void_fp ptr_ss, void_fp ptr_sd, void_fp ptr_sc, void_fp ptr_sz,
+       void_fp ptr_ds, void_fp ptr_dd, void_fp ptr_dc, void_fp ptr_dz,
+       void_fp ptr_cs, void_fp ptr_cd, void_fp ptr_cc, void_fp ptr_cz,
+       void_fp ptr_zs, void_fp ptr_zd, void_fp ptr_zc, void_fp ptr_zz
+     )
+{
+	func2_t* f;
+	err_t r_val;
+
+	f = ( func2_t* )bli_malloc_intl( sizeof( func2_t ), &r_val );
+
+	bli_func2_init
+	(
+	  f,
+	  ptr_ss, ptr_sd, ptr_sc, ptr_sz,
+	  ptr_ds, ptr_dd, ptr_dc, ptr_dz,
+	  ptr_cs, ptr_cd, ptr_cc, ptr_cz,
+	  ptr_zs, ptr_zd, ptr_zc, ptr_zz
+	);
+
+	return f;
+}
+
+void bli_func2_init
+     (
+       func2_t* f,
+       void_fp ptr_ss, void_fp ptr_sd, void_fp ptr_sc, void_fp ptr_sz,
+       void_fp ptr_ds, void_fp ptr_dd, void_fp ptr_dc, void_fp ptr_dz,
+       void_fp ptr_cs, void_fp ptr_cd, void_fp ptr_cc, void_fp ptr_cz,
+       void_fp ptr_zs, void_fp ptr_zd, void_fp ptr_zc, void_fp ptr_zz
+     )
+{
+	bli_func2_set_dt( ptr_ss, BLIS_FLOAT,    BLIS_FLOAT,    f );
+	bli_func2_set_dt( ptr_ds, BLIS_DOUBLE,   BLIS_FLOAT,    f );
+	bli_func2_set_dt( ptr_cs, BLIS_SCOMPLEX, BLIS_FLOAT,    f );
+	bli_func2_set_dt( ptr_zs, BLIS_DCOMPLEX, BLIS_FLOAT,    f );
+	bli_func2_set_dt( ptr_sd, BLIS_FLOAT,    BLIS_DOUBLE,   f );
+	bli_func2_set_dt( ptr_dd, BLIS_DOUBLE,   BLIS_DOUBLE,   f );
+	bli_func2_set_dt( ptr_cd, BLIS_SCOMPLEX, BLIS_DOUBLE,   f );
+	bli_func2_set_dt( ptr_zd, BLIS_DCOMPLEX, BLIS_DOUBLE,   f );
+	bli_func2_set_dt( ptr_sc, BLIS_FLOAT,    BLIS_SCOMPLEX, f );
+	bli_func2_set_dt( ptr_dc, BLIS_DOUBLE,   BLIS_SCOMPLEX, f );
+	bli_func2_set_dt( ptr_cc, BLIS_SCOMPLEX, BLIS_SCOMPLEX, f );
+	bli_func2_set_dt( ptr_zc, BLIS_DCOMPLEX, BLIS_SCOMPLEX, f );
+	bli_func2_set_dt( ptr_sz, BLIS_FLOAT,    BLIS_DCOMPLEX, f );
+	bli_func2_set_dt( ptr_dz, BLIS_DOUBLE,   BLIS_DCOMPLEX, f );
+	bli_func2_set_dt( ptr_cz, BLIS_SCOMPLEX, BLIS_DCOMPLEX, f );
+	bli_func2_set_dt( ptr_zz, BLIS_DCOMPLEX, BLIS_DCOMPLEX, f );
+}
+
+void bli_func2_init_null
+     (
+       func2_t* f
+     )
+{
+	bli_func2_set_dt( NULL, BLIS_FLOAT,    BLIS_FLOAT,    f );
+	bli_func2_set_dt( NULL, BLIS_DOUBLE,   BLIS_FLOAT,    f );
+	bli_func2_set_dt( NULL, BLIS_SCOMPLEX, BLIS_FLOAT,    f );
+	bli_func2_set_dt( NULL, BLIS_DCOMPLEX, BLIS_FLOAT,    f );
+	bli_func2_set_dt( NULL, BLIS_FLOAT,    BLIS_DOUBLE,   f );
+	bli_func2_set_dt( NULL, BLIS_DOUBLE,   BLIS_DOUBLE,   f );
+	bli_func2_set_dt( NULL, BLIS_SCOMPLEX, BLIS_DOUBLE,   f );
+	bli_func2_set_dt( NULL, BLIS_DCOMPLEX, BLIS_DOUBLE,   f );
+	bli_func2_set_dt( NULL, BLIS_FLOAT,    BLIS_SCOMPLEX, f );
+	bli_func2_set_dt( NULL, BLIS_DOUBLE,   BLIS_SCOMPLEX, f );
+	bli_func2_set_dt( NULL, BLIS_SCOMPLEX, BLIS_SCOMPLEX, f );
+	bli_func2_set_dt( NULL, BLIS_DCOMPLEX, BLIS_SCOMPLEX, f );
+	bli_func2_set_dt( NULL, BLIS_FLOAT,    BLIS_DCOMPLEX, f );
+	bli_func2_set_dt( NULL, BLIS_DOUBLE,   BLIS_DCOMPLEX, f );
+	bli_func2_set_dt( NULL, BLIS_SCOMPLEX, BLIS_DCOMPLEX, f );
+	bli_func2_set_dt( NULL, BLIS_DCOMPLEX, BLIS_DCOMPLEX, f );
+}
+
+void bli_func2_free( func2_t* f )
+{
+	bli_free_intl( f );
+}
+
 // -----------------------------------------------------------------------------
 
 bool bli_func_is_null_dt(       num_t   dt,
@@ -118,3 +198,31 @@ bool bli_func_is_null( const func_t* f )
 	return r_val;
 }
 
+bool bli_func2_is_null_dt(       num_t    dt1,
+                                 num_t    dt2,
+                           const func2_t* f )
+{
+	return ( bli_func2_get_dt( dt1, dt2, f ) == NULL );
+}
+
+bool bli_func2_is_null( const func2_t* f )
+{
+	// Returns TRUE only if every (dt1,dt2) entry of f is NULL.
+	num_t dt1;
+	num_t dt2;
+
+	// Visit every pair of floating-point datatypes. Return directly on the
+	// first non-NULL entry; a break here would only leave the inner loop.
+	for ( dt1 = BLIS_DT_LO; dt1 <= BLIS_DT_HI; ++dt1 )
+	for ( dt2 = BLIS_DT_LO; dt2 <= BLIS_DT_HI; ++dt2 )
+	{
+		if ( bli_func2_get_dt( dt1, dt2, f ) != NULL )
+		{
+			return FALSE;
+		}
+	}
+
+	// No non-NULL entry was found.
+	return TRUE;
+}
+
diff --git a/frame/base/bli_func.h b/frame/base/bli_func.h
index cf89df389..3e2295805 100644
--- a/frame/base/bli_func.h
+++ b/frame/base/bli_func.h
@@ -45,6 +45,20 @@ BLIS_INLINE void_fp bli_func_get_dt
 	return func->ptr[ dt ];
 }
 
+BLIS_INLINE void_fp bli_func2_get_dt
+     (
+             num_t    dt1,
+             num_t    dt2,
+       const func2_t* func
+     )
+{
+	// Entries are stored by "diagonal": off = dt2 - dt1, plus
+	// BLIS_NUM_FP_TYPES when dt2 < dt1. ptr[0][dt] thus holds the (dt,dt)
+	// value and is equivalent to ((func_t*)x)->ptr[dt] for a func2_t* x.
+	gint_t off = dt2 < dt1 ? dt2 + BLIS_NUM_FP_TYPES - dt1 : dt2 - dt1;
+	return func->ptr[ off ][ dt1 ];
+}
+
 // func_t modification
 
 BLIS_INLINE void bli_func_set_dt
@@ -57,6 +71,21 @@ BLIS_INLINE void bli_func_set_dt
 	func->ptr[ dt ] = fp;
 }
 
+BLIS_INLINE void bli_func2_set_dt
+     (
+       void_fp  fp,
+       num_t    dt1,
+       num_t    dt2,
+       func2_t* func
+     )
+{
+	// Entries are stored by "diagonal": off = dt2 - dt1, plus
+	// BLIS_NUM_FP_TYPES when dt2 < dt1. ptr[0][dt] thus holds the (dt,dt)
+	// value and is equivalent to ((func_t*)x)->ptr[dt] for a func2_t* x.
+	gint_t off = dt2 < dt1 ? dt2 + BLIS_NUM_FP_TYPES - dt1 : dt2 - dt1;
+	func->ptr[ off ][ dt1 ] = fp;
+}
+
 BLIS_INLINE void bli_func_copy_dt
      (
        num_t dt_src, const func_t* func_src,
@@ -70,7 +99,7 @@ BLIS_INLINE void bli_func_copy_dt
 
 // -----------------------------------------------------------------------------
 
-func_t* bli_func_create
+BLIS_EXPORT_BLIS func_t* bli_func_create
      (
        void_fp ptr_s,
        void_fp ptr_d,
@@ -78,7 +107,7 @@ func_t* bli_func_create
        void_fp ptr_z
      );
 
-void bli_func_init
+BLIS_EXPORT_BLIS void bli_func_init
      (
        func_t* f,
        void_fp ptr_s,
@@ -87,16 +116,45 @@ void bli_func_init
        void_fp ptr_z
      );
 
-void bli_func_init_null
+BLIS_EXPORT_BLIS void bli_func_init_null
      (
        func_t* f
      );
 
-void bli_func_free( func_t* f );
+BLIS_EXPORT_BLIS void bli_func_free( func_t* f );
+
+BLIS_EXPORT_BLIS func2_t* bli_func2_create
+     (
+       void_fp ptr_ss, void_fp ptr_sd, void_fp ptr_sc, void_fp ptr_sz,
+       void_fp ptr_ds, void_fp ptr_dd, void_fp ptr_dc, void_fp ptr_dz,
+       void_fp ptr_cs, void_fp ptr_cd, void_fp ptr_cc, void_fp ptr_cz,
+       void_fp ptr_zs, void_fp ptr_zd, void_fp ptr_zc, void_fp ptr_zz
+     );
+
+BLIS_EXPORT_BLIS void bli_func2_init
+     (
+       func2_t* f,
+       void_fp ptr_ss, void_fp ptr_sd, void_fp ptr_sc, void_fp ptr_sz,
+       void_fp ptr_ds, void_fp ptr_dd, void_fp ptr_dc, void_fp ptr_dz,
+       void_fp ptr_cs, void_fp ptr_cd, void_fp ptr_cc, void_fp ptr_cz,
+       void_fp ptr_zs, void_fp ptr_zd, void_fp ptr_zc, void_fp ptr_zz
+     );
+
+BLIS_EXPORT_BLIS void bli_func2_init_null
+     (
+       func2_t* f
+     );
+
+BLIS_EXPORT_BLIS void bli_func2_free( func2_t* f );
 
 // -----------------------------------------------------------------------------
 
-bool bli_func_is_null_dt(       num_t   dt,
-                          const func_t* f );
-bool bli_func_is_null( const func_t* f );
+BLIS_EXPORT_BLIS bool bli_func_is_null_dt(       num_t   dt,
+                                           const func_t* f );
+BLIS_EXPORT_BLIS bool bli_func_is_null( const func_t* f );
+
+BLIS_EXPORT_BLIS bool bli_func2_is_null_dt(       num_t    dt1,
+                                                  num_t    dt2,
+                                            const func2_t* f );
+BLIS_EXPORT_BLIS bool bli_func2_is_null( const func2_t* f );
 
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index a21aa1244..a6eea5562 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -37,31 +37,20 @@
 
 // The array of cntx_t* pointers to cache modified contexts used by induced
 // methods.
-static cntx_t** gks[ BLIS_NUM_ARCHS ];
+static cntx_t* gks[ BLIS_NUM_ARCHS ];
 
-// The array of function pointers holding the registered context initialization
-// functions for induced methods.
-static void_fp  cntx_ind_init[ BLIS_NUM_ARCHS ];
+// Define a function pointer type for context initialization functions.
+typedef void (*cntx_init_ft)( cntx_t* cntx );
 
 // The array of function pointers holding the registered context initialization
 // functions for reference kernels.
-static void_fp  cntx_ref_init[ BLIS_NUM_ARCHS ];
-
-// Define a function pointer type for context initialization functions.
-typedef void (*nat_cntx_init_ft)( cntx_t* cntx );
-typedef void (*ref_cntx_init_ft)( cntx_t* cntx );
-typedef void (*ind_cntx_init_ft)( ind_t method, cntx_t* cntx );
+static cntx_init_ft cntx_ref_init[ BLIS_NUM_ARCHS ];
 
-// Cached copies of the pointers to the native and induced contexts for the
+// Cached copies of the pointers to the native context for the
 // active subconfiguration. When BLIS_ENABLE_GKS_CACHING is enabled, these
 // pointers will be set once and then reused to fulfill subsequent context
 // queries.
-static cntx_t* cached_cntx_nat = NULL;
-static cntx_t* cached_cntx_ind = NULL;
-
-// A mutex to allow synchronous access to the gks when it needs to be updated
-// with a new entry corresponding to a context for an ind_t value.
-static bli_pthread_mutex_t gks_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
+static cntx_t* cached_cntx = NULL;
 
 // -----------------------------------------------------------------------------
 
@@ -71,210 +60,21 @@ int bli_gks_init( void )
 	// library init/finalize cycle (see bli_init.c). Thus, a mutex is not
 	// needed to protect the data initialization.
 
-	{
-		// Initialize the internal data structure we use to track registered
-		// contexts.
-		bli_gks_init_index();
-
-		// Register a context for each architecture that was #define'd in
-		// bli_config.h.
-
-		// -- Intel architectures ----------------------------------------------
-
-#ifdef BLIS_CONFIG_SKX
-		bli_gks_register_cntx( BLIS_ARCH_SKX,         bli_cntx_init_skx,
-		                                              bli_cntx_init_skx_ref,
-		                                              bli_cntx_init_skx_ind );
-#endif
-#ifdef BLIS_CONFIG_KNL
-		bli_gks_register_cntx( BLIS_ARCH_KNL,         bli_cntx_init_knl,
-		                                              bli_cntx_init_knl_ref,
-		                                              bli_cntx_init_knl_ind );
-#endif
-#ifdef BLIS_CONFIG_KNC
-		bli_gks_register_cntx( BLIS_ARCH_KNC,         bli_cntx_init_knc,
-		                                              bli_cntx_init_knc_ref,
-		                                              bli_cntx_init_knc_ind );
-#endif
-#ifdef BLIS_CONFIG_HASWELL
-		bli_gks_register_cntx( BLIS_ARCH_HASWELL,     bli_cntx_init_haswell,
-		                                              bli_cntx_init_haswell_ref,
-		                                              bli_cntx_init_haswell_ind );
-#endif
-#ifdef BLIS_CONFIG_SANDYBRIDGE
-		bli_gks_register_cntx( BLIS_ARCH_SANDYBRIDGE, bli_cntx_init_sandybridge,
-		                                              bli_cntx_init_sandybridge_ref,
-		                                              bli_cntx_init_sandybridge_ind );
-#endif
-#ifdef BLIS_CONFIG_PENRYN
-		bli_gks_register_cntx( BLIS_ARCH_PENRYN,      bli_cntx_init_penryn,
-		                                              bli_cntx_init_penryn_ref,
-		                                              bli_cntx_init_penryn_ind );
-#endif
-
-		// -- AMD architectures ------------------------------------------------
+	// Initialize the internal data structure we use to track registered
+	// contexts.
+	bli_gks_init_index();
 
-#ifdef BLIS_CONFIG_ZEN3
-		bli_gks_register_cntx( BLIS_ARCH_ZEN3,        bli_cntx_init_zen3,
-		                                              bli_cntx_init_zen3_ref,
-		                                              bli_cntx_init_zen3_ind );
-#endif
-#ifdef BLIS_CONFIG_ZEN2
-		bli_gks_register_cntx( BLIS_ARCH_ZEN2,        bli_cntx_init_zen2,
-		                                              bli_cntx_init_zen2_ref,
-		                                              bli_cntx_init_zen2_ind );
-#endif
-#ifdef BLIS_CONFIG_ZEN
-		bli_gks_register_cntx( BLIS_ARCH_ZEN,         bli_cntx_init_zen,
-		                                              bli_cntx_init_zen_ref,
-		                                              bli_cntx_init_zen_ind );
-#endif
-#ifdef BLIS_CONFIG_EXCAVATOR
-		bli_gks_register_cntx( BLIS_ARCH_EXCAVATOR,   bli_cntx_init_excavator,
-		                                              bli_cntx_init_excavator_ref,
-		                                              bli_cntx_init_excavator_ind );
-#endif
-#ifdef BLIS_CONFIG_STEAMROLLER
-		bli_gks_register_cntx( BLIS_ARCH_STEAMROLLER, bli_cntx_init_steamroller,
-		                                              bli_cntx_init_steamroller_ref,
-		                                              bli_cntx_init_steamroller_ind );
-#endif
-#ifdef BLIS_CONFIG_PILEDRIVER
-		bli_gks_register_cntx( BLIS_ARCH_PILEDRIVER,  bli_cntx_init_piledriver,
-		                                              bli_cntx_init_piledriver_ref,
-		                                              bli_cntx_init_piledriver_ind );
-#endif
-#ifdef BLIS_CONFIG_BULLDOZER
-		bli_gks_register_cntx( BLIS_ARCH_BULLDOZER,   bli_cntx_init_bulldozer,
-		                                              bli_cntx_init_bulldozer_ref,
-		                                              bli_cntx_init_bulldozer_ind );
-#endif
+	// Register a context for each architecture that was #define'd in
+	// bli_config.h.
 
-		// -- ARM architectures ------------------------------------------------
-
-		// -- ARM-SVE --
-#ifdef BLIS_CONFIG_ARMSVE
-		bli_gks_register_cntx( BLIS_ARCH_ARMSVE,      bli_cntx_init_armsve,
-		                                              bli_cntx_init_armsve_ref,
-		                                              bli_cntx_init_armsve_ind );
-#endif
-#ifdef BLIS_CONFIG_A64FX
-		bli_gks_register_cntx( BLIS_ARCH_A64FX,       bli_cntx_init_a64fx,
-		                                              bli_cntx_init_a64fx_ref,
-		                                              bli_cntx_init_a64fx_ind );
-#endif
+	#undef GENTCONF
+	#define GENTCONF( CONFIG, config ) \
+	\
+	bli_gks_register_cntx( PASTECH(BLIS_ARCH_,CONFIG), \
+	                       PASTEMAC(cntx_init_,config), \
+	                       PASTEMAC(cntx_init_,config,_ref) );
 
-		// -- ARM-NEON (4 pipes x 128-bit vectors) --
-#ifdef BLIS_CONFIG_ALTRAMAX
-		bli_gks_register_cntx( BLIS_ARCH_ALTRAMAX,    bli_cntx_init_altramax,
-		                                              bli_cntx_init_altramax_ref,
-		                                              bli_cntx_init_altramax_ind );
-#endif
-#ifdef BLIS_CONFIG_ALTRA
-		bli_gks_register_cntx( BLIS_ARCH_ALTRA,       bli_cntx_init_altra,
-		                                              bli_cntx_init_altra_ref,
-		                                              bli_cntx_init_altra_ind );
-#endif
-#ifdef BLIS_CONFIG_FIRESTORM
-		bli_gks_register_cntx( BLIS_ARCH_FIRESTORM,   bli_cntx_init_firestorm,
-		                                              bli_cntx_init_firestorm_ref,
-		                                              bli_cntx_init_firestorm_ind );
-#endif
-
-		// -- ARM (2 pipes x 128-bit vectors) --
-#ifdef BLIS_CONFIG_THUNDERX2
-		bli_gks_register_cntx( BLIS_ARCH_THUNDERX2,   bli_cntx_init_thunderx2,
-		                                              bli_cntx_init_thunderx2_ref,
-		                                              bli_cntx_init_thunderx2_ind );
-#endif
-#ifdef BLIS_CONFIG_CORTEXA57
-		bli_gks_register_cntx( BLIS_ARCH_CORTEXA57,   bli_cntx_init_cortexa57,
-		                                              bli_cntx_init_cortexa57_ref,
-		                                              bli_cntx_init_cortexa57_ind );
-#endif
-#ifdef BLIS_CONFIG_CORTEXA53
-		bli_gks_register_cntx( BLIS_ARCH_CORTEXA53,   bli_cntx_init_cortexa53,
-		                                              bli_cntx_init_cortexa53_ref,
-		                                              bli_cntx_init_cortexa53_ind );
-#endif
-
-		// -- ARM (older 32-bit microarchitectures) --
-#ifdef BLIS_CONFIG_CORTEXA15
-		bli_gks_register_cntx( BLIS_ARCH_CORTEXA15,   bli_cntx_init_cortexa15,
-		                                              bli_cntx_init_cortexa15_ref,
-		                                              bli_cntx_init_cortexa15_ind );
-#endif
-#ifdef BLIS_CONFIG_CORTEXA9
-		bli_gks_register_cntx( BLIS_ARCH_CORTEXA9,    bli_cntx_init_cortexa9,
-		                                              bli_cntx_init_cortexa9_ref,
-		                                              bli_cntx_init_cortexa9_ind );
-#endif
-
-		// -- IBM architectures ------------------------------------------------
-
-#ifdef BLIS_CONFIG_POWER10
-		bli_gks_register_cntx( BLIS_ARCH_POWER10,     bli_cntx_init_power10,
-		                                              bli_cntx_init_power10_ref,
-		                                              bli_cntx_init_power10_ind );
-#endif
-#ifdef BLIS_CONFIG_POWER9
-		bli_gks_register_cntx( BLIS_ARCH_POWER9,      bli_cntx_init_power9,
-		                                              bli_cntx_init_power9_ref,
-		                                              bli_cntx_init_power9_ind );
-#endif
-#ifdef BLIS_CONFIG_POWER7
-		bli_gks_register_cntx( BLIS_ARCH_POWER7,      bli_cntx_init_power7,
-		                                              bli_cntx_init_power7_ref,
-		                                              bli_cntx_init_power7_ind );
-#endif
-#ifdef BLIS_CONFIG_BGQ
-		bli_gks_register_cntx( BLIS_ARCH_BGQ,         bli_cntx_init_bgq,
-		                                              bli_cntx_init_bgq_ref,
-		                                              bli_cntx_init_bgq_ind );
-#endif
-
-		// -- RISC-V architectures --------------------------------------------
-
-#ifdef BLIS_CONFIG_RV32I
-		bli_gks_register_cntx( BLIS_ARCH_RV32I,       bli_cntx_init_rv32i,
-		                                              bli_cntx_init_rv32i_ref,
-		                                              bli_cntx_init_rv32i_ind );
-#endif
-
-#ifdef BLIS_CONFIG_RV64I
-		bli_gks_register_cntx( BLIS_ARCH_RV64I,       bli_cntx_init_rv64i,
-		                                              bli_cntx_init_rv64i_ref,
-		                                              bli_cntx_init_rv64i_ind );
-#endif
-
-#ifdef BLIS_CONFIG_RV32IV
-		bli_gks_register_cntx( BLIS_ARCH_RV32IV,      bli_cntx_init_rv32iv,
-		                                              bli_cntx_init_rv32iv_ref,
-		                                              bli_cntx_init_rv32iv_ind );
-#endif
-
-#ifdef BLIS_CONFIG_RV64IV
-		bli_gks_register_cntx( BLIS_ARCH_RV64IV,      bli_cntx_init_rv64iv,
-		                                              bli_cntx_init_rv64iv_ref,
-		                                              bli_cntx_init_rv64iv_ind );
-#endif
-
-		// -- SiFive architectures ----------------------------------------------
-
-#ifdef BLIS_CONFIG_SIFIVE_X280
-		bli_gks_register_cntx( BLIS_ARCH_SIFIVE_X280, bli_cntx_init_sifive_x280,
-		                                              bli_cntx_init_sifive_x280_ref,
-		                                              bli_cntx_init_sifive_x280_ind );
-#endif
-
-		// -- Generic architectures --------------------------------------------
-
-#ifdef BLIS_CONFIG_GENERIC
-		bli_gks_register_cntx( BLIS_ARCH_GENERIC,     bli_cntx_init_generic,
-		                                              bli_cntx_init_generic_ref,
-		                                              bli_cntx_init_generic_ind );
-#endif
-	}
+	INSERT_GENTCONF
 
 #ifdef BLIS_ENABLE_GKS_CACHING
 	// Deep-query and cache the native and induced method contexts so they are
@@ -284,8 +84,7 @@ int bli_gks_init( void )
 	// loop since the current function, bli_gks_init(), is called from within
 	// bli_init_once(); and (2) we can guarantee that the gks has been
 	// initialized given that bli_gks_init() is about to return.
-	cached_cntx_nat = ( cntx_t* )bli_gks_query_nat_cntx_noinit();
-	cached_cntx_ind = ( cntx_t* )bli_gks_query_ind_cntx_noinit( BLIS_1M );
+	cached_cntx = ( cntx_t* )bli_gks_query_cntx_noinit();
 #endif
 
 	return 0;
@@ -296,55 +95,34 @@ int bli_gks_init( void )
 int bli_gks_finalize( void )
 {
 	arch_t id;
-	ind_t  ind;
 
 	// BEGIN CRITICAL SECTION
 	// NOTE: This critical section is implicit. We assume this function is only
 	// called from within the critical section within bli_finalize().
 	{
-
 		// Iterate over the architectures in the gks array.
 		for ( id = 0; id < BLIS_NUM_ARCHS; ++id )
 		{
-			cntx_t** gks_id = gks[ id ];
+			cntx_t* gks_id = gks[ id ];
 
 			// Only consider context arrays for architectures that were allocated
 			// in the first place.
 			if ( gks_id != NULL )
 			{
-				// Iterate over the induced methods in the current sub-array
-				// referenced by cntx_pp.
-				for ( ind = 0; ind < BLIS_NUM_IND_METHODS; ++ind )
-				{
-					cntx_t* gks_id_ind = gks_id[ ind ];
-
-					// If the current context was allocated, free it.
-					if ( gks_id_ind != NULL )
-					{
-						#ifdef BLIS_ENABLE_MEM_TRACING
-						printf( "bli_gks_finalize(): cntx for ind_t %d: ", ( int )ind );
-						#endif
-
-						bli_free_intl( gks_id_ind );
-					}
-				}
-
 				#ifdef BLIS_ENABLE_MEM_TRACING
-				printf( "bli_gks_finalize(): gks for arch_t %d: ", ( int )id );
+				printf( "bli_gks_finalize(): cntx for arch_t %d: ", ( int )id );
 				#endif
 
-				// Free the array of BLIS_NUM_IND_METHODS cntx* elements.
+				bli_cntx_free( gks_id );
 				bli_free_intl( gks_id );
 			}
 		}
-
 	}
 	// END CRITICAL SECTION
 
 #ifdef BLIS_ENABLE_GKS_CACHING
 	// Clear the cached pointers to the native and induced contexts.
-	cached_cntx_nat = NULL;
-	cached_cntx_ind = NULL;
+	cached_cntx = NULL;
 #endif
 
 	return 0;
@@ -365,35 +143,19 @@ void bli_gks_init_index( void )
 	// allocated.
 	memset( gks,           0, gks_size );
 	memset( cntx_ref_init, 0, fpa_size );
-	memset( cntx_ind_init, 0, fpa_size );
 }
 
 // -----------------------------------------------------------------------------
 
-const cntx_t* bli_gks_lookup_nat_cntx
+const cntx_t* bli_gks_lookup_id
      (
        arch_t id
      )
 {
-	// Return the address of the (native) context for a given architecture id.
-	// This function assumes the architecture has already been registered.
-
-	return bli_gks_lookup_ind_cntx( id, BLIS_NAT );
-}
-
-// -----------------------------------------------------------------------------
-
-const cntx_t* bli_gks_lookup_ind_cntx
-     (
-       arch_t id,
-       ind_t  ind
-     )
-{
-	// Return the address of the context for a given architecture id and
-	// induced method. This function assumes the architecture has already
-	// been registered. Note that this function returns NULL if the induced
-	// method hasn't yet been called (and thus its context pointer is still
-	// NULL).
+	// Return the address of the context registered for a given architecture
+	// id, or NULL if no context has been registered for that id. Callers
+	// may use the result either as the context itself or simply to check
+	// whether the underlying data structure for the id was initialized.
 
 	// Sanity check: verify that the arch_t id is valid.
 	if ( bli_error_checking_is_enabled() )
@@ -402,32 +164,8 @@ const cntx_t* bli_gks_lookup_ind_cntx
 		bli_check_error_code( e_val );
 	}
 
-	// Index into the array of context pointers for the given architecture id,
-	// and then index into the subarray for the given induced method.
-	cntx_t** gks_id     = gks[ id ];
-	cntx_t*  gks_id_ind = gks_id[ ind ];
-
-	// Return the context pointer at gks_id_ind.
-	return gks_id_ind;
-}
-
-// -----------------------------------------------------------------------------
-
-const cntx_t* const * bli_gks_lookup_id
-     (
-       arch_t id
-     )
-{
-	// Return the address of the array of context pointers for a given
-	// architecture id. This function is only used for sanity check purposes
-	// to ensure that the underlying data structures for a particular id are
-	// initialized.
-
 	// Index into the array of context pointers for the given architecture id.
-	cntx_t** gks_id = gks[ id ];
-
-	// Return the context pointer at gks_id_ind.
-	return ( const cntx_t* const * )gks_id;
+	return gks[ id ];
 }
 
 // -----------------------------------------------------------------------------
@@ -436,43 +174,36 @@ void bli_gks_register_cntx
      (
        arch_t  id,
        void_fp nat_fp,
-       void_fp ref_fp,
-       void_fp ind_fp
+       void_fp ref_fp
      )
 {
-	err_t r_val;
+	err_t e_val;
 
 	// This function is called by bli_gks_init() for each architecture that
-	// will be supported by BLIS. It takes an architecture id and three
+	// will be supported by BLIS. It takes an architecture id and two
 	// function pointers, one to a function that initializes a native context
-	// (supplied by the kernel developer), one to a function that initializes
+	// (supplied by the kernel developer), and one to a function that initializes
 	// a reference context (with function pointers specific to the architecture
-	// associated with id), and one to a function that initializes a
-	// context for use with induced methods (again, with function pointers
-	// to the architecture). The latter two functions are automatically
+	// associated with id). The latter function is automatically
 	// generated by the framework. Unlike with native contexts, we don't
-	// actually store the induced contexts until that induced method is
-	// called, and we don't ever store reference contexts. For this reason, we
+	// ever store reference contexts. For this reason, we
 	// can get away with only storing the pointers to the initialization
-	// functions for those latter two types of contexts, which we can then
-	// call at a later time when those contexts are needed.
+	// functions for this type of context, which we can then
+	// call at a later time when the reference context is needed.
 
 	// Sanity check: verify that the arch_t id is valid.
 	if ( bli_error_checking_is_enabled() )
 	{
-		err_t e_val = bli_check_valid_arch_id( id );
+		e_val = bli_check_valid_arch_id( id );
 		bli_check_error_code( e_val );
 	}
 
-	nat_cntx_init_ft f = nat_fp;
+	cntx_init_ft f = nat_fp;
 
-	// First, store the function pointers to the context initialization
-	// functions for reference kernels and induced method execution. The
-	// former will be used whenever we need to obtain reference kernels and
-	// latter will be used later on if the user calls a level-3 function
-	// with induced execution enabled.
+	// First, store the function pointer to the context initialization
+	// function for reference kernels. This
+	// will be used whenever we need to obtain reference kernels.
 	cntx_ref_init[ id ] = ref_fp;
-	cntx_ind_init[ id ] = ind_fp;
 
 	// If the the context array pointer isn't NULL, then it means the given
 	// architecture id has already registered (and the underlying memory
@@ -486,30 +217,17 @@ void bli_gks_register_cntx
 	printf( "bli_gks_register_cntx(): " );
 	#endif
 
-	// At this point, we know the pointer to the array of cntx_t* is NULL and
-	// needs to be allocated. Allocate the memory and initialize it to
-	// zeros/NULL, storing the address of the alloacted memory at the element
-	// for the current architecture id.
-	gks[ id ] = bli_calloc_intl( sizeof( cntx_t* ) * BLIS_NUM_IND_METHODS, &r_val );
-
-	// Alias the allocated array for readability.
-	cntx_t** gks_id = gks[ id ];
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_gks_register_cntx(): " );
-	#endif
-
 	// Allocate memory for a single context and store the address at
 	// the element in the gks[ id ] array that is reserved for native
 	// execution.
-	gks_id[ BLIS_NAT ] = bli_calloc_intl( sizeof( cntx_t ), &r_val );
+	cntx_t* gks_id = bli_calloc_intl( sizeof( cntx_t ), &e_val );
+	gks[ id ] = gks_id;
 
-	// Alias the allocated context address for readability.
-	cntx_t* gks_id_nat = gks_id[ BLIS_NAT ];
+	// The context structure is initialized in bli_cntx_init_<config>_ref
 
 	// Call the context initialization function on the element of the newly
 	// allocated array corresponding to native execution.
-	f( gks_id_nat );
+	f( gks_id );
 
 	// Verify that cache blocksizes are whole multiples of register blocksizes.
 	// Specifically, verify that:
@@ -526,14 +244,14 @@ void bli_gks_register_cntx
 	// with NR blocking used to pack A and MR blocking used to pack B, with the
 	// arguments to the gemmtrsm microkernel swapped at the last minute, as the
 	// kernel is called.
-	err_t e_val;
 
-	const blksz_t* mc = bli_cntx_get_blksz( BLIS_MC, gks_id_nat );
-	const blksz_t* nc = bli_cntx_get_blksz( BLIS_NC, gks_id_nat );
-	const blksz_t* kc = bli_cntx_get_blksz( BLIS_KC, gks_id_nat );
-	const blksz_t* mr = bli_cntx_get_blksz( BLIS_MR, gks_id_nat );
-	const blksz_t* nr = bli_cntx_get_blksz( BLIS_NR, gks_id_nat );
-	const blksz_t* kr = bli_cntx_get_blksz( BLIS_KR, gks_id_nat );
+	const mbool_t* row_pref = bli_cntx_get_ukr_prefs( BLIS_GEMM_UKR_ROW_PREF, gks_id );
+	const blksz_t* mc       = bli_cntx_get_blksz( BLIS_MC, gks_id );
+	const blksz_t* nc       = bli_cntx_get_blksz( BLIS_NC, gks_id );
+	const blksz_t* kc       = bli_cntx_get_blksz( BLIS_KC, gks_id );
+	const blksz_t* mr       = bli_cntx_get_blksz( BLIS_MR, gks_id );
+	const blksz_t* nr       = bli_cntx_get_blksz( BLIS_NR, gks_id );
+	const blksz_t* kr       = bli_cntx_get_blksz( BLIS_KR, gks_id );
 
 	e_val = bli_check_valid_mc_mod_mult( mc, mr ); bli_check_error_code( e_val );
 	e_val = bli_check_valid_nc_mod_mult( nc, nr ); bli_check_error_code( e_val );
@@ -543,22 +261,18 @@ void bli_gks_register_cntx
 	e_val = bli_check_valid_nc_mod_mult( nc, mr ); bli_check_error_code( e_val );
 #endif
 
+	e_val = bli_check_valid_mr_even( mr, row_pref ); bli_check_error_code( e_val );
+	e_val = bli_check_valid_nr_even( nr, row_pref ); bli_check_error_code( e_val );
+
 	// Verify that the register blocksizes in the context are sufficiently large
 	// relative to the maximum stack buffer size defined at configure-time.
-	e_val = bli_check_sufficient_stack_buf_size( gks_id_nat );
+	e_val = bli_check_sufficient_stack_buf_size( gks_id );
 	bli_check_error_code( e_val );
 }
 
 // -----------------------------------------------------------------------------
 
 const cntx_t* bli_gks_query_cntx( void )
-{
-	return bli_gks_query_nat_cntx();
-}
-
-// -----------------------------------------------------------------------------
-
-const cntx_t* bli_gks_query_nat_cntx( void )
 {
 	bli_init_once();
 
@@ -566,27 +280,27 @@ const cntx_t* bli_gks_query_nat_cntx( void )
 
 	// Return a pointer to the context for native execution that was deep-
 	// queried and cached at the end of bli_gks_init().
-	return cached_cntx_nat;
+	return cached_cntx;
 
 #else
 
 	// Deep-query and return the address of a context for native execution.
-	return bli_gks_query_nat_cntx_impl();
+	return bli_gks_query_cntx_impl();
 
 #endif
 }
 
-const cntx_t* bli_gks_query_nat_cntx_noinit( void )
+const cntx_t* bli_gks_query_cntx_noinit( void )
 {
 	// NOTE: This function purposefully avoids calling bli_init_once() so that
 	// it is safe to call during inititalization.
 
-	return bli_gks_query_nat_cntx_impl();
+	return bli_gks_query_cntx_impl();
 }
 
 // -----------------------------------------------------------------------------
 
-const cntx_t* bli_gks_query_nat_cntx_impl( void )
+const cntx_t* bli_gks_query_cntx_impl( void )
 {
 	// Return the address of the native context for the architecture id
 	// corresponding to the current hardware, as determined by
@@ -596,155 +310,13 @@ const cntx_t* bli_gks_query_nat_cntx_impl( void )
 	arch_t id = bli_arch_query_id();
 
 	// Use the architecture id to look up a pointer to its context.
-	const cntx_t* cntx = bli_gks_lookup_nat_cntx( id );
+	const cntx_t* cntx = bli_gks_lookup_id( id );
 
 	return cntx;
 }
 
 // -----------------------------------------------------------------------------
 
-const cntx_t* bli_gks_query_ind_cntx
-     (
-       ind_t ind
-     )
-{
-	bli_init_once();
-
-#ifdef BLIS_ENABLE_GKS_CACHING
-
-	// If for some reason the native context was requested, we return its
-	// address instead of the one for induced execution.
-	if ( ind == BLIS_NAT ) return cached_cntx_nat;
-
-	// Return a pointer to the context for the induced method that was deep-
-	// queried and cached at the end of bli_gks_init().
-	return cached_cntx_ind;
-
-#else
-
-	// Deep-query and return the address of a context for the requested induced
-	// method. (In this case, caching never takes place since it was disabled
-	// at configure-time.)
-	return bli_gks_query_ind_cntx_impl( ind );
-
-#endif
-}
-
-const cntx_t* bli_gks_query_ind_cntx_noinit
-     (
-       ind_t ind
-     )
-{
-	// NOTE: This function purposefully avoids calling bli_init_once() so that
-	// it is safe to call during inititalization.
-
-	return bli_gks_query_ind_cntx_impl( ind );
-}
-
-// -----------------------------------------------------------------------------
-
-const cntx_t* bli_gks_query_ind_cntx_impl
-     (
-       ind_t ind
-     )
-{
-	cntx_t* gks_id_ind;
-	err_t r_val;
-
-
-	// Return the address of a context that will be suited for executing a
-	// level-3 operation via the requested induced method (and datatype) for
-	// the architecture id corresponding to the current hardware, as
-	// determined by bli_arch_query_id().
-
-	// This function is called when a level-3 operation via induced method is
-	// called, e.g. bli_gemm1m(). If this is the first time that induced method
-	// is being executed since bli_gks_init(), the necessary context structure
-	// is allocated. If this is not the first time a context for the requested
-	// induced method was queried, then the memory will already be allocated
-	// and initialized, and the previous cntx_t struct will be overwritten.
-	// The function will then return the address to the newly-initialized (or
-	// previously-allocated-but-reinitialized) cntx_t struct. Note that some of
-	// this function must be executed with mutual exclusion to ensure thread
-	// safety and deterministic behavior.
-
-	// Query the architecture id.
-	arch_t id = bli_arch_query_id();
-
-	// Sanity check: verify that the arch_t id is valid.
-	if ( bli_error_checking_is_enabled() )
-	{
-		err_t e_val = bli_check_valid_arch_id( id );
-		bli_check_error_code( e_val );
-	}
-
-	// NOTE: These initial statements can reside outside of the critical section
-	// because gks[ id ] should have already been allocated, and the native
-	// context in that array should have already been allocated/initialized.
-
-	// Query the gks for the array of context pointers corresponding to the
-	// given architecture id.
-	cntx_t** gks_id     = gks[ id ];
-	cntx_t*  gks_id_nat = gks_id[ BLIS_NAT ];
-
-	// If for some reason the native context was requested, we can return
-	// its address early.
-	if ( ind == BLIS_NAT ) return gks_id_nat;
-
-	// This function assumes that the architecture idenified by id has
-	// already been registered with the gks (which guarantees that
-	// gks[ id ] is non-NULL and gks[ id ][ BLIS_NAT ] is also non-NULL
-	// and refers to a context initialized with valid data).
-
-	// Acquire the mutex protecting the gks.
-	bli_pthread_mutex_lock( &gks_mutex );
-
-	// BEGIN CRITICAL SECTION
-	{
-		// Alias for readability the element of gks_id associated with the
-		// requested induced method.
-		gks_id_ind = gks_id[ ind ];
-
-		// If the context pointer is NULL, then we know we must allocate and
-		// then initialize the context before returning its address.
-		if ( gks_id_ind == NULL )
-		{
-			// If gks_id_ind is NULL, then we know we must allocate and then
-			// initialize the context, storing its address back to
-			// gks_id[ ind ].
-			gks_id_ind    = bli_calloc_intl( sizeof( cntx_t ), &r_val );
-			gks_id[ ind ] = gks_id_ind;
-		}
-
-		// Before we can call the induced method context initialization
-		// function on the newly allocated structure, we must first copy
-		// over the contents of the native context. If a previous context
-		// was already copied, this will overwrite those previous values.
-		*gks_id_ind = *gks_id_nat;
-
-		// Use the architecture id to look up the function pointer to the
-		// context initialization function for induced methods.
-		ind_cntx_init_ft f = cntx_ind_init[ id ];
-
-		// Now we modify the context (so that it contains the proper values
-		// for its induced method) by calling the context initialization
-		// function for the current induced method. (That function assumes
-		// that the context is pre-initialized with values for native
-		// execution.)
-		f( ind, gks_id_ind );
-	}
-	// END CRITICAL SECTION
-
-	// Release the mutex protecting the gks.
-	bli_pthread_mutex_unlock( &gks_mutex );
-
-	// Return the address of the newly-allocated/initialized context.
-	return gks_id_ind;
-
-}
-
-// -----------------------------------------------------------------------------
-
 void bli_gks_init_ref_cntx
     (
       cntx_t* cntx
@@ -762,7 +334,7 @@ void bli_gks_init_ref_cntx
 
 	// Obtain the function pointer to the context initialization function for
 	// reference kernels.
-	ref_cntx_init_ft f = cntx_ref_init[ id ];
+	cntx_init_ft f = cntx_ref_init[ id ];
 
 	// Initialize the caller's context with reference kernels and related values.
 	f( cntx );
@@ -770,7 +342,7 @@ void bli_gks_init_ref_cntx
 
 // -----------------------------------------------------------------------------
 
-bool bli_gks_cntx_l3_nat_ukr_is_ref
+bool bli_gks_cntx_ukr_is_ref
      (
              num_t   dt,
              ukr_t   ukr_id,
@@ -788,6 +360,33 @@ bool bli_gks_cntx_l3_nat_ukr_is_ref
 	void_fp ref_fp = bli_cntx_get_ukr_dt( dt, ukr_id, &ref_cntx );
 	void_fp fp     = bli_cntx_get_ukr_dt( dt, ukr_id, cntx );
 
+    bli_cntx_free( &ref_cntx );
+
+	// Return the result.
+	return fp == ref_fp;
+}
+
+bool bli_gks_cntx_ukr2_is_ref // TRUE iff cntx and a fresh reference context hold the same function pointer for ( dt1, dt2, ukr_id )
+     (
+             num_t   dt1,
+             num_t   dt2,
+             ukr_t   ukr_id,
+       const cntx_t* cntx
+     )
+{
+	cntx_t ref_cntx; // temporary context; initialized and freed within this call
+
+	// Initialize a context with reference kernels for the arch_t id queried
+	// via bli_arch_query_id().
+	bli_gks_init_ref_cntx( &ref_cntx );
+
+	// Query each context for the function pointer registered under ukr_id
+	// for the datatype pair ( dt1, dt2 ).
+	void_fp ref_fp = bli_cntx_get_ukr2_dt( dt1, dt2, ukr_id, &ref_cntx );
+	void_fp fp     = bli_cntx_get_ukr2_dt( dt1, dt2, ukr_id, cntx );
+
+    bli_cntx_free( &ref_cntx ); // only the two pointer values are compared below, so the temporary can be released here
+
 	// Return the result.
 	return fp == ref_fp;
 }
@@ -813,7 +412,7 @@ const char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt )
 	// Query the context for the current induced method and datatype, and
 	// then query the ukernel function pointer for the given datatype from
 	// that context.
-	const cntx_t* cntx = bli_gks_query_ind_cntx( method );
+	const cntx_t* cntx = bli_gks_query_cntx();
 	void_fp fp         = bli_cntx_get_ukr_dt( dt, ukr, cntx );
 
 	// Check whether the ukernel function pointer is NULL for the given
@@ -869,23 +468,141 @@ kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt )
 		// method to the typed function pointer within the known
 		// reference ukrs object.
 
-		// Query the architecture id.
-		arch_t id = bli_arch_query_id();
-
-		// Sanity check: verify that the arch_t id is valid.
-		if ( bli_error_checking_is_enabled() )
-		{
-			err_t e_val = bli_check_valid_arch_id( id );
-			bli_check_error_code( e_val );
-		}
-
 		// Query the native context from the gks.
-		const cntx_t* nat_cntx = bli_gks_lookup_nat_cntx( id );
+		const cntx_t* cntx = bli_gks_query_cntx();
 
-		if ( bli_gks_cntx_l3_nat_ukr_is_ref( dt, ukr, nat_cntx ) )
+		if ( bli_gks_cntx_ukr_is_ref( dt, ukr, cntx ) )
 			return BLIS_REFERENCE_UKERNEL;
 		else
 			return BLIS_OPTIMIZED_UKERNEL;
 	}
 }
 
+//
+// -- microkernel and block size registration ----------------------------------
+//
+
+err_t bli_gks_register_blksz( siz_t* bs_id )
+{
+	siz_t id = 0; // id agreed on so far; 0 doubles as "nothing recorded yet" in the mismatch check
+	siz_t next_id; // id written by the registration call of the current GENTCONF expansion
+	cntx_t* cntx; // context looked up for the current expansion, with const cast away
+	err_t err;
+	// Call bli_cntx_register_blksz() once per GENTCONF expansion and return the common id via bs_id.
+	#undef GENTCONF
+	#define GENTCONF( CONFIG, config ) \
+	/* Per-expansion step: look up the context for BLIS_ARCH_<CONFIG>, register, then compare ids. */ \
+	cntx = ( cntx_t* )bli_gks_lookup_id( PASTECH(BLIS_ARCH_,CONFIG) ); \
+	err = bli_cntx_register_blksz( &next_id, NULL, 0, cntx ); \
+	if ( err != BLIS_SUCCESS ) /* registration failed: report id 0 and propagate the error */ \
+	{ \
+		*bs_id = 0; \
+		return err; \
+	} \
+	if ( id != 0 && id != next_id ) /* this context produced a different id than an earlier one */ \
+	{ \
+		*bs_id = 0; \
+		return BLIS_INVALID_UKR_ID; /* NOTE(review): UKR-named code reused for a blocksize id -- confirm intended */ \
+	} \
+	id = next_id;
+
+	INSERT_GENTCONF // presumably expands GENTCONF once per enabled configuration -- confirm where INSERT_GENTCONF is defined
+
+	*bs_id = id; // no expansion failed or disagreed; hand the id back to the caller
+
+	return BLIS_SUCCESS;
+}
+
+err_t bli_gks_register_ukr( siz_t* ukr_id )
+{
+	siz_t id = 0; // id agreed on so far; 0 doubles as "nothing recorded yet" in the mismatch check
+	siz_t next_id; // id written by the registration call of the current GENTCONF expansion
+	cntx_t* cntx; // context looked up for the current expansion, with const cast away
+	err_t err;
+	// Call bli_cntx_register_ukr() once per GENTCONF expansion and return the common id via ukr_id.
+	#undef GENTCONF
+	#define GENTCONF( CONFIG, config ) \
+	/* Per-expansion step: look up the context for BLIS_ARCH_<CONFIG>, register, then compare ids. */ \
+	cntx = ( cntx_t* )bli_gks_lookup_id( PASTECH(BLIS_ARCH_,CONFIG) ); \
+	err = bli_cntx_register_ukr( &next_id, NULL, cntx ); \
+	if ( err != BLIS_SUCCESS ) /* registration failed: report id 0 and propagate the error */ \
+	{ \
+		*ukr_id = 0; \
+		return err; \
+	} \
+	if ( id != 0 && id != next_id ) /* this context produced a different id than an earlier one */ \
+	{ \
+		*ukr_id = 0; \
+		return BLIS_INVALID_UKR_ID; \
+	} \
+	id = next_id;
+
+	INSERT_GENTCONF // presumably expands GENTCONF once per enabled configuration -- confirm where INSERT_GENTCONF is defined
+
+	*ukr_id = id; // no expansion failed or disagreed; hand the id back to the caller
+
+	return BLIS_SUCCESS;
+}
+
+err_t bli_gks_register_ukr2( siz_t* ukr_id )
+{
+	siz_t id = 0; // id agreed on so far; 0 doubles as "nothing recorded yet" in the mismatch check
+	siz_t next_id; // id written by the registration call of the current GENTCONF expansion
+	cntx_t* cntx; // context looked up for the current expansion, with const cast away
+	err_t err;
+	// Call bli_cntx_register_ukr2() once per GENTCONF expansion and return the common id via ukr_id.
+	#undef GENTCONF
+	#define GENTCONF( CONFIG, config ) \
+	/* Per-expansion step: look up the context for BLIS_ARCH_<CONFIG>, register, then compare ids. */ \
+	cntx = ( cntx_t* )bli_gks_lookup_id( PASTECH(BLIS_ARCH_,CONFIG) ); \
+	err = bli_cntx_register_ukr2( &next_id, NULL, cntx ); \
+	if ( err != BLIS_SUCCESS ) /* registration failed: report id 0 and propagate the error */ \
+	{ \
+		*ukr_id = 0; \
+		return err; \
+	} \
+	if ( id != 0 && id != next_id ) /* this context produced a different id than an earlier one */ \
+	{ \
+		*ukr_id = 0; \
+		return BLIS_INVALID_UKR_ID; \
+	} \
+	id = next_id;
+
+	INSERT_GENTCONF // presumably expands GENTCONF once per enabled configuration -- confirm where INSERT_GENTCONF is defined
+
+	*ukr_id = id; // no expansion failed or disagreed; hand the id back to the caller
+
+	return BLIS_SUCCESS;
+}
+
+err_t bli_gks_register_ukr_pref( siz_t* ukr_pref_id )
+{
+	siz_t id = 0; // id agreed on so far; 0 doubles as "nothing recorded yet" in the mismatch check
+	siz_t next_id; // id written by the registration call of the current GENTCONF expansion
+	cntx_t* cntx; // context looked up for the current expansion, with const cast away
+	err_t err;
+	// Call bli_cntx_register_ukr_pref() once per GENTCONF expansion and return the common id via ukr_pref_id.
+	#undef GENTCONF
+	#define GENTCONF( CONFIG, config ) \
+	/* Per-expansion step: look up the context for BLIS_ARCH_<CONFIG>, register, then compare ids. */ \
+	cntx = ( cntx_t* )bli_gks_lookup_id( PASTECH(BLIS_ARCH_,CONFIG) ); \
+	err = bli_cntx_register_ukr_pref( &next_id, NULL, cntx ); \
+	if ( err != BLIS_SUCCESS ) /* registration failed: report id 0 and propagate the error */ \
+	{ \
+		*ukr_pref_id = 0; \
+		return err; \
+	} \
+	if ( id != 0 && id != next_id ) /* this context produced a different id than an earlier one */ \
+	{ \
+		*ukr_pref_id = 0; \
+		return BLIS_INVALID_UKR_ID; \
+	} \
+	id = next_id;
+
+	INSERT_GENTCONF // presumably expands GENTCONF once per enabled configuration -- confirm where INSERT_GENTCONF is defined
+
+	*ukr_pref_id = id; // no expansion failed or disagreed; hand the id back to the caller
+
+	return BLIS_SUCCESS;
+}
+
diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h
index da2ead083..f4d12c641 100644
--- a/frame/base/bli_gks.h
+++ b/frame/base/bli_gks.h
@@ -40,29 +40,30 @@ int                            bli_gks_finalize( void );
 
 void                           bli_gks_init_index( void );
 
-const cntx_t*                  bli_gks_lookup_nat_cntx( arch_t id );
-const cntx_t*                  bli_gks_lookup_ind_cntx( arch_t id, ind_t ind );
-const cntx_t* const *          bli_gks_lookup_id( arch_t id );
-void                           bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp );
+BLIS_EXPORT_BLIS const cntx_t* bli_gks_lookup_id( arch_t id );
+void                           bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp );
 
 BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_cntx( void );
-
-BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_nat_cntx( void );
-const cntx_t*                  bli_gks_query_nat_cntx_noinit( void );
-const cntx_t*                  bli_gks_query_nat_cntx_impl( void );
-
-BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_ind_cntx( ind_t ind );
-const cntx_t*                  bli_gks_query_ind_cntx_noinit( ind_t ind );
-const cntx_t*                  bli_gks_query_ind_cntx_impl( ind_t ind );
+const cntx_t*                  bli_gks_query_cntx_noinit( void );
+const cntx_t*                  bli_gks_query_cntx_impl( void );
 
 BLIS_EXPORT_BLIS void          bli_gks_init_ref_cntx( cntx_t* cntx );
 
-bool                           bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, ukr_t ukr_id, const cntx_t* cntx );
+bool                           bli_gks_cntx_ukr_is_ref( num_t dt, ukr_t ukr_id, const cntx_t* cntx );
+bool                           bli_gks_cntx_ukr2_is_ref( num_t dt1, num_t dt2, ukr_t ukr_id, const cntx_t* cntx );
 
 BLIS_EXPORT_BLIS const char*   bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt );
 BLIS_EXPORT_BLIS kimpl_t       bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt );
 
 //char*                          bli_gks_l3_ukr_avail_impl_string( ukr_t ukr, num_t dt );
 
+BLIS_EXPORT_BLIS err_t bli_gks_register_blksz( siz_t* bs_id );
+
+BLIS_EXPORT_BLIS err_t bli_gks_register_ukr( siz_t* ukr_id );
+
+BLIS_EXPORT_BLIS err_t bli_gks_register_ukr2( siz_t* ukr_id );
+
+BLIS_EXPORT_BLIS err_t bli_gks_register_ukr_pref( siz_t* ukr_pref_id );
+
 #endif
 
diff --git a/frame/base/bli_ind.c b/frame/base/bli_ind.c
index 3436ddbc9..ce4411b94 100644
--- a/frame/base/bli_ind.c
+++ b/frame/base/bli_ind.c
@@ -50,9 +50,9 @@ int bli_ind_init( void )
 	// is disabled, this function is called once by ONLY ONE application thread.
 	// In neither case is a mutex needed to protect the data initialization.
 
-	// NOTE: We intentionally call bli_gks_query_nat_cntx_noinit() in order
+	// NOTE: We intentionally call bli_gks_query_cntx_noinit() in order
 	// to avoid the internal call to bli_init_once().
-	const cntx_t* cntx = bli_gks_query_nat_cntx_noinit();
+	const cntx_t* cntx = bli_gks_query_cntx_noinit();
 
 	// For each precision, enable the default induced method (1m) if both of
 	// the following conditions are met:
@@ -61,10 +61,10 @@ int bli_ind_init( void )
 	// The second condition means that BLIS will not bother to use an induced
 	// method if both the real and complex domain kernels are reference.
 
-	bool s_is_ref = bli_gks_cntx_l3_nat_ukr_is_ref( BLIS_FLOAT,    BLIS_GEMM_UKR, cntx );
-	bool d_is_ref = bli_gks_cntx_l3_nat_ukr_is_ref( BLIS_DOUBLE,   BLIS_GEMM_UKR, cntx );
-	bool c_is_ref = bli_gks_cntx_l3_nat_ukr_is_ref( BLIS_SCOMPLEX, BLIS_GEMM_UKR, cntx );
-	bool z_is_ref = bli_gks_cntx_l3_nat_ukr_is_ref( BLIS_DCOMPLEX, BLIS_GEMM_UKR, cntx );
+	bool s_is_ref = bli_gks_cntx_ukr_is_ref( BLIS_FLOAT,    BLIS_GEMM_UKR, cntx );
+	bool d_is_ref = bli_gks_cntx_ukr_is_ref( BLIS_DOUBLE,   BLIS_GEMM_UKR, cntx );
+	bool c_is_ref = bli_gks_cntx_ukr_is_ref( BLIS_SCOMPLEX, BLIS_GEMM_UKR, cntx );
+	bool z_is_ref = bli_gks_cntx_ukr_is_ref( BLIS_DCOMPLEX, BLIS_GEMM_UKR, cntx );
 
 	if ( c_is_ref && !s_is_ref ) bli_ind_enable_dt( BLIS_1M, BLIS_SCOMPLEX );
 	if ( z_is_ref && !d_is_ref ) bli_ind_enable_dt( BLIS_1M, BLIS_DCOMPLEX );
diff --git a/frame/base/bli_machval.c b/frame/base/bli_machval.c
index 1aaf604d8..a4b9223bc 100644
--- a/frame/base/bli_machval.c
+++ b/frame/base/bli_machval.c
@@ -113,7 +113,7 @@ void PASTEMAC(chv,opname) \
 \
 	/* Copy the requested parameter value to the output buffer, which
 	   may involve a demotion from the complex to real domain. */ \
-	PASTEMAC2(chvr,chv,copys)( pvals[ val_i ], *v_cast ); \
+	PASTEMAC(chvr,chv,copys)( pvals[ val_i ], *v_cast ); \
 }
 
 INSERT_GENTFUNCR_BASIC( machval, lamch )
diff --git a/frame/base/bli_mbool.h b/frame/base/bli_mbool.h
index d00424273..e0ceaf1c0 100644
--- a/frame/base/bli_mbool.h
+++ b/frame/base/bli_mbool.h
@@ -50,7 +50,7 @@ BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb )
 
 // -----------------------------------------------------------------------------
 
-mbool_t* bli_mbool_create
+BLIS_EXPORT_BLIS mbool_t* bli_mbool_create
      (
        bool b_s,
        bool b_d,
@@ -58,7 +58,7 @@ mbool_t* bli_mbool_create
        bool b_z
      );
 
-void bli_mbool_init
+BLIS_EXPORT_BLIS void bli_mbool_init
      (
        mbool_t* b,
        bool     b_s,
@@ -67,5 +67,5 @@ void bli_mbool_init
        bool     b_z
      );
 
-void bli_mbool_free( mbool_t* b );
+BLIS_EXPORT_BLIS void bli_mbool_free( mbool_t* b );
 
diff --git a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c
index f6bcfe569..9ff5b0bb0 100644
--- a/frame/base/bli_memsys.c
+++ b/frame/base/bli_memsys.c
@@ -44,9 +44,9 @@ int bli_memsys_init( void )
 
 	// Query a native context so we have something to pass into
 	// bli_pba_init_pools().
-	// NOTE: We intentionally call bli_gks_query_nat_cntx_noinit() in order
+	// NOTE: We intentionally call bli_gks_query_cntx_noinit() in order
 	// to avoid the internal call to bli_init_once().
-	const cntx_t* cntx_p = bli_gks_query_nat_cntx_noinit();
+	const cntx_t* cntx_p = bli_gks_query_cntx_noinit();
 
 	// Initialize the packing block allocator and its data structures.
 	bli_pba_init( cntx_p );
diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c
index 043bd1088..e4e79d9f4 100644
--- a/frame/base/bli_obj.c
+++ b/frame/base/bli_obj.c
@@ -108,18 +108,11 @@ void bli_obj_create_without_buffer
 	bli_obj_set_buffer( NULL, obj );
 	bli_obj_set_dt( dt, obj );
 	bli_obj_set_elem_size( elem_size, obj );
-	bli_obj_set_target_dt( dt, obj );
-	bli_obj_set_exec_dt( dt, obj );
-	bli_obj_set_comp_dt( dt, obj );
+	bli_obj_set_comp_prec( bli_dt_prec( dt ), obj );
 	bli_obj_set_dims( m, n, obj );
 	bli_obj_set_offs( 0, 0, obj );
 	bli_obj_set_diag_offset( 0, obj );
 
-	bli_obj_set_pack_fn( NULL, obj );
-	bli_obj_set_pack_params( NULL, obj );
-	bli_obj_set_ker_fn( NULL, obj );
-	bli_obj_set_ker_params( NULL, obj );
-
 	// Set the internal scalar to 1.0.
 	bli_obj_set_scalar_dt( dt, obj );
 	void* s = bli_obj_internal_scalar_buffer( obj );
@@ -559,18 +552,19 @@ const char* bli_dt_string
 dim_t bli_align_dim_to_mult
      (
        dim_t dim,
-       dim_t dim_mult
+       dim_t dim_mult,
+       bool  round_up
      )
 {
 	// We return the dimension unmodified if the multiple is zero
 	// (to avoid division by zero).
 	if ( dim_mult == 0 ) return dim;
 
-	dim = ( ( dim + dim_mult - 1 ) /
-	        dim_mult ) *
-	        dim_mult;
+	if ( round_up )
+		dim += dim_mult - 1;
 
-	return dim;
+	// Avoid rounding down to zero.
+	return bli_max( dim_mult, ( dim / dim_mult ) * dim_mult );
 }
 
 dim_t bli_align_dim_to_size
@@ -664,9 +658,7 @@ void bli_obj_print
 	fprintf( file, " - is complex    %lu\n", ( unsigned long )bli_obj_is_complex( obj ) );
 	fprintf( file, " - is d. prec    %lu\n", ( unsigned long )bli_obj_is_double_prec( obj ) );
 	fprintf( file, " - datatype      %lu\n", ( unsigned long )bli_obj_dt( obj ) );
-	fprintf( file, " - target dt     %lu\n", ( unsigned long )bli_obj_target_dt( obj ) );
-	fprintf( file, " - exec dt       %lu\n", ( unsigned long )bli_obj_exec_dt( obj ) );
-	fprintf( file, " - comp dt       %lu\n", ( unsigned long )bli_obj_comp_dt( obj ) );
+	fprintf( file, " - comp prec     %lu\n", ( unsigned long )bli_obj_comp_prec( obj ) );
 	fprintf( file, " - scalar dt     %lu\n", ( unsigned long )bli_obj_scalar_dt( obj ) );
 	fprintf( file, " - has trans     %lu\n", ( unsigned long )bli_obj_has_trans( obj ) );
 	fprintf( file, " - has conj      %lu\n", ( unsigned long )bli_obj_has_conj( obj ) );
diff --git a/frame/base/bli_obj.h b/frame/base/bli_obj.h
index a446c09c8..71bb7e61a 100644
--- a/frame/base/bli_obj.h
+++ b/frame/base/bli_obj.h
@@ -127,7 +127,8 @@ BLIS_EXPORT_BLIS const char* bli_dt_string
 BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult
      (
        dim_t dim,
-       dim_t dim_mult
+       dim_t dim_mult,
+       bool  round_up
      );
 
 BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size
diff --git a/frame/base/bli_obj_scalar.c b/frame/base/bli_obj_scalar.c
index 2ef9751f6..51120ebf5 100644
--- a/frame/base/bli_obj_scalar.c
+++ b/frame/base/bli_obj_scalar.c
@@ -110,12 +110,12 @@ void bli_obj_scalar_attach
 
 	// Use the target datatype of A as the datatype to which we cast
 	// alpha locally.
-	const num_t dt_targ = bli_obj_target_dt( a );
+	const num_t dt = bli_obj_dt( alpha );
 
 	// Make a copy-cast of alpha to the target datatype of A, queried
 	// above. This step gives us the opportunity to conjugate and/or
 	// typecast alpha.
-	bli_obj_scalar_init_detached_copy_of( dt_targ,
+	bli_obj_scalar_init_detached_copy_of( dt,
 	                                      conj,
 	                                      alpha,
 	                                      &alpha_cast );
@@ -124,7 +124,7 @@ void bli_obj_scalar_attach
 	bli_obj_copy_internal_scalar( &alpha_cast, a );
 
 	// Update the scalar datatype of A.
-	bli_obj_set_scalar_dt( dt_targ, a );
+	bli_obj_set_scalar_dt( dt, a );
 }
 
 void bli_obj_scalar_cast_to
diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c
index fd6ca3a0c..4192101bb 100644
--- a/frame/base/bli_part.c
+++ b/frame/base/bli_part.c
@@ -131,10 +131,7 @@ void bli_acquire_mpart_mdim
 	// partitioned through normally.) Note that the function called below
 	// assumes forward partitioning.
 	if ( bli_obj_is_panel_packed( obj ) )
-	{
-		bli_packm_acquire_mpart_t2b( req_part, i, b, obj, sub_obj );
-		return;
-	}
+		bli_check_error_code( BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_PART );
 
 
 	// Check parameters.
@@ -354,10 +351,7 @@ void bli_acquire_mpart_ndim
 	// partitioned through normally.) Note that the function called below
 	// assumes forward partitioning.
 	if ( bli_obj_is_panel_packed( obj ) )
-	{
-		bli_packm_acquire_mpart_l2r( req_part, j, b, obj, sub_obj );
-		return;
-	}
+		bli_check_error_code( BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_PART );
 
 
 	// Check parameters.
@@ -577,10 +571,7 @@ void bli_acquire_mpart_mndim
 	// partitioned through normally.) Note that the function called below
 	// assumes forward partitioning.
 	if ( bli_obj_is_panel_packed( obj ) )
-	{
-		bli_packm_acquire_mpart_tl2br( req_part, ij, b, obj, sub_obj );
-		return;
-	}
+		bli_check_error_code( BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_PART );
 
 
 	// Check parameters.
diff --git a/frame/base/bli_part.h b/frame/base/bli_part.h
index 6d3e00ced..06dd23eed 100644
--- a/frame/base/bli_part.h
+++ b/frame/base/bli_part.h
@@ -49,7 +49,7 @@ BLIS_EXPORT_BLIS void bli_acquire_mpart
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
+BLIS_EXPORT_BLIS void PASTEMAC( opname ) \
      ( \
              subpart_t req_part, \
              dim_t     i, \
@@ -69,7 +69,7 @@ GENPROT( acquire_mpart_br2tl )
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
+BLIS_EXPORT_BLIS void PASTEMAC( opname ) \
      ( \
              dir_t     direct, \
              subpart_t req_part, \
@@ -89,7 +89,7 @@ GENPROT( acquire_mpart_mndim )
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
+BLIS_EXPORT_BLIS void PASTEMAC( opname ) \
      ( \
              subpart_t req_part, \
              dim_t     i, \
diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2b.c b/frame/base/bli_part_cntl.c
similarity index 73%
rename from frame/3/gemmt/bli_gemmt_x_ker_var2b.c
rename to frame/base/bli_part_cntl.c
index 132d7c13a..b8a294208 100644
--- a/frame/3/gemmt/bli_gemmt_x_ker_var2b.c
+++ b/frame/base/bli_part_cntl.c
@@ -35,39 +35,34 @@
 
 #include "blis.h"
 
-static l3_var_oft vars[2] =
-{
-	bli_gemmt_l_ker_var2b, bli_gemmt_u_ker_var2b,
-};
-
-void bli_gemmt_x_ker_var2b
+void bli_part_cntl_init_node
      (
-       const obj_t*     a,
-       const obj_t*     ah,
-       const obj_t*     c,
-       const cntx_t*    cntx,
-       const cntl_t*    cntl,
-             thrinfo_t* thread_par
+       void_fp      var_func,
+       num_t        b_dt,
+       dim_t        b_alg,
+       dim_t        b_max,
+       dim_t        b_scale,
+       dim_t        b_mult,
+       dim_t        b_mult_scale,
+       dir_t        direct,
+       bool         use_weighted,
+       part_cntl_t* cntl
      )
 {
-	dim_t uplo;
-
-	// Set a bool based on the uplo field of C's root object.
-	if ( bli_obj_root_is_lower( c ) ) uplo = 0;
-	else                              uplo = 1;
-
-	// Index into the variant array to extract the correct function pointer.
-	l3_var_oft f = vars[uplo];
+	// Initialize the part_cntl_t struct.
+	cntl->b_dt         = b_dt;
+	cntl->b_alg        = b_alg;
+	cntl->b_max        = b_max;
+	cntl->b_scale      = b_scale;
+	cntl->b_mult       = b_mult;
+	cntl->b_mult_scale = b_mult_scale;
+	cntl->direct       = direct;
+	cntl->use_weighted = use_weighted;
 
-	// Call the macrokernel.
-	f
+	bli_cntl_init_node
 	(
-	  a,
-	  ah,
-	  c,
-	  cntx,
-	  cntl,
-	  thread_par
+	  var_func,
+	  &cntl->cntl
 	);
 }
 
diff --git a/frame/base/bli_part_cntl.h b/frame/base/bli_part_cntl.h
new file mode 100644
index 000000000..04dbf654b
--- /dev/null
+++ b/frame/base/bli_part_cntl.h
@@ -0,0 +1,132 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+struct part_cntl_s // a cntl_t extended with the blocksize, direction and weighting fields used by the bli_part_cntl_*() accessors
+{
+	cntl_t cntl; // cntl field must be present and come first: the accessors cast a cntl_t* to part_cntl_t*.
+	num_t  b_dt; // datatype used to read values out of a blksz_t in bli_part_cntl_set_blksz() / _set_blksz_mult()
+	dim_t  b_alg; // value returned by bli_part_cntl_blksz_alg(); set_blksz() stores the blksz_t default divided by b_scale
+	dim_t  b_max; // value returned by bli_part_cntl_blksz_max(); set_blksz() stores the blksz_t maximum divided by b_scale
+	dim_t  b_scale; // divisor applied to both blocksizes in bli_part_cntl_set_blksz(); assumed nonzero -- TODO confirm
+	dim_t  b_mult; // multiple that bli_part_cntl_align_blksz() aligns b_alg and b_max to
+	dim_t  b_mult_scale; // divisor applied in bli_part_cntl_set_blksz_mult(); assumed nonzero -- TODO confirm
+	dir_t  direct; // direction value; presumably the direction of partitioning -- confirm against the variants that read it
+	bool   use_weighted; // flag; presumably selects weighted partitioning -- confirm against the variants that read it
+};
+typedef struct part_cntl_s part_cntl_t;
+
+// -----------------------------------------------------------------------------
+
+BLIS_INLINE dim_t bli_part_cntl_blksz_alg( const cntl_t* cntl )
+{
+	return ( ( const part_cntl_t* )cntl )->b_alg; // reads b_alg; cntl must be the leading cntl member of a part_cntl_t
+}
+
+BLIS_INLINE dim_t bli_part_cntl_blksz_max( const cntl_t* cntl )
+{
+	return ( ( const part_cntl_t* )cntl )->b_max; // reads b_max; cntl must be the leading cntl member of a part_cntl_t
+}
+
+BLIS_INLINE dim_t bli_part_cntl_blksz_mult( const cntl_t* cntl )
+{
+	return ( ( const part_cntl_t* )cntl )->b_mult; // reads b_mult; cntl must be the leading cntl member of a part_cntl_t
+}
+
+BLIS_INLINE dir_t bli_part_cntl_direct( const cntl_t* cntl )
+{
+	return ( ( const part_cntl_t* )cntl )->direct; // reads direct; cntl must be the leading cntl member of a part_cntl_t
+}
+
+BLIS_INLINE bool bli_part_cntl_use_weighted( const cntl_t* cntl )
+{
+	return ( ( const part_cntl_t* )cntl )->use_weighted; // reads use_weighted; cntl must be the leading cntl member of a part_cntl_t
+}
+
+// -----------------------------------------------------------------------------
+
+BLIS_INLINE void bli_part_cntl_set_blksz( const blksz_t* blksz, cntl_t* cntl_ )
+{
+	part_cntl_t* cntl = ( part_cntl_t* )cntl_; // cntl_ must be the leading cntl member of a part_cntl_t
+	num_t dt = cntl->b_dt; // the node's stored datatype selects which blksz_t entries are read
+	cntl->b_alg = bli_blksz_get_def( dt, blksz ) / cntl->b_scale; // default value divided by b_scale (assumed nonzero -- TODO confirm)
+	cntl->b_max = bli_blksz_get_max( dt, blksz ) / cntl->b_scale; // maximum value, same divisor
+}
+
+BLIS_INLINE void bli_part_cntl_set_blksz_mult( const blksz_t* blksz, cntl_t* cntl_ )
+{
+	part_cntl_t* cntl = ( part_cntl_t* )cntl_; // cntl_ must be the leading cntl member of a part_cntl_t
+	num_t dt = cntl->b_dt; // the node's stored datatype selects which blksz_t entry is read
+	cntl->b_mult = bli_blksz_get_def( dt, blksz ) / cntl->b_mult_scale; // default value divided by b_mult_scale (assumed nonzero -- TODO confirm)
+}
+
+BLIS_INLINE void bli_part_cntl_set_direct( dir_t direct, cntl_t* cntl )
+{
+	( ( part_cntl_t* )cntl )->direct = direct; // overwrites direct; cntl must be the leading cntl member of a part_cntl_t
+}
+
+BLIS_INLINE void bli_part_cntl_set_use_weighted( bool use_weighted, cntl_t* cntl )
+{
+	( ( part_cntl_t* )cntl )->use_weighted = use_weighted; // overwrites use_weighted; cntl must be the leading cntl member of a part_cntl_t
+}
+
+BLIS_INLINE void bli_part_cntl_align_blksz_to_mult( dim_t mult, bool round_up, cntl_t* cntl_ )
+{
+	part_cntl_t* cntl = ( part_cntl_t* )cntl_; // cntl_ must be the leading cntl member of a part_cntl_t
+	cntl->b_alg = bli_align_dim_to_mult( cntl->b_alg, mult, round_up ); // left unchanged when mult is 0; otherwise a multiple of mult and never below mult
+	cntl->b_max = bli_align_dim_to_mult( cntl->b_max, mult, round_up ); // same alignment applied to the maximum
+}
+
+BLIS_INLINE void bli_part_cntl_align_blksz( bool round_up, cntl_t* cntl )
+{
+	bli_part_cntl_align_blksz_to_mult( ( ( part_cntl_t* )cntl )->b_mult, round_up, cntl ); // aligns b_alg and b_max to the node's own b_mult
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_part_cntl_init_node // copies the arguments into *cntl, then initializes the embedded cntl_t via bli_cntl_init_node()
+     (
+       void_fp      var_func, // forwarded to bli_cntl_init_node() together with &cntl->cntl
+       num_t        b_dt, // stored in cntl->b_dt
+       dim_t        b_alg, // stored in cntl->b_alg
+       dim_t        b_max, // stored in cntl->b_max
+       dim_t        b_scale, // stored in cntl->b_scale
+       dim_t        b_mult, // stored in cntl->b_mult
+       dim_t        b_mult_scale, // stored in cntl->b_mult_scale
+       dir_t        direct, // stored in cntl->direct
+       bool         use_weighted, // stored in cntl->use_weighted
+       part_cntl_t* cntl // node to initialize; every field listed above is overwritten
+     );
+
diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c
index abcf708e2..d31ea1a13 100644
--- a/frame/base/bli_pba.c
+++ b/frame/base/bli_pba.c
@@ -393,8 +393,6 @@ void bli_pba_compute_pool_block_sizes
        const cntx_t* cntx
      )
 {
-	const ind_t im = bli_cntx_method( cntx );
-
 	siz_t bs_cand_a = 0;
 	siz_t bs_cand_b = 0;
 	siz_t bs_cand_c = 0;
@@ -408,9 +406,6 @@ void bli_pba_compute_pool_block_sizes
 		siz_t bs_dt_b;
 		siz_t bs_dt_c;
 
-		// Avoid considering induced methods for real datatypes.
-		if ( bli_is_real( dt ) && im != BLIS_NAT ) continue;
-
 		bli_pba_compute_pool_block_sizes_dt( dt,
 		                                     &bs_dt_a,
 		                                     &bs_dt_b,
diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c
index abc3caaa5..8b513fa91 100644
--- a/frame/base/bli_rntm.c
+++ b/frame/base/bli_rntm.c
@@ -316,132 +316,6 @@ void bli_rntm_set_ways
 
 // -----------------------------------------------------------------------------
 
-void bli_rntm_set_ways_for_op
-     (
-       opid_t  l3_op,
-       side_t  side,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       rntm_t* rntm
-     )
-{
-	// Set the number of ways for each loop, if needed, depending on what
-	// kind of information is already stored in the rntm_t object.
-	bli_rntm_factorize( m, n, k, rntm );
-
-	#if 0
-	printf( "bli_rntm_set_ways_for_op()\n" );
-	bli_rntm_print( rntm );
-	#endif
-
-	// Now modify the number of ways, if necessary, based on the operation.
-
-	// Consider gemm (hemm, symm), gemmt (herk, her2k, syrk, syr2k), and
-	// trmm (trmm, trmm3).
-	if (
-#ifdef BLIS_ENABLE_JRIR_TLB
-	     l3_op == BLIS_GEMM  ||
-	     l3_op == BLIS_GEMMT ||
-	     l3_op == BLIS_TRMM  ||
-#endif
-	     FALSE
-	   )
-	{
-		dim_t jc = bli_rntm_jc_ways( rntm );
-		dim_t pc = bli_rntm_pc_ways( rntm );
-		dim_t ic = bli_rntm_ic_ways( rntm );
-		dim_t jr = bli_rntm_jr_ways( rntm );
-		dim_t ir = bli_rntm_ir_ways( rntm );
-
-		// If TLB is enabled for gemm or gemmt, redirect any ir loop parallelism
-		// into the jr loop.
-		bli_rntm_set_ways_only
-		(
-		  jc,
-		  pc,
-		  ic,
-		  jr * ir,
-		  1,
-		  rntm
-		);
-	}
-
-	// Consider trmm, trmm3, trsm.
-	if ( l3_op == BLIS_TRMM ||
-	     l3_op == BLIS_TRSM )
-	{
-		dim_t jc = bli_rntm_jc_ways( rntm );
-		dim_t pc = bli_rntm_pc_ways( rntm );
-		dim_t ic = bli_rntm_ic_ways( rntm );
-		dim_t jr = bli_rntm_jr_ways( rntm );
-		dim_t ir = bli_rntm_ir_ways( rntm );
-
-		// Notice that, if we do need to update the ways, we don't need to
-		// update the num_threads field since we only reshuffle where the
-		// parallelism is extracted, not the total amount of parallelism.
-
-		if ( l3_op == BLIS_TRMM )
-		{
-			// We reconfigure the parallelism extracted from trmm_r due to a
-			// dependency in the jc loop. (NOTE: This dependency does not exist
-			// for trmm3.)
-			if ( bli_is_left( side ) )
-			{
-				bli_rntm_set_ways_only
-				(
-				  jc,
-				  pc,
-				  ic,
-				  jr,
-				  ir,
-				  rntm
-				);
-			}
-			else // if ( bli_is_right( side ) )
-			{
-				bli_rntm_set_ways_only
-				(
-				  1,
-				  pc,
-				  ic,
-				  jr * jc,
-				  ir,
-				  rntm
-				);
-			}
-		}
-		else if ( l3_op == BLIS_TRSM )
-		{
-//printf( "bli_rntm_set_ways_for_op(): jc%d ic%d jr%d\n", (int)jc, (int)ic, (int)jr );
-			if ( bli_is_left( side ) )
-			{
-				bli_rntm_set_ways_only
-				(
-				  jc,
-				  1,
-				  ic * pc,
-				  jr * ir,
-				  1,
-				  rntm
-				);
-			}
-			else // if ( bli_is_right( side ) )
-			{
-				bli_rntm_set_ways_only
-				(
-				  1,
-				  1,
-				  ic * pc * jc * ir * jr,
-				  1,
-				  1,
-				  rntm
-				);
-			}
-		}
-	}
-}
-
 void bli_rntm_sanitize
      (
        rntm_t* rntm
diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h
index 1cc7ad002..03bb5416d 100644
--- a/frame/base/bli_rntm.h
+++ b/frame/base/bli_rntm.h
@@ -79,6 +79,17 @@ BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, const rntm_t* rntm )
 	return ( bszid == BLIS_NO_PART ? 1 : rntm->thrloop[ bszid ] );
 }
 
+BLIS_INLINE dim_t bli_rntm_total_ways_for( dim_t thread_by, const rntm_t* rntm )
+{
+	// thread_by is treated as a bitmask: bit `loop` selects (bszid_t)loop. The
+	// result is the product of the ways of every selected loop (1 if none).
+	dim_t total = 1;
+
+	for ( dim_t loop = 0; loop < BLIS_NUM_LOOPS; loop++ )
+		total *= ( thread_by & ( 1 << loop ) ) ? bli_rntm_ways_for( ( bszid_t )loop, rntm ) : 1;
+	return total;
+}
+
 BLIS_INLINE dim_t bli_rntm_jc_ways( const rntm_t* rntm )
 {
 	return bli_rntm_ways_for( BLIS_NC, rntm );
@@ -335,16 +346,6 @@ BLIS_EXPORT_BLIS void bli_rntm_set_ways
        rntm_t* rntm
      );
 
-BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op
-     (
-       opid_t  l3_op,
-       side_t  side,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       rntm_t* rntm
-     );
-
 void bli_rntm_sanitize
      (
        rntm_t* rntm
diff --git a/frame/base/bli_setgetijm.c b/frame/base/bli_setgetijm.c
index d864b3fff..5a89d258e 100644
--- a/frame/base/bli_setgetijm.c
+++ b/frame/base/bli_setgetijm.c
@@ -101,7 +101,7 @@ void PASTEMAC(ch,opname) \
 \
 	ctype* b_ij = b_cast + (i  )*rs + (j  )*cs; \
 \
-	PASTEMAC2(z,ch,sets)( ar, ai, *b_ij ); \
+	PASTEMAC(z,ch,sets)( ar, ai, *b_ij ); \
 }
 
 INSERT_GENTFUNC_BASIC( setijm )
@@ -175,7 +175,7 @@ void PASTEMAC(ch,opname) \
 \
 	const ctype* b_ij = b_cast + (i  )*rs + (j  )*cs; \
 \
-	PASTEMAC2(ch,z,gets)( *b_ij, *ar, *ai ); \
+	PASTEMAC(ch,z,gets)( *b_ij, *ar, *ai ); \
 }
 
 INSERT_GENTFUNC_BASIC( getijm )
diff --git a/frame/base/bli_setgetijv.c b/frame/base/bli_setgetijv.c
index ff76647ff..a6ae2860d 100644
--- a/frame/base/bli_setgetijv.c
+++ b/frame/base/bli_setgetijv.c
@@ -94,7 +94,7 @@ void PASTEMAC(ch,opname) \
 \
 	ctype* restrict x_i = x_cast + (i  )*incx; \
 \
-	PASTEMAC2(z,ch,sets)( ar, ai, *x_i ); \
+	PASTEMAC(z,ch,sets)( ar, ai, *x_i ); \
 }
 
 INSERT_GENTFUNC_BASIC( setijv )
@@ -161,7 +161,7 @@ void PASTEMAC(ch,opname) \
 \
 	const ctype* restrict x_i = x_cast + (i  )*incx; \
 \
-	PASTEMAC2(ch,z,gets)( *x_i, *ar, *ai ); \
+	PASTEMAC(ch,z,gets)( *x_i, *ar, *ai ); \
 }
 
 INSERT_GENTFUNC_BASIC( getijv )
diff --git a/frame/base/bli_stack.c b/frame/base/bli_stack.c
new file mode 100644
index 000000000..73e3c123d
--- /dev/null
+++ b/frame/base/bli_stack.c
@@ -0,0 +1,200 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Southern Methodist University
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+// Set up *stack to hold elements of elem_size bytes in separately allocated
+// blocks of block_len elements (at most max_blocks blocks), with initial_size
+// element slots already counted as present. Returns BLIS_SUCCESS or an error
+// code; after an allocation failure the stack is left empty (size 0) with
+// whatever capacity had been allocated so far.
+err_t bli_stack_init
+     (
+       siz_t   elem_size,
+       siz_t   block_len,
+       siz_t   max_blocks,
+       siz_t   initial_size,
+       stck_t* stack
+     )
+{
+	if ( stack == NULL )
+		return BLIS_NULL_POINTER;
+
+	// block_len is used as a divisor below, so a zero length cannot be accepted.
+	if ( block_len == 0 || initial_size > max_blocks * block_len )
+		return BLIS_OUT_OF_BOUNDS;
+
+	// Set up an initial state which cannot store any elements. blocks is set
+	// to NULL so that it is never left indeterminate if we fail further down.
+	stack->elem_size = elem_size;
+	stack->block_len = block_len;
+	stack->max_blocks = 0;
+	stack->size = 0;
+	stack->capacity = 0;
+	stack->blocks = NULL;
+
+	if ( bli_pthread_mutex_init( &stack->lock, NULL ) != 0 )
+		return BLIS_LOCK_FAILURE;
+
+	err_t error;
+	stack->blocks = ( void** )bli_malloc_intl( sizeof( void* ) * max_blocks, &error );
+	if ( error != BLIS_SUCCESS )
+		return error;
+
+	// Only now that the blocks array exists may max_blocks become non-zero.
+	stack->max_blocks = max_blocks;
+
+	// Allocate the blocks needed for the initial size one by one, growing the
+	// capacity only *after* each success so that a failure leaves a valid stack.
+	siz_t num_blocks = ( initial_size + block_len - 1 ) / block_len;
+	for ( siz_t block = 0; block < num_blocks; block++ )
+	{
+		stack->blocks[ block ] = bli_malloc_intl( block_len * elem_size, &error );
+		if ( error != BLIS_SUCCESS )
+			return error;
+
+		stack->capacity += block_len;
+	}
+
+	stack->size = initial_size;
+
+	return BLIS_SUCCESS;
+}
+
+// Free every allocated block, the block-pointer array, and the mutex created
+// by bli_stack_init(); the stack must be re-initialized before further use.
+err_t bli_stack_finalize( stck_t* stack )
+{
+	if ( stack == NULL ) return BLIS_NULL_POINTER;
+
+	siz_t num_blocks = ( stack->capacity + stack->block_len - 1 ) / stack->block_len;
+	for ( siz_t block = num_blocks; block --> 0; )
+		bli_free_intl( stack->blocks[ block ] );
+	bli_free_intl( stack->blocks );
+	stack->blocks = NULL; // do not leave a dangling pointer behind
+	bli_pthread_mutex_destroy( &stack->lock ); // pairs with the init in bli_stack_init()
+
+	stack->size = stack->capacity = stack->max_blocks = 0;
+	return BLIS_SUCCESS;
+}
+
+// Look up element i and store its address in *elem. If the lookup fails and
+// elem itself is usable, *elem is set to NULL.
+err_t bli_stack_get( siz_t i, void** elem, const stck_t* stack )
+{
+	if ( elem == NULL )
+		return BLIS_NULL_POINTER;
+
+	// Assume failure first; the result is filled in only on success.
+	*elem = NULL;
+
+	if ( stack == NULL )
+		return BLIS_NULL_POINTER;
+
+	// Only the upper bound is tested against the current size.
+	if ( i >= bli_stack_size( stack ) )
+		return BLIS_OUT_OF_BOUNDS;
+
+	// O(1) addressing: select the block, then the byte offset within it.
+	// bli_stack_push() never changes block_len, elem_size, or the pointer of
+	// an already-allocated block, so these reads are stable during pushes.
+	char* base   = ( char* )stack->blocks[ i / stack->block_len ];
+	siz_t offset = ( i % stack->block_len ) * stack->elem_size;
+	*elem = ( void* )( base + offset );
+
+	return BLIS_SUCCESS;
+}
+
+// Reserve one new element slot at the end of the stack and store its index
+// in *i, allocating further blocks first when the capacity is exhausted.
+// Returns BLIS_SUCCESS, or an error code, in which case the size is unchanged
+// and *i is not written.
+err_t bli_stack_push( siz_t* i, stck_t* stack )
+{
+	if ( stack == NULL || i == NULL )
+		return BLIS_NULL_POINTER;
+
+	// Element access elsewhere is lock-free, but the size and capacity are
+	// only ever modified while holding the lock.
+	if ( bli_pthread_mutex_lock( &stack->lock ) != 0 )
+		return BLIS_LOCK_FAILURE;
+
+	err_t       status = BLIS_SUCCESS;
+	const siz_t needed = stack->size + 1;
+
+	if ( needed > stack->capacity )
+	{
+		const siz_t blen      = stack->block_len;
+		const siz_t cap_limit = stack->max_blocks * blen;
+
+		// Aim for 3/2 of the current capacity, clipped to the maximum number
+		// of blocks, but never less than what this push needs. The latter
+		// makes a zero-capacity stack grow, and pushes a full stack past the
+		// limit so that it is rejected just below.
+		siz_t target = ( stack->capacity * 3 ) / 2;
+		if ( target > cap_limit ) target = cap_limit;
+		if ( target < needed    ) target = needed;
+
+		const siz_t blocks_have = ( stack->capacity + blen - 1 ) / blen;
+		const siz_t blocks_want = ( target + blen - 1 ) / blen;
+
+		// Requiring more blocks than allowed is a failure.
+		if ( blocks_want > stack->max_blocks )
+			status = BLIS_OUT_OF_BOUNDS;
+
+		// Add the missing blocks one at a time, bumping the capacity only
+		// after each successful allocation so the stack stays consistent if
+		// one of them fails. Nothing is allocated once status is an error.
+		for ( siz_t b = blocks_have; status == BLIS_SUCCESS && b < blocks_want; b++ )
+		{
+			stack->blocks[ b ] = bli_malloc_intl( blen * stack->elem_size, &status );
+			if ( status == BLIS_SUCCESS )
+				stack->capacity += blen;
+		}
+	}
+
+	if ( status == BLIS_SUCCESS )
+	{
+		// The new slot sits at the old end of the stack.
+		*i = stack->size;
+		stack->size = needed;
+	}
+
+	// Single exit: the lock is released on success and failure alike.
+	bli_pthread_mutex_unlock( &stack->lock );
+
+	return status;
+}
+
diff --git a/frame/3/trsm/bli_trsm_front.h b/frame/base/bli_stack.h
similarity index 67%
rename from frame/3/trsm/bli_trsm_front.h
rename to frame/base/bli_stack.h
index dacfd19e9..986234ba7 100644
--- a/frame/3/trsm/bli_trsm_front.h
+++ b/frame/base/bli_stack.h
@@ -4,8 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2023, Southern Methodist University
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -33,25 +32,47 @@
 
 */
 
-void bli_trsm_front
-     (
-             side_t  side,
-       const obj_t*  alpha,
-       const obj_t*  a,
-       const obj_t*  b,
-       const cntx_t* cntx,
-             rntm_t* rntm
-     );
+#ifndef BLIS_STACK_H
+#define BLIS_STACK_H
+
+// -- Stack type based on a dynamic block array --
+
+/*
+typedef struct
+{
+	siz_t elem_size;
+	siz_t block_len;
+	siz_t max_blocks;
+	siz_t size;
+	siz_t capacity;
+
+	void** blocks;
+
+	bli_pthread_mutex_t lock;
+} stck_t;
+*/
 
-#ifdef BLIS_ENABLE_SMALL_MATRIX
-err_t bli_trsm_small
+
+BLIS_EXPORT_BLIS err_t bli_stack_init
      (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       cntx_t* cntx,
-       cntl_t* cntl
+       siz_t   elem_size,
+       siz_t   block_len,
+       siz_t   max_blocks,
+       siz_t   initial_size,
+       stck_t* stack
      );
+
+BLIS_EXPORT_BLIS err_t bli_stack_finalize( stck_t* stack );
+
+BLIS_INLINE siz_t bli_stack_size( const stck_t* stack )
+{
+	return stack->size; // read without taking stack->lock; stack must be non-NULL
+}
+
+BLIS_EXPORT_BLIS err_t bli_stack_get( siz_t i, void** elem, const stck_t* stack );
+
+BLIS_EXPORT_BLIS err_t bli_stack_push( siz_t* i, stck_t* stack );
+
+
 #endif
 
diff --git a/frame/base/cast/bli_castm.c b/frame/base/cast/bli_castm.c
index 5e0792910..6ae848b4c 100644
--- a/frame/base/cast/bli_castm.c
+++ b/frame/base/cast/bli_castm.c
@@ -110,7 +110,7 @@ void bli_castm
 #undef  GENTFUNC2
 #define GENTFUNC2( ctype_a, ctype_b, cha, chb, opname ) \
 \
-void PASTEMAC2(cha,chb,opname) \
+void PASTEMAC(cha,chb,opname) \
      ( \
              trans_t transa, \
              dim_t   m, \
@@ -150,7 +150,7 @@ void PASTEMAC2(cha,chb,opname) \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC2(cha,chb,copyjs)( a1[i], b1[i] ); \
+					PASTEMAC(cha,chb,copyjs)( a1[i], b1[i] ); \
 				} \
 			} \
 		} \
@@ -163,7 +163,7 @@ void PASTEMAC2(cha,chb,opname) \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC2(cha,chb,copyjs)( *a1, *b1 ); \
+					PASTEMAC(cha,chb,copyjs)( *a1, *b1 ); \
 \
 					a1 += inca; \
 					b1 += incb; \
@@ -182,7 +182,7 @@ void PASTEMAC2(cha,chb,opname) \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC2(cha,chb,copys)( a1[i], b1[i] ); \
+					PASTEMAC(cha,chb,copys)( a1[i], b1[i] ); \
 				} \
 			} \
 		} \
@@ -195,7 +195,7 @@ void PASTEMAC2(cha,chb,opname) \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC2(cha,chb,copys)( *a1, *b1 ); \
+					PASTEMAC(cha,chb,copys)( *a1, *b1 ); \
 \
 					a1 += inca; \
 					b1 += incb; \
diff --git a/frame/base/cast/bli_castm.h b/frame/base/cast/bli_castm.h
index a36d15e99..5ea309a57 100644
--- a/frame/base/cast/bli_castm.h
+++ b/frame/base/cast/bli_castm.h
@@ -49,7 +49,7 @@ BLIS_EXPORT_BLIS void bli_castm
 #undef  GENTPROT2
 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(cha,chb,opname) \
      ( \
              trans_t transa, \
              dim_t   m, \
diff --git a/frame/base/cast/bli_castnzm.c b/frame/base/cast/bli_castnzm.c
index e6e4a6cd9..3c2bbcb57 100644
--- a/frame/base/cast/bli_castnzm.c
+++ b/frame/base/cast/bli_castnzm.c
@@ -110,7 +110,7 @@ void bli_castnzm
 #undef  GENTFUNC2
 #define GENTFUNC2( ctype_a, ctype_b, cha, chb, opname ) \
 \
-void PASTEMAC2(cha,chb,opname) \
+void PASTEMAC(cha,chb,opname) \
      ( \
              trans_t transa, \
              dim_t   m, \
@@ -150,7 +150,7 @@ void PASTEMAC2(cha,chb,opname) \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC2(cha,chb,copyjnzs)( a1[i], b1[i] ); \
+					PASTEMAC(cha,chb,copyjnzs)( a1[i], b1[i] ); \
 				} \
 			} \
 		} \
@@ -163,7 +163,7 @@ void PASTEMAC2(cha,chb,opname) \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC2(cha,chb,copyjnzs)( *a1, *b1 ); \
+					PASTEMAC(cha,chb,copyjnzs)( *a1, *b1 ); \
 \
 					a1 += inca; \
 					b1 += incb; \
@@ -182,7 +182,7 @@ void PASTEMAC2(cha,chb,opname) \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC2(cha,chb,copynzs)( a1[i], b1[i] ); \
+					PASTEMAC(cha,chb,copynzs)( a1[i], b1[i] ); \
 				} \
 			} \
 		} \
@@ -195,7 +195,7 @@ void PASTEMAC2(cha,chb,opname) \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC2(cha,chb,copynzs)( *a1, *b1 ); \
+					PASTEMAC(cha,chb,copynzs)( *a1, *b1 ); \
 \
 					a1 += inca; \
 					b1 += incb; \
diff --git a/frame/base/cast/bli_castnzm.h b/frame/base/cast/bli_castnzm.h
index a8a852720..f3d9f8bab 100644
--- a/frame/base/cast/bli_castnzm.h
+++ b/frame/base/cast/bli_castnzm.h
@@ -49,7 +49,7 @@ BLIS_EXPORT_BLIS void bli_castnzm
 #undef  GENTPROT2
 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(cha,chb,opname) \
      ( \
              trans_t transa, \
              dim_t   m, \
diff --git a/frame/base/cast/bli_castv.c b/frame/base/cast/bli_castv.c
index c057df82a..468ff9109 100644
--- a/frame/base/cast/bli_castv.c
+++ b/frame/base/cast/bli_castv.c
@@ -105,7 +105,7 @@ void bli_castv
 #undef  GENTFUNC2
 #define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname ) \
 \
-void PASTEMAC2(chx,chy,opname) \
+void PASTEMAC(chx,chy,opname) \
      ( \
              conj_t conjx, \
              dim_t  n, \
@@ -123,14 +123,14 @@ void PASTEMAC2(chx,chy,opname) \
 		{ \
 			for ( i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC2(chx,chy,copyjs)( x1[i], y1[i] ); \
+				PASTEMAC(chx,chy,copyjs)( x1[i], y1[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC2(chx,chy,copyjs)( *x1, *y1 ); \
+				PASTEMAC(chx,chy,copyjs)( *x1, *y1 ); \
 \
 				x1 += incx; \
 				y1 += incy; \
@@ -143,14 +143,14 @@ void PASTEMAC2(chx,chy,opname) \
 		{ \
 			for ( i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC2(chx,chy,copys)( x1[i], y1[i] ); \
+				PASTEMAC(chx,chy,copys)( x1[i], y1[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC2(chx,chy,copys)( *x1, *y1 ); \
+				PASTEMAC(chx,chy,copys)( *x1, *y1 ); \
 \
 				x1 += incx; \
 				y1 += incy; \
diff --git a/frame/base/cast/bli_castv.h b/frame/base/cast/bli_castv.h
index 99f89fc24..70b916b8e 100644
--- a/frame/base/cast/bli_castv.h
+++ b/frame/base/cast/bli_castv.h
@@ -49,7 +49,7 @@ BLIS_EXPORT_BLIS void bli_castv
 #undef  GENTPROT2
 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(chx,chy,opname) \
      ( \
              conj_t conjx, \
              dim_t  n, \
diff --git a/frame/compat/amd/bla_copy_amd.c b/frame/compat/amd/bla_copy_amd.c
index 6780b555e..1ae007c96 100644
--- a/frame/compat/amd/bla_copy_amd.c
+++ b/frame/compat/amd/bla_copy_amd.c
@@ -72,7 +72,7 @@ void PASTEF77(ch,blasname) \
 	   still need initialization so that they can query valid contexts from
 	   gks. However, the expert API will self-initialize before attempting
 	   to query a context, so the complex domain cases should work fine. */ \
-	PASTEMAC2(ch,blisname,isuf) \
+	PASTEMAC(ch,blisname,isuf) \
 	( \
 	  BLIS_NO_CONJUGATE, \
 	  n0, \
@@ -125,7 +125,7 @@ void PASTEF77(ch,blasname) \
 	   still need initialization so that they can query valid contexts from
 	   gks. However, the expert API will self-initialize before attempting
 	   to query a context, so the complex domain cases should work fine. */ \
-	PASTEMAC2(ch,blisname,isuf) \
+	PASTEMAC(ch,blisname,isuf) \
 	( \
 	  BLIS_NO_CONJUGATE, \
 	  n0, \
diff --git a/frame/compat/amd/bla_gemv_amd.c b/frame/compat/amd/bla_gemv_amd.c
index 398d1bf2c..5cd523f17 100644
--- a/frame/compat/amd/bla_gemv_amd.c
+++ b/frame/compat/amd/bla_gemv_amd.c
@@ -118,7 +118,7 @@ void PASTEF77(ch,blasname) \
 	/* If alpha is zero, scale y by beta and return early. */ \
 	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
 	{ \
-		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  m_y, \
@@ -135,7 +135,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_a = *lda; \
 \
 	/* Declare a void function pointer for the current operation. */ \
-	PASTECH2(ch,blisname,_unb_ft) f; \
+	PASTECH(ch,blisname,_unb_ft) f; \
 \
 	/* Choose the underlying implementation. */ \
 	if         ( bli_does_notrans( blis_transa ) )  f = PASTEMAC(ch,gemv_unf_var2); \
diff --git a/frame/compat/bla_amax.c b/frame/compat/bla_amax.c
index 9fcce920b..1b73cc69b 100644
--- a/frame/compat/bla_amax.c
+++ b/frame/compat/bla_amax.c
@@ -41,7 +41,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype_x, chx, blasname, blisname ) \
 \
-f77_int PASTEF772(i,chx,blasname) \
+f77_int PASTEF77(i,chx,blasname) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx  \
@@ -70,7 +70,7 @@ f77_int PASTEF772(i,chx,blasname) \
 	bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(chx,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  n0, \
 	  x0, incx0, \
diff --git a/frame/compat/bla_amax.h b/frame/compat/bla_amax.h
index e765ecfcb..b8e115f7d 100644
--- a/frame/compat/bla_amax.h
+++ b/frame/compat/bla_amax.h
@@ -40,7 +40,7 @@
 #undef  GENTPROT
 #define GENTPROT( ftype_x, chx, blasname ) \
 \
-BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
+BLIS_EXPORT_BLAS f77_int PASTEF77(i,chx,blasname) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx  \
diff --git a/frame/compat/bla_asum.c b/frame/compat/bla_asum.c
index 8e71bfa8c..05f4f69c2 100644
--- a/frame/compat/bla_asum.c
+++ b/frame/compat/bla_asum.c
@@ -41,7 +41,7 @@
 #undef  GENTFUNCR2
 #define GENTFUNCR2( ftype_x, ftype_r, chx, chr, blasname, blisname ) \
 \
-ftype_r PASTEF772(chr,chx,blasname) \
+ftype_r PASTEF77(chr,chx,blasname) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx  \
@@ -63,7 +63,7 @@ ftype_r PASTEF772(chr,chx,blasname) \
 	bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(chx,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  n0, \
 	  x0, incx0, \
diff --git a/frame/compat/bla_asum.h b/frame/compat/bla_asum.h
index fd859f26b..916723e7e 100644
--- a/frame/compat/bla_asum.h
+++ b/frame/compat/bla_asum.h
@@ -40,7 +40,7 @@
 #undef  GENTPROTR2
 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
 \
-BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
+BLIS_EXPORT_BLAS ftype_r PASTEF77(chr,chx,blasname) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx  \
diff --git a/frame/compat/bla_axpy.c b/frame/compat/bla_axpy.c
index e3c67fd55..f2ed8ee7c 100644
--- a/frame/compat/bla_axpy.c
+++ b/frame/compat/bla_axpy.c
@@ -67,7 +67,7 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  BLIS_NO_CONJUGATE, \
 	  n0, \
diff --git a/frame/compat/bla_copy.c b/frame/compat/bla_copy.c
index d9d3b7cce..bc1ed078c 100644
--- a/frame/compat/bla_copy.c
+++ b/frame/compat/bla_copy.c
@@ -66,7 +66,7 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  BLIS_NO_CONJUGATE, \
 	  n0, \
diff --git a/frame/compat/bla_dot.c b/frame/compat/bla_dot.c
index f5396b190..b68af083c 100644
--- a/frame/compat/bla_dot.c
+++ b/frame/compat/bla_dot.c
@@ -42,7 +42,7 @@
 #undef  GENTFUNCDOT
 #define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
 \
-ftype PASTEF772(ch,blasname,chc) \
+ftype PASTEF77(ch,blasname,chc) \
      ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
@@ -68,7 +68,7 @@ ftype PASTEF772(ch,blasname,chc) \
 	bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_conjx, \
 	  BLIS_NO_CONJUGATE, \
@@ -99,7 +99,7 @@ INSERT_GENTFUNCDOTC_BLAS( dot, dotv )
 #undef  GENTFUNCDOT
 #define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
 \
-void PASTEF772(ch,blasname,chc) \
+void PASTEF77(ch,blasname,chc) \
      ( \
        ftype*         rhop, \
        const f77_int* n, \
@@ -126,7 +126,7 @@ void PASTEF772(ch,blasname,chc) \
 	bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_conjx, \
 	  BLIS_NO_CONJUGATE, \
diff --git a/frame/compat/bla_dot.h b/frame/compat/bla_dot.h
index 14221071e..b49f6cd84 100644
--- a/frame/compat/bla_dot.h
+++ b/frame/compat/bla_dot.h
@@ -40,7 +40,7 @@
 #undef  GENTPROTDOT
 #define GENTPROTDOT( ftype, ch, chc, blasname ) \
 \
-BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \
+BLIS_EXPORT_BLAS ftype PASTEF77(ch,blasname,chc) \
      ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
@@ -60,7 +60,7 @@ INSERT_GENTPROTDOTC_BLAS( dot )
 #undef  GENTPROTDOT
 #define GENTPROTDOT( ftype, ch, chc, blasname ) \
 \
-BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \
+BLIS_EXPORT_BLAS void PASTEF77(ch,blasname,chc) \
      ( \
        ftype*         rhop, \
        const f77_int* n, \
diff --git a/frame/compat/bla_gemm.c b/frame/compat/bla_gemm.c
index e71d4e2fc..3f7ff73d2 100644
--- a/frame/compat/bla_gemm.c
+++ b/frame/compat/bla_gemm.c
@@ -99,7 +99,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_c = *ldc; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_transa, \
 	  blis_transb, \
@@ -183,7 +183,7 @@ void PASTEF77(ch,blasname) \
 		dim_t m0t, k0t; \
 		bli_set_dims_with_trans( blis_transa, m0, k0, &m0t, &k0t ); \
 \
-		PASTEMAC2(ch,gemv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,gemv,BLIS_TAPI_EX_SUF) \
 		( \
 		  blis_transa, \
 		  bli_extract_conj( blis_transb ), \
@@ -203,7 +203,7 @@ void PASTEF77(ch,blasname) \
 		dim_t n0t, k0t; \
 		bli_set_dims_with_trans( blis_transb, n0, k0, &n0t, &k0t ); \
 \
-		PASTEMAC2(ch,gemv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,gemv,BLIS_TAPI_EX_SUF) \
 		( \
 		  blis_transb, \
 		  bli_extract_conj( blis_transa ), \
diff --git a/frame/compat/bla_gemv.c b/frame/compat/bla_gemv.c
index 8d730edd9..1b8b738cd 100644
--- a/frame/compat/bla_gemv.c
+++ b/frame/compat/bla_gemv.c
@@ -120,7 +120,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_a = *lda; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_transa, \
 	  BLIS_NO_CONJUGATE, \
diff --git a/frame/compat/bla_ger.c b/frame/compat/bla_ger.c
index b558bfd94..c28c1ebd1 100644
--- a/frame/compat/bla_ger.c
+++ b/frame/compat/bla_ger.c
@@ -41,7 +41,7 @@
 #undef  GENTFUNCDOT
 #define GENTFUNCDOT( ftype, ch, chc, blis_conjy, blasname, blisname ) \
 \
-void PASTEF772(ch,blasname,chc) \
+void PASTEF77(ch,blasname,chc) \
      ( \
        const f77_int* m, \
        const f77_int* n, \
@@ -87,7 +87,7 @@ void PASTEF772(ch,blasname,chc) \
 	const inc_t cs_a = *lda; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  BLIS_NO_CONJUGATE, \
 	  blis_conjy, \
diff --git a/frame/compat/bla_ger.h b/frame/compat/bla_ger.h
index 88517c015..8184efdd9 100644
--- a/frame/compat/bla_ger.h
+++ b/frame/compat/bla_ger.h
@@ -40,7 +40,7 @@
 #undef  GENTPROTDOT
 #define GENTPROTDOT( ftype, chxy, chc, blasname ) \
 \
-BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \
+BLIS_EXPORT_BLAS void PASTEF77(chxy,blasname,chc) \
      ( \
        const f77_int* m, \
        const f77_int* n, \
diff --git a/frame/compat/bla_hemm.c b/frame/compat/bla_hemm.c
index 9a4484a09..d13ccaa06 100644
--- a/frame/compat/bla_hemm.c
+++ b/frame/compat/bla_hemm.c
@@ -96,7 +96,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_c = *ldc; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_side, \
 	  blis_uploa, \
diff --git a/frame/compat/bla_hemv.c b/frame/compat/bla_hemv.c
index d036c10e3..b771e7924 100644
--- a/frame/compat/bla_hemv.c
+++ b/frame/compat/bla_hemv.c
@@ -90,7 +90,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_a = *lda; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_uploa, \
 	  BLIS_NO_CONJUGATE, \
diff --git a/frame/compat/bla_her.c b/frame/compat/bla_her.c
index 512081d89..1f9dfe600 100644
--- a/frame/compat/bla_her.c
+++ b/frame/compat/bla_her.c
@@ -84,7 +84,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_a = *lda; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_uploa, \
 	  BLIS_NO_CONJUGATE, \
diff --git a/frame/compat/bla_her2.c b/frame/compat/bla_her2.c
index 7d99a6378..05875a2d2 100644
--- a/frame/compat/bla_her2.c
+++ b/frame/compat/bla_her2.c
@@ -89,7 +89,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_a = *lda; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_uploa, \
 	  BLIS_NO_CONJUGATE, \
diff --git a/frame/compat/bla_her2k.c b/frame/compat/bla_her2k.c
index 2a058dc02..25e9fb431 100644
--- a/frame/compat/bla_her2k.c
+++ b/frame/compat/bla_her2k.c
@@ -113,7 +113,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_c = *ldc; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_uploc, \
 	  blis_transa, \
diff --git a/frame/compat/bla_herk.c b/frame/compat/bla_herk.c
index 8236e2032..a9f01268d 100644
--- a/frame/compat/bla_herk.c
+++ b/frame/compat/bla_herk.c
@@ -109,7 +109,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_c = *ldc; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_uploc, \
 	  blis_transa, \
diff --git a/frame/compat/bla_nrm2.c b/frame/compat/bla_nrm2.c
index 0b41a0673..746d9e63a 100644
--- a/frame/compat/bla_nrm2.c
+++ b/frame/compat/bla_nrm2.c
@@ -41,7 +41,7 @@
 #undef  GENTFUNCR2
 #define GENTFUNCR2( ftype_x, ftype_r, chx, chr, blasname, blisname ) \
 \
-ftype_r PASTEF772(chr,chx,blasname) \
+ftype_r PASTEF77(chr,chx,blasname) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx  \
@@ -63,7 +63,7 @@ ftype_r PASTEF772(chr,chx,blasname) \
 	bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(chx,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  n0, \
 	  x0, incx0, \
diff --git a/frame/compat/bla_nrm2.h b/frame/compat/bla_nrm2.h
index fb5955356..f03d45bf2 100644
--- a/frame/compat/bla_nrm2.h
+++ b/frame/compat/bla_nrm2.h
@@ -40,7 +40,7 @@
 #undef  GENTPROTR2
 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
 \
-BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
+BLIS_EXPORT_BLAS ftype_r PASTEF77(chr,chx,blasname) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx  \
diff --git a/frame/compat/bla_scal.c b/frame/compat/bla_scal.c
index 6c2a3c3db..0acf7c10d 100644
--- a/frame/compat/bla_scal.c
+++ b/frame/compat/bla_scal.c
@@ -41,7 +41,7 @@
 #undef  GENTFUNCSCAL
 #define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \
 \
-void PASTEF772(chx,cha,blasname) \
+void PASTEF77(chx,cha,blasname) \
      ( \
        const f77_int* n, \
        const ftype_a* alpha, \
@@ -67,10 +67,10 @@ void PASTEF772(chx,cha,blasname) \
 	   that is, we just always sub-optimally implement those cases
 	   by casting alpha to ctype_x (potentially the complex domain) and
 	   using the homogeneous datatype instance according to that type. */ \
-	PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \
+	PASTEMAC(cha,chx,copys)( *alpha, alpha_cast ); \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(chx,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  BLIS_NO_CONJUGATE, \
 	  n0, \
diff --git a/frame/compat/bla_scal.h b/frame/compat/bla_scal.h
index ef55118bf..66f438522 100644
--- a/frame/compat/bla_scal.h
+++ b/frame/compat/bla_scal.h
@@ -40,7 +40,7 @@
 #undef  GENTPROTSCAL
 #define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
 \
-BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
+BLIS_EXPORT_BLAS void PASTEF77(chx,cha,blasname) \
      ( \
        const f77_int* n, \
        const ftype_a* alpha, \
diff --git a/frame/compat/bla_swap.c b/frame/compat/bla_swap.c
index 72bc9d6d5..e4aa724f7 100644
--- a/frame/compat/bla_swap.c
+++ b/frame/compat/bla_swap.c
@@ -66,7 +66,7 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  n0, \
 	  x0, incx0, \
diff --git a/frame/compat/bla_symm.c b/frame/compat/bla_symm.c
index 098beb472..39b64e9be 100644
--- a/frame/compat/bla_symm.c
+++ b/frame/compat/bla_symm.c
@@ -96,7 +96,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_c = *ldc; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_side, \
 	  blis_uploa, \
diff --git a/frame/compat/bla_symv.c b/frame/compat/bla_symv.c
index 8923acdc4..f13514b01 100644
--- a/frame/compat/bla_symv.c
+++ b/frame/compat/bla_symv.c
@@ -90,7 +90,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_a = *lda; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_uploa, \
 	  BLIS_NO_CONJUGATE, \
diff --git a/frame/compat/bla_syr.c b/frame/compat/bla_syr.c
index 91dc99b59..0b4877a5c 100644
--- a/frame/compat/bla_syr.c
+++ b/frame/compat/bla_syr.c
@@ -84,7 +84,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_a = *lda; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_uploa, \
 	  BLIS_NO_CONJUGATE, \
diff --git a/frame/compat/bla_syr2.c b/frame/compat/bla_syr2.c
index 7050c0488..e34fb9003 100644
--- a/frame/compat/bla_syr2.c
+++ b/frame/compat/bla_syr2.c
@@ -90,7 +90,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_a = *lda; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_uploa, \
 	  BLIS_NO_CONJUGATE, \
diff --git a/frame/compat/bla_syr2k.c b/frame/compat/bla_syr2k.c
index 2b26171b6..9f0f70cbd 100644
--- a/frame/compat/bla_syr2k.c
+++ b/frame/compat/bla_syr2k.c
@@ -105,7 +105,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_c = *ldc; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_uploc, \
 	  blis_transa, \
diff --git a/frame/compat/bla_syrk.c b/frame/compat/bla_syrk.c
index 4f3f15367..6c0dcb8cf 100644
--- a/frame/compat/bla_syrk.c
+++ b/frame/compat/bla_syrk.c
@@ -101,7 +101,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_c = *ldc; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_uploc, \
 	  blis_transa, \
diff --git a/frame/compat/bla_trmm.c b/frame/compat/bla_trmm.c
index b77a60dd6..cef3840c3 100644
--- a/frame/compat/bla_trmm.c
+++ b/frame/compat/bla_trmm.c
@@ -99,7 +99,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_b = *ldb; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_side, \
 	  blis_uploa, \
diff --git a/frame/compat/bla_trmv.c b/frame/compat/bla_trmv.c
index 2821d4bfa..01a582169 100644
--- a/frame/compat/bla_trmv.c
+++ b/frame/compat/bla_trmv.c
@@ -95,7 +95,7 @@ void PASTEF77(ch,blasname) \
 	one_p = PASTEMAC(ch,1); \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_uploa, \
 	  blis_transa, \
diff --git a/frame/compat/bla_trsm.c b/frame/compat/bla_trsm.c
index 9af008090..967565dd5 100644
--- a/frame/compat/bla_trsm.c
+++ b/frame/compat/bla_trsm.c
@@ -99,7 +99,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_b = *ldb; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_side, \
 	  blis_uploa, \
diff --git a/frame/compat/bla_trsv.c b/frame/compat/bla_trsv.c
index 91132934e..b276de9c7 100644
--- a/frame/compat/bla_trsv.c
+++ b/frame/compat/bla_trsv.c
@@ -95,7 +95,7 @@ void PASTEF77(ch,blasname) \
 	one_p = PASTEMAC(ch,1); \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_uploa, \
 	  blis_transa, \
diff --git a/frame/compat/blis/thread/b77_thread.c b/frame/compat/blis/thread/b77_thread.c
index c864339a3..c4e1db019 100644
--- a/frame/compat/blis/thread/b77_thread.c
+++ b/frame/compat/blis/thread/b77_thread.c
@@ -39,7 +39,7 @@
 // Define Fortran-compatible BLIS interfaces.
 //
 
-void PASTEF770(bli_thread_set_ways)
+void PASTEF77(bli_thread_set_ways)
      (
        const f77_int* jc,
        const f77_int* pc,
@@ -71,7 +71,7 @@ void PASTEF770(bli_thread_set_ways)
 	bli_finalize_auto();
 }
 
-void PASTEF770(bli_thread_set_num_threads)
+void PASTEF77(bli_thread_set_num_threads)
      (
        const f77_int* nt
      )
diff --git a/frame/compat/blis/thread/b77_thread.h b/frame/compat/blis/thread/b77_thread.h
index 922ed6e13..4351f5e7f 100644
--- a/frame/compat/blis/thread/b77_thread.h
+++ b/frame/compat/blis/thread/b77_thread.h
@@ -37,7 +37,7 @@
 // Prototype Fortran-compatible BLIS interfaces.
 //
 
-BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways)
+BLIS_EXPORT_BLAS void PASTEF77(bli_thread_set_ways)
      (
        const f77_int* jc,
        const f77_int* pc,
@@ -46,7 +46,7 @@ BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways)
        const f77_int* ir
      );
 
-BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads)
+BLIS_EXPORT_BLAS void PASTEF77(bli_thread_set_num_threads)
      (
        const f77_int* nt
      );
diff --git a/frame/compat/cblas/f77_sub/f77_amax_sub.c b/frame/compat/cblas/f77_sub/f77_amax_sub.c
index cc26196d7..d7b827d9b 100644
--- a/frame/compat/cblas/f77_sub/f77_amax_sub.c
+++ b/frame/compat/cblas/f77_sub/f77_amax_sub.c
@@ -42,14 +42,14 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype_x, chx, blasname, blisname ) \
 \
-void PASTEF773(i,chx,blasname,sub) \
+void PASTEF77(i,chx,blasname,sub) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
              f77_int* rval  \
      ) \
 { \
-	*rval = PASTEF772(i,chx,blasname) \
+	*rval = PASTEF77(i,chx,blasname) \
 	( \
 	  n, \
 	  x, incx \
diff --git a/frame/compat/cblas/f77_sub/f77_amax_sub.h b/frame/compat/cblas/f77_sub/f77_amax_sub.h
index 9cd1202d2..39d923b6b 100644
--- a/frame/compat/cblas/f77_sub/f77_amax_sub.h
+++ b/frame/compat/cblas/f77_sub/f77_amax_sub.h
@@ -39,7 +39,7 @@
 #undef  GENTPROT
 #define GENTPROT( ftype_x, chx, blasname ) \
 \
-BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
+BLIS_EXPORT_BLAS void PASTEF77(i,chx,blasname,sub) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
diff --git a/frame/compat/cblas/f77_sub/f77_asum_sub.c b/frame/compat/cblas/f77_sub/f77_asum_sub.c
index f1cb35b0c..1313e3577 100644
--- a/frame/compat/cblas/f77_sub/f77_asum_sub.c
+++ b/frame/compat/cblas/f77_sub/f77_asum_sub.c
@@ -42,14 +42,14 @@
 #undef  GENTFUNCR2
 #define GENTFUNCR2( ftype_x, ftype_r, chx, chr, blasname, blisname ) \
 \
-void PASTEF773(chr,chx,blasname,sub) \
+void PASTEF77(chr,chx,blasname,sub) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
              ftype_r* rval  \
      ) \
 { \
-	*rval = PASTEF772(chr,chx,blasname) \
+	*rval = PASTEF77(chr,chx,blasname) \
 	( \
 	  n, \
 	  x, incx \
diff --git a/frame/compat/cblas/f77_sub/f77_asum_sub.h b/frame/compat/cblas/f77_sub/f77_asum_sub.h
index 4b8634c16..dc9b0f18c 100644
--- a/frame/compat/cblas/f77_sub/f77_asum_sub.h
+++ b/frame/compat/cblas/f77_sub/f77_asum_sub.h
@@ -39,7 +39,7 @@
 #undef  GENTPROTR2
 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
 \
-BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
+BLIS_EXPORT_BLAS void PASTEF77(chr,chx,blasname,sub) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
diff --git a/frame/compat/cblas/f77_sub/f77_dot_sub.c b/frame/compat/cblas/f77_sub/f77_dot_sub.c
index 0ca80464d..906fe1253 100644
--- a/frame/compat/cblas/f77_sub/f77_dot_sub.c
+++ b/frame/compat/cblas/f77_sub/f77_dot_sub.c
@@ -43,7 +43,7 @@
 #undef  GENTFUNCDOT
 #define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
 \
-void PASTEF773(ch,blasname,chc,sub) \
+void PASTEF77(ch,blasname,chc,sub) \
      ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
@@ -51,7 +51,7 @@ void PASTEF773(ch,blasname,chc,sub) \
              ftype*   rval  \
      ) \
 { \
-	*rval = PASTEF772(ch,blasname,chc) \
+	*rval = PASTEF77(ch,blasname,chc) \
 	( \
 	  n, \
 	  x, incx, \
@@ -75,7 +75,7 @@ INSERT_GENTFUNCDOTC_BLAS( dot, NULL )
 #undef  GENTFUNCDOT
 #define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
 \
-void PASTEF773(ch,blasname,chc,sub) \
+void PASTEF77(ch,blasname,chc,sub) \
      ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
@@ -83,7 +83,7 @@ void PASTEF773(ch,blasname,chc,sub) \
              ftype*   rval  \
      ) \
 { \
-	PASTEF772(ch,blasname,chc) \
+	PASTEF77(ch,blasname,chc) \
 	( \
 	  rval, \
 	  n, \
@@ -100,7 +100,7 @@ INSERT_GENTFUNCDOTC_BLAS( dot, NULL )
 
 // Input vectors stored in single precision, computed in double precision,
 // with result returned in single precision.
-void PASTEF772(sds,dot,sub)
+void PASTEF77(sds,dot,sub)
      (
        const f77_int* n,
        const float*  sb,
@@ -120,7 +120,7 @@ void PASTEF772(sds,dot,sub)
 
 // Input vectors stored in single precision, computed in double precision,
 // with result returned in double precision.
-void PASTEF772(ds,dot,sub)
+void PASTEF77(ds,dot,sub)
      (
        const f77_int* n,
        const float*   x, const f77_int* incx,
diff --git a/frame/compat/cblas/f77_sub/f77_dot_sub.h b/frame/compat/cblas/f77_sub/f77_dot_sub.h
index 8aab2728b..c1049a9e2 100644
--- a/frame/compat/cblas/f77_sub/f77_dot_sub.h
+++ b/frame/compat/cblas/f77_sub/f77_dot_sub.h
@@ -39,7 +39,7 @@
 #undef  GENTPROTDOT
 #define GENTPROTDOT( ftype, ch, chc, blasname ) \
 \
-BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \
+BLIS_EXPORT_BLAS void PASTEF77(ch,blasname,chc,sub) \
      ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
@@ -53,7 +53,7 @@ INSERT_GENTPROTDOT_BLAS( dot )
 
 // -- "Black sheep" dot product function prototypes --
 
-BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub)
+BLIS_EXPORT_BLAS void PASTEF77(sds,dot,sub)
      (
        const f77_int* n,
        const float*  sb,
@@ -62,7 +62,7 @@ BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub)
              float*   rval
      );
 
-BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub)
+BLIS_EXPORT_BLAS void PASTEF77(ds,dot,sub)
      (
        const f77_int* n,
        const float*   x, const f77_int* incx,
diff --git a/frame/compat/cblas/f77_sub/f77_nrm2_sub.c b/frame/compat/cblas/f77_sub/f77_nrm2_sub.c
index 54ce1a5b4..0a45c757d 100644
--- a/frame/compat/cblas/f77_sub/f77_nrm2_sub.c
+++ b/frame/compat/cblas/f77_sub/f77_nrm2_sub.c
@@ -42,14 +42,14 @@
 #undef  GENTFUNCR2
 #define GENTFUNCR2( ftype_x, ftype_r, chx, chr, blasname, blisname ) \
 \
-void PASTEF773(chr,chx,blasname,sub) \
+void PASTEF77(chr,chx,blasname,sub) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
              ftype_r* rval  \
      ) \
 { \
-	*rval = PASTEF772(chr,chx,blasname) \
+	*rval = PASTEF77(chr,chx,blasname) \
 	( \
 	  n, \
 	  x, incx \
diff --git a/frame/compat/cblas/f77_sub/f77_nrm2_sub.h b/frame/compat/cblas/f77_sub/f77_nrm2_sub.h
index c51a94292..373641506 100644
--- a/frame/compat/cblas/f77_sub/f77_nrm2_sub.h
+++ b/frame/compat/cblas/f77_sub/f77_nrm2_sub.h
@@ -39,7 +39,7 @@
 #undef  GENTPROTR2
 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
 \
-BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
+BLIS_EXPORT_BLAS void PASTEF77(chr,chx,blasname,sub) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
diff --git a/frame/compat/check/bla_gemm3m_check.h b/frame/compat/check/bla_gemm3m_check.h
index f4ede64c8..e6c4e387e 100644
--- a/frame/compat/check/bla_gemm3m_check.h
+++ b/frame/compat/check/bla_gemm3m_check.h
@@ -43,12 +43,12 @@
 	f77_int ta,    tb; \
 	f77_int nrowa, nrowb; \
 \
-	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
-	notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
-	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
-	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
-	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
-	tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTEF77(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	notb  = PASTEF77(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTEF77(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	conjb = PASTEF77(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTEF77(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	tb    = PASTEF77(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( nota ) { nrowa = *m; } \
 	else        { nrowa = *k; } \
@@ -80,7 +80,7 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTEF77(xerbla)( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
diff --git a/frame/compat/check/bla_gemm_check.h b/frame/compat/check/bla_gemm_check.h
index 4ee47040e..7c9434ee8 100644
--- a/frame/compat/check/bla_gemm_check.h
+++ b/frame/compat/check/bla_gemm_check.h
@@ -42,12 +42,12 @@
 	f77_int ta,    tb; \
 	f77_int nrowa, nrowb; \
 \
-	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
-	notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
-	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
-	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
-	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
-	tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTEF77(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	notb  = PASTEF77(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTEF77(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	conjb = PASTEF77(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTEF77(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	tb    = PASTEF77(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( nota ) { nrowa = *m; } \
 	else        { nrowa = *k; } \
@@ -79,7 +79,7 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTEF77(xerbla)( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
diff --git a/frame/compat/check/bla_gemmt_check.h b/frame/compat/check/bla_gemmt_check.h
index a447210a3..0a754248e 100644
--- a/frame/compat/check/bla_gemmt_check.h
+++ b/frame/compat/check/bla_gemmt_check.h
@@ -43,15 +43,15 @@
 	f77_int lower, upper; \
 	f77_int nrowa, nrowb; \
 \
-	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
-	notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
-	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
-	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
-	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
-	tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTEF77(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	notb  = PASTEF77(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTEF77(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	conjb = PASTEF77(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTEF77(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	tb    = PASTEF77(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
 \
-	lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTEF77(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTEF77(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( nota ) { nrowa = *m; } \
 	else        { nrowa = *k; } \
@@ -83,7 +83,7 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTEF77(xerbla)( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
diff --git a/frame/compat/check/bla_gemv_check.h b/frame/compat/check/bla_gemv_check.h
index 67e718b55..e5f319a9f 100644
--- a/frame/compat/check/bla_gemv_check.h
+++ b/frame/compat/check/bla_gemv_check.h
@@ -39,9 +39,9 @@
 	f77_int info = 0; \
 	f77_int nota, ta, conja; \
 \
-	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
-	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
-	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTEF77(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTEF77(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTEF77(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
 \
 	if      ( !nota && !ta && !conja ) \
 		info = 1; \
@@ -64,7 +64,7 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTEF77(xerbla)( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
diff --git a/frame/compat/check/bla_ger_check.h b/frame/compat/check/bla_ger_check.h
index 44e51df32..c8c986a10 100644
--- a/frame/compat/check/bla_ger_check.h
+++ b/frame/compat/check/bla_ger_check.h
@@ -59,7 +59,7 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTEF77(xerbla)( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
diff --git a/frame/compat/check/bla_hemm_check.h b/frame/compat/check/bla_hemm_check.h
index 5e5884d4f..0bd02db2d 100644
--- a/frame/compat/check/bla_hemm_check.h
+++ b/frame/compat/check/bla_hemm_check.h
@@ -41,10 +41,10 @@
 	f77_int lower, upper; \
 	f77_int nrowa; \
 \
-	left  = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
-	right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
-	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
+	left  = PASTEF77(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
+	right = PASTEF77(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTEF77(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTEF77(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( left ) { nrowa = *m; } \
 	else        { nrowa = *n; } \
@@ -72,7 +72,7 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTEF77(xerbla)( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
diff --git a/frame/compat/check/bla_hemv_check.h b/frame/compat/check/bla_hemv_check.h
index 014e28e0f..0764000a3 100644
--- a/frame/compat/check/bla_hemv_check.h
+++ b/frame/compat/check/bla_hemv_check.h
@@ -39,8 +39,8 @@
 	f77_int info = 0; \
 	f77_int lower, upper; \
 \
-	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTEF77(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTEF77(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if      ( !lower && !upper ) \
 		info = 1; \
@@ -61,7 +61,7 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTEF77(xerbla)( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
diff --git a/frame/compat/check/bla_her2_check.h b/frame/compat/check/bla_her2_check.h
index 3eb873c94..97ac9f185 100644
--- a/frame/compat/check/bla_her2_check.h
+++ b/frame/compat/check/bla_her2_check.h
@@ -39,8 +39,8 @@
 	f77_int info = 0; \
 	f77_int lower, upper; \
 \
-	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTEF77(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTEF77(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if      ( !lower && !upper ) \
 		info = 1; \
@@ -61,7 +61,7 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTEF77(xerbla)( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
diff --git a/frame/compat/check/bla_her2k_check.h b/frame/compat/check/bla_her2k_check.h
index bfaa19f81..ebe80d730 100644
--- a/frame/compat/check/bla_her2k_check.h
+++ b/frame/compat/check/bla_her2k_check.h
@@ -41,10 +41,10 @@
 	f77_int lower, upper; \
 	f77_int nrowa; \
 \
-	nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
-	conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
-	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTEF77(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTEF77(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTEF77(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTEF77(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( nota ) { nrowa = *m; } \
 	else        { nrowa = *k; } \
@@ -72,7 +72,7 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTEF77(xerbla)( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
diff --git a/frame/compat/check/bla_her_check.h b/frame/compat/check/bla_her_check.h
index 7734d5b9e..5271b65e3 100644
--- a/frame/compat/check/bla_her_check.h
+++ b/frame/compat/check/bla_her_check.h
@@ -39,8 +39,8 @@
 	f77_int info = 0; \
 	f77_int lower, upper; \
 \
-	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTEF77(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTEF77(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if      ( !lower && !upper ) \
 		info = 1; \
@@ -59,7 +59,7 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTEF77(xerbla)( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
diff --git a/frame/compat/check/bla_herk_check.h b/frame/compat/check/bla_herk_check.h
index 8c48cf123..3b1c04e94 100644
--- a/frame/compat/check/bla_herk_check.h
+++ b/frame/compat/check/bla_herk_check.h
@@ -41,10 +41,10 @@
 	f77_int lower, upper; \
 	f77_int nrowa; \
 \
-	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
-	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
-	lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTEF77(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTEF77(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTEF77(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTEF77(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( nota ) { nrowa = *m; } \
 	else        { nrowa = *k; } \
@@ -70,7 +70,7 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTEF77(xerbla)( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
diff --git a/frame/compat/check/bla_syr2k_check.h b/frame/compat/check/bla_syr2k_check.h
index a1e1f2eb8..cd5fc9654 100644
--- a/frame/compat/check/bla_syr2k_check.h
+++ b/frame/compat/check/bla_syr2k_check.h
@@ -45,11 +45,11 @@
 	static char* dt_cst = dt_str; \
 \
 	is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
-	nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
-	ta    = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \
-	cta   = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
-	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTEF77(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTEF77(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \
+	cta   = PASTEF77(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTEF77(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTEF77(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( nota ) { nrowa = *m; } \
 	else        { nrowa = *k; } \
@@ -77,7 +77,7 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTEF77(xerbla)( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
diff --git a/frame/compat/check/bla_syrk_check.h b/frame/compat/check/bla_syrk_check.h
index 9332e61d4..d0e915fd0 100644
--- a/frame/compat/check/bla_syrk_check.h
+++ b/frame/compat/check/bla_syrk_check.h
@@ -45,11 +45,11 @@
 	static char* dt_cst = dt_str; \
 \
 	is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
-	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
-	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
-	cta   = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
-	lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTEF77(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTEF77(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	cta   = PASTEF77(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTEF77(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTEF77(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( nota ) { nrowa = *m; } \
 	else        { nrowa = *k; } \
@@ -75,7 +75,7 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTEF77(xerbla)( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
diff --git a/frame/compat/check/bla_trmm_check.h b/frame/compat/check/bla_trmm_check.h
index ab7036d56..29bc67bf4 100644
--- a/frame/compat/check/bla_trmm_check.h
+++ b/frame/compat/check/bla_trmm_check.h
@@ -43,15 +43,15 @@
 	f77_int unita, nonua; \
 	f77_int nrowa; \
 \
-	left  = PASTEF770(lsame)( sidea,  "L", (ftnlen)1, (ftnlen)1 ); \
-	right = PASTEF770(lsame)( sidea,  "R", (ftnlen)1, (ftnlen)1 ); \
-	lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
-	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
-	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
-	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
-	unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
-	nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
+	left  = PASTEF77(lsame)( sidea,  "L", (ftnlen)1, (ftnlen)1 ); \
+	right = PASTEF77(lsame)( sidea,  "R", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTEF77(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTEF77(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTEF77(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTEF77(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTEF77(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	unita = PASTEF77(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
+	nonua = PASTEF77(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( left ) { nrowa = *m; } \
 	else        { nrowa = *n; } \
@@ -81,7 +81,7 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTEF77(xerbla)( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
diff --git a/frame/compat/check/bla_trmv_check.h b/frame/compat/check/bla_trmv_check.h
index 67e6e28ee..34d8a5fb4 100644
--- a/frame/compat/check/bla_trmv_check.h
+++ b/frame/compat/check/bla_trmv_check.h
@@ -41,13 +41,13 @@
 	f77_int nota, ta, conja; \
 	f77_int unita, nonua; \
 \
-	lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
-	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
-	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
-	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
-	unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
-	nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTEF77(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTEF77(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTEF77(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTEF77(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTEF77(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	unita = PASTEF77(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
+	nonua = PASTEF77(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
 \
 	if      ( !lower && !upper ) \
 		info = 1; \
@@ -70,7 +70,7 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTEF77(xerbla)( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
diff --git a/frame/compat/extra/bla_axpby.c b/frame/compat/extra/bla_axpby.c
index d96d75d74..8d6d5ce70 100644
--- a/frame/compat/extra/bla_axpby.c
+++ b/frame/compat/extra/bla_axpby.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2020, Advanced Micro Devices, Inc.
-   
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -68,7 +68,7 @@ void PASTEF77(ch,blasname) \
 	bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  BLIS_NO_CONJUGATE, \
 	  n0, \
diff --git a/frame/compat/extra/bla_gemm3m.c b/frame/compat/extra/bla_gemm3m.c
index 1d124cbc2..b83c88ee8 100644
--- a/frame/compat/extra/bla_gemm3m.c
+++ b/frame/compat/extra/bla_gemm3m.c
@@ -114,7 +114,7 @@ void PASTEF77(ch,blasname) \
 		bli_rntm_disable_l3_sup( rntm ); \
 \
 		/* Call BLIS interface. */ \
-		PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 		( \
 		  blis_transa, \
 		  blis_transb, \
@@ -217,34 +217,17 @@ void PASTEF77(ch,blasname) \
 	bli_obj_set_conjtrans( blis_transa, &ao ); \
 	bli_obj_set_conjtrans( blis_transb, &bo ); \
 \
-	/* As a placeholder, invoke 1m since BLIS does no longer contains an
-	   official 3m implementation. Note that we do this by inlining an
-	   abbreviated version of bli_gemm_ex() so that we can bypass
-	   consideration of sup, which doesn't make sense in this context. */ \
-	{ \
-		cntx_t* cntx = ( cntx_t* )bli_gks_query_ind_cntx( BLIS_1M ); \
-\
-		rntm_t  rntm_l; \
-		rntm_t* rntm = &rntm_l; \
-		bli_rntm_init_from_global( &rntm_l ); \
-\
-		/* This is probably not needed given that we performed BLAS-style
-		   parameter checking above, but bli_gemm_check() is normally called
-		   in the normal course of bli_gemm_ex(). */ \
-		if ( bli_error_checking_is_enabled() ) \
-			bli_gemm_check( &alphao, &ao, &bo, &betao, &co, cntx ); \
-\
-		PASTEMAC(blisname,_front) \
-		( \
-		  &alphao, \
-		  &ao, \
-		  &bo, \
-		  &betao, \
-		  &co, \
-		  cntx, \
-		  rntm \
-		); \
-	} \
+	/* As a placeholder, invoke bli_gemm_ex(). */ \
+	PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
+	( \
+	  &alphao, \
+	  &ao, \
+	  &bo, \
+	  &betao, \
+	  &co, \
+	  NULL, \
+	  NULL \
+	); \
 \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
diff --git a/frame/compat/extra/bla_gemm_batch.c b/frame/compat/extra/bla_gemm_batch.c
index 4b2597e19..da6750cab 100644
--- a/frame/compat/extra/bla_gemm_batch.c
+++ b/frame/compat/extra/bla_gemm_batch.c
@@ -109,7 +109,7 @@ void PASTEF77(ch,blasname) \
 		for ( f77_int j = 0; j < group_size[i]; j++ ) \
 		{ \
 			/* Call BLIS interface. */ \
-			PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+			PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 			( \
 			  blis_transa, \
 			  blis_transb, \
diff --git a/frame/compat/extra/bla_gemmt.c b/frame/compat/extra/bla_gemmt.c
index 101cc6d13..266663df3 100644
--- a/frame/compat/extra/bla_gemmt.c
+++ b/frame/compat/extra/bla_gemmt.c
@@ -100,7 +100,7 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_c = *ldc; \
 \
 	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  blis_uploc, \
 	  blis_transa, \
diff --git a/frame/compat/f2c/bla_gbmv.c b/frame/compat/f2c/bla_gbmv.c
index d53dd322a..320b49684 100644
--- a/frame/compat/f2c/bla_gbmv.c
+++ b/frame/compat/f2c/bla_gbmv.c
@@ -54,9 +54,9 @@
     bla_integer info;
     bla_scomplex temp;
     bla_integer lenx, leny, i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj;
     bla_integer kup1;
 
@@ -203,8 +203,8 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (
-	    ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (ftnlen)1)
+    if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "T", (
+	    ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (ftnlen)1)
 	    ) {
 	info = 1;
     } else if (*m < 0) {
@@ -223,23 +223,23 @@
 	info = 13;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CGBMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("CGBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
 /*     Quick return if possible. */
 
-    if (*m == 0 || *n == 0 || (bli_creal(*alpha) == 0.f && bli_cimag(*alpha) == 0.f && (bli_creal(*beta) 
+    if (*m == 0 || *n == 0 || (bli_creal(*alpha) == 0.f && bli_cimag(*alpha) == 0.f && (bli_creal(*beta)
 	    == 1.f && bli_cimag(*beta) == 0.f))) {
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
+    noconj = PASTEF77(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
 
 /*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set */
 /*     up the start points in  X  and  Y. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 	lenx = *n;
 	leny = *m;
     } else {
@@ -308,7 +308,7 @@
 	return 0;
     }
     kup1 = *ku + 1;
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y := alpha*A*x + y. */
 
@@ -491,9 +491,9 @@
     bla_integer info;
     bla_double temp;
     bla_integer lenx, leny, i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_integer kup1;
 
 /*     .. Scalar Arguments .. */
@@ -635,8 +635,8 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (
-	    ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (ftnlen)1)
+    if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "T", (
+	    ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (ftnlen)1)
 	    ) {
 	info = 1;
     } else if (*m < 0) {
@@ -655,7 +655,7 @@
 	info = 13;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DGBMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("DGBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -668,7 +668,7 @@
 /*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set */
 /*     up the start points in  X  and  Y. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 	lenx = *n;
 	leny = *m;
     } else {
@@ -729,7 +729,7 @@
 	return 0;
     }
     kup1 = *ku + 1;
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y := alpha*A*x + y. */
 
@@ -847,9 +847,9 @@
     bla_integer info;
     bla_real temp;
     bla_integer lenx, leny, i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_integer kup1;
 
 /*     .. Scalar Arguments .. */
@@ -991,8 +991,8 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (
-	    ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (ftnlen)1)
+    if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "T", (
+	    ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (ftnlen)1)
 	    ) {
 	info = 1;
     } else if (*m < 0) {
@@ -1011,7 +1011,7 @@
 	info = 13;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("SGBMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("SGBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1024,7 +1024,7 @@
 /*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set */
 /*     up the start points in  X  and  Y. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 	lenx = *n;
 	leny = *m;
     } else {
@@ -1085,7 +1085,7 @@
 	return 0;
     }
     kup1 = *ku + 1;
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y := alpha*A*x + y. */
 
@@ -1207,9 +1207,9 @@
     bla_integer info;
     bla_dcomplex temp;
     bla_integer lenx, leny, i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj;
     bla_integer kup1;
 
@@ -1356,8 +1356,8 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (
-	    ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (ftnlen)1)
+    if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "T", (
+	    ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (ftnlen)1)
 	    ) {
 	info = 1;
     } else if (*m < 0) {
@@ -1376,23 +1376,23 @@
 	info = 13;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZGBMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("ZGBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
 /*     Quick return if possible. */
 
-    if (*m == 0 || *n == 0 || (bli_zreal(*alpha) == 0. && bli_zimag(*alpha) == 0. && (bli_zreal(*beta) == 
+    if (*m == 0 || *n == 0 || (bli_zreal(*alpha) == 0. && bli_zimag(*alpha) == 0. && (bli_zreal(*beta) ==
 	    1. && bli_zimag(*beta) == 0.))) {
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
+    noconj = PASTEF77(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
 
 /*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set */
 /*     up the start points in  X  and  Y. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 	lenx = *n;
 	leny = *m;
     } else {
@@ -1461,7 +1461,7 @@
 	return 0;
     }
     kup1 = *ku + 1;
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y := alpha*A*x + y. */
 
diff --git a/frame/compat/f2c/bla_hbmv.c b/frame/compat/f2c/bla_hbmv.c
index 198336d04..c20a720f9 100644
--- a/frame/compat/f2c/bla_hbmv.c
+++ b/frame/compat/f2c/bla_hbmv.c
@@ -55,9 +55,9 @@
     bla_integer info;
     bla_scomplex temp1, temp2;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -204,7 +204,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -219,13 +219,13 @@
 	info = 11;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CHBMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("CHBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
 /*     Quick return if possible. */
 
-    if (*n == 0 || (bli_creal(*alpha) == 0.f && bli_cimag(*alpha) == 0.f && (bli_creal(*beta) == 1.f && 
+    if (*n == 0 || (bli_creal(*alpha) == 0.f && bli_cimag(*alpha) == 0.f && (bli_creal(*beta) == 1.f &&
 	    bli_cimag(*beta) == 0.f))) {
 	return 0;
     }
@@ -293,7 +293,7 @@
     if (bli_creal(*alpha) == 0.f && bli_cimag(*alpha) == 0.f) {
 	return 0;
     }
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y  when upper triangle of A is stored. */
 
@@ -501,9 +501,9 @@
     bla_integer info;
     bla_dcomplex temp1, temp2;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -650,7 +650,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -665,13 +665,13 @@
 	info = 11;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZHBMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("ZHBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
 /*     Quick return if possible. */
 
-    if (*n == 0 || (bli_zreal(*alpha) == 0. && bli_zimag(*alpha) == 0. && (bli_zreal(*beta) == 1. && 
+    if (*n == 0 || (bli_zreal(*alpha) == 0. && bli_zimag(*alpha) == 0. && (bli_zreal(*beta) == 1. &&
 	    bli_zimag(*beta) == 0.))) {
 	return 0;
     }
@@ -739,7 +739,7 @@
     if (bli_zreal(*alpha) == 0. && bli_zimag(*alpha) == 0.) {
 	return 0;
     }
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y  when upper triangle of A is stored. */
 
diff --git a/frame/compat/f2c/bla_hpmv.c b/frame/compat/f2c/bla_hpmv.c
index 0d7ebce9d..743261157 100644
--- a/frame/compat/f2c/bla_hpmv.c
+++ b/frame/compat/f2c/bla_hpmv.c
@@ -55,9 +55,9 @@
     bla_integer info;
     bla_scomplex temp1, temp2;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -168,7 +168,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -179,13 +179,13 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CHPMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("CHPMV ", &info, (ftnlen)6);
 	return 0;
     }
 
 /*     Quick return if possible. */
 
-    if (*n == 0 || (bli_creal(*alpha) == 0.f && bli_cimag(*alpha) == 0.f && (bli_creal(*beta) == 1.f && 
+    if (*n == 0 || (bli_creal(*alpha) == 0.f && bli_cimag(*alpha) == 0.f && (bli_creal(*beta) == 1.f &&
 	    bli_cimag(*beta) == 0.f))) {
 	return 0;
     }
@@ -254,7 +254,7 @@
 	return 0;
     }
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y  when AP contains the upper triangle. */
 
@@ -453,9 +453,9 @@
     bla_integer info;
     bla_dcomplex temp1, temp2;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -566,7 +566,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -577,13 +577,13 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZHPMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("ZHPMV ", &info, (ftnlen)6);
 	return 0;
     }
 
 /*     Quick return if possible. */
 
-    if (*n == 0 || (bli_zreal(*alpha) == 0. && bli_zimag(*alpha) == 0. && (bli_zreal(*beta) == 1. && 
+    if (*n == 0 || (bli_zreal(*alpha) == 0. && bli_zimag(*alpha) == 0. && (bli_zreal(*beta) == 1. &&
 	    bli_zimag(*beta) == 0.))) {
 	return 0;
     }
@@ -652,7 +652,7 @@
 	return 0;
     }
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y  when AP contains the upper triangle. */
 
diff --git a/frame/compat/f2c/bla_hpr.c b/frame/compat/f2c/bla_hpr.c
index da1f0a0f3..636cefef3 100644
--- a/frame/compat/f2c/bla_hpr.c
+++ b/frame/compat/f2c/bla_hpr.c
@@ -55,9 +55,9 @@
     bla_integer info;
     bla_scomplex temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -155,7 +155,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -164,7 +164,7 @@
 	info = 5;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CHPR  ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("CHPR  ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -186,7 +186,7 @@
 /*     are accessed sequentially with one pass through AP. */
 
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  A  when upper triangle is stored in AP. */
 
@@ -367,9 +367,9 @@
     bla_integer info;
     bla_dcomplex temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -467,7 +467,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -476,7 +476,7 @@
 	info = 5;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZHPR  ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("ZHPR  ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -498,7 +498,7 @@
 /*     are accessed sequentially with one pass through AP. */
 
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  A  when upper triangle is stored in AP. */
 
diff --git a/frame/compat/f2c/bla_hpr2.c b/frame/compat/f2c/bla_hpr2.c
index c78c1eec0..98ae6b554 100644
--- a/frame/compat/f2c/bla_hpr2.c
+++ b/frame/compat/f2c/bla_hpr2.c
@@ -55,9 +55,9 @@
     bla_integer info;
     bla_scomplex temp1, temp2;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, iy, jx = 0, jy = 0, kx = 0, ky = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -167,7 +167,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -178,7 +178,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CHPR2 ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("CHPR2 ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -210,7 +210,7 @@
 /*     are accessed sequentially with one pass through AP. */
 
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  A  when upper triangle is stored in AP. */
 
@@ -219,7 +219,7 @@
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = j;
 		i__3 = j;
-		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f || (bli_creal(y[i__3]) != 0.f 
+		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f || (bli_creal(y[i__3]) != 0.f
 			|| bli_cimag(y[i__3]) != 0.f)) {
 		    bla_r_cnjg(&q__2, &y[j]);
 		    bli_csets( (bli_creal(*alpha) * bli_creal(q__2) - bli_cimag(*alpha) * bli_cimag(q__2)), (bli_creal(*alpha) * bli_cimag(q__2) + bli_cimag(*alpha) * bli_creal(q__2)), q__1 );
@@ -266,7 +266,7 @@
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = jx;
 		i__3 = jy;
-		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f || (bli_creal(y[i__3]) != 0.f 
+		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f || (bli_creal(y[i__3]) != 0.f
 			|| bli_cimag(y[i__3]) != 0.f)) {
 		    bla_r_cnjg(&q__2, &y[jy]);
 		    bli_csets( (bli_creal(*alpha) * bli_creal(q__2) - bli_cimag(*alpha) * bli_cimag(q__2)), (bli_creal(*alpha) * bli_cimag(q__2) + bli_cimag(*alpha) * bli_creal(q__2)), q__1 );
@@ -322,7 +322,7 @@
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = j;
 		i__3 = j;
-		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f || (bli_creal(y[i__3]) != 0.f 
+		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f || (bli_creal(y[i__3]) != 0.f
 			|| bli_cimag(y[i__3]) != 0.f)) {
 		    bla_r_cnjg(&q__2, &y[j]);
 		    bli_csets( (bli_creal(*alpha) * bli_creal(q__2) - bli_cimag(*alpha) * bli_cimag(q__2)), (bli_creal(*alpha) * bli_cimag(q__2) + bli_cimag(*alpha) * bli_creal(q__2)), q__1 );
@@ -369,7 +369,7 @@
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = jx;
 		i__3 = jy;
-		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f || (bli_creal(y[i__3]) != 0.f 
+		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f || (bli_creal(y[i__3]) != 0.f
 			|| bli_cimag(y[i__3]) != 0.f)) {
 		    bla_r_cnjg(&q__2, &y[jy]);
 		    bli_csets( (bli_creal(*alpha) * bli_creal(q__2) - bli_cimag(*alpha) * bli_cimag(q__2)), (bli_creal(*alpha) * bli_cimag(q__2) + bli_cimag(*alpha) * bli_creal(q__2)), q__1 );
@@ -443,9 +443,9 @@
     bla_integer info;
     bla_dcomplex temp1, temp2;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, iy, jx = 0, jy = 0, kx = 0, ky = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -555,7 +555,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -566,7 +566,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZHPR2 ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("ZHPR2 ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -598,7 +598,7 @@
 /*     are accessed sequentially with one pass through AP. */
 
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  A  when upper triangle is stored in AP. */
 
@@ -607,7 +607,7 @@
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = j;
 		i__3 = j;
-		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0. || (bli_zreal(y[i__3]) != 0. || 
+		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0. || (bli_zreal(y[i__3]) != 0. ||
 			bli_zimag(y[i__3]) != 0.)) {
 		    bla_d_cnjg(&z__2, &y[j]);
 		    bli_zsets( (bli_zreal(*alpha) * bli_zreal(z__2) - bli_zimag(*alpha) * bli_zimag(z__2)), (bli_zreal(*alpha) * bli_zimag(z__2) + bli_zimag(*alpha) * bli_zreal(z__2)), z__1 );
@@ -654,7 +654,7 @@
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = jx;
 		i__3 = jy;
-		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0. || (bli_zreal(y[i__3]) != 0. || 
+		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0. || (bli_zreal(y[i__3]) != 0. ||
 			bli_zimag(y[i__3]) != 0.)) {
 		    bla_d_cnjg(&z__2, &y[jy]);
 		    bli_zsets( (bli_zreal(*alpha) * bli_zreal(z__2) - bli_zimag(*alpha) * bli_zimag(z__2)), (bli_zreal(*alpha) * bli_zimag(z__2) + bli_zimag(*alpha) * bli_zreal(z__2)), z__1 );
@@ -710,7 +710,7 @@
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = j;
 		i__3 = j;
-		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0. || (bli_zreal(y[i__3]) != 0. || 
+		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0. || (bli_zreal(y[i__3]) != 0. ||
 			bli_zimag(y[i__3]) != 0.)) {
 		    bla_d_cnjg(&z__2, &y[j]);
 		    bli_zsets( (bli_zreal(*alpha) * bli_zreal(z__2) - bli_zimag(*alpha) * bli_zimag(z__2)), (bli_zreal(*alpha) * bli_zimag(z__2) + bli_zimag(*alpha) * bli_zreal(z__2)), z__1 );
@@ -757,7 +757,7 @@
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = jx;
 		i__3 = jy;
-		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0. || (bli_zreal(y[i__3]) != 0. || 
+		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0. || (bli_zreal(y[i__3]) != 0. ||
 			bli_zimag(y[i__3]) != 0.)) {
 		    bla_d_cnjg(&z__2, &y[jy]);
 		    bli_zsets( (bli_zreal(*alpha) * bli_zreal(z__2) - bli_zimag(*alpha) * bli_zimag(z__2)), (bli_zreal(*alpha) * bli_zimag(z__2) + bli_zimag(*alpha) * bli_zreal(z__2)), z__1 );
diff --git a/frame/compat/f2c/bla_lsame.c b/frame/compat/f2c/bla_lsame.c
index edee918d1..8fdc7dfd8 100644
--- a/frame/compat/f2c/bla_lsame.c
+++ b/frame/compat/f2c/bla_lsame.c
@@ -43,9 +43,9 @@
 
 
 #ifdef LAPACK_ILP64
-long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len)
+long PASTEF77(lsame)(const char *ca, const char *cb, long ca_len, long cb_len)
 #else
-int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len)
+int PASTEF77(lsame)(const char *ca, const char *cb, int ca_len, int cb_len)
 #endif
 {
     /* System generated locals */
diff --git a/frame/compat/f2c/bla_lsame.h b/frame/compat/f2c/bla_lsame.h
index 0d00ca0ba..83acd7d76 100644
--- a/frame/compat/f2c/bla_lsame.h
+++ b/frame/compat/f2c/bla_lsame.h
@@ -35,9 +35,9 @@
 #if 1
 
 #ifdef LAPACK_ILP64
-long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
+long PASTEF77(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
 #else
-BLIS_EXPORT_BLAS int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len);
+BLIS_EXPORT_BLAS int PASTEF77(lsame)(const char *ca, const char *cb, int ca_len, int cb_len);
 #endif
 
 #endif
diff --git a/frame/compat/f2c/bla_sbmv.c b/frame/compat/f2c/bla_sbmv.c
index 566fabd81..897bd1758 100644
--- a/frame/compat/f2c/bla_sbmv.c
+++ b/frame/compat/f2c/bla_sbmv.c
@@ -50,9 +50,9 @@
     bla_integer info;
     bla_double temp1, temp2;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -197,7 +197,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -212,7 +212,7 @@
 	info = 11;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DSBMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("DSBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -277,7 +277,7 @@
     if (*alpha == 0.) {
 	return 0;
     }
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y  when upper triangle of A is stored. */
 
@@ -319,7 +319,7 @@
 		    iy += *incy;
 /* L70: */
 		}
-		y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * 
+		y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha *
 			temp2;
 		jx += *incx;
 		jy += *incy;
@@ -401,9 +401,9 @@
     bla_integer info;
     bla_real temp1, temp2;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -548,7 +548,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -563,7 +563,7 @@
 	info = 11;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("SSBMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("SSBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -628,7 +628,7 @@
     if (*alpha == 0.f) {
 	return 0;
     }
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y  when upper triangle of A is stored. */
 
@@ -670,7 +670,7 @@
 		    iy += *incy;
 /* L70: */
 		}
-		y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * 
+		y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha *
 			temp2;
 		jx += *incx;
 		jy += *incy;
diff --git a/frame/compat/f2c/bla_spmv.c b/frame/compat/f2c/bla_spmv.c
index 0485e1dc3..d0f52b6bb 100644
--- a/frame/compat/f2c/bla_spmv.c
+++ b/frame/compat/f2c/bla_spmv.c
@@ -50,9 +50,9 @@
     bla_integer info;
     bla_double temp1, temp2;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -160,7 +160,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -171,7 +171,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DSPMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("DSPMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -237,7 +237,7 @@
 	return 0;
     }
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y  when AP contains the upper triangle. */
 
@@ -351,9 +351,9 @@
     bla_integer info;
     bla_real temp1, temp2;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -461,7 +461,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -472,7 +472,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("SSPMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("SSPMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -538,7 +538,7 @@
 	return 0;
     }
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y  when AP contains the upper triangle. */
 
diff --git a/frame/compat/f2c/bla_spr.c b/frame/compat/f2c/bla_spr.c
index d276458b4..13ec9d1a4 100644
--- a/frame/compat/f2c/bla_spr.c
+++ b/frame/compat/f2c/bla_spr.c
@@ -50,9 +50,9 @@
     bla_integer info;
     bla_double temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -146,7 +146,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -155,7 +155,7 @@
 	info = 5;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DSPR  ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("DSPR  ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -177,7 +177,7 @@
 /*     are accessed sequentially with one pass through AP. */
 
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  A  when upper triangle is stored in AP. */
 
@@ -277,9 +277,9 @@
     bla_integer info;
     bla_real temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -373,7 +373,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -382,7 +382,7 @@
 	info = 5;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("SSPR  ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("SSPR  ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -404,7 +404,7 @@
 /*     are accessed sequentially with one pass through AP. */
 
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  A  when upper triangle is stored in AP. */
 
diff --git a/frame/compat/f2c/bla_spr2.c b/frame/compat/f2c/bla_spr2.c
index 7c7538212..1f0ef1718 100644
--- a/frame/compat/f2c/bla_spr2.c
+++ b/frame/compat/f2c/bla_spr2.c
@@ -50,9 +50,9 @@
     bla_integer info;
     bla_double temp1, temp2;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, iy, jx = 0, jy = 0, kx = 0, ky = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -158,7 +158,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -169,7 +169,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DSPR2 ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("DSPR2 ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -201,7 +201,7 @@
 /*     are accessed sequentially with one pass through AP. */
 
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  A  when upper triangle is stored in AP. */
 
@@ -309,9 +309,9 @@
     bla_integer info;
     bla_real temp1, temp2;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, iy, jx = 0, jy = 0, kx = 0, ky = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -417,7 +417,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -428,7 +428,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("SSPR2 ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("SSPR2 ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -460,7 +460,7 @@
 /*     are accessed sequentially with one pass through AP. */
 
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  A  when upper triangle is stored in AP. */
 
diff --git a/frame/compat/f2c/bla_tbmv.c b/frame/compat/f2c/bla_tbmv.c
index 78feb7056..16c149c89 100644
--- a/frame/compat/f2c/bla_tbmv.c
+++ b/frame/compat/f2c/bla_tbmv.c
@@ -54,9 +54,9 @@
     bla_integer info;
     bla_scomplex temp;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj, nounit;
 
 /*     .. Scalar Arguments .. */
@@ -209,14 +209,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTEF77(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -229,7 +229,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CTBMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("CTBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -239,8 +239,8 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    noconj = PASTEF77(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = PASTEF77(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX   too small for descending loops. */
@@ -254,11 +254,11 @@
 /*     Start the operations. In this version the elements of A are */
 /*     accessed sequentially with one pass through A. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*         Form  x := A*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -401,7 +401,7 @@
 
 /*        Form  x := A'*x  or  x := conjg( A' )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -620,9 +620,9 @@
     bla_integer info;
     bla_double temp;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical nounit;
 
 /*     .. Scalar Arguments .. */
@@ -775,14 +775,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTEF77(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -795,7 +795,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DTBMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("DTBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -805,7 +805,7 @@
 	return 0;
     }
 
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    nounit = PASTEF77(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX   too small for descending loops. */
@@ -819,11 +819,11 @@
 /*     Start the operations. In this version the elements of A are */
 /*     accessed sequentially with one pass through A. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*         Form  x := A*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -922,7 +922,7 @@
 
 /*        Form  x := A'*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -1031,9 +1031,9 @@
     bla_integer info;
     bla_real temp;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical nounit;
 
 /*     .. Scalar Arguments .. */
@@ -1186,14 +1186,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTEF77(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -1206,7 +1206,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("STBMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("STBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1216,7 +1216,7 @@
 	return 0;
     }
 
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    nounit = PASTEF77(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX   too small for descending loops. */
@@ -1230,11 +1230,11 @@
 /*     Start the operations. In this version the elements of A are */
 /*     accessed sequentially with one pass through A. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*         Form  x := A*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -1333,7 +1333,7 @@
 
 /*        Form  x := A'*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -1446,9 +1446,9 @@
     bla_integer info;
     bla_dcomplex temp;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj, nounit;
 
 /*     .. Scalar Arguments .. */
@@ -1601,14 +1601,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTEF77(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -1621,7 +1621,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZTBMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("ZTBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1631,8 +1631,8 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    noconj = PASTEF77(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = PASTEF77(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX   too small for descending loops. */
@@ -1646,11 +1646,11 @@
 /*     Start the operations. In this version the elements of A are */
 /*     accessed sequentially with one pass through A. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*         Form  x := A*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -1793,7 +1793,7 @@
 
 /*        Form  x := A'*x  or  x := conjg( A' )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
diff --git a/frame/compat/f2c/bla_tbsv.c b/frame/compat/f2c/bla_tbsv.c
index 6914882d2..b237556f8 100644
--- a/frame/compat/f2c/bla_tbsv.c
+++ b/frame/compat/f2c/bla_tbsv.c
@@ -54,9 +54,9 @@
     bla_integer info;
     bla_scomplex temp;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj, nounit;
 
 /*     .. Scalar Arguments .. */
@@ -213,14 +213,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTEF77(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -233,7 +233,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CTBSV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("CTBSV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -243,8 +243,8 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    noconj = PASTEF77(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = PASTEF77(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -258,11 +258,11 @@
 /*     Start the operations. In this version the elements of A are */
 /*     accessed by sequentially with one pass through A. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -393,7 +393,7 @@
 
 /*        Form  x := inv( A' )*x  or  x := inv( conjg( A') )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -612,9 +612,9 @@
     bla_integer info;
     bla_double temp;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical nounit;
 
 /*     .. Scalar Arguments .. */
@@ -771,14 +771,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTEF77(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -791,7 +791,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DTBSV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("DTBSV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -801,7 +801,7 @@
 	return 0;
     }
 
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    nounit = PASTEF77(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -815,11 +815,11 @@
 /*     Start the operations. In this version the elements of A are */
 /*     accessed by sequentially with one pass through A. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -914,7 +914,7 @@
 
 /*        Form  x := inv( A')*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -1027,9 +1027,9 @@
     bla_integer info;
     bla_real temp;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical nounit;
 
 /*     .. Scalar Arguments .. */
@@ -1186,14 +1186,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTEF77(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -1206,7 +1206,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("STBSV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("STBSV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1216,7 +1216,7 @@
 	return 0;
     }
 
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    nounit = PASTEF77(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -1230,11 +1230,11 @@
 /*     Start the operations. In this version the elements of A are */
 /*     accessed by sequentially with one pass through A. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -1329,7 +1329,7 @@
 
 /*        Form  x := inv( A')*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -1447,9 +1447,9 @@
     bla_integer info;
     bla_dcomplex temp;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj, nounit;
 
 /*     .. Scalar Arguments .. */
@@ -1606,14 +1606,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTEF77(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -1626,7 +1626,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZTBSV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("ZTBSV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1636,8 +1636,8 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    noconj = PASTEF77(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = PASTEF77(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -1651,11 +1651,11 @@
 /*     Start the operations. In this version the elements of A are */
 /*     accessed by sequentially with one pass through A. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -1786,7 +1786,7 @@
 
 /*        Form  x := inv( A' )*x  or  x := inv( conjg( A') )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		i__1 = *n;
diff --git a/frame/compat/f2c/bla_tpmv.c b/frame/compat/f2c/bla_tpmv.c
index 8fa46f4c4..853f30156 100644
--- a/frame/compat/f2c/bla_tpmv.c
+++ b/frame/compat/f2c/bla_tpmv.c
@@ -54,9 +54,9 @@
     bla_integer info;
     bla_scomplex temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj, nounit;
 
 /*     .. Scalar Arguments .. */
@@ -167,14 +167,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTEF77(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -183,7 +183,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CTPMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("CTPMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -193,8 +193,8 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    noconj = PASTEF77(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = PASTEF77(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -208,11 +208,11 @@
 /*     Start the operations. In this version the elements of AP are */
 /*     accessed sequentially with one pass through AP. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x:= A*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -346,7 +346,7 @@
 
 /*        Form  x := A'*x  or  x := conjg( A' )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = *n * (*n + 1) / 2;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -551,9 +551,9 @@
     bla_integer info;
     bla_double temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical nounit;
 
 /*     .. Scalar Arguments .. */
@@ -663,14 +663,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTEF77(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -679,7 +679,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DTPMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("DTPMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -689,7 +689,7 @@
 	return 0;
     }
 
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    nounit = PASTEF77(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -703,11 +703,11 @@
 /*     Start the operations. In this version the elements of AP are */
 /*     accessed sequentially with one pass through AP. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x:= A*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -797,7 +797,7 @@
 
 /*        Form  x := A'*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = *n * (*n + 1) / 2;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -899,9 +899,9 @@
     bla_integer info;
     bla_real temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical nounit;
 
 /*     .. Scalar Arguments .. */
@@ -1011,14 +1011,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTEF77(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -1027,7 +1027,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("STPMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("STPMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1037,7 +1037,7 @@
 	return 0;
     }
 
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    nounit = PASTEF77(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -1051,11 +1051,11 @@
 /*     Start the operations. In this version the elements of AP are */
 /*     accessed sequentially with one pass through AP. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x:= A*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -1145,7 +1145,7 @@
 
 /*        Form  x := A'*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = *n * (*n + 1) / 2;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -1251,9 +1251,9 @@
     bla_integer info;
     bla_dcomplex temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj, nounit;
 
 /*     .. Scalar Arguments .. */
@@ -1364,14 +1364,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTEF77(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -1380,7 +1380,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZTPMV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("ZTPMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1390,8 +1390,8 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    noconj = PASTEF77(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = PASTEF77(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -1405,11 +1405,11 @@
 /*     Start the operations. In this version the elements of AP are */
 /*     accessed sequentially with one pass through AP. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x:= A*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -1543,7 +1543,7 @@
 
 /*        Form  x := A'*x  or  x := conjg( A' )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = *n * (*n + 1) / 2;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
diff --git a/frame/compat/f2c/bla_tpsv.c b/frame/compat/f2c/bla_tpsv.c
index 076494097..6a4a5ab6c 100644
--- a/frame/compat/f2c/bla_tpsv.c
+++ b/frame/compat/f2c/bla_tpsv.c
@@ -54,9 +54,9 @@
     bla_integer info;
     bla_scomplex temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj, nounit;
 
 /*     .. Scalar Arguments .. */
@@ -170,14 +170,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTEF77(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -186,7 +186,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CTPSV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("CTPSV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -196,8 +196,8 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    noconj = PASTEF77(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = PASTEF77(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -211,11 +211,11 @@
 /*     Start the operations. In this version the elements of AP are */
 /*     accessed sequentially with one pass through AP. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = *n * (*n + 1) / 2;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -339,7 +339,7 @@
 
 /*        Form  x := inv( A' )*x  or  x := inv( conjg( A' ) )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -543,9 +543,9 @@
     bla_integer info;
     bla_double temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical nounit;
 
 /*     .. Scalar Arguments .. */
@@ -658,14 +658,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTEF77(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -674,7 +674,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DTPSV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("DTPSV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -684,7 +684,7 @@
 	return 0;
     }
 
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    nounit = PASTEF77(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -698,11 +698,11 @@
 /*     Start the operations. In this version the elements of AP are */
 /*     accessed sequentially with one pass through AP. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = *n * (*n + 1) / 2;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -790,7 +790,7 @@
 
 /*        Form  x := inv( A' )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -894,9 +894,9 @@
     bla_integer info;
     bla_real temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical nounit;
 
 /*     .. Scalar Arguments .. */
@@ -1009,14 +1009,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTEF77(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -1025,7 +1025,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("STPSV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("STPSV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1035,7 +1035,7 @@
 	return 0;
     }
 
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    nounit = PASTEF77(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -1049,11 +1049,11 @@
 /*     Start the operations. In this version the elements of AP are */
 /*     accessed sequentially with one pass through AP. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = *n * (*n + 1) / 2;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -1141,7 +1141,7 @@
 
 /*        Form  x := inv( A' )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -1250,9 +1250,9 @@
     bla_integer info;
     bla_dcomplex temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTEF77(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTEF77(xerbla)(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj, nounit;
 
 /*     .. Scalar Arguments .. */
@@ -1366,14 +1366,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTEF77(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF77(lsame)(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -1382,7 +1382,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZTPSV ", &info, (ftnlen)6);
+	PASTEF77(xerbla)("ZTPSV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1392,8 +1392,8 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    noconj = PASTEF77(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = PASTEF77(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -1407,11 +1407,11 @@
 /*     Start the operations. In this version the elements of AP are */
 /*     accessed sequentially with one pass through AP. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTEF77(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = *n * (*n + 1) / 2;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -1535,7 +1535,7 @@
 
 /*        Form  x := inv( A' )*x  or  x := inv( conjg( A' ) )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTEF77(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = 1;
 	    if (*incx == 1) {
 		i__1 = *n;
diff --git a/frame/compat/f2c/bla_xerbla.c b/frame/compat/f2c/bla_xerbla.c
index a6500c443..991ef00d0 100644
--- a/frame/compat/f2c/bla_xerbla.c
+++ b/frame/compat/f2c/bla_xerbla.c
@@ -43,7 +43,7 @@
 
 /* Table of constant values */
 
-/* Subroutine */ int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len)
+/* Subroutine */ int PASTEF77(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len)
 {
 /*  -- LAPACK auxiliary routine (preliminary version) -- */
 /*     Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., */
diff --git a/frame/compat/f2c/bla_xerbla.h b/frame/compat/f2c/bla_xerbla.h
index 15635ddad..6824a5688 100644
--- a/frame/compat/f2c/bla_xerbla.h
+++ b/frame/compat/f2c/bla_xerbla.h
@@ -34,6 +34,6 @@
 
 #if 1
 
-BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);
+BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF77(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);
 
 #endif
diff --git a/frame/compat/f2c/bla_xerbla_array.c b/frame/compat/f2c/bla_xerbla_array.c
index 722bb2914..b69775d3b 100644
--- a/frame/compat/f2c/bla_xerbla_array.c
+++ b/frame/compat/f2c/bla_xerbla_array.c
@@ -38,7 +38,7 @@
 
 #define MAX_NUM_CHARS 32
 
-int PASTEF770(xerbla_array)(const bla_character *srname_array, const bla_integer srname_len, const bla_integer *info)
+int PASTEF77(xerbla_array)(const bla_character *srname_array, const bla_integer srname_len, const bla_integer *info)
 {
 	int  i;
 #if 1
@@ -65,7 +65,7 @@ int PASTEF770(xerbla_array)(const bla_character *srname_array, const bla_integer
 	srname[i] = '\0';
 
 	// Call xerbla_().
-	PASTEF770(xerbla)( srname, info, ( ftnlen )srname_len );
+	PASTEF77(xerbla)( srname, info, ( ftnlen )srname_len );
 
 	return 0;
 }
diff --git a/frame/compat/f2c/bla_xerbla_array.h b/frame/compat/f2c/bla_xerbla_array.h
index b6248c029..4684b942f 100644
--- a/frame/compat/f2c/bla_xerbla_array.h
+++ b/frame/compat/f2c/bla_xerbla_array.h
@@ -34,6 +34,6 @@
 
 #if 1
 
-BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);
+BLIS_EXPORT_BLAS int PASTEF77(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);
 
 #endif
diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h
index 361e9663d..a35bb7746 100644
--- a/frame/include/bli_arch_config.h
+++ b/frame/include/bli_arch_config.h
@@ -41,133 +41,10 @@
 // -- Context initialization prototypes ----------------------------------------
 //
 
-// -- Intel64 architectures --
-
-#ifdef BLIS_CONFIG_SKX
-CNTX_INIT_PROTS( skx )
-#endif
-#ifdef BLIS_CONFIG_KNL
-CNTX_INIT_PROTS( knl )
-#endif
-#ifdef BLIS_CONFIG_KNC
-CNTX_INIT_PROTS( knc )
-#endif
-#ifdef BLIS_CONFIG_HASWELL
-CNTX_INIT_PROTS( haswell )
-#endif
-#ifdef BLIS_CONFIG_SANDYBRIDGE
-CNTX_INIT_PROTS( sandybridge )
-#endif
-#ifdef BLIS_CONFIG_PENRYN
-CNTX_INIT_PROTS( penryn )
-#endif
-
-// -- AMD64 architectures --
-
-#ifdef BLIS_CONFIG_ZEN3
-CNTX_INIT_PROTS( zen3 )
-#endif
-#ifdef BLIS_CONFIG_ZEN2
-CNTX_INIT_PROTS( zen2 )
-#endif
-#ifdef BLIS_CONFIG_ZEN
-CNTX_INIT_PROTS( zen )
-#endif
-#ifdef BLIS_CONFIG_EXCAVATOR
-CNTX_INIT_PROTS( excavator )
-#endif
-#ifdef BLIS_CONFIG_STEAMROLLER
-CNTX_INIT_PROTS( steamroller )
-#endif
-#ifdef BLIS_CONFIG_PILEDRIVER
-CNTX_INIT_PROTS( piledriver )
-#endif
-#ifdef BLIS_CONFIG_BULLDOZER
-CNTX_INIT_PROTS( bulldozer )
-#endif
-
-// -- ARM architectures --
-
-// ARM-SVE
-#ifdef BLIS_CONFIG_ARMSVE
-CNTX_INIT_PROTS( armsve )
-#endif
-#ifdef BLIS_CONFIG_A64FX
-CNTX_INIT_PROTS( a64fx )
-#endif
-// ARM-NEON (4x128)
-#ifdef BLIS_CONFIG_ALTRAMAX
-CNTX_INIT_PROTS( altramax )
-#endif
-#ifdef BLIS_CONFIG_ALTRA
-CNTX_INIT_PROTS( altra )
-#endif
-#ifdef BLIS_CONFIG_FIRESTORM
-CNTX_INIT_PROTS( firestorm )
-#endif
-// ARM-NEON (2x128)
-#ifdef BLIS_CONFIG_THUNDERX2
-CNTX_INIT_PROTS( thunderx2 )
-#endif
-#ifdef BLIS_CONFIG_CORTEXA57
-CNTX_INIT_PROTS( cortexa57 )
-#endif
-#ifdef BLIS_CONFIG_CORTEXA53
-CNTX_INIT_PROTS( cortexa53 )
-#endif
-// ARM 32-bit (vintage)
-#ifdef BLIS_CONFIG_CORTEXA15
-CNTX_INIT_PROTS( cortexa15 )
-#endif
-#ifdef BLIS_CONFIG_CORTEXA9
-CNTX_INIT_PROTS( cortexa9 )
-#endif
-
-// -- IBM Power --
-
-#ifdef BLIS_CONFIG_POWER10
-CNTX_INIT_PROTS( power10 )
-#endif
-#ifdef BLIS_CONFIG_POWER9
-CNTX_INIT_PROTS( power9 )
-#endif
-#ifdef BLIS_CONFIG_POWER7
-CNTX_INIT_PROTS( power7 )
-#endif
-
-// -- IBM BG/Q --
-
-#ifdef BLIS_CONFIG_BGQ
-CNTX_INIT_PROTS( bgq )
-#endif
-
-// -- RISC-V --
-
-#ifdef BLIS_CONFIG_RV32I
-CNTX_INIT_PROTS( rv32i )
-#endif
-#ifdef BLIS_CONFIG_RV64I
-CNTX_INIT_PROTS( rv64i )
-#endif
-#ifdef BLIS_CONFIG_RV32IV
-CNTX_INIT_PROTS( rv32iv )
-#endif
-#ifdef BLIS_CONFIG_RV64IV
-CNTX_INIT_PROTS( rv64iv )
-#endif
-
-// -- SiFive architectures --
-
-#ifdef BLIS_CONFIG_SIFIVE_X280
-CNTX_INIT_PROTS( sifive_x280 )
-#endif
-
-// -- Generic --
-
-#ifdef BLIS_CONFIG_GENERIC
-CNTX_INIT_PROTS( generic )
-#endif
+#undef GENTCONF
+#define GENTCONF( CONFIG, config ) CNTX_INIT_PROTS( config )
 
+INSERT_GENTCONF
 
 //
 // -- Architecture family-specific headers -------------------------------------
diff --git a/frame/include/bli_arch_config_pre.h b/frame/include/bli_arch_config_pre.h
index 86c599230..7392281b9 100644
--- a/frame/include/bli_arch_config_pre.h
+++ b/frame/include/bli_arch_config_pre.h
@@ -49,9 +49,19 @@
 #define BLIS_CNAME_INFIX  PASTECH(_,BLIS_CNAME)
 #endif
 
+// Prepend an underscore to BLIS_CNAME_UPPER, if it was defined.
+#ifdef  BLIS_CNAME_UPPER
+#define BLIS_CNAME_UPPER_INFIX  PASTECH(_,BLIS_CNAME_UPPER)
+#endif
+
+// Prepend an underscore to the plugin name (BLIS_PNAME), if it was defined.
+#ifdef  BLIS_PNAME
+#define BLIS_PNAME_INFIX  PASTECH(_,BLIS_PNAME)
+#endif
+
 // Combine the CNAME and _ref for convenience to the code that defines
 // reference kernels.
-//#define BLIS_CNAME_REF_SUFFIX  PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX)
+//#define BLIS_CNAME_REF_SUFFIX  PASTECH(_,BLIS_CNAME,BLIS_REF_SUFFIX)
 
 // -- Prototype-generating macro definitions -----------------------------------
 
@@ -62,11 +72,11 @@ void PASTEMAC(cntx_init_,archname) \
      ( \
        cntx_t* cntx \
      ); \
-void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \
+void PASTEMAC(cntx_init_,archname,BLIS_REF_SUFFIX) \
      ( \
        cntx_t* cntx \
      ); \
-void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \
+void PASTEMAC(cntx_init_,archname,BLIS_IND_SUFFIX) \
      ( \
        ind_t   method, \
        cntx_t* cntx \
diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h
index 4bdbb7b78..86b41b80f 100644
--- a/frame/include/bli_config_macro_defs.h
+++ b/frame/include/bli_config_macro_defs.h
@@ -150,25 +150,6 @@
 #endif
 
 
-// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
-
-// Enable mixed datatype support?
-#ifdef BLIS_DISABLE_MIXED_DT
-  #undef BLIS_ENABLE_GEMM_MD
-#else
-  // Default behavior is enabled.
-  #define BLIS_ENABLE_GEMM_MD
-#endif
-
-// Enable memory-intensive optimizations for mixed datatype support?
-#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
-  #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
-#else
-  // Default behavior is enabled.
-  #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM
-#endif
-
-
 // -- MISCELLANEOUS OPTIONS ----------------------------------------------------
 
 // Do NOT require the cross-blocksize constraints. That is, do not enforce
diff --git a/frame/include/bli_genarray_macro_defs.h b/frame/include/bli_genarray_macro_defs.h
index eb932c558..4ec89d948 100644
--- a/frame/include/bli_genarray_macro_defs.h
+++ b/frame/include/bli_genarray_macro_defs.h
@@ -69,10 +69,10 @@ static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \
 \
 static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
 { \
-	{ ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \
-	{ ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \
-	{ ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \
-	{ ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) }  \
+	{ ( tname )PASTEMAC(s,s,op), ( tname )PASTEMAC(s,c,op), ( tname )PASTEMAC(s,d,op), ( tname )PASTEMAC(s,z,op) }, \
+	{ ( tname )PASTEMAC(c,s,op), ( tname )PASTEMAC(c,c,op), ( tname )PASTEMAC(c,d,op), ( tname )PASTEMAC(c,z,op) }, \
+	{ ( tname )PASTEMAC(d,s,op), ( tname )PASTEMAC(d,c,op), ( tname )PASTEMAC(d,d,op), ( tname )PASTEMAC(d,z,op) }, \
+	{ ( tname )PASTEMAC(z,s,op), ( tname )PASTEMAC(z,c,op), ( tname )PASTEMAC(z,d,op), ( tname )PASTEMAC(z,z,op) }  \
 }
 
 // -- "Smart" two-operand macro --
@@ -82,10 +82,10 @@ static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
 \
 arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
 { \
-	{ PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \
-	{ PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \
-	{ PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
-	{ PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
+	{ PASTEMAC(s,s,op), PASTEMAC(s,c,op), PASTEMAC(s,d,op), PASTEMAC(s,z,op) }, \
+	{ PASTEMAC(c,s,op), PASTEMAC(c,c,op), PASTEMAC(c,d,op), PASTEMAC(c,z,op) }, \
+	{ PASTEMAC(d,s,op), PASTEMAC(d,c,op), PASTEMAC(d,d,op), PASTEMAC(d,z,op) }, \
+	{ PASTEMAC(z,s,op), PASTEMAC(z,c,op), PASTEMAC(z,d,op), PASTEMAC(z,z,op) }  \
 }
 */
 
@@ -119,10 +119,10 @@ arrayname[BLIS_NUM_FP_TYPES+1] = \
 \
 arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
 { \
-	{ PASTEMAC2(s,s,op), NULL,              PASTEMAC2(s,d,op), NULL,             }, \
-	{ PASTEMAC2(c,s,op), NULL,              PASTEMAC2(c,d,op), NULL,             }, \
-	{ PASTEMAC2(d,s,op), NULL,              PASTEMAC2(d,d,op), NULL,             }, \
-	{ PASTEMAC2(z,s,op), NULL,              PASTEMAC2(z,d,op), NULL,             }  \
+	{ PASTEMAC(s,s,op), NULL,              PASTEMAC(s,d,op), NULL,             }, \
+	{ PASTEMAC(c,s,op), NULL,              PASTEMAC(c,d,op), NULL,             }, \
+	{ PASTEMAC(d,s,op), NULL,              PASTEMAC(d,d,op), NULL,             }, \
+	{ PASTEMAC(z,s,op), NULL,              PASTEMAC(z,d,op), NULL,             }  \
 }
 */
 
@@ -134,10 +134,10 @@ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
 \
 arrayname[BLIS_NUM_FP_TYPES] = \
 { \
-	PASTECH2(prefix,s,op), \
-	PASTECH2(prefix,c,op), \
-	PASTECH2(prefix,d,op), \
-	PASTECH2(prefix,z,op)  \
+	PASTECH(prefix,s,op), \
+	PASTECH(prefix,c,op), \
+	PASTECH(prefix,d,op), \
+	PASTECH(prefix,z,op)  \
 }
 
 
@@ -149,10 +149,21 @@ arrayname[BLIS_NUM_FP_TYPES] = \
 \
 arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
 { \
-	{ PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \
-	{ PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \
-	{ PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
-	{ PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
+	{ PASTEMAC(s,s,op), PASTEMAC(s,c,op), PASTEMAC(s,d,op), PASTEMAC(s,z,op) }, \
+	{ PASTEMAC(c,s,op), PASTEMAC(c,c,op), PASTEMAC(c,d,op), PASTEMAC(c,z,op) }, \
+	{ PASTEMAC(d,s,op), PASTEMAC(d,c,op), PASTEMAC(d,d,op), PASTEMAC(d,z,op) }, \
+	{ PASTEMAC(z,s,op), PASTEMAC(z,c,op), PASTEMAC(z,d,op), PASTEMAC(z,z,op) }  \
+}
+
+
+#define GENARRAY2_MIXP(arrayname,op) \
+\
+arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
+{ \
+	{ PASTEMAC(s,s,op), NULL,              PASTEMAC(s,d,op), NULL,             }, \
+	{ NULL,              PASTEMAC(c,c,op), NULL,              PASTEMAC(c,z,op) }, \
+	{ PASTEMAC(d,s,op), NULL,              PASTEMAC(d,d,op), NULL,             }, \
+	{ NULL,              PASTEMAC(z,c,op), NULL,              PASTEMAC(z,z,op) }  \
 }
 
 
@@ -160,10 +171,10 @@ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
 \
 arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
 { \
-	{ PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL,              NULL,             }, \
-	{ PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL,              NULL,             }, \
-	{ NULL,              NULL,              PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
-	{ NULL,              NULL,              PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
+	{ PASTEMAC(s,s,op), PASTEMAC(s,c,op), NULL,              NULL,             }, \
+	{ PASTEMAC(c,s,op), PASTEMAC(c,c,op), NULL,              NULL,             }, \
+	{ NULL,              NULL,              PASTEMAC(d,d,op), PASTEMAC(d,z,op) }, \
+	{ NULL,              NULL,              PASTEMAC(z,d,op), PASTEMAC(z,z,op) }  \
 }
 
 
@@ -171,10 +182,10 @@ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
 \
 arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
 { \
-	{ PASTEMAC2(s,s,op), NULL,              NULL,              NULL,             }, \
-	{ NULL,              PASTEMAC2(c,c,op), NULL,              NULL,             }, \
-	{ NULL,              NULL,              PASTEMAC2(d,d,op), NULL,             }, \
-	{ NULL,              NULL,              NULL,              PASTEMAC2(z,z,op) }  \
+	{ PASTEMAC(s,s,op), NULL,              NULL,              NULL,             }, \
+	{ NULL,              PASTEMAC(c,c,op), NULL,              NULL,             }, \
+	{ NULL,              NULL,              PASTEMAC(d,d,op), NULL,             }, \
+	{ NULL,              NULL,              NULL,              PASTEMAC(z,z,op) }  \
 }
 
 
@@ -186,28 +197,28 @@ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
 arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
 { \
 	{ \
-	{ PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \
-	{ PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \
-	{ PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \
-	{ PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) }  \
+	{ PASTEMAC(s,s,s,op), PASTEMAC(s,s,c,op), PASTEMAC(s,s,d,op), PASTEMAC(s,s,z,op) }, \
+	{ PASTEMAC(s,c,s,op), PASTEMAC(s,c,c,op), PASTEMAC(s,c,d,op), PASTEMAC(s,c,z,op) }, \
+	{ PASTEMAC(s,d,s,op), PASTEMAC(s,d,c,op), PASTEMAC(s,d,d,op), PASTEMAC(s,d,z,op) }, \
+	{ PASTEMAC(s,z,s,op), PASTEMAC(s,z,c,op), PASTEMAC(s,z,d,op), PASTEMAC(s,z,z,op) }  \
 	}, \
 	{ \
-	{ PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \
-	{ PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \
-	{ PASTEMAC3(c,d,s,op), PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \
-	{ PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) }  \
+	{ PASTEMAC(c,s,s,op), PASTEMAC(c,s,c,op), PASTEMAC(c,s,d,op), PASTEMAC(c,s,z,op) }, \
+	{ PASTEMAC(c,c,s,op), PASTEMAC(c,c,c,op), PASTEMAC(c,c,d,op), PASTEMAC(c,c,z,op) }, \
+	{ PASTEMAC(c,d,s,op), PASTEMAC(c,d,c,op), PASTEMAC(c,d,d,op), PASTEMAC(c,d,z,op) }, \
+	{ PASTEMAC(c,z,s,op), PASTEMAC(c,z,c,op), PASTEMAC(c,z,d,op), PASTEMAC(c,z,z,op) }  \
 	}, \
 	{ \
-	{ PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \
-	{ PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \
-	{ PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
-	{ PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
+	{ PASTEMAC(d,s,s,op), PASTEMAC(d,s,c,op), PASTEMAC(d,s,d,op), PASTEMAC(d,s,z,op) }, \
+	{ PASTEMAC(d,c,s,op), PASTEMAC(d,c,c,op), PASTEMAC(d,c,d,op), PASTEMAC(d,c,z,op) }, \
+	{ PASTEMAC(d,d,s,op), PASTEMAC(d,d,c,op), PASTEMAC(d,d,d,op), PASTEMAC(d,d,z,op) }, \
+	{ PASTEMAC(d,z,s,op), PASTEMAC(d,z,c,op), PASTEMAC(d,z,d,op), PASTEMAC(d,z,z,op) }  \
 	}, \
 	{ \
-	{ PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \
-	{ PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \
-	{ PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
-	{ PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
+	{ PASTEMAC(z,s,s,op), PASTEMAC(z,s,c,op), PASTEMAC(z,s,d,op), PASTEMAC(z,s,z,op) }, \
+	{ PASTEMAC(z,c,s,op), PASTEMAC(z,c,c,op), PASTEMAC(z,c,d,op), PASTEMAC(z,c,z,op) }, \
+	{ PASTEMAC(z,d,s,op), PASTEMAC(z,d,c,op), PASTEMAC(z,d,d,op), PASTEMAC(z,d,z,op) }, \
+	{ PASTEMAC(z,z,s,op), PASTEMAC(z,z,c,op), PASTEMAC(z,z,d,op), PASTEMAC(z,z,z,op) }  \
 	} \
 }
 
@@ -217,28 +228,28 @@ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
 arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
 { \
 	{ \
-	{ PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL,                NULL,               }, \
-	{ PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL,                NULL,               }, \
+	{ PASTEMAC(s,s,s,op), PASTEMAC(s,s,c,op), NULL,                NULL,               }, \
+	{ PASTEMAC(s,c,s,op), PASTEMAC(s,c,c,op), NULL,                NULL,               }, \
 	{ NULL,                NULL,                NULL,                NULL,               }, \
 	{ NULL,                NULL,                NULL,                NULL,               }  \
 	}, \
 	{ \
-	{ PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL,                NULL,               }, \
-	{ PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
+	{ PASTEMAC(c,s,s,op), PASTEMAC(c,s,c,op), NULL,                NULL,               }, \
+	{ PASTEMAC(c,c,s,op), PASTEMAC(c,c,c,op), NULL,                NULL,               }, \
 	{ NULL,                NULL,                NULL,                NULL,               }, \
 	{ NULL,                NULL,                NULL,                NULL,               }  \
 	}, \
 	{ \
 	{ NULL,                NULL,                NULL,                NULL,               }, \
 	{ NULL,                NULL,                NULL,                NULL,               }, \
-	{ NULL,                NULL,                PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
-	{ NULL,                NULL,                PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
+	{ NULL,                NULL,                PASTEMAC(d,d,d,op), PASTEMAC(d,d,z,op) }, \
+	{ NULL,                NULL,                PASTEMAC(d,z,d,op), PASTEMAC(d,z,z,op) }  \
 	}, \
 	{ \
 	{ NULL,                NULL,                NULL,                NULL,               }, \
 	{ NULL,                NULL,                NULL,                NULL,               }, \
-	{ NULL,                NULL,                PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
-	{ NULL,                NULL,                PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
+	{ NULL,                NULL,                PASTEMAC(z,d,d,op), PASTEMAC(z,d,z,op) }, \
+	{ NULL,                NULL,                PASTEMAC(z,z,d,op), PASTEMAC(z,z,z,op) }  \
 	} \
 }
 
@@ -248,28 +259,28 @@ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
 arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
 { \
 	{ \
-	{ PASTEMAC3(s,s,s,op), NULL,                NULL,                NULL,               }, \
+	{ PASTEMAC(s,s,s,op), NULL,                NULL,                NULL,               }, \
 	{ NULL,                NULL,                NULL,                NULL,               }, \
 	{ NULL,                NULL,                NULL,                NULL,               }, \
 	{ NULL,                NULL,                NULL,                NULL,               }  \
 	}, \
 	{ \
 	{ NULL,                NULL,                NULL,                NULL,               }, \
-	{ NULL,                PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
+	{ NULL,                PASTEMAC(c,c,c,op), NULL,                NULL,               }, \
 	{ NULL,                NULL,                NULL,                NULL,               }, \
 	{ NULL,                NULL,                NULL,                NULL,               }  \
 	}, \
 	{ \
 	{ NULL,                NULL,                NULL,                NULL,               }, \
 	{ NULL,                NULL,                NULL,                NULL,               }, \
-	{ NULL,                NULL,                PASTEMAC3(d,d,d,op), NULL,               }, \
+	{ NULL,                NULL,                PASTEMAC(d,d,d,op), NULL,               }, \
 	{ NULL,                NULL,                NULL,                NULL,               }  \
 	}, \
 	{ \
 	{ NULL,                NULL,                NULL,                NULL,               }, \
 	{ NULL,                NULL,                NULL,                NULL,               }, \
 	{ NULL,                NULL,                NULL,                NULL,               }, \
-	{ NULL,                NULL,                NULL,                PASTEMAC3(z,z,z,op) }  \
+	{ NULL,                NULL,                NULL,                PASTEMAC(z,z,z,op) }  \
 	} \
 }
 
diff --git a/frame/include/bli_gentconf_macro_defs.h b/frame/include/bli_gentconf_macro_defs.h
new file mode 100644
index 000000000..70414fb47
--- /dev/null
+++ b/frame/include/bli_gentconf_macro_defs.h
@@ -0,0 +1,288 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Southern Methodist University
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_GENTCONF_MACRO_DEFS_H
+#define BLIS_GENTCONF_MACRO_DEFS_H
+
+
+//
+// -- MACROS TO INSERT CONFIGURATION-SPECIFIC MACROS ---------------------------
+//
+
+
+// -- configuration-specific macros which are conditionally-enabled --
+
+// -- Intel architectures ------------------------------------------------------
+
+#ifdef BLIS_CONFIG_SKX
+#define INSERT_GENTCONF_SKX GENTCONF( SKX, skx )
+#else
+#define INSERT_GENTCONF_SKX
+#endif
+#ifdef BLIS_CONFIG_KNL
+#define INSERT_GENTCONF_KNL GENTCONF( KNL, knl )
+#else
+#define INSERT_GENTCONF_KNL
+#endif
+#ifdef BLIS_CONFIG_KNC
+#define INSERT_GENTCONF_KNC GENTCONF( KNC, knc )
+#else
+#define INSERT_GENTCONF_KNC
+#endif
+#ifdef BLIS_CONFIG_HASWELL
+#define INSERT_GENTCONF_HASWELL GENTCONF( HASWELL, haswell )
+#else
+#define INSERT_GENTCONF_HASWELL
+#endif
+#ifdef BLIS_CONFIG_SANDYBRIDGE
+#define INSERT_GENTCONF_SANDYBRIDGE GENTCONF( SANDYBRIDGE, sandybridge )
+#else
+#define INSERT_GENTCONF_SANDYBRIDGE
+#endif
+#ifdef BLIS_CONFIG_PENRYN
+#define INSERT_GENTCONF_PENRYN GENTCONF( PENRYN, penryn )
+#else
+#define INSERT_GENTCONF_PENRYN
+#endif
+
+// -- AMD architectures --------------------------------------------------------
+
+#ifdef BLIS_CONFIG_ZEN3
+#define INSERT_GENTCONF_ZEN3 GENTCONF( ZEN3, zen3 )
+#else
+#define INSERT_GENTCONF_ZEN3
+#endif
+#ifdef BLIS_CONFIG_ZEN2
+#define INSERT_GENTCONF_ZEN2 GENTCONF( ZEN2, zen2 )
+#else
+#define INSERT_GENTCONF_ZEN2
+#endif
+#ifdef BLIS_CONFIG_ZEN
+#define INSERT_GENTCONF_ZEN GENTCONF( ZEN, zen )
+#else
+#define INSERT_GENTCONF_ZEN
+#endif
+#ifdef BLIS_CONFIG_EXCAVATOR
+#define INSERT_GENTCONF_EXCAVATOR GENTCONF( EXCAVATOR, excavator )
+#else
+#define INSERT_GENTCONF_EXCAVATOR
+#endif
+#ifdef BLIS_CONFIG_STEAMROLLER
+#define INSERT_GENTCONF_STEAMROLLER GENTCONF( STEAMROLLER, steamroller )
+#else
+#define INSERT_GENTCONF_STEAMROLLER
+#endif
+#ifdef BLIS_CONFIG_PILEDRIVER
+#define INSERT_GENTCONF_PILEDRIVER GENTCONF( PILEDRIVER, piledriver )
+#else
+#define INSERT_GENTCONF_PILEDRIVER
+#endif
+#ifdef BLIS_CONFIG_BULLDOZER
+#define INSERT_GENTCONF_BULLDOZER GENTCONF( BULLDOZER, bulldozer )
+#else
+#define INSERT_GENTCONF_BULLDOZER
+#endif
+
+// -- ARM architectures --------------------------------------------------------
+
+// -- ARM-SVE --
+#ifdef BLIS_CONFIG_ARMSVE
+#define INSERT_GENTCONF_ARMSVE GENTCONF( ARMSVE, armsve )
+#else
+#define INSERT_GENTCONF_ARMSVE
+#endif
+#ifdef BLIS_CONFIG_A64FX
+#define INSERT_GENTCONF_A64FX GENTCONF( A64FX, a64fx )
+#else
+#define INSERT_GENTCONF_A64FX
+#endif
+
+// -- ARM-NEON (4 pipes x 128-bit vectors) --
+#ifdef BLIS_CONFIG_ALTRA
+#define INSERT_GENTCONF_ALTRA GENTCONF( ALTRA, altra )
+#else
+#define INSERT_GENTCONF_ALTRA
+#endif
+#ifdef BLIS_CONFIG_ALTRAMAX
+#define INSERT_GENTCONF_ALTRAMAX GENTCONF( ALTRAMAX, altramax )
+#else
+#define INSERT_GENTCONF_ALTRAMAX
+#endif
+#ifdef BLIS_CONFIG_FIRESTORM
+#define INSERT_GENTCONF_FIRESTORM GENTCONF( FIRESTORM, firestorm )
+#else
+#define INSERT_GENTCONF_FIRESTORM
+#endif
+
+// -- ARM (2 pipes x 128-bit vectors) --
+#ifdef BLIS_CONFIG_THUNDERX2
+#define INSERT_GENTCONF_THUNDERX2 GENTCONF( THUNDERX2, thunderx2 )
+#else
+#define INSERT_GENTCONF_THUNDERX2
+#endif
+#ifdef BLIS_CONFIG_CORTEXA57
+#define INSERT_GENTCONF_CORTEXA57 GENTCONF( CORTEXA57, cortexa57 )
+#else
+#define INSERT_GENTCONF_CORTEXA57
+#endif
+#ifdef BLIS_CONFIG_CORTEXA53
+#define INSERT_GENTCONF_CORTEXA53 GENTCONF( CORTEXA53, cortexa53 )
+#else
+#define INSERT_GENTCONF_CORTEXA53
+#endif
+
+// -- ARM (older 32-bit microarchitectures) --
+#ifdef BLIS_CONFIG_CORTEXA15
+#define INSERT_GENTCONF_CORTEXA15 GENTCONF( CORTEXA15, cortexa15 )
+#else
+#define INSERT_GENTCONF_CORTEXA15
+#endif
+#ifdef BLIS_CONFIG_CORTEXA9
+#define INSERT_GENTCONF_CORTEXA9 GENTCONF( CORTEXA9, cortexa9 )
+#else
+#define INSERT_GENTCONF_CORTEXA9
+#endif
+
+// -- IBM architectures --------------------------------------------------------
+
+#ifdef BLIS_CONFIG_POWER10
+#define INSERT_GENTCONF_POWER10 GENTCONF( POWER10, power10 )
+#else
+#define INSERT_GENTCONF_POWER10
+#endif
+#ifdef BLIS_CONFIG_POWER9
+#define INSERT_GENTCONF_POWER9 GENTCONF( POWER9, power9 )
+#else
+#define INSERT_GENTCONF_POWER9
+#endif
+#ifdef BLIS_CONFIG_POWER7
+#define INSERT_GENTCONF_POWER7 GENTCONF( POWER7, power7 )
+#else
+#define INSERT_GENTCONF_POWER7
+#endif
+#ifdef BLIS_CONFIG_BGQ
+#define INSERT_GENTCONF_BGQ GENTCONF( BGQ, bgq )
+#else
+#define INSERT_GENTCONF_BGQ
+#endif
+
+// -- RISC-V architectures ----------------------------------------------------
+
+#ifdef BLIS_CONFIG_RV32I
+#define INSERT_GENTCONF_RV32I GENTCONF( RV32I, rv32i )
+#else
+#define INSERT_GENTCONF_RV32I
+#endif
+#ifdef BLIS_CONFIG_RV64I
+#define INSERT_GENTCONF_RV64I GENTCONF( RV64I, rv64i )
+#else
+#define INSERT_GENTCONF_RV64I
+#endif
+#ifdef BLIS_CONFIG_RV32IV
+#define INSERT_GENTCONF_RV32IV GENTCONF( RV32IV, rv32iv )
+#else
+#define INSERT_GENTCONF_RV32IV
+#endif
+#ifdef BLIS_CONFIG_RV64IV
+#define INSERT_GENTCONF_RV64IV GENTCONF( RV64IV, rv64iv )
+#else
+#define INSERT_GENTCONF_RV64IV
+#endif
+
+// -- SiFive architectures ----------------------------------------------------
+
+#ifdef BLIS_CONFIG_SIFIVE_X280
+#define INSERT_GENTCONF_SIFIVE_X280 GENTCONF( SIFIVE_X280, sifive_x280 )
+#else
+#define INSERT_GENTCONF_SIFIVE_X280
+#endif
+
+// -- Generic architectures ----------------------------------------------------
+
+#ifdef BLIS_CONFIG_GENERIC
+#define INSERT_GENTCONF_GENERIC GENTCONF( GENERIC, generic )
+#else
+#define INSERT_GENTCONF_GENERIC
+#endif
+
+
+// -- configuration-specific macro --
+
+#define INSERT_GENTCONF \
+\
+INSERT_GENTCONF_SKX \
+INSERT_GENTCONF_KNL \
+INSERT_GENTCONF_KNC \
+INSERT_GENTCONF_HASWELL \
+INSERT_GENTCONF_SANDYBRIDGE \
+INSERT_GENTCONF_PENRYN \
+\
+INSERT_GENTCONF_ZEN3 \
+INSERT_GENTCONF_ZEN2 \
+INSERT_GENTCONF_ZEN \
+INSERT_GENTCONF_EXCAVATOR \
+INSERT_GENTCONF_STEAMROLLER \
+INSERT_GENTCONF_PILEDRIVER \
+INSERT_GENTCONF_BULLDOZER \
+\
+INSERT_GENTCONF_ARMSVE \
+INSERT_GENTCONF_A64FX \
+\
+INSERT_GENTCONF_ALTRAMAX \
+INSERT_GENTCONF_ALTRA \
+INSERT_GENTCONF_FIRESTORM \
+\
+INSERT_GENTCONF_THUNDERX2 \
+INSERT_GENTCONF_CORTEXA57 \
+INSERT_GENTCONF_CORTEXA53 \
+\
+INSERT_GENTCONF_CORTEXA15 \
+INSERT_GENTCONF_CORTEXA9 \
+\
+INSERT_GENTCONF_POWER10 \
+INSERT_GENTCONF_POWER9 \
+INSERT_GENTCONF_POWER7 \
+INSERT_GENTCONF_BGQ \
+\
+INSERT_GENTCONF_RV32I \
+INSERT_GENTCONF_RV64I \
+INSERT_GENTCONF_RV32IV \
+INSERT_GENTCONF_RV64IV \
+\
+INSERT_GENTCONF_SIFIVE_X280 \
+\
+INSERT_GENTCONF_GENERIC
+
+
+#endif
diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h
index a8ce2eb2c..8074bb441 100644
--- a/frame/include/bli_gentfunc_macro_defs.h
+++ b/frame/include/bli_gentfunc_macro_defs.h
@@ -261,15 +261,9 @@ GENTFUNC2( dcomplex, double,   z, d, __VA_ARGS__ )
 #define INSERT_GENTFUNC2_MIX_P( ... ) \
 \
 GENTFUNC2( float,    double,   s, d, __VA_ARGS__ ) \
-GENTFUNC2( float,    dcomplex, s, z, __VA_ARGS__ ) \
-\
 GENTFUNC2( double,   float,    d, s, __VA_ARGS__ ) \
-GENTFUNC2( double,   scomplex, d, c, __VA_ARGS__ ) \
 \
-GENTFUNC2( scomplex, double,   c, d, __VA_ARGS__ ) \
 GENTFUNC2( scomplex, dcomplex, c, z, __VA_ARGS__ ) \
-\
-GENTFUNC2( dcomplex, float,    z, s, __VA_ARGS__ ) \
 GENTFUNC2( dcomplex, scomplex, z, c, __VA_ARGS__ )
 
 
@@ -324,19 +318,31 @@ GENTFUNC2R( dcomplex, double,   double,   z, d, d, __VA_ARGS__ )
 #define INSERT_GENTFUNC2R_MIX_P( ... ) \
 \
 GENTFUNC2R( float,    double,   double,   s, d, d, __VA_ARGS__ ) \
-GENTFUNC2R( float,    dcomplex, double,   s, z, d, __VA_ARGS__ ) \
-\
 GENTFUNC2R( double,   float,    float,    d, s, s, __VA_ARGS__ ) \
-GENTFUNC2R( double,   scomplex, float,    d, c, s, __VA_ARGS__ ) \
 \
-GENTFUNC2R( scomplex, double,   double,   c, d, d, __VA_ARGS__ ) \
 GENTFUNC2R( scomplex, dcomplex, double,   c, z, d, __VA_ARGS__ ) \
-\
-GENTFUNC2R( dcomplex, float,    float,    z, s, s, __VA_ARGS__ ) \
 GENTFUNC2R( dcomplex, scomplex, float,    z, c, s, __VA_ARGS__ )
 
 
+// -- Real-only two-operand with complex projection of both operands --
+
+#define INSERT_GENTFUNC2RO( ... ) \
+\
+GENTFUNC2RO( float,  scomplex, float,  scomplex, s, c, s, c, __VA_ARGS__ ) \
+GENTFUNC2RO( double, dcomplex, double, dcomplex, d, z, d, z, __VA_ARGS__ )
+
+
+
+// -- Mixed precision real-only two-operand with complex projection of both operands --
+
+#define INSERT_GENTFUNC2RO_MIX_P( ... ) \
+\
+GENTFUNC2RO( float,  scomplex, double, dcomplex, s, c, d, z, __VA_ARGS__ ) \
+GENTFUNC2RO( double, dcomplex, float,  scomplex, d, z, s, c, __VA_ARGS__ )
+
+
+
 // -- Mixed domain/precision (all) two-operand macro with real projection of second operand --
 
 #define INSERT_GENTFUNC2R_MIX_DP( ... ) \
diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h
index 927c46644..8af3f5a26 100644
--- a/frame/include/bli_macro_defs.h
+++ b/frame/include/bli_macro_defs.h
@@ -45,57 +45,56 @@
 // because sometimes it is needed if, for example, one of the PASTE
 // macros is invoked with an "op" argument that is itself a macro.
 
-#define PASTEMAC0_(op)             bli_ ## op
-#define PASTEMAC0(op)              PASTEMAC0_(op)
-
-#define PASTEMAC_(ch,op)           bli_ ## ch  ## op
-#define PASTEMAC(ch,op)            PASTEMAC_(ch,op)
-
-#define PASTEMAC2_(ch1,ch2,op)     bli_ ## ch1 ## ch2 ## op
-#define PASTEMAC2(ch1,ch2,op)      PASTEMAC2_(ch1,ch2,op)
+#define PASTEMAC0_(op)                         bli_ ## op
+#define PASTEMAC1_(ch,op)                      bli_ ## ch  ## op
+#define PASTEMAC2_(ch1,ch2,op)                 bli_ ## ch1 ## ch2 ## op
+#define PASTEMAC3_(ch1,ch2,ch3,op)             bli_ ## ch1 ## ch2 ## ch3 ## op
+#define PASTEMAC4_(ch1,ch2,ch3,ch4,op)         bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
+#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)     bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
+#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
 
-#define PASTEMAC3_(ch1,ch2,ch3,op) bli_ ## ch1 ## ch2 ## ch3 ## op
-#define PASTEMAC3(ch1,ch2,ch3,op)  PASTEMAC3_(ch1,ch2,ch3,op)
+#define PASTEMAC__(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,...) PASTEMAC ## arg8 ## _
+#define PASTEMAC_(...) PASTEMAC__(__VA_ARGS__, 6, 5, 4, 3, 2, 1, 0, XXX)
+#define PASTEMAC(...) PASTEMAC_(__VA_ARGS__)(__VA_ARGS__)
 
-#define PASTEMAC4_(ch1,ch2,ch3,ch4,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
-#define PASTEMAC4(ch1,ch2,ch3,ch4,op)  PASTEMAC4_(ch1,ch2,ch3,ch4,op)
+#define PASTECH0_(op)                         op
+#define PASTECH1_(ch,op)                      ch  ## op
+#define PASTECH2_(ch1,ch2,op)                 ch1 ## ch2 ## op
+#define PASTECH3_(ch1,ch2,ch3,op)             ch1 ## ch2 ## ch3 ## op
+#define PASTECH4_(ch1,ch2,ch3,ch4,op)         ch1 ## ch2 ## ch3 ## ch4 ## op
+#define PASTECH5_(ch1,ch2,ch3,ch4,ch5,op)     ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
+#define PASTECH6_(ch1,ch2,ch3,ch4,ch5,ch6,op) ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
 
-#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
-#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op)  PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)
+#define PASTECH__(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,...) PASTECH ## arg8 ## _
+#define PASTECH_(...) PASTECH__(__VA_ARGS__, 6, 5, 4, 3, 2, 1, 0, XXX)
+#define PASTECH(...) PASTECH_(__VA_ARGS__)(__VA_ARGS__)
 
-#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
-#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op)  PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)
+// Fortran-77 name-mangling macros.
+#define PASTEF770_(op)                         op  ## _
+#define PASTEF771_(ch,op)                      ch  ## op ## _
+#define PASTEF772_(ch1,ch2,op)                 ch1 ## ch2 ## op ## _
+#define PASTEF773_(ch1,ch2,ch3,op)             ch1 ## ch2 ## ch3 ## op ## _
+#define PASTEF774_(ch1,ch2,ch3,ch4,op)         ch1 ## ch2 ## ch3 ## ch4 ## op ## _
+#define PASTEF775_(ch1,ch2,ch3,ch4,ch5,op)     ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op ## _
+#define PASTEF776_(ch1,ch2,ch3,ch4,ch5,ch6,op) ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op ## _
+
+#define PASTEF77__(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,...) PASTEF77 ## arg8 ## _
+#define PASTEF77_(...) PASTEF77__(__VA_ARGS__, 6, 5, 4, 3, 2, 1, 0, XXX)
+#define PASTEF77(...) PASTEF77_(__VA_ARGS__)(__VA_ARGS__)
 
 #define PASTEBLACHK_(op)           bla_ ## op ## _check
 #define PASTEBLACHK(op)            PASTEBLACHK_(op)
 
-#define PASTECH0_(op)              op
-#define PASTECH0(op)               PASTECH0_(op)
-
-#define PASTECH_(ch,op)            ch ## op
-#define PASTECH(ch,op)             PASTECH_(ch,op)
-
-#define PASTECH2_(ch1,ch2,op)      ch1 ## ch2 ## op
-#define PASTECH2(ch1,ch2,op)       PASTECH2_(ch1,ch2,op)
-
-#define PASTECH3_(ch1,ch2,ch3,op)  ch1 ## ch2 ## ch3 ## op
-#define PASTECH3(ch1,ch2,ch3,op)   PASTECH3_(ch1,ch2,ch3,op)
-
 #define MKSTR(s1)                  #s1
 #define STRINGIFY_INT( s )         MKSTR( s )
 
-// Fortran-77 name-mangling macros.
-#define PASTEF770(name)                                      name ## _
-#define PASTEF77(ch1,name)                     ch1        ## name ## _
-#define PASTEF772(ch1,ch2,name)                ch1 ## ch2 ## name ## _
-#define PASTEF773(ch1,ch2,ch3,name)     ch1 ## ch2 ## ch3 ## name ## _
-
 // -- Include other groups of macros
 
 #include "bli_genarray_macro_defs.h"
 #include "bli_gentdef_macro_defs.h"
 #include "bli_gentfunc_macro_defs.h"
 #include "bli_gentprot_macro_defs.h"
+#include "bli_gentconf_macro_defs.h"
 
 #include "bli_misc_macro_defs.h"
 #include "bli_edge_case_macro_defs.h"
diff --git a/frame/include/bli_misc_macro_defs.h b/frame/include/bli_misc_macro_defs.h
index 31e0150f6..98d86a298 100644
--- a/frame/include/bli_misc_macro_defs.h
+++ b/frame/include/bli_misc_macro_defs.h
@@ -151,6 +151,28 @@ BLIS_INLINE void bli_toggle_bool( bool* b )
 #define bli_cctyper  float
 #define bli_zctyper  double
 
+// return 1 if the two type characters are the same, 0 otherwise
+
+#define bli_sssame 1
+#define bli_sdsame 0
+#define bli_scsame 0
+#define bli_szsame 0
+
+#define bli_dssame 0
+#define bli_ddsame 1
+#define bli_dcsame 0
+#define bli_dzsame 0
+
+#define bli_cssame 0
+#define bli_cdsame 0
+#define bli_ccsame 1
+#define bli_czsame 0
+
+#define bli_zssame 0
+#define bli_zdsame 0
+#define bli_zcsame 0
+#define bli_zzsame 1
+
 
 // return default format specifier for char
 
diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h
index 59ea87a1e..79b74a2c8 100644
--- a/frame/include/bli_obj_macro_defs.h
+++ b/frame/include/bli_obj_macro_defs.h
@@ -146,58 +146,10 @@ BLIS_INLINE num_t bli_obj_dt_proj_to_complex( const obj_t* obj )
 	       ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX );
 }
 
-BLIS_INLINE num_t bli_obj_target_dt( const obj_t* obj )
-{
-	return ( num_t )
-	       ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT );
-}
-
-BLIS_INLINE dom_t bli_obj_target_domain( const obj_t* obj )
-{
-	return ( dom_t )
-	       ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT );
-}
-
-BLIS_INLINE prec_t bli_obj_target_prec( const obj_t* obj )
-{
-	return ( prec_t )
-	       ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT );
-}
-
-BLIS_INLINE num_t bli_obj_exec_dt( const obj_t* obj )
-{
-	return ( num_t )
-	       ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT );
-}
-
-BLIS_INLINE dom_t bli_obj_exec_domain( const obj_t* obj )
-{
-	return ( dom_t )
-	       ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT );
-}
-
-BLIS_INLINE prec_t bli_obj_exec_prec( const obj_t* obj )
-{
-	return ( prec_t )
-	       ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT );
-}
-
-BLIS_INLINE num_t bli_obj_comp_dt( const obj_t* obj )
-{
-	return ( num_t )
-	       ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT );
-}
-
-BLIS_INLINE dom_t bli_obj_comp_domain( const obj_t* obj )
-{
-	return ( dom_t )
-	       ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT );
-}
-
 BLIS_INLINE prec_t bli_obj_comp_prec( const obj_t* obj )
 {
 	return ( prec_t )
-	       ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT );
+	       ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_PREC_SHIFT );
 }
 
 // NOTE: This function queries info2.
@@ -348,20 +300,6 @@ BLIS_INLINE bool bli_obj_is_packed( const obj_t* obj )
 	       ( obj->info & BLIS_PACK_BIT );
 }
 
-BLIS_INLINE bool bli_obj_is_row_packed( const obj_t* obj )
-{
-	return ( bool )
-	       ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
-	                                               BLIS_BITVAL_PACKED_ROWS    ) );
-}
-
-BLIS_INLINE bool bli_obj_is_col_packed( const obj_t* obj )
-{
-	return ( bool )
-	       ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
-	                                               BLIS_BITVAL_PACKED_COLUMNS ) );
-}
-
 BLIS_INLINE bool bli_obj_is_panel_packed( const obj_t* obj )
 {
 	return ( bool )
@@ -460,67 +398,11 @@ BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj )
 	            ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt );
 }
 
-BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj )
-{
-	obj->info = ( objbits_t )
-	            ( ( obj->info & ~BLIS_TARGET_DT_BITS ) |
-	              ( dt << BLIS_TARGET_DT_SHIFT ) );
-}
-
-BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj )
-{
-	obj->info = ( objbits_t )
-	            ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) |
-	              ( dt << BLIS_TARGET_DT_SHIFT ) );
-}
-
-BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* obj )
-{
-	obj->info = ( objbits_t )
-	            ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) |
-	              ( dt << BLIS_TARGET_DT_SHIFT ) );
-}
-
-BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj )
-{
-	obj->info = ( objbits_t )
-	            ( ( obj->info & ~BLIS_EXEC_DT_BITS ) |
-	              ( dt << BLIS_EXEC_DT_SHIFT ) );
-}
-
-BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj )
-{
-	obj->info = ( objbits_t )
-	            ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) |
-	              ( dt << BLIS_EXEC_DT_SHIFT ) );
-}
-
-BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj )
-{
-	obj->info = ( objbits_t )
-	            ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) |
-	              ( dt << BLIS_EXEC_DT_SHIFT ) );
-}
-
-BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj )
-{
-	obj->info = ( objbits_t )
-	            ( ( obj->info & ~BLIS_COMP_DT_BITS ) |
-	              ( dt << BLIS_COMP_DT_SHIFT ) );
-}
-
-BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj )
-{
-	obj->info = ( objbits_t )
-	            ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) |
-	              ( dt << BLIS_COMP_DT_SHIFT ) );
-}
-
 BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj )
 {
 	obj->info = ( objbits_t )
 	            ( ( obj->info & ~BLIS_COMP_PREC_BIT ) |
-	              ( dt << BLIS_COMP_DT_SHIFT ) );
+	              ( dt << BLIS_COMP_PREC_SHIFT ) );
 }
 
 // NOTE: This function queries and modifies info2.
@@ -1187,53 +1069,6 @@ BLIS_INLINE stor3_t bli_obj_stor3_from_strides( const obj_t* c, const obj_t* a,
 }
 
 
-// -- User-provided information macros --
-
-// Function pointer query
-
-BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( const obj_t* obj )
-{
-	return obj->pack_fn;
-}
-
-BLIS_INLINE void* bli_obj_pack_params( const obj_t* obj )
-{
-	return obj->pack_params;
-}
-
-BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( const obj_t* obj )
-{
-	return obj->ker_fn;
-}
-
-BLIS_INLINE void* bli_obj_ker_params( const obj_t* obj )
-{
-	return obj->ker_params;
-}
-
-// Function pointer modification
-
-BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj )
-{
-	obj->pack_fn = pack_fn;
-}
-
-BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj )
-{
-	obj->pack_params = params;
-}
-
-BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj )
-{
-	obj->ker_fn = ker_fn;
-}
-
-BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj )
-{
-	obj->ker_params = params;
-}
-
-
 // -- Initialization-related macros --
 
 // Finish the initialization started by the matrix-specific static initializer
@@ -1246,9 +1081,7 @@ BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t
 	bli_obj_set_as_root( obj );
 
 	bli_obj_set_dt( dt, obj );
-	bli_obj_set_target_dt( dt, obj );
-	bli_obj_set_exec_dt( dt, obj );
-	bli_obj_set_comp_dt( dt, obj );
+	bli_obj_set_comp_prec( bli_dt_prec( dt ), obj );
 
 	bli_obj_set_dims( m, n, obj );
 	bli_obj_set_strides( rs, cs, obj );
@@ -1407,14 +1240,7 @@ BLIS_INLINE void bli_obj_real_part( const obj_t* c, obj_t* r )
 	if ( bli_obj_is_complex( c ) )
 	{
 		// Change the datatypes.
-		const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c )        );
-		const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) );
-		const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c )   );
-		const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c )   );
-		bli_obj_set_dt(        dt_stor_r, r );
-		bli_obj_set_target_dt( dt_targ_r, r );
-		bli_obj_set_exec_dt(   dt_exec_r, r );
-		bli_obj_set_comp_dt(   dt_comp_r, r );
+		bli_obj_set_dt( bli_obj_dt_proj_to_real( c ), r );
 
 		// Don't touch the attached scalar datatype.
 
@@ -1440,14 +1266,7 @@ BLIS_INLINE void bli_obj_imag_part( const obj_t* c, obj_t* i )
 		bli_obj_alias_to( c, i );
 
 		// Change the datatype.
-		const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c )        );
-		const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) );
-		const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c )   );
-		const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c )   );
-		bli_obj_set_dt(        dt_stor_r, i );
-		bli_obj_set_target_dt( dt_targ_r, i );
-		bli_obj_set_exec_dt(   dt_exec_r, i );
-		bli_obj_set_comp_dt(   dt_comp_r, i );
+		bli_obj_set_dt( bli_obj_dt_proj_to_real( c ), i );
 
 		// Don't touch the attached scalar datatype.
 
@@ -1499,17 +1318,6 @@ BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b )
 	if ( b_root_is_self ) bli_obj_set_as_root( a );
 }
 
-// Swap object pack schemas.
-
-BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b )
-{
-	const pack_t schema_a = bli_obj_pack_schema( a );
-	const pack_t schema_b = bli_obj_pack_schema( b );
-
-	bli_obj_set_pack_schema( schema_b, a );
-	bli_obj_set_pack_schema( schema_a, b );
-}
-
 // Induce a transposition on an object: swap dimensions, increments, and
 // offsets, then clear the trans bit.
 
@@ -1595,5 +1403,19 @@ BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj )
 	bli_obj_toggle_trans( obj );
 }
 
+// Create an alias b of a which refers to only a portion of the original
+// matrix without any "historical baggage", stripping out all offsets,
+// transposes, and references to the original root object.
+
+BLIS_INLINE void bli_obj_alias_submatrix( const obj_t* a, obj_t* b )
+{
+	bli_obj_alias_to( a, b );
+	bli_obj_reset_origin( b );
+	if ( bli_obj_has_trans( b ) )
+	{
+		bli_obj_induce_trans( b );
+		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, b );
+	}
+}
 
 #endif
diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h
index fea67c0af..77ee0ddc4 100644
--- a/frame/include/bli_param_macro_defs.h
+++ b/frame/include/bli_param_macro_defs.h
@@ -569,6 +569,12 @@ BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs )
 	         bli_abs( cs ) != 1 );
 }
 
+BLIS_INLINE bool bli_is_preferentially_stored( inc_t rs, inc_t cs, bool row_pref )
+{
+	return ( bool )
+	       ( row_pref ? bli_is_row_stored( rs, cs ) : bli_is_col_stored( rs, cs ) );
+}
+
 BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs )
 {
 	return ( bool )
@@ -1035,20 +1041,6 @@ BLIS_INLINE bool bli_is_packed( pack_t schema )
 	       ( schema & BLIS_PACK_BIT );
 }
 
-BLIS_INLINE bool bli_is_row_packed( pack_t schema )
-{
-	return ( bool )
-	       ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
-	                                            BLIS_BITVAL_PACKED_ROWS ) );
-}
-
-BLIS_INLINE bool bli_is_col_packed( pack_t schema )
-{
-	return ( bool )
-	       ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
-	                                            BLIS_BITVAL_PACKED_COLUMNS ) );
-}
-
 BLIS_INLINE bool bli_is_panel_packed( pack_t schema )
 {
 	return ( bool )
@@ -1074,6 +1066,12 @@ BLIS_INLINE bool bli_is_1m_packed( pack_t schema )
 	         bli_is_1e_packed( schema ) );
 }
 
+BLIS_INLINE bool bli_is_ro_packed( pack_t schema )
+{
+	return ( bool )
+	       ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_RO );
+}
+
 BLIS_INLINE bool bli_is_nat_packed( pack_t schema )
 {
 	return ( bool )
diff --git a/frame/include/bli_pre_ker_params.h b/frame/include/bli_pre_ker_params.h
index 6bd039b7e..2a8f7873b 100644
--- a/frame/include/bli_pre_ker_params.h
+++ b/frame/include/bli_pre_ker_params.h
@@ -38,7 +38,7 @@
 // These macros are used in bli_*_ker_prot.h and bli_*_ker_ft.h to make it
 // easy to update them in the future, if needed.
 
-#define BLIS_AUXINFO_PARAM        auxinfo_t* data
+#define BLIS_AUXINFO_PARAM  const auxinfo_t* data
 #define BLIS_CNTX_PARAM     const cntx_t*    cntx
 
 
diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h
index 3d60e8ec3..2eea517fd 100644
--- a/frame/include/bli_scalar_macro_defs.h
+++ b/frame/include/bli_scalar_macro_defs.h
@@ -199,6 +199,8 @@
 #include "bli_set0s_edge.h"
 #include "bli_copys_mxn.h"
 #include "bli_scal2s_mxn.h"
+
+#include "bli_axpbys_mxn.h"
 #include "bli_xpbys_mxn.h"
 #include "bli_xpbys_mxn_uplo.h"
 
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index 3c74502b7..5bc96e8f2 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -251,205 +251,139 @@ typedef void  (*free_ft)  ( void*  p    );
 
 
 //
-// -- BLIS info bit field offsets ----------------------------------------------
+// -- BLIS info bit field sizes ------------------------------------------------
 //
 
-/*
-  info field description
-
-  bit(s)   purpose
-  -------  -------
-   2 ~ 0   Stored numerical datatype
-           - 0: domain    (0 == real, 1 == complex)
-           - 1: precision (0 == single, 1 == double)
-           - 2: special   (100 = int; 101 = const)
-       3   Transposition required [during pack]?
-       4   Conjugation required [during pack]?
-   7 ~ 5   Part of matrix stored:
-           - 5: strictly upper triangular
-           - 6: diagonal
-           - 7: strictly lower triangular
-       8   Implicit unit diagonal?
-       9   Invert diagonal required [during pack]?
-  12 ~ 10  Target numerical datatype
-           - 10: domain    (0 == real, 1 == complex)
-           - 11: precision (0 == single, 1 == double)
-           - 12: used to encode integer, constant types
-  15 ~ 13  Execution numerical datatype
-           - 13: domain    (0 == real, 1 == complex)
-           - 14: precision (0 == single, 1 == double)
-           - 15: used to encode integer, constant types
-  22 ~ 16  Packed type/status
-           - 0 0000 00: not packed
-           - 1 0000 00: packed (unspecified; by rows, columns, or vector)
-           - 1 0000 00: packed by rows
-           - 1 0000 01: packed by columns
-           - 1 0000 10: packed by row panels
-           - 1 0000 11: packed by column panels
-           - 1 0001 10: packed by 1m expanded row panels
-           - 1 0001 11: packed by 1m expanded column panels
-           - 1 0010 10: packed by 1m reordered row panels
-           - 1 0010 11: packed by 1m reordered column panels
-       23  Packed panel order if upper-stored
-           - 0 == forward order if upper
-           - 1 == reverse order if upper
-       24  Packed panel order if lower-stored
-           - 0 == forward order if lower
-           - 1 == reverse order if lower
-  26 ~ 25  Packed buffer type
-           - 0 == block of A
-           - 1 == panel of B
-           - 2 == panel of C
-           - 3 == general use
-  28 ~ 27  Structure type
-           - 0 == general
-           - 1 == Hermitian
-           - 2 == symmetric
-           - 3 == triangular
-  31 ~ 29  Computation numerical datatype
-           - 29: domain    (0 == real, 1 == complex)
-           - 30: precision (0 == single, 1 == double)
-           - 31: used to encode integer, constant types
-
-  info2 field description
-
-  bit(s)   purpose
-  -------  -------
-    2 ~ 0  Scalar storage numerical datatype
-           -  0: domain    (0 == real, 1 == complex)
-           -  1: precision (0 == single, 1 == double)
-           -  2: used to encode integer, constant types
-*/
+#define BLIS_DATATYPE_NUM_BITS             ( BLIS_DOMAIN_NUM_BITS + BLIS_PRECISION_NUM_BITS )
+#define   BLIS_DOMAIN_NUM_BITS               1
+#define   BLIS_PRECISION_NUM_BITS            2
+#define BLIS_CONJTRANS_NUM_BITS            ( BLIS_TRANS_NUM_BITS + BLIS_CONJ_NUM_BITS )
+#define   BLIS_TRANS_NUM_BITS                1
+#define   BLIS_CONJ_NUM_BITS                 1
+#define BLIS_UPLO_NUM_BITS                 ( BLIS_UPPER_NUM_BITS + BLIS_DIAG_NUM_BITS + BLIS_LOWER_NUM_BITS )
+#define   BLIS_UPPER_NUM_BITS                1
+#define   BLIS_DIAG_NUM_BITS                 1
+#define   BLIS_LOWER_NUM_BITS                1
+#define BLIS_UNIT_DIAG_NUM_BITS            1
+#define BLIS_INVERT_DIAG_NUM_BITS          1
+#define BLIS_PACK_SCHEMA_NUM_BITS          ( BLIS_PACK_PANEL_NUM_BITS + BLIS_PACK_FORMAT_NUM_BITS + BLIS_PACK_NUM_BITS )
+#define   BLIS_PACK_PANEL_NUM_BITS           1
+#define   BLIS_PACK_FORMAT_NUM_BITS          4
+#define   BLIS_PACK_NUM_BITS                 1
+#define BLIS_PACK_REV_IF_UPPER_NUM_BITS    1
+#define BLIS_PACK_REV_IF_LOWER_NUM_BITS    1
+#define BLIS_PACK_BUFFER_NUM_BITS          2
+#define BLIS_STRUC_NUM_BITS                2
+
+
+//
+// -- BLIS info bit field offsets ----------------------------------------------
+//
 
-// info
 #define BLIS_DATATYPE_SHIFT                0
-#define   BLIS_DOMAIN_SHIFT                0
-#define   BLIS_PRECISION_SHIFT             1
-#define BLIS_CONJTRANS_SHIFT               3
-#define   BLIS_TRANS_SHIFT                 3
-#define   BLIS_CONJ_SHIFT                  4
-#define BLIS_UPLO_SHIFT                    5
-#define   BLIS_UPPER_SHIFT                 5
-#define   BLIS_DIAG_SHIFT                  6
-#define   BLIS_LOWER_SHIFT                 7
-#define BLIS_UNIT_DIAG_SHIFT               8
-#define BLIS_INVERT_DIAG_SHIFT             9
-#define BLIS_TARGET_DT_SHIFT               10
-#define   BLIS_TARGET_DOMAIN_SHIFT         10
-#define   BLIS_TARGET_PREC_SHIFT           11
-#define BLIS_EXEC_DT_SHIFT                 13
-#define   BLIS_EXEC_DOMAIN_SHIFT           13
-#define   BLIS_EXEC_PREC_SHIFT             14
-#define BLIS_PACK_SCHEMA_SHIFT             16
-#define   BLIS_PACK_RC_SHIFT               16
-#define   BLIS_PACK_PANEL_SHIFT            17
-#define   BLIS_PACK_FORMAT_SHIFT           18
-#define   BLIS_PACK_SHIFT                  22
-#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
-#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
-#define BLIS_PACK_BUFFER_SHIFT             25
-#define BLIS_STRUC_SHIFT                   27
-#define BLIS_COMP_DT_SHIFT                 29
-#define   BLIS_COMP_DOMAIN_SHIFT           29
-#define   BLIS_COMP_PREC_SHIFT             30
-
-// info2
-#define BLIS_SCALAR_DT_SHIFT                0
-#define   BLIS_SCALAR_DOMAIN_SHIFT          0
-#define   BLIS_SCALAR_PREC_SHIFT            1
+#define   BLIS_DOMAIN_SHIFT              (   BLIS_DATATYPE_SHIFT )
+#define   BLIS_PRECISION_SHIFT           (   BLIS_DOMAIN_SHIFT + BLIS_DOMAIN_NUM_BITS )
+#define BLIS_CONJTRANS_SHIFT             ( BLIS_DATATYPE_SHIFT + BLIS_DATATYPE_NUM_BITS )
+#define   BLIS_TRANS_SHIFT               (   BLIS_CONJTRANS_SHIFT )
+#define   BLIS_CONJ_SHIFT                (   BLIS_TRANS_SHIFT + BLIS_TRANS_NUM_BITS )
+#define BLIS_UPLO_SHIFT                  ( BLIS_CONJTRANS_SHIFT + BLIS_CONJTRANS_NUM_BITS )
+#define   BLIS_UPPER_SHIFT               (   BLIS_UPLO_SHIFT )
+#define   BLIS_DIAG_SHIFT                (   BLIS_UPPER_SHIFT + BLIS_UPPER_NUM_BITS )
+#define   BLIS_LOWER_SHIFT               (   BLIS_DIAG_SHIFT + BLIS_DIAG_NUM_BITS )
+#define BLIS_UNIT_DIAG_SHIFT             ( BLIS_UPLO_SHIFT + BLIS_UPLO_NUM_BITS )
+#define BLIS_INVERT_DIAG_SHIFT           ( BLIS_UNIT_DIAG_SHIFT + BLIS_UNIT_DIAG_NUM_BITS )
+#define BLIS_PACK_SCHEMA_SHIFT           ( BLIS_INVERT_DIAG_SHIFT + BLIS_INVERT_DIAG_NUM_BITS )
+#define   BLIS_PACK_PANEL_SHIFT          (   BLIS_PACK_SCHEMA_SHIFT )
+#define   BLIS_PACK_FORMAT_SHIFT         (   BLIS_PACK_PANEL_SHIFT + BLIS_PACK_PANEL_NUM_BITS )
+#define   BLIS_PACK_SHIFT                (   BLIS_PACK_FORMAT_SHIFT + BLIS_PACK_FORMAT_NUM_BITS )
+#define BLIS_PACK_REV_IF_UPPER_SHIFT     ( BLIS_PACK_SCHEMA_SHIFT + BLIS_PACK_SCHEMA_NUM_BITS )
+#define BLIS_PACK_REV_IF_LOWER_SHIFT     ( BLIS_PACK_REV_IF_UPPER_SHIFT + BLIS_PACK_REV_IF_UPPER_NUM_BITS )
+#define BLIS_PACK_BUFFER_SHIFT           ( BLIS_PACK_REV_IF_LOWER_SHIFT + BLIS_PACK_REV_IF_LOWER_NUM_BITS )
+#define BLIS_STRUC_SHIFT                 ( BLIS_PACK_BUFFER_SHIFT + BLIS_PACK_BUFFER_NUM_BITS )
+#define BLIS_COMP_PREC_SHIFT             ( BLIS_STRUC_SHIFT + BLIS_STRUC_NUM_BITS )
+#define BLIS_SCALAR_DT_SHIFT             ( BLIS_COMP_PREC_SHIFT + BLIS_PRECISION_NUM_BITS )
+#define   BLIS_SCALAR_DOMAIN_SHIFT       (   BLIS_SCALAR_DT_SHIFT )
+#define   BLIS_SCALAR_PREC_SHIFT         (   BLIS_SCALAR_DOMAIN_SHIFT + BLIS_DOMAIN_NUM_BITS )
+// This is the total number of bits, which should always be <= 32
+#define BLIS_INFO_NUM_BITS               ( BLIS_SCALAR_DT_SHIFT + BLIS_DATATYPE_NUM_BITS )
 
 //
 // -- BLIS info bit field masks ------------------------------------------------
 //
 
-// info
-#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
-#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
-#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
-#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
-#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
-#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
-#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
-#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
-#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
-#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
-#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
-#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
-#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
-#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
-#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
-#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
-#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
-#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
-#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
-#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
-#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
-#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
-#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
-#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
-#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
-#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
-#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
-#define BLIS_COMP_DT_BITS                  ( 0x7  << BLIS_COMP_DT_SHIFT )
-#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
-#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )
-
-// info2
-#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
-#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
-#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )
+#define BLIS_DATATYPE_BITS                 ( ( ( 1 << BLIS_DATATYPE_NUM_BITS          ) - 1 ) << BLIS_DATATYPE_SHIFT )
+#define   BLIS_DOMAIN_BIT                  ( ( ( 1 << BLIS_DOMAIN_NUM_BITS            ) - 1 ) << BLIS_DOMAIN_SHIFT )
+#define   BLIS_PRECISION_BIT               ( ( ( 1 << BLIS_PRECISION_NUM_BITS         ) - 1 ) << BLIS_PRECISION_SHIFT )
+#define BLIS_CONJTRANS_BITS                ( ( ( 1 << BLIS_CONJTRANS_NUM_BITS         ) - 1 ) << BLIS_CONJTRANS_SHIFT )
+#define   BLIS_TRANS_BIT                   ( ( ( 1 << BLIS_TRANS_NUM_BITS             ) - 1 ) << BLIS_TRANS_SHIFT )
+#define   BLIS_CONJ_BIT                    ( ( ( 1 << BLIS_CONJ_NUM_BITS              ) - 1 ) << BLIS_CONJ_SHIFT )
+#define BLIS_UPLO_BITS                     ( ( ( 1 << BLIS_UPLO_NUM_BITS              ) - 1 ) << BLIS_UPLO_SHIFT )
+#define   BLIS_UPPER_BIT                   ( ( ( 1 << BLIS_UPPER_NUM_BITS             ) - 1 ) << BLIS_UPPER_SHIFT )
+#define   BLIS_DIAG_BIT                    ( ( ( 1 << BLIS_DIAG_NUM_BITS              ) - 1 ) << BLIS_DIAG_SHIFT )
+#define   BLIS_LOWER_BIT                   ( ( ( 1 << BLIS_LOWER_NUM_BITS             ) - 1 ) << BLIS_LOWER_SHIFT )
+#define BLIS_UNIT_DIAG_BIT                 ( ( ( 1 << BLIS_UNIT_DIAG_NUM_BITS         ) - 1 ) << BLIS_UNIT_DIAG_SHIFT )
+#define BLIS_INVERT_DIAG_BIT               ( ( ( 1 << BLIS_INVERT_DIAG_NUM_BITS       ) - 1 ) << BLIS_INVERT_DIAG_SHIFT )
+#define BLIS_PACK_SCHEMA_BITS              ( ( ( 1 << BLIS_PACK_SCHEMA_NUM_BITS       ) - 1 ) << BLIS_PACK_SCHEMA_SHIFT )
+#define   BLIS_PACK_PANEL_BIT              ( ( ( 1 << BLIS_PACK_PANEL_NUM_BITS        ) - 1 ) << BLIS_PACK_PANEL_SHIFT )
+#define   BLIS_PACK_FORMAT_BITS            ( ( ( 1 << BLIS_PACK_FORMAT_NUM_BITS       ) - 1 ) << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_PACK_BIT                    ( ( ( 1 << BLIS_PACK_NUM_BITS              ) - 1 ) << BLIS_PACK_SHIFT )
+#define BLIS_PACK_REV_IF_UPPER_BIT         ( ( ( 1 << BLIS_PACK_REV_IF_UPPER_NUM_BITS ) - 1 ) << BLIS_PACK_REV_IF_UPPER_SHIFT )
+#define BLIS_PACK_REV_IF_LOWER_BIT         ( ( ( 1 << BLIS_PACK_REV_IF_LOWER_NUM_BITS ) - 1 ) << BLIS_PACK_REV_IF_LOWER_SHIFT )
+#define BLIS_PACK_BUFFER_BITS              ( ( ( 1 << BLIS_PACK_BUFFER_NUM_BITS       ) - 1 ) << BLIS_PACK_BUFFER_SHIFT )
+#define BLIS_STRUC_BITS                    ( ( ( 1 << BLIS_STRUC_NUM_BITS             ) - 1 ) << BLIS_STRUC_SHIFT )
+#define BLIS_COMP_PREC_BIT                 ( ( ( 1 << BLIS_PRECISION_NUM_BITS         ) - 1 ) << BLIS_COMP_PREC_SHIFT )
+#define BLIS_SCALAR_DT_BITS                ( ( ( 1 << BLIS_DATATYPE_NUM_BITS          ) - 1 ) << BLIS_SCALAR_DT_SHIFT )
+#define   BLIS_SCALAR_DOMAIN_BIT           ( ( ( 1 << BLIS_DOMAIN_NUM_BITS            ) - 1 ) << BLIS_SCALAR_DOMAIN_SHIFT )
+#define   BLIS_SCALAR_PREC_BIT             ( ( ( 1 << BLIS_PRECISION_NUM_BITS         ) - 1 ) << BLIS_SCALAR_PREC_SHIFT )
 
 
 //
 // -- BLIS enumerated type value definitions -----------------------------------
 //
 
-#define BLIS_BITVAL_REAL                      0x0
-#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
-#define BLIS_BITVAL_SINGLE_PREC               0x0
-#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
-#define   BLIS_BITVAL_FLOAT_TYPE              0x0
-#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
-#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
-#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
-#define   BLIS_BITVAL_INT_TYPE                0x04
-#define   BLIS_BITVAL_CONST_TYPE              0x05
-#define BLIS_BITVAL_NO_TRANS                  0x0
-#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
-#define BLIS_BITVAL_NO_CONJ                   0x0
-#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
-#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
-#define BLIS_BITVAL_ZEROS                     0x0
-#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
-#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
-#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS
-#define BLIS_BITVAL_NONUNIT_DIAG              0x0
-#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
-#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
-#define BLIS_BITVAL_NOT_PACKED                0x0
-#define   BLIS_BITVAL_1E                    ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
-#define   BLIS_BITVAL_1R                    ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
-#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT                                                            )
-#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT                                                            )
-#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT                                         | BLIS_PACK_RC_BIT )
-#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT                    )
-#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
-#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT                    )
-#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
-#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT                    )
-#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
-#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
-#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
-#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
-#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
-#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
-#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
-#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
-#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
-#define BLIS_BITVAL_GENERAL                   0x0
-#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
-#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
-#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )
+#define BLIS_BITVAL_REAL                  0x0
+#define BLIS_BITVAL_COMPLEX               BLIS_DOMAIN_BIT
+#define BLIS_BITVAL_SINGLE_PREC           0x0
+#define BLIS_BITVAL_DOUBLE_PREC         ( 0x1 << BLIS_PRECISION_SHIFT )
+#define   BLIS_BITVAL_FLOAT_TYPE          0x0
+#define   BLIS_BITVAL_SCOMPLEX_TYPE       BLIS_DOMAIN_BIT
+#define   BLIS_BITVAL_DOUBLE_TYPE         BLIS_BITVAL_DOUBLE_PREC
+#define   BLIS_BITVAL_DCOMPLEX_TYPE     ( BLIS_DOMAIN_BIT | BLIS_BITVAL_DOUBLE_PREC )
+#define   BLIS_BITVAL_INT_TYPE            0x04
+#define   BLIS_BITVAL_CONST_TYPE          0x05
+#define BLIS_BITVAL_NO_TRANS              0x0
+#define BLIS_BITVAL_TRANS                 BLIS_TRANS_BIT
+#define BLIS_BITVAL_NO_CONJ               0x0
+#define BLIS_BITVAL_CONJ                  BLIS_CONJ_BIT
+#define BLIS_BITVAL_CONJ_TRANS          ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
+#define BLIS_BITVAL_ZEROS                 0x0
+#define BLIS_BITVAL_UPPER               ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
+#define BLIS_BITVAL_LOWER               ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
+#define BLIS_BITVAL_DENSE                 BLIS_UPLO_BITS
+#define BLIS_BITVAL_NONUNIT_DIAG          0x0
+#define BLIS_BITVAL_UNIT_DIAG             BLIS_UNIT_DIAG_BIT
+#define BLIS_BITVAL_INVERT_DIAG           BLIS_INVERT_DIAG_BIT
+#define BLIS_BITVAL_NOT_PACKED            0x0
+#define   BLIS_BITVAL_1E                ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_1R                ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_RO                ( 0x3  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_PACKED_UNSPEC     ( BLIS_PACK_BIT                                         )
+#define   BLIS_BITVAL_PACKED_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT )
+#define   BLIS_BITVAL_PACKED_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT )
+#define   BLIS_BITVAL_PACKED_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT )
+#define   BLIS_BITVAL_PACKED_PANELS_RO  ( BLIS_PACK_BIT | BLIS_BITVAL_RO  | BLIS_PACK_PANEL_BIT )
+#define BLIS_BITVAL_PACK_FWD_IF_UPPER     0x0
+#define BLIS_BITVAL_PACK_REV_IF_UPPER     BLIS_PACK_REV_IF_UPPER_BIT
+#define BLIS_BITVAL_PACK_FWD_IF_LOWER     0x0
+#define BLIS_BITVAL_PACK_REV_IF_LOWER     BLIS_PACK_REV_IF_LOWER_BIT
+#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK    0x0
+#define BLIS_BITVAL_BUFFER_FOR_B_PANEL  ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
+#define BLIS_BITVAL_BUFFER_FOR_C_PANEL  ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
+#define BLIS_BITVAL_BUFFER_FOR_GEN_USE  ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
+#define BLIS_BITVAL_GENERAL               0x0
+#define BLIS_BITVAL_HERMITIAN           ( 0x1 << BLIS_STRUC_SHIFT )
+#define BLIS_BITVAL_SYMMETRIC           ( 0x2 << BLIS_STRUC_SHIFT )
+#define BLIS_BITVAL_TRIANGULAR          ( 0x3 << BLIS_STRUC_SHIFT )
 
 
 //
@@ -538,23 +472,21 @@ typedef enum
 
 typedef enum
 {
-	BLIS_NOT_PACKED            = BLIS_BITVAL_NOT_PACKED,
-	BLIS_PACKED_UNSPEC         = BLIS_BITVAL_PACKED_UNSPEC,
-	BLIS_PACKED_VECTOR         = BLIS_BITVAL_PACKED_UNSPEC,
-	BLIS_PACKED_ROWS           = BLIS_BITVAL_PACKED_ROWS,
-	BLIS_PACKED_COLUMNS        = BLIS_BITVAL_PACKED_COLUMNS,
-	BLIS_PACKED_ROW_PANELS     = BLIS_BITVAL_PACKED_ROW_PANELS,
-	BLIS_PACKED_COL_PANELS     = BLIS_BITVAL_PACKED_COL_PANELS,
-	BLIS_PACKED_ROW_PANELS_1E  = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
-	BLIS_PACKED_COL_PANELS_1E  = BLIS_BITVAL_PACKED_COL_PANELS_1E,
-	BLIS_PACKED_ROW_PANELS_1R  = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
-	BLIS_PACKED_COL_PANELS_1R  = BLIS_BITVAL_PACKED_COL_PANELS_1R
+	BLIS_NOT_PACKED       = BLIS_BITVAL_NOT_PACKED,
+	BLIS_PACKED_UNSPEC    = BLIS_BITVAL_PACKED_UNSPEC,
+	BLIS_PACKED_VECTOR    = BLIS_BITVAL_PACKED_UNSPEC,
+	BLIS_PACKED_MATRIX    = BLIS_BITVAL_PACKED_UNSPEC,
+	BLIS_PACKED_PANELS    = BLIS_BITVAL_PACKED_PANELS,
+	BLIS_PACKED_PANELS_1E = BLIS_BITVAL_PACKED_PANELS_1E,
+	BLIS_PACKED_PANELS_1R = BLIS_BITVAL_PACKED_PANELS_1R,
+	BLIS_PACKED_PANELS_RO = BLIS_BITVAL_PACKED_PANELS_RO,
+
+	// BLIS_NUM_PACK_SCHEMA_TYPES must be last!
+	// Only the panel schemas are counted, starting with BLIS_PACKED_PANELS.
+	BLIS_NUM_PACK_SCHEMA_TYPES_,
+	BLIS_NUM_PACK_SCHEMA_TYPES = ((( BLIS_NUM_PACK_SCHEMA_TYPES_ - BLIS_PACKED_PANELS - 1 ) >> BLIS_PACK_FORMAT_SHIFT ) + 1)
 } pack_t;
 
-// We combine row and column packing into one "type", and we start
-// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
-#define BLIS_NUM_PACK_SCHEMA_TYPES 3
-
 
 // -- Pack order type --
 
@@ -634,10 +566,12 @@ typedef enum
 	BLIS_MACH_RMIN,
 	BLIS_MACH_EMAX,
 	BLIS_MACH_RMAX,
-	BLIS_MACH_EPS2
+	BLIS_MACH_EPS2,
+
+	// BLIS_NUM_MACH_PARAMS must be last!
+	BLIS_NUM_MACH_PARAMS
 } machval_t;
 
-#define BLIS_NUM_MACH_PARAMS   11
 #define BLIS_MACH_PARAM_FIRST  BLIS_MACH_EPS
 #define BLIS_MACH_PARAM_LAST   BLIS_MACH_EPS2
 
@@ -648,11 +582,13 @@ typedef enum
 {
 	BLIS_1M        = 0,
 	BLIS_NAT,
+
 	BLIS_IND_FIRST = 0,
-	BLIS_IND_LAST  = BLIS_NAT
-} ind_t;
+	BLIS_IND_LAST  = BLIS_NAT,
 
-#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)
+	// BLIS_NUM_IND_METHODS must be last!
+	BLIS_NUM_IND_METHODS
+} ind_t;
 
 // These are used in bli_l3_*_oapi.c to construct the ind_t values from
 // the induced method substrings that go into function names.
@@ -677,10 +613,25 @@ typedef enum
 
 // -- Kernel ID types --
 
+// Encode the number of independent type parameters (minus one) in bits
+// 28-31 of the kernel ID. This lets us identify kernel IDs as the
+// appropriate type while also using them as linear indices after
+// masking out these bits.
+#define BLIS_NTYPE_KER_SHIFT 28
+#define BLIS_NTYPE_KER_BITS  (0xFu << BLIS_NTYPE_KER_SHIFT)
+#define BLIS_1TYPE_KER       (  0u << BLIS_NTYPE_KER_SHIFT)
+#define BLIS_2TYPE_KER       (  1u << BLIS_NTYPE_KER_SHIFT)
+#define BLIS_3TYPE_KER       (  2u << BLIS_NTYPE_KER_SHIFT)
+
+#define bli_ker_idx( ker )	 ((ker) & ~BLIS_NTYPE_KER_BITS)
+#define bli_ker_ntype( ker ) ((((ker) & BLIS_NTYPE_KER_BITS) >> BLIS_NTYPE_KER_SHIFT) + 1)
+
 typedef enum
 {
+	// -- Single-type kernels --
+
 	// l1v kernels
-	BLIS_ADDV_KER,
+	BLIS_ADDV_KER = BLIS_1TYPE_KER,
 	BLIS_AMAXV_KER,
 	BLIS_AXPBYV_KER,
 	BLIS_AXPYV_KER,
@@ -703,33 +654,15 @@ typedef enum
 	BLIS_DOTXF_KER,
 	BLIS_DOTXAXPYF_KER,
 
-	// pack kernels
-	BLIS_PACKM_MRXK_KER,
-	BLIS_PACKM_NRXK_KER,
-	BLIS_PACKM_MRXK_1ER_KER,
-	BLIS_PACKM_NRXK_1ER_KER,
-	BLIS_PACKM_MRXMR_DIAG_KER,
-	BLIS_PACKM_NRXNR_DIAG_KER,
-	BLIS_PACKM_MRXMR_DIAG_1ER_KER,
-	BLIS_PACKM_NRXNR_DIAG_1ER_KER,
-
-	// unpack kernels
-	BLIS_UNPACKM_MRXK_KER,
-	BLIS_UNPACKM_NRXK_KER,
-
 	// l3 native kernels
-	BLIS_GEMM_UKR,
 	BLIS_GEMMTRSM_L_UKR,
 	BLIS_GEMMTRSM_U_UKR,
 	BLIS_TRSM_L_UKR,
 	BLIS_TRSM_U_UKR,
 
-	// l3 virtual kernels
-	BLIS_GEMM_VIR_UKR,
-	BLIS_GEMMTRSM_L_VIR_UKR,
-	BLIS_GEMMTRSM_U_VIR_UKR,
-	BLIS_TRSM_L_VIR_UKR,
-	BLIS_TRSM_U_VIR_UKR,
+	// l3 1m kernels
+	BLIS_GEMMTRSM1M_L_UKR,
+	BLIS_GEMMTRSM1M_U_UKR,
 
 	// gemmsup kernels
 	BLIS_GEMMSUP_RRR_UKR,
@@ -743,7 +676,34 @@ typedef enum
 	BLIS_GEMMSUP_XXX_UKR,
 
 	// BLIS_NUM_UKRS must be last!
-	BLIS_NUM_UKRS
+	BLIS_NUM_UKRS_, BLIS_NUM_UKRS = bli_ker_idx( BLIS_NUM_UKRS_ ),
+
+	// -- Two-type kernels --
+
+	// pack kernels
+	BLIS_PACKM_KER = BLIS_2TYPE_KER,
+	BLIS_PACKM_1ER_KER,
+	BLIS_PACKM_RO_KER,
+	BLIS_PACKM_DIAG_KER,
+	BLIS_PACKM_DIAG_1ER_KER,
+	BLIS_PACKM_DIAG_RO_KER,
+
+	// unpack kernels
+	BLIS_UNPACKM_KER,
+
+	// l3 native kernels
+	BLIS_GEMM_UKR,
+
+	// l3 1m kernels
+	BLIS_GEMM1M_UKR,
+
+	// mixed-domain kernels
+	BLIS_GEMM_CCR_UKR,
+	BLIS_GEMM_RCC_UKR,
+	BLIS_GEMM_CRR_UKR,
+
+	// BLIS_NUM_UKR2S must be last!
+	BLIS_NUM_UKR2S_, BLIS_NUM_UKR2S = bli_ker_idx( BLIS_NUM_UKR2S_ )
 } ukr_t;
 
 
@@ -777,10 +737,11 @@ typedef enum
 	BLIS_REFERENCE_UKERNEL = 0,
 	BLIS_VIRTUAL_UKERNEL,
 	BLIS_OPTIMIZED_UKERNEL,
-	BLIS_NOTAPPLIC_UKERNEL
-} kimpl_t;
+	BLIS_NOTAPPLIC_UKERNEL,
 
-#define BLIS_NUM_UKR_IMPL_TYPES 4
+    // BLIS_NUM_UKR_IMPL_TYPES must be last!
+	BLIS_NUM_UKR_IMPL_TYPES
+} kimpl_t;
 
 
 #if 0
@@ -811,9 +772,10 @@ typedef enum
 	BLIS_GEMMSUP_CR_UKR,
 
 	BLIS_GEMMSUP_GX_UKR,
-} l3sup_t;
 
-#define BLIS_NUM_LEVEL3_SUP_UKRS 9
+	// BLIS_NUM_LEVEL3_SUP_UKRS must be last!
+	BLIS_NUM_LEVEL3_SUP_UKRS
+} l3sup_t;
 #endif
 
 
@@ -851,10 +813,10 @@ typedef enum
 	BLIS_GGC,
 	BLIS_GGG,
 #endif
-} stor3_t;
 
-#define BLIS_NUM_3OP_RC_COMBOS 9
-//#define BLIS_NUM_3OP_RCG_COMBOS 27
+	// BLIS_NUM_3OP_RC_COMBOS must be last!
+	BLIS_NUM_3OP_RC_COMBOS
+} stor3_t;
 
 
 #if 0
@@ -900,11 +862,11 @@ typedef enum
 	BLIS_TRMM,
 	BLIS_TRSM,
 
-	BLIS_NOID
+	// BLIS_NOID (= BLIS_NUM_LEVEL3_OPS) must be last!
+	BLIS_NOID,
+	BLIS_NUM_LEVEL3_OPS = BLIS_NOID
 } opid_t;
 
-#define BLIS_NUM_LEVEL3_OPS 11
-
 
 // -- Blocksize ID type --
 
@@ -953,6 +915,18 @@ typedef enum
 } bszid_t;
 
 
+// A convenient version of the BLIS_XX block size IDs which can be used in bitfields.
+enum
+{
+	BLIS_THREAD_NONE = 0,
+	BLIS_THREAD_KR   = 1 << BLIS_KR,
+	BLIS_THREAD_MR   = 1 << BLIS_MR,
+	BLIS_THREAD_NR   = 1 << BLIS_NR,
+	BLIS_THREAD_MC   = 1 << BLIS_MC,
+	BLIS_THREAD_KC   = 1 << BLIS_KC,
+	BLIS_THREAD_NC   = 1 << BLIS_NC,
+};
+
 // -- Architecture ID type --
 
 // NOTE: This typedef enum must be kept up-to-date with the arch_t
@@ -1116,15 +1090,17 @@ typedef struct mem_s
 
 // -- Control tree node type --
 
+#define BLIS_MAX_SUB_NODES 2
+
 struct cntl_s
 {
-	// Basic fields (usually required).
-	opid_t         family;
-	bszid_t        bszid;
-	void_fp        var_func;
-	struct cntl_s* sub_prenode;
-	struct cntl_s* sub_node;
-	void*          params;
+	// Actually this is an l3_var_oft, but that type hasn't been defined yet
+	void_fp var_func;
+	struct
+	{
+		dim_t          ways;
+		struct cntl_s* sub_node;
+	} sub_nodes[ BLIS_MAX_SUB_NODES ];
 };
 typedef struct cntl_s cntl_t;
 
@@ -1151,6 +1127,17 @@ typedef struct func_s
 
 } func_t;
 
+typedef struct func2_s
+{
+	// Kernel function address.
+	// A func2_t* can be cast to a func_t* in order to access
+	// only the "diagonal" elements (dt,dt) (but note that to accomplish
+	// this those elements are not stored in ptr[dt][dt]...see bli_func.c
+	// for more details).
+	void_fp ptr[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
+
+} func2_t;
+
 
 // -- Multi-boolean object type --
 
@@ -1189,6 +1176,10 @@ typedef struct
 	inc_t ps_a;
 	inc_t ps_b;
 
+	// The row and column offset of the current micro-tile in C.
+	dim_t off_m;
+	dim_t off_n;
+
 	// The type to convert to on output.
 	//num_t  dt_on_output;
 
@@ -1218,31 +1209,6 @@ typedef struct constdata_s
 // -- BLIS object type definitions ---------------------------------------------
 //
 
-// Forward declarations for function pointer types
-struct obj_s;
-struct cntx_s;
-struct rntm_s;
-struct thrinfo_s;
-
-typedef void (*obj_pack_fn_t)
-    (
-      const struct obj_s*     a,
-            struct obj_s*     ap,
-      const struct cntx_s*    cntx,
-      const struct cntl_s*    cntl,
-            struct thrinfo_s* thread
-    );
-
-typedef void (*obj_ker_fn_t)
-    (
-      const struct obj_s*     a,
-      const struct obj_s*     b,
-      const struct obj_s*     c,
-      const struct cntx_s*    cntx,
-      const struct cntl_s*    cntl,
-            struct thrinfo_s* thread
-    );
-
 typedef struct obj_s
 {
 	// Basic fields
@@ -1273,12 +1239,6 @@ typedef struct obj_s
 	dim_t         m_panel;  // m dimension of a "full" panel
 	dim_t         n_panel;  // n dimension of a "full" panel
 
-	// User-customizable fields
-	obj_pack_fn_t pack_fn;
-	void*         pack_params;
-	obj_ker_fn_t  ker_fn;
-	void*         ker_params;
-
 } obj_t;
 
 // Pre-initializors. Things that must be set afterwards:
@@ -1299,7 +1259,7 @@ typedef struct obj_s
 	/* .diag_off    = */ 0, \
 \
 	/* .info        = */ 0x0 | BLIS_BITVAL_DENSE      | \
-	/*              */         BLIS_BITVAL_GENERAL, \
+	/*                */       BLIS_BITVAL_GENERAL, \
 	/* .info2       = */ 0x0, \
 	/* .elem_size   = */ sizeof( float ), /* this is changed later. */ \
 \
@@ -1316,11 +1276,6 @@ typedef struct obj_s
 	/* .pd          = */ 0, \
 	/* .m_panel     = */ 0, \
 	/* .n_panel     = */ 0, \
-\
-	/* .pack_fn     = */ NULL, \
-	/* .pack_params = */ NULL, \
-	/* .ker_fn      = */ NULL, \
-	/* .ker_params  = */ NULL  \
 }
 
 #define BLIS_OBJECT_INITIALIZER_1X1 \
@@ -1332,7 +1287,7 @@ typedef struct obj_s
 	/* .diag_off    = */ 0, \
 \
 	/* .info        = */ 0x0 | BLIS_BITVAL_DENSE      | \
-	/*              */         BLIS_BITVAL_GENERAL, \
+	/*                */       BLIS_BITVAL_GENERAL, \
 	/* .info2       = */ 0x0, \
 	/* .elem_size   = */ sizeof( float ), /* this is changed later. */ \
 \
@@ -1349,11 +1304,6 @@ typedef struct obj_s
 	/* .pd          = */ 0, \
 	/* .m_panel     = */ 0, \
 	/* .n_panel     = */ 0, \
-\
-	/* .pack_fn     = */ NULL, \
-	/* .pack_params = */ NULL, \
-	/* .ker_fn      = */ NULL, \
-	/* .ker_params  = */ NULL  \
 }
 
 // Define these macros here since they must be updated if contents of
@@ -1387,11 +1337,6 @@ BLIS_INLINE void bli_obj_init_full_shallow_copy_of( const obj_t* a, obj_t* b )
 	b->pd          = a->pd;
 	b->m_panel     = a->m_panel;
 	b->n_panel     = a->n_panel;
-
-	b->pack_fn     = a->pack_fn;
-	b->pack_params = a->pack_params;
-	b->ker_fn      = a->ker_fn;
-	b->ker_params  = a->ker_params;
 }
 
 BLIS_INLINE void bli_obj_init_subpart_from( const obj_t* a, obj_t* b )
@@ -1425,11 +1370,6 @@ BLIS_INLINE void bli_obj_init_subpart_from( const obj_t* a, obj_t* b )
 	b->pd          = a->pd;
 	b->m_panel     = a->m_panel;
 	b->n_panel     = a->n_panel;
-
-	b->pack_fn     = a->pack_fn;
-	b->pack_params = a->pack_params;
-	b->ker_fn      = a->ker_fn;
-	b->ker_params  = a->ker_params;
 }
 
 // Initializors for global scalar constants.
@@ -1475,20 +1415,35 @@ BLIS_INLINE void bli_obj_init_subpart_from( const obj_t* a, obj_t* b )
 }
 
 
-// -- Context type --
+// -- Stack type --
 
-typedef struct cntx_s
+// NB: stack_t is already taken by <signal.h>
+typedef struct
 {
-	blksz_t   blkszs[ BLIS_NUM_BLKSZS ];
-	bszid_t   bmults[ BLIS_NUM_BLKSZS ];
+	siz_t elem_size;
+	siz_t block_len;
+	siz_t max_blocks;
+	siz_t size;
+	siz_t capacity;
+
+	void** blocks;
+
+	bli_pthread_mutex_t lock;
+} stck_t;
 
-	func_t    ukrs[ BLIS_NUM_UKRS ];
-	mbool_t   ukr_prefs[ BLIS_NUM_UKR_PREFS ];
 
-	void_fp   l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ];
+// -- Context type --
+
+typedef struct cntx_s
+{
+	stck_t blkszs;
+	stck_t bmults;
 
-	ind_t     method;
+	stck_t ukrs;
+	stck_t ukr2s;
+	stck_t ukr_prefs;
 
+	stck_t l3_sup_handlers;
 } cntx_t;
 
 
@@ -1533,6 +1488,8 @@ typedef enum
 	BLIS_UNDEFINED_ERROR_CODE                  = ( -11),
 	BLIS_NULL_POINTER                          = ( -12),
 	BLIS_NOT_YET_IMPLEMENTED                   = ( -13),
+	BLIS_OUT_OF_BOUNDS                         = ( -14),
+	BLIS_LOCK_FAILURE                          = ( -15),
 
 	// Parameter-specific errors
 	BLIS_INVALID_SIDE                          = ( -20),
@@ -1591,6 +1548,7 @@ typedef enum
 
 	// Packing-specific errors
 	BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK  = (-100),
+	BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_PART    = (-101),
 
 	// Buffer-specific errors
 	BLIS_EXPECTED_NONNULL_OBJECT_BUFFER        = (-110),
@@ -1620,6 +1578,10 @@ typedef enum
 	BLIS_NC_MAX_NONMULTIPLE_OF_NR              = (-163),
 	BLIS_KC_DEF_NONMULTIPLE_OF_KR              = (-164),
 	BLIS_KC_MAX_NONMULTIPLE_OF_KR              = (-165),
+	BLIS_MR_NOT_EVEN_FOR_REAL_TYPE             = (-166),
+	BLIS_PACKMR_NOT_EVEN_FOR_REAL_TYPE         = (-167),
+	BLIS_NR_NOT_EVEN_FOR_REAL_TYPE             = (-168),
+	BLIS_PACKNR_NOT_EVEN_FOR_REAL_TYPE         = (-169),
 
 	BLIS_ERROR_CODE_MAX                        = (-170)
 } err_t;
diff --git a/frame/include/blis.h b/frame/include/blis.h
index 9286fc3ce..085819b43 100644
--- a/frame/include/blis.h
+++ b/frame/include/blis.h
@@ -125,6 +125,8 @@ extern "C" {
 #include "bli_blksz.h"
 #include "bli_func.h"
 #include "bli_mbool.h"
+#include "bli_stack.h"
+#include "bli_check.h"
 #include "bli_cntx.h"
 #include "bli_rntm.h"
 #include "bli_gks.h"
@@ -137,12 +139,12 @@ extern "C" {
 #include "bli_memsys.h"
 #include "bli_mem.h"
 #include "bli_part.h"
+#include "bli_part_cntl.h"
 #include "bli_prune.h"
 #include "bli_query.h"
 #include "bli_auxinfo.h"
 #include "bli_param_map.h"
 #include "bli_clock.h"
-#include "bli_check.h"
 #include "bli_error.h"
 #include "bli_f2c.h"
 #include "bli_machval.h"
diff --git a/frame/include/level0/bb/bli_scal2bbs_mxn.h b/frame/include/level0/bb/bli_scal2bbs_mxn.h
index c4156713f..d6f95f97f 100644
--- a/frame/include/level0/bb/bli_scal2bbs_mxn.h
+++ b/frame/include/level0/bb/bli_scal2bbs_mxn.h
@@ -42,12 +42,12 @@
 \
 BLIS_INLINE void PASTEMAC(ch,opname) \
      ( \
-       const conj_t       conjx, \
-       const dim_t        m, \
-       const dim_t        n, \
-       ctype*    restrict alpha, \
-       ctype*    restrict x, const inc_t incx, const inc_t ldx, \
-       ctype*    restrict y, const inc_t incy, const inc_t ldy  \
+       const conj_t          conjx, \
+       const dim_t           m, \
+       const dim_t           n, \
+       const ctype* restrict alpha, \
+       const ctype* restrict x, const inc_t incx, const inc_t ldx, \
+             ctype* restrict y, const inc_t incy, const inc_t ldy  \
      ) \
 { \
 	/* Assume that the duplication factor is the row stride of y. */ \
@@ -58,13 +58,13 @@ BLIS_INLINE void PASTEMAC(ch,opname) \
 	{ \
 		for ( dim_t j = 0; j < n; ++j ) \
 		{ \
-			ctype* restrict xj = x + j*ldx; \
-			ctype* restrict yj = y + j*ldy; \
+			const ctype* restrict xj = x + j*ldx; \
+			      ctype* restrict yj = y + j*ldy; \
 \
 			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
-				ctype* restrict xij = xj + i*incx; \
-				ctype* restrict yij = yj + i*incy; \
+				const ctype* restrict xij = xj + i*incx; \
+				      ctype* restrict yij = yj + i*incy; \
 \
 				PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \
 \
@@ -81,13 +81,13 @@ BLIS_INLINE void PASTEMAC(ch,opname) \
 	{ \
 		for ( dim_t j = 0; j < n; ++j ) \
 		{ \
-			ctype* restrict xj = x + j*ldx; \
-			ctype* restrict yj = y + j*ldy; \
+			const ctype* restrict xj = x + j*ldx; \
+			      ctype* restrict yj = y + j*ldy; \
 \
 			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
-				ctype* restrict xij = xj + i*incx; \
-				ctype* restrict yij = yj + i*incy; \
+				const ctype* restrict xij = xj + i*incx; \
+				      ctype* restrict yij = yj + i*incy; \
 \
 				PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \
 \
@@ -110,12 +110,12 @@ INSERT_GENTFUNCRO_BASIC( scal2bbs_mxn )
 \
 BLIS_INLINE void PASTEMAC(ch,opname) \
      ( \
-       const conj_t       conjx, \
-       const dim_t        m, \
-       const dim_t        n, \
-       ctype*    restrict alpha, \
-       ctype*    restrict x, const inc_t incx, const inc_t ldx, \
-       ctype*    restrict y, const inc_t incy, const inc_t ldy  \
+       const conj_t          conjx, \
+       const dim_t           m, \
+       const dim_t           n, \
+       const ctype* restrict alpha, \
+       const ctype* restrict x, const inc_t incx, const inc_t ldx, \
+             ctype* restrict y, const inc_t incy, const inc_t ldy  \
      ) \
 { \
 	/* Assume that the duplication factor is the row stride of y. */ \
diff --git a/frame/include/level0/bli_axpbys_mxn.h b/frame/include/level0/bli_axpbys_mxn.h
new file mode 100644
index 000000000..494c5d445
--- /dev/null
+++ b/frame/include/level0/bli_axpbys_mxn.h
@@ -0,0 +1,129 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_AXPBYS_MXN_H
+#define BLIS_AXPBYS_MXN_H
+
+// axpbys_mxn
+
+// Notes:
+// - The first char encodes the type of a.
+// - The second char encodes the type of x.
+// - The third char encodes the type of b.
+// - The fourth char encodes the type of y.
+// - We only implement cases where typeof(a) == typeof(x) && typeof(b) == typeof(y).
+
+#undef  BLIS_ENABLE_CR_CASES
+#define BLIS_ENABLE_CR_CASES 0
+
+// -- bli_????axpbys_mxn --
+
+#undef  GENTFUNC2
+#define GENTFUNC2( ctypex, ctypey, chx, chy, opname, kername ) \
+\
+BLIS_INLINE void PASTEMAC(chx,chx,chy,chy,opname) \
+     ( \
+       const dim_t   m, \
+       const dim_t   n, \
+       const ctypex* alpha, \
+       const ctypex* x, inc_t rs_x, inc_t cs_x, \
+       const ctypey* beta, \
+             ctypey* y, inc_t rs_y, inc_t cs_y  \
+     ) \
+{ \
+	/* If beta is zero, overwrite y with alpha*x (in case y has infs or NaNs). */ \
+	if ( PASTEMAC(chy,eq0)( *beta ) ) \
+	{ \
+		PASTEMAC(chx,chx,chy,scal2s_mxn)( BLIS_NO_CONJUGATE, m, n, alpha, x, rs_x, cs_x, y, rs_y, cs_y ); \
+		return; \
+	} \
+\
+	if      ( BLIS_ENABLE_CR_CASES && rs_x == 1 && rs_y == 1 ) \
+	{ \
+		for ( dim_t jj = 0; jj < n; ++jj ) \
+		for ( dim_t ii = 0; ii < m; ++ii ) \
+		PASTEMAC(chx,chx,chy,chy,kername) \
+		( \
+		  *alpha, *(x + ii + jj*cs_x), \
+		  *beta,  *(y + ii + jj*cs_y) \
+		); \
+	} \
+	else if ( BLIS_ENABLE_CR_CASES && cs_x == 1 && cs_y == 1 ) \
+	{ \
+		for ( dim_t ii = 0; ii < m; ++ii ) \
+		for ( dim_t jj = 0; jj < n; ++jj ) \
+		PASTEMAC(chx,chx,chy,chy,kername) \
+		( \
+		  *alpha, *(x + ii*rs_x + jj), \
+		  *beta,  *(y + ii*rs_y + jj) \
+		); \
+	} \
+	else \
+	{ \
+		for ( dim_t jj = 0; jj < n; ++jj ) \
+		for ( dim_t ii = 0; ii < m; ++ii ) \
+		PASTEMAC(chx,chx,chy,chy,kername) \
+		( \
+		  *alpha, *(x + ii*rs_x + jj*cs_x), \
+		  *beta,  *(y + ii*rs_y + jj*cs_y) \
+		); \
+	} \
+}
+
+INSERT_GENTFUNC2_BASIC ( axpbys_mxn, axpbys )
+INSERT_GENTFUNC2_MIX_DP( axpbys_mxn, axpbys )
+
+
+// -- bli_?axpbys_mxn --
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+\
+BLIS_INLINE void PASTEMAC(ch,opname) \
+     ( \
+       const dim_t  m, \
+       const dim_t  n, \
+       const ctype* alpha, \
+       const ctype* x, inc_t rs_x, inc_t cs_x, \
+       const ctype* beta, \
+             ctype* y, inc_t rs_y, inc_t cs_y  \
+     ) \
+{ \
+    PASTEMAC(ch,ch,ch,ch,opname)( m, n, alpha, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
+}
+
+INSERT_GENTFUNC_BASIC( axpbys_mxn )
+
+
+#endif
diff --git a/frame/include/level0/bli_copys_mxn.h b/frame/include/level0/bli_copys_mxn.h
index cb2513466..4b729376a 100644
--- a/frame/include/level0/bli_copys_mxn.h
+++ b/frame/include/level0/bli_copys_mxn.h
@@ -49,7 +49,7 @@
 #undef  GENTFUNC2
 #define GENTFUNC2( ctypex, ctypey, chx, chy, opname, kername ) \
 \
-BLIS_INLINE void PASTEMAC2(chx,chy,opname) \
+BLIS_INLINE void PASTEMAC(chx,chy,opname) \
      ( \
        const dim_t   m, \
        const dim_t   n, \
@@ -61,21 +61,21 @@ BLIS_INLINE void PASTEMAC2(chx,chy,opname) \
 	{ \
 		for ( dim_t jj = 0; jj < n; ++jj ) \
 		for ( dim_t ii = 0; ii < m; ++ii ) \
-		PASTEMAC2(chx,chy,kername)( *(x + ii + jj*cs_x), \
+		PASTEMAC(chx,chy,kername)( *(x + ii + jj*cs_x), \
 		                            *(y + ii + jj*cs_y) ); \
 	} \
 	else if ( BLIS_ENABLE_CR_CASES && cs_x == 1 && cs_y == 1 ) \
 	{ \
 		for ( dim_t ii = 0; ii < m; ++ii ) \
 		for ( dim_t jj = 0; jj < n; ++jj ) \
-		PASTEMAC2(chx,chy,kername)( *(x + ii*rs_x + jj), \
+		PASTEMAC(chx,chy,kername)( *(x + ii*rs_x + jj), \
 		                            *(y + ii*rs_y + jj) ); \
 	} \
 	else \
 	{ \
 		for ( dim_t jj = 0; jj < n; ++jj ) \
 		for ( dim_t ii = 0; ii < m; ++ii ) \
-		PASTEMAC2(chx,chy,kername)( *(x + ii*rs_x + jj*cs_x), \
+		PASTEMAC(chx,chy,kername)( *(x + ii*rs_x + jj*cs_x), \
 		                            *(y + ii*rs_y + jj*cs_y) ); \
 	} \
 }
@@ -97,7 +97,7 @@ BLIS_INLINE void PASTEMAC(ch,opname) \
              ctype* y, inc_t rs_y, inc_t cs_y  \
      ) \
 { \
-	PASTEMAC2(ch,ch,opname)( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
+	PASTEMAC(ch,ch,opname)( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
 }
 
 INSERT_GENTFUNC_BASIC( copys_mxn )
diff --git a/frame/include/level0/bli_scal2s_mxn.h b/frame/include/level0/bli_scal2s_mxn.h
index d58a37cd0..fdfea4dd9 100644
--- a/frame/include/level0/bli_scal2s_mxn.h
+++ b/frame/include/level0/bli_scal2s_mxn.h
@@ -37,32 +37,43 @@
 
 // scal2s_mxn
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
+// Notes:
+// - The first char encodes the type of a.
+// - The second char encodes the type of x.
+// - The third char encodes the type of y.
+// - We only implement cases where typeof(a) == typeof(x).
+
+#undef  BLIS_ENABLE_CR_CASES
+#define BLIS_ENABLE_CR_CASES 0
+
+// -- bli_???scal2s_mxn --
+
+#undef  GENTFUNC2
+#define GENTFUNC2( ctypex, ctypey, chx, chy, opname, kername ) \
 \
-BLIS_INLINE void PASTEMAC(ch,opname) \
+BLIS_INLINE void PASTEMAC(chx,chx,chy,opname) \
      ( \
-       const conj_t       conjx, \
-       const dim_t        m, \
-       const dim_t        n, \
-       ctype*    restrict alpha, \
-       ctype*    restrict x, const inc_t rs_x, const inc_t cs_x, \
-       ctype*    restrict y, const inc_t rs_y, const inc_t cs_y  \
+       const conj_t  conjx, \
+       const dim_t   m, \
+       const dim_t   n, \
+       const ctypex* alpha, \
+       const ctypex* x, inc_t rs_x, inc_t cs_x, \
+             ctypey* y, inc_t rs_y, inc_t cs_y  \
      ) \
 { \
 	if ( bli_is_conj( conjx ) ) \
 	{ \
 		for ( dim_t j = 0; j < n; ++j ) \
 		{ \
-			ctype* restrict xj = x + j*cs_x; \
-			ctype* restrict yj = y + j*cs_y; \
+			const ctypex* restrict xj = x + j*cs_x; \
+			      ctypey* restrict yj = y + j*cs_y; \
 \
 			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
-				ctype* restrict xij = xj + i*rs_x; \
-				ctype* restrict yij = yj + i*rs_y; \
+				const ctypex* restrict xij = xj + i*rs_x; \
+				      ctypey* restrict yij = yj + i*rs_y; \
 \
-				PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \
+				PASTEMAC(chx,chx,chy,scal2js)( *alpha, *xij, *yij ); \
 			} \
 		} \
 	} \
@@ -70,20 +81,42 @@ BLIS_INLINE void PASTEMAC(ch,opname) \
 	{ \
 		for ( dim_t j = 0; j < n; ++j ) \
 		{ \
-			ctype* restrict xj = x + j*cs_x; \
-			ctype* restrict yj = y + j*cs_y; \
+			const ctypex* restrict xj = x + j*cs_x; \
+			      ctypey* restrict yj = y + j*cs_y; \
 \
 			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
-				ctype* restrict xij = xj + i*rs_x; \
-				ctype* restrict yij = yj + i*rs_y; \
+				const ctypex* restrict xij = xj + i*rs_x; \
+				      ctypey* restrict yij = yj + i*rs_y; \
 \
-				PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \
+				PASTEMAC(chx,chx,chy,scal2s)( *alpha, *xij, *yij ); \
 			} \
 		} \
 	} \
 }
 
+INSERT_GENTFUNC2_BASIC ( scal2s_mxn, scal2s )
+INSERT_GENTFUNC2_MIX_DP( scal2s_mxn, scal2s )
+
+
+// -- bli_?scal2s_mxn --
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+\
+BLIS_INLINE void PASTEMAC(ch,opname) \
+     ( \
+       const conj_t conjx, \
+       const dim_t  m, \
+       const dim_t  n, \
+       const ctype* alpha, \
+       const ctype* x, inc_t rs_x, inc_t cs_x, \
+             ctype* y, inc_t rs_y, inc_t cs_y  \
+     ) \
+{ \
+	PASTEMAC(ch,ch,ch,opname)( conjx, m, n, alpha, x, rs_x, cs_x, y, rs_y, cs_y ); \
+}
+
 INSERT_GENTFUNC_BASIC( scal2s_mxn )
 
 #endif
diff --git a/frame/include/level0/bli_xpbys_mxn.h b/frame/include/level0/bli_xpbys_mxn.h
index 3fae708cd..d3174289f 100644
--- a/frame/include/level0/bli_xpbys_mxn.h
+++ b/frame/include/level0/bli_xpbys_mxn.h
@@ -51,7 +51,7 @@
 #undef  GENTFUNC2
 #define GENTFUNC2( ctypex, ctypey, chx, chy, opname, kername ) \
 \
-BLIS_INLINE void PASTEMAC3(chx,chy,chy,opname) \
+BLIS_INLINE void PASTEMAC(chx,chy,chy,opname) \
      ( \
        const dim_t   m, \
        const dim_t   n, \
@@ -63,7 +63,7 @@ BLIS_INLINE void PASTEMAC3(chx,chy,chy,opname) \
 	/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
 	if ( PASTEMAC(chy,eq0)( *beta ) ) \
 	{ \
-		PASTEMAC2(chx,chy,copys_mxn)( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
+		PASTEMAC(chx,chy,copys_mxn)( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
 		return; \
 	} \
 \
@@ -71,7 +71,7 @@ BLIS_INLINE void PASTEMAC3(chx,chy,chy,opname) \
 	{ \
 		for ( dim_t jj = 0; jj < n; ++jj ) \
 		for ( dim_t ii = 0; ii < m; ++ii ) \
-		PASTEMAC3(chx,chy,chy,kername) \
+		PASTEMAC(chx,chy,chy,kername) \
 		( \
 		  *(x + ii + jj*cs_x), *beta, \
 		  *(y + ii + jj*cs_y) \
@@ -81,7 +81,7 @@ BLIS_INLINE void PASTEMAC3(chx,chy,chy,opname) \
 	{ \
 		for ( dim_t ii = 0; ii < m; ++ii ) \
 		for ( dim_t jj = 0; jj < n; ++jj ) \
-		PASTEMAC3(chx,chy,chy,kername) \
+		PASTEMAC(chx,chy,chy,kername) \
 		( \
 		  *(x + ii*rs_x + jj), *beta, \
 		  *(y + ii*rs_y + jj) \
@@ -91,7 +91,7 @@ BLIS_INLINE void PASTEMAC3(chx,chy,chy,opname) \
 	{ \
 		for ( dim_t jj = 0; jj < n; ++jj ) \
 		for ( dim_t ii = 0; ii < m; ++ii ) \
-		PASTEMAC3(chx,chy,chy,kername) \
+		PASTEMAC(chx,chy,chy,kername) \
 		( \
 		  *(x + ii*rs_x + jj*cs_x), *beta, \
 		  *(y + ii*rs_y + jj*cs_y) \
@@ -117,7 +117,7 @@ BLIS_INLINE void PASTEMAC(ch,opname) \
              ctype* y, inc_t rs_y, inc_t cs_y  \
      ) \
 { \
-    PASTEMAC3(ch,ch,ch,opname)( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
+    PASTEMAC(ch,ch,ch,opname)( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
 }
 
 INSERT_GENTFUNC_BASIC( xpbys_mxn )
diff --git a/frame/thread/bli_thread_range.c b/frame/thread/bli_thread_range.c
index a28e529b0..804d68a93 100644
--- a/frame/thread/bli_thread_range.c
+++ b/frame/thread/bli_thread_range.c
@@ -37,20 +37,17 @@
 
 void bli_thread_range_sub
      (
-       const thrinfo_t* thread,
-             dim_t      n,
-             dim_t      bf,
-             bool       handle_edge_low,
-             dim_t*     start,
-             dim_t*     end
+       dim_t  work_id,
+       dim_t  n_way,
+       dim_t  n,
+       dim_t  bf,
+       bool   handle_edge_low,
+       dim_t* start,
+       dim_t* end
      )
 {
-	dim_t      n_way      = bli_thrinfo_n_way( thread );
-
 	if ( n_way == 1 ) { *start = 0; *end = n; return; }
 
-	dim_t      work_id    = bli_thrinfo_work_id( thread );
-
 	dim_t      all_start  = 0;
 	dim_t      all_end    = n;
 
@@ -181,88 +178,6 @@ void bli_thread_range_sub
 
 // -----------------------------------------------------------------------------
 
-siz_t bli_thread_range_l2r
-     (
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const blksz_t*   bmult,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	num_t dt = bli_obj_dt( a );
-	dim_t m  = bli_obj_length_after_trans( a );
-	dim_t n  = bli_obj_width_after_trans( a );
-	dim_t bf = bli_blksz_get_def( dt, bmult );
-
-	bli_thread_range_sub( thr, n, bf,
-	                      FALSE, start, end );
-
-	return m * ( *end - *start );
-}
-
-siz_t bli_thread_range_r2l
-     (
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const blksz_t*   bmult,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	num_t dt = bli_obj_dt( a );
-	dim_t m  = bli_obj_length_after_trans( a );
-	dim_t n  = bli_obj_width_after_trans( a );
-	dim_t bf = bli_blksz_get_def( dt, bmult );
-
-	bli_thread_range_sub( thr, n, bf,
-	                      TRUE, start, end );
-
-	return m * ( *end - *start );
-}
-
-siz_t bli_thread_range_t2b
-     (
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const blksz_t*   bmult,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	num_t dt = bli_obj_dt( a );
-	dim_t m  = bli_obj_length_after_trans( a );
-	dim_t n  = bli_obj_width_after_trans( a );
-	dim_t bf = bli_blksz_get_def( dt, bmult );
-
-	bli_thread_range_sub( thr, m, bf,
-	                      FALSE, start, end );
-
-	return n * ( *end - *start );
-}
-
-siz_t bli_thread_range_b2t
-     (
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const blksz_t*   bmult,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	num_t dt = bli_obj_dt( a );
-	dim_t m  = bli_obj_length_after_trans( a );
-	dim_t n  = bli_obj_width_after_trans( a );
-	dim_t bf = bli_blksz_get_def( dt, bmult );
-
-	bli_thread_range_sub( thr, m, bf,
-	                      TRUE, start, end );
-
-	return n * ( *end - *start );
-}
-
-// -----------------------------------------------------------------------------
-
 dim_t bli_thread_range_width_l
      (
        doff_t diagoff_j,
@@ -795,327 +710,116 @@ siz_t bli_thread_range_weighted_sub
 siz_t bli_thread_range_mdim
      (
              dir_t      direct,
+             dim_t      bmult,
+             bool       use_weighted,
        const thrinfo_t* thr,
        const obj_t*     a,
        const obj_t*     b,
        const obj_t*     c,
-       const cntl_t*    cntl,
-       const cntx_t*    cntx,
              dim_t*     start,
              dim_t*     end
      )
 {
-	bszid_t  bszid  = bli_cntl_bszid( cntl );
-	opid_t   family = bli_cntl_family( cntl );
-
-	// This is part of trsm's current implementation, whereby right side
-	// cases are implemented in left-side micro-kernels, which requires
-	// we swap the usage of the register blocksizes for the purposes of
-	// packing A and B.
-	if ( family == BLIS_TRSM )
-	{
-		if ( bli_obj_root_is_triangular( a ) ) bszid = BLIS_MR;
-		else                                   bszid = BLIS_NR;
-	}
-
-	const blksz_t* bmult  = bli_cntx_get_bmult( bszid, cntx );
-	const obj_t*   x;
-	bool  use_weighted;
-
-	// Use the operation family to choose the one of the two matrices
-	// being partitioned that potentially has structure, and also to
-	// decide whether or not we need to use weighted range partitioning.
-	// NOTE: It's important that we use non-weighted range partitioning
-	// for hemm and symm (ie: the gemm family) because the weighted
-	// function will mistakenly skip over unstored regions of the
-	// structured matrix, even though they represent part of that matrix
-	// that will be dense and full (after packing).
-	if      ( family == BLIS_GEMM ) { x = a; use_weighted = FALSE; }
-	else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE;  }
-	else if ( family == BLIS_TRMM ) { x = a; use_weighted = TRUE;  }
-	else    /*family == BLIS_TRSM*/ { x = a; use_weighted = FALSE; }
-
-	if ( use_weighted )
-	{
-		if ( direct == BLIS_FWD )
-			return bli_thread_range_weighted_t2b( thr, x, bmult, start, end );
-		else
-			return bli_thread_range_weighted_b2t( thr, x, bmult, start, end );
-	}
-	else
-	{
-		if ( direct == BLIS_FWD )
-			return bli_thread_range_t2b( thr, x, bmult, start, end );
-		else
-			return bli_thread_range_b2t( thr, x, bmult, start, end );
-	}
+	( void )b;
+	return bli_thread_range
+	(
+	  thr,
+	  bli_obj_is_upper_or_lower( c ) ? c : a,
+	  bmult,
+	  direct,
+	  BLIS_M,
+	  use_weighted,
+	  start,
+	  end
+	);
 }
 
 siz_t bli_thread_range_ndim
      (
              dir_t      direct,
+             dim_t      bmult,
+             bool       use_weighted,
        const thrinfo_t* thr,
        const obj_t*     a,
        const obj_t*     b,
        const obj_t*     c,
-       const cntl_t*    cntl,
-       const cntx_t*    cntx,
              dim_t*     start,
              dim_t*     end
      )
 {
-	bszid_t  bszid  = bli_cntl_bszid( cntl );
-	opid_t   family = bli_cntl_family( cntl );
-
-	// This is part of trsm's current implementation, whereby right side
-	// cases are implemented in left-side micro-kernels, which requires
-	// we swap the usage of the register blocksizes for the purposes of
-	// packing A and B.
-	if ( family == BLIS_TRSM )
-	{
-		if ( bli_obj_root_is_triangular( b ) ) bszid = BLIS_MR;
-		else                                   bszid = BLIS_NR;
-	}
-
-	const blksz_t* bmult  = bli_cntx_get_bmult( bszid, cntx );
-	const obj_t*   x;
-	bool  use_weighted;
-
-	// Use the operation family to choose the one of the two matrices
-	// being partitioned that potentially has structure, and also to
-	// decide whether or not we need to use weighted range partitioning.
-	// NOTE: It's important that we use non-weighted range partitioning
-	// for hemm and symm (ie: the gemm family) because the weighted
-	// function will mistakenly skip over unstored regions of the
-	// structured matrix, even though they represent part of that matrix
-	// that will be dense and full (after packing).
-	if      ( family == BLIS_GEMM ) { x = b; use_weighted = FALSE; }
-	else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE;  }
-	else if ( family == BLIS_TRMM ) { x = b; use_weighted = TRUE;  }
-	else    /*family == BLIS_TRSM*/ { x = b; use_weighted = FALSE; }
-
-	if ( use_weighted )
-	{
-		if ( direct == BLIS_FWD )
-			return bli_thread_range_weighted_l2r( thr, x, bmult, start, end );
-		else
-			return bli_thread_range_weighted_r2l( thr, x, bmult, start, end );
-	}
-	else
-	{
-		if ( direct == BLIS_FWD )
-			return bli_thread_range_l2r( thr, x, bmult, start, end );
-		else
-			return bli_thread_range_r2l( thr, x, bmult, start, end );
-	}
+	( void )a;
+	return bli_thread_range
+	(
+	  thr,
+	  bli_obj_is_upper_or_lower( c ) ? c : b,
+	  bmult,
+	  direct,
+	  BLIS_N,
+	  use_weighted,
+	  start,
+	  end
+	);
 }
 
 // -----------------------------------------------------------------------------
 
-siz_t bli_thread_range_weighted_l2r
+siz_t bli_thread_range
      (
        const thrinfo_t* thr,
        const obj_t*     a,
-       const blksz_t*   bmult,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	siz_t area;
-
-	// This function assigns area-weighted ranges in the n dimension
-	// where the total range spans 0 to n-1 with 0 at the left end and
-	// n-1 at the right end.
-
-	if ( bli_obj_intersects_diag( a ) &&
-	     bli_obj_is_upper_or_lower( a ) )
-	{
-		num_t  dt      = bli_obj_dt( a );
-		doff_t diagoff = bli_obj_diag_offset( a );
-		uplo_t uplo    = bli_obj_uplo( a );
-		dim_t  m       = bli_obj_length( a );
-		dim_t  n       = bli_obj_width( a );
-		dim_t  bf      = bli_blksz_get_def( dt, bmult );
-
-		// Support implicit transposition.
-		if ( bli_obj_has_trans( a ) )
-		{
-			bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
-		}
-
-		area =
-		bli_thread_range_weighted_sub
-		(
-		  thr, diagoff, uplo, uplo, m, n, bf,
-		  FALSE, start, end
-		);
-	}
-	else // if dense or zeros
-	{
-		area = bli_thread_range_l2r
-		(
-		  thr, a, bmult,
-		  start, end
-		);
-	}
-
-	return area;
-}
-
-siz_t bli_thread_range_weighted_r2l
-     (
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const blksz_t*   bmult,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	siz_t area;
-
-	// This function assigns area-weighted ranges in the n dimension
-	// where the total range spans 0 to n-1 with 0 at the right end and
-	// n-1 at the left end.
-
-	if ( bli_obj_intersects_diag( a ) &&
-	     bli_obj_is_upper_or_lower( a ) )
-	{
-		num_t  dt      = bli_obj_dt( a );
-		doff_t diagoff = bli_obj_diag_offset( a );
-		uplo_t uplo    = bli_obj_uplo( a );
-		dim_t  m       = bli_obj_length( a );
-		dim_t  n       = bli_obj_width( a );
-		dim_t  bf      = bli_blksz_get_def( dt, bmult );
-
-		// Support implicit transposition.
-		if ( bli_obj_has_trans( a ) )
-		{
-			bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
-		}
-
-		bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n );
-
-		area =
-		bli_thread_range_weighted_sub
-		(
-		  thr, diagoff, uplo, uplo, m, n, bf,
-		  TRUE, start, end
-		);
-	}
-	else // if dense or zeros
-	{
-		area = bli_thread_range_r2l
-		(
-		  thr, a, bmult,
-		  start, end
-		);
-	}
-
-	return area;
-}
-
-siz_t bli_thread_range_weighted_t2b
-     (
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const blksz_t*   bmult,
+             dim_t      bf,
+             dir_t      direct,
+             mdim_t     dim,
+             bool       use_weighted,
              dim_t*     start,
              dim_t*     end
      )
 {
-	siz_t area;
-
-	// This function assigns area-weighted ranges in the m dimension
-	// where the total range spans 0 to m-1 with 0 at the top end and
-	// m-1 at the bottom end.
-
-	if ( bli_obj_intersects_diag( a ) &&
-	     bli_obj_is_upper_or_lower( a ) )
+	dim_t  m       = bli_obj_length( a );
+	dim_t  n       = bli_obj_width( a );
+	doff_t diagoff = bli_obj_diag_offset( a );
+	uplo_t uplo    = bli_obj_uplo( a );
+
+	// Support implicit transposition.
+	if ( ( dim == BLIS_M && !bli_obj_has_trans( a ) ) ||
+	     ( dim == BLIS_N &&  bli_obj_has_trans( a ) ) )
 	{
-		num_t  dt      = bli_obj_dt( a );
-		doff_t diagoff = bli_obj_diag_offset( a );
-		uplo_t uplo    = bli_obj_uplo( a );
-		dim_t  m       = bli_obj_length( a );
-		dim_t  n       = bli_obj_width( a );
-		dim_t  bf      = bli_blksz_get_def( dt, bmult );
-
-		// Support implicit transposition.
-		if ( bli_obj_has_trans( a ) )
-		{
-			bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
-		}
-
 		bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
-
-		area =
-		bli_thread_range_weighted_sub
-		(
-		  thr, diagoff, uplo, uplo, m, n, bf,
-		  FALSE, start, end
-		);
-	}
-	else // if dense or zeros
-	{
-		area = bli_thread_range_t2b
-		(
-		  thr, a, bmult,
-		  start, end
-		);
 	}
 
-	return area;
-}
-
-siz_t bli_thread_range_weighted_b2t
-     (
-       const thrinfo_t* thr,
-       const obj_t*     a,
-       const blksz_t*   bmult,
-             dim_t*     start,
-             dim_t*     end
-     )
-{
-	siz_t area;
+	// Edge cases are handled at the "low" end of the index range when
+	// moving backwards through the matrix.
+	bool handle_edge_low = ( direct == BLIS_BWD );
 
-	// This function assigns area-weighted ranges in the m dimension
-	// where the total range spans 0 to m-1 with 0 at the bottom end and
-	// m-1 at the top end.
-
-	if ( bli_obj_intersects_diag( a ) &&
+	if ( use_weighted &&
+	     bli_obj_intersects_diag( a ) &&
 	     bli_obj_is_upper_or_lower( a ) )
 	{
-		num_t  dt      = bli_obj_dt( a );
-		doff_t diagoff = bli_obj_diag_offset( a );
-		uplo_t uplo    = bli_obj_uplo( a );
-		dim_t  m       = bli_obj_length( a );
-		dim_t  n       = bli_obj_width( a );
-		dim_t  bf      = bli_blksz_get_def( dt, bmult );
-
-		// Support implicit transposition.
-		if ( bli_obj_has_trans( a ) )
+		if ( direct == BLIS_BWD )
 		{
-			bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
+			bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n );
 		}
 
-		bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
-
-		bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n );
-
-		area = bli_thread_range_weighted_sub
+		return bli_thread_range_weighted_sub
 		(
 		  thr, diagoff, uplo, uplo, m, n, bf,
-		  TRUE, start, end
+		  handle_edge_low, start, end
 		);
 	}
-	else // if dense or zeros
+	else // if unweighted, dense, or zeros
 	{
-		area = bli_thread_range_b2t
+		bli_thread_range_sub
 		(
-		  thr, a, bmult,
-		  start, end
+		  bli_thrinfo_work_id( thr ),
+		  bli_thrinfo_n_way( thr ),
+		  n,
+		  bf,
+		  handle_edge_low,
+		  start,
+		  end
 		);
-	}
 
-	return area;
+		return m * ( *end - *start );
+	}
 }
 
diff --git a/frame/thread/bli_thread_range.h b/frame/thread/bli_thread_range.h
index cf966b5a3..2a84a76fb 100644
--- a/frame/thread/bli_thread_range.h
+++ b/frame/thread/bli_thread_range.h
@@ -41,54 +41,46 @@
 
 BLIS_EXPORT_BLIS void bli_thread_range_sub
      (
-       const thrinfo_t* thread,
-             dim_t      n,
-             dim_t      bf,
-             bool       handle_edge_low,
-             dim_t*     start,
-             dim_t*     end
+       dim_t  work_id,
+       dim_t  n_way,
+       dim_t  n,
+       dim_t  bf,
+       bool   handle_edge_low,
+       dim_t* start,
+       dim_t* end
      );
 
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-siz_t PASTEMAC0( opname ) \
+siz_t PASTEMAC( opname ) \
      ( \
              dir_t      direct, \
+             dim_t      bmult, \
+             bool       use_weighted, \
        const thrinfo_t* thr, \
        const obj_t*     a, \
        const obj_t*     b, \
        const obj_t*     c, \
-       const cntl_t*    cntl, \
-       const cntx_t*    cntx, \
              dim_t*     start, \
-             dim_t*     end  \
+             dim_t*     end \
      );
 
 GENPROT( thread_range_mdim )
 GENPROT( thread_range_ndim )
 
-#undef  GENPROT
-#define GENPROT( opname ) \
-\
-siz_t PASTEMAC0( opname ) \
-     ( \
-       const thrinfo_t* thr, \
-       const obj_t*     a, \
-       const blksz_t*   bmult, \
-             dim_t*     start, \
-             dim_t*     end  \
-     );
-
-GENPROT( thread_range_l2r )
-GENPROT( thread_range_r2l )
-GENPROT( thread_range_t2b )
-GENPROT( thread_range_b2t )
 
-GENPROT( thread_range_weighted_l2r )
-GENPROT( thread_range_weighted_r2l )
-GENPROT( thread_range_weighted_t2b )
-GENPROT( thread_range_weighted_b2t )
+BLIS_EXPORT_BLIS siz_t bli_thread_range
+     (
+       const thrinfo_t* thr,
+       const obj_t*     a,
+             dim_t      bf,
+             dir_t      direct,
+             mdim_t     dim,
+             bool       use_weighted,
+             dim_t*     start,
+             dim_t*     end
+     );
 
 
 dim_t bli_thread_range_width_l
diff --git a/frame/thread/bli_thread_range_slab_rr.c b/frame/thread/bli_thread_range_slab_rr.c
index be4432309..59fba399b 100644
--- a/frame/thread/bli_thread_range_slab_rr.c
+++ b/frame/thread/bli_thread_range_slab_rr.c
@@ -49,13 +49,12 @@ void bli_thread_range_quad
              dim_t*     inc
      )
 {
+	const dim_t tid   = bli_thrinfo_work_id( thread );
+	const dim_t jr_nt = bli_thrinfo_n_way( thread );
+	const dim_t n_iter = n / bf + ( n % bf ? 1 : 0 );
 
 #ifdef BLIS_ENABLE_JRIR_RR
 
-	const dim_t tid    = bli_thrinfo_work_id( thread );
-	const dim_t jr_nt  = bli_thrinfo_n_way( thread );
-	const dim_t n_iter = n / bf + ( n % bf ? 1 : 0 );
-
 	// Use round-robin (interleaved) partitioning of jr/ir loops.
 	*start = tid;
 	*end   = n_iter;
@@ -69,9 +68,6 @@ void bli_thread_range_quad
 	// is defined, since the function is only called from macrokernels that were
 	// designed for slab/rr partitioning.
 
-	const dim_t jr_nt = bli_thrinfo_n_way( thread );
-	const dim_t n_iter = n / bf + ( n % bf ? 1 : 0 );
-
 	// If there is no parallelism in this loop, set the output variables
 	// and return early.
 	if ( jr_nt == 1 ) { *start = 0; *end = n_iter; *inc = 1; return; }
@@ -99,7 +95,7 @@ void bli_thread_range_quad
 
 		bli_thread_range_sub
 		(
-		  thread, n, bf,
+		  tid, jr_nt, n, bf,
 		  handle_edge_low, &st, &en
 		);
 		in = bf;
diff --git a/frame/thread/bli_thread_range_slab_rr.h b/frame/thread/bli_thread_range_slab_rr.h
index 3e9797363..47d1a5f5c 100644
--- a/frame/thread/bli_thread_range_slab_rr.h
+++ b/frame/thread/bli_thread_range_slab_rr.h
@@ -37,17 +37,16 @@
 
 BLIS_INLINE void bli_thread_range_rr
      (
-       const thrinfo_t* thread,
-             dim_t      n,
-             dim_t      bf,
-             bool       handle_edge_low,
-             dim_t*     start,
-             dim_t*     end,
-             dim_t*     inc
+       dim_t  tid,
+       dim_t  nt,
+       dim_t  n,
+       dim_t  bf,
+       bool   handle_edge_low,
+       dim_t* start,
+       dim_t* end,
+       dim_t* inc
      )
 {
-	const dim_t tid    = bli_thrinfo_work_id( thread );
-	const dim_t nt     = bli_thrinfo_n_way( thread );
 	const dim_t n_iter = n / bf + ( n % bf ? 1 : 0 );
 
 	// Use round-robin (interleaved) partitioning of jr/ir loops.
@@ -58,29 +57,31 @@ BLIS_INLINE void bli_thread_range_rr
 
 BLIS_INLINE void bli_thread_range_sl
      (
-       const thrinfo_t* thread,
-             dim_t      n,
-             dim_t      bf,
-             bool       handle_edge_low,
-             dim_t*     start,
-             dim_t*     end,
-             dim_t*     inc
+       dim_t  work_id,
+       dim_t  n_way,
+       dim_t  n,
+       dim_t  bf,
+       bool   handle_edge_low,
+       dim_t* start,
+       dim_t* end,
+       dim_t* inc
      )
 {
 	// Use contiguous slab partitioning of jr/ir loops.
-	bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end );
+	bli_thread_range_sub( work_id, n_way, n, bf, handle_edge_low, start, end );
 	*inc = 1;
 }
 
 BLIS_INLINE void bli_thread_range_slrr
      (
-       const thrinfo_t* thread,
-             dim_t      n,
-             dim_t      bf,
-             bool       handle_edge_low,
-             dim_t*     start,
-             dim_t*     end,
-             dim_t*     inc
+       dim_t  work_id,
+       dim_t  n_way,
+       dim_t  n,
+       dim_t  bf,
+       bool   handle_edge_low,
+       dim_t* start,
+       dim_t* end,
+       dim_t* inc
      )
 {
 	// Define a general-purpose slab/rr function whose definition depends on
@@ -90,9 +91,9 @@ BLIS_INLINE void bli_thread_range_slrr
 	// are used together by packm.
 
 #ifdef BLIS_ENABLE_JRIR_RR
-	bli_thread_range_rr( thread, n, bf, handle_edge_low, start, end, inc );
+	bli_thread_range_rr( work_id, n_way, n, bf, handle_edge_low, start, end, inc );
 #else // ifdef ( _SLAB || _TLB )
-	bli_thread_range_sl( thread, n, bf, handle_edge_low, start, end, inc );
+	bli_thread_range_sl( work_id, n_way, n, bf, handle_edge_low, start, end, inc );
 #endif
 }
 
diff --git a/frame/thread/bli_thread_range_tlb.c b/frame/thread/bli_thread_range_tlb.c
index cdc2950f8..e8faa6269 100644
--- a/frame/thread/bli_thread_range_tlb.c
+++ b/frame/thread/bli_thread_range_tlb.c
@@ -1230,6 +1230,8 @@ dim_t bli_thread_range_tlb_trmm_rl_impl
              inc_t* i_en_p
      )
 {
+	( void )mr;
+
 	// Assumption: 0 <= diagoff. Make sure to prune leading rows beforehand!
 	if ( diagoff < 0 ) bli_abort();
 
diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c
index f48e70bb6..9f7960f94 100644
--- a/frame/thread/bli_thrinfo.c
+++ b/frame/thread/bli_thrinfo.c
@@ -37,6 +37,21 @@
 
 #define BLIS_NUM_STATIC_COMMS 80
 
+void bli_thrinfo_attach_sub_node( thrinfo_t* sub_node, thrinfo_t* t )
+{
+	dim_t next = 0;
+	for ( ; next < BLIS_MAX_SUB_NODES; next++ )
+	{
+		if ( bli_thrinfo_sub_node( next, t ) == NULL )
+			break;
+	}
+
+	if ( next == BLIS_MAX_SUB_NODES )
+		bli_abort();
+
+	bli_thrinfo_set_sub_node( next, sub_node, t );
+}
+
 thrinfo_t* bli_thrinfo_create_root
      (
        thrcomm_t* comm,
@@ -83,8 +98,8 @@ thrinfo_t* bli_thrinfo_create
 	bli_thrinfo_set_pba( pba, thread );
 	bli_mem_clear( bli_thrinfo_mem( thread ) );
 
-	bli_thrinfo_set_sub_node( NULL, thread );
-	bli_thrinfo_set_sub_prenode( NULL, thread );
+	for ( dim_t i = 0; i < BLIS_MAX_SUB_NODES; i++ )
+		bli_thrinfo_set_sub_node( i, NULL, thread );
 
 	return thread;
 }
@@ -96,22 +111,16 @@ void bli_thrinfo_free
 {
 	if ( thread == NULL ) return;
 
-	thrinfo_t* thrinfo_sub_prenode = bli_thrinfo_sub_prenode( thread );
-	thrinfo_t* thrinfo_sub_node    = bli_thrinfo_sub_node( thread );
-	pool_t*    sba_pool            = bli_thrinfo_sba_pool( thread );
-	mem_t*     cntl_mem_p          = bli_thrinfo_mem( thread );
-	pba_t*     pba                 = bli_thrinfo_pba( thread );
-
-	// Recursively free all children of the current thrinfo_t.
-	if ( thrinfo_sub_prenode != NULL )
-	{
-		bli_thrinfo_free( thrinfo_sub_prenode );
-	}
+	pool_t* sba_pool   = bli_thrinfo_sba_pool( thread );
+	mem_t*  cntl_mem_p = bli_thrinfo_mem( thread );
+	pba_t*  pba        = bli_thrinfo_pba( thread );
 
 	// Recursively free all children of the current thrinfo_t.
-	if ( thrinfo_sub_node != NULL )
+	for ( dim_t i = 0; i < BLIS_MAX_SUB_NODES; i++ )
 	{
-		bli_thrinfo_free( thrinfo_sub_node );
+		thrinfo_t* thrinfo_sub_node = bli_thrinfo_sub_node( i, thread );
+		if ( thrinfo_sub_node != NULL )
+			bli_thrinfo_free( thrinfo_sub_node );
 	}
 
 	// Free the communicators, but only if the current thrinfo_t struct
@@ -269,7 +278,7 @@ void bli_thrinfo_print_sub
 	        ( unsigned long )bli_thrinfo_work_id( thread ),
 	        ( unsigned long )bli_thrinfo_needs_free_comm( thread ));
 
-	bli_thrinfo_print_sub( bli_thrinfo_sub_prenode( thread ), level+1 );
-	bli_thrinfo_print_sub( bli_thrinfo_sub_node( thread ), level+1 );
+	for ( dim_t i = 0; i < BLIS_MAX_SUB_NODES; i++ )
+		bli_thrinfo_print_sub( bli_thrinfo_sub_node( i, thread ), level+1 );
 }
 
diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h
index d15fb49f6..8a63c5974 100644
--- a/frame/thread/bli_thrinfo.h
+++ b/frame/thread/bli_thrinfo.h
@@ -68,8 +68,8 @@ struct thrinfo_s
 	// Storage for allocated memory obtained from the packing block allocator.
 	mem_t              mem;
 
-	struct thrinfo_s*  sub_prenode;
-	struct thrinfo_s*  sub_node;
+	// Child thread info nodes.
+	struct thrinfo_s*  sub_nodes[ BLIS_MAX_SUB_NODES ];
 };
 typedef struct thrinfo_s thrinfo_t;
 
@@ -124,14 +124,9 @@ BLIS_INLINE mem_t* bli_thrinfo_mem( thrinfo_t* t )
 	return &t->mem;
 }
 
-BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( const thrinfo_t* t )
+BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( dim_t which, const thrinfo_t* t )
 {
-	return t->sub_node;
-}
-
-BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( const thrinfo_t* t )
-{
-	return t->sub_prenode;
+	return t->sub_nodes[ which ];
 }
 
 // thrinfo_t query (complex)
@@ -178,15 +173,12 @@ BLIS_INLINE void bli_thrinfo_set_pba( pba_t* pba, thrinfo_t* t )
 	t->pba = pba;
 }
 
-BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t )
+BLIS_INLINE void bli_thrinfo_set_sub_node( dim_t which, thrinfo_t* sub_node, thrinfo_t* t )
 {
-	t->sub_node = sub_node;
+	t->sub_nodes[ which ] = sub_node;
 }
 
-BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t )
-{
-	t->sub_prenode = sub_prenode;
-}
+void bli_thrinfo_attach_sub_node( thrinfo_t* sub_node, thrinfo_t* t );
 
 // other thrinfo_t-related functions
 
diff --git a/frame/util/bli_util_fpa.c b/frame/util/bli_util_fpa.c
index 4ed95d4c9..bbba052c6 100644
--- a/frame/util/bli_util_fpa.c
+++ b/frame/util/bli_util_fpa.c
@@ -41,13 +41,13 @@
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \
+GENARRAY_FPA( PASTECH(opname,BLIS_TAPI_EX_SUF,_vft), \
               PASTECH(opname,BLIS_TAPI_EX_SUF) ); \
 \
-PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
-PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
+PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) \
+PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
 { \
-	return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
+	return PASTECH(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
 }
 
 GENFRONT( asumv )
@@ -78,7 +78,7 @@ GENARRAY_FPA( void_fp, opname ); \
 */ \
 \
 GENARRAY_FPA( PASTECH(opname,_vft), \
-              PASTECH0(opname) ); \
+              PASTECH(opname) ); \
 \
 PASTECH(opname,_vft) \
 PASTEMAC(opname,_qfp)( num_t dt ) \
diff --git a/frame/util/bli_util_fpa.h b/frame/util/bli_util_fpa.h
index f4b67ba36..5ee0f4adb 100644
--- a/frame/util/bli_util_fpa.h
+++ b/frame/util/bli_util_fpa.h
@@ -39,8 +39,8 @@
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
-PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );
+PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) \
+PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );
 
 GENPROT( asumv )
 GENPROT( mkherm )
diff --git a/frame/util/bli_util_ft.h b/frame/util/bli_util_ft.h
index 39c27bd9a..2bb1943d7 100644
--- a/frame/util/bli_util_ft.h
+++ b/frame/util/bli_util_ft.h
@@ -42,7 +42,7 @@
 #undef  GENTDEFR
 #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              dim_t    n, \
        const ctype*   x, inc_t incx, \
@@ -57,7 +57,7 @@ INSERT_GENTDEFR( asumv )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
        uplo_t  uploa, \
        dim_t   m, \
@@ -74,7 +74,7 @@ INSERT_GENTDEF( mktrim )
 #undef  GENTDEFR
 #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              dim_t    n, \
        const ctype*   x, inc_t incx, \
@@ -91,7 +91,7 @@ INSERT_GENTDEFR( normiv )
 #undef  GENTDEFR
 #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              doff_t   diagoffx, \
              diag_t   diagx, \
@@ -112,7 +112,7 @@ INSERT_GENTDEFR( normim )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              FILE*  file, \
        const char*  s1, \
@@ -129,7 +129,7 @@ INSERT_GENTDEF( fprintv )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              FILE*  file, \
        const char*  s1, \
@@ -147,7 +147,7 @@ INSERT_GENTDEF( fprintm )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
        dim_t   n, \
        ctype*  x, inc_t incx  \
@@ -162,7 +162,7 @@ INSERT_GENTDEF( randnv )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
        doff_t  diagoffx, \
        uplo_t  uplox, \
@@ -180,7 +180,7 @@ INSERT_GENTDEF( randnm )
 #undef  GENTDEFR
 #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
 \
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \
      ( \
              dim_t    n, \
        const ctype*   x, inc_t incx, \
@@ -202,7 +202,7 @@ INSERT_GENTDEFR( sumsqv )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH2(ch,opname,tsuf)) \
+typedef void (*PASTECH(ch,opname,tsuf)) \
      ( \
              conj_t conjchi, \
        const ctype* chi, \
@@ -217,7 +217,7 @@ INSERT_GENTDEF( eqsc )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH2(ch,opname,tsuf)) \
+typedef void (*PASTECH(ch,opname,tsuf)) \
      ( \
              conj_t conjx, \
              dim_t  n, \
@@ -233,7 +233,7 @@ INSERT_GENTDEF( eqv )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH2(ch,opname,tsuf)) \
+typedef void (*PASTECH(ch,opname,tsuf)) \
      ( \
              doff_t  diagoffx, \
              diag_t  diagx, \
@@ -253,7 +253,7 @@ INSERT_GENTDEF( eqm )
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
 \
-typedef void (*PASTECH2(ch,opname,tsuf)) \
+typedef void (*PASTECH(ch,opname,tsuf)) \
      ( \
        const ctype* chi, \
        const ctype* psi, \
diff --git a/frame/util/bli_util_oapi.c b/frame/util/bli_util_oapi.c
index 8223ffff8..4810b6f00 100644
--- a/frame/util/bli_util_oapi.c
+++ b/frame/util/bli_util_oapi.c
@@ -67,8 +67,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -109,8 +109,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -153,8 +153,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -202,8 +202,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -248,8 +248,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -292,8 +292,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -339,8 +339,8 @@ void PASTEMAC(opname,EX_SUF) \
 \
 	/* Query a type-specific function pointer, except one that uses
 	   void* for function arguments instead of typed pointers. */ \
-	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
-	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+	PASTECH(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
 \
 	f \
 	( \
@@ -364,7 +364,7 @@ GENFRONT( sumsqv )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t* chi, \
        const obj_t* psi, \
@@ -425,7 +425,7 @@ GENFRONT( eqsc )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t* x, \
        const obj_t* y, \
@@ -472,7 +472,7 @@ GENFRONT( eqv )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t* x, \
        const obj_t* y, \
@@ -529,7 +529,7 @@ GENFRONT( eqm )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const obj_t* chi, \
        const obj_t* psi, \
@@ -579,7 +579,7 @@ GENFRONT( gtesc )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
              FILE*  file, \
        const char*  s1, \
@@ -627,7 +627,7 @@ GENFRONT( fprintv )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
              FILE*  file, \
        const char*  s1, \
@@ -693,7 +693,7 @@ GENFRONT( fprintm )
 #undef  GENFRONT
 #define GENFRONT( opname, varname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        const char*  s1, \
        const obj_t* x, \
@@ -704,7 +704,7 @@ void PASTEMAC0(opname) \
 	bli_init_once(); \
 \
 	/* Invoke the typed function. */ \
-	PASTEMAC0(varname) \
+	PASTEMAC(varname) \
 	( \
 	  stdout, \
 	  s1, \
diff --git a/frame/util/bli_util_oapi.h b/frame/util/bli_util_oapi.h
index 682a58cb3..2a1d700d8 100644
--- a/frame/util/bli_util_oapi.h
+++ b/frame/util/bli_util_oapi.h
@@ -143,7 +143,7 @@ GENPROT( sumsqv )
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
        const obj_t* x, \
        const obj_t* y, \
@@ -162,7 +162,7 @@ GENPROT( gtesc )
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
              FILE*  file, \
        const char*  s1, \
@@ -178,7 +178,7 @@ GENPROT( fprintm )
 #undef  GENPROT
 #define GENPROT( opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
+BLIS_EXPORT_BLIS void PASTEMAC(opname) \
      ( \
        const char*  s1, \
        const obj_t* x, \
diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c
index faa35e039..c3521f244 100644
--- a/frame/util/bli_util_tapi.c
+++ b/frame/util/bli_util_tapi.c
@@ -43,7 +43,7 @@
 #undef  GENTFUNCR
 #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              dim_t    n, \
        const ctype*   x, inc_t incx, \
@@ -68,7 +68,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	/* Invoke the helper variant, which loops over the appropriate kernel
 	   to implement the current operation. */ \
-	PASTEMAC2(ch,opname,_unb_var1) \
+	PASTEMAC(ch,opname,_unb_var1) \
 	( \
 	  n, \
 	  ( ctype* )x, incx, \
@@ -84,7 +84,7 @@ INSERT_GENTFUNCR_BASIC( asumv )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
        uplo_t uploa, \
        dim_t  m, \
@@ -104,7 +104,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	/* Invoke the helper variant, which loops over the appropriate kernel
 	   to implement the current operation. */ \
-	PASTEMAC2(ch,opname,_unb_var1) \
+	PASTEMAC(ch,opname,_unb_var1) \
 	( \
 	  uploa, \
 	  m, \
@@ -122,7 +122,7 @@ INSERT_GENTFUNC_BASIC( mktrim )
 #undef  GENTFUNCR
 #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              dim_t    n, \
        const ctype*   x, inc_t incx, \
@@ -147,7 +147,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	/* Invoke the helper variant, which loops over the appropriate kernel
 	   to implement the current operation. */ \
-	PASTEMAC2(ch,opname,_unb_var1) \
+	PASTEMAC(ch,opname,_unb_var1) \
 	( \
 	  n, \
 	  ( ctype* )x, incx, \
@@ -165,7 +165,7 @@ INSERT_GENTFUNCR_BASIC( normiv )
 #undef  GENTFUNCR
 #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t   diagoffx, \
              diag_t   diagx, \
@@ -194,7 +194,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	/* Invoke the helper variant, which loops over the appropriate kernel
 	   to implement the current operation. */ \
-	PASTEMAC2(ch,opname,_unb_var1) \
+	PASTEMAC(ch,opname,_unb_var1) \
 	( \
 	  diagoffx, \
 	  diagx, \
@@ -216,7 +216,7 @@ INSERT_GENTFUNCR_BASIC( normim )
 #undef  GENTFUNCR
 #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
        dim_t  n, \
        ctype* x, inc_t incx  \
@@ -243,7 +243,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	{ \
 		/* Invoke the helper variant, which loops over the appropriate kernel
 		   to implement the current operation. */ \
-		PASTEMAC2(ch,opname,_unb_var1) \
+		PASTEMAC(ch,opname,_unb_var1) \
 		( \
 		  n, \
 		  x, incx, \
@@ -254,7 +254,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 		/* Check the 1-norm of the randomzied vector. In the unlikely event that
 		   the 1-norm is zero, it means that *all* elements are zero, in which
 		   case we want to re-randomize until the 1-norm is not zero. */ \
-		PASTEMAC2(ch,norm1v,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,norm1v,BLIS_TAPI_EX_SUF) \
 		( \
 		  n, \
 		  x, incx, \
@@ -272,7 +272,7 @@ INSERT_GENTFUNCR_BASIC( randnv )
 #undef  GENTFUNCR
 #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
        doff_t diagoffx, \
        uplo_t uplox, \
@@ -302,7 +302,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	{ \
 		/* Invoke the helper variant, which loops over the appropriate kernel
 		   to implement the current operation. */ \
-		PASTEMAC2(ch,opname,_unb_var1) \
+		PASTEMAC(ch,opname,_unb_var1) \
 		( \
 		  diagoffx, \
 		  uplox, \
@@ -316,7 +316,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 		/* Check the 1-norm of the randomzied matrix. In the unlikely event that
 		   the 1-norm is zero, it means that *all* elements are zero, in which
 		   case we want to re-randomize until the 1-norm is not zero. */ \
-		PASTEMAC2(ch,norm1m,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,norm1m,BLIS_TAPI_EX_SUF) \
 		( \
 		  diagoffx, \
 		  BLIS_NONUNIT_DIAG, \
@@ -338,7 +338,7 @@ INSERT_GENTFUNCR_BASIC( randnm )
 #undef  GENTFUNCR
 #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
 \
-void PASTEMAC2(ch,opname,EX_SUF) \
+void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              dim_t    n, \
        const ctype*   x, inc_t incx, \
@@ -359,7 +359,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 \
 	/* Invoke the helper variant, which loops over the appropriate kernel
 	   to implement the current operation. */ \
-	PASTEMAC2(ch,opname,_unb_var1) \
+	PASTEMAC(ch,opname,_unb_var1) \
 	( \
 	  n, \
 	  ( ctype* )x, incx, \
@@ -421,7 +421,7 @@ void PASTEMAC(ch,opname) \
 	/* Obtain a valid context from the gks if necessary. */ \
 	/*if ( cntx == NULL ) cntx = bli_gks_query_cntx();*/ \
 \
-	*is_eq = PASTEMAC2(ch,opname,_unb_var1) \
+	*is_eq = PASTEMAC(ch,opname,_unb_var1) \
 	( \
 	  conjx, \
 	  n, \
@@ -459,7 +459,7 @@ void PASTEMAC(ch,opname) \
 	/*if ( cntx == NULL ) cntx = bli_gks_query_cntx();*/ \
 \
 	/* Invoke the helper variant. */ \
-	*is_eq = PASTEMAC2(ch,opname,_unb_var1) \
+	*is_eq = PASTEMAC(ch,opname,_unb_var1) \
 	( \
 	  diagoffx, \
 	  diagx, \
diff --git a/frame/util/bli_util_tapi.h b/frame/util/bli_util_tapi.h
index 2ff83aaba..715b22a26 100644
--- a/frame/util/bli_util_tapi.h
+++ b/frame/util/bli_util_tapi.h
@@ -40,7 +40,7 @@
 #undef  GENTPROTR
 #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              dim_t    n, \
        const ctype*   x, inc_t incx, \
@@ -54,7 +54,7 @@ INSERT_GENTPROTR_BASIC( asumv )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
        uplo_t uploa, \
        dim_t  m, \
@@ -70,7 +70,7 @@ INSERT_GENTPROT_BASIC( mktrim )
 #undef  GENTPROTR
 #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              dim_t    n, \
        const ctype*   x, inc_t incx, \
@@ -86,7 +86,7 @@ INSERT_GENTPROTR_BASIC( normiv )
 #undef  GENTPROTR
 #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              doff_t   diagoffx, \
              diag_t   diagx, \
@@ -106,7 +106,7 @@ INSERT_GENTPROTR_BASIC( normim )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
        dim_t  n, \
        ctype* x, inc_t incx  \
@@ -120,7 +120,7 @@ INSERT_GENTPROT_BASIC( randnv )
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
        doff_t diagoffx, \
        uplo_t uplox, \
@@ -137,7 +137,7 @@ INSERT_GENTPROT_BASIC( randnm )
 #undef  GENTPROTR
 #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
 \
-BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \
      ( \
              dim_t    n, \
        const ctype*   x, inc_t incx, \
diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c
index 227d30c6f..b3767e6a8 100644
--- a/frame/util/bli_util_unb_var1.c
+++ b/frame/util/bli_util_unb_var1.c
@@ -66,7 +66,7 @@ void PASTEMAC(ch,varname) \
 		chi1 = x + (i  )*incx; \
 \
 		/* Get the real and imaginary components of chi1. */ \
-		PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
+		PASTEMAC(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
 \
 		/* Replace chi1_r and chi1_i with their absolute values. */ \
 		chi1_r = bli_fabs( chi1_r ); \
@@ -110,7 +110,7 @@ void PASTEMAC(ch,varname) \
 	/* We will be reflecting the stored region over the diagonal into the
 	   unstored region, so a transposition is necessary. Furthermore, since
 	   we are creating a Hermitian matrix, we must also conjugate. */ \
-	PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,copym,BLIS_TAPI_EX_SUF) \
 	( \
 	  diagoffa, \
 	  BLIS_NONUNIT_DIAG, \
@@ -125,7 +125,7 @@ void PASTEMAC(ch,varname) \
 	); \
 \
 	/* Set the imaginary parts of the diagonal elements to zero. */ \
-	PASTEMAC2(ch,setid,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,setid,BLIS_TAPI_EX_SUF) \
 	( \
 	  0, \
 	  m, \
@@ -164,7 +164,7 @@ void PASTEMAC(ch,varname) \
 \
 	/* We will be reflecting the stored region over the diagonal into the
 	   unstored region, so a transposition is necessary. */ \
-	PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,copym,BLIS_TAPI_EX_SUF) \
 	( \
 	  diagoffa, \
 	  BLIS_NONUNIT_DIAG, \
@@ -209,7 +209,7 @@ void PASTEMAC(ch,varname) \
 	else /*if ( bli_is_lower( uploa ) )*/ diagoffa = -1; \
 \
 	/* Set the unstored triangle to zero. */ \
-	PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
+	PASTEMAC(ch,setm,BLIS_TAPI_EX_SUF) \
 	( \
 	  BLIS_NO_CONJUGATE, \
 	  diagoffa, \
@@ -252,7 +252,7 @@ void PASTEMAC(ch,varname) \
 		chi1 = x + (i  )*incx; \
 \
 		/* Compute the absolute value (or complex magnitude) of chi1. */ \
-		PASTEMAC2(ch,chr,abval2s)( *chi1, abs_chi1 ); \
+		PASTEMAC(ch,chr,abval2s)( *chi1, abs_chi1 ); \
 \
 		/* Accumulate the absolute value of chi1 into absum. */ \
 		PASTEMAC(chr,adds)( abs_chi1, absum ); \
@@ -356,7 +356,7 @@ void PASTEMAC(ch,varname) \
 \
 		feclearexcept( FE_ALL_EXCEPT );\
 \
-		PASTEMAC2(ch,dotv,BLIS_TAPI_EX_SUF) \
+		PASTEMAC(ch,dotv,BLIS_TAPI_EX_SUF) \
 		( \
 		  BLIS_NO_CONJUGATE, \
 		  BLIS_NO_CONJUGATE, \
@@ -368,7 +368,7 @@ void PASTEMAC(ch,varname) \
 		  rntm  \
 		); \
 \
-		PASTEMAC2(ch,chr,copys)( sumsqc, sumsq ); \
+		PASTEMAC(ch,chr,copys)( sumsqc, sumsq ); \
 \
 		f_exp_raised = fetestexcept( FE_OVERFLOW | FE_INVALID );\
 \
@@ -468,7 +468,7 @@ void PASTEMAC(ch,varname) \
 		chi1 = x + (i  )*incx; \
 \
 		/* Compute the absolute value (or complex magnitude) of chi1. */ \
-		PASTEMAC2(ch,chr,abval2s)( *chi1, abs_chi1 ); \
+		PASTEMAC(ch,chr,abval2s)( *chi1, abs_chi1 ); \
 \
 		/* If the absolute value of the current element exceeds that of
 		   the previous largest, save it and its index. If NaN is
@@ -598,7 +598,7 @@ void PASTEMAC(ch,varname) \
 \
 				/* Handle the diagonal element separately in case it's
 				   unit. */ \
-				PASTEMAC2(ch,chr,abval2s)( *chi1, abval_chi1 ); \
+				PASTEMAC(ch,chr,abval2s)( *chi1, abval_chi1 ); \
 				PASTEMAC(chr,adds)( abval_chi1, absum_j ); \
 \
 				/* If absum_j is greater than the previous maximum value,
@@ -633,7 +633,7 @@ void PASTEMAC(ch,varname) \
 \
 				/* Handle the diagonal element separately in case it's
 				   unit. */ \
-				PASTEMAC2(ch,chr,abval2s)( *chi1, abval_chi1 ); \
+				PASTEMAC(ch,chr,abval2s)( *chi1, abval_chi1 ); \
 				PASTEMAC(chr,adds)( abval_chi1, absum_j ); \
 \
 				/* If absum_j is greater than the previous maximum value,
@@ -940,7 +940,7 @@ void PASTEMAC(ch,varname) \
 \
 			x1     = x + (j  )*ldx + (0  )*incx; \
 \
-			/*PASTEMAC2(ch,kername,BLIS_TAPI_EX_SUF)*/ \
+			/*PASTEMAC(ch,kername,BLIS_TAPI_EX_SUF)*/ \
 			PASTEMAC(ch,kername) \
 			( \
 			  n_elem, \
@@ -954,7 +954,7 @@ void PASTEMAC(ch,varname) \
 	{ \
 		max_m_n = bli_max( m, n ); \
 \
-		PASTEMAC2(d,ch,sets)( max_m_n, 0.0, omega ); \
+		PASTEMAC(d,ch,sets)( max_m_n, 0.0, omega ); \
 		PASTEMAC(ch,copys)( *one, beta ); \
 		PASTEMAC(ch,invscals)( omega, beta ); \
 \
@@ -968,7 +968,7 @@ void PASTEMAC(ch,varname) \
 				x0     = x1; \
 				chi1   = x1 + (n_elem-1)*incx; \
 \
-				/*PASTEMAC2(ch,kername,BLIS_TAPI_EX_SUF)*/ \
+				/*PASTEMAC(ch,kername,BLIS_TAPI_EX_SUF)*/ \
 				PASTEMAC(ch,kername) \
 				( \
 				  n_elem, \
@@ -1009,7 +1009,7 @@ void PASTEMAC(ch,varname) \
 				x2     = x1 + incx; \
 				chi1   = x1; \
 \
-				/*PASTEMAC2(ch,kername,BLIS_TAPI_EX_SUF)*/ \
+				/*PASTEMAC(ch,kername,BLIS_TAPI_EX_SUF)*/ \
 				PASTEMAC(ch,kername) \
 				( \
 				  n_elem, \
@@ -1083,7 +1083,7 @@ void PASTEMAC(ch,varname) \
 	for ( i = 0; i < n; ++i ) \
 	{ \
 		/* Get the real and imaginary components of chi1. */ \
-		PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
+		PASTEMAC(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
 \
 		abs_chi1_r = bli_fabs( chi1_r ); \
 		abs_chi1_i = bli_fabs( chi1_i ); \
diff --git a/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c b/kernels/armsve/1m/bli_dpackm_armsve256_int_8x10.c
similarity index 86%
rename from kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c
rename to kernels/armsve/1m/bli_dpackm_armsve256_int_8x10.c
index a6a288613..1665b539c 100644
--- a/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve256_int_8x10.c
@@ -42,23 +42,25 @@
 //   SVE vector length = 256 bits.
 //
 
-void bli_dpackm_armsve256_int_8xk
+void bli_dpackm_armsve256_int_8x10
      (
              conj_t  conja,
              pack_t  schema,
              dim_t   cdim_,
+             dim_t   cdim_max,
+             dim_t   cdim_bcast,
              dim_t   n_,
              dim_t   n_max_,
        const void*   kappa,
        const void*   a, inc_t inca_, inc_t lda_,
              void*   p,              inc_t ldp_,
+       const void*   params,
        const cntx_t* cntx
      )
 {
     const int64_t cdim  = cdim_;
-    const int64_t mnr   = 8;
+    const int64_t mr    = 8;
     const int64_t n     = n_;
-    const int64_t n_max = n_max_;
     const int64_t inca  = inca_;
     const int64_t lda   = lda_;
     const int64_t ldp   = ldp_;
@@ -75,7 +77,7 @@ void bli_dpackm_armsve256_int_8xk
     //   with each element as: 0, 1*inca, 2*inca, 3*inca
     z_index = svindex_u64( 0, inca * sizeof( double ) );
 
-    if ( cdim == mnr )
+    if ( cdim == mr && cdim_bcast == 1 )
     {
         if ( bli_deq1( *(( double* )kappa) ) )
         {
@@ -179,53 +181,25 @@ void bli_dpackm_armsve256_int_8xk
             }
         } // end of if ( *kappa == 1.0 )
     }
-    else // if ( cdim < mnr )
-    {
-        bli_dscal2m_ex
-        (
-          0,
-          BLIS_NONUNIT_DIAG,
-          BLIS_DENSE,
-          ( trans_t )conja,
-          cdim,
-          n,
-          kappa,
-          a, inca, lda,
-          p, 1,    ldp,
-          cntx,
-          NULL
-        );
-
-        // if ( cdim < mnr )
-        {
-            const dim_t      i      = cdim;
-            const dim_t      m_edge = mnr - i;
-            const dim_t      n_edge = n_max;
-            double* restrict p_edge = ( double* )p + (i  )*1;
-
-            bli_dset0s_mxn
-            (
-              m_edge,
-              n_edge,
-              p_edge, 1, ldp
-            );
-        }
-    }
-
-    if ( n < n_max )
-    {
-        const dim_t      j      = n;
-        const dim_t      m_edge = mnr;
-        const dim_t      n_edge = n_max - j;
-        double* restrict p_edge = ( double* )p + (j  )*ldp;
-
-        bli_dset0s_mxn
-        (
-          m_edge,
-          n_edge,
-          p_edge, 1, ldp
-        );
-    }
+	else
+	{
+		bli_dscal2bbs_mxn
+		(
+		  conja,
+		  cdim_,
+		  n_,
+		  kappa,
+		  a,       inca, lda,
+		  p, cdim_bcast, ldp
+		);
+	}
+
+	bli_dset0s_edge
+	(
+	  cdim_*cdim_bcast, cdim_max*cdim_bcast,
+	  n_, n_max_,
+	  p, ldp
+	);
 }
 
 #endif // __has_include(<arm_sve.h>)
diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
deleted file mode 100644
index 61bd7734a..000000000
--- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
+++ /dev/null
@@ -1,365 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2021, The University of Tokyo
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include "armsve512_asm_transpose_d8x8.h"
-#include "armsve512_asm_transpose_d8x2.h"
-#include "../3/armsve_asm_macros.h"
-
-// assumption:
-//   SVE vector length = 512 bits.
-
-void bli_dpackm_armsve512_asm_10xk
-     (
-             conj_t  conja,
-             pack_t  schema,
-             dim_t   cdim_,
-             dim_t   n_,
-             dim_t   n_max_,
-       const void*   kappa,
-       const void*   a, inc_t inca_, inc_t lda_,
-             void*   p,              inc_t ldp_,
-       const cntx_t* cntx
-     )
-{
-    const int64_t cdim  = cdim_;
-    const int64_t mnr   = 10;
-    const int64_t n     = n_;
-    const int64_t n_max = n_max_;
-    const int64_t inca  = inca_;
-    const int64_t lda   = lda_;
-    const int64_t ldp   = ldp_;
-    const bool    gs    = inca != 1 && lda != 1;
-    const bool    unitk = bli_deq1( *(( double* )kappa) );
-
-#ifdef _A64FX
-    {
-        // Infer whether A or B is being packed.
-        if ( schema == BLIS_PACKED_ROWS )
-            p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
-        if ( schema == BLIS_PACKED_COLUMNS )
-            p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
-    }
-#endif
-
-    if ( cdim == mnr && !gs && unitk )
-    {
-        uint64_t n_mker = n / 8;
-        uint64_t n_left = n % 8;
-        __asm__ volatile (
-            "mov  x0, %[a] \n\t"
-            "mov  x1, %[p] \n\t"
-            "mov  x2, %[ldp] \n\t"
-            "mov  x3, %[lda] \n\t"
-            "mov  x4, %[inca] \n\t"
-            "cmp  x4, #1 \n\t"
-            // Skips by sizeof(double).
-            "mov  x8, #8 \n\t"
-            "madd x2, x2, x8, xzr \n\t"
-            "madd x3, x3, x8, xzr \n\t"
-            "madd x4, x4, x8, xzr \n\t"
-            // Loop constants.
-            "mov  x8, %[n_mker] \n\t"
-            "mov  x9, %[n_left] \n\t"
-            "ptrue p0.d \n\t"
-            BNE(AROWSTOR)
-            // A stored in columns.
-            LABEL(ACOLSTOR)
-            // Prefetch distance.
-            "mov  x17, #8 \n\t"
-            "madd x17, x17, x3, xzr \n\t"
-#ifdef _A64FX
-            // Disable hardware prefetch for A.
-            "mov  x16, 0x6 \n\t"
-            "lsl  x16, x16, #60 \n\t"
-            "orr  x0, x0, x16 \n\t"
-#endif
-            LABEL(ACOLSTORMKER)
-            "cmp  x8, xzr \n\t"
-            BEQ(ACOLSTORMKEREND)
-            "add  x5, x0, x3 \n\t"
-            "add  x6, x5, x3 \n\t"
-            "add  x7, x6, x3 \n\t"
-            "ld1d z0.d, p0/z, [x0] \n\t"
-            "ldr  q1, [x0, #64] \n\t"
-            "ld1d z2.d, p0/z, [x5] \n\t"
-            "ldr  q3, [x5, #64] \n\t"
-            "ld1d z4.d, p0/z, [x6] \n\t"
-            "ldr  q5, [x6, #64] \n\t"
-            "ld1d z6.d, p0/z, [x7] \n\t"
-            "ldr  q7, [x7, #64] \n\t"
-            "add  x18, x17, x0 \n\t"
-            "prfm PLDL1STRM, [x18] \n\t"
-            "add  x18, x17, x5 \n\t"
-            "prfm PLDL1STRM, [x18] \n\t"
-            "add  x18, x17, x6 \n\t"
-            "prfm PLDL1STRM, [x18] \n\t"
-            "add  x18, x17, x7 \n\t"
-            "prfm PLDL1STRM, [x18] \n\t"
-            "add  x0, x7, x3 \n\t"
-            "add  x5, x0, x3 \n\t"
-            "add  x6, x5, x3 \n\t"
-            "add  x7, x6, x3 \n\t"
-            "ld1d z8.d, p0/z, [x0] \n\t"
-            "ldr  q9, [x0, #64] \n\t"
-            "ld1d z10.d, p0/z, [x5] \n\t"
-            "ldr  q11, [x5, #64] \n\t"
-            "ld1d z12.d, p0/z, [x6] \n\t"
-            "ldr  q13, [x6, #64] \n\t"
-            "ld1d z14.d, p0/z, [x7] \n\t"
-            "ldr  q15, [x7, #64] \n\t"
-            "add  x18, x17, x0 \n\t"
-            "prfm PLDL1STRM, [x18] \n\t"
-            "add  x18, x17, x5 \n\t"
-            "prfm PLDL1STRM, [x18] \n\t"
-            "add  x18, x17, x6 \n\t"
-            "prfm PLDL1STRM, [x18] \n\t"
-            "add  x18, x17, x7 \n\t"
-            "prfm PLDL1STRM, [x18] \n\t"
-            // Plain storage
-            "add  x10, x1, x2 \n\t"
-            "add  x11, x10, x2 \n\t"
-            "add  x12, x11, x2 \n\t"
-            "add  x13, x12, x2 \n\t"
-            "add  x14, x13, x2 \n\t"
-            "add  x15, x14, x2 \n\t"
-            "add  x16, x15, x2 \n\t"
-            "st1d z0.d, p0, [x1] \n\t"
-            "str  q1, [x1, #64] \n\t"
-            "st1d z2.d, p0, [x10] \n\t"
-            "str  q3, [x10, #64] \n\t"
-            "st1d z4.d, p0, [x11] \n\t"
-            "str  q5, [x11, #64] \n\t"
-            "st1d z6.d, p0, [x12] \n\t"
-            "str  q7, [x12, #64] \n\t"
-            "st1d z8.d, p0, [x13] \n\t"
-            "str  q9, [x13, #64] \n\t"
-            "st1d z10.d, p0, [x14] \n\t"
-            "str  q11, [x14, #64] \n\t"
-            "st1d z12.d, p0, [x15] \n\t"
-            "str  q13, [x15, #64] \n\t"
-            "st1d z14.d, p0, [x16] \n\t"
-            "str  q15, [x16, #64] \n\t"
-            "add  x1, x16, x2 \n\t"
-            // Realign and store.
-            // "ext  z1.b, z1.b, z1.b, #16 \n\t"
-            // "ext  z1.b, z1.b, z2.b, #48 \n\t"
-            // "ext  z2.b, z2.b, z3.b, #16 \n\t"
-            // "ext  z2.b, z2.b, z4.b, #32 \n\t"
-            // "ext  z4.b, z4.b, z5.b, #16 \n\t"
-            // "ext  z4.b, z4.b, z6.b, #16 \n\t"
-            // "ext  z6.b, z6.b, z7.b, #16 \n\t"
-            // "ext  z9.b, z9.b, z9.b, #16 \n\t"
-            // "ext  z9.b, z9.b, z10.b, #48 \n\t"
-            // "ext  z10.b, z10.b, z11.b, #16 \n\t"
-            // "ext  z10.b, z10.b, z12.b, #32 \n\t"
-            // "ext  z12.b, z12.b, z13.b, #16 \n\t"
-            // "ext  z12.b, z12.b, z14.b, #16 \n\t"
-            // "ext  z14.b, z14.b, z15.b, #16 \n\t"
-            // "st1d z0.d, p0, [x1] \n\t"
-            // "st1d z1.d, p0, [x1, #1, mul vl] \n\t"
-            // "st1d z2.d, p0, [x1, #2, mul vl] \n\t"
-            // "st1d z4.d, p0, [x1, #3, mul vl] \n\t"
-            // "st1d z6.d, p0, [x1, #4, mul vl] \n\t"
-            // "add  x1, x1, #320 \n\t"
-            // "st1d z8.d, p0, [x1] \n\t"
-            // "st1d z9.d, p0, [x1, #1, mul vl] \n\t"
-            // "st1d z10.d, p0, [x1, #2, mul vl] \n\t"
-            // "st1d z12.d, p0, [x1, #3, mul vl] \n\t"
-            // "st1d z14.d, p0, [x1, #4, mul vl] \n\t"
-            // "add  x1, x1, #320 \n\t"
-            "add  x0, x7, x3 \n\t"
-            "sub  x8, x8, #1 \n\t"
-            BRANCH(ACOLSTORMKER)
-            LABEL(ACOLSTORMKEREND)
-            LABEL(ACOLSTORLEFT)
-            "cmp  x9, xzr \n\t"
-            BEQ(UNITKDONE)
-            "ld1d z0.d, p0/z, [x0] \n\t"
-            "ldr  q1, [x0, #64] \n\t"
-            "st1d z0.d, p0, [x1] \n\t"
-            "str  q1, [x1, #64] \n\t"
-            "add  x0, x0, x3 \n\t"
-            "add  x1, x1, x2 \n\t"
-            "sub  x9, x9, #1 \n\t"
-            BRANCH(ACOLSTORLEFT)
-            // A stored in rows.
-            LABEL(AROWSTOR)
-            // Prepare predicates for in-reg transpose.
-            SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
-            LABEL(AROWSTORMKER) // X[10-16] for A here not P. Be careful.
-            "cmp  x8, xzr \n\t"
-            BEQ(AROWSTORMKEREND)
-            "add  x10, x0, x4 \n\t"
-            "add  x11, x10, x4 \n\t"
-            "add  x12, x11, x4 \n\t"
-            "add  x13, x12, x4 \n\t"
-            "add  x14, x13, x4 \n\t"
-            "add  x15, x14, x4 \n\t"
-            "add  x16, x15, x4 \n\t"
-            "add  x17, x16, x4 \n\t"
-            "add  x18, x17, x4 \n\t"
-            "ld1d z0.d, p0/z, [x0] \n\t"
-            "ld1d z1.d, p0/z, [x10] \n\t"
-            "ld1d z2.d, p0/z, [x11] \n\t"
-            "ld1d z3.d, p0/z, [x12] \n\t"
-            "ld1d z4.d, p0/z, [x13] \n\t"
-            "ld1d z5.d, p0/z, [x14] \n\t"
-            "ld1d z6.d, p0/z, [x15] \n\t"
-            "ld1d z7.d, p0/z, [x16] \n\t"
-            "ld1d z22.d, p0/z, [x17] \n\t"
-            "ld1d z23.d, p0/z, [x18] \n\t"
-            // Transpose first 8 rows.
-            SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6)
-            // Transpose last 2 rows.
-            SVE512_IN_REG_TRANSPOSE_d8x2(z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3)
-            // Plain storage.
-            "add  x10, x1, x2 \n\t"
-            "add  x11, x10, x2 \n\t"
-            "add  x12, x11, x2 \n\t"
-            "add  x13, x12, x2 \n\t"
-            "add  x14, x13, x2 \n\t"
-            "add  x15, x14, x2 \n\t"
-            "add  x16, x15, x2 \n\t"
-            "st1d z8.d, p0, [x1] \n\t"
-            "str  q16, [x1, #64] \n\t"
-            "st1d z9.d, p0, [x10] \n\t"
-            "str  q17, [x10, #64] \n\t"
-            "st1d z10.d, p0, [x11] \n\t"
-            "str  q18, [x11, #64] \n\t"
-            "st1d z11.d, p0, [x12] \n\t"
-            "str  q19, [x12, #64] \n\t"
-            "st1d z12.d, p0, [x13] \n\t"
-            "str  q20, [x13, #64] \n\t"
-            "st1d z13.d, p0, [x14] \n\t"
-            "str  q21, [x14, #64] \n\t"
-            "st1d z14.d, p0, [x15] \n\t"
-            "str  q22, [x15, #64] \n\t"
-            "st1d z15.d, p0, [x16] \n\t"
-            "str  q23, [x16, #64] \n\t"
-            "add  x1, x16, x2 \n\t"
-            "add  x0, x0, #64 \n\t"
-            "sub  x8, x8, #1 \n\t"
-            BRANCH(AROWSTORMKER)
-            LABEL(AROWSTORMKEREND)
-            "mov  x4, %[inca] \n\t" // Restore unshifted inca.
-            "index z30.d, xzr, x4 \n\t" // Generate index.
-            "lsl  x4, x4, #3 \n\t" // Shift again.
-            "lsl  x5, x4, #3 \n\t" // Virtual column vl.
-            LABEL(AROWSTORLEFT)
-            "cmp  x9, xzr \n\t"
-            BEQ(UNITKDONE)
-            "add  x6, x0, x5 \n\t"
-            "add  x7, x6, x4 \n\t"
-            "ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
-            "ldr  d1, [x6] \n\t"
-            "ldr  d2, [x7] \n\t"
-            "trn1 v1.2d, v1.2d, v2.2d \n\t"
-            "st1d z0.d, p0, [x1] \n\t"
-            "str  q1, [x1, #64] \n\t"
-            "add  x1, x1, x2 \n\t"
-            "add  x0, x0, #8 \n\t"
-            "sub  x9, x9, #1 \n\t"
-            BRANCH(AROWSTORLEFT)
-            LABEL(UNITKDONE)
-            "mov  x0, #0 \n\t"
-            :
-            : [a]      "r" (a),
-              [p]      "r" (p),
-              [lda]    "r" (lda),
-              [ldp]    "r" (ldp),
-              [inca]   "r" (inca),
-              [n_mker] "r" (n_mker),
-              [n_left] "r" (n_left)
-            : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
-              "x8", "x9", "x10","x11","x12","x13","x14","x15",
-              "x16","x17","x18",
-              "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
-              "z8", "z9", "z10","z11","z12","z13","z14","z15",
-              "z16","z17","z18","z19","z20","z21","z22","z23",
-              // "z24","z25","z26","z27","z28","z29",
-              "z30","z31",
-              "p0", "p1", "p2", "p3", "p4", // "p5",
-              "p6", "p7", "p8"
-            );
-    }
-    else // if ( cdim < mnr )
-    {
-        bli_dscal2m_ex
-        (
-          0,
-          BLIS_NONUNIT_DIAG,
-          BLIS_DENSE,
-          ( trans_t )conja,
-          cdim,
-          n,
-          kappa,
-          a, inca, lda,
-          p, 1,    ldp,
-          cntx,
-          NULL
-        );
-
-        // if ( cdim < mnr )
-        {
-            const dim_t      i      = cdim;
-            const dim_t      m_edge = mnr - i;
-            const dim_t      n_edge = n_max;
-            double* restrict p_edge = ( double* )p + (i  )*1;
-
-            bli_dset0s_mxn
-            (
-              m_edge,
-              n_edge,
-              p_edge, 1, ldp
-            );
-        }
-    }
-
-    if ( n < n_max )
-    {
-        const dim_t      j      = n;
-        const dim_t      m_edge = mnr;
-        const dim_t      n_edge = n_max - j;
-        double* restrict p_edge = ( double* )p + (j  )*ldp;
-
-        bli_dset0s_mxn
-        (
-          m_edge,
-          n_edge,
-          p_edge, 1, ldp
-        );
-    }
-}
diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16x10.c
similarity index 52%
rename from kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
rename to kernels/armsve/1m/bli_dpackm_armsve512_asm_16x10.c
index b637f8c80..5981f392a 100644
--- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16x10.c
@@ -34,27 +34,32 @@
 */
 
 #include "blis.h"
+#include "armsve512_asm_transpose_d8x2.h"
 #include "armsve512_asm_transpose_d8x8.h"
 #include "../3/armsve_asm_macros.h"
 
 // assumption:
 //   SVE vector length = 512 bits.
 
-void bli_dpackm_armsve512_asm_16xk
+void bli_dpackm_armsve512_asm_16x10
      (
              conj_t  conja,
              pack_t  schema,
              dim_t   cdim_,
+             dim_t   cdim_max,
+             dim_t   cdim_bcast,
              dim_t   n_,
              dim_t   n_max_,
        const void*   kappa,
        const void*   a, inc_t inca_, inc_t lda_,
              void*   p,              inc_t ldp_,
+       const void*   params,
        const cntx_t* cntx
      )
 {
     const int64_t cdim  = cdim_;
-    const int64_t mnr   = 16;
+    const int64_t mr    = 16;
+    const int64_t nr    = 10;
     const int64_t n     = n_;
     const int64_t n_max = n_max_;
     const int64_t inca  = inca_;
@@ -63,6 +68,10 @@ void bli_dpackm_armsve512_asm_16xk
     const bool    gs    = inca != 1 && lda != 1;
     const bool    unitk = bli_deq1( *(( double* )kappa) );
 
+// The A64FX block below never would have worked in the first place, since GEMM
+// packing used BLIS_PACKED_ROW_PANELS and BLIS_PACKED_COL_PANELS; with the
+// removal of the row/column packing bit it can't work via the schema anyway.
+#if 0
 #ifdef _A64FX
     {
         // Infer whether A or B is being packed.
@@ -71,9 +80,10 @@ void bli_dpackm_armsve512_asm_16xk
         if ( schema == BLIS_PACKED_COLUMNS )
             p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
     }
+#endif
 #endif
 
-    if ( cdim == mnr && !gs && unitk )
+    if ( cdim == mr && cdim_bcast == 1 && !gs && unitk )
     {
         uint64_t n_mker = n / 8;
         uint64_t n_left = n % 8;
@@ -100,9 +110,9 @@ void bli_dpackm_armsve512_asm_16xk
             "mov  x8, %[n_mker] \n\t"
             "mov  x9, %[n_left] \n\t"
             "ptrue p0.d \n\t"
-            BNE(AROWSTOR)
+            BNE(MAROWSTOR)
             // A stored in columns.
-            LABEL(ACOLSTOR)
+            LABEL(MACOLSTOR)
             // Prefetch distance.
             "mov  x17, #8 \n\t"
             "madd x17, x17, x3, xzr \n\t"
@@ -126,9 +136,9 @@ void bli_dpackm_armsve512_asm_16xk
             // "prfm PLDL1STRM, [x5] \n\t"
             // "prfm PLDL1STRM, [x6] \n\t"
             // "prfm PLDL1STRM, [x7] \n\t"
-            LABEL(ACOLSTORMKER)
+            LABEL(MACOLSTORMKER)
             "cmp  x8, xzr \n\t"
-            BEQ(ACOLSTORMKEREND)
+            BEQ(MACOLSTORMKEREND)
             "add  x5, x0, x3 \n\t"
             "add  x6, x5, x3 \n\t"
             "add  x7, x6, x3 \n\t"
@@ -194,11 +204,11 @@ void bli_dpackm_armsve512_asm_16xk
             "add  x0, x7, x3 \n\t"
             "add  x1, x16, x2 \n\t"
             "sub  x8, x8, #1 \n\t"
-            BRANCH(ACOLSTORMKER)
-            LABEL(ACOLSTORMKEREND)
-            LABEL(ACOLSTORLEFT)
+            BRANCH(MACOLSTORMKER)
+            LABEL(MACOLSTORMKEREND)
+            LABEL(MACOLSTORLEFT)
             "cmp  x9, xzr \n\t"
-            BEQ(UNITKDONE)
+            BEQ(MUNITKDONE)
             "ld1d z0.d, p0/z, [x0] \n\t"
             "ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t"
             "st1d z0.d, p0, [x1] \n\t"
@@ -206,14 +216,14 @@ void bli_dpackm_armsve512_asm_16xk
             "add  x0, x0, x3 \n\t"
             "add  x1, x1, x2 \n\t"
             "sub  x9, x9, #1 \n\t"
-            BRANCH(ACOLSTORLEFT)
+            BRANCH(MACOLSTORLEFT)
             // A stored in rows.
-            LABEL(AROWSTOR)
+            LABEL(MAROWSTOR)
             // Prepare predicates for in-reg transpose.
             SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
-            LABEL(AROWSTORMKER) // X[10-16] for A here not P. Be careful.
+            LABEL(MAROWSTORMKER) // X[10-16] for A here not P. Be careful.
             "cmp  x8, xzr \n\t"
-            BEQ(AROWSTORMKEREND)
+            BEQ(MAROWSTORMKEREND)
             "add  x10, x0, x4 \n\t"
             "add  x11, x10, x4 \n\t"
             "add  x12, x11, x4 \n\t"
@@ -275,15 +285,15 @@ void bli_dpackm_armsve512_asm_16xk
             "add  x0, x0, #64 \n\t"
             "add  x1, x16, x2 \n\t"
             "sub  x8, x8, #1 \n\t"
-            BRANCH(AROWSTORMKER)
-            LABEL(AROWSTORMKEREND)
+            BRANCH(MAROWSTORMKER)
+            LABEL(MAROWSTORMKEREND)
             "mov  x4, %[inca] \n\t" // Restore unshifted inca.
             "index z30.d, xzr, x4 \n\t" // Generate index.
             "lsl  x4, x4, #3 \n\t" // Shift again.
             "lsl  x5, x4, #3 \n\t" // Virtual column vl.
-            LABEL(AROWSTORLEFT)
+            LABEL(MAROWSTORLEFT)
             "cmp  x9, xzr \n\t"
-            BEQ(UNITKDONE)
+            BEQ(MUNITKDONE)
             "add  x6, x0, x5 \n\t"
             "ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
             "ld1d z1.d, p0/z, [x6, z30.d, lsl #3] \n\t"
@@ -292,8 +302,8 @@ void bli_dpackm_armsve512_asm_16xk
             "add  x1, x1, x2 \n\t"
             "add  x0, x0, #8 \n\t"
             "sub  x9, x9, #1 \n\t"
-            BRANCH(AROWSTORLEFT)
-            LABEL(UNITKDONE)
+            BRANCH(MAROWSTORLEFT)
+            LABEL(MUNITKDONE)
             "mov  x0, #0 \n\t"
             :
             : [a]      "r" (a),
@@ -313,51 +323,264 @@ void bli_dpackm_armsve512_asm_16xk
               "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7"
             );
     }
-    else // if ( cdim < mnr )
+    else if ( cdim == nr && cdim_bcast == 1 && !gs && unitk )
     {
-        bli_dscal2m_ex
-        (
-          0,
-          BLIS_NONUNIT_DIAG,
-          BLIS_DENSE,
-          ( trans_t )conja,
-          cdim,
-          n,
-          kappa,
-          a, inca, lda,
-          p, 1,    ldp,
-          cntx,
-          NULL
-        );
-
-        // if ( cdim < mnr )
-        {
-            const dim_t      i      = cdim;
-            const dim_t      m_edge = mnr - i;
-            const dim_t      n_edge = n_max;
-            double* restrict p_edge = ( double* )p + (i  )*1;
-
-            bli_dset0s_mxn
-            (
-              m_edge,
-              n_edge,
-              p_edge, 1, ldp
+        uint64_t n_mker = n / 8;
+        uint64_t n_left = n % 8;
+        __asm__ volatile (
+            "mov  x0, %[a] \n\t"
+            "mov  x1, %[p] \n\t"
+            "mov  x2, %[ldp] \n\t"
+            "mov  x3, %[lda] \n\t"
+            "mov  x4, %[inca] \n\t"
+            "cmp  x4, #1 \n\t"
+            // Skips by sizeof(double).
+            "mov  x8, #8 \n\t"
+            "madd x2, x2, x8, xzr \n\t"
+            "madd x3, x3, x8, xzr \n\t"
+            "madd x4, x4, x8, xzr \n\t"
+            // Loop constants.
+            "mov  x8, %[n_mker] \n\t"
+            "mov  x9, %[n_left] \n\t"
+            "ptrue p0.d \n\t"
+            BNE(NAROWSTOR)
+            // A stored in columns.
+            LABEL(NACOLSTOR)
+            // Prefetch distance.
+            "mov  x17, #8 \n\t"
+            "madd x17, x17, x3, xzr \n\t"
+#ifdef _A64FX
+            // Disable hardware prefetch for A.
+            "mov  x16, 0x6 \n\t"
+            "lsl  x16, x16, #60 \n\t"
+            "orr  x0, x0, x16 \n\t"
+#endif
+            LABEL(NACOLSTORMKER)
+            "cmp  x8, xzr \n\t"
+            BEQ(NACOLSTORMKEREND)
+            "add  x5, x0, x3 \n\t"
+            "add  x6, x5, x3 \n\t"
+            "add  x7, x6, x3 \n\t"
+            "ld1d z0.d, p0/z, [x0] \n\t"
+            "ldr  q1, [x0, #64] \n\t"
+            "ld1d z2.d, p0/z, [x5] \n\t"
+            "ldr  q3, [x5, #64] \n\t"
+            "ld1d z4.d, p0/z, [x6] \n\t"
+            "ldr  q5, [x6, #64] \n\t"
+            "ld1d z6.d, p0/z, [x7] \n\t"
+            "ldr  q7, [x7, #64] \n\t"
+            "add  x18, x17, x0 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x5 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x6 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x7 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x0, x7, x3 \n\t"
+            "add  x5, x0, x3 \n\t"
+            "add  x6, x5, x3 \n\t"
+            "add  x7, x6, x3 \n\t"
+            "ld1d z8.d, p0/z, [x0] \n\t"
+            "ldr  q9, [x0, #64] \n\t"
+            "ld1d z10.d, p0/z, [x5] \n\t"
+            "ldr  q11, [x5, #64] \n\t"
+            "ld1d z12.d, p0/z, [x6] \n\t"
+            "ldr  q13, [x6, #64] \n\t"
+            "ld1d z14.d, p0/z, [x7] \n\t"
+            "ldr  q15, [x7, #64] \n\t"
+            "add  x18, x17, x0 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x5 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x6 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x7 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            // Plain storage
+            "add  x10, x1, x2 \n\t"
+            "add  x11, x10, x2 \n\t"
+            "add  x12, x11, x2 \n\t"
+            "add  x13, x12, x2 \n\t"
+            "add  x14, x13, x2 \n\t"
+            "add  x15, x14, x2 \n\t"
+            "add  x16, x15, x2 \n\t"
+            "st1d z0.d, p0, [x1] \n\t"
+            "str  q1, [x1, #64] \n\t"
+            "st1d z2.d, p0, [x10] \n\t"
+            "str  q3, [x10, #64] \n\t"
+            "st1d z4.d, p0, [x11] \n\t"
+            "str  q5, [x11, #64] \n\t"
+            "st1d z6.d, p0, [x12] \n\t"
+            "str  q7, [x12, #64] \n\t"
+            "st1d z8.d, p0, [x13] \n\t"
+            "str  q9, [x13, #64] \n\t"
+            "st1d z10.d, p0, [x14] \n\t"
+            "str  q11, [x14, #64] \n\t"
+            "st1d z12.d, p0, [x15] \n\t"
+            "str  q13, [x15, #64] \n\t"
+            "st1d z14.d, p0, [x16] \n\t"
+            "str  q15, [x16, #64] \n\t"
+            "add  x1, x16, x2 \n\t"
+            // Realign and store.
+            // "ext  z1.b, z1.b, z1.b, #16 \n\t"
+            // "ext  z1.b, z1.b, z2.b, #48 \n\t"
+            // "ext  z2.b, z2.b, z3.b, #16 \n\t"
+            // "ext  z2.b, z2.b, z4.b, #32 \n\t"
+            // "ext  z4.b, z4.b, z5.b, #16 \n\t"
+            // "ext  z4.b, z4.b, z6.b, #16 \n\t"
+            // "ext  z6.b, z6.b, z7.b, #16 \n\t"
+            // "ext  z9.b, z9.b, z9.b, #16 \n\t"
+            // "ext  z9.b, z9.b, z10.b, #48 \n\t"
+            // "ext  z10.b, z10.b, z11.b, #16 \n\t"
+            // "ext  z10.b, z10.b, z12.b, #32 \n\t"
+            // "ext  z12.b, z12.b, z13.b, #16 \n\t"
+            // "ext  z12.b, z12.b, z14.b, #16 \n\t"
+            // "ext  z14.b, z14.b, z15.b, #16 \n\t"
+            // "st1d z0.d, p0, [x1] \n\t"
+            // "st1d z1.d, p0, [x1, #1, mul vl] \n\t"
+            // "st1d z2.d, p0, [x1, #2, mul vl] \n\t"
+            // "st1d z4.d, p0, [x1, #3, mul vl] \n\t"
+            // "st1d z6.d, p0, [x1, #4, mul vl] \n\t"
+            // "add  x1, x1, #320 \n\t"
+            // "st1d z8.d, p0, [x1] \n\t"
+            // "st1d z9.d, p0, [x1, #1, mul vl] \n\t"
+            // "st1d z10.d, p0, [x1, #2, mul vl] \n\t"
+            // "st1d z12.d, p0, [x1, #3, mul vl] \n\t"
+            // "st1d z14.d, p0, [x1, #4, mul vl] \n\t"
+            // "add  x1, x1, #320 \n\t"
+            "add  x0, x7, x3 \n\t"
+            "sub  x8, x8, #1 \n\t"
+            BRANCH(NACOLSTORMKER)
+            LABEL(NACOLSTORMKEREND)
+            LABEL(NACOLSTORLEFT)
+            "cmp  x9, xzr \n\t"
+            BEQ(NUNITKDONE)
+            "ld1d z0.d, p0/z, [x0] \n\t"
+            "ldr  q1, [x0, #64] \n\t"
+            "st1d z0.d, p0, [x1] \n\t"
+            "str  q1, [x1, #64] \n\t"
+            "add  x0, x0, x3 \n\t"
+            "add  x1, x1, x2 \n\t"
+            "sub  x9, x9, #1 \n\t"
+            BRANCH(NACOLSTORLEFT)
+            // A stored in rows.
+            LABEL(NAROWSTOR)
+            // Prepare predicates for in-reg transpose.
+            SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
+            LABEL(NAROWSTORMKER) // X[10-18] are for A here, not P. Be careful.
+            "cmp  x8, xzr \n\t"
+            BEQ(NAROWSTORMKEREND)
+            "add  x10, x0, x4 \n\t"
+            "add  x11, x10, x4 \n\t"
+            "add  x12, x11, x4 \n\t"
+            "add  x13, x12, x4 \n\t"
+            "add  x14, x13, x4 \n\t"
+            "add  x15, x14, x4 \n\t"
+            "add  x16, x15, x4 \n\t"
+            "add  x17, x16, x4 \n\t"
+            "add  x18, x17, x4 \n\t"
+            "ld1d z0.d, p0/z, [x0] \n\t"
+            "ld1d z1.d, p0/z, [x10] \n\t"
+            "ld1d z2.d, p0/z, [x11] \n\t"
+            "ld1d z3.d, p0/z, [x12] \n\t"
+            "ld1d z4.d, p0/z, [x13] \n\t"
+            "ld1d z5.d, p0/z, [x14] \n\t"
+            "ld1d z6.d, p0/z, [x15] \n\t"
+            "ld1d z7.d, p0/z, [x16] \n\t"
+            "ld1d z22.d, p0/z, [x17] \n\t"
+            "ld1d z23.d, p0/z, [x18] \n\t"
+            // Transpose first 8 rows.
+            SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6)
+            // Transpose last 2 rows.
+            SVE512_IN_REG_TRANSPOSE_d8x2(z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3)
+            // Plain storage.
+            "add  x10, x1, x2 \n\t"
+            "add  x11, x10, x2 \n\t"
+            "add  x12, x11, x2 \n\t"
+            "add  x13, x12, x2 \n\t"
+            "add  x14, x13, x2 \n\t"
+            "add  x15, x14, x2 \n\t"
+            "add  x16, x15, x2 \n\t"
+            "st1d z8.d, p0, [x1] \n\t"
+            "str  q16, [x1, #64] \n\t"
+            "st1d z9.d, p0, [x10] \n\t"
+            "str  q17, [x10, #64] \n\t"
+            "st1d z10.d, p0, [x11] \n\t"
+            "str  q18, [x11, #64] \n\t"
+            "st1d z11.d, p0, [x12] \n\t"
+            "str  q19, [x12, #64] \n\t"
+            "st1d z12.d, p0, [x13] \n\t"
+            "str  q20, [x13, #64] \n\t"
+            "st1d z13.d, p0, [x14] \n\t"
+            "str  q21, [x14, #64] \n\t"
+            "st1d z14.d, p0, [x15] \n\t"
+            "str  q22, [x15, #64] \n\t"
+            "st1d z15.d, p0, [x16] \n\t"
+            "str  q23, [x16, #64] \n\t"
+            "add  x1, x16, x2 \n\t"
+            "add  x0, x0, #64 \n\t"
+            "sub  x8, x8, #1 \n\t"
+            BRANCH(NAROWSTORMKER)
+            LABEL(NAROWSTORMKEREND)
+            "mov  x4, %[inca] \n\t" // Restore unshifted inca.
+            "index z30.d, xzr, x4 \n\t" // Generate index.
+            "lsl  x4, x4, #3 \n\t" // Shift again.
+            "lsl  x5, x4, #3 \n\t" // Virtual column vl.
+            LABEL(NAROWSTORLEFT)
+            "cmp  x9, xzr \n\t"
+            BEQ(NUNITKDONE)
+            "add  x6, x0, x5 \n\t"
+            "add  x7, x6, x4 \n\t"
+            "ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
+            "ldr  d1, [x6] \n\t"
+            "ldr  d2, [x7] \n\t"
+            "trn1 v1.2d, v1.2d, v2.2d \n\t"
+            "st1d z0.d, p0, [x1] \n\t"
+            "str  q1, [x1, #64] \n\t"
+            "add  x1, x1, x2 \n\t"
+            "add  x0, x0, #8 \n\t"
+            "sub  x9, x9, #1 \n\t"
+            BRANCH(NAROWSTORLEFT)
+            LABEL(NUNITKDONE)
+            "mov  x0, #0 \n\t"
+            :
+            : [a]      "r" (a),
+              [p]      "r" (p),
+              [lda]    "r" (lda),
+              [ldp]    "r" (ldp),
+              [inca]   "r" (inca),
+              [n_mker] "r" (n_mker),
+              [n_left] "r" (n_left)
+            : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+              "x8", "x9", "x10","x11","x12","x13","x14","x15",
+              "x16","x17","x18",
+              "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
+              "z8", "z9", "z10","z11","z12","z13","z14","z15",
+              "z16","z17","z18","z19","z20","z21","z22","z23",
+              // "z24","z25","z26","z27","z28","z29",
+              "z30","z31",
+              "p0", "p1", "p2", "p3", "p4", // "p5",
+              "p6", "p7", "p8"
             );
-        }
     }
+	else
+	{
+		bli_dscal2bbs_mxn
+		(
+		  conja,
+		  cdim_,
+		  n_,
+		  kappa,
+		  a,       inca, lda,
+		  p, cdim_bcast, ldp
+		);
+	}
 
-    if ( n < n_max )
-    {
-        const dim_t      j      = n;
-        const dim_t      m_edge = mnr;
-        const dim_t      n_edge = n_max - j;
-        double* restrict p_edge = ( double* )p + (j  )*ldp;
-
-        bli_dset0s_mxn
-        (
-          m_edge,
-          n_edge,
-          p_edge, 1, ldp
-        );
-    }
+	bli_dset0s_edge
+	(
+	  cdim_*cdim_bcast, cdim_max*cdim_bcast,
+	  n_, n_max_,
+	  p, ldp
+	);
 }
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
index 7219d19c4..bdbd2cdea 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c
@@ -53,7 +53,7 @@ void bli_cgemm_armsve_asm_2vx10_unindexed
        const void*      b, \
        const void*      beta, \
              void*      c, inc_t rs_c0, inc_t cs_c0, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      )
 {
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
index 505d3b4b7..67727d9d2 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
@@ -53,7 +53,7 @@ void bli_dgemm_armsve_asm_2vx10_unindexed
        const void*      b, \
        const void*      beta, \
              void*      c, inc_t rs_c0, inc_t cs_c0, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      )
 {
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
index 88d7eb4bd..e689918c4 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
@@ -53,7 +53,7 @@ void bli_sgemm_armsve_asm_2vx10_unindexed
        const void*      b, \
        const void*      beta, \
              void*      c, inc_t rs_c0, inc_t cs_c0, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      )
 {
diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
index 812e92e20..dbe968009 100644
--- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c
@@ -53,7 +53,7 @@ void bli_zgemm_armsve_asm_2vx10_unindexed
        const void*      b, \
        const void*      beta, \
              void*      c, inc_t rs_c0, inc_t cs_c0, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      )
 {
diff --git a/kernels/armsve/bli_kernels_armsve.h b/kernels/armsve/bli_kernels_armsve.h
index 00e1f0455..46a39e859 100644
--- a/kernels/armsve/bli_kernels_armsve.h
+++ b/kernels/armsve/bli_kernels_armsve.h
@@ -46,8 +46,6 @@ GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx10_unindexed )
 
 // Use SVE intrinsics only for referred cases.
 #if !defined(BLIS_FAMILY_A64FX)
-PACKM_KER_PROT( double,   d, packm_armsve256_int_8xk )
-PACKM_KER_PROT( double,   d, packm_armsve512_int_12xk )
+PACKM_KER_PROT( double,   d, packm_armsve256_int_8x10 )
 #endif
-PACKM_KER_PROT( double,   d, packm_armsve512_asm_16xk )
-PACKM_KER_PROT( double,   d, packm_armsve512_asm_10xk )
+PACKM_KER_PROT( double,   d, packm_armsve512_asm_16x10 )
diff --git a/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c
index 50d0dfcf3..c194074dc 100644
--- a/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c
+++ b/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c
@@ -38,7 +38,7 @@
 #define GENTPROT( ctype, ch, opname, suf ) \
 \
 extern \
-void PASTEMAC2(ch,opname,suf) \
+void PASTEMAC(ch,opname,suf) \
      ( \
              uint32_t   k, \
        const ctype*     alpha, \
@@ -46,7 +46,7 @@ void PASTEMAC2(ch,opname,suf) \
        const ctype*     b, \
        const ctype*     beta, \
              ctype*     c, uint32_t rs_c, uint32_t cs_c, \
-             auxinfo_t* data  \
+       const auxinfo_t* data  \
      );
 
 GENTPROT( float,    s, gemm_armv7a_ker_, 4x4 )
@@ -67,7 +67,7 @@ void bli_sgemm_armv7a_asm_4x4
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -89,7 +89,7 @@ void bli_dgemm_armv7a_asm_4x4
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -111,7 +111,7 @@ void bli_cgemm_armv7a_asm_2x2
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -132,7 +132,7 @@ void bli_zgemm_armv7a_asm_2x2
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
index b37d85399..9561767da 100644
--- a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
+++ b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
@@ -45,7 +45,7 @@ void bli_sgemm_armv7a_int_4x4
        const void*      b0,
        const void*      beta0,
              void*      c0, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -257,7 +257,7 @@ void bli_dgemm_armv7a_int_4x4
        const void*      b_,
        const void*      beta_,
              void*      c_, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_d6x8.c
similarity index 63%
rename from kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c
rename to kernels/armv8a/1m/bli_packm_armv8a_int_d6x8.c
index b177c7b28..4242e4efd 100644
--- a/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c
+++ b/kernels/armv8a/1m/bli_packm_armv8a_int_d6x8.c
@@ -47,21 +47,25 @@
 #define PRAGMA_UNROLL_2
 #endif
 
-void bli_dpackm_armv8a_int_8xk
+void bli_dpackm_armv8a_int_6x8
      (
              conj_t  conja,
              pack_t  schema,
              dim_t   cdim0,
+             dim_t   cdim_max,
+             dim_t   cdim_bcast,
              dim_t   k0,
              dim_t   k0_max,
        const void*   kappa,
        const void*   a, inc_t inca0, inc_t lda0,
              void*   p,              inc_t ldp0,
+       const void*   params,
        const cntx_t* cntx
      )
 {
   // This is the panel dimension assumed by the packm kernel.
-  const dim_t    mnr    = 8;
+  const dim_t    mr     = 6;
+  const dim_t    nr     = 8;
 
   // Typecast local copies of integers in case dim_t and inc_t are a
   // different size than is expected by load instructions.
@@ -92,7 +96,182 @@ void bli_dpackm_armv8a_int_8xk
 
   // -------------------------------------------------------------------------
 
-  if ( cdim0 == mnr && !gs )
+  if ( cdim0 == mr && cdim_bcast == 1 && !gs )
+  {
+    if ( unitk )
+    {
+      if ( inca == 1 )
+      {
+        // No need to use k-loops here.
+        // Simply let the compiler expand the loop.
+        PRAGMA_UNROLL_2
+        for ( dim_t ik = k_iter * 2 + k_left; ik > 0; --ik )
+        {
+          float64x2_t v0 = vld1q_f64( a_loc + 0 );
+          float64x2_t v1 = vld1q_f64( a_loc + 2 );
+          float64x2_t v2 = vld1q_f64( a_loc + 4 );
+
+          vst1q_f64( p_loc + 0, v0 );
+          vst1q_f64( p_loc + 2, v1 );
+          vst1q_f64( p_loc + 4, v2 );
+
+          a_loc += lda;
+          p_loc += ldp;
+        }
+      }
+      else // if ( lda == 1 )
+      {
+        float64x2_t v0 = (float64x2_t)vdupq_n_u64( 0 );
+        float64x2_t v1 = (float64x2_t)vdupq_n_u64( 0 );
+        float64x2_t v2 = (float64x2_t)vdupq_n_u64( 0 );
+        float64x2_t v3 = (float64x2_t)vdupq_n_u64( 0 );
+        float64x2_t v4 = (float64x2_t)vdupq_n_u64( 0 );
+        float64x2_t v5 = (float64x2_t)vdupq_n_u64( 0 );
+
+        PRAGMA_NOUNROLL
+        for ( ; k_iter > 0; --k_iter )
+        {
+          v0 = vld1q_f64( a_loc + inca * 0 );
+          v1 = vld1q_f64( a_loc + inca * 1 );
+          v2 = vld1q_f64( a_loc + inca * 2 );
+          v3 = vld1q_f64( a_loc + inca * 3 );
+          v4 = vld1q_f64( a_loc + inca * 4 );
+          v5 = vld1q_f64( a_loc + inca * 5 );
+
+          // In-register transpose.
+          float64x2_t vd0_1 = vtrn1q_f64( v0, v1 );
+          float64x2_t vd1_1 = vtrn1q_f64( v2, v3 );
+          float64x2_t vd2_1 = vtrn1q_f64( v4, v5 );
+          float64x2_t vd0_2 = vtrn2q_f64( v0, v1 );
+          float64x2_t vd1_2 = vtrn2q_f64( v2, v3 );
+          float64x2_t vd2_2 = vtrn2q_f64( v4, v5 );
+
+          vst1q_f64( p_loc + 0, vd0_1 );
+          vst1q_f64( p_loc + 2, vd1_1 );
+          vst1q_f64( p_loc + 4, vd2_1 );
+          p_loc += ldp;
+
+          vst1q_f64( p_loc + 0, vd0_2 );
+          vst1q_f64( p_loc + 2, vd1_2 );
+          vst1q_f64( p_loc + 4, vd2_2 );
+          p_loc += ldp;
+          a_loc += 2 * lda; // 2;
+        }
+        for ( ; k_left > 0; --k_left )
+        {
+          v0 = vld1q_lane_f64( a_loc + inca * 0, v0, 0 );
+          v0 = vld1q_lane_f64( a_loc + inca * 1, v0, 1 );
+          v1 = vld1q_lane_f64( a_loc + inca * 2, v1, 0 );
+          v1 = vld1q_lane_f64( a_loc + inca * 3, v1, 1 );
+          v2 = vld1q_lane_f64( a_loc + inca * 4, v2, 0 );
+          v2 = vld1q_lane_f64( a_loc + inca * 5, v2, 1 );
+
+          vst1q_f64( p_loc + 0, v0 );
+          vst1q_f64( p_loc + 2, v1 );
+          vst1q_f64( p_loc + 4, v2 );
+          p_loc += ldp;
+          a_loc += lda; // 1;
+        }
+      }
+    }
+    else // if ( !unitk )
+    {
+      float64x2_t vkappa = vld1q_dup_f64( kappa );
+
+      if ( inca == 1 )
+      {
+        // No need to use k-loops here.
+        // Simply let the compiler expand the loop.
+        PRAGMA_UNROLL_2
+        for ( dim_t ik = k_iter * 2 + k_left; ik > 0; --ik )
+        {
+          float64x2_t v0 = vld1q_f64( a_loc + 0 );
+          float64x2_t v1 = vld1q_f64( a_loc + 2 );
+          float64x2_t v2 = vld1q_f64( a_loc + 4 );
+
+          // Scale by kappa.
+          v0 = vmulq_f64( v0, vkappa );
+          v1 = vmulq_f64( v1, vkappa );
+          v2 = vmulq_f64( v2, vkappa );
+
+          vst1q_f64( p_loc + 0, v0 );
+          vst1q_f64( p_loc + 2, v1 );
+          vst1q_f64( p_loc + 4, v2 );
+
+          a_loc += lda;
+          p_loc += ldp;
+        }
+      }
+      else // if ( lda == 1 )
+      {
+        float64x2_t v0 = (float64x2_t)vdupq_n_u64( 0 );
+        float64x2_t v1 = (float64x2_t)vdupq_n_u64( 0 );
+        float64x2_t v2 = (float64x2_t)vdupq_n_u64( 0 );
+        float64x2_t v3 = (float64x2_t)vdupq_n_u64( 0 );
+        float64x2_t v4 = (float64x2_t)vdupq_n_u64( 0 );
+        float64x2_t v5 = (float64x2_t)vdupq_n_u64( 0 );
+
+        PRAGMA_NOUNROLL
+        for ( ; k_iter > 0; --k_iter )
+        {
+          v0 = vld1q_f64( a_loc + inca * 0 );
+          v1 = vld1q_f64( a_loc + inca * 1 );
+          v2 = vld1q_f64( a_loc + inca * 2 );
+          v3 = vld1q_f64( a_loc + inca * 3 );
+          v4 = vld1q_f64( a_loc + inca * 4 );
+          v5 = vld1q_f64( a_loc + inca * 5 );
+
+          // Scale by kappa.
+          v0 = vmulq_f64( v0, vkappa );
+          v1 = vmulq_f64( v1, vkappa );
+          v2 = vmulq_f64( v2, vkappa );
+          v3 = vmulq_f64( v3, vkappa );
+          v4 = vmulq_f64( v4, vkappa );
+          v5 = vmulq_f64( v5, vkappa );
+
+          // In-register transpose.
+          float64x2_t vd0_1 = vtrn1q_f64( v0, v1 );
+          float64x2_t vd1_1 = vtrn1q_f64( v2, v3 );
+          float64x2_t vd2_1 = vtrn1q_f64( v4, v5 );
+          float64x2_t vd0_2 = vtrn2q_f64( v0, v1 );
+          float64x2_t vd1_2 = vtrn2q_f64( v2, v3 );
+          float64x2_t vd2_2 = vtrn2q_f64( v4, v5 );
+
+          vst1q_f64( p_loc + 0, vd0_1 );
+          vst1q_f64( p_loc + 2, vd1_1 );
+          vst1q_f64( p_loc + 4, vd2_1 );
+          p_loc += ldp;
+
+          vst1q_f64( p_loc + 0, vd0_2 );
+          vst1q_f64( p_loc + 2, vd1_2 );
+          vst1q_f64( p_loc + 4, vd2_2 );
+          p_loc += ldp;
+          a_loc += 2 * lda; // 2;
+        }
+        for ( ; k_left > 0; --k_left )
+        {
+          v0 = vld1q_lane_f64( a_loc + inca * 0, v0, 0 );
+          v0 = vld1q_lane_f64( a_loc + inca * 1, v0, 1 );
+          v1 = vld1q_lane_f64( a_loc + inca * 2, v1, 0 );
+          v1 = vld1q_lane_f64( a_loc + inca * 3, v1, 1 );
+          v2 = vld1q_lane_f64( a_loc + inca * 4, v2, 0 );
+          v2 = vld1q_lane_f64( a_loc + inca * 5, v2, 1 );
+
+          // Scale by kappa.
+          v0 = vmulq_f64( v0, vkappa );
+          v1 = vmulq_f64( v1, vkappa );
+          v2 = vmulq_f64( v2, vkappa );
+
+          vst1q_f64( p_loc + 0, v0 );
+          vst1q_f64( p_loc + 2, v1 );
+          vst1q_f64( p_loc + 4, v2 );
+          p_loc += ldp;
+          a_loc += lda; // 1;
+        }
+      }
+    }
+  }
+  else if ( cdim0 == nr && cdim_bcast == 1 && !gs )
   {
     if ( unitk )
     {
@@ -297,58 +476,24 @@ void bli_dpackm_armv8a_int_8xk
       }
     }
   }
-  else // if ( cdim0 < mnr || gs )
-  {
-    PASTEMAC(dscal2m,BLIS_TAPI_EX_SUF)
-    (
-      0,
-      BLIS_NONUNIT_DIAG,
-      BLIS_DENSE,
-      ( trans_t )conja,
-      cdim0,
-      k0,
-      kappa,
-      a, inca0, lda0,
-      p,     1, ldp0,
-      cntx,
-      NULL
-    );
-
-    if ( cdim0 < mnr )
-    {
-      // Handle zero-filling along the "long" edge of the micropanel.
-
-      const dim_t      i      = cdim0;
-      const dim_t      m_edge = mnr - cdim0;
-      const dim_t      n_edge = k0_max;
-      double* restrict p_edge = ( double* )p + (i  )*1;
-
-      bli_dset0s_mxn
-      (
-        m_edge,
-        n_edge,
-        p_edge, 1, ldp
-      );
-    }
-  }
-
-//bli_dfprintm( stdout, "packm 8xk ker: a_packed", cdim0, k0_max, p, 1, ldp0, "%5.2f", "" );
-
-  if ( k0 < k0_max )
-  {
-    // Handle zero-filling along the "short" (far) edge of the micropanel.
-
-    const dim_t      j      = k0;
-    const dim_t      m_edge = mnr;
-    const dim_t      n_edge = k0_max - k0;
-    double* restrict p_edge = ( double* )p + (j  )*ldp;
-
-    bli_dset0s_mxn
-    (
-      m_edge,
-      n_edge,
-      p_edge, 1, ldp
-    );
-  }
+	else
+	{
+		bli_dscal2bbs_mxn
+		(
+		  conja,
+		  cdim0,
+		  k0,
+		  kappa,
+		  a,       inca, lda,
+		  p, cdim_bcast, ldp
+		);
+	}
+
+	bli_dset0s_edge
+	(
+	  cdim0*cdim_bcast, cdim_max*cdim_bcast,
+	  k0, k0_max,
+	  p, ldp
+	);
 }
 
diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c
deleted file mode 100644
index 7ceaa726a..000000000
--- a/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c
+++ /dev/null
@@ -1,324 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Linaro Limited
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include <arm_neon.h>
-
-#if defined(__clang__)
-#define PRAGMA_NOUNROLL _Pragma("nounroll")
-#define PRAGMA_UNROLL_2 _Pragma("unroll 2")
-#elif defined(__GNUC__)
-#define PRAGMA_NOUNROLL _Pragma("GCC unroll 1")
-#define PRAGMA_UNROLL_2 _Pragma("GCC unroll 2")
-#else
-#define PRAGMA_NOUNROLL
-#define PRAGMA_UNROLL_2
-#endif
-
-void bli_dpackm_armv8a_int_6xk
-     (
-             conj_t  conja,
-             pack_t  schema,
-             dim_t   cdim0,
-             dim_t   k0,
-             dim_t   k0_max,
-       const void*   kappa,
-       const void*   a, inc_t inca0, inc_t lda0,
-             void*   p,              inc_t ldp0,
-       const cntx_t* cntx
-     )
-{
-  // This is the panel dimension assumed by the packm kernel.
-  const dim_t    mnr    = 6;
-
-  // Typecast local copies of integers in case dim_t and inc_t are a
-  // different size than is expected by load instructions.
-  uint64_t       k_iter = k0 / 2;
-  uint64_t       k_left = k0 % 2;
-
-  const double*  a_loc  = a;
-        double*  p_loc  = p;
-
-  // NOTE: For the purposes of the comments in this packm kernel, we
-  // interpret inca and lda as rs_a and cs_a, respectively, and similarly
-  // interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
-  // this packm kernel, you should think of the operation as packing an
-  // m x n micropanel, where m and n are tiny and large, respectively, and
-  // where elements of each column of the packed matrix P are contiguous.
-  // (This packm kernel can still be used to pack micropanels of matrix B
-  // in a gemm operation.)
-  const uint64_t inca   = inca0;
-  const uint64_t lda    = lda0;
-  const uint64_t ldp    = ldp0;
-
-  const bool     gs     = ( inca0 != 1 && lda0 != 1 );
-
-  // NOTE: If/when this kernel ever supports scaling by kappa within the
-  // assembly region, this constraint should be lifted.
-  const bool     unitk  = bli_deq1( *(( double* )kappa) );
-
-
-  // -------------------------------------------------------------------------
-
-  if ( cdim0 == mnr && !gs )
-  {
-    if ( unitk )
-    {
-      if ( inca == 1 )
-      {
-        // No need to use k-loops here.
-        // Simply let compiler to expand loops.
-        PRAGMA_UNROLL_2
-        for ( dim_t ik = k_iter * 2 + k_left; ik > 0; --ik )
-        {
-          float64x2_t v0 = vld1q_f64( a_loc + 0 );
-          float64x2_t v1 = vld1q_f64( a_loc + 2 );
-          float64x2_t v2 = vld1q_f64( a_loc + 4 );
-
-          vst1q_f64( p_loc + 0, v0 );
-          vst1q_f64( p_loc + 2, v1 );
-          vst1q_f64( p_loc + 4, v2 );
-
-          a_loc += lda;
-          p_loc += ldp;
-        }
-      }
-      else // if ( lda == 1 )
-      {
-        float64x2_t v0 = (float64x2_t)vdupq_n_u64( 0 );
-        float64x2_t v1 = (float64x2_t)vdupq_n_u64( 0 );
-        float64x2_t v2 = (float64x2_t)vdupq_n_u64( 0 );
-        float64x2_t v3 = (float64x2_t)vdupq_n_u64( 0 );
-        float64x2_t v4 = (float64x2_t)vdupq_n_u64( 0 );
-        float64x2_t v5 = (float64x2_t)vdupq_n_u64( 0 );
-
-        PRAGMA_NOUNROLL
-        for ( ; k_iter > 0; --k_iter )
-        {
-          v0 = vld1q_f64( a_loc + inca * 0 );
-          v1 = vld1q_f64( a_loc + inca * 1 );
-          v2 = vld1q_f64( a_loc + inca * 2 );
-          v3 = vld1q_f64( a_loc + inca * 3 );
-          v4 = vld1q_f64( a_loc + inca * 4 );
-          v5 = vld1q_f64( a_loc + inca * 5 );
-
-          // In-register transpose.
-          float64x2_t vd0_1 = vtrn1q_f64( v0, v1 );
-          float64x2_t vd1_1 = vtrn1q_f64( v2, v3 );
-          float64x2_t vd2_1 = vtrn1q_f64( v4, v5 );
-          float64x2_t vd0_2 = vtrn2q_f64( v0, v1 );
-          float64x2_t vd1_2 = vtrn2q_f64( v2, v3 );
-          float64x2_t vd2_2 = vtrn2q_f64( v4, v5 );
-
-          vst1q_f64( p_loc + 0, vd0_1 );
-          vst1q_f64( p_loc + 2, vd1_1 );
-          vst1q_f64( p_loc + 4, vd2_1 );
-          p_loc += ldp;
-
-          vst1q_f64( p_loc + 0, vd0_2 );
-          vst1q_f64( p_loc + 2, vd1_2 );
-          vst1q_f64( p_loc + 4, vd2_2 );
-          p_loc += ldp;
-          a_loc += 2 * lda; // 2;
-        }
-        for ( ; k_left > 0; --k_left )
-        {
-          v0 = vld1q_lane_f64( a_loc + inca * 0, v0, 0 );
-          v0 = vld1q_lane_f64( a_loc + inca * 1, v0, 1 );
-          v1 = vld1q_lane_f64( a_loc + inca * 2, v1, 0 );
-          v1 = vld1q_lane_f64( a_loc + inca * 3, v1, 1 );
-          v2 = vld1q_lane_f64( a_loc + inca * 4, v2, 0 );
-          v2 = vld1q_lane_f64( a_loc + inca * 5, v2, 1 );
-
-          vst1q_f64( p_loc + 0, v0 );
-          vst1q_f64( p_loc + 2, v1 );
-          vst1q_f64( p_loc + 4, v2 );
-          p_loc += ldp;
-          a_loc += lda; // 1;
-        }
-      }
-    }
-    else // if ( !unitk )
-    {
-      float64x2_t vkappa = vld1q_dup_f64( kappa );
-
-      if ( inca == 1 )
-      {
-        // No need to use k-loops here.
-        // Simply let compiler to expand loops.
-        PRAGMA_UNROLL_2
-        for ( dim_t ik = k_iter * 2 + k_left; ik > 0; --ik )
-        {
-          float64x2_t v0 = vld1q_f64( a_loc + 0 );
-          float64x2_t v1 = vld1q_f64( a_loc + 2 );
-          float64x2_t v2 = vld1q_f64( a_loc + 4 );
-
-          // Scale by kappa.
-          v0 = vmulq_f64( v0, vkappa );
-          v1 = vmulq_f64( v1, vkappa );
-          v2 = vmulq_f64( v2, vkappa );
-
-          vst1q_f64( p_loc + 0, v0 );
-          vst1q_f64( p_loc + 2, v1 );
-          vst1q_f64( p_loc + 4, v2 );
-
-          a_loc += lda;
-          p_loc += ldp;
-        }
-      }
-      else // if ( lda == 1 )
-      {
-        float64x2_t v0 = (float64x2_t)vdupq_n_u64( 0 );
-        float64x2_t v1 = (float64x2_t)vdupq_n_u64( 0 );
-        float64x2_t v2 = (float64x2_t)vdupq_n_u64( 0 );
-        float64x2_t v3 = (float64x2_t)vdupq_n_u64( 0 );
-        float64x2_t v4 = (float64x2_t)vdupq_n_u64( 0 );
-        float64x2_t v5 = (float64x2_t)vdupq_n_u64( 0 );
-
-        PRAGMA_NOUNROLL
-        for ( ; k_iter > 0; --k_iter )
-        {
-          v0 = vld1q_f64( a_loc + inca * 0 );
-          v1 = vld1q_f64( a_loc + inca * 1 );
-          v2 = vld1q_f64( a_loc + inca * 2 );
-          v3 = vld1q_f64( a_loc + inca * 3 );
-          v4 = vld1q_f64( a_loc + inca * 4 );
-          v5 = vld1q_f64( a_loc + inca * 5 );
-
-          // Scale by kappa.
-          v0 = vmulq_f64( v0, vkappa );
-          v1 = vmulq_f64( v1, vkappa );
-          v2 = vmulq_f64( v2, vkappa );
-          v3 = vmulq_f64( v3, vkappa );
-          v4 = vmulq_f64( v4, vkappa );
-          v5 = vmulq_f64( v5, vkappa );
-
-          // In-register transpose.
-          float64x2_t vd0_1 = vtrn1q_f64( v0, v1 );
-          float64x2_t vd1_1 = vtrn1q_f64( v2, v3 );
-          float64x2_t vd2_1 = vtrn1q_f64( v4, v5 );
-          float64x2_t vd0_2 = vtrn2q_f64( v0, v1 );
-          float64x2_t vd1_2 = vtrn2q_f64( v2, v3 );
-          float64x2_t vd2_2 = vtrn2q_f64( v4, v5 );
-
-          vst1q_f64( p_loc + 0, vd0_1 );
-          vst1q_f64( p_loc + 2, vd1_1 );
-          vst1q_f64( p_loc + 4, vd2_1 );
-          p_loc += ldp;
-
-          vst1q_f64( p_loc + 0, vd0_2 );
-          vst1q_f64( p_loc + 2, vd1_2 );
-          vst1q_f64( p_loc + 4, vd2_2 );
-          p_loc += ldp;
-          a_loc += 2 * lda; // 2;
-        }
-        for ( ; k_left > 0; --k_left )
-        {
-          v0 = vld1q_lane_f64( a_loc + inca * 0, v0, 0 );
-          v0 = vld1q_lane_f64( a_loc + inca * 1, v0, 1 );
-          v1 = vld1q_lane_f64( a_loc + inca * 2, v1, 0 );
-          v1 = vld1q_lane_f64( a_loc + inca * 3, v1, 1 );
-          v2 = vld1q_lane_f64( a_loc + inca * 4, v2, 0 );
-          v2 = vld1q_lane_f64( a_loc + inca * 5, v2, 1 );
-
-          // Scale by kappa.
-          v0 = vmulq_f64( v0, vkappa );
-          v1 = vmulq_f64( v1, vkappa );
-          v2 = vmulq_f64( v2, vkappa );
-
-          vst1q_f64( p_loc + 0, v0 );
-          vst1q_f64( p_loc + 2, v1 );
-          vst1q_f64( p_loc + 4, v2 );
-          p_loc += ldp;
-          a_loc += lda; // 1;
-        }
-      }
-    }
-  }
-  else // if ( cdim0 < mnr || gs )
-  {
-    PASTEMAC(dscal2m,BLIS_TAPI_EX_SUF)
-    (
-      0,
-      BLIS_NONUNIT_DIAG,
-      BLIS_DENSE,
-      ( trans_t )conja,
-      cdim0,
-      k0,
-      kappa,
-      a, inca0, lda0,
-      p,     1, ldp0,
-      cntx,
-      NULL
-    );
-
-    if ( cdim0 < mnr )
-    {
-      // Handle zero-filling along the "long" edge of the micropanel.
-
-      const dim_t      i      = cdim0;
-      const dim_t      m_edge = mnr - cdim0;
-      const dim_t      n_edge = k0_max;
-      double* restrict p_edge = ( double* )p + (i  )*1;
-
-      bli_dset0s_mxn
-      (
-        m_edge,
-        n_edge,
-        p_edge, 1, ldp
-      );
-    }
-  }
-
-//bli_dfprintm( stdout, "packm 6xk ker: a_packed", cdim0, k0_max, p, 1, ldp0, "%5.2f", "" );
-
-  if ( k0 < k0_max )
-  {
-    // Handle zero-filling along the "short" (far) edge of the micropanel.
-
-    const dim_t      j      = k0;
-    const dim_t      m_edge = mnr;
-    const dim_t      n_edge = k0_max - k0;
-    double* restrict p_edge = ( double* )p + (j  )*ldp;
-
-    bli_dset0s_mxn
-    (
-      m_edge,
-      n_edge,
-      p_edge, 1, ldp
-    );
-  }
-}
-
diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_s8x12.c
similarity index 63%
rename from kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c
rename to kernels/armv8a/1m/bli_packm_armv8a_int_s8x12.c
index 3b6b38181..df1e6178f 100644
--- a/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c
+++ b/kernels/armv8a/1m/bli_packm_armv8a_int_s8x12.c
@@ -39,29 +39,36 @@
 #if defined(__clang__)
 #define PRAGMA_NOUNROLL _Pragma("nounroll")
 #define PRAGMA_UNROLL_2 _Pragma("unroll 2")
+#define PRAGMA_UNROLL_4 _Pragma("unroll 4")
 #elif defined(__GNUC__)
 #define PRAGMA_NOUNROLL _Pragma("GCC unroll 1")
 #define PRAGMA_UNROLL_2 _Pragma("GCC unroll 2")
+#define PRAGMA_UNROLL_4 _Pragma("GCC unroll 4")
 #else
 #define PRAGMA_NOUNROLL
 #define PRAGMA_UNROLL_2
+#define PRAGMA_UNROLL_4
 #endif
 
-void bli_spackm_armv8a_int_12xk
+void bli_spackm_armv8a_int_8x12
      (
              conj_t  conja,
              pack_t  schema,
              dim_t   cdim0,
+             dim_t   cdim_max,
+             dim_t   cdim_bcast,
              dim_t   k0,
              dim_t   k0_max,
        const void*   kappa,
        const void*   a, inc_t inca0, inc_t lda0,
              void*   p,              inc_t ldp0,
+       const void*   params,
        const cntx_t* cntx
      )
 {
   // This is the panel dimension assumed by the packm kernel.
-  const dim_t    mnr    = 12;
+  const dim_t    mr     = 8;
+  const dim_t    nr     = 12;
 
   // Typecast local copies of integers in case dim_t and inc_t are a
   // different size than is expected by load instructions.
@@ -92,7 +99,234 @@ void bli_spackm_armv8a_int_12xk
 
   // -------------------------------------------------------------------------
 
-  if ( cdim0 == mnr && !gs )
+  if ( cdim0 == mr && cdim_bcast == 1 && !gs )
+  {
+    if ( unitk )
+    {
+      if ( inca == 1 )
+      {
+        // No need to use k-loops here.
+        // Simply let compiler to expand loops.
+        PRAGMA_UNROLL_4
+        for ( dim_t ik = k_iter * 4 + k_left; ik > 0; --ik )
+        {
+          float32x4_t v0 = vld1q_f32( a_loc +  0 );
+          float32x4_t v1 = vld1q_f32( a_loc +  4 );
+
+          vst1q_f32( p_loc +  0, v0 );
+          vst1q_f32( p_loc +  4, v1 );
+
+          a_loc += lda;
+          p_loc += ldp;
+        }
+      }
+      else // if ( lda == 1 )
+      {
+        float32x4_t v0  = (float32x4_t)vdupq_n_u32( 0 );
+        float32x4_t v1  = (float32x4_t)vdupq_n_u32( 0 );
+        float32x4_t v2  = (float32x4_t)vdupq_n_u32( 0 );
+        float32x4_t v3  = (float32x4_t)vdupq_n_u32( 0 );
+        float32x4_t v4  = (float32x4_t)vdupq_n_u32( 0 );
+        float32x4_t v5  = (float32x4_t)vdupq_n_u32( 0 );
+        float32x4_t v6  = (float32x4_t)vdupq_n_u32( 0 );
+        float32x4_t v7  = (float32x4_t)vdupq_n_u32( 0 );
+        float32x4_t vt0;
+        float32x4_t vt1;
+        float32x4_t vt2;
+        float32x4_t vt3;
+
+        PRAGMA_NOUNROLL
+        for ( ; k_iter > 0; --k_iter )
+        {
+          v0 = vld1q_f32( a_loc + inca * 0 );
+          v1 = vld1q_f32( a_loc + inca * 1 );
+          v2 = vld1q_f32( a_loc + inca * 2 );
+          v3 = vld1q_f32( a_loc + inca * 3 );
+          v4 = vld1q_f32( a_loc + inca * 4 );
+          v5 = vld1q_f32( a_loc + inca * 5 );
+          v6 = vld1q_f32( a_loc + inca * 6 );
+          v7 = vld1q_f32( a_loc + inca * 7 );
+
+          // In-register transpose.
+          //
+          // Column 0-3
+          vt0 = vtrn1q_f32( v0, v1 );
+          vt1 = vtrn2q_f32( v0, v1 );
+          vt2 = vtrn1q_f32( v2, v3 );
+          vt3 = vtrn2q_f32( v2, v3 );
+          v0 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
+          v1 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
+          v2 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
+          v3 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
+          // Column 4-7
+          vt0 = vtrn1q_f32( v4, v5 );
+          vt1 = vtrn2q_f32( v4, v5 );
+          vt2 = vtrn1q_f32( v6, v7 );
+          vt3 = vtrn2q_f32( v6, v7 );
+          v4 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
+          v5 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
+          v6 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
+          v7 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
+
+          vst1q_f32( p_loc + 0,  v0  );
+          vst1q_f32( p_loc + 4,  v4  );
+          p_loc += ldp;
+
+          vst1q_f32( p_loc + 0,  v1  );
+          vst1q_f32( p_loc + 4,  v5  );
+          p_loc += ldp;
+
+          vst1q_f32( p_loc + 0,  v2  );
+          vst1q_f32( p_loc + 4,  v6  );
+          p_loc += ldp;
+
+          vst1q_f32( p_loc + 0,  v3  );
+          vst1q_f32( p_loc + 4,  v7  );
+          p_loc += ldp;
+          a_loc += 4 * lda; // 4;
+        }
+        for ( ; k_left > 0; --k_left )
+        {
+          v0 = vld1q_lane_f32( a_loc + inca * 0 , v0, 0 );
+          v0 = vld1q_lane_f32( a_loc + inca * 1 , v0, 1 );
+          v0 = vld1q_lane_f32( a_loc + inca * 2 , v0, 2 );
+          v0 = vld1q_lane_f32( a_loc + inca * 3 , v0, 3 );
+          v1 = vld1q_lane_f32( a_loc + inca * 4 , v1, 0 );
+          v1 = vld1q_lane_f32( a_loc + inca * 5 , v1, 1 );
+          v1 = vld1q_lane_f32( a_loc + inca * 6 , v1, 2 );
+          v1 = vld1q_lane_f32( a_loc + inca * 7 , v1, 3 );
+
+          vst1q_f32( p_loc + 0,  v0 );
+          vst1q_f32( p_loc + 4,  v1 );
+          p_loc += ldp;
+          a_loc += lda; // 1;
+        }
+      }
+    }
+    else // if ( !unitk )
+    {
+      float32x4_t vkappa = vld1q_dup_f32( kappa );
+
+      if ( inca == 1 )
+      {
+        // No need to use k-loops here.
+        // Simply let compiler to expand loops.
+        PRAGMA_UNROLL_4
+        for ( dim_t ik = k_iter * 4 + k_left; ik > 0; --ik )
+        {
+          float32x4_t v0 = vld1q_f32( a_loc + 0 );
+          float32x4_t v1 = vld1q_f32( a_loc + 4 );
+
+          // Scale by kappa.
+          v0 = vmulq_f32( v0, vkappa );
+          v1 = vmulq_f32( v1, vkappa );
+
+          vst1q_f32( p_loc + 0, v0 );
+          vst1q_f32( p_loc + 4, v1 );
+
+          a_loc += lda;
+          p_loc += ldp;
+        }
+      }
+      else // if ( lda == 1 )
+      {
+        float32x4_t v0  = (float32x4_t)vdupq_n_u32( 0 );
+        float32x4_t v1  = (float32x4_t)vdupq_n_u32( 0 );
+        float32x4_t v2  = (float32x4_t)vdupq_n_u32( 0 );
+        float32x4_t v3  = (float32x4_t)vdupq_n_u32( 0 );
+        float32x4_t v4  = (float32x4_t)vdupq_n_u32( 0 );
+        float32x4_t v5  = (float32x4_t)vdupq_n_u32( 0 );
+        float32x4_t v6  = (float32x4_t)vdupq_n_u32( 0 );
+        float32x4_t v7  = (float32x4_t)vdupq_n_u32( 0 );
+        float32x4_t vt0;
+        float32x4_t vt1;
+        float32x4_t vt2;
+        float32x4_t vt3;
+
+        PRAGMA_NOUNROLL
+        for ( ; k_iter > 0; --k_iter )
+        {
+          v0  = vld1q_f32( a_loc + inca * 0  );
+          v1  = vld1q_f32( a_loc + inca * 1  );
+          v2  = vld1q_f32( a_loc + inca * 2  );
+          v3  = vld1q_f32( a_loc + inca * 3  );
+          v4  = vld1q_f32( a_loc + inca * 4  );
+          v5  = vld1q_f32( a_loc + inca * 5  );
+          v6  = vld1q_f32( a_loc + inca * 6  );
+          v7  = vld1q_f32( a_loc + inca * 7  );
+
+          // Scale by kappa.
+          v0  = vmulq_f32( v0,  vkappa );
+          v1  = vmulq_f32( v1,  vkappa );
+          v2  = vmulq_f32( v2,  vkappa );
+          v3  = vmulq_f32( v3,  vkappa );
+          v4  = vmulq_f32( v4,  vkappa );
+          v5  = vmulq_f32( v5,  vkappa );
+          v6  = vmulq_f32( v6,  vkappa );
+          v7  = vmulq_f32( v7,  vkappa );
+
+          // In-register transpose.
+          //
+          // Column 0-3
+          vt0 = vtrn1q_f32( v0, v1 );
+          vt1 = vtrn2q_f32( v0, v1 );
+          vt2 = vtrn1q_f32( v2, v3 );
+          vt3 = vtrn2q_f32( v2, v3 );
+          v0 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
+          v1 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
+          v2 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
+          v3 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
+          // Column 4-7
+          vt0 = vtrn1q_f32( v4, v5 );
+          vt1 = vtrn2q_f32( v4, v5 );
+          vt2 = vtrn1q_f32( v6, v7 );
+          vt3 = vtrn2q_f32( v6, v7 );
+          v4 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
+          v5 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
+          v6 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
+          v7 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
+
+          vst1q_f32( p_loc + 0,  v0  );
+          vst1q_f32( p_loc + 4,  v4  );
+          p_loc += ldp;
+
+          vst1q_f32( p_loc + 0,  v1  );
+          vst1q_f32( p_loc + 4,  v5  );
+          p_loc += ldp;
+
+          vst1q_f32( p_loc + 0,  v2  );
+          vst1q_f32( p_loc + 4,  v6  );
+          p_loc += ldp;
+
+          vst1q_f32( p_loc + 0,  v3  );
+          vst1q_f32( p_loc + 4,  v7  );
+          p_loc += ldp;
+          a_loc += 4 * lda; // 4;
+        }
+        for ( ; k_left > 0; --k_left )
+        {
+          v0 = vld1q_lane_f32( a_loc + inca * 0 , v0, 0 );
+          v0 = vld1q_lane_f32( a_loc + inca * 1 , v0, 1 );
+          v0 = vld1q_lane_f32( a_loc + inca * 2 , v0, 2 );
+          v0 = vld1q_lane_f32( a_loc + inca * 3 , v0, 3 );
+          v1 = vld1q_lane_f32( a_loc + inca * 4 , v1, 0 );
+          v1 = vld1q_lane_f32( a_loc + inca * 5 , v1, 1 );
+          v1 = vld1q_lane_f32( a_loc + inca * 6 , v1, 2 );
+          v1 = vld1q_lane_f32( a_loc + inca * 7 , v1, 3 );
+
+          // Scale by kappa.
+          v0 = vmulq_f32( v0, vkappa );
+          v1 = vmulq_f32( v1, vkappa );
+
+          vst1q_f32( p_loc + 0,  v0 );
+          vst1q_f32( p_loc + 4,  v1 );
+          p_loc += ldp;
+          a_loc += lda; // 1;
+        }
+      }
+    }
+  }
+  else if ( cdim0 == nr && cdim_bcast == 1 && !gs )
   {
     if ( unitk )
     {
@@ -381,56 +615,24 @@ void bli_spackm_armv8a_int_12xk
       }
     }
   }
-  else // if ( cdim0 < mnr || gs )
-  {
-    PASTEMAC(sscal2m,BLIS_TAPI_EX_SUF)
-    (
-      0,
-      BLIS_NONUNIT_DIAG,
-      BLIS_DENSE,
-      ( trans_t )conja,
-      cdim0,
-      k0,
-      kappa,
-      a, inca0, lda0,
-      p,     1, ldp0,
-      cntx,
-      NULL
-    );
-
-    if ( cdim0 < mnr )
-    {
-      // Handle zero-filling along the "long" edge of the micropanel.
-
-      const dim_t     i      = cdim0;
-      const dim_t     m_edge = mnr - cdim0;
-      const dim_t     n_edge = k0_max;
-      float* restrict p_edge = ( float* )p + (i  )*1;
-
-      bli_sset0s_mxn
-      (
-        m_edge,
-        n_edge,
-        p_edge, 1, ldp
-      );
-    }
-  }
-
-  if ( k0 < k0_max )
-  {
-    // Handle zero-filling along the "short" (far) edge of the micropanel.
-
-    const dim_t     j      = k0;
-    const dim_t     m_edge = mnr;
-    const dim_t     n_edge = k0_max - k0;
-    float* restrict p_edge = ( float* )p + (j  )*ldp;
-
-    bli_sset0s_mxn
-    (
-      m_edge,
-      n_edge,
-      p_edge, 1, ldp
-    );
-  }
+	else
+	{
+		bli_sscal2bbs_mxn
+		(
+		  conja,
+		  cdim0,
+		  k0,
+		  kappa,
+		  a,       inca, lda,
+		  p, cdim_bcast, ldp
+		);
+	}
+
+	bli_sset0s_edge
+	(
+	  cdim0*cdim_bcast, cdim_max*cdim_bcast,
+	  k0, k0_max,
+	  p, ldp
+	);
 }
 
diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c
deleted file mode 100644
index c0d31b35d..000000000
--- a/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Linaro Limited
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include <arm_neon.h>
-
-#if defined(__clang__)
-#define PRAGMA_NOUNROLL _Pragma("nounroll")
-#define PRAGMA_UNROLL_4 _Pragma("unroll 4")
-#elif defined(__GNUC__)
-#define PRAGMA_NOUNROLL _Pragma("GCC unroll 1")
-#define PRAGMA_UNROLL_4 _Pragma("GCC unroll 4")
-#else
-#define PRAGMA_NOUNROLL
-#define PRAGMA_UNROLL_4
-#endif
-
-void bli_spackm_armv8a_int_8xk
-     (
-             conj_t  conja,
-             pack_t  schema,
-             dim_t   cdim0,
-             dim_t   k0,
-             dim_t   k0_max,
-       const void*   kappa,
-       const void*   a, inc_t inca0, inc_t lda0,
-             void*   p,              inc_t ldp0,
-       const cntx_t* cntx
-     )
-{
-  // This is the panel dimension assumed by the packm kernel.
-  const dim_t    mnr    = 8;
-
-  // Typecast local copies of integers in case dim_t and inc_t are a
-  // different size than is expected by load instructions.
-  uint64_t       k_iter = k0 / 4;
-  uint64_t       k_left = k0 % 4;
-
-  const float*   a_loc  = a;
-        float*   p_loc  = p;
-
-  // NOTE: For the purposes of the comments in this packm kernel, we
-  // interpret inca and lda as rs_a and cs_a, respectively, and similarly
-  // interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
-  // this packm kernel, you should think of the operation as packing an
-  // m x n micropanel, where m and n are tiny and large, respectively, and
-  // where elements of each column of the packed matrix P are contiguous.
-  // (This packm kernel can still be used to pack micropanels of matrix B
-  // in a gemm operation.)
-  const uint64_t inca   = inca0;
-  const uint64_t lda    = lda0;
-  const uint64_t ldp    = ldp0;
-
-  const bool     gs     = ( inca0 != 1 && lda0 != 1 );
-
-  // NOTE: If/when this kernel ever supports scaling by kappa within the
-  // assembly region, this constraint should be lifted.
-  const bool     unitk  = bli_seq1( *(( float* )kappa) );
-
-
-  // -------------------------------------------------------------------------
-
-  if ( cdim0 == mnr && !gs )
-  {
-    if ( unitk )
-    {
-      if ( inca == 1 )
-      {
-        // No need to use k-loops here.
-        // Simply let compiler to expand loops.
-        PRAGMA_UNROLL_4
-        for ( dim_t ik = k_iter * 4 + k_left; ik > 0; --ik )
-        {
-          float32x4_t v0 = vld1q_f32( a_loc +  0 );
-          float32x4_t v1 = vld1q_f32( a_loc +  4 );
-
-          vst1q_f32( p_loc +  0, v0 );
-          vst1q_f32( p_loc +  4, v1 );
-
-          a_loc += lda;
-          p_loc += ldp;
-        }
-      }
-      else // if ( lda == 1 )
-      {
-        float32x4_t v0  = (float32x4_t)vdupq_n_u32( 0 );
-        float32x4_t v1  = (float32x4_t)vdupq_n_u32( 0 );
-        float32x4_t v2  = (float32x4_t)vdupq_n_u32( 0 );
-        float32x4_t v3  = (float32x4_t)vdupq_n_u32( 0 );
-        float32x4_t v4  = (float32x4_t)vdupq_n_u32( 0 );
-        float32x4_t v5  = (float32x4_t)vdupq_n_u32( 0 );
-        float32x4_t v6  = (float32x4_t)vdupq_n_u32( 0 );
-        float32x4_t v7  = (float32x4_t)vdupq_n_u32( 0 );
-        float32x4_t vt0;
-        float32x4_t vt1;
-        float32x4_t vt2;
-        float32x4_t vt3;
-
-        PRAGMA_NOUNROLL
-        for ( ; k_iter > 0; --k_iter )
-        {
-          v0 = vld1q_f32( a_loc + inca * 0 );
-          v1 = vld1q_f32( a_loc + inca * 1 );
-          v2 = vld1q_f32( a_loc + inca * 2 );
-          v3 = vld1q_f32( a_loc + inca * 3 );
-          v4 = vld1q_f32( a_loc + inca * 4 );
-          v5 = vld1q_f32( a_loc + inca * 5 );
-          v6 = vld1q_f32( a_loc + inca * 6 );
-          v7 = vld1q_f32( a_loc + inca * 7 );
-
-          // In-register transpose.
-          //
-          // Column 0-3
-          vt0 = vtrn1q_f32( v0, v1 );
-          vt1 = vtrn2q_f32( v0, v1 );
-          vt2 = vtrn1q_f32( v2, v3 );
-          vt3 = vtrn2q_f32( v2, v3 );
-          v0 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
-          v1 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
-          v2 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
-          v3 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
-          // Column 4-7
-          vt0 = vtrn1q_f32( v4, v5 );
-          vt1 = vtrn2q_f32( v4, v5 );
-          vt2 = vtrn1q_f32( v6, v7 );
-          vt3 = vtrn2q_f32( v6, v7 );
-          v4 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
-          v5 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
-          v6 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
-          v7 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
-
-          vst1q_f32( p_loc + 0,  v0  );
-          vst1q_f32( p_loc + 4,  v4  );
-          p_loc += ldp;
-
-          vst1q_f32( p_loc + 0,  v1  );
-          vst1q_f32( p_loc + 4,  v5  );
-          p_loc += ldp;
-
-          vst1q_f32( p_loc + 0,  v2  );
-          vst1q_f32( p_loc + 4,  v6  );
-          p_loc += ldp;
-
-          vst1q_f32( p_loc + 0,  v3  );
-          vst1q_f32( p_loc + 4,  v7  );
-          p_loc += ldp;
-          a_loc += 4 * lda; // 4;
-        }
-        for ( ; k_left > 0; --k_left )
-        {
-          v0 = vld1q_lane_f32( a_loc + inca * 0 , v0, 0 );
-          v0 = vld1q_lane_f32( a_loc + inca * 1 , v0, 1 );
-          v0 = vld1q_lane_f32( a_loc + inca * 2 , v0, 2 );
-          v0 = vld1q_lane_f32( a_loc + inca * 3 , v0, 3 );
-          v1 = vld1q_lane_f32( a_loc + inca * 4 , v1, 0 );
-          v1 = vld1q_lane_f32( a_loc + inca * 5 , v1, 1 );
-          v1 = vld1q_lane_f32( a_loc + inca * 6 , v1, 2 );
-          v1 = vld1q_lane_f32( a_loc + inca * 7 , v1, 3 );
-
-          vst1q_f32( p_loc + 0,  v0 );
-          vst1q_f32( p_loc + 4,  v1 );
-          p_loc += ldp;
-          a_loc += lda; // 1;
-        }
-      }
-    }
-    else // if ( !unitk )
-    {
-      float32x4_t vkappa = vld1q_dup_f32( kappa );
-
-      if ( inca == 1 )
-      {
-        // No need to use k-loops here.
-        // Simply let compiler to expand loops.
-        PRAGMA_UNROLL_4
-        for ( dim_t ik = k_iter * 4 + k_left; ik > 0; --ik )
-        {
-          float32x4_t v0 = vld1q_f32( a_loc + 0 );
-          float32x4_t v1 = vld1q_f32( a_loc + 4 );
-
-          // Scale by kappa.
-          v0 = vmulq_f32( v0, vkappa );
-          v1 = vmulq_f32( v1, vkappa );
-
-          vst1q_f32( p_loc + 0, v0 );
-          vst1q_f32( p_loc + 4, v1 );
-
-          a_loc += lda;
-          p_loc += ldp;
-        }
-      }
-      else // if ( lda == 1 )
-      {
-        float32x4_t v0  = (float32x4_t)vdupq_n_u32( 0 );
-        float32x4_t v1  = (float32x4_t)vdupq_n_u32( 0 );
-        float32x4_t v2  = (float32x4_t)vdupq_n_u32( 0 );
-        float32x4_t v3  = (float32x4_t)vdupq_n_u32( 0 );
-        float32x4_t v4  = (float32x4_t)vdupq_n_u32( 0 );
-        float32x4_t v5  = (float32x4_t)vdupq_n_u32( 0 );
-        float32x4_t v6  = (float32x4_t)vdupq_n_u32( 0 );
-        float32x4_t v7  = (float32x4_t)vdupq_n_u32( 0 );
-        float32x4_t vt0;
-        float32x4_t vt1;
-        float32x4_t vt2;
-        float32x4_t vt3;
-
-        PRAGMA_NOUNROLL
-        for ( ; k_iter > 0; --k_iter )
-        {
-          v0  = vld1q_f32( a_loc + inca * 0  );
-          v1  = vld1q_f32( a_loc + inca * 1  );
-          v2  = vld1q_f32( a_loc + inca * 2  );
-          v3  = vld1q_f32( a_loc + inca * 3  );
-          v4  = vld1q_f32( a_loc + inca * 4  );
-          v5  = vld1q_f32( a_loc + inca * 5  );
-          v6  = vld1q_f32( a_loc + inca * 6  );
-          v7  = vld1q_f32( a_loc + inca * 7  );
-
-          // Scale by kappa.
-          v0  = vmulq_f32( v0,  vkappa );
-          v1  = vmulq_f32( v1,  vkappa );
-          v2  = vmulq_f32( v2,  vkappa );
-          v3  = vmulq_f32( v3,  vkappa );
-          v4  = vmulq_f32( v4,  vkappa );
-          v5  = vmulq_f32( v5,  vkappa );
-          v6  = vmulq_f32( v6,  vkappa );
-          v7  = vmulq_f32( v7,  vkappa );
-
-          // In-register transpose.
-          //
-          // Column 0-3
-          vt0 = vtrn1q_f32( v0, v1 );
-          vt1 = vtrn2q_f32( v0, v1 );
-          vt2 = vtrn1q_f32( v2, v3 );
-          vt3 = vtrn2q_f32( v2, v3 );
-          v0 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
-          v1 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
-          v2 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
-          v3 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
-          // Column 4-7
-          vt0 = vtrn1q_f32( v4, v5 );
-          vt1 = vtrn2q_f32( v4, v5 );
-          vt2 = vtrn1q_f32( v6, v7 );
-          vt3 = vtrn2q_f32( v6, v7 );
-          v4 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
-          v5 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
-          v6 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
-          v7 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
-
-          vst1q_f32( p_loc + 0,  v0  );
-          vst1q_f32( p_loc + 4,  v4  );
-          p_loc += ldp;
-
-          vst1q_f32( p_loc + 0,  v1  );
-          vst1q_f32( p_loc + 4,  v5  );
-          p_loc += ldp;
-
-          vst1q_f32( p_loc + 0,  v2  );
-          vst1q_f32( p_loc + 4,  v6  );
-          p_loc += ldp;
-
-          vst1q_f32( p_loc + 0,  v3  );
-          vst1q_f32( p_loc + 4,  v7  );
-          p_loc += ldp;
-          a_loc += 4 * lda; // 4;
-        }
-        for ( ; k_left > 0; --k_left )
-        {
-          v0 = vld1q_lane_f32( a_loc + inca * 0 , v0, 0 );
-          v0 = vld1q_lane_f32( a_loc + inca * 1 , v0, 1 );
-          v0 = vld1q_lane_f32( a_loc + inca * 2 , v0, 2 );
-          v0 = vld1q_lane_f32( a_loc + inca * 3 , v0, 3 );
-          v1 = vld1q_lane_f32( a_loc + inca * 4 , v1, 0 );
-          v1 = vld1q_lane_f32( a_loc + inca * 5 , v1, 1 );
-          v1 = vld1q_lane_f32( a_loc + inca * 6 , v1, 2 );
-          v1 = vld1q_lane_f32( a_loc + inca * 7 , v1, 3 );
-
-          // Scale by kappa.
-          v0 = vmulq_f32( v0, vkappa );
-          v1 = vmulq_f32( v1, vkappa );
-
-          vst1q_f32( p_loc + 0,  v0 );
-          vst1q_f32( p_loc + 4,  v1 );
-          p_loc += ldp;
-          a_loc += lda; // 1;
-        }
-      }
-    }
-  }
-  else // if ( cdim0 < mnr || gs )
-  {
-    PASTEMAC(sscal2m,BLIS_TAPI_EX_SUF)
-    (
-      0,
-      BLIS_NONUNIT_DIAG,
-      BLIS_DENSE,
-      ( trans_t )conja,
-      cdim0,
-      k0,
-      kappa,
-      a, inca0, lda0,
-      p,     1, ldp0,
-      cntx,
-      NULL
-    );
-
-    if ( cdim0 < mnr )
-    {
-      // Handle zero-filling along the "long" edge of the micropanel.
-
-      const dim_t     i      = cdim0;
-      const dim_t     m_edge = mnr - cdim0;
-      const dim_t     n_edge = k0_max;
-      float* restrict p_edge = ( float* )p + (i  )*1;
-
-      bli_sset0s_mxn
-      (
-        m_edge,
-        n_edge,
-        p_edge, 1, ldp
-      );
-    }
-  }
-
-  if ( k0 < k0_max )
-  {
-    // Handle zero-filling along the "short" (far) edge of the micropanel.
-
-    const dim_t     j      = k0;
-    const dim_t     m_edge = mnr;
-    const dim_t     n_edge = k0_max - k0;
-    float* restrict p_edge = ( float* )p + (j  )*ldp;
-
-    bli_sset0s_mxn
-    (
-      m_edge,
-      n_edge,
-      p_edge, 1, ldp
-    );
-  }
-}
-
diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
index 665fe9b70..cc711c60f 100644
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
@@ -77,14 +77,14 @@ void bli_sgemm_armv8a_asm_8x12
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
 
 
-	void* a_next = bli_auxinfo_next_a( data );
-	void* b_next = bli_auxinfo_next_b( data );
+	const void* a_next = bli_auxinfo_next_a( data );
+	const void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
@@ -1157,7 +1157,7 @@ void bli_dgemm_armv8a_asm_6x8
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1175,8 +1175,8 @@ void bli_dgemm_armv8a_asm_6x8
 
 #endif
 
-	void* a_next = bli_auxinfo_next_a( data );
-	void* b_next = bli_auxinfo_next_b( data );
+	const void* a_next = bli_auxinfo_next_a( data );
+	const void* b_next = bli_auxinfo_next_b( data );
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c
index f80b03ed6..81d558a94 100644
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c
@@ -44,15 +44,15 @@
 
 /* Order of row-major SGEMM_12x8's execution in 4x5 blocks:
  *
- * +---+ +---+ 
- * | 0 | | 1 | 
- * +---+ +---+ 
- * +---+ +---+ 
- * | 2 | | 3 | 
- * +---+ +---+ 
- * +---+ +---+ 
- * | 4 | | 5 | 
- * +---+ +---+ 
+ * +---+ +---+
+ * | 0 | | 1 |
+ * +---+ +---+
+ * +---+ +---+
+ * | 2 | | 3 |
+ * +---+ +---+
+ * +---+ +---+
+ * | 4 | | 5 |
+ * +---+ +---+
  */
 #define SGEMM_12X8_MKER_LOOP_PLAIN(C00,C01,C10,C11,C20,C21,C30,C31,C40,C41,C50,C51,C60,C61,C70,C71,C80,C81,C90,C91,CA0,CA1,CB0,CB1,A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \
   SGEMM_4X4_NANOKERNEL(C00,C10,C20,C30,B0,A0) \
@@ -142,7 +142,7 @@ void bli_sgemm_armv8a_asm_12x8r
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -387,7 +387,7 @@ void bli_dgemm_armv8a_asm_8x6r
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c
index 8ff5c1754..1ebf306ac 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c
@@ -122,7 +122,7 @@ void bli_dgemmsup_rd_armv8a_inline_3x4m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -164,7 +164,7 @@ void bli_dgemmsup_rd_armv8a_inline_3xcm
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -197,7 +197,7 @@ void bli_dgemmsup_rd_armv8a_asm_6x8m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c
index fc40bd591..d7c70bde2 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c
@@ -115,7 +115,7 @@ void bli_dgemmsup_rd_armv8a_inline_4x8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -155,7 +155,7 @@ void bli_dgemmsup_rd_armv8a_inline_3x8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -196,7 +196,7 @@ void bli_dgemmsup_rd_armv8a_inline_rx8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -229,7 +229,7 @@ void bli_dgemmsup_rd_armv8a_asm_6x8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c
index a8f4f5e12..dc446ca34 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c
@@ -116,7 +116,7 @@ void bli_dgemmsup_rv_armv8a_asm_4x8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c
index 348f750a5..c0b00f5b8 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c
@@ -132,7 +132,7 @@ void bli_dgemmsup_rv_armv8a_asm_5x8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c
index 3d1e8c0a0..8ff4769fb 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c
@@ -132,7 +132,7 @@ void bli_dgemmsup_rv_armv8a_asm_6x5m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c
index a1a945740..a74426298 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c
@@ -122,7 +122,7 @@ void bli_dgemmsup_rv_armv8a_asm_6x6m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c
index 2e00676c4..18751bf5f 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c
@@ -150,7 +150,7 @@ void bli_dgemmsup_rv_armv8a_asm_6x7m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c
index 74fdcf77f..9cd2665d4 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c
@@ -139,7 +139,7 @@ void bli_dgemmsup_rv_armv8a_asm_6x8m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c
index 3dad7dd91..ba672af92 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c
@@ -139,7 +139,7 @@ void bli_dgemmsup_rv_armv8a_asm_6x8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -151,7 +151,7 @@ void bli_dgemmsup_rv_armv8a_asm_6x8n
     gemmsup_ker_ft ker_fp1 = NULL;
     gemmsup_ker_ft ker_fp2 = NULL;
     dim_t          mr1, mr2;
-    
+
     if ( m0 == 9 )
     {
       ker_fp1 = bli_dgemmsup_rv_armv8a_asm_5x8n; mr1 = 5;
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c
index 8376d418a..c0b548d05 100644
--- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c
@@ -110,7 +110,7 @@ void bli_dgemmsup_rv_armv8a_asm_8x4m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c
index 8cefaed4a..fbe7da914 100644
--- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c
+++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c
@@ -94,7 +94,7 @@ void bli_dgemmsup_rd_armv8a_asm_3x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c
index 1919aa694..c8cdb8ecc 100644
--- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c
+++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c
@@ -118,7 +118,7 @@ void bli_dgemmsup_rd_armv8a_asm_6x3
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c
index 331e5bbda..42f38d6c5 100644
--- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c
+++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c
@@ -69,7 +69,7 @@ void bli_dgemmsup_rd_armv8a_int_2x8
        const void*      b, inc_t rs_b, inc_t cs_b,
        const void*      beta,
              void*      c, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c
index 911cb9256..f5792be37 100644
--- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c
+++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c
@@ -69,7 +69,7 @@ void bli_dgemmsup_rd_armv8a_int_3x4
        const void*      b, inc_t rs_b, inc_t cs_b,
        const void*      beta,
              void*      c, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c
index 4c2173092..345f87b03 100644
--- a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c
+++ b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c
@@ -69,7 +69,7 @@ void bli_dgemmsup_rv_armv8a_int_3x8mn
        const void*      b0, inc_t rs_b, inc_t cs_b,
        const void*      beta,
              void*      c0, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c
index 2d259adb3..d60339bd8 100644
--- a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c
+++ b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c
@@ -69,7 +69,7 @@ void bli_dgemmsup_rv_armv8a_int_6x4mn
        const void*      b0, inc_t rs_b, inc_t cs_b,
        const void*      beta,
              void*      c0, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/armv8a/bli_kernels_armv8a.h b/kernels/armv8a/bli_kernels_armv8a.h
index 64a3f2fb5..9bdeb138b 100644
--- a/kernels/armv8a/bli_kernels_armv8a.h
+++ b/kernels/armv8a/bli_kernels_armv8a.h
@@ -32,10 +32,8 @@
 
 */
 
-PACKM_KER_PROT( float,    s, packm_armv8a_int_8xk )
-PACKM_KER_PROT( float,    s, packm_armv8a_int_12xk )
-PACKM_KER_PROT( double,   d, packm_armv8a_int_6xk )
-PACKM_KER_PROT( double,   d, packm_armv8a_int_8xk )
+PACKM_KER_PROT( float,    s, packm_armv8a_int_8x12 )
+PACKM_KER_PROT( double,   d, packm_armv8a_int_6x8 )
 
 GEMM_UKR_PROT( float,    s, gemm_armv8a_asm_8x12 )
 GEMM_UKR_PROT( double,   d, gemm_armv8a_asm_6x8 )
diff --git a/kernels/bgq/1f/bli_axpyf_bgq_int.c b/kernels/bgq/1f/bli_axpyf_bgq_int.c
index f366dbe86..1bf82380a 100644
--- a/kernels/bgq/1f/bli_axpyf_bgq_int.c
+++ b/kernels/bgq/1f/bli_axpyf_bgq_int.c
@@ -110,14 +110,14 @@ void bli_daxpyf_bgq_int
 	double chi6 = *(x + 6*incx);
 	double chi7 = *(x + 7*incx);
 
-	PASTEMAC2(d,d,scals)( *alpha, chi0 );
-	PASTEMAC2(d,d,scals)( *alpha, chi1 );
-	PASTEMAC2(d,d,scals)( *alpha, chi2 );
-	PASTEMAC2(d,d,scals)( *alpha, chi3 );
-	PASTEMAC2(d,d,scals)( *alpha, chi4 );
-	PASTEMAC2(d,d,scals)( *alpha, chi5 );
-	PASTEMAC2(d,d,scals)( *alpha, chi6 );
-	PASTEMAC2(d,d,scals)( *alpha, chi7 );
+	PASTEMAC(d,d,scals)( *alpha, chi0 );
+	PASTEMAC(d,d,scals)( *alpha, chi1 );
+	PASTEMAC(d,d,scals)( *alpha, chi2 );
+	PASTEMAC(d,d,scals)( *alpha, chi3 );
+	PASTEMAC(d,d,scals)( *alpha, chi4 );
+	PASTEMAC(d,d,scals)( *alpha, chi5 );
+	PASTEMAC(d,d,scals)( *alpha, chi6 );
+	PASTEMAC(d,d,scals)( *alpha, chi7 );
 
 	vector4double   a0v, a1v, a2v, a3v, a4v, a5v, a6v, a7v;
     vector4double   yv;
diff --git a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c
index d1dcac3a6..ed7cf2056 100644
--- a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c
+++ b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c
@@ -64,7 +64,7 @@ void bli_dgemm_bgq_int_8x8
        const void*      b0,
        const void*      beta0,
              void*      c0, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -234,7 +234,7 @@ void bli_zgemm_bgq_int_4x4
        const void*      b0,
        const void*      beta0,
              void*      c0, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
index dbdb5ef3b..eda271ae8 100644
--- a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
+++ b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
@@ -98,7 +98,7 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -593,7 +593,7 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -816,7 +816,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1340,7 +1340,7 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c3x8.c
similarity index 60%
rename from kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c
rename to kernels/haswell/1m/bli_packm_haswell_asm_c3x8.c
index 22dfe8e4a..87ddb7957 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_c3x8.c
@@ -38,37 +38,29 @@
 #define BLIS_ASM_SYNTAX_ATT
 #include "bli_x86_asm_macros.h"
 
-// Prototype reference packm kernels.
-PACKM_KER_PROT( scomplex, c, packm_8xk_haswell_ref )
-
-void bli_cpackm_haswell_asm_8xk
+void bli_cpackm_haswell_asm_3x8
      (
              conj_t    conja,
              pack_t    schema,
              dim_t     cdim0,
+             dim_t     cdim_max,
+             dim_t     cdim_bcast,
              dim_t     k0,
              dim_t     k0_max,
        const void*     kappa,
        const void*     a, inc_t inca0, inc_t lda0,
              void*     p,              inc_t ldp0,
+       const void*     params,
        const cntx_t*   cntx
      )
 {
-#if 0
-	bli_cpackm_8xk_haswell_ref
-	(
-	  conja, schema, cdim0, k0, k0_max,
-	  kappa, a, inca0, lda0, p, ldp0, cntx
-	);
-	return;
-#endif
-
 	// This is the panel dimension assumed by the packm kernel.
-	const dim_t      mnr   = 8;
+	const dim_t      mr    = 3;
+	const dim_t      nr    = 8;
 
 	// This is the "packing" dimension assumed by the packm kernel.
 	// This should be equal to ldp.
-	//const dim_t    packmnr = 8;
+	//const dim_t    packmnr = 6;
 
 	// Define a local copy of 1.0 so we can test for unit kappa.
 	float            one_l = 1.0;
@@ -104,7 +96,243 @@ void bli_cpackm_haswell_asm_8xk
 
 	// -------------------------------------------------------------------------
 
-	if ( cdim0 == mnr && !gs && !conja && unitk )
+	if ( cdim0 == mr && ldp0 == mr && cdim_bcast == 1 && !gs && !conja && unitk )
+	{
+		begin_asm()
+
+		mov(var(a), rax)                   // load address of a.
+
+		mov(var(inca), r8)                 // load inca
+		mov(var(lda), r10)                 // load lda
+		lea(mem(, r8,  8), r8)             // inca *= sizeof(scomplex)
+		lea(mem(, r10, 8), r10)            // lda *= sizeof(scomplex)
+
+		mov(var(p), rbx)                   // load address of p.
+
+		lea(mem(   , r10, 4), r14)         // r14 = 4*lda
+
+		mov(var(one), rdx)                 // load address of 1.0 constant
+		vbroadcastss(mem(rdx, 0), ymm1)    // load 1.0 and duplicate
+		vxorps(ymm0, ymm0, ymm0)           // set ymm0 to 0.0.
+
+		mov(var(kappa), rcx)               // load address of kappa
+		vbroadcastss(mem(rcx, 0), ymm10)   // load kappa_r and duplicate
+		vbroadcastss(mem(rcx, 4), ymm11)   // load kappa_i and duplicate
+
+
+										   // now branch on kappa == 1.0
+
+		vucomiss(xmm1, xmm10)              // set ZF if kappa_r == 1.0.
+		sete(r12b)                         // r12b = ( ZF == 1 ? 1 : 0 );
+		vucomiss(xmm0, xmm11)              // set ZF if kappa_i == 0.0.
+		sete(r13b)                         // r13b = ( ZF == 1 ? 1 : 0 );
+		and(r12b, r13b)                    // set ZF if ( r12b & r13b ) == 0.
+		jne(.CKAPPAUNIT)                   // if ZF = 0, jump to kappa == 1.0 case
+
+
+
+		label(.CKAPPANONU)
+
+		cmp(imm(8), r8)                    // set ZF if (8*inca) == 8.
+		jz(.CCOLNONU)                      // jump to column storage case
+
+		// -- kappa non-unit, row storage on A -------------------------------------
+
+		label(.CROWNONU)
+
+		jmp(.CDONE)                        // jump to end.
+
+
+		// -- kappa non-unit, column storage on A ----------------------------------
+
+		label(.CCOLNONU)
+
+		jmp(.CDONE)                        // jump to end.
+
+
+
+
+		label(.CKAPPAUNIT)
+
+		cmp(imm(8), r8)                    // set ZF if (8*inca) == 8.
+		jz(.CCOLUNIT)                      // jump to column storage case
+
+
+		// -- kappa unit, row storage on A -----------------------------------------
+
+		label(.CROWUNIT)
+
+		//lea(mem(r8,  r8,  2), r12)         // r12 = 3*inca
+		//lea(mem(r12, r8,  2), rcx)         // rcx = 5*inca
+		//lea(mem(r12, r8,  4), rdx)         // rdx = 7*inca
+
+		mov(var(k_iter), rsi)              // i = k_iter;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.CCONKLEFTROWU)                 // if i == 0, jump to code that
+		                                   // contains the k_left loop.
+
+
+		label(.CKITERROWU)                 // MAIN LOOP (k_iter)
+
+		vmovupd(mem(rax,         0), ymm0)
+		vmovupd(mem(rax,  r8, 1, 0), ymm2)
+		vmovupd(mem(rax,  r8, 2, 0), ymm4)
+
+		add(r14, rax)                      // a += 4*lda;
+
+		vunpcklpd(ymm2, ymm0, ymm10)
+		vunpckhpd(ymm2, ymm0, ymm11)
+		vunpcklpd(ymm6, ymm4, ymm12)
+		vunpckhpd(ymm6, ymm4, ymm13)
+		vinsertf128(imm(0x1), xmm12, ymm10, ymm0)
+		vinsertf128(imm(0x1), xmm13, ymm11, ymm2)
+		vperm2f128(imm(0x31), ymm12, ymm10, ymm4)
+		vperm2f128(imm(0x31), ymm13, ymm11, ymm6)
+
+		vextractf128(imm(0x1), ymm0, xmm1)
+		vextractf128(imm(0x1), ymm2, xmm3)
+		vextractf128(imm(0x1), ymm4, xmm5)
+		vextractf128(imm(0x1), ymm6, xmm7)
+
+		vmovupd(xmm0, mem(rbx, 0*24))
+		vmovupd(xmm2, mem(rbx, 1*24))
+		vmovupd(xmm4, mem(rbx, 2*24))
+		vmovupd(xmm6, mem(rbx, 3*24))
+
+		vmovsd(xmm1, mem(rbx, 0*24+16))
+		vmovsd(xmm3, mem(rbx, 1*24+16))
+		vmovsd(xmm5, mem(rbx, 2*24+16))
+		vmovsd(xmm7, mem(rbx, 3*24+16))
+
+		add(imm(4*3*8), rbx)               // p += 4*ldp = 4*3;
+
+		dec(rsi)                           // i -= 1;
+		jne(.CKITERROWU)                   // iterate again if i != 0.
+
+
+
+		label(.CCONKLEFTROWU)
+
+		mov(var(k_left), rsi)              // i = k_left;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.CDONE)                         // if i == 0, we're done; jump to end.
+		                                   // else, we prepare to enter k_left loop.
+
+
+		label(.CKLEFTROWU)                 // EDGE LOOP (k_left)
+
+		vmovsd(mem(rax,         0), xmm0)
+		vmovsd(mem(rax,  r8, 1, 0), xmm2)
+		vmovsd(mem(rax,  r8, 2, 0), xmm4)
+
+		add(r10, rax)                      // a += lda;
+
+		vmovsd(xmm0, mem(rbx, 0*8))
+		vmovsd(xmm2, mem(rbx, 1*8))
+		vmovsd(xmm4, mem(rbx, 2*8))
+
+		add(imm(3*8), rbx)                 // p += ldp = 3;
+
+		dec(rsi)                           // i -= 1;
+		jne(.CKLEFTROWU)                   // iterate again if i != 0.
+
+
+		jmp(.CDONE)                        // jump to end.
+
+
+		// -- kappa unit, column storage on A --------------------------------------
+
+		label(.CCOLUNIT)
+
+		lea(mem(r10, r10, 2), r13)         // r13 = 3*lda
+
+		mov(var(k_iter), rsi)              // i = k_iter;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.CCONKLEFTCOLU)                 // if i == 0, jump to code that
+		                                   // contains the k_left loop.
+
+
+		label(.CKITERCOLU)                 // MAIN LOOP (k_iter)
+
+		vmovupd(mem(rax,          0), xmm0)
+		vmovsd( mem(rax,         16), xmm1)
+		vmovupd(xmm0, mem(rbx, 0*24+ 0))
+		vmovsd( xmm1, mem(rbx, 0*24+16))
+
+		vmovupd(mem(rax, r10, 1,  0), xmm2)
+		vmovsd( mem(rax, r10, 1, 16), xmm3)
+		vmovupd(xmm2, mem(rbx, 1*24+ 0))
+		vmovsd( xmm3, mem(rbx, 1*24+16))
+
+		vmovupd(mem(rax, r10, 2,  0), xmm4)
+		vmovsd( mem(rax, r10, 2, 16), xmm5)
+		vmovupd(xmm4, mem(rbx, 2*24+ 0))
+		vmovsd( xmm5, mem(rbx, 2*24+16))
+
+		vmovupd(mem(rax, r13, 1,  0), xmm6)
+		vmovsd( mem(rax, r13, 1, 16), xmm7)
+		add(r14, rax)                      // a += 4*lda;
+		vmovupd(xmm6, mem(rbx, 3*24+ 0))
+		vmovsd( xmm7, mem(rbx, 3*24+16))
+		add(imm(4*3*8), rbx)               // p += 4*ldp = 4*3;
+
+		dec(rsi)                           // i -= 1;
+		jne(.CKITERCOLU)                   // iterate again if i != 0.
+
+
+
+		label(.CCONKLEFTCOLU)
+
+		mov(var(k_left), rsi)              // i = k_left;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.CDONE)                         // if i == 0, we're done; jump to end.
+		                                   // else, we prepare to enter k_left loop.
+
+
+		label(.CKLEFTCOLU)                 // EDGE LOOP (k_left)
+
+		vmovupd(mem(rax,          0), xmm0)
+		vmovsd( mem(rax,         16), xmm1)
+		add(r10, rax)                      // a += lda;
+		vmovupd(xmm0, mem(rbx, 0*24+ 0))
+		vmovsd( xmm1, mem(rbx, 0*24+16))
+		add(imm(3*8), rbx)                 // p += ldp = 3;
+
+		dec(rsi)                           // i -= 1;
+		jne(.CKLEFTCOLU)                   // iterate again if i != 0.
+
+
+		//jmp(.CDONE)                        // jump to end.
+
+
+
+		label(.CDONE)
+
+
+
+		end_asm(
+		: // output operands (none)
+		: // input operands
+		  [k_iter] "m" (k_iter),
+		  [k_left] "m" (k_left),
+		  [a]      "m" (a),
+		  [inca]   "m" (inca),
+		  [lda]    "m" (lda),
+		  [p]      "m" (p),
+		  [ldp]    "m" (ldp),
+		  [kappa]  "m" (kappa),
+		  [one]    "m" (one)
+		: // register clobber list
+		  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+		  "r8", /*"r9",*/ "r10", /*"r11",*/ "r12", "r13", "r14", "r15",
+		  "xmm0", "xmm1", "xmm2", "xmm3",
+		  "xmm4", "xmm5", "xmm6", "xmm7",
+		  "xmm8", "xmm9", "xmm10", "xmm11",
+		  "xmm12", "xmm13", "xmm14", "xmm15",
+		  "memory"
+		)
+	}
+	else if ( cdim0 == nr && ldp0 == nr && cdim_bcast == 1 && !gs && !conja && unitk )
 	{
 		begin_asm()
 
@@ -360,56 +588,24 @@ void bli_cpackm_haswell_asm_8xk
 		  "memory"
 		)
 	}
-	else // if ( cdim0 < mnr || gs || bli_does_conj( conja ) || !unitk )
+	else
 	{
-		PASTEMAC(cscal2m,BLIS_TAPI_EX_SUF)
+		bli_cscal2bbs_mxn
 		(
-		  0,
-		  BLIS_NONUNIT_DIAG,
-		  BLIS_DENSE,
-		  ( trans_t )conja,
+		  conja,
 		  cdim0,
 		  k0,
 		  kappa,
-		  a, inca0, lda0,
-		  p,     1, ldp0,
-		  cntx,
-		  NULL
+		  a,       inca, lda,
+		  p, cdim_bcast, ldp
 		);
-
-		if ( cdim0 < mnr )
-		{
-			// Handle zero-filling along the "long" edge of the micropanel.
-
-			const dim_t        i      = cdim0;
-			const dim_t        m_edge = mnr - cdim0;
-			const dim_t        n_edge = k0_max;
-			scomplex* restrict p_edge = ( scomplex* )p + (i  )*1;
-
-			bli_cset0s_mxn
-			(
-			  m_edge,
-			  n_edge,
-			  p_edge, 1, ldp
-			);
-		}
 	}
 
-	if ( k0 < k0_max )
-	{
-		// Handle zero-filling along the "short" (far) edge of the micropanel.
-
-		const dim_t        j      = k0;
-		const dim_t        m_edge = mnr;
-		const dim_t        n_edge = k0_max - k0;
-		scomplex* restrict p_edge = ( scomplex* )p + (j  )*ldp;
-
-		bli_cset0s_mxn
-		(
-		  m_edge,
-		  n_edge,
-		  p_edge, 1, ldp
-		);
-	}
+	bli_cset0s_edge
+	(
+	  cdim0*cdim_bcast, cdim_max*cdim_bcast,
+	  k0, k0_max,
+	  p, ldp
+	);
 }
 
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c
deleted file mode 100644
index b23fc2497..000000000
--- a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c
+++ /dev/null
@@ -1,397 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-// Prototype reference packm kernels.
-PACKM_KER_PROT( scomplex, c, packm_3xk_haswell_ref )
-
-void bli_cpackm_haswell_asm_3xk
-     (
-             conj_t    conja,
-             pack_t    schema,
-             dim_t     cdim0,
-             dim_t     k0,
-             dim_t     k0_max,
-       const void*     kappa,
-       const void*     a, inc_t inca0, inc_t lda0,
-             void*     p,              inc_t ldp0,
-       const cntx_t*   cntx
-     )
-{
-#if 0
-	bli_cpackm_3xk_haswell_ref
-	(
-	  conja, schema, cdim0, k0, k0_max,
-	  kappa, a, inca0, lda0, p, ldp0, cntx
-	);
-	return;
-#endif
-
-	// This is the panel dimension assumed by the packm kernel.
-	const dim_t      mnr   = 3;
-
-	// This is the "packing" dimension assumed by the packm kernel.
-	// This should be equal to ldp.
-	//const dim_t    packmnr = 6;
-
-	// Define a local copy of 1.0 so we can test for unit kappa.
-	float            one_l = 1.0;
-	float*  restrict one   = &one_l;
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	const uint64_t k_iter = k0 / 4;
-#if 1
-	const uint64_t k_left = k0 % 4;
-#else
-	const uint64_t k_left = k0;
-#endif
-
-	// NOTE: For the purposes of the comments in this packm kernel, we
-	// interpret inca and lda as rs_a and cs_a, respectively, and similarly
-	// interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
-	// this packm kernel, you should think of the operation as packing an
-	// m x n micropanel, where m and n are tiny and large, respectively, and
-	// where elements of each column of the packed matrix P are contiguous.
-	// (This packm kernel can still be used to pack micropanels of matrix B
-	// in a gemm operation.)
-	const uint64_t inca   = inca0;
-	const uint64_t lda    = lda0;
-	const uint64_t ldp    = ldp0;
-
-	const bool     gs     = ( inca0 != 1 && lda0 != 1 );
-
-	// NOTE: If/when this kernel ever supports scaling by kappa within the
-	// assembly region, this constraint should be lifted.
-	const bool     unitk  = bli_ceq1( *(( scomplex* )kappa) );
-
-
-	// -------------------------------------------------------------------------
-
-	if ( cdim0 == mnr && !gs && !conja && unitk )
-	{
-		begin_asm()
-
-		mov(var(a), rax)                   // load address of a.
-
-		mov(var(inca), r8)                 // load inca
-		mov(var(lda), r10)                 // load lda
-		lea(mem(, r8,  8), r8)             // inca *= sizeof(scomplex)
-		lea(mem(, r10, 8), r10)            // lda *= sizeof(scomplex)
-
-		mov(var(p), rbx)                   // load address of p.
-
-		lea(mem(   , r10, 4), r14)         // r14 = 4*lda
-
-		mov(var(one), rdx)                 // load address of 1.0 constant
-		vbroadcastss(mem(rdx, 0), ymm1)    // load 1.0 and duplicate
-		vxorps(ymm0, ymm0, ymm0)           // set ymm0 to 0.0.
-
-		mov(var(kappa), rcx)               // load address of kappa
-		vbroadcastss(mem(rcx, 0), ymm10)   // load kappa_r and duplicate
-		vbroadcastss(mem(rcx, 4), ymm11)   // load kappa_i and duplicate
-
-
-										   // now branch on kappa == 1.0
-
-		vucomiss(xmm1, xmm10)              // set ZF if kappa_r == 1.0.
-		sete(r12b)                         // r12b = ( ZF == 1 ? 1 : 0 );
-		vucomiss(xmm0, xmm11)              // set ZF if kappa_i == 0.0.
-		sete(r13b)                         // r13b = ( ZF == 1 ? 1 : 0 );
-		and(r12b, r13b)                    // set ZF if r12b & r13b == 1.
-		jne(.CKAPPAUNIT)                   // if ZF = 1, jump to beta == 0 case
-
-
-
-		label(.CKAPPANONU)
-
-		cmp(imm(8), r8)                    // set ZF if (8*inca) == 8.
-		jz(.CCOLNONU)                      // jump to column storage case
-
-		// -- kappa non-unit, row storage on A -------------------------------------
-
-		label(.CROWNONU)
-
-		jmp(.CDONE)                        // jump to end.
-
-
-		// -- kappa non-unit, column storage on A ----------------------------------
-
-		label(.CCOLNONU)
-
-		jmp(.CDONE)                        // jump to end.
-
-
-
-
-		label(.CKAPPAUNIT)
-
-		cmp(imm(8), r8)                    // set ZF if (8*inca) == 8.
-		jz(.CCOLUNIT)                      // jump to column storage case
-
-
-		// -- kappa unit, row storage on A -----------------------------------------
-
-		label(.CROWUNIT)
-
-		//lea(mem(r8,  r8,  2), r12)         // r12 = 3*inca
-		//lea(mem(r12, r8,  2), rcx)         // rcx = 5*inca
-		//lea(mem(r12, r8,  4), rdx)         // rdx = 7*inca
-
-		mov(var(k_iter), rsi)              // i = k_iter;
-		test(rsi, rsi)                     // check i via logical AND.
-		je(.CCONKLEFTROWU)                 // if i == 0, jump to code that
-		                                   // contains the k_left loop.
-
-
-		label(.CKITERROWU)                 // MAIN LOOP (k_iter)
-
-		vmovupd(mem(rax,         0), ymm0)
-		vmovupd(mem(rax,  r8, 1, 0), ymm2)
-		vmovupd(mem(rax,  r8, 2, 0), ymm4)
-
-		add(r14, rax)                      // a += 4*lda;
-
-		vunpcklpd(ymm2, ymm0, ymm10)
-		vunpckhpd(ymm2, ymm0, ymm11)
-		vunpcklpd(ymm6, ymm4, ymm12)
-		vunpckhpd(ymm6, ymm4, ymm13)
-		vinsertf128(imm(0x1), xmm12, ymm10, ymm0)
-		vinsertf128(imm(0x1), xmm13, ymm11, ymm2)
-		vperm2f128(imm(0x31), ymm12, ymm10, ymm4)
-		vperm2f128(imm(0x31), ymm13, ymm11, ymm6)
-
-		vextractf128(imm(0x1), ymm0, xmm1)
-		vextractf128(imm(0x1), ymm2, xmm3)
-		vextractf128(imm(0x1), ymm4, xmm5)
-		vextractf128(imm(0x1), ymm6, xmm7)
-
-		vmovupd(xmm0, mem(rbx, 0*24))
-		vmovupd(xmm2, mem(rbx, 1*24))
-		vmovupd(xmm4, mem(rbx, 2*24))
-		vmovupd(xmm6, mem(rbx, 3*24))
-
-		vmovsd(xmm1, mem(rbx, 0*24+16))
-		vmovsd(xmm3, mem(rbx, 1*24+16))
-		vmovsd(xmm5, mem(rbx, 2*24+16))
-		vmovsd(xmm7, mem(rbx, 3*24+16))
-
-		add(imm(4*3*8), rbx)               // p += 4*ldp = 4*3;
-
-		dec(rsi)                           // i -= 1;
-		jne(.CKITERROWU)                   // iterate again if i != 0.
-
-
-
-		label(.CCONKLEFTROWU)
-
-		mov(var(k_left), rsi)              // i = k_left;
-		test(rsi, rsi)                     // check i via logical AND.
-		je(.CDONE)                         // if i == 0, we're done; jump to end.
-		                                   // else, we prepare to enter k_left loop.
-
-
-		label(.CKLEFTROWU)                 // EDGE LOOP (k_left)
-
-		vmovsd(mem(rax,         0), xmm0)
-		vmovsd(mem(rax,  r8, 1, 0), xmm2)
-		vmovsd(mem(rax,  r8, 2, 0), xmm4)
-
-		add(r10, rax)                      // a += lda;
-
-		vmovsd(xmm0, mem(rbx, 0*8))
-		vmovsd(xmm2, mem(rbx, 1*8))
-		vmovsd(xmm4, mem(rbx, 2*8))
-
-		add(imm(3*8), rbx)                 // p += ldp = 3;
-
-		dec(rsi)                           // i -= 1;
-		jne(.CKLEFTROWU)                   // iterate again if i != 0.
-
-
-		jmp(.CDONE)                        // jump to end.
-
-
-		// -- kappa unit, column storage on A --------------------------------------
-
-		label(.CCOLUNIT)
-
-		lea(mem(r10, r10, 2), r13)         // r13 = 3*lda
-
-		mov(var(k_iter), rsi)              // i = k_iter;
-		test(rsi, rsi)                     // check i via logical AND.
-		je(.CCONKLEFTCOLU)                 // if i == 0, jump to code that
-		                                   // contains the k_left loop.
-
-
-		label(.CKITERCOLU)                 // MAIN LOOP (k_iter)
-
-		vmovupd(mem(rax,          0), xmm0)
-		vmovsd( mem(rax,         16), xmm1)
-		vmovupd(xmm0, mem(rbx, 0*24+ 0))
-		vmovsd( xmm1, mem(rbx, 0*24+16))
-
-		vmovupd(mem(rax, r10, 1,  0), xmm2)
-		vmovsd( mem(rax, r10, 1, 16), xmm3)
-		vmovupd(xmm2, mem(rbx, 1*24+ 0))
-		vmovsd( xmm3, mem(rbx, 1*24+16))
-
-		vmovupd(mem(rax, r10, 2,  0), xmm4)
-		vmovsd( mem(rax, r10, 2, 16), xmm5)
-		vmovupd(xmm4, mem(rbx, 2*24+ 0))
-		vmovsd( xmm5, mem(rbx, 2*24+16))
-
-		vmovupd(mem(rax, r13, 1,  0), xmm6)
-		vmovsd( mem(rax, r13, 1, 16), xmm7)
-		add(r14, rax)                      // a += 4*lda;
-		vmovupd(xmm6, mem(rbx, 3*24+ 0))
-		vmovsd( xmm7, mem(rbx, 3*24+16))
-		add(imm(4*3*8), rbx)               // p += 4*ldp = 4*3;
-
-		dec(rsi)                           // i -= 1;
-		jne(.CKITERCOLU)                   // iterate again if i != 0.
-
-
-
-		label(.CCONKLEFTCOLU)
-
-		mov(var(k_left), rsi)              // i = k_left;
-		test(rsi, rsi)                     // check i via logical AND.
-		je(.CDONE)                         // if i == 0, we're done; jump to end.
-		                                   // else, we prepare to enter k_left loop.
-
-
-		label(.CKLEFTCOLU)                 // EDGE LOOP (k_left)
-
-		vmovupd(mem(rax,          0), xmm0)
-		vmovsd( mem(rax,         16), xmm1)
-		add(r10, rax)                      // a += lda;
-		vmovupd(xmm0, mem(rbx, 0*24+ 0))
-		vmovsd( xmm1, mem(rbx, 0*24+16))
-		add(imm(3*8), rbx)                 // p += ldp = 3;
-
-		dec(rsi)                           // i -= 1;
-		jne(.CKLEFTCOLU)                   // iterate again if i != 0.
-
-
-		//jmp(.CDONE)                        // jump to end.
-
-
-
-		label(.CDONE)
-
-
-
-		end_asm(
-		: // output operands (none)
-		: // input operands
-		  [k_iter] "m" (k_iter),
-		  [k_left] "m" (k_left),
-		  [a]      "m" (a),
-		  [inca]   "m" (inca),
-		  [lda]    "m" (lda),
-		  [p]      "m" (p),
-		  [ldp]    "m" (ldp),
-		  [kappa]  "m" (kappa),
-		  [one]    "m" (one)
-		: // register clobber list
-		  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-		  "r8", /*"r9",*/ "r10", /*"r11",*/ "r12", "r13", "r14", "r15",
-		  "xmm0", "xmm1", "xmm2", "xmm3",
-		  "xmm4", "xmm5", "xmm6", "xmm7",
-		  "xmm8", "xmm9", "xmm10", "xmm11",
-		  "xmm12", "xmm13", "xmm14", "xmm15",
-		  "memory"
-		)
-	}
-	else // if ( cdim0 < mnr || gs || bli_does_conj( conja ) || !unitk )
-	{
-		PASTEMAC(cscal2m,BLIS_TAPI_EX_SUF)
-		(
-		  0,
-		  BLIS_NONUNIT_DIAG,
-		  BLIS_DENSE,
-		  ( trans_t )conja,
-		  cdim0,
-		  k0,
-		  kappa,
-		  a, inca0, lda0,
-		  p,     1, ldp0,
-		  cntx,
-		  NULL
-		);
-
-		if ( cdim0 < mnr )
-		{
-			// Handle zero-filling along the "long" edge of the micropanel.
-
-			const dim_t        i      = cdim0;
-			const dim_t        m_edge = mnr - cdim0;
-			const dim_t        n_edge = k0_max;
-			scomplex* restrict p_edge = ( scomplex* )p + (i  )*1;
-
-			bli_cset0s_mxn
-			(
-			  m_edge,
-			  n_edge,
-			  p_edge, 1, ldp
-			);
-		}
-	}
-
-//bli_dfprintm( stdout, "packm 6xk ker: a_packed", cdim0, k0_max, p, 1, ldp0, "%5.2f", "" );
-
-	if ( k0 < k0_max )
-	{
-		// Handle zero-filling along the "short" (far) edge of the micropanel.
-
-		const dim_t        j      = k0;
-		const dim_t        m_edge = mnr;
-		const dim_t        n_edge = k0_max - k0;
-		scomplex* restrict p_edge = ( scomplex* )p + (j  )*ldp;
-
-		bli_cset0s_mxn
-		(
-		  m_edge,
-		  n_edge,
-		  p_edge, 1, ldp
-		);
-	}
-}
-
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d6x8.c
similarity index 59%
rename from kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c
rename to kernels/haswell/1m/bli_packm_haswell_asm_d6x8.c
index 94de87d97..ef6d66987 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_d6x8.c
@@ -38,37 +38,29 @@
 #define BLIS_ASM_SYNTAX_ATT
 #include "bli_x86_asm_macros.h"
 
-// Prototype reference packm kernels.
-PACKM_KER_PROT( double,   d, packm_8xk_haswell_ref )
-
-void bli_dpackm_haswell_asm_8xk
+void bli_dpackm_haswell_asm_6x8
      (
              conj_t  conja,
              pack_t  schema,
              dim_t   cdim0,
+             dim_t   cdim_max,
+             dim_t   cdim_bcast,
              dim_t   k0,
              dim_t   k0_max,
        const void*   kappa,
        const void*   a, inc_t inca0, inc_t lda0,
              void*   p,              inc_t ldp0,
+       const void*   params,
        const cntx_t* cntx
      )
 {
-#if 0
-	bli_dpackm_8xk_haswell_ref
-	(
-	  conja, schema, cdim0, k0, k0_max,
-	  kappa, a, inca0, lda0, p, ldp0, cntx
-	);
-	return;
-#endif
-
 	// This is the panel dimension assumed by the packm kernel.
-	const dim_t      mnr   = 8;
+	const dim_t      mr    = 6;
+	const dim_t      nr    = 8;
 
 	// This is the "packing" dimension assumed by the packm kernel.
 	// This should be equal to ldp.
-	//const dim_t    packmnr = 8;
+	//const dim_t    packmnr = 6;
 
 	// Define a local copy of 1.0 so we can test for unit kappa.
 	double           one_l = 1.0;
@@ -104,7 +96,247 @@ void bli_dpackm_haswell_asm_8xk
 
 	// -------------------------------------------------------------------------
 
-	if ( cdim0 == mnr && !gs && unitk )
+	if ( cdim0 == mr && ldp0 == mr && cdim_bcast == 1 && !gs && unitk )
+	{
+		begin_asm()
+
+		mov(var(a), rax)                   // load address of a.
+
+		mov(var(inca), r8)                 // load inca
+		mov(var(lda), r10)                 // load lda
+		lea(mem(, r8,  8), r8)             // inca *= sizeof(double)
+		lea(mem(, r10, 8), r10)            // lda *= sizeof(double)
+
+		mov(var(p), rbx)                   // load address of p.
+
+		lea(mem(   , r10, 4), r14)         // r14 = 4*lda
+
+		mov(var(one), rdx)                 // load address of 1.0 constant
+		vmovsd(mem(rdx), xmm1)             // load 1.0
+
+		mov(var(kappa), rcx)               // load address of kappa
+		vmovsd(mem(rcx), xmm0)             // load kappa
+
+
+										   // now branch on kappa == 1.0
+
+		vucomisd(xmm0, xmm1)               // set ZF if kappa == 1.0
+		je(.DKAPPAUNIT)                    // if ZF = 1, jump to kappa == 1.0 case
+
+
+
+		label(.DKAPPANONU)
+
+		cmp(imm(8), r8)                    // set ZF if (8*inca) == 8.
+		jz(.DCOLNONU)                      // jump to column storage case
+
+		// -- kappa non-unit, row storage on A -------------------------------------
+
+		label(.DROWNONU)
+
+		jmp(.DDONE)                        // jump to end.
+
+
+		// -- kappa non-unit, column storage on A ----------------------------------
+
+		label(.DCOLNONU)
+
+		jmp(.DDONE)                        // jump to end.
+
+
+
+
+		label(.DKAPPAUNIT)
+
+		cmp(imm(8), r8)                    // set ZF if (8*inca) == 8.
+		jz(.DCOLUNIT)                      // jump to column storage case
+
+
+		// -- kappa unit, row storage on A -----------------------------------------
+
+		label(.DROWUNIT)
+
+		lea(mem(r8,  r8,  2), r12)         // r12 = 3*inca
+		lea(mem(r12, r8,  2), rcx)         // rcx = 5*inca
+		//lea(mem(r12, r8,  4), rdx)         // rdx = 7*inca
+
+		mov(var(k_iter), rsi)              // i = k_iter;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.DCONKLEFTROWU)                 // if i == 0, jump to code that
+		                                   // contains the k_left loop.
+
+
+		label(.DKITERROWU)                 // MAIN LOOP (k_iter)
+
+		vmovupd(mem(rax,         0), ymm0)
+		vmovupd(mem(rax,  r8, 1, 0), ymm2)
+		vmovupd(mem(rax,  r8, 2, 0), ymm4)
+		vmovupd(mem(rax, r12, 1, 0), ymm6)
+
+		vunpcklpd(ymm2, ymm0, ymm10)
+		vunpckhpd(ymm2, ymm0, ymm11)
+		vunpcklpd(ymm6, ymm4, ymm12)
+		vunpckhpd(ymm6, ymm4, ymm13)
+		vinsertf128(imm(0x1), xmm12, ymm10, ymm0)
+		vinsertf128(imm(0x1), xmm13, ymm11, ymm2)
+		vperm2f128(imm(0x31), ymm12, ymm10, ymm4)
+		vperm2f128(imm(0x31), ymm13, ymm11, ymm6)
+
+		vmovupd(ymm0, mem(rbx, 0*48))
+		vmovupd(ymm2, mem(rbx, 1*48))
+		vmovupd(ymm4, mem(rbx, 2*48))
+		vmovupd(ymm6, mem(rbx, 3*48))
+
+		vmovupd(mem(rax,  r8, 4, 0), ymm1)
+		vmovupd(mem(rax, rcx, 1, 0), ymm3)
+
+		add(r14, rax)                      // a += 4*lda;
+
+		vunpcklpd(ymm3, ymm1, ymm10)
+		vunpckhpd(ymm3, ymm1, ymm11)
+		vextractf128(imm(0x1), ymm10, xmm12)
+		vextractf128(imm(0x1), ymm11, xmm13)
+
+		vmovupd(xmm10, mem(rbx, 0*48+32))
+		vmovupd(xmm11, mem(rbx, 1*48+32))
+		vmovupd(xmm12, mem(rbx, 2*48+32))
+		vmovupd(xmm13, mem(rbx, 3*48+32))
+
+		add(imm(4*6*8), rbx)               // p += 4*ldp = 4*6;
+
+		dec(rsi)                           // i -= 1;
+		jne(.DKITERROWU)                   // iterate again if i != 0.
+
+
+
+		label(.DCONKLEFTROWU)
+
+		mov(var(k_left), rsi)              // i = k_left;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.DDONE)                         // if i == 0, we're done; jump to end.
+		                                   // else, we prepare to enter k_left loop.
+
+
+		label(.DKLEFTROWU)                 // EDGE LOOP (k_left)
+
+		vmovsd(mem(rax,         0), xmm0)
+		vmovsd(mem(rax,  r8, 1, 0), xmm2)
+		vmovsd(mem(rax,  r8, 2, 0), xmm4)
+		vmovsd(mem(rax, r12, 1, 0), xmm6)
+		vmovsd(mem(rax,  r8, 4, 0), xmm1)
+		vmovsd(mem(rax, rcx, 1, 0), xmm3)
+
+		add(r10, rax)                      // a += lda;
+
+		vmovsd(xmm0, mem(rbx, 0*8))
+		vmovsd(xmm2, mem(rbx, 1*8))
+		vmovsd(xmm4, mem(rbx, 2*8))
+		vmovsd(xmm6, mem(rbx, 3*8))
+		vmovsd(xmm1, mem(rbx, 4*8))
+		vmovsd(xmm3, mem(rbx, 5*8))
+
+		add(imm(6*8), rbx)                 // p += ldp = 6;
+
+		dec(rsi)                           // i -= 1;
+		jne(.DKLEFTROWU)                   // iterate again if i != 0.
+
+
+		jmp(.DDONE)                        // jump to end.
+
+
+		// -- kappa unit, column storage on A --------------------------------------
+
+		label(.DCOLUNIT)
+
+		lea(mem(r10, r10, 2), r13)         // r13 = 3*lda
+
+		mov(var(k_iter), rsi)              // i = k_iter;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.DCONKLEFTCOLU)                 // if i == 0, jump to code that
+		                                   // contains the k_left loop.
+
+
+		label(.DKITERCOLU)                 // MAIN LOOP (k_iter)
+
+		vmovupd(mem(rax,          0), ymm0)
+		vmovupd(mem(rax,         32), xmm1)
+		vmovupd(ymm0, mem(rbx, 0*48+ 0))
+		vmovupd(xmm1, mem(rbx, 0*48+32))
+
+		vmovupd(mem(rax, r10, 1,  0), ymm2)
+		vmovupd(mem(rax, r10, 1, 32), xmm3)
+		vmovupd(ymm2, mem(rbx, 1*48+ 0))
+		vmovupd(xmm3, mem(rbx, 1*48+32))
+
+		vmovupd(mem(rax, r10, 2,  0), ymm4)
+		vmovupd(mem(rax, r10, 2, 32), xmm5)
+		vmovupd(ymm4, mem(rbx, 2*48+ 0))
+		vmovupd(xmm5, mem(rbx, 2*48+32))
+
+		vmovupd(mem(rax, r13, 1,  0), ymm6)
+		vmovupd(mem(rax, r13, 1, 32), xmm7)
+		add(r14, rax)                      // a += 4*lda;
+		vmovupd(ymm6, mem(rbx, 3*48+ 0))
+		vmovupd(xmm7, mem(rbx, 3*48+32))
+		add(imm(4*6*8), rbx)               // p += 4*ldp = 4*6;
+
+		dec(rsi)                           // i -= 1;
+		jne(.DKITERCOLU)                   // iterate again if i != 0.
+
+
+
+		label(.DCONKLEFTCOLU)
+
+		mov(var(k_left), rsi)              // i = k_left;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.DDONE)                         // if i == 0, we're done; jump to end.
+		                                   // else, we prepare to enter k_left loop.
+
+
+		label(.DKLEFTCOLU)                 // EDGE LOOP (k_left)
+
+		vmovupd(mem(rax,          0), ymm0)
+		vmovupd(mem(rax,         32), xmm1)
+		add(r10, rax)                      // a += lda;
+		vmovupd(ymm0, mem(rbx, 0*48+ 0))
+		vmovupd(xmm1, mem(rbx, 0*48+32))
+		add(imm(6*8), rbx)                 // p += ldp = 6;
+
+		dec(rsi)                           // i -= 1;
+		jne(.DKLEFTCOLU)                   // iterate again if i != 0.
+
+
+		//jmp(.DDONE)                        // jump to end.
+
+
+
+		label(.DDONE)
+
+
+
+		end_asm(
+		: // output operands (none)
+		: // input operands
+		  [k_iter] "m" (k_iter),
+		  [k_left] "m" (k_left),
+		  [a]      "m" (a),
+		  [inca]   "m" (inca),
+		  [lda]    "m" (lda),
+		  [p]      "m" (p),
+		  [ldp]    "m" (ldp),
+		  [kappa]  "m" (kappa),
+		  [one]    "m" (one)
+		: // register clobber list
+		  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+		  "r8", /*"r9",*/ "r10", /*"r11",*/ "r12", "r13", "r14", "r15",
+		  "xmm0", "xmm1", "xmm2", "xmm3",
+		  "xmm4", "xmm5", "xmm6", "xmm7",
+		  "xmm8", "xmm9", "xmm10", "xmm11",
+		  "xmm12", "xmm13", "xmm14", "xmm15",
+		  "memory"
+		)
+	}
+	else if ( cdim0 == nr && ldp0 == nr && cdim_bcast == 1 && !gs && unitk )
 	{
 		begin_asm()
 
@@ -354,56 +586,24 @@ void bli_dpackm_haswell_asm_8xk
 		  "memory"
 		)
 	}
-	else // if ( cdim0 < mnr || gs || !unitk )
+	else
 	{
-		PASTEMAC(dscal2m,BLIS_TAPI_EX_SUF)
+		bli_dscal2bbs_mxn
 		(
-		  0,
-		  BLIS_NONUNIT_DIAG,
-		  BLIS_DENSE,
-		  ( trans_t )conja,
+		  conja,
 		  cdim0,
 		  k0,
 		  kappa,
-		  a, inca0, lda0,
-		  p,     1, ldp0,
-		  cntx,
-		  NULL
+		  a,       inca, lda,
+		  p, cdim_bcast, ldp
 		);
-
-		if ( cdim0 < mnr )
-		{
-			// Handle zero-filling along the "long" edge of the micropanel.
-
-			const dim_t      i      = cdim0;
-			const dim_t      m_edge = mnr - cdim0;
-			const dim_t      n_edge = k0_max;
-			double* restrict p_edge = ( double* )p + (i  )*1;
-
-			bli_dset0s_mxn
-			(
-			  m_edge,
-			  n_edge,
-			  p_edge, 1, ldp
-			);
-		}
 	}
 
-	if ( k0 < k0_max )
-	{
-		// Handle zero-filling along the "short" (far) edge of the micropanel.
-
-		const dim_t      j      = k0;
-		const dim_t      m_edge = mnr;
-		const dim_t      n_edge = k0_max - k0;
-		double* restrict p_edge = ( double* )p + (j  )*ldp;
-
-		bli_dset0s_mxn
-		(
-		  m_edge,
-		  n_edge,
-		  p_edge, 1, ldp
-		);
-	}
+	bli_dset0s_edge
+	(
+	  cdim0*cdim_bcast, cdim_max*cdim_bcast,
+	  k0, k0_max,
+	  p, ldp
+	);
 }
 
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c
deleted file mode 100644
index 7722b5ef4..000000000
--- a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c
+++ /dev/null
@@ -1,401 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-// Prototype reference packm kernels.
-PACKM_KER_PROT( double,   d, packm_6xk_haswell_ref )
-
-void bli_dpackm_haswell_asm_6xk
-     (
-             conj_t  conja,
-             pack_t  schema,
-             dim_t   cdim0,
-             dim_t   k0,
-             dim_t   k0_max,
-       const void*   kappa,
-       const void*   a, inc_t inca0, inc_t lda0,
-             void*   p,              inc_t ldp0,
-       const cntx_t* cntx
-     )
-{
-#if 0
-	bli_dpackm_6xk_haswell_ref
-	(
-	  conja, schema, cdim0, k0, k0_max,
-	  kappa, a, inca0, lda0, p, ldp0, cntx
-	);
-	return;
-#endif
-
-	// This is the panel dimension assumed by the packm kernel.
-	const dim_t      mnr   = 6;
-
-	// This is the "packing" dimension assumed by the packm kernel.
-	// This should be equal to ldp.
-	//const dim_t    packmnr = 6;
-
-	// Define a local copy of 1.0 so we can test for unit kappa.
-	double           one_l = 1.0;
-	double* restrict one   = &one_l;
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	const uint64_t k_iter = k0 / 4;
-#if 1
-	const uint64_t k_left = k0 % 4;
-#else
-	const uint64_t k_left = k0;
-#endif
-
-	// NOTE: For the purposes of the comments in this packm kernel, we
-	// interpret inca and lda as rs_a and cs_a, respectively, and similarly
-	// interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
-	// this packm kernel, you should think of the operation as packing an
-	// m x n micropanel, where m and n are tiny and large, respectively, and
-	// where elements of each column of the packed matrix P are contiguous.
-	// (This packm kernel can still be used to pack micropanels of matrix B
-	// in a gemm operation.)
-	const uint64_t inca   = inca0;
-	const uint64_t lda    = lda0;
-	const uint64_t ldp    = ldp0;
-
-	const bool     gs     = ( inca0 != 1 && lda0 != 1 );
-
-	// NOTE: If/when this kernel ever supports scaling by kappa within the
-	// assembly region, this constraint should be lifted.
-	const bool     unitk  = bli_deq1( *(( double* )kappa) );
-
-
-	// -------------------------------------------------------------------------
-
-	if ( cdim0 == mnr && !gs && unitk )
-	{
-		begin_asm()
-
-		mov(var(a), rax)                   // load address of a.
-
-		mov(var(inca), r8)                 // load inca
-		mov(var(lda), r10)                 // load lda
-		lea(mem(, r8,  8), r8)             // inca *= sizeof(double)
-		lea(mem(, r10, 8), r10)            // lda *= sizeof(double)
-
-		mov(var(p), rbx)                   // load address of p.
-
-		lea(mem(   , r10, 4), r14)         // r14 = 4*lda
-
-		mov(var(one), rdx)                 // load address of 1.0 constant
-		vmovsd(mem(rdx), xmm1)             // load 1.0
-
-		mov(var(kappa), rcx)               // load address of kappa
-		vmovsd(mem(rcx), xmm0)             // load kappa
-
-
-										   // now branch on kappa == 1.0
-
-		vucomisd(xmm0, xmm1)               // set ZF if kappa == 1.0
-		je(.DKAPPAUNIT)                    // if ZF = 1, jump to beta == 0 case
-
-
-
-		label(.DKAPPANONU)
-
-		cmp(imm(8), r8)                    // set ZF if (8*inca) == 8.
-		jz(.DCOLNONU)                      // jump to column storage case
-
-		// -- kappa non-unit, row storage on A -------------------------------------
-
-		label(.DROWNONU)
-
-		jmp(.DDONE)                        // jump to end.
-
-
-		// -- kappa non-unit, column storage on A ----------------------------------
-
-		label(.DCOLNONU)
-
-		jmp(.DDONE)                        // jump to end.
-
-
-
-
-		label(.DKAPPAUNIT)
-
-		cmp(imm(8), r8)                    // set ZF if (8*inca) == 8.
-		jz(.DCOLUNIT)                      // jump to column storage case
-
-
-		// -- kappa unit, row storage on A -----------------------------------------
-
-		label(.DROWUNIT)
-
-		lea(mem(r8,  r8,  2), r12)         // r12 = 3*inca
-		lea(mem(r12, r8,  2), rcx)         // rcx = 5*inca
-		//lea(mem(r12, r8,  4), rdx)         // rdx = 7*inca
-
-		mov(var(k_iter), rsi)              // i = k_iter;
-		test(rsi, rsi)                     // check i via logical AND.
-		je(.DCONKLEFTROWU)                 // if i == 0, jump to code that
-		                                   // contains the k_left loop.
-
-
-		label(.DKITERROWU)                 // MAIN LOOP (k_iter)
-
-		vmovupd(mem(rax,         0), ymm0)
-		vmovupd(mem(rax,  r8, 1, 0), ymm2)
-		vmovupd(mem(rax,  r8, 2, 0), ymm4)
-		vmovupd(mem(rax, r12, 1, 0), ymm6)
-
-		vunpcklpd(ymm2, ymm0, ymm10)
-		vunpckhpd(ymm2, ymm0, ymm11)
-		vunpcklpd(ymm6, ymm4, ymm12)
-		vunpckhpd(ymm6, ymm4, ymm13)
-		vinsertf128(imm(0x1), xmm12, ymm10, ymm0)
-		vinsertf128(imm(0x1), xmm13, ymm11, ymm2)
-		vperm2f128(imm(0x31), ymm12, ymm10, ymm4)
-		vperm2f128(imm(0x31), ymm13, ymm11, ymm6)
-
-		vmovupd(ymm0, mem(rbx, 0*48))
-		vmovupd(ymm2, mem(rbx, 1*48))
-		vmovupd(ymm4, mem(rbx, 2*48))
-		vmovupd(ymm6, mem(rbx, 3*48))
-
-		vmovupd(mem(rax,  r8, 4, 0), ymm1)
-		vmovupd(mem(rax, rcx, 1, 0), ymm3)
-
-		add(r14, rax)                      // a += 4*lda;
-
-		vunpcklpd(ymm3, ymm1, ymm10)
-		vunpckhpd(ymm3, ymm1, ymm11)
-		vextractf128(imm(0x1), ymm10, xmm12)
-		vextractf128(imm(0x1), ymm11, xmm13)
-
-		vmovupd(xmm10, mem(rbx, 0*48+32))
-		vmovupd(xmm11, mem(rbx, 1*48+32))
-		vmovupd(xmm12, mem(rbx, 2*48+32))
-		vmovupd(xmm13, mem(rbx, 3*48+32))
-
-		add(imm(4*6*8), rbx)               // p += 4*ldp = 4*6;
-
-		dec(rsi)                           // i -= 1;
-		jne(.DKITERROWU)                   // iterate again if i != 0.
-
-
-
-		label(.DCONKLEFTROWU)
-
-		mov(var(k_left), rsi)              // i = k_left;
-		test(rsi, rsi)                     // check i via logical AND.
-		je(.DDONE)                         // if i == 0, we're done; jump to end.
-		                                   // else, we prepare to enter k_left loop.
-
-
-		label(.DKLEFTROWU)                 // EDGE LOOP (k_left)
-
-		vmovsd(mem(rax,         0), xmm0)
-		vmovsd(mem(rax,  r8, 1, 0), xmm2)
-		vmovsd(mem(rax,  r8, 2, 0), xmm4)
-		vmovsd(mem(rax, r12, 1, 0), xmm6)
-		vmovsd(mem(rax,  r8, 4, 0), xmm1)
-		vmovsd(mem(rax, rcx, 1, 0), xmm3)
-
-		add(r10, rax)                      // a += lda;
-
-		vmovsd(xmm0, mem(rbx, 0*8))
-		vmovsd(xmm2, mem(rbx, 1*8))
-		vmovsd(xmm4, mem(rbx, 2*8))
-		vmovsd(xmm6, mem(rbx, 3*8))
-		vmovsd(xmm1, mem(rbx, 4*8))
-		vmovsd(xmm3, mem(rbx, 5*8))
-
-		add(imm(6*8), rbx)                 // p += ldp = 6;
-
-		dec(rsi)                           // i -= 1;
-		jne(.DKLEFTROWU)                   // iterate again if i != 0.
-
-
-		jmp(.DDONE)                        // jump to end.
-
-
-		// -- kappa unit, column storage on A --------------------------------------
-
-		label(.DCOLUNIT)
-
-		lea(mem(r10, r10, 2), r13)         // r13 = 3*lda
-
-		mov(var(k_iter), rsi)              // i = k_iter;
-		test(rsi, rsi)                     // check i via logical AND.
-		je(.DCONKLEFTCOLU)                 // if i == 0, jump to code that
-		                                   // contains the k_left loop.
-
-
-		label(.DKITERCOLU)                 // MAIN LOOP (k_iter)
-
-		vmovupd(mem(rax,          0), ymm0)
-		vmovupd(mem(rax,         32), xmm1)
-		vmovupd(ymm0, mem(rbx, 0*48+ 0))
-		vmovupd(xmm1, mem(rbx, 0*48+32))
-
-		vmovupd(mem(rax, r10, 1,  0), ymm2)
-		vmovupd(mem(rax, r10, 1, 32), xmm3)
-		vmovupd(ymm2, mem(rbx, 1*48+ 0))
-		vmovupd(xmm3, mem(rbx, 1*48+32))
-
-		vmovupd(mem(rax, r10, 2,  0), ymm4)
-		vmovupd(mem(rax, r10, 2, 32), xmm5)
-		vmovupd(ymm4, mem(rbx, 2*48+ 0))
-		vmovupd(xmm5, mem(rbx, 2*48+32))
-
-		vmovupd(mem(rax, r13, 1,  0), ymm6)
-		vmovupd(mem(rax, r13, 1, 32), xmm7)
-		add(r14, rax)                      // a += 4*lda;
-		vmovupd(ymm6, mem(rbx, 3*48+ 0))
-		vmovupd(xmm7, mem(rbx, 3*48+32))
-		add(imm(4*6*8), rbx)               // p += 4*ldp = 4*6;
-
-		dec(rsi)                           // i -= 1;
-		jne(.DKITERCOLU)                   // iterate again if i != 0.
-
-
-
-		label(.DCONKLEFTCOLU)
-
-		mov(var(k_left), rsi)              // i = k_left;
-		test(rsi, rsi)                     // check i via logical AND.
-		je(.DDONE)                         // if i == 0, we're done; jump to end.
-		                                   // else, we prepare to enter k_left loop.
-
-
-		label(.DKLEFTCOLU)                 // EDGE LOOP (k_left)
-
-		vmovupd(mem(rax,          0), ymm0)
-		vmovupd(mem(rax,         32), xmm1)
-		add(r10, rax)                      // a += lda;
-		vmovupd(ymm0, mem(rbx, 0*48+ 0))
-		vmovupd(xmm1, mem(rbx, 0*48+32))
-		add(imm(6*8), rbx)                 // p += ldp = 6;
-
-		dec(rsi)                           // i -= 1;
-		jne(.DKLEFTCOLU)                   // iterate again if i != 0.
-
-
-		//jmp(.DDONE)                        // jump to end.
-
-
-
-		label(.DDONE)
-
-
-
-		end_asm(
-		: // output operands (none)
-		: // input operands
-		  [k_iter] "m" (k_iter),
-		  [k_left] "m" (k_left),
-		  [a]      "m" (a),
-		  [inca]   "m" (inca),
-		  [lda]    "m" (lda),
-		  [p]      "m" (p),
-		  [ldp]    "m" (ldp),
-		  [kappa]  "m" (kappa),
-		  [one]    "m" (one)
-		: // register clobber list
-		  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-		  "r8", /*"r9",*/ "r10", /*"r11",*/ "r12", "r13", "r14", "r15",
-		  "xmm0", "xmm1", "xmm2", "xmm3",
-		  "xmm4", "xmm5", "xmm6", "xmm7",
-		  "xmm8", "xmm9", "xmm10", "xmm11",
-		  "xmm12", "xmm13", "xmm14", "xmm15",
-		  "memory"
-		)
-	}
-	else // if ( cdim0 < mnr || gs || !unitk )
-	{
-		PASTEMAC(dscal2m,BLIS_TAPI_EX_SUF)
-		(
-		  0,
-		  BLIS_NONUNIT_DIAG,
-		  BLIS_DENSE,
-		  ( trans_t )conja,
-		  cdim0,
-		  k0,
-		  kappa,
-		  a, inca0, lda0,
-		  p,     1, ldp0,
-		  cntx,
-		  NULL
-		);
-
-		if ( cdim0 < mnr )
-		{
-			// Handle zero-filling along the "long" edge of the micropanel.
-
-			const dim_t      i      = cdim0;
-			const dim_t      m_edge = mnr - cdim0;
-			const dim_t      n_edge = k0_max;
-			double* restrict p_edge = ( double* )p + (i  )*1;
-
-			bli_dset0s_mxn
-			(
-			  m_edge,
-			  n_edge,
-			  p_edge, 1, ldp
-			);
-		}
-	}
-
-//bli_dfprintm( stdout, "packm 6xk ker: a_packed", cdim0, k0_max, p, 1, ldp0, "%5.2f", "" );
-
-	if ( k0 < k0_max )
-	{
-		// Handle zero-filling along the "short" (far) edge of the micropanel.
-
-		const dim_t      j      = k0;
-		const dim_t      m_edge = mnr;
-		const dim_t      n_edge = k0_max - k0;
-		double* restrict p_edge = ( double* )p + (j  )*ldp;
-
-		bli_dset0s_mxn
-		(
-		  m_edge,
-		  n_edge,
-		  p_edge, 1, ldp
-		);
-	}
-}
-
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s6x16.c
similarity index 64%
rename from kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c
rename to kernels/haswell/1m/bli_packm_haswell_asm_s6x16.c
index 21f514b25..fbab3983d 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_s6x16.c
@@ -38,33 +38,25 @@
 #define BLIS_ASM_SYNTAX_ATT
 #include "bli_x86_asm_macros.h"
 
-// Prototype reference packm kernels.
-PACKM_KER_PROT( double,   d, packm_16xk_haswell_ref )
-
-void bli_spackm_haswell_asm_16xk
+void bli_spackm_haswell_asm_6x16
      (
              conj_t  conja,
              pack_t  schema,
              dim_t   cdim0,
+             dim_t   cdim_max,
+             dim_t   cdim_bcast,
              dim_t   k0,
              dim_t   k0_max,
        const void*   kappa,
        const void*   a, inc_t inca0, inc_t lda0,
              void*   p,              inc_t ldp0,
+       const void*   params,
        const cntx_t* cntx
      )
 {
-#if 0
-	bli_spackm_16xk_haswell_ref
-	(
-	  conja, schema, cdim0, k0, k0_max,
-	  kappa, a, inca0, lda0, p, ldp0, cntx
-	);
-	return;
-#endif
-
 	// This is the panel dimension assumed by the packm kernel.
-	const dim_t      mnr   = 16;
+	const dim_t      mr    = 6;
+	const dim_t      nr    = 16;
 
 	// This is the "packing" dimension assumed by the packm kernel.
 	// This should be equal to ldp.
@@ -104,7 +96,289 @@ void bli_spackm_haswell_asm_16xk
 
 	// -------------------------------------------------------------------------
 
-	if ( cdim0 == mnr && !gs && unitk )
+	if ( cdim0 == mr && ldp0 == mr && cdim_bcast == 1 && !gs && unitk )
+	{
+		begin_asm()
+
+		mov(var(a), rax)                   // load address of a.
+
+		mov(var(inca), r8)                 // load inca
+		mov(var(lda), r10)                 // load lda
+		lea(mem(, r8,  4), r8)             // inca *= sizeof(float)
+		lea(mem(, r10, 4), r10)            // lda *= sizeof(float)
+
+		mov(var(p), rbx)                   // load address of p.
+
+		lea(mem(   , r10, 8), r14)         // r14 = 8*lda
+
+		mov(var(one), rdx)                 // load address of 1.0 constant
+		vmovss(mem(rdx), xmm1)             // load 1.0
+
+		mov(var(kappa), rcx)               // load address of kappa
+		vmovss(mem(rcx), xmm0)             // load kappa
+
+
+										   // now branch on kappa == 1.0
+
+		vucomiss(xmm0, xmm1)               // set ZF if kappa == 1.0
+		je(.SKAPPAUNIT)                    // if ZF = 1, jump to kappa == 1.0 case
+
+
+
+		label(.SKAPPANONU)
+
+		cmp(imm(4), r8)                    // set ZF if (4*inca) == 4.
+		jz(.SCOLNONU)                      // jump to column storage case
+
+		// -- kappa non-unit, row storage on A -------------------------------------
+
+		label(.SROWNONU)
+
+		jmp(.SDONE)                        // jump to end.
+
+
+		// -- kappa non-unit, column storage on A ----------------------------------
+
+		label(.SCOLNONU)
+
+		jmp(.SDONE)                        // jump to end.
+
+
+
+
+		label(.SKAPPAUNIT)
+
+		cmp(imm(4), r8)                    // set ZF if (4*inca) == 4.
+		jz(.SCOLUNIT)                      // jump to column storage case
+
+
+		// -- kappa unit, row storage on A -----------------------------------------
+
+		label(.SROWUNIT)
+
+		lea(mem(r8,  r8,  2), r13)         // r13 = 3*inca
+		lea(mem(r13, r8,  2), r15)         // r15 = 5*inca
+		//lea(mem(r13, r8,  4), rdx)         // rdx = 7*inca
+
+		mov(var(k_iter), rsi)              // i = k_iter;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.SCONKLEFTROWU)                 // if i == 0, jump to code that
+		                                   // contains the k_left loop.
+
+
+		label(.SKITERROWU)                 // MAIN LOOP (k_iter)
+
+		                                   // begin IO on rows 0-3
+		vmovups(mem(rax,         0), ymm4)
+		vmovups(mem(rax,  r8, 1, 0), ymm6)
+		vmovups(mem(rax,  r8, 2, 0), ymm8)
+		vmovups(mem(rax, r13, 1, 0), ymm10)
+
+		vunpcklps(ymm6, ymm4, ymm0)
+		vunpcklps(ymm10, ymm8, ymm1)
+		vshufps(imm(0x4e), ymm1, ymm0, ymm2)
+		vblendps(imm(0xcc), ymm2, ymm0, ymm0)
+		vblendps(imm(0x33), ymm2, ymm1, ymm1)
+
+		vextractf128(imm(0x1), ymm0, xmm2)
+		vmovups(xmm0, mem(rbx, 0*24))      // store ( gamma00..gamma30 )
+		vmovups(xmm2, mem(rbx, 4*24))      // store ( gamma04..gamma34 )
+
+		vextractf128(imm(0x1), ymm1, xmm2)
+		vmovups(xmm1, mem(rbx, 1*24))      // store ( gamma01..gamma31 )
+		vmovups(xmm2, mem(rbx, 5*24))      // store ( gamma05..gamma35 )
+
+		vunpckhps(ymm6, ymm4, ymm0)
+		vunpckhps(ymm10, ymm8, ymm1)
+		vshufps(imm(0x4e), ymm1, ymm0, ymm2)
+		vblendps(imm(0xcc), ymm2, ymm0, ymm0)
+		vblendps(imm(0x33), ymm2, ymm1, ymm1)
+
+		vextractf128(imm(0x1), ymm0, xmm2)
+		vmovups(xmm0, mem(rbx, 2*24))      // store ( gamma02..gamma32 )
+		vmovups(xmm2, mem(rbx, 6*24))      // store ( gamma06..gamma36 )
+
+		vextractf128(imm(0x1), ymm1, xmm2)
+		vmovups(xmm1, mem(rbx, 3*24))      // store ( gamma03..gamma33 )
+		vmovups(xmm2, mem(rbx, 7*24))      // store ( gamma07..gamma37 )
+
+		                                   // begin IO on rows 4-5
+		vmovups(mem(rax,  r8, 4, 0), ymm12)
+		vmovups(mem(rax, r15, 1, 0), ymm14)
+
+		vunpcklps(ymm14, ymm12, ymm0)
+		vextractf128(imm(0x1), ymm0, xmm2)
+		vmovlpd(xmm0, mem(rbx, 0*24+16))   // store ( gamma40..gamma50 )
+		vmovhpd(xmm0, mem(rbx, 1*24+16))   // store ( gamma41..gamma51 )
+		vmovlpd(xmm2, mem(rbx, 4*24+16))   // store ( gamma44..gamma54 )
+		vmovhpd(xmm2, mem(rbx, 5*24+16))   // store ( gamma45..gamma55 )
+
+		vunpckhps(ymm14, ymm12, ymm0)
+		vextractf128(imm(0x1), ymm0, xmm2)
+		vmovlpd(xmm0, mem(rbx, 2*24+16))   // store ( gamma42..gamma52 )
+		vmovhpd(xmm0, mem(rbx, 3*24+16))   // store ( gamma43..gamma53 )
+		vmovlpd(xmm2, mem(rbx, 6*24+16))   // store ( gamma46..gamma56 )
+		vmovhpd(xmm2, mem(rbx, 7*24+16))   // store ( gamma47..gamma57 )
+
+
+		add(r14, rax)                      // a += 8*lda;
+		add(imm(8*6*4), rbx)               // p += 8*ldp = 8*6;
+
+		dec(rsi)                           // i -= 1;
+		jne(.SKITERROWU)                   // iterate again if i != 0.
+
+
+
+		label(.SCONKLEFTROWU)
+
+		mov(var(k_left), rsi)              // i = k_left;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.SDONE)                         // if i == 0, we're done; jump to end.
+		                                   // else, we prepare to enter k_left loop.
+
+
+		label(.SKLEFTROWU)                 // EDGE LOOP (k_left)
+
+		vmovss(mem(rax,         0), xmm0)
+		vmovss(mem(rax,  r8, 1, 0), xmm2)
+		vmovss(mem(rax,  r8, 2, 0), xmm4)
+		vmovss(mem(rax, r13, 1, 0), xmm6)
+		vmovss(mem(rax,  r8, 4, 0), xmm1)
+		vmovss(mem(rax, r15, 1, 0), xmm3)
+
+		vmovss(xmm0, mem(rbx, 0*4))
+		vmovss(xmm2, mem(rbx, 1*4))
+		vmovss(xmm4, mem(rbx, 2*4))
+		vmovss(xmm6, mem(rbx, 3*4))
+		vmovss(xmm1, mem(rbx, 4*4))
+		vmovss(xmm3, mem(rbx, 5*4))
+
+		add(r10, rax)                      // a += lda;
+		add(imm(6*4), rbx)                 // p += ldp = 6;
+
+		dec(rsi)                           // i -= 1;
+		jne(.SKLEFTROWU)                   // iterate again if i != 0.
+
+
+		jmp(.SDONE)                        // jump to end.
+
+
+		// -- kappa unit, column storage on A --------------------------------------
+
+		label(.SCOLUNIT)
+
+		lea(mem(r10, r10, 2), r13)         // r13 = 3*lda
+		lea(mem(r13, r10, 2), r15)         // r15 = 5*lda
+		lea(mem(r13, r10, 4), rdx)         // rdx = 7*lda
+
+		mov(var(k_iter), rsi)              // i = k_iter;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.SCONKLEFTCOLU)                 // if i == 0, jump to code that
+		                                   // contains the k_left loop.
+
+
+		label(.SKITERCOLU)                 // MAIN LOOP (k_iter)
+
+		vmovups(mem(rax,          0), xmm0)
+		vmovsd( mem(rax,         16), xmm1)
+		vmovups(xmm0, mem(rbx, 0*24+ 0))
+		vmovsd( xmm1, mem(rbx, 0*24+16))
+
+		vmovups(mem(rax, r10, 1,  0), xmm2)
+		vmovsd( mem(rax, r10, 1, 16), xmm3)
+		vmovups(xmm2, mem(rbx, 1*24+ 0))
+		vmovsd( xmm3, mem(rbx, 1*24+16))
+
+		vmovups(mem(rax, r10, 2,  0), xmm4)
+		vmovsd( mem(rax, r10, 2, 16), xmm5)
+		vmovups(xmm4, mem(rbx, 2*24+ 0))
+		vmovsd( xmm5, mem(rbx, 2*24+16))
+
+		vmovups(mem(rax, r13, 1,  0), xmm6)
+		vmovsd( mem(rax, r13, 1, 16), xmm7)
+		vmovups(xmm6, mem(rbx, 3*24+ 0))
+		vmovsd( xmm7, mem(rbx, 3*24+16))
+
+		vmovups(mem(rax, r10, 4,  0), xmm8)
+		vmovsd( mem(rax, r10, 4, 16), xmm9)
+		vmovups(xmm8, mem(rbx, 4*24+ 0))
+		vmovsd( xmm9, mem(rbx, 4*24+16))
+
+		vmovups(mem(rax, r15, 1,  0), xmm10)
+		vmovsd( mem(rax, r15, 1, 16), xmm11)
+		vmovups(xmm10, mem(rbx, 5*24+ 0))
+		vmovsd( xmm11, mem(rbx, 5*24+16))
+
+		vmovups(mem(rax, r13, 2,  0), xmm12)
+		vmovsd( mem(rax, r13, 2, 16), xmm13)
+		vmovups(xmm12, mem(rbx, 6*24+ 0))
+		vmovsd( xmm13, mem(rbx, 6*24+16))
+
+		vmovups(mem(rax, rdx, 1,  0), xmm14)
+		vmovsd( mem(rax, rdx, 1, 16), xmm15)
+		vmovups(xmm14, mem(rbx, 7*24+ 0))
+		vmovsd( xmm15, mem(rbx, 7*24+16))
+
+		add(r14, rax)                      // a += 8*lda;
+		add(imm(8*6*4), rbx)               // p += 8*ldp = 8*6;
+
+		dec(rsi)                           // i -= 1;
+		jne(.SKITERCOLU)                   // iterate again if i != 0.
+
+
+
+		label(.SCONKLEFTCOLU)
+
+		mov(var(k_left), rsi)              // i = k_left;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.SDONE)                         // if i == 0, we're done; jump to end.
+		                                   // else, we prepare to enter k_left loop.
+
+
+		label(.SKLEFTCOLU)                 // EDGE LOOP (k_left)
+
+		vmovups(mem(rax,          0), xmm0)
+		vmovsd( mem(rax,         16), xmm1)
+		add(r10, rax)                      // a += lda;
+		vmovups(xmm0, mem(rbx, 0*24+ 0))
+		vmovsd( xmm1, mem(rbx, 0*24+16))
+		add(imm(6*4), rbx)                 // p += ldp = 6;
+
+		dec(rsi)                           // i -= 1;
+		jne(.SKLEFTCOLU)                   // iterate again if i != 0.
+
+
+		//jmp(.SDONE)                        // jump to end.
+
+
+
+		label(.SDONE)
+
+
+
+		end_asm(
+		: // output operands (none)
+		: // input operands
+		  [k_iter] "m" (k_iter),
+		  [k_left] "m" (k_left),
+		  [a]      "m" (a),
+		  [inca]   "m" (inca),
+		  [lda]    "m" (lda),
+		  [p]      "m" (p),
+		  [ldp]    "m" (ldp),
+		  [kappa]  "m" (kappa),
+		  [one]    "m" (one)
+		: // register clobber list
+		  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+		  "r8", /*"r9",*/ "r10", /*"r11",*/ "r12", "r13", "r14", "r15",
+		  "xmm0", "xmm1", "xmm2", "xmm3",
+		  "xmm4", "xmm5", "xmm6", "xmm7",
+		  "xmm8", "xmm9", "xmm10", "xmm11",
+		  "xmm12", "xmm13", "xmm14", "xmm15",
+		  "memory"
+		)
+	}
+	else if ( cdim0 == nr && ldp0 == nr && cdim_bcast == 1 && !gs && unitk )
 	{
 		begin_asm()
 
@@ -513,56 +787,24 @@ void bli_spackm_haswell_asm_16xk
 		  "memory"
 		)
 	}
-	else // if ( cdim0 < mnr || gs || !unitk )
+	else
 	{
-		PASTEMAC(sscal2m,BLIS_TAPI_EX_SUF)
+		bli_sscal2bbs_mxn
 		(
-		  0,
-		  BLIS_NONUNIT_DIAG,
-		  BLIS_DENSE,
-		  ( trans_t )conja,
+		  conja,
 		  cdim0,
 		  k0,
 		  kappa,
-		  a, inca0, lda0,
-		  p,     1, ldp0,
-		  cntx,
-		  NULL
+		  a,       inca, lda,
+		  p, cdim_bcast, ldp
 		);
-
-		if ( cdim0 < mnr )
-		{
-			// Handle zero-filling along the "long" edge of the micropanel.
-
-			const dim_t      i      = cdim0;
-			const dim_t      m_edge = mnr - cdim0;
-			const dim_t      n_edge = k0_max;
-			float*  restrict p_edge = ( float* )p + (i  )*1;
-
-			bli_sset0s_mxn
-			(
-			  m_edge,
-			  n_edge,
-			  p_edge, 1, ldp
-			);
-		}
 	}
 
-	if ( k0 < k0_max )
-	{
-		// Handle zero-filling along the "short" (far) edge of the micropanel.
-
-		const dim_t      j      = k0;
-		const dim_t      m_edge = mnr;
-		const dim_t      n_edge = k0_max - k0;
-		float*  restrict p_edge = ( float* )p + (j  )*ldp;
-
-		bli_sset0s_mxn
-		(
-		  m_edge,
-		  n_edge,
-		  p_edge, 1, ldp
-		);
-	}
+	bli_sset0s_edge
+	(
+	  cdim0*cdim_bcast, cdim_max*cdim_bcast,
+	  k0, k0_max,
+	  p, ldp
+	);
 }
 
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c
deleted file mode 100644
index bf5dbdf88..000000000
--- a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c
+++ /dev/null
@@ -1,441 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-// Prototype reference packm kernels.
-PACKM_KER_PROT( double,   d, packm_6xk_haswell_ref )
-
-void bli_spackm_haswell_asm_6xk
-     (
-             conj_t  conja,
-             pack_t  schema,
-             dim_t   cdim0,
-             dim_t   k0,
-             dim_t   k0_max,
-       const void*   kappa,
-       const void*   a, inc_t inca0, inc_t lda0,
-             void*   p,              inc_t ldp0,
-       const cntx_t* cntx
-     )
-{
-#if 0
-	bli_spackm_6xk_haswell_ref
-	(
-	  conja, schema, cdim0, k0, k0_max,
-	  kappa, a, inca0, lda0, p, ldp0, cntx
-	);
-	return;
-#endif
-
-	// This is the panel dimension assumed by the packm kernel.
-	const dim_t      mnr   = 6;
-
-	// This is the "packing" dimension assumed by the packm kernel.
-	// This should be equal to ldp.
-	//const dim_t    packmnr = 8;
-
-	// Define a local copy of 1.0 so we can test for unit kappa.
-	float            one_l = 1.0;
-	float*  restrict one   = &one_l;
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	const uint64_t k_iter = k0 / 8;
-#if 1
-	const uint64_t k_left = k0 % 8;
-#else
-	const uint64_t k_left = k0;
-#endif
-
-	// NOTE: For the purposes of the comments in this packm kernel, we
-	// interpret inca and lda as rs_a and cs_a, respectively, and similarly
-	// interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
-	// this packm kernel, you should think of the operation as packing an
-	// m x n micropanel, where m and n are tiny and large, respectively, and
-	// where elements of each column of the packed matrix P are contiguous.
-	// (This packm kernel can still be used to pack micropanels of matrix B
-	// in a gemm operation.)
-	const uint64_t inca   = inca0;
-	const uint64_t lda    = lda0;
-	const uint64_t ldp    = ldp0;
-
-	const bool     gs     = ( inca0 != 1 && lda0 != 1 );
-
-	// NOTE: If/when this kernel ever supports scaling by kappa within the
-	// assembly region, this constraint should be lifted.
-	const bool     unitk  = bli_seq1( *(( float* )kappa) );
-
-
-	// -------------------------------------------------------------------------
-
-	if ( cdim0 == mnr && !gs && unitk )
-	{
-		begin_asm()
-
-		mov(var(a), rax)                   // load address of a.
-
-		mov(var(inca), r8)                 // load inca
-		mov(var(lda), r10)                 // load lda
-		lea(mem(, r8,  4), r8)             // inca *= sizeof(float)
-		lea(mem(, r10, 4), r10)            // lda *= sizeof(float)
-
-		mov(var(p), rbx)                   // load address of p.
-
-		lea(mem(   , r10, 8), r14)         // r14 = 8*lda
-
-		mov(var(one), rdx)                 // load address of 1.0 constant
-		vmovss(mem(rdx), xmm1)             // load 1.0
-
-		mov(var(kappa), rcx)               // load address of kappa
-		vmovss(mem(rcx), xmm0)             // load kappa
-
-
-										   // now branch on kappa == 1.0
-
-		vucomiss(xmm0, xmm1)               // set ZF if kappa == 1.0
-		je(.SKAPPAUNIT)                    // if ZF = 1, jump to beta == 0 case
-
-
-
-		label(.SKAPPANONU)
-
-		cmp(imm(4), r8)                    // set ZF if (4*inca) == 4.
-		jz(.SCOLNONU)                      // jump to column storage case
-
-		// -- kappa non-unit, row storage on A -------------------------------------
-
-		label(.SROWNONU)
-
-		jmp(.SDONE)                        // jump to end.
-
-
-		// -- kappa non-unit, column storage on A ----------------------------------
-
-		label(.SCOLNONU)
-
-		jmp(.SDONE)                        // jump to end.
-
-
-
-
-		label(.SKAPPAUNIT)
-
-		cmp(imm(4), r8)                    // set ZF if (4*inca) == 4.
-		jz(.SCOLUNIT)                      // jump to column storage case
-
-
-		// -- kappa unit, row storage on A -----------------------------------------
-
-		label(.SROWUNIT)
-
-		lea(mem(r8,  r8,  2), r13)         // r13 = 3*inca
-		lea(mem(r13, r8,  2), r15)         // r15 = 5*inca
-		//lea(mem(r13, r8,  4), rdx)         // rdx = 7*inca
-
-		mov(var(k_iter), rsi)              // i = k_iter;
-		test(rsi, rsi)                     // check i via logical AND.
-		je(.SCONKLEFTROWU)                 // if i == 0, jump to code that
-		                                   // contains the k_left loop.
-
-
-		label(.SKITERROWU)                 // MAIN LOOP (k_iter)
-
-		                                   // begin IO on rows 0-3
-		vmovups(mem(rax,         0), ymm4)
-		vmovups(mem(rax,  r8, 1, 0), ymm6)
-		vmovups(mem(rax,  r8, 2, 0), ymm8)
-		vmovups(mem(rax, r13, 1, 0), ymm10)
-
-		vunpcklps(ymm6, ymm4, ymm0)
-		vunpcklps(ymm10, ymm8, ymm1)
-		vshufps(imm(0x4e), ymm1, ymm0, ymm2)
-		vblendps(imm(0xcc), ymm2, ymm0, ymm0)
-		vblendps(imm(0x33), ymm2, ymm1, ymm1)
-
-		vextractf128(imm(0x1), ymm0, xmm2)
-		vmovups(xmm0, mem(rbx, 0*24))      // store ( gamma00..gamma30 )
-		vmovups(xmm2, mem(rbx, 4*24))      // store ( gamma04..gamma34 )
-
-		vextractf128(imm(0x1), ymm1, xmm2)
-		vmovups(xmm1, mem(rbx, 1*24))      // store ( gamma01..gamma31 )
-		vmovups(xmm2, mem(rbx, 5*24))      // store ( gamma05..gamma35 )
-
-		vunpckhps(ymm6, ymm4, ymm0)
-		vunpckhps(ymm10, ymm8, ymm1)
-		vshufps(imm(0x4e), ymm1, ymm0, ymm2)
-		vblendps(imm(0xcc), ymm2, ymm0, ymm0)
-		vblendps(imm(0x33), ymm2, ymm1, ymm1)
-
-		vextractf128(imm(0x1), ymm0, xmm2)
-		vmovups(xmm0, mem(rbx, 2*24))      // store ( gamma02..gamma32 )
-		vmovups(xmm2, mem(rbx, 6*24))      // store ( gamma06..gamma36 )
-
-		vextractf128(imm(0x1), ymm1, xmm2)
-		vmovups(xmm1, mem(rbx, 3*24))      // store ( gamma03..gamma33 )
-		vmovups(xmm2, mem(rbx, 7*24))      // store ( gamma07..gamma37 )
-
-		                                   // begin IO on rows 4-5
-		vmovups(mem(rax,  r8, 4, 0), ymm12)
-		vmovups(mem(rax, r15, 1, 0), ymm14)
-
-		vunpcklps(ymm14, ymm12, ymm0)
-		vextractf128(imm(0x1), ymm0, xmm2)
-		vmovlpd(xmm0, mem(rbx, 0*24+16))   // store ( gamma40..gamma50 )
-		vmovhpd(xmm0, mem(rbx, 1*24+16))   // store ( gamma41..gamma51 )
-		vmovlpd(xmm2, mem(rbx, 4*24+16))   // store ( gamma44..gamma54 )
-		vmovhpd(xmm2, mem(rbx, 5*24+16))   // store ( gamma45..gamma55 )
-
-		vunpckhps(ymm14, ymm12, ymm0)
-		vextractf128(imm(0x1), ymm0, xmm2)
-		vmovlpd(xmm0, mem(rbx, 2*24+16))   // store ( gamma42..gamma52 )
-		vmovhpd(xmm0, mem(rbx, 3*24+16))   // store ( gamma43..gamma53 )
-		vmovlpd(xmm2, mem(rbx, 6*24+16))   // store ( gamma46..gamma56 )
-		vmovhpd(xmm2, mem(rbx, 7*24+16))   // store ( gamma47..gamma57 )
-
-
-		add(r14, rax)                      // a += 8*lda;
-		add(imm(8*6*4), rbx)               // p += 8*ldp = 8*6;
-
-		dec(rsi)                           // i -= 1;
-		jne(.SKITERROWU)                   // iterate again if i != 0.
-
-
-
-		label(.SCONKLEFTROWU)
-
-		mov(var(k_left), rsi)              // i = k_left;
-		test(rsi, rsi)                     // check i via logical AND.
-		je(.SDONE)                         // if i == 0, we're done; jump to end.
-		                                   // else, we prepare to enter k_left loop.
-
-
-		label(.SKLEFTROWU)                 // EDGE LOOP (k_left)
-
-		vmovss(mem(rax,         0), xmm0)
-		vmovss(mem(rax,  r8, 1, 0), xmm2)
-		vmovss(mem(rax,  r8, 2, 0), xmm4)
-		vmovss(mem(rax, r13, 1, 0), xmm6)
-		vmovss(mem(rax,  r8, 4, 0), xmm1)
-		vmovss(mem(rax, r15, 1, 0), xmm3)
-
-		vmovss(xmm0, mem(rbx, 0*4))
-		vmovss(xmm2, mem(rbx, 1*4))
-		vmovss(xmm4, mem(rbx, 2*4))
-		vmovss(xmm6, mem(rbx, 3*4))
-		vmovss(xmm1, mem(rbx, 4*4))
-		vmovss(xmm3, mem(rbx, 5*4))
-
-		add(r10, rax)                      // a += lda;
-		add(imm(6*4), rbx)                 // p += ldp = 6;
-
-		dec(rsi)                           // i -= 1;
-		jne(.SKLEFTROWU)                   // iterate again if i != 0.
-
-
-		jmp(.SDONE)                        // jump to end.
-
-
-		// -- kappa unit, column storage on A --------------------------------------
-
-		label(.SCOLUNIT)
-
-		lea(mem(r10, r10, 2), r13)         // r13 = 3*lda
-		lea(mem(r13, r10, 2), r15)         // r15 = 5*lda
-		lea(mem(r13, r10, 4), rdx)         // rdx = 7*lda
-
-		mov(var(k_iter), rsi)              // i = k_iter;
-		test(rsi, rsi)                     // check i via logical AND.
-		je(.SCONKLEFTCOLU)                 // if i == 0, jump to code that
-		                                   // contains the k_left loop.
-
-
-		label(.SKITERCOLU)                 // MAIN LOOP (k_iter)
-
-		vmovups(mem(rax,          0), xmm0)
-		vmovsd( mem(rax,         16), xmm1)
-		vmovups(xmm0, mem(rbx, 0*24+ 0))
-		vmovsd( xmm1, mem(rbx, 0*24+16))
-
-		vmovups(mem(rax, r10, 1,  0), xmm2)
-		vmovsd( mem(rax, r10, 1, 16), xmm3)
-		vmovups(xmm2, mem(rbx, 1*24+ 0))
-		vmovsd( xmm3, mem(rbx, 1*24+16))
-
-		vmovups(mem(rax, r10, 2,  0), xmm4)
-		vmovsd( mem(rax, r10, 2, 16), xmm5)
-		vmovups(xmm4, mem(rbx, 2*24+ 0))
-		vmovsd( xmm5, mem(rbx, 2*24+16))
-
-		vmovups(mem(rax, r13, 1,  0), xmm6)
-		vmovsd( mem(rax, r13, 1, 16), xmm7)
-		vmovups(xmm6, mem(rbx, 3*24+ 0))
-		vmovsd( xmm7, mem(rbx, 3*24+16))
-
-		vmovups(mem(rax, r10, 4,  0), xmm8)
-		vmovsd( mem(rax, r10, 4, 16), xmm9)
-		vmovups(xmm8, mem(rbx, 4*24+ 0))
-		vmovsd( xmm9, mem(rbx, 4*24+16))
-
-		vmovups(mem(rax, r15, 1,  0), xmm10)
-		vmovsd( mem(rax, r15, 1, 16), xmm11)
-		vmovups(xmm10, mem(rbx, 5*24+ 0))
-		vmovsd( xmm11, mem(rbx, 5*24+16))
-
-		vmovups(mem(rax, r13, 2,  0), xmm12)
-		vmovsd( mem(rax, r13, 2, 16), xmm13)
-		vmovups(xmm12, mem(rbx, 6*24+ 0))
-		vmovsd( xmm13, mem(rbx, 6*24+16))
-
-		vmovups(mem(rax, rdx, 1,  0), xmm14)
-		vmovsd( mem(rax, rdx, 1, 16), xmm15)
-		vmovups(xmm14, mem(rbx, 7*24+ 0))
-		vmovsd( xmm15, mem(rbx, 7*24+16))
-
-		add(r14, rax)                      // a += 8*lda;
-		add(imm(8*6*4), rbx)               // p += 8*ldp = 8*6;
-
-		dec(rsi)                           // i -= 1;
-		jne(.SKITERCOLU)                   // iterate again if i != 0.
-
-
-
-		label(.SCONKLEFTCOLU)
-
-		mov(var(k_left), rsi)              // i = k_left;
-		test(rsi, rsi)                     // check i via logical AND.
-		je(.SDONE)                         // if i == 0, we're done; jump to end.
-		                                   // else, we prepare to enter k_left loop.
-
-
-		label(.SKLEFTCOLU)                 // EDGE LOOP (k_left)
-
-		vmovups(mem(rax,          0), xmm0)
-		vmovsd( mem(rax,         16), xmm1)
-		add(r10, rax)                      // a += lda;
-		vmovups(xmm0, mem(rbx, 0*24+ 0))
-		vmovsd( xmm1, mem(rbx, 0*24+16))
-		add(imm(6*4), rbx)                 // p += ldp = 6;
-
-		dec(rsi)                           // i -= 1;
-		jne(.SKLEFTCOLU)                   // iterate again if i != 0.
-
-
-		//jmp(.SDONE)                        // jump to end.
-
-
-
-		label(.SDONE)
-
-
-
-		end_asm(
-		: // output operands (none)
-		: // input operands
-		  [k_iter] "m" (k_iter),
-		  [k_left] "m" (k_left),
-		  [a]      "m" (a),
-		  [inca]   "m" (inca),
-		  [lda]    "m" (lda),
-		  [p]      "m" (p),
-		  [ldp]    "m" (ldp),
-		  [kappa]  "m" (kappa),
-		  [one]    "m" (one)
-		: // register clobber list
-		  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-		  "r8", /*"r9",*/ "r10", /*"r11",*/ "r12", "r13", "r14", "r15",
-		  "xmm0", "xmm1", "xmm2", "xmm3",
-		  "xmm4", "xmm5", "xmm6", "xmm7",
-		  "xmm8", "xmm9", "xmm10", "xmm11",
-		  "xmm12", "xmm13", "xmm14", "xmm15",
-		  "memory"
-		)
-	}
-	else // if ( cdim0 < mnr || gs || !unitk )
-	{
-		PASTEMAC(sscal2m,BLIS_TAPI_EX_SUF)
-		(
-		  0,
-		  BLIS_NONUNIT_DIAG,
-		  BLIS_DENSE,
-		  ( trans_t )conja,
-		  cdim0,
-		  k0,
-		  kappa,
-		  a, inca0, lda0,
-		  p,     1, ldp0,
-		  cntx,
-		  NULL
-		);
-
-		if ( cdim0 < mnr )
-		{
-			// Handle zero-filling along the "long" edge of the micropanel.
-
-			const dim_t      i      = cdim0;
-			const dim_t      m_edge = mnr - cdim0;
-			const dim_t      n_edge = k0_max;
-			float*  restrict p_edge = ( float* )p + (i  )*1;
-
-			bli_sset0s_mxn
-			(
-			  m_edge,
-			  n_edge,
-			  p_edge, 1, ldp
-			);
-		}
-	}
-
-	if ( k0 < k0_max )
-	{
-		// Handle zero-filling along the "short" (far) edge of the micropanel.
-
-		const dim_t      j      = k0;
-		const dim_t      m_edge = mnr;
-		const dim_t      n_edge = k0_max - k0;
-		float*  restrict p_edge = ( float* )p + (j  )*ldp;
-
-		bli_sset0s_mxn
-		(
-		  m_edge,
-		  n_edge,
-		  p_edge, 1, ldp
-		);
-	}
-}
-
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z3x4.c
similarity index 59%
rename from kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c
rename to kernels/haswell/1m/bli_packm_haswell_asm_z3x4.c
index 762e2e87c..e5d9da4f3 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_z3x4.c
@@ -38,33 +38,25 @@
 #define BLIS_ASM_SYNTAX_ATT
 #include "bli_x86_asm_macros.h"
 
-// Prototype reference packm kernels.
-PACKM_KER_PROT( dcomplex, z, packm_4xk_haswell_ref )
-
-void bli_zpackm_haswell_asm_4xk
+void bli_zpackm_haswell_asm_3x4
      (
              conj_t    conja,
              pack_t    schema,
              dim_t     cdim0,
+             dim_t     cdim_max,
+             dim_t     cdim_bcast,
              dim_t     k0,
              dim_t     k0_max,
        const void*     kappa,
        const void*     a, inc_t inca0, inc_t lda0,
              void*     p,              inc_t ldp0,
+       const void*     params,
        const cntx_t*   cntx
      )
 {
-#if 0
-	bli_zpackm_4xk_haswell_ref
-	(
-	  conja, schema, cdim0, k0, k0_max,
-	  kappa, a, inca0, lda0, p, ldp0, cntx
-	);
-	return;
-#endif
-
 	// This is the panel dimension assumed by the packm kernel.
-	const dim_t      mnr   = 4;
+	const dim_t      mr    = 3;
+	const dim_t      nr    = 4;
 
 	// This is the "packing" dimension assumed by the packm kernel.
 	// This should be equal to ldp.
@@ -104,7 +96,249 @@ void bli_zpackm_haswell_asm_4xk
 
 	// -------------------------------------------------------------------------
 
-	if ( cdim0 == mnr && !gs && !conja && unitk )
+	if ( cdim0 == mr && ldp0 == mr && cdim_bcast == 1 && !gs && !conja && unitk )
+	{
+		begin_asm()
+
+		mov(var(a), rax)                   // load address of a.
+
+		mov(var(inca), r8)                 // load inca
+		mov(var(lda), r10)                 // load lda
+		lea(mem(   , r8,  2), r8)
+		lea(mem(   , r8,  8), r8)          // inca *= sizeof(dcomplex)
+		lea(mem(   , r10, 2), r10)
+		lea(mem(   , r10, 8), r10)         // lda *= sizeof(dcomplex)
+
+		mov(var(p), rbx)                   // load address of p.
+
+		lea(mem(   , r10, 4), r14)         // r14 = 4*lda
+
+		mov(var(one), rdx)                 // load address of 1.0 constant
+		vbroadcastsd(mem(rdx, 0), ymm1)    // load 1.0 and duplicate
+		vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to 0.0.
+
+		mov(var(kappa), rcx)               // load address of kappa
+		vbroadcastsd(mem(rcx, 0), ymm10)   // load kappa_r and duplicate
+		vbroadcastsd(mem(rcx, 8), ymm11)   // load kappa_i and duplicate
+
+
+										   // now branch on kappa == 1.0
+
+		vucomisd(xmm1, xmm10)              // set ZF if kappa_r == 1.0.
+		sete(r12b)                         // r12b = ( ZF == 1 ? 1 : 0 );
+		vucomisd(xmm0, xmm11)              // set ZF if kappa_i == 0.0.
+		sete(r13b)                         // r13b = ( ZF == 1 ? 1 : 0 );
+		and(r12b, r13b)                    // ZF = 0 iff r12b & r13b == 1.
+		jne(.ZKAPPAUNIT)                   // if ZF = 0, jump to kappa == 1.0 case
+
+
+
+		label(.ZKAPPANONU)
+
+		cmp(imm(16), r8)                   // set ZF if (16*inca) == 16.
+		jz(.ZCOLNONU)                      // jump to column storage case
+
+		// -- kappa non-unit, row storage on A -------------------------------------
+
+		label(.ZROWNONU)
+
+		jmp(.ZDONE)                        // jump to end.
+
+
+		// -- kappa non-unit, column storage on A ----------------------------------
+
+		label(.ZCOLNONU)
+
+		jmp(.ZDONE)                        // jump to end.
+
+
+
+
+		label(.ZKAPPAUNIT)
+
+		cmp(imm(16), r8)                   // set ZF if (16*inca) == 16.
+		jz(.ZCOLUNIT)                      // jump to column storage case
+
+
+		// -- kappa unit, row storage on A -----------------------------------------
+
+		label(.ZROWUNIT)
+
+		//lea(mem(r8,  r8,  2), r12)         // r12 = 3*inca
+		//lea(mem(r12, r8,  2), rcx)         // rcx = 5*inca
+		//lea(mem(r12, r8,  4), rdx)         // rdx = 7*inca
+
+		mov(var(k_iter), rsi)              // i = k_iter;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.ZCONKLEFTROWU)                 // if i == 0, jump to code that
+		                                   // contains the k_left loop.
+
+
+		label(.ZKITERROWU)                 // MAIN LOOP (k_iter)
+
+		vmovupd(mem(rax,         0), ymm8)
+		vmovupd(mem(rax,  r8, 1, 0), ymm10)
+		vmovupd(mem(rax,  r8, 2, 0), ymm12)
+
+		vextractf128(imm(0x1), ymm8,  xmm9)
+		vextractf128(imm(0x1), ymm10, xmm11)
+		vextractf128(imm(0x1), ymm12, xmm13)
+
+		vmovupd(xmm8,  mem(rbx, 0*16+0*48))
+		vmovupd(xmm10, mem(rbx, 1*16+0*48))
+		vmovupd(xmm12, mem(rbx, 2*16+0*48))
+
+		vmovupd(xmm9,  mem(rbx, 0*16+1*48))
+		vmovupd(xmm11, mem(rbx, 1*16+1*48))
+		vmovupd(xmm13, mem(rbx, 2*16+1*48))
+
+		vmovupd(mem(rax,         32), ymm8)
+		vmovupd(mem(rax,  r8, 1, 32), ymm10)
+		vmovupd(mem(rax,  r8, 2, 32), ymm12)
+
+		add(r14, rax)                      // a += 4*lda;
+
+		vextractf128(imm(0x1), ymm8,  xmm9)
+		vextractf128(imm(0x1), ymm10, xmm11)
+		vextractf128(imm(0x1), ymm12, xmm13)
+
+		vmovupd(xmm8,  mem(rbx, 0*16+2*48))
+		vmovupd(xmm10, mem(rbx, 1*16+2*48))
+		vmovupd(xmm12, mem(rbx, 2*16+2*48))
+
+		vmovupd(xmm9,  mem(rbx, 0*16+3*48))
+		vmovupd(xmm11, mem(rbx, 1*16+3*48))
+		vmovupd(xmm13, mem(rbx, 2*16+3*48))
+
+		add(imm(4*3*16), rbx)              // p += 4*ldp = 4*3;
+
+		dec(rsi)                           // i -= 1;
+		jne(.ZKITERROWU)                   // iterate again if i != 0.
+
+
+
+		label(.ZCONKLEFTROWU)
+
+		mov(var(k_left), rsi)              // i = k_left;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.ZDONE)                         // if i == 0, we're done; jump to end.
+		                                   // else, we prepare to enter k_left loop.
+
+
+		label(.ZKLEFTROWU)                 // EDGE LOOP (k_left)
+
+		vmovups(mem(rax,         0), xmm0)
+		vmovups(mem(rax,  r8, 1, 0), xmm2)
+		vmovups(mem(rax,  r8, 2, 0), xmm4)
+
+		add(r10, rax)                      // a += lda;
+
+		vmovups(xmm0, mem(rbx, 0*16+0*48))
+		vmovups(xmm2, mem(rbx, 1*16+0*48))
+		vmovups(xmm4, mem(rbx, 2*16+0*48))
+
+		add(imm(3*16), rbx)                // p += ldp = 3;
+
+		dec(rsi)                           // i -= 1;
+		jne(.ZKLEFTROWU)                   // iterate again if i != 0.
+
+
+		jmp(.ZDONE)                        // jump to end.
+
+
+		// -- kappa unit, column storage on A --------------------------------------
+
+		label(.ZCOLUNIT)
+
+		lea(mem(r10, r10, 2), r13)         // r13 = 3*lda
+
+		mov(var(k_iter), rsi)              // i = k_iter;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.ZCONKLEFTCOLU)                 // if i == 0, jump to code that
+		                                   // contains the k_left loop.
+
+
+		label(.ZKITERCOLU)                 // MAIN LOOP (k_iter)
+
+		vmovupd(mem(rax,          0), ymm0)
+		vmovupd(mem(rax,         32), xmm1)
+		vmovupd(ymm0, mem(rbx, 0*48+ 0))
+		vmovupd(xmm1, mem(rbx, 0*48+32))
+
+		vmovupd(mem(rax, r10, 1,  0), ymm2)
+		vmovupd(mem(rax, r10, 1, 32), xmm3)
+		vmovupd(ymm2, mem(rbx, 1*48+ 0))
+		vmovupd(xmm3, mem(rbx, 1*48+32))
+
+		vmovupd(mem(rax, r10, 2,  0), ymm4)
+		vmovupd(mem(rax, r10, 2, 32), xmm5)
+		vmovupd(ymm4, mem(rbx, 2*48+ 0))
+		vmovupd(xmm5, mem(rbx, 2*48+32))
+
+		vmovupd(mem(rax, r13, 1,  0), ymm6)
+		vmovupd(mem(rax, r13, 1, 32), xmm7)
+		add(r14, rax)                      // a += 4*lda;
+		vmovupd(ymm6, mem(rbx, 3*48+ 0))
+		vmovupd(xmm7, mem(rbx, 3*48+32))
+		add(imm(4*3*16), rbx)               // p += 4*ldp = 4*3;
+
+		dec(rsi)                           // i -= 1;
+		jne(.ZKITERCOLU)                   // iterate again if i != 0.
+
+
+
+		label(.ZCONKLEFTCOLU)
+
+		mov(var(k_left), rsi)              // i = k_left;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.ZDONE)                         // if i == 0, we're done; jump to end.
+		                                   // else, we prepare to enter k_left loop.
+
+
+		label(.ZKLEFTCOLU)                 // EDGE LOOP (k_left)
+
+		vmovupd(mem(rax,          0), ymm0)
+		vmovupd(mem(rax,         32), xmm1)
+		add(r10, rax)                      // a += lda;
+		vmovupd(ymm0, mem(rbx, 0*48+ 0))
+		vmovupd(xmm1, mem(rbx, 0*48+32))
+		add(imm(3*16), rbx)                // p += ldp = 3;
+
+		dec(rsi)                           // i -= 1;
+		jne(.ZKLEFTCOLU)                   // iterate again if i != 0.
+
+
+		//jmp(.ZDONE)                        // jump to end.
+
+
+
+		label(.ZDONE)
+
+
+
+		end_asm(
+		: // output operands (none)
+		: // input operands
+		  [k_iter] "m" (k_iter),
+		  [k_left] "m" (k_left),
+		  [a]      "m" (a),
+		  [inca]   "m" (inca),
+		  [lda]    "m" (lda),
+		  [p]      "m" (p),
+		  [ldp]    "m" (ldp),
+		  [kappa]  "m" (kappa),
+		  [one]    "m" (one)
+		: // register clobber list
+		  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+		  "r8", /*"r9",*/ "r10", /*"r11",*/ "r12", "r13", "r14", "r15",
+		  "xmm0", "xmm1", "xmm2", "xmm3",
+		  "xmm4", "xmm5", "xmm6", "xmm7",
+		  "xmm8", "xmm9", "xmm10", "xmm11",
+		  "xmm12", "xmm13", "xmm14", "xmm15",
+		  "memory"
+		)
+	}
+	else if ( cdim0 == nr && ldp0 == nr && cdim_bcast == 1 && !gs && !conja && unitk )
 	{
 		begin_asm()
 
@@ -356,56 +590,24 @@ void bli_zpackm_haswell_asm_4xk
 		  "memory"
 		)
 	}
-	else // if ( cdim0 < mnr || gs || bli_does_conj( conja ) || !unitk )
+	else
 	{
-		PASTEMAC(zscal2m,BLIS_TAPI_EX_SUF)
+		bli_zscal2bbs_mxn
 		(
-		  0,
-		  BLIS_NONUNIT_DIAG,
-		  BLIS_DENSE,
-		  ( trans_t )conja,
+		  conja,
 		  cdim0,
 		  k0,
 		  kappa,
-		  a, inca0, lda0,
-		  p,     1, ldp0,
-		  cntx,
-		  NULL
+		  a,       inca, lda,
+		  p, cdim_bcast, ldp
 		);
-
-		if ( cdim0 < mnr )
-		{
-			// Handle zero-filling along the "long" edge of the micropanel.
-
-			const dim_t        i      = cdim0;
-			const dim_t        m_edge = mnr - cdim0;
-			const dim_t        n_edge = k0_max;
-			dcomplex* restrict p_edge = ( dcomplex* )p + (i  )*1;
-
-			bli_zset0s_mxn
-			(
-			  m_edge,
-			  n_edge,
-			  p_edge, 1, ldp
-			);
-		}
 	}
 
-	if ( k0 < k0_max )
-	{
-		// Handle zero-filling along the "short" (far) edge of the micropanel.
-
-		const dim_t        j      = k0;
-		const dim_t        m_edge = mnr;
-		const dim_t        n_edge = k0_max - k0;
-		dcomplex* restrict p_edge = ( dcomplex* )p + (j  )*ldp;
-
-		bli_zset0s_mxn
-		(
-		  m_edge,
-		  n_edge,
-		  p_edge, 1, ldp
-		);
-	}
+	bli_zset0s_edge
+	(
+	  cdim0*cdim_bcast, cdim_max*cdim_bcast,
+	  k0, k0_max,
+	  p, ldp
+	);
 }
 
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c
deleted file mode 100644
index eb9417f6c..000000000
--- a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c
+++ /dev/null
@@ -1,401 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-// Prototype reference packm kernels.
-PACKM_KER_PROT( dcomplex, z, packm_3xk_haswell_ref )
-
-void bli_zpackm_haswell_asm_3xk
-     (
-             conj_t    conja,
-             pack_t    schema,
-             dim_t     cdim0,
-             dim_t     k0,
-             dim_t     k0_max,
-       const void*     kappa,
-       const void*     a, inc_t inca0, inc_t lda0,
-             void*     p,              inc_t ldp0,
-       const cntx_t*   cntx
-     )
-{
-#if 0
-	bli_zpackm_3xk_haswell_ref
-	(
-	  conja, schema, cdim0, k0, k0_max,
-	  kappa, a, inca0, lda0, p, ldp0, cntx
-	);
-	return;
-#endif
-
-	// This is the panel dimension assumed by the packm kernel.
-	const dim_t      mnr   = 3;
-
-	// This is the "packing" dimension assumed by the packm kernel.
-	// This should be equal to ldp.
-	//const dim_t    packmnr = 8;
-
-	// Define a local copy of 1.0 so we can test for unit kappa.
-	double           one_l = 1.0;
-	double* restrict one   = &one_l;
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	const uint64_t k_iter = k0 / 4;
-#if 1
-	const uint64_t k_left = k0 % 4;
-#else
-	const uint64_t k_left = k0;
-#endif
-
-	// NOTE: For the purposes of the comments in this packm kernel, we
-	// interpret inca and lda as rs_a and cs_a, respectively, and similarly
-	// interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
-	// this packm kernel, you should think of the operation as packing an
-	// m x n micropanel, where m and n are tiny and large, respectively, and
-	// where elements of each column of the packed matrix P are contiguous.
-	// (This packm kernel can still be used to pack micropanels of matrix B
-	// in a gemm operation.)
-	const uint64_t inca   = inca0;
-	const uint64_t lda    = lda0;
-	const uint64_t ldp    = ldp0;
-
-	const bool     gs     = ( inca0 != 1 && lda0 != 1 );
-
-	// NOTE: If/when this kernel ever supports scaling by kappa within the
-	// assembly region, this constraint should be lifted.
-	const bool     unitk  = bli_zeq1( *(( dcomplex* )kappa) );
-
-
-	// -------------------------------------------------------------------------
-
-	if ( cdim0 == mnr && !gs && !conja && unitk )
-	{
-		begin_asm()
-
-		mov(var(a), rax)                   // load address of a.
-
-		mov(var(inca), r8)                 // load inca
-		mov(var(lda), r10)                 // load lda
-		lea(mem(   , r8,  2), r8)
-		lea(mem(   , r8,  8), r8)          // inca *= sizeof(dcomplex)
-		lea(mem(   , r10, 2), r10)
-		lea(mem(   , r10, 8), r10)         // lda *= sizeof(dcomplex)
-
-		mov(var(p), rbx)                   // load address of p.
-
-		lea(mem(   , r10, 4), r14)         // r14 = 4*lda
-
-		mov(var(one), rdx)                 // load address of 1.0 constant
-		vbroadcastsd(mem(rdx, 0), ymm1)    // load 1.0 and duplicate
-		vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to 0.0.
-
-		mov(var(kappa), rcx)               // load address of kappa
-		vbroadcastsd(mem(rcx, 0), ymm10)   // load kappa_r and duplicate
-		vbroadcastsd(mem(rcx, 8), ymm11)   // load kappa_i and duplicate
-
-
-										   // now branch on kappa == 1.0
-
-		vucomisd(xmm1, xmm10)              // set ZF if kappa_r == 1.0.
-		sete(r12b)                         // r12b = ( ZF == 1 ? 1 : 0 );
-		vucomisd(xmm0, xmm11)              // set ZF if kappa_i == 0.0.
-		sete(r13b)                         // r13b = ( ZF == 1 ? 1 : 0 );
-		and(r12b, r13b)                    // set ZF if r12b & r13b == 1.
-		jne(.ZKAPPAUNIT)                   // if ZF = 1, jump to beta == 0 case
-
-
-
-		label(.ZKAPPANONU)
-
-		cmp(imm(16), r8)                   // set ZF if (16*inca) == 16.
-		jz(.ZCOLNONU)                      // jump to column storage case
-
-		// -- kappa non-unit, row storage on A -------------------------------------
-
-		label(.ZROWNONU)
-
-		jmp(.ZDONE)                        // jump to end.
-
-
-		// -- kappa non-unit, column storage on A ----------------------------------
-
-		label(.ZCOLNONU)
-
-		jmp(.ZDONE)                        // jump to end.
-
-
-
-
-		label(.ZKAPPAUNIT)
-
-		cmp(imm(16), r8)                   // set ZF if (16*inca) == 16.
-		jz(.ZCOLUNIT)                      // jump to column storage case
-
-
-		// -- kappa unit, row storage on A -----------------------------------------
-
-		label(.ZROWUNIT)
-
-		//lea(mem(r8,  r8,  2), r12)         // r12 = 3*inca
-		//lea(mem(r12, r8,  2), rcx)         // rcx = 5*inca
-		//lea(mem(r12, r8,  4), rdx)         // rdx = 7*inca
-
-		mov(var(k_iter), rsi)              // i = k_iter;
-		test(rsi, rsi)                     // check i via logical AND.
-		je(.ZCONKLEFTROWU)                 // if i == 0, jump to code that
-		                                   // contains the k_left loop.
-
-
-		label(.ZKITERROWU)                 // MAIN LOOP (k_iter)
-
-		vmovupd(mem(rax,         0), ymm8)
-		vmovupd(mem(rax,  r8, 1, 0), ymm10)
-		vmovupd(mem(rax,  r8, 2, 0), ymm12)
-
-		vextractf128(imm(0x1), ymm8,  xmm9)
-		vextractf128(imm(0x1), ymm10, xmm11)
-		vextractf128(imm(0x1), ymm12, xmm13)
-
-		vmovupd(xmm8,  mem(rbx, 0*16+0*48))
-		vmovupd(xmm10, mem(rbx, 1*16+0*48))
-		vmovupd(xmm12, mem(rbx, 2*16+0*48))
-
-		vmovupd(xmm9,  mem(rbx, 0*16+1*48))
-		vmovupd(xmm11, mem(rbx, 1*16+1*48))
-		vmovupd(xmm13, mem(rbx, 2*16+1*48))
-
-		vmovupd(mem(rax,         32), ymm8)
-		vmovupd(mem(rax,  r8, 1, 32), ymm10)
-		vmovupd(mem(rax,  r8, 2, 32), ymm12)
-
-		add(r14, rax)                      // a += 4*lda;
-
-		vextractf128(imm(0x1), ymm8,  xmm9)
-		vextractf128(imm(0x1), ymm10, xmm11)
-		vextractf128(imm(0x1), ymm12, xmm13)
-
-		vmovupd(xmm8,  mem(rbx, 0*16+2*48))
-		vmovupd(xmm10, mem(rbx, 1*16+2*48))
-		vmovupd(xmm12, mem(rbx, 2*16+2*48))
-
-		vmovupd(xmm9,  mem(rbx, 0*16+3*48))
-		vmovupd(xmm11, mem(rbx, 1*16+3*48))
-		vmovupd(xmm13, mem(rbx, 2*16+3*48))
-
-		add(imm(4*3*16), rbx)              // p += 4*ldp = 4*3;
-
-		dec(rsi)                           // i -= 1;
-		jne(.ZKITERROWU)                   // iterate again if i != 0.
-
-
-
-		label(.ZCONKLEFTROWU)
-
-		mov(var(k_left), rsi)              // i = k_left;
-		test(rsi, rsi)                     // check i via logical AND.
-		je(.ZDONE)                         // if i == 0, we're done; jump to end.
-		                                   // else, we prepare to enter k_left loop.
-
-
-		label(.ZKLEFTROWU)                 // EDGE LOOP (k_left)
-
-		vmovups(mem(rax,         0), xmm0)
-		vmovups(mem(rax,  r8, 1, 0), xmm2)
-		vmovups(mem(rax,  r8, 2, 0), xmm4)
-
-		add(r10, rax)                      // a += lda;
-
-		vmovups(xmm0, mem(rbx, 0*16+0*48))
-		vmovups(xmm2, mem(rbx, 1*16+0*48))
-		vmovups(xmm4, mem(rbx, 2*16+0*48))
-
-		add(imm(3*16), rbx)                // p += ldp = 4;
-
-		dec(rsi)                           // i -= 1;
-		jne(.ZKLEFTROWU)                   // iterate again if i != 0.
-
-
-		jmp(.ZDONE)                        // jump to end.
-
-
-		// -- kappa unit, column storage on A --------------------------------------
-
-		label(.ZCOLUNIT)
-
-		lea(mem(r10, r10, 2), r13)         // r13 = 3*lda
-
-		mov(var(k_iter), rsi)              // i = k_iter;
-		test(rsi, rsi)                     // check i via logical AND.
-		je(.ZCONKLEFTCOLU)                 // if i == 0, jump to code that
-		                                   // contains the k_left loop.
-
-
-		label(.ZKITERCOLU)                 // MAIN LOOP (k_iter)
-
-		vmovupd(mem(rax,          0), ymm0)
-		vmovupd(mem(rax,         32), xmm1)
-		vmovupd(ymm0, mem(rbx, 0*48+ 0))
-		vmovupd(xmm1, mem(rbx, 0*48+32))
-
-		vmovupd(mem(rax, r10, 1,  0), ymm2)
-		vmovupd(mem(rax, r10, 1, 32), xmm3)
-		vmovupd(ymm2, mem(rbx, 1*48+ 0))
-		vmovupd(xmm3, mem(rbx, 1*48+32))
-
-		vmovupd(mem(rax, r10, 2,  0), ymm4)
-		vmovupd(mem(rax, r10, 2, 32), xmm5)
-		vmovupd(ymm4, mem(rbx, 2*48+ 0))
-		vmovupd(xmm5, mem(rbx, 2*48+32))
-
-		vmovupd(mem(rax, r13, 1,  0), ymm6)
-		vmovupd(mem(rax, r13, 1, 32), xmm7)
-		add(r14, rax)                      // a += 4*lda;
-		vmovupd(ymm6, mem(rbx, 3*48+ 0))
-		vmovupd(xmm7, mem(rbx, 3*48+32))
-		add(imm(4*3*16), rbx)               // p += 4*ldp = 4*3;
-
-		dec(rsi)                           // i -= 1;
-		jne(.ZKITERCOLU)                   // iterate again if i != 0.
-
-
-
-		label(.ZCONKLEFTCOLU)
-
-		mov(var(k_left), rsi)              // i = k_left;
-		test(rsi, rsi)                     // check i via logical AND.
-		je(.ZDONE)                         // if i == 0, we're done; jump to end.
-		                                   // else, we prepare to enter k_left loop.
-
-
-		label(.ZKLEFTCOLU)                 // EDGE LOOP (k_left)
-
-		vmovupd(mem(rax,          0), ymm0)
-		vmovupd(mem(rax,         32), xmm1)
-		add(r10, rax)                      // a += lda;
-		vmovupd(ymm0, mem(rbx, 0*48+ 0))
-		vmovupd(xmm1, mem(rbx, 0*48+32))
-		add(imm(3*16), rbx)                // p += ldp = 3;
-
-		dec(rsi)                           // i -= 1;
-		jne(.ZKLEFTCOLU)                   // iterate again if i != 0.
-
-
-		//jmp(.ZDONE)                        // jump to end.
-
-
-
-		label(.ZDONE)
-
-
-
-		end_asm(
-		: // output operands (none)
-		: // input operands
-		  [k_iter] "m" (k_iter),
-		  [k_left] "m" (k_left),
-		  [a]      "m" (a),
-		  [inca]   "m" (inca),
-		  [lda]    "m" (lda),
-		  [p]      "m" (p),
-		  [ldp]    "m" (ldp),
-		  [kappa]  "m" (kappa),
-		  [one]    "m" (one)
-		: // register clobber list
-		  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-		  "r8", /*"r9",*/ "r10", /*"r11",*/ "r12", "r13", "r14", "r15",
-		  "xmm0", "xmm1", "xmm2", "xmm3",
-		  "xmm4", "xmm5", "xmm6", "xmm7",
-		  "xmm8", "xmm9", "xmm10", "xmm11",
-		  "xmm12", "xmm13", "xmm14", "xmm15",
-		  "memory"
-		)
-	}
-	else // if ( cdim0 < mnr || gs || bli_does_conj( conja ) || !unitk )
-	{
-		PASTEMAC(zscal2m,BLIS_TAPI_EX_SUF)
-		(
-		  0,
-		  BLIS_NONUNIT_DIAG,
-		  BLIS_DENSE,
-		  ( trans_t )conja,
-		  cdim0,
-		  k0,
-		  kappa,
-		  a, inca0, lda0,
-		  p,     1, ldp0,
-		  cntx,
-		  NULL
-		);
-
-		if ( cdim0 < mnr )
-		{
-			// Handle zero-filling along the "long" edge of the micropanel.
-
-			const dim_t        i      = cdim0;
-			const dim_t        m_edge = mnr - cdim0;
-			const dim_t        n_edge = k0_max;
-			dcomplex* restrict p_edge = ( dcomplex* )p + (i  )*1;
-
-			bli_zset0s_mxn
-			(
-			  m_edge,
-			  n_edge,
-			  p_edge, 1, ldp
-			);
-		}
-	}
-
-	if ( k0 < k0_max )
-	{
-		// Handle zero-filling along the "short" (far) edge of the micropanel.
-
-		const dim_t        j      = k0;
-		const dim_t        m_edge = mnr;
-		const dim_t        n_edge = k0_max - k0;
-		dcomplex* restrict p_edge = ( dcomplex* )p + (j  )*ldp;
-
-		bli_zset0s_mxn
-		(
-		  m_edge,
-		  n_edge,
-		  p_edge, 1, ldp
-		);
-	}
-}
-
diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
index 322e9a2e7..2d47cf944 100644
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
@@ -87,7 +87,7 @@ void bli_sgemm_haswell_asm_6x16
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -767,7 +767,7 @@ void bli_dgemm_haswell_asm_6x8
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1326,7 +1326,7 @@ void bli_cgemm_haswell_asm_3x8
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1719,7 +1719,7 @@ void bli_zgemm_haswell_asm_3x4
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
index 6d3d125fe..723dd532d 100644
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
@@ -86,7 +86,7 @@ void bli_sgemm_haswell_asm_16x6
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -470,7 +470,7 @@ void bli_dgemm_haswell_asm_8x6
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -840,7 +840,7 @@ void bli_cgemm_haswell_asm_8x3
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1231,7 +1231,7 @@ void bli_zgemm_haswell_asm_4x3
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
index 88e152fa0..7fdf30bee 100644
--- a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
@@ -67,7 +67,7 @@ void bli_sgemmtrsm_l_haswell_asm_6x16
        const void*      b01, \
              void*      b11, \
              void*      c11, inc_t rs_c0, inc_t cs_c0, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      )
 {
@@ -858,7 +858,7 @@ void bli_dgemmtrsm_l_haswell_asm_6x8
        const void*      b01, \
              void*      b11, \
              void*      c11, inc_t rs_c0, inc_t cs_c0, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      )
 {
diff --git a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
index 1518a56eb..c4f579f00 100644
--- a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
@@ -67,7 +67,7 @@ void bli_sgemmtrsm_u_haswell_asm_6x16
        const void*      b21, \
              void*      b11, \
              void*      c11, inc_t rs_c0, inc_t cs_c0, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      )
 {
@@ -863,7 +863,7 @@ void bli_dgemmtrsm_u_haswell_asm_6x8
        const void*      b21, \
              void*      b11, \
              void*      c11, inc_t rs_c0, inc_t cs_c0, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      )
 {
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
index 055f99489..a14211804 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
@@ -78,7 +78,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -735,7 +735,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1307,7 +1307,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
index 2f45aec08..14e2f6c59 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
@@ -78,7 +78,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -804,7 +804,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1391,7 +1391,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1921,7 +1921,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c
index 0bb9563f1..f63c293fd 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c
@@ -78,7 +78,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -770,7 +770,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1383,7 +1383,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1994,7 +1994,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2579,7 +2579,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
index 41ab6ed63..aa9361e5f 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
@@ -78,7 +78,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -816,7 +816,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1413,7 +1413,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1949,7 +1949,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
index c5927c3da..918ca6d46 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
@@ -93,7 +93,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -995,7 +995,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1774,7 +1774,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2431,7 +2431,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
index cb784d6a1..3f4906878 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
@@ -93,7 +93,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -959,7 +959,7 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1707,7 +1707,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2355,7 +2355,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -3033,7 +3033,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -3592,7 +3592,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
index de2b71c4b..2b4ee7a9b 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
@@ -93,7 +93,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1129,7 +1129,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1996,7 +1996,7 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2709,7 +2709,7 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -3455,7 +3455,7 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -4124,7 +4124,7 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c
index 456866b2f..cc47db8bc 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c
@@ -93,7 +93,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1094,7 +1094,7 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2026,7 +2026,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2784,7 +2784,7 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -3603,7 +3603,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -4241,7 +4241,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c
index dc4ae1c8e..ca15842c4 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c
@@ -104,7 +104,7 @@ void PASTEMAC(ch,opname) \
        const void*      b0, inc_t rs_b, inc_t cs_b, \
        const void*      beta0, \
              void*      c0, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx \
      ) \
 { \
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
index d9953583c..8ca351d94 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
@@ -72,7 +72,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -586,7 +586,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -995,7 +995,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1369,7 +1369,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c
index 6110d7cdb..a83193937 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c
@@ -72,7 +72,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -628,7 +628,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1061,7 +1061,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1453,7 +1453,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
index f8a9f5f25..50830a7ca 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
@@ -72,7 +72,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -605,7 +605,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1055,7 +1055,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c
index c5c6c2854..4546b7835 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c
@@ -72,7 +72,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -735,7 +735,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1209,7 +1209,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c
index 068ee71ed..6e83f0ee9 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c
@@ -93,7 +93,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -566,7 +566,7 @@ void bli_dgemmsup_rv_haswell_asm_5x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1016,7 +1016,7 @@ void bli_dgemmsup_rv_haswell_asm_4x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1424,7 +1424,7 @@ void bli_dgemmsup_rv_haswell_asm_3x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1829,7 +1829,7 @@ void bli_dgemmsup_rv_haswell_asm_2x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2179,7 +2179,7 @@ void bli_dgemmsup_rv_haswell_asm_1x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c
index a48eb39eb..7ee0b0ab7 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c
@@ -93,7 +93,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -588,7 +588,7 @@ void bli_dgemmsup_rv_haswell_asm_5x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1059,7 +1059,7 @@ void bli_dgemmsup_rv_haswell_asm_4x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1479,7 +1479,7 @@ void bli_dgemmsup_rv_haswell_asm_3x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1906,7 +1906,7 @@ void bli_dgemmsup_rv_haswell_asm_2x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2269,7 +2269,7 @@ void bli_dgemmsup_rv_haswell_asm_1x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c
index 9c229a962..d1fb203f4 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c
@@ -93,7 +93,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -704,7 +704,7 @@ void bli_dgemmsup_rv_haswell_asm_5x6
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1279,7 +1279,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1777,7 +1777,7 @@ void bli_dgemmsup_rv_haswell_asm_3x6
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2292,7 +2292,7 @@ void bli_dgemmsup_rv_haswell_asm_2x6
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2723,7 +2723,7 @@ void bli_dgemmsup_rv_haswell_asm_1x6
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c
index 0ada5dc31..a551ab284 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c
@@ -108,7 +108,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -820,7 +820,7 @@ void bli_dgemmsup_rv_haswell_asm_5x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1415,7 +1415,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1923,7 +1923,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2457,7 +2457,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2880,7 +2880,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c
index a8facadbb..9966283df 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c
@@ -103,7 +103,7 @@ void PASTEMAC(ch,opname) \
        const void*      b0, inc_t rs_b, inc_t cs_b, \
        const void*      beta0, \
              void*      c0, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx \
      ) \
 { \
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c
index ce385b9e1..e154d0ebe 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c
@@ -72,7 +72,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -598,7 +598,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1013,7 +1013,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1391,7 +1391,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c
index 63049d2d5..ba8f1feb3 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c
@@ -72,7 +72,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -644,7 +644,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1121,7 +1121,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c
index 6fe365afb..8506f78ad 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c
@@ -72,7 +72,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -758,7 +758,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1238,7 +1238,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c
index a5ce50cdc..7ba991607 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c
@@ -72,7 +72,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -646,7 +646,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1088,7 +1088,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1485,7 +1485,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c
index f69d81596..31114237d 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c
@@ -72,7 +72,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -616,7 +616,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1073,7 +1073,7 @@ void bli_sgemmsup_rd_haswell_asm_1x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c
index e325c777a..3c6c85374 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c
@@ -72,7 +72,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -644,7 +644,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1121,7 +1121,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c
index 9809e0012..629be629d 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c
@@ -93,7 +93,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -791,7 +791,7 @@ void bli_sgemmsup_rv_haswell_asm_5x12
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1490,7 +1490,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2047,7 +2047,7 @@ void bli_sgemmsup_rv_haswell_asm_3x12
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2635,7 +2635,7 @@ void bli_sgemmsup_rv_haswell_asm_2x12
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -3081,7 +3081,7 @@ void bli_sgemmsup_rv_haswell_asm_1x12
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c
index 7afe0d7a2..bb4ca2350 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c
@@ -108,7 +108,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -932,7 +932,7 @@ void bli_sgemmsup_rv_haswell_asm_5x16
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1688,7 +1688,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2273,7 +2273,7 @@ void bli_sgemmsup_rv_haswell_asm_3x16
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2914,7 +2914,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -3383,7 +3383,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c
index 5f37fac5a..02390a0f5 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c
@@ -93,7 +93,7 @@ void bli_sgemmsup_rv_haswell_asm_6x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -570,7 +570,7 @@ void bli_sgemmsup_rv_haswell_asm_5x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1032,7 +1032,7 @@ void bli_sgemmsup_rv_haswell_asm_4x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1443,7 +1443,7 @@ void bli_sgemmsup_rv_haswell_asm_3x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1846,7 +1846,7 @@ void bli_sgemmsup_rv_haswell_asm_2x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2198,7 +2198,7 @@ void bli_sgemmsup_rv_haswell_asm_1x2
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c
index 56e744d52..63dcaf094 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c
@@ -93,7 +93,7 @@ void bli_sgemmsup_rv_haswell_asm_6x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -600,7 +600,7 @@ void bli_sgemmsup_rv_haswell_asm_5x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1097,7 +1097,7 @@ void bli_sgemmsup_rv_haswell_asm_4x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1530,7 +1530,7 @@ void bli_sgemmsup_rv_haswell_asm_3x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1959,7 +1959,7 @@ void bli_sgemmsup_rv_haswell_asm_2x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2324,7 +2324,7 @@ void bli_sgemmsup_rv_haswell_asm_1x4
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c
index bc3d07561..3dd35531e 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c
@@ -93,7 +93,7 @@ void bli_sgemmsup_rv_haswell_asm_6x6
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -678,7 +678,7 @@ void bli_sgemmsup_rv_haswell_asm_5x6
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1252,7 +1252,7 @@ void bli_sgemmsup_rv_haswell_asm_4x6
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1739,7 +1739,7 @@ void bli_sgemmsup_rv_haswell_asm_3x6
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2227,7 +2227,7 @@ void bli_sgemmsup_rv_haswell_asm_2x6
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2627,7 +2627,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -3018,7 +3018,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c
index 972150db5..d34567c67 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c
@@ -93,7 +93,7 @@ void bli_sgemmsup_rv_haswell_asm_6x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -644,7 +644,7 @@ void bli_sgemmsup_rv_haswell_asm_5x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1195,7 +1195,7 @@ void bli_sgemmsup_rv_haswell_asm_4x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1652,7 +1652,7 @@ void bli_sgemmsup_rv_haswell_asm_3x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2132,7 +2132,7 @@ void bli_sgemmsup_rv_haswell_asm_2x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -2518,7 +2518,7 @@ void bli_sgemmsup_rv_haswell_asm_1x8
        const void*      b, inc_t rs_b0, inc_t cs_b0,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/haswell/bli_kernels_haswell.h b/kernels/haswell/bli_kernels_haswell.h
index 1c35122a4..c4fd4cb5e 100644
--- a/kernels/haswell/bli_kernels_haswell.h
+++ b/kernels/haswell/bli_kernels_haswell.h
@@ -36,17 +36,10 @@
 // -- level-1m -----------------------------------------------------------------
 
 // packm (asm)
-PACKM_KER_PROT( float,    s, packm_haswell_asm_6xk )
-PACKM_KER_PROT( float,    s, packm_haswell_asm_16xk )
-
-PACKM_KER_PROT( double,   d, packm_haswell_asm_6xk )
-PACKM_KER_PROT( double,   d, packm_haswell_asm_8xk )
-
-PACKM_KER_PROT( scomplex, c, packm_haswell_asm_3xk )
-PACKM_KER_PROT( scomplex, c, packm_haswell_asm_8xk )
-
-PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_3xk )
-PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_4xk )
+PACKM_KER_PROT( float,    s, packm_haswell_asm_6x16 )
+PACKM_KER_PROT( double,   d, packm_haswell_asm_6x8 )
+PACKM_KER_PROT( scomplex, c, packm_haswell_asm_3x8 )
+PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_3x4 )
 
 
 // -- level-3 ------------------------------------------------------------------
diff --git a/kernels/knc/3/bli_dgemm_knc_asm_30x8.c b/kernels/knc/3/bli_dgemm_knc_asm_30x8.c
index 989708ee4..54988a2c2 100644
--- a/kernels/knc/3/bli_dgemm_knc_asm_30x8.c
+++ b/kernels/knc/3/bli_dgemm_knc_asm_30x8.c
@@ -264,7 +264,7 @@ void bli_dgemm_knc_asm_30x8
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/knc/3/bli_sgemm_knc_asm_30x16.c b/kernels/knc/3/bli_sgemm_knc_asm_30x16.c
index c2a359f11..719cd1d23 100644
--- a/kernels/knc/3/bli_sgemm_knc_asm_30x16.c
+++ b/kernels/knc/3/bli_sgemm_knc_asm_30x16.c
@@ -264,8 +264,8 @@ void bli_sgemm_knc_asm_30x16
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
-             auxinfo_t* data,
+       const auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c
index ba472fac1..b081ab3d0 100644
--- a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c
+++ b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c
@@ -106,16 +106,19 @@ static int32_t offsets[32] __attribute__((aligned(64))) =
     { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
      16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
 
-void bli_dpackm_knl_asm_8xk
+void bli_dpackm_knl_asm_24x8
      (
              conj_t  conja,
              pack_t  schema,
              dim_t   cdim_,
+             dim_t   cdim_max,
+             dim_t   cdim_bcast,
              dim_t   n_,
              dim_t   n_max_,
        const void*   kappa_,
        const void*   a_, inc_t inca_, inc_t lda_,
              void*   p_,              inc_t ldp_,
+       const void*   params,
        const cntx_t* cntx
      )
 {
@@ -125,265 +128,16 @@ void bli_dpackm_knl_asm_8xk
     double*       p     = ( double* )p_;
     double*       kappa = ( double* )kappa_;
     const int64_t cdim  = cdim_;
-    const int64_t mnr   = 8;
+    const int64_t mr    = 24;
+    const int64_t nr    = 8;
     const int64_t n     = n_;
     const int64_t n_max = n_max_;
     const int64_t inca  = inca_;
     const int64_t lda   = lda_;
     const int64_t ldp   = ldp_;
 
-    if ( cdim == mnr )
-    {
-
-    BEGIN_ASM()
-
-    MOV(RSI, VAR(n))
-    MOV(RAX, VAR(a))
-    MOV(RBX, VAR(inca))
-    MOV(RCX, VAR(lda))
-    MOV(R14, VAR(p))
-    MOV(RDI, VAR(ldp))
-
-    TEST(RSI, RSI)
-    JZ(PACK8_DONE)
-
-    LEA(RBX, MEM(,RBX,8))    //inca in bytes
-    LEA(RCX, MEM(,RCX,8))    //lda in bytes
-    LEA(RDI, MEM(,RDI,8))    //ldp in bytes
-    LEA(R11, MEM(RDI,RDI,2)) //ldp*3
-    LEA(R12, MEM(RDI,RDI,4)) //ldp*5
-    LEA(R13, MEM(R11,RDI,4)) //ldp*7
-
-    VBROADCASTSD(ZMM(31), VAR(kappa))
-
-    CMP(RBX, IMM(8))
-    JNE(PACK8_T)
-
-    LABEL(PACK8_N)
-
-        MOV(RDX, RSI)
-        AND(RDX, IMM(7))
-        SAR(RSI, IMM(3))
-        JZ(PACK8_N_TAIL)
-
-        LEA(R8,  MEM(RCX,RCX,2)) //lda*3
-        LEA(R9,  MEM(RCX,RCX,4)) //lda*5
-        LEA(R10, MEM(R8 ,RCX,4)) //lda*7
-
-        LABEL(PACK8_N_LOOP)
-
-            LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
-            STORE8x8(R14,0,RDI,R11,R12,R13,0,1,2,3,4,5,6,7)
-
-            LEA(RAX, MEM(RAX,RCX,8))
-            LEA(R14, MEM(R14,RDI,8))
-
-            SUB(RSI, IMM(1))
-
-        JNZ(PACK8_N_LOOP)
-
-        TEST(RDX, RDX)
-        JZ(PACK8_DONE)
-
-        LABEL(PACK8_N_TAIL)
-
-            VMULPD(ZMM(0), ZMM(31), MEM(RAX))
-            VMOVUPD(MEM(R14), ZMM(0))
-
-            LEA(RAX, MEM(RAX,RCX,1))
-            LEA(R14, MEM(R14,RDI,1))
-
-            SUB(RDX, IMM(1))
-
-        JNZ(PACK8_N_TAIL)
-
-        JMP(PACK8_DONE)
-
-    LABEL(PACK8_T)
-
-        CMP(RCX, IMM(8))
-        JNE(PACK8_G)
-
-        LEA(R8,  MEM(RBX,RBX,2)) //inca*3
-        LEA(R9,  MEM(RBX,RBX,4)) //inca*5
-        LEA(R10, MEM(R8 ,RBX,4)) //inca*7
-
-        MOV(RDX, RSI)
-        AND(RDX, IMM(7))
-        SAR(RSI, IMM(3))
-        JZ(PACK8_T_TAIL)
-
-        LABEL(PACK8_T_LOOP)
-
-            LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
-            TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
-                         16,17,18,19,20,21,22,23)
-            STORE8x8(R14,0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
-
-            LEA(RAX, MEM(RAX,RCX,8))
-            LEA(R14, MEM(R14,RDI,8))
-
-            SUB(RSI, IMM(1))
-
-        JNZ(PACK8_T_LOOP)
-
-        TEST(RDX, RDX)
-        JZ(PACK8_DONE)
-
-        LABEL(PACK8_T_TAIL)
 
-        MOV(RSI, IMM(1))
-        SHLX(RSI, RSI, RDX)
-        SUB(RSI, IMM(1))
-        KMOVW(K(1), ESI)  //mask for n%8 elements
-
-        LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7,1)
-        TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
-                      8, 9,10,11,12,13,14,15)
-
-        VMOVUPD(MEM(R14      ), ZMM( 8))
-        SUB(RDX, IMM(1))
-        JZ(PACK8_DONE)
-        VMOVUPD(MEM(R14,RDI,1), ZMM( 9))
-        SUB(RDX, IMM(1))
-        JZ(PACK8_DONE)
-        VMOVUPD(MEM(R14,RDI,2), ZMM(10))
-        SUB(RDX, IMM(1))
-        JZ(PACK8_DONE)
-        VMOVUPD(MEM(R14,R11,1), ZMM(11))
-        SUB(RDX, IMM(1))
-        JZ(PACK8_DONE)
-        VMOVUPD(MEM(R14,RDI,4), ZMM(12))
-        SUB(RDX, IMM(1))
-        JZ(PACK8_DONE)
-        VMOVUPD(MEM(R14,R12,1), ZMM(13))
-        SUB(RDX, IMM(1))
-        JZ(PACK8_DONE)
-        VMOVUPD(MEM(R14,R11,2), ZMM(14))
-
-        JMP(PACK8_DONE)
-
-    LABEL(PACK8_G)
-
-        VPBROADCASTD(ZMM(3), VAR(inca))
-        MOV(RBX, VAR(offsetPtr))
-        VPMULLD(YMM(0), YMM(3), MEM(RBX))
-
-        LABEL(PACK8_G_LOOP)
-
-            KXNORW(K(1), K(0), K(0))
-            VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8))
-            VMULPD(ZMM(3), ZMM(3), ZMM(31))
-            VMOVUPD(MEM(R14), ZMM(3))
-
-            LEA(RAX, MEM(RAX,RCX,1))
-            LEA(R14, MEM(R14,RDI,1))
-
-            SUB(RSI, IMM(1))
-
-        JNZ(PACK8_G_LOOP)
-
-    LABEL(PACK8_DONE)
-
-    END_ASM(
-        : //output operands
-        : //input operands
-          [n]         "m" (n),
-          [kappa]     "m" (*kappa),
-          [a]         "m" (a),
-          [inca]      "m" (inca),
-          [lda]       "m" (lda),
-          [p]         "m" (p),
-          [ldp]       "m" (ldp),
-          [offsetPtr] "m" (offsetPtr)
-        : //clobbers
-          "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
-          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11",
-          "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
-          "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
-          "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
-          "zmm30", "zmm31",
-          "rax", "rbx", "rcx", "rdx", "rdi", "rsi",
-          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
-    )
-
-	}
-	else // if ( cdim < mnr )
-	{
-		bli_dscal2m_ex \
-		( \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  ( trans_t )conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p, 1,    ldp, \
-		  cntx, \
-		  NULL  \
-		); \
-
-		// if ( cdim < mnr )
-		{
-			const dim_t      i      = cdim;
-			const dim_t      m_edge = mnr - i;
-			const dim_t      n_edge = n_max;
-			double* restrict p_edge = ( double* )p + (i  )*1;
-
-			bli_dset0s_mxn
-			(
-			  m_edge,
-			  n_edge,
-			  p_edge, 1, ldp
-			);
-		}
-	}
-
-	if ( n < n_max )
-	{
-		const dim_t      j      = n;
-		const dim_t      m_edge = mnr;
-		const dim_t      n_edge = n_max - j;
-		double* restrict p_edge = ( double* )p + (j  )*ldp;
-
-		bli_dset0s_mxn
-		(
-		  m_edge,
-		  n_edge,
-		  p_edge, 1, ldp
-		);
-	}
-}
-
-void bli_dpackm_knl_asm_24xk
-     (
-             conj_t  conja,
-             pack_t  schema,
-             dim_t   cdim_,
-             dim_t   n_,
-             dim_t   n_max_,
-       const void*   kappa_,
-       const void*   a_, inc_t inca_, inc_t lda_,
-             void*   p_,              inc_t ldp_,
-       const cntx_t* cntx
-     )
-{
-    const int32_t* offsetPtr = &offsets[0];
-
-    double*       a     = ( double* )a_;
-    double*       p     = ( double* )p_;
-    double*       kappa = ( double* )kappa_;
-    const int64_t cdim  = cdim_;
-    const int64_t mnr   = 24;
-    const int64_t n     = n_;
-    const int64_t n_max = n_max_;
-    const int64_t inca  = inca_;
-    const int64_t lda   = lda_;
-    const int64_t ldp   = ldp_;
-
-    if ( cdim == mnr )
+    if ( cdim == mr && cdim_bcast == 1 )
     {
 
     BEGIN_ASM()
@@ -611,52 +365,200 @@ void bli_dpackm_knl_asm_24xk
           "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory"
     )
 
-	}
-	else // if ( cdim < mnr )
-	{
-		bli_dscal2m_ex \
-		( \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  ( trans_t )conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p, 1,    ldp, \
-		  cntx, \
-		  NULL  \
-		); \
-
-		// if ( cdim < mnr )
-		{
-			const dim_t      i      = cdim;
-			const dim_t      m_edge = mnr - i;
-			const dim_t      n_edge = n_max;
-			double* restrict p_edge = ( double* )p + (i  )*1;
-
-			bli_dset0s_mxn
-			(
-			  m_edge,
-			  n_edge,
-			  p_edge, 1, ldp
-			);
-		}
-	}
+    }
+    else if ( cdim == nr && cdim_bcast == 1 )
+    {
 
-	if ( n < n_max )
-	{
-		const dim_t      j      = n;
-		const dim_t      m_edge = mnr;
-		const dim_t      n_edge = n_max - j;
-		double* restrict p_edge = ( double* )p + (j  )*ldp;
+    BEGIN_ASM()
+
+    MOV(RSI, VAR(n))
+    MOV(RAX, VAR(a))
+    MOV(RBX, VAR(inca))
+    MOV(RCX, VAR(lda))
+    MOV(R14, VAR(p))
+    MOV(RDI, VAR(ldp))
+
+    TEST(RSI, RSI)
+    JZ(PACK8_DONE)
+
+    LEA(RBX, MEM(,RBX,8))    //inca in bytes
+    LEA(RCX, MEM(,RCX,8))    //lda in bytes
+    LEA(RDI, MEM(,RDI,8))    //ldp in bytes
+    LEA(R11, MEM(RDI,RDI,2)) //ldp*3
+    LEA(R12, MEM(RDI,RDI,4)) //ldp*5
+    LEA(R13, MEM(R11,RDI,4)) //ldp*7
+
+    VBROADCASTSD(ZMM(31), VAR(kappa))
+
+    CMP(RBX, IMM(8))
+    JNE(PACK8_T)
+
+    LABEL(PACK8_N)
+
+        MOV(RDX, RSI)
+        AND(RDX, IMM(7))
+        SAR(RSI, IMM(3))
+        JZ(PACK8_N_TAIL)
 
-		bli_dset0s_mxn
+        LEA(R8,  MEM(RCX,RCX,2)) //lda*3
+        LEA(R9,  MEM(RCX,RCX,4)) //lda*5
+        LEA(R10, MEM(R8 ,RCX,4)) //lda*7
+
+        LABEL(PACK8_N_LOOP)
+
+            LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
+            STORE8x8(R14,0,RDI,R11,R12,R13,0,1,2,3,4,5,6,7)
+
+            LEA(RAX, MEM(RAX,RCX,8))
+            LEA(R14, MEM(R14,RDI,8))
+
+            SUB(RSI, IMM(1))
+
+        JNZ(PACK8_N_LOOP)
+
+        TEST(RDX, RDX)
+        JZ(PACK8_DONE)
+
+        LABEL(PACK8_N_TAIL)
+
+            VMULPD(ZMM(0), ZMM(31), MEM(RAX))
+            VMOVUPD(MEM(R14), ZMM(0))
+
+            LEA(RAX, MEM(RAX,RCX,1))
+            LEA(R14, MEM(R14,RDI,1))
+
+            SUB(RDX, IMM(1))
+
+        JNZ(PACK8_N_TAIL)
+
+        JMP(PACK8_DONE)
+
+    LABEL(PACK8_T)
+
+        CMP(RCX, IMM(8))
+        JNE(PACK8_G)
+
+        LEA(R8,  MEM(RBX,RBX,2)) //inca*3
+        LEA(R9,  MEM(RBX,RBX,4)) //inca*5
+        LEA(R10, MEM(R8 ,RBX,4)) //inca*7
+
+        MOV(RDX, RSI)
+        AND(RDX, IMM(7))
+        SAR(RSI, IMM(3))
+        JZ(PACK8_T_TAIL)
+
+        LABEL(PACK8_T_LOOP)
+
+            LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
+            TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
+                         16,17,18,19,20,21,22,23)
+            STORE8x8(R14,0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
+
+            LEA(RAX, MEM(RAX,RCX,8))
+            LEA(R14, MEM(R14,RDI,8))
+
+            SUB(RSI, IMM(1))
+
+        JNZ(PACK8_T_LOOP)
+
+        TEST(RDX, RDX)
+        JZ(PACK8_DONE)
+
+        LABEL(PACK8_T_TAIL)
+
+        MOV(RSI, IMM(1))
+        SHLX(RSI, RSI, RDX)
+        SUB(RSI, IMM(1))
+        KMOVW(K(1), ESI)  //mask for n%8 elements
+
+        LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7,1)
+        TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
+                      8, 9,10,11,12,13,14,15)
+
+        VMOVUPD(MEM(R14      ), ZMM( 8))
+        SUB(RDX, IMM(1))
+        JZ(PACK8_DONE)
+        VMOVUPD(MEM(R14,RDI,1), ZMM( 9))
+        SUB(RDX, IMM(1))
+        JZ(PACK8_DONE)
+        VMOVUPD(MEM(R14,RDI,2), ZMM(10))
+        SUB(RDX, IMM(1))
+        JZ(PACK8_DONE)
+        VMOVUPD(MEM(R14,R11,1), ZMM(11))
+        SUB(RDX, IMM(1))
+        JZ(PACK8_DONE)
+        VMOVUPD(MEM(R14,RDI,4), ZMM(12))
+        SUB(RDX, IMM(1))
+        JZ(PACK8_DONE)
+        VMOVUPD(MEM(R14,R12,1), ZMM(13))
+        SUB(RDX, IMM(1))
+        JZ(PACK8_DONE)
+        VMOVUPD(MEM(R14,R11,2), ZMM(14))
+
+        JMP(PACK8_DONE)
+
+    LABEL(PACK8_G)
+
+        VPBROADCASTD(ZMM(3), VAR(inca))
+        MOV(RBX, VAR(offsetPtr))
+        VPMULLD(YMM(0), YMM(3), MEM(RBX))
+
+        LABEL(PACK8_G_LOOP)
+
+            KXNORW(K(1), K(0), K(0))
+            VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8))
+            VMULPD(ZMM(3), ZMM(3), ZMM(31))
+            VMOVUPD(MEM(R14), ZMM(3))
+
+            LEA(RAX, MEM(RAX,RCX,1))
+            LEA(R14, MEM(R14,RDI,1))
+
+            SUB(RSI, IMM(1))
+
+        JNZ(PACK8_G_LOOP)
+
+    LABEL(PACK8_DONE)
+
+    END_ASM(
+        : //output operands
+        : //input operands
+          [n]         "m" (n),
+          [kappa]     "m" (*kappa),
+          [a]         "m" (a),
+          [inca]      "m" (inca),
+          [lda]       "m" (lda),
+          [p]         "m" (p),
+          [ldp]       "m" (ldp),
+          [offsetPtr] "m" (offsetPtr)
+        : //clobbers
+          "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
+          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11",
+          "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
+          "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
+          "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
+          "zmm30", "zmm31",
+          "rax", "rbx", "rcx", "rdx", "rdi", "rsi",
+          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
+    )
+
+	}
+	else
+	{
+		bli_dscal2bbs_mxn
 		(
-		  m_edge,
-		  n_edge,
-		  p_edge, 1, ldp
+		  BLIS_NO_CONJUGATE,
+		  cdim,
+		  n,
+		  kappa,
+		  a,       inca, lda,
+		  p, cdim_bcast, ldp
 		);
 	}
+
+	bli_dset0s_edge
+	(
+	  cdim*cdim_bcast, cdim_max*cdim_bcast,
+	  n, n_max,
+	  p, ldp
+	);
 }
diff --git a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c
index f02a28823..78b41ae22 100644
--- a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c
+++ b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c
@@ -108,16 +108,19 @@ static int32_t offsets[32] __attribute__((aligned(64))) =
     { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
      16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
 
-void bli_spackm_knl_asm_16xk
+void bli_spackm_knl_asm_24x16
      (
              conj_t  conja,
              pack_t  schema,
              dim_t   cdim_,
+             dim_t   cdim_max,
+             dim_t   cdim_bcast,
              dim_t   n_,
              dim_t   n_max_,
        const void*   kappa_,
        const void*   a_, inc_t inca_, inc_t lda_,
              void*   p_,              inc_t ldp_,
+       const void*   params,
        const cntx_t* cntx
      )
 {
@@ -127,14 +130,16 @@ void bli_spackm_knl_asm_16xk
     float*        p     = ( float* )p_;
     float*        kappa = ( float* )kappa_;
     const int64_t cdim  = cdim_;
-    const int64_t mnr   = 16;
+    const int64_t mr    = 24;
+    const int64_t nr    = 16;
     const int64_t n     = n_;
     const int64_t n_max = n_max_;
     const int64_t inca  = inca_;
     const int64_t lda   = lda_;
     const int64_t ldp   = ldp_;
 
-    if ( cdim == mnr )
+
+    if ( cdim == mr && cdim_bcast == 1 )
     {
 
     BEGIN_ASM()
@@ -144,106 +149,117 @@ void bli_spackm_knl_asm_16xk
         MOV(RBX, VAR(inca))
         MOV(RCX, VAR(lda))
         MOV(R14, VAR(p))
+        MOV(RDI, VAR(ldp))
 
         TEST(RSI, RSI)
-        JZ(PACK16_DONE)
+        JZ(PACK24_DONE)
 
         LEA(RBX, MEM(,RBX,4))    //inca in bytes
         LEA(RCX, MEM(,RCX,4))    //lda in bytes
+        LEA(RDI, MEM(,RDI,4))    //ldp in bytes
 
-        VBROADCASTSS(YMM(15), VAR(kappa))
+        VBROADCASTSS(ZMM(15), VAR(kappa))
 
         CMP(RBX, IMM(4))
-        JNE(PACK16_T)
+        JNE(PACK24_T)
 
-        LABEL(PACK16_N)
+        LABEL(PACK24_N)
 
             MOV(RDX, RSI)
             AND(RDX, IMM(7))
             SAR(RSI, IMM(3))
-            JZ(PACK16_N_TAIL)
+            JZ(PACK24_N_TAIL)
 
             LEA(R8,  MEM(RCX,RCX,2)) //lda*3
             LEA(R9,  MEM(RCX,RCX,4)) //lda*5
             LEA(R10, MEM(R8 ,RCX,4)) //lda*7
 
-            LABEL(PACK16_N_LOOP)
+            LABEL(PACK24_N_LOOP)
 
                 LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
-                STORE8x8(R14,0,16*4,0,1,2,3,4,5,6,7)
+                STORE8x8(R14,0,24*4,0,1,2,3,4,5,6,7)
 
                 LOADMUL8x8(RAX,32,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
-                STORE8x8(R14,32,16*4,0,1,2,3,4,5,6,7)
+                STORE8x8(R14,32,24*4,0,1,2,3,4,5,6,7)
+
+                LOADMUL8x8(RAX,64,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
+                STORE8x8(R14,64,24*4,0,1,2,3,4,5,6,7)
 
                 LEA(RAX, MEM(RAX,RCX,8))
-                LEA(R14, MEM(R14,16*8*4))
+                LEA(R14, MEM(R14,RDI,8))
 
                 SUB(RSI, IMM(1))
 
-            JNZ(PACK16_N_LOOP)
+            JNZ(PACK24_N_LOOP)
 
             TEST(RDX, RDX)
-            JZ(PACK16_DONE)
+            JZ(PACK24_DONE)
 
-            LABEL(PACK16_N_TAIL)
+            LABEL(PACK24_N_TAIL)
 
-                VMULPS(YMM(0), YMM(15), MEM(RAX   ))
-                VMULPS(YMM(1), YMM(15), MEM(RAX,32))
-                VMOVUPS(MEM(R14   ), YMM(0))
-                VMOVUPS(MEM(R14,32), YMM(1))
+                VMULPS(ZMM(0), ZMM(15), MEM(RAX))
+                VMOVUPS(MEM(R14), ZMM(0))
+
+                VMULPS(YMM(1), YMM(15), MEM(RAX,64))
+                VMOVUPS(MEM(R14,64), YMM(1))
 
                 LEA(RAX, MEM(RAX,RCX,1))
-                LEA(R14, MEM(R14, 16*4))
+                LEA(R14, MEM(R14,RDI,1))
 
                 SUB(RDX, IMM(1))
 
-            JNZ(PACK16_N_TAIL)
+            JNZ(PACK24_N_TAIL)
 
-            JMP(PACK16_DONE)
+            JMP(PACK24_DONE)
 
-        LABEL(PACK16_T)
+        LABEL(PACK24_T)
 
             CMP(RCX, IMM(4))
-            JNE(PACK16_G)
+            JNE(PACK24_G)
 
             LEA(R8,  MEM(RBX,RBX,2)) //inca*3
             LEA(R9,  MEM(RBX,RBX,4)) //inca*5
             LEA(R10, MEM(R8 ,RBX,4)) //inca*7
             LEA(R11, MEM(RAX,RBX,8))
+            LEA(R12, MEM(R11,RBX,8))
 
             MOV(RDX, RSI)
             AND(RDX, IMM(7))
             SAR(RSI, IMM(3))
-            JZ(PACK16_T_TAIL)
+            JZ(PACK24_T_TAIL)
 
-            LABEL(PACK16_T_LOOP)
+            LABEL(PACK24_T_LOOP)
 
                 LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
-                STORETRANS8x8(R14,0,16*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)
+                STORETRANS8x8(R14,0,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)
 
                 LOADMUL8x8(R11,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
-                STORETRANS8x8(R14,32,16*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)
+                STORETRANS8x8(R14,32,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)
 
-                LEA(RAX, MEM(RAX,   8*4))
-                LEA(R11, MEM(R11,   8*4))
-                LEA(R14, MEM(R14,16*8*4))
+                LOADMUL8x8(R12,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
+                STORETRANS8x8(R14,64,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)
+
+                LEA(RAX, MEM(RAX,RCX,8))
+                LEA(R11, MEM(R11,RCX,8))
+                LEA(R12, MEM(R12,RCX,8))
+                LEA(R14, MEM(R14,RDI,8))
 
                 SUB(RSI, IMM(1))
 
-            JNZ(PACK16_T_LOOP)
+            JNZ(PACK24_T_LOOP)
 
             TEST(RDX, RDX)
-            JZ(PACK16_DONE)
+            JZ(PACK24_DONE)
 
-            LABEL(PACK16_T_TAIL)
+            LABEL(PACK24_T_TAIL)
 
-                VMULSS(XMM(0), XMM(15), MEM(RAX      ))
+                VMULSS(XMM(0), XMM(15), MEM(RAX))
                 VMULSS(XMM(1), XMM(15), MEM(RAX,RBX,1))
                 VMULSS(XMM(2), XMM(15), MEM(RAX,RBX,2))
-                VMULSS(XMM(3), XMM(15), MEM(RAX,R8 ,1))
+                VMULSS(XMM(3), XMM(15), MEM(RAX,R8,1))
                 VMULSS(XMM(4), XMM(15), MEM(RAX,RBX,4))
-                VMULSS(XMM(5), XMM(15), MEM(RAX,R9 ,1))
-                VMULSS(XMM(6), XMM(15), MEM(RAX,R8 ,2))
+                VMULSS(XMM(5), XMM(15), MEM(RAX,R9,1))
+                VMULSS(XMM(6), XMM(15), MEM(RAX,R8,2))
                 VMULSS(XMM(7), XMM(15), MEM(RAX,R10,1))
                 VMOVSS(MEM(R14,0*4), XMM(0))
                 VMOVSS(MEM(R14,1*4), XMM(1))
@@ -254,13 +270,13 @@ void bli_spackm_knl_asm_16xk
                 VMOVSS(MEM(R14,6*4), XMM(6))
                 VMOVSS(MEM(R14,7*4), XMM(7))
 
-                VMULSS(XMM(0), XMM(15), MEM(R11      ))
+                VMULSS(XMM(0), XMM(15), MEM(R11))
                 VMULSS(XMM(1), XMM(15), MEM(R11,RBX,1))
                 VMULSS(XMM(2), XMM(15), MEM(R11,RBX,2))
-                VMULSS(XMM(3), XMM(15), MEM(R11,R8 ,1))
+                VMULSS(XMM(3), XMM(15), MEM(R11,R8,1))
                 VMULSS(XMM(4), XMM(15), MEM(R11,RBX,4))
-                VMULSS(XMM(5), XMM(15), MEM(R11,R9 ,1))
-                VMULSS(XMM(6), XMM(15), MEM(R11,R8 ,2))
+                VMULSS(XMM(5), XMM(15), MEM(R11,R9,1))
+                VMULSS(XMM(6), XMM(15), MEM(R11,R8,2))
                 VMULSS(XMM(7), XMM(15), MEM(R11,R10,1))
                 VMOVSS(MEM(R14, 8*4), XMM(0))
                 VMOVSS(MEM(R14, 9*4), XMM(1))
@@ -271,37 +287,62 @@ void bli_spackm_knl_asm_16xk
                 VMOVSS(MEM(R14,14*4), XMM(6))
                 VMOVSS(MEM(R14,15*4), XMM(7))
 
-                LEA(RAX, MEM(RAX,   4))
-                LEA(R11, MEM(R11,   4))
-                LEA(R14, MEM(R14,16*4))
+                VMULSS(XMM(0), XMM(15), MEM(R12))
+                VMULSS(XMM(1), XMM(15), MEM(R12,RBX,1))
+                VMULSS(XMM(2), XMM(15), MEM(R12,RBX,2))
+                VMULSS(XMM(3), XMM(15), MEM(R12,R8,1))
+                VMULSS(XMM(4), XMM(15), MEM(R12,RBX,4))
+                VMULSS(XMM(5), XMM(15), MEM(R12,R9,1))
+                VMULSS(XMM(6), XMM(15), MEM(R12,R8,2))
+                VMULSS(XMM(7), XMM(15), MEM(R12,R10,1))
+                VMOVSS(MEM(R14,16*4), XMM(0))
+                VMOVSS(MEM(R14,17*4), XMM(1))
+                VMOVSS(MEM(R14,18*4), XMM(2))
+                VMOVSS(MEM(R14,19*4), XMM(3))
+                VMOVSS(MEM(R14,20*4), XMM(4))
+                VMOVSS(MEM(R14,21*4), XMM(5))
+                VMOVSS(MEM(R14,22*4), XMM(6))
+                VMOVSS(MEM(R14,23*4), XMM(7))
+
+                LEA(RAX, MEM(RAX,RCX,1))
+                LEA(R11, MEM(R11,RCX,1))
+                LEA(R12, MEM(R12,RCX,1))
+                LEA(R14, MEM(R14,RDI,1))
 
                 SUB(RDX, IMM(1))
 
-            JNZ(PACK16_T_TAIL)
+            JNZ(PACK24_T_TAIL)
 
-            JMP(PACK16_DONE)
+            JMP(PACK24_DONE)
 
-        LABEL(PACK16_G)
+        LABEL(PACK24_G)
 
             VPBROADCASTD(ZMM(3), VAR(inca))
             MOV(RBX, VAR(offsetPtr))
             VPMULLD(ZMM(0), ZMM(3), MEM(RBX))
 
-            LABEL(PACK16_G_LOOP)
+            LEA(R11, MEM(RAX,RBX,8))
+            LEA(R11, MEM(R11,RBX,8))
+
+            LABEL(PACK24_G_LOOP)
 
                 KXNORW(K(1), K(0), K(0))
+                KSHIFTRW(K(2), K(1), IMM(8))
                 VGATHERDPS(ZMM(3) MASK_K(1), MEM(RAX,ZMM(0),8))
+                VGATHERDPS(ZMM(4) MASK_K(2), MEM(R11,ZMM(0),8))
                 VMULPS(ZMM(3), ZMM(3), ZMM(15))
+                VMULPS(YMM(4), YMM(4), YMM(15))
                 VMOVUPS(MEM(R14), ZMM(3))
+                VMOVUPS(MEM(R14,64), YMM(4))
 
                 LEA(RAX, MEM(RAX,RCX,1))
-                LEA(R14, MEM(R14, 16*4))
+                LEA(R14, MEM(R14,RDI,1))
 
                 SUB(RSI, IMM(1))
 
-            JNZ(PACK16_G_LOOP)
+            JNZ(PACK24_G_LOOP)
 
-        LABEL(PACK16_DONE)
+        LABEL(PACK24_DONE)
 
     END_ASM(
         : //output operands
@@ -326,82 +367,7 @@ void bli_spackm_knl_asm_16xk
     )
 
 	}
-	else // if ( cdim < mnr )
-	{
-		bli_sscal2m_ex \
-		( \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  ( trans_t )conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p, 1,    ldp, \
-		  cntx, \
-		  NULL  \
-		); \
-
-		// if ( cdim < mnr )
-		{
-			const dim_t      i      = cdim;
-			const dim_t      m_edge = mnr - i;
-			const dim_t      n_edge = n_max;
-			float*  restrict p_edge = ( float* )p + (i  )*1;
-
-			bli_sset0s_mxn
-			(
-			  m_edge,
-			  n_edge,
-			  p_edge, 1, ldp
-			);
-		}
-	}
-
-	if ( n < n_max )
-	{
-		const dim_t      j      = n;
-		const dim_t      m_edge = mnr;
-		const dim_t      n_edge = n_max - j;
-		float*  restrict p_edge = ( float* )p + (j  )*ldp;
-
-		bli_sset0s_mxn
-		(
-		  m_edge,
-		  n_edge,
-		  p_edge, 1, ldp
-		);
-	}
-}
-
-void bli_spackm_knl_asm_24xk
-     (
-             conj_t  conja,
-             pack_t  schema,
-             dim_t   cdim_,
-             dim_t   n_,
-             dim_t   n_max_,
-       const void*   kappa_,
-       const void*   a_, inc_t inca_, inc_t lda_,
-             void*   p_,              inc_t ldp_,
-       const cntx_t* cntx
-     )
-{
-    const int32_t* offsetPtr = &offsets[0];
-
-    float*        a     = ( float* )a_;
-    float*        p     = ( float* )p_;
-    float*        kappa = ( float* )kappa_;
-    const int64_t cdim  = cdim_;
-    const int64_t mnr   = 24;
-    const int64_t n     = n_;
-    const int64_t n_max = n_max_;
-    const int64_t inca  = inca_;
-    const int64_t lda   = lda_;
-    const int64_t ldp   = ldp_;
-
-    if ( cdim == mnr )
+    if ( cdim == nr && ldp == nr && cdim_bcast == 1 )
     {
 
     BEGIN_ASM()
@@ -411,117 +377,106 @@ void bli_spackm_knl_asm_24xk
         MOV(RBX, VAR(inca))
         MOV(RCX, VAR(lda))
         MOV(R14, VAR(p))
-        MOV(RDI, VAR(ldp))
 
         TEST(RSI, RSI)
-        JZ(PACK24_DONE)
+        JZ(PACK16_DONE)
 
         LEA(RBX, MEM(,RBX,4))    //inca in bytes
         LEA(RCX, MEM(,RCX,4))    //lda in bytes
-        LEA(RDI, MEM(,RDI,4))    //ldp in bytes
 
-        VBROADCASTSS(ZMM(15), VAR(kappa))
+        VBROADCASTSS(YMM(15), VAR(kappa))
 
         CMP(RBX, IMM(4))
-        JNE(PACK24_T)
+        JNE(PACK16_T)
 
-        LABEL(PACK24_N)
+        LABEL(PACK16_N)
 
             MOV(RDX, RSI)
             AND(RDX, IMM(7))
             SAR(RSI, IMM(3))
-            JZ(PACK24_N_TAIL)
+            JZ(PACK16_N_TAIL)
 
             LEA(R8,  MEM(RCX,RCX,2)) //lda*3
             LEA(R9,  MEM(RCX,RCX,4)) //lda*5
             LEA(R10, MEM(R8 ,RCX,4)) //lda*7
 
-            LABEL(PACK24_N_LOOP)
+            LABEL(PACK16_N_LOOP)
 
                 LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
-                STORE8x8(R14,0,24*4,0,1,2,3,4,5,6,7)
+                STORE8x8(R14,0,16*4,0,1,2,3,4,5,6,7)
 
                 LOADMUL8x8(RAX,32,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
-                STORE8x8(R14,32,24*4,0,1,2,3,4,5,6,7)
-
-                LOADMUL8x8(RAX,64,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
-                STORE8x8(R14,64,24*4,0,1,2,3,4,5,6,7)
+                STORE8x8(R14,32,16*4,0,1,2,3,4,5,6,7)
 
                 LEA(RAX, MEM(RAX,RCX,8))
-                LEA(R14, MEM(R14,RDI,8))
+                LEA(R14, MEM(R14,16*8*4))
 
                 SUB(RSI, IMM(1))
 
-            JNZ(PACK24_N_LOOP)
+            JNZ(PACK16_N_LOOP)
 
             TEST(RDX, RDX)
-            JZ(PACK24_DONE)
-
-            LABEL(PACK24_N_TAIL)
+            JZ(PACK16_DONE)
 
-                VMULPS(ZMM(0), ZMM(15), MEM(RAX))
-                VMOVUPS(MEM(R14), ZMM(0))
+            LABEL(PACK16_N_TAIL)
 
-                VMULPS(YMM(1), YMM(15), MEM(RAX,64))
-                VMOVUPS(MEM(R14,64), YMM(1))
+                VMULPS(YMM(0), YMM(15), MEM(RAX   ))
+                VMULPS(YMM(1), YMM(15), MEM(RAX,32))
+                VMOVUPS(MEM(R14   ), YMM(0))
+                VMOVUPS(MEM(R14,32), YMM(1))
 
                 LEA(RAX, MEM(RAX,RCX,1))
-                LEA(R14, MEM(R14,RDI,1))
+                LEA(R14, MEM(R14, 16*4))
 
                 SUB(RDX, IMM(1))
 
-            JNZ(PACK24_N_TAIL)
+            JNZ(PACK16_N_TAIL)
 
-            JMP(PACK24_DONE)
+            JMP(PACK16_DONE)
 
-        LABEL(PACK24_T)
+        LABEL(PACK16_T)
 
             CMP(RCX, IMM(4))
-            JNE(PACK24_G)
+            JNE(PACK16_G)
 
             LEA(R8,  MEM(RBX,RBX,2)) //inca*3
             LEA(R9,  MEM(RBX,RBX,4)) //inca*5
             LEA(R10, MEM(R8 ,RBX,4)) //inca*7
             LEA(R11, MEM(RAX,RBX,8))
-            LEA(R12, MEM(R11,RBX,8))
 
             MOV(RDX, RSI)
             AND(RDX, IMM(7))
             SAR(RSI, IMM(3))
-            JZ(PACK24_T_TAIL)
+            JZ(PACK16_T_TAIL)
 
-            LABEL(PACK24_T_LOOP)
+            LABEL(PACK16_T_LOOP)
 
                 LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
-                STORETRANS8x8(R14,0,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)
+                STORETRANS8x8(R14,0,16*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)
 
                 LOADMUL8x8(R11,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
-                STORETRANS8x8(R14,32,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)
-
-                LOADMUL8x8(R12,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
-                STORETRANS8x8(R14,64,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)
+                STORETRANS8x8(R14,32,16*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)
 
-                LEA(RAX, MEM(RAX,RCX,8))
-                LEA(R11, MEM(R11,RCX,8))
-                LEA(R12, MEM(R12,RCX,8))
-                LEA(R14, MEM(R14,RDI,8))
+                LEA(RAX, MEM(RAX,   8*4))
+                LEA(R11, MEM(R11,   8*4))
+                LEA(R14, MEM(R14,16*8*4))
 
                 SUB(RSI, IMM(1))
 
-            JNZ(PACK24_T_LOOP)
+            JNZ(PACK16_T_LOOP)
 
             TEST(RDX, RDX)
-            JZ(PACK24_DONE)
+            JZ(PACK16_DONE)
 
-            LABEL(PACK24_T_TAIL)
+            LABEL(PACK16_T_TAIL)
 
-                VMULSS(XMM(0), XMM(15), MEM(RAX))
+                VMULSS(XMM(0), XMM(15), MEM(RAX      ))
                 VMULSS(XMM(1), XMM(15), MEM(RAX,RBX,1))
                 VMULSS(XMM(2), XMM(15), MEM(RAX,RBX,2))
-                VMULSS(XMM(3), XMM(15), MEM(RAX,R8,1))
+                VMULSS(XMM(3), XMM(15), MEM(RAX,R8 ,1))
                 VMULSS(XMM(4), XMM(15), MEM(RAX,RBX,4))
-                VMULSS(XMM(5), XMM(15), MEM(RAX,R9,1))
-                VMULSS(XMM(6), XMM(15), MEM(RAX,R8,2))
+                VMULSS(XMM(5), XMM(15), MEM(RAX,R9 ,1))
+                VMULSS(XMM(6), XMM(15), MEM(RAX,R8 ,2))
                 VMULSS(XMM(7), XMM(15), MEM(RAX,R10,1))
                 VMOVSS(MEM(R14,0*4), XMM(0))
                 VMOVSS(MEM(R14,1*4), XMM(1))
@@ -532,13 +487,13 @@ void bli_spackm_knl_asm_24xk
                 VMOVSS(MEM(R14,6*4), XMM(6))
                 VMOVSS(MEM(R14,7*4), XMM(7))
 
-                VMULSS(XMM(0), XMM(15), MEM(R11))
+                VMULSS(XMM(0), XMM(15), MEM(R11      ))
                 VMULSS(XMM(1), XMM(15), MEM(R11,RBX,1))
                 VMULSS(XMM(2), XMM(15), MEM(R11,RBX,2))
-                VMULSS(XMM(3), XMM(15), MEM(R11,R8,1))
+                VMULSS(XMM(3), XMM(15), MEM(R11,R8 ,1))
                 VMULSS(XMM(4), XMM(15), MEM(R11,RBX,4))
-                VMULSS(XMM(5), XMM(15), MEM(R11,R9,1))
-                VMULSS(XMM(6), XMM(15), MEM(R11,R8,2))
+                VMULSS(XMM(5), XMM(15), MEM(R11,R9 ,1))
+                VMULSS(XMM(6), XMM(15), MEM(R11,R8 ,2))
                 VMULSS(XMM(7), XMM(15), MEM(R11,R10,1))
                 VMOVSS(MEM(R14, 8*4), XMM(0))
                 VMOVSS(MEM(R14, 9*4), XMM(1))
@@ -549,62 +504,37 @@ void bli_spackm_knl_asm_24xk
                 VMOVSS(MEM(R14,14*4), XMM(6))
                 VMOVSS(MEM(R14,15*4), XMM(7))
 
-                VMULSS(XMM(0), XMM(15), MEM(R12))
-                VMULSS(XMM(1), XMM(15), MEM(R12,RBX,1))
-                VMULSS(XMM(2), XMM(15), MEM(R12,RBX,2))
-                VMULSS(XMM(3), XMM(15), MEM(R12,R8,1))
-                VMULSS(XMM(4), XMM(15), MEM(R12,RBX,4))
-                VMULSS(XMM(5), XMM(15), MEM(R12,R9,1))
-                VMULSS(XMM(6), XMM(15), MEM(R12,R8,2))
-                VMULSS(XMM(7), XMM(15), MEM(R12,R10,1))
-                VMOVSS(MEM(R14,16*4), XMM(0))
-                VMOVSS(MEM(R14,17*4), XMM(1))
-                VMOVSS(MEM(R14,18*4), XMM(2))
-                VMOVSS(MEM(R14,19*4), XMM(3))
-                VMOVSS(MEM(R14,20*4), XMM(4))
-                VMOVSS(MEM(R14,21*4), XMM(5))
-                VMOVSS(MEM(R14,22*4), XMM(6))
-                VMOVSS(MEM(R14,23*4), XMM(7))
-
-                LEA(RAX, MEM(RAX,RCX,1))
-                LEA(R11, MEM(R11,RCX,1))
-                LEA(R12, MEM(R12,RCX,1))
-                LEA(R14, MEM(R14,RDI,1))
+                LEA(RAX, MEM(RAX,   4))
+                LEA(R11, MEM(R11,   4))
+                LEA(R14, MEM(R14,16*4))
 
                 SUB(RDX, IMM(1))
 
-            JNZ(PACK24_T_TAIL)
+            JNZ(PACK16_T_TAIL)
 
-            JMP(PACK24_DONE)
+            JMP(PACK16_DONE)
 
-        LABEL(PACK24_G)
+        LABEL(PACK16_G)
 
             VPBROADCASTD(ZMM(3), VAR(inca))
             MOV(RBX, VAR(offsetPtr))
             VPMULLD(ZMM(0), ZMM(3), MEM(RBX))
 
-            LEA(R11, MEM(RAX,RBX,8))
-            LEA(R11, MEM(R11,RBX,8))
-
-            LABEL(PACK24_G_LOOP)
+            LABEL(PACK16_G_LOOP)
 
                 KXNORW(K(1), K(0), K(0))
-                KSHIFTRW(K(2), K(1), IMM(8))
                 VGATHERDPS(ZMM(3) MASK_K(1), MEM(RAX,ZMM(0),8))
-                VGATHERDPS(ZMM(4) MASK_K(2), MEM(R11,ZMM(0),8))
                 VMULPS(ZMM(3), ZMM(3), ZMM(15))
-                VMULPS(YMM(4), YMM(4), YMM(15))
                 VMOVUPS(MEM(R14), ZMM(3))
-                VMOVUPS(MEM(R14,64), YMM(4))
 
                 LEA(RAX, MEM(RAX,RCX,1))
-                LEA(R14, MEM(R14,RDI,1))
+                LEA(R14, MEM(R14, 16*4))
 
                 SUB(RSI, IMM(1))
 
-            JNZ(PACK24_G_LOOP)
+            JNZ(PACK16_G_LOOP)
 
-        LABEL(PACK24_DONE)
+        LABEL(PACK16_DONE)
 
     END_ASM(
         : //output operands
@@ -629,51 +559,23 @@ void bli_spackm_knl_asm_24xk
     )
 
 	}
-	else // if ( cdim < mnr )
+	else
 	{
-		bli_sscal2m_ex \
-		( \
-		  0, \
-		  BLIS_NONUNIT_DIAG, \
-		  BLIS_DENSE, \
-		  ( trans_t )conja, \
-		  cdim, \
-		  n, \
-		  kappa, \
-		  a, inca, lda, \
-		  p, 1,    ldp, \
-		  cntx, \
-		  NULL  \
-		); \
-
-		// if ( cdim < mnr )
-		{
-			const dim_t      i      = cdim;
-			const dim_t      m_edge = mnr - i;
-			const dim_t      n_edge = n_max;
-			float*  restrict p_edge = ( float* )p + (i  )*1;
-
-			bli_sset0s_mxn
-			(
-			  m_edge,
-			  n_edge,
-			  p_edge, 1, ldp
-			);
-		}
-	}
-
-	if ( n < n_max )
-	{
-		const dim_t      j      = n;
-		const dim_t      m_edge = mnr;
-		const dim_t      n_edge = n_max - j;
-		float*  restrict p_edge = ( float* )p + (j  )*ldp;
-
-		bli_sset0s_mxn
+		bli_sscal2bbs_mxn
 		(
-		  m_edge,
-		  n_edge,
-		  p_edge, 1, ldp
+		  BLIS_NO_CONJUGATE,
+		  cdim,
+		  n,
+		  kappa,
+		  a,       inca, lda,
+		  p, cdim_bcast, ldp
 		);
 	}
+
+	bli_sset0s_edge
+	(
+	  cdim*cdim_bcast, cdim_max*cdim_bcast,
+	  n, n_max,
+	  p, ldp
+	);
 }
diff --git a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
index 888ed2874..5340ef6e7 100644
--- a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
+++ b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
@@ -193,7 +193,7 @@ void bli_dgemm_knl_asm_24x8
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c_, inc_t cs_c_,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
index 504a5b1e8..2ff449f8a 100644
--- a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
+++ b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
@@ -190,7 +190,7 @@ void bli_sgemm_knl_asm_24x16
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c_, inc_t cs_c_,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/knl/bli_kernels_knl.h b/kernels/knl/bli_kernels_knl.h
index f0b17c49d..ae80f94ea 100644
--- a/kernels/knl/bli_kernels_knl.h
+++ b/kernels/knl/bli_kernels_knl.h
@@ -36,16 +36,14 @@ GEMM_UKR_PROT( float,    s, gemm_knl_asm_24x16 )
 
 GEMM_UKR_PROT( double,   d, gemm_knl_asm_24x8 )
 
-PACKM_KER_PROT( float,    s, packm_knl_asm_24xk )
-PACKM_KER_PROT( float,    s, packm_knl_asm_16xk )
+PACKM_KER_PROT( float,    s, packm_knl_asm_24x16 )
 
-PACKM_KER_PROT( double,   d, packm_knl_asm_24xk )
-PACKM_KER_PROT( double,   d, packm_knl_asm_8xk )
+PACKM_KER_PROT( double,   d, packm_knl_asm_24x8 )
 
 // unused:
 GEMM_UKR_PROT( double,   d, gemm_knl_asm_12x16 )
 GEMM_UKR_PROT( double,   d, gemm_knl_asm_30x8 )
 GEMM_UKR_PROT( double,   d, gemm_knl_asm_8x24 )
 
-PACKM_KER_PROT( double,   d, packm_knl_asm_30xk )
+PACKM_KER_PROT( double,   d, packm_knl_asm_30x8 )
 
diff --git a/kernels/penryn/1f/bli_axpyf_penryn_int.c b/kernels/penryn/1f/bli_axpyf_penryn_int.c
index 6097ab945..3ac75f424 100644
--- a/kernels/penryn/1f/bli_axpyf_penryn_int.c
+++ b/kernels/penryn/1f/bli_axpyf_penryn_int.c
@@ -144,10 +144,10 @@ void bli_daxpyf_penryn_int
 	chi2 = *(x_cast + 2*incx);
 	chi3 = *(x_cast + 3*incx);
 
-	PASTEMAC2(d,d,scals)( *alpha_cast, chi0 );
-	PASTEMAC2(d,d,scals)( *alpha_cast, chi1 );
-	PASTEMAC2(d,d,scals)( *alpha_cast, chi2 );
-	PASTEMAC2(d,d,scals)( *alpha_cast, chi3 );
+	PASTEMAC(d,d,scals)( *alpha_cast, chi0 );
+	PASTEMAC(d,d,scals)( *alpha_cast, chi1 );
+	PASTEMAC(d,d,scals)( *alpha_cast, chi2 );
+	PASTEMAC(d,d,scals)( *alpha_cast, chi3 );
 
 	if ( m_pre == 1 )
 	{
diff --git a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c
index 6ea503509..0148d3f92 100644
--- a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c
+++ b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c
@@ -182,10 +182,10 @@ void bli_ddotxaxpyf_penryn_int
 	chi2 = *(x_cast + 2*incx);
 	chi3 = *(x_cast + 3*incx);
 
-	PASTEMAC2(d,d,scals)( *alpha_cast, chi0 );
-	PASTEMAC2(d,d,scals)( *alpha_cast, chi1 );
-	PASTEMAC2(d,d,scals)( *alpha_cast, chi2 );
-	PASTEMAC2(d,d,scals)( *alpha_cast, chi3 );
+	PASTEMAC(d,d,scals)( *alpha_cast, chi0 );
+	PASTEMAC(d,d,scals)( *alpha_cast, chi1 );
+	PASTEMAC(d,d,scals)( *alpha_cast, chi2 );
+	PASTEMAC(d,d,scals)( *alpha_cast, chi3 );
 
 	PASTEMAC(d,set0s)( rho0 );
 	PASTEMAC(d,set0s)( rho1 );
diff --git a/kernels/penryn/1f/bli_dotxf_penryn_int.c b/kernels/penryn/1f/bli_dotxf_penryn_int.c
index 8f7018d96..282587b58 100644
--- a/kernels/penryn/1f/bli_dotxf_penryn_int.c
+++ b/kernels/penryn/1f/bli_dotxf_penryn_int.c
@@ -269,15 +269,15 @@ void bli_ddotxf_penryn_int
 		}
 	}
 /*
-	PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast  ) ); \
-	PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast+1) ); \
-	PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast+2) ); \
-	PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast+3) ); \
-
-	PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho1, *(y_cast  ) ); \
-	PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho2, *(y_cast+1) ); \
-	PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho3, *(y_cast+2) ); \
-	PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho4, *(y_cast+3) ); \
+	PASTEMAC(d,d,scals)( *beta_cast, *(y_cast  ) ); \
+	PASTEMAC(d,d,scals)( *beta_cast, *(y_cast+1) ); \
+	PASTEMAC(d,d,scals)( *beta_cast, *(y_cast+2) ); \
+	PASTEMAC(d,d,scals)( *beta_cast, *(y_cast+3) ); \
+
+	PASTEMAC(d,d,d,axpys)( *alpha_cast, rho1, *(y_cast  ) ); \
+	PASTEMAC(d,d,d,axpys)( *alpha_cast, rho2, *(y_cast+1) ); \
+	PASTEMAC(d,d,d,axpys)( *alpha_cast, rho3, *(y_cast+2) ); \
+	PASTEMAC(d,d,d,axpys)( *alpha_cast, rho4, *(y_cast+3) ); \
 */
 
 	rho1v.d[0] = rho0;
diff --git a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c
index aac7e6950..66c1085ef 100644
--- a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c
@@ -47,7 +47,7 @@ void bli_sgemm_penryn_asm_8x4
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -522,7 +522,7 @@ void bli_dgemm_penryn_asm_4x4
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c
index 791234a9d..a1d346e41 100644
--- a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c
@@ -56,7 +56,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
        const void*      b01,
              void*      b11,
              void*      c11, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c
index 024e8ba61..8ebe4e7ea 100644
--- a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c
@@ -56,7 +56,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
        const void*      b21,
              void*      b11,
              void*      c11, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c
index 295aceca5..ca6d09b95 100644
--- a/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c
@@ -50,7 +50,7 @@ void bli_dtrsm_l_penryn_asm_4x4
        const void*      a11,
              void*      b11,
              void*      c11, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c
index 3ee464f57..1528b34c4 100644
--- a/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c
@@ -50,7 +50,7 @@ void bli_dtrsm_u_penryn_asm_4x4
        const void*      a11,
              void*      b11,
              void*      c11, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
index cd577a863..f11b4f68a 100644
--- a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
+++ b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
@@ -50,7 +50,7 @@ void bli_sgemm_piledriver_asm_16x3
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -531,7 +531,7 @@ void bli_dgemm_piledriver_asm_8x3
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -987,7 +987,7 @@ void bli_cgemm_piledriver_asm_4x2
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1397,7 +1397,7 @@ void bli_zgemm_piledriver_asm_2x2
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/power10/3/bli_dgemm_power10_mma.c b/kernels/power10/3/bli_dgemm_power10_mma.c
index 121f8c112..91d28620e 100644
--- a/kernels/power10/3/bli_dgemm_power10_mma.c
+++ b/kernels/power10/3/bli_dgemm_power10_mma.c
@@ -70,7 +70,7 @@ void bli_dgemm_power10_mma_8x8
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
     )
 {
diff --git a/kernels/power10/3/bli_i16gemm_power10_mma.c b/kernels/power10/3/bli_i16gemm_power10_mma.c
index c8a183e74..e16f73995 100644
--- a/kernels/power10/3/bli_i16gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i16gemm_power10_mma.c
@@ -70,7 +70,7 @@ void bli_i16gemm_power10_mma_8x16
         const void*      b,
         const void*      beta,
               void*      c, inc_t rs_c0, inc_t cs_c0,
-              auxinfo_t* data,
+        const auxinfo_t* data,
         const cntx_t*    cntx
     )
 {
diff --git a/kernels/power10/3/bli_i16sgemm_power10_mma.c b/kernels/power10/3/bli_i16sgemm_power10_mma.c
index ff2db46c9..09751c952 100644
--- a/kernels/power10/3/bli_i16sgemm_power10_mma.c
+++ b/kernels/power10/3/bli_i16sgemm_power10_mma.c
@@ -70,7 +70,7 @@ void bli_i16sgemm_power10_mma_8x16
         const void*      b,
         const void*      beta,
               void*      c, inc_t rs_c0, inc_t cs_c0,
-              auxinfo_t* data,
+        const auxinfo_t* data,
         const cntx_t*    cntx
     )
 {
diff --git a/kernels/power10/3/bli_i4gemm_power10_mma.c b/kernels/power10/3/bli_i4gemm_power10_mma.c
index 5816b1d06..89dce8546 100644
--- a/kernels/power10/3/bli_i4gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i4gemm_power10_mma.c
@@ -70,7 +70,7 @@ void bli_i4gemm_power10_mma_8x16
         const void*      b,
         const void*      beta,
               void*      c, inc_t rs_c0, inc_t cs_c0,
-              auxinfo_t* data,
+        const auxinfo_t* data,
         const cntx_t*    cntx
     )
 {
diff --git a/kernels/power10/3/bli_i8gemm_power10_mma.c b/kernels/power10/3/bli_i8gemm_power10_mma.c
index 357c9af5e..8ea10d703 100644
--- a/kernels/power10/3/bli_i8gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i8gemm_power10_mma.c
@@ -70,7 +70,7 @@ void bli_i8gemm_power10_mma_8x16
         const void*      b,
         const void*      beta,
               void*      c, inc_t rs_c0, inc_t cs_c0,
-              auxinfo_t* data,
+        const auxinfo_t* data,
         const cntx_t*    cntx
     )
 {
diff --git a/kernels/power10/3/bli_sbgemm_power10_mma.c b/kernels/power10/3/bli_sbgemm_power10_mma.c
index c5edd60db..cc01c3576 100644
--- a/kernels/power10/3/bli_sbgemm_power10_mma.c
+++ b/kernels/power10/3/bli_sbgemm_power10_mma.c
@@ -71,7 +71,7 @@ void bli_sbgemm_power10_mma_8x16
         const void*      b,
         const void*      beta,
               void*      c, inc_t rs_c0, inc_t cs_c0,
-              auxinfo_t* data,
+        const auxinfo_t* data,
         const cntx_t*    cntx
     )
 {
diff --git a/kernels/power10/3/bli_sgemm_power10_mma.c b/kernels/power10/3/bli_sgemm_power10_mma.c
index 13b8dbab6..792bf3477 100644
--- a/kernels/power10/3/bli_sgemm_power10_mma.c
+++ b/kernels/power10/3/bli_sgemm_power10_mma.c
@@ -68,7 +68,7 @@ void bli_sgemm_power10_mma_8x16
         const void*      b,
         const void*      beta,
               void*      c, inc_t rs_c0, inc_t cs_c0,
-              auxinfo_t* data,
+        const auxinfo_t* data,
         const cntx_t*    cntx
     )
 {
diff --git a/kernels/power10/3/bli_shgemm_power10_mma.c b/kernels/power10/3/bli_shgemm_power10_mma.c
index 1a4624ecf..c2189195d 100644
--- a/kernels/power10/3/bli_shgemm_power10_mma.c
+++ b/kernels/power10/3/bli_shgemm_power10_mma.c
@@ -71,7 +71,7 @@ void bli_shgemm_power10_mma_8x16
         const void*      b,
         const void*      beta,
               void*      c, inc_t rs_c0, inc_t cs_c0,
-              auxinfo_t* data,
+        const auxinfo_t* data,
         const cntx_t*    cntx
     )
 {
diff --git a/kernels/power7/3/bli_gemm_power7_int_8x4.c b/kernels/power7/3/bli_gemm_power7_int_8x4.c
index 73e17ab03..efd5a36a2 100644
--- a/kernels/power7/3/bli_gemm_power7_int_8x4.c
+++ b/kernels/power7/3/bli_gemm_power7_int_8x4.c
@@ -58,7 +58,7 @@ void bli_sgemm_power7_int_8x4
         const void*      b0,
         const void*      beta0,
               void*      c0, inc_t rs_c, inc_t cs_c,
-              auxinfo_t* data,
+        const auxinfo_t* data,
         const cntx_t*    cntx
      )
 {
@@ -106,7 +106,7 @@ void bli_dgemm_power7_int_8x4
        const void*      b0,
        const void*      beta0,
              void*      c0, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -469,7 +469,7 @@ void bli_cgemm_power7_int_8x4
        const void*      b0,
        const void*      beta0,
              void*      c0, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -528,7 +528,7 @@ void bli_zgemm_power7_int_8x4
        const void*      b0,
        const void*      beta0,
              void*      c0, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c
index 2a4f6f025..a093b24be 100644
--- a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c
+++ b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c
@@ -45,7 +45,7 @@ void bli_dgemm_power9_asm_12x6
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/rviv/3/bli_cgemm_rviv_4vx4.c b/kernels/rviv/3/bli_cgemm_rviv_4vx4.c
index 9ef333a78..ccd18760b 100644
--- a/kernels/rviv/3/bli_cgemm_rviv_4vx4.c
+++ b/kernels/rviv/3/bli_cgemm_rviv_4vx4.c
@@ -54,7 +54,7 @@ void bli_cgemm_rviv_4vx4
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/rviv/3/bli_dgemm_rviv_4vx4.c b/kernels/rviv/3/bli_dgemm_rviv_4vx4.c
index e03716a5a..9791e56b9 100644
--- a/kernels/rviv/3/bli_dgemm_rviv_4vx4.c
+++ b/kernels/rviv/3/bli_dgemm_rviv_4vx4.c
@@ -54,7 +54,7 @@ void bli_dgemm_rviv_4vx4
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/rviv/3/bli_sgemm_rviv_4vx4.c b/kernels/rviv/3/bli_sgemm_rviv_4vx4.c
index c240d0391..1e7894f6f 100644
--- a/kernels/rviv/3/bli_sgemm_rviv_4vx4.c
+++ b/kernels/rviv/3/bli_sgemm_rviv_4vx4.c
@@ -55,7 +55,7 @@ void bli_sgemm_rviv_4vx4
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/rviv/3/bli_zgemm_rviv_4vx4.c b/kernels/rviv/3/bli_zgemm_rviv_4vx4.c
index 3d9940f9b..1aaa7649a 100644
--- a/kernels/rviv/3/bli_zgemm_rviv_4vx4.c
+++ b/kernels/rviv/3/bli_zgemm_rviv_4vx4.c
@@ -55,7 +55,7 @@ void bli_zgemm_rviv_4vx4
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c, inc_t cs_c,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c
index 5faf8b8dd..2b0e68063 100644
--- a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c
+++ b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c
@@ -50,7 +50,7 @@ void bli_sgemm_sandybridge_asm_8x8
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -542,7 +542,7 @@ void bli_dgemm_sandybridge_asm_8x4
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1004,7 +1004,7 @@ void bli_cgemm_sandybridge_asm_8x4
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
@@ -1707,7 +1707,7 @@ void bli_zgemm_sandybridge_asm_4x4
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c
index 38572285d..5c0c0c0d5 100644
--- a/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c
+++ b/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c
@@ -46,7 +46,7 @@ void bli_dgemm_sandybridge_int_8x4
        const void*      b0,
        const void*      beta0,
              void*      c0, inc_t rs_c0, inc_t cs_c0,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm.c
new file mode 100644
index 000000000..3ee4cdd20
--- /dev/null
+++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm.c
@@ -0,0 +1,1465 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "../riscv_cmul_macros_asm.h"
+#include "../bli_kernels_sifive_x280.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4 // bytes per float; 8 * FLT_SIZE is passed to vsetvli as the element width
+#define FLT_LOAD "flw " // scalar load mnemonic (not referenced by the float kernel below)
+#define VLE "vle32.v " // unit-stride vector load
+#define VLSE "vlse32.v " // strided vector load
+#define VSE "vse32.v " // unit-stride vector store
+#define VSSE "vsse32.v " // strided vector store
+#define VSSSEG8 "vssseg8e32.v " // strided segment stores with 8..2 fields per segment
+#define VSSSEG7 "vssseg7e32.v "
+#define VSSSEG6 "vssseg6e32.v "
+#define VSSSEG5 "vssseg5e32.v "
+#define VSSSEG4 "vssseg4e32.v "
+#define VSSSEG3 "vssseg3e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+#define NR 64 // padded column height of the NRxk branch; used as vl for its zero padding
+
+void bli_spackm_sifive_x280_asm_7m4
+     (
+             conj_t           conja, // not read by the fast paths; forwarded to the reference kernel
+             pack_t           schema, // only forwarded to the reference kernel
+             dim_t            cdim, // valid elements per column (stride inca in a, contiguous in p)
+             dim_t            cdim_max, // padded column height; 7 or 64 selects a fast path
+             dim_t            cdim_bcast, // the fast paths require 1
+             dim_t            n, // columns to pack (stride lda in a, ldp in p)
+             dim_t            n_max, // columns n..n_max-1 of p are zero-filled by the fast paths
+       const void*   restrict kappa_, // read as a single float scaling factor
+       const void*   restrict a_, inc_t inca, inc_t lda, // source, read as float
+             void*   restrict p_,             inc_t ldp, // destination, written as float
+       const void*   restrict params, // only forwarded to the reference kernel
+       const cntx_t*          cntx // only forwarded to the reference kernel
+     )
+{
+    (void) conja;
+    (void) cntx;
+    const float* kappa = kappa_;
+    const float* a = a_;
+    float* p = p_;
+    // Pack n columns of cdim floats from a into p, scaled by *kappa; the fast paths zero-pad p to cdim_max x n_max.
+    float kappa_cast = *kappa;
+
+    // MRxk kernel: cdim_max == 7, at most 7 valid elements per column
+    if (cdim <= 7 && cdim_max == 7 && cdim_bcast == 1)
+    {
+        if (lda == 1) { // consecutive columns are adjacent in a: one vector register per row
+            __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); // vl derived from n for the zeroing below
+            switch (cdim) { // deliberate fall-through: zero v<cdim>..v6, the rows that are never loaded
+                case 0: __asm__("vmv.v.i v0, 0");
+                case 1: __asm__("vmv.v.i v1, 0");
+                case 2: __asm__("vmv.v.i v2, 0");
+                case 3: __asm__("vmv.v.i v3, 0");
+                case 4: __asm__("vmv.v.i v4, 0");
+                case 5: __asm__("vmv.v.i v5, 0");
+                case 6: __asm__("vmv.v.i v6, 0");
+            }
+            a += (cdim - 1) * inca; // point at the last valid row; the loads below walk backwards
+            size_t avl = n; // columns still to pack
+            while (avl) {
+                const float* a_tmp = a;
+                size_t vl; // columns handled in this pass
+                __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+                switch (cdim) { // deliberate fall-through: row k is loaded into v<k>, from row cdim-1 down to row 0
+                    case 7:
+                        __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 6:
+                        __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 5:
+                        __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 4:
+                        __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 3:
+                        __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 2:
+                        __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 1:
+                        __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+                }
+                if (kappa_cast != 1.f) { // skip the multiply for unit kappa
+                    switch (cdim) { // deliberate fall-through: scale only the loaded rows
+                        case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+                        case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+                        case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+                        case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+                        case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+                        case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+                        case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+                    }
+                }
+                __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp)); // element j of v0..v6 -> 7 consecutive floats at p + j*ldp
+                a += vl;
+                p += vl * ldp;
+                avl -= vl;
+            }
+            __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // vl = 7 (presumes VLMAX >= 7)
+            __asm__("vmv.v.i v0, 0");
+            for (size_t i = n; i < n_max; ++i) { // zero-fill columns n..n_max-1
+                __asm__(VSE "v0, (%0)" : : "r"(p));
+                p += ldp;
+            }
+        }
+        else { // general lda: pack one column per iteration
+            inca *= FLT_SIZE; // element stride -> byte stride, as the strided load expects
+            __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0"); // elements cdim..6 of v0 stay zero: the loop only writes the first cdim with tail-undisturbed
+            for (size_t i = 0; i < n; ++i) {
+                __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); // vl from cdim, tail undisturbed
+                if (inca == FLT_SIZE) { // contiguous column: unit-stride load
+                    __asm__(VLE "v0, (%0)" : : "r"(a));
+                }
+                else {
+                    __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
+                }
+                if (kappa_cast != 1.f) {
+                    __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+                }
+                __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // back to 7 elements so the store includes the zero padding
+                __asm__(VSE "v0, (%0)" : : "r"(p));
+                a += lda;
+                p += ldp;
+            }
+            __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0");
+            for (size_t i = n; i < n_max; ++i) { // zero-fill columns n..n_max-1
+                __asm__(VSE "v0, (%0)" : : "r"(p));
+                p += ldp;
+            }
+        }
+    }
+    // NRxk kernel: cdim_max == 64 (the value of NR), at most 64 valid elements per column
+    else if (cdim <= 64 && cdim_max == 64 && cdim_bcast == 1)
+    {
+        if (lda == 1) {
+            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); // vl = NR, presuming VLMAX at LMUL=4 >= NR -- TODO confirm
+            __asm__("vmv.v.i v8, 0"); // v8 group holds the zeros used for all padding in this branch
+            size_t avl = n; // columns still to pack
+            while (avl) {
+                size_t vl; // columns handled in this pass
+                __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+                dim_t cdim_tmp = cdim; // rows still to pack for these columns
+                const float* a_tmp = a;
+                float* p_tmp = p;
+                while (cdim_tmp >= 8) { // full groups of 8 rows: load into v0..v7, then one 8-field segment store
+                    __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLE "v7, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    if (kappa_cast != 1.f) {
+                        __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+                        __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+                        __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+                        __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+                        __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+                        __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+                        __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+                        __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast));
+                    }
+                    __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                    p_tmp += 8;
+                    cdim_tmp -= 8;
+                }
+                if (cdim_tmp > 0) { // 1..7 leftover rows
+                    a_tmp += (cdim_tmp - 1) * inca; // point at the last leftover row; the loads below walk backwards
+                    switch (cdim_tmp) { // deliberate fall-through: leftover row k is loaded into v<k>
+                        case 7:
+                            __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+                            a_tmp -= inca;
+                        case 6:
+                            __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+                            a_tmp -= inca;
+                        case 5:
+                            __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+                            a_tmp -= inca;
+                        case 4:
+                            __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+                            a_tmp -= inca;
+                        case 3:
+                            __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+                            a_tmp -= inca;
+                        case 2:
+                            __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+                            a_tmp -= inca;
+                        case 1:
+                            __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+                    }
+                    if (kappa_cast != 1.f) {
+                        switch (cdim_tmp) { // deliberate fall-through: scale only the loaded rows
+                            case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+                            case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+                            case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+                            case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+                            case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+                            case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+                            case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+                        }
+                    }
+                    switch (cdim_tmp) { // store with exactly cdim_tmp fields per segment (no fall-through here)
+                        case 7:
+                            __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                            break;
+                        case 6:
+                            __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                            break;
+                        case 5:
+                            __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                            break;
+                        case 4:
+                            __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                            break;
+                        case 3:
+                            __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                            break;
+                        case 2:
+                            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                            break;
+                        case 1:
+                            __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                            break;
+                    }
+                    p_tmp += cdim_tmp; // p_tmp is now p + cdim: first padding slot of the first column
+                }
+                __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); // vl = padding length per column
+                for (size_t i = 0; i < vl; ++i) { // zero-pad rows cdim..NR-1 of each column just written
+                    __asm__(VSE "v8, (%0)" : : "r"(p_tmp));
+                    p_tmp += ldp;
+                }
+                a += vl;
+                p += vl * ldp;
+                avl -= vl;
+            }
+            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+            for (size_t i = n; i < n_max; ++i) { // zero-fill columns n..n_max-1
+                __asm__(VSE "v8, (%0)" : : "r"(p));
+                p += ldp;
+            }
+        }
+        else { // general lda: pack one column per iteration
+            inca *= FLT_SIZE; // element stride -> byte stride, as the strided load expects
+            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0"); // elements cdim..NR-1 of the v0 group stay zero: the loop only writes the first cdim with tail-undisturbed
+            for (size_t i = 0; i < n; ++i) {
+                __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); // vl from cdim, tail undisturbed
+                if (inca == FLT_SIZE) { // contiguous column: unit-stride load
+                    __asm__(VLE "v0, (%0)" : : "r"(a));
+                }
+                else {
+                    __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
+                }
+                if (kappa_cast != 1.f) {
+                    __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+                }
+                __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); // back to NR elements so the store includes the zero padding
+                __asm__(VSE "v0, (%0)" : : "r"(p));
+                a += lda;
+                p += ldp;
+            }
+            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0");
+            for (size_t i = n; i < n_max; ++i) { // zero-fill columns n..n_max-1
+                __asm__(VSE "v0, (%0)" : : "r"(p));
+                p += ldp;
+            }
+        }
+    }
+    // generic kernel: any other (cdim, cdim_max, cdim_bcast) combination is delegated unchanged
+    else
+    {
+        bli_sspackm_sifive_x280_ref
+        (
+          conja,
+          schema,
+          cdim,
+          cdim_max,
+          cdim_bcast,
+          n,
+          n_max,
+          kappa,
+          a, inca, lda,
+          p,       ldp,
+          params,
+          cntx
+        );
+    }
+}
+
+#undef FLT_SIZE // drop the single-precision definitions before the double-precision set is defined
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef VSSSEG8
+#undef VSSSEG7
+#undef VSSSEG6
+#undef VSSSEG5
+#undef VSSSEG4
+#undef VSSSEG3
+#undef VSSSEG2
+#undef NR
+
+#define FLT_SIZE 8 // bytes per double; 8 * FLT_SIZE is passed to vsetvli as the element width
+#define FLT_LOAD "fld " // scalar load mnemonic (not referenced by the double kernel below)
+#define VLE "vle64.v " // unit-stride vector load
+#define VLSE "vlse64.v " // strided vector load
+#define VSE "vse64.v " // unit-stride vector store
+#define VSSE "vsse64.v " // strided vector store
+#define VSSSEG8 "vssseg8e64.v " // strided segment stores with 8..2 fields per segment
+#define VSSSEG7 "vssseg7e64.v "
+#define VSSSEG6 "vssseg6e64.v "
+#define VSSSEG5 "vssseg5e64.v "
+#define VSSSEG4 "vssseg4e64.v "
+#define VSSSEG3 "vssseg3e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+#define NR 32 // padded column height of the NRxk branch; used as vl for its zero padding
+
+void bli_dpackm_sifive_x280_asm_7m4
+     (
+             conj_t           conja, // not read by the fast paths; forwarded to the reference kernel
+             pack_t           schema, // only forwarded to the reference kernel
+             dim_t            cdim, // valid elements per column (stride inca in a, contiguous in p)
+             dim_t            cdim_max, // padded column height; 7 or 32 selects a fast path
+             dim_t            cdim_bcast, // the fast paths require 1
+             dim_t            n, // columns to pack (stride lda in a, ldp in p)
+             dim_t            n_max, // columns n..n_max-1 of p are zero-filled by the fast paths
+       const void*   restrict kappa_, // read as a single double scaling factor
+       const void*   restrict a_, inc_t inca, inc_t lda, // source, read as double
+             void*   restrict p_,             inc_t ldp, // destination, written as double
+       const void*   restrict params, // only forwarded to the reference kernel
+       const cntx_t*          cntx // only forwarded to the reference kernel
+     )
+{
+    (void) conja;
+    (void) cntx;
+    const double* kappa = kappa_;
+    const double* a = a_;
+    double* p = p_;
+    // Pack n columns of cdim doubles from a into p, scaled by *kappa; the fast paths zero-pad p to cdim_max x n_max.
+    double kappa_cast = *kappa;
+
+    // MRxk kernel: cdim_max == 7, at most 7 valid elements per column
+    if (cdim <= 7 && cdim_max == 7 && cdim_bcast == 1)
+    {
+        if (lda == 1) { // consecutive columns are adjacent in a: one vector register per row
+            __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); // vl derived from n for the zeroing below
+            switch (cdim) { // deliberate fall-through: zero v<cdim>..v6, the rows that are never loaded
+                case 0: __asm__("vmv.v.i v0, 0");
+                case 1: __asm__("vmv.v.i v1, 0");
+                case 2: __asm__("vmv.v.i v2, 0");
+                case 3: __asm__("vmv.v.i v3, 0");
+                case 4: __asm__("vmv.v.i v4, 0");
+                case 5: __asm__("vmv.v.i v5, 0");
+                case 6: __asm__("vmv.v.i v6, 0");
+            }
+            a += (cdim - 1) * inca; // point at the last valid row; the loads below walk backwards
+            size_t avl = n; // columns still to pack
+            while (avl) {
+                const double* a_tmp = a;
+                size_t vl; // columns handled in this pass
+                __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+                switch (cdim) { // deliberate fall-through: row k is loaded into v<k>, from row cdim-1 down to row 0
+                    case 7:
+                        __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 6:
+                        __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 5:
+                        __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 4:
+                        __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 3:
+                        __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 2:
+                        __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 1:
+                        __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+                }
+                if (kappa_cast != 1.) { // skip the multiply for unit kappa
+                    switch (cdim) { // deliberate fall-through: scale only the loaded rows
+                        case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+                        case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+                        case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+                        case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+                        case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+                        case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+                        case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+                    }
+                }
+                __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp)); // element j of v0..v6 -> 7 consecutive doubles at p + j*ldp
+                a += vl;
+                p += vl * ldp;
+                avl -= vl;
+            }
+            __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // vl = 7 (presumes VLMAX >= 7)
+            __asm__("vmv.v.i v0, 0");
+            for (size_t i = n; i < n_max; ++i) { // zero-fill columns n..n_max-1
+                __asm__(VSE "v0, (%0)" : : "r"(p));
+                p += ldp;
+            }
+        }
+        else { // general lda: pack one column per iteration
+            inca *= FLT_SIZE; // element stride -> byte stride, as the strided load expects
+            __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0"); // elements cdim..6 of v0 stay zero: the loop only writes the first cdim with tail-undisturbed
+            for (size_t i = 0; i < n; ++i) {
+                __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); // vl from cdim, tail undisturbed
+                if (inca == FLT_SIZE) { // contiguous column: unit-stride load
+                    __asm__(VLE "v0, (%0)" : : "r"(a));
+                }
+                else {
+                    __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
+                }
+                if (kappa_cast != 1.) {
+                    __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+                }
+                __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // back to 7 elements so the store includes the zero padding
+                __asm__(VSE "v0, (%0)" : : "r"(p));
+                a += lda;
+                p += ldp;
+            }
+            __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0");
+            for (size_t i = n; i < n_max; ++i) { // zero-fill columns n..n_max-1
+                __asm__(VSE "v0, (%0)" : : "r"(p));
+                p += ldp;
+            }
+        }
+    }
+    // NRxk kernel: cdim_max == 32 (the value of NR), at most 32 valid elements per column
+    else if (cdim <= 32 && cdim_max == 32 && cdim_bcast == 1)
+    {
+        if (lda == 1) {
+            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); // vl = NR, presuming VLMAX at LMUL=4 >= NR -- TODO confirm
+            __asm__("vmv.v.i v8, 0"); // v8 group holds the zeros used for all padding in this branch
+            size_t avl = n; // columns still to pack
+            while (avl) {
+                size_t vl; // columns handled in this pass
+                __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+                dim_t cdim_tmp = cdim; // rows still to pack for these columns
+                const double* a_tmp = a;
+                double* p_tmp = p;
+                while (cdim_tmp >= 8) { // full groups of 8 rows: load into v0..v7, then one 8-field segment store
+                    __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLE "v7, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    if (kappa_cast != 1.) {
+                        __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+                        __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+                        __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+                        __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+                        __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+                        __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+                        __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+                        __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast));
+                    }
+                    __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                    p_tmp += 8;
+                    cdim_tmp -= 8;
+                }
+                if (cdim_tmp > 0) { // 1..7 leftover rows
+                    a_tmp += (cdim_tmp - 1) * inca; // point at the last leftover row; the loads below walk backwards
+                    switch (cdim_tmp) { // deliberate fall-through: leftover row k is loaded into v<k>
+                        case 7:
+                            __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+                            a_tmp -= inca;
+                        case 6:
+                            __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+                            a_tmp -= inca;
+                        case 5:
+                            __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+                            a_tmp -= inca;
+                        case 4:
+                            __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+                            a_tmp -= inca;
+                        case 3:
+                            __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+                            a_tmp -= inca;
+                        case 2:
+                            __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+                            a_tmp -= inca;
+                        case 1:
+                            __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+                    }
+                    if (kappa_cast != 1.) {
+                        switch (cdim_tmp) { // deliberate fall-through: scale only the loaded rows
+                            case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+                            case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+                            case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+                            case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+                            case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+                            case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+                            case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+                        }
+                    }
+                    switch (cdim_tmp) { // store with exactly cdim_tmp fields per segment (no fall-through here)
+                        case 7:
+                            __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                            break;
+                        case 6:
+                            __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                            break;
+                        case 5:
+                            __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                            break;
+                        case 4:
+                            __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                            break;
+                        case 3:
+                            __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                            break;
+                        case 2:
+                            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                            break;
+                        case 1:
+                            __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+                            break;
+                    }
+                    p_tmp += cdim_tmp; // p_tmp is now p + cdim: first padding slot of the first column
+                }
+                __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); // vl = padding length per column
+                for (size_t i = 0; i < vl; ++i) { // zero-pad rows cdim..NR-1 of each column just written
+                    __asm__(VSE "v8, (%0)" : : "r"(p_tmp));
+                    p_tmp += ldp;
+                }
+                a += vl;
+                p += vl * ldp;
+                avl -= vl;
+            }
+            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+            for (size_t i = n; i < n_max; ++i) { // zero-fill columns n..n_max-1
+                __asm__(VSE "v8, (%0)" : : "r"(p));
+                p += ldp;
+            }
+        }
+        else { // general lda: pack one column per iteration
+            inca *= FLT_SIZE; // element stride -> byte stride, as the strided load expects
+            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0"); // elements cdim..NR-1 of the v0 group stay zero: the loop only writes the first cdim with tail-undisturbed
+            for (size_t i = 0; i < n; ++i) {
+                __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); // vl from cdim, tail undisturbed
+                if (inca == FLT_SIZE) { // contiguous column: unit-stride load
+                    __asm__(VLE "v0, (%0)" : : "r"(a));
+                }
+                else {
+                    __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
+                }
+                if (kappa_cast != 1.) {
+                    __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+                }
+                __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); // back to NR elements so the store includes the zero padding
+                __asm__(VSE "v0, (%0)" : : "r"(p));
+                a += lda;
+                p += ldp;
+            }
+            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0");
+            for (size_t i = n; i < n_max; ++i) { // zero-fill columns n..n_max-1
+                __asm__(VSE "v0, (%0)" : : "r"(p));
+                p += ldp;
+            }
+        }
+    }
+    // generic kernel: any other (cdim, cdim_max, cdim_bcast) combination is delegated unchanged
+    else
+    {
+        bli_ddpackm_sifive_x280_ref
+        (
+          conja,
+          schema,
+          cdim,
+          cdim_max,
+          cdim_bcast,
+          n,
+          n_max,
+          kappa,
+          a, inca, lda,
+          p,       ldp,
+          params,
+          cntx
+        );
+    }
+}
+
+#undef FLT_SIZE // drop the double-precision definitions before the single-complex set is defined
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef VSSSEG8
+#undef VSSSEG7
+#undef VSSSEG6
+#undef VSSSEG5
+#undef VSSSEG4
+#undef VSSSEG3
+#undef VSSSEG2
+#undef NR
+
+#define FLT_SIZE 4 // bytes per float component; the complex kernel scales strides by 2 * FLT_SIZE
+#define VLSEG2 "vlseg2e32.v " // unit-stride 2-field segment load
+#define VLSSEG2 "vlsseg2e32.v " // strided 2-field segment load
+#define VSSEG2 "vsseg2e32.v " // unit-stride 2-field segment store
+#define VSSSEG2 "vssseg2e32.v " // strided segment stores with 2, 4, 6 or 8 fields per segment
+#define VSSSEG4 "vssseg4e32.v "
+#define VSSSEG6 "vssseg6e32.v "
+#define VSSSEG8 "vssseg8e32.v "
+#define NR 32 // presumably the padded column height of the complex NRxk branch -- confirm against the kernel body
+
+void bli_cpackm_sifive_x280_asm_6m2
+     (
+             conj_t           conja,
+             pack_t           schema,
+             dim_t            cdim,
+             dim_t            cdim_max,
+             dim_t            cdim_bcast,
+             dim_t            n,
+             dim_t            n_max,
+       const void*   restrict kappa_,
+       const void*   restrict a_, inc_t inca, inc_t lda,
+             void*   restrict p_,             inc_t ldp,
+       const void*   restrict params,
+       const cntx_t*          cntx
+     )
+{
+    (void) cntx;
+    const scomplex* kappa = kappa_;
+    const scomplex* a = a_;
+    scomplex* p = p_;
+
+    scomplex kappa_cast = *kappa;
+
+    // MRxk kernel
+    if (cdim <= 6 && cdim_max == 6 && cdim_bcast == 1)
+    {
+        if (lda == 1) {
+            __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+            if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+                switch (cdim) {
+                    case 0:
+                        __asm__("vmv.v.i v0, 0");
+                        __asm__("vmv.v.i v1, 0");
+                    case 1:
+                        __asm__("vmv.v.i v2, 0");
+                        __asm__("vmv.v.i v3, 0");
+                    case 2:
+                        __asm__("vmv.v.i v4, 0");
+                        __asm__("vmv.v.i v5, 0");
+                    case 3:
+                        __asm__("vmv.v.i v6, 0");
+                        __asm__("vmv.v.i v7, 0");
+                    case 4:
+                        __asm__("vmv.v.i v8, 0");
+                        __asm__("vmv.v.i v9, 0");
+                    case 5:
+                        __asm__("vmv.v.i v10, 0");
+                        __asm__("vmv.v.i v11, 0");
+                }
+            }
+            else {
+                switch (cdim) {
+                    case 0:
+                        __asm__("vmv.v.i v12, 0");
+                        __asm__("vmv.v.i v13, 0");
+                    case 1:
+                        __asm__("vmv.v.i v14, 0");
+                        __asm__("vmv.v.i v15, 0");
+                    case 2:
+                        __asm__("vmv.v.i v16, 0");
+                        __asm__("vmv.v.i v17, 0");
+                    case 3:
+                        __asm__("vmv.v.i v18, 0");
+                        __asm__("vmv.v.i v19, 0");
+                    case 4:
+                        __asm__("vmv.v.i v20, 0");
+                        __asm__("vmv.v.i v21, 0");
+                    case 5:
+                        __asm__("vmv.v.i v22, 0");
+                        __asm__("vmv.v.i v23, 0");
+                }
+            }
+            a += (cdim - 1) * inca;
+            size_t avl = n;
+            while (avl) {
+                const scomplex* a_tmp = a;
+                size_t vl;
+                __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+                switch (cdim) {
+                    case 6:
+                        __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 5:
+                        __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 4:
+                        __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 3:
+                        __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 2:
+                        __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 1:
+                        __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+                }
+                if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+                    if (conja == BLIS_CONJUGATE) {
+                        switch (cdim) {
+                            case 6: __asm__("vfneg.v v11, v11");
+                            case 5: __asm__("vfneg.v v9, v9");
+                            case 4: __asm__("vfneg.v v7, v7");
+                            case 3: __asm__("vfneg.v v5, v5");
+                            case 2: __asm__("vfneg.v v3, v3");
+                            case 1: __asm__("vfneg.v v1, v1");
+                        }
+                    }
+                    __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
+                    __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
+                }
+                else {
+                    if (conja == BLIS_NO_CONJUGATE) {
+                        switch (cdim) {
+                            case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
+                            case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
+                            case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
+                            case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
+                            case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
+                            case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
+                        }
+                    }
+                    else {
+                        switch (cdim) {
+                            case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
+                            case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
+                            case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
+                            case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
+                            case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
+                            case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
+                        }
+                    }
+                    __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
+                    __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
+                }
+                a += vl;
+                p += vl * ldp;
+                avl -= vl;
+            }
+            __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0");
+            __asm__("vmv.v.i v1, 0");
+            for (size_t i = n; i < n_max; ++i) {
+                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+                p += ldp;
+            }
+        }
+        else {
+            inca *= 2 * FLT_SIZE;
+            __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0");
+            __asm__("vmv.v.i v1, 0");
+            __asm__("vmv.v.i v2, 0");
+            __asm__("vmv.v.i v3, 0");
+            for (size_t i = 0; i < n; ++i) {
+                __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+                if (inca == 2 * FLT_SIZE) {
+                    __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
+                }
+                else {
+                    __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
+                }
+                if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+                    if (conja == BLIS_CONJUGATE) {
+                        __asm__("vfneg.v v1, v1");
+                    }
+                    __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+                    __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+                }
+                else {
+                    if (conja == BLIS_NO_CONJUGATE) {
+                        vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
+                    }
+                    else {
+                        vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
+                    }
+                    __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+                    __asm__(VSSEG2 "v2, (%0)" : : "r"(p));
+                }
+                a += lda;
+                p += ldp;
+            }
+            __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0");
+            __asm__("vmv.v.i v1, 0");
+            for (size_t i = n; i < n_max; ++i) {
+                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+                p += ldp;
+            }
+        }
+    }
+    // NRxk kernel
+    else if (cdim <= 32 && cdim_max == 32 && cdim_bcast == 1)
+    {
+        if (lda == 1) {
+            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v16, 0");
+            __asm__("vmv.v.i v18, 0");
+            size_t avl = n;
+            while (avl) {
+                size_t vl;
+                __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+                dim_t cdim_tmp = cdim;
+                const scomplex* a_tmp = a;
+                scomplex* p_tmp = p;
+                while (cdim_tmp >= 4) {
+                    __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+                        if (conja == BLIS_CONJUGATE) {
+                            __asm__("vfneg.v v1, v1");
+                            __asm__("vfneg.v v3, v3");
+                            __asm__("vfneg.v v5, v5");
+                            __asm__("vfneg.v v7, v7");
+                        }
+                        __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                    }
+                    else {
+                        if (conja == BLIS_NO_CONJUGATE) {
+                            vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+                            vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+                            vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+                            vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
+                        }
+                        else {
+                            vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+                            vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+                            vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+                            vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
+                        }
+                        __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                    }
+                    p_tmp += 4;
+                    cdim_tmp -= 4;
+                }
+                if (cdim_tmp > 0) {
+                    a_tmp += (cdim_tmp - 1) * inca;
+                    switch (cdim_tmp) {
+                        case 3:
+                            __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+                            a_tmp -= inca;
+                        case 2:
+                            __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+                            a_tmp -= inca;
+                        case 1:
+                            __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+                    }
+                    if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+                        if (conja == BLIS_CONJUGATE) {
+                            switch (cdim_tmp) {
+                                case 3: __asm__("vfneg.v v5, v5");
+                                case 2: __asm__("vfneg.v v3, v3");
+                                case 1: __asm__("vfneg.v v1, v1");
+                            }
+                        }
+                        switch (cdim_tmp) {
+                            case 3:
+                                __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                                break;
+                            case 2:
+                                __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                                break;
+                            case 1:
+                                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                                break;
+                        }
+                    }
+                    else {
+                        if (conja == BLIS_NO_CONJUGATE) {
+                            switch (cdim_tmp) {
+                                case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+                                case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+                                case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+                            }
+                        }
+                        else {
+                            switch (cdim_tmp) {
+                                case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+                                case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+                                case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+                            }
+                        }
+                        switch (cdim_tmp) {
+                            case 3:
+                                __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                                break;
+                            case 2:
+                                __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                                break;
+                            case 1:
+                                __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                                break;
+                        }
+                    }
+                    p_tmp += cdim_tmp;
+                }
+                __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
+                for (size_t i = 0; i < vl; ++i) {
+                    __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp));
+                    p_tmp += ldp;
+                }
+                a += vl;
+                p += vl * ldp;
+                avl -= vl;
+            }
+            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+            for (size_t i = n; i < n_max; ++i) {
+                __asm__(VSSEG2 "v16, (%0)" : : "r"(p));
+                p += ldp;
+            }
+        }
+        else {
+            inca *= 2 * FLT_SIZE;
+            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0");
+            __asm__("vmv.v.i v2, 0");
+            __asm__("vmv.v.i v4, 0");
+            __asm__("vmv.v.i v6, 0");
+            for (size_t i = 0; i < n; ++i) {
+                __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+                if (inca == 2 * FLT_SIZE) {
+                    __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
+                }
+                else {
+                    __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
+                }
+                if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+                    if (conja == BLIS_CONJUGATE) {
+                        __asm__("vfneg.v v2, v2");
+                    }
+                    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+                    __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+                }
+                else {
+                    if (conja == BLIS_NO_CONJUGATE) {
+                        vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
+                    }
+                    else {
+                        vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
+                    }
+                    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+                    __asm__(VSSEG2 "v4, (%0)" : : "r"(p));
+                }
+                a += lda;
+                p += ldp;
+            }
+            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0");
+            __asm__("vmv.v.i v2, 0");
+            for (size_t i = n; i < n_max; ++i) {
+                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+                p += ldp;
+            }
+        }
+    }
+    // generic kernel
+    else
+    {
+        bli_ccpackm_sifive_x280_ref
+        (
+          conja,
+          schema,
+          cdim,
+          cdim_max,
+          cdim_bcast,
+          n,
+          n_max,
+          kappa,
+          a, inca, lda,
+          p,       ldp,
+          params,
+          cntx
+        );
+    }
+}
+
+#undef FLT_SIZE // drop the macro values used by the preceding scomplex kernel so they can be redefined below
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef VSSSEG4
+#undef VSSSEG6
+#undef VSSSEG8
+#undef NR
+
+#define FLT_SIZE 8 // bytes per real/imag component; 8 * FLT_SIZE is the SEW, 2 * FLT_SIZE the byte size of one complex element
+#define VLSEG2 "vlseg2e64.v " // unit-stride segment load, 2 fields per element
+#define VLSSEG2 "vlsseg2e64.v " // strided segment load, 2 fields per element
+#define VSSEG2 "vsseg2e64.v " // unit-stride segment store, 2 fields per element
+#define VSSSEG2 "vssseg2e64.v " // strided segment store, 2 fields (1 complex value) per element
+#define VSSSEG4 "vssseg4e64.v " // strided segment store, 4 fields (2 complex values) per element
+#define VSSSEG6 "vssseg6e64.v " // strided segment store, 6 fields (3 complex values) per element
+#define VSSSEG8 "vssseg8e64.v " // strided segment store, 8 fields (4 complex values) per element
+#define NR 16 // vector length requested by the cdim_max == 16 path below
+
+void bli_zpackm_sifive_x280_asm_6m2 // dcomplex pack kernel: copies (optionally conjugated and scaled by kappa) a cdim x n panel of a into p
+     (
+             conj_t           conja, // BLIS_CONJUGATE selects conjugation while packing
+             pack_t           schema, // not read here; only forwarded to the reference kernel
+             dim_t            cdim, // number of entries read from a along the short (inca) direction
+             dim_t            cdim_max, // 6 or 16 (with cdim_bcast == 1) selects one of the asm paths
+             dim_t            cdim_bcast, // the asm paths require 1
+             dim_t            n, // number of entries read from a along the long (lda) direction
+             dim_t            n_max, // the asm paths zero-fill p for indices n..n_max-1
+       const void*   restrict kappa_, // read as a dcomplex scaling factor
+       const void*   restrict a_, inc_t inca, inc_t lda, // source; inca/lda are applied as dcomplex element strides
+             void*   restrict p_,             inc_t ldp, // destination; ldp is applied as a dcomplex element stride
+       const void*   restrict params, // not read here; only forwarded to the reference kernel
+       const cntx_t*          cntx
+     )
+{
+    (void) cntx;
+    const dcomplex* kappa = kappa_;
+    const dcomplex* a = a_;
+    dcomplex* p = p_;
+
+    dcomplex kappa_cast = *kappa; // local copy of the scaling factor
+
+    // MRxk kernel: panels with cdim_max == 6
+    if (cdim <= 6 && cdim_max == 6 && cdim_bcast == 1)
+    {
+        if (lda == 1) { // entries along n are adjacent in a: vectorize along n, one register pair per cdim index
+            __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); // set vl from n before zeroing the registers of the unused cdim indices
+            if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { // unit kappa: v0..v11 are stored directly, so pad those
+                switch (cdim) { // no breaks: entering at case cdim zeroes the pairs for indices cdim..5
+                    case 0:
+                        __asm__("vmv.v.i v0, 0");
+                        __asm__("vmv.v.i v1, 0");
+                    case 1:
+                        __asm__("vmv.v.i v2, 0");
+                        __asm__("vmv.v.i v3, 0");
+                    case 2:
+                        __asm__("vmv.v.i v4, 0");
+                        __asm__("vmv.v.i v5, 0");
+                    case 3:
+                        __asm__("vmv.v.i v6, 0");
+                        __asm__("vmv.v.i v7, 0");
+                    case 4:
+                        __asm__("vmv.v.i v8, 0");
+                        __asm__("vmv.v.i v9, 0");
+                    case 5:
+                        __asm__("vmv.v.i v10, 0");
+                        __asm__("vmv.v.i v11, 0");
+                }
+            }
+            else { // general kappa: the scaled results in v12..v23 are stored, so pad those instead
+                switch (cdim) { // no breaks: entering at case cdim zeroes the pairs for indices cdim..5
+                    case 0:
+                        __asm__("vmv.v.i v12, 0");
+                        __asm__("vmv.v.i v13, 0");
+                    case 1:
+                        __asm__("vmv.v.i v14, 0");
+                        __asm__("vmv.v.i v15, 0");
+                    case 2:
+                        __asm__("vmv.v.i v16, 0");
+                        __asm__("vmv.v.i v17, 0");
+                    case 3:
+                        __asm__("vmv.v.i v18, 0");
+                        __asm__("vmv.v.i v19, 0");
+                    case 4:
+                        __asm__("vmv.v.i v20, 0");
+                        __asm__("vmv.v.i v21, 0");
+                    case 5:
+                        __asm__("vmv.v.i v22, 0");
+                        __asm__("vmv.v.i v23, 0");
+                }
+            }
+            a += (cdim - 1) * inca; // point at the last used cdim index; the loads below walk backwards from it
+            size_t avl = n; // entries along n still to be packed
+            while (avl) {
+                const dcomplex* a_tmp = a;
+                size_t vl; // entries along n handled in this pass
+                __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+                switch (cdim) { // no breaks: loads index cdim-1 down to 0, two fields of each element into v(2i), v(2i+1)
+                    case 6:
+                        __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 5:
+                        __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 4:
+                        __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 3:
+                        __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 2:
+                        __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+                        a_tmp -= inca;
+                    case 1:
+                        __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+                }
+                if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { // unit kappa: no multiplication needed
+                    if (conja == BLIS_CONJUGATE) {
+                        switch (cdim) { // no breaks: negate the second field (odd register) of every loaded pair
+                            case 6: __asm__("vfneg.v v11, v11");
+                            case 5: __asm__("vfneg.v v9, v9");
+                            case 4: __asm__("vfneg.v v7, v7");
+                            case 3: __asm__("vfneg.v v5, v5");
+                            case 2: __asm__("vfneg.v v3, v3");
+                            case 1: __asm__("vfneg.v v1, v1");
+                        }
+                    }
+                    __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); // indices 0..2 (v0..v5), consecutive n entries ldp elements apart
+                    __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); // indices 3..5 (v6..v11), including the zeroed padding pairs
+                }
+                else {
+                    if (conja == BLIS_NO_CONJUGATE) {
+                        switch (cdim) { // no breaks; vcmul_vf2 is defined outside this view, presumably (dst_re, dst_im) = (src_re, src_im) * kappa
+                            case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
+                            case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
+                            case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
+                            case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
+                            case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
+                            case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
+                        }
+                    }
+                    else {
+                        switch (cdim) { // no breaks; vcmul_vf_conj2 is defined outside this view, presumably the conjugating variant of vcmul_vf2
+                            case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
+                            case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
+                            case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
+                            case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
+                            case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
+                            case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
+                        }
+                    }
+                    __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); // indices 0..2 of the scaled result (v12..v17)
+                    __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); // indices 3..5 of the scaled result (v18..v23)
+                }
+                a += vl; // lda == 1, so advancing vl entries along n is vl elements
+                p += vl * ldp;
+                avl -= vl;
+            }
+            __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // vl = 6: one packed group of p per store
+            __asm__("vmv.v.i v0, 0");
+            __asm__("vmv.v.i v1, 0");
+            for (size_t i = n; i < n_max; ++i) { // zero-fill the padding groups n..n_max-1
+                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+                p += ldp;
+            }
+        }
+        else { // lda != 1: handle one n index per iteration, vectorizing along cdim
+            inca *= 2 * FLT_SIZE; // from here on inca is a byte stride (it feeds the strided load below)
+            __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0"); // pre-zero all 6 elements of the load targets (v0, v1) and scaled results (v2, v3)
+            __asm__("vmv.v.i v1, 0");
+            __asm__("vmv.v.i v2, 0");
+            __asm__("vmv.v.i v3, 0");
+            for (size_t i = 0; i < n; ++i) {
+                __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); // tail-undisturbed: elements cdim..5 keep their zeros
+                if (inca == 2 * FLT_SIZE) { // elements along cdim are contiguous: unit-stride load
+                    __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
+                }
+                else {
+                    __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
+                }
+                if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+                    if (conja == BLIS_CONJUGATE) {
+                        __asm__("vfneg.v v1, v1"); // negate the second field of each element
+                    }
+                    __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // store all 6 elements, padding included
+                    __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+                }
+                else {
+                    if (conja == BLIS_NO_CONJUGATE) {
+                        vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); // NOTE(review): padding in v2/v3 stays zero only if the macro writes just the first vl elements - confirm
+                    }
+                    else {
+                        vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
+                    }
+                    __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); // store all 6 elements, padding included
+                    __asm__(VSSEG2 "v2, (%0)" : : "r"(p));
+                }
+                a += lda;
+                p += ldp;
+            }
+            __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0"); // v0/v1 hold loaded data at this point; clear them for the padding stores
+            __asm__("vmv.v.i v1, 0");
+            for (size_t i = n; i < n_max; ++i) { // zero-fill the padding groups n..n_max-1
+                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+                p += ldp;
+            }
+        }
+    }
+    // NRxk kernel: panels with cdim_max == 16
+    else if (cdim <= 16 && cdim_max == 16 && cdim_bcast == 1)
+    {
+        if (lda == 1) { // entries along n are adjacent in a: vectorize along n, 4 cdim indices per pass
+            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v16, 0"); // LMUL=2 zero groups starting at v16 and v18, used as the source of all padding stores below
+            __asm__("vmv.v.i v18, 0");
+            size_t avl = n; // entries along n still to be packed
+            while (avl) {
+                size_t vl; // entries along n handled in this pass
+                __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+                dim_t cdim_tmp = cdim; // cdim indices still to be packed for this strip
+                const dcomplex* a_tmp = a;
+                dcomplex* p_tmp = p;
+                while (cdim_tmp >= 4) { // full blocks of 4 indices: pairs (v0,v1), (v2,v3), (v4,v5), (v6,v7)
+                    __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
+                    a_tmp += inca;
+                    if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { // unit kappa: no multiplication needed
+                        if (conja == BLIS_CONJUGATE) { // negate the second field of each pair
+                            __asm__("vfneg.v v1, v1");
+                            __asm__("vfneg.v v3, v3");
+                            __asm__("vfneg.v v5, v5");
+                            __asm__("vfneg.v v7, v7");
+                        }
+                        __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); // 4 complex values per n entry, n entries ldp elements apart
+                    }
+                    else {
+                        if (conja == BLIS_NO_CONJUGATE) { // vcmul_vf2 is defined outside this view, presumably (dst_re, dst_im) = (src_re, src_im) * kappa
+                            vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+                            vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+                            vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+                            vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
+                        }
+                        else { // vcmul_vf_conj2 is defined outside this view, presumably the conjugating variant
+                            vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+                            vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+                            vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+                            vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
+                        }
+                        __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); // scaled results live in v8..v15
+                    }
+                    p_tmp += 4;
+                    cdim_tmp -= 4;
+                }
+                if (cdim_tmp > 0) { // 1..3 leftover indices
+                    a_tmp += (cdim_tmp - 1) * inca; // point at the last leftover index; the loads below walk backwards
+                    switch (cdim_tmp) { // no breaks: loads index cdim_tmp-1 down to 0
+                        case 3:
+                            __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+                            a_tmp -= inca;
+                        case 2:
+                            __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+                            a_tmp -= inca;
+                        case 1:
+                            __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+                    }
+                    if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+                        if (conja == BLIS_CONJUGATE) {
+                            switch (cdim_tmp) { // no breaks: negate the second field of every loaded pair
+                                case 3: __asm__("vfneg.v v5, v5");
+                                case 2: __asm__("vfneg.v v3, v3");
+                                case 1: __asm__("vfneg.v v1, v1");
+                            }
+                        }
+                        switch (cdim_tmp) { // exactly one store, with 2 fields per leftover index
+                            case 3:
+                                __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                                break;
+                            case 2:
+                                __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                                break;
+                            case 1:
+                                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                                break;
+                        }
+                    }
+                    else {
+                        if (conja == BLIS_NO_CONJUGATE) {
+                            switch (cdim_tmp) { // no breaks: scale every loaded pair into v8..v13
+                                case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+                                case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+                                case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+                            }
+                        }
+                        else {
+                            switch (cdim_tmp) { // no breaks: conjugating variant
+                                case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+                                case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+                                case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+                            }
+                        }
+                        switch (cdim_tmp) { // exactly one store, with 2 fields per leftover index
+                            case 3:
+                                __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                                break;
+                            case 2:
+                                __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                                break;
+                            case 1:
+                                __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+                                break;
+                        }
+                    }
+                    p_tmp += cdim_tmp;
+                }
+                __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); // hardware vl = number of padding indices; the C variable vl is unchanged
+                for (size_t i = 0; i < vl; ++i) { // for each n entry of this strip, zero positions cdim..NR-1 of p
+                    __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp));
+                    p_tmp += ldp;
+                }
+                a += vl; // lda == 1, so advancing vl entries along n is vl elements
+                p += vl * ldp;
+                avl -= vl;
+            }
+            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+            for (size_t i = n; i < n_max; ++i) { // zero-fill the padding groups n..n_max-1 from the zero groups at v16/v18
+                __asm__(VSSEG2 "v16, (%0)" : : "r"(p));
+                p += ldp;
+            }
+        }
+        else { // lda != 1: handle one n index per iteration, vectorizing along cdim with LMUL=2
+            inca *= 2 * FLT_SIZE; // from here on inca is a byte stride (it feeds the strided load below)
+            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0"); // pre-zero the load targets (groups at v0, v2) and the scaled results (groups at v4, v6)
+            __asm__("vmv.v.i v2, 0");
+            __asm__("vmv.v.i v4, 0");
+            __asm__("vmv.v.i v6, 0");
+            for (size_t i = 0; i < n; ++i) {
+                __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); // tail-undisturbed: elements from cdim upward keep their zeros
+                if (inca == 2 * FLT_SIZE) { // elements along cdim are contiguous: unit-stride load
+                    __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
+                }
+                else {
+                    __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
+                }
+                if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+                    if (conja == BLIS_CONJUGATE) {
+                        __asm__("vfneg.v v2, v2"); // with LMUL=2 the second field lives in the group starting at v2
+                    }
+                    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); // store with vl requested as NR, padding included
+                    __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+                }
+                else {
+                    if (conja == BLIS_NO_CONJUGATE) {
+                        vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag); // NOTE(review): padding in v4/v6 stays zero only if the macro writes just the first vl elements - confirm
+                    }
+                    else {
+                        vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
+                    }
+                    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); // store with vl requested as NR, padding included
+                    __asm__(VSSEG2 "v4, (%0)" : : "r"(p));
+                }
+                a += lda;
+                p += ldp;
+            }
+            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+            __asm__("vmv.v.i v0, 0"); // v0/v2 hold loaded data at this point; clear them for the padding stores
+            __asm__("vmv.v.i v2, 0");
+            for (size_t i = n; i < n_max; ++i) { // zero-fill the padding groups n..n_max-1
+                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+                p += ldp;
+            }
+        }
+    }
+    // generic kernel: every other cdim_max / cdim_bcast combination is delegated unchanged
+    else
+    {
+        bli_zzpackm_sifive_x280_ref
+        (
+          conja,
+          schema,
+          cdim,
+          cdim_max,
+          cdim_bcast,
+          n,
+          n_max,
+          kappa,
+          a, inca, lda,
+          p,       ldp,
+          params,
+          cntx
+        );
+    }
+}
diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c
deleted file mode 100644
index 35ca23677..000000000
--- a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c
+++ /dev/null
@@ -1,678 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include "../riscv_cmul_macros_asm.h"
-#include <math.h>
-#include <riscv_vector.h>
-#include <stdbool.h>
-#include <stddef.h>
-
-#define FLT_SIZE 4
-#define VLE "vle32.v "
-#define VLSE "vlse32.v "
-#define VSE "vse32.v "
-#define VSSSEG7 "vssseg7e32.v "
-
-void bli_spackm_sifive_x280_asm_7xk
-     (
-             conj_t           conja,
-             pack_t           schema,
-             dim_t            cdim,
-             dim_t            n,
-             dim_t            n_max,
-       const void*   restrict kappa_,
-       const void*   restrict a_, inc_t inca, inc_t lda,
-             void*   restrict p_,             inc_t ldp,
-       const cntx_t*          cntx
-     )
-{
-    (void) conja;
-    (void) cntx;
-    const float* kappa = kappa_;
-    const float* a = a_;
-    float* p = p_;
-
-    float kappa_cast = *kappa;
-    if (lda == 1) {
-        __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
-        switch (cdim) {
-            case 0: __asm__("vmv.v.i v0, 0");
-            case 1: __asm__("vmv.v.i v1, 0");
-            case 2: __asm__("vmv.v.i v2, 0");
-            case 3: __asm__("vmv.v.i v3, 0");
-            case 4: __asm__("vmv.v.i v4, 0");
-            case 5: __asm__("vmv.v.i v5, 0");
-            case 6: __asm__("vmv.v.i v6, 0");
-        }
-        a += (cdim - 1) * inca;
-        size_t avl = n;
-        while (avl) {
-            const float* a_tmp = a;
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            switch (cdim) {
-                case 7:
-                    __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 6:
-                    __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 5:
-                    __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 4:
-                    __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 3:
-                    __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 2:
-                    __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 1:
-                    __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
-            }
-            if (kappa_cast != 1.f) {
-                switch (cdim) {
-                    case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
-                    case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
-                    case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
-                    case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
-                    case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
-                    case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
-                    case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-                }
-            }
-            __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp));
-            a += vl;
-            p += vl * ldp;
-            avl -= vl;
-        }
-        __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        for (size_t i = n; i < n_max; ++i) {
-            __asm__(VSE "v0, (%0)" : : "r"(p));
-            p += ldp;  
-        }
-    }
-    else {
-        inca *= FLT_SIZE;
-        __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        for (size_t i = 0; i < n; ++i) {
-            __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
-            if (inca == FLT_SIZE) {
-                __asm__(VLE "v0, (%0)" : : "r"(a));
-            }
-            else {
-                __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
-            }
-            if (kappa_cast != 1.f) {
-                __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-            }
-            __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            __asm__(VSE "v0, (%0)" : : "r"(p));
-            a += lda;
-            p += ldp;
-        }
-        __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        for (size_t i = n; i < n_max; ++i) {
-            __asm__(VSE "v0, (%0)" : : "r"(p));
-            p += ldp;  
-        }
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSSEG7
-
-#define FLT_SIZE 8
-#define VLE "vle64.v "
-#define VLSE "vlse64.v "
-#define VSE "vse64.v "
-#define VSSSEG7 "vssseg7e64.v "
-
-void bli_dpackm_sifive_x280_asm_7xk
-     (
-             conj_t           conja,
-             pack_t           schema,
-             dim_t            cdim,
-             dim_t            n,
-             dim_t            n_max,
-       const void*   restrict kappa_,
-       const void*   restrict a_, inc_t inca, inc_t lda,
-             void*   restrict p_,             inc_t ldp,
-       const cntx_t*          cntx
-     )
-{
-    (void) conja;
-    (void) cntx;
-    const double* kappa = kappa_;
-    const double* a = a_;
-    double* p = p_;
-
-    double kappa_cast = *kappa;
-    if (lda == 1) {
-        __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
-        switch (cdim) {
-            case 0: __asm__("vmv.v.i v0, 0");
-            case 1: __asm__("vmv.v.i v1, 0");
-            case 2: __asm__("vmv.v.i v2, 0");
-            case 3: __asm__("vmv.v.i v3, 0");
-            case 4: __asm__("vmv.v.i v4, 0");
-            case 5: __asm__("vmv.v.i v5, 0");
-            case 6: __asm__("vmv.v.i v6, 0");
-        }
-        a += (cdim - 1) * inca;
-        size_t avl = n;
-        while (avl) {
-            const double* a_tmp = a;
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            switch (cdim) {
-                case 7:
-                    __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 6:
-                    __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 5:
-                    __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 4:
-                    __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 3:
-                    __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 2:
-                    __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 1:
-                    __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
-            }
-            if (kappa_cast != 1.) {
-                switch (cdim) {
-                    case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
-                    case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
-                    case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
-                    case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
-                    case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
-                    case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
-                    case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-                }
-            }
-            __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp));
-            a += vl;
-            p += vl * ldp;
-            avl -= vl;
-        }
-        __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        for (size_t i = n; i < n_max; ++i) {
-            __asm__(VSE "v0, (%0)" : : "r"(p));
-            p += ldp;  
-        }
-    }
-    else {
-        inca *= FLT_SIZE;
-        __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        for (size_t i = 0; i < n; ++i) {
-            __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
-            if (inca == FLT_SIZE) {
-                __asm__(VLE "v0, (%0)" : : "r"(a));
-            }
-            else {
-                __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
-            }
-            if (kappa_cast != 1.) {
-                __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-            }
-            __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            __asm__(VSE "v0, (%0)" : : "r"(p));
-            a += lda;
-            p += ldp;
-        }
-        __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        for (size_t i = n; i < n_max; ++i) {
-            __asm__(VSE "v0, (%0)" : : "r"(p));
-            p += ldp;  
-        }
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSSEG7
-
-#define FLT_SIZE 4
-#define VLSEG2 "vlseg2e32.v "
-#define VLSSEG2 "vlsseg2e32.v "
-#define VSSEG2 "vsseg2e32.v "
-#define VSSSEG6 "vssseg6e32.v "
-
-void bli_cpackm_sifive_x280_asm_6xk
-     (
-             conj_t           conja,
-             pack_t           schema,
-             dim_t            cdim,
-             dim_t            n,
-             dim_t            n_max,
-       const void*   restrict kappa_,
-       const void*   restrict a_, inc_t inca, inc_t lda,
-             void*   restrict p_,             inc_t ldp,
-       const cntx_t*          cntx
-     )
-{
-    (void) cntx;
-    const scomplex* kappa = kappa_;
-    const scomplex* a = a_;
-    scomplex* p = p_;
-
-    scomplex kappa_cast = *kappa;
-    if (lda == 1) {
-        __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
-        if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
-            switch (cdim) {
-                case 0:
-                    __asm__("vmv.v.i v0, 0");
-                    __asm__("vmv.v.i v1, 0");
-                case 1:
-                    __asm__("vmv.v.i v2, 0");
-                    __asm__("vmv.v.i v3, 0");
-                case 2:
-                    __asm__("vmv.v.i v4, 0");
-                    __asm__("vmv.v.i v5, 0");
-                case 3:
-                    __asm__("vmv.v.i v6, 0");
-                    __asm__("vmv.v.i v7, 0");
-                case 4:
-                    __asm__("vmv.v.i v8, 0");
-                    __asm__("vmv.v.i v9, 0");
-                case 5:
-                    __asm__("vmv.v.i v10, 0");
-                    __asm__("vmv.v.i v11, 0");
-            }
-        }
-        else {
-            switch (cdim) {
-                case 0:
-                    __asm__("vmv.v.i v12, 0");
-                    __asm__("vmv.v.i v13, 0");
-                case 1:
-                    __asm__("vmv.v.i v14, 0");
-                    __asm__("vmv.v.i v15, 0");
-                case 2:
-                    __asm__("vmv.v.i v16, 0");
-                    __asm__("vmv.v.i v17, 0");
-                case 3:
-                    __asm__("vmv.v.i v18, 0");
-                    __asm__("vmv.v.i v19, 0");
-                case 4:
-                    __asm__("vmv.v.i v20, 0");
-                    __asm__("vmv.v.i v21, 0");
-                case 5:
-                    __asm__("vmv.v.i v22, 0");
-                    __asm__("vmv.v.i v23, 0");
-            }
-        }
-        a += (cdim - 1) * inca;
-        size_t avl = n;
-        while (avl) {
-            const scomplex* a_tmp = a;
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            switch (cdim) {
-                case 6:
-                    __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 5:
-                    __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 4:
-                    __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 3:
-                    __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 2:
-                    __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 1:
-                    __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
-            }
-            if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
-                if (conja == BLIS_CONJUGATE) {
-                    switch (cdim) {
-                        case 6: __asm__("vfneg.v v11, v11");
-                        case 5: __asm__("vfneg.v v9, v9");
-                        case 4: __asm__("vfneg.v v7, v7");
-                        case 3: __asm__("vfneg.v v5, v5");
-                        case 2: __asm__("vfneg.v v3, v3");
-                        case 1: __asm__("vfneg.v v1, v1");
-                    }
-                }
-                __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
-                __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
-            }
-            else {
-                if (conja == BLIS_NO_CONJUGATE) {
-                    switch (cdim) {
-                        case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
-                        case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
-                        case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
-                        case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
-                        case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
-                        case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
-                    }
-                }
-                else {
-                    switch (cdim) {
-                        case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
-                        case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
-                        case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
-                        case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
-                        case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
-                        case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
-                    }
-                }
-                __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
-                __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
-            }
-            a += vl;
-            p += vl * ldp;
-            avl -= vl;
-        }
-        __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        __asm__("vmv.v.i v1, 0");
-        for (size_t i = n; i < n_max; ++i) {
-            __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-            p += ldp;  
-        }
-    }
-    else {
-        inca *= 2 * FLT_SIZE;
-        __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        __asm__("vmv.v.i v1, 0");
-        __asm__("vmv.v.i v2, 0");
-        __asm__("vmv.v.i v3, 0");
-        for (size_t i = 0; i < n; ++i) {
-            __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
-            if (inca == 2 * FLT_SIZE) {
-                __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
-            }
-            else {
-                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
-            }
-            if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
-                if (conja == BLIS_CONJUGATE) {
-                    __asm__("vfneg.v v1, v1");
-                }
-                __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-            }
-            else {
-                if (conja == BLIS_NO_CONJUGATE) {
-                    vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
-                }
-                else {
-                    vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
-                }
-                __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-                __asm__(VSSEG2 "v2, (%0)" : : "r"(p));
-            }
-            a += lda;
-            p += ldp;
-        }
-        __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        __asm__("vmv.v.i v1, 0");
-        for (size_t i = n; i < n_max; ++i) {
-            __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-            p += ldp;  
-        }
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef VLSEG2
-#undef VLSSEG2
-#undef VSSEG2
-#undef VSSSEG6
-
-#define FLT_SIZE 8
-#define VLSEG2 "vlseg2e64.v "
-#define VLSSEG2 "vlsseg2e64.v "
-#define VSSEG2 "vsseg2e64.v "
-#define VSSSEG6 "vssseg6e64.v "
-
-void bli_zpackm_sifive_x280_asm_6xk
-     (
-             conj_t           conja,
-             pack_t           schema,
-             dim_t            cdim,
-             dim_t            n,
-             dim_t            n_max,
-       const void*   restrict kappa_,
-       const void*   restrict a_, inc_t inca, inc_t lda,
-             void*   restrict p_,             inc_t ldp,
-       const cntx_t*          cntx
-     )
-{
-    (void) cntx;
-    const dcomplex* kappa = kappa_;
-    const dcomplex* a = a_;
-    dcomplex* p = p_;
-
-    dcomplex kappa_cast = *kappa;
-    if (lda == 1) {
-        __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
-        if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
-            switch (cdim) {
-                case 0:
-                    __asm__("vmv.v.i v0, 0");
-                    __asm__("vmv.v.i v1, 0");
-                case 1:
-                    __asm__("vmv.v.i v2, 0");
-                    __asm__("vmv.v.i v3, 0");
-                case 2:
-                    __asm__("vmv.v.i v4, 0");
-                    __asm__("vmv.v.i v5, 0");
-                case 3:
-                    __asm__("vmv.v.i v6, 0");
-                    __asm__("vmv.v.i v7, 0");
-                case 4:
-                    __asm__("vmv.v.i v8, 0");
-                    __asm__("vmv.v.i v9, 0");
-                case 5:
-                    __asm__("vmv.v.i v10, 0");
-                    __asm__("vmv.v.i v11, 0");
-            }
-        }
-        else {
-            switch (cdim) {
-                case 0:
-                    __asm__("vmv.v.i v12, 0");
-                    __asm__("vmv.v.i v13, 0");
-                case 1:
-                    __asm__("vmv.v.i v14, 0");
-                    __asm__("vmv.v.i v15, 0");
-                case 2:
-                    __asm__("vmv.v.i v16, 0");
-                    __asm__("vmv.v.i v17, 0");
-                case 3:
-                    __asm__("vmv.v.i v18, 0");
-                    __asm__("vmv.v.i v19, 0");
-                case 4:
-                    __asm__("vmv.v.i v20, 0");
-                    __asm__("vmv.v.i v21, 0");
-                case 5:
-                    __asm__("vmv.v.i v22, 0");
-                    __asm__("vmv.v.i v23, 0");
-            }
-        }
-        a += (cdim - 1) * inca;
-        size_t avl = n;
-        while (avl) {
-            const dcomplex* a_tmp = a;
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            switch (cdim) {
-                case 6:
-                    __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 5:
-                    __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 4:
-                    __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 3:
-                    __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 2:
-                    __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
-                    a_tmp -= inca;
-                case 1:
-                    __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
-            }
-            if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
-                if (conja == BLIS_CONJUGATE) {
-                    switch (cdim) {
-                        case 6: __asm__("vfneg.v v11, v11");
-                        case 5: __asm__("vfneg.v v9, v9");
-                        case 4: __asm__("vfneg.v v7, v7");
-                        case 3: __asm__("vfneg.v v5, v5");
-                        case 2: __asm__("vfneg.v v3, v3");
-                        case 1: __asm__("vfneg.v v1, v1");
-                    }
-                }
-                __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
-                __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
-            }
-            else {
-                if (conja == BLIS_NO_CONJUGATE) {
-                    switch (cdim) {
-                        case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
-                        case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
-                        case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
-                        case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
-                        case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
-                        case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
-                    }
-                }
-                else {
-                    switch (cdim) {
-                        case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
-                        case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
-                        case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
-                        case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
-                        case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
-                        case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
-                    }
-                }
-                __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
-                __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
-            }
-            a += vl;
-            p += vl * ldp;
-            avl -= vl;
-        }
-        __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        __asm__("vmv.v.i v1, 0");
-        for (size_t i = n; i < n_max; ++i) {
-            __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-            p += ldp;  
-        }
-    }
-    else {
-        inca *= 2 * FLT_SIZE;
-        __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        __asm__("vmv.v.i v1, 0");
-        __asm__("vmv.v.i v2, 0");
-        __asm__("vmv.v.i v3, 0");
-        for (size_t i = 0; i < n; ++i) {
-            __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
-            if (inca == 2 * FLT_SIZE) {
-                __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
-            }
-            else {
-                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
-            }
-            if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
-                if (conja == BLIS_CONJUGATE) {
-                    __asm__("vfneg.v v1, v1");
-                }
-                __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-            }
-            else {
-                if (conja == BLIS_NO_CONJUGATE) {
-                    vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
-                }
-                else {
-                    vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
-                }
-                __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-                __asm__(VSSEG2 "v2, (%0)" : : "r"(p));
-            }
-            a += lda;
-            p += ldp;
-        }
-        __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        __asm__("vmv.v.i v1, 0");
-        for (size_t i = n; i < n_max; ++i) {
-            __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-            p += ldp;  
-        }
-    }
-    return;
-}
diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c
deleted file mode 100644
index 89e05ecae..000000000
--- a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c
+++ /dev/null
@@ -1,838 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include "../riscv_cmul_macros_asm.h"
-#include <math.h>
-#include <riscv_vector.h>
-#include <stdbool.h>
-#include <stddef.h>
-
-#define FLT_SIZE 4
-#define FLT_LOAD "flw "
-#define VLE "vle32.v "
-#define VLSE "vlse32.v "
-#define VSE "vse32.v "
-#define VSSE "vsse32.v "
-#define VSSSEG8 "vssseg8e32.v "
-#define VSSSEG7 "vssseg7e32.v "
-#define VSSSEG6 "vssseg6e32.v "
-#define VSSSEG5 "vssseg5e32.v "
-#define VSSSEG4 "vssseg4e32.v "
-#define VSSSEG3 "vssseg3e32.v "
-#define VSSSEG2 "vssseg2e32.v "
-#define NR 64
-
-void bli_spackm_sifive_x280_asm_64xk
-     (
-             conj_t           conja,
-             pack_t           schema,
-             dim_t            cdim,
-             dim_t            n,
-             dim_t            n_max,
-       const void*   restrict kappa_,
-       const void*   restrict a_, inc_t inca, inc_t lda,
-             void*   restrict p_,             inc_t ldp,
-       const cntx_t*          cntx
-     )
-{
-    (void) conja;
-    (void) cntx;
-    const float* kappa = kappa_;
-    const float* a = a_;
-    float* p = p_;
-
-    float kappa_cast = *kappa;
-    if (lda == 1) {
-        __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v8, 0");
-        size_t avl = n;
-        while (avl) {
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            dim_t cdim_tmp = cdim;
-            const float* a_tmp = a;
-            float* p_tmp = p;
-            while (cdim_tmp >= 8) {
-                __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLE "v7, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                if (kappa_cast != 1.f) {
-                    __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-                    __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
-                    __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
-                    __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
-                    __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
-                    __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
-                    __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
-                    __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast));
-                }
-                __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                p_tmp += 8;
-                cdim_tmp -= 8;
-            }
-            if (cdim_tmp > 0) {
-                a_tmp += (cdim_tmp - 1) * inca;
-                switch (cdim_tmp) {
-                    case 7:
-                        __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 6:
-                        __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 5:
-                        __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 4:
-                        __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 3:
-                        __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 2:
-                        __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 1:
-                        __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
-                }
-                if (kappa_cast != 1.f) {
-                    switch (cdim_tmp) {
-                        case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
-                        case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
-                        case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
-                        case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
-                        case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
-                        case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
-                        case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-                    }
-                }
-                switch (cdim_tmp) {
-                    case 7:
-                        __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                        break;
-                    case 6:
-                        __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                        break;
-                    case 5:
-                        __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                        break;
-                    case 4:
-                        __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                        break;
-                    case 3:
-                        __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                        break;
-                    case 2:
-                        __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                        break;
-                    case 1:
-                        __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                        break;
-                }
-                p_tmp += cdim_tmp;
-            }
-            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
-            for (size_t i = 0; i < vl; ++i) {
-                __asm__(VSE "v8, (%0)" : : "r"(p_tmp));
-                p_tmp += ldp;
-            }
-            a += vl;
-            p += vl * ldp;
-            avl -= vl;
-        }
-        __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-        for (size_t i = n; i < n_max; ++i) {
-            __asm__(VSE "v8, (%0)" : : "r"(p));
-            p += ldp;
-        }
-    }
-    else {
-        inca *= FLT_SIZE;
-        __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        for (size_t i = 0; i < n; ++i) {
-            __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
-            if (inca == FLT_SIZE) {
-                __asm__(VLE "v0, (%0)" : : "r"(a));
-            }
-            else {
-                __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
-            }
-            if (kappa_cast != 1.f) {
-                __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-            }
-            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            __asm__(VSE "v0, (%0)" : : "r"(p));
-            a += lda;
-            p += ldp;
-        }
-        __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        for (size_t i = n; i < n_max; ++i) {
-            __asm__(VSE "v0, (%0)" : : "r"(p));
-            p += ldp;  
-        }
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-#undef VSSSEG8
-#undef VSSSEG7
-#undef VSSSEG6
-#undef VSSSEG5
-#undef VSSSEG4
-#undef VSSSEG3
-#undef VSSSEG2
-#undef NR
-
-#define FLT_SIZE 8
-#define FLT_LOAD "fld "
-#define VLE "vle64.v "
-#define VLSE "vlse64.v "
-#define VSE "vse64.v "
-#define VSSE "vsse64.v "
-#define VSSSEG8 "vssseg8e64.v "
-#define VSSSEG7 "vssseg7e64.v "
-#define VSSSEG6 "vssseg6e64.v "
-#define VSSSEG5 "vssseg5e64.v "
-#define VSSSEG4 "vssseg4e64.v "
-#define VSSSEG3 "vssseg3e64.v "
-#define VSSSEG2 "vssseg2e64.v "
-#define NR 32
-
-void bli_dpackm_sifive_x280_asm_32xk
-     (
-             conj_t           conja,
-             pack_t           schema,
-             dim_t            cdim,
-             dim_t            n,
-             dim_t            n_max,
-       const void*   restrict kappa_,
-       const void*   restrict a_, inc_t inca, inc_t lda,
-             void*   restrict p_,             inc_t ldp,
-       const cntx_t*          cntx
-     )
-{
-    (void) conja;
-    (void) cntx;
-    const double* kappa = kappa_;
-    const double* a = a_;
-    double* p = p_;
-
-    double kappa_cast = *kappa;
-    if (lda == 1) {
-        __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v8, 0");
-        size_t avl = n;
-        while (avl) {
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            dim_t cdim_tmp = cdim;
-            const double* a_tmp = a;
-            double* p_tmp = p;
-            while (cdim_tmp >= 8) {
-                __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLE "v7, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                if (kappa_cast != 1.) {
-                    __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-                    __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
-                    __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
-                    __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
-                    __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
-                    __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
-                    __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
-                    __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast));
-                }
-                __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                p_tmp += 8;
-                cdim_tmp -= 8;
-            }
-            if (cdim_tmp > 0) {
-                a_tmp += (cdim_tmp - 1) * inca;
-                switch (cdim_tmp) {
-                    case 7:
-                        __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 6:
-                        __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 5:
-                        __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 4:
-                        __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 3:
-                        __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 2:
-                        __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 1:
-                        __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
-                }
-                if (kappa_cast != 1.) {
-                    switch (cdim_tmp) {
-                        case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
-                        case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
-                        case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
-                        case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
-                        case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
-                        case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
-                        case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-                    }
-                }
-                switch (cdim_tmp) {
-                    case 7:
-                        __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                        break;
-                    case 6:
-                        __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                        break;
-                    case 5:
-                        __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                        break;
-                    case 4:
-                        __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                        break;
-                    case 3:
-                        __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                        break;
-                    case 2:
-                        __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                        break;
-                    case 1:
-                        __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                        break;
-                }
-                p_tmp += cdim_tmp;
-            }
-            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
-            for (size_t i = 0; i < vl; ++i) {
-                __asm__(VSE "v8, (%0)" : : "r"(p_tmp));
-                p_tmp += ldp;
-            }
-            a += vl;
-            p += vl * ldp;
-            avl -= vl;
-        }
-        __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-        for (size_t i = n; i < n_max; ++i) {
-            __asm__(VSE "v8, (%0)" : : "r"(p));
-            p += ldp;
-        }
-    }
-    else {
-        inca *= FLT_SIZE;
-        __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        for (size_t i = 0; i < n; ++i) {
-            __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
-            if (inca == FLT_SIZE) {
-                __asm__(VLE "v0, (%0)" : : "r"(a));
-            }
-            else {
-                __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
-            }
-            if (kappa_cast != 1.) {
-                __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-            }
-            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            __asm__(VSE "v0, (%0)" : : "r"(p));
-            a += lda;
-            p += ldp;
-        }
-        __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        for (size_t i = n; i < n_max; ++i) {
-            __asm__(VSE "v0, (%0)" : : "r"(p));
-            p += ldp;  
-        }
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-#undef VSSSEG8
-#undef VSSSEG7
-#undef VSSSEG6
-#undef VSSSEG5
-#undef VSSSEG4
-#undef VSSSEG3
-#undef VSSSEG2
-#undef NR
-
-#define FLT_SIZE 4
-#define VLSEG2 "vlseg2e32.v "
-#define VLSSEG2 "vlsseg2e32.v "
-#define VSSEG2 "vsseg2e32.v "
-#define VSSSEG2 "vssseg2e32.v "
-#define VSSSEG4 "vssseg4e32.v "
-#define VSSSEG6 "vssseg6e32.v "
-#define VSSSEG8 "vssseg8e32.v "
-#define NR 32 
-
-void bli_cpackm_sifive_x280_asm_32xk
-     (
-             conj_t           conja,
-             pack_t           schema,
-             dim_t            cdim,
-             dim_t            n,
-             dim_t            n_max,
-       const void*   restrict kappa_,
-       const void*   restrict a_, inc_t inca, inc_t lda,
-             void*   restrict p_,             inc_t ldp,
-       const cntx_t*          cntx
-     )
-{
-    (void) cntx;
-    const scomplex* kappa = kappa_;
-    const scomplex* a = a_;
-    scomplex* p = p_;
-
-    scomplex kappa_cast = *kappa;
-    if (lda == 1) {
-        __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v16, 0");
-        __asm__("vmv.v.i v18, 0");
-        size_t avl = n;
-        while (avl) {
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            dim_t cdim_tmp = cdim;
-            const scomplex* a_tmp = a;
-            scomplex* p_tmp = p;
-            while (cdim_tmp >= 4) {
-                __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
-                    if (conja == BLIS_CONJUGATE) {
-                        __asm__("vfneg.v v1, v1");
-                        __asm__("vfneg.v v3, v3");
-                        __asm__("vfneg.v v5, v5");
-                        __asm__("vfneg.v v7, v7");
-                    }
-                    __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                }
-                else {
-                    if (conja == BLIS_NO_CONJUGATE) {
-                        vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
-                        vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
-                        vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
-                        vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
-                    }
-                    else {
-                        vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
-                        vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
-                        vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
-                        vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
-                    }
-                    __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                }
-                p_tmp += 4;
-                cdim_tmp -= 4;
-            }
-            if (cdim_tmp > 0) {
-                a_tmp += (cdim_tmp - 1) * inca;
-                switch (cdim_tmp) {
-                    case 3:
-                        __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 2:
-                        __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 1:
-                        __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
-                }
-                if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
-                    if (conja == BLIS_CONJUGATE) {
-                        switch (cdim_tmp) {
-                            case 3: __asm__("vfneg.v v5, v5");
-                            case 2: __asm__("vfneg.v v3, v3");
-                            case 1: __asm__("vfneg.v v1, v1");
-                        }
-                    }
-                    switch (cdim_tmp) {
-                        case 3:
-                            __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                            break;
-                        case 2:
-                            __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                            break;
-                        case 1:
-                            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                            break;
-                    }
-                }
-                else {
-                    if (conja == BLIS_NO_CONJUGATE) {
-                        switch (cdim_tmp) {
-                            case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
-                            case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
-                            case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
-                        }
-                    }
-                    else {
-                        switch (cdim_tmp) {
-                            case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
-                            case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
-                            case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
-                        }
-                    }
-                    switch (cdim_tmp) {
-                        case 3:
-                            __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                            break;
-                        case 2:
-                            __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                            break;
-                        case 1:
-                            __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                            break;
-                    }
-                }
-                p_tmp += cdim_tmp;
-            }
-            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
-            for (size_t i = 0; i < vl; ++i) {
-                __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp));
-                p_tmp += ldp;
-            }
-            a += vl;
-            p += vl * ldp;
-            avl -= vl;
-        }
-        __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-        for (size_t i = n; i < n_max; ++i) {
-            __asm__(VSSEG2 "v16, (%0)" : : "r"(p));
-            p += ldp;
-        }
-    }
-    else {
-        inca *= 2 * FLT_SIZE;
-        __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        __asm__("vmv.v.i v2, 0");
-        __asm__("vmv.v.i v4, 0");
-        __asm__("vmv.v.i v6, 0");
-        for (size_t i = 0; i < n; ++i) {
-            __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
-            if (inca == 2 * FLT_SIZE) {
-                __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
-            }
-            else {
-                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
-            }
-            if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
-                if (conja == BLIS_CONJUGATE) {
-                    __asm__("vfneg.v v2, v2");
-                }
-                __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-            }
-            else {
-                if (conja == BLIS_NO_CONJUGATE) {
-                    vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
-                }
-                else {
-                    vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
-                }
-                __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-                __asm__(VSSEG2 "v4, (%0)" : : "r"(p));
-            }
-            a += lda;
-            p += ldp;
-        }
-        __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        __asm__("vmv.v.i v2, 0");
-        for (size_t i = n; i < n_max; ++i) {
-            __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-            p += ldp;  
-        }
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef VLSEG2
-#undef VLSSEG2
-#undef VSSEG2
-#undef VSSSEG2
-#undef VSSSEG4
-#undef VSSSEG6
-#undef VSSSEG8
-#undef NR
-
-#define FLT_SIZE 8
-#define VLSEG2 "vlseg2e64.v "
-#define VLSSEG2 "vlsseg2e64.v "
-#define VSSEG2 "vsseg2e64.v "
-#define VSSSEG2 "vssseg2e64.v "
-#define VSSSEG4 "vssseg4e64.v "
-#define VSSSEG6 "vssseg6e64.v "
-#define VSSSEG8 "vssseg8e64.v "
-#define NR 16
-
-void bli_zpackm_sifive_x280_asm_16xk
-     (
-             conj_t           conja,
-             pack_t           schema,
-             dim_t            cdim,
-             dim_t            n,
-             dim_t            n_max,
-       const void*   restrict kappa_,
-       const void*   restrict a_, inc_t inca, inc_t lda,
-             void*   restrict p_,             inc_t ldp,
-       const cntx_t*          cntx
-     )
-{
-    (void) cntx;
-    const dcomplex* kappa = kappa_;
-    const dcomplex* a = a_;
-    dcomplex* p = p_;
-
-    dcomplex kappa_cast = *kappa;
-    if (lda == 1) {
-        __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v16, 0");
-        __asm__("vmv.v.i v18, 0");
-        size_t avl = n;
-        while (avl) {
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            dim_t cdim_tmp = cdim;
-            const dcomplex* a_tmp = a;
-            dcomplex* p_tmp = p;
-            while (cdim_tmp >= 4) {
-                __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
-                a_tmp += inca;
-                if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
-                    if (conja == BLIS_CONJUGATE) {
-                        __asm__("vfneg.v v1, v1");
-                        __asm__("vfneg.v v3, v3");
-                        __asm__("vfneg.v v5, v5");
-                        __asm__("vfneg.v v7, v7");
-                    }
-                    __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                }
-                else {
-                    if (conja == BLIS_NO_CONJUGATE) {
-                        vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
-                        vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
-                        vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
-                        vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
-                    }
-                    else {
-                        vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
-                        vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
-                        vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
-                        vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
-                    }
-                    __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                }
-                p_tmp += 4;
-                cdim_tmp -= 4;
-            }
-            if (cdim_tmp > 0) {
-                a_tmp += (cdim_tmp - 1) * inca;
-                switch (cdim_tmp) {
-                    case 3:
-                        __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 2:
-                        __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 1:
-                        __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
-                }
-                if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
-                    if (conja == BLIS_CONJUGATE) {
-                        switch (cdim_tmp) {
-                            case 3: __asm__("vfneg.v v5, v5");
-                            case 2: __asm__("vfneg.v v3, v3");
-                            case 1: __asm__("vfneg.v v1, v1");
-                        }
-                    }
-                    switch (cdim_tmp) {
-                        case 3:
-                            __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                            break;
-                        case 2:
-                            __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                            break;
-                        case 1:
-                            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                            break;
-                    }
-                }
-                else {
-                    if (conja == BLIS_NO_CONJUGATE) {
-                        switch (cdim_tmp) {
-                            case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
-                            case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
-                            case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
-                        }
-                    }
-                    else {
-                        switch (cdim_tmp) {
-                            case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
-                            case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
-                            case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
-                        }
-                    }
-                    switch (cdim_tmp) {
-                        case 3:
-                            __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                            break;
-                        case 2:
-                            __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                            break;
-                        case 1:
-                            __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                            break;
-                    }
-                }
-                p_tmp += cdim_tmp;
-            }
-            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
-            for (size_t i = 0; i < vl; ++i) {
-                __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp));
-                p_tmp += ldp;
-            }
-            a += vl;
-            p += vl * ldp;
-            avl -= vl;
-        }
-        __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-        for (size_t i = n; i < n_max; ++i) {
-            __asm__(VSSEG2 "v16, (%0)" : : "r"(p));
-            p += ldp;
-        }
-    }
-    else {
-        inca *= 2 * FLT_SIZE;
-        __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        __asm__("vmv.v.i v2, 0");
-        __asm__("vmv.v.i v4, 0");
-        __asm__("vmv.v.i v6, 0");
-        for (size_t i = 0; i < n; ++i) {
-            __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
-            if (inca == 2 * FLT_SIZE) {
-                __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
-            }
-            else {
-                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
-            }
-            if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
-                if (conja == BLIS_CONJUGATE) {
-                    __asm__("vfneg.v v2, v2");
-                }
-                __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-            }
-            else {
-                if (conja == BLIS_NO_CONJUGATE) {
-                    vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
-                }
-                else {
-                    vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
-                }
-                __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-                __asm__(VSSEG2 "v4, (%0)" : : "r"(p));
-            }
-            a += lda;
-            p += ldp;
-        }
-        __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-        __asm__("vmv.v.i v0, 0");
-        __asm__("vmv.v.i v2, 0");
-        for (size_t i = n; i < n_max; ++i) {
-            __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-            p += ldp;  
-        }
-    }
-    return;
-}
diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c
index b9715988d..f4a5a26ca 100644
--- a/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c
+++ b/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c
@@ -35,6 +35,7 @@
 // clang-format off
 #include "blis.h"
 #include "../riscv_cmul_macros_asm.h"
+#include "../bli_kernels_sifive_x280.h"
 #include <math.h>
 #include <stddef.h>
 #include <stdbool.h>
@@ -120,9 +121,9 @@ void bli_sgemm_7m4
 
     rsc *= FLT_SIZE;
     csc *= FLT_SIZE;
-    
+
     __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
-    
+
     // compute alpha*a*b + beta*c
     if (*beta == 0.f) {
         __asm__("vfmul.vf v0, v0, ft10");
@@ -313,9 +314,9 @@ void bli_sgemm_7m4_cleanup
     c += (M - 1) * rsc;
     rsc *= FLT_SIZE;
     csc *= FLT_SIZE;
-     
+
     __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
-    
+
     // compute alpha*a*b + beta*c
     if (*beta == 0.f) {
         switch (M) {
@@ -513,7 +514,7 @@ void bli_sgemm_7m4_k0
                 __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
             case 1:
                 __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-            }        
+            }
         } // end c non-unit column stride
     } // end beta == 0.f
     else { // beta != 0.f
@@ -554,7 +555,7 @@ void bli_sgemm_7m4_k0
                 __asm__(VLE "v0, (%0)" : : "r"(c));
                 __asm__("vfmul.vf v0, v0, ft0");
                 __asm__(VSE "v0, (%0)" : : "r"(c));
-                
+
             }
         } // end c unit column stride
         else { // c non-unit column stride
@@ -609,7 +610,7 @@ void bli_sgemm_sifive_x280_asm_7m4
        const void*      restrict b_,
        const void*      restrict beta_,
              void*      restrict c_, inc_t rsc, inc_t csc,
-             auxinfo_t* restrict data,
+       const auxinfo_t* restrict data,
        const cntx_t*    restrict cntx
      )
 {
@@ -650,7 +651,7 @@ void bli_sgemm_sifive_x280_asm_7m4
 #define VSE "vse64.v "
 #define VSSE "vsse64.v "
 #define PACKMR 8
-#define PACKNR 32 
+#define PACKNR 32
 
 void bli_dgemm_7m4
      (
@@ -722,9 +723,9 @@ void bli_dgemm_7m4
 
     rsc *= FLT_SIZE;
     csc *= FLT_SIZE;
-    
+
     __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
-    
+
     // compute alpha*a*b + beta*c
     if (*beta == 0.) {
         __asm__("vfmul.vf v0, v0, ft10");
@@ -915,9 +916,9 @@ void bli_dgemm_7m4_cleanup
     c += (M - 1) * rsc;
     rsc *= FLT_SIZE;
     csc *= FLT_SIZE;
-     
+
     __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
-    
+
     // compute alpha*a*b + beta*c
     if (*beta == 0.) {
         switch (M) {
@@ -1115,7 +1116,7 @@ void bli_dgemm_7m4_k0
                 __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
             case 1:
                 __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-            }        
+            }
         } // end c non-unit column stride
     } // end beta == 0.
     else { // beta != 0.
@@ -1156,7 +1157,7 @@ void bli_dgemm_7m4_k0
                 __asm__(VLE "v0, (%0)" : : "r"(c));
                 __asm__("vfmul.vf v0, v0, ft0");
                 __asm__(VSE "v0, (%0)" : : "r"(c));
-                
+
             }
         } // end c unit column stride
         else { // c non-unit column stride
@@ -1211,7 +1212,7 @@ void bli_dgemm_sifive_x280_asm_7m4
        const void*      restrict b_,
        const void*      restrict beta_,
              void*      restrict c_, inc_t rsc, inc_t csc,
-             auxinfo_t* restrict data,
+       const auxinfo_t* restrict data,
        const cntx_t*    restrict cntx
      )
 {
@@ -1252,7 +1253,7 @@ void bli_dgemm_sifive_x280_asm_7m4
 #define VSSEG2 "vsseg2e32.v "
 #define VSSSEG2 "vssseg2e32.v "
 #define PACKMR 8
-#define PACKNR 32 
+#define PACKNR 32
 
 void bli_cgemm_6m2
      (
@@ -1384,9 +1385,9 @@ void bli_cgemm_6m2
     __asm__("vfmul.vf v30, v4, ft1");
 
     __asm__("vfmsub.vf v0, ft0, v24");
-    __asm__("vfmadd.vf v2, ft0, v26"); 
+    __asm__("vfmadd.vf v2, ft0, v26");
     __asm__("vfmsub.vf v4, ft0, v28");
-    __asm__("vfmadd.vf v6, ft0, v30"); 
+    __asm__("vfmadd.vf v6, ft0, v30");
 
     __asm__("vfmul.vf v24, v10, ft1");
     __asm__("vfmul.vf v26, v8, ft1");
@@ -1394,9 +1395,9 @@ void bli_cgemm_6m2
     __asm__("vfmul.vf v30, v12, ft1");
 
     __asm__("vfmsub.vf v8, ft0, v24");
-    __asm__("vfmadd.vf v10, ft0, v26"); 
+    __asm__("vfmadd.vf v10, ft0, v26");
     __asm__("vfmsub.vf v12, ft0, v28");
-    __asm__("vfmadd.vf v14, ft0, v30"); 
+    __asm__("vfmadd.vf v14, ft0, v30");
 
     __asm__("vfmul.vf v24, v18, ft1");
     __asm__("vfmul.vf v26, v16, ft1");
@@ -1404,9 +1405,9 @@ void bli_cgemm_6m2
     __asm__("vfmul.vf v30, v20, ft1");
 
     __asm__("vfmsub.vf v16, ft0, v24");
-    __asm__("vfmadd.vf v18, ft0, v26"); 
+    __asm__("vfmadd.vf v18, ft0, v26");
     __asm__("vfmsub.vf v20, ft0, v28");
-    __asm__("vfmadd.vf v22, ft0, v30"); 
+    __asm__("vfmadd.vf v22, ft0, v30");
 
     scomplex beta_cast = *beta;
     if (beta_cast.real != 0.f || beta_cast.imag != 0.f) {
@@ -1621,28 +1622,28 @@ void bli_cgemm_6m2_cleanup
             __asm__("vfmul.vf v24, v18, ft1");
             __asm__("vfmul.vf v26, v16, ft1");
             __asm__("vfmsub.vf v16, ft0, v24");
-            __asm__("vfmadd.vf v18, ft0, v26"); 
+            __asm__("vfmadd.vf v18, ft0, v26");
         case 4:
             __asm__("vfmul.vf v28, v14, ft1");
             __asm__("vfmul.vf v30, v12, ft1");
             __asm__("vfmsub.vf v12, ft0, v28");
-            __asm__("vfmadd.vf v14, ft0, v30"); 
+            __asm__("vfmadd.vf v14, ft0, v30");
         case 3:
             __asm__("vfmul.vf v24, v10, ft1");
             __asm__("vfmul.vf v26, v8, ft1");
             __asm__("vfmsub.vf v8, ft0, v24");
-            __asm__("vfmadd.vf v10, ft0, v26"); 
+            __asm__("vfmadd.vf v10, ft0, v26");
         case 2:
             __asm__("vfmul.vf v28, v6, ft1");
             __asm__("vfmul.vf v30, v4, ft1");
             __asm__("vfmsub.vf v4, ft0, v28");
-            __asm__("vfmadd.vf v6, ft0, v30"); 
+            __asm__("vfmadd.vf v6, ft0, v30");
         case 1:
             __asm__("vfmul.vf v24, v2, ft1");
             __asm__("vfmul.vf v26, v0, ft1");
             __asm__("vfmsub.vf v0, ft0, v24");
-            __asm__("vfmadd.vf v2, ft0, v26"); 
-    } 
+            __asm__("vfmadd.vf v2, ft0, v26");
+    }
 
     scomplex beta_cast = *beta;
     if (beta_cast.real != 0.f || beta_cast.imag != 0.f) {
@@ -1791,11 +1792,11 @@ void bli_cgemm_sifive_x280_asm_6m2
        const void*      restrict b_,
        const void*      restrict beta_,
              void*      restrict c_, inc_t rsc, inc_t csc,
-             auxinfo_t* restrict data,
+       const auxinfo_t* restrict data,
        const cntx_t*    restrict cntx
      )
 {
-    // M x N x K cgemm 
+    // M x N x K cgemm
     (void) data;
     (void) cntx;
     const scomplex* restrict alpha = alpha_;
@@ -1832,7 +1833,7 @@ void bli_cgemm_sifive_x280_asm_6m2
 #define VSSEG2 "vsseg2e64.v "
 #define VSSSEG2 "vssseg2e64.v "
 #define PACKMR 8
-#define PACKNR 16 
+#define PACKNR 16
 
 void bli_zgemm_6m2
      (
@@ -1964,9 +1965,9 @@ void bli_zgemm_6m2
     __asm__("vfmul.vf v30, v4, ft1");
 
     __asm__("vfmsub.vf v0, ft0, v24");
-    __asm__("vfmadd.vf v2, ft0, v26"); 
+    __asm__("vfmadd.vf v2, ft0, v26");
     __asm__("vfmsub.vf v4, ft0, v28");
-    __asm__("vfmadd.vf v6, ft0, v30"); 
+    __asm__("vfmadd.vf v6, ft0, v30");
 
     __asm__("vfmul.vf v24, v10, ft1");
     __asm__("vfmul.vf v26, v8, ft1");
@@ -1974,9 +1975,9 @@ void bli_zgemm_6m2
     __asm__("vfmul.vf v30, v12, ft1");
 
     __asm__("vfmsub.vf v8, ft0, v24");
-    __asm__("vfmadd.vf v10, ft0, v26"); 
+    __asm__("vfmadd.vf v10, ft0, v26");
     __asm__("vfmsub.vf v12, ft0, v28");
-    __asm__("vfmadd.vf v14, ft0, v30"); 
+    __asm__("vfmadd.vf v14, ft0, v30");
 
     __asm__("vfmul.vf v24, v18, ft1");
     __asm__("vfmul.vf v26, v16, ft1");
@@ -1984,9 +1985,9 @@ void bli_zgemm_6m2
     __asm__("vfmul.vf v30, v20, ft1");
 
     __asm__("vfmsub.vf v16, ft0, v24");
-    __asm__("vfmadd.vf v18, ft0, v26"); 
+    __asm__("vfmadd.vf v18, ft0, v26");
     __asm__("vfmsub.vf v20, ft0, v28");
-    __asm__("vfmadd.vf v22, ft0, v30"); 
+    __asm__("vfmadd.vf v22, ft0, v30");
 
     dcomplex beta_cast = *beta;
     if (beta_cast.real != 0. || beta_cast.imag != 0.) {
@@ -2201,28 +2202,28 @@ void bli_zgemm_6m2_cleanup
             __asm__("vfmul.vf v24, v18, ft1");
             __asm__("vfmul.vf v26, v16, ft1");
             __asm__("vfmsub.vf v16, ft0, v24");
-            __asm__("vfmadd.vf v18, ft0, v26"); 
+            __asm__("vfmadd.vf v18, ft0, v26");
         case 4:
             __asm__("vfmul.vf v28, v14, ft1");
             __asm__("vfmul.vf v30, v12, ft1");
             __asm__("vfmsub.vf v12, ft0, v28");
-            __asm__("vfmadd.vf v14, ft0, v30"); 
+            __asm__("vfmadd.vf v14, ft0, v30");
         case 3:
             __asm__("vfmul.vf v24, v10, ft1");
             __asm__("vfmul.vf v26, v8, ft1");
             __asm__("vfmsub.vf v8, ft0, v24");
-            __asm__("vfmadd.vf v10, ft0, v26"); 
+            __asm__("vfmadd.vf v10, ft0, v26");
         case 2:
             __asm__("vfmul.vf v28, v6, ft1");
             __asm__("vfmul.vf v30, v4, ft1");
             __asm__("vfmsub.vf v4, ft0, v28");
-            __asm__("vfmadd.vf v6, ft0, v30"); 
+            __asm__("vfmadd.vf v6, ft0, v30");
         case 1:
             __asm__("vfmul.vf v24, v2, ft1");
             __asm__("vfmul.vf v26, v0, ft1");
             __asm__("vfmsub.vf v0, ft0, v24");
-            __asm__("vfmadd.vf v2, ft0, v26"); 
-    } 
+            __asm__("vfmadd.vf v2, ft0, v26");
+    }
 
     dcomplex beta_cast = *beta;
     if (beta_cast.real != 0. || beta_cast.imag != 0.) {
@@ -2371,11 +2372,11 @@ void bli_zgemm_sifive_x280_asm_6m2
        const void*      restrict b_,
        const void*      restrict beta_,
              void*      restrict c_, inc_t rsc, inc_t csc,
-             auxinfo_t* restrict data,
+       const auxinfo_t* restrict data,
        const cntx_t*    restrict cntx
      )
 {
-    // M x N x K zgemm 
+    // M x N x K zgemm
     (void) data;
     (void) cntx;
     const dcomplex* restrict alpha = alpha_;
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c
index 4323f8fbf..7cb8d9e07 100644
--- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c
@@ -35,6 +35,7 @@
 // clang-format off
 #include "blis.h"
 #include "../../riscv_cmul_macros_asm.h"
+#include "../../bli_kernels_sifive_x280.h"
 #include <stdint.h>
 #include <riscv_vector.h>
 
@@ -50,7 +51,7 @@
           T*         restrict c11_,   \
           inc_t               rsc,    \
           inc_t               csc,    \
-          auxinfo_t* restrict data,   \
+    const auxinfo_t* restrict data,   \
     const cntx_t*    restrict cntx    \
     )
 
@@ -66,7 +67,7 @@
           T*         restrict c11_,   \
           inc_t               rsc,    \
           inc_t               csc,    \
-          auxinfo_t* restrict data,   \
+    const auxinfo_t* restrict data,   \
     const cntx_t*    restrict cntx    \
     )
 
diff --git a/kernels/sifive_x280/bli_kernels_sifive_x280.h b/kernels/sifive_x280/bli_kernels_sifive_x280.h
index 425c7dad9..0ee01041e 100644
--- a/kernels/sifive_x280/bli_kernels_sifive_x280.h
+++ b/kernels/sifive_x280/bli_kernels_sifive_x280.h
@@ -135,14 +135,16 @@ DOTXAXPYF_KER_PROT(scomplex,c, dotxaxpyf_sifive_x280_asm)
 DOTXAXPYF_KER_PROT(dcomplex,z, dotxaxpyf_sifive_x280_asm)
 
 // Level 1m
-PACKM_KER_PROT(float,       s, packm_sifive_x280_asm_7xk)
-PACKM_KER_PROT(double,      d, packm_sifive_x280_asm_7xk)
-PACKM_KER_PROT(scomplex,    c, packm_sifive_x280_asm_6xk)
-PACKM_KER_PROT(dcomplex,    z, packm_sifive_x280_asm_6xk)
-PACKM_KER_PROT(float,       s, packm_sifive_x280_asm_64xk)
-PACKM_KER_PROT(double,      d, packm_sifive_x280_asm_32xk)
-PACKM_KER_PROT(scomplex,    c, packm_sifive_x280_asm_32xk)
-PACKM_KER_PROT(dcomplex,    z, packm_sifive_x280_asm_16xk)
+PACKM_KER_PROT(float,       s, packm_sifive_x280_asm_7m4)
+PACKM_KER_PROT(double,      d, packm_sifive_x280_asm_7m4)
+PACKM_KER_PROT(scomplex,    c, packm_sifive_x280_asm_6m2)
+PACKM_KER_PROT(dcomplex,    z, packm_sifive_x280_asm_6m2)
+
+// Reference 1m
+PACKM_KER_PROT(float,       ss, packm_sifive_x280_ref)
+PACKM_KER_PROT(double,      dd, packm_sifive_x280_ref)
+PACKM_KER_PROT(scomplex,    cc, packm_sifive_x280_ref)
+PACKM_KER_PROT(dcomplex,    zz, packm_sifive_x280_ref)
 
 // Level 3
 GEMM_UKR_PROT(float,        s, gemm_sifive_x280_asm_7m4)
diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
index 58fb8b78b..ae99e7141 100644
--- a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
+++ b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
@@ -299,7 +299,7 @@ void bli_dgemm_skx_asm_16x12_l2
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c_, inc_t cs_c_,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c
index 9f5f42074..81e4cb63c 100644
--- a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c
+++ b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c
@@ -165,7 +165,7 @@ void bli_dgemm_skx_asm_16x14
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c_, inc_t cs_c_,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
index d8a4637ce..e0fd372c1 100644
--- a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
+++ b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
@@ -329,7 +329,7 @@ void bli_sgemm_skx_asm_32x12_l2
        const void*      b,
        const void*      beta,
              void*      c, inc_t rs_c_, inc_t cs_c_,
-             auxinfo_t* data,
+       const auxinfo_t* data,
        const cntx_t*    cntx
      )
 {
diff --git a/kernels/zen/1/bli_amaxv_zen_int.c b/kernels/zen/1/bli_amaxv_zen_int.c
index 79c608f39..028e4d6ba 100644
--- a/kernels/zen/1/bli_amaxv_zen_int.c
+++ b/kernels/zen/1/bli_amaxv_zen_int.c
@@ -462,7 +462,7 @@ void PASTEMAC(ch,varname) \
 		for ( i = 0; i < n; ++i ) \
 		{ \
 			/* Get the real and imaginary components of chi1. */ \
-			PASTEMAC2(ch,chr,gets)( x[i], chi1_r, chi1_i ); \
+			PASTEMAC(ch,chr,gets)( x[i], chi1_r, chi1_i ); \
 \
 			/* Replace chi1_r and chi1_i with their absolute values. */ \
 			PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \
@@ -492,7 +492,7 @@ void PASTEMAC(ch,varname) \
 			ctype* chi1 = x + (i  )*incx; \
 \
 			/* Get the real and imaginary components of chi1. */ \
-			PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
+			PASTEMAC(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
 \
 			/* Replace chi1_r and chi1_i with their absolute values. */ \
 			PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \
diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8.c
index 03c1627f1..17e8b401b 100644
--- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8.c
+++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8.c
@@ -353,7 +353,7 @@ void bli_cgemmsup_rv_zen_asm_2x8
 	vmulps(ymm1, ymm3, ymm3)
 	vaddsubps(ymm3, ymm9, ymm9)
 
-	/* (�r + �i)x C + ((ar + ai) x AB) */
+	/* (�r + �i)x C + ((ar + ai) x AB) */
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastss(mem(rbx), ymm1)       // load beta_r and duplicate
 	vbroadcastss(mem(rbx, 4), ymm2)    // load beta_i and duplicate
@@ -417,7 +417,7 @@ void bli_cgemmsup_rv_zen_asm_2x8
 	CGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddps(ymm4, ymm0, ymm4)
 	add(rdi, rcx)
-	
+
 	CGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddps(ymm8, ymm0, ymm8)
 	add(rdi, rcx)
@@ -430,7 +430,7 @@ void bli_cgemmsup_rv_zen_asm_2x8
 
 	CGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddps(ymm9, ymm0, ymm9)
-	add(rdi, rcx) 
+	add(rdi, rcx)
 
 	mov(r12, rcx)                      // reset rcx to current utile of c.
 	vunpcklpd(ymm8, ymm4, ymm0)        //a0a1b0b1 a4a4b4b5 //gamma00-10 gamma02-12
@@ -505,8 +505,8 @@ void bli_cgemmsup_rv_zen_asm_2x8
 	lea(mem(rcx, rsi, 1), rcx)
 
 	/******************Transpose bottom tile 4x2***************************/
-	vunpcklpd(ymm9, ymm5, ymm0)  //a8a9b8b9     a12a13b12b13 
-	vunpckhpd(ymm9, ymm5, ymm2)  //a10a11b10b11 a14a15b14b15 
+	vunpcklpd(ymm9, ymm5, ymm0)  //a8a9b8b9     a12a13b12b13
+	vunpckhpd(ymm9, ymm5, ymm2)  //a10a11b10b11 a14a15b14b15
 
 	vmovups(xmm0, mem(rcx))
 	lea(mem(rcx, rsi, 1), rcx)
@@ -763,7 +763,7 @@ void bli_cgemmsup_rv_zen_asm_1x8
 	vmulps(ymm1, ymm3, ymm3)
 	vaddsubps(ymm3, ymm5, ymm5)
 
-	/* (�r + �i)x C + ((ar + ai) x AB) */
+	/* (�r + �i)x C + ((ar + ai) x AB) */
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate
 	vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate
@@ -1151,7 +1151,7 @@ void bli_cgemmsup_rv_zen_asm_2x4
 	vaddsubps(ymm3, ymm8, ymm8)
 
 
-	/* (�r + �i)x C + ((ar + ai) x AB) */
+	/* (�r + �i)x C + ((ar + ai) x AB) */
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate
 	vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate
@@ -1198,7 +1198,7 @@ void bli_cgemmsup_rv_zen_asm_2x4
 	CGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddps(ymm4, ymm0, ymm4)
 	add(rdi, rcx)
-	
+
 	CGEMM_INPUT_SCALE_CS_BETA_NZ
 	vaddps(ymm8, ymm0, ymm8)
 
@@ -1492,7 +1492,7 @@ void bli_cgemmsup_rv_zen_asm_1x4
 	vmulps(ymm1, ymm3, ymm3)
 	vaddsubps(ymm3, ymm4, ymm4)
 
-	/* (�r + �i)x C + ((ar + ai) x AB) */
+	/* (�r + �i)x C + ((ar + ai) x AB) */
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate
 	vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate
@@ -1848,7 +1848,7 @@ void bli_cgemmsup_rv_zen_asm_2x2
 	vaddsubps(xmm3, xmm8, xmm8)
 
 
-	/* (�r + �i)x C + ((ar + ai) x AB) */
+	/* (�r + �i)x C + ((ar + ai) x AB) */
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastss(mem(rbx), xmm1) // load beta_r and duplicate
 	vbroadcastss(mem(rbx, 4), xmm2) // load beta_i and duplicate
@@ -1913,7 +1913,7 @@ void bli_cgemmsup_rv_zen_asm_2x2
 	CGEMM_INPUT_SCALE_CS_BETA_NZ_128
 	vaddps(xmm4, xmm0, xmm4)
 	add(rdi, rcx)
-	
+
 	CGEMM_INPUT_SCALE_CS_BETA_NZ_128
 	vaddps(xmm8, xmm0, xmm8)
 
@@ -2033,7 +2033,7 @@ void bli_cgemmsup_rv_zen_asm_1x2
 
 	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
 	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
-	
+
 	mov(var(rs_b), r10)                // load rs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(dt)
 
@@ -2190,7 +2190,7 @@ void bli_cgemmsup_rv_zen_asm_1x2
 	vaddsubps(xmm3, xmm4, xmm4)
 
 
-	/* (�r + �i)x C + ((ar + ai) x AB) */
+	/* (�r + �i)x C + ((ar + ai) x AB) */
 	mov(var(beta), rbx) // load address of beta
 	vbroadcastss(mem(rbx), xmm1) // load beta_r and duplicate
 	vbroadcastss(mem(rbx, 4), xmm2) // load beta_i and duplicate
diff --git a/ref_kernels/1/bli_addv_ref.c b/ref_kernels/1/bli_addv_ref.c
index e6f9c8a65..c0ef4bda7 100644
--- a/ref_kernels/1/bli_addv_ref.c
+++ b/ref_kernels/1/bli_addv_ref.c
@@ -37,7 +37,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              dim_t   n, \
diff --git a/ref_kernels/1/bli_amaxv_ref.c b/ref_kernels/1/bli_amaxv_ref.c
index 93591e202..87ef63225 100644
--- a/ref_kernels/1/bli_amaxv_ref.c
+++ b/ref_kernels/1/bli_amaxv_ref.c
@@ -41,7 +41,7 @@
 #undef  GENTFUNCR
 #define GENTFUNCR( ctype, ctype_r, ch, chr, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              dim_t   n, \
        const void*   x0, inc_t incx, \
@@ -83,7 +83,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		for ( dim_t i = 0; i < n; ++i ) \
 		{ \
 			/* Get the real and imaginary components of chi1. */ \
-			PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
+			PASTEMAC(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
 \
 			/* Replace chi1_r and chi1_i with their absolute values. */ \
 			PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \
@@ -115,7 +115,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			const ctype* restrict chi1 = x + (i  )*incx; \
 \
 			/* Get the real and imaginary components of chi1. */ \
-			PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
+			PASTEMAC(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
 \
 			/* Replace chi1_r and chi1_i with their absolute values. */ \
 			PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \
diff --git a/ref_kernels/1/bli_axpbyv_ref.c b/ref_kernels/1/bli_axpbyv_ref.c
index 92cee8bd9..1c265c819 100644
--- a/ref_kernels/1/bli_axpbyv_ref.c
+++ b/ref_kernels/1/bli_axpbyv_ref.c
@@ -37,7 +37,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              dim_t   n, \
diff --git a/ref_kernels/1/bli_axpyv_ref.c b/ref_kernels/1/bli_axpyv_ref.c
index 663b7dc43..f9ca0fb9d 100644
--- a/ref_kernels/1/bli_axpyv_ref.c
+++ b/ref_kernels/1/bli_axpyv_ref.c
@@ -37,7 +37,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              dim_t   n, \
diff --git a/ref_kernels/1/bli_copyv_ref.c b/ref_kernels/1/bli_copyv_ref.c
index 78af7451c..0f35f5167 100644
--- a/ref_kernels/1/bli_copyv_ref.c
+++ b/ref_kernels/1/bli_copyv_ref.c
@@ -37,7 +37,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              dim_t   n, \
diff --git a/ref_kernels/1/bli_dotv_ref.c b/ref_kernels/1/bli_dotv_ref.c
index b972e1cae..18a195ca2 100644
--- a/ref_kernels/1/bli_dotv_ref.c
+++ b/ref_kernels/1/bli_dotv_ref.c
@@ -37,7 +37,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              conj_t  conjy, \
diff --git a/ref_kernels/1/bli_dotxv_ref.c b/ref_kernels/1/bli_dotxv_ref.c
index cb9d2a521..8fe116001 100644
--- a/ref_kernels/1/bli_dotxv_ref.c
+++ b/ref_kernels/1/bli_dotxv_ref.c
@@ -37,7 +37,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              conj_t  conjy, \
diff --git a/ref_kernels/1/bli_invertv_ref.c b/ref_kernels/1/bli_invertv_ref.c
index 7710de638..1cea1c61a 100644
--- a/ref_kernels/1/bli_invertv_ref.c
+++ b/ref_kernels/1/bli_invertv_ref.c
@@ -37,7 +37,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              dim_t   n, \
              void*   x0, inc_t incx, \
diff --git a/ref_kernels/1/bli_invscalv_ref.c b/ref_kernels/1/bli_invscalv_ref.c
index d48473b3a..914c89174 100644
--- a/ref_kernels/1/bli_invscalv_ref.c
+++ b/ref_kernels/1/bli_invscalv_ref.c
@@ -37,7 +37,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conjalpha, \
              dim_t   n, \
diff --git a/ref_kernels/1/bli_scal2v_ref.c b/ref_kernels/1/bli_scal2v_ref.c
index f35504dd0..4b96f5659 100644
--- a/ref_kernels/1/bli_scal2v_ref.c
+++ b/ref_kernels/1/bli_scal2v_ref.c
@@ -37,7 +37,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              dim_t   n, \
diff --git a/ref_kernels/1/bli_scalv_ref.c b/ref_kernels/1/bli_scalv_ref.c
index 559189a62..8e9a1ec98 100644
--- a/ref_kernels/1/bli_scalv_ref.c
+++ b/ref_kernels/1/bli_scalv_ref.c
@@ -37,7 +37,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conjalpha, \
              dim_t   n, \
diff --git a/ref_kernels/1/bli_setv_ref.c b/ref_kernels/1/bli_setv_ref.c
index f3669aac9..8d945f618 100644
--- a/ref_kernels/1/bli_setv_ref.c
+++ b/ref_kernels/1/bli_setv_ref.c
@@ -37,7 +37,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conjalpha, \
              dim_t   n, \
diff --git a/ref_kernels/1/bli_subv_ref.c b/ref_kernels/1/bli_subv_ref.c
index 3820b8051..d43d96033 100644
--- a/ref_kernels/1/bli_subv_ref.c
+++ b/ref_kernels/1/bli_subv_ref.c
@@ -37,7 +37,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              dim_t   n, \
diff --git a/ref_kernels/1/bli_swapv_ref.c b/ref_kernels/1/bli_swapv_ref.c
index b586ebca2..f01d0d09f 100644
--- a/ref_kernels/1/bli_swapv_ref.c
+++ b/ref_kernels/1/bli_swapv_ref.c
@@ -37,7 +37,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              dim_t   n, \
              void*   x0, inc_t incx, \
diff --git a/ref_kernels/1/bli_xpbyv_ref.c b/ref_kernels/1/bli_xpbyv_ref.c
index 51ef6c4ff..02c0cd14d 100644
--- a/ref_kernels/1/bli_xpbyv_ref.c
+++ b/ref_kernels/1/bli_xpbyv_ref.c
@@ -37,7 +37,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              dim_t   n, \
diff --git a/ref_kernels/1f/bli_axpy2v_ref.c b/ref_kernels/1f/bli_axpy2v_ref.c
index 3932a453f..8b5b2cbbb 100644
--- a/ref_kernels/1f/bli_axpy2v_ref.c
+++ b/ref_kernels/1f/bli_axpy2v_ref.c
@@ -38,7 +38,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conjx, \
              conj_t  conjy, \
diff --git a/ref_kernels/1f/bli_axpyf_ref.c b/ref_kernels/1f/bli_axpyf_ref.c
index f4f137bb3..233c64fc2 100644
--- a/ref_kernels/1f/bli_axpyf_ref.c
+++ b/ref_kernels/1f/bli_axpyf_ref.c
@@ -38,7 +38,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf, ff ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conja, \
              conj_t  conjx, \
diff --git a/ref_kernels/1f/bli_dotaxpyv_ref.c b/ref_kernels/1f/bli_dotaxpyv_ref.c
index b48cf616d..fe558ba3a 100644
--- a/ref_kernels/1f/bli_dotaxpyv_ref.c
+++ b/ref_kernels/1f/bli_dotaxpyv_ref.c
@@ -38,7 +38,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conjxt, \
              conj_t  conjx, \
diff --git a/ref_kernels/1f/bli_dotxaxpyf_ref.c b/ref_kernels/1f/bli_dotxaxpyf_ref.c
index 2c69ffb88..6cfa5168c 100644
--- a/ref_kernels/1f/bli_dotxaxpyf_ref.c
+++ b/ref_kernels/1f/bli_dotxaxpyf_ref.c
@@ -38,7 +38,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf, ff ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conjat, \
              conj_t  conja, \
diff --git a/ref_kernels/1f/bli_dotxf_ref.c b/ref_kernels/1f/bli_dotxf_ref.c
index b80b7154e..0f4cda2b8 100644
--- a/ref_kernels/1f/bli_dotxf_ref.c
+++ b/ref_kernels/1f/bli_dotxf_ref.c
@@ -38,7 +38,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf, ff ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t  conjat, \
              conj_t  conjx, \
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
index ae3ae685c..fad987c4b 100644
--- a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
+++ b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
@@ -35,45 +35,49 @@
 #include "blis.h"
 
 
-#define PACKM_SET1_1E( chr, mnk ) \
+#define PACKM_SET_1E( chp_r, val_r, val_i, mnk ) \
 do { \
-	PASTEMAC(chr,set1s)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \
-	PASTEMAC(chr,set0s)( *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \
-	PASTEMAC(chr,set0s)( *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \
-	PASTEMAC(chr,set1s)( *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \
+	PASTEMAC(chp_r,copys)(  val_r, *(pi1_ri + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \
+	PASTEMAC(chp_r,copys)(  val_i, *(pi1_ri + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \
+	PASTEMAC(chp_r,copys)( -val_i, *(pi1_ir + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \
+	PASTEMAC(chp_r,copys)(  val_r, *(pi1_ir + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \
 } while (0)
 
 
-#define PACKM_SET1_1R( chr, mnk ) \
+#define PACKM_SET_1R( chp_r, val_r, val_i, mnk ) \
 do { \
-	PASTEMAC(chr,set1s)( *(pi1_r + mnk*dfac + d + mnk*ldp2) ); \
-	PASTEMAC(chr,set0s)( *(pi1_i + mnk*dfac + d + mnk*ldp2) ); \
+	PASTEMAC(chp_r,copys)( val_r, *(pi1_r + mnk*cdim_bcast + d + mnk*ldp2) ); \
+	PASTEMAC(chp_r,copys)( val_i, *(pi1_i + mnk*cdim_bcast + d + mnk*ldp2) ); \
 } while (0)
 
 
-#define PACKM_SCAL_1E( ch, mn, k, op ) \
+#define PACKM_SCAL_1E( ctypep_r, cha, chp, mn, k, op ) \
 do { \
-	PASTEMAC(ch,op)(  kappa_r, kappa_i, *(alpha1 +  mn       *inca2 + 0 + k*lda2), \
-	                                    *(alpha1 +  mn       *inca2 + 1 + k*lda2), \
-	                                    *(pi1_ri + (mn*2 + 0)*dfac  + d + k*ldp2), \
-	                                    *(pi1_ri + (mn*2 + 1)*dfac  + d + k*ldp2) ); \
-	PASTEMAC(ch,op)( -kappa_i, kappa_r, *(alpha1 +  mn       *inca2 + 0 + k*lda2), \
-	                                    *(alpha1 +  mn       *inca2 + 1 + k*lda2), \
-	                                    *(pi1_ir + (mn*2 + 0)*dfac  + d + k*ldp2), \
-	                                    *(pi1_ir + (mn*2 + 1)*dfac  + d + k*ldp2) ); \
+	ctypep_r alpha_r, alpha_i, ka_r, ka_i; \
+	PASTEMAC(cha,chp,copyris)( *(alpha1 +  mn       *inca2       + 0 + k*lda2), \
+	                            *(alpha1 +  mn       *inca2       + 1 + k*lda2), \
+	                            alpha_r, alpha_i ); \
+	PASTEMAC(chp,op)( kappa_r, kappa_i, alpha_r, alpha_i, ka_r, ka_i ); \
+	PASTEMAC(chp,copyris)(  ka_r, ka_i, *(pi1_ri + (mn*2 + 0)*cdim_bcast  + d + k*ldp2), \
+	                                    *(pi1_ri + (mn*2 + 1)*cdim_bcast  + d + k*ldp2) ); \
+	PASTEMAC(chp,copyris)( -ka_i, ka_r, *(pi1_ir + (mn*2 + 0)*cdim_bcast  + d + k*ldp2), \
+	                                    *(pi1_ir + (mn*2 + 1)*cdim_bcast  + d + k*ldp2) ); \
 } while (0)
 
 
-#define PACKM_SCAL_1R( ch, mn, k, op ) \
+#define PACKM_SCAL_1R( ctypep_r, cha, chp, mn, k, op ) \
 do { \
-	PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0 + k*lda2), \
-	                                   *(alpha1 + mn*inca2 + 1 + k*lda2), \
-	                                   *(pi1_r  + mn*dfac  + d + k*ldp2), \
-	                                   *(pi1_i  + mn*dfac  + d + k*ldp2) ); \
+	ctypep_r alpha_r, alpha_i, ka_r, ka_i; \
+	PASTEMAC(cha,chp,copyris)( *(alpha1 +  mn       *inca2       + 0 + k*lda2), \
+	                            *(alpha1 +  mn       *inca2       + 1 + k*lda2), \
+	                            alpha_r, alpha_i ); \
+	PASTEMAC(chp,op)( kappa_r, kappa_i, alpha_r, alpha_i, ka_r, ka_i ); \
+	PASTEMAC(chp,copyris)( ka_r, ka_i, *(pi1_r  + mn*cdim_bcast  + d + k*ldp2), \
+	                                   *(pi1_i  + mn*cdim_bcast  + d + k*ldp2) ); \
 } while (0)
 
 
-#define PACKM_DIAG_1E_BODY( ch, mn_min, mn_max, inca2_lu, lda2_lu, op ) \
+#define PACKM_DIAG_1E_BODY( ctypep_r, cha, chp, mn_min, mn_max, inca2_lu, lda2_lu, op ) \
 \
 do \
 { \
@@ -82,19 +86,19 @@ do \
 	dim_t lda2 = lda2_lu; \
 	for ( dim_t k = 0; k < cdim; k++ ) \
 	for ( dim_t mn = mn_min; mn < mn_max; mn++ ) \
-	for ( dim_t d = 0; d < dfac; d++ ) \
-		PACKM_SCAL_1E( ch, mn, k, op ); \
+	for ( dim_t d = 0; d < cdim_bcast; d++ ) \
+		PACKM_SCAL_1E( ctypep_r, cha, chp, mn, k, op ); \
 } while(0)
 
 
-#define PACKM_DIAG_BODY_1E_L( ch, op ) \
-	PACKM_DIAG_1E_BODY( ch, k+1, cdim, inca_l2, lda_l2, op )
+#define PACKM_DIAG_BODY_1E_L( ctypep_r, cha, chp, op ) \
+	PACKM_DIAG_1E_BODY( ctypep_r, cha, chp, k+1, cdim, inca_l2, lda_l2, op )
 
-#define PACKM_DIAG_BODY_1E_U( ch, op ) \
-	PACKM_DIAG_1E_BODY( ch, 0, k, inca_u2, lda_u2, op )
+#define PACKM_DIAG_BODY_1E_U( ctypep_r, cha, chp, op ) \
+	PACKM_DIAG_1E_BODY( ctypep_r, cha, chp, 0, k, inca_u2, lda_u2, op )
 
 
-#define PACKM_DIAG_1R_BODY( ch, mn_min, mn_max, inca2_lu, lda2_lu, op ) \
+#define PACKM_DIAG_1R_BODY( ctypep_r, cha, chp, mn_min, mn_max, inca2_lu, lda2_lu, op ) \
 \
 do \
 { \
@@ -103,22 +107,22 @@ do \
 	dim_t lda2 = lda2_lu; \
 	for ( dim_t k = 0; k < cdim; k++ ) \
 	for ( dim_t mn = mn_min; mn < mn_max; mn++ ) \
-	for ( dim_t d = 0; d < dfac; d++ ) \
-		PACKM_SCAL_1R( ch, mn, k, op ); \
+	for ( dim_t d = 0; d < cdim_bcast; d++ ) \
+		PACKM_SCAL_1R( ctypep_r, cha, chp, mn, k, op ); \
 } while(0)
 
 
-#define PACKM_DIAG_BODY_1R_L( ch, op ) \
-	PACKM_DIAG_1R_BODY( ch, k+1, cdim, inca_l2, lda_l2, op )
+#define PACKM_DIAG_BODY_1R_L( ctypep_r, cha, chp, op ) \
+	PACKM_DIAG_1R_BODY( ctypep_r, cha, chp, k+1, cdim, inca_l2, lda_l2, op )
 
-#define PACKM_DIAG_BODY_1R_U( ch, op ) \
-	PACKM_DIAG_1R_BODY( ch, 0, k, inca_u2, lda_u2, op )
+#define PACKM_DIAG_BODY_1R_U( ctypep_r, cha, chp, op ) \
+	PACKM_DIAG_1R_BODY( ctypep_r, cha, chp, 0, k, inca_u2, lda_u2, op )
 
 
-#undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr0, bb0, arch, suf ) \
+#undef  GENTFUNC2R
+#define GENTFUNC2R( ctypea, ctypea_r, cha, cha_r, ctypep, ctypep_r, chp, chp_r, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(cha,chp,opname,arch,suf) \
      ( \
              struc_t struca, \
              diag_t  diaga, \
@@ -127,39 +131,38 @@ void PASTEMAC3(ch,opname,arch,suf) \
              pack_t  schema, \
              bool    invdiag, \
              dim_t   cdim, \
+             dim_t   cdim_max, \
+             dim_t   cdim_bcast, \
              dim_t   n_max, \
        const void*   kappa, \
        const void*   a, inc_t inca, inc_t lda, \
              void*   p,             inc_t ldp, \
+       const void*   params, \
        const cntx_t* cntx \
      ) \
 { \
-	const num_t dt_r      = PASTEMAC(chr,type); \
-	const dim_t cdim_pack = bli_cntx_get_blksz_max_dt( dt_r, mnr0, cntx ); \
-	const dim_t dfac      = bli_cntx_get_blksz_def_dt( dt_r, bb0, cntx ); \
+	const inc_t inca2 = 2 * inca; \
+	const inc_t lda2  = 2 * lda; \
+	const inc_t ldp2  = 2 * ldp; \
 \
-	/* start by zeroing out the whole block */ \
-	PASTEMAC(chr,set0s_mxn) \
-	( \
-	  cdim_pack, \
-	  2*n_max, \
-	  ( ctype_r* )p, 1, ldp  \
-	); \
-\
-	const inc_t       inca2   = 2 * inca; \
-	const inc_t       lda2    = 2 * lda; \
-	const inc_t       ldp2    = 2 * ldp; \
-\
-	      ctype_r           kappa_r = ( ( ctype_r* )kappa )[0]; \
-	      ctype_r           kappa_i = ( ( ctype_r* )kappa )[1]; \
-	const ctype_r* restrict alpha1  = ( const ctype_r* )a; \
+	      ctypep_r           kappa_r = ( ( ctypep_r* )kappa )[0]; \
+	      ctypep_r           kappa_i = ( ( ctypep_r* )kappa )[1]; \
+	      ctypep_r           one     = *PASTEMAC(chp_r,1); \
+	      ctypep_r           zero    = *PASTEMAC(chp_r,0); \
+	const ctypea_r* restrict alpha1  = ( const ctypea_r* )a; \
 \
 	if ( bli_is_1e_packed( schema ) ) \
 	{ \
-		const dim_t       cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ) / 2; \
+		/* start by zeroing out the whole block */ \
+		PASTEMAC(chp_r,set0s_mxn) \
+		( \
+		  2*cdim_max, \
+		  2*n_max, \
+		  ( ctypep_r* )p, 1, ldp  \
+		); \
 \
-		ctype_r* restrict pi1_ri   = ( ctype_r* )p; \
-		ctype_r* restrict pi1_ir   = ( ctype_r* )p + ldp; \
+		ctypep_r* restrict pi1_ri   = ( ctypep_r* )p; \
+		ctypep_r* restrict pi1_ir   = ( ctypep_r* )p + ldp; \
 \
 		/* write the strictly lower part if it exists */ \
 		if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \
@@ -175,8 +178,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 				    bli_toggle_conj( &conja_l ); \
 			} \
 \
-			if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1E_L( ch, scal2jris ); \
-			else                          PACKM_DIAG_BODY_1E_L( ch, scal2ris ); \
+			if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1E_L( ctypep_r, cha, chp, scal2jris ); \
+			else                          PACKM_DIAG_BODY_1E_L( ctypep_r, cha, chp, scal2ris ); \
 		} \
 \
 		/* write the strictly upper part if it exists */ \
@@ -194,68 +197,75 @@ void PASTEMAC3(ch,opname,arch,suf) \
 				    bli_toggle_conj( &conja_u ); \
 			} \
 \
-			if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1E_U( ch, scal2jris ); \
-			else                          PACKM_DIAG_BODY_1E_U( ch, scal2ris ); \
+			if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1E_U( ctypep_r, cha, chp, scal2jris ); \
+			else                          PACKM_DIAG_BODY_1E_U( ctypep_r, cha, chp, scal2ris ); \
 		} \
 \
 		/* write the diagonal */ \
 		if ( bli_is_unit_diag( diaga ) ) \
 		{ \
 			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-			for ( dim_t d = 0; d < dfac; ++d ) \
-				PACKM_SET1_1E( chr, mnk ); \
+			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+				PACKM_SET_1E( chp_r, kappa_r, kappa_i, mnk ); \
 		} \
 		else if ( bli_is_hermitian( struca ) ) \
 		{ \
 			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-			for ( dim_t d = 0; d < dfac; ++d ) \
+			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
 			{ \
-				ctype_r mu_r = *(alpha1 + mnk*(inca2 + lda2)); \
-				PASTEMAC(chr,scal2s)(  kappa_r, mu_r, *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \
-				PASTEMAC(chr,scal2s)(  kappa_i, mu_r, *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \
-				PASTEMAC(chr,scal2s)( -kappa_i, mu_r, *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \
-				PASTEMAC(chr,scal2s)(  kappa_r, mu_r, *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \
+				ctypep_r alpha_r; \
+				PASTEMAC(cha_r,chp_r,copys)( *(alpha1 + mnk*(inca2 + lda2)), alpha_r ); \
+				PASTEMAC(chp_r,scal2s)(  kappa_r, alpha_r, *(pi1_ri + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \
+				PASTEMAC(chp_r,scal2s)(  kappa_i, alpha_r, *(pi1_ri + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \
+				PASTEMAC(chp_r,scal2s)( -kappa_i, alpha_r, *(pi1_ir + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \
+				PASTEMAC(chp_r,scal2s)(  kappa_r, alpha_r, *(pi1_ir + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \
 			} \
 		} \
 		else if ( bli_is_conj( conja )) \
 		{ \
 			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-			for ( dim_t d = 0; d < dfac; ++d ) \
-				PACKM_SCAL_1E( ch, mnk, mnk, scal2jris ); \
+			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+				PACKM_SCAL_1E( ctypep_r, cha, chp, mnk, mnk, scal2jris ); \
 		} \
 		else \
 		{ \
 			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-			for ( dim_t d = 0; d < dfac; ++d ) \
-				PACKM_SCAL_1E( ch, mnk, mnk, scal2ris ); \
+			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+				PACKM_SCAL_1E( ctypep_r, cha, chp, mnk, mnk, scal2ris ); \
 		} \
 \
 		/* invert the diagonal if requested */ \
 		if ( invdiag ) \
 		{ \
 			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-			for ( dim_t d = 0; d < dfac; ++d ) \
+			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
 			{ \
-				PASTEMAC(ch,invertris)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2), \
-				                        *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \
-				PASTEMAC(ch,copyjris)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2), \
-				                       *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2), \
-				                       *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2), \
-				                       *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \
+				PASTEMAC(chp,invertris)( *(pi1_ri + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2), \
+				                         *(pi1_ri + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \
+				PASTEMAC(chp,copyjris)( *(pi1_ri + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2), \
+				                        *(pi1_ri + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2), \
+				                        *(pi1_ir + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2), \
+				                        *(pi1_ir + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \
 			} \
 		} \
 \
 		/* if this an edge case in both directions, extend the diagonal with ones */ \
 		for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \
-		for ( dim_t d = 0; d < dfac; ++d ) \
-			PACKM_SET1_1E( chr, mnk ); \
+		for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+			PACKM_SET_1E( chp_r, one, zero, mnk ); \
 	} \
 	else /* bli_is_1r_packed( schema ) */ \
 	{ \
-		const dim_t       cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ); \
+		/* start by zeroing out the whole block */ \
+		PASTEMAC(chp_r,set0s_mxn) \
+		( \
+		  cdim_max, \
+		  2*n_max, \
+		  ( ctypep_r* )p, 1, ldp  \
+		); \
 \
-		ctype_r* restrict pi1_r    = ( ctype_r* )p; \
-		ctype_r* restrict pi1_i    = ( ctype_r* )p + ldp; \
+		ctypep_r* restrict pi1_r    = ( ctypep_r* )p; \
+		ctypep_r* restrict pi1_i    = ( ctypep_r* )p + ldp; \
 \
 		/* write the strictly lower part if it exists */ \
 		if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \
@@ -271,8 +281,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 				    bli_toggle_conj( &conja_l ); \
 			} \
 \
-			if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1R_L( ch, scal2jris ); \
-			else                          PACKM_DIAG_BODY_1R_L( ch, scal2ris ); \
+			if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1R_L( ctypep_r, cha, chp, scal2jris ); \
+			else                          PACKM_DIAG_BODY_1R_L( ctypep_r, cha, chp, scal2ris ); \
 		} \
 \
 		/* write the strictly upper part if it exists */ \
@@ -290,56 +300,58 @@ void PASTEMAC3(ch,opname,arch,suf) \
 				    bli_toggle_conj( &conja_u ); \
 			} \
 \
-			if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1R_U( ch, scal2jris ); \
-			else                          PACKM_DIAG_BODY_1R_U( ch, scal2ris ); \
+			if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1R_U( ctypep_r, cha, chp, scal2jris ); \
+			else                          PACKM_DIAG_BODY_1R_U( ctypep_r, cha, chp, scal2ris ); \
 		} \
 \
 		/* write the diagonal */ \
 		if ( bli_is_unit_diag( diaga ) ) \
 		{ \
 			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-			for ( dim_t d = 0; d < dfac; ++d ) \
-				PACKM_SET1_1R( chr, mnk ); \
+			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+				PACKM_SET_1R( chp_r, kappa_r, kappa_i, mnk ); \
 		} \
 		else if ( bli_is_hermitian( struca ) ) \
 		{ \
 			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-			for ( dim_t d = 0; d < dfac; ++d ) \
+			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
 			{ \
-				ctype_r mu_r = *(alpha1 + mnk*(inca2 + lda2)); \
-				PASTEMAC(chr,scal2s)( kappa_r, mu_r, *(pi1_r + mnk*(dfac + ldp2) + d) ); \
-				PASTEMAC(chr,scal2s)( kappa_i, mu_r, *(pi1_i + mnk*(dfac + ldp2) + d) ); \
+				ctypep_r alpha_r; \
+				PASTEMAC(cha_r,chp_r,copys)( *(alpha1 + mnk*(inca2 + lda2)), alpha_r ); \
+				PASTEMAC(chp_r,scal2s)( kappa_r, alpha_r, *(pi1_r + mnk*(cdim_bcast + ldp2) + d) ); \
+				PASTEMAC(chp_r,scal2s)( kappa_i, alpha_r, *(pi1_i + mnk*(cdim_bcast + ldp2) + d) ); \
 			} \
 		} \
 		else if ( bli_is_conj( conja ) ) \
 		{ \
 			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-			for ( dim_t d = 0; d < dfac; ++d ) \
-				PACKM_SCAL_1R( ch, mnk, mnk, scal2jris ); \
+			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+				PACKM_SCAL_1R( ctypep_r, cha, chp, mnk, mnk, scal2jris ); \
 		} \
 		else \
 		{ \
 			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-			for ( dim_t d = 0; d < dfac; ++d ) \
-				PACKM_SCAL_1R( ch, mnk, mnk, scal2ris ); \
+			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+				PACKM_SCAL_1R( ctypep_r, cha, chp, mnk, mnk, scal2ris ); \
 		} \
 \
 		/* invert the diagonal if requested */ \
 		if ( invdiag ) \
 		{ \
 			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-			for ( dim_t d = 0; d < dfac; ++d ) \
-				PASTEMAC(ch,invertris)( *(pi1_r + mnk*(dfac + ldp2) + d), \
-				                        *(pi1_i + mnk*(dfac + ldp2) + d) ); \
+			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+				PASTEMAC(chp,invertris)( *(pi1_r + mnk*(cdim_bcast + ldp2) + d), \
+				                         *(pi1_i + mnk*(cdim_bcast + ldp2) + d) ); \
 		} \
 \
 		/* if this an edge case in both directions, extend the diagonal with ones */ \
 		for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \
-		for ( dim_t d = 0; d < dfac; ++d ) \
-			PACKM_SET1_1R( chr, mnk ); \
+		for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+			PACKM_SET_1R( chp_r, one, zero, mnk ); \
 	} \
 }
 
-INSERT_GENTFUNCCO( packm_mrxmr_diag_1er, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-INSERT_GENTFUNCCO( packm_nrxnr_diag_1er, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-
+GENTFUNC2R( scomplex, float,  c, s, scomplex, float,  c, s, packm_diag_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+GENTFUNC2R( scomplex, float,  c, s, dcomplex, double, z, d, packm_diag_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+GENTFUNC2R( dcomplex, double, z, d, scomplex, float,  c, s, packm_diag_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+GENTFUNC2R( dcomplex, double, z, d, dcomplex, double, z, d, packm_diag_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ref.c
index f21f18861..635bb9900 100644
--- a/ref_kernels/1m/bli_packm_cxc_diag_ref.c
+++ b/ref_kernels/1m/bli_packm_cxc_diag_ref.c
@@ -35,28 +35,33 @@
 #include "blis.h"
 
 
-#define PACKM_DIAG_BODY( ctype, ch, mn_min, mn_max, inca, lda, op ) \
+#define PACKM_DIAG_BODY( ctypea, ctypep, cha, chp, mn_min, mn_max, dfac, inca, lda, op ) \
 \
 do \
 { \
 	for ( dim_t k = 0; k < cdim; k++ ) \
 	for ( dim_t mn = mn_min; mn < mn_max; mn++ ) \
-	for ( dim_t d = 0; d < dfac; d++ ) \
-		PASTEMAC(ch,op)( kappa_cast, *(alpha1 + mn*inca + k*lda), *(pi1 + mn*dfac + d + k*ldp) ); \
+	{ \
+		ctypep alpha_cast, kappa_alpha; \
+		PASTEMAC(cha,chp,copys)( *(alpha1 + mn*inca + k*lda), alpha_cast ); \
+		PASTEMAC(chp,op)( kappa_cast, alpha_cast, kappa_alpha ); \
+		for ( dim_t d = 0; d < dfac; d++ ) \
+			PASTEMAC(chp,copys)( kappa_alpha, *(pi1 + mn*dfac + d + k*ldp) ); \
+	} \
 } while(0)
 
 
-#define PACKM_DIAG_BODY_L( ctype, ch, op ) \
-	PACKM_DIAG_BODY( ctype, ch, k+1, cdim, inca_l, lda_l, op )
+#define PACKM_DIAG_BODY_L( ctypea, ctypep, cha, chp, op ) \
+	PACKM_DIAG_BODY( ctypea, ctypep, cha, chp, k+1, cdim, cdim_bcast, inca_l, lda_l, op )
 
-#define PACKM_DIAG_BODY_U( ctype, ch, op ) \
-	PACKM_DIAG_BODY( ctype, ch, 0, k, inca_u, lda_u, op )
+#define PACKM_DIAG_BODY_U( ctypea, ctypep, cha, chp, op ) \
+	PACKM_DIAG_BODY( ctypea, ctypep, cha, chp, 0, k, cdim_bcast, inca_u, lda_u, op )
 
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \
+#undef  GENTFUNC2
+#define GENTFUNC2( ctypea, ctypep, cha, chp, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(cha,chp,opname,arch,suf) \
      ( \
              struc_t struca, \
              diag_t  diaga, \
@@ -65,29 +70,27 @@ void PASTEMAC3(ch,opname,arch,suf) \
              pack_t  schema, \
              bool    invdiag, \
              dim_t   cdim, \
+             dim_t   cdim_max, \
+             dim_t   cdim_bcast, \
              dim_t   n_max, \
        const void*   kappa, \
        const void*   a, inc_t inca, inc_t lda, \
              void*   p,             inc_t ldp, \
+       const void*   params, \
        const cntx_t* cntx  \
      ) \
 { \
-	const num_t dt        = PASTEMAC(ch,type); \
-	const dim_t cdim_max  = bli_cntx_get_blksz_def_dt( dt, mnr0, cntx ); \
-	const dim_t cdim_pack = bli_cntx_get_blksz_max_dt( dt, mnr0, cntx ); \
-	const dim_t dfac      = bli_cntx_get_blksz_def_dt( dt, bb0, cntx ); \
-\
 	/* start by zeroing out the whole block */ \
-	PASTEMAC(ch,set0s_mxn) \
+	PASTEMAC(chp,set0s_mxn) \
 	( \
-	  cdim_pack, \
+	  cdim_max, \
 	  n_max, \
 	  p, 1, ldp  \
 	); \
 \
-	      ctype           kappa_cast = *( ctype* )kappa; \
-	const ctype* restrict alpha1     = a; \
-	      ctype* restrict pi1        = p; \
+	      ctypep           kappa_cast = *( ctypep* )kappa; \
+	const ctypea* restrict alpha1     = a; \
+	      ctypep* restrict pi1        = p; \
 \
 	/* write the strictly lower part if it exists */ \
 	if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \
@@ -103,8 +106,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 				bli_toggle_conj( &conja_l ); \
 		} \
 \
-		if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_L( ctype, ch, scal2js ); \
-		else                          PACKM_DIAG_BODY_L( ctype, ch, scal2s ); \
+		if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_L( ctypea, ctypep, cha, chp, scal2js ); \
+		else                          PACKM_DIAG_BODY_L( ctypea, ctypep, cha, chp, scal2s ); \
 	} \
 \
 	/* write the strictly upper part if it exists */ \
@@ -122,55 +125,66 @@ void PASTEMAC3(ch,opname,arch,suf) \
 				bli_toggle_conj( &conja_u ); \
 		} \
 \
-		if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_U( ctype, ch, scal2js ); \
-		else                          PACKM_DIAG_BODY_U( ctype, ch, scal2s ); \
+		if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_U( ctypea, ctypep, cha, chp, scal2js ); \
+		else                          PACKM_DIAG_BODY_U( ctypea, ctypep, cha, chp, scal2s ); \
 	} \
 \
 	/* write the diagonal */ \
 	if ( bli_is_unit_diag( diaga ) ) \
 	{ \
 		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-		for ( dim_t d = 0; d < dfac; ++d ) \
-			PASTEMAC(ch,set1s)( *(pi1 + mnk*(dfac + ldp) + d) ); \
+		for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+			PASTEMAC(chp,copys)( kappa_cast, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
 	} \
 	else if ( bli_is_hermitian( struca ) ) \
 	{ \
 		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-		for ( dim_t d = 0; d < dfac; ++d ) \
 		{ \
-			ctype mu; \
-			PASTEMAC(ch,copys)( *(alpha1 + mnk*(inca + lda)), mu ); \
-			PASTEMAC(ch,seti0s)( mu ); \
-			PASTEMAC(ch,scal2s)( kappa_cast, mu, *(pi1 + mnk*(dfac + ldp) + d) ); \
+			ctypep alpha_cast, kappa_alpha; \
+			PASTEMAC(cha,chp,copys)( *(alpha1 + mnk*(inca + lda)), alpha_cast ); \
+			PASTEMAC(chp,seti0s)( alpha_cast ); \
+			PASTEMAC(chp,scal2s)( kappa_cast, alpha_cast, kappa_alpha ); \
+			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+				PASTEMAC(chp,copys)( kappa_alpha, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
 		} \
 	} \
 	else if ( bli_is_conj( conja )) \
 	{ \
 		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-		for ( dim_t d = 0; d < dfac; ++d ) \
-			PASTEMAC(ch,scal2js)( kappa_cast, *(alpha1 + mnk*(inca + lda)), *(pi1 + mnk*(dfac + ldp) + d) ); \
+		{ \
+			ctypep alpha_cast, kappa_alpha; \
+			PASTEMAC(cha,chp,copys)( *(alpha1 + mnk*(inca + lda)), alpha_cast ); \
+			PASTEMAC(chp,scal2js)( kappa_cast, alpha_cast, kappa_alpha ); \
+			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+				PASTEMAC(chp,copys)( kappa_alpha, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
+		} \
 	} \
 	else \
 	{ \
 		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-		for ( dim_t d = 0; d < dfac; ++d ) \
-			PASTEMAC(ch,scal2s)( kappa_cast, *(alpha1 + mnk*(inca + lda)), *(pi1 + mnk*(dfac + ldp) + d) ); \
+		{ \
+			ctypep alpha_cast, kappa_alpha; \
+			PASTEMAC(cha,chp,copys)( *(alpha1 + mnk*(inca + lda)), alpha_cast ); \
+			PASTEMAC(chp,scal2s)( kappa_cast, alpha_cast, kappa_alpha ); \
+			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+				PASTEMAC(chp,copys)( kappa_alpha, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
+		} \
 	} \
 \
 	/* invert the diagonal if requested */ \
 	if ( invdiag ) \
 	{ \
 		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-		for ( dim_t d = 0; d < dfac; ++d ) \
-			PASTEMAC(ch,inverts)( *(pi1 + mnk*(dfac + ldp) + d) ); \
+		for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+			PASTEMAC(chp,inverts)( *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
 	} \
 \
 	/* if this an edge case in both directions, extend the diagonal with ones */ \
 	for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \
-	for ( dim_t d = 0; d < dfac; ++d ) \
-		PASTEMAC(ch,set1s)( *(pi1 + mnk*(dfac + ldp) + d) ); \
+	for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+		PASTEMAC(chp,set1s)( *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
 }
 
-INSERT_GENTFUNC_BASIC( packm_mrxmr_diag, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-INSERT_GENTFUNC_BASIC( packm_nrxnr_diag, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC2_BASIC( packm_diag, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC2_MIX_P( packm_diag, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ro_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ro_ref.c
new file mode 100644
index 000000000..bb6fe939e
--- /dev/null
+++ b/ref_kernels/1m/bli_packm_cxc_diag_ro_ref.c
@@ -0,0 +1,198 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+/* PACKM_SET_RO: store val (via the chp_r copys macro) at diagonal position mnk, broadcast slot d, of the real-only panel, i.e. at pi1_r + mnk*cdim_bcast + d + mnk*ldp; pi1_r, cdim_bcast, ldp and d must be in scope at the expansion site. */
+#define PACKM_SET_RO( chp_r, val, mnk ) \
+do { \
+	PASTEMAC(chp_r,copys)( val, *(pi1_r + mnk*cdim_bcast + d + mnk*ldp) ); \
+} while (0)
+
+/* PACKM_SCAL_RO: read element (mn, k) of A as a real/imaginary pair from alpha1, pass it through the cha->chp copyris macro, apply op together with kappa_r/kappa_i, and store only the resulting real part at pi1_r + mn*cdim_bcast + d + k*ldp; alpha1, inca2, lda2, kappa_r, kappa_i, pi1_r, cdim_bcast, ldp and d must be in scope. */
+#define PACKM_SCAL_RO( ctypep_r, cha, chp, chp_r, mn, k, op ) \
+do { \
+	ctypep_r alpha_r, alpha_i, ka_r, ka_i; (void)ka_i; /* ka_i is produced by op but never stored; the cast silences unused warnings */ \
+	PASTEMAC(cha,chp,copyris)( *(alpha1 +  mn       *inca2       + 0 + k*lda2), \
+	                            *(alpha1 +  mn       *inca2       + 1 + k*lda2), \
+	                            alpha_r, alpha_i ); \
+	PASTEMAC(chp,op)( kappa_r, kappa_i, alpha_r, alpha_i, ka_r, ka_i ); \
+	PASTEMAC(chp_r,copys)( ka_r, *(pi1_r  + mn*cdim_bcast  + d + k*ldp) ); \
+} while (0)
+
+/* PACKM_DIAG_RO_BODY: for every column k in [0, cdim) and every row mn in [mn_min, mn_max), pack element (mn, k) cdim_bcast times through PACKM_SCAL_RO with operation op; mn_min/mn_max may refer to the loop variable k, and inca2_lu/lda2_lu are the strides applied to alpha1. */
+#define PACKM_DIAG_RO_BODY( ctypep_r, cha, chp, chp_r, mn_min, mn_max, inca2_lu, lda2_lu, op ) \
+\
+do \
+{ \
+	/* PACKM_SCAL_RO assumes inca2 and lda2 are the strides to use. */ \
+	dim_t inca2 = inca2_lu; \
+	dim_t lda2 = lda2_lu; \
+	for ( dim_t k = 0; k < cdim; k++ ) \
+	for ( dim_t mn = mn_min; mn < mn_max; mn++ ) \
+	for ( dim_t d = 0; d < cdim_bcast; d++ ) \
+		PACKM_SCAL_RO( ctypep_r, cha, chp, chp_r, mn, k, op ); \
+} while(0)
+
+/* Strictly-lower (rows k+1 .. cdim-1) and strictly-upper (rows 0 .. k-1) forms of PACKM_DIAG_RO_BODY; k is the column index declared inside that macro, and inca_l2/lda_l2 or inca_u2/lda_u2 must be in scope at the expansion site. */
+#define PACKM_DIAG_BODY_RO_L( ctypep_r, cha, chp, chp_r, op ) \
+	PACKM_DIAG_RO_BODY( ctypep_r, cha, chp, chp_r, k+1, cdim, inca_l2, lda_l2, op )
+
+#define PACKM_DIAG_BODY_RO_U( ctypep_r, cha, chp, chp_r, op ) \
+	PACKM_DIAG_RO_BODY( ctypep_r, cha, chp, chp_r, 0, k, inca_u2, lda_u2, op )
+
+/* GENTFUNC2R: generates the reference real-only ("ro") packm kernel for a block that intersects the diagonal; a is read as complex values of type ctypea and p receives real values of type ctypep_r. */
+#undef  GENTFUNC2R
+#define GENTFUNC2R( ctypea, ctypea_r, cha, cha_r, ctypep, ctypep_r, chp, chp_r, opname, arch, suf ) \
+/* cdim: stored block dimension; cdim_max: dimension the diagonal is padded out to; cdim_bcast: copies written per element; n_max: padded panel length; kappa: read as a (real, imag) pair of ctypep_r; params and cntx are not referenced in this body. */ \
+void PASTEMAC(cha,chp,opname,arch,suf) \
+     ( \
+             struc_t struca, \
+             diag_t  diaga, \
+             uplo_t  uploa, \
+             conj_t  conja, \
+             pack_t  schema, \
+             bool    invdiag, \
+             dim_t   cdim, \
+             dim_t   cdim_max, \
+             dim_t   cdim_bcast, \
+             dim_t   n_max, \
+       const void*   kappa, \
+       const void*   a, inc_t inca, inc_t lda, \
+             void*   p,             inc_t ldp, \
+       const void*   params, \
+       const cntx_t* cntx \
+     ) \
+{ \
+	const inc_t inca2 = 2 * inca; \
+	const inc_t lda2  = 2 * lda; \
+\
+	      ctypep_r           kappa_r = ( ( ctypep_r* )kappa )[0]; \
+	      ctypep_r           kappa_i = ( ( ctypep_r* )kappa )[1]; \
+	      ctypep_r           one     = *PASTEMAC(chp_r,1); \
+	const ctypea_r* restrict alpha1  = ( const ctypea_r* )a; \
+\
+	/* start by zeroing out the whole block; element mn lands on rows mn*cdim_bcast + d, so the block spans cdim_max*cdim_bcast rows */ \
+	PASTEMAC(chp_r,set0s_mxn) \
+	( \
+	  cdim_max*cdim_bcast, \
+	  n_max, \
+	  ( ctypep_r* )p, 1, ldp  \
+	); \
+\
+	ctypep_r* restrict pi1_r = ( ctypep_r* )p; \
+\
+	/* write the strictly lower part if it exists */ \
+	if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \
+	{ \
+		dim_t  inca_l2 = inca2; \
+		dim_t  lda_l2  = lda2; \
+		conj_t conja_l = conja; \
+\
+		if ( bli_is_upper( uploa ) ) \
+		{ \
+			bli_swap_incs( &inca_l2, &lda_l2 ); \
+			if ( bli_is_hermitian( struca ) ) \
+			    bli_toggle_conj( &conja_l ); \
+		} \
+\
+		if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_RO_L( ctypep_r, cha, chp, chp_r, scal2jris ); \
+		else                          PACKM_DIAG_BODY_RO_L( ctypep_r, cha, chp, chp_r, scal2ris ); \
+	} \
+\
+	/* write the strictly upper part if it exists */ \
+	/* assume either symmetric, hermitian, or triangular */ \
+	if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \
+	{ \
+		dim_t  inca_u2 = inca2; \
+		dim_t  lda_u2  = lda2; \
+		conj_t conja_u = conja; \
+\
+		if ( bli_is_lower( uploa ) ) \
+		{ \
+			bli_swap_incs( &inca_u2, &lda_u2 ); \
+			if ( bli_is_hermitian( struca ) ) \
+			    bli_toggle_conj( &conja_u ); \
+		} \
+\
+		if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_RO_U( ctypep_r, cha, chp, chp_r, scal2jris ); \
+		else                          PACKM_DIAG_BODY_RO_U( ctypep_r, cha, chp, chp_r, scal2ris ); \
+	} \
+\
+	/* write the diagonal */ \
+	if ( bli_is_unit_diag( diaga ) ) \
+	{ \
+		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+		for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+			PACKM_SET_RO( chp_r, kappa_r, mnk ); \
+	} \
+	else if ( bli_is_hermitian( struca ) ) \
+	{ \
+		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+		for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+		{ \
+			ctypep_r alpha_r; \
+			PASTEMAC(cha_r,chp_r,copys)( *(alpha1 + mnk*(inca2 + lda2)), alpha_r ); \
+			PASTEMAC(chp_r,scal2s)( kappa_r, alpha_r, *(pi1_r + mnk*(cdim_bcast + ldp) + d) ); \
+		} \
+	} \
+	else if ( bli_is_conj( conja ) ) \
+	{ \
+		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+		for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+			PACKM_SCAL_RO( ctypep_r, cha, chp, chp_r, mnk, mnk, scal2jris ); \
+	} \
+	else \
+	{ \
+		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+		for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+			PACKM_SCAL_RO( ctypep_r, cha, chp, chp_r, mnk, mnk, scal2ris ); \
+	} \
+\
+	/* invert the diagonal if requested */ \
+	if ( invdiag ) \
+	{ \
+		/* TODO: real-only packing doesn't work for TRSM */ \
+	} \
+\
+	/* if this is an edge case in both directions, extend the diagonal with ones */ \
+	for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \
+	for ( dim_t d = 0; d < cdim_bcast; ++d ) \
+		PACKM_SET_RO( chp_r, one, mnk ); \
+}
+/* Instantiate packm_diag_ro for all four (type of A, packed type) pairings of scomplex and dcomplex: c/c, c/z, z/c and z/z. */
+GENTFUNC2R( scomplex, float,  c, s, scomplex, float,  c, s, packm_diag_ro, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+GENTFUNC2R( scomplex, float,  c, s, dcomplex, double, z, d, packm_diag_ro, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+GENTFUNC2R( dcomplex, double, z, d, scomplex, float,  c, s, packm_diag_ro, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+GENTFUNC2R( dcomplex, double, z, d, dcomplex, double, z, d, packm_diag_ro, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
diff --git a/ref_kernels/1m/bli_packm_cxk_1er_ref.c b/ref_kernels/1m/bli_packm_cxk_1er_ref.c
index e4581a097..5115628cd 100644
--- a/ref_kernels/1m/bli_packm_cxk_1er_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_1er_ref.c
@@ -35,7 +35,7 @@
 #include "blis.h"
 
 
-#define PACKM_1E_BODY( ctype, ch, pragma, cdim, inca2, op ) \
+#define PACKM_1E_BODY( ctypep_r, cha, chp, pragma, cdim, dfac, inca2, op ) \
 \
 do \
 { \
@@ -43,12 +43,15 @@ do \
 	{ \
 		pragma \
 		for ( dim_t mn = 0; mn < cdim; ++mn ) \
-		for ( dim_t d = 0; d < dfac; ++d ) \
 		{ \
-			PASTEMAC(ch,op)(  kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \
-			                                    *(pi1_ri + (mn*2 + 0)*dfac + d), *(pi1_ri + (mn*2 + 1)*dfac + d) ); \
-			PASTEMAC(ch,op)( -kappa_i, kappa_r, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \
-			                                    *(pi1_ir + (mn*2 + 0)*dfac + d), *(pi1_ir + (mn*2 + 1)*dfac + d) ); \
+			ctypep_r alpha_r, alpha_i, ka_r, ka_i; \
+			PASTEMAC(cha,chp,copyris)( *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), alpha_r, alpha_i ); \
+			PASTEMAC(chp,op)( kappa_r, kappa_i, alpha_r, alpha_i, ka_r, ka_i ); \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+			{ \
+				PASTEMAC(chp,copyris)(  ka_r, ka_i, *(pi1_ri + (mn*2 + 0)*dfac + d), *(pi1_ri + (mn*2 + 1)*dfac + d) ); \
+				PASTEMAC(chp,copyris)( -ka_i, ka_r, *(pi1_ir + (mn*2 + 0)*dfac + d), *(pi1_ir + (mn*2 + 1)*dfac + d) ); \
+			} \
 		} \
 \
 		alpha1 += lda2; \
@@ -58,7 +61,7 @@ do \
 } while(0)
 
 
-#define PACKM_1R_BODY( ctype, ch, pragma, cdim, inca2, op ) \
+#define PACKM_1R_BODY( ctypep_r, cha, chp, pragma, cdim, dfac, inca2, op ) \
 \
 do \
 { \
@@ -66,9 +69,13 @@ do \
 	{ \
 		pragma \
 		for ( dim_t mn = 0; mn < cdim; ++mn ) \
-		for ( dim_t d = 0; d < dfac; ++d ) \
-			PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \
-			                                   *(pi1_r + mn*dfac + d), *(pi1_i + mn*dfac + d) ); \
+		{ \
+			ctypep_r alpha_r, alpha_i, ka_r, ka_i; \
+			PASTEMAC(cha,chp,copyris)( *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), alpha_r, alpha_i ); \
+			PASTEMAC(chp,op)( kappa_r, kappa_i, alpha_r, alpha_i, ka_r, ka_i ); \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+				PASTEMAC(chp,copyris)( ka_r, ka_i, *(pi1_r + mn*dfac + d), *(pi1_i + mn*dfac + d) ); \
+		} \
 \
 		alpha1 += lda2; \
 		pi1_r  += ldp2; \
@@ -77,110 +84,137 @@ do \
 } while(0)
 
 
-#undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr0, bb0, arch, suf ) \
+#undef  GENTFUNC2R
+#define GENTFUNC2R( ctypea, ctypea_r, cha, cha_r, ctypep, ctypep_r, chp, chp_r, opname, arch, suf ) /* 1e/1r reference packm kernel template: the "a" names describe the source a, the "p" names the packed buffer p, and the _r names their real component types */ \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(cha,chp,opname,arch,suf) \
      ( \
              conj_t  conja, \
              pack_t  schema, \
              dim_t   cdim, \
+             dim_t   cdim_max, /* padded extent of cdim; only consumed by the set0s_edge calls below */ \
+             dim_t   cdim_bcast, /* how many consecutive copies of each value are written (dfac in the body macros) */ \
              dim_t   n, \
              dim_t   n_max, \
        const void*   kappa, \
        const void*   a, inc_t inca, inc_t lda, \
              void*   p,             inc_t ldp, \
+       const void*   params, /* not referenced by this kernel */ \
        const cntx_t* cntx  \
      ) \
 { \
-	const dim_t dfac = PASTECH2(bb0, _, chr); \
-	const num_t dt_r = PASTEMAC(chr,type); \
+	const dim_t mr  = PASTECH(BLIS_MR_, chp_r); /* MR/NR/BBM/BBN below are looked up for the real packed type chp_r */ \
+	const dim_t nr  = PASTECH(BLIS_NR_, chp_r); \
+	const dim_t bbm = PASTECH(BLIS_BBM_, chp_r); \
+	const dim_t bbn = PASTECH(BLIS_BBN_, chp_r); \
 \
 	if ( bli_is_1e_packed( schema ) ) \
 	{ \
-		/* cdim and mnr are in units of complex values */ \
-		const dim_t mnr      = PASTECH2(mnr0, _, chr) == -1 ? -1 : PASTECH2(mnr0, _, chr) / 2; \
-		const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ) / 2; \
-\
-		const inc_t       inca2   = 2 * inca; \
-		const inc_t       lda2    = 2 * lda; \
-		const inc_t       ldp2    = 2 * ldp; \
+		const dim_t cdim2 = 2 * cdim; /* cdim is doubled for the mr/nr comparisons; the fast paths then pass mr/2 or nr/2 as the trip count */ \
+		const inc_t inca2 = 2 * inca; \
+		const inc_t lda2  = 2 * lda; \
+		const inc_t ldp2  = 2 * ldp; \
 \
-		      ctype_r           kappa_r = ( ( ctype_r* )kappa )[0]; \
-		      ctype_r           kappa_i = ( ( ctype_r* )kappa )[1]; \
-		const ctype_r* restrict alpha1  = ( ctype_r* )a; \
-		      ctype_r* restrict pi1_ri  = ( ctype_r* )p; \
-		      ctype_r* restrict pi1_ir  = ( ctype_r* )p + ldp; \
+		      ctypep_r           kappa_r = ( ( ctypep_r* )kappa )[0]; /* kappa is read as a (real, imaginary) pair in the packed precision */ \
+		      ctypep_r           kappa_i = ( ( ctypep_r* )kappa )[1]; \
+		const ctypea_r* restrict alpha1  = ( ctypea_r* )a; \
+		      ctypep_r* restrict pi1_ri  = ( ctypep_r* )p; \
+		      ctypep_r* restrict pi1_ir  = ( ctypep_r* )p + ldp; /* second output stream starts ldp real elements after the first */ \
 \
-		if ( cdim == mnr && mnr != -1 ) \
+		if ( cdim2 == mr && cdim_bcast == bbm && mr != -1 ) /* panel matches MR and BBM: constant bounds plus PRAGMA_SIMD */ \
+		{ \
+			if ( inca == 1 ) \
+			{ \
+				if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctypep_r, cha, chp, PRAGMA_SIMD, mr/2, bbm, 2, scal2jris ); \
+				else                        PACKM_1E_BODY( ctypep_r, cha, chp, PRAGMA_SIMD, mr/2, bbm, 2, scal2ris ); \
+			} \
+			else \
+			{ \
+				if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctypep_r, cha, chp, PRAGMA_SIMD, mr/2, bbm, inca2, scal2jris ); \
+				else                        PACKM_1E_BODY( ctypep_r, cha, chp, PRAGMA_SIMD, mr/2, bbm, inca2, scal2ris ); \
+			} \
+		} \
+		else if ( cdim2 == nr && cdim_bcast == bbn && nr != -1 ) /* panel matches NR and BBN: same specialization with nr/bbn */ \
 		{ \
 			if ( inca == 1 ) \
 			{ \
-				if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2jris ); \
-				else                        PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2ris ); \
+				if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctypep_r, cha, chp, PRAGMA_SIMD, nr/2, bbn, 2, scal2jris ); \
+				else                        PACKM_1E_BODY( ctypep_r, cha, chp, PRAGMA_SIMD, nr/2, bbn, 2, scal2ris ); \
 			} \
 			else \
 			{ \
-				if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2jris ); \
-				else                        PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2ris ); \
+				if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctypep_r, cha, chp, PRAGMA_SIMD, nr/2, bbn, inca2, scal2jris ); \
+				else                        PACKM_1E_BODY( ctypep_r, cha, chp, PRAGMA_SIMD, nr/2, bbn, inca2, scal2ris ); \
 			} \
 		} \
 		else \
 		{ \
-			if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctype, ch, , cdim, inca2, scal2jris ); \
-			else                        PACKM_1E_BODY( ctype, ch, , cdim, inca2, scal2ris ); \
+			if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctypep_r, cha, chp, , cdim, cdim_bcast, inca2, scal2jris ); /* generic path: runtime bounds, empty pragma argument */ \
+			else                        PACKM_1E_BODY( ctypep_r, cha, chp, , cdim, cdim_bcast, inca2, scal2ris ); \
 		} \
 \
-		PASTEMAC(chr,set0s_edge) \
+		PASTEMAC(chp_r,set0s_edge) /* NOTE(review): presumably zero-fills the padding beyond the packed extents - confirm against set0s_edge */ \
 		( \
-		  2*cdim*dfac, 2*cdim_max*dfac, \
+		  cdim2*cdim_bcast, 2*cdim_max*cdim_bcast, \
 		  2*n, 2*n_max, \
-		  ( ctype_r* )p, ldp  \
+		  ( ctypep_r* )p, ldp  \
 		); \
 	} \
 	else /* ( bli_is_1r_packed( schema ) ) */ \
 	{ \
-		const dim_t mnr      = PASTECH2(mnr0, _, chr); \
-		const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ); \
-\
-		const inc_t       inca2   = 2 * inca; \
-		const inc_t       lda2    = 2 * lda; \
-		const inc_t       ldp2    = 2 * ldp; \
+		const inc_t inca2 = 2 * inca; \
+		const inc_t lda2  = 2 * lda; \
+		const inc_t ldp2  = 2 * ldp; \
 \
-		      ctype_r           kappa_r = ( ( ctype_r* )kappa )[0]; \
-		      ctype_r           kappa_i = ( ( ctype_r* )kappa )[1]; \
-		const ctype_r* restrict alpha1  = ( ctype_r* )a; \
-		      ctype_r* restrict pi1_r   = ( ctype_r* )p; \
-		      ctype_r* restrict pi1_i   = ( ctype_r* )p + ldp; \
+		      ctypep_r           kappa_r = ( ( ctypep_r* )kappa )[0]; \
+		      ctypep_r           kappa_i = ( ( ctypep_r* )kappa )[1]; \
+		const ctypea_r* restrict alpha1  = ( ctypea_r* )a; \
+		      ctypep_r* restrict pi1_r   = ( ctypep_r* )p; \
+		      ctypep_r* restrict pi1_i   = ( ctypep_r* )p + ldp; /* imaginary parts are written ldp real elements after the real parts */ \
 \
-		if ( cdim == mnr && mnr != -1 ) \
+		if ( cdim == mr && cdim_bcast == bbm && mr != -1 ) /* unlike the 1e branch, cdim itself (not 2*cdim) is compared against mr */ \
+		{ \
+			if ( inca == 1 ) \
+			{ \
+				if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctypep_r, cha, chp, PRAGMA_SIMD, mr, bbm, 2, scal2jris ); \
+				else                        PACKM_1R_BODY( ctypep_r, cha, chp, PRAGMA_SIMD, mr, bbm, 2, scal2ris ); \
+			} \
+			else \
+			{ \
+				if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctypep_r, cha, chp, PRAGMA_SIMD, mr, bbm, inca2, scal2jris ); \
+				else                        PACKM_1R_BODY( ctypep_r, cha, chp, PRAGMA_SIMD, mr, bbm, inca2, scal2ris ); \
+			} \
+		} \
+		else if ( cdim == nr && cdim_bcast == bbn && nr != -1 ) /* same specialization for an NR/BBN panel */ \
 		{ \
 			if ( inca == 1 ) \
 			{ \
-				if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2jris ); \
-				else                        PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2ris ); \
+				if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctypep_r, cha, chp, PRAGMA_SIMD, nr, bbn, 2, scal2jris ); \
+				else                        PACKM_1R_BODY( ctypep_r, cha, chp, PRAGMA_SIMD, nr, bbn, 2, scal2ris ); \
 			} \
 			else \
 			{ \
-				if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2jris ); \
-				else                        PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2ris ); \
+				if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctypep_r, cha, chp, PRAGMA_SIMD, nr, bbn, inca2, scal2jris ); \
+				else                        PACKM_1R_BODY( ctypep_r, cha, chp, PRAGMA_SIMD, nr, bbn, inca2, scal2ris ); \
 			} \
 		} \
 		else \
 		{ \
-			if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, , cdim, inca2, scal2jris ); \
-			else                        PACKM_1R_BODY( ctype, ch, , cdim, inca2, scal2ris ); \
+			if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctypep_r, cha, chp, , cdim, cdim_bcast, inca2, scal2jris ); /* generic path: runtime bounds, empty pragma argument */ \
+			else                        PACKM_1R_BODY( ctypep_r, cha, chp, , cdim, cdim_bcast, inca2, scal2ris ); \
 		} \
 \
-		PASTEMAC(chr,set0s_edge) \
+		PASTEMAC(chp_r,set0s_edge) /* NOTE(review): presumably zero-fills the padding beyond the packed extents - confirm against set0s_edge */ \
 		( \
-		  cdim*dfac, cdim_max*dfac, \
+		  cdim*cdim_bcast, cdim_max*cdim_bcast, \
 		  2*n, 2*n_max, \
-		  ( ctype_r* )p, ldp  \
+		  ( ctypep_r* )p, ldp  \
 		); \
 	} \
 }
 
-INSERT_GENTFUNCCO( packm_mrxk_1er, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-INSERT_GENTFUNCCO( packm_nrxk_1er, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+GENTFUNC2R( scomplex, float,  c, s, scomplex, float,  c, s, packm_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+GENTFUNC2R( scomplex, float,  c, s, dcomplex, double, z, d, packm_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+GENTFUNC2R( dcomplex, double, z, d, scomplex, float,  c, s, packm_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+GENTFUNC2R( dcomplex, double, z, d, dcomplex, double, z, d, packm_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c
index ce7f8740b..5cca515ae 100644
--- a/ref_kernels/1m/bli_packm_cxk_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_ref.c
@@ -35,7 +35,7 @@
 #include "blis.h"
 
 
-#define PACKM_BODY( ctype, ch, pragma, cdim, inca, op ) \
+#define PACKM_BODY( ctypea, ctypep, cha, chp, pragma, cdim, dfac, inca, op ) \
 \
 do \
 { \
@@ -43,8 +43,13 @@ do \
 	{ \
 		pragma \
 		for ( dim_t mn = 0; mn < cdim; mn++ ) \
-		for ( dim_t d = 0; d < dfac; d++ ) \
-			PASTEMAC(ch,op)( kappa_cast, *(alpha1 + mn*inca), *(pi1 + mn*dfac + d) ); \
+		{ \
+			ctypep alpha_cast, kappa_alpha; \
+			PASTEMAC(cha,chp,copys)( *(alpha1 + mn*inca), alpha_cast ); \
+			PASTEMAC(chp,op)( kappa_cast, alpha_cast, kappa_alpha ); \
+			for ( dim_t d = 0; d < dfac; d++ ) \
+				PASTEMAC(chp,copys)( kappa_alpha, *(pi1 + mn*dfac + d) ); \
+		} \
 \
 		alpha1 += lda; \
 		pi1    += ldp; \
@@ -52,58 +57,74 @@ do \
 } while(0)
 
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \
+#undef  GENTFUNC2
+#define GENTFUNC2( ctypea, ctypep, cha, chp, opname, arch, suf ) /* reference packm kernel template: ctypea/cha describe the source a, ctypep/chp the packed buffer p */ \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(cha,chp,opname,arch,suf) \
      ( \
              conj_t  conja, \
              pack_t  schema, \
              dim_t   cdim, \
+             dim_t   cdim_max, /* padded extent of cdim; only consumed by the set0s_edge call below */ \
+             dim_t   cdim_bcast, /* how many consecutive copies of each value are written (dfac in PACKM_BODY) */ \
              dim_t   n, \
              dim_t   n_max, \
        const void*   kappa, \
        const void*   a, inc_t inca, inc_t lda, \
              void*   p,             inc_t ldp, \
+       const void*   params, /* not referenced by this kernel */ \
        const cntx_t* cntx  \
      ) \
 { \
-	const dim_t     mnr        = PASTECH2(mnr0, _, ch); \
-	const num_t     dt         = PASTEMAC(ch,type); \
-	const dim_t     cdim_max   = bli_cntx_get_blksz_def_dt( dt, mnr0, cntx ); \
-	const dim_t     dfac       = PASTECH2(bb0, _, ch); \
+	const dim_t mr  = PASTECH(BLIS_MR_, chp); /* MR/NR/BBM/BBN are looked up for the packed type chp; a value of -1 disables the matching fast path */ \
+	const dim_t nr  = PASTECH(BLIS_NR_, chp); \
+	const dim_t bbm = PASTECH(BLIS_BBM_, chp); \
+	const dim_t bbn = PASTECH(BLIS_BBN_, chp); \
 \
-	      ctype           kappa_cast = *( ctype* )kappa; \
-	const ctype* restrict alpha1     = a; \
-	      ctype* restrict pi1        = p; \
+	      ctypep           kappa_cast = *( ctypep* )kappa; /* kappa is read in the packed type, not the source type */ \
+	const ctypea* restrict alpha1     = a; \
+	      ctypep* restrict pi1        = p; \
 \
-	if ( cdim == mnr && mnr != -1 ) \
+	if ( cdim == mr && cdim_bcast == bbm && mr != -1 ) /* panel matches MR and BBM: constant bounds plus PRAGMA_SIMD */ \
+	{ \
+		if ( inca == 1 ) \
+		{ \
+			if ( bli_is_conj( conja ) ) PACKM_BODY( ctypea, ctypep, cha, chp, PRAGMA_SIMD, mr, bbm, 1, scal2js ); \
+			else                        PACKM_BODY( ctypea, ctypep, cha, chp, PRAGMA_SIMD, mr, bbm, 1, scal2s ); \
+		} \
+		else \
+		{ \
+			if ( bli_is_conj( conja ) ) PACKM_BODY( ctypea, ctypep, cha, chp, PRAGMA_SIMD, mr, bbm, inca, scal2js ); \
+			else                        PACKM_BODY( ctypea, ctypep, cha, chp, PRAGMA_SIMD, mr, bbm, inca, scal2s ); \
+		} \
+	} \
+	else if ( cdim == nr && cdim_bcast == bbn && nr != -1 ) /* same specialization for an NR/BBN panel */ \
 	{ \
 		if ( inca == 1 ) \
 		{ \
-			if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2js ); \
-			else                        PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2s ); \
+			if ( bli_is_conj( conja ) ) PACKM_BODY( ctypea, ctypep, cha, chp, PRAGMA_SIMD, nr, bbn, 1, scal2js ); \
+			else                        PACKM_BODY( ctypea, ctypep, cha, chp, PRAGMA_SIMD, nr, bbn, 1, scal2s ); \
 		} \
 		else \
 		{ \
-			if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2js ); \
-			else                        PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2s ); \
+			if ( bli_is_conj( conja ) ) PACKM_BODY( ctypea, ctypep, cha, chp, PRAGMA_SIMD, nr, bbn, inca, scal2js ); \
+			else                        PACKM_BODY( ctypea, ctypep, cha, chp, PRAGMA_SIMD, nr, bbn, inca, scal2s ); \
 		} \
 	} \
-	else /* if ( cdim < mnr ) */ \
+	else /* generic path: any other cdim/cdim_bcast combination, runtime bounds, empty pragma argument */ \
 	{ \
-		if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, , cdim, inca, scal2js ); \
-		else                        PACKM_BODY( ctype, ch, , cdim, inca, scal2s ); \
+		if ( bli_is_conj( conja ) ) PACKM_BODY( ctypea, ctypep, cha, chp, , cdim, cdim_bcast, inca, scal2js ); \
+		else                        PACKM_BODY( ctypea, ctypep, cha, chp, , cdim, cdim_bcast, inca, scal2s ); \
 	} \
 \
-	PASTEMAC(ch,set0s_edge) \
+	PASTEMAC(chp,set0s_edge) /* NOTE(review): presumably zero-fills the padding beyond cdim*cdim_bcast and n - confirm against set0s_edge */ \
 	( \
-	  cdim*dfac, cdim_max*dfac, \
+	  cdim*cdim_bcast, cdim_max*cdim_bcast, \
 	  n, n_max, \
 	  p, ldp  \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC( packm_mrxk, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-INSERT_GENTFUNC_BASIC( packm_nrxk, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC2_BASIC( packm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC2_MIX_P( packm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/1m/bli_packm_cxk_ro_ref.c b/ref_kernels/1m/bli_packm_cxk_ro_ref.c
new file mode 100644
index 000000000..a8165351d
--- /dev/null
+++ b/ref_kernels/1m/bli_packm_cxk_ro_ref.c
@@ -0,0 +1,137 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+#define PACKM_RO_BODY( ctypep_r, cha, chp, chp_r, pragma, cdim, dfac, inca2, op ) \
+/* Real-only packing loop: for each of the n slices, every one of the cdim source elements is copied into ctypep_r locals, combined with kappa by op, and only the real result is stored, dfac times in a row. Uses n, alpha1, pi1_r, kappa_r, kappa_i, lda2 and ldp from the expansion site. */ \
+do \
+{ \
+	for ( dim_t k = n; k != 0; --k ) \
+	{ \
+		pragma \
+		for ( dim_t mn = 0; mn < cdim; ++mn ) \
+		{ \
+			ctypep_r alpha_r, alpha_i, ka_r, ka_i; \
+			( void )ka_i; /* ka_i is written by op below but never stored to pi1_r */ \
+			PASTEMAC(cha,chp,copyris)( *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), alpha_r, alpha_i ); /* offset +0 is the real part, +1 the imaginary part */ \
+			PASTEMAC(chp,op)( kappa_r, kappa_i, alpha_r, alpha_i, ka_r, ka_i ); \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+				PASTEMAC(chp_r,copys)( ka_r, *(pi1_r + mn*dfac + d) ); \
+		} \
+		/* step to the next slice: lda2 in the source, ldp in the packed buffer */ \
+		alpha1 += lda2; \
+		pi1_r  += ldp; \
+	} \
+} while(0)
+
+
+#undef  GENTFUNC2R
+#define GENTFUNC2R( ctypea, ctypea_r, cha, cha_r, ctypep, ctypep_r, chp, chp_r, opname, arch, suf ) \
+/* Real-only ("ro") reference packm kernel template: reads complex elements from a, applies kappa with scal2ris (or scal2jris when conja is conjugating), and stores just the real results into p as ctypep_r values. schema, params and cntx are accepted but not referenced. */ \
+void PASTEMAC(cha,chp,opname,arch,suf) \
+     ( \
+             conj_t  conja, \
+             pack_t  schema, \
+             dim_t   cdim, \
+             dim_t   cdim_max, \
+             dim_t   cdim_bcast, \
+             dim_t   n, \
+             dim_t   n_max, \
+       const void*   kappa, \
+       const void*   a, inc_t inca, inc_t lda, \
+             void*   p,             inc_t ldp, \
+       const void*   params, \
+       const cntx_t* cntx  \
+     ) \
+{ \
+	const dim_t mr  = PASTECH(BLIS_MR_, chp_r); \
+	const dim_t nr  = PASTECH(BLIS_NR_, chp_r); \
+	const dim_t bbm = PASTECH(BLIS_BBM_, chp_r); \
+	const dim_t bbn = PASTECH(BLIS_BBN_, chp_r); \
+	/* a is walked through a real-typed pointer as (real, imaginary) pairs, so its strides are doubled; p holds real values only and keeps ldp unchanged. */ \
+	const inc_t inca2 = 2 * inca; \
+	const inc_t lda2  = 2 * lda; \
+\
+	      ctypep_r           kappa_r = ( ( ctypep_r* )kappa )[0]; \
+	      ctypep_r           kappa_i = ( ( ctypep_r* )kappa )[1]; \
+	const ctypea_r* restrict alpha1  = ( ctypea_r* )a; \
+	      ctypep_r* restrict pi1_r   = ( ctypep_r* )p; \
+	/* Fast paths: when cdim/cdim_bcast equal MR/BBM or NR/BBN of the real packed type (and that constant is not -1), the body gets constant bounds and PRAGMA_SIMD; unit inca additionally gets a literal element stride of 2. */ \
+	if ( cdim == mr && cdim_bcast == bbm && mr != -1 ) \
+	{ \
+		if ( inca == 1 ) \
+		{ \
+			if ( bli_is_conj( conja ) ) PACKM_RO_BODY( ctypep_r, cha, chp, chp_r, PRAGMA_SIMD, mr, bbm, 2, scal2jris ); \
+			else                        PACKM_RO_BODY( ctypep_r, cha, chp, chp_r, PRAGMA_SIMD, mr, bbm, 2, scal2ris ); \
+		} \
+		else \
+		{ \
+			if ( bli_is_conj( conja ) ) PACKM_RO_BODY( ctypep_r, cha, chp, chp_r, PRAGMA_SIMD, mr, bbm, inca2, scal2jris ); \
+			else                        PACKM_RO_BODY( ctypep_r, cha, chp, chp_r, PRAGMA_SIMD, mr, bbm, inca2, scal2ris ); \
+		} \
+	} \
+	else if ( cdim == nr && cdim_bcast == bbn && nr != -1 ) \
+	{ \
+		if ( inca == 1 ) \
+		{ \
+			if ( bli_is_conj( conja ) ) PACKM_RO_BODY( ctypep_r, cha, chp, chp_r, PRAGMA_SIMD, nr, bbn, 2, scal2jris ); \
+			else                        PACKM_RO_BODY( ctypep_r, cha, chp, chp_r, PRAGMA_SIMD, nr, bbn, 2, scal2ris ); \
+		} \
+		else \
+		{ \
+			if ( bli_is_conj( conja ) ) PACKM_RO_BODY( ctypep_r, cha, chp, chp_r, PRAGMA_SIMD, nr, bbn, inca2, scal2jris ); \
+			else                        PACKM_RO_BODY( ctypep_r, cha, chp, chp_r, PRAGMA_SIMD, nr, bbn, inca2, scal2ris ); \
+		} \
+	} \
+	else /* generic path: runtime bounds and an empty pragma argument */ \
+	{ \
+		if ( bli_is_conj( conja ) ) PACKM_RO_BODY( ctypep_r, cha, chp, chp_r, , cdim, cdim_bcast, inca2, scal2jris ); \
+		else                        PACKM_RO_BODY( ctypep_r, cha, chp, chp_r, , cdim, cdim_bcast, inca2, scal2ris ); \
+	} \
+	/* NOTE(review): presumably zero-fills the padding between cdim*cdim_bcast and cdim_max*cdim_bcast and between n and n_max - confirm against set0s_edge. */ \
+	PASTEMAC(chp_r,set0s_edge) \
+	( \
+	  cdim*cdim_bcast, cdim_max*cdim_bcast, \
+	  n, n_max, \
+	  ( ctypep_r* )p, ldp  \
+	); \
+}
+
+GENTFUNC2R( scomplex, float,  c, s, scomplex, float,  c, s, packm_ro, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+GENTFUNC2R( scomplex, float,  c, s, dcomplex, double, z, d, packm_ro, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+GENTFUNC2R( dcomplex, double, z, d, scomplex, float,  c, s, packm_ro, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+GENTFUNC2R( dcomplex, double, z, d, dcomplex, double, z, d, packm_ro, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+
diff --git a/ref_kernels/1m/bli_unpackm_cxk_ref.c b/ref_kernels/1m/bli_unpackm_cxk_ref.c
index 9d86467ba..071f5c4ab 100644
--- a/ref_kernels/1m/bli_unpackm_cxk_ref.c
+++ b/ref_kernels/1m/bli_unpackm_cxk_ref.c
@@ -35,7 +35,7 @@
 #include "blis.h"
 
 
-#define UNPACKM_BODY( ctype, ch, pragma, cdim, inca, op ) \
+#define UNPACKM_BODY( ctypep, ctypea, chp, cha, pragma, cdim, dfac, inca, op ) \
 \
 do \
 { \
@@ -43,7 +43,11 @@ do \
 	{ \
 		pragma \
 		for ( dim_t mn = 0; mn < cdim; mn++ ) \
-			PASTEMAC(ch,op)( *kappa_cast, *(pi1 + mn*dfac), *(alpha1 + mn*inca) ); \
+		{ \
+			ctypep kappa_pi; \
+			PASTEMAC(chp,op)( *kappa_cast, *(pi1 + mn*dfac), kappa_pi ); \
+			PASTEMAC(chp,cha,copys)( kappa_pi, *(alpha1 + mn*inca) ); \
+		} \
 \
 		alpha1 += lda; \
 		pi1    += ldp; \
@@ -51,49 +55,66 @@ do \
 } while(0)
 
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \
+#undef  GENTFUNC2
+#define GENTFUNC2( ctypep, ctypea, chp, cha, opname, arch, suf ) /* reference unpackm kernel template: reads the packed buffer p (ctypep) and writes the matrix a (ctypea) */ \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(chp,cha,opname,arch,suf) \
      ( \
              conj_t  conja, \
              pack_t  schema, \
              dim_t   cdim, \
+			 dim_t   cdim_bcast, /* spacing between successive elements in p (dfac in UNPACKM_BODY) */ \
              dim_t   n, \
        const void*   kappa, \
        const void*   p,             inc_t ldp, \
              void*   a, inc_t inca, inc_t lda, \
+       const void*   params, /* not referenced by this kernel */ \
        const cntx_t* cntx  \
      ) \
 { \
-	const dim_t     mnr        = PASTECH2(mnr0, _, ch); \
+	const dim_t mr  = PASTECH(BLIS_MR_, chp); /* MR/NR/BBM/BBN are looked up for the packed type chp; a value of -1 disables the matching fast path */ \
+	const dim_t nr  = PASTECH(BLIS_NR_, chp); \
     /* It's not clear if unpack needs to care about BB storage... */ \
-	const dim_t     dfac       = PASTECH2(bb0, _, ch); \
+	const dim_t bbm = PASTECH(BLIS_BBM_, chp); \
+	const dim_t bbn = PASTECH(BLIS_BBN_, chp); \
 \
-	const ctype* restrict kappa_cast = kappa; \
-	const ctype* restrict pi1        = p; \
-	      ctype* restrict alpha1     = a; \
+	const ctypep* restrict kappa_cast = kappa; /* kappa is interpreted in the packed type and dereferenced inside UNPACKM_BODY */ \
+	const ctypep* restrict pi1        = p; \
+	      ctypea* restrict alpha1     = a; \
 \
-	if ( cdim == mnr && mnr != -1 ) \
+	if ( cdim == mr && cdim_bcast == bbm && mr != -1 ) /* panel matches MR and BBM: constant bounds plus PRAGMA_SIMD */ \
+	{ \
+		if ( inca == 1 ) \
+		{ \
+			if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctypep, ctypea, chp, cha, PRAGMA_SIMD, mr, bbm, 1, scal2js ); \
+			else                        UNPACKM_BODY( ctypep, ctypea, chp, cha, PRAGMA_SIMD, mr, bbm, 1, scal2s ); \
+		} \
+		else \
+		{ \
+			if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctypep, ctypea, chp, cha, PRAGMA_SIMD, mr, bbm, inca, scal2js ); \
+			else                        UNPACKM_BODY( ctypep, ctypea, chp, cha, PRAGMA_SIMD, mr, bbm, inca, scal2s ); \
+		} \
+	} \
+	else if ( cdim == nr && cdim_bcast == bbn && nr != -1 ) /* same specialization for an NR/BBN panel */ \
 	{ \
 		if ( inca == 1 ) \
 		{ \
-			if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2js ); \
-			else                        UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2s ); \
+			if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctypep, ctypea, chp, cha, PRAGMA_SIMD, nr, bbn, 1, scal2js ); \
+			else                        UNPACKM_BODY( ctypep, ctypea, chp, cha, PRAGMA_SIMD, nr, bbn, 1, scal2s ); \
 		} \
 		else \
 		{ \
-			if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2js ); \
-			else                        UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2s ); \
+			if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctypep, ctypea, chp, cha, PRAGMA_SIMD, nr, bbn, inca, scal2js ); \
+			else                        UNPACKM_BODY( ctypep, ctypea, chp, cha, PRAGMA_SIMD, nr, bbn, inca, scal2s ); \
 		} \
 	} \
-	else /* if ( cdim < mnr ) */ \
+	else /* generic path: any other cdim/cdim_bcast combination, runtime bounds, empty pragma argument */ \
 	{ \
-			if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctype, ch, , cdim, inca, scal2js ); \
-			else                        UNPACKM_BODY( ctype, ch, , cdim, inca, scal2s ); \
+		if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctypep, ctypea, chp, cha, , cdim, cdim_bcast, inca, scal2js ); \
+		else                        UNPACKM_BODY( ctypep, ctypea, chp, cha, , cdim, cdim_bcast, inca, scal2s ); \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC( unpackm_mrxk, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
-INSERT_GENTFUNC_BASIC( unpackm_nrxk, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC2_BASIC( unpackm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC2_MIX_P( unpackm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c
index 74c596d39..ab861bcb5 100644
--- a/ref_kernels/3/bli_gemm_ref.c
+++ b/ref_kernels/3/bli_gemm_ref.c
@@ -40,7 +40,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-static void PASTEMAC3(ch,opname,arch,suf) \
+static void PASTEMAC(ch,ch,opname,arch,suf) \
      ( \
              dim_t      m, \
              dim_t      n, \
@@ -50,7 +50,7 @@ static void PASTEMAC3(ch,opname,arch,suf) \
        const void*      b0, \
        const void*      beta0, \
              void*      c0, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
@@ -149,7 +149,7 @@ INSERT_GENTFUNC_BASIC( gemm_gen, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,ch,opname,arch,suf) \
      ( \
              dim_t      m, \
              dim_t      n, \
@@ -159,7 +159,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        const void*      b0, \
        const void*      beta0, \
              void*      c0, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
@@ -178,7 +178,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	   purpose implementation instead. */ \
 	if ( mr == -1 || nr == -1 ) \
 	{ \
-		PASTEMAC3(ch,gemm_gen,arch,suf) \
+		PASTEMAC(ch,ch,gemm_gen,arch,suf) \
 		( \
 		  m, \
 		  n, \
@@ -299,4 +299,71 @@ void PASTEMAC3(ch,opname,arch,suf) \
 
 INSERT_GENTFUNC_BASIC( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
+// Mixed-precision implementation (does not handle mixed-domain cases)
+
+#undef  GENTFUNC2
+#define GENTFUNC2( ctype_ab, ctype_c, chab, chc, opname, arch, suf ) \
+/* Mixed-precision gemm wrapper template: runs the microkernel stored in the auxinfo params on a full mr x nr tile into the local buffer ct (elements of ctype_ab), then combines the leading m x n part of ct with c (elements of ctype_c) using beta. */ \
+void PASTEMAC(chab,chc,opname,arch,suf) \
+     ( \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const void*      alpha, \
+       const void*      a, \
+       const void*      b, \
+       const void*      beta0, \
+             void*      c0, inc_t rs_c, inc_t cs_c, \
+       const auxinfo_t* auxinfo, \
+       const cntx_t*    cntx  \
+     ) \
+{ \
+	const ctype_c*    beta      = beta0; \
+	      ctype_c*    c         = c0; \
+	/* The params object carried by auxinfo supplies the inner microkernel, its row/column preference, its own params, and mr/nr. */ \
+	const cntl_t*     params    = bli_auxinfo_params( auxinfo ); \
+\
+	const gemm_ukr_ft rgemm_ukr = bli_gemm_var_cntl_real_ukr( params ); \
+	const bool        row_pref  = bli_gemm_var_cntl_row_pref( params ); \
+	const void*       params_r  = bli_gemm_var_cntl_real_params( params ); \
+\
+	const dim_t       mr        = bli_gemm_var_cntl_mr( params ); \
+	const dim_t       nr        = bli_gemm_var_cntl_nr( params ); \
+	/* Temporary tile of BLIS_STACK_BUF_MAX_SIZE bytes, laid out with unit stride along the inner kernel's preferred direction. NOTE(review): nothing here checks that mr*nr elements fit - presumably validated elsewhere, confirm. */ \
+	      ctype_ab    ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( ctype_ab ) ] \
+	                  __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const inc_t       rs_ct     = row_pref ? nr : 1; \
+	const inc_t       cs_ct     = row_pref ? 1 : mr; \
+\
+	const ctype_ab*   zero      = PASTEMAC(chab,0); \
+	/* The inner kernel receives a private copy of auxinfo whose params field is replaced by params_r. */ \
+	auxinfo_t auxinfo_r = *auxinfo; \
+	bli_auxinfo_set_params( params_r, &auxinfo_r ); \
+\
+	/* ct = alpha * a * b; zero is passed as beta and the full mr x nr tile is requested regardless of m and n. */ \
+	rgemm_ukr \
+	( \
+	  mr, \
+	  nr, \
+	  k, \
+	  alpha, \
+	  a, \
+	  b, \
+	  zero, \
+	  ct, rs_ct, cs_ct, \
+	  &auxinfo_r, \
+	  cntx  \
+	); \
+	/* Only the m x n portion of ct is combined into c. NOTE(review): c = ct + beta * c is assumed from the xpbys name - confirm against xpbys_mxn. */ \
+	PASTEMAC(chab,chc,chc,xpbys_mxn) \
+	( \
+	  m, n, \
+	  ct, rs_ct, cs_ct, \
+	  beta, \
+	  c, rs_c, cs_c \
+	); \
+}
+
+INSERT_GENTFUNC2_MIX_P( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+
 
diff --git a/ref_kernels/3/bli_gemmsup_ref.c b/ref_kernels/3/bli_gemmsup_ref.c
index 75c506167..9dab9e092 100644
--- a/ref_kernels/3/bli_gemmsup_ref.c
+++ b/ref_kernels/3/bli_gemmsup_ref.c
@@ -41,7 +41,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t     conja, \
              conj_t     conjb, \
@@ -53,7 +53,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        const void*      b0, inc_t rs_b, inc_t cs_b, \
        const void*      beta0, \
              void*      c0, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
@@ -252,7 +252,7 @@ INSERT_GENTFUNC_BASIC( gemmsup_r, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              conj_t     conja, \
              conj_t     conjb, \
@@ -264,7 +264,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        const void*      b0, inc_t rs_b, inc_t cs_b, \
        const void*      beta0, \
              void*      c0, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c
index 7fa56457b..e1b00e358 100644
--- a/ref_kernels/3/bli_gemmtrsm_ref.c
+++ b/ref_kernels/3/bli_gemmtrsm_ref.c
@@ -40,7 +40,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf, trsmkerid ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
              dim_t      m, \
              dim_t      n, \
@@ -51,7 +51,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        const void*      bx10, \
              void*      b110, \
              void*      c110, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
@@ -99,7 +99,7 @@ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \
 	   circumstances where we would want the gemmtrsm_? operations to have
 	   and exercise their own IO preferences -- I'd have to think about it --
 	   but this doesn't seem to be one of them. */ \
-	const bool      col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
+	const bool      col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	const inc_t     rs_ct    = ( col_pref ? 1 : nr ); \
 	const inc_t     cs_ct    = ( col_pref ? mr : 1 ); \
 \
diff --git a/ref_kernels/3/bli_trsm_ref.c b/ref_kernels/3/bli_trsm_ref.c
index b8459dd35..c5f833359 100644
--- a/ref_kernels/3/bli_trsm_ref.c
+++ b/ref_kernels/3/bli_trsm_ref.c
@@ -40,12 +40,12 @@
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
        const void*      a0, \
              void*      b0, \
              void*      c0, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
@@ -128,12 +128,12 @@ INSERT_GENTFUNC_BASIC( trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals )
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(ch,opname,arch,suf) \
      ( \
        const void*      a0, \
              void*      b0, \
              void*      c0, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* data, \
        const cntx_t*    cntx  \
      ) \
 { \
diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c
index ff1cfd9a6..9780f9162 100644
--- a/ref_kernels/bli_cntx_ref.c
+++ b/ref_kernels/bli_cntx_ref.c
@@ -39,13 +39,11 @@
 
 // Define macros to construct the full symbol name from the operation name.
 #undef  GENARNAME             // opname, architecture, _ref (no bli_)
-#define GENARNAME(opname)     PASTECH2(opname,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX)
+#define GENARNAME(opname)     PASTECH(opname,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX)
 #undef  GENTARNAME            // bli, ch, opname, architecture, _ref
-#define GENTARNAME(ch,opname) PASTEMAC3(ch,opname,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX)
+#define GENTARNAME(ch,opname) PASTEMAC(ch,opname,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX)
 #undef  GENBARNAME            // bli_, opname, architecture, _ref
-#define GENBARNAME(opname)    PASTEMAC2(opname,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX)
-#undef  GENBAINAME            // bli_, opname, architecture, _ind
-#define GENBAINAME(opname)    PASTEMAC2(opname,BLIS_CNAME_INFIX,BLIS_IND_SUFFIX)
+#define GENBARNAME(opname)    PASTEMAC(opname,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX)
 
 // Define a prototype-inserting template that uses an arbitrary prototype-
 // generating macro.
@@ -58,6 +56,64 @@ protmac( double,   d, kername ) \
 protmac( scomplex, c, kername ) \
 protmac( dcomplex, z, kername )
 
+#undef  INSERT_PROTMAC_BASIC_CO
+#define INSERT_PROTMAC_BASIC_CO( protmac, kername ) \
+\
+protmac( scomplex, c, kername ) \
+protmac( dcomplex, z, kername )
+
+#undef  INSERT_PROTMAC_MIX_RO
+#define INSERT_PROTMAC_MIX_RO( protmac, kername ) \
+\
+protmac( float,  float,  s, s, kername ) \
+protmac( double, float,  d, s, kername ) \
+protmac( float,  double, s, d, kername ) \
+protmac( double, double, d, d, kername )
+
+#undef  INSERT_PROTMAC_MIX_CO
+#define INSERT_PROTMAC_MIX_CO( protmac, kername ) \
+\
+protmac( scomplex, scomplex, c, c, kername ) \
+protmac( dcomplex, scomplex, z, c, kername ) \
+protmac( scomplex, dcomplex, c, z, kername ) \
+protmac( dcomplex, dcomplex, z, z, kername )
+
+#undef  INSERT_PROTMAC_MIX_P
+#define INSERT_PROTMAC_MIX_P( protmac, kername ) \
+\
+protmac( float,    float,    s, s, kername ) \
+protmac( double,   float,    d, s, kername ) \
+protmac( float,    double,   s, d, kername ) \
+protmac( double,   double,   d, d, kername ) \
+\
+protmac( scomplex, scomplex, c, c, kername ) \
+protmac( dcomplex, scomplex, z, c, kername ) \
+protmac( scomplex, dcomplex, c, z, kername ) \
+protmac( dcomplex, dcomplex, z, z, kername )
+
+#undef  INSERT_PROTMAC_MIX_DP
+#define INSERT_PROTMAC_MIX_DP( protmac, kername ) \
+\
+protmac( float,    float,    s, s, kername ) \
+protmac( double,   float,    d, s, kername ) \
+protmac( scomplex, float,    c, s, kername ) \
+protmac( dcomplex, float,    z, s, kername ) \
+\
+protmac( float,    double,   s, d, kername ) \
+protmac( double,   double,   d, d, kername ) \
+protmac( scomplex, double,   c, d, kername ) \
+protmac( dcomplex, double,   z, d, kername ) \
+\
+protmac( float,    scomplex, s, c, kername ) \
+protmac( double,   scomplex, d, c, kername ) \
+protmac( scomplex, scomplex, c, c, kername ) \
+protmac( dcomplex, scomplex, z, c, kername ) \
+\
+protmac( float,    dcomplex, s, z, kername ) \
+protmac( double,   dcomplex, d, z, kername ) \
+protmac( scomplex, dcomplex, c, z, kername ) \
+protmac( dcomplex, dcomplex, z, z, kername )
+
 
 // -- Level-3 native micro-kernel prototype redefinitions ----------------------
 
@@ -72,7 +128,7 @@ protmac( dcomplex, z, kername )
 // Instantiate prototypes for above functions using the pre-defined level-3
 // microkernel prototype-generating macros.
 
-INSERT_PROTMAC_BASIC( GEMM_UKR_PROT,     gemm_ukr_name )
+INSERT_PROTMAC_MIX_P( GEMM_UKR2_PROT,    gemm_ukr_name )
 INSERT_PROTMAC_BASIC( GEMMTRSM_UKR_PROT, gemmtrsm_l_ukr_name )
 INSERT_PROTMAC_BASIC( GEMMTRSM_UKR_PROT, gemmtrsm_u_ukr_name )
 INSERT_PROTMAC_BASIC( TRSM_UKR_PROT,     trsm_l_ukr_name )
@@ -83,24 +139,24 @@ INSERT_PROTMAC_BASIC( TRSM_UKR_PROT,     trsm_u_ukr_name )
 
 // -- Construct arch-specific names for reference virtual level-3 microkernels --
 
-// -- 1m --
-
 #define gemm1m_ukr_name        GENARNAME(gemm1m)
+#define gemm_ccr_ukr_name       GENARNAME(gemm_ccr)
+#define gemm_rcc_ukr_name       GENARNAME(gemm_rcc)
+#define gemm_crr_ukr_name        GENARNAME(gemm_crr)
 #define gemmtrsm1m_l_ukr_name  GENARNAME(gemmtrsm1m_l)
 #define gemmtrsm1m_u_ukr_name  GENARNAME(gemmtrsm1m_u)
-#define trsm1m_l_ukr_name      GENARNAME(trsm1m_l)
-#define trsm1m_u_ukr_name      GENARNAME(trsm1m_u)
 
 // Instantiate prototypes for above functions using the pre-defined level-3
 // microkernel prototype-generating macros.
 
-// -- 1m --
-
-INSERT_PROTMAC_BASIC( GEMM_UKR_PROT,     gemm1m_ukr_name )
-INSERT_PROTMAC_BASIC( GEMMTRSM_UKR_PROT, gemmtrsm1m_l_ukr_name )
-INSERT_PROTMAC_BASIC( GEMMTRSM_UKR_PROT, gemmtrsm1m_u_ukr_name )
-INSERT_PROTMAC_BASIC( TRSM_UKR_PROT,     trsm1m_l_ukr_name )
-INSERT_PROTMAC_BASIC( TRSM_UKR_PROT,     trsm1m_u_ukr_name )
+INSERT_PROTMAC_MIX_RO( GEMM_UKR2_PROT,    gemm1m_ukr_name )
+INSERT_PROTMAC_MIX_RO( GEMM_UKR2_PROT,    gemm_ccr_ukr_name )
+INSERT_PROTMAC_MIX_RO( GEMM_UKR2_PROT,    gemm_rcc_ukr_name )
+INSERT_PROTMAC_MIX_RO( GEMM_UKR2_PROT,    gemm_crr_ukr_name )
+INSERT_PROTMAC_BASIC(  GEMMTRSM_UKR_PROT, gemmtrsm1m_l_ukr_name )
+INSERT_PROTMAC_BASIC(  GEMMTRSM_UKR_PROT, gemmtrsm1m_u_ukr_name )
+INSERT_PROTMAC_BASIC(  TRSM_UKR_PROT,     trsm1m_l_ukr_name )
+INSERT_PROTMAC_BASIC(  TRSM_UKR_PROT,     trsm1m_u_ukr_name )
 
 
 // -- Level-3 small/unpacked micro-kernel prototype definitions ----------------
@@ -127,30 +183,24 @@ INSERT_PROTMAC_BASIC( GEMMSUP_KER_PROT, gemmsup_gx_ukr_name )
 
 // -- Construct arch-specific names for reference packm kernels --
 
-#define packm_mrxk_ker_name            GENARNAME(packm_mrxk)
-#define packm_nrxk_ker_name            GENARNAME(packm_nrxk)
-#define packm_mrxk_1er_ker_name        GENARNAME(packm_mrxk_1er)
-#define packm_nrxk_1er_ker_name        GENARNAME(packm_nrxk_1er)
-#define packm_mrxmr_diag_ker_name      GENARNAME(packm_mrxmr_diag)
-#define packm_nrxnr_diag_ker_name      GENARNAME(packm_nrxnr_diag)
-#define packm_mrxmr_diag_1er_ker_name  GENARNAME(packm_mrxmr_diag_1er)
-#define packm_nrxnr_diag_1er_ker_name  GENARNAME(packm_nrxnr_diag_1er)
-#define unpackm_mrxk_ker_name          GENARNAME(unpackm_mrxk)
-#define unpackm_nrxk_ker_name          GENARNAME(unpackm_nrxk)
+#define packm_ker_name           GENARNAME(packm)
+#define packm_1er_ker_name       GENARNAME(packm_1er)
+#define packm_ro_ker_name       GENARNAME(packm_ro)
+#define packm_diag_ker_name      GENARNAME(packm_diag)
+#define packm_diag_1er_ker_name  GENARNAME(packm_diag_1er)
+#define packm_diag_ro_ker_name  GENARNAME(packm_diag_ro)
+#define unpackm_ker_name         GENARNAME(unpackm)
 
 // Instantiate prototypes for above functions using the pre-defined packm
 // kernel prototype-generating macros.
 
-INSERT_PROTMAC_BASIC( PACKM_KER_PROT,      packm_mrxk_ker_name )
-INSERT_PROTMAC_BASIC( PACKM_KER_PROT,      packm_nrxk_ker_name )
-INSERT_PROTMAC_BASIC( PACKM_KER_PROT,      packm_mrxk_1er_ker_name )
-INSERT_PROTMAC_BASIC( PACKM_KER_PROT,      packm_nrxk_1er_ker_name )
-INSERT_PROTMAC_BASIC( PACKM_DIAG_KER_PROT, packm_mrxmr_diag_ker_name )
-INSERT_PROTMAC_BASIC( PACKM_DIAG_KER_PROT, packm_nrxnr_diag_ker_name )
-INSERT_PROTMAC_BASIC( PACKM_DIAG_KER_PROT, packm_mrxmr_diag_1er_ker_name )
-INSERT_PROTMAC_BASIC( PACKM_DIAG_KER_PROT, packm_nrxnr_diag_1er_ker_name )
-INSERT_PROTMAC_BASIC( UNPACKM_KER_PROT,    unpackm_mrxk_ker_name )
-INSERT_PROTMAC_BASIC( UNPACKM_KER_PROT,    unpackm_nrxk_ker_name )
+INSERT_PROTMAC_MIX_P ( PACKM_KER_PROT2,      packm_ker_name )
+INSERT_PROTMAC_MIX_CO( PACKM_KER_PROT2,      packm_1er_ker_name )
+INSERT_PROTMAC_MIX_CO( PACKM_KER_PROT2,      packm_ro_ker_name )
+INSERT_PROTMAC_MIX_P ( PACKM_DIAG_KER_PROT2, packm_diag_ker_name )
+INSERT_PROTMAC_MIX_CO( PACKM_DIAG_KER_PROT2, packm_diag_1er_ker_name )
+INSERT_PROTMAC_MIX_CO( PACKM_DIAG_KER_PROT2, packm_diag_ro_ker_name )
+INSERT_PROTMAC_MIX_P ( UNPACKM_KER_PROT2,    unpackm_ker_name )
 
 
 // -- Level-1f kernel prototype redefinitions ----------------------------------
@@ -216,34 +266,79 @@ INSERT_PROTMAC_BASIC( XPBYV_KER_PROT,    xpbyv_ker_name )
 
 // -- Macros to help concisely instantiate bli_func_init() ---------------------
 
+#define gen_func_init_ro( func_p, opname ) \
+do { \
+	bli_func_init( func_p, PASTEMAC(s,opname), PASTEMAC(d,opname), \
+	                       NULL,               NULL ); \
+} while (0)
+
 #define gen_func_init_co( func_p, opname ) \
-{ \
+do { \
 	bli_func_init( func_p, NULL,               NULL, \
 	                       PASTEMAC(c,opname), PASTEMAC(z,opname) ); \
-}
+} while (0)
+
+#define gen_func_init_mix_co( func_p, opname ) \
+do { \
+	bli_func2_init( func_p, NULL,                  NULL, \
+	                        NULL,                  NULL, \
+	                        NULL,                  NULL, \
+	                        NULL,                  NULL, \
+	                        NULL,                  NULL, \
+	                        PASTEMAC(c,c,opname), PASTEMAC(c,z,opname), \
+	                        NULL,                  NULL, \
+	                        PASTEMAC(z,c,opname), PASTEMAC(z,z,opname) ); \
+} while (0)
 
 #define gen_func_init( func_p, opname ) \
-{ \
+do { \
 	bli_func_init( func_p, PASTEMAC(s,opname), PASTEMAC(d,opname), \
 	                       PASTEMAC(c,opname), PASTEMAC(z,opname) ); \
-}
+} while (0)
+
+#define gen_func_init_ro_mix_p( func_p, opname ) \
+do { \
+	bli_func2_init( func_p, PASTEMAC(s,s,opname), PASTEMAC(s,d,opname), \
+	                        NULL,                  NULL, \
+	                        PASTEMAC(d,s,opname), PASTEMAC(d,d,opname), \
+	                        NULL,                  NULL, \
+	                        NULL,                  NULL, \
+	                        NULL,                  NULL, \
+	                        NULL,                  NULL, \
+	                        NULL,                  NULL ); \
+} while (0)
+
+#define gen_func_init_mix_p( func_p, opname ) \
+do { \
+	bli_func2_init( func_p, PASTEMAC(s,s,opname), PASTEMAC(s,d,opname), \
+	                        NULL,                  NULL, \
+	                        PASTEMAC(d,s,opname), PASTEMAC(d,d,opname), \
+	                        NULL,                  NULL, \
+	                        NULL,                  NULL, \
+	                        PASTEMAC(c,c,opname), PASTEMAC(c,z,opname), \
+	                        NULL,                  NULL, \
+	                        PASTEMAC(z,c,opname), PASTEMAC(z,z,opname) ); \
+} while (0)
+
+#define gen_func_init_mix_dp( func_p, opname ) \
+do { \
+	bli_func2_init( func_p, PASTEMAC(s,s,opname), PASTEMAC(s,d,opname), \
+	                        PASTEMAC(s,c,opname), PASTEMAC(s,z,opname), \
+	                        PASTEMAC(d,s,opname), PASTEMAC(d,d,opname), \
+	                        PASTEMAC(d,c,opname), PASTEMAC(d,z,opname), \
+	                        PASTEMAC(c,s,opname), PASTEMAC(c,d,opname), \
+	                        PASTEMAC(c,c,opname), PASTEMAC(c,z,opname), \
+	                        PASTEMAC(z,s,opname), PASTEMAC(z,d,opname), \
+	                        PASTEMAC(z,c,opname), PASTEMAC(z,z,opname) ); \
+} while (0)
 
 #define gen_sup_func_init( func0_p, func1_p, opname ) \
-{ \
+do { \
 	bli_func_init( func0_p, PASTEMAC(s,opname), PASTEMAC(d,opname), \
 	                        PASTEMAC(c,opname), PASTEMAC(z,opname) ); \
 	bli_func_init( func1_p, PASTEMAC(s,opname), PASTEMAC(d,opname), \
 	                        PASTEMAC(c,opname), PASTEMAC(z,opname) ); \
-}
-
-// -- Helper function for 1m ---------------------------------------------------
-
-void GENBAINAME(cntx_init_blkszs)
-     (
-       ind_t   method,
-       num_t   dt,
-       cntx_t* cntx
-     );
+} while (0)
 
 // -----------------------------------------------------------------------------
 
@@ -252,16 +347,22 @@ void GENBARNAME(cntx_init)
        cntx_t* cntx
      )
 {
-	blksz_t  blkszs[ BLIS_NUM_BLKSZS ];
-	func_t*  funcs;
-	mbool_t* mbools;
-	dim_t    i;
-	void_fp* vfuncs;
+	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
+	func_t  funcs [ BLIS_NUM_UKRS ];
+	func2_t func2s[ BLIS_NUM_UKR2S ];
+	mbool_t mbools[ BLIS_NUM_UKR_PREFS ];
+	void_fp vfuncs[ BLIS_NUM_LEVEL3_OPS ];
 
+	// Make sure any unset function pointers or block sizes are NULL/0
+	memset( blkszs, 0, sizeof(blkszs) );
+	memset( funcs,  0, sizeof(funcs)  );
+	memset( func2s, 0, sizeof(func2s) );
+	memset( mbools, 0, sizeof(mbools) );
+	memset( vfuncs, 0, sizeof(vfuncs) );
 
-	// -- Clear the context ----------------------------------------------------
+	// -- Initialize the context -----------------------------------------------
 
-	bli_cntx_clear( cntx );
+	bli_cntx_init( cntx );
 
 
 	// -- Set blocksizes -------------------------------------------------------
@@ -328,31 +429,22 @@ void GENBARNAME(cntx_init)
 	);
 
 
-	// -- Set level-3 virtual micro-kernels ------------------------------------
-
-	funcs = cntx->ukrs;
+	// -- Set level-3 native micro-kernels and preferences ---------------------
 
-	// NOTE: We set the virtual micro-kernel slots to contain the addresses
-	// of the native micro-kernels. In general, the ukernels in the virtual
-	// ukernel slots are always called, and if the function called happens to
-	// be a virtual micro-kernel, it will then know to find its native ukernel
-	// (i.e., in the native ukernel slots).
-	gen_func_init( &funcs[ BLIS_GEMM_VIR_UKR ],       gemm_ukr_name       );
-	gen_func_init( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm_l_ukr_name );
-	gen_func_init( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm_u_ukr_name );
-	gen_func_init( &funcs[ BLIS_TRSM_L_VIR_UKR ],     trsm_l_ukr_name     );
-	gen_func_init( &funcs[ BLIS_TRSM_U_VIR_UKR ],     trsm_u_ukr_name     );
+	gen_func_init_mix_p( &func2s[ bli_ker_idx( BLIS_GEMM_UKR ) ], gemm_ukr_name );
 
+	gen_func_init_ro_mix_p( &func2s[ bli_ker_idx( BLIS_GEMM1M_UKR ) ],  gemm1m_ukr_name  );
+	gen_func_init_ro_mix_p( &func2s[ bli_ker_idx( BLIS_GEMM_CCR_UKR ) ], gemm_ccr_ukr_name );
+	gen_func_init_ro_mix_p( &func2s[ bli_ker_idx( BLIS_GEMM_RCC_UKR ) ], gemm_rcc_ukr_name );
+	gen_func_init_ro_mix_p( &func2s[ bli_ker_idx( BLIS_GEMM_CRR_UKR ) ],  gemm_crr_ukr_name  );
 
-	// -- Set level-3 native micro-kernels and preferences ---------------------
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_GEMMTRSM_L_UKR ) ], gemmtrsm_l_ukr_name );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_GEMMTRSM_U_UKR ) ], gemmtrsm_u_ukr_name );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_TRSM_L_UKR ) ],     trsm_l_ukr_name     );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_TRSM_U_UKR ) ],     trsm_u_ukr_name     );
 
-	mbools = cntx->ukr_prefs;
-
-	gen_func_init( &funcs[ BLIS_GEMM_UKR ],       gemm_ukr_name       );
-	gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name );
-	gen_func_init( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name );
-	gen_func_init( &funcs[ BLIS_TRSM_L_UKR ],     trsm_l_ukr_name     );
-	gen_func_init( &funcs[ BLIS_TRSM_U_UKR ],     trsm_u_ukr_name     );
+	gen_func_init_ro( &funcs[ bli_ker_idx( BLIS_GEMMTRSM1M_L_UKR ) ], gemmtrsm1m_l_ukr_name );
+	gen_func_init_ro( &funcs[ bli_ker_idx( BLIS_GEMMTRSM1M_U_UKR ) ], gemmtrsm1m_u_ukr_name );
 
 	//                                                           s      d      c      z
 	bli_mbool_init( &mbools[ BLIS_GEMM_UKR_ROW_PREF ],        TRUE,  TRUE,  TRUE,  TRUE );
@@ -364,19 +456,19 @@ void GENBARNAME(cntx_init)
 
 	// -- Set level-3 small/unpacked micro-kernels and preferences -------------
 
-	gen_func_init( &funcs[ BLIS_GEMMSUP_RRR_UKR ], gemmsup_rv_ukr_name );
-	gen_func_init( &funcs[ BLIS_GEMMSUP_RRC_UKR ], gemmsup_rv_ukr_name );
-	gen_func_init( &funcs[ BLIS_GEMMSUP_RCR_UKR ], gemmsup_rv_ukr_name );
-	gen_func_init( &funcs[ BLIS_GEMMSUP_RCC_UKR ], gemmsup_rv_ukr_name );
-	gen_func_init( &funcs[ BLIS_GEMMSUP_CRR_UKR ], gemmsup_rv_ukr_name );
-	gen_func_init( &funcs[ BLIS_GEMMSUP_CRC_UKR ], gemmsup_rv_ukr_name );
-	gen_func_init( &funcs[ BLIS_GEMMSUP_CCR_UKR ], gemmsup_rv_ukr_name );
-	gen_func_init( &funcs[ BLIS_GEMMSUP_CCC_UKR ], gemmsup_rv_ukr_name );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_GEMMSUP_RRR_UKR ) ], gemmsup_rv_ukr_name );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_GEMMSUP_RRC_UKR ) ], gemmsup_rv_ukr_name );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_GEMMSUP_RCR_UKR ) ], gemmsup_rv_ukr_name );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_GEMMSUP_RCC_UKR ) ], gemmsup_rv_ukr_name );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_GEMMSUP_CRR_UKR ) ], gemmsup_rv_ukr_name );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_GEMMSUP_CRC_UKR ) ], gemmsup_rv_ukr_name );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_GEMMSUP_CCR_UKR ) ], gemmsup_rv_ukr_name );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_GEMMSUP_CCC_UKR ) ], gemmsup_rv_ukr_name );
 
 	// Register the general-stride/generic ukernel to the "catch-all" slot
 	// associated with the BLIS_XXX enum value. This slot will be queried if
 	// *any* operand is stored with general stride.
-	gen_func_init( &funcs[ BLIS_GEMMSUP_XXX_UKR ], gemmsup_gx_ukr_name );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_GEMMSUP_XXX_UKR ) ], gemmsup_gx_ukr_name );
 
 
 	// Set the l3 sup ukernel storage preferences.
@@ -395,56 +487,60 @@ void GENBARNAME(cntx_init)
 
 	// -- Set level-1f kernels -------------------------------------------------
 
-	gen_func_init( &funcs[ BLIS_AXPY2V_KER ],    axpy2v_ker_name    );
-	gen_func_init( &funcs[ BLIS_DOTAXPYV_KER ],  dotaxpyv_ker_name  );
-	gen_func_init( &funcs[ BLIS_AXPYF_KER ],     axpyf_ker_name     );
-	gen_func_init( &funcs[ BLIS_DOTXF_KER ],     dotxf_ker_name     );
-	gen_func_init( &funcs[ BLIS_DOTXAXPYF_KER ], dotxaxpyf_ker_name );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_AXPY2V_KER ) ],    axpy2v_ker_name    );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_DOTAXPYV_KER ) ],  dotaxpyv_ker_name  );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_AXPYF_KER ) ],     axpyf_ker_name     );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_DOTXF_KER ) ],     dotxf_ker_name     );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_DOTXAXPYF_KER ) ], dotxaxpyf_ker_name );
 
 
 	// -- Set level-1v kernels -------------------------------------------------
 
-	gen_func_init( &funcs[ BLIS_ADDV_KER ],     addv_ker_name     );
-	gen_func_init( &funcs[ BLIS_AMAXV_KER ],    amaxv_ker_name    );
-	gen_func_init( &funcs[ BLIS_AXPBYV_KER ],   axpbyv_ker_name   );
-	gen_func_init( &funcs[ BLIS_AXPYV_KER ],    axpyv_ker_name    );
-	gen_func_init( &funcs[ BLIS_COPYV_KER ],    copyv_ker_name    );
-	gen_func_init( &funcs[ BLIS_DOTV_KER ],     dotv_ker_name     );
-	gen_func_init( &funcs[ BLIS_DOTXV_KER ],    dotxv_ker_name    );
-	gen_func_init( &funcs[ BLIS_INVERTV_KER ],  invertv_ker_name  );
-	gen_func_init( &funcs[ BLIS_INVSCALV_KER ], invscalv_ker_name );
-	gen_func_init( &funcs[ BLIS_SCALV_KER ],    scalv_ker_name    );
-	gen_func_init( &funcs[ BLIS_SCAL2V_KER ],   scal2v_ker_name   );
-	gen_func_init( &funcs[ BLIS_SETV_KER ],     setv_ker_name     );
-	gen_func_init( &funcs[ BLIS_SUBV_KER ],     subv_ker_name     );
-	gen_func_init( &funcs[ BLIS_SWAPV_KER ],    swapv_ker_name    );
-	gen_func_init( &funcs[ BLIS_XPBYV_KER ],    xpbyv_ker_name    );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_ADDV_KER ) ],     addv_ker_name     );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_AMAXV_KER ) ],    amaxv_ker_name    );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_AXPBYV_KER ) ],   axpbyv_ker_name   );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_AXPYV_KER ) ],    axpyv_ker_name    );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_COPYV_KER ) ],    copyv_ker_name    );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_DOTV_KER ) ],     dotv_ker_name     );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_DOTXV_KER ) ],    dotxv_ker_name    );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_INVERTV_KER ) ],  invertv_ker_name  );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_INVSCALV_KER ) ], invscalv_ker_name );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_SCALV_KER ) ],    scalv_ker_name    );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_SCAL2V_KER ) ],   scal2v_ker_name   );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_SETV_KER ) ],     setv_ker_name     );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_SUBV_KER ) ],     subv_ker_name     );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_SWAPV_KER ) ],    swapv_ker_name    );
+	gen_func_init( &funcs[ bli_ker_idx( BLIS_XPBYV_KER ) ],    xpbyv_ker_name    );
 
 
 	// -- Set level-1m (packm/unpackm) kernels ---------------------------------
 
-	gen_func_init( &funcs[ BLIS_PACKM_MRXK_KER ],  packm_mrxk_ker_name );
-	gen_func_init( &funcs[ BLIS_PACKM_NRXK_KER ],  packm_nrxk_ker_name );
+	gen_func_init_mix_p ( &func2s[ bli_ker_idx( BLIS_PACKM_KER ) ],           packm_ker_name );
+	gen_func_init_mix_co( &func2s[ bli_ker_idx( BLIS_PACKM_1ER_KER ) ],       packm_1er_ker_name );
+	gen_func_init_mix_co( &func2s[ bli_ker_idx( BLIS_PACKM_RO_KER ) ],        packm_ro_ker_name );
+	gen_func_init_mix_p ( &func2s[ bli_ker_idx( BLIS_PACKM_DIAG_KER ) ],      packm_diag_ker_name );
+	gen_func_init_mix_co( &func2s[ bli_ker_idx( BLIS_PACKM_DIAG_1ER_KER ) ],  packm_diag_1er_ker_name );
+	gen_func_init_mix_co( &func2s[ bli_ker_idx( BLIS_PACKM_DIAG_RO_KER ) ],   packm_diag_ro_ker_name );
+	gen_func_init_mix_p ( &func2s[ bli_ker_idx( BLIS_UNPACKM_KER ) ],         unpackm_ker_name );
 
-	gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_1ER_KER ],  packm_mrxk_1er_ker_name );
-	gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_1ER_KER ],  packm_nrxk_1er_ker_name );
 
-	gen_func_init( &funcs[ BLIS_PACKM_MRXMR_DIAG_KER ],  packm_mrxmr_diag_ker_name );
-	gen_func_init( &funcs[ BLIS_PACKM_NRXNR_DIAG_KER ],  packm_nrxnr_diag_ker_name );
+	// -- Put the default kernels and their preferences into the context -------
 
-	gen_func_init_co( &funcs[ BLIS_PACKM_MRXMR_DIAG_1ER_KER ],  packm_mrxmr_diag_1er_ker_name );
-	gen_func_init_co( &funcs[ BLIS_PACKM_NRXNR_DIAG_1ER_KER ],  packm_nrxnr_diag_1er_ker_name );
+	for ( dim_t i = 0; i < BLIS_NUM_UKRS; i++ )
+		bli_cntx_set_ukr( BLIS_1TYPE_KER + i, &funcs[ i ], cntx );
 
-	gen_func_init( &funcs[ BLIS_UNPACKM_MRXK_KER ],  unpackm_mrxk_ker_name );
-	gen_func_init( &funcs[ BLIS_UNPACKM_NRXK_KER ],  unpackm_nrxk_ker_name );
+	for ( dim_t i = 0; i < BLIS_NUM_UKR2S; i++ )
+		bli_cntx_set_ukr2( BLIS_2TYPE_KER + i, &func2s[ i ], cntx );
 
+	for ( dim_t i = 0; i < BLIS_NUM_UKR_PREFS; i++ )
+		bli_cntx_set_ukr_pref( i, &mbools[ i ], cntx );
 
-	// -- Set level-3 small/unpacked handlers ----------------------------------
 
-	vfuncs = cntx->l3_sup_handlers;
+	// -- Set level-3 small/unpacked handlers ----------------------------------
 
 	// Initialize all of the function pointers to NULL;
-	for ( i = 0; i < BLIS_NUM_LEVEL3_OPS; ++i ) vfuncs[ i ] = NULL;
+	for ( dim_t i = 0; i < BLIS_NUM_LEVEL3_OPS; ++i )
+		vfuncs[ i ] = NULL;
 
 	// The level-3 sup handlers are oapi-based, so we only set one slot per
 	// operation.
@@ -453,156 +549,7 @@ void GENBARNAME(cntx_init)
 	vfuncs[ BLIS_GEMM ]  = bli_gemmsup_ref;
 	vfuncs[ BLIS_GEMMT ] = bli_gemmtsup_ref;
 
-
-	// -- Set miscellaneous fields ---------------------------------------------
-
-	bli_cntx_set_method( BLIS_NAT, cntx );
-}
-
-// -----------------------------------------------------------------------------
-
-void GENBAINAME(cntx_init)
-     (
-       ind_t   method,
-       cntx_t* cntx
-     )
-{
-	func_t* funcs;
-
-	// This function is designed to modify a copy of an existing native
-	// context to enable computation via an induced method for complex
-	// domain level-3 operations. It is called by bli_gks_query_ind_cntx()
-	// on a context after its contexts are set by copying from the
-	// architecture's native context.
-
-	// -- Set induced method level-3 virtual micro-kernels ---------------------
-
-	funcs = cntx->ukrs;
-
-	if ( method == BLIS_1M )
-	{
-		gen_func_init_co( &funcs[ BLIS_GEMM_VIR_UKR ],       gemm1m_ukr_name       );
-		gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm1m_l_ukr_name );
-		gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm1m_u_ukr_name );
-		gen_func_init_co( &funcs[ BLIS_TRSM_L_VIR_UKR ],     trsm1m_l_ukr_name     );
-		gen_func_init_co( &funcs[ BLIS_TRSM_U_VIR_UKR ],     trsm1m_u_ukr_name     );
-	}
-	else // if ( method == BLIS_NAT )
-	{
-		gen_func_init_co( &funcs[ BLIS_GEMM_VIR_UKR ],       gemm_ukr_name       );
-		gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm_l_ukr_name );
-		gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm_u_ukr_name );
-		gen_func_init_co( &funcs[ BLIS_TRSM_L_VIR_UKR ],     trsm_l_ukr_name     );
-		gen_func_init_co( &funcs[ BLIS_TRSM_U_VIR_UKR ],     trsm_u_ukr_name     );
-	}
-
-	// For 1m, we employ an optimization which requires that we copy the native
-	// real domain gemm ukernel function pointers to the corresponding real
-	// domain slots in the virtual gemm ukernel func_t. This optimization allows
-	// us to, under certain conditions, adjust various parameters within the gemm
-	// macrokernel so that the real-domain macrokernel (which will query and use
-	// the real-domain virtual gemm ukernel) can be called instead of calling the
-	// complex-domain macrokernel and the corresponding complex-domain virtual
-	// microkernel. The non-optimized code path would require an extra level of
-	// function call overhead, which can be avoided in most cases (i.e., when
-	// beta has a zero imaginary component and C is either row- or column-stored).
-	if ( method == BLIS_1M )
-	{
-		func_t* gemm_nat_ukrs = ( func_t* )bli_cntx_get_ukrs( BLIS_GEMM_UKR, cntx );
-		func_t* gemm_vir_ukrs = ( func_t* )bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, cntx );
-
-		bli_func_copy_dt( BLIS_FLOAT,  gemm_nat_ukrs, BLIS_FLOAT,  gemm_vir_ukrs );
-		bli_func_copy_dt( BLIS_DOUBLE, gemm_nat_ukrs, BLIS_DOUBLE, gemm_vir_ukrs );
-	}
-
-
-	// -- Set induced method packm kernels -------------------------------------
-
-	if ( method == BLIS_1M )
-	{
-		gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_KER ],  packm_mrxk_1er_ker_name );
-		gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_KER ],  packm_nrxk_1er_ker_name );
-	}
-	else // if ( method == BLIS_NAT )
-	{
-		gen_func_init( &funcs[ BLIS_PACKM_MRXK_KER ],  packm_mrxk_ker_name );
-		gen_func_init( &funcs[ BLIS_PACKM_NRXK_KER ],  packm_nrxk_ker_name );
-	}
-
-	gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_1ER_KER ],  packm_mrxk_1er_ker_name );
-	gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_1ER_KER ],  packm_nrxk_1er_ker_name );
-
-	gen_func_init( &funcs[ BLIS_UNPACKM_MRXK_KER ],  unpackm_mrxk_ker_name );
-	gen_func_init( &funcs[ BLIS_UNPACKM_NRXK_KER ],  unpackm_nrxk_ker_name );
-
-
-	// -- Set induced method cache and register blocksizes ---------------------
-
-	// Modify the context with cache and register blocksizes (and multiples)
-	// appropriate for the current induced method.
-	if ( method == BLIS_1M )
-	{
-		//const bool is_pb = FALSE;
-
-		// Call a helper function to initialize blocksizes for each complex
-		// datatype.
-		GENBAINAME(cntx_init_blkszs)( method, BLIS_SCOMPLEX, cntx );
-		GENBAINAME(cntx_init_blkszs)( method, BLIS_DCOMPLEX, cntx );
-	}
-	else // if ( method == BLIS_NAT )
-	{
-		// No change in blocksizes needed for native execution.
-	}
-}
-
-// -----------------------------------------------------------------------------
-
-void GENBAINAME(cntx_init_blkszs)
-     (
-       ind_t   method,
-       num_t   dt,
-       cntx_t* cntx
-     )
-{
-	// Set the induced method in the context.
-	bli_cntx_set_method( method, cntx );
-
-	num_t dt_r = bli_dt_proj_to_real( dt );
-
-	// Initialize the blocksizes according to the micro-kernel preference as
-	// well as the algorithm.
-	//if ( bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
-	if ( ! bli_cntx_get_ukr_prefs_dt( dt_r, BLIS_GEMM_UKR_ROW_PREF, cntx ) )
-	{
-		// This branch is used for algorithm 1m_c_bp.
-
-		bli_cntx_set_ind_blkszs
-		(
-		  method, dt, cntx,
-		  BLIS_NC, 1.0, 1.0,
-		  BLIS_KC, 2.0, 2.0, // halve kc...
-		  BLIS_MC, 2.0, 2.0, // halve mc...
-		  BLIS_NR, 1.0, 1.0,
-		  BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr)
-		  BLIS_KR, 1.0, 1.0,
-		  BLIS_VA_END
-		);
-	}
-	else // if ( bli_cntx_get_ukr_prefs_dt( dt, BLIS_GEMM_UKR_ROW_PREF, cntx ) )
-	{
-		// This branch is used for algorithm 1m_r_bp.
-
-		bli_cntx_set_ind_blkszs
-		(
-		  method, dt, cntx,
-		  BLIS_NC, 2.0, 2.0, // halve nc...
-		  BLIS_KC, 2.0, 2.0, // halve kc...
-		  BLIS_MC, 1.0, 1.0,
-		  BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr)
-		  BLIS_MR, 1.0, 1.0,
-		  BLIS_KR, 1.0, 1.0,
-		  BLIS_VA_END
-		);
-	}
+	for ( dim_t i = 0; i < BLIS_NUM_LEVEL3_OPS; i++ )
+		bli_cntx_set_l3_sup_handler( i, vfuncs[ i ], cntx );
 }
 
diff --git a/ref_kernels/ind/bli_gemm1m_ref.c b/ref_kernels/ind/bli_gemm1m_ref.c
index 99dea36d4..30904ecdb 100644
--- a/ref_kernels/ind/bli_gemm1m_ref.c
+++ b/ref_kernels/ind/bli_gemm1m_ref.c
@@ -34,108 +34,66 @@
 
 #include "blis.h"
 
-#undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf ) \
+#undef  GENTFUNC2RO
+#define GENTFUNC2RO( ctype_abr, ctype_ab, ctype_cr, ctype_c, chabr, chab, chcr, chc, opname, arch, suf ) \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+void PASTEMAC(chabr,chcr,opname,arch,suf) \
      ( \
              dim_t      m, \
              dim_t      n, \
              dim_t      k, \
        const void*      alpha0, \
-       const void*      a0, \
-       const void*      b0, \
+       const void*      a, \
+       const void*      b, \
        const void*      beta0, \
              void*      c0, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* auxinfo, \
        const cntx_t*    cntx  \
      ) \
 { \
-	const ctype*      alpha     = alpha0; \
-	const ctype*      a         = a0; \
-	const ctype*      b         = b0; \
-	const ctype*      beta      = beta0; \
-	      ctype*      c         = c0; \
+	const ctype_ab*   alpha     = alpha0; \
+	const ctype_c*    beta      = beta0; \
+	      ctype_c*    c         = c0; \
 \
-	const num_t       dt        = PASTEMAC(ch,type); \
-	const num_t       dt_r      = PASTEMAC(chr,type); \
+	const cntl_t*     params    = bli_auxinfo_params( auxinfo ); \
 \
-	      gemm_ukr_ft rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
-	const bool        col_pref  = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
-	const bool        row_pref  = !col_pref; \
+	const gemm_ukr_ft rgemm_ukr = bli_gemm_var_cntl_real_ukr( params ); \
+	const bool        row_pref  = bli_gemm_var_cntl_row_pref( params ); \
+	const void*       params_r  = bli_gemm_var_cntl_real_params( params ); \
 \
-	const dim_t       mr        = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	const dim_t       nr        = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
+	const dim_t       mr        = bli_gemm_var_cntl_mr( params ); \
+	const dim_t       nr        = bli_gemm_var_cntl_nr( params ); \
 \
-	const dim_t       mr_r      = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \
-	const dim_t       nr_r      = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \
+	const dim_t       mr_r      = row_pref ? mr : 2 * mr; \
+	const dim_t       nr_r      = row_pref ? 2 * nr : nr; \
 \
-	const dim_t       k2        = 2 * k; \
+	/* Convert the micro-tile dimensions from being in units of complex elements to
+	   be in units of real elements. */ \
+	const dim_t       m_r        = row_pref ? m : 2 * m; \
+	const dim_t       n_r        = row_pref ? 2 * n : n; \
+	const dim_t       k_r        = 2 * k; \
 \
-	      ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                      / sizeof( ctype_r ) ] \
-	                      __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	      ctype_ab    ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( ctype_ab ) ] \
+	                  __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 	      inc_t       rs_ct; \
 	      inc_t       cs_ct; \
 \
-	const ctype_r*    a_r       = ( ctype_r* )a; \
+	const ctype_abr* restrict one_r   = PASTEMAC(chabr,1); \
+	const ctype_abr* restrict zero_r  = PASTEMAC(chabr,0); \
 \
-	const ctype_r*    b_r       = ( ctype_r* )b; \
+	const ctype_abr* restrict alpha_r = &PASTEMAC(chab,real)( *alpha ); \
+	const ctype_abr* restrict alpha_i = &PASTEMAC(chab,imag)( *alpha ); \
 \
-	const ctype_r*    zero_r    = PASTEMAC(chr,0); \
+	const ctype_cr*  restrict beta_r  = &PASTEMAC(chc,real)( *beta ); \
+	const ctype_cr*  restrict beta_i  = &PASTEMAC(chc,imag)( *beta ); \
 \
-	const ctype_r*    alpha_r   = &PASTEMAC(ch,real)( *alpha ); \
-	const ctype_r*    alpha_i   = &PASTEMAC(ch,imag)( *alpha ); \
+	auxinfo_t auxinfo_r = *auxinfo; \
+	bli_auxinfo_set_params( params_r, &auxinfo_r ); \
 \
-	const ctype_r*    beta_r    = &PASTEMAC(ch,real)( *beta ); \
-	const ctype_r*    beta_i    = &PASTEMAC(ch,imag)( *beta ); \
-\
-	      ctype_r*    c_use; \
-\
-	      inc_t       rs_c_use; \
-	      inc_t       cs_c_use; \
-\
-	      bool        using_ct; \
-\
-/*
-	PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: a", mr, 2*k, \
-	                       a_r, 1, mr, "%5.2f", "" ); \
-	PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: b", 2*k, 2*nr, \
-	                       b_r, 2*nr, 1, "%5.2f", "" ); \
-	PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c after", mr, 2*nr, \
-	                       c_use, rs_c_use, cs_c_use, "%5.2f", "" ); \
-*/ \
-\
-	/* SAFETY CHECK: The higher level implementation should never
-	   allow an alpha with non-zero imaginary component to be passed
-	   in, because it can't be applied properly using the 1m method.
-	   If alpha is not real, then something is very wrong. */ \
-	if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
-		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
-\
-\
-	/* If beta has a non-zero imaginary component OR if c is stored with
-	   general stride, then we compute the alpha*a*b product into temporary
-	   storage and then accumulate that result into c afterwards. Note that
-	   the other two cases concerning disagreement between the storage of C
-	   and the output preference of the micro-kernel, should ONLY occur in
-	   the context of trsm, whereby this virtual micro-kernel is called
-	   directly from the trsm macro-kernel to update the micro-tile b11
-	   that exists within the packed row-panel of B. Indeed that is the
-	   reason those cases MUST be explicitly handled. */ \
-	if      ( !PASTEMAC(chr,eq0)( *beta_i ) )               using_ct = TRUE; \
-	else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
-	else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \
-	else if ( bli_is_gen_stored( rs_c, cs_c ) )             using_ct = TRUE; \
-	else                                                    using_ct = FALSE; \
-\
-\
-	/* If we are not computing a full micro-tile, then we must write to
-	   ct and then accumulate to c afterwards. */ \
-	if ( mr != m || nr != n ) using_ct = TRUE; \
-\
-\
-	if ( using_ct ) \
+	if ( !PASTEMAC(chabr,eq0)( *alpha_i ) || \
+	     !PASTEMAC(chcr,eq0)( *beta_i ) || \
+	     !bli_is_preferentially_stored( rs_c, cs_c, row_pref ) || \
+	     !PASTEMAC(chabr,chcr,same) ) \
 	{ \
 		/* In the atypical cases, we compute the result into temporary
 		   workspace ct and then accumulated it back to c at the end. */ \
@@ -143,107 +101,85 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		/* Set the strides of ct based on the preference of the underlying
 		   native real domain gemm micro-kernel. Note that we set the ct
 		   strides in units of complex elements. */ \
-		if ( col_pref ) { rs_ct = 1;  cs_ct = mr; } \
-		else            { rs_ct = nr; cs_ct = 1; } \
+		if ( !row_pref ) { rs_ct = 1;  cs_ct = mr; } \
+		else             { rs_ct = nr; cs_ct = 1; } \
 \
-		c_use    = ( ctype_r* )ct; \
-		rs_c_use = rs_ct; \
-		cs_c_use = cs_ct; \
+		inc_t rs_c_use = rs_ct; \
+		inc_t cs_c_use = cs_ct; \
 \
 		/* Convert the strides from being in units of complex elements to
 		   be in units of real elements. Note that we don't need to check for
 		   general storage here because that case corresponds to the scenario
 		   where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
-		if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
-		else                                           rs_c_use *= 2; \
+		if ( !row_pref ) cs_c_use *= 2; \
+		else             rs_c_use *= 2; \
 \
 		/* The following gemm micro-kernel call implements the 1m method,
 		   which induces a complex matrix multiplication by calling the
 		   real matrix micro-kernel on micro-panels that have been packed
 		   according to the 1e and 1r formats. */ \
 \
-		/* c = beta * c + alpha_r * a * b; */ \
+		/* ct = a * b; alpha and beta are applied by axpbys_mxn below. */ \
 		rgemm_ukr \
 		( \
 		  mr_r, \
 		  nr_r, \
-		  k2, \
-		  alpha_r, \
-		  a_r, \
-		  b_r, \
+		  k_r, \
+		  one_r, \
+		  a, \
+		  b, \
 		  zero_r, \
-		  c_use, rs_c_use, cs_c_use, \
-		  data, \
+		  ct, rs_c_use, cs_c_use, \
+		  &auxinfo_r, \
 		  cntx  \
 		); \
 \
-		/* Accumulate the final result in ct back to c. */ \
-		if ( PASTEMAC(ch,eq1)( *beta ) ) \
-		{ \
-			for ( dim_t j = 0; j < n; ++j ) \
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				PASTEMAC(ch,adds)( *(ct + i*rs_ct + j*cs_ct), \
-				                   *(c  + i*rs_c  + j*cs_c ) ); \
-			} \
-		} \
-		else if ( PASTEMAC(ch,eq0)( *beta ) ) \
-		{ \
-			for ( dim_t j = 0; j < n; ++j ) \
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				PASTEMAC(ch,copys)( *(ct + i*rs_ct + j*cs_ct), \
-				                    *(c  + i*rs_c  + j*cs_c ) ); \
-			} \
-		} \
-		else \
-		{ \
-			for ( dim_t j = 0; j < n; ++j ) \
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \
-				                    *beta, \
-				                    *(c  + i*rs_c  + j*cs_c ) ); \
-			} \
-		} \
+		PASTEMAC(chab,chab,chc,chc,axpbys_mxn) \
+		( \
+		  m, n, \
+		  alpha, \
+		  ct, rs_ct, cs_ct, \
+		  beta, \
+		  c, rs_c, cs_c \
+		); \
 	} \
 	else \
 	{ \
 		/* In the typical cases, we use the real part of beta and
 		   accumulate directly into the output matrix c. */ \
 \
-		c_use    = ( ctype_r* )c; \
-		rs_c_use = rs_c; \
-		cs_c_use = cs_c; \
+		inc_t rs_c_use = rs_c; \
+		inc_t cs_c_use = cs_c; \
 \
 		/* Convert the strides from being in units of complex elements to
 		   be in units of real elements. Note that we don't need to check for
 		   general storage here because that case corresponds to the scenario
 		   where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
-		if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
-		else                                           rs_c_use *= 2; \
+		if ( !row_pref ) cs_c_use *= 2; \
+		else             rs_c_use *= 2; \
 \
 		/* The following gemm micro-kernel call implements the 1m method,
 		   which induces a complex matrix multiplication by calling the
 		   real matrix micro-kernel on micro-panels that have been packed
 		   according to the 1e and 1r formats. */ \
 \
-		/* c = beta * c + alpha_r * a * b; */ \
+		/* c = beta_r * c + alpha_r * a * b; */ \
 		rgemm_ukr \
 		( \
-		  mr_r, \
-		  nr_r, \
-		  k2, \
+		  m_r, \
+		  n_r, \
+		  k_r, \
 		  alpha_r, \
-		  a_r, \
-		  b_r, \
+		  a, \
+		  b, \
 		  beta_r, \
-		  c_use, rs_c_use, cs_c_use, \
-		  data, \
+		  c, rs_c_use, cs_c_use, \
+		  &auxinfo_r, \
 		  cntx  \
 		); \
 	} \
 }
 
-INSERT_GENTFUNCCO( gemm1m, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC2RO( gemm1m, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC2RO_MIX_P( gemm1m, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
diff --git a/ref_kernels/ind/bli_gemm_ccr_ref.c b/ref_kernels/ind/bli_gemm_ccr_ref.c
new file mode 100644
index 000000000..f86faebc8
--- /dev/null
+++ b/ref_kernels/ind/bli_gemm_ccr_ref.c
@@ -0,0 +1,174 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#undef  GENTFUNC2RO
+#define GENTFUNC2RO( ctype_abr, ctype_ab, ctype_cr, ctype_c, chabr, chab, chcr, chc, opname, arch, suf ) \
+\
+void PASTEMAC(chabr,chcr,opname,arch,suf) \
+     ( \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const void*      alpha0, \
+       const void*      a, \
+       const void*      b, \
+       const void*      beta0, \
+             void*      c0, inc_t rs_c, inc_t cs_c, \
+       const auxinfo_t* auxinfo, \
+       const cntx_t*    cntx  \
+     ) \
+{ \
+	const ctype_ab*   alpha     = alpha0; \
+	const ctype_c*    beta      = beta0; \
+	      ctype_c*    c         = c0; \
+\
+	const cntl_t*     params    = bli_auxinfo_params( auxinfo ); \
+\
+	const gemm_ukr_ft rgemm_ukr = bli_gemm_var_cntl_real_ukr( params ); \
+	const bool        row_pref  = bli_gemm_var_cntl_row_pref( params ); \
+	const void*       params_r  = bli_gemm_var_cntl_real_params( params ); \
+\
+	const dim_t       mr        = bli_gemm_var_cntl_mr( params ); \
+	const dim_t       nr        = bli_gemm_var_cntl_nr( params ); \
+\
+	const dim_t       mr_r      = row_pref ? mr : 2 * mr; \
+	const dim_t       nr_r      = row_pref ? 2 * nr : nr; \
+\
+	/* Convert the micro-tile dimensions from being in units of complex elements to
+	   be in units of real elements. */ \
+	const dim_t       m_r        = row_pref ? m : 2 * m; \
+	const dim_t       n_r        = row_pref ? 2 * n : n; \
+\
+	      ctype_ab    ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( ctype_ab ) ] \
+	                  __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	      inc_t       rs_ct; \
+	      inc_t       cs_ct; \
+\
+	const ctype_abr* restrict one_r   = PASTEMAC(chabr,1); \
+	const ctype_abr* restrict zero_r  = PASTEMAC(chabr,0); \
+\
+	const ctype_abr* restrict alpha_r = &PASTEMAC(chab,real)( *alpha ); \
+	const ctype_abr* restrict alpha_i = &PASTEMAC(chab,imag)( *alpha ); \
+\
+	const ctype_cr*  restrict beta_r  = &PASTEMAC(chc,real)( *beta ); \
+	const ctype_cr*  restrict beta_i  = &PASTEMAC(chc,imag)( *beta ); \
+\
+	auxinfo_t auxinfo_r = *auxinfo; \
+	bli_auxinfo_set_params( params_r, &auxinfo_r ); \
+\
+	if ( !PASTEMAC(chabr,eq0)( *alpha_i ) || \
+	     !PASTEMAC(chcr,eq0)( *beta_i ) || \
+	     !bli_is_preferentially_stored( rs_c, cs_c, row_pref ) || \
+	     !PASTEMAC(chab,chc,same) ) \
+	{ \
+		/* In the atypical cases, we compute the result into temporary
+		   workspace ct and then accumulate it back to c at the end. */ \
+\
+		/* Set the strides of ct based on the preference of the underlying
+		   native real domain gemm micro-kernel. Note that we set the ct
+		   strides in units of complex elements. */ \
+		if ( !row_pref ) { rs_ct = 1;  cs_ct = mr; } \
+		else             { rs_ct = nr; cs_ct = 1; } \
+\
+		inc_t rs_c_use = rs_ct; \
+		inc_t cs_c_use = cs_ct; \
+\
+		/* Convert the strides from being in units of complex elements to
+		   be in units of real elements. Note that we don't need to check for
+		   general storage here because that case corresponds to the scenario
+		   where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
+		if ( !row_pref ) cs_c_use *= 2; \
+		else             rs_c_use *= 2; \
+\
+		/* ct = a * b; alpha and beta are applied by axpbys_mxn below. */ \
+		rgemm_ukr \
+		( \
+		  mr_r, \
+		  nr_r, \
+		  k, \
+		  one_r, \
+		  a, \
+		  b, \
+		  zero_r, \
+		  ct, rs_c_use, cs_c_use, \
+		  &auxinfo_r, \
+		  cntx  \
+		); \
+\
+		PASTEMAC(chab,chab,chc,chc,axpbys_mxn) \
+		( \
+		  m, n, \
+		  alpha, \
+		  ct, rs_ct, cs_ct, \
+		  beta, \
+		  c, rs_c, cs_c \
+		); \
+	} \
+	else \
+	{ \
+		/* In the typical cases, we use the real part of beta and
+		   accumulate directly into the output matrix c. */ \
+\
+		inc_t rs_c_use = rs_c; \
+		inc_t cs_c_use = cs_c; \
+\
+		/* Convert the strides from being in units of complex elements to
+		   be in units of real elements. Note that we don't need to check for
+		   general storage here because that case corresponds to the scenario
+		   where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
+		if ( !row_pref ) cs_c_use *= 2; \
+		else             rs_c_use *= 2; \
+\
+		/* c = beta * c + alpha_r * a * b; */ \
+		rgemm_ukr \
+		( \
+		  m_r, \
+		  n_r, \
+		  k, \
+		  alpha_r, \
+		  a, \
+		  b, \
+		  beta_r, \
+		  c, rs_c_use, cs_c_use, \
+		  &auxinfo_r, \
+		  cntx  \
+		); \
+	} \
+}
+
+INSERT_GENTFUNC2RO( gemm_ccr, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC2RO_MIX_P( gemm_ccr, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+
diff --git a/ref_kernels/ind/bli_gemm_crr_ref.c b/ref_kernels/ind/bli_gemm_crr_ref.c
new file mode 100644
index 000000000..c729e95d7
--- /dev/null
+++ b/ref_kernels/ind/bli_gemm_crr_ref.c
@@ -0,0 +1,135 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#undef  GENTFUNC2RO
+#define GENTFUNC2RO( ctype_abr, ctype_ab, ctype_cr, ctype_c, chabr, chab, chcr, chc, opname, arch, suf ) \
+\
+void PASTEMAC(chabr,chcr,opname,arch,suf) \
+     ( \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const void*      alpha0, \
+       const void*      a, \
+       const void*      b, \
+       const void*      beta0, \
+             void*      c0, inc_t rs_c, inc_t cs_c, \
+       const auxinfo_t* auxinfo, \
+       const cntx_t*    cntx  \
+     ) \
+{ \
+	const ctype_ab*   alpha     = alpha0; \
+	const ctype_c*    beta      = beta0; \
+	      ctype_c*    c         = c0; \
+\
+	const cntl_t*     params    = bli_auxinfo_params( auxinfo ); \
+\
+	const gemm_ukr_ft rgemm_ukr = bli_gemm_var_cntl_real_ukr( params ); \
+	const bool        row_pref  = bli_gemm_var_cntl_row_pref( params ); \
+	const void*       params_r  = bli_gemm_var_cntl_real_params( params ); \
+\
+	const dim_t       mr        = bli_gemm_var_cntl_mr( params ); \
+	const dim_t       nr        = bli_gemm_var_cntl_nr( params ); \
+\
+	      ctype_abr   ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( ctype_abr ) ] \
+	                  __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	      inc_t       rs_ct; \
+	      inc_t       cs_ct; \
+\
+	const ctype_abr* restrict one_r   = PASTEMAC(chabr,1); \
+	const ctype_abr* restrict zero_r  = PASTEMAC(chabr,0); \
+\
+	auxinfo_t auxinfo_r = *auxinfo; \
+	bli_auxinfo_set_params( params_r, &auxinfo_r ); \
+\
+	/* Because Re(C) is always gen-stored, compute the result into temporary
+	   workspace ct and then accumulate it back to c at the end. */ \
+\
+	/* Set the strides of ct based on the preference of the underlying
+	   native real domain gemm micro-kernel. Here ct holds real elements,
+	   so these strides are passed to the micro-kernel unchanged. */ \
+	if ( !row_pref ) { rs_ct = 1;  cs_ct = mr; } \
+	else             { rs_ct = nr; cs_ct = 1; } \
+\
+	/* ct = a * b; alpha and beta are applied in the loops below. */ \
+	rgemm_ukr \
+	( \
+	  mr, \
+	  nr, \
+	  k, \
+	  one_r, \
+	  a, \
+	  b, \
+	  zero_r, \
+	  ct, rs_ct, cs_ct, \
+	  &auxinfo_r, \
+	  cntx  \
+	); \
+\
+	ctype_abr ar, ai; \
+	PASTEMAC(chab,gets)( *alpha, ar, ai ); \
+\
+	if ( PASTEMAC(chc,eq0)( *beta ) ) \
+	{ \
+		for ( dim_t jj = 0; jj < n; ++jj ) \
+		for ( dim_t ii = 0; ii < m; ++ii ) \
+		{ \
+			ctype_abr axr, axi; \
+			ctype_ab ax; \
+			PASTEMAC(chabr,scal2s)( ar, *(ct + ii*rs_ct + jj*cs_ct), axr ); \
+			PASTEMAC(chabr,scal2s)( ai, *(ct + ii*rs_ct + jj*cs_ct), axi ); \
+			PASTEMAC(chab,sets)( axr, axi, ax ); \
+			PASTEMAC(chab,chc,copys)( ax, *(c + ii*rs_c + jj*cs_c) ); \
+		} \
+	} \
+	else \
+	{ \
+		for ( dim_t jj = 0; jj < n; ++jj ) \
+		for ( dim_t ii = 0; ii < m; ++ii ) \
+		{ \
+			ctype_abr axr, axi; \
+			ctype_ab ax; \
+			PASTEMAC(chabr,scal2s)( ar, *(ct + ii*rs_ct + jj*cs_ct), axr ); \
+			PASTEMAC(chabr,scal2s)( ai, *(ct + ii*rs_ct + jj*cs_ct), axi ); \
+			PASTEMAC(chab,sets)( axr, axi, ax ); \
+			PASTEMAC(chab,chc,chc,xpbys)( ax, *beta, *(c + ii*rs_c + jj*cs_c) ); \
+		} \
+	} \
+}
+
+INSERT_GENTFUNC2RO( gemm_crr, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC2RO_MIX_P( gemm_crr, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+
diff --git a/ref_kernels/ind/bli_gemm_rcc_ref.c b/ref_kernels/ind/bli_gemm_rcc_ref.c
new file mode 100644
index 000000000..67859cd4a
--- /dev/null
+++ b/ref_kernels/ind/bli_gemm_rcc_ref.c
@@ -0,0 +1,132 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#undef  GENTFUNC2RO
+#define GENTFUNC2RO( ctype_ab, ctype_abc, ctype_c, ctype_cc, chab, chabc, chc, chcc, opname, arch, suf ) \
+\
+void PASTEMAC(chab,chc,opname,arch,suf) \
+     ( \
+             dim_t      m, \
+             dim_t      n, \
+             dim_t      k, \
+       const void*      alpha, \
+       const void*      a, \
+       const void*      b, \
+       const void*      beta0, \
+             void*      c0, inc_t rs_c, inc_t cs_c, \
+       const auxinfo_t* auxinfo, \
+       const cntx_t*    cntx  \
+     ) \
+{ \
+	const ctype_c*    beta      = beta0; \
+	      ctype_c*    c         = c0; \
+\
+	const cntl_t*     params    = bli_auxinfo_params( auxinfo ); \
+\
+	const gemm_ukr_ft rgemm_ukr = bli_gemm_var_cntl_real_ukr( params ); \
+	const bool        row_pref  = bli_gemm_var_cntl_row_pref( params ); \
+	const void*       params_r  = bli_gemm_var_cntl_real_params( params ); \
+\
+	const dim_t       mr        = bli_gemm_var_cntl_mr( params ); \
+	const dim_t       nr        = bli_gemm_var_cntl_nr( params ); \
+\
+	      ctype_ab    ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( ctype_ab ) ] \
+	                  __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	      inc_t       rs_ct; \
+	      inc_t       cs_ct; \
+\
+	const ctype_ab* restrict zero_r = PASTEMAC(chab,0); \
+\
+	auxinfo_t auxinfo_r = *auxinfo; \
+	bli_auxinfo_set_params( params_r, &auxinfo_r ); \
+\
+	if ( !PASTEMAC(chab,chc,same) ) \
+	{ \
+		/* In the atypical cases, we compute the result into temporary
+		   workspace ct and then accumulate it back to c at the end. */ \
+\
+		/* Set the strides of ct based on the preference of the underlying
+		   native real domain gemm micro-kernel. */ \
+		if ( !row_pref ) { rs_ct = 1;  cs_ct = mr; } \
+		else             { rs_ct = nr; cs_ct = 1; } \
+\
+		/* Complex values of alpha will have already been applied during packing */ \
+		/* ct = alpha * a * b; beta is applied by xpbys_mxn below. */ \
+		rgemm_ukr \
+		( \
+		  mr, \
+		  nr, \
+		  2*k, \
+		  alpha, \
+		  a, \
+		  b, \
+		  zero_r, \
+		  ct, rs_ct, cs_ct, \
+		  &auxinfo_r, \
+		  cntx  \
+		); \
+\
+		PASTEMAC(chab,chc,chc,xpbys_mxn) \
+		( \
+		  m, n, \
+		  ct, rs_ct, cs_ct, \
+		  beta, \
+		  c, rs_c, cs_c \
+		); \
+	} \
+	else \
+	{ \
+		/* Complex values of alpha will have already been applied during packing */ \
+		/* c = beta * c + alpha_r * a * b; */ \
+		rgemm_ukr \
+		( \
+		  m, \
+		  n, \
+		  2*k, \
+		  alpha, \
+		  a, \
+		  b, \
+		  beta, \
+		  c, rs_c, cs_c, \
+		  &auxinfo_r, \
+		  cntx  \
+		); \
+	} \
+}
+
+INSERT_GENTFUNC2RO( gemm_rcc, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNC2RO_MIX_P( gemm_rcc, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+
diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
index a0aa9597b..54f9900c3 100644
--- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c
+++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
@@ -34,10 +34,400 @@
 
 #include "blis.h"
 
-#undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf, trsmkerid ) \
+
+#undef  GENTFUNCRO
+#define GENTFUNCRO( ctype_r, ctype, chr, ch, opname, arch, suf, diagop ) \
+\
+static void PASTEMAC(chr,opname,arch,suf) \
+     ( \
+             dim_t           m, \
+             dim_t           n, \
+       const ctype* restrict a, inc_t rs_a, inc_t cs_a, \
+       const ctype* restrict b, inc_t rs_b, inc_t cs_b, \
+             ctype* restrict c, inc_t rs_c, inc_t cs_c, \
+             pack_t          schema_b \
+     ) \
+{ \
+	const inc_t ld_a  = cs_a; \
+	const inc_t ld_b  = rs_b; \
+\
+	if ( bli_is_1e_packed( schema_b ) ) \
+	{ \
+		const inc_t       rs_a2 = 1 * rs_a; \
+		const inc_t       cs_a2 = 2 * cs_a; \
+		const inc_t       rs_b2 = 2 * rs_b; \
+		const inc_t       cs_b2 = 2 * cs_b; \
+\
+		ctype_r* restrict a_r   = ( ctype_r* )a; \
+		ctype_r* restrict a_i   = ( ctype_r* )a + ld_a; \
+\
+		ctype_r* restrict b_ri  = ( ctype_r* )b; \
+		ctype_r* restrict b_ir  = ( ctype_r* )b + ld_b; \
+\
+		for ( dim_t iter = 0; iter < m; ++iter ) \
+		{ \
+			dim_t i         = iter; \
+			dim_t n_behind  = i; \
+\
+			ctype_r* restrict alpha11_r = a_r  + (i  )*rs_a2 + (i  )*cs_a2; \
+			ctype_r* restrict alpha11_i = a_i  + (i  )*rs_a2 + (i  )*cs_a2; \
+			ctype_r* restrict a10t_r    = a_r  + (i  )*rs_a2 + (0  )*cs_a2; \
+			ctype_r* restrict a10t_i    = a_i  + (i  )*rs_a2 + (0  )*cs_a2; \
+			ctype_r* restrict b1_ri     = b_ri + (i  )*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict b1_ir     = b_ir + (i  )*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict B0_ri     = b_ri + (0  )*rs_b2 + (0  )*cs_b2; \
+\
+			/* b1 = b1 - a10t * B0; */ \
+			/* b1 = b1 / alpha11; */ \
+			for ( dim_t j = 0; j < n; ++j ) \
+			{ \
+				ctype_r* restrict beta11_ri_r = b1_ri + (0  )*rs_b2 + (j  )*cs_b2 + 0*cs_b; \
+				ctype_r* restrict beta11_ri_i = b1_ri + (0  )*rs_b2 + (j  )*cs_b2 + 1*cs_b; \
+				ctype_r* restrict beta11_ir_r = b1_ir + (0  )*rs_b2 + (j  )*cs_b2 + 0*cs_b; \
+				ctype_r* restrict beta11_ir_i = b1_ir + (0  )*rs_b2 + (j  )*cs_b2 + 1*cs_b; \
+				ctype_r* restrict b01_ri      = B0_ri + (0  )*rs_b2 + (j  )*cs_b2; \
+				ctype*   restrict gamma11     = c     + (i  )*rs_c  + (j  )*cs_c; \
+				ctype_r           beta11c_r   = *beta11_ri_r; \
+				ctype_r           beta11c_i   = *beta11_ri_i; \
+				ctype_r           rho11_r; \
+				ctype_r           rho11_i; \
+\
+				/* beta11 = beta11 - a10t * b01; */ \
+				PASTEMAC(ch,set0ris)( rho11_r, \
+				                      rho11_i ); \
+				for ( dim_t l = 0; l < n_behind; ++l ) \
+				{ \
+					ctype_r* restrict alpha10_r = a10t_r  + (l  )*cs_a2; \
+					ctype_r* restrict alpha10_i = a10t_i  + (l  )*cs_a2; \
+					ctype_r* restrict beta01_r  = b01_ri  + (l  )*rs_b2 + 0*cs_b; \
+					ctype_r* restrict beta01_i  = b01_ri  + (l  )*rs_b2 + 1*cs_b; \
+\
+					PASTEMAC(ch,axpyris)( *alpha10_r, \
+					                      *alpha10_i, \
+					                      *beta01_r, \
+					                      *beta01_i, \
+					                      rho11_r, \
+					                      rho11_i ); \
+				} \
+				PASTEMAC(ch,subris)( rho11_r, \
+				                     rho11_i, \
+				                     beta11c_r, \
+				                     beta11c_i ); \
+\
+				/* beta11 = beta11 / alpha11; */ \
+				/* NOTE: When preinversion is enabled, the INVERSE of alpha11
+				   (1.0/alpha11) is stored during packing instead of alpha11 so we
+				   can multiply rather than divide. When preinversion is disabled,
+				   alpha11 is stored and division happens below explicitly. */ \
+				PASTEMAC(ch,diagop)( *alpha11_r, \
+				                     *alpha11_i, \
+				                     beta11c_r, \
+				                     beta11c_i ); \
+\
+				/* Output final result to matrix c. */ \
+				PASTEMAC(ch,sets)(  beta11c_r, beta11c_i, *gamma11 ); \
+\
+				/* Store the local values back to b11. */ \
+				for ( dim_t d = 0; d < cs_b; ++d ) \
+				{ \
+					PASTEMAC(ch,copyris)(  beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \
+					PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \
+                } \
+			} \
+		} \
+	} \
+	else /* ( bli_is_1r_packed( schema_b ) ) */ \
+	{ \
+		const inc_t       rs_b2 = 2 * rs_b; \
+		const inc_t       cs_b2 = 1 * cs_b; \
+\
+		ctype*   restrict a_ri  = ( ctype*   )a; \
+		/*ctype*   restrict a_ir  = ( ctype*   )a + ld_a/2;*/ \
+\
+		ctype_r* restrict b_r   = ( ctype_r* )b; \
+		ctype_r* restrict b_i   = ( ctype_r* )b + ld_b; \
+\
+		for ( dim_t iter = 0; iter < m; ++iter ) \
+		{ \
+			dim_t i         = iter; \
+			dim_t n_behind  = i; \
+\
+			ctype*   restrict alpha11_ri = a_ri + (i  )*rs_a  + (i  )*cs_a; \
+			ctype_r* restrict alpha11_r  = &PASTEMAC(ch,real)( *alpha11_ri ); \
+			ctype_r* restrict alpha11_i  = &PASTEMAC(ch,imag)( *alpha11_ri ); \
+			ctype*   restrict a10t_ri    = a_ri + (i  )*rs_a  + (0  )*cs_a; \
+			ctype_r* restrict b1_r       = b_r  + (i  )*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict b1_i       = b_i  + (i  )*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict B0_r       = b_r  + (0  )*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict B0_i       = b_i  + (0  )*rs_b2 + (0  )*cs_b2; \
+\
+			/* b1 = b1 - a10t * B0; */ \
+			/* b1 = b1 / alpha11; */ \
+			for ( dim_t j = 0; j < n; ++j ) \
+			{ \
+				ctype_r* restrict beta11_r  = b1_r + (0  )*rs_b2 + (j  )*cs_b2; \
+				ctype_r* restrict beta11_i  = b1_i + (0  )*rs_b2 + (j  )*cs_b2; \
+				ctype_r* restrict b01_r     = B0_r + (0  )*rs_b2 + (j  )*cs_b2; \
+				ctype_r* restrict b01_i     = B0_i + (0  )*rs_b2 + (j  )*cs_b2; \
+				ctype*   restrict gamma11   = c    + (i  )*rs_c  + (j  )*cs_c; \
+				ctype_r           beta11c_r = *beta11_r; \
+				ctype_r           beta11c_i = *beta11_i; \
+				ctype_r           rho11_r; \
+				ctype_r           rho11_i; \
+\
+				/* beta11 = beta11 - a10t * b01; */ \
+				PASTEMAC(ch,set0ris)( rho11_r, \
+				                      rho11_i ); \
+				for ( dim_t l = 0; l < n_behind; ++l ) \
+				{ \
+					ctype*   restrict alpha10_ri = a10t_ri + (l  )*cs_a; \
+					ctype_r* restrict alpha10_r  = &PASTEMAC(ch,real)( *alpha10_ri ); \
+					ctype_r* restrict alpha10_i  = &PASTEMAC(ch,imag)( *alpha10_ri ); \
+					ctype_r* restrict beta01_r   = b01_r   + (l  )*rs_b2; \
+					ctype_r* restrict beta01_i   = b01_i   + (l  )*rs_b2; \
+\
+					PASTEMAC(ch,axpyris)( *alpha10_r, \
+					                      *alpha10_i, \
+					                      *beta01_r, \
+					                      *beta01_i, \
+					                      rho11_r, \
+					                      rho11_i ); \
+				} \
+				PASTEMAC(ch,subris)( rho11_r, \
+				                     rho11_i, \
+				                     beta11c_r, \
+				                     beta11c_i ); \
+\
+				/* beta11 = beta11 / alpha11; */ \
+				/* NOTE: When preinversion is enabled, the INVERSE of alpha11
+				   (1.0/alpha11) is stored during packing instead of alpha11 so we
+				   can multiply rather than divide. When preinversion is disabled,
+				   alpha11 is stored and division happens below explicitly. */ \
+				PASTEMAC(ch,diagop)( *alpha11_r, \
+				                     *alpha11_i, \
+				                     beta11c_r, \
+				                     beta11c_i ); \
+\
+				/* Output final result to matrix c. */ \
+				PASTEMAC(ch,sets)( beta11c_r, \
+				                   beta11c_i, *gamma11 ); \
+\
+				/* Store the local values back to b11. */ \
+				for ( dim_t d = 0; d < cs_b; ++d ) \
+					PASTEMAC(ch,copyris)( beta11c_r, \
+					                      beta11c_i, \
+					                      *(beta11_r + d), \
+					                      *(beta11_i + d) ); \
+			} \
+		} \
+	} \
+}
+
+#ifdef BLIS_ENABLE_TRSM_PREINVERSION
+INSERT_GENTFUNCRO( trsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scalris )
+#else
+INSERT_GENTFUNCRO( trsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalris )
+#endif
+
+
+#undef  GENTFUNCRO
+#define GENTFUNCRO( ctype_r, ctype, chr, ch, opname, arch, suf, diagop ) \
+\
+static void PASTEMAC(chr,opname,arch,suf) \
+     ( \
+             dim_t           m, \
+             dim_t           n, \
+       const ctype* restrict a, inc_t rs_a, inc_t cs_a, \
+       const ctype* restrict b, inc_t rs_b, inc_t cs_b, \
+             ctype* restrict c, inc_t rs_c, inc_t cs_c, \
+             pack_t          schema_b \
+     ) \
+{ \
+	const inc_t ld_a  = cs_a; \
+	const inc_t ld_b  = rs_b; \
+\
+	if ( bli_is_1e_packed( schema_b ) ) \
+	{ \
+		const inc_t       rs_a2 = 1 * rs_a; \
+		const inc_t       cs_a2 = 2 * cs_a; \
+		const inc_t       rs_b2 = 2 * rs_b; \
+		const inc_t       cs_b2 = 2 * cs_b; \
+\
+		ctype_r* restrict a_r   = ( ctype_r* )a; \
+		ctype_r* restrict a_i   = ( ctype_r* )a + ld_a; \
+\
+		ctype_r* restrict b_ri  = ( ctype_r* )b; \
+		ctype_r* restrict b_ir  = ( ctype_r* )b + ld_b; \
+\
+		for ( dim_t iter = 0; iter < m; ++iter ) \
+		{ \
+			dim_t i         = m - iter - 1; \
+			dim_t n_behind  = iter; \
+\
+			ctype_r* restrict alpha11_r  = a_r  + (i  )*rs_a2 + (i  )*cs_a2; \
+			ctype_r* restrict alpha11_i  = a_i  + (i  )*rs_a2 + (i  )*cs_a2; \
+			ctype_r* restrict a12t_r     = a_r  + (i  )*rs_a2 + (i+1)*cs_a2; \
+			ctype_r* restrict a12t_i     = a_i  + (i  )*rs_a2 + (i+1)*cs_a2; \
+			ctype_r* restrict b1_ri      = b_ri + (i  )*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict b1_ir      = b_ir + (i  )*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict B2_ri      = b_ri + (i+1)*rs_b2 + (0  )*cs_b2; \
+\
+			/* b1 = b1 - a12t * B2; */ \
+			/* b1 = b1 / alpha11; */ \
+			for ( dim_t j = 0; j < n; ++j ) \
+			{ \
+				ctype_r* restrict beta11_ri_r = b1_ri + (0  )*rs_b2 + (j  )*cs_b2 + 0*cs_b; \
+				ctype_r* restrict beta11_ri_i = b1_ri + (0  )*rs_b2 + (j  )*cs_b2 + 1*cs_b; \
+				ctype_r* restrict beta11_ir_r = b1_ir + (0  )*rs_b2 + (j  )*cs_b2 + 0*cs_b; \
+				ctype_r* restrict beta11_ir_i = b1_ir + (0  )*rs_b2 + (j  )*cs_b2 + 1*cs_b; \
+				ctype_r* restrict b21_ri      = B2_ri + (0  )*rs_b2 + (j  )*cs_b2; \
+				ctype*   restrict gamma11     = c     + (i  )*rs_c + (j  )*cs_c; \
+				ctype_r           beta11c_r   = *beta11_ri_r; \
+				ctype_r           beta11c_i   = *beta11_ri_i; \
+				ctype_r           rho11_r; \
+				ctype_r           rho11_i; \
+\
+				/* beta11 = beta11 - a12t * b21; */ \
+				PASTEMAC(ch,set0ris)( rho11_r, \
+				                      rho11_i ); \
+				for ( dim_t l = 0; l < n_behind; ++l ) \
+				{ \
+					ctype_r* restrict alpha12_r = a12t_r + (l  )*cs_a2; \
+					ctype_r* restrict alpha12_i = a12t_i + (l  )*cs_a2; \
+					ctype_r* restrict beta21_r  = b21_ri + (l  )*rs_b2 + 0*cs_b; \
+					ctype_r* restrict beta21_i  = b21_ri + (l  )*rs_b2 + 1*cs_b; \
+\
+					PASTEMAC(ch,axpyris)( *alpha12_r, \
+					                      *alpha12_i, \
+					                      *beta21_r, \
+					                      *beta21_i, \
+					                      rho11_r, \
+					                      rho11_i ); \
+				} \
+				PASTEMAC(ch,subris)( rho11_r, \
+				                     rho11_i, \
+				                     beta11c_r, \
+				                     beta11c_i ); \
+\
+				/* beta11 = beta11 / alpha11; */ \
+				/* NOTE: When preinversion is enabled, the INVERSE of alpha11
+				   (1.0/alpha11) is stored during packing instead of alpha11 so we
+				   can multiply rather than divide. When preinversion is disabled,
+				   alpha11 is stored and division happens below explicitly. */ \
+				PASTEMAC(ch,diagop)( *alpha11_r, \
+				                     *alpha11_i, \
+				                     beta11c_r, \
+				                     beta11c_i ); \
+\
+				/* Output final result to matrix c. */ \
+				PASTEMAC(ch,sets)(  beta11c_r, beta11c_i, *gamma11 ); \
+\
+				/* Store the local values back to b11. */ \
+				for ( dim_t d = 0; d < cs_b; ++d ) \
+				{ \
+					PASTEMAC(ch,copyris)(  beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \
+					PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \
+                } \
+			} \
+		} \
+	} \
+	else /* if ( bli_is_1r_packed( schema_b ) ) */ \
+	{ \
+		const inc_t       rs_b2 = 2 * rs_b; \
+		const inc_t       cs_b2 = 1 * cs_b; \
 \
-void PASTEMAC3(ch,opname,arch,suf) \
+		ctype*   restrict a_ri  = ( ctype*   )a; \
+		/*ctype*   restrict a_ir  = ( ctype*   )a + ld_a/2;*/ \
+\
+		ctype_r* restrict b_r   = ( ctype_r* )b; \
+		ctype_r* restrict b_i   = ( ctype_r* )b + ld_b; \
+\
+		for ( dim_t iter = 0; iter < m; ++iter ) \
+		{ \
+			dim_t i         = m - iter - 1; \
+			dim_t n_behind  = iter; \
+\
+			ctype*   restrict alpha11_ri = a_ri + (i  )*rs_a  + (i  )*cs_a; \
+			ctype_r* restrict alpha11_r  = &PASTEMAC(ch,real)( *alpha11_ri ); \
+			ctype_r* restrict alpha11_i  = &PASTEMAC(ch,imag)( *alpha11_ri ); \
+			ctype*   restrict a12t_ri    = a_ri + (i  )*rs_a  + (i+1)*cs_a; \
+			ctype_r* restrict b1_r       = b_r  + (i  )*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict b1_i       = b_i  + (i  )*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict B2_r       = b_r  + (i+1)*rs_b2 + (0  )*cs_b2; \
+			ctype_r* restrict B2_i       = b_i  + (i+1)*rs_b2 + (0  )*cs_b2; \
+\
+			/* b1 = b1 - a12t * B2; */ \
+			/* b1 = b1 / alpha11; */ \
+			for ( dim_t j = 0; j < n; ++j ) \
+			{ \
+				ctype_r* restrict beta11_r  = b1_r + (0  )*rs_b2 + (j  )*cs_b2; \
+				ctype_r* restrict beta11_i  = b1_i + (0  )*rs_b2 + (j  )*cs_b2; \
+				ctype_r* restrict b21_r     = B2_r + (0  )*rs_b2 + (j  )*cs_b2; \
+				ctype_r* restrict b21_i     = B2_i + (0  )*rs_b2 + (j  )*cs_b2; \
+				ctype*   restrict gamma11   = c    + (i  )*rs_c  + (j  )*cs_c; \
+				ctype_r           beta11c_r = *beta11_r; \
+				ctype_r           beta11c_i = *beta11_i; \
+				ctype_r           rho11_r; \
+				ctype_r           rho11_i; \
+\
+				/* beta11 = beta11 - a12t * b21; */ \
+				PASTEMAC(ch,set0ris)( rho11_r, \
+				                      rho11_i ); \
+				for ( dim_t l = 0; l < n_behind; ++l ) \
+				{ \
+					ctype*   restrict alpha12_ri = a12t_ri + (l  )*cs_a; \
+					ctype_r* restrict alpha12_r  = &PASTEMAC(ch,real)( *alpha12_ri ); \
+					ctype_r* restrict alpha12_i  = &PASTEMAC(ch,imag)( *alpha12_ri ); \
+					ctype_r* restrict beta21_r   = b21_r   + (l  )*rs_b2; \
+					ctype_r* restrict beta21_i   = b21_i   + (l  )*rs_b2; \
+\
+					PASTEMAC(ch,axpyris)( *alpha12_r, \
+					                      *alpha12_i, \
+					                      *beta21_r, \
+					                      *beta21_i, \
+					                      rho11_r, \
+					                      rho11_i ); \
+				} \
+				PASTEMAC(ch,subris)( rho11_r, \
+				                     rho11_i, \
+				                     beta11c_r, \
+				                     beta11c_i ); \
+\
+				/* beta11 = beta11 / alpha11; */ \
+				/* NOTE: When preinversion is enabled, the INVERSE of alpha11
+				   (1.0/alpha11) is stored during packing instead of alpha11 so we
+				   can multiply rather than divide. When preinversion is disabled,
+				   alpha11 is stored and division happens below explicitly. */ \
+				PASTEMAC(ch,diagop)( *alpha11_r, \
+				                     *alpha11_i, \
+				                     beta11c_r, \
+				                     beta11c_i ); \
+\
+				/* Output final result to matrix c. */ \
+				PASTEMAC(ch,sets)( beta11c_r, \
+				                   beta11c_i, *gamma11 ); \
+\
+				/* Store the local values back to b11. */ \
+				for ( dim_t d = 0; d < cs_b; ++d ) \
+					PASTEMAC(ch,copyris)( beta11c_r, \
+					                      beta11c_i, \
+					                      *(beta11_r + d), \
+					                      *(beta11_i + d) ); \
+			} \
+		} \
+	} \
+}
+
+#ifdef BLIS_ENABLE_TRSM_PREINVERSION
+INSERT_GENTFUNCRO( trsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scalris )
+#else
+INSERT_GENTFUNCRO( trsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalris )
+#endif
+
+#undef  GENTFUNCRO
+#define GENTFUNCRO( ctype_r, ctype, chr, ch, opname, arch, suf, trsmname ) \
+\
+void PASTEMAC(chr,opname,arch,suf) \
      ( \
              dim_t      m, \
              dim_t      n, \
@@ -48,7 +438,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
        const void*      bx10, \
              void*      b110, \
              void*      c110, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
+       const auxinfo_t* auxinfo, \
        const cntx_t*    cntx  \
      ) \
 { \
@@ -59,126 +449,57 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	      ctype*      b11         = b110; \
 	      ctype*      c11         = c110; \
 \
-	const num_t       dt          = PASTEMAC(ch,type); \
-	const num_t       dt_r        = PASTEMAC(chr,type); \
-\
-	      gemm_ukr_ft rgemm_ukr   = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
+	const cntl_t*     params     = bli_auxinfo_params( auxinfo ); \
 \
-	      trsm_ukr_ft ctrsm_vukr  = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \
+	const gemm_ukr_ft rgemm_ukr  = bli_trsm_var_cntl_real_gemm_ukr( params ); \
+	const bool        row_pref   = bli_trsm_var_cntl_row_pref( params ); \
+	const void*       params_r   = bli_trsm_var_cntl_real_params( params ); \
 \
-	const bool        col_pref_r  = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
+	const dim_t       mr         = bli_trsm_var_cntl_mr( params ); \
+	const dim_t       nr         = bli_trsm_var_cntl_nr( params ); \
 \
-	const dim_t       mr          = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	const dim_t       nr          = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
+	const dim_t       mr_r       = row_pref ? mr : 2 * mr; \
+	const dim_t       nr_r       = row_pref ? 2 * nr : nr; \
 \
-	const dim_t       mr_r        = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \
-	const dim_t       nr_r        = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \
+	/* Express the k dimension in units of real elements rather than complex
+	   elements, for use by the real-domain gemm microkernel. */ \
+	const dim_t       k_r         = 2 * k; \
 \
-	      ctype       bt[ BLIS_STACK_BUF_MAX_SIZE \
+	ctype             bt[ BLIS_STACK_BUF_MAX_SIZE \
 	                      / sizeof( ctype ) ] \
 	                      __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	      inc_t       rs_bt; \
-	      inc_t       cs_bt; \
+	inc_t             rs_bt       = row_pref ? nr : 1; \
+	inc_t             cs_bt       = row_pref ? 1 : mr; \
 \
-	      inc_t       rs_bt_r; \
-	      inc_t       cs_bt_r; \
+	inc_t             rs_bt_r     = row_pref ? nr_r : 1; \
+	inc_t             cs_bt_r     = row_pref ? 1 : mr_r; \
 \
-	const dim_t       packnr      = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
+	const pack_t      schema_b    = bli_auxinfo_schema_b( auxinfo ); \
 \
-	const pack_t      schema_b    = bli_auxinfo_schema_b( data ); \
+	ctype_r* restrict a1x_r       = ( ctype_r* )a1x; \
 \
-	const dim_t       k2          = 2 * k; \
+	const inc_t       rs_a        = bli_trsm_var_cntl_mr_bcast( params ); \
+	const inc_t       cs_a        = bli_trsm_var_cntl_mr_pack( params ); \
 \
-	const ctype_r*    a1x_r       = ( ctype_r* )a1x; \
+	ctype_r* restrict bx1_r       = ( ctype_r* )bx1; \
 \
-	      ctype_r*    bx1_r       = ( ctype_r* )bx1; \
+	const inc_t       rs_b        = bli_trsm_var_cntl_nr_pack( params ); \
+	const inc_t       cs_b        = bli_trsm_var_cntl_nr_bcast( params ); \
 \
-	const inc_t       rs_b        = packnr; \
-	const inc_t       cs_b        = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBN, cntx ); \
-\
-	const ctype_r*    zero_r      = PASTEMAC(chr,0); \
-	const ctype_r*    minus_one_r = PASTEMAC(chr,m1); \
+	ctype_r* restrict zero_r      = PASTEMAC(chr,0); \
+	ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
 \
 	const ctype_r     alpha_r     = PASTEMAC(ch,real)( *alpha ); \
-	const ctype_r     alpha_i     = PASTEMAC(ch,imag)( *alpha ); \
-\
-	      ctype_r*    b_use; \
-\
-	      inc_t       rs_b_use; \
-	      inc_t       cs_b_use; \
-\
-	      ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                      / sizeof( ctype ) ] \
-	                      __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	/* FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR
-	   instead? */ \
-	const bool        col_pref    = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \
-	const inc_t       rs_ct       = ( col_pref ? 1 : nr ); \
-	const inc_t       cs_ct       = ( col_pref ? mr : 1 ); \
-\
-	const bool        use_ct      = ( m < mr || n < nr ); \
-\
-	      ctype*      c11_use     = c11; \
-\
-	      inc_t       rs_c_use    = rs_c; \
-	      inc_t       cs_c_use    = cs_c; \
-\
-	if ( use_ct ) \
-	{ \
-		c11_use  = ct; \
-		rs_c_use = rs_ct; \
-		cs_c_use = cs_ct; \
-	} \
-\
-\
-	/* Handle alphas with non-zero imaginary components. */ \
-	/* NOTE: This branch should never execute because alphas with
-	   non-zero imaginary components should be applied during
-	   packing, and so the only alphas we should see here are
-	   those exclusively in the real domain, either because the
-	   value originally had no imaginary compoent (e.g. 4.0) or
-	   because a 1.0 was sent in as a placeholder since the alpha
-	   was applied during packing. */ \
-	if ( 0 ) \
-	if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \
-	{ \
-		bli_abort(); \
-\
-		/*
-		ctype_r* restrict one_r = PASTEMAC(chr,1); \
-\
-		const inc_t ld_b = rs_b; \
-\
-		PASTEMAC(ch,scal1ms_mxn)( schema_b, \
-		                          mr, \
-		                          nr, \
-		                          alpha, \
-		                          b11, rs_b, cs_b, ld_b ); \
-\
-		alpha_r = *one_r; \
-		*/ \
-	} \
+	const ctype_r     alpha_i     = PASTEMAC(ch,imag)( *alpha ); ( void ) alpha_i; \
 \
-\
-	{ \
-		/* Set the strides for the temporary bt matrix based on the native
-		   real domain micro-kernel storage preferences. */ \
-		if ( col_pref_r ) { rs_bt   = 1;    cs_bt   = mr;     \
-		                    rs_bt_r = 1;    cs_bt_r = mr_r; } \
-		else              { rs_bt   = nr;   cs_bt   = 1;      \
-		                    rs_bt_r = nr_r; cs_bt_r = 1;    } \
-\
-		b_use    = ( ctype_r* )bt; \
-		rs_b_use = rs_bt_r; \
-		cs_b_use = cs_bt_r; \
-	} \
+	auxinfo_t auxinfo_r = *auxinfo; \
+	bli_auxinfo_set_params( params_r, &auxinfo_r ); \
 \
 \
 	/* Since b11 is stored in the 1e or 1r schema, we cannot update it
 	   directly, and instead must compute the matrix product in a local
 	   temporary microtile and then accumulate it into b11 according to
 	   its schema. */ \
-\
 \
 	/* lower: bt = -1.0 * a10 * b01;
 	   upper: bt = -1.0 * a12 * b21; */ \
@@ -186,22 +507,21 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	( \
 	  mr_r, \
 	  nr_r, \
-	  k2, \
+	  k_r, \
 	  minus_one_r, \
 	  a1x_r, \
 	  bx1_r, \
 	  zero_r, \
-	  b_use, rs_b_use, cs_b_use, \
-	  data, \
+	  bt, rs_bt_r, cs_bt_r, \
+	  &auxinfo_r, \
 	  cntx  \
 	); \
-\
 \
 	if ( bli_is_1e_packed( schema_b ) ) \
 	{ \
-		const inc_t ld_b  =     rs_b; \
-		const inc_t rs_b2 = 2 * rs_b; \
-		const inc_t cs_b2 = 2 * cs_b; \
+		const inc_t       ld_b   =     rs_b; \
+		const inc_t       rs_b2  = 2 * rs_b; \
+		const inc_t       cs_b2  = 2 * cs_b; \
 \
 		ctype_r* restrict b11_ri = ( ctype_r* )b11; \
 		ctype_r* restrict b11_ir = ( ctype_r* )b11 + ld_b; \
@@ -220,7 +540,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			ctype_r* restrict beta11_ir_r = b11_ir + i*rs_b2 + j*cs_b2 + 0*cs_b + d; \
 			ctype_r* restrict beta11_ir_i = b11_ir + i*rs_b2 + j*cs_b2 + 1*cs_b + d; \
 \
-			PASTEMAC3(ch,chr,ch,xpbyris) \
+			PASTEMAC(ch,chr,ch,xpbyris) \
 			( \
 			  *beta11t_r, \
 			  *beta11t_i, \
@@ -236,9 +556,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	} \
 	else /* if ( bli_is_1r_packed( schema_b ) ) */ \
 	{ \
-		const inc_t ld_b  =     rs_b; \
-		const inc_t rs_b2 = 2 * rs_b; \
-		const inc_t cs_b2 =     cs_b; \
+		const inc_t       ld_b  =     rs_b; \
+		const inc_t       rs_b2 = 2 * rs_b; \
+		const inc_t       cs_b2 =     cs_b; \
 \
 		ctype_r* restrict b11_r = ( ctype_r* )b11; \
 		ctype_r* restrict b11_i = ( ctype_r* )b11 + ld_b; \
@@ -255,7 +575,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			ctype_r* restrict beta11_r  = b11_r + i*rs_b2 + j*cs_b2 + d; \
 			ctype_r* restrict beta11_i  = b11_i + i*rs_b2 + j*cs_b2 + d; \
 \
-			PASTEMAC3(ch,chr,ch,xpbyris) \
+			PASTEMAC(ch,chr,ch,xpbyris) \
 			( \
 			  *beta11t_r, \
 			  *beta11t_i, \
@@ -266,29 +586,18 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			); \
 		} \
 	} \
-\
 \
 	/* b11 = inv(a11) * b11;
 	   c11 = b11; */ \
-	ctrsm_vukr \
+	PASTEMAC(chr,trsmname,arch,suf) \
 	( \
-	  a11, \
-	  b11, \
-	  c11_use, rs_c_use, cs_c_use, \
-	  data, \
-	  cntx  \
+	  m, n, \
+	  a11, rs_a, cs_a, \
+	  b11, rs_b, cs_b, \
+	  c11, rs_c, cs_c, \
+	  schema_b  \
 	); \
-\
-	if ( use_ct ) \
-	{ \
-		PASTEMAC(ch,copys_mxn) \
-		( \
-		  m, n, \
-		  ct,  rs_ct, cs_ct, \
-		  c11, rs_c,  cs_c  \
-		); \
-	} \
 }
 
-INSERT_GENTFUNCCO( gemmtrsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR )
-INSERT_GENTFUNCCO( gemmtrsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR )
+INSERT_GENTFUNCRO( gemmtrsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, trsm1m_l )
+INSERT_GENTFUNCRO( gemmtrsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, trsm1m_u )
diff --git a/ref_kernels/ind/bli_trsm1m_ref.c b/ref_kernels/ind/bli_trsm1m_ref.c
deleted file mode 100644
index 05e193a09..000000000
--- a/ref_kernels/ind/bli_trsm1m_ref.c
+++ /dev/null
@@ -1,471 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-
-#undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf, diagop ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       const void*      a0, \
-             void*      b0, \
-             void*      c0, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
-       const cntx_t*    cntx  \
-     ) \
-{ \
-	const ctype*      a      = a0; \
-	      ctype*      b      = b0; \
-	      ctype*      c      = c0; \
-\
-	const num_t       dt     = PASTEMAC(ch,type); \
-	const num_t       dt_r   = PASTEMAC(chr,type); \
-\
-	const dim_t       mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	const dim_t       nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
-\
-	const inc_t       packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
-	const inc_t       packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
-\
-	const dim_t       m      = mr; \
-	const dim_t       n      = nr; \
-\
-	const inc_t       rs_a   = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBM, cntx ); \
-	const inc_t       cs_a   = packmr; \
-\
-	const inc_t       rs_b   = packnr; \
-	const inc_t       cs_b   = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBN, cntx ); \
-\
-	const inc_t       ld_a  = cs_a; \
-	const inc_t       ld_b  = rs_b; \
-\
-	const pack_t      schema_b = bli_auxinfo_schema_b( data ); \
-\
-	if ( bli_is_1e_packed( schema_b ) ) \
-	{ \
-		const inc_t rs_a2 = 1 * rs_a; \
-		const inc_t cs_a2 = 2 * cs_a; \
-		const inc_t rs_b2 = 2 * rs_b; \
-		const inc_t cs_b2 = 2 * cs_b; \
-\
-		const ctype_r* restrict a_r  = ( ctype_r* )a; \
-		const ctype_r* restrict a_i  = ( ctype_r* )a + ld_a; \
-\
-		      ctype_r* restrict b_ri = ( ctype_r* )b; \
-		      ctype_r* restrict b_ir = ( ctype_r* )b + ld_b; \
-\
-		for ( dim_t iter = 0; iter < m; ++iter ) \
-		{ \
-			dim_t i         = iter; \
-			dim_t n_behind  = i; \
-\
-			const ctype_r* restrict alpha11_r = a_r  + (i  )*rs_a2 + (i  )*cs_a2; \
-			const ctype_r* restrict alpha11_i = a_i  + (i  )*rs_a2 + (i  )*cs_a2; \
-			const ctype_r* restrict a10t_r    = a_r  + (i  )*rs_a2 + (0  )*cs_a2; \
-			const ctype_r* restrict a10t_i    = a_i  + (i  )*rs_a2 + (0  )*cs_a2; \
-			      ctype_r* restrict b1_ri     = b_ri + (i  )*rs_b2 + (0  )*cs_b2; \
-			      ctype_r* restrict b1_ir     = b_ir + (i  )*rs_b2 + (0  )*cs_b2; \
-			      ctype_r* restrict B0_ri     = b_ri + (0  )*rs_b2 + (0  )*cs_b2; \
-\
-			/* b1 = b1 - a10t * B0; */ \
-			/* b1 = b1 / alpha11; */ \
-			for ( dim_t j = 0; j < n; ++j ) \
-			{ \
-				ctype_r* restrict beta11_ri_r = b1_ri + (0  )*rs_b2 + (j  )*cs_b2 + 0*cs_b; \
-				ctype_r* restrict beta11_ri_i = b1_ri + (0  )*rs_b2 + (j  )*cs_b2 + 1*cs_b; \
-				ctype_r* restrict beta11_ir_r = b1_ir + (0  )*rs_b2 + (j  )*cs_b2 + 0*cs_b; \
-				ctype_r* restrict beta11_ir_i = b1_ir + (0  )*rs_b2 + (j  )*cs_b2 + 1*cs_b; \
-				ctype_r* restrict b01_ri      = B0_ri + (0  )*rs_b2 + (j  )*cs_b2; \
-				ctype*   restrict gamma11     = c     + (i  )*rs_c  + (j  )*cs_c; \
-				ctype_r           beta11c_r   = *beta11_ri_r; \
-				ctype_r           beta11c_i   = *beta11_ri_i; \
-				ctype_r           rho11_r; \
-				ctype_r           rho11_i; \
-\
-				/* beta11 = beta11 - a10t * b01; */ \
-				PASTEMAC(ch,set0ris)( rho11_r, \
-				                      rho11_i ); \
-				for ( dim_t l = 0; l < n_behind; ++l ) \
-				{ \
-					const ctype_r* restrict alpha10_r = a10t_r  + (l  )*cs_a2; \
-					const ctype_r* restrict alpha10_i = a10t_i  + (l  )*cs_a2; \
-					const ctype_r* restrict beta01_r  = b01_ri  + (l  )*rs_b2 + 0*cs_b; \
-					const ctype_r* restrict beta01_i  = b01_ri  + (l  )*rs_b2 + 1*cs_b; \
-\
-					PASTEMAC(ch,axpyris)( *alpha10_r, \
-					                      *alpha10_i, \
-					                      *beta01_r, \
-					                      *beta01_i, \
-					                      rho11_r, \
-					                      rho11_i ); \
-				} \
-				PASTEMAC(ch,subris)( rho11_r, \
-				                     rho11_i, \
-				                     beta11c_r, \
-				                     beta11c_i ); \
-\
-				/* beta11 = beta11 / alpha11; */ \
-				/* NOTE: When preinversion is enabled, the INVERSE of alpha11
-				   (1.0/alpha11) is stored during packing instead alpha11 so we
-				   can multiply rather than divide. When preinversion is disabled,
-				   alpha11 is stored and division happens below explicitly. */ \
-				PASTEMAC(ch,diagop)( *alpha11_r, \
-				                     *alpha11_i, \
-				                     beta11c_r, \
-				                     beta11c_i ); \
-\
-				/* Output final result to matrix c. */ \
-				PASTEMAC(ch,sets)(  beta11c_r, beta11c_i, *gamma11 ); \
-\
-				/* Store the local values back to b11. */ \
-				for ( dim_t d = 0; d < cs_b; ++d ) \
-				{ \
-					PASTEMAC(ch,copyris)(  beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \
-					PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \
-                } \
-			} \
-		} \
-	} \
-	else /* ( bli_is_1r_packed( schema_b ) ) */ \
-	{ \
-		const inc_t rs_b2 = 2 * rs_b; \
-		const inc_t cs_b2 = 1 * cs_b; \
-\
-		const ctype*   restrict a_ri  = ( ctype*   )a; \
-		    /*ctype*   restrict a_ir  = ( ctype*   )a + ld_a/2;*/ \
-\
-		      ctype_r* restrict b_r   = ( ctype_r* )b; \
-		      ctype_r* restrict b_i   = ( ctype_r* )b + ld_b; \
-\
-		for ( dim_t iter = 0; iter < m; ++iter ) \
-		{ \
-			dim_t i         = iter; \
-			dim_t n_behind  = i; \
-\
-			const ctype*   restrict alpha11_ri = a_ri + (i  )*rs_a  + (i  )*cs_a; \
-			const ctype_r* restrict alpha11_r  = &PASTEMAC(ch,real)( *alpha11_ri ); \
-			const ctype_r* restrict alpha11_i  = &PASTEMAC(ch,imag)( *alpha11_ri ); \
-			const ctype*   restrict a10t_ri    = a_ri + (i  )*rs_a  + (0  )*cs_a; \
-			      ctype_r* restrict b1_r       = b_r  + (i  )*rs_b2 + (0  )*cs_b2; \
-			      ctype_r* restrict b1_i       = b_i  + (i  )*rs_b2 + (0  )*cs_b2; \
-			      ctype_r* restrict B0_r       = b_r  + (0  )*rs_b2 + (0  )*cs_b2; \
-			      ctype_r* restrict B0_i       = b_i  + (0  )*rs_b2 + (0  )*cs_b2; \
-\
-			/* b1 = b1 - a10t * B0; */ \
-			/* b1 = b1 / alpha11; */ \
-			for ( dim_t j = 0; j < n; ++j ) \
-			{ \
-				ctype_r* restrict beta11_r  = b1_r + (0  )*rs_b2 + (j  )*cs_b2; \
-				ctype_r* restrict beta11_i  = b1_i + (0  )*rs_b2 + (j  )*cs_b2; \
-				ctype_r* restrict b01_r     = B0_r + (0  )*rs_b2 + (j  )*cs_b2; \
-				ctype_r* restrict b01_i     = B0_i + (0  )*rs_b2 + (j  )*cs_b2; \
-				ctype*   restrict gamma11   = c    + (i  )*rs_c  + (j  )*cs_c; \
-				ctype_r           beta11c_r = *beta11_r; \
-				ctype_r           beta11c_i = *beta11_i; \
-				ctype_r           rho11_r; \
-				ctype_r           rho11_i; \
-\
-				/* beta11 = beta11 - a10t * b01; */ \
-				PASTEMAC(ch,set0ris)( rho11_r, \
-				                      rho11_i ); \
-				for ( dim_t l = 0; l < n_behind; ++l ) \
-				{ \
-					const ctype*   restrict alpha10_ri = a10t_ri + (l  )*cs_a; \
-					const ctype_r* restrict alpha10_r  = &PASTEMAC(ch,real)( *alpha10_ri ); \
-					const ctype_r* restrict alpha10_i  = &PASTEMAC(ch,imag)( *alpha10_ri ); \
-					      ctype_r* restrict beta01_r   = b01_r   + (l  )*rs_b2; \
-					      ctype_r* restrict beta01_i   = b01_i   + (l  )*rs_b2; \
-\
-					PASTEMAC(ch,axpyris)( *alpha10_r, \
-					                      *alpha10_i, \
-					                      *beta01_r, \
-					                      *beta01_i, \
-					                      rho11_r, \
-					                      rho11_i ); \
-				} \
-				PASTEMAC(ch,subris)( rho11_r, \
-				                     rho11_i, \
-				                     beta11c_r, \
-				                     beta11c_i ); \
-\
-				/* beta11 = beta11 / alpha11; */ \
-				/* NOTE: When preinversion is enabled, the INVERSE of alpha11
-				   (1.0/alpha11) is stored during packing instead alpha11 so we
-				   can multiply rather than divide. When preinversion is disabled,
-				   alpha11 is stored and division happens below explicitly. */ \
-				PASTEMAC(ch,diagop)( *alpha11_r, \
-				                     *alpha11_i, \
-				                     beta11c_r, \
-				                     beta11c_i ); \
-\
-				/* Output final result to matrix c. */ \
-				PASTEMAC(ch,sets)( beta11c_r, \
-				                   beta11c_i, *gamma11 ); \
-\
-				/* Store the local values back to b11. */ \
-				for ( dim_t d = 0; d < cs_b; ++d ) \
-					PASTEMAC(ch,copyris)( beta11c_r, \
-					                      beta11c_i, \
-					                      *(beta11_r + d), \
-					                      *(beta11_i + d) ); \
-			} \
-		} \
-	} \
-}
-
-#ifdef BLIS_ENABLE_TRSM_PREINVERSION
-INSERT_GENTFUNCCO( trsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scalris )
-#else
-INSERT_GENTFUNCCO( trsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalris )
-#endif
-
-
-#undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf, diagop ) \
-\
-void PASTEMAC3(ch,opname,arch,suf) \
-     ( \
-       const void*      a0, \
-             void*      b0, \
-             void*      c0, inc_t rs_c, inc_t cs_c, \
-             auxinfo_t* data, \
-       const cntx_t*    cntx  \
-     ) \
-{ \
-	const ctype*      a      = a0; \
-	      ctype*      b      = b0; \
-	      ctype*      c      = c0; \
-\
-	const num_t       dt     = PASTEMAC(ch,type); \
-	const num_t       dt_r   = PASTEMAC(chr,type); \
-\
-	const dim_t       mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	const dim_t       nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
-\
-	const inc_t       packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
-	const inc_t       packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
-\
-	const dim_t       m      = mr; \
-	const dim_t       n      = nr; \
-\
-	const inc_t       rs_a   = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBM, cntx ); \
-	const inc_t       cs_a   = packmr; \
-\
-	const inc_t       rs_b   = packnr; \
-	const inc_t       cs_b   = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBN, cntx ); \
-\
-	const inc_t       ld_a  = cs_a; \
-	const inc_t       ld_b  = rs_b; \
-\
-	const pack_t      schema_b = bli_auxinfo_schema_b( data ); \
-\
-	if ( bli_is_1e_packed( schema_b ) ) \
-	{ \
-		const inc_t rs_a2 = 1 * rs_a; \
-		const inc_t cs_a2 = 2 * cs_a; \
-		const inc_t rs_b2 = 2 * rs_b; \
-		const inc_t cs_b2 = 2 * cs_b; \
-\
-		const ctype_r* restrict a_r  = ( ctype_r* )a; \
-		const ctype_r* restrict a_i  = ( ctype_r* )a + ld_a; \
-\
-		      ctype_r* restrict b_ri = ( ctype_r* )b; \
-		      ctype_r* restrict b_ir = ( ctype_r* )b + ld_b; \
-\
-		for ( dim_t iter = 0; iter < m; ++iter ) \
-		{ \
-			dim_t i         = m - iter - 1; \
-			dim_t n_behind  = iter; \
-\
-			const ctype_r* restrict alpha11_r  = a_r  + (i  )*rs_a2 + (i  )*cs_a2; \
-			const ctype_r* restrict alpha11_i  = a_i  + (i  )*rs_a2 + (i  )*cs_a2; \
-			const ctype_r* restrict a12t_r     = a_r  + (i  )*rs_a2 + (i+1)*cs_a2; \
-			const ctype_r* restrict a12t_i     = a_i  + (i  )*rs_a2 + (i+1)*cs_a2; \
-			      ctype_r* restrict b1_ri      = b_ri + (i  )*rs_b2 + (0  )*cs_b2; \
-			      ctype_r* restrict b1_ir      = b_ir + (i  )*rs_b2 + (0  )*cs_b2; \
-			      ctype_r* restrict B2_ri      = b_ri + (i+1)*rs_b2 + (0  )*cs_b2; \
-\
-			/* b1 = b1 - a12t * B2; */ \
-			/* b1 = b1 / alpha11; */ \
-			for ( dim_t j = 0; j < n; ++j ) \
-			{ \
-				ctype_r* restrict beta11_ri_r = b1_ri + (0  )*rs_b2 + (j  )*cs_b2 + 0*cs_b; \
-				ctype_r* restrict beta11_ri_i = b1_ri + (0  )*rs_b2 + (j  )*cs_b2 + 1*cs_b; \
-				ctype_r* restrict beta11_ir_r = b1_ir + (0  )*rs_b2 + (j  )*cs_b2 + 0*cs_b; \
-				ctype_r* restrict beta11_ir_i = b1_ir + (0  )*rs_b2 + (j  )*cs_b2 + 1*cs_b; \
-				ctype_r* restrict b21_ri      = B2_ri + (0  )*rs_b2 + (j  )*cs_b2; \
-				ctype*   restrict gamma11     = c     + (i  )*rs_c + (j  )*cs_c; \
-				ctype_r           beta11c_r   = *beta11_ri_r; \
-				ctype_r           beta11c_i   = *beta11_ri_i; \
-				ctype_r           rho11_r; \
-				ctype_r           rho11_i; \
-\
-				/* beta11 = beta11 - a10t * b01; */ \
-				PASTEMAC(ch,set0ris)( rho11_r, \
-				                      rho11_i ); \
-				for ( dim_t l = 0; l < n_behind; ++l ) \
-				{ \
-					const ctype_r* restrict alpha12_r = a12t_r + (l  )*cs_a2; \
-					const ctype_r* restrict alpha12_i = a12t_i + (l  )*cs_a2; \
-					      ctype_r* restrict beta21_r  = b21_ri + (l  )*rs_b2 + 0*cs_b; \
-					      ctype_r* restrict beta21_i  = b21_ri + (l  )*rs_b2 + 1*cs_b; \
-\
-					PASTEMAC(ch,axpyris)( *alpha12_r, \
-					                      *alpha12_i, \
-					                      *beta21_r, \
-					                      *beta21_i, \
-					                      rho11_r, \
-					                      rho11_i ); \
-				} \
-				PASTEMAC(ch,subris)( rho11_r, \
-				                     rho11_i, \
-				                     beta11c_r, \
-				                     beta11c_i ); \
-\
-				/* beta11 = beta11 / alpha11; */ \
-				/* NOTE: When preinversion is enabled, the INVERSE of alpha11
-				   (1.0/alpha11) is stored during packing instead alpha11 so we
-				   can multiply rather than divide. When preinversion is disabled,
-				   alpha11 is stored and division happens below explicitly. */ \
-				PASTEMAC(ch,diagop)( *alpha11_r, \
-				                     *alpha11_i, \
-				                     beta11c_r, \
-				                     beta11c_i ); \
-\
-				/* Output final result to matrix c. */ \
-				PASTEMAC(ch,sets)(  beta11c_r, beta11c_i, *gamma11 ); \
-\
-				/* Store the local values back to b11. */ \
-				for ( dim_t d = 0; d < cs_b; ++d ) \
-				{ \
-					PASTEMAC(ch,copyris)(  beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \
-					PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \
-                } \
-			} \
-		} \
-	} \
-	else /* if ( bli_is_1r_packed( schema_b ) ) */ \
-	{ \
-		const inc_t rs_b2 = 2 * rs_b; \
-		const inc_t cs_b2 = 1 * cs_b; \
-\
-		const ctype*   restrict a_ri  = ( ctype*   )a; \
-		    /*ctype*   restrict a_ir  = ( ctype*   )a + ld_a/2;*/ \
-\
-		      ctype_r* restrict b_r   = ( ctype_r* )b; \
-		      ctype_r* restrict b_i   = ( ctype_r* )b + ld_b; \
-\
-		for ( dim_t iter = 0; iter < m; ++iter ) \
-		{ \
-			dim_t i         = m - iter - 1; \
-			dim_t n_behind  = iter; \
-\
-			const ctype*   restrict alpha11_ri = a_ri + (i  )*rs_a  + (i  )*cs_a; \
-			const ctype_r* restrict alpha11_r  = &PASTEMAC(ch,real)( *alpha11_ri ); \
-			const ctype_r* restrict alpha11_i  = &PASTEMAC(ch,imag)( *alpha11_ri ); \
-			const ctype*   restrict a12t_ri    = a_ri + (i  )*rs_a  + (i+1)*cs_a; \
-			      ctype_r* restrict b1_r       = b_r  + (i  )*rs_b2 + (0  )*cs_b2; \
-			      ctype_r* restrict b1_i       = b_i  + (i  )*rs_b2 + (0  )*cs_b2; \
-			      ctype_r* restrict B2_r       = b_r  + (i+1)*rs_b2 + (0  )*cs_b2; \
-			      ctype_r* restrict B2_i       = b_i  + (i+1)*rs_b2 + (0  )*cs_b2; \
-\
-			/* b1 = b1 - a12t * B2; */ \
-			/* b1 = b1 / alpha11; */ \
-			for ( dim_t j = 0; j < n; ++j ) \
-			{ \
-				ctype_r* restrict beta11_r  = b1_r + (0  )*rs_b2 + (j  )*cs_b2; \
-				ctype_r* restrict beta11_i  = b1_i + (0  )*rs_b2 + (j  )*cs_b2; \
-				ctype_r* restrict b21_r     = B2_r + (0  )*rs_b2 + (j  )*cs_b2; \
-				ctype_r* restrict b21_i     = B2_i + (0  )*rs_b2 + (j  )*cs_b2; \
-				ctype*   restrict gamma11   = c    + (i  )*rs_c  + (j  )*cs_c; \
-				ctype_r           beta11c_r = *beta11_r; \
-				ctype_r           beta11c_i = *beta11_i; \
-				ctype_r           rho11_r; \
-				ctype_r           rho11_i; \
-\
-				/* beta11 = beta11 - a10t * b01; */ \
-				PASTEMAC(ch,set0ris)( rho11_r, \
-				                      rho11_i ); \
-				for ( dim_t l = 0; l < n_behind; ++l ) \
-				{ \
-					const ctype*   restrict alpha12_ri = a12t_ri + (l  )*cs_a; \
-					const ctype_r* restrict alpha12_r  = &PASTEMAC(ch,real)( *alpha12_ri ); \
-					const ctype_r* restrict alpha12_i  = &PASTEMAC(ch,imag)( *alpha12_ri ); \
-					      ctype_r* restrict beta21_r   = b21_r   + (l  )*rs_b2; \
-					      ctype_r* restrict beta21_i   = b21_i   + (l  )*rs_b2; \
-\
-					PASTEMAC(ch,axpyris)( *alpha12_r, \
-					                      *alpha12_i, \
-					                      *beta21_r, \
-					                      *beta21_i, \
-					                      rho11_r, \
-					                      rho11_i ); \
-				} \
-				PASTEMAC(ch,subris)( rho11_r, \
-				                     rho11_i, \
-				                     beta11c_r, \
-				                     beta11c_i ); \
-\
-				/* beta11 = beta11 / alpha11; */ \
-				/* NOTE: When preinversion is enabled, the INVERSE of alpha11
-				   (1.0/alpha11) is stored during packing instead alpha11 so we
-				   can multiply rather than divide. When preinversion is disabled,
-				   alpha11 is stored and division happens below explicitly. */ \
-				PASTEMAC(ch,diagop)( *alpha11_r, \
-				                     *alpha11_i, \
-				                     beta11c_r, \
-				                     beta11c_i ); \
-\
-				/* Output final result to matrix c. */ \
-				PASTEMAC(ch,sets)( beta11c_r, \
-				                   beta11c_i, *gamma11 ); \
-\
-				/* Store the local values back to b11. */ \
-				for ( dim_t d = 0; d < cs_b; ++d ) \
-					PASTEMAC(ch,copyris)( beta11c_r, \
-					                      beta11c_i, \
-					                      *(beta11_r + d), \
-					                      *(beta11_i + d) ); \
-			} \
-		} \
-	} \
-}
-
-#ifdef BLIS_ENABLE_TRSM_PREINVERSION
-INSERT_GENTFUNCCO( trsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scalris )
-#else
-INSERT_GENTFUNCCO( trsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalris )
-#endif
diff --git a/sandbox/gemmlike/bli_gemm_ex.c b/sandbox/gemmlike/bli_gemm_ex.c
index f8e8f86f7..fbb2c5548 100644
--- a/sandbox/gemmlike/bli_gemm_ex.c
+++ b/sandbox/gemmlike/bli_gemm_ex.c
@@ -70,27 +70,14 @@ void bli_gemm_ex
 		  ( obj_t* )alpha, ( obj_t* )a, ( obj_t* )b, ( obj_t* )beta, ( obj_t* )c,
 		  ( cntx_t* )cntx, ( rntm_t* )rntm
 		);
-		return;
 	}
-
-	// Initialize a local runtime with global settings if necessary. Note
-	// that in the case that a runtime is passed in, we make a local copy.
-	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
-
-	// Obtain a valid (native) context from the gks if necessary.
-	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
-
-	// Check the operands.
-	if ( bli_error_checking_is_enabled() )
-		bli_gemm_check( alpha, a, b, beta, c, cntx );
-
-	// Invoke the operation's front end.
-	bli_gemm_front
-	(
-	  ( obj_t* )alpha, ( obj_t* )a, ( obj_t* )b, ( obj_t* )beta, ( obj_t* )c,
-	  ( cntx_t* )cntx, ( rntm_t* )rntm
-	);
+	else
+	{
+		bli_gemm_def_ex
+		(
+		  ( obj_t* )alpha, ( obj_t* )a, ( obj_t* )b, ( obj_t* )beta, ( obj_t* )c,
+		  ( cntx_t* )cntx, ( rntm_t* )rntm
+		);
+	}
 }
 
diff --git a/sandbox/gemmlike/bli_sandbox.h b/sandbox/gemmlike/bli_sandbox.h
index 6f33da602..190409042 100644
--- a/sandbox/gemmlike/bli_sandbox.h
+++ b/sandbox/gemmlike/bli_sandbox.h
@@ -47,11 +47,8 @@
 #include "bls_gemm_check.h"
 #include "bls_gemm_var.h"
 
-#include "bls_l3_packm_a.h"
-#include "bls_l3_packm_b.h"
+#include "bls_l3_packm.h"
 #include "bls_l3_packm_var.h"
 
-#include "bls_packm_cxk.h"
-
 
 #endif
diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c
index 2b4bffde1..5ded92921 100644
--- a/sandbox/gemmlike/bls_gemm.c
+++ b/sandbox/gemmlike/bls_gemm.c
@@ -102,10 +102,6 @@ void bls_gemm_ex
 
 	// -- bli_gemm_front() -----------------------------------------------------
 
-	obj_t a_local;
-	obj_t b_local;
-	obj_t c_local;
-
 	// If C has a zero dimension, return early.
 	if ( bli_obj_has_zero_dim( c ) )
 	{
@@ -123,31 +119,24 @@ void bls_gemm_ex
 	}
 
 	// Alias A, B, and C in case we need to apply transformations.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( b, &b_local );
-	bli_obj_alias_to( c, &c_local );
-
-	// Induce a transposition of A if it has its transposition property set.
-	// Then clear the transposition bit in the object.
-	if ( bli_obj_has_trans( &a_local ) )
-	{
-		bli_obj_induce_trans( &a_local );
-		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
-	}
+	obj_t a_local;
+	obj_t b_local;
+	obj_t c_local;
+	bli_obj_alias_submatrix( a, &a_local );
+	bli_obj_alias_submatrix( b, &b_local );
+	bli_obj_alias_submatrix( c, &c_local );
 
-	// Induce a transposition of B if it has its transposition property set.
-	// Then clear the transposition bit in the object.
-	if ( bli_obj_has_trans( &b_local ) )
-	{
-		bli_obj_induce_trans( &b_local );
-		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &b_local );
-	}
+	// Typecast alpha and beta to the datatype of C.
+	obj_t alpha_cast, beta_cast;
+	num_t dt = bli_obj_dt( c );
+	bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, alpha, &alpha_cast );
+	bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, beta, &beta_cast );
 
 	// An optimization: If C is stored by rows and the micro-kernel prefers
 	// contiguous columns, or if C is stored by columns and the micro-kernel
 	// prefers contiguous rows, transpose the entire operation to allow the
 	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
+	if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
 	{
 		bli_obj_swap( &a_local, &b_local );
 
@@ -159,10 +148,8 @@ void bls_gemm_ex
 	// Parse and interpret the contents of the rntm_t object to properly
 	// set the ways of parallelism for each loop, and then make any
 	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
+	bli_rntm_factorize
 	(
-	  BLIS_GEMM,
-	  BLIS_LEFT, // ignored for gemm/hemm/symm
 	  bli_obj_length( &c_local ),
 	  bli_obj_width( &c_local ),
 	  bli_obj_width( &a_local ),
@@ -176,10 +163,10 @@ void bls_gemm_ex
 	(
 	  bls_gemm_int,
 	  BLIS_GEMM, // operation family id
-	  alpha,
+	  &alpha_cast,
 	  &a_local,
 	  &b_local,
-	  beta,
+	  &beta_cast,
 	  &c_local,
 	  cntx,
 	  &rntm_l
@@ -227,7 +214,7 @@ err_t bls_gemm_int
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
-void PASTECH2(bls_,ch,opname) \
+void PASTECH(bls_,ch,opname) \
      ( \
        trans_t transa, \
        trans_t transb, \
diff --git a/sandbox/gemmlike/bls_gemm.h b/sandbox/gemmlike/bls_gemm.h
index 3dadacfd0..7c1adfd20 100644
--- a/sandbox/gemmlike/bls_gemm.h
+++ b/sandbox/gemmlike/bls_gemm.h
@@ -79,7 +79,7 @@ err_t bls_gemm_int
 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
-void PASTECH2(bls_,ch,opname) \
+void PASTECH(bls_,ch,opname) \
      ( \
        trans_t transa, \
        trans_t transb, \
diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c
index 785f0836e..3121cb597 100644
--- a/sandbox/gemmlike/bls_gemm_bp_var1.c
+++ b/sandbox/gemmlike/bls_gemm_bp_var1.c
@@ -34,32 +34,10 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T gemm_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       conj_t           conja,
-       conj_t           conjb,
-       dim_t            m,
-       dim_t            n,
-       dim_t            k,
-       void*   restrict alpha,
-       void*   restrict a, inc_t rs_a, inc_t cs_a,
-       void*   restrict b, inc_t rs_b, inc_t cs_b,
-       void*   restrict beta,
-       void*   restrict c, inc_t rs_c, inc_t cs_c,
-       cntx_t* restrict cntx,
-       thrinfo_t* restrict thread
-     );
-
 //
 // -- gemm-like block-panel algorithm (object interface) -----------------------
 //
 
-// Define a function pointer array named ftypes and initialize its contents with
-// the addresses of the typed functions defined below, bls_?gemm_bp_var1().
-static FUNCPTR_T GENARRAY_PREF(ftypes,bls_,gemm_bp_var1);
-
 void bls_gemm_bp_var1
      (
        const obj_t*     alpha,
@@ -71,326 +49,273 @@ void bls_gemm_bp_var1
              thrinfo_t* thread
      )
 {
-	const num_t    dt        = bli_obj_dt( c );
-
-	const conj_t   conja     = bli_obj_conj_status( a );
-	const conj_t   conjb     = bli_obj_conj_status( b );
-
-	const dim_t    m         = bli_obj_length( c );
-	const dim_t    n         = bli_obj_width( c );
-	const dim_t    k         = bli_obj_width( a );
-
-	void* restrict buf_a     = bli_obj_buffer_at_off( a );
-	const inc_t    rs_a      = bli_obj_row_stride( a );
-	const inc_t    cs_a      = bli_obj_col_stride( a );
-
-	void* restrict buf_b     = bli_obj_buffer_at_off( b );
-	const inc_t    rs_b      = bli_obj_row_stride( b );
-	const inc_t    cs_b      = bli_obj_col_stride( b );
-
-	void* restrict buf_c     = bli_obj_buffer_at_off( c );
-	const inc_t    rs_c      = bli_obj_row_stride( c );
-	const inc_t    cs_c      = bli_obj_col_stride( c );
-
-	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
-	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
-
-	// Index into the function pointer array to extract the correct
-	// typed function pointer based on the chosen datatype.
-	FUNCPTR_T f = ftypes[dt];
-
-	// Invoke the function.
-	f
-	(
-	  conja,
-	  conjb,
-	  m,
-	  n,
-	  k,
-	  buf_alpha,
-	  buf_a, rs_a, cs_a,
-	  buf_b, rs_b, cs_b,
-	  buf_beta,
-	  buf_c, rs_c, cs_c,
-	  ( cntx_t* )cntx,
-	  thread
-	);
-}
+	const num_t  dt        = bli_obj_dt( c );
+	const dim_t  dt_size   = bli_dt_size( dt );
 
-//
-// -- gemm-like block-panel algorithm (typed interface) ------------------------
-//
+	const conj_t conja     = bli_obj_conj_status( a );
+	const conj_t conjb     = bli_obj_conj_status( b );
+
+	const dim_t  m         = bli_obj_length( c );
+	const dim_t  n         = bli_obj_width( c );
+	const dim_t  k         = bli_obj_width( a );
+
+	const char*  a_00      = bli_obj_buffer_at_off( a );
+	const inc_t  rs_a      = bli_obj_row_stride( a );
+	const inc_t  cs_a      = bli_obj_col_stride( a );
+
+	const char*  b_00      = bli_obj_buffer_at_off( b );
+	const inc_t  rs_b      = bli_obj_row_stride( b );
+	const inc_t  cs_b      = bli_obj_col_stride( b );
+
+	      char*  c_00      = bli_obj_buffer_at_off( c );
+	const inc_t  rs_c      = bli_obj_row_stride( c );
+	const inc_t  cs_c      = bli_obj_col_stride( c );
+
+	const char*  alpha_buf = bli_obj_buffer_for_1x1( dt, alpha );
+	const char*  beta_buf  = bli_obj_buffer_for_1x1( dt, beta );
+	const char*  one       = bli_obj_buffer_for_1x1( dt, &BLIS_ONE );
+
+	/* Query the context for various blocksizes. */
+	const dim_t  NR        = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
+	const dim_t  MR        = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
+	const dim_t  NC        = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx );
+	const dim_t  MC        = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx );
+	const dim_t  KC        = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTECH2(bls_,ch,varname) \
-     ( \
-       conj_t           conja, \
-       conj_t           conjb, \
-       dim_t            m, \
-       dim_t            n, \
-       dim_t            k, \
-       void*   restrict alpha, \
-       void*   restrict a, inc_t rs_a, inc_t cs_a, \
-       void*   restrict b, inc_t rs_b, inc_t cs_b, \
-       void*   restrict beta, \
-       void*   restrict c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
-     ) \
-{ \
-	const num_t dt = PASTEMAC(ch,type); \
-\
-	/* Query the context for various blocksizes. */ \
-	const dim_t NR  = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
-	const dim_t MR  = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-	const dim_t NC  = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
-	const dim_t MC  = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
-	const dim_t KC  = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
-\
 	/* Query the context for the microkernel address and cast it to its
-	   function pointer type. */ \
-	gemm_ukr_ft gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Compute partitioning step values for each matrix of each loop. */ \
-	const inc_t jcstep_c = cs_c; \
-	const inc_t jcstep_b = cs_b; \
-\
-	const inc_t pcstep_a = cs_a; \
-	const inc_t pcstep_b = rs_b; \
-\
-	const inc_t icstep_c = rs_c; \
-	const inc_t icstep_a = rs_a; \
-\
-	const inc_t jrstep_c = cs_c * NR; \
-\
-	const inc_t irstep_c = rs_c * MR; \
-\
-	ctype* restrict a_00       = a; \
-	ctype* restrict b_00       = b; \
-	ctype* restrict c_00       = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-\
-	/* Make local copies of the scalars to prevent any unnecessary sharing of
-	   cache lines between the cores' caches. */ \
-	ctype           alpha_local = *alpha_cast; \
-	ctype           beta_local  = *beta_cast; \
-	ctype           one_local   = *PASTEMAC(ch,1); \
-\
-	auxinfo_t       aux; \
-\
-	thrinfo_t* restrict thread_jc = bli_thrinfo_sub_node( thread ); \
-	thrinfo_t* restrict thread_pc = bli_thrinfo_sub_node( thread_jc ); \
-	thrinfo_t* restrict thread_pb = bli_thrinfo_sub_node( thread_pc ); \
-	thrinfo_t* restrict thread_ic = bli_thrinfo_sub_node( thread_pb ); \
-	thrinfo_t* restrict thread_pa = bli_thrinfo_sub_node( thread_ic ); \
-	thrinfo_t* restrict thread_jr = bli_thrinfo_sub_node( thread_pa ); \
-	thrinfo_t* restrict thread_ir = bli_thrinfo_sub_node( thread_jr ); \
-\
-	/* Compute the JC loop thread range for the current thread. */ \
-	dim_t jc_start, jc_end; \
-	bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
-	const dim_t n_local = jc_end - jc_start; \
-\
-	/* Compute number of primary and leftover components of the JC loop. */ \
-	/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
-	const dim_t jc_left =   n_local % NC; \
-\
-	/* Loop over the n dimension (NC rows/columns at a time). */ \
-	for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
-	{ \
-		/* Calculate the thread's current JC block dimension. */ \
-		const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
-\
-		ctype* restrict b_jc = b_00 + jj * jcstep_b; \
-		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
-\
-		/* Compute the PC loop thread range for the current thread. */ \
-		const dim_t pc_start = 0, pc_end = k; \
-		const dim_t k_local = k; \
-\
-		/* Compute number of primary and leftover components of the PC loop. */ \
-		/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
-		const dim_t pc_left =   k_local % KC; \
-\
-		/* Loop over the k dimension (KC rows/columns at a time). */ \
-		for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
-		{ \
-			/* Calculate the thread's current PC block dimension. */ \
-			const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
-\
-			ctype* restrict a_pc = a_00 + pp * pcstep_a; \
-			ctype* restrict b_pc = b_jc + pp * pcstep_b; \
-\
-			/* Only apply beta to the first iteration of the pc loop. */ \
-			ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
-\
-			ctype* b_use; \
-			inc_t  rs_b_use, cs_b_use, ps_b_use; \
-\
+	   function pointer type. */
+	gemm_ukr_ft  gemm_ukr  = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx );
+
+	/* Compute partitioning step values for each matrix of each loop. */
+	const inc_t jcstep_c = cs_c * dt_size;
+	const inc_t jcstep_b = cs_b * dt_size;
+
+	const inc_t pcstep_a = cs_a * dt_size;
+	const inc_t pcstep_b = rs_b * dt_size;
+
+	const inc_t icstep_c = rs_c * dt_size;
+	const inc_t icstep_a = rs_a * dt_size;
+
+	const inc_t jrstep_c = cs_c * NR * dt_size;
+	const inc_t irstep_c = rs_c * MR * dt_size;
+
+	thrinfo_t* thread_jc = bli_thrinfo_sub_node( 0, thread );
+	thrinfo_t* thread_pc = bli_thrinfo_sub_node( 0, thread_jc );
+	thrinfo_t* thread_pb = bli_thrinfo_sub_node( 0, thread_pc );
+	thrinfo_t* thread_ic = bli_thrinfo_sub_node( 0, thread_pb );
+	thrinfo_t* thread_pa = bli_thrinfo_sub_node( 0, thread_ic );
+	thrinfo_t* thread_jr = bli_thrinfo_sub_node( 0, thread_pa );
+	thrinfo_t* thread_ir = bli_thrinfo_sub_node( 0, thread_jr );
+
+	/* Compute the JC loop thread range for the current thread. */
+	dim_t jc_start, jc_end;
+	dim_t jc_tid = bli_thrinfo_work_id( thread_jc );
+	dim_t jc_nt  = bli_thrinfo_n_way( thread_jc );
+	bli_thread_range_sub( jc_tid, jc_nt, n, NR, FALSE, &jc_start, &jc_end );
+	const dim_t n_local = jc_end - jc_start;
+
+	/* Compute number of primary and leftover components of the JC loop. */
+	/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/
+	const dim_t jc_left =   n_local % NC;
+
+	/* Loop over the n dimension (NC rows/columns at a time). */
+	for ( dim_t jj = jc_start; jj < jc_end; jj += NC )
+	{
+		/* Calculate the thread's current JC block dimension. */
+		const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left );
+
+		const char* b_jc = b_00 + jj * jcstep_b;
+		      char* c_jc = c_00 + jj * jcstep_c;
+
+		/* Compute the PC loop thread range for the current thread. */
+		const dim_t pc_start = 0, pc_end = k;
+		const dim_t k_local = k;
+
+		/* Compute number of primary and leftover components of the PC loop. */
+		/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/
+		const dim_t pc_left =   k_local % KC;
+
+		/* Loop over the k dimension (KC rows/columns at a time). */
+		for ( dim_t pp = pc_start; pp < pc_end; pp += KC )
+		{
+			/* Calculate the thread's current PC block dimension. */
+			const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left );
+
+			const char* a_pc = a_00 + pp * pcstep_a;
+			const char* b_pc = b_jc + pp * pcstep_b;
+
+			/* Only apply beta to the first iteration of the pc loop. */
+			const char* beta_use = ( pp == 0 ? beta_buf : one );
+
+			      void* b_use;
+			      inc_t rs_b_use, cs_b_use, ps_b_use;
+
 			/* Determine the packing buffer and related parameters for matrix
-			   B. Then call the packm implementation. */ \
-			PASTECH2(bls_,ch,packm_b) \
-			( \
-			  conjb, \
-			  KC,     NC, \
-			  kc_cur, nc_cur, NR, \
-			  &one_local, \
-			  b_pc,   rs_b,      cs_b, \
-			  &b_use, &rs_b_use, &cs_b_use, \
-			                     &ps_b_use, \
-			  cntx, \
-			  thread_pb  \
-			); \
-\
+			   B. Then call the packm implementation. */
+			bls_packm_b
+			(
+			  dt,
+			  conjb,
+			  KC,     NC,
+			  kc_cur, nc_cur, NR,
+			  one,
+			  b_pc,   rs_b,      cs_b,
+			  &b_use, &rs_b_use, &cs_b_use,
+			                     &ps_b_use,
+			  cntx,
+			  thread_pb
+			);
+
+			/* Scale the panel stride of B by the data type size. */
+			ps_b_use *= dt_size;
+
 			/* Alias b_use so that it's clear this is our current block of
-			   matrix B. */ \
-			ctype* restrict b_pc_use = b_use; \
-\
-			/* Compute the IC loop thread range for the current thread. */ \
-			dim_t ic_start, ic_end; \
-			bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
-			const dim_t m_local = ic_end - ic_start; \
-\
-			/* Compute number of primary and leftover components of the IC loop. */ \
-			/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
-			const dim_t ic_left =   m_local % MC; \
-\
-			/* Loop over the m dimension (MC rows at a time). */ \
-			for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
-			{ \
-				/* Calculate the thread's current IC block dimension. */ \
-				const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
-\
-				ctype* restrict a_ic = a_pc + ii * icstep_a; \
-				ctype* restrict c_ic = c_jc + ii * icstep_c; \
-\
-				ctype* a_use; \
-				inc_t  rs_a_use, cs_a_use, ps_a_use; \
-\
+			   matrix B. */
+			const char* b_pc_use = b_use;
+
+			/* Compute the IC loop thread range for the current thread. */
+			dim_t ic_start, ic_end;
+			dim_t ic_tid = bli_thrinfo_work_id( thread_ic );
+			dim_t ic_nt  = bli_thrinfo_n_way( thread_ic );
+			bli_thread_range_sub( ic_tid, ic_nt, m, MR, FALSE, &ic_start, &ic_end );
+			const dim_t m_local = ic_end - ic_start;
+
+			/* Compute number of primary and leftover components of the IC loop. */
+			/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/
+			const dim_t ic_left =   m_local % MC;
+
+			/* Loop over the m dimension (MC rows at a time). */
+			for ( dim_t ii = ic_start; ii < ic_end; ii += MC )
+			{
+				/* Calculate the thread's current IC block dimension. */
+				const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left );
+
+				const char* a_ic = a_pc + ii * icstep_a;
+				      char* c_ic = c_jc + ii * icstep_c;
+
+				      void* a_use;
+				      inc_t rs_a_use, cs_a_use, ps_a_use;
+
 				/* Determine the packing buffer and related parameters for matrix
-				   A. Then call the packm implementation. */ \
-				PASTECH2(bls_,ch,packm_a) \
-				( \
-				  conja, \
-				  MC,     KC, \
-				  mc_cur, kc_cur, MR, \
-				  &one_local, \
-				  a_ic,   rs_a,      cs_a, \
-				  &a_use, &rs_a_use, &cs_a_use, \
-				                     &ps_a_use, \
-				  cntx, \
-				  thread_pa  \
-				); \
-\
+				   A. Then call the packm implementation. */
+				bls_packm_a
+				(
+				  dt,
+				  conja,
+				  MC,     KC,
+				  mc_cur, kc_cur, MR,
+				  one,
+				  a_ic,   rs_a,      cs_a,
+				  &a_use, &rs_a_use, &cs_a_use,
+				                     &ps_a_use,
+				  cntx,
+				  thread_pa
+				);
+
+				/* Scale the panel stride of A by the data type size. */
+				ps_a_use *= dt_size;
+
 				/* Alias a_use so that it's clear this is our current block of
-				   matrix A. */ \
-				ctype* restrict a_ic_use = a_use; \
-\
+				   matrix A. */
+				const char* a_ic_use = a_use;
+
 				/* Query the number of threads and thread ids for the JR loop.
 				   NOTE: These values are only needed when computing the next
-				   micropanel of B. */ \
-				const dim_t jr_nt  = bli_thrinfo_n_way( thread_jr ); \
-				const dim_t jr_tid = bli_thrinfo_work_id( thread_jr ); \
-\
-				/* Compute number of primary and leftover components of the JR loop. */ \
-				dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
-				dim_t jr_left =   nc_cur % NR; \
-\
-				/* Compute the JR loop thread range for the current thread. */ \
-				dim_t jr_start, jr_end; \
-				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
-\
-				/* Loop over the n dimension (NR columns at a time). */ \
-				for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
-				{ \
-					const dim_t nr_cur \
-					= ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
-\
-					ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
-					ctype* restrict c_jr = c_ic     + j * jrstep_c; \
-\
+				   micropanel of B. */
+				const dim_t jr_nt  = bli_thrinfo_n_way( thread_jr );
+				const dim_t jr_tid = bli_thrinfo_work_id( thread_jr );
+
+				/* Compute number of primary and leftover components of the JR loop. */
+				dim_t jr_iter = ( nc_cur + NR - 1 ) / NR;
+				dim_t jr_left =   nc_cur % NR;
+
+				/* Compute the JR loop thread range for the current thread. */
+				dim_t jr_start, jr_end;
+				bli_thread_range_sub( jr_tid, jr_nt, jr_iter, 1, FALSE, &jr_start, &jr_end );
+
+				/* Loop over the n dimension (NR columns at a time). */
+				for ( dim_t j = jr_start; j < jr_end; j += 1 )
+				{
+					const dim_t nr_cur
+					= ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left );
+
+					const char* b_jr = b_pc_use + j * ps_b_use;
+					      char* c_jr = c_ic     + j * jrstep_c;
+
 					/* Assume for now that our next panel of B to be the current panel
-					   of B. */ \
-					ctype* restrict b2 = b_jr; \
-\
+					   of B. */
+					const char* b2 = b_jr;
+
 					/* Query the number of threads and thread ids for the IR loop.
 					   NOTE: These values are only needed when computing the next
-					   micropanel of A. */ \
-					const dim_t ir_nt  = bli_thrinfo_n_way( thread_ir ); \
-					const dim_t ir_tid = bli_thrinfo_work_id( thread_ir ); \
-\
-					/* Compute number of primary and leftover components of the IR loop. */ \
-					dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
-					dim_t ir_left =   mc_cur % MR; \
-\
-					/* Compute the IR loop thread range for the current thread. */ \
-					dim_t ir_start, ir_end; \
-					bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \
-\
-					/* Loop over the m dimension (MR rows at a time). */ \
-					for ( dim_t i = ir_start; i < ir_end; i += 1 ) \
-					{ \
-						const dim_t mr_cur \
-						= ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
-\
-						ctype* restrict a_ir = a_ic_use + i * ps_a_use; \
-						ctype* restrict c_ir = c_jr     + i * irstep_c; \
-\
-						ctype* restrict a2; \
-\
-						/* Compute the addresses of the next micropanels of A and B. */ \
-						a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \
-						if ( bli_is_last_iter_slrr( i, ir_end, ir_tid, ir_nt ) ) \
-						{ \
-							a2 = a_ic_use; \
-							b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \
-							if ( bli_is_last_iter_slrr( j, jr_end, jr_tid, jr_nt ) ) \
-								b2 = b_pc_use; \
-						} \
-\
+					   micropanel of A. */
+					const dim_t ir_nt  = bli_thrinfo_n_way( thread_ir );
+					const dim_t ir_tid = bli_thrinfo_work_id( thread_ir );
+
+					/* Compute number of primary and leftover components of the IR loop. */
+					dim_t ir_iter = ( mc_cur + MR - 1 ) / MR;
+					dim_t ir_left =   mc_cur % MR;
+
+					/* Compute the IR loop thread range for the current thread. */
+					dim_t ir_start, ir_end;
+					bli_thread_range_sub( ir_tid, ir_nt, ir_iter, 1, FALSE, &ir_start, &ir_end );
+
+					/* Loop over the m dimension (MR rows at a time). */
+					for ( dim_t i = ir_start; i < ir_end; i += 1 )
+					{
+						const dim_t mr_cur
+						= ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left );
+
+						const char* a_ir = a_ic_use + i * ps_a_use;
+						      char* c_ir = c_jr     + i * irstep_c;
+
+						const char* a2;
+
+						/* Compute the addresses of the next micropanels of A and B. */
+						a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 );
+						if ( bli_is_last_iter_slrr( i, ir_end, ir_tid, ir_nt ) )
+						{
+							a2 = a_ic_use;
+							b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 );
+							if ( bli_is_last_iter_slrr( j, jr_end, jr_tid, jr_nt ) )
+								b2 = b_pc_use;
+						}
+
 						/* Save the addresses of next micropanels of A and B to the
-						   auxinfo_t object. */ \
-						bli_auxinfo_set_next_a( a2, &aux ); \
-						bli_auxinfo_set_next_b( b2, &aux ); \
-\
-						/* Invoke the gemm microkernel. */ \
-						gemm_ukr \
-						( \
-						  mr_cur, \
-						  nr_cur, \
-						  kc_cur, \
-						  &alpha_local, \
-						  a_ir, \
-						  b_jr, \
-						  beta_use, \
-						  c_ir, rs_c, cs_c, \
-						  &aux, \
-						  cntx  \
-						); \
-					} \
-				} \
-			} \
-\
+						   auxinfo_t object. */
+						auxinfo_t aux;
+						bli_auxinfo_set_next_a( a2, &aux );
+						bli_auxinfo_set_next_b( b2, &aux );
+
+						/* Invoke the gemm microkernel. */
+						gemm_ukr
+						(
+						  mr_cur,
+						  nr_cur,
+						  kc_cur,
+						  alpha_buf,
+						  a_ir,
+						  b_jr,
+						  beta_use,
+						  c_ir, rs_c, cs_c,
+						  &aux,
+						  cntx
+						);
+					}
+				}
+			}
+
 			/* This barrier is needed to prevent threads from starting to pack
 			   the next row panel of B before the current row panel is fully
-			   computed upon. */ \
-			bli_thrinfo_barrier( thread_pb ); \
-		} \
-	} \
-\
+			   computed upon. */
+			bli_thrinfo_barrier( thread_pb );
+		}
+	}
+
 /*
-PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \
-*/ \
+PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" );
+PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" );
+PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" );
+*/
 }
 
-//INSERT_GENTFUNC_BASIC( gemm_bp_var1 )
-GENTFUNC( float,    s, gemm_bp_var1 )
-GENTFUNC( double,   d, gemm_bp_var1 )
-GENTFUNC( scomplex, c, gemm_bp_var1 )
-GENTFUNC( dcomplex, z, gemm_bp_var1 )
-
diff --git a/sandbox/gemmlike/bls_gemm_var.h b/sandbox/gemmlike/bls_gemm_var.h
index f7b072558..29ac0c0b8 100644
--- a/sandbox/gemmlike/bls_gemm_var.h
+++ b/sandbox/gemmlike/bls_gemm_var.h
@@ -37,79 +37,14 @@
 // Prototype the object-based variant interfaces.
 //
 
-#undef  GENPROT
-#define GENPROT( opname ) \
-\
-void PASTECH(bls_,opname) \
-     ( \
-       const obj_t*     alpha, \
-       const obj_t*     a, \
-       const obj_t*     b, \
-       const obj_t*     beta, \
-       const obj_t*     c, \
-       const cntx_t*    cntx, \
-             thrinfo_t* thread  \
+void bls_gemm_bp_var1
+     (
+       const obj_t*     alpha,
+       const obj_t*     a,
+       const obj_t*     b,
+       const obj_t*     beta,
+       const obj_t*     c,
+       const cntx_t*    cntx,
+             thrinfo_t* thread
      );
 
-GENPROT( gemm_bp_var1 )
-
-
-//
-// Prototype the typed variant interfaces.
-//
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTECH2(bls_,ch,varname) \
-     ( \
-       conj_t           conja, \
-       conj_t           conjb, \
-       dim_t            m, \
-       dim_t            n, \
-       dim_t            k, \
-       void*   restrict alpha, \
-       void*   restrict a, inc_t rs_a, inc_t cs_a, \
-       void*   restrict b, inc_t rs_b, inc_t cs_b, \
-       void*   restrict beta, \
-       void*   restrict c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
-     );
-
-//INSERT_GENTPROT_BASIC( gemm_bp_var1 )
-GENTPROT( float,    s, gemm_bp_var1 )
-GENTPROT( double,   d, gemm_bp_var1 )
-GENTPROT( scomplex, c, gemm_bp_var1 )
-GENTPROT( dcomplex, z, gemm_bp_var1 )
-
-
-//
-// Prototype the typed kernel interfaces.
-//
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTECH2(bls_,ch,varname) \
-     ( \
-       const dim_t         MR, \
-       const dim_t         NR, \
-       dim_t               mr_cur, \
-       dim_t               nr_cur, \
-       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
-       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
-       auxinfo_t* restrict aux, \
-       cntx_t*    restrict cntx  \
-     );
-
-//INSERT_GENTPROT_BASIC( gemm_kernel )
-GENTPROT( float,    s, gemm_kernel )
-GENTPROT( double,   d, gemm_kernel )
-GENTPROT( scomplex, c, gemm_kernel )
-GENTPROT( dcomplex, z, gemm_kernel )
-
diff --git a/sandbox/gemmlike/bls_l3_packm.c b/sandbox/gemmlike/bls_l3_packm.c
new file mode 100644
index 000000000..49aedec30
--- /dev/null
+++ b/sandbox/gemmlike/bls_l3_packm.c
@@ -0,0 +1,192 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bls_packm_int
+     (
+             num_t      dt,
+             conj_t     conj,
+             dim_t      m_alloc,
+             dim_t      k_alloc,
+             dim_t      m,
+             dim_t      k,
+             dim_t      mr,
+       const void*      kappa,
+       const void*      a, inc_t  rs_a, inc_t  cs_a,
+             void**     p, inc_t* rs_p, inc_t* cs_p,
+                           inc_t* ps_p,
+       const cntx_t*    cntx,
+             thrinfo_t* thread
+     )
+{
+	/* Obtain a pack buffer large enough for m_alloc x k_alloc and pack the m x k
+	   matrix a into it via bls_packm_var1(). Blocks come from the pool for A. */
+	const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_A_BLOCK; /* NOTE(review): also reached from bls_packm_b() -- confirm B should share this pool. */
+
+	/* The allocation is sized from the *_alloc dimensions rather than from m and
+	   k: m_alloc is rounded up to a whole multiple of mr so that the buffer has
+	   room for a full-width last micropanel, while k_alloc is used without any
+	   rounding. */
+	const dim_t m_pack = ( ( m_alloc + mr - 1) / mr ) * mr;
+	const dim_t k_pack = k_alloc;
+
+	/* Barrier to make sure all threads are caught up and ready to begin the
+	   packm stage. */
+	bli_thrinfo_barrier( thread );
+
+	/* Compute the size of the memory block needed. */
+	siz_t size_needed = bli_dt_size( dt ) * m_pack * k_pack;
+
+	/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
+	   we NEED that last micropanel to have the same ldim (cs_p) as the other
+	   micropanels. Why? Because the microkernel assumes that the register (MR,
+	   NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */
+	dim_t m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr;
+	dim_t k_max = k;
+
+	// Determine the strides of the packed matrix; these are outputs to the caller.
+	*rs_p = 1;
+	*cs_p = mr;
+
+	dim_t pd_p = mr;
+	*ps_p = mr * k; /* mr * k: the number of elements in one mr x k micropanel */
+
+	/* Point the caller's *p at the buffer that bli_packm_alloc_ex() returns for
+	   the requested size and buffer type. */
+	*p = bli_packm_alloc_ex
+	(
+	  size_needed,
+	  pack_buf_type,
+	  thread
+	);
+
+	bls_packm_var1
+	(
+	  dt,
+	  conj,
+	  m,
+	  k,
+	  m_max,
+	  k_max,
+	  kappa,
+	  a,  rs_a,  cs_a,
+	  *p, *rs_p, *cs_p,
+	       pd_p, *ps_p,
+	  cntx,
+	  thread
+	);
+}
+
+
+//
+// Define BLAS-like interfaces to the variant chooser.
+//
+
+void bls_packm_a
+     (
+             num_t      dt,
+             conj_t     conj,
+             dim_t      m_alloc,
+             dim_t      k_alloc,
+             dim_t      m,
+             dim_t      k,
+             dim_t      mr,
+       const void*      kappa,
+       const void*      a, inc_t  rs_a, inc_t  cs_a,
+             void**     p, inc_t* rs_p, inc_t* cs_p,
+                           inc_t* ps_p,
+       const cntx_t*    cntx,
+             thrinfo_t* thread
+     )
+{
+	bls_packm_int /* A is packed as given: every argument is forwarded unchanged. */
+	(
+	  dt,
+	  conj,
+	  m_alloc,
+	  k_alloc,
+	  m,
+	  k,
+	  mr,
+	  kappa,
+	  a, rs_a, cs_a,
+	  p, rs_p, cs_p,
+	     ps_p,
+	  cntx,
+	  thread
+	);
+
+	/* Barrier after the bls_packm_int() call so that packing is done before computation. */
+	bli_thrinfo_barrier( thread );
+}
+
+void bls_packm_b
+     (
+             num_t      dt,
+             conj_t     conj,
+             dim_t      k_alloc,
+             dim_t      n_alloc,
+             dim_t      k,
+             dim_t      n,
+             dim_t      nr,
+       const void*      kappa,
+       const void*      b, inc_t  rs_b, inc_t  cs_b,
+             void**     p, inc_t* rs_p, inc_t* cs_p,
+                           inc_t* ps_p,
+       const cntx_t*    cntx,
+             thrinfo_t* thread
+     )
+{
+	// Implicitly transpose B for packing: n and nr are passed where bls_packm_int() expects m and mr.
+	bls_packm_int
+	(
+	  dt,
+	  conj,
+	  n_alloc,
+	  k_alloc,
+	  n,
+	  k,
+	  nr,
+	  kappa,
+	  b, cs_b, rs_b, /* strides swapped: b is traversed as its transpose */
+	  p, cs_p, rs_p, /* swapped as well, so *cs_p receives 1 and *rs_p receives nr */
+	     ps_p,
+	  cntx,
+	  thread
+	);
+
+	/* Barrier after the bls_packm_int() call so that packing is done before computation. */
+	bli_thrinfo_barrier( thread );
+}
diff --git a/frame/3/bli_l3_schema.h b/sandbox/gemmlike/bls_l3_packm.h
similarity index 54%
rename from frame/3/bli_l3_schema.h
rename to sandbox/gemmlike/bls_l3_packm.h
index a909bf598..fe94a1a4d 100644
--- a/frame/3/bli_l3_schema.h
+++ b/sandbox/gemmlike/bls_l3_packm.h
@@ -32,10 +32,54 @@
 
 */
 
-void bli_l3_set_schemas
+void bls_packm_int
      (
-             obj_t*  a,
-             obj_t*  b,
-       const obj_t*  c,
-       const cntx_t* cntx
+             num_t      dt,
+             conj_t     conj,
+             dim_t      m_alloc, /* m_alloc, k_alloc: dimensions used to size the pack buffer */
+             dim_t      k_alloc,
+             dim_t      m,       /* m, k: dimensions of the matrix that is packed */
+             dim_t      k,
+             dim_t      mr,      /* m_alloc and m are rounded up to a multiple of mr */
+       const void*      kappa,
+       const void*      a, inc_t  rs_a, inc_t  cs_a,
+             void**     p, inc_t* rs_p, inc_t* cs_p, /* outputs: pack buffer and its strides */
+                           inc_t* ps_p,
+       const cntx_t*    cntx,
+             thrinfo_t* thread
      );
+
+void bls_packm_a
+     (
+             num_t      dt,
+             conj_t     conj,
+             dim_t      m_alloc, /* m_alloc, k_alloc: dimensions used to size the pack buffer */
+             dim_t      k_alloc,
+             dim_t      m,       /* m, k: dimensions of the matrix that is packed */
+             dim_t      k,
+             dim_t      mr,
+       const void*      kappa,
+       const void*      a, inc_t  rs_a, inc_t  cs_a,
+             void**     p, inc_t* rs_p, inc_t* cs_p, /* outputs: pack buffer and its strides */
+                           inc_t* ps_p,
+       const cntx_t*    cntx,
+             thrinfo_t* thread
+     );
+
+void bls_packm_b
+     (
+             num_t      dt,
+             conj_t     conj,
+             dim_t      k_alloc, /* k_alloc, n_alloc: dimensions used to size the pack buffer */
+             dim_t      n_alloc,
+             dim_t      k,       /* k, n: dimensions of the matrix that is packed */
+             dim_t      n,
+             dim_t      nr,
+       const void*      kappa,
+       const void*      b, inc_t  rs_b, inc_t  cs_b,
+             void**     p, inc_t* rs_p, inc_t* cs_p, /* outputs: pack buffer and its strides */
+                           inc_t* ps_p,
+       const cntx_t*    cntx,
+             thrinfo_t* thread
+     );
+
diff --git a/sandbox/gemmlike/bls_l3_packm_a.c b/sandbox/gemmlike/bls_l3_packm_a.c
deleted file mode 100644
index e6115e340..000000000
--- a/sandbox/gemmlike/bls_l3_packm_a.c
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2021, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       dim_t            m, \
-       dim_t            k, \
-       dim_t            mr, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
-     ) \
-{ \
-	/* Set the pack buffer type so that we are obtaining memory blocks from
-	   the pool dedicated to blocks of A. */ \
-	const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_A_BLOCK; \
-\
-	/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
-	   we NEED that last micropanel to have the same ldim (cs_p) as the other
-	   micropanels. Why? Because the microkernel assumes that the register (MR,
-	   NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
-	const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
-	const dim_t k_pack = k; \
-\
-	/* Barrier to make sure all threads are caught up and ready to begin the
-	   packm stage. */ \
-	bli_thrinfo_barrier( thread ); \
-\
-	/* Compute the size of the memory block eneded. */ \
-	siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
-\
-	mem_t* mem = bli_thrinfo_mem( thread ); \
-\
-	/* Check the mem_t entry provided by the caller. If it is unallocated,
-	   then we need to acquire a block from the packed block allocator. */ \
-	if ( bli_mem_is_unalloc( mem ) ) \
-	{ \
-		if ( bli_thrinfo_am_chief( thread ) ) \
-		{ \
-			/* Acquire directly to the chief thread's mem_t that was passed in.
-			   It needs to be that mem_t struct, and not a local (temporary)
-			   mem_t, since there is no barrier until after packing is finished,
-			   which could allow a race condition whereby the chief thread exits
-			   the current function before the other threads have a chance to
-			   copy from it. (A barrier would fix that race condition, but then
-			   again, I prefer to keep barriers to a minimum.) */ \
-			bli_pba_acquire_m \
-			( \
-			  bli_thrinfo_pba( thread ), \
-			  size_needed, \
-			  pack_buf_type, \
-			  mem  \
-			); \
-		} \
-\
-		/* Broadcast the address of the chief thread's passed-in mem_t to all
-		   threads. */ \
-		mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \
-\
-		/* Non-chief threads: Copy the contents of the chief thread's
-		   passed-in mem_t to the passed-in mem_t for this thread. (The
-		   chief thread already has the mem_t, so it does not need to
-		   perform any copy.) */ \
-		if ( !bli_thrinfo_am_chief( thread ) ) \
-		{ \
-			*mem = *mem_p; \
-		} \
-	} \
-	else /* if ( bli_mem_is_alloc( mem ) ) */ \
-	{ \
-		/* If the mem_t entry provided by the caller does NOT contain a NULL
-		   buffer, then a block has already been acquired from the packed
-		   block allocator and cached by the caller. */ \
-\
-		/* As a sanity check, we should make sure that the mem_t object isn't
-		   associated with a block that is too small compared to the size of
-		   the packed matrix buffer that is needed, according to the value
-		   computed above. */ \
-		siz_t mem_size = bli_mem_size( mem ); \
-\
-		if ( mem_size < size_needed ) \
-		{ \
-			if ( bli_thrinfo_am_chief( thread ) ) \
-			{ \
-				/* The chief thread releases the existing block associated
-				   with the mem_t, and then re-acquires a new block, saving
-				   the associated mem_t to its passed-in mem_t. (See coment
-				   above for why the acquisition needs to be directly to
-				   the chief thread's passed-in mem_t and not a local
-				   (temporary) mem_t. */ \
-				bli_pba_release \
-				( \
-				  bli_thrinfo_pba( thread ), \
-				  mem \
-				); \
-				bli_pba_acquire_m \
-				( \
-				  bli_thrinfo_pba( thread ), \
-				  size_needed, \
-				  pack_buf_type, \
-				  mem \
-				); \
-			} \
-\
-			/* Broadcast the address of the chief thread's passed-in mem_t
-			   to all threads. */ \
-			mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \
-\
-			/* Non-chief threads: Copy the contents of the chief thread's
-			   passed-in mem_t to the passed-in mem_t for this thread. (The
-			   chief thread already has the mem_t, so it does not need to
-			   perform any copy.) */ \
-			if ( !bli_thrinfo_am_chief( thread ) ) \
-			{ \
-				*mem = *mem_p; \
-			} \
-		} \
-		else \
-		{ \
-			/* If the mem_t entry is already allocated and sufficiently large,
-			   then we use it as-is. No action is needed. */ \
-		} \
-	} \
-}
-
-//INSERT_GENTFUNC_BASIC( packm_init_mem_a )
-GENTFUNC( float,    s, packm_init_mem_a )
-GENTFUNC( double,   d, packm_init_mem_a )
-GENTFUNC( scomplex, c, packm_init_mem_a )
-GENTFUNC( dcomplex, z, packm_init_mem_a )
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       pack_t* restrict schema, \
-       dim_t            m, \
-       dim_t            k, \
-       dim_t            mr, \
-       dim_t*  restrict m_max, \
-       dim_t*  restrict k_max, \
-       ctype**          p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
-                           dim_t* restrict pd_p, inc_t* restrict ps_p, \
-       mem_t*  restrict mem  \
-     ) \
-{ \
-	/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
-	   we NEED that last micropanel to have the same ldim (cs_p) as the other
-	   micropanels. Why? Because the microkernel assumes that the register (MR,
-	   NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
-	*m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
-	*k_max = k; \
-\
-	/* Determine the dimensions and strides for the packed matrix A. */ \
-	{ \
-		/* Pack A to column-stored row-panels. */ \
-		*rs_p = 1; \
-		*cs_p = mr; \
-\
-		*pd_p = mr; \
-		*ps_p = mr * k; \
-\
-		/* Set the schema to "packed row panels" to indicate packing to
-		   conventional column-stored row panels. */ \
-		*schema = BLIS_PACKED_ROW_PANELS; \
-	} \
-\
-	/* Set the buffer address provided by the caller to point to the memory
-	   associated with the mem_t entry acquired from the memory pool. */ \
-	*p = bli_mem_buffer( mem ); \
-}
-
-//INSERT_GENTFUNC_BASIC( packm_init_a )
-GENTFUNC( float,    s, packm_init_a )
-GENTFUNC( double,   d, packm_init_a )
-GENTFUNC( scomplex, c, packm_init_a )
-GENTFUNC( dcomplex, z, packm_init_a )
-
-
-//
-// Define BLAS-like interfaces to the variant chooser.
-//
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       conj_t           conj, \
-       dim_t            m_alloc, \
-       dim_t            k_alloc, \
-       dim_t            m, \
-       dim_t            k, \
-       dim_t            mr, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t           rs_a, inc_t           cs_a, \
-       ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
-                                                 inc_t* restrict ps_p, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
-     ) \
-{ \
-	pack_t schema; \
-	dim_t  m_max; \
-	dim_t  k_max; \
-	dim_t  pd_p; \
-\
-	/* Prepare the packing destination buffer. */ \
-	PASTECH2(bls_,ch,packm_init_mem_a) \
-	( \
-	  m_alloc, k_alloc, mr, \
-	  cntx, \
-	  thread  \
-	); \
-\
-	/* Determine the packing buffer and related parameters for matrix A. */ \
-	PASTECH2(bls_,ch,packm_init_a) \
-	( \
-	  &schema, \
-	  m, k, mr, \
-	  &m_max, &k_max, \
-	  p, rs_p,  cs_p, \
-	     &pd_p, ps_p, \
-	  bli_thrinfo_mem( thread )  \
-	); \
-\
-	/* Pack matrix A to the destination buffer chosen above. Here, the packed
-	   matrix is stored to column-stored MR x k micropanels. */ \
-	PASTECH2(bls_,ch,packm_var1) \
-	( \
-	  conj, \
-	  schema, \
-	  m, \
-	  k, \
-	  m_max, \
-	  k_max, \
-	  kappa, \
-	  a,  rs_a,  cs_a, \
-	  *p, *rs_p, *cs_p, \
-	       pd_p, *ps_p, \
-	  cntx, \
-	  bli_thrinfo_sub_prenode( thread )  \
-	); \
-\
-	/* Barrier so that packing is done before computation. */ \
-	bli_thrinfo_barrier( thread ); \
-}
-
-//INSERT_GENTFUNC_BASIC( packm_a )
-GENTFUNC( float,    s, packm_a )
-GENTFUNC( double,   d, packm_a )
-GENTFUNC( scomplex, c, packm_a )
-GENTFUNC( dcomplex, z, packm_a )
-
diff --git a/sandbox/gemmlike/bls_l3_packm_a.h b/sandbox/gemmlike/bls_l3_packm_a.h
deleted file mode 100644
index e5ecf9eab..000000000
--- a/sandbox/gemmlike/bls_l3_packm_a.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2021, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       dim_t            m, \
-       dim_t            k, \
-       dim_t            mr, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
-     ); \
-
-//INSERT_GENTPROT_BASIC( packm_init_mem_a )
-GENTPROT( float,    s, packm_init_mem_a )
-GENTPROT( double,   d, packm_init_mem_a )
-GENTPROT( scomplex, c, packm_init_mem_a )
-GENTPROT( dcomplex, z, packm_init_mem_a )
-
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       pack_t* restrict schema, \
-       dim_t            m, \
-       dim_t            k, \
-       dim_t            mr, \
-       dim_t*  restrict m_max, \
-       dim_t*  restrict k_max, \
-       ctype**          p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
-                           dim_t* restrict pd_p, inc_t* restrict ps_p, \
-       mem_t*  restrict mem  \
-     ); \
-
-//INSERT_GENTPROT_BASIC( packm_init_a )
-GENTPROT( float,    s, packm_init_a )
-GENTPROT( double,   d, packm_init_a )
-GENTPROT( scomplex, c, packm_init_a )
-GENTPROT( dcomplex, z, packm_init_a )
-
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       conj_t           conj, \
-       dim_t            m_alloc, \
-       dim_t            k_alloc, \
-       dim_t            m, \
-       dim_t            k, \
-       dim_t            mr, \
-       ctype*  restrict kappa, \
-       ctype*  restrict a, inc_t           rs_a, inc_t           cs_a, \
-       ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
-                                                 inc_t* restrict ps_p, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
-     ); \
-
-//INSERT_GENTPROT_BASIC( packm_a )
-GENTPROT( float,    s, packm_a )
-GENTPROT( double,   d, packm_a )
-GENTPROT( scomplex, c, packm_a )
-GENTPROT( dcomplex, z, packm_a )
-
diff --git a/sandbox/gemmlike/bls_l3_packm_b.c b/sandbox/gemmlike/bls_l3_packm_b.c
deleted file mode 100644
index 7be3482f9..000000000
--- a/sandbox/gemmlike/bls_l3_packm_b.c
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2021, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       dim_t            k, \
-       dim_t            n, \
-       dim_t            nr, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
-     ) \
-{ \
-	/* Set the pack buffer type so that we are obtaining memory blocks from
-	   the pool dedicated to panels of B. */ \
-	const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_B_PANEL; \
-\
-	/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
-	   we NEED that last micropanel to have the same ldim (cs_p) as the other
-	   micropanels. Why? Because the microkernel assumes that the register (MR,
-	   NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
-	const dim_t k_pack = k; \
-	const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \
-\
-	/* Barrier to make sure all threads are caught up and ready to begin the
-	   packm stage. */ \
-	bli_thrinfo_barrier( thread ); \
-\
-	/* Compute the size of the memory block eneded. */ \
-	siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
-\
-	mem_t* mem = bli_thrinfo_mem( thread ); \
-\
-	/* Check the mem_t entry provided by the caller. If it is unallocated,
-	   then we need to acquire a block from the packed block allocator. */ \
-	if ( bli_mem_is_unalloc( mem ) ) \
-	{ \
-		if ( bli_thrinfo_am_chief( thread ) ) \
-		{ \
-			/* Acquire directly to the chief thread's mem_t that was passed in.
-			   It needs to be that mem_t struct, and not a local (temporary)
-			   mem_t, since there is no barrier until after packing is finished,
-			   which could allow a race condition whereby the chief thread exits
-			   the current function before the other threads have a chance to
-			   copy from it. (A barrier would fix that race condition, but then
-			   again, I prefer to keep barriers to a minimum.) */ \
-			bli_pba_acquire_m \
-			( \
-			  bli_thrinfo_pba( thread ), \
-			  size_needed, \
-			  pack_buf_type, \
-			  mem  \
-			); \
-		} \
-\
-		/* Broadcast the address of the chief thread's passed-in mem_t to all
-		   threads. */ \
-		mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \
-\
-		/* Non-chief threads: Copy the contents of the chief thread's
-		   passed-in mem_t to the passed-in mem_t for this thread. (The
-		   chief thread already has the mem_t, so it does not need to
-		   perform any copy.) */ \
-		if ( !bli_thrinfo_am_chief( thread ) ) \
-		{ \
-			*mem = *mem_p; \
-		} \
-	} \
-	else /* if ( bli_mem_is_alloc( mem ) ) */ \
-	{ \
-		/* If the mem_t entry provided by the caller does NOT contain a NULL
-		   buffer, then a block has already been acquired from the packed
-		   block allocator and cached by the caller. */ \
-\
-		/* As a sanity check, we should make sure that the mem_t object isn't
-		   associated with a block that is too small compared to the size of
-		   the packed matrix buffer that is needed, according to the value
-		   computed above. */ \
-		siz_t mem_size = bli_mem_size( mem ); \
-\
-		if ( mem_size < size_needed ) \
-		{ \
-			if ( bli_thrinfo_am_chief( thread ) ) \
-			{ \
-				/* The chief thread releases the existing block associated
-				   with the mem_t, and then re-acquires a new block, saving
-				   the associated mem_t to its passed-in mem_t. (See coment
-				   above for why the acquisition needs to be directly to
-				   the chief thread's passed-in mem_t and not a local
-				   (temporary) mem_t. */ \
-				bli_pba_release \
-				( \
-				  bli_thrinfo_pba( thread ), \
-				  mem \
-				); \
-				bli_pba_acquire_m \
-				( \
-				  bli_thrinfo_pba( thread ), \
-				  size_needed, \
-				  pack_buf_type, \
-				  mem \
-				); \
-			} \
-\
-			/* Broadcast the address of the chief thread's passed-in mem_t
-			   to all threads. */ \
-			mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \
-\
-			/* Non-chief threads: Copy the contents of the chief thread's
-			   passed-in mem_t to the passed-in mem_t for this thread. (The
-			   chief thread already has the mem_t, so it does not need to
-			   perform any copy.) */ \
-			if ( !bli_thrinfo_am_chief( thread ) ) \
-			{ \
-				*mem = *mem_p; \
-			} \
-		} \
-		else \
-		{ \
-			/* If the mem_t entry is already allocated and sufficiently large,
-			   then we use it as-is. No action is needed. */ \
-		} \
-	} \
-}
-
-//INSERT_GENTFUNC_BASIC( packm_init_mem_b )
-GENTFUNC( float,    s, packm_init_mem_b )
-GENTFUNC( double,   d, packm_init_mem_b )
-GENTFUNC( scomplex, c, packm_init_mem_b )
-GENTFUNC( dcomplex, z, packm_init_mem_b )
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       pack_t* restrict schema, \
-       dim_t            k, \
-       dim_t            n, \
-       dim_t            nr, \
-       dim_t*  restrict k_max, \
-       dim_t*  restrict n_max, \
-       ctype**          p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
-                           dim_t* restrict pd_p, inc_t* restrict ps_p, \
-       mem_t*  restrict mem  \
-     ) \
-{ \
-	/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
-	   we NEED that last micropanel to have the same ldim (cs_p) as the other
-	   micropanels. Why? Because the microkernel assumes that the register (MR,
-	   NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
-	*k_max = k; \
-	*n_max = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \
-\
-	/* Determine the dimensions and strides for the packed matrix B. */ \
-	{ \
-		/* Pack B to row-stored column-panels. */ \
-		*rs_p = nr; \
-		*cs_p = 1; \
-\
-		*pd_p = nr; \
-		*ps_p = k * nr; \
-\
-		/* Set the schema to "packed column panels" to indicate packing to
-		   conventional row-stored column panels. */ \
-		*schema = BLIS_PACKED_COL_PANELS; \
-	} \
-\
-	/* Set the buffer address provided by the caller to point to the memory
-	   associated with the mem_t entry acquired from the memory pool. */ \
-	*p = bli_mem_buffer( mem ); \
-}
-
-//INSERT_GENTFUNC_BASIC( packm_init_b )
-GENTFUNC( float,    s, packm_init_b )
-GENTFUNC( double,   d, packm_init_b )
-GENTFUNC( scomplex, c, packm_init_b )
-GENTFUNC( dcomplex, z, packm_init_b )
-
-
-//
-// Define BLAS-like interfaces to the variant chooser.
-//
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       conj_t           conj, \
-       dim_t            k_alloc, \
-       dim_t            n_alloc, \
-       dim_t            k, \
-       dim_t            n, \
-       dim_t            nr, \
-       ctype*  restrict kappa, \
-       ctype*  restrict b, inc_t           rs_b, inc_t           cs_b, \
-       ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
-                                                 inc_t* restrict ps_p, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
-     ) \
-{ \
-	pack_t schema; \
-	dim_t  k_max; \
-	dim_t  n_max; \
-	dim_t  pd_p; \
-\
-	/* Prepare the packing destination buffer. */ \
-	PASTECH2(bls_,ch,packm_init_mem_b) \
-	( \
-	  k_alloc, n_alloc, nr, \
-	  cntx, \
-	  thread  \
-	); \
-\
-	/* Determine the packing buffer and related parameters for matrix B. */ \
-	PASTECH2(bls_,ch,packm_init_b) \
-	( \
-	  &schema, \
-	  k, n, nr, \
-	  &k_max, &n_max, \
-	  p, rs_p,  cs_p, \
-	     &pd_p, ps_p, \
-	  bli_thrinfo_mem( thread )  \
-	); \
-\
-	/* Pack matrix B to the destination buffer chosen above. Here, the packed
-	   matrix is stored to row-stored k x NR micropanels. */ \
-	PASTECH2(bls_,ch,packm_var1) \
-	( \
-	  conj, \
-	  schema, \
-	  k, \
-	  n, \
-	  k_max, \
-	  n_max, \
-	  kappa, \
-	  b,  rs_b,  cs_b, \
-	  *p, *rs_p, *cs_p, \
-	       pd_p, *ps_p, \
-	  cntx, \
-	  bli_thrinfo_sub_prenode( thread )  \
-	); \
-\
-	/* Barrier so that packing is done before computation. */ \
-	bli_thrinfo_barrier( thread ); \
-}
-
-//INSERT_GENTFUNC_BASIC( packm_b )
-GENTFUNC( float,    s, packm_b )
-GENTFUNC( double,   d, packm_b )
-GENTFUNC( scomplex, c, packm_b )
-GENTFUNC( dcomplex, z, packm_b )
-
diff --git a/sandbox/gemmlike/bls_l3_packm_b.h b/sandbox/gemmlike/bls_l3_packm_b.h
deleted file mode 100644
index c92f63f1b..000000000
--- a/sandbox/gemmlike/bls_l3_packm_b.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2021, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       dim_t            k, \
-       dim_t            n, \
-       dim_t            nr, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
-     ); \
-
-//INSERT_GENTPROT_BASIC( packm_init_mem_b )
-GENTPROT( float,    s, packm_init_mem_b )
-GENTPROT( double,   d, packm_init_mem_b )
-GENTPROT( scomplex, c, packm_init_mem_b )
-GENTPROT( dcomplex, z, packm_init_mem_b )
-
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       pack_t* restrict schema, \
-       dim_t            k, \
-       dim_t            n, \
-       dim_t            nr, \
-       dim_t*  restrict k_max, \
-       dim_t*  restrict n_max, \
-       ctype**          p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
-                           dim_t* restrict pd_p, inc_t* restrict ps_p, \
-       mem_t*  restrict mem  \
-     ); \
-
-//INSERT_GENTPROT_BASIC( packm_init_b )
-GENTPROT( float,    s, packm_init_b )
-GENTPROT( double,   d, packm_init_b )
-GENTPROT( scomplex, c, packm_init_b )
-GENTPROT( dcomplex, z, packm_init_b )
-
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       conj_t           conj, \
-       dim_t            k_alloc, \
-       dim_t            n_alloc, \
-       dim_t            k, \
-       dim_t            n, \
-       dim_t            nr, \
-       ctype*  restrict kappa, \
-       ctype*  restrict b, inc_t           rs_b, inc_t           cs_b, \
-       ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
-                                                 inc_t* restrict ps_p, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
-     ); \
-
-//INSERT_GENTPROT_BASIC( packm_b )
-GENTPROT( float,    s, packm_b )
-GENTPROT( double,   d, packm_b )
-GENTPROT( scomplex, c, packm_b )
-GENTPROT( dcomplex, z, packm_b )
-
diff --git a/sandbox/gemmlike/bls_l3_packm_var.h b/sandbox/gemmlike/bls_l3_packm_var.h
index e686e64fc..0c4355ee7 100644
--- a/sandbox/gemmlike/bls_l3_packm_var.h
+++ b/sandbox/gemmlike/bls_l3_packm_var.h
@@ -36,39 +36,18 @@
 // Prototype BLAS-like interfaces to the variants.
 //
 
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTECH2(bls_,ch,varname) \
-     ( \
-       conj_t           conjc, \
-       pack_t           schema, \
-       dim_t            m, \
-       dim_t            n, \
-       dim_t            m_max, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict c, inc_t rs_c, inc_t cs_c, \
-       ctype*  restrict p, inc_t rs_p, inc_t cs_p, \
-                           dim_t pd_p, inc_t ps_p, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
+void bls_packm_var1
+     (
+             num_t      dt,
+             conj_t     conjc,
+             dim_t      m,
+             dim_t      n,
+             dim_t      m_max,
+             dim_t      n_max,
+       const void*      kappa,
+       const void*      c, inc_t rs_c, inc_t cs_c,
+             void*      p, inc_t rs_p, inc_t cs_p,
+                           dim_t pd_p, inc_t ps_p,
+       const cntx_t*    cntx,
+             thrinfo_t* thread
      );
-
-//INSERT_GENTPROT_BASIC( packm_var1 )
-GENTPROT( float,    s, packm_var1 )
-GENTPROT( double,   d, packm_var1 )
-GENTPROT( scomplex, c, packm_var1 )
-GENTPROT( dcomplex, z, packm_var1 )
-
-//INSERT_GENTPROT_BASIC( packm_var2 )
-GENTPROT( float,    s, packm_var2 )
-GENTPROT( double,   d, packm_var2 )
-GENTPROT( scomplex, c, packm_var2 )
-GENTPROT( dcomplex, z, packm_var2 )
-
-//INSERT_GENTPROT_BASIC( packm_var3 )
-GENTPROT( float,    s, packm_var3 )
-GENTPROT( double,   d, packm_var3 )
-GENTPROT( scomplex, c, packm_var3 )
-GENTPROT( dcomplex, z, packm_var3 )
diff --git a/sandbox/gemmlike/bls_l3_packm_var1.c b/sandbox/gemmlike/bls_l3_packm_var1.c
index 9cfab59c5..ab656a31a 100644
--- a/sandbox/gemmlike/bls_l3_packm_var1.c
+++ b/sandbox/gemmlike/bls_l3_packm_var1.c
@@ -38,151 +38,136 @@
 // Variant 1 provides basic support for packing by calling packm_cxk().
 //
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTECH2(bls_,ch,varname) \
-     ( \
-       conj_t           conjc, \
-       pack_t           schema, \
-       dim_t            m, \
-       dim_t            n, \
-       dim_t            m_max, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict c, inc_t rs_c, inc_t cs_c, \
-       ctype*  restrict p, inc_t rs_p, inc_t cs_p, \
-                           dim_t pd_p, inc_t ps_p, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict p_cast     = p; \
-\
-	dim_t           iter_dim; \
-	dim_t           n_iter; \
-	dim_t           it, ic; \
-	dim_t           ic0; \
-	doff_t          ic_inc; \
-	dim_t           panel_len; \
-	dim_t           panel_len_max; \
-	dim_t           panel_dim; \
-	dim_t           panel_dim_max; \
-	inc_t           incc; \
-	inc_t           ldc; \
-	inc_t           ldp; \
-\
-	/* Create flags to incidate row or column storage. Note that the
-	   schema bit that encodes row or column is describing the form of
-	   micro-panel, not the storage in the micro-panel. Hence the
-	   mismatch in "row" and "column" semantics. */ \
-	bool row_stored = bli_is_col_packed( schema ); \
-	/*bool col_stored = bli_is_row_packed( schema );*/ \
-\
-	/* If the row storage flag indicates row storage, then we are packing
-	   to column panels; otherwise, if the strides indicate column storage,
-	   we are packing to row panels. */ \
-	if ( row_stored ) \
-	{ \
-		/* Prepare to pack to row-stored column panels. */ \
-		iter_dim       = n; \
-		panel_len      = m; \
-		panel_len_max  = m_max; \
-		panel_dim_max  = pd_p; \
-		incc           = cs_c; \
-		ldc            = rs_c; \
-		ldp            = rs_p; \
-	} \
-	else /* if ( col_stored ) */ \
-	{ \
-		/* Prepare to pack to column-stored row panels. */ \
-		iter_dim       = m; \
-		panel_len      = n; \
-		panel_len_max  = n_max; \
-		panel_dim_max  = pd_p; \
-		incc           = rs_c; \
-		ldc            = cs_c; \
-		ldp            = cs_p; \
-	} \
-\
-	/* Compute the total number of iterations we'll need. */ \
-	n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
-\
-	/* Set the initial values and increments for indices related to C and P
-	   based on whether reverse iteration was requested. */ \
-	{ \
-		ic0    = 0; \
-		ic_inc = panel_dim_max; \
-	} \
-\
-	ctype* restrict p_begin = p_cast; \
-\
-	/* Query the number of threads and thread ids from the current thread's
-	   packm thrinfo_t node. */ \
-	const dim_t nt  = bli_thrinfo_n_way( thread ); \
-	const dim_t tid = bli_thrinfo_work_id( thread ); \
-\
-	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
-	( void )nt; \
-	( void )tid; \
-\
-	dim_t it_start, it_end, it_inc; \
-\
-	/* Determine the thread range and increment using the current thread's
-	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr()
-	   will depend on whether slab or round-robin partitioning was requested
-	   at configure-time. */ \
-	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
-\
-	/* Iterate over every logical micropanel in the source matrix. */ \
-	for ( ic  = ic0,    it  = 0; it < n_iter; \
-	      ic += ic_inc, it += 1 ) \
-	{ \
-		panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \
-\
-		ctype* restrict c_begin = c_cast   + (ic  )*incc; \
-\
-		ctype* restrict c_use = c_begin; \
-		ctype* restrict p_use = p_begin; \
-\
-		/* The definition of bli_is_my_iter() will depend on whether slab
-		   or round-robin partitioning was requested at configure-time. (The
-		   default is slab.) */ \
-		if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \
-		{ \
-			PASTECH2(bls_,ch,packm_cxk) \
-			( \
-			  conjc, \
-			  schema, \
-			  panel_dim, \
-			  panel_dim_max, \
-			  panel_len, \
-			  panel_len_max, \
-			  kappa_cast, \
-			  c_use, incc, ldc, \
-			  p_use,       ldp, \
-			  cntx  \
-			); \
-		} \
-\
-/*
-if ( !row_stored ) \
-PASTEMAC(ch,fprintm)( stdout, "packm_var1: a packed", panel_dim_max, panel_len_max, \
-                               p_use, rs_p, cs_p, "%5.2f", "" ); \
-else \
-PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_max, \
-                               p_use, rs_p, cs_p, "%5.2f", "" ); \
-*/ \
-\
-		p_begin += ps_p; \
-	} \
-}
+void bls_packm_var1
+     (
+             num_t      dt,
+             conj_t     conjc,
+             dim_t      m,
+             dim_t      n,
+             dim_t      m_max,
+             dim_t      n_max,
+       const void*      kappa,
+       const void*      c, inc_t rs_c, inc_t cs_c,
+             void*      p, inc_t rs_p, inc_t cs_p,
+                           dim_t pd_p, inc_t ps_p,
+       const cntx_t*    cntx,
+             thrinfo_t* thread
+     )
+{
+	( void )m_max;
+	( void )rs_p;
+
+	const char* c_cast  = c;
+	      char* p_cast  = p;
+
+	dim_t iter_dim      = m;
+	dim_t panel_len     = n;
+	dim_t panel_len_max = n_max;
+	dim_t panel_dim_max = pd_p;
+	inc_t incc          = rs_c;
+	inc_t ldc           = cs_c;
+	inc_t ldp           = cs_p;
+	dim_t dt_size       = bli_dt_size( dt );
+
+	packm_cxk_ker_ft f  = bli_cntx_get_ukr2_dt( dt, dt, BLIS_PACKM_KER, cntx );
+
+	// Compute the total number of iterations we'll need.
+	dim_t n_iter  = ( iter_dim + panel_dim_max - 1) / panel_dim_max;
+	char* p_begin = p_cast;
+
+	// Query the number of threads and thread ids from the current thread's
+	// packm thrinfo_t node.
+	const dim_t nt  = bli_thrinfo_num_threads( thread );
+	const dim_t tid = bli_thrinfo_thread_id( thread );
+
+	// Suppress warnings in case tid isn't used (ie: as in slab partitioning).
+	( void )nt;
+	( void )tid;
 
-//INSERT_GENTFUNC_BASIC( packm_var1 )
-GENTFUNC( float,    s, packm_var1 )
-GENTFUNC( double,   d, packm_var1 )
-GENTFUNC( scomplex, c, packm_var1 )
-GENTFUNC( dcomplex, z, packm_var1 )
+	// Determine the thread range and increment using the current thread's
+	// packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr()
+	// will depend on whether slab or round-robin partitioning was requested
+	// at configure-time.
+	dim_t it_start, it_end, it_inc;
+	bli_thread_range_slrr( tid, nt, n_iter, 1, FALSE, &it_start, &it_end, &it_inc );
+
+	// Iterate over every logical micropanel in the source matrix.
+	for ( dim_t ic  = 0, it  = 0; it < n_iter; ic += panel_dim_max, it += 1 )
+	{
+		dim_t panel_dim = bli_min( panel_dim_max, iter_dim - ic );
+
+		const char* c_begin = c_cast + ic*incc*dt_size;
+
+		const char* c_use = c_begin;
+		      char* p_use = p_begin;
+
+		// The definition of bli_is_my_iter() will depend on whether slab
+		// or round-robin partitioning was requested at configure-time. (The
+		// default is slab.)
+		if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) )
+		{
+			f
+			(
+			  conjc,
+			  BLIS_PACKED_PANELS,
+			  panel_dim,
+			  panel_dim_max,
+			  1, // TODO: this shouldn't be hard-coded.
+			  panel_len,
+			  panel_len_max,
+			  kappa,
+			  c_use, incc, ldc,
+			  p_use,       ldp,
+			  NULL,
+			  cntx
+			);
+
+			// The packing microkernel f is equivalent to (where ctype and ch represent
+			// the data type and type character, e.g. `float` and `s`):
+			//
+			//  ctype  kappa_cast = *( ctype* )kappa;
+			//  ctype* c_cast     = ( ctype* )c_use;
+			//  ctype* p_cast     = ( ctype* )p_use;
+			//
+			//  // Perform the packing, taking conjc into account.
+			//  if ( bli_is_conj( conjc ) )
+			//  {
+			//  	for ( dim_t l = 0; l < panel_len; ++l )
+			//  	{
+			//  		for ( dim_t i = 0; i < panel_dim; ++i )
+			//  		{
+			//  			ctype* cli = c_cast + (l  )*ldc + (i  )*incc;
+			//  			ctype* pli = p_cast + (l  )*ldp + (i  )*1;
+			//
+			//  			PASTEMAC(ch,axpyjs)( kappa_cast, *cli, *pli );
+			//  		}
+			//  	}
+			//  }
+			//  else
+			//  {
+			//  	for ( dim_t l = 0; l < panel_len; ++l )
+			//  	{
+			//  		for ( dim_t i = 0; i < panel_dim; ++i )
+			//  		{
+			//  			ctype* cli = c_cast + (l  )*ldc + (i  )*incc;
+			//  			ctype* pli = p_cast + (l  )*ldp + (i  )*1;
+			//
+			//  			PASTEMAC(ch,axpys)( kappa_cast, *cli, *pli );
+			//  		}
+			//  	}
+			//  }
+			//
+			//  // If panel_dim < panel_dim_max and/or panel_len < panel_len_max,
+			//  // then we zero those unused rows/columns.
+			//  PASTEMAC(ch,set0s_edge)
+			//  (
+			//    panel_dim, panel_dim_max,
+			//    panel_len, panel_len_max,
+			//    p_cast, ldp
+			//  );
+		}
+
+		p_begin += ps_p*dt_size;
+	}
+}
 
diff --git a/sandbox/gemmlike/bls_l3_packm_var2.c b/sandbox/gemmlike/bls_l3_packm_var2.c
deleted file mode 100644
index 96d041a1a..000000000
--- a/sandbox/gemmlike/bls_l3_packm_var2.c
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2021, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-//
-// Variant 2 is similar to variant 1, but inlines the contents of packm_cxk().
-//
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTECH2(bls_,ch,varname) \
-     ( \
-       conj_t           conjc, \
-       pack_t           schema, \
-       dim_t            m, \
-       dim_t            n, \
-       dim_t            m_max, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict c, inc_t rs_c, inc_t cs_c, \
-       ctype*  restrict p, inc_t rs_p, inc_t cs_p, \
-                           dim_t pd_p, inc_t ps_p, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict p_cast     = p; \
-\
-	dim_t           iter_dim; \
-	dim_t           n_iter; \
-	dim_t           it, ic; \
-	dim_t           ic0; \
-	doff_t          ic_inc; \
-	dim_t           panel_len; \
-	dim_t           panel_len_max; \
-	dim_t           panel_dim; \
-	dim_t           panel_dim_max; \
-	inc_t           incc; \
-	inc_t           ldc; \
-	inc_t           ldp; \
-\
-	/* Create flags to incidate row or column storage. Note that the
-	   schema bit that encodes row or column is describing the form of
-	   micro-panel, not the storage in the micro-panel. Hence the
-	   mismatch in "row" and "column" semantics. */ \
-	bool row_stored = bli_is_col_packed( schema ); \
-	/*bool col_stored = bli_is_row_packed( schema );*/ \
-\
-	/* If the row storage flag indicates row storage, then we are packing
-	   to column panels; otherwise, if the strides indicate column storage,
-	   we are packing to row panels. */ \
-	if ( row_stored ) \
-	{ \
-		/* Prepare to pack to row-stored column panels. */ \
-		iter_dim       = n; \
-		panel_len      = m; \
-		panel_len_max  = m_max; \
-		panel_dim_max  = pd_p; \
-		incc           = cs_c; \
-		ldc            = rs_c; \
-		ldp            = rs_p; \
-	} \
-	else /* if ( col_stored ) */ \
-	{ \
-		/* Prepare to pack to column-stored row panels. */ \
-		iter_dim       = m; \
-		panel_len      = n; \
-		panel_len_max  = n_max; \
-		panel_dim_max  = pd_p; \
-		incc           = rs_c; \
-		ldc            = cs_c; \
-		ldp            = cs_p; \
-	} \
-\
-	/* Compute the total number of iterations we'll need. */ \
-	n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
-\
-	/* Set the initial values and increments for indices related to C and P
-	   based on whether reverse iteration was requested. */ \
-	{ \
-		ic0    = 0; \
-		ic_inc = panel_dim_max; \
-	} \
-\
-	ctype* restrict p_begin = p_cast; \
-\
-	/* Query the number of threads and thread ids from the current thread's
-	   packm thrinfo_t node. */ \
-	const dim_t nt  = bli_thrinfo_n_way( thread ); \
-	const dim_t tid = bli_thrinfo_work_id( thread ); \
-\
-	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
-	( void )nt; \
-	( void )tid; \
-\
-	dim_t it_start, it_end, it_inc; \
-\
-	/* Determine the thread range and increment using the current thread's
-	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr()
-	   will depend on whether slab or round-robin partitioning was requested
-	   at configure-time. */ \
-	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
-\
-	/* Iterate over every logical micropanel in the source matrix. */ \
-	for ( ic  = ic0,    it  = 0; it < n_iter; \
-	      ic += ic_inc, it += 1 ) \
-	{ \
-		panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \
-\
-		ctype* restrict c_begin = c_cast   + (ic  )*incc; \
-\
-		ctype* restrict c_use = c_begin; \
-		ctype* restrict p_use = p_begin; \
-\
-		/* The definition of bli_is_my_iter() will depend on whether slab
-		   or round-robin partitioning was requested at configure-time. (The
-		   default is slab.) */ \
-		if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \
-		{ \
-			/* NOTE: We assume here that kappa = 1 and therefore ignore it. If
-			   we're wrong, this will get someone's attention. */ \
-			if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) \
-				bli_abort(); \
-\
-			/* Perform the packing, taking conjc into account. */ \
-			if ( bli_is_conj( conjc ) ) \
-			{ \
-				for ( dim_t l = 0; l < panel_len; ++l ) \
-				{ \
-					for ( dim_t i = 0; i < panel_dim; ++i ) \
-					{ \
-						ctype* cli = c_use + (l  )*ldc + (i  )*incc; \
-						ctype* pli = p_use + (l  )*ldp + (i  )*1; \
-\
-						PASTEMAC(ch,copyjs)( *cli, *pli ); \
-					} \
-				} \
-			} \
-			else \
-			{ \
-				for ( dim_t l = 0; l < panel_len; ++l ) \
-				{ \
-					for ( dim_t i = 0; i < panel_dim; ++i ) \
-					{ \
-						ctype* cli = c_use + (l  )*ldc + (i  )*incc; \
-						ctype* pli = p_use + (l  )*ldp + (i  )*1; \
-\
-						PASTEMAC(ch,copys)( *cli, *pli ); \
-					} \
-				} \
-			} \
-\
-			/* If panel_dim < panel_dim_max, then we zero those unused rows. */ \
-			if ( panel_dim < panel_dim_max ) \
-			{ \
-				const dim_t     i      = panel_dim; \
-				const dim_t     m_edge = panel_dim_max - panel_dim; \
-				const dim_t     n_edge = panel_len_max; \
-				ctype* restrict p_edge = p_use + (i  )*1; \
-\
-				PASTEMAC(ch,set0s_mxn) \
-				( \
-				  m_edge, \
-				  n_edge, \
-				  p_edge, 1, ldp  \
-				); \
-			} \
-\
-			/* If panel_len < panel_len_max, then we zero those unused columns. */ \
-			if ( panel_len < panel_len_max ) \
-			{ \
-				const dim_t     j      = panel_len; \
-				const dim_t     m_edge = panel_dim_max; \
-				const dim_t     n_edge = panel_len_max - panel_len; \
-				ctype* restrict p_edge = p_use + (j  )*ldp; \
-\
-				PASTEMAC(ch,set0s_mxn) \
-				( \
-				  m_edge, \
-				  n_edge, \
-				  p_edge, 1, ldp  \
-				); \
-			} \
-		} \
-\
-/*
-if ( !row_stored ) \
-PASTEMAC(ch,fprintm)( stdout, "packm_var1: a packed", panel_dim_max, panel_len_max, \
-                               p_use, rs_p, cs_p, "%5.2f", "" ); \
-else \
-PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_max, \
-                               p_use, rs_p, cs_p, "%5.2f", "" ); \
-*/ \
-\
-		p_begin += ps_p; \
-	} \
-}
-
-//INSERT_GENTFUNC_BASIC( packm_var1 )
-GENTFUNC( float,    s, packm_var2 )
-GENTFUNC( double,   d, packm_var2 )
-GENTFUNC( scomplex, c, packm_var2 )
-GENTFUNC( dcomplex, z, packm_var2 )
-
diff --git a/sandbox/gemmlike/bls_l3_packm_var3.c b/sandbox/gemmlike/bls_l3_packm_var3.c
deleted file mode 100644
index 6ee209291..000000000
--- a/sandbox/gemmlike/bls_l3_packm_var3.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2021, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-//
-// Variant 3 is similar to variant 1, except that it parallelizes packing
-// along the k dimension. (Our current hypothesis is that this method of
-// parallelizing the operation may perform better on some NUMA systems.)
-//
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTECH2(bls_,ch,varname) \
-     ( \
-       conj_t           conjc, \
-       pack_t           schema, \
-       dim_t            m, \
-       dim_t            n, \
-       dim_t            m_max, \
-       dim_t            n_max, \
-       ctype*  restrict kappa, \
-       ctype*  restrict c, inc_t rs_c, inc_t cs_c, \
-       ctype*  restrict p, inc_t rs_p, inc_t cs_p, \
-                           dim_t pd_p, inc_t ps_p, \
-       cntx_t* restrict cntx, \
-       thrinfo_t* restrict thread  \
-     ) \
-{ \
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict p_cast     = p; \
-\
-	dim_t           iter_dim; \
-	dim_t           n_iter; \
-	dim_t           it, ic; \
-	dim_t           ic0; \
-	doff_t          ic_inc; \
-	dim_t           panel_len; \
-	dim_t           panel_len_max; \
-	dim_t           panel_dim; \
-	dim_t           panel_dim_max; \
-	inc_t           incc; \
-	inc_t           ldc; \
-	inc_t           ldp; \
-\
-	/* Create flags to incidate row or column storage. Note that the
-	   schema bit that encodes row or column is describing the form of
-	   micro-panel, not the storage in the micro-panel. Hence the
-	   mismatch in "row" and "column" semantics. */ \
-	bool row_stored = bli_is_col_packed( schema ); \
-	/*bool col_stored = bli_is_row_packed( schema );*/ \
-\
-	/* If the row storage flag indicates row storage, then we are packing
-	   to column panels; otherwise, if the strides indicate column storage,
-	   we are packing to row panels. */ \
-	if ( row_stored ) \
-	{ \
-		/* Prepare to pack to row-stored column panels. */ \
-		iter_dim       = n; \
-		panel_len      = m; \
-		panel_len_max  = m_max; \
-		panel_dim_max  = pd_p; \
-		incc           = cs_c; \
-		ldc            = rs_c; \
-		ldp            = rs_p; \
-	} \
-	else /* if ( col_stored ) */ \
-	{ \
-		/* Prepare to pack to column-stored row panels. */ \
-		iter_dim       = m; \
-		panel_len      = n; \
-		panel_len_max  = n_max; \
-		panel_dim_max  = pd_p; \
-		incc           = rs_c; \
-		ldc            = cs_c; \
-		ldp            = cs_p; \
-	} \
-\
-	/* Compute the total number of iterations we'll need. */ \
-	n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
-\
-	/* Set the initial values and increments for indices related to C and P
-	   based on whether reverse iteration was requested. */ \
-	{ \
-		ic0    = 0; \
-		ic_inc = panel_dim_max; \
-	} \
-\
-	/* Query the number of threads and thread ids from the current thread's
-	   packm thrinfo_t node. */ \
-	const dim_t nt  = bli_thrinfo_n_way( thread ); \
-	const dim_t tid = bli_thrinfo_work_id( thread ); \
-\
-	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
-	( void )nt; \
-	( void )tid; \
-\
-	dim_t pr_start, pr_end; \
-\
-	/* Determine the thread range and increment using the current thread's
-	   packm thrinfo_t node. */ \
-	bli_thread_range_sub( thread, panel_len, 1, FALSE, &pr_start, &pr_end ); \
-\
-	/* Define instances of panel_len and panel_len_max that are specific to
-	   the local thread. */ \
-	dim_t panel_len_loc     = pr_end - pr_start; \
-	dim_t panel_len_max_loc = panel_len_loc; \
-\
-	/* If panel_len_max > panel_len, then there are some columns in p that
-	   need to be zeroed. Of course, only the last thread will be responsible
-	   for this edge region. */ \
-	dim_t panel_len_zero = panel_len_max - panel_len; \
-	if ( tid == nt - 1 ) panel_len_max_loc += panel_len_zero; \
-\
-	/* Shift the pointer for c and p to the appropriate locations within the
-	   first micropanel. */ \
-	dim_t off_loc = pr_start; \
-	ctype* restrict c_begin_loc = c_cast + off_loc * ldc; \
-	ctype* restrict p_begin_loc = p_cast + off_loc * ldp; \
-\
-	/* Iterate over every logical micropanel in the source matrix. */ \
-	for ( ic  = ic0,    it  = 0; it < n_iter; \
-	      ic += ic_inc, it += 1 ) \
-	{ \
-		panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \
-\
-		ctype* restrict c_use = c_begin_loc + (ic  )*incc; \
-		ctype* restrict p_use = p_begin_loc + (it  )*ps_p; \
-\
-		{ \
-			PASTECH2(bls_,ch,packm_cxk) \
-			( \
-			  conjc, \
-			  schema, \
-			  panel_dim, \
-			  panel_dim_max, \
-			  panel_len_loc, \
-			  panel_len_max_loc, \
-			  kappa_cast, \
-			  c_use, incc, ldc, \
-			  p_use,       ldp, \
-			  cntx  \
-			); \
-		} \
-	} \
-}
-
-//INSERT_GENTFUNC_BASIC( packm_var3 )
-GENTFUNC( float,    s, packm_var3 )
-GENTFUNC( double,   d, packm_var3 )
-GENTFUNC( scomplex, c, packm_var3 )
-GENTFUNC( dcomplex, z, packm_var3 )
-
-/*
-if ( !row_stored ) \
-PASTEMAC(ch,fprintm)( stdout, "packm_var3: a packed", panel_dim_max, panel_len_max, \
-                               p_use, rs_p, cs_p, "%5.2f", "" ); \
-else \
-PASTEMAC(ch,fprintm)( stdout, "packm_var3: b packed", panel_len_max, panel_dim_max, \
-                               p_use, rs_p, cs_p, "%5.2f", "" ); \
-*/
-
diff --git a/sandbox/gemmlike/bls_packm_cxk.c b/sandbox/gemmlike/bls_packm_cxk.c
deleted file mode 100644
index f987a6a7d..000000000
--- a/sandbox/gemmlike/bls_packm_cxk.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-void PASTECH2(bls_,ch,opname) \
-     ( \
-       conj_t  conja, \
-       pack_t  schema, \
-       dim_t   panel_dim, \
-       dim_t   panel_dim_max, \
-       dim_t   panel_len, \
-       dim_t   panel_len_max, \
-       ctype*  kappa, \
-       ctype*  a, inc_t inca, inc_t lda, \
-       ctype*  p,             inc_t ldp, \
-       cntx_t* cntx  \
-     ) \
-{ \
-	/* Note that we use panel_dim_max, not panel_dim, to query the packm
-	   kernel function pointer. This means that we always use the same
-	   kernel, even for edge cases. */ \
-	num_t dt     = PASTEMAC(ch,type); \
-	ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \
-	                                           : BLIS_PACKM_MRXK_KER; \
-\
-	/* Query the context for the packm kernel corresponding to the current
-	   panel dimension, or kernel id. If the id is invalid, the function will
-	   return NULL. */ \
-	packm_cxk_ker_ft f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
-\
-	/* If there exists a kernel implementation for the micro-panel dimension
-	   provided, we invoke the implementation. Otherwise, we use scal2m. */ \
-	/* NOTE: We've disabled calling packm micro-kernels from the context for
-	   this implementation. To re-enable, change FALSE to TRUE in the
-	   conditional below. */ \
-	if ( f != NULL && FALSE ) \
-	{ \
-		f \
-		( \
-		  conja, \
-		  schema, \
-		  panel_dim, \
-		  panel_len, \
-		  panel_len_max, \
-		  kappa, \
-		  a, inca, lda, \
-		  p,       ldp, \
-		  cntx  \
-		); \
-	} \
-	else \
-	{ \
-		/* NOTE: We assume here that kappa = 1 and therefore ignore it. If
-		   we're wrong, this will get someone's attention. */ \
-		if ( !PASTEMAC(ch,eq1)( *kappa ) ) \
-			bli_abort(); \
-\
-		/* Perform the packing, taking conja into account. */ \
-		if ( bli_is_conj( conja ) ) \
-		{ \
-			for ( dim_t l = 0; l < panel_len; ++l ) \
-			{ \
-				for ( dim_t i = 0; i < panel_dim; ++i ) \
-				{ \
-					ctype* ali = a + (l  )*lda + (i  )*inca; \
-					ctype* pli = p + (l  )*ldp + (i  )*1; \
-\
-					PASTEMAC(ch,copyjs)( *ali, *pli ); \
-				} \
-			} \
-		} \
-		else \
-		{ \
-			for ( dim_t l = 0; l < panel_len; ++l ) \
-			{ \
-				for ( dim_t i = 0; i < panel_dim; ++i ) \
-				{ \
-					ctype* ali = a + (l  )*lda + (i  )*inca; \
-					ctype* pli = p + (l  )*ldp + (i  )*1; \
-\
-					PASTEMAC(ch,copys)( *ali, *pli ); \
-				} \
-			} \
-		} \
-\
-		/* If panel_dim < panel_dim_max, then we zero those unused rows. */ \
-		if ( panel_dim < panel_dim_max ) \
-		{ \
-			const dim_t     i      = panel_dim; \
-			const dim_t     m_edge = panel_dim_max - panel_dim; \
-			const dim_t     n_edge = panel_len_max; \
-			ctype* restrict p_edge = p + (i  )*1; \
-\
-			PASTEMAC(ch,set0s_mxn) \
-			( \
-			  m_edge, \
-			  n_edge, \
-			  p_edge, 1, ldp  \
-			); \
-		} \
-\
-		/* If panel_len < panel_len_max, then we zero those unused columns. */ \
-		if ( panel_len < panel_len_max ) \
-		{ \
-			const dim_t     j      = panel_len; \
-			const dim_t     m_edge = panel_dim_max; \
-			const dim_t     n_edge = panel_len_max - panel_len; \
-			ctype* restrict p_edge = p + (j  )*ldp; \
-\
-			PASTEMAC(ch,set0s_mxn) \
-			( \
-			  m_edge, \
-			  n_edge, \
-			  p_edge, 1, ldp  \
-			); \
-		} \
-	} \
-}
-
-//INSERT_GENTFUNC_BASIC( packm_cxk )
-GENTFUNC( float,    s, packm_cxk )
-GENTFUNC( double,   d, packm_cxk )
-GENTFUNC( scomplex, c, packm_cxk )
-GENTFUNC( dcomplex, z, packm_cxk )
-
diff --git a/test/thread_ranges/test_ranges.c b/test/thread_ranges/test_ranges.c
index b597ab300..752ff1ea3 100644
--- a/test/thread_ranges/test_ranges.c
+++ b/test/thread_ranges/test_ranges.c
@@ -114,7 +114,6 @@ int main( int argc, char** argv )
 	char   out_ch;
 
 	obj_t   a;
-	blksz_t bfs;
 
 	thrinfo_t thrinfo;
 	dim_t  m, n;
@@ -272,7 +271,7 @@ int main( int argc, char** argv )
 		else               n = ( dim_t )n_input;
 
 		dt = BLIS_DOUBLE;
-		
+
 		bli_obj_create( dt, m, n, 0, 0, &a );
 
 		bli_obj_set_struc( BLIS_TRIANGULAR, &a );
@@ -281,8 +280,6 @@ int main( int argc, char** argv )
 
 		bli_randm( &a );
 
-		bli_blksz_init_easy( &bfs, bf, bf, bf, bf );
-
 		printf( "%4u x %4u  ", ( unsigned )m, ( unsigned )n );
 
 		for ( t = t_begin; t != t_stop; t += t_inc )
@@ -291,13 +288,13 @@ int main( int argc, char** argv )
 			thrinfo.work_id = t;
 
 			if      ( part_n_dim && go_fwd )
-				area = bli_thread_range_weighted_l2r( &thrinfo, &a, &bfs, &start, &end );
+				area = bli_thread_range( &thrinfo, &a, bf, BLIS_N, BLIS_FWD, TRUE, &start, &end );
 			else if ( part_n_dim && go_bwd )
-				area = bli_thread_range_weighted_r2l( &thrinfo, &a, &bfs, &start, &end );
+				area = bli_thread_range( &thrinfo, &a, bf, BLIS_N, BLIS_BWD, TRUE, &start, &end );
 			else if ( part_m_dim && go_fwd )
-				area = bli_thread_range_weighted_t2b( &thrinfo, &a, &bfs, &start, &end );
+				area = bli_thread_range( &thrinfo, &a, bf, BLIS_M, BLIS_FWD, TRUE, &start, &end );
 			else // ( part_m_dim && go_bwd )
-				area = bli_thread_range_weighted_b2t( &thrinfo, &a, &bfs, &start, &end );
+				area = bli_thread_range( &thrinfo, &a, bf, BLIS_M, BLIS_BWD, TRUE, &start, &end );
 
 			width = end - start;
 
diff --git a/testsuite/check-blistest.sh b/testsuite/check-blistest.sh
index 1b5a50962..355718002 100755
--- a/testsuite/check-blistest.sh
+++ b/testsuite/check-blistest.sh
@@ -39,17 +39,31 @@ ansi_green="\033[0;32m"
 ansi_normal="\033[0m"
 
 passmsg="All BLIS tests passed!"
+exitmsg0="The BLIS testsuite failed to exit normally. :("
 failmsg0="At least one BLIS test failed. :("
 failmsg1="Please see output.testsuite for details."
 
-grep -q FAILURE $1
+# First make sure that the testsuite completed normally (e.g. did not abort()
+# or segfault).
+grep -q 'Exiting normally' $1
 
+# The testsuite did not complete if the exit status from grep was *not* 0.
+if [ $? -ne 0 ]; then
+    printf "${ansi_red}""${script_name}: ${exitmsg0}""${ansi_normal}\n"
+    exit 1
+fi
+
+# If the testsuite completed normally, check for numerical failures.
+grep -q 'FAILURE' $1
+
+# A numerical failure was detected if the exit status from grep was 0.
 if [ $? -eq 0 ]; then
-	printf "${ansi_red}""${script_name}: ${failmsg0}""${ansi_normal}\n"
-	printf "${ansi_red}""${script_name}: ${failmsg1}""${ansi_normal}\n"
-	exit 1
+    printf "${ansi_red}""${script_name}: ${failmsg0}""${ansi_normal}\n"
+    printf "${ansi_red}""${script_name}: ${failmsg1}""${ansi_normal}\n"
+    exit 1
 else
-	printf "${ansi_green}""${script_name}: ${passmsg}""${ansi_normal}\n"
-	exit 0
+    printf "${ansi_green}""${script_name}: ${passmsg}""${ansi_normal}\n"
+    exit 0
 fi
 
+
diff --git a/testsuite/input.general.mixed b/testsuite/input.general.mixed
index 36a3e62a6..414fcb8a1 100644
--- a/testsuite/input.general.mixed
+++ b/testsuite/input.general.mixed
@@ -28,7 +28,7 @@ sdcz    # Datatype(s) to test:
 1       # Test gemm with mixed-domain operands?
 1       # Test gemm with mixed-precision operands?
 100     # Problem size: first to test
-500     # Problem size: maximum to test
+100     # Problem size: maximum to test
 100     # Problem size: increment between experiments
         # Complex level-3 implementations to test:
 1       #   1m   ('1' = enable; '0' = disable)
diff --git a/testsuite/input.operations.mixed b/testsuite/input.operations.mixed
index eb851b786..baab1ea47 100644
--- a/testsuite/input.operations.mixed
+++ b/testsuite/input.operations.mixed
@@ -88,12 +88,12 @@
 
 # --- Section overrides ----------------------------------------------------
 
-1        # Utility
-1        # Level-1v kernels
-1        # Level-1m
-1        # Level-1f kernels
-1        # Level-2
-1        # Level-3 micro-kernels
+0        # Utility
+0        # Level-1v kernels
+0        # Level-1m
+0        # Level-1f kernels
+0        # Level-2
+0        # Level-3 micro-kernels
 1        # Level-3
 
 
@@ -284,47 +284,47 @@
 
 # --- Level-3 --------------------------------------------------------------
 
-2        # gemm
+1        # gemm
 -1 -1 -1 #   dimensions: m n k
 nn       #   parameters: transa transb
 
 1        # gemmt
 -1 -1    #   dimensions: m k
-???      #   parameters: uploc transa transb
+?nn      #   parameters: uploc transa transb
 
 1        # hemm
 -1 -1    #   dimensions: m n
-????     #   parameters: side uploa conja transb
+??nn     #   parameters: side uploa conja transb
 
 1        # herk
 -1 -1    #   dimensions: m k
-??       #   parameters: uploc transa
+?n       #   parameters: uploc transa
 
 1        # her2k
 -1 -1    #   dimensions: m k
-???      #   parameters: uploc transa transb
+?nn      #   parameters: uploc transa transb
 
 1        # symm
 -1 -1    #   dimensions: m n
-????     #   parameters: side uploa conja transb
+??nn     #   parameters: side uploa conja transb
 
 1        # syrk
 -1 -1    #   dimensions: m k
-??       #   parameters: uploc transa
+?n       #   parameters: uploc transa
 
 1        # syr2k
 -1 -1    #   dimensions: m k
-???      #   parameters: uploc transa transb
+?nn      #   parameters: uploc transa transb
 
 1        # trmm
 -1 -1    #   dimensions: m n
-????     #   parameters: side uploa transa diaga
+??n?     #   parameters: side uploa transa diaga
 
 1        # trmm3
 -1 -1    #   dimensions: m n
-????n    #   parameters: side uploa transa diaga transb
+??n?n    #   parameters: side uploa transa diaga transb
 
 1        # trsm
 -1 -1    #   dimensions: m n
-????     #   parameters: side uploa transa diaga
+??n?     #   parameters: side uploa transa diaga
 
diff --git a/testsuite/src/test_amaxv.c b/testsuite/src/test_amaxv.c
index c8e740b24..6d2588851 100644
--- a/testsuite/src/test_amaxv.c
+++ b/testsuite/src/test_amaxv.c
@@ -342,7 +342,7 @@ GENFRONT( amaxv, amaxv_test )
 #undef  GENFRONT
 #define GENFRONT( tname, opname ) \
 \
-void PASTEMAC0(opname) \
+void PASTEMAC(opname) \
      ( \
        obj_t*  x, \
        obj_t*  index  \
@@ -431,7 +431,7 @@ void PASTEMAC(ch,varname) \
 			ctype* chi1 = x + (i  )*incx; \
 \
 			/* Get the real and imaginary components of chi1. */ \
-			PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
+			PASTEMAC(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
 \
 			/* Replace chi1_r and chi1_i with their absolute values. */ \
 			PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \
diff --git a/testsuite/src/test_gemm.c b/testsuite/src/test_gemm.c
index 65f910f9b..81cdfb794 100644
--- a/testsuite/src/test_gemm.c
+++ b/testsuite/src/test_gemm.c
@@ -67,19 +67,6 @@ void libblis_test_gemm_experiment
        double*        resid
      );
 
-void libblis_test_gemm_md
-     (
-       test_params_t* params,
-       test_op_t*     op,
-       iface_t        iface,
-       char*          dc_str,
-       char*          pc_str,
-       char*          sc_str,
-       unsigned int   p_cur,
-       double*        perf,
-       double*        resid
-     );
-
 void libblis_test_gemm_impl
      (
        iface_t   iface,
@@ -102,25 +89,6 @@ void libblis_test_gemm_check
        double*        resid
      );
 
-void libblis_test_gemm_md_check
-     (
-       test_params_t* params,
-       obj_t*         alpha,
-       obj_t*         a,
-       obj_t*         b,
-       obj_t*         beta,
-       obj_t*         c,
-       obj_t*         c_orig,
-       double*        resid
-     );
-
-double libblis_test_gemm_flops
-     (
-       obj_t* a,
-       obj_t* b,
-       obj_t* c
-     );
-
 
 void libblis_test_gemm_deps
      (
@@ -176,7 +144,6 @@ void libblis_test_gemm
 }
 
 
-
 void libblis_test_gemm_experiment
      (
        test_params_t* params,
@@ -196,131 +163,7 @@ void libblis_test_gemm_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
-	num_t        datatype;
-
-	dim_t        m, n, k;
-
-	trans_t      transa;
-	trans_t      transb;
-
-	obj_t        alpha, a, b, beta, c;
-	obj_t        c_save;
-
-
-	// Use a different function to handle mixed datatypes.
-	if ( params->mixed_domain || params->mixed_precision )
-	{
-		libblis_test_gemm_md( params, op, iface,
-		                      dc_str, pc_str, sc_str,
-		                      p_cur, perf, resid );
-		return;
-	}
-
-	// Use the datatype of the first char in the datatype combination string.
-	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
-
-	// Map the dimension specifier to actual dimensions.
-	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
-	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
-	k = libblis_test_get_dim_from_prob_size( op->dim_spec[2], p_cur );
-
-	// Map parameter characters to BLIS constants.
-	bli_param_map_char_to_blis_trans( pc_str[0], &transa );
-	bli_param_map_char_to_blis_trans( pc_str[1], &transb );
-
-	// Create test scalars.
-	bli_obj_scalar_init_detached( datatype, &alpha );
-	bli_obj_scalar_init_detached( datatype, &beta );
-
-	// Create test operands (vectors and/or matrices).
-	libblis_test_mobj_create( params, datatype, transa,
-	                          sc_str[1], m, k, &a );
-	libblis_test_mobj_create( params, datatype, transb,
-	                          sc_str[2], k, n, &b );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
-	                          sc_str[0], m, n, &c );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
-	                          sc_str[0], m, n, &c_save );
-
-	// Set alpha and beta.
-	if ( bli_obj_is_real( &c ) )
-	{
-		bli_setsc(  1.2,  0.0, &alpha );
-		bli_setsc(  0.9,  0.0, &beta );
-	}
-	else
-	{
-		bli_setsc(  1.2,  0.8, &alpha );
-		bli_setsc(  0.9,  1.0, &beta );
-	}
-
-	#if 0
-	//bli_setm( &BLIS_ONE, &a );
-	bli_setsc(  1.0,  0.0, &alpha );
-	bli_setsc(  1.0,  0.0, &beta );
-	#endif
-
-	// Randomize A, B, and C, and save C.
-	libblis_test_mobj_randomize( params, TRUE, &a );
-	libblis_test_mobj_randomize( params, TRUE, &b );
-	libblis_test_mobj_randomize( params, TRUE, &c );
-	bli_copym( &c, &c_save );
-
-	// Apply the parameters.
-	bli_obj_set_conjtrans( transa, &a );
-	bli_obj_set_conjtrans( transb, &b );
-
-	// Repeat the experiment n_repeats times and record results. 
-	for ( i = 0; i < n_repeats; ++i )
-	{
-		bli_copym( &c_save, &c );
-
-		time = bli_clock();
-
-		libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );
-
-		time_min = bli_clock_min_diff( time_min, time );
-	}
-
-	// Estimate the performance of the best experiment repeat.
-	*perf = ( 2.0 * m * n * k ) / time_min / FLOPS_PER_UNIT_PERF;
-	if ( bli_obj_is_complex( &c ) ) *perf *= 4.0;
-
-	// Perform checks.
-	libblis_test_gemm_check( params, &alpha, &a, &b, &beta, &c, &c_save, resid );
-
-	// Zero out performance and residual if output matrix is empty.
-	libblis_test_check_empty_problem( &c, perf, resid );
-
-	// Free the test objects.
-	bli_obj_free( &a );
-	bli_obj_free( &b );
-	bli_obj_free( &c );
-	bli_obj_free( &c_save );
-}
-
-
-void libblis_test_gemm_md
-     (
-       test_params_t* params,
-       test_op_t*     op,
-       iface_t        iface,
-       char*          dc_str,
-       char*          pc_str,
-       char*          sc_str,
-       unsigned int   p_cur,
-       double*        perf,
-       double*        resid
-     )
-{
-	unsigned int n_repeats = params->n_repeats;
-	unsigned int i;
-
-	double       time_min  = DBL_MAX;
-	double       time;
-
-	num_t        dt_a, dt_b, dt_c;
-	num_t        dt_complex;
+	num_t        dt_a, dt_b, dt_c, dt_alpha, dt_beta, dt_comp;
 
 	dim_t        m, n, k;
 
@@ -335,10 +178,9 @@ void libblis_test_gemm_md
 	bli_param_map_char_to_blis_dt( dc_str[0], &dt_c );
 	bli_param_map_char_to_blis_dt( dc_str[1], &dt_a );
 	bli_param_map_char_to_blis_dt( dc_str[2], &dt_b );
-
-	// Project one of the datatypes (it doesn't matter which) to the
-	// complex domain.
-	dt_complex = bli_dt_proj_to_complex( dt_c );
+	bli_param_map_char_to_blis_dt( dc_str[3], &dt_alpha );
+	bli_param_map_char_to_blis_dt( dc_str[4], &dt_beta );
+	bli_param_map_char_to_blis_dt( dc_str[5], &dt_comp );
 
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
@@ -350,8 +192,8 @@ void libblis_test_gemm_md
 	bli_param_map_char_to_blis_trans( pc_str[1], &transb );
 
 	// Create test scalars.
-	bli_obj_scalar_init_detached( dt_complex, &alpha );
-	bli_obj_scalar_init_detached( dt_complex, &beta );
+	bli_obj_scalar_init_detached( dt_alpha, &alpha );
+	bli_obj_scalar_init_detached( dt_beta, &beta );
 
 	// Create test operands (vectors and/or matrices).
 	libblis_test_mobj_create( params, dt_a, transa,
@@ -363,27 +205,12 @@ void libblis_test_gemm_md
 	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m, n, &c_save );
 
-	// For mixed-precision, set the computation precision of C.
-	if ( params->mixed_precision )
-	{
-		num_t dt_comp;
-		prec_t comp_prec;
-
-		// The computation precision is encoded in the computation datatype,
-		// which appears as an additional char in dc_str.
-		bli_param_map_char_to_blis_dt( dc_str[3], &dt_comp );
-
-		// Extract the precision from the computation datatype.
-		comp_prec = bli_dt_prec( dt_comp );
-
-		// Set the computation precision of C.
-		bli_obj_set_comp_prec( comp_prec, &c );
-	}
-
+	// Set the computation precision of C.
+	bli_obj_set_comp_prec( bli_dt_prec( dt_comp ), &c );
 
 	// Set alpha and beta.
 	{
-		bli_setsc(  2.0,  0.0, &alpha );
+		bli_setsc(  2.0,  0.2, &alpha );
 		bli_setsc(  1.2,  0.5, &beta );
 		//bli_setsc(  1.0,  0.0, &alpha );
 		//bli_setsc(  1.0,  0.0, &beta );
@@ -399,7 +226,7 @@ void libblis_test_gemm_md
 	bli_obj_set_conjtrans( transa, &a );
 	bli_obj_set_conjtrans( transb, &b );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copym( &c_save, &c );
@@ -412,12 +239,10 @@ void libblis_test_gemm_md
 	}
 
 	// Estimate the performance of the best experiment repeat.
-	//*perf = ( 2.0 * m * n * k ) / time_min / FLOPS_PER_UNIT_PERF;
-	//if ( bli_obj_is_complex( &c ) ) *perf *= 4.0;
-	*perf = libblis_test_gemm_flops( &a, &b, &c ) / time_min / FLOPS_PER_UNIT_PERF;
+	*perf = libblis_test_l3_flops( BLIS_GEMM, &a, &b, &c ) / time_min / FLOPS_PER_UNIT_PERF;
 
 	// Perform checks.
-	libblis_test_gemm_md_check( params, &alpha, &a, &b, &beta, &c, &c_save, resid );
+	libblis_test_gemm_check( params, &alpha, &a, &b, &beta, &c, &c_save, resid );
 
 	// Zero out performance and residual if output matrix is empty.
 	libblis_test_check_empty_problem( &c, perf, resid );
@@ -473,7 +298,7 @@ bli_printm( "c after", c, "%6.3f", "" );
 
 
-void libblis_test_gemm_md_check
+void libblis_test_gemm_check
      (
        test_params_t* params,
        obj_t*         alpha,
@@ -485,8 +310,8 @@ void libblis_test_gemm_md_check
        double*        resid
      )
 {
-	num_t  dt_real = bli_obj_dt_proj_to_real( c );
-	num_t  dt_comp = bli_obj_dt_proj_to_complex( c );
+	num_t  dt_real = bli_obj_comp_prec( c ) | BLIS_REAL;
+	num_t  dt_comp = bli_obj_comp_prec( c ) | BLIS_COMPLEX;
 	num_t  dt;
 
 	dim_t  m       = bli_obj_length( c );
@@ -498,19 +323,41 @@ void libblis_test_gemm_md_check
 
 	double junk;
 
+	//
+	// Pre-conditions:
+	// - a is randomized.
+	// - b is randomized.
+	// - c_orig is randomized.
+	// Note:
+	// - alpha and beta should have non-zero imaginary components in the
+	//   complex cases in order to more fully exercise the implementation.
+	//
+	// Under these conditions, we assume that the implementation for
+	//
+	//   C := beta * C_orig + alpha * transa(A) * transb(B)
+	//
+	// is functioning correctly if
+	//
+	//   normfv( v - z )
+	//
+	// is negligible, where
+	//
+	//   v = C * t
+	//   z = ( beta * C_orig + alpha * transa(A) * transb(B) ) * t
+	//     = beta * C_orig * t + alpha * transa(A) * transb(B) * t
+	//     = beta * C_orig * t + alpha * transa(A) * w
+	//     = beta * C_orig * t + z',  where w = transb(B) * t and z' = alpha * transa(A) * w
+	//
+
 	// Compute our reference checksum in the real domain if all operands
-	// are real, and in the complex domain otherwise. Also implicit in this
-	// is that we use the storage precision of C to determine the precision
-	// in which we perform the reference checksum.
+	// are real, and in the complex domain otherwise.
 	if ( bli_obj_is_real( a ) &&
 	     bli_obj_is_real( b ) &&
 	     bli_obj_is_real( c ) ) dt = dt_real;
 	else                        dt = dt_comp;
 
-	// This function works in a manner similar to that of the function
-	// libblis_test_gemm_check(), except that we project a, b, and c into
-	// the complex domain (regardless of their storage datatype), and then
-	// proceed with the checking accordingly.
+	// Project a, b, and c into the appropriate domain and computational
+	// precision, and then proceed with the checking accordingly.
 
 	obj_t a2, b2, c2, c0;
 
@@ -529,7 +376,7 @@ void libblis_test_gemm_md_check
 	// they are executed.
 	bli_setiv( &BLIS_ZERO, &t );
 
-	// Create complex equivalents of a, b, c_orig, and c.
+	// Create type-casted equivalents of a, b, c_orig, and c.
 	bli_obj_create( dt, m, k, 0, 0, &a2 );
 	bli_obj_create( dt, k, n, 0, 0, &b2 );
 	bli_obj_create( dt, m, n, 0, 0, &c2 );
@@ -543,28 +390,10 @@ void libblis_test_gemm_md_check
 
 	bli_gemv( &BLIS_ONE, &c0, &t, &BLIS_ZERO, &v );
 
-#if 0
-if ( bli_obj_is_scomplex( c ) &&
-     bli_obj_is_float( a ) &&
-     bli_obj_is_float( b ) )
-{
-bli_printm( "test_gemm.c: a", a, "%7.3f", "" );
-bli_printm( "test_gemm.c: b", b, "%7.3f", "" );
-bli_printm( "test_gemm.c: c orig", c_orig, "%7.3f", "" );
-bli_printm( "test_gemm.c: c computed", c, "%7.3f", "" );
-}
-#endif
-
-#if 0
-	bli_gemm( alpha, &a2, &b2, beta, &c2 );
-	bli_gemv( &BLIS_ONE, &c2, &t, &BLIS_ZERO, &z );
-	if ( bli_obj_is_real( c ) ) bli_setiv( &BLIS_ZERO, &z );
-#else
 	bli_gemv( &BLIS_ONE, &b2, &t, &BLIS_ZERO, &w );
 	bli_gemv( alpha, &a2, &w, &BLIS_ZERO, &z );
 	bli_gemv( beta, &c2, &t, &BLIS_ONE, &z );
 	if ( bli_obj_is_real( c ) ) bli_setiv( &BLIS_ZERO, &z );
-#endif
 
 	bli_subv( &z, &v );
 	bli_normfv( &v, &norm );
@@ -581,120 +410,3 @@ bli_printm( "test_gemm.c: c computed", c, "%7.3f", "" );
 	bli_obj_free( &c0 );
 }
 
-
-
-void libblis_test_gemm_check
-     (
-       test_params_t* params,
-       obj_t*         alpha,
-       obj_t*         a,
-       obj_t*         b,
-       obj_t*         beta,
-       obj_t*         c,
-       obj_t*         c_orig,
-       double*        resid
-     )
-{
-	num_t  dt      = bli_obj_dt( c );
-	num_t  dt_real = bli_obj_dt_proj_to_real( c );
-
-	dim_t  m       = bli_obj_length( c );
-	dim_t  n       = bli_obj_width( c );
-	dim_t  k       = bli_obj_width_after_trans( a );
-
-	obj_t  norm;
-	obj_t  t, v, w, z;
-
-	double junk;
-
-	//
-	// Pre-conditions:
-	// - a is randomized.
-	// - b is randomized.
-	// - c_orig is randomized.
-	// Note:
-	// - alpha and beta should have non-zero imaginary components in the
-	//   complex cases in order to more fully exercise the implementation.
-	//
-	// Under these conditions, we assume that the implementation for
-	//
-	//   C := beta * C_orig + alpha * transa(A) * transb(B)
-	//
-	// is functioning correctly if
-	//
-	//   normfv( v - z )
-	//
-	// is negligible, where
-	//
-	//   v = C * t
-	//   z = ( beta * C_orig + alpha * transa(A) * transb(B) ) * t
-	//     = beta * C_orig * t + alpha * transa(A) * transb(B) * t
-	//     = beta * C_orig * t + alpha * transa(A) * w
-	//     = beta * C_orig * t + z
-	//
-
-	bli_obj_scalar_init_detached( dt_real, &norm );
-
-	bli_obj_create( dt, n, 1, 0, 0, &t );
-	bli_obj_create( dt, m, 1, 0, 0, &v );
-	bli_obj_create( dt, k, 1, 0, 0, &w );
-	bli_obj_create( dt, m, 1, 0, 0, &z );
-
-	libblis_test_vobj_randomize( params, TRUE, &t );
-
-	bli_gemv( &BLIS_ONE, c, &t, &BLIS_ZERO, &v );
-
-	bli_gemv( &BLIS_ONE, b, &t, &BLIS_ZERO, &w );
-	bli_gemv( alpha, a, &w, &BLIS_ZERO, &z );
-	bli_gemv( beta, c_orig, &t, &BLIS_ONE, &z );
-
-	bli_subv( &z, &v );
-	bli_normfv( &v, &norm );
-	bli_getsc( &norm, resid, &junk );
-
-	bli_obj_free( &t );
-	bli_obj_free( &v );
-	bli_obj_free( &w );
-	bli_obj_free( &z );
-}
-
-double libblis_test_gemm_flops
-     (
-       obj_t* a,
-       obj_t* b,
-       obj_t* c
-     )
-{
-	bool   a_is_real    = bli_obj_is_real( a );
-	bool   a_is_complex = bli_obj_is_complex( a );
-
-	bool   b_is_real    = bli_obj_is_real( b );
-	bool   b_is_complex = bli_obj_is_complex( b );
-
-	bool   c_is_real    = bli_obj_is_real( c );
-	bool   c_is_complex = bli_obj_is_complex( c );
-
-	double m            = ( double )bli_obj_length( c );
-	double n            = ( double )bli_obj_width( c );
-	double k            = ( double )bli_obj_width( a );
-
-	double flops;
-
-	if      ( ( c_is_complex && a_is_complex && b_is_complex ) )
-	{
-		flops = 8.0 * m * n * k;
-	}
-	else if ( ( c_is_complex && a_is_complex && b_is_real    ) ||
-	          ( c_is_complex && a_is_real    && b_is_complex ) ||
-	          ( c_is_real    && a_is_complex && b_is_complex ) )
-	{
-		flops = 4.0 * m * n * k;
-	}
-	else
-	{
-		flops = 2.0 * m * n * k;
-	}
-
-	return flops;
-}
-
diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c
index f3b5f7b52..288249b59 100644
--- a/testsuite/src/test_gemm_ukr.c
+++ b/testsuite/src/test_gemm_ukr.c
@@ -241,7 +241,7 @@ void libblis_test_gemm_ukr_experiment
 	  BLIS_MR,
 	  BLIS_KR,
 	  BLIS_NO_INVERT_DIAG,
-	  BLIS_PACKED_ROW_PANELS,
+	  BLIS_PACKED_PANELS,
 	  BLIS_BUFFER_FOR_A_BLOCK,
 	  &a, &ap,
 	  cntx
@@ -251,7 +251,7 @@ void libblis_test_gemm_ukr_experiment
 	  BLIS_NR,
 	  BLIS_KR,
 	  BLIS_NO_INVERT_DIAG,
-	  BLIS_PACKED_COL_PANELS,
+	  BLIS_PACKED_PANELS,
 	  BLIS_BUFFER_FOR_B_PANEL,
 	  &b, &bp,
 	  cntx
diff --git a/testsuite/src/test_gemmt.c b/testsuite/src/test_gemmt.c
index 3b7b08748..c35744fe2 100644
--- a/testsuite/src/test_gemmt.c
+++ b/testsuite/src/test_gemmt.c
@@ -166,7 +166,7 @@ void libblis_test_gemmt_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
-	num_t        datatype;
+	num_t        dt_a, dt_b, dt_c, dt_alpha, dt_beta, dt_comp;
 
 	dim_t        m, k;
 
@@ -179,7 +179,12 @@ void libblis_test_gemmt_experiment
 
 
 	// Use the datatype of the first char in the datatype combination string.
-	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+	bli_param_map_char_to_blis_dt( dc_str[0], &dt_c );
+	bli_param_map_char_to_blis_dt( dc_str[1], &dt_a );
+	bli_param_map_char_to_blis_dt( dc_str[2], &dt_b );
+	bli_param_map_char_to_blis_dt( dc_str[3], &dt_alpha );
+	bli_param_map_char_to_blis_dt( dc_str[4], &dt_beta );
+	bli_param_map_char_to_blis_dt( dc_str[5], &dt_comp );
 
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
@@ -191,29 +196,26 @@ void libblis_test_gemmt_experiment
 	bli_param_map_char_to_blis_trans( pc_str[2], &transb );
 
 	// Create test scalars.
-	bli_obj_scalar_init_detached( datatype, &alpha );
-	bli_obj_scalar_init_detached( datatype, &beta );
+	bli_obj_scalar_init_detached( dt_alpha, &alpha );
+	bli_obj_scalar_init_detached( dt_beta, &beta );
 
 	// Create test operands (vectors and/or matrices).
-	libblis_test_mobj_create( params, datatype, transa,
+	libblis_test_mobj_create( params, dt_a, transa,
 	                          sc_str[1], m, k, &a );
-	libblis_test_mobj_create( params, datatype, transb,
+	libblis_test_mobj_create( params, dt_b, transb,
 	                          sc_str[2], k, m, &b );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m, m, &c );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m, m, &c_save );
 
+	// Set the computation precision of C.
+	bli_obj_set_comp_prec( bli_dt_prec( dt_comp ), &c );
+
 	// Set alpha and beta.
-	if ( bli_obj_is_real( &c ) )
-	{
-		bli_setsc(  1.2,  0.0, &alpha );
-		bli_setsc(  0.9,  0.0, &beta );
-	}
-	else
 	{
-		bli_setsc(  1.2,  0.8, &alpha );
-		bli_setsc(  0.9,  1.0, &beta );
+		bli_setsc(  2.0,  0.2, &alpha );
+		bli_setsc(  1.2,  0.5, &beta );
 	}
 
 	// Randomize A and B.
@@ -243,7 +245,7 @@ void libblis_test_gemmt_experiment
 	bli_obj_set_conjtrans( transa, &a );
 	bli_obj_set_conjtrans( transb, &b );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copym( &c_save, &c );
@@ -256,8 +258,7 @@ void libblis_test_gemmt_experiment
 	}
 
 	// Estimate the performance of the best experiment repeat.
-	*perf = ( 1.0 * m * m * k ) / time_min / FLOPS_PER_UNIT_PERF;
-	if ( bli_obj_is_complex( &c ) ) *perf *= 4.0;
+	*perf = libblis_test_l3_flops( BLIS_GEMMT, &a, &b, &c ) / time_min / FLOPS_PER_UNIT_PERF;
 
 	// Perform checks.
 	libblis_test_gemmt_check( params, &alpha, &a, &b, &beta, &c, &c_save, resid );
@@ -324,12 +325,13 @@ void libblis_test_gemmt_check
        double*        resid
      )
 {
-	num_t  dt      = bli_obj_dt( c );
-	num_t  dt_real = bli_obj_dt_proj_to_real( c );
 	uplo_t uploc   = bli_obj_uplo( c );
+	num_t  dt_real = bli_obj_comp_prec( c ) | BLIS_REAL;
+	num_t  dt_comp = bli_obj_comp_prec( c ) | BLIS_COMPLEX;
+	num_t  dt;
 
 	dim_t  m       = bli_obj_length( c );
-	//dim_t  k       = bli_obj_width_after_trans( a );
+	dim_t  k       = bli_obj_width_after_trans( a );
 
 	obj_t  norm;
 	obj_t  t, v, q, z;
@@ -362,6 +364,18 @@ void libblis_test_gemmt_check
 	//     = beta * C_orig * t + z
 	//
 
+	// Compute our reference checksum in the real domain if all operands
+	// are real, and in the complex domain otherwise.
+	if ( bli_obj_is_real( a ) &&
+	     bli_obj_is_real( b ) &&
+	     bli_obj_is_real( c ) ) dt = dt_real;
+	else                        dt = dt_comp;
+
+	// Project a, b, and c into the appropriate domain and computational
+	// precision, and then proceed with the checking accordingly.
+
+	obj_t a2, b2, c2, c0;
+
 	bli_obj_scalar_init_detached( dt_real, &norm );
 
 	bli_obj_create( dt, m, 1, 0, 0, &t );
@@ -373,18 +387,33 @@ void libblis_test_gemmt_check
 
 	libblis_test_vobj_randomize( params, TRUE, &t );
 
-	bli_gemv( &BLIS_ONE, c, &t, &BLIS_ZERO, &v );
-
-	bli_gemm( &BLIS_ONE, a, b, &BLIS_ZERO, &q );
-#if 1
+	// We need to zero out the imaginary part of t in order for our
+	// checks to work in all cases. Otherwise, the imaginary parts
+	// could affect intermediate products, depending on the order that
+	// they are executed.
+	bli_setiv( &BLIS_ZERO, &t );
+
+	// Create type-casted equivalents of a, b, c_orig, and c.
+	bli_obj_create( dt, m, k, 0, 0, &a2 );
+	bli_obj_create( dt, k, m, 0, 0, &b2 );
+	bli_obj_create( dt, m, m, 0, 0, &c2 );
+	bli_obj_create( dt, m, m, 0, 0, &c0 );
+	bli_obj_set_uplo( uploc, &c0 );
+	bli_obj_set_uplo( uploc, &c2 );
+
+	// Cast a, b, c_orig, and c into the datatype of our temporary objects.
+	bli_castm( a,      &a2 );
+	bli_castm( b,      &b2 );
+	bli_castm( c_orig, &c2 );
+	bli_castm( c,      &c0 );
+
+	bli_gemv( &BLIS_ONE, &c0, &t, &BLIS_ZERO, &v );
+
+	bli_gemm( &BLIS_ONE, &a2, &b2, &BLIS_ZERO, &q );
 	bli_mktrim( &q );
 	bli_gemv( alpha, &q, &t, &BLIS_ZERO, &z );
-#else
-	bli_obj_set_struc( BLIS_TRIANGULAR, &q );
-	bli_copyv( &t, &z );
-	bli_trmv( alpha, &q, &z );
-#endif
-	bli_gemv( beta, c_orig, &t, &BLIS_ONE, &z );
+	bli_gemv( beta, &c2, &t, &BLIS_ONE, &z );
+	if ( bli_obj_is_real( c ) ) bli_setiv( &BLIS_ZERO, &z );
 
 	bli_subv( &z, &v );
 	bli_normfv( &v, &norm );
@@ -394,5 +423,10 @@ void libblis_test_gemmt_check
 	bli_obj_free( &v );
 	bli_obj_free( &z );
 	bli_obj_free( &q );
+
+	bli_obj_free( &a2 );
+	bli_obj_free( &b2 );
+	bli_obj_free( &c2 );
+	bli_obj_free( &c0 );
 }
 
diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c
index 480e49c2d..26aad890d 100644
--- a/testsuite/src/test_gemmtrsm_ukr.c
+++ b/testsuite/src/test_gemmtrsm_ukr.c
@@ -290,7 +290,7 @@ void libblis_test_gemmtrsm_ukr_experiment
 	  BLIS_MR,
 	  BLIS_MR,
 	  BLIS_INVERT_DIAG,
-	  BLIS_PACKED_ROW_PANELS,
+	  BLIS_PACKED_PANELS,
 	  BLIS_BUFFER_FOR_A_BLOCK,
 	  &a, &ap,
 	  cntx
@@ -325,7 +325,7 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 		  BLIS_NR,
 		  BLIS_MR,
 		  BLIS_NO_INVERT_DIAG,
-		  BLIS_PACKED_COL_PANELS,
+		  BLIS_PACKED_PANELS,
 		  BLIS_BUFFER_FOR_B_PANEL,
 		  &b, &bp,
 		  cntx
diff --git a/testsuite/src/test_hemm.c b/testsuite/src/test_hemm.c
index cac5aa73a..079b71979 100644
--- a/testsuite/src/test_hemm.c
+++ b/testsuite/src/test_hemm.c
@@ -168,7 +168,7 @@ void libblis_test_hemm_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
-	num_t        datatype;
+	num_t        dt_a, dt_b, dt_c, dt_alpha, dt_beta, dt_comp;
 
 	dim_t        m, n;
 	dim_t        mn_side;
@@ -183,7 +183,12 @@ void libblis_test_hemm_experiment
 
 
 	// Use the datatype of the first char in the datatype combination string.
-	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+	bli_param_map_char_to_blis_dt( dc_str[0], &dt_c );
+	bli_param_map_char_to_blis_dt( dc_str[1], &dt_a );
+	bli_param_map_char_to_blis_dt( dc_str[2], &dt_b );
+	bli_param_map_char_to_blis_dt( dc_str[3], &dt_alpha );
+	bli_param_map_char_to_blis_dt( dc_str[4], &dt_beta );
+	bli_param_map_char_to_blis_dt( dc_str[5], &dt_comp );
 
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
@@ -196,30 +201,27 @@ void libblis_test_hemm_experiment
 	bli_param_map_char_to_blis_trans( pc_str[3], &transb );
 
 	// Create test scalars.
-	bli_obj_scalar_init_detached( datatype, &alpha );
-	bli_obj_scalar_init_detached( datatype, &beta );
+	bli_obj_scalar_init_detached( dt_alpha, &alpha );
+	bli_obj_scalar_init_detached( dt_beta, &beta );
 
 	// Create test operands (vectors and/or matrices).
 	bli_set_dim_with_side( side, m, n, &mn_side );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_a, BLIS_NO_TRANSPOSE,
 	                          sc_str[1], mn_side, mn_side, &a );
-	libblis_test_mobj_create( params, datatype, transb,
+	libblis_test_mobj_create( params, dt_b, transb,
 	                          sc_str[2], m,       n,       &b );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m,       n,       &c );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m,       n,       &c_save );
 
+	// Set the computation precision of C.
+	bli_obj_set_comp_prec( bli_dt_prec( dt_comp ), &c );
+
 	// Set alpha and beta.
-	if ( bli_obj_is_real( &c ) )
 	{
-		bli_setsc(  1.2,  0.0, &alpha );
-		bli_setsc( -1.0,  0.0, &beta );
-	}
-	else
-	{
-		bli_setsc(  1.2,  0.8, &alpha );
-		bli_setsc( -1.0,  1.0, &beta );
+		bli_setsc(  2.0,  0.2, &alpha );
+		bli_setsc(  1.2,  0.5, &beta );
 	}
 
 	// Set the structure and uplo properties of A.
@@ -241,7 +243,7 @@ void libblis_test_hemm_experiment
 	bli_obj_set_conj( conja, &a );
 	bli_obj_set_conjtrans( transb, &b );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copym( &c_save, &c );
@@ -254,8 +256,14 @@ void libblis_test_hemm_experiment
 	}
 
 	// Estimate the performance of the best experiment repeat.
-	*perf = ( 2.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
-	if ( bli_obj_is_complex( &c ) ) *perf *= 4.0;
+	if ( bli_is_left( side ) )
+	{
+		*perf = libblis_test_l3_flops( BLIS_HEMM, &a, &b, &c ) / time_min / FLOPS_PER_UNIT_PERF;
+	}
+	else
+	{
+		*perf = libblis_test_l3_flops( BLIS_HEMM, &b, &a, &c ) / time_min / FLOPS_PER_UNIT_PERF;
+	}
 
 	// Perform checks.
 	libblis_test_hemm_check( params, side, &alpha, &a, &b, &beta, &c, &c_save, resid );
@@ -309,11 +317,17 @@ void libblis_test_hemm_check
        double*        resid
      )
 {
-	num_t  dt      = bli_obj_dt( c );
-	num_t  dt_real = bli_obj_dt_proj_to_real( c );
+	uplo_t uploa   = bli_obj_uplo( a );
+	if ( bli_obj_has_trans( a ) )
+		bli_toggle_uplo( &uploa );
+
+	num_t  dt_real = bli_obj_comp_prec( c ) | BLIS_REAL;
+	num_t  dt_comp = bli_obj_comp_prec( c ) | BLIS_COMPLEX;
+	num_t  dt;
 
 	dim_t  m       = bli_obj_length( c );
 	dim_t  n       = bli_obj_width( c );
+	dim_t  mn_side = bli_obj_length( a );
 
 	obj_t  norm;
 	obj_t  t, v, w, z;
@@ -352,6 +366,18 @@ void libblis_test_hemm_check
 	//     = beta * C_orig * t + alpha * transb(B) * w
 	//     = beta * C_orig * t + z
 
+	// Compute our reference checksum in the real domain if all operands
+	// are real, and in the complex domain otherwise.
+	if ( bli_obj_is_real( a ) &&
+	     bli_obj_is_real( b ) &&
+	     bli_obj_is_real( c ) ) dt = dt_real;
+	else                        dt = dt_comp;
+
+	// Project a, b, and c into the appropriate domain and computational
+	// precision, and then proceed with the checking accordingly.
+
+	obj_t a2, b2, c2, c0;
+
 	bli_obj_scalar_init_detached( dt_real, &norm );
 
 	if ( bli_is_left( side ) )
@@ -371,21 +397,42 @@ void libblis_test_hemm_check
 
 	libblis_test_vobj_randomize( params, TRUE, &t );
 
-	bli_gemv( &BLIS_ONE, c, &t, &BLIS_ZERO, &v );
+	// We need to zero out the imaginary part of t in order for our
+	// checks to work in all cases. Otherwise, the imaginary parts
+	// could affect intermediate products, depending on the order that
+	// they are executed.
+	bli_setiv( &BLIS_ZERO, &t );
+
+	// Create type-casted equivalents of a, b, c_orig, and c.
+	bli_obj_create( dt, mn_side, mn_side, 0, 0, &a2 );
+	bli_obj_create( dt, m, n, 0, 0, &b2 );
+	bli_obj_create( dt, m, n, 0, 0, &c2 );
+	bli_obj_create( dt, m, n, 0, 0, &c0 );
+	bli_obj_set_struc( BLIS_HERMITIAN, &a2 );
+	bli_obj_set_uplo( uploa, &a2 );
+
+	// Cast a, b, c_orig, and c into the datatype of our temporary objects.
+	bli_castm( a,      &a2 );
+	bli_castm( b,      &b2 );
+	bli_castm( c_orig, &c2 );
+	bli_castm( c,      &c0 );
+
+	bli_gemv( &BLIS_ONE, &c0, &t, &BLIS_ZERO, &v );
 
 	if ( bli_is_left( side ) )
 	{
-		bli_gemv( &BLIS_ONE, b, &t, &BLIS_ZERO, &w );
-		bli_hemv( alpha, a, &w, &BLIS_ZERO, &z );
+		bli_gemv( &BLIS_ONE, &b2, &t, &BLIS_ZERO, &w );
+		bli_hemv( alpha, &a2, &w, &BLIS_ZERO, &z );
 	}
 	else // else if ( bli_is_right( side ) )
 	{
-		bli_hemv( &BLIS_ONE, a, &t, &BLIS_ZERO, &w );
-		bli_gemv( alpha, b, &w, &BLIS_ZERO, &z );
+		bli_hemv( &BLIS_ONE, &a2, &t, &BLIS_ZERO, &w );
+		bli_gemv( alpha, &b2, &w, &BLIS_ZERO, &z );
 	}
 
-	bli_gemv( beta, c_orig, &t, &BLIS_ONE, &z );
-	
+	bli_gemv( beta, &c2, &t, &BLIS_ONE, &z );
+	if ( bli_obj_is_real( c ) ) bli_setiv( &BLIS_ZERO, &z );
+
 	bli_subv( &z, &v );
 	bli_normfv( &v, &norm );
 	bli_getsc( &norm, resid, &junk );
@@ -394,5 +441,10 @@ void libblis_test_hemm_check
 	bli_obj_free( &v );
 	bli_obj_free( &w );
 	bli_obj_free( &z );
+
+	bli_obj_free( &a2 );
+	bli_obj_free( &b2 );
+	bli_obj_free( &c2 );
+	bli_obj_free( &c0 );
 }
 
diff --git a/testsuite/src/test_her2k.c b/testsuite/src/test_her2k.c
index 59bbaf5f1..03689ef1b 100644
--- a/testsuite/src/test_her2k.c
+++ b/testsuite/src/test_her2k.c
@@ -166,7 +166,7 @@ void libblis_test_her2k_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
-	num_t        datatype;
+	num_t        dt_a, dt_b, dt_c, dt_alpha, dt_beta, dt_comp;
 
 	dim_t        m, k;
 
@@ -178,7 +178,12 @@ void libblis_test_her2k_experiment
 
 
 	// Use the datatype of the first char in the datatype combination string.
-	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+	bli_param_map_char_to_blis_dt( dc_str[0], &dt_c );
+	bli_param_map_char_to_blis_dt( dc_str[1], &dt_a );
+	bli_param_map_char_to_blis_dt( dc_str[2], &dt_b );
+	bli_param_map_char_to_blis_dt( dc_str[3], &dt_alpha );
+	bli_param_map_char_to_blis_dt( dc_str[4], &dt_beta );
+	bli_param_map_char_to_blis_dt( dc_str[5], &dt_comp );
 
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
@@ -190,31 +195,28 @@ void libblis_test_her2k_experiment
 	bli_param_map_char_to_blis_trans( pc_str[2], &transb );
 
 	// Create test scalars.
-	bli_obj_scalar_init_detached( datatype, &alpha );
-	bli_obj_scalar_init_detached( datatype, &beta );
+	bli_obj_scalar_init_detached( dt_alpha, &alpha );
+	bli_obj_scalar_init_detached( dt_beta, &beta );
 
 	// Create test operands (vectors and/or matrices).
-	libblis_test_mobj_create( params, datatype, transa,
+	libblis_test_mobj_create( params, dt_a, transa,
 	                          sc_str[1], m, k, &a );
-	libblis_test_mobj_create( params, datatype, transb,
+	libblis_test_mobj_create( params, dt_b, transb,
 	                          sc_str[2], m, k, &b );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m, m, &c );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m, m, &c_save );
 
+	// Set the computation precision of C.
+	bli_obj_set_comp_prec( bli_dt_prec( dt_comp ), &c );
+
 	// Set alpha and beta.
-	if ( bli_obj_is_real( &c ) )
-	{
-		bli_setsc(  0.8, 0.0, &alpha );
-		bli_setsc( -1.0, 0.0, &beta );
-	}
-	else
 	{
 		// For her2k, alpha may be complex, but beta must be real-valued
 		// (in order to preserve the Hermitian structure of C).
 		bli_setsc(  0.8, 0.5, &alpha );
-		bli_setsc( -1.0, 0.0, &beta );
+		bli_setsc( -1.2, 0.0, &beta );
 	}
 
 	// Randomize A and B.
@@ -240,7 +242,7 @@ void libblis_test_her2k_experiment
 	bli_obj_set_conjtrans( transa, &a );
 	bli_obj_set_conjtrans( transb, &b );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copym( &c_save, &c );
@@ -252,9 +254,11 @@ void libblis_test_her2k_experiment
 		time_min = bli_clock_min_diff( time_min, time );
 	}
 
+	obj_t bh;
+	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, &b, &bh );
+
 	// Estimate the performance of the best experiment repeat.
-	*perf = ( 2.0 * m * m * k ) / time_min / FLOPS_PER_UNIT_PERF;
-	if ( bli_obj_is_complex( &c ) ) *perf *= 4.0;
+	*perf = libblis_test_l3_flops( BLIS_HER2K, &a, &bh, &c ) / time_min / FLOPS_PER_UNIT_PERF;
 
 	// Perform checks.
 	libblis_test_her2k_check( params, &alpha, &a, &b, &beta, &c, &c_save, resid );
@@ -306,13 +310,15 @@ void libblis_test_her2k_check
        double*        resid
      )
 {
-	num_t  dt      = bli_obj_dt( c );
-	num_t  dt_real = bli_obj_dt_proj_to_real( c );
+	uplo_t uploc   = bli_obj_uplo( c );
+	num_t  dt_real = bli_obj_comp_prec( c ) | BLIS_REAL;
+	num_t  dt_comp = bli_obj_comp_prec( c ) | BLIS_COMPLEX;
+	num_t  dt;
 
 	dim_t  m       = bli_obj_length( c );
 	dim_t  k       = bli_obj_width_after_trans( a );
 
-	obj_t  alphac, ah, bh;
+	obj_t  alphac;
 	obj_t  norm;
 	obj_t  t, v, w1, w2, z;
 
@@ -347,8 +353,17 @@ void libblis_test_her2k_check
 	//     = beta * C_orig * t + z
 	//
 
-	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &ah );
-	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, b, &bh );
+	// Compute our reference checksum in the real domain if all operands
+	// are real, and in the complex domain otherwise.
+	if ( bli_obj_is_real( a ) &&
+	     bli_obj_is_real( b ) &&
+	     bli_obj_is_real( c ) ) dt = dt_real;
+	else                        dt = dt_comp;
+
+	// Project a, b, and c into the appropriate domain and computational
+	// precision, and then proceed with the checking accordingly.
+
+	obj_t a2, b2, ah2, bh2, c2, c0;
 
 	bli_obj_scalar_init_detached( dt_real, &norm );
 	bli_obj_scalar_init_detached_copy_of( dt, BLIS_CONJUGATE, alpha, &alphac );
@@ -361,13 +376,39 @@ void libblis_test_her2k_check
 
 	libblis_test_vobj_randomize( params, TRUE, &t );
 
-	bli_hemv( &BLIS_ONE, c, &t, &BLIS_ZERO, &v );
-
-	bli_gemv( &BLIS_ONE, &ah, &t, &BLIS_ZERO, &w2 );
-	bli_gemv( &BLIS_ONE, &bh, &t, &BLIS_ZERO, &w1 );
-	bli_gemv( alpha, a, &w1, &BLIS_ZERO, &z );
-	bli_gemv( &alphac, b, &w2, &BLIS_ONE, &z );
-	bli_hemv( beta, c_orig, &t, &BLIS_ONE, &z );
+	// We need to zero out the imaginary part of t in order for our
+	// checks to work in all cases. Otherwise, the imaginary parts
+	// could affect intermediate products, depending on the order that
+	// they are executed.
+	bli_setiv( &BLIS_ZERO, &t );
+
+	// Create type-casted equivalents of a and b (each m x k), c_orig, and c.
+	bli_obj_create( dt, m, k, 0, 0, &a2 );
+	bli_obj_create( dt, m, k, 0, 0, &b2 );
+	bli_obj_create( dt, m, m, 0, 0, &c2 );
+	bli_obj_create( dt, m, m, 0, 0, &c0 );
+	bli_obj_set_struc( BLIS_HERMITIAN, &c2 );
+	bli_obj_set_struc( BLIS_HERMITIAN, &c0 );
+	bli_obj_set_uplo( uploc, &c2 );
+	bli_obj_set_uplo( uploc, &c0 );
+
+	// Cast a, b, c_orig, and c into the datatype of our temporary objects.
+	bli_castm( a,      &a2 );
+	bli_castm( b,      &b2 );
+	bli_castm( c_orig, &c2 );
+	bli_castm( c,      &c0 );
+
+	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, &a2, &ah2 );
+	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, &b2, &bh2 );
+
+	bli_hemv( &BLIS_ONE, &c0, &t, &BLIS_ZERO, &v );
+
+	bli_gemv( &BLIS_ONE, &ah2, &t, &BLIS_ZERO, &w2 );
+	bli_gemv( &BLIS_ONE, &bh2, &t, &BLIS_ZERO, &w1 );
+	bli_gemv( alpha, &a2, &w1, &BLIS_ZERO, &z );
+	bli_gemv( &alphac, &b2, &w2, &BLIS_ONE, &z );
+	bli_hemv( beta, &c2, &t, &BLIS_ONE, &z );
+	if ( bli_obj_is_real( c ) ) bli_setiv( &BLIS_ZERO, &z );
 
 	bli_subv( &z, &v );
 	bli_normfv( &v, &norm );
@@ -378,5 +419,10 @@ void libblis_test_her2k_check
 	bli_obj_free( &w1 );
 	bli_obj_free( &w2 );
 	bli_obj_free( &z );
+
+	bli_obj_free( &a2 );
+	bli_obj_free( &b2 );
+	bli_obj_free( &c2 );
+	bli_obj_free( &c0 );
 }
 
diff --git a/testsuite/src/test_herk.c b/testsuite/src/test_herk.c
index bbb7be922..a4940b5e4 100644
--- a/testsuite/src/test_herk.c
+++ b/testsuite/src/test_herk.c
@@ -164,7 +164,7 @@ void libblis_test_herk_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
-	num_t        datatype;
+	num_t        dt_a, dt_c, dt_alpha, dt_beta, dt_comp;
 
 	dim_t        m, k;
 
@@ -176,7 +176,11 @@ void libblis_test_herk_experiment
 
 
 	// Use the datatype of the first char in the datatype combination string.
-	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+	bli_param_map_char_to_blis_dt( dc_str[0], &dt_c );
+	bli_param_map_char_to_blis_dt( dc_str[1], &dt_a );
+	bli_param_map_char_to_blis_dt( dc_str[2], &dt_alpha );
+	bli_param_map_char_to_blis_dt( dc_str[3], &dt_beta );
+	bli_param_map_char_to_blis_dt( dc_str[4], &dt_comp );
 
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
@@ -187,29 +191,26 @@ void libblis_test_herk_experiment
 	bli_param_map_char_to_blis_trans( pc_str[1], &transa );
 
 	// Create test scalars.
-	bli_obj_scalar_init_detached( datatype, &alpha );
-	bli_obj_scalar_init_detached( datatype, &beta );
+	bli_obj_scalar_init_detached( dt_alpha, &alpha );
+	bli_obj_scalar_init_detached( dt_beta, &beta );
 
 	// Create test operands (vectors and/or matrices).
-	libblis_test_mobj_create( params, datatype, transa,
+	libblis_test_mobj_create( params, dt_a, transa,
 	                          sc_str[1], m, k, &a );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m, m, &c );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m, m, &c_save );
 
+	// Set the computation precision of C.
+	bli_obj_set_comp_prec( bli_dt_prec( dt_comp ), &c );
+
 	// Set alpha and beta.
-	if ( bli_obj_is_real( &c ) )
-	{
-		bli_setsc(  1.2, 0.0, &alpha );
-		bli_setsc( -1.0, 0.0, &beta );
-	}
-	else
 	{
 		// For herk, alpha and beta must both be real-valued, even in the
 		// complex case (in order to preserve the Hermitian structure of C).
 		bli_setsc(  1.2, 0.0, &alpha );
-		bli_setsc( -1.0, 0.0, &beta );
+		bli_setsc( -1.5, 0.0, &beta );
 	}
 
 	// Randomize A.
@@ -233,7 +234,7 @@ void libblis_test_herk_experiment
 	// Apply the remaining parameters.
 	bli_obj_set_conjtrans( transa, &a );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copym( &c_save, &c );
@@ -245,9 +246,11 @@ void libblis_test_herk_experiment
 		time_min = bli_clock_min_diff( time_min, time );
 	}
 
+	obj_t ah;
+	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, &a, &ah );
+
 	// Estimate the performance of the best experiment repeat.
-	*perf = ( 1.0 * m * m * k ) / time_min / FLOPS_PER_UNIT_PERF;
-	if ( bli_obj_is_complex( &c ) ) *perf *= 4.0;
+	*perf = libblis_test_l3_flops( BLIS_HERK, &a, &ah, &c ) / time_min / FLOPS_PER_UNIT_PERF;
 
 	// Perform checks.
 	libblis_test_herk_check( params, &alpha, &a, &beta, &c, &c_save, resid );
@@ -296,13 +299,14 @@ void libblis_test_herk_check
        double*        resid
      )
 {
-	num_t  dt      = bli_obj_dt( c );
-	num_t  dt_real = bli_obj_dt_proj_to_real( c );
+	uplo_t uploc   = bli_obj_uplo( c );
+	num_t  dt_real = bli_obj_comp_prec( c ) | BLIS_REAL;
+	num_t  dt_comp = bli_obj_comp_prec( c ) | BLIS_COMPLEX;
+	num_t  dt;
 
 	dim_t  m       = bli_obj_length( c );
 	dim_t  k       = bli_obj_width_after_trans( a );
 
-	obj_t  ah;
 	obj_t  norm;
 	obj_t  t, v, w, z;
 
@@ -332,7 +336,16 @@ void libblis_test_herk_check
 	//     = beta * C_orig * t + z
 	//
 
-	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &ah );
+	// Compute our reference checksum in the real domain if all operands
+	// are real, and in the complex domain otherwise.
+	if ( bli_obj_is_real( a ) &&
+	     bli_obj_is_real( c ) ) dt = dt_real;
+	else                        dt = dt_comp;
+
+	// Project a and c into the appropriate domain and computational
+	// precision, and then proceed with the checking accordingly.
+
+	obj_t a2, ah2, c2, c0;
 
 	bli_obj_scalar_init_detached( dt_real, &norm );
 
@@ -343,11 +356,34 @@ void libblis_test_herk_check
 
 	libblis_test_vobj_randomize( params, TRUE, &t );
 
-	bli_hemv( &BLIS_ONE, c, &t, &BLIS_ZERO, &v );
+	// We need to zero out the imaginary part of t in order for our
+	// checks to work in all cases. Otherwise, the imaginary parts
+	// could affect intermediate products, depending on the order that
+	// they are executed.
+	bli_setiv( &BLIS_ZERO, &t );
+
+	// Create type-casted equivalents of a, c_orig, and c.
+	bli_obj_create( dt, m, k, 0, 0, &a2 );
+	bli_obj_create( dt, m, m, 0, 0, &c2 );
+	bli_obj_create( dt, m, m, 0, 0, &c0 );
+	bli_obj_set_struc( BLIS_HERMITIAN, &c2 );
+	bli_obj_set_struc( BLIS_HERMITIAN, &c0 );
+	bli_obj_set_uplo( uploc, &c2 );
+	bli_obj_set_uplo( uploc, &c0 );
+
+	// Cast a, c_orig, and c into the datatype of our temporary objects.
+	bli_castm( a,      &a2 );
+	bli_castm( c_orig, &c2 );
+	bli_castm( c,      &c0 );
 
-	bli_gemv( &BLIS_ONE, &ah, &t, &BLIS_ZERO, &w );
-	bli_gemv( alpha, a, &w, &BLIS_ZERO, &z );
-	bli_hemv( beta, c_orig, &t, &BLIS_ONE, &z );
+	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, &a2, &ah2 );
+
+	bli_hemv( &BLIS_ONE, &c0, &t, &BLIS_ZERO, &v );
+
+	bli_gemv( &BLIS_ONE, &ah2, &t, &BLIS_ZERO, &w );
+	bli_gemv( alpha, &a2, &w, &BLIS_ZERO, &z );
+	bli_hemv( beta, &c2, &t, &BLIS_ONE, &z );
+	if ( bli_obj_is_real( c ) ) bli_setiv( &BLIS_ZERO, &z );
 
 	bli_subv( &z, &v );
 	bli_normfv( &v, &norm );
@@ -357,5 +393,9 @@ void libblis_test_herk_check
 	bli_obj_free( &v );
 	bli_obj_free( &w );
 	bli_obj_free( &z );
+
+	bli_obj_free( &a2 );
+	bli_obj_free( &c2 );
+	bli_obj_free( &c0 );
 }
 
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index 4f09d3932..aed0cd817 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -91,14 +91,17 @@ int main( int argc, char** argv )
 	// Finalize libblis.
 	bli_finalize();
 
+	// Prepare to return.
+	int rval = 0;
 #ifdef BLIS_ENABLE_HPX
-	return bli_thread_finalize_hpx();
-#else
-	// Return peacefully.
-	return 0;
+	rval = bli_thread_finalize_hpx();
 #endif
-}
 
+	// Output a termination tag.
+	libblis_test_output_term_tag();
+
+	return rval;
+}
 
 #if 0
 typedef struct thread_data
@@ -781,18 +784,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	int     i;
 	//char   int_type_size_str[8];
 	gint_t  int_type_size;
-	ind_t   im;
 	cntx_t* cntx;
-	cntx_t* cntx_c;
-	cntx_t* cntx_z;
-
-#ifndef BLIS_ENABLE_GEMM_MD
-	// Notify the user if mixed domain or mixed precision was requested.
-	if ( params->mixed_domain || params->mixed_precision )
-	{
-		libblis_test_printf_error( "mixed domain and/or mixed precision testing requested, but building against BLIS without mixed datatype support.\n" );
-	}
-#endif
 
 	// Skip informational output if BLIS is running in quiet mode.
 	if ( libblis_test_quiet_mode ) return;
@@ -874,7 +866,6 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	// Set up rntm_t objects for each of the four families:
 	// gemm, herk, trmm, trsm.
 	rntm_t gemm, herk, trmm_l, trmm_r, trsm_l, trsm_r;
-	dim_t  m = 1000, n = 1000, k = 1000;
 
 	bli_rntm_init_from_global( &gemm   );
 	bli_rntm_init_from_global( &herk   );
@@ -883,12 +874,15 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	bli_rntm_init_from_global( &trsm_l );
 	bli_rntm_init_from_global( &trsm_r );
 
+    /*
+	dim_t  m = 1000, n = 1000, k = 1000;
 	bli_rntm_set_ways_for_op( BLIS_GEMM, BLIS_LEFT,  m, n, k, &gemm );
 	bli_rntm_set_ways_for_op( BLIS_HERK, BLIS_LEFT,  m, n, k, &herk );
 	bli_rntm_set_ways_for_op( BLIS_TRMM, BLIS_LEFT,  m, n, k, &trmm_l );
 	bli_rntm_set_ways_for_op( BLIS_TRMM, BLIS_RIGHT, m, n, k, &trmm_r );
 	bli_rntm_set_ways_for_op( BLIS_TRSM, BLIS_LEFT,  m, n, k, &trsm_l );
 	bli_rntm_set_ways_for_op( BLIS_TRSM, BLIS_RIGHT, m, n, k, &trsm_r );
+    */
 
 	const bool tls_enabled = bli_info_get_enable_tls();
 	const bool thr_enabled = bli_info_get_enable_threading();
@@ -967,6 +961,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	libblis_test_fprintf_c( os, "  environment        %5s %5s %5s %5s %5s %5s\n",
 	                                                               nt_str, jc_nt_str, pc_nt_str,
 	                                                            ic_nt_str, jr_nt_str, ir_nt_str );
+    /*
 	libblis_test_fprintf_c( os, "  gemm   (m,n,k=1000)      %5d %5d %5d %5d %5d\n",
 	                                ( int )bli_rntm_jc_ways( &gemm ), ( int )bli_rntm_pc_ways( &gemm ),
 	                                ( int )bli_rntm_ic_ways( &gemm ),
@@ -991,6 +986,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	                                ( int )bli_rntm_jc_ways( &trsm_r ), ( int )bli_rntm_pc_ways( &trsm_r ),
 	                                ( int )bli_rntm_ic_ways( &trsm_r ),
 	                                ( int )bli_rntm_jr_ways( &trsm_r ), ( int )bli_rntm_ir_ways( &trsm_r ) );
+    */
 	libblis_test_fprintf_c( os, "\n" );
 	libblis_test_fprintf_c( os, "thread partitioning              \n" );
 	//libblis_test_fprintf_c( os, "  jc/ic loops                  %s\n", "slab" );
@@ -1068,7 +1064,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	libblis_test_fprintf_c( os, "\n" );
 
 	// Query a native context.
-	cntx = ( cntx_t* )bli_gks_query_nat_cntx();
+	cntx = ( cntx_t* )bli_gks_query_cntx();
 
 	libblis_test_fprintf_c( os, "level-3 blocksizes             s       d       c       z \n" );
 	libblis_test_fprintf_c( os, "  mc                     %7d %7d %7d %7d\n",
@@ -1182,84 +1178,6 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	libblis_test_fprintf_c( os, "\n" );
 	libblis_test_fprintf_c( os, "\n" );
 
-	libblis_test_fprintf_c( os, "--- BLIS induced implementation info ---\n" );
-	libblis_test_fprintf_c( os, "\n" );
-
-	for ( im = 0; im < BLIS_NAT; ++im )
-	{
-	if ( params->ind_enable[ im ] == 0 ) continue;
-
-	bli_ind_oper_enable_only( BLIS_GEMM, im, BLIS_SCOMPLEX );
-	bli_ind_oper_enable_only( BLIS_GEMM, im, BLIS_DCOMPLEX );
-
-	//libblis_test_fprintf_c( os, "                               c       z \n" );
-	libblis_test_fprintf_c( os, "                                               c       z \n" );
-	libblis_test_fprintf_c( os, "complex implementation                   %7s %7s\n",
-	                        bli_ind_oper_get_avail_impl_string( BLIS_GEMM, BLIS_SCOMPLEX ),
-	                        bli_ind_oper_get_avail_impl_string( BLIS_GEMM, BLIS_DCOMPLEX ) );
-	libblis_test_fprintf_c( os, "\n" );
-
-	// Query a native context. NOTE: Now that we've removed the dt argument from
-	// bli_gks_query_ind_cntx(), we can consolidate cntx_c and cntx_z; there is
-	// no need to query two contexts since they are the same.
-	cntx_c = ( cntx_t* )bli_gks_query_ind_cntx( im );
-	cntx_z = ( cntx_t* )bli_gks_query_ind_cntx( im );
-
-	libblis_test_fprintf_c( os, "level-3 blocksizes                             c       z \n" );
-	libblis_test_fprintf_c( os, "  mc                                     %7d %7d\n",
-	                        ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_MC, cntx_c ),
-	                        ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_MC, cntx_z ) );
-	libblis_test_fprintf_c( os, "  kc                                     %7d %7d\n",
-	                        ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_KC, cntx_c ),
-	                        ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_KC, cntx_z ) );
-	libblis_test_fprintf_c( os, "  nc                                     %7d %7d\n",
-	                        ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_NC, cntx_c ),
-	                        ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_NC, cntx_z ) );
-	libblis_test_fprintf_c( os, "\n" );
-	libblis_test_fprintf_c( os, "  mc maximum                             %7d %7d\n",
-	                        ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_MC, cntx_c ),
-	                        ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_MC, cntx_z ) );
-	libblis_test_fprintf_c( os, "  kc maximum                             %7d %7d\n",
-	                        ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_KC, cntx_c ),
-	                        ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_KC, cntx_z ) );
-	libblis_test_fprintf_c( os, "  nc maximum                             %7d %7d\n",
-	                        ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NC, cntx_c ),
-	                        ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NC, cntx_z ) );
-	libblis_test_fprintf_c( os, "\n" );
-	libblis_test_fprintf_c( os, "  mr                                     %7d %7d\n",
-	                        ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_MR, cntx_c ),
-	                        ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_MR, cntx_z ) );
-	libblis_test_fprintf_c( os, "  nr                                     %7d %7d\n",
-	                        ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_NR, cntx_c ),
-	                        ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_NR, cntx_z ) );
-	libblis_test_fprintf_c( os, "\n" );
-	libblis_test_fprintf_c( os, "  mr packdim                             %7d %7d\n",
-	                        ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_MR, cntx_c ),
-	                        ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_MR, cntx_z ) );
-	libblis_test_fprintf_c( os, "  nr packdim                             %7d %7d\n",
-	                        ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NR, cntx_c ),
-	                        ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NR, cntx_z ) );
-	libblis_test_fprintf_c( os, "\n" );
-	libblis_test_fprintf_c( os, "micro-kernel types                             c       z\n" );
-	libblis_test_fprintf_c( os, "  gemm                                   %7s %7s\n",
-	                        bli_info_get_gemm_ukr_impl_string( im, BLIS_SCOMPLEX ),
-	                        bli_info_get_gemm_ukr_impl_string( im, BLIS_DCOMPLEX ) );
-	libblis_test_fprintf_c( os, "  gemmtrsm_l                             %7s %7s\n",
-	                        bli_info_get_gemmtrsm_l_ukr_impl_string( im, BLIS_SCOMPLEX ),
-	                        bli_info_get_gemmtrsm_l_ukr_impl_string( im, BLIS_DCOMPLEX ) );
-	libblis_test_fprintf_c( os, "  gemmtrsm_u                             %7s %7s\n",
-	                        bli_info_get_gemmtrsm_u_ukr_impl_string( im, BLIS_SCOMPLEX ),
-	                        bli_info_get_gemmtrsm_u_ukr_impl_string( im, BLIS_DCOMPLEX ) );
-	libblis_test_fprintf_c( os, "  trsm_l                                 %7s %7s\n",
-	                        bli_info_get_trsm_l_ukr_impl_string( im, BLIS_SCOMPLEX ),
-	                        bli_info_get_trsm_l_ukr_impl_string( im, BLIS_DCOMPLEX ) );
-	libblis_test_fprintf_c( os, "  trsm_u                                 %7s %7s\n",
-	                        bli_info_get_trsm_u_ukr_impl_string( im, BLIS_SCOMPLEX ),
-	                        bli_info_get_trsm_u_ukr_impl_string( im, BLIS_DCOMPLEX ) );
-	libblis_test_fprintf_c( os, "\n" );
-
-	}
-
 	bli_ind_disable_all();
 
 	libblis_test_fprintf_c( os, "\n" );
@@ -1403,6 +1321,23 @@ void libblis_test_output_op_struct( FILE* os, test_op_t* op, char* op_str )
 
 
+void libblis_test_output_term_tag( void )
+{
+	// Emit the termination tag on stdout, framed by blank lines. Scripts can
+	// look for it to confirm that the testsuite ran to completion; an abort
+	// or segfault (which CI may not register as a failure) leaves it absent.
+	// Nothing is printed when the testsuite is running in quiet mode.
+	if ( !libblis_test_quiet_mode )
+	{
+		FILE* const out = stdout;
+		libblis_test_fprintf_c( out, "\n" );
+		libblis_test_fprintf_c( out, "Exiting normally.\n" );
+		libblis_test_fprintf_c( out, "\n" );
+	}
+}
+
+
+
 char* libblis_test_get_string_for_result( double    resid,
                                           num_t     dt,
                                           thresh_t* thresh )
@@ -1651,67 +1586,44 @@ void libblis_test_op_driver
 	unsigned int  p_first             = params->p_first;
 	unsigned int  p_max               = params->p_max;
 	unsigned int  p_inc               = params->p_inc;
-	unsigned int  mix_all_storage     = params->mix_all_storage;
-	unsigned int  mixed_domain        = params->mixed_domain;
-	unsigned int  mixed_precision     = params->mixed_precision;
 	unsigned int  reaction_to_failure = params->reaction_to_failure;
+	char*         d_spec_str          = params->datatype_char;
+
+	unsigned int  n_param_combos = 0;
+	unsigned int  n_store_combos = 0;
+	unsigned int  n_dt_combos    = 0;
+	char**        pc_str = NULL;
+	char**        sc_str = NULL;
+	char**        dc_str = NULL;
+
+	// Figure out how many operands and scalars we have.
+	// The number of scalars is only relevant for mixed-domain computations
+	// so assume 0 for any non-level-3 operations.
+	unsigned int n_operands = strlen( o_types );
+	unsigned int n_scalars = op->opid == BLIS_NOID ? 0 :
+	                         op->opid == BLIS_TRMM ? 1 :
+	                         op->opid == BLIS_TRSM ? 1 :
+	                         /* all other l3 ops */  2;
+
+	// Increment the number of operands by one to account for the
+	// computation precision (or computation datatype, as we will encode
+	// it in the char string).
+	unsigned int n_operands_all = n_operands + n_scalars + 1;
 
-	num_t         datatype;
-	num_t         dt_check;
-	char          dt_char;
-
-	char*         p_spec_str;
-	unsigned int  n_params;
-	char**        chars_for_param;
-	unsigned int  n_param_combos;
-	char**        pc_str;
-
-	char          s_spec_str[ MAX_NUM_OPERANDS + 1 ];
-	unsigned int  n_operands;
-	unsigned int  n_operandsp1;
-	char**        chars_for_storage;
-	unsigned int  n_store_combos;
-	char**        sc_str;
-
-	char          d_spec_str[ MAX_NUM_OPERANDS + 1 ];
-	char**        chars_for_spdt;
-	char**        chars_for_dpdt;
-	unsigned int  n_spdt_combos;
-	unsigned int  n_dpdt_combos;
-	unsigned int  n_dt_combos;
-	char**        dc_str;
-
-	char**        chars_for_dt;
-	char**        chars_for_rddt;
-	char**        chars_for_cddt;
-	unsigned int  n_rddt_combos;
-	unsigned int  n_cddt_combos;
-
-	unsigned int  p_cur, pi;
-	unsigned int  indi, pci, sci, dci, i, j, o;
-	unsigned int  is_mixed_dt;
-
-	double        perf, resid;
-	char*         pass_str;
-	char*         ind_str;
-	char          blank_str[32];
-	char          funcname_str[64];
-	char          dims_str[64];
-	char          label_str[128];
-	unsigned int  n_spaces;
-	unsigned int  n_dims_print;
-
-	FILE*         output_stream = NULL;
-
-	// These arrays are malloc()'ed in select branches. Here, we set
-	// them to NULL so they can be unconditionally free()'ed at the
-	// end of the function.
-	chars_for_rddt = NULL;
-	chars_for_cddt = NULL;
-	chars_for_spdt = NULL;
-	chars_for_dpdt = NULL;
+
+	// Mixed-precision/mixed-domain only works for level-3 operations
+	// *except* TRSM.
+	unsigned int mixed_domain    = params->mixed_domain;
+	unsigned int mixed_precision = params->mixed_precision;
+	if ( op->opid == BLIS_TRSM || !bli_opid_is_level3( op->opid ) )
+	{
+		mixed_precision = DISABLE;
+		mixed_domain    = DISABLE;
+	}
+	unsigned int is_mixed_dt = mixed_domain || mixed_precision;
 
 	// If output to files was requested, attempt to open a file stream.
+	FILE* output_stream = NULL;
 	if ( params->output_files )
 		libblis_test_fopen_ofile( op_str, iface, &output_stream );
 
@@ -1723,22 +1635,22 @@ void libblis_test_op_driver
 		bli_error_checking_level_set( BLIS_FULL_ERROR_CHECKING );
 
 	// Obtain the parameter specification (filter) string.
-	p_spec_str = op->params;
+	char* p_spec_str = op->params;
 
 	// Figure out how many parameters we have.
-	n_params = strlen( p_types );
+	unsigned int n_params = strlen( p_types );
 
-	if ( strlen( p_types ) != strlen( p_spec_str) )
+	if ( strlen( p_spec_str) != n_params )
 	{
 		libblis_test_printf_error( "Parameter specification string from input file does not match length of p_types string.\n" );
 	}
 
 	// Allocate an array that stores pointers to the sets of possible parameter
 	// chars for each parameter.
-	chars_for_param = ( char** ) malloc( n_params * sizeof( char* ) );
+	char** chars_for_param = ( char** ) malloc( n_params * sizeof( char* ) );
 
 	// Set the values in chars_for_param to the appropriate string addresses.
-	for ( i = 0; i < n_params; ++i )
+	for ( unsigned int i = 0; i < n_params; ++i )
 	{
 		param_t param_type = libblis_test_get_param_type_for_char( p_types[i] );
 		chars_for_param[i] = libblis_test_param_chars[ param_type ];
@@ -1752,7 +1664,7 @@ void libblis_test_op_driver
 	// Allocate an array of parameter combination strings, one for each
 	// parameter combination that needs to be tested.
 	pc_str = ( char** ) malloc( n_param_combos * sizeof( char* ) );
-	for ( i = 0; i < n_param_combos; ++i )
+	for ( unsigned int i = 0; i < n_param_combos; ++i )
 		pc_str[i] = ( char* ) malloc( ( n_params + 1 ) * sizeof( char ) );
 
 	// Fill the parameter combination strings in pc_str with the parameter
@@ -1763,30 +1675,29 @@ void libblis_test_op_driver
 	                                 n_param_combos,
 	                                 pc_str );
 
-
-
-	// Figure out how many operands we have.
-	n_operands = strlen( o_types );
-
 	// If we are testing a micro-kernel, unconditionally disable the
-	// "mix all storage" option.
+	// "mix all storage" and mixed-precision/mixed-domain options.
+	unsigned int mix_all_storage = params->mix_all_storage;
 	if ( iface == BLIS_TEST_SEQ_UKERNEL )
+	{
 		mix_all_storage = DISABLE;
+	}
 
 	// Enumerate all combinations of storage schemes requested.
 	if ( mix_all_storage )
 	{
 		// Fill storage specification string with wildcard chars.
-		for ( i = 0; i < n_operands; ++i ) s_spec_str[i] = '?';
-		s_spec_str[i] = '\0';
+		char s_spec_str[ MAX_NUM_OPERANDS + 1 ];
+		for ( unsigned int i = 0; i < n_operands; ++i ) s_spec_str[i] = '?';
+		s_spec_str[n_operands] = '\0';
 
 		// Allocate an array that stores pointers to the sets of possible
 		// storage chars for each operand.
-		chars_for_storage = ( char** ) malloc( n_operands * sizeof( char* ) );
+		char** chars_for_storage = ( char** ) malloc( n_operands * sizeof( char* ) );
 
 		// Set the values in chars_for_storage to the address of the string
 		// that holds the storage chars.
-		for ( i = 0; i < n_operands; ++i )
+		for ( unsigned int i = 0; i < n_operands; ++i )
 		{
 			operand_t operand_type = libblis_test_get_operand_type_for_char( o_types[i] );
 			chars_for_storage[i] = libblis_test_store_chars[ operand_type ];
@@ -1800,7 +1711,7 @@ void libblis_test_op_driver
 		// Allocate an array of storage combination strings, one for each
 		// storage combination that needs to be tested.
 		sc_str = ( char** ) malloc( n_store_combos * sizeof( char* ) );
-		for ( sci = 0; sci < n_store_combos; ++sci )
+		for ( unsigned int sci = 0; sci < n_store_combos; ++sci )
 			sc_str[sci] = ( char* ) malloc( ( n_operands + 1 ) * sizeof( char ) );
 
 
@@ -1820,7 +1731,7 @@ void libblis_test_op_driver
 		unsigned int n_mat_operands = 0;
 		unsigned int n_vec_operands = 0;
 
-		for ( o = 0; o < n_operands; ++o )
+		for ( unsigned int o = 0; o < n_operands; ++o )
 		{
 			operand_t operand_type
 			          = libblis_test_get_operand_type_for_char( o_types[o] );
@@ -1831,33 +1742,30 @@ void libblis_test_op_driver
 		// We compute the total number of storage combinations based on whether
 		// the current operation has only matrix operands, only vector operands,
 		// or both.
-		if      ( n_vec_operands == 0 )
+		if ( n_vec_operands == 0 )
 		{
-			n_store_combos = n_mstorage;
 			n_vstorage = 1;
 		}
-		else if ( n_mat_operands == 0 )
+
+		if ( n_mat_operands == 0 )
 		{
-			n_store_combos = n_vstorage;
 			n_mstorage = 1;
 		}
-		else
-		{
-			n_store_combos = n_mstorage * n_vstorage;
-		}
+
+		n_store_combos = n_mstorage * n_vstorage;
 
 		sc_str = ( char** ) malloc( n_store_combos * sizeof( char* ) );
 
-		for ( j = 0; j < n_mstorage; ++j )
+		for ( unsigned int j = 0; j < n_mstorage; ++j )
 		{
-			for ( i = 0; i < n_vstorage; ++i )
+			for ( unsigned int i = 0; i < n_vstorage; ++i )
 			{
-				sci = j*n_vstorage + i;
+				unsigned int sci = j*n_vstorage + i;
 
 				sc_str[ sci ]
 				= ( char* ) malloc( ( n_operands + 1 ) * sizeof( char ) );
 
-				for ( o = 0; o < n_operands; ++o )
+				for ( unsigned int o = 0; o < n_operands; ++o )
 				{
 					unsigned int ij;
 					operand_t    operand_type
@@ -1873,302 +1781,143 @@ void libblis_test_op_driver
 		}
 	}
 
-	// Enumerate all combinations of datatypes requested, but only for the
-	// gemm operation.
-
-	if      ( !mixed_domain &&  mixed_precision && op->opid == BLIS_GEMM )
-	{
-		is_mixed_dt = TRUE;
-
-		// Increment the number of operands by one to account for the
-		// computation precision (or computation datatype, as we will encode
-		// it in the char string).
-		n_operandsp1 = n_operands + 1;
+	// Enumerate all combinations of datatypes requested.
 
-		unsigned int has_rd = libblis_test_dt_str_has_rd_char( params );
-		unsigned int has_cd = libblis_test_dt_str_has_cd_char( params );
+	// Keep track of the capacity of the array of datatype strings. Start with
+	// at least enough storage for non-mixed-precision/-domain cases.
+	unsigned int n_dt_combos_max = n_datatypes;
 
-		// Fill datatype specification string with wildcard chars.
-		for ( i = 0; i < n_operandsp1; ++i ) d_spec_str[i] = '?';
-		d_spec_str[i] = '\0';
-
-		// Allocate an array that stores pointers to the sets of possible
-		// datatype chars for each operand.
-		chars_for_rddt = ( char** ) malloc( n_operandsp1 * sizeof( char* ) );
-		chars_for_cddt = ( char** ) malloc( n_operandsp1 * sizeof( char* ) );
-
-		// Set the values in chars_for_rddt/cddt to the address of the string
-		// that holds the datatype chars.
-		for ( i = 0; i < n_operandsp1; ++i )
-		{
-			chars_for_rddt[i] = libblis_test_rd_chars;
-			chars_for_cddt[i] = libblis_test_cd_chars;
-		}
+	// Allocate an array of datatype combination strings, one for each
+	// datatype combination that needs to be tested. Initially it will have only
+	// one entry, and we will add entries as we go, growing the char** array
+	// as necessary.
+	dc_str = ( char** ) malloc( n_dt_combos_max * sizeof( char* ) );
+	n_dt_combos = 0;
 
-		// Set the last set of chars in chars_for_cddt to the real domain
-		// charset. This is because the last char will be the computation
-		// precision.
-		chars_for_cddt[i-1] = libblis_test_rd_chars;
+	// Compute the total number of datatype combinations to test according to
+	// the list of allowed datatypes specified by the user and whether mixed-
+	// precision/mixed-domain computation is allowed. The computational datatype
+	// is always real, while scalars are always double precision, real if all
+	// operands are real, and complex (or both real and complex combinations for
+	// mixed-domain computations) if any operand is complex.
 
-		// Compute the total number of datatype combinations to test (which is
-		// simply the product of the string lengths of chars_for_spdt/dpdt[i]).
-		// NOTE: We skip inspecting/branching off of the d_spec_str chars since
-		// we know they are all '?'.
-		n_rddt_combos = 0; n_cddt_combos = 0;
+	// This is the current set of datatype characters encoded as offsets in d_spec_str,
+	// initialized to the first allowed datatype for each operand and scalar.
+	int* idx = ( int* ) malloc( n_operands * sizeof( int ) );
+	for ( unsigned int i = 0; i < n_operands; i++ )
+		idx[ i ] = 0;
 
-		if ( has_rd )
-			n_rddt_combos = libblis_test_count_combos( n_operandsp1, d_spec_str,
-			                                           chars_for_rddt );
+	// This is the current set of datatype characters after re-encoding as chars.
+	char* tmp = ( char* ) malloc( ( n_operands_all + 1 ) * sizeof( char ) );
+	tmp[ n_operands_all ] = '\0';
 
-		if ( has_cd )
-			n_cddt_combos = libblis_test_count_combos( n_operandsp1, d_spec_str,
-			                                           chars_for_cddt );
-
-		// Add real and complex domain combinations.
-		n_dt_combos = n_rddt_combos + n_cddt_combos;
-
-		// Allocate an array of datatype combination strings, one for each
-		// datatype combination that needs to be tested.
-		dc_str = ( char** ) malloc( n_dt_combos * sizeof( char* ) );
-		for ( dci = 0; dci < n_dt_combos; ++dci )
-			dc_str[dci] = ( char* ) malloc( ( n_operandsp1 + 1 ) * sizeof( char ) );
-
-		char** dc_str_p = dc_str;
-
-		// Fill the datatype combination strings in dc_str with the datatype
-		// combinations implied by chars_for_rddt/cddt.
-		if ( has_rd )
-		{
-			libblis_test_fill_param_strings( d_spec_str,
-			                                 chars_for_rddt,
-			                                 n_operandsp1,
-			                                 n_rddt_combos,
-			                                 dc_str_p );
-			dc_str_p += n_rddt_combos;
-		}
-		if ( has_cd )
-		{
-			libblis_test_fill_param_strings( d_spec_str,
-			                                 chars_for_cddt,
-			                                 n_operandsp1,
-			                                 n_cddt_combos,
-			                                 dc_str_p );
-			dc_str_p += n_cddt_combos;
-		}
-
-#if 0
-		printf( "n_rddt_combos = %d\n", n_rddt_combos );
-		printf( "n_cddt_combos = %d\n", n_cddt_combos );
-		printf( "n_dt_combos   = %d\n\n", n_dt_combos );
-
-		for ( dci = 0; dci < n_dt_combos; ++dci )
-			printf( "dc_str[%2d] = %s\n", dci, dc_str[dci] );
-
-		bli_abort();
-#endif
-	}
-	else if (  mixed_domain && !mixed_precision && op->opid == BLIS_GEMM )
+	for ( unsigned int ops_done = FALSE; !ops_done; )
 	{
-		is_mixed_dt = TRUE;
-
-		// Increment the number of operands by one to account for the
-		// computation precision (or computation datatype, as we will encode
-		// it in the char string).
-		n_operandsp1 = n_operands + 1;
-
-		unsigned int has_sp = libblis_test_dt_str_has_sp_char( params );
-		unsigned int has_dp = libblis_test_dt_str_has_dp_char( params );
+		// Translate the offsets in d_spec_str into actual type characters.
+		for ( unsigned int i = 0; i < n_operands; i++ )
+			tmp[ i ] = d_spec_str[ idx[ i ] ];
 
-		// Fill datatype specification string with wildcard chars.
-		for ( i = 0; i < n_operands; ++i ) d_spec_str[i] = '?';
-		d_spec_str[i] = '\0';
-
-		// Allocate an array that stores pointers to the sets of possible
-		// datatype chars for each operand (plus the computation precision
-		// char).
-		chars_for_spdt = ( char** ) malloc( n_operands * sizeof( char* ) );
-		chars_for_dpdt = ( char** ) malloc( n_operands * sizeof( char* ) );
-
-		// Set the values in chars_for_spdt/dpdt to the address of the string
-		// that holds the datatype chars.
-		for ( i = 0; i < n_operands; ++i )
+		// Check that the requested constraints on matching precision and/or
+		// domain are satisfied if mixed computations are not specified.
+		unsigned int is_mixed_precision = FALSE;
+		unsigned int is_mixed_domain    = FALSE;
+		for ( unsigned int i = 1; i < n_operands; i++ )
 		{
-			chars_for_spdt[i] = libblis_test_sp_chars;
-			chars_for_dpdt[i] = libblis_test_dp_chars;
-		}
-
-		// Compute the total number of datatype combinations to test (which is
-		// simply the product of the string lengths of chars_for_spdt/dpdt[i]).
-		// NOTE: We skip inspecting/branching off of the d_spec_str chars since
-		// we know they are all '?'.
-		n_spdt_combos = 0; n_dpdt_combos = 0;
-
-		if ( has_sp )
-			n_spdt_combos = libblis_test_count_combos( n_operands, d_spec_str,
-			                                           chars_for_spdt );
+			if ( libblis_test_dt_str_has_rd_char_str( 1, tmp+0 ) !=
+			     libblis_test_dt_str_has_rd_char_str( 1, tmp+i ) )
+				is_mixed_domain = TRUE;
 
-		if ( has_dp )
-			n_dpdt_combos = libblis_test_count_combos( n_operands, d_spec_str,
-			                                           chars_for_dpdt );
-
-		// Add single- and double-precision combinations.
-		n_dt_combos = n_spdt_combos + n_dpdt_combos;
-
-		// Allocate an array of datatype combination strings, one for each
-		// datatype combination that needs to be tested.
-		dc_str = ( char** ) malloc( n_dt_combos * sizeof( char* ) );
-		for ( dci = 0; dci < n_dt_combos; ++dci )
-			dc_str[dci] = ( char* ) malloc( ( n_operandsp1 + 1 ) * sizeof( char ) );
-
-		char** dc_str_p = dc_str;
-
-		// Fill the datatype combination strings in dc_str with the datatype
-		// combinations implied by chars_for_spdt/dpdt.
-		if ( has_sp )
-		{
-			libblis_test_fill_param_strings( d_spec_str,
-			                                 chars_for_spdt,
-			                                 n_operands,
-			                                 n_spdt_combos,
-			                                 dc_str_p );
-			dc_str_p += n_spdt_combos;
-		}
-		if ( has_dp )
-		{
-			libblis_test_fill_param_strings( d_spec_str,
-			                                 chars_for_dpdt,
-			                                 n_operands,
-			                                 n_dpdt_combos,
-			                                 dc_str_p );
-			dc_str_p += n_dpdt_combos;
+			if ( libblis_test_dt_str_has_sp_char_str( 1, tmp+0 ) !=
+			     libblis_test_dt_str_has_sp_char_str( 1, tmp+i ) )
+				is_mixed_precision = TRUE;
 		}
 
-		// Manually set the computation char to the real projection of the
-		// first char of each combination.
-		int prec_i = n_operands;
-		for ( i = 0; i < n_dt_combos; ++i )
+		// Only do combinations of datatypes which are allowed
+		if ( !( is_mixed_precision && !mixed_precision ) &&
+		     !( is_mixed_domain    && !mixed_domain    ) )
 		{
-			dc_str[i][prec_i]   = libblis_test_proj_dtchar_to_precchar( dc_str[i][0] );
-			dc_str[i][prec_i+1] = '\0';
-		}
+			// Set the initial scalar datatypes as double and real if all operands
+			// are real or if mixed-domain computation is requested.
+			int has_cd = libblis_test_dt_str_has_dp_char_str( n_operands, tmp );
+			for ( unsigned int i = n_operands; i < n_operands_all-1; i++ )
+				tmp[ i ] = !has_cd || mixed_domain ? 'd' : 'z';
 
-#if 0
-		printf( "n_spdt_combos = %d\n", n_spdt_combos );
-		printf( "n_dpdt_combos = %d\n", n_dpdt_combos );
-		printf( "n_dt_combos   = %d\n\n", n_dt_combos );
-
-		for ( dci = 0; dci < n_dt_combos; ++dci )
-			printf( "dc_str[%2d] = %s\n", dci, dc_str[dci] );
+			for ( unsigned int scalars_done = FALSE; !scalars_done; )
+			{
+				// Determine which computational precisions to test. If mixed-precision
+				// computation is requested then this is all real precisions, otherwise
+				// it is the precision of the input operands.
+				num_t prec_min = 0;
+				num_t prec_max = strlen( libblis_test_rd_chars );
+				if ( !mixed_precision )
+				{
+					tmp[ n_operands_all-1 ] = libblis_test_proj_dtchar_to_precchar( tmp[0] );
+					prec_max = 1;
+				}
 
-		bli_abort();
-#endif
-	}
-	else if (  mixed_domain &&  mixed_precision && op->opid == BLIS_GEMM )
-	{
-		is_mixed_dt = TRUE;
+				for ( unsigned int dci = prec_min; dci < prec_max; dci++ )
+				{
+					if ( mixed_precision )
+						tmp[ n_operands_all-1 ] = libblis_test_rd_chars[ dci ];
 
-		// Increment the number of operands by one to account for the
-		// computation precision (or computation datatype, as we will encode
-		// it in the char string).
-		n_operandsp1 = n_operands + 1;
+					// If there is no more space available in dc_str then
+					// reallocate to a larger size.
+					if ( n_dt_combos == n_dt_combos_max )
+					{
+						n_dt_combos_max *= 2;
+						dc_str = ( char** ) realloc( dc_str, n_dt_combos_max * sizeof( char* ) );
+					}
 
-		// Fill datatype specification string with wildcard chars.
-		for ( i = 0; i < n_operandsp1; ++i ) d_spec_str[i] = '?';
-		d_spec_str[i] = '\0';
+					// Add this datatype combination to the list.
+					dc_str[ n_dt_combos ] = ( char* ) malloc( ( n_operands_all + 1 ) * sizeof( char ) );
+					strcpy( dc_str[ n_dt_combos ], tmp );
+					n_dt_combos++;
+				}
 
-		// Allocate an array that stores pointers to the sets of possible
-		// datatype chars for each operand.
-		chars_for_dt = ( char** ) malloc( n_operandsp1 * sizeof( char* ) );
+				// Only loop over real/complex combinations of scalars
+				// when mixed-domain computation is requested and at least
+				// one operand is complex.
+				if ( !mixed_domain || !has_cd )
+					break;
 
-		// Set the values in chars_for_rddt/cddt to the address of the string
-		// that holds the datatype chars.
-		for ( i = 0; i < n_operandsp1; ++i )
-		{
-			chars_for_dt[i] = libblis_test_dt_chars;
+				// Go through the scalar datatypes and increment to the next combination.
+				for ( unsigned int i = 0; i < n_scalars; i++ )
+				{
+					if ( tmp[ n_operands+i ] == 'd' )
+					{
+						tmp[ n_operands+i ] = 'z';
+						break;
+					}
+					else /* tmp[ n_operands+j ] == 'z' */
+					{
+						tmp[ n_operands+i ] = 'd';
+						if ( i == n_scalars-1 )
+							scalars_done = TRUE;
+					}
+				}
+			}
 		}
 
-		// Set the last set of chars in chars_for_dt to the real domain
-		// charset. This is because the last char will be the computation
-		// precision, with the computation domain implied by the operands'
-		// storage datatypes.
-		chars_for_dt[i-1] = libblis_test_rd_chars;
-
-		// Compute the total number of datatype combinations to test (which is
-		// simply the product of the string lengths of chars_for_dt[i]).
-		// NOTE: We skip inspecting/branching off of the d_spec_str chars since
-		// we know they are all '?'.
-		n_dt_combos = libblis_test_count_combos( n_operandsp1, d_spec_str,
-		                                         chars_for_dt );
-
-		// Allocate an array of datatype combination strings, one for each
-		// datatype combination that needs to be tested.
-		dc_str = ( char** ) malloc( n_dt_combos * sizeof( char* ) );
-		for ( dci = 0; dci < n_dt_combos; ++dci )
-			dc_str[dci] = ( char* ) malloc( ( n_operandsp1 + 1 ) * sizeof( char ) );
-
-		// Fill the datatype combination strings in dc_str with the datatype
-		// combinations implied by chars_for_rddt/cddt.
-		libblis_test_fill_param_strings( d_spec_str,
-		                                 chars_for_dt,
-		                                 n_operandsp1,
-		                                 n_dt_combos,
-		                                 dc_str );
-
-#if 0
-		printf( "n_dt_combos   = %d\n\n", n_dt_combos );
-
-		for ( dci = 0; dci < n_dt_combos; ++dci )
-			printf( "dc_str[%3d] = %s\n", dci, dc_str[dci] );
-
-		bli_abort();
-#endif
-	}
-	else // ( ( !mixed_domain && !mixed_precision ) || op->opid != BLIS_GEMM )
-	{
-		is_mixed_dt = FALSE;
-
-		// Increment the number of operands by one to account for the
-		// computation precision (or computation datatype, as we will encode
-		// it in the char string).
-		n_operandsp1 = n_operands + 1;
-
-		// Since we are not mixing domains, we only consider n_datatype
-		// datatype combinations, where each combination is actually
-		// homogeneous (e.g. "sss", "ddd", etc., if n_operands == 3).
-		n_dt_combos = n_datatypes;
-
-		// Allocate an array of datatype combination strings, one for each
-		// datatype specified.
-		dc_str = ( char** ) malloc( n_dt_combos * sizeof( char* ) );
-		for ( dci = 0; dci < n_dt_combos; ++dci )
-			dc_str[dci] = ( char* ) malloc( ( n_operandsp1 + 1 ) * sizeof( char ) );
-
-		// Fill each datatype combination string with the same dt char for
-		// each operand in the current operation.
-		for ( dci = 0; dci < n_dt_combos; ++dci )
+		// Go through the operand datatype indices and increment to the next combination.
+		for ( unsigned int i = 0; i < n_operands; i++ )
 		{
-			dt_char = params->datatype_char[dci];
-
-			for ( i = 0; i < n_operands; ++i )
-				dc_str[dci][i] = dt_char;
+			idx[ i ]++;
 
-			// Encode the computation precision as the last char.
-			dc_str[dci][i] = libblis_test_proj_dtchar_to_precchar( dc_str[dci][0] );
-
-			dc_str[dci][i+1] = '\0';
+			if ( idx[ i ] < n_datatypes )
+			{
+				break;
+			}
+			else
+			{
+				idx[ i ] = 0;
+				if ( i == n_operands-1 )
+					ops_done = TRUE;
+			}
 		}
-
-#if 0
-		printf( "n_dt_combos   = %d\n\n", n_dt_combos );
-
-		for ( dci = 0; dci < n_dt_combos; ++dci )
-			printf( "dc_str[%3d] = %s\n", dci, dc_str[dci] );
-
-		bli_abort();
-#endif
 	}
 
-
+	free( tmp );
+	free( idx );
 
 	// These statements should only be executed by one thread.
 	if ( tdata->id == 0 )
@@ -2189,45 +1938,32 @@ void libblis_test_op_driver
 		}
 	}
 
+	// Wait here so that other threads don't start printing yet.
+	bli_pthread_barrier_wait( tdata->barrier );
 
 	// Loop over the requested storage schemes.
-	for ( sci = 0; sci < n_store_combos; ++sci )
-	//for ( sci = 0; sci < 5; ( sci == 0 || sci == 2 ? sci+=2 : ++sci ) )
-	//for ( sci = 0; sci < 5; ( sci == 2 ? sci+=2 : ++sci ) )
-	//for ( sci = 3; sci < 8; ( sci == 3 ? sci+=2 : ++sci ) )
-	//for ( sci = 0; sci < 1; ++sci )
-	//for ( sci = 7; sci < 8; ++sci )
+	for ( unsigned int sci = 0; sci < n_store_combos; ++sci )
 	{
 		// Loop over the requested datatypes.
-		for ( dci = 0; dci < n_dt_combos; ++dci )
-		//for ( dci = 14; dci < 15; ++dci )
-		//for ( dci = 6; dci < 7; dci += 1 )
-		//for ( dci = 12; dci < 13; ++dci )
-		//for ( dci = 4; dci < 5; ++dci )
-		//for ( dci = 8; dci < 9; ++dci )
-		//for ( dci = 0; dci < 1; ++dci )
+		for ( unsigned int dci = 0; dci < n_dt_combos; ++dci )
 		{
 			// We need a datatype to use for induced method related things
 			// as well as to decide which set of residual thresholds to use.
 			// We must choose the first operand's dt char since that's the
 			// only operand we know is guaranteed to exist.
+			num_t datatype;
 			bli_param_map_char_to_blis_dt( dc_str[dci][0], &datatype );
-			dt_check = datatype;
+			num_t dt_check = datatype;
 
-			int has_sp = libblis_test_dt_str_has_sp_char_str( n_operandsp1,
-			                                                  dc_str[dci] );
-			int has_dp = libblis_test_dt_str_has_dp_char_str( n_operandsp1,
+			int has_sp = libblis_test_dt_str_has_sp_char_str( n_operands_all,
 			                                                  dc_str[dci] );
-			int has_samep = (has_sp && !has_dp ) ||
-			                (has_dp && !has_sp );
 
 			// Notice that we use n_operands here instead of
-			// n_operandsp1 since we only want to chars for the
+			// n_operands_all since we only want the chars for the
 			// storage datatypes of the matrix operands, not the
 			// computation precision char.
-			int has_cd_only =
-			!libblis_test_dt_str_has_rd_char_str( n_operands,
-			                                      dc_str[dci] );
+			int has_cd_only = !libblis_test_dt_str_has_rd_char_str( n_operands,
+			                                                        dc_str[dci] );
 
 			if ( has_sp )
 			{
@@ -2238,6 +1974,7 @@ void libblis_test_op_driver
 			}
 
 			// Build a commented column label string.
+			char label_str[128];
 			libblis_test_build_col_labels_string( params, op, label_str );
 
 			// These statements should only be executed by one thread.
@@ -2262,44 +1999,27 @@ void libblis_test_op_driver
 				ind_first = 0;
 
 			// Loop over induced methods (or just BLIS_NAT).
-			for ( indi = ind_first; indi <= ind_last; ++indi )
+			for ( unsigned int indi = ind_first; indi <= ind_last; ++indi )
 			{
-				// If the current datatype is real, OR if the current
-				// induced method is implemented (for the operation
-				// being tested) AND it was requested, then we enable
-				// ONLY that method and proceed. Otherwise, we skip the
+				// If the current induced method is not implemented (for the operation
+				// being tested) OR it was not requested, we skip the
 				// current method and go to the next method.
-				if ( bli_is_real( datatype ) ) { ; }
-				else if ( bli_ind_oper_is_impl( op->opid, indi ) &&
-				          params->ind_enable[ indi ] == 1 )
-				{
-					// If the current induced method is 1m, make sure that
-					// we only proceed for gemm where all operands are stored
-					// in the complex domain. (This prevents 1m from being
-					// executed on mixed-datatype combinations that contain
-					// real domain datatypes.)
-					if ( indi == BLIS_1M )
-					{
-						if      ( op->opid == BLIS_GEMM && has_cd_only ) { ; }
-						else if ( has_samep && has_cd_only ) { ; }
-						else { continue; }
-					}
-					else { ; }
-				}
-				else { continue; }
+				if ( !bli_ind_oper_is_impl( op->opid, indi ) ||
+				     params->ind_enable[ indi ] == 0 )
+					continue;
 
 				bli_ind_oper_enable_only( op->opid, indi, datatype );
 
 				// Query the implementation string associated with the
 				// current operation and datatype. If the operation is
 				// not level-3, we will always get back the native string.
-				ind_str = ( char* )bli_ind_oper_get_avail_impl_string( op->opid, datatype );
+				char* ind_str = ( char* )bli_ind_oper_get_avail_impl_string( op->opid, datatype );
 
 				// Loop over the requested parameter combinations.
-				for ( pci = 0; pci < n_param_combos; ++pci )
+				for ( unsigned int pci = 0; pci < n_param_combos; ++pci )
 				{
 					// Loop over the requested problem sizes.
-					for ( p_cur = p_first, pi = 1; p_cur <= p_max; p_cur += p_inc, ++pi )
+					for ( unsigned int p_cur = p_first, pi = 1; p_cur <= p_max; p_cur += p_inc, ++pi )
 					{
 						// Skip this experiment (for this problem size) according to
 						// to the counter, number of threads, and thread id.
@@ -2312,6 +2032,7 @@ void libblis_test_op_driver
 						// Call the given experiment function. perf and resid will
 						// contain the resulting performance and residual values,
 						// respectively.
+						double perf, resid;
 						f_exp( params,
 						       op,
 						       iface,
@@ -2327,12 +2048,13 @@ void libblis_test_op_driver
 
 						// Query the string corresponding to the residual's
 						// position relative to the thresholds.
-						pass_str = libblis_test_get_string_for_result( resid,
-						                                               dt_check,
-						                                               thresh );
+						char* pass_str = libblis_test_get_string_for_result( resid,
+						                                                     dt_check,
+						                                                     thresh );
 
 						// Build a string unique to the operation, datatype combo,
 						// parameter combo, and storage combo being tested.
+						char funcname_str[64];
 						libblis_test_build_function_string( BLIS_FILEDATA_PREFIX_STR,
 						                                    op->opid,
 						                                    indi,
@@ -2345,16 +2067,12 @@ void libblis_test_op_driver
 						                                    sc_str[sci],
 						                                    funcname_str );
 
-						// Compute the number of spaces we have left to fill given
-						// length of our operation's name.
-						n_spaces = MAX_FUNC_STRING_LENGTH - strlen( funcname_str );
-						fill_string_with_n_spaces( blank_str, n_spaces );
-
 						// Print all dimensions to a single string.
+						char dims_str[64];
 						libblis_test_build_dims_string( op, p_cur, dims_str );
 
 						// Count the number of dimensions that were printed to the string.
-						n_dims_print = libblis_test_get_n_dims_from_string( dims_str );
+						unsigned int n_dims_print = libblis_test_get_n_dims_from_string( dims_str );
 
 						// Output the results of the test. Use matlab format if requested.
 						// NOTE: Here we use fprintf() over libblis_test_fprintf() so
@@ -2365,8 +2083,8 @@ void libblis_test_op_driver
 						if ( params->output_matlab_format )
 						{
 							fprintf( stdout,
-							         "%s%s( %3u, 1:%u ) = [%s  %7.2lf  %8.2le ]; %c %s\n",
-							         funcname_str, blank_str, pi, n_dims_print + 2,
+							         "%-*s( %3u, 1:%u ) = [%s  %7.2lf  %8.2le ]; %c %s\n",
+							         MAX_FUNC_STRING_LENGTH, funcname_str, pi, n_dims_print + 2,
 							         dims_str, perf, resid,
 							         OUTPUT_COMMENT_CHAR,
 							         pass_str );
@@ -2375,8 +2093,8 @@ void libblis_test_op_driver
 							// opened).
 							if ( output_stream )
 							fprintf( output_stream,
-							         "%s%s( %3u, 1:%u ) = [%s  %7.2lf  %8.2le ]; %c %s\n",
-							         funcname_str, blank_str, pi, n_dims_print + 2,
+							         "%-*s( %3u, 1:%u ) = [%s  %7.2lf  %8.2le ]; %c %s\n",
+							         MAX_FUNC_STRING_LENGTH, funcname_str, pi, n_dims_print + 2,
 							         dims_str, perf, resid,
 							         OUTPUT_COMMENT_CHAR,
 							         pass_str );
@@ -2384,8 +2102,8 @@ void libblis_test_op_driver
 						else
 						{
 							fprintf( stdout,
-							         "%s%s      %s  %7.2lf   %8.2le   %s\n",
-							         funcname_str, blank_str,
+							         "%-*s      %s  %7.2lf   %8.2le   %s\n",
+							         MAX_FUNC_STRING_LENGTH, funcname_str,
 							         dims_str, perf, resid,
 							         pass_str );
 
@@ -2393,8 +2111,8 @@ void libblis_test_op_driver
 							// opened).
 							if ( output_stream )
 							fprintf( output_stream,
-							         "%s%s      %s  %7.2lf   %8.2le   %s\n",
-							         funcname_str, blank_str,
+							         "%-*s      %s  %7.2lf   %8.2le   %s\n",
+							         MAX_FUNC_STRING_LENGTH, funcname_str,
 							         dims_str, perf, resid,
 							         pass_str );
 						}
@@ -2435,39 +2153,29 @@ void libblis_test_op_driver
 		}
 	}
 
-
 	// Free the array that stored pointers to the sets of possible parameter
 	// chars for each parameter.
 	free( chars_for_param );
 
 	// Free the parameter combination strings and then the master pointer.
-	for ( pci = 0; pci < n_param_combos; ++pci )
+	for ( unsigned int pci = 0; pci < n_param_combos; ++pci )
 		free( pc_str[pci] );
 	free( pc_str );
 
 	// Free the storage combination strings and then the master pointer.
-	for ( sci = 0; sci < n_store_combos; ++sci )
+	for ( unsigned int sci = 0; sci < n_store_combos; ++sci )
 		free( sc_str[sci] );
 	free( sc_str );
 
-	// Free some auxiliary arrays used by the mixed-domain/mixed-precision
-	// datatype-handling logic.
-	free( chars_for_rddt );
-	free( chars_for_cddt );
-	free( chars_for_spdt );
-	free( chars_for_dpdt );
-
 	// Free the datatype combination strings and then the master pointer.
-	for ( dci = 0; dci < n_dt_combos; ++dci )
+	for ( unsigned int dci = 0; dci < n_dt_combos; ++dci )
 		free( dc_str[dci] );
 	free( dc_str );
 
-
 	// If the file was opened (successfully), close the output stream.
 	if ( output_stream )
 		libblis_test_fclose_ofile( output_stream );
 
-
 	// Mark this operation as done.
 	if ( tdata->id == 0 )
 		op->test_done = TRUE;
@@ -2498,7 +2206,7 @@ void libblis_test_build_function_string
 	// We only print the full datatype combination string if is_mixed_dt
 	// is set and the operation is gemm. Otherwise, we print only
 	// the first char (since they are all the same).
-	if ( is_mixed_dt == TRUE && opid == BLIS_GEMM )
+	if ( is_mixed_dt == TRUE )
 		sprintf( funcname_str, "%s_%s%s", prefix_str, dc_str, op_str );
 	else
 		sprintf( funcname_str, "%s_%c%s", prefix_str, dc_str[0], op_str );
@@ -2600,8 +2308,8 @@ void libblis_test_build_col_labels_string( test_params_t* params, test_op_t* op,
 		                                            "<dt><op>_<stor>         " );
 	}
 
-	if ( params->output_matlab_format ) n_spaces = 11;
-	else                                n_spaces = 1;
+	if ( params->output_matlab_format ) n_spaces = MAX_FUNC_STRING_LENGTH - 15;
+	else                                n_spaces = MAX_FUNC_STRING_LENGTH - 25;
 
 	fill_string_with_n_spaces( blank_str, n_spaces );
 
@@ -2712,35 +2420,43 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c
 
 thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx )
 {
-	bool does_inv_diag;
+	static packm_ker_ft GENARRAY2_MIXP(packm_struc_cxk,packm_struc_cxk);
 
+	bool does_inv_diag;
 	if ( inv_diag == BLIS_NO_INVERT_DIAG ) does_inv_diag = FALSE;
 	else                                   does_inv_diag = TRUE;
 
 	rntm_t rntm = BLIS_RNTM_INITIALIZER;
 
+	num_t dt = bli_obj_dt( a );
+
 	// Create a control tree node for the packing operation.
-	cntl_t* cntl = bli_packm_cntl_create_node
+	packm_def_cntl_t cntl;
+	bli_packm_def_cntl_init_node
 	(
-	  NULL, // pass NULL as the pool so that malloc() is used.
-	  NULL, // func ptr is not referenced b/c we don't call via l3 _int().
-	  bmult_id_m,
-	  bmult_id_n,
+	  NULL, // func ptr is not referenced b/c we don't call via bli_l3_int().
+	  dt,
+	  dt,
+	  dt,
+	  packm_struc_cxk[ dt ][ dt ],
+	  bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx ),
+	  bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx ),
+	  1,
+	  1,
+	  1,
+	  bli_cntx_get_blksz_def_dt( dt, bmult_id_n, cntx ),
 	  does_inv_diag,
 	  FALSE,
 	  FALSE,
 	  pack_schema,
 	  pack_buf,
-	  NULL  // no child node needed
+	  &cntl
 	);
 
-	thrinfo_t* thread = bli_l3_thrinfo_create( 0, &BLIS_SINGLE_COMM, NULL, &rntm, cntl );
+	thrinfo_t* thread = bli_l3_thrinfo_create( 0, &BLIS_SINGLE_COMM, NULL, &rntm, ( cntl_t* )&cntl );
 
 	// Pack the contents of A to P.
-	bli_packm_blk_var1( a, p, cntx, cntl, thread );
-
-	// Free the control tree.
-	bli_l3_cntl_free( NULL, cntl );
+	bli_packm_blk_var1( a, p, cntx, ( cntl_t* )&cntl, thread );
 
 	// Return the thread control tree pointer so the caller can free the thrinfo_t and its
 	// mem_t entry later on.
@@ -3337,6 +3053,55 @@ int libblis_test_l3_is_disabled( test_op_t* op )
 	else                               return FALSE;
 }
 
+// Estimate the flop count of level-3 operation op as (2 or 1) * m * n * k,
+// where c is m x n and k is the width of a after trans, scaled by 4 if a, b,
+// and c are all complex or by 2 if exactly two of them are complex.
+double libblis_test_l3_flops
+     (
+       opid_t op,
+       obj_t* a,
+       obj_t* b,
+       obj_t* c
+     )
+{
+	bool   a_is_real    = bli_obj_is_real( a );
+	bool   a_is_complex = bli_obj_is_complex( a );
+	bool   b_is_real    = bli_obj_is_real( b );
+	bool   b_is_complex = bli_obj_is_complex( b );
+	bool   c_is_real    = bli_obj_is_real( c );
+	bool   c_is_complex = bli_obj_is_complex( c );
+
+	double m            = ( double )bli_obj_length( c );
+	double n            = ( double )bli_obj_width( c );
+	// Callers may pass a with its trans bit set, so read k after trans.
+	double k            = ( double )bli_obj_width_after_trans( a );
+
+	double flops        = op == BLIS_GEMM ||
+	                      op == BLIS_HEMM ||
+	                      op == BLIS_SYMM ||
+	                      op == BLIS_SYR2K ||
+	                      op == BLIS_HER2K ? 2.0 :
+	                   /* op == BLIS_GEMMT ||
+	                      op == BLIS_TRMM ||
+	                      op == BLIS_TRMM3 ||
+	                      op == BLIS_TRSM ||
+	                      op == BLIS_SYRK ||
+	                      op == BLIS_HERK ? */ 1.0;
+
+	if      ( ( c_is_complex && a_is_complex && b_is_complex ) )
+	{
+		flops *= 4;
+	}
+	else if ( ( c_is_complex && a_is_complex && b_is_real    ) ||
+	          ( c_is_complex && a_is_real    && b_is_complex ) ||
+	          ( c_is_real    && a_is_complex && b_is_complex ) )
+	{
+		flops *= 2;
+	}
+
+	return flops * ( m * n * k );
+}
+
 // ---
 
 int libblis_test_dt_str_has_sp_char( test_params_t* params )
diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h
index 93c892c4f..7c1e52805 100644
--- a/testsuite/src/test_libblis.h
+++ b/testsuite/src/test_libblis.h
@@ -67,7 +67,7 @@
 #define INPUT_BUFFER_SIZE            256
 #define MAX_FILENAME_LENGTH          1000
 #define MAX_BINARY_NAME_LENGTH       256
-#define MAX_FUNC_STRING_LENGTH       26
+#define MAX_FUNC_STRING_LENGTH       32
 #define FLOPS_PER_UNIT_PERF          1e9
 
 #define MAX_NUM_MSTORAGE             4
@@ -343,6 +343,7 @@ void libblis_test_read_op_info( test_ops_t*  ops,
 void libblis_test_output_section_overrides( FILE* os, test_ops_t* ops );
 void libblis_test_output_params_struct( FILE* os, test_params_t* params );
 void libblis_test_output_op_struct( FILE* os, test_op_t* op, char* op_str );
+void libblis_test_output_term_tag( void );
 
 // --- Mapping ---
 
@@ -471,6 +472,7 @@ int  libblis_test_l1f_is_disabled( test_op_t* op );
 int  libblis_test_l2_is_disabled( test_op_t* op );
 int  libblis_test_l3ukr_is_disabled( test_op_t* op );
 int  libblis_test_l3_is_disabled( test_op_t* op );
+double libblis_test_l3_flops( opid_t op, obj_t* a, obj_t* b, obj_t* c );
 int  libblis_test_dt_str_has_sp_char( test_params_t* params );
 int  libblis_test_dt_str_has_sp_char_str( int n, char* str );
 int  libblis_test_dt_str_has_dp_char( test_params_t* params );
diff --git a/testsuite/src/test_randm.c b/testsuite/src/test_randm.c
index 9b6268ca3..8742695c6 100644
--- a/testsuite/src/test_randm.c
+++ b/testsuite/src/test_randm.c
@@ -333,12 +333,12 @@ void PASTEMAC(ch,varname)( \
 		{ \
 			ctype* chi1 = x_cast + (i  )*rs_x + (j  )*cs_x; \
 \
-			PASTEMAC2(ch,chr,abval2s)( *chi1, abs_chi1 ); \
-			PASTEMAC2(chr,chr,adds)( abs_chi1, sum ); \
+			PASTEMAC(ch,chr,abval2s)( *chi1, abs_chi1 ); \
+			PASTEMAC(chr,chr,adds)( abs_chi1, sum ); \
 		} \
 	} \
 \
-	PASTEMAC2(chr,chr,copys)( sum, *sum_x_cast ); \
+	PASTEMAC(chr,chr,copys)( sum, *sum_x_cast ); \
 }
 
 INSERT_GENTFUNCR_BASIC( absumm )
diff --git a/testsuite/src/test_symm.c b/testsuite/src/test_symm.c
index 03d74e869..0fdf281e5 100644
--- a/testsuite/src/test_symm.c
+++ b/testsuite/src/test_symm.c
@@ -168,7 +168,7 @@ void libblis_test_symm_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
-	num_t        datatype;
+	num_t        dt_a, dt_b, dt_c, dt_alpha, dt_beta, dt_comp;
 
 	dim_t        m, n;
 	dim_t        mn_side;
@@ -183,7 +183,12 @@ void libblis_test_symm_experiment
 
 
 	// Use the datatype of the first char in the datatype combination string.
-	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+	bli_param_map_char_to_blis_dt( dc_str[0], &dt_c );
+	bli_param_map_char_to_blis_dt( dc_str[1], &dt_a );
+	bli_param_map_char_to_blis_dt( dc_str[2], &dt_b );
+	bli_param_map_char_to_blis_dt( dc_str[3], &dt_alpha );
+	bli_param_map_char_to_blis_dt( dc_str[4], &dt_beta );
+	bli_param_map_char_to_blis_dt( dc_str[5], &dt_comp );
 
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
@@ -196,30 +201,27 @@ void libblis_test_symm_experiment
 	bli_param_map_char_to_blis_trans( pc_str[3], &transb );
 
 	// Create test scalars.
-	bli_obj_scalar_init_detached( datatype, &alpha );
-	bli_obj_scalar_init_detached( datatype, &beta );
+	bli_obj_scalar_init_detached( dt_alpha, &alpha );
+	bli_obj_scalar_init_detached( dt_beta, &beta );
 
 	// Create test operands (vectors and/or matrices).
 	bli_set_dim_with_side( side, m, n, &mn_side );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_a, BLIS_NO_TRANSPOSE,
 	                          sc_str[1], mn_side, mn_side, &a );
-	libblis_test_mobj_create( params, datatype, transb,
+	libblis_test_mobj_create( params, dt_b, transb,
 	                          sc_str[2], m,       n,       &b );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m,       n,       &c );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m,       n,       &c_save );
 
+	// Set the computation precision of C.
+	bli_obj_set_comp_prec( bli_dt_prec( dt_comp ), &c );
+
 	// Set alpha and beta.
-	if ( bli_obj_is_real( &c ) )
 	{
-		bli_setsc(  0.8,  0.0, &alpha );
-		bli_setsc( -1.0,  0.0, &beta );
-	}
-	else
-	{
-		bli_setsc(  0.8,  0.6, &alpha );
-		bli_setsc( -1.0,  1.0, &beta );
+		bli_setsc(  2.0,  0.2, &alpha );
+		bli_setsc(  1.2,  0.5, &beta );
 	}
 
 	// Set the structure and uplo properties of A.
@@ -241,7 +243,7 @@ void libblis_test_symm_experiment
 	bli_obj_set_conj( conja, &a );
 	bli_obj_set_conjtrans( transb, &b );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copym( &c_save, &c );
@@ -254,8 +256,14 @@ void libblis_test_symm_experiment
 	}
 
 	// Estimate the performance of the best experiment repeat.
-	*perf = ( 2.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
-	if ( bli_obj_is_complex( &c ) ) *perf *= 4.0;
+	if ( bli_is_left( side ) )
+	{
+		*perf = libblis_test_l3_flops( BLIS_SYMM, &a, &b, &c ) / time_min / FLOPS_PER_UNIT_PERF;
+	}
+	else
+	{
+		*perf = libblis_test_l3_flops( BLIS_SYMM, &b, &a, &c ) / time_min / FLOPS_PER_UNIT_PERF;
+	}
 
 	// Perform checks.
 	libblis_test_symm_check( params, side, &alpha, &a, &b, &beta, &c, &c_save, resid );
@@ -309,11 +317,17 @@ void libblis_test_symm_check
        double*        resid
      )
 {
-	num_t  dt      = bli_obj_dt( c );
-	num_t  dt_real = bli_obj_dt_proj_to_real( c );
+	uplo_t uploa   = bli_obj_uplo( a );
+	if ( bli_obj_has_trans( a ) )
+		bli_toggle_uplo( &uploa );
+
+	num_t  dt_real = bli_obj_comp_prec( c ) | BLIS_REAL;
+	num_t  dt_comp = bli_obj_comp_prec( c ) | BLIS_COMPLEX;
+	num_t  dt;
 
 	dim_t  m       = bli_obj_length( c );
 	dim_t  n       = bli_obj_width( c );
+	dim_t  mn_side = bli_obj_length( a );
 
 	obj_t  norm;
 	obj_t  t, v, w, z;
@@ -352,6 +366,18 @@ void libblis_test_symm_check
 	//     = beta * C_orig * t + alpha * transb(B) * w
 	//     = beta * C_orig * t + z
 
+	// Compute our reference checksum in the real domain if all operands
+	// are real, and in the complex domain otherwise.
+	if ( bli_obj_is_real( a ) &&
+	     bli_obj_is_real( b ) &&
+	     bli_obj_is_real( c ) ) dt = dt_real;
+	else                        dt = dt_comp;
+
+	// Project a, b, and c into the appropriate domain and computational
+	// precision, and then proceed with the checking accordingly.
+
+	obj_t a2, b2, c2, c0;
+
 	bli_obj_scalar_init_detached( dt_real, &norm );
 
 	if ( bli_is_left( side ) )
@@ -371,21 +397,42 @@ void libblis_test_symm_check
 
 	libblis_test_vobj_randomize( params, TRUE, &t );
 
-	bli_gemv( &BLIS_ONE, c, &t, &BLIS_ZERO, &v );
+	// We need to zero out the imaginary part of t in order for our
+	// checks to work in all cases. Otherwise, the imaginary parts
+	// could affect intermediate products, depending on the order that
+	// they are executed.
+	bli_setiv( &BLIS_ZERO, &t );
+
+	// Create type-casted equivalents of a, b, c_orig, and c.
+	bli_obj_create( dt, mn_side, mn_side, 0, 0, &a2 );
+	bli_obj_create( dt, m, n, 0, 0, &b2 );
+	bli_obj_create( dt, m, n, 0, 0, &c2 );
+	bli_obj_create( dt, m, n, 0, 0, &c0 );
+	bli_obj_set_struc( BLIS_SYMMETRIC, &a2 );
+	bli_obj_set_uplo( uploa, &a2 );
+
+	// Cast a, b, c_orig, and c into the datatype of our temporary objects.
+	bli_castm( a,      &a2 );
+	bli_castm( b,      &b2 );
+	bli_castm( c_orig, &c2 );
+	bli_castm( c,      &c0 );
+
+	bli_gemv( &BLIS_ONE, &c0, &t, &BLIS_ZERO, &v );
 
 	if ( bli_is_left( side ) )
 	{
-		bli_gemv( &BLIS_ONE, b, &t, &BLIS_ZERO, &w );
-		bli_symv( alpha, a, &w, &BLIS_ZERO, &z );
+		bli_gemv( &BLIS_ONE, &b2, &t, &BLIS_ZERO, &w );
+		bli_symv( alpha, &a2, &w, &BLIS_ZERO, &z );
 	}
 	else
 	{
-		bli_symv( &BLIS_ONE, a, &t, &BLIS_ZERO, &w );
-		bli_gemv( alpha, b, &w, &BLIS_ZERO, &z );
+		bli_symv( &BLIS_ONE, &a2, &t, &BLIS_ZERO, &w );
+		bli_gemv( alpha, &b2, &w, &BLIS_ZERO, &z );
 	}
 
-	bli_gemv( beta, c_orig, &t, &BLIS_ONE, &z );
-	
+	bli_gemv( beta, &c2, &t, &BLIS_ONE, &z );
+	if ( bli_obj_is_real( c ) ) bli_setiv( &BLIS_ZERO, &z );
+
 	bli_subv( &z, &v );
 	bli_normfv( &v, &norm );
 	bli_getsc( &norm, resid, &junk );
@@ -394,5 +441,10 @@ void libblis_test_symm_check
 	bli_obj_free( &v );
 	bli_obj_free( &w );
 	bli_obj_free( &z );
+
+	bli_obj_free( &a2 );
+	bli_obj_free( &b2 );
+	bli_obj_free( &c2 );
+	bli_obj_free( &c0 );
 }
 
diff --git a/testsuite/src/test_syr2k.c b/testsuite/src/test_syr2k.c
index 2e1fcf237..38785a90e 100644
--- a/testsuite/src/test_syr2k.c
+++ b/testsuite/src/test_syr2k.c
@@ -166,7 +166,7 @@ void libblis_test_syr2k_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
-	num_t        datatype;
+	num_t        dt_a, dt_b, dt_c, dt_alpha, dt_beta, dt_comp;
 
 	dim_t        m, k;
 
@@ -178,7 +178,12 @@ void libblis_test_syr2k_experiment
 
 
 	// Use the datatype of the first char in the datatype combination string.
-	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+	bli_param_map_char_to_blis_dt( dc_str[0], &dt_c );
+	bli_param_map_char_to_blis_dt( dc_str[1], &dt_a );
+	bli_param_map_char_to_blis_dt( dc_str[2], &dt_b );
+	bli_param_map_char_to_blis_dt( dc_str[3], &dt_alpha );
+	bli_param_map_char_to_blis_dt( dc_str[4], &dt_beta );
+	bli_param_map_char_to_blis_dt( dc_str[5], &dt_comp );
 
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
@@ -190,31 +195,26 @@ void libblis_test_syr2k_experiment
 	bli_param_map_char_to_blis_trans( pc_str[2], &transb );
 
 	// Create test scalars.
-	bli_obj_scalar_init_detached( datatype, &alpha );
-	bli_obj_scalar_init_detached( datatype, &beta );
+	bli_obj_scalar_init_detached( dt_alpha, &alpha );
+	bli_obj_scalar_init_detached( dt_beta, &beta );
 
 	// Create test operands (vectors and/or matrices).
-	libblis_test_mobj_create( params, datatype, transa,
+	libblis_test_mobj_create( params, dt_a, transa,
 	                          sc_str[1], m, k, &a );
-	libblis_test_mobj_create( params, datatype, transb,
+	libblis_test_mobj_create( params, dt_b, transb,
 	                          sc_str[2], m, k, &b );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m, m, &c );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m, m, &c_save );
 
+	// Set the computation precision of C.
+	bli_obj_set_comp_prec( bli_dt_prec( dt_comp ), &c );
+
 	// Set alpha and beta.
-	if ( bli_obj_is_real( &c ) )
-	{
-		bli_setsc(  0.8, 0.0, &alpha );
-		bli_setsc( -1.0, 0.0, &beta );
-	}
-	else
 	{
-		// For syr2k, both alpha and beta may be complex since, unlike her2k,
-		// C is symmetric in both the real and complex cases.
-		bli_setsc(  0.8, 0.5, &alpha );
-		bli_setsc( -1.0, 0.5, &beta );
+		bli_setsc(  2.0,  0.2, &alpha );
+		bli_setsc(  1.2,  0.5, &beta );
 	}
 
 	// Randomize A and B.
@@ -240,7 +240,7 @@ void libblis_test_syr2k_experiment
 	bli_obj_set_conjtrans( transa, &a );
 	bli_obj_set_conjtrans( transb, &b );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copym( &c_save, &c );
@@ -252,9 +252,11 @@ void libblis_test_syr2k_experiment
 		time_min = bli_clock_min_diff( time_min, time );
 	}
 
+	obj_t bt;
+	bli_obj_alias_with_trans( BLIS_TRANSPOSE, &b, &bt );
+
 	// Estimate the performance of the best experiment repeat.
-	*perf = ( 2.0 * m * m * k ) / time_min / FLOPS_PER_UNIT_PERF;
-	if ( bli_obj_is_complex( &c ) ) *perf *= 4.0;
+	*perf = libblis_test_l3_flops( BLIS_SYR2K, &a, &bt, &c ) / time_min / FLOPS_PER_UNIT_PERF;
 
 	// Perform checks.
 	libblis_test_syr2k_check( params, &alpha, &a, &b, &beta, &c, &c_save, resid );
@@ -306,13 +308,14 @@ void libblis_test_syr2k_check
        double*        resid
      )
 {
-	num_t  dt      = bli_obj_dt( c );
-	num_t  dt_real = bli_obj_dt_proj_to_real( c );
+	uplo_t uploc   = bli_obj_uplo( c );
+	num_t  dt_real = bli_obj_comp_prec( c ) | BLIS_REAL;
+	num_t  dt_comp = bli_obj_comp_prec( c ) | BLIS_COMPLEX;
+	num_t  dt;
 
 	dim_t  m       = bli_obj_length( c );
 	dim_t  k       = bli_obj_width_after_trans( a );
 
-	obj_t  at, bt;
 	obj_t  norm;
 	obj_t  t, v, w1, w2, z;
 
@@ -346,8 +349,17 @@ void libblis_test_syr2k_check
 	//     = beta * C_orig * t + z
 	//
 
-	bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &at );
-	bli_obj_alias_with_trans( BLIS_TRANSPOSE, b, &bt );
+	// Compute our reference checksum in the real domain if all operands
+	// are real, and in the complex domain otherwise.
+	if ( bli_obj_is_real( a ) &&
+	     bli_obj_is_real( b ) &&
+	     bli_obj_is_real( c ) ) dt = dt_real;
+	else                        dt = dt_comp;
+
+	// Project a, b, and c into the appropriate domain and computational
+	// precision, and then proceed with the checking accordingly.
+
+	obj_t a2, b2, at2, bt2, c2, c0;
 
 	bli_obj_scalar_init_detached( dt_real, &norm );
 
@@ -359,13 +371,39 @@ void libblis_test_syr2k_check
 
 	libblis_test_vobj_randomize( params, TRUE, &t );
 
-	bli_symv( &BLIS_ONE, c, &t, &BLIS_ZERO, &v );
-
-	bli_gemv( &BLIS_ONE, &at, &t, &BLIS_ZERO, &w2 );
-	bli_gemv( &BLIS_ONE, &bt, &t, &BLIS_ZERO, &w1 );
-	bli_gemv( alpha, a, &w1, &BLIS_ZERO, &z );
-	bli_gemv( alpha, b, &w2, &BLIS_ONE, &z );
-	bli_symv( beta, c_orig, &t, &BLIS_ONE, &z );
+	// We need to zero out the imaginary part of t in order for our
+	// checks to work in all cases. Otherwise, the imaginary parts
+	// could affect intermediate products, depending on the order that
+	// they are executed.
+	bli_setiv( &BLIS_ZERO, &t );
+
+	// Create type-casted equivalents of a, b, c_orig, and c.
+	bli_obj_create( dt, m, k, 0, 0, &a2 );
+	bli_obj_create( dt, m, k, 0, 0, &b2 ); // m x k, same as a2.
+	bli_obj_create( dt, m, m, 0, 0, &c2 );
+	bli_obj_create( dt, m, m, 0, 0, &c0 );
+	bli_obj_set_struc( BLIS_SYMMETRIC, &c2 );
+	bli_obj_set_struc( BLIS_SYMMETRIC, &c0 );
+	bli_obj_set_uplo( uploc, &c2 );
+	bli_obj_set_uplo( uploc, &c0 );
+
+	// Cast a, b, c_orig, and c into the datatype of our temporary objects.
+	bli_castm( a,      &a2 );
+	bli_castm( b,      &b2 );
+	bli_castm( c_orig, &c2 );
+	bli_castm( c,      &c0 );
+
+	bli_obj_alias_with_trans( BLIS_TRANSPOSE, &a2, &at2 );
+	bli_obj_alias_with_trans( BLIS_TRANSPOSE, &b2, &bt2 );
+
+	bli_symv( &BLIS_ONE, &c0, &t, &BLIS_ZERO, &v );
+
+	bli_gemv( &BLIS_ONE, &at2, &t, &BLIS_ZERO, &w2 );
+	bli_gemv( &BLIS_ONE, &bt2, &t, &BLIS_ZERO, &w1 );
+	bli_gemv( alpha, &a2, &w1, &BLIS_ZERO, &z );
+	bli_gemv( alpha, &b2, &w2, &BLIS_ONE, &z );
+	bli_symv( beta, &c2, &t, &BLIS_ONE, &z );
+	if ( bli_obj_is_real( c ) ) bli_setiv( &BLIS_ZERO, &z );
 
 	bli_subv( &z, &v );
 	bli_normfv( &v, &norm );
@@ -376,5 +414,10 @@ void libblis_test_syr2k_check
 	bli_obj_free( &w1 );
 	bli_obj_free( &w2 );
 	bli_obj_free( &z );
+
+	bli_obj_free( &a2 );
+	bli_obj_free( &b2 );
+	bli_obj_free( &c2 );
+	bli_obj_free( &c0 );
 }
 
diff --git a/testsuite/src/test_syrk.c b/testsuite/src/test_syrk.c
index be3e33fe3..119a9a955 100644
--- a/testsuite/src/test_syrk.c
+++ b/testsuite/src/test_syrk.c
@@ -164,7 +164,7 @@ void libblis_test_syrk_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
-	num_t        datatype;
+	num_t        dt_a, dt_c, dt_alpha, dt_beta, dt_comp;
 
 	dim_t        m, k;
 
@@ -176,7 +176,11 @@ void libblis_test_syrk_experiment
 
 
 	// Use the datatype of the first char in the datatype combination string.
-	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+	bli_param_map_char_to_blis_dt( dc_str[0], &dt_c );
+	bli_param_map_char_to_blis_dt( dc_str[1], &dt_a );
+	bli_param_map_char_to_blis_dt( dc_str[2], &dt_alpha );
+	bli_param_map_char_to_blis_dt( dc_str[3], &dt_beta );
+	bli_param_map_char_to_blis_dt( dc_str[4], &dt_comp );
 
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
@@ -187,29 +191,24 @@ void libblis_test_syrk_experiment
 	bli_param_map_char_to_blis_trans( pc_str[1], &transa );
 
 	// Create test scalars.
-	bli_obj_scalar_init_detached( datatype, &alpha );
-	bli_obj_scalar_init_detached( datatype, &beta );
+	bli_obj_scalar_init_detached( dt_alpha, &alpha );
+	bli_obj_scalar_init_detached( dt_beta, &beta );
 
 	// Create test operands (vectors and/or matrices).
-	libblis_test_mobj_create( params, datatype, transa,
+	libblis_test_mobj_create( params, dt_a, transa,
 	                          sc_str[1], m, k, &a );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m, m, &c );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m, m, &c_save );
 
+	// Set the computation precision of C.
+	bli_obj_set_comp_prec( bli_dt_prec( dt_comp ), &c );
+
 	// Set alpha and beta.
-	if ( bli_obj_is_real( &c ) )
-	{
-		bli_setsc(  1.2,  0.0, &alpha );
-		bli_setsc( -1.0,  0.0, &beta );
-	}
-	else
 	{
-		// For syrk, both alpha and beta may be complex since, unlike herk,
-		// C is symmetric in both the real and complex cases.
-		bli_setsc(  1.2,  0.5, &alpha );
-		bli_setsc( -1.0,  0.5, &beta );
+		bli_setsc(  2.0,  0.2, &alpha );
+		bli_setsc(  1.2,  0.5, &beta );
 	}
 
 	// Randomize A.
@@ -233,7 +232,7 @@ void libblis_test_syrk_experiment
 	// Apply the remaining parameters.
 	bli_obj_set_conjtrans( transa, &a );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copym( &c_save, &c );
@@ -245,9 +244,11 @@ void libblis_test_syrk_experiment
 		time_min = bli_clock_min_diff( time_min, time );
 	}
 
+	obj_t at;
+	bli_obj_alias_with_trans( BLIS_TRANSPOSE, &a, &at );
+
 	// Estimate the performance of the best experiment repeat.
-	*perf = ( 1.0 * m * m * k ) / time_min / FLOPS_PER_UNIT_PERF;
-	if ( bli_obj_is_complex( &c ) ) *perf *= 4.0;
+	*perf = libblis_test_l3_flops( BLIS_SYRK, &a, &at, &c ) / time_min / FLOPS_PER_UNIT_PERF;
 
 	// Perform checks.
 	libblis_test_syrk_check( params, &alpha, &a, &beta, &c, &c_save, resid );
@@ -296,13 +297,14 @@ void libblis_test_syrk_check
        double*        resid
      )
 {
-	num_t  dt      = bli_obj_dt( c );
-	num_t  dt_real = bli_obj_dt_proj_to_real( c );
+	uplo_t uploc   = bli_obj_uplo( c );
+	num_t  dt_real = bli_obj_comp_prec( c ) | BLIS_REAL;
+	num_t  dt_comp = bli_obj_comp_prec( c ) | BLIS_COMPLEX;
+	num_t  dt;
 
 	dim_t  m       = bli_obj_length( c );
 	dim_t  k       = bli_obj_width_after_trans( a );
 
-	obj_t  at;
 	obj_t  norm;
 	obj_t  t, v, w, z;
 
@@ -333,7 +335,16 @@ void libblis_test_syrk_check
 	//     = beta * C_orig * t + z
 	//
 
-	bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &at );
+	// Compute our reference checksum in the real domain if all operands
+	// are real, and in the complex domain otherwise.
+	if ( bli_obj_is_real( a ) &&
+	     bli_obj_is_real( c ) ) dt = dt_real;
+	else                        dt = dt_comp;
+
+	// Project a and c into the appropriate domain and computational
+	// precision, and then proceed with the checking accordingly.
+
+	obj_t a2, at2, c2, c0;
 
 	bli_obj_scalar_init_detached( dt_real, &norm );
 
@@ -344,11 +355,34 @@ void libblis_test_syrk_check
 
 	libblis_test_vobj_randomize( params, TRUE, &t );
 
-	bli_symv( &BLIS_ONE, c, &t, &BLIS_ZERO, &v );
+	// We need to zero out the imaginary part of t in order for our
+	// checks to work in all cases. Otherwise, the imaginary parts
+	// could affect intermediate products, depending on the order that
+	// they are executed.
+	bli_setiv( &BLIS_ZERO, &t );
 
-	bli_gemv( &BLIS_ONE, &at, &t, &BLIS_ZERO, &w );
-	bli_gemv( alpha, a, &w, &BLIS_ZERO, &z );
-	bli_symv( beta, c_orig, &t, &BLIS_ONE, &z );
+	// Create type-casted equivalents of a, c_orig, and c.
+	bli_obj_create( dt, m, k, 0, 0, &a2 );
+	bli_obj_create( dt, m, m, 0, 0, &c2 );
+	bli_obj_create( dt, m, m, 0, 0, &c0 );
+	bli_obj_set_struc( BLIS_SYMMETRIC, &c2 );
+	bli_obj_set_struc( BLIS_SYMMETRIC, &c0 );
+	bli_obj_set_uplo( uploc, &c2 );
+	bli_obj_set_uplo( uploc, &c0 );
+
+	// Cast a, c_orig, and c into the datatype of our temporary objects.
+	bli_castm( a,      &a2 );
+	bli_castm( c_orig, &c2 );
+	bli_castm( c,      &c0 );
+
+	bli_obj_alias_with_trans( BLIS_TRANSPOSE, &a2, &at2 );
+
+	bli_symv( &BLIS_ONE, &c0, &t, &BLIS_ZERO, &v );
+
+	bli_gemv( &BLIS_ONE, &at2, &t, &BLIS_ZERO, &w );
+	bli_gemv( alpha, &a2, &w, &BLIS_ZERO, &z );
+	bli_symv( beta, &c2, &t, &BLIS_ONE, &z );
+	if ( bli_obj_is_real( c ) ) bli_setiv( &BLIS_ZERO, &z );
 
 	bli_subv( &z, &v );
 	bli_normfv( &v, &norm );
@@ -358,5 +392,9 @@ void libblis_test_syrk_check
 	bli_obj_free( &v );
 	bli_obj_free( &w );
 	bli_obj_free( &z );
+
+	bli_obj_free( &a2 );
+	bli_obj_free( &c2 );
+	bli_obj_free( &c0 );
 }
 
diff --git a/testsuite/src/test_trmm.c b/testsuite/src/test_trmm.c
index 497ecf97e..80f174679 100644
--- a/testsuite/src/test_trmm.c
+++ b/testsuite/src/test_trmm.c
@@ -164,7 +164,7 @@ void libblis_test_trmm_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
-	num_t        datatype;
+	num_t        dt_a, dt_b, dt_alpha, dt_comp;
 
 	dim_t        m, n;
 	dim_t        mn_side;
@@ -179,7 +179,10 @@ void libblis_test_trmm_experiment
 
 
 	// Use the datatype of the first char in the datatype combination string.
-	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+	bli_param_map_char_to_blis_dt( dc_str[0], &dt_b );
+	bli_param_map_char_to_blis_dt( dc_str[1], &dt_a );
+	bli_param_map_char_to_blis_dt( dc_str[2], &dt_alpha );
+	bli_param_map_char_to_blis_dt( dc_str[3], &dt_comp );
 
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
@@ -192,25 +195,23 @@ void libblis_test_trmm_experiment
 	bli_param_map_char_to_blis_diag( pc_str[3], &diaga );
 
 	// Create test scalars.
-	bli_obj_scalar_init_detached( datatype, &alpha );
+	bli_obj_scalar_init_detached( dt_alpha, &alpha );
 
 	// Create test operands (vectors and/or matrices).
 	bli_set_dim_with_side( side, m, n, &mn_side );
-	libblis_test_mobj_create( params, datatype, transa,
+	libblis_test_mobj_create( params, dt_a, transa,
 	                          sc_str[1], mn_side, mn_side, &a );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_b, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m,       n,       &b );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_b, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m,       n,       &b_save );
 
+	// Set the computation precision of B.
+	bli_obj_set_comp_prec( bli_dt_prec( dt_comp ), &b );
+
 	// Set alpha and beta.
-	if ( bli_obj_is_real( &b ) )
 	{
-		bli_setsc(  0.8,  0.0, &alpha );
-	}
-	else
-	{
-		bli_setsc(  0.8,  0.5, &alpha );
+		bli_setsc(  2.0,  0.2, &alpha );
 	}
 
 	// Set the structure and uplo properties of A.
@@ -229,7 +230,7 @@ void libblis_test_trmm_experiment
 	bli_obj_set_conjtrans( transa, &a );
 	bli_obj_set_diag( diaga, &a );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copym( &b_save, &b );
@@ -242,8 +243,14 @@ void libblis_test_trmm_experiment
 	}
 
 	// Estimate the performance of the best experiment repeat.
-	*perf = ( 1.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
-	if ( bli_obj_is_complex( &b ) ) *perf *= 4.0;
+	if ( bli_is_left( side ) )
+	{
+		*perf = libblis_test_l3_flops( BLIS_TRMM, &a, &b, &b ) / time_min / FLOPS_PER_UNIT_PERF;
+	}
+	else
+	{
+		*perf = libblis_test_l3_flops( BLIS_TRMM, &b, &a, &b ) / time_min / FLOPS_PER_UNIT_PERF;
+	}
 
 	// Perform checks.
 	libblis_test_trmm_check( params, side, &alpha, &a, &b, &b_save, resid );
@@ -295,11 +302,18 @@ void libblis_test_trmm_check
        double*        resid
      )
 {
-	num_t  dt      = bli_obj_dt( b );
-	num_t  dt_real = bli_obj_dt_proj_to_real( b );
+	uplo_t uploa   = bli_obj_uplo( a );
+	if ( bli_obj_has_trans( a ) )
+		bli_toggle_uplo( &uploa );
+
+	diag_t diaga   = bli_obj_diag( a );
+	num_t  dt_real = bli_obj_comp_prec( b ) | BLIS_REAL;
+	num_t  dt_comp = bli_obj_comp_prec( b ) | BLIS_COMPLEX;
+	num_t  dt;
 
 	dim_t  m       = bli_obj_length( b );
 	dim_t  n       = bli_obj_width( b );
+	dim_t  mn_side = bli_obj_length( a );
 
 	obj_t  norm;
 	obj_t  t, v, w, z;
@@ -335,6 +349,17 @@ void libblis_test_trmm_check
 	//     = alpha * B * transa(A) * t
 	//     = alpha * B * w
 
+	// Compute our reference checksum in the real domain if all operands
+	// are real, and in the complex domain otherwise.
+	if ( bli_obj_is_real( a ) &&
+	     bli_obj_is_real( b ) ) dt = dt_real;
+	else                        dt = dt_comp;
+
+	// Project a, b_orig, and b into the appropriate domain and computational
+	// precision, and then proceed with the checking accordingly.
+
+	obj_t a2, b2, b0;
+
 	bli_obj_scalar_init_detached( dt_real, &norm );
 
 	if ( bli_is_left( side ) )
@@ -354,21 +379,42 @@ void libblis_test_trmm_check
 
 	libblis_test_vobj_randomize( params, TRUE, &t );
 
-	bli_gemv( &BLIS_ONE, b, &t, &BLIS_ZERO, &v );
+	// We need to zero out the imaginary part of t in order for our
+	// checks to work in all cases. Otherwise, the imaginary parts
+	// could affect intermediate products, depending on the order that
+	// they are executed.
+	bli_setiv( &BLIS_ZERO, &t );
+
+	// Create type-casted equivalents of a, b_orig, and b.
+	bli_obj_create( dt, mn_side, mn_side, 0, 0, &a2 );
+	bli_obj_create( dt, m, n, 0, 0, &b2 );
+	bli_obj_create( dt, m, n, 0, 0, &b0 );
+
+	// Cast a, b_orig, and b into the datatype of our temporary objects.
+	bli_castm( a,      &a2 );
+	bli_castm( b_orig, &b2 );
+	bli_castm( b,      &b0 );
+	bli_obj_set_struc( BLIS_TRIANGULAR, &a2 );
+	bli_obj_set_uplo( uploa, &a2 );
+	bli_obj_set_diag( diaga, &a2 );
+
+	bli_gemv( &BLIS_ONE, &b0, &t, &BLIS_ZERO, &v );
 
 	if ( bli_is_left( side ) )
 	{
-		bli_gemv( &BLIS_ONE, b_orig, &t, &BLIS_ZERO, &w );
-		bli_trmv( alpha, a, &w );
+		bli_gemv( &BLIS_ONE, &b2, &t, &BLIS_ZERO, &w );
+		bli_trmv( alpha, &a2, &w );
 		bli_copyv( &w, &z );
 	}
 	else
 	{
 		bli_copyv( &t, &w );
-		bli_trmv( &BLIS_ONE, a, &w );
-		bli_gemv( alpha, b_orig, &w, &BLIS_ZERO, &z );
+		bli_trmv( &BLIS_ONE, &a2, &w );
+		bli_gemv( alpha, &b2, &w, &BLIS_ZERO, &z );
 	}
 
+	if ( bli_obj_is_real( b ) ) bli_setiv( &BLIS_ZERO, &z );
+
 	bli_subv( &z, &v );
 	bli_normfv( &v, &norm );
 	bli_getsc( &norm, resid, &junk );
@@ -377,5 +423,9 @@ void libblis_test_trmm_check
 	bli_obj_free( &v );
 	bli_obj_free( &w );
 	bli_obj_free( &z );
+
+	bli_obj_free( &a2 );
+	bli_obj_free( &b2 );
+	bli_obj_free( &b0 );
 }
 
diff --git a/testsuite/src/test_trmm3.c b/testsuite/src/test_trmm3.c
index d0644252f..a0c1ca2c4 100644
--- a/testsuite/src/test_trmm3.c
+++ b/testsuite/src/test_trmm3.c
@@ -168,7 +168,7 @@ void libblis_test_trmm3_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
-	num_t        datatype;
+	num_t        dt_a, dt_b, dt_c, dt_alpha, dt_beta, dt_comp;
 
 	dim_t        m, n;
 	dim_t        mn_side;
@@ -184,7 +184,12 @@ void libblis_test_trmm3_experiment
 
 
 	// Use the datatype of the first char in the datatype combination string.
-	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+	bli_param_map_char_to_blis_dt( dc_str[0], &dt_c );
+	bli_param_map_char_to_blis_dt( dc_str[1], &dt_a );
+	bli_param_map_char_to_blis_dt( dc_str[2], &dt_b );
+	bli_param_map_char_to_blis_dt( dc_str[3], &dt_alpha );
+	bli_param_map_char_to_blis_dt( dc_str[4], &dt_beta );
+	bli_param_map_char_to_blis_dt( dc_str[5], &dt_comp );
 
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
@@ -198,30 +203,27 @@ void libblis_test_trmm3_experiment
 	bli_param_map_char_to_blis_trans( pc_str[4], &transb );
 
 	// Create test scalars.
-	bli_obj_scalar_init_detached( datatype, &alpha );
-	bli_obj_scalar_init_detached( datatype, &beta );
+	bli_obj_scalar_init_detached( dt_alpha, &alpha );
+	bli_obj_scalar_init_detached( dt_beta, &beta );
 
 	// Create test operands (vectors and/or matrices).
 	bli_set_dim_with_side( side, m, n, &mn_side );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_a, BLIS_NO_TRANSPOSE,
 	                          sc_str[1], mn_side, mn_side, &a );
-	libblis_test_mobj_create( params, datatype, transb,
+	libblis_test_mobj_create( params, dt_b, transb,
 	                          sc_str[2], m,       n,       &b );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m,       n,       &c );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m,       n,       &c_save );
 
+	// Set the computation precision of C.
+	bli_obj_set_comp_prec( bli_dt_prec( dt_comp ), &c );
+
 	// Set alpha and beta.
-	if ( bli_obj_is_real( &c ) )
-	{
-		bli_setsc(  0.8,  0.0, &alpha );
-		bli_setsc( -1.0,  0.0, &beta );
-	}
-	else
 	{
-		bli_setsc(  0.8,  0.6, &alpha );
-		bli_setsc( -1.0,  0.5, &beta );
+		bli_setsc(  2.0,  0.2, &alpha );
+		bli_setsc(  1.2,  0.5, &beta );
 	}
 
 	// Set the structure and uplo properties of A.
@@ -242,7 +244,7 @@ void libblis_test_trmm3_experiment
 	bli_obj_set_diag( diaga, &a );
 	bli_obj_set_conjtrans( transb, &b );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		bli_copym( &c_save, &c );
@@ -255,8 +257,14 @@ void libblis_test_trmm3_experiment
 	}
 
 	// Estimate the performance of the best experiment repeat.
-	*perf = ( 1.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
-	if ( bli_obj_is_complex( &c ) ) *perf *= 4.0;
+	if ( bli_is_left( side ) )
+	{
+		*perf = libblis_test_l3_flops( BLIS_TRMM3, &a, &b, &c ) / time_min / FLOPS_PER_UNIT_PERF;
+	}
+	else
+	{
+		*perf = libblis_test_l3_flops( BLIS_TRMM3, &b, &a, &c ) / time_min / FLOPS_PER_UNIT_PERF;
+	}
 
 	// Perform checks.
 	libblis_test_trmm3_check( params, side, &alpha, &a, &b, &beta, &c, &c_save, resid );
@@ -310,11 +318,18 @@ void libblis_test_trmm3_check
        double*        resid
      )
 {
-	num_t  dt      = bli_obj_dt( c );
-	num_t  dt_real = bli_obj_dt_proj_to_real( c );
+	uplo_t uploa   = bli_obj_uplo( a );
+	if ( bli_obj_has_trans( a ) )
+		bli_toggle_uplo( &uploa );
+
+	diag_t diaga   = bli_obj_diag( a );
+	num_t  dt_real = bli_obj_comp_prec( c ) | BLIS_REAL;
+	num_t  dt_comp = bli_obj_comp_prec( c ) | BLIS_COMPLEX;
+	num_t  dt;
 
 	dim_t  m       = bli_obj_length( c );
 	dim_t  n       = bli_obj_width( c );
+	dim_t  mn_side = bli_obj_length( a );
 
 	obj_t  norm;
 	obj_t  t, v, w, z;
@@ -353,6 +368,18 @@ void libblis_test_trmm3_check
 	//     = beta * C_orig * t + alpha * transb(B) * w
 	//     = beta * C_orig * t + z
 
+	// Compute our reference checksum in the real domain if all operands
+	// are real, and in the complex domain otherwise.
+	if ( bli_obj_is_real( a ) &&
+	     bli_obj_is_real( b ) &&
+	     bli_obj_is_real( c ) ) dt = dt_real;
+	else                        dt = dt_comp;
+
+	// Project a, b, and c into the appropriate domain and computational
+	// precision, and then proceed with the checking accordingly.
+
+	obj_t a2, b2, c2, c0;
+
 	bli_obj_scalar_init_detached( dt_real, &norm );
 
 	if ( bli_is_left( side ) )
@@ -372,23 +399,45 @@ void libblis_test_trmm3_check
 
 	libblis_test_vobj_randomize( params, TRUE, &t );
 
-	bli_gemv( &BLIS_ONE, c, &t, &BLIS_ZERO, &v );
+	// We need to zero out the imaginary part of t in order for our
+	// checks to work in all cases. Otherwise, the imaginary parts
+	// could affect intermediate products, depending on the order that
+	// they are executed.
+	bli_setiv( &BLIS_ZERO, &t );
+
+	// Create type-casted equivalents of a, b, c_orig, and c.
+	bli_obj_create( dt, mn_side, mn_side, 0, 0, &a2 );
+	bli_obj_create( dt, m, n, 0, 0, &b2 );
+	bli_obj_create( dt, m, n, 0, 0, &c2 );
+	bli_obj_create( dt, m, n, 0, 0, &c0 );
+	bli_obj_set_struc( BLIS_TRIANGULAR, &a2 );
+	bli_obj_set_uplo( uploa, &a2 );
+	bli_obj_set_diag( diaga, &a2 );
+
+	// Cast a, b, c_orig, and c into the datatype of our temporary objects.
+	bli_castm( a,      &a2 );
+	bli_castm( b,      &b2 );
+	bli_castm( c_orig, &c2 );
+	bli_castm( c,      &c0 );
+
+	bli_gemv( &BLIS_ONE, &c0, &t, &BLIS_ZERO, &v );
 
 	if ( bli_is_left( side ) )
 	{
-		bli_gemv( &BLIS_ONE, b, &t, &BLIS_ZERO, &w );
-		bli_trmv( alpha, a, &w );
+		bli_gemv( &BLIS_ONE, &b2, &t, &BLIS_ZERO, &w );
+		bli_trmv( alpha, &a2, &w );
 		bli_copyv( &w, &z );
 	}
 	else
 	{
 		bli_copyv( &t, &w );
-		bli_trmv( &BLIS_ONE, a, &w );
-		bli_gemv( alpha, b, &w, &BLIS_ZERO, &z );
+		bli_trmv( &BLIS_ONE, &a2, &w );
+		bli_gemv( alpha, &b2, &w, &BLIS_ZERO, &z );
 	}
 
-	bli_gemv( beta, c_orig, &t, &BLIS_ONE, &z );
-	
+	bli_gemv( beta, &c2, &t, &BLIS_ONE, &z );
+	if ( bli_obj_is_real( c ) ) bli_setiv( &BLIS_ZERO, &z );
+
 	bli_subv( &z, &v );
 	bli_normfv( &v, &norm );
 	bli_getsc( &norm, resid, &junk );
@@ -397,5 +446,10 @@ void libblis_test_trmm3_check
 	bli_obj_free( &v );
 	bli_obj_free( &w );
 	bli_obj_free( &z );
+
+	bli_obj_free( &a2 );
+	bli_obj_free( &b2 );
+	bli_obj_free( &c2 );
+	bli_obj_free( &c0 );
 }
 
diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c
index ae5c9a814..27d488810 100644
--- a/testsuite/src/test_trsm_ukr.c
+++ b/testsuite/src/test_trsm_ukr.c
@@ -239,7 +239,7 @@ void libblis_test_trsm_ukr_experiment
 	  BLIS_MR,
 	  BLIS_MR,
 	  BLIS_INVERT_DIAG,
-	  BLIS_PACKED_ROW_PANELS,
+	  BLIS_PACKED_PANELS,
 	  BLIS_BUFFER_FOR_A_BLOCK,
 	  &a, &ap,
 	  cntx
@@ -271,7 +271,7 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 		  BLIS_NR,
 		  BLIS_MR,
 		  BLIS_NO_INVERT_DIAG,
-		  BLIS_PACKED_COL_PANELS,
+		  BLIS_PACKED_PANELS,
 		  BLIS_BUFFER_FOR_B_PANEL,
 		  &b, &bp,
 		  cntx

From fd1a7e3ca9547718aa61c806848099705216182b Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Thu, 25 Apr 2024 15:00:59 -0500
Subject: [PATCH 186/230] Allow test/3 drivers to use default ind_t method.
 (#804)

Details:
- Previously, the standalone performance drivers in test/3 were written
  under the assumption that the user would want to explicitly test
  either native execution *or* 1m. But because the accompanying runme.sh
  script defaults to passing "native" in for the -i command line option
  (which explicitly sets the induced method type), running the script
  without modification causes the test drivers to use slow reference
  microkernels on systems where native complex-domain microkernels are
  not registered -- which will yield poor performance for complex-domain
  level-3 operations. Furthermore, even if a user was aware of this, the
  test drivers did not support any single value for the -i option that
  would test BLIS using the library's default behavior -- that is, using
  1m on systems where it is needed and native execution on systems that
  have native microkernels implemented and registered.
- This commit addresses the aforementioned issue by supporting a new
  value for the -i option: "auto". The "auto" value causes the driver
  to avoid explicitly setting the induced method altogether, leaving
  BLIS's default behavior in place. This "auto" option is also now the
  default setting within the runme.sh script. Thanks to Leick Robinson
  for finding and reporting this issue.
- Also added support for "nat" as a shorthand for "native", which
  the help text already (erroneously) claimed was supported.
---
 test/3/test_gemm.c  | 11 ++++++++---
 test/3/test_hemm.c  | 11 ++++++++---
 test/3/test_herk.c  | 11 ++++++++---
 test/3/test_trmm.c  | 11 ++++++++---
 test/3/test_trsm.c  | 11 ++++++++---
 test/3/test_utils.c | 23 ++++++++++++++++-------
 test/3/test_utils.h |  1 +
 7 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/test/3/test_gemm.c b/test/3/test_gemm.c
index 20bcca46c..287069b2b 100644
--- a/test/3/test_gemm.c
+++ b/test/3/test_gemm.c
@@ -171,9 +171,14 @@ int main( int argc, char** argv )
 		bli_copym( &c, &c_save );
 
 #ifdef BLIS
-		// Switch to the induced method specified by ind.
-		bli_ind_disable_all_dt( dt );
-		bli_ind_enable_dt( ind, dt );
+		// Switch to the induced method specified by ind, unless the 'auto'
+		// option was given, in which case we leave the induced method
+		// unchanged.
+		if ( params.im_is_auto == FALSE )
+		{
+			bli_ind_disable_all_dt( dt );
+			bli_ind_enable_dt( ind, dt );
+		}
 #endif
 
 #ifdef EIGEN
diff --git a/test/3/test_hemm.c b/test/3/test_hemm.c
index 9f40fbc00..bf60d7a3e 100644
--- a/test/3/test_hemm.c
+++ b/test/3/test_hemm.c
@@ -168,9 +168,14 @@ int main( int argc, char** argv )
 		bli_copym( &c, &c_save );
 
 #ifdef BLIS
-		// Switch to the induced method specified by ind.
-		bli_ind_disable_all_dt( dt );
-		bli_ind_enable_dt( ind, dt );
+		// Switch to the induced method specified by ind, unless the 'auto'
+		// option was given, in which case we leave the induced method
+		// unchanged.
+		if ( params.im_is_auto == FALSE )
+		{
+			bli_ind_disable_all_dt( dt );
+			bli_ind_enable_dt( ind, dt );
+		}
 #endif
 
 		dtime_save = DBL_MAX;
diff --git a/test/3/test_herk.c b/test/3/test_herk.c
index 9e94bef52..9eda317c7 100644
--- a/test/3/test_herk.c
+++ b/test/3/test_herk.c
@@ -168,9 +168,14 @@ int main( int argc, char** argv )
 		bli_copym( &c, &c_save );
 
 #ifdef BLIS
-		// Switch to the induced method specified by ind.
-		bli_ind_disable_all_dt( dt );
-		bli_ind_enable_dt( ind, dt );
+		// Switch to the induced method specified by ind, unless the 'auto'
+		// option was given, in which case we leave the induced method
+		// unchanged.
+		if ( params.im_is_auto == FALSE )
+		{
+			bli_ind_disable_all_dt( dt );
+			bli_ind_enable_dt( ind, dt );
+		}
 #endif
 
 		dtime_save = DBL_MAX;
diff --git a/test/3/test_trmm.c b/test/3/test_trmm.c
index 955a7b878..2aa17c29a 100644
--- a/test/3/test_trmm.c
+++ b/test/3/test_trmm.c
@@ -171,9 +171,14 @@ int main( int argc, char** argv )
 		bli_copym( &c, &c_save );
 
 #ifdef BLIS
-		// Switch to the induced method specified by ind.
-		bli_ind_disable_all_dt( dt );
-		bli_ind_enable_dt( ind, dt );
+		// Switch to the induced method specified by ind, unless the 'auto'
+		// option was given, in which case we leave the induced method
+		// unchanged.
+		if ( params.im_is_auto == FALSE )
+		{
+			bli_ind_disable_all_dt( dt );
+			bli_ind_enable_dt( ind, dt );
+		}
 #endif
 
 		dtime_save = DBL_MAX;
diff --git a/test/3/test_trsm.c b/test/3/test_trsm.c
index 5fdfa8580..33a45f5b7 100644
--- a/test/3/test_trsm.c
+++ b/test/3/test_trsm.c
@@ -174,9 +174,14 @@ int main( int argc, char** argv )
 		bli_copym( &c, &c_save );
 
 #ifdef BLIS
-		// Switch to the induced method specified by ind.
-		bli_ind_disable_all_dt( dt );
-		bli_ind_enable_dt( ind, dt );
+		// Switch to the induced method specified by ind, unless the 'auto'
+		// option was given, in which case we leave the induced method
+		// unchanged.
+		if ( params.im_is_auto == FALSE )
+		{
+			bli_ind_disable_all_dt( dt );
+			bli_ind_enable_dt( ind, dt );
+		}
 #endif
 
 		dtime_save = DBL_MAX;
diff --git a/test/3/test_utils.c b/test/3/test_utils.c
index 8e441d055..18542c33f 100644
--- a/test/3/test_utils.c
+++ b/test/3/test_utils.c
@@ -38,7 +38,7 @@
 // Global string constants.
 const char* GLOB_DEF_DT_STR    = "d";
 const char* GLOB_DEF_SC_STR    = "ccc";
-const char* GLOB_DEF_IM_STR    = "native";
+const char* GLOB_DEF_IM_STR    = "auto";
 
 const char* GLOB_DEF_PS_STR    = "50 1000 50";
 const char* GLOB_DEF_M_STR     = "-1";
@@ -365,9 +365,10 @@ void parse_cl_params( int argc, char** argv, init_fp fp, params_t* params )
 				}
 				printf( "\n" );
 				printf( "    -i im\n" );
-				printf( "            Use native execution if im is 'native' (or 'nat'). Otherwise,\n" );
-				printf( "            if im is '1m', use the 1m method to induce complex computation\n" );
-				printf( "            using the equivalent real-domain microkernels.\n" );
+				printf( "            Use native execution if im is 'native' (or 'nat'). If im is '1m',\n" );
+				printf( "            use the 1m method to induce complex computation using the\n" );
+				printf( "            equivalent real-domain microkernels. If im is 'auto', do not\n" );
+				printf( "            explicitly set the induced method and instead use the default.\n" );
 				printf( "\n" );
 				printf( "    -p 'lo hi in'\n" );
 				printf( "            Perform a sweep of measurements of problem sizes ranging from \n" );
@@ -609,13 +610,21 @@ void proc_params( params_t* params )
 	bli_param_map_char_to_blis_dt( params->dt_str[0], &params->dt );
 
 	// Parse the induced method to the corresponding ind_t.
-	if      ( strncmp( params->im_str, "native", 6 ) == 0 )
+	if    ( strncmp( params->im_str, "native", 6 ) == 0 ||
+	        strncmp( params->im_str, "nat",    3 ) == 0 )
 	{
-		params->im = BLIS_NAT;
+		params->im         = BLIS_NAT;
+		params->im_is_auto = FALSE;
 	}
 	else if ( strncmp( params->im_str, "1m",     2 ) == 0 )
 	{
-		params->im = BLIS_1M;
+		params->im         = BLIS_1M;
+		params->im_is_auto = FALSE;
+	}
+	else if ( strncmp( params->im_str, "auto",   4 ) == 0 )
+	{
+		params->im         = BLIS_1M;
+		params->im_is_auto = TRUE;
 	}
 	else
 	{
diff --git a/test/3/test_utils.h b/test/3/test_utils.h
index 088f9ce97..aa75b44e3 100644
--- a/test/3/test_utils.h
+++ b/test/3/test_utils.h
@@ -90,6 +90,7 @@ typedef struct params_s
 
 	const char* im_str;
 	ind_t im;
+	bool  im_is_auto;
 
 	// Problem size range and dimension specifiers.
 	const char* ps_str;

From cad51491e8a0b306015a5a02881dc2a9b60dd8d9 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Tue, 30 Apr 2024 16:46:54 -0500
Subject: [PATCH 187/230] Use "-i auto" by default in test/3 drivers.

Details:
- Request default induced method behavior of BLIS via "-i auto" when
  running the standalone performance drivers in test/3 via the runme.sh
  script present in that directory. (Previously, the runme.sh script
  would use "-i native" by default.) This change was originally intended
  for fd1a7e3.
---
 test/3/runme.sh | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/test/3/runme.sh b/test/3/runme.sh
index fefcbe5ee..64f0823fc 100755
--- a/test/3/runme.sh
+++ b/test/3/runme.sh
@@ -95,11 +95,11 @@ fi
 
 # Datatypes to test.
 test_dts="s d c z"
-test_dts="d"
+#test_dts="d"
 
 # Operations to test.
 test_ops="gemm_nn hemm_ll herk_ln trmm_llnn trsm_runn"
-#test_ops="herk"
+#test_ops="gemm_nn"
 
 # Implementations to test.
 test_impls="blis"
@@ -115,8 +115,9 @@ fi
 # Number of repeats per problem size.
 nrepeats=3
 
-# The induced method to use ('native' or '1m').
-ind="native"
+# The induced method to use ('auto', 'native', or '1m') for executing
+# complex-domain level-3 operations.
+ind="auto"
 
 # Quiet mode?
 #quiet="yes"

From 5ab286f61525f8ead35ecc258305a5ccd4ee096b Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Mon, 6 May 2024 13:14:52 -0500
Subject: [PATCH 188/230] Added a script to help create new rc branches.

Details:
- Added a new script, build/start-new-rc.sh, which:
  1. Updates the version file with a new version string.
  2. Commits (locally) the version string update.
  3. Updates the CHANGELOG file with the output of 'git log'.
  4. Commits (locally) the CHANGELOG file update.
  5. Creates a new branch whose name is equal to "<vers>-rc0" where
     <vers> is the new version string.
  6. Reminds the user to execute some final steps if everything looks
     good.
  This new script will help in the future when it's time to start a new
  release candidate branch/lineage off of 'master'. Note that this
  script is based on build/bump-version.sh (which itself may change in
  the future due to changes in the way versions/releases will be handled
  going forward).
---
 build/start-new-rc.sh | 235 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 235 insertions(+)
 create mode 100755 build/start-new-rc.sh

diff --git a/build/start-new-rc.sh b/build/start-new-rc.sh
new file mode 100755
index 000000000..786522214
--- /dev/null
+++ b/build/start-new-rc.sh
@@ -0,0 +1,235 @@
+#!/bin/sh
+#
+#  BLIS    
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+#
+# start-new-rc.sh
+#
+# Field G. Van Zee
+#
+
+
+print_usage()
+{
+	#local script_name
+	
+	# Get the script name
+	#script_name=${0##*/}
+	
+	# Echo usage info
+	echo " "
+	echo " "$script_name
+	echo " "
+	echo " Field G. Van Zee"
+	echo " "
+	echo " Performs a series of actions needed when creating a new release"
+	echo " candidate branch for BLIS:"
+	echo "   1. Overwrite the version file with the version string passed"
+	echo "      into this script (<new_vers>)."
+	echo "   2. Commit the updated version file."
+	echo "   3. Update the CHANGELOG file."
+	echo "   4. Commit the updated CHANGELOG file."
+	echo "   5. Create a new branch (named '<new_vers>-rc0') which refers to"
+	echo "      the commit created in (4)."
+	echo " "
+	echo " Usage:"
+	echo "   ${script_name} [options] new_vers"
+	echo " "
+	echo " Arguments:"
+	echo " "
+	echo "   new_vers     The new version string."
+	echo " "
+	echo " Options:"
+	echo " "
+	echo "   -d           dry-run"
+	echo "                  Go through all the motions, but don't actually make any"
+	echo "                  changes to files or perform any git commits. Note that"
+	echo "                  this will result in the commits for (2) and (5) above"
+	echo "                  being equal to the initial commit in the script output."
+	echo "   -f VERSFILE  version file name"
+	echo "                  Update VERSFILE with new version string instead of default"
+	echo "                  'version' file."
+	
+	# Exit with non-zero exit status
+	exit 1
+}
+
+
+main()
+{
+	# -- BEGIN GLOBAL VARIABLE DECLARATIONS --
+
+	# The name of the script, stripped of any preceding path.
+	script_name=${0##*/}
+
+	# The name of the config.mk file.
+	configmk_file='config.mk'
+
+	# The name of the CHANGELOG file.
+	changelog_file='CHANGELOG'
+
+	# The name and location of the default version file.
+	version_file_def='build/version'
+
+	# The name and location of the specified version file.
+	version_file=''
+
+	# Strings used during version query.
+	git_commit_str=''
+	new_version_str=''
+	new_rc_str=''
+
+	# Master branch name.
+	master_br=master
+
+	# The script name to use instead of the $0 when outputting messages.
+	output_name=''
+
+	# The git directory.
+	gitdir='.git'
+	
+	# Whether we are performing a dry run or not.
+	dry_run_flag=""	
+
+	# -- END GLOBAL VARIABLE DECLARATIONS --
+
+
+	# Process our command line options.
+	while getopts ":dhf:" opt; do
+		case $opt in
+			d  ) dry_run_flag="1" ;;
+			f  ) version_file=$OPTARG ;;
+			h  ) print_usage ;;
+			\? ) print_usage
+		esac
+	done
+	shift $(($OPTIND - 1))
+
+
+	# If a version file name was not given, set version_file to the default
+	# value.
+	if [ -n "${version_file}" ]; then
+
+		echo "${script_name}: version file specified: '${version_file}'."
+	else
+
+		echo "${script_name}: no version file specified; defaulting to '${version_file_def}'."
+		version_file="${version_file_def}"
+	fi
+
+
+	# Check the number of arguments after command line option processing.
+	if [ $# = "1" ]; then
+
+		new_version_str=$1
+		new_rc_str="${new_version_str}-rc0"
+
+		echo "${script_name}: new version string: '${new_version_str}'."
+		echo "${script_name}: preparing to create release candidate branch '${new_rc_str}'."
+
+	else
+		print_usage
+	fi
+
+
+	# Check if the .git dir exists; if it does not, we do nothing.
+	if [ -d "${gitdir}" ]; then
+
+		echo "${script_name}: found '${gitdir}' directory; assuming git clone."
+
+		git_commit_str=$(git describe --always)
+		echo "${script_name}: initial commit: ${git_commit_str}."
+
+		echo "${script_name}: updating version file '${version_file}'."
+		if [ -z "$dry_run_flag" ]; then
+			echo "${new_version_str}" > ${version_file}
+		fi
+
+		echo "${script_name}: executing: git checkout ${master_br}."
+		echo "${script_name}: executing: git commit -m \"Version file update (${new_version_str})\" ${version_file}."
+		if [ -z "$dry_run_flag" ]; then
+			git checkout ${master_br}
+			git commit -m "Version file update (${new_version_str})" ${version_file}
+		fi
+
+		git_commit_str=$(git describe --always)
+		echo "${script_name}: new commit containing version file update: ${git_commit_str}."
+
+		echo "${script_name}: updating '${changelog_file}'."
+		if [ -z "$dry_run_flag" ]; then
+
+			# If 'make distclean' was run recently, we need to re-run
+			# configure in order for 'make changelog' to work properly.
+			if [ ! -f "${configmk_file}" ]; then
+				./configure auto
+			fi
+			make changelog
+		fi
+
+		echo "${script_name}: executing: git commit -m \"CHANGELOG update (${new_version_str})\" ${changelog_file}."
+		if [ -z "$dry_run_flag" ]; then
+			git commit -m "CHANGELOG update (${new_version_str})" ${changelog_file}
+		fi
+
+		git_commit_str=$(git describe --always)
+		echo "${script_name}: new commit containing CHANGELOG update: ${git_commit_str}."
+
+		echo "${script_name}: executing: git checkout -b ${new_rc_str}."
+		if [ -z "$dry_run_flag" ]; then
+			git checkout -b "${new_rc_str}"
+		fi
+
+		echo "${script_name}: "
+		echo "${script_name}: FINAL STEPS: Check the output of 'git log'. If everything"
+		echo "${script_name}: looks okay, execute these commands manually:"
+		echo "${script_name}: "
+		echo "${script_name}:   git checkout master"
+		echo "${script_name}:   git push"
+		echo "${script_name}:   git push -u origin ${new_rc_str}"
+		echo "${script_name}: "
+		
+
+	else
+
+		echo "${script_name}: could not find '${gitdir}' directory; bailing out."
+
+	fi
+
+
+	# Exit peacefully.
+	return 0
+}
+
+
+# The script's main entry point, passing all parameters given.
+main "$@"

From c2af113c7ba6d0dcc128ba36ec6e140d89180cf3 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Mon, 6 May 2024 13:37:47 -0500
Subject: [PATCH 189/230] Version file update (1.0)

---
 build/version | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build/version b/build/version
index ac39a106c..d3827e75a 100644
--- a/build/version
+++ b/build/version
@@ -1 +1 @@
-0.9.0
+1.0

From a876918c8c79a1c3d3d95de1f283350b7249b8ae Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Mon, 6 May 2024 13:37:48 -0500
Subject: [PATCH 190/230] CHANGELOG update (1.0)

---
 CHANGELOG | 3016 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 3013 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 27bb039b5..76691e13d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,16 +1,3026 @@
-commit 14c86f66b20901b60ee276da355c1b62642c18d2 (HEAD -> master, tag: 0.9.0)
+commit c2af113c7ba6d0dcc128ba36ec6e140d89180cf3 (HEAD -> master)
+Author: Field G. Van Zee <fgvanzee@gmail.com>
+Date:   Mon May 6 13:37:47 2024 -0500
+
+    Version file update (1.0)
+
+commit 5ab286f61525f8ead35ecc258305a5ccd4ee096b (origin/master, origin/HEAD)
+Author: Field G. Van Zee <fgvanzee@gmail.com>
+Date:   Mon May 6 13:14:52 2024 -0500
+
+    Added a script to help create new rc branches.
+    
+    Details:
+    - Added a new script, build/start-new-rc.sh, which:
+      1. Updates the version file with a new version string.
+      2. Commits (locally) the version string update.
+      3. Updates the CHANGELOG file with the output of 'git log'.
+      4. Commits (locally) the CHANGELOG file update.
+      5. Creates a new branch whose name is equal to "<vers>-rc0" where
+         <vers> is the new version string.
+      6. Reminds the user to execute some final steps if everything looks
+         good.
+      This new script will help in the future when it's time to start a new
+      release candidate branch/lineage off of 'master'. Note that this
+      script is based on build/bump-version.sh (which itself may change in
+      the future due to changes in the way versions/releases will be handled
+      going forward).
+
+commit cad51491e8a0b306015a5a02881dc2a9b60dd8d9
+Author: Field G. Van Zee <fgvanzee@gmail.com>
+Date:   Tue Apr 30 16:46:54 2024 -0500
+
+    Use "-i auto" by default in test/3 drivers.
+    
+    Details:
+    - Request default induced method behavior of BLIS via "-i auto" when
+      running the standalone performance drivers in test/3 via the runme.sh
+      script present in that directory. (Previously, the runme.sh script
+      would use "-i native" by default.) This change was originally intended
+      for fd1a7e3.
+
+commit fd1a7e3ca9547718aa61c806848099705216182b
+Author: Field G. Van Zee <fgvanzee@gmail.com>
+Date:   Thu Apr 25 15:00:59 2024 -0500
+
+    Allow test/3 drivers to use default ind_t method. (#804)
+    
+    Details:
+    - Previously, the standalone performance drivers in test/3 were written
+      under the assumption that the user would want to explicitly test
+      either native execution *or* 1m. But because the accompanying runme.sh
+      script defaults to passing "native" in for the -i command line option
+      (which explicitly sets the induced method type), running the script
+      without modification causes the test drivers to use slow reference
+      microkernels on systems where native complex-domain microkernels are
+      not registered -- which will yield poor performance for complex-domain
+      level-3 operations. Furthermore, even if a user was aware of this, the
+      test drivers did not support any single value for the -i option that
+      would test BLIS using the library's default behavior -- that is, using
+      1m on systems where it is needed and native execution on systems that
+      have native microkernels implemented and registered.
+    - This commit addresses the aforementioned issue by supporting a new
+      value for the -i option: "auto". The "auto" value causes the driver
+      to avoid explicitly setting the induced method altogether, leaving
+      BLIS's default behavior in place. This "auto" option is also now the
+      default setting within the runme.sh script. Thanks to Leick Robinson
+      for finding and reporting this issue.
+    - Also added support for "nat" as a shorthand for "native", which
+      the help text already (erroneously) claimed was supported.
+
+commit a49238e6141c96a41aa3c2a4adb0b0663d0b4968
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Apr 24 15:07:18 2024 -0500
+
+    Refactor the control tree and other infrastructure (#710)
+    
+    Details:
+    1. A "plugin" architecture.
+    - Users are now able to register new kernels, kernel preferences, and
+      blocksizes at runtime, directly from user applications.
+    - Plugins can be created, configured, and built using only an installed
+      version of BLIS -- no source or source code changes required.
+    - Plugins support both reference and optimized kernels, as well as
+      custom configuration-to-kernel-set mappings.
+    - Building plugins (including reference and relevant optimized kernels)
+      for enabled architectures or architecture families is automated, as is
+      linking into the final library.
+    - The configure script is now installed as 'configure-plugin'. In this
+      mode, it can be used to initialize a plugin from a template including
+      optional example code, and prepare a build system for compiling the
+      plugin into a shared or static library.
+    - Additional configuration files, templates, and build system components
+      are also installed to '%prefix%/share/blis'.
+    - The cntx_t struct now has extensible data structures for holding
+      kernels, preferences, and blocksizes. These are based on a "stack"
+      structure which contains a list of fixed-size data blocks. Adding a
+      new entry (which may require allocating a new block or reallocating
+      the block pointer array) requires locking, but looking up entries is
+      lock-free and takes O(1) time.
+    - Kernels can depend on either 1 or 2 type parameters (e.g.
+      mixed-precision packing requires 2). The func2_t struct supports
+      the latter, but can be implicitly cast to func_t if only "diagonal"
+      entries are needed. The number of type parameters can be inferred from
+      the kernel ID for type safety.
+    - Functions have been added to register new kernels, preferences, and
+      blocksizes with the global kernel structure (gks). This creates
+      corresponding entries in each allocated context and returns the next
+      available ID. Plugins use this API to register user kernels, although
+      the user is responsible for tracking the returned IDs for later
+      lookup. Setting newly-registered reference kernels, as well as
+      overriding these with optimized kernels is done in exactly the same
+      manner as in bli_cntx_init_ref() and bli_cntx_init_<subconfig>().
+    
+    2. Restructuring of the control and thread control trees.
+    - The control tree has been substantially restructured to support more
+      flexibility.
+    - The "default" control trees for gemm (also used for
+      hemm/symm/herk/her2k/syrk/syr2k/trmm/trmm3) and trsm are now
+      represented as a single structure containing all necessary control
+      tree nodes and parameters.
+    - An API has been added to modify the default gemm/trsm control trees.
+    - This same API is used by the framework and packm/gemm/trsm variants
+      to access specific control tree nodes.
+    - Users can alternatively create a custom control tree from scratch.
+    - The blocksizes are now encoded directly in the control tree, rather
+      than via loop IDs. The logic for adjusting blocksizes for certain
+      operations has been moved to the control tree initialization.
+    - Type information is encoded in the control tree to drive proper
+      selection of packing and computational kernels provided by the user.
+    - The packing microkernel now receives an opaque "params" struct which
+      is user-definable and can be used to pass additional information
+      through the call stack.
+    - The auxinfo_t struct has been updated with a .params field for
+      opaque user data as well as the global offsets of the current
+      microtile.
+    - The packm and gemm variants can be overridden by the user, and also
+      receive an opaque params struct via the associated control tree
+      node.
+    - The structure-aware packing kernel bli_packm_struc_cxk() is no longer
+      hard-coded to be called from the default packm variant, but can be
+      overridden by the user. It also supports mixed-precision/mixed-domain
+      natively now.
+    - The thread control tree (thrinfo_t) is now created entirely up-front
+      by inspecting the control tree. The required number of threads at each
+      level is encoded in the control tree via loop IDs (actually a bitfield
+      of loop IDs), although the ordering and number of such IDs is
+      arbitrary. The logic for adjusting the number of threads at each level
+      based on operation type (e.g. trmm) is now in the control tree
+      initialization and expressed by combining loop IDs from multiple
+      levels into a single level.
+    - The mem_t object containing the pack buffer pointer has been moved
+      from the control tree to the thread control tree. NOTE: **The control
+      tree is now strictly const throughout the operation, and only a
+      single copy is shared by all threads.**
+    - The thread control tree node for packing has been changed so that
+      there is no longer a "fake" node indicating a team of single threads.
+      Instead, the number of threads and thread IDs in the "normal" thread
+      control tree node are used. This change has also been made to the
+      gemmsup thread control tree and packing variants, as well as to the
+      gemmlike sandbox.
+    - Parameters controlling packing (e.g. inversion of the diagonal,
+      direction, schema) are not stored directly in the control tree but in
+      the opaque params struct. The packing control tree node and its
+      default params struct are stored together in the "combined"
+      gemm/trsm control tree structure and initialized as a unit. Users can
+      update these parameters individually or substitute a custom packm
+      variant and params struct.
+    - The "target" and "execution" datatypes have been removed from the obj_t
+      struct and replaced by type information in the control tree.
+    - The "sub-node" and "sub-prenode" of a control tree node have been
+      replaced by an arbitrary number of sub-nodes accessed by index. There
+      is a hard cap on the number of sub-nodes (currently 2). Sub-nodes are
+      added during control tree initialization, *after*
+      creation/initialization of the parent node through an updated API.
+    - The level-3 thread decorator has been significantly simplified and
+      directly calls bli_l3_int(). The control tree is created externally,
+      and it is no longer necessary to alias matrices or set object pack
+      schemas. Also, the rntm_t passed in may be NULL. Finally, family
+      and scalar information is no longer needed here.
+    - bli_l3_int() is now a simple inline function which extracts the next
+      control tree node and variant and calls it.
+    - bli_*_front() have been removed and inlined into the expert object
+      API with significant simplification.
+    - 1m (or other induced method) no longer uses an alternative cntx_t.
+    - The .pack_fn/.ker_fn pointers and associated params fields on the
+      obj_t were removed in favor of the present solution.
+    
+    3. Overhaul of variable substitution in configure script.
+    - The configure script has been somewhat re-written to use a
+      centralized mechanism for substituting variables into build system and
+      other configuration files.
+    - All substitution variables go through the same pathway now, which
+      necessitated some variable naming changes for variables which were
+      named the same in e.g. Makefile and bli_config.h but with
+      different definitions.
+    - CC and CXX variables can now contain spaces, e.g. 'g++ -std=c++17'.
+      This provides better support for integration with build tooling such
+      as autotools.
+    
+    4. Overhaul of packing kernels.
+    - Previously there were two packing kernels referenced in the cntx_t
+      structure for MRxk and NRxk shaped micropanels, respectively. These
+      have now been merged into one kernel which is responsible for packing
+      any dense rectangular portion of either A or B.
+    - The packing kernel now receives information about the register
+      blocksize (cdim_max) and duplication factor (the "broadcast-B"
+      format, although this can also apply to the A matrix).
+    - The structure-aware packing kernel (bli_packm_struc_cxk(), which is
+      now user-overridable) also receives global offsets of the current
+      micropanel within A or B.
+    - Explicit kernels for packing the diagonal blocks of
+      triangular/symmetric/Hermitian matrices have been added to the
+      cntx_t. This means that the bli_packm_struc_cxk() "kernel" no longer
+      needs to directly touch data (except to zero out some regions).
+    - bli_packm_struc_cxk() has also been updated to work only in terms of
+      fundamental elements (i.e., real datatypes) when computing offsets and
+      when zeroing data, which greatly simplifies mixed-domain/1m packing.
+    - bli_packm_scalar() has been updated to better support complex scalars
+      in mixed-domain operations.
+    - Pack schemas for PACKED_ROW_PANELS* and PACKED_COL_PANELS* have
+      been merged into simply PACKED_PANELS*. This reflects the merging of
+      the packing kernels into a single generic kernel. There were only a
+      very few places which needed the row/column information and this is
+      now supplied by alternative means.
+    - Packing variants always behave "as if" the A matrix were being packed
+      (i.e. the code assumes packing column-stored row panels). Packing of B
+      is handled by applying an implicit or explicit transpose before
+      packing. This change also applies to gemmsup.
+    
+    5. Improved MD/MP support.
+    - All level-3 operations (except trsm) now support full
+      mixed-domain/mixed-precision operation.
+    - Explicit 1m packing kernels have been added in the cntx_t.
+    - An explicit 1m microkernel wrapper has been added to the cntx_t.
+    - An extra packing kernel for the "ro" format has been added, along with
+      the pack_t enumeration value. This supports the packing for
+      real*complex -> real, including potential scaling by a complex alpha,
+      support for structured matrices, etc.
+    - Extra microkernel wrappers for mixed-domain operations have been added
+      to support the 'ccr' (and by extension, 'crc'), 'rcc', and 'crr'
+      cases. Notably this includes full support for general stride storage
+      and complex alpha/beta.
+    - Packing kernels and gemm microkernels are now "templated" based on two
+      type parameters rather than one. For packing this allows direct
+      optimization of mixed-precision kernels, and for gemm microkernels
+      this allows direct optimization of mixed-precision without writing to
+      a temporary buffer. Reference packing kernels are directly
+      instantiated for all mixes of precisions, while by default
+      mixed-precision gemm microkernels are supported via a microkernel
+      wrapper. The "old" way of specifying optimized kernels using a single
+      type parameter works unchanged.
+    - alpha and beta are typecast appropriately to the computational or
+      output datatype, respectively, and **always** to the complex domain.
+      Scalar typecasting has also been added to gemmsup for safety.
+    - The gemm macrokernel doesn't have to do any typecasting anymore, as a
+      microkernel wrapper or optimized mixed-precision/mixed-domain kernel
+      now handles this.
+    - 1m and mixed-domain operations now always use a microkernel wrapper,
+      rather than adjusting parameters in the gemm macrokernel.
+    - The gemmt macrokernel **does** still have to handle explicit
+      write-back of microtiles which intersect the diagonal, although
+      typecasting has already been performed.
+    - The gemmt_x_ker_var2(), trmm_xx_ker_var2(), and trsm_xx_ker_var2()
+      functions have been removed. The appropriate macrokernel pointer is
+      selected during control tree initialization.
+    - Real domain MR/NR are checked for even-ness based on the gemm
+      microkernel's row preference in order to guarantee proper 1m and
+      mixed-domain operation.
+    - Full range of mixed-domain/mixed-precision functionality tested in the
+      testsuite ('input.*.mixed').
+    
+    6. Other changes:
+    - The build system has been updated to support C++ source files
+      throughout the framework. While the intent is not to add such files to
+      BLIS itself, this supports plugins written in C++.
+    - Many instances of configuration-specific code have been simplified by
+      introducing an INSERT_GENTCONF macro which instantiates a block of
+      code for each enabled sub-configuration. The ConfigurationHowTo.md
+      document has been updated accordingly.
+    - PASTEMAC?/PASTECH?/PASTEF77? have been removed in favor of
+      variadic macros which accept any number of arguments (up to a
+      reasonable limit).
+    - The INSERT_GENTFUNC* macros have been updated to clean up
+      mixed-precision and mixed-domain instantiations.
+    - bli_align_dim_to_mult() has been updated to support rounding either up
+      or down based on a flag.
+    - Checking for empty matrices and other early exits (level-3 only) has
+      been consolidated into a single utility function.
+    - The auxinfo_t struct is always passed as const.
+    - The new function bli_obj_alias_submatrix() aliases a matrix while also
+      resetting the root to NULL, offsets to zero (while adjusting the
+      buffer), and applying any implicit transpose.
+    - Level-3 pruning functions now only check matrix structure to see what
+      to do, not the operation family.
+    - gemmsup packing has been updated to use the "normal" pack buffer
+      allocation routines.
+    - Remove duplicate checks for early return from gemmsup handler.
+    - bli_determine_blocksize() has been significantly simplified.
+    - Partitioning packed panels is no longer allowed.
+    - Added bli_xxsame macros.
+    - Automated the calculation of info bit shifts and masks based on
+      predefined bit sizes for various flags. This greatly simplifies
+      reordering, adding, or removing flags from the info/info2 bitfields.
+    - Moved more BLIS_NUM_* macros into the corresponding enums as the
+      last entry so that the value is automatically computed.
+    - Better const-correctness in some level0 scalar macros.
+    - Better mixed-precision support in some level0 scalar macros.
+    - Added a bli_axpbys_mxn() macro.
+    - bli_thread_range_sub() takes explicit thread ID and number of threads
+      rather than a thrinfo_t node.
+    - "De-templated" BLIS gemmlike sandbox (specifically, bls_gemm_bp_var1()
+      and bls_packm_var1()).
+    - Combined bls_l3_packm_[ab]() into one function with thin wrappers.
+    - Deleted bls_packm_var[23]().
+    - Add a "termination tag" to the testsuite output so that
+      'make check-blis' can accurately check for successful completion.
+    - Add a new function to centrally compute FLOPs for level-3 operations
+      in the testsuite.
+
+commit a316d2c6c33fc1f8f7c58c4210ab203f48349041
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Thu Mar 28 12:52:00 2024 -0500
+
+    Fix incorrect commenting of `BLIS_RNTM_INITIALIZER` and `BLIS_OBJECT_INITIALIZER`.
+
+commit 664cc6bc3ea610b4ecea63d78c6024c48f045635
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Tue Mar 26 16:25:17 2024 -0500
+
+    Update BLIS_*_INITIALIZER macros for C++ compatibility. (#802)
+    
+    Details:
+    - Remove designated initializer syntax. This isn't officially supported
+      until C++20.
+    - Arrange initializers in the order in which they are defined in the
+      struct. Even with standard or extension support for designated
+      initializers, initializing non-static members out-of-order is an
+      error in C++.
+    - Remove the conditional code which uses '-1' as the default value of
+      the 'pack_buf' member of 'mem_t' in C, but 'BLIS_BUFFER_FOR_GEN_USE'
+      in C++. Simply use the latter as a common-sense default.
+
+commit 1a8c8180b32cf5988bf9eb5d2f0f8111a729993a
+Author: John <50754967+j-bm@users.noreply.github.com>
+Date:   Thu Feb 15 12:35:10 2024 -0400
+
+    Add cpu part codes for various manufacturers and use in the code (#794)
+    
+    * Add cpu_id symbols for arm v8.
+    
+    * Add symbols for arm v7.
+    
+    * Always assume firestorm on Apple aarch64.
+    
+    * Fixes incorrect usage of model vs. part in some places.
+    
+    * Fixes #793
+    
+    ---------
+    
+    Co-authored-by: J <jal@o75snap.localdomain>
+
+commit c382d8bdccc07e22a341fe04960f0cbf4eec083b
+Author: Igor Zhuravlov <zhuravlov.ip@ya.ru>
+Date:   Sun Jan 14 04:03:31 2024 +0000
+
+    Fix errors and typos in docs/BLIS*API.md (#791)
+    
+    Details:
+    - Fixed errors and unified formatting in docs/BLIS*API.md docs.
+
+commit a72e4569f2a03cc3578c019bf7ce25491a44137d
+Author: Field G. Van Zee <fgvanzee@gmail.com>
+Date:   Wed Dec 6 18:21:47 2023 -0600
+
+    Include bli_config.h before bli_system.h in cblas.h. (#789)
+    
+    Details:
+    - Previously, in cblas.h, bli_config.h was being #included *after*
+      bli_system.h, which meant that the BLIS_ENABLE_SYSTEM macro was
+      never defined in time for proper OS detection. This bug only
+      affected cblas.h -- blis.h had been correctly #including
+      bli_config.h before bli_system.h since fb93d24. Thanks to
+      Edward Smyth for reporting this bug and suggesting the fix.
+
+commit 1236ddab455ef3a6293ab394ff06b3a19c2913d9
+Author: Field G. Van Zee <fgvanzee@gmail.com>
+Date:   Sun Dec 3 16:42:34 2023 -0600
+
+    Fixed random segfault in test/3 drivers. (#788)
+    
+    Details:
+    - Fixed a segfault in the non-gemm test drivers in test/3 that was the
+      result of sometimes leaving either .n_str or .k_str fields of the
+      params_t struct uninitialized, depending on the operation in question.
+      For example, in test_hemm.c, init_def_params() would only initialize
+      the .m_str and .n_str fields, but not the .k_str field. Even though
+      hemm doesn't use a 'k' dimension, the proc_params() function (called
+      via parse_cl_params()) universally attempts to convert all three into
+      integers via sscanf(), which was understandably failing when one of
+      those strings was a NULL pointer. I'm not sure how this code ever
+      worked to begin with. Special thanks to Leick Robinson for finding and
+      reporting this bug.
+
+commit 141a6c9a8e7557d9c7d28aecedec9dc5377dba13
+Author: Field G. Van Zee <fgvanzee@gmail.com>
+Date:   Tue Nov 21 12:26:43 2023 -0600
+
+    Install helper headers to INCDIR prefix. (#787)
+    
+    Details:
+    - Install one-line headers to INCDIR whose entire purpose is to
+      #include the actual headers within the local 'blis' header directory
+      so that applications can #include "blis.h" instead of #include
+      <blis/blis.h> (and/or "cblas.h" instead of <blis/cblas.h> if CBLAS is
+      enabled) when headers are installed to global paths. (Note that
+      INCDIR is the installation prefix for headers as specified by
+      '--includedir=INCDIR', which defaults to 'PREFIX/include' if not
+      specified.) Not sure how this problem went unreported for so long,
+      since presumably any user trying to #include "blis.h" from a global
+      installation would have encountered a compiler error.
+    - The one-line blis.h and cblas.h headers now reside in the 'build'
+      directory, ready to install as is.
+    - Thanks to Jed Brown for reporting this via Issue #786, and to
+      Devin Matthews and Mo Zhou for their engagement.
+    - Harmonized the rule in the top-level Makefile for installing blis.pc
+      into SHAREDIR/pkgconfig with conventions for others vis-a-vis
+      verbosity/non-verbosity.
+
+commit 2d9439298b336aa6d0ee000a5285a3adb4e6d462
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Tue Nov 21 12:18:07 2023 -0600
+
+    Allow users to define [sd]complex using std::complex (#784)
+    
+    Details:
+    - In C++ applications, it makes a lot of sense to interface to BLIS
+      using C++'s standard complex number library, which uses a template
+      class std::complex. Obviously BLIS doesn't know anything about this
+      and defaults to a custom struct to represent complex numbers. This PR
+      updates the bli_[cz]{real,imag}() functions to accept std::complex
+      numbers when a C++ compiler is being used. Note that this has no
+      effect on the compilation of the BLIS library (or testsuite), and only
+      comes into play when including blis.h into a C++ project and forcing
+      the use of std::complex for scomplex and dcomplex.
+    - The application can explicitly request std::complex-based types via:
+    
+        #define BLIS_ENABLE_STD_COMPLEX
+        #include <blis.h>
+        // Call BLIS functions using std::complex<double> here.
+    
+    - Fixed a bug in the definition of some scalar level-0 macros, since
+      bli_creal()/bli_cimag() and bli_zreal()/bli_zimag() are no longer
+      interchangeable.
+
+commit f7ce54a252028483e4c6af619015eb22063d5541 (origin/1.0-rc0)
+Author: Field G. Van Zee <fgvanzee@gmail.com>
+Date:   Fri Nov 3 15:52:57 2023 -0500
+
+    CREDITS file update.
+
+commit 05388ddb66f8bf2d62009b162d64bf2d99226b83
+Author: Aaron Hutchinson <113382047+Aaron-Hutchinson@users.noreply.github.com>
+Date:   Fri Nov 3 13:30:31 2023 -0700
+
+    Added 'sifive_x280' subconfig, kernel set. (#737)
+    
+    Details:
+    - Added a new 'sifive_x280' subconfiguration for SiFive's x280 RISC-V
+      instruction set architecture. The subconfig registers kernels from a
+      correspondingly new kernel set, also named 'sifive_x280'.
+    - Added the aforementioned kernel set, which includes intrinsics- and
+      assembly-based implementations of most level-1v kernels along with
+      level-1f kernels axpy2v and dotaxpyv, packm kernels, and level-3 gemm,
+      gemmtrsm_l, and gemmtrsm_u microkernels (plus supporting files).
+    - Registered the 'sifive_x280' subconfig as belonging to a singleton
+      family by the same name.
+    - Added an entry to '.travis.yml' to test the new subconfig via qemu.
+    - Updates to 'travis/do_riscv.sh' script to support the 'sifive_x280'
+      subconfig and to reflect updated tarball names.
+    - Special thanks to Lee Killough, Devin Matthews, and Angelika Schwarz
+      for their engagement on this commit.
+
+commit 7a87e57b69d697a9b06231a5c0423c00fa375dc1 (origin/10.0-rc0)
+Author: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com>
+Date:   Sat Oct 14 02:05:41 2023 -0500
+
+    Fixed HPX barrier synchronization (#783)
+    
+    Details:
+    - Fixed hpx barrier synchronization. HPX was hanging on larger cores
+      because blis was using non-hpx synchronization primitives. But when
+      using hpx-runtime only hpx-synchronization primitives should be used.
+      Hence, a C style wrapper hpx_barrier_t is introduced to perform hpx
+      barrier operations.
+    - Replaced hpx::for_loop with hpx::futures. Using hpx::for_loop with
+      hpx::barrier on n_threads greater than actual hardware thread count
+      causes synchronization issues making hpx hang. This can be avoided
+      by using hpx::futures, which are relatively very lightweight, robust
+      and scalable.
+
+commit 8fff1e31da1c87e46cacec112b0ac280ab47cd8b
+Author: Field G. Van Zee <fgvanzee@gmail.com>
+Date:   Thu Oct 12 15:51:41 2023 -0500
+
+    Fixed bug in sup threshold registration. (#782)
+    
+    Details:
+    - Fixed a bug that resulted in BLIS non-deterministically calling the
+      gemmsup handler, irrespective of the thresholds that are registered
+      via bli_cntx_set_blkszs().
+    - Deep dive: In bli_cntx_init_ref.c, the default values for the gemmsup
+      thresholds (BLIS_[MNK]T blocksizes) were being set to zero so that no
+      operation ever matched the criteria for gemmsup (unless specific sup
+      thresholds are registered). HOWEVER, these thresholds are set via
+      bli_cntx_set_blkszs() which calls bli_blksz_copy_if_pos(), which was
+      only copying the thresholds into the gks' cntx_t if the values were
+      strictly positive. Thus, the zero values passed into
+      bli_cntx_set_blkszs() were being ignored and those threshold slots
+      within the gks were left uninitialized. The upshot of this is that the
+      reference gemmsup handler was being called for gemm problems
+      essentially at random (and as it turns out, very rarely the reference
+      gemmsup implementation would encounter a divide-by-zero error).
+    - The problem was fixed by changing bli_blksz_copy_if_pos() so that it
+      copies values that are non-negative (values >= 0 instead of > 0). The
+      function was also renamed to bli_blksz_copy_if_nonneg().
+    - Also needed to standardize use of -1 as the sole value to embed into
+      blksz_t structs as a signal to bli_cntx_set_blkszs() to *not* register
+      a value for that slot (and instead let whatever existing values
+      remain). This required updates to the bli_cntx_init_*() functions for
+      bgq, cortexa9, knc, penryn, power7, and template subconfigs, as some
+      of these codes were using 0 instead of -1.
+    - Fixes #781. Thanks to Devin Matthews for identifying, diagnosing, and
+      proposing a fix for this issue.
+
+commit 1e264a42474b535431768ef925bbd518412d392e
+Author: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
+Date:   Mon Oct 2 18:29:46 2023 -0500
+
+    Update zen3 subconfig to support NVHPC compilers. (#779)
+    
+    Details:
+    - Parse $(CC_VENDOR) values of "nvc" in 'zen3' make_defs.mk file.
+    - Minor refactor to accommodate above edit.
+    - CREDITS file update.
+
+commit c2099ed2519dcac8ee421faf999b36e1c2260be7
+Author: Field G. Van Zee <fgvanzee@gmail.com>
+Date:   Mon Oct 2 14:56:48 2023 -0500
+
+    Fixed brokenness when sba is disabled. (#777)
+    
+    Details:
+    - Previously, disabling the sba via --disable-sba-pools resulted in a
+      segfault due to a sanity-check-triggering abort(). The problem was
+      that the sba, as currently used in the l3 thread decorators, did not
+      yet (fully) support pools being disabled. The solution entailed
+      creating a wrapper function, bli_sba_array_elem(), which either calls
+      bli_apool_array_elem() (when sba pools are enabled at configure time)
+      or returns a NULL sba_pool pointer (when sba pools are disabled), and
+      calling bli_sba_array_elem() in place of bli_apool_array_elem(). Note
+      that the NULL pointer returned by bli_sba_array_elem() when the sba
+      pools are disabled does no harm since in that situation the pointer
+      goes unreferenced when acquiring and releasing small blocks. Thanks to
+      John Mather for reporting this bug.
+    - Guarded the bodies of bli_sba_init() and bli_sba_finalize() with
+      #ifdef BLIS_ENABLE_SBA_POOLS. I don't think this was actually necessary
+      to fix the aforementioned bug, but it seems like good practice.
+    - Moved the code in bli_l3_thrinfo_create() that checked that the array*
+      pointer is non-NULL before calling bli_sba_array_elem() (previously
+      bli_apool_array_elem()) into the definition of bli_sba_array_elem().
+    - Renamed various instances of 'pool' variables and function parameters
+      to 'sba_pool' to emphasize what kind of pool it represents.
+    - Whitespace changes.
+
+commit 37ca4fd168525a71937d16aaf6a13c0de5b4daef
+Author: Field G. Van Zee <fgvanzee@gmail.com>
+Date:   Thu Sep 28 16:37:57 2023 -0500
+
+    Implemented [cz]symv_(), [cz]syr_(), [cz]rot_(). (#778)
+    
+    Details:
+    - Expanded existing BLAS compatibility APIs to provide interfaces to
+      [cz]symv_(), [cz]syr_(). This was easy since those operations were
+      already implemented natively in BLIS; the APIs were previously
+      omitted only because they were not formally part of the BLAS.
+    - Implemented [cz]rot_() by feeding code from LAPACK 3.11 through
+      f2c.
+    - Thanks to James Foster for pointing out that LAPACK contains these
+      additional symbols, which prompted these additions, as well as for
+      testing the [cz]rot_() functions from Julia's test infrastructure.
+    - CREDITS file update.
+
+commit 6f412204004666abac266409a203cb635efbabf3
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Sep 26 18:00:54 2023 -0500
+
+    Added 'altra', 'altramax' subconfigs. (#775)
+    
+    Details:
+    - Forward-ported 'altra' and 'altramax' subconfigurations from the
+      older 'stable' branch lineage [1]. These subconfigs primarily target
+      the Ampere Altra and AltraMax (ARM) processors. They also contain
+      "QuickStart" directories with information and scripts to help
+      use BLIS on these microarchitectures. Thanks to Jeff Diamond and
+      Leick Robinson for developing these subconfigs and resources.
+    - Updated kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c according to
+      changes in the 'stable' lineage, mostly related to re-enabling of
+      assembly code branches that target general stride IO.
+    
+    [1] Note that the 'stable' branch is being used to make sure that more
+        recent commits do not introduce unreasonable performance
+        regressions. As such, the name should be interpreted as shorthand
+        for "performance stable," not "API stable."
+
+commit a4a63295b96ed5b32f4df6477d24db07bf431202
+Author: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com>
+Date:   Tue Sep 26 17:58:38 2023 -0500
+
+    Fixes to HPC runtime code path. (#773)
+    
+    Details:
+    - Fixed hpx::for_each invocation and replaced it with hpx::for_loop. The
+      HPX runtime was initialized using hpx::start, but the hpx::for_each
+      function was being called on a non-hpx runtime (i.e., standard BLIS
+      runtime - single main thread). To run hpx::for_each on HPX runtime
+      correctly, the code now uses hpx::run_as_hpx_thread(func, args...).
+    - Replaced hpx::for_each with hpx::for_loop, which eliminates use of
+      hpx::util::counting_iterator.
+    - Employ hpx::execution::chunk_size(1) to make sure that a thread
+      resides on a particular core.
+    - Replaced hpx::apply() with updated version hpx::post().
+    - Initialize tdata->id to 0 in libblis.c, as it is the main thread
+      and is needed for writing results to the output file.
+    - By default, if not specified, the HPX runtime uses all N threads/cores
+      available in the system. But, if we want to only specify n_threads out
+      of N threads, we use hpx::execution::experimental::num_cores(n_threads).
+
+commit c6546c1131b1ddd45ef13f9f2b620ce2e955dbf8
+Author: John Mather <54645798+jmather-sesi@users.noreply.github.com>
+Date:   Wed Sep 20 13:41:07 2023 -0400
+
+    Fixed broken link in Multithreading.md. (#774)
+    
+    Details:
+    - Replaced 404'd link in docs/Multithreading.md with an archive from
+       The Wayback Machine.
+    - CREDITS file update.
+
+commit 6dcf7666eff14348e82fbc2750be4b199321e1b9
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Sun Aug 27 14:18:57 2023 -0500
+
+    Revamped bli_init() to use TLS where feasible. (#767)
+    
+    Details:
+    - Revamped bli_init_apis() and bli_finalize_apis() to use separate
+      bli_pthread_switch_t objects for each of the five sub-API init
+      functions, with the objects for the 'ind' and 'rntm' sub-APIs being
+      declared with BLIS_THREAD_LOCAL. This allows some APIs to be treated
+      as thread-local and the rest as thread-shared. Thanks to Edward Smyth
+      for requesting application thread-specific rntm_t structs, which
+      inspired these changes.
+    - Combined bli_thread_init_from_env() and bli_pack_init_from_env() into
+      a new function, bli_rntm_init_rntm_from_env(), and placed the combined
+      code in bli_rntm.c inside of a new bli_rntm_init() function. Then
+      removed the (now empty) bli_pack_init() and _finalize() function defs.
+    - Deprecated bli_rntm_init() for the purposes of initializing a rntm_t
+      (temporarily preserving it as bli_rntm_clear() in a cpp-undefined code
+      block) so that the function name could be used for the aforementioned
+      bli_rntm_init() function.
+    - Updated libblis_test_pobj_create() in test_libblis.c to use a static
+      rntm_t initializer instead of the deprecated bli_rntm_init()
+      function-based option.
+    - Minor updates to docs/Multithreading.md, including removal of
+      bli_rntm_init() in the example of how to initialize rntm_t structs.
+    - Changed the return value of bli_gks_init(), bli_ind_init(),
+      bli_memsys_init(), bli_thread_init(), and bli_rntm_init() (and their
+      finalize() counterparts) from 'void' to 'int' so that those functions
+      match the function type expected by bli_pthread_switch_on()/_off().
+      Those init/finalize functions now return 0 to indicate success, which
+      is needed so that the switch actually changes state from off to on
+      and vice versa.
+    - Defined bli_thread_reset(), which copies the contents of the
+      global_rntm_at_init() struct into the global_rntm struct (for the
+      current application thread).
+    - Guard calls to bli_pthread_mutex_lock()/_unlock() in
+      - bli_pack_set_pack_a() and _pack_b()
+      - bli_rntm_init_from_global()
+      - bli_thread_set_ways()
+      - bli_thread_set_num_threads()
+      - bli_thread_set_thread_impl()
+      - bli_thread_reset()
+      - bli_l3_ind_oper_set_enable()
+      with #ifdef BLIS_DISABLE_TLS (since TLS precludes the possibility of
+      race conditions).
+    - In frame/base/bli_rntm.c, declare global_rntm, global_rntm_at_init,
+      and global_rntm_mutex as BLIS_THREAD_LOCAL so that separate
+      application threads can change the number of ways of BLIS parallelism
+      independently from one another.
+    - Access global_rntm only via a new private (not exported) function,
+      bli_global_rntm(). Defined a similar function for a rntm_t new to
+      this commit, global_rntm_at_init, which preserves the state of the
+      global rntm at initialization-time.
+    - In frame/3/bli_l3_ind.c, added a guard to the declaration of the
+      static variable oper_st_mutex with #ifdef BLIS_DISABLE_TLS so that the
+      mutex is omitted altogether when TLS is enabled (which prevents the
+      compiler from warning about an unused variable).
+    - Removed redundant code from bli_thread.c:
+        #ifdef BLIS_ENABLE_HPX
+        #include "bli_thread_hpx.h"
+        #endif
+      since this code is already present in bli_thread.h.
+    - Thanks to Minh Quan Ho for his review of and feedback on this commit.
+    - Comment updates.
+
+commit fa6a9b24ae2ddbd5f30f657d46004843581c768c
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Sat Aug 19 12:44:34 2023 -0500
+
+    Fixed error when using common.mk from testsuite. (#768)
+    
+    Details:
+    - Commit 2db31e0 (#755) inserted logic into common.mk that attempts to
+      preprocess build/detect/android/bionic.h to determine whether the
+      __BIONIC__ macro is defined (in which case -lrt should not be included
+      in LDFLAGS). However, the path to bionic.h was encoded without regard
+      to DIST_PATH, and so utilizing common.mk anywhere that isn't the top-
+      level directory (such as in the testsuite directory) resulted in a
+      compiler error:
+    
+        gcc: error: build/detect/android/bionic.h: No such file or directory
+        gcc: fatal error: no input files
+        compilation terminated.
+    
+      This commit adds a $(DIST_PATH) prefix to the path to bionic.h so that
+      it can be located from other applications' Makefiles that use BLIS's
+      makefile fragments.
+
+commit 634e532c8dcce7383d96ba33276df65c656b2198
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Aug 9 21:54:49 2023 -0500
+
+    Set thrcomm timpl_t id inside init functions. (#766)
+    
+    Details:
+    - Previously, the timpl_t id being used when a thrcomm_t is being
+      initialized was set within the bli_thrcomm_init() dispatch function
+      after the timpl_t-specific bli_thrcomm_init_*() function returned. But
+      it just occurred to me that each bli_thrcomm_init_*() function already
+      intrinsically knows its own timpl_t value. This commit shifts the
+      setting of the thrcomm_t.ti field into the corresponding
+      bli_thrcomm_init_*() function for each timpl_t type (e.g. single,
+      openmp, pthreads, hpx).
+    - Removed long-deprecated code dating back nearly 10 years.
+    - Whitespace changes.
+    - Comment updates.
+
+commit 3cf17b4a91232709bc6a205b0e4d7ecc96579aa9
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Aug 7 13:46:20 2023 -0500
+
+    Small fixes/improvements to docs/Multithreading.md. (#764)
+    
+    Details:
+    - Added reminders that #include "blis.h" must be added to source files
+      in order to access BLIS API function prototypes. Thanks to Barry Smith
+      for suggesting this improvement.
+    - Fixed pre-existing typos.
+    - CREDITS file update.
+
+commit dbc79812c390f812c7bf030bfcf87e947a1443c4
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Jul 28 18:16:38 2023 -0500
+
+    CREDITS file update.
+    
+    Details:
+    - Thanks to Igor Zhuravlov for PR #753 (commit 915daaa).
+
+commit 915daaa43cd189c86d93d72cd249714f126e9425
+Author: Igor Zhuravlov <zhuravlov.ip@ya.ru>
+Date:   Thu Jul 27 20:33:59 2023 +0000
+
+    Fix typos in docs + example code comments. (#753)
+    
+    Details:
+    - Fixed various typos in API documentation in docs/BLIS*API.md and
+      comments in the source code examples within examples/?api/*.c.
+
+commit 2db31e057e7e9c97fc60021b5ae72a01a48d7588
+Author: Lee Killough <15950023+leekillough@users.noreply.github.com>
+Date:   Thu Jul 27 15:27:21 2023 -0500
+
+    Exclude -lrt on Android with Bionic libraries. (#755)
+    
+    Details:
+    - Added build/detect/android/bionic.h header to test whether the
+      __BIONIC__ cpp macro is defined.
+    - In common.mk, only add -lrt to LDFLAGS when Bionic is not present.
+    - CREDITS file update.
+
+commit 22ad8c1b752364784f320168b31995945ad84a59
+Author: ct-clmsn <ct.clmsn@gmail.com>
+Date:   Thu Jul 27 16:23:29 2023 -0400
+
+    Small fixes to support hpx in the testsuite (#759)
+    
+    Details:
+    - Minor changes to test_libblis.c to support hpx.
+
+commit c91b41d022e33da82b3b06c82be047a29873d9b6
+Author: Lee Killough <15950023+leekillough@users.noreply.github.com>
+Date:   Wed Jul 26 14:37:08 2023 -0500
+
+    Auto-detect the RISC-V ABI of the compiler and use -mabi= during RISC-V Builds (#750)
+    
+    Details:
+    - Generate a build error if there is a 32/64-bit mismatch between the
+      RISC-V ABI or architecture and the BLIS configuration selected.
+    - Handle Q, Zicsr, ZiFencei, Zba, Zbb, Zbc, Zbs and Zfh extensions in
+      the RISC-V architecture auto-detection. ZiFencei and Zicsr are not
+      detectable with built-in RISC-V macros right now.
+    - ZiFencei is not important for BLIS because it doesn't have
+      Just-In-Time compilation or self-modifying code, and Zicsr is implied
+      by the floating-point extensions, which are required for good
+      performance in BLIS.
+    - Move RISC-V autodetect header files to build/detect/riscv/.
+
+commit a0b04e3c007f1207e5678bf20c07752906742fb7 (origin/aocl-blas, aocl-blas)
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Jun 26 17:59:21 2023 -0500
+
+    Rewrote regen-symbols.sh (gen-libblis-symbols.sh). (#751)
+    
+    Details:
+    - Wrote an alternative to regen-symbols.sh, gen-libblis-symbols.sh,
+      that generates a list of exported symbols from the monolithic blis.h
+      file rather than peeking inside of the shared object via nm. (This new
+      script lives in the 'build' directory and the older script has been
+      retired to build/old.) Special thanks to Devin Matthews for authoring
+      gen-libblis-symbols.sh.
+    - Added a 'symbols' target to the top-level Makefile which will refresh
+      build/libblis-symbols.def, with supporting changes to common.mk.
+    - Updates to build/libblis-symbols.def using the new symbol-generating
+      script.
+
+commit 6b894c30b9bb2c2518848d74e4c8d96844f77f24
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Jun 12 17:22:44 2023 -0500
+
+    Rewrote/fixed broken tree barrier implementation.
+    
+    Details:
+    - Rewrote the definition of bli_thrcomm_tree_barrier() so that it (a)
+      actually worked again, and (b) used atomics instead of a basic C99
+      spin loop. (Note that the conventional barrier implementation is
+      still enabled by default; the tree barrier must be toggled on
+      manually within the configuration.)
+    - Added an early return to the definition of bli_thrcomm_barrier() in
+      the cases where comm == NULL or comm->n_threads == 1.
+    - Reordered thread-related and thread-dependent header #include
+      directives in blis.h so that the BLIS_TREE_BARRIER and
+      BLIS_TREE_BARRIER_ARITY macros, which would be defined in the target
+      configuration's bli_family_*.h file, would be #included prior
+      to the inclusion of the thrcomm_t header that uses them.
+    - Changed the type of barrier_t.count from 'int' to 'dim_t'.
+    - Changed the type of barrier_t.signal from 'volatile int' to 'gint_t'.
+    - Special thanks to Leick Robinson for contributing these changes.
+    - Whitespace changes.
+
+commit d639554894b6252a86bd3164921bce6fbb9e3b5e
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Jun 7 16:11:14 2023 -0500
+
+    Pad thrcomm_t fields to avoid false sharing.
+    
+    Details:
+    - Inserted a cache line of padding between various fields of the
+      thrcomm_t and, in the case of the (presently defunct) tree barrier,
+      fields of the barrier_t. This additional padding ensures that these
+      fields, which both serve different purposes when performing a thread
+      barrier, are only accessed when needed (and not just due to their
+      spatial locality with their cache line neighbors).
+    - Added a new cpp macro constant, BLIS_CACHE_LINE_SIZE, to
+      bli_config_macro_defs. This new constant defines the size of a cache
+      line (in bytes) and defaults to 64.
+    - Special thanks to Leick Robinson for discovering this false sharing
+      issue and developing/submitting the patch.
+
+commit 89b7863fc9a88903917deedc6a5ad9fd17f83713
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon May 8 16:51:18 2023 -0500
+
+    Fix 1m enablement for herk/her2k/syrk/syr2k. (#743)
+    
+    Details:
+    - Ever since 28b0982, herk, her2k, syrk, and syr2k have been implemented
+      in terms of the gemmt expert API. And since the decision of which
+      induced method to use (1m or native) is made *below* the level of the
+      expert API, executing any of {herk,her2k,syrk,syr2k} results in BLIS
+      checking the enablement status for gemmt.
+    - This commit applies a band-aid of sorts to this issue by modifying
+      bli_l3_ind_oper_get_enable() and bli_l3_ind_oper_set_enable() so that
+      any attempts to query or modify the internal enablement status for
+      herk, her2k, syrk, or syr2k instead does so for gemmt.
+    - This solution isn't perfect since, in theory, the user could enable 1m
+      for, say, herk but then disable it for syrk, and then be confused when
+      herk runs via native execution. But we don't anticipate that users
+      modify 1m enablement at the operation level, and so in practice this
+      solution is likely fine for now.
+
+commit 138de3b3e88c5bf7d8718c45c88811771cf42db8
+Author: Ajay Panyala <ajay.panyala@gmail.com>
+Date:   Sun May 7 13:01:38 2023 -0700
+
+    add nvhpc compiler support (#719)
+    
+    Add detection of the NVIDIA nvhpc compiler (`nvc`) in `configure`, and adjust some warning options in `config.mk`. Currently, no specific options for `nvc` have been added in the relevant configurations so it may not be usable without further tweaks.
+
+commit 0873c0f6ed03fea321d1631b3d1a385a306aa797
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sun May 7 14:03:19 2023 -0500
+
+    Consolidate INSERT_ macro sets via variadic macros. (#744)
+    
+    Details:
+    - Consolidated INSERT_GENTFUNC_* (and corresponding GENTPROT) macro sets
+      using variadic macros (__VA_ARGS__), which means we no longer need a
+      different INSERT_ macro for each possible number of arguments the
+      macro might take. This change seems reasonable given that variadic
+      macros are a standard C99 feature and widely supported. I took care
+      not to use variadic macros where 0 variadic arguments are expected
+      since that is a non-standard extension.
+    - Added pre-typecast parentheses to arithmetic expressions in printf()
+      statements in bli_thread_range_tlb.c.
+
+commit ef9d3e6675320a53e7cb477c16b01388e708b1da
+Author: h-vetinari <h.vetinari@gmx.com>
+Date:   Sun May 7 04:59:35 2023 +1100
+
+    Added missing #include <io.h> for Windows. (#747)
+    
+    Details:
+    - This commit fixes issue #746, in which the _access() function (called
+      from within blastest/f2c/open.c) is undeclared when compiling on
+      Windows with clang 16.
+
+commit 6fd9aabb03d172a792a7eeb106c7d965cf038421
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri May 5 14:22:52 2023 -0500
+
+    Fix bug in detecting Fortran compiler vendor (#745)
+    
+    `FC` was used instead of `found_fc`.
+
+commit 8215b02f99aa77ecc7d813508c247565115319d7
+Author: Lee Killough <15950023+leekillough@users.noreply.github.com>
+Date:   Wed Apr 12 12:59:27 2023 -0500
+
+    Apply #738 to make_defs.mk of RISC-V subconfigs. (#740)
+    
+    Details:
+    - PR #738 -- which moved -fPIC flag insertion responsibilities from
+      common.mk to the subconfigs' individual make_defs.mk files -- was
+      merged shortly before the introduction of new RISC-V subconfigs in
+      #693. This commit brings those RISC-V subconfigs up to date with the
+      new -fPIC conventions.
+
+commit 6b38c5ac07a2a27738674784e58aa699bf895447
+Author: angsch <17718454+angsch@users.noreply.github.com>
+Date:   Tue Apr 11 19:27:43 2023 +0200
+
+    Add RISC-V target (#693)
+    
+    Details:
+    - There are four RISC-V base configurations: 'rv32i', 'rv32iv', 'rv64i',
+      and 'rv64iv', namely the 32-bit and 64-bit implementations with and
+      without the 'V' vector extension. Additional extensions such as 'M'
+      (multiplication), 'A' (atomics), 'F' ('float' hardware support), 'D'
+      ('double' hardware support), and 'C' (compressed-length instructions),
+      are automatically used when available. If they are not available, then
+      software equivalents (e.g., softfloat and -latomic) are used.
+    - './configure auto' can be invoked on a RISC-V build platform, and will
+      automatically detect RISC-V CPU extensions through the RISC-V C API:
+      https://github.com/riscv-non-isa/riscv-c-api-doc/blob/master/riscv-c-api.md
+    - The assembly kernels assume the presence of the vector extension
+      RVV 1.0.
+    - It is possible to build 'rv[32,64]iv' for any value of VLEN.
+      However, if VLEN < 128, the targets will fall back to the generic
+      kernels and blocksizes.
+    - The vector microkernels are vector-length agnostic and work with
+      every VLEN >=128, but are expected to work best with smaller vector
+      lengths, i.e., VLEN <= 512.
+    - The assembly kernels cover column major storage (rs_c == 1).
+    - The blocksizes aim at being a good generic choice for out-of-order
+      cores. They are not tuned to a specific RISC-V HPC core.
+    - The vector kernels have been tested using vlen={128,256,512}.
+    - The single- and double-precision assembly code routines for 'sgemm'
+      and 'dgemm', or for 'cgemm' and 'zgemm', are combined in their RISC-V
+      vector assembly source code, and are differentiated only with macros.
+    - The XLEN=32 and XLEN=64 versions of the RISC-V assembly code are
+      identical, except that callee-saved registers are saved and restored
+      differently. There are RISC-V assembly code #include files for
+      handling the saving and restoring of callee-saved registers, and they
+      are future-proof if ever XLEN=128.
+    - Multiplications, such as computing array strides and offsets, are
+      performed in C, and later passed to the RISC-V assembly kernels. This
+      is so that the compiler can determine whether the 'M' (multiply)
+      extension is available and use multiplication instructions, or call
+      library helper functions instead.
+    - A new macro called bli_static_assert() has been added to perform
+      static assertions at compile-time, regardless of the C/C++ dialect of
+      the compiler. The original motivation of this was to ensure that
+      calling RISC-V assembly kernels would not silently truncate arguments
+      of type 'dim_t' or 'inc_t' (so-called "narrowing conversions").
+    - RISC-V CI tests have been added to Travis CI, using the
+      riscv-gnu-toolchain cross-compiler, and qemu simulator.
+    - Thanks to Lee Killough for collaborating on this commit.
+
+commit 593d01761910af6a9a16ee0ac097142732f73c29
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Sat Apr 8 16:44:16 2023 -0500
+
+    CREDITS file update.
+
+commit 259f68479671bbaf9c5986759aaa0004f9b05a24
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Apr 7 16:11:34 2023 -0500
+
+    CREDITS file update.
+    
+    Details:
+    - Added attributions associated with commits:
+      - 98d4678 9b1beec: @bartoldeman
+      - 2b05948 059f151: @ct-clmsn
+    - Reordered attribution for @decandia50.
+
+commit aea8e1d9243631635ca788d5e14f0f29328e637d
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Apr 3 12:17:51 2023 -0500
+
+    Optionally disable thread-local storage. (#735)
+    
+    Details:
+    - Implemented a new configure option, --disable-tls, which allows the
+      user to optionally disable the use of thread-local storage qualifiers
+      on static variables in BLIS. This option will rarely be needed, but
+      in some situations may allow BLIS to compile when TLS is unavailable.
+      Thanks to Nick Knight for suggesting this option.
+    - Unlike the --disable-system option, --disable-tls does not forcibly
+      disable threading. Instead, warnings of the possible consequences of
+      using threading with TLS disabled are added to:
+      - the output of './configure --help';
+      - the output of 'configure' when the --disable-tls option is parsed;
+      - the informational header output by the testsuite.
+      Thanks to Minh Quan Ho for suggesting these warnings.
+    - Modified frame/include/bli_lang_defs.h so that BLIS_THREAD_LOCAL is
+      defined to nothing when BLIS_ENABLE_TLS is not defined.
+    - Defined bli_info_get_enable_tls(), which returns whether the cpp macro
+      BLIS_ENABLE_TLS was defined.
+    - Edited --disable-system configure status output for clarity.
+    - Whitespace updates.
+
+commit 3f1432abe75cc306ef90a04381d7e0d8739fded8
+Author: Lee Killough <15950023+leekillough@users.noreply.github.com>
+Date:   Mon Apr 3 12:10:59 2023 -0500
+
+    Add output.testsuite to .gitignore (#736)
+    
+    Details:
+    - Added `output.testsuite` to .gitignore since it was previously not
+      being matched by `output.testsuite.*`.
+
+commit 38fc5237520a2f20914a9de8bb14d5999009b3fb
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Mar 30 17:30:07 2023 -0500
+
+    Added mm_algorithm pdf files (bp and pb).
+    
+    Details:
+    - Added PDF versions of the PowerPoint files added in 17cd260.
+
+commit 17cd260cb504b2f3997c32daec77f4c828fbb32b
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Mar 29 21:47:12 2023 -0500
+
+    Added mm_algorithm pptx files (bp and pb).
+    
+    Details:
+    - Added two PowerPoint files that contain slides depicting the classic
+      Goto algorithm for matrix multiplication as well as its sister
+      "panel-block" algorithm. These files reside in docs/diagrams.
+
+commit 9d778e0f7c94d8752dd578101e4fc6893a1f54ef
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Mar 29 17:36:49 2023 -0500
+
+    Move -fPIC insertion to subconfigs' make_defs.mk. (#738)
+    
+    * Move -fPIC insertion to subconfigs' make_defs.mk.
+    
+    Details:
+    - Previously, common.mk was appending -fPIC to the CPICFLAGS variables
+      set within the various subconfigurations' make_defs.mk files. This
+      seemed somewhat unintuitive, and so now the -fPIC flag is assigned to
+      the various subconfigs' CPICFLAGS variables in the respective
+      make_defs.mk files.
+    - This commit also changes the logic in common.mk so that instead of
+      appending, the variable is overwritten, but now *only* in the case
+      of Windows (since apparently -fPIC needs to be omitted there). Thanks
+      to Nick Knight for catching and reporting this weirdness.
+
+commit 04090df01175477394d1e73af2e5769751d47cd6
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Mar 27 14:13:10 2023 -0500
+
+    Fixed compile errors with `BLIS_DISABLE_BLAS_DEFS`. (#730)
+    
+    * Fixed compile errors with BLIS_DISABLE_BLAS_DEFS.
+    
+    Details:
+    - This commit fixes a compile-time error related to the type definition
+      (prototype) of dsdot_() when BLIS_DISABLE_BLAS_DEFS is defined by the
+      application (or the configuration), which is actually a symptom of a
+      larger design issue when disabling BLAS prototypes. The macro was
+      intended to allow applications to bring their own BLAS prototypes and
+      suppress the inclusion of duplicate (or possibly conflicting)
+      prototypes within blis.h. However, prototypes are still needed during
+      compilation even if they are ultimately omitted from blis.h. The
+      problem is that almost every source file in BLIS--including the BLAS
+      compatibility layer--only includes one header (blis.h), and if we
+      were to #include a new header in the BLAS source files (to isolate
+      only the BLAS prototypes), we would also have to make the build system
+      aware of the location of those headers. Thanks to Edward Smyth of AMD
+      for reporting this issue.
+    - The solution I settled upon was to remove all cpp guards from all BLAS
+      headers (by changing them to #if 1, for easy search-and-replace
+      anchoring in the future if we ever need to re-insert guards) and
+      modifying bli_blas.h so that the BLAS prototypes are #included if
+      either (a) BLIS_ENABLE_BLAS_DEFS is defined, or (b)
+      BLIS_ENABLE_BLAS_DEFS is *not* defined but BLIS_IS_BUILDING_LIBRARY
+      *is* defined. (Thanks to Devin Matthews for steering me away from an
+      inferior solution.)
+    - This commit also spins off the actual BLAS prototypes/definitions to
+      a separate file, bli_blas_defs.h.
+    - CREDITS file update.
+
+commit 5f841307f668f65b7ed5a479bd8374d2581208cf
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Mar 24 20:05:13 2023 -0500
+
+    Omit -fPIC if shared library build is disabled. (#732)
+    
+    Details:
+    - Updated common.mk so that when --disable-shared option is given to
+      configure:
+      1. The -fPIC compiler flag is omitted from the individual
+         configuration family members' CPICFLAGS variables (which are
+         initialized in each subconfig's make_defs.mk file); and
+      2. The BUILD_SYMFLAGS variable, which contains compiler flags needed
+         to control the symbol export behavior, is left blank.
+    - The net result of these changes is that flags specific to shared
+      library builds are only used when a shared library is actually
+      scheduled to be built. Thanks to Nick Knight for reporting this issue.
+    - CREDITS file update.
+
+commit 72c37eb80f964b7840377076e5009aec5b29d320 (origin/riscv)
+Author: Lee Killough <15950023+leekillough@users.noreply.github.com>
+Date:   Thu Mar 23 16:01:55 2023 -0500
+
+    Updated configure to pass all shellcheck checks. (#729)
+    
+    Details:
+    - Modified configure so that it passes all 'shellcheck' checks,
+      disabling ones which we violate but which are just stylistic, or are
+      special cases in our code.
+    - Miscellaneous other minor changes, such as rearranged redirections in
+      long sed/perl pipes to look more natural.
+    - Whitespace tweaks.
+
+commit 60f36347c16e6336215cd52b4e5f3c0f96e7c253
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Feb 22 20:37:30 2023 -0600
+
+    Fixed bugs in scal2v ref kernel when alpha == 1. (#728)
+    
+    Details:
+    - Fixed a typo bug in ref_kernels/1/bli_scal2v_ref.c where the
+      conditional that was supposed to be checking for cases when alpha is
+      equal to 1.0 (so that copyv could be used instead of scal2v) was
+      instead erroneously comparing alpha against 0.0.
+    - Fixed another bug in the same function whereby BLIS_NO_CONJUGATE was
+      erroneously being passed into copyv instead of the kernel's conjx
+      parameter. This second bug was inert, however, due to the first bug
+      since the "alpha == 0.0" case was already being handled, resulting in
+      the code block never executing.
+
+commit fab18dca46618799bb0b4f652820b33d36a5d4d4
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Feb 22 16:50:00 2023 -0600
+
+    Use 'void*' datatypes in kernel APIs. (#727)
+    
+    Details:
+    - Migrated all kernel APIs to use void* pointers instead of float*,
+      double*, scomplex*, and dcomplex* pointers. This allows us to define
+      many fewer kernel function pointer types, which also makes it much
+      easier to know which function pointer type to use at any given time.
+      (For example, whereas before there was ?axpyv_ker_ft, ?axpyv_ker_vft,
+      and axpyv_ker_vft, now there is just axpyv_ker_ft, which is equivalent
+      to what axpyv_ker_vft used to be.)
+    - Refactored how kernel function prototypes and kernel function types
+      are defined so as to reduce redundant code. Specifically, the
+      function signatures (excluding cntx_t* and, in the case of level-3
+      microkernels, auxinfo_t*) are defined in new headers named, for
+      example, bli_l1v_ker_params.h. Those signatures are reused via macro
+      instantiation when defining both kernel prototypes and kernel function
+      types. This will hopefully make it a little easier to update, add, and
+      manage kernel APIs going forward.
+    - Updated all reference kernels according to the aforementioned switch
+      to void* pointers.
+    - Updated all optimized kernels according to the aforementioned switch
+      to void* pointers. This sometimes required renaming variables,
+      inserting typecasting so that pointer arithmetic could continue to
+      function as intended, and related tweaks.
+    - Updated sandbox/gemmlike according to the aforementioned switch to
+      void* pointers.
+    - Renamed:
+      - frame/1/bli_l1v_ft_ker.h    -> frame/1/bli_l1v_ker_ft.h
+      - frame/1f/bli_l1f_ft_ker.h   -> frame/1f/bli_l1f_ker_ft.h
+      - frame/1m/bli_l1m_ft_ker.h   -> frame/1m/bli_l1m_ker_ft.h
+      - frame/3/bli_l1m_ft_ukr.h    -> frame/3/bli_l1m_ukr_ft.h
+      - frame/3/bli_l3_sup_ft_ker.h -> frame/3/bli_l3_sup_ker_ft.h
+      to better align with naming of neighboring files.
+    - Added the missing "void* params" argument to bli_?packm_struc_cxk() in
+      frame/1m/packm/bli_packm_struc_cxk.c. This argument is being passed
+      into the function from bli_packm_blk_var1(), but wasn't being "caught"
+      by the function definition itself. The function prototype for
+      bli_?packm_struc_cxk() also needed updating.
+    - Reordered the last two parameters in bli_?packm_struc_cxk().
+      (Previously, the "void* params" was passed in after the
+      "const cntx_t* cntx", although because of the above bug the params
+      argument wasn't actually present in the function definition.)
+
+commit 93c63d1f469c4650df082d0fa2f29c46db0e25f5
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Feb 20 11:14:23 2023 -0600
+
+    Use 'const' pointers in kernel APIs. (#722)
+    
+    Details:
+    - Qualified all input-only data pointers in the various kernel APIs with
+      the 'const' keyword while also removing 'restrict' from those kernel
+      APIs. (Use of 'restrict' was maintained in kernel implementations,
+      where appropriate.) This affected the function pointer types defined
+      for all of the kernels, their prototypes, and the reference and
+      optimized kernel definitions' signatures.
+    - Templatized the definitions of copys_mxn and xpbys_mxn static inline
+      functions.
+    - Minor whitespace and style changes (e.g. combining local variable
+      declaration and initialization into a single statement).
+    - Removed some unused kernel code left in 'old' directories.
+    - Thanks to Nisanth M P for helping to validate changes to the power10
+      microkernels.
+
+commit 4e18cd34f909c5045597f411340ede3a5e0bc5e1
+Author: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
+Date:   Sun Feb 19 04:18:41 2023 +0900
+
+    Restored ArmSVE general storage case. (#708)
+    
+    Details:
+    - Restored general storage case in armsve kernels.
+    - Reason for doing this: Though real `g`-storage is difficult to
+      speed up, the `g`-codepath here can provide good support for
+      transposed storage, i.e., at least good for `GEMM_UKR_SETUP_CT_AMBI`.
+    - From experience, this solution is only *a little* slower than in-reg
+      transpose. Plus, in-reg transpose is only possible for a fixed VL in
+      our case.
+
+commit 0ba6e9eafb1e667373d9dbc2aa045557921f33e2
+Author: Lee Killough <15950023+leekillough@users.noreply.github.com>
+Date:   Sat Feb 18 13:15:42 2023 -0600
+
+    Refined emacs handling of indentation. (#717)
+    
+    Details:
+    - This refines the emacs autoformatting to be better in line with
+      contribution guidelines.
+    - Removed a stray shebang in a .mk file which confuses emacs about the
+      file mode, which should be makefile-mode. (emacs also removes stray
+      whitespace at the ends of lines.)
+
+commit 059f15105b1643fe56084f883c22b3cadf368b39
+Author: ct-clmsn <ct.clmsn@gmail.com>
+Date:   Sat Feb 18 14:13:23 2023 -0500
+
+    Updated hpx namespace for make_count_shape. (#725)
+    
+    Details:
+    - The hpx namespace for *counting_shape changed. This PR updates the use
+      of counting_shape in blis to comply with the change in hpx.
+    - Co-authored-by: ctaylor <ctaylor@tactcomplabs.com>
+
+commit 0b421eff130b5c896edcc09e7358d18564d177e9
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Sat Feb 18 13:11:41 2023 -0600
+
+    Added an 'arm64' entry to `.travis.yml`. (#726)
+    
+    Details:
+    - Added a new 'arm64' entry to the .travis.yml file in an attempt to get
+      Travis CI to compile both NEON and SVE kernels, even if only NEON
+      kernels are exercised in the testing. With this new 'arm64' entry, the
+      'cortexa57' entry becomes redundant and may be removed. Thanks to
+      RuQing Xu for this suggestion.
+    - Previously, the macro BLIS_SIMD_MAX_SIZE was *not* being set in
+      bli_kernels_arm64.h, which meant that the default value of 64 was
+      being used. This caused a runtime consistency check to fail in
+      bli_gks.c (in Travis CI), one which requires that
+    
+        mr * nr * dt_size <= BLIS_STACK_BUF_MAX_SIZE
+    
+      for all datatype sizes dt_size, where BLIS_STACK_BUF_MAX_SIZE is
+      defined as
+    
+        BLIS_SIMD_MAX_NUM_REGISTERS * BLIS_SIMD_MAX_SIZE * 2
+    
+      This commit increases BLIS_SIMD_MAX_SIZE to 128 for the 'arm64'
+      configuration, thus overriding the default and (hopefully) avoiding
+      the aforementioned consistency check failures.
+    - Appended '|| cat ./output.testsuite' to all 'make' commands in
+      travis/do_testsuite.sh. Thanks to RuQing Xu for this suggestion.
+    - Whitespace changes.
+
+commit b1d3fc7e5b0927086e336a23f16ea59aa3611ccb
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Feb 10 15:34:47 2023 -0600
+
+    Redirect grep stderr to /dev/null. (#723)
+    
+    Details:
+    - In common.mk, added a redirection of stderr to /dev/null for the grep
+      command being used to gather a list of header files #included from
+      bli_cntx_ref.c. The redirection is desirable because as of grep 3.8,
+      regular expressions with "stray" backslashes trigger warnings [1].
+      But removing the backslash seems to break the BLIS build system when
+      using pre-3.8 versions of grep, so this seems to be the easiest way to
+      satisfy the BLIS build system for both pre- and post-3.8 grep
+      environments.
+    
+      [1] https://lists.gnu.org/archive/html/info-gnu/2022-09/msg00001.html
+
+commit e3d352f1fcc93e6a46fde1aa4a7f0a18fb27bd42
+Author: Nisanth M P <nisanthmp.01@gmail.com>
+Date:   Wed Feb 8 06:11:41 2023 +0530
+
+    Added runtime selection of 'power' config family. (#718)
+    
+    Details:
+    - Created a 'power' umbrella configuration family, which, when targeted
+      at configure-time, will build both 'power9' and 'power10' subconfigs.
+      (With this feature, a BLIS shared library could be compiled on a
+      power9 system and run on power10 and vice-versa. Unoptimised code
+      will execute if it is linked and run on any other generic system.)
+    - This new configuration family will only work with gcc, since that is
+      the only compiler supported by both power9 and power10 subconfigs in
+      BLIS.
+    - Documented power9 and power10 as supported microarchitectures in the
+      docs/HardwareSupport.md document.
+
+commit e730c685d09336b3bd09e86c94330c4eba967f3e
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Feb 6 15:31:54 2023 -0600
+
+    Define `BLIS_VERSION_STRING` in `blis.h`. (#720)
+    
+    Details:
+    - Previously, the version string was communicated from configure to
+      config.mk (via the config.mk.in template), where it was included via
+      the top-level Makefile, where it was then used to define the
+      preprocessor macro BLIS_VERSION_STRING via a command line argument to
+      the compiler (via -D). This macro is then used within bli_info.c to
+      initialize a static string which can then be queried via the
+      bli_info_get_version_str() function. However, there are some
+      applications that may find utility in being able to access the version
+      string by inspecting the monolithic (flattened) blis.h header file
+      that is created at compile time and installed alongside the library.
+      This commit moves the definition of BLIS_VERSION_STRING into
+      bli_config.h (via the bli_config.h.in template) so that it is
+      embedded in blis.h. The version string is now available in three
+      places:
+      - the static/shared library, which is installed in the 'lib'
+        subdirectory of the install prefix (query-able via the
+        bli_info_get_version_str() function);
+      - the config.mk makefile fragment, which is installed in the 'share'
+        subdirectory of the install prefix (in the VERSION variable);
+      - the blis.h header file, which is installed in the 'include'
+        subdirectory of the install prefix (via the BLIS_VERSION_STRING
+        macro constant).
+      Thanks to Mohsen Aznaveh and Tim Davis for providing the idea for this
+      change.
+    - CREDITS file update.
+
+commit dc5d00a6ce0350cd82859d8c24f23d98f205d8db
+Author: Lee Killough <15950023+leekillough@users.noreply.github.com>
+Date:   Fri Jan 27 17:36:47 2023 -0600
+
+    Typecast printf() args to avoid compiler warnings. (#716)
+    
+    Details:
+    - In bli_thread_range_tlb.c, typecast integer arguments passed to
+      printf() -- which are typically disabled unless debugging -- to type
+      "long" to guarantee a match to the "%ld" format specifiers used in
+      those calls. This avoids spurious warnings with certain compilers in
+      certain toolchain environments, such as 32-bit RISC-V (rv32iv).
+
+commit ecbcf4008815035c695822fcaf106477debff89a
+Author: Lee Killough <15950023+leekillough@users.noreply.github.com>
+Date:   Wed Jan 18 20:35:50 2023 -0600
+
+    Use here-document for 'configure --help' output. (#714)
+    
+    Details:
+    - Changed the configure script function that outputs "--help" text to do
+      so via so-called "here-document" syntax for improved readability and
+      maintainability. The change eliminates hundreds of echo statements and
+      makes it easier to change existing configure options' help text, along
+      with other benefits such as eliminating the need to escape double-
+      quote characters (").
+
+commit c334ec278f5e2a101625629b2e13bbf1b38dede5
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Jan 18 13:10:19 2023 -0600
+
+    Merge tlb- and slab/rr-specific gemm macrokernels. (#711)
+    
+    Details:
+    - Merged the tlb-specific gemm macrokernel (_var2b) with the slab/rr-
+      specific one (var2) so that a single function can be compiled with
+      either tlb or slab/rr support, depending on the values of the
+      BLIS_ENABLE_JRIR_TLB, _SLAB, and _RR macros. This is done by incorporating
+      information from both approaches: the start/end/inc for the JR and IR
+      loops from slab or rr partitioning; and the number of assigned
+      microtiles, plus the starting IR dimension offset for all iterations
+      after the first (ir_next). With these changes, slab, rr, and tlb can
+      all be parameterized by initializing a similar set of variables prior
+      to the jr loop.
+    - Removed the wrap-around logic that sets the "b_next" field of the
+      auxinfo_t struct, which executes during the last IR iteration of the
+      last JR iteration. The potential benefit of this code is so minor
+      (and hinges on the microkernel making use of the b_next field) that
+      it's arguably not worth including. The code also does the wrong
+      thing for some threads whenever JR_NT > 1, since only thread 0 (in the
+      JR group) would even compute with the first micropanel of B.
+    - Re-expressed the definition of bli_is_last_iter_slrr so that slab and
+      tlb use the same code rather than rr and tlb.
+    - Adjusted the initialization of the gemm control tree accordingly.
+
+commit 5793a77937aee9847a5692c8e44b36a6380800a1
+Author: HarshDave12 <122850830+HarshDave12@users.noreply.github.com>
+Date:   Tue Jan 17 21:55:02 2023 +0530
+
+    Fixed mis-mapped instruction for VEXTRACTF64X2. (#713)
+    
+    Details:
+    - This commit fixes a typo in the macro definition for the extended
+      inline assembly macro VEXTRACTF64X2 in bli_x86_asm_macros.h. The macro
+      was previously defined (incorrectly) in terms of the vextractf64x4
+      instruction rather than vextractf64x2.
+    - CREDITS file update.
+
+commit 16d2e9ea9ca0853197b416eba701b840a8587bca
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Jan 13 20:03:01 2023 -0600
+
+    Defined lt, lte, gt, gte + misc. other updates. (#712)
+    
+    Details:
+    - Changed invertsc operation to be a non-destructive operation; that is,
+      it now takes separate input and output operands. This change applies
+      to both the object and typed APIs.
+    - Defined an alternative square root operation, sqrtrsc, which, when
+      operating on complex scalars, assumes the imaginary part of the input
+      to be zero.
+    - Changed the semantics of addm, subm, copym, axpym, scal2m, and xpbym
+      so that when the source matrix has an implicit unit diagonal, the
+      operation leaves the diagonal of the destination matrix untouched.
+      Previously, the operations would interpret an implicit unit diagonal
+      on the source matrix as a request to manifest the unit diagonal
+      *explicitly* on output (either as something to copy in the case of
+      copym, or something to compute with in the cases of addm, subm, axpym,
+      scal2m, and xpbym). It turns out that this behavior was too cute by
+      half and could cause unintended headaches for practical use cases.
+      (This change in behavior also required small modifications to the trmv
+      and trsv testsuite modules so that they would properly test matrices
+      with unit diagonals.)
+    - Added missing dependencies for copym to gemv, ger, hemv, trmv, and
+      trsv testsuite modules.
+    - Implemented level-0-like ltsc, ltesc, gtsc, gtesc operations in
+      frame/util, which use lt, lte, gt, and gte level-0 scalar macros.
+    - Trivial variable rename in bli_part.c to harmonize with other
+      variable naming conventions.
+
+commit 9a366b14fe52c469f4664ef5dd93d85be8d97baa
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Jan 12 13:07:22 2023 -0600
+
+    Implement cntx_t pointer caching in gks. (#709)
+    
+    Details:
+    - Refactored the gks cntx_t query functions so that: (1) there is a
+      clearer pattern of similarity between functions that query a native
+      context and those that query its induced (1m) counterpart; and (2)
+      queried cntx_t pointers (for both native and induced cntx_t pointers)
+      are cached (by default), or deep-queried upon each invocation,
+      depending on whether cpp macro BLIS_ENABLE_GKS_CACHING is defined.
+    - Refactored query-related functions in bli_arch.c to cache the queried
+      arch_t value (by default), or deep-query the arch_t value upon each
+      invocation, depending on whether cpp macro BLIS_ENABLE_GKS_CACHING is
+      defined.
+    - Tweaked the behavior of bli_gks_query_ind_cntx_impl() (formerly named
+      bli_gks_query_ind_cntx()) so that the induced method cntx_t struct is
+      repopulated each time the function is called. (It is still only
+      allocated once on first call.) This was mostly done in preparation for
+      some future in which the arch_t value might change at runtime. In such
+      a scenario, the induced method context would need to be recalculated
+      any time the native context changes.
+    - Added preprocessor logic to bli_config_macro_defs.h to handle enabling
+      or disabling of cntx_t pointer caching (via BLIS_ENABLE_GKS_CACHING).
+    - For now, cntx_t pointer caching is enabled by default and does not
+      correspond to any official configure option. Disabling can be done
+      by inserting a #define for BLIS_DISABLE_GKS_CACHING into the
+      appropriate bli_family_*.h header file within the configuration of
+      interest.
+    - Thanks to Harihara Sudhan S (AMD) for suggesting that cntx_t pointers
+      (and not just arch_t values) be cached.
+    - Comment updates.
+
+commit b895ec9f1f66fb93972589c06bff171337153a31
+Author: Nisanth M P <nisanthmp.01@gmail.com>
+Date:   Wed Jan 11 09:02:32 2023 +0530
+
+    Fixing type-mismatch errors in power10 sandbox (#701)
+    
+    Details:
+    - This commit fixes a mismatch between the function type signature of
+      bli_gemm_ex() required by BLIS and the version of the function defined
+      within the power10 sandbox. It also performs typecasting upon calling
+      bli_gemm_front() to attain type consistency with the type signature
+      defined by BLIS for bli_gemm_front().
+
+commit 38d88d5c131253066cad4f98eea06fa9299cae3b
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Tue Jan 10 21:24:58 2023 -0600
+
+    Define new global scalar (obj_t) constants. (#703)
+    
+    Details:
+    - This commit defines the following new global scalar constants:
+      - BLIS_ONE_I: This constant encodes the imaginary unit.
+      - BLIS_MINUS_ONE_I: This constant encodes the negative imaginary unit.
+      - BLIS_NAN: This constant encodes a not-a-number value. Both real and
+        imaginary parts are set to NaN for complex datatypes.
+
+commit cdb22b8ffa5b31a0c16ac1a7bcecefeb5216f669
+Author: Nisanth M P <nisanthmp.01@gmail.com>
+Date:   Wed Jan 11 08:50:57 2023 +0530
+
+    Disable power10 kernels other than sgemm, dgemm. (#705)
+    
+    Details:
+    - There is a power10 sandbox which uses microkernels for datatypes other
+      than float and double (or scomplex/dcomplex). In a regular power10-
+      configured build (that is, with the sandbox disabled), there were
+      compile errors for some of these other non-sgemm/non-dgemm
+      microkernels. This commit protects those kernels with a new cpp macro
+      guard (which is defined in sandbox/power10/bli_sandbox.h) that
+      prevents that kernel code from being compiled for normal, non-sandbox
+      power10 builds.
+
+commit d220f9c436c0dae409974724d42ab6c52f12a726
+Author: Nisanth M P <nisanthmp.01@gmail.com>
+Date:   Wed Jan 11 08:43:03 2023 +0530
+
+    Fix k = 0 edge case in power10 microkernels (#706)
+    
+    Details:
+    - When power10 sgemm and dgemm microkernels are called with k = 0, they
+      become caught in infinite loops and segfault. This is fixed now via an
+      early exit in the case of k = 0.
+
+commit 2e1ba9d13c23a06a7b6f8bd326af428f7ea68c31
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Jan 10 21:05:54 2023 -0600
+
+    Tile-level partitioning in jr/ir loops (ex-trsm). (#695)
+    
+    Details:
+    - Reimplemented parallelization of the JR loop in gemmt (which is
+      recycled for herk, her2k, syrk, and syr2k). Previously, the
+      rectangular region of the current MC x NC panel of C would be
+      parallelized separately from the diagonal region of that same
+      submatrix, with the rectangular portion being assigned to threads via
+      slab or round-robin (rr) partitioning (as determined at configure-
+      time) and the diagonal region being assigned via round-robin. This
+      approach did not work well when extracting lots of parallelism from
+      the JR loop and was often suboptimal even for smaller degrees of
+      parallelism. This commit implements tile-level load balancing (tlb) in
+      which the IR loop is effectively subjugated in service of more
+      equitably dividing work in the JR loop. This approach is especially
+      potent for certain situations where the diagonal region of the MC x NR
+      panel of C is significant relative to the entire region. However, it
+      also seems to benefit many problem sizes of other level-3 operations
+      (excluding trsm, which has an inherent algorithmic dependency in the
+      IR loop that prevents the application of tlb). For now, tlb is
+      implemented as _var2b.c macrokernels for gemm (which forms the basis
+      for gemm, hemm, and symm), gemmt (which forms the basis of herk,
+      her2k, syrk, and syr2k), and trmm (which forms the basis of trmm and
+      trmm3). Which function pointers (_var2() or _var2b()) are embedded in
+      the control tree will depend on whether the BLIS_ENABLE_JRIR_TLB cpp
+      macro is defined, which is controlled by the value passed to the
+      existing --thread-part-jrir=METHOD (or -r METHOD) configure option.
+      This script adds 'tlb' as a valid option alongside the previously
+      supported values of 'slab' and 'rr'. ('slab' is still the default.)
+      Thanks to Leick Robinson for abstractly inspiring this work, and to
+      Minh Quan Ho for inquiring (in PR #562, and before that in Issue #437)
+      about the possibility of improved load balance in macrokernel loops,
+      and even prototyping what it might look like, long before I fully
+      understood the problem.
+    - In bli_thread_range_weighted_sub(), tweaked the way we compute the
+      area of the current MC x NC trapezoidal panel of C by better taking
+      into account the microtile structure along the diagonal. Previously,
+      it was an underestimate, as it assumed MR = NR = 1 (that is, it
+      assumed that the microtile column of C that overlapped with microtiles
+      exactly coincided with the diagonal). Now, we only assume MR = NR.
+      This is still a slight underestimate when MR != NR, so the additional
+      area is scaled by 1.5 in a hackish attempt to compensate for this, as
+      well as other additional effects that are difficult to model (such as
+      the increased cost of writing to temporary tiles before finally
+      updating C). The net effect of this better estimation of the
+      trapezoidal area should be (on average) slightly larger regions
+      assigned to threads that have little or no overlap with the diagonal
+      region (and correspondingly slightly smaller regions in the diagonal
+      region), which we expect will lead to slightly better load balancing
+      in most situations.
+    - Spun off the contents of bli_thread.[ch] that relate to computing
+      thread ranges into one of three source/header file pairs:
+      - bli_thread_range.[ch], which define functions that are not specific
+        to the jr/ir loops;
+      - bli_thread_range_slab_rr.[ch], which define functions that implement
+        slab or round-robin partitioning for the jr/ir loops;
+      - bli_thread_range_tlb.[ch], which define functions that implement
+        tlb for the jr/ir loops.
+    - Fixed the computation of a_next in the last iteration of the IR loop
+      in bli_gemmt_l_ker_var2(). Previously, it always "wrapped" back around
+      to the first micropanel of the current MC x KC packed block of A.
+      However, this is almost never actually the micropanel that is used
+      next. A new macro, bli_gemmt_l_wrap_a_upanel(), computes a_next
+      correctly, with a similarly named bli_gemmt_u_wrap_a_upanel() for use
+      in the upper-stored case (which *does* actually always choose the
+      first micropanel of A as its a_next at the end of the IR loop).
+    - Removed adjustments for a_next/b_next (a2/b2) for the diagonal-
+      intersecting case of gemmt_l_ker_var2() and the above-diagonal case
+      of gemmt_u_ker_var2() since these cases will only coincide with the
+      last iteration of the IR loop in very small problems.
+    - Defined bli_is_last_iter_l() and bli_is_last_iter_u(), the latter of
+      which explicitly considers whether the current microtile is the last
+      tile that intersects the diagonal. (The former does the same, but the
+      computation coincides with the original bli_is_last_iter().) These
+      functions are now used in gemmt to test when a_next (or a2) should
+      "wrap" (as discussed above). Also defined bli_is_last_iter_tlb_l()
+      and bli_is_last_iter_tlb_u(), which are similar to the aforementioned
+      functions but are used when employing tlb in gemmt.
+    - Redefined macros in bli_packm_thrinfo.h, which test whether an
+      iteration of work is assigned to a thread, as static inline functions
+      in bli_param_macro_defs.h (and then deleted bli_packm_thrinfo.h).
+      In the process of redefining these macros, I also renamed them from
+      bli_packm_my_iter_rr/sl() to bli_is_my_iter_rr/sl().
+    - Renamed
+        bli_thread_range_jrir_rr() -> bli_thread_range_rr()
+        bli_thread_range_jrir_sl() -> bli_thread_range_sl()
+        bli_thread_range_jrir()    -> bli_thread_range_slrr()
+    - Renamed
+        bli_is_last_iter() -> bli_is_last_iter_slrr()
+    - Defined
+        bli_info_get_thread_jrir_tlb()
+      and renamed:
+      - bli_info_get_thread_part_jrir_slab() ->
+        bli_info_get_thread_jrir_slab()
+      - bli_info_get_thread_part_jrir_rr() ->
+        bli_info_get_thread_jrir_rr()
+    - Modified bli_rntm_set_ways_for_op() to redirect IR loop parallelism
+      into the JR loop when tlb is enabled for non-trsm level-3 operations.
+    - Added a sanity check to prevent bli_prune_unref_mparts() from being
+      used on packed objects. This prohibition is necessary because the
+      current implementation does not take into account the atomicity of
+      packed micropanel widths relative to the diagonal of structured
+      matrices. That is, the function prunes greedily without regard to
+      whether doing so would prune off part of a micropanel *which has
+      already been packed* and assigned to a thread for inclusion in the
+      computation.
+    - Further restricted early returns in bli_prune_unref_mparts() to
+      situations where the primary matrix is not only of general structure
+      but also dense (in terms of its uplo_t value). The addition of the
+      matrix's dense-ness to the conditional is required because gemmt is
+      somewhat unusual in that its C matrix has general structure but is
+      marked as lower- or upper-stored via its uplo_t. By only checking
+      for general structure, attempts to prune gemmt C matrices would
+      incorrectly result in early returns, even though that operation
+      effectively treats the matrix as symmetric (and stored in only one
+      triangle).
+    - Fixed a latent bug in bli_thread_range_rr() wherein incorrect ranges
+      were computed when 1 < bf. Thankfully, this bug was not yet
+      manifesting since all current invocations used bf == 1.
+    - Fixed a latent bug in some unexercised code in bli_?gemmt_l_ker_var2()
+      that would perform incorrect pruning of unreferenced regions above
+      where the diagonal of a lower-stored matrix intersects the right edge.
+      Thankfully, the bug was not harming anything since those unreferenced
+      regions were being pruned prior to the macrokernel.
+    - Rewrote slab/rr-based gemmt macrokernels so that they no longer carved
+      C into rectangular and diagonal regions prior to parallelizing each
+      separately. The new macrokernels use a unified loop structure where
+      quadratic (slab) partitioning is used.
+    - Updated all level-3 macrokernels to have a more uniform coding style,
+      such as wrt combining variable declarations with initializations as
+      well as the use of const.
+    - Updated bls_l3_packm_var[123].c to use bli_thrinfo_n_way() and
+      bli_thrinfo_work_id() instead of bli_thrinfo_num_threads() and
+      bli_thrinfo_thread_id(), respectively. This change probably should
+      have been included in aeb5f0c.
+    - Removed old prototypes in bli_gemmt_var.h and bli_trmm_var.h that
+      corresponded to functions that were removed in aeb5f0c.
+    - Other very minor cleanups.
+    - Comment updates.
+
+commit b6735ca26b9d459d9253795dc5841ae8de9e84c9
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri Jan 6 14:10:01 2023 -0600
+
+    Refactor structure awareness in packm_blk_var1.c. (#707)
+    
+    Details:
+    - Factored some of the structure awareness out of the loop in
+      bli_packm_blk_var1(). So instead of having a single loop with
+      conditionals in the body to handle various kinds of structure (and
+      stored/unstored submatrix placement), we now have a conditional branch
+      to handle various structure/storage scenarios with a loop in each
+      section. This change was originally motivated to choose slab or round-
+      robin partitioning (in the context of triangular matrices) based on
+      the structure of the entire block (or panel) being packed rather than
+      each micropanel individually. Previously, the code would attempt to
+      limit rr to the portion of the block that intersects the diagonal and
+      use slab for the remainder. However, that approach was not well-thought
+      out and in many situations this would lead to inferior load balancing
+      when compared to using round-robin for the entire block (or panel).
+      This commit has the added benefit of incurring less overhead during
+      the packing process now that each of the new loops is simpler.
+
+commit f956b79922da412791e4c8b8b846b3aafc0a5ee0
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Sat Dec 31 20:18:08 2022 -0600
+
+    Switch to l3 sup decorator in gemmlike sandbox. (#704)
+    
+    Details:
+    - Modified the gemmlike sandbox to call bli_l3_sup_thread_decorator()
+      rather than a local analogue of that code. This reduces redundant
+      logic and makes it easier for the sandbox to inherit future
+      improvements to the framework's threading code.
+    - Moved addon/gemmd to addon/old/gemmd. This code has fallen out of date
+      and is taking too much effort to maintain. We will very likely
+      reimplement it completely once future changes are made to the
+      framework proper.
+
+commit 538150c5845ad903773ca797c740048174116aa4
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Sun Dec 25 22:28:09 2022 -0600
+
+    Applied race condition fix to sup thread decorator.
+    
+    Details:
+    - Applied the race condition bugfix in commit 7d23dc2 to the
+      corresponding sup code in bli_l3_sup_decor.c. Note that in the case
+      of sup, the race condition would have only manifested when optional
+      packing was enabled at runtime (typically via setting BLIS_PACK_A
+      and/or BLIS_PACK_B environment variables).
+    - Both the fix in this commit and the fix in 7d23dc2 address bugs
+      that were introduced when the thrinfo_t trees/communicators were
+      restructured in the October omnibus commit (aeb5f0c).
+
+commit 7d23dc2a064a371dc9883e2c2c7236a70912428c
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sun Dec 25 19:09:14 2022 -0600
+
+    Fix a race condition which manifested as incorrect results (rarely). (#702)
+    
+    The problem occurs when there are at least two teams of threads packing different parts of a matrix, and where each team has at least two threads; call them team A and team B. The problematic sequence is:
+    
+    1. The chief of team A checks out a block B and broadcasts the pointer to its teammates.
+    2. Team A completely packs their data and performs a barrier amongst themselves.
+    3. Team A commences computing with the packed data.
+    4. The chief of team A finishes computing before its teammates, then calls bli_thrinfo_free on its thrinfo_t struct (which contains the mem_t object referencing the buffer B). This causes buffer B to be checked back in to the pba.
+    5. The chief of team B checks out the *same* block B that was just checked back in and broadcasts the pointer to its teammates.
+    6. DATA RACE: now the remaining threads of team A are reading *while* team B is writing to the same buffer B. If team B writes new data before team A is done computing then an incorrect result is generated.
+    
+    The solution is to place a global barrier before the call to bli_thrinfo_free at the end of the computation.
+    
+    Co-authored-by: Field G. Van Zee <field@cs.utexas.edu>
+
+commit 3accacf57d11e9b109339754f91bf22329b6cb6a
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Dec 16 10:26:33 2022 -0600
+
+    Skip 1m optimization when forcing hemm_l/symm_l. (#697)
+    
+    Details:
+    - Fixed a bug in right-sided hemm when:
+      - using the 1m method,
+      - #defining BLIS_DISABLE_HEMM_RIGHT in the active subconfiguration,
+        and
+      - the storage of C matches the gemm microkernel IO preference PRIOR to
+        the right-sidedness being detected and recast in terms of the left-
+        side code path.
+      It turns out that bli_gemm_ind_recast_1m_params() was applying its
+      optimization (recasting a complex-domain macrokernel calling a 1m
+      virtual microkernel to a real-domain macrokernel calling the real-
+      domain microkernel) in situations in which it should not have. The
+      optimization was silently assuming that the storage of C always
+      matched that of the microkernel preference, since the front-end (in
+      this case, bli_hemm_front()) would have already had a chance to
+      transpose the operation to bring the two into agreement. However, by
+      disabling right-sided hemm, we deprive BLIS of that flexibility (as a
+      transposed left-sided case would necessarily have to become a right-
+      sided case), and thus the assumption was no longer holding in all
+      cases. Thanks to Nisanth M P for reporting this bug in Issue #621.
+    - The aforementioned bug, and its bugfix, also apply to symm when
+      BLIS_DISABLE_SYMM_RIGHT is defined.
+    - Comment updates.
+    - CREDITS file update.
+
+commit 4833ba224eba54df3f349bcb7e188bcc53442449
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Dec 12 20:26:02 2022 -0600
+
+    Fixed perf of mt sup with packing, and mt gemmlike. (#696)
+    
+    Details:
+    - Brought the gemmsup code path up to date relative to the latest
+      thrinfo_t semantics introduced in the October Omnibus commit
+      (aeb5f0c). This was done by passing the prenode (instead of the
+      current node) into the packm variant within bli_l3_sup_packm.c as well
+      as creating the prenodes and attaching them to the thrinfo_t tree in
+      bli_l3_sup_thrinfo_create(). These changes erase the performance
+      degradation introduced in the omnibus when running multithreaded sup
+      with optional packing enabled. Special thanks to Devin Matthews for
+      sussing out this fix in short order.
+    - Fixed the gemmlike sandbox in a manner similar to that of sup with
+      packing, described above. This also involved passing the prenode into
+      the local gemmlike packm variant. (Recall that gemmlike recycles the
+      use of bli_l3_sup_thrinfo_create(), so it automatically inherits that
+      part of the sup fix described above.)
+    - Updated bls_l3_packm_var[123].c to use bli_thrinfo_n_way() and
+      bli_thrinfo_work_id() instead of bli_thrinfo_num_threads() and
+      bli_thrinfo_thread_id(), respectively.
+
+commit db10dd8e11a12d85017f84455558a82c0093b1da
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Nov 29 19:10:31 2022 -0600
+
+    Fixed _gemm_small() prototype; disabled gemm_small.
+    
+    Details:
+    - Fixed a mismatch between the prototype for bli_gemm_small() in
+      bli_gemm_front.h and the actual definition of bli_gemm_small() in
+      kernels/zen/3/bli_gemm_small.c. The former was erroneously declaring
+      the cntl_t* argument as 'const'. Thanks to Jeff Diamond for reporting
+      this issue.
+    - Commented out BLIS_ENABLE_SMALL_MATRIX, BLIS_ENABLE_SMALL_MATRIX_TRSM
+      macro definitions in config/zen3/bli_family_zen3.h. AMD's small matrix
+      implementation should probably remain disabled in vanilla BLIS, at
+      least for now.
+
+commit f0337b784d164ae505ca0e11277a1155680500d1
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Sun Nov 13 21:36:47 2022 -0600
+
+    Trivial whitespace/comment tweaks.
+    
+    Details:
+    - Trivial whitespace and comment changes, most of which ideally would
+      have been part of the previous commit pertaining to HPX (2b05948).
+
+commit 2b05948ad2c9785bc53f376d53a7141cbc917447
+Author: ct-clmsn <ct.clmsn@gmail.com>
+Date:   Sun Nov 13 17:40:22 2022 -0500
+
+    blis support for hpx (#682)
+    
+    Implement threading backend via HPX.
+    
+    HPX is an asynchronous many task runtime system used in high performance computing applications. The runtime implements the ISO C++ parallelism specification and provides a user-space thread implementation.
+    
+    This PR provides BLIS a thread backend implementation using HPX and resolves feature request #681. The configuration script, makefiles, and testsuite have been updated to support an HPX build option. The addition of HPX support provides other developers an exemplar for integrating other C++ threading backends into BLIS.
+    
+    Co-authored-by: ctaylor <ctaylor@pennywise.cm.cluster>
+    Co-authored-by: Devin Matthews <damatthews@smu.edu>
+
+commit e1ea25da43508925e33d4e57e420cfc0a9de793f
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Nov 11 12:07:51 2022 -0600
+
+    Fixed subtle barrier_fpa bug in bli_thrcomm.c. (#690)
+    
+    Details:
+    - In bli_thrcomm.c, correctly initialize the BLIS_OPENMP element of the
+      barrier function pointer array (barrier_fpa) to NULL when
+      BLIS_ENABLE_OPENMP is *not* defined. Similarly, initialize the
+      BLIS_POSIX element of barrier_fpa to NULL when BLIS_ENABLE_PTHREADS is
+      not enabled. This bug was introduced in a1a5a9b and was likely the
+      result of an incomplete edit. The effects of the bug would have
+      likely manifested when querying a thrcomm_t that was initialized with
+      a timpl_t value corresponding to a threading implementation that was
+      omitted from the -t option at configure-time.
+
+commit dc6e5f3f5770074ba38554541b8b64711a68c084
+Author: leekillough <15950023+leekillough@users.noreply.github.com>
+Date:   Thu Nov 3 18:33:08 2022 -0500
+
+    Enhance emacs formatting of C files to remove trailing whitespace and ensure a newline at the end of file
+
+commit 713d078075a4a563a43d83fd0880ab5091c2e4a4
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Nov 3 20:00:11 2022 -0500
+
+    Delete mpi_test garbage. (#689)
+    
+    Details:
+    - tlrmchlsmth: "What even is this? No comments, no commit message, not
+      used by anything. Trash."
+
+commit 8d813f7f12732d52c95570ae884d5defbfd19234
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Nov 3 19:10:47 2022 -0500
+
+    Some decluttering of the top-level directory.
+    
+    Details:
+    - Relocated 'mpi_test' directory to test/mpi_test.
+    - Relocated 'so_version' and 'version' files from top-level directory to
+      'build' directory.
+    - Updated build/bump-version.sh script to accommodate relocation of
+      'version' file to 'build' directory.
+    - Updated configure script to accommodate relocation of 'so_version'
+      file to 'build' directory.
+    - Updated INSTALL file to replace pointers to blis-devel mailing list
+      with a pointer to docs/Discord.md.
+    - Updated RELEASING file to contain a reminder to consider whether the
+      so_version file should be updated prior to the release.
+
+commit 6774bf08c92fc6983706a91bbb93b960e8eef285
+Author: Lee Killough <15950023+leekillough@users.noreply.github.com>
+Date:   Thu Nov 3 15:20:47 2022 -0500
+
+    Fix typo in configure --help text. (#686)
+    
+    Details:
+    - Fixed a misspelling in the --help description for the --int-size (-i)
+      configure option.
+
+commit 872898d817f35702e7678ff7f3eeff0f12e641f5
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Nov 2 21:53:22 2022 -0500
+
+    Fixed trmm[3]/trsm performance bug in cf7d616. (#685)
+    
+    Details:
+    - Fixed a performance bug in the packing of micropanels that intersect
+      the diagonal of triangular matrices (i.e., those found in trmm, trmm3,
+      and trsm). This bug was introduced in cf7d616 and stemmed from an
+      ill-formed boolean conditional expression in bli_packm_blk_var1().
+      This conditional would choose when to use round-robin parallel work
+      allocation, but checked for the triangularity of the submatrix being
+      packed while failing also to check for whether the current micropanel
+      actually intersected the diagonal. The net result of this bug was that
+      *all* micropanels of a triangular matrix, no matter where the upanels
+      resided within the matrix, were assigned to threads via a round-robin
+      policy. This affected some microarchitectures and threading
+      configurations much worse than others, but it seems that overall the
+      effect was universally negative, likely because of the reduced spatial
+      locality during the packing with round-robin. Thanks to Leick Robinson
+      for his tireless efforts in helping track down this issue.
+
+commit edcc2f9940449f7d9cefcfc02159d27b013e7995
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Nov 2 19:04:49 2022 -0500
+
+    Support --nosup, --sup configure options. (#684)
+    
+    Details:
+    - Added --nosup and --sup as alternative ways of requesting that sup be
+      disabled or enabled. These are analogous to --disable-sup-handling and
+      --enable-sup-handling, respectively. (I got tired of typing out
+      --disable-sup-handling and needed a shorthand notation.)
+    - Tweaked message output by configure when sup is enabled/disabled for
+      clarity and specificity.
+    - Whitespace changes.
+
+commit 5eea6ad9eb25f37685d1ae4ae08c73cd1daca297
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Nov 2 17:07:54 2022 -0500
+
+    Add mention of Wilkinson Prize to README.md. (#683)
+    
+    Details:
+    - Added blurbs and links to Wilkinson Prize to README.md.
+    - Added mention of both Best Paper and Wilkinson Prizes to the top of
+      README.md.
+    - Other minor tweaks.
+
+commit 29f79f030e939969d4f3876c4fdaac7b0c5daa63
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Oct 31 18:57:45 2022 -0500
+
+    Fixed performance bug caused by redundant packing. (#680)
+    
+    Details:
+    - Fixed a performance bug whereby multiple threads were redundantly
+      packing the same (rather than separate) micropanels. This bug was
+      caused by different parts of the code using the num_threads/thread_id
+      field of the thrinfo_t vs. the n_way/work_id fields. The fix was to
+      standardize on the latter and provide a "fake" thrinfo_t sub-prenode
+      in the thrinfo tree which consists of single-member thread teams. The
+      single team with multiple threads node is still required since it and
+      only it can be used to perform barriers and broadcasts (e.g. of the
+      packed buffer pointer).
+
+commit aeb5f0cc19665456e990a7ffccdb09da2e3f504b
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Thu Oct 27 12:39:11 2022 -0500
+
+    Omnibus PR - Oct 2022 (#678)
+    
+    Details:
+    - This is an "omnibus" commit, consisting of multiple medium-sized
+      commits that affect non-trivial aspects of BLIS. The major highlights:
+      - Relocated the pba, sba pool (from the rntm_t), and mem_t (from the
+        cntl_t) to the thrinfo_t object. This allows the rntm_t to be
+        effectively const (although it is sometimes copied internally and
+        modified to reflect different ways of parallelism). Moving the mem_t
+        sets the stage for sharing a global control tree amongst all
+        threads.
+      - De-templatized the macrokernels for gemmt, trmm, and trsm to match
+        the macrokernel for gemm, which has been de-templatized since
+        54fa28b.
+      - Reimplemented bli_l3_determine_kc() by separating out the logic for
+        adjusting KC based on MR/NR for triangular A and/or B into a new
+        function, bli_l3_adjust_kc(). For now, this function is still called
+        from bli_l3_determine_kc(), but in the future we plan to have it
+        called once when constructing the control tree.
+      - Refactored the level-3 thread decorator into two parts:
+        - One part deals only with launching threads, each one calling a
+          generic thread entry function. This code resides in frame/thread
+          and constitutes the definition of bli_thread_launch(). Note that
+          it is specific to the threading implementation (OpenMP, pthreads,
+          single, etc.)
+        - The other part deals with passing the matrix operands and related
+          information into bli_thread_launch(). This is the "l3 decorator"
+          and now resides in frame/3. It is agnostic to the threading
+          implementation.
+      - Modified the "level" of the thread control tree passed in at each
+        operation. Previously, each operation (e.g. bli_gemm_blk_var1()) was
+        passed in a communicator representing the active thread teams which
+        would share the available work. Now, the *parent* thread comm is
+        passed in. The operation then grabs the child comm and uses it to
+        partition the work. The difference is in bli_trsm_blk_var1(), where
+        there are now two children nodes for this single operation (i.e. the
+        thread control tree is split one level above where the control tree
+        is). The sub-prenode is used for the trsm subproblem while the
+        normal sub-node is used for the gemm part. Importantly, the parent
+        comm is used for the barrier between them.
+    - Removed cntl_t* arguments from bli_*_front() functions. These will be
+      added back in the future when the control tree's creation is moved so
+      that it happens much sooner (provided that bli_*_front() have not been
+      absorbed into their respective bli_*_ex() functions).
+    - Renamed various bli_thread_*() query functions to bli_thrinfo_*(),
+      for consistency. This includes _num_threads(), _thread_id(), _n_way(),
+      _work_id(), _sba_pool(), _pba(), _mem(), _barrier(), _broadcast(), and
+      _am_chief().
+    - Removed extraneous barrier from _blk_var3() of gemm and trsm.
+    - Fixed a typo in bli_type_defs.h where BLIS_BLAS_INT_TYPE_SIZE was
+      misspelled.
+
+commit c803b03e52a7a6997a8d304a8cfa9acf7c1c555b
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Oct 26 18:20:00 2022 -0500
+
+    Add check to disable armsve on Apple M1.
+
+commit 2dd692b710b6a9889f7ebdd7934a2108be5c5530
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Oct 26 18:10:26 2022 -0500
+
+    Fix auto-detection of firestorm (Apple M1).
+
+commit 88105dbecf0f9dfbfa30215743346e8bd6afb971
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Oct 21 15:16:12 2022 -0500
+
+    Added Discord documentation (#677)
+    
+    Details:
+    - Added a docs/Discord.md markdown document that walks the reader
+      through creating a Discord account, obtaining the invite link, and
+      using the link to join the BLIS Discord server.
+    - Updated README.md to reference the new Discord.md document in multiple
+      places, including via the official Discord logo (used with explicit
+      permission from representatives at Discord Inc.).
+
+commit 23f5b8df3e802a27bacd92571184ec57bbdfa646
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Oct 17 20:21:21 2022 -0500
+
+    Shuffled checked properties in bli_l3_check.c. (#676)
+    
+    Details:
+    - Added certain checks for matrix structure to the level-3 operations'
+      _check() functions, and slightly reorganized existing checks.
+
+commit 9453e0f163503f64a290256b4be53d8882224863
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Oct 3 19:46:20 2022 -0500
+
+    CREDITS file update.
+    
+    Details:
+    - This attribution was intended to go in PR #647.
+
+commit 76a23bd8c33e161221891935a489df9a9fb9c8c0
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Oct 3 15:55:07 2022 -0500
+
+    Reinstate sanity check in bli_pool_finalize. (#671)
+    
+    Details:
+    - Added a reinit argument to bli_pool_finalize(). This bool will signal
+      whether or not the function is being called from bli_pool_reinit(). If
+      it is not being called from _reinit(), we can safely check to confirm
+      that .top_index == 0 (i.e., all blocks have been checked in). But if
+      it *is* being called from _reinit(), then that check will be skipped
+      since one of the predicted use cases for bli_pool_reinit() anticipates
+      that some blocks are (probably) checked out when the pool_t is
+      reinitialized.
+    - Updated existing invocations of bli_pool_finalize() to pass in either
+      FALSE (from bli_apool_free_block() or bli_pba_finalize_pools()) or
+      TRUE (from bli_pool_reinit()) for the new reinit argument.
+
+commit 63470b49e3b9b15e00a8f666e86ccd70c6005fe9
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Thu Sep 29 18:52:08 2022 -0500
+
+    Fix some bugs in bli_pool.c (#670)
+    
+    Details:
+    - Add a check for premature pool exhaustion when checking in blocks via
+      bli_pool_checkin_block(). This detects "double-free" and other bad
+      conditions that don't necessarily result in a segfault.
+    - Make sure to copy all block pointers when growing the pool size.
+      Previously, checked-out block pointers (which are guaranteed to be set
+      to NULL) were not being copied, leading to the presence of
+      uninitialized data.
+
+commit 42d0e66318b186d25eeb215b40ce26115401ed8b
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Thu Sep 29 17:38:02 2022 -0500
+
+    Add AddressSanitizer (-fsanitize=address) option. (#669)
+    
+    Details:
+    - Added support for AddressSanitizer (ASan), a compiler-integrated
+      memory error detector. The option (disabled by default) enables
+      compiling and linking with the -fsanitize=address flag supported by
+      clang, gcc, and probably others. This flag is employed during
+      compilation of all BLIS source files *except* for optimized kernels,
+      which are exempted because ASan usually requires an extra register,
+      which violates the constraints for many gemm microkernels.
+    - Minor whitespace, comment, ordering, and configure help text updates.
+
+commit b861c71b50c6d48cb07282f44aa9dddffc1f1b3f
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Fri Sep 23 13:22:27 2022 -0500
+
+    Add consistent NaN/Inf handling in sumsqv. (#668)
+    
+    Details:
+    - Changed sumsqv implementation as follows:
+      - If there is a NaN (either real or imaginary), then return a sum of
+        NaN and unit scale.
+      - Else, if there is an Inf (either real or imaginary), then return a
+        sum of +Inf and unit scale.
+      - Otherwise behave as normal.
+
+commit ee81efc7887374c974a78bfb3e0865776b2f97a8
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Sep 22 19:15:07 2022 -0500
+
+    Parameterized test/3 drivers via command line args. (#667)
+    
+    Details:
+    - Rewrote the drivers in test/3, the Makefile, and the runme.sh script
+      so that most of the important parameters, including parameter combo,
+      datatype, storage combo, induced method, problem size range, dimension
+      bindings, number of repeats, and alpha/beta values can be passed in
+      via command line arguments. (Previously, most of these parameters were
+      hard-coded into the driver source, except a few that were hard-coded
+      into the Makefile.) If no argument is given for any particular option,
+      it will be assigned a sane default. Either way, the values employed at
+      runtime will be printed to stdout before the performance data in a
+      section that is commented out with '%' characters (which is used by
+      matlab and octave for comments), unless the -q option is given, in
+      which case the driver will proceed quietly and output only performance
+      data. Each driver also provides extensive help via the -h option, with
+      the help text tailored for the operation in question (e.g. gemm, hemm,
+      herk, etc.). In this help text, the driver reminds the user which
+      implementation it was linked to (e.g. blis, openblas, vendor, eigen).
+      Thanks to Jeff Diamond for suggesting this CLI-based reimagining of
+      the test/3 drivers.
+    - In the test/3 drivers: converted cpp macro string constants, as well
+      as two string literals (for the opname and pc_str) used in each test
+      driver, to global (or static) const char* strings, and replaced the
+      use of strncpy() for storing the results of the command line argument
+      parsing with pointer copies from the corresponding strings in argv.
+      This works because the argv array is guaranteed by the C99 standard
+      to persist throughout the life of the program. This new approach uses
+      less storage and executes faster. Thanks to Minh Quan Ho for
+      recommending this change.
+    - Renamed the IMP_STR cpp macro that gets defined on the command line,
+      via the test/3/Makefile, to IMPL_STR.
+    - Updated runme.sh to set the problem size ranges for single-threaded
+      and multithreaded execution independently from one another, as well as
+      on a per-system basis.
+    - Added a 'quiet' variable to runme.sh that can easily toggle quiet mode
+      for the test drivers' output.
+    - Very minor typecast fix in call to bli_getopt() in bli_utils.c.
+    - In bli_getopt(), changed the nextchar variable from being a local
+      static variable to a field of the getopt_t state struct. (Not sure why
+      it was ever declared static to begin with.)
+    - Other minor changes to bli_getopt() to accommodate the rewritten test
+      drivers' command line parsing needs.
+
+commit 036a4f9d822df25a76a653e70be76fb02284d3d3
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Sep 22 18:36:50 2022 -0500
+
+    Refactored some rntm_t management code. (#666)
+    
+    Details:
+    - Separated the "sanitizing" code from the auto-factorization code
+      in bli_rntm_set_ways_from_rntm() and _rntm_set_ways_from_rntm_sup().
+      The sanitizing code now resides in bli_rntm_sanitize() while the
+      factorization code resides in bli_rntm_factorize() and
+      bli_rntm_factorize_sup(). (There are two different functions because
+      the conventional and sup factorization codes are currently somewhat
+      different.) Also note that the factorization code now relies on the
+      .auto_factor field to have already been set, either during
+      rntm_t initialization or when the rntm_t was previously updated and
+      sanitized. So rather than locally determining whether to auto-
+      factorize, those functions just read the .auto_factor field and
+      proceed accordingly.
+    - Refactored and removed most code from bli_thread_init_rntm_from_env().
+      This function now reads the environment variables needed to set nt,
+      jc, pc, ic, jr, and ir; sets them into the global rntm_t; and then
+      calls bli_rntm_sanitize() in order to make sure that the contents are
+      in a "good" state. Thanks to Devin Matthews for suggesting this
+      refactoring.
+    - Redefined bli_rntm_set_num_threads() and bli_rntm_set_ways() such that
+      if multithreading is disabled at compile time (that is, if the cpp
+      macro BLIS_ENABLE_MULTITHREADING is undefined), they ignore the
+      caller's request and instead clear the nt and ways fields.
+    - Redefined bli_thread_set_num_threads() and bli_thread_set_ways() such
+      that if multithreading is disabled at compile time (that is, if the
+      cpp macro BLIS_ENABLE_MULTITHREADING is undefined), they ignore the
+      caller's request and do nothing.
+    - Redefined bli_rntm_set_num_threads() and bli_rntm_set_ways() as true
+      functions rather than static inline functions.
+    - In bli_rntm.c, statically initialize the global_rntm global variable
+      via the BLIS_RNTM_INITIALIZER macro.
+    - In bli_rntm.h, defined bli_rntm_clear_auto_factor(), which sets the
+      .auto_factor field of the rntm_t to FALSE.
+    - Reorganized order of some inline function definitions in bli_rntm.h.
+    - Changed the default value given to the .auto_factor field by the
+      BLIS_RNTM_INITIALIZER macro from TRUE to FALSE.
+    - Call bli_rntm_clear_auto_factor() instead of
+      bli_rntm_set_auto_factor_only() in bli_rntm_init().
+    - Comment/whitespace updates.
+
+commit a1a5a9b4cbef9208da494c45a2f933a8e82559ac
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Sep 21 18:31:01 2022 -0500
+
+    Implemented support for fat multithreading. (#665)
+    
+    Details:
+    - Allow the user to configure BLIS in such a way that multiple threading
+      implementations get compiled into the library, with one of those
+      implementations chosen at runtime. For now, there are only three
+      implementations available: OpenMP, pthreads, and single. (Here,
+      'single' merely refers to single-threaded mode.) The configure script
+      now allows the user to give the -t option with a comma-separated list
+      of values, such as '-t openmp,pthreads'. The first value in the list
+      will always be the default at library initialization time, and
+      'single' is always silently appended to the end of the list. The user
+      can specify which implementation should execute in one of three ways:
+      by setting the BLIS_THREAD_IMPL environment variable prior to launch;
+      by calling the bli_thread_set_thread_impl() global runtime API; or by
+      encoding their choice into a rntm_t that is passed into one of the
+      expert interfaces. Any of these three choices overrides the
+      initialization-time default (i.e., the first value listed to the -t
+      configure option). Requesting an implementation that was not compiled
+      into the library will result in an error message followed by
+      bli_abort().
+    - Relocated the 'auto' logic for the -t option from the top-level
+      Makefile to the configure script. (Currently, this logic is pretty
+      dumb, choosing 'openmp' for gcc and icc, and 'pthreads' for clang.)
+    - Defined a new 'timpl_t' enum in bli_type_defs.h, with three valid
+      values: BLIS_SINGLE, BLIS_OPENMP, BLIS_POSIX.
+    - Reorganized the thrcomm_t struct into a single definition with two
+      preprocessor blocks, one each for additional fields needed by OpenMP
+      and pthreads.
+    - Added timpl_t argument to bli_thrcomm_bcast(), bli_thrcomm_barrier(),
+      bli_thrcomm_init(), and bli_thrcomm_cleanup(), which these functions
+      need since they are now wrappers that choose the implementation-
+      specific function corresponding to the currently enabled threading
+      implementation.
+    - Added rntm_t* to bli_thread_broadcast(), bli_thread_barrier() so that
+      those functions can pass the timpl_t value into bli_thrcomm_bcast()
+      and bli_thrcomm_barrier(), respectively.
+    - Defined bli_env_get_str() in bli_env.c to allow the querying of
+      BLIS_THREAD_IMPL (which, unlike BLIS_NUM_THREADS and friends, is
+      expected to be a string).
+    - Defined bli_thread_get_thread_impl(), bli_thread_set_thread_impl() to
+      get and set the current threading implementation at runtime.
+    - Defined bli_rntm_thread_impl() and bli_rntm_set_thread_impl() to query
+      and set the threading implementation within a rntm_t. Also choose
+      BLIS_SINGLE as the default value when initializing rntm_t structs.
+    - Added bli_info_get_*() functions to query whether OpenMP or pthreads
+      would be chosen as the default at init-time. Note that this only
+      tests whether OpenMP or pthreads is the first implementation in the
+      list passed to the threading configure option (-t) and is *not* the
+      same as querying which implementation is currently selected, since
+      that can be influenced by BLIS_THREAD_IMPL and/or
+      bli_thread_set_thread_impl().
+    - Changed l3int_t to l3int_ft.
+    - Updated docs/Multithreading.md to document the new behavior.
+    - Updated sandbox/gemmlike and addon/gemmd to work with the new fat
+      threading feature. This included a few bugfixes to bring the codes up
+      to date, as necessary.
+    - Comment, whitespace updates.
+
+commit 89df7b8fa3a3e47ab2fc10ac4d65d0b9fde16942
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sun Sep 18 18:46:57 2022 -0500
+
+    De-templatized _sup_var1n2m.c; unified _sup_packm_a/b(). (#659)
+    
+    Details:
+    - Re-expressed the two variants in frame/3/bli_l3_sup_var1n2m.c as a
+      single function each that performs char* pointer arithmetic rather
+      than four datatype-specific functions. Did the same for the functions
+      in bli_l3_sup_packm_a.c and _sup_packm_b.c, and then unified the two
+      into a single set of functions for packing either A or B, which now
+      resides in bli_l3_sup_packm.c.
+    - Pre-grow the cntl_t tree in both bli_l3_sup_var1n2m.c variants rather
+      than grow them incrementally.
+    - Relocated empty-matrix and scale-by-beta early return handling from
+      bli_gemm_front() and bli_gemmt_front() to their _ex() counterparts.
+    - Comment, whitespace updates.
+
+commit fb91337eff1ee2098f315a83888f6667b3a56f86
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Sep 15 19:08:10 2022 -0500
+
+    Fixed a harmless pc_nt bug in 05a811e.
+    
+    Details:
+    - Added missing curly braces around some statements in bli_rntm.c, one
+      of which needed them in order for the relevant code to be executed in
+      the intended way. The consequence of 05a811e omitting those braces was
+      that a statement (pc_nt = 1;) was executed more often than it needed
+      to be.
+    - Also adjusted the analogous code in bli_thread.c to match that of
+      bli_rntm.c.
+
+commit e86076bf4461d1a78186fb21ba8320cfb430f62c
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Sep 15 14:22:59 2022 -0500
+
+    Test the 'gemmlike' sandbox via AppVeyor. (#664)
+    
+    Details:
+    - Added a fifth test to our .appveyor.yml that enables the 'gemmlike'
+      sandbox with OpenMP enabled (via clang, the 'auto' configuration
+      target, and building to a static library). Thanks to Jeff Diamond
+      for pointing out that this test would be useful.
+
+commit 63177dca48cb7d066576d884da4a7a599ececebf
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Sep 15 11:21:26 2022 -0500
+
+    Fixed gemmlike sandbox bug introduced in 7c07b47.
+    
+    Details:
+    - Fixed a bug in the 'gemmlike' sandbox that was introduced in 7c07b47.
+      This bug was the result of the fact that the gemmlike implementation
+      uses bli_thrinfo_sup_grow() to grow its thrinfo_t tree, but the
+      aforementioned commit added an optimization that kicks in when the
+      rntm_t .pack_a and .pack_b fields are both FALSE. Those fields were
+      originally added only for sup execution; for the large code path, they
+      are intended to be ignored. But the default initial state of a rntm_t
+      has those fields set to FALSE, which was inadvertently activating the
+      optimization (which targeted single-threaded cases only) and would
+      cause multithreaded use cases of 'gemmlike' to segfault. The fix took
+      the form of setting the .pack_a and .pack_b fields to TRUE in
+      bls_gemm_ex().
+    - Added minimal 'const' and 'const'-casting to 'gemmlike' so that gcc
+      stays quiet.
+
+commit 05a811e898b371a76581abd4afa416980cce7db9
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Sep 13 19:24:05 2022 -0500
+
+    Initialize rntm_t nt/ways fields with 1 (not -1). (#663)
+    
+    Details:
+    - Changed the way that rntm_t structs are initialized, mainly so that
+      the global rntm_t that is set via environment variables at runtime
+      may be queried by the application prior to any computation taking
+      place. (Strictly speaking, the application may already query these
+      fields, but they do not always contain valid values and often contain
+      -1 when they are unset.) These changes also served to clarify how
+      these parameters are treated, and homogenized the implementations of
+      bli_rntm_set_ways_from_rntm(), bli_rntm_set_ways_from_rntm_sup(), and
+      bli_thread_init_rntm_from_env(). Special thanks to Jeff Diamond,
+      Leick Robinson, and Devin Matthews for pointing out that the previous
+      behavior was needlessly confusing and could be improved.
+    - The aforementioned modifications also included subtle changes as to
+      what counts as "setting" a loop's ways of parallelism for the purposes
+      of deciding whether to use the ways or the total number of threads.
+      Previously, setting any loop's ways, even to 1, counted in favor of
+      using the ways. Now, only values greater than 1 will count as
+      "setting", and all other values will silently be mapped to 1, with
+      those parameters treated as if they were untouched all along.
+    - Updated bli_rntm.h and bli_thread.c so that any attempt to set the
+      PC_NT variable (or pc_nt field of a rntm_t) will either ignore the
+      request or reassert the value as 1.
+    - Updated bli_rntm_set_ways() so that rather than clear the
+      num_threads field, it is set to the product of all of the per-loop
+      ways of parallelism.
+    - Removed code from test_libblis.c that handled the possibility of unset
+      environment variables when printing out their values.
+    - Removed bli_rntm_equals() inline function from bli_rntm.h, which has
+      long been disabled.
+    - Updates to docs/Multithreading.md related to the aforementioned
+      changes.
+    - Comment updates.
+
+commit fd885cf98f4fe1d3bc46468e567776c37c670fcc
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Sep 13 11:50:23 2022 -0500
+
+    Use kernel CFLAGS for 'kernels' subdirs in addons. (#658)
+    
+    Details:
+    - Updated Makefile and common.mk so that the targeted configuration's
+      kernel CFLAGS are applied to source files that are found in a
+      'kernels' subdirectory within an enabled addon. For now, this
+      behavior only applies when the 'kernels' directory is at the top
+      level of the addon directory structure. For example, if there is an
+      addon named 'foobar', the source code must be located in
+      addon/foobar/kernels/ in order for it to be compiled with the target
+      configuration's kernel CFLAGS. Any other source code within
+      addon/foobar/ will be compiled with general-purpose CFLAGS (the same
+      ones that were used on all addon code prior to this commit). Thanks
+      to AMD (esp. Mithun Mohan) for suggesting this change and catching an
+      intermediate bug in the PR.
+    - Comment/whitespace updates.
+
+commit cb74202db39dc8cb81fdd06f8a445f8837e27853
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Sep 13 11:46:24 2022 -0500
+
+    Fixed incorrect sizeof(type) in edge case macros. (#662)
+    
+    Details:
+    - In bli_edge_case_macro_defs.h, the GEMM_UKR_SETUP_CT_PRE() and
+      GEMMTRSM_UKR_SETUP_CT_PRE() macros previously declared their temporary
+      ct microtiles as:
+    
+        PASTEMAC(ch,ctype)
+              _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \
+                   __attribute__((aligned(alignment))); \
+    
+      The problem here is that sizeof( PASTEMAC(ch,type) ) evaluates to
+      things like sizeof( BLIS_DOUBLE ), not sizeof( double ), and since
+      BLIS_DOUBLE is an enum, it is typically an int, which means the
+      sizeof() expression is evaluating to the wrong value. This was likely
+      a benign bug, though, since BLIS does not support any computational
+      datatypes that are smaller than sizeof( int ), which means the ct
+      array would be *over*-allocated rather than underallocated. Thanks
+      to @moon-chilled for identifying and reporting this bug in #624.
+    - CREDITS file update.
+
+commit 6e5431e8494b06bd80efcab3abf0a6456d6c0381
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Sat Sep 10 15:16:58 2022 -0500
+
+    Fix line number issue in flattened blis.h. (#660)
+    
+    Details:
+    - Updated the top-level Makefile so that it invokes flatten-headers.py
+      without the -c option, which was requesting that comments be stripped
+      (since comment stripping is disabled by default).
+    - Updated flatten-headers.py to accept a new option (-l) to enable
+      insertion of #line directives into the output file. This new option
+      is enabled by default.
+    - Also added logic to flatten-headers.py that outputs a warning if both
+      comment stripping and line numbers are requested since the comment
+      stripping will cause the line numbers to become inaccurate.
+
+commit 4afe0cfdab0e069e027f97920ea604249e34df47
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Sep 8 18:33:20 2022 -0500
+
+    Defined invscalv, invscalm, invscald operations. (#661)
+    
+    Details:
+    - Defined invert-scale (invscal) operation on vectors (level-1v),
+      matrices (level-1m), and diagonals (level-1d).
+    - Added test modules for invscalv and invscalm to the testsuite.
+    - Updated BLISObjectAPI.md and BLISTypedAPI.md API documentation to
+      reflect the new operations. Also updated KernelsHowTo.md accordingly.
+    - Renamed 'beta' to 'alpha' in scalv and scalm testsuite modules (and
+      input.operations files) so that the parameter name matches the
+      parameter used in the documentation.
+
+commit a87eae2b11408b556e562f1b04e673c6cd1612bc
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Sep 6 18:04:09 2022 -0500
+
+    Added '-q' quiet mode option to testsuite. (#657)
+    
+    Details:
+    - Added support for a '-q' command line option to the testsuite. This
+      option suppresses most informational output that would normally
+      clutter up the screen. By default, verbose mode (the previous
+      status quo) will be operative, and so quiet mode must be requested.
+
+commit dfa54139664a42d29774e140ec9e5597af869a76
+Author: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
+Date:   Tue Aug 30 08:07:50 2022 +0800
+
+    Arm64 dgemmsup with extended MR&NR (#655)
+    
+    Details:
+    - Since the number of registers in NEON is large but their lengths are
+      short, I'm here extending both MR and NR.
+    - The approach is to represent the C microtile in registers optionally
+      in columns, so for sizes like 6x7m, the 'crr' kernel is the default
+      with 'rrr' supported through an in-register transpose.
+    - A few asm kernels are crafted for 'rv' to complete this extended size
+      support.
+    - For 'rd' I'm still relying heavily on C99 intrinsic kernels with
+      branching so the performance might not be optimal. (Sorry for that.)
+    - So far, these changes only affect the 'firestorm' subconfig.
+    - This commit also contains row-preferential s12x8 and d6x8 gemm
+      ukernels. These microkernels are templatized versions of the existing
+      s8x12 and d6x8 ukernels defined in bli_gemm_armv8a_asm_d6x8.c.
+
+commit 9e5594ad5fc41df8ef2825a025d7844ac2275c27
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Aug 11 14:36:38 2022 -0500
+
+    Temporarily disabled #line directives from 6826c1c.
+    
+    Details:
+    - Commented out the inclusion of #line preprocessor directives in the
+      flattened header output provided by build/flatten-headers.py. This
+      output was added recently in 6826c1c, but was later found to have
+      thrown off the line numbering referenced by compiler warnings and
+      errors (possibly due to license comment blocks, which are stripped
+      from source headers as they are inlined into the monolithic header).
+
+commit 775148bcdbb1014b4881a76306f35f5d0fedecbe
+Author: jdiamondGitHub <jeff_diamond@fastmail.com>
+Date:   Fri Aug 5 12:01:24 2022 -0500
+
+    Updated ARMv8a kernels to fix 2 prefetching issues. (#649)
+    
+    Details:
+    - The ARMv8a dgemm/sgemm microkernels had 2 prefetching issues that
+      impacted performance on modern ARM platforms. The most significant
+      issue was that only a single prefetch per C tile column was issued.
+      When a column of C was not cache aligned, the second cache line would
+      not be prefetched at all, forcing the kernel to wait for an entire
+      load to update elements of C. This happened with roughly 50% of the
+      C prefetches. The fix was to have two prefetches per column, spaced
+      64 bytes (1 cache line) apart.
+    - A secondary performance issue was that all the C prefetch instructions
+      were issued sequentially at the beginning of the kernel call. This
+      caused a noticeable performance slowdown. Interleaving the prefetch
+      calls every 2-3 instructions in the prologue code solved the issue.
+
+commit bbaf29abd942de47a3a99a80a67d12bab41b27db
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Aug 4 17:51:37 2022 -0500
+
+    Very minor variable updates to common.mk.
+    
+    Details:
+    - Fixed a harmless bug that would have allowed C++ headers into the list
+      of header suffixes specifically reserved for C99 headers. In practice,
+      this would have had no substantive effect on anything since the core
+      BLIS framework does not use C++ headers.
+
+commit a48e29d799091a833213efeafaf2d342ebdafde9
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Jul 28 10:11:07 2022 -0500
+
+    CREDITS file update.
+    
+    Details:
+    - Thanks to Kihiro Bando for assisting with issue #644.
+
+commit 5b298935de7f20462bfad1893ed34ecd691cec5a
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Jul 27 19:14:15 2022 -0500
+
+    Removed buggy cruft from power10 subconfig.
+    
+    Details:
+    - Removed #defines for BLIS_BBN_s and BLIS_BBN_d from
+      bli_kernel_defs_power10.h. These were inadvertently set in ae10d949
+      because the power10 subconfig was registering bb packm ukernels, but
+      only for 6xk (power10 uses s8x16 and d8x8 ukernels) and only because
+      the original author (probably) copy-pasted from power9 when getting
+      started. That 6xk packm registration was effectively "dead code"
+      prior to ae10d949, but was then mistaken as not-dead code during the
+      ae10d949 refactor. These improper bb factors may have been causing
+      bugs in power10 builds. Thanks to Nicholai Tukanov for helping remind
+      me what the power10 subconfig was supposed to look like.
+    - Removed extraneous microkernel preference registrations from power10
+      subconfig. Preferences for single and double complex gemm were being
+      registered despite there being no complex gemm ukernels registered to
+      go with them. Similarly, there were trsm preferences registered
+      without any trsm ukernels registered (and BLIS doesn't actually use a
+      preference for the trsm ukernel anyway). These extraneous
+      registrations were almost surely not hurting anything, even if they
+      were quite misleading.
+
+commit 56de31b00fa0f1ba866321817cd1e5d83000ff11
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Jul 27 13:54:17 2022 -0500
+
+    Disable modification of KC in the gemmsup kernels. (#648)
+    
+    This led to a ~50% performance reduction for certain gemm operations (but not others?). See #644 for example.
+
+commit 4dde947e2ec9e139c162801320c94e6a01a39708
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue Jul 26 17:29:32 2022 -0500
+
+    Fixed out-of-bounds bug in sup s6x16m haswell kernel.
+    
+    Details:
+    - Fixed another out-of-bounds read access bug in the haswell sup
+      assembly kernels. This bug is similar to the one fixed in 17b0caa
+      and affects bli_sgemmsup_rv_haswell_asm_6x2m(). Thanks to Madeesh
+      Kannan for reporting this bug (and a suitable fix) in #635.
+    - CREDITS file update.
+
+commit 6826c1cdfba855513786d9e3d606681316453398
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Mon Jul 25 18:21:05 2022 -0500
+
+    Add `#line` directives to flattened `blis.h`. (#643)
+    
+    Details:
+    - Modified flatten-headers.py so that #line directives are inserted into
+      the flattened blis.h file. This facilitates easier debugging when
+      something is amiss in the flattened blis.h because the compiler will
+      be able to refer to the line number within the original constituent
+      header file (which is where the fix would go) rather than the line
+      number within the flattened header (which is not as helpful).
+
+commit af3a41e02534befdae026377592ce437bab83023
+Author: Alexander Grund <Flamefire@users.noreply.github.com>
+Date:   Thu Jul 21 18:05:48 2022 +0200
+
+    Add autodetection for POWER7, POWER9 & POWER10 (#647)
+    
+    Read from `/proc/cpuinfo` as done for ARM.
+    Fixes #501
+
+commit 17b0caa2b2bff439feb6d2b39cfa16e7591882b0
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Jul 14 17:55:34 2022 -0500
+
+    Fixed out-of-bounds read in haswell gemmsup kernels.
+    
+    Details:
+    - Fixed memory access bugs in the bli_sgemmsup_rv_haswell_asm_Mx2()
+      kernels, where M = {1,2,3,4,5,6}. The bugs were caused by loading four
+      single-precision elements of C, via instructions such as:
+    
+            vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
+    
+      in situations where only two elements are guaranteed to exist. (These
+      bugs may not have manifested in earlier tests due to the leading
+      dimension alignment that BLIS employs by default.) The issue was fixed
+      by replacing lines like the one above with:
+    
+            vmovsd(mem(rcx), xmm0)
+            vfmadd231ps(xmm0, xmm3, xmm4)
+    
+      Thus, we use vmovsd to explicitly load only two elements of C into
+      registers, and then operate on those values using register addressing.
+      Thanks to Daniël de Kok for reporting these bugs in #635, and to
+      Bhaskar Nallani for proposing the fix.
+    - CREDITS file update.
+
+commit cc260fd7068f0fe449d818435aa11adb14c17fed
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Jul 13 16:16:01 2022 -0500
+
+    Allow uniform max problem sizes in test/3/runme.sh.
+    
+    Details:
+    - Tweaked test/3/runme.sh so that the test driver binaries for single-
+      threaded (st), single-socket (1s), and dual-socket (2s) execution can
+      be built using identical problem size ranges. Previously, this was not
+      possible because runme.sh used the maximum problem size, which was
+      embedded into the binary filename, to tell the three classes of
+      binaries apart from one another. Now, runme.sh uses the binary suffix
+      ("st", "1s", or "2s") to tell them apart. This required only a few
+      changes to the logic, but it also required a change in format to the
+      threading config strings themselves (replacing the max problem size
+      with "st", "1s", or "2s"). Thanks to Jeff Diamond for inspiring this
+      improvement.
+    - Comment updates.
+
+commit 9b1beec60be31c6ea20b85806d61551497b699e4
+Author: bartoldeman <bartoldeman@users.noreply.github.com>
+Date:   Mon Jul 11 20:15:12 2022 -0400
+
+    Use BLIS_ENABLE_COMPLEX_RETURN_INTEL in blastest files (#636)
+    
+    Details:
+    - Fixed a crash that occurs when either cblat1 or zblat1 are linked
+      with a build of BLIS that was compiled with '--complex-return=intel'.
+      This fix involved inserting preprocessor macro guards based on
+      BLIS_ENABLE_COMPLEX_RETURN_INTEL into blastest/src/cblat1.c and
+      blastest/src/zblat1.c to correctly handle situations where BLIS is
+      compiled with Intel/f2c-style calling conventions for complex numbers.
+    - Updated blastest/src/fortran/run-f2c.sh so that future executions
+      will insert the aforementioned cpp macro conditional where
+      appropriate.
+
+commit 98d467891b74021ace7f248cb0856bec734e39b6
+Author: bartoldeman <bartoldeman@users.noreply.github.com>
+Date:   Mon Jul 11 19:40:53 2022 -0400
+
+    Change complex_return='intel' for ifx. (#637)
+    
+    Details:
+    - When checking the version string of the Fortran compiler for the
+      purposes of determining a default return convention for complex
+      domain values, grep for "IFORT" instead of "ifort" since that string
+      is common to both the 'ifx' and 'ifort' binaries provided by Intel:
+    
+        $ ifx --version
+        ifx (IFORT) 2022.1.0 20220316
+        Copyright (C) 1985-2022 Intel Corporation. All rights reserved.
+    
+        $ ifort --version
+        ifort (IFORT) 2021.6.0 20220226
+        Copyright (C) 1985-2022 Intel Corporation. All rights reserved.
+
+commit ffde54cc5c334aca8eff4d6072ba49496bf3104c
+Author: jdiamondGitHub <jeff_diamond@fastmail.com>
+Date:   Mon Jul 11 16:47:30 2022 -0500
+
+    Minor changes to .gitignore and LICENSE files. (#642)
+    
+    Details:
+    - Macs create .DS_Store files in every directory visited. Updated
+      .gitignore file so these files won't be reported as untracked by
+      'git status'.
+    - Added Oracle Corporation to the LICENSE file.
+    - Updated UT copyright on behalf of SHPC.
+
+commit 7cba7ce3dd1533fcc4ca96ac902bdf218686139a
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Jul 8 11:15:18 2022 -0500
+
+    Minor cleanups, comment updates to bli_gks.c.
+    
+    Details:
+    - Removed a redundant registration of 'a64fx' subconfig in
+      bli_gks_init().
+    - Reordered registration of 'armsve', 'a64fx', and 'firestorm'
+      subconfigs. Thanks to Jeff Diamond for his input on this reordering.
+    - Comment updates to bli_gks.c and arch_t enum in bli_type_defs.h.
+
+commit 667f201b7871da68622027d02bd6b7da3262f8e8
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Jul 7 16:44:21 2022 -0500
+
+    Fixed type bug in bli_cntx_set_ukr_prefs().
+    
+    Details:
+    - Fixed a bug in bli_cntx_set_ukr_prefs() which erroneously typecast the
+      num_t value read from va_args() down to a bool before being stored
+      within the cntx_t. This bug was introduced on April 6th 2022, in
+      ae10d94. This caused the ukernel preferences for double real and
+      double complex to go unchanged while the preferences for single real
+      and single complex were corrupted by the former datatypes'
+      preference values. The bug manifested as degraded performance for
+      subconfigurations that registered column-preferential ukernels. The
+      reason is that the erroneous preferences trigger unnecessary
+      transpositions in the operation, which forces the gemm ukernel to
+      compute on matrices that are not stored according to its preference.
+      Thanks to Devin Matthews, Jeff Diamond, and Leick Robinson for their
+      extensive efforts and assistance in tracking down this issue.
+    - Augmented the informational header that is output by the testsuite to
+      include ukernel preferences for gemm, gemmtrsm_[lu], and trsm_[lu].
+    - CREDITS file update.
+
+commit d429b6bfced21a63bf711224ac402f93f0080b52
+Author: Isuru Fernando <isuruf@gmail.com>
+Date:   Tue Jun 28 15:34:10 2022 -0500
+
+    Support clang targeting MinGW (#639)
+    
+    * Support clang targeting MinGW
+    
+    * Fix pthread linking
+
+commit d93df023348144e091f7b3e3053995648f348aa7
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Jun 15 14:09:49 2022 -0500
+
+    Removed unused dt arg in bli_gks_query_ind_cntx().
+    
+    Details:
+    - Removed the num_t datatype argument from bli_gks_query_ind_cntx().
+      This argument stopped being needed by the function in commit e9da642.
+      Its only use in bli_gks_query_ind_cntx() was to be passed through to
+      the context initialization function for the chosen induced method,
+      but even then, commit log notes from e9da642 indicate that I could not
+      recall why the datatype argument was ever needed by the context init
+      function to begin with.
+    - Updated all invocations of bli_gks_query_ind_cntx() to omit the dt
+      argument. Most of these invocations resided in various standalone test
+      drivers (and the testsuite).
+
+commit 56772892450cc92b3fbd6a9d0460153a43fc47ab
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Wed Jun 1 10:49:33 2022 -0500
+
+    Added SMU citation to README.md intro.
+    
+    Details:
+    - Added a citation to SMU and the Matthews Research Group to the general
+      attribution of maintainership and development in the Introduction of
+      the README.md file. Thanks to Robert van de Geijn and Devin Matthews
+      for suggesting this change.
+
+commit 4603324eb090dfceaad3693a70b2d60544036aa8
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu May 19 14:07:03 2022 -0500
+
+    Init/finalize via bli_pthread_switch_t API (#634).
+    
+    Details:
+    - Defined and implemented a new pthread-like abstract datatype and API
+      in bli_pthread.c. The new type, bli_pthread_switch_t, is similar to
+      bli_pthread_once_t in some respects. The idea is that like a switch in
+      your home that controls a light or ceiling fan, it can either be on or
+      off. The switch starts in the off state. Moving from one state to the
+      other (on to off; off to on) causes some action (i.e., a startup or
+      shutdown function) to be executed. Trying to move from one state to
+      the same state (on to on; off to off) is safe in that it results in
+      no action. Unlike bli_pthread_once(), the API for bli_pthread_switch_t
+      contains both _on() and _off() interfaces. Also, unlike the _once()
+      function, the _on() and _off() functions return error codes so that
+      the 'int' error code returned from the startup or shutdown functions
+      may be passed back to the caller. Thanks to Devin Matthews for his
+      input and feedback on this feature.
+    - Replaced the previous implementation of bli_init_once() and
+      bli_finalize_once() -- both of which used bli_pthread_once() -- with
+      ones that rely upon bli_pthread_switch_on() and _switch_off(),
+      respectively. This also required updating the return types of
+      _init_apis() and _finalize_apis() to match the function pointer type
+      required by bli_pthread_switch_on()/_switch_off().
+    - Comment updates.
+
+commit 64a9b061f6032e2b59613aecdbe7bb52161605c1
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Tue May 10 14:54:22 2022 -0500
+
+    Fixed misspelling of 'xpbys' in gemm macrokernel.
+    
+    Details:
+    - Fixed a functionally harmless typo in bli_gemm_ker_var2.c where a few
+      instances of the substring "xpbys" were misspelled as "xbpys". The
+      misspellings were harmless because they were consistent, and because
+      they referenced only local symbols.
+
+commit 1c733402a95ab08b20f3332c2397fd52a2627cf6
+Author: Jed Brown <jed@jedbrown.org>
+Date:   Thu Apr 28 11:58:44 2022 -0600
+
+    Fix version check for znver3, which needs gcc >= 10.3 (#628)
+    
+    Apple's clang-12 lacks znver3 support, unlike upstream clang-12.
+
+commit 6431c9e13b86e4442b6aacba18a0ace12288c955
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Apr 14 13:01:24 2022 -0500
+
+    Added missing 'const' to zen bli_gemm_small.c.
+    
+    Details:
+    - Added missing 'const' qualifiers to signatures of functions defined in
+      kernels/zen/3/bli_gemm_small.c. This fixes compile-time errors when
+      targeting 'zen3' subconfig (which apparently is enabling AMD's
+      gemm_small code path by default). Thanks to Devin Matthews for
+      reporting this error.
+
+commit 9fea633748ed27ef3853bba7cd955690c61092b4
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Apr 13 15:59:06 2022 -0500
+
+    Partial addition of 'const' to all interfaces above the (micro)kernels. (#625)
+    
+    Details:
+    - Added 'const' qualifier to applicable function arguments wherever the
+      pointed-to object is not internally modified. This change affects
+      all interfaces that reside above the level of the (micro)kernels.
+    - Typecast certain function return values to discard 'const' qualifier.
+    - Removed 'restrict' from various arguments, including cntx_t*,
+      auxinfo_t*, rntm_t*, thrinfo_t*, mem_t*, and others
+    - Removed parts of some APIs, such as bli_cntx_*(), due to limited use.
+    - Merged some variable declarations with their corresponding
+      initialization statements.
+    - Whitespace changes.
+
+commit ae10d9495486f589ed0320f0151b2d195574f1cf (origin/amd)
+Author: Devin Matthews <damatthews@smu.edu>
+Date:   Wed Apr 6 20:31:11 2022 -0500
+
+    Simplify and rewrite reference packm kernels. (#610)
+    
+    Details:
+    - Reorganized the way kernels are stored within the cntx_t structure so
+      that rather than having a function pointer for every supported size of
+      unrolled packm kernel (2xk, 3xk, 4xk, etc.), we store only two packm
+      kernels per datatype: one to pack MRxk micropanels and one to pack
+      NRxk micropanels.
+      - NOTE: The "bb" (broadcast B) reference kernels have been merged into
+        the "standard" kernels (packm [including 1er and unpackm], gemm,
+        trsm, gemmtrsm). This replication factor is controlled by
+        BLIS_BB[MN]_[sdcz] etc. Power9/10 needs testing since only a
+        replication factor of 1 has been tested. armsve also needs testing
+        since the MR value isn't available as a macro.
+    - Simplified the bli_cntx_*() APIs to conform to the new unified kernel
+      array within the cntx_t. Updated existing bli_cntx_init_<subconfig>()
+      function definitions for all subconfigurations.
+    - Consolidated all kernel id types (e.g. l1vkr_t, l1mkr_t, l3ukr_t,
+      etc.) into one kernel id type: ukr_t.
+    - Various edits, updates, and rewrites of reference kernels pursuant to
+      the aforementioned changes.
+    - Define compile-time macro constants (BLIS_MR_[sdcz], BLIS_NR_[sdcz],
+      and friends) in bli_kernel_macro_defs.h, but only when the macro
+      BLIS_IN_REF_KERNEL is defined by the build system.
+    - Loose ends:
+      - Still need to update documentation, including:
+        - docs/ConfigurationHowTo.md
+        - docs/KernelsHowTo.md
+        to reflect changes made in this commit.
+
+commit b3e674db3c05ca586b159a71deb1b61d701ae5c9
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Apr 4 17:31:02 2022 -0500
+
+    README.md update to link to releases page.
+
+commit 69fa915464c52f09a5971a60f521900d31a34e69
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Apr 1 08:47:46 2022 -0500
+
+    Fixed broken "tagged releases" link in README.md.
+
+commit 88cab8383ca90ddbb4cf13e69b7d44a1663a4425
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Fri Apr 1 08:12:06 2022 -0500
+
+    CHANGELOG update (0.9.0)
+
+commit 14c86f66b20901b60ee276da355c1b62642c18d2 (tag: 0.9.0)
 Author: Field G. Van Zee <field@cs.utexas.edu>
 Date:   Fri Apr 1 08:12:06 2022 -0500
 
     Version file update (0.9.0)
 
-commit 99bb9002f1aff598d347eae2821a3f7bdd1f48e8 (origin/master, origin/HEAD)
+commit 99bb9002f1aff598d347eae2821a3f7bdd1f48e8
 Author: Field G. Van Zee <field@cs.utexas.edu>
 Date:   Fri Apr 1 08:10:59 2022 -0500
 
     ReleaseNotes.md update in advance of next version.
 
-commit bee7678b2558a691ac850819dbe33fefe4fdbee3 (origin/dev, origin/amd, dev, amd)
+commit bee7678b2558a691ac850819dbe33fefe4fdbee3
 Author: Field G. Van Zee <field@cs.utexas.edu>
 Date:   Thu Mar 31 14:09:39 2022 -0500
 

From 06dddf1e51ccff70d77ee8cb731c3217e70eb730 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Mon, 6 May 2024 13:47:42 -0500
Subject: [PATCH 191/230] ReleaseNotes.md update.

---
 docs/ReleaseNotes.md | 125 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)

diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md
index ccb4d9f0e..1c2d5f4d2 100644
--- a/docs/ReleaseNotes.md
+++ b/docs/ReleaseNotes.md
@@ -4,6 +4,7 @@
 
 ## Contents
 
+* [Changes in 1.0](ReleaseNotes.md#changes-in-10)
 * [Changes in 0.9.0](ReleaseNotes.md#changes-in-090)
 * [Changes in 0.8.1](ReleaseNotes.md#changes-in-081)
 * [Changes in 0.8.0](ReleaseNotes.md#changes-in-080)
@@ -40,6 +41,130 @@
 * [Changes in 0.0.2](ReleaseNotes.md#changes-in-002)
 * [Changes in 0.0.1](ReleaseNotes.md#changes-in-001)
 
+## Changes in 1.0
+May 6, 2024
+
+Improvements present in 1.0:
+
+Framework:
+- Initialize/finalize BLIS via a new `bli_pthread_switch_t` API. (Field Van Zee, Devin Matthews)
+- Revamped `bli_init()` to use TLS where feasible. (Field Van Zee, Edward Smyth, Minh Quan Ho)
+- Implemented support for fat multithreading.
+- Implemented tile-level load balancing (tlb), or tile-level partitioning, in jr/ir loops for `gemm`, `gemmt`, and `trmm` macrokernels. (Field Van Zee, Devin Matthews, Leick Robinson, Minh Quan Ho)
+- Added padding to `thrcomm_t` fields to avoid false sharing of cache lines. (Leick Robinson)
+- Rewrote/fixed broken tree barrier implementation. (Leick Robinson)
+- Refactored some `rntm_t` management code. (Field Van Zee, Devin Matthews)
+- Initialize `rntm_t` nt/ways fields with 1 (not -1). (Field Van Zee, Jeff Diamond, Leick Robinson, Devin Matthews)
+- Defined `invscalv`, `invscalm`, `invscald` operations.
+- Added consistent `NaN`/`Inf` handling in `sumsqv`. (Devin Matthews)
+- Implemented support for HPX as a threading backend option. (Christopher Taylor, Srinivas Yadav)
+- Relocated the pba, sba pool (from the `rntm_t`), and `mem_t` (from the `cntl_t`) to the `thrinfo_t` object.
+- Modified which communicator is associated with a given node of the `thrinfo_t` tree. (Devin Matthews)
+- Refactored level-3 thread decorator into two parts: a thread launcher and a function to pass operands. (Devin Matthews)
+- Refactored strucure awareness in `bli_packm_blk_var1.c`. (Devin Matthews)
+- Reimplemented `bli_l3_determine_kc()`. (Devin Matthews)
+- Implemented `cntx_t` pointer caching in gks. (Field Van Zee, Harihara Sudhan S)
+- Added `const` keyword to pointers in kernel APIs. (Field Van Zee, Nisanth M P)
+- Migrated all kernel APIs to use `void*` pointers.
+- Defined new global scalar constants: `BLIS_ONE_I`, `BLIS_MINUS_ONE_I`, `BLIS_NAN`. (Devin Matthews)
+- Disabled modification of KC in the `gemmsup` kernels. (Devin Matthews)
+- Defined `lt`, `lte`, `gt`, `gte` operations and other miscellaneous updates.
+- Consolidated `INSERT_` macro sets via variadic macros. (Devin Matthews)
+- De-templatized macrokernels for `gemmt`, `trmm`, and `trsm` to match that of `gemm`. (Devin Matthews)
+- De-templatized `bli_l3_sup_var1n2m.c` and unified `_sup_packm_a/b()`. (Devin Matthews)
+- Fixed 1m enablement for `herk`/`her2k`/`syrk`/`syr2k`. (Devin Matthews)
+- Fixed `trmm[3]`/`trsm` performance bug introduced in `cf7d616`. (Field Van Zee, Leick Robinson)
+- Fixed a 1m optimization bug in right-sided `hemm`/`symm`. (Field Van Zee, Nisanth M P)
+- Fixed a bug in sup threshold registration. (Devin Matthews, Field Van Zee)
+- Fixed brokenness in the small block allocator (sba) when the sba is disabled. (Field Van Zee, John Mather)
+- Fixed type bug in `bli_cntx_set_ukr_prefs()`. (Field Van Zee, Leick Robinson, Devin Matthews, Jeff Diamond)
+- Fixed incorrect `sizeof(type)` in edge case macros. (@moon-chilled)
+- Fixed bugs and added sanity check in `bli_pool.c`. (Devin Matthews)
+- Fixed a typo in the macro defintion for `VEXTRACTF64X2` in `bli_x86_asm_macros.h`. (Harsh Dave)
+- Fixed a typo in `bli_type_defs.h` where `BLIS_BLAS_INT_TYPE_SIZE` was misspelled. (Devin Matthews)
+- Typecast `printf()` args in `bli_thread_range_tlb.c` to avoid compiler warnings. (Lee Killough)
+- Minor tweaks to `bli_l3_check.c`.
+- Partial addition of `const` to all interfaces above the (micro)kernels. (Devin Matthews)
+- Fixed a harmless misspelling of `xpbys` in gemm macrokernel.
+- Various internal API renaming/reorganization.
+- Various other fixes.
+
+Compatibility:
+- Implemented `[cz]symv_()`, `[cz]syr_()`, `[cz]rot_()`. (Field Van Zee, James Foster)
+- Fixed compilation errors when `BLIS_DISABLE_BLAS_DEFS` is defined. (Field Van Zee, Edward Smyth, Devin Matthews)
+- Include `bli_config.h` before `bli_system.h` in `cblas.h` so that `BLIS_ENABLE_SYSTEM` is defined in time for proper OS detection. (Edward Smyth)
+
+Kernels:
+- Updated ARMv8a kernels to fix two prefetching issues and re-enable general stride IO. (Jeff Diamond)
+- Restored general storage case to `armsve` kernels. (RuQing Xu)
+- Added arm64 `dgemmsup` with extended MR and NR. (RuQing Xu)
+- Reorganized the way `packm` kernels are stored within the `cntx_t` so that BLIS only stores two `packm` kernels per datatype: one for MRxk upanels and one for kxNR upanels. (Devin Matthews)
+- Fixed bugs in `scal2v` reference kernel when alpha == 1.
+- Fixed out-of-bounds read in `haswell` `gemmsup` kernels. (Daniël de Kok, Bhaskar Nallani, Madeesh Kannan)
+- Fixed k = 0 edge case in `power10` microkernels. (Nisanth M P)
+- Disabled `power10` kernels other than `sgemm`, `dgemm`. (Nisanth M P)
+- Fixed `bli_gemm_small()` prototype mismatch. (Jeff Diamond)
+
+Extras:
+- Use the conventional level-3 sup thread decorator within the `gemmlike` sandbox.
+- Fixed type-mismatch errors in `power10` sandbox. (Nisanth M P)
+- Fixed `gemmlike` sandbox bug that stems from reuse of `bli_thrinfo_sup_grow()`.
+
+Build system:
+- Added two arm64 subconfigs: `altra` and `altramax`. (Jeff Diamond, Leick Robinson)
+- Added support for RISC-V configuration targets. (Angelika Schwarz, Lee Killough)
+- Auto-detect the RISC-V ABI of the compiler and use `-mabi=` during RISC-V builds. (Lee Killough)
+- Added `sifive_x280` subconfig and kernel set. (Aaron Hutchinson, Lee Killough, Devin Matthews, and Angelika Schwarz)
+- Added AddressSanitizer (--enable-asan) option to `configure`. (Devin Matthews)
+- Added option to disable thread-local storage via `--disable-tls`. (Field Van Zee, Nick Knight)
+- Exclude `-lrt` on Android with Bionic libraries. (Lee Killough)
+- Omit `-fPIC` option when shared library build is disabled. (Field Van Zee, Nick Knight)
+- Move `-fPIC` option insertion to subconfigs' `make_defs.mk` files. (Field Van Zee, Nick Knight)
+- Install one-line helper headers to `INCDIR` prefix so that user can `#include "blis.h"` instead of `#include <blis/blis.h>` and/or `"cblas.h"` instead of `<blis/cblas.h>` if CBLAS is enabled). (Field Van Zee, Jed Brown, Devin Matthews, Mo Zhou)
+- Enhanced detection of Fortran compiler when checking the version string for the purposes of determining a default return convention for complex domain values. (Bart Oldeman)
+- Added detection of the NVIDIA nvhpc compiler (`nvc`) in `configure`. (Ajay Panyala)
+- Updated `zen3` subconfig to support NVHPC compilers. (Abhishek Bagusetty)
+- Use kernel CFLAGS for `kernels` subdirs in addons. (AMD, Mithun Mohan)
+- Created `power` umbrella configuration family (which currently includes `power9` and `power10` subconfigs). (Nisanth M P)
+- Defined `BLIS_VERSION_STRING` in `blis.h` instead of via command line argument during compilation. (Field Van Zee, Mohsen Aznaveh, Tim Davis)
+- Rewrote `regen-symbols.sh` as `gen-libblis-symbols.sh`. (Field Van Zee)
+- Support `clang` targetting MinGW. (Isuru Fernando)
+- Added autodetection (via `/proc/cpuinfo`) for POWER7, POWER9 and POWER10 microarchitectures. (Alexander Grund)
+- Added `#line` directives to flattened `blis.h` to facilitate easier debugging. (Devin Matthews)
+- Added `--nosup` and `--sup` shorthand options to `configure`.
+- Use here-document syntax for `configure --help` output. (Lee Killough)
+- Updated `configure` to pass all `shellcheck` checks. (Lee Killough)
+- Tweaks to `.dir-locals.el` to enchance emacs formatting of C files. (Lee Killough)
+- Removed buggy cruft from `power10` subconfig. (Field Van Zee, Nicholai Tukanov)
+- Added missing `#include <io.h>` for Windows. (@h-vetinari)
+- Fixed hardware auto-detection for `firestorm` (Apple M1) subconfig. (Devin Matthews)
+- Fixed bug in detection of Fortran compiler vendor. (Devin Matthews)
+- Fixed version check for `znver3`, which needs gcc >= 10.3. (Jed Brown)
+- Fixed typo in `configure --help` text. (Lee Killough)
+- Fixed warning about regular expressions with stray backslashes as the result of recent changes to `grep`.
+- Added `output.testsuite` to `.gitignore`.
+- Minor changes to .gitignore and LICENSE files. (Jeff Diamond)
+- Minor decluttering of top-level directory.
+- Very minor tweaks to common.mk.
+
+Testing:
+- Rewrote `test/3` drivers to take parameters via command line arguments. (Field Van Zee, Jeff Diamond, Leick Robinson)
+- Added `arm64` entry to `.travis.yml` so that Travis CI will compile/test ARM builds. (Field Van Zee, RuQing Xu)
+- Test the `gemmlike` sandbox via AppVeyor. (Jeff Diamond)
+- Added `-q` quiet mode option to testsuite.
+- Fixed non-deterministic segfault in standalone `test/3` drivers. (Field Van Zee, Leick Robinson)
+- Fixed a crash that occurs when either `cblat1` or `zblat1` are linked with a build of BLIS that was compiled with `--complex-return=intel`. (Bart Oldeman)
+- Other minor fixes/tweaks.
+
+Documentation:
+- Added Discord documentation (`docs/Discord.md`) and logo to `README.md`.
+- Added the `mm_algorithm` files (for bp and pb) to `docs/diagrams`.
+- Added mention of Wilkinson Prize to `README.md`.
+- Minor fixes and improvements to `docs/Multithreading.md`.
+- Fix typos in docs + example code comments. (Igor Zhuravlov)
+- Fixed broken "tagged releases" link in `README.md`.
+- Added SMU citation to `README.md` intro.
+
 ## Changes in 0.9.0
 April 1, 2022
 

From 01e151a9658cbe07ee0cac8b03fa13fef26df19e Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Mon, 6 May 2024 15:37:27 -0500
Subject: [PATCH 192/230] Updated RELEASING file; fixes to ReleaseNotes.md.

Details:
- Updated RELEASING file to reflect new release protocols, given the
  more sophisticated policy of maintaining release candidate branches
  separate from 'master' (which is now more akin to a development
  branch). Further refinements to this file will likely follow.
- Fixed typos in ReleaseNotes.md. Thanks to Robert van de Geijn for
  reporting these.
---
 RELEASING            | 113 ++++++++++++++++++++++++++++++++++++++-----
 docs/ReleaseNotes.md |   6 +--
 2 files changed, 103 insertions(+), 16 deletions(-)

diff --git a/RELEASING b/RELEASING
index 0996a560d..85de229c9 100644
--- a/RELEASING
+++ b/RELEASING
@@ -1,3 +1,65 @@
+Here are the steps to follow to create a new release candidate of BLIS:
+
+If you're creating a new release candidate lineage -- that is, the *first*
+release candidate for a new version (i.e., 2.0-rc0):
+
+1. Use the build/start-new-rc.sh script to create a new rc branch.
+
+   $ ./build/start-new-rc.sh "2.0"
+
+   This will update the version file in BLIS to reflect the new version
+   string (in this case, "2.0"). It also refreshes the contents of the
+   CHANGELOG file with the output of 'git log'. Finally, it creates a
+   new "-rc0" branch (in this case, "2.0-rc0").
+
+   NOTE: This script assumes that you want the new rc branch to be
+   a descendant of the head of 'master'.
+
+2. Make sure the script did what it was supposed to do by inspecting
+   the output of 'git log' and 'git branch'. If everything looks good,
+   you can push the changes via:
+
+   $ git checkout master
+   $ git push
+   $ git push -u origin 2.0-rc0
+
+   At this point, the new release candidate branch is live at origin.
+
+If you're creating a new release candidate for an existing lineage
+-- that is, a follow-up release candidate for a new version that already
+has one or more release candidates -- start by checking out the latest
+release candidate, for example:
+
+1. Start by checking out the latest release candidate:
+
+   $ git checkout 2.0-rc1
+
+2. Then create a new release candidate branch whose name increments the
+   "rc" number:
+
+   $ git checkout -b 2.0-rc2
+
+3. Then cherry-pick one or more bugfixes that were made to 'master':
+
+   $ git cherry-pick -nx <commit>
+
+4. Finally, commit the changes. Be sure to include lines in the commit
+   log entry for each cherry-picked commit that note the commit hash
+   of the *original* commit that is being cherry-picked from. Example:
+
+     Fixed a bug in blahblahblah. (#777)
+    
+     Details:
+     - Fixed a bug in blahblahblah that manifested as blahblahblah. This
+       bug was introduced in commit abc12345. Thanks to John Smith for
+       reporting this bug.
+     - (cherry picked from commit abc0123456789abc0123456789abc0123456789a)
+
+   Note the final line, which was *not* present in the original commit
+   log entry but *should be* present in the commit log entry for the
+   commit that cherry-picks to (in this example) 2.0-rc2.
+
+
 Here are the steps to follow to create a new release (version) of BLIS:
 
 1. Make sure there are no commits that have yet to be pulled into
@@ -7,42 +69,67 @@ Here are the steps to follow to create a new release (version) of BLIS:
 
    If there are any commits upstream, merge them as appropriate.
 
-2. Consider whether the so_version should be updated (via the so_version
+2. Check out the latest release candidate:
+
+   $ git checkout 2.0-rc2
+
+3. Consider whether the so_version should be updated (via the so_version
    file in the 'build' directory) due to any ABI changes since the previous
    version. If so, commit that change now.
 
-3. Verify that the code builds properly.
+4. Verify that the code builds properly.
 
    $ ./configure auto; make
 
-4. Verify that the code passes BLIS and BLAS tests:
+5. Verify that the code passes BLIS and BLAS tests:
 
    $ make check           # BLIS testsuite (fast) + BLAS test drivers
    $ make checkblis       # BLIS testsuite (full ex. mixed-datatype)
    $ make checkblis-md    # BLIS testsuite (mixed-datatype only)
    $ make checkblis-salt  # BLIS testsuite (fast + salt)
 
-5. Draft a new announcement to blis-devel, crediting those who
+6. Draft a new announcement to blis-devel, crediting those who
    contributed towards this version by browsing 'git log'.
 
-6. Update CREDITS file if 'git log' reveals any new contributors.
+7. Update CREDITS file if 'git log' reveals any new contributors.
+   NOTE: This should have already been done prior to the rc cycle.
 
-7. Update docs/ReleaseNotes.md file with body of finalized announcement
+8. Update docs/ReleaseNotes.md file with body of finalized announcement
    and the date of the release.
+   NOTE: This should be a cherry-pick off of 'master' since the release notes
+   need to be committed there anyway as well.
 
-8. Commit changes from steps 5 and 6.
+9. Commit changes from steps 7 and 8.
 
-9. Bump the version number:
+10. Create a final release branch:
 
-   $ ./build/bump-version.sh "0.3.2"
+   $ git checkout -b 2.0-final
 
-   This will result in two new commits: a version file update and a CHANGELOG
-   file update.
+   This will help identify the release commit when browsing the GitHub network
+   graph, since tags don't show up there.
 
-10. Push the new commits and new tag associated with the new version:
+11. Tag the final release commit:
+
+   $ git tag 2.0 2.0-final
+
+   You can also use the actual commit hash instead of "2.0-final".
+
+12. Push the new commits and new tag associated with the new version:
 
    $ git push
    $ git push --tag
 
-11. Send finalized announcement to blis-devel.
+13. Publish a new release via GitHub (https://github.com/flame/blis/releases).
+    Identify the new version by the tag you just created and pushed. You can
+    also identify the previous release.
+
+    Try to use formatting consistent with the prior release. (You can start to
+    edit the previous release, inspect/copy some of the markdown syntax, and
+    then abort the edit.)
+
+14. Update the Wikipedia entry for BLIS to reflect the new latest version.
+
+15. Announce the release on Discord.
+
+16. Send finalized announcement to blis-devel.
 
diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md
index 1c2d5f4d2..9a667f80e 100644
--- a/docs/ReleaseNotes.md
+++ b/docs/ReleaseNotes.md
@@ -61,7 +61,7 @@ Framework:
 - Relocated the pba, sba pool (from the `rntm_t`), and `mem_t` (from the `cntl_t`) to the `thrinfo_t` object.
 - Modified which communicator is associated with a given node of the `thrinfo_t` tree. (Devin Matthews)
 - Refactored level-3 thread decorator into two parts: a thread launcher and a function to pass operands. (Devin Matthews)
-- Refactored strucure awareness in `bli_packm_blk_var1.c`. (Devin Matthews)
+- Refactored structure awareness in `bli_packm_blk_var1.c`. (Devin Matthews)
 - Reimplemented `bli_l3_determine_kc()`. (Devin Matthews)
 - Implemented `cntx_t` pointer caching in gks. (Field Van Zee, Harihara Sudhan S)
 - Added `const` keyword to pointers in kernel APIs. (Field Van Zee, Nisanth M P)
@@ -80,7 +80,7 @@ Framework:
 - Fixed type bug in `bli_cntx_set_ukr_prefs()`. (Field Van Zee, Leick Robinson, Devin Matthews, Jeff Diamond)
 - Fixed incorrect `sizeof(type)` in edge case macros. (@moon-chilled)
 - Fixed bugs and added sanity check in `bli_pool.c`. (Devin Matthews)
-- Fixed a typo in the macro defintion for `VEXTRACTF64X2` in `bli_x86_asm_macros.h`. (Harsh Dave)
+- Fixed a typo in the macro definition for `VEXTRACTF64X2` in `bli_x86_asm_macros.h`. (Harsh Dave)
 - Fixed a typo in `bli_type_defs.h` where `BLIS_BLAS_INT_TYPE_SIZE` was misspelled. (Devin Matthews)
 - Typecast `printf()` args in `bli_thread_range_tlb.c` to avoid compiler warnings. (Lee Killough)
 - Minor tweaks to `bli_l3_check.c`.
@@ -134,7 +134,7 @@ Build system:
 - Added `--nosup` and `--sup` shorthand options to `configure`.
 - Use here-document syntax for `configure --help` output. (Lee Killough)
 - Updated `configure` to pass all `shellcheck` checks. (Lee Killough)
-- Tweaks to `.dir-locals.el` to enchance emacs formatting of C files. (Lee Killough)
+- Tweaks to `.dir-locals.el` to enhance emacs formatting of C files. (Lee Killough)
 - Removed buggy cruft from `power10` subconfig. (Field Van Zee, Nicholai Tukanov)
 - Added missing `#include <io.h>` for Windows. (@h-vetinari)
 - Fixed hardware auto-detection for `firestorm` (Apple M1) subconfig. (Devin Matthews)

From 6d0ab74f6975fdf4d19cee06d946b09b6ca89656 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Mon, 6 May 2024 16:02:03 -0500
Subject: [PATCH 193/230] Updates to README.md section on downloading.

Details:
- Updated the text in README.md in the "How to Download BLIS" section.
  The new text no longer recommends that the reader use the 'master'
  branch over official releases, as the previous text did. The text was
  tweaked since (a) the 'master' branch is now akin to a development
  branch, and (b) the reader will no longer forgo bugfixes by sticking
  to official releases since we will (going forward) publish bugfix
  releases for the most recent version.
---
 README.md | 50 ++++++++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index 68c937f52..05104b976 100644
--- a/README.md
+++ b/README.md
@@ -367,12 +367,12 @@ to your hardware.
 
 1. **Download a source repository with `git clone`.**
 Generally speaking, we prefer using `git clone` to clone a `git` repository.
-Having a repository allows the user to periodically pull in the latest changes
-and quickly rebuild BLIS whenever they wish. Also, implicit in cloning a
-repository is that the repository defaults to using the `master` branch, which
-contains the latest "stable" commits since the most recent release. (This is
-in contrast to Option 3 in which the user is opting for code that may be
-slightly out of date.)
+Having a repository allows the user to periodically pull in the latest changes,
+try out release candidates when they become available, switch to older versions
+easily, and quickly rebuild BLIS whenever they wish.
+(Note that implicit in cloning a repository is that the repository defaults to
+using the `master` branch, which, as of 1.0, is considered akin to a development
+branch and likely contains improvements since the most recent release.)
 
    In order to clone a `git` repository of BLIS, please obtain a repository
 URL by clicking on the green button above the file/directory listing near the
@@ -382,19 +382,33 @@ to executing the following command in your terminal shell:
    git clone https://github.com/flame/blis.git
    ```
    At this point, you will have the latest commit of the `master` branch
-checked out. If you wish to check out a particular version x.y.z, execute
-the following:
+checked out. If you wish to check out an official release version, say,
+1.0, execute the following:
    ```
-   git checkout x.y.z
+   git checkout 1.0
    ```
    `git` will then transform your working copy to match the state of the
-commit associated with version x.y.z. You can view a list of tags at any
-time by executing:
+commit associated with version 1.0. You can view a list of official
+versiontags at any time by executing:
    ```
    git tag --list
    ```
+   Note that pre-release versions, such as release candidates, are actually
+branches rather than tags, and thus will not show up in the list of tagged
+versions.
+
+2. **Download a source release via a tarball/zip file.**
+If you would like to stick to the code that is included in official releases
+and don't need the convenience of pulling in the latest changes via `git`, you
+may download either a tarball or zip file of BLIS's latest
+[release](https://github.com/flame/blis/releases). (NOTE: Some older releases
+are only available as [tagged](https://github.com/flame/blis/tags) commits.
+Also note that downloading release x.y.z is equivalent to downloading, or
+checking out, the `git` tag `x.y.z`.)
+We consider this option to be less than ideal for some people since you will
+not be able to update your code with a simple `git pull` command.
 
-2. **Download a source repository via a zip file.**
+3. **Download a source repository via a zip file.**
 If you are uncomfortable with using `git` but would still like the latest
 stable commits, we recommend that you download BLIS as a zip file.
 
@@ -402,18 +416,6 @@ stable commits, we recommend that you download BLIS as a zip file.
 click on the green button above the file listing near the top of this page.
 This should reveal a link for downloading the zip file.
 
-3. **Download a source release via a tarball/zip file.**
-Alternatively, if you would like to stick to the code that is included in
-official releases, you may download either a tarball or zip file of BLIS's
-latest [release](https://github.com/flame/blis/releases). Some older releases
-are only available as [tagged](https://github.com/flame/blis/tags) commits.
-(Note: downloading release x.y.z is equivalent to downloading, or checking out,
-tag `x.y.z`.)
-We consider this option to be less than ideal for most people since it will
-likely mean you miss out on the latest bugfix or feature commits (in contrast
-to Options 1 or 2), and you also will not be able to update your code with a
-simple `git pull` command (in contrast to Option 1).
-
 4. **Download a binary package specific to your OS.**
 While we don't recommend this as the first choice for most users, we provide
 links to community members who generously maintain BLIS packages for various

From 729c57c15aa50030145ff702626c31839ded3502 Mon Sep 17 00:00:00 2001
From: AngryLoki <AngryLoki@users.noreply.github.com>
Date: Wed, 5 Jun 2024 00:28:41 +0800
Subject: [PATCH 194/230] Fix SyntaxWarning messages from python 3.12 (#809)

Details:
- When using regexes in Python, certain characters need backslash escaping, e.g.:
  ```python
  regex = re.compile( '^[\s]*#include (["<])([\w\.\-/]*)([">])' )
  ```
  However, technically escape sequences like `\s` are not valid and should actually be double-escaped: `\\s`.
  Python 3.12 now warns about such escape sequences, and in a later version these warnings will be promoted
  to errors. See also: https://docs.python.org/dev/whatsnew/3.12.html#other-language-changes. The fix here
  is to use Python's "raw strings" to avoid double-escaping. This issue can be checked for all files in the current
  directory with the command `python -m compileall -d . -f -q .`
- Thanks to @AngryLoki for the fix.
---
 build/add-copyright.py   | 16 ++++++++--------
 build/flatten-headers.py |  8 ++++----
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/build/add-copyright.py b/build/add-copyright.py
index e22ebd16c..ae331c94e 100755
--- a/build/add-copyright.py
+++ b/build/add-copyright.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 #
-#  BLIS    
+#  BLIS
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
 #
@@ -200,7 +200,7 @@ def main():
 		file_string = "".join( file_lines )
 
 		# Search for an existing copyright line.
-		has_cr = re.search( 'Copyright \(C\)', file_string )
+		has_cr = re.search( r'Copyright \(C\)', file_string )
 
 		# If the file does not have any copyright notice in it already, we
 		# assume we don't need to update it.
@@ -210,7 +210,7 @@ def main():
 
 		# Check whether the file already has a copyright for the_org. We may
 		# need to use this information later.
-		has_org_cr = re.search( 'Copyright \(C\) ([0-9][0-9][0-9][0-9]), %s' % the_org, file_string )
+		has_org_cr = re.search( r'Copyright \(C\) ([0-9][0-9][0-9][0-9]), %s' % the_org, file_string )
 
 		# Initialize the list of processed (potentially modified) file lines.
 		mod_file_lines = []
@@ -225,7 +225,7 @@ def main():
 			# Iterate through the lines in the current file.
 			for line in file_lines:
 
-				result = re.search( 'Copyright \(C\) ([0-9][0-9][0-9][0-9]), %s' % the_org, line )
+				result = re.search( r'Copyright \(C\) ([0-9][0-9][0-9][0-9]), %s' % the_org, line )
 
 				# If the current line matches a copyright line for the_org...
 				if result:
@@ -253,7 +253,7 @@ def main():
 
 						# Add the unchanged line to the running list.
 						mod_file_lines += line
-							
+
 				else:
 					# Add the unchanged line to the running list.
 					mod_file_lines += line
@@ -284,8 +284,8 @@ def main():
 					line_next = file_lines[i]
 
 				# Try to match both the current line and the next line.
-				result  = re.search( 'Copyright \(C\) ([0-9][0-9][0-9][0-9]), (.*)', line )
-				resnext = re.search( 'Copyright \(C\) ([0-9][0-9][0-9][0-9]), (.*)', line_next )
+				result  = re.search( r'Copyright \(C\) ([0-9][0-9][0-9][0-9]), (.*)', line )
+				resnext = re.search( r'Copyright \(C\) ([0-9][0-9][0-9][0-9]), (.*)', line_next )
 
 				# Parse the results.
 				if result:
@@ -301,7 +301,7 @@ def main():
 						# The current line matches but the next does not. Thus,
 						# this branch only executes for the *last* copyright line
 						# in the file.
-						
+
 						# Extract the year and organization from the matched
 						# string.
 						old_year = result.group(1)
diff --git a/build/flatten-headers.py b/build/flatten-headers.py
index 2d5b74c7a..2ce5b6c0c 100755
--- a/build/flatten-headers.py
+++ b/build/flatten-headers.py
@@ -110,7 +110,7 @@ def print_usage():
 
 def canonicalize_ws( s ):
 
-	return re.sub( '\s+', ' ', s ).strip()
+	return re.sub( r'\s+', ' ', s ).strip()
 
 # ---
 
@@ -166,7 +166,7 @@ def list_contains_header( items ):
 	rval = False
 	for item in items:
 
-		is_h = re.search( "\.h", item )
+		is_h = re.search( r"\.h", item )
 
 		if is_h:
 			rval = True
@@ -198,7 +198,7 @@ def get_header_path( filename, header_dirpaths ):
 
 def strip_cstyle_comments( string ):
 
-	return re.sub( "/\*.*?\*/", "", string, flags=re.S )
+	return re.sub( r"/\*.*?\*/", "", string, flags=re.S )
 
 # ------------------------------------------------------------------------------
 
@@ -527,7 +527,7 @@ def main():
 	# Precompile the main regular expression used to isolate #include
 	# directives and the headers they reference. This regex object will
 	# get reused over and over again in flatten_header().
-	regex = re.compile( '^[\s]*#include (["<])([\w\.\-/]*)([">])' )
+	regex = re.compile( r'^[\s]*#include (["<])([\w\.\-/]*)([">])' )
 
 	# Recursively substitute headers for occurrences of #include directives.
 	final_string = flatten_header( inputfile, header_dirpaths, nestsp )

From 5cbec6503de335b3b63fa5d4f388fddd3aff2b61 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Tue, 4 Jun 2024 11:30:22 -0500
Subject: [PATCH 195/230] Update CREDITS

---
 CREDITS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CREDITS b/CREDITS
index bca98dadd..327c82a13 100644
--- a/CREDITS
+++ b/CREDITS
@@ -142,6 +142,7 @@ but many others have contributed code, ideas, and feedback, including
   Stefano Zampini          @stefanozampini
   M. Zhou                  @cdluminate
   Igor Zhuravlov           @jip                       (Far Eastern Federal University)
+                           @AngryLoki
 
 BLIS's development was partially funded by grants from industry
 partners, including

From 415893066e966159799d96166cadcf9bb5535b1c Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Tue, 18 Jun 2024 22:03:32 -0500
Subject: [PATCH 196/230] Add ScaLAPACK compatibility mode. (#813)

Details:
- Add configure options `--enable-scalapack-compat` and `--disable-scalapack-compat`
  (default disabled).
- Add a macro `BLIS_{ENABLE,DISABLE}_SCALAPACK_COMPAT` to bli_config.h.
- This option and macro control any changes to the API necessary to maintain
  compatibility with ScaLAPACK. Currently, this only means disabling the complex
  versions of `syr`, `syr2`, and `symv`. In the future, other changes could be
  controlled by the same flag.
- Complex `syr2` wasn't enabled at the same time that complex `syr` and `symv` were.
  This is now corrected.
---
 build/bli_config.h.in   |  6 ++++++
 configure               | 22 ++++++++++++++++++++++
 frame/compat/bla_symv.c |  6 ++++++
 frame/compat/bla_symv.h |  6 ++++++
 frame/compat/bla_syr.c  |  6 ++++++
 frame/compat/bla_syr.h  |  6 ++++++
 frame/compat/bla_syr2.c |  8 +++++++-
 frame/compat/bla_syr2.h |  8 +++++++-
 8 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/build/bli_config.h.in b/build/bli_config.h.in
index 634ad0a42..eae30970b 100644
--- a/build/bli_config.h.in
+++ b/build/bli_config.h.in
@@ -110,6 +110,12 @@
 #define BLIS_DISABLE_MEM_TRACING
 #endif
 
+#if @scalapack_compat@
+#define BLIS_ENABLE_SCALAPACK_COMPAT
+#else
+#define BLIS_DISABLE_SCALAPACK_COMPAT
+#endif
+
 #if @int_type_size@ == 64
 #define BLIS_INT_TYPE_SIZE 64
 #elif @int_type_size@ == 32
diff --git a/configure b/configure
index dc6817f7d..83329e7cb 100755
--- a/configure
+++ b/configure
@@ -303,6 +303,12 @@ print_usage()
                  which are determined by the BLIS subconfiguration used at
                  runtime.) By default, these customized files are disabled.
 
+   --enable-scalapack-compat, --disable-scalapack-compat
+
+                 Enable strict compatibility with ScaLAPACK, which may
+                 requiring disabling certain conflicting functionality
+                 available throught the BLAS and/or CBLAS interfaces.
+
    -a NAME --enable-addon=NAME
 
                  Enable the code provided by an addon. An addon consists
@@ -3011,6 +3017,7 @@ blis_main()
 	enable_trsm_preinversion='yes'
 	force_version='no'
 	complex_return='default'
+	scalapack_compat='no'
 
 	# The addon flag and names.
 	addon_flag=''
@@ -3265,6 +3272,13 @@ blis_main()
 							enable_amd_frame_tweaks='no'
 							;;
 
+						enable-scalapack-compat)
+							scalapack_compat='yes'
+							;;
+						disable-scalapack-compat)
+							scalapack_compat='no'
+							;;
+
 						with-memkind)
 							enable_memkind='yes'
 							;;
@@ -3933,6 +3947,13 @@ blis_main()
 		echo "${script_name}: memory tracing output is disabled."
 		enable_mem_tracing_01=0
 	fi
+	if [[ ${scalapack_compat} = yes ]]; then
+		echo "${script_name}: ScaLAPACK compatibility is enabled."
+		scalapack_compat_01=1
+	else
+		echo "${script_name}: ScaLAPACK compatibility is disabled."
+		scalapack_compat_01=0
+	fi
 	if [[ ${has_memkind} = yes ]]; then
 		if [[ -z ${enable_memkind} ]]; then
 			# If no explicit option was given for libmemkind one way or the other,
@@ -4311,6 +4332,7 @@ blis_main()
 	add_config_var complex_return_intel      complex_return_intel01
 	add_config_var addon_list_includes
 	add_config_var enable_addons             enable_addons_01
+	add_config_var scalapack_compat          scalapack_compat_01
 
 	generate_config_file "${config_mk_in_path}"    "${config_mk_out_path}"
 	generate_config_file "${bli_config_h_in_path}" "${bli_config_h_out_path}"
diff --git a/frame/compat/bla_symv.c b/frame/compat/bla_symv.c
index f13514b01..b0313415b 100644
--- a/frame/compat/bla_symv.c
+++ b/frame/compat/bla_symv.c
@@ -39,6 +39,8 @@
 // Define BLAS-to-BLIS interfaces.
 //
 #undef  GENTFUNC
+#undef  GENTFUNCRO
+#define GENTFUNCRO GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
 void PASTEF77(ch,blasname) \
@@ -110,6 +112,10 @@ void PASTEF77(ch,blasname) \
 }
 
 #ifdef BLIS_ENABLE_BLAS
+#ifdef BLIS_ENABLE_SCALAPACK_COMPAT
+INSERT_GENTFUNCRO_BLAS( symv, symv )
+#else
 INSERT_GENTFUNC_BLAS( symv, symv )
 #endif
+#endif
 
diff --git a/frame/compat/bla_symv.h b/frame/compat/bla_symv.h
index 4f453a7a3..8c2992bb0 100644
--- a/frame/compat/bla_symv.h
+++ b/frame/compat/bla_symv.h
@@ -38,6 +38,8 @@
 // Prototype BLAS-to-BLIS interfaces.
 //
 #undef  GENTPROT
+#undef  GENTPROTRO
+#define GENTPROTRO GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
@@ -52,8 +54,12 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
      );
 
 #ifdef BLIS_ENABLE_BLAS
+#ifdef BLIS_ENABLE_SCALAPACK_COMPAT
+INSERT_GENTPROTRO_BLAS( symv )
+#else
 INSERT_GENTPROT_BLAS( symv )
 #endif
+#endif
 
 #endif
 
diff --git a/frame/compat/bla_syr.c b/frame/compat/bla_syr.c
index 0b4877a5c..aeceaa225 100644
--- a/frame/compat/bla_syr.c
+++ b/frame/compat/bla_syr.c
@@ -39,6 +39,8 @@
 // Define BLAS-to-BLIS interfaces.
 //
 #undef  GENTFUNC
+#undef  GENTFUNCRO
+#define GENTFUNCRO GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
 void PASTEF77(ch,blasname) \
@@ -101,6 +103,10 @@ void PASTEF77(ch,blasname) \
 }
 
 #ifdef BLIS_ENABLE_BLAS
+#ifdef BLIS_ENABLE_SCALAPACK_COMPAT
+INSERT_GENTFUNCRO_BLAS( syr, syr )
+#else
 INSERT_GENTFUNC_BLAS( syr, syr )
 #endif
+#endif
 
diff --git a/frame/compat/bla_syr.h b/frame/compat/bla_syr.h
index 7f3eeb367..3f15502de 100644
--- a/frame/compat/bla_syr.h
+++ b/frame/compat/bla_syr.h
@@ -38,6 +38,8 @@
 // Prototype BLAS-to-BLIS interfaces.
 //
 #undef  GENTPROT
+#undef  GENTPROTRO
+#define GENTPROTRO GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
@@ -50,8 +52,12 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
      );
 
 #ifdef BLIS_ENABLE_BLAS
+#ifdef BLIS_ENABLE_SCALAPACK_COMPAT
+INSERT_GENTPROTRO_BLAS( syr )
+#else
 INSERT_GENTPROT_BLAS( syr )
 #endif
+#endif
 
 #endif
 
diff --git a/frame/compat/bla_syr2.c b/frame/compat/bla_syr2.c
index e34fb9003..66de19221 100644
--- a/frame/compat/bla_syr2.c
+++ b/frame/compat/bla_syr2.c
@@ -38,8 +38,10 @@
 //
 // Define BLAS-to-BLIS interfaces.
 //
+#undef  GENTFUNC
 #undef  GENTFUNCRO
-#define GENTFUNCRO( ftype, ch, blasname, blisname ) \
+#define GENTFUNCRO GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
 void PASTEF77(ch,blasname) \
      ( \
@@ -109,6 +111,10 @@ void PASTEF77(ch,blasname) \
 }
 
 #ifdef BLIS_ENABLE_BLAS
+#ifdef BLIS_ENABLE_SCALAPACK_COMPAT
 INSERT_GENTFUNCRO_BLAS( syr2, syr2 )
+#else
+INSERT_GENTFUNC_BLAS( syr2, syr2 )
+#endif
 #endif
 
diff --git a/frame/compat/bla_syr2.h b/frame/compat/bla_syr2.h
index 1b44a669e..e7e0dc826 100644
--- a/frame/compat/bla_syr2.h
+++ b/frame/compat/bla_syr2.h
@@ -37,8 +37,10 @@
 //
 // Prototype BLAS-to-BLIS interfaces.
 //
+#undef  GENTPROT
 #undef  GENTPROTRO
-#define GENTPROTRO( ftype, ch, blasname ) \
+#define GENTPROTRO GENTPROT
+#define GENTPROT( ftype, ch, blasname ) \
 \
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
      ( \
@@ -51,7 +53,11 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
      );
 
 #ifdef BLIS_ENABLE_BLAS
+#ifdef BLIS_ENABLE_SCALAPACK_COMPAT
 INSERT_GENTPROTRO_BLAS( syr2 )
+#else
+INSERT_GENTPROT_BLAS( syr2 )
+#endif
 #endif
 
 #endif

From 31ecf820b9eb3368ad907ae6b192bf7397ebc92c Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Thu, 20 Jun 2024 18:23:23 -0500
Subject: [PATCH 197/230] Fix a bug in the piledriver microkernels. (#814)

Details:
- At some point, the piledriver (and bulldozer and excavator)
  microkernel tests via SDE had been removed from Travis CI testing.
  This PR re-enables them.
- A bug in the piledriver complex gemm microkernels has also been
  fixed. The `beta*C` product was not being correctly added to the `A*B`
  product before writing back out to memory.
- Fixes #811.
---
 .../piledriver/3/bli_gemm_piledriver_asm_d8x3.c  | 16 ++++++++--------
 travis/do_sde.sh                                 |  3 +--
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
index f11b4f68a..813425ce8 100644
--- a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
+++ b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
@@ -1325,12 +1325,12 @@ void bli_cgemm_piledriver_asm_4x2
 		vmulps(xmm6, xmm0, xmm0)
 		vmulps(xmm7, xmm1, xmm1)
 		vaddsubps(xmm1, xmm0, xmm0)
-		vaddps(xmm8, xmm0, xmm0)
+		vaddps(xmm8, xmm0, xmm8)
 
 		vmulps(xmm6, xmm2, xmm2)
 		vmulps(xmm7, xmm3, xmm3)
 		vaddsubps(xmm3, xmm2, xmm2)
-		vaddps(xmm12, xmm2, xmm2)
+		vaddps(xmm12, xmm2, xmm12)
 
 		vmovups(mem(r10), xmm0) // load c01:c11
 		vmovups(mem(r10, 16), xmm2) // load c21:c31
@@ -1340,12 +1340,12 @@ void bli_cgemm_piledriver_asm_4x2
 		vmulps(xmm6, xmm0, xmm0)
 		vmulps(xmm7, xmm1, xmm1)
 		vaddsubps(xmm1, xmm0, xmm0)
-		vaddps(xmm10, xmm0, xmm0)
+		vaddps(xmm10, xmm0, xmm10)
 
 		vmulps(xmm6, xmm2, xmm2)
 		vmulps(xmm7, xmm3, xmm3)
 		vaddsubps(xmm3, xmm2, xmm2)
-		vaddps(xmm14, xmm2, xmm2)
+		vaddps(xmm14, xmm2, xmm14)
 
 		 // fall through
 
@@ -1737,12 +1737,12 @@ void bli_zgemm_piledriver_asm_2x2
 		vmulpd(xmm6, xmm0, xmm0)
 		vmulpd(xmm7, xmm1, xmm1)
 		vaddsubpd(xmm1, xmm0, xmm0)
-		vaddpd(xmm8, xmm0, xmm0)
+		vaddpd(xmm8, xmm0, xmm8)
 
 		vmulpd(xmm6, xmm2, xmm2)
 		vmulpd(xmm7, xmm3, xmm3)
 		vaddsubpd(xmm3, xmm2, xmm2)
-		vaddpd(xmm12, xmm2, xmm2)
+		vaddpd(xmm12, xmm2, xmm12)
 
 		vmovups(mem(r10), xmm0) // load c01
 		vmovups(mem(r10, 16), xmm2) // load c11
@@ -1752,12 +1752,12 @@ void bli_zgemm_piledriver_asm_2x2
 		vmulpd(xmm6, xmm0, xmm0)
 		vmulpd(xmm7, xmm1, xmm1)
 		vaddsubpd(xmm1, xmm0, xmm0)
-		vaddpd(xmm10, xmm0, xmm0)
+		vaddpd(xmm10, xmm0, xmm10)
 
 		vmulpd(xmm6, xmm2, xmm2)
 		vmulpd(xmm7, xmm3, xmm3)
 		vaddsubpd(xmm3, xmm2, xmm2)
-		vaddpd(xmm14, xmm2, xmm2)
+		vaddpd(xmm14, xmm2, xmm14)
 
 		 // fall through
 
diff --git a/travis/do_sde.sh b/travis/do_sde.sh
index de1545886..3c299e9b4 100755
--- a/travis/do_sde.sh
+++ b/travis/do_sde.sh
@@ -45,8 +45,7 @@ for LIB in $LD_SO $LIBC_SO $LIBM_SO; do
     sudo mv .tmp $LIB
 done
 
-#for ARCH in penryn sandybridge haswell skx knl piledriver steamroller excavator zen; do
-for ARCH in penryn sandybridge haswell skx knl zen zen2 zen3; do
+for ARCH in penryn sandybridge haswell skx knl piledriver steamroller excavator zen; do
     if [ "$ARCH" = "knl" ]; then
         $SDE -knl -- ./test_libblis.x > output.testsuite
     else

From 8820f8f91efd32e38e2995e73323656ef767bbd8 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Tue, 25 Jun 2024 22:56:23 -0500
Subject: [PATCH 198/230] Fixed typo in 4158930; variable renames. (#815)

Details:
- Fixed a typo in the "./configure --help" output for the ScaLAPACK
  compatibility option implemented in 4158930.
- Trivial variable renames.
---
 build/bli_config.h.in |  2 +-
 configure             | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/build/bli_config.h.in b/build/bli_config.h.in
index eae30970b..64b9d341c 100644
--- a/build/bli_config.h.in
+++ b/build/bli_config.h.in
@@ -110,7 +110,7 @@
 #define BLIS_DISABLE_MEM_TRACING
 #endif
 
-#if @scalapack_compat@
+#if @enable_scalapack_compat@
 #define BLIS_ENABLE_SCALAPACK_COMPAT
 #else
 #define BLIS_DISABLE_SCALAPACK_COMPAT
diff --git a/configure b/configure
index 83329e7cb..1ff10bed5 100755
--- a/configure
+++ b/configure
@@ -307,7 +307,7 @@ print_usage()
 
                  Enable strict compatibility with ScaLAPACK, which may
                  requiring disabling certain conflicting functionality
-                 available throught the BLAS and/or CBLAS interfaces.
+                 available through the BLAS and/or CBLAS interfaces.
 
    -a NAME --enable-addon=NAME
 
@@ -3015,9 +3015,9 @@ blis_main()
 	enable_amd_frame_tweaks='no'
 	enable_memkind='' # The default memkind value is determined later on.
 	enable_trsm_preinversion='yes'
+	enable_scalapack_compat='no'
 	force_version='no'
 	complex_return='default'
-	scalapack_compat='no'
 
 	# The addon flag and names.
 	addon_flag=''
@@ -3273,10 +3273,10 @@ blis_main()
 							;;
 
 						enable-scalapack-compat)
-							scalapack_compat='yes'
+							enable_scalapack_compat='yes'
 							;;
 						disable-scalapack-compat)
-							scalapack_compat='no'
+							enable_scalapack_compat='no'
 							;;
 
 						with-memkind)
@@ -3947,12 +3947,12 @@ blis_main()
 		echo "${script_name}: memory tracing output is disabled."
 		enable_mem_tracing_01=0
 	fi
-	if [[ ${scalapack_compat} = yes ]]; then
+	if [[ ${enable_scalapack_compat} = yes ]]; then
 		echo "${script_name}: ScaLAPACK compatibility is enabled."
-		scalapack_compat_01=1
+		enable_scalapack_compat_01=1
 	else
 		echo "${script_name}: ScaLAPACK compatibility is disabled."
-		scalapack_compat_01=0
+		enable_scalapack_compat_01=0
 	fi
 	if [[ ${has_memkind} = yes ]]; then
 		if [[ -z ${enable_memkind} ]]; then
@@ -4332,7 +4332,7 @@ blis_main()
 	add_config_var complex_return_intel      complex_return_intel01
 	add_config_var addon_list_includes
 	add_config_var enable_addons             enable_addons_01
-	add_config_var scalapack_compat          scalapack_compat_01
+	add_config_var enable_scalapack_compat   enable_scalapack_compat_01
 
 	generate_config_file "${config_mk_in_path}"    "${config_mk_out_path}"
 	generate_config_file "${bli_config_h_in_path}" "${bli_config_h_out_path}"

From a822cb2e22b7ac0c6aec4d477f93301ccf65a296 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Thu, 8 Aug 2024 13:34:37 -0500
Subject: [PATCH 199/230] Fixed out-of-bounds read bug in sup haswell ukr.
 (#824)

Details:
- Fixed a bug in the bli_sgemmsup_rd_haswell_asm_1x16n() millikernel.
  The kernel was erroneously performing an out-of-bounds read whenever
  the singleton edge case loop executed (that is, whenever the k
  dimension of the millikernel problem was not a multiple of 8). This
  OOB error was the result of a copy-paste bug; when developing the
  s1x16n function, I started from a copy of the s2x16n function, but
  then failed to delete the instruction that reads the second element
  of A in the code that handles the PR loop's edge case. Thanks to
  @j-bm for reporting this bug in Issue #821 and helping narrow down
  the cause to the rax register.
- CREDITS file update.
---
 CREDITS                                                   | 3 ++-
 kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/CREDITS b/CREDITS
index 327c82a13..f24f0f371 100644
--- a/CREDITS
+++ b/CREDITS
@@ -57,7 +57,7 @@ but many others have contributed code, ideas, and feedback, including
   Stefan Husmann           @stefanhusmann
   Aaron Hutchinson         @Aaron-Hutchinson          (SiFive)
   Francisco Igual          @figual                    (Universidad Complutense de Madrid)
-  John Mather              @jmather-sesi              (SideFX Software)
+                           @j-bm
   Madeesh Kannan           @shadeMe
   Tony Kelman              @tkelman
   Lee Killough             @leekillough               (Tactical Computing Labs)
@@ -77,6 +77,7 @@ but many others have contributed code, ideas, and feedback, including
   Giorgos Margaritis
   Bryan Marker             @bamarker                  (The University of Texas at Austin)
   Simon Lukas Märtens      @ACSimon33                 (RWTH Aachen University)
+  John Mather              @jmather-sesi              (SideFX Software)
   Devin Matthews           @devinamatthews            (The University of Texas at Austin)
   Stefanos Mavros          @smavros
   Mithun Mohan             @MithunMohanKadavil        (AMD)
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
index aa9361e5f..06eddca86 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
@@ -2211,7 +2211,6 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 	                                   // which would destory intermediate results.
 
 	vmovss(mem(rax       ), xmm0)
-	vmovss(mem(rax, r8, 1), xmm1)
 	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
 
 	vmovss(mem(rbx        ), xmm3)
@@ -2412,5 +2411,6 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 			#endif
 		}
 	}
+
 }
 

From 8d9be878b1a59aba401fd0d7b1b24c34526f0e81 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Thu, 8 Aug 2024 14:41:30 -0500
Subject: [PATCH 200/230] Flatten cblas.h immediately after blis.h. (#819)

Details:
- Previously, if the user enabled CBLAS via 'configure --enable-cblas'
  and then ran 'make', the flattened blis.h header file would be created
  immediately, but the flattened cblas.h header file would not be
  created until 'make install' was run. This was happening because
  nothing in the BLIS build process (except installation) depended on
  the flattened cblas.h (whereas *everything* depends on the flattened
  blis.h, and therefore it was being created first). This behavior can
  be confusing to application developers who could reasonably expect
  that the flattened cblas.h header would be available (to inspect or
  use) prior to running 'make install'.
- This commit fixes the aforementioned issue by (1) adding cblas.h (if
  CBLAS is enabled) as a dependency to all of the build rules for core
  framework object files, and (2) making the flattened blis.h a
  prerequisite for flattening cblas.h. The upshot is that (1) ensures
  that the flattened cblas.h is created around the same time that
  the flattened blis.h is created, and (2) ensures that the two headers
  are flattened sequentially (first blis.h and then cblas.h) even when
  using 'make -j[n]', which ensures that the output of the two processes
  does not commingle.
- Thanks to Jeff Diamond for reporting this issue.
---
 Makefile | 49 +++++++++++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 22 deletions(-)

diff --git a/Makefile b/Makefile
index 37f6766a0..c686fd12c 100644
--- a/Makefile
+++ b/Makefile
@@ -281,16 +281,17 @@ endif
 # --- Monolithic header definitions --------------------------------------------
 #
 
-# Define a list of headers to install. The default is to only install blis.h.
+# Define lists of headers to create/install. The default is to only
+# create/install blis.h.
+HEADERS_TO_BUILD        := $(BLIS_H_FLAT)
 HEADERS_TO_INSTALL      := $(BLIS_H_FLAT)
 
-# If CBLAS is enabled, we also install cblas.h. This allows the user to continue
-# using #include "cblas.h" in their application, if they wish. (NOTE: Even if we
-# didn't install cblas.h, the user could *still* access CBLAS definitions and
-# function prototypes, but they would have to update their source code to use
-# #include "blis.h" instead of #include "cblas.h" since the latter header file
-# would not exist.)
+# If CBLAS is enabled, we also create/install cblas.h. This allows the user to
+# continue using #include "cblas.h" in their application, if they wish. (NOTE:
+# The user can also access CBLAS definitions and function prototypes by
+# #include'ing "blis.h".)
 ifeq ($(MK_ENABLE_CBLAS),yes)
+HEADERS_TO_BUILD   += $(CBLAS_H_FLAT)
 HEADERS_TO_INSTALL += $(CBLAS_H_FLAT)
 endif
 
@@ -568,7 +569,11 @@ endif
 
 flat-cblas-header: check-env $(CBLAS_H_FLAT)
 
-$(CBLAS_H_FLAT): $(FRAME_H99_FILES)
+# Note that the flattened blis.h is a prerequisite of flattening cblas.h. This
+# is done so that the two headers are built sequentially even when using
+# 'make -j[n]'. Otherwise, the output from the two processes can become
+# interleaved, which looks awkward/confusing.
+$(CBLAS_H_FLAT): $(FRAME_H99_FILES) $(BLIS_H_FLAT)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(FLATTEN_H) -l -v1 $(CBLAS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)"
 else
@@ -587,7 +592,7 @@ endif
 # first argument: a configuration name from config_list, used to look up the
 # CFLAGS to use during compilation.
 define make-config-rule
-$(BASE_OBJ_CONFIG_PATH)/$(1)/%.o: $(CONFIG_PATH)/$(1)/%.c $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+$(BASE_OBJ_CONFIG_PATH)/$(1)/%.o: $(CONFIG_PATH)/$(1)/%.c $(HEADERS_TO_BUILD) $(MAKE_DEFS_MK_PATHS)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(CC) $(call get-config-cflags-for,$(1)) -c $$< -o $$@
 else
@@ -599,7 +604,7 @@ endef
 # first argument: a configuration name from the union of config_list and
 # config_name, used to look up the CFLAGS to use during compilation.
 define make-frame-rule
-$(BASE_OBJ_FRAME_PATH)/%.o: $(FRAME_PATH)/%.c $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+$(BASE_OBJ_FRAME_PATH)/%.o: $(FRAME_PATH)/%.c $(HEADERS_TO_BUILD) $(MAKE_DEFS_MK_PATHS)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(CC) $(call get-frame-cflags-for,$(1)) -c $$< -o $$@
 else
@@ -608,7 +613,7 @@ else
 endif
 
 ifneq ($(findstring hpx,$(THREADING_MODEL)),)
-$(BASE_OBJ_FRAME_PATH)/%.o: $(FRAME_PATH)/%.cpp $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+$(BASE_OBJ_FRAME_PATH)/%.o: $(FRAME_PATH)/%.cpp $(HEADERS_TO_BUILD) $(MAKE_DEFS_MK_PATHS)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(CXX) $(call get-frame-cxxflags-for,$(1)) -c $$< -o $$@
 else
@@ -620,7 +625,7 @@ endef
 
 # first argument: a kernel set (name) being targeted (e.g. haswell).
 define make-refinit-rule
-$(BASE_OBJ_REFKERN_PATH)/$(1)/bli_cntx_$(1)_ref.o: $(REFKERN_PATH)/bli_cntx_ref.c $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+$(BASE_OBJ_REFKERN_PATH)/$(1)/bli_cntx_$(1)_ref.o: $(REFKERN_PATH)/bli_cntx_ref.c $(HEADERS_TO_BUILD) $(MAKE_DEFS_MK_PATHS)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(CC) $(call get-refinit-cflags-for,$(1)) -c $$< -o $$@
 else
@@ -631,7 +636,7 @@ endef
 
 # first argument: a kernel set (name) being targeted (e.g. haswell).
 define make-refkern-rule
-$(BASE_OBJ_REFKERN_PATH)/$(1)/%_$(1)_ref.o: $(REFKERN_PATH)/%_ref.c $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+$(BASE_OBJ_REFKERN_PATH)/$(1)/%_$(1)_ref.o: $(REFKERN_PATH)/%_ref.c $(HEADERS_TO_BUILD) $(MAKE_DEFS_MK_PATHS)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(CC) $(call get-refkern-cflags-for,$(1)) -c $$< -o $$@
 else
@@ -644,7 +649,7 @@ endef
 # second argument: the configuration whose CFLAGS we should use in compilation.
 # third argument: the kernel file suffix being considered.
 define make-kernels-rule
-$(BASE_OBJ_KERNELS_PATH)/$(1)/%.o: $(KERNELS_PATH)/$(1)/%.$(3) $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS)
+$(BASE_OBJ_KERNELS_PATH)/$(1)/%.o: $(KERNELS_PATH)/$(1)/%.$(3) $(HEADERS_TO_BUILD) $(MAKE_DEFS_MK_PATHS)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(CC) $(call get-kernel-cflags-for,$(2)) -c $$< -o $$@
 else
@@ -657,7 +662,7 @@ endef
 # config_name, used to look up the CFLAGS to use during compilation.
 # second argument: the C99 addon file suffix being considered.
 define make-c99-addon-rule
-$(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(BLIS_H_FLAT) $(ADDON_H99_FILES) $(MAKE_DEFS_MK_PATHS)
+$(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(HEADERS_TO_BUILD) $(ADDON_H99_FILES) $(MAKE_DEFS_MK_PATHS)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(CC) $(call get-addon-c99flags-for,$(1)) -c $$< -o $$@
 else
@@ -671,7 +676,7 @@ endef
 # second argument: the C99 addon file suffix being considered.
 # third argument: the name of the addon being considered.
 define make-c99-addon-kers-rule
-$(BASE_OBJ_ADDON_PATH)/$(3)/$(KERNELS_DIR)/%.o: $(ADDON_PATH)/$(3)/$(KERNELS_DIR)/%.$(2) $(BLIS_H_FLAT) $(ADDON_H99_FILES) $(MAKE_DEFS_MK_PATHS)
+$(BASE_OBJ_ADDON_PATH)/$(3)/$(KERNELS_DIR)/%.o: $(ADDON_PATH)/$(3)/$(KERNELS_DIR)/%.$(2) $(HEADERS_TO_BUILD) $(ADDON_H99_FILES) $(MAKE_DEFS_MK_PATHS)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(CC) $(call get-addon-kernel-c99flags-for,$(1)) -c $$< -o $$@
 else
@@ -684,7 +689,7 @@ endef
 # config_name, used to look up the CFLAGS to use during compilation.
 # second argument: the C++ addon file suffix being considered.
 define make-cxx-addon-rule
-$(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(BLIS_H_FLAT) $(ADDON_HXX_FILES) $(MAKE_DEFS_MK_PATHS)
+$(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(HEADERS_TO_BUILD) $(ADDON_HXX_FILES) $(MAKE_DEFS_MK_PATHS)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(CXX) $(call get-addon-cxxflags-for,$(1)) -c $$< -o $$@
 else
@@ -697,7 +702,7 @@ endef
 # config_name, used to look up the CFLAGS to use during compilation.
 # second argument: the C99 sandbox file suffix being considered.
 define make-c99-sandbox-rule
-$(BASE_OBJ_SANDBOX_PATH)/%.o: $(SANDBOX_PATH)/%.$(2) $(BLIS_H_FLAT) $(SANDBOX_H99_FILES) $(MAKE_DEFS_MK_PATHS)
+$(BASE_OBJ_SANDBOX_PATH)/%.o: $(SANDBOX_PATH)/%.$(2) $(HEADERS_TO_BUILD) $(SANDBOX_H99_FILES) $(MAKE_DEFS_MK_PATHS)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(CC) $(call get-sandbox-c99flags-for,$(1)) -c $$< -o $$@
 else
@@ -710,7 +715,7 @@ endef
 # config_name, used to look up the CFLAGS to use during compilation.
 # second argument: the C++ sandbox file suffix being considered.
 define make-cxx-sandbox-rule
-$(BASE_OBJ_SANDBOX_PATH)/%.o: $(SANDBOX_PATH)/%.$(2) $(BLIS_H_FLAT) $(SANDBOX_HXX_FILES) $(MAKE_DEFS_MK_PATHS)
+$(BASE_OBJ_SANDBOX_PATH)/%.o: $(SANDBOX_PATH)/%.$(2) $(HEADERS_TO_BUILD) $(SANDBOX_HXX_FILES) $(MAKE_DEFS_MK_PATHS)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(CXX) $(call get-sandbox-cxxflags-for,$(1)) -c $$< -o $$@
 else
@@ -861,7 +866,7 @@ blastest-bin: check-env blastest-f2c $(BLASTEST_DRV_BIN_PATHS)
 blastest-run: $(BLASTEST_DRV_BINS_R)
 
 # f2c object file rule.
-$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_F2C_SRC_PATH)/%.c $(BLIS_H_FLAT)
+$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_F2C_SRC_PATH)/%.c $(HEADERS_TO_BUILD)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(CC) $(call get-user-cflags-for,$(CONFIG_NAME)) $(BLAT_CFLAGS) -c $< -o $@
 else
@@ -870,7 +875,7 @@ else
 endif
 
 # driver object file rule.
-$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_DRV_SRC_PATH)/%.c $(BLIS_H_FLAT)
+$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_DRV_SRC_PATH)/%.c $(HEADERS_TO_BUILD)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(CC) $(call get-user-cflags-for,$(CONFIG_NAME)) $(BLAT_CFLAGS) -c $< -o $@
 else
@@ -951,7 +956,7 @@ testsuite: testsuite-run
 testsuite-bin: check-env $(TESTSUITE_BIN)
 
 # Object file rule.
-$(BASE_OBJ_TESTSUITE_PATH)/%.o: $(TESTSUITE_SRC_PATH)/%.c $(BLIS_H_FLAT)
+$(BASE_OBJ_TESTSUITE_PATH)/%.o: $(TESTSUITE_SRC_PATH)/%.c $(HEADERS_TO_BUILD)
 ifeq ($(ENABLE_VERBOSE),yes)
 	$(CC) $(call get-user-cflags-for,$(CONFIG_NAME)) -c $< -o $@
 else

From b36bc95693091d1777b74eeb14d29ac8e76760a3 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Thu, 10 Oct 2024 14:48:45 -0500
Subject: [PATCH 201/230] Fix some aspects of the control tree/plugin
 infrastructure (#827)

Details:
- Use configure-time variable substitution rather than the PNAME macro
  to generate symbol names in plugins. This makes it much easier for
  uses to see what names their symbols will have (and to change them
  if desired).
- Use 'siz_t' rather than 'ukr_t' for anything dealing with kernel IDs
  (and similar for blocksizes and kernel preferences). Because users
  can now register new kernels, the values of the IDs for their custom
  kernels are no longer enumerated in 'ukr_t', which causes type
  conversion problems. This requires also being careful about the type
  of BLIS_VA_END and forcing existing enumerations like 'ukr_t' to be
  represented using integers of the same width as 'siz_t'.
- Modify the gemm control tree initialization function to indicate
  whether or not the operation as a whole was transposed. This is
  needed if users have to treat the initial A and B differently in the
  control tree, for example in a tensor times matrix operation (if
  transposed to matrix times tensor, we need to know which "matrix"
  object is now the tensor).
---
 build/plugin/bli_plugin.h.in        |  6 +--
 build/plugin/bli_plugin_init_ref.c  |  4 +-
 build/plugin/bli_plugin_init_zen3.c |  9 +++-
 build/plugin/bli_plugin_register.c  |  8 ++--
 configure                           |  8 ++--
 frame/3/gemm/bli_gemm_cntl.c        |  4 +-
 frame/3/gemm/bli_gemm_cntl.h        |  2 +-
 frame/base/bli_cntx.c               | 71 ++++++++++++++---------------
 frame/base/bli_cntx.h               | 58 +++++++++++------------
 frame/include/bli_misc_macro_defs.h |  6 ---
 frame/include/bli_type_defs.h       | 30 +++++++++---
 11 files changed, 109 insertions(+), 97 deletions(-)

diff --git a/build/plugin/bli_plugin.h.in b/build/plugin/bli_plugin.h.in
index b2cff2293..9e0495c18 100644
--- a/build/plugin/bli_plugin.h.in
+++ b/build/plugin/bli_plugin.h.in
@@ -137,10 +137,10 @@ INSERT_GENTCONF
 #undef GENTCONF
 #define GENTCONF( CONFIG, config ) \
 \
-void PASTEMAC(plugin_init,BLIS_PNAME_INFIX,_,config)( PASTECH(plugin,BLIS_PNAME_INFIX,_params) ); \
-void PASTEMAC(plugin_init,BLIS_PNAME_INFIX,_,config,BLIS_REF_SUFFIX)( PASTECH(plugin,BLIS_PNAME_INFIX,_params) );
+void PASTEMAC(plugin_init_@plugin_name@_,config)( plugin_@plugin_name@_params ); \
+void PASTEMAC(plugin_init_@plugin_name@_,config,BLIS_REF_SUFFIX)( plugin_@plugin_name@_params );
 
 INSERT_GENTCONF
 
-BLIS_EXPORT_BLIS err_t PASTEMAC(plugin_register,BLIS_PNAME_INFIX)( PASTECH(plugin,BLIS_PNAME_INFIX,_params) );
+BLIS_EXPORT_BLIS err_t bli_plugin_register_@plugin_name@( plugin_@plugin_name@_params );
 
diff --git a/build/plugin/bli_plugin_init_ref.c b/build/plugin/bli_plugin_init_ref.c
index 649eb311a..0dec70b01 100644
--- a/build/plugin/bli_plugin_init_ref.c
+++ b/build/plugin/bli_plugin_init_ref.c
@@ -56,9 +56,9 @@ do { \
 
 // -----------------------------------------------------------------------------
 
-void PASTEMAC(plugin_init,BLIS_PNAME_INFIX,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX)
+void PASTEMAC(plugin_init_@plugin_name@,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX)
      (
-       PASTECH(plugin,BLIS_PNAME_INFIX,_params)
+       plugin_@plugin_name@_params
      )
 {
 	cntx_t* cntx = ( cntx_t* )bli_gks_lookup_id( PASTECH(BLIS_ARCH,BLIS_CNAME_UPPER_INFIX) );
diff --git a/build/plugin/bli_plugin_init_zen3.c b/build/plugin/bli_plugin_init_zen3.c
index 43d2e86a1..f5cd2c8bd 100644
--- a/build/plugin/bli_plugin_init_zen3.c
+++ b/build/plugin/bli_plugin_init_zen3.c
@@ -34,14 +34,19 @@
 
 #include @PLUGIN_HEADER@
 
-void PASTEMAC(plugin_init,BLIS_PNAME_INFIX,BLIS_CNAME_INFIX)
+void PASTEMAC(plugin_init_@plugin_name@,BLIS_CNAME_INFIX)
      (
-       PASTECH(plugin,BLIS_PNAME_INFIX,_params)
+       plugin_@plugin_name@_params
      )
 {
 	cntx_t* cntx = ( cntx_t* )bli_gks_lookup_id( PASTECH(BLIS_ARCH,BLIS_CNAME_UPPER_INFIX) );
 	( void )cntx;
 
+	PASTEMAC(plugin_init_@plugin_name@,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX)
+    (
+      plugin_@plugin_name@_params_only
+    );
+
     // ------------------------------------------------------------------------>
 	// -- Example Initialization ---------------------------------------------->
 	// ------------------------------------------------------------------------>
diff --git a/build/plugin/bli_plugin_register.c b/build/plugin/bli_plugin_register.c
index 4512b554e..f711e3940 100644
--- a/build/plugin/bli_plugin_register.c
+++ b/build/plugin/bli_plugin_register.c
@@ -34,9 +34,9 @@
 
 #include @PLUGIN_HEADER@
 
-err_t PASTEMAC(plugin_register,BLIS_PNAME_INFIX)
+err_t bli_plugin_register_@plugin_name@
      (
-       PASTECH(plugin,BLIS_PNAME_INFIX,_params)
+       plugin_@plugin_name@_params
      )
 {
 	// ------------------------------------------------------------------------>
@@ -69,9 +69,9 @@ err_t PASTEMAC(plugin_register,BLIS_PNAME_INFIX)
 
 	#undef GENTCONF
 	#define GENTCONF( CONFIG, config ) \
-	PASTEMAC(plugin_init,BLIS_PNAME_INFIX,_,config,BLIS_REF_SUFFIX) \
+	PASTEMAC(plugin_init_@plugin_name@_,config) \
 	( \
-	  PASTECH(plugin,BLIS_PNAME_INFIX,_params_only) \
+	  plugin_@plugin_name@_params_only \
 	);
 
 	INSERT_GENTCONF
diff --git a/configure b/configure
index 1ff10bed5..7e8684158 100755
--- a/configure
+++ b/configure
@@ -5243,7 +5243,7 @@ plugin_main()
 			else
 				strip_examples ${sharedir}/blis/plugin/bli_plugin_register.c bli_plugin_register.c
 			fi
-			perl -pi -e "s|\@PLUGIN_HEADER\@|${plugin_h}|" bli_plugin_register.c
+			perl -pi -e "s|\@PLUGIN_HEADER\@|${plugin_h}|;" -e "s|\@plugin_name\@|${plugin_name}|;" bli_plugin_register.c
 			maybe_echo "done"
 		fi
 
@@ -5279,7 +5279,7 @@ plugin_main()
 				else
 					strip_examples ${sharedir}/blis/plugin/${file} ref_kernels/${file}
 				fi
-				perl -pi -e "s|\@PLUGIN_HEADER\@|${plugin_h}|" ref_kernels/${file}
+				perl -pi -e "s|\@PLUGIN_HEADER\@|${plugin_h}|;" -e "s|\@plugin_name\@|${plugin_name}|;" ref_kernels/${file}
 				done="false"
 			fi
 		done
@@ -5314,7 +5314,7 @@ plugin_main()
 				else
 					cp ${sharedir}/blis/plugin/bli_plugin_init_zen3.c config/${config}/bli_plugin_init_${config}.c
 				fi
-				perl -pi -e "s|\@PLUGIN_HEADER\@|${plugin_h}|" config/${config}/bli_plugin_init_${config}.c
+				perl -pi -e "s|\@PLUGIN_HEADER\@|${plugin_h}|;" -e "s|\@plugin_name\@|${plugin_name}|;" config/${config}/bli_plugin_init_${config}.c
 			fi
 
 			if [ ! -e config/${config}/bli_kernel_defs_${config}.h ] || [ ${force_flag} == '1' ]; then
@@ -5340,7 +5340,7 @@ plugin_main()
 		if [ ${examples_flag} == '1' ]; then
 			if [ ! -e kernels/zen3/my_kernel_1_zen3.c ] || [ ${force_flag} == '1' ]; then
 				cp ${sharedir}/blis/plugin/my_kernel_1_zen3.c kernels/zen3
-				perl -pi -e "s|\@PLUGIN_HEADER\@|${plugin_h}|" kernels/zen3/my_kernel_1_zen3.c
+				perl -pi -e "s|\@PLUGIN_HEADER\@|${plugin_h}|;" -e "s|\@plugin_name\@|${plugin_name}|;" kernels/zen3/my_kernel_1_zen3.c
 			fi
 		fi
 
diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c
index 4972502d3..9b6e07de5 100644
--- a/frame/3/gemm/bli_gemm_cntl.c
+++ b/frame/3/gemm/bli_gemm_cntl.c
@@ -71,7 +71,7 @@ void bli_gemm_var_cntl_init_node
 	);
 }
 
-void bli_gemm_cntl_init
+bool bli_gemm_cntl_init
      (
              ind_t        im,
              opid_t       family,
@@ -559,6 +559,8 @@ void bli_gemm_cntl_init
 	  c,
 	  cntl
 	);
+
+	return needs_swap;
 }
 
 void bli_gemm_cntl_finalize
diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h
index 08fb62696..ad3a29a03 100644
--- a/frame/3/gemm/bli_gemm_cntl.h
+++ b/frame/3/gemm/bli_gemm_cntl.h
@@ -200,7 +200,7 @@ typedef struct gemm_cntl_s gemm_cntl_t;
 
 // -----------------------------------------------------------------------------
 
-BLIS_EXPORT_BLIS void bli_gemm_cntl_init
+BLIS_EXPORT_BLIS bool bli_gemm_cntl_init
      (
              ind_t        im,
              opid_t       family,
diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c
index 36c481bbc..fd0799a50 100644
--- a/frame/base/bli_cntx.c
+++ b/frame/base/bli_cntx.c
@@ -46,7 +46,7 @@ BLIS_EXPORT_BLIS err_t bli_cntx_init( cntx_t* cntx )
 	if ( error != BLIS_SUCCESS )
 		return error;
 
-	error = bli_stack_init( sizeof( bszid_t ), 32, 32, BLIS_NUM_BLKSZS, &cntx->bmults );
+	error = bli_stack_init( sizeof( siz_t ), 32, 32, BLIS_NUM_BLKSZS, &cntx->bmults );
 	if ( error != BLIS_SUCCESS )
 		return error;
 
@@ -118,9 +118,9 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... )
 	   void bli_cntx_set_blkszs
 	   (
 	     cntx_t* cntx,
-	     bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id,
-	     bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id,
-	     bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id,
+	     siz_t bs0_id, blksz_t* blksz0, siz_t bm0_id,
+	     siz_t bs1_id, blksz_t* blksz1, siz_t bm1_id,
+	     siz_t bs2_id, blksz_t* blksz2, siz_t bm2_id,
 	     ...,
 	     BLIS_VA_END
 	   );
@@ -133,19 +133,18 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... )
 	// Process blocksizes until we get a BLIS_VA_END.
 	while ( true )
 	{
-		int bs_id0 = va_arg( args, int );
+		int bs_id = va_arg( args, siz_t );
 
-		// If we find a bszid_t id of BLIS_VA_END, then we are done.
-		if ( bs_id0 == BLIS_VA_END ) break;
+		// If we find a siz_t id of BLIS_VA_END, then we are done.
+		if ( bs_id == BLIS_VA_END ) break;
 
 		// Here, we query the variable argument list for:
-		// - the bszid_t of the blocksize we're about to process (already done),
+		// - the siz_t of the blocksize we're about to process (already done),
 		// - the address of the blksz_t object,
-		// - the bszid_t of the multiple we need to associate with
+		// - the siz_t of the multiple we need to associate with
 		//   the blksz_t object.
-		bszid_t  bs_id = ( bszid_t  )bs_id0;
 		blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* );
-		bszid_t  bm_id = ( bszid_t  )va_arg( args, bszid_t  );
+		siz_t    bm_id = ( siz_t    )va_arg( args, siz_t    );
 
 		// Copy the blksz_t object contents into the appropriate
 		// location within the context's blksz_t array. Do the same
@@ -172,9 +171,9 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... )
 	   void bli_cntx_set_ukrs
 	   (
 	     cntx_t* cntx,
-	     ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp,
-	     ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp,
-	     ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp,
+	     siz_t ukr0_id, num_t dt0, void_fp ukr0_fp,
+	     siz_t ukr1_id, num_t dt1, void_fp ukr1_fp,
+	     siz_t ukr2_id, num_t dt2, void_fp ukr2_fp,
 	     ...,
 	     BLIS_VA_END
 	   );
@@ -187,16 +186,15 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... )
 	// Process ukernels until BLIS_VA_END is reached.
 	while ( true )
 	{
-		const int ukr_id0 = va_arg( args, int );
+		const int ukr_id = va_arg( args, siz_t );
 
 		// If we find a ukernel id of BLIS_VA_END, then we are done.
-		if ( ukr_id0 == BLIS_VA_END ) break;
+		if ( ukr_id == BLIS_VA_END ) break;
 
 		// Here, we query the variable argument list for:
-		// - the ukr_t of the kernel we're about to process (already done),
+		// - the siz_t of the kernel we're about to process (already done),
 		// - the datatype of the kernel, and
 		// - the kernel function pointer
-		const ukr_t   ukr_id = ( ukr_t   )ukr_id0;
 		const num_t   ukr_dt = ( num_t   )va_arg( args, num_t   );
 		      void_fp ukr_fp = ( void_fp )va_arg( args, void_fp );
 
@@ -223,9 +221,9 @@ void bli_cntx_set_ukr2s( cntx_t* cntx , ... )
 	   void bli_cntx_set_ukr2s
 	   (
 	     cntx_t* cntx,
-	     ukr_t ukr0_id, num_t dt1_0, num_t dt2_0, void_fp ukr0_fp,
-	     ukr_t ukr1_id, num_t dt1_1, num_t dt2_1, void_fp ukr1_fp,
-	     ukr_t ukr2_id, num_t dt1_2, num_t dt2_2, void_fp ukr2_fp,
+	     siz_t ukr0_id, num_t dt1_0, num_t dt2_0, void_fp ukr0_fp,
+	     siz_t ukr1_id, num_t dt1_1, num_t dt2_1, void_fp ukr1_fp,
+	     siz_t ukr2_id, num_t dt1_2, num_t dt2_2, void_fp ukr2_fp,
 	     ...,
 	     BLIS_VA_END
 	   );
@@ -238,16 +236,15 @@ void bli_cntx_set_ukr2s( cntx_t* cntx , ... )
 	// Process ukernels until BLIS_VA_END is reached.
 	while ( true )
 	{
-		const int ukr_id0 = va_arg( args, int );
+		const int ukr_id = va_arg( args, siz_t );
 
 		// If we find a ukernel id of BLIS_VA_END, then we are done.
-		if ( ukr_id0 == BLIS_VA_END ) break;
+		if ( ukr_id == BLIS_VA_END ) break;
 
 		// Here, we query the variable argument list for:
-		// - the ukr_t of the kernel we're about to process (already done),
+		// - the siz_t of the kernel we're about to process (already done),
 		// - the datatype of the kernel, and
 		// - the kernel function pointer
-		const ukr_t   ukr_id  = ( ukr_t  )ukr_id0;
 		const num_t   ukr_dt1 = ( num_t   )va_arg( args, num_t   );
 		const num_t   ukr_dt2 = ( num_t   )va_arg( args, num_t   );
 		      void_fp ukr_fp  = ( void_fp )va_arg( args, void_fp );
@@ -275,9 +272,9 @@ void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... )
 	   void bli_cntx_set_ukr_prefs
 	   (
 	     cntx_t* cntx,
-	     ukr_pref_t ukr_pref0_id, num_t dt0, bool ukr_pref0,
-	     ukr_pref_t ukr_pref1_id, num_t dt1, bool ukr_pref1,
-	     ukr_pref_t ukr_pref2_id, num_t dt2, bool ukr_pref2,
+	     siz_t ukr_pref0_id, num_t dt0, bool ukr_pref0,
+	     siz_t ukr_pref1_id, num_t dt1, bool ukr_pref1,
+	     siz_t ukr_pref2_id, num_t dt2, bool ukr_pref2,
 	     ...,
 	     BLIS_VA_END
 	   );
@@ -290,18 +287,17 @@ void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... )
 	// Process ukernel preferences until BLIS_VA_END is reached.
 	while ( true )
 	{
-		const int ukr_pref_id0 = va_arg( args, int );
+		const int ukr_pref_id = va_arg( args, siz_t );
 
 		// If we find a ukernel pref id of BLIS_VA_END, then we are done.
-		if ( ukr_pref_id0 == BLIS_VA_END ) break;
+		if ( ukr_pref_id == BLIS_VA_END ) break;
 
 		// Here, we query the variable argument list for:
-		// - the ukr_t of the kernel we're about to process (already done),
+		// - the siz_t of the kernel we're about to process (already done),
 		// - the datatype of the kernel, and
 		// - the kernel function pointer
-		const ukr_pref_t ukr_pref_id = ( ukr_pref_t )ukr_pref_id0;
-		const num_t      ukr_pref_dt = ( num_t      )va_arg( args, num_t );
-		const bool       ukr_pref    = ( bool       )va_arg( args, int );
+		const num_t ukr_pref_dt = ( num_t )va_arg( args, num_t );
+		const bool  ukr_pref    = ( bool  )va_arg( args, int );
 
 		// Store the ukernel preference value into the context.
 		bli_cntx_set_ukr_pref_dt( ukr_pref, ukr_pref_dt, ukr_pref_id, cntx );
@@ -341,15 +337,14 @@ void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... )
 	// Process sup handlers until BLIS_VA_END is reached.
 	while ( true )
 	{
-		const int op_id0 = va_arg( args, int );
+		const opid_t op_id = va_arg( args, siz_t );
 
 		// If we find an operation id of BLIS_VA_END, then we are done.
-		if ( op_id0 == BLIS_VA_END ) break;
+		if ( op_id == BLIS_VA_END ) break;
 
 		// Here, we query the variable argument list for:
 		// - the opid_t of the operation we're about to process,
 		// - the sup handler function pointer
-		const opid_t  op_id = ( opid_t  )op_id0;
 		      void_fp op_fp = ( void_fp )va_arg( args, void_fp );
 
 		if ( op_id >= BLIS_NUM_LEVEL3_OPS )
@@ -368,7 +363,7 @@ void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... )
 
 // -----------------------------------------------------------------------------
 
-err_t bli_cntx_register_blksz( siz_t* bs_id, const blksz_t* blksz, bszid_t bmult_id, cntx_t* cntx )
+err_t bli_cntx_register_blksz( siz_t* bs_id, const blksz_t* blksz, siz_t bmult_id, cntx_t* cntx )
 {
 	siz_t id_blksz;
 	err_t error = bli_stack_push( &id_blksz, &cntx->blkszs );
diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h
index 59c8efbf7..1e9874c05 100644
--- a/frame/base/bli_cntx.h
+++ b/frame/base/bli_cntx.h
@@ -44,7 +44,7 @@
 typedef struct cntx_s
 {
 	stck_t blkszs; // blksz_t
-	stck_t bmults; // bszid_t
+	stck_t bmults; // siz_t
 
 	stck_t ukrs; // func_t
 	stck_t ukr2s; // func2_t
@@ -60,7 +60,7 @@ typedef struct cntx_s
 // -- cntx_t query (complex) ---------------------------------------------------
 //
 
-BLIS_INLINE const blksz_t* bli_cntx_get_blksz( bszid_t bs_id, const cntx_t* cntx )
+BLIS_INLINE const blksz_t* bli_cntx_get_blksz( siz_t bs_id, const cntx_t* cntx )
 {
 	const blksz_t* blksz;
 	err_t error = bli_stack_get( bs_id, ( void** )&blksz, &cntx->blkszs );
@@ -69,7 +69,7 @@ BLIS_INLINE const blksz_t* bli_cntx_get_blksz( bszid_t bs_id, const cntx_t* cntx
 	return blksz;
 }
 
-BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx )
+BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, siz_t bs_id, const cntx_t* cntx )
 {
 	const blksz_t* blksz  = bli_cntx_get_blksz( bs_id, cntx );
 	dim_t          bs_dt  = bli_blksz_get_def( dt, blksz );
@@ -78,7 +78,7 @@ BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, const cntx
 	return bs_dt;
 }
 
-BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx )
+BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, siz_t bs_id, const cntx_t* cntx )
 {
 	const blksz_t* blksz  = bli_cntx_get_blksz( bs_id, cntx );
 	dim_t          bs_dt  = bli_blksz_get_max( dt, blksz );
@@ -87,24 +87,24 @@ BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, const cntx
 	return bs_dt;
 }
 
-BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, const cntx_t* cntx )
+BLIS_INLINE siz_t bli_cntx_get_bmult_id( siz_t bs_id, const cntx_t* cntx )
 {
-	const bszid_t* bsz;
+	const siz_t* bsz;
 	err_t error = bli_stack_get( bs_id, ( void** )&bsz, &cntx->bmults );
 	if ( error != BLIS_SUCCESS )
 		bli_check_error_code( error );
 	return *bsz;
 }
 
-BLIS_INLINE const blksz_t* bli_cntx_get_bmult( bszid_t bs_id, const cntx_t* cntx )
+BLIS_INLINE const blksz_t* bli_cntx_get_bmult( siz_t bs_id, const cntx_t* cntx )
 {
-	bszid_t        bm_id  = bli_cntx_get_bmult_id( bs_id, cntx );
+	siz_t          bm_id  = bli_cntx_get_bmult_id( bs_id, cntx );
 	const blksz_t* bmult  = bli_cntx_get_blksz( bm_id, cntx );
 
 	return bmult;
 }
 
-BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx )
+BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, siz_t bs_id, const cntx_t* cntx )
 {
 	const blksz_t* bmult  = bli_cntx_get_bmult( bs_id, cntx );
 	dim_t          bm_dt  = bli_blksz_get_def( dt, bmult );
@@ -114,7 +114,7 @@ BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, const cntx_t*
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE const func2_t* bli_cntx_get_ukr2s( ukr_t ukr_id, const cntx_t* cntx )
+BLIS_INLINE const func2_t* bli_cntx_get_ukr2s( siz_t ukr_id, const cntx_t* cntx )
 {
 	const func2_t* ukr;
 	err_t error = bli_stack_get( bli_ker_idx( ukr_id ), ( void** )&ukr, &cntx->ukr2s );
@@ -123,7 +123,7 @@ BLIS_INLINE const func2_t* bli_cntx_get_ukr2s( ukr_t ukr_id, const cntx_t* cntx
 	return ukr;
 }
 
-BLIS_INLINE void_fp bli_cntx_get_ukr2_dt( num_t dt1, num_t dt2, ukr_t ukr_id, const cntx_t* cntx )
+BLIS_INLINE void_fp bli_cntx_get_ukr2_dt( num_t dt1, num_t dt2, siz_t ukr_id, const cntx_t* cntx )
 {
 	const func2_t* func = bli_cntx_get_ukr2s( ukr_id, cntx );
 
@@ -132,11 +132,11 @@ BLIS_INLINE void_fp bli_cntx_get_ukr2_dt( num_t dt1, num_t dt2, ukr_t ukr_id, co
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE const func_t* bli_cntx_get_ukrs( ukr_t ukr_id, const cntx_t* cntx )
+BLIS_INLINE const func_t* bli_cntx_get_ukrs( siz_t ukr_id, const cntx_t* cntx )
 {
 	if ( bli_ker_ntype( ukr_id ) == 2 )
 	{
-		return ( const func_t* )bli_cntx_get_ukr2s( ( ukr_t )ukr_id, cntx );
+		return ( const func_t* )bli_cntx_get_ukr2s( ukr_id, cntx );
 	}
 	else
 	{
@@ -148,11 +148,11 @@ BLIS_INLINE const func_t* bli_cntx_get_ukrs( ukr_t ukr_id, const cntx_t* cntx )
 	}
 }
 
-BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx )
+BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, siz_t ukr_id, const cntx_t* cntx )
 {
 	if ( bli_ker_ntype( ukr_id ) == 2 )
 	{
-		return bli_cntx_get_ukr2_dt( dt, dt, ( ukr_t )ukr_id, cntx );
+		return bli_cntx_get_ukr2_dt( dt, dt, ukr_id, cntx );
 	}
 	else
 	{
@@ -164,7 +164,7 @@ BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, ukr_t ukr_id, const cntx_t* c
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE const mbool_t* bli_cntx_get_ukr_prefs( ukr_pref_t pref_id, const cntx_t* cntx )
+BLIS_INLINE const mbool_t* bli_cntx_get_ukr_prefs( siz_t pref_id, const cntx_t* cntx )
 {
 	const mbool_t* ukr_prefs;
 	err_t error = bli_stack_get( pref_id, ( void** )&ukr_prefs, &cntx->ukr_prefs );
@@ -173,7 +173,7 @@ BLIS_INLINE const mbool_t* bli_cntx_get_ukr_prefs( ukr_pref_t pref_id, const cnt
 	return ukr_prefs;
 }
 
-BLIS_INLINE bool bli_cntx_get_ukr_prefs_dt( num_t dt, ukr_pref_t ukr_id, const cntx_t* cntx )
+BLIS_INLINE bool bli_cntx_get_ukr_prefs_dt( num_t dt, siz_t ukr_id, const cntx_t* cntx )
 {
 	const mbool_t* mbool = bli_cntx_get_ukr_prefs( ukr_id, cntx );
 
@@ -262,14 +262,14 @@ BLIS_INLINE bool bli_cntx_dislikes_storage_of( const obj_t* obj, ukr_t ukr_id, c
 // NOTE: The framework does not use any of the following functions. We provide
 // them in order to facilitate creating/modifying custom contexts.
 
-BLIS_INLINE err_t bli_cntx_set_blksz( bszid_t bs_id, const blksz_t* blksz, bszid_t mult_id, cntx_t* cntx )
+BLIS_INLINE err_t bli_cntx_set_blksz( siz_t bs_id, const blksz_t* blksz, siz_t mult_id, cntx_t* cntx )
 {
 	blksz_t* cntx_blksz;
 	err_t error = bli_stack_get( bs_id, ( void** )&cntx_blksz, &cntx->blkszs );
 	if ( error != BLIS_SUCCESS )
 		return error;
 
-	bszid_t* cntx_mult_id;
+	siz_t* cntx_mult_id;
 	error = bli_stack_get( bs_id, ( void** )&cntx_mult_id, &cntx->bmults );
 	if ( error != BLIS_SUCCESS )
 		return error;
@@ -280,38 +280,38 @@ BLIS_INLINE err_t bli_cntx_set_blksz( bszid_t bs_id, const blksz_t* blksz, bszid
 	return BLIS_SUCCESS;
 }
 
-BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx )
+BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, siz_t bs_id, dim_t bs, cntx_t* cntx )
 {
 	bli_blksz_set_def( bs, dt, ( blksz_t* )bli_cntx_get_blksz( bs_id, cntx ) );
 }
 
-BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx )
+BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, siz_t bs_id, dim_t bs, cntx_t* cntx )
 {
 	bli_blksz_set_max( bs, dt, ( blksz_t* )bli_cntx_get_blksz( bs_id, cntx ) );
 }
 
-BLIS_INLINE err_t bli_cntx_set_ukr2( ukr_t ukr_id, const func2_t* func, cntx_t* cntx )
+BLIS_INLINE err_t bli_cntx_set_ukr2( siz_t ukr_id, const func2_t* func, cntx_t* cntx )
 {
 	*( func2_t* )bli_cntx_get_ukr2s( ukr_id, cntx ) = *func;
 	return BLIS_SUCCESS;
 }
 
-BLIS_INLINE void bli_cntx_set_ukr2_dt( void_fp fp, num_t dt1, num_t dt2, ukr_t ker_id, cntx_t* cntx )
+BLIS_INLINE void bli_cntx_set_ukr2_dt( void_fp fp, num_t dt1, num_t dt2, siz_t ker_id, cntx_t* cntx )
 {
 	bli_func2_set_dt( fp, dt1, dt2, ( func2_t* )bli_cntx_get_ukr2s( ker_id, cntx ) );
 }
 
-BLIS_INLINE err_t bli_cntx_set_ukr( ukr_t ukr_id, const func_t* func, cntx_t* cntx )
+BLIS_INLINE err_t bli_cntx_set_ukr( siz_t ukr_id, const func_t* func, cntx_t* cntx )
 {
 	*( func_t* )bli_cntx_get_ukrs( ukr_id, cntx ) = *func;
 	return BLIS_SUCCESS;
 }
 
-BLIS_INLINE void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, ukr_t ker_id, cntx_t* cntx )
+BLIS_INLINE void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, siz_t ker_id, cntx_t* cntx )
 {
 	if ( bli_ker_ntype( ker_id ) == 2 )
 	{
-		bli_cntx_set_ukr2_dt( fp, dt, dt, (ukr_t)ker_id, cntx );
+		bli_cntx_set_ukr2_dt( fp, dt, dt, ker_id, cntx );
 	}
 	else
 	{
@@ -319,13 +319,13 @@ BLIS_INLINE void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, ukr_t ker_id, cntx_t
 	}
 }
 
-BLIS_INLINE err_t bli_cntx_set_ukr_pref( ukr_pref_t ukr_id, const mbool_t* prefs, cntx_t* cntx )
+BLIS_INLINE err_t bli_cntx_set_ukr_pref( siz_t ukr_id, const mbool_t* prefs, cntx_t* cntx )
 {
 	*( mbool_t* )bli_cntx_get_ukr_prefs( ukr_id, cntx ) = *prefs;
 	return BLIS_SUCCESS;
 }
 
-BLIS_INLINE err_t bli_cntx_set_ukr_pref_dt( bool pref, num_t dt, ukr_pref_t ukr_id, cntx_t* cntx )
+BLIS_INLINE err_t bli_cntx_set_ukr_pref_dt( bool pref, num_t dt, siz_t ukr_id, cntx_t* cntx )
 {
 	bli_mbool_set_dt( pref, dt, ( mbool_t* )bli_cntx_get_ukr_prefs( ukr_id, cntx ));
 	return BLIS_SUCCESS;
@@ -400,7 +400,7 @@ BLIS_EXPORT_BLIS void bli_cntx_print( const cntx_t* cntx );
 
 BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... );
 
-BLIS_EXPORT_BLIS err_t bli_cntx_register_blksz( siz_t* bs_id, const blksz_t* blksz, bszid_t bmult_id, cntx_t* cntx );
+BLIS_EXPORT_BLIS err_t bli_cntx_register_blksz( siz_t* bs_id, const blksz_t* blksz, siz_t bmult_id, cntx_t* cntx );
 
 BLIS_EXPORT_BLIS err_t bli_cntx_register_ukr( siz_t* ukr_id, const func_t* ukr, cntx_t* cntx );
 
diff --git a/frame/include/bli_misc_macro_defs.h b/frame/include/bli_misc_macro_defs.h
index 98d86a298..f30880344 100644
--- a/frame/include/bli_misc_macro_defs.h
+++ b/frame/include/bli_misc_macro_defs.h
@@ -186,12 +186,6 @@ BLIS_INLINE void bli_toggle_bool( bool* b )
 #define bli_iformatspec() "%6d"
 
 
-// Sentinel constant used to indicate the end of a variable argument function
-// (See bli_cntx.c)
-
-#define BLIS_VA_END  (-1)
-
-
 // Static assertion compatible with any version of C/C++
 #define bli_static_assert(cond) while(0){struct s {int STATIC_ASSERT_FAILED : !!(cond);};}
 
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index 5bc96e8f2..df0b2a425 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -626,6 +626,11 @@ typedef enum
 #define bli_ker_idx( ker )	 ((ker) & ~BLIS_NTYPE_KER_BITS)
 #define bli_ker_ntype( ker ) ((((ker) & BLIS_NTYPE_KER_BITS) >> BLIS_NTYPE_KER_SHIFT) + 1)
 
+// Sentinel constant used to indicate the end of a variable argument function
+// (See bli_cntx.c)
+
+#define BLIS_VA_END  ((siz_t)-1)
+
 typedef enum
 {
 	// -- Single-type kernels --
@@ -675,7 +680,7 @@ typedef enum
 	BLIS_GEMMSUP_CCC_UKR,
 	BLIS_GEMMSUP_XXX_UKR,
 
-	// BLIS_NUM_UKRS must be last!
+	// BLIS_NUM_UKRS must come after all 1-type kernels and before 2-type kernels!
 	BLIS_NUM_UKRS_, BLIS_NUM_UKRS = bli_ker_idx( BLIS_NUM_UKRS_ ),
 
 	// -- Two-type kernels --
@@ -702,8 +707,11 @@ typedef enum
 	BLIS_GEMM_RCC_UKR,
 	BLIS_GEMM_CRR_UKR,
 
-	// BLIS_NUM_UKR2S must be last!
-	BLIS_NUM_UKR2S_, BLIS_NUM_UKR2S = bli_ker_idx( BLIS_NUM_UKR2S_ )
+	// BLIS_NUM_UKR2S must come after all kernels!
+	BLIS_NUM_UKR2S_, BLIS_NUM_UKR2S = bli_ker_idx( BLIS_NUM_UKR2S_ ),
+
+	// Force the size of ukr_t values to be as large as siz_t
+	BLIS_UKRS_END_ = BLIS_VA_END
 } ukr_t;
 
 
@@ -728,9 +736,11 @@ typedef enum
 	BLIS_GEMMSUP_XXX_UKR_ROW_PREF,
 
     // BLIS_NUM_UKR_PREFS must be last!
-    BLIS_NUM_UKR_PREFS
-} ukr_pref_t;
+    BLIS_NUM_UKR_PREFS,
 
+	// Force the size of ukr_pref_t values to be as large as siz_t
+	BLIS_UKR_PREFS_END_ = BLIS_VA_END
+} ukr_pref_t;
 
 typedef enum
 {
@@ -864,7 +874,10 @@ typedef enum
 
 	// BLIS_NOID (= BLIS_NUM_LEVEL3_OPS) must be last!
 	BLIS_NOID,
-	BLIS_NUM_LEVEL3_OPS = BLIS_NOID
+	BLIS_NUM_LEVEL3_OPS = BLIS_NOID,
+
+	// Force the size of opid_t values to be as large as siz_t
+	BLIS_LEVEL3_OPS_END_ = BLIS_VA_END
 } opid_t;
 
 
@@ -911,7 +924,10 @@ typedef enum
 	// BLIS_NO_PART (= BLIS_NUM_BLKSZS) must be last!
 	BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable,
 	              // such as when characterizing a packm operation.
-	BLIS_NUM_BLKSZS = BLIS_NO_PART
+	BLIS_NUM_BLKSZS = BLIS_NO_PART,
+
+	// Force the size of bszid_t values to be as large as siz_t
+	BLIS_BLKSZS_END_ = BLIS_VA_END
 } bszid_t;
 
 
From 827c50b23402ee61da40fb789432213ded6af5b2 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Wed, 16 Oct 2024 16:45:26 -0500
Subject: [PATCH 202/230] Implemented `--omit-symbols=LIST` `configure` option.
 (#823)

Details:
- Added a new option to 'configure' that allows the user to specify a
  list of symbols to omit from the library. The format of the option is
  --omit-symbols=LIST where LIST is a comma-separated list of symbol
  names (excluding any trailing underscore). This list is parsed into
  a list of #define directives that cause the relevant parts of BLIS
  to be ignored (or not). As such, the nature of this option is to only
  support omitting symbols which have been pre-identified as potential
  troublemakers when linking BLIS with other libraries such as LAPACK
  or ScaLAPACK. (This list may grow in the future as additional symbols
  are identified.) Note: we leave lsame_() and xerbla_() prototypes
  enabled even when their respective symbols are omitted from the
  library.
- Re-implemented the --enable-scalapack-compat configure option to
  utilize the underlying --omit-symbols=LIST infrastructure.
- Implemented an --enable-lapack-compat option, which omits all of the
  known problematic symbols currently supported for omission.
- This commit addresses Issue #816. Thanks to Timo Betcke for bringing
  it to our attention and to Devin Matthews for his advice and for
  his initial implementation of --enable-scalapack-compat (PR #813).
- CREDITS file update.
---
 CREDITS                             |   1 +
 build/bli_config.h.in               |   3 +
 configure                           | 143 +++++++++++++++++++++++++---
 frame/compat/bla_symv.c             |  12 ++-
 frame/compat/bla_symv.h             |  16 ++--
 frame/compat/bla_syr.c              |  12 ++-
 frame/compat/bla_syr.h              |  16 ++--
 frame/compat/bla_syr2.c             |  14 +--
 frame/compat/bla_syr2.h             |  16 ++--
 frame/compat/f2c/bla_lsame.c        |   2 +
 frame/compat/f2c/bla_lsame.h        |  10 +-
 frame/compat/f2c/bla_rot.c          |   4 +
 frame/compat/f2c/bla_rot.h          |   7 +-
 frame/compat/f2c/bla_xerbla.c       |   2 +
 frame/compat/f2c/bla_xerbla.h       |  10 +-
 frame/compat/f2c/bla_xerbla_array.c |   2 +
 frame/compat/f2c/bla_xerbla_array.h |   2 +-
 17 files changed, 204 insertions(+), 68 deletions(-)

diff --git a/CREDITS b/CREDITS
index f24f0f371..373514136 100644
--- a/CREDITS
+++ b/CREDITS
@@ -19,6 +19,7 @@ but many others have contributed code, ideas, and feedback, including
   Abhishek Bagusetty       @abagusetty                (Argonne National Laboratory)
   Satish Balay             @balay                     (Argonne National Laboratory)
   Kihiro Bando             @bandokihiro
+  Timo Betcke              @tbetcke                   (University College London)
   Matthew Brett            @matthew-brett             (University of Birmingham)
   Jérémie du Boisberranger @jeremiedbb
   Jed Brown                @jedbrown                  (Argonne National Laboratory)
diff --git a/build/bli_config.h.in b/build/bli_config.h.in
index 64b9d341c..f883c492b 100644
--- a/build/bli_config.h.in
+++ b/build/bli_config.h.in
@@ -45,6 +45,9 @@
 // Enabled kernel sets (kernel_list)
 @kernel_list_defines@
 
+// Disabled symbols (symbol_omit_list)
+@omit_symbol_list_defines@
+
 #define BLIS_VERSION_STRING "@version@"
 
 #if @enable_system@
diff --git a/configure b/configure
index 7e8684158..22a143cf2 100755
--- a/configure
+++ b/configure
@@ -144,6 +144,50 @@ print_usage()
                  library. If the shared library build is disabled, the
                  static library build must remain enabled.
 
+   --omit-symbols=LIST
+
+                 Omit a custom set of compatibility symbols when building
+                 BLIS. When given, LIST is parsed as a comma-separated
+                 list of symbol names (excluding any trailing underscore).
+                 This option is useful when (1) the user is planning to
+                 link BLIS with another library that provides conflicting
+                 symbols, and (2) the user wishes the symbols in this
+                 other library to prevail at link time without relying on
+                 weak/strong symbol semantics. Note that currently ONLY
+                 the following symbols are supported for omission:
+
+                           crot    zrot    lsame
+                           csymv   zsymv   xerbla
+                           csyr    zsyr    xerbla_array
+                           csyr2   zsyr2
+
+   --enable-lapack-compat, --disable-lapack-compat
+
+                 Enable strict compatibility with LAPACK. This option
+                 causes BLIS to be built without some routines that we
+                 consider to be BLAS compatibility routines but that
+                 also happen to be provided by LAPACK. This option is
+                 equivalent to using the --omit-symbols=LIST option
+                 where LIST contains the following symbols:
+
+                           crot    zrot    lsame
+                           csymv   zsymv   xerbla
+                           csyr    zsyr    xerbla_array
+                           csyr2   zsyr2
+
+   --enable-scalapack-compat, --disable-scalapack-compat
+
+                 Enable strict compatibility with ScaLAPACK. This option
+                 causes BLIS to be built without some routines that we
+                 consider to be BLAS compatibility routines but that
+                 also happen to be provided by ScaLAPACK. This option is
+                 equivalent to using the --omit-symbols=LIST option
+                 where LIST contains the following symbols:
+
+                           csymv   zsymv
+                           csyr    zsyr
+                           csyr2   zsyr2
+
    --enable-rpath, --disable-rpath
 
                  Enable (disabled by default) setting an install_name for
@@ -303,12 +347,6 @@ print_usage()
                  which are determined by the BLIS subconfiguration used at
                  runtime.) By default, these customized files are disabled.
 
-   --enable-scalapack-compat, --disable-scalapack-compat
-
-                 Enable strict compatibility with ScaLAPACK, which may
-                 requiring disabling certain conflicting functionality
-                 available through the BLAS and/or CBLAS interfaces.
-
    -a NAME --enable-addon=NAME
 
                  Enable the code provided by an addon. An addon consists
@@ -3015,10 +3053,14 @@ blis_main()
 	enable_amd_frame_tweaks='no'
 	enable_memkind='' # The default memkind value is determined later on.
 	enable_trsm_preinversion='yes'
+	enable_lapack_compat='no'
 	enable_scalapack_compat='no'
 	force_version='no'
 	complex_return='default'
 
+	# The symbol omission list.
+	omit_symbol_list=''
+
 	# The addon flag and names.
 	addon_flag=''
 	addon_list=''
@@ -3155,6 +3197,10 @@ blis_main()
 							enable_shared='no'
 							;;
 
+						omit-symbols=*)
+							omit_symbol_list=${OPTARG#*=}
+							;;
+
 						enable-rpath)
 							enable_rpath='yes'
 							;;
@@ -3272,6 +3318,13 @@ blis_main()
 							enable_amd_frame_tweaks='no'
 							;;
 
+						enable-lapack-compat)
+							enable_lapack_compat='yes'
+							;;
+						disable-lapack-compat)
+							enable_lapack_compat='no'
+							;;
+
 						enable-scalapack-compat)
 							enable_scalapack_compat='yes'
 							;;
@@ -3675,6 +3728,54 @@ blis_main()
 		exit 1
 	fi
 
+	# Check for the LAPACK compatibility option before the option for symbol
+	# omission since the former can imply/augment the latter.
+	if [[ ${enable_lapack_compat} = yes ]]; then
+		echo "${script_name}: LAPACK compatibility is enabled."
+		enable_lapack_compat_01=1
+		problematic_symbols="crot,zrot,csymv,zsymv,csyr,zsyr,csyr2,zsyr2,lsame,xerbla,xerbla_array"
+		omit_symbol_list="${omit_symbol_list},${problematic_symbols}"
+	else
+		echo "${script_name}: LAPACK compatibility is disabled."
+		enable_lapack_compat_01=0
+	fi
+
+	# Check for the ScaLAPACK compatibility option before the option for symbol
+	# omission since the former can imply/augment the latter.
+	if [[ ${enable_scalapack_compat} = yes ]]; then
+		echo "${script_name}: ScaLAPACK compatibility is enabled."
+		enable_scalapack_compat_01=1
+		problematic_symbols="csymv,zsymv,csyr,zsyr,csyr2,zsyr2"
+		omit_symbol_list="${omit_symbol_list},${problematic_symbols}"
+	else
+		echo "${script_name}: ScaLAPACK compatibility is disabled."
+		enable_scalapack_compat_01=0
+	fi
+
+	# Check if we are omitting any symbols.
+	if [[ ${omit_symbol_list} != "" ]]; then
+
+		# Normalize the list of symbols the user requested that we omit
+		# (split on commas, sort, and remove duplicates) and report it.
+		# The corresponding #defines are generated further below.
+
+		# Start by changing the comma-separated list to a space-separated list.
+		omit_symbol_list=$(echo "${omit_symbol_list}" | sed -e "s/,/ /g")
+
+		# Remove duplicates.
+		#omit_symbol_list=$(rm_duplicate_words_simple "${omit_symbol_list}")
+
+		# Sort the list, removing duplicates (via -u).
+		omit_symbol_list=$(echo "${omit_symbol_list}" | xargs -n1 | sort -u)
+
+		echo "${script_name}: omitting the following symbols from BLIS:"
+		for omit_symbol_name in ${omit_symbol_list}; do
+			echo "${script_name}:   ${omit_symbol_name}"
+		done
+	else
+		echo "${script_name}: no symbols will be omitted."
+	fi
+
 	# Check if we are building with or without operating system support.
 	if [[ ${enable_system} = yes ]]; then
 		echo "${script_name}: enabling operating system support."
@@ -3947,13 +4048,6 @@ blis_main()
 		echo "${script_name}: memory tracing output is disabled."
 		enable_mem_tracing_01=0
 	fi
-	if [[ ${enable_scalapack_compat} = yes ]]; then
-		echo "${script_name}: ScaLAPACK compatibility is enabled."
-		enable_scalapack_compat_01=1
-	else
-		echo "${script_name}: ScaLAPACK compatibility is disabled."
-		enable_scalapack_compat_01=0
-	fi
 	if [[ ${has_memkind} = yes ]]; then
 		if [[ -z ${enable_memkind} ]]; then
 			# If no explicit option was given for libmemkind one way or the other,
@@ -4216,6 +4310,28 @@ blis_main()
 		addon_list_includes="${addon_list_includes}#include ${addon_header}\n"
 	done
 
+	# Make sure that omit_symbol_list only contains lowercase letters, digits,
+	# underscores, commas, and spaces; any leftover character is rejected.
+	omit_symbol_list_check=$(echo "${omit_symbol_list}" | sed -e "s/[a-z0-9_, ]//g")
+
+	if [[ "${omit_symbol_list_check}" != "" ]]; then
+		echo "${script_name}: --omit-symbols=LIST option contains unexpected characters: ${omit_symbol_list_check}"
+		exit 1
+	fi
+
+	# Create a list of #defines, one for each symbol the user requested that we
+	# omit. Note that first we convert the list's commas into spaces.
+	omit_symbol_list=$(echo "${omit_symbol_list}" | sed -e "s/,/ /g")
+	for sym in ${omit_symbol_list}; do
+
+		# Convert the current symbol name to uppercase.
+		sym=$(echo "${sym}" | tr '[:lower:]' '[:upper:]')
+
+		# Create a #define and add it to the running list.
+		omit_define="BLIS_DISABLE_${sym}"
+		omit_symbol_list_defines="${omit_symbol_list_defines}#define ${omit_define}\n"
+	done
+
 
 	# -- Determine whether we are performing an out-of-tree build --------------
 
@@ -4310,6 +4426,7 @@ blis_main()
 	add_config_var config_name_define
 	add_config_var config_list_defines
 	add_config_var kernel_list_defines
+	add_config_var omit_symbol_list_defines
 	add_config_var enable_tls                enable_tls_01
 	add_config_var enable_openmp             enable_openmp_01
 	add_config_var enable_openmp_as_def      enable_openmp_as_def_01
diff --git a/frame/compat/bla_symv.c b/frame/compat/bla_symv.c
index b0313415b..a90e7c0c9 100644
--- a/frame/compat/bla_symv.c
+++ b/frame/compat/bla_symv.c
@@ -112,10 +112,12 @@ void PASTEF77(ch,blasname) \
 }
 
 #ifdef BLIS_ENABLE_BLAS
-#ifdef BLIS_ENABLE_SCALAPACK_COMPAT
-INSERT_GENTFUNCRO_BLAS( symv, symv )
-#else
-INSERT_GENTFUNC_BLAS( symv, symv )
+GENTFUNC( float,    s, symv, symv )
+GENTFUNC( double,   d, symv, symv )
+#ifndef BLIS_DISABLE_CSYMV
+GENTFUNC( scomplex, c, symv, symv )
+#endif
+#ifndef BLIS_DISABLE_ZSYMV
+GENTFUNC( dcomplex, z, symv, symv )
 #endif
 #endif
-
diff --git a/frame/compat/bla_symv.h b/frame/compat/bla_symv.h
index 8c2992bb0..a9af785a5 100644
--- a/frame/compat/bla_symv.h
+++ b/frame/compat/bla_symv.h
@@ -32,14 +32,10 @@
 
 */
 
-#if 1
-
 //
 // Prototype BLAS-to-BLIS interfaces.
 //
 #undef  GENTPROT
-#undef  GENTPROTRO
-#define GENTPROTRO GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
@@ -54,12 +50,12 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
      );
 
 #ifdef BLIS_ENABLE_BLAS
-#ifdef BLIS_ENABLE_SCALAPACK_COMPAT
-INSERT_GENTPROTRO_BLAS( symv )
-#else
-INSERT_GENTPROT_BLAS( symv )
+GENTPROT( float,    s, symv )
+GENTPROT( double,   d, symv )
+#ifndef BLIS_DISABLE_CSYMV
+GENTPROT( scomplex, c, symv )
 #endif
+#ifndef BLIS_DISABLE_ZSYMV
+GENTPROT( dcomplex, z, symv )
 #endif
-
 #endif
-
diff --git a/frame/compat/bla_syr.c b/frame/compat/bla_syr.c
index aeceaa225..3cc594757 100644
--- a/frame/compat/bla_syr.c
+++ b/frame/compat/bla_syr.c
@@ -103,10 +103,12 @@ void PASTEF77(ch,blasname) \
 }
 
 #ifdef BLIS_ENABLE_BLAS
-#ifdef BLIS_ENABLE_SCALAPACK_COMPAT
-INSERT_GENTFUNCRO_BLAS( syr, syr )
-#else
-INSERT_GENTFUNC_BLAS( syr, syr )
+GENTFUNC( float,    s, syr, syr )
+GENTFUNC( double,   d, syr, syr )
+#ifndef BLIS_DISABLE_CSYR
+GENTFUNC( scomplex, c, syr, syr )
+#endif
+#ifndef BLIS_DISABLE_ZSYR
+GENTFUNC( dcomplex, z, syr, syr )
 #endif
 #endif
-
diff --git a/frame/compat/bla_syr.h b/frame/compat/bla_syr.h
index 3f15502de..b55da9af8 100644
--- a/frame/compat/bla_syr.h
+++ b/frame/compat/bla_syr.h
@@ -32,14 +32,10 @@
 
 */
 
-#if 1
-
 //
 // Prototype BLAS-to-BLIS interfaces.
 //
 #undef  GENTPROT
-#undef  GENTPROTRO
-#define GENTPROTRO GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
@@ -52,12 +48,12 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
      );
 
 #ifdef BLIS_ENABLE_BLAS
-#ifdef BLIS_ENABLE_SCALAPACK_COMPAT
-INSERT_GENTPROTRO_BLAS( syr )
-#else
-INSERT_GENTPROT_BLAS( syr )
+GENTPROT( float,    s, syr )
+GENTPROT( double,   d, syr )
+#ifndef BLIS_DISABLE_CSYR
+GENTPROT( scomplex, c, syr )
 #endif
+#ifndef BLIS_DISABLE_ZSYR
+GENTPROT( dcomplex, z, syr )
 #endif
-
 #endif
-
diff --git a/frame/compat/bla_syr2.c b/frame/compat/bla_syr2.c
index 66de19221..a69409f04 100644
--- a/frame/compat/bla_syr2.c
+++ b/frame/compat/bla_syr2.c
@@ -39,8 +39,6 @@
 // Define BLAS-to-BLIS interfaces.
 //
 #undef  GENTFUNC
-#undef  GENTFUNCRO
-#define GENTFUNCRO GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
 void PASTEF77(ch,blasname) \
@@ -111,10 +109,12 @@ void PASTEF77(ch,blasname) \
 }
 
 #ifdef BLIS_ENABLE_BLAS
-#ifdef BLIS_ENABLE_SCALAPACK_COMPAT
-INSERT_GENTFUNCRO_BLAS( syr2, syr2 )
-#else
-INSERT_GENTFUNC_BLAS( syr2, syr2 )
+GENTFUNC( float,    s, syr2, syr2 )
+GENTFUNC( double,   d, syr2, syr2 )
+#ifndef BLIS_DISABLE_CSYR2
+GENTFUNC( scomplex, c, syr2, syr2 )
+#endif
+#ifndef BLIS_DISABLE_ZSYR2
+GENTFUNC( dcomplex, z, syr2, syr2 )
 #endif
 #endif
-
diff --git a/frame/compat/bla_syr2.h b/frame/compat/bla_syr2.h
index e7e0dc826..4d9021764 100644
--- a/frame/compat/bla_syr2.h
+++ b/frame/compat/bla_syr2.h
@@ -32,14 +32,10 @@
 
 */
 
-#if 1
-
 //
 // Prototype BLAS-to-BLIS interfaces.
 //
 #undef  GENTPROT
-#undef  GENTPROTRO
-#define GENTPROTRO GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
@@ -53,12 +49,12 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
      );
 
 #ifdef BLIS_ENABLE_BLAS
-#ifdef BLIS_ENABLE_SCALAPACK_COMPAT
-INSERT_GENTPROTRO_BLAS( syr2 )
-#else
-INSERT_GENTPROT_BLAS( syr2 )
+GENTPROT( float,    s, syr2 )
+GENTPROT( double,   d, syr2 )
+#ifndef BLIS_DISABLE_CSYR2
+GENTPROT( scomplex, c, syr2 )
 #endif
+#ifndef BLIS_DISABLE_ZSYR2
+GENTPROT( dcomplex, z, syr2 )
 #endif
-
 #endif
-
diff --git a/frame/compat/f2c/bla_lsame.c b/frame/compat/f2c/bla_lsame.c
index 8fdc7dfd8..f2e12bee4 100644
--- a/frame/compat/f2c/bla_lsame.c
+++ b/frame/compat/f2c/bla_lsame.c
@@ -41,6 +41,7 @@
 	-lf2c -lm   (in that order)
 */
 
+#ifndef BLIS_DISABLE_LSAME
 
 #ifdef LAPACK_ILP64
 long PASTEF77(lsame)(const char *ca, const char *cb, long ca_len, long cb_len)
@@ -151,4 +152,5 @@ int PASTEF77(lsame)(const char *ca, const char *cb, int ca_len, int cb_len)
 } /* lsame */
 
 #endif
+#endif
 
diff --git a/frame/compat/f2c/bla_lsame.h b/frame/compat/f2c/bla_lsame.h
index 83acd7d76..04a919eaa 100644
--- a/frame/compat/f2c/bla_lsame.h
+++ b/frame/compat/f2c/bla_lsame.h
@@ -32,7 +32,13 @@
 
 */
 
-#if 1
+// NOTE: We prototype lsame_() and xerbla_() even when those symbols are
+// omitted via the --omit-symbols=LIST configure option. This is done because
+// the BLAS compatibility layer calls lsame_() and xerbla_() within its _check()
+// functions, and we prefer to give the compiler a chance to do type checking
+// against a function prototype (and *not* output a warning) even if those
+// routines are not going to be compiled within BLIS.
+//#ifndef BLIS_DISABLE_LSAME
 
 #ifdef LAPACK_ILP64
 long PASTEF77(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
@@ -40,4 +46,4 @@ long PASTEF77(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
 BLIS_EXPORT_BLAS int PASTEF77(lsame)(const char *ca, const char *cb, int ca_len, int cb_len);
 #endif
 
-#endif
+//#endif
diff --git a/frame/compat/f2c/bla_rot.c b/frame/compat/f2c/bla_rot.c
index 0dbd720d2..cd41a2fb9 100644
--- a/frame/compat/f2c/bla_rot.c
+++ b/frame/compat/f2c/bla_rot.c
@@ -360,6 +360,7 @@
 } /* zdrot_ */
 
 
+#ifndef BLIS_DISABLE_CROT
 /* crot.f -- translated by f2c (version 20100827).
    You must link the resulting object file with libf2c:
 	on Microsoft Windows system, link with libf2c.lib;
@@ -597,8 +598,10 @@
     }
     return 0;
 } /* crot_ */
+#endif
 
 
+#ifndef BLIS_DISABLE_ZROT
 /* zrot.f -- translated by f2c (version 20100827).
    You must link the resulting object file with libf2c:
 	on Microsoft Windows system, link with libf2c.lib;
@@ -836,6 +839,7 @@
     }
     return 0;
 } /* zrot_ */
+#endif
 
 
 #endif
diff --git a/frame/compat/f2c/bla_rot.h b/frame/compat/f2c/bla_rot.h
index 4e6aead4a..5260f949c 100644
--- a/frame/compat/f2c/bla_rot.h
+++ b/frame/compat/f2c/bla_rot.h
@@ -32,13 +32,14 @@
 
 */
 
-#if 1
-
 BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s);
 BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s);
 BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s);
 BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s);
+#ifndef BLIS_DISABLE_CROT
 BLIS_EXPORT_BLAS int PASTEF77(c,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_scomplex *s);
+#endif
+#ifndef BLIS_DISABLE_ZROT
 BLIS_EXPORT_BLAS int PASTEF77(z,rot)(const bla_integer *n, bla_dcomplex *cx, const bla_integer *incx, bla_dcomplex *cy, const bla_integer *incy, const bla_double *c__, const bla_dcomplex *s);
-
 #endif
+
diff --git a/frame/compat/f2c/bla_xerbla.c b/frame/compat/f2c/bla_xerbla.c
index 991ef00d0..d8fc44d36 100644
--- a/frame/compat/f2c/bla_xerbla.c
+++ b/frame/compat/f2c/bla_xerbla.c
@@ -43,6 +43,7 @@
 
 /* Table of constant values */
 
+#ifndef BLIS_DISABLE_XERBLA
 /* Subroutine */ int PASTEF77(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len)
 {
 /*  -- LAPACK auxiliary routine (preliminary version) -- */
@@ -88,4 +89,5 @@
 } /* xerbla */
 
 #endif
+#endif
 
diff --git a/frame/compat/f2c/bla_xerbla.h b/frame/compat/f2c/bla_xerbla.h
index 6824a5688..985237914 100644
--- a/frame/compat/f2c/bla_xerbla.h
+++ b/frame/compat/f2c/bla_xerbla.h
@@ -32,8 +32,14 @@
 
 */
 
-#if 1
+// NOTE: We prototype lsame_() and xerbla_() even when those symbols are
+// omitted via the --omit-symbols=LIST configure option. This is done because
+// the BLAS compatibility layer calls lsame_() and xerbla_() within its _check()
+// functions, and we prefer to give the compiler a chance to do type checking
+// against a function prototype (and *not* output a warning) even if those
+// routines are not going to be compiled within BLIS.
+//#ifndef BLIS_DISABLE_XERBLA
 
 BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF77(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);
 
-#endif
+//#endif
diff --git a/frame/compat/f2c/bla_xerbla_array.c b/frame/compat/f2c/bla_xerbla_array.c
index b69775d3b..1c2c77dff 100644
--- a/frame/compat/f2c/bla_xerbla_array.c
+++ b/frame/compat/f2c/bla_xerbla_array.c
@@ -38,6 +38,7 @@
 
 #define MAX_NUM_CHARS 32
 
+#ifndef BLIS_DISABLE_XERBLA_ARRAY
 int PASTEF77(xerbla_array)(const bla_character *srname_array, const bla_integer srname_len, const bla_integer *info)
 {
 	int  i;
@@ -71,4 +72,5 @@ int PASTEF77(xerbla_array)(const bla_character *srname_array, const bla_integer
 }
 
 #endif
+#endif
 
diff --git a/frame/compat/f2c/bla_xerbla_array.h b/frame/compat/f2c/bla_xerbla_array.h
index 4684b942f..05b674738 100644
--- a/frame/compat/f2c/bla_xerbla_array.h
+++ b/frame/compat/f2c/bla_xerbla_array.h
@@ -32,7 +32,7 @@
 
 */
 
-#if 1
+#ifndef BLIS_DISABLE_XERBLA_ARRAY
 
 BLIS_EXPORT_BLAS int PASTEF77(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);
 

From 50b7117e4bc4a2c675002529b79823ebcca3160e Mon Sep 17 00:00:00 2001
From: Michael Yeh <111819036+myeh01@users.noreply.github.com>
Date: Thu, 28 Nov 2024 16:05:59 -0800
Subject: [PATCH 203/230] Use intrinsics for all sifive_x280 kernels (#822)

Details:
- Replace all assembly kernels in the `sifive_x280` kernel set with
  intrinsic versions.
- Fixes the bug encountered in #805.
- Update the RISC-V toolchain used in CI testing.
- Special thanks to Michael Yeh (@myeh01) and SiFive.
---
 .../sifive_x280/bli_cntx_init_sifive_x280.c   |  106 +-
 config/sifive_x280/make_defs.mk               |    2 +-
 .../sifive_x280/1/bli_amaxv_sifive_x280_asm.c |  293 --
 .../bli_amaxv_sifive_x280_intr.c              |  179 +
 .../bli_amaxv_sifive_x280_intr_complex.c      |  105 +
 .../bli_amaxv_sifive_x280_intr_real.c         |  100 +
 .../bli_axpbyv_sifive_x280_intr.c             |    4 +-
 .../sifive_x280/1/bli_copyv_sifive_x280_asm.c |  272 --
 .../bli_copyv_sifive_x280_intr.c              |  116 +
 .../bli_copyv_sifive_x280_intr_complex.c      |   75 +
 .../bli_copyv_sifive_x280_intr_real.c         |   68 +
 .../1/bli_invertv_sifive_x280_asm.c           |  221 --
 .../bli_invertv_sifive_x280_intr.c            |  118 +
 .../bli_invertv_sifive_x280_intr_complex.c    |   83 +
 .../bli_invertv_sifive_x280_intr_real.c       |   68 +
 .../1/bli_invscalv_sifive_x280_asm.c          |  266 --
 .../bli_invscalv_sifive_x280_intr.c           |  117 +
 .../bli_invscalv_sifive_x280_intr_complex.c   |   83 +
 .../bli_invscalv_sifive_x280_intr_real.c      |   75 +
 .../bli_scal2v_sifive_x280_intr.c             |    4 +-
 .../bli_scal2v_sifive_x280_intr_complex.c     |    6 +-
 .../bli_scalv_sifive_x280_intr.c              |    2 +-
 .../sifive_x280/1/bli_setv_sifive_x280_asm.c  |  204 --
 .../bli_setv_sifive_x280_intr.c               |  116 +
 .../bli_setv_sifive_x280_intr_complex.c       |   71 +
 .../bli_setv_sifive_x280_intr_real.c          |   64 +
 .../sifive_x280/1/bli_swapv_sifive_x280_asm.c |  245 --
 .../bli_swapv_sifive_x280_intr.c              |  115 +
 .../bli_swapv_sifive_x280_intr_complex.c      |   76 +
 .../bli_swapv_sifive_x280_intr_real.c         |   76 +
 .../bli_xpbyv_sifive_x280_intr.c              |    2 +-
 .../1f/bli_axpyf_sifive_x280_asm.c            |  430 ---
 .../bli_axpyf_sifive_x280_intr.c              |  121 +
 .../bli_axpyf_sifive_x280_intr_complex.c      |  149 +
 .../bli_axpyf_sifive_x280_intr_real.c         |   96 +
 .../1f/bli_dotxaxpyf_sifive_x280_asm.c        | 3120 -----------------
 .../bli_dotxaxpyf_sifive_x280_intr.c          |  137 +
 .../bli_dotxaxpyf_sifive_x280_intr_complex.c  |  427 +++
 .../bli_dotxaxpyf_sifive_x280_intr_real.c     |  283 ++
 .../1f/bli_dotxf_sifive_x280_asm.c            | 2645 --------------
 .../bli_dotxf_sifive_x280_intr.c              |  132 +
 .../bli_dotxf_sifive_x280_intr_complex.c      |  324 ++
 .../bli_dotxf_sifive_x280_intr_real.c         |  262 ++
 .../1m/bli_packm_sifive_x280_asm.c            | 1465 --------
 .../bli_packm_sifive_x280_intr.c              |  168 +
 .../bli_packm_sifive_x280_intr_complex.c      |  545 +++
 .../bli_packm_sifive_x280_intr_real.c         |  364 ++
 .../sifive_x280/3/bli_gemm_sifive_x280_asm.c  | 2406 -------------
 .../bli_gemm_sifive_x280_intr.c               |  138 +
 .../bli_gemm_sifive_x280_intr_complex.c       |  517 +++
 .../bli_gemm_sifive_x280_intr_real.c          |  339 ++
 .../bli_gemmtrsm_l_sifive_x280_asm_complex.c  |  327 --
 .../bli_gemmtrsm_l_sifive_x280_asm_real.c     |  253 --
 .../bli_gemmtrsm_u_sifive_x280_asm_complex.c  |  331 --
 .../bli_gemmtrsm_u_sifive_x280_asm_real.c     |  260 --
 .../bli_gemmtrsm_sifive_x280_intr.c}          |   97 +-
 .../bli_gemmtrsm_sifive_x280_intr_complex.c   |  437 +++
 .../bli_gemmtrsm_sifive_x280_intr_real.c      |  364 ++
 kernels/sifive_x280/bli_kernels_sifive_x280.h |  106 +-
 kernels/sifive_x280/riscv_cmul_macros_intr.h  |  147 +
 .../sifive_x280/riscv_overloaded_intrinsics.h |  109 +-
 travis/do_riscv.sh                            |    2 +-
 62 files changed, 6902 insertions(+), 12931 deletions(-)
 delete mode 100644 kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c
 delete mode 100644 kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c
 delete mode 100644 kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c
 delete mode 100644 kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c
 delete mode 100644 kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c
 delete mode 100644 kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c
 delete mode 100644 kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c
 delete mode 100644 kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c
 delete mode 100644 kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c
 delete mode 100644 kernels/sifive_x280/1m/bli_packm_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c
 delete mode 100644 kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c
 create mode 100644 kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c
 create mode 100644 kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c
 delete mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c
 delete mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c
 delete mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c
 delete mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c
 rename kernels/sifive_x280/3/{bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c => bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c} (75%)
 create mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c
 create mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c
 create mode 100644 kernels/sifive_x280/riscv_cmul_macros_intr.h

diff --git a/config/sifive_x280/bli_cntx_init_sifive_x280.c b/config/sifive_x280/bli_cntx_init_sifive_x280.c
index 56a1a66d5..668891cf3 100644
--- a/config/sifive_x280/bli_cntx_init_sifive_x280.c
+++ b/config/sifive_x280/bli_cntx_init_sifive_x280.c
@@ -54,10 +54,10 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx )
 	  BLIS_ADDV_KER,       BLIS_SCOMPLEX, bli_caddv_sifive_x280_intr,
 	  BLIS_ADDV_KER,       BLIS_DCOMPLEX, bli_zaddv_sifive_x280_intr,
 
-	  BLIS_AMAXV_KER,      BLIS_FLOAT,    bli_samaxv_sifive_x280_asm,
-	  BLIS_AMAXV_KER,      BLIS_DOUBLE,   bli_damaxv_sifive_x280_asm,
-	  BLIS_AMAXV_KER,      BLIS_SCOMPLEX, bli_camaxv_sifive_x280_asm,
-	  BLIS_AMAXV_KER,      BLIS_DCOMPLEX, bli_zamaxv_sifive_x280_asm,
+	  BLIS_AMAXV_KER,      BLIS_FLOAT,    bli_samaxv_sifive_x280_intr,
+	  BLIS_AMAXV_KER,      BLIS_DOUBLE,   bli_damaxv_sifive_x280_intr,
+	  BLIS_AMAXV_KER,      BLIS_SCOMPLEX, bli_camaxv_sifive_x280_intr,
+	  BLIS_AMAXV_KER,      BLIS_DCOMPLEX, bli_zamaxv_sifive_x280_intr,
 
 	  BLIS_AXPBYV_KER,     BLIS_FLOAT,    bli_saxpbyv_sifive_x280_intr,
 	  BLIS_AXPBYV_KER,     BLIS_DOUBLE,   bli_daxpbyv_sifive_x280_intr,
@@ -69,10 +69,10 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx )
 	  BLIS_AXPYV_KER,      BLIS_SCOMPLEX, bli_caxpyv_sifive_x280_intr,
 	  BLIS_AXPYV_KER,      BLIS_DCOMPLEX, bli_zaxpyv_sifive_x280_intr,
 
-	  BLIS_COPYV_KER,      BLIS_FLOAT,    bli_scopyv_sifive_x280_asm,
-	  BLIS_COPYV_KER,      BLIS_DOUBLE,   bli_dcopyv_sifive_x280_asm,
-	  BLIS_COPYV_KER,      BLIS_SCOMPLEX, bli_ccopyv_sifive_x280_asm,
-	  BLIS_COPYV_KER,      BLIS_DCOMPLEX, bli_zcopyv_sifive_x280_asm,
+	  BLIS_COPYV_KER,      BLIS_FLOAT,    bli_scopyv_sifive_x280_intr,
+	  BLIS_COPYV_KER,      BLIS_DOUBLE,   bli_dcopyv_sifive_x280_intr,
+	  BLIS_COPYV_KER,      BLIS_SCOMPLEX, bli_ccopyv_sifive_x280_intr,
+	  BLIS_COPYV_KER,      BLIS_DCOMPLEX, bli_zcopyv_sifive_x280_intr,
 
 	  BLIS_DOTV_KER,       BLIS_FLOAT,    bli_sdotv_sifive_x280_intr,
 	  BLIS_DOTV_KER,       BLIS_DOUBLE,   bli_ddotv_sifive_x280_intr,
@@ -84,15 +84,15 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx )
 	  BLIS_DOTXV_KER,      BLIS_SCOMPLEX, bli_cdotxv_sifive_x280_intr,
 	  BLIS_DOTXV_KER,      BLIS_DCOMPLEX, bli_zdotxv_sifive_x280_intr,
 
-	  BLIS_INVERTV_KER,    BLIS_FLOAT,    bli_sinvertv_sifive_x280_asm,
-	  BLIS_INVERTV_KER,    BLIS_DOUBLE,   bli_dinvertv_sifive_x280_asm,
-	  BLIS_INVERTV_KER,    BLIS_SCOMPLEX, bli_cinvertv_sifive_x280_asm,
-	  BLIS_INVERTV_KER,    BLIS_DCOMPLEX, bli_zinvertv_sifive_x280_asm,
+	  BLIS_INVERTV_KER,    BLIS_FLOAT,    bli_sinvertv_sifive_x280_intr,
+	  BLIS_INVERTV_KER,    BLIS_DOUBLE,   bli_dinvertv_sifive_x280_intr,
+	  BLIS_INVERTV_KER,    BLIS_SCOMPLEX, bli_cinvertv_sifive_x280_intr,
+	  BLIS_INVERTV_KER,    BLIS_DCOMPLEX, bli_zinvertv_sifive_x280_intr,
 
-	  BLIS_INVSCALV_KER,   BLIS_FLOAT,    bli_sinvscalv_sifive_x280_asm,
-	  BLIS_INVSCALV_KER,   BLIS_DOUBLE,   bli_dinvscalv_sifive_x280_asm,
-	  BLIS_INVSCALV_KER,   BLIS_SCOMPLEX, bli_cinvscalv_sifive_x280_asm,
-	  BLIS_INVSCALV_KER,   BLIS_DCOMPLEX, bli_zinvscalv_sifive_x280_asm,
+	  BLIS_INVSCALV_KER,   BLIS_FLOAT,    bli_sinvscalv_sifive_x280_intr,
+	  BLIS_INVSCALV_KER,   BLIS_DOUBLE,   bli_dinvscalv_sifive_x280_intr,
+	  BLIS_INVSCALV_KER,   BLIS_SCOMPLEX, bli_cinvscalv_sifive_x280_intr,
+	  BLIS_INVSCALV_KER,   BLIS_DCOMPLEX, bli_zinvscalv_sifive_x280_intr,
 
 	  BLIS_SCAL2V_KER,     BLIS_FLOAT,    bli_sscal2v_sifive_x280_intr,
 	  BLIS_SCAL2V_KER,     BLIS_DOUBLE,   bli_dscal2v_sifive_x280_intr,
@@ -104,20 +104,20 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx )
 	  BLIS_SCALV_KER,      BLIS_SCOMPLEX, bli_cscalv_sifive_x280_intr,
 	  BLIS_SCALV_KER,      BLIS_DCOMPLEX, bli_zscalv_sifive_x280_intr,
 
-	  BLIS_SETV_KER,       BLIS_FLOAT,    bli_ssetv_sifive_x280_asm,
-	  BLIS_SETV_KER,       BLIS_DOUBLE,   bli_dsetv_sifive_x280_asm,
-	  BLIS_SETV_KER,       BLIS_SCOMPLEX, bli_csetv_sifive_x280_asm,
-	  BLIS_SETV_KER,       BLIS_DCOMPLEX, bli_zsetv_sifive_x280_asm,
+	  BLIS_SETV_KER,       BLIS_FLOAT,    bli_ssetv_sifive_x280_intr,
+	  BLIS_SETV_KER,       BLIS_DOUBLE,   bli_dsetv_sifive_x280_intr,
+	  BLIS_SETV_KER,       BLIS_SCOMPLEX, bli_csetv_sifive_x280_intr,
+	  BLIS_SETV_KER,       BLIS_DCOMPLEX, bli_zsetv_sifive_x280_intr,
 
 	  BLIS_SUBV_KER,       BLIS_FLOAT,    bli_ssubv_sifive_x280_intr,
 	  BLIS_SUBV_KER,       BLIS_DOUBLE,   bli_dsubv_sifive_x280_intr,
 	  BLIS_SUBV_KER,       BLIS_SCOMPLEX, bli_csubv_sifive_x280_intr,
 	  BLIS_SUBV_KER,       BLIS_DCOMPLEX, bli_zsubv_sifive_x280_intr,
 
-	  BLIS_SWAPV_KER,      BLIS_FLOAT,    bli_sswapv_sifive_x280_asm,
-	  BLIS_SWAPV_KER,      BLIS_DOUBLE,   bli_dswapv_sifive_x280_asm,
-	  BLIS_SWAPV_KER,      BLIS_SCOMPLEX, bli_cswapv_sifive_x280_asm,
-	  BLIS_SWAPV_KER,      BLIS_DCOMPLEX, bli_zswapv_sifive_x280_asm,
+	  BLIS_SWAPV_KER,      BLIS_FLOAT,    bli_sswapv_sifive_x280_intr,
+	  BLIS_SWAPV_KER,      BLIS_DOUBLE,   bli_dswapv_sifive_x280_intr,
+	  BLIS_SWAPV_KER,      BLIS_SCOMPLEX, bli_cswapv_sifive_x280_intr,
+	  BLIS_SWAPV_KER,      BLIS_DCOMPLEX, bli_zswapv_sifive_x280_intr,
 
 	  BLIS_XPBYV_KER,      BLIS_FLOAT,    bli_sxpbyv_sifive_x280_intr,
 	  BLIS_XPBYV_KER,      BLIS_DOUBLE,   bli_dxpbyv_sifive_x280_intr,
@@ -130,46 +130,46 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx )
 	  BLIS_AXPY2V_KER,     BLIS_SCOMPLEX, bli_caxpy2v_sifive_x280_intr,
 	  BLIS_AXPY2V_KER,     BLIS_DCOMPLEX, bli_zaxpy2v_sifive_x280_intr,
 
-	  BLIS_AXPYF_KER,      BLIS_FLOAT,    bli_saxpyf_sifive_x280_asm,
-	  BLIS_AXPYF_KER,      BLIS_DOUBLE,   bli_daxpyf_sifive_x280_asm,
-	  BLIS_AXPYF_KER,      BLIS_SCOMPLEX, bli_caxpyf_sifive_x280_asm,
-	  BLIS_AXPYF_KER,      BLIS_DCOMPLEX, bli_zaxpyf_sifive_x280_asm,
+	  BLIS_AXPYF_KER,      BLIS_FLOAT,    bli_saxpyf_sifive_x280_intr,
+	  BLIS_AXPYF_KER,      BLIS_DOUBLE,   bli_daxpyf_sifive_x280_intr,
+	  BLIS_AXPYF_KER,      BLIS_SCOMPLEX, bli_caxpyf_sifive_x280_intr,
+	  BLIS_AXPYF_KER,      BLIS_DCOMPLEX, bli_zaxpyf_sifive_x280_intr,
 
-	  BLIS_DOTXF_KER,      BLIS_FLOAT,    bli_sdotxf_sifive_x280_asm,
-	  BLIS_DOTXF_KER,      BLIS_DOUBLE,   bli_ddotxf_sifive_x280_asm,
-	  BLIS_DOTXF_KER,      BLIS_SCOMPLEX, bli_cdotxf_sifive_x280_asm,
-	  BLIS_DOTXF_KER,      BLIS_DCOMPLEX, bli_zdotxf_sifive_x280_asm,
+	  BLIS_DOTXF_KER,      BLIS_FLOAT,    bli_sdotxf_sifive_x280_intr,
+	  BLIS_DOTXF_KER,      BLIS_DOUBLE,   bli_ddotxf_sifive_x280_intr,
+	  BLIS_DOTXF_KER,      BLIS_SCOMPLEX, bli_cdotxf_sifive_x280_intr,
+	  BLIS_DOTXF_KER,      BLIS_DCOMPLEX, bli_zdotxf_sifive_x280_intr,
 
 	  BLIS_DOTAXPYV_KER,   BLIS_FLOAT,    bli_sdotaxpyv_sifive_x280_intr,
 	  BLIS_DOTAXPYV_KER,   BLIS_DOUBLE,   bli_ddotaxpyv_sifive_x280_intr,
 	  BLIS_DOTAXPYV_KER,   BLIS_SCOMPLEX, bli_cdotaxpyv_sifive_x280_intr,
 	  BLIS_DOTAXPYV_KER,   BLIS_DCOMPLEX, bli_zdotaxpyv_sifive_x280_intr,
 
-	  BLIS_DOTXAXPYF_KER,  BLIS_FLOAT,    bli_sdotxaxpyf_sifive_x280_asm,
-	  BLIS_DOTXAXPYF_KER,  BLIS_DOUBLE,   bli_ddotxaxpyf_sifive_x280_asm,
-	  BLIS_DOTXAXPYF_KER,  BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_x280_asm,
-	  BLIS_DOTXAXPYF_KER,  BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_x280_asm,
+	  BLIS_DOTXAXPYF_KER,  BLIS_FLOAT,    bli_sdotxaxpyf_sifive_x280_intr,
+	  BLIS_DOTXAXPYF_KER,  BLIS_DOUBLE,   bli_ddotxaxpyf_sifive_x280_intr,
+	  BLIS_DOTXAXPYF_KER,  BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_x280_intr,
+	  BLIS_DOTXAXPYF_KER,  BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_x280_intr,
 
 	  // Level 1m
-	  BLIS_PACKM_KER,      BLIS_FLOAT,    bli_spackm_sifive_x280_asm_7m4,
-	  BLIS_PACKM_KER,      BLIS_DOUBLE,   bli_dpackm_sifive_x280_asm_7m4,
-	  BLIS_PACKM_KER,      BLIS_SCOMPLEX, bli_cpackm_sifive_x280_asm_6m2,
-	  BLIS_PACKM_KER,      BLIS_DCOMPLEX, bli_zpackm_sifive_x280_asm_6m2,
+	  BLIS_PACKM_KER,      BLIS_FLOAT,    bli_spackm_sifive_x280_intr,
+	  BLIS_PACKM_KER,      BLIS_DOUBLE,   bli_dpackm_sifive_x280_intr,
+	  BLIS_PACKM_KER,      BLIS_SCOMPLEX, bli_cpackm_sifive_x280_intr,
+	  BLIS_PACKM_KER,      BLIS_DCOMPLEX, bli_zpackm_sifive_x280_intr,
 
 	  // Level 3
-	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_sifive_x280_asm_7m4,
-	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_sifive_x280_asm_7m4,
-	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_sifive_x280_asm_6m2,
-	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_sifive_x280_asm_6m2,
-
-	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_sifive_x280_asm,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_sifive_x280_asm,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_x280_asm,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_x280_asm,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_sifive_x280_asm,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_sifive_x280_asm,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_x280_asm,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_x280_asm,
+	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_sifive_x280_intr,
+	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_sifive_x280_intr,
+	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_sifive_x280_intr,
+	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_sifive_x280_intr,
+
+	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_sifive_x280_intr,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_sifive_x280_intr,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_x280_intr,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_x280_intr,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_sifive_x280_intr,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_sifive_x280_intr,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_x280_intr,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_x280_intr,
 
 	  BLIS_VA_END
 	);
diff --git a/config/sifive_x280/make_defs.mk b/config/sifive_x280/make_defs.mk
index acdf5a361..31b31e387 100644
--- a/config/sifive_x280/make_defs.mk
+++ b/config/sifive_x280/make_defs.mk
@@ -61,7 +61,7 @@ endif
 ifeq ($(DEBUG_TYPE),noopt)
 COPTFLAGS      := -O0
 else
-COPTFLAGS      := -Ofast
+COPTFLAGS      := -O3
 endif
 
 # Flags specific to optimized kernels.
diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c
deleted file mode 100644
index c423dd131..000000000
--- a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c
+++ /dev/null
@@ -1,293 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include <math.h>
-#include <riscv_vector.h>
-#include <stdbool.h>
-#include <stddef.h>
-
-void bli_samaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
-                     dim_t *index, const cntx_t *cntx) {
-    // assumes 64-bit index
-    (void)cntx;
-    const float* restrict x = x_;
-
-    if (n <= 1) {
-        *index = 0;
-        return;
-    }
-    incx *= 4;
-    size_t avl = n;
-    size_t offset = 0;
-    bool first = true;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e32, m4, tu, ma"
-                         : "=r"(vl)
-                         : "r"(avl));
-        if (incx == 4)
-            __asm__("vle32.v v24, (%0)" : : "r"(x));
-        else
-            __asm__("vlse32.v v24, (%0), %1" : : "r"(x), "r"(incx));
-        // check for NaN
-        __asm__ volatile("vmfne.vv v0, v24, v24");
-        dim_t nan_index;
-        __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index));
-        if (nan_index != -1) {
-            *index = nan_index + offset;
-            return;
-        }
-        if (first) {
-            __asm__("vfabs.v v8, v24");
-            // keep vl same, change SEW and LMUL
-            __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma");
-            __asm__("vid.v v16");
-            first = false;
-        } else {
-            __asm__("vfabs.v v24, v24");
-            __asm__("vmflt.vv v0, v8, v24");
-            __asm__("vmerge.vvm v8, v8, v24, v0");
-            // keep vl same, change SEW and LMUL
-            __asm__ volatile("vsetvli zero, zero, e64, m8, tu, ma");
-            __asm__("vid.v v24");
-            __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
-            __asm__("vmerge.vvm v16, v16, v24, v0");
-        }
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        offset += vl;
-        avl -= vl;
-    }
-    __asm__ volatile("vsetvli zero, %0, e32, m4, ta, ma" : : "r"(n));
-    __asm__("vmv.s.x v0, zero");
-    __asm__("vfredmax.vs v0, v8, v0");
-    __asm__("vrgather.vi v24, v0, 0");
-    __asm__("vmfeq.vv v0, v8, v24");
-    __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma");
-    uint64_t imax = -1;
-    __asm__("vmv.s.x v24, %0" : : "r"(imax));
-    __asm__("vredminu.vs v24, v16, v24, v0.t");
-    __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma");
-    __asm__("vse64.v v24, (%0)" : : "r"(index));
-    return;
-}
-
-void bli_damaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
-                     dim_t *index, const cntx_t *cntx) {
-    // assumes 64-bit index
-    (void)cntx;
-    const double* restrict x = x_;
-
-    if (n <= 1) {
-        *index = 0;
-        return;
-    }
-    incx *= 8;
-    size_t avl = n;
-    size_t offset = 0;
-    bool first = true;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e64, m8, tu, ma"
-                         : "=r"(vl)
-                         : "r"(avl));
-        if (incx == 8)
-            __asm__("vle64.v v24, (%0)" : : "r"(x));
-        else
-            __asm__("vlse64.v v24, (%0), %1" : : "r"(x), "r"(incx));
-        // check for NaN
-        __asm__ volatile("vmfne.vv v0, v24, v24");
-        dim_t nan_index;
-        __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index));
-        if (nan_index != -1) {
-            *index = nan_index + offset;
-            return;
-        }
-        if (first) {
-            __asm__("vfabs.v v8, v24");
-            __asm__("vid.v v16");
-            first = false;
-        } else {
-            __asm__("vfabs.v v24, v24");
-            __asm__("vmflt.vv v0, v8, v24");
-            __asm__("vmerge.vvm v8, v8, v24, v0");
-            __asm__("vid.v v24");
-            __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
-            __asm__("vmerge.vvm v16, v16, v24, v0");
-        }
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        offset += vl;
-        avl -= vl;
-    }
-    __asm__ volatile("vsetvli zero, %0, e64, m8, ta, ma" : : "r"(n));
-    __asm__("vmv.s.x v0, zero");
-    __asm__("vfredmax.vs v0, v8, v0");
-    __asm__("vrgather.vi v24, v0, 0");
-    __asm__("vmfeq.vv v0, v8, v24");
-    uint64_t imax = -1;
-    __asm__("vmv.s.x v24, %0" : : "r"(imax));
-    __asm__("vredminu.vs v24, v16, v24, v0.t");
-    __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma");
-    __asm__("vse64.v v24, (%0)" : : "r"(index));
-    return;
-}
-
-void bli_camaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
-                     dim_t *index, const cntx_t *cntx) {
-    // assumes 64-bit index
-    (void)cntx;
-    const scomplex* restrict x = x_;
-
-    if (n <= 1) {
-        *index = 0;
-        return;
-    }
-    incx *= 8;
-    size_t avl = n;
-    size_t offset = 0;
-    bool first = true;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e32, m4, tu, ma"
-                         : "=r"(vl)
-                         : "r"(avl));
-        if (incx == 8)
-            __asm__("vlseg2e32.v v24, (%0)" : : "r"(x));
-        else
-            __asm__("vlsseg2e32.v v24, (%0), %1" : : "r"(x), "r"(incx));
-        __asm__("vfabs.v v24, v24");
-        __asm__("vfabs.v v28, v28");
-        __asm__("vfadd.vv v24, v24, v28");
-        // check for NaN
-        __asm__ volatile("vmfne.vv v0, v24, v24");
-        dim_t nan_index;
-        __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index));
-        if (nan_index != -1) {
-            *index = nan_index + offset;
-            return;
-        }
-        if (first) {
-            __asm__("vmv4r.v v8, v24");
-            // keep vl same, change SEW and LMUL
-            __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma");
-            __asm__("vid.v v16");
-            first = false;
-        } else {
-            __asm__("vmflt.vv v0, v8, v24");
-            __asm__("vmerge.vvm v8, v8, v24, v0");
-            // keep vl same, change SEW and LMUL
-            __asm__ volatile("vsetvli zero, zero, e64, m8, tu, ma");
-            __asm__("vid.v v24");
-            __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
-            __asm__("vmerge.vvm v16, v16, v24, v0");
-        }
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        offset += vl;
-        avl -= vl;
-    }
-    __asm__ volatile("vsetvli zero, %0, e32, m4, ta, ma" : : "r"(n));
-    __asm__("vmv.s.x v0, zero");
-    __asm__("vfredmax.vs v0, v8, v0");
-    __asm__("vrgather.vi v24, v0, 0");
-    __asm__("vmfeq.vv v0, v8, v24");
-    __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma");
-    uint64_t imax = -1;
-    __asm__("vmv.s.x v24, %0" : : "r"(imax));
-    __asm__("vredminu.vs v24, v16, v24, v0.t");
-    __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma");
-    __asm__("vse64.v v24, (%0)" : : "r"(index));
-    return;
-}
-
-void bli_zamaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
-                     dim_t *index, const cntx_t *cntx) {
-    // assumes 64-bit index
-    (void)cntx;
-    const dcomplex* restrict x = x_;
-
-    if (n <= 1) {
-        *index = 0;
-        return;
-    }
-    incx *= 16;
-    size_t avl = n;
-    size_t offset = 0;
-    bool first = true;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e64, m4, tu, ma"
-                         : "=r"(vl)
-                         : "r"(avl));
-        if (incx == 16)
-            __asm__("vlseg2e64.v v24, (%0)" : : "r"(x));
-        else
-            __asm__("vlsseg2e64.v v24, (%0), %1" : : "r"(x), "r"(incx));
-        __asm__("vfabs.v v24, v24");
-        __asm__("vfabs.v v28, v28");
-        __asm__("vfadd.vv v24, v24, v28");
-        // check for NaN
-        __asm__ volatile("vmfne.vv v0, v24, v24");
-        dim_t nan_index;
-        __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index));
-        if (nan_index != -1) {
-            *index = nan_index + offset;
-            return;
-        }
-        if (first) {
-            __asm__("vmv4r.v v8, v24");
-            __asm__("vid.v v16");
-            first = false;
-        } else {
-            __asm__("vmflt.vv v0, v8, v24");
-            __asm__("vmerge.vvm v8, v8, v24, v0");
-            __asm__("vid.v v24");
-            __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
-            __asm__("vmerge.vvm v16, v16, v24, v0");
-        }
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        offset += vl;
-        avl -= vl;
-    }
-    __asm__ volatile("vsetvli zero, %0, e64, m4, ta, ma" : : "r"(n));
-    __asm__("vmv.s.x v0, zero");
-    __asm__("vfredmax.vs v0, v8, v0");
-    __asm__("vrgather.vi v24, v0, 0");
-    __asm__("vmfeq.vv v0, v8, v24");
-    uint64_t imax = -1;
-    __asm__("vmv.s.x v24, %0" : : "r"(imax));
-    __asm__("vredminu.vs v24, v16, v24, v0.t");
-    __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma");
-    __asm__("vse64.v v24, (%0)" : : "r"(index));
-    return;
-}
diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c
new file mode 100644
index 000000000..4f7d54630
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c
@@ -0,0 +1,179 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#include "../../riscv_overloaded_intrinsics.h"
+#include "blis.h"
+#include <limits.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define AMAXV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##amaxv_sifive_x280_intr(\
+          dim_t            n,              \
+    const T*      restrict x_, inc_t incx, \
+          dim_t*           index,          \
+    const cntx_t*          cntx            \
+)
+
+#define AMAXV(...)  AMAXV_(__VA_ARGS__)
+
+// BLIS defines integers to be 32 or 64 bits according to BLIS_INT_TYPE_SIZE.
+// If BLIS_INT_TYPE_SIZE is any other value, integers are defined to be longs.
+#if BLIS_INT_TYPE_SIZE == 32 || BLIS_INT_TYPE_SIZE == 64 // explicit width requested
+#define AMAXV_SIFIVE_X280_INT_SIZE BLIS_INT_TYPE_SIZE // index width (bits) used by the amaxv kernels
+#elif LONG_MAX == INT32_MAX // fallback: integers are longs, and long is 32 bits here
+#define AMAXV_SIFIVE_X280_INT_SIZE 32
+#elif LONG_MAX == INT64_MAX // fallback: integers are longs, and long is 64 bits here
+#define AMAXV_SIFIVE_X280_INT_SIZE 64
+#else // any other long width is rejected at compile time
+#error "Integers must be 32- or 64-bits for bli_?amaxv_sifive_x280_intr."
+#endif
+
+// Single precision real: instantiates bli_samaxv_sifive_x280_intr from the shared real template.
+#define DATATYPE float // element type of x
+#define PRECISION_CHAR s // pasted into the kernel name by AMAXV_
+#define PREC_X 32 // bit width of one floating-point element
+#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE // bit width of the index vectors
+#if PREC_I == 32
+#define LMUL_X m4 // register grouping for the data vectors
+#define LMUL_I m4 // register grouping for the index vectors
+#define RATIO 8 // 32/4 for both data and indices; selects the mask type RVV_TYPE_B(RATIO)
+#elif PREC_I == 64
+#define LMUL_X m4 // register grouping for the data vectors
+#define LMUL_I m8 // 64-bit indices need twice the grouping to keep the same lane count
+#define RATIO 8 // 32/4 == 64/8
+#endif
+#define FLT_SIZE sizeof(float) // bytes per real element, used to build byte strides
+
+#include "./bli_amaxv_sifive_x280_intr_real.c" // expands the template with the settings above
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC_X
+#undef PREC_I
+#undef LMUL_X
+#undef LMUL_I
+#undef RATIO
+#undef FLT_SIZE
+
+// Double precision real: instantiates bli_damaxv_sifive_x280_intr from the shared real template.
+#define DATATYPE double // element type of x
+#define PRECISION_CHAR d // pasted into the kernel name by AMAXV_
+#define PREC_X 64 // bit width of one floating-point element
+#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE // bit width of the index vectors
+#if PREC_I == 32
+#define LMUL_X m8 // 64-bit data needs twice the grouping of the 32-bit indices
+#define LMUL_I m4 // register grouping for the index vectors
+#define RATIO 8 // 64/8 == 32/4; selects the mask type RVV_TYPE_B(RATIO)
+#elif PREC_I == 64
+#define LMUL_X m4 // register grouping for the data vectors
+#define LMUL_I m4 // same width as the data, so same grouping
+#define RATIO 16 // 64/4 for both data and indices
+#endif
+#define FLT_SIZE sizeof(double) // bytes per real element, used to build byte strides
+
+#include "./bli_amaxv_sifive_x280_intr_real.c" // expands the template with the settings above
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC_X
+#undef PREC_I
+#undef LMUL_X
+#undef LMUL_I
+#undef RATIO
+#undef FLT_SIZE
+
+// Single precision complex: instantiates bli_camaxv_sifive_x280_intr from the shared complex template.
+#define DATATYPE scomplex // element type of x
+#define BASE_DT float // real type of each component; the template casts x to BASE_DT* for loads
+#define PRECISION_CHAR c // pasted into the kernel name by AMAXV_
+#define PREC_X 32 // bit width of one real component
+#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE // bit width of the index vectors
+#if PREC_I == 32
+#define LMUL_X m4 // register grouping for each component vector
+#define LMUL_I m4 // register grouping for the index vectors
+#define RATIO 8 // 32/4 for both data and indices; selects the mask type RVV_TYPE_B(RATIO)
+#elif PREC_I == 64
+#define LMUL_X m4 // register grouping for each component vector
+#define LMUL_I m8 // 64-bit indices need twice the grouping to keep the same lane count
+#define RATIO 8 // 32/4 == 64/8
+#endif
+#define FLT_SIZE sizeof(float) // bytes per real component; the template uses 2 * FLT_SIZE * incx as stride
+
+#include "./bli_amaxv_sifive_x280_intr_complex.c" // expands the template with the settings above
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC_X
+#undef PREC_I
+#undef LMUL_X
+#undef LMUL_I
+#undef RATIO
+#undef FLT_SIZE
+
+// Double precision complex: instantiates bli_zamaxv_sifive_x280_intr from the shared complex template.
+#define DATATYPE dcomplex // element type of x
+#define BASE_DT double // real type of each component; the template casts x to BASE_DT* for loads
+#define PRECISION_CHAR z // pasted into the kernel name by AMAXV_
+#define PREC_X 64 // bit width of one real component
+#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE // bit width of the index vectors
+#if PREC_I == 32
+#define LMUL_X m8 // 64-bit data needs twice the grouping of the 32-bit indices
+#define LMUL_I m4 // register grouping for the index vectors
+#define RATIO 8 // 64/8 == 32/4; selects the mask type RVV_TYPE_B(RATIO)
+#elif PREC_I == 64
+#define LMUL_X m4 // register grouping for each component vector
+#define LMUL_I m4 // same width as the data, so same grouping
+#define RATIO 16 // 64/4 for both data and indices
+#endif
+#define FLT_SIZE sizeof(double) // bytes per real component; the template uses 2 * FLT_SIZE * incx as stride
+
+#include "./bli_amaxv_sifive_x280_intr_complex.c" // expands the template with the settings above
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC_X
+#undef PREC_I
+#undef LMUL_X
+#undef LMUL_I
+#undef RATIO
+#undef FLT_SIZE
+
+#undef AMAXV_SIFIVE_X280_INT_SIZE
+
+#undef AMAXV
+#undef AMAXV_
diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..f1f3a749e
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c
@@ -0,0 +1,105 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AMAXV
+
+AMAXV(PRECISION_CHAR, void) // complex amaxv: writes to *index the first index maximizing |re| + |im|
+{
+    (void)cntx; // context is not used by this kernel
+    const DATATYPE* restrict x = x_; // typed view of the untyped input pointer
+
+    if (n <= 1) { // fewer than two elements: index 0 is reported without reading x
+        *index = 0;
+        return;
+    }
+
+    RVV_TYPE_F(PREC_X, LMUL_X) xacc; // per-lane running maximum of |re| + |im|
+    // Indices will be unsigned and of the same width as dim_t.
+    RVV_TYPE_U(PREC_I, LMUL_I) iacc; // per-lane index at which xacc was attained
+    RVV_TYPE_U(PREC_I, LMUL_I) vid_vec = VID_V(PREC_I, LMUL_I)(n); // presumably lane numbers 0, 1, 2, ... (vid.v); macro is defined elsewhere
+    bool first = true; // the first strip initializes the accumulators instead of merging
+    guint_t offset = 0; // number of elements consumed by earlier strips
+    size_t avl = n; // elements still to be processed
+    while (avl) {
+        size_t vl = VSETVL(PREC_X, LMUL_X)(avl); // elements handled in this strip
+        RVV_TYPE_FX(PREC_X, LMUL_X, 2) xvec; // two-field tuple: real parts and imaginary parts
+
+        if (incx == 1) // unit stride: segment load de-interleaves re/im
+            xvec = VLSEG2_V_F(PREC_X, LMUL_X, 2)((BASE_DT*) x, vl);
+        else // general stride, given in bytes between consecutive complex elements
+            xvec = VLSSEG2_V_F(PREC_X, LMUL_X, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, vl);
+
+        RVV_TYPE_F(PREC_X, LMUL_X) xvec_real = VGET_V_F(PREC_X, LMUL_X, 2)(xvec, 0); // field 0: real parts
+        RVV_TYPE_F(PREC_X, LMUL_X) xvec_imag = VGET_V_F(PREC_X, LMUL_X, 2)(xvec, 1); // field 1: imaginary parts
+        RVV_TYPE_F(PREC_X, LMUL_X) xvec_real_abs = VFABS_V(PREC_X, LMUL_X)(xvec_real, vl);
+        RVV_TYPE_F(PREC_X, LMUL_X) xvec_imag_abs = VFABS_V(PREC_X, LMUL_X)(xvec_imag, vl);
+        RVV_TYPE_F(PREC_X, LMUL_X) xvec_abs = VFADD_VV(PREC_X, LMUL_X)(xvec_real_abs, xvec_imag_abs, vl); // |re| + |im|, the magnitude being compared
+
+        RVV_TYPE_B(RATIO) is_nan = VMFNE_VV(PREC_X, LMUL_X, RATIO)(xvec_abs, xvec_abs, vl); // a value differs from itself only when it is NaN
+        int nan_index = VFIRST_M(RATIO)(is_nan, vl); // first set lane; -1 is treated below as "none"
+        if (nan_index != -1) { // a NaN ends the search: its global index is the result
+            *index = (guint_t) nan_index + offset;
+            return;
+        }
+
+        if (first) {
+            xacc = xvec_abs; // seed the running maxima with the first strip
+            iacc = vid_vec; // offset is still 0 here, so lane numbers are the indices
+            first = false;
+        }
+        else {
+            RVV_TYPE_B(RATIO) mask = VMFGT_VV(PREC_X, LMUL_X, RATIO)(xvec_abs, xacc, vl); // strictly greater, so ties keep the earlier index
+            xacc = VFMAX_VV_TU(PREC_X, LMUL_X)(xacc, xvec_abs, xacc, vl); // presumably tail-undisturbed: lanes past vl keep their old maxima
+            RVV_TYPE_U(PREC_I, LMUL_I) ivec = VADD_VX_U(PREC_I, LMUL_I)(vid_vec, offset, vl); // global indices of this strip
+            iacc = VMERGE_VVM_TU_U(PREC_I, LMUL_I)(iacc, iacc, ivec, mask, vl); // take the new index only where the maximum improved
+        }
+
+        x += vl * incx; // advance by vl logical elements (pointer is DATATYPE*)
+        offset += vl;
+        avl -= vl;
+    }
+
+    RVV_TYPE_F(PREC_X, m1) xmax = VFMV_S_F(PREC_X, m1)(0., 1); // reduction seed; magnitudes are non-negative
+    xmax = VFREDMAX_VS(PREC_X, LMUL_X)(xacc, xmax, n); // NOTE(review): n is passed as vl; presumably clamped to the lanes in use - confirm
+    RVV_TYPE_F(PREC_X, LMUL_X) xmax_splat = VLMUL_EXT_V_F_M1(PREC_X, LMUL_X)(xmax);
+    xmax_splat = VRGATHER_VX_F(PREC_X, LMUL_X)(xmax_splat, 0, n); // broadcast element 0 (the maximum) to every lane
+    RVV_TYPE_B(RATIO) mask = VMFEQ_VV(PREC_X, LMUL_X, RATIO)(xacc, xmax_splat, n); // lanes whose running maximum equals the global maximum
+    RVV_TYPE_U(PREC_I, m1) imax = VMV_S_X_U(PREC_I, m1)(-1, 1); // seed: -1 converted to unsigned, i.e. the largest index value
+    imax = VREDMINU_VS_M(PREC_I, LMUL_I)(mask, iacc, imax, n); // smallest index among the lanes holding the maximum
+    *index = VMV_X_S_U(PREC_I)(imax);
+    return;
+}
+
+#endif // AMAXV
diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c
new file mode 100644
index 000000000..bcc4ee99d
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c
@@ -0,0 +1,100 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AMAXV
+
+AMAXV(PRECISION_CHAR, void) // real amaxv: writes to *index the first index maximizing |x|
+{
+    (void)cntx; // context is not used by this kernel
+    const DATATYPE* restrict x = x_; // typed view of the untyped input pointer
+
+    if (n <= 1) { // fewer than two elements: index 0 is reported without reading x
+        *index = 0;
+        return;
+    }
+
+    RVV_TYPE_F(PREC_X, LMUL_X) xacc; // per-lane running maximum of |x|
+    // Indices will be unsigned and of the same width as dim_t.
+    RVV_TYPE_U(PREC_I, LMUL_I) iacc; // per-lane index at which xacc was attained
+    RVV_TYPE_U(PREC_I, LMUL_I) vid_vec = VID_V(PREC_I, LMUL_I)(n); // presumably lane numbers 0, 1, 2, ... (vid.v); macro is defined elsewhere
+    bool first = true; // the first strip initializes the accumulators instead of merging
+    guint_t offset = 0; // number of elements consumed by earlier strips
+    size_t avl = n; // elements still to be processed
+    while (avl) {
+        size_t vl = VSETVL(PREC_X, LMUL_X)(avl); // elements handled in this strip
+        RVV_TYPE_F(PREC_X, LMUL_X) xvec;
+
+        if (incx == 1) // unit stride: contiguous load
+            xvec = VLE_V_F(PREC_X, LMUL_X)(x, vl);
+        else // general stride, given in bytes between consecutive elements
+            xvec = VLSE_V_F(PREC_X, LMUL_X)(x, FLT_SIZE * incx, vl);
+
+        RVV_TYPE_B(RATIO) is_nan = VMFNE_VV(PREC_X, LMUL_X, RATIO)(xvec, xvec, vl); // a value differs from itself only when it is NaN
+        int nan_index = VFIRST_M(RATIO)(is_nan, vl); // first set lane; -1 is treated below as "none"
+        if (nan_index != -1) { // a NaN ends the search: its global index is the result
+            *index = (guint_t) nan_index + offset;
+            return;
+        }
+
+        if (first) {
+            xacc = VFABS_V(PREC_X, LMUL_X)(xvec, vl); // seed the running maxima with the first strip
+            iacc = vid_vec; // offset is still 0 here, so lane numbers are the indices
+            first = false;
+        }
+        else {
+            xvec = VFABS_V(PREC_X, LMUL_X)(xvec, vl); // compare magnitudes, not signed values
+            RVV_TYPE_B(RATIO) mask = VMFGT_VV(PREC_X, LMUL_X, RATIO)(xvec, xacc, vl); // strictly greater, so ties keep the earlier index
+            xacc = VFMAX_VV_TU(PREC_X, LMUL_X)(xacc, xvec, xacc, vl); // presumably tail-undisturbed: lanes past vl keep their old maxima
+            RVV_TYPE_U(PREC_I, LMUL_I) ivec = VADD_VX_U(PREC_I, LMUL_I)(vid_vec, offset, vl); // global indices of this strip
+            iacc = VMERGE_VVM_TU_U(PREC_I, LMUL_I)(iacc, iacc, ivec, mask, vl); // take the new index only where the maximum improved
+        }
+
+        x += vl * incx; // advance by vl logical elements (pointer is DATATYPE*)
+        offset += vl;
+        avl -= vl;
+    }
+
+    RVV_TYPE_F(PREC_X, m1) xmax = VFMV_S_F(PREC_X, m1)(0., 1); // reduction seed; magnitudes are non-negative
+    xmax = VFREDMAX_VS(PREC_X, LMUL_X)(xacc, xmax, n); // NOTE(review): n is passed as vl; presumably clamped to the lanes in use - confirm
+    RVV_TYPE_F(PREC_X, LMUL_X) xmax_splat = VLMUL_EXT_V_F_M1(PREC_X, LMUL_X)(xmax);
+    xmax_splat = VRGATHER_VX_F(PREC_X, LMUL_X)(xmax_splat, 0, n); // broadcast element 0 (the maximum) to every lane
+    RVV_TYPE_B(RATIO) mask = VMFEQ_VV(PREC_X, LMUL_X, RATIO)(xacc, xmax_splat, n); // lanes whose running maximum equals the global maximum
+    RVV_TYPE_U(PREC_I, m1) imax = VMV_S_X_U(PREC_I, m1)(-1, 1); // seed: -1 converted to unsigned, i.e. the largest index value
+    imax = VREDMINU_VS_M(PREC_I, LMUL_I)(mask, iacc, imax, n); // smallest index among the lanes holding the maximum
+    *index = VMV_X_S_U(PREC_I)(imax);
+    return;
+}
+
+#endif // AMAXV
diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c
index 3b29f898d..389292f90 100644
--- a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c
+++ b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c
@@ -52,9 +52,7 @@
 
 #define AXPBYV(...)  AXPBYV_(__VA_ARGS__)
 
-#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm
-#define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR)
-#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr
 #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
 #define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr
 #define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR)
diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c
deleted file mode 100644
index 357187775..000000000
--- a/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include <math.h>
-#include <riscv_vector.h>
-#include <stdbool.h>
-#include <stddef.h>
-
-#define FLT_SIZE 4
-#define VLE "vle32.v "
-#define VLSE "vlse32.v "
-#define VSE "vse32.v "
-#define VSSE "vsse32.v "
-
-void bli_scopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
-                     void * restrict y_, inc_t incy, const cntx_t *cntx) {
-    (void)conjx;
-    (void)cntx;
-    const float* restrict x = x_;
-    float* restrict y = y_;
-    if (n <= 0)
-        return;
-
-    incx *= FLT_SIZE;
-    incy *= FLT_SIZE;
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == FLT_SIZE)
-            __asm__(VLE "v0, (%0)" : : "r"(x));
-        else
-            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-
-        if (incy == FLT_SIZE)
-            __asm__(VSE "v0, (%0)" : : "r"(y));
-        else
-            __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
-
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 8
-#define VLE "vle64.v "
-#define VLSE "vlse64.v "
-#define VSE "vse64.v "
-#define VSSE "vsse64.v "
-
-void bli_dcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
-                     void * restrict y_, inc_t incy, const cntx_t *cntx) {
-    (void)conjx;
-    (void)cntx;
-    const double* restrict x = x_;
-    double* restrict y = y_;
-    if (n <= 0)
-        return;
-
-    incx *= FLT_SIZE;
-    incy *= FLT_SIZE;
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == FLT_SIZE)
-            __asm__(VLE "v0, (%0)" : : "r"(x));
-        else
-            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-
-        if (incy == FLT_SIZE)
-            __asm__(VSE "v0, (%0)" : : "r"(y));
-        else
-            __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
-
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 4
-#define VLE "vle64.v "
-#define VLSE "vlse64.v "
-#define VSE "vse64.v "
-#define VSSE "vsse64.v "
-#define VLSEG2 "vlseg2e32.v "
-#define VLSSEG2 "vlsseg2e32.v "
-#define VSSEG2 "vsseg2e32.v "
-#define VSSSEG2 "vssseg2e32.v "
-
-void bli_ccopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
-                     void * restrict y_, inc_t incy, const cntx_t *cntx) {
-    (void)cntx;
-    const scomplex* restrict x = x_;
-    scomplex* restrict y = y_;
-    if (n <= 0)
-        return;
-
-    incx *= 2 * FLT_SIZE;
-    incy *= 2 * FLT_SIZE;
-    if (conjx == BLIS_NO_CONJUGATE) {
-        size_t avl = n;
-        while (avl) {
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
-                             : "=r"(vl)
-                             : "r"(avl), "i"(8 * 2 * FLT_SIZE));
-            if (incx == 2 * FLT_SIZE)
-                __asm__(VLE "v0, (%0)" : : "r"(x));
-            else
-                __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-
-            if (incy == 2 * FLT_SIZE)
-                __asm__(VSE "v0, (%0)" : : "r"(y));
-            else
-                __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
-
-            __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-            __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
-            avl -= vl;
-        }
-    } else {
-        size_t avl = n;
-        while (avl) {
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
-                             : "=r"(vl)
-                             : "r"(avl), "i"(8 * FLT_SIZE));
-            if (incx == 2 * FLT_SIZE)
-                __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
-            else
-                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
-
-            __asm__("vfneg.v v4, v4");
-
-            if (incy == 2 * FLT_SIZE)
-                __asm__(VSSEG2 "v0, (%0)" : : "r"(y));
-            else
-                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
-
-            __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-            __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
-            avl -= vl;
-        }
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-#undef VLSEG2
-#undef VLSSEG2
-#undef VSSEG2
-#undef VSSSEG2
-
-#define FLT_SIZE 8
-#define SH_ADD "sh3add "
-#define VLE "vle64.v "
-#define VLSE "vlse64.v "
-#define VSE "vse64.v "
-#define VSSE "vsse64.v "
-#define VLSEG2 "vlseg2e64.v "
-#define VLSSEG2 "vlsseg2e64.v "
-#define VSSEG2 "vsseg2e64.v "
-#define VSSSEG2 "vssseg2e64.v "
-
-void bli_zcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
-                     void * restrict y_, inc_t incy, const cntx_t *cntx) {
-    (void)cntx;
-    const dcomplex* restrict x = x_;
-    dcomplex* restrict y = y_;
-    if (n <= 0)
-        return;
-
-    incx *= 2 * FLT_SIZE;
-    incy *= 2 * FLT_SIZE;
-    if (conjx == BLIS_NO_CONJUGATE && incx == 2 * FLT_SIZE &&
-        incy == 2 * FLT_SIZE) {
-        size_t avl = 2 * n;
-        while (avl) {
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
-                             : "=r"(vl)
-                             : "r"(avl), "i"(8 * FLT_SIZE));
-            __asm__(VLE "v0, (%0)" : : "r"(x));
-            __asm__(VSE "v0, (%0)" : : "r"(y));
-            __asm__(SH_ADD "%0, %1, %0" : "+r"(x) : "r"(vl));
-            __asm__(SH_ADD "%0, %1, %0" : "+r"(y) : "r"(vl));
-            avl -= vl;
-        }
-    } else {
-        size_t avl = n;
-        while (avl) {
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
-                             : "=r"(vl)
-                             : "r"(avl), "i"(8 * FLT_SIZE));
-            if (incx == 2 * FLT_SIZE)
-                __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
-            else
-                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
-
-            if (conjx == BLIS_CONJUGATE)
-                __asm__("vfneg.v v4, v4");
-
-            if (incy == 2 * FLT_SIZE)
-                __asm__(VSSEG2 "v0, (%0)" : : "r"(y));
-            else
-                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
-
-            __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-            __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
-            avl -= vl;
-        }
-    }
-    return;
-}
diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c
new file mode 100644
index 000000000..e030d85ff
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c
@@ -0,0 +1,116 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#include "../../riscv_overloaded_intrinsics.h"
+#include "blis.h"
+#include <riscv_vector.h>
+#include <stddef.h>
+
+#define COPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##copyv_sifive_x280_intr(\
+          conj_t           conjx,          \
+          dim_t            n,              \
+    const T*      restrict x_, inc_t incx, \
+          T*      restrict y_, inc_t incy, \
+    const cntx_t*          cntx            \
+) // expands to the function signature only; the included template file supplies the body
+
+#define COPYV(...)  COPYV_(__VA_ARGS__) // forwarding layer: arguments are macro-expanded here, before COPYV_ pastes them with ##
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float) // bytes per real scalar; the templates use it to build byte strides
+
+#include "./bli_copyv_sifive_x280_intr_real.c" // defines bli_scopyv_sifive_x280_intr
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double) // bytes per real scalar; the templates use it to build byte strides
+
+#include "./bli_copyv_sifive_x280_intr_real.c" // defines bli_dcopyv_sifive_x280_intr
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float // scalar type the complex pointers are cast to for the segment loads/stores
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4 // NOTE(review): half of the real kernels' m8, presumably because each 2-field segment access uses two register groups - confirm
+#define FLT_SIZE sizeof(float) // bytes per real component; a complex element is 2 * FLT_SIZE bytes
+
+#include "./bli_copyv_sifive_x280_intr_complex.c" // defines bli_ccopyv_sifive_x280_intr
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double // scalar type the complex pointers are cast to for the segment loads/stores
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4 // NOTE(review): half of the real kernels' m8, presumably because each 2-field segment access uses two register groups - confirm
+#define FLT_SIZE sizeof(double) // bytes per real component; a complex element is 2 * FLT_SIZE bytes
+
+#include "./bli_copyv_sifive_x280_intr_complex.c" // defines bli_zcopyv_sifive_x280_intr
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef COPYV
+#undef COPYV_
diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..21e595967
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c
@@ -0,0 +1,75 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef COPYV
+
+COPYV(PRECISION_CHAR, void) // complex copy: y := x, or y := conj(x) when conjx requests conjugation
+{
+    (void)cntx; // the context is not consulted by this kernel
+    const DATATYPE* restrict x = x_;
+    DATATYPE* restrict y = y_;
+
+    if (n <= 0) return; // nothing to copy
+
+    size_t avl = n; // complex elements still to be copied
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl); // number of elements handled in this pass
+        RVV_TYPE_FX(PREC, LMUL, 2) xvec; // two-field tuple; field 1 holds the imaginary parts (see conjugation below)
+
+        if (incx == 1)
+            xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, vl); // unit stride: contiguous segment load
+        else
+            xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, vl); // stride = byte distance between consecutive complex elements
+
+        if (bli_is_conj(conjx)) { // conjugation: flip the sign of the imaginary field only
+            RVV_TYPE_F(PREC, LMUL) xvec_imag;
+            xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); // pull field 1 out of the tuple
+            xvec_imag = VFNEG_VF(PREC, LMUL)(xvec_imag, vl);
+            xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 1, xvec_imag); // write the negated field back; field 0 is untouched
+        }
+
+        if (incy == 1)
+            VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, xvec, vl); // unit stride: contiguous segment store
+        else
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2 * FLT_SIZE * incy, xvec, vl); // stride = byte distance between consecutive complex elements
+
+        x += vl * incx; // DATATYPE pointers, so this advances in whole complex elements (not bytes)
+        y += vl * incy;
+        avl -= vl;
+    }
+    return;
+}
+
+#endif // COPYV
diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c
new file mode 100644
index 000000000..00bb8ed49
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c
@@ -0,0 +1,68 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef COPYV
+
+COPYV(PRECISION_CHAR, void) // real copy: y := x
+{
+    (void)conjx; // conjugating a real value changes nothing, so the flag is ignored
+    (void)cntx; // the context is not consulted by this kernel
+    const DATATYPE* restrict x = x_;
+    DATATYPE* restrict y = y_;
+
+    if (n <= 0) return; // nothing to copy
+
+    size_t avl = n; // elements still to be copied
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl); // number of elements handled in this pass
+        RVV_TYPE_F(PREC, LMUL) xvec;
+
+        if (incx == 1)
+            xvec = VLE_V_F(PREC, LMUL)(x, vl); // unit stride: contiguous load
+        else
+            xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); // stride = byte distance between consecutive elements
+        if (incy == 1)
+            VSE_V_F(PREC, LMUL)(y, xvec, vl); // unit stride: contiguous store
+        else
+            VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, xvec, vl); // stride = byte distance between consecutive elements
+
+        x += vl * incx; // DATATYPE pointers, so this advances in elements (not bytes)
+        y += vl * incy;
+        avl -= vl;
+    }
+    return;
+}
+
+#endif // COPYV
diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c
deleted file mode 100644
index cbca88592..000000000
--- a/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include <math.h>
-#include <riscv_vector.h>
-#include <stdbool.h>
-#include <stddef.h>
-
-#define FLT_SIZE 4
-#define FLT_LOAD "flw "
-#define VLE "vle32.v "
-#define VLSE "vlse32.v "
-#define VSE "vse32.v "
-#define VSSE "vsse32.v "
-
-void bli_sinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
-                           const cntx_t *cntx) {
-    (void)cntx;
-    float* restrict x = x_;
-    if (n <= 0)
-        return;
-
-    float one = 1.f;
-    __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
-    incx *= FLT_SIZE;
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == FLT_SIZE) {
-            __asm__(VLE "v0, (%0)" : : "r"(x));
-            __asm__("vfrdiv.vf v0, v0, f0");
-            __asm__(VSE "v0, (%0)" : : "r"(x));
-        } else {
-            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-            __asm__("vfrdiv.vf v0, v0, f0");
-            __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-        }
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 8
-#define FLT_LOAD "fld "
-#define VLE "vle64.v "
-#define VLSE "vlse64.v "
-#define VSE "vse64.v "
-#define VSSE "vsse64.v "
-
-void bli_dinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
-                           const cntx_t *cntx) {
-    (void)cntx;
-    double* restrict x = x_;
-    if (n <= 0)
-        return;
-
-    double one = 1.;
-    __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
-    incx *= FLT_SIZE;
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == FLT_SIZE) {
-            __asm__(VLE "v0, (%0)" : : "r"(x));
-            __asm__("vfrdiv.vf v0, v0, f0");
-            __asm__(VSE "v0, (%0)" : : "r"(x));
-        } else {
-            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-            __asm__("vfrdiv.vf v0, v0, f0");
-            __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-        }
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 4
-#define VLSEG2 "vlseg2e32.v "
-#define VLSSEG2 "vlsseg2e32.v "
-#define VSSEG2 "vsseg2e32.v "
-#define VSSSEG2 "vssseg2e32.v "
-
-void bli_cinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
-                           const cntx_t *cntx) {
-    (void)cntx;
-    scomplex* restrict x = x_;
-    if (n <= 0)
-        return;
-
-    incx *= 2 * FLT_SIZE;
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == 2 * FLT_SIZE) {
-            __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
-            __asm__("vfneg.v v4, v4");
-            __asm__("vfmul.vv v8, v0, v0");
-            __asm__("vfmacc.vv v8, v4, v4");
-            __asm__("vfdiv.vv v0, v0, v8");
-            __asm__("vfdiv.vv v4, v4, v8");
-            __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
-        } else {
-            __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
-            __asm__("vfneg.v v4, v4");
-            __asm__("vfmul.vv v8, v0, v0");
-            __asm__("vfmacc.vv v8, v4, v4");
-            __asm__("vfdiv.vv v0, v0, v8");
-            __asm__("vfdiv.vv v4, v4, v8");
-            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
-        }
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef VLSEG2
-#undef VLSSEG2
-#undef VSSEG2
-#undef VSSSEG2
-
-#define FLT_SIZE 8
-#define VLSEG2 "vlseg2e64.v "
-#define VLSSEG2 "vlsseg2e64.v "
-#define VSSEG2 "vsseg2e64.v "
-#define VSSSEG2 "vssseg2e64.v "
-
-void bli_zinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
-                           const cntx_t *cntx) {
-    (void)cntx;
-    dcomplex* restrict x = x_;
-    if (n <= 0)
-        return;
-
-    incx *= 2 * FLT_SIZE;
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == 2 * FLT_SIZE) {
-            __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
-            __asm__("vfneg.v v4, v4");
-            __asm__("vfmul.vv v8, v0, v0");
-            __asm__("vfmacc.vv v8, v4, v4");
-            __asm__("vfdiv.vv v0, v0, v8");
-            __asm__("vfdiv.vv v4, v4, v8");
-            __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
-        } else {
-            __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
-            __asm__("vfneg.v v4, v4");
-            __asm__("vfmul.vv v8, v0, v0");
-            __asm__("vfmacc.vv v8, v4, v4");
-            __asm__("vfdiv.vv v0, v0, v8");
-            __asm__("vfdiv.vv v4, v4, v8");
-            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
-        }
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        avl -= vl;
-    }
-    return;
-}
diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c
new file mode 100644
index 000000000..fc8f8a76d
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c
@@ -0,0 +1,118 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#include "../../riscv_overloaded_intrinsics.h"
+#include "blis.h"
+#include <riscv_vector.h>
+#include <stddef.h>
+
+#define INVERTV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##invertv_sifive_x280_intr(\
+          dim_t            n,              \
+          T*      restrict x_, inc_t incx, \
+    const cntx_t*          cntx            \
+) // expands to the function signature only; the included template file supplies the body
+
+#define INVERTV(...)  INVERTV_(__VA_ARGS__) // forwarding layer: arguments are macro-expanded here, before INVERTV_ pastes them with ##
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float) // bytes per real scalar; the templates use it to build byte strides
+
+#include "./bli_invertv_sifive_x280_intr_real.c" // defines bli_sinvertv_sifive_x280_intr
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double) // bytes per real scalar; the templates use it to build byte strides
+
+#include "./bli_invertv_sifive_x280_intr_real.c" // defines bli_dinvertv_sifive_x280_intr
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float // scalar type the complex pointer is cast to for the segment loads/stores
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define RATIO 8 // PREC / LMUL = 32 / 4; the complex template passes it to RVV_TYPE_B and VMFGE_VV for the mask
+#define FLT_SIZE sizeof(float) // bytes per real component; a complex element is 2 * FLT_SIZE bytes
+
+#include "./bli_invertv_sifive_x280_intr_complex.c" // defines bli_cinvertv_sifive_x280_intr
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef RATIO
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double // scalar type the complex pointer is cast to for the segment loads/stores
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define RATIO 16 // PREC / LMUL = 64 / 4; the complex template passes it to RVV_TYPE_B and VMFGE_VV for the mask
+#define FLT_SIZE sizeof(double) // bytes per real component; a complex element is 2 * FLT_SIZE bytes
+
+#include "./bli_invertv_sifive_x280_intr_complex.c" // defines bli_zinvertv_sifive_x280_intr
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef RATIO
+#undef FLT_SIZE
+
+#undef INVERTV
+#undef INVERTV_
diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..994ae3075
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c
@@ -0,0 +1,83 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef INVERTV
+
+INVERTV(PRECISION_CHAR, void) // in-place complex reciprocal: x[i] := 1 / x[i]
+{
+    (void)cntx; // the context is not consulted by this kernel
+    DATATYPE* restrict x = x_;
+
+    if (n <= 0) return; // nothing to invert
+
+    size_t avl = n; // complex elements still to be processed
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl); // number of elements handled in this pass
+        RVV_TYPE_FX(PREC, LMUL, 2) xvec; // two-field tuple: field 0 = real parts, field 1 = imaginary parts
+
+        if (incx == 1)
+            xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, vl); // unit stride: contiguous segment load
+        else
+            xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, vl); // stride = byte distance between consecutive complex elements
+
+        RVV_TYPE_F(PREC, LMUL) xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0); // re
+        RVV_TYPE_F(PREC, LMUL) xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); // im
+        RVV_TYPE_F(PREC, LMUL) xvec_real_abs = VFABS_V(PREC, LMUL)(xvec_real, vl); // |re|
+        RVV_TYPE_F(PREC, LMUL) xvec_imag_abs = VFABS_V(PREC, LMUL)(xvec_imag, vl); // |im|
+        RVV_TYPE_B(RATIO) mask = VMFGE_VV(PREC, LMUL, RATIO)(xvec_real_abs, xvec_imag_abs, vl); // lane set where |re| >= |im|
+        RVV_TYPE_F(PREC, LMUL) max = VMERGE_VVM_F(PREC, LMUL)(xvec_imag, xvec_real, mask, vl); // larger-magnitude component, sign kept (re where mask is set)
+        RVV_TYPE_F(PREC, LMUL) min = VMERGE_VVM_F(PREC, LMUL)(xvec_real, xvec_imag, mask, vl); // the other component (im where mask is set)
+        RVV_TYPE_F(PREC, LMUL) f = VFDIV_VV(PREC, LMUL)(min, max, vl); // f = min / max, so |f| <= 1; presumably chosen to avoid overflow in re^2 + im^2 - confirm
+        RVV_TYPE_F(PREC, LMUL) denom = VFMACC_VV(PREC, LMUL)(max, f, min, vl); // denom = max + f * min = (re^2 + im^2) / max
+        RVV_TYPE_F(PREC, LMUL) t1 = VFRDIV_VF(PREC, LMUL)(denom, 1., vl); // t1 = 1 / denom = max / (re^2 + im^2)
+        RVV_TYPE_F(PREC, LMUL) t2 = VFDIV_VV(PREC, LMUL)(f, denom, vl); // t2 = f / denom = min / (re^2 + im^2)
+        xvec_real = VMERGE_VVM_F(PREC, LMUL)(t2, t1, mask, vl); // re / (re^2 + im^2): t1 where re was max, else t2
+        xvec_imag = VMERGE_VVM_F(PREC, LMUL)(t1, t2, mask, vl); // im / (re^2 + im^2): t2 where re was max, else t1
+        xvec_imag = VFNEG_VF(PREC, LMUL)(xvec_imag, vl); // imaginary part of 1 / x is -im / (re^2 + im^2)
+        xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 0, xvec_real); // repack both fields into the tuple
+        xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 1, xvec_imag);
+
+        if (incx == 1)
+            VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, xvec, vl); // store back over the input (in-place)
+        else
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, xvec, vl); // strided store back over the input
+
+        x += vl * incx; // DATATYPE pointer, so this advances in whole complex elements (not bytes)
+        avl -= vl;
+    }
+    return;
+}
+
+#endif // INVERTV
diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c
new file mode 100644
index 000000000..621e88c9f
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c
@@ -0,0 +1,68 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef INVERTV
+
+INVERTV(PRECISION_CHAR, void) // in-place real reciprocal: x[i] := 1 / x[i]
+{
+    (void)cntx; // the context is not consulted by this kernel
+    DATATYPE* restrict x = x_;
+
+    if (n <= 0) return; // nothing to invert
+
+    size_t avl = n; // elements still to be processed
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl); // number of elements handled in this pass
+        RVV_TYPE_F(PREC, LMUL) xvec;
+
+        if (incx == 1)
+            xvec = VLE_V_F(PREC, LMUL)(x, vl); // unit stride: contiguous load
+        else
+            xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); // stride = byte distance between consecutive elements
+
+        xvec = VFRDIV_VF(PREC, LMUL)(xvec, 1., vl); // reverse divide: each lane becomes 1 / x[i]
+
+        if (incx == 1)
+            VSE_V_F(PREC, LMUL)(x, xvec, vl); // store back over the input (in-place)
+        else
+            VSSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, xvec, vl); // strided store back over the input
+
+        x += vl * incx; // DATATYPE pointer, so this advances in elements (not bytes)
+        avl -= vl;
+    }
+    return;
+}
+
+#endif // INVERTV
diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c
deleted file mode 100644
index 51edc9221..000000000
--- a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include <math.h>
-#include <riscv_vector.h>
-#include <stdbool.h>
-#include <stddef.h>
-
-#define FLT_SIZE 4
-#define FLT_LOAD "flw "
-#define FDIV "fdiv.s "
-#define VLE "vle32.v "
-#define VLSE "vlse32.v "
-#define VSE "vse32.v "
-#define VSSE "vsse32.v "
-
-void bli_sinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
-                            void * restrict x_, inc_t incx,
-                            const cntx_t *cntx) {
-    (void)conjalpha;
-    (void)cntx;
-    const float* restrict alpha = alpha_;
-    float* restrict x = x_;
-    if (n <= 0 || *alpha == 0.f || *alpha == 1.f)
-        return;
-
-    float one = 1.f;
-    __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
-    __asm__(FLT_LOAD "f1, (%0)" : : "r"(alpha));
-    __asm__(FDIV "f0, f0, f1");
-    incx *= FLT_SIZE;
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == FLT_SIZE) {
-            __asm__(VLE "v0, (%0)" : : "r"(x));
-            __asm__("vfmul.vf v0, v0, f0");
-            __asm__(VSE "v0, (%0)" : : "r"(x));
-        } else {
-            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-            __asm__("vfmul.vf v0, v0, f0");
-            __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-        }
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef FDIV
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 8
-#define FLT_LOAD "fld "
-#define FDIV "fdiv.d "
-#define VLE "vle64.v "
-#define VLSE "vlse64.v "
-#define VSE "vse64.v "
-#define VSSE "vsse64.v "
-
-void bli_dinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
-                            void * restrict x_, inc_t incx,
-                            const cntx_t *cntx) {
-    (void)conjalpha;
-    (void)cntx;
-    const double* restrict alpha = alpha_;
-    double* restrict x = x_;
-    if (n <= 0 || *alpha == 0. || *alpha == 1.)
-        return;
-
-    double one = 1.;
-    __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
-    __asm__(FLT_LOAD "f1, (%0)" : : "r"(alpha));
-    __asm__(FDIV "f0, f0, f1");
-    incx *= FLT_SIZE;
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == FLT_SIZE) {
-            __asm__(VLE "v0, (%0)" : : "r"(x));
-            __asm__("vfmul.vf v0, v0, f0");
-            __asm__(VSE "v0, (%0)" : : "r"(x));
-        } else {
-            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-            __asm__("vfmul.vf v0, v0, f0");
-            __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-        }
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef FDIV
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 4
-#define FLT_LOAD "flw "
-#define FMUL "fmul.s "
-#define FMADD "fmadd.s "
-#define FDIV "fdiv.s "
-#define FNEG "fneg.s "
-#define VLSEG2 "vlseg2e32.v "
-#define VLSSEG2 "vlsseg2e32.v "
-#define VSSEG2 "vsseg2e32.v "
-#define VSSSEG2 "vssseg2e32.v "
-
-void bli_cinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
-                            void * restrict x_, inc_t incx,
-                            const cntx_t *cntx) {
-    (void)cntx;
-    const scomplex* restrict alpha = alpha_;
-    scomplex* restrict x = x_;
-    if (n <= 0 || (alpha->real == 0.f && alpha->imag == 0.f) || (alpha->real == 1.f && alpha->imag == 0.f))
-        return;
-
-    __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha));
-    __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
-    __asm__(FMUL "f2, f0, f0");
-    __asm__(FMADD "f2, f1, f1, f2");
-    __asm__(FDIV "f0, f0, f2");
-    __asm__(FDIV "f1, f1, f2");
-    if (conjalpha == BLIS_NO_CONJUGATE)
-        __asm__(FNEG "f1, f1");
-    incx *= 2 * FLT_SIZE;
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == 2 * FLT_SIZE) {
-            __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
-            __asm__("vfmul.vf v8, v0, f0");
-            __asm__("vfmul.vf v12, v4, f0");
-            __asm__("vfnmsac.vf v8, f1, v4");
-            __asm__("vfmacc.vf v12, f1, v0");
-            __asm__(VSSEG2 "v8, (%0)" : : "r"(x));
-        } else {
-            __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
-            __asm__("vfmul.vf v8, v0, f0");
-            __asm__("vfmul.vf v12, v4, f0");
-            __asm__("vfnmsac.vf v8, f1, v4");
-            __asm__("vfmacc.vf v12, f1, v0");
-            __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
-        }
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef FMUL
-#undef FMADD
-#undef FDIV
-#undef FNEG
-#undef VLSEG2
-#undef VLSSEG2
-#undef VSSEG2
-#undef VSSSEG2
-
-#define FLT_SIZE 8
-#define FLT_LOAD "fld "
-#define FMUL "fmul.d "
-#define FMADD "fmadd.d "
-#define FDIV "fdiv.d "
-#define FNEG "fneg.d "
-#define VLSEG2 "vlseg2e64.v "
-#define VLSSEG2 "vlsseg2e64.v "
-#define VSSEG2 "vsseg2e64.v "
-#define VSSSEG2 "vssseg2e64.v "
-
-void bli_zinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
-                            void * restrict x_, inc_t incx,
-                            const cntx_t *cntx) {
-    (void)cntx;
-    const dcomplex* restrict alpha = alpha_;
-    dcomplex* restrict x = x_;
-    if (n <= 0 || (alpha->real == 0. && alpha->imag == 0.) || (alpha->real == 1. && alpha->imag == 0.))
-        return;
-
-    __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha));
-    __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
-    __asm__(FMUL "f2, f0, f0");
-    __asm__(FMADD "f2, f1, f1, f2");
-    __asm__(FDIV "f0, f0, f2");
-    __asm__(FDIV "f1, f1, f2");
-    if (conjalpha == BLIS_NO_CONJUGATE)
-        __asm__(FNEG "f1, f1");
-    incx *= 2 * FLT_SIZE;
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == 2 * FLT_SIZE) {
-            __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
-            __asm__("vfmul.vf v8, v0, f0");
-            __asm__("vfmul.vf v12, v4, f0");
-            __asm__("vfnmsac.vf v8, f1, v4");
-            __asm__("vfmacc.vf v12, f1, v0");
-            __asm__(VSSEG2 "v8, (%0)" : : "r"(x));
-        } else {
-            __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
-            __asm__("vfmul.vf v8, v0, f0");
-            __asm__("vfmul.vf v12, v4, f0");
-            __asm__("vfnmsac.vf v8, f1, v4");
-            __asm__("vfmacc.vf v12, f1, v0");
-            __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
-        }
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        avl -= vl;
-    }
-    return;
-}
diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c
new file mode 100644
index 000000000..a5c7561bd
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c
@@ -0,0 +1,117 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#include "../../riscv_cmul_macros_intr.h"
+#include "../../riscv_overloaded_intrinsics.h"
+#include "blis.h"
+#include <riscv_vector.h>
+#include <stddef.h>
+
+#define INVSCALV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##invscalv_sifive_x280_intr(\
+          conj_t           conjalpha,      \
+          dim_t            n,              \
+    const T*      restrict alpha_,         \
+          T*      restrict x_, inc_t incx, \
+    const cntx_t*          cntx            \
+)
+// INVSCALV_ emits the kernel prototype; INVSCALV expands its arguments before the ## paste.
+#define INVSCALV(...)  INVSCALV_(__VA_ARGS__)
+
+// Single precision real: instantiates bli_sinvscalv_sifive_x280_intr from the real template
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_invscalv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real: instantiates bli_dinvscalv_sifive_x280_intr from the real template
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_invscalv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex: instantiates bli_cinvscalv_sifive_x280_intr from the complex template
+#define DATATYPE scomplex
+#define BASE_DT float // real component type; the template casts x to BASE_DT* for segment accesses
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4 // two-field segment accesses: RVV requires NFIELDS * LMUL <= 8
+#define FLT_SIZE sizeof(float) // size of one component; the template uses 2 * FLT_SIZE per element
+
+#include "./bli_invscalv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex: instantiates bli_zinvscalv_sifive_x280_intr from the complex template
+#define DATATYPE dcomplex
+#define BASE_DT double // real component type; the template casts x to BASE_DT* for segment accesses
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4 // two-field segment accesses: RVV requires NFIELDS * LMUL <= 8
+#define FLT_SIZE sizeof(double) // size of one component; the template uses 2 * FLT_SIZE per element
+
+#include "./bli_invscalv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef INVSCALV
+#undef INVSCALV_
diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..077e9dd06
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c
@@ -0,0 +1,83 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef INVSCALV
+// Template body for complex invscalv: scales x in place by 1 / conjalpha(alpha).
+INVSCALV(PRECISION_CHAR, void)
+{
+    (void)cntx;
+    const DATATYPE* restrict alpha = alpha_;
+    DATATYPE* restrict x = x_;
+    // Early exits leave x untouched: empty vector, alpha equal to one, alpha equal to zero.
+    if (n <= 0) return;
+    if (PASTEMAC(PRECISION_CHAR, eq1)(*alpha)) return;
+    if (PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) return;
+    // Copy alpha (conjugated when conjalpha asks for it) and invert the copy once up front.
+    DATATYPE alpha_conj_inv;
+    PASTEMAC(PRECISION_CHAR, copycjs)(conjalpha, *alpha, alpha_conj_inv);
+    PASTEMAC(PRECISION_CHAR, inverts)(alpha_conj_inv);
+
+    size_t avl = n; // elements still to process
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl);
+        RVV_TYPE_FX(PREC, LMUL, 2) xvec;
+        // Two-field segment load; strides are formed in ptrdiff_t so a negative incx stays signed.
+        if (incx == 1)
+            xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, vl);
+        else
+            xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, (ptrdiff_t)(2 * FLT_SIZE) * incx, vl);
+        // Field 0 holds the real parts, field 1 the imaginary parts.
+        RVV_TYPE_F(PREC, LMUL) xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+        RVV_TYPE_F(PREC, LMUL) xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+        RVV_TYPE_F(PREC, LMUL) yvec_real, yvec_imag;
+        // y = x * alpha_conj_inv (complex vector-by-scalar product; VCMUL_VF is defined elsewhere).
+        VCMUL_VF(PREC, LMUL, yvec_real, yvec_imag, xvec_real, xvec_imag, alpha_conj_inv.real, alpha_conj_inv.imag, vl);
+        // Reassemble the two result fields into one tuple for the segment store.
+        RVV_TYPE_FX(PREC, LMUL, 2) yvec = VUNDEFINED_FX(PREC, LMUL, 2)();
+        yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real);
+        yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag);
+
+        if (incx == 1)
+            VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, yvec, vl);
+        else
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, (ptrdiff_t)(2 * FLT_SIZE) * incx, yvec, vl);
+        // Step past the vl elements just handled; the cast keeps the offset signed.
+        x += (ptrdiff_t)vl * incx;
+        avl -= vl;
+    }
+    return;
+}
+
+#endif // INVSCALV
diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c
new file mode 100644
index 000000000..a38b97c33
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c
@@ -0,0 +1,75 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef INVSCALV
+// Template body for real invscalv: scales x in place by 1 / alpha.
+INVSCALV(PRECISION_CHAR, void)
+{
+    (void)conjalpha; // unused: a real scalar has no conjugate
+    (void)cntx;
+    const DATATYPE* restrict alpha = alpha_;
+    DATATYPE* restrict x = x_;
+    // Early exits leave x untouched: empty vector, alpha equal to one, alpha equal to zero.
+    if (n <= 0) return;
+    if (PASTEMAC(PRECISION_CHAR, eq1)(*alpha)) return;
+    if (PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) return;
+    // Invert alpha once up front so the loop multiplies instead of dividing.
+    DATATYPE alpha_inv = *alpha;
+    PASTEMAC(PRECISION_CHAR, inverts)(alpha_inv);
+
+    size_t avl = n; // elements still to process
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl);
+        RVV_TYPE_F(PREC, LMUL) xvec;
+        // Strides and steps are formed in ptrdiff_t so a negative incx stays signed.
+        if (incx == 1)
+            xvec = VLE_V_F(PREC, LMUL)(x, vl);
+        else
+            xvec = VLSE_V_F(PREC, LMUL)(x, (ptrdiff_t)(FLT_SIZE) * incx, vl);
+
+        xvec = VFMUL_VF(PREC, LMUL)(xvec, alpha_inv, vl);
+        // Store back over the input with the same addressing mode as the load.
+        if (incx == 1)
+            VSE_V_F(PREC, LMUL)(x, xvec, vl);
+        else
+            VSSE_V_F(PREC, LMUL)(x, (ptrdiff_t)(FLT_SIZE) * incx, xvec, vl);
+        // Step past the vl elements just handled; the cast keeps the offset signed.
+        x += (ptrdiff_t)vl * incx;
+        avl -= vl;
+    }
+    return;
+}
+
+#endif // INVSCALV
diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c
index cd2dd2c18..4cae8257c 100644
--- a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c
+++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c
@@ -51,9 +51,9 @@
 
 #define SCAL2V(...)  SCAL2V_(__VA_ARGS__)
 
-#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm
+#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_intr
 #define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR)
-#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr
 #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
 
 // Single precision real
diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c
index 4a25ce3e3..2e946a2a4 100644
--- a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c
+++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c
@@ -77,13 +77,9 @@ SCAL2V(PRECISION_CHAR, void)
             yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl);
         }
 
-        // FIXME: remove the #pragmas and change the __riscv_vset_v_f intrinsics to use 
-        // __riscv_vcreate_v_f once they become available in LLVM.
-        #pragma GCC diagnostic push
-        #pragma GCC diagnostic ignored "-Wuninitialized"
+        yvec = VUNDEFINED_FX(PREC, LMUL, 2)();
         yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real);
         yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag);
-        #pragma GCC diagnostic pop
 
         if (incy == 1)
             VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl);
diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c
index b5788d632..d1fb9940e 100644
--- a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c
+++ b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c
@@ -49,7 +49,7 @@
 
 #define SCALV(...)  SCALV_(__VA_ARGS__)
 
-#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr
 #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
 
 // Single precision real
diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c
deleted file mode 100644
index ef9091f16..000000000
--- a/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include <math.h>
-#include <riscv_vector.h>
-#include <stdbool.h>
-#include <stddef.h>
-
-#define FLT_SIZE 4
-#define VLSE "vlse32.v "
-#define VSE "vse32.v "
-#define VSSE "vsse32.v "
-
-void bli_ssetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
-                    void * restrict x_, inc_t incx, const cntx_t *cntx) {
-    (void)conjalpha;
-    (void)cntx;
-    const float* restrict alpha = alpha_;
-    float* restrict x = x_;
-    if (n <= 0)
-        return;
-
-    __asm__ volatile("vsetvli zero, %0, e%1, m8, ta, ma"
-                     :
-                     : "r"(n), "i"(8 * FLT_SIZE));
-    __asm__(VLSE "v0, (%0), zero" : : "r"(alpha));
-    incx *= FLT_SIZE;
-
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == FLT_SIZE)
-            __asm__(VSE "v0, (%0)" : : "r"(x));
-        else
-            __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 8
-#define VLSE "vlse64.v "
-#define VSE "vse64.v "
-#define VSSE "vsse64.v "
-
-void bli_dsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
-                    void * restrict x_, inc_t incx, const cntx_t *cntx) {
-    (void)conjalpha;
-    (void)cntx;
-    const double* restrict alpha = alpha_;
-    double* restrict x = x_;
-    if (n <= 0)
-        return;
-
-    __asm__ volatile("vsetvli zero, %0, e%1, m8, ta, ma"
-                     :
-                     : "r"(n), "i"(8 * FLT_SIZE));
-    __asm__(VLSE "v0, (%0), zero" : : "r"(alpha));
-    incx *= FLT_SIZE;
-
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == FLT_SIZE)
-            __asm__(VSE "v0, (%0)" : : "r"(x));
-        else
-            __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 4
-#define VLSE "vlse32.v "
-#define VSSEG2 "vsseg2e32.v "
-#define VSSSEG2 "vssseg2e32.v "
-
-void bli_csetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
-                    void * restrict x_, inc_t incx, const cntx_t *cntx) {
-    (void)cntx;
-    const scomplex* restrict alpha = alpha_;
-    scomplex* restrict x = x_;
-    if (n <= 0)
-        return;
-
-    __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma"
-                     :
-                     : "r"(n), "i"(8 * FLT_SIZE));
-    __asm__(VLSE "v0, (%0), zero" : : "r"(alpha));
-    __asm__("addi t0, %0, %1" : : "r"(alpha), "I"(FLT_SIZE));
-    __asm__(VLSE "v4, (t0), zero");
-    if (conjalpha == BLIS_CONJUGATE)
-        __asm__("vfneg.v v4, v4");
-    incx *= 2 * FLT_SIZE;
-
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == 2 * FLT_SIZE)
-            __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
-        else
-            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef VLSE
-#undef VSSEG2
-#undef VSSSEG2
-
-#define FLT_SIZE 8
-#define VLSE "vlse64.v "
-#define VSSEG2 "vsseg2e64.v "
-#define VSSSEG2 "vssseg2e64.v "
-
-void bli_zsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
-                    void * restrict x_, inc_t incx, const cntx_t *cntx) {
-    (void)cntx;
-    const dcomplex* restrict alpha = alpha_;
-    dcomplex* restrict x = x_;
-    if (n <= 0)
-        return;
-
-    __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma"
-                     :
-                     : "r"(n), "i"(8 * FLT_SIZE));
-    __asm__(VLSE "v0, (%0), zero" : : "r"(alpha));
-    __asm__("addi t0, %0, %1" : : "r"(alpha), "I"(FLT_SIZE));
-    __asm__(VLSE "v4, (t0), zero");
-    if (conjalpha == BLIS_CONJUGATE)
-        __asm__("vfneg.v v4, v4");
-    incx *= 2 * FLT_SIZE;
-
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == 2 * FLT_SIZE)
-            __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
-        else
-            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        avl -= vl;
-    }
-    return;
-}
diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c
new file mode 100644
index 000000000..8c2ba7c72
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c
@@ -0,0 +1,116 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#include "../../riscv_overloaded_intrinsics.h"
+#include "blis.h"
+#include <riscv_vector.h>
+#include <stddef.h>
+
+#define SETV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##setv_sifive_x280_intr(\
+          conj_t           conjalpha,      \
+          dim_t            n,              \
+    const T*      restrict alpha_,         \
+          T*      restrict x_, inc_t incx, \
+    const cntx_t*          cntx            \
+)
+// SETV_ spells out the setv kernel prototype; SETV forwards to it so macro arguments are expanded before ## pasting.
+#define SETV(...)  SETV_(__VA_ARGS__)
+
+// Single precision real: defines bli_ssetv_sifive_x280_intr from the real template.
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_setv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real: defines bli_dsetv_sifive_x280_intr from the real template.
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_setv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex: defines bli_csetv_sifive_x280_intr from the complex template.
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_setv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex: defines bli_zsetv_sifive_x280_intr from the complex template.
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_setv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef SETV
+#undef SETV_
diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..efee3a7f6
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c
@@ -0,0 +1,71 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SETV
+
+SETV(PRECISION_CHAR, void)
+{
+    // Fill the n elements of complex vector x (element stride incx) with
+    // alpha, after copycjs has applied conjalpha to it. cntx is unused.
+    (void)cntx;
+    if (n <= 0) return;
+
+    const DATATYPE* restrict scalar = alpha_;
+    DATATYPE* restrict dst = x_;
+
+    DATATYPE value;
+    PASTEMAC(PRECISION_CHAR, copycjs)(conjalpha, *scalar, value);
+
+    // Broadcast the real and imaginary parts, then pair them into the
+    // two-field tuple consumed by the segment stores.
+    RVV_TYPE_F(PREC, LMUL) re_splat = VFMV_V_F(PREC, LMUL)(value.real, n);
+    RVV_TYPE_F(PREC, LMUL) im_splat = VFMV_V_F(PREC, LMUL)(value.imag, n);
+    RVV_TYPE_FX(PREC, LMUL, 2) fill = VUNDEFINED_FX(PREC, LMUL, 2)();
+    fill = VSET_V_F(PREC, LMUL, 2)(fill, 0, re_splat);
+    fill = VSET_V_F(PREC, LMUL, 2)(fill, 1, im_splat);
+
+    // Strip-mine: each pass stores however many elements VSETVL grants.
+    for (size_t remaining = n, vl; remaining > 0; remaining -= vl) {
+        vl = VSETVL(PREC, LMUL)(remaining);
+        if (incx != 1)
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) dst, 2 * FLT_SIZE * incx, fill, vl);
+        else
+            VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) dst, fill, vl);
+
+        dst += vl * incx;
+    }
+}
+
+#endif // SETV
diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c
new file mode 100644
index 000000000..4b73de5c4
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c
@@ -0,0 +1,64 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SETV
+
+SETV(PRECISION_CHAR, void)
+{
+    // Fill the n elements of real vector x (element stride incx) with alpha.
+    // conjalpha and cntx are unused.
+    (void)conjalpha;
+    (void)cntx;
+    if (n <= 0) return;
+
+    const DATATYPE* restrict scalar = alpha_;
+    DATATYPE* restrict dst = x_;
+
+    // Broadcast alpha once; the same vector value feeds every store below.
+    RVV_TYPE_F(PREC, LMUL) fill = VFMV_V_F(PREC, LMUL)(*scalar, n);
+
+    // Strip-mine: each pass stores however many elements VSETVL grants.
+    for (size_t remaining = n, vl; remaining > 0; remaining -= vl) {
+        vl = VSETVL(PREC, LMUL)(remaining);
+        if (incx != 1)
+            VSSE_V_F(PREC, LMUL)(dst, FLT_SIZE * incx, fill, vl);
+        else
+            VSE_V_F(PREC, LMUL)(dst, fill, vl);
+
+        dst += vl * incx;
+    }
+}
+
+#endif // SETV
diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c
deleted file mode 100644
index 2342e254a..000000000
--- a/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include <math.h>
-#include <riscv_vector.h>
-#include <stdbool.h>
-#include <stddef.h>
-
-#define FLT_SIZE 4
-#define VLE "vle32.v "
-#define VLSE "vlse32.v "
-#define VSE "vse32.v "
-#define VSSE "vsse32.v "
-
-void bli_sswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, void * restrict y_,
-                     inc_t incy, const cntx_t *cntx) {
-    (void)cntx;
-    float* restrict x = x_;
-    float* restrict y = y_;
-    if (n <= 0)
-        return;
-
-    incx *= FLT_SIZE;
-    incy *= FLT_SIZE;
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == FLT_SIZE)
-            __asm__(VLE "v0, (%0)" : : "r"(x));
-        else
-            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-        if (incy == FLT_SIZE)
-            __asm__(VLE "v8, (%0)" : : "r"(y));
-        else
-            __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy));
-
-        if (incx == FLT_SIZE)
-            __asm__(VSE "v8, (%0)" : : "r"(x));
-        else
-            __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx));
-        if (incy == FLT_SIZE)
-            __asm__(VSE "v0, (%0)" : : "r"(y));
-        else
-            __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
-
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 8
-#define VLE "vle64.v "
-#define VLSE "vlse64.v "
-#define VSE "vse64.v "
-#define VSSE "vsse64.v "
-
-void bli_dswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
-                     void * restrict y_, inc_t incy, const cntx_t *cntx) {
-    (void)cntx;
-    double* restrict x = x_;
-    double* restrict y = y_;
-    if (n <= 0)
-        return;
-
-    incx *= FLT_SIZE;
-    incy *= FLT_SIZE;
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == FLT_SIZE)
-            __asm__(VLE "v0, (%0)" : : "r"(x));
-        else
-            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-        if (incy == FLT_SIZE)
-            __asm__(VLE "v8, (%0)" : : "r"(y));
-        else
-            __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy));
-
-        if (incx == FLT_SIZE)
-            __asm__(VSE "v8, (%0)" : : "r"(x));
-        else
-            __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx));
-        if (incy == FLT_SIZE)
-            __asm__(VSE "v0, (%0)" : : "r"(y));
-        else
-            __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
-
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 4
-#define VLE "vle64.v "
-#define VLSE "vlse64.v "
-#define VSE "vse64.v "
-#define VSSE "vsse64.v "
-
-void bli_cswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
-                     void * restrict y_, inc_t incy, const cntx_t *cntx) {
-    (void)cntx;
-    scomplex* restrict x = x_;
-    scomplex* restrict y = y_;
-    if (n <= 0)
-        return;
-
-    incx *= 2 * FLT_SIZE;
-    incy *= 2 * FLT_SIZE;
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * 2 * FLT_SIZE));
-        if (incx == 2 * FLT_SIZE)
-            __asm__(VLE "v0, (%0)" : : "r"(x));
-        else
-            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-        if (incy == 2 * FLT_SIZE)
-            __asm__(VLE "v8, (%0)" : : "r"(y));
-        else
-            __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy));
-
-        if (incx == 2 * FLT_SIZE)
-            __asm__(VSE "v8, (%0)" : : "r"(x));
-        else
-            __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx));
-        if (incy == 2 * FLT_SIZE)
-            __asm__(VSE "v0, (%0)" : : "r"(y));
-        else
-            __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
-
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 8
-#define VLSEG2 "vlseg2e64.v "
-#define VLSSEG2 "vlsseg2e64.v "
-#define VSSEG2 "vsseg2e64.v "
-#define VSSSEG2 "vssseg2e64.v "
-
-void bli_zswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
-                     void * restrict y_, inc_t incy, const cntx_t *cntx) {
-    (void)cntx;
-    dcomplex* restrict x = x_;
-    dcomplex* restrict y = y_;
-    if (n <= 0)
-        return;
-
-    incx *= 2 * FLT_SIZE;
-    incy *= 2 * FLT_SIZE;
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        if (incx == 2 * FLT_SIZE)
-            __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
-        else
-            __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
-        if (incy == 2 * FLT_SIZE)
-            __asm__(VLSEG2 "v8, (%0)" : : "r"(y));
-        else
-            __asm__(VLSSEG2 "v8, (%0), %1" : : "r"(y), "r"(incy));
-
-        if (incx == 2 * FLT_SIZE)
-            __asm__(VSSEG2 "v8, (%0)" : : "r"(x));
-        else
-            __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
-        if (incy == 2 * FLT_SIZE)
-            __asm__(VSSEG2 "v0, (%0)" : : "r"(y));
-        else
-            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
-
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
-        avl -= vl;
-    }
-    return;
-}
diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c
new file mode 100644
index 000000000..baf685d35
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c
@@ -0,0 +1,115 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#include "../../riscv_overloaded_intrinsics.h"
+#include "blis.h"
+#include <riscv_vector.h>
+#include <stddef.h>
+
+#define SWAPV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##swapv_sifive_x280_intr(\
+          dim_t            n,              \
+          T*      restrict x_, inc_t incx, \
+          T*      restrict y_, inc_t incy, \
+    const cntx_t*          cntx            \
+)
+// SWAPV_ spells out the swapv kernel prototype; SWAPV forwards to it so macro arguments are expanded before ## pasting.
+#define SWAPV(...)  SWAPV_(__VA_ARGS__)
+
+// Single precision real: defines bli_sswapv_sifive_x280_intr from the real template.
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_swapv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real: defines bli_dswapv_sifive_x280_intr from the real template.
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_swapv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex: defines bli_cswapv_sifive_x280_intr from the complex template.
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_swapv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex: defines bli_zswapv_sifive_x280_intr from the complex template.
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_swapv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef SWAPV
+#undef SWAPV_
diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..104ba5223
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c
@@ -0,0 +1,76 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SWAPV
+
+SWAPV(PRECISION_CHAR, void)
+{
+    // Exchange n complex elements between x (stride incx) and y (stride incy).
+    (void)cntx;
+    if (n <= 0) return;
+
+    DATATYPE* restrict px = x_;
+    DATATYPE* restrict py = y_;
+    const int x_unit = (incx == 1), y_unit = (incy == 1);
+
+    for (size_t remaining = n, vl; remaining > 0; remaining -= vl) {
+        vl = VSETVL(PREC, LMUL)(remaining);
+        RVV_TYPE_FX(PREC, LMUL, 2) from_x, from_y;
+        // Read both strips before writing either one back.
+        if (x_unit)
+            from_x = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) px, vl);
+        else
+            from_x = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) px, 2 * FLT_SIZE * incx, vl);
+        if (y_unit)
+            from_y = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) py, vl);
+        else
+            from_y = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) py, 2 * FLT_SIZE * incy, vl);
+
+        // Cross the strips over: y's data lands in x and vice versa.
+        if (x_unit)
+            VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) px, from_y, vl);
+        else
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) px, 2 * FLT_SIZE * incx, from_y, vl);
+        if (y_unit)
+            VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) py, from_x, vl);
+        else
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) py, 2 * FLT_SIZE * incy, from_x, vl);
+
+        px += vl * incx;
+        py += vl * incy;
+    }
+}
+
+#endif // SWAPV
diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c
new file mode 100644
index 000000000..efa7222ab
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c
@@ -0,0 +1,76 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SWAPV
+
+SWAPV(PRECISION_CHAR, void)
+{
+    // Exchange n real elements between x (stride incx) and y (stride incy).
+    (void)cntx;
+    if (n <= 0) return;
+
+    DATATYPE* restrict px = x_;
+    DATATYPE* restrict py = y_;
+    const int x_unit = (incx == 1), y_unit = (incy == 1);
+
+    for (size_t remaining = n, vl; remaining > 0; remaining -= vl) {
+        vl = VSETVL(PREC, LMUL)(remaining);
+        RVV_TYPE_F(PREC, LMUL) from_x, from_y;
+        // Read both strips before writing either one back.
+        if (x_unit)
+            from_x = VLE_V_F(PREC, LMUL)(px, vl);
+        else
+            from_x = VLSE_V_F(PREC, LMUL)(px, FLT_SIZE * incx, vl);
+        if (y_unit)
+            from_y = VLE_V_F(PREC, LMUL)(py, vl);
+        else
+            from_y = VLSE_V_F(PREC, LMUL)(py, FLT_SIZE * incy, vl);
+
+        // Cross the strips over: y's data lands in x and vice versa.
+        if (x_unit)
+            VSE_V_F(PREC, LMUL)(px, from_y, vl);
+        else
+            VSSE_V_F(PREC, LMUL)(px, FLT_SIZE * incx, from_y, vl);
+        if (y_unit)
+            VSE_V_F(PREC, LMUL)(py, from_x, vl);
+        else
+            VSSE_V_F(PREC, LMUL)(py, FLT_SIZE * incy, from_x, vl);
+
+        px += vl * incx;
+        py += vl * incy;
+    }
+}
+
+#endif // SWAPV
diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c
index dce4085bf..da688851d 100644
--- a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c
+++ b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c
@@ -51,7 +51,7 @@
 
 #define XPBYV(...)  XPBYV_(__VA_ARGS__)
 
-#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm
+#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_intr
 #define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR)
 
 // Single precision real
diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c
deleted file mode 100644
index 43c2ba44e..000000000
--- a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c
+++ /dev/null
@@ -1,430 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include <math.h>
-#include <riscv_vector.h>
-#include <stdbool.h>
-#include <stddef.h>
-
-#define FLT_SIZE 4
-#define FLT_LOAD "flw "
-#define VLE "vle32.v "
-#define VLSE "vlse32.v "
-#define VSE "vse32.v "
-#define VSSE "vsse32.v "
-
-void bli_saxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b,
-                         const void *restrict alpha_, const void *restrict a_, inc_t inca,
-                         inc_t lda, const void *restrict x_, inc_t incx,
-                         void *restrict y_, inc_t incy, const cntx_t *restrict cntx) {
-    (void)conja;
-    (void)conjx;
-    (void)cntx;
-    const float *restrict alpha = alpha_;
-    const float *restrict a = a_;
-    const float *restrict x = x_;
-    float *restrict y = y_;
-
-    if (m == 0 || b == 0)
-        return;
-    __asm__(FLT_LOAD "ft11, (%0)" : : "r"(alpha));
-    inca *= FLT_SIZE;
-    lda *= FLT_SIZE;
-    incx *= FLT_SIZE;
-    incy *= FLT_SIZE;
-    size_t avl = m;
-    while (avl) {
-        // process vl elements of y at a time
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        // x_tmp traverses x
-        // a points to the vl x b block of a needed this iteration
-        // a_tmp traverses the columns of this block
-        const float* restrict x_tmp = x;
-        const float* restrict a_tmp = a;
-        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
-        if (inca == FLT_SIZE)
-            __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
-        else
-            __asm__(VLSE "v0, (%0), %1" : : "r"(a_tmp), "r"(inca));
-        __asm__("vfmul.vf v0, v0, ft0");
-        __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
-        __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
-
-        for (dim_t i = 1; i < b; ++i) {
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
-            if (inca == FLT_SIZE)
-                __asm__(VLE "v24, (%0)" : : "r"(a_tmp));
-            else
-                __asm__(VLSE "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
-            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
-            __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
-            __asm__("vfmacc.vf v0, ft0, v24");
-        }
-
-        if (incy == FLT_SIZE) {
-            __asm__(VLE "v24, (%0)" : : "r"(y));
-            __asm__("vfmacc.vf v24, ft11, v0");
-            __asm__(VSE "v24, (%0)" : : "r"(y));
-        } else {
-            __asm__(VLSE "v24, (%0), %1" : : "r"(y), "r"(incy));
-            __asm__("vfmacc.vf v24, ft11, v0");
-            __asm__(VSSE "v24, (%0), %1" : : "r"(y), "r"(incy));
-        }
-
-        __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 8
-#define FLT_LOAD "fld "
-#define VLE "vle64.v "
-#define VLSE "vlse64.v "
-#define VSE "vse64.v "
-#define VSSE "vsse64.v "
-
-void bli_daxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b,
-                         const void *restrict alpha_, const void *restrict a_, inc_t inca,
-                         inc_t lda, const void *restrict x_, inc_t incx,
-                         void *restrict y_, inc_t incy, const cntx_t *restrict cntx) {
-    (void)conja;
-    (void)conjx;
-    (void)cntx;
-    const double *restrict alpha = alpha_;
-    const double *restrict a = a_;
-    const double *restrict x = x_;
-    double *restrict y = y_;
-
-    if (m == 0 || b == 0)
-        return;
-    __asm__(FLT_LOAD "ft11, (%0)" : : "r"(alpha));
-    inca *= FLT_SIZE;
-    lda *= FLT_SIZE;
-    incx *= FLT_SIZE;
-    incy *= FLT_SIZE;
-    size_t avl = m;
-    while (avl) {
-        // process vl elements of y at a time
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        // x_tmp traverses x
-        // a points to the vl x b block of a needed this iteration
-        // a_tmp traverses the columns of this block
-        const double* restrict x_tmp = x;
-        const double* restrict a_tmp = a;
-        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
-        if (inca == FLT_SIZE)
-            __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
-        else
-            __asm__(VLSE "v0, (%0), %1" : : "r"(a_tmp), "r"(inca));
-        __asm__("vfmul.vf v0, v0, ft0");
-        __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
-        __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
-
-        for (dim_t i = 1; i < b; ++i) {
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
-            if (inca == FLT_SIZE)
-                __asm__(VLE "v24, (%0)" : : "r"(a_tmp));
-            else
-                __asm__(VLSE "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
-            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
-            __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
-            __asm__("vfmacc.vf v0, ft0, v24");
-        }
-
-        if (incy == FLT_SIZE) {
-            __asm__(VLE "v24, (%0)" : : "r"(y));
-            __asm__("vfmacc.vf v24, ft11, v0");
-            __asm__(VSE "v24, (%0)" : : "r"(y));
-        } else {
-            __asm__(VLSE "v24, (%0), %1" : : "r"(y), "r"(incy));
-            __asm__("vfmacc.vf v24, ft11, v0");
-            __asm__(VSSE "v24, (%0), %1" : : "r"(y), "r"(incy));
-        }
-
-        __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 4
-#define FLT_LOAD "flw "
-#define VLSEG "vlseg2e32.v "
-#define VLSSEG "vlsseg2e32.v "
-#define VSSEG "vsseg2e32.v "
-#define VSSSEG "vssseg2e32.v "
-
-void bli_caxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b,
-                         const void *restrict alpha_, const void *restrict a_,
-                         inc_t inca, inc_t lda, const void *restrict x_,
-                         inc_t incx, void *restrict y_, inc_t incy,
-                         const cntx_t *restrict cntx) {
-    (void)cntx;
-    const scomplex *restrict alpha = alpha_;
-    const scomplex *restrict a = a_;
-    const scomplex *restrict x = x_;
-    scomplex *restrict y = y_;
-    
-    if (m == 0 || b == 0)
-        return;
-    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
-    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
-    inca *= 2 * FLT_SIZE;
-    lda *= 2 * FLT_SIZE;
-    incx *= 2 * FLT_SIZE;
-    incy *= 2 * FLT_SIZE;
-    size_t avl = m;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        const scomplex* restrict x_tmp = x;
-        const scomplex* restrict a_tmp = a;
-        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
-        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE));
-        if (inca == 2 * FLT_SIZE)
-            __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp));
-        else
-            __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
-        __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
-        __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
-        __asm__("vfmul.vf v0, v24, ft0");
-        __asm__("vfmul.vf v4, v24, ft1");
-        if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
-            __asm__("vfnmsac.vf v0, ft1, v28");
-            __asm__("vfmacc.vf v4, ft0, v28");
-        } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) {
-            __asm__("vfmacc.vf v0, ft1, v28");
-            __asm__("vfmsac.vf v4, ft0, v28");
-        } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
-            __asm__("vfmacc.vf v0, ft1, v28");
-            __asm__("vfnmsac.vf v4, ft0, v28");
-        } else {
-            __asm__("vfnmsac.vf v0, ft1, v28");
-            __asm__("vfnmacc.vf v4, ft0, v28");
-        }
-
-        for (dim_t i = 1; i < b; ++i) {
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
-            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE));
-            if (inca == 2 * FLT_SIZE)
-                __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp));
-            else
-                __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
-            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
-            __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
-            __asm__("vfmacc.vf v0, ft0, v24");
-            if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
-                __asm__("vfmacc.vf v4, ft1, v24");
-                __asm__("vfnmsac.vf v0, ft1, v28");
-                __asm__("vfmacc.vf v4, ft0, v28");
-            } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) {
-                __asm__("vfnmsac.vf v4, ft1, v24");
-                __asm__("vfmacc.vf v0, ft1, v28");
-                __asm__("vfmacc.vf v4, ft0, v28");
-            } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
-                __asm__("vfmacc.vf v4, ft1, v24");
-                __asm__("vfmacc.vf v0, ft1, v28");
-                __asm__("vfnmsac.vf v4, ft0, v28");
-            } else { // conja == BLIS_CONJUGATE && conjx == BLIS_CONJUGATE
-                __asm__("vfnmsac.vf v4, ft1, v24");
-                __asm__("vfnmsac.vf v0, ft1, v28");
-                __asm__("vfnmsac.vf v4, ft0, v28");
-            }
-        }
-
-        if (incy == 2 * FLT_SIZE) {
-            __asm__(VLSEG "v24, (%0)" : : "r"(y));
-            __asm__("vfmacc.vf v24, ft10, v0");
-            __asm__("vfmacc.vf v28, ft10, v4");
-            __asm__("vfnmsac.vf v24, ft11, v4");
-            __asm__("vfmacc.vf v28, ft11, v0");
-            __asm__(VSSEG "v24, (%0)" : : "r"(y));
-        } else {
-            __asm__(VLSSEG "v24, (%0), %1" : : "r"(y), "r"(incy));
-            __asm__("vfmacc.vf v24, ft10, v0");
-            __asm__("vfmacc.vf v28, ft10, v4");
-            __asm__("vfnmsac.vf v24, ft11, v4");
-            __asm__("vfmacc.vf v28, ft11, v0");
-            __asm__(VSSSEG "v24, (%0), %1" : : "r"(y), "r"(incy));
-        }
-
-        __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
-        avl -= vl;
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef VLSEG
-#undef VLSSEG
-#undef VSSEG
-#undef VSSSEG
-
-#define FLT_SIZE 8
-#define FLT_LOAD "fld "
-#define VLSEG "vlseg2e64.v "
-#define VLSSEG "vlsseg2e64.v "
-#define VSSEG "vsseg2e64.v "
-#define VSSSEG "vssseg2e64.v "
-
-void bli_zaxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b,
-                         const void *restrict alpha_, const void *restrict a_,
-                         inc_t inca, inc_t lda, const void *restrict x_,
-                         inc_t incx, void *restrict y_, inc_t incy,
-                         const cntx_t *restrict cntx) {
-    (void)cntx;
-    const dcomplex *restrict alpha = alpha_;
-    const dcomplex *restrict a = a_;
-    const dcomplex *restrict x = x_;
-    dcomplex *restrict y = y_;
-
-    if (m == 0 || b == 0)
-        return;
-    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
-    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
-    inca *= 2 * FLT_SIZE;
-    lda *= 2 * FLT_SIZE;
-    incx *= 2 * FLT_SIZE;
-    incy *= 2 * FLT_SIZE;
-    size_t avl = m;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
-                         : "=r"(vl)
-                         : "r"(avl), "i"(8 * FLT_SIZE));
-        const dcomplex* restrict x_tmp = x;
-        const dcomplex* restrict a_tmp = a;
-        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
-        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE));
-        if (inca == 2 * FLT_SIZE)
-            __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp));
-        else
-            __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
-        __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
-        __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
-        __asm__("vfmul.vf v0, v24, ft0");
-        __asm__("vfmul.vf v4, v24, ft1");
-        if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
-            __asm__("vfnmsac.vf v0, ft1, v28");
-            __asm__("vfmacc.vf v4, ft0, v28");
-        } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) {
-            __asm__("vfmacc.vf v0, ft1, v28");
-            __asm__("vfmsac.vf v4, ft0, v28");
-        } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
-            __asm__("vfmacc.vf v0, ft1, v28");
-            __asm__("vfnmsac.vf v4, ft0, v28");
-        } else {
-            __asm__("vfnmsac.vf v0, ft1, v28");
-            __asm__("vfnmacc.vf v4, ft0, v28");
-        }
-
-        for (dim_t i = 1; i < b; ++i) {
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
-            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE));
-            if (inca == 2 * FLT_SIZE)
-                __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp));
-            else
-                __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
-            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
-            __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
-            __asm__("vfmacc.vf v0, ft0, v24");
-            if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
-                __asm__("vfmacc.vf v4, ft1, v24");
-                __asm__("vfnmsac.vf v0, ft1, v28");
-                __asm__("vfmacc.vf v4, ft0, v28");
-            } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) {
-                __asm__("vfnmsac.vf v4, ft1, v24");
-                __asm__("vfmacc.vf v0, ft1, v28");
-                __asm__("vfmacc.vf v4, ft0, v28");
-            } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
-                __asm__("vfmacc.vf v4, ft1, v24");
-                __asm__("vfmacc.vf v0, ft1, v28");
-                __asm__("vfnmsac.vf v4, ft0, v28");
-            } else { // conja == BLIS_CONJUGATE && conjx == BLIS_CONJUGATE
-                __asm__("vfnmsac.vf v4, ft1, v24");
-                __asm__("vfnmsac.vf v0, ft1, v28");
-                __asm__("vfnmsac.vf v4, ft0, v28");
-            }
-        }
-
-        if (incy == 2 * FLT_SIZE) {
-            __asm__(VLSEG "v24, (%0)" : : "r"(y));
-            __asm__("vfmacc.vf v24, ft10, v0");
-            __asm__("vfmacc.vf v28, ft10, v4");
-            __asm__("vfnmsac.vf v24, ft11, v4");
-            __asm__("vfmacc.vf v28, ft11, v0");
-            __asm__(VSSEG "v24, (%0)" : : "r"(y));
-        } else {
-            __asm__(VLSSEG "v24, (%0), %1" : : "r"(y), "r"(incy));
-            __asm__("vfmacc.vf v24, ft10, v0");
-            __asm__("vfmacc.vf v28, ft10, v4");
-            __asm__("vfnmsac.vf v24, ft11, v4");
-            __asm__("vfmacc.vf v28, ft11, v0");
-            __asm__(VSSSEG "v24, (%0), %1" : : "r"(y), "r"(incy));
-        }
-
-        __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
-        avl -= vl;
-    }
-    return;
-}
diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c
new file mode 100644
index 000000000..a5e026846
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c
@@ -0,0 +1,121 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#include "../../riscv_cmul_macros_intr.h"
+#include "../../riscv_overloaded_intrinsics.h"
+#include "blis.h"
+#include <riscv_vector.h>
+#include <stdint.h>
+
+#define AXPYF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpyf_sifive_x280_intr(\
+          conj_t            conja,                        \
+          conj_t            conjx,                        \
+          dim_t                 m,                        \
+          dim_t                 b,                        \
+    const T*      restrict alpha_,                        \
+    const T*      restrict     a_, inc_t inca, inc_t lda, \
+    const T*      restrict     x_, inc_t incx,            \
+          T*      restrict     y_, inc_t incy,            \
+    const cntx_t* restrict   cntx                         \
+)
+// AXPYF_ spells out the kernel signature; T is the pointee type of alpha_, a_, x_ and y_.
+#define AXPYF(...)  AXPYF_(__VA_ARGS__) // Extra expansion step so PRECISION_CHAR is replaced before ## pasting.
+
+// Single precision real: the include below defines bli_saxpyf_sifive_x280_intr.
+#define DATATYPE float // Element type the untyped pointer arguments are cast to.
+#define PRECISION_CHAR s // Type character pasted into the kernel name.
+#define PREC 32 // Forwarded with LMUL to the RVV helper macros (VSETVL, RVV_TYPE_F, ...).
+#define LMUL m8
+#define FLT_SIZE sizeof(float) // Bytes per real scalar; scales element strides into byte strides.
+
+#include "./bli_axpyf_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real: the include below defines bli_daxpyf_sifive_x280_intr.
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpyf_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex: the include below defines bli_caxpyf_sifive_x280_intr.
+#define DATATYPE scomplex
+#define BASE_DT float // Real scalar type the complex pointers are cast to for segment loads/stores.
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4 // NOTE(review): presumably m4 because each 2-field segment access spans two register groups - confirm.
+#define FLT_SIZE sizeof(float) // Size of one real component; the complex kernel doubles it for strides.
+
+#include "./bli_axpyf_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex: the include below defines bli_zaxpyf_sifive_x280_intr.
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpyf_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef AXPYF
+#undef AXPYF_
diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..0ab5509fa
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c
@@ -0,0 +1,149 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPYF
+
+AXPYF(PRECISION_CHAR, void)
+{
+    // Computes y := y + alpha * conja(A) * conjx(x), where A is m x b with
+    // row stride inca and column stride lda, x has b elements and y has m.
+    (void) cntx; // Suppress unused parameter warnings
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict a = a_;
+    const DATATYPE* restrict x = x_;
+    DATATYPE* restrict y = y_;
+
+    if (m <= 0 || b <= 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha))
+        return; // Empty problem or alpha == 0: y is left untouched.
+
+    size_t avl = m; // Rows of A (elements of y) still to be processed.
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl);
+        const DATATYPE* restrict a_tmp = a;
+        const DATATYPE* restrict x_tmp = x;
+        RVV_TYPE_F(PREC, LMUL) ax_vec_real, ax_vec_imag;
+        // Accumulate conja(A) * conjx(x) for this strip of vl rows, one column at a time.
+        for (dim_t i = 0; i < b; ++i) {
+            DATATYPE x_tmp_conj; // Current x element after the optional conjugation.
+            PASTEMAC(PRECISION_CHAR, copycjs)(conjx, *x_tmp, x_tmp_conj);
+
+            RVV_TYPE_FX(PREC, LMUL, 2) acol_vec;
+            if (inca == 1)
+                acol_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) a_tmp, vl);
+            else
+                acol_vec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) a_tmp, 2 * FLT_SIZE * inca, vl);
+
+            RVV_TYPE_F(PREC, LMUL) acol_vec_real = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0);
+            RVV_TYPE_F(PREC, LMUL) acol_vec_imag = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1);
+
+            if (bli_is_conj(conja)) {
+                if (i == 0) // The first column initialises the accumulator.
+                    VCMUL_VF_CONJ
+                    (
+                      PREC, LMUL,
+                      ax_vec_real, ax_vec_imag,
+                      acol_vec_real, acol_vec_imag,
+                      x_tmp_conj.real, x_tmp_conj.imag,
+                      vl
+                    );
+                else
+                    VCMACC_VF_CONJ
+                    (
+                      PREC, LMUL,
+                      ax_vec_real, ax_vec_imag,
+                      x_tmp_conj.real, x_tmp_conj.imag,
+                      acol_vec_real, acol_vec_imag,
+                      vl
+                    );
+            }
+            else {
+                if (i == 0) // The first column initialises the accumulator.
+                    VCMUL_VF
+                    (
+                      PREC, LMUL,
+                      ax_vec_real, ax_vec_imag,
+                      acol_vec_real, acol_vec_imag,
+                      x_tmp_conj.real, x_tmp_conj.imag,
+                      vl
+                    );
+                else
+                    VCMACC_VF
+                    (
+                      PREC, LMUL,
+                      ax_vec_real, ax_vec_imag,
+                      x_tmp_conj.real, x_tmp_conj.imag,
+                      acol_vec_real, acol_vec_imag,
+                      vl
+                    );
+            }
+
+            a_tmp += lda;
+            x_tmp += incx;
+        }
+        // y := y + alpha * (accumulated product) for this strip.
+        RVV_TYPE_FX(PREC, LMUL, 2) yvec;
+        if (incy == 1)
+            yvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, vl);
+        else
+            yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2 * FLT_SIZE * incy, vl);
+
+        RVV_TYPE_F(PREC, LMUL) yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+        RVV_TYPE_F(PREC, LMUL) yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+        VCMACC_VF
+        (
+          PREC, LMUL,
+          yvec_real, yvec_imag,
+          alpha->real, alpha->imag,
+          ax_vec_real, ax_vec_imag,
+          vl
+        );
+
+        yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real);
+        yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag);
+
+        if (incy == 1)
+            VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, yvec, vl);
+        else
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2 * FLT_SIZE * incy, yvec, vl);
+        // vl is unsigned: compute offsets as signed inc_t so a negative stride does not wrap.
+        a += (inc_t)vl * inca;
+        y += (inc_t)vl * incy;
+        avl -= vl;
+    }
+    return;
+}
+
+#endif // AXPYF
diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c
new file mode 100644
index 000000000..ae7dcb21d
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c
@@ -0,0 +1,96 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPYF
+
+AXPYF(PRECISION_CHAR, void)
+{
+    // Computes y := y + alpha * conja(A) * conjx(x), where A is m x b with
+    // row stride inca and column stride lda, x has b elements and y has m.
+    (void) conja; // Suppress unused parameter warnings (not used by the real kernel)
+    (void) conjx;
+    (void) cntx;
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict a = a_;
+    const DATATYPE* restrict x = x_;
+    DATATYPE* restrict y = y_;
+
+    if (m <= 0 || b <= 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha))
+        return; // Empty problem or alpha == 0: y is left untouched.
+
+    size_t avl = m; // Rows of A (elements of y) still to be processed.
+    while (avl) {
+        size_t vl = VSETVL(PREC, LMUL)(avl);
+        const DATATYPE* restrict a_tmp = a;
+        const DATATYPE* restrict x_tmp = x;
+        RVV_TYPE_F(PREC, LMUL) ax_vec;
+        // Accumulate A * x for this strip of vl rows, one column at a time.
+        for (dim_t i = 0; i < b; ++i) {
+            RVV_TYPE_F(PREC, LMUL) acol_vec;
+            if (inca == 1)
+                acol_vec = VLE_V_F(PREC, LMUL)(a_tmp, vl);
+            else
+                acol_vec = VLSE_V_F(PREC, LMUL)(a_tmp, FLT_SIZE * inca, vl);
+
+            if (i == 0) // The first column initialises the accumulator.
+                ax_vec = VFMUL_VF(PREC, LMUL)(acol_vec, *x_tmp, vl);
+            else
+                ax_vec = VFMACC_VF(PREC, LMUL)(ax_vec, *x_tmp, acol_vec, vl);
+
+            a_tmp += lda;
+            x_tmp += incx;
+        }
+        // y := y + alpha * (accumulated product) for this strip.
+        RVV_TYPE_F(PREC, LMUL) yvec;
+        if (incy == 1)
+            yvec = VLE_V_F(PREC, LMUL)(y, vl);
+        else
+            yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+        yvec = VFMACC_VF(PREC, LMUL)(yvec, *alpha, ax_vec, vl);
+
+        if (incy == 1)
+            VSE_V_F(PREC, LMUL)(y, yvec, vl);
+        else
+            VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl);
+        // vl is unsigned: compute offsets as signed inc_t so a negative stride does not wrap.
+        a += (inc_t)vl * inca;
+        y += (inc_t)vl * incy;
+        avl -= vl;
+    }
+    return;
+}
+
+#endif // AXPYF
diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c
deleted file mode 100644
index ecb340707..000000000
--- a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c
+++ /dev/null
@@ -1,3120 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include "../riscv_cmul_macros_asm.h"
-#include <math.h>
-#include <riscv_vector.h>
-#include <stdbool.h>
-#include <stddef.h>
-
-#define FLT_SIZE 4
-#define FLT_LOAD "flw "
-#define FMUL "fmul.s "
-#define VLE "vle32.v "
-#define VLSE "vlse32.v "
-#define VSE "vse32.v "
-#define VSSE "vsse32.v "
-
-void bli_sdotxaxpyf_sifive_x280_asm(
-             conj_t           conjat,
-             conj_t           conja,
-             conj_t           conjw,
-             conj_t           conjx,
-             dim_t            m,
-             dim_t            b,
-       const void*   restrict alpha_,
-       const void*   restrict a_, inc_t inca, inc_t lda,
-       const void*   restrict w_, inc_t incw,
-       const void*   restrict x_, inc_t incx,
-       const void*   restrict beta_,
-             void*   restrict y_, inc_t incy,
-             void*   restrict z_, inc_t incz,
-       const cntx_t* restrict cntx
-                             ) {
-  (void)conjat;
-  (void)conja;
-  (void)conjw;
-  (void)conjx;
-  (void)cntx;
-  const float *restrict alpha = alpha_;
-  const float *restrict beta = beta_;
-  const float *restrict a = a_;
-  const float *restrict w = w_;
-  const float *restrict x = x_;
-  float *restrict y = y_;
-  float *restrict z = z_;
-
-  if (b == 0)
-    return;
-  else if (m == 0 || *alpha == 0.f) {
-    // scale y by beta
-    if (*beta == 0.f)
-        bli_ssetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
-    else
-        bli_sscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
-    return;
-  }
-
-  __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
-  __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
-  inca *= FLT_SIZE;
-  lda *= FLT_SIZE;
-  incw *= FLT_SIZE;
-  incx *= FLT_SIZE;
-  incy *= FLT_SIZE;
-  incz *= FLT_SIZE;
-  inc_t a_bump = 5 * lda;
-  while (b >= 5) {
-    // compute dot product of w with 5 rows of a
-    const float* restrict w_tmp = w;
-    const float* restrict z_tmp = z;
-    const float* restrict a_col = a;
-    size_t avl = m;
-    bool first = true;
-    while (avl) {
-      const float* restrict a_row = a_col;
-      size_t vl;
-      __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-      if (incw == FLT_SIZE)
-        __asm__(VLE "v28, (%0)" : : "r"(w_tmp));
-      else
-        __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
-      if (inca == FLT_SIZE) {
-        // a unit stride
-        if (first) {
-          __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmul.vf v20, v24, ft0");
-          __asm__("vfmul.vv v0, v24, v28");
-          __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft1, v24");
-          __asm__("vfmul.vv v4, v24, v28");
-          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft2, v24");
-          __asm__("vfmul.vv v8, v24, v28");
-          __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft3, v24");
-          __asm__("vfmul.vv v12, v24, v28");
-          __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("vfmacc.vf v20, ft4, v24");
-          __asm__("vfmul.vv v16, v24, v28");
-          first = false;
-        }
-        else {
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmul.vf v20, v24, ft0");
-          __asm__("vfmacc.vv v0, v24, v28");
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft1, v24");
-          __asm__("vfmacc.vv v4, v24, v28");
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft2, v24");
-          __asm__("vfmacc.vv v8, v24, v28");
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft3, v24");
-          __asm__("vfmacc.vv v12, v24, v28");
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("vfmacc.vf v20, ft4, v24");
-          __asm__("vfmacc.vv v16, v24, v28");
-        }
-      } // end a unit stride
-      else {
-        // a non-unit stride
-        if (first) {
-          __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmul.vf v20, v24, ft0");
-          __asm__("vfmul.vv v0, v24, v28");
-          __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft1, v24");
-          __asm__("vfmul.vv v4, v24, v28");
-          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft2, v24");
-          __asm__("vfmul.vv v8, v24, v28");
-          __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft3, v24");
-          __asm__("vfmul.vv v12, v24, v28");
-          __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("vfmacc.vf v20, ft4, v24");
-          __asm__("vfmul.vv v16, v24, v28");
-          first = false;
-        }
-        else {
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmul.vf v20, v24, ft0");
-          __asm__("vfmacc.vv v0, v24, v28");
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft1, v24");
-          __asm__("vfmacc.vv v4, v24, v28");
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft2, v24");
-          __asm__("vfmacc.vv v8, v24, v28");
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft3, v24");
-          __asm__("vfmacc.vv v12, v24, v28");
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("vfmacc.vf v20, ft4, v24");
-          __asm__("vfmacc.vv v16, v24, v28");
-        }
-      } // end a non-unit stride
-
-      if (incz == FLT_SIZE) {
-        __asm__(VLE "v24, (%0)" : : "r"(z_tmp));
-        __asm__("vfmacc.vf v24, ft10, v20");
-        __asm__(VSE "v24, (%0)" : : "r"(z_tmp));
-      } else {
-        __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
-        __asm__("vfmacc.vf v24, ft10, v20");
-        __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
-      }
-
-      __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incx));
-      __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
-      __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
-      avl -= vl;
-    }
-
-    __asm__("vmv.s.x v31, x0");
-
-    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-    __asm__("vfredusum.vs v0, v0, v31");
-    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-    if (*beta == 0.f) {
-      __asm__("vfmul.vf v0, v0, ft10");
-      __asm__(VSE "v0, (%0)" : : "r"(y));
-    }
-    else {
-      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-      __asm__(FMUL "ft0, ft11, ft0");
-      __asm__("vfmv.s.f v30, ft0");
-      __asm__("vfmacc.vf v30, ft10, v0");
-      __asm__(VSE "v30, (%0)" : : "r"(y));
-    }
-    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-    __asm__("vfredusum.vs v4, v4, v31");
-    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-    if (*beta == 0.f) {
-      __asm__("vfmul.vf v4, v4, ft10");
-      __asm__(VSE "v4, (%0)" : : "r"(y));
-    }
-    else {
-      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-      __asm__(FMUL "ft0, ft11, ft0");
-      __asm__("vfmv.s.f v30, ft0");
-      __asm__("vfmacc.vf v30, ft10, v4");
-      __asm__(VSE "v30, (%0)" : : "r"(y));
-    }
-    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-    __asm__("vfredusum.vs v8, v8, v31");
-    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-    if (*beta == 0.f) {
-      __asm__("vfmul.vf v8, v8, ft10");
-      __asm__(VSE "v8, (%0)" : : "r"(y));
-    }
-    else {
-      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-      __asm__(FMUL "ft0, ft11, ft0");
-      __asm__("vfmv.s.f v30, ft0");
-      __asm__("vfmacc.vf v30, ft10, v8");
-      __asm__(VSE "v30, (%0)" : : "r"(y));
-    }
-    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-    __asm__("vfredusum.vs v12, v12, v31");
-    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-    if (*beta == 0.f) {
-      __asm__("vfmul.vf v12, v12, ft10");
-      __asm__(VSE "v12, (%0)" : : "r"(y));
-    }
-    else {
-      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-      __asm__(FMUL "ft0, ft11, ft0");
-      __asm__("vfmv.s.f v30, ft0");
-      __asm__("vfmacc.vf v30, ft10, v12");
-      __asm__(VSE "v30, (%0)" : : "r"(y));
-    }
-    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-    __asm__("vfredusum.vs v16, v16, v31");
-    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-    if (*beta == 0.f) {
-      __asm__("vfmul.vf v16, v16, ft10");
-      __asm__(VSE "v16, (%0)" : : "r"(y));
-    }
-    else {
-      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-      __asm__(FMUL "ft0, ft11, ft0");
-      __asm__("vfmv.s.f v30, ft0");
-      __asm__("vfmacc.vf v30, ft10, v16");
-      __asm__(VSE "v30, (%0)" : : "r"(y));
-    }
-    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-    __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
-    b -= 5;
-  }
-
-  if (b > 0) {
-    const float* restrict w_tmp = w;
-    const float* restrict z_tmp = z;
-    const float* restrict a_col;
-    __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
-    __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx));
-    size_t avl = m;
-    bool first = true;
-    while (avl) {
-      const float* restrict a_row = a_col;
-      size_t vl;
-      __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-      if (incw == FLT_SIZE)
-        __asm__(VLE "v28, (%0)" : : "r"(w_tmp));
-      else
-        __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
-      __asm__("vmv.v.i v20, 0");
-      if (inca == FLT_SIZE) {
-        // a unit stride
-        if (first) {
-          switch (b) {
-          case 4:
-            __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
-            __asm__(VLE "v24, (%0)" : : "r"(a_row));
-            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft3, v24");
-            __asm__("vfmul.vv v12, v24, v28");
-          case 3:
-            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-            __asm__(VLE "v24, (%0)" : : "r"(a_row));
-            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft2, v24");
-            __asm__("vfmul.vv v8, v24, v28");
-          case 2:
-            __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
-            __asm__(VLE "v24, (%0)" : : "r"(a_row));
-            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft1, v24");
-            __asm__("vfmul.vv v4, v24, v28");
-          case 1:
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-            __asm__(VLE "v24, (%0)" : : "r"(a_row));
-            __asm__("vfmacc.vf v20, ft0, v24");
-            __asm__("vfmul.vv v0, v24, v28");
-          }
-          first = false;
-        }
-        else {
-          switch (b) {
-          case 4:
-            __asm__(VLE "v24, (%0)" : : "r"(a_row));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft3, v24");
-            __asm__("vfmacc.vv v12, v24, v28");
-          case 3:
-            __asm__(VLE "v24, (%0)" : : "r"(a_row));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft2, v24");
-            __asm__("vfmacc.vv v8, v24, v28");
-          case 2:
-            __asm__(VLE "v24, (%0)" : : "r"(a_row));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft1, v24");
-            __asm__("vfmacc.vv v4, v24, v28");
-          case 1:
-            __asm__(VLE "v24, (%0)" : : "r"(a_row));
-            __asm__("vfmacc.vf v20, ft0, v24");
-            __asm__("vfmacc.vv v0, v24, v28");
-          }
-        }
-      } // end a unit stride
-      else {
-        // a non-unit stride
-        if (first) {
-          switch (b) {
-          case 4:
-            __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
-            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft3, v24");
-            __asm__("vfmul.vv v12, v24, v28");
-          case 3:
-            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft2, v24");
-            __asm__("vfmul.vv v8, v24, v28");
-          case 2:
-            __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
-            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft1, v24");
-            __asm__("vfmul.vv v4, v24, v28");
-          case 1:
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-            __asm__("vfmacc.vf v20, ft0, v24");
-            __asm__("vfmul.vv v0, v24, v28");
-          }
-          first = false;
-        }
-        else {
-          switch (b) {
-          case 4:
-            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft3, v24");
-            __asm__("vfmacc.vv v12, v24, v28");
-          case 3:
-            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft2, v24");
-            __asm__("vfmacc.vv v8, v24, v28");
-          case 2:
-            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft1, v24");
-            __asm__("vfmacc.vv v4, v24, v28");
-          case 1:
-            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-            __asm__("vfmacc.vf v20, ft0, v24");
-            __asm__("vfmacc.vv v0, v24, v28");
-          }
-        }
-      } // end a non-unit stride
-
-      if (incz == FLT_SIZE) {
-        __asm__(VLE "v24, (%0)" : : "r"(z_tmp));
-        __asm__("vfmacc.vf v24, ft10, v20");
-        __asm__(VSE "v24, (%0)" : : "r"(z_tmp));
-      } else {
-        __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
-        __asm__("vfmacc.vf v24, ft10, v20");
-        __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
-      }
-
-      __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
-      __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
-      __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
-      avl -= vl;
-    }
-
-    __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
-    __asm__("vmv.s.x v31, x0");
-
-    switch (b) {
-    case 4:
-      __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-      __asm__("vfredusum.vs v12, v12, v31");
-      __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-      if (*beta == 0.f) {
-        __asm__("vfmul.vf v12, v12, ft10");
-        __asm__(VSE "v12, (%0)" : : "r"(y));
-      }
-      else {
-        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-        __asm__(FMUL "ft0, ft11, ft0");
-        __asm__("vfmv.s.f v30, ft0");
-        __asm__("vfmacc.vf v30, ft10, v12");
-        __asm__(VSE "v30, (%0)" : : "r"(y));
-      }
-      __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
-    case 3:
-      __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-      __asm__("vfredusum.vs v8, v8, v31");
-      __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-      if (*beta == 0.f) {
-        __asm__("vfmul.vf v8, v8, ft10");
-        __asm__(VSE "v8, (%0)" : : "r"(y));
-      }
-      else {
-        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-        __asm__(FMUL "ft0, ft11, ft0");
-        __asm__("vfmv.s.f v30, ft0");
-        __asm__("vfmacc.vf v30, ft10, v8");
-        __asm__(VSE "v30, (%0)" : : "r"(y));
-      }
-      __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
-    case 2:
-      __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-      __asm__("vfredusum.vs v4, v4, v31");
-      __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-      if (*beta == 0.f) {
-        __asm__("vfmul.vf v4, v4, ft10");
-        __asm__(VSE "v4, (%0)" : : "r"(y));
-      }
-      else {
-        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-        __asm__(FMUL "ft0, ft11, ft0");
-        __asm__("vfmv.s.f v30, ft0");
-        __asm__("vfmacc.vf v30, ft10, v4");
-        __asm__(VSE "v30, (%0)" : : "r"(y));
-      }
-      __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
-    case 1:
-      __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-      __asm__("vfredusum.vs v0, v0, v31");
-      __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-      if (*beta == 0.f) {
-        __asm__("vfmul.vf v0, v0, ft10");
-        __asm__(VSE "v0, (%0)" : : "r"(y));
-      }
-      else {
-        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-        __asm__(FMUL "ft0, ft11, ft0");
-        __asm__("vfmv.s.f v30, ft0");
-        __asm__("vfmacc.vf v30, ft10, v0");
-        __asm__(VSE "v30, (%0)" : : "r"(y));
-      }
-    }
-  } // end cleanup
-  return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef FMUL
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 8
-#define FLT_LOAD "fld "
-#define FMUL "fmul.d "
-#define VLE "vle64.v "
-#define VLSE "vlse64.v "
-#define VSE "vse64.v "
-#define VSSE "vsse64.v "
-
-void bli_ddotxaxpyf_sifive_x280_asm(
-             conj_t           conjat,
-             conj_t           conja,
-             conj_t           conjw,
-             conj_t           conjx,
-             dim_t            m,
-             dim_t            b,
-       const void*   restrict alpha_,
-       const void*   restrict a_, inc_t inca, inc_t lda,
-       const void*   restrict w_, inc_t incw,
-       const void*   restrict x_, inc_t incx,
-       const void*   restrict beta_,
-             void*   restrict y_, inc_t incy,
-             void*   restrict z_, inc_t incz,
-       const cntx_t* restrict cntx
-                             ) {
-  (void)conjat;
-  (void)conja;
-  (void)conjw;
-  (void)conjx;
-  (void)cntx;
-  const double *restrict alpha = alpha_;
-  const double *restrict beta = beta_;
-  const double *restrict a = a_;
-  const double *restrict w = w_;
-  const double *restrict x = x_;
-  double *restrict y = y_;
-  double *restrict z = z_;
-
-  if (b == 0)
-    return;
-  else if (m == 0 || *alpha == 0.) {
-    // scale y by beta
-    if (*beta == 0.)
-        bli_dsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
-    else
-        bli_dscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
-    return;
-  }
-
-  __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
-  __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
-  inca *= FLT_SIZE;
-  lda *= FLT_SIZE;
-  incw *= FLT_SIZE;
-  incx *= FLT_SIZE;
-  incy *= FLT_SIZE;
-  incz *= FLT_SIZE;
-  inc_t a_bump = 5 * lda;
-  while (b >= 5) {
-    // compute dot product of w with 5 rows of a
-    const double* restrict w_tmp = w;
-    const double* restrict z_tmp = z;
-    const double* restrict a_col = a;
-    size_t avl = m;
-    bool first = true;
-    while (avl) {
-      const double* restrict a_row = a_col;
-      size_t vl;
-      __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-      if (incw == FLT_SIZE)
-        __asm__(VLE "v28, (%0)" : : "r"(w_tmp));
-      else
-        __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
-      if (inca == FLT_SIZE) {
-        // a unit stride
-        if (first) {
-          __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmul.vf v20, v24, ft0");
-          __asm__("vfmul.vv v0, v24, v28");
-          __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft1, v24");
-          __asm__("vfmul.vv v4, v24, v28");
-          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft2, v24");
-          __asm__("vfmul.vv v8, v24, v28");
-          __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft3, v24");
-          __asm__("vfmul.vv v12, v24, v28");
-          __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("vfmacc.vf v20, ft4, v24");
-          __asm__("vfmul.vv v16, v24, v28");
-          first = false;
-        }
-        else {
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmul.vf v20, v24, ft0");
-          __asm__("vfmacc.vv v0, v24, v28");
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft1, v24");
-          __asm__("vfmacc.vv v4, v24, v28");
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft2, v24");
-          __asm__("vfmacc.vv v8, v24, v28");
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft3, v24");
-          __asm__("vfmacc.vv v12, v24, v28");
-          __asm__(VLE "v24, (%0)" : : "r"(a_row));
-          __asm__("vfmacc.vf v20, ft4, v24");
-          __asm__("vfmacc.vv v16, v24, v28");
-        }
-      } // end a unit stride
-      else {
-        // a non-unit stride
-        if (first) {
-          __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmul.vf v20, v24, ft0");
-          __asm__("vfmul.vv v0, v24, v28");
-          __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft1, v24");
-          __asm__("vfmul.vv v4, v24, v28");
-          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft2, v24");
-          __asm__("vfmul.vv v8, v24, v28");
-          __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft3, v24");
-          __asm__("vfmul.vv v12, v24, v28");
-          __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
-          __asm__("vfmacc.vf v20, ft4, v24");
-          __asm__("vfmul.vv v16, v24, v28");
-          first = false;
-        }
-        else {
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmul.vf v20, v24, ft0");
-          __asm__("vfmacc.vv v0, v24, v28");
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft1, v24");
-          __asm__("vfmacc.vv v4, v24, v28");
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft2, v24");
-          __asm__("vfmacc.vv v8, v24, v28");
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-          __asm__("vfmacc.vf v20, ft3, v24");
-          __asm__("vfmacc.vv v12, v24, v28");
-          __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-          __asm__("vfmacc.vf v20, ft4, v24");
-          __asm__("vfmacc.vv v16, v24, v28");
-        }
-      } // end a non-unit stride
-
-      if (incz == FLT_SIZE) {
-        __asm__(VLE "v24, (%0)" : : "r"(z_tmp));
-        __asm__("vfmacc.vf v24, ft10, v20");
-        __asm__(VSE "v24, (%0)" : : "r"(z_tmp));
-      } else {
-        __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
-        __asm__("vfmacc.vf v24, ft10, v20");
-        __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
-      }
-
-      __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incx));
-      __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
-      __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
-      avl -= vl;
-    }
-
-    __asm__("vmv.s.x v31, x0");
-
-    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-    __asm__("vfredusum.vs v0, v0, v31");
-    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-    if (*beta == 0.) {
-      __asm__("vfmul.vf v0, v0, ft10");
-      __asm__(VSE "v0, (%0)" : : "r"(y));
-    }
-    else {
-      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-      __asm__(FMUL "ft0, ft11, ft0");
-      __asm__("vfmv.s.f v30, ft0");
-      __asm__("vfmacc.vf v30, ft10, v0");
-      __asm__(VSE "v30, (%0)" : : "r"(y));
-    }
-    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-    __asm__("vfredusum.vs v4, v4, v31");
-    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-    if (*beta == 0.) {
-      __asm__("vfmul.vf v4, v4, ft10");
-      __asm__(VSE "v4, (%0)" : : "r"(y));
-    }
-    else {
-      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-      __asm__(FMUL "ft0, ft11, ft0");
-      __asm__("vfmv.s.f v30, ft0");
-      __asm__("vfmacc.vf v30, ft10, v4");
-      __asm__(VSE "v30, (%0)" : : "r"(y));
-    }
-    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-    __asm__("vfredusum.vs v8, v8, v31");
-    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-    if (*beta == 0.) {
-      __asm__("vfmul.vf v8, v8, ft10");
-      __asm__(VSE "v8, (%0)" : : "r"(y));
-    }
-    else {
-      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-      __asm__(FMUL "ft0, ft11, ft0");
-      __asm__("vfmv.s.f v30, ft0");
-      __asm__("vfmacc.vf v30, ft10, v8");
-      __asm__(VSE "v30, (%0)" : : "r"(y));
-    }
-    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-    __asm__("vfredusum.vs v12, v12, v31");
-    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-    if (*beta == 0.) {
-      __asm__("vfmul.vf v12, v12, ft10");
-      __asm__(VSE "v12, (%0)" : : "r"(y));
-    }
-    else {
-      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-      __asm__(FMUL "ft0, ft11, ft0");
-      __asm__("vfmv.s.f v30, ft0");
-      __asm__("vfmacc.vf v30, ft10, v12");
-      __asm__(VSE "v30, (%0)" : : "r"(y));
-    }
-    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-    __asm__("vfredusum.vs v16, v16, v31");
-    __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-    if (*beta == 0.) {
-      __asm__("vfmul.vf v16, v16, ft10");
-      __asm__(VSE "v16, (%0)" : : "r"(y));
-    }
-    else {
-      __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-      __asm__(FMUL "ft0, ft11, ft0");
-      __asm__("vfmv.s.f v30, ft0");
-      __asm__("vfmacc.vf v30, ft10, v16");
-      __asm__(VSE "v30, (%0)" : : "r"(y));
-    }
-    __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-    __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
-    b -= 5;
-  }
-
-  if (b > 0) {
-    const double* restrict w_tmp = w;
-    const double* restrict z_tmp = z;
-    const double* restrict a_col;
-    __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
-    __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx));
-    size_t avl = m;
-    bool first = true;
-    while (avl) {
-      const double* restrict a_row = a_col;
-      size_t vl;
-      __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-      if (incw == FLT_SIZE)
-        __asm__(VLE "v28, (%0)" : : "r"(w_tmp));
-      else
-        __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
-      __asm__("vmv.v.i v20, 0");
-      if (inca == FLT_SIZE) {
-        // a unit stride
-        if (first) {
-          switch (b) {
-          case 4:
-            __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
-            __asm__(VLE "v24, (%0)" : : "r"(a_row));
-            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft3, v24");
-            __asm__("vfmul.vv v12, v24, v28");
-          case 3:
-            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-            __asm__(VLE "v24, (%0)" : : "r"(a_row));
-            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft2, v24");
-            __asm__("vfmul.vv v8, v24, v28");
-          case 2:
-            __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
-            __asm__(VLE "v24, (%0)" : : "r"(a_row));
-            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft1, v24");
-            __asm__("vfmul.vv v4, v24, v28");
-          case 1:
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-            __asm__(VLE "v24, (%0)" : : "r"(a_row));
-            __asm__("vfmacc.vf v20, ft0, v24");
-            __asm__("vfmul.vv v0, v24, v28");
-          }
-          first = false;
-        }
-        else {
-          switch (b) {
-          case 4:
-            __asm__(VLE "v24, (%0)" : : "r"(a_row));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft3, v24");
-            __asm__("vfmacc.vv v12, v24, v28");
-          case 3:
-            __asm__(VLE "v24, (%0)" : : "r"(a_row));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft2, v24");
-            __asm__("vfmacc.vv v8, v24, v28");
-          case 2:
-            __asm__(VLE "v24, (%0)" : : "r"(a_row));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft1, v24");
-            __asm__("vfmacc.vv v4, v24, v28");
-          case 1:
-            __asm__(VLE "v24, (%0)" : : "r"(a_row));
-            __asm__("vfmacc.vf v20, ft0, v24");
-            __asm__("vfmacc.vv v0, v24, v28");
-          }
-        }
-      } // end a unit stride
-      else {
-        // a non-unit stride
-        if (first) {
-          switch (b) {
-          case 4:
-            __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
-            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft3, v24");
-            __asm__("vfmul.vv v12, v24, v28");
-          case 3:
-            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft2, v24");
-            __asm__("vfmul.vv v8, v24, v28");
-          case 2:
-            __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
-            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-            __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft1, v24");
-            __asm__("vfmul.vv v4, v24, v28");
-          case 1:
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-            __asm__("vfmacc.vf v20, ft0, v24");
-            __asm__("vfmul.vv v0, v24, v28");
-          }
-          first = false;
-        }
-        else {
-          switch (b) {
-          case 4:
-            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft3, v24");
-            __asm__("vfmacc.vv v12, v24, v28");
-          case 3:
-            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft2, v24");
-            __asm__("vfmacc.vv v8, v24, v28");
-          case 2:
-            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-            __asm__("vfmacc.vf v20, ft1, v24");
-            __asm__("vfmacc.vv v4, v24, v28");
-          case 1:
-            __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-            __asm__("vfmacc.vf v20, ft0, v24");
-            __asm__("vfmacc.vv v0, v24, v28");
-          }
-        }
-      } // end a non-unit stride
-
-      if (incz == FLT_SIZE) {
-        __asm__(VLE "v24, (%0)" : : "r"(z_tmp));
-        __asm__("vfmacc.vf v24, ft10, v20");
-        __asm__(VSE "v24, (%0)" : : "r"(z_tmp));
-      } else {
-        __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
-        __asm__("vfmacc.vf v24, ft10, v20");
-        __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
-      }
-
-      __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
-      __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
-      __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
-      avl -= vl;
-    }
-
-    __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
-    __asm__("vmv.s.x v31, x0");
-
-    switch (b) {
-    case 4:
-      __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-      __asm__("vfredusum.vs v12, v12, v31");
-      __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-      if (*beta == 0.) {
-        __asm__("vfmul.vf v12, v12, ft10");
-        __asm__(VSE "v12, (%0)" : : "r"(y));
-      }
-      else {
-        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-        __asm__(FMUL "ft0, ft11, ft0");
-        __asm__("vfmv.s.f v30, ft0");
-        __asm__("vfmacc.vf v30, ft10, v12");
-        __asm__(VSE "v30, (%0)" : : "r"(y));
-      }
-      __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
-    case 3:
-      __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-      __asm__("vfredusum.vs v8, v8, v31");
-      __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-      if (*beta == 0.) {
-        __asm__("vfmul.vf v8, v8, ft10");
-        __asm__(VSE "v8, (%0)" : : "r"(y));
-      }
-      else {
-        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-        __asm__(FMUL "ft0, ft11, ft0");
-        __asm__("vfmv.s.f v30, ft0");
-        __asm__("vfmacc.vf v30, ft10, v8");
-        __asm__(VSE "v30, (%0)" : : "r"(y));
-      }
-      __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
-    case 2:
-      __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-      __asm__("vfredusum.vs v4, v4, v31");
-      __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-      if (*beta == 0.) {
-        __asm__("vfmul.vf v4, v4, ft10");
-        __asm__(VSE "v4, (%0)" : : "r"(y));
-      }
-      else {
-        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-        __asm__(FMUL "ft0, ft11, ft0");
-        __asm__("vfmv.s.f v30, ft0");
-        __asm__("vfmacc.vf v30, ft10, v4");
-        __asm__(VSE "v30, (%0)" : : "r"(y));
-      }
-      __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
-    case 1:
-      __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-      __asm__("vfredusum.vs v0, v0, v31");
-      __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-      if (*beta == 0.) {
-        __asm__("vfmul.vf v0, v0, ft10");
-        __asm__(VSE "v0, (%0)" : : "r"(y));
-      }
-      else {
-        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-        __asm__(FMUL "ft0, ft11, ft0");
-        __asm__("vfmv.s.f v30, ft0");
-        __asm__("vfmacc.vf v30, ft10, v0");
-        __asm__(VSE "v30, (%0)" : : "r"(y));
-      }
-    }
-  } // end cleanup
-  return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef FMUL
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 4
-#define FLT_LOAD "flw "
-#define FMUL "fmul.s "
-#define FMADD "fmadd.s "
-#define FNMSUB "fnmsub.s "
-#define FNEG "fneg.s "
-#define VLSEG2 "vlseg2e32.v "
-#define VLSSEG2 "vlsseg2e32.v "
-#define VSSEG2 "vsseg2e32.v "
-#define VSSSEG2 "vssseg2e32.v "
-#define VSE "vse32.v "
-
-void bli_cdotxaxpyf_sifive_x280_asm
-     (
-             conj_t           conjat,
-             conj_t           conja,
-             conj_t           conjw,
-             conj_t           conjx,
-             dim_t            m,
-             dim_t            b,
-       const void*   restrict alpha_,
-       const void*   restrict a_, inc_t inca, inc_t lda,
-       const void*   restrict w_, inc_t incw,
-       const void*   restrict x_, inc_t incx,
-       const void*   restrict beta_,
-             void*   restrict y_, inc_t incy,
-             void*   restrict z_, inc_t incz,
-       const cntx_t* restrict cntx
-     )
-{
-    (void)cntx;
-    const scomplex *restrict alpha = alpha_;
-    const scomplex *restrict beta = beta_;
-    const scomplex *restrict a = a_;
-    const scomplex *restrict w = w_;
-    const scomplex *restrict x = x_;
-    scomplex *restrict y = y_;
-    scomplex *restrict z = z_;
-    
-    if (b == 0)
-        return;
-    else if (m == 0 || (alpha->real == 0.f && alpha->imag == 0.f)) {
-        // scale y by beta
-        if (beta->real == 0.f && beta->imag == 0.f)
-            bli_csetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
-        else
-            bli_cscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
-        return;
-    }
-
-    // use ft0-ft9 to store 5 entries of x, ft10-ft11 to store alpha,
-    // and fa6-fa7 to store beta
-    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
-    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
-    __asm__(FLT_LOAD "fa6, (%0)" : : "r"(beta));
-    __asm__(FLT_LOAD "fa7, %1(%0)" : : "r"(beta), "I"(FLT_SIZE));
-    // Reduce to case when A^T is not conjugated, then conjugate
-    // computed product A^T * w if needed.
-    conj_t conjatw = BLIS_NO_CONJUGATE;
-    if (conjat == BLIS_CONJUGATE) {
-        bli_toggle_conj(&conjat);
-        bli_toggle_conj(&conjw);
-        bli_toggle_conj(&conjatw);
-    }
-    conj_t conjax = BLIS_NO_CONJUGATE;
-    if (conja == BLIS_CONJUGATE) {
-        bli_toggle_conj(&conja);
-        bli_toggle_conj(&conjx);
-        bli_toggle_conj(&conjax);
-    }
-    inca *= 2 * FLT_SIZE;
-    lda *= 2 * FLT_SIZE;
-    incw *= 2 * FLT_SIZE;
-    incx *= 2 * FLT_SIZE;
-    incy *= 2 * FLT_SIZE;
-    incz *= 2 * FLT_SIZE;
-    // these are used to bump a and y, resp.
-    inc_t a_bump = 5 * lda;
-    inc_t y_bump = incy - FLT_SIZE;
-    while (b >= 5) {
-        // compute dot product of w with 6 rows of a
-        const scomplex* restrict w_tmp = w;
-        const scomplex* restrict z_tmp = z;
-        const scomplex* restrict a_col = a;
-        size_t avl = m;
-        bool first = true;
-        while (avl) {
-            const scomplex* restrict a_row = a_col;
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            if (incw == 2 * FLT_SIZE)
-                __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp));
-            else
-                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
-            if (inca == 2 * FLT_SIZE) {
-                if (conjw == BLIS_NO_CONJUGATE) {
-                    // a unit stride, conjw = no conj
-                    if (first) {
-                        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
-                        vcmul_vv(v0, v2, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                        vcmul_vv(v4, v6, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                        vcmul_vv(v8, v10, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                        vcmul_vv(v12, v14, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
-                        vcmul_vv(v16, v18, v24, v26, v28, v30);
-                        first = false;
-                    }
-                    else {
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
-                        vcmacc_vv(v0, v2, v24, v26, v28, v30);
-
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                        vcmacc_vv(v4, v6, v24, v26, v28, v30);
-
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                        vcmacc_vv(v8, v10, v24, v26, v28, v30);
-
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                        vcmacc_vv(v12, v14, v24, v26, v28, v30);
-
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
-                        vcmacc_vv(v16, v18, v24, v26, v28, v30);
-                    }
-                } // end conjw == BLIS_NO_CONJUGATE
-                else { // conjw == BLIS_CONJUGATE
-                    // a unit stride, conjw = conj
-                    if (first) {
-                        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
-                        vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                        vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                        vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                        vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
-                        vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
-                        first = false;
-                    }
-                    else {
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
-                        vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
-
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                        vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
-
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                        vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
-
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                        vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
-
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
-                        vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
-                    }
-                } // end conjw == BLIS_CONJUGATE
-            } // end a unit stride
-            else { // a non-unit stride
-                if (conjw == BLIS_NO_CONJUGATE) {
-                    // a non-unit stride, conjw = no conj
-                    if (first) {
-                        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
-                        vcmul_vv(v0, v2, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                        vcmul_vv(v4, v6, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                        vcmul_vv(v8, v10, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                        vcmul_vv(v12, v14, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
-                        vcmul_vv(v16, v18, v24, v26, v28, v30);
-                        first = false;
-                    }
-                    else {
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
-                        vcmacc_vv(v0, v2, v24, v26, v28, v30);
-
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                        vcmacc_vv(v4, v6, v24, v26, v28, v30);
-
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                        vcmacc_vv(v8, v10, v24, v26, v28, v30);
-
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                        vcmacc_vv(v12, v14, v24, v26, v28, v30);
-
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
-                        vcmacc_vv(v16, v18, v24, v26, v28, v30);
-                    }
-                } // end conjw == BLIS_NO_CONJUGATE
-                else { // conjw == BLIS_CONJUGATE
-                    // a non-unit stride, conjw = conj
-                    if (first) {
-                        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
-                        vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                        vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                        vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                        vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
-                        vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
-                        first = false;
-                    }
-                    else {
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
-                        vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
-
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                        vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
-
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                        vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
-
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                        vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
-
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
-                        vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
-                    }
-                } // end conjw == BLIS_CONJUGATE
-            } // end a non-unit stride
-
-            if (incz == 2 * FLT_SIZE) {
-                __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp));
-                if (conjax == BLIS_NO_CONJUGATE) {
-                    vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
-                }
-                else {
-                    vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
-                }
-                __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp));
-            }
-            else {
-                __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
-                if (conjax == BLIS_NO_CONJUGATE) {
-                    vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
-                }
-                else {
-                    vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
-                }
-                __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
-            }
-
-            __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
-            __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
-            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
-            avl -= vl;
-        }
-
-        __asm__("vmv.s.x v31, x0");
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v0, v0, v31");
-        __asm__("vfredusum.vs v2, v2, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0.f && beta->imag == 0.f) {
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmul_vf(v28, v29, v0, v2, ft10, ft11);
-          }
-          else {
-            vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11);
-          }
-        }
-        else {
-          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-          __asm__("vfmv.s.f v28, ft0");
-          __asm__("vfmv.s.f v29, ft1");
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmacc_vf(v28, v29, ft10, ft11, v0, v2);
-          }
-          else {
-            vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2);
-          }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v4, v4, v31");
-        __asm__("vfredusum.vs v6, v6, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0.f && beta->imag == 0.f) {
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmul_vf(v28, v29, v4, v6, ft10, ft11);
-          }
-          else {
-            vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11);
-          }
-        }
-        else {
-          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-          __asm__("vfmv.s.f v28, ft0");
-          __asm__("vfmv.s.f v29, ft1");
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmacc_vf(v28, v29, ft10, ft11, v4, v6);
-          }
-          else {
-            vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6);
-          }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v8, v8, v31");
-        __asm__("vfredusum.vs v10, v10, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0.f && beta->imag == 0.f) {
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmul_vf(v28, v29, v8, v10, ft10, ft11);
-          }
-          else {
-            vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11);
-          }
-        }
-        else {
-          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-          __asm__("vfmv.s.f v28, ft0");
-          __asm__("vfmv.s.f v29, ft1");
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmacc_vf(v28, v29, ft10, ft11, v8, v10);
-          }
-          else {
-            vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10);
-          }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v12, v12, v31");
-        __asm__("vfredusum.vs v14, v14, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0.f && beta->imag == 0.f) {
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmul_vf(v28, v29, v12, v14, ft10, ft11);
-          }
-          else {
-            vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11);
-          }
-        }
-        else {
-          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-          __asm__("vfmv.s.f v28, ft0");
-          __asm__("vfmv.s.f v29, ft1");
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmacc_vf(v28, v29, ft10, ft11, v12, v14);
-          }
-          else {
-            vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14);
-          }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v16, v16, v31");
-        __asm__("vfredusum.vs v18, v18, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0.f && beta->imag == 0.f) {
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmul_vf(v28, v29, v16, v18, ft10, ft11);
-          }
-          else {
-            vcmul_vf_conj(v28, v29, v16, v18, ft10, ft11);
-          }
-        }
-        else {
-          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-          __asm__("vfmv.s.f v28, ft0");
-          __asm__("vfmv.s.f v29, ft1");
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmacc_vf(v28, v29, ft10, ft11, v16, v18);
-          }
-          else {
-            vcmacc_vf_conj(v28, v29, ft10, ft11, v16, v18);
-          }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        // a += 5 * lda;
-        __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
-        b -= 5;
-    }
-
-    if (b > 0) {
-        // cleanup loop, 0 < b < 5
-        const scomplex* restrict w_tmp = w;
-        const scomplex* restrict z_tmp = z;
-        const scomplex* restrict a_col;
-        __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx));
-        size_t avl = m;
-        bool first = true;
-        while (avl) {
-            const scomplex* restrict a_row = a_col;
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            if (incw == 2 * FLT_SIZE)
-                __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp));
-            else
-                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
-            __asm__("vmv.v.i v20, 0");
-            __asm__("vmv.v.i v22, 0");
-            if (inca == 2 * FLT_SIZE) {
-                if (conjw == BLIS_NO_CONJUGATE) {
-                    // a unit stride, conjw = no conj
-                    if (first) {
-                        switch (b) {
-                        case 4:
-                            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                            vcmul_vv(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                            vcmul_vv(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                            vcmul_vv(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
-                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
-                            vcmul_vv(v0, v2, v24, v26, v28, v30);
-                        }
-                        first = false;
-                    }
-                    else {
-                        switch (b) {
-                        case 4:
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                            vcmacc_vv(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                            vcmacc_vv(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                            vcmacc_vv(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
-                            vcmacc_vv(v0, v2, v24, v26, v28, v30);
-                        }
-                    }
-                } // end conjw == BLIS_NO_CONJUGATE
-                else { // conjw == BLIS_CONJUGATE
-                    // a unit stride, conjw = conj
-                    if (first) {
-                        switch (b) {
-                        case 4:
-                            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                            vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                            vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                            vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
-                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
-                            vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
-                        }
-                        first = false;
-                    }
-                    else {
-                        switch (b) {
-                        case 4:
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                            vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                            vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                            vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
-                            vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
-                        }
-                    }
-                } // end conjw == BLIS_CONJUGATE
-            } // end a unit stride
-            else { // a non-unit stride
-                if (conjw == BLIS_NO_CONJUGATE) {
-                    // a non-unit stride, conjw = no conj
-                    if (first) {
-                        switch (b) {
-                        case 4:
-                            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                            vcmul_vv(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                            vcmul_vv(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                            vcmul_vv(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
-                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
-                            vcmul_vv(v0, v2, v24, v26, v28, v30);
-                        }
-                        first = false;
-                    }
-                    else {
-                        switch (b) {
-                        case 4:
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                            vcmacc_vv(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                            vcmacc_vv(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                            vcmacc_vv(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
-                            vcmacc_vv(v0, v2, v24, v26, v28, v30);
-                        }
-                    }
-                } // end conjw == BLIS_NO_CONJUGATE
-                else { // conjw == BLIS_CONJUGATE
-                    // a non-unit stride, conjw = conj
-                    if (first) {
-                        switch (b) {
-                        case 4:
-                            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                            vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                            vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                            vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
-                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
-                            vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
-                        }
-                        first = false;
-                    }
-                    else {
-                        switch (b) {
-                        case 4:
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                            vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                            vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                            vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
-                            vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
-                        }
-                    }
-                } // end conjw == BLIS_CONJUGATE
-            } // end a non-unit stride
-
-            if (incz == 2 * FLT_SIZE) {
-                __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp));
-                if (conjax == BLIS_NO_CONJUGATE) {
-                    vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
-                }
-                else {
-                    vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
-                }
-                __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp));
-            }
-            else {
-                __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
-                if (conjax == BLIS_NO_CONJUGATE) {
-                    vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
-                }
-                else {
-                    vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
-                }
-                __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
-            }
-
-            __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
-            __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
-            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
-            avl -= vl;
-        }
-
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
-        y_bump = incy + FLT_SIZE;
-        __asm__("vmv.s.x v31, x0");
-
-        switch (b) {
-        case 4:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v12, v12, v31");
-            __asm__("vfredusum.vs v14, v14, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0.f && beta->imag == 0.f) {
-              if (conjatw == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v12, v14, ft10, ft11);
-              }
-              else {
-                vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11);
-              }
-            }
-            else {
-              __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-              __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-              cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-              __asm__("vfmv.s.f v28, ft0");
-              __asm__("vfmv.s.f v29, ft1");
-              if (conjatw == BLIS_NO_CONJUGATE) {
-                vcmacc_vf(v28, v29, ft10, ft11, v12, v14);
-              }
-              else {
-                vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14);
-              }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
-        case 3:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v8, v8, v31");
-            __asm__("vfredusum.vs v10, v10, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0.f && beta->imag == 0.f) {
-              if (conjatw == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v8, v10, ft10, ft11);
-              }
-              else {
-                vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11);
-              }
-            }
-            else {
-              __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-              __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-              cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-              __asm__("vfmv.s.f v28, ft0");
-              __asm__("vfmv.s.f v29, ft1");
-              if (conjatw == BLIS_NO_CONJUGATE) {
-                vcmacc_vf(v28, v29, ft10, ft11, v8, v10);
-              }
-              else {
-                vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10);
-              }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
-        case 2:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v4, v4, v31");
-            __asm__("vfredusum.vs v6, v6, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0.f && beta->imag == 0.f) {
-              if (conjatw == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v4, v6, ft10, ft11);
-              }
-              else {
-                vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11);
-              }
-            }
-            else {
-              __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-              __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-              cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-              __asm__("vfmv.s.f v28, ft0");
-              __asm__("vfmv.s.f v29, ft1");
-              if (conjatw == BLIS_NO_CONJUGATE) {
-                vcmacc_vf(v28, v29, ft10, ft11, v4, v6);
-              }
-              else {
-                vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6);
-              }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
-        case 1:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v0, v0, v31");
-            __asm__("vfredusum.vs v2, v2, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0.f && beta->imag == 0.f) {
-              if (conjatw == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v0, v2, ft10, ft11);
-              }
-              else {
-                vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11);
-              }
-            }
-            else {
-              __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-              __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-              cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-              __asm__("vfmv.s.f v28, ft0");
-              __asm__("vfmv.s.f v29, ft1");
-              if (conjatw == BLIS_NO_CONJUGATE) {
-                vcmacc_vf(v28, v29, ft10, ft11, v0, v2);
-              }
-              else {
-                vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2);
-              }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-        }
-    }
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef FMUL
-#undef FMADD
-#undef FNMSUB
-#undef FNEG
-#undef VLSEG2
-#undef VLSSEG2
-#undef VSSEG2
-#undef VSSSEG2
-#undef VSE
-
-#define FLT_SIZE 8
-#define FLT_LOAD "fld "
-#define FMUL "fmul.d "
-#define FMADD "fmadd.d "
-#define FNMSUB "fnmsub.d "
-#define FNEG "fneg.d "
-#define VLSEG2 "vlseg2e64.v "
-#define VLSSEG2 "vlsseg2e64.v "
-#define VSSEG2 "vsseg2e64.v "
-#define VSSSEG2 "vssseg2e64.v "
-#define VSE "vse64.v "
-
-void bli_zdotxaxpyf_sifive_x280_asm
-     (
-             conj_t           conjat,
-             conj_t           conja,
-             conj_t           conjw,
-             conj_t           conjx,
-             dim_t            m,
-             dim_t            b,
-       const void*   restrict alpha_,
-       const void*   restrict a_, inc_t inca, inc_t lda,
-       const void*   restrict w_, inc_t incw,
-       const void*   restrict x_, inc_t incx,
-       const void*   restrict beta_,
-             void*   restrict y_, inc_t incy,
-             void*   restrict z_, inc_t incz,
-       const cntx_t* restrict cntx
-     )
-{
-    (void)cntx;
-    const dcomplex *restrict alpha = alpha_;
-    const dcomplex *restrict beta = beta_;
-    const dcomplex *restrict a = a_;
-    const dcomplex *restrict w = w_;
-    const dcomplex *restrict x = x_;
-    dcomplex *restrict y = y_;
-    dcomplex *restrict z = z_;
-
-    if (b == 0)
-        return;
-    else if (m == 0 || (alpha->real == 0. && alpha->imag == 0.)) {
-        // scale y by beta
-        if (beta->real == 0. && beta->imag == 0.)
-            bli_zsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
-        else
-            bli_zscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
-        return;
-    }
-
-    // use ft0-ft9 to store 5 entries of x, ft10-ft11 to store alpha,
-    // and fa6-fa7 to store beta
-    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
-    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
-    __asm__(FLT_LOAD "fa6, (%0)" : : "r"(beta));
-    __asm__(FLT_LOAD "fa7, %1(%0)" : : "r"(beta), "I"(FLT_SIZE));
-    // Reduce to case when A^T is not conjugated, then conjugate
-    // computed product A^T * w if needed.
-    conj_t conjatw = BLIS_NO_CONJUGATE;
-    if (conjat == BLIS_CONJUGATE) {
-        bli_toggle_conj(&conjat);
-        bli_toggle_conj(&conjw);
-        bli_toggle_conj(&conjatw);
-    }
-    conj_t conjax = BLIS_NO_CONJUGATE;
-    if (conja == BLIS_CONJUGATE) {
-        bli_toggle_conj(&conja);
-        bli_toggle_conj(&conjx);
-        bli_toggle_conj(&conjax);
-    }
-    inca *= 2 * FLT_SIZE;
-    lda *= 2 * FLT_SIZE;
-    incw *= 2 * FLT_SIZE;
-    incx *= 2 * FLT_SIZE;
-    incy *= 2 * FLT_SIZE;
-    incz *= 2 * FLT_SIZE;
-    // these are used to bump a and y, resp.
-    inc_t a_bump = 5 * lda;
-    inc_t y_bump = incy - FLT_SIZE;
-    while (b >= 5) {
-        // compute dot product of w with 6 rows of a
-        const dcomplex* restrict w_tmp = w;
-        const dcomplex* restrict z_tmp = z;
-        const dcomplex* restrict a_col = a;
-        size_t avl = m;
-        bool first = true;
-        while (avl) {
-            const dcomplex* restrict a_row = a_col;
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            if (incw == 2 * FLT_SIZE)
-                __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp));
-            else
-                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
-            if (inca == 2 * FLT_SIZE) {
-                if (conjw == BLIS_NO_CONJUGATE) {
-                    // a unit stride, conjw = no conj
-                    if (first) {
-                        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
-                        vcmul_vv(v0, v2, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                        vcmul_vv(v4, v6, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                        vcmul_vv(v8, v10, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                        vcmul_vv(v12, v14, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
-                        vcmul_vv(v16, v18, v24, v26, v28, v30);
-                        first = false;
-                    }
-                    else {
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
-                        vcmacc_vv(v0, v2, v24, v26, v28, v30);
-
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                        vcmacc_vv(v4, v6, v24, v26, v28, v30);
-
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                        vcmacc_vv(v8, v10, v24, v26, v28, v30);
-
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                        vcmacc_vv(v12, v14, v24, v26, v28, v30);
-
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
-                        vcmacc_vv(v16, v18, v24, v26, v28, v30);
-                    }
-                } // end conjw == BLIS_NO_CONJUGATE
-                else { // conjw == BLIS_CONJUGATE
-                    // a unit stride, conjw = conj
-                    if (first) {
-                        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
-                        vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                        vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                        vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                        vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
-                        vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
-                        first = false;
-                    }
-                    else {
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
-                        vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
-
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                        vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
-
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                        vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
-
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                        vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
-
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
-                        vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
-                    }
-                } // end conjw == BLIS_CONJUGATE
-            } // end a unit stride
-            else { // a non-unit stride
-                if (conjw == BLIS_NO_CONJUGATE) {
-                    // a non-unit stride, conjw = no conj
-                    if (first) {
-                        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
-                        vcmul_vv(v0, v2, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                        vcmul_vv(v4, v6, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                        vcmul_vv(v8, v10, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                        vcmul_vv(v12, v14, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
-                        vcmul_vv(v16, v18, v24, v26, v28, v30);
-                        first = false;
-                    }
-                    else {
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
-                        vcmacc_vv(v0, v2, v24, v26, v28, v30);
-
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                        vcmacc_vv(v4, v6, v24, v26, v28, v30);
-
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                        vcmacc_vv(v8, v10, v24, v26, v28, v30);
-
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                        vcmacc_vv(v12, v14, v24, v26, v28, v30);
-
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
-                        vcmacc_vv(v16, v18, v24, v26, v28, v30);
-                    }
-                } // end conjw == BLIS_NO_CONJUGATE
-                else { // conjw == BLIS_CONJUGATE
-                    // a non-unit stride, conjw = conj
-                    if (first) {
-                        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
-                        vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                        vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                        vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                        vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
-
-                        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                        __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
-                        __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
-                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
-                        vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
-                        first = false;
-                    }
-                    else {
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vf(v20, v22, v24, v26, ft0, ft1);
-                        vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
-
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                        vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
-
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                        vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
-
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                        vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
-
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
-                        vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
-                    }
-                } // end conjw == BLIS_CONJUGATE
-            } // end a non-unit stride
-
-            if (incz == 2 * FLT_SIZE) {
-                __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp));
-                if (conjax == BLIS_NO_CONJUGATE) {
-                    vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
-                }
-                else {
-                    vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
-                }
-                __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp));
-            }
-            else {
-                __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
-                if (conjax == BLIS_NO_CONJUGATE) {
-                    vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
-                }
-                else {
-                    vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
-                }
-                __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
-            }
-
-            __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
-            __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
-            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
-            avl -= vl;
-        }
-
-        __asm__("vmv.s.x v31, x0");
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v0, v0, v31");
-        __asm__("vfredusum.vs v2, v2, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0. && beta->imag == 0.) {
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmul_vf(v28, v29, v0, v2, ft10, ft11);
-          }
-          else {
-            vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11);
-          }
-        }
-        else {
-          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-          __asm__("vfmv.s.f v28, ft0");
-          __asm__("vfmv.s.f v29, ft1");
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmacc_vf(v28, v29, ft10, ft11, v0, v2);
-          }
-          else {
-            vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2);
-          }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v4, v4, v31");
-        __asm__("vfredusum.vs v6, v6, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0. && beta->imag == 0.) {
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmul_vf(v28, v29, v4, v6, ft10, ft11);
-          }
-          else {
-            vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11);
-          }
-        }
-        else {
-          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-          __asm__("vfmv.s.f v28, ft0");
-          __asm__("vfmv.s.f v29, ft1");
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmacc_vf(v28, v29, ft10, ft11, v4, v6);
-          }
-          else {
-            vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6);
-          }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v8, v8, v31");
-        __asm__("vfredusum.vs v10, v10, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0. && beta->imag == 0.) {
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmul_vf(v28, v29, v8, v10, ft10, ft11);
-          }
-          else {
-            vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11);
-          }
-        }
-        else {
-          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-          __asm__("vfmv.s.f v28, ft0");
-          __asm__("vfmv.s.f v29, ft1");
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmacc_vf(v28, v29, ft10, ft11, v8, v10);
-          }
-          else {
-            vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10);
-          }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v12, v12, v31");
-        __asm__("vfredusum.vs v14, v14, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0. && beta->imag == 0.) {
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmul_vf(v28, v29, v12, v14, ft10, ft11);
-          }
-          else {
-            vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11);
-          }
-        }
-        else {
-          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-          __asm__("vfmv.s.f v28, ft0");
-          __asm__("vfmv.s.f v29, ft1");
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmacc_vf(v28, v29, ft10, ft11, v12, v14);
-          }
-          else {
-            vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14);
-          }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v16, v16, v31");
-        __asm__("vfredusum.vs v18, v18, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0. && beta->imag == 0.) {
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmul_vf(v28, v29, v16, v18, ft10, ft11);
-          }
-          else {
-            vcmul_vf_conj(v28, v29, v16, v18, ft10, ft11);
-          }
-        }
-        else {
-          __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-          __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-          cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-          __asm__("vfmv.s.f v28, ft0");
-          __asm__("vfmv.s.f v29, ft1");
-          if (conjatw == BLIS_NO_CONJUGATE) {
-            vcmacc_vf(v28, v29, ft10, ft11, v16, v18);
-          }
-          else {
-            vcmacc_vf_conj(v28, v29, ft10, ft11, v16, v18);
-          }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        // a += 5 * lda;
-        __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
-        b -= 5;
-    }
-
-    if (b > 0) {
-        // cleanup loop, 0 < b < 5
-        const dcomplex* restrict w_tmp = w;
-        const dcomplex* restrict z_tmp = z;
-        const dcomplex* restrict a_col;
-        __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx));
-        size_t avl = m;
-        bool first = true;
-        while (avl) {
-            const dcomplex* restrict a_row = a_col;
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            if (incw == 2 * FLT_SIZE)
-                __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp));
-            else
-                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
-            __asm__("vmv.v.i v20, 0");
-            __asm__("vmv.v.i v22, 0");
-            if (inca == 2 * FLT_SIZE) {
-                if (conjw == BLIS_NO_CONJUGATE) {
-                    // a unit stride, conjw = no conj
-                    if (first) {
-                        switch (b) {
-                        case 4:
-                            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                            vcmul_vv(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                            vcmul_vv(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                            vcmul_vv(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
-                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
-                            vcmul_vv(v0, v2, v24, v26, v28, v30);
-                        }
-                        first = false;
-                    }
-                    else {
-                        switch (b) {
-                        case 4:
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                            vcmacc_vv(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                            vcmacc_vv(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                            vcmacc_vv(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
-                            vcmacc_vv(v0, v2, v24, v26, v28, v30);
-                        }
-                    }
-                } // end conjw == BLIS_NO_CONJUGATE
-                else { // conjw == BLIS_CONJUGATE
-                    // a unit stride, conjw = conj
-                    if (first) {
-                        switch (b) {
-                        case 4:
-                            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                            vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                            vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                            vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
-                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
-                            vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
-                        }
-                        first = false;
-                    }
-                    else {
-                        switch (b) {
-                        case 4:
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                            vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                            vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                            vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                            __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
-                            vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
-                        }
-                    }
-                } // end conjw == BLIS_CONJUGATE
-            } // end a unit stride
-            else { // a non-unit stride
-                if (conjw == BLIS_NO_CONJUGATE) {
-                    // a non-unit stride, conjw = no conj
-                    if (first) {
-                        switch (b) {
-                        case 4:
-                            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                            vcmul_vv(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                            vcmul_vv(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                            vcmul_vv(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
-                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
-                            vcmul_vv(v0, v2, v24, v26, v28, v30);
-                        }
-                        first = false;
-                    }
-                    else {
-                        switch (b) {
-                        case 4:
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                            vcmacc_vv(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                            vcmacc_vv(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                            vcmacc_vv(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
-                            vcmacc_vv(v0, v2, v24, v26, v28, v30);
-                        }
-                    }
-                } // end conjw == BLIS_NO_CONJUGATE
-                else { // conjw == BLIS_CONJUGATE
-                    // a non-unit stride, conjw = conj
-                    if (first) {
-                        switch (b) {
-                        case 4:
-                            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                            vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                            vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
-                            __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                            vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
-                            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
-                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
-                            vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
-                        }
-                        first = false;
-                    }
-                    else {
-                        switch (b) {
-                        case 4:
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
-                            vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
-                            vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                            vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
-                            vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                            vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
-                            vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
-                        }
-                    }
-                } // end conjw == BLIS_CONJUGATE
-            } // end a non-unit stride
-
-            if (incz == 2 * FLT_SIZE) {
-                __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp));
-                if (conjax == BLIS_NO_CONJUGATE) {
-                    vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
-                }
-                else {
-                    vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
-                }
-                __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp));
-            }
-            else {
-                __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
-                if (conjax == BLIS_NO_CONJUGATE) {
-                    vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
-                }
-                else {
-                    vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
-                }
-                __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
-            }
-
-            __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
-            __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
-            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
-            avl -= vl;
-        }
-
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
-        y_bump = incy + FLT_SIZE;
-        __asm__("vmv.s.x v31, x0");
-
-        switch (b) {
-        case 4:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v12, v12, v31");
-            __asm__("vfredusum.vs v14, v14, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0. && beta->imag == 0.) {
-              if (conjatw == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v12, v14, ft10, ft11);
-              }
-              else {
-                vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11);
-              }
-            }
-            else {
-              __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-              __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-              cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-              __asm__("vfmv.s.f v28, ft0");
-              __asm__("vfmv.s.f v29, ft1");
-              if (conjatw == BLIS_NO_CONJUGATE) {
-                vcmacc_vf(v28, v29, ft10, ft11, v12, v14);
-              }
-              else {
-                vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14);
-              }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
-        case 3:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v8, v8, v31");
-            __asm__("vfredusum.vs v10, v10, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0. && beta->imag == 0.) {
-              if (conjatw == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v8, v10, ft10, ft11);
-              }
-              else {
-                vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11);
-              }
-            }
-            else {
-              __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-              __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-              cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-              __asm__("vfmv.s.f v28, ft0");
-              __asm__("vfmv.s.f v29, ft1");
-              if (conjatw == BLIS_NO_CONJUGATE) {
-                vcmacc_vf(v28, v29, ft10, ft11, v8, v10);
-              }
-              else {
-                vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10);
-              }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
-        case 2:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v4, v4, v31");
-            __asm__("vfredusum.vs v6, v6, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0. && beta->imag == 0.) {
-              if (conjatw == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v4, v6, ft10, ft11);
-              }
-              else {
-                vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11);
-              }
-            }
-            else {
-              __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-              __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-              cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-              __asm__("vfmv.s.f v28, ft0");
-              __asm__("vfmv.s.f v29, ft1");
-              if (conjatw == BLIS_NO_CONJUGATE) {
-                vcmacc_vf(v28, v29, ft10, ft11, v4, v6);
-              }
-              else {
-                vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6);
-              }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
-        case 1:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v0, v0, v31");
-            __asm__("vfredusum.vs v2, v2, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0. && beta->imag == 0.) {
-              if (conjatw == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v0, v2, ft10, ft11);
-              }
-              else {
-                vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11);
-              }
-            }
-            else {
-              __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-              __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-              cmul(ft0, ft1, fa6, fa7, ft2, ft3);
-              __asm__("vfmv.s.f v28, ft0");
-              __asm__("vfmv.s.f v29, ft1");
-              if (conjatw == BLIS_NO_CONJUGATE) {
-                vcmacc_vf(v28, v29, ft10, ft11, v0, v2);
-              }
-              else {
-                vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2);
-              }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-        }
-    }
-    return;
-}
diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c
new file mode 100644
index 000000000..dc1bca9f6
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c
@@ -0,0 +1,137 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include "../../riscv_cmul_macros_intr.h"
+#include "../../riscv_overloaded_intrinsics.h"
+#include "blis.h"
+#include <stdint.h>
+#include <riscv_vector.h>
+
+#define DOTXAXPYF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxaxpyf_sifive_x280_intr(\
+          conj_t           conjat,                    \
+          conj_t           conja,                     \
+          conj_t           conjw,                     \
+          conj_t           conjx,                     \
+          dim_t            m,                         \
+          dim_t            b,                         \
+    const T*      restrict alpha_,                    \
+    const T*      restrict a_, inc_t inca, inc_t lda, \
+    const T*      restrict w_, inc_t incw,            \
+    const T*      restrict x_, inc_t incx,            \
+    const T*      restrict beta_,                     \
+          T*      restrict y_, inc_t incy,            \
+          T*      restrict z_, inc_t incz,            \
+    const cntx_t* restrict cntx                       \
+)
+// DOTXAXPYF_ expands to the bli_<ch>dotxaxpyf_sifive_x280_intr function header; DOTXAXPYF forwards to it so macro arguments are expanded before ## pasting.
+#define DOTXAXPYF(...)  DOTXAXPYF_(__VA_ARGS__)
+// SETV/SCALV expand to the names bli_<ch>setv_sifive_x280_intr / bli_<ch>scalv_sifive_x280_intr, using the same two-level expansion.
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr
+#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
+#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr
+#define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_dotxaxpyf_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_dotxaxpyf_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m2
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_dotxaxpyf_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m2
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_dotxaxpyf_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef SETV_
+#undef SETV
+#undef SCALV_
+#undef SCALV
+
+#undef DOTXAXPYF
+#undef DOTXAXPYF_
diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..d8a984064
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c
@@ -0,0 +1,427 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTXAXPYF
+// Contiguous (inca == 1) load of vl complex elements of column i, starting at a_tmp + i * lda; segment fields 0 and 1 go to acol_vec_r and acol_vec_i.
+#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL(i)                                      \
+    do {                                                                        \
+        acol_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), vl); \
+        acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0);                      \
+        acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1);                      \
+    } while (0)
+// Strided variant of the column load: same outputs, but elements are 2 * FLT_SIZE * inca apart (presumably a byte stride; one complex element is two BASE_DT values).
+#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED(i)                                                    \
+    do {                                                                                              \
+        acol_vec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), 2 * FLT_SIZE * inca, vl); \
+        acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0);                                            \
+        acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1);                                            \
+    } while (0)
+// First strip of a 4-column panel: yacc<i> is overwritten with column i times w (VCMUL_VV); zacc is started by VCMUL_VF on column 0 and extended by VCMACC_VF with x[i * incx] for columns 1-3.
+#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF)                                           \
+    do {                                                                                                                    \
+        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                                                                       \
+        VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc0_r, yacc0_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl);                    \
+        VCMUL_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, acol_vec_r, acol_vec_i, x[0 * incx].real, x[0 * incx].imag, vl);  \
+        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                                                                       \
+        VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc1_r, yacc1_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl);                    \
+        VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \
+        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                                                                       \
+        VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc2_r, yacc2_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl);                    \
+        VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \
+        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                                                                       \
+        VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc3_r, yacc3_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl);                    \
+        VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[3 * incx].real, x[3 * incx].imag, acol_vec_r, acol_vec_i, vl); \
+    } while (0)
+// Later strips of a 4-column panel: yacc<i> is accumulated with the _TU form of VCMACC_VV instead of being overwritten; zacc is still rebuilt from scratch (VCMUL_VF on column 0) for every strip.
+#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF)                                                 \
+    do {                                                                                                                    \
+        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                                                                       \
+        VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc0_r, yacc0_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl);              \
+        VCMUL_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, acol_vec_r, acol_vec_i, x[0 * incx].real, x[0 * incx].imag, vl);  \
+        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                                                                       \
+        VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc1_r, yacc1_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl);              \
+        VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \
+        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                                                                       \
+        VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc2_r, yacc2_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl);              \
+        VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \
+        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                                                                       \
+        VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc3_r, yacc3_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl);              \
+        VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[3 * incx].real, x[3 * incx].imag, acol_vec_r, acol_vec_i, vl); \
+    } while (0)
+// First strip for the 1-3 leftover columns: the switch falls through so case b handles columns b-1 down to 0; every column uses VCMACC_VF, so zacc must already hold zero.
+#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF)                                            \
+    do {                                                                                                                        \
+        switch (b) {                                                                                                            \
+        case 3:                                                                                                                 \
+            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                                                                       \
+            VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc2_r, yacc2_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl);                    \
+            VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \
+        case 2:                                                                                                                 \
+            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                                                                       \
+            VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc1_r, yacc1_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl);                    \
+            VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \
+        case 1:                                                                                                                 \
+            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                                                                       \
+            VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc0_r, yacc0_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl);                    \
+            VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[0 * incx].real, x[0 * incx].imag, acol_vec_r, acol_vec_i, vl); \
+        }                                                                                                                       \
+    } while (0)
+// Later strips for the 1-3 leftover columns: same fall-through switch on b, but yacc<i> is accumulated with the _TU form of VCMACC_VV; zacc must again start at zero.
+#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF)                                                  \
+    do {                                                                                                                        \
+        switch (b) {                                                                                                            \
+        case 3:                                                                                                                 \
+            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                                                                       \
+            VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc2_r, yacc2_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl);              \
+            VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \
+        case 2:                                                                                                                 \
+            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                                                                       \
+            VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc1_r, yacc1_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl);              \
+            VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \
+        case 1:                                                                                                                 \
+            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                                                                       \
+            VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc0_r, yacc0_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl);              \
+            VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[0 * incx].real, x[0 * incx].imag, acol_vec_r, acol_vec_i, vl); \
+        }                                                                                                                       \
+    } while (0)
+// Sum-reduces yacc<i>_r/_i (vl argument m) into scalar dot<i>, then writes y[i * incy]: alpha * dot when beta == 0 (y is not read), else beta * y[i * incy] + alpha * dot; conjatw selects the _CONJ forms.
+#define DOTXAXPYF_SIFIVE_X280_REDUCE(i)                                                                            \
+    do {                                                                                                           \
+        RVV_TYPE_F(PREC, m1) dot##i##_r = VFMV_S_F(PREC, m1)(0., 1);                                               \
+        RVV_TYPE_F(PREC, m1) dot##i##_i = VFMV_S_F(PREC, m1)(0., 1);                                               \
+        dot##i##_r = VF_REDUSUM_VS(PREC, LMUL)(yacc##i##_r, dot##i##_r, m);                                        \
+        dot##i##_i = VF_REDUSUM_VS(PREC, LMUL)(yacc##i##_i, dot##i##_i, m);                                        \
+        RVV_TYPE_F(PREC, m1) y##i##_r, y##i##_i;                                                                   \
+        if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) {                                                                \
+            if (bli_is_conj(conjatw))                                                                              \
+                VCMUL_VF_CONJ(PREC, m1, y##i##_r, y##i##_i, dot##i##_r, dot##i##_i, alpha->real, alpha->imag, 1);  \
+            else                                                                                                   \
+                VCMUL_VF(PREC, m1, y##i##_r, y##i##_i, dot##i##_r, dot##i##_i, alpha->real, alpha->imag, 1);       \
+            y[i * incy].real = VFMV_F_S(PREC)(y##i##_r);                                                           \
+            y[i * incy].imag = VFMV_F_S(PREC)(y##i##_i);                                                           \
+        }                                                                                                          \
+        else {                                                                                                     \
+            PASTEMAC(PRECISION_CHAR, scals)(*beta, y[i * incy])                                                    \
+            y##i##_r = VFMV_S_F(PREC, m1)(y[i * incy].real, 1);                                                    \
+            y##i##_i = VFMV_S_F(PREC, m1)(y[i * incy].imag, 1);                                                    \
+            if (bli_is_conj(conjatw))                                                                              \
+                VCMACC_VF_CONJ(PREC, m1, y##i##_r, y##i##_i, alpha->real, alpha->imag, dot##i##_r, dot##i##_i, 1); \
+            else                                                                                                   \
+                VCMACC_VF(PREC, m1, y##i##_r, y##i##_i, alpha->real, alpha->imag, dot##i##_r, dot##i##_i, 1);      \
+            y[i * incy].real = VFMV_F_S(PREC)(y##i##_r);                                                           \
+            y[i * incy].imag = VFMV_F_S(PREC)(y##i##_i);                                                           \
+        }                                                                                                          \
+    } while (0)
+
+DOTXAXPYF(PRECISION_CHAR, void)
+{
+    // Computes y := beta * y + alpha * conjat(A^T) * conjw(w) and z := z + alpha * conja(A) * conjx(x).
+    // A has m rows (stride inca) and b columns (stride lda); w and z have m elements, x and y have b.
+    (void) cntx; // Suppress unused parameter warnings
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict a = a_;
+    const DATATYPE* restrict w = w_;
+    const DATATYPE* restrict x = x_;
+    const DATATYPE* restrict beta = beta_;
+    DATATYPE* restrict y = y_;
+    DATATYPE* restrict z = z_;
+    // Early exits: nothing to do for b == 0; for m == 0 or alpha == 0 only y is touched, via SETV (beta == 0) or SCALV.
+    if (b == 0) return;
+    if (m == 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) {
+        if (PASTEMAC(PRECISION_CHAR, eq0)(*beta))
+            SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        else
+            SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        return;
+    }
+    // Fold conjw into conjat (and conjx into conja) by toggling both; conjatw / conjax remember the toggle and pick the _CONJ forms when the results are applied to y and z.
+    conj_t conjatw = BLIS_NO_CONJUGATE;
+    conj_t conjax = BLIS_NO_CONJUGATE;
+    if (bli_is_conj(conjw)) {
+        bli_toggle_conj(&conjat);
+        bli_toggle_conj(&conjw);
+        bli_toggle_conj(&conjatw);
+    }
+    if (bli_is_conj(conjx)) {
+        bli_toggle_conj(&conja);
+        bli_toggle_conj(&conjx);
+        bli_toggle_conj(&conjax);
+    }
+
+    while (b >= 4) {
+        // Process 4 columns of a: accumulate their products with w (reduced into y below) and add their x-weighted sum into z.
+        const DATATYPE* restrict a_tmp = a;
+        const DATATYPE* restrict w_tmp = w;
+        DATATYPE* restrict z_tmp = z;
+        RVV_TYPE_F(PREC, LMUL) yacc0_r, yacc0_i, yacc1_r, yacc1_i,
+                               yacc2_r, yacc2_i, yacc3_r, yacc3_i;
+        bool first = true; // the first strip overwrites the yacc accumulators, later strips add to them
+        size_t avl = m;
+        while (avl) {
+            size_t vl = VSETVL(PREC, LMUL)(avl); // number of rows handled in this strip
+            RVV_TYPE_FX(PREC, LMUL, 2) wvec, acol_vec;
+            RVV_TYPE_F(PREC, LMUL) wvec_r, wvec_i, acol_vec_r, acol_vec_i;
+            RVV_TYPE_F(PREC, LMUL) zacc_r, zacc_i;
+            if (incw == 1)
+                wvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) w_tmp, vl);
+            else
+                wvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) w_tmp, 2 * FLT_SIZE * incw, vl);
+            wvec_r = VGET_V_F(PREC, LMUL, 2)(wvec, 0);
+            wvec_i = VGET_V_F(PREC, LMUL, 2)(wvec, 1);
+            // Dispatch on conjat (VV suffix), conja (VF suffix) and inca (contiguous vs. strided column load).
+            if (first) {
+                if (bli_is_conj(conjat)) {
+                    if (bli_is_conj(conja)) {
+                        if (inca == 1)
+                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , _CONJ, _CONJ);
+                        else
+                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, _CONJ, _CONJ);
+                    }
+                    else {
+                        if (inca == 1)
+                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , _CONJ, );
+                        else
+                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, _CONJ, );
+                    }
+                }
+                else {
+                    if (bli_is_conj(conja)) {
+                        if (inca == 1)
+                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , , _CONJ);
+                        else
+                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, , _CONJ);
+                    }
+                    else {
+                        if (inca == 1)
+                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , , );
+                        else
+                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, , );
+                    }
+                }
+                first = false;
+            }
+            else {
+                if (bli_is_conj(conjat)) {
+                    if (bli_is_conj(conja)) {
+                        if (inca == 1)
+                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , _CONJ, _CONJ);
+                        else
+                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, _CONJ, _CONJ);
+                    }
+                    else {
+                        if (inca == 1)
+                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , _CONJ, );
+                        else
+                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, _CONJ, );
+                    }
+                }
+                else {
+                    if (bli_is_conj(conja)) {
+                        if (inca == 1)
+                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , , _CONJ);
+                        else
+                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, , _CONJ);
+                    }
+                    else {
+                        if (inca == 1)
+                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , , );
+                        else
+                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, , );
+                    }
+                }
+            }
+              
+            RVV_TYPE_FX(PREC, LMUL, 2) zvec; // z strip: load, add alpha times zacc (_CONJ form when conjax is set), store back
+            if (incz == 1)
+                zvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, vl);
+            else
+                zvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, 2 * FLT_SIZE * incz, vl);
+            RVV_TYPE_F(PREC, LMUL) zvec_r = VGET_V_F(PREC, LMUL, 2)(zvec, 0);
+            RVV_TYPE_F(PREC, LMUL) zvec_i = VGET_V_F(PREC, LMUL, 2)(zvec, 1);
+            if (bli_is_conj(conjax))
+                VCMACC_VF_CONJ(PREC, LMUL, zvec_r, zvec_i, alpha->real, alpha->imag, zacc_r, zacc_i, vl);
+            else
+                VCMACC_VF(PREC, LMUL, zvec_r, zvec_i, alpha->real, alpha->imag, zacc_r, zacc_i, vl);
+            zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 0, zvec_r);
+            zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 1, zvec_i);
+            if (incz == 1)
+                VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, zvec, vl);
+            else
+                VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, 2 * FLT_SIZE * incz, zvec, vl);
+
+            a_tmp += vl * inca;
+            w_tmp += vl * incw;
+            z_tmp += vl * incz;
+            avl -= vl;
+        }
+        // Fold the four accumulators into y[0 * incy] .. y[3 * incy], applying alpha, beta and conjatw.
+        DOTXAXPYF_SIFIVE_X280_REDUCE(0);
+        DOTXAXPYF_SIFIVE_X280_REDUCE(1);
+        DOTXAXPYF_SIFIVE_X280_REDUCE(2);
+        DOTXAXPYF_SIFIVE_X280_REDUCE(3);
+        // Next panel: a, x and y advance by 4 columns; w and z are walked again from their start.
+        a += 4 * lda;
+        x += 4 * incx;
+        y += 4 * incy;
+        b -= 4;
+    }
+    // Remaining 1-3 columns: same scheme, but the bodies switch on b and zacc is zero-initialised for every strip.
+    if (b > 0) {
+        const DATATYPE* restrict a_tmp = a;
+        const DATATYPE* restrict w_tmp = w;
+        DATATYPE* restrict z_tmp = z;
+        RVV_TYPE_F(PREC, LMUL) yacc0_r, yacc0_i, yacc1_r, yacc1_i, yacc2_r, yacc2_i;
+        bool first = true;
+        size_t avl = m;
+        while (avl) {
+            size_t vl = VSETVL(PREC, LMUL)(avl);
+            RVV_TYPE_FX(PREC, LMUL, 2) wvec, acol_vec;
+            RVV_TYPE_F(PREC, LMUL) wvec_r, wvec_i, acol_vec_r, acol_vec_i;
+            RVV_TYPE_F(PREC, LMUL) zacc_r = VFMV_V_F(PREC, LMUL)(0, vl);
+            RVV_TYPE_F(PREC, LMUL) zacc_i = VFMV_V_F(PREC, LMUL)(0, vl);
+            if (incw == 1)
+                wvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) w_tmp, vl);
+            else
+                wvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) w_tmp, 2 * FLT_SIZE * incw, vl);
+            wvec_r = VGET_V_F(PREC, LMUL, 2)(wvec, 0);
+            wvec_i = VGET_V_F(PREC, LMUL, 2)(wvec, 1);
+
+            if (first) {
+                if (bli_is_conj(conjat)) {
+                    if (bli_is_conj(conja)) {
+                        if (inca == 1)
+                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , _CONJ, _CONJ);
+                        else
+                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, _CONJ, _CONJ);
+                    }
+                    else {
+                        if (inca == 1)
+                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , _CONJ, );
+                        else
+                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, _CONJ, );
+                    }
+                }
+                else {
+                    if (bli_is_conj(conja)) {
+                        if (inca == 1)
+                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , , _CONJ);
+                        else
+                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, , _CONJ);
+                    }
+                    else {
+                        if (inca == 1)
+                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , , );
+                        else
+                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, , );
+                    }
+                }
+                first = false;
+            }
+            else {
+                if (bli_is_conj(conjat)) {
+                    if (bli_is_conj(conja)) {
+                        if (inca == 1)
+                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , _CONJ, _CONJ);
+                        else
+                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, _CONJ, _CONJ);
+                    }
+                    else {
+                        if (inca == 1)
+                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , _CONJ, );
+                        else
+                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, _CONJ, );
+                    }
+                }
+                else {
+                    if (bli_is_conj(conja)) {
+                        if (inca == 1)
+                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , , _CONJ);
+                        else
+                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, , _CONJ);
+                    }
+                    else {
+                        if (inca == 1)
+                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , , );
+                        else
+                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, , );
+                    }
+                }
+            }
+              
+            RVV_TYPE_FX(PREC, LMUL, 2) zvec;
+            if (incz == 1)
+                zvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, vl);
+            else
+                zvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, 2 * FLT_SIZE * incz, vl);
+            RVV_TYPE_F(PREC, LMUL) zvec_r = VGET_V_F(PREC, LMUL, 2)(zvec, 0);
+            RVV_TYPE_F(PREC, LMUL) zvec_i = VGET_V_F(PREC, LMUL, 2)(zvec, 1);
+            if (bli_is_conj(conjax))
+                VCMACC_VF_CONJ(PREC, LMUL, zvec_r, zvec_i, alpha->real, alpha->imag, zacc_r, zacc_i, vl);
+            else
+                VCMACC_VF(PREC, LMUL, zvec_r, zvec_i, alpha->real, alpha->imag, zacc_r, zacc_i, vl);
+            zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 0, zvec_r);
+            zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 1, zvec_i);
+            if (incz == 1)
+                VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, zvec, vl);
+            else
+                VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, 2 * FLT_SIZE * incz, zvec, vl);
+
+            a_tmp += vl * inca;
+            w_tmp += vl * incw;
+            z_tmp += vl * incz;
+            avl -= vl;
+        }
+        // Deliberate fall-through: case b reduces accumulators b-1 down to 0 into y.
+        switch (b) {
+        case 3:
+            DOTXAXPYF_SIFIVE_X280_REDUCE(2);
+        case 2:
+            DOTXAXPYF_SIFIVE_X280_REDUCE(1);
+        case 1:
+            DOTXAXPYF_SIFIVE_X280_REDUCE(0);
+        }
+    }
+    return;
+}
+// Remove the template-local helper macros so that nothing leaks into the file that includes this template.
+#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL
+#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED
+#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST
+#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY
+#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST
+#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY
+#undef DOTXAXPYF_SIFIVE_X280_REDUCE
+
+#endif // DOTXAXPYF
diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c
new file mode 100644
index 000000000..57ef4f744
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c
@@ -0,0 +1,283 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTXAXPYF
+
+#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL(i)                   \
+    do { /* Contiguous load (used when inca == 1): acol_vec := vl elements starting at a_tmp + i * lda. */ \
+        acol_vec = VLE_V_F(PREC, LMUL)(a_tmp + i * lda, vl); \
+    } while (0)
+
+#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED(i)                             \
+    do { /* Strided load (used when inca != 1): acol_vec := vl elements starting at a_tmp + i * lda, stride FLT_SIZE * inca. */ \
+        acol_vec = VLSE_V_F(PREC, LMUL)(a_tmp + i * lda, FLT_SIZE * inca, vl); \
+    } while (0)
+
+#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF)                \
+    do { /* First strip of a 4-column pass: yaccK := a(:,K) * wvec (elementwise); zacc := sum over K of a(:,K) * x[K * incx]. */ \
+        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                  \
+        yacc0 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl);              \
+        zacc = VFMUL_VF(PREC, LMUL)(acol_vec, x[0 * incx], vl);        \
+        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                  \
+        yacc1 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl);              \
+        zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \
+        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                  \
+        yacc2 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl);              \
+        zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \
+        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                  \
+        yacc3 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl);              \
+        zacc = VFMACC_VF(PREC, LMUL)(zacc, x[3 * incx], acol_vec, vl); \
+    } while (0)
+
+#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY(LOAD_SUF)                      \
+    do { /* Later strips: yaccK += a(:,K) * wvec via the _TU form (presumably tail-undisturbed - confirm); zacc is recomputed from scratch. */ \
+        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                  \
+        yacc0 = VFMACC_VV_TU(PREC, LMUL)(yacc0, acol_vec, wvec, vl);   \
+        zacc = VFMUL_VF(PREC, LMUL)(acol_vec, x[0 * incx], vl);        \
+        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                  \
+        yacc1 = VFMACC_VV_TU(PREC, LMUL)(yacc1, acol_vec, wvec, vl);   \
+        zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \
+        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                  \
+        yacc2 = VFMACC_VV_TU(PREC, LMUL)(yacc2, acol_vec, wvec, vl);   \
+        zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \
+        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                  \
+        yacc3 = VFMACC_VV_TU(PREC, LMUL)(yacc3, acol_vec, wvec, vl);   \
+        zacc = VFMACC_VF(PREC, LMUL)(zacc, x[3 * incx], acol_vec, vl); \
+    } while (0)
+
+#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF)                 \
+    do { /* First strip with b in 1..3 columns left; the cases fall through on purpose. zacc is only accumulated into, so it must be initialized beforehand. */ \
+        switch (b) {                                                       \
+        case 3:                                                            \
+            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                  \
+            yacc2 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl);              \
+            zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \
+        case 2:                                                            \
+            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                  \
+            yacc1 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl);              \
+            zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \
+        case 1:                                                            \
+            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                  \
+            yacc0 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl);              \
+            zacc = VFMACC_VF(PREC, LMUL)(zacc, x[0 * incx], acol_vec, vl); \
+        }                                                                  \
+    } while (0)
+
+#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF)                       \
+    do { /* Later strips with b in 1..3 columns left; the cases fall through on purpose. Accumulates into both yaccK and zacc. */ \
+        switch (b) {                                                       \
+        case 3:                                                            \
+            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                  \
+            yacc2 = VFMACC_VV_TU(PREC, LMUL)(yacc2, acol_vec, wvec, vl);   \
+            zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \
+        case 2:                                                            \
+            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                  \
+            yacc1 = VFMACC_VV_TU(PREC, LMUL)(yacc1, acol_vec, wvec, vl);   \
+            zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \
+        case 1:                                                            \
+            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                  \
+            yacc0 = VFMACC_VV_TU(PREC, LMUL)(yacc0, acol_vec, wvec, vl);   \
+            zacc = VFMACC_VF(PREC, LMUL)(zacc, x[0 * incx], acol_vec, vl); \
+        }                                                                  \
+    } while (0)
+
+#define DOTXAXPYF_SIFIVE_X280_REDUCE(i)                                     \
+    do { /* Sum yacc##i (reduction issued with vl = m), then y[i * incy] := beta * y[i * incy] + alpha * sum; y is overwritten without being read when beta is zero. */ \
+        RVV_TYPE_F(PREC, m1) dot##i = VFMV_S_F(PREC, m1)(0., 1);            \
+        dot##i = VF_REDUSUM_VS(PREC, LMUL)(yacc##i, dot##i, m);             \
+        if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) {                         \
+            dot##i = VFMUL_VF(PREC, m1)(dot##i, *alpha, 1);                 \
+            y[i * incy] = VFMV_F_S(PREC)(dot##i);                           \
+        }                                                                   \
+        else {                                                              \
+            y[i * incy] *= *beta;                                           \
+            RVV_TYPE_F(PREC, m1) y##i = VFMV_S_F(PREC, m1)(y[i * incy], 1); \
+            y##i = VFMACC_VF(PREC, m1)(y##i, *alpha, dot##i, 1);            \
+            y[i * incy] = VFMV_F_S(PREC)(y##i);                             \
+        }                                                                   \
+    } while (0)
+
+DOTXAXPYF(PRECISION_CHAR, void)
+{
+    // Fused kernel that forms both results in a single sweep over a:
+    //   y := beta * y + alpha * conjat(A^T) * conjw(w)
+    //   z :=        z + alpha * conja(A)    * conjx(x)
+    // The conj flags and cntx are never read by this real-valued kernel; the
+    // casts below only silence unused-parameter warnings.
+    (void) conjat;
+    (void) conja;
+    (void) conjw;
+    (void) conjx;
+    (void) cntx;
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict beta = beta_;
+    const DATATYPE* restrict a = a_;
+    const DATATYPE* restrict w = w_;
+    const DATATYPE* restrict x = x_;
+    DATATYPE* restrict y = y_;
+    DATATYPE* restrict z = z_;
+
+    if (b == 0) return;
+    // Nothing to accumulate: z stays as is; y is scaled (or set) by beta.
+    if (m == 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) {
+        if (!(PASTEMAC(PRECISION_CHAR, eq0)(*beta)))
+            SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        else
+            SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        return;
+    }
+
+    // Main pass: four columns of a (and four entries of x and y) per trip.
+    for (; b >= 4; b -= 4, a += 4 * lda, x += 4 * incx, y += 4 * incy) {
+        const DATATYPE* restrict a_tmp = a;
+        const DATATYPE* restrict w_cur = w;
+        DATATYPE* restrict z_cur = z;
+        RVV_TYPE_F(PREC, LMUL) yacc0, yacc1, yacc2, yacc3;
+        bool first_pass = true;
+        size_t remaining = m;
+        while (remaining > 0) {
+            size_t vl = VSETVL(PREC, LMUL)(remaining);
+            RVV_TYPE_F(PREC, LMUL) wvec, acol_vec, zacc, z_chunk;
+            if (incw == 1)
+                wvec = VLE_V_F(PREC, LMUL)(w_cur, vl);
+            else
+                wvec = VLSE_V_F(PREC, LMUL)(w_cur, FLT_SIZE * incw, vl);
+            // The first strip initializes yacc0..yacc3 and later strips
+            // accumulate into them; zacc is recomputed on every strip.
+            if (inca == 1) {
+                if (first_pass)
+                    DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( );
+                else
+                    DOTXAXPYF_SIFIVE_X280_LOOP_BODY( );
+            }
+            else {
+                if (first_pass)
+                    DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED);
+                else
+                    DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED);
+            }
+            first_pass = false;
+            // Update the current strip of z: z += alpha * zacc.
+            if (incz == 1) {
+                z_chunk = VLE_V_F(PREC, LMUL)(z_cur, vl);
+                z_chunk = VFMACC_VF(PREC, LMUL)(z_chunk, *alpha, zacc, vl);
+                VSE_V_F(PREC, LMUL)(z_cur, z_chunk, vl);
+            }
+            else {
+                z_chunk = VLSE_V_F(PREC, LMUL)(z_cur, FLT_SIZE * incz, vl);
+                z_chunk = VFMACC_VF(PREC, LMUL)(z_chunk, *alpha, zacc, vl);
+                VSSE_V_F(PREC, LMUL)(z_cur, FLT_SIZE * incz, z_chunk, vl);
+            }
+            remaining -= vl;
+            a_tmp += vl * inca;
+            w_cur += vl * incw;
+            z_cur += vl * incz;
+        }
+
+        // Fold each accumulator into its entry of y.
+        DOTXAXPYF_SIFIVE_X280_REDUCE(0);
+        DOTXAXPYF_SIFIVE_X280_REDUCE(1);
+        DOTXAXPYF_SIFIVE_X280_REDUCE(2);
+        DOTXAXPYF_SIFIVE_X280_REDUCE(3);
+    }
+
+    // Remainder: one to three columns are left.
+    if (b > 0) {
+        const DATATYPE* restrict a_tmp = a;
+        const DATATYPE* restrict w_cur = w;
+        DATATYPE* restrict z_cur = z;
+        RVV_TYPE_F(PREC, LMUL) yacc0, yacc1, yacc2;
+        bool first_pass = true;
+        size_t remaining = m;
+        while (remaining > 0) {
+            size_t vl = VSETVL(PREC, LMUL)(remaining);
+            RVV_TYPE_F(PREC, LMUL) wvec, acol_vec, z_chunk;
+            // The cleanup bodies only accumulate into zacc, so zero it first.
+            RVV_TYPE_F(PREC, LMUL) zacc = VFMV_V_F(PREC, LMUL)(0, vl);
+            if (incw == 1)
+                wvec = VLE_V_F(PREC, LMUL)(w_cur, vl);
+            else
+                wvec = VLSE_V_F(PREC, LMUL)(w_cur, FLT_SIZE * incw, vl);
+            if (inca == 1) {
+                if (first_pass)
+                    DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( );
+                else
+                    DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( );
+            }
+            else {
+                if (first_pass)
+                    DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED);
+                else
+                    DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED);
+            }
+            first_pass = false;
+            // Update the current strip of z: z += alpha * zacc.
+            if (incz == 1) {
+                z_chunk = VLE_V_F(PREC, LMUL)(z_cur, vl);
+                z_chunk = VFMACC_VF(PREC, LMUL)(z_chunk, *alpha, zacc, vl);
+                VSE_V_F(PREC, LMUL)(z_cur, z_chunk, vl);
+            }
+            else {
+                z_chunk = VLSE_V_F(PREC, LMUL)(z_cur, FLT_SIZE * incz, vl);
+                z_chunk = VFMACC_VF(PREC, LMUL)(z_chunk, *alpha, zacc, vl);
+                VSSE_V_F(PREC, LMUL)(z_cur, FLT_SIZE * incz, z_chunk, vl);
+            }
+            remaining -= vl;
+            a_tmp += vl * inca;
+            w_cur += vl * incw;
+            z_cur += vl * incz;
+        }
+
+        // Deliberate fall-through: b == 3 reduces yacc2, then yacc1, then yacc0.
+        switch (b) {
+        case 3:
+            DOTXAXPYF_SIFIVE_X280_REDUCE(2);
+        case 2:
+            DOTXAXPYF_SIFIVE_X280_REDUCE(1);
+        case 1:
+            DOTXAXPYF_SIFIVE_X280_REDUCE(0);
+        }
+    }
+}
+
+#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL
+#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED
+#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST
+#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY
+#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST
+#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY
+#undef DOTXAXPYF_SIFIVE_X280_REDUCE
+
+#endif // DOTXAXPYF
diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c
deleted file mode 100644
index 5ac2d4166..000000000
--- a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c
+++ /dev/null
@@ -1,2645 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include "../riscv_cmul_macros_asm.h"
-#include <math.h>
-#include <riscv_vector.h>
-#include <stdbool.h>
-#include <stddef.h>
-
-#define FLT_SIZE 4
-#define FLT_LOAD "flw "
-#define FMUL "fmul.s "
-#define VLE "vle32.v "
-#define VLSE "vlse32.v "
-#define VSE "vse32.v "
-#define VSSE "vsse32.v "
-
-void bli_sdotxf_sifive_x280_asm(
-              conj_t           conjat,
-              conj_t           conjx,
-              dim_t            m,
-              dim_t            b,
-        const void*   restrict alpha_,
-        const void*   restrict a_, inc_t inca, inc_t lda,
-        const void*   restrict x_, inc_t incx,
-        const void*   restrict beta_,
-              void*   restrict y_, inc_t incy,
-        const cntx_t* restrict cntx
-        ) {
-    // think of a as b x m row major matrix (i.e. rsa = lda, csa = inca)
-    // we process 6 elements of y per iteration, using y_tmp to load/store from
-    // y a points to the 6 x m block of a needed this iteration each 6 x m block
-    // is broken into 6 x vl blocks a_col points to the current 6 x vl block, we
-    // use x_tmp to load from x a_row is used to load each of the 6 rows of this
-    // 6 x vl block
-    (void)conjat;
-    (void)conjx;
-    (void)cntx;
-    const float* restrict alpha = alpha_;
-    const float* restrict a = a_;
-    const float* restrict x = x_;
-    const float* restrict beta = beta_;
-    float* restrict y = y_;
-
-    if (b == 0)
-        return;
-    else if (m == 0 || *alpha == 0.f) {
-        // scale y by beta
-        if (*beta == 0.f)
-            bli_ssetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
-        else
-            bli_sscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
-        return;
-    }
-
-    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
-    __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
-    inca *= FLT_SIZE;
-    lda *= FLT_SIZE;
-    incx *= FLT_SIZE;
-    incy *= FLT_SIZE;
-    inc_t a_bump = 6 * lda; // to bump a down 6 rows
-
-    while (b >= 6) {
-        // compute dot product of x with 6 rows of a
-        const float* restrict x_tmp = x;
-        const float* restrict a_col = a;
-        size_t avl = m;
-        bool first = true;
-        while (avl) {
-            const float* restrict a_row = a_col;
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            if (incx == FLT_SIZE)
-                __asm__(VLE "v28, (%0)" : : "r"(x_tmp));
-            else
-                __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
-            if (inca == FLT_SIZE) {
-                // a unit stride
-                if (first) {
-                    __asm__(VLE "v0, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v0, v0, v28");
-                    __asm__(VLE "v4, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v4, v4, v28");
-                    __asm__(VLE "v8, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v8, v8, v28");
-                    __asm__(VLE "v12, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v12, v12, v28");
-                    __asm__(VLE "v16, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v16, v16, v28");
-                    __asm__(VLE "v20, (%0)" : : "r"(a_row));
-                    __asm__("vfmul.vv v20, v20, v28");
-                    first = false;
-                }
-                else {
-                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v0, v24, v28");
-                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v4, v24, v28");
-                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v8, v24, v28");
-                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v12, v24, v28");
-                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v16, v24, v28");
-                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                    __asm__("vfmacc.vv v20, v24, v28");
-                }
-            } // end a unit stride
-            else {
-                // a non-unit stride
-                if (first) {
-                    __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v0, v0, v28");
-                    __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v4, v4, v28");
-                    __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v8, v8, v28");
-                    __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v12, v12, v28");
-                    __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v16, v16, v28");
-                    __asm__(VLSE "v20, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("vfmul.vv v20, v20, v28");
-                    first = false;
-                }
-                else {
-                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v0, v24, v28");
-                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v4, v24, v28");
-                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v8, v24, v28");
-                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v12, v24, v28");
-                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v16, v24, v28");
-                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("vfmacc.vv v20, v24, v28");
-                }
-            } // end a non-unit stride
-            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
-            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
-            avl -= vl;
-        }
-
-        __asm__("vmv.s.x v31, x0");
-
-        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v0, v0, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (*beta == 0.f) {
-            __asm__("vfmul.vf v0, v0, ft10");
-            __asm__(VSE "v0, (%0)" : : "r"(y));
-        }
-        else {
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-            __asm__(FMUL "ft0, ft11, ft0");
-            __asm__("vfmv.s.f v30, ft0");
-            __asm__("vfmacc.vf v30, ft10, v0");
-            __asm__(VSE "v30, (%0)" : : "r"(y));
-        }
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v4, v4, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (*beta == 0.f) {
-            __asm__("vfmul.vf v4, v4, ft10");
-            __asm__(VSE "v4, (%0)" : : "r"(y));
-        }
-        else {
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-            __asm__(FMUL "ft0, ft11, ft0");
-            __asm__("vfmv.s.f v30, ft0");
-            __asm__("vfmacc.vf v30, ft10, v4");
-            __asm__(VSE "v30, (%0)" : : "r"(y));
-        }
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v8, v8, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (*beta == 0.f) {
-            __asm__("vfmul.vf v8, v8, ft10");
-            __asm__(VSE "v8, (%0)" : : "r"(y));
-        }
-        else {
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-            __asm__(FMUL "ft0, ft11, ft0");
-            __asm__("vfmv.s.f v30, ft0");
-            __asm__("vfmacc.vf v30, ft10, v8");
-            __asm__(VSE "v30, (%0)" : : "r"(y));
-        }
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v12, v12, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (*beta == 0.f) {
-            __asm__("vfmul.vf v12, v12, ft10");
-            __asm__(VSE "v12, (%0)" : : "r"(y));
-        }
-        else {
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-            __asm__(FMUL "ft0, ft11, ft0");
-            __asm__("vfmv.s.f v30, ft0");
-            __asm__("vfmacc.vf v30, ft10, v12");
-            __asm__(VSE "v30, (%0)" : : "r"(y));
-        }
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v16, v16, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (*beta == 0.f) {
-            __asm__("vfmul.vf v16, v16, ft10");
-            __asm__(VSE "v16, (%0)" : : "r"(y));
-        }
-        else {
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-            __asm__(FMUL "ft0, ft11, ft0");
-            __asm__("vfmv.s.f v30, ft0");
-            __asm__("vfmacc.vf v30, ft10, v16");
-            __asm__(VSE "v30, (%0)" : : "r"(y));
-        }
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v20, v20, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (*beta == 0.f) {
-            __asm__("vfmul.vf v20, v20, ft10");
-            __asm__(VSE "v20, (%0)" : : "r"(y));
-        }
-        else {
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-            __asm__(FMUL "ft0, ft11, ft0");
-            __asm__("vfmv.s.f v30, ft0");
-            __asm__("vfmacc.vf v30, ft10, v20");
-            __asm__(VSE "v30, (%0)" : : "r"(y));
-        }
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-        // a += 6 * lda;
-        __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
-        b -= 6;
-    }
-
-    if (b > 0) {
-        // compute dot product of x with remaining < 6 rows of a
-        const float* restrict x_tmp = x;
-        // a_col will move along the last row of a!
-        const float* restrict a_col;
-        __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
-        size_t avl = m;
-        bool first = true;
-        while (avl) {
-            const float* restrict a_row = a_col;
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            if (incx == FLT_SIZE)
-                __asm__(VLE "v28, (%0)" : : "r"(x_tmp));
-            else
-                __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
-            if (inca == FLT_SIZE) {
-                // a unit stride
-                if (first) {
-                    switch (b) {
-                    case 5:
-                        __asm__(VLE "v16, (%0)" : : "r"(a_row));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmul.vv v16, v16, v28");
-                    case 4:
-                        __asm__(VLE "v12, (%0)" : : "r"(a_row));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmul.vv v12, v12, v28");
-                    case 3:
-                        __asm__(VLE "v8, (%0)" : : "r"(a_row));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmul.vv v8, v8, v28");
-                    case 2:
-                        __asm__(VLE "v4, (%0)" : : "r"(a_row));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmul.vv v4, v4, v28");
-                    case 1:
-                        __asm__(VLE "v0, (%0)" : : "r"(a_row));
-                        __asm__("vfmul.vv v0, v0, v28");
-                    }
-                    first = false;
-                }
-                else {
-                    switch (b) {
-                    case 5:
-                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmacc.vv v16, v24, v28");
-                    case 4:
-                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmacc.vv v12, v24, v28");
-                    case 3:
-                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmacc.vv v8, v24, v28");
-                    case 2:
-                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmacc.vv v4, v24, v28");
-                    case 1:
-                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                        __asm__("vfmacc.vv v0, v24, v28");
-                    }
-                }
-            } // end a unit stride
-            else {
-                // a non-unit stride
-                if (first) {
-                    switch (b) {
-                    case 5:
-                        __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmul.vv v16, v16, v28");
-                    case 4:
-                        __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmul.vv v12, v12, v28");
-                    case 3:
-                        __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmul.vv v8, v8, v28");
-                    case 2:
-                        __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmul.vv v4, v4, v28");
-                    case 1:
-                        __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("vfmul.vv v0, v0, v28");
-                    }
-                    first = false;
-                }
-                else {
-                    switch (b) {
-                    case 5:
-                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmacc.vv v16, v24, v28");
-                    case 4:
-                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmacc.vv v12, v24, v28");
-                    case 3:
-                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmacc.vv v8, v24, v28");
-                    case 2:
-                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmacc.vv v4, v24, v28");
-                    case 1:
-                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("vfmacc.vv v0, v24, v28");
-                    }
-                }
-            } // end a non-unit stride
-            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
-            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
-            avl -= vl;
-        }
-
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
-        __asm__("vmv.s.x v31, x0");
-        switch (b) {
-        case 5:
-            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v16, v16, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (*beta == 0.f) {
-                __asm__("vfmul.vf v16, v16, ft10");
-                __asm__(VSE "v16, (%0)" : : "r"(y));
-            }
-            else {
-                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-                __asm__(FMUL "ft0, ft11, ft0");
-                __asm__("vfmv.s.f v30, ft0");
-                __asm__("vfmacc.vf v30, ft10, v16");
-                __asm__(VSE "v30, (%0)" : : "r"(y));
-            }
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
-        case 4:
-            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v12, v12, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (*beta == 0.f) {
-                __asm__("vfmul.vf v12, v12, ft10");
-                __asm__(VSE "v12, (%0)" : : "r"(y));
-            }
-            else {
-                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-                __asm__(FMUL "ft0, ft11, ft0");
-                __asm__("vfmv.s.f v30, ft0");
-                __asm__("vfmacc.vf v30, ft10, v12");
-                __asm__(VSE "v30, (%0)" : : "r"(y));
-            }
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
-        case 3:
-            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v8, v8, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (*beta == 0.f) {
-                __asm__("vfmul.vf v8, v8, ft10");
-                __asm__(VSE "v8, (%0)" : : "r"(y));
-            }
-            else {
-                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-                __asm__(FMUL "ft0, ft11, ft0");
-                __asm__("vfmv.s.f v30, ft0");
-                __asm__("vfmacc.vf v30, ft10, v8");
-                __asm__(VSE "v30, (%0)" : : "r"(y));
-            }
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
-        case 2:
-            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v4, v4, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (*beta == 0.f) {
-                __asm__("vfmul.vf v4, v4, ft10");
-                __asm__(VSE "v4, (%0)" : : "r"(y));
-            }
-            else {
-                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-                __asm__(FMUL "ft0, ft11, ft0");
-                __asm__("vfmv.s.f v30, ft0");
-                __asm__("vfmacc.vf v30, ft10, v4");
-                __asm__(VSE "v30, (%0)" : : "r"(y));
-            }
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
-        case 1:
-            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v0, v0, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (*beta == 0.f) {
-                __asm__("vfmul.vf v0, v0, ft10");
-                __asm__(VSE "v0, (%0)" : : "r"(y));
-            }
-            else {
-                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-                __asm__(FMUL "ft0, ft11, ft0");
-                __asm__("vfmv.s.f v30, ft0");
-                __asm__("vfmacc.vf v30, ft10, v0");
-                __asm__(VSE "v30, (%0)" : : "r"(y));
-            }
-        }
-    } // end cleanup
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef FMUL
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 8
-#define FLT_LOAD "fld "
-#define FMUL "fmul.d "
-#define VLE "vle64.v "
-#define VLSE "vlse64.v "
-#define VSE "vse64.v "
-#define VSSE "vsse64.v "
-
-void bli_ddotxf_sifive_x280_asm(
-              conj_t           conjat,
-              conj_t           conjx,
-              dim_t            m,
-              dim_t            b,
-        const void*   restrict alpha_,
-        const void*   restrict a_, inc_t inca, inc_t lda,
-        const void*   restrict x_, inc_t incx,
-        const void*   restrict beta_,
-              void*   restrict y_, inc_t incy,
-        const cntx_t* restrict cntx
-        ) {
-    // think of a as b x m row major matrix (i.e. rsa = lda, csa = inca)
-    // we process 6 elements of y per iteration, using y_tmp to load/store from
-    // y a points to the 6 x m block of a needed this iteration each 6 x m block
-    // is broken into 6 x vl blocks a_col points to the current 6 x vl block, we
-    // use x_tmp to load from x a_row is used to load each of the 6 rows of this
-    // 6 x vl block
-    (void)conjat;
-    (void)conjx;
-    (void)cntx;
-    const double* restrict alpha = alpha_;
-    const double* restrict a = a_;
-    const double* restrict x = x_;
-    const double* restrict beta = beta_;
-    double* restrict y = y_;
-
-    if (b == 0)
-        return;
-    else if (m == 0 || *alpha == 0.) {
-        // scale y by beta
-        if (*beta == 0.)
-            bli_dsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
-        else
-            bli_dscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
-        return;
-    }
-
-    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
-    __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
-    inca *= FLT_SIZE;
-    lda *= FLT_SIZE;
-    incx *= FLT_SIZE;
-    incy *= FLT_SIZE;
-    inc_t a_bump = 6 * lda; // to bump a down 6 rows
-
-    while (b >= 6) {
-        // compute dot product of x with 6 rows of a
-        const double* restrict x_tmp = x;
-        const double* restrict a_col = a;
-        size_t avl = m;
-        bool first = true;
-        while (avl) {
-            const double* restrict a_row = a_col;
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            if (incx == FLT_SIZE)
-                __asm__(VLE "v28, (%0)" : : "r"(x_tmp));
-            else
-                __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
-            if (inca == FLT_SIZE) {
-                // a unit stride
-                if (first) {
-                    __asm__(VLE "v0, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v0, v0, v28");
-                    __asm__(VLE "v4, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v4, v4, v28");
-                    __asm__(VLE "v8, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v8, v8, v28");
-                    __asm__(VLE "v12, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v12, v12, v28");
-                    __asm__(VLE "v16, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v16, v16, v28");
-                    __asm__(VLE "v20, (%0)" : : "r"(a_row));
-                    __asm__("vfmul.vv v20, v20, v28");
-                    first = false;
-                }
-                else {
-                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v0, v24, v28");
-                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v4, v24, v28");
-                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v8, v24, v28");
-                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v12, v24, v28");
-                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v16, v24, v28");
-                    __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                    __asm__("vfmacc.vv v20, v24, v28");
-                }
-            } // end a unit stride
-            else {
-                // a non-unit stride
-                if (first) {
-                    __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v0, v0, v28");
-                    __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v4, v4, v28");
-                    __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v8, v8, v28");
-                    __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v12, v12, v28");
-                    __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmul.vv v16, v16, v28");
-                    __asm__(VLSE "v20, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("vfmul.vv v20, v20, v28");
-                    first = false;
-                }
-                else {
-                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v0, v24, v28");
-                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v4, v24, v28");
-                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v8, v24, v28");
-                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v12, v24, v28");
-                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                    __asm__("vfmacc.vv v16, v24, v28");
-                    __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                    __asm__("vfmacc.vv v20, v24, v28");
-                }
-            } // end a non-unit stride
-            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
-            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
-            avl -= vl;
-        }
-
-        __asm__("vmv.s.x v31, x0");
-
-        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v0, v0, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (*beta == 0.) {
-            __asm__("vfmul.vf v0, v0, ft10");
-            __asm__(VSE "v0, (%0)" : : "r"(y));
-        }
-        else {
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-            __asm__(FMUL "ft0, ft11, ft0");
-            __asm__("vfmv.s.f v30, ft0");
-            __asm__("vfmacc.vf v30, ft10, v0");
-            __asm__(VSE "v30, (%0)" : : "r"(y));
-        }
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v4, v4, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (*beta == 0.) {
-            __asm__("vfmul.vf v4, v4, ft10");
-            __asm__(VSE "v4, (%0)" : : "r"(y));
-        }
-        else {
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-            __asm__(FMUL "ft0, ft11, ft0");
-            __asm__("vfmv.s.f v30, ft0");
-            __asm__("vfmacc.vf v30, ft10, v4");
-            __asm__(VSE "v30, (%0)" : : "r"(y));
-        }
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v8, v8, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (*beta == 0.) {
-            __asm__("vfmul.vf v8, v8, ft10");
-            __asm__(VSE "v8, (%0)" : : "r"(y));
-        }
-        else {
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-            __asm__(FMUL "ft0, ft11, ft0");
-            __asm__("vfmv.s.f v30, ft0");
-            __asm__("vfmacc.vf v30, ft10, v8");
-            __asm__(VSE "v30, (%0)" : : "r"(y));
-        }
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v12, v12, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (*beta == 0.) {
-            __asm__("vfmul.vf v12, v12, ft10");
-            __asm__(VSE "v12, (%0)" : : "r"(y));
-        }
-        else {
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-            __asm__(FMUL "ft0, ft11, ft0");
-            __asm__("vfmv.s.f v30, ft0");
-            __asm__("vfmacc.vf v30, ft10, v12");
-            __asm__(VSE "v30, (%0)" : : "r"(y));
-        }
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v16, v16, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (*beta == 0.) {
-            __asm__("vfmul.vf v16, v16, ft10");
-            __asm__(VSE "v16, (%0)" : : "r"(y));
-        }
-        else {
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-            __asm__(FMUL "ft0, ft11, ft0");
-            __asm__("vfmv.s.f v30, ft0");
-            __asm__("vfmacc.vf v30, ft10, v16");
-            __asm__(VSE "v30, (%0)" : : "r"(y));
-        }
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-        __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v20, v20, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (*beta == 0.) {
-            __asm__("vfmul.vf v20, v20, ft10");
-            __asm__(VSE "v20, (%0)" : : "r"(y));
-        }
-        else {
-            __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-            __asm__(FMUL "ft0, ft11, ft0");
-            __asm__("vfmv.s.f v30, ft0");
-            __asm__("vfmacc.vf v30, ft10, v20");
-            __asm__(VSE "v30, (%0)" : : "r"(y));
-        }
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
-
-        // a += 6 * lda;
-        __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
-        b -= 6;
-    }
-
-    if (b > 0) {
-        // compute dot product of x with remaining < 6 rows of a
-        const double* restrict x_tmp = x;
-        // a_col will move along the last row of a!
-        const double* restrict a_col;
-        __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
-        size_t avl = m;
-        bool first = true;
-        while (avl) {
-            const double* restrict a_row = a_col;
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            if (incx == FLT_SIZE)
-                __asm__(VLE "v28, (%0)" : : "r"(x_tmp));
-            else
-                __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
-            if (inca == FLT_SIZE) {
-                // a unit stride
-                if (first) {
-                    switch (b) {
-                    case 5:
-                        __asm__(VLE "v16, (%0)" : : "r"(a_row));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmul.vv v16, v16, v28");
-                    case 4:
-                        __asm__(VLE "v12, (%0)" : : "r"(a_row));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmul.vv v12, v12, v28");
-                    case 3:
-                        __asm__(VLE "v8, (%0)" : : "r"(a_row));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmul.vv v8, v8, v28");
-                    case 2:
-                        __asm__(VLE "v4, (%0)" : : "r"(a_row));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmul.vv v4, v4, v28");
-                    case 1:
-                        __asm__(VLE "v0, (%0)" : : "r"(a_row));
-                        __asm__("vfmul.vv v0, v0, v28");
-                    }
-                    first = false;
-                }
-                else {
-                    switch (b) {
-                    case 5:
-                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmacc.vv v16, v24, v28");
-                    case 4:
-                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmacc.vv v12, v24, v28");
-                    case 3:
-                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmacc.vv v8, v24, v28");
-                    case 2:
-                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmacc.vv v4, v24, v28");
-                    case 1:
-                        __asm__(VLE "v24, (%0)" : : "r"(a_row));
-                        __asm__("vfmacc.vv v0, v24, v28");
-                    }
-                }
-            } // end a unit stride
-            else {
-                // a non-unit stride
-                if (first) {
-                    switch (b) {
-                    case 5:
-                        __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmul.vv v16, v16, v28");
-                    case 4:
-                        __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmul.vv v12, v12, v28");
-                    case 3:
-                        __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmul.vv v8, v8, v28");
-                    case 2:
-                        __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmul.vv v4, v4, v28");
-                    case 1:
-                        __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("vfmul.vv v0, v0, v28");
-                    }
-                    first = false;
-                }
-                else {
-                    switch (b) {
-                    case 5:
-                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmacc.vv v16, v24, v28");
-                    case 4:
-                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmacc.vv v12, v24, v28");
-                    case 3:
-                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmacc.vv v8, v24, v28");
-                    case 2:
-                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        __asm__("vfmacc.vv v4, v24, v28");
-                    case 1:
-                        __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("vfmacc.vv v0, v24, v28");
-                    }
-                }
-            } // end a non-unit stride
-            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
-            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
-            avl -= vl;
-        }
-
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
-        __asm__("vmv.s.x v31, x0");
-        switch (b) {
-        case 5:
-            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v16, v16, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (*beta == 0.) {
-                __asm__("vfmul.vf v16, v16, ft10");
-                __asm__(VSE "v16, (%0)" : : "r"(y));
-            }
-            else {
-                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-                __asm__(FMUL "ft0, ft11, ft0");
-                __asm__("vfmv.s.f v30, ft0");
-                __asm__("vfmacc.vf v30, ft10, v16");
-                __asm__(VSE "v30, (%0)" : : "r"(y));
-            }
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
-        case 4:
-            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v12, v12, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (*beta == 0.) {
-                __asm__("vfmul.vf v12, v12, ft10");
-                __asm__(VSE "v12, (%0)" : : "r"(y));
-            }
-            else {
-                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-                __asm__(FMUL "ft0, ft11, ft0");
-                __asm__("vfmv.s.f v30, ft0");
-                __asm__("vfmacc.vf v30, ft10, v12");
-                __asm__(VSE "v30, (%0)" : : "r"(y));
-            }
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
-        case 3:
-            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v8, v8, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (*beta == 0.) {
-                __asm__("vfmul.vf v8, v8, ft10");
-                __asm__(VSE "v8, (%0)" : : "r"(y));
-            }
-            else {
-                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-                __asm__(FMUL "ft0, ft11, ft0");
-                __asm__("vfmv.s.f v30, ft0");
-                __asm__("vfmacc.vf v30, ft10, v8");
-                __asm__(VSE "v30, (%0)" : : "r"(y));
-            }
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
-        case 2:
-            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v4, v4, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (*beta == 0.) {
-                __asm__("vfmul.vf v4, v4, ft10");
-                __asm__(VSE "v4, (%0)" : : "r"(y));
-            }
-            else {
-                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-                __asm__(FMUL "ft0, ft11, ft0");
-                __asm__("vfmv.s.f v30, ft0");
-                __asm__("vfmacc.vf v30, ft10, v4");
-                __asm__(VSE "v30, (%0)" : : "r"(y));
-            }
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
-        case 1:
-            __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v0, v0, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (*beta == 0.) {
-                __asm__("vfmul.vf v0, v0, ft10");
-                __asm__(VSE "v0, (%0)" : : "r"(y));
-            }
-            else {
-                __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
-                __asm__(FMUL "ft0, ft11, ft0");
-                __asm__("vfmv.s.f v30, ft0");
-                __asm__("vfmacc.vf v30, ft10, v0");
-                __asm__(VSE "v30, (%0)" : : "r"(y));
-            }
-        }
-    } // end cleanup
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef FMUL
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-
-#define FLT_SIZE 4
-#define FLT_LOAD "flw "
-#define FMUL "fmul.s "
-#define FMADD "fmadd.s "
-#define FNMSUB "fnmsub.s "
-#define VLSEG2 "vlseg2e32.v "
-#define VLSSEG2 "vlsseg2e32.v "
-#define VSSEG2 "vsseg2e32.v "
-#define VSSSEG2 "vssseg2e32.v "
-#define VSE "vse32.v "
-
-void bli_cdotxf_sifive_x280_asm(
-              conj_t           conjat,
-              conj_t           conjx,
-              dim_t            m,
-              dim_t            b,
-        const void*   restrict alpha_,
-        const void*   restrict a_, inc_t inca, inc_t lda,
-        const void*   restrict x_, inc_t incx,
-        const void*   restrict beta_,
-              void*   restrict y_, inc_t incy,
-        const cntx_t* restrict cntx
-        ) {
-    (void)cntx;
-    const scomplex* restrict alpha = alpha_;
-    const scomplex* restrict a = a_;
-    const scomplex* restrict x = x_;
-    const scomplex* restrict beta = beta_;
-    scomplex* restrict y = y_;
-
-    if (b == 0)
-        return;
-    else if (m == 0 || (alpha->real == 0.f && alpha->imag == 0.f)) {
-        // scale y by beta
-        if (beta->real == 0.f && beta->imag == 0.f)
-            bli_csetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
-        else
-            bli_cscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
-        return;
-    }
-
-    __asm__(FLT_LOAD "ft8, (%0)" : : "r"(alpha));
-    __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
-    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(beta));
-    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(beta), "I"(FLT_SIZE));
-    // Reduce to case when A^T is not conjugated, then conjugate
-    // computed product A^T * x if needed.
-    conj_t conjatx = BLIS_NO_CONJUGATE;
-    if (conjat == BLIS_CONJUGATE) {
-        bli_toggle_conj(&conjat);
-        bli_toggle_conj(&conjx);
-        bli_toggle_conj(&conjatx);
-    }
-    inca *= 2 * FLT_SIZE;
-    lda *= 2 * FLT_SIZE;
-    incx *= 2 * FLT_SIZE;
-    incy *= 2 * FLT_SIZE;
-    // these are used to bump a and y, resp.
-    inc_t a_bump = 6 * lda;
-    inc_t y_bump = incy - FLT_SIZE;
-    while (b >= 6) {
-        // compute dot product of x with 6 rows of a
-        const scomplex* restrict x_tmp = x;
-        const scomplex* restrict a_col = a;
-        size_t avl = m;
-        bool first = true;
-        while (avl) {
-            const scomplex* restrict a_row = a_col;
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            if (incx == 2 * FLT_SIZE)
-                __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp));
-            else
-                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
-            if (inca == 2 * FLT_SIZE) {
-                if (conjx == BLIS_NO_CONJUGATE) {
-                    // a unit stride, conjx = no conj
-                    if (first) {
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v0, v2, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v4, v6, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v8, v10, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v12, v14, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v16, v18, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        vcmul_vv(v20, v22, v24, v26, v28, v30);
-                        first = false;
-                    }
-                    else {
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v0, v2, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v4, v6, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v8, v10, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v12, v14, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v16, v18, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        vcmacc_vv(v20, v22, v24, v26, v28, v30);
-                    }
-                } // end conjx == BLIS_NO_CONJUGATE
-                else { // conjx == BLIS_CONJUGATE
-                    // a unit stride, conjx = conj
-                    if (first) {
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        vcmul_vv_conj(v20, v22, v24, v26, v28, v30);
-                        first = false;
-                    }
-                    else {
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        vcmacc_vv_conj(v20, v22, v24, v26, v28, v30);
-                    }
-                } // end conjx == BLIS_CONJUGATE
-            } // end a unit stride
-            else { // a non-unit stride
-                if (conjx == BLIS_NO_CONJUGATE) {
-                    // a non-unit stride, conjx = no conj
-                    if (first) {
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v0, v2, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v4, v6, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v8, v10, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v12, v14, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v16, v18, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        vcmul_vv(v20, v22, v24, v26, v28, v30);
-                        first = false;
-                    }
-                    else {
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v0, v2, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v4, v6, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v8, v10, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v12, v14, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v16, v18, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        vcmacc_vv(v20, v22, v24, v26, v28, v30);
-                    }
-                } // end conjx == BLIS_NO_CONJUGATE
-                else { // conjx = BLIS_CONJUGATE
-                    // a non-unit stride, conjx = conj
-                    if (first) {
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        vcmul_vv_conj(v20, v22, v24, v26, v28, v30);
-                        first = false;
-                    }
-                    else {
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        vcmacc_vv_conj(v20, v22, v24, v26, v28, v30);
-                    }
-                } // end conjx == BLIS_CONJUGATE
-            } // end a non-unit stride
-            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
-            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
-            avl -= vl;
-        }
-
-        __asm__("vmv.s.x v31, x0");
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v0, v0, v31");
-        __asm__("vfredusum.vs v2, v2, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0.f && beta->imag == 0.f) {
-            if (conjatx == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v0, v2, ft8, ft9);
-            }
-            else {
-                vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9);
-            }
-        }
-        else {
-            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-            __asm__("vfmv.s.f v28, ft0");
-            __asm__("vfmv.s.f v29, ft1");
-            if (conjatx == BLIS_NO_CONJUGATE) {
-              vcmacc_vf(v28, v29, ft8, ft9, v0, v2);
-            }
-            else {
-              vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2);
-            }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v4, v4, v31");
-        __asm__("vfredusum.vs v6, v6, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0.f && beta->imag == 0.f) {
-            if (conjatx == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v4, v6, ft8, ft9);
-            }
-            else {
-                vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9);
-            }
-        }
-        else {
-            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-            __asm__("vfmv.s.f v28, ft0");
-            __asm__("vfmv.s.f v29, ft1");
-            if (conjatx == BLIS_NO_CONJUGATE) {
-              vcmacc_vf(v28, v29, ft8, ft9, v4, v6);
-            }
-            else {
-              vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6);
-            }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v8, v8, v31");
-        __asm__("vfredusum.vs v10, v10, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0.f && beta->imag == 0.f) {
-            if (conjatx == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v8, v10, ft8, ft9);
-            }
-            else {
-                vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9);
-            }
-        }
-        else {
-            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-            __asm__("vfmv.s.f v28, ft0");
-            __asm__("vfmv.s.f v29, ft1");
-            if (conjatx == BLIS_NO_CONJUGATE) {
-              vcmacc_vf(v28, v29, ft8, ft9, v8, v10);
-            }
-            else {
-              vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10);
-            }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v12, v12, v31");
-        __asm__("vfredusum.vs v14, v14, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0.f && beta->imag == 0.f) {
-            if (conjatx == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v12, v14, ft8, ft9);
-            }
-            else {
-                vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9);
-            }
-        }
-        else {
-            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-            __asm__("vfmv.s.f v28, ft0");
-            __asm__("vfmv.s.f v29, ft1");
-            if (conjatx == BLIS_NO_CONJUGATE) {
-              vcmacc_vf(v28, v29, ft8, ft9, v12, v14);
-            }
-            else {
-              vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14);
-            }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v16, v16, v31");
-        __asm__("vfredusum.vs v18, v18, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0.f && beta->imag == 0.f) {
-            if (conjatx == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v16, v18, ft8, ft9);
-            }
-            else {
-                vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9);
-            }
-        }
-        else {
-            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-            __asm__("vfmv.s.f v28, ft0");
-            __asm__("vfmv.s.f v29, ft1");
-            if (conjatx == BLIS_NO_CONJUGATE) {
-              vcmacc_vf(v28, v29, ft8, ft9, v16, v18);
-            }
-            else {
-              vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18);
-            }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v20, v20, v31");
-        __asm__("vfredusum.vs v22, v22, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0.f && beta->imag == 0.f) {
-            if (conjatx == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v20, v22, ft8, ft9);
-            }
-            else {
-                vcmul_vf_conj(v28, v29, v20, v22, ft8, ft9);
-            }
-        }
-        else {
-            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-            __asm__("vfmv.s.f v28, ft0");
-            __asm__("vfmv.s.f v29, ft1");
-            if (conjatx == BLIS_NO_CONJUGATE) {
-              vcmacc_vf(v28, v29, ft8, ft9, v20, v22);
-            }
-            else {
-              vcmacc_vf_conj(v28, v29, ft8, ft9, v20, v22);
-            }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        // a += 6 * lda;
-        __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
-        b -= 6;
-    }
-
-    if (b > 0) {
-        // cleanup loop, 0 < b < 6
-        const scomplex* restrict x_tmp = x;
-        const scomplex* restrict a_col;
-        __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
-        size_t avl = m;
-        bool first = true;
-        while (avl) {
-            const scomplex* restrict a_row = a_col;
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            if (incx == 2 * FLT_SIZE)
-                __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp));
-            else
-                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
-            if (inca == 2 * FLT_SIZE) {
-                if (conjx == BLIS_NO_CONJUGATE) {
-                    // a unit stride, conjx = no conj
-                    if (first) {
-                        switch (b) {
-                        case 5:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv(v16, v18, v24, v26, v28, v30);
-                        case 4:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          vcmul_vv(v0, v2, v24, v26, v28, v30);
-                        }
-                        first = false;
-                    }
-                    else {
-                        switch (b) {
-                        case 5:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv(v16, v18, v24, v26, v28, v30);
-                        case 4:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          vcmacc_vv(v0, v2, v24, v26, v28, v30);
-                        }
-                    }
-                } // end conjx == BLIS_NO_CONJUGATE
-                else { // conjx == BLIS_CONJUGATE
-                    // a unit stride, conjx = conj
-                    if (first) {
-                        switch (b) {
-                        case 5:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
-                        case 4:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
-                        }
-                        first = false;
-                    }
-                    else {
-                        switch (b) {
-                        case 5:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
-                        case 4:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
-                        }
-                    }
-                } // end conjx == BLIS_CONJUGATE
-            } // end a unit stride
-            else { // a non-unit stride
-                if (conjx == BLIS_NO_CONJUGATE) {
-                    // a non-unit stride, conjx = no conj
-                    if (first) {
-                        switch (b) {
-                        case 5:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv(v16, v18, v24, v26, v28, v30);
-                        case 4:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          vcmul_vv(v0, v2, v24, v26, v28, v30);
-                        }
-                        first = false;
-                    }
-                    else {
-                        switch (b) {
-                        case 5:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv(v16, v18, v24, v26, v28, v30);
-                        case 4:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          vcmacc_vv(v0, v2, v24, v26, v28, v30);
-                        }
-                    }
-                } // end conjx == BLIS_NO_CONJUGATE
-                else { // conjx == BLIS_CONJUGATE
-                    // a non-unit stride, conjx = conj
-                    if (first) {
-                        switch (b) {
-                        case 5:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
-                        case 4:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
-                        }
-                        first = false;
-                    }
-                    else {
-                        switch (b) {
-                        case 5:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
-                        case 4:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
-                        }
-                    }
-                } // end conjx == BLIS_CONJUGATE
-            } // end a non-unit stride
-            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
-            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
-            avl -= vl;
-        }
-
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
-        y_bump = incy + FLT_SIZE;
-        __asm__("vmv.s.x v31, x0");
-        
-        switch (b) {
-        case 5:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v16, v16, v31");
-            __asm__("vfredusum.vs v18, v18, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0.f && beta->imag == 0.f) {
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                    vcmul_vf(v28, v29, v16, v18, ft8, ft9);
-                }
-                else {
-                    vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9);
-                }
-            }
-            else {
-                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-                __asm__("vfmv.s.f v28, ft0");
-                __asm__("vfmv.s.f v29, ft1");
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                  vcmacc_vf(v28, v29, ft8, ft9, v16, v18);
-                }
-                else {
-                  vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18);
-                }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
-        case 4:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v12, v12, v31");
-            __asm__("vfredusum.vs v14, v14, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0.f && beta->imag == 0.f) {
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                    vcmul_vf(v28, v29, v12, v14, ft8, ft9);
-                }
-                else {
-                    vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9);
-                }
-            }
-            else {
-                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-                __asm__("vfmv.s.f v28, ft0");
-                __asm__("vfmv.s.f v29, ft1");
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                  vcmacc_vf(v28, v29, ft8, ft9, v12, v14);
-                }
-                else {
-                  vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14);
-                }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
-        case 3:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v8, v8, v31");
-            __asm__("vfredusum.vs v10, v10, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0.f && beta->imag == 0.f) {
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                    vcmul_vf(v28, v29, v8, v10, ft8, ft9);
-                }
-                else {
-                    vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9);
-                }
-            }
-            else {
-                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-                __asm__("vfmv.s.f v28, ft0");
-                __asm__("vfmv.s.f v29, ft1");
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                  vcmacc_vf(v28, v29, ft8, ft9, v8, v10);
-                }
-                else {
-                  vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10);
-                }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
-        case 2:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v4, v4, v31");
-            __asm__("vfredusum.vs v6, v6, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0.f && beta->imag == 0.f) {
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                    vcmul_vf(v28, v29, v4, v6, ft8, ft9);
-                }
-                else {
-                    vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9);
-                }
-            }
-            else {
-                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-                __asm__("vfmv.s.f v28, ft0");
-                __asm__("vfmv.s.f v29, ft1");
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                  vcmacc_vf(v28, v29, ft8, ft9, v4, v6);
-                }
-                else {
-                  vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6);
-                }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
-        case 1:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v0, v0, v31");
-            __asm__("vfredusum.vs v2, v2, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0.f && beta->imag == 0.f) {
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                    vcmul_vf(v28, v29, v0, v2, ft8, ft9);
-                }
-                else {
-                    vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9);
-                }
-            }
-            else {
-                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-                __asm__("vfmv.s.f v28, ft0");
-                __asm__("vfmv.s.f v29, ft1");
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                  vcmacc_vf(v28, v29, ft8, ft9, v0, v2);
-                }
-                else {
-                  vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2);
-                }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-        }
-    } // end cleanup
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef FMUL
-#undef FMADD
-#undef FNMSUB
-#undef VLSEG2
-#undef VLSSEG2
-#undef VSSEG2
-#undef VSSSEG2
-#undef VSE
-
-#define FLT_SIZE 8
-#define FLT_LOAD "fld "
-#define FMUL "fmul.d "
-#define FMADD "fmadd.d "
-#define FNMSUB "fnmsub.d "
-#define VLSEG2 "vlseg2e64.v "
-#define VLSSEG2 "vlsseg2e64.v "
-#define VSSEG2 "vsseg2e64.v "
-#define VSSSEG2 "vssseg2e64.v "
-#define VSE "vse64.v "
-
-void bli_zdotxf_sifive_x280_asm(
-              conj_t           conjat,
-              conj_t           conjx,
-              dim_t            m,
-              dim_t            b,
-        const void*   restrict alpha_,
-        const void*   restrict a_, inc_t inca, inc_t lda,
-        const void*   restrict x_, inc_t incx,
-        const void*   restrict beta_,
-              void*   restrict y_, inc_t incy,
-        const cntx_t* restrict cntx
-        ) {
-    (void)cntx;
-    const dcomplex* restrict alpha = alpha_;
-    const dcomplex* restrict a = a_;
-    const dcomplex* restrict x = x_;
-    const dcomplex* restrict beta = beta_;
-    dcomplex* restrict y = y_;
-
-    if (b == 0)
-        return;
-    else if (m == 0 || (alpha->real == 0. && alpha->imag == 0.)) {
-        // scale y by beta
-        if (beta->real == 0. && beta->imag == 0.)
-            bli_zsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
-        else
-            bli_zscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
-        return;
-    }
-
-    __asm__(FLT_LOAD "ft8, (%0)" : : "r"(alpha));
-    __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
-    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(beta));
-    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(beta), "I"(FLT_SIZE));
-    // Reduce to case when A^T is not conjugated, then conjugate
-    // computed product A^T * x if needed.
-    conj_t conjatx = BLIS_NO_CONJUGATE;
-    if (conjat == BLIS_CONJUGATE) {
-        bli_toggle_conj(&conjat);
-        bli_toggle_conj(&conjx);
-        bli_toggle_conj(&conjatx);
-    }
-    inca *= 2 * FLT_SIZE;
-    lda *= 2 * FLT_SIZE;
-    incx *= 2 * FLT_SIZE;
-    incy *= 2 * FLT_SIZE;
-    // these are used to bump a and y, resp.
-    inc_t a_bump = 6 * lda;
-    inc_t y_bump = incy - FLT_SIZE;
-    while (b >= 6) {
-        // compute dot product of x with 6 rows of a
-        const dcomplex* restrict x_tmp = x;
-        const dcomplex* restrict a_col = a;
-        size_t avl = m;
-        bool first = true;
-        while (avl) {
-            const dcomplex* restrict a_row = a_col;
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            if (incx == 2 * FLT_SIZE)
-                __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp));
-            else
-                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
-            if (inca == 2 * FLT_SIZE) {
-                if (conjx == BLIS_NO_CONJUGATE) {
-                    // a unit stride, conjx = no conj
-                    if (first) {
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v0, v2, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v4, v6, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v8, v10, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v12, v14, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v16, v18, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        vcmul_vv(v20, v22, v24, v26, v28, v30);
-                        first = false;
-                    }
-                    else {
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v0, v2, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v4, v6, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v8, v10, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v12, v14, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v16, v18, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        vcmacc_vv(v20, v22, v24, v26, v28, v30);
-                    }
-                } // end conjx == BLIS_NO_CONJUGATE
-                else { // conjx == BLIS_CONJUGATE
-                    // a unit stride, conjx = conj
-                    if (first) {
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        vcmul_vv_conj(v20, v22, v24, v26, v28, v30);
-                        first = false;
-                    }
-                    else {
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
-                        __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                        vcmacc_vv_conj(v20, v22, v24, v26, v28, v30);
-                    }
-                } // end conjx == BLIS_CONJUGATE
-            } // end a unit stride
-            else { // a non-unit stride
-                if (conjx == BLIS_NO_CONJUGATE) {
-                    // a non-unit stride, conjx = no conj
-                    if (first) {
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v0, v2, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v4, v6, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v8, v10, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v12, v14, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv(v16, v18, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        vcmul_vv(v20, v22, v24, v26, v28, v30);
-                        first = false;
-                    }
-                    else {
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v0, v2, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v4, v6, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v8, v10, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v12, v14, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv(v16, v18, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        vcmacc_vv(v20, v22, v24, v26, v28, v30);
-                    }
-                } // end conjx == BLIS_NO_CONJUGATE
-                else { // conjx = BLIS_CONJUGATE
-                    // a non-unit stride, conjx = conj
-                    if (first) {
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        vcmul_vv_conj(v20, v22, v24, v26, v28, v30);
-                        first = false;
-                    }
-                    else {
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                        vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
-                        __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                        vcmacc_vv_conj(v20, v22, v24, v26, v28, v30);
-                    }
-                } // end conjx == BLIS_CONJUGATE
-            } // end a non-unit stride
-            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
-            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
-            avl -= vl;
-        }
-
-        __asm__("vmv.s.x v31, x0");
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v0, v0, v31");
-        __asm__("vfredusum.vs v2, v2, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0. && beta->imag == 0.) {
-            if (conjatx == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v0, v2, ft8, ft9);
-            }
-            else {
-                vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9);
-            }
-        }
-        else {
-            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-            __asm__("vfmv.s.f v28, ft0");
-            __asm__("vfmv.s.f v29, ft1");
-            if (conjatx == BLIS_NO_CONJUGATE) {
-              vcmacc_vf(v28, v29, ft8, ft9, v0, v2);
-            }
-            else {
-              vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2);
-            }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v4, v4, v31");
-        __asm__("vfredusum.vs v6, v6, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0. && beta->imag == 0.) {
-            if (conjatx == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v4, v6, ft8, ft9);
-            }
-            else {
-                vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9);
-            }
-        }
-        else {
-            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-            __asm__("vfmv.s.f v28, ft0");
-            __asm__("vfmv.s.f v29, ft1");
-            if (conjatx == BLIS_NO_CONJUGATE) {
-              vcmacc_vf(v28, v29, ft8, ft9, v4, v6);
-            }
-            else {
-              vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6);
-            }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v8, v8, v31");
-        __asm__("vfredusum.vs v10, v10, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0. && beta->imag == 0.) {
-            if (conjatx == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v8, v10, ft8, ft9);
-            }
-            else {
-                vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9);
-            }
-        }
-        else {
-            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-            __asm__("vfmv.s.f v28, ft0");
-            __asm__("vfmv.s.f v29, ft1");
-            if (conjatx == BLIS_NO_CONJUGATE) {
-              vcmacc_vf(v28, v29, ft8, ft9, v8, v10);
-            }
-            else {
-              vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10);
-            }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v12, v12, v31");
-        __asm__("vfredusum.vs v14, v14, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0. && beta->imag == 0.) {
-            if (conjatx == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v12, v14, ft8, ft9);
-            }
-            else {
-                vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9);
-            }
-        }
-        else {
-            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-            __asm__("vfmv.s.f v28, ft0");
-            __asm__("vfmv.s.f v29, ft1");
-            if (conjatx == BLIS_NO_CONJUGATE) {
-              vcmacc_vf(v28, v29, ft8, ft9, v12, v14);
-            }
-            else {
-              vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14);
-            }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v16, v16, v31");
-        __asm__("vfredusum.vs v18, v18, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0. && beta->imag == 0.) {
-            if (conjatx == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v16, v18, ft8, ft9);
-            }
-            else {
-                vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9);
-            }
-        }
-        else {
-            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-            __asm__("vfmv.s.f v28, ft0");
-            __asm__("vfmv.s.f v29, ft1");
-            if (conjatx == BLIS_NO_CONJUGATE) {
-              vcmacc_vf(v28, v29, ft8, ft9, v16, v18);
-            }
-            else {
-              vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18);
-            }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-        __asm__("vfredusum.vs v20, v20, v31");
-        __asm__("vfredusum.vs v22, v22, v31");
-        __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-        if (beta->real == 0. && beta->imag == 0.) {
-            if (conjatx == BLIS_NO_CONJUGATE) {
-                vcmul_vf(v28, v29, v20, v22, ft8, ft9);
-            }
-            else {
-                vcmul_vf_conj(v28, v29, v20, v22, ft8, ft9);
-            }
-        }
-        else {
-            __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-            cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-            __asm__("vfmv.s.f v28, ft0");
-            __asm__("vfmv.s.f v29, ft1");
-            if (conjatx == BLIS_NO_CONJUGATE) {
-              vcmacc_vf(v28, v29, ft8, ft9, v20, v22);
-            }
-            else {
-              vcmacc_vf_conj(v28, v29, ft8, ft9, v20, v22);
-            }
-        }
-        __asm__(VSE "v28, (%0)" : : "r"(y));
-        __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-        __asm__(VSE "v29, (%0)" : : "r"(y));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
-
-        // a += 6 * lda;
-        __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
-        b -= 6;
-    }
-
-    if (b > 0) {
-        // cleanup loop, 0 < b < 6
-        const dcomplex* restrict x_tmp = x;
-        const dcomplex* restrict a_col;
-        __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
-        size_t avl = m;
-        bool first = true;
-        while (avl) {
-            const dcomplex* restrict a_row = a_col;
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-            if (incx == 2 * FLT_SIZE)
-                __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp));
-            else
-                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
-            if (inca == 2 * FLT_SIZE) {
-                if (conjx == BLIS_NO_CONJUGATE) {
-                    // a unit stride, conjx = no conj
-                    if (first) {
-                        switch (b) {
-                        case 5:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv(v16, v18, v24, v26, v28, v30);
-                        case 4:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          vcmul_vv(v0, v2, v24, v26, v28, v30);
-                        }
-                        first = false;
-                    }
-                    else {
-                        switch (b) {
-                        case 5:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv(v16, v18, v24, v26, v28, v30);
-                        case 4:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          vcmacc_vv(v0, v2, v24, v26, v28, v30);
-                        }
-                    }
-                } // end conjx == BLIS_NO_CONJUGATE
-                else { // conjx == BLIS_CONJUGATE
-                    // a unit stride, conjx = conj
-                    if (first) {
-                        switch (b) {
-                        case 5:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
-                        case 4:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
-                        }
-                        first = false;
-                    }
-                    else {
-                        switch (b) {
-                        case 5:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
-                        case 4:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                          __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
-                          vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
-                        }
-                    }
-                } // end conjx == BLIS_CONJUGATE
-            } // end a unit stride
-            else { // a non-unit stride
-                if (conjx == BLIS_NO_CONJUGATE) {
-                    // a non-unit stride, conjx = no conj
-                    if (first) {
-                        switch (b) {
-                        case 5:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv(v16, v18, v24, v26, v28, v30);
-                        case 4:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          vcmul_vv(v0, v2, v24, v26, v28, v30);
-                        }
-                        first = false;
-                    }
-                    else {
-                        switch (b) {
-                        case 5:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv(v16, v18, v24, v26, v28, v30);
-                        case 4:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          vcmacc_vv(v0, v2, v24, v26, v28, v30);
-                        }
-                    }
-                } // end conjx == BLIS_NO_CONJUGATE
-                else { // conjx == BLIS_CONJUGATE
-                    // a non-unit stride, conjx = conj
-                    if (first) {
-                        switch (b) {
-                        case 5:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
-                        case 4:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
-                        }
-                        first = false;
-                    }
-                    else {
-                        switch (b) {
-                        case 5:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
-                        case 4:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
-                        case 3:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
-                        case 2:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
-                          vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
-                        case 1:
-                          __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
-                          vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
-                        }
-                    }
-                } // end conjx == BLIS_CONJUGATE
-            } // end a non-unit stride
-            __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
-            __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
-            avl -= vl;
-        }
-
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
-        y_bump = incy + FLT_SIZE;
-        __asm__("vmv.s.x v31, x0");
-        
-        switch (b) {
-        case 5:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v16, v16, v31");
-            __asm__("vfredusum.vs v18, v18, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0. && beta->imag == 0.) {
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                    vcmul_vf(v28, v29, v16, v18, ft8, ft9);
-                }
-                else {
-                    vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9);
-                }
-            }
-            else {
-                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-                __asm__("vfmv.s.f v28, ft0");
-                __asm__("vfmv.s.f v29, ft1");
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                  vcmacc_vf(v28, v29, ft8, ft9, v16, v18);
-                }
-                else {
-                  vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18);
-                }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
-        case 4:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v12, v12, v31");
-            __asm__("vfredusum.vs v14, v14, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0. && beta->imag == 0.) {
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                    vcmul_vf(v28, v29, v12, v14, ft8, ft9);
-                }
-                else {
-                    vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9);
-                }
-            }
-            else {
-                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-                __asm__("vfmv.s.f v28, ft0");
-                __asm__("vfmv.s.f v29, ft1");
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                  vcmacc_vf(v28, v29, ft8, ft9, v12, v14);
-                }
-                else {
-                  vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14);
-                }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
-        case 3:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v8, v8, v31");
-            __asm__("vfredusum.vs v10, v10, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0. && beta->imag == 0.) {
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                    vcmul_vf(v28, v29, v8, v10, ft8, ft9);
-                }
-                else {
-                    vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9);
-                }
-            }
-            else {
-                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-                __asm__("vfmv.s.f v28, ft0");
-                __asm__("vfmv.s.f v29, ft1");
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                  vcmacc_vf(v28, v29, ft8, ft9, v8, v10);
-                }
-                else {
-                  vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10);
-                }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
-        case 2:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v4, v4, v31");
-            __asm__("vfredusum.vs v6, v6, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0. && beta->imag == 0.) {
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                    vcmul_vf(v28, v29, v4, v6, ft8, ft9);
-                }
-                else {
-                    vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9);
-                }
-            }
-            else {
-                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-                __asm__("vfmv.s.f v28, ft0");
-                __asm__("vfmv.s.f v29, ft1");
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                  vcmacc_vf(v28, v29, ft8, ft9, v4, v6);
-                }
-                else {
-                  vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6);
-                }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-            __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
-        case 1:
-            __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
-            __asm__("vfredusum.vs v0, v0, v31");
-            __asm__("vfredusum.vs v2, v2, v31");
-            __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            if (beta->real == 0. && beta->imag == 0.) {
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                    vcmul_vf(v28, v29, v0, v2, ft8, ft9);
-                }
-                else {
-                    vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9);
-                }
-            }
-            else {
-                __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
-                cmul(ft0, ft1, ft10, ft11, ft2, ft3);
-                __asm__("vfmv.s.f v28, ft0");
-                __asm__("vfmv.s.f v29, ft1");
-                if (conjatx == BLIS_NO_CONJUGATE) {
-                  vcmacc_vf(v28, v29, ft8, ft9, v0, v2);
-                }
-                else {
-                  vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2);
-                }
-            }
-            __asm__(VSE "v28, (%0)" : : "r"(y));
-            __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
-            __asm__(VSE "v29, (%0)" : : "r"(y));
-        }
-    } // end cleanup
-    return;
-}
diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c
new file mode 100644
index 000000000..9396515b3
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c
@@ -0,0 +1,132 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#include "../../riscv_cmul_macros_intr.h"
+#include "../../riscv_overloaded_intrinsics.h"
+#include "blis.h"
+#include <stdint.h>
+#include <riscv_vector.h>
+
+#define DOTXF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxf_sifive_x280_intr(\
+          conj_t           conjat,                    \
+          conj_t           conjx,                     \
+          dim_t            m,                         \
+          dim_t            b,                         \
+    const T*      restrict alpha_,                    \
+    const T*      restrict a_, inc_t inca, inc_t lda, \
+    const T*      restrict x_, inc_t incx,            \
+    const T*      restrict beta_,                     \
+          T*      restrict y_, inc_t incy,            \
+    const cntx_t* restrict cntx                       \
+)
+// DOTXF_ emits only the signature of bli_<ch>dotxf_sifive_x280_intr; DOTXF forwards to it so macro arguments are expanded before ## pasting.
+#define DOTXF(...)  DOTXF_(__VA_ARGS__)
+// SETV/SCALV expand to bli_<ch>setv_sifive_x280_intr / bli_<ch>scalv_sifive_x280_intr via the same two-level expansion.
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr
+#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
+#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr
+#define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR)
+
+// Single precision real: define the template parameters (float, 's', 32-bit, LMUL m4), include the real template, then undefine them.
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_dotxf_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real: define the template parameters (double, 'd', 64-bit, LMUL m4), include the real template, then undefine them.
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_dotxf_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex: define the template parameters (scomplex over BASE_DT float, 'c', 32-bit, LMUL m2), include the complex template, then undefine them.
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m2
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_dotxf_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex: define the template parameters (dcomplex over BASE_DT double, 'z', 64-bit, LMUL m2), include the complex template, then undefine them.
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m2
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_dotxf_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef SETV_
+#undef SETV
+#undef SCALV_
+#undef SCALV
+// The signature macros are removed last; all four template inclusions above have already been processed.
+#undef DOTXF
+#undef DOTXF_
diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..463a111f0
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c
@@ -0,0 +1,324 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTXF
+// Used when inca == 1: loads vl elements of column i of a and splits fields 0/1 into acol_vec_r/acol_vec_i (presumably real/imaginary parts).
+#define DOTXF_SIFIVE_X280_LOAD_ACOL(i)                                          \
+    do {                                                                        \
+        acol_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), vl); \
+        acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0);                      \
+        acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1);                      \
+    } while (0)
+// Used when inca != 1: same as DOTXF_SIFIVE_X280_LOAD_ACOL, but through VLSSEG2_V_F with a stride of 2 * FLT_SIZE * inca.
+#define DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED(i)                                                        \
+    do {                                                                                              \
+        acol_vec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), 2 * FLT_SIZE * inca, vl); \
+        acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0);                                            \
+        acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1);                                            \
+    } while (0)
+// First strip of the six-column loop: for columns 0..5, load the column and set acc<i>_r/acc<i>_i through VCMUL_VV##CONJ_SUF with xvec_r/xvec_i.
+#define DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF, CONJ_SUF)                                       \
+    do {                                                                                            \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                                                   \
+        VCMUL_VV##CONJ_SUF(PREC, LMUL, acc0_r, acc0_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                                                   \
+        VCMUL_VV##CONJ_SUF(PREC, LMUL, acc1_r, acc1_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                                                   \
+        VCMUL_VV##CONJ_SUF(PREC, LMUL, acc2_r, acc2_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                                                   \
+        VCMUL_VV##CONJ_SUF(PREC, LMUL, acc3_r, acc3_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4);                                                   \
+        VCMUL_VV##CONJ_SUF(PREC, LMUL, acc4_r, acc4_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5);                                                   \
+        VCMUL_VV##CONJ_SUF(PREC, LMUL, acc5_r, acc5_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
+    } while (0)
+// Later strips of the six-column loop: same loads, but acc<i>_r/acc<i>_i are updated through VCMACC_VV##CONJ_SUF##_TU (presumably a tail-undisturbed multiply-accumulate).
+#define DOTXF_SIFIVE_X280_LOOP_BODY(LOAD_SUF, CONJ_SUF)                                                   \
+    do {                                                                                                  \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                                                         \
+        VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc0_r, acc0_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                                                         \
+        VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc1_r, acc1_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                                                         \
+        VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc2_r, acc2_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                                                         \
+        VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc3_r, acc3_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4);                                                         \
+        VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc4_r, acc4_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5);                                                         \
+        VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc5_r, acc5_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
+    } while (0)
+// First strip for 1..5 leftover columns: the switch on b falls through, so case b handles columns b-1 down to 0 with VCMUL_VV##CONJ_SUF.
+#define DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF, CONJ_SUF)                                            \
+    do {                                                                                                    \
+        switch (b) {                                                                                        \
+            case 5:                                                                                         \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4);                                                   \
+                VCMUL_VV##CONJ_SUF(PREC, LMUL, acc4_r, acc4_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
+            case 4:                                                                                         \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                                                   \
+                VCMUL_VV##CONJ_SUF(PREC, LMUL, acc3_r, acc3_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
+            case 3:                                                                                         \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                                                   \
+                VCMUL_VV##CONJ_SUF(PREC, LMUL, acc2_r, acc2_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
+            case 2:                                                                                         \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                                                   \
+                VCMUL_VV##CONJ_SUF(PREC, LMUL, acc1_r, acc1_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
+            case 1:                                                                                         \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                                                   \
+                VCMUL_VV##CONJ_SUF(PREC, LMUL, acc0_r, acc0_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
+        }                                                                                                   \
+    } while (0)
+// Later strips for 1..5 leftover columns: same fall-through switch on b, updating the accumulators through VCMACC_VV##CONJ_SUF##_TU.
+#define DOTXF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF, CONJ_SUF)                                                        \
+    do {                                                                                                          \
+        switch (b) {                                                                                              \
+            case 5:                                                                                               \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4);                                                         \
+                VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc4_r, acc4_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
+            case 4:                                                                                               \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                                                         \
+                VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc3_r, acc3_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
+            case 3:                                                                                               \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                                                         \
+                VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc2_r, acc2_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
+            case 2:                                                                                               \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                                                         \
+                VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc1_r, acc1_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
+            case 1:                                                                                               \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                                                         \
+                VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc0_r, acc0_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
+        }                                                                                                         \
+    } while (0)
+// Reduces acc<i>_r/acc<i>_i with VF_REDUSUM_VS, then writes y[i * incy]: the alpha-scaled result alone when beta is zero (y is not read), else added to y[i * incy] scaled by beta.
+#define DOTXF_SIFIVE_X280_REDUCE(i)                                                                                \
+    do {                                                                                                           \
+        RVV_TYPE_F(PREC, m1) dot##i##_r = VFMV_S_F(PREC, m1)(0., 1);                                               \
+        RVV_TYPE_F(PREC, m1) dot##i##_i = VFMV_S_F(PREC, m1)(0., 1);                                               \
+        dot##i##_r = VF_REDUSUM_VS(PREC, LMUL)(acc##i##_r, dot##i##_r, m);                                         \
+        dot##i##_i = VF_REDUSUM_VS(PREC, LMUL)(acc##i##_i, dot##i##_i, m);                                         \
+        RVV_TYPE_F(PREC, m1) y##i##_r, y##i##_i;                                                                   \
+        if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) {                                                                \
+            if (bli_is_conj(conjatx))                                                                              \
+                VCMUL_VF_CONJ(PREC, m1, y##i##_r, y##i##_i, dot##i##_r, dot##i##_i, alpha->real, alpha->imag, 1);  \
+            else                                                                                                   \
+                VCMUL_VF(PREC, m1, y##i##_r, y##i##_i, dot##i##_r, dot##i##_i, alpha->real, alpha->imag, 1);       \
+            y[i * incy].real = VFMV_F_S(PREC)(y##i##_r);                                                           \
+            y[i * incy].imag = VFMV_F_S(PREC)(y##i##_i);                                                           \
+        }                                                                                                          \
+        else {                                                                                                     \
+            PASTEMAC(PRECISION_CHAR, scals)(*beta, y[i * incy])                                                    \
+            y##i##_r = VFMV_S_F(PREC, m1)(y[i * incy].real, 1);                                                    \
+            y##i##_i = VFMV_S_F(PREC, m1)(y[i * incy].imag, 1);                                                    \
+            if (bli_is_conj(conjatx))                                                                              \
+                VCMACC_VF_CONJ(PREC, m1, y##i##_r, y##i##_i, alpha->real, alpha->imag, dot##i##_r, dot##i##_i, 1); \
+            else                                                                                                   \
+                VCMACC_VF(PREC, m1, y##i##_r, y##i##_i, alpha->real, alpha->imag, dot##i##_r, dot##i##_i, 1);      \
+            y[i * incy].real = VFMV_F_S(PREC)(y##i##_r);                                                           \
+            y[i * incy].imag = VFMV_F_S(PREC)(y##i##_i);                                                           \
+        }                                                                                                          \
+    } while (0)
+
+DOTXF(PRECISION_CHAR, void)
+{
+    // Computes y := beta * y + alpha * conjat(A^T) * conjx(x)
+    // Each of the b elements of y is updated from the length-m dot product of x with one column of a (columns are lda apart).
+    (void) cntx; // Suppress unused parameter warnings
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict a = a_;
+    const DATATYPE* restrict x = x_;
+    const DATATYPE* restrict beta = beta_;
+    DATATYPE* restrict y = y_;
+    // Quick exits: nothing to do for b == 0; for m == 0 or alpha == 0 only beta is applied to y (setv if beta is zero, scalv otherwise).
+    if (b == 0) return;
+    if (m == 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) {
+        if (PASTEMAC(PRECISION_CHAR, eq0)(*beta))
+            SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        else
+            SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        return;
+    }
+    // Fold conjx into conjat: toggle both flags and remember it in conjatx, which REDUCE uses to pick the _CONJ variants when applying alpha.
+    conj_t conjatx = BLIS_NO_CONJUGATE;
+    if (bli_is_conj(conjx)) {
+        bli_toggle_conj(&conjat);
+        bli_toggle_conj(&conjx);
+        bli_toggle_conj(&conjatx);
+    }
+
+    while (b >= 6) {
+        // Compute dot product of x with 6 columns of a.
+        const DATATYPE* restrict a_tmp = a;
+        const DATATYPE* restrict x_tmp = x;
+        RVV_TYPE_F(PREC, LMUL) acc0_r, acc0_i, acc1_r, acc1_i, acc2_r, acc2_i,
+                               acc3_r, acc3_i, acc4_r, acc4_i, acc5_r, acc5_i;
+        RVV_TYPE_FX(PREC, LMUL, 2) xvec, acol_vec;
+        RVV_TYPE_F(PREC, LMUL) xvec_r, xvec_i, acol_vec_r, acol_vec_i;
+        bool first = true; // the first strip uses the *_FIRST body (VCMUL), later strips accumulate (VCMACC)
+        size_t avl = m;
+        while (avl) {
+            size_t vl = VSETVL(PREC, LMUL)(avl); // number of elements handled in this strip
+            if (incx == 1)
+                xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x_tmp, vl);
+            else
+                xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x_tmp, 2 * FLT_SIZE * incx, vl);
+            xvec_r = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+            xvec_i = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+
+            if (first) {
+                if (bli_is_conj(conjat)) {
+                    if (inca == 1)
+                        DOTXF_SIFIVE_X280_LOOP_BODY_FIRST( , _CONJ);
+                    else
+                        DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, _CONJ);
+                }
+                else {
+                    if (inca == 1)
+                        DOTXF_SIFIVE_X280_LOOP_BODY_FIRST( , );
+                    else
+                        DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, );
+                }
+                first = false;
+            }
+            else {
+                if (bli_is_conj(conjat)) {
+                    if (inca == 1)
+                        DOTXF_SIFIVE_X280_LOOP_BODY( , _CONJ);
+                    else
+                        DOTXF_SIFIVE_X280_LOOP_BODY(_STRIDED, _CONJ);
+                }
+                else {
+                    if (inca == 1)
+                        DOTXF_SIFIVE_X280_LOOP_BODY( , );
+                    else
+                        DOTXF_SIFIVE_X280_LOOP_BODY(_STRIDED, );
+                }
+            }
+            // Advance a and x past the vl elements just processed.
+            a_tmp += vl * inca;
+            x_tmp += vl * incx;
+            avl -= vl;
+        }
+        // Collapse the six accumulators and update y[0], y[incy], ..., y[5 * incy].
+        DOTXF_SIFIVE_X280_REDUCE(0);
+        DOTXF_SIFIVE_X280_REDUCE(1);
+        DOTXF_SIFIVE_X280_REDUCE(2);
+        DOTXF_SIFIVE_X280_REDUCE(3);
+        DOTXF_SIFIVE_X280_REDUCE(4);
+        DOTXF_SIFIVE_X280_REDUCE(5);
+
+        a += 6 * lda;
+        y += 6 * incy;
+        b -= 6;
+    }
+    // Remainder: 1 to 5 columns are left; the CLEANUP bodies switch on b.
+    if (b > 0) {
+        const DATATYPE* restrict a_tmp = a;
+        const DATATYPE* restrict x_tmp = x;
+        RVV_TYPE_F(PREC, LMUL) acc0_r, acc0_i, acc1_r, acc1_i, acc2_r, acc2_i,
+                               acc3_r, acc3_i, acc4_r, acc4_i;
+        RVV_TYPE_FX(PREC, LMUL, 2) xvec, acol_vec;
+        RVV_TYPE_F(PREC, LMUL) xvec_r, xvec_i, acol_vec_r, acol_vec_i;
+        bool first = true; // the first strip uses the *_FIRST body (VCMUL), later strips accumulate (VCMACC)
+        size_t avl = m;
+        while (avl) {
+            size_t vl = VSETVL(PREC, LMUL)(avl); // number of elements handled in this strip
+            if (incx == 1)
+                xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x_tmp, vl);
+            else
+                xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x_tmp, 2 * FLT_SIZE * incx, vl);
+            xvec_r = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+            xvec_i = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+
+            if (first) {
+                if (bli_is_conj(conjat)) {
+                    if (inca == 1)
+                        DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST( , _CONJ);
+                    else
+                        DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, _CONJ);
+                }
+                else {
+                    if (inca == 1)
+                        DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST( , );
+                    else
+                        DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, );
+                }
+                first = false;
+            }
+            else {
+                if (bli_is_conj(conjat)) {
+                    if (inca == 1)
+                        DOTXF_SIFIVE_X280_CLEANUP_BODY( , _CONJ);
+                    else
+                        DOTXF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, _CONJ);
+                }
+                else {
+                    if (inca == 1)
+                        DOTXF_SIFIVE_X280_CLEANUP_BODY( , );
+                    else
+                        DOTXF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, );
+                }
+            }
+
+            a_tmp += vl * inca;
+            x_tmp += vl * incx;
+            avl -= vl;
+        }
+        // Intentional fall-through: case b reduces columns b-1 down to 0.
+        switch (b) {
+            case 5:
+                DOTXF_SIFIVE_X280_REDUCE(4);
+            case 4:
+                DOTXF_SIFIVE_X280_REDUCE(3);
+            case 3:
+                DOTXF_SIFIVE_X280_REDUCE(2);
+            case 2:
+                DOTXF_SIFIVE_X280_REDUCE(1);
+            case 1:
+                DOTXF_SIFIVE_X280_REDUCE(0);
+        }
+    }
+    return;
+}
+
+#undef DOTXF_SIFIVE_X280_LOAD_ACOL
+#undef DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED
+#undef DOTXF_SIFIVE_X280_LOOP_BODY_FIRST
+#undef DOTXF_SIFIVE_X280_LOOP_BODY
+#undef DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST
+#undef DOTXF_SIFIVE_X280_CLEANUP_BODY
+#undef DOTXF_SIFIVE_X280_REDUCE
+
+#endif // DOTXF
diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c
new file mode 100644
index 000000000..8286e2476
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c
@@ -0,0 +1,262 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTXF
+// Used when inca == 1: loads vl elements of column i of a (a_tmp + i * lda) into acol_vec with VLE_V_F.
+#define DOTXF_SIFIVE_X280_LOAD_ACOL(i)                                         \
+    do {                                                                       \
+        acol_vec = VLE_V_F(PREC, LMUL)(a_tmp + i * lda, vl);                   \
+    } while (0)
+// Used when inca != 1: same load through VLSE_V_F with a stride of FLT_SIZE * inca.
+#define DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED(i)                                 \
+    do {                                                                       \
+        acol_vec = VLSE_V_F(PREC, LMUL)(a_tmp + i * lda, FLT_SIZE * inca, vl); \
+    } while (0)
+// First strip of the six-column loop: acc<i> is assigned VFMUL_VV(column i, xvec) for columns 0..5.
+#define DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF)      \
+    do {                                                 \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);        \
+        acc0 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);        \
+        acc1 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);        \
+        acc2 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);        \
+        acc3 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4);        \
+        acc4 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5);        \
+        acc5 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
+    } while (0)
+// Later strips of the six-column loop: acc<i> is updated with VFMACC_VV_TU(acc<i>, column i, xvec) (presumably tail-undisturbed).
+#define DOTXF_SIFIVE_X280_LOOP_BODY(LOAD_SUF)                      \
+    do {                                                           \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                  \
+        acc0 = VFMACC_VV_TU(PREC, LMUL)(acc0, acol_vec, xvec, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                  \
+        acc1 = VFMACC_VV_TU(PREC, LMUL)(acc1, acol_vec, xvec, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                  \
+        acc2 = VFMACC_VV_TU(PREC, LMUL)(acc2, acol_vec, xvec, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                  \
+        acc3 = VFMACC_VV_TU(PREC, LMUL)(acc3, acol_vec, xvec, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4);                  \
+        acc4 = VFMACC_VV_TU(PREC, LMUL)(acc4, acol_vec, xvec, vl); \
+        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5);                  \
+        acc5 = VFMACC_VV_TU(PREC, LMUL)(acc5, acol_vec, xvec, vl); \
+    } while (0)
+// First strip for 1..5 leftover columns: the switch on b falls through, so case b handles columns b-1 down to 0.
+#define DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF)           \
+    do {                                                         \
+        switch (b) {                                             \
+            case 5:                                              \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4);        \
+                acc4 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
+            case 4:                                              \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);        \
+                acc3 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
+            case 3:                                              \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);        \
+                acc2 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
+            case 2:                                              \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);        \
+                acc1 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
+            case 1:                                              \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);        \
+                acc0 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
+        }                                                        \
+    } while (0)
+// Later strips for 1..5 leftover columns: same fall-through switch on b, updating the accumulators with VFMACC_VV_TU.
+#define DOTXF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF)                           \
+    do {                                                                   \
+        switch (b) {                                                       \
+            case 5:                                                        \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4);                  \
+                acc4 = VFMACC_VV_TU(PREC, LMUL)(acc4, acol_vec, xvec, vl); \
+            case 4:                                                        \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                  \
+                acc3 = VFMACC_VV_TU(PREC, LMUL)(acc3, acol_vec, xvec, vl); \
+            case 3:                                                        \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                  \
+                acc2 = VFMACC_VV_TU(PREC, LMUL)(acc2, acol_vec, xvec, vl); \
+            case 2:                                                        \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                  \
+                acc1 = VFMACC_VV_TU(PREC, LMUL)(acc1, acol_vec, xvec, vl); \
+            case 1:                                                        \
+                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                  \
+                acc0 = VFMACC_VV_TU(PREC, LMUL)(acc0, acol_vec, xvec, vl); \
+        }                                                                  \
+    } while (0)
+// Reduces acc<i> with VF_REDUSUM_VS, then writes y[i * incy]: the alpha-scaled result alone when beta is zero (y is not read), else added to y[i * incy] * beta.
+#define DOTXF_SIFIVE_X280_REDUCE(i)                                         \
+    do {                                                                    \
+        RVV_TYPE_F(PREC, m1) dot##i = VFMV_S_F(PREC, m1)(0., 1);            \
+        dot##i = VF_REDUSUM_VS(PREC, LMUL)(acc##i, dot##i, m);              \
+        if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) {                         \
+            dot##i = VFMUL_VF(PREC, m1)(dot##i, *alpha, 1);                 \
+            y[i * incy] = VFMV_F_S(PREC)(dot##i);                           \
+        }                                                                   \
+        else {                                                              \
+            y[i * incy] *= *beta;                                           \
+            RVV_TYPE_F(PREC, m1) y##i = VFMV_S_F(PREC, m1)(y[i * incy], 1); \
+            y##i = VFMACC_VF(PREC, m1)(y##i, *alpha, dot##i, 1);            \
+            y[i * incy] = VFMV_F_S(PREC)(y##i);                             \
+        }                                                                   \
+    } while (0)
+
+DOTXF(PRECISION_CHAR, void)
+{
+    // Computes y := beta * y + alpha * conjat(A^T) * conjx(x)
+    // Each of the b elements of y is updated from the length-m dot product of x with one column of a (columns are lda apart).
+    (void) conjat; // Suppress unused parameter warnings (the conjugation flags are not used by the real kernel)
+    (void) conjx;
+    (void) cntx;
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict a = a_;
+    const DATATYPE* restrict x = x_;
+    const DATATYPE* restrict beta = beta_;
+    DATATYPE* restrict y = y_;
+    // Quick exits: nothing to do for b == 0; for m == 0 or alpha == 0 only beta is applied to y (setv if beta is zero, scalv otherwise).
+    if (b == 0) return;
+    if (m == 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) {
+        if (PASTEMAC(PRECISION_CHAR, eq0)(*beta))
+            SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        else
+            SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+        return;
+    }
+
+    while (b >= 6) {
+        // Compute dot product of x with 6 columns of a.
+        const DATATYPE* restrict a_tmp = a;
+        const DATATYPE* restrict x_tmp = x;
+        RVV_TYPE_F(PREC, LMUL) acc0, acc1, acc2, acc3, acc4, acc5;
+        RVV_TYPE_F(PREC, LMUL) xvec, acol_vec;
+        bool first = true; // the first strip uses the *_FIRST body (VFMUL), later strips accumulate (VFMACC)
+        size_t avl = m;
+        while (avl) {
+            size_t vl = VSETVL(PREC, LMUL)(avl); // number of elements handled in this strip
+            if (incx == 1)
+                xvec = VLE_V_F(PREC, LMUL)(x_tmp, vl);
+            else
+                xvec = VLSE_V_F(PREC, LMUL)(x_tmp, FLT_SIZE * incx, vl);
+            if (first) {
+                if (inca == 1)
+                    DOTXF_SIFIVE_X280_LOOP_BODY_FIRST();
+                else
+                    DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED);
+                first = false;
+            }
+            else {
+                if (inca == 1)
+                    DOTXF_SIFIVE_X280_LOOP_BODY();
+                else
+                    DOTXF_SIFIVE_X280_LOOP_BODY(_STRIDED);
+            }
+            // Advance a and x past the vl elements just processed.
+            a_tmp += vl * inca;
+            x_tmp += vl * incx;
+            avl -= vl;
+        }
+        // Collapse the six accumulators and update y[0], y[incy], ..., y[5 * incy].
+        DOTXF_SIFIVE_X280_REDUCE(0);
+        DOTXF_SIFIVE_X280_REDUCE(1);
+        DOTXF_SIFIVE_X280_REDUCE(2);
+        DOTXF_SIFIVE_X280_REDUCE(3);
+        DOTXF_SIFIVE_X280_REDUCE(4);
+        DOTXF_SIFIVE_X280_REDUCE(5);
+
+        a += 6 * lda;
+        y += 6 * incy;
+        b -= 6;
+    }
+    // Remainder: 1 to 5 columns are left; the CLEANUP bodies switch on b.
+    if (b > 0) {
+        const DATATYPE* restrict a_tmp = a;
+        const DATATYPE* restrict x_tmp = x;
+        RVV_TYPE_F(PREC, LMUL) acc0, acc1, acc2, acc3, acc4;
+        RVV_TYPE_F(PREC, LMUL) xvec, acol_vec;
+        bool first = true; // the first strip uses the *_FIRST body (VFMUL), later strips accumulate (VFMACC)
+        size_t avl = m;
+        while (avl) {
+            size_t vl = VSETVL(PREC, LMUL)(avl); // number of elements handled in this strip
+            if (incx == 1)
+                xvec = VLE_V_F(PREC, LMUL)(x_tmp, vl);
+            else
+                xvec = VLSE_V_F(PREC, LMUL)(x_tmp, FLT_SIZE * incx, vl);
+            if (first) {
+                if (inca == 1)
+                    DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST();
+                else
+                    DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED);
+                first = false;
+            }
+            else {
+                if (inca == 1)
+                    DOTXF_SIFIVE_X280_CLEANUP_BODY();
+                else
+                    DOTXF_SIFIVE_X280_CLEANUP_BODY(_STRIDED);
+            }
+
+            a_tmp += vl * inca;
+            x_tmp += vl * incx;
+            avl -= vl;
+        }
+        // Intentional fall-through: case b reduces columns b-1 down to 0.
+        switch (b) {
+            case 5:
+                DOTXF_SIFIVE_X280_REDUCE(4);
+            case 4:
+                DOTXF_SIFIVE_X280_REDUCE(3);
+            case 3:
+                DOTXF_SIFIVE_X280_REDUCE(2);
+            case 2:
+                DOTXF_SIFIVE_X280_REDUCE(1);
+            case 1:
+                DOTXF_SIFIVE_X280_REDUCE(0);
+        }
+
+    }
+    return;
+}
+
+#undef DOTXF_SIFIVE_X280_LOAD_ACOL
+#undef DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED
+#undef DOTXF_SIFIVE_X280_LOOP_BODY_FIRST
+#undef DOTXF_SIFIVE_X280_LOOP_BODY
+#undef DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST
+#undef DOTXF_SIFIVE_X280_CLEANUP_BODY
+#undef DOTXF_SIFIVE_X280_REDUCE
+
+#endif // DOTXF
diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm.c
deleted file mode 100644
index 3ee4cdd20..000000000
--- a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm.c
+++ /dev/null
@@ -1,1465 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include "../riscv_cmul_macros_asm.h"
-#include "../bli_kernels_sifive_x280.h"
-#include <math.h>
-#include <riscv_vector.h>
-#include <stdbool.h>
-#include <stddef.h>
-
-#define FLT_SIZE 4
-#define FLT_LOAD "flw "
-#define VLE "vle32.v "
-#define VLSE "vlse32.v "
-#define VSE "vse32.v "
-#define VSSE "vsse32.v "
-#define VSSSEG8 "vssseg8e32.v "
-#define VSSSEG7 "vssseg7e32.v "
-#define VSSSEG6 "vssseg6e32.v "
-#define VSSSEG5 "vssseg5e32.v "
-#define VSSSEG4 "vssseg4e32.v "
-#define VSSSEG3 "vssseg3e32.v "
-#define VSSSEG2 "vssseg2e32.v "
-#define NR 64
-
-void bli_spackm_sifive_x280_asm_7m4
-     (
-             conj_t           conja,
-             pack_t           schema,
-             dim_t            cdim,
-             dim_t            cdim_max,
-             dim_t            cdim_bcast,
-             dim_t            n,
-             dim_t            n_max,
-       const void*   restrict kappa_,
-       const void*   restrict a_, inc_t inca, inc_t lda,
-             void*   restrict p_,             inc_t ldp,
-       const void*   restrict params,
-       const cntx_t*          cntx
-     )
-{
-    (void) conja;
-    (void) cntx;
-    const float* kappa = kappa_;
-    const float* a = a_;
-    float* p = p_;
-
-    float kappa_cast = *kappa;
-
-    // MRxk kernel
-    if (cdim <= 7 && cdim_max == 7 && cdim_bcast == 1)
-    {
-        if (lda == 1) {
-            __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
-            switch (cdim) {
-                case 0: __asm__("vmv.v.i v0, 0");
-                case 1: __asm__("vmv.v.i v1, 0");
-                case 2: __asm__("vmv.v.i v2, 0");
-                case 3: __asm__("vmv.v.i v3, 0");
-                case 4: __asm__("vmv.v.i v4, 0");
-                case 5: __asm__("vmv.v.i v5, 0");
-                case 6: __asm__("vmv.v.i v6, 0");
-            }
-            a += (cdim - 1) * inca;
-            size_t avl = n;
-            while (avl) {
-                const float* a_tmp = a;
-                size_t vl;
-                __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-                switch (cdim) {
-                    case 7:
-                        __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 6:
-                        __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 5:
-                        __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 4:
-                        __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 3:
-                        __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 2:
-                        __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 1:
-                        __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
-                }
-                if (kappa_cast != 1.f) {
-                    switch (cdim) {
-                        case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
-                        case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
-                        case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
-                        case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
-                        case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
-                        case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
-                        case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-                    }
-                }
-                __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp));
-                a += vl;
-                p += vl * ldp;
-                avl -= vl;
-            }
-            __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            for (size_t i = n; i < n_max; ++i) {
-                __asm__(VSE "v0, (%0)" : : "r"(p));
-                p += ldp;
-            }
-        }
-        else {
-            inca *= FLT_SIZE;
-            __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            for (size_t i = 0; i < n; ++i) {
-                __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
-                if (inca == FLT_SIZE) {
-                    __asm__(VLE "v0, (%0)" : : "r"(a));
-                }
-                else {
-                    __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
-                }
-                if (kappa_cast != 1.f) {
-                    __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-                }
-                __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-                __asm__(VSE "v0, (%0)" : : "r"(p));
-                a += lda;
-                p += ldp;
-            }
-            __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            for (size_t i = n; i < n_max; ++i) {
-                __asm__(VSE "v0, (%0)" : : "r"(p));
-                p += ldp;
-            }
-        }
-    }
-    // NRxk kernel
-    else if (cdim <= 64 && cdim_max == 64 && cdim_bcast == 1)
-    {
-        if (lda == 1) {
-            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v8, 0");
-            size_t avl = n;
-            while (avl) {
-                size_t vl;
-                __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-                dim_t cdim_tmp = cdim;
-                const float* a_tmp = a;
-                float* p_tmp = p;
-                while (cdim_tmp >= 8) {
-                    __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLE "v7, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    if (kappa_cast != 1.f) {
-                        __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-                        __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
-                        __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
-                        __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
-                        __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
-                        __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
-                        __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
-                        __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast));
-                    }
-                    __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                    p_tmp += 8;
-                    cdim_tmp -= 8;
-                }
-                if (cdim_tmp > 0) {
-                    a_tmp += (cdim_tmp - 1) * inca;
-                    switch (cdim_tmp) {
-                        case 7:
-                            __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
-                            a_tmp -= inca;
-                        case 6:
-                            __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
-                            a_tmp -= inca;
-                        case 5:
-                            __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
-                            a_tmp -= inca;
-                        case 4:
-                            __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
-                            a_tmp -= inca;
-                        case 3:
-                            __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
-                            a_tmp -= inca;
-                        case 2:
-                            __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
-                            a_tmp -= inca;
-                        case 1:
-                            __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
-                    }
-                    if (kappa_cast != 1.f) {
-                        switch (cdim_tmp) {
-                            case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
-                            case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
-                            case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
-                            case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
-                            case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
-                            case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
-                            case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-                        }
-                    }
-                    switch (cdim_tmp) {
-                        case 7:
-                            __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                            break;
-                        case 6:
-                            __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                            break;
-                        case 5:
-                            __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                            break;
-                        case 4:
-                            __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                            break;
-                        case 3:
-                            __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                            break;
-                        case 2:
-                            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                            break;
-                        case 1:
-                            __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                            break;
-                    }
-                    p_tmp += cdim_tmp;
-                }
-                __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
-                for (size_t i = 0; i < vl; ++i) {
-                    __asm__(VSE "v8, (%0)" : : "r"(p_tmp));
-                    p_tmp += ldp;
-                }
-                a += vl;
-                p += vl * ldp;
-                avl -= vl;
-            }
-            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            for (size_t i = n; i < n_max; ++i) {
-                __asm__(VSE "v8, (%0)" : : "r"(p));
-                p += ldp;
-            }
-        }
-        else {
-            inca *= FLT_SIZE;
-            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            for (size_t i = 0; i < n; ++i) {
-                __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
-                if (inca == FLT_SIZE) {
-                    __asm__(VLE "v0, (%0)" : : "r"(a));
-                }
-                else {
-                    __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
-                }
-                if (kappa_cast != 1.f) {
-                    __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-                }
-                __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-                __asm__(VSE "v0, (%0)" : : "r"(p));
-                a += lda;
-                p += ldp;
-            }
-            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            for (size_t i = n; i < n_max; ++i) {
-                __asm__(VSE "v0, (%0)" : : "r"(p));
-                p += ldp;
-            }
-        }
-    }
-    // generic kernel
-    else
-    {
-        bli_sspackm_sifive_x280_ref
-        (
-          conja,
-          schema,
-          cdim,
-          cdim_max,
-          cdim_bcast,
-          n,
-          n_max,
-          kappa,
-          a, inca, lda,
-          p,       ldp,
-          params,
-          cntx
-        );
-    }
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-#undef VSSSEG8
-#undef VSSSEG7
-#undef VSSSEG6
-#undef VSSSEG5
-#undef VSSSEG4
-#undef VSSSEG3
-#undef VSSSEG2
-#undef NR
-
-#define FLT_SIZE 8
-#define FLT_LOAD "fld "
-#define VLE "vle64.v "
-#define VLSE "vlse64.v "
-#define VSE "vse64.v "
-#define VSSE "vsse64.v "
-#define VSSSEG8 "vssseg8e64.v "
-#define VSSSEG7 "vssseg7e64.v "
-#define VSSSEG6 "vssseg6e64.v "
-#define VSSSEG5 "vssseg5e64.v "
-#define VSSSEG4 "vssseg4e64.v "
-#define VSSSEG3 "vssseg3e64.v "
-#define VSSSEG2 "vssseg2e64.v "
-#define NR 32
-
-void bli_dpackm_sifive_x280_asm_7m4
-     (
-             conj_t           conja,
-             pack_t           schema,
-             dim_t            cdim,
-             dim_t            cdim_max,
-             dim_t            cdim_bcast,
-             dim_t            n,
-             dim_t            n_max,
-       const void*   restrict kappa_,
-       const void*   restrict a_, inc_t inca, inc_t lda,
-             void*   restrict p_,             inc_t ldp,
-       const void*   restrict params,
-       const cntx_t*          cntx
-     )
-{
-    (void) conja;
-    (void) cntx;
-    const double* kappa = kappa_;
-    const double* a = a_;
-    double* p = p_;
-
-    double kappa_cast = *kappa;
-
-    // MRxk kernel
-    if (cdim <= 7 && cdim_max == 7 && cdim_bcast == 1)
-    {
-        if (lda == 1) {
-            __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
-            switch (cdim) {
-                case 0: __asm__("vmv.v.i v0, 0");
-                case 1: __asm__("vmv.v.i v1, 0");
-                case 2: __asm__("vmv.v.i v2, 0");
-                case 3: __asm__("vmv.v.i v3, 0");
-                case 4: __asm__("vmv.v.i v4, 0");
-                case 5: __asm__("vmv.v.i v5, 0");
-                case 6: __asm__("vmv.v.i v6, 0");
-            }
-            a += (cdim - 1) * inca;
-            size_t avl = n;
-            while (avl) {
-                const double* a_tmp = a;
-                size_t vl;
-                __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-                switch (cdim) {
-                    case 7:
-                        __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 6:
-                        __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 5:
-                        __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 4:
-                        __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 3:
-                        __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 2:
-                        __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 1:
-                        __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
-                }
-                if (kappa_cast != 1.) {
-                    switch (cdim) {
-                        case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
-                        case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
-                        case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
-                        case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
-                        case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
-                        case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
-                        case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-                    }
-                }
-                __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp));
-                a += vl;
-                p += vl * ldp;
-                avl -= vl;
-            }
-            __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            for (size_t i = n; i < n_max; ++i) {
-                __asm__(VSE "v0, (%0)" : : "r"(p));
-                p += ldp;
-            }
-        }
-        else {
-            inca *= FLT_SIZE;
-            __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            for (size_t i = 0; i < n; ++i) {
-                __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
-                if (inca == FLT_SIZE) {
-                    __asm__(VLE "v0, (%0)" : : "r"(a));
-                }
-                else {
-                    __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
-                }
-                if (kappa_cast != 1.) {
-                    __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-                }
-                __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-                __asm__(VSE "v0, (%0)" : : "r"(p));
-                a += lda;
-                p += ldp;
-            }
-            __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            for (size_t i = n; i < n_max; ++i) {
-                __asm__(VSE "v0, (%0)" : : "r"(p));
-                p += ldp;
-            }
-        }
-    }
-    // NRxk kernel
-    else if (cdim <= 32 && cdim_max == 32 && cdim_bcast == 1)
-    {
-        if (lda == 1) {
-            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v8, 0");
-            size_t avl = n;
-            while (avl) {
-                size_t vl;
-                __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-                dim_t cdim_tmp = cdim;
-                const double* a_tmp = a;
-                double* p_tmp = p;
-                while (cdim_tmp >= 8) {
-                    __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLE "v7, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    if (kappa_cast != 1.) {
-                        __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-                        __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
-                        __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
-                        __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
-                        __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
-                        __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
-                        __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
-                        __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast));
-                    }
-                    __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                    p_tmp += 8;
-                    cdim_tmp -= 8;
-                }
-                if (cdim_tmp > 0) {
-                    a_tmp += (cdim_tmp - 1) * inca;
-                    switch (cdim_tmp) {
-                        case 7:
-                            __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
-                            a_tmp -= inca;
-                        case 6:
-                            __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
-                            a_tmp -= inca;
-                        case 5:
-                            __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
-                            a_tmp -= inca;
-                        case 4:
-                            __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
-                            a_tmp -= inca;
-                        case 3:
-                            __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
-                            a_tmp -= inca;
-                        case 2:
-                            __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
-                            a_tmp -= inca;
-                        case 1:
-                            __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
-                    }
-                    if (kappa_cast != 1.) {
-                        switch (cdim_tmp) {
-                            case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
-                            case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
-                            case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
-                            case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
-                            case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
-                            case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
-                            case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-                        }
-                    }
-                    switch (cdim_tmp) {
-                        case 7:
-                            __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                            break;
-                        case 6:
-                            __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                            break;
-                        case 5:
-                            __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                            break;
-                        case 4:
-                            __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                            break;
-                        case 3:
-                            __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                            break;
-                        case 2:
-                            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                            break;
-                        case 1:
-                            __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
-                            break;
-                    }
-                    p_tmp += cdim_tmp;
-                }
-                __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
-                for (size_t i = 0; i < vl; ++i) {
-                    __asm__(VSE "v8, (%0)" : : "r"(p_tmp));
-                    p_tmp += ldp;
-                }
-                a += vl;
-                p += vl * ldp;
-                avl -= vl;
-            }
-            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            for (size_t i = n; i < n_max; ++i) {
-                __asm__(VSE "v8, (%0)" : : "r"(p));
-                p += ldp;
-            }
-        }
-        else {
-            inca *= FLT_SIZE;
-            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            for (size_t i = 0; i < n; ++i) {
-                __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
-                if (inca == FLT_SIZE) {
-                    __asm__(VLE "v0, (%0)" : : "r"(a));
-                }
-                else {
-                    __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
-                }
-                if (kappa_cast != 1.) {
-                    __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
-                }
-                __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-                __asm__(VSE "v0, (%0)" : : "r"(p));
-                a += lda;
-                p += ldp;
-            }
-            __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            for (size_t i = n; i < n_max; ++i) {
-                __asm__(VSE "v0, (%0)" : : "r"(p));
-                p += ldp;
-            }
-        }
-    }
-    // generic kernel
-    else
-    {
-        bli_ddpackm_sifive_x280_ref
-        (
-          conja,
-          schema,
-          cdim,
-          cdim_max,
-          cdim_bcast,
-          n,
-          n_max,
-          kappa,
-          a, inca, lda,
-          p,       ldp,
-          params,
-          cntx
-        );
-    }
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-#undef VSSSEG8
-#undef VSSSEG7
-#undef VSSSEG6
-#undef VSSSEG5
-#undef VSSSEG4
-#undef VSSSEG3
-#undef VSSSEG2
-#undef NR
-
-#define FLT_SIZE 4
-#define VLSEG2 "vlseg2e32.v "
-#define VLSSEG2 "vlsseg2e32.v "
-#define VSSEG2 "vsseg2e32.v "
-#define VSSSEG2 "vssseg2e32.v "
-#define VSSSEG4 "vssseg4e32.v "
-#define VSSSEG6 "vssseg6e32.v "
-#define VSSSEG8 "vssseg8e32.v "
-#define NR 32
-
-void bli_cpackm_sifive_x280_asm_6m2
-     (
-             conj_t           conja,
-             pack_t           schema,
-             dim_t            cdim,
-             dim_t            cdim_max,
-             dim_t            cdim_bcast,
-             dim_t            n,
-             dim_t            n_max,
-       const void*   restrict kappa_,
-       const void*   restrict a_, inc_t inca, inc_t lda,
-             void*   restrict p_,             inc_t ldp,
-       const void*   restrict params,
-       const cntx_t*          cntx
-     )
-{
-    (void) cntx;
-    const scomplex* kappa = kappa_;
-    const scomplex* a = a_;
-    scomplex* p = p_;
-
-    scomplex kappa_cast = *kappa;
-
-    // MRxk kernel
-    if (cdim <= 6 && cdim_max == 6 && cdim_bcast == 1)
-    {
-        if (lda == 1) {
-            __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
-            if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
-                switch (cdim) {
-                    case 0:
-                        __asm__("vmv.v.i v0, 0");
-                        __asm__("vmv.v.i v1, 0");
-                    case 1:
-                        __asm__("vmv.v.i v2, 0");
-                        __asm__("vmv.v.i v3, 0");
-                    case 2:
-                        __asm__("vmv.v.i v4, 0");
-                        __asm__("vmv.v.i v5, 0");
-                    case 3:
-                        __asm__("vmv.v.i v6, 0");
-                        __asm__("vmv.v.i v7, 0");
-                    case 4:
-                        __asm__("vmv.v.i v8, 0");
-                        __asm__("vmv.v.i v9, 0");
-                    case 5:
-                        __asm__("vmv.v.i v10, 0");
-                        __asm__("vmv.v.i v11, 0");
-                }
-            }
-            else {
-                switch (cdim) {
-                    case 0:
-                        __asm__("vmv.v.i v12, 0");
-                        __asm__("vmv.v.i v13, 0");
-                    case 1:
-                        __asm__("vmv.v.i v14, 0");
-                        __asm__("vmv.v.i v15, 0");
-                    case 2:
-                        __asm__("vmv.v.i v16, 0");
-                        __asm__("vmv.v.i v17, 0");
-                    case 3:
-                        __asm__("vmv.v.i v18, 0");
-                        __asm__("vmv.v.i v19, 0");
-                    case 4:
-                        __asm__("vmv.v.i v20, 0");
-                        __asm__("vmv.v.i v21, 0");
-                    case 5:
-                        __asm__("vmv.v.i v22, 0");
-                        __asm__("vmv.v.i v23, 0");
-                }
-            }
-            a += (cdim - 1) * inca;
-            size_t avl = n;
-            while (avl) {
-                const scomplex* a_tmp = a;
-                size_t vl;
-                __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-                switch (cdim) {
-                    case 6:
-                        __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 5:
-                        __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 4:
-                        __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 3:
-                        __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 2:
-                        __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 1:
-                        __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
-                }
-                if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
-                    if (conja == BLIS_CONJUGATE) {
-                        switch (cdim) {
-                            case 6: __asm__("vfneg.v v11, v11");
-                            case 5: __asm__("vfneg.v v9, v9");
-                            case 4: __asm__("vfneg.v v7, v7");
-                            case 3: __asm__("vfneg.v v5, v5");
-                            case 2: __asm__("vfneg.v v3, v3");
-                            case 1: __asm__("vfneg.v v1, v1");
-                        }
-                    }
-                    __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
-                    __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
-                }
-                else {
-                    if (conja == BLIS_NO_CONJUGATE) {
-                        switch (cdim) {
-                            case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
-                            case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
-                            case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
-                            case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
-                            case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
-                            case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
-                        }
-                    }
-                    else {
-                        switch (cdim) {
-                            case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
-                            case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
-                            case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
-                            case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
-                            case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
-                            case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
-                        }
-                    }
-                    __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
-                    __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
-                }
-                a += vl;
-                p += vl * ldp;
-                avl -= vl;
-            }
-            __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            __asm__("vmv.v.i v1, 0");
-            for (size_t i = n; i < n_max; ++i) {
-                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-                p += ldp;
-            }
-        }
-        else {
-            inca *= 2 * FLT_SIZE;
-            __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            __asm__("vmv.v.i v1, 0");
-            __asm__("vmv.v.i v2, 0");
-            __asm__("vmv.v.i v3, 0");
-            for (size_t i = 0; i < n; ++i) {
-                __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
-                if (inca == 2 * FLT_SIZE) {
-                    __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
-                }
-                else {
-                    __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
-                }
-                if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
-                    if (conja == BLIS_CONJUGATE) {
-                        __asm__("vfneg.v v1, v1");
-                    }
-                    __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-                    __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-                }
-                else {
-                    if (conja == BLIS_NO_CONJUGATE) {
-                        vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
-                    }
-                    else {
-                        vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
-                    }
-                    __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-                    __asm__(VSSEG2 "v2, (%0)" : : "r"(p));
-                }
-                a += lda;
-                p += ldp;
-            }
-            __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            __asm__("vmv.v.i v1, 0");
-            for (size_t i = n; i < n_max; ++i) {
-                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-                p += ldp;
-            }
-        }
-    }
-    // NRxk kernel
-    else if (cdim <= 32 && cdim_max == 32 && cdim_bcast == 1)
-    {
-        if (lda == 1) {
-            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v16, 0");
-            __asm__("vmv.v.i v18, 0");
-            size_t avl = n;
-            while (avl) {
-                size_t vl;
-                __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-                dim_t cdim_tmp = cdim;
-                const scomplex* a_tmp = a;
-                scomplex* p_tmp = p;
-                while (cdim_tmp >= 4) {
-                    __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
-                        if (conja == BLIS_CONJUGATE) {
-                            __asm__("vfneg.v v1, v1");
-                            __asm__("vfneg.v v3, v3");
-                            __asm__("vfneg.v v5, v5");
-                            __asm__("vfneg.v v7, v7");
-                        }
-                        __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                    }
-                    else {
-                        if (conja == BLIS_NO_CONJUGATE) {
-                            vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
-                            vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
-                            vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
-                            vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
-                        }
-                        else {
-                            vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
-                            vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
-                            vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
-                            vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
-                        }
-                        __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                    }
-                    p_tmp += 4;
-                    cdim_tmp -= 4;
-                }
-                if (cdim_tmp > 0) {
-                    a_tmp += (cdim_tmp - 1) * inca;
-                    switch (cdim_tmp) {
-                        case 3:
-                            __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
-                            a_tmp -= inca;
-                        case 2:
-                            __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
-                            a_tmp -= inca;
-                        case 1:
-                            __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
-                    }
-                    if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
-                        if (conja == BLIS_CONJUGATE) {
-                            switch (cdim_tmp) {
-                                case 3: __asm__("vfneg.v v5, v5");
-                                case 2: __asm__("vfneg.v v3, v3");
-                                case 1: __asm__("vfneg.v v1, v1");
-                            }
-                        }
-                        switch (cdim_tmp) {
-                            case 3:
-                                __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                                break;
-                            case 2:
-                                __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                                break;
-                            case 1:
-                                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                                break;
-                        }
-                    }
-                    else {
-                        if (conja == BLIS_NO_CONJUGATE) {
-                            switch (cdim_tmp) {
-                                case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
-                                case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
-                                case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
-                            }
-                        }
-                        else {
-                            switch (cdim_tmp) {
-                                case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
-                                case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
-                                case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
-                            }
-                        }
-                        switch (cdim_tmp) {
-                            case 3:
-                                __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                                break;
-                            case 2:
-                                __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                                break;
-                            case 1:
-                                __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                                break;
-                        }
-                    }
-                    p_tmp += cdim_tmp;
-                }
-                __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
-                for (size_t i = 0; i < vl; ++i) {
-                    __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp));
-                    p_tmp += ldp;
-                }
-                a += vl;
-                p += vl * ldp;
-                avl -= vl;
-            }
-            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            for (size_t i = n; i < n_max; ++i) {
-                __asm__(VSSEG2 "v16, (%0)" : : "r"(p));
-                p += ldp;
-            }
-        }
-        else {
-            inca *= 2 * FLT_SIZE;
-            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            __asm__("vmv.v.i v2, 0");
-            __asm__("vmv.v.i v4, 0");
-            __asm__("vmv.v.i v6, 0");
-            for (size_t i = 0; i < n; ++i) {
-                __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
-                if (inca == 2 * FLT_SIZE) {
-                    __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
-                }
-                else {
-                    __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
-                }
-                if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
-                    if (conja == BLIS_CONJUGATE) {
-                        __asm__("vfneg.v v2, v2");
-                    }
-                    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-                    __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-                }
-                else {
-                    if (conja == BLIS_NO_CONJUGATE) {
-                        vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
-                    }
-                    else {
-                        vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
-                    }
-                    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-                    __asm__(VSSEG2 "v4, (%0)" : : "r"(p));
-                }
-                a += lda;
-                p += ldp;
-            }
-            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            __asm__("vmv.v.i v2, 0");
-            for (size_t i = n; i < n_max; ++i) {
-                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-                p += ldp;
-            }
-        }
-    }
-    // generic kernel
-    else
-    {
-        bli_ccpackm_sifive_x280_ref
-        (
-          conja,
-          schema,
-          cdim,
-          cdim_max,
-          cdim_bcast,
-          n,
-          n_max,
-          kappa,
-          a, inca, lda,
-          p,       ldp,
-          params,
-          cntx
-        );
-    }
-}
-
-#undef FLT_SIZE
-#undef VLSEG2
-#undef VLSSEG2
-#undef VSSEG2
-#undef VSSSEG2
-#undef VSSSEG4
-#undef VSSSEG6
-#undef VSSSEG8
-#undef NR
-
-#define FLT_SIZE 8
-#define VLSEG2 "vlseg2e64.v "
-#define VLSSEG2 "vlsseg2e64.v "
-#define VSSEG2 "vsseg2e64.v "
-#define VSSSEG2 "vssseg2e64.v "
-#define VSSSEG4 "vssseg4e64.v "
-#define VSSSEG6 "vssseg6e64.v "
-#define VSSSEG8 "vssseg8e64.v "
-#define NR 16
-
-void bli_zpackm_sifive_x280_asm_6m2
-     (
-             conj_t           conja,
-             pack_t           schema,
-             dim_t            cdim,
-             dim_t            cdim_max,
-             dim_t            cdim_bcast,
-             dim_t            n,
-             dim_t            n_max,
-       const void*   restrict kappa_,
-       const void*   restrict a_, inc_t inca, inc_t lda,
-             void*   restrict p_,             inc_t ldp,
-       const void*   restrict params,
-       const cntx_t*          cntx
-     )
-{
-    (void) cntx;
-    const dcomplex* kappa = kappa_;
-    const dcomplex* a = a_;
-    dcomplex* p = p_;
-
-    dcomplex kappa_cast = *kappa;
-
-    // MRxk kernel
-    if (cdim <= 6 && cdim_max == 6 && cdim_bcast == 1)
-    {
-        if (lda == 1) {
-            __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
-            if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
-                switch (cdim) {
-                    case 0:
-                        __asm__("vmv.v.i v0, 0");
-                        __asm__("vmv.v.i v1, 0");
-                    case 1:
-                        __asm__("vmv.v.i v2, 0");
-                        __asm__("vmv.v.i v3, 0");
-                    case 2:
-                        __asm__("vmv.v.i v4, 0");
-                        __asm__("vmv.v.i v5, 0");
-                    case 3:
-                        __asm__("vmv.v.i v6, 0");
-                        __asm__("vmv.v.i v7, 0");
-                    case 4:
-                        __asm__("vmv.v.i v8, 0");
-                        __asm__("vmv.v.i v9, 0");
-                    case 5:
-                        __asm__("vmv.v.i v10, 0");
-                        __asm__("vmv.v.i v11, 0");
-                }
-            }
-            else {
-                switch (cdim) {
-                    case 0:
-                        __asm__("vmv.v.i v12, 0");
-                        __asm__("vmv.v.i v13, 0");
-                    case 1:
-                        __asm__("vmv.v.i v14, 0");
-                        __asm__("vmv.v.i v15, 0");
-                    case 2:
-                        __asm__("vmv.v.i v16, 0");
-                        __asm__("vmv.v.i v17, 0");
-                    case 3:
-                        __asm__("vmv.v.i v18, 0");
-                        __asm__("vmv.v.i v19, 0");
-                    case 4:
-                        __asm__("vmv.v.i v20, 0");
-                        __asm__("vmv.v.i v21, 0");
-                    case 5:
-                        __asm__("vmv.v.i v22, 0");
-                        __asm__("vmv.v.i v23, 0");
-                }
-            }
-            a += (cdim - 1) * inca;
-            size_t avl = n;
-            while (avl) {
-                const dcomplex* a_tmp = a;
-                size_t vl;
-                __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-                switch (cdim) {
-                    case 6:
-                        __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 5:
-                        __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 4:
-                        __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 3:
-                        __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 2:
-                        __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
-                        a_tmp -= inca;
-                    case 1:
-                        __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
-                }
-                if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
-                    if (conja == BLIS_CONJUGATE) {
-                        switch (cdim) {
-                            case 6: __asm__("vfneg.v v11, v11");
-                            case 5: __asm__("vfneg.v v9, v9");
-                            case 4: __asm__("vfneg.v v7, v7");
-                            case 3: __asm__("vfneg.v v5, v5");
-                            case 2: __asm__("vfneg.v v3, v3");
-                            case 1: __asm__("vfneg.v v1, v1");
-                        }
-                    }
-                    __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
-                    __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
-                }
-                else {
-                    if (conja == BLIS_NO_CONJUGATE) {
-                        switch (cdim) {
-                            case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
-                            case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
-                            case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
-                            case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
-                            case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
-                            case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
-                        }
-                    }
-                    else {
-                        switch (cdim) {
-                            case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
-                            case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
-                            case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
-                            case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
-                            case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
-                            case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
-                        }
-                    }
-                    __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
-                    __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
-                }
-                a += vl;
-                p += vl * ldp;
-                avl -= vl;
-            }
-            __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            __asm__("vmv.v.i v1, 0");
-            for (size_t i = n; i < n_max; ++i) {
-                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-                p += ldp;
-            }
-        }
-        else {
-            inca *= 2 * FLT_SIZE;
-            __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            __asm__("vmv.v.i v1, 0");
-            __asm__("vmv.v.i v2, 0");
-            __asm__("vmv.v.i v3, 0");
-            for (size_t i = 0; i < n; ++i) {
-                __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
-                if (inca == 2 * FLT_SIZE) {
-                    __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
-                }
-                else {
-                    __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
-                }
-                if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
-                    if (conja == BLIS_CONJUGATE) {
-                        __asm__("vfneg.v v1, v1");
-                    }
-                    __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-                    __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-                }
-                else {
-                    if (conja == BLIS_NO_CONJUGATE) {
-                        vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
-                    }
-                    else {
-                        vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
-                    }
-                    __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-                    __asm__(VSSEG2 "v2, (%0)" : : "r"(p));
-                }
-                a += lda;
-                p += ldp;
-            }
-            __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            __asm__("vmv.v.i v1, 0");
-            for (size_t i = n; i < n_max; ++i) {
-                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-                p += ldp;
-            }
-        }
-    }
-    // NRxk kernel
-    else if (cdim <= 16 && cdim_max == 16 && cdim_bcast == 1)
-    {
-        if (lda == 1) {
-            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v16, 0");
-            __asm__("vmv.v.i v18, 0");
-            size_t avl = n;
-            while (avl) {
-                size_t vl;
-                __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
-                dim_t cdim_tmp = cdim;
-                const dcomplex* a_tmp = a;
-                dcomplex* p_tmp = p;
-                while (cdim_tmp >= 4) {
-                    __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
-                    a_tmp += inca;
-                    if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
-                        if (conja == BLIS_CONJUGATE) {
-                            __asm__("vfneg.v v1, v1");
-                            __asm__("vfneg.v v3, v3");
-                            __asm__("vfneg.v v5, v5");
-                            __asm__("vfneg.v v7, v7");
-                        }
-                        __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                    }
-                    else {
-                        if (conja == BLIS_NO_CONJUGATE) {
-                            vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
-                            vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
-                            vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
-                            vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
-                        }
-                        else {
-                            vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
-                            vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
-                            vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
-                            vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
-                        }
-                        __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                    }
-                    p_tmp += 4;
-                    cdim_tmp -= 4;
-                }
-                if (cdim_tmp > 0) {
-                    a_tmp += (cdim_tmp - 1) * inca;
-                    switch (cdim_tmp) {
-                        case 3:
-                            __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
-                            a_tmp -= inca;
-                        case 2:
-                            __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
-                            a_tmp -= inca;
-                        case 1:
-                            __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
-                    }
-                    if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
-                        if (conja == BLIS_CONJUGATE) {
-                            switch (cdim_tmp) {
-                                case 3: __asm__("vfneg.v v5, v5");
-                                case 2: __asm__("vfneg.v v3, v3");
-                                case 1: __asm__("vfneg.v v1, v1");
-                            }
-                        }
-                        switch (cdim_tmp) {
-                            case 3:
-                                __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                                break;
-                            case 2:
-                                __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                                break;
-                            case 1:
-                                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                                break;
-                        }
-                    }
-                    else {
-                        if (conja == BLIS_NO_CONJUGATE) {
-                            switch (cdim_tmp) {
-                                case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
-                                case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
-                                case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
-                            }
-                        }
-                        else {
-                            switch (cdim_tmp) {
-                                case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
-                                case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
-                                case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
-                            }
-                        }
-                        switch (cdim_tmp) {
-                            case 3:
-                                __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                                break;
-                            case 2:
-                                __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                                break;
-                            case 1:
-                                __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
-                                break;
-                        }
-                    }
-                    p_tmp += cdim_tmp;
-                }
-                __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
-                for (size_t i = 0; i < vl; ++i) {
-                    __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp));
-                    p_tmp += ldp;
-                }
-                a += vl;
-                p += vl * ldp;
-                avl -= vl;
-            }
-            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            for (size_t i = n; i < n_max; ++i) {
-                __asm__(VSSEG2 "v16, (%0)" : : "r"(p));
-                p += ldp;
-            }
-        }
-        else {
-            inca *= 2 * FLT_SIZE;
-            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            __asm__("vmv.v.i v2, 0");
-            __asm__("vmv.v.i v4, 0");
-            __asm__("vmv.v.i v6, 0");
-            for (size_t i = 0; i < n; ++i) {
-                __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
-                if (inca == 2 * FLT_SIZE) {
-                    __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
-                }
-                else {
-                    __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
-                }
-                if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
-                    if (conja == BLIS_CONJUGATE) {
-                        __asm__("vfneg.v v2, v2");
-                    }
-                    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-                    __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-                }
-                else {
-                    if (conja == BLIS_NO_CONJUGATE) {
-                        vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
-                    }
-                    else {
-                        vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
-                    }
-                    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-                    __asm__(VSSEG2 "v4, (%0)" : : "r"(p));
-                }
-                a += lda;
-                p += ldp;
-            }
-            __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
-            __asm__("vmv.v.i v0, 0");
-            __asm__("vmv.v.i v2, 0");
-            for (size_t i = n; i < n_max; ++i) {
-                __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
-                p += ldp;
-            }
-        }
-    }
-    // generic kernel
-    else
-    {
-        bli_zzpackm_sifive_x280_ref
-        (
-          conja,
-          schema,
-          cdim,
-          cdim_max,
-          cdim_bcast,
-          n,
-          n_max,
-          kappa,
-          a, inca, lda,
-          p,       ldp,
-          params,
-          cntx
-        );
-    }
-}
diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c
new file mode 100644
index 000000000..119872197
--- /dev/null
+++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c
@@ -0,0 +1,168 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include "../../riscv_cmul_macros_intr.h"
+#include "../../riscv_overloaded_intrinsics.h"
+#include "blis.h"
+#include <stdint.h>
+#include <riscv_vector.h>
+
+#define PACKM_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##packm_sifive_x280_intr(\
+         conj_t           conja,                     \
+         pack_t           schema,                    \
+         dim_t            cdim,                      \
+         dim_t            cdim_max,                  \
+         dim_t            cdim_bcast,                \
+         dim_t            n,                         \
+         dim_t            n_max,                     \
+   const T*      restrict kappa_,                    \
+   const T*      restrict a_, inc_t inca, inc_t lda, \
+         T*      restrict p_,             inc_t ldp, \
+   const T*      restrict params,                    \
+   const cntx_t*          cntx                       \
+)
+// PACKM_ (above) emits only the kernel's function header; PACKM adds one level of indirection so that macro arguments such as PRECISION_CHAR are expanded before ## pasting.
+#define PACKM(...)  PACKM_(__VA_ARGS__)
+
+#define REF_KERNEL_(PRECISION_CHAR) bli_##PRECISION_CHAR##PRECISION_CHAR##packm_sifive_x280_ref
+#define REF_KERNEL(PRECISION_CHAR) REF_KERNEL_(PRECISION_CHAR) // same indirection; yields e.g. bli_ccpackm_sifive_x280_ref for c
+
+// LMUL is the register group size used when a is "row major" (lda == 1).
+// Those paths use segment stores with more than 4 fields, and RVV requires
+// NFIELDS * LMUL <= 8, so it is m1 here. LMUL_MR is an LMUL large enough to hold
+// MR floats (spackm, cpackm) or doubles (dpackm, zpackm); LMUL_NR is analogous.
+
+// Single precision real (float): parameters for the real template included below; all are #undef'd afterwards so the next section can redefine them.
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32 // element width in bits
+#define LMUL m1
+#define LMUL_MR m1
+#define LMUL_NR m4
+#define FLT_SIZE sizeof(float)
+#define MR 7
+#define NR 64
+
+#include "./bli_packm_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef LMUL_MR
+#undef LMUL_NR
+#undef FLT_SIZE
+#undef MR
+#undef NR
+
+// Double precision real (double): same real template as above, re-included with 64-bit parameters.
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64 // element width in bits
+#define LMUL m1
+#define LMUL_MR m1
+#define LMUL_NR m4
+#define FLT_SIZE sizeof(double)
+#define MR 7
+#define NR 32
+
+#include "./bli_packm_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef LMUL_MR
+#undef LMUL_NR
+#undef FLT_SIZE
+#undef MR
+#undef NR
+
+// Single precision complex (scomplex): instantiates bli_cpackm_sifive_x280_intr via PACKM(PRECISION_CHAR, void) in the complex template included below.
+#define DATATYPE scomplex
+#define BASE_DT float // real component type; the template casts complex pointers to BASE_DT* for segment loads/stores
+#define PRECISION_CHAR c
+#define PREC 32 // element width in bits of one real component
+#define LMUL m1
+#define LMUL_MR m1
+#define LMUL_NR m2
+#define FLT_SIZE sizeof(float) // bytes per real component; the template uses 2 * FLT_SIZE for complex byte strides
+#define MR 6 // cdim_max == MR selects the MRxk path, whose row-major branch is unrolled for exactly 6 rows
+#define NR 32 // cdim_max == NR selects the NRxk path
+
+#include "./bli_packm_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef LMUL_MR
+#undef LMUL_NR
+#undef FLT_SIZE
+#undef MR
+#undef NR
+
+// Double precision complex (dcomplex): instantiates bli_zpackm_sifive_x280_intr via PACKM(PRECISION_CHAR, void) in the complex template included below.
+#define DATATYPE dcomplex
+#define BASE_DT double // real component type; the template casts complex pointers to BASE_DT* for segment loads/stores
+#define PRECISION_CHAR z
+#define PREC 64 // element width in bits of one real component
+#define LMUL m1
+#define LMUL_MR m1
+#define LMUL_NR m2
+#define FLT_SIZE sizeof(double) // bytes per real component; the template uses 2 * FLT_SIZE for complex byte strides
+#define MR 6 // cdim_max == MR selects the MRxk path, whose row-major branch is unrolled for exactly 6 rows
+#define NR 16 // cdim_max == NR selects the NRxk path
+
+#include "./bli_packm_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef LMUL_MR
+#undef LMUL_NR
+#undef FLT_SIZE
+#undef MR
+#undef NR
+
+#undef REF_KERNEL_ // helper macros are no longer needed after the last template include
+#undef REF_KERNEL
+
+#undef PACKM
+#undef PACKM_
diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..ee49090dc
--- /dev/null
+++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c
@@ -0,0 +1,545 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef PACKM
+
+PACKM(PRECISION_CHAR, void)
+{
+    (void) schema; // Suppress unused parameter warnings
+    (void) params;
+    (void) cntx;
+    const DATATYPE* restrict kappa = kappa_;
+    const DATATYPE* restrict a = a_;
+    DATATYPE* restrict p = p_;
+
+    // MRxk kernel
+    if (cdim <= MR && cdim_max == MR && cdim_bcast == 1)
+    {
+        if (lda == 1) {
+            // a is "row major"
+            RVV_TYPE_F(PREC, LMUL) arow0_r, arow1_r, arow2_r, arow3_r, arow4_r, arow5_r;
+            RVV_TYPE_F(PREC, LMUL) arow0_i, arow1_i, arow2_i, arow3_i, arow4_i, arow5_i;
+            RVV_TYPE_F(PREC, LMUL) kappa_arow0_r, kappa_arow1_r, kappa_arow2_r,
+                                   kappa_arow3_r, kappa_arow4_r, kappa_arow5_r;
+            RVV_TYPE_F(PREC, LMUL) kappa_arow0_i, kappa_arow1_i, kappa_arow2_i,
+                                   kappa_arow3_i, kappa_arow4_i, kappa_arow5_i;
+            // pad lower edge
+            if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) {
+                switch (cdim) {
+                case 0:
+                    arow0_r = VFMV_V_F(PREC, LMUL)(0., n);
+                    arow0_i = VFMV_V_F(PREC, LMUL)(0., n);
+                case 1:
+                    arow1_r = VFMV_V_F(PREC, LMUL)(0., n);
+                    arow1_i = VFMV_V_F(PREC, LMUL)(0., n);
+                case 2:
+                    arow2_r = VFMV_V_F(PREC, LMUL)(0., n);
+                    arow2_i = VFMV_V_F(PREC, LMUL)(0., n);
+                case 3:
+                    arow3_r = VFMV_V_F(PREC, LMUL)(0., n);
+                    arow3_i = VFMV_V_F(PREC, LMUL)(0., n);
+                case 4:
+                    arow4_r = VFMV_V_F(PREC, LMUL)(0., n);
+                    arow4_i = VFMV_V_F(PREC, LMUL)(0., n);
+                case 5:
+                    arow5_r = VFMV_V_F(PREC, LMUL)(0., n);
+                    arow5_i = VFMV_V_F(PREC, LMUL)(0., n);
+                }
+            } else {
+                switch (cdim) {
+                case 0:
+                    kappa_arow0_r = VFMV_V_F(PREC, LMUL)(0., n);
+                    kappa_arow0_i = VFMV_V_F(PREC, LMUL)(0., n);
+                case 1:
+                    kappa_arow1_r = VFMV_V_F(PREC, LMUL)(0., n);
+                    kappa_arow1_i = VFMV_V_F(PREC, LMUL)(0., n);
+                case 2:
+                    kappa_arow2_r = VFMV_V_F(PREC, LMUL)(0., n);
+                    kappa_arow2_i = VFMV_V_F(PREC, LMUL)(0., n);
+                case 3:
+                    kappa_arow3_r = VFMV_V_F(PREC, LMUL)(0., n);
+                    kappa_arow3_i = VFMV_V_F(PREC, LMUL)(0., n);
+                case 4:
+                    kappa_arow4_r = VFMV_V_F(PREC, LMUL)(0., n);
+                    kappa_arow4_i = VFMV_V_F(PREC, LMUL)(0., n);
+                case 5:
+                    kappa_arow5_r = VFMV_V_F(PREC, LMUL)(0., n);
+                    kappa_arow5_i = VFMV_V_F(PREC, LMUL)(0., n);
+                }
+            }
+
+            size_t avl = n;
+            while (avl) {
+                size_t vl = VSETVL(PREC, LMUL)(avl);
+                RVV_TYPE_FX(PREC, LMUL, 2) arow_vec;
+                switch (cdim) {
+                case 6:
+                    arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 5 * inca), vl);
+                    arow5_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0);
+                    arow5_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1);
+                case 5:
+                    arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 4 * inca), vl);
+                    arow4_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0);
+                    arow4_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1);
+                case 4:
+                    arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 3 * inca), vl);
+                    arow3_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0);
+                    arow3_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1);
+                case 3:
+                    arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 2 * inca), vl);
+                    arow2_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0);
+                    arow2_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1);
+                case 2:
+                    arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 1 * inca), vl);
+                    arow1_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0);
+                    arow1_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1);
+                case 1:
+                    arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 0 * inca), vl);
+                    arow0_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0);
+                    arow0_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1);
+                }
+
+                if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) {
+                    if (bli_is_conj(conja)) {
+                        switch (cdim) {
+                        case 6:
+                            arow5_i = VFNEG_VF(PREC, LMUL)(arow5_i, vl);
+                        case 5:
+                            arow4_i = VFNEG_VF(PREC, LMUL)(arow4_i, vl);
+                        case 4:
+                            arow3_i = VFNEG_VF(PREC, LMUL)(arow3_i, vl);
+                        case 3:
+                            arow2_i = VFNEG_VF(PREC, LMUL)(arow2_i, vl);
+                        case 2:
+                            arow1_i = VFNEG_VF(PREC, LMUL)(arow1_i, vl);
+                        case 1:
+                            arow0_i = VFNEG_VF(PREC, LMUL)(arow0_i, vl);
+                        }
+                    }
+
+                    RVV_TYPE_FX(PREC, LMUL, 6) ablock = VUNDEFINED_FX(PREC, LMUL, 6)();
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 0, arow0_r);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 1, arow0_i);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 2, arow1_r);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 3, arow1_i);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 4, arow2_r);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 5, arow2_i);
+                    VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*) p, 2 * FLT_SIZE * ldp, ablock, vl);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 0, arow3_r);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 1, arow3_i);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 2, arow4_r);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 3, arow4_i);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 4, arow5_r);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 5, arow5_i);
+                    VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*)(p + 3), 2 * FLT_SIZE * ldp, ablock, vl);
+                } else {
+                    if (bli_is_conj(conja)) {
+                        switch (cdim) {
+                        case 6:
+                            VCMUL_VF_CONJ(PREC, LMUL, kappa_arow5_r, kappa_arow5_i, arow5_r, arow5_i, kappa->real, kappa->imag, vl);
+                        case 5:
+                            VCMUL_VF_CONJ(PREC, LMUL, kappa_arow4_r, kappa_arow4_i, arow4_r, arow4_i, kappa->real, kappa->imag, vl);
+                        case 4:
+                            VCMUL_VF_CONJ(PREC, LMUL, kappa_arow3_r, kappa_arow3_i, arow3_r, arow3_i, kappa->real, kappa->imag, vl);
+                        case 3:
+                            VCMUL_VF_CONJ(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl);
+                        case 2:
+                            VCMUL_VF_CONJ(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl);
+                        case 1:
+                            VCMUL_VF_CONJ(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl);
+                        }
+                    } else {
+                        switch (cdim) {
+                        case 6:
+                            VCMUL_VF(PREC, LMUL, kappa_arow5_r, kappa_arow5_i, arow5_r, arow5_i, kappa->real, kappa->imag, vl);
+                        case 5:
+                            VCMUL_VF(PREC, LMUL, kappa_arow4_r, kappa_arow4_i, arow4_r, arow4_i, kappa->real, kappa->imag, vl);
+                        case 4:
+                            VCMUL_VF(PREC, LMUL, kappa_arow3_r, kappa_arow3_i, arow3_r, arow3_i, kappa->real, kappa->imag, vl);
+                        case 3:
+                            VCMUL_VF(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl);
+                        case 2:
+                            VCMUL_VF(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl);
+                        case 1:
+                            VCMUL_VF(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl);
+                        }
+                    }
+
+                    RVV_TYPE_FX(PREC, LMUL, 6) ablock = VUNDEFINED_FX(PREC, LMUL, 6)();
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 0, kappa_arow0_r);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 1, kappa_arow0_i);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 2, kappa_arow1_r);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 3, kappa_arow1_i);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 4, kappa_arow2_r);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 5, kappa_arow2_i);
+                    VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*) p, 2 * FLT_SIZE * ldp, ablock, vl);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 0, kappa_arow3_r);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 1, kappa_arow3_i);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 2, kappa_arow4_r);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 3, kappa_arow4_i);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 4, kappa_arow5_r);
+                    ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 5, kappa_arow5_i);
+                    VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*)(p + 3), 2 * FLT_SIZE * ldp, ablock, vl);
+                }
+
+                a += vl;
+                p += vl * ldp;
+                avl -= vl;
+            }
+            
+            RVV_TYPE_FX(PREC, LMUL_MR, 2) zero_padding = VUNDEFINED_FX(PREC, LMUL_MR, 2)();
+            zero_padding = VSET_V_F(PREC, LMUL_MR, 2)(zero_padding, 0, VFMV_V_F(PREC, LMUL_MR)(0., cdim_max));
+            zero_padding = VSET_V_F(PREC, LMUL_MR, 2)(zero_padding, 1, VFMV_V_F(PREC, LMUL_MR)(0., cdim_max));
+            for (size_t i = n; i < n_max; ++i) {
+                VSSEG2_V_F(PREC, LMUL_MR, 2)((BASE_DT*) p, zero_padding, cdim_max);
+                p += ldp;
+            }
+        }
+        else {
+            RVV_TYPE_FX(PREC, LMUL_MR, 2) zero_padding = VUNDEFINED_FX(PREC, LMUL_MR, 2)();
+            zero_padding = VSET_V_F(PREC, LMUL_MR, 2)(zero_padding, 0, VFMV_V_F(PREC, LMUL_MR)(0., cdim_max));
+            zero_padding = VSET_V_F(PREC, LMUL_MR, 2)(zero_padding, 1, VFMV_V_F(PREC, LMUL_MR)(0., cdim_max));
+
+            for (size_t i = 0; i < n; ++i) {
+                RVV_TYPE_FX(PREC, LMUL_MR, 2) acol;
+                if (inca == 1)
+                    acol = VLSEG2_V_F_TU(PREC, LMUL_MR, 2)(zero_padding, (BASE_DT*) a, cdim);
+                else
+                    acol = VLSSEG2_V_F_TU(PREC, LMUL_MR, 2)(zero_padding, (BASE_DT*) a, 2 * FLT_SIZE * inca, cdim);
+                RVV_TYPE_F(PREC, LMUL_MR) acol_r = VGET_V_F(PREC, LMUL_MR, 2)(acol, 0); 
+                RVV_TYPE_F(PREC, LMUL_MR) acol_i = VGET_V_F(PREC, LMUL_MR, 2)(acol, 1); 
+
+                if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) {
+                    if (bli_is_conj(conja)) {
+                        acol_i = VFNEG_VF_TU(PREC, LMUL_MR)(acol_i, acol_i, cdim);
+                        acol = VSET_V_F(PREC, LMUL_MR, 2)(acol, 0, acol_r);
+                        acol = VSET_V_F(PREC, LMUL_MR, 2)(acol, 1, acol_i);
+                    }
+                } else {
+                    RVV_TYPE_F(PREC, LMUL_MR) kappa_acol_r, kappa_acol_i;
+                    if (bli_is_conj(conja))
+                        VCMUL_VF_CONJ_TU(PREC, LMUL_MR, kappa_acol_r, kappa_acol_i, acol_r, acol_i, kappa->real, kappa->imag, cdim);
+                    else
+                        VCMUL_VF_TU(PREC, LMUL_MR, kappa_acol_r, kappa_acol_i, acol_r, acol_i, kappa->real, kappa->imag, cdim);
+                    acol = VSET_V_F(PREC, LMUL_MR, 2)(acol, 0, kappa_acol_r);
+                    acol = VSET_V_F(PREC, LMUL_MR, 2)(acol, 1, kappa_acol_i);
+                }
+
+                VSSEG2_V_F(PREC, LMUL_MR, 2)((BASE_DT*) p, acol, cdim_max);
+                 
+                a += lda;
+                p += ldp;
+            }
+
+            for (size_t i = n; i < n_max; ++i) {
+                VSSEG2_V_F(PREC, LMUL_MR, 2)((BASE_DT*) p, zero_padding, cdim_max);
+                p += ldp;
+            }
+        }
+    }
+    // NRxk kernel
+    else if (cdim <= NR && cdim_max == NR && cdim_bcast == 1)
+    {
+        if (lda == 1) {
+            // a is "row major"
+            RVV_TYPE_FX(PREC, LMUL_NR, 2) zero_padding = VUNDEFINED_FX(PREC, LMUL_NR, 2)();
+            zero_padding = VSET_V_F(PREC, LMUL_NR, 2)(zero_padding, 0, VFMV_V_F(PREC, LMUL_NR)(0., cdim_max));
+            zero_padding = VSET_V_F(PREC, LMUL_NR, 2)(zero_padding, 1, VFMV_V_F(PREC, LMUL_NR)(0., cdim_max));
+            size_t avl = n;
+            while (avl) {
+                size_t vl = VSETVL(PREC, LMUL)(avl);
+                dim_t cdim_tmp = cdim;
+                const DATATYPE* restrict a_tmp = a;
+                DATATYPE* restrict p_tmp = p;
+                while (cdim_tmp >= 4) {
+                    RVV_TYPE_FX(PREC, LMUL, 2) arow_vec;
+                    RVV_TYPE_F(PREC, LMUL) arow0_r, arow1_r, arow2_r, arow3_r;
+                    RVV_TYPE_F(PREC, LMUL) arow0_i, arow1_i, arow2_i, arow3_i;
+                    arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 0 * inca), vl);
+                    arow0_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0);
+                    arow0_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1);
+                    arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 1 * inca), vl);
+                    arow1_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0);
+                    arow1_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1);
+                    arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 2 * inca), vl);
+                    arow2_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0);
+                    arow2_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1);
+                    arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 3 * inca), vl);
+                    arow3_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0);
+                    arow3_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1);
+
+                    if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) {
+                        if (bli_is_conj(conja)) {
+                            arow0_i = VFNEG_VF(PREC, LMUL)(arow0_i, vl);
+                            arow1_i = VFNEG_VF(PREC, LMUL)(arow1_i, vl);
+                            arow2_i = VFNEG_VF(PREC, LMUL)(arow2_i, vl);
+                            arow3_i = VFNEG_VF(PREC, LMUL)(arow3_i, vl);
+                        }
+
+                        RVV_TYPE_FX(PREC, LMUL, 8) ablock = VUNDEFINED_FX(PREC, LMUL, 8)();
+                        ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 0, arow0_r);
+                        ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 1, arow0_i);
+                        ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 2, arow1_r);
+                        ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 3, arow1_i);
+                        ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 4, arow2_r);
+                        ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 5, arow2_i);
+                        ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 6, arow3_r);
+                        ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 7, arow3_i);
+                        VSSSEG8_V_F(PREC, LMUL, 8)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock, vl);
+                    } else {
+                        RVV_TYPE_F(PREC, LMUL) kappa_arow0_r, kappa_arow1_r, kappa_arow2_r, kappa_arow3_r;
+                        RVV_TYPE_F(PREC, LMUL) kappa_arow0_i, kappa_arow1_i, kappa_arow2_i, kappa_arow3_i;
+                        if (bli_is_conj(conja)) {
+                            VCMUL_VF_CONJ(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl);
+                            VCMUL_VF_CONJ(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl);
+                            VCMUL_VF_CONJ(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl);
+                            VCMUL_VF_CONJ(PREC, LMUL, kappa_arow3_r, kappa_arow3_i, arow3_r, arow3_i, kappa->real, kappa->imag, vl);
+                        } else {
+                            VCMUL_VF(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl);
+                            VCMUL_VF(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl);
+                            VCMUL_VF(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl);
+                            VCMUL_VF(PREC, LMUL, kappa_arow3_r, kappa_arow3_i, arow3_r, arow3_i, kappa->real, kappa->imag, vl);
+                        }
+
+                        RVV_TYPE_FX(PREC, LMUL, 8) ablock = VUNDEFINED_FX(PREC, LMUL, 8)();
+                        ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 0, kappa_arow0_r);
+                        ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 1, kappa_arow0_i);
+                        ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 2, kappa_arow1_r);
+                        ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 3, kappa_arow1_i);
+                        ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 4, kappa_arow2_r);
+                        ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 5, kappa_arow2_i);
+                        ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 6, kappa_arow3_r);
+                        ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 7, kappa_arow3_i);
+                        VSSSEG8_V_F(PREC, LMUL, 8)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock, vl);
+                    }
+                    
+                    a_tmp += 4 * inca;
+                    p_tmp += 4;
+                    cdim_tmp -= 4;
+                }
+
+                if (cdim_tmp > 0) {
+                    RVV_TYPE_FX(PREC, LMUL, 2) arow_vec;
+                    RVV_TYPE_F(PREC, LMUL) arow0_r, arow1_r, arow2_r;
+                    RVV_TYPE_F(PREC, LMUL) arow0_i, arow1_i, arow2_i;
+                    switch (cdim_tmp) {
+                    case 3:
+                        arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 2 * inca), vl);
+                        arow2_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0);
+                        arow2_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1);
+                    case 2:
+                        arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 1 * inca), vl);
+                        arow1_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0);
+                        arow1_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1);
+                    case 1:
+                        arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 0 * inca), vl);
+                        arow0_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0);
+                        arow0_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1);
+                    }
+
+                    if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) {
+                        if (bli_is_conj(conja)) {
+                            switch (cdim_tmp) {
+                            case 3:
+                                arow2_i = VFNEG_VF(PREC, LMUL)(arow2_i, vl);
+                            case 2:
+                                arow1_i = VFNEG_VF(PREC, LMUL)(arow1_i, vl);
+                            case 1:
+                                arow0_i = VFNEG_VF(PREC, LMUL)(arow0_i, vl);
+                            }
+                        }
+
+                        RVV_TYPE_FX(PREC, LMUL, 6) ablock3 = VUNDEFINED_FX(PREC, LMUL, 6)();
+                        RVV_TYPE_FX(PREC, LMUL, 4) ablock2 = VUNDEFINED_FX(PREC, LMUL, 4)();
+                        RVV_TYPE_FX(PREC, LMUL, 2) ablock1 = VUNDEFINED_FX(PREC, LMUL, 2)();
+                        switch (cdim_tmp) {
+                        case 3:
+                            ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 0, arow0_r);
+                            ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 1, arow0_i);
+                            ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 2, arow1_r);
+                            ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 3, arow1_i);
+                            ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 4, arow2_r);
+                            ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 5, arow2_i);
+                            VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock3, vl);
+                            break;
+                        case 2:
+                            ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 0, arow0_r);
+                            ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 1, arow0_i);
+                            ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 2, arow1_r);
+                            ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 3, arow1_i);
+                            VSSSEG4_V_F(PREC, LMUL, 4)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock2, vl);
+                            break;
+                        case 1:
+                            ablock1 = VSET_V_F(PREC, LMUL, 2)(ablock1, 0, arow0_r);
+                            ablock1 = VSET_V_F(PREC, LMUL, 2)(ablock1, 1, arow0_i);
+                            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock1, vl);
+                            break;
+                        }
+                    } else {
+                        RVV_TYPE_F(PREC, LMUL) kappa_arow0_r, kappa_arow1_r, kappa_arow2_r;
+                        RVV_TYPE_F(PREC, LMUL) kappa_arow0_i, kappa_arow1_i, kappa_arow2_i;
+                        if (bli_is_conj(conja)) {
+                            switch (cdim_tmp) {
+                            case 3:
+                                VCMUL_VF_CONJ(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl);
+                            case 2:
+                                VCMUL_VF_CONJ(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl);
+                            case 1:
+                                VCMUL_VF_CONJ(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl);
+                            }
+                        } else {
+                            switch (cdim_tmp) {
+                            case 3:
+                                VCMUL_VF(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl);
+                            case 2:
+                                VCMUL_VF(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl);
+                            case 1:
+                                VCMUL_VF(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl);
+                            }
+                        }
+
+                        RVV_TYPE_FX(PREC, LMUL, 6) ablock3 = VUNDEFINED_FX(PREC, LMUL, 6)();
+                        RVV_TYPE_FX(PREC, LMUL, 4) ablock2 = VUNDEFINED_FX(PREC, LMUL, 4)();
+                        RVV_TYPE_FX(PREC, LMUL, 2) ablock1 = VUNDEFINED_FX(PREC, LMUL, 2)();
+                        switch (cdim_tmp) {
+                        case 3:
+                            ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 0, kappa_arow0_r);
+                            ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 1, kappa_arow0_i);
+                            ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 2, kappa_arow1_r);
+                            ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 3, kappa_arow1_i);
+                            ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 4, kappa_arow2_r);
+                            ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 5, kappa_arow2_i);
+                            VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock3, vl);
+                            break;
+                        case 2:
+                            ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 0, kappa_arow0_r);
+                            ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 1, kappa_arow0_i);
+                            ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 2, kappa_arow1_r);
+                            ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 3, kappa_arow1_i);
+                            VSSSEG4_V_F(PREC, LMUL, 4)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock2, vl);
+                            break;
+                        case 1:
+                            ablock1 = VSET_V_F(PREC, LMUL, 2)(ablock1, 0, kappa_arow0_r);
+                            ablock1 = VSET_V_F(PREC, LMUL, 2)(ablock1, 1, kappa_arow0_i);
+                            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock1, vl);
+                            break;
+                        }
+                    }
+                    
+                    p_tmp += cdim_tmp;
+                }
+
+                // pad lower edge
+                for (size_t i = 0; i < vl; ++i) {
+                    VSSEG2_V_F(PREC, LMUL_NR, 2)((BASE_DT*) p_tmp, zero_padding, cdim_max - cdim);
+                    p_tmp += ldp;
+                }
+
+                a += vl;
+                p += vl * ldp;
+                avl -= vl;
+            }
+            
+            // pad right edge
+            for (size_t i = n; i < n_max; ++i) {
+                VSSEG2_V_F(PREC, LMUL_NR, 2)((BASE_DT*) p, zero_padding, cdim_max);
+                p += ldp;
+            }
+        } else {
+            RVV_TYPE_FX(PREC, LMUL_NR, 2) zero_padding = VUNDEFINED_FX(PREC, LMUL_NR, 2)();
+            zero_padding = VSET_V_F(PREC, LMUL_NR, 2)(zero_padding, 0, VFMV_V_F(PREC, LMUL_NR)(0., cdim_max));
+            zero_padding = VSET_V_F(PREC, LMUL_NR, 2)(zero_padding, 1, VFMV_V_F(PREC, LMUL_NR)(0., cdim_max));
+
+            for (size_t i = 0; i < n; ++i) {
+                RVV_TYPE_FX(PREC, LMUL_NR, 2) acol;
+                if (inca == 1)
+                    acol = VLSEG2_V_F_TU(PREC, LMUL_NR, 2)(zero_padding, (BASE_DT*) a, cdim);
+                else
+                    acol = VLSSEG2_V_F_TU(PREC, LMUL_NR, 2)(zero_padding, (BASE_DT*) a, 2 * FLT_SIZE * inca, cdim);
+                RVV_TYPE_F(PREC, LMUL_NR) acol_r = VGET_V_F(PREC, LMUL_NR, 2)(acol, 0); 
+                RVV_TYPE_F(PREC, LMUL_NR) acol_i = VGET_V_F(PREC, LMUL_NR, 2)(acol, 1); 
+
+                if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) {
+                    if (bli_is_conj(conja)) {
+                        acol_i = VFNEG_VF_TU(PREC, LMUL_NR)(acol_i, acol_i, cdim);
+                        acol = VSET_V_F(PREC, LMUL_NR, 2)(acol, 0, acol_r);
+                        acol = VSET_V_F(PREC, LMUL_NR, 2)(acol, 1, acol_i);
+                    }
+                } else {
+                    RVV_TYPE_F(PREC, LMUL_NR) kappa_acol_r, kappa_acol_i;
+                    if (bli_is_conj(conja))
+                        VCMUL_VF_CONJ_TU(PREC, LMUL_NR, kappa_acol_r, kappa_acol_i, acol_r, acol_i, kappa->real, kappa->imag, cdim);
+                    else
+                        VCMUL_VF_TU(PREC, LMUL_NR, kappa_acol_r, kappa_acol_i, acol_r, acol_i, kappa->real, kappa->imag, cdim);
+                    acol = VSET_V_F(PREC, LMUL_NR, 2)(acol, 0, kappa_acol_r);
+                    acol = VSET_V_F(PREC, LMUL_NR, 2)(acol, 1, kappa_acol_i);
+                }
+
+                VSSEG2_V_F(PREC, LMUL_NR, 2)((BASE_DT*) p, acol, cdim_max);
+                 
+                a += lda;
+                p += ldp;
+            }
+
+            for (size_t i = n; i < n_max; ++i) {
+                VSSEG2_V_F(PREC, LMUL_NR, 2)((BASE_DT*) p, zero_padding, cdim_max);
+                p += ldp;
+            }
+        }
+    }
+    // generic kernel
+    else
+    {
+        REF_KERNEL(PRECISION_CHAR)
+        (
+          conja,
+          schema,
+          cdim,
+          cdim_max,
+          cdim_bcast,
+          n,
+          n_max,
+          kappa,
+          a, inca, lda,
+          p,       ldp,
+          params,
+          cntx
+        );
+    }
+
+    return;
+}
+
+#endif // PACKM
diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c
new file mode 100644
index 000000000..741714d60
--- /dev/null
+++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c
@@ -0,0 +1,364 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef PACKM
+
+PACKM(PRECISION_CHAR, void)
+{
+    (void) conja; // Suppress unused parameter warnings (these four are only forwarded to REF_KERNEL)
+    (void) schema;
+    (void) params;
+    (void) cntx;
+    const DATATYPE* restrict kappa = kappa_;
+    const DATATYPE* restrict a = a_;
+    DATATYPE* restrict p = p_;
+    // Packs cdim x n elements of a, scaled by kappa, into p (column stride ldp), zero-padded to cdim_max x n_max.
+    // MRxk kernel (cdim_max == MR, cdim_bcast == 1)
+    if (cdim <= MR && cdim_max == MR && cdim_bcast == 1)
+    {
+        if (lda == 1) {
+            // a is "row major": lda == 1, so each row is contiguous and consecutive rows are inca apart
+            // pad the lower edge with zeros: the cases fall through on purpose, zeroing rows cdim..6
+            RVV_TYPE_F(PREC, LMUL) arow0, arow1, arow2, arow3, arow4, arow5, arow6;
+            switch (cdim) {
+            case 0:
+                arow0 = VFMV_V_F(PREC, LMUL)(0., n);
+            case 1:
+                arow1 = VFMV_V_F(PREC, LMUL)(0., n);
+            case 2:
+                arow2 = VFMV_V_F(PREC, LMUL)(0., n);
+            case 3:
+                arow3 = VFMV_V_F(PREC, LMUL)(0., n);
+            case 4:
+                arow4 = VFMV_V_F(PREC, LMUL)(0., n);
+            case 5:
+                arow5 = VFMV_V_F(PREC, LMUL)(0., n);
+            case 6:
+                arow6 = VFMV_V_F(PREC, LMUL)(0., n);
+            }
+            // walk the n columns in strips of vl; this path hard-codes 7 rows (arow0..arow6, VSSSEG7)
+            size_t avl = n;
+            while (avl) {
+                size_t vl = VSETVL(PREC, LMUL)(avl);
+                switch (cdim) {
+                    case 7:
+                        arow6 = VLE_V_F(PREC, LMUL)(a + 6 * inca, vl);
+                    case 6:
+                        arow5 = VLE_V_F(PREC, LMUL)(a + 5 * inca, vl);
+                    case 5:
+                        arow4 = VLE_V_F(PREC, LMUL)(a + 4 * inca, vl);
+                    case 4:
+                        arow3 = VLE_V_F(PREC, LMUL)(a + 3 * inca, vl);
+                    case 3:
+                        arow2 = VLE_V_F(PREC, LMUL)(a + 2 * inca, vl);
+                    case 2:
+                        arow1 = VLE_V_F(PREC, LMUL)(a + 1 * inca, vl);
+                    case 1:
+                        arow0 = VLE_V_F(PREC, LMUL)(a + 0 * inca, vl);
+                }
+                // scale by kappa unless the eq1 predicate holds (presumably kappa == 1 -- confirm)
+                if (!PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) {
+                    switch (cdim) {
+                        case 7:
+                            arow6 = VFMUL_VF(PREC, LMUL)(arow6, *kappa, vl);
+                        case 6:
+                            arow5 = VFMUL_VF(PREC, LMUL)(arow5, *kappa, vl);
+                        case 5:
+                            arow4 = VFMUL_VF(PREC, LMUL)(arow4, *kappa, vl);
+                        case 4:
+                            arow3 = VFMUL_VF(PREC, LMUL)(arow3, *kappa, vl);
+                        case 3:
+                            arow2 = VFMUL_VF(PREC, LMUL)(arow2, *kappa, vl);
+                        case 2:
+                            arow1 = VFMUL_VF(PREC, LMUL)(arow1, *kappa, vl);
+                        case 1:
+                            arow0 = VFMUL_VF(PREC, LMUL)(arow0, *kappa, vl);
+                    }
+                }
+                // gather the rows into one 7-field tuple; presumably a strided segment store, so column j lands at p + j * ldp
+                RVV_TYPE_FX(PREC, LMUL, 7) ablock = VUNDEFINED_FX(PREC, LMUL, 7)();
+                ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 0, arow0); 
+                ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 1, arow1); 
+                ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 2, arow2); 
+                ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 3, arow3); 
+                ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 4, arow4); 
+                ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 5, arow5); 
+                ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 6, arow6); 
+                VSSSEG7_V_F(PREC, LMUL, 7)(p, FLT_SIZE * ldp, ablock, vl);
+
+                a += vl;
+                p += vl * ldp;
+                avl -= vl;
+            }
+            // pad right edge: zero columns n..n_max-1
+            RVV_TYPE_F(PREC, LMUL_MR) zero_padding = VFMV_V_F(PREC, LMUL_MR)(0., cdim_max);
+            for (size_t i = n; i < n_max; ++i) {
+                VSE_V_F(PREC, LMUL_MR)(p, zero_padding, cdim_max);
+                p += ldp;
+            }
+        }
+        else {
+            RVV_TYPE_F(PREC, LMUL_MR) zero_padding = VFMV_V_F(PREC, LMUL_MR)(0., cdim_max);
+            for (size_t i = 0; i < n; ++i) {
+                RVV_TYPE_F(PREC, LMUL_MR) acol_vec;
+                if (inca == 1)
+                    acol_vec = VLE_V_F_TU(PREC, LMUL_MR)(zero_padding, a, cdim);
+                else
+                    acol_vec = VLSE_V_F_TU(PREC, LMUL_MR)(zero_padding, a, FLT_SIZE * inca, cdim);
+                // rows cdim..cdim_max-1 are presumably still zero_padding's zeros (_TU = tail undisturbed -- confirm)
+                if (!PASTEMAC(PRECISION_CHAR, eq1)(*kappa))
+                    acol_vec = VFMUL_VF_TU(PREC, LMUL_MR)(acol_vec, acol_vec, *kappa, cdim);
+
+                VSE_V_F(PREC, LMUL_MR)(p, acol_vec, cdim_max);
+                 
+                a += lda;
+                p += ldp;
+            }
+            // pad right edge: zero columns n..n_max-1
+            for (size_t i = n; i < n_max; ++i) {
+                VSE_V_F(PREC, LMUL_MR)(p, zero_padding, cdim_max);
+                p += ldp;
+            }
+        }
+    }
+    // NRxk kernel (cdim_max == NR, cdim_bcast == 1)
+    else if (cdim <= NR && cdim_max == NR && cdim_bcast == 1)
+    {
+        if (lda == 1) {
+            // a is "row major": lda == 1, so each row is contiguous and consecutive rows are inca apart
+            RVV_TYPE_F(PREC, LMUL_NR) zero_padding = VFMV_V_F(PREC, LMUL_NR)(0., cdim_max);
+            size_t avl = n;
+            while (avl) {
+                size_t vl = VSETVL(PREC, LMUL)(avl);
+                dim_t cdim_tmp = cdim;
+                const DATATYPE* restrict a_tmp = a;
+                DATATYPE* restrict p_tmp = p;
+                while (cdim_tmp >= 8) {
+                    RVV_TYPE_F(PREC, LMUL) arow0, arow1, arow2, arow3, arow4, arow5, arow6, arow7;
+                    arow0 = VLE_V_F(PREC, LMUL)(a_tmp + 0 * inca, vl);
+                    arow1 = VLE_V_F(PREC, LMUL)(a_tmp + 1 * inca, vl);
+                    arow2 = VLE_V_F(PREC, LMUL)(a_tmp + 2 * inca, vl);
+                    arow3 = VLE_V_F(PREC, LMUL)(a_tmp + 3 * inca, vl);
+                    arow4 = VLE_V_F(PREC, LMUL)(a_tmp + 4 * inca, vl);
+                    arow5 = VLE_V_F(PREC, LMUL)(a_tmp + 5 * inca, vl);
+                    arow6 = VLE_V_F(PREC, LMUL)(a_tmp + 6 * inca, vl);
+                    arow7 = VLE_V_F(PREC, LMUL)(a_tmp + 7 * inca, vl);
+                    
+                    if (!PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) {
+                        arow0 = VFMUL_VF(PREC, LMUL)(arow0, *kappa, vl);
+                        arow1 = VFMUL_VF(PREC, LMUL)(arow1, *kappa, vl);
+                        arow2 = VFMUL_VF(PREC, LMUL)(arow2, *kappa, vl);
+                        arow3 = VFMUL_VF(PREC, LMUL)(arow3, *kappa, vl);
+                        arow4 = VFMUL_VF(PREC, LMUL)(arow4, *kappa, vl);
+                        arow5 = VFMUL_VF(PREC, LMUL)(arow5, *kappa, vl);
+                        arow6 = VFMUL_VF(PREC, LMUL)(arow6, *kappa, vl);
+                        arow7 = VFMUL_VF(PREC, LMUL)(arow7, *kappa, vl);
+                    }
+                    
+                    RVV_TYPE_FX(PREC, LMUL, 8) ablock = VUNDEFINED_FX(PREC, LMUL, 8)();
+                    ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 0, arow0); 
+                    ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 1, arow1); 
+                    ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 2, arow2); 
+                    ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 3, arow3); 
+                    ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 4, arow4); 
+                    ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 5, arow5); 
+                    ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 6, arow6); 
+                    ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 7, arow7); 
+                    VSSSEG8_V_F(PREC, LMUL, 8)(p_tmp, FLT_SIZE * ldp, ablock, vl);
+                    
+                    a_tmp += 8 * inca;
+                    p_tmp += 8;
+                    cdim_tmp -= 8;
+                }
+                // remaining cdim_tmp rows (1..7): the load/scale switches below fall through on purpose
+                if (cdim_tmp > 0) {
+                    RVV_TYPE_F(PREC, LMUL) arow0, arow1, arow2, arow3, arow4, arow5, arow6;
+                    switch (cdim_tmp) {
+                    case 7:
+                        arow6 = VLE_V_F(PREC, LMUL)(a_tmp + 6 * inca, vl);
+                    case 6:
+                        arow5 = VLE_V_F(PREC, LMUL)(a_tmp + 5 * inca, vl);
+                    case 5:
+                        arow4 = VLE_V_F(PREC, LMUL)(a_tmp + 4 * inca, vl);
+                    case 4:
+                        arow3 = VLE_V_F(PREC, LMUL)(a_tmp + 3 * inca, vl);
+                    case 3:
+                        arow2 = VLE_V_F(PREC, LMUL)(a_tmp + 2 * inca, vl);
+                    case 2:
+                        arow1 = VLE_V_F(PREC, LMUL)(a_tmp + 1 * inca, vl);
+                    case 1:
+                        arow0 = VLE_V_F(PREC, LMUL)(a_tmp + 0 * inca, vl);
+                    }
+
+                    if (!PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) {
+                        switch (cdim_tmp) {
+                        case 7:
+                            arow6 = VFMUL_VF(PREC, LMUL)(arow6, *kappa, vl);
+                        case 6:
+                            arow5 = VFMUL_VF(PREC, LMUL)(arow5, *kappa, vl);
+                        case 5:
+                            arow4 = VFMUL_VF(PREC, LMUL)(arow4, *kappa, vl);
+                        case 4:
+                            arow3 = VFMUL_VF(PREC, LMUL)(arow3, *kappa, vl);
+                        case 3:
+                            arow2 = VFMUL_VF(PREC, LMUL)(arow2, *kappa, vl);
+                        case 2:
+                            arow1 = VFMUL_VF(PREC, LMUL)(arow1, *kappa, vl);
+                        case 1:
+                            arow0 = VFMUL_VF(PREC, LMUL)(arow0, *kappa, vl);
+                        }
+                    }
+                    // one tuple per possible row count; only the one selected by cdim_tmp is filled and stored
+                    RVV_TYPE_FX(PREC, LMUL, 7) ablock7 = VUNDEFINED_FX(PREC, LMUL, 7)();
+                    RVV_TYPE_FX(PREC, LMUL, 6) ablock6 = VUNDEFINED_FX(PREC, LMUL, 6)();
+                    RVV_TYPE_FX(PREC, LMUL, 5) ablock5 = VUNDEFINED_FX(PREC, LMUL, 5)();
+                    RVV_TYPE_FX(PREC, LMUL, 4) ablock4 = VUNDEFINED_FX(PREC, LMUL, 4)();
+                    RVV_TYPE_FX(PREC, LMUL, 3) ablock3 = VUNDEFINED_FX(PREC, LMUL, 3)();
+                    RVV_TYPE_FX(PREC, LMUL, 2) ablock2 = VUNDEFINED_FX(PREC, LMUL, 2)();
+                    switch (cdim_tmp) {
+                    case 7:
+                        ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 0, arow0); 
+                        ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 1, arow1); 
+                        ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 2, arow2); 
+                        ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 3, arow3); 
+                        ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 4, arow4); 
+                        ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 5, arow5); 
+                        ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 6, arow6); 
+                        VSSSEG7_V_F(PREC, LMUL, 7)(p_tmp, FLT_SIZE * ldp, ablock7, vl);
+                        break;
+                    case 6:
+                        ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 0, arow0); 
+                        ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 1, arow1); 
+                        ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 2, arow2); 
+                        ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 3, arow3); 
+                        ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 4, arow4); 
+                        ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 5, arow5); 
+                        VSSSEG6_V_F(PREC, LMUL, 6)(p_tmp, FLT_SIZE * ldp, ablock6, vl);
+                        break;
+                    case 5:
+                        ablock5 = VSET_V_F(PREC, LMUL, 5)(ablock5, 0, arow0); 
+                        ablock5 = VSET_V_F(PREC, LMUL, 5)(ablock5, 1, arow1); 
+                        ablock5 = VSET_V_F(PREC, LMUL, 5)(ablock5, 2, arow2); 
+                        ablock5 = VSET_V_F(PREC, LMUL, 5)(ablock5, 3, arow3); 
+                        ablock5 = VSET_V_F(PREC, LMUL, 5)(ablock5, 4, arow4); 
+                        VSSSEG5_V_F(PREC, LMUL, 5)(p_tmp, FLT_SIZE * ldp, ablock5, vl);
+                        break;
+                    case 4:
+                        ablock4 = VSET_V_F(PREC, LMUL, 4)(ablock4, 0, arow0); 
+                        ablock4 = VSET_V_F(PREC, LMUL, 4)(ablock4, 1, arow1); 
+                        ablock4 = VSET_V_F(PREC, LMUL, 4)(ablock4, 2, arow2); 
+                        ablock4 = VSET_V_F(PREC, LMUL, 4)(ablock4, 3, arow3); 
+                        VSSSEG4_V_F(PREC, LMUL, 4)(p_tmp, FLT_SIZE * ldp, ablock4, vl);
+                        break;
+                    case 3:
+                        ablock3 = VSET_V_F(PREC, LMUL, 3)(ablock3, 0, arow0); 
+                        ablock3 = VSET_V_F(PREC, LMUL, 3)(ablock3, 1, arow1); 
+                        ablock3 = VSET_V_F(PREC, LMUL, 3)(ablock3, 2, arow2); 
+                        VSSSEG3_V_F(PREC, LMUL, 3)(p_tmp, FLT_SIZE * ldp, ablock3, vl);
+                        break;
+                    case 2:
+                        ablock2 = VSET_V_F(PREC, LMUL, 2)(ablock2, 0, arow0); 
+                        ablock2 = VSET_V_F(PREC, LMUL, 2)(ablock2, 1, arow1); 
+                        VSSSEG2_V_F(PREC, LMUL, 2)(p_tmp, FLT_SIZE * ldp, ablock2, vl);
+                        break;
+                    case 1:
+                        VSSE_V_F(PREC, LMUL)(p_tmp, FLT_SIZE * ldp, arow0, vl);
+                        break;
+                    }
+                    p_tmp += cdim_tmp;
+                }
+                // pad lower edge: p_tmp is now p + cdim; zero rows cdim..cdim_max-1 of these vl columns
+                for (size_t i = 0; i < vl; ++i) {
+                    VSE_V_F(PREC, LMUL_NR)(p_tmp, zero_padding, cdim_max - cdim);
+                    p_tmp += ldp;
+                }
+
+                a += vl;
+                p += vl * ldp;
+                avl -= vl;
+            }
+            // pad right edge: zero columns n..n_max-1
+            for (size_t i = n; i < n_max; ++i) {
+                VSE_V_F(PREC, LMUL_NR)(p, zero_padding, cdim_max);
+                p += ldp;
+            }
+        } else {
+            RVV_TYPE_F(PREC, LMUL_NR) zero_padding = VFMV_V_F(PREC, LMUL_NR)(0., cdim_max);
+            for (size_t i = 0; i < n; ++i) {
+                RVV_TYPE_F(PREC, LMUL_NR) acol_vec;
+                if (inca == 1)
+                    acol_vec = VLE_V_F_TU(PREC, LMUL_NR)(zero_padding, a, cdim);
+                else
+                    acol_vec = VLSE_V_F_TU(PREC, LMUL_NR)(zero_padding, a, FLT_SIZE * inca, cdim);
+                // rows cdim..cdim_max-1 are presumably still zero_padding's zeros (_TU = tail undisturbed -- confirm)
+                if (!PASTEMAC(PRECISION_CHAR, eq1)(*kappa))
+                    acol_vec = VFMUL_VF_TU(PREC, LMUL_NR)(acol_vec, acol_vec, *kappa, cdim);
+
+                VSE_V_F(PREC, LMUL_NR)(p, acol_vec, cdim_max);
+                 
+                a += lda;
+                p += ldp;
+            }
+            // pad right edge: zero columns n..n_max-1
+            for (size_t i = n; i < n_max; ++i) {
+                VSE_V_F(PREC, LMUL_NR)(p, zero_padding, cdim_max);
+                p += ldp;
+            }
+        }
+    }
+    // generic kernel: every other shape is forwarded to REF_KERNEL
+    else
+    {
+        REF_KERNEL(PRECISION_CHAR)
+        (
+          conja,
+          schema,
+          cdim,
+          cdim_max,
+          cdim_bcast,
+          n,
+          n_max,
+          kappa,
+          a, inca, lda,
+          p,       ldp,
+          params,
+          cntx
+        );
+    }
+
+    return;
+}
+
+#endif // PACKM
diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c
deleted file mode 100644
index f4a5a26ca..000000000
--- a/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c
+++ /dev/null
@@ -1,2406 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-// clang-format off
-#include "blis.h"
-#include "../riscv_cmul_macros_asm.h"
-#include "../bli_kernels_sifive_x280.h"
-#include <math.h>
-#include <stddef.h>
-#include <stdbool.h>
-#include <riscv_vector.h>
-
-// byte-size of the floating point type
-#define FLT_SIZE 4
-#define FLT_LOAD "flw "
-#define VLE "vle32.v "
-#define VLSE "vlse32.v "
-#define VSE "vse32.v "
-#define VSSE "vsse32.v "
-#define PACKMR 8
-#define PACKNR 64
-
-void bli_sgemm_7m4
-     (
-             dim_t           N,
-             dim_t           K,
-       const float* restrict alpha,
-       const float* restrict a,
-       const float* restrict b,
-       const float* restrict beta,
-             float* restrict c, inc_t rsc, inc_t csc
-     )
-{
-    // 7 x N x K sgemm, 0 < N <= 64 = vlmax, K > 0
-    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
-    bool first = true;
-    // compute a*b
-    for (dim_t k = 0; k < K; ++k) {
-        __asm__(VLE "v28, (%0)" : : "r"(b));
-        if (first) {
-            __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-            __asm__("vfmul.vf v0, v28, ft0");
-
-            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-            __asm__("vfmul.vf v4, v28, ft1");
-
-            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-            __asm__("vfmul.vf v8, v28, ft2");
-
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-            __asm__("vfmul.vf v12, v28, ft3");
-
-            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-            __asm__("vfmul.vf v16, v28, ft4");
-
-            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-            __asm__("vfmul.vf v20, v28, ft5");
-
-            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
-            __asm__("vfmul.vf v24, v28, ft6");
-
-            first = false;
-        }
-        else {
-            __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-            __asm__("vfmacc.vf v0, ft0, v28");
-
-            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-            __asm__("vfmacc.vf v4, ft1, v28");
-
-            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-            __asm__("vfmacc.vf v8, ft2, v28");
-
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-            __asm__("vfmacc.vf v12, ft3, v28");
-
-            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-            __asm__("vfmacc.vf v16, ft4, v28");
-
-            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-            __asm__("vfmacc.vf v20, ft5, v28");
-
-            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
-            __asm__("vfmacc.vf v24, ft6, v28");
-        }
-
-        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE));
-        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE));
-    }
-
-    rsc *= FLT_SIZE;
-    csc *= FLT_SIZE;
-
-    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
-
-    // compute alpha*a*b + beta*c
-    if (*beta == 0.f) {
-        __asm__("vfmul.vf v0, v0, ft10");
-        __asm__("vfmul.vf v4, v4, ft10");
-        __asm__("vfmul.vf v8, v8, ft10");
-        __asm__("vfmul.vf v12, v12, ft10");
-        __asm__("vfmul.vf v16, v16, ft10");
-        __asm__("vfmul.vf v20, v20, ft10");
-        __asm__("vfmul.vf v24, v24, ft10");
-    }
-    else { // beta != 0.f
-        __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
-        float *c_tmp = c;
-        if (csc == FLT_SIZE) { // c unit column stride
-            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-            __asm__("vfmul.vf v0, v0, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v0, ft11, v28");
-
-            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-            __asm__("vfmul.vf v4, v4, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v4, ft11, v28");
-
-            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-            __asm__("vfmul.vf v8, v8, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v8, ft11, v28");
-
-            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-            __asm__("vfmul.vf v12, v12, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v12, ft11, v28");
-
-            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-            __asm__("vfmul.vf v16, v16, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v16, ft11, v28");
-
-            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-            __asm__("vfmul.vf v20, v20, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v20, ft11, v28");
-
-            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-            __asm__("vfmul.vf v24, v24, ft10");
-            __asm__("vfmacc.vf v24, ft11, v28");
-        } // end c unit column stride
-        else { // c non-unit column stride
-            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("vfmul.vf v0, v0, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v0, ft11, v28");
-
-            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("vfmul.vf v4, v4, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v4, ft11, v28");
-
-            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("vfmul.vf v8, v8, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v8, ft11, v28");
-
-            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("vfmul.vf v12, v12, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v12, ft11, v28");
-
-            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("vfmul.vf v16, v16, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v16, ft11, v28");
-
-            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("vfmul.vf v20, v20, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v20, ft11, v28");
-
-            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("vfmul.vf v24, v24, ft10");
-            __asm__("vfmacc.vf v24, ft11, v28");
-        } // end c non-unit column stride
-    } // end beta != 0.f
-
-    // store c
-    if (csc == FLT_SIZE) {
-        __asm__(VSE "v0, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSE "v4, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSE "v8, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSE "v12, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSE "v16, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSE "v20, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSE "v24, (%0)" : : "r"(c));
-    }
-    else {
-        __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
-    }
-
-    return;
-}
-
-void bli_sgemm_7m4_cleanup
-     (
-             dim_t           M, // number of rows of c to update; selects the entry case of every switch below
-             dim_t           N, // number of columns; passed to vsetvli as the requested vector length
-             dim_t           K, // trip count of the k loop (number of rank-1 updates)
-       const float* restrict alpha, // scalar applied to a*b
-       const float* restrict a, // advanced by PACKMR elements per k; element i is the scalar for row i
-       const float* restrict b, // advanced by PACKNR elements per k; one vector is loaded per k
-       const float* restrict beta, // scalar applied to the existing c; c is not read when *beta == 0
-             float* restrict c, inc_t rsc, inc_t csc // output; rsc/csc are the row/column strides in elements
-     )
-{
-    // M x N x K sgemm, 0 < M <= 6 (the switches below handle cases 1..6), 0 < N <= 64 = vlmax, K > 0
-    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); // vl from N, SEW = 8*FLT_SIZE bits, LMUL = 4
-    bool first = true; // the first k iteration initializes the accumulators with vfmul instead of vfmacc
-    // compute a*b: row i accumulates in v(4*i) (v0, v4, ..., v20); v28 holds the current vector of b
-    for (dim_t k = 0; k < K; ++k) {
-        __asm__(VLE "v28, (%0)" : : "r"(b));
-        if (first) {
-            switch (M) { // no breaks: entry at case M falls through, covering rows M-1 down to 0
-            case 6:
-                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-                __asm__("vfmul.vf v20, v28, ft5");
-            case 5:
-                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-                __asm__("vfmul.vf v16, v28, ft4");
-            case 4:
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-                __asm__("vfmul.vf v12, v28, ft3");
-            case 3:
-                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-                __asm__("vfmul.vf v8, v28, ft2");
-            case 2:
-                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-                __asm__("vfmul.vf v4, v28, ft1");
-            case 1:
-                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-                __asm__("vfmul.vf v0, v28, ft0");
-            }
-            first = false;
-        }
-        else {
-            switch (M) { // same fallthrough pattern, now accumulating into the existing sums
-            case 6:
-                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-                __asm__("vfmacc.vf v20, ft5, v28");
-            case 5:
-                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-                __asm__("vfmacc.vf v16, ft4, v28");
-            case 4:
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-                __asm__("vfmacc.vf v12, ft3, v28");
-            case 3:
-                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-                __asm__("vfmacc.vf v8, ft2, v28");
-            case 2:
-                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-                __asm__("vfmacc.vf v4, ft1, v28");
-            case 1:
-                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-                __asm__("vfmacc.vf v0, ft0, v28");
-            }
-        }
-        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); // step a by PACKMR elements
-        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); // step b by PACKNR elements
-    }
-
-    c += (M - 1) * rsc; // start at row M-1 because the switches below walk the rows backwards
-    rsc *= FLT_SIZE; // from here on rsc and csc are byte strides
-    csc *= FLT_SIZE;
-
-    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); // ft10 = alpha. NOTE(review): no clobber lists are declared for the ft*/v* registers named in these asm statements — confirm the compiler cannot reuse them in between
-
-    // compute alpha*a*b + beta*c
-    if (*beta == 0.f) { // c is never loaded on this path; the accumulators are only scaled by alpha
-        switch (M) {
-        case 6:
-            __asm__("vfmul.vf v20, v20, ft10");
-        case 5:
-            __asm__("vfmul.vf v16, v16, ft10");
-        case 4:
-            __asm__("vfmul.vf v12, v12, ft10");
-        case 3:
-            __asm__("vfmul.vf v8, v8, ft10");
-        case 2:
-            __asm__("vfmul.vf v4, v4, ft10");
-        case 1:
-            __asm__("vfmul.vf v0, v0, ft10");
-        }
-    }
-    else { // beta != 0.f
-        __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); // ft11 = beta
-        float *c_tmp = c; // separate cursor so that c still points at row M-1 for the store pass
-        if (csc == FLT_SIZE) {
-            switch (M) { // per row: v28 = row of c, acc = alpha*acc + beta*v28, then step up one row
-            case 6:
-                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-                __asm__("vfmul.vf v20, v20, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v20, ft11, v28");
-            case 5:
-                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-                __asm__("vfmul.vf v16, v16, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v16, ft11, v28");
-            case 4:
-                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-                __asm__("vfmul.vf v12, v12, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v12, ft11, v28");
-            case 3:
-                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-                __asm__("vfmul.vf v8, v8, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v8, ft11, v28");
-            case 2:
-                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-                __asm__("vfmul.vf v4, v4, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v4, ft11, v28");
-            case 1:
-                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-                __asm__("vfmul.vf v0, v0, ft10");
-                __asm__("vfmacc.vf v0, ft11, v28");
-            }
-        } // end c unit column stride
-        else { // c non-unit column stride
-            switch (M) { // same as above, but each row of c is gathered with byte stride csc
-            case 6:
-                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                __asm__("vfmul.vf v20, v20, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v20, ft11, v28");
-            case 5:
-                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                __asm__("vfmul.vf v16, v16, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v16, ft11, v28");
-            case 4:
-                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                __asm__("vfmul.vf v12, v12, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v12, ft11, v28");
-            case 3:
-                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                __asm__("vfmul.vf v8, v8, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v8, ft11, v28");
-            case 2:
-                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                __asm__("vfmul.vf v4, v4, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v4, ft11, v28");
-            case 1:
-                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                __asm__("vfmul.vf v0, v0, ft10");
-                __asm__("vfmacc.vf v0, ft11, v28");
-            }
-        } // end c non-unit column stride
-    } // end beta != 0.f
-
-    // store c: rows are written from row M-1 back to row 0, stepping c down by rsc bytes
-    if (csc == FLT_SIZE) {
-        switch (M) {
-        case 6:
-            __asm__(VSE "v20, (%0)" : : "r"(c));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 5:
-            __asm__(VSE "v16, (%0)" : : "r"(c));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 4:
-            __asm__(VSE "v12, (%0)" : : "r"(c));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 3:
-            __asm__(VSE "v8, (%0)" : : "r"(c));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 2:
-            __asm__(VSE "v4, (%0)" : : "r"(c));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 1:
-            __asm__(VSE "v0, (%0)" : : "r"(c));
-        }
-    }
-    else {
-        switch (M) { // non-unit column stride: scatter each row with byte stride csc
-        case 6:
-            __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 5:
-            __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 4:
-            __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 3:
-            __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 2:
-            __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 1:
-            __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-        }
-    }
-    return;
-}
-
-void bli_sgemm_7m4_k0
-     (
-             dim_t           M, // number of rows of c to update; selects the entry case of every switch below
-             dim_t           N, // number of columns; passed to vsetvli as the requested vector length
-       const float* restrict beta, // scalar applied to c; c is overwritten with zeros when *beta == 0
-             float* restrict c, inc_t rsc, inc_t csc // updated in place; rsc/csc are row/column strides in elements
-     )
-{
-    // c := beta * c, 0 < M <= 7, 0 < N < 64 = vlmax, K = 0 (NOTE(review): the sibling kernels state N <= 64 — confirm the bound)
-    // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
-    __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); // vl from N, SEW = 8*FLT_SIZE bits, LMUL = 4
-    c += (M - 1) * rsc; // start at row M-1 because the switches below walk the rows backwards
-    rsc *= FLT_SIZE; // from here on rsc and csc are byte strides
-    csc *= FLT_SIZE;
-    if (*beta == 0.f) {
-        // set c to 0 without reading it: v0 is filled with zeros once and stored to every row
-        __asm__("vmv.v.i v0, 0");
-        if (csc == FLT_SIZE) { // c unit column stride
-            switch (M) { // no breaks: entry at case M falls through, covering rows M-1 down to 0
-            case 7:
-                __asm__(VSE "v0, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 6:
-                __asm__(VSE "v0, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 5:
-                __asm__(VSE "v0, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 4:
-                __asm__(VSE "v0, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 3:
-                __asm__(VSE "v0, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 2:
-                __asm__(VSE "v0, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 1:
-                __asm__(VSE "v0, (%0)" : : "r"(c));
-            }
-        } // end c unit column stride
-        else { // c non-unit column stride
-            switch (M) { // same walk, scattering the zeros with byte stride csc
-            case 7:
-                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 6:
-                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 5:
-                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 4:
-                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 3:
-                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 2:
-                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 1:
-                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-            }
-        } // end c non-unit column stride
-    } // end beta == 0.f
-    else { // beta != 0.f
-        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(beta)); // ft0 = beta
-        if (csc == FLT_SIZE) { // c unit column stride
-            switch (M) { // per row: load, scale by beta, store back in place (a distinct vector register per row)
-            case 7:
-                __asm__(VLE "v24, (%0)" : : "r"(c));
-                __asm__("vfmul.vf v24, v24, ft0");
-                __asm__(VSE "v24, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 6:
-                __asm__(VLE "v20, (%0)" : : "r"(c));
-                __asm__("vfmul.vf v20, v20, ft0");
-                __asm__(VSE "v20, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 5:
-                __asm__(VLE "v16, (%0)" : : "r"(c));
-                __asm__("vfmul.vf v16, v16, ft0");
-                __asm__(VSE "v16, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 4:
-                __asm__(VLE "v12, (%0)" : : "r"(c));
-                __asm__("vfmul.vf v12, v12, ft0");
-                __asm__(VSE "v12, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 3:
-                __asm__(VLE "v8, (%0)" : : "r"(c));
-                __asm__("vfmul.vf v8, v8, ft0");
-                __asm__(VSE "v8, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 2:
-                __asm__(VLE "v4, (%0)" : : "r"(c));
-                __asm__("vfmul.vf v4, v4, ft0");
-                __asm__(VSE "v4, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 1:
-                __asm__(VLE "v0, (%0)" : : "r"(c));
-                __asm__("vfmul.vf v0, v0, ft0");
-                __asm__(VSE "v0, (%0)" : : "r"(c));
-
-            }
-        } // end c unit column stride
-        else { // c non-unit column stride
-            switch (M) { // same as above, using strided loads and stores with byte stride csc
-            case 7:
-                __asm__(VLSE "v24, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("vfmul.vf v24, v24, ft0");
-                __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 6:
-                __asm__(VLSE "v20, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("vfmul.vf v20, v20, ft0");
-                __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 5:
-                __asm__(VLSE "v16, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("vfmul.vf v16, v16, ft0");
-                __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 4:
-                __asm__(VLSE "v12, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("vfmul.vf v12, v12, ft0");
-                __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 3:
-                __asm__(VLSE "v8, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("vfmul.vf v8, v8, ft0");
-                __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 2:
-                __asm__(VLSE "v4, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("vfmul.vf v4, v4, ft0");
-                __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 1:
-                __asm__(VLSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("vfmul.vf v0, v0, ft0");
-                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-            }
-        } // end c non-unit column stride
-    } // end beta != 0.f
-    return;
-}
-
-void bli_sgemm_sifive_x280_asm_7m4
-     (
-             dim_t               M, // rows of the c tile; picks the helper kernel below
-             dim_t               N, // columns of the c tile
-             dim_t               K, // inner dimension; K == 0 takes the beta-only path
-       const void*      restrict alpha_, // untyped pointers, reinterpreted as float below
-       const void*      restrict a_,
-       const void*      restrict b_,
-       const void*      restrict beta_,
-             void*      restrict c_, inc_t rsc, inc_t csc // rsc/csc are forwarded unchanged to the helper kernels
-       const auxinfo_t* restrict data, // unused
-       const cntx_t*    restrict cntx // unused
-     )
-{
-    (void) data; // explicitly discarded
-    (void) cntx; // explicitly discarded
-    const float* restrict alpha = alpha_; // typed views of the untyped arguments
-    const float* restrict beta = beta_;
-    const float* restrict a = a_;
-    const float* restrict b = b_;
-    float* restrict c = c_;
-
-    // M x N x K sgemm: dispatch on the problem shape
-    if (M <= 0 || N <= 0 || K < 0) // empty or invalid shape: c is left untouched
-        return;
-    else if (K == 0) // no a*b term: only beta and c are passed on
-        bli_sgemm_7m4_k0(M, N, beta, c, rsc, csc);
-    else if (M == 7) // full-height tile
-        bli_sgemm_7m4(N, K, alpha, a, b, beta, c, rsc, csc);
-    else // NOTE(review): M > 7 also lands here, but the cleanup kernel's switches only have cases 1..6 — presumably M <= 7 always; confirm
-        bli_sgemm_7m4_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc);
-    return;
-}
-
-#undef FLT_SIZE // drop the previous precision's definitions before redefining them below
-#undef FLT_LOAD
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-#undef PACKMR
-#undef PACKNR
-
-// byte-size of the floating point type (8: the double-precision kernels follow)
-#define FLT_SIZE 8
-#define FLT_LOAD "fld " // scalar floating-point load mnemonic, pasted in front of the asm operand strings
-#define VLE "vle64.v " // unit-stride vector load, 64-bit elements
-#define VLSE "vlse64.v " // strided vector load, 64-bit elements
-#define VSE "vse64.v " // unit-stride vector store, 64-bit elements
-#define VSSE "vsse64.v " // strided vector store, 64-bit elements
-#define PACKMR 8 // elements by which the kernels advance a per k iteration
-#define PACKNR 32 // elements by which the kernels advance b per k iteration
-
-void bli_dgemm_7m4
-     (
-             dim_t            N, // number of columns; passed to vsetvli as the requested vector length
-             dim_t            K, // trip count of the k loop (number of rank-1 updates)
-       const double* restrict alpha, // scalar applied to a*b
-       const double* restrict a, // advanced by PACKMR elements per k; elements 0..6 are the scalars for rows 0..6
-       const double* restrict b, // advanced by PACKNR elements per k; one vector is loaded per k
-       const double* restrict beta, // scalar applied to the existing c; c is not read when *beta == 0
-             double* restrict c, inc_t rsc, inc_t csc // output; rsc/csc are the row/column strides in elements
-     )
-{
-    // 7 x N x K dgemm, 0 < N <= 64 = vlmax, K > 0 (NOTE(review): PACKNR is 32 for this precision, so "64 = vlmax" may be carried over from sgemm — confirm)
-    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); // vl from N, SEW = 64 bits, LMUL = 4
-    bool first = true; // the first k iteration initializes the accumulators with vfmul instead of vfmacc
-    // compute a*b: row i accumulates in v(4*i) (v0, v4, ..., v24); v28 holds the current vector of b
-    for (dim_t k = 0; k < K; ++k) {
-        __asm__(VLE "v28, (%0)" : : "r"(b));
-        if (first) {
-            __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-            __asm__("vfmul.vf v0, v28, ft0");
-
-            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-            __asm__("vfmul.vf v4, v28, ft1");
-
-            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-            __asm__("vfmul.vf v8, v28, ft2");
-
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-            __asm__("vfmul.vf v12, v28, ft3");
-
-            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-            __asm__("vfmul.vf v16, v28, ft4");
-
-            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-            __asm__("vfmul.vf v20, v28, ft5");
-
-            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
-            __asm__("vfmul.vf v24, v28, ft6");
-
-            first = false;
-        }
-        else {
-            __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-            __asm__("vfmacc.vf v0, ft0, v28");
-
-            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-            __asm__("vfmacc.vf v4, ft1, v28");
-
-            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-            __asm__("vfmacc.vf v8, ft2, v28");
-
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-            __asm__("vfmacc.vf v12, ft3, v28");
-
-            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-            __asm__("vfmacc.vf v16, ft4, v28");
-
-            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-            __asm__("vfmacc.vf v20, ft5, v28");
-
-            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
-            __asm__("vfmacc.vf v24, ft6, v28");
-        }
-
-        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); // step a by PACKMR elements
-        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); // step b by PACKNR elements
-    }
-
-    rsc *= FLT_SIZE; // from here on rsc and csc are byte strides
-    csc *= FLT_SIZE;
-
-    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); // ft10 = alpha. NOTE(review): no clobber lists are declared for the ft*/v* registers named in these asm statements — confirm the compiler cannot reuse them in between
-
-    // compute alpha*a*b + beta*c
-    if (*beta == 0.) { // c is never loaded on this path; the accumulators are only scaled by alpha
-        __asm__("vfmul.vf v0, v0, ft10");
-        __asm__("vfmul.vf v4, v4, ft10");
-        __asm__("vfmul.vf v8, v8, ft10");
-        __asm__("vfmul.vf v12, v12, ft10");
-        __asm__("vfmul.vf v16, v16, ft10");
-        __asm__("vfmul.vf v20, v20, ft10");
-        __asm__("vfmul.vf v24, v24, ft10");
-    }
-    else { // beta != 0.
-        __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); // ft11 = beta
-        double *c_tmp = c; // separate cursor so that c still points at row 0 for the store pass
-        if (csc == FLT_SIZE) { // c unit column stride
-            __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); // per row: v28 = row of c, acc = alpha*acc + beta*v28, then step to the next row
-            __asm__("vfmul.vf v0, v0, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v0, ft11, v28");
-
-            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-            __asm__("vfmul.vf v4, v4, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v4, ft11, v28");
-
-            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-            __asm__("vfmul.vf v8, v8, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v8, ft11, v28");
-
-            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-            __asm__("vfmul.vf v12, v12, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v12, ft11, v28");
-
-            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-            __asm__("vfmul.vf v16, v16, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v16, ft11, v28");
-
-            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-            __asm__("vfmul.vf v20, v20, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v20, ft11, v28");
-
-            __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-            __asm__("vfmul.vf v24, v24, ft10");
-            __asm__("vfmacc.vf v24, ft11, v28");
-        } // end c unit column stride
-        else { // c non-unit column stride
-            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); // same per-row update, gathering c with byte stride csc
-            __asm__("vfmul.vf v0, v0, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v0, ft11, v28");
-
-            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("vfmul.vf v4, v4, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v4, ft11, v28");
-
-            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("vfmul.vf v8, v8, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v8, ft11, v28");
-
-            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("vfmul.vf v12, v12, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v12, ft11, v28");
-
-            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("vfmul.vf v16, v16, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v16, ft11, v28");
-
-            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("vfmul.vf v20, v20, ft10");
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__("vfmacc.vf v20, ft11, v28");
-
-            __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("vfmul.vf v24, v24, ft10");
-            __asm__("vfmacc.vf v24, ft11, v28");
-        } // end c non-unit column stride
-    } // end beta != 0.
-
-    // store c: rows 0..6 are written in order, stepping c forward by rsc bytes
-    if (csc == FLT_SIZE) {
-        __asm__(VSE "v0, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSE "v4, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSE "v8, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSE "v12, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSE "v16, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSE "v20, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSE "v24, (%0)" : : "r"(c));
-    }
-    else {
-        __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); // non-unit column stride: scatter each row with byte stride csc
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
-    }
-
-    return;
-}
-
-void bli_dgemm_7m4_cleanup
-     (
-             dim_t            M, // number of rows of c to update; selects the entry case of every switch below
-             dim_t            N, // number of columns; passed to vsetvli as the requested vector length
-             dim_t            K, // trip count of the k loop (number of rank-1 updates)
-       const double* restrict alpha, // scalar applied to a*b
-       const double* restrict a, // advanced by PACKMR elements per k; element i is the scalar for row i
-       const double* restrict b, // advanced by PACKNR elements per k; one vector is loaded per k
-       const double* restrict beta, // scalar applied to the existing c; c is not read when *beta == 0
-             double* restrict c, inc_t rsc, inc_t csc // output; rsc/csc are the row/column strides in elements
-     )
-{
-    // M x N x K dgemm, 0 < M <= 6 (the switches below handle cases 1..6), 0 < N <= 64 = vlmax, K > 0 (NOTE(review): PACKNR is 32 for this precision, so "64 = vlmax" may be carried over from sgemm — confirm)
-    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); // vl from N, SEW = 64 bits, LMUL = 4
-    bool first = true; // the first k iteration initializes the accumulators with vfmul instead of vfmacc
-    // compute a*b: row i accumulates in v(4*i) (v0, v4, ..., v20); v28 holds the current vector of b
-    for (dim_t k = 0; k < K; ++k) {
-        __asm__(VLE "v28, (%0)" : : "r"(b));
-        if (first) {
-            switch (M) { // no breaks: entry at case M falls through, covering rows M-1 down to 0
-            case 6:
-                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-                __asm__("vfmul.vf v20, v28, ft5");
-            case 5:
-                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-                __asm__("vfmul.vf v16, v28, ft4");
-            case 4:
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-                __asm__("vfmul.vf v12, v28, ft3");
-            case 3:
-                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-                __asm__("vfmul.vf v8, v28, ft2");
-            case 2:
-                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-                __asm__("vfmul.vf v4, v28, ft1");
-            case 1:
-                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-                __asm__("vfmul.vf v0, v28, ft0");
-            }
-            first = false;
-        }
-        else {
-            switch (M) { // same fallthrough pattern, now accumulating into the existing sums
-            case 6:
-                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-                __asm__("vfmacc.vf v20, ft5, v28");
-            case 5:
-                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-                __asm__("vfmacc.vf v16, ft4, v28");
-            case 4:
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-                __asm__("vfmacc.vf v12, ft3, v28");
-            case 3:
-                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-                __asm__("vfmacc.vf v8, ft2, v28");
-            case 2:
-                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-                __asm__("vfmacc.vf v4, ft1, v28");
-            case 1:
-                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-                __asm__("vfmacc.vf v0, ft0, v28");
-            }
-        }
-        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); // step a by PACKMR elements
-        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); // step b by PACKNR elements
-    }
-
-    c += (M - 1) * rsc; // start at row M-1 because the switches below walk the rows backwards
-    rsc *= FLT_SIZE; // from here on rsc and csc are byte strides
-    csc *= FLT_SIZE;
-
-    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); // ft10 = alpha. NOTE(review): no clobber lists are declared for the ft*/v* registers named in these asm statements — confirm the compiler cannot reuse them in between
-
-    // compute alpha*a*b + beta*c
-    if (*beta == 0.) { // c is never loaded on this path; the accumulators are only scaled by alpha
-        switch (M) {
-        case 6:
-            __asm__("vfmul.vf v20, v20, ft10");
-        case 5:
-            __asm__("vfmul.vf v16, v16, ft10");
-        case 4:
-            __asm__("vfmul.vf v12, v12, ft10");
-        case 3:
-            __asm__("vfmul.vf v8, v8, ft10");
-        case 2:
-            __asm__("vfmul.vf v4, v4, ft10");
-        case 1:
-            __asm__("vfmul.vf v0, v0, ft10");
-        }
-    }
-    else { // beta != 0.
-        __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); // ft11 = beta
-        double *c_tmp = c; // separate cursor so that c still points at row M-1 for the store pass
-        if (csc == FLT_SIZE) {
-            switch (M) { // per row: v28 = row of c, acc = alpha*acc + beta*v28, then step up one row
-            case 6:
-                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-                __asm__("vfmul.vf v20, v20, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v20, ft11, v28");
-            case 5:
-                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-                __asm__("vfmul.vf v16, v16, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v16, ft11, v28");
-            case 4:
-                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-                __asm__("vfmul.vf v12, v12, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v12, ft11, v28");
-            case 3:
-                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-                __asm__("vfmul.vf v8, v8, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v8, ft11, v28");
-            case 2:
-                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-                __asm__("vfmul.vf v4, v4, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v4, ft11, v28");
-            case 1:
-                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
-                __asm__("vfmul.vf v0, v0, ft10");
-                __asm__("vfmacc.vf v0, ft11, v28");
-            }
-        } // end c unit column stride
-        else { // c non-unit column stride
-            switch (M) { // same as above, but each row of c is gathered with byte stride csc
-            case 6:
-                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                __asm__("vfmul.vf v20, v20, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v20, ft11, v28");
-            case 5:
-                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                __asm__("vfmul.vf v16, v16, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v16, ft11, v28");
-            case 4:
-                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                __asm__("vfmul.vf v12, v12, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v12, ft11, v28");
-            case 3:
-                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                __asm__("vfmul.vf v8, v8, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v8, ft11, v28");
-            case 2:
-                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                __asm__("vfmul.vf v4, v4, ft10");
-                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                __asm__("vfmacc.vf v4, ft11, v28");
-            case 1:
-                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                __asm__("vfmul.vf v0, v0, ft10");
-                __asm__("vfmacc.vf v0, ft11, v28");
-            }
-        } // end c non-unit column stride
-    } // end beta != 0.
-
-    // store c: rows are written from row M-1 back to row 0, stepping c down by rsc bytes
-    if (csc == FLT_SIZE) {
-        switch (M) {
-        case 6:
-            __asm__(VSE "v20, (%0)" : : "r"(c));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 5:
-            __asm__(VSE "v16, (%0)" : : "r"(c));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 4:
-            __asm__(VSE "v12, (%0)" : : "r"(c));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 3:
-            __asm__(VSE "v8, (%0)" : : "r"(c));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 2:
-            __asm__(VSE "v4, (%0)" : : "r"(c));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 1:
-            __asm__(VSE "v0, (%0)" : : "r"(c));
-        }
-    }
-    else {
-        switch (M) { // non-unit column stride: scatter each row with byte stride csc
-        case 6:
-            __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 5:
-            __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 4:
-            __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 3:
-            __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 2:
-            __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
-            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-        case 1:
-            __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-        }
-    }
-    return;
-}
-
-void bli_dgemm_7m4_k0
-     (
-             dim_t            M,
-             dim_t            N,
-       const double* restrict beta,
-             double* restrict c, inc_t rsc, inc_t csc
-     )
-{
-    // 0 < M <= 7, 0 < N < 64 = vlmax, K = 0
-    // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
-    __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
-    c += (M - 1) * rsc;
-    rsc *= FLT_SIZE;
-    csc *= FLT_SIZE;
-    if (*beta == 0.) {
-        // set c to 0
-        __asm__("vmv.v.i v0, 0");
-        if (csc == FLT_SIZE) { // c unit column stride
-            switch (M) {
-            case 7:
-                __asm__(VSE "v0, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 6:
-                __asm__(VSE "v0, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 5:
-                __asm__(VSE "v0, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 4:
-                __asm__(VSE "v0, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 3:
-                __asm__(VSE "v0, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 2:
-                __asm__(VSE "v0, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 1:
-                __asm__(VSE "v0, (%0)" : : "r"(c));
-            }
-        } // end c unit column stride
-        else { // c non-unit column stride
-            switch (M) {
-            case 7:
-                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 6:
-                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 5:
-                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 4:
-                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 3:
-                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 2:
-                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 1:
-                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-            }
-        } // end c non-unit column stride
-    } // end beta == 0.
-    else { // beta != 0.
-        __asm__(FLT_LOAD "ft0, (%0)" : : "r"(beta));
-        if (csc == FLT_SIZE) { // c unit column stride
-            switch (M) {
-            case 7:
-                __asm__(VLE "v24, (%0)" : : "r"(c));
-                __asm__("vfmul.vf v24, v24, ft0");
-                __asm__(VSE "v24, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 6:
-                __asm__(VLE "v20, (%0)" : : "r"(c));
-                __asm__("vfmul.vf v20, v20, ft0");
-                __asm__(VSE "v20, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 5:
-                __asm__(VLE "v16, (%0)" : : "r"(c));
-                __asm__("vfmul.vf v16, v16, ft0");
-                __asm__(VSE "v16, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 4:
-                __asm__(VLE "v12, (%0)" : : "r"(c));
-                __asm__("vfmul.vf v12, v12, ft0");
-                __asm__(VSE "v12, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 3:
-                __asm__(VLE "v8, (%0)" : : "r"(c));
-                __asm__("vfmul.vf v8, v8, ft0");
-                __asm__(VSE "v8, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 2:
-                __asm__(VLE "v4, (%0)" : : "r"(c));
-                __asm__("vfmul.vf v4, v4, ft0");
-                __asm__(VSE "v4, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 1:
-                __asm__(VLE "v0, (%0)" : : "r"(c));
-                __asm__("vfmul.vf v0, v0, ft0");
-                __asm__(VSE "v0, (%0)" : : "r"(c));
-
-            }
-        } // end c unit column stride
-        else { // c non-unit column stride
-            switch (M) {
-            case 7:
-                __asm__(VLSE "v24, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("vfmul.vf v24, v24, ft0");
-                __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 6:
-                __asm__(VLSE "v20, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("vfmul.vf v20, v20, ft0");
-                __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 5:
-                __asm__(VLSE "v16, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("vfmul.vf v16, v16, ft0");
-                __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 4:
-                __asm__(VLSE "v12, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("vfmul.vf v12, v12, ft0");
-                __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 3:
-                __asm__(VLSE "v8, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("vfmul.vf v8, v8, ft0");
-                __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 2:
-                __asm__(VLSE "v4, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("vfmul.vf v4, v4, ft0");
-                __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 1:
-                __asm__(VLSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("vfmul.vf v0, v0, ft0");
-                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
-            }
-        } // end c non-unit column stride
-    } // end beta != 0.
-    return;
-}
-
-void bli_dgemm_sifive_x280_asm_7m4
-     (
-             dim_t               M,
-             dim_t               N,
-             dim_t               K,
-       const void*      restrict alpha_,
-       const void*      restrict a_,
-       const void*      restrict b_,
-       const void*      restrict beta_,
-             void*      restrict c_, inc_t rsc, inc_t csc,
-       const auxinfo_t* restrict data,
-       const cntx_t*    restrict cntx
-     )
-{
-    (void) data;
-    (void) cntx;
-    const double* restrict alpha = alpha_;
-    const double* restrict beta = beta_;
-    const double* restrict a = a_;
-    const double* restrict b = b_;
-    double* restrict c = c_;
-
-    // M x N x K dgemm
-    if (M <= 0 || N <= 0 || K < 0)
-        return;
-    else if (K == 0)
-        bli_dgemm_7m4_k0(M, N, beta, c, rsc, csc);
-    else if (M == 7)
-        bli_dgemm_7m4(N, K, alpha, a, b, beta, c, rsc, csc);
-    else
-        bli_dgemm_7m4_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc);
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef VLE
-#undef VLSE
-#undef VSE
-#undef VSSE
-#undef PACKMR
-#undef PACKNR
-
-// byte-size of underlying floating point type
-#define FLT_SIZE 4
-#define FLT_LOAD "flw "
-#define VLSEG2 "vlseg2e32.v "
-#define VLSSEG2 "vlsseg2e32.v "
-#define VSSEG2 "vsseg2e32.v "
-#define VSSSEG2 "vssseg2e32.v "
-#define PACKMR 8
-#define PACKNR 32
-
-void bli_cgemm_6m2
-     (
-             dim_t              N,
-             dim_t              K,
-       const scomplex* restrict alpha,
-       const scomplex* restrict a,
-       const scomplex* restrict b,
-       const scomplex* restrict beta,
-             scomplex* restrict c, inc_t rsc, inc_t csc
-     )
-{
-    // 6 x N x K cgemm, N <= 32 = vlmax, K > 0
-    // pairs of register groups hold the real and imag. parts of rows of c and b
-    __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
-    __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
-    __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-    if (K >= 2) {
-        __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
-        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-    }
-
-    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-    vcmul_vf(v0, v2, v24, v26, ft0, ft1);
-
-    __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-    vcmul_vf(v4, v6, v24, v26, ft2, ft3);
-
-    __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-    vcmul_vf(v8, v10, v24, v26, ft4, ft5);
-
-    __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
-    vcmul_vf(v12, v14, v24, v26, ft6, ft7);
-
-    __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
-    vcmul_vf(v16, v18, v24, v26, ft8, ft9);
-
-    __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
-    vcmul_vf(v20, v22, v24, v26, ft10, ft11);
-    K -= 1;
-
-    if (K >= 2) {
-        __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
-        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-    }
-    __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
-
-    while (K > 0) {
-        __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-        vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
-
-        __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-        vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
-
-        __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-        vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
-
-        __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
-        vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
-
-        __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
-        vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
-
-        __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
-        vcmacc_vf(v20, v22, ft10, ft11, v28, v30);
-        K -= 1;
-
-        if (K == 0) { break; }
-
-        if (K >= 2) {
-            __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
-            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-        }
-        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
-
-        __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-        vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
-
-        __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-        vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
-
-        __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-        vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
-
-        __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
-        vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
-
-        __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
-        vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
-
-        __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
-        vcmacc_vf(v20, v22, ft10, ft11, v24, v26);
-        K -= 1;
-
-        if (K >= 2) {
-            __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
-            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-        }
-        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
-    }
-
-    rsc *= 2 * FLT_SIZE;
-    csc *= 2 * FLT_SIZE;
-
-    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE));
-
-    __asm__("vfmul.vf v24, v2, ft1");
-    __asm__("vfmul.vf v26, v0, ft1");
-    __asm__("vfmul.vf v28, v6, ft1");
-    __asm__("vfmul.vf v30, v4, ft1");
-
-    __asm__("vfmsub.vf v0, ft0, v24");
-    __asm__("vfmadd.vf v2, ft0, v26");
-    __asm__("vfmsub.vf v4, ft0, v28");
-    __asm__("vfmadd.vf v6, ft0, v30");
-
-    __asm__("vfmul.vf v24, v10, ft1");
-    __asm__("vfmul.vf v26, v8, ft1");
-    __asm__("vfmul.vf v28, v14, ft1");
-    __asm__("vfmul.vf v30, v12, ft1");
-
-    __asm__("vfmsub.vf v8, ft0, v24");
-    __asm__("vfmadd.vf v10, ft0, v26");
-    __asm__("vfmsub.vf v12, ft0, v28");
-    __asm__("vfmadd.vf v14, ft0, v30");
-
-    __asm__("vfmul.vf v24, v18, ft1");
-    __asm__("vfmul.vf v26, v16, ft1");
-    __asm__("vfmul.vf v28, v22, ft1");
-    __asm__("vfmul.vf v30, v20, ft1");
-
-    __asm__("vfmsub.vf v16, ft0, v24");
-    __asm__("vfmadd.vf v18, ft0, v26");
-    __asm__("vfmsub.vf v20, ft0, v28");
-    __asm__("vfmadd.vf v22, ft0, v30");
-
-    scomplex beta_cast = *beta;
-    if (beta_cast.real != 0.f || beta_cast.imag != 0.f) {
-        if (csc == 2 * FLT_SIZE) {
-            scomplex *c_tmp = c;
-            __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
-
-            __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
-
-            __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
-
-            __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
-
-            __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
-            vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
-
-            vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30);
-        }
-        else {
-            scomplex *c_tmp = c;
-            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
-
-            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
-
-            __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
-
-            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
-
-            __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
-
-            vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30);
-        }
-    }
-
-    if (csc == 2 * FLT_SIZE) {
-        __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSEG2 "v20, (%0)" : : "r"(c));
-    }
-    else {
-        __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSSEG2 "v20, (%0), %1" : : "r"(c), "r"(csc));
-    }
-
-    return;
-}
-
-void bli_cgemm_6m2_cleanup
-     (
-             dim_t              M,
-             dim_t              N,
-             dim_t              K,
-       const scomplex* restrict alpha,
-       const scomplex* restrict a,
-       const scomplex* restrict b,
-       const scomplex* restrict beta,
-             scomplex* restrict c, inc_t rsc, inc_t csc
-     )
-{
-    // M x N x K cgemm, 0 < M < 6, N <= 32 = vlmax, K > 0
-    // pairs of register groups hold the real and imag. parts of rows of c and b
-
-    __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
-    __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
-    __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-    if (K >= 2) {
-        __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
-        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-    }
-
-    switch (M) {
-        case 5:
-            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
-            vcmul_vf(v16, v18, v24, v26, ft8, ft9);
-        case 4:
-            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
-            vcmul_vf(v12, v14, v24, v26, ft6, ft7);
-        case 3:
-            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-            vcmul_vf(v8, v10, v24, v26, ft4, ft5);
-        case 2:
-            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-            vcmul_vf(v4, v6, v24, v26, ft2, ft3);
-        case 1:
-            __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-            vcmul_vf(v0, v2, v24, v26, ft0, ft1);
-    }
-    K -= 1;
-
-    if (K >= 2) {
-        __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
-        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-    }
-    __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
-
-    while (K > 0) {
-        switch (M) {
-            case 5:
-                __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
-                vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
-            case 4:
-                __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
-                vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
-            case 3:
-                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-                vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
-            case 2:
-                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-                vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
-            case 1:
-                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-                vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
-        }
-        K -= 1;
-
-        if (K == 0) { break; }
-
-        if (K >= 2) {
-            __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
-            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-        }
-        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
-
-        switch (M) {
-            case 5:
-                __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
-                vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
-            case 4:
-                __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
-                vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
-            case 3:
-                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-                vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
-            case 2:
-                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-                vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
-            case 1:
-                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-                vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
-        }
-        K -= 1;
-
-        if (K >= 2) {
-            __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
-            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-        }
-        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
-    }
-
-    c += (M - 1) * rsc;
-    rsc *= 2 * FLT_SIZE;
-    csc *= 2 * FLT_SIZE;
-
-    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE));
-
-    switch (M) {
-        case 5:
-            __asm__("vfmul.vf v24, v18, ft1");
-            __asm__("vfmul.vf v26, v16, ft1");
-            __asm__("vfmsub.vf v16, ft0, v24");
-            __asm__("vfmadd.vf v18, ft0, v26");
-        case 4:
-            __asm__("vfmul.vf v28, v14, ft1");
-            __asm__("vfmul.vf v30, v12, ft1");
-            __asm__("vfmsub.vf v12, ft0, v28");
-            __asm__("vfmadd.vf v14, ft0, v30");
-        case 3:
-            __asm__("vfmul.vf v24, v10, ft1");
-            __asm__("vfmul.vf v26, v8, ft1");
-            __asm__("vfmsub.vf v8, ft0, v24");
-            __asm__("vfmadd.vf v10, ft0, v26");
-        case 2:
-            __asm__("vfmul.vf v28, v6, ft1");
-            __asm__("vfmul.vf v30, v4, ft1");
-            __asm__("vfmsub.vf v4, ft0, v28");
-            __asm__("vfmadd.vf v6, ft0, v30");
-        case 1:
-            __asm__("vfmul.vf v24, v2, ft1");
-            __asm__("vfmul.vf v26, v0, ft1");
-            __asm__("vfmsub.vf v0, ft0, v24");
-            __asm__("vfmadd.vf v2, ft0, v26");
-    }
-
-    scomplex beta_cast = *beta;
-    if (beta_cast.real != 0.f || beta_cast.imag != 0.f) {
-        if (csc == 2 * FLT_SIZE) {
-            scomplex *c_tmp = c;
-            switch (M) {
-                case 5:
-                    __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
-                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                    vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
-                case 4:
-                    __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
-                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                    vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
-                case 3:
-                    __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
-                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                    vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
-                case 2:
-                    __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
-                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                    vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
-                case 1:
-                    __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
-                    vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
-            }
-        }
-        else {
-            scomplex *c_tmp = c;
-            switch (M) {
-                case 5:
-                    __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                    vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
-                case 4:
-                    __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                    vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
-                case 3:
-                    __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                    vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
-                case 2:
-                    __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                    vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
-                case 1:
-                    __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                    vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
-            }
-        }
-    }
-
-    if (csc == 2 * FLT_SIZE) {
-        switch (M) {
-            case 5:
-                __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 4:
-                __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 3:
-                __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 2:
-                __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 1:
-                __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
-        }
-    }
-    else {
-        switch (M) {
-            case 5:
-                __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 4:
-                __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 3:
-                __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 2:
-                __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 1:
-                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
-        }
-    }
-
-    return;
-}
-
-void bli_cgemm_6m2_k0
-     (
-             dim_t              M,
-             dim_t              N,
-       const scomplex* restrict beta,
-             scomplex* restrict c, inc_t rsc, inc_t csc
-     )
-{
-    // 0 < M <= 6, 0 < N <= 32 = vlmax, K = 0
-    // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
-    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
-    csc *= 2 * FLT_SIZE;
-
-    scomplex beta_cast = *beta;
-    if (beta_cast.real == 0.f && beta_cast.imag == 0.f) {
-        // set c to 0
-        __asm__("vmv.v.i v0, 0");
-        __asm__("vmv.v.i v2, 0");
-        for (size_t i = 0; i < M; ++i) {
-            if (csc == 2 * FLT_SIZE)
-                __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
-            else
-                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
-            c += rsc;
-        }
-    }
-    else {
-        // scale c by beta
-        for (size_t i = 0; i < M; ++i) {
-            if (csc == 2 * FLT_SIZE) {
-                __asm__(VLSEG2 "v0, (%0)" : : "r"(c));
-                vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag);
-                __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
-            }
-            else {
-                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
-                vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag);
-                __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
-            }
-            c += rsc;
-        }
-    }
-    return;
-}
-
-void bli_cgemm_sifive_x280_asm_6m2
-     (
-             dim_t               M,
-             dim_t               N,
-             dim_t               K,
-       const void*      restrict alpha_,
-       const void*      restrict a_,
-       const void*      restrict b_,
-       const void*      restrict beta_,
-             void*      restrict c_, inc_t rsc, inc_t csc,
-       const auxinfo_t* restrict data,
-       const cntx_t*    restrict cntx
-     )
-{
-    // M x N x K cgemm
-    (void) data;
-    (void) cntx;
-    const scomplex* restrict alpha = alpha_;
-    const scomplex* restrict beta = beta_;
-    const scomplex* restrict a = a_;
-    const scomplex* restrict b = b_;
-    scomplex* restrict c = c_;
-
-    if (M <= 0 || N <= 0 || K < 0)
-        return;
-    else if (K == 0)
-        bli_cgemm_6m2_k0(M, N, beta, c, rsc, csc);
-    else if (M == 6)
-        bli_cgemm_6m2(N, K, alpha, a, b, beta, c, rsc, csc);
-    else
-        bli_cgemm_6m2_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc);
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef VLSEG2
-#undef VLSSEG2
-#undef VSSEG2
-#undef VSSSEG2
-#undef PACKMR
-#undef PACKNR
-
-// byte-size of underlying floating point type
-#define FLT_SIZE 8
-#define FLT_LOAD "fld "
-#define VLSEG2 "vlseg2e64.v "
-#define VLSSEG2 "vlsseg2e64.v "
-#define VSSEG2 "vsseg2e64.v "
-#define VSSSEG2 "vssseg2e64.v "
-#define PACKMR 8
-#define PACKNR 16
-
-void bli_zgemm_6m2
-     (
-             dim_t              N,
-             dim_t              K,
-       const dcomplex* restrict alpha,
-       const dcomplex* restrict a,
-       const dcomplex* restrict b,
-       const dcomplex* restrict beta,
-             dcomplex* restrict c, inc_t rsc, inc_t csc
-     )
-{
-    // 6 x N x K zgemm, N <= 32 = vlmax, K > 0
-    // pairs of register groups hold the real and imag. parts of rows of c and b
-    __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
-    __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
-    __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-    if (K >= 2) {
-        __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
-        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-    }
-
-    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-    vcmul_vf(v0, v2, v24, v26, ft0, ft1);
-
-    __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-    vcmul_vf(v4, v6, v24, v26, ft2, ft3);
-
-    __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-    vcmul_vf(v8, v10, v24, v26, ft4, ft5);
-
-    __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
-    vcmul_vf(v12, v14, v24, v26, ft6, ft7);
-
-    __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
-    vcmul_vf(v16, v18, v24, v26, ft8, ft9);
-
-    __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
-    vcmul_vf(v20, v22, v24, v26, ft10, ft11);
-    K -= 1;
-
-    if (K >= 2) {
-        __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
-        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-    }
-    __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
-
-    while (K > 0) {
-        __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-        vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
-
-        __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-        vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
-
-        __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-        vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
-
-        __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
-        vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
-
-        __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
-        vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
-
-        __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
-        vcmacc_vf(v20, v22, ft10, ft11, v28, v30);
-        K -= 1;
-
-        if (K == 0) { break; }
-
-        if (K >= 2) {
-            __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
-            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-        }
-        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
-
-        __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-        vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
-
-        __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-        vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
-
-        __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-        vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
-
-        __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
-        vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
-
-        __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
-        vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
-
-        __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
-        __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
-        vcmacc_vf(v20, v22, ft10, ft11, v24, v26);
-        K -= 1;
-
-        if (K >= 2) {
-            __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
-            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-        }
-        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
-    }
-
-    rsc *= 2 * FLT_SIZE;
-    csc *= 2 * FLT_SIZE;
-
-    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE));
-
-    __asm__("vfmul.vf v24, v2, ft1");
-    __asm__("vfmul.vf v26, v0, ft1");
-    __asm__("vfmul.vf v28, v6, ft1");
-    __asm__("vfmul.vf v30, v4, ft1");
-
-    __asm__("vfmsub.vf v0, ft0, v24");
-    __asm__("vfmadd.vf v2, ft0, v26");
-    __asm__("vfmsub.vf v4, ft0, v28");
-    __asm__("vfmadd.vf v6, ft0, v30");
-
-    __asm__("vfmul.vf v24, v10, ft1");
-    __asm__("vfmul.vf v26, v8, ft1");
-    __asm__("vfmul.vf v28, v14, ft1");
-    __asm__("vfmul.vf v30, v12, ft1");
-
-    __asm__("vfmsub.vf v8, ft0, v24");
-    __asm__("vfmadd.vf v10, ft0, v26");
-    __asm__("vfmsub.vf v12, ft0, v28");
-    __asm__("vfmadd.vf v14, ft0, v30");
-
-    __asm__("vfmul.vf v24, v18, ft1");
-    __asm__("vfmul.vf v26, v16, ft1");
-    __asm__("vfmul.vf v28, v22, ft1");
-    __asm__("vfmul.vf v30, v20, ft1");
-
-    __asm__("vfmsub.vf v16, ft0, v24");
-    __asm__("vfmadd.vf v18, ft0, v26");
-    __asm__("vfmsub.vf v20, ft0, v28");
-    __asm__("vfmadd.vf v22, ft0, v30");
-
-    dcomplex beta_cast = *beta;
-    if (beta_cast.real != 0. || beta_cast.imag != 0.) {
-        if (csc == 2 * FLT_SIZE) {
-            dcomplex *c_tmp = c;
-            __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
-
-            __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
-
-            __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
-
-            __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
-
-            __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
-            vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
-
-            vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30);
-        }
-        else {
-            dcomplex *c_tmp = c;
-            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
-
-            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
-
-            __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
-
-            __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-            vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
-
-            __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-            vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
-
-            vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30);
-        }
-    }
-
-    if (csc == 2 * FLT_SIZE) {
-        __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSEG2 "v20, (%0)" : : "r"(c));
-    }
-    else {
-        __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
-        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
-        __asm__(VSSSEG2 "v20, (%0), %1" : : "r"(c), "r"(csc));
-    }
-
-    return;
-}
-
-void bli_zgemm_6m2_cleanup
-     (
-             dim_t              M,
-             dim_t              N,
-             dim_t              K,
-       const dcomplex* restrict alpha,
-       const dcomplex* restrict a,
-       const dcomplex* restrict b,
-       const dcomplex* restrict beta,
-             dcomplex* restrict c, inc_t rsc, inc_t csc
-     )
-{
-    // M x N x K zgemm, 0 < M < 6, N <= 32 = vlmax, K > 0
-    // pairs of register groups hold the real and imag. parts of rows of c and b
-
-    __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
-    __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
-    __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-    if (K >= 2) {
-        __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
-        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-    }
-
-    switch (M) {
-        case 5:
-            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
-            vcmul_vf(v16, v18, v24, v26, ft8, ft9);
-        case 4:
-            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
-            vcmul_vf(v12, v14, v24, v26, ft6, ft7);
-        case 3:
-            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-            vcmul_vf(v8, v10, v24, v26, ft4, ft5);
-        case 2:
-            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-            vcmul_vf(v4, v6, v24, v26, ft2, ft3);
-        case 1:
-            __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-            vcmul_vf(v0, v2, v24, v26, ft0, ft1);
-    }
-    K -= 1;
-
-    if (K >= 2) {
-        __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
-        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-    }
-    __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
-
-    while (K > 0) {
-        switch (M) {
-            case 5:
-                __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
-                vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
-            case 4:
-                __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
-                vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
-            case 3:
-                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-                vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
-            case 2:
-                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-                vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
-            case 1:
-                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-                vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
-        }
-        K -= 1;
-
-        if (K == 0) { break; }
-
-        if (K >= 2) {
-            __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
-            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-        }
-        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
-
-        switch (M) {
-            case 5:
-                __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
-                vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
-            case 4:
-                __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
-                vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
-            case 3:
-                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
-                vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
-            case 2:
-                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
-                vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
-            case 1:
-                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
-                vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
-        }
-        K -= 1;
-
-        if (K >= 2) {
-            __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
-            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
-        }
-        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
-    }
-
-    c += (M - 1) * rsc;
-    rsc *= 2 * FLT_SIZE;
-    csc *= 2 * FLT_SIZE;
-
-    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE));
-
-    switch (M) {
-        case 5:
-            __asm__("vfmul.vf v24, v18, ft1");
-            __asm__("vfmul.vf v26, v16, ft1");
-            __asm__("vfmsub.vf v16, ft0, v24");
-            __asm__("vfmadd.vf v18, ft0, v26");
-        case 4:
-            __asm__("vfmul.vf v28, v14, ft1");
-            __asm__("vfmul.vf v30, v12, ft1");
-            __asm__("vfmsub.vf v12, ft0, v28");
-            __asm__("vfmadd.vf v14, ft0, v30");
-        case 3:
-            __asm__("vfmul.vf v24, v10, ft1");
-            __asm__("vfmul.vf v26, v8, ft1");
-            __asm__("vfmsub.vf v8, ft0, v24");
-            __asm__("vfmadd.vf v10, ft0, v26");
-        case 2:
-            __asm__("vfmul.vf v28, v6, ft1");
-            __asm__("vfmul.vf v30, v4, ft1");
-            __asm__("vfmsub.vf v4, ft0, v28");
-            __asm__("vfmadd.vf v6, ft0, v30");
-        case 1:
-            __asm__("vfmul.vf v24, v2, ft1");
-            __asm__("vfmul.vf v26, v0, ft1");
-            __asm__("vfmsub.vf v0, ft0, v24");
-            __asm__("vfmadd.vf v2, ft0, v26");
-    }
-
-    dcomplex beta_cast = *beta;
-    if (beta_cast.real != 0. || beta_cast.imag != 0.) {
-        if (csc == 2 * FLT_SIZE) {
-            dcomplex *c_tmp = c;
-            switch (M) {
-                case 5:
-                    __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
-                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                    vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
-                case 4:
-                    __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
-                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                    vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
-                case 3:
-                    __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
-                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                    vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
-                case 2:
-                    __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
-                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                    vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
-                case 1:
-                    __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
-                    vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
-            }
-        }
-        else {
-            dcomplex *c_tmp = c;
-            switch (M) {
-                case 5:
-                    __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                    vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
-                case 4:
-                    __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                    vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
-                case 3:
-                    __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                    vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
-                case 2:
-                    __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                    __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
-                    vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
-                case 1:
-                    __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
-                    vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
-            }
-        }
-    }
-
-    if (csc == 2 * FLT_SIZE) {
-        switch (M) {
-            case 5:
-                __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 4:
-                __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 3:
-                __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 2:
-                __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 1:
-                __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
-        }
-    }
-    else {
-        switch (M) {
-            case 5:
-                __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 4:
-                __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 3:
-                __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 2:
-                __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
-                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
-            case 1:
-                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
-        }
-    }
-
-    return;
-}
-
-void bli_zgemm_6m2_k0
-     (
-             dim_t              M,
-             dim_t              N,
-       const dcomplex* restrict beta,
-             dcomplex* restrict c, inc_t rsc, inc_t csc
-     )
-{
-    // 0 < M <= 6, 0 < N <= 32 = vlmax, K = 0
-    // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
-    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
-    csc *= 2 * FLT_SIZE;
-
-    dcomplex beta_cast = *beta;
-    if (beta_cast.real == 0. && beta_cast.imag == 0.) {
-        // set c to 0
-        __asm__("vmv.v.i v0, 0");
-        __asm__("vmv.v.i v2, 0");
-        for (size_t i = 0; i < M; ++i) {
-            if (csc == 2 * FLT_SIZE)
-                __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
-            else
-                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
-            c += rsc;
-        }
-    }
-    else {
-        // scale c by beta
-        for (size_t i = 0; i < M; ++i) {
-            if (csc == 2 * FLT_SIZE) {
-                __asm__(VLSEG2 "v0, (%0)" : : "r"(c));
-                vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag);
-                __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
-            }
-            else {
-                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
-                vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag);
-                __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
-            }
-            c += rsc;
-        }
-    }
-    return;
-}
-
-void bli_zgemm_sifive_x280_asm_6m2
-     (
-             dim_t               M,
-             dim_t               N,
-             dim_t               K,
-       const void*      restrict alpha_,
-       const void*      restrict a_,
-       const void*      restrict b_,
-       const void*      restrict beta_,
-             void*      restrict c_, inc_t rsc, inc_t csc,
-       const auxinfo_t* restrict data,
-       const cntx_t*    restrict cntx
-     )
-{
-    // M x N x K zgemm
-    (void) data;
-    (void) cntx;
-    const dcomplex* restrict alpha = alpha_;
-    const dcomplex* restrict beta = beta_;
-    const dcomplex* restrict a = a_;
-    const dcomplex* restrict b = b_;
-    dcomplex* restrict c = c_;
-
-    if (M <= 0 || N <= 0 || K < 0)
-        return;
-    else if (K == 0)
-        bli_zgemm_6m2_k0(M, N, beta, c, rsc, csc);
-    else if (M == 6)
-        bli_zgemm_6m2(N, K, alpha, a, b, beta, c, rsc, csc);
-    else
-        bli_zgemm_6m2_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc);
-    return;
-}
-
-#undef FLT_SIZE
-#undef FLT_LOAD
-#undef VLSEG2
-#undef VLSSEG2
-#undef VSSEG2
-#undef VSSSEG2
-#undef PACKMR
-#undef PACKNR
diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c
new file mode 100644
index 000000000..664d4616f
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c
@@ -0,0 +1,138 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#include "../../riscv_cmul_macros_intr.h"
+#include "../../riscv_overloaded_intrinsics.h"
+#include "blis.h"
+#include <riscv_vector.h>
+#include <stdint.h>
+
+#define GEMM_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemm_sifive_x280_intr(\
+         dim_t               m,                        \
+         dim_t               n,                        \
+         dim_t               k,                        \
+   const void*      restrict alpha_,                   \
+   const void*      restrict a_,                       \
+   const void*      restrict b_,                       \
+   const void*      restrict beta_,                    \
+         void*      restrict c_, inc_t rsc, inc_t csc, \
+   const auxinfo_t* restrict data,                     \
+   const cntx_t*    restrict cntx                      \
+)
+
+#define GEMM(...)  GEMM_(__VA_ARGS__)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+#define PACKMR 8
+#define PACKNR 64
+
+#include "./bli_gemm_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+#undef PACKMR
+#undef PACKNR
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+#define PACKMR 8
+#define PACKNR 32
+
+#include "./bli_gemm_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+#undef PACKMR
+#undef PACKNR
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m2
+#define FLT_SIZE sizeof(float)
+#define PACKMR 8
+#define PACKNR 32
+
+#include "./bli_gemm_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+#undef PACKMR
+#undef PACKNR
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m2
+#define FLT_SIZE sizeof(double)
+#define PACKMR 8
+#define PACKNR 16
+
+#include "./bli_gemm_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+#undef PACKMR
+#undef PACKNR
+
+#undef GEMM
+#undef GEMM_
diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..15a19ab49
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c
@@ -0,0 +1,517 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef GEMM
+
+GEMM(PRECISION_CHAR, void) // Complex GEMM micro-kernel template; going by the helper macros (defined elsewhere) it forms C := beta*C + alpha*(A*B) on an m x n tile.
+{ // The k > 0 paths below handle m == 6 (unrolled) and m in 1..5 (switch fall-through); n is used as the vector length of every vector op.
+    (void) data; // Suppress unused parameter warnings
+    (void) cntx;
+    const DATATYPE* restrict alpha = alpha_; // Typed views of the pointer arguments declared by the GEMM() signature macro.
+    const DATATYPE* restrict a = a_; // Packed A: advanced by PACKMR elements per k step.
+    const DATATYPE* restrict b = b_; // Packed B: advanced by PACKNR elements per k step.
+    const DATATYPE* restrict beta = beta_;
+    DATATYPE* restrict c = c_; // Row i of C starts at c + i * rsc; csc is the element stride within a row.
+
+    if (m <= 0 || n <= 0 || k < 0)
+        return; // Empty tile or negative k: C is left untouched.
+    else if (k == 0) { // No A*B term (alpha is never read here): C is only scaled by beta.
+        if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { // beta tests as zero: overwrite C with zeros without loading it.
+            RVV_TYPE_FX(PREC, LMUL, 2) zero_splat = VUNDEFINED_FX(PREC, LMUL, 2)();
+            zero_splat = VSET_V_F(PREC, LMUL, 2)(zero_splat, 0, VFMV_V_F(PREC, LMUL)(0., n)); // field 0: real parts
+            zero_splat = VSET_V_F(PREC, LMUL, 2)(zero_splat, 1, VFMV_V_F(PREC, LMUL)(0., n)); // field 1: imaginary parts
+
+            for (dim_t i = 0; i < m; ++i) {
+                if (csc == 1) // Row elements are adjacent: unit-stride two-field segment store.
+                    VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), zero_splat, n);
+                else // Otherwise a strided segment store; the stride is in bytes (two real scalars per element, times csc).
+                    VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), 2 * FLT_SIZE * csc, zero_splat, n);
+            }
+        }
+        else { // beta is nonzero: load each row of C, multiply by beta, store it back.
+            for (dim_t i = 0; i < m; ++i) {
+                RVV_TYPE_FX(PREC, LMUL, 2) c0;
+                RVV_TYPE_F(PREC, LMUL) c0_r, c0_i;
+                RVV_TYPE_F(PREC, LMUL) beta_c0_r, beta_c0_i;
+
+                if (csc == 1)
+                    c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), n);
+                else
+                    c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), 2 * FLT_SIZE * csc, n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMUL_VF(PREC, LMUL, beta_c0_r, beta_c0_i, c0_r, c0_i, beta->real, beta->imag, n); // presumably (beta_c0_r, beta_c0_i) = c0 * beta; VCMUL_VF is defined elsewhere
+                c0 = VSET_V_F(PREC, LMUL, 2)(c0, 0, beta_c0_r);
+                c0 = VSET_V_F(PREC, LMUL, 2)(c0, 1, beta_c0_i);
+                if (csc == 1)
+                    VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), c0, n);
+                else
+                    VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), 2 * FLT_SIZE * csc, c0, n);
+            }
+        }
+    }
+    else if (m == 6) { // Full-height tile: all six rows are unrolled explicitly.
+        RVV_TYPE_F(PREC, LMUL) ab0_r, ab1_r, ab2_r, ab3_r, ab4_r, ab5_r; // Per-row accumulators (real parts).
+        RVV_TYPE_F(PREC, LMUL) ab0_i, ab1_i, ab2_i, ab3_i, ab4_i, ab5_i; // Per-row accumulators (imaginary parts).
+        RVV_TYPE_FX(PREC, LMUL, 2) b0, b1; // Two alternating buffers for rows of packed B.
+        RVV_TYPE_F(PREC, LMUL) b0_r, b1_r;
+        RVV_TYPE_F(PREC, LMUL) b0_i, b1_i;
+
+        b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); // Row 0 of B: deinterleave into real and imaginary vectors.
+        b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0);
+        b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1);
+        b += PACKNR;
+        if (k >= 2) { // Row 1 exists: load it into b1 before row 0 is consumed. When k == 1, b1 stays unset and is never read.
+            b1 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n);
+            b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0);
+            b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1);
+            b += PACKNR;
+        }
+
+        VCMUL_VF(PREC, LMUL, ab0_r, ab0_i, b0_r, b0_i, a[0].real, a[0].imag, n); // First k step initialises the accumulators (multiply, not accumulate).
+        VCMUL_VF(PREC, LMUL, ab1_r, ab1_i, b0_r, b0_i, a[1].real, a[1].imag, n);
+        VCMUL_VF(PREC, LMUL, ab2_r, ab2_i, b0_r, b0_i, a[2].real, a[2].imag, n);
+        VCMUL_VF(PREC, LMUL, ab3_r, ab3_i, b0_r, b0_i, a[3].real, a[3].imag, n);
+        VCMUL_VF(PREC, LMUL, ab4_r, ab4_i, b0_r, b0_i, a[4].real, a[4].imag, n);
+        VCMUL_VF(PREC, LMUL, ab5_r, ab5_i, b0_r, b0_i, a[5].real, a[5].imag, n);
+        
+        a += PACKMR;
+        k -= 1; // From here on k counts the k steps still to be accumulated.
+        
+        if (k >= 2) { // A step beyond the one buffered in b1 remains: reload b0, whose old contents are no longer needed.
+            b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n);
+            b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0);
+            b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1);
+            b += PACKNR;
+        }
+
+        while (k > 0) { // Two k steps per trip: the first uses b1, the second uses b0.
+            VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0].real, a[0].imag, b1_r, b1_i, n); // presumably ab += a[i] * b1; VCMACC_VF is defined elsewhere
+            VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1].real, a[1].imag, b1_r, b1_i, n);
+            VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2].real, a[2].imag, b1_r, b1_i, n);
+            VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3].real, a[3].imag, b1_r, b1_i, n);
+            VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4].real, a[4].imag, b1_r, b1_i, n);
+            VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, a[5].real, a[5].imag, b1_r, b1_i, n);
+             
+            a += PACKMR;
+            k -= 1;
+
+            if (k == 0) { break; } // Odd number of remaining steps: finished after the b1 half.
+
+            if (k >= 2) { // Refill b1 only if a step after the upcoming b0 step still needs it.
+                b1 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n);
+                b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0);
+                b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1);
+                b += PACKNR;
+            }
+
+            VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0].real, a[0].imag, b0_r, b0_i, n);
+            VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1].real, a[1].imag, b0_r, b0_i, n);
+            VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2].real, a[2].imag, b0_r, b0_i, n);
+            VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3].real, a[3].imag, b0_r, b0_i, n);
+            VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4].real, a[4].imag, b0_r, b0_i, n);
+            VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, a[5].real, a[5].imag, b0_r, b0_i, n);
+             
+            a += PACKMR;
+            k -= 1;
+
+            if (k == 0) { break; }
+
+            if (k >= 2) { // Refill b0 for the second step of the next trip, if there is one.
+                b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n);
+                b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0);
+                b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1);
+                b += PACKNR;
+            }
+        }
+
+        RVV_TYPE_F(PREC, LMUL) temp0_r, temp1_r; // Scale the accumulators by alpha, two rows at a time.
+        RVV_TYPE_F(PREC, LMUL) temp0_i, temp1_i;
+        temp0_r = VFMUL_VF(PREC, LMUL)(ab0_i, alpha->imag, n); // Im(ab) * Im(alpha)
+        temp0_i = VFMUL_VF(PREC, LMUL)(ab0_r, alpha->imag, n); // Re(ab) * Im(alpha), taken before ab0_r is overwritten below
+        temp1_r = VFMUL_VF(PREC, LMUL)(ab1_i, alpha->imag, n);
+        temp1_i = VFMUL_VF(PREC, LMUL)(ab1_r, alpha->imag, n);
+
+        ab0_r = VFMSUB_VF(PREC, LMUL)(ab0_r, alpha->real, temp0_r, n); // Re: ab_r*alpha_r - ab_i*alpha_i, assuming VFMSUB_VF wraps vfmsub.vf (vd*rs1 - vs2)
+        ab0_i = VFMADD_VF(PREC, LMUL)(ab0_i, alpha->real, temp0_i, n); // Im: ab_i*alpha_r + ab_r*alpha_i, assuming VFMADD_VF wraps vfmadd.vf (vd*rs1 + vs2)
+        ab1_r = VFMSUB_VF(PREC, LMUL)(ab1_r, alpha->real, temp1_r, n);
+        ab1_i = VFMADD_VF(PREC, LMUL)(ab1_i, alpha->real, temp1_i, n);
+
+        temp0_r = VFMUL_VF(PREC, LMUL)(ab2_i, alpha->imag, n);
+        temp0_i = VFMUL_VF(PREC, LMUL)(ab2_r, alpha->imag, n);
+        temp1_r = VFMUL_VF(PREC, LMUL)(ab3_i, alpha->imag, n);
+        temp1_i = VFMUL_VF(PREC, LMUL)(ab3_r, alpha->imag, n);
+
+        ab2_r = VFMSUB_VF(PREC, LMUL)(ab2_r, alpha->real, temp0_r, n);
+        ab2_i = VFMADD_VF(PREC, LMUL)(ab2_i, alpha->real, temp0_i, n);
+        ab3_r = VFMSUB_VF(PREC, LMUL)(ab3_r, alpha->real, temp1_r, n);
+        ab3_i = VFMADD_VF(PREC, LMUL)(ab3_i, alpha->real, temp1_i, n);
+
+        temp0_r = VFMUL_VF(PREC, LMUL)(ab4_i, alpha->imag, n);
+        temp0_i = VFMUL_VF(PREC, LMUL)(ab4_r, alpha->imag, n);
+        temp1_r = VFMUL_VF(PREC, LMUL)(ab5_i, alpha->imag, n);
+        temp1_i = VFMUL_VF(PREC, LMUL)(ab5_r, alpha->imag, n);
+
+        ab4_r = VFMSUB_VF(PREC, LMUL)(ab4_r, alpha->real, temp0_r, n);
+        ab4_i = VFMADD_VF(PREC, LMUL)(ab4_i, alpha->real, temp0_i, n);
+        ab5_r = VFMSUB_VF(PREC, LMUL)(ab5_r, alpha->real, temp1_r, n);
+        ab5_i = VFMADD_VF(PREC, LMUL)(ab5_i, alpha->real, temp1_i, n);
+
+        if (!PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { // C is loaded only when beta is nonzero; otherwise it is simply overwritten.
+            RVV_TYPE_FX(PREC, LMUL, 2) c0;
+            RVV_TYPE_F(PREC, LMUL) c0_r, c0_i;
+            if (csc == 1) { // Unit-stride loads of each row of C.
+                c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, beta->real, beta->imag, c0_r, c0_i, n); // presumably ab0 += beta * c0
+                
+                c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, beta->real, beta->imag, c0_r, c0_i, n);
+                
+                c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, beta->real, beta->imag, c0_r, c0_i, n);
+                
+                c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, beta->real, beta->imag, c0_r, c0_i, n);
+                
+                c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, beta->real, beta->imag, c0_r, c0_i, n);
+                
+                c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 5 * rsc), n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, beta->real, beta->imag, c0_r, c0_i, n);
+            }
+            else { // Same accumulation using strided loads (byte stride 2 * FLT_SIZE * csc).
+                c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), 2 * FLT_SIZE * csc, n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, beta->real, beta->imag, c0_r, c0_i, n);
+                
+                c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), 2 * FLT_SIZE * csc, n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, beta->real, beta->imag, c0_r, c0_i, n);
+                
+                c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), 2 * FLT_SIZE * csc, n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, beta->real, beta->imag, c0_r, c0_i, n);
+                
+                c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), 2 * FLT_SIZE * csc, n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, beta->real, beta->imag, c0_r, c0_i, n);
+                
+                c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), 2 * FLT_SIZE * csc, n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, beta->real, beta->imag, c0_r, c0_i, n);
+                
+                c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 5 * rsc), 2 * FLT_SIZE * csc, n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, beta->real, beta->imag, c0_r, c0_i, n);
+            }
+        }
+
+        RVV_TYPE_FX(PREC, LMUL, 2) ab0 = VCREATE_V_FX(PREC, LMUL, 2)(ab0_r, ab0_i); // Re-pair real and imaginary vectors for the segment stores.
+        RVV_TYPE_FX(PREC, LMUL, 2) ab1 = VCREATE_V_FX(PREC, LMUL, 2)(ab1_r, ab1_i);
+        RVV_TYPE_FX(PREC, LMUL, 2) ab2 = VCREATE_V_FX(PREC, LMUL, 2)(ab2_r, ab2_i);
+        RVV_TYPE_FX(PREC, LMUL, 2) ab3 = VCREATE_V_FX(PREC, LMUL, 2)(ab3_r, ab3_i);
+        RVV_TYPE_FX(PREC, LMUL, 2) ab4 = VCREATE_V_FX(PREC, LMUL, 2)(ab4_r, ab4_i);
+        RVV_TYPE_FX(PREC, LMUL, 2) ab5 = VCREATE_V_FX(PREC, LMUL, 2)(ab5_r, ab5_i);
+
+        if (csc == 1) { // Write the six result rows back to C.
+            VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), ab0, n);
+            VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), ab1, n);
+            VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), ab2, n);
+            VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), ab3, n);
+            VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), ab4, n);
+            VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 5 * rsc), ab5, n);
+        }
+        else {
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), 2 * FLT_SIZE * csc, ab0, n);
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), 2 * FLT_SIZE * csc, ab1, n);
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), 2 * FLT_SIZE * csc, ab2, n);
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), 2 * FLT_SIZE * csc, ab3, n);
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), 2 * FLT_SIZE * csc, ab4, n);
+            VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 5 * rsc), 2 * FLT_SIZE * csc, ab5, n);
+        }
+    }
+    else { // Partial-height tile: same schedule as the m == 6 path, with the row count selected by switch fall-through.
+        RVV_TYPE_F(PREC, LMUL) ab0_r, ab1_r, ab2_r, ab3_r, ab4_r; // Only the first m accumulators are ever written or read.
+        RVV_TYPE_F(PREC, LMUL) ab0_i, ab1_i, ab2_i, ab3_i, ab4_i;
+        RVV_TYPE_FX(PREC, LMUL, 2) b0, b1;
+        RVV_TYPE_F(PREC, LMUL) b0_r, b1_r;
+        RVV_TYPE_F(PREC, LMUL) b0_i, b1_i;
+
+        b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n);
+        b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0);
+        b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1);
+        b += PACKNR;
+        if (k >= 2) {
+            b1 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n);
+            b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0);
+            b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1);
+            b += PACKNR;
+        }
+
+        switch (m) { // Cases fall through on purpose: entering at `case m` processes rows m-1 down to 0. Any m outside 1..5 matches no case.
+        case 5:
+            VCMUL_VF(PREC, LMUL, ab4_r, ab4_i, b0_r, b0_i, a[4].real, a[4].imag, n);
+        case 4:
+            VCMUL_VF(PREC, LMUL, ab3_r, ab3_i, b0_r, b0_i, a[3].real, a[3].imag, n);
+        case 3:
+            VCMUL_VF(PREC, LMUL, ab2_r, ab2_i, b0_r, b0_i, a[2].real, a[2].imag, n);
+        case 2:
+            VCMUL_VF(PREC, LMUL, ab1_r, ab1_i, b0_r, b0_i, a[1].real, a[1].imag, n);
+        case 1:
+            VCMUL_VF(PREC, LMUL, ab0_r, ab0_i, b0_r, b0_i, a[0].real, a[0].imag, n);
+        }
+        
+        a += PACKMR; // A still advances by the full packed stride PACKMR, regardless of m.
+        k -= 1;
+        
+        if (k >= 2) {
+            b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n);
+            b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0);
+            b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1);
+            b += PACKNR;
+        }
+
+        while (k > 0) { // Two k steps per trip, alternating between b1 and b0 as in the m == 6 path.
+            switch (m) {
+            case 5:
+                VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4].real, a[4].imag, b1_r, b1_i, n);
+            case 4:
+                VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3].real, a[3].imag, b1_r, b1_i, n);
+            case 3:
+                VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2].real, a[2].imag, b1_r, b1_i, n);
+            case 2:
+                VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1].real, a[1].imag, b1_r, b1_i, n);
+            case 1:
+                VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0].real, a[0].imag, b1_r, b1_i, n);
+            }
+             
+            a += PACKMR;
+            k -= 1;
+
+            if (k == 0) { break; }
+
+            if (k >= 2) {
+                b1 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n);
+                b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0);
+                b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1);
+                b += PACKNR;
+            }
+
+            switch (m) {
+            case 5:
+                VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4].real, a[4].imag, b0_r, b0_i, n);
+            case 4:
+                VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3].real, a[3].imag, b0_r, b0_i, n);
+            case 3:
+                VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2].real, a[2].imag, b0_r, b0_i, n);
+            case 2:
+                VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1].real, a[1].imag, b0_r, b0_i, n);
+            case 1:
+                VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0].real, a[0].imag, b0_r, b0_i, n);
+            }
+             
+            a += PACKMR;
+            k -= 1;
+
+            if (k == 0) { break; }
+
+            if (k >= 2) {
+                b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n);
+                b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0);
+                b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1);
+                b += PACKNR;
+            }
+        }
+
+        RVV_TYPE_F(PREC, LMUL) temp0_r, temp1_r; // Alpha scaling per row; temp0 and temp1 alternate between successive rows.
+        RVV_TYPE_F(PREC, LMUL) temp0_i, temp1_i;
+        switch (m) {
+        case 5:
+            temp0_r = VFMUL_VF(PREC, LMUL)(ab4_i, alpha->imag, n);
+            temp0_i = VFMUL_VF(PREC, LMUL)(ab4_r, alpha->imag, n); // Uses ab4_r before it is overwritten on the next line.
+            ab4_r = VFMSUB_VF(PREC, LMUL)(ab4_r, alpha->real, temp0_r, n);
+            ab4_i = VFMADD_VF(PREC, LMUL)(ab4_i, alpha->real, temp0_i, n);
+        case 4:
+            temp1_r = VFMUL_VF(PREC, LMUL)(ab3_i, alpha->imag, n);
+            temp1_i = VFMUL_VF(PREC, LMUL)(ab3_r, alpha->imag, n);
+            ab3_r = VFMSUB_VF(PREC, LMUL)(ab3_r, alpha->real, temp1_r, n);
+            ab3_i = VFMADD_VF(PREC, LMUL)(ab3_i, alpha->real, temp1_i, n);
+        case 3:
+            temp0_r = VFMUL_VF(PREC, LMUL)(ab2_i, alpha->imag, n);
+            temp0_i = VFMUL_VF(PREC, LMUL)(ab2_r, alpha->imag, n);
+            ab2_r = VFMSUB_VF(PREC, LMUL)(ab2_r, alpha->real, temp0_r, n);
+            ab2_i = VFMADD_VF(PREC, LMUL)(ab2_i, alpha->real, temp0_i, n);
+        case 2:
+            temp1_r = VFMUL_VF(PREC, LMUL)(ab1_i, alpha->imag, n);
+            temp1_i = VFMUL_VF(PREC, LMUL)(ab1_r, alpha->imag, n);
+            ab1_r = VFMSUB_VF(PREC, LMUL)(ab1_r, alpha->real, temp1_r, n);
+            ab1_i = VFMADD_VF(PREC, LMUL)(ab1_i, alpha->real, temp1_i, n);
+        case 1:
+            temp0_r = VFMUL_VF(PREC, LMUL)(ab0_i, alpha->imag, n);
+            temp0_i = VFMUL_VF(PREC, LMUL)(ab0_r, alpha->imag, n);
+            ab0_r = VFMSUB_VF(PREC, LMUL)(ab0_r, alpha->real, temp0_r, n);
+            ab0_i = VFMADD_VF(PREC, LMUL)(ab0_i, alpha->real, temp0_i, n);
+        }
+
+        if (!PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { // Fold beta * C into the accumulators only when beta is nonzero.
+            RVV_TYPE_FX(PREC, LMUL, 2) c0;
+            RVV_TYPE_F(PREC, LMUL) c0_r, c0_i;
+            if (csc == 1) {
+                switch (m) {
+                case 5:
+                    c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), n);
+                    c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                    c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                    VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, beta->real, beta->imag, c0_r, c0_i, n);
+                case 4:
+                    c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), n);
+                    c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                    c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                    VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, beta->real, beta->imag, c0_r, c0_i, n);
+                case 3:
+                    c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), n);
+                    c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                    c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                    VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, beta->real, beta->imag, c0_r, c0_i, n);
+                case 2:
+                    c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), n);
+                    c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                    c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                    VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, beta->real, beta->imag, c0_r, c0_i, n);
+                case 1:
+                    c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), n);
+                    c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                    c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                    VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, beta->real, beta->imag, c0_r, c0_i, n);
+                }
+                
+            }
+            else {
+                switch (m) {
+                case 5:
+                    c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), 2 * FLT_SIZE * csc, n);
+                    c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                    c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                    VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, beta->real, beta->imag, c0_r, c0_i, n);
+                case 4:
+                    c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), 2 * FLT_SIZE * csc, n);
+                    c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                    c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                    VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, beta->real, beta->imag, c0_r, c0_i, n);
+                case 3:
+                    c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), 2 * FLT_SIZE * csc, n);
+                    c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                    c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                    VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, beta->real, beta->imag, c0_r, c0_i, n);
+                case 2:
+                    c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), 2 * FLT_SIZE * csc, n);
+                    c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                    c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                    VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, beta->real, beta->imag, c0_r, c0_i, n);
+                case 1:
+                    c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), 2 * FLT_SIZE * csc, n);
+                    c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                    c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                    VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, beta->real, beta->imag, c0_r, c0_i, n);
+                }
+            }
+        }
+
+        RVV_TYPE_FX(PREC, LMUL, 2) ab0, ab1, ab2, ab3, ab4; // Only the first m tuples are created below and stored afterwards.
+        switch (m) {
+        case 5:
+            ab4 = VCREATE_V_FX(PREC, LMUL, 2)(ab4_r, ab4_i);
+        case 4:
+            ab3 = VCREATE_V_FX(PREC, LMUL, 2)(ab3_r, ab3_i);
+        case 3:
+            ab2 = VCREATE_V_FX(PREC, LMUL, 2)(ab2_r, ab2_i);
+        case 2:
+            ab1 = VCREATE_V_FX(PREC, LMUL, 2)(ab1_r, ab1_i);
+        case 1:
+            ab0 = VCREATE_V_FX(PREC, LMUL, 2)(ab0_r, ab0_i);
+        }
+
+        if (csc == 1) { // Write back rows m-1 .. 0 of C.
+            switch (m) {
+            case 5:
+                VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), ab4, n);
+            case 4:
+                VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), ab3, n);
+            case 3:
+                VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), ab2, n);
+            case 2:
+                VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), ab1, n);
+            case 1:
+                VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), ab0, n);
+            }
+        }
+        else {
+            switch (m) {
+            case 5:
+                VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), 2 * FLT_SIZE * csc, ab4, n);
+            case 4:
+                VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), 2 * FLT_SIZE * csc, ab3, n);
+            case 3:
+                VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), 2 * FLT_SIZE * csc, ab2, n);
+            case 2:
+                VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), 2 * FLT_SIZE * csc, ab1, n);
+            case 1:
+                VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), 2 * FLT_SIZE * csc, ab0, n);
+            }
+        }
+    }
+
+    return;
+}
+
+#endif // GEMM
diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c
new file mode 100644
index 000000000..605b93fb7
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c
@@ -0,0 +1,339 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef GEMM
+
+GEMM(PRECISION_CHAR, void)
+{
+    (void) data; // Suppress unused parameter warnings
+    (void) cntx;
+    const DATATYPE* restrict alpha = alpha_;
+    const DATATYPE* restrict a = a_;
+    const DATATYPE* restrict b = b_;
+    const DATATYPE* restrict beta = beta_;
+    DATATYPE* restrict c = c_;
+
+    if (m <= 0 || n <= 0 || k < 0)
+        return;
+    else if (k == 0) {
+        if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) {
+            RVV_TYPE_F(PREC, LMUL) zero_splat = VFMV_V_F(PREC, LMUL)(0., n);
+            for (dim_t i = 0; i < m; ++i) {
+                if (csc == 1)
+                    VSE_V_F(PREC, LMUL)(c + i * rsc, zero_splat, n);
+                else
+                    VSSE_V_F(PREC, LMUL)(c + i * rsc, FLT_SIZE * csc, zero_splat, n);
+            }
+        }
+        else {
+            for (dim_t i = 0; i < m; ++i) {
+                RVV_TYPE_F(PREC, LMUL) c0;
+                if (csc == 1)
+                    c0 = VLE_V_F(PREC, LMUL)(c + i * rsc, n);
+                else
+                    c0 = VLSE_V_F(PREC, LMUL)(c + i * rsc, FLT_SIZE * csc, n);
+                c0 = VFMUL_VF(PREC, LMUL)(c0, *beta, n);
+                if (csc == 1)
+                    VSE_V_F(PREC, LMUL)(c + i * rsc, c0, n);
+                else
+                    VSSE_V_F(PREC, LMUL)(c + i * rsc, FLT_SIZE * csc, c0, n);
+            }
+        }
+    }
+    else if (m == 7) {
+        RVV_TYPE_F(PREC, LMUL) ab0, ab1, ab2, ab3, ab4, ab5, ab6;
+        bool first = true;
+        for (dim_t i = 0; i < k; ++i) {
+            RVV_TYPE_F(PREC, LMUL) b0 = VLE_V_F(PREC, LMUL)(b, n);
+            if (first) {
+                ab0 = VFMUL_VF(PREC, LMUL)(b0, a[0], n);
+                ab1 = VFMUL_VF(PREC, LMUL)(b0, a[1], n);
+                ab2 = VFMUL_VF(PREC, LMUL)(b0, a[2], n);
+                ab3 = VFMUL_VF(PREC, LMUL)(b0, a[3], n);
+                ab4 = VFMUL_VF(PREC, LMUL)(b0, a[4], n);
+                ab5 = VFMUL_VF(PREC, LMUL)(b0, a[5], n);
+                ab6 = VFMUL_VF(PREC, LMUL)(b0, a[6], n);
+                first = false;
+            }
+            else {
+                ab0 = VFMACC_VF(PREC, LMUL)(ab0, a[0], b0, n);
+                ab1 = VFMACC_VF(PREC, LMUL)(ab1, a[1], b0, n);
+                ab2 = VFMACC_VF(PREC, LMUL)(ab2, a[2], b0, n);
+                ab3 = VFMACC_VF(PREC, LMUL)(ab3, a[3], b0, n);
+                ab4 = VFMACC_VF(PREC, LMUL)(ab4, a[4], b0, n);
+                ab5 = VFMACC_VF(PREC, LMUL)(ab5, a[5], b0, n);
+                ab6 = VFMACC_VF(PREC, LMUL)(ab6, a[6], b0, n);
+            }
+
+            a += PACKMR;
+            b += PACKNR;
+        }
+
+        if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) {
+            ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n);
+            ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n);
+            ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n);
+            ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n);
+            ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n);
+            ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n);
+            ab6 = VFMUL_VF(PREC, LMUL)(ab6, *alpha, n);
+        }
+        else {
+            RVV_TYPE_F(PREC, LMUL) c0;
+            if (csc == 1) {
+                c0 = VLE_V_F(PREC, LMUL)(c + 0 * rsc, n);
+                ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n);
+                ab0 = VFMACC_VF(PREC, LMUL)(ab0, *beta, c0, n);
+                c0 = VLE_V_F(PREC, LMUL)(c + 1 * rsc, n);
+                ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n);
+                ab1 = VFMACC_VF(PREC, LMUL)(ab1, *beta, c0, n);
+                c0 = VLE_V_F(PREC, LMUL)(c + 2 * rsc, n);
+                ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n);
+                ab2 = VFMACC_VF(PREC, LMUL)(ab2, *beta, c0, n);
+                c0 = VLE_V_F(PREC, LMUL)(c + 3 * rsc, n);
+                ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n);
+                ab3 = VFMACC_VF(PREC, LMUL)(ab3, *beta, c0, n);
+                c0 = VLE_V_F(PREC, LMUL)(c + 4 * rsc, n);
+                ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n);
+                ab4 = VFMACC_VF(PREC, LMUL)(ab4, *beta, c0, n);
+                c0 = VLE_V_F(PREC, LMUL)(c + 5 * rsc, n);
+                ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n);
+                ab5 = VFMACC_VF(PREC, LMUL)(ab5, *beta, c0, n);
+                c0 = VLE_V_F(PREC, LMUL)(c + 6 * rsc, n);
+                ab6 = VFMUL_VF(PREC, LMUL)(ab6, *alpha, n);
+                ab6 = VFMACC_VF(PREC, LMUL)(ab6, *beta, c0, n);
+            }
+            else {
+                c0 = VLSE_V_F(PREC, LMUL)(c + 0 * rsc, FLT_SIZE * csc, n);
+                ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n);
+                ab0 = VFMACC_VF(PREC, LMUL)(ab0, *beta, c0, n);
+                c0 = VLSE_V_F(PREC, LMUL)(c + 1 * rsc, FLT_SIZE * csc, n);
+                ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n);
+                ab1 = VFMACC_VF(PREC, LMUL)(ab1, *beta, c0, n);
+                c0 = VLSE_V_F(PREC, LMUL)(c + 2 * rsc, FLT_SIZE * csc, n);
+                ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n);
+                ab2 = VFMACC_VF(PREC, LMUL)(ab2, *beta, c0, n);
+                c0 = VLSE_V_F(PREC, LMUL)(c + 3 * rsc, FLT_SIZE * csc, n);
+                ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n);
+                ab3 = VFMACC_VF(PREC, LMUL)(ab3, *beta, c0, n);
+                c0 = VLSE_V_F(PREC, LMUL)(c + 4 * rsc, FLT_SIZE * csc, n);
+                ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n);
+                ab4 = VFMACC_VF(PREC, LMUL)(ab4, *beta, c0, n);
+                c0 = VLSE_V_F(PREC, LMUL)(c + 5 * rsc, FLT_SIZE * csc, n);
+                ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n);
+                ab5 = VFMACC_VF(PREC, LMUL)(ab5, *beta, c0, n);
+                c0 = VLSE_V_F(PREC, LMUL)(c + 6 * rsc, FLT_SIZE * csc, n);
+                ab6 = VFMUL_VF(PREC, LMUL)(ab6, *alpha, n);
+                ab6 = VFMACC_VF(PREC, LMUL)(ab6, *beta, c0, n);
+            }
+        }
+
+        if (csc == 1) {
+            VSE_V_F(PREC, LMUL)(c + 0 * rsc, ab0, n);
+            VSE_V_F(PREC, LMUL)(c + 1 * rsc, ab1, n);
+            VSE_V_F(PREC, LMUL)(c + 2 * rsc, ab2, n);
+            VSE_V_F(PREC, LMUL)(c + 3 * rsc, ab3, n);
+            VSE_V_F(PREC, LMUL)(c + 4 * rsc, ab4, n);
+            VSE_V_F(PREC, LMUL)(c + 5 * rsc, ab5, n);
+            VSE_V_F(PREC, LMUL)(c + 6 * rsc, ab6, n);
+        }
+        else {
+            VSSE_V_F(PREC, LMUL)(c + 0 * rsc, FLT_SIZE * csc, ab0, n);
+            VSSE_V_F(PREC, LMUL)(c + 1 * rsc, FLT_SIZE * csc, ab1, n);
+            VSSE_V_F(PREC, LMUL)(c + 2 * rsc, FLT_SIZE * csc, ab2, n);
+            VSSE_V_F(PREC, LMUL)(c + 3 * rsc, FLT_SIZE * csc, ab3, n);
+            VSSE_V_F(PREC, LMUL)(c + 4 * rsc, FLT_SIZE * csc, ab4, n);
+            VSSE_V_F(PREC, LMUL)(c + 5 * rsc, FLT_SIZE * csc, ab5, n);
+            VSSE_V_F(PREC, LMUL)(c + 6 * rsc, FLT_SIZE * csc, ab6, n);
+        }
+    }
+    else {
+        // 0 < m < 7
+        RVV_TYPE_F(PREC, LMUL) ab0, ab1, ab2, ab3, ab4, ab5;
+        bool first = true;
+        for (dim_t i = 0; i < k; ++i) {
+            RVV_TYPE_F(PREC, LMUL) b0 = VLE_V_F(PREC, LMUL)(b, n);
+            if (first) {
+                switch (m) {
+                case 6:
+                    ab5 = VFMUL_VF(PREC, LMUL)(b0, a[5], n);
+                case 5:
+                    ab4 = VFMUL_VF(PREC, LMUL)(b0, a[4], n);
+                case 4:
+                    ab3 = VFMUL_VF(PREC, LMUL)(b0, a[3], n);
+                case 3:
+                    ab2 = VFMUL_VF(PREC, LMUL)(b0, a[2], n);
+                case 2:
+                    ab1 = VFMUL_VF(PREC, LMUL)(b0, a[1], n);
+                case 1:
+                    ab0 = VFMUL_VF(PREC, LMUL)(b0, a[0], n);
+                }
+                first = false;
+            }
+            else {
+                switch (m) {
+                case 6:
+                    ab5 = VFMACC_VF(PREC, LMUL)(ab5, a[5], b0, n);
+                case 5:
+                    ab4 = VFMACC_VF(PREC, LMUL)(ab4, a[4], b0, n);
+                case 4:
+                    ab3 = VFMACC_VF(PREC, LMUL)(ab3, a[3], b0, n);
+                case 3:
+                    ab2 = VFMACC_VF(PREC, LMUL)(ab2, a[2], b0, n);
+                case 2:
+                    ab1 = VFMACC_VF(PREC, LMUL)(ab1, a[1], b0, n);
+                case 1:
+                    ab0 = VFMACC_VF(PREC, LMUL)(ab0, a[0], b0, n);
+                }
+            }
+
+            a += PACKMR;
+            b += PACKNR;
+        }
+
+        if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) {
+            switch (m) {
+            case 6:
+                ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n);
+            case 5:
+                ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n);
+            case 4:
+                ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n);
+            case 3:
+                ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n);
+            case 2:
+                ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n);
+            case 1:
+                ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n);
+            }
+        }
+        else {
+            RVV_TYPE_F(PREC, LMUL) c0;
+            if (csc == 1) {
+                switch (m) {
+                case 6:
+                    c0 = VLE_V_F(PREC, LMUL)(c + 5 * rsc, n);
+                    ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n);
+                    ab5 = VFMACC_VF(PREC, LMUL)(ab5, *beta, c0, n);
+                case 5:
+                    c0 = VLE_V_F(PREC, LMUL)(c + 4 * rsc, n);
+                    ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n);
+                    ab4 = VFMACC_VF(PREC, LMUL)(ab4, *beta, c0, n);
+                case 4:
+                    c0 = VLE_V_F(PREC, LMUL)(c + 3 * rsc, n);
+                    ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n);
+                    ab3 = VFMACC_VF(PREC, LMUL)(ab3, *beta, c0, n);
+                case 3:
+                    c0 = VLE_V_F(PREC, LMUL)(c + 2 * rsc, n);
+                    ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n);
+                    ab2 = VFMACC_VF(PREC, LMUL)(ab2, *beta, c0, n);
+                case 2:
+                    c0 = VLE_V_F(PREC, LMUL)(c + 1 * rsc, n);
+                    ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n);
+                    ab1 = VFMACC_VF(PREC, LMUL)(ab1, *beta, c0, n);
+                case 1:
+                    c0 = VLE_V_F(PREC, LMUL)(c + 0 * rsc, n);
+                    ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n);
+                    ab0 = VFMACC_VF(PREC, LMUL)(ab0, *beta, c0, n);
+                }
+            }
+            else {
+                switch (m) {
+                case 6:
+                    c0 = VLSE_V_F(PREC, LMUL)(c + 5 * rsc, FLT_SIZE * csc, n);
+                    ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n);
+                    ab5 = VFMACC_VF(PREC, LMUL)(ab5, *beta, c0, n);
+                case 5:
+                    c0 = VLSE_V_F(PREC, LMUL)(c + 4 * rsc, FLT_SIZE * csc, n);
+                    ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n);
+                    ab4 = VFMACC_VF(PREC, LMUL)(ab4, *beta, c0, n);
+                case 4:
+                    c0 = VLSE_V_F(PREC, LMUL)(c + 3 * rsc, FLT_SIZE * csc, n);
+                    ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n);
+                    ab3 = VFMACC_VF(PREC, LMUL)(ab3, *beta, c0, n);
+                case 3:
+                    c0 = VLSE_V_F(PREC, LMUL)(c + 2 * rsc, FLT_SIZE * csc, n);
+                    ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n);
+                    ab2 = VFMACC_VF(PREC, LMUL)(ab2, *beta, c0, n);
+                case 2:
+                    c0 = VLSE_V_F(PREC, LMUL)(c + 1 * rsc, FLT_SIZE * csc, n);
+                    ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n);
+                    ab1 = VFMACC_VF(PREC, LMUL)(ab1, *beta, c0, n);
+                case 1:
+                    c0 = VLSE_V_F(PREC, LMUL)(c + 0 * rsc, FLT_SIZE * csc, n);
+                    ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n);
+                    ab0 = VFMACC_VF(PREC, LMUL)(ab0, *beta, c0, n);
+                }
+            }
+        }
+
+        if (csc == 1) {
+            switch (m) {
+            case 6:
+                VSE_V_F(PREC, LMUL)(c + 5 * rsc, ab5, n);
+            case 5:
+                VSE_V_F(PREC, LMUL)(c + 4 * rsc, ab4, n);
+            case 4:
+                VSE_V_F(PREC, LMUL)(c + 3 * rsc, ab3, n);
+            case 3:
+                VSE_V_F(PREC, LMUL)(c + 2 * rsc, ab2, n);
+            case 2:
+                VSE_V_F(PREC, LMUL)(c + 1 * rsc, ab1, n);
+            case 1:
+                VSE_V_F(PREC, LMUL)(c + 0 * rsc, ab0, n);
+            }
+        }
+        else {
+            switch (m) {
+            case 6:
+                VSSE_V_F(PREC, LMUL)(c + 5 * rsc, FLT_SIZE * csc, ab5, n);
+            case 5:
+                VSSE_V_F(PREC, LMUL)(c + 4 * rsc, FLT_SIZE * csc, ab4, n);
+            case 4:
+                VSSE_V_F(PREC, LMUL)(c + 3 * rsc, FLT_SIZE * csc, ab3, n);
+            case 3:
+                VSSE_V_F(PREC, LMUL)(c + 2 * rsc, FLT_SIZE * csc, ab2, n);
+            case 2:
+                VSSE_V_F(PREC, LMUL)(c + 1 * rsc, FLT_SIZE * csc, ab1, n);
+            case 1:
+                VSSE_V_F(PREC, LMUL)(c + 0 * rsc, FLT_SIZE * csc, ab0, n);
+            }
+        }
+    }
+
+    return;
+}
+
+#endif // GEMM
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c
deleted file mode 100644
index 18df010d0..000000000
--- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-// clang-format off
-#ifdef GEMMTRSM
-
-GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void)
-{
-    (void) data;
-    (void) cntx;
-    const DATATYPE* restrict alpha = alpha_;
-    const DATATYPE* restrict a10 = a10_;
-    const DATATYPE* restrict a11 = a11_;
-    const DATATYPE* restrict b01 = b01_;
-    const DATATYPE* restrict b11 = b11_;
-    DATATYPE* restrict c11 = c11_;
-
-    if (m <= 0 || n <= 0)
-        return;
-
-    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
-
-    DATATYPE alpha_cast = *alpha;
-    if (alpha_cast.real == 0 && alpha_cast.imag == 0) {
-        switch (m) {
-            case 6:
-                __asm__("vmv.v.i v20, 0");
-                __asm__("vmv.v.i v22, 0");
-            case 5:
-                __asm__("vmv.v.i v16, 0");
-                __asm__("vmv.v.i v18, 0");
-            case 4:
-                __asm__("vmv.v.i v12, 0");
-                __asm__("vmv.v.i v14, 0");
-            case 3:
-                __asm__("vmv.v.i v8, 0");
-                __asm__("vmv.v.i v10, 0");
-            case 2:
-                __asm__("vmv.v.i v4, 0");
-                __asm__("vmv.v.i v6, 0");
-            case 1:
-                __asm__("vmv.v.i v0, 0");
-                __asm__("vmv.v.i v2, 0");
-        }
-    }
-    else {
-        const DATATYPE* b11_tmp = b11 + (m - 1) * PACKNR;
-        switch (m) {
-            case 6:
-                __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
-                vcmul_vf2(v20, v22, v24, v26, alpha_cast.real, alpha_cast.imag);
-                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
-            case 5:
-                __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
-                vcmul_vf2(v16, v18, v28, v30, alpha_cast.real, alpha_cast.imag);
-                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
-            case 4:
-                __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
-                vcmul_vf2(v12, v14, v24, v26, alpha_cast.real, alpha_cast.imag);
-                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
-            case 3:
-                __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
-                vcmul_vf2(v8, v10, v28, v30, alpha_cast.real, alpha_cast.imag);
-                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
-            case 2:
-                __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
-                vcmul_vf2(v4, v6, v24, v26, alpha_cast.real, alpha_cast.imag);
-                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
-            case 1:
-                __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
-                vcmul_vf2(v0, v2, v28, v30, alpha_cast.real, alpha_cast.imag);
-        }
-    }
-
-    if (k >= 1) {
-        __asm__(VLSEG2 "v24, (%0)" : : "r"(b01));
-        __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE));
-    }
-    if (k >= 2) {
-        __asm__(VLSEG2 "v28, (%0)" : : "r"(b01));
-        __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE));
-    }
-
-    while (k > 0) {
-        switch (m) {
-            case 6:
-                __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a10), "I"(10 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a10), "I"(11 * FLT_SIZE));
-                vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
-            case 5:
-                __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a10), "I"(8 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a10), "I"(9 * FLT_SIZE));
-                vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
-            case 4:
-                __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a10), "I"(6 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a10), "I"(7 * FLT_SIZE));
-                vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
-            case 3:
-                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a10), "I"(4 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a10), "I"(5 * FLT_SIZE));
-                vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
-            case 2:
-                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a10), "I"(2 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a10), "I"(3 * FLT_SIZE));
-                vcnmsac_vf(v4, v6, ft2, ft3, v24, v26);
-            case 1:
-                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a10), "I"(0 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a10), "I"(1 * FLT_SIZE));
-                vcnmsac_vf(v0, v2, ft0, ft1, v24, v26);
-        }
-        k -= 1;
-
-        if (k == 0) { break; }
-
-        if (k >= 2) {
-            __asm__(VLSEG2 "v24, (%0)" : : "r"(b01));
-            __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE));
-        }
-        __asm__("addi %0, %0, %1" : "+r"(a10) : "I"(PACKMR * 2 * FLT_SIZE));
-
-        switch (m) {
-            case 6:
-                __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a10), "I"(10 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a10), "I"(11 * FLT_SIZE));
-                vcnmsac_vf(v20, v22, ft10, ft11, v28, v30);
-            case 5:
-                __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a10), "I"(8 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a10), "I"(9 * FLT_SIZE));
-                vcnmsac_vf(v16, v18, ft8, ft9, v28, v30);
-            case 4:
-                __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a10), "I"(6 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a10), "I"(7 * FLT_SIZE));
-                vcnmsac_vf(v12, v14, ft6, ft7, v28, v30);
-            case 3:
-                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a10), "I"(4 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a10), "I"(5 * FLT_SIZE));
-                vcnmsac_vf(v8, v10, ft4, ft5, v28, v30);
-            case 2:
-                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a10), "I"(2 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a10), "I"(3 * FLT_SIZE));
-                vcnmsac_vf(v4, v6, ft2, ft3, v28, v30);
-            case 1:
-                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a10), "I"(0 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a10), "I"(1 * FLT_SIZE));
-                vcnmsac_vf(v0, v2, ft0, ft1, v28, v30);
-        }
-        k -= 1;
-
-        if (k >= 2) {
-            __asm__(VLSEG2 "v28, (%0)" : : "r"(b01));
-            __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE));
-        }
-        __asm__("addi %0, %0, %1" : "+r"(a10) : "I"(PACKMR * 2 * FLT_SIZE));
-    }
-
-    rsc *= 2 * FLT_SIZE;
-    csc *= 2 * FLT_SIZE;
-
-    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a11), "I"(0 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a11), "I"(1 * FLT_SIZE));
-    vcmul_vf(v24, v26, v0, v2, ft0, ft1);
-    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
-    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
-
-    if (m == 1) return;
-
-    switch (m) {
-        case 6:
-            __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
-            vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
-        case 5:
-            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
-            vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
-        case 4:
-            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE));
-            vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
-        case 3:
-            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE));
-            vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
-        case 2:
-            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(2 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(3 * FLT_SIZE));
-            vcnmsac_vf(v4, v6, ft2, ft3, v24, v26);
-    }
-    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
-    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
-    __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
-
-    __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(2 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(3 * FLT_SIZE));
-    vcmul_vf(v24, v26, v4, v6, ft2, ft3);
-    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
-    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
-
-    if (m == 2) return;
-
-    switch (m) {
-        case 6:
-            __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
-            vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
-        case 5:
-            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
-            vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
-        case 4:
-            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE));
-            vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
-        case 3:
-            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE));
-            vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
-    }
-    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
-    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
-    __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
-
-    __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE));
-    vcmul_vf(v24, v26, v8, v10, ft4, ft5);
-    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
-    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
-
-    if (m == 3) return;
-
-    switch (m) {
-        case 6:
-            __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
-            vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
-        case 5:
-            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
-            vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
-        case 4:
-            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE));
-            vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
-    }
-    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
-    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
-    __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
-
-    __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE));
-    vcmul_vf(v24, v26, v12, v14, ft6, ft7);
-    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
-    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
-
-    if (m == 4) return;
-
-    switch (m) {
-        case 6:
-            __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
-            vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
-        case 5:
-            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
-            vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
-    }
-    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
-    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
-    __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
-
-    __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
-    vcmul_vf(v24, v26, v16, v18, ft8, ft9);
-    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
-    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
-
-    if (m == 5) return;
-
-    __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
-    vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
-
-    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
-    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
-    __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
-
-    __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
-    vcmul_vf(v24, v26, v20, v22, ft10, ft11);
-    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
-    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
-
-    return;
-}
-
-#endif
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c
deleted file mode 100644
index a0f913473..000000000
--- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-// clang-format off
-#ifdef GEMMTRSM
-
-GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void)
-{
-    const DATATYPE* restrict alpha = alpha_;
-    const DATATYPE* restrict a10 = a10_;
-    const DATATYPE* restrict a11 = a11_;
-    const DATATYPE* restrict b01 = b01_;
-    const DATATYPE* restrict b11 = b11_;
-    DATATYPE* restrict c11 = c11_;
-
-    if (!(1 <= m && m <= PACKMR && 1 <= n && n <= PACKNR))
-        return;
-
-    dim_t b11_offset, temp;
-    size_t vl;
-    __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma": "=r"(vl) : "r"(n), "i"(8*FLT_SIZE));
-    
-    // Multiply step sizes by data size
-    __asm__("slli %0, %0, %1": "+r"(rsc) : "I"(LOG_FLT_SIZE));
-    __asm__("slli %0, %0, %1": "+r"(csc) : "I"(LOG_FLT_SIZE));
-  
-    __asm__("addi %0, %1, %2": "=r"(b11_offset): "r"(m), "I"(-1));
-    __asm__("li %0, %1": "=r"(temp): "I"(PACKNR * FLT_SIZE));
-    __asm__("mul %0, %0, %1": "+r"(b11_offset): "r"(temp));
-    // b11_offset = (m-1)*PACKNR*FLT_SIZE
-
-    __asm__("add %0, %0, %1": "+r"(b11): "r"(b11_offset));
-    __asm__(FLT_LOAD " f0, (%0)" : : "r"(alpha));  // TO DO: optimize alpha = 1 case
-    switch (m){ // Vector loads from b11 with Duff device, multiplying by alpha
-        case 7: __asm__(VLE "  v0, (%0)": : "r"(b11)); 
-                __asm__("vfmul.vf  v0,  v0, f0");
-                __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
-        case 6: __asm__(VLE "  v4, (%0)": : "r"(b11));
-                __asm__("vfmul.vf  v4,  v4, f0");
-                __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
-        case 5: __asm__(VLE "  v8, (%0)": : "r"(b11));
-                __asm__("vfmul.vf  v8,  v8, f0");
-                __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
-        case 4: __asm__(VLE " v12, (%0)": : "r"(b11));
-                 __asm__("vfmul.vf v12, v12, f0");
-                __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
-        case 3: __asm__(VLE " v16, (%0)": : "r"(b11));
-                __asm__("vfmul.vf v16, v16, f0");
-                __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
-        case 2: __asm__(VLE " v20, (%0)": : "r"(b11));
-                __asm__("vfmul.vf v20, v20, f0");
-                __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
-        case 1: __asm__(VLE " v24, (%0)": : "r"(b11));
-                __asm__("vfmul.vf v24, v24, f0");
-                // no sub of b11 on final entry
-    }
-    // b11 now reset to original value
-    //  v0 = row 6 of b11
-    //  v4 = row 5 of b11
-    //  v8 = row 4 of b11
-    // v12 = row 3 of b11
-    // v16 = row 2 of b11
-    // v20 = row 1 of b11
-    // v24 = row 0 of b11
-
-    // GEMM: B11 := alpha * B11 - A10 * B01
-    for (dim_t i = 0; i < k; i++){
-        __asm__(VLE " v28, (%0)": : "r"(b01)); // kth row of b01
-        switch (m){
-            case 7: __asm__(FLT_LOAD " f6, %0(%1)" : : "I"(6*FLT_SIZE), "r"(a10));
-                    __asm__("vfnmsac.vf  v0, f6, v28");
-            case 6: __asm__(FLT_LOAD " f5, %0(%1)" : : "I"(5*FLT_SIZE), "r"(a10));
-                    __asm__("vfnmsac.vf  v4, f5, v28");
-            case 5: __asm__(FLT_LOAD " f4, %0(%1)" : : "I"(4*FLT_SIZE), "r"(a10));
-                    __asm__("vfnmsac.vf  v8, f4, v28");
-            case 4: __asm__(FLT_LOAD " f3, %0(%1)" : : "I"(3*FLT_SIZE), "r"(a10));
-                    __asm__("vfnmsac.vf v12, f3, v28");
-            case 3: __asm__(FLT_LOAD " f2, %0(%1)" : : "I"(2*FLT_SIZE), "r"(a10));
-                    __asm__("vfnmsac.vf v16, f2, v28");
-            case 2: __asm__(FLT_LOAD " f1, %0(%1)" : : "I"(1*FLT_SIZE), "r"(a10));
-                    __asm__("vfnmsac.vf v20, f1, v28");
-            case 1: __asm__(FLT_LOAD " f0, %0(%1)" : : "I"(0*FLT_SIZE), "r"(a10));
-                 __asm__("vfnmsac.vf v24, f0, v28");
-        }
-        __asm__("addi %0, %0, %1": "+r"(a10): "I"(PACKMR * FLT_SIZE));
-        __asm__("addi %0, %0, %1": "+r"(b01): "I"(PACKNR * FLT_SIZE));
-    }
-    // TRSM: B11 := inv(A11) * B11
-    // TO DO: Investigate code size reduction (loop rerolling)
-
-    // Row 0
-    __asm__(FLT_LOAD " f0,  %0(%1)": : "I"(0*FLT_SIZE), "r"(a11));
-    __asm__("vfmul.vf v24, v24, f0");
-    __asm__(VSE " v24, (%0)": : "r"(b11));
-    __asm__(VSSE " v24, (%0), %1": : "r"(c11), "r"(csc));
-    if (m == 1) return;
-
-    switch (m){
-        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v0, f6, v24");
-        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v4, f5, v24");
-        case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v8, f4, v24");
-        case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf v12, f3, v24");
-        case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf v16, f2, v24");
-        case 2: __asm__(FLT_LOAD " f1, %0(%1)": : "I"(1*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf v20, f1, v24");
-    }
-    // Pointer bumps
-    __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
-    __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
-    __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
-
-    // Row 1
-    __asm__(FLT_LOAD " f1,  %0(%1)": : "I"(1*FLT_SIZE), "r"(a11));
-    __asm__("vfmul.vf v20, v20, f1");
-    __asm__(VSE " v20, (%0)": : "r"(b11));
-    __asm__(VSSE " v20, (%0), %1": : "r"(c11), "r"(csc));
-    if (m == 2) return;
-    
-    switch (m){
-        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v0, f6, v20");
-        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v4, f5, v20");
-        case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v8, f4, v20");
-        case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf v12, f3, v20");
-        case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf v16, f2, v20");
-    }
-    // Pointer bumps
-    __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
-    __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
-    __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
-
-    // Row 2
-    __asm__(FLT_LOAD " f2,  %0(%1)": : "I"(2*FLT_SIZE), "r"(a11));
-    __asm__("vfmul.vf v16, v16, f2");
-    __asm__(VSE " v16, (%0)": : "r"(b11));
-    __asm__(VSSE " v16, (%0), %1": : "r"(c11), "r"(csc));
-    if (m == 3) return;
-    
-    switch (m){
-        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v0, f6, v16");
-        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v4, f5, v16");
-        case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v8, f4, v16");
-        case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf v12, f3, v16");
-    }
-    // Pointer bumps
-    __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
-    __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
-    __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
-    
-    // Row 3
-    __asm__(FLT_LOAD " f3,  %0(%1)": : "I"(3*FLT_SIZE), "r"(a11));
-    __asm__("vfmul.vf v12, v12, f3");
-    __asm__(VSE " v12, (%0)": : "r"(b11));
-    __asm__(VSSE " v12, (%0), %1": : "r"(c11), "r"(csc));
-    if (m == 4) return;
-  
-    switch (m){
-        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v0, f6, v12");
-        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v4, f5, v12");
-        case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v8, f4, v12");
-    }
-    // Pointer bumps
-    __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
-    __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
-    __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
-    
-    // Row 4
-    __asm__(FLT_LOAD " f4,  %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
-    __asm__("vfmul.vf v8, v8, f4");
-    __asm__(VSE " v8, (%0)": : "r"(b11));
-    __asm__(VSSE " v8, (%0), %1": : "r"(c11), "r"(csc));
-    if (m == 5) return;
-    
-    switch (m){
-        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v0, f6, v8");
-        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v4, f5, v8");
-    }
-    // Pointer bumps
-    __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
-    __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
-    __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
-    
-    // Row 5
-    __asm__(FLT_LOAD " f5,  %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
-    __asm__("vfmul.vf v4, v4, f5");
-    __asm__(VSE " v4, (%0)": : "r"(b11));
-    __asm__(VSSE " v4, (%0), %1": : "r"(c11), "r"(csc));
-    if (m == 6) return;
-    
-    __asm__(FLT_LOAD " f6,  %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
-    __asm__("vfnmsac.vf v0, f6, v4");
-    
-    // Pointer bumps
-    __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
-    __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
-    __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
-    
-    // Row 6
-    __asm__(FLT_LOAD " f6,  %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
-    __asm__("vfmul.vf v0, v0, f6");
-    __asm__(VSE " v0, (%0)": : "r"(b11));
-    __asm__(VSSE " v0, (%0), %1": : "r"(c11), "r"(csc));
-}
-#endif
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c
deleted file mode 100644
index 9332fd096..000000000
--- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-// clang-format off
-#ifdef GEMMTRSM
-
-GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void)
-{
-    (void) data;
-    (void) cntx;
-    const DATATYPE* restrict alpha = alpha_;
-    const DATATYPE* restrict a12 = a12_;
-    const DATATYPE* restrict a11 = a11_;
-    const DATATYPE* restrict b21 = b21_;
-    const DATATYPE* restrict b11 = b11_;
-    DATATYPE* restrict c11 = c11_;
-
-    if (m <= 0 || n <= 0)
-        return;
-
-    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
-
-    DATATYPE alpha_cast = *alpha;
-    if (alpha_cast.real == 0 && alpha_cast.imag == 0) {
-        switch (m) {
-            case 6:
-                __asm__("vmv.v.i v20, 0");
-                __asm__("vmv.v.i v22, 0");
-            case 5:
-                __asm__("vmv.v.i v16, 0");
-                __asm__("vmv.v.i v18, 0");
-            case 4:
-                __asm__("vmv.v.i v12, 0");
-                __asm__("vmv.v.i v14, 0");
-            case 3:
-                __asm__("vmv.v.i v8, 0");
-                __asm__("vmv.v.i v10, 0");
-            case 2:
-                __asm__("vmv.v.i v4, 0");
-                __asm__("vmv.v.i v6, 0");
-            case 1:
-                __asm__("vmv.v.i v0, 0");
-                __asm__("vmv.v.i v2, 0");
-        }
-    }
-    else {
-        const DATATYPE* b11_tmp = b11;
-        switch (m) {
-            case 6:
-                __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
-                vcmul_vf2(v20, v22, v24, v26, alpha_cast.real, alpha_cast.imag);
-                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
-            case 5:
-                __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
-                vcmul_vf2(v16, v18, v28, v30, alpha_cast.real, alpha_cast.imag);
-                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
-            case 4:
-                __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
-                vcmul_vf2(v12, v14, v24, v26, alpha_cast.real, alpha_cast.imag);
-                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
-            case 3:
-                __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
-                vcmul_vf2(v8, v10, v28, v30, alpha_cast.real, alpha_cast.imag);
-                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
-            case 2:
-                __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
-                vcmul_vf2(v4, v6, v24, v26, alpha_cast.real, alpha_cast.imag);
-                __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
-            case 1:
-                __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
-                vcmul_vf2(v0, v2, v28, v30, alpha_cast.real, alpha_cast.imag);
-        }
-    }
-
-    if (k >= 1) {
-        __asm__(VLSEG2 "v24, (%0)" : : "r"(b21));
-        __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE));
-    }
-    if (k >= 2) {
-        __asm__(VLSEG2 "v28, (%0)" : : "r"(b21));
-        __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE));
-    }
-
-    a12 += m - 1;
-
-    while (k > 0) {
-        switch (m) {
-            case 6:
-                __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a12), "I"(-10 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a12), "I"(-9 * FLT_SIZE));
-                vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
-            case 5:
-                __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a12), "I"(-8 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a12), "I"(-7 * FLT_SIZE));
-                vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
-            case 4:
-                __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a12), "I"(-6 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a12), "I"(-5 * FLT_SIZE));
-                vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
-            case 3:
-                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a12), "I"(-4 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a12), "I"(-3 * FLT_SIZE));
-                vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
-            case 2:
-                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a12), "I"(-2 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a12), "I"(-1 * FLT_SIZE));
-                vcnmsac_vf(v4, v6, ft2, ft3, v24, v26);
-            case 1:
-                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a12), "I"(0 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a12), "I"(1 * FLT_SIZE));
-                vcnmsac_vf(v0, v2, ft0, ft1, v24, v26);
-        }
-        k -= 1;
-
-        if (k == 0) { break; }
-
-        if (k >= 2) {
-            __asm__(VLSEG2 "v24, (%0)" : : "r"(b21));
-            __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE));
-        }
-        __asm__("addi %0, %0, %1" : "+r"(a12) : "I"(PACKMR * 2 * FLT_SIZE));
-
-        switch (m) {
-            case 6:
-                __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a12), "I"(-10 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a12), "I"(-9 * FLT_SIZE));
-                vcnmsac_vf(v20, v22, ft10, ft11, v28, v30);
-            case 5:
-                __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a12), "I"(-8 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a12), "I"(-7 * FLT_SIZE));
-                vcnmsac_vf(v16, v18, ft8, ft9, v28, v30);
-            case 4:
-                __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a12), "I"(-6 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a12), "I"(-5 * FLT_SIZE));
-                vcnmsac_vf(v12, v14, ft6, ft7, v28, v30);
-            case 3:
-                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a12), "I"(-4 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a12), "I"(-3 * FLT_SIZE));
-                vcnmsac_vf(v8, v10, ft4, ft5, v28, v30);
-            case 2:
-                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a12), "I"(-2 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a12), "I"(-1 * FLT_SIZE));
-                vcnmsac_vf(v4, v6, ft2, ft3, v28, v30);
-            case 1:
-                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a12), "I"(0 * FLT_SIZE));
-                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a12), "I"(1 * FLT_SIZE));
-                vcnmsac_vf(v0, v2, ft0, ft1, v28, v30);
-        }
-        k -= 1;
-
-        if (k >= 2) {
-            __asm__(VLSEG2 "v28, (%0)" : : "r"(b21));
-            __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE));
-        }
-        __asm__("addi %0, %0, %1" : "+r"(a12) : "I"(PACKMR * 2 * FLT_SIZE));
-    }
-
-    a11 += (m - 1) * (PACKMR + 1); // (m - 1) + (m - 1) * PACKMR
-    b11 += (m - 1) * PACKNR;
-    c11 += (m - 1) * rsc;
-    rsc *= 2 * FLT_SIZE;
-    csc *= 2 * FLT_SIZE;
-
-    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a11), "I"(0 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a11), "I"(1 * FLT_SIZE));
-    vcmul_vf(v24, v26, v0, v2, ft0, ft1);
-    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
-    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
-
-    if (m == 1) return;
-
-    switch (m) {
-        case 6:
-            __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
-            vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
-        case 5:
-            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
-            vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
-        case 4:
-            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE));
-            vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
-        case 3:
-            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE));
-            vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
-        case 2:
-            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(-2 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(-1 * FLT_SIZE));
-            vcnmsac_vf(v4, v6, ft2, ft3, v24, v26);
-    }
-    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
-    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
-    __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
-
-    __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(-2 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(-1 * FLT_SIZE));
-    vcmul_vf(v24, v26, v4, v6, ft2, ft3);
-    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
-    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
-
-    if (m == 2) return;
-
-    switch (m) {
-        case 6:
-            __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
-            vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
-        case 5:
-            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
-            vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
-        case 4:
-            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE));
-            vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
-        case 3:
-            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE));
-            vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
-    }
-    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
-    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
-    __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
-
-    __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE));
-    vcmul_vf(v24, v26, v8, v10, ft4, ft5);
-    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
-    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
-
-    if (m == 3) return;
-
-    switch (m) {
-        case 6:
-            __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
-            vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
-        case 5:
-            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
-            vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
-        case 4:
-            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE));
-            vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
-    }
-    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
-    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
-    __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
-
-    __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE));
-    vcmul_vf(v24, v26, v12, v14, ft6, ft7);
-    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
-    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
-
-    if (m == 4) return;
-
-    switch (m) {
-        case 6:
-            __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
-            vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
-        case 5:
-            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
-            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
-            vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
-    }
-    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
-    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
-    __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
-
-    __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
-    vcmul_vf(v24, v26, v16, v18, ft8, ft9);
-    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
-    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
-
-    if (m == 5) return;
-
-    __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
-    vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
-
-    __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
-    __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
-    __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
-
-    __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
-    __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
-    vcmul_vf(v24, v26, v20, v22, ft10, ft11);
-    __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
-    __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
-
-    return;
-}
-#endif
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c
deleted file mode 100644
index 2d511a8ba..000000000
--- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-// clang-format off
-#ifdef GEMMTRSM
-
-GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void)
-{
-    const DATATYPE* restrict alpha = alpha_;
-    const DATATYPE* restrict a12 = a12_;
-    const DATATYPE* restrict a11 = a11_;
-    const DATATYPE* restrict b21 = b21_;
-    const DATATYPE* restrict b11 = b11_;
-    DATATYPE* restrict c11 = c11_;
-    
-    if (!(1 <= m && m <= PACKMR && 1 <= n && n <= PACKNR))
-        return;
-    
-    dim_t m_sz, a11_offset, c11_offset, temp;
-    size_t vl;
-    __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma": "=r"(vl) : "r"(n), "i"(8*FLT_SIZE));
-
-    // Multiply step sizes by data size
-    __asm__("slli %0, %0, %1": "+r"(rsc) : "I"(LOG_FLT_SIZE));
-    __asm__("slli %0, %0, %1": "+r"(csc) : "I"(LOG_FLT_SIZE));
-    __asm__("slli %0, %1, %2": "=r"(m_sz) : "r"(m), "I"(LOG_FLT_SIZE));
-    
-    __asm__("li %0, %1": "=r"(temp): "I"((PACKMR+1)*FLT_SIZE)); 
-    __asm__("mul %0, %1, %2": "=r"(a11_offset) : "r"(m), "r"(temp));
-    __asm__("addi %0, %0, %1": "+r"(a11_offset) : "I"(-PACKMR * FLT_SIZE));
-    __asm__("mul %0, %1, %2": "=r"(c11_offset) : "r"(m), "r"(rsc));
-    __asm__("sub %0, %0, %1": "+r"(c11_offset) : "r"(rsc));
-    // a11_offset = (PACKMR*(m-1)+m)*sz = m*(PACKMR+1)*FLT_SIZE - PACKMR*FLT_SIZE
-    // c11_offset = rsc*(m-1)*sz
-    
-    __asm__(FLT_LOAD " f0, (%0)" : : "r"(alpha));
-    switch (m){ // Vector loads from b11 with Duff device, multiplying by alpha
-        case 7: __asm__(VLE "  v0, (%0)": : "r"(b11));
-                __asm__("vfmul.vf  v0,  v0, f0");
-                __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
-        case 6: __asm__(VLE "  v4, (%0)": : "r"(b11));
-                __asm__("vfmul.vf  v4,  v4, f0");
-                __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
-        case 5: __asm__(VLE "  v8, (%0)": : "r"(b11));
-                __asm__("vfmul.vf  v8,  v8, f0");
-                __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
-        case 4: __asm__(VLE " v12, (%0)": : "r"(b11));
-                __asm__("vfmul.vf v12, v12, f0");
-                __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
-        case 3: __asm__(VLE " v16, (%0)": : "r"(b11));
-                __asm__("vfmul.vf v16, v16, f0");
-                __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
-        case 2: __asm__(VLE " v20, (%0)": : "r"(b11));
-                __asm__("vfmul.vf v20, v20, f0");
-                __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
-        case 1: __asm__(VLE " v24, (%0)": : "r"(b11));
-                __asm__("vfmul.vf v24, v24, f0");
-                // no add of b11 on final entry
-    }
-    // b11 now positioned at start of last row
-    // v24 = row 0 from bottom (bottom row)
-    // v20 = row 1 from bottom
-    // v16 = row 2 from bottom
-    // v12 = row 3 from bottom
-    //  v8 = row 4 from bottom
-    //  v4 = row 5 from bottom
-    //  v0 = row 6 from bottom
-
-    // GEMM: B11 := alpha * B11 - A12 * B21
-    __asm__("add %0, %0, %1": "+r"(a12): "r"(m_sz));
-    for (dim_t i = 0; i < k; i++){
-        __asm__(VLE " v28, (%0)": : "r"(b21)); // kth row of b21
-        switch (m){
-            case 7: __asm__(FLT_LOAD " f6, %0(%1)" : : "I"(-7*FLT_SIZE), "r"(a12));
-                    __asm__("vfnmsac.vf  v0, f6, v28");
-            case 6: __asm__(FLT_LOAD " f5, %0(%1)" : : "I"(-6*FLT_SIZE), "r"(a12));
-                    __asm__("vfnmsac.vf  v4, f5, v28");
-            case 5: __asm__(FLT_LOAD " f4, %0(%1)" : : "I"(-5*FLT_SIZE), "r"(a12));
-                    __asm__("vfnmsac.vf  v8, f4, v28");
-            case 4: __asm__(FLT_LOAD " f3, %0(%1)" : : "I"(-4*FLT_SIZE), "r"(a12));
-                    __asm__("vfnmsac.vf v12, f3, v28");
-            case 3: __asm__(FLT_LOAD " f2, %0(%1)" : : "I"(-3*FLT_SIZE), "r"(a12));
-                    __asm__("vfnmsac.vf v16, f2, v28");
-            case 2: __asm__(FLT_LOAD " f1, %0(%1)" : : "I"(-2*FLT_SIZE), "r"(a12));
-                    __asm__("vfnmsac.vf v20, f1, v28");
-            case 1: __asm__(FLT_LOAD " f0, %0(%1)" : : "I"(-1*FLT_SIZE), "r"(a12));
-                    __asm__("vfnmsac.vf v24, f0, v28");
-    }
-    __asm__("addi %0, %0, %1": "+r"(a12): "I"(PACKMR * FLT_SIZE));
-    __asm__("addi %0, %0, %1": "+r"(b21): "I"(PACKNR * FLT_SIZE));
-    }
-    // TRSM: B11 := inv(A11) * B11
-    // Move a11 to end of array and c11 to first entry in last row
-    __asm__("add %0, %0, %1": "+r"(a11): "r"(a11_offset));
-    __asm__("add %0, %0, %1": "+r"(c11): "r"(c11_offset));
-
-    // Row 0 from bottom (bottom row)
-    __asm__(FLT_LOAD " f0,  %0(%1)": : "I"(-1*FLT_SIZE), "r"(a11));
-    __asm__("vfmul.vf v24, v24, f0");
-    __asm__(VSE " v24, (%0)": : "r"(b11));
-    __asm__(VSSE " v24, (%0), %1": : "r"(c11), "r"(csc));
-    if (m == 1) return;
-    
-    switch (m){
-        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v0, f6, v24");
-        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v4, f5, v24");
-        case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v8, f4, v24");
-        case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf v12, f3, v24");
-        case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf v16, f2, v24");
-        case 2: __asm__(FLT_LOAD " f1, %0(%1)": : "I"(-2*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf v20, f1, v24");
-    }
-    // Pointer bumps
-    __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
-    __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
-    __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
-
-    // Row 1 from bottom
-    __asm__(FLT_LOAD " f1,  %0(%1)": : "I"(-2*FLT_SIZE), "r"(a11));
-    __asm__("vfmul.vf v20, v20, f1");
-    __asm__(VSE " v20, (%0)": : "r"(b11));
-    __asm__(VSSE " v20, (%0), %1": : "r"(c11), "r"(csc));
-    if (m == 2) return;
-    
-    switch (m){
-        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v0, f6, v20");
-        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v4, f5, v20");
-        case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v8, f4, v20");
-        case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf v12, f3, v20");
-        case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf v16, f2, v20");
-    }
-    // Pointer bumps
-    __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
-    __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
-    __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
-
-    // Row 2 from bottom
-    __asm__(FLT_LOAD " f2,  %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11));
-    __asm__("vfmul.vf v16, v16, f2");
-    __asm__(VSE " v16, (%0)": : "r"(b11));
-    __asm__(VSSE " v16, (%0), %1": : "r"(c11), "r"(csc));
-    if (m == 3) return;
-    
-    switch (m){
-        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v0, f6, v16");
-        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v4, f5, v16");
-        case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v8, f4, v16");
-        case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf v12, f3, v16");
-    }
-    // Pointer bumps
-    __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
-    __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
-    __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
-    
-    // Row 3 from bottom
-    __asm__(FLT_LOAD " f3,  %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11));
-    __asm__("vfmul.vf v12, v12, f3");
-    __asm__(VSE " v12, (%0)": : "r"(b11));
-    __asm__(VSSE " v12, (%0), %1": : "r"(c11), "r"(csc));
-    if (m == 4) return;
-    
-    switch (m){
-        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v0, f6, v12");
-        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v4, f5, v12");
-        case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v8, f4, v12");
-    }
-    // Pointer bumps
-    __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
-    __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
-    __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
-    
-    // Row 4 from bottom
-    __asm__(FLT_LOAD " f4,  %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
-    __asm__("vfmul.vf v8, v8, f4");
-    __asm__(VSE " v8, (%0)": : "r"(b11));
-    __asm__(VSSE " v8, (%0), %1": : "r"(c11), "r"(csc));
-    if (m == 5) return;
-    
-    switch (m){
-        case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v0, f6, v8");
-        case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
-                __asm__("vfnmsac.vf  v4, f5, v8");
-    }
-    // Pointer bumps
-    __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
-    __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
-    __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
-    
-    // Row 5 from bottom
-    __asm__(FLT_LOAD " f5,  %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
-    __asm__("vfmul.vf v4, v4, f5");
-    __asm__(VSE " v4, (%0)": : "r"(b11));
-    __asm__(VSSE " v4, (%0), %1": : "r"(c11), "r"(csc));
-    if (m == 6) return;
-    
-    __asm__(FLT_LOAD " f6,  %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
-    __asm__("vfnmsac.vf v0, f6, v4");
-    
-    // Pointer bumps
-    __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
-    __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
-    __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
-    
-    // Row 6 from bottom
-    __asm__(FLT_LOAD " f6,  %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
-    __asm__("vfmul.vf v0, v0, f6");
-    __asm__(VSE " v0, (%0)": : "r"(b11));
-    __asm__(VSSE " v0, (%0), %1": : "r"(c11), "r"(csc));
-    
-}
-#endif
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c
similarity index 75%
rename from kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c
rename to kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c
index 7cb8d9e07..687abec18 100644
--- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,12 +34,12 @@
 
 // clang-format off
 #include "blis.h"
-#include "../../riscv_cmul_macros_asm.h"
+#include "../../riscv_cmul_macros_intr.h"
 #include "../../bli_kernels_sifive_x280.h"
 #include <stdint.h>
 #include <riscv_vector.h>
 
-#define GEMMTRSM_L(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_l_sifive_x280_asm(\
+#define GEMMTRSM_L(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_l_sifive_x280_intr(\
           dim_t               m,      \
           dim_t               n,      \
           dim_t               k,      \
@@ -55,7 +55,7 @@
     const cntx_t*    restrict cntx    \
     )
 
-#define GEMMTRSM_U(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_u_sifive_x280_asm(\
+#define GEMMTRSM_U(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_u_sifive_x280_intr(\
           dim_t               m,      \
           dim_t               n,      \
           dim_t               k,      \
@@ -76,108 +76,83 @@
 // Single precision real
 #define DATATYPE float
 #define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
 #define PACKMR 8
 #define PACKNR 64
-#define VLE "vle32.v"
-#define VSE "vse32.v"
-#define VSSE "vsse32.v"
-#define FLT_LOAD "flw"
-#define FLT_SIZE sizeof(float)
-#define LOG_FLT_SIZE 2
-
 
-#include "./bli_gemmtrsm_l_sifive_x280_asm_real.c"
-#include "./bli_gemmtrsm_u_sifive_x280_asm_real.c"
+#include "./bli_gemmtrsm_sifive_x280_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
 #undef PACKMR
 #undef PACKNR
-#undef VLE
-#undef VSE
-#undef VSSE
-#undef FLT_LOAD
-#undef FLT_SIZE
-#undef LOG_FLT_SIZE
 
 // Double precision real
 #define DATATYPE double
 #define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
 #define PACKMR 8
 #define PACKNR 32
-#define VLE "vle64.v"
-#define VSE "vse64.v"
-#define VSSE "vsse64.v"
-#define FLT_LOAD "fld"
-#define FLT_SIZE sizeof(double)
-#define LOG_FLT_SIZE 3
 
-#include "./bli_gemmtrsm_l_sifive_x280_asm_real.c"
-#include "./bli_gemmtrsm_u_sifive_x280_asm_real.c"
+#include "./bli_gemmtrsm_sifive_x280_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
 #undef PACKMR
 #undef PACKNR
-#undef VLE
-#undef VSE
-#undef VSSE
-#undef FLT_LOAD
-#undef FLT_SIZE
-#undef LOG_FLT_SIZE
 
 // Single precision complex
 #define DATATYPE scomplex
+#define BASE_DT float
 #define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m2
+#define FLT_SIZE sizeof(float)
 #define PACKMR 8
 #define PACKNR 32
-#define VLSEG2 "vlseg2e32.v "
-#define VSSEG2 "vsseg2e32.v "
-#define VSSSEG2 "vssseg2e32.v "
-#define FLT_LOAD "flw "
-#define FLT_SIZE sizeof(float)
 
-#include "./bli_gemmtrsm_l_sifive_x280_asm_complex.c"
-#include "./bli_gemmtrsm_u_sifive_x280_asm_complex.c"
+#include "./bli_gemmtrsm_sifive_x280_intr_complex.c"
 
 #undef DATATYPE
+#undef BASE_DT
 #undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
 #undef PACKMR
 #undef PACKNR
-#undef VLSEG2
-#undef VSSEG2
-#undef VSSSEG2
-#undef FLT_LOAD
-#undef FLT_SIZE
 
 // Double precision complex
 #define DATATYPE dcomplex
+#define BASE_DT double
 #define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m2
+#define FLT_SIZE sizeof(double)
 #define PACKMR 8
 #define PACKNR 16
-#define VLSEG2 "vlseg2e64.v "
-#define VSSEG2 "vsseg2e64.v "
-#define VSSSEG2 "vssseg2e64.v "
-#define FLT_LOAD "fld "
-#define FLT_SIZE sizeof(double)
 
-#include "./bli_gemmtrsm_l_sifive_x280_asm_complex.c"
-#include "./bli_gemmtrsm_u_sifive_x280_asm_complex.c"
+#include "./bli_gemmtrsm_sifive_x280_intr_complex.c"
 
 #undef DATATYPE
+#undef BASE_DT
 #undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
 #undef PACKMR
 #undef PACKNR
-#undef VLSEG
-#undef VSSEG
-#undef VSSSEG
-#undef FLT_LOAD
-#undef FLT_SIZE
-
-
 
 #undef GEMMTRSM
 #undef GEMMTRSM_L
 #undef GEMMTRSM_U
-
-
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c
new file mode 100644
index 000000000..88ea04b7a
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c
@@ -0,0 +1,437 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef GEMMTRSM
+
+#define GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) bli_##PRECISION_CHAR##gemmtrsm_sifive_x280_intr
+#define GEMMTRSM_IMPL_NAME(PRECISION_CHAR) GEMMTRSM_IMPL_NAME_(PRECISION_CHAR)
+
+static void GEMMTRSM_IMPL_NAME(PRECISION_CHAR)
+     (
+             dim_t              m,
+             dim_t              n,
+             dim_t              k,
+       const DATATYPE* restrict beta,
+       const DATATYPE* restrict a, inc_t rsa, inc_t csa,
+       const DATATYPE* restrict b, inc_t rsb,
+             DATATYPE* restrict c, inc_t rsc,
+       const DATATYPE* restrict a11, inc_t rsa11, inc_t csa11,
+             DATATYPE* restrict c11, inc_t rsc11, inc_t csc11
+     )
+{
+    // This function computes inv(a11) * (beta * c - a * b)
+    // and stores the result in c and c11.
+    
+    RVV_TYPE_F(PREC, LMUL) ab0_r, ab1_r, ab2_r, ab3_r, ab4_r, ab5_r;
+    RVV_TYPE_F(PREC, LMUL) ab0_i, ab1_i, ab2_i, ab3_i, ab4_i, ab5_i;
+    // gemm step
+    if (m <= 0 || n <= 0 || k < 0)
+        return;
+    else if (k == 0) {
+        if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) {
+            RVV_TYPE_F(PREC, LMUL) zero_splat = VFMV_V_F(PREC, LMUL)(0., n);
+            switch (m) {
+            case 6:
+                ab5_r = zero_splat;
+                ab5_i = zero_splat;
+            case 5:
+                ab4_r = zero_splat;
+                ab4_i = zero_splat;
+            case 4:
+                ab3_r = zero_splat;
+                ab3_i = zero_splat;
+            case 3:
+                ab2_r = zero_splat;
+                ab2_i = zero_splat;
+            case 2:
+                ab1_r = zero_splat;
+                ab1_i = zero_splat;
+            case 1:
+                ab0_r = zero_splat;
+                ab0_i = zero_splat;
+            }
+        }
+        else {
+            RVV_TYPE_FX(PREC, LMUL, 2) c0;
+            RVV_TYPE_F(PREC, LMUL) c0_r, c0_i;
+
+            switch (m) {
+            case 6:
+                c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 5 * rsc), n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMUL_VF(PREC, LMUL, ab5_r, ab5_i, c0_r, c0_i, beta->real, beta->imag, n); 
+            case 5:
+                c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 4 * rsc), n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMUL_VF(PREC, LMUL, ab4_r, ab4_i, c0_r, c0_i, beta->real, beta->imag, n); 
+            case 4:
+                c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 3 * rsc), n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMUL_VF(PREC, LMUL, ab3_r, ab3_i, c0_r, c0_i, beta->real, beta->imag, n); 
+            case 3:
+                c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 2 * rsc), n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMUL_VF(PREC, LMUL, ab2_r, ab2_i, c0_r, c0_i, beta->real, beta->imag, n); 
+            case 2:
+                c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 1 * rsc), n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMUL_VF(PREC, LMUL, ab1_r, ab1_i, c0_r, c0_i, beta->real, beta->imag, n); 
+            case 1:
+                c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 0 * rsc), n);
+                c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+                c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+                VCMUL_VF(PREC, LMUL, ab0_r, ab0_i, c0_r, c0_i, beta->real, beta->imag, n); 
+            }
+        }
+    }
+    else {
+        RVV_TYPE_FX(PREC, LMUL, 2) b0, b1;
+        RVV_TYPE_F(PREC, LMUL) b0_r, b1_r;
+        RVV_TYPE_F(PREC, LMUL) b0_i, b1_i;
+
+        b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n);
+        b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0);
+        b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1);
+        b += rsb;
+        if (k >= 2) {
+            b1 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n);
+            b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0);
+            b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1);
+            b += rsb;
+        }
+
+        switch (m) {
+        case 6:
+            VCMUL_VF(PREC, LMUL, ab5_r, ab5_i, b0_r, b0_i, a[5 * rsa].real, a[5 * rsa].imag, n);
+        case 5:
+            VCMUL_VF(PREC, LMUL, ab4_r, ab4_i, b0_r, b0_i, a[4 * rsa].real, a[4 * rsa].imag, n);
+        case 4:
+            VCMUL_VF(PREC, LMUL, ab3_r, ab3_i, b0_r, b0_i, a[3 * rsa].real, a[3 * rsa].imag, n);
+        case 3:
+            VCMUL_VF(PREC, LMUL, ab2_r, ab2_i, b0_r, b0_i, a[2 * rsa].real, a[2 * rsa].imag, n);
+        case 2:
+            VCMUL_VF(PREC, LMUL, ab1_r, ab1_i, b0_r, b0_i, a[1 * rsa].real, a[1 * rsa].imag, n);
+        case 1:
+            VCMUL_VF(PREC, LMUL, ab0_r, ab0_i, b0_r, b0_i, a[0 * rsa].real, a[0 * rsa].imag, n);
+        }
+        
+        a += csa;
+        k -= 1;
+        
+        if (k >= 2) {
+            b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n);
+            b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0);
+            b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1);
+            b += rsb;
+        }
+
+        while (k > 0) {
+            switch (m) {
+            case 6:
+                VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, a[5 * rsa].real, a[5 * rsa].imag, b1_r, b1_i, n);
+            case 5:
+                VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4 * rsa].real, a[4 * rsa].imag, b1_r, b1_i, n);
+            case 4:
+                VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3 * rsa].real, a[3 * rsa].imag, b1_r, b1_i, n);
+            case 3:
+                VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2 * rsa].real, a[2 * rsa].imag, b1_r, b1_i, n);
+            case 2:
+                VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1 * rsa].real, a[1 * rsa].imag, b1_r, b1_i, n);
+            case 1:
+                VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0 * rsa].real, a[0 * rsa].imag, b1_r, b1_i, n);
+            }
+             
+            a += csa;
+            k -= 1;
+
+            if (k == 0) { break; }
+
+            if (k >= 2) {
+                b1 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n);
+                b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0);
+                b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1);
+                b += rsb;
+            }
+
+            switch (m) {
+            case 6:
+                VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, a[5 * rsa].real, a[5 * rsa].imag, b0_r, b0_i, n);
+            case 5:
+                VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4 * rsa].real, a[4 * rsa].imag, b0_r, b0_i, n);
+            case 4:
+                VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3 * rsa].real, a[3 * rsa].imag, b0_r, b0_i, n);
+            case 3:
+                VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2 * rsa].real, a[2 * rsa].imag, b0_r, b0_i, n);
+            case 2:
+                VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1 * rsa].real, a[1 * rsa].imag, b0_r, b0_i, n);
+            case 1:
+                VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0 * rsa].real, a[0 * rsa].imag, b0_r, b0_i, n);
+            }
+             
+            a += csa;
+            k -= 1;
+
+            if (k == 0) { break; }
+
+            if (k >= 2) {
+                b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n);
+                b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0);
+                b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1);
+                b += rsb;
+            }
+        }
+
+        if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) {
+            switch (m) {
+            case 6:
+                ab5_r = VFNEG_VF(PREC, LMUL)(ab5_r, n);
+                ab5_i = VFNEG_VF(PREC, LMUL)(ab5_i, n);
+            case 5:
+                ab4_r = VFNEG_VF(PREC, LMUL)(ab4_r, n);
+                ab4_i = VFNEG_VF(PREC, LMUL)(ab4_i, n);
+            case 4:
+                ab3_r = VFNEG_VF(PREC, LMUL)(ab3_r, n);
+                ab3_i = VFNEG_VF(PREC, LMUL)(ab3_i, n);
+            case 3:
+                ab2_r = VFNEG_VF(PREC, LMUL)(ab2_r, n);
+                ab2_i = VFNEG_VF(PREC, LMUL)(ab2_i, n);
+            case 2:
+                ab1_r = VFNEG_VF(PREC, LMUL)(ab1_r, n);
+                ab1_i = VFNEG_VF(PREC, LMUL)(ab1_i, n);
+            case 1:
+                ab0_r = VFNEG_VF(PREC, LMUL)(ab0_r, n);
+                ab0_i = VFNEG_VF(PREC, LMUL)(ab0_i, n);
+            }
+        }
+        else {
+            RVV_TYPE_FX(PREC, LMUL, 2) c0;
+            RVV_TYPE_F(PREC, LMUL) c0_r, c0_i;
+	    switch (m) {
+	    case 6:
+		c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 5 * rsc), n);
+		c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+		c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+		VCMSAC_VF(PREC, LMUL, ab5_r, ab5_i, beta->real, beta->imag, c0_r, c0_i, n);
+	    case 5:
+		c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), n);
+		c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+		c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+		VCMSAC_VF(PREC, LMUL, ab4_r, ab4_i, beta->real, beta->imag, c0_r, c0_i, n);
+	    case 4:
+		c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), n);
+		c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+		c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+		VCMSAC_VF(PREC, LMUL, ab3_r, ab3_i, beta->real, beta->imag, c0_r, c0_i, n);
+	    case 3:
+		c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), n);
+		c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+		c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+		VCMSAC_VF(PREC, LMUL, ab2_r, ab2_i, beta->real, beta->imag, c0_r, c0_i, n);
+	    case 2:
+		c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), n);
+		c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+		c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+		VCMSAC_VF(PREC, LMUL, ab1_r, ab1_i, beta->real, beta->imag, c0_r, c0_i, n);
+	    case 1:
+		c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), n);
+		c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0);
+		c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1);
+		VCMSAC_VF(PREC, LMUL, ab0_r, ab0_i, beta->real, beta->imag, c0_r, c0_i, n);
+	    }
+        }
+    }   
+
+    // trsm step
+    RVV_TYPE_FX(PREC, LMUL, 2) temp = VUNDEFINED_FX(PREC, LMUL, 2)();
+    RVV_TYPE_F(PREC, LMUL) temp_r, temp_i;
+
+    VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab0_r, ab0_i, a11[0 * rsa11].real, a11[0 * rsa11].imag, n);
+    temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r);
+    temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i);
+    VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 0 * rsc), temp, n);
+    if (csc11 == 1)
+        VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 0 * rsc11), temp, n);
+    else
+        VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 0 * rsc11), 2 * FLT_SIZE * csc11, temp, n);
+    if (m == 1) return;
+    switch (m) {
+    case 6:
+        VCNMSAC_VF(PREC, LMUL, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, temp_r, temp_i, n);
+    case 5:
+        VCNMSAC_VF(PREC, LMUL, ab4_r, ab4_i, a11[4 * rsa11].real, a11[4 * rsa11].imag, temp_r, temp_i, n);
+    case 4:
+        VCNMSAC_VF(PREC, LMUL, ab3_r, ab3_i, a11[3 * rsa11].real, a11[3 * rsa11].imag, temp_r, temp_i, n);
+    case 3:
+        VCNMSAC_VF(PREC, LMUL, ab2_r, ab2_i, a11[2 * rsa11].real, a11[2 * rsa11].imag, temp_r, temp_i, n);
+    case 2:
+        VCNMSAC_VF(PREC, LMUL, ab1_r, ab1_i, a11[1 * rsa11].real, a11[1 * rsa11].imag, temp_r, temp_i, n);
+    }
+    a11 += csa11;
+
+    VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab1_r, ab1_i, a11[1 * rsa11].real, a11[1 * rsa11].imag, n);
+    temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r);
+    temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i);
+    VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 1 * rsc), temp, n);
+    if (csc11 == 1)
+        VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 1 * rsc11), temp, n);
+    else
+        VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 1 * rsc11), 2 * FLT_SIZE * csc11, temp, n);
+    if (m == 2) return;
+    switch (m) {
+    case 6:
+        VCNMSAC_VF(PREC, LMUL, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, temp_r, temp_i, n);
+    case 5:
+        VCNMSAC_VF(PREC, LMUL, ab4_r, ab4_i, a11[4 * rsa11].real, a11[4 * rsa11].imag, temp_r, temp_i, n);
+    case 4:
+        VCNMSAC_VF(PREC, LMUL, ab3_r, ab3_i, a11[3 * rsa11].real, a11[3 * rsa11].imag, temp_r, temp_i, n);
+    case 3:
+        VCNMSAC_VF(PREC, LMUL, ab2_r, ab2_i, a11[2 * rsa11].real, a11[2 * rsa11].imag, temp_r, temp_i, n);
+    }
+    a11 += csa11;
+
+    VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab2_r, ab2_i, a11[2 * rsa11].real, a11[2 * rsa11].imag, n);
+    temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r);
+    temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i);
+    VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 2 * rsc), temp, n);
+    if (csc11 == 1)
+        VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 2 * rsc11), temp, n);
+    else
+        VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 2 * rsc11), 2 * FLT_SIZE * csc11, temp, n);
+    if (m == 3) return;
+    switch (m) {
+    case 6:
+        VCNMSAC_VF(PREC, LMUL, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, temp_r, temp_i, n);
+    case 5:
+        VCNMSAC_VF(PREC, LMUL, ab4_r, ab4_i, a11[4 * rsa11].real, a11[4 * rsa11].imag, temp_r, temp_i, n);
+    case 4:
+        VCNMSAC_VF(PREC, LMUL, ab3_r, ab3_i, a11[3 * rsa11].real, a11[3 * rsa11].imag, temp_r, temp_i, n);
+    }
+    a11 += csa11;
+
+    VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab3_r, ab3_i, a11[3 * rsa11].real, a11[3 * rsa11].imag, n);
+    temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r);
+    temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i);
+    VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 3 * rsc), temp, n);
+    if (csc11 == 1)
+        VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 3 * rsc11), temp, n);
+    else
+        VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 3 * rsc11), 2 * FLT_SIZE * csc11, temp, n);
+    if (m == 4) return;
+    switch (m) {
+    case 6:
+        VCNMSAC_VF(PREC, LMUL, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, temp_r, temp_i, n);
+    case 5:
+        VCNMSAC_VF(PREC, LMUL, ab4_r, ab4_i, a11[4 * rsa11].real, a11[4 * rsa11].imag, temp_r, temp_i, n);
+    }
+    a11 += csa11;
+
+    VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab4_r, ab4_i, a11[4 * rsa11].real, a11[4 * rsa11].imag, n);
+    temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r);
+    temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i);
+    VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 4 * rsc), temp, n);
+    if (csc11 == 1)
+        VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 4 * rsc11), temp, n);
+    else
+        VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 4 * rsc11), 2 * FLT_SIZE * csc11, temp, n);
+    if (m == 5) return;
+    VCNMSAC_VF(PREC, LMUL, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, temp_r, temp_i, n);
+    a11 += csa11;
+
+    VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, n);
+    temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r);
+    temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i);
+    VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 5 * rsc), temp, n);
+    if (csc11 == 1)
+        VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 5 * rsc11), temp, n);
+    else
+        VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 5 * rsc11), 2 * FLT_SIZE * csc11, temp, n);
+    return;
+}
+
+GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void)
+{ // "_L" entry point: hands the operands to the shared implementation starting at row 0 with positive strides.
+    const DATATYPE* restrict alpha = alpha_; // typed views of the `_`-suffixed arguments (declared by the GEMMTRSM macro, not visible here)
+    const DATATYPE* restrict a10 = a10_;
+    const DATATYPE* restrict a11 = a11_;
+    const DATATYPE* restrict b01 = b01_;
+    DATATYPE* restrict b11 = b11_;
+    DATATYPE* restrict c11 = c11_;
+    // NOTE(review): the implementation's parameter order is outside this view; presumably (a, rsa, csa, b, rsb, c, rsc, a11, rsa11, csa11, c11, rsc11, csc11) - confirm.
+    GEMMTRSM_IMPL_NAME(PRECISION_CHAR)
+    (
+      m, n, k,
+      alpha,
+      a10, 1, PACKMR, // a10 with strides 1 and PACKMR
+      b01, PACKNR, // b01 with stride PACKNR
+      b11, PACKNR, // b11 is the only writable packed operand passed
+      a11, 1, PACKMR, // a11 with the same strides as a10
+      c11, rsc, csc // output matrix with the caller-supplied strides
+    );
+
+    return;
+}
+
+GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void)
+{ // "_U" entry point: reuses the shared implementation by starting at row m-1 and negating the row strides.
+    const DATATYPE* restrict alpha = alpha_; // typed views of the `_`-suffixed arguments (declared by the GEMMTRSM macro, not visible here)
+    const DATATYPE* restrict a12 = a12_;
+    const DATATYPE* restrict a11 = a11_;
+    const DATATYPE* restrict b21 = b21_;
+    DATATYPE* restrict b11 = b11_;
+    DATATYPE* restrict c11 = c11_;
+    // NOTE(review): the implementation's parameter order is outside this view; presumably identical to the real-precision variant - confirm.
+    GEMMTRSM_IMPL_NAME(PRECISION_CHAR)
+    (
+      m, n, k,
+      alpha,
+      a12 + (m - 1), -1, PACKMR, // begin at row m-1 of a12 and step backwards through its rows
+      b21, PACKNR, // b21 is traversed forwards, unchanged from the "_L" pattern
+      b11 + (m - 1) * PACKNR, -PACKNR, // begin at row m-1 of b11, negative row stride
+      a11 + (m - 1) + (m - 1) * PACKMR, -1, -PACKMR, // begin at element (m-1, m-1) of a11, both strides negated
+      c11 + (m - 1) * rsc, -rsc, csc // begin at row m-1 of c11; column stride is not negated
+    );
+
+    return;
+}
+
+#undef GEMMTRSM_IMPL_NAME_
+#undef GEMMTRSM_IMPL_NAME
+
+#endif // GEMMTRSM
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c
new file mode 100644
index 000000000..7c3c3b8b7
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c
@@ -0,0 +1,364 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef GEMMTRSM
+// Two-level definition: the outer macro lets a PRECISION_CHAR macro argument expand before the inner macro pastes it with ##.
+#define GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) bli_##PRECISION_CHAR##gemmtrsm_sifive_x280_intr
+#define GEMMTRSM_IMPL_NAME(PRECISION_CHAR) GEMMTRSM_IMPL_NAME_(PRECISION_CHAR)
+
+static void GEMMTRSM_IMPL_NAME(PRECISION_CHAR)
+     (
+             dim_t              m, // number of rows processed; the switch statements provide cases for 1..7
+             dim_t              n, // vector length handed to every load, store and arithmetic intrinsic
+             dim_t              k, // number of iterations of the gemm accumulation loop
+       const DATATYPE* restrict beta, // scalar applied to the rows loaded from c
+       const DATATYPE* restrict a, inc_t rsa, inc_t csa, // read as a[i * rsa]; pointer advanced by csa per k step
+       const DATATYPE* restrict b, inc_t rsb, // one n-element row loaded per k step; pointer advanced by rsb
+             DATATYPE* restrict c, inc_t rsc, // row i at c + i * rsc; read only when *beta fails the eq0 test, always written
+       const DATATYPE* restrict a11, inc_t rsa11, inc_t csa11, // read as a11[i * rsa11]; pointer advanced by csa11 per solved row
+             DATATYPE* restrict c11, inc_t rsc11, inc_t csc11 // second output: row i at c11 + i * rsc11, element stride csc11
+     )
+{
+    // This function computes inv(a11) * (beta * c - a * b)
+    // and stores the result in c and c11.
+    // NOTE(review): diagonal entries of a11 are multiplied in (never divided by), so they are presumably stored pre-inverted - confirm.
+    RVV_TYPE_F(PREC, LMUL) ab0, ab1, ab2, ab3, ab4, ab5, ab6; // one accumulator vector per row
+    // gemm step
+    if (m <= 0 || n <= 0 || k < 0)
+        return;
+    else if (k == 0) { // no a * b term: the accumulators become beta * c
+        if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) {
+            RVV_TYPE_F(PREC, LMUL) zero_splat = VFMV_V_F(PREC, LMUL)(0., n);
+            switch (m) { // cases fall through on purpose so rows m-1 down to 0 are all initialized
+            case 7:
+                ab6 = zero_splat;
+            case 6:
+                ab5 = zero_splat;
+            case 5:
+                ab4 = zero_splat;
+            case 4:
+                ab3 = zero_splat;
+            case 3:
+                ab2 = zero_splat;
+            case 2:
+                ab1 = zero_splat;
+            case 1:
+                ab0 = zero_splat;
+            }
+        }
+        else {
+            RVV_TYPE_F(PREC, LMUL) c0; // scratch vector reused for each loaded row of c
+            switch (m) { // intentional fall-through, as above
+            case 7:
+                c0 = VLE_V_F(PREC, LMUL)(c + 6 * rsc, n);
+                ab6 = VFMUL_VF(PREC, LMUL)(c0, *beta, n);
+            case 6:
+                c0 = VLE_V_F(PREC, LMUL)(c + 5 * rsc, n);
+                ab5 = VFMUL_VF(PREC, LMUL)(c0, *beta, n);
+            case 5:
+                c0 = VLE_V_F(PREC, LMUL)(c + 4 * rsc, n);
+                ab4 = VFMUL_VF(PREC, LMUL)(c0, *beta, n);
+            case 4:
+                c0 = VLE_V_F(PREC, LMUL)(c + 3 * rsc, n);
+                ab3 = VFMUL_VF(PREC, LMUL)(c0, *beta, n);
+            case 3:
+                c0 = VLE_V_F(PREC, LMUL)(c + 2 * rsc, n);
+                ab2 = VFMUL_VF(PREC, LMUL)(c0, *beta, n);
+            case 2:
+                c0 = VLE_V_F(PREC, LMUL)(c + 1 * rsc, n);
+                ab1 = VFMUL_VF(PREC, LMUL)(c0, *beta, n);
+            case 1:
+                c0 = VLE_V_F(PREC, LMUL)(c + 0 * rsc, n);
+                ab0 = VFMUL_VF(PREC, LMUL)(c0, *beta, n);
+            }
+        }
+    }
+    else {
+        bool first = true; // the first k step writes the accumulators, later steps accumulate into them
+        for (dim_t i = 0; i < k; ++i) {
+            RVV_TYPE_F(PREC, LMUL) b0 = VLE_V_F(PREC, LMUL)(b, n); // current row of b, shared by all m rows
+            if (first) {
+                switch (m) { // intentional fall-through
+                case 7:
+                    ab6 = VFMUL_VF(PREC, LMUL)(b0, a[6 * rsa], n);
+                case 6:
+                    ab5 = VFMUL_VF(PREC, LMUL)(b0, a[5 * rsa], n);
+                case 5:
+                    ab4 = VFMUL_VF(PREC, LMUL)(b0, a[4 * rsa], n);
+                case 4:
+                    ab3 = VFMUL_VF(PREC, LMUL)(b0, a[3 * rsa], n);
+                case 3:
+                    ab2 = VFMUL_VF(PREC, LMUL)(b0, a[2 * rsa], n);
+                case 2:
+                    ab1 = VFMUL_VF(PREC, LMUL)(b0, a[1 * rsa], n);
+                case 1:
+                    ab0 = VFMUL_VF(PREC, LMUL)(b0, a[0 * rsa], n);
+                }
+                first = false;
+            }
+            else {
+                switch (m) { // intentional fall-through
+                case 7:
+                    ab6 = VFMACC_VF(PREC, LMUL)(ab6, a[6 * rsa], b0, n);
+                case 6:
+                    ab5 = VFMACC_VF(PREC, LMUL)(ab5, a[5 * rsa], b0, n);
+                case 5:
+                    ab4 = VFMACC_VF(PREC, LMUL)(ab4, a[4 * rsa], b0, n);
+                case 4:
+                    ab3 = VFMACC_VF(PREC, LMUL)(ab3, a[3 * rsa], b0, n);
+                case 3:
+                    ab2 = VFMACC_VF(PREC, LMUL)(ab2, a[2 * rsa], b0, n);
+                case 2:
+                    ab1 = VFMACC_VF(PREC, LMUL)(ab1, a[1 * rsa], b0, n);
+                case 1:
+                    ab0 = VFMACC_VF(PREC, LMUL)(ab0, a[0 * rsa], b0, n);
+                }
+            }
+
+            a += csa;
+            b += rsb;
+        }
+
+        if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { // c is not read on this path; only the sign of a * b is flipped
+            switch (m) { // intentional fall-through
+            case 7:
+                ab6 = VFNEG_VF(PREC, LMUL)(ab6, n);
+            case 6:
+                ab5 = VFNEG_VF(PREC, LMUL)(ab5, n);
+            case 5:
+                ab4 = VFNEG_VF(PREC, LMUL)(ab4, n);
+            case 4:
+                ab3 = VFNEG_VF(PREC, LMUL)(ab3, n);
+            case 3:
+                ab2 = VFNEG_VF(PREC, LMUL)(ab2, n);
+            case 2:
+                ab1 = VFNEG_VF(PREC, LMUL)(ab1, n);
+            case 1:
+                ab0 = VFNEG_VF(PREC, LMUL)(ab0, n);
+            }
+        }
+        else {
+            RVV_TYPE_F(PREC, LMUL) c0; // scratch vector reused for each loaded row of c
+            switch (m) { // intentional fall-through; each row becomes beta * c - (a * b)
+            case 7:
+                c0 = VLE_V_F(PREC, LMUL)(c + 6 * rsc, n);
+                ab6 = VFMSAC_VF(PREC, LMUL)(ab6, *beta, c0, n);
+            case 6:
+                c0 = VLE_V_F(PREC, LMUL)(c + 5 * rsc, n);
+                ab5 = VFMSAC_VF(PREC, LMUL)(ab5, *beta, c0, n);
+            case 5:
+                c0 = VLE_V_F(PREC, LMUL)(c + 4 * rsc, n);
+                ab4 = VFMSAC_VF(PREC, LMUL)(ab4, *beta, c0, n);
+            case 4:
+                c0 = VLE_V_F(PREC, LMUL)(c + 3 * rsc, n);
+                ab3 = VFMSAC_VF(PREC, LMUL)(ab3, *beta, c0, n);
+            case 3:
+                c0 = VLE_V_F(PREC, LMUL)(c + 2 * rsc, n);
+                ab2 = VFMSAC_VF(PREC, LMUL)(ab2, *beta, c0, n);
+            case 2:
+                c0 = VLE_V_F(PREC, LMUL)(c + 1 * rsc, n);
+                ab1 = VFMSAC_VF(PREC, LMUL)(ab1, *beta, c0, n);
+            case 1:
+                c0 = VLE_V_F(PREC, LMUL)(c + 0 * rsc, n);
+                ab0 = VFMSAC_VF(PREC, LMUL)(ab0, *beta, c0, n);
+            }
+        }
+    }
+    
+    // trsm step
+    ab0 = VFMUL_VF(PREC, LMUL)(ab0, a11[0 * rsa11], n); // finish row 0 by scaling with the first diagonal entry
+    VSE_V_F(PREC, LMUL)(c + 0 * rsc, ab0, n);
+    if (csc11 == 1)
+        VSE_V_F(PREC, LMUL)(c11 + 0 * rsc11, ab0, n);
+    else
+        VSSE_V_F(PREC, LMUL)(c11 + 0 * rsc11, FLT_SIZE * csc11, ab0, n); // strided store; FLT_SIZE is presumably the element size in bytes
+    if (m == 1) return;
+    switch (m) { // subtract row 0's contribution from every remaining row (intentional fall-through)
+    case 7:
+        ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab0, n);
+    case 6:
+        ab5 = VFNMSAC_VF(PREC, LMUL)(ab5, a11[5 * rsa11], ab0, n);
+    case 5:
+        ab4 = VFNMSAC_VF(PREC, LMUL)(ab4, a11[4 * rsa11], ab0, n);
+    case 4:
+        ab3 = VFNMSAC_VF(PREC, LMUL)(ab3, a11[3 * rsa11], ab0, n);
+    case 3:
+        ab2 = VFNMSAC_VF(PREC, LMUL)(ab2, a11[2 * rsa11], ab0, n);
+    case 2:
+        ab1 = VFNMSAC_VF(PREC, LMUL)(ab1, a11[1 * rsa11], ab0, n);
+    }
+    a11 += csa11; // move on to the next column of a11
+
+    ab1 = VFMUL_VF(PREC, LMUL)(ab1, a11[1 * rsa11], n); // row 1: scale by its diagonal entry, then store
+    VSE_V_F(PREC, LMUL)(c + 1 * rsc, ab1, n);
+    if (csc11 == 1)
+        VSE_V_F(PREC, LMUL)(c11 + 1 * rsc11, ab1, n);
+    else
+        VSSE_V_F(PREC, LMUL)(c11 + 1 * rsc11, FLT_SIZE * csc11, ab1, n);
+    if (m == 2) return;
+    switch (m) { // subtract row 1's contribution from rows 2 and up
+    case 7:
+        ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab1, n);
+    case 6:
+        ab5 = VFNMSAC_VF(PREC, LMUL)(ab5, a11[5 * rsa11], ab1, n);
+    case 5:
+        ab4 = VFNMSAC_VF(PREC, LMUL)(ab4, a11[4 * rsa11], ab1, n);
+    case 4:
+        ab3 = VFNMSAC_VF(PREC, LMUL)(ab3, a11[3 * rsa11], ab1, n);
+    case 3:
+        ab2 = VFNMSAC_VF(PREC, LMUL)(ab2, a11[2 * rsa11], ab1, n);
+    }
+    a11 += csa11;
+
+    ab2 = VFMUL_VF(PREC, LMUL)(ab2, a11[2 * rsa11], n); // row 2
+    VSE_V_F(PREC, LMUL)(c + 2 * rsc, ab2, n);
+    if (csc11 == 1)
+        VSE_V_F(PREC, LMUL)(c11 + 2 * rsc11, ab2, n);
+    else
+        VSSE_V_F(PREC, LMUL)(c11 + 2 * rsc11, FLT_SIZE * csc11, ab2, n);
+    if (m == 3) return;
+    switch (m) { // subtract row 2's contribution from rows 3 and up
+    case 7:
+        ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab2, n);
+    case 6:
+        ab5 = VFNMSAC_VF(PREC, LMUL)(ab5, a11[5 * rsa11], ab2, n);
+    case 5:
+        ab4 = VFNMSAC_VF(PREC, LMUL)(ab4, a11[4 * rsa11], ab2, n);
+    case 4:
+        ab3 = VFNMSAC_VF(PREC, LMUL)(ab3, a11[3 * rsa11], ab2, n);
+    }
+    a11 += csa11;
+
+    ab3 = VFMUL_VF(PREC, LMUL)(ab3, a11[3 * rsa11], n); // row 3
+    VSE_V_F(PREC, LMUL)(c + 3 * rsc, ab3, n);
+    if (csc11 == 1)
+        VSE_V_F(PREC, LMUL)(c11 + 3 * rsc11, ab3, n);
+    else
+        VSSE_V_F(PREC, LMUL)(c11 + 3 * rsc11, FLT_SIZE * csc11, ab3, n);
+    if (m == 4) return;
+    switch (m) { // subtract row 3's contribution from rows 4 and up
+    case 7:
+        ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab3, n);
+    case 6:
+        ab5 = VFNMSAC_VF(PREC, LMUL)(ab5, a11[5 * rsa11], ab3, n);
+    case 5:
+        ab4 = VFNMSAC_VF(PREC, LMUL)(ab4, a11[4 * rsa11], ab3, n);
+    }
+    a11 += csa11;
+
+    ab4 = VFMUL_VF(PREC, LMUL)(ab4, a11[4 * rsa11], n); // row 4
+    VSE_V_F(PREC, LMUL)(c + 4 * rsc, ab4, n);
+    if (csc11 == 1)
+        VSE_V_F(PREC, LMUL)(c11 + 4 * rsc11, ab4, n);
+    else
+        VSSE_V_F(PREC, LMUL)(c11 + 4 * rsc11, FLT_SIZE * csc11, ab4, n);
+    if (m == 5) return;
+    switch (m) { // subtract row 4's contribution from rows 5 and 6
+    case 7:
+        ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab4, n);
+    case 6:
+        ab5 = VFNMSAC_VF(PREC, LMUL)(ab5, a11[5 * rsa11], ab4, n);
+    }
+    a11 += csa11;
+
+    ab5 = VFMUL_VF(PREC, LMUL)(ab5, a11[5 * rsa11], n); // row 5
+    VSE_V_F(PREC, LMUL)(c + 5 * rsc, ab5, n);
+    if (csc11 == 1)
+        VSE_V_F(PREC, LMUL)(c11 + 5 * rsc11, ab5, n);
+    else
+        VSSE_V_F(PREC, LMUL)(c11 + 5 * rsc11, FLT_SIZE * csc11, ab5, n);
+    if (m == 6) return;
+    ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab5, n); // only row 6 is left, so no switch is needed
+    a11 += csa11;
+
+    ab6 = VFMUL_VF(PREC, LMUL)(ab6, a11[6 * rsa11], n); // row 6
+    VSE_V_F(PREC, LMUL)(c + 6 * rsc, ab6, n);
+    if (csc11 == 1)
+        VSE_V_F(PREC, LMUL)(c11 + 6 * rsc11, ab6, n);
+    else
+        VSSE_V_F(PREC, LMUL)(c11 + 6 * rsc11, FLT_SIZE * csc11, ab6, n);
+    return;
+}
+
+GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void)
+{ // "_L" entry point: hands the operands to the static implementation starting at row 0 with positive strides.
+    const DATATYPE* restrict alpha = alpha_; // typed views of the `_`-suffixed arguments (declared by the GEMMTRSM macro, not visible here)
+    const DATATYPE* restrict a10 = a10_;
+    const DATATYPE* restrict a11 = a11_;
+    const DATATYPE* restrict b01 = b01_;
+    DATATYPE* restrict b11 = b11_;
+    DATATYPE* restrict c11 = c11_;
+    // Argument order below follows the implementation: beta, (a, rsa, csa), (b, rsb), (c, rsc), (a11, rsa11, csa11), (c11, rsc11, csc11).
+    GEMMTRSM_IMPL_NAME(PRECISION_CHAR)
+    (
+      m, n, k,
+      alpha, // received by the implementation as beta
+      a10, 1, PACKMR, // a: row stride 1, column stride PACKMR
+      b01, PACKNR, // b: row stride PACKNR
+      b11, PACKNR, // c: b11 is scaled by beta on input and overwritten with the solution
+      a11, 1, PACKMR, // a11: same strides as a10
+      c11, rsc, csc // c11: second copy of the solution, written with the caller-supplied strides
+    );
+
+    return;
+}
+
+GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void)
+{ // "_U" entry point: reuses the static implementation by starting at row m-1 and negating the row strides.
+    const DATATYPE* restrict alpha = alpha_; // typed views of the `_`-suffixed arguments (declared by the GEMMTRSM macro, not visible here)
+    const DATATYPE* restrict a12 = a12_;
+    const DATATYPE* restrict a11 = a11_;
+    const DATATYPE* restrict b21 = b21_;
+    DATATYPE* restrict b11 = b11_;
+    DATATYPE* restrict c11 = c11_;
+    // Argument order below follows the implementation: beta, (a, rsa, csa), (b, rsb), (c, rsc), (a11, rsa11, csa11), (c11, rsc11, csc11).
+    GEMMTRSM_IMPL_NAME(PRECISION_CHAR)
+    (
+      m, n, k,
+      alpha, // received by the implementation as beta
+      a12 + (m - 1), -1, PACKMR, // a: begin at row m-1 and step backwards through the rows
+      b21, PACKNR, // b: traversed forwards, unchanged from the "_L" pattern
+      b11 + (m - 1) * PACKNR, -PACKNR, // c: begin at row m-1 of b11, negative row stride
+      a11 + (m - 1) + (m - 1) * PACKMR, -1, -PACKMR, // a11: begin at element (m-1, m-1), both strides negated
+      c11 + (m - 1) * rsc, -rsc, csc // c11: begin at row m-1; column stride is not negated
+    );
+
+    return;
+}
+
+#undef GEMMTRSM_IMPL_NAME_
+#undef GEMMTRSM_IMPL_NAME
+
+#endif // GEMMTRSM
diff --git a/kernels/sifive_x280/bli_kernels_sifive_x280.h b/kernels/sifive_x280/bli_kernels_sifive_x280.h
index 0ee01041e..ff7b445c4 100644
--- a/kernels/sifive_x280/bli_kernels_sifive_x280.h
+++ b/kernels/sifive_x280/bli_kernels_sifive_x280.h
@@ -38,10 +38,10 @@ ADDV_KER_PROT(double,       d, addv_sifive_x280_intr)
 ADDV_KER_PROT(scomplex,     c, addv_sifive_x280_intr)
 ADDV_KER_PROT(dcomplex,     z, addv_sifive_x280_intr)
 
-AMAXV_KER_PROT(float,       s, amaxv_sifive_x280_asm)
-AMAXV_KER_PROT(double,      d, amaxv_sifive_x280_asm)
-AMAXV_KER_PROT(scomplex,    c, amaxv_sifive_x280_asm)
-AMAXV_KER_PROT(dcomplex,    z, amaxv_sifive_x280_asm)
+AMAXV_KER_PROT(float,       s, amaxv_sifive_x280_intr)
+AMAXV_KER_PROT(double,      d, amaxv_sifive_x280_intr)
+AMAXV_KER_PROT(scomplex,    c, amaxv_sifive_x280_intr)
+AMAXV_KER_PROT(dcomplex,    z, amaxv_sifive_x280_intr)
 
 AXPBYV_KER_PROT(float,      s, axpbyv_sifive_x280_intr)
 AXPBYV_KER_PROT(double,     d, axpbyv_sifive_x280_intr)
@@ -53,10 +53,10 @@ AXPYV_KER_PROT(double,      d, axpyv_sifive_x280_intr)
 AXPYV_KER_PROT(scomplex,    c, axpyv_sifive_x280_intr)
 AXPYV_KER_PROT(dcomplex,    z, axpyv_sifive_x280_intr)
 
-COPYV_KER_PROT(float,       s, copyv_sifive_x280_asm)
-COPYV_KER_PROT(double,      d, copyv_sifive_x280_asm)
-COPYV_KER_PROT(scomplex,    c, copyv_sifive_x280_asm)
-COPYV_KER_PROT(dcomplex,    z, copyv_sifive_x280_asm)
+COPYV_KER_PROT(float,       s, copyv_sifive_x280_intr)
+COPYV_KER_PROT(double,      d, copyv_sifive_x280_intr)
+COPYV_KER_PROT(scomplex,    c, copyv_sifive_x280_intr)
+COPYV_KER_PROT(dcomplex,    z, copyv_sifive_x280_intr)
 
 DOTV_KER_PROT(float,        s, dotv_sifive_x280_intr)
 DOTV_KER_PROT(double,       d, dotv_sifive_x280_intr)
@@ -68,15 +68,15 @@ DOTXV_KER_PROT(double,      d, dotxv_sifive_x280_intr)
 DOTXV_KER_PROT(scomplex,    c, dotxv_sifive_x280_intr)
 DOTXV_KER_PROT(dcomplex,    z, dotxv_sifive_x280_intr)
 
-INVERTV_KER_PROT(float,     s, invertv_sifive_x280_asm)
-INVERTV_KER_PROT(double,    d, invertv_sifive_x280_asm)
-INVERTV_KER_PROT(scomplex,  c, invertv_sifive_x280_asm)
-INVERTV_KER_PROT(dcomplex,  z, invertv_sifive_x280_asm)
+INVERTV_KER_PROT(float,     s, invertv_sifive_x280_intr)
+INVERTV_KER_PROT(double,    d, invertv_sifive_x280_intr)
+INVERTV_KER_PROT(scomplex,  c, invertv_sifive_x280_intr)
+INVERTV_KER_PROT(dcomplex,  z, invertv_sifive_x280_intr)
 
-INVSCALV_KER_PROT(float,    s, invscalv_sifive_x280_asm)
-INVSCALV_KER_PROT(double,   d, invscalv_sifive_x280_asm)
-INVSCALV_KER_PROT(scomplex, c, invscalv_sifive_x280_asm)
-INVSCALV_KER_PROT(dcomplex, z, invscalv_sifive_x280_asm)
+INVSCALV_KER_PROT(float,    s, invscalv_sifive_x280_intr)
+INVSCALV_KER_PROT(double,   d, invscalv_sifive_x280_intr)
+INVSCALV_KER_PROT(scomplex, c, invscalv_sifive_x280_intr)
+INVSCALV_KER_PROT(dcomplex, z, invscalv_sifive_x280_intr)
 
 SCAL2V_KER_PROT(float,      s, scal2v_sifive_x280_intr)
 SCAL2V_KER_PROT(double,     d, scal2v_sifive_x280_intr)
@@ -88,20 +88,20 @@ SCALV_KER_PROT(double,      d, scalv_sifive_x280_intr)
 SCALV_KER_PROT(scomplex,    c, scalv_sifive_x280_intr)
 SCALV_KER_PROT(dcomplex,    z, scalv_sifive_x280_intr)
 
-SETV_KER_PROT(float,        s, setv_sifive_x280_asm)
-SETV_KER_PROT(double,       d, setv_sifive_x280_asm)
-SETV_KER_PROT(scomplex,     c, setv_sifive_x280_asm)
-SETV_KER_PROT(dcomplex,     z, setv_sifive_x280_asm)
+SETV_KER_PROT(float,        s, setv_sifive_x280_intr)
+SETV_KER_PROT(double,       d, setv_sifive_x280_intr)
+SETV_KER_PROT(scomplex,     c, setv_sifive_x280_intr)
+SETV_KER_PROT(dcomplex,     z, setv_sifive_x280_intr)
 
 SUBV_KER_PROT(float,        s, subv_sifive_x280_intr)
 SUBV_KER_PROT(double,       d, subv_sifive_x280_intr)
 SUBV_KER_PROT(scomplex,     c, subv_sifive_x280_intr)
 SUBV_KER_PROT(dcomplex,     z, subv_sifive_x280_intr)
 
-SWAPV_KER_PROT(float,       s, swapv_sifive_x280_asm)
-SWAPV_KER_PROT(double,      d, swapv_sifive_x280_asm)
-SWAPV_KER_PROT(scomplex,    c, swapv_sifive_x280_asm)
-SWAPV_KER_PROT(dcomplex,    z, swapv_sifive_x280_asm)
+SWAPV_KER_PROT(float,       s, swapv_sifive_x280_intr)
+SWAPV_KER_PROT(double,      d, swapv_sifive_x280_intr)
+SWAPV_KER_PROT(scomplex,    c, swapv_sifive_x280_intr)
+SWAPV_KER_PROT(dcomplex,    z, swapv_sifive_x280_intr)
 
 XPBYV_KER_PROT(float,       s, xpbyv_sifive_x280_intr)
 XPBYV_KER_PROT(double,      d, xpbyv_sifive_x280_intr)
@@ -114,31 +114,31 @@ AXPY2V_KER_PROT(double,     d, axpy2v_sifive_x280_intr)
 AXPY2V_KER_PROT(scomplex,   c, axpy2v_sifive_x280_intr)
 AXPY2V_KER_PROT(dcomplex,   z, axpy2v_sifive_x280_intr)
 
-AXPYF_KER_PROT(float,       s, axpyf_sifive_x280_asm)
-AXPYF_KER_PROT(double,      d, axpyf_sifive_x280_asm)
-AXPYF_KER_PROT(scomplex,    c, axpyf_sifive_x280_asm)
-AXPYF_KER_PROT(dcomplex,    z, axpyf_sifive_x280_asm)
+AXPYF_KER_PROT(float,       s, axpyf_sifive_x280_intr)
+AXPYF_KER_PROT(double,      d, axpyf_sifive_x280_intr)
+AXPYF_KER_PROT(scomplex,    c, axpyf_sifive_x280_intr)
+AXPYF_KER_PROT(dcomplex,    z, axpyf_sifive_x280_intr)
 
-DOTXF_KER_PROT(float,       s, dotxf_sifive_x280_asm)
-DOTXF_KER_PROT(double,      d, dotxf_sifive_x280_asm)
-DOTXF_KER_PROT(scomplex,    c, dotxf_sifive_x280_asm)
-DOTXF_KER_PROT(dcomplex,    z, dotxf_sifive_x280_asm)
+DOTXF_KER_PROT(float,       s, dotxf_sifive_x280_intr)
+DOTXF_KER_PROT(double,      d, dotxf_sifive_x280_intr)
+DOTXF_KER_PROT(scomplex,    c, dotxf_sifive_x280_intr)
+DOTXF_KER_PROT(dcomplex,    z, dotxf_sifive_x280_intr)
 
 DOTAXPYV_KER_PROT(float,    s, dotaxpyv_sifive_x280_intr)
 DOTAXPYV_KER_PROT(double,   d, dotaxpyv_sifive_x280_intr)
 DOTAXPYV_KER_PROT(scomplex, c, dotaxpyv_sifive_x280_intr)
 DOTAXPYV_KER_PROT(dcomplex, z, dotaxpyv_sifive_x280_intr)
 
-DOTXAXPYF_KER_PROT(float,   s, dotxaxpyf_sifive_x280_asm)
-DOTXAXPYF_KER_PROT(double,  d, dotxaxpyf_sifive_x280_asm)
-DOTXAXPYF_KER_PROT(scomplex,c, dotxaxpyf_sifive_x280_asm)
-DOTXAXPYF_KER_PROT(dcomplex,z, dotxaxpyf_sifive_x280_asm)
+DOTXAXPYF_KER_PROT(float,   s, dotxaxpyf_sifive_x280_intr)
+DOTXAXPYF_KER_PROT(double,  d, dotxaxpyf_sifive_x280_intr)
+DOTXAXPYF_KER_PROT(scomplex,c, dotxaxpyf_sifive_x280_intr)
+DOTXAXPYF_KER_PROT(dcomplex,z, dotxaxpyf_sifive_x280_intr)
 
 // Level 1m
-PACKM_KER_PROT(float,       s, packm_sifive_x280_asm_7m4)
-PACKM_KER_PROT(double,      d, packm_sifive_x280_asm_7m4)
-PACKM_KER_PROT(scomplex,    c, packm_sifive_x280_asm_6m2)
-PACKM_KER_PROT(dcomplex,    z, packm_sifive_x280_asm_6m2)
+PACKM_KER_PROT(float,       s, packm_sifive_x280_intr)
+PACKM_KER_PROT(double,      d, packm_sifive_x280_intr)
+PACKM_KER_PROT(scomplex,    c, packm_sifive_x280_intr)
+PACKM_KER_PROT(dcomplex,    z, packm_sifive_x280_intr)
 
 // Reference 1m
 PACKM_KER_PROT(float,       ss, packm_sifive_x280_ref)
@@ -147,16 +147,16 @@ PACKM_KER_PROT(scomplex,    cc, packm_sifive_x280_ref)
 PACKM_KER_PROT(dcomplex,    zz, packm_sifive_x280_ref)
 
 // Level 3
-GEMM_UKR_PROT(float,        s, gemm_sifive_x280_asm_7m4)
-GEMM_UKR_PROT(double,       d, gemm_sifive_x280_asm_7m4)
-GEMM_UKR_PROT(scomplex,     c, gemm_sifive_x280_asm_6m2)
-GEMM_UKR_PROT(dcomplex,     z, gemm_sifive_x280_asm_6m2)
-
-GEMMTRSM_UKR_PROT(float,    s, gemmtrsm_l_sifive_x280_asm)
-GEMMTRSM_UKR_PROT(double,   d, gemmtrsm_l_sifive_x280_asm)
-GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_l_sifive_x280_asm)
-GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_l_sifive_x280_asm)
-GEMMTRSM_UKR_PROT(float,    s, gemmtrsm_u_sifive_x280_asm)
-GEMMTRSM_UKR_PROT(double,   d, gemmtrsm_u_sifive_x280_asm)
-GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_u_sifive_x280_asm)
-GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_u_sifive_x280_asm)
+GEMM_UKR_PROT(float,        s, gemm_sifive_x280_intr)
+GEMM_UKR_PROT(double,       d, gemm_sifive_x280_intr)
+GEMM_UKR_PROT(scomplex,     c, gemm_sifive_x280_intr)
+GEMM_UKR_PROT(dcomplex,     z, gemm_sifive_x280_intr)
+
+GEMMTRSM_UKR_PROT(float,    s, gemmtrsm_l_sifive_x280_intr)
+GEMMTRSM_UKR_PROT(double,   d, gemmtrsm_l_sifive_x280_intr)
+GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_l_sifive_x280_intr)
+GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_l_sifive_x280_intr)
+GEMMTRSM_UKR_PROT(float,    s, gemmtrsm_u_sifive_x280_intr)
+GEMMTRSM_UKR_PROT(double,   d, gemmtrsm_u_sifive_x280_intr)
+GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_u_sifive_x280_intr)
+GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_u_sifive_x280_intr)
diff --git a/kernels/sifive_x280/riscv_cmul_macros_intr.h b/kernels/sifive_x280/riscv_cmul_macros_intr.h
new file mode 100644
index 000000000..70a0a1612
--- /dev/null
+++ b/kernels/sifive_x280/riscv_cmul_macros_intr.h
@@ -0,0 +1,147 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "riscv_overloaded_intrinsics.h"
+
+// macros to emit complex multiplication
+// caveat: the destination registers cannot overlap the source registers!
+
+// vd = vs2 * f[rs1]  (re: vs2_r*rs1_r - vs2_i*rs1_i, im: vs2_r*rs1_i + vs2_i*rs1_r); VD_R/VD_I are overwritten, arguments are expanded more than once
+#define VCMUL_VF(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, RS1_R, RS1_I, VL) \
+    do {                                                                 \
+        VD_R = VFMUL_VF(PREC, LMUL)(VS2_R, RS1_R, VL);                   \
+        VD_I = VFMUL_VF(PREC, LMUL)(VS2_R, RS1_I, VL);                   \
+        VD_R = VFNMSAC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL);           \
+        VD_I = VFMACC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL);            \
+    } while(0)
+
+// vd = conj(vs2) * f[rs1]  (re: vs2_r*rs1_r + vs2_i*rs1_i, im: vs2_r*rs1_i - vs2_i*rs1_r); VD_R/VD_I are overwritten
+#define VCMUL_VF_CONJ(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, RS1_R, RS1_I, VL) \
+    do {                                                                      \
+        VD_R = VFMUL_VF(PREC, LMUL)(VS2_R, RS1_R, VL);                        \
+        VD_I = VFMUL_VF(PREC, LMUL)(VS2_R, RS1_I, VL);                        \
+        VD_R = VFMACC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL);                 \
+        VD_I = VFNMSAC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL);                \
+    } while(0)
+
+// vd = vs2 * f[rs1] via the _TU wrappers; VS2_R/VS2_I are also the first operand of the two initial multiplies (presumably the tail source - confirm)
+#define VCMUL_VF_TU(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, RS1_R, RS1_I, VL) \
+    do {                                                                    \
+        VD_R = VFMUL_VF_TU(PREC, LMUL)(VS2_R, VS2_R, RS1_R, VL);            \
+        VD_I = VFMUL_VF_TU(PREC, LMUL)(VS2_I, VS2_I, RS1_R, VL);            \
+        VD_R = VFNMSAC_VF_TU(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL);           \
+        VD_I = VFMACC_VF_TU(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL);            \
+    } while(0)
+
+// vd = conj(vs2) * f[rs1] via the _TU wrappers; the imaginary part is formed as rs1_i*vs2_r - (vs2_i*rs1_r) by the final VFMSAC
+#define VCMUL_VF_CONJ_TU(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, RS1_R, RS1_I, VL) \
+    do {                                                                         \
+        VD_R = VFMUL_VF_TU(PREC, LMUL)(VS2_R, VS2_R, RS1_R, VL);                 \
+        VD_I = VFMUL_VF_TU(PREC, LMUL)(VS2_I, VS2_I, RS1_R, VL);                 \
+        VD_R = VFMACC_VF_TU(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL);                 \
+        VD_I = VFMSAC_VF_TU(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL);                 \
+    } while(0)
+
+// vd = vs2 * vs1, element-wise  (re: vs2_r*vs1_r - vs2_i*vs1_i, im: vs2_r*vs1_i + vs2_i*vs1_r); VD_R/VD_I are overwritten
+#define VCMUL_VV(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, VS1_R, VS1_I, VL) \
+    do {                                                                 \
+        VD_R = VFMUL_VV(PREC, LMUL)(VS2_R, VS1_R, VL);                   \
+        VD_I = VFMUL_VV(PREC, LMUL)(VS2_R, VS1_I, VL);                   \
+        VD_R = VFNMSAC_VV(PREC, LMUL)(VD_R, VS1_I, VS2_I, VL);           \
+        VD_I = VFMACC_VV(PREC, LMUL)(VD_I, VS1_R, VS2_I, VL);            \
+    } while(0)
+
+// vd = conj(vs2) * vs1, element-wise  (re: vs2_r*vs1_r + vs2_i*vs1_i, im: vs2_r*vs1_i - vs2_i*vs1_r); VD_R/VD_I are overwritten
+#define VCMUL_VV_CONJ(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, VS1_R, VS1_I, VL) \
+    do {                                                                      \
+        VD_R = VFMUL_VV(PREC, LMUL)(VS2_R, VS1_R, VL);                        \
+        VD_I = VFMUL_VV(PREC, LMUL)(VS2_R, VS1_I, VL);                        \
+        VD_R = VFMACC_VV(PREC, LMUL)(VD_R, VS1_I, VS2_I, VL);                 \
+        VD_I = VFNMSAC_VV(PREC, LMUL)(VD_I, VS1_R, VS2_I, VL);                \
+    } while(0)
+
+// vd += vs2 * f[rs1]; VD_R/VD_I are read and updated in place, and the scalar pair (RS1_*) precedes the vector pair (VS2_*) in the argument list
+#define VCMACC_VF(PREC, LMUL, VD_R, VD_I, RS1_R, RS1_I, VS2_R, VS2_I, VL) \
+    do {                                                                  \
+        VD_R = VFMACC_VF(PREC, LMUL)(VD_R, RS1_R, VS2_R, VL);             \
+        VD_I = VFMACC_VF(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL);             \
+        VD_R = VFNMSAC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL);            \
+        VD_I = VFMACC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL);             \
+    } while(0)
+
+// vd += conj(vs2) * f[rs1]; VD_R/VD_I are read and updated in place (re += vs2_r*rs1_r + vs2_i*rs1_i, im += vs2_r*rs1_i - vs2_i*rs1_r)
+#define VCMACC_VF_CONJ(PREC, LMUL, VD_R, VD_I, RS1_R, RS1_I, VS2_R, VS2_I, VL) \
+    do {                                                                       \
+        VD_R = VFMACC_VF(PREC, LMUL)(VD_R, RS1_R, VS2_R, VL);                  \
+        VD_I = VFMACC_VF(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL);                  \
+        VD_R = VFMACC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL);                  \
+        VD_I = VFNMSAC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL);                 \
+    } while(0)
+
+// vd = vs2 * f[rs1] - vd; the first two steps negate the old vd while adding the vs2_r products, the last two add the vs2_i products
+#define VCMSAC_VF(PREC, LMUL, VD_R, VD_I, RS1_R, RS1_I, VS2_R, VS2_I, VL)  \
+    do {                                                                   \
+        VD_R = VFMSAC_VF(PREC, LMUL)(VD_R, RS1_R, VS2_R, VL);              \
+        VD_I = VFMSAC_VF(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL);              \
+        VD_R = VFNMSAC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL);             \
+        VD_I = VFMACC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL);              \
+    } while(0)
+
+// vd -= vs2 * f[rs1]; VD_R/VD_I are read and updated in place (re -= vs2_r*rs1_r - vs2_i*rs1_i, im -= vs2_r*rs1_i + vs2_i*rs1_r)
+#define VCNMSAC_VF(PREC, LMUL, VD_R, VD_I, RS1_R, RS1_I, VS2_R, VS2_I, VL) \
+    do {                                                                   \
+        VD_R = VFNMSAC_VF(PREC, LMUL)(VD_R, RS1_R, VS2_R, VL);             \
+        VD_I = VFNMSAC_VF(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL);             \
+        VD_R = VFMACC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL);              \
+        VD_I = VFNMSAC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL);             \
+    } while(0)
+
+// vd += vs2 * vs1, element-wise, via the _TU wrappers; VD_R/VD_I are read and updated in place
+#define VCMACC_VV_TU(PREC, LMUL, VD_R, VD_I, VS1_R, VS1_I, VS2_R, VS2_I, VL) \
+    do {                                                                     \
+        VD_R = VFMACC_VV_TU(PREC, LMUL)(VD_R, VS1_R, VS2_R, VL);             \
+        VD_I = VFMACC_VV_TU(PREC, LMUL)(VD_I, VS1_I, VS2_R, VL);             \
+        VD_R = VFNMSAC_VV_TU(PREC, LMUL)(VD_R, VS1_I, VS2_I, VL);            \
+        VD_I = VFMACC_VV_TU(PREC, LMUL)(VD_I, VS1_R, VS2_I, VL);             \
+    } while(0)
+
+// vd += conj(vs2) * vs1, element-wise, via the _TU wrappers; VD_R/VD_I are read and updated in place
+#define VCMACC_VV_CONJ_TU(PREC, LMUL, VD_R, VD_I, VS1_R, VS1_I, VS2_R, VS2_I, VL) \
+    do {                                                                          \
+        VD_R = VFMACC_VV_TU(PREC, LMUL)(VD_R, VS1_R, VS2_R, VL);                  \
+        VD_I = VFMACC_VV_TU(PREC, LMUL)(VD_I, VS1_I, VS2_R, VL);                  \
+        VD_R = VFMACC_VV_TU(PREC, LMUL)(VD_R, VS1_I, VS2_I, VL);                  \
+        VD_I = VFNMSAC_VV_TU(PREC, LMUL)(VD_I, VS1_R, VS2_I, VL);                 \
+    } while(0)
+
diff --git a/kernels/sifive_x280/riscv_overloaded_intrinsics.h b/kernels/sifive_x280/riscv_overloaded_intrinsics.h
index 6a1d11b13..44f70f272 100644
--- a/kernels/sifive_x280/riscv_overloaded_intrinsics.h
+++ b/kernels/sifive_x280/riscv_overloaded_intrinsics.h
@@ -33,6 +33,10 @@
 */
 
 // 6. Configuration-Setting and Utility Functions
+#define RVV_TYPE_B_(RATIO) vbool##RATIO##_t
+#define RVV_TYPE_B(RATIO) RVV_TYPE_B_(RATIO)
+#define RVV_TYPE_U_(PRECISION, LMUL) vuint##PRECISION##LMUL##_t
+#define RVV_TYPE_U(PRECISION, LMUL) RVV_TYPE_U_(PRECISION, LMUL)
 #define RVV_TYPE_F_(PRECISION, LMUL) vfloat##PRECISION##LMUL##_t
 #define RVV_TYPE_F(PRECISION, LMUL) RVV_TYPE_F_(PRECISION, LMUL)
 #define RVV_TYPE_FX_(PRECISION, LMUL, NFIELDS) vfloat##PRECISION##LMUL##x##NFIELDS##_t
@@ -50,6 +54,14 @@
 #define VLSEG2_V_F(PRECISION, LMUL, NFIELDS)   VLSEG2_V_F_(PRECISION, LMUL, NFIELDS)
 #define VLSSEG2_V_F_(PRECISION, LMUL, NFIELDS)   __riscv_vlsseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
 #define VLSSEG2_V_F(PRECISION, LMUL, NFIELDS)   VLSSEG2_V_F_(PRECISION, LMUL, NFIELDS)
+#define VLE_V_F_TU_(PRECISION, LMUL)   __riscv_vle##PRECISION##_v_f##PRECISION##LMUL##_tu
+#define VLE_V_F_TU(PRECISION, LMUL)   VLE_V_F_TU_(PRECISION, LMUL)
+#define VLSE_V_F_TU_(PRECISION, LMUL) __riscv_vlse##PRECISION##_v_f##PRECISION##LMUL##_tu
+#define VLSE_V_F_TU(PRECISION, LMUL) VLSE_V_F_TU_(PRECISION, LMUL)
+#define VLSEG2_V_F_TU_(PRECISION, LMUL, NFIELDS)   __riscv_vlseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS##_tu
+#define VLSEG2_V_F_TU(PRECISION, LMUL, NFIELDS)   VLSEG2_V_F_TU_(PRECISION, LMUL, NFIELDS)
+#define VLSSEG2_V_F_TU_(PRECISION, LMUL, NFIELDS)   __riscv_vlsseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS##_tu
+#define VLSSEG2_V_F_TU(PRECISION, LMUL, NFIELDS)   VLSSEG2_V_F_TU_(PRECISION, LMUL, NFIELDS)
 // Stores
 #define VSE_V_F_(PRECISION, LMUL)   __riscv_vse##PRECISION##_v_f##PRECISION##LMUL
 #define VSE_V_F(PRECISION, LMUL) VSE_V_F_(PRECISION, LMUL)
@@ -59,58 +71,131 @@
 #define VSSEG2_V_F(PRECISION, LMUL, NFIELDS) VSSEG2_V_F_(PRECISION, LMUL, NFIELDS)
 #define VSSSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
 #define VSSSEG2_V_F(PRECISION, LMUL, NFIELDS) VSSSEG2_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSSSEG3_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg3e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSSEG3_V_F(PRECISION, LMUL, NFIELDS) VSSSEG3_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSSSEG4_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg4e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSSEG4_V_F(PRECISION, LMUL, NFIELDS) VSSSEG4_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSSSEG5_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg5e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSSEG5_V_F(PRECISION, LMUL, NFIELDS) VSSSEG5_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSSSEG6_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg6e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSSEG6_V_F(PRECISION, LMUL, NFIELDS) VSSSEG6_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSSSEG7_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg7e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSSEG7_V_F(PRECISION, LMUL, NFIELDS) VSSSEG7_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSSSEG8_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg8e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSSEG8_V_F(PRECISION, LMUL, NFIELDS) VSSSEG8_V_F_(PRECISION, LMUL, NFIELDS)
+
+// 11. Vector Integer Arithmetic Operations
+#define VADD_VX_U_(PRECISION, LMUL) __riscv_vadd_vx_u##PRECISION##LMUL
+#define VADD_VX_U(PRECISION, LMUL) VADD_VX_U_(PRECISION, LMUL)
+#define VMERGE_VVM_TU_U_(PRECISION, LMUL) __riscv_vmerge_vvm_u##PRECISION##LMUL##_tu
+#define VMERGE_VVM_TU_U(PRECISION, LMUL) VMERGE_VVM_TU_U_(PRECISION, LMUL)
 
 // 13. Vector Floating-Point Operations
 #define VFADD_VV_(PRECISION, LMUL) __riscv_vfadd_vv_f##PRECISION##LMUL
 #define VFADD_VV(PRECISION, LMUL) VFADD_VV_(PRECISION, LMUL)
 #define VFSUB_VV_(PRECISION, LMUL) __riscv_vfsub_vv_f##PRECISION##LMUL
 #define VFSUB_VV(PRECISION, LMUL) VFSUB_VV_(PRECISION, LMUL)
-#define VFMUL_VF_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL
-#define VFMUL_VF(PRECISION, LMUL) VFMUL_VF_(PRECISION, LMUL)
 #define VFMUL_VV_(PRECISION, LMUL) __riscv_vfmul_vv_f##PRECISION##LMUL
 #define VFMUL_VV(PRECISION, LMUL) VFMUL_VV_(PRECISION, LMUL)
 #define VFMUL_VF_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL
 #define VFMUL_VF(PRECISION, LMUL) VFMUL_VF_(PRECISION, LMUL)
-#define VFMACC_VF_(PRECISION, LMUL) __riscv_vfmacc_vf_f##PRECISION##LMUL
-#define VFMACC_VF(PRECISION, LMUL) VFMACC_VF_(PRECISION, LMUL)
+#define VFDIV_VV_(PRECISION, LMUL) __riscv_vfdiv_vv_f##PRECISION##LMUL
+#define VFDIV_VV(PRECISION, LMUL) VFDIV_VV_(PRECISION, LMUL)
+#define VFRDIV_VF_(PRECISION, LMUL) __riscv_vfrdiv_vf_f##PRECISION##LMUL
+#define VFRDIV_VF(PRECISION, LMUL) VFRDIV_VF_(PRECISION, LMUL)
 #define VFMACC_VV_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL
 #define VFMACC_VV(PRECISION, LMUL) VFMACC_VV_(PRECISION, LMUL)
-#define VFMACC_VV_TU_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL##_tu
-#define VFMACC_VV_TU(PRECISION, LMUL) VFMACC_VV_TU_(PRECISION, LMUL)
+#define VFMACC_VF_(PRECISION, LMUL) __riscv_vfmacc_vf_f##PRECISION##LMUL
+#define VFMACC_VF(PRECISION, LMUL) VFMACC_VF_(PRECISION, LMUL)
 #define VFMSAC_VF_(PRECISION, LMUL) __riscv_vfmsac_vf_f##PRECISION##LMUL
 #define VFMSAC_VF(PRECISION, LMUL) VFMSAC_VF_(PRECISION, LMUL)
+#define VFNMSAC_VV_(PRECISION, LMUL) __riscv_vfnmsac_vv_f##PRECISION##LMUL
+#define VFNMSAC_VV(PRECISION, LMUL) VFNMSAC_VV_(PRECISION, LMUL)
 #define VFNMSAC_VF_(PRECISION, LMUL) __riscv_vfnmsac_vf_f##PRECISION##LMUL
 #define VFNMSAC_VF(PRECISION, LMUL) VFNMSAC_VF_(PRECISION, LMUL)
-#define VFNMSAC_VV_TU_(PRECISION, LMUL) __riscv_vfnmsac_vv_f##PRECISION##LMUL##_tu
-#define VFNMSAC_VV_TU(PRECISION, LMUL) VFNMSAC_VV_TU_(PRECISION, LMUL)
 #define VFMADD_VF_(PRECISION, LMUL) __riscv_vfmadd_vf_f##PRECISION##LMUL
 #define VFMADD_VF(PRECISION, LMUL)  VFMADD_VF_(PRECISION, LMUL)
 #define VFMSUB_VF_(PRECISION, LMUL) __riscv_vfmsub_vf_f##PRECISION##LMUL
 #define VFMSUB_VF(PRECISION, LMUL) VFMSUB_VF_(PRECISION, LMUL)
+#define VFMAX_VV_(PRECISION, LMUL) __riscv_vfmax_vv_f##PRECISION##LMUL
+#define VFMAX_VV(PRECISION, LMUL) VFMAX_VV_(PRECISION, LMUL)
 #define VFNEG_VF_(PRECISION, LMUL) __riscv_vfneg_v_f##PRECISION##LMUL
 #define VFNEG_VF(PRECISION, LMUL)  VFNEG_VF_(PRECISION, LMUL)
+#define VFABS_V_(PRECISION, LMUL) __riscv_vfabs_v_f##PRECISION##LMUL
+#define VFABS_V(PRECISION, LMUL) VFABS_V_(PRECISION, LMUL)
+#define VMFEQ_VV_(PRECISION, LMUL, RATIO) __riscv_vmfeq_vv_f##PRECISION##LMUL##_b##RATIO
+#define VMFEQ_VV(PRECISION, LMUL, RATIO) VMFEQ_VV_(PRECISION, LMUL, RATIO)
+#define VMFNE_VV_(PRECISION, LMUL, RATIO) __riscv_vmfne_vv_f##PRECISION##LMUL##_b##RATIO
+#define VMFNE_VV(PRECISION, LMUL, RATIO) VMFNE_VV_(PRECISION, LMUL, RATIO)
+#define VMFGT_VV_(PRECISION, LMUL, RATIO) __riscv_vmfgt_vv_f##PRECISION##LMUL##_b##RATIO
+#define VMFGT_VV(PRECISION, LMUL, RATIO) VMFGT_VV_(PRECISION, LMUL, RATIO)
+#define VMFGE_VV_(PRECISION, LMUL, RATIO) __riscv_vmfge_vv_f##PRECISION##LMUL##_b##RATIO
+#define VMFGE_VV(PRECISION, LMUL, RATIO) VMFGE_VV_(PRECISION, LMUL, RATIO)
+#define VMERGE_VVM_F_(PRECISION, LMUL) __riscv_vmerge_vvm_f##PRECISION##LMUL
+#define VMERGE_VVM_F(PRECISION, LMUL) VMERGE_VVM_F_(PRECISION, LMUL)
 #define VFMV_V_V_(PRECISION, LMUL) VREINTERPRET_V_I_F(PRECISION, LMUL)(  __riscv_vmv_v_v_i##PRECISION##LMUL( VREINTERPRET_V_F_I(PRECISION, LMUL) CURRY_1ARG
 #define VFMV_V_V(PRECISION, LMUL) VFMV_V_V_(PRECISION, LMUL)
+#define VFMV_V_F_(PRECISION, LMUL) __riscv_vfmv_v_f_f##PRECISION##LMUL
+#define VFMV_V_F(PRECISION, LMUL) VFMV_V_F_(PRECISION, LMUL)
+
+#define VFMUL_VF_TU_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL##_tu
+#define VFMUL_VF_TU(PRECISION, LMUL) VFMUL_VF_TU_(PRECISION, LMUL)
+#define VFMACC_VV_TU_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL##_tu
+#define VFMACC_VV_TU(PRECISION, LMUL) VFMACC_VV_TU_(PRECISION, LMUL)
+#define VFMACC_VF_TU_(PRECISION, LMUL) __riscv_vfmacc_vf_f##PRECISION##LMUL##_tu
+#define VFMACC_VF_TU(PRECISION, LMUL) VFMACC_VF_TU_(PRECISION, LMUL)
+#define VFMSAC_VF_TU_(PRECISION, LMUL) __riscv_vfmsac_vf_f##PRECISION##LMUL##_tu
+#define VFMSAC_VF_TU(PRECISION, LMUL) VFMSAC_VF_TU_(PRECISION, LMUL)
+#define VFNMSAC_VV_TU_(PRECISION, LMUL) __riscv_vfnmsac_vv_f##PRECISION##LMUL##_tu
+#define VFNMSAC_VV_TU(PRECISION, LMUL) VFNMSAC_VV_TU_(PRECISION, LMUL)
+#define VFNMSAC_VF_TU_(PRECISION, LMUL) __riscv_vfnmsac_vf_f##PRECISION##LMUL##_tu
+#define VFNMSAC_VF_TU(PRECISION, LMUL) VFNMSAC_VF_TU_(PRECISION, LMUL)
+#define VFMAX_VV_TU_(PRECISION, LMUL) __riscv_vfmax_vv_f##PRECISION##LMUL##_tu
+#define VFMAX_VV_TU(PRECISION, LMUL) VFMAX_VV_TU_(PRECISION, LMUL)
+#define VFNEG_VF_TU_(PRECISION, LMUL) __riscv_vfneg_v_f##PRECISION##LMUL##_tu
+#define VFNEG_VF_TU(PRECISION, LMUL)  VFNEG_VF_TU_(PRECISION, LMUL)
 
 // 14. Vector Reduction Operations
+#define VREDMINU_VS_M_(PRECISION, LMUL) __riscv_vredminu_vs_u##PRECISION##LMUL##_u##PRECISION##m1_m
+#define VREDMINU_VS_M(PRECISION, LMUL) VREDMINU_VS_M_(PRECISION, LMUL)
 #define VF_REDUSUM_VS_(PRECISION, LMUL) __riscv_vfredusum_vs_f##PRECISION##LMUL##_f##PRECISION##m1
 #define VF_REDUSUM_VS(PRECISION, LMUL) VF_REDUSUM_VS_(PRECISION, LMUL)
+#define VFREDMAX_VS_(PRECISION, LMUL) __riscv_vfredmax_vs_f##PRECISION##LMUL##_f##PRECISION##m1
+#define VFREDMAX_VS(PRECISION, LMUL) VFREDMAX_VS_(PRECISION, LMUL)
+
+// 15. Vector Mask Operations
+#define VFIRST_M_(RATIO) __riscv_vfirst_m_b##RATIO
+#define VFIRST_M(RATIO) VFIRST_M_(RATIO)
+#define VID_V_(PRECISION, LMUL) __riscv_vid_v_u##PRECISION##LMUL
+#define VID_V(PRECISION, LMUL) VID_V_(PRECISION, LMUL)
 
 // 16. Vector Permutation Operations
-#define VFMV_S_F_(PRECISION, LMUL) __riscv_vfmv_s_f_f##PRECISION##LMUL
-#define VFMV_S_F(PRECISION, LMUL) VFMV_S_F_(PRECISION, LMUL)
+#define VMV_X_S_U_(PRECISION) __riscv_vmv_x_s_u##PRECISION##m1_u##PRECISION
+#define VMV_X_S_U(PRECISION) VMV_X_S_U_(PRECISION)
+#define VMV_S_X_U_(PRECISION, LMUL) __riscv_vmv_s_x_u##PRECISION##LMUL
+#define VMV_S_X_U(PRECISION, LMUL) VMV_S_X_U_(PRECISION, LMUL)
 #define VFMV_F_S_(PRECISION) __riscv_vfmv_f_s_f##PRECISION##m1_f##PRECISION
 #define VFMV_F_S(PRECISION) VFMV_F_S_(PRECISION)
+#define VFMV_S_F_(PRECISION, LMUL) __riscv_vfmv_s_f_f##PRECISION##LMUL
+#define VFMV_S_F(PRECISION, LMUL) VFMV_S_F_(PRECISION, LMUL)
+#define VRGATHER_VX_F_(PRECISION, LMUL) __riscv_vrgather_vx_f##PRECISION##LMUL
+#define VRGATHER_VX_F(PRECISION, LMUL) VRGATHER_VX_F_(PRECISION, LMUL)
 
 // Miscellaneous Vector Function
 #define VREINTERPRET_V_I_F_(PRECISION, LMUL) __riscv_vreinterpret_v_i##PRECISION##LMUL##_f##PRECISION##LMUL
 #define VREINTERPRET_V_I_F(PRECISION, LMUL) VREINTERPRET_V_I_F_(PRECISION, LMUL)
 #define VREINTERPRET_V_F_I_(PRECISION, LMUL) __riscv_vreinterpret_v_f##PRECISION##LMUL##_i##PRECISION##LMUL
 #define VREINTERPRET_V_F_I(PRECISION, LMUL) VREINTERPRET_V_F_I_(PRECISION, LMUL)
-#define VGET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vget_v_f##PRECISION##LMUL##x##NFIELDS##_f##PRECISION##LMUL
-#define VGET_V_F(PRECISION, LMUL, NFIELDS) VGET_V_F_(PRECISION, LMUL, NFIELDS)
+#define VLMUL_EXT_V_F_M1_(PRECISION, LMUL) __riscv_vlmul_ext_v_f##PRECISION##m1##_f##PRECISION##LMUL
+#define VLMUL_EXT_V_F_M1(PRECISION, LMUL) VLMUL_EXT_V_F_M1_(PRECISION, LMUL)
+#define VUNDEFINED_FX_(PRECISION, LMUL, NFIELDS) __riscv_vundefined_f##PRECISION##LMUL##x##NFIELDS
+#define VUNDEFINED_FX(PRECISION, LMUL, NFIELDS) VUNDEFINED_FX_(PRECISION, LMUL, NFIELDS)
 #define VSET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vset_v_f##PRECISION##LMUL##_f##PRECISION##LMUL##x##NFIELDS
 #define VSET_V_F(PRECISION, LMUL, NFIELDS) VSET_V_F_(PRECISION, LMUL, NFIELDS)
+#define VGET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vget_v_f##PRECISION##LMUL##x##NFIELDS##_f##PRECISION##LMUL
+#define VGET_V_F(PRECISION, LMUL, NFIELDS) VGET_V_F_(PRECISION, LMUL, NFIELDS)
+#define VCREATE_V_FX_(PRECISION, LMUL, NFIELDS) __riscv_vcreate_v_f##PRECISION##LMUL##x##NFIELDS
+#define VCREATE_V_FX(PRECISION, LMUL, NFIELDS) VCREATE_V_FX_(PRECISION, LMUL, NFIELDS)
 
 // Non-vector functions
 #define CURRY_1ARG(arg1, ...) (arg1), __VA_ARGS__))
diff --git a/travis/do_riscv.sh b/travis/do_riscv.sh
index 56c2b85c2..82b6afee6 100755
--- a/travis/do_riscv.sh
+++ b/travis/do_riscv.sh
@@ -3,7 +3,7 @@
 set -e
 set -x
 
-TAG=2023.10.18
+TAG=2024.08.03
 
 # The prebuilt toolchains only support hardfloat, so we only
 # test these for now.

From 12f2efa7dfe11a684d62af02592499d91b7e344b Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 29 Nov 2024 02:16:48 +0200
Subject: [PATCH 204/230] Add complex return detection for nvfortran (#765)

Details:
- Search for Intel ifx and NVIDIA/PGI Fortran compilers.
- Correctly determine the Fortran compiler vendor for Intel ifx and NVIDIA/PGI compilers.
- Determine the compiler version and correct Fortran complex return type for NVIDIA/PGI.
---
 configure | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/configure b/configure
index 22a143cf2..676c3b0ad 100755
--- a/configure
+++ b/configure
@@ -1221,7 +1221,7 @@ get_fc_search_list()
 {
 	local list
 
-	list="gfortran ifort"
+	list="gfortran ifort ifx nvfortran"
 
 	echo "${list}"
 }
@@ -4229,11 +4229,37 @@ blis_main()
 			# Query the compiler "vendor" (ie: the compiler's simple name).
 			# The last part ({ read first rest ; echo $first ; }) is a workaround
 			# to OS X's egrep only returning the first match.
-			fc_vendor=$(echo "${vendor_string}" | grep -oE 'IFORT|GNU' |
+			fc_vendor=$(echo "${vendor_string}" | grep -oE 'IFORT|IFX|GNU|NVIDIA|PGI' |
 			            { read -r first rest ; echo "${first}"; })
 
-			if [[ ${fc_vendor} = IFORT ]]; then
+			if [[ ${fc_vendor} = IFORT || ${fc_vendor} = IFX ]]; then
 				complex_return='intel'
+			elif [[ ${fc_vendor} = NVIDIA || ${fc_vendor} = PGI ]]; then
+				# nvfortran returns complex values using the 'intel'
+				# convention on x86_64, and on aarch64 prior to 23.9.
+				# It uses the 'gnu' convention on ppc64le, and on
+				# aarch64 starting with 23.9.
+				if [[ "$(uname -m)" = "aarch64" ]]; then
+					fc_version=$(echo "${vendor_string}" \
+					             | grep -oE '[0-9]+\.[0-9]+\.?[0-9]*' \
+					             | { read -r first rest ; echo "${first}"; })
+					# Split on the dots rather than at fixed offsets so that
+					# 23.9, 23.11, and 23.9.0 are all parsed correctly.
+					fc_major=${fc_version%%.*}
+					fc_minor=${fc_version#*.}
+					fc_minor=${fc_minor%%.*}
+					if [[ ${fc_major} -lt 23 ]]; then
+						complex_return='intel'
+					elif [[ ${fc_major} -eq 23 && ${fc_minor} -lt 9 ]]; then
+						complex_return='intel'
+					else
+						complex_return='gnu'
+					fi
+				elif [[ "$(uname -m)" = "ppc64le" ]]; then
+					complex_return='gnu'
+				else
+					complex_return='intel'
+				fi
 			elif [[ ${fc_vendor} = GNU ]]; then
 				complex_return='gnu'
 			else

From 5cb70d8ea12295d1d12cd23afb9895b4f7b1afff Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Tue, 14 Jan 2025 17:16:12 -0600
Subject: [PATCH 205/230] Add documentation for plugins (#820)

Add documentation for the plugin system and for modifying the control tree to make custom operations.

Details:
- `docs/PluginHowTo.md` describes in a "tutorial style" how to implement a custom BLAS-like operation by creating a plugin and then modifying the `gemm` control tree to achieve the desired effect.
- Briefly, plugins allow users to add new kernels and associated block sizes/preferences to BLIS without modifying the BLIS source code. User-provided kernels are compiled using the BLIS build system for configured architectures and selected at runtime based on the actual hardware.
- To implement custom operations, users can combine their own kernels (and/or existing BLIS kernels) with a customized control tree, which represents the specific algorithmic steps. Users can customize the kernels to be used for packing and for computation, extra information passed to kernels (e.g. additional parameters or data), block sizes, etc. An API is provided for modifying the default `gemm` control tree (also used for other level-3 operations, except `trsm`).
---
 README.md                              |   19 +
 docs/PluginHowTo.md                    | 1299 ++++++++++++++++++++++++
 docs/diagrams/mmbp_algorithm_color.png |  Bin 0 -> 28861 bytes
 3 files changed, 1318 insertions(+)
 create mode 100644 docs/PluginHowTo.md
 create mode 100644 docs/diagrams/mmbp_algorithm_color.png

diff --git a/README.md b/README.md
index 05104b976..676bd6951 100644
--- a/README.md
+++ b/README.md
@@ -106,6 +106,18 @@ all of which are available for free via the [edX platform](http://www.edx.org/).
 What's New
 ----------
 
+ * **Plugin feature now available!** BLIS addons (see below) provided a way to
+quickly extend BLIS's operation support or define new custom BLIS APIs for your application.
+BLIS plugins extend this support to completely external code, needing only an installed BLIS
+package (no source required). BLIS plugins also allow users to define their own kernels
+and blocksizes, combined with the cross-architecture support provided by the BLIS framework.
+Finally, user plugins can utilize the new API for modifying the BLIS "control tree" which
+defines the mathematical operation to be computed, as well as information controlling packing,
+partitioning, etc. Users can now modify the control tree to implement new linear algebra
+operations not already included in BLIS. See the [documentation](docs/PluginHowTo.md) for
+an overview of these features and step-by-step guides for creating plugins and modifying
+the control tree to implement an example operation "SYRKD".
+
  * **BLIS selected for the 2023 James H. Wilkinson Prize for Numerical Software!** We
 are thrilled to announce that Field Van Zee and Devin Matthews were chosen to receive
 the [2023 James H. Wilkinson Prize for Numerical Software](https://www.siam.org/prizes-recognition/major-prizes-lectures/detail/james-h-wilkinson-prize-for-numerical-software).
@@ -529,6 +541,13 @@ use the multithreading features of BLIS.
 overview of BLIS's mixed-datatype functionality and provides a brief example
 of how to take advantage of this new code.
 
+ * **[Extending BLIS functionality](docs/PluginHowTo.md).** This document provides an
+overview of BLIS's mechanisms for extending functionality through user-defined code.
+BLIS has a plugin infrastructure which allows users to define their own kernels,
+blocksizes, and kernel preferences which are compiled and managed by the BLIS framework.
+BLIS also provides an API for modifying the "control tree" which can be used to
+implement novel linear algebra operations.
+
  * **[Performance](docs/Performance.md).** This document reports empirically
 measured performance of a representative set of level-3 operations on a variety
 of hardware architectures, as implemented within BLIS and other BLAS libraries
diff --git a/docs/PluginHowTo.md b/docs/PluginHowTo.md
new file mode 100644
index 000000000..e9f6b93fd
--- /dev/null
+++ b/docs/PluginHowTo.md
@@ -0,0 +1,1299 @@
+# Contents
+
+* **[Introduction](PluginHowTo.md#introduction)**
+  * **[Example Plugin](PluginHowTo.md#example-plugin)**
+  * **[Creating a New Plugin](PluginHowTo.md#creating-a-new-plugin)**
+  * **[Building a Plugin](PluginHowTo.md#building-a-plugin)**
+* **[Kernels](PluginHowTo.md#kernels)**
+  * **[Accessing Kernels](PluginHowTo.md#accessing-kernels)**
+  * **[Reference Kernels](PluginHowTo.md#reference-kernels)**
+  * **[Optimized Kernels](PluginHowTo.md#optimized-kernels)**
+  * **[Mapping Kernels to Subconfigurations](PluginHowTo.md#mapping-kernels-to-subconfigurations)**
+* **[Custom Operations](PluginHowTo.md#custom-operations)**
+  * **[Example: `bli_gemmt_ex`](PluginHowTo.md#example-bli_gemmt_ex)**
+  * **[The Control Tree](PluginHowTo.md#the-control-tree)**
+  * **[Modifying the Control Tree](PluginHowTo.md#modifying-the-control-tree)**
+  * **[Modifications to Blocking](PluginHowTo.md#modifications-to-blocking)**
+  * **[Modifications to Packing](PluginHowTo.md#modifications-to-packing)**
+  * **[Modifications to Computation](PluginHowTo.md#modifications-to-computation)**
+  * **[SYRKD](PluginHowTo.md#syrkd)**
+<!--
+* **[API Reference](PluginHowTo.md#api-reference)**
+  * **[Registration](PluginHowTo.md#registration)**
+  * **[Helper Functions](PluginHowTo.md#helper-functions)**
+  * **[Context Initialization](PluginHowTo.md#context-initialization)**
+  * **[Context Query](PluginHowTo.md#context-query)**
+  * **[Control tree modification](PluginHowTo.md#control-tree-modification)**
+-->
+
+# Introduction
+
+A BLIS plugin is a piece of user-defined code that provides additional linear algebra functionality, but leverages BLIS's internal framework for high performance. Through a plugin, users can:
+
+* Provide customized or optimized [kernels](PluginHowTo.md#kernels), and access internal BLIS kernels.
+* Define new, custom linear algebra [operations](PluginHowTo.md#custom-operations) which extend the level-3 BLAS (for example, `GEMM`).
+
+Plugins are defined completely externally to BLIS (that is, the BLIS source code is not required). However, an installed copy of BLIS 2.0 or later is required (assumed installed to `$PREFIX`) in order to configure or build a plugin. Building a plugin then results in a shared and/or static library which can be distributed or linked into your code. The template and example files generated by BLIS are all in C99, but C++ is also supported.
+
+## Example Plugin
+
+A new plugin is created by running `$PREFIX/share/blis/configure-plugin <name>`, where `<name>` is the name you wish to give to the plugin which must be a valid C99 identifier. By default, this generates a fully-functioning example plugin containing the following files:
+
+<normal><pre><code>├─ [Makefile](PluginHowTo.md#makefile) **
+├─ [config.mk](PluginHowTo.md#configmk) **
+├─ [config_registry](PluginHowTo.md#config_registry) **
+├─ [bli_plugin_\<name\>.h](PluginHowTo.md#bli_plugin_nameh)
+├─ [bli_plugin_register.c](PluginHowTo.md#bli_plugin_registerc)
+├─ config
+│  ├─ \<config_1\>
+│  ├─ ...
+│  └─ \<config_n\>
+│     ├─ [bli_kernel_defs_\<config\>.h](PluginHowTo.md#configconfigbli_kernel_defs_configh)
+│     ├─ [bli_plugin_init_\<config\>.c](PluginHowTo.md#configconfigbli_plugin_init_configc)
+│     └─ [make_defs.mk](PluginHowTo.md#configconfigmake_defsmk)
+├─ ref_kernels
+│  ├─ [bli_plugin_init_ref.c](PluginHowTo.md#ref_kernelsbli_plugin_init_refc)
+│  ├─ [my_kernel_1_ref.c](PluginHowTo.md#ref_kernelsmy_kernel_1_refc-and-my_kernel_2_refc) *
+│  └─ [my_kernel_2_ref.c](PluginHowTo.md#ref_kernelsmy_kernel_1_refc-and-my_kernel_2_refc) *
+├─ kernels
+│  ├─ \<arch_1\>
+│  ├─ ...
+│  └─ zen3
+│     └─ [my_kernel_1_zen3.c](PluginHowTo.md#kernelszen3my_kernel_1_zen3c) *
+└─ obj **
+&nbsp;  └─ [\<config\>](PluginHowTo.md#objconfig) **
+</code></pre></normal>
+
+Files marked with `*` (and some portions of other files) are for example only and can be omitted by passing the `--disable-examples` flag to `configure-plugin`. Files and directories marked with `**` are only required when you are ready to build the plugin and can be disabled with `--disable-build`. The remaining files and directories constitute the plugin "template". If you want to later generate only build files then these files (which presumably already exist) can be skipped with `--disable-templates`.
+
+#### `Makefile`
+
+The Makefile for your plugin is automatically generated by `configure-plugin` and should not be modified. Targets `make` and `make clean` are supported and will build your plugin based on the flags given during configuration.
+
+#### `config.mk`
+
+This file is also generated by `configure-plugin` and should not need to be modified.
+
+#### `config_registry`
+
+This file provides the mapping from kernel sets to subconfigurations and configuration families. See [Mapping Kernels to Subconfigurations](PluginHowTo.md#mapping-kernels-to-subconfigurations) for more details.
+
+#### `bli_plugin_<name>.h`
+
+This file is the main header for the plugin. It should be `#include`d in order to use the functionality provided by the plugin. ***Note:*** the name and contents of this header are a suggestion---feel free to structure your plugin however you like!
+
+The example file contains several sections:
+
+* Macros defining arguments to be passed to the registration functions. The example given uses externally-provided arrays to store the generated kernel, blocksize, and preference IDs. Many alternative strategies are possible, e.g. passing a struct, passing individual pointers/references to IDs, or using global variables and passing no arguments (defining these macros to be empty). You can also pass in any other arguments you might need during registration. Macros are preferred to define the parameters since the parameter list is used in several different files and in generated code.
+
+* Enumerations providing convenient names by which kernel/blocksize/preference IDs can be obtained. In the example, these are offsets into the arrays passed into the `bli_plugin_register_<name>`. So, calling code could look up the kernel ID for kernel #2 as `kerids[MY_KERNEL_2]`. This section is entirely optional if you prefer a different way of accessing kernel IDs.
+
+* Prototypes for kernels. A prototype (and preferably a typedef) is recommended for each kernel you write so that you can provide type safety when calling kernels. Note that both kernels are assumed to have reference implementations (one for each enabled subconfiguration, expanded using the `INSERT_GENTCONF` macro to generate prototypes automatically), while a special "optimized" kernel #2 is available for double-precision operations on Zen 3 hardware. The latter prototype is given only for example---your plugin code would not need to know whether or not an optimized kernel is available and would only need to look up kernels by ID. The file `config/zen3/bli_plugin_init_zen3.c` handles registering this optimized kernel so that it can be automatically selected when running on Zen 3.
+
+* Prototypes for the plugin registration function (`bli_plugin_register_<name>`) and configuration-specific initialization functions. The former function can be named and structured however you like, but we recommend keeping the latter (configuration-specific) functions as-is.
+
+#### `bli_plugin_register.c`
+
+This file implements the function `bli_plugin_register_<name>` and illustrates how to register new kernels, along with associated blocksizes and kernel preferences. Each registration function generates a new, unique ID which must be saved and communicated to the rest of the plugin (for example, via global variables or arguments passed in to the function `bli_plugin_register_<name>`) so that they can be used later. This function also calls `bli_plugin_register_<name>_<config>` for each architecture which was enabled at configure time (see [`bli_plugin_init_<config>.c`](PluginHowTo.md#configconfigbli_plugin_init_configc)).
+
+Any code using the plugin should call this function (which you can rename if you like) before making use of any plugin functionality.
+
+#### `config/<config>/bli_kernel_defs_<config>.h`
+
+This file provides macros specific to one subconfiguration, such as the register blocksizes for the BLIS `GEMM` microkernel. You can add any macros or other definitions here that you want to be available to any code being compiled for the corresponding subconfiguration. Note that configuration families (e.g. `x86_64`) supersede individual subconfigurations.
+
+#### `config/<config>/bli_plugin_init_<config>.c`
+
+This file initializes the "context" with any kernels, blocksizes, or kernel preferences which are optimized for the corresponding subconfiguration. It also calls the reference initialization function in [`ref_kernels/bli_plugin_init_ref.c`](PluginHowTo.md#ref_kernelsbli_plugin_init_refc) for the matching configuration. A full example is given for the `zen3` subconfiguration. If no optimized kernels have been written for a particular subconfiguration, then no modifications are necessary. See [Mapping Kernels to Subconfigurations](PluginHowTo.md#mapping-kernels-to-subconfigurations) for more information about how optimized kernels and subconfigurations are related.
+
+#### `config/<config>/make_defs.mk`
+
+This file contains additional build variables or compiler-/architecture-specific flags for each subconfiguration. Typically these files should not be modified in order to achieve the best performance and maintain compatibility with BLIS.
+
+#### `ref_kernels/bli_plugin_init_ref.c`
+
+This file handles initialization of the context with [reference](PluginHowTo.md#reference-kernels) kernels. This file is compiled once for each enabled subconfiguration, resulting in functions `bli_plugin_init_<name>_<config>_ref`. Whenever you add a new reference kernel, blocksize, or kernel preference, you must also add code to initialize it here.
+
+#### `ref_kernels/my_kernel_1_ref.c` and `my_kernel_2_ref.c`
+
+These are example reference kernels. Note that the kernels are instantiated for the four standard datatypes (single and double precision, for both real and complex domains), indicated by the letters `sdcz`. Your kernels can use the same macros to help with instantiation of different types (or combinations of types), or you can use a different mechanism such as C++ templates.
+
+#### `kernels/zen3/my_kernel_1_zen3.c`
+
+This is an example optimized kernel. Typically optimized kernels are written with a specific data type or combination of data types in mind. In this example, only a double-precision real version is implemented, specifically for the Zen 3 architecture.
+
+#### `obj/<config>`
+
+This folder will contain the built object files and static and/or shared library for the plugin. Only one sub-folder is created corresponding to the configuration for which BLIS was built.
+
+## Creating a New Plugin
+
+To create a "blank" plugin without any build files or example code, execute `$PREFIX/share/blis/configure-plugin --init <name>` in the directory where you want the plugin to exist. At this point, you can start adding your own:
+
+* Kernels, [see below](PluginHowTo.md#kernels) for more details
+  1. Create a reference kernel. The file must be in the `ref_kernels` directory in order to be compiled correctly. Your kernel can have any name and interface, but should ideally be implemented for all supported data types and should be architecture-agnostic.
+  2. Register your kernel in the `bli_plugin_register.c` file.
+  3. Initialize the context with pointer(s) to your reference kernel in the `ref_kernels/bli_plugin_init_ref.c` file.
+  4. [Optional] implement optimized versions in the appropriate `kernels/<arch>` directories, and initialize them in `config/<config>/bli_plugin_init_<config>.c`.
+* Blocksizes
+  1. Register the blocksizes in `bli_plugin_register.c`.
+  2. Provide default values in `ref_kernels/bli_plugin_init_ref.c`. All data types should be given a default value.
+  3. [Optional] provide values for configuration-specific optimized implementations in `config/<config>/bli_plugin_init_<config>.c`.
+* Kernel preferences
+  1. Register the kernel preferences in `bli_plugin_register.c`.
+  2. Provide default values in `ref_kernels/bli_plugin_init_ref.c`. All data types should be given a default value.
+  3. [Optional] provide values for configuration-specific optimized implementations in `config/<config>/bli_plugin_init_<config>.c`.
+
+You will also need to provide a way to get registered kernel/blocksize/preference IDs back to your code by filling in the `plugin_<name>_params` and `plugin_<name>_params_only` macros in `bli_plugin_<name>.h`, saving to global variables, etc.
+
+## Building a Plugin
+
+Before building your kernel on a particular system, you must reconfigure to build using `$PREFIX/share/blis/configure-plugin --build [<name>]` in the plugin directory. Note that you do not need to provide the plugin name if it can be guessed from the name of `bli_plugin_<name>.h`. There are several flags which can be used to control how your plugin will be built:
+
+| Flag                  | Explanation |
+|-----------------------|-------------|
+| -p PATH,<br>--path=PATH  | Look for the plugin source in PATH instead of the current directory. This option is used to build the plugin out-of-tree. |
+| -e SYMBOLS,<br>--export-shared[=SYMBOLS] | Specify the subset of library symbols that are exported within a shared library. Valid values for SYMBOLS are: 'public' (the default) and 'all'. By default, only functions and variables that belong to public APIs are exported in shared libraries. However, the user may instead export all symbols in BLIS, even those that were intended for internal use only. Note that the public APIs encompass all functions that almost any user would ever want to call, including the BLAS/CBLAS compatibility APIs as well as the basic and expert interfaces to the typed and object APIs that are unique to BLIS. Also note that changing this option to 'all' will have no effect in some environments, such as when compiling with clang on Windows. |
+| --enable-rpath,<br>--disable-rpath | Enable (disabled by default) setting an install_name for dynamic libraries on macOS which starts with @rpath rather than the absolute install path. |
+| --disable-shared,<br>--enable-shared | Disable (enabled by default) building BLIS as a shared library. If the shared library build is disabled, the static library build must remain enabled. |
+| --disable-static,<br>--enable-static | Disable (enabled by default) building BLIS as a static library. If the static library build is disabled, the shared library build must remain enabled. |
+| -d DEBUG, --enable-debug[=DEBUG] | Enable debugging symbols in the library. If argument DEBUG is given as 'opt', then optimization flags are kept in the framework, otherwise optimization is turned off. |
+| --enable-verbose-make,<br>--disable-verbose-make | Enable (disabled by default) verbose compilation output during make. |
+| -f, --force | Overwrite any files in the current directory which are normally copied by configure-plugin, for example 'Makefile' and 'config_registry'. |
+| --enable-asan,<br>--disable-asan | Enable (disabled by default) compiling and linking BLIS framework code with the AddressSanitizer (ASan) library. Optimized kernels are NOT compiled with ASan support due to limitations of register assignment in inline assembly. WARNING: ENABLING THIS OPTION WILL NEGATIVELY IMPACT PERFORMANCE. Please use only for informational/debugging purposes. |
+| --enable-arg-max-hack<br>--disable-arg-max-hack | Enable (disabled by default) build system logic that will allow archiving/linking the static/shared library even if the command plus command line arguments exceeds the operating system limit (ARG_MAX). |
+
+After configuring, you can now build using `make`. **Your plugin is always built for the same subconfiguration or configuration family that BLIS was.** This means that build configuration should ideally be done on the target system, unless you are using an installation of BLIS which is configured for a "fat build" for a full configuration family, such as `x86_64`. The final shared and/or static library is available in the `obj/<config>` directory, where `<config>` is the configuration that BLIS and your plugin are built for.
+
+# Kernels
+
+Kernels are the high-performance pieces of code at the heart of BLIS. A kernel usually does one simple computational operation on one or more input matrices, vectors, or scalars. For example, one of the workhorse kernels in BLIS is the `GEMM` microkernel, which computes a small matrix multiplication of `MR*k` and `k*NR` matrices, where `MR` and `NR` are constants depending on the architecture. You can write kernels which are intended to replace or extend existing BLIS kernels, or for any other operation which you might encounter in your code which needs a high-performance, architecture-specific solution.
+
+The BLIS plugin architecture supports two types of user-supplied kernels: reference kernels and optimized kernels. The former type of kernel is coded once (typically in standard C or C++), and compiled separately for any architecture which might be encountered. Then, at runtime BLIS will select the appropriate version of the kernel for the current hardware. Reference kernels typically do not achieve the highest performance, but are useful for less performance-sensitive operations such as data movement (which is bandwidth limited and not FLOP limited). For performance-critical kernels, you can additionally provide optimized kernels. These kernels are specific to one hardware architecture or family of related architectures, and are also often datatype-specific. These kernels also often employ compiler intrinsics or inline assembly which is not portable. If you provide an optimized kernel for a hardware architecture which is detected at runtime, BLIS will automatically select this kernel in preference to the reference kernel.
+
+In addition to kernels, BLIS plugins support providing blocksizes (for example, the `MR` and `NR` parameters above) as well as kernel preferences (essentially, the logical true/false equivalent of blocksizes) which control or define the behavior of kernels. These too are looked-up based on the actual hardware encountered at runtime, and come in reference (essentially, default) and optimized flavors. While internal BLIS kernels endeavor to operate correctly for any kind of input (although they work most efficiently for inputs which conform to the corresponding block sizes and preferences), your kernels are not required to support arbitrary inputs or parameters. You only have to provide the functionality that you know you will need!
+
+## Accessing Kernels
+
+Kernels, blocksizes, and kernel preferences are accessed through the "context", which reflects the kernel set available for the hardware on which BLIS is running. Initially, kernels and their parameters must be registered. This creates a slot in the context to hold pointers, blocksizes, or other data, and then returns a unique ID. Next, this slot must be filled with user-supplied data (pointers to reference kernels, default blocksizes, etc.), using the supplied IDs. If optimized kernels or parameters are available, these are then written over the reference data. All of these steps happen during plugin registration which must happen before any computations are performed with the plugin (although BLIS itself can be used). Finally, at any point after plugin registration, the current context can be obtained and then queried using the unique IDs:
+
+```C++
+const cntx_t* cntx = bli_gks_query_cntx();
+
+my_fun_ptr kernel = ( my_fun_ptr )bli_cntx_get_ukr_dt( BLIS_DOUBLE, MY_KERNEL_ID, cntx );
+
+kernel(...);
+```
+
+The process for registering and initializing kernels is detailed below.
+
+## Reference Kernels
+
+A reference kernel must first be registered. This should happen in `bli_plugin_register_<name>` defined in `bli_plugin_register.c` (although you can change the function and file names):
+
+```C++
+err_t err;
+kerid_t id;
+
+err = bli_gks_register_ukr( &id );
+if ( err != BLIS_SUCCESS )
+    //handle error
+```
+
+Note that for registration we don't need to know anything about the actual kernel yet. Next, the pointers to the reference kernels must be supplied in the file `ref_kernels/bli_plugin_init_ref.c` (again, you can change the filename, but it must reside in `ref_kernels`, and it is not recommended to change the function name or signature since this must match `bli_plugin_register.c` and is generated automatically for each subconfiguration):
+
+```C++
+func_t ptrs;
+gen_func_init( &ptrs, PASTECH(my_kernel,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX) );
+bli_cntx_set_ukr( MY_KERNEL_ID, &ptrs, cntx );
+```
+
+The `func_t` struct contains a function pointer for each data type. In this example the helper macro `gen_func_init` is used to automatically generate the correct symbol name for each type and for the current subconfiguration (since this file is compiled once for each enabled subconfiguration). It is strongly recommended to use the provided macros and naming convention for reference kernels. However, you are free to use any method you like to fill the entries of the `func_t` struct, *with pointers to the reference function of the correct type and for the correct subconfiguration*. The kernel is now fully initialized and can be used safely on any hardware which BLIS was configured for.
+
+## Optimized Kernels
+
+If an optimized kernel implementation is available (as a function in a file in some  `kernels/<arch>` folder), it should be initialized in the appropriate file `config/<config>/bli_plugin_init_<config>.c`. For example:
+
+```C++
+bli_cntx_set_ukrs
+(
+  cntx,
+
+  MY_KERNEL_ID, BLIS_DOUBLE, bli_dmy_kernel_zen3,
+
+  BLIS_VA_END
+);
+```
+
+Here, it is not necessary to provide an optimized implementation for all datatypes. The automatically-generated template code and build system will handle building the correct files and calling the initialization functions for subconfigurations which are enabled in the BLIS installation you are using. So, you can simply provide optimized implementations for any hardware which is important to you and it will be picked up and used if possible.
+
+## Mapping Kernels to Subconfigurations
+
+It may seem strange that optimized kernel implementations are written in the `kernels` folder, but are initialized in the `config` folder. In fact, the sub-folders of these two directories are not even the same! This is because in BLIS, multiple *subconfigurations* (roughly mapping to specific hardware architectures), as well as *configuration families* (for example, all `x86_64` architectures), can use kernels from one (or more) of the folders in `kernels`, called *kernel sets*. The mapping from kernel sets to configurations is defined by the `config_registry` file. Essentially, this means that when adding an optimized kernel, you should initialize the kernel in each configuration which maps the kernel set where you defined the kernel. Conversely, this also means that if you define the kernel in a kernel set which is not mapped by any enabled configuration, then the kernel will not exist and linking will fail.
+
+By default, this file contains the mapping known by BLIS at the time of plugin creation. Thus, it might be a good idea to periodically reconfigure your plugin in order to pick up new `config` or `kernels` sub-folders and entries in `config_registry`. Instead, or in addition, you can define your own mappings in `config_registry` to reflect how your particular kernels should be used. *Note that this mapping only affects kernels in your plugin, and does not affect reference kernels.* See [here](ConfigurationHowTo.md) for more information on subconfigurations, configuration families, and mapping of kernel sets.
+
+# Custom Operations
+
+BLIS is written as a framework, meaning that user-written code can be inserted in order to achieve new functionality. For example, consider the mathematical operation $\mathop{\text{tri}}(C) := \mathop{\text{tri}}(\alpha A D A^T + \beta C)$ where $D$ is a diagonal matrix and the function `tri` operates only on the upper or lower part of a matrix. If $D$ were the identity matrix, then this would be a standard level-3 BLAS operation, `SYRK`, so we call this BLAS-like operation `SYRKD`. While it is technically not necessary to use the plugin infrastructure to implement `SYRKD` using BLIS, extending BLAS operations typically requires new kernels which are conveniently managed as a plugin. However, the code discussed in this section does not need to exist in the plugin directory (although it can be placed in the top-level plugin directory) but should have access to the kernel, blocksize, and kernel preference IDs registered by the plugin.
+
+Because $A D A^T = A (A D)^T = (A D) A^T$, it is actually even more closely related to the operation `GEMMT`, which implements $\mathop{\text{tri}}(C) := \mathop{\text{tri}}(\alpha \mathop{\text{trana}}(A) \mathop{\text{tranb}}(B) + \beta C)$ where the functions `trana` and `tranb` optionally transpose the operand. Essentially, this is just `GEMM` where we know the result will in fact be symmetric even though $A \ne B^T$. Then we can see that `SYRKD` is the same thing as `GEMMT` with $B = AD$, $\mathop{\text{trana}}(A)=A$ and $\mathop{\text{tranb}}(B)=B^T$. So, let's implement `SYRKD` by:
+
+1. Starting with the high-level code which defines `GEMMT`.
+
+2. Writing a kernel to handle the multiplication $A D$ when packing the "virtual" matrix $B$.
+
+3. Modifying the `GEMMT` operation to use our custom packing kernel.
+
+4. Supplying additional data so that the packing kernel can address $D$ (in addition to $A$ which is passed as a normal parameter of `GEMMT`).
+
+## Example: `bli_gemmt_ex`
+
+Consider the following code which implements `GEMMT`:
+
+```C
+/*
+ * Step 0:
+ */
+void bli_gemmt_ex
+     (
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+       const rntm_t* rntm
+     )
+{
+    /*
+     * Step 1: Make sure BLIS is initialized.
+     */
+    bli_init_once();
+
+    /*
+     * Step 2: Check the operands for consistency and check for cases where
+     *         we can exit early (alpha = 0, m = 0, etc.).
+     */
+    if ( bli_error_checking_is_enabled() )
+        bli_gemmt_check( alpha, a, b, beta, c, cntx );
+
+    if ( bli_l3_return_early_if_trivial( alpha, a, b, beta, c ) == BLIS_SUCCESS )
+        return;
+
+    /*
+     * Step 3: Determine if we can and should use the 1m method for
+     *         cases with all complex operands.
+     */
+    num_t dt = bli_obj_dt( c );
+    ind_t im = BLIS_NAT;
+
+    if ( bli_obj_dt( a ) == bli_obj_dt( c ) &&
+         bli_obj_dt( b ) == bli_obj_dt( c ) &&
+         bli_obj_is_complex( c ) )
+        // Usually BLIS_NAT if a complex microkernel is available,
+        // otherwise BLIS_1M.
+        im = bli_gemmtind_find_avail( dt );
+
+    /*
+     * Step 4: Alias A, B, and C so that we have local mutable copies and
+     *         to take care of implicit transpose, sub-matrix references,
+     *         etc.
+     */
+    obj_t a_local;
+    obj_t b_local;
+    obj_t c_local;
+    bli_obj_alias_submatrix( a, &a_local );
+    bli_obj_alias_submatrix( b, &b_local );
+    bli_obj_alias_submatrix( c, &c_local );
+
+    /*
+     * Step 5: Create a "default" control tree.
+     */
+    if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+    gemm_cntl_t cntl;
+    bli_gemm_cntl_init
+    (
+      im,
+      BLIS_GEMMT,
+      alpha,
+      &a_local,
+      &b_local,
+      beta,
+      &c_local,
+      cntx,
+      &cntl
+    );
+
+    /*
+     * Step 6: Execute the control tree in parallel.
+     */
+    bli_l3_thread_decorator
+    (
+      &a_local,
+      &b_local,
+      &c_local,
+      cntx,
+      ( cntl_t* )&cntl,
+      rntm
+    );
+}
+```
+
+### Step 0: Function signature
+
+The function name and signature are entirely up to you. Your function can take `obj_t`s as parameters, but you can also construct `obj_t`s internally based on whatever function parameters you define (see Step 4 for more on this).
+
+For `SYRKD`, we would need to add a `const obj_t* D` parameter, and the `const obj_t* B` parameter can be removed since we know that $B = A^T$.
+
+### Step 1: Initialize BLIS
+
+This step is mandatory and must be done before calling any other BLIS APIs used here. Most BLIS API calls (like `bli_gemm`) check for initialization themselves, but the control tree and thread decorator APIs do not.
+
+### Step 2: Error and early exit checks
+
+BLIS has some standard checks for typical level-3 BLAS operations, as well as checks for conditions which enable an early exit. You may use these functions, but for new operations you may need to include additional checks. Also note that `bli_l3_return_early_if_trivial` assumes that `C` is a dense matrix and will attempt to scale by `beta` if exiting early. If `C` refers to a matrix-like object with alternative layout then you will need to check for early exit conditions manually.
+
+For example, when implementing `SYRKD`, all of the checks done by `bli_gemmt_check` are still relevant (relative size/shape of `A`, `B`, `C`, triangular `C`, etc.). We would also want to check that `C` is symmetric, and that `D` is a vector and has the correct length. In a complex Hermitian version (e.g. `HERKD`) we might also want to enforce that `D` is real. Because `C` is a normal, dense matrix, we can also call `bli_l3_return_early_if_trivial` safely.
+
+### Step 3: Check for 1m or "natural" complex execution
+
+This step is optional. If your operation doesn't support complex operands, or if you don't want to support the 1m method (which requires additional kernels, see below), then you can always default to `BLIS_NAT` as the execution method.
+
+The functions `bli_XXXind_find_avail` are likely not useful for custom code, but you can check if an optimized complex-domain `GEMM` microkernel is available by using:
+
+```C
+bool c_optimized = ! bli_gks_cntx_ukr_is_ref( BLIS_SCOMPLEX, BLIS_GEMM_UKR, cntx );
+```
+
+Note that if the complex-domain `GEMM` microkernel is not optimized then using `BLIS_NAT` may decrease performance.
+
+### Step 4: Alias local matrices
+
+Your function may or may not operate on normal, dense matrices represented as BLIS `obj_t`s. If so, then code similar to that used for `GEMMT` is recommended, since it handles implicit transposition of the operands, cases where a sub-matrix of a larger matrix is indicated, etc.
+
+**If instead you are using "matrix-like" operands, then you will still need to construct an `obj_t`.** In this case, the `obj_t` simply indicates the size and shape of the object when logically viewed as a matrix. For example, a dense tensor can be mapped onto a matrix by collecting some tensor dimensions as the "rows" and the remaining dimensions as the "columns". The locations of elements are determined by the tensor strides, ordering of dimensions within rows and columns, etc. and does not translate directly to a matrix row-major or column-major layout. This is OK! BLIS will simply keep track of the matrix partitioning, and as long as you provide a custom packing kernel which knows *how* to access the data, BLIS will tell your kernel *what* data (logical sub-matrix) to pack. The same concept applies to the computational (`GEMM`) kernel.
+
+BLIS only needs `obj_t`s for the `A`, `B`, and `C` matrices (as they are defined in `GEMM` and related operations). Note that the `obj_t` representing the output matrix (`C`) should have row stride (`rs`) and column stride (`cs`) values set to indicate a "row-preference" (`rs < cs`) or "column-preference" (`cs < rs`). All other `obj_t`s only need to have the matrix length and width set, unless you are using default kernels which then need a full matrix specification. If your operation references other data or objects (like the `D` operand in `SYRKD`) or your matrix-like objects need data which doesn't fit in an `obj_t`, then this information will be provided separately (see below).
+
+For `SYRKD`, the `A` and `C` matrices are already `obj_t`s and we should just alias the sub-matrices. The `obj_t` for `B` can be constructed from `A` with a transpose. For `D`, we have also chosen to pass in an `obj_t` representing a vector (mathematically, the diagonal of a matrix), and so we can just alias a sub-matrix to clean it up.
+
+### Step 5: Create the control tree
+
+The control tree determines exactly what operations are done during execution and their parameters (see more below). Custom operations will typically begin with a "default" control tree which corresponds to the most similar level-3 BLAS operation. The particular level-3 operation used does matter: for example `TRMM` will only operate on the upper or lower part of a matrix, and other factors such as threading can be affected as well. In order to accommodate custom operations, the control tree will then need to be modified. This is discussed in the following sections.
+
+For `SYRKD`, since the output matrix is symmetric (stored as triangular), we should use `GEMMT` as the template for the control tree. Note that `SYRK`, `SYR2K`, `HERK`, and `HER2K` all use the `GEMMT` control tree.
+
+### Step 6: Execute the control tree
+
+This step should be essentially the same for all operations. The `A`, `B`, and `C` objects are those `obj_t`s created earlier (and which may be purely logical matrices, with only a length and width, if you provide custom kernels). The control tree and context are passed in, as well as a "runtime" (`rntm_t`) object. Typically, the pointer to the runtime is `NULL`, which uses default settings for threading. If you want to customize threading then you can also pass in a custom `rntm_t` object.
+
+## The Control Tree
+
+![BLIS GEMM algorithm](diagrams/mmbp_algorithm_color.png)
+
+*Figure 1: The GEMM algorithm in BLIS*
+
+A typical `GEMM` operation in BLIS is depicted visually in Fig. 1. The matrix objects `A`, `B`, and `C` (represented as `obj_t`s) only provide limited information about size/shape and, for normal dense matrices, data location and layout. The rest of the information about how to execute the operation, including what order to partition the matrix dimensions, what blocksizes to use, what kernels to use for packing and computation, what parts of the matrices to operate on (for triangular/symmetric matrices), how to apply threading, etc. is all stored in the "control tree". This is a tree-based data structure, where each node indicates a primitive operation to be performed, which is executed by a specific kernel. The built-in control tree nodes are:
+
+- Partitioning along the "m", "n", or "k" dimensions (as defined in `GEMM`).
+
+- Packing of the `A` or `B` matrix. Packing moves data into a specialized layout which provides better data locality. While packing kernels should typically place some sort of data into a packed buffer (in a format which the computational kernel can understand), it could also perform any operation on the input matrix while doing so. In general, we could denote this as $A_{packed} = pack(op(A,...))$, where the ellipsis indicates additional information that can be stored in the control tree.
+
+- Computation (`GEMM` and `GEMMTRSM`). Only `GEMM`-like computation can be customized currently. This operation doesn't have to actually perform a `GEMM` computation ($C = \alpha A B + \beta C$). Rather, it can perform any operation $C = op(A,B,C,...)$ where the ellipsis indicates additional information that can be stored in the control tree.
+
+The control tree manages the flow of data through the processor caches by partitioning the matrices, using the estimated number and timing of memory accesses performed by the kernels. Thus, for operations which are truly `GEMM`-like, most of the control tree would not need to be modified. Operations which deal with less, more, or simply different data may need adjustments to blocksizes, or in extreme cases, the structure of the control tree. Altering the control tree structure and adding new custom control tree nodes are beyond the scope of this tutorial.
+
+## Modifying the Control Tree
+
+In order to get BLIS to do anything other than `GEMM` (or the related level-3 BLAS operation used as a template for the control tree), the control tree must be modified. Currently, the following aspects of the control tree can be modified:
+
+- The blocksizes used when partitioning. The 5 "standard" blocksizes `MC`, `NC`, `KC`, `MR`, and `NR` can be modified for `GEMM`-like operations. Note that modifying `MR` and `NR` in particular may require changes to your packing and/or computational kernels, since this affects the layout of packed data.
+
+- The packing "microkernel", which is used to pack a small portion of a matrix (at most `MR x KC` or `NR x KC`). If data is to be packed in the same format as for a normal dense matrix, but read from a different layout and/or with operations applied during packing, then changing only the packing microkernel is likely sufficient.
+
+- The packing "variant", which is responsible for a) obtaining a memory buffer of the appropriate size for the packed data, b) setting up the packed matrix object, and c) actually performing the packing (typically by calling a packing microkernel, however packing variants can be implemented many different ways). Modifying the packing variant may be necessary if data is to be packed in a substantially different format or other situations beyond the scope of a simple microkernel.
+
+- The computational microkernel (which is the `GEMM` microkernel by default). The computational microkernel reads data from the packed buffers and then does "something" with it, typically in combination with data from the matrix `C`. Usually, the `C` matrix also represents an output which is written by the microkernel. Note that when replacing only the microkernel, both packed data and the representation of output data in `C` must be similar to the standard `GEMM` case. So, this modification is most effective for relatively `GEMM`-like operations.
+
+- The computational variant, which is responsible for performing some operation on the entirety of the packed data as well as the matrix `C`. By modifying the entire computational variant, the user has complete freedom in the amount and format of data which is packed and in the data represented by `C`.
+
+Between modifying the control tree and execution, users should always call `bli_gemm_cntl_finalize` in order to maintain consistency between various parameters (see below).
+
+Note that when modifying one component it may also be necessary to modify other components in order to maintain consistency. For example, changing the pack format used by the packing microkernel would require a computational microkernel which can read the new format. Likewise, changing the register blocksizes (`MR` and/or `NR`) might require writing new packing or computational microkernels to take advantage of the new blocksize. The default microkernels are written to accept any register blocksize, although they are most efficient with the default `MR` and `NR`.
+
+In the case of `SYRKD`, the only changes necessary are applying multiplication of either `A` or `B` by the diagonal matrix `D` (represented as a vector) during packing. So, it is only necessary to write a new packing microkernel and insert it into the control tree.
+
+## Modifications to Blocking
+
+The BLIS framework applies blocking (partitioning) to the matrices during the computation, resulting in smaller and smaller submatrices which ideally fit into successive levels of the cache hierarchy. The control tree nodes control which dimension is blocked at which level. Modifying the actual control tree structure is beyond the scope of this tutorial. However, it may often be necessary to control the blocksize used at each level. For example, the blocksize used in the top-level partitioning, `NC`, can be modified using:
+
+```C
+blksz_t nc;
+//                        S     D     C     Z
+bli_blksz_init_easy( &nc, 8092, 5150, 2244, 2244 );
+bli_gemm_cntl_set_nc( &nc, &cntl );
+```
+
+where `cntl` is a `gemm_cntl_t` object which has been initialized with `bli_gemm_cntl_init`. Note that the `blksz_t` object used to set `NC` contains values for each datatype. Supplying values for all datatypes (even though the operation being performed uses a specific and known set of datatypes) is recommended because the control tree is sometimes constructed to use blocksizes and even microkernels from related datatypes, such as in mixed-domain computations.
+
+The function `bli_blksz_init` also lets users specify extended blocksizes. For cache blocking (`MC`, `NC`, `KC`), the extended blocksize is the maximum partition size and the normal blocksize is the default partition size. If the remainder block after partitioning by the default size is less than max minus default, then it is merged with another block. This can be useful for smoothing out performance drops for matrices just larger than the blocksize. For register blocksizes (`MR` and `NR`), the extended blocksize is the size used for computing the size of the packed buffer and the leading dimension of the individual sub-matrices packed by the microkernel. If it is larger than the default blocksize then there will be unused elements in the pack buffer. This can be useful for e.g. aligning packed sub-matrices on cache line boundaries.
+
+BLIS requires that `MC` (default and extension) be a multiple of `MR` (default only), and likewise for `NC` and `NR`. Also, in operations with triangular/symmetric/hermitian matrices, BLIS may also require `KC` to be a multiple of either `MR` or `NR`. The function `bli_gemm_cntl_finalize` tweaks the cache blocksizes, if necessary, to maintain these relationships.
+
+Finally, note that changing, in particular, the register blocksizes may require changes in user-defined microkernels if any assumptions about register blocking are hard-coded.
+
+## Modifications to Packing
+
+Packing the `A` and `B` matrices is represented as nodes in the control tree. These control tree nodes contain a pointer to the packing variant, as well as to a "params" structure containing any additional data needed. Packing variants have the following signature:
+
+```C
+void pack_variant(  const obj_t*     a,
+                          obj_t*     p,
+                    const cntx_t*    cntx,
+                    const cntl_t*    cntl,
+                          thrinfo_t* thread );
+```
+
+where `a` is the matrix to pack and `p` is an uninitialized matrix object which should represent the packed matrix on return. You can set a custom packing variant with:
+
+```C
+bli_gemm_cntl_set_pack[ab]_var( &pack_variant, &cntl );
+```
+
+If the default parameter structure which is included in the `gemm_cntl_t` object (and which is pointed to by the default value of the params pointer) can be used by your custom packing variant, then the [packing parameters API](../frame/3/gemm/bli_gemm_cntl.h) can be used. However, if different information is required then you can create your own structure (which must be treated as read-only during the operation) and insert it into the control tree with:
+
+```C
+my_params_t params;
+// initialize params...
+bli_gemm_cntl_set_pack[ab]_params( &params, &cntl );
+```
+
+and obtained within the packing variant using:
+
+```C
+my_params_t* params = ( my_params_t* )bli_packm_cntl_variant_params( cntl );
+```
+
+If instead only the packing microkernel needs to be modified, you can set a new packing microkernel with:
+
+```C
+func_t pack_ukr;
+bli_func_init( &pack_ukr, &spack_ukr, &dpack_ukr, &cpack_ukr, &zpack_ukr );
+bli_gemm_cntl_set_pack[ab]_ukr_simple( &pack_ukr, &cntl );
+```
+
+for packing without datatype conversion (mixed-precision/mixed-domain), or in the most general case:
+
+```C
+func2_t pack_ukr;
+bli_func2_init( &pack_ukr, ... );
+bli_gemm_cntl_set_pack[ab]_ukr( &pack_ukr, &cntl );
+```
+
+Packing microkernels must have a signature compatible with:
+
+```C
+xpack_ukr(       struc_t strucc,
+                 diag_t  diagc,
+                 uplo_t  uploc,
+                 conj_t  conjc,
+                 pack_t  schema,
+                 bool    invdiag,
+                 dim_t   panel_dim,
+                 dim_t   panel_len,
+                 dim_t   panel_dim_max,
+                 dim_t   panel_len_max,
+                 dim_t   panel_dim_off,
+                 dim_t   panel_len_off,
+                 dim_t   panel_bcast,
+           const void*   kappa,
+           const void*   c, inc_t incc, inc_t ldc,
+                 void*   p,             inc_t ldp,
+           const void*   params,
+           const cntx_t* cntx );
+```
+
+Note that the packing microkernel also receives a params pointer. This pointer is `NULL` by default but can be set using `bli_gemm_cntl_set_pack[ab]_params` just as for the packing variant params.
+
+## Modifications to Computation
+
+Similar to packing, the computation phase of the operation is represented as a control tree node. The computational variant must have the following signature:
+
+```C
+void comp_variant( const obj_t*     a,
+                   const obj_t*     b,
+                   const obj_t*     c,
+                   const cntx_t*    cntx,
+                   const cntl_t*    cntl,
+                         thrinfo_t* thread );
+```
+
+You can set a custom computational variant with:
+
+```C
+bli_gemm_cntl_set_var( &comp_variant, &cntl );
+```
+
+If the default parameter structure which is included in the `gemm_cntl_t` object can be used by your custom computational variant, then the [computational parameters API](../frame/3/gemm/bli_gemm_cntl.h) can be used. However, if different information is required then you can create your own structure (which must be treated as read-only during the operation) and insert it into the control tree with:
+
+```C
+my_params_t params;
+// initialize params...
+bli_gemm_cntl_set_params( &params, &cntl );
+```
+
+and obtained within the computational variant using:
+
+```C
+my_params_t* params = ( my_params_t* )bli_gemm_var_cntl_params( cntl );
+```
+
+If instead only the computational microkernel needs to be modified, you can set a new microkernel with:
+
+```C
+func_t comp_ukr;
+bli_func_init( &comp_ukr, &scomp_ukr, &dcomp_ukr, &ccomp_ukr, &zcomp_ukr );
+bli_gemm_cntl_set_ukr_simple( &comp_ukr, &cntl );
+```
+
+for computation without datatype conversion for output to `C` (mixed-precision/mixed-domain), or in the most general case:
+
+```C
+func2_t comp_ukr;
+bli_func2_init( &comp_ukr, ... );
+bli_gemm_cntl_set_ukr( &comp_ukr, &cntl );
+```
+
+Computational microkernels must have a signature compatible with:
+
+```C
+xcomp_ukr(       dim_t      m,
+                 dim_t      n,
+                 dim_t      k,
+           const void*      alpha,
+           const void*      a,
+           const void*      b,
+           const void*      beta,
+                 void*      c, inc_t rs_c, inc_t cs_c,
+           const auxinfo_t* auxinfo,
+           const cntx_t*    cntx );
+```
+
+As with packing microkernels, a params pointer is also available to computational microkernels and can be set using `bli_gemm_cntl_set_params`. The params pointer is stored in the `auxinfo_t` struct, and can be obtained with `bli_auxinfo_params( &auxinfo )` (see also the [auxinfo API](../frame/base/bli_auxinfo.h)).
+
+## SYRKD
+
+Taking all of the above considerations into account, we can finally implement `SYRKD` (for double-precision operands only, some error checking omitted):
+
+```C
+typedef struct
+{
+    const double* d;
+    inc_t incd;
+} syrkd_params;
+
+/*
+ * SYRKD packing microkernel: packs c into p, scaling by kappa and the matching entry of d.
+ * Ideally defined in a plugin and registered with the context, so that it can be obtained
+ * using `bli_cntx_get_ukr_dt` below for the appropriate hardware detected at runtime.
+ */
+void dsyrkd_pack(       struc_t       strucc,
+                        diag_t        diagc,
+                        uplo_t        uploc,
+                        conj_t        conjc,
+                        pack_t        schema,
+                        bool          invdiag,
+                        dim_t         panel_dim,
+                        dim_t         panel_len,
+                        dim_t         panel_dim_max,
+                        dim_t         panel_len_max,
+                        dim_t         panel_dim_off,
+                        dim_t         panel_len_off,
+                        dim_t         panel_bcast,
+                  const double*       kappa_ptr,
+                  const double*       c, inc_t incc, inc_t ldc,
+                        double*       p,             inc_t ldp,
+                  const syrkd_params* params,
+                  const cntx_t*       cntx )
+{
+    inc_t incd = params->incd;
+    const double* d = params->d + panel_dim_off*incd; // first entry of d used by this panel
+    double kappa = *kappa_ptr;
+    // j: position along the panel length; i: along the panel dimension; r: broadcast copy.
+    for (dim_t j = 0;j < panel_len;j++)
+    for (dim_t i = 0;i < panel_dim;i++)
+    for (dim_t r = 0;r < panel_bcast;r++)
+        p[i*panel_bcast + r + j*ldp] = kappa * d[i*incd] * c[i*incc + j*ldc];
+    // Zero the padding between the packed region and the maximum panel dimensions.
+    bli_dset0s_edge
+    (
+      panel_dim*panel_bcast, panel_dim_max*panel_bcast,
+      panel_len, panel_len_max,
+      p, ldp
+    );
+}
+
+/*
+ * SYRKD for double-precision operands: executes the GEMMT control tree on A and B = A^T,
+ * with a custom packing microkernel installed for B that multiplies by the entries of d.
+ */
+void dsyrkd
+     (
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  d,
+       const obj_t*  beta,
+       const obj_t*  c,
+       const cntx_t* cntx,
+       const rntm_t* rntm
+     )
+{
+    bli_init_once();
+
+    if ( bli_error_checking_is_enabled() )
+        bli_syrk_check( alpha, a, beta, c, cntx );
+
+    // Additional checks for D...
+
+    obj_t b;
+    bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &b );
+
+    if ( bli_l3_return_early_if_trivial( alpha, a, &b, beta, c ) == BLIS_SUCCESS )
+        return;
+
+    ind_t im = BLIS_NAT;
+
+    obj_t a_local, b_local, c_local, d_local;
+    bli_obj_alias_submatrix(  a, &a_local );
+    bli_obj_alias_submatrix( &b, &b_local );
+    bli_obj_alias_submatrix(  c, &c_local );
+    bli_obj_alias_submatrix(  d, &d_local );
+    // C is symmetric (stored as triangular), so GEMMT is the control tree template.
+    if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+    gemm_cntl_t cntl;
+    bli_gemm_cntl_init
+    (
+      im,
+      BLIS_GEMMT,
+      alpha,
+      &a_local,
+      &b_local,
+      beta,
+      &c_local,
+      cntx,
+      &cntl
+    );
+    // Only the double-precision entry of the packing microkernel func_t is set.
+    func_t pack_ukr;
+    bli_func_set_dt( &dsyrkd_pack, BLIS_DOUBLE, &pack_ukr );
+
+    syrkd_params params;
+    params.d = ( const double* )bli_obj_buffer( &d_local );
+    params.incd = bli_obj_vector_inc( &d_local );
+
+    bli_gemm_cntl_set_packb_ukr_simple( &pack_ukr, &cntl );
+    bli_gemm_cntl_set_packb_params( &params, &cntl );
+    // Finalize after modifying the control tree so its parameters stay consistent.
+    bli_gemm_cntl_finalize
+    (
+      BLIS_GEMMT,
+      &a_local,
+      &b_local,
+      &c_local,
+      &cntl
+    );
+    // Execute the control tree; a NULL rntm selects the default threading settings.
+    bli_l3_thread_decorator
+    (
+      &a_local,
+      &b_local,
+      &c_local,
+      cntx,
+      ( cntl_t* )&cntl,
+      rntm
+    );
+}
+```
+
+<!--
+# API Reference
+
+## Registration
+
+```C++
+err_t bli_gks_register_ukr( siz_t* ukr_id );
+```
+
+Register a new microkernel, which may have a different implementation for each supported data type.
+
+<table style="margin-left:0">
+  <tr><td><b>Parameters:</b></td>   <td><ul style="padding-left:15px; "><li><b>ukr_id</b> &ndash; A pointer to value which will be set to the unique ID of the new kernel.</li></ul></td></tr>
+  <tr><td><b>Returns:</b></td>   <td>An error code which is <code>BLIS_SUCCESS</code> on success.</td></tr>
+</table>
+
+```C++
+err_t bli_gks_register_ukr2( siz_t* ukr_id );
+```
+
+Register a new microkernel, which may have a different implementation for each *pair* of supported data types.
+
+<table style="margin-left:0">
+  <tr><td><b>Parameters:</b></td>   <td><ul style="padding-left:15px"><li><b>ukr_id</b> &ndash; A pointer to value which will be set to the unique ID of the new kernel.</li></ul></td></tr>
+  <tr><td><b>Returns:</b></td>   <td>An error code which is <code>BLIS_SUCCESS</code> on success.</td></tr>
+</table>
+
+```C++
+err_t bli_gks_register_blksz( siz_t* bs_id );
+```
+
+Register a new blocksize, which may have a different integral value for each supported data type.
+
+<table style="margin-left:0">
+  <tr><td><b>Parameters:</b></td>   <td><ul style="padding-left:15px"><li><b>bs_id</b> &ndash; A pointer to value which will be set to the unique ID of the new blocksize.</li></ul></td></tr>
+  <tr><td><b>Returns:</b></td>   <td>An error code which is <code>BLIS_SUCCESS</code> on success.</td></tr>
+</table>
+
+```C++
+err_t bli_gks_register_ukr_pref( siz_t* ukr_pref_id );
+```
+
+Register a new microkernel preference, which may have a different logical value for each supported data type.
+
+<table style="margin-left:0">
+  <tr><td><b>Parameters:</b></td>   <td><ul style="padding-left:15px"><li><b>ukr_pref_id</b> &ndash; A pointer to value which will be set to the unique ID of the new preference.</li></ul></td></tr>
+  <tr><td><b>Returns:</b></td>   <td>An error code which is <code>BLIS_SUCCESS</code> on success.</td></tr>
+</table>
+
+## Helper Functions
+
+```C++
+void_fp bli_func_get_dt(       num_t   dt,
+                         const func_t* func );
+```
+
+TODO
+
+```C++
+void bli_func_set_dt( void_fp fp,
+                      num_t   dt,
+                      func_t* func );
+```
+
+```C++
+void bli_func_copy_dt( num_t dt_src, const func_t* func_src,
+                       num_t dt_dst,       func_t* func_dst );
+```
+
+```C++
+func_t* bli_func_create( void_fp ptr_s,
+                         void_fp ptr_d,
+                         void_fp ptr_c,
+                         void_fp ptr_z );
+```
+
+```C++
+void bli_func_init( func_t* f,
+                    void_fp ptr_s,
+                    void_fp ptr_d,
+                    void_fp ptr_c,
+                    void_fp ptr_z );
+```
+
+```C++
+void bli_func_init_null( func_t* f );
+```
+
+```C++
+void bli_func_free( func_t* f );
+```
+
+```C++
+void_fp bli_func2_get_dt(       num_t    dt1,
+                                num_t    dt2,
+                          const func2_t* func );
+```
+
+```C++
+void bli_func2_set_dt( void_fp  fp,
+                       num_t    dt1,
+                       num_t    dt2,
+                       func2_t* func );
+```
+
+```C++
+func2_t* bli_func2_create( void_fp ptr_ss, void_fp ptr_sd, void_fp ptr_sc, void_fp ptr_sz,
+                           void_fp ptr_ds, void_fp ptr_dd, void_fp ptr_dc, void_fp ptr_dz,
+                           void_fp ptr_cs, void_fp ptr_cd, void_fp ptr_cc, void_fp ptr_cz,
+                           void_fp ptr_zs, void_fp ptr_zd, void_fp ptr_zc, void_fp ptr_zz );
+```
+
+```C++
+void bli_func2_init( func2_t* f,
+                     void_fp ptr_ss, void_fp ptr_sd, void_fp ptr_sc, void_fp ptr_sz,
+                     void_fp ptr_ds, void_fp ptr_dd, void_fp ptr_dc, void_fp ptr_dz,
+                     void_fp ptr_cs, void_fp ptr_cd, void_fp ptr_cc, void_fp ptr_cz,
+                     void_fp ptr_zs, void_fp ptr_zd, void_fp ptr_zc, void_fp ptr_zz );
+```
+
+```C++
+void bli_func2_init_null( func2_t* f );
+```
+
+```C++
+void bli_func2_free( func2_t* f );
+```
+
+```C++
+dim_t bli_blksz_get_def(       num_t    dt,
+                         const blksz_t* b );
+```
+
+```C++
+dim_t bli_blksz_get_max(       num_t    dt,
+                         const blksz_t* b );
+```
+
+```C++
+void bli_blksz_set_def ( dim_t    val,
+                         num_t    dt,
+                         blksz_t* b );
+```
+
+```C++
+void bli_blksz_set_max( dim_t    val,
+                        num_t    dt,
+                        blksz_t* b );
+```
+
+```C++
+void bli_blksz_copy( const blksz_t* b_src,
+                           blksz_t* b_dst );
+```
+
+```C++
+void bli_blksz_copy_if_nonneg( const blksz_t* b_src,
+                                     blksz_t* b_dst );
+```
+
+```C++
+void bli_blksz_copy_def_dt( num_t dt_src, const blksz_t* b_src,
+                            num_t dt_dst,       blksz_t* b_dst );
+```
+
+```C++
+void bli_blksz_copy_max_dt( num_t dt_src, const blksz_t* b_src,
+                            num_t dt_dst,       blksz_t* b_dst );
+```
+
+```C++
+void bli_blksz_copy_dt( num_t dt_src, const blksz_t* b_src,
+                        num_t dt_dst,       blksz_t* b_dst );
+```
+
+```C++
+blksz_t* bli_blksz_create( dim_t b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z,
+                           dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z );
+```
+
+```C++
+blksz_t* bli_blksz_create_ed( dim_t b_s, dim_t be_s,
+                              dim_t b_d, dim_t be_d,
+                              dim_t b_c, dim_t be_c,
+                              dim_t b_z, dim_t be_z );
+```
+
+```C++
+void bli_blksz_init( blksz_t* b,
+                     dim_t b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z,
+                     dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z );
+```
+
+```C++
+void bli_blksz_init_ed( blksz_t* b,
+                        dim_t    b_s, dim_t be_s,
+                        dim_t    b_d, dim_t be_d,
+                        dim_t    b_c, dim_t be_c,
+                        dim_t    b_z, dim_t be_z );
+```
+
+```C++
+void bli_blksz_init_easy( blksz_t* b,
+                          dim_t b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z );
+```
+
+```C++
+void bli_blksz_free( blksz_t* b );
+```
+
+```C++
+bool bli_mbool_get_dt( num_t dt, const mbool_t* mb );
+```
+
+```C++
+void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb );
+```
+
+```C++
+mbool_t* bli_mbool_create( bool b_s,
+                           bool b_d,
+                           bool b_c,
+                           bool b_z );
+```
+
+```C++
+void bli_mbool_init( mbool_t* b,
+                     bool     b_s,
+                     bool     b_d,
+                     bool     b_c,
+                     bool     b_z );
+```
+
+```C++
+void bli_mbool_free( mbool_t* b );
+```
+
+```C++
+#define PASTECH(...)
+```
+
+```C++
+#define PASTEMAC(...)
+```
+
+```C++
+#define gen_func_init( func_p, opname )
+```
+
+```C++
+#define gen_func_init_ro( func_p, opname )
+```
+
+```C++
+#define gen_func_init_co( func_p, opname )
+```
+
+## Context Initialization
+
+```C++
+err_t bli_cntx_set_ukr( siz_t ukr_id, const func_t* func, cntx_t* cntx );
+```
+
+```C++
+void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, siz_t ukr_id, const func_t* func, cntx_t* cntx );
+```
+
+```C++
+err_t bli_cntx_set_ukr2( siz_t ukr_id, const func2_t* func, cntx_t* cntx );
+```
+
+```C++
+void bli_cntx_set_ukr2_dt( void_fp fp, num_t dt1, num_t dt2, siz_t ukr_id, const func_t* func, cntx_t* cntx );
+```
+
+```C++
+err_t bli_cntx_set_blksz( siz_t bs_id, const blksz_t* blksz, siz_t mult_id, cntx_t* cntx );
+```
+
+```C++
+void bli_cntx_set_blksz_def_dt( num_t dt, siz_t bs_id, dim_t bs, cntx_t* cntx );
+```
+
+```C++
+void bli_cntx_set_blksz_max_dt( num_t dt, siz_t bs_id, dim_t bs, cntx_t* cntx );
+```
+
+```C++
+err_t bli_cntx_set_ukr_pref( siz_t ukr_pref_id, const mbool_t* prefs, cntx_t* cntx );
+```
+
+```C++
+err_t bli_cntx_set_ukr_pref_dt( bool pref, num_t dt, siz_t ukr_pref_id, cntx_t* cntx );
+```
+
+```C++
+void bli_cntx_set_ukrs( cntx_t* cntx,
+                        siz_t ukr0_id, num_t dt0, void_fp ukr0_fp,
+                        siz_t ukr1_id, num_t dt1, void_fp ukr1_fp,
+                        siz_t ukr2_id, num_t dt2, void_fp ukr2_fp,
+                        ...,
+                        BLIS_VA_END );
+```
+
+```C++
+void bli_cntx_set_ukr2s( cntx_t* cntx,
+                         siz_t ukr0_id, num_t dt1_0, num_t dt2_0, void_fp ukr0_fp,
+                         siz_t ukr1_id, num_t dt1_1, num_t dt2_1, void_fp ukr1_fp,
+                         siz_t ukr2_id, num_t dt1_2, num_t dt2_2, void_fp ukr2_fp,
+                         ...,
+                         BLIS_VA_END );
+```
+
+```C++
+void bli_cntx_set_blksz( cntx_t* cntx,
+                         siz_t bs0_id, const blksz_t* blksz0, siz_t bm0_id,
+                         siz_t bs1_id, const blksz_t* blksz1, siz_t bm1_id,
+                         siz_t bs2_id, const blksz_t* blksz2, siz_t bm2_id,
+                         ...,
+                         BLIS_VA_END );
+```
+
+```C++
+void bli_cntx_set_ukr_prefs( cntx_t* cntx,
+                             siz_t ukr_pref0_id, num_t dt0, bool ukr_pref0,
+                             siz_t ukr_pref1_id, num_t dt1, bool ukr_pref1,
+                             siz_t ukr_pref2_id, num_t dt2, bool ukr_pref2,
+                             ...,
+                             BLIS_VA_END );
+```
+
+## Context Query
+
+```C++
+const cntx_t* bli_gks_query_cntx();
+```
+
+```C++
+const cntx_t* bli_gks_lookup_id( arch_t id );
+```
+
+```C++
+const func_t* bli_cntx_get_ukrs( siz_t ukr_id, const cntx_t* cntx );
+```
+
+```C++
+void_fp bli_cntx_get_ukr_dt( num_t dt, siz_t ukr_id, const cntx_t* cntx );
+```
+
+```C++
+const func2_t* bli_cntx_get_ukr2s( siz_t ukr_id, const cntx_t* cntx );
+```
+
+```C++
+void_fp bli_cntx_get_ukr2_dt( num_t dt1, num_t dt2, siz_t ukr_id, const cntx_t* cntx );
+```
+
+```C++
+const blksz_t* bli_cntx_get_blksz( siz_t bs_id, const cntx_t* cntx );
+```
+
+```C++
+dim_t bli_cntx_get_blksz_def_dt( num_t dt, siz_t bs_id, const cntx_t* cntx );
+```
+
+```C++
+dim_t bli_cntx_get_blksz_max_dt( num_t dt, siz_t bs_id, const cntx_t* cntx );
+```
+
+```C++
+siz_t bli_cntx_get_bmult_id( siz_t bs_id, const cntx_t* cntx );
+```
+
+```C++
+const blksz_t* bli_cntx_get_bmult( siz_t bs_id, const cntx_t* cntx );
+```
+
+```C++
+dim_t bli_cntx_get_bmult_dt( num_t dt, siz_t bs_id, const cntx_t* cntx );
+```
+
+```C++
+const mbool_t* bli_cntx_get_ukr_prefs( siz_t ukr_pref_id, const cntx_t* cntx );
+```
+
+```C++
+bool bli_cntx_get_ukr_prefs_dt( num_t dt, siz_t ukr_pref_id, const cntx_t* cntx );
+```
+
+## Control tree modification
+
+BLIS_EXPORT_BLIS void bli_gemm_cntl_init
+     (
+             ind_t        im,
+             opid_t       family,
+       const obj_t*       alpha,
+             obj_t*       a,
+             obj_t*       b,
+       const obj_t*       beta,
+             obj_t*       c,
+       const cntx_t*      cntx,
+             gemm_cntl_t* cntl
+     );
+
+BLIS_EXPORT_BLIS void bli_gemm_cntl_finalize
+     (
+             opid_t       family,
+       const obj_t*       a,
+       const obj_t*       b,
+       const obj_t*       c,
+             gemm_cntl_t* cntl
+     );
+
+```C++
+gemm_ukr_ft bli_gemm_cntl_ukr( gemm_cntl_t* cntl );
+```
+
+```C++
+bool bli_gemm_cntl_row_pref( gemm_cntl_t* cntl );
+```
+
+```C++
+const void* bli_gemm_cntl_params( gemm_cntl_t* cntl );
+```
+
+```C++
+l3_var_oft bli_gemm_cntl_var( gemm_cntl_t* cntl );
+```
+
+```C++
+packm_ker_ft bli_gemm_cntl_packa_ukr( gemm_cntl_t* cntl );
+```
+
+```C++
+pack_t bli_gemm_cntl_packa_schema( gemm_cntl_t* cntl );
+```
+
+```C++
+const void* bli_gemm_cntl_packa_params( gemm_cntl_t* cntl );
+```
+
+```C++
+packm_var_oft bli_gemm_cntl_packa_var( gemm_cntl_t* cntl );
+```
+
+```C++
+packm_ker_ft bli_gemm_cntl_packb_ukr( gemm_cntl_t* cntl );
+```
+
+```C++
+pack_t bli_gemm_cntl_packb_schema( gemm_cntl_t* cntl );
+```
+
+```C++
+const void* bli_gemm_cntl_packb_params( gemm_cntl_t* cntl );
+```
+
+```C++
+packm_var_oft bli_gemm_cntl_packb_var( gemm_cntl_t* cntl );
+```
+
+```C++
+dim_t bli_gemm_cntl_mr_def( gemm_cntl_t* cntl );
+```
+
+```C++
+dim_t bli_gemm_cntl_mr_pack( gemm_cntl_t* cntl );
+```
+
+```C++
+dim_t bli_gemm_cntl_nr_def( gemm_cntl_t* cntl );
+```
+
+```C++
+dim_t bli_gemm_cntl_nr_pack( gemm_cntl_t* cntl );
+```
+
+```C++
+dim_t bli_gemm_cntl_mc_def( gemm_cntl_t* cntl );
+```
+
+```C++
+dim_t bli_gemm_cntl_mc_max( gemm_cntl_t* cntl );
+```
+
+```C++
+dim_t bli_gemm_cntl_nc_def( gemm_cntl_t* cntl );
+```
+
+```C++
+dim_t bli_gemm_cntl_nc_max( gemm_cntl_t* cntl );
+```
+
+```C++
+dim_t bli_gemm_cntl_kc_def( gemm_cntl_t* cntl );
+```
+
+```C++
+dim_t bli_gemm_cntl_kc_max( gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_ukr( const func2_t* ukr, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_ukr_simple( const func_t* ukr, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_row_pref( const mbool_t* row_pref, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_params( const void* params, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_var( l3_var_oft var, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_packa_ukr( const func2_t* ukr, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_packa_ukr_simple( const func_t* ukr, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_packa_schema( pack_t schema, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_packa_params( const void* params, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_packa_var( packm_var_oft var, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_packb_ukr( const func2_t* ukr, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_packb_ukr_simple( const func_t* ukr, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_packb_schema( pack_t schema, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_packb_params( const void* params, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_packb_var( packm_var_oft var, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_mr( const blksz_t* mr, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_nr( const blksz_t* nr, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_mc( const blksz_t* mc, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_nc( const blksz_t* nc, gemm_cntl_t* cntl );
+```
+
+```C++
+void bli_gemm_cntl_set_kc( const blksz_t* kc, gemm_cntl_t* cntl );
+```
+-->
\ No newline at end of file
diff --git a/docs/diagrams/mmbp_algorithm_color.png b/docs/diagrams/mmbp_algorithm_color.png
new file mode 100644
index 0000000000000000000000000000000000000000..24e94e2aa50592518ee2f9a4bd13465e1e397b20
GIT binary patch
literal 28861
zcmb@ubzIbKv@bd$B~lUs(jn3z(v36%0#Z_v(miwuNQoegq|zNjw}>zT(k0yt-JSOt
z_dfgVd(OH4+|Thvd6{8;^Lt{gZ>;r9_-hq89IPi;5C{ZEL0(!N0=ee^fuNvcqJu|L
zIdRFsHyTrUb!7;|ixC0|2!=o|!9xM-5D1hD0@-{Afe0l-AS8}yjcOv`33L-BIcdlp
z^1p8_`ElTp2afW(F5tWTzrV8GHewKn!z%@8$u}NTJG1^i8e=!&``QqvLAp5>Vwo>_
z=W$DedZ^cx3JMgr0S#7Jn*7jlv#iYTe%HyYe{SBqI)$q6$&gRR3zn{Br3OUwqUC=J
zI2E$^L?><^PhEz(L2N1cC}od&qlKY?-zt`|fxqvnLGHbavwxa?in~g`D8iR5MxFsh
z3X@hQH*<)XCKx|f5Nj?NUp<(~gHTHTQPzSocq%NApEj6?7Hdwr(<$w8`M?7L5zAC{
z*UKNVa)pj8xiyW<B_cfcMdr6$)-I=rdTSUz^8Q%I!au^7dDQ+6KbE0=$L%FPs(dVJ
z<^V>B@q%;1sGYekUt^<H3<L9y9VM=uO8?g_8x6LJ6r#sy4q33br3(VMFgDZezm*k}
zw)m*`sG>0_;8wNKc7=`l9Di;3W}nkcp;(ZjXEFU;J}6if8F)z~11=v8zn^G6=0Yjr
zfj?d6FhS9~;Yx>(`HPYy3|%nMyqyahRTA=zy7|QL6ReyD(@}r|A64?^hkBV(k<bQZ
z^s<0Qi{ZWqo^ImUM43KyqB(7Z$;^cT2g$Xc|IBIdy26uYJw3gl;b9Zk8~LbLU!S9O
zgyN|-D4Uwn2Ut2@?|0yhL6Wftj;!%(rgyCORoL!lNxmS(w(hxC(4K_w<o$+*a<6{Y
ze%7?JRytv!grcdo_OVtm@dpVAjIG1I&StaJrvnnOxoK@X36l}Uz~5o9wznVOjw?~|
zYoi8}xab&Qt_vB$OX%v7<-)$-HjkIuhh98}rA9Pn$~@3HZt!5PayoxGJ6i0g|Lsv`
z+TAR^>1T3&TwY$@#sRCqv)HFf6&8I@-e_tX&(u3Hv9j_s-ZEhH^!AR-ja%3J1YZ_a
zh4)6bPS$d$!VMtyb>urcJNT%yyB$&w);B0P-KNdj^1Vto;&X)zZQa$kxjiyMdsk69
zxSS@U5uUTA%;sLxXJ&sZj7j<J_1{iJ5{35ntCZ+gp+9`gCgJNV%BYe?dgi0QpCRnd
z!NSVQ?R{#CHT6`MNQPIQiM=DEf~|NC{|!pA;a91R*l_l`Nn5Z9Vi~LgEnuddZlw(9
z4y&H|BM*wlQZ)T5oegyp7f-yFHxo8bIuE^C9EH-~>1xe}8Ykz>vUwp?-Mwj^G>77l
zCmz^5UE_0Fl6G}-GieV*v$V33eEBlyGEJu2Zg_MQ6;e=M&fnOYg9igkb_Stka8XuP
zX5!?8A8(A@bHyXZ4(+hCv&;RVd>DelPus#{+J%iA(V`=%g!{f>lN)=UO1ib{EGBI*
zgPN9Bh(a9W_nI1F@LVDT^9H3d2&^?O@a|Becj7c&+Ep;v@VeI{DNKd$Fc?94dOF0?
zt63Hb<-PV3Pg8l;XG_l@x9*$ZP1I}}mW7$xm)Bp^Yi7+$D^1-}ei)PohlACymwmVG
zVFG5sS)cPip9plvR)`%HX8==jbWu<vA<sE<#UfrX%@7j%2_-B_DbZXv-9YZfHP|e4
zPWwWVeUB)<a9Uz__#=V(CU`q?qGpXS<v-Gx-8f!GtLV^F25$FVeJCR#Ed&2sJw4<P
z*~LT8Uhk^}f;qVO{^~hxi|M9)T#e0y$y~E%Ncw!iV<`gz1N}`oDQ7$Ih)$DRsWGB$
zw{Z(waBq9E>gp8>z6OSaLM)f><#B9}+uy0F2(*WfNeys@;iU#mk5xra-YBf@I9OXt
zDJdysKA6kf<$Dm?A+Q)s7+EGOIW{_4z)bz1J6|1Z#y6%=Z?4Ls?|CCuh{x>(;&U0w
z#<JppLrJ-B5I=PP^up(<V3_}GBUE5Hl4rlv!S^9wDo!jzA<q!Zv!p~pH!_~g(>EQy
ztl&M{KNn$eST<_=x8mOWG$(XO!6#8xm?!p+inGMRAbz;hJKfrM{-YMOm?&;LQy(KD
zu%K9#um}&n=ua1y;aQlO0~7ghG}Z=jtdl368KLug+S-kr#IX#Ey_{`t<eGlNN?KK6
zzP1ct;53b2Y>fPtH!F46e8t|G7jbfCTU6>``KctDUmf#~bvRK-t3`UxhELFpz-6DG
z;3$D@bNPQv!2h3_`Tyb*(%D&AC}Nod>(=;E>faV9LFWFu;|9LoXlr-uosk<nefb-P
zt(*9{EX3GHER&YC@eG>9l$Dhw<r^)S-XD7_yv2?+H|vps0e8YgMZrW5vb47k89-!<
zmyr?^6IbZ4rDTHH%q~vkfw2W4JV83g41DJby(Cg2T6Au}$3}mIjc#1?R;l(w{KNZ^
zWdXX56ru09Fhh(X5bO>bd`waUb`9o_TkTF1`Wz$-3V{p?vfBBok7ORP^%g}xRS4`|
zect$1=2IYlmj4^byPCJlJ*Ibqm(O#Xm=XsN8doipxELrOAhvv|Lz;c8YDBaO1W(Ax
z7tgMkJrK#xzG#xK?|T~ErS?3-nhQhU`1OOm_c;hp5h^?%F=`R2=%KMcTU&VOL7S#c
zv$#pUXrUeVB(A=XYdcY3fLt84457Gq{j338bZUC~p7gwtO1=ipitoiu0w+;(Q+O8X
zSxwZkKn0r~fy28H;}`5SEif<{v)*861r9wXokt=iTCCuH#$W&lvWv&wV+MY^U}OcC
zo0^I&dpnf1Fw0Jg{d31H$e0u?13&<94{h^C&u5>BZWQ~@!Ptt=MCAQ210f(K><WUE
z;3!bxf9L1rZyp}#&Yx&LG&57eoS(`fcRY5h5Mo?m-uqa&!H}#wYGUi-#MW3mD}R{6
z8k5J_H=<B1S-MBr#S)6r%&l$CAG<vB7mh9z*DNS$Kg#Zv&wWpH$8yC;`*QvLOndKj
zZz{f21U{OB6ehD2QS$@Z989%+#^5h>((V73pZKz&J^V;!Xq|<0DF{#Owf}ovzFi@+
zqhw>!Y$Nq@<W3^S<R%7D7h?RwDQ#$sOM06?4X_ZmrWWn!PtaBO)tApsPmX0eZx<D@
z21eVPT3Tk;S(0U!B6h!uacHsio_?K>lIKv%{Kt7T>cChoXnUQzXXU<_lL0{7-0JUa
zVfazc!3l^`JZXqAO@v8iTlAO6!$ub;&IbX;XK^5O)lEdGRA+Dw)V2Pc#TE?w|CodI
z!wVm$yQjg)578V5<cMTEM5w@`O5)Qf5P*${q$F-C`6KH^d7V3M`7P`NM+=^<6No=2
zHOz@N%2;6chcBO9Rr*{wSFk+<y9Cg#0}mPn^%*x98@a()HN&_tG1$7H9Yg<<v?XZS
z$2kf_Ittjj8Ypl^Qi(oUU0r3aJGk%xRLT0s7W0<At<u=4bGt>=M&iAp#}%>o-TyoN
zLKDEH5((RxuCs|+brpJqiMkZ^P&z0LX8lFoy=9@a?nmf<k*|B$9Y4V3tZPnnp0zB4
z2r8d=n1P_}g<Vc1Y!RmBstC6At_IHUMNf-SHo5Km>`xbVIVX7;sjjOV-qX`VRi{Mu
zkCKg!#-Fx_c~bgcA20{ivGrO*PA+`*y;>kqX0J4;;7gu{I9YB@1~_x!<`OT$SakoI
z^nRdc=Jm|bzW<g6KVW)iTvW?e>CJ=wo=O`V8<2azUDNc!?AJs9x7sN#F0KSoazY_-
z{IaB>LAOd3CUkM*2l!@a$M?Jk(56{STU*o)uKlYUz*Z~Xmg38(i6$QE+s-B|P&WAr
zA>ku2G4cBLQA{9aXb0v;DN9RB3BWUkcVf}^^70sirD*@8@P0yZf3V<bXyB$2%YYg9
zCFxDlKN6<bS4lBFtjaG3W9xOAX1o$51K6*nTeRGu&oOHX=qiZ+5aT7W>v)^=?Q_7u
zR!XY+f9k$ZPZ;N=Un9Z#Sn;kM<yFdy2P65a6SK1X9(A>~bVOLjr`waVZRqMWmG|U{
zI;&CmdtsY6lIH#`&(YvcU(5x|W&jzaCJmS5nXEKv(jo||J8T}=2p+TUD{9jbENANl
z)9ZAicyejt*<vV=t^(NGZU@&Z*$Lj+9ff%fP0f!O)IRtwYkwp)H52?EiV%cLQ~B(-
z%a=@6y_a*_ogt%T(EgW#X>!)LXBv2(d!KICKYwayBby4Q18`I|?FZe=xkgF&v8_El
zir@iSA}p(n5!5|in{kGnBYJ>%I)FWwmX@YC&pOSGZZZbryA+g^NXyBE=A|)i;Q#OP
z<$nc9rqYr?S+(c=33So$I5)rO*4^Evy{^r%Vi)I<0r5MEgoFfAu5V}lmklD1JGeY{
zY9^cypPYc(17tF*s0iz9-rv8!?0sd!&FQpL2M#}}!RItC=)M-LCSfXv_9-*bYE78e
z#cI+}oBv(fHKpxrBXLIv&K9?*J#F;U-sgt?Hz#QZ<<duMeJ`1aq|D8qZf<S4c!_=r
zXNvH+K7ccOkVJGW4l2t{CF!y~&Mtl@)Nm;b;1kTe;KjQ?W0k0qgUSY}tbj+R9B1X8
zw*n5^7-tYceB$|k&~E=Zc=eq4F~2`5L^5yQy!l~Y2Xsd&Y!1jMlS;D_4Fdz5Pn}sD
zFASJK3TIaD$r=JF#<iU*ktxElK_4JF_<Bc*{*N4V#+Z`<ZUyAyawZTY(8XPVQ1vxB
zN8vy%0x`w_&?#zpG-16@k*Kqtdo`BfgSl>YaRLu`kzqMWUtb@%B%tk^UZodimrg)*
zvP<IsfR#*L*1j#>%=!b?mLut*3!-0B``uS=(-31QvCN}b21$HWK#dd40jx!~274|Z
zcq}*yEVv2*z(x5WG1o8Cp8AE3A%d>ikW*SMczzF~gA2S52+A)E4<KAXa)9My)2a9|
z*r*sAe0u5z8QOi2KT?Q((E8iDhK>|FtFiHk$N*v|D56mrK-uTdpWl(n-t=(v!lrm1
zT|ryGvYVTmhm2`lgy1V(IC2vL0Kk+Q=V$ZGn4sXq$6PsCV*@C72?04548M%93dmPL
zK<SmY-GnJ{F?13?g2@z02VZDS-AUNJFx}abNNKu#4-p`D+?qr=UQgTd)$5lv-=w~-
zdqC?OpiB7Gvt=$v@J+lLQx8To*;Xc?9l572S3a%v6UFg}`!xWfLSD<J1MJ%zdcFY`
z$go_yFSzKnJqv=qh8NsT7;XIbD3O7VRcA7TbVCi<q4S&DdgsuNm?V4JO+MOS(!9z_
zfT%0+bNqm29XgN1z6I;?^0?-Qf#v8SV-6~tVKXT^JA1Mal;w19KDzDpq}*fiBS~bL
zmgLTS8#SEDyJTqWHyJ?0V<#LO93YuGL9V_BxsQSS>+AcB$@kB?89I(cqQovOvx{SR
z!0y^u1`<$BTRUHjit8OjCy~9ud{9~~$9xb-DN=x58-xJm*k@iTVxjGk_OQ&kHBpX@
zjinr<{7N&^MISf<nKneVije{38!N$0xb&er)#lM|*O3Tdq$zBMiRPo@<3AT2iEg&b
zTLIsy*XNLrHS5Wd%c+*8lhD?FVXenD1iv^}muKIz4_DYpD0%m5CSHFieB<?ljV&gR
z`_^+G{*H+Sth-LA+d{S9yh*nIT4W{=r9ndXg@Kt<kO8w>nt`C*cr~9jwyuouznCKI
z$&ZMqUsfa7dh-kk)!P4y%xyyzA^b{3YU*B%$hSX6IFby0yq0zVJQ+}bn;HOpA;twy
zNdWbh;wST!k&$WN%3|O>^bkow#J|b@AQ2GA&s;B$DlkKA2X$YqTvg_0YeEZ>1a`uS
zMQ*J8?A}?@%&Tg%KdyKI#-vF9K%*5vmLo;Jg)?iXJ*WB;CJ%R0-Nzv6Qk+*Ubd2(_
zV<K0NE=G6hLOVW4(7b-peVCvn7!aE(BOgf;E<H#b?wgi*^m>0+lKc4P^l>i?L_U)7
zmX+b6SURjczTcxG7fB#lJ&g)s3qwCzcW97l7e>n*(2oDWs?j4=E1IDB=FPfaGB<X1
z8LmA#QGIxf3EZ<oirwMm`Ybl9xy9xYW6mg`h*qqLVh&P~MB&no^{kgoAK%X9e4)E|
z^;<%onxxNW_=YNC#UYI!$t$~QcW`>I(?UUdd777<O>fU+$xOy;9Y3V`Z{x)$!(h+#
zp<|kxuLsrJo%AK0Tjd%@+KZx(T&eysHIUhaiK@#@N%LrWzsWQbs^1&6LmFT7!|BOb
z5@!MqimzlaX>-Vt;lvOcUlIuUd;}P5)(~-c!E!Tu#cL#=HNmGuP~Oa%2on5u`F!~w
zJQqoQotX%DMCS;;BlDO#(NdSE-hYrjDO5S||7?$*Ab^p!f@zJ+%(zgQfR0fF4!~|T
zqR(aM+&Ub0xB^kVlSPLO0$R)^ac>prV&q@dDTJ#GJ2mJ(C9Bs>MC*mgYInTwJMW-2
zIoTW&RA6~wJ<6oR_E>{*s~|h-XKs>VO4XK3WP(7)bED;EWy)sYs&(g<GQ6dL--Ccq
z^Y$$nAJ-Zk9VNYq#oXX+G*)=!aB;Y*)9T}?62*&@_(r%J%kfTg2D7uzo=)0Hg#nWA
zy|A!*-TIg;i>YYdQt=~$g5bsJIN{Fg_J7O3-9pA=@$O)$lerG4brAD1zy#>odGh2*
zq@Q32xu7!xpWRfjaw;F0K|)}vh^O=1z@zU&jKR_XSHHKmimmbi@Lu2E-gXj3Dba5j
zUT672@oho*GWb96p3DG8%Cdll_A3DO94&tCj~ql6+~`m2q#{zLck&cCe!Y2-t$aAT
zrQDuM#H*VK003AhXin<t1=GPq><Y0oz0U=rUUHGXXcP*Dgc*rugkTY50Wk}hN^NVY
zR<V%VB(S`YDN$w^!;u0*68~M*Bmff{OccP~hj-{><o~>ueAo!sJb^0U-z|m!c!25l
z&%gw-l2##hXpG($P$C&FQh+p}BOBY?=5i*z!bbEh%pP%BV}Ji*LXeE`pOXdf3}U`X
z-;nJSbmSqztT*#fwmtQjOc8=1@t`lvII9>gr#F3LnZ{2ms_}Lqn3S0y{MU(#29P1Y
zj@cf6TDw2Gbz+g0Dc0y%?l(#(^9b-+QN|7~Kt2CVqL6<SQ>{h5mLWf|{oCaTc<20)
zuOrK!vM>QwWORUm?wuj7p=RD0>o~objg5`kG5KVPU_a9XU~?u$y$9UBnwpw|(o(R>
z)?4HDbzA73V6b)5l-i6DSze}3!lKRw+*S+U@Yhy}s%mdz_hJspWP?rJ30TDaFb+gM
zHUc%PePHV((3lj=CU~tR{vfyy0M$nqG!a=(Xau8z*m^e$=l}o$Fb<?Cn11hF-m&BM
zYzM3RR^$=IE)~n)Xe4oqkbzB>DGpkY%BCL^Os6PQge3hv)Rg~EyiTkd*?1G3pPw)D
z%-Y6g%syr=Eq@T{mSFwda4q7eO$g}WiWCwaM;rlRQPd$4)J_AS*CRu_H(s}<l0g9&
zmcW-5%fyp#wXG#EOGxyG32mV5+V*F^CeXMif%+I3q!FndixPHrrBgEPuoADA8q5z-
zQJk(`@bUcwoCz>vb~hle%gFx8>mM+-YrnQ9uLPsEDnFLEcCSOf9<PZQgf}Q7bAX9k
z3{l^oUq4EK3!>shQ9KsGu2);IYFN&1rSvwIf%i;asN+K2ZY1?c+R==x+D}oE3Ylq+
zA1H<>&_J~nD%0|xAqBOs?9y){`hLQ23Nm8?O1|Vwp+NzM*vG5rIJ6e#AwO_j%c<r?
zB>^6lQvq1XNo9%ve1ZZlNhAY60;C$SAejUC3Y95ax-J*70`2s~8Hn~HpYnmP(Mab#
zRl2D@;jz0m!-;Re9Ed`Qw8I?O^1z(o<*)5n3}8?I3?+q}r!&@>H?DaOH<bR_B<7Km
zUBs$a+|Tb~4k!>O&kDl(H3Ntd9I8roP4o45Lo$53d#t8rqCg4nvt%H~CT*#r<!0{|
z+JpYK`ZlfU0BeM@;oF}t8um0#$2B-Twv?w{=Q%wL#OIHd(4CQ-e5RXz90hlx09+1O
z&0z1?Xd7lezt4So4Va-{o4EazZN49&DN_R|$XF;mm>wMs2A*UV(q!Fw`TUKMQFKvJ
zQT#NM0b^x5@7Sy>xCNS`$j&iVV9dYy>Pezk_F9Z9UV;h}b-}F(8cj_8En$HTZ1!v5
z$W+(Wy-YNRrrDSLwF3fh?7L-8Ehu{M_^@lHgVh%DdG(;Wa-{6WPu!M|99TBeYHy-*
zMe;{;YiqS|kUl(Nvv#@RXa0E~9!`H@2H#@s@~$6*)-U*(kq<YxAo6DFs;UT2PEHC6
z3l+BXt<d&fGX(=z^jlsYI-b}$4bZodt+EVw@e-8egU-t3XnSr=LvwSuzyZKSf#jsS
zY3z9bdiCmx)N7>tq(_}n)uHqiaG){}z(gD6U`87_YJ{uz5Y(x{SGwc4Jr7qRPiY(K
z-1iJ$I85gES4f{>_jboUGX*-3`3wi-Z)@wHOP!y*XCBM*c<}S`eg}WxiR8?NiVJTn
zVsnGcbx%UR5&SsrAyUlx{>#}sFXg$i(AYu3q1h}0gv;zVJ`jOArQ;=+5eY{<FzoV)
zRIPM^|Ccs=>&Z<}$FIbJ6KezzAeq<ZaY{<cAXv1^%ggv_J%D*;yE6^eIlu@0S?{zw
zns+r-V*|=9fgvI2*+%5dCV(AV{iKt+1TNeDZK7|olgo>Y(<+EkM#d+X(wD=xd}EW7
z;esw}1uYZ+S%B5A7Supc?oD9_12)K?WbTf28>t8iG<$VLlOxafUTv_F5^NreKR+Uf
zS=H^6#$QV|N-AK|Zm5wMulI8#_gTgPAaZqe&6x@5`VU2gMEBU2m4FT0uz99ZsUp2C
z=WThDBA<`;aTs6$<Ky%RJ1v)fg`e-bw|xSD8xKZ0!_@~0AxKrLrM0viE#3u&<>I#O
zz1K3ap7fqcA@;c<`Cw87Tk0uAplxSaP7sIEoO5cjkcW}`EL5k;;xjO@esw>~jI^7s
zH3o1CJcNRxqMx7+f#l!dx_`d%V1Ob|@(U{h`{n~#sG?U{yfdWjB2n+)v`y>yRzl|Y
z@&Rc|kZ{r~%H|L2s8Q<V7F|vhh8~M(kRQOG9|C@S=O+?!gVMl_?QLmg<<CNA-~8`x
z1-(a*!qX^Y5wO<TF&?K*<ufFfdmSPsh*v}uwIm?{PyLyVm8n5~>U7xP!82ZNOc1cU
zmLc_DB>r1(|8%n~!^*YBc{d%$+c;c7mN<NclbaZam?3y&)~Q2ou@)nRtKNs{v6p|u
z4W)Nvnflv!X{(K?VHg%XgG)U<hcFx>TwBf1mo!2SWC(cCiq*vZFt-UJrBB|Q%6RWO
ze_oBK^lqJ99cUt`Dr(PnzxLhfqJ`3*&1N7|CPRBXIrWcC89dWJ!Ac`jhWMx+yZ-nZ
z<!90p71GCZJ)wJSonf0!o@9$s?ZO{uFgLwE_bfpQVem&T`xib70%h7AFcuaTbqla^
zBqq9AeRay08GE<q_ey*>j9;}~S=l~Hl1wz`)vV0W+q)#5k}6FW>G;g^%<%CJVz-sE
z^>9zs<WC*vA0<&{5BP#76~G)|L}GVH{117u;%m#)ox#=%o&%~lkj5-sN;7dazTF7a
z#Go8-Cy;W%mu#EQ$aHH1n(B2k6+Ju+2SWNrjHq9@8GIXEFS7DMgv>9sw24Rd?s=gT
zXLp$ll-YUg?2sH>7HL7rA3|KC-@+iI_NDC!;@;TvEaX$7t<xw<3@D2Ymh}6jNeD!4
z8K^e_seo=zGc~NPrOcfIqX_}5#=s!`t)YQIDM#rlU_}vhwtRXj<0gMTzS=~(N}!Sm
zHBoOAhzjam?YUYZe9F&{e-pCi)&vSBz`rge&iGl|M{ROgy*2hdTq)BNvp7lgePgQw
zujBj$c9gg|8{}K#sd{8#3(-wusM~Rs&foyb@z(ndKkVbZ_mXYFv;}twZ}<CIRjswm
zdY(S98&+<}PCI90`xRN@3ZlviB?ahcU`y6T!2-&3Ee6c>+Fn`2vDg@~*eIW#*2p!|
z9f-&L%3L4hOzjI<TeGNO`!qH6c0}KWX9~<E9EUnrY;Lxe_*pdBd)Jz!u&l6zfg2PJ
zdtahIzbn@_rOs5{LnKx!8|kiNGA*YjcJE{Y+)2d4+ip+g>X*hFJPrKg-_FhIB#xol
z0}^m{HCEmQY>|rcZV>P(3B0J1VOPuVNEyqfC3Nk&tZSYOJ7gC5c|nVdilj6(HD%*s
z1)@Ib(FnNdX!G3}{ybdqP4(KcXCVxov3KuQW>lI$&Yk<TmaTpr)-=C=SR6d<^GKJG
z8Msw2kP!~orJJ@pDq`{C_PU;^4Zp5Hzc8BNSy+^GRvYWAP0wCT7@AQ?-D0Ny*C5W0
z>59$LRZk;<6i_HMchXtSa-qn-&FIz{kWzb_RtbG9^9UF+Vws%pONaC)INdZ)=SDvx
zJatO70N*Jx2eNz9^Eoec^Ua9`G>dbU)2SOHX<@Utyz3<%rS6ly!1Xs*4+t55Y@KYD
zy)xQ#tF!Bz<!nZW2n<$YJZz~eBy{_;5|_ZaRQ<5Lo!q~fZ2l@UpWAVt``pBvRqX2B
z*Wpf_hinx(jw?>?{5@S~h8;<ZcF8Yp^3NA`o}yjUs<aEsa*@87z>P=xP0Dt}im~x$
zs>83CC0T1ZwXANsVI}Cvw&PBJ-rvNJJ3DML_QJfPrBuPJ7RnMaA&$jXIz17}KV};6
z7RN*M9#ukKhEjccmf=A#Vd(w2(Ty>Ds@^`=D_gy?X@_S!uI-1^o*g8hd!&p<nP$Fz
zQzp%Ck=J8cbqI1>SdZOQR|>D#727vEN8GCK*T{mfey7W~frG9_^vT*x(sm+P*WPdq
zW{=&mUglUHzq&mC$(Z!q4-s@jAPmGw-uT$}B9<owTY004o5f{apE%JS=Y2;n|E+S<
zw3x<G#P;4f!F=mJ7R9w!+mq@fv7;|mf&3630>U4vLd!lgT+4JCj8$0+>5RIG%1+DA
z4J-_y&G^t>L|amy#mko+rS*1eb~>k*<klAr+YLx@XK18d*|3y%<x2RD-FJ<a7vBAj
z5SQ%QOtbo7?rNs$BFyE_q-FL(dfs0@O%uP1+q1?!F<ellvz(vB&7-@dRpU2jv2WVp
z%Qo^-L#`+?k|D2BXC0S|NSsd3LQ-nUBY`5<1+EhmszK{VV`4q~2J&NFLl(eE)>3_k
zkHMpGF`o_xSK`>`(EZ!%qk&53)_ClncApt1_`NU=`FjxHO!GG?qG^BvI&k&2%9?P&
zBXV+b7(QY~!!a~prz%ZDhd3)&h+i)732R{N0a(T+=lc`S|L`yz;mM{~iw)Ta*CjVd
z4+Qb`6Er{+iLD9XY8yZ`RqUImcHAOKx#{?ccQUH8Ir#LXKS%DQ>W$ie`nTi|1UYbR
zxuN?=8w&J7D0?rn8=*=v6-j(?c>D<syCZpbj|QcnitEGm)7gQVaKxC&j(49xy-m=N
zCUXIgDz$0jVrz@>=7MtAqzqk5a$?<3jlw`jTd(pNA@}1|V6K507`uZMjQB%Jg`(#x
z`R|}qW(myK6<aeoqdW_;?S6^BV=_ajM7TB0;UR3jA*(N$`C{Z>INo3HA6At&2EU$D
z3M#UXKgk?vsHqvdj$twj0yMPZ=`E77AlQg-<@80<DVPW*0@XJZAY$vLmwQa!zYwCp
zMMb$c^0|udt+cLWZqszI2|qIf)*LEZZ`W#P?eq??MK>d5s~Fd~4N+q_UT5-iqk`TW
zVQ;na65v9e$s)z2g9HDiIlj6&lSf>&TBdv8E-=~=%8?T7sX4&?Se6N}bY{sNNNrS>
zzX!aWCov}76=wB7UYZc74?*eCq?Z8-=U{$qTy60_vj?^mTDDv?Un2_|bbmpTLdf;^
zx^;5b?akG}MB6PfXY;;Pc{KD7H+*Gx&HC)MByaJz7?SXK3b5i}NuRt7!RDUbL|iaN
z(23oin`RdqKlx94@oSv^AAqev=r}zJEp(7U&9}aDK>2{WwJo0j1uiI?MZ?n`9-l(<
zaOXfpi&V_Vqq{T*l-+?pVdl#F%JfITJrKvw{4pA^=fWTcmAYt30sl?bvo(Yyd<A@r
zQ0h<Ts`EVvs0G*5;{f^orG!%9!NUNd76nN#d<LZ$`QtmwSX^!tKyh+N8XKQ&{IVb!
zHLcV^E{dPD+Mjj(4GzVfql?4Z0H|2B^7EHqXJ9e#T_iCmR7~&ug_$d<4AAyI2WqF!
zLPPE&QAZ`R7E2hLyZ&0ztR^Ivt@oYo<3O4Kgl7#gkS+f#j{Eq^mkG%lU3+Js2?}Iv
z>!{xUSs5p51DT?oD{|MRMU=0b9hkigdb1s<O^p-f+g8874IBJ)q5u#MU;Y3Do>vR9
zQr~A01k8--osfwwU~HiF!klJVDd10WX0n*1Vi|ywie&jHC7@{5NAdNOY1~X!B0nt}
z!L(f#Q1uk>55}Z5tl$@+QtC^Npnev&ri~E5)hgDst6`O#M;vuh%6W=?1yGC{)LwtY
zAY40lpkNFXZw`r<@S8!P;Mq_AmJuk#0*3_{2jJhaZmxgjTS}1uxbl}pPb#fq-ZB{^
zG4a-TF=qA*xc3icx}$JtmQ%|Z-2cs%yt|mg({l?R^}$8QZ$OakytmMO46J-$x?Ag-
zWV(ZnEno=%;YJ#wcJ({1W_v`L70e}|Ez6h98kE8#%V1Os%JQ7Q(D&YSE61LMKGR#s
zYG@GL;4aa!nPDOhF?N+JMcS-8da9vwpL^cG(X-ecHlK5&&sH$x;3@HX<&Z|H#%6<=
znT#N_LxEVNHD&R$<R&8@1Q$+hd=J;KCp5tv!Ei8V(q#TFeEv3zDdljFW_YZLJTcg;
zceLI6aHZP=RF7O>fr^!6g3dWZnp~e_=;cgC@{|R;o%{~FpP2%sA=}0v<<&RZeib7}
z)lMHRV&cjh)~;2iZ{<pS4CM@xon`!m3HE((vMxv|awh!Hv@buugIlPLR}cbe1pV7T
z%QCoN6%1Ee(e|E+2^HYZ-Rdtg&tmN3FrlemP>@qmOBb^nE~MC-n#Ipd0nFR~Olw|}
z3J;a!zyI6)iVJ)$CM?75;D;QNtZd~`Ia%${czoc?17r9-D9h76jd(k!0P3c|i_lFp
zu9^gulNo1K7@LjOpqo&-g0?I-Vcm4P8<j-{;vS8C%P-rN<*DRFFL?A(o*$vQ08w)s
zfEmyWh~4oGlxc87?{g$6DJyNBc=0kTA{z>)tQ4*X6>{q&^<tuJOwUhtcb}`}!$;0^
z)FoQ5=71)Yoe;MHDM;4S({mV+keeRMezbCQ;TnViy7p}TTV<Q8gKJYLK(WD@;$MFV
zX!~fqYY)Px^u2QHt2EL3uGstb!cn0l-*A7tHvu$=fU3@a_7nbBNxSHZp*U{t00h^>
z7SQ5f%yoTtdbWTE53G~2-5&mozdIua$EPEz;xYj0%5Sw|!(Hz(rw(@`Zu67pNpkjq
zckx<FEAhZj7i|w|YZMkf03uH}eH5_g+w=H#z3*%bMb#25h7LVDppqR_0Q*@7EEk-t
zo}!t`Rjk#QFCVOc-a`92y)&K4dv71~0~Z%wUQi(6?JaDk8!~G@F)Hjf4Z2LAA9T(2
zfH$N|#E<kg#2FL<I~HHxFFpMJlM5UC{ND_Y=C!|<pt6D@F>|_sjx;Q4YilX4J9El&
zb928T=sQcnO#%W8>f67WGAt;Xlh||v6B8ecoG%2)kk^lZ_R%SVKhIhoH&5@#g4;lq
z44}b;d^wdW_kxoPIKG>LgF>BELatCo@9MB9sPX`e0wj7j=nLW^2bdY&mkXmj<~{L|
zX9^S4RJU=vmT|y4XikqiR#r-6Yj5^E0u{dP>a%VE4^E@jW(Jfx8ca}9zriX8!4Pb&
z3!E^B9J0eOa9IE}Xfgkf9>dOb{>e(6MCB=E6bOKs5C2){F8?2eZmTio7|<RwvH`N;
zrWkey!;Zy}a2w25=xW7);Wi}qqrFZnn{uWYCHSLLX@a)7xj8%y4BUSu@35r{L}qk#
zb#+0sX{L>pRk+CA_0gkPa=z3);CLG*JNB>q7TF8@y}2M4Gu9qWXY(<-Nh2LJ;wr?_
zAv@z32*RUl!kx6kyI)NGl2FX#U{>kbJottdrhtpAic{d`w}|4{(=pgQg3NZH(GxZE
zfX*o3jDmkbLG6KqE^zbYYqKi#cFv++2NqT9mTI8}NrDDa(7t<}e1q)J`C>o%gpI=d
zR}6f)J&|1x_C(>$CP<{irTOFE7!(R9#UIolUUdb#FfPaSQB!U_-?0C9|A*)Hg+R0@
zHK7gXk2G&To8w+O-}vwCO&~D++A>@D9%-x>^8V(+K80>~v_`em?`~!;Ow4o$`2Nuq
z7(SzHABo1;U$%L|we&0I+{2XBPsd-ke@kJH9EFs$Sw<}w;2&J|eXK(eK0Oz95bs_&
z3f&KvPq%?1MFY~F7hmYv9U%G9WS@5KW<ypT*gH}k%H+KnEgK_leCZ?TfM^GZYg{+I
zale2B`{7(5%`aub_qvZkpY4V#So>LhatnIQz)QJ;>Ja$@&}dU}T#*$>@IzSWbjg3u
z-P8y^SC26A{0zTFi(csSUlnt_KXvwb%$mLEEb_mV!og-eZtadWZr6)tP&UK`y&_+|
zFU~s#o&smpi30SP1-8?qq^8C(7SpXgL!!F&$Xy<YDXq3TtkqTc!mc0(Z$R7OljsZo
zexn8S>b+DGuK~!xp3~|H(aV;;-xUj`3`{6due2Va-UEK|4)chtI!fK)6VecFk-8V%
z8L=oI%qOcG=}1E)kJ_}CkGbD3mQB_zb?4s&Ji9#%ZAQ13xN2q2JHy7unFH0|4Oe7w
z8xoN<XJOVe#E2XEQu&d!Y!(yBAft_mF5k|I+Aop#(Wc2``nE%%=n;6@$If!j9`6N}
z&h0(2kFg3o_?X`s8pvR5XXnbH>paT)hG47jwqDsohQ1}Uw8sqnry91>cZ^vQQ0Y_U
zP>8);Oe@niL{z%ZX9QyGzGli{MCG~iMh~#yWubZUCn4=rC7)0wKQ7QS<rQHw37gKB
zp=Q5qX2Akt3V3HtBl8y$j0^<m#~A{J1kf8)K0!Wl$amDXoWg5^G$24NnGe|!73&o@
zZ(mn@F_%F86<I5FPr*t_W+B4Ly*AYY{+T_UH?ZBq37<W_-OK^~g=0jyJwi!}z0XDx
z$+<&dFjW|;vBiZu_D3mDjT(&)?Jx!g!$#Q*QjioCv1Auh$z3>(X0y$FFBzBi(i&W7
z0q+fbw^{L`jgkDV-4S3r0{<+P!=s9=SXWn8(tYsLeSjbll9ln#d9H;+fJK)s%x^`b
zHAn4r)t4bU?RrmyY&D4JxBbv(^*8&8XOUz)uQfE{hWsWB+zli-6&;wj*kYF~Lj2*t
zIRsH|Ui}jb0<NK5S7UDhh>WcE_X4699upH2*`VsVeDnOKgZC{==97XGcSoBk=p<oO
zPKiJ8i(^#o*=(^O!kYQYYPsKk%Br8o4$U5-Hkp41E}b1Egas;gy0gEoK=1nJGQV!n
zcyGm?`j;FHXD&a(i$yk)H0HoBMm`5_)f;|!vK3lZ#-WjE^ZN_!-#P~)*((Z{Kiq$t
z+#O`E%)B`u$pTJUR<inFUS9C>-V3W`lZ&SMd}&h#*T!gl-e_$lI10s7N^FfCN}WO+
zEjGxMoBR#?^F%Du|MG##6JM_`=pT!=)AVVs;KE;O_oMqQ#XTnw??Te!ohZ+$L%FNz
zsvL+X$DJmkUUyAUL2!5O2&h}{+`6vJv%a-o4Ss)l4m3o<fnVL67i)&sW?|%M!||}v
z!R1zY>a9Kihi=PEAo@|ju&lePhGBw9G?+RS?<Bj*S&ha^bhF*)k>+d%p7^h!`#nx=
z2s~MBcJ{VZ|27NB4c9cces@RUeVSJfC>;sv^YR>#`%iAyE#*FQbcW)nko&92UWwlA
z-z_wH-;O@pI<E_T@_g=@Mjja=9Tt0orL8QK)BsTNdtRPwow&R4yVhN?v+G9Mox`Yp
z-+G7LKWXs;H`y12GBHas5Aox~OkFI4@@$RYiLHq4SS5+yR+D(2_6Rc1-(I{I^z{Wc
zTIgMgm*!amIDH^F*MNgCyuPo0y6nB2;eCkcmb3A#X}cea=ll8YW|`b+b(+8K9WZ8}
z&N|$Qk*&Ur4&CA(u->e1vIDhsv83^3xihgS$<>!5ViT>L?%$2t{B*}E?*g*9w$2V-
zGHVz_JK*|#1go;&`*F`ELkU{;5~HmvYS3%u_VuDV5uok9<7=w??+muC1QyQ;M_7zb
z=CB|?oc>fa$DfS@x7HN+f4r;Y6Q#d~-&6g(NV>s<YTLA-T_1lyK%?u9zlQE-YAP+(
zznn&MI!tzIyrNwH8?^dq&Gc&Ru|DsSWZw}mJU=goL=VUB+2JkkXx~OILocH>(pmRO
zL&0q?{((h#m9tNt=u}#YjhDWSxNM>t^1ZD-AlXcGm>-U)ad`2)k?}}PvSo<BRVgkj
zdquo%dIvP>IX2A-Dwm87HQ+_xb%sv+FG=3r-gvV)Jly;7^M}ujpNo&STv^Z2q6D0Z
z4cI0nI>)H?D;c9WGhVe0_U-iUvfAejS3?uSyBKr8=FT|x6h%+M>r|!oFyxod6tSQ{
zZi<<K(>`_&?}JRI&c3rn_=D12Gc8VU+~^rkvMwVJL2x~Jn@rcOWJ}=liCi3QRW#YL
z+IxRv-Z+u>F+h&{^OlzJa+~(glhn5Mi>i6^GrX@)tv%xcLG||^_k9SG`l90x>l()N
zs8*9stp7NfAh7ebmb2xOjYFUu*Hkt$M(bi(^~&+8k19S1r(4OhjJb5@fW$DS`f1^_
z+M0Cx!6>`#x!<B+wIoopBxC#|JGxD~kjDHl){my?+M-!Pck#=&Q<B2dLS;HHsFAym
z)xHrAcvKP1MtAx^8enm7H&K9Whs6!xb;qrk{&}-uxmS=g=q|9L^dIwPtq`of*LHHv
zU3&QO!v(1Z6um1t{Y%`T`<&i~D_`;ZkXLsHW;1?O7$bT5U*l|&?7kok+7ECMWFdhJ
zt<yFrei#i|ddR(@t@N670ZZ*CiBfpj{=C4)Qc{fJ@B`?L5qRoe%M@XaPx8;p45YEt
z3MmA=fad18_QNn~%Ji!WE-q^S^OF&)1lF5*|8wQtp6Q>$72AFO5J-r2)NkDdd&2{Q
zK+Zoep?wUa2EOEOb-p39b_e`2a#4bMgkgW?rnD|=?^)Zu4@AD3`5Ij>%sN0tU%4A`
zWM1<#Vy{xLen?Pb#O}Mc-}ydm&7H88LvKPr3wVQtqF#sJlhCWBBm6F(D->)NVRR4=
z5#IKh%@4`N&Pnp-MKEfh+J&oNx=m)MKeeZE5wJcM{XqOFBW!-J*I@l`6?vQAX;A$k
z-kcvAb?em=78bcc|HzTj-pUTy1>E_p`os5Yb6B5psgZjV_?IbL$DIh;nMH4AjzN3f
zJF1`4{HDJvJ6z9V-0eHQ9h!o^2{A8!nI56q(7Nxon1uXOD`20O=pVj_5--F8OB_Ac
zyA-s4>-&7FWBNcs!{;S34n)rO2OO^zG>80lgSRrR;}u&$_~+%6V8?Z%+;3VnNcPy3
zso!sItnP958wyd{^82=N*DrISnKT~&-I^cJEhdNL%{oTgUm!DqzxZ^3o__;Au-{3}
zW{|ZYguia-<TE!0IOUv73WBqK8(zNvqqgZu*iAZ^|N5a&h+m%Ye#w$}aiR&xXNx_0
z{)JYw6R)bXEwT)%R8_Mr|1yh1gjX?AoO4h7PcM?JHU$cN3hV<u+jP$7D}UlD@L}@1
zsq^1zt<8`vTmenhCM=3VgSJ{B4F@S-Nxi4et9uQ^8y^x5G%#_lcCgA>Hd;EfS5gZb
zDAM#*Pt#2ACcSEs2Bp?CWh3v}S=ve0N2>&A4(cy}XNc^w4(>31l`cdKsdzu1bUxPM
zUO3&{dTl=I`<W%W#^$U)&EQk+fJiWi=%1dUsv}b>jMJ^FwhnF<N3?3cCEl*%h3f-f
zAfgK#J5tlsB(Bq+8>EFV&3$d-1;}|Zdwu2Yi*0RZlb~)R3{J?YzcCrPUsqGpDOz_i
zf|GJ<3DLP~S9~RMmQ~%WCw2>2?iZ<>SOvS@S!hmpAMRaw>kH1fcymK`TzBrLdpl{_
zQFe75Q&WRvD=v=Tw|ID?w+sIvX%0$u<}-d6ZMTXeh~|rqu?6a*cfXRI&TrX=@X~+X
zT%L(zjV&*|CkA(2nQM-7d)f50K&0VGewrT#$ZRTA%58o(yd$^MV4Lx{%nJBP6LU1I
zy>EnWPdGrj(nJM!z4njZFMpz>^+8<KEhI-M?wkk=hNpCoynu4O{5P`U_iR2+trpm4
z096)Zx{Q`PWob(fHTOy-mW}+Q-&1?@a;~<tt6b1_`OEn?{D3-ewkMAB3Lvt`QV{4#
z)ImLQ@P!f8C5L)Sp>tlbtCjmcHEk~mo7=<&Kg7XD0}4fStL8uSx138~lgOq!l`(qU
zZSDC{2-`{z$51rgUNP@p0MKN&$#X_Uh9!JG$MXin?c60CC9U1C#8U-#??=U>Z0MH9
z`ohHD{r0xkYJ$%>Jn&ekBg%zN7ph%2q^8F|TTVA}+qbc^7Y|X*l|7il0ME=$WGD++
z=ZiMuT(w@K;Gv22Pqw+M0$WvV=b-J}@lYW@N3$#vjoNc~og%?zV46Jq13}o`6MqD~
z(hH+yj0Au!ez))Y{3>VBz32tIRI1@M3&gCC`=U>4?zY_<O9y^2yq54u`Yn3KYuzZ#
zqCy%TK`3i|s&RSZ@4C0#cyl?n%(!6~m+|(lZ2nqV<MA{luhO%d-EW8A8?)Wwx5R(G
z%}(`ynh9<wt+=2d0KUxaa|1`FYa<@XI6E%ghUbs3Kkq)^Yv35Dv2E=laX!TPVleMq
zdviHC_oZ)bt=##h>g#qKYkYFVN*Y`57xRWQjLnZA?%`{L=rikSU$ri>atBOajgh5P
zE;O(VJbfB7d>bXoOvwDShtY}<A4mORi@40}Slwbo>aNlix*4BBRifAtK1#IbLUTvb
z5DL+qZq18jX|3=Fjl(!IJty<E4QK$FQfNPre@o<N{)Rk6^R1r?8=ShS63A)=%*x~1
zirUY6uerL1-@ozRUUQvPU7JYi;U~^>s>kSS*>)2M9n$1zhv@Wg-9fSJAh?Y5m}_cH
zfqR8Uh>42joB_`~U6Mt-!Y-#6GWE2L`1?18wuO0~Vk$sJDf<%<+uKXFR`-K5m%~O7
z2h+5BGw55ohz*sY#ExmlU3Y){3GEgD=(p$o+IbY+PTx$U8TX(^5lIy*8yMVqO{$(3
zS?8J&xRlpV;1upCn`IH1{d32mO$YO;{0x-cm35*N{JzwX@p2rx{p4q>OesM|JMfb`
z^1JU%$jCeV!CpYc{~+O6^iU~1+hcvtEzV`b;<H-jf&vk;{LgKeTUzX$jhaf!rw<+G
zOBz%iznS7?Bx^)}2<nNAni}%9y8yEPIE4nYVtU!Z{=!ELx@ZFGr0=+rHOpdOv4gSo
z^hOQs^bfKQynxyYtpXzPA<6KDeCnq#mt9Xn3SNw@!7UilBVv5YsnhnJr{^Z{d;vaV
z$$3E56oq0<9lS}~N!E;yzt*H6N=yfWOxN_WDc<CHs_6Q>3+(&uX}|8&!5vJNCmg$6
zr79-jarO93IxlV6i36o~1|t{icuNZ^H7_UJ-C`!oGpdl{!=<o)CM+tvbQ!Qd4Z~=&
zhqBlp=m_O9KhXZE$jNp)?QQ~_cj+grBt-`c0^D72HjE-_Jym!F&AqDTP9oplT^4r8
zc~-lX7uNS5s07w`^#%7B03WMH5xz3LaR1hGxyZJCtn2eRIP`4fAyd!9omrg2v24IE
zx3+q}BmUXlq+gYPf5hx2X{d;kh<~k;aF>2%LTRE=>$O%sxP#kW8`WI~2h$E1Uw2dX
zv2)V#3+rdds4_Gql^rP>XYF;*bRT2RU;25JXU*8fz*;WgMtbY8=I{dEEpt)LteutZ
zV$}b3HvH1ra8D}Fwy)M`9@`VyO&HiMm+5X|ALrkED{_0#c>1|m+rAQlCw}m-ueN~&
zBVZgyPQ7%K-6QRDYjcDOEs6YZx2_!BLoW@mo#Ud#_Dp*0U|d{n+E*Gj!u(LSwQ)B#
ztT_e*T}&7)!hkfnoU9E80z}C@ygO&fw@B#NUl75gN)c_FSoOa?3QE4In(1rwK8U^x
z9T2|}SVegd-cariO1+u8H?%6Gf{Ys|4?=PMa=(%_m<;)pcx*gK<2rmZ8^dn2v-hJC
zjt5t+x^Y@<o!;CpZC_Q8`xRX0AbXP@{R{2YL0{Why@b@=JvpmQrq&Dh>)E?c@lcV?
z^=zlBmvWD5kwtpEyKQe$x25%*iI;N?$~>v+WVx}{qy30sui44e3n^qDWjGJx2TO^G
ztulG7+w&2_?MYTdj?Wch3dC#-DHCz!Lk5Pq7sB@EOy`DkIZZ~}FAk-l+5@k=?yHc|
z+qv9EuQl{KG~(TlzD>-#UihGLy_F!oMa4Opg;mLJ5ZXU_WOl_2ebX35P%a0%rtdis
zxPwt!Whn6~%5=9qrwck*Y5+B{W8I#{*k=^}CT~}Z=k)GD1fs|ilMa2jo|o5AwGcaD
zeVq_a<)B;jq5!y?DK;2iEiCO2uc&&VeHfkxj}v2d8-De_st|YSvK|-q>B8B`SM(UL
z)LK6f$B(@nIvO?0^F%+3Ok15OQj&o78NkUocdV{^PDlkU^ktdv!&+_|K-qElnYmE+
z+Cjc={f$Mmg@7@*ShsDdov3eaO?vTGCLU3<LT1%w3vpwdRcrz$_8)k15zmVYIT*V?
zqJ_AAp2UFS-DUK%<<nGALEW&wVVbU32ed<}Y7~>${%^|QK;OL|g$<luA^@El&Sz?<
zCM3C2)+#&pzgy&QU`2{4*sf~@AX6*0XcuF>886%X?pOG_Jj}ShfI1EMy6m(19C%_9
zau!CXt^!HQdV4Tht`W5oCf)d=xV0R}jjcJu$Ys@{;gu)%0(^~je-1`0I^L{J);sUn
zX}lIY7QhyC(*ej`*XPU4A>jb1?s)mlvk)qGlVP<NK|Rq!qiQ}!dpc*QOF0i2wy&H?
z(5Xb3?BjcOYfC*Yy04r7n^ll2_`PVClCXs5y4}89l`gQjq5&=UhpDmr%aF{996P`4
zAGRC<v0ZpGU+JY+hPrL;a7&**vP`KEdsA+BN}<J;5>0)3T$hq~f+Vs@59cSgM@HnM
z%(9i^6Hky3`w8#*auV;`d(a)VK)uX$E6nqPeJds0ZI(Q}GTZX_a#G{7x`GxQsdG^+
zJ{~GBj~m@+9?gK4ibkn_y^RxO0)^Y0fxql~KYzJ!4l`sbYPBAH;?sY7c3=6<ouj9`
z@fy8E@6Hcoj>hsjyW%Mpj$bROwv9&dc;SF@F3Ik8S2r2KcC`ERqCTp~rLrQ|-D#PZ
zhx@epz$&T9&y|ER!$r-oCsaFX&~<1<osWm0YDcf1PacRx2O|%eDalHmL$CMV)U-)h
zPnzjtFw0s_ry|d$YizW;uE0@0JQ5=xC&|t8KK^;Llg87X>7)a{APl@NJ%^T$pr_HS
z+K3N<65!ywrAM}>A&=2IMa{qC(~<Xhb|;c0I+s73NhAD1oXWBuGR;@_ZgY%B1HI($
z_^H)0=dVrkfs=YXeCnK9JNtH!S-Cx@PMFWa?d(*WZmF)%s&GJ4LO$H36MYz3oCs9W
zOVFq0ef<Rh#qF-k+3}Q#I79pqCBM&`XlUfkm-@B$c%~Bw_3}IAD$T)#n_(qRvR0AD
z-QXCGtTwIrY@Q$IO|X<~F_pJ^d09UGIiVK2FM1qDVQ7s7U9H_MKA>_1rTd^eWCFGH
zKBf1Mfy~dGZo#8y1h-Y2nk9y(`oq5QHw1bxprNSK`PwYSKm%MM`7c7fX`}_>(w3Qd
zs4tvza$bDK7fH%fCJgZp2bWpeEFk%HvPnuwpK<9EUb+L8?6e;5o1GnNIq%rX@BBb?
z2ObO*bsy4@4-<NSiJLx;mft}l+Z=^(DKBMt6qkTF11TlDX<nPs+i^;eq!4U&`{|<H
zDxj80^~K(}xTwFqqr#;YBCV7948#8gwm;G@i6R$M<PmY`HlmW4<c1g0KR<Q%Da=sP
z^%ScVUMcSPCH1vn58>(<oyZ+kyYO|NexI^%<GF6COj+k(uW&Ji3j!u|uQep1CuZ{|
z@RtzE;0%rEV5J<7wcMoY1z-I3nnNq-DqwyK&ZqrH%i`uuI~EN%RRBKw0Gy%H-N~$9
zA;sUhQW<}amX*yw5cd?>p<+9IZQP@^_BZ^reswf}{sXrrE02@Fd>54UU}d1Z14P)<
z*>bNLhBxk<b&vJn>kalwLVm)Neuak9AYPJpqK#T#6`HC}a664g!REmhQq&#r0R_(p
zEdh!yP$Al1&yId#kWdXwO5?6bGSGe7J;{w=^LVyAno}6)#-csrYShJp6;1AkMd9wj
zv#b6)&u_A|b~z+7b_^<Ff6jcqaeshB#F7=DdKPHq`UqN9K%-}NacrWaKpM1I_^a7H
z#QygVvb~WJzsXl<;`ZkSIUla}7grF9#6DHNcDe@99fbqtW-7=w?E=tUh3x5U5q}NN
z-A4BJ_ounTk%q6&t3>n4b6)C<#=V}%)2qvc_Q<ntw>DtMeOB&sSSTg&&m8g#rT<+N
zWw5SYA+#&*NK-5nMSZseevJqy#8Gez!`bf-I9&<>pS{sC#EA#|euLj%zvzJKu7cyw
z2hT_i(4@SO#UE5C6xt$=N$ZL1XrQB4AYfn?su}>j^KDgZ4?~P2%cz*~!3i3ouhg@*
z4}FG2!_BW{l1yiNw7Eg=rr}$G=s@=e$ci$5-84@5gv}-7d>4k;|GPCk2A^-(M!&Xl
z69py3F>BsNuN(<W@aYDP-7xTh72wdGa%K>s;5M;7)9-P%jWpJ0aO#X^Dw^$o9^7Cy
z@Y>lS9Tac1{pnu*P0Ky}JE*8bz^Ax?t^|IFVAOrkBYAZrtIgTJ#VHc?;fCEwd}&%~
z%qwnK0(4eDz<~`=H2@zgaCQYg{6(KbTc3+RE}P&|;ZnC?K4%npXdUwJgdh5U97+U-
z=V$RPw(f4*aojC0n>h<oRc0BVhJgMiA#hf8%Fcw(|8Ck;pvb?iK6A{0gL{=9HC7y)
zA_~I=r~Oj<EL?eGv+M~0ANfM((x)um5pa9Ze%W-orvc9La4Xk^4<L9Nv4V*{l4M@c
zqaT0zbyGrQ_}xB^Kx2QQG%^<r@~=l90~}q!L_w8&UFRZ{E^CxpT|c{9UAO8<B{J|H
zn&G1#ZPL&sUF@I3x(g2s8`7jh(;BoVR3iet{Q6(ev$~-=iJ!MjUrbZpnmxDLIcDT^
z5i66c)HfyAyCi@A)A>BL9>%P}nGfD|WNU#EHxy}JKz_tCYjF9T@8?Xz*d8XD8`sH^
z@YTF6b^4%>kJ|VD)O8h5QGVNg1hD`?Km;WuBm|^GQV|3s6j3^)ySot-q?HZ<>5^_~
zDFFp3>2g3yI)<LNNALf>``&tQEnO~QzL@Wv@9cf{FSdNz$d5eGVT1lSJbHD~iXYY?
zrXV1u+I9dN;}0tY#OaP`Sn&0Z_Er=0?<Oe#^=P*m9_@0~>sFuaYjwL1nLrN;mO2-(
zKm-Zw$Q!aBG0rw!oc{@s*MoxtFzQItnA($64n&-elFKV9W+ABjUQM~sS+YRd)A4nD
zEtKU*#pYtz=j8W=;WAtKb#)Xlfz{YUhx2D9$HwyR<f#L0v$eGqvKL;TzP{_`STzd*
z@C;B8;<dDjDDMH;Cg}$SkpCm_nO_PMUT9i%TA^a$yT{V3Mk2Ypk0HNVTVvz~=N12w
z|No`_F86NyqyEklJR>D44G#fmGC-CYG6O-2Srf3WmbV{KVk9Xmp!$9cbs<hG&0i~|
z5y!F?uGm^Y*-rB=q9wZYn!~t<0)vRF;cRH*tOIbQ4+sQ^oOlieb9O#qO<3{6CKh=(
zH1Lfg+qv!~_W3MNy!xWj&W5Qy2jPDh{!PH?IiEbm8C7aP{kAhIHdaZt=arX(ryqmM
zuqTRWaA-(zSmsJ@Zti_GHF8pR?e{|w?)7VYo%A-cfA|+5UnxS1=&C8#GcVfJV2lBf
zs=%3wF_f?2nx*A5H2PVh-yP`~iD-ID^8j8|0O87(`IUQEx}m&lt?r1G);tVkoyo_K
zR1^jEBe*A%<A%$6*_u|Xj#&#W+7i2898KYn)CJ1{7zJ|g_@M@zdR<fC^z6C~TpH_A
z@dC;9-ex4F%96iy6!<)KD0+o|<Eb(o$ckDBndBQ?k8jYPcb%_TbaXdx(=IdarR&_4
zHF4FT<r&Ku+=-VFBzYs9Titg+2d~xYXXe4>JDgeZ84oVV^nPwCE2a=7LDAK*eWy<%
z8Zd@cC5i~wF)>Qm%r&+c2oIosO;g<qrm)>6D7xR+GTnMI(%bmvll&t*oOPmG7@~7x
z?lp>=hsj<ClJZqD8%b4*)5Wb%;3W=9`j&^6`pjPpD?a=p<Lk^-z9p_tk20Ea=C#u3
zWr7kFZ-daOdLZ6Y=92lxit9iigg?$drV-t<)<3=0>h|2e+STZmsBYBDEaWd;k)D=p
zu9aXO`2BK`L-s!5%&yyzjBu-_a^N?C{JcwIHG1c)YJS_vZ%_%D(Nicb+sLVrC%c?l
z@0`)Zg6JavOp=xI3VLrc|K{6@ttEcvX8FDM(x+1&*oskGPrVXjR|~mnT`$RhM;4fU
zzC!DDN~=f^S0|Vrv{Jw^y*E;tq*4h_2~D1AOdcm<T|Z4BfTDPAo0d3}OKdfSBa9OS
zuXF-@w#(Za;Xv6=cO1MgJr*KGx~aOkr}0oFl!PplkNFBPYgFr8O0w!e(SqOrG2!G1
zgde$R-Utco{q=+do4)-`mj@&z*+<iFqew4J-Ac>M!~sMM6bnzsjvx{Pi$Q6Fj|S{W
z>?Q>hG@pNl8+cn<S{kxtKRh`2Av5!}om(PF3y0}KCZi_-jM6*#Qk2CWxRRa2u_jhx
z=*<W=9L-=W(K%UG+5cVE%T?F{U<ilTu?xVFpw>?)yt-DmtnP8#_Yg4SO731kd?;sp
zF-gJg5ysY+3hgpU!53<Aai0_fJ$TVA*%(ma_^;{i(2&&0e_7bhm95OY^aZqIK#Y+V
zEbs&dmdjer0#{xDposNuyzSxC;b2SfVs)$n5ad>+$0CM1x1O$!j|ZAJcyj~X@xtM`
zTe;QAk&D!^ar;pyCGRDa=D|tGUVPQ@m(P750>UV(e|<}oLU9~ItYx76BWCnj3=-_p
z`@HmBqeL_oya5zJ(FE++p2hB{K@93<Z-Si#uWRA)sN`e4-Ug9s<S!^Gd7<ZuY`Qv!
zst<#x+oI81v$Zyx2h%N4+u6H?$LPt|!R+5-Za@ykweq|x@UG$p-$jrNIn{z{h>7`G
zjG!P&ccHKs*#(uJVzbptgI-4-@(12O_>spKm40>&7kE>Ti=pwpen-0Loa$J*TGv%C
zQ?ee-U{5Ywrd<0RQ^_ZAM}XS-Tby<rt^%H9^Y~fd8Ufn}D0dW>1Nw%Rlsv*uC}K5*
zu_=?>dFOeJll|^bU;h~6=&;s;!Kv1&18v8RG<uSgPK&=fXxh)>RR8*n%QN3Cy3bE_
z+QG8l#;YJFgray*$jvMaPYRS<KqTXNsR1Sy+aYh#Ocn<h;T4B~1e!PS=lF%oztv&_
zU~2oY+`Zk!fz6cNb@MJ_5kOV>1&#H(vu|uYO*VH&t#_RC@}EIy|5|I~^gEB*dXo|N
zeaFQQjGh!pfPIh?!^~z|J2{0swY0PZi!*FctgC*o0LLnT?FEIDNF;%1%F76T4L&Wi
z91gY^$fM)a50q`Xy^;ESaEvG}0)vgzMJ<e#5}bxoyLku5_KIBFzHXJ;q~)74Z^@CF
zdEH&Ve_<O0dp!L7s5lyst+Q%Y6UJ&esWq!lK@x4t-x#IHc?n}UfCc!z!51t746wD)
zc2Ir7m|Ttlv^IL&SvE9!sqLS7*`H9xogzCCY0VzTBNr1a%Ondd(>}~ta=<#XNpX<@
zz`Au59k}D+tQqJx7^?XL=SwP)gqH5bg9i^L^dUd9tNHoKD$0tOj?@KHDaGx{Z;9(O
z&Un7&{(GSsKochR8xfIPWxWQRUcb+NTv^Mjxvk41Tbb;>Vkunm8(*MDjG9=1((gyX
z_|N2N^sb5kaxT(#%eJOg=F!(+->PN_#nvGeo!8PRE@jZAzZkWLfgHRyH4tYF#x&s_
z=}u)Nf|>dK^DU(xuV;XY^G~Mmx_5zP1dCuCeMEk4huUYH*dK?11-+mV%-LY%YxLg5
zy1VGpPbt-^Q_m1;AsS=-?*tUWZ@PO#&QDW_zH;VoMP;v#OolzD_IxE|EZ4C+eWGx`
zke3rM72?Gp3T5x`84lWaCczVcV;)otkZ)7sox{hS$z{S7o~u>ZimmP2+B~niaTGhi
zGF^5<L-gb<s^{3Ur#t#XQob@jf@h?H+6U%R06OKqN2U3-<oI1NawyHW!PmVgp+_1X
zDnW~&1r{nMdD6NK;g=K`dLU~nylN38&c4Ah?`s;zaf&66sZ|PD*HgpzLS^vz!79Hy
zN1dI-l=gb<4bf)yWb4o6_Px#B6ZNh_1e{=0#XvZ`73M(Wd%?H)MSHVKj3Fp+LHIlB
zJYH#84Lk^L!6{H8rdQ4-CL|DEdM(E<Tf{=tTzb{JBj|Y%?1rc?mWhTPJE!N;`^UVT
zTNvYpH;BzFW0^3gp56*OO(ye*_Ocd%G#OGfwzU!6fFPiv8byyAz*>m#0pB}}ry%4w
zT!(gTOU5NH)F%lJ9jqc%2gevL!AZVfC9bO_JVSeBx8#p7HH~T6NrZ{+i9nd_^=f;X
z$PZeIitlV=AK!6sSbHF~pe?V`)}w`%TsZYCPPS9mtl1fT7==&eOYEOSrZNR%kRyEB
zS{H4+QYa=CisG~UENPr{5#h2=UDPUqZkwvFHaM<Qblw}qsb6a)gEjmn(SW7znpI?B
zp-DV&orlci-C81$0{b4K6K*R7uDBEN5r(60uWIuFQh-(~O7hwMp0d^=(;{VN#Um;Z
z-eRca{U-lT<9)W3#fbnkKl)nr!Pq#`*J^Lxd5+a(*n2@7Ry`fm-P0uJ&Av*@{pO57
za;j(&mcJvjb}Hg~hgE%tNSx-Pa4#dJ>E}|`wR#kKX8WXPF}5%LKd_5!yB-Yc!s$VO
zJ6udrsRyqBvuGi^sJ+L6uP}yT+5g_Gzaw^#q0~f?Sj#!QkmQz^WpVPOpc;K}&E}Lm
z8%a<#@jljE`fDI&#t~VmVqI~Ny!?P9(ROG^Ozy74llUNQJTOYR--|&GRfi8n-%mGQ
zyixUWI4Z?)tO{BAm%u@;Z)W$@%WZe0uAw0Sy$LoI8yCIYCo3EVO3x}<5yQLUkSlh@
zfre;_%8@AWLrU9mN|Eo)CcTmzv3m|dja#anidJQ#5vOzNQVXKOr`M>h(1B8kcZ+L1
zW}04_xoWta2M7kripcG>g*@TdAJ&sW)Nbk1<*@u_Hfg1Or6zymGbONIxYn?)2l8r+
zrhNp2*YkF{(-+*jdLsyc;P6rKs+X4B!{(7K=T9165F7ggqQD}M;wA-<XT{3~SKDn@
z&Em4<J-7q-=R52E+X9dBO4z=DoS4l>(}2k(5pxacIh%<bVs1=*-t7kq%d-?z9!eQP
zs~-dTv(kSt7W|E>bt~tU?o^XhP1*B8Wc1lxUtqP-L4Tp%&{1n=Zt*vLn1E<n^E~44
z{Csx8O=im?YSu<oiJ_Yda)#c82N3;mHfireG~8)-b%yOq)#sxy@f}y{<6k#1PlBM)
zOR!+87xbOLq}gw=w$tz$n&+Q^!YwSazWH~074`CJ*ZbL2xbUgCU5eNA@JmQpRedmW
z;ndVKaK5*&r<u5Vv>F&k4NKXZTHU}qH;EKAW;|tRB9scQOt*g<o=RN;I7e*%+x@z&
zcHI5O_{P9-zH9Tqx(LJ|J9mVO6qI5$pDOp&EOf1BR|NHdGE`B?e(qd;#D<)yr$&J?
z?*qfs{_AglKiS}WS@;Stg%v&-gF$}()00DeOpJL=7!e&#?s?At@?GUfT0WI|h_Jsb
ze;F5Uxq%VE|JaFN`>33s8f^MD8S;-alU9)wRdw!(<E<Ze>=K3h_gN+_Lz%qSr>*lF
zsbOJSoPIskRbG)4#8o0ozT0B^-pz0HO`tNpC%oAoSW?I7i4@hR>b=uh+%TsI0y_%C
z*Uib8-G<kyl2R9E5J5(#_j1!_EU{jjxW@W^OR;JKP*N_@&QSB9EI7F}+0)%d>a#dr
zqxY7I!2F!qH<PzIq8``oh@#!&>`u$4qS1|zLg9>USxlb9W?z{qJKup%?yH7-N`ab4
zZP6O(Yq&AMDA%OLon`D9<`G)USR(}kt)~H28r-bukq#k+YPL(TZY0%;hO?MMQ9Q4L
z<;l_NHt#%IZ8~I|Q7({02jZ0l`k0&JLHLU2fp^7xeaTGNxMTM;ymy$aC#P{SEvn_L
z%(VF5GcPvij$XY>LFp>))`lACwe4)<{@z!kqNHvA>+aZS?4LYpe@<aErp)+p7dY!~
zxfM-+y1ISc%lBOe_4JWnL|PwXRtJkxsr&721GMTR>KC7Yi#=%-Fli<A$BxvyYc;*#
z_ZqdsnRj$|6{PoIlInYWJ1f7%oO%4}mBI6HvDLQFo_N7BhB6lR8=A>g!+UG~k8K91
zsaw7``)%&@o(6!O*)nmn&@_m+#>TgV3Br(s7tR9v5)26OQhH;zdOTI;iVxF6*27Lg
z<EOFmqcPmEU~H$jv+Ek0Cn<ub<)Z!bWEM<j)`E7yd3>^+f(c5?49vQo4dbnPB~7z7
zInBUH%~1klsj$RLtQW9=VS7ygJSUfzdHE+Dh{3!R%032akbI`EaT#{W@y=J{wd@F#
zJz=ks(RrVt#PomVPdo#^%GerfPA_A4Nr=7|qLAHus~mMPTy>8_A9BeFIO18=&k<Q1
z>G4;3mIzrTCMeQvb6t8J_c(=M>Yl?bN2mq;$*4id151r+46y$};EMp7YQl#A2s$TT
zV5|UCblCMiz#r*a<_jDMjQ)@k_a-;Hf(dD@N~Mgx(K-91`6&Q)I`X2?3N`B`Dady7
z@=5B^m=sTj;7xtToQoxP6~4#Wvs!Tuho)p)CWKRy#WZU*p64&Wn3tlSB~8R1_C^V{
zYk557@T7HF!I8>j<+~pN>FM}tn7gPGYd%pqxj$&0Z69Qvy}wwpo@l5hKNDj}4z>#O
z5id6H3-*ijBw{)9RiaVcc#C3)Dn>k9Ipx#dDpJfm_$IJ96{F+W7G4aS8Zt2_@t*b%
zy$N15RtHLx!B@m=P5>Ya8Q>Pa&8+O1)~j)`5^M>OzB&E0%2=iSZpvLGK>LO25XM99
zWuh+GU#Tg$Z-HAgy`_9`S`O6>pF>@Q#8J=gVK353HE)t(aH}Wkc;2eq<M<^*tW*D%
zNNvgdz+;}1=^7>#U#CL#Nw-SRO&qIJWJ<9y+Dmx5Oo+)QHcqr}YtOAdp($sFyStQP
zxN#M>*sNxOP=mr0YF^}@H34QY?1jL*02uKy5fPP`Omo7jYK$KO7^J|Udv^Lg^70DZ
zLzBI}bw2Q~iBGU==8+@wewigir*@E3H5CqIg9PMU-Z8*r32krf=X1T!Z62OSw~MHF
zrT97p!dsANI2y%|vzt61>mC1_46<GFAxUd@nTYR=-rkd>wGf8r3N|M8x4PN6skWH9
zxnM(?QYJXp()>UN8O^WVBsQ&X;qbiQE@$OQ(cbi4+qM0+ZFw>wGm_+kQOdZ(V4c|n
zTy~!oBB99~R!$i2YuJh;R{|v*vZ*>M>S~ofKV)2*yW`S$xSU@v_U59<?gC@*?ISH~
z<*ROXqc`@C5-MyoZA5#n-z6&0sM1|OJ#kr1xsTb{UHW+|tM|NhSxB@3Z;uANHT3<@
z%b9g?J3-N0`ZeK7PeG;L*T5Fe{Y7hXt*d%io}w;^l6eSXF?>-cH(?xWyH?li<UD32
z<=#|M=cUvct(&?2mlz()flFJ)-f2rCltWu@V)8RD=@&eSH*aOp2Gt*-kdyc+2IalU
zVV8GNf2t5r7gtEvxzq_3<P2{=&dYmFokd*tI>-+})i5(;I-8rbKn2YYsM=6DCqJ>1
z4uc4F=A?}b{8uD)ooruy`Xo3wG|#9=%6S<R9W~As2+=%03tSb#2nx&(%w44-U6sIx
zO8U`1AqDHnKmWZM@!c|QsyG8G&-52D3ElYC+H%ir=>yO<=3+wONDRLX*}DrSw<!M}
zf3I;~o}2X|O((>*m-|u0eK%tIehA=z4O@d~NrXa4u$G8+ysiw}`kdY{<+qrDrLV5_
zTefoI5veg%7Z*Y6L>|-GS}2rI@Qm5*n*;p3Q*C1nd>NFKh(FI3_Z8(XwoVrRZcxlE
z|6?%%ULx1E+TJIMxI1+%L_hbAnqTo*G?VBmwS$)oc%mQ|)Ft~P+f#OI{8x<@fq>nC
zP@AS2Z7*^Y)Pzu*g(~@`3;6H3eH`I^tiLjYIzsNrNP#aQhL;|jLI%10B`8$Xiz@W^
zqQEG`79a#jKAOMdDp~yc8XsxX!UgtB5*%POLWQlRLm!m(>6sS9l1htFECh%fLCRw`
zBd=+vFxm@s!|t)sRVPq_!8X%IU@&tLik_gpgVp*!^+wp0Pr*m1=u}%nBmdm`h-W`g
z<DN&Lljq*JZvVpgY=a)EUQj`yR-Q>;%_+h$(Wl)NTe{(`Bm>JLeBsFxkVU0zyVaXg
z${sMht$u8MG%I5Yh1Rc~wmH4yXMYkaQLZIZJx12KLqZOB0F{d2l~CMYUG_d+Gw0G?
zLV?rQPz@s@#aNptk62Wea~<4Tdmv5o&&b+Ny(8+iOmk4AKN6{wvu;d2{1CkwRd_X%
z<yNF1edQptPoy{|@o;0-&7GIOAoCis@>Z0q^O-A8^ltz6Y?IIuX?Rr&W=gr36C1#X
z$H8a>52oh#8w>IS$bU^V%v|~)ADH`Q6l1Oy*560h%>||mWBN<GOvpw1cT}nDu))WQ
zB8nlQi!qG6JSs<z0TLF64wy_j(&0DnUh%S~qebw1j8r2U9(U3ZgEctV!$L(Nh!zr6
z*cm~tdXw-?w`WVK#e~GiXB|pj!`-@()t<k-^|z5s<FGPksem^|{h{(Ze0Uc&rYTni
zW;0Tu%HX~b#-7g43SaZly~nN=Q4VJxZ56kAMrm>{Q9NF1kcnwwa5|c82}my}ke^h!
z2%eL$*xw_Ogm4Ig(vYXTbC+<e{X+98U&o-t9?XZBL?k4PMP4~m{CoWyvs7Xf^)_4G
z?k^!z)Lo*dctn05{wT=YyR2eIG`U46!*%1L*ybGkK)45XTX2V%gg942nxqb2S;?I%
z1K%<y(JM<`89mUcx`96Xp3mv5f{3DbV~?i}vR3KN2ZRD1UANuX$;gtCmYhM^Xp=1k
zy-b(PigKrG`q}Ac%=rFz<o)_y6m-*I&W^v=gcb>gDwABi7ipdXHFzDY>#@+jyP$2N
z8JeBqiB)vRTfuqSd@_>Rhqwi%O^B5^18$Va{M$~06pOG}i$pL*+pVsrm%&+IQHCWX
zZpf@{XQdt4OMu30+(kcO6I=h;Cjfs<t~xKSIft$YKT8tDyW)^}wQ0T1n8C@(N<1s<
z?%`P;(L?@pcBQA&HH&(9lpb}5E*;)gx~+k9Lmi13_yR+2=MW#t%CG}!FC*h<9=K2J
zm95gex3i00v#s(zCHJRlc_DqknQ(%5fs6^aRd?J;90aA-qVZBdX(?brQYjv9*ix*%
zF7j%7Ou0g|dG_$A++r%EYPdk@nm6<hXaU&@k7**PwIFSDcXw|ep33=P$nel1mI*dj
zO+Ob-L-Pg#W_t-5UninbY1Ioof~eMvBS*<kHk((o>4*Y%$eIWoJ^i|klVfZrV~)l{
z8ibj5%Xa*Z{M`pTwo?wYCXlcj24XcaV&iZt*7oPdJ4~_>J}!FP$_{!5z1eq`I&Qnx
zIaHpX+}@9zATX5C-0?yUwCL%0C4Q_zs22zS>FR8V4st{>ox^-NDrC>qWm+52H_?YN
z{`o5-N~oS;lOb|R&bGFc3?jYB7wEozj4z(z@lvPyH0(R$>{=hg(n*MD0TyCoC&E3{
zi@enU%$Y#1;|GZYt6J$<5baRHb_{XzupnH$w=GzGiotd{H(T!urys}Gp#G*Tilur#
zXuzh+v_qfOoz_1|fPTp<#cCon0KJZ{>1obJbu?s`u5+%_Pph`^-auD0`Rc&tkP0Hr
zyc?uLn9c=<J4<)fgcLW$oKL7P8*^Np4U%PVHXQhkQ)86r<69Nz@cZaG|HPh_=GKXQ
zv6(B2D3SAvS-SOQD-Aw_o2Mver{mM9n}t>a*LsgnDbw>GZ_#7TMCrz}CGBojF8=I>
zb_T)M*w6XAKWBw-u~Z|Bop?L-x%-cFWCcBV)*2S!usK3WPOD2E*0uI}!<?(ZFu<VE
zd$1VEG3N9oM|g=$)3rt7mDqGJYWfgQqtc=eFS1mX)n)(BWzY8tmyRtiM+)Z&uerSU
zZeJo_-Cpy@NAzoK2uw1s1$G7CwPBEY?|_Z-327s|649I^P)=A!rI(bHs0_=;e;-hR
zmGNE4?+fL8)-6$MoYO4|tQ}W}Lp$bzdz$TWb^7rtK7r&zf0oWXq>zv1ZYs|c1GOJ=
zbpTNQ6aU_%k?~t<M`TnlMC&u_w??BBeh%*IJ!~GA=&7x~B%6ZfXYfFgMB(4Pj+m*d
zAjMo@SL~GL9Kl_6HnECdXFZCy><0&vKe~Txds&4aUn*;7{Vf39TuC|3J!Xtb7I~;5
z1UmWLW?pcae_(H4cBl%zx2D^mJrpT=V)f+*$Yy<v%se<@A*S_qb-87B%8MnNc#7Ro
zP+@l9%r1ssF?|;YW<}Bu+x_eWdp^2QGVA`)=^l@q88L0nPh)C|zLpkYS3WG5Zi}20
zu)J(w4=qfcvE&NIX0hGd=jN<FX3})B3{@Ac_CUak_NqJ#^COs6%x6*`i?(t%Hi#Bj
z3hk}R2=+^K_Gf<`i854+JbfPj;|^w@Qe-sd@pId)SvWFxvkj(HCj>lLZ)>X^4%<}O
zy~1v+C%@8VjJcu3CeE|BwseBu+?tr#iDjn8|41-hx<AQG=OauB!tT<2gA-*<v2F3=
z!%7v!qa;N#Pj#HI$J}q6ojfdeWN+Rr--A9W<dIA2jgI+!LkvJV7|#HJ$y|PgCWw~G
zZHv@{zB088`9l5Z@*V1&gR%X3*{oSZ^fvi0vR}54<M3%l;k04w?BibH98s$8*7qim
z)|aWCQ4=ikBT_uZe52RVWgV9138*MgnqsnIyPGeuk4p57r9~8edm%P<o3WRX(kj=&
zu5&!`^w_s?L;II7Bzrn_?zl$ozzl;fF(r$+j`gBFJEkhShvoU;WO;XCcI{<AZ8qvW
zEqSf^4|E!fA9GBsHYff-mM1hxK#>EPO3~cX5(hCjC}F+E$ICnK!I0WFMU>cgxY<Fz
zT4szsts6EUU9p}yr2aH8<^-^fw$3NeJL(&&KEi;pMTJfS<Gd`U{<mlte)RWa%}8%(
zmm0{V;7&~&n#+$`av6N#G&5>*_`Up=$1sqaRfLKFB%e_ADe~k0R!ra8`i}-doCeXy
z>XsqEtY2d4Z|DXX;lJ42eR%6#0iab(ha?V+$1zS}e{4DrGGYb7(#c~?K_hIZ5Hs)V
zW%zvSxytkBsij<Hzuntr(%CdNy4jsT`8L&ajB;H7!UgnM888&*X03qJ`{w54e8*qL
zYcZ(6Q~8*ylx-HXzs%B;SIB3Wf?`Y*`)o~BFH~TOS{)BIja?4s2~5)M+!lu27q}m-
zoPHpWQHO`uJh$D0ZVe<3^>V#s+?&kq?itb=A6{}Gr`Lxk3#}wFhB37Xs+6kFVClH9
zedZ0r4}!B;lGFX1U5qK7{y5D$DyHXs<Wsz@94cg>pu|w$UTUG)-Tt>=coGveS?nA}
zDNJS@N)g{Ds^M(-LrNit02qtrkErz?8y@`LUh7sM<6~WkcG!|3<T~O5(}^WN;Q-9{
z>r_dpVhV!vb?3~J7gfiLQu|Y*$o?@W*dvkI?v3uBAh)5tgt01C-@8ZPpv0IiZUUot
zHzHgV+MP6Qd-R2m${Wsqa(wHYT}2q6hYX{1EgEKKeDC94c<iM<-cUPBTuvdF!5p5X
zo6CCdy=ky0d4jHtv_F{PHTM|)fy;UM<&GkP+r#<ipEPTNe*h2@PBr@fJegO712A~-
z3@_4TN&OuW(NuPY&rjkU-8_QnQ&Tr{%PUVx%dlfL-Qw5myYOWHG>agBTeH>F{E+$g
zSkAj;3gNLRqe`B{<W>T$qT&2;LD+++pM(@5z`o|IBG6<a)0Z|S;F*cb^?V#an)+v!
zfIeK$&M1irjnpW_F~1T0qcX|8v0aU>w?jWX!`V-Xpm!VbePzDy5X%3r-hO;9u;rp@
zzc5!`snuvCv}ai9U-~fb!{l)G=M~@%z3D6%-{4#uEfGog-1FXz?i*4_<a!3436578
z6iGHmjZ<bfW?MjS4US)!Kj5DY&6^~z{51*py$N}wqf2TjRy|kbQ_~Vm%@C|%o}pNr
z;L&i_Y#P5e?Mn5np$(!1Q8v6fmX>T_YM>yvL|=!Ra<ASSGY5;5v1KTKM388`)@D{R
zlE3``?*w`*$UU0hfAnOXW%3T@A^R&C-Q&VbKR5e4Um2z$b+DG=6mk<T4%Hlt^NMmp
zxX3xeRTg8itd5f)e0W^IMy_4Ot=nH*F0TxoD*o+iYhp+`mn`DWVf2-(;s!mR{6{Wa
zZoUuLPip0fjGx|)SXF^_6rL=0xCm)@k#=OCsGouW&HC=)N{Y@GlOuzcv--wjx6i1@
z7{WCjFV#Iww#dqu;v{*5SjRn?_&X}z;BUTod$G&diqK#8k=2fLmWiA2-ws~?+XDaJ
zt-b#DFPMxY9oj>01Yk<crKJnZ#qfc=&$DpbaeXUD4Cpa9IMXLZ&AXKVPWzpU&#PoW
zw-m#wrnh5$6I_$afmRf9ga1yxC%2WWvy)R^V{aTMHpc2+Ru&hmEkYVD!x?#g6lw*7
zFgp!QFA)GtaV<ObFkw#p0(vz8lr90}709*z`nIQ5e^9tULtg>QOgqjB;Oz$G_w3u*
z)(;Qez9#Y$Kvy7f_&DN8KDaWv>2^lkf=&z2-s!P>3YDKTC;`Xn&VM*{?s<O~IDd-R
z0+^{?QCW$(G#vW08%hWR`Z_M*%^FL?%D#Mr_%p<e3K?re;un@O!RWY^*B82ff$$Qk
zS4btl0Xzzf!K6?h3)q-LDhNcoZ!$TxufPzJk>YZCZZ31-QATE_WEHz6_p@TS25=;7
z4PgU~X)g8tu6;4+HPdydX<qFx1GEcL<TC*O2f$aU*u=Y8I4&5IHLs<toK;z_pN__e
z!c<fOFiasZ$b8tS47a2N2=FU^fB)*ztgNhzNk(iRlVNEllBc!zVT4|S2`0!w+gDY=
zK%{T-3D{hZs+AgCgo5bij2}UsA>54F`$`GgcVEE!vKUPEtD8RZAYU%Jw2m0TOnG~Z
zad6+7?>>V7H+NeUIlzX44=ZkgqlKW55T+p%^sA8NZpnr{IM~9&bAH&!05`P_1IuAG
zAt3K}7dlKpCn%SV=JU`OixhwA9*;DbiekD>@HoNv$Vr&K;OF6Kbxvfu@lmT*xn6N}
zfb0R>=Rp-b7_ggHC5}b<@)L$`K)8bQ4`MOCBE)w5+jjw;VuH)Zi(tkn{xv-2KHb0D
z%zQ_mH}A7%D8rr(hX#AM93)j840Ig~1@-L=;U5GS2Nw?;hX5NF|1(a0K~7FV4n7tR
z4nYo%$(mp6|Kk8lYXegw*Z=>3k8|3Xc79C%zJi0PrJ=opuBFZYf6URQw;UWpk2%K1
zz(`Qq^r@k}xV58|fvJ_TxU~y{n~(nvmkTeaz#SeI4i*mFf-j%o^ThvrUfIyj-qhL(
c@kHVw8|NKfB|i0L_%%XCQbD3n{Dtp-0T&4G8~^|S

literal 0
HcmV?d00001


From fa6ddb1c94e8a87151615089e212cf11294531d2 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Fri, 17 Jan 2025 13:48:38 -0600
Subject: [PATCH 206/230] Create a new type to represent IDs for all kernels,
 blocksizes, etc. (#841)

Details:
- Currently, all enums used to represent built-in kernel IDs, blocksizes, preferences, and operation IDs have a special member equal to `BLIS_VA_END`, which in turn is `(siz_t)-1`. In principle, this would force the underlying type used to represent the enum values to be as wide as `siz_t`, particularly when passed to the variadic function `bli_cntx_set_ukrs` and friends. User-registered kernel IDs and such are of type `siz_t` explicitly. However, gcc (12 and older), clang, and icx pass literal enum constants (e.g. `BLIS_MR`) that are small enough as `int` when 32-bit mode is used (`-m32`). This causes a misalignment of the parameters on the stack and ultimately a segfault. The problem also exists in 64-bit mode with clang and icx and on aarch64 with clang, as parameters far enough down the list to go on the stack do not get the upper 4 bytes initialized.
- This commit introduces a new type `kerid_t` which is always `uint32_t`. This type is used for all kernel, blocksize, preference, and operation IDs (including user-registered ones). It is also used for `BLIS_VA_END`.
- Now all enum values are always passed as 32-bit ints on all architectures.
- Fixes #839.
---
 build/plugin/bli_plugin.h.in       |  6 +--
 build/plugin/bli_plugin_init_ref.c |  2 +-
 docs/BLISObjectAPI.md              |  1 +
 docs/PluginHowTo.md                | 74 ++++++++++++-------------
 frame/base/bli_cntx.c              | 87 ++++++++++++++++--------------
 frame/base/bli_cntx.h              | 58 ++++++++++----------
 frame/base/bli_gks.c               | 24 ++++-----
 frame/base/bli_gks.h               |  8 +--
 frame/include/bli_type_defs.h      | 16 +++---
 9 files changed, 145 insertions(+), 131 deletions(-)

diff --git a/build/plugin/bli_plugin.h.in b/build/plugin/bli_plugin.h.in
index 9e0495c18..ad87e8c73 100644
--- a/build/plugin/bli_plugin.h.in
+++ b/build/plugin/bli_plugin.h.in
@@ -41,9 +41,9 @@
 
 #define plugin_@plugin_name@_params \
 \
-       siz_t* bszids, /* <----- Example arguments       */ \
-       siz_t* kerids, /* <----- for plugin registration */ \
-       siz_t* prefids /* <----- and initialization.     */
+       kerid_t* bszids, /* <----- Example arguments       */ \
+       kerid_t* kerids, /* <----- for plugin registration */ \
+       kerid_t* prefids /* <----- and initialization.     */
 
 #define plugin_@plugin_name@_params_only \
 \
diff --git a/build/plugin/bli_plugin_init_ref.c b/build/plugin/bli_plugin_init_ref.c
index 0dec70b01..48767e8a0 100644
--- a/build/plugin/bli_plugin_init_ref.c
+++ b/build/plugin/bli_plugin_init_ref.c
@@ -69,7 +69,7 @@ void PASTEMAC(plugin_init_@plugin_name@,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX)
 	// ------------------------------------------------------------------------>
 
 	blksz_t blkszs[ MY_NUM_BLOCK_SIZES ];
-	siz_t   bmults[ MY_NUM_BLOCK_SIZES ];
+	kerid_t bmults[ MY_NUM_BLOCK_SIZES ];
 	func_t  funcs[ MY_NUM_KERNELS ];
 	mbool_t mbools[ MY_NUM_KERNEL_PREFS ];
 
diff --git a/docs/BLISObjectAPI.md b/docs/BLISObjectAPI.md
index 877cdb91d..01cd1ba37 100644
--- a/docs/BLISObjectAPI.md
+++ b/docs/BLISObjectAPI.md
@@ -82,6 +82,7 @@ The following tables list various types used throughout the BLIS object API.
 | `inc_t`           | `gint_t`                 | matrix row/column strides and vector increments.                     |
 | `doff_t`          | `gint_t`                 | matrix diagonal offset: if _k_ < 0, diagonal begins at element (-_k_,0); otherwise diagonal begins at element (0,_k_). |
 | `siz_t`           | `guint_t`                | a byte size or byte offset.                                          |
+| `kerid_t`         | `uint32_t`               | a kernel, block size, operation, or kernel preference ID.        |
 
 ### Floating-point types
 
diff --git a/docs/PluginHowTo.md b/docs/PluginHowTo.md
index e9f6b93fd..0dd340dcc 100644
--- a/docs/PluginHowTo.md
+++ b/docs/PluginHowTo.md
@@ -722,7 +722,7 @@ void dsyrkd
 ## Registration
 
 ```C++
-err_t bli_gks_register_ukr( siz_t* ukr_id );
+err_t bli_gks_register_ukr( kerid_t* ukr_id );
 ```
 
 Register a new microkernel, which may have a different implementation for each supported data type.
@@ -733,7 +733,7 @@ Register a new microkernel, which may have a different implementation for each s
 </table>
 
 ```C++
-err_t bli_gks_register_ukr2( siz_t* ukr_id );
+err_t bli_gks_register_ukr2( kerid_t* ukr_id );
 ```
 
 Register a new microkernel, which may have a different implementation for each *pair* of supported data types.
@@ -744,7 +744,7 @@ Register a new microkernel, which may have a different implementation for each *
 </table>
 
 ```C++
-err_t bli_gks_register_blksz( siz_t* bs_id );
+err_t bli_gks_register_blksz( kerid_t* bs_id );
 ```
 
 Register a new blocksize, which may have a different integral value for each supported data type.
@@ -755,7 +755,7 @@ Register a new blocksize, which may have a different integral value for each sup
 </table>
 
 ```C++
-err_t bli_gks_register_ukr_pref( siz_t* ukr_pref_id );
+err_t bli_gks_register_ukr_pref( kerid_t* ukr_pref_id );
 ```
 
 Register a new microkernel preference, which may have a different logical value for each supported data type.
@@ -976,73 +976,73 @@ void bli_mbool_free( mbool_t* b );
 ## Context Initialization
 
 ```C++
-err_t bli_cntx_set_ukr( siz_t ukr_id, const func_t* func, cntx_t* cntx );
+err_t bli_cntx_set_ukr( kerid_t ukr_id, const func_t* func, cntx_t* cntx );
 ```
 
 ```C++
-void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, siz_t ukr_id, const func_t* func, cntx_t* cntx );
+void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, kerid_t ukr_id, const func_t* func, cntx_t* cntx );
 ```
 
 ```C++
-err_t bli_cntx_set_ukr2( siz_t ukr_id, const func2_t* func, cntx_t* cntx );
+err_t bli_cntx_set_ukr2( kerid_t ukr_id, const func2_t* func, cntx_t* cntx );
 ```
 
 ```C++
-void bli_cntx_set_ukr2_dt( void_fp fp, num_t dt1, num_t dt2, siz_t ukr_id, const func_t* func, cntx_t* cntx );
+void bli_cntx_set_ukr2_dt( void_fp fp, num_t dt1, num_t dt2, kerid_t ukr_id, const func_t* func, cntx_t* cntx );
 ```
 
 ```C++
-err_t bli_cntx_set_blksz( siz_t bs_id, const blksz_t* blksz, siz_t mult_id, cntx_t* cntx );
+err_t bli_cntx_set_blksz( kerid_t bs_id, const blksz_t* blksz, kerid_t mult_id, cntx_t* cntx );
 ```
 
 ```C++
-void bli_cntx_set_blksz_def_dt( num_t dt, siz_t bs_id, dim_t bs, cntx_t* cntx );
+void bli_cntx_set_blksz_def_dt( num_t dt, kerid_t bs_id, dim_t bs, cntx_t* cntx );
 ```
 
 ```C++
-void bli_cntx_set_blksz_max_dt( num_t dt, siz_t bs_id, dim_t bs, cntx_t* cntx );
+void bli_cntx_set_blksz_max_dt( num_t dt, kerid_t bs_id, dim_t bs, cntx_t* cntx );
 ```
 
 ```C++
-err_t bli_cntx_set_ukr_pref( siz_t ukr_pref_id, const mbool_t* prefs, cntx_t* cntx );
+err_t bli_cntx_set_ukr_pref( kerid_t ukr_pref_id, const mbool_t* prefs, cntx_t* cntx );
 ```
 
 ```C++
-err_t bli_cntx_set_ukr_pref_dt( bool pref, num_t dt, siz_t ukr_pref_id, cntx_t* cntx );
+err_t bli_cntx_set_ukr_pref_dt( bool pref, num_t dt, kerid_t ukr_pref_id, cntx_t* cntx );
 ```
 
 ```C++
 void bli_cntx_set_ukrs( cntx_t* cntx,
-                        siz_t ukr0_id, num_t dt0, void_fp ukr0_fp,
-                        siz_t ukr1_id, num_t dt1, void_fp ukr1_fp,
-                        siz_t ukr2_id, num_t dt2, void_fp ukr2_fp,
+                        kerid_t ukr0_id, num_t dt0, void_fp ukr0_fp,
+                        kerid_t ukr1_id, num_t dt1, void_fp ukr1_fp,
+                        kerid_t ukr2_id, num_t dt2, void_fp ukr2_fp,
                         ...,
                         BLIS_VA_END );
 ```
 
 ```C++
 void bli_cntx_set_ukr2s( cntx_t* cntx,
-                         siz_t ukr0_id, num_t dt1_0, num_t dt2_0, void_fp ukr0_fp,
-                         siz_t ukr1_id, num_t dt1_1, num_t dt2_1, void_fp ukr1_fp,
-                         siz_t ukr2_id, num_t dt1_2, num_t dt2_2, void_fp ukr2_fp,
+                         kerid_t ukr0_id, num_t dt1_0, num_t dt2_0, void_fp ukr0_fp,
+                         kerid_t ukr1_id, num_t dt1_1, num_t dt2_1, void_fp ukr1_fp,
+                         kerid_t ukr2_id, num_t dt1_2, num_t dt2_2, void_fp ukr2_fp,
                          ...,
                          BLIS_VA_END );
 ```
 
 ```C++
 void bli_cntx_set_blksz( cntx_t* cntx,
-                         siz_t bs0_id, const blksz_t* blksz0, siz_t bm0_id,
-                         siz_t bs1_id, const blksz_t* blksz1, siz_t bm1_id,
-                         siz_t bs2_id, const blksz_t* blksz2, siz_t bm2_id,
+                         kerid_t bs0_id, const blksz_t* blksz0, kerid_t bm0_id,
+                         kerid_t bs1_id, const blksz_t* blksz1, kerid_t bm1_id,
+                         kerid_t bs2_id, const blksz_t* blksz2, kerid_t bm2_id,
                          ...,
                          BLIS_VA_END );
 ```
 
 ```C++
 void bli_cntx_set_ukr_prefs( cntx_t* cntx,
-                             siz_t ukr_pref0_id, num_t dt0, bool ukr_pref0,
-                             siz_t ukr_pref1_id, num_t dt1, bool ukr_pref1,
-                             siz_t ukr_pref2_id, num_t dt2, bool ukr_pref2,
+                             kerid_t ukr_pref0_id, num_t dt0, bool ukr_pref0,
+                             kerid_t ukr_pref1_id, num_t dt1, bool ukr_pref1,
+                             kerid_t ukr_pref2_id, num_t dt2, bool ukr_pref2,
                              ...,
                              BLIS_VA_END );
 ```
@@ -1058,51 +1058,51 @@ const cntx_t* bli_gks_lookup_id( arch_t id );
 ```
 
 ```C++
-const func_t* bli_cntx_get_ukrs( siz_t ukr_id, const cntx_t* cntx );
+const func_t* bli_cntx_get_ukrs( kerid_t ukr_id, const cntx_t* cntx );
 ```
 
 ```C++
-void_fp bli_cntx_get_ukr_dt( num_t dt, siz_t ukr_id, const cntx_t* cntx );
+void_fp bli_cntx_get_ukr_dt( num_t dt, kerid_t ukr_id, const cntx_t* cntx );
 ```
 
 ```C++
-const func2_t* bli_cntx_get_ukr2s( siz_t ukr_id, const cntx_t* cntx );
+const func2_t* bli_cntx_get_ukr2s( kerid_t ukr_id, const cntx_t* cntx );
 ```
 
 ```C++
-void_fp bli_cntx_get_ukr2_dt( num_t dt1, num_t dt2, siz_t ukr_id, const cntx_t* cntx );
+void_fp bli_cntx_get_ukr2_dt( num_t dt1, num_t dt2, kerid_t ukr_id, const cntx_t* cntx );
 ```
 
 ```C++
-const blksz_t* bli_cntx_get_blksz( siz_t bs_id, const cntx_t* cntx );
+const blksz_t* bli_cntx_get_blksz( kerid_t bs_id, const cntx_t* cntx );
 ```
 
 ```C++
-dim_t bli_cntx_get_blksz_def_dt( num_t dt, siz_t bs_id, const cntx_t* cntx );
+dim_t bli_cntx_get_blksz_def_dt( num_t dt, kerid_t bs_id, const cntx_t* cntx );
 ```
 
 ```C++
-dim_t bli_cntx_get_blksz_max_dt( num_t dt, siz_t bs_id, const cntx_t* cntx );
+dim_t bli_cntx_get_blksz_max_dt( num_t dt, kerid_t bs_id, const cntx_t* cntx );
 ```
 
 ```C++
-siz_t bli_cntx_get_bmult_id( siz_t bs_id, const cntx_t* cntx );
+kerid_t bli_cntx_get_bmult_id( kerid_t bs_id, const cntx_t* cntx );
 ```
 
 ```C++
-const blksz_t* bli_cntx_get_bmult( siz_t bs_id, const cntx_t* cntx );
+const blksz_t* bli_cntx_get_bmult( kerid_t bs_id, const cntx_t* cntx );
 ```
 
 ```C++
-dim_t bli_cntx_get_bmult_dt( num_t dt, siz_t bs_id, const cntx_t* cntx );
+dim_t bli_cntx_get_bmult_dt( num_t dt, kerid_t bs_id, const cntx_t* cntx );
 ```
 
 ```C++
-const mbool_t* bli_cntx_get_ukr_prefs( siz_t ukr_pref_id, const cntx_t* cntx );
+const mbool_t* bli_cntx_get_ukr_prefs( kerid_t ukr_pref_id, const cntx_t* cntx );
 ```
 
 ```C++
-bool bli_cntx_get_ukr_prefs_dt( num_t dt, siz_t ukr_pref_id, const cntx_t* cntx );
+bool bli_cntx_get_ukr_prefs_dt( num_t dt, kerid_t ukr_pref_id, const cntx_t* cntx );
 ```
 
 ## Control tree modification
diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c
index fd0799a50..dde22c72b 100644
--- a/frame/base/bli_cntx.c
+++ b/frame/base/bli_cntx.c
@@ -46,7 +46,7 @@ BLIS_EXPORT_BLIS err_t bli_cntx_init( cntx_t* cntx )
 	if ( error != BLIS_SUCCESS )
 		return error;
 
-	error = bli_stack_init( sizeof( siz_t ), 32, 32, BLIS_NUM_BLKSZS, &cntx->bmults );
+	error = bli_stack_init( sizeof( kerid_t ), 32, 32, BLIS_NUM_BLKSZS, &cntx->bmults );
 	if ( error != BLIS_SUCCESS )
 		return error;
 
@@ -118,9 +118,9 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... )
 	   void bli_cntx_set_blkszs
 	   (
 	     cntx_t* cntx,
-	     siz_t bs0_id, blksz_t* blksz0, siz_t bm0_id,
-	     siz_t bs1_id, blksz_t* blksz1, siz_t bm1_id,
-	     siz_t bs2_id, blksz_t* blksz2, siz_t bm2_id,
+	     kerid_t bs0_id, blksz_t* blksz0, kerid_t bm0_id,
+	     kerid_t bs1_id, blksz_t* blksz1, kerid_t bm1_id,
+	     kerid_t bs2_id, blksz_t* blksz2, kerid_t bm2_id,
 	     ...,
 	     BLIS_VA_END
 	   );
@@ -133,18 +133,18 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... )
 	// Process blocksizes until we get a BLIS_VA_END.
 	while ( true )
 	{
-		int bs_id = va_arg( args, siz_t );
+		kerid_t bs_id = ( kerid_t )va_arg( args, kerid_t );
 
-		// If we find a siz_t id of BLIS_VA_END, then we are done.
+		// If we find a block size id of BLIS_VA_END, then we are done.
 		if ( bs_id == BLIS_VA_END ) break;
 
 		// Here, we query the variable argument list for:
-		// - the siz_t of the blocksize we're about to process (already done),
+		// - the kerid_t of the blocksize we're about to process (already done),
 		// - the address of the blksz_t object,
-		// - the siz_t of the multiple we need to associate with
+		// - the kerid_t of the multiple we need to associate with
 		//   the blksz_t object.
 		blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* );
-		siz_t    bm_id = ( siz_t    )va_arg( args, siz_t    );
+		kerid_t  bm_id = ( kerid_t  )va_arg( args, kerid_t );
 
 		// Copy the blksz_t object contents into the appropriate
 		// location within the context's blksz_t array. Do the same
@@ -171,9 +171,9 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... )
 	   void bli_cntx_set_ukrs
 	   (
 	     cntx_t* cntx,
-	     siz_t ukr0_id, num_t dt0, void_fp ukr0_fp,
-	     siz_t ukr1_id, num_t dt1, void_fp ukr1_fp,
-	     siz_t ukr2_id, num_t dt2, void_fp ukr2_fp,
+	     kerid_t ukr0_id, num_t dt0, void_fp ukr0_fp,
+	     kerid_t ukr1_id, num_t dt1, void_fp ukr1_fp,
+	     kerid_t ukr2_id, num_t dt2, void_fp ukr2_fp,
 	     ...,
 	     BLIS_VA_END
 	   );
@@ -186,17 +186,17 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... )
 	// Process ukernels until BLIS_VA_END is reached.
 	while ( true )
 	{
-		const int ukr_id = va_arg( args, siz_t );
+		kerid_t ukr_id = ( kerid_t )va_arg( args, kerid_t );
 
 		// If we find a ukernel id of BLIS_VA_END, then we are done.
 		if ( ukr_id == BLIS_VA_END ) break;
 
 		// Here, we query the variable argument list for:
-		// - the siz_t of the kernel we're about to process (already done),
+		// - the kerid_t of the kernel we're about to process (already done),
 		// - the datatype of the kernel, and
 		// - the kernel function pointer
-		const num_t   ukr_dt = ( num_t   )va_arg( args, num_t   );
-		      void_fp ukr_fp = ( void_fp )va_arg( args, void_fp );
+		num_t   ukr_dt = ( num_t   )va_arg( args, num_t   );
+		void_fp ukr_fp = ( void_fp )va_arg( args, void_fp );
 
 		// Store the ukernel function pointer into the context.
 		bli_cntx_set_ukr_dt( ukr_fp, ukr_dt, ukr_id, cntx );
@@ -221,9 +221,9 @@ void bli_cntx_set_ukr2s( cntx_t* cntx , ... )
 	   void bli_cntx_set_ukr2s
 	   (
 	     cntx_t* cntx,
-	     siz_t ukr0_id, num_t dt1_0, num_t dt2_0, void_fp ukr0_fp,
-	     siz_t ukr1_id, num_t dt1_1, num_t dt2_1, void_fp ukr1_fp,
-	     siz_t ukr2_id, num_t dt1_2, num_t dt2_2, void_fp ukr2_fp,
+	     kerid_t ukr0_id, num_t dt1_0, num_t dt2_0, void_fp ukr0_fp,
+	     kerid_t ukr1_id, num_t dt1_1, num_t dt2_1, void_fp ukr1_fp,
+	     kerid_t ukr2_id, num_t dt1_2, num_t dt2_2, void_fp ukr2_fp,
 	     ...,
 	     BLIS_VA_END
 	   );
@@ -236,18 +236,18 @@ void bli_cntx_set_ukr2s( cntx_t* cntx , ... )
 	// Process ukernels until BLIS_VA_END is reached.
 	while ( true )
 	{
-		const int ukr_id = va_arg( args, siz_t );
+		kerid_t ukr_id = ( kerid_t )va_arg( args, kerid_t );
 
 		// If we find a ukernel id of BLIS_VA_END, then we are done.
 		if ( ukr_id == BLIS_VA_END ) break;
 
 		// Here, we query the variable argument list for:
-		// - the siz_t of the kernel we're about to process (already done),
+		// - the kerid_t of the kernel we're about to process (already done),
 		// - the datatype of the kernel, and
 		// - the kernel function pointer
-		const num_t   ukr_dt1 = ( num_t   )va_arg( args, num_t   );
-		const num_t   ukr_dt2 = ( num_t   )va_arg( args, num_t   );
-		      void_fp ukr_fp  = ( void_fp )va_arg( args, void_fp );
+		num_t   ukr_dt1 = ( num_t   )va_arg( args, num_t   );
+		num_t   ukr_dt2 = ( num_t   )va_arg( args, num_t   );
+		void_fp ukr_fp  = ( void_fp )va_arg( args, void_fp );
 
 		// Store the ukernel function pointer into the context.
 		bli_cntx_set_ukr2_dt( ukr_fp, ukr_dt1, ukr_dt2, ukr_id, cntx );
@@ -272,9 +272,9 @@ void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... )
 	   void bli_cntx_set_ukr_prefs
 	   (
 	     cntx_t* cntx,
-	     siz_t ukr_pref0_id, num_t dt0, bool ukr_pref0,
-	     siz_t ukr_pref1_id, num_t dt1, bool ukr_pref1,
-	     siz_t ukr_pref2_id, num_t dt2, bool ukr_pref2,
+	     kerid_t ukr_pref0_id, num_t dt0, bool ukr_pref0,
+	     kerid_t ukr_pref1_id, num_t dt1, bool ukr_pref1,
+	     kerid_t ukr_pref2_id, num_t dt2, bool ukr_pref2,
 	     ...,
 	     BLIS_VA_END
 	   );
@@ -287,17 +287,17 @@ void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... )
 	// Process ukernel preferences until BLIS_VA_END is reached.
 	while ( true )
 	{
-		const int ukr_pref_id = va_arg( args, siz_t );
+		kerid_t ukr_pref_id = ( kerid_t )va_arg( args, kerid_t );
 
 		// If we find a ukernel pref id of BLIS_VA_END, then we are done.
 		if ( ukr_pref_id == BLIS_VA_END ) break;
 
 		// Here, we query the variable argument list for:
-		// - the siz_t of the kernel we're about to process (already done),
+		// - the kerid_t of the kernel we're about to process (already done),
 		// - the datatype of the kernel, and
 		// - the kernel function pointer
-		const num_t ukr_pref_dt = ( num_t )va_arg( args, num_t );
-		const bool  ukr_pref    = ( bool  )va_arg( args, int );
+		num_t ukr_pref_dt = ( num_t )va_arg( args, num_t );
+		bool  ukr_pref    = ( bool  )va_arg( args, int );
 
 		// Store the ukernel preference value into the context.
 		bli_cntx_set_ukr_pref_dt( ukr_pref, ukr_pref_dt, ukr_pref_id, cntx );
@@ -337,7 +337,7 @@ void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... )
 	// Process sup handlers until BLIS_VA_END is reached.
 	while ( true )
 	{
-		const opid_t op_id = va_arg( args, siz_t );
+		kerid_t op_id = ( kerid_t )va_arg( args, kerid_t );
 
 		// If we find an operation id of BLIS_VA_END, then we are done.
 		if ( op_id == BLIS_VA_END ) break;
@@ -363,7 +363,7 @@ void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... )
 
 // -----------------------------------------------------------------------------
 
-err_t bli_cntx_register_blksz( siz_t* bs_id, const blksz_t* blksz, siz_t bmult_id, cntx_t* cntx )
+err_t bli_cntx_register_blksz( kerid_t* bs_id, const blksz_t* blksz, kerid_t bmult_id, cntx_t* cntx )
 {
 	siz_t id_blksz;
 	err_t error = bli_stack_push( &id_blksz, &cntx->blkszs );
@@ -390,12 +390,15 @@ err_t bli_cntx_register_blksz( siz_t* bs_id, const blksz_t* blksz, siz_t bmult_i
 	}
 }
 
-err_t bli_cntx_register_ukr( siz_t* ukr_id, const func_t* ukr, cntx_t* cntx )
+err_t bli_cntx_register_ukr( kerid_t* ukr_id, const func_t* ukr, cntx_t* cntx )
 {
-	err_t error = bli_stack_push( ukr_id, &cntx->ukrs );
+	siz_t new_ukr_id;
+	err_t error = bli_stack_push( &new_ukr_id, &cntx->ukrs );
 	if ( error != BLIS_SUCCESS )
 		return error;
 
+	*ukr_id = new_ukr_id;
+
 	if ( ukr )
 	{
 		return bli_cntx_set_ukr( *ukr_id, ukr, cntx );
@@ -406,12 +409,15 @@ err_t bli_cntx_register_ukr( siz_t* ukr_id, const func_t* ukr, cntx_t* cntx )
 	}
 }
 
-err_t bli_cntx_register_ukr2( siz_t* ukr_id, const func2_t* ukr, cntx_t* cntx )
+err_t bli_cntx_register_ukr2( kerid_t* ukr_id, const func2_t* ukr, cntx_t* cntx )
 {
-	err_t error = bli_stack_push( ukr_id, &cntx->ukr2s );
+	siz_t new_ukr_id;
+	err_t error = bli_stack_push( &new_ukr_id, &cntx->ukr2s );
 	if ( error != BLIS_SUCCESS )
 		return error;
 
+	*ukr_id = new_ukr_id;
+
 	if ( ukr )
 	{
 		return bli_cntx_set_ukr2( *ukr_id, ukr, cntx );
@@ -422,12 +428,15 @@ err_t bli_cntx_register_ukr2( siz_t* ukr_id, const func2_t* ukr, cntx_t* cntx )
 	}
 }
 
-err_t bli_cntx_register_ukr_pref( siz_t* ukr_pref_id, const mbool_t* ukr_pref, cntx_t* cntx )
+err_t bli_cntx_register_ukr_pref( kerid_t* ukr_pref_id, const mbool_t* ukr_pref, cntx_t* cntx )
 {
-	err_t error = bli_stack_push( ukr_pref_id, &cntx->ukr_prefs );
+	siz_t new_ukr_pref_id;
+	err_t error = bli_stack_push( &new_ukr_pref_id, &cntx->ukr_prefs );
 	if ( error != BLIS_SUCCESS )
 		return error;
 
+	*ukr_pref_id = new_ukr_pref_id;
+
 	if ( ukr_pref )
 	{
 		return bli_cntx_set_ukr_pref( *ukr_pref_id, ukr_pref, cntx );
diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h
index 1e9874c05..f5043299e 100644
--- a/frame/base/bli_cntx.h
+++ b/frame/base/bli_cntx.h
@@ -44,7 +44,7 @@
 typedef struct cntx_s
 {
 	stck_t blkszs; // blksz_t
-	stck_t bmults; // siz_t
+	stck_t bmults; // kerid_t
 
 	stck_t ukrs; // func_t
 	stck_t ukr2s; // func2_t
@@ -60,7 +60,7 @@ typedef struct cntx_s
 // -- cntx_t query (complex) ---------------------------------------------------
 //
 
-BLIS_INLINE const blksz_t* bli_cntx_get_blksz( siz_t bs_id, const cntx_t* cntx )
+BLIS_INLINE const blksz_t* bli_cntx_get_blksz( kerid_t bs_id, const cntx_t* cntx )
 {
 	const blksz_t* blksz;
 	err_t error = bli_stack_get( bs_id, ( void** )&blksz, &cntx->blkszs );
@@ -69,7 +69,7 @@ BLIS_INLINE const blksz_t* bli_cntx_get_blksz( siz_t bs_id, const cntx_t* cntx )
 	return blksz;
 }
 
-BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, siz_t bs_id, const cntx_t* cntx )
+BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, kerid_t bs_id, const cntx_t* cntx )
 {
 	const blksz_t* blksz  = bli_cntx_get_blksz( bs_id, cntx );
 	dim_t          bs_dt  = bli_blksz_get_def( dt, blksz );
@@ -78,7 +78,7 @@ BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, siz_t bs_id, const cntx_t
 	return bs_dt;
 }
 
-BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, siz_t bs_id, const cntx_t* cntx )
+BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, kerid_t bs_id, const cntx_t* cntx )
 {
 	const blksz_t* blksz  = bli_cntx_get_blksz( bs_id, cntx );
 	dim_t          bs_dt  = bli_blksz_get_max( dt, blksz );
@@ -87,24 +87,24 @@ BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, siz_t bs_id, const cntx_t
 	return bs_dt;
 }
 
-BLIS_INLINE siz_t bli_cntx_get_bmult_id( siz_t bs_id, const cntx_t* cntx )
+BLIS_INLINE kerid_t bli_cntx_get_bmult_id( kerid_t bs_id, const cntx_t* cntx )
 {
-	const siz_t* bsz;
+	const kerid_t* bsz;
 	err_t error = bli_stack_get( bs_id, ( void** )&bsz, &cntx->bmults );
 	if ( error != BLIS_SUCCESS )
 		bli_check_error_code( error );
 	return *bsz;
 }
 
-BLIS_INLINE const blksz_t* bli_cntx_get_bmult( siz_t bs_id, const cntx_t* cntx )
+BLIS_INLINE const blksz_t* bli_cntx_get_bmult( kerid_t bs_id, const cntx_t* cntx )
 {
-	siz_t          bm_id  = bli_cntx_get_bmult_id( bs_id, cntx );
+	kerid_t        bm_id  = bli_cntx_get_bmult_id( bs_id, cntx );
 	const blksz_t* bmult  = bli_cntx_get_blksz( bm_id, cntx );
 
 	return bmult;
 }
 
-BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, siz_t bs_id, const cntx_t* cntx )
+BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, kerid_t bs_id, const cntx_t* cntx )
 {
 	const blksz_t* bmult  = bli_cntx_get_bmult( bs_id, cntx );
 	dim_t          bm_dt  = bli_blksz_get_def( dt, bmult );
@@ -114,7 +114,7 @@ BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, siz_t bs_id, const cntx_t* cn
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE const func2_t* bli_cntx_get_ukr2s( siz_t ukr_id, const cntx_t* cntx )
+BLIS_INLINE const func2_t* bli_cntx_get_ukr2s( kerid_t ukr_id, const cntx_t* cntx )
 {
 	const func2_t* ukr;
 	err_t error = bli_stack_get( bli_ker_idx( ukr_id ), ( void** )&ukr, &cntx->ukr2s );
@@ -123,7 +123,7 @@ BLIS_INLINE const func2_t* bli_cntx_get_ukr2s( siz_t ukr_id, const cntx_t* cntx
 	return ukr;
 }
 
-BLIS_INLINE void_fp bli_cntx_get_ukr2_dt( num_t dt1, num_t dt2, siz_t ukr_id, const cntx_t* cntx )
+BLIS_INLINE void_fp bli_cntx_get_ukr2_dt( num_t dt1, num_t dt2, kerid_t ukr_id, const cntx_t* cntx )
 {
 	const func2_t* func = bli_cntx_get_ukr2s( ukr_id, cntx );
 
@@ -132,7 +132,7 @@ BLIS_INLINE void_fp bli_cntx_get_ukr2_dt( num_t dt1, num_t dt2, siz_t ukr_id, co
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE const func_t* bli_cntx_get_ukrs( siz_t ukr_id, const cntx_t* cntx )
+BLIS_INLINE const func_t* bli_cntx_get_ukrs( kerid_t ukr_id, const cntx_t* cntx )
 {
 	if ( bli_ker_ntype( ukr_id ) == 2 )
 	{
@@ -148,7 +148,7 @@ BLIS_INLINE const func_t* bli_cntx_get_ukrs( siz_t ukr_id, const cntx_t* cntx )
 	}
 }
 
-BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, siz_t ukr_id, const cntx_t* cntx )
+BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, kerid_t ukr_id, const cntx_t* cntx )
 {
 	if ( bli_ker_ntype( ukr_id ) == 2 )
 	{
@@ -164,7 +164,7 @@ BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, siz_t ukr_id, const cntx_t* c
 
 // -----------------------------------------------------------------------------
 
-BLIS_INLINE const mbool_t* bli_cntx_get_ukr_prefs( siz_t pref_id, const cntx_t* cntx )
+BLIS_INLINE const mbool_t* bli_cntx_get_ukr_prefs( kerid_t pref_id, const cntx_t* cntx )
 {
 	const mbool_t* ukr_prefs;
 	err_t error = bli_stack_get( pref_id, ( void** )&ukr_prefs, &cntx->ukr_prefs );
@@ -173,7 +173,7 @@ BLIS_INLINE const mbool_t* bli_cntx_get_ukr_prefs( siz_t pref_id, const cntx_t*
 	return ukr_prefs;
 }
 
-BLIS_INLINE bool bli_cntx_get_ukr_prefs_dt( num_t dt, siz_t ukr_id, const cntx_t* cntx )
+BLIS_INLINE bool bli_cntx_get_ukr_prefs_dt( num_t dt, kerid_t ukr_id, const cntx_t* cntx )
 {
 	const mbool_t* mbool = bli_cntx_get_ukr_prefs( ukr_id, cntx );
 
@@ -262,14 +262,14 @@ BLIS_INLINE bool bli_cntx_dislikes_storage_of( const obj_t* obj, ukr_t ukr_id, c
 // NOTE: The framework does not use any of the following functions. We provide
 // them in order to facilitate creating/modifying custom contexts.
 
-BLIS_INLINE err_t bli_cntx_set_blksz( siz_t bs_id, const blksz_t* blksz, siz_t mult_id, cntx_t* cntx )
+BLIS_INLINE err_t bli_cntx_set_blksz( kerid_t bs_id, const blksz_t* blksz, kerid_t mult_id, cntx_t* cntx )
 {
 	blksz_t* cntx_blksz;
 	err_t error = bli_stack_get( bs_id, ( void** )&cntx_blksz, &cntx->blkszs );
 	if ( error != BLIS_SUCCESS )
 		return error;
 
-	siz_t* cntx_mult_id;
+	kerid_t* cntx_mult_id;
 	error = bli_stack_get( bs_id, ( void** )&cntx_mult_id, &cntx->bmults );
 	if ( error != BLIS_SUCCESS )
 		return error;
@@ -280,34 +280,34 @@ BLIS_INLINE err_t bli_cntx_set_blksz( siz_t bs_id, const blksz_t* blksz, siz_t m
 	return BLIS_SUCCESS;
 }
 
-BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, siz_t bs_id, dim_t bs, cntx_t* cntx )
+BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, kerid_t bs_id, dim_t bs, cntx_t* cntx )
 {
 	bli_blksz_set_def( bs, dt, ( blksz_t* )bli_cntx_get_blksz( bs_id, cntx ) );
 }
 
-BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, siz_t bs_id, dim_t bs, cntx_t* cntx )
+BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, kerid_t bs_id, dim_t bs, cntx_t* cntx )
 {
 	bli_blksz_set_max( bs, dt, ( blksz_t* )bli_cntx_get_blksz( bs_id, cntx ) );
 }
 
-BLIS_INLINE err_t bli_cntx_set_ukr2( siz_t ukr_id, const func2_t* func, cntx_t* cntx )
+BLIS_INLINE err_t bli_cntx_set_ukr2( kerid_t ukr_id, const func2_t* func, cntx_t* cntx )
 {
 	*( func2_t* )bli_cntx_get_ukr2s( ukr_id, cntx ) = *func;
 	return BLIS_SUCCESS;
 }
 
-BLIS_INLINE void bli_cntx_set_ukr2_dt( void_fp fp, num_t dt1, num_t dt2, siz_t ker_id, cntx_t* cntx )
+BLIS_INLINE void bli_cntx_set_ukr2_dt( void_fp fp, num_t dt1, num_t dt2, kerid_t ker_id, cntx_t* cntx )
 {
 	bli_func2_set_dt( fp, dt1, dt2, ( func2_t* )bli_cntx_get_ukr2s( ker_id, cntx ) );
 }
 
-BLIS_INLINE err_t bli_cntx_set_ukr( siz_t ukr_id, const func_t* func, cntx_t* cntx )
+BLIS_INLINE err_t bli_cntx_set_ukr( kerid_t ukr_id, const func_t* func, cntx_t* cntx )
 {
 	*( func_t* )bli_cntx_get_ukrs( ukr_id, cntx ) = *func;
 	return BLIS_SUCCESS;
 }
 
-BLIS_INLINE void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, siz_t ker_id, cntx_t* cntx )
+BLIS_INLINE void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, kerid_t ker_id, cntx_t* cntx )
 {
 	if ( bli_ker_ntype( ker_id ) == 2 )
 	{
@@ -319,13 +319,13 @@ BLIS_INLINE void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, siz_t ker_id, cntx_t
 	}
 }
 
-BLIS_INLINE err_t bli_cntx_set_ukr_pref( siz_t ukr_id, const mbool_t* prefs, cntx_t* cntx )
+BLIS_INLINE err_t bli_cntx_set_ukr_pref( kerid_t ukr_id, const mbool_t* prefs, cntx_t* cntx )
 {
 	*( mbool_t* )bli_cntx_get_ukr_prefs( ukr_id, cntx ) = *prefs;
 	return BLIS_SUCCESS;
 }
 
-BLIS_INLINE err_t bli_cntx_set_ukr_pref_dt( bool pref, num_t dt, siz_t ukr_id, cntx_t* cntx )
+BLIS_INLINE err_t bli_cntx_set_ukr_pref_dt( bool pref, num_t dt, kerid_t ukr_id, cntx_t* cntx )
 {
 	bli_mbool_set_dt( pref, dt, ( mbool_t* )bli_cntx_get_ukr_prefs( ukr_id, cntx ));
 	return BLIS_SUCCESS;
@@ -400,13 +400,13 @@ BLIS_EXPORT_BLIS void bli_cntx_print( const cntx_t* cntx );
 
 BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... );
 
-BLIS_EXPORT_BLIS err_t bli_cntx_register_blksz( siz_t* bs_id, const blksz_t* blksz, siz_t bmult_id, cntx_t* cntx );
+BLIS_EXPORT_BLIS err_t bli_cntx_register_blksz( kerid_t* bs_id, const blksz_t* blksz, kerid_t bmult_id, cntx_t* cntx );
 
-BLIS_EXPORT_BLIS err_t bli_cntx_register_ukr( siz_t* ukr_id, const func_t* ukr, cntx_t* cntx );
+BLIS_EXPORT_BLIS err_t bli_cntx_register_ukr( kerid_t* ukr_id, const func_t* ukr, cntx_t* cntx );
 
-BLIS_EXPORT_BLIS err_t bli_cntx_register_ukr2( siz_t* ukr_id, const func2_t* ukr, cntx_t* cntx );
+BLIS_EXPORT_BLIS err_t bli_cntx_register_ukr2( kerid_t* ukr_id, const func2_t* ukr, cntx_t* cntx );
 
-BLIS_EXPORT_BLIS err_t bli_cntx_register_ukr_pref( siz_t* ukr_pref_id, const mbool_t* ukr_pref, cntx_t* cntx );
+BLIS_EXPORT_BLIS err_t bli_cntx_register_ukr_pref( kerid_t* ukr_pref_id, const mbool_t* ukr_pref, cntx_t* cntx );
 
 
 #endif
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index a6eea5562..d174b0433 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -482,10 +482,10 @@ kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt )
 // -- microkernel and block size registration ----------------------------------
 //
 
-err_t bli_gks_register_blksz( siz_t* bs_id )
+err_t bli_gks_register_blksz( kerid_t* bs_id )
 {
-	siz_t id = 0;
-	siz_t next_id;
+	kerid_t id = 0;
+	kerid_t next_id;
 	cntx_t* cntx;
 	err_t err;
 
@@ -513,10 +513,10 @@ err_t bli_gks_register_blksz( siz_t* bs_id )
 	return BLIS_SUCCESS;
 }
 
-err_t bli_gks_register_ukr( siz_t* ukr_id )
+err_t bli_gks_register_ukr( kerid_t* ukr_id )
 {
-	siz_t id = 0;
-	siz_t next_id;
+	kerid_t id = 0;
+	kerid_t next_id;
 	cntx_t* cntx;
 	err_t err;
 
@@ -544,10 +544,10 @@ err_t bli_gks_register_ukr( siz_t* ukr_id )
 	return BLIS_SUCCESS;
 }
 
-err_t bli_gks_register_ukr2( siz_t* ukr_id )
+err_t bli_gks_register_ukr2( kerid_t* ukr_id )
 {
-	siz_t id = 0;
-	siz_t next_id;
+	kerid_t id = 0;
+	kerid_t next_id;
 	cntx_t* cntx;
 	err_t err;
 
@@ -575,10 +575,10 @@ err_t bli_gks_register_ukr2( siz_t* ukr_id )
 	return BLIS_SUCCESS;
 }
 
-err_t bli_gks_register_ukr_pref( siz_t* ukr_pref_id )
+err_t bli_gks_register_ukr_pref( kerid_t* ukr_pref_id )
 {
-	siz_t id = 0;
-	siz_t next_id;
+	kerid_t id = 0;
+	kerid_t next_id;
 	cntx_t* cntx;
 	err_t err;
 
diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h
index f4d12c641..8204ff717 100644
--- a/frame/base/bli_gks.h
+++ b/frame/base/bli_gks.h
@@ -57,13 +57,13 @@ BLIS_EXPORT_BLIS kimpl_t       bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method
 
 //char*                          bli_gks_l3_ukr_avail_impl_string( ukr_t ukr, num_t dt );
 
-BLIS_EXPORT_BLIS err_t bli_gks_register_blksz( siz_t* bs_id );
+BLIS_EXPORT_BLIS err_t bli_gks_register_blksz( kerid_t* bs_id );
 
-BLIS_EXPORT_BLIS err_t bli_gks_register_ukr( siz_t* ukr_id );
+BLIS_EXPORT_BLIS err_t bli_gks_register_ukr( kerid_t* ukr_id );
 
-BLIS_EXPORT_BLIS err_t bli_gks_register_ukr2( siz_t* ukr_id );
+BLIS_EXPORT_BLIS err_t bli_gks_register_ukr2( kerid_t* ukr_id );
 
-BLIS_EXPORT_BLIS err_t bli_gks_register_ukr_pref( siz_t* ukr_pref_id );
+BLIS_EXPORT_BLIS err_t bli_gks_register_ukr_pref( kerid_t* ukr_pref_id );
 
 #endif
 
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index df0b2a425..e1d82c563 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -623,13 +623,17 @@ typedef enum
 #define BLIS_2TYPE_KER       (  1u << BLIS_NTYPE_KER_SHIFT)
 #define BLIS_3TYPE_KER       (  2u << BLIS_NTYPE_KER_SHIFT)
 
-#define bli_ker_idx( ker )	 ((ker) & ~BLIS_NTYPE_KER_BITS)
+#define bli_ker_idx( ker )   ( kerid_t )((ker) & ~BLIS_NTYPE_KER_BITS)
 #define bli_ker_ntype( ker ) ((((ker) & BLIS_NTYPE_KER_BITS) >> BLIS_NTYPE_KER_SHIFT) + 1)
 
+// We have to use a 32-bit type here to avoid problems with passing small enum
+// constants to variadic functions. See https://github.com/flame/blis/issues/839.
+typedef uint32_t kerid_t;
+
 // Sentinel constant used to indicate the end of a variable argument function
 // (See bli_cntx.c)
 
-#define BLIS_VA_END  ((siz_t)-1)
+#define BLIS_VA_END  ((kerid_t)-1)
 
 typedef enum
 {
@@ -710,7 +714,7 @@ typedef enum
 	// BLIS_NUM_UKR2S must come after all kernels!
 	BLIS_NUM_UKR2S_, BLIS_NUM_UKR2S = bli_ker_idx( BLIS_NUM_UKR2S_ ),
 
-	// Force the size of ukr_t values to be as large as siz_t
+	// Force the size of ukr_t values to be as large as kerid_t
 	BLIS_UKRS_END_ = BLIS_VA_END
 } ukr_t;
 
@@ -738,7 +742,7 @@ typedef enum
     // BLIS_NUM_UKR_PREFS must be last!
     BLIS_NUM_UKR_PREFS,
 
-	// Force the size of ukr_pref_t values to be as large as siz_t
+	// Force the size of ukr_pref_t values to be as large as kerid_t
 	BLIS_UKR_PREFS_END_ = BLIS_VA_END
 } ukr_pref_t;
 
@@ -876,7 +880,7 @@ typedef enum
 	BLIS_NOID,
 	BLIS_NUM_LEVEL3_OPS = BLIS_NOID,
 
-	// Force the size of opid_t values to be as large as siz_t
+	// Force the size of opid_t values to be as large as kerid_t
 	BLIS_LEVEL3_OPS_END_ = BLIS_VA_END
 } opid_t;
 
@@ -926,7 +930,7 @@ typedef enum
 	              // such as when characterizing a packm operation.
 	BLIS_NUM_BLKSZS = BLIS_NO_PART,
 
-	// Force the size of bszid_t values to be as large as siz_t
+	// Force the size of bszid_t values to be as large as kerid_t
 	BLIS_BLKSZS_END_ = BLIS_VA_END
 } bszid_t;
 

From fb7ba1da524efa47011d95cfd8a9fee86018fcf0 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Fri, 17 Jan 2025 13:54:40 -0600
Subject: [PATCH 207/230] Update release instructions. (#837)

Details:
- Rename `RELEASING` to `RELEASING.md`.
- Add additional structure and Markdown notation to `RELEASING.md`.
- Add a section on the overall release and branching strategy.
- Clarify and tweak instructions for making release candidates and releases.
- Add instructions for making point releases and back-porting bug fixes.
- Rename `build/start-new-rc.sh` to `build/do-release.sh`.
- Tweak `do-release.sh` to do only common tasks for rcs, major releases, and point releases.
- Add `-b` option to `do-release.sh` which does a "bare" release without a new branch or tag (for "dev releases" on master).
- Update the version file on `master` to `3.0-dev` to reflect the new guidelines.
---
 RELEASING                                | 135 ---------
 RELEASING.md                             | 346 +++++++++++++++++++++++
 build/{start-new-rc.sh => do-release.sh} | 141 ++++-----
 build/version                            |   2 +-
 4 files changed, 420 insertions(+), 204 deletions(-)
 delete mode 100644 RELEASING
 create mode 100644 RELEASING.md
 rename build/{start-new-rc.sh => do-release.sh} (66%)

diff --git a/RELEASING b/RELEASING
deleted file mode 100644
index 85de229c9..000000000
--- a/RELEASING
+++ /dev/null
@@ -1,135 +0,0 @@
-Here are the steps to follow to create a new release candidate of BLIS:
-
-If you're creating a new release candidate lineage -- that is, the *first*
-release candidate for a new version (i.e., 2.0-rc0):
-
-1. Use the build/start-new-rc.sh script to create a new rc branch.
-
-   $ ./build/start-new-rc.sh "2.0"
-
-   This will update the version file in BLIS to reflect the new version
-   string (in this case, "2.0"). It also refreshes the contents of the
-   CHANGELOG file with the output of 'git log'. Finally, it creates a
-   new "-rc0" branch (in this case, "2.0-rc0").
-
-   NOTE: This script assumes that you want the new rc branch to be
-   a descendant of the head of 'master'.
-
-2. Make sure the script did what it was supposed to do by inspecting
-   the output of 'git log' and 'git branch'. If everything looks good,
-   you can push the changes via:
-
-   $ git checkout master
-   $ git push
-   $ git push -u origin 2.0-rc0
-
-   At this point, the new release candidate branch is live at origin.
-
-If you're creating a new release candidate for an existing lineage
--- that is, a follow-up release candidate for a new version that already
-has one or more release candidates -- start by checking out the latest
-release candidate, for example:
-
-1. Start by checking out the latest release candidate:
-
-   $ git checkout 2.0-rc1
-
-2. Then create a new release candidate branch whose name increments the
-   "rc" number:
-
-   $ git checkout -b 2.0-rc2
-
-3. Then cherry-pick one or more bugfixes that were made to 'master':
-
-   $ git cherry-pick -nx <commit>
-
-4. Finally, commit the changes. Be sure to include lines in the commit
-   log entry for each cherry-picked commit that note the commit hash
-   of the *original* commit that is being cherry-picked from. Example:
-
-     Fixed a bug in blahblahblah. (#777)
-    
-     Details:
-     - Fixed a bug in blahblahblah that manifested as blahblahblah. This
-       bug was introduced in commit abc12345. Thanks to John Smith for
-       reporting this bug.
-     - (cherry picked from commit abc0123456789abc0123456789abc0123456789a)
-
-   Note the final line, which was *not* present in the original commit
-   log entry but *should be* present in the commit log entry for the
-   commit that cherry-picks to (in this example) 2.0-rc2.
-
-
-Here are the steps to follow to create a new release (version) of BLIS:
-
-1. Make sure there are no commits that have yet to be pulled into
-   local repository.
-
-   $ git pull
-
-   If there are any commits upstream, merge them as appropriate.
-
-2. Check out the latest release candidate:
-
-   $ git checkout 2.0-rc2
-
-3. Consider whether the so_version should be updated (via the so_version
-   file in the 'build' directory) due to any ABI changes since the previous
-   version. If so, commit that change now.
-
-4. Verify that the code builds properly.
-
-   $ ./configure auto; make
-
-5. Verify that the code passes BLIS and BLAS tests:
-
-   $ make check           # BLIS testsuite (fast) + BLAS test drivers
-   $ make checkblis       # BLIS testsuite (full ex. mixed-datatype)
-   $ make checkblis-md    # BLIS testsuite (mixed-datatype only)
-   $ make checkblis-salt  # BLIS testsuite (fast + salt)
-
-6. Draft a new announcement to blis-devel, crediting those who
-   contributed towards this version by browsing 'git log'.
-
-7. Update CREDITS file if 'git log' reveals any new contributors.
-   NOTE: This should have already been done prior to the rc cycle.
-
-8. Update docs/ReleaseNotes.md file with body of finalized announcement
-   and the date of the release.
-   NOTE: This should be a cherry-pick off of 'master' since the release notes
-   need to be committed there anyway as well.
-
-9. Commit changes from steps 7 and 8.
-
-10. Create a final release branch:
-
-   $ git checkout -b 2.0-final
-
-   This will help identify the release commit when browsing the GitHub network
-   graph, since tags don't show up there.
-
-11. Tag the final release commit:
-
-   $ git tag 2.0 2.0-final
-
-   You can also use the actual commit hash instead of "2.0-final".
-
-12. Push the new commits and new tag associated with the new version:
-
-   $ git push
-   $ git push --tag
-
-13. Publish a new release via GitHub (https://github.com/flame/blis/releases).
-    Identify the new version by the tag you just created and pushed. You can
-    also identify the previous release.
-
-    Try to use formatting consistent with the prior release. (You can start to
-    edit the previous release, inspect/copy some of the markdown syntax, and
-    then abort the edit.)
-
-14. Update the Wikipedia entry for BLIS to reflect the new latest version.
-
-15. Announce the release on Discord.
-
-16. Send finalized announcement to blis-devel.
-
diff --git a/RELEASING.md b/RELEASING.md
new file mode 100644
index 000000000..9496b396a
--- /dev/null
+++ b/RELEASING.md
@@ -0,0 +1,346 @@
+## Contents
+
+* **[BLIS version numbering scheme and branching strategy](RELEASING.md#blis-version-numbering-scheme-and-branching-strategy)**
+* **[Instructions for creating a new release candidate or point release of BLIS
+](RELEASING.md#instructions-for-creating-a-new-release-candidate-or-point-release-of-blis
+)**
+  * **[Creating a new release lineage branch
+](RELEASING.md#creating-a-new-release-lineage-branch)**
+  * **[Creating a new release candidate  (e.g. `1.x` -> `2.0-rc0` or `2.0-rc0` -> `2.0-rc1`)](RELEASING.md#creating-a-new-release-candidate-eg-1x---20-rc0-or-20-rc0---20-rc1)**
+  * **[Creating a new major release (e.g. `2.0-rc<n>` -> `2.0`)](RELEASING.md#creating-a-new-major-release-eg-20-rcn---20)**
+  * **[Back-porting fixes from `master` to releases](RELEASING.md#back-porting-fixes-from-master-to-releases)**
+  * **[Creating a new point release (e.g. `1.1` -> `1.2` or `2.0` -> `2.1`)](RELEASING.md#creating-a-new-point-release-eg-11---12-or-20---21)**
+
+## BLIS version numbering scheme and branching strategy
+
+BLIS uses a major.minor version numbering scheme. An increase in the
+major version number (a "major release" or simply "new version")
+indicates new (usually significant) functionality, and possible
+incompatibility with previous major releases, although the ABI
+version can be used to check for compatibility across major versions
+in principle.
+
+Major releases have one or more "release candidates" which are
+preliminary versions of the next release, publicly distributed for
+comment and/or bug discovery. Subsequent release candidates (rcs)
+correct problems found in the previous rc. Once a reasonable level
+of stability is achieved, the full release is distributed.
+
+An increase in the minor version number (a "point release") indicates
+the incorporation of one or more bugfixes or other minor changes since
+the initial major version release or last point release.
+
+Essentially, point releases extend the rc cadence beyond the official
+release by correcting additional problems discovered after release.
+
+All rcs, initial major release, and point releases are created along a
+linear git branch, named for the major release lineage, e.g. `r1.x`.
+Commits indicating rcs and releases are tagged (e.g. `1.0-rc0`, `1.0`,
+`1.1`) and also have an associated non-tip branch (e.g. `r1.0-rc0`,
+`r1.0`, `r1.1`). Using both tags and branches increases visibility of
+important commits, but new commits should only be made on the `r1.x`
+lineage branch.
+
+Release lineage branches diverge from `master` starting with the first
+rc. Any new commits on the release lineage (except version maintenance
+commits such as updating the version file, CHANGELOG, and release notes)
+are cherry-picked from `master`. Exceptions may be made if, for example,
+a backported bugfix cannot be cherry-picked and requires a more targeted
+fix directly on a release branch.
+
+Here is an example illustration of the release branch structure:
+```
+_________________________________________________________master
+   \                    \
+    \                    \__r2.0-rc0_____r2.0-rc1_____r2.0,r2.x
+     \                      (2.0-rc0)    (2.0-rc1)    (2.0)
+      \
+       \__r1.0-rc0_____r1.0-rc1_____r1.0_____r1.1_____r1.2,r1.x
+          (1.0-rc0)    (1.0-rc1)    (1.0)    (1.1)    (1.2)
+                                     /\
+      <- release candidates -- major release -- point releases ->
+```
+
+In each case, the version number (as encoded in the `version` file)
+indicates the `x.y` prefix of the most recent tagged commit. The
+exception is `master`, where the `version` file indicates `z.0-dev`,
+where `z` is the major version number one higher than the latest major
+release (e.g. `3.0-dev` in the example above).
+
+## Instructions for creating a new release candidate or point release of BLIS
+
+### Creating a new release lineage branch
+
+1. Consider whether the so_version should be updated (via the `build/so_version`
+   file) due to any ABI changes since the previous version. If so, commit that
+   change on `master` now.
+
+2. Create the new release lineage branch.
+
+   ```
+   $ git checkout master
+   $ git pull
+   $ git branch r2.x
+   ```
+
+   Note that the new release lineage branch should not be checked out at this point.
+
+3. Update the version on the `master` branch to reflect the next release in development.
+
+   ```
+   $ ./build/do-release.sh -b "3.0-dev"
+   $ git push
+   ```
+
+   Note the extra option `-b`.
+
+4. Check out the new release lineage branch.
+
+   ```
+   $ git checkout r2.x
+   ```
+
+### Creating a new release candidate (e.g. `1.x` -> `2.0-rc0` or `2.0-rc0` -> `2.0-rc1`)
+
+1. Make sure that the release lineage branch is checked out and up-to-date.
+
+   ```
+   $ git checkout r2.x
+   $ git pull
+   ```
+
+2. Draft a new announcement to the blis-devel mailing list, crediting those who
+   contributed towards this version by browsing `git log`.
+
+3. Update the CREDITS file if `git log` reveals any new contributors.
+   NOTE: This should have already been done prior to the rc cycle.
+
+4. Commit the updated CREDITS file if changed.
+
+5. Update `docs/ReleaseNotes.md` with the body of finalized announcement
+   and the date of the release. Developers are encouraged to update
+   the release notes on `master` as new changes are made, which simplifies
+   preparation of rc0.
+
+6. Commit the updated `docs/ReleaseNotes.md` file.
+
+7. Use the `build/do-release.sh` script to create a new rc branch and tag.
+
+   ```
+   $ ./build/do-release.sh "2.0-rc<n>"
+   ```
+
+   Where `<n>` is `0` for the first rc, or one higher than the last rc on this release
+   lineage branch.
+
+8. Make sure the `do-release` script and other commits did what they were
+   supposed to do by inspecting the output of `git log`. If everything looks good,
+   you can push the changes via:
+
+   ```
+   $ git push
+   $ git push --tags
+   $ git push -u <origin> r2.0-rc<n>
+   ```
+
+   Where `<origin>` is the name of the appropriate upstream git remote.
+
+   At this point, the new release candidate branch is live at `<origin>`.
+
+9. Announce the rc release on blis-devel, Discord, and/or other appropriate
+   venues.
+
+10. Wait for bug reports. Typically an rc should stay live for at least a month
+    in order to give users time to try it out.
+
+11. After the trial period, cherry-pick any bugfixes or other updates:
+
+    `$ git cherry-pick [-nx] <commit>`
+
+    Be sure to include lines in the commit
+    log entry for each cherry-picked commit that note the commit hash
+    of the *original* commit that is being cherry-picked from. Example:
+
+    ```
+    Fixed a bug in blahblahblah. (#777)
+
+    Details:
+     - Fixed a bug in blahblahblah that manifested as blahblahblah. This
+       bug was introduced in commit abc12345. Thanks to John Smith for
+       reporting this bug.
+     - (cherry picked from commit abc0123456789abc0123456789abc0123456789a)
+    ```
+
+    Note the final line, which was *not* present in the original commit
+    log entry (on `master`) but *should be* present in the commit log entry for the
+    cherry-picked commit (on the release lineage branch).
+
+12. If no bugs are reported/found, or if the updated rc is otherwise ready
+    for promotion to full release, continue with the instructions below.
+    Otherwise, return to step 2, incrementing `<n>`.
+
+### Creating a new major release (e.g. `2.0-rc<n>` -> `2.0`)
+
+1. Make sure that the release lineage branch is checked out and up-to-date.
+
+   ```
+   $ git checkout r2.x
+   $ git pull
+   ```
+
+2. Draft a new announcement to the blis-devel mailing list, crediting those who
+   contributed towards this version by browsing `git log`.
+
+3. Update the CREDITS file if `git log` reveals any new contributors.
+   NOTE: This should have already been done prior to the release cycle.
+
+4. Commit the updated CREDITS file if changed.
+
+5. Update `docs/ReleaseNotes.md` with the body of finalized announcement
+   and the date of the release. Developers are encouraged to update
+   the release notes on `master` as new changes are made, which simplifies
+   preparation of the release.
+
+6. Commit the updated `docs/ReleaseNotes.md` file.
+
+7. Use the `build/do-release.sh` script to create a new release branch and tag.
+
+   ```
+   $ ./build/do-release.sh "2.0"
+   ```
+
+8. Make sure the `do-release` script and other commits did what they were
+   supposed to do by inspecting the output of `git log`. If everything looks good,
+   you can push the changes via:
+
+   ```
+   $ git push
+   $ git push --tags
+   $ git push -u <origin> r2.0
+   ```
+
+   Where `<origin>` is the name of the appropriate upstream git remote.
+
+   At this point, the new release branch is live at `<origin>`.
+
+9. Publish a new release via GitHub (https://github.com/flame/blis/releases).
+   Identify the new version by the tag you just created and pushed. You can
+   also identify the previous release.
+
+   Try to use formatting consistent with the prior release. (You can start to
+   edit the previous release, inspect/copy some of the markdown syntax, and
+   then abort the edit.)
+
+10. Announce the release on blis-devel, Discord, and/or other appropriate
+    venues.
+
+11. Update the Wikipedia entry for BLIS to reflect the new latest version.
+
+### Back-porting fixes from `master` to releases
+
+1. When a bug fix is developed on `master` which is applicable to a supported release,
+   and corrects a significant problem with correctness, usability, or performance
+   (e.g. not new functionality or cosmetic changes), it should be back-ported.
+   Bug fixes should be individually back-ported to all supported releases.
+
+2. Check out the relevant release lineage branch, e.g.:
+
+   ```
+   $ git checkout r2.x
+   $ git pull
+   ```
+
+3. Verify that the bug affects this release lineage. If not, skip this release lineage.
+
+4. If possible, cherry-pick the bugfix commit from `master`:
+
+   `$ git cherry-pick [-nx] <commit>`
+
+   Be sure to include lines in the commit
+   log entry for each cherry-picked commit that note the commit hash
+   of the *original* commit that is being cherry-picked from. Example:
+
+   ```
+     Fixed a bug in blahblahblah. (#777)
+
+     Details:
+     - Fixed a bug in blahblahblah that manifested as blahblahblah. This
+       bug was introduced in commit abc12345. Thanks to John Smith for
+       reporting this bug.
+     - (cherry picked from commit abc0123456789abc0123456789abc0123456789a)
+   ```
+
+   Note the final line, which was *not* present in the original commit
+   log entry (on `master`) but *should be* present in the commit log entry
+   for the cherry-picked commit (on the release lineage branch).
+
+5. If cherry-picking is not possible (e.g. the commit does not merge cleanly,
+   underlying implementation details or internal APIs have changed, etc.),
+   then craft a new bugfix on the release lineage branch. Make sure to test
+   the new bugfix against the reported bug, as well as the full BLIS testsuite!
+
+6. Push the changes via `git push`. Do not update any other release branches or tags
+   at this time.
+
+### Creating a new point release (e.g. `1.1` -> `1.2` or `2.0` -> `2.1`)
+
+1. Once enough bug fixes have accumulated, a sufficiently urgent bug fix has landed, or a
+   pre-determined period of time has elapsed, all bug fix commits since the last release
+   (major or point release) will be included in a new point release.
+
+   Point releases can be made on either the most recent release lineage branch or on
+   a "historical" but still supported release lineage.
+
+2. Check out the relevant release lineage branch (which may not be the most recent):
+
+   ```
+   $ git checkout r2.x
+   $ git pull
+   ```
+
+3. Draft a new announcement to the blis-devel mailing list, crediting those who
+   contributed towards this version by browsing `git log`.
+
+4. Update the CREDITS file if `git log` reveals any new contributors.
+   NOTE: This should have already been done prior to the release cycle.
+
+5. Commit the updated CREDITS file if changed.
+
+6. Update `docs/ReleaseNotes.md` with the body of finalized announcement
+   and the date of the release.
+
+7. Commit the updated `docs/ReleaseNotes.md` file.
+
+8. Use the `build/do-release.sh` script to create a new release branch and tag.
+
+   ```
+   $ ./build/do-release.sh "2.1"
+   ```
+
+9. Make sure the `do-release` script and other commits did what they were
+   supposed to do by inspecting the output of `git log`. If everything looks good,
+   you can push the changes via:
+
+   ```
+   $ git push
+   $ git push --tags
+   $ git push -u <origin> r2.1
+   ```
+
+   Where `<origin>` is the name of the appropriate upstream git remote.
+
+   At this point, the new release branch is live at `<origin>`.
+
+10. Update the release target branch via GitHub (https://github.com/flame/blis/releases).
+    Identify the new version by the tag you just created and pushed. You can
+    also identify the previous release.
+
+    Try to use formatting consistent with the prior release. (You can start to
+    edit the previous release, inspect/copy some of the markdown syntax, and
+    then abort the edit.)
+
+11. Announce the point release on blis-devel, Discord, and/or other appropriate
+    venues.
+
+12. If this point release is for the most recent major release lineage,
+    update the Wikipedia entry for BLIS to reflect the new latest version.
diff --git a/build/start-new-rc.sh b/build/do-release.sh
similarity index 66%
rename from build/start-new-rc.sh
rename to build/do-release.sh
index 786522214..97c906695 100755
--- a/build/start-new-rc.sh
+++ b/build/do-release.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 #
-#  BLIS    
+#  BLIS
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
 #
@@ -41,46 +41,45 @@
 
 print_usage()
 {
-	#local script_name
-	
-	# Get the script name
-	#script_name=${0##*/}
-	
-	# Echo usage info
-	echo " "
-	echo " "$script_name
-	echo " "
-	echo " Field G. Van Zee"
-	echo " "
-	echo " Performs a series of actions needed when creating a new release"
-	echo " candidate branch for BLIS:"
-	echo "   1. Overwrite the version file with the version string passed"
-	echo "      into this script (<new_vers>)."
-	echo "   2. Commit the updated version file."
-	echo "   3. Update the CHANGELOG file."
-	echo "   4. Commit the updated CHANGELOG file."
-	echo "   5. Create a new branch (named '<new_vers>-rc0') which refers to"
-	echo "      the commit created in (4)."
-	echo " "
-	echo " Usage:"
-	echo "   ${script_name} [options] new_vers"
-	echo " "
-	echo " Arguments:"
-	echo " "
-	echo "   new_vers     The new version string."
-	echo " "
-	echo " Options:"
-	echo " "
-	echo "   -d           dry-run"
-	echo "                  Go through all the motions, but don't actually make any"
-	echo "                  changes to files or perform any git commits. Note that"
-	echo "                  this will result in the commits for (2) and (5) above"
-	echo "                  being equal to the initial commit in the script output."
-	echo "   -f VERSFILE  version file name"
-	echo "                  Update VERSFILE with new version string instead of default"
-	echo "                  'version' file."
-	
-	# Exit with non-zero exit status
+	cat <<- EOF
+
+	$script_name
+
+	 Field G. Van Zee
+
+	 Performs a series of actions needed when creating a new release
+	 candidate branch for BLIS:
+	   1. Overwrite the version file with the version string passed
+	      into this script (<new_vers>).
+	   2. Commit the updated version file.
+	   3. Update the CHANGELOG file.
+	   4. Commit the updated CHANGELOG file.
+	   5. Create a new branch (named 'r<new_vers>') which refers to
+	      the commit created in (4).
+	   6. Tag the commit created in (4) with a tag '<new_vers>'.
+
+	 Usage:
+	   ${script_name} [options] new_vers
+
+	 Arguments:
+
+	   new_vers     The new version string.
+
+	 Options:
+
+	   -b           bare update
+	                  Update the version and CHANGELOG files but do not create
+	                  a release branch or tag.
+	   -d           dry-run
+	                  Go through all the motions, but don't actually make any
+	                  changes to files or perform any git commits. Note that
+	                  this will result in the commits for (2) and (4) above
+	                  being equal to the initial commit in the script output.
+	   -f VERSFILE  version file name
+	                  Update VERSFILE with new version string instead of default
+	                  'version' file.
+	EOF
+	# Exit with non-zero exit status.
 	exit 1
 }
 
@@ -117,17 +116,21 @@ main()
 
 	# The git directory.
 	gitdir='.git'
-	
+
 	# Whether we are performing a dry run or not.
-	dry_run_flag=""	
+	dry_run_flag=""
+
+	# Whether we are doing a bare update or not.
+	bare_flag=""
 
 	# -- END GLOBAL VARIABLE DECLARATIONS --
 
 
 	# Process our command line options.
-	while getopts ":dhf:" opt; do
+	while getopts ":dhbf:" opt; do
 		case $opt in
 			d  ) dry_run_flag="1" ;;
+			b  ) bare_flag="1" ;;
 			f  ) version_file=$OPTARG ;;
 			h  ) print_usage ;;
 			\? ) print_usage
@@ -152,10 +155,12 @@ main()
 	if [ $# = "1" ]; then
 
 		new_version_str=$1
-		new_rc_str="${new_version_str}-rc0"
+		new_rc_str="r${new_version_str}"
 
 		echo "${script_name}: new version string: '${new_version_str}'."
-		echo "${script_name}: preparing to create release candidate branch '${new_rc_str}'."
+		if [ -z "${bare_flag}" ]; then
+			echo "${script_name}: preparing to create release (candidate) branch '${new_rc_str}'."
+		fi
 
 	else
 		print_usage
@@ -175,10 +180,8 @@ main()
 			echo "${new_version_str}" > ${version_file}
 		fi
 
-		echo "${script_name}: executing: git checkout ${master_br}."
 		echo "${script_name}: executing: git commit -m \"Version file update (${new_version_str})\" ${version_file}."
 		if [ -z "$dry_run_flag" ]; then
-			git checkout ${master_br}
 			git commit -m "Version file update (${new_version_str})" ${version_file}
 		fi
 
@@ -187,13 +190,7 @@ main()
 
 		echo "${script_name}: updating '${changelog_file}'."
 		if [ -z "$dry_run_flag" ]; then
-
-			# If 'make distclean' was run recently, we need to re-run
-			# configure in order for 'make changelog' to work properly.
-			if [ ! -f "${configmk_file}" ]; then
-				./configure auto
-			fi
-			make changelog
+			git log --no-decorate > ${changelog_file}
 		fi
 
 		echo "${script_name}: executing: git commit -m \"CHANGELOG update (${new_version_str})\" ${changelog_file}."
@@ -204,20 +201,28 @@ main()
 		git_commit_str=$(git describe --always)
 		echo "${script_name}: new commit containing CHANGELOG update: ${git_commit_str}."
 
-		echo "${script_name}: executing: git checkout -b ${new_rc_str}."
-		if [ -z "$dry_run_flag" ]; then
-			git checkout -b "${new_rc_str}"
-		fi
+		if [ -z "${bare_flag}" ]; then
 
-		echo "${script_name}: "
-		echo "${script_name}: FINAL STEPS: Check the output of 'git log'. If everything"
-		echo "${script_name}: looks okay, execute these commands manually:"
-		echo "${script_name}: "
-		echo "${script_name}:   git checkout master"
-		echo "${script_name}:   git push"
-		echo "${script_name}:   git push -u origin ${new_rc_str}"
-		echo "${script_name}: "
-		
+			echo "${script_name}: Creating branch ${new_rc_str}."
+			if [ -z "$dry_run_flag" ]; then
+				git branch "${new_rc_str}"
+			fi
+
+			echo "${script_name}: Tagging branch ${new_rc_str} with tag ${new_version_str}."
+			if [ -z "$dry_run_flag" ]; then
+				git tag "${new_version_str}" "${new_rc_str}"
+			fi
+
+			echo "${script_name}: "
+			echo "${script_name}: FINAL STEPS: Check the output of 'git log'. If everything"
+			echo "${script_name}: looks okay, push the new branch manually:"
+			echo "${script_name}: "
+			echo "${script_name}:   git push"
+			echo "${script_name}:   git push --tags"
+			echo "${script_name}:   git push -u origin ${new_rc_str}"
+			echo "${script_name}: "
+
+		fi
 
 	else
 
diff --git a/build/version b/build/version
index d3827e75a..7a1511416 100644
--- a/build/version
+++ b/build/version
@@ -1 +1 @@
-1.0
+3.0-dev

From 4bc4a1c9a42b6c9b4f68722cd7b666ebf9a46b9a Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Fri, 17 Jan 2025 14:33:50 -0600
Subject: [PATCH 208/230] ReleaseNotes.md update.

Details:
- Update release notes for #841, should have been done in the PR.
- [ci skip]
---
 docs/ReleaseNotes.md | 64 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md
index 9a667f80e..d12a6df56 100644
--- a/docs/ReleaseNotes.md
+++ b/docs/ReleaseNotes.md
@@ -4,6 +4,8 @@
 
 ## Contents
 
+* [Changes in 3.0](ReleaseNotes.md#changes-in-30)
+* [Changes in 2.0](ReleaseNotes.md#changes-in-20)
 * [Changes in 1.0](ReleaseNotes.md#changes-in-10)
 * [Changes in 0.9.0](ReleaseNotes.md#changes-in-090)
 * [Changes in 0.8.1](ReleaseNotes.md#changes-in-081)
@@ -41,6 +43,66 @@
 * [Changes in 0.0.2](ReleaseNotes.md#changes-in-002)
 * [Changes in 0.0.1](ReleaseNotes.md#changes-in-001)
 
+## Changes in 3.0:
+In development
+
+Improvements present in 3.0:
+
+Framework:
+- Fixed an issue which could cause a segfault on x86-64 with `-m32` (and potentially, on other 64-bit setups) stemming from how enum constants are passed to variadic functions. (Igor Zhuravlov)
+
+## Changes in 2.0:
+January 15, 2025
+
+Improvements present in 2.0:
+
+Known Issues:
+- There is a performance regression in the `ztrmm` and `ztrsm` operations. On the Ampere Altra, performance is impacted by up to 30%; it is currently unknown if and how much this bug affects other architectures but the effect should be much smaller in most cases.
+
+Framework:
+- BLIS now supports "plugins", which provide additional functionality through user-defined kernels, blocksizes, and kernel preferences. Users can use an installed copy of BLIS (even a binary-only distribution) to create a plugin outside of the BLIS source tree. User-written reference kernels can then be registered into BLIS, and are compiled by the BLIS build system for all configured architectures. This also means that user-provided kernels participate in run-time kernel selection based on the actual hardware used! Additionally, users can provide and register optimized kernels for specific architectures which are automatically selected as appropriate. See `docs/PluginHowTo.md` for more information.
+- A new API has been added which allows users to modify the default "control tree". This data structure defines the specific algorithmic steps used to implement a level-3 BLAS operation such as `gemm` or `syrk`. Users can start with a predefined control tree for one of the level-3 BLAS operations (except `trsm` currently) and then modify it to produce a custom operation. Users can change kernels for packing and computation, associated blocksizes, and provide additional information (such as external parameters or additional data) which is passed directly to the kernels. See `docs/PluginHowTo.md` for more information and a working example.
+- All level-3 BLAS operations (except `trsm`) now support full mixed-precision mixed-domain computation. The A, B, and C matrices, as well as the alpha and beta scalars, may be provided in any of the supported data types (single/double precision and real/complex domain, currently), and an additionally-provided computational precision controls how the computation is actually performed internally. The computational precision can be set on the `obj_t` structure representing the C matrix.
+- Added a `func2_t` struct for dealing with 2-type kernels (see below). A `func2_t` can be safely cast to `func_t` to refer to only kernels with equal type parameters. (Devin Matthews)
+- The `bli_*_front` functions have been removed.
+- Extensive other back-end changes and improvements.
+
+Compatibility:
+- Added a ScaLAPACK compatibility mode which disables some conflicting BLAS definitions. (Field Van Zee)
+- Fixed issues with improperly escaped strings in python scripts for compatibility with python 3.12+. (@AngryLoki)
+- Added a user-defined macro `BLIS_ENABLE_STD_COMPLEX` which uses `std::complex` typedefs in `blis.h` for C++ code.  (Devin Matthews)
+- Fixed a bug in the definition of some scalar level-0 macros affecting compatibility of `bli_creal` and `bli_zreal`, for example. (Devin Matthews)
+- Fixed improperly-quoted strings in Python scripts which affected compatibility with Python 3.12+. (@AngryLoki)
+- The static initializer macros (`BLIS_*_INITIALIZER`) have been fixed for compatibility with C++. (Devin Matthews)
+- Install "helper" `blis.h` and `cblas.h` headers directly to `INCDIR` (in addition to the full files in `INCDIR/blis`). (Field Van Zee, Jed Brown, Mo Zhou)
+
+Kernels:
+- Fixed an out-of-bounds read bug in the `haswell` `gemmsup` kernels. (John Mather)
+- Fixed a bug in the complex-domain `gemm` kernels for `piledriver`. (@rmast)
+- Kernel, blocksizes, and preference lookup functions now use `siz_t` rather than specific enums. (Devin Matthews)
+- Fixed some issues with run-time kernel detection and added more ARM part numbers/manufacturer codes. (John Mather)
+- Kernels can now be added which have two datatype parameters. Kernel IDs are assigned such that 1-type and 2-type kernels cannot be interchanged accidentally. (Devin Matthews)
+- The packing microkernels and computational microkernels (`gemm` and `gemmtrsm`) now receive offsets into the global matrix. The latter are passed via the `auxinfo_t` struct. (Devin Matthews)
+- The separate "MRxk" and "NRxk" packing kernels have been merged into one generic packing kernel. Packing kernels are now expected to pack any size micropanel, but may optimize for specific shapes. (Devin Matthews)
+- Added explicit packing kernels for diagonal portions of matrices, and for certain mixed-domain/1m cases. (Devin Matthews)
+- Improved support for duplication during packing ("broadcast-B") across all packing kernels.
+
+Build system:
+- The `cblas.h` file is now "flattened" immediately after `blis.h` is (if enabled), rather than later in the build process. (Jeff Diamond, Field Van Zee)
+- Added script to help with preparing release candidate branches. (Field Van Zee)
+- The configure script has been overhauled. In particular, using spaces in `CC`/`CXX` is now supported. (Devin Matthews)
+- Improved support for C++ source files in BLIS or in plugins. (Devin Matthews)
+
+Testing:
+- test/3 drivers now allow using the "default" induced method, rather than forcing native or 1m operation. (Field Van Zee, Leick Robinson)
+- Fix some segfaults in the test/3 drivers. (Field Van Zee, Leick Robinson)
+- The testsuite now tests *all* possible type combinations when requested. (Devin Matthews)
+- Improved detection of problems in `make check-blis` and related targets. (Devin Matthews)
+
+Documentation:
+- Added documentation for the new plugin system and for creating custom operations by modifying the BLIS control tree. (Devin Matthews)
+- Updated documentation for downloading BLIS in `README.md` and instructions for maintainers in `RELEASING`. (Field Van Zee)
+
 ## Changes in 1.0
 May 6, 2024
 
@@ -302,7 +364,7 @@ Kernels:
 Build system:
 - Output a pkgconfig file so that CMake users that use BLIS can find and incorporate BLIS build products. (Ajay Panyala)
 - Fixed an issue in the the configure script's kernel-to-config map that caused `skx` kernel flags to be used when compiling kernels from the `zen` kernel set. This issue wasn't really fixed, but rather tweaked in such a way that it happens to now work. A more proper fix would require a serious rethinking of the configuration system. (Devin Matthews)
-- Fixed the shared library build rule in top-level Makefile. The previous rule was incorrectly only linking prerequisites that were newer than the target (`$?`) rather than correctly linking all prerequisites (`$^`). (Devin Matthews) 
+- Fixed the shared library build rule in top-level Makefile. The previous rule was incorrectly only linking prerequisites that were newer than the target (`$?`) rather than correctly linking all prerequisites (`$^`). (Devin Matthews)
 - Fixed `cc_vendor` for crosstool-ng toolchains. (Isuru Fernando)
 - Allow disabling of `trsm` diagonal pre-inversion at compile time via `--disable-trsm-preinversion`.
 

From 534d52b3b3bf9cea5e539baa28505ebe14f74921 Mon Sep 17 00:00:00 2001
From: Nick Papior <nickpapior@gmail.com>
Date: Mon, 20 Jan 2025 21:25:52 +0100
Subject: [PATCH 209/230] Clarified OMP_NUM_THREADS (#835)

Details:
- Removed/relaxed the deprecation warning for `OMP_NUM_THREADS`.
- Clarified how `OMP_NUM_THREADS` is used and added a simple example of how to use different thread counts for OpenMP regions and for BLIS.
---
 docs/Multithreading.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/docs/Multithreading.md b/docs/Multithreading.md
index d8f8b13f4..11ca7d1d7 100644
--- a/docs/Multithreading.md
+++ b/docs/Multithreading.md
@@ -147,7 +147,14 @@ $ GOMP_CPU_AFFINITY="0-15" BLIS_NUM_THREADS=16 ./my_blis_program
 ```
 Either of these approaches causes BLIS to automatically determine a reasonable threading strategy based on what is known about the operation and problem size. If `BLIS_NUM_THREADS` is not set, BLIS will attempt to query the value of `BLIS_NT` (a shorthand alternative to `BLIS_NUM_THREADS`). If neither variable is defined, then BLIS will attempt to read `OMP_NUM_THREADS`. If none of these variables is set, the default number of threads is 1.
 
-**Note**: We *highly* discourage use of the `OMP_NUM_THREADS` environment variable to specify multithreading within BLIS and may remove support for it in the future. If you wish to set parallelism globally via environment variables, please use `BLIS_NUM_THREADS`.
+**Note**: If neither `BLIS_NT` nor `BLIS_NUM_THREADS` is defined, BLIS will fall back to using
+the standardized `OMP_NUM_THREADS` environment variable.
+By having an application-specific environment variable, one can fine-tune the thread
+utilization, e.g. to run OpenMP constructs using 4 threads and BLIS with 2 threads:
+```
+$ OMP_NUM_THREADS=4 BLIS_NUM_THREADS=2 ./my_omp_blis_program
+```
+
 
 ### Environment variables: the manual way
 

From 967d29d6bfb2bbe44f0491448b4458caf31df88a Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Mon, 20 Jan 2025 14:27:01 -0600
Subject: [PATCH 210/230] CREDITS file update.

---
 CREDITS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CREDITS b/CREDITS
index 373514136..27877fff8 100644
--- a/CREDITS
+++ b/CREDITS
@@ -91,6 +91,7 @@ but many others have contributed code, ideas, and feedback, including
   Nisanth M P              @nisanthmp
   Nisanth Padinharepatt                               (AMD)
   Ajay Panyala             @ajaypanyala
+  Nick Papior              @zerothi
   Marc-Antoine Parent      @maparent                  (Conversence)
   Devangi Parikh           @dnparikh                  (The University of Texas at Austin)
   Elmar Peise              @elmar-peise               (RWTH-Aachen)

From 38063dd9d96e0bba695daba9b276d35258c887ee Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <fgvanzee@gmail.com>
Date: Mon, 20 Jan 2025 14:35:21 -0600
Subject: [PATCH 211/230] Optionally ignore extra dirs in `gen-make-frag.sh`.
 (#833)

Details:
- Implemented an option (`-i LIST`) to `gen-make-frag.sh` that allows the caller to optionally ignore additional directories when walking the source directory. (Note that previously the standard -- and only -- way to ignore directories was to add them to the `ignore_list` file, which is a required argument to the script.)
- I implemented this feature for something but then ended up not needing it, but figured it might be helpful in the future.
- Multiple `-i` options are allowed.
---
 build/gen-make-frags/gen-make-frag.sh | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/build/gen-make-frags/gen-make-frag.sh b/build/gen-make-frags/gen-make-frag.sh
index e826db068..348b3c68f 100755
--- a/build/gen-make-frags/gen-make-frag.sh
+++ b/build/gen-make-frags/gen-make-frag.sh
@@ -86,6 +86,9 @@ print_usage()
 	echo "                 root_dir."
 	echo "   -h          hide"
 	echo "                 Hide the makefile fragments by prepending filenames with '.'."
+	echo "   -i LIST     ignore"
+	echo "                 Augment the list of directory names contained in ign_list"
+	echo "                 with the directory names in LIST."
 	echo "   -p PREFIX   prefix name"
 	echo "                 Use PREFIX instead of uppercased root_dir in the makefile"
 	echo "                 variable name. If the root_dir were 'stuff' and -p was not"
@@ -430,6 +433,7 @@ main()
 	# Flags set by getopts.
 	dry_run_flag=""
 	hide_flag=""
+	ignore_list=""
 	recursive_flag=""
 	output_name=""
 	prefix_flag=""
@@ -443,11 +447,12 @@ main()
 
 
 	# Process our command line options.
-	while getopts ":dho:p:rv:" opt; do
+	while getopts ":dhi:o:p:rv:" opt; do
 		case $opt in
 			d  ) dry_run_flag="1" ;;
 			h  ) hide_flag="1" ;;
 			r  ) recursive_flag="1" ;;
+			i  ) ignore_list="${ignore_list} $OPTARG" ;;
 			o  ) output_name=$OPTARG ;;
 			p  ) prefix_flag=$OPTARG ;;
 			v  ) verbose_flag=$OPTARG ;;
@@ -487,6 +492,10 @@ main()
 	# generation.
 	read_mkfile_config
 
+	# Append the command line ignore_list to the ignore_dirs variable read from
+	# ignore_file by read_mkfile_config().
+	ignore_dirs="${ignore_dirs} ${ignore_list}"
+
 
 	# Strip / from end of directory path, if there is one.
 	root_dir=${root_dir%/}

From 769d73f01b07777bc1b9b7ac6b9eeef02f145471 Mon Sep 17 00:00:00 2001
From: Michael Yeh <111819036+myeh01@users.noreply.github.com>
Date: Mon, 20 Jan 2025 12:43:09 -0800
Subject: [PATCH 212/230] Add the sifive_rvv configuration (#832)

Details:
- Added a `sifive_rvv` configuration which is `VLEN`-agnostic but takes advantage of optimized microkernels for SiFive (and other) RISC-V architectures.
- This configuration does not currently participate in automatic configuration selection during BLIS configure.
- `VLEN` is detected at runtime to properly make use of available vectorization.
---
 config/sifive_rvv/bli_cntx_init_sifive_rvv.c  | 225 +++++++++++++++++
 config/sifive_rvv/bli_family_sifive_rvv.h     |  34 +++
 .../sifive_rvv/bli_kernel_defs_sifive_rvv.h   |  55 +++++
 config/sifive_rvv/make_defs.mk                |  80 ++++++
 .../sifive_x280/bli_cntx_init_sifive_x280.c   | 230 +++++++++---------
 config/sifive_x280/make_defs.mk               |   4 +-
 config_registry                               |   3 +-
 frame/base/bli_arch.c                         |   4 +
 frame/include/bli_arch_config.h               |   6 +
 frame/include/bli_gentconf_macro_defs.h       |   6 +
 frame/include/bli_type_defs.h                 |   1 +
 .../bli_addv_sifive_rvv_intr.c}               |  12 +-
 .../bli_addv_sifive_rvv_intr_complex.c}       |   2 +-
 .../bli_addv_sifive_rvv_intr_real.c}          |   2 +-
 .../bli_amaxv_sifive_rvv_intr.c}              |  28 +--
 .../bli_amaxv_sifive_rvv_intr_complex.c}      |   0
 .../bli_amaxv_sifive_rvv_intr_real.c}         |   0
 .../bli_axpbyv_sifive_rvv_intr.c}             |  18 +-
 .../bli_axpbyv_sifive_rvv_intr_complex.c}     |   2 +-
 .../bli_axpbyv_sifive_rvv_intr_real.c}        |   2 +-
 .../bli_axpyv_sifive_rvv_intr.c}              |  12 +-
 .../bli_axpyv_sifive_rvv_intr_complex.c}      |   2 +-
 .../bli_axpyv_sifive_rvv_intr_real.c}         |   2 +-
 .../bli_copyv_sifive_rvv_intr.c}              |  10 +-
 .../bli_copyv_sifive_rvv_intr_complex.c}      |   0
 .../bli_copyv_sifive_rvv_intr_real.c}         |   0
 .../bli_dotv_sifive_rvv_intr.c}               |  12 +-
 .../bli_dotv_sifive_rvv_intr_complex.c}       |   2 +-
 .../bli_dotv_sifive_rvv_intr_real.c}          |   2 +-
 .../bli_dotxv_sifive_rvv_intr.c}              |  12 +-
 .../bli_dotxv_sifive_rvv_intr_complex.c}      |   2 +-
 .../bli_dotxv_sifive_rvv_intr_real.c}         |   2 +-
 .../bli_invertv_sifive_rvv_intr.c}            |  10 +-
 .../bli_invertv_sifive_rvv_intr_complex.c}    |   0
 .../bli_invertv_sifive_rvv_intr_real.c}       |   0
 .../bli_invscalv_sifive_rvv_intr.c}           |  10 +-
 .../bli_invscalv_sifive_rvv_intr_complex.c}   |   0
 .../bli_invscalv_sifive_rvv_intr_real.c}      |   0
 .../bli_scal2v_sifive_rvv_intr.c}             |  16 +-
 .../bli_scal2v_sifive_rvv_intr_complex.c}     |   2 +-
 .../bli_scal2v_sifive_rvv_intr_real.c}        |   2 +-
 .../bli_scalv_sifive_rvv_intr.c}              |  14 +-
 .../bli_scalv_sifive_rvv_intr_complex.c}      |   2 +-
 .../bli_scalv_sifive_rvv_intr_real.c}         |   2 +-
 .../bli_setv_sifive_rvv_intr.c}               |  10 +-
 .../bli_setv_sifive_rvv_intr_complex.c}       |   0
 .../bli_setv_sifive_rvv_intr_real.c}          |   0
 .../bli_subv_sifive_rvv_intr.c}               |  12 +-
 .../bli_subv_sifive_rvv_intr_complex.c}       |   2 +-
 .../bli_subv_sifive_rvv_intr_real.c}          |   2 +-
 .../bli_swapv_sifive_rvv_intr.c}              |  10 +-
 .../bli_swapv_sifive_rvv_intr_complex.c}      |   0
 .../bli_swapv_sifive_rvv_intr_real.c}         |   0
 .../bli_xpbyv_sifive_rvv_intr.c}              |  14 +-
 .../bli_xpbyv_sifive_rvv_intr_complex.c}      |   2 +-
 .../bli_xpbyv_sifive_rvv_intr_real.c}         |   2 +-
 .../bli_axpy2v_sifive_rvv_intr.c}             |  12 +-
 .../bli_axpy2v_sifive_rvv_intr_complex.c}     |   2 +-
 .../bli_axpy2v_sifive_rvv_intr_real.c}        |   2 +-
 .../bli_axpyf_sifive_rvv_intr.c}              |  10 +-
 .../bli_axpyf_sifive_rvv_intr_complex.c}      |   0
 .../bli_axpyf_sifive_rvv_intr_real.c}         |   0
 .../bli_dotaxpyv_sifive_rvv_intr.c}           |  12 +-
 .../bli_dotaxpyv_sifive_rvv_intr_complex.c}   |   2 +-
 .../bli_dotaxpyv_sifive_rvv_intr_real.c}      |   2 +-
 .../bli_dotxaxpyf_sifive_rvv_intr.c}          |  14 +-
 .../bli_dotxaxpyf_sifive_rvv_intr_complex.c}  | 134 +++++-----
 .../bli_dotxaxpyf_sifive_rvv_intr_real.c}     |  86 +++----
 .../bli_dotxf_sifive_rvv_intr.c}              |  14 +-
 .../bli_dotxf_sifive_rvv_intr_complex.c}      | 126 +++++-----
 .../bli_dotxf_sifive_rvv_intr_real.c}         | 110 ++++-----
 .../bli_packm_sifive_rvv_intr.c}              |  32 ++-
 .../bli_packm_sifive_rvv_intr_complex.c}      |  19 +-
 .../bli_packm_sifive_rvv_intr_real.c}         |  22 +-
 .../bli_gemm_sifive_rvv_intr.c}               |  18 +-
 .../bli_gemm_sifive_rvv_intr_complex.c}       |   0
 .../bli_gemm_sifive_rvv_intr_real.c}          |   0
 .../bli_gemmtrsm_sifive_rvv_intr.c}           |  22 +-
 .../bli_gemmtrsm_sifive_rvv_intr_complex.c}   |   2 +-
 .../bli_gemmtrsm_sifive_rvv_intr_real.c}      |   2 +-
 kernels/sifive_rvv/bli_kernels_sifive_rvv.h   | 162 ++++++++++++
 .../riscv_cmul_macros_intr.h                  |   0
 .../riscv_overloaded_intrinsics.h             |   2 +-
 kernels/sifive_x280/bli_kernels_sifive_x280.h | 162 ------------
 kernels/sifive_x280/riscv_cmul_macros_asm.h   | 137 -----------
 85 files changed, 1137 insertions(+), 853 deletions(-)
 create mode 100644 config/sifive_rvv/bli_cntx_init_sifive_rvv.c
 create mode 100644 config/sifive_rvv/bli_family_sifive_rvv.h
 create mode 100644 config/sifive_rvv/bli_kernel_defs_sifive_rvv.h
 create mode 100644 config/sifive_rvv/make_defs.mk
 rename kernels/{sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c => sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr.c} (92%)
 rename kernels/{sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr_complex.c} (98%)
 rename kernels/{sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c => sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr_real.c} (98%)
 rename kernels/{sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c => sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr.c} (86%)
 rename kernels/{sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr_complex.c} (100%)
 rename kernels/{sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c => sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr_real.c} (100%)
 rename kernels/{sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c => sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr.c} (92%)
 rename kernels/{sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr_complex.c} (99%)
 rename kernels/{sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c => sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr_real.c} (98%)
 rename kernels/{sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c => sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr.c} (92%)
 rename kernels/{sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr_complex.c} (99%)
 rename kernels/{sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c => sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr_real.c} (98%)
 rename kernels/{sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c => sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr.c} (93%)
 rename kernels/{sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr_complex.c} (100%)
 rename kernels/{sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c => sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr_real.c} (100%)
 rename kernels/{sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c => sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr.c} (92%)
 rename kernels/{sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr_complex.c} (99%)
 rename kernels/{sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c => sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr_real.c} (98%)
 rename kernels/{sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c => sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr.c} (93%)
 rename kernels/{sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr_complex.c} (99%)
 rename kernels/{sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c => sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr_real.c} (98%)
 rename kernels/{sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c => sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr.c} (93%)
 rename kernels/{sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr_complex.c} (100%)
 rename kernels/{sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c => sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr_real.c} (100%)
 rename kernels/{sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c => sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr.c} (93%)
 rename kernels/{sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr_complex.c} (100%)
 rename kernels/{sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c => sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr_real.c} (100%)
 rename kernels/{sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c => sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr.c} (92%)
 rename kernels/{sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c => sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr_complex.c} (99%)
 rename kernels/{sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c => sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr_real.c} (98%)
 rename kernels/{sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c => sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr.c} (92%)
 rename kernels/{sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr_complex.c} (98%)
 rename kernels/{sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c => sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr_real.c} (98%)
 rename kernels/{sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c => sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr.c} (93%)
 rename kernels/{sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr_complex.c} (100%)
 rename kernels/{sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c => sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr_real.c} (100%)
 rename kernels/{sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c => sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr.c} (92%)
 rename kernels/{sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr_complex.c} (98%)
 rename kernels/{sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c => sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr_real.c} (98%)
 rename kernels/{sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c => sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr.c} (93%)
 rename kernels/{sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr_complex.c} (100%)
 rename kernels/{sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c => sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr_real.c} (100%)
 rename kernels/{sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c => sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr.c} (92%)
 rename kernels/{sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c => sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr_complex.c} (99%)
 rename kernels/{sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c => sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr_real.c} (98%)
 rename kernels/{sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c => sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr.c} (92%)
 rename kernels/{sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c => sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr_complex.c} (99%)
 rename kernels/{sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c => sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr_real.c} (98%)
 rename kernels/{sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c => sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr.c} (94%)
 rename kernels/{sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c => sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr_complex.c} (100%)
 rename kernels/{sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c => sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr_real.c} (100%)
 rename kernels/{sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c => sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr.c} (92%)
 rename kernels/{sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c => sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr_complex.c} (99%)
 rename kernels/{sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c => sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr_real.c} (99%)
 rename kernels/{sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c => sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr.c} (93%)
 rename kernels/{sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c => sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr_complex.c} (76%)
 rename kernels/{sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c => sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr_real.c} (79%)
 rename kernels/{sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c => sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr.c} (94%)
 rename kernels/{sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c => sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr_complex.c} (74%)
 rename kernels/{sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c => sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr_real.c} (72%)
 rename kernels/{sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c => sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr.c} (84%)
 rename kernels/{sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c => sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr_complex.c} (99%)
 rename kernels/{sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c => sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr_real.c} (98%)
 rename kernels/{sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c => sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr.c} (90%)
 rename kernels/{sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c => sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr_complex.c} (100%)
 rename kernels/{sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c => sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr_real.c} (100%)
 rename kernels/{sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c => sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr.c} (89%)
 rename kernels/{sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c => sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr_complex.c} (99%)
 rename kernels/{sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c => sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr_real.c} (99%)
 create mode 100644 kernels/sifive_rvv/bli_kernels_sifive_rvv.h
 rename kernels/{sifive_x280 => sifive_rvv}/riscv_cmul_macros_intr.h (100%)
 rename kernels/{sifive_x280 => sifive_rvv}/riscv_overloaded_intrinsics.h (99%)
 delete mode 100644 kernels/sifive_x280/bli_kernels_sifive_x280.h
 delete mode 100644 kernels/sifive_x280/riscv_cmul_macros_asm.h

diff --git a/config/sifive_rvv/bli_cntx_init_sifive_rvv.c b/config/sifive_rvv/bli_cntx_init_sifive_rvv.c
new file mode 100644
index 000000000..54f17303f
--- /dev/null
+++ b/config/sifive_rvv/bli_cntx_init_sifive_rvv.c
@@ -0,0 +1,225 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <riscv_vector.h>
+// Fill cntx for the sifive_rvv configuration: kernels, storage preferences, and blocksizes.
+void bli_cntx_init_sifive_rvv( cntx_t* cntx )
+{
+	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
+
+	// Set default kernel blocksizes and functions; the calls below then update cntx.
+	bli_cntx_init_sifive_rvv_ref( cntx );
+
+	// -------------------------------------------------------------------------
+	// Vector register length in bytes; the NR and NC blocksizes below are computed from it.
+        unsigned vlenb = __riscv_vlenb();
+
+	// Update the context with optimized native kernels.
+	bli_cntx_set_ukrs
+	(
+	  cntx,
+
+	  // Level 1
+	  BLIS_ADDV_KER,       BLIS_FLOAT,    bli_saddv_sifive_rvv_intr,
+	  BLIS_ADDV_KER,       BLIS_DOUBLE,   bli_daddv_sifive_rvv_intr,
+	  BLIS_ADDV_KER,       BLIS_SCOMPLEX, bli_caddv_sifive_rvv_intr,
+	  BLIS_ADDV_KER,       BLIS_DCOMPLEX, bli_zaddv_sifive_rvv_intr,
+
+	  BLIS_AMAXV_KER,      BLIS_FLOAT,    bli_samaxv_sifive_rvv_intr,
+	  BLIS_AMAXV_KER,      BLIS_DOUBLE,   bli_damaxv_sifive_rvv_intr,
+	  BLIS_AMAXV_KER,      BLIS_SCOMPLEX, bli_camaxv_sifive_rvv_intr,
+	  BLIS_AMAXV_KER,      BLIS_DCOMPLEX, bli_zamaxv_sifive_rvv_intr,
+
+	  BLIS_AXPBYV_KER,     BLIS_FLOAT,    bli_saxpbyv_sifive_rvv_intr,
+	  BLIS_AXPBYV_KER,     BLIS_DOUBLE,   bli_daxpbyv_sifive_rvv_intr,
+	  BLIS_AXPBYV_KER,     BLIS_SCOMPLEX, bli_caxpbyv_sifive_rvv_intr,
+	  BLIS_AXPBYV_KER,     BLIS_DCOMPLEX, bli_zaxpbyv_sifive_rvv_intr,
+
+	  BLIS_AXPYV_KER,      BLIS_FLOAT,    bli_saxpyv_sifive_rvv_intr,
+	  BLIS_AXPYV_KER,      BLIS_DOUBLE,   bli_daxpyv_sifive_rvv_intr,
+	  BLIS_AXPYV_KER,      BLIS_SCOMPLEX, bli_caxpyv_sifive_rvv_intr,
+	  BLIS_AXPYV_KER,      BLIS_DCOMPLEX, bli_zaxpyv_sifive_rvv_intr,
+
+	  BLIS_COPYV_KER,      BLIS_FLOAT,    bli_scopyv_sifive_rvv_intr,
+	  BLIS_COPYV_KER,      BLIS_DOUBLE,   bli_dcopyv_sifive_rvv_intr,
+	  BLIS_COPYV_KER,      BLIS_SCOMPLEX, bli_ccopyv_sifive_rvv_intr,
+	  BLIS_COPYV_KER,      BLIS_DCOMPLEX, bli_zcopyv_sifive_rvv_intr,
+
+	  BLIS_DOTV_KER,       BLIS_FLOAT,    bli_sdotv_sifive_rvv_intr,
+	  BLIS_DOTV_KER,       BLIS_DOUBLE,   bli_ddotv_sifive_rvv_intr,
+	  BLIS_DOTV_KER,       BLIS_SCOMPLEX, bli_cdotv_sifive_rvv_intr,
+	  BLIS_DOTV_KER,       BLIS_DCOMPLEX, bli_zdotv_sifive_rvv_intr,
+
+	  BLIS_DOTXV_KER,      BLIS_FLOAT,    bli_sdotxv_sifive_rvv_intr,
+	  BLIS_DOTXV_KER,      BLIS_DOUBLE,   bli_ddotxv_sifive_rvv_intr,
+	  BLIS_DOTXV_KER,      BLIS_SCOMPLEX, bli_cdotxv_sifive_rvv_intr,
+	  BLIS_DOTXV_KER,      BLIS_DCOMPLEX, bli_zdotxv_sifive_rvv_intr,
+
+	  BLIS_INVERTV_KER,    BLIS_FLOAT,    bli_sinvertv_sifive_rvv_intr,
+	  BLIS_INVERTV_KER,    BLIS_DOUBLE,   bli_dinvertv_sifive_rvv_intr,
+	  BLIS_INVERTV_KER,    BLIS_SCOMPLEX, bli_cinvertv_sifive_rvv_intr,
+	  BLIS_INVERTV_KER,    BLIS_DCOMPLEX, bli_zinvertv_sifive_rvv_intr,
+
+	  BLIS_INVSCALV_KER,   BLIS_FLOAT,    bli_sinvscalv_sifive_rvv_intr,
+	  BLIS_INVSCALV_KER,   BLIS_DOUBLE,   bli_dinvscalv_sifive_rvv_intr,
+	  BLIS_INVSCALV_KER,   BLIS_SCOMPLEX, bli_cinvscalv_sifive_rvv_intr,
+	  BLIS_INVSCALV_KER,   BLIS_DCOMPLEX, bli_zinvscalv_sifive_rvv_intr,
+
+	  BLIS_SCAL2V_KER,     BLIS_FLOAT,    bli_sscal2v_sifive_rvv_intr,
+	  BLIS_SCAL2V_KER,     BLIS_DOUBLE,   bli_dscal2v_sifive_rvv_intr,
+	  BLIS_SCAL2V_KER,     BLIS_SCOMPLEX, bli_cscal2v_sifive_rvv_intr,
+	  BLIS_SCAL2V_KER,     BLIS_DCOMPLEX, bli_zscal2v_sifive_rvv_intr,
+
+	  BLIS_SCALV_KER,      BLIS_FLOAT,    bli_sscalv_sifive_rvv_intr,
+	  BLIS_SCALV_KER,      BLIS_DOUBLE,   bli_dscalv_sifive_rvv_intr,
+	  BLIS_SCALV_KER,      BLIS_SCOMPLEX, bli_cscalv_sifive_rvv_intr,
+	  BLIS_SCALV_KER,      BLIS_DCOMPLEX, bli_zscalv_sifive_rvv_intr,
+
+	  BLIS_SETV_KER,       BLIS_FLOAT,    bli_ssetv_sifive_rvv_intr,
+	  BLIS_SETV_KER,       BLIS_DOUBLE,   bli_dsetv_sifive_rvv_intr,
+	  BLIS_SETV_KER,       BLIS_SCOMPLEX, bli_csetv_sifive_rvv_intr,
+	  BLIS_SETV_KER,       BLIS_DCOMPLEX, bli_zsetv_sifive_rvv_intr,
+
+	  BLIS_SUBV_KER,       BLIS_FLOAT,    bli_ssubv_sifive_rvv_intr,
+	  BLIS_SUBV_KER,       BLIS_DOUBLE,   bli_dsubv_sifive_rvv_intr,
+	  BLIS_SUBV_KER,       BLIS_SCOMPLEX, bli_csubv_sifive_rvv_intr,
+	  BLIS_SUBV_KER,       BLIS_DCOMPLEX, bli_zsubv_sifive_rvv_intr,
+
+	  BLIS_SWAPV_KER,      BLIS_FLOAT,    bli_sswapv_sifive_rvv_intr,
+	  BLIS_SWAPV_KER,      BLIS_DOUBLE,   bli_dswapv_sifive_rvv_intr,
+	  BLIS_SWAPV_KER,      BLIS_SCOMPLEX, bli_cswapv_sifive_rvv_intr,
+	  BLIS_SWAPV_KER,      BLIS_DCOMPLEX, bli_zswapv_sifive_rvv_intr,
+
+	  BLIS_XPBYV_KER,      BLIS_FLOAT,    bli_sxpbyv_sifive_rvv_intr,
+	  BLIS_XPBYV_KER,      BLIS_DOUBLE,   bli_dxpbyv_sifive_rvv_intr,
+	  BLIS_XPBYV_KER,      BLIS_SCOMPLEX, bli_cxpbyv_sifive_rvv_intr,
+	  BLIS_XPBYV_KER,      BLIS_DCOMPLEX, bli_zxpbyv_sifive_rvv_intr,
+
+	  // Level 1f
+	  BLIS_AXPY2V_KER,     BLIS_FLOAT,    bli_saxpy2v_sifive_rvv_intr,
+	  BLIS_AXPY2V_KER,     BLIS_DOUBLE,   bli_daxpy2v_sifive_rvv_intr,
+	  BLIS_AXPY2V_KER,     BLIS_SCOMPLEX, bli_caxpy2v_sifive_rvv_intr,
+	  BLIS_AXPY2V_KER,     BLIS_DCOMPLEX, bli_zaxpy2v_sifive_rvv_intr,
+
+	  BLIS_AXPYF_KER,      BLIS_FLOAT,    bli_saxpyf_sifive_rvv_intr,
+	  BLIS_AXPYF_KER,      BLIS_DOUBLE,   bli_daxpyf_sifive_rvv_intr,
+	  BLIS_AXPYF_KER,      BLIS_SCOMPLEX, bli_caxpyf_sifive_rvv_intr,
+	  BLIS_AXPYF_KER,      BLIS_DCOMPLEX, bli_zaxpyf_sifive_rvv_intr,
+
+	  BLIS_DOTXF_KER,      BLIS_FLOAT,    bli_sdotxf_sifive_rvv_intr,
+	  BLIS_DOTXF_KER,      BLIS_DOUBLE,   bli_ddotxf_sifive_rvv_intr,
+	  BLIS_DOTXF_KER,      BLIS_SCOMPLEX, bli_cdotxf_sifive_rvv_intr,
+	  BLIS_DOTXF_KER,      BLIS_DCOMPLEX, bli_zdotxf_sifive_rvv_intr,
+
+	  BLIS_DOTAXPYV_KER,   BLIS_FLOAT,    bli_sdotaxpyv_sifive_rvv_intr,
+	  BLIS_DOTAXPYV_KER,   BLIS_DOUBLE,   bli_ddotaxpyv_sifive_rvv_intr,
+	  BLIS_DOTAXPYV_KER,   BLIS_SCOMPLEX, bli_cdotaxpyv_sifive_rvv_intr,
+	  BLIS_DOTAXPYV_KER,   BLIS_DCOMPLEX, bli_zdotaxpyv_sifive_rvv_intr,
+
+	  BLIS_DOTXAXPYF_KER,  BLIS_FLOAT,    bli_sdotxaxpyf_sifive_rvv_intr,
+	  BLIS_DOTXAXPYF_KER,  BLIS_DOUBLE,   bli_ddotxaxpyf_sifive_rvv_intr,
+	  BLIS_DOTXAXPYF_KER,  BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_rvv_intr,
+	  BLIS_DOTXAXPYF_KER,  BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_rvv_intr,
+
+	  // Level 1m
+	  BLIS_PACKM_KER,      BLIS_FLOAT,    bli_spackm_sifive_rvv_intr,
+	  BLIS_PACKM_KER,      BLIS_DOUBLE,   bli_dpackm_sifive_rvv_intr,
+	  BLIS_PACKM_KER,      BLIS_SCOMPLEX, bli_cpackm_sifive_rvv_intr,
+	  BLIS_PACKM_KER,      BLIS_DCOMPLEX, bli_zpackm_sifive_rvv_intr,
+
+	  // Level 3
+	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_sifive_rvv_intr,
+	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_sifive_rvv_intr,
+	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_sifive_rvv_intr,
+	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_sifive_rvv_intr,
+
+	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_sifive_rvv_intr,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_sifive_rvv_intr,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_rvv_intr,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_rvv_intr,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_sifive_rvv_intr,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_sifive_rvv_intr,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_rvv_intr,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_rvv_intr,
+
+	  BLIS_VA_END
+	);
+
+	// Update the context with storage preferences (row preference set to TRUE for all four datatypes).
+	bli_cntx_set_ukr_prefs
+	(
+	  cntx,
+
+	  BLIS_GEMM_UKR_ROW_PREF,             BLIS_FLOAT,    TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,             BLIS_DOUBLE,   TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,             BLIS_SCOMPLEX, TRUE,
+	  BLIS_GEMM_UKR_ROW_PREF,             BLIS_DCOMPLEX, TRUE,
+
+	  BLIS_VA_END
+	);
+
+	// Initialize level-3 blocksize objects; MR/MC/KC are constants, NR/NC are computed from vlenb.
+	//                                           s      d      c      z
+	bli_blksz_init     ( &blkszs[ BLIS_MR ],     7,     7,     6,     6,
+	                                             8,     8,     8,     8 ); // presumably the packed MR (cf. BLIS_PACKMR_*) — TODO confirm
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4 * vlenb / 4, 4 * vlenb / 8, 2 * vlenb / 4, 2 * vlenb / 8 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],     7,     7,     6,     6 ); // NOTE(review): MC/NC repeat the MR/NR values — confirm intended
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4 * vlenb / 4, 4 * vlenb / 8, 2 * vlenb / 4, 2 * vlenb / 8 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],    64,    64,    64,    64 );
+	// BBM and BBN already default to 1; they are set explicitly here so the value is guaranteed.
+	bli_blksz_init_easy( &blkszs[ BLIS_BBM ],    1,     1,     1,     1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_BBN ],    1,     1,     1,     1 );
+
+	// Update the context with the current architecture's register and cache
+	// blocksizes (and multiples) for native execution.
+	bli_cntx_set_blkszs
+	(
+	  cntx,
+
+	  // level-3
+	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
+	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
+	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
+	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
+	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+
+	  // level-1m
+	  BLIS_BBM, &blkszs[ BLIS_BBM ], BLIS_BBM,
+	  BLIS_BBN, &blkszs[ BLIS_BBN ], BLIS_BBN,
+
+	  BLIS_VA_END
+	);
+}
+
diff --git a/config/sifive_rvv/bli_family_sifive_rvv.h b/config/sifive_rvv/bli_family_sifive_rvv.h
new file mode 100644
index 000000000..708c1960f
--- /dev/null
+++ b/config/sifive_rvv/bli_family_sifive_rvv.h
@@ -0,0 +1,34 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
diff --git a/config/sifive_rvv/bli_kernel_defs_sifive_rvv.h b/config/sifive_rvv/bli_kernel_defs_sifive_rvv.h
new file mode 100644
index 000000000..33543db50
--- /dev/null
+++ b/config/sifive_rvv/bli_kernel_defs_sifive_rvv.h
@@ -0,0 +1,55 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+#define BLIS_MR_s   7
+#define BLIS_MR_d   7
+#define BLIS_MR_c   6
+#define BLIS_MR_z   6
+// PACKMR is 8 for every datatype, i.e. larger than the MR values above.
+#define BLIS_PACKMR_s   8
+#define BLIS_PACKMR_d   8
+#define BLIS_PACKMR_c   8
+#define BLIS_PACKMR_z   8
+// NR is -1 here (presumably a placeholder); bli_cntx_init_sifive_rvv() computes NR from vlenb.
+#define BLIS_NR_s   -1
+#define BLIS_NR_d   -1
+#define BLIS_NR_c   -1
+#define BLIS_NR_z   -1
+//#endif
+
diff --git a/config/sifive_rvv/make_defs.mk b/config/sifive_rvv/make_defs.mk
new file mode 100644
index 000000000..a4b3675e1
--- /dev/null
+++ b/config/sifive_rvv/make_defs.mk
@@ -0,0 +1,80 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2024, SiFive, Inc.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+
+# Declare the name of the current configuration. (The line that would add
+# it to the CONFIGS_INCL list is left commented out below.)
+THIS_CONFIG    := sifive_rvv
+#CONFIGS_INCL   += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# The -march string below names no zvl* extension (sifive_x280 appends _zvl512b).
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CMISCFLAGS_SIFIVE := -mcmodel=medany -march=rv64gcv_zba_zbb -mabi=lp64d
+CMISCFLAGS_SIFIVE_OTHER :=
+CPPROCFLAGS    :=
+CMISCFLAGS     := $(CMISCFLAGS_SIFIVE) $(CMISCFLAGS_SIFIVE_OTHER) \
+                  -fdata-sections -ffunction-sections \
+                  -fdiagnostics-color=always -fno-rtti -fno-exceptions
+CPICFLAGS      := -fPIC
+CWARNFLAGS     := -Wall -Wextra -Wno-unused-function -Wno-unused-parameter \
+                  -Wno-sign-compare -Wno-unused-variable
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O3
+endif
+
+# Flags specific to optimized kernels.
+CKOPTFLAGS     := $(COPTFLAGS)
+CKVECFLAGS     :=
+
+# Flags specific to reference kernels.
+CROPTFLAGS     := $(CKOPTFLAGS)
+CRVECFLAGS     := $(CKVECFLAGS)
+
+# Store all of the variables defined above into new variables whose
+# names contain the configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
diff --git a/config/sifive_x280/bli_cntx_init_sifive_x280.c b/config/sifive_x280/bli_cntx_init_sifive_x280.c
index 668891cf3..142ca1927 100644
--- a/config/sifive_x280/bli_cntx_init_sifive_x280.c
+++ b/config/sifive_x280/bli_cntx_init_sifive_x280.c
@@ -49,127 +49,127 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx )
 	  cntx,
 
 	  // Level 1
-	  BLIS_ADDV_KER,       BLIS_FLOAT,    bli_saddv_sifive_x280_intr,
-	  BLIS_ADDV_KER,       BLIS_DOUBLE,   bli_daddv_sifive_x280_intr,
-	  BLIS_ADDV_KER,       BLIS_SCOMPLEX, bli_caddv_sifive_x280_intr,
-	  BLIS_ADDV_KER,       BLIS_DCOMPLEX, bli_zaddv_sifive_x280_intr,
-
-	  BLIS_AMAXV_KER,      BLIS_FLOAT,    bli_samaxv_sifive_x280_intr,
-	  BLIS_AMAXV_KER,      BLIS_DOUBLE,   bli_damaxv_sifive_x280_intr,
-	  BLIS_AMAXV_KER,      BLIS_SCOMPLEX, bli_camaxv_sifive_x280_intr,
-	  BLIS_AMAXV_KER,      BLIS_DCOMPLEX, bli_zamaxv_sifive_x280_intr,
-
-	  BLIS_AXPBYV_KER,     BLIS_FLOAT,    bli_saxpbyv_sifive_x280_intr,
-	  BLIS_AXPBYV_KER,     BLIS_DOUBLE,   bli_daxpbyv_sifive_x280_intr,
-	  BLIS_AXPBYV_KER,     BLIS_SCOMPLEX, bli_caxpbyv_sifive_x280_intr,
-	  BLIS_AXPBYV_KER,     BLIS_DCOMPLEX, bli_zaxpbyv_sifive_x280_intr,
-
-	  BLIS_AXPYV_KER,      BLIS_FLOAT,    bli_saxpyv_sifive_x280_intr,
-	  BLIS_AXPYV_KER,      BLIS_DOUBLE,   bli_daxpyv_sifive_x280_intr,
-	  BLIS_AXPYV_KER,      BLIS_SCOMPLEX, bli_caxpyv_sifive_x280_intr,
-	  BLIS_AXPYV_KER,      BLIS_DCOMPLEX, bli_zaxpyv_sifive_x280_intr,
-
-	  BLIS_COPYV_KER,      BLIS_FLOAT,    bli_scopyv_sifive_x280_intr,
-	  BLIS_COPYV_KER,      BLIS_DOUBLE,   bli_dcopyv_sifive_x280_intr,
-	  BLIS_COPYV_KER,      BLIS_SCOMPLEX, bli_ccopyv_sifive_x280_intr,
-	  BLIS_COPYV_KER,      BLIS_DCOMPLEX, bli_zcopyv_sifive_x280_intr,
-
-	  BLIS_DOTV_KER,       BLIS_FLOAT,    bli_sdotv_sifive_x280_intr,
-	  BLIS_DOTV_KER,       BLIS_DOUBLE,   bli_ddotv_sifive_x280_intr,
-	  BLIS_DOTV_KER,       BLIS_SCOMPLEX, bli_cdotv_sifive_x280_intr,
-	  BLIS_DOTV_KER,       BLIS_DCOMPLEX, bli_zdotv_sifive_x280_intr,
-
-	  BLIS_DOTXV_KER,      BLIS_FLOAT,    bli_sdotxv_sifive_x280_intr,
-	  BLIS_DOTXV_KER,      BLIS_DOUBLE,   bli_ddotxv_sifive_x280_intr,
-	  BLIS_DOTXV_KER,      BLIS_SCOMPLEX, bli_cdotxv_sifive_x280_intr,
-	  BLIS_DOTXV_KER,      BLIS_DCOMPLEX, bli_zdotxv_sifive_x280_intr,
-
-	  BLIS_INVERTV_KER,    BLIS_FLOAT,    bli_sinvertv_sifive_x280_intr,
-	  BLIS_INVERTV_KER,    BLIS_DOUBLE,   bli_dinvertv_sifive_x280_intr,
-	  BLIS_INVERTV_KER,    BLIS_SCOMPLEX, bli_cinvertv_sifive_x280_intr,
-	  BLIS_INVERTV_KER,    BLIS_DCOMPLEX, bli_zinvertv_sifive_x280_intr,
-
-	  BLIS_INVSCALV_KER,   BLIS_FLOAT,    bli_sinvscalv_sifive_x280_intr,
-	  BLIS_INVSCALV_KER,   BLIS_DOUBLE,   bli_dinvscalv_sifive_x280_intr,
-	  BLIS_INVSCALV_KER,   BLIS_SCOMPLEX, bli_cinvscalv_sifive_x280_intr,
-	  BLIS_INVSCALV_KER,   BLIS_DCOMPLEX, bli_zinvscalv_sifive_x280_intr,
-
-	  BLIS_SCAL2V_KER,     BLIS_FLOAT,    bli_sscal2v_sifive_x280_intr,
-	  BLIS_SCAL2V_KER,     BLIS_DOUBLE,   bli_dscal2v_sifive_x280_intr,
-	  BLIS_SCAL2V_KER,     BLIS_SCOMPLEX, bli_cscal2v_sifive_x280_intr,
-	  BLIS_SCAL2V_KER,     BLIS_DCOMPLEX, bli_zscal2v_sifive_x280_intr,
-
-	  BLIS_SCALV_KER,      BLIS_FLOAT,    bli_sscalv_sifive_x280_intr,
-	  BLIS_SCALV_KER,      BLIS_DOUBLE,   bli_dscalv_sifive_x280_intr,
-	  BLIS_SCALV_KER,      BLIS_SCOMPLEX, bli_cscalv_sifive_x280_intr,
-	  BLIS_SCALV_KER,      BLIS_DCOMPLEX, bli_zscalv_sifive_x280_intr,
-
-	  BLIS_SETV_KER,       BLIS_FLOAT,    bli_ssetv_sifive_x280_intr,
-	  BLIS_SETV_KER,       BLIS_DOUBLE,   bli_dsetv_sifive_x280_intr,
-	  BLIS_SETV_KER,       BLIS_SCOMPLEX, bli_csetv_sifive_x280_intr,
-	  BLIS_SETV_KER,       BLIS_DCOMPLEX, bli_zsetv_sifive_x280_intr,
-
-	  BLIS_SUBV_KER,       BLIS_FLOAT,    bli_ssubv_sifive_x280_intr,
-	  BLIS_SUBV_KER,       BLIS_DOUBLE,   bli_dsubv_sifive_x280_intr,
-	  BLIS_SUBV_KER,       BLIS_SCOMPLEX, bli_csubv_sifive_x280_intr,
-	  BLIS_SUBV_KER,       BLIS_DCOMPLEX, bli_zsubv_sifive_x280_intr,
-
-	  BLIS_SWAPV_KER,      BLIS_FLOAT,    bli_sswapv_sifive_x280_intr,
-	  BLIS_SWAPV_KER,      BLIS_DOUBLE,   bli_dswapv_sifive_x280_intr,
-	  BLIS_SWAPV_KER,      BLIS_SCOMPLEX, bli_cswapv_sifive_x280_intr,
-	  BLIS_SWAPV_KER,      BLIS_DCOMPLEX, bli_zswapv_sifive_x280_intr,
-
-	  BLIS_XPBYV_KER,      BLIS_FLOAT,    bli_sxpbyv_sifive_x280_intr,
-	  BLIS_XPBYV_KER,      BLIS_DOUBLE,   bli_dxpbyv_sifive_x280_intr,
-	  BLIS_XPBYV_KER,      BLIS_SCOMPLEX, bli_cxpbyv_sifive_x280_intr,
-	  BLIS_XPBYV_KER,      BLIS_DCOMPLEX, bli_zxpbyv_sifive_x280_intr,
+	  BLIS_ADDV_KER,       BLIS_FLOAT,    bli_saddv_sifive_rvv_intr,
+	  BLIS_ADDV_KER,       BLIS_DOUBLE,   bli_daddv_sifive_rvv_intr,
+	  BLIS_ADDV_KER,       BLIS_SCOMPLEX, bli_caddv_sifive_rvv_intr,
+	  BLIS_ADDV_KER,       BLIS_DCOMPLEX, bli_zaddv_sifive_rvv_intr,
+
+	  BLIS_AMAXV_KER,      BLIS_FLOAT,    bli_samaxv_sifive_rvv_intr,
+	  BLIS_AMAXV_KER,      BLIS_DOUBLE,   bli_damaxv_sifive_rvv_intr,
+	  BLIS_AMAXV_KER,      BLIS_SCOMPLEX, bli_camaxv_sifive_rvv_intr,
+	  BLIS_AMAXV_KER,      BLIS_DCOMPLEX, bli_zamaxv_sifive_rvv_intr,
+
+	  BLIS_AXPBYV_KER,     BLIS_FLOAT,    bli_saxpbyv_sifive_rvv_intr,
+	  BLIS_AXPBYV_KER,     BLIS_DOUBLE,   bli_daxpbyv_sifive_rvv_intr,
+	  BLIS_AXPBYV_KER,     BLIS_SCOMPLEX, bli_caxpbyv_sifive_rvv_intr,
+	  BLIS_AXPBYV_KER,     BLIS_DCOMPLEX, bli_zaxpbyv_sifive_rvv_intr,
+
+	  BLIS_AXPYV_KER,      BLIS_FLOAT,    bli_saxpyv_sifive_rvv_intr,
+	  BLIS_AXPYV_KER,      BLIS_DOUBLE,   bli_daxpyv_sifive_rvv_intr,
+	  BLIS_AXPYV_KER,      BLIS_SCOMPLEX, bli_caxpyv_sifive_rvv_intr,
+	  BLIS_AXPYV_KER,      BLIS_DCOMPLEX, bli_zaxpyv_sifive_rvv_intr,
+
+	  BLIS_COPYV_KER,      BLIS_FLOAT,    bli_scopyv_sifive_rvv_intr,
+	  BLIS_COPYV_KER,      BLIS_DOUBLE,   bli_dcopyv_sifive_rvv_intr,
+	  BLIS_COPYV_KER,      BLIS_SCOMPLEX, bli_ccopyv_sifive_rvv_intr,
+	  BLIS_COPYV_KER,      BLIS_DCOMPLEX, bli_zcopyv_sifive_rvv_intr,
+
+	  BLIS_DOTV_KER,       BLIS_FLOAT,    bli_sdotv_sifive_rvv_intr,
+	  BLIS_DOTV_KER,       BLIS_DOUBLE,   bli_ddotv_sifive_rvv_intr,
+	  BLIS_DOTV_KER,       BLIS_SCOMPLEX, bli_cdotv_sifive_rvv_intr,
+	  BLIS_DOTV_KER,       BLIS_DCOMPLEX, bli_zdotv_sifive_rvv_intr,
+
+	  BLIS_DOTXV_KER,      BLIS_FLOAT,    bli_sdotxv_sifive_rvv_intr,
+	  BLIS_DOTXV_KER,      BLIS_DOUBLE,   bli_ddotxv_sifive_rvv_intr,
+	  BLIS_DOTXV_KER,      BLIS_SCOMPLEX, bli_cdotxv_sifive_rvv_intr,
+	  BLIS_DOTXV_KER,      BLIS_DCOMPLEX, bli_zdotxv_sifive_rvv_intr,
+
+	  BLIS_INVERTV_KER,    BLIS_FLOAT,    bli_sinvertv_sifive_rvv_intr,
+	  BLIS_INVERTV_KER,    BLIS_DOUBLE,   bli_dinvertv_sifive_rvv_intr,
+	  BLIS_INVERTV_KER,    BLIS_SCOMPLEX, bli_cinvertv_sifive_rvv_intr,
+	  BLIS_INVERTV_KER,    BLIS_DCOMPLEX, bli_zinvertv_sifive_rvv_intr,
+
+	  BLIS_INVSCALV_KER,   BLIS_FLOAT,    bli_sinvscalv_sifive_rvv_intr,
+	  BLIS_INVSCALV_KER,   BLIS_DOUBLE,   bli_dinvscalv_sifive_rvv_intr,
+	  BLIS_INVSCALV_KER,   BLIS_SCOMPLEX, bli_cinvscalv_sifive_rvv_intr,
+	  BLIS_INVSCALV_KER,   BLIS_DCOMPLEX, bli_zinvscalv_sifive_rvv_intr,
+
+	  BLIS_SCAL2V_KER,     BLIS_FLOAT,    bli_sscal2v_sifive_rvv_intr,
+	  BLIS_SCAL2V_KER,     BLIS_DOUBLE,   bli_dscal2v_sifive_rvv_intr,
+	  BLIS_SCAL2V_KER,     BLIS_SCOMPLEX, bli_cscal2v_sifive_rvv_intr,
+	  BLIS_SCAL2V_KER,     BLIS_DCOMPLEX, bli_zscal2v_sifive_rvv_intr,
+
+	  BLIS_SCALV_KER,      BLIS_FLOAT,    bli_sscalv_sifive_rvv_intr,
+	  BLIS_SCALV_KER,      BLIS_DOUBLE,   bli_dscalv_sifive_rvv_intr,
+	  BLIS_SCALV_KER,      BLIS_SCOMPLEX, bli_cscalv_sifive_rvv_intr,
+	  BLIS_SCALV_KER,      BLIS_DCOMPLEX, bli_zscalv_sifive_rvv_intr,
+
+	  BLIS_SETV_KER,       BLIS_FLOAT,    bli_ssetv_sifive_rvv_intr,
+	  BLIS_SETV_KER,       BLIS_DOUBLE,   bli_dsetv_sifive_rvv_intr,
+	  BLIS_SETV_KER,       BLIS_SCOMPLEX, bli_csetv_sifive_rvv_intr,
+	  BLIS_SETV_KER,       BLIS_DCOMPLEX, bli_zsetv_sifive_rvv_intr,
+
+	  BLIS_SUBV_KER,       BLIS_FLOAT,    bli_ssubv_sifive_rvv_intr,
+	  BLIS_SUBV_KER,       BLIS_DOUBLE,   bli_dsubv_sifive_rvv_intr,
+	  BLIS_SUBV_KER,       BLIS_SCOMPLEX, bli_csubv_sifive_rvv_intr,
+	  BLIS_SUBV_KER,       BLIS_DCOMPLEX, bli_zsubv_sifive_rvv_intr,
+
+	  BLIS_SWAPV_KER,      BLIS_FLOAT,    bli_sswapv_sifive_rvv_intr,
+	  BLIS_SWAPV_KER,      BLIS_DOUBLE,   bli_dswapv_sifive_rvv_intr,
+	  BLIS_SWAPV_KER,      BLIS_SCOMPLEX, bli_cswapv_sifive_rvv_intr,
+	  BLIS_SWAPV_KER,      BLIS_DCOMPLEX, bli_zswapv_sifive_rvv_intr,
+
+	  BLIS_XPBYV_KER,      BLIS_FLOAT,    bli_sxpbyv_sifive_rvv_intr,
+	  BLIS_XPBYV_KER,      BLIS_DOUBLE,   bli_dxpbyv_sifive_rvv_intr,
+	  BLIS_XPBYV_KER,      BLIS_SCOMPLEX, bli_cxpbyv_sifive_rvv_intr,
+	  BLIS_XPBYV_KER,      BLIS_DCOMPLEX, bli_zxpbyv_sifive_rvv_intr,
 
 	  // Level 1f
-	  BLIS_AXPY2V_KER,     BLIS_FLOAT,    bli_saxpy2v_sifive_x280_intr,
-	  BLIS_AXPY2V_KER,     BLIS_DOUBLE,   bli_daxpy2v_sifive_x280_intr,
-	  BLIS_AXPY2V_KER,     BLIS_SCOMPLEX, bli_caxpy2v_sifive_x280_intr,
-	  BLIS_AXPY2V_KER,     BLIS_DCOMPLEX, bli_zaxpy2v_sifive_x280_intr,
-
-	  BLIS_AXPYF_KER,      BLIS_FLOAT,    bli_saxpyf_sifive_x280_intr,
-	  BLIS_AXPYF_KER,      BLIS_DOUBLE,   bli_daxpyf_sifive_x280_intr,
-	  BLIS_AXPYF_KER,      BLIS_SCOMPLEX, bli_caxpyf_sifive_x280_intr,
-	  BLIS_AXPYF_KER,      BLIS_DCOMPLEX, bli_zaxpyf_sifive_x280_intr,
-
-	  BLIS_DOTXF_KER,      BLIS_FLOAT,    bli_sdotxf_sifive_x280_intr,
-	  BLIS_DOTXF_KER,      BLIS_DOUBLE,   bli_ddotxf_sifive_x280_intr,
-	  BLIS_DOTXF_KER,      BLIS_SCOMPLEX, bli_cdotxf_sifive_x280_intr,
-	  BLIS_DOTXF_KER,      BLIS_DCOMPLEX, bli_zdotxf_sifive_x280_intr,
-
-	  BLIS_DOTAXPYV_KER,   BLIS_FLOAT,    bli_sdotaxpyv_sifive_x280_intr,
-	  BLIS_DOTAXPYV_KER,   BLIS_DOUBLE,   bli_ddotaxpyv_sifive_x280_intr,
-	  BLIS_DOTAXPYV_KER,   BLIS_SCOMPLEX, bli_cdotaxpyv_sifive_x280_intr,
-	  BLIS_DOTAXPYV_KER,   BLIS_DCOMPLEX, bli_zdotaxpyv_sifive_x280_intr,
-
-	  BLIS_DOTXAXPYF_KER,  BLIS_FLOAT,    bli_sdotxaxpyf_sifive_x280_intr,
-	  BLIS_DOTXAXPYF_KER,  BLIS_DOUBLE,   bli_ddotxaxpyf_sifive_x280_intr,
-	  BLIS_DOTXAXPYF_KER,  BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_x280_intr,
-	  BLIS_DOTXAXPYF_KER,  BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_x280_intr,
+	  BLIS_AXPY2V_KER,     BLIS_FLOAT,    bli_saxpy2v_sifive_rvv_intr,
+	  BLIS_AXPY2V_KER,     BLIS_DOUBLE,   bli_daxpy2v_sifive_rvv_intr,
+	  BLIS_AXPY2V_KER,     BLIS_SCOMPLEX, bli_caxpy2v_sifive_rvv_intr,
+	  BLIS_AXPY2V_KER,     BLIS_DCOMPLEX, bli_zaxpy2v_sifive_rvv_intr,
+
+	  BLIS_AXPYF_KER,      BLIS_FLOAT,    bli_saxpyf_sifive_rvv_intr,
+	  BLIS_AXPYF_KER,      BLIS_DOUBLE,   bli_daxpyf_sifive_rvv_intr,
+	  BLIS_AXPYF_KER,      BLIS_SCOMPLEX, bli_caxpyf_sifive_rvv_intr,
+	  BLIS_AXPYF_KER,      BLIS_DCOMPLEX, bli_zaxpyf_sifive_rvv_intr,
+
+	  BLIS_DOTXF_KER,      BLIS_FLOAT,    bli_sdotxf_sifive_rvv_intr,
+	  BLIS_DOTXF_KER,      BLIS_DOUBLE,   bli_ddotxf_sifive_rvv_intr,
+	  BLIS_DOTXF_KER,      BLIS_SCOMPLEX, bli_cdotxf_sifive_rvv_intr,
+	  BLIS_DOTXF_KER,      BLIS_DCOMPLEX, bli_zdotxf_sifive_rvv_intr,
+
+	  BLIS_DOTAXPYV_KER,   BLIS_FLOAT,    bli_sdotaxpyv_sifive_rvv_intr,
+	  BLIS_DOTAXPYV_KER,   BLIS_DOUBLE,   bli_ddotaxpyv_sifive_rvv_intr,
+	  BLIS_DOTAXPYV_KER,   BLIS_SCOMPLEX, bli_cdotaxpyv_sifive_rvv_intr,
+	  BLIS_DOTAXPYV_KER,   BLIS_DCOMPLEX, bli_zdotaxpyv_sifive_rvv_intr,
+
+	  BLIS_DOTXAXPYF_KER,  BLIS_FLOAT,    bli_sdotxaxpyf_sifive_rvv_intr,
+	  BLIS_DOTXAXPYF_KER,  BLIS_DOUBLE,   bli_ddotxaxpyf_sifive_rvv_intr,
+	  BLIS_DOTXAXPYF_KER,  BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_rvv_intr,
+	  BLIS_DOTXAXPYF_KER,  BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_rvv_intr,
 
 	  // Level 1m
-	  BLIS_PACKM_KER,      BLIS_FLOAT,    bli_spackm_sifive_x280_intr,
-	  BLIS_PACKM_KER,      BLIS_DOUBLE,   bli_dpackm_sifive_x280_intr,
-	  BLIS_PACKM_KER,      BLIS_SCOMPLEX, bli_cpackm_sifive_x280_intr,
-	  BLIS_PACKM_KER,      BLIS_DCOMPLEX, bli_zpackm_sifive_x280_intr,
+	  BLIS_PACKM_KER,      BLIS_FLOAT,    bli_spackm_sifive_rvv_intr,
+	  BLIS_PACKM_KER,      BLIS_DOUBLE,   bli_dpackm_sifive_rvv_intr,
+	  BLIS_PACKM_KER,      BLIS_SCOMPLEX, bli_cpackm_sifive_rvv_intr,
+	  BLIS_PACKM_KER,      BLIS_DCOMPLEX, bli_zpackm_sifive_rvv_intr,
 
 	  // Level 3
-	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_sifive_x280_intr,
-	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_sifive_x280_intr,
-	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_sifive_x280_intr,
-	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_sifive_x280_intr,
-
-	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_sifive_x280_intr,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_sifive_x280_intr,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_x280_intr,
-	  BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_x280_intr,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_sifive_x280_intr,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_sifive_x280_intr,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_x280_intr,
-	  BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_x280_intr,
+	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_sifive_rvv_intr,
+	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_sifive_rvv_intr,
+	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_sifive_rvv_intr,
+	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_sifive_rvv_intr,
+
+	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_sifive_rvv_intr,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_sifive_rvv_intr,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_rvv_intr,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_rvv_intr,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_sifive_rvv_intr,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_sifive_rvv_intr,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_rvv_intr,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_rvv_intr,
 
 	  BLIS_VA_END
 	);
diff --git a/config/sifive_x280/make_defs.mk b/config/sifive_x280/make_defs.mk
index 31b31e387..5f19e4e44 100644
--- a/config/sifive_x280/make_defs.mk
+++ b/config/sifive_x280/make_defs.mk
@@ -47,8 +47,10 @@ THIS_CONFIG    := sifive_x280
 # general-purpose/configuration-agnostic flags in common.mk. You
 # may specify additional flags here as needed.
 CMISCFLAGS_SIFIVE := -mcmodel=medany -march=rv64gcv_zba_zbb_zvl512b -mabi=lp64d
+CMISCFLAGS_SIFIVE_OTHER :=
 CPPROCFLAGS    :=
-CMISCFLAGS     := $(CMISCFLAGS_SIFIVE) -fdata-sections -ffunction-sections \
+CMISCFLAGS     := $(CMISCFLAGS_SIFIVE) $(CMISCFLAGS_SIFIVE_OTHER) \
+                  -fdata-sections -ffunction-sections \
                   -fdiagnostics-color=always -fno-rtti -fno-exceptions
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall -Wextra -Wno-unused-function -Wno-unused-parameter \
diff --git a/config_registry b/config_registry
index 8c1f6f254..815439348 100644
--- a/config_registry
+++ b/config_registry
@@ -62,7 +62,8 @@ rv32iv:      rv32iv/rviv
 rv64iv:      rv64iv/rviv
 
 # SiFive architectures.
-sifive_x280: sifive_x280
+sifive_rvv: sifive_rvv
+sifive_x280: sifive_x280/sifive_rvv
 
 # Generic architectures.
 generic:     generic
diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c
index 135d41063..53d9bdefd 100644
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -287,6 +287,9 @@ arch_t bli_arch_query_id_impl( void )
 		#endif
 
 		// SiFive microarchitectures.
+		#ifdef BLIS_FAMILY_SIFIVE_RVV
+		id = BLIS_ARCH_SIFIVE_RVV;
+		#endif
 		#ifdef BLIS_FAMILY_SIFIVE_X280
 		id = BLIS_ARCH_SIFIVE_X280;
 		#endif
@@ -356,6 +359,7 @@ static const char* config_name[ BLIS_NUM_ARCHS ] =
     "rv32iv",
     "rv64iv",
 
+    "sifive_rvv",
     "sifive_x280",
 
     "generic"
diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h
index a35bb7746..49a894302 100644
--- a/frame/include/bli_arch_config.h
+++ b/frame/include/bli_arch_config.h
@@ -180,6 +180,9 @@ INSERT_GENTCONF
 
 // -- SiFive families --
 
+#ifdef BLIS_FAMILY_SIFIVE_RVV
+#include "bli_family_sifive_rvv.h"
+#endif
 #ifdef BLIS_FAMILY_SIFIVE_X280
 #include "bli_family_sifive_x280.h"
 #endif
@@ -277,6 +280,9 @@ INSERT_GENTCONF
 
 // -- SiFive RISC-V architectures --
 
+#ifdef BLIS_KERNELS_SIFIVE_RVV
+#include "bli_kernels_sifive_rvv.h"
+#endif
 #ifdef BLIS_KERNELS_SIFIVE_X280
 #include "bli_kernels_sifive_x280.h"
 #endif
diff --git a/frame/include/bli_gentconf_macro_defs.h b/frame/include/bli_gentconf_macro_defs.h
index 70414fb47..f6f3af20e 100644
--- a/frame/include/bli_gentconf_macro_defs.h
+++ b/frame/include/bli_gentconf_macro_defs.h
@@ -222,6 +222,11 @@
 
 // -- SiFive architectures ----------------------------------------------------
 
+#ifdef BLIS_CONFIG_SIFIVE_RVV
+#define INSERT_GENTCONF_SIFIVE_RVV GENTCONF( SIFIVE_RVV, sifive_rvv )
+#else
+#define INSERT_GENTCONF_SIFIVE_RVV
+#endif
 #ifdef BLIS_CONFIG_SIFIVE_X280
 #define INSERT_GENTCONF_SIFIVE_X280 GENTCONF( SIFIVE_X280, sifive_x280 )
 #else
@@ -280,6 +285,7 @@ INSERT_GENTCONF_RV64I \
 INSERT_GENTCONF_RV32IV \
 INSERT_GENTCONF_RV64IV \
 \
+INSERT_GENTCONF_SIFIVE_RVV \
 INSERT_GENTCONF_SIFIVE_X280 \
 \
 INSERT_GENTCONF_GENERIC
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index e1d82c563..890d216ea 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -1007,6 +1007,7 @@ typedef enum
 	BLIS_ARCH_RV64IV,
 
 	// SiFive
+	BLIS_ARCH_SIFIVE_RVV,
 	BLIS_ARCH_SIFIVE_X280,
 
 	// Generic architecture/configuration
diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr.c
similarity index 92%
rename from kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c
rename to kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr.c
index 2b7ad6fe7..c917390f9 100644
--- a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -40,7 +40,7 @@
 #include "../../riscv_overloaded_intrinsics.h"
 
 
-#define ADDV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##addv_sifive_x280_intr(\
+#define ADDV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##addv_sifive_rvv_intr(\
           conj_t           conjx,         \
           dim_t            n,             \
     const T*      restrict x_, inc_t incx, \
@@ -57,7 +57,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_addv_sifive_x280_intr_real.c"
+#include "./bli_addv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -72,7 +72,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_addv_sifive_x280_intr_real.c"
+#include "./bli_addv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -88,7 +88,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_addv_sifive_x280_intr_complex.c"
+#include "./bli_addv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -105,7 +105,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_addv_sifive_x280_intr_complex.c"
+#include "./bli_addv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr_complex.c
similarity index 98%
rename from kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr_complex.c
index d5343befe..ae4ff39b9 100644
--- a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c
+++ b/kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr_complex.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr_real.c
similarity index 98%
rename from kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr_real.c
index d4e7d4a45..bc928a5e6 100644
--- a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c
+++ b/kernels/sifive_rvv/1/bli_addv_sifive_rvv_intr/bli_addv_sifive_rvv_intr_real.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr.c
similarity index 86%
rename from kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c
rename to kernels/sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr.c
index 4f7d54630..6014b860b 100644
--- a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr.c
@@ -40,7 +40,7 @@
 #include <stdbool.h>
 #include <stddef.h>
 
-#define AMAXV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##amaxv_sifive_x280_intr(\
+#define AMAXV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##amaxv_sifive_rvv_intr(\
           dim_t            n,              \
     const T*      restrict x_, inc_t incx, \
           dim_t*           index,          \
@@ -52,20 +52,20 @@
 // BLIS defines integers to be 32 or 64 bits according to BLIS_INT_TYPE_SIZE.
 // If BLIS_INT_TYPE_SIZE is any other value, integers are defined to be longs.
 #if BLIS_INT_TYPE_SIZE == 32 || BLIS_INT_TYPE_SIZE == 64
-#define AMAXV_SIFIVE_X280_INT_SIZE BLIS_INT_TYPE_SIZE
+#define AMAXV_SIFIVE_RVV_INT_SIZE BLIS_INT_TYPE_SIZE
 #elif LONG_MAX == INT32_MAX
-#define AMAXV_SIFIVE_X280_INT_SIZE 32
+#define AMAXV_SIFIVE_RVV_INT_SIZE 32
 #elif LONG_MAX == INT64_MAX
-#define AMAXV_SIFIVE_X280_INT_SIZE 64
+#define AMAXV_SIFIVE_RVV_INT_SIZE 64
 #else
-#error "Integers must be 32- or 64-bits for bli_?amaxv_sifive_x280_intr."
+#error "Integers must be 32- or 64-bits for bli_?amaxv_sifive_rvv_intr."
 #endif
 
 // Single precision real
 #define DATATYPE float
 #define PRECISION_CHAR s
 #define PREC_X 32
-#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE
+#define PREC_I AMAXV_SIFIVE_RVV_INT_SIZE
 #if PREC_I == 32
 #define LMUL_X m4
 #define LMUL_I m4
@@ -77,7 +77,7 @@
 #endif
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_amaxv_sifive_x280_intr_real.c"
+#include "./bli_amaxv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -92,7 +92,7 @@
 #define DATATYPE double
 #define PRECISION_CHAR d
 #define PREC_X 64
-#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE
+#define PREC_I AMAXV_SIFIVE_RVV_INT_SIZE
 #if PREC_I == 32
 #define LMUL_X m8
 #define LMUL_I m4
@@ -104,7 +104,7 @@
 #endif
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_amaxv_sifive_x280_intr_real.c"
+#include "./bli_amaxv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -120,7 +120,7 @@
 #define BASE_DT float
 #define PRECISION_CHAR c
 #define PREC_X 32
-#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE
+#define PREC_I AMAXV_SIFIVE_RVV_INT_SIZE
 #if PREC_I == 32
 #define LMUL_X m4
 #define LMUL_I m4
@@ -132,7 +132,7 @@
 #endif
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_amaxv_sifive_x280_intr_complex.c"
+#include "./bli_amaxv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -149,7 +149,7 @@
 #define BASE_DT double
 #define PRECISION_CHAR z
 #define PREC_X 64
-#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE
+#define PREC_I AMAXV_SIFIVE_RVV_INT_SIZE
 #if PREC_I == 32
 #define LMUL_X m8
 #define LMUL_I m4
@@ -161,7 +161,7 @@
 #endif
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_amaxv_sifive_x280_intr_complex.c"
+#include "./bli_amaxv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -173,7 +173,7 @@
 #undef RATIO
 #undef FLT_SIZE
 
-#undef AMAXV_SIFIVE_X280_INT_SIZE
+#undef AMAXV_SIFIVE_RVV_INT_SIZE
 
 #undef AMAXV
 #undef AMAXV_
diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr_complex.c
similarity index 100%
rename from kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr_complex.c
diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr_real.c
similarity index 100%
rename from kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1/bli_amaxv_sifive_rvv_intr/bli_amaxv_sifive_rvv_intr_real.c
diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr.c
similarity index 92%
rename from kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c
rename to kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr.c
index 389292f90..94e3272bc 100644
--- a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -40,7 +40,7 @@
 #include "../../riscv_overloaded_intrinsics.h"
 
 
-#define AXPBYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpbyv_sifive_x280_intr(\
+#define AXPBYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpbyv_sifive_rvv_intr(\
           conj_t           conjx,          \
           dim_t            n,              \
     const T*      restrict alpha_,         \
@@ -52,11 +52,11 @@
 
 #define AXPBYV(...)  AXPBYV_(__VA_ARGS__)
 
-#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_rvv_intr
 #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
-#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr
+#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_rvv_intr
 #define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR)
-#define SCAL2V_(PRECISION_CHAR) bli_##PRECISION_CHAR##scal2v_sifive_x280_intr
+#define SCAL2V_(PRECISION_CHAR) bli_##PRECISION_CHAR##scal2v_sifive_rvv_intr
 #define SCAL2V(PRECISION_CHAR) SCAL2V_(PRECISION_CHAR)
 
 // Single precision real
@@ -66,7 +66,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_axpbyv_sifive_x280_intr_real.c"
+#include "./bli_axpbyv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -81,7 +81,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_axpbyv_sifive_x280_intr_real.c"
+#include "./bli_axpbyv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -97,7 +97,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_axpbyv_sifive_x280_intr_complex.c"
+#include "./bli_axpbyv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -114,7 +114,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_axpbyv_sifive_x280_intr_complex.c"
+#include "./bli_axpbyv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr_complex.c
similarity index 99%
rename from kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr_complex.c
index 31fc584b9..af034824e 100644
--- a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c
+++ b/kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr_complex.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr_real.c
similarity index 98%
rename from kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr_real.c
index 33eafc5d1..b48218902 100644
--- a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c
+++ b/kernels/sifive_rvv/1/bli_axpbyv_sifive_rvv_intr/bli_axpbyv_sifive_rvv_intr_real.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr.c
similarity index 92%
rename from kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c
rename to kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr.c
index 3f9ebd3b0..07dc6a416 100644
--- a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -40,7 +40,7 @@
 #include "../../riscv_overloaded_intrinsics.h"
 
 
-#define AXPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpyv_sifive_x280_intr(\
+#define AXPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpyv_sifive_rvv_intr(\
           conj_t           conjx,         \
           dim_t            n,             \
     const T*      restrict alpha_,         \
@@ -58,7 +58,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_axpyv_sifive_x280_intr_real.c"
+#include "./bli_axpyv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -73,7 +73,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_axpyv_sifive_x280_intr_real.c"
+#include "./bli_axpyv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -89,7 +89,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_axpyv_sifive_x280_intr_complex.c"
+#include "./bli_axpyv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -106,7 +106,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_axpyv_sifive_x280_intr_complex.c"
+#include "./bli_axpyv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr_complex.c
similarity index 99%
rename from kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr_complex.c
index dc520d212..1b88f7d26 100644
--- a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c
+++ b/kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr_complex.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr_real.c
similarity index 98%
rename from kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr_real.c
index 0c2cda842..8ad0ac3fb 100644
--- a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c
+++ b/kernels/sifive_rvv/1/bli_axpyv_sifive_rvv_intr/bli_axpyv_sifive_rvv_intr_real.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr.c
similarity index 93%
rename from kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c
rename to kernels/sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr.c
index e030d85ff..ab9cf0f34 100644
--- a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr.c
@@ -38,7 +38,7 @@
 #include <riscv_vector.h>
 #include <stddef.h>
 
-#define COPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##copyv_sifive_x280_intr(\
+#define COPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##copyv_sifive_rvv_intr(\
           conj_t           conjx,          \
           dim_t            n,              \
     const T*      restrict x_, inc_t incx, \
@@ -55,7 +55,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_copyv_sifive_x280_intr_real.c"
+#include "./bli_copyv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -70,7 +70,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_copyv_sifive_x280_intr_real.c"
+#include "./bli_copyv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -86,7 +86,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_copyv_sifive_x280_intr_complex.c"
+#include "./bli_copyv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -103,7 +103,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_copyv_sifive_x280_intr_complex.c"
+#include "./bli_copyv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr_complex.c
similarity index 100%
rename from kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr_complex.c
diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr_real.c
similarity index 100%
rename from kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1/bli_copyv_sifive_rvv_intr/bli_copyv_sifive_rvv_intr_real.c
diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr.c
similarity index 92%
rename from kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c
rename to kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr.c
index 0dc856540..31ae4cc26 100644
--- a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -40,7 +40,7 @@
 #include "../../riscv_overloaded_intrinsics.h"
 
 
-#define DOTV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotv_sifive_x280_intr(\
+#define DOTV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotv_sifive_rvv_intr(\
           conj_t           conjxt,        \
           conj_t           conjy,         \
           dim_t            n,             \
@@ -59,7 +59,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_dotv_sifive_x280_intr_real.c"
+#include "./bli_dotv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -74,7 +74,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_dotv_sifive_x280_intr_real.c"
+#include "./bli_dotv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -90,7 +90,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_dotv_sifive_x280_intr_complex.c"
+#include "./bli_dotv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -107,7 +107,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_dotv_sifive_x280_intr_complex.c"
+#include "./bli_dotv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr_complex.c
similarity index 99%
rename from kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr_complex.c
index 250fab46e..14dbfc4e9 100644
--- a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c
+++ b/kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr_complex.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr_real.c
similarity index 98%
rename from kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr_real.c
index 0ec8e6328..b7aec00fd 100644
--- a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c
+++ b/kernels/sifive_rvv/1/bli_dotv_sifive_rvv_intr/bli_dotv_sifive_rvv_intr_real.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr.c
similarity index 93%
rename from kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c
rename to kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr.c
index 048f8d298..ad405979c 100644
--- a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -40,7 +40,7 @@
 #include "../../riscv_overloaded_intrinsics.h"
 
 
-#define DOTXV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxv_sifive_x280_intr(\
+#define DOTXV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxv_sifive_rvv_intr(\
           conj_t           conjxt,        \
           conj_t           conjy,         \
           dim_t            n,             \
@@ -62,7 +62,7 @@
 #define FLT_SIZE sizeof(float)
 #define FMA fmaf
 
-#include "./bli_dotxv_sifive_x280_intr_real.c"
+#include "./bli_dotxv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -79,7 +79,7 @@
 #define FLT_SIZE sizeof(double)
 #define FMA fma
 
-#include "./bli_dotxv_sifive_x280_intr_real.c"
+#include "./bli_dotxv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -97,7 +97,7 @@
 #define FLT_SIZE sizeof(float)
 #define FMA fmaf
 
-#include "./bli_dotxv_sifive_x280_intr_complex.c"
+#include "./bli_dotxv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -116,7 +116,7 @@
 #define FLT_SIZE sizeof(double)
 #define FMA fma
 
-#include "./bli_dotxv_sifive_x280_intr_complex.c"
+#include "./bli_dotxv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr_complex.c
similarity index 99%
rename from kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr_complex.c
index 8245e8e05..1c6d3d8f7 100644
--- a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c
+++ b/kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr_complex.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr_real.c
similarity index 98%
rename from kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr_real.c
index f9d934697..1f84ae610 100644
--- a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c
+++ b/kernels/sifive_rvv/1/bli_dotxv_sifive_rvv_intr/bli_dotxv_sifive_rvv_intr_real.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr.c
similarity index 93%
rename from kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c
rename to kernels/sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr.c
index fc8f8a76d..7f4443479 100644
--- a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr.c
@@ -38,7 +38,7 @@
 #include <riscv_vector.h>
 #include <stddef.h>
 
-#define INVERTV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##invertv_sifive_x280_intr(\
+#define INVERTV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##invertv_sifive_rvv_intr(\
           dim_t            n,              \
           T*      restrict x_, inc_t incx, \
     const cntx_t*          cntx            \
@@ -53,7 +53,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_invertv_sifive_x280_intr_real.c"
+#include "./bli_invertv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -68,7 +68,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_invertv_sifive_x280_intr_real.c"
+#include "./bli_invertv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -85,7 +85,7 @@
 #define RATIO 8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_invertv_sifive_x280_intr_complex.c"
+#include "./bli_invertv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -104,7 +104,7 @@
 #define RATIO 16
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_invertv_sifive_x280_intr_complex.c"
+#include "./bli_invertv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr_complex.c
similarity index 100%
rename from kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr_complex.c
diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr_real.c
similarity index 100%
rename from kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1/bli_invertv_sifive_rvv_intr/bli_invertv_sifive_rvv_intr_real.c
diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr.c
similarity index 93%
rename from kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c
rename to kernels/sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr.c
index a5c7561bd..0dc9c01ab 100644
--- a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr.c
@@ -39,7 +39,7 @@
 #include <riscv_vector.h>
 #include <stddef.h>
 
-#define INVSCALV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##invscalv_sifive_x280_intr(\
+#define INVSCALV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##invscalv_sifive_rvv_intr(\
           conj_t           conjalpha,      \
           dim_t            n,              \
     const T*      restrict alpha_,         \
@@ -56,7 +56,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_invscalv_sifive_x280_intr_real.c"
+#include "./bli_invscalv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -71,7 +71,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_invscalv_sifive_x280_intr_real.c"
+#include "./bli_invscalv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -87,7 +87,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_invscalv_sifive_x280_intr_complex.c"
+#include "./bli_invscalv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -104,7 +104,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_invscalv_sifive_x280_intr_complex.c"
+#include "./bli_invscalv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr_complex.c
similarity index 100%
rename from kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr_complex.c
diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr_real.c
similarity index 100%
rename from kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1/bli_invscalv_sifive_rvv_intr/bli_invscalv_sifive_rvv_intr_real.c
diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr.c
similarity index 92%
rename from kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c
rename to kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr.c
index 4cae8257c..b434f751e 100644
--- a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -40,7 +40,7 @@
 #include "../../riscv_overloaded_intrinsics.h"
 
 
-#define SCAL2V_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##scal2v_sifive_x280_intr(\
+#define SCAL2V_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##scal2v_sifive_rvv_intr(\
           conj_t           conjx,         \
           dim_t            n,             \
     const T*      restrict alpha_,         \
@@ -51,9 +51,9 @@
 
 #define SCAL2V(...)  SCAL2V_(__VA_ARGS__)
 
-#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_intr
+#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_rvv_intr
 #define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR)
-#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_rvv_intr
 #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
 
 // Single precision real
@@ -63,7 +63,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_scal2v_sifive_x280_intr_real.c"
+#include "./bli_scal2v_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -78,7 +78,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_scal2v_sifive_x280_intr_real.c"
+#include "./bli_scal2v_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -94,7 +94,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_scal2v_sifive_x280_intr_complex.c"
+#include "./bli_scal2v_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -111,7 +111,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_scal2v_sifive_x280_intr_complex.c"
+#include "./bli_scal2v_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr_complex.c
similarity index 99%
rename from kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr_complex.c
index 2e946a2a4..c2272ae3b 100644
--- a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c
+++ b/kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr_complex.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr_real.c
similarity index 98%
rename from kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr_real.c
index 7084e15cf..7b8088202 100644
--- a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c
+++ b/kernels/sifive_rvv/1/bli_scal2v_sifive_rvv_intr/bli_scal2v_sifive_rvv_intr_real.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr.c
similarity index 92%
rename from kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c
rename to kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr.c
index d1fb9940e..c6b19ea00 100644
--- a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,7 +39,7 @@
 #include "blis.h"
 #include "../../riscv_overloaded_intrinsics.h"
 
-#define SCALV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##scalv_sifive_x280_intr(\
+#define SCALV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##scalv_sifive_rvv_intr(\
           conj_t  conjalpha,               \
           dim_t   n,                       \
     const T*      restrict alpha_,         \
@@ -49,7 +49,7 @@
 
 #define SCALV(...)  SCALV_(__VA_ARGS__)
 
-#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_rvv_intr
 #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
 
 // Single precision real
@@ -59,7 +59,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_scalv_sifive_x280_intr_real.c"
+#include "./bli_scalv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -74,7 +74,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_scalv_sifive_x280_intr_real.c"
+#include "./bli_scalv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -90,7 +90,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_scalv_sifive_x280_intr_complex.c"
+#include "./bli_scalv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -107,7 +107,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_scalv_sifive_x280_intr_complex.c"
+#include "./bli_scalv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr_complex.c
similarity index 98%
rename from kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr_complex.c
index c6803c967..20f49ebdf 100644
--- a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c
+++ b/kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr_complex.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr_real.c
similarity index 98%
rename from kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr_real.c
index 2b4e31d35..7cc2dd6b6 100644
--- a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c
+++ b/kernels/sifive_rvv/1/bli_scalv_sifive_rvv_intr/bli_scalv_sifive_rvv_intr_real.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr.c
similarity index 93%
rename from kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c
rename to kernels/sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr.c
index 8c2ba7c72..33cfb4a57 100644
--- a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr.c
@@ -38,7 +38,7 @@
 #include <riscv_vector.h>
 #include <stddef.h>
 
-#define SETV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##setv_sifive_x280_intr(\
+#define SETV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##setv_sifive_rvv_intr(\
           conj_t           conjalpha,      \
           dim_t            n,              \
     const T*      restrict alpha_,         \
@@ -55,7 +55,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_setv_sifive_x280_intr_real.c"
+#include "./bli_setv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -70,7 +70,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_setv_sifive_x280_intr_real.c"
+#include "./bli_setv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -86,7 +86,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_setv_sifive_x280_intr_complex.c"
+#include "./bli_setv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -103,7 +103,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_setv_sifive_x280_intr_complex.c"
+#include "./bli_setv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr_complex.c
similarity index 100%
rename from kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr_complex.c
diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr_real.c
similarity index 100%
rename from kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1/bli_setv_sifive_rvv_intr/bli_setv_sifive_rvv_intr_real.c
diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr.c
similarity index 92%
rename from kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c
rename to kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr.c
index e6b483a3f..0ba7c5304 100644
--- a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -40,7 +40,7 @@
 #include "../../riscv_overloaded_intrinsics.h"
 
 
-#define SUBV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##subv_sifive_x280_intr(\
+#define SUBV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##subv_sifive_rvv_intr(\
           conj_t           conjx,          \
           dim_t            n,              \
     const T*      restrict x_, inc_t incx, \
@@ -57,7 +57,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_subv_sifive_x280_intr_real.c"
+#include "./bli_subv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -72,7 +72,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_subv_sifive_x280_intr_real.c"
+#include "./bli_subv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -88,7 +88,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_subv_sifive_x280_intr_complex.c"
+#include "./bli_subv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -105,7 +105,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_subv_sifive_x280_intr_complex.c"
+#include "./bli_subv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr_complex.c
similarity index 98%
rename from kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr_complex.c
index 2d4a1a017..62eab516d 100644
--- a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c
+++ b/kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr_complex.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr_real.c
similarity index 98%
rename from kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr_real.c
index b15859431..5488007b2 100644
--- a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c
+++ b/kernels/sifive_rvv/1/bli_subv_sifive_rvv_intr/bli_subv_sifive_rvv_intr_real.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr.c
similarity index 93%
rename from kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c
rename to kernels/sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr.c
index baf685d35..ec14df9cb 100644
--- a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr.c
@@ -38,7 +38,7 @@
 #include <riscv_vector.h>
 #include <stddef.h>
 
-#define SWAPV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##swapv_sifive_x280_intr(\
+#define SWAPV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##swapv_sifive_rvv_intr(\
           dim_t            n,              \
           T*      restrict x_, inc_t incx, \
           T*      restrict y_, inc_t incy, \
@@ -54,7 +54,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_swapv_sifive_x280_intr_real.c"
+#include "./bli_swapv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -69,7 +69,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_swapv_sifive_x280_intr_real.c"
+#include "./bli_swapv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -85,7 +85,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_swapv_sifive_x280_intr_complex.c"
+#include "./bli_swapv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -102,7 +102,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_swapv_sifive_x280_intr_complex.c"
+#include "./bli_swapv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr_complex.c
similarity index 100%
rename from kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr_complex.c
diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr_real.c
similarity index 100%
rename from kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1/bli_swapv_sifive_rvv_intr/bli_swapv_sifive_rvv_intr_real.c
diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c b/kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr.c
similarity index 92%
rename from kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c
rename to kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr.c
index da688851d..0f6a6d550 100644
--- a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -40,7 +40,7 @@
 #include "../../riscv_overloaded_intrinsics.h"
 
 
-#define XPBYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##xpbyv_sifive_x280_intr(\
+#define XPBYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##xpbyv_sifive_rvv_intr(\
           conj_t           conjx,          \
           dim_t            n,              \
     const T*      restrict x_, inc_t incx, \
@@ -51,7 +51,7 @@
 
 #define XPBYV(...)  XPBYV_(__VA_ARGS__)
 
-#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_intr
+#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_rvv_intr
 #define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR)
 
 // Single precision real
@@ -61,7 +61,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_xpbyv_sifive_x280_intr_real.c"
+#include "./bli_xpbyv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -76,7 +76,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_xpbyv_sifive_x280_intr_real.c"
+#include "./bli_xpbyv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -92,7 +92,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_xpbyv_sifive_x280_intr_complex.c"
+#include "./bli_xpbyv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -109,7 +109,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_xpbyv_sifive_x280_intr_complex.c"
+#include "./bli_xpbyv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr_complex.c
similarity index 99%
rename from kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr_complex.c
index 4c86e8b36..1eb2fff23 100644
--- a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c
+++ b/kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr_complex.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr_real.c
similarity index 98%
rename from kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr_real.c
index b23272fea..f4a8aa72e 100644
--- a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c
+++ b/kernels/sifive_rvv/1/bli_xpbyv_sifive_rvv_intr/bli_xpbyv_sifive_rvv_intr_real.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c b/kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr.c
similarity index 92%
rename from kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c
rename to kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr.c
index 1b5ce3b96..e9d4a8b5f 100644
--- a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -40,7 +40,7 @@
 #include "../../riscv_overloaded_intrinsics.h"
 
 
-#define AXPY2V_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpy2v_sifive_x280_intr(\
+#define AXPY2V_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpy2v_sifive_rvv_intr(\
           conj_t            conjx,              \
           conj_t            conjy,              \
           dim_t                 n,              \
@@ -61,7 +61,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_axpy2v_sifive_x280_intr_real.c"
+#include "./bli_axpy2v_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -76,7 +76,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_axpy2v_sifive_x280_intr_real.c"
+#include "./bli_axpy2v_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -92,7 +92,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_axpy2v_sifive_x280_intr_complex.c"
+#include "./bli_axpy2v_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -109,7 +109,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_axpy2v_sifive_x280_intr_complex.c"
+#include "./bli_axpy2v_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr_complex.c
similarity index 99%
rename from kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr_complex.c
index 9b5719827..de753d224 100644
--- a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c
+++ b/kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr_complex.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c b/kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr_real.c
similarity index 98%
rename from kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr_real.c
index cebb15997..b2e42155c 100644
--- a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c
+++ b/kernels/sifive_rvv/1f/bli_axpy2v_sifive_rvv_intr/bli_axpy2v_sifive_rvv_intr_real.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c b/kernels/sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr.c
similarity index 94%
rename from kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c
rename to kernels/sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr.c
index a5e026846..ace31d7a8 100644
--- a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr.c
@@ -39,7 +39,7 @@
 #include <riscv_vector.h>
 #include <stdint.h>
 
-#define AXPYF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpyf_sifive_x280_intr(\
+#define AXPYF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpyf_sifive_rvv_intr(\
           conj_t            conja,                        \
           conj_t            conjx,                        \
           dim_t                 m,                        \
@@ -60,7 +60,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_axpyf_sifive_x280_intr_real.c"
+#include "./bli_axpyf_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -75,7 +75,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_axpyf_sifive_x280_intr_real.c"
+#include "./bli_axpyf_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -91,7 +91,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_axpyf_sifive_x280_intr_complex.c"
+#include "./bli_axpyf_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -108,7 +108,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_axpyf_sifive_x280_intr_complex.c"
+#include "./bli_axpyf_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr_complex.c
similarity index 100%
rename from kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr_complex.c
diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c b/kernels/sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr_real.c
similarity index 100%
rename from kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1f/bli_axpyf_sifive_rvv_intr/bli_axpyf_sifive_rvv_intr_real.c
diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c b/kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr.c
similarity index 92%
rename from kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c
rename to kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr.c
index 9cd1071d7..7d46f52b0 100644
--- a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,7 +39,7 @@
 #include "blis.h"
 #include "../../riscv_overloaded_intrinsics.h"
 
-#define DOTAXPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotaxpyv_sifive_x280_intr(\
+#define DOTAXPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotaxpyv_sifive_rvv_intr(\
           conj_t            conjxt,             \
           conj_t             conjx,             \
           conj_t             conjy,             \
@@ -61,7 +61,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_dotaxpyv_sifive_x280_intr_real.c"
+#include "./bli_dotaxpyv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -76,7 +76,7 @@
 #define LMUL m8
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_dotaxpyv_sifive_x280_intr_real.c"
+#include "./bli_dotaxpyv_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -92,7 +92,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_dotaxpyv_sifive_x280_intr_complex.c"
+#include "./bli_dotaxpyv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -109,7 +109,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_dotaxpyv_sifive_x280_intr_complex.c"
+#include "./bli_dotaxpyv_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr_complex.c
similarity index 99%
rename from kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr_complex.c
index c3cd06c52..7529fb758 100644
--- a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c
+++ b/kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr_complex.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c b/kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr_real.c
similarity index 99%
rename from kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr_real.c
index adaf3610b..0b6b7e016 100644
--- a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c
+++ b/kernels/sifive_rvv/1f/bli_dotaxpyv_sifive_rvv_intr/bli_dotaxpyv_sifive_rvv_intr_real.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c b/kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr.c
similarity index 93%
rename from kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c
rename to kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr.c
index dc1bca9f6..24052dd5c 100644
--- a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr.c
@@ -40,7 +40,7 @@
 #include <stdint.h>
 #include <riscv_vector.h>
 
-#define DOTXAXPYF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxaxpyf_sifive_x280_intr(\
+#define DOTXAXPYF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxaxpyf_sifive_rvv_intr(\
           conj_t           conjat,                    \
           conj_t           conja,                     \
           conj_t           conjw,                     \
@@ -59,9 +59,9 @@
 
 #define DOTXAXPYF(...)  DOTXAXPYF_(__VA_ARGS__)
 
-#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_rvv_intr
 #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
-#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr
+#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_rvv_intr
 #define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR)
 
 // Single precision real
@@ -71,7 +71,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_dotxaxpyf_sifive_x280_intr_real.c"
+#include "./bli_dotxaxpyf_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -86,7 +86,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_dotxaxpyf_sifive_x280_intr_real.c"
+#include "./bli_dotxaxpyf_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -102,7 +102,7 @@
 #define LMUL m2
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_dotxaxpyf_sifive_x280_intr_complex.c"
+#include "./bli_dotxaxpyf_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -119,7 +119,7 @@
 #define LMUL m2
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_dotxaxpyf_sifive_x280_intr_complex.c"
+#include "./bli_dotxaxpyf_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr_complex.c
similarity index 76%
rename from kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr_complex.c
index d8a984064..67edd9db3 100644
--- a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c
+++ b/kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr_complex.c
@@ -35,89 +35,89 @@
 // clang-format off
 #ifdef DOTXAXPYF
 
-#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL(i)                                      \
+#define DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL(i)                                      \
     do {                                                                        \
         acol_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), vl); \
         acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0);                      \
         acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1);                      \
     } while (0)
 
-#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED(i)                                                    \
+#define DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL_STRIDED(i)                                                    \
     do {                                                                                              \
         acol_vec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), 2 * FLT_SIZE * inca, vl); \
         acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0);                                            \
         acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1);                                            \
     } while (0)
 
-#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF)                                           \
+#define DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF)                                           \
     do {                                                                                                                    \
-        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                                                                       \
+        DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0);                                                                       \
         VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc0_r, yacc0_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl);                    \
         VCMUL_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, acol_vec_r, acol_vec_i, x[0 * incx].real, x[0 * incx].imag, vl);  \
-        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                                                                       \
+        DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1);                                                                       \
         VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc1_r, yacc1_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl);                    \
         VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \
-        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                                                                       \
+        DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2);                                                                       \
         VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc2_r, yacc2_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl);                    \
         VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \
-        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                                                                       \
+        DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3);                                                                       \
         VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc3_r, yacc3_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl);                    \
         VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[3 * incx].real, x[3 * incx].imag, acol_vec_r, acol_vec_i, vl); \
     } while (0)
 
-#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF)                                                 \
+#define DOTXAXPYF_SIFIVE_RVV_LOOP_BODY(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF)                                                 \
     do {                                                                                                                    \
-        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                                                                       \
+        DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0);                                                                       \
         VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc0_r, yacc0_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl);              \
         VCMUL_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, acol_vec_r, acol_vec_i, x[0 * incx].real, x[0 * incx].imag, vl);  \
-        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                                                                       \
+        DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1);                                                                       \
         VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc1_r, yacc1_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl);              \
         VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \
-        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                                                                       \
+        DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2);                                                                       \
         VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc2_r, yacc2_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl);              \
         VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \
-        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                                                                       \
+        DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3);                                                                       \
         VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc3_r, yacc3_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl);              \
         VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[3 * incx].real, x[3 * incx].imag, acol_vec_r, acol_vec_i, vl); \
     } while (0)
 
-#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF)                                            \
+#define DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF)                                            \
     do {                                                                                                                        \
         switch (b) {                                                                                                            \
         case 3:                                                                                                                 \
-            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                                                                       \
+            DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2);                                                                       \
             VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc2_r, yacc2_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl);                    \
             VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \
         case 2:                                                                                                                 \
-            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                                                                       \
+            DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1);                                                                       \
             VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc1_r, yacc1_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl);                    \
             VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \
         case 1:                                                                                                                 \
-            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                                                                       \
+            DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0);                                                                       \
             VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc0_r, yacc0_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl);                    \
             VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[0 * incx].real, x[0 * incx].imag, acol_vec_r, acol_vec_i, vl); \
         }                                                                                                                       \
     } while (0)
 
-#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF)                                                  \
+#define DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF)                                                  \
     do {                                                                                                                        \
         switch (b) {                                                                                                            \
         case 3:                                                                                                                 \
-            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                                                                       \
+            DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2);                                                                       \
             VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc2_r, yacc2_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl);              \
             VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \
         case 2:                                                                                                                 \
-            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                                                                       \
+            DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1);                                                                       \
             VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc1_r, yacc1_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl);              \
             VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \
         case 1:                                                                                                                 \
-            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                                                                       \
+            DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0);                                                                       \
             VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc0_r, yacc0_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl);              \
             VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[0 * incx].real, x[0 * incx].imag, acol_vec_r, acol_vec_i, vl); \
         }                                                                                                                       \
     } while (0)
 
-#define DOTXAXPYF_SIFIVE_X280_REDUCE(i)                                                                            \
+#define DOTXAXPYF_SIFIVE_RVV_REDUCE(i)                                                                            \
     do {                                                                                                           \
         RVV_TYPE_F(PREC, m1) dot##i##_r = VFMV_S_F(PREC, m1)(0., 1);                                               \
         RVV_TYPE_F(PREC, m1) dot##i##_i = VFMV_S_F(PREC, m1)(0., 1);                                               \
@@ -205,29 +205,29 @@ DOTXAXPYF(PRECISION_CHAR, void)
                 if (bli_is_conj(conjat)) {
                     if (bli_is_conj(conja)) {
                         if (inca == 1)
-                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , _CONJ, _CONJ);
+                            DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST( , _CONJ, _CONJ);
                         else
-                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, _CONJ, _CONJ);
+                            DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST(_STRIDED, _CONJ, _CONJ);
                     }
                     else {
                         if (inca == 1)
-                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , _CONJ, );
+                            DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST( , _CONJ, );
                         else
-                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, _CONJ, );
+                            DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST(_STRIDED, _CONJ, );
                     }
                 }
                 else {
                     if (bli_is_conj(conja)) {
                         if (inca == 1)
-                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , , _CONJ);
+                            DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST( , , _CONJ);
                         else
-                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, , _CONJ);
+                            DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST(_STRIDED, , _CONJ);
                     }
                     else {
                         if (inca == 1)
-                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , , );
+                            DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST( , , );
                         else
-                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, , );
+                            DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST(_STRIDED, , );
                     }
                 }
                 first = false;
@@ -236,29 +236,29 @@ DOTXAXPYF(PRECISION_CHAR, void)
                 if (bli_is_conj(conjat)) {
                     if (bli_is_conj(conja)) {
                         if (inca == 1)
-                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , _CONJ, _CONJ);
+                            DOTXAXPYF_SIFIVE_RVV_LOOP_BODY( , _CONJ, _CONJ);
                         else
-                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, _CONJ, _CONJ);
+                            DOTXAXPYF_SIFIVE_RVV_LOOP_BODY(_STRIDED, _CONJ, _CONJ);
                     }
                     else {
                         if (inca == 1)
-                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , _CONJ, );
+                            DOTXAXPYF_SIFIVE_RVV_LOOP_BODY( , _CONJ, );
                         else
-                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, _CONJ, );
+                            DOTXAXPYF_SIFIVE_RVV_LOOP_BODY(_STRIDED, _CONJ, );
                     }
                 }
                 else {
                     if (bli_is_conj(conja)) {
                         if (inca == 1)
-                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , , _CONJ);
+                            DOTXAXPYF_SIFIVE_RVV_LOOP_BODY( , , _CONJ);
                         else
-                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, , _CONJ);
+                            DOTXAXPYF_SIFIVE_RVV_LOOP_BODY(_STRIDED, , _CONJ);
                     }
                     else {
                         if (inca == 1)
-                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , , );
+                            DOTXAXPYF_SIFIVE_RVV_LOOP_BODY( , , );
                         else
-                            DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, , );
+                            DOTXAXPYF_SIFIVE_RVV_LOOP_BODY(_STRIDED, , );
                     }
                 }
             }
@@ -287,10 +287,10 @@ DOTXAXPYF(PRECISION_CHAR, void)
             avl -= vl;
         }
 
-        DOTXAXPYF_SIFIVE_X280_REDUCE(0);
-        DOTXAXPYF_SIFIVE_X280_REDUCE(1);
-        DOTXAXPYF_SIFIVE_X280_REDUCE(2);
-        DOTXAXPYF_SIFIVE_X280_REDUCE(3);
+        DOTXAXPYF_SIFIVE_RVV_REDUCE(0);
+        DOTXAXPYF_SIFIVE_RVV_REDUCE(1);
+        DOTXAXPYF_SIFIVE_RVV_REDUCE(2);
+        DOTXAXPYF_SIFIVE_RVV_REDUCE(3);
 
         a += 4 * lda;
         x += 4 * incx;
@@ -322,29 +322,29 @@ DOTXAXPYF(PRECISION_CHAR, void)
                 if (bli_is_conj(conjat)) {
                     if (bli_is_conj(conja)) {
                         if (inca == 1)
-                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , _CONJ, _CONJ);
+                            DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST( , _CONJ, _CONJ);
                         else
-                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, _CONJ, _CONJ);
+                            DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST(_STRIDED, _CONJ, _CONJ);
                     }
                     else {
                         if (inca == 1)
-                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , _CONJ, );
+                            DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST( , _CONJ, );
                         else
-                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, _CONJ, );
+                            DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST(_STRIDED, _CONJ, );
                     }
                 }
                 else {
                     if (bli_is_conj(conja)) {
                         if (inca == 1)
-                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , , _CONJ);
+                            DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST( , , _CONJ);
                         else
-                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, , _CONJ);
+                            DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST(_STRIDED, , _CONJ);
                     }
                     else {
                         if (inca == 1)
-                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , , );
+                            DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST( , , );
                         else
-                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, , );
+                            DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST(_STRIDED, , );
                     }
                 }
                 first = false;
@@ -353,29 +353,29 @@ DOTXAXPYF(PRECISION_CHAR, void)
                 if (bli_is_conj(conjat)) {
                     if (bli_is_conj(conja)) {
                         if (inca == 1)
-                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , _CONJ, _CONJ);
+                            DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY( , _CONJ, _CONJ);
                         else
-                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, _CONJ, _CONJ);
+                            DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY(_STRIDED, _CONJ, _CONJ);
                     }
                     else {
                         if (inca == 1)
-                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , _CONJ, );
+                            DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY( , _CONJ, );
                         else
-                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, _CONJ, );
+                            DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY(_STRIDED, _CONJ, );
                     }
                 }
                 else {
                     if (bli_is_conj(conja)) {
                         if (inca == 1)
-                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , , _CONJ);
+                            DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY( , , _CONJ);
                         else
-                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, , _CONJ);
+                            DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY(_STRIDED, , _CONJ);
                     }
                     else {
                         if (inca == 1)
-                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , , );
+                            DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY( , , );
                         else
-                            DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, , );
+                            DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY(_STRIDED, , );
                     }
                 }
             }
@@ -406,22 +406,22 @@ DOTXAXPYF(PRECISION_CHAR, void)
 
         switch (b) {
         case 3:
-            DOTXAXPYF_SIFIVE_X280_REDUCE(2);
+            DOTXAXPYF_SIFIVE_RVV_REDUCE(2);
         case 2:
-            DOTXAXPYF_SIFIVE_X280_REDUCE(1);
+            DOTXAXPYF_SIFIVE_RVV_REDUCE(1);
         case 1:
-            DOTXAXPYF_SIFIVE_X280_REDUCE(0);
+            DOTXAXPYF_SIFIVE_RVV_REDUCE(0);
         }
     }
     return;
 }
 
-#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL
-#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED
-#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST
-#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY
-#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST
-#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY
-#undef DOTXAXPYF_SIFIVE_X280_REDUCE
+#undef DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL
+#undef DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL_STRIDED
+#undef DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST
+#undef DOTXAXPYF_SIFIVE_RVV_LOOP_BODY
+#undef DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST
+#undef DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY
+#undef DOTXAXPYF_SIFIVE_RVV_REDUCE
 
 #endif // DOTXAXPYF
diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c b/kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr_real.c
similarity index 79%
rename from kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr_real.c
index 57ef4f744..7143d3a97 100644
--- a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c
+++ b/kernels/sifive_rvv/1f/bli_dotxaxpyf_sifive_rvv_intr/bli_dotxaxpyf_sifive_rvv_intr_real.c
@@ -35,85 +35,85 @@
 // clang-format off
 #ifdef DOTXAXPYF
 
-#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL(i)                   \
+#define DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL(i)                   \
     do {                                                     \
         acol_vec = VLE_V_F(PREC, LMUL)(a_tmp + i * lda, vl); \
     } while (0)
 
-#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED(i)                             \
+#define DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL_STRIDED(i)                             \
     do {                                                                       \
         acol_vec = VLSE_V_F(PREC, LMUL)(a_tmp + i * lda, FLT_SIZE * inca, vl); \
     } while (0)
 
-#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF)                \
+#define DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST(LOAD_SUF)                \
     do {                                                               \
-        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                  \
+        DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0);                  \
         yacc0 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl);              \
         zacc = VFMUL_VF(PREC, LMUL)(acol_vec, x[0 * incx], vl);        \
-        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                  \
+        DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1);                  \
         yacc1 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl);              \
         zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \
-        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                  \
+        DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2);                  \
         yacc2 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl);              \
         zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \
-        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                  \
+        DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3);                  \
         yacc3 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl);              \
         zacc = VFMACC_VF(PREC, LMUL)(zacc, x[3 * incx], acol_vec, vl); \
     } while (0)
 
-#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY(LOAD_SUF)                      \
+#define DOTXAXPYF_SIFIVE_RVV_LOOP_BODY(LOAD_SUF)                      \
     do {                                                               \
-        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                  \
+        DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0);                  \
         yacc0 = VFMACC_VV_TU(PREC, LMUL)(yacc0, acol_vec, wvec, vl);   \
         zacc = VFMUL_VF(PREC, LMUL)(acol_vec, x[0 * incx], vl);        \
-        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                  \
+        DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1);                  \
         yacc1 = VFMACC_VV_TU(PREC, LMUL)(yacc1, acol_vec, wvec, vl);   \
         zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \
-        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                  \
+        DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2);                  \
         yacc2 = VFMACC_VV_TU(PREC, LMUL)(yacc2, acol_vec, wvec, vl);   \
         zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \
-        DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                  \
+        DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3);                  \
         yacc3 = VFMACC_VV_TU(PREC, LMUL)(yacc3, acol_vec, wvec, vl);   \
         zacc = VFMACC_VF(PREC, LMUL)(zacc, x[3 * incx], acol_vec, vl); \
     } while (0)
 
-#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF)                 \
+#define DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST(LOAD_SUF)                 \
     do {                                                                   \
         switch (b) {                                                       \
         case 3:                                                            \
-            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                  \
+            DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2);                  \
             yacc2 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl);              \
             zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \
         case 2:                                                            \
-            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                  \
+            DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1);                  \
             yacc1 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl);              \
             zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \
         case 1:                                                            \
-            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                  \
+            DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0);                  \
             yacc0 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl);              \
             zacc = VFMACC_VF(PREC, LMUL)(zacc, x[0 * incx], acol_vec, vl); \
         }                                                                  \
     } while (0)
 
-#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF)                       \
+#define DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY(LOAD_SUF)                       \
     do {                                                                   \
         switch (b) {                                                       \
         case 3:                                                            \
-            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                  \
+            DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2);                  \
             yacc2 = VFMACC_VV_TU(PREC, LMUL)(yacc2, acol_vec, wvec, vl);   \
             zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \
         case 2:                                                            \
-            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                  \
+            DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1);                  \
             yacc1 = VFMACC_VV_TU(PREC, LMUL)(yacc1, acol_vec, wvec, vl);   \
             zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \
         case 1:                                                            \
-            DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                  \
+            DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0);                  \
             yacc0 = VFMACC_VV_TU(PREC, LMUL)(yacc0, acol_vec, wvec, vl);   \
             zacc = VFMACC_VF(PREC, LMUL)(zacc, x[0 * incx], acol_vec, vl); \
         }                                                                  \
     } while (0)
 
-#define DOTXAXPYF_SIFIVE_X280_REDUCE(i)                                     \
+#define DOTXAXPYF_SIFIVE_RVV_REDUCE(i)                                     \
     do {                                                                    \
         RVV_TYPE_F(PREC, m1) dot##i = VFMV_S_F(PREC, m1)(0., 1);            \
         dot##i = VF_REDUSUM_VS(PREC, LMUL)(yacc##i, dot##i, m);             \
@@ -174,16 +174,16 @@ DOTXAXPYF(PRECISION_CHAR, void)
                 wvec = VLSE_V_F(PREC, LMUL)(w_tmp, FLT_SIZE * incw, vl);
             if (first) {
                 if (inca == 1)
-                    DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( );
+                    DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST( );
                 else
-                    DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED);
+                    DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST(_STRIDED);
                 first = false;
             }
             else {
                 if (inca == 1)
-                    DOTXAXPYF_SIFIVE_X280_LOOP_BODY( );
+                    DOTXAXPYF_SIFIVE_RVV_LOOP_BODY( );
                 else
-                    DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED);
+                    DOTXAXPYF_SIFIVE_RVV_LOOP_BODY(_STRIDED);
             }
 
             RVV_TYPE_F(PREC, LMUL) zvec;
@@ -203,10 +203,10 @@ DOTXAXPYF(PRECISION_CHAR, void)
             avl -= vl;
         }
 
-        DOTXAXPYF_SIFIVE_X280_REDUCE(0);
-        DOTXAXPYF_SIFIVE_X280_REDUCE(1);
-        DOTXAXPYF_SIFIVE_X280_REDUCE(2);
-        DOTXAXPYF_SIFIVE_X280_REDUCE(3);
+        DOTXAXPYF_SIFIVE_RVV_REDUCE(0);
+        DOTXAXPYF_SIFIVE_RVV_REDUCE(1);
+        DOTXAXPYF_SIFIVE_RVV_REDUCE(2);
+        DOTXAXPYF_SIFIVE_RVV_REDUCE(3);
 
         a += 4 * lda;
         x += 4 * incx;
@@ -231,16 +231,16 @@ DOTXAXPYF(PRECISION_CHAR, void)
                 wvec = VLSE_V_F(PREC, LMUL)(w_tmp, FLT_SIZE * incw, vl);
             if (first) {
                 if (inca == 1)
-                    DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( );
+                    DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST( );
                 else
-                    DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED);
+                    DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST(_STRIDED);
                 first = false;
             }
             else {
                 if (inca == 1)
-                    DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( );
+                    DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY( );
                 else
-                    DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED);
+                    DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY(_STRIDED);
             }
 
             RVV_TYPE_F(PREC, LMUL) zvec;
@@ -262,22 +262,22 @@ DOTXAXPYF(PRECISION_CHAR, void)
 
         switch (b) {
         case 3:
-            DOTXAXPYF_SIFIVE_X280_REDUCE(2);
+            DOTXAXPYF_SIFIVE_RVV_REDUCE(2);
         case 2:
-            DOTXAXPYF_SIFIVE_X280_REDUCE(1);
+            DOTXAXPYF_SIFIVE_RVV_REDUCE(1);
         case 1:
-            DOTXAXPYF_SIFIVE_X280_REDUCE(0);
+            DOTXAXPYF_SIFIVE_RVV_REDUCE(0);
         }
     }
     return;
 }
 
-#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL
-#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED
-#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST
-#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY
-#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST
-#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY
-#undef DOTXAXPYF_SIFIVE_X280_REDUCE
+#undef DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL
+#undef DOTXAXPYF_SIFIVE_RVV_LOAD_ACOL_STRIDED
+#undef DOTXAXPYF_SIFIVE_RVV_LOOP_BODY_FIRST
+#undef DOTXAXPYF_SIFIVE_RVV_LOOP_BODY
+#undef DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY_FIRST
+#undef DOTXAXPYF_SIFIVE_RVV_CLEANUP_BODY
+#undef DOTXAXPYF_SIFIVE_RVV_REDUCE
 
 #endif // DOTXAXPYF
diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c b/kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr.c
similarity index 94%
rename from kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c
rename to kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr.c
index 9396515b3..e65f0637d 100644
--- a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr.c
@@ -39,7 +39,7 @@
 #include <stdint.h>
 #include <riscv_vector.h>
 
-#define DOTXF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxf_sifive_x280_intr(\
+#define DOTXF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxf_sifive_rvv_intr(\
           conj_t           conjat,                    \
           conj_t           conjx,                     \
           dim_t            m,                         \
@@ -54,9 +54,9 @@
 
 #define DOTXF(...)  DOTXF_(__VA_ARGS__)
 
-#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_rvv_intr
 #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
-#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr
+#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_rvv_intr
 #define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR)
 
 // Single precision real
@@ -66,7 +66,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_dotxf_sifive_x280_intr_real.c"
+#include "./bli_dotxf_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -81,7 +81,7 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_dotxf_sifive_x280_intr_real.c"
+#include "./bli_dotxf_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -97,7 +97,7 @@
 #define LMUL m2
 #define FLT_SIZE sizeof(float)
 
-#include "./bli_dotxf_sifive_x280_intr_complex.c"
+#include "./bli_dotxf_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -114,7 +114,7 @@
 #define LMUL m2
 #define FLT_SIZE sizeof(double)
 
-#include "./bli_dotxf_sifive_x280_intr_complex.c"
+#include "./bli_dotxf_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr_complex.c
similarity index 74%
rename from kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr_complex.c
index 463a111f0..8cdc4b76e 100644
--- a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c
+++ b/kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr_complex.c
@@ -35,95 +35,95 @@
 // clang-format off
 #ifdef DOTXF
 
-#define DOTXF_SIFIVE_X280_LOAD_ACOL(i)                                          \
+#define DOTXF_SIFIVE_RVV_LOAD_ACOL(i)                                          \
     do {                                                                        \
         acol_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), vl); \
         acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0);                      \
         acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1);                      \
     } while (0)
 
-#define DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED(i)                                                        \
+#define DOTXF_SIFIVE_RVV_LOAD_ACOL_STRIDED(i)                                                        \
     do {                                                                                              \
         acol_vec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), 2 * FLT_SIZE * inca, vl); \
         acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0);                                            \
         acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1);                                            \
     } while (0)
 
-#define DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF, CONJ_SUF)                                       \
+#define DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST(LOAD_SUF, CONJ_SUF)                                       \
     do {                                                                                            \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                                                   \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0);                                                   \
         VCMUL_VV##CONJ_SUF(PREC, LMUL, acc0_r, acc0_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                                                   \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1);                                                   \
         VCMUL_VV##CONJ_SUF(PREC, LMUL, acc1_r, acc1_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                                                   \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2);                                                   \
         VCMUL_VV##CONJ_SUF(PREC, LMUL, acc2_r, acc2_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                                                   \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3);                                                   \
         VCMUL_VV##CONJ_SUF(PREC, LMUL, acc3_r, acc3_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4);                                                   \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(4);                                                   \
         VCMUL_VV##CONJ_SUF(PREC, LMUL, acc4_r, acc4_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5);                                                   \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(5);                                                   \
         VCMUL_VV##CONJ_SUF(PREC, LMUL, acc5_r, acc5_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
     } while (0)
 
-#define DOTXF_SIFIVE_X280_LOOP_BODY(LOAD_SUF, CONJ_SUF)                                                   \
+#define DOTXF_SIFIVE_RVV_LOOP_BODY(LOAD_SUF, CONJ_SUF)                                                   \
     do {                                                                                                  \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                                                         \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0);                                                         \
         VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc0_r, acc0_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                                                         \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1);                                                         \
         VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc1_r, acc1_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                                                         \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2);                                                         \
         VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc2_r, acc2_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                                                         \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3);                                                         \
         VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc3_r, acc3_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4);                                                         \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(4);                                                         \
         VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc4_r, acc4_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5);                                                         \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(5);                                                         \
         VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc5_r, acc5_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
     } while (0)
 
-#define DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF, CONJ_SUF)                                            \
+#define DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST(LOAD_SUF, CONJ_SUF)                                            \
     do {                                                                                                    \
         switch (b) {                                                                                        \
             case 5:                                                                                         \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4);                                                   \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(4);                                                   \
                 VCMUL_VV##CONJ_SUF(PREC, LMUL, acc4_r, acc4_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
             case 4:                                                                                         \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                                                   \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3);                                                   \
                 VCMUL_VV##CONJ_SUF(PREC, LMUL, acc3_r, acc3_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
             case 3:                                                                                         \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                                                   \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2);                                                   \
                 VCMUL_VV##CONJ_SUF(PREC, LMUL, acc2_r, acc2_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
             case 2:                                                                                         \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                                                   \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1);                                                   \
                 VCMUL_VV##CONJ_SUF(PREC, LMUL, acc1_r, acc1_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
             case 1:                                                                                         \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                                                   \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0);                                                   \
                 VCMUL_VV##CONJ_SUF(PREC, LMUL, acc0_r, acc0_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \
         }                                                                                                   \
     } while (0)
 
-#define DOTXF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF, CONJ_SUF)                                                        \
+#define DOTXF_SIFIVE_RVV_CLEANUP_BODY(LOAD_SUF, CONJ_SUF)                                                        \
     do {                                                                                                          \
         switch (b) {                                                                                              \
             case 5:                                                                                               \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4);                                                         \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(4);                                                         \
                 VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc4_r, acc4_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
             case 4:                                                                                               \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                                                         \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3);                                                         \
                 VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc3_r, acc3_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
             case 3:                                                                                               \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                                                         \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2);                                                         \
                 VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc2_r, acc2_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
             case 2:                                                                                               \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                                                         \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1);                                                         \
                 VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc1_r, acc1_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
             case 1:                                                                                               \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                                                         \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0);                                                         \
                 VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc0_r, acc0_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \
         }                                                                                                         \
     } while (0)
 
-#define DOTXF_SIFIVE_X280_REDUCE(i)                                                                                \
+#define DOTXF_SIFIVE_RVV_REDUCE(i)                                                                                \
     do {                                                                                                           \
         RVV_TYPE_F(PREC, m1) dot##i##_r = VFMV_S_F(PREC, m1)(0., 1);                                               \
         RVV_TYPE_F(PREC, m1) dot##i##_i = VFMV_S_F(PREC, m1)(0., 1);                                               \
@@ -200,30 +200,30 @@ DOTXF(PRECISION_CHAR, void)
             if (first) {
                 if (bli_is_conj(conjat)) {
                     if (inca == 1)
-                        DOTXF_SIFIVE_X280_LOOP_BODY_FIRST( , _CONJ);
+                        DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST( , _CONJ);
                     else
-                        DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, _CONJ);
+                        DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST(_STRIDED, _CONJ);
                 }
                 else {
                     if (inca == 1)
-                        DOTXF_SIFIVE_X280_LOOP_BODY_FIRST( , );
+                        DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST( , );
                     else
-                        DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, );
+                        DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST(_STRIDED, );
                 }
                 first = false;
             }
             else {
                 if (bli_is_conj(conjat)) {
                     if (inca == 1)
-                        DOTXF_SIFIVE_X280_LOOP_BODY( , _CONJ);
+                        DOTXF_SIFIVE_RVV_LOOP_BODY( , _CONJ);
                     else
-                        DOTXF_SIFIVE_X280_LOOP_BODY(_STRIDED, _CONJ);
+                        DOTXF_SIFIVE_RVV_LOOP_BODY(_STRIDED, _CONJ);
                 }
                 else {
                     if (inca == 1)
-                        DOTXF_SIFIVE_X280_LOOP_BODY( , );
+                        DOTXF_SIFIVE_RVV_LOOP_BODY( , );
                     else
-                        DOTXF_SIFIVE_X280_LOOP_BODY(_STRIDED, );
+                        DOTXF_SIFIVE_RVV_LOOP_BODY(_STRIDED, );
                 }
             }
               
@@ -232,12 +232,12 @@ DOTXF(PRECISION_CHAR, void)
             avl -= vl;
         }
 
-        DOTXF_SIFIVE_X280_REDUCE(0);
-        DOTXF_SIFIVE_X280_REDUCE(1);
-        DOTXF_SIFIVE_X280_REDUCE(2);
-        DOTXF_SIFIVE_X280_REDUCE(3);
-        DOTXF_SIFIVE_X280_REDUCE(4);
-        DOTXF_SIFIVE_X280_REDUCE(5);
+        DOTXF_SIFIVE_RVV_REDUCE(0);
+        DOTXF_SIFIVE_RVV_REDUCE(1);
+        DOTXF_SIFIVE_RVV_REDUCE(2);
+        DOTXF_SIFIVE_RVV_REDUCE(3);
+        DOTXF_SIFIVE_RVV_REDUCE(4);
+        DOTXF_SIFIVE_RVV_REDUCE(5);
 
         a += 6 * lda;
         y += 6 * incy;
@@ -265,30 +265,30 @@ DOTXF(PRECISION_CHAR, void)
             if (first) {
                 if (bli_is_conj(conjat)) {
                     if (inca == 1)
-                        DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST( , _CONJ);
+                        DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST( , _CONJ);
                     else
-                        DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, _CONJ);
+                        DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST(_STRIDED, _CONJ);
                 }
                 else {
                     if (inca == 1)
-                        DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST( , );
+                        DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST( , );
                     else
-                        DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, );
+                        DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST(_STRIDED, );
                 }
                 first = false;
             }
             else {
                 if (bli_is_conj(conjat)) {
                     if (inca == 1)
-                        DOTXF_SIFIVE_X280_CLEANUP_BODY( , _CONJ);
+                        DOTXF_SIFIVE_RVV_CLEANUP_BODY( , _CONJ);
                     else
-                        DOTXF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, _CONJ);
+                        DOTXF_SIFIVE_RVV_CLEANUP_BODY(_STRIDED, _CONJ);
                 }
                 else {
                     if (inca == 1)
-                        DOTXF_SIFIVE_X280_CLEANUP_BODY( , );
+                        DOTXF_SIFIVE_RVV_CLEANUP_BODY( , );
                     else
-                        DOTXF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, );
+                        DOTXF_SIFIVE_RVV_CLEANUP_BODY(_STRIDED, );
                 }
             }
 
@@ -299,26 +299,26 @@ DOTXF(PRECISION_CHAR, void)
 
         switch (b) {
             case 5:
-                DOTXF_SIFIVE_X280_REDUCE(4);
+                DOTXF_SIFIVE_RVV_REDUCE(4);
             case 4:
-                DOTXF_SIFIVE_X280_REDUCE(3);
+                DOTXF_SIFIVE_RVV_REDUCE(3);
             case 3:
-                DOTXF_SIFIVE_X280_REDUCE(2);
+                DOTXF_SIFIVE_RVV_REDUCE(2);
             case 2:
-                DOTXF_SIFIVE_X280_REDUCE(1);
+                DOTXF_SIFIVE_RVV_REDUCE(1);
             case 1:
-                DOTXF_SIFIVE_X280_REDUCE(0);
+                DOTXF_SIFIVE_RVV_REDUCE(0);
         }
     }
     return;
 }
 
-#undef DOTXF_SIFIVE_X280_LOAD_ACOL
-#undef DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED
-#undef DOTXF_SIFIVE_X280_LOOP_BODY_FIRST
-#undef DOTXF_SIFIVE_X280_LOOP_BODY
-#undef DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST
-#undef DOTXF_SIFIVE_X280_CLEANUP_BODY
-#undef DOTXF_SIFIVE_X280_REDUCE
+#undef DOTXF_SIFIVE_RVV_LOAD_ACOL
+#undef DOTXF_SIFIVE_RVV_LOAD_ACOL_STRIDED
+#undef DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST
+#undef DOTXF_SIFIVE_RVV_LOOP_BODY
+#undef DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST
+#undef DOTXF_SIFIVE_RVV_CLEANUP_BODY
+#undef DOTXF_SIFIVE_RVV_REDUCE
 
 #endif // DOTXF
diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c b/kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr_real.c
similarity index 72%
rename from kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr_real.c
index 8286e2476..cdc8f259e 100644
--- a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c
+++ b/kernels/sifive_rvv/1f/bli_dotxf_sifive_rvv_intr/bli_dotxf_sifive_rvv_intr_real.c
@@ -35,91 +35,91 @@
 // clang-format off
 #ifdef DOTXF
 
-#define DOTXF_SIFIVE_X280_LOAD_ACOL(i)                                         \
+#define DOTXF_SIFIVE_RVV_LOAD_ACOL(i)                                         \
     do {                                                                       \
         acol_vec = VLE_V_F(PREC, LMUL)(a_tmp + i * lda, vl);                   \
     } while (0)
 
-#define DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED(i)                                 \
+#define DOTXF_SIFIVE_RVV_LOAD_ACOL_STRIDED(i)                                 \
     do {                                                                       \
         acol_vec = VLSE_V_F(PREC, LMUL)(a_tmp + i * lda, FLT_SIZE * inca, vl); \
     } while (0)
 
-#define DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF)      \
+#define DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST(LOAD_SUF)      \
     do {                                                 \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);        \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0);        \
         acc0 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);        \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1);        \
         acc1 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);        \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2);        \
         acc2 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);        \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3);        \
         acc3 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4);        \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(4);        \
         acc4 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5);        \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(5);        \
         acc5 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
     } while (0)
 
-#define DOTXF_SIFIVE_X280_LOOP_BODY(LOAD_SUF)                      \
+#define DOTXF_SIFIVE_RVV_LOOP_BODY(LOAD_SUF)                      \
     do {                                                           \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                  \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0);                  \
         acc0 = VFMACC_VV_TU(PREC, LMUL)(acc0, acol_vec, xvec, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                  \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1);                  \
         acc1 = VFMACC_VV_TU(PREC, LMUL)(acc1, acol_vec, xvec, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                  \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2);                  \
         acc2 = VFMACC_VV_TU(PREC, LMUL)(acc2, acol_vec, xvec, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                  \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3);                  \
         acc3 = VFMACC_VV_TU(PREC, LMUL)(acc3, acol_vec, xvec, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4);                  \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(4);                  \
         acc4 = VFMACC_VV_TU(PREC, LMUL)(acc4, acol_vec, xvec, vl); \
-        DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5);                  \
+        DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(5);                  \
         acc5 = VFMACC_VV_TU(PREC, LMUL)(acc5, acol_vec, xvec, vl); \
     } while (0)
 
-#define DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF)           \
+#define DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST(LOAD_SUF)           \
     do {                                                         \
         switch (b) {                                             \
             case 5:                                              \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4);        \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(4);        \
                 acc4 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
             case 4:                                              \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);        \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3);        \
                 acc3 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
             case 3:                                              \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);        \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2);        \
                 acc2 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
             case 2:                                              \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);        \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1);        \
                 acc1 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
             case 1:                                              \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);        \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0);        \
                 acc0 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \
         }                                                        \
     } while (0)
 
-#define DOTXF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF)                           \
+#define DOTXF_SIFIVE_RVV_CLEANUP_BODY(LOAD_SUF)                           \
     do {                                                                   \
         switch (b) {                                                       \
             case 5:                                                        \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4);                  \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(4);                  \
                 acc4 = VFMACC_VV_TU(PREC, LMUL)(acc4, acol_vec, xvec, vl); \
             case 4:                                                        \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3);                  \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(3);                  \
                 acc3 = VFMACC_VV_TU(PREC, LMUL)(acc3, acol_vec, xvec, vl); \
             case 3:                                                        \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2);                  \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(2);                  \
                 acc2 = VFMACC_VV_TU(PREC, LMUL)(acc2, acol_vec, xvec, vl); \
             case 2:                                                        \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1);                  \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(1);                  \
                 acc1 = VFMACC_VV_TU(PREC, LMUL)(acc1, acol_vec, xvec, vl); \
             case 1:                                                        \
-                DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0);                  \
+                DOTXF_SIFIVE_RVV_LOAD_ACOL##LOAD_SUF(0);                  \
                 acc0 = VFMACC_VV_TU(PREC, LMUL)(acc0, acol_vec, xvec, vl); \
         }                                                                  \
     } while (0)
 
-#define DOTXF_SIFIVE_X280_REDUCE(i)                                         \
+#define DOTXF_SIFIVE_RVV_REDUCE(i)                                         \
     do {                                                                    \
         RVV_TYPE_F(PREC, m1) dot##i = VFMV_S_F(PREC, m1)(0., 1);            \
         dot##i = VF_REDUSUM_VS(PREC, LMUL)(acc##i, dot##i, m);              \
@@ -173,16 +173,16 @@ DOTXF(PRECISION_CHAR, void)
                 xvec = VLSE_V_F(PREC, LMUL)(x_tmp, FLT_SIZE * incx, vl);
             if (first) {
                 if (inca == 1)
-                    DOTXF_SIFIVE_X280_LOOP_BODY_FIRST();
+                    DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST();
                 else
-                    DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED);
+                    DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST(_STRIDED);
                 first = false;
             }
             else {
                 if (inca == 1)
-                    DOTXF_SIFIVE_X280_LOOP_BODY();
+                    DOTXF_SIFIVE_RVV_LOOP_BODY();
                 else
-                    DOTXF_SIFIVE_X280_LOOP_BODY(_STRIDED);
+                    DOTXF_SIFIVE_RVV_LOOP_BODY(_STRIDED);
             }
               
             a_tmp += vl * inca;
@@ -190,12 +190,12 @@ DOTXF(PRECISION_CHAR, void)
             avl -= vl;
         }
 
-        DOTXF_SIFIVE_X280_REDUCE(0);
-        DOTXF_SIFIVE_X280_REDUCE(1);
-        DOTXF_SIFIVE_X280_REDUCE(2);
-        DOTXF_SIFIVE_X280_REDUCE(3);
-        DOTXF_SIFIVE_X280_REDUCE(4);
-        DOTXF_SIFIVE_X280_REDUCE(5);
+        DOTXF_SIFIVE_RVV_REDUCE(0);
+        DOTXF_SIFIVE_RVV_REDUCE(1);
+        DOTXF_SIFIVE_RVV_REDUCE(2);
+        DOTXF_SIFIVE_RVV_REDUCE(3);
+        DOTXF_SIFIVE_RVV_REDUCE(4);
+        DOTXF_SIFIVE_RVV_REDUCE(5);
 
         a += 6 * lda;
         y += 6 * incy;
@@ -217,16 +217,16 @@ DOTXF(PRECISION_CHAR, void)
                 xvec = VLSE_V_F(PREC, LMUL)(x_tmp, FLT_SIZE * incx, vl);
             if (first) {
                 if (inca == 1)
-                    DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST();
+                    DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST();
                 else
-                    DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED);
+                    DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST(_STRIDED);
                 first = false;
             }
             else {
                 if (inca == 1)
-                    DOTXF_SIFIVE_X280_CLEANUP_BODY();
+                    DOTXF_SIFIVE_RVV_CLEANUP_BODY();
                 else
-                    DOTXF_SIFIVE_X280_CLEANUP_BODY(_STRIDED);
+                    DOTXF_SIFIVE_RVV_CLEANUP_BODY(_STRIDED);
             }
 
             a_tmp += vl * inca;
@@ -236,27 +236,27 @@ DOTXF(PRECISION_CHAR, void)
 
         switch (b) {
             case 5:
-                DOTXF_SIFIVE_X280_REDUCE(4);
+                DOTXF_SIFIVE_RVV_REDUCE(4);
             case 4:
-                DOTXF_SIFIVE_X280_REDUCE(3);
+                DOTXF_SIFIVE_RVV_REDUCE(3);
             case 3:
-                DOTXF_SIFIVE_X280_REDUCE(2);
+                DOTXF_SIFIVE_RVV_REDUCE(2);
             case 2:
-                DOTXF_SIFIVE_X280_REDUCE(1);
+                DOTXF_SIFIVE_RVV_REDUCE(1);
             case 1:
-                DOTXF_SIFIVE_X280_REDUCE(0);
+                DOTXF_SIFIVE_RVV_REDUCE(0);
         }
 
     }
     return;
 }
 
-#undef DOTXF_SIFIVE_X280_LOAD_ACOL
-#undef DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED
-#undef DOTXF_SIFIVE_X280_LOOP_BODY_FIRST
-#undef DOTXF_SIFIVE_X280_LOOP_BODY
-#undef DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST
-#undef DOTXF_SIFIVE_X280_CLEANUP_BODY
-#undef DOTXF_SIFIVE_X280_REDUCE
+#undef DOTXF_SIFIVE_RVV_LOAD_ACOL
+#undef DOTXF_SIFIVE_RVV_LOAD_ACOL_STRIDED
+#undef DOTXF_SIFIVE_RVV_LOOP_BODY_FIRST
+#undef DOTXF_SIFIVE_RVV_LOOP_BODY
+#undef DOTXF_SIFIVE_RVV_CLEANUP_BODY_FIRST
+#undef DOTXF_SIFIVE_RVV_CLEANUP_BODY
+#undef DOTXF_SIFIVE_RVV_REDUCE
 
 #endif // DOTXF
diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c b/kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr.c
similarity index 84%
rename from kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c
rename to kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr.c
index 119872197..567a2a2b5 100644
--- a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr.c
@@ -40,7 +40,7 @@
 #include <stdint.h>
 #include <riscv_vector.h>
 
-#define PACKM_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##packm_sifive_x280_intr(\
+#define PACKM_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##packm_sifive_rvv_intr(\
          conj_t           conja,                     \
          pack_t           schema,                    \
          dim_t            cdim,                      \
@@ -57,8 +57,11 @@
 
 #define PACKM(...)  PACKM_(__VA_ARGS__)
 
-#define REF_KERNEL_(PRECISION_CHAR) bli_##PRECISION_CHAR##PRECISION_CHAR##packm_sifive_x280_ref
-#define REF_KERNEL(PRECISION_CHAR) REF_KERNEL_(PRECISION_CHAR)
+#define BLI_SCAL2BBS_MXN_(PRECISION_CHAR) bli_##PRECISION_CHAR##scal2bbs_mxn
+#define BLI_SCAL2BBS_MXN(PRECISION_CHAR) BLI_SCAL2BBS_MXN_(PRECISION_CHAR)
+
+#define BLI_SET0S_EDGE_(PRECISION_CHAR) bli_##PRECISION_CHAR##set0s_edge
+#define BLI_SET0S_EDGE(PRECISION_CHAR) BLI_SET0S_EDGE_(PRECISION_CHAR)
 
 // LMUL is the LMUL used when a is "row major" (lda == 1). Since we use
 // segment stores with more than 4 fields, this is usually m1.
@@ -74,9 +77,9 @@
 #define LMUL_NR m4
 #define FLT_SIZE sizeof(float)
 #define MR 7
-#define NR 64
+#define NR ( 4 * __riscv_vlenb() / 4 )
 
-#include "./bli_packm_sifive_x280_intr_real.c"
+#include "./bli_packm_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -97,9 +100,9 @@
 #define LMUL_NR m4
 #define FLT_SIZE sizeof(double)
 #define MR 7
-#define NR 32
+#define NR ( 4 * __riscv_vlenb() / 8 )
 
-#include "./bli_packm_sifive_x280_intr_real.c"
+#include "./bli_packm_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -121,9 +124,9 @@
 #define LMUL_NR m2
 #define FLT_SIZE sizeof(float)
 #define MR 6
-#define NR 32
+#define NR ( 2 * __riscv_vlenb() / 4 )
 
-#include "./bli_packm_sifive_x280_intr_complex.c"
+#include "./bli_packm_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -146,9 +149,9 @@
 #define LMUL_NR m2
 #define FLT_SIZE sizeof(double)
 #define MR 6
-#define NR 16
+#define NR ( 2 * __riscv_vlenb() / 8 )
 
-#include "./bli_packm_sifive_x280_intr_complex.c"
+#include "./bli_packm_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -161,8 +164,11 @@
 #undef MR
 #undef NR
 
-#undef REF_KERNEL_
-#undef REF_KERNEL
+#undef BLI_SCAL2BBS_MXN_
+#undef BLI_SCAL2BBS_MXN
+
+#undef BLI_SET0S_EDGE_
+#undef BLI_SET0S_EDGE
 
 #undef PACKM
 #undef PACKM_
diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c b/kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr_complex.c
similarity index 99%
rename from kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr_complex.c
index ee49090dc..2173be3a7 100644
--- a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c
+++ b/kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr_complex.c
@@ -522,20 +522,21 @@ PACKM(PRECISION_CHAR, void)
     // generic kernel
     else
     {
-        REF_KERNEL(PRECISION_CHAR)
+        BLI_SCAL2BBS_MXN(PRECISION_CHAR)
         (
           conja,
-          schema,
           cdim,
-          cdim_max,
-          cdim_bcast,
           n,
-          n_max,
           kappa,
-          a, inca, lda,
-          p,       ldp,
-          params,
-          cntx
+          a,       inca, lda,
+          p, cdim_bcast, ldp
+        );
+
+        BLI_SET0S_EDGE(PRECISION_CHAR)
+        (
+          cdim*cdim_bcast, cdim_max*cdim_bcast,
+          n, n_max,
+          p, ldp
         );
     }
 
diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c b/kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr_real.c
similarity index 98%
rename from kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr_real.c
index 741714d60..c853765a2 100644
--- a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c
+++ b/kernels/sifive_rvv/1m/bli_packm_sifive_rvv_intr/bli_packm_sifive_rvv_intr_real.c
@@ -37,8 +37,7 @@
 
 PACKM(PRECISION_CHAR, void)
 {
-    (void) conja; // Suppress unused parameter warnings
-    (void) schema;
+    (void) schema; // Suppress unused parameter warnings
     (void) params;
     (void) cntx;
     const DATATYPE* restrict kappa = kappa_;
@@ -341,20 +340,21 @@ PACKM(PRECISION_CHAR, void)
     // generic kernel
     else
     {
-        REF_KERNEL(PRECISION_CHAR)
+        BLI_SCAL2BBS_MXN(PRECISION_CHAR)
         (
           conja,
-          schema,
           cdim,
-          cdim_max,
-          cdim_bcast,
           n,
-          n_max,
           kappa,
-          a, inca, lda,
-          p,       ldp,
-          params,
-          cntx
+          a,       inca, lda,
+          p, cdim_bcast, ldp
+        );
+
+        BLI_SET0S_EDGE(PRECISION_CHAR)
+        (
+          cdim*cdim_bcast, cdim_max*cdim_bcast,
+          n, n_max,
+          p, ldp
         );
     }
 
diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c b/kernels/sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr.c
similarity index 90%
rename from kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c
rename to kernels/sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr.c
index 664d4616f..97722f13c 100644
--- a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr.c
@@ -39,7 +39,7 @@
 #include <riscv_vector.h>
 #include <stdint.h>
 
-#define GEMM_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemm_sifive_x280_intr(\
+#define GEMM_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemm_sifive_rvv_intr(\
          dim_t               m,                        \
          dim_t               n,                        \
          dim_t               k,                        \
@@ -61,9 +61,9 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 #define PACKMR 8
-#define PACKNR 64
+#define PACKNR ( 4 * __riscv_vlenb() / 4 )
 
-#include "./bli_gemm_sifive_x280_intr_real.c"
+#include "./bli_gemm_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -80,9 +80,9 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 #define PACKMR 8
-#define PACKNR 32
+#define PACKNR ( 4 * __riscv_vlenb() / 8 )
 
-#include "./bli_gemm_sifive_x280_intr_real.c"
+#include "./bli_gemm_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -100,9 +100,9 @@
 #define LMUL m2
 #define FLT_SIZE sizeof(float)
 #define PACKMR 8
-#define PACKNR 32
+#define PACKNR ( 2 * __riscv_vlenb() / 4 )
 
-#include "./bli_gemm_sifive_x280_intr_complex.c"
+#include "./bli_gemm_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -121,9 +121,9 @@
 #define LMUL m2
 #define FLT_SIZE sizeof(double)
 #define PACKMR 8
-#define PACKNR 16
+#define PACKNR ( 2 * __riscv_vlenb() / 8 )
 
-#include "./bli_gemm_sifive_x280_intr_complex.c"
+#include "./bli_gemm_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c b/kernels/sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr_complex.c
similarity index 100%
rename from kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr_complex.c
diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c b/kernels/sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr_real.c
similarity index 100%
rename from kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/3/bli_gemm_sifive_rvv_intr/bli_gemm_sifive_rvv_intr_real.c
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c b/kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr.c
similarity index 89%
rename from kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c
rename to kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr.c
index 687abec18..fa4ea309d 100644
--- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c
+++ b/kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr.c
@@ -35,11 +35,11 @@
 // clang-format off
 #include "blis.h"
 #include "../../riscv_cmul_macros_intr.h"
-#include "../../bli_kernels_sifive_x280.h"
+#include "../../bli_kernels_sifive_rvv.h"
 #include <stdint.h>
 #include <riscv_vector.h>
 
-#define GEMMTRSM_L(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_l_sifive_x280_intr(\
+#define GEMMTRSM_L(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_l_sifive_rvv_intr(\
           dim_t               m,      \
           dim_t               n,      \
           dim_t               k,      \
@@ -55,7 +55,7 @@
     const cntx_t*    restrict cntx    \
     )
 
-#define GEMMTRSM_U(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_u_sifive_x280_intr(\
+#define GEMMTRSM_U(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_u_sifive_rvv_intr(\
           dim_t               m,      \
           dim_t               n,      \
           dim_t               k,      \
@@ -80,9 +80,9 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(float)
 #define PACKMR 8
-#define PACKNR 64
+#define PACKNR ( 4 * __riscv_vlenb() / 4 )
 
-#include "./bli_gemmtrsm_sifive_x280_intr_real.c"
+#include "./bli_gemmtrsm_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -99,9 +99,9 @@
 #define LMUL m4
 #define FLT_SIZE sizeof(double)
 #define PACKMR 8
-#define PACKNR 32
+#define PACKNR ( 4 * __riscv_vlenb() / 8 )
 
-#include "./bli_gemmtrsm_sifive_x280_intr_real.c"
+#include "./bli_gemmtrsm_sifive_rvv_intr_real.c"
 
 #undef DATATYPE
 #undef PRECISION_CHAR
@@ -119,9 +119,9 @@
 #define LMUL m2
 #define FLT_SIZE sizeof(float)
 #define PACKMR 8
-#define PACKNR 32
+#define PACKNR ( 2 * __riscv_vlenb() / 4 )
 
-#include "./bli_gemmtrsm_sifive_x280_intr_complex.c"
+#include "./bli_gemmtrsm_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
@@ -140,9 +140,9 @@
 #define LMUL m2
 #define FLT_SIZE sizeof(double)
 #define PACKMR 8
-#define PACKNR 16
+#define PACKNR ( 2 * __riscv_vlenb() / 8 )
 
-#include "./bli_gemmtrsm_sifive_x280_intr_complex.c"
+#include "./bli_gemmtrsm_sifive_rvv_intr_complex.c"
 
 #undef DATATYPE
 #undef BASE_DT
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c b/kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr_complex.c
similarity index 99%
rename from kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c
rename to kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr_complex.c
index 88ea04b7a..7f2fc1c89 100644
--- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c
+++ b/kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr_complex.c
@@ -35,7 +35,7 @@
 // clang-format off
 #ifdef GEMMTRSM
 
-#define GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) bli_##PRECISION_CHAR##gemmtrsm_sifive_x280_intr
+#define GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) bli_##PRECISION_CHAR##gemmtrsm_sifive_rvv_intr
 #define GEMMTRSM_IMPL_NAME(PRECISION_CHAR) GEMMTRSM_IMPL_NAME_(PRECISION_CHAR)
 
 static void GEMMTRSM_IMPL_NAME(PRECISION_CHAR)
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c b/kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr_real.c
similarity index 99%
rename from kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c
rename to kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr_real.c
index 7c3c3b8b7..b628e4cc1 100644
--- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c
+++ b/kernels/sifive_rvv/3/bli_gemmtrsm_sifive_rvv_intr/bli_gemmtrsm_sifive_rvv_intr_real.c
@@ -35,7 +35,7 @@
 // clang-format off
 #ifdef GEMMTRSM
 
-#define GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) bli_##PRECISION_CHAR##gemmtrsm_sifive_x280_intr
+#define GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) bli_##PRECISION_CHAR##gemmtrsm_sifive_rvv_intr
 #define GEMMTRSM_IMPL_NAME(PRECISION_CHAR) GEMMTRSM_IMPL_NAME_(PRECISION_CHAR)
 
 static void GEMMTRSM_IMPL_NAME(PRECISION_CHAR)
diff --git a/kernels/sifive_rvv/bli_kernels_sifive_rvv.h b/kernels/sifive_rvv/bli_kernels_sifive_rvv.h
new file mode 100644
index 000000000..f9f0f8995
--- /dev/null
+++ b/kernels/sifive_rvv/bli_kernels_sifive_rvv.h
@@ -0,0 +1,162 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, SiFive, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// Level 1
+ADDV_KER_PROT(float,        s, addv_sifive_rvv_intr)
+ADDV_KER_PROT(double,       d, addv_sifive_rvv_intr)
+ADDV_KER_PROT(scomplex,     c, addv_sifive_rvv_intr)
+ADDV_KER_PROT(dcomplex,     z, addv_sifive_rvv_intr)
+
+AMAXV_KER_PROT(float,       s, amaxv_sifive_rvv_intr)
+AMAXV_KER_PROT(double,      d, amaxv_sifive_rvv_intr)
+AMAXV_KER_PROT(scomplex,    c, amaxv_sifive_rvv_intr)
+AMAXV_KER_PROT(dcomplex,    z, amaxv_sifive_rvv_intr)
+
+AXPBYV_KER_PROT(float,      s, axpbyv_sifive_rvv_intr)
+AXPBYV_KER_PROT(double,     d, axpbyv_sifive_rvv_intr)
+AXPBYV_KER_PROT(scomplex,   c, axpbyv_sifive_rvv_intr)
+AXPBYV_KER_PROT(dcomplex,   z, axpbyv_sifive_rvv_intr)
+
+AXPYV_KER_PROT(float,       s, axpyv_sifive_rvv_intr)
+AXPYV_KER_PROT(double,      d, axpyv_sifive_rvv_intr)
+AXPYV_KER_PROT(scomplex,    c, axpyv_sifive_rvv_intr)
+AXPYV_KER_PROT(dcomplex,    z, axpyv_sifive_rvv_intr)
+
+COPYV_KER_PROT(float,       s, copyv_sifive_rvv_intr)
+COPYV_KER_PROT(double,      d, copyv_sifive_rvv_intr)
+COPYV_KER_PROT(scomplex,    c, copyv_sifive_rvv_intr)
+COPYV_KER_PROT(dcomplex,    z, copyv_sifive_rvv_intr)
+
+DOTV_KER_PROT(float,        s, dotv_sifive_rvv_intr)
+DOTV_KER_PROT(double,       d, dotv_sifive_rvv_intr)
+DOTV_KER_PROT(scomplex,     c, dotv_sifive_rvv_intr)
+DOTV_KER_PROT(dcomplex,     z, dotv_sifive_rvv_intr)
+
+DOTXV_KER_PROT(float,       s, dotxv_sifive_rvv_intr)
+DOTXV_KER_PROT(double,      d, dotxv_sifive_rvv_intr)
+DOTXV_KER_PROT(scomplex,    c, dotxv_sifive_rvv_intr)
+DOTXV_KER_PROT(dcomplex,    z, dotxv_sifive_rvv_intr)
+
+INVERTV_KER_PROT(float,     s, invertv_sifive_rvv_intr)
+INVERTV_KER_PROT(double,    d, invertv_sifive_rvv_intr)
+INVERTV_KER_PROT(scomplex,  c, invertv_sifive_rvv_intr)
+INVERTV_KER_PROT(dcomplex,  z, invertv_sifive_rvv_intr)
+
+INVSCALV_KER_PROT(float,    s, invscalv_sifive_rvv_intr)
+INVSCALV_KER_PROT(double,   d, invscalv_sifive_rvv_intr)
+INVSCALV_KER_PROT(scomplex, c, invscalv_sifive_rvv_intr)
+INVSCALV_KER_PROT(dcomplex, z, invscalv_sifive_rvv_intr)
+
+SCAL2V_KER_PROT(float,      s, scal2v_sifive_rvv_intr)
+SCAL2V_KER_PROT(double,     d, scal2v_sifive_rvv_intr)
+SCAL2V_KER_PROT(scomplex,   c, scal2v_sifive_rvv_intr)
+SCAL2V_KER_PROT(dcomplex,   z, scal2v_sifive_rvv_intr)
+
+SCALV_KER_PROT(float,       s, scalv_sifive_rvv_intr)
+SCALV_KER_PROT(double,      d, scalv_sifive_rvv_intr)
+SCALV_KER_PROT(scomplex,    c, scalv_sifive_rvv_intr)
+SCALV_KER_PROT(dcomplex,    z, scalv_sifive_rvv_intr)
+
+SETV_KER_PROT(float,        s, setv_sifive_rvv_intr)
+SETV_KER_PROT(double,       d, setv_sifive_rvv_intr)
+SETV_KER_PROT(scomplex,     c, setv_sifive_rvv_intr)
+SETV_KER_PROT(dcomplex,     z, setv_sifive_rvv_intr)
+
+SUBV_KER_PROT(float,        s, subv_sifive_rvv_intr)
+SUBV_KER_PROT(double,       d, subv_sifive_rvv_intr)
+SUBV_KER_PROT(scomplex,     c, subv_sifive_rvv_intr)
+SUBV_KER_PROT(dcomplex,     z, subv_sifive_rvv_intr)
+
+SWAPV_KER_PROT(float,       s, swapv_sifive_rvv_intr)
+SWAPV_KER_PROT(double,      d, swapv_sifive_rvv_intr)
+SWAPV_KER_PROT(scomplex,    c, swapv_sifive_rvv_intr)
+SWAPV_KER_PROT(dcomplex,    z, swapv_sifive_rvv_intr)
+
+XPBYV_KER_PROT(float,       s, xpbyv_sifive_rvv_intr)
+XPBYV_KER_PROT(double,      d, xpbyv_sifive_rvv_intr)
+XPBYV_KER_PROT(scomplex,    c, xpbyv_sifive_rvv_intr)
+XPBYV_KER_PROT(dcomplex,    z, xpbyv_sifive_rvv_intr)
+
+// Level 1f
+AXPY2V_KER_PROT(float,      s, axpy2v_sifive_rvv_intr)
+AXPY2V_KER_PROT(double,     d, axpy2v_sifive_rvv_intr)
+AXPY2V_KER_PROT(scomplex,   c, axpy2v_sifive_rvv_intr)
+AXPY2V_KER_PROT(dcomplex,   z, axpy2v_sifive_rvv_intr)
+
+AXPYF_KER_PROT(float,       s, axpyf_sifive_rvv_intr)
+AXPYF_KER_PROT(double,      d, axpyf_sifive_rvv_intr)
+AXPYF_KER_PROT(scomplex,    c, axpyf_sifive_rvv_intr)
+AXPYF_KER_PROT(dcomplex,    z, axpyf_sifive_rvv_intr)
+
+DOTXF_KER_PROT(float,       s, dotxf_sifive_rvv_intr)
+DOTXF_KER_PROT(double,      d, dotxf_sifive_rvv_intr)
+DOTXF_KER_PROT(scomplex,    c, dotxf_sifive_rvv_intr)
+DOTXF_KER_PROT(dcomplex,    z, dotxf_sifive_rvv_intr)
+
+DOTAXPYV_KER_PROT(float,    s, dotaxpyv_sifive_rvv_intr)
+DOTAXPYV_KER_PROT(double,   d, dotaxpyv_sifive_rvv_intr)
+DOTAXPYV_KER_PROT(scomplex, c, dotaxpyv_sifive_rvv_intr)
+DOTAXPYV_KER_PROT(dcomplex, z, dotaxpyv_sifive_rvv_intr)
+
+DOTXAXPYF_KER_PROT(float,   s, dotxaxpyf_sifive_rvv_intr)
+DOTXAXPYF_KER_PROT(double,  d, dotxaxpyf_sifive_rvv_intr)
+DOTXAXPYF_KER_PROT(scomplex,c, dotxaxpyf_sifive_rvv_intr)
+DOTXAXPYF_KER_PROT(dcomplex,z, dotxaxpyf_sifive_rvv_intr)
+
+// Level 1m
+PACKM_KER_PROT(float,       s, packm_sifive_rvv_intr)
+PACKM_KER_PROT(double,      d, packm_sifive_rvv_intr)
+PACKM_KER_PROT(scomplex,    c, packm_sifive_rvv_intr)
+PACKM_KER_PROT(dcomplex,    z, packm_sifive_rvv_intr)
+
+// Reference 1m
+PACKM_KER_PROT(float,       ss, packm_sifive_rvv_ref)
+PACKM_KER_PROT(double,      dd, packm_sifive_rvv_ref)
+PACKM_KER_PROT(scomplex,    cc, packm_sifive_rvv_ref)
+PACKM_KER_PROT(dcomplex,    zz, packm_sifive_rvv_ref)
+
+// Level 3
+GEMM_UKR_PROT(float,        s, gemm_sifive_rvv_intr)
+GEMM_UKR_PROT(double,       d, gemm_sifive_rvv_intr)
+GEMM_UKR_PROT(scomplex,     c, gemm_sifive_rvv_intr)
+GEMM_UKR_PROT(dcomplex,     z, gemm_sifive_rvv_intr)
+
+GEMMTRSM_UKR_PROT(float,    s, gemmtrsm_l_sifive_rvv_intr)
+GEMMTRSM_UKR_PROT(double,   d, gemmtrsm_l_sifive_rvv_intr)
+GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_l_sifive_rvv_intr)
+GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_l_sifive_rvv_intr)
+GEMMTRSM_UKR_PROT(float,    s, gemmtrsm_u_sifive_rvv_intr)
+GEMMTRSM_UKR_PROT(double,   d, gemmtrsm_u_sifive_rvv_intr)
+GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_u_sifive_rvv_intr)
+GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_u_sifive_rvv_intr)
diff --git a/kernels/sifive_x280/riscv_cmul_macros_intr.h b/kernels/sifive_rvv/riscv_cmul_macros_intr.h
similarity index 100%
rename from kernels/sifive_x280/riscv_cmul_macros_intr.h
rename to kernels/sifive_rvv/riscv_cmul_macros_intr.h
diff --git a/kernels/sifive_x280/riscv_overloaded_intrinsics.h b/kernels/sifive_rvv/riscv_overloaded_intrinsics.h
similarity index 99%
rename from kernels/sifive_x280/riscv_overloaded_intrinsics.h
rename to kernels/sifive_rvv/riscv_overloaded_intrinsics.h
index 44f70f272..794c44c09 100644
--- a/kernels/sifive_x280/riscv_overloaded_intrinsics.h
+++ b/kernels/sifive_rvv/riscv_overloaded_intrinsics.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2023, SiFive, Inc.
+   Copyright (C) 2024, SiFive, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
diff --git a/kernels/sifive_x280/bli_kernels_sifive_x280.h b/kernels/sifive_x280/bli_kernels_sifive_x280.h
deleted file mode 100644
index ff7b445c4..000000000
--- a/kernels/sifive_x280/bli_kernels_sifive_x280.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-// Level 1
-ADDV_KER_PROT(float,        s, addv_sifive_x280_intr)
-ADDV_KER_PROT(double,       d, addv_sifive_x280_intr)
-ADDV_KER_PROT(scomplex,     c, addv_sifive_x280_intr)
-ADDV_KER_PROT(dcomplex,     z, addv_sifive_x280_intr)
-
-AMAXV_KER_PROT(float,       s, amaxv_sifive_x280_intr)
-AMAXV_KER_PROT(double,      d, amaxv_sifive_x280_intr)
-AMAXV_KER_PROT(scomplex,    c, amaxv_sifive_x280_intr)
-AMAXV_KER_PROT(dcomplex,    z, amaxv_sifive_x280_intr)
-
-AXPBYV_KER_PROT(float,      s, axpbyv_sifive_x280_intr)
-AXPBYV_KER_PROT(double,     d, axpbyv_sifive_x280_intr)
-AXPBYV_KER_PROT(scomplex,   c, axpbyv_sifive_x280_intr)
-AXPBYV_KER_PROT(dcomplex,   z, axpbyv_sifive_x280_intr)
-
-AXPYV_KER_PROT(float,       s, axpyv_sifive_x280_intr)
-AXPYV_KER_PROT(double,      d, axpyv_sifive_x280_intr)
-AXPYV_KER_PROT(scomplex,    c, axpyv_sifive_x280_intr)
-AXPYV_KER_PROT(dcomplex,    z, axpyv_sifive_x280_intr)
-
-COPYV_KER_PROT(float,       s, copyv_sifive_x280_intr)
-COPYV_KER_PROT(double,      d, copyv_sifive_x280_intr)
-COPYV_KER_PROT(scomplex,    c, copyv_sifive_x280_intr)
-COPYV_KER_PROT(dcomplex,    z, copyv_sifive_x280_intr)
-
-DOTV_KER_PROT(float,        s, dotv_sifive_x280_intr)
-DOTV_KER_PROT(double,       d, dotv_sifive_x280_intr)
-DOTV_KER_PROT(scomplex,     c, dotv_sifive_x280_intr)
-DOTV_KER_PROT(dcomplex,     z, dotv_sifive_x280_intr)
-
-DOTXV_KER_PROT(float,       s, dotxv_sifive_x280_intr)
-DOTXV_KER_PROT(double,      d, dotxv_sifive_x280_intr)
-DOTXV_KER_PROT(scomplex,    c, dotxv_sifive_x280_intr)
-DOTXV_KER_PROT(dcomplex,    z, dotxv_sifive_x280_intr)
-
-INVERTV_KER_PROT(float,     s, invertv_sifive_x280_intr)
-INVERTV_KER_PROT(double,    d, invertv_sifive_x280_intr)
-INVERTV_KER_PROT(scomplex,  c, invertv_sifive_x280_intr)
-INVERTV_KER_PROT(dcomplex,  z, invertv_sifive_x280_intr)
-
-INVSCALV_KER_PROT(float,    s, invscalv_sifive_x280_intr)
-INVSCALV_KER_PROT(double,   d, invscalv_sifive_x280_intr)
-INVSCALV_KER_PROT(scomplex, c, invscalv_sifive_x280_intr)
-INVSCALV_KER_PROT(dcomplex, z, invscalv_sifive_x280_intr)
-
-SCAL2V_KER_PROT(float,      s, scal2v_sifive_x280_intr)
-SCAL2V_KER_PROT(double,     d, scal2v_sifive_x280_intr)
-SCAL2V_KER_PROT(scomplex,   c, scal2v_sifive_x280_intr)
-SCAL2V_KER_PROT(dcomplex,   z, scal2v_sifive_x280_intr)
-
-SCALV_KER_PROT(float,       s, scalv_sifive_x280_intr)
-SCALV_KER_PROT(double,      d, scalv_sifive_x280_intr)
-SCALV_KER_PROT(scomplex,    c, scalv_sifive_x280_intr)
-SCALV_KER_PROT(dcomplex,    z, scalv_sifive_x280_intr)
-
-SETV_KER_PROT(float,        s, setv_sifive_x280_intr)
-SETV_KER_PROT(double,       d, setv_sifive_x280_intr)
-SETV_KER_PROT(scomplex,     c, setv_sifive_x280_intr)
-SETV_KER_PROT(dcomplex,     z, setv_sifive_x280_intr)
-
-SUBV_KER_PROT(float,        s, subv_sifive_x280_intr)
-SUBV_KER_PROT(double,       d, subv_sifive_x280_intr)
-SUBV_KER_PROT(scomplex,     c, subv_sifive_x280_intr)
-SUBV_KER_PROT(dcomplex,     z, subv_sifive_x280_intr)
-
-SWAPV_KER_PROT(float,       s, swapv_sifive_x280_intr)
-SWAPV_KER_PROT(double,      d, swapv_sifive_x280_intr)
-SWAPV_KER_PROT(scomplex,    c, swapv_sifive_x280_intr)
-SWAPV_KER_PROT(dcomplex,    z, swapv_sifive_x280_intr)
-
-XPBYV_KER_PROT(float,       s, xpbyv_sifive_x280_intr)
-XPBYV_KER_PROT(double,      d, xpbyv_sifive_x280_intr)
-XPBYV_KER_PROT(scomplex,    c, xpbyv_sifive_x280_intr)
-XPBYV_KER_PROT(dcomplex,    z, xpbyv_sifive_x280_intr)
-
-// Level 1f
-AXPY2V_KER_PROT(float,      s, axpy2v_sifive_x280_intr)
-AXPY2V_KER_PROT(double,     d, axpy2v_sifive_x280_intr)
-AXPY2V_KER_PROT(scomplex,   c, axpy2v_sifive_x280_intr)
-AXPY2V_KER_PROT(dcomplex,   z, axpy2v_sifive_x280_intr)
-
-AXPYF_KER_PROT(float,       s, axpyf_sifive_x280_intr)
-AXPYF_KER_PROT(double,      d, axpyf_sifive_x280_intr)
-AXPYF_KER_PROT(scomplex,    c, axpyf_sifive_x280_intr)
-AXPYF_KER_PROT(dcomplex,    z, axpyf_sifive_x280_intr)
-
-DOTXF_KER_PROT(float,       s, dotxf_sifive_x280_intr)
-DOTXF_KER_PROT(double,      d, dotxf_sifive_x280_intr)
-DOTXF_KER_PROT(scomplex,    c, dotxf_sifive_x280_intr)
-DOTXF_KER_PROT(dcomplex,    z, dotxf_sifive_x280_intr)
-
-DOTAXPYV_KER_PROT(float,    s, dotaxpyv_sifive_x280_intr)
-DOTAXPYV_KER_PROT(double,   d, dotaxpyv_sifive_x280_intr)
-DOTAXPYV_KER_PROT(scomplex, c, dotaxpyv_sifive_x280_intr)
-DOTAXPYV_KER_PROT(dcomplex, z, dotaxpyv_sifive_x280_intr)
-
-DOTXAXPYF_KER_PROT(float,   s, dotxaxpyf_sifive_x280_intr)
-DOTXAXPYF_KER_PROT(double,  d, dotxaxpyf_sifive_x280_intr)
-DOTXAXPYF_KER_PROT(scomplex,c, dotxaxpyf_sifive_x280_intr)
-DOTXAXPYF_KER_PROT(dcomplex,z, dotxaxpyf_sifive_x280_intr)
-
-// Level 1m
-PACKM_KER_PROT(float,       s, packm_sifive_x280_intr)
-PACKM_KER_PROT(double,      d, packm_sifive_x280_intr)
-PACKM_KER_PROT(scomplex,    c, packm_sifive_x280_intr)
-PACKM_KER_PROT(dcomplex,    z, packm_sifive_x280_intr)
-
-// Reference 1m
-PACKM_KER_PROT(float,       ss, packm_sifive_x280_ref)
-PACKM_KER_PROT(double,      dd, packm_sifive_x280_ref)
-PACKM_KER_PROT(scomplex,    cc, packm_sifive_x280_ref)
-PACKM_KER_PROT(dcomplex,    zz, packm_sifive_x280_ref)
-
-// Level 3
-GEMM_UKR_PROT(float,        s, gemm_sifive_x280_intr)
-GEMM_UKR_PROT(double,       d, gemm_sifive_x280_intr)
-GEMM_UKR_PROT(scomplex,     c, gemm_sifive_x280_intr)
-GEMM_UKR_PROT(dcomplex,     z, gemm_sifive_x280_intr)
-
-GEMMTRSM_UKR_PROT(float,    s, gemmtrsm_l_sifive_x280_intr)
-GEMMTRSM_UKR_PROT(double,   d, gemmtrsm_l_sifive_x280_intr)
-GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_l_sifive_x280_intr)
-GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_l_sifive_x280_intr)
-GEMMTRSM_UKR_PROT(float,    s, gemmtrsm_u_sifive_x280_intr)
-GEMMTRSM_UKR_PROT(double,   d, gemmtrsm_u_sifive_x280_intr)
-GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_u_sifive_x280_intr)
-GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_u_sifive_x280_intr)
diff --git a/kernels/sifive_x280/riscv_cmul_macros_asm.h b/kernels/sifive_x280/riscv_cmul_macros_asm.h
deleted file mode 100644
index 9c33fd7bc..000000000
--- a/kernels/sifive_x280/riscv_cmul_macros_asm.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023, SiFive, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-// macros to emit complex multiplication
-// caveat: the destination registers cannot overlap the source registers!
-// rd = rs1 * rs2
-#define cmul(rd_r, rd_i, rs1_r, rs1_i, rs2_r, rs2_i) \
-  \
-  __asm__(FMUL#rd_r", "#rs1_r", "#rs2_r);\
-  __asm__(FMUL#rd_i", "#rs1_r", "#rs2_i);\
-  __asm__(FNMSUB#rd_r", "#rs1_i", "#rs2_i", "#rd_r);\
-  __asm__(FMADD#rd_i", "#rs1_i", "#rs2_r", "#rd_i)
-
-// vd = vs2 * f[rs1]
-#define vcmul_vf(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \
-  \
-  __asm__("vfmul.vf "#vd_r", "#vs2_r", "#rs1_r);\
-  __asm__("vfmul.vf "#vd_i", "#vs2_r", "#rs1_i);\
-  __asm__("vfnmsac.vf "#vd_r", "#rs1_i", "#vs2_i);\
-  __asm__("vfmacc.vf "#vd_i", "#rs1_r", "#vs2_i)
-
-#define vcmul_vf2(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \
-  \
-  __asm__("vfmul.vf "#vd_r", "#vs2_r", %0" : : "f"(rs1_r));\
-  __asm__("vfmul.vf "#vd_i", "#vs2_r", %0" : : "f"(rs1_i));\
-  __asm__("vfnmsac.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\
-  __asm__("vfmacc.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r))
-
-// vd = conj(vs2) * f[rs1]
-#define vcmul_vf_conj(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \
-  \
-  __asm__("vfmul.vf "#vd_r", "#vs2_r", "#rs1_r);\
-  __asm__("vfmul.vf "#vd_i", "#vs2_r", "#rs1_i);\
-  __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\
-  __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i)
-
-#define vcmul_vf_conj2(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \
-  \
-  __asm__("vfmul.vf "#vd_r", "#vs2_r", %0" : : "f"(rs1_r));\
-  __asm__("vfmul.vf "#vd_i", "#vs2_r", %0" : : "f"(rs1_i));\
-  __asm__("vfmacc.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\
-  __asm__("vfnmsac.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r))
-
-// vd += vs2 * f[rs1]
-#define vcmacc_vf(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \
-  \
-  __asm__("vfmacc.vf "#vd_r", "#rs1_r", "#vs2_r);\
-  __asm__("vfmacc.vf "#vd_i", "#rs1_i", "#vs2_r);\
-  __asm__("vfnmsac.vf "#vd_r", "#rs1_i", "#vs2_i);\
-  __asm__("vfmacc.vf "#vd_i", "#rs1_r", "#vs2_i)
-
-#define vcmacc_vf2(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \
-  \
-  __asm__("vfmacc.vf "#vd_r", %0, "#vs2_r : : "f"(rs1_r));\
-  __asm__("vfmacc.vf "#vd_i", %0, "#vs2_r : : "f"(rs1_i));\
-  __asm__("vfnmsac.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\
-  __asm__("vfmacc.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r))
-
-// vd += conj(vs2) * f[rs1]
-#define vcmacc_vf_conj(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \
-  \
-  __asm__("vfmacc.vf "#vd_r", "#rs1_r", "#vs2_r);\
-  __asm__("vfmacc.vf "#vd_i", "#rs1_i", "#vs2_r);\
-  __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\
-  __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i)
-
-// vd -= vs2 * f[rs1]
-#define vcnmsac_vf(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \
-  \
-  __asm__("vfnmsac.vf "#vd_r", "#rs1_r", "#vs2_r);\
-  __asm__("vfnmsac.vf "#vd_i", "#rs1_i", "#vs2_r);\
-  __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\
-  __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i)
-
-// vd = vs2 * vs1
-#define vcmul_vv(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \
-  \
-  __asm__("vfmul.vv "#vd_r", "#vs2_r", "#vs1_r);\
-  __asm__("vfmul.vv "#vd_i", "#vs2_r", "#vs1_i);\
-  __asm__("vfnmsac.vv "#vd_r", "#vs2_i", "#vs1_i);\
-  __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r)
-
-// vd = vs2 * conj(vs1)
-#define vcmul_vv_conj(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \
-  \
-  __asm__("vfmul.vv "#vd_r", "#vs2_r", "#vs1_r);\
-  __asm__("vfmul.vv "#vd_i", "#vs2_r", "#vs1_i);\
-  __asm__("vfmacc.vv "#vd_r", "#vs2_i", "#vs1_i);\
-  __asm__("vfmsac.vv "#vd_i", "#vs2_i", "#vs1_r)
-
-// vd += vs2 * vs1
-#define vcmacc_vv(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \
-  \
-  __asm__("vfmacc.vv "#vd_r", "#vs2_r", "#vs1_r);\
-  __asm__("vfmacc.vv "#vd_i", "#vs2_r", "#vs1_i);\
-  __asm__("vfnmsac.vv "#vd_r", "#vs2_i", "#vs1_i);\
-  __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r)
-
-// vd += vs2 * conj(vs1)
-#define vcmacc_vv_conj(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \
-  \
-  __asm__("vfmacc.vv "#vd_r", "#vs2_r", "#vs1_r);\
-  __asm__("vfnmsac.vv "#vd_i", "#vs2_r", "#vs1_i);\
-  __asm__("vfmacc.vv "#vd_r", "#vs2_i", "#vs1_i);\
-  __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r)
-

From d161545a97b7393be79e45fc9d866e4969938614 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Mon, 20 Jan 2025 14:45:00 -0600
Subject: [PATCH 213/230] Run full "make check" for SDE tests. (#818)

Details:
- Previously, the tests using Intel SDE ran the BLIS testsuite manually. Now, the full `make check` suite is run using SDE as a wrapper for execution.
---
 travis/do_sde.sh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/travis/do_sde.sh b/travis/do_sde.sh
index 3c299e9b4..4f0447778 100755
--- a/travis/do_sde.sh
+++ b/travis/do_sde.sh
@@ -29,9 +29,7 @@ mv $SDE_DIRPATH/$SDE_TARBALL .
 
 tar xvf $SDE_TARBALL
 
-make -j2 testsuite-bin
-cp $DIST_PATH/testsuite/input.general.fast input.general
-cp $DIST_PATH/testsuite/input.operations.fast input.operations
+make -j2 testsuite-bin blastest-bin
 
 TMP=`ldd ./test_libblis.x | grep ld | sed 's/^.*=> //'`
 LD_SO=${TMP%% *}
@@ -47,11 +45,13 @@ done
 
 for ARCH in penryn sandybridge haswell skx knl piledriver steamroller excavator zen; do
     if [ "$ARCH" = "knl" ]; then
-        $SDE -knl -- ./test_libblis.x > output.testsuite
+        TESTSUITE_WRAPPER="$SDE -knl --"
     else
-        $SDE -cpuid_in $DIST_PATH/travis/cpuid/$ARCH.def -- ./test_libblis.x > output.testsuite
+        TESTSUITE_WRAPPER="$SDE -cpuid_in $DIST_PATH/travis/cpuid/$ARCH.def --"
     fi
-    $DIST_PATH/testsuite/check-blistest.sh ./output.testsuite
+
+    make TESTSUITE_WRAPPER="$TESTSUITE_WRAPPER" check
+
     TMP=`grep "active sub-configuration" output.testsuite`
     CONFIG=${TMP##* }
     if [ "$CONFIG" != "$ARCH" ]; then

From a3cfbefec1b0680ce015a96ee195f169e6e8475a Mon Sep 17 00:00:00 2001
From: Igor Zhuravlov <zhuravlov.ip@ya.ru>
Date: Tue, 21 Jan 2025 00:55:20 +0000
Subject: [PATCH 214/230] Fix errors and typos in some docs (#843)

Details:
- Fixes to the documentation:
    1. Some integer-based types were missing.
    2. Some function parameters were missing.
    3. Many interfaces were missing `const`.
- Improved formatting and consistency, removed trailing whitespace.
- Added several missing global constants.
---
 docs/BLISObjectAPI.md  |  739 +++++++++++++-----------
 docs/BLISTypedAPI.md   | 1248 ++++++++++++++++++++--------------------
 docs/KernelsHowTo.md   |   82 +--
 docs/Multithreading.md |    9 +
 docs/Performance.md    |    2 +-
 docs/ReleaseNotes.md   |    2 +-
 6 files changed, 1080 insertions(+), 1002 deletions(-)

diff --git a/docs/BLISObjectAPI.md b/docs/BLISObjectAPI.md
index 01cd1ba37..4917cbee3 100644
--- a/docs/BLISObjectAPI.md
+++ b/docs/BLISObjectAPI.md
@@ -74,24 +74,24 @@ The following tables list various types used throughout the BLIS object API.
 
 ### Integer-based types
 
-| BLIS integer type | Type definition          | Used to represent...                                                 |
-|:------------------|:-------------------------|:---------------------------------------------------------------------|
-| `gint_t`          | `int32_t` or `int64_t`   | general-purpose signed integer; used to define signed integer types. |
-| `guint_t`         | `uint32_t` or `uint64_t` | general-purpose signed integer; used to define signed integer types. |
-| `dim_t`           | `gint_t`                 | matrix and vector dimensions.                                        |
-| `inc_t`           | `gint_t`                 | matrix row/column strides and vector increments.                     |
+| BLIS integer type | Type definition          | Used to represent...                                                     |
+|:------------------|:-------------------------|:-------------------------------------------------------------------------|
+| `gint_t`          | `int32_t` or `int64_t`   | general-purpose signed integer; used to define signed integer types.     |
+| `guint_t`         | `uint32_t` or `uint64_t` | general-purpose unsigned integer; used to define unsigned integer types. |
+| `dim_t`           | `gint_t`                 | matrix and vector dimensions.                                            |
+| `inc_t`           | `gint_t`                 | matrix row/column strides and vector increments.                         |
 | `doff_t`          | `gint_t`                 | matrix diagonal offset: if _k_ < 0, diagonal begins at element (-_k_,0); otherwise diagonal begins at element (0,_k_). |
-| `siz_t`           | `guint_t`                | a byte size or byte offset.                                          |
-| `kerid_t`         | `uint32_t`               | a kernel, block size, operation, or kernel preference ID.        |
+| `siz_t`           | `guint_t`                | a byte size or byte offset, unsigned integer.                            |
+| `kerid_t`         | `uint32_t`               | a kernel, block size, operation, or kernel preference ID.                |
 
 ### Floating-point types
 
-| BLIS fp type      | Type definition                        | Used to represent...              |
-|:------------------|:---------------------------------------|:----------------------------------|
-| `float`           | _N/A_                                  | single-precision real numbers.    |
-| `double`          | _N/A_                                  | double-precision real numbers.    |
-| `scomplex`        | `struct { float real; float imag; }`   | single-precision complex numbers. |
-| `dcomplex`        | `struct { double real; double imag; }` | double-precision complex numbers. |
+| BLIS fp type | Type definition                        | Used to represent...              |
+|:-------------|:---------------------------------------|:----------------------------------|
+| `float`      | _N/A_                                  | single-precision real numbers.    |
+| `double`     | _N/A_                                  | double-precision real numbers.    |
+| `scomplex`   | `struct { float real; float imag; }`   | single-precision complex numbers. |
+| `dcomplex`   | `struct { double real; double imag; }` | double-precision complex numbers. |
 
 ### Enumerated parameter types
 
@@ -114,7 +114,7 @@ The following tables list various types used throughout the BLIS object API.
 | `BLIS_SINGLE_PREC` | contains single-precision elements.        |
 | `BLIS_DOUBLE_PREC` | contains double-precision elements.        |
 
-| `trans_t`                | Semantic meaning: Matrix operand ...            |
+| `trans_t`                | Semantic meaning: Matrix operand...             |
 |:-------------------------|:------------------------------------------------|
 | `BLIS_NO_TRANSPOSE`      | will be used as given.                          |
 | `BLIS_TRANSPOSE`         | will be implicitly transposed.                  |
@@ -144,7 +144,7 @@ The following tables list various types used throughout the BLIS object API.
 | `BLIS_UPPER` | is stored in (and will be accessed only from) the upper triangle. |
 | `BLIS_DENSE` | is stored as a full matrix (ie: in both triangles).               |
 
-| `diag_t`            | Semantic meaning: Matrix operand ...                                       |
+| `diag_t`            | Semantic meaning: Matrix operand...                                        |
 |:--------------------|:---------------------------------------------------------------------------|
 | `BLIS_NONUNIT_DIAG` | has a non-unit diagonal that should be explicitly read from.               |
 | `BLIS_UNIT_DIAG`    | has a unit diagonal that should be implicitly assumed (and not read from). |
@@ -161,6 +161,9 @@ BLIS defines a handful of scalar objects that conveniently represent various con
 |  `BLIS_ZERO`               | ` 0.0`           |
 |  `BLIS_ONE`                | ` 1.0`           |
 |  `BLIS_TWO`                | ` 2.0`           |
+|  `BLIS_MINUS_ONE_I`        | ` 0.0-1.0*i`     |
+|  `BLIS_ONE_I`              | ` 0.0+1.0*i`     |
+|  `BLIS_NAN`                | ` NaN`           |
 
 These objects are polymorphic; each one contains a `float`, `double`, `scomplex`, `dcomplex`, and `gint_t` representation of the constant value in question. They can be used in place of any `obj_t*` operand in any object API function provided that the following criteria are met:
  * The object parameter requires unit dimensions (1x1). (In other words, the function expects a scalar for the operand in question.)
@@ -178,24 +181,24 @@ The functions listed in this document belong to the "basic" interface subset of
 ```c
 void bli_gemm
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c
      );
 ```
 while the expert interface is:
 ```c
 void bli_gemm_ex
      (
-       obj_t*   alpha,
-       obj_t*   a,
-       obj_t*   b,
-       obj_t*   beta,
-       obj_t*   c,
-       cntx_t*  cntx,
-       rntm_t*  rntm
+       const obj_t*   alpha,
+       const obj_t*   a,
+       const obj_t*   b,
+       const obj_t*   beta,
+       const obj_t*   c,
+       const cntx_t*  cntx,
+       const rntm_t*  rntm
      );
 ```
 The expert interface contains two additional parameters: a `cntx_t*` and `rntm_t*`. Note that calling a function from the expert interface with the `cntx_t*` and `rntm_t*` arguments each set to `NULL` is equivalent to calling the corresponding basic interface. Specifically, a `NULL` value passed in for the `cntx_t*` results in a valid context being queried from BLIS, and a `NULL` value passed in for the `rntm_t*` results in the current global settings for multithreading to be used.
@@ -204,7 +207,7 @@ The expert interface contains two additional parameters: a `cntx_t*` and `rntm_t
 
 In general, it is permissible to pass in `NULL` for a `cntx_t*` parameter when calling an expert interface such as `bli_gemm_ex()`. However, there are cases where `NULL` values are not accepted and may result in a segmentation fault. Specifically, the `cntx_t*` argument appears in the interfaces to the `gemm`, `trsm`, and `gemmtrsm` [level-3 microkernels](KernelsHowTo.md#level-3) along with all [level-1v](KernelsHowTo.md#level-1v) and [level-1f](KernelsHowTo.md#level-1f) kernels. There, as a general rule, a valid pointer must be passed in. Whenever a valid context is needed, the developer may query a default context from the global kernel structure (if a context is not already available in the current scope):
 ```c
-cntx_t* bli_gks_query_cntx( void );
+const cntx_t* bli_gks_query_cntx( void );
 ```
 When BLIS is configured to target a configuration family (e.g. `intel64`, `x86_64`), `bli_gks_query_cntx()` will use `cpuid` or an equivalent heuristic to select and and return the appropriate context. When BLIS is configured to target a singleton sub-configuration (e.g. `haswell`, `skx`), `bli_gks_query_cntx()` will unconditionally return a pointer to the context appropriate for the targeted configuration.
 
@@ -386,8 +389,8 @@ Objects initialized via this function should generally not be passed to `bli_obj
 ```c
 void bli_obj_create_conf_to
      (
-       obj_t*  s,
-       obj_t*  d
+       const obj_t*  s,
+             obj_t*  d
      );
 ```
 Initialize an object `d` with dimensions conformal to those of an existing object `s`. Object `d` is initialized with the same row and column strides as those of `s`. However, the structure, uplo, conjugation, and transposition properties of `s` are **not** inherited by `d`.
@@ -427,35 +430,35 @@ Notes for interpreting function descriptions:
 ---
 
 ```c
-num_t bli_obj_dt( obj_t* obj );
+num_t bli_obj_dt( const obj_t* obj );
 ```
 Return the storage datatype property of `obj`.
 
 ---
 
 ```c
-dom_t bli_obj_domain( obj_t* obj );
+dom_t bli_obj_domain( const obj_t* obj );
 ```
 Return the domain component of the storage datatype property of `obj`.
 
 ---
 
 ```c
-prec_t bli_obj_prec( obj_t* obj );
+prec_t bli_obj_prec( const obj_t* obj );
 ```
 Return the precision component of the storage datatype property of `obj`.
 
 ---
 
 ```c
-trans_t bli_obj_conjtrans_status( obj_t* obj );
+trans_t bli_obj_conjtrans_status( const obj_t* obj );
 ```
 Return the `trans_t` property of `obj`, which may indicate transposition, conjugation, both, or neither. Thus, possible return values are `BLIS_NO_TRANSPOSE`, `BLIS_CONJ_NO_TRANSPOSE`, `BLIS_TRANSPOSE`, or `BLIS_CONJ_TRANSPOSE`.
 
 ---
 
 ```c
-trans_t bli_obj_onlytrans_status( obj_t* obj );
+trans_t bli_obj_onlytrans_status( const obj_t* obj );
 ```
 Return the transposition component of the `trans_t` property of `obj`, which may indicate transposition or no transposition.
 Thus, possible return values are `BLIS_NO_TRANSPOSE` or `BLIS_TRANSPOSE`.
@@ -463,7 +466,7 @@ Thus, possible return values are `BLIS_NO_TRANSPOSE` or `BLIS_TRANSPOSE`.
 ---
 
 ```c
-conj_t bli_obj_conj_status( obj_t* obj );
+conj_t bli_obj_conj_status( const obj_t* obj );
 ```
 Return the conjugation component of the `trans_t` property of `obj`, which may indicate conjugation or no conjugation.
 Thus, possible return values are `BLIS_NO_CONJUGATE` or `BLIS_CONJUGATE`.
@@ -471,77 +474,77 @@ Thus, possible return values are `BLIS_NO_CONJUGATE` or `BLIS_CONJUGATE`.
 ---
 
 ```c
-struc_t bli_obj_struc( obj_t* obj );
+struc_t bli_obj_struc( const obj_t* obj );
 ```
 Return the structure property of `obj`.
 
 ---
 
 ```c
-uplo_t bli_obj_uplo( obj_t* obj );
+uplo_t bli_obj_uplo( const obj_t* obj );
 ```
 Return the uplo (i.e., storage) property of `obj`.
 
 ---
 
 ```c
-diag_t bli_obj_diag( obj_t* obj );
+diag_t bli_obj_diag( const obj_t* obj );
 ```
 Return the diagonal property of `obj`.
 
 ---
 
 ```c
-doff_t bli_obj_diag_offset( obj_t* obj );
+doff_t bli_obj_diag_offset( const obj_t* obj );
 ```
 Return the diagonal offset of `obj`. Note that the diagonal offset will be negative, `-i`, if the diagonal begins at element `(-i,0)` and positive `j` if the diagonal begins at element `(0,j)`.
 
 ---
 
 ```c
-dim_t bli_obj_length( obj_t* obj );
+dim_t bli_obj_length( const obj_t* obj );
 ```
 Return the number of rows (or _m_ dimension) of `obj`. This value is the _m_ dimension **before** taking into account the transposition property as indicated by `bli_obj_onlytrans_status()` or `bli_obj_conjtrans_status()`.
 
 ---
 
 ```c
-dim_t bli_obj_width( obj_t* obj );
+dim_t bli_obj_width( const obj_t* obj );
 ```
 Return the number of columns (or _n_ dimension) of `obj`. This value is the _n_ dimension **before** taking into account the transposition property as indicated by `bli_obj_onlytrans_status()` or `bli_obj_conjtrans_status()`.
 
 ---
 
 ```c
-dim_t bli_obj_length_after_trans( obj_t* obj );
+dim_t bli_obj_length_after_trans( const obj_t* obj );
 ```
 Return the number of rows (or _m_ dimension) of `obj` after taking into account the transposition property as indicated by `bli_obj_onlytrans_status()` or `bli_obj_conjtrans_status()`.
 
 ---
 
 ```c
-dim_t bli_obj_width_after_trans( obj_t* obj );
+dim_t bli_obj_width_after_trans( const obj_t* obj );
 ```
 Return the number of columns (or _n_ dimension) of `obj` after taking into account the transposition property as indicated by `bli_obj_onlytrans_status()` or `bli_obj_conjtrans_status()`.
 
 ---
 
 ```c
-inc_t bli_obj_row_stride( obj_t* obj );
+inc_t bli_obj_row_stride( const obj_t* obj );
 ```
 Return the row stride property of `obj`. When storing by columns, the row stride is 1. When storing by rows, the row stride is also sometimes called the _leading dimension_.
 
 ---
 
 ```c
-inc_t bli_obj_col_stride( obj_t* obj );
+inc_t bli_obj_col_stride( const obj_t* obj );
 ```
 Return the column stride property of `obj`. When storing by rows, the column stride is 1. When storing by columns, the column stride is also sometimes called the _leading dimension_.
 
 ---
 
 ```c
-dim_t bli_obj_vector_dim( obj_t* obj );
+dim_t bli_obj_vector_dim( const obj_t* obj );
 ```
 Return the number of elements in a vector object `obj`.
 This function assumes that at least one dimension of `obj` is unit, and that it therefore represents a vector.
@@ -549,7 +552,7 @@ This function assumes that at least one dimension of `obj` is unit, and that it
 ---
 
 ```c
-inc_t bli_obj_vector_inc( obj_t* obj );
+inc_t bli_obj_vector_inc( const obj_t* obj );
 ```
 Return the storage increment of a vector object `obj`.
 This function assumes that at least one dimension of `obj` is unit, and that it therefore represents a vector.
@@ -557,7 +560,7 @@ This function assumes that at least one dimension of `obj` is unit, and that it
 ---
 
 ```c
-void* bli_obj_buffer( obj_t* obj );
+void* bli_obj_buffer( const obj_t* obj );
 ```
 Return the address to the data buffer associated with object `obj`.
 **Note**: The address returned by this buffer will not take into account any subpartitioning. However, this will not be a problem for most casual users.
@@ -565,7 +568,7 @@ Return the address to the data buffer associated with object `obj`.
 ---
 
 ```c
-siz_t bli_obj_elem_size( obj_t* obj );
+siz_t bli_obj_elem_size( const obj_t* obj );
 ```
 Return the size, in bytes, of the storage datatype as indicated by `bli_obj_dt()`.
 
@@ -656,10 +659,10 @@ Modify the properties of `obj` to induce a logical transposition. This function
 ---
 
 ```c
-void bli_obj_alias_to( obj_t* a, obj_t* b );
+void bli_obj_alias_to( const obj_t* a, obj_t* b );
 ```
 Initialize `b` to be a shallow copy, or alias, of `a`. For most people's purposes, this is equivalent to
-```
+```c
   b = a;
 ```
 However, there is at least one field (one that only developers should be concerned with) that is not copied.
@@ -667,14 +670,14 @@ However, there is at least one field (one that only developers should be concern
 ---
 
 ```c
-void bli_obj_real_part( obj_t* c, obj_t* r );
+void bli_obj_real_part( const obj_t* c, obj_t* r );
 ```
 Initialize `r` to be a modified shallow copy of `c` that refers only to the real part of `c`.
 
 ---
 
 ```c
-void bli_obj_imag_part( obj_t* c, obj_t* i );
+void bli_obj_imag_part( const obj_t* c, obj_t* i );
 ```
 Initialize `i` to be a modified shallow copy of `c` that refers only to the imaginary part of `c`.
 
@@ -706,8 +709,8 @@ Level-1v operations perform various level-1 BLAS-like operations on vectors (hen
 ```c
 void bli_addv
      (
-       obj_t*  x,
-       obj_t*  y,
+       const obj_t*  x,
+       const obj_t*  y
      );
 ```
 Perform
@@ -724,8 +727,8 @@ Observed object properties: `conj?(x)`.
 ```c
 void bli_amaxv
      (
-       obj_t*  x,
-       obj_t*  index
+       const obj_t*  x,
+       const obj_t*  index
      );
 ```
 Given a vector of length _n_, return the zero-based index of the element of vector `x` that contains the largest absolute value (or, in the complex domain, the largest complex modulus). The object `index` must be created of type `BLIS_INT`.
@@ -742,9 +745,9 @@ Observed object properties: none.
 ```c
 void bli_axpyv
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y
+       const obj_t*  alpha,
+       const obj_t*  x,
+       const obj_t*  y
      );
 ```
 Perform
@@ -758,13 +761,13 @@ Observed object properties: `conj?(alpha)`, `conj?(x)`.
 ---
 
 #### axpbyv
-```
+```c
 void bli_axpbyv
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y
+       const obj_t*  alpha,
+       const obj_t*  x,
+       const obj_t*  beta,
+       const obj_t*  y
      );
 ```
 Perform
@@ -781,8 +784,8 @@ Observed object properties: `conj?(alpha)`, `conj?(x)`.
 ```c
 void bli_copyv
      (
-       obj_t*  x,
-       obj_t*  y
+       const obj_t*  x,
+       const obj_t*  y
      );
 ```
 Perform
@@ -799,9 +802,9 @@ Observed object properties: `conj?(x)`.
 ```c
 void bli_dotv
      (
-       obj_t*  x,
-       obj_t*  y,
-       obj_t*  rho
+       const obj_t*  x,
+       const obj_t*  y,
+       const obj_t*  rho
      );
 ```
 Perform
@@ -818,11 +821,11 @@ Observed object properties: `conj?(x)`, `conj?(y)`.
 ```c
 void bli_dotxv
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y,
-       obj_t*  beta,
-       obj_t*  rho
+       const obj_t*  alpha,
+       const obj_t*  x,
+       const obj_t*  y,
+       const obj_t*  beta,
+       const obj_t*  rho
      );
 ```
 Perform
@@ -839,7 +842,7 @@ Observed object properties: `conj?(alpha)`, `conj?(beta)`, `conj?(x)`, `conj?(y)
 ```c
 void bli_invertv
      (
-       obj_t*  x
+       const obj_t*  x
      );
 ```
 Invert all elements of an _n_-length vector `x`.
@@ -850,8 +853,8 @@ Invert all elements of an _n_-length vector `x`.
 ```c
 void bli_invscalv
      (
-       obj_t*  alpha,
-       obj_t*  x
+       const obj_t*  alpha,
+       const obj_t*  x
      );
 ```
 Perform
@@ -868,8 +871,8 @@ Observed object properties: `conj?(alpha)`.
 ```c
 void bli_scalv
      (
-       obj_t*  alpha,
-       obj_t*  x
+       const obj_t*  alpha,
+       const obj_t*  x
      );
 ```
 Perform
@@ -886,9 +889,9 @@ Observed object properties: `conj?(alpha)`.
 ```c
 void bli_scal2v
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y
+       const obj_t*  alpha,
+       const obj_t*  x,
+       const obj_t*  y
      );
 ```
 Perform
@@ -905,8 +908,8 @@ Observed object properties: `conj?(alpha)`, `conj?(x)`.
 ```c
 void bli_setv
      (
-       obj_t*  alpha,
-       obj_t*  x
+       const obj_t*  alpha,
+       const obj_t*  x
      );
 ```
 Perform
@@ -923,8 +926,8 @@ Observed object properties: `conj?(alpha)`.
 ```c
 void bli_setrv
      (
-       obj_t*  alpha,
-       obj_t*  x
+       const obj_t*  alpha,
+       const obj_t*  x
      );
 ```
 Perform
@@ -941,8 +944,8 @@ If `x` is real, this operation is equivalent to performing `setv` on `x` with th
 ```c
 void bli_setiv
      (
-       obj_t*  alpha,
-       obj_t*  x
+       const obj_t*  alpha,
+       const obj_t*  x
      );
 ```
 Perform
@@ -959,8 +962,8 @@ If `x` is real, this operation is equivalent to a no-op.
 ```c
 void bli_subv
      (
-       obj_t*  x,
-       obj_t*  y
+       const obj_t*  x,
+       const obj_t*  y
      );
 ```
 Perform
@@ -977,8 +980,8 @@ Observed object properties: `conj?(x)`.
 ```c
 void bli_swapv
      (
-       obj_t*  x,
-       obj_t*  y
+       const obj_t*  x,
+       const obj_t*  y
      );
 ```
 Swap corresponding elements of two _n_-length vectors `x` and `y`.
@@ -986,12 +989,12 @@ Swap corresponding elements of two _n_-length vectors `x` and `y`.
 ---
 
 #### xpbyv
-```
+```c
 void bli_xpbyv
      (
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y
+       const obj_t*  x,
+       const obj_t*  beta,
+       const obj_t*  y
      );
 ```
 Perform
@@ -1020,8 +1023,8 @@ These operations are similar to their level-1m counterparts, except they only re
 ```c
 void bli_addd
      (
-       obj_t*  a,
-       obj_t*  b
+       const obj_t*  a,
+       const obj_t*  b
      );
 ```
 
@@ -1033,9 +1036,9 @@ Observed object properties: `diagoff(A)`, `diag(A)`, `trans?(A)`.
 ```c
 void bli_axpyd
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b
      );
 ```
 
@@ -1047,8 +1050,8 @@ Observed object properties: `conj?(alpha)`, `diagoff(A)`, `diag(A)`, `trans?(A)`
 ```c
 void bli_copyd
      (
-       obj_t*  a,
-       obj_t*  b
+       const obj_t*  a,
+       const obj_t*  b
      );
 ```
 
@@ -1060,7 +1063,7 @@ Observed object properties: `diagoff(A)`, `diag(A)`, `trans?(A)`.
 ```c
 void bli_invertd
      (
-       obj_t*  a
+       const obj_t*  a
      );
 ```
 
@@ -1072,8 +1075,8 @@ Observed object properties: `diagoff(A)`.
 ```c
 void bli_invscald
      (
-       obj_t*  alpha,
-       obj_t*  a
+       const obj_t*  alpha,
+       const obj_t*  a
      );
 ```
 
@@ -1085,8 +1088,8 @@ Observed object properties: `conj?(alpha)`, `diagoff(A)`.
 ```c
 void bli_scald
      (
-       obj_t*  alpha,
-       obj_t*  a
+       const obj_t*  alpha,
+       const obj_t*  a
      );
 ```
 
@@ -1098,9 +1101,9 @@ Observed object properties: `conj?(alpha)`, `diagoff(A)`.
 ```c
 void bli_scal2d
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b
      );
 ```
 
@@ -1112,8 +1115,8 @@ Observed object properties: `conj?(alpha)`, `diagoff(A)`, `diag(A)`, `trans?(A)`
 ```c
 void bli_setd
      (
-       obj_t*  alpha,
-       obj_t*  a
+       const obj_t*  alpha,
+       const obj_t*  a
      );
 ```
 
@@ -1125,8 +1128,8 @@ Observed object properties: `conj?(alpha)`, `diagoff(A)`.
 ```c
 void bli_setid
      (
-       obj_t*  alpha,
-       obj_t*  a
+       const obj_t*  alpha,
+       const obj_t*  a
      );
 ```
 Set the imaginary components of every element along the diagonal of `a`
@@ -1142,8 +1145,8 @@ Observed object properties: `diagoff(A)`.
 ```c
 void bli_shiftd
      (
-       obj_t*  alpha,
-       obj_t*  a
+       const obj_t*  alpha,
+       const obj_t*  a
      );
 ```
 Add a constant value `alpha` to every element along the diagonal of `a`.
@@ -1156,8 +1159,8 @@ Observed object properties: `diagoff(A)`.
 ```c
 void bli_subd
      (
-       obj_t*  a,
-       obj_t*  b
+       const obj_t*  a,
+       const obj_t*  b
      );
 ```
 
@@ -1169,9 +1172,9 @@ Observed object properties: `diagoff(A)`, `diag(A)`, `trans?(A)`.
 ```c
 void bli_xpbyd
      (
-       obj_t*  a,
-       obj_t*  beta,
-       obj_t*  b
+       const obj_t*  a,
+       const obj_t*  beta,
+       const obj_t*  b
      );
 ```
 
@@ -1191,8 +1194,8 @@ Level-1m operations perform various level-1 BLAS-like operations on matrices (he
 ```c
 void bli_addm
      (
-       obj_t*  a,
-       obj_t*  b
+       const obj_t*  a,
+       const obj_t*  b
      );
 ```
 Perform
@@ -1210,9 +1213,9 @@ Observed object properties: `diagoff(A)`, `diag(A)`, `uplo(A)`, `trans?(A)`.
 ```c
 void bli_axpym
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b
      );
 ```
 Perform
@@ -1230,8 +1233,8 @@ Observed object properties: `conj?(alpha)`, `diagoff(A)`, `diag(A)`, `uplo(A)`,
 ```c
 void bli_copym
      (
-       obj_t*  a,
-       obj_t*  b
+       const obj_t*  a,
+       const obj_t*  b
      );
 ```
 Perform
@@ -1249,8 +1252,8 @@ Observed object properties: `diagoff(A)`, `diag(A)`, `uplo(A)`, `trans?(A)`.
 ```c
 void bli_invscalm
      (
-       obj_t*  alpha,
-       obj_t*  a
+       const obj_t*  alpha,
+       const obj_t*  a
      );
 ```
 Perform
@@ -1267,8 +1270,8 @@ Observed object properties: `conj?(alpha)`, `diagoff(A)`, `uplo(A)`.
 ```c
 void bli_scalm
      (
-       obj_t*  alpha,
-       obj_t*  a
+       const obj_t*  alpha,
+       const obj_t*  a
      );
 ```
 Perform
@@ -1285,9 +1288,9 @@ Observed object properties: `conj?(alpha)`, `diagoff(A)`, `uplo(A)`.
 ```c
 void bli_scal2m
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b
      );
 ```
 Perform
@@ -1305,8 +1308,8 @@ Observed object properties: `conj?(alpha)`, `diagoff(A)`, `diag(A)`, `uplo(A)`,
 ```c
 void bli_setm
      (
-       obj_t*  alpha,
-       obj_t*  a
+       const obj_t*  alpha,
+       const obj_t*  a
      );
 ```
 Perform
@@ -1323,8 +1326,8 @@ Observed object properties: `conj?(alpha)`, `diagoff(A)`, `diag(A)`, `uplo(A)`.
 ```c
 void bli_setrm
      (
-       obj_t*  alpha,
-       obj_t*  a
+       const obj_t*  alpha,
+       const obj_t*  a
      );
 ```
 Perform
@@ -1343,8 +1346,8 @@ Observed object properties: `diagoff(A)`, `diag(A)`, `uplo(A)`.
 ```c
 void bli_setim
      (
-       obj_t*  alpha,
-       obj_t*  a
+       const obj_t*  alpha,
+       const obj_t*  a
      );
 ```
 Perform
@@ -1363,8 +1366,8 @@ Observed object properties: `diagoff(A)`, `diag(A)`, `uplo(A)`.
 ```c
 void bli_subm
      (
-       obj_t*  a,
-       obj_t*  b
+       const obj_t*  a,
+       const obj_t*  b
      );
 ```
 Perform
@@ -1396,11 +1399,11 @@ Level-1f kernels are employed when optimizing level-2 operations.
 ```c
 void bli_axpy2v
      (
-       obj_t*  alphax,
-       obj_t*  alphay,
-       obj_t*  x,
-       obj_t*  y,
-       obj_t*  z
+       const obj_t*  alphax,
+       const obj_t*  alphay,
+       const obj_t*  x,
+       const obj_t*  y,
+       const obj_t*  z
      );
 ```
 Perform
@@ -1417,11 +1420,12 @@ Observed object properties: `conj?(alphax)`, `conj?(x)`, `conj?(alphay)`, `conj?
 ```c
 void bli_dotaxpyv
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y,
-       obj_t*  rho,
-       obj_t*  z
+       const obj_t*  alpha,
+       const obj_t*  xt,
+       const obj_t*  x,
+       const obj_t*  y,
+       const obj_t*  rho,
+       const obj_t*  z
      );
 ```
 Perform
@@ -1439,10 +1443,10 @@ Observed object properties: `conj?(x)`, `conj?(y)`, `conj?(alpha)`.
 ```c
 void bli_axpyf
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  x,
-       obj_t*  y
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  x,
+       const obj_t*  y
      );
 ```
 Perform
@@ -1459,11 +1463,11 @@ Observed object properties: `conj?(alpha)`, `conj?(A)`, `conj?(x)`.
 ```c
 void bli_dotxf
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  x,
+       const obj_t*  beta,
+       const obj_t*  y
      );
 ```
 Perform
@@ -1480,13 +1484,14 @@ Observed object properties: `conj?(alpha)`, `conj?(beta)`, `conj?(A)`, `conj?(x)
 ```c
 void bli_dotxaxpyf
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  w,
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y,
-       obj_t*  z
+       const obj_t*  alpha,
+       const obj_t*  at,
+       const obj_t*  a,
+       const obj_t*  w,
+       const obj_t*  x,
+       const obj_t*  beta,
+       const obj_t*  y,
+       const obj_t*  z
      );
 ```
 Perform
@@ -1511,11 +1516,11 @@ Level-2 operations perform various level-2 BLAS-like operations.
 ```c
 void bli_gemv
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  x,
+       const obj_t*  beta,
+       const obj_t*  y
      );
 ```
 Perform
@@ -1532,10 +1537,10 @@ Observed object properties: `conj?(alpha)`, `conj?(beta)`, `trans?(A)`, `conj?(x
 ```c
 void bli_ger
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y,
-       obj_t*  a
+       const obj_t*  alpha,
+       const obj_t*  x,
+       const obj_t*  y,
+       const obj_t*  a
      );
 ```
 Perform
@@ -1552,11 +1557,11 @@ Observed object properties: `conj?(alpha)`, `conj?(x)`, `conj?(y)`.
 ```c
 void bli_hemv
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  x,
+       const obj_t*  beta,
+       const obj_t*  y
      );
 ```
 Perform
@@ -1573,9 +1578,9 @@ Observed object properties: `conj?(alpha)`, `conj?(beta)`, `conj?(A)`, `uplo(A)`
 ```c
 void bli_her
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  a
+       const obj_t*  alpha,
+       const obj_t*  x,
+       const obj_t*  a
      );
 ```
 Perform
@@ -1594,10 +1599,10 @@ Observed object properties: `conj?(alpha)`, `uplo(A)`, `conj?(x)`.
 ```c
 void bli_her2
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y,
-       obj_t*  a
+       const obj_t*  alpha,
+       const obj_t*  x,
+       const obj_t*  y,
+       const obj_t*  a
      );
 ```
 Perform
@@ -1614,11 +1619,11 @@ Observed object properties: `uplo(A)`, `conj?(x)`, `conj?(y)`.
 ```c
 void bli_symv
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  x,
-       obj_t*  beta,
-       obj_t*  y
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  x,
+       const obj_t*  beta,
+       const obj_t*  y
      );
 ```
 Perform
@@ -1635,9 +1640,9 @@ Observed object properties: `conj?(alpha)`, `conj?(beta)`, `conj?(A)`, `uplo(A)`
 ```c
 void bli_syr
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  a
+       const obj_t*  alpha,
+       const obj_t*  x,
+       const obj_t*  a
      );
 ```
 Perform
@@ -1654,10 +1659,10 @@ Observed object properties: `conj?(alpha)`, `conj?(x)`.
 ```c
 void bli_syr2
      (
-       obj_t*  alpha,
-       obj_t*  x,
-       obj_t*  y,
-       obj_t*  a
+       const obj_t*  alpha,
+       const obj_t*  x,
+       const obj_t*  y,
+       const obj_t*  a
      );
 ```
 Perform
@@ -1674,9 +1679,9 @@ Observed object properties: `uplo(A)`, `conj?(x)`, `conj?(y)`.
 ```c
 void bli_trmv
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  x
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  x
      );
 ```
 Perform
@@ -1693,9 +1698,9 @@ Observed object properties: `conj?(alpha)`, `uplo(A)`, `trans?(A)`, `diag(A)`.
 ```c
 void bli_trsv
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  y
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  y
      );
 ```
 Solve the linear system
@@ -1722,11 +1727,11 @@ Level-3 operations perform various level-3 BLAS-like operations.
 ```c
 void bli_gemm
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c
      );
 ```
 Perform
@@ -1743,11 +1748,11 @@ Observed object properties: `trans?(A)`, `trans?(B)`.
 ```c
 void bli_gemmt
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c
      );
 ```
 Perform
@@ -1764,12 +1769,12 @@ Observed object properties: `trans?(A)`, `trans?(B)`, `uplo(C)`.
 ```c
 void bli_hemm
      (
-       side_t  sidea,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c
+             side_t  sidea,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c
      );
 ```
 Perform
@@ -1790,10 +1795,10 @@ Observed object properties: `uplo(A)`, `conj?(A)`, `trans?(B)`.
 ```c
 void bli_herk
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  beta,
-       obj_t*  c
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  beta,
+       const obj_t*  c
      );
 ```
 Perform
@@ -1812,11 +1817,11 @@ Observed object properties: `trans?(A)`, `uplo(C)`.
 ```c
 void bli_her2k
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c
      );
 ```
 Perform
@@ -1835,12 +1840,12 @@ Observed object properties: `trans?(A)`, `trans?(B)`, `uplo(C)`.
 ```c
 void bli_symm
      (
-       side_t  sidea,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c
+             side_t  sidea,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c
      );
 ```
 Perform
@@ -1861,10 +1866,10 @@ Observed object properties: `uplo(A)`, `conj?(A)`, `trans?(B)`.
 ```c
 void bli_syrk
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  beta,
-       obj_t*  c
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  beta,
+       const obj_t*  c
      );
 ```
 Perform
@@ -1881,11 +1886,11 @@ Observed object properties: `trans?(A)`, `uplo(C)`.
 ```c
 void bli_syr2k
      (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c
      );
 ```
 Perform
@@ -1902,10 +1907,10 @@ Observed object properties: `trans?(A)`, `trans?(B)`, `uplo(C)`.
 ```c
 void bli_trmm
      (
-       side_t  sidea,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b
+             side_t  sidea,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b
      );
 ```
 Perform
@@ -1926,12 +1931,12 @@ Observed object properties: `uplo(A)`, `trans?(A)`, `diag(A)`.
 ```c
 void bli_trmm3
      (
-       side_t  sidea,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c
+             side_t  sidea,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b,
+       const obj_t*  beta,
+       const obj_t*  c
      );
 ```
 Perform
@@ -1952,10 +1957,10 @@ Observed object properties: `uplo(A)`, `trans?(A)`, `diag(A)`, `trans?(B)`.
 ```c
 void bli_trsm
      (
-       side_t  sidea,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b
+             side_t  sidea,
+       const obj_t*  alpha,
+       const obj_t*  a,
+       const obj_t*  b
      );
 ```
 Solve the linear system with multiple right-hand sides
@@ -1981,8 +1986,8 @@ Observed object properties: `uplo(A)`, `trans?(A)`, `diag(A)`.
 ```c
 void bli_asumv
      (
-       obj_t*  x,
-       obj_t*  asum
+       const obj_t*  x,
+       const obj_t*  asum
      );
 ```
 Compute the sum of the absolute values of the fundamental elements of vector `x`. The resulting sum is stored to `asum`.
@@ -2000,8 +2005,8 @@ Observed object properties: none.
 ```c
 void bli_norm[1fi]m
      (
-       obj_t*  a,
-       obj_t*  norm
+       const obj_t*  a,
+       const obj_t*  norm
      );
 ```
 Compute the one-norm (`bli_norm1m()`), Frobenius norm (`bli_normfm()`), or infinity norm (`bli_normim()`) of the elements in an _m x n_ matrix `A`. If `uplo(A)` is `BLIS_LOWER` or `BLIS_UPPER` then `A` is assumed to be lower or upper triangular, respectively, with the main diagonal located at offset `diagoff(A)`. The resulting norm is stored to `norm`.
@@ -2018,8 +2023,8 @@ Observed object properties: `diagoff(A)`, `diag(A)`, `uplo(A)`.
 ```c
 void bli_norm[1fi]v
      (
-       obj_t*  x,
-       obj_t*  norm
+       const obj_t*  x,
+       const obj_t*  norm
      );
 ```
 Compute the one-norm (`bli_norm1v()`), Frobenius norm (`bli_normfv()`), or infinity norm (`bli_normiv()`) of the elements in a vector `x` of length _n_. The resulting norm is stored to `norm`.
@@ -2034,7 +2039,7 @@ Observed object properties: `diagoff(A)`, `diag(A)`, `uplo(A)`.
 ```c
 void bli_mkherm
      (
-       obj_t*  a
+       const obj_t*  a
      );
 ```
 Make an _m x m_ matrix `A` explicitly Hermitian by copying the conjugate of the triangle specified by `uplo(A)` to the opposite triangle. Imaginary components of diagonal elements are explicitly set to zero. It is assumed that the diagonal offset of `A` is zero.
@@ -2047,7 +2052,7 @@ Observed object properties: `uplo(A)`.
 ```c
 void bli_mksymm
      (
-       obj_t*  a
+       const obj_t*  a
      );
 ```
 Make an _m x m_ matrix `A` explicitly symmetric by copying the triangle specified by `uplo(A)` to the opposite triangle. It is assumed that the diagonal offset of `A` is zero.
@@ -2060,7 +2065,7 @@ Observed object properties: `uplo(A)`.
 ```c
 void bli_mktrim
      (
-       obj_t*  a
+       const obj_t*  a
      );
 ```
 Make an _m x m_ matrix `A` explicitly triangular by preserving the triangle specified by `uplo(A)` and zeroing the elements in the opposite triangle. It is assumed that the diagonal offset of `A` is zero.
@@ -2073,11 +2078,11 @@ Observed object properties: `uplo(A)`.
 ```c
 void bli_fprintv
      (
-       FILE*   file,
-       char*   s1,
-       obj_t*  x,
-       char*   format,
-       char*   s2
+             FILE*   file,
+       const char*   s1,
+       const obj_t*  x,
+       const char*   format,
+       const char*   s2
      );
 ```
 Print a vector `x` of length _m_ to file stream `file`, where `file` is a file pointer returned by the standard C library function `fopen()`. The caller may also pass in a global file pointer such as `stdout` or `stderr`. The strings `s1` and `s2` are printed immediately before and after the output (respectively), and the format specifier `format` is used to format the individual elements. For valid format specifiers, please see documentation for the standard C library function `printf()`.
@@ -2090,11 +2095,11 @@ Print a vector `x` of length _m_ to file stream `file`, where `file` is a file p
 ```c
 void bli_fprintm
      (
-       FILE*   file,
-       char*   s1,
-       obj_t*  a,
-       char*   format,
-       char*   s2
+             FILE*   file,
+       const char*   s1,
+       const obj_t*  a,
+       const char*   format,
+       const char*   s2
      );
 ```
 Print an _m x n_ matrix `A` to file stream `file`, where `file` is a file pointer returned by the standard C library function `fopen()`. The caller may also pass in a global file pointer such as `stdout` or `stderr`. The strings `s1` and `s2` are printed immediately before and after the output (respectively), and the format specifier `format` is used to format the individual elements. For valid format specifiers, please see documentation for the standard C library function `printf()`.
@@ -2107,10 +2112,10 @@ Print an _m x n_ matrix `A` to file stream `file`, where `file` is a file pointe
 ```c
 void bli_printv
      (
-       char*   s1,
-       obj_t*  x,
-       char*   format,
-       char*   s2
+       const char*   s1,
+       const obj_t*  x,
+       const char*   format,
+       const char*   s2
      );
 ```
 Print a vector `x` of length _m_ to standard output. This function call is equivalent to calling `bli_fprintv()` with `stdout` as the file pointer.
@@ -2121,10 +2126,10 @@ Print a vector `x` of length _m_ to standard output. This function call is equiv
 ```c
 void bli_printm
      (
-       char*   s1,
-       obj_t*  a,
-       char*   format,
-       char*   s2
+       const char*   s1,
+       const obj_t*  a,
+       const char*   format,
+       const char*   s2
      );
 ```
 Print an _m x n_ matrix `a` to standard output. This function call is equivalent to calling `bli_fprintm()` with `stdout` as the file pointer.
@@ -2135,7 +2140,7 @@ Print an _m x n_ matrix `a` to standard output. This function call is equivalent
 ```c
 void bli_randv
      (
-       obj_t*  x
+       const obj_t*  x
      );
 ```
 Set the elements of a vector `x` of length _n_ to random values on the interval `[-1,1)`.
@@ -2148,7 +2153,7 @@ Set the elements of a vector `x` of length _n_ to random values on the interval
 ```c
 void bli_randm
      (
-       obj_t*  a
+       const obj_t*  a
      );
 ```
 Set the elements of an _m x n_ matrix `A` to random values on the interval `[-1,1)`. Off-diagonal elements (in the triangle specified by `uplo(A)`) are scaled by `1.0/max(m,n)`.
@@ -2164,9 +2169,9 @@ Observed object properties: `diagoff(A)`, `uplo(A)`.
 ```c
 void bli_sumsqv
      (
-       obj_t*  x,
-       obj_t*  scale,
-       obj_t*  sumsq
+       const obj_t*  x,
+       const obj_t*  scale,
+       const obj_t*  sumsq
      );
 ```
 Compute the sum of the squares of the elements in a vector `x` of length _n_. The result is computed in scaled form, and in such a way that it may be used repeatedly to accumulate the sum of the squares of several vectors.
@@ -2186,9 +2191,9 @@ where, on entry, `scale` and `sumsq` contain `scale_old` and `sumsq_old`, respec
 ```c
 void bli_getsc
      (
-       obj_t*   chi,
-       double*  zeta_r,
-       double*  zeta_i
+       const obj_t*   chi,
+             double*  zeta_r,
+             double*  zeta_i
      );
 ```
 Copy the real and imaginary values from the scalar object `chi` to `zeta_r` and `zeta_i`. If `chi` is stored as a real type, then `zeta_i` is set to zero. (If `chi` is stored in single precision, the corresponding elements are typecast/promoted during the copy.)
@@ -2198,12 +2203,12 @@ Copy the real and imaginary values from the scalar object `chi` to `zeta_r` and
 #### getijv
 ```c
 err_t bli_getijv
-      (
-        dim_t    i,
-        obj_t*   x,
-        double*  ar,
-        double*  ai
-      )
+     (
+             dim_t    i,
+       const obj_t*   x,
+             double*  ar,
+             double*  ai
+     );
 ```
 Copy the real and imaginary values at the `i`th element of vector object `x` to `ar` and `ai`. If elements of `x` are stored as real types, then only `ar` is overwritten and `ai` is left unchanged. (If `x` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.)
 If either the element offset `i` is beyond the vector dimension of `x` or less than zero, the function returns `BLIS_FAILURE` without taking any action. Similarly, if `x` is a global scalar constant such as `BLIS_ONE`, the function returns `BLIS_FAILURE`.
@@ -2213,13 +2218,13 @@ If either the element offset `i` is beyond the vector dimension of `x` or less t
 #### getijm
 ```c
 err_t bli_getijm
-      (
-        dim_t    i,
-        dim_t    j,
-        obj_t*   b,
-        double*  ar,
-        double*  ai
-      )
+     (
+             dim_t    i,
+             dim_t    j,
+       const obj_t*   b,
+             double*  ar,
+             double*  ai
+     );
 ```
 Copy the real and imaginary values at the (`i`,`j`) element of object `b` to `ar` and `ai`. If elements of `b` are stored as real types, then only `ar` is overwritten and `ai` is left unchanged. (If `b` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.)
 If either the row offset `i` is beyond the _m_ dimension of `b` or less than zero, or column offset `j` is beyond the _n_ dimension of `b` or less than zero, the function returns `BLIS_FAILURE` without taking any action. Similarly, if `b` is a global scalar constant such as `BLIS_ONE`, the function returns `BLIS_FAILURE`.
@@ -2230,9 +2235,9 @@ If either the row offset `i` is beyond the _m_ dimension of `b` or less than zer
 ```c
 void bli_setsc
      (
-       double  zeta_r,
-       double  zeta_i,
-       obj_t*  chi
+             double  zeta_r,
+             double  zeta_i,
+       const obj_t*  chi
      );
 ```
 Copy real and imaginary values `zeta_r` and `zeta_i` to the scalar object `chi`. If `chi` is stored as a real type, then `zeta_i` is ignored. (If `chi` is stored in single precision, the contents are typecast/demoted during the copy.)
@@ -2243,10 +2248,10 @@ Copy real and imaginary values `zeta_r` and `zeta_i` to the scalar object `chi`.
 ```c
 err_t bli_setijv
      (
-       double  ar,
-       double  ai,
-       dim_t   i,
-       obj_t*  x
+             double  ar,
+             double  ai,
+             dim_t   i,
+       const obj_t*  x
      );
 ```
 Copy real and imaginary values `ar` and `ai` to the `i`th element of vector object `x`. If elements of `x` are stored as real types, then only `ar` is copied and `ai` is ignored. (If `x` contains elements stored in single precision, the corresponding elements are typecast/demoted during the copy.)
@@ -2258,11 +2263,11 @@ If the element offset `i` is beyond the vector dimension of `x` or less than zer
 ```c
 err_t bli_setijm
      (
-       double  ar,
-       double  ai,
-       dim_t   i,
-       dim_t   j,
-       obj_t*  b
+             double  ar,
+             double  ai,
+             dim_t   i,
+             dim_t   j,
+       const obj_t*  b
      );
 ```
 Copy real and imaginary values `ar` and `ai` to the (`i`,`j`) element of object `b`. If elements of `b` are stored as real types, then only `ar` is copied and `ai` is ignored. (If `b` contains elements stored in single precision, the corresponding elements are typecast/demoted during the copy.)
@@ -2274,9 +2279,9 @@ If either the row offset `i` is beyond the _m_ dimension of `b` or less than zer
 ```c
 void bli_eqsc
      (
-       obj_t*  chi,
-       obj_t*  psi,
-       bool*   is_eq
+       const obj_t*  chi,
+       const obj_t*  psi,
+             bool*   is_eq
      );
 ```
 Perform an element-wise comparison between scalars `chi` and `psi` and store the boolean result in the `bool` pointed to by `is_eq`.
@@ -2290,9 +2295,9 @@ Observed object properties: `conj?(chi)`, `conj?(psi)`.
 ```c
 void bli_eqv
      (
-       obj_t*  x,
-       obj_t*  y,
-       bool*   is_eq
+       const obj_t*  x,
+       const obj_t*  y,
+             bool*   is_eq
      );
 ```
 Perform an element-wise comparison between vectors `x` and `y` and store the boolean result in the `bool` pointed to by `is_eq`.
@@ -2306,9 +2311,9 @@ Observed object properties: `conj?(x)`, `conj?(y)`.
 ```c
 void bli_eqm
      (
-       obj_t*  a,
-       obj_t*  b,
-       bool*   is_eq
+       const obj_t*  a,
+       const obj_t*  b,
+             bool*   is_eq
      );
 ```
 Perform an element-wise comparison between matrices `A` and `B` and store the boolean result in the `bool` pointed to by `is_eq`.
@@ -2326,7 +2331,7 @@ Observed object properties: `diagoff(A)`, `diag(A)`, `uplo(A)`, `trans?(A)`, `tr
 
 BLIS allows applications to query information about how BLIS was configured. The `bli_info_` API provides several categories of query routines. Most values are returned as a `gint_t`, which is a signed integer. The size of this integer can be queried through a special routine that returns the size in a character string:
 ```c
-char* bli_info_get_int_type_size_str( void );
+const char* bli_info_get_int_type_size_str( void );
 ```
 **Note:** All of the `bli_info_` functions are **always** thread-safe, no matter how BLIS was configured.
 
@@ -2334,7 +2339,7 @@ char* bli_info_get_int_type_size_str( void );
 
 The following routine returns the address the full BLIS version string:
 ```c
-char* bli_info_get_version_str( void );
+const char* bli_info_get_version_str( void );
 ```
 
 ## Specific configuration
@@ -2347,7 +2352,7 @@ This is most useful when BLIS is configured with multiple configurations. (When
 
 Once the configuration's ID is known, it can be used to query a string that contains the name of the configuration:
 ```c
-char* bli_arch_string( arch_t id );
+const char* bli_arch_string( arch_t id );
 ```
 
 ## General configuration
@@ -2366,10 +2371,33 @@ gint_t bli_info_get_stack_buf_max_size( void );
 gint_t bli_info_get_stack_buf_align_size( void );
 gint_t bli_info_get_heap_addr_align_size( void );
 gint_t bli_info_get_heap_stride_align_size( void );
-gint_t bli_info_get_pool_addr_align_size( void );
+gint_t bli_info_get_pool_addr_align_size_a( void );
+gint_t bli_info_get_pool_addr_align_size_b( void );
+gint_t bli_info_get_pool_addr_align_size_c( void );
+gint_t bli_info_get_pool_addr_align_size_gen( void );
+gint_t bli_info_get_pool_addr_offset_size_a( void );
+gint_t bli_info_get_pool_addr_offset_size_b( void );
+gint_t bli_info_get_pool_addr_offset_size_c( void );
+gint_t bli_info_get_pool_addr_offset_size_gen( void );
 gint_t bli_info_get_enable_stay_auto_init( void );
 gint_t bli_info_get_enable_blas( void );
+gint_t bli_info_get_enable_cblas( void );
 gint_t bli_info_get_blas_int_type_size( void );
+gint_t bli_info_get_enable_pba_pools( void );
+gint_t bli_info_get_enable_sba_pools( void );
+gint_t bli_info_get_enable_threading( void );
+gint_t bli_info_get_enable_openmp( void );
+gint_t bli_info_get_enable_pthreads( void );
+gint_t bli_info_get_enable_hpx( void );
+gint_t bli_info_get_enable_openmp_as_default( void );
+gint_t bli_info_get_enable_pthreads_as_default( void );
+gint_t bli_info_get_enable_hpx_as_default( void );
+gint_t bli_info_get_thread_jrir_slab( void );
+gint_t bli_info_get_thread_jrir_rr( void );
+gint_t bli_info_get_thread_jrir_tlb( void );
+gint_t bli_info_get_enable_tls( void );
+gint_t bli_info_get_enable_memkind( void );
+gint_t bli_info_get_enable_sandbox( void );
 ```
 
 ## Kernel information
@@ -2379,11 +2407,11 @@ gint_t bli_info_get_blas_int_type_size( void );
 The following routines allow the caller to obtain a string that identifies the implementation type of each microkernel that is currently active (ie: part of the current active configuration, as identified bi `bli_arch_query_id()`).
 
 ```c
-char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt )
-char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt )
-char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt )
-char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt )
-char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt )
+const char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt );
+const char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt );
+const char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt );
+const char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt );
+const char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt );
 ```
 
 Possible implementation (ie: the `ind_t method` argument) types are:
@@ -2397,16 +2425,31 @@ Possible microkernel types (ie: the return values for `bli_info_get_*_ukr_impl_s
  * `BLIS_NOTAPPLIC_UKERNEL` (`"notappl"`): This value is returned usually when performing a `gemmtrsm` or `trsm` microkernel type query for any `method` value that is not `BLIS_NAT` (ie: native). That is, induced methods cannot be (purely) used on `trsm`-based microkernels because these microkernels perform more a triangular inversion, which is not matrix multiplication.
 
 
+### Operation implementation type query
+
+The following routines allow the caller to obtain a string that identifies the implementation (`ind_t`) that is currently active (ie: implemented and enabled) for each level-3 operation. Possible implementation types are listed in the section above covering [microkernel implementation query](BLISTypedAPI.md#microkernel-implementation-type-query).
+```c
+const char* bli_info_get_gemm_impl_string( num_t dt );
+const char* bli_info_get_gemmt_impl_string( num_t dt );
+const char* bli_info_get_hemm_impl_string( num_t dt );
+const char* bli_info_get_herk_impl_string( num_t dt );
+const char* bli_info_get_her2k_impl_string( num_t dt );
+const char* bli_info_get_symm_impl_string( num_t dt );
+const char* bli_info_get_syrk_impl_string( num_t dt );
+const char* bli_info_get_syr2k_impl_string( num_t dt );
+const char* bli_info_get_trmm_impl_string( num_t dt );
+const char* bli_info_get_trmm3_impl_string( num_t dt );
+const char* bli_info_get_trsm_impl_string( num_t dt );
+```
+
+
 ## Clock functions
 
 ---
 
 #### clock
 ```c
-double bli_clock
-     (
-       void
-     );
+double bli_clock( void );
 ```
 Return the amount of time that has elapsed since some fixed time in the past. The return values of `bli_clock()` typically feature nanosecond precision, though this is not guaranteed.
 
diff --git a/docs/BLISTypedAPI.md b/docs/BLISTypedAPI.md
index 28b46ea3e..350638a47 100644
--- a/docs/BLISTypedAPI.md
+++ b/docs/BLISTypedAPI.md
@@ -77,41 +77,41 @@ The following tables list various types used throughout the BLIS typed API.
 
 ### Floating-point types
 
-| BLIS type  | BLIS char | Type definition                        | Used to represent...                 |
-|:-----------|:----------|:---------------------------------------|:-------------------------------------|
-| `float`    | `s`       | _N/A_                                  | single-precision real numbers    |
-| `double`   | `d`       | _N/A_                                  | double-precision real numbers    |
-| `scomplex` | `c`       | `struct { float real; float imag; }`   | single-precision complex numbers |
-| `dcomplex` | `z`       | `struct { double real; double imag; }` | double-precision complex numbers |
+| BLIS fp type | BLIS char | Type definition                        | Used to represent...              |
+|:-------------|:----------|:---------------------------------------|:----------------------------------|
+| `float`      | `s`       | _N/A_                                  | single-precision real numbers.    |
+| `double`     | `d`       | _N/A_                                  | double-precision real numbers.    |
+| `scomplex`   | `c`       | `struct { float real; float imag; }`   | single-precision complex numbers. |
+| `dcomplex`   | `z`       | `struct { double real; double imag; }` | double-precision complex numbers. |
 
 ### Enumerated parameter types
 
-| `trans_t`                | Semantic meaning: Corresponding matrix operand... |
-|:-------------------------|:--------------------------------------------------|
-| `BLIS_NO_TRANSPOSE`      | will be used as given.                         |
-| `BLIS_TRANSPOSE`         | will be implicitly transposed.                 |
-| `BLIS_CONJ_NO_TRANSPOSE` | will be implicitly conjugated.                 |
+| `trans_t`                | Semantic meaning: Matrix operand...             |
+|:-------------------------|:------------------------------------------------|
+| `BLIS_NO_TRANSPOSE`      | will be used as given.                          |
+| `BLIS_TRANSPOSE`         | will be implicitly transposed.                  |
+| `BLIS_CONJ_NO_TRANSPOSE` | will be implicitly conjugated.                  |
 | `BLIS_CONJ_TRANSPOSE`    | will be implicitly transposed _and_ conjugated. |
 
-| `conj_t`             | Semantic meaning: Corresponding matrix/vector operand... |
-|:---------------------|:---------------------------------------------------------|
-| `BLIS_NO_CONJUGATE`  | will be used as given.                                |
-| `BLIS_CONJUGATE`     | will be implicitly conjugated.                        |
+| `conj_t`             | Semantic meaning: Matrix/vector operand... |
+|:---------------------|:-------------------------------------------|
+| `BLIS_NO_CONJUGATE`  | will be used as given.                     |
+| `BLIS_CONJUGATE`     | will be implicitly conjugated.             |
 
-| `side_t`     | Semantic meaning: Corresponding matrix operand...  |
-|:-------------|:---------------------------------------------------|
-| `BLIS_LEFT`  | appears on the left.                            |
-| `BLIS_RIGHT` | appears on the right.                           |
+| `side_t`     | Semantic meaning: Matrix operand... |
+|:-------------|:------------------------------------|
+| `BLIS_LEFT`  | appears on the left.                |
+| `BLIS_RIGHT` | appears on the right.               |
 
-| `uplo_t`     | Semantic meaning: Corresponding matrix operand... |
-|:-------------|:--------------------------------------------------|
+| `uplo_t`     | Semantic meaning: Matrix operand...                               |
+|:-------------|:------------------------------------------------------------------|
 | `BLIS_LOWER` | is stored in (and will be accessed only from) the lower triangle. |
 | `BLIS_UPPER` | is stored in (and will be accessed only from) the upper triangle. |
 | `BLIS_DENSE` | is stored as a full matrix (ie: in both triangles).               |
 
-| `diag_t`            | Semantic meaning: Corresponding matrix operand... |
-|:--------------------|:--------------------------------------------------|
-| `BLIS_NONUNIT_DIAG` | has a non-unit diagonal that should be explicitly read from. |
+| `diag_t`            | Semantic meaning: Matrix operand...                                        |
+|:--------------------|:---------------------------------------------------------------------------|
+| `BLIS_NONUNIT_DIAG` | has a non-unit diagonal that should be explicitly read from.               |
 | `BLIS_UNIT_DIAG`    | has a unit diagonal that should be implicitly assumed (and not read from). |
 
 ### Basic vs expert interfaces
@@ -120,34 +120,34 @@ The functions listed in this document belong to the "basic" interface subset of
 ```c
 void bli_?gemm
      (
-       trans_t  transa,
-       trans_t  transb,
-       dim_t    m,
-       dim_t    n,
-       dim_t    k,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb,
-       ctype*   beta,
-       ctype*   c, inc_t rsc, inc_t csc
+             trans_t  transa,
+             trans_t  transb,
+             dim_t    m,
+             dim_t    n,
+             dim_t    k,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+       const ctype*   b, inc_t rsb, inc_t csb,
+       const ctype*   beta,
+             ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 while the expert interface is:
 ```c
 void bli_?gemm_ex
      (
-       trans_t  transa,
-       trans_t  transb,
-       dim_t    m,
-       dim_t    n,
-       dim_t    k,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb,
-       ctype*   beta,
-       ctype*   c, inc_t rsc, inc_t csc,
-       cntx_t*  cntx,
-       rntm_t*  rntm
+             trans_t  transa,
+             trans_t  transb,
+             dim_t    m,
+             dim_t    n,
+             dim_t    k,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+       const ctype*   b, inc_t rsb, inc_t csb,
+       const ctype*   beta,
+             ctype*   c, inc_t rsc, inc_t csc,
+       const cntx_t*  cntx,
+       const rntm_t*  rntm
      );
 ```
 The expert interface contains two additional parameters: a `cntx_t*` and `rntm_t*`. Note that calling a function from the expert interface with the `cntx_t*` and `rntm_t*` arguments each set to `NULL` is equivalent to calling the corresponding basic interface. Specifically, a `NULL` value passed in for the `cntx_t*` results in a valid context being queried from BLIS, and a `NULL` value passed in for the `rntm_t*` results in the current global settings for multithreading to be used.
@@ -156,7 +156,7 @@ The expert interface contains two additional parameters: a `cntx_t*` and `rntm_t
 
 In general, it is permissible to pass in `NULL` for a `cntx_t*` parameter when calling an expert interface such as `bli_dgemm_ex()`. However, there are cases where `NULL` values are not accepted and may result in a segmentation fault. Specifically, the `cntx_t*` argument appears in the interfaces to the `gemm`, `trsm`, and `gemmtrsm` [level-3 microkernels](KernelsHowTo.md#level-3) along with all [level-1v](KernelsHowTo.md#level-1v) and [level-1f](KernelsHowTo.md#level-1f) kernels. There, as a general rule, a valid pointer must be passed in. Whenever a valid context is needed, the developer may query a default context from the global kernel structure (if a context is not already available in the current scope):
 ```c
-cntx_t* bli_gks_query_cntx( void );
+const cntx_t* bli_gks_query_cntx( void );
 ```
 When BLIS is configured to target a configuration family (e.g. `intel64`, `x86_64`), `bli_gks_query_cntx()` will use `cpuid` or an equivalent heuristic to select and and return the appropriate context. When BLIS is configured to target a singleton sub-configuration (e.g. `haswell`, `skx`), `bli_gks_query_cntx()` will unconditionally return a pointer to the context appropriate for the targeted configuration.
 
@@ -195,7 +195,6 @@ void bli_finalize( void );
 # Computational function reference
 
 Notes for interpreting the following prototypes:
-
   * Any occurrence of `?` should be replaced with `s`, `d`, `c`, or `z` to form an actual function name.
   * Any occurrence of `ctype` should be replaced with the actual C type corresponding to the datatype instance in question, while `rtype` should be replaced by the real projection of `ctype`. For example:
     * If we consider the prototype for `bli_zaxpyv()` below, `ctype` refers to `dcomplex`.
@@ -227,10 +226,10 @@ Level-1v operations perform various level-1 BLAS-like operations on vectors (hen
 ```c
 void bli_?addv
      (
-       conj_t  conjx,
-       dim_t   n,
-       ctype*  x, inc_t incx,
-       ctype*  y, inc_t incy
+             conj_t  conjx,
+             dim_t   n,
+       const ctype*  x, inc_t incx,
+             ctype*  y, inc_t incy
      );
 ```
 Perform
@@ -245,9 +244,9 @@ where `x` and `y` are vectors of length _n_.
 ```c
 void bli_?amaxv
      (
-       dim_t   n,
-       ctype*  x, inc_t incx,
-       dim_t*  index
+             dim_t   n,
+       const ctype*  x, inc_t incx,
+             dim_t*  index
      );
 ```
 Given a vector of length _n_, return the zero-based index `index` of the element of vector `x` that contains the largest absolute value (or, in the complex domain, the largest complex modulus).
@@ -262,11 +261,11 @@ If `NaN` is encountered, it is treated as if it were a valid value that was smal
 ```c
 void bli_?axpyv
      (
-       conj_t  conjx,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  x, inc_t incx,
-       ctype*  y, inc_t incy
+             conj_t  conjx,
+             dim_t   n,
+       const ctype*  alpha,
+       const ctype*  x, inc_t incx,
+             ctype*  y, inc_t incy
      );
 ```
 Perform
@@ -281,12 +280,12 @@ where `x` and `y` are vectors of length _n_, and `alpha` is a scalar.
 ```c
 void bli_?axpbyv
      (
-       conj_t  conjx,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  x, inc_t incx,
-       ctype*  beta,
-       ctype*  y, inc_t incy
+             conj_t  conjx,
+             dim_t   n,
+       const ctype*  alpha,
+       const ctype*  x, inc_t incx,
+       const ctype*  beta,
+             ctype*  y, inc_t incy
      );
 ```
 Perform
@@ -301,10 +300,10 @@ where `x` and `y` are vectors of length _n_, and `alpha` and `beta` are scalars.
 ```c
 void bli_?copyv
      (
-       conj_t  conjx,
-       dim_t   n,
-       ctype*  x, inc_t incx,
-       ctype*  y, inc_t incy
+             conj_t  conjx,
+             dim_t   n,
+       const ctype*  x, inc_t incx,
+             ctype*  y, inc_t incy
      );
 ```
 Perform
@@ -319,12 +318,12 @@ where `x` and `y` are vectors of length _n_.
 ```c
 void bli_?dotv
      (
-       conj_t  conjx,
-       conj_t  conjy,
-       dim_t   n,
-       ctype*  x, inc_t incx,
-       ctype*  y, inc_t incy,
-       ctype*  rho
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   n,
+       const ctype*  x, inc_t incx,
+       const ctype*  y, inc_t incy,
+             ctype*  rho
      );
 ```
 Perform
@@ -339,14 +338,14 @@ where `x` and `y` are vectors of length _n_, and `rho` is a scalar.
 ```c
 void bli_?dotxv
      (
-       conj_t  conjx,
-       conj_t  conjy,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  x, inc_t incx,
-       ctype*  y, inc_t incy,
-       ctype*  beta,
-       ctype*  rho
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   n,
+       const ctype*  alpha,
+       const ctype*  x, inc_t incx,
+       const ctype*  y, inc_t incy,
+       const ctype*  beta,
+             ctype*  rho
      );
 ```
 Perform
@@ -373,10 +372,10 @@ Invert all elements of an _n_-length vector `x`.
 ```c
 void bli_?invscalv
      (
-       conj_t  conjalpha,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  x, inc_t incx
+             conj_t  conjalpha,
+             dim_t   n,
+       const ctype*  alpha,
+             ctype*  x, inc_t incx
      );
 ```
 Perform
@@ -391,10 +390,10 @@ where `x` is a vector of length _n_, and `alpha` is a scalar.
 ```c
 void bli_?scalv
      (
-       conj_t  conjalpha,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  x, inc_t incx
+             conj_t  conjalpha,
+             dim_t   n,
+       const ctype*  alpha,
+             ctype*  x, inc_t incx
      );
 ```
 Perform
@@ -409,11 +408,11 @@ where `x` is a vector of length _n_, and `alpha` is a scalar.
 ```c
 void bli_?scal2v
      (
-       conj_t  conjx,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  x, inc_t incx,
-       ctype*  y, inc_t incy
+             conj_t  conjx,
+             dim_t   n,
+       const ctype*  alpha,
+       const ctype*  x, inc_t incx,
+             ctype*  y, inc_t incy
      );
 ```
 Perform
@@ -428,10 +427,10 @@ where `x` and `y` are vectors of length _n_, and `alpha` is a scalar.
 ```c
 void bli_?setv
      (
-       conj_t  conjalpha,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  x, inc_t incx
+             conj_t  conjalpha,
+             dim_t   n,
+       const ctype*  alpha,
+             ctype*  x, inc_t incx
      );
 ```
 Perform
@@ -446,10 +445,10 @@ That is, set all elements of an _n_-length vector `x` to scalar `conjalpha(alpha
 ```c
 void bli_?subv
      (
-       conj_t  conjx,
-       dim_t   n,
-       ctype*  x, inc_t incx,
-       ctype*  y, inc_t incy
+             conj_t  conjx,
+             dim_t   n,
+       const ctype*  x, inc_t incx,
+             ctype*  y, inc_t incy
      );
 ```
 Perform
@@ -477,11 +476,11 @@ Swap corresponding elements of two _n_-length vectors `x` and `y`.
 ```c
 void bli_?xpbyv
      (
-       conj_t  conjx,
-       dim_t   n,
-       ctype*  x, inc_t incx,
-       ctype*  beta,
-       ctype*  y, inc_t incy
+             conj_t  conjx,
+             dim_t   n,
+       const ctype*  x, inc_t incx,
+       const ctype*  beta,
+             ctype*  y, inc_t incy
      );
 ```
 Perform
@@ -508,13 +507,13 @@ Most of these operations are similar to level-1m counterparts, except they only
 ```c
 void bli_?addd
      (
-       doff_t   diagoffa,
-       diag_t   diaga,
-       trans_t  transa,
-       dim_t    m,
-       dim_t    n,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb
+             doff_t   diagoffa,
+             diag_t   diaga,
+             trans_t  transa,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   a, inc_t rsa, inc_t csa,
+             ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 
@@ -524,14 +523,14 @@ void bli_?addd
 ```c
 void bli_?axpyd
      (
-       doff_t   diagoffa,
-       diag_t   diaga,
-       trans_t  transa,
-       dim_t    m,
-       dim_t    n,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb
+             doff_t   diagoffa,
+             diag_t   diaga,
+             trans_t  transa,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+             ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 
@@ -541,13 +540,13 @@ void bli_?axpyd
 ```c
 void bli_?copyd
      (
-       doff_t   diagoffa,
-       diag_t   diaga,
-       trans_t  transa,
-       dim_t    m,
-       dim_t    n,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb
+             doff_t   diagoffa,
+             diag_t   diaga,
+             trans_t  transa,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   a, inc_t rsa, inc_t csa,
+             ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 
@@ -570,12 +569,12 @@ void bli_?invertd
 ```c
 void bli_?invscald
      (
-       conj_t  conjalpha,
-       doff_t  diagoffa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa
+             conj_t  conjalpha,
+             doff_t  diagoffa,
+             dim_t   m,
+             dim_t   n,
+       const ctype*  alpha,
+             ctype*  a, inc_t rsa, inc_t csa
      );
 ```
 
@@ -585,12 +584,12 @@ void bli_?invscald
 ```c
 void bli_?scald
      (
-       conj_t  conjalpha,
-       doff_t  diagoffa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa
+             conj_t  conjalpha,
+             doff_t  diagoffa,
+             dim_t   m,
+             dim_t   n,
+       const ctype*  alpha,
+             ctype*  a, inc_t rsa, inc_t csa
      );
 ```
 
@@ -600,14 +599,14 @@ void bli_?scald
 ```c
 void bli_?scal2d
      (
-       doff_t   diagoffa,
-       diag_t   diaga,
-       trans_t  transa,
-       dim_t    m,
-       dim_t    n,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb
+             doff_t   diagoffa,
+             diag_t   diaga,
+             trans_t  transa,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+             ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 
@@ -617,12 +616,12 @@ void bli_?scal2d
 ```c
 void bli_?setd
      (
-       conj_t  conjalpha,
-       doff_t  diagoffa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa
+             conj_t  conjalpha,
+             doff_t  diagoffa,
+             dim_t   m,
+             dim_t   n,
+       const ctype*  alpha,
+             ctype*  a, inc_t rsa, inc_t csa
      );
 ```
 
@@ -632,11 +631,11 @@ void bli_?setd
 ```c
 void bli_?setid
      (
-       doff_t    diagoffa,
-       dim_t     m,
-       dim_t     n,
-       ctype_r*  alpha,
-       ctype*    a, inc_t rsa, inc_t csa
+             doff_t    diagoffa,
+             dim_t     m,
+             dim_t     n,
+       const ctype_r*  alpha,
+             ctype*    a, inc_t rsa, inc_t csa
      );
 ```
 Set the imaginary components of every element along the diagonal of `a`, as
@@ -650,11 +649,11 @@ of `a`.
 ```c
 void bli_?shiftd
      (
-       doff_t  diagoffa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa
+             doff_t  diagoffa,
+             dim_t   m,
+             dim_t   n,
+       const ctype*  alpha,
+             ctype*  a, inc_t rsa, inc_t csa
      );
 ```
 Add a constant value `alpha` to every element along the diagonal of `a`, as
@@ -666,13 +665,13 @@ specified by `diagoffa`.
 ```c
 void bli_?subd
      (
-       doff_t   diagoffa,
-       diag_t   diaga,
-       trans_t  transa,
-       dim_t    m,
-       dim_t    n,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb
+             doff_t   diagoffa,
+             diag_t   diaga,
+             trans_t  transa,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   a, inc_t rsa, inc_t csa,
+             ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 
@@ -682,14 +681,14 @@ void bli_?subd
 ```c
 void bli_?xpbyd
      (
-       doff_t   diagoffa,
-       diag_t   diaga,
-       trans_t  transa,
-       dim_t    m,
-       dim_t    n,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   beta,
-       ctype*   b, inc_t rsb, inc_t csb
+             doff_t   diagoffa,
+             diag_t   diaga,
+             trans_t  transa,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   a, inc_t rsa, inc_t csa,
+       const ctype*   beta,
+             ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 
@@ -707,14 +706,14 @@ Level-1m operations perform various level-1 BLAS-like operations on matrices (he
 ```c
 void bli_?addm
      (
-       doff_t   diagoffa,
-       diag_t   diaga,
-       uplo_t   uploa,
-       trans_t  transa,
-       dim_t    m,
-       dim_t    n,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb
+             doff_t   diagoffa,
+             diag_t   diaga,
+             uplo_t   uploa,
+             trans_t  transa,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   a, inc_t rsa, inc_t csa,
+             ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 Perform
@@ -729,15 +728,15 @@ where `B` is an _m x n_ matrix, `A` is stored as a dense matrix, or lower- or up
 ```c
 void bli_?axpym
      (
-       doff_t   diagoffa,
-       diag_t   diaga,
-       uplo_t   uploa,
-       trans_t  transa,
-       dim_t    m,
-       dim_t    n,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb
+             doff_t   diagoffa,
+             diag_t   diaga,
+             uplo_t   uploa,
+             trans_t  transa,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+             ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 Perform
@@ -752,14 +751,14 @@ where `B` is an _m x n_ matrix, `A` is stored as a dense matrix, or lower- or up
 ```c
 void bli_?copym
      (
-       doff_t   diagoffa,
-       diag_t   diaga,
-       uplo_t   uploa,
-       trans_t  transa,
-       dim_t    m,
-       dim_t    n,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb
+             doff_t   diagoffa,
+             diag_t   diaga,
+             uplo_t   uploa,
+             trans_t  transa,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   a, inc_t rsa, inc_t csa,
+             ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 Perform
@@ -774,13 +773,13 @@ where `B` is an _m x n_ matrix, `A` is stored as a dense matrix, or lower- or up
 ```c
 void bli_?invscalm
      (
-       conj_t  conjalpha,
-       doff_t  diagoffa,
-       uplo_t  uploa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa
+             conj_t  conjalpha,
+             doff_t  diagoffa,
+             uplo_t  uploa,
+             dim_t   m,
+             dim_t   n,
+       const ctype*  alpha,
+             ctype*  a, inc_t rsa, inc_t csa
      );
 ```
 Perform
@@ -795,13 +794,13 @@ where `A` is an _m x n_ matrix stored as a dense matrix, or lower- or upper-tria
 ```c
 void bli_?scalm
      (
-       conj_t  conjalpha,
-       doff_t  diagoffa,
-       uplo_t  uploa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa
+             conj_t  conjalpha,
+             doff_t  diagoffa,
+             uplo_t  uploa,
+             dim_t   m,
+             dim_t   n,
+       const ctype*  alpha,
+             ctype*  a, inc_t rsa, inc_t csa
      );
 ```
 Perform
@@ -816,15 +815,15 @@ where `A` is an _m x n_ matrix stored as a dense matrix, or lower- or upper-tria
 ```c
 void bli_?scal2m
      (
-       doff_t   diagoffa,
-       diag_t   diaga,
-       uplo_t   uploa,
-       trans_t  transa,
-       dim_t    m,
-       dim_t    n,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb
+             doff_t   diagoffa,
+             diag_t   diaga,
+             uplo_t   uploa,
+             trans_t  transa,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+             ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 Perform
@@ -839,14 +838,14 @@ where `B` is an _m x n_ matrix, `A` is stored as a dense matrix, or lower- or up
 ```c
 void bli_?setm
      (
-       conj_t  conjalpha,
-       doff_t  diagoffa,
-       diag_t  diaga,
-       uplo_t  uploa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa
+             conj_t  conjalpha,
+             doff_t  diagoffa,
+             diag_t  diaga,
+             uplo_t  uploa,
+             dim_t   m,
+             dim_t   n,
+       const ctype*  alpha,
+             ctype*  a, inc_t rsa, inc_t csa
      );
 ```
 Set all elements of an _m x n_ matrix `A` to `conjalpha(alpha)`, where `A` is stored as a dense matrix, or lower- or upper- triangular/trapezoidal matrix, as specified by `uploa`, with the diagonal offset of `A` specified by `diagoffa` and unit/non-unit nature of the diagonal specified by `diaga`. If `uploa` indicates lower or upper storage, only that part of matrix `A` will be updated.
@@ -857,14 +856,14 @@ Set all elements of an _m x n_ matrix `A` to `conjalpha(alpha)`, where `A` is st
 ```c
 void bli_?subm
      (
-       doff_t   diagoffa,
-       diag_t   diaga,
-       uplo_t   uploa,
-       trans_t  transa,
-       dim_t    m,
-       dim_t    n,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb
+             doff_t   diagoffa,
+             diag_t   diaga,
+             uplo_t   uploa,
+             trans_t  transa,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   a, inc_t rsa, inc_t csa,
+             ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 Perform
@@ -893,14 +892,14 @@ Level-1f kernels are employed when optimizing level-2 operations.
 ```c
 void bli_?axpy2v
      (
-       conj_t  conjx,
-       conj_t  conjy,
-       dim_t   m,
-       ctype*  alphax,
-       ctype*  alphay,
-       ctype*  x, inc_t incx,
-       ctype*  y, inc_t incy,
-       ctype*  z, inc_t incz
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   m,
+       const ctype*  alphax,
+       const ctype*  alphay,
+       const ctype*  x, inc_t incx,
+       const ctype*  y, inc_t incy,
+             ctype*  z, inc_t incz
      );
 ```
 Perform
@@ -915,15 +914,15 @@ where `x`, `y`, and `z` are vectors of length _m_. The kernel, if optimized, is
 ```c
 void bli_?dotaxpyv
      (
-       conj_t  conjxt,
-       conj_t  conjx,
-       conj_t  conjy,
-       dim_t   m,
-       ctype*  alpha,
-       ctype*  x, inc_t incx,
-       ctype*  y, inc_t incy,
-       ctype*  rho,
-       ctype*  z, inc_t incz
+             conj_t  conjxt,
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   m,
+       const ctype*  alpha,
+       const ctype*  x, inc_t incx,
+       const ctype*  y, inc_t incy,
+             ctype*  rho,
+             ctype*  z, inc_t incz
      );
 ```
 Perform
@@ -939,14 +938,14 @@ where `x`, `y`, and `z` are vectors of length _m_ and `alpha` and `rho` are scal
 ```c
 void bli_?axpyf
      (
-       conj_t  conja,
-       conj_t  conjx,
-       dim_t   m,
-       dim_t   b,
-       ctype*  alpha,
-       ctype*  a, inc_t inca, inc_t lda,
-       ctype*  x, inc_t incx,
-       ctype*  y, inc_t incy
+             conj_t  conja,
+             conj_t  conjx,
+             dim_t   m,
+             dim_t   b,
+       const ctype*  alpha,
+       const ctype*  a, inc_t inca, inc_t lda,
+       const ctype*  x, inc_t incx,
+             ctype*  y, inc_t incy
      );
 ```
 Perform
@@ -961,15 +960,15 @@ where `A` is an _m x b_ matrix, and `y` and `x` are vectors. The kernel, if opti
 ```c
 void bli_?dotxf
      (
-       conj_t  conjat,
-       conj_t  conjx,
-       dim_t   m,
-       dim_t   b,
-       ctype*  alpha,
-       ctype*  a, inc_t inca, inc_t lda,
-       ctype*  x, inc_t incx,
-       ctype*  beta,
-       ctype*  y, inc_t incy
+             conj_t  conjat,
+             conj_t  conjx,
+             dim_t   m,
+             dim_t   b,
+       const ctype*  alpha,
+       const ctype*  a, inc_t inca, inc_t lda,
+       const ctype*  x, inc_t incx,
+       const ctype*  beta,
+             ctype*  y, inc_t incy
      );
 ```
 Perform
@@ -984,19 +983,19 @@ where `A` is an _m x b_ matrix, and `y` and `x` are vectors. The kernel, if opti
 ```c
 void bli_?dotxaxpyf
      (
-       conj_t  conjat,
-       conj_t  conja,
-       conj_t  conjw,
-       conj_t  conjx,
-       dim_t   m,
-       dim_t   b,
-       ctype*  alpha,
-       ctype*  a, inc_t inca, inc_t lda,
-       ctype*  w, inc_t incw,
-       ctype*  x, inc_t incx,
-       ctype*  beta,
-       ctype*  y, inc_t incy,
-       ctype*  z, inc_t incz
+             conj_t  conjat,
+             conj_t  conja,
+             conj_t  conjw,
+             conj_t  conjx,
+             dim_t   m,
+             dim_t   b,
+       const ctype*  alpha,
+       const ctype*  a, inc_t inca, inc_t lda,
+       const ctype*  w, inc_t incw,
+       const ctype*  x, inc_t incx,
+       const ctype*  beta,
+             ctype*  y, inc_t incy,
+             ctype*  z, inc_t incz
      );
 ```
 Perform
@@ -1019,15 +1018,15 @@ Level-2 operations perform various level-2 BLAS-like operations.
 ```c
 void bli_?gemv
      (
-       trans_t  transa,
-       conj_t   conjx,
-       dim_t    m,
-       dim_t    n,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   x, inc_t incx,
-       ctype*   beta,
-       ctype*   y, inc_t incy
+             trans_t  transa,
+             conj_t   conjx,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+       const ctype*   x, inc_t incx,
+       const ctype*   beta,
+             ctype*   y, inc_t incy
      );
 ```
 Perform
@@ -1042,14 +1041,14 @@ where `transa(A)` is an _m x n_ matrix, and `y` and `x` are vectors.
 ```c
 void bli_?ger
      (
-       conj_t  conjx,
-       conj_t  conjy,
-       dim_t   m,
-       dim_t   n,
-       ctype*  alpha,
-       ctype*  x, inc_t incx,
-       ctype*  y, inc_t incy,
-       ctype*  a, inc_t rsa, inc_t csa
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   m,
+             dim_t   n,
+       const ctype*  alpha,
+       const ctype*  x, inc_t incx,
+       const ctype*  y, inc_t incy,
+             ctype*  a, inc_t rsa, inc_t csa
      );
 ```
 Perform
@@ -1064,15 +1063,15 @@ where `A` is an _m x n_ matrix, and `x` and `y` are vectors of length _m_ and _n
 ```c
 void bli_?hemv
      (
-       uplo_t  uploa,
-       conj_t  conja,
-       conj_t  conjx,
-       dim_t   m,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  x, inc_t incx,
-       ctype*  beta,
-       ctype*  y, inc_t incy
+             uplo_t  uploa,
+             conj_t  conja,
+             conj_t  conjx,
+             dim_t   m,
+       const ctype*  alpha,
+       const ctype*  a, inc_t rsa, inc_t csa,
+       const ctype*  x, inc_t incx,
+       const ctype*  beta,
+             ctype*  y, inc_t incy
      );
 ```
 Perform
@@ -1087,12 +1086,12 @@ where `A` is an _m x m_ Hermitian matrix stored in the lower or upper triangle a
 ```c
 void bli_?her
      (
-       uplo_t  uploa,
-       conj_t  conjx,
-       dim_t   m,
-       rtype*  alpha,
-       ctype*  x, inc_t incx,
-       ctype*  a, inc_t rsa, inc_t csa
+             uplo_t  uploa,
+             conj_t  conjx,
+             dim_t   m,
+       const rtype*  alpha,
+       const ctype*  x, inc_t incx,
+             ctype*  a, inc_t rsa, inc_t csa
      );
 ```
 Perform
@@ -1109,14 +1108,14 @@ where `A` is an _m x m_ Hermitian matrix stored in the lower or upper triangle a
 ```c
 void bli_?her2
      (
-       uplo_t  uploa,
-       conj_t  conjx,
-       conj_t  conjy,
-       dim_t   m,
-       ctype*  alpha,
-       ctype*  x, inc_t incx,
-       ctype*  y, inc_t incy,
-       ctype*  a, inc_t rsa, inc_t csa
+             uplo_t  uploa,
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   m,
+       const ctype*  alpha,
+       const ctype*  x, inc_t incx,
+       const ctype*  y, inc_t incy,
+             ctype*  a, inc_t rsa, inc_t csa
      );
 ```
 Perform
@@ -1131,15 +1130,15 @@ where `A` is an _m x m_ Hermitian matrix stored in the lower or upper triangle a
 ```c
 void bli_?symv
      (
-       uplo_t  uploa,
-       conj_t  conja,
-       conj_t  conjx,
-       dim_t   m,
-       ctype*  alpha,
-       ctype*  a, inc_t rsa, inc_t csa,
-       ctype*  x, inc_t incx,
-       ctype*  beta,
-       ctype*  y, inc_t incy
+             uplo_t  uploa,
+             conj_t  conja,
+             conj_t  conjx,
+             dim_t   m,
+       const ctype*  alpha,
+       const ctype*  a, inc_t rsa, inc_t csa,
+       const ctype*  x, inc_t incx,
+       const ctype*  beta,
+             ctype*  y, inc_t incy
      );
 ```
 Perform
@@ -1154,12 +1153,12 @@ where `A` is an _m x m_ symmetric matrix stored in the lower or upper triangle a
 ```c
 void bli_?syr
      (
-       uplo_t  uploa,
-       conj_t  conjx,
-       dim_t   m,
-       ctype*  alpha,
-       ctype*  x, inc_t incx,
-       ctype*  a, inc_t rsa, inc_t csa
+             uplo_t  uploa,
+             conj_t  conjx,
+             dim_t   m,
+       const ctype*  alpha,
+       const ctype*  x, inc_t incx,
+             ctype*  a, inc_t rsa, inc_t csa
      );
 ```
 Perform
@@ -1174,14 +1173,14 @@ where `A` is an _m x m_ symmetric matrix stored in the lower or upper triangle a
 ```c
 void bli_?syr2
      (
-       uplo_t  uploa,
-       conj_t  conjx,
-       conj_t  conjy,
-       dim_t   m,
-       ctype*  alpha,
-       ctype*  x, inc_t incx,
-       ctype*  y, inc_t incy,
-       ctype*  a, inc_t rsa, inc_t csa
+             uplo_t  uploa,
+             conj_t  conjx,
+             conj_t  conjy,
+             dim_t   m,
+       const ctype*  alpha,
+       const ctype*  x, inc_t incx,
+       const ctype*  y, inc_t incy,
+             ctype*  a, inc_t rsa, inc_t csa
      );
 ```
 Perform
@@ -1196,13 +1195,13 @@ where `A` is an _m x m_ symmetric matrix stored in the lower or upper triangle a
 ```c
 void bli_?trmv
      (
-       uplo_t   uploa,
-       trans_t  transa,
-       diag_t   diaga,
-       dim_t    m,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   x, inc_t incx
+             uplo_t   uploa,
+             trans_t  transa,
+             diag_t   diaga,
+             dim_t    m,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+             ctype*   x, inc_t incx
      );
 ```
 Perform
@@ -1217,13 +1216,13 @@ where `A` is an _m x m_ triangular matrix stored in the lower or upper triangle
 ```c
 void bli_?trsv
      (
-       uplo_t   uploa,
-       trans_t  transa,
-       diag_t   diaga,
-       dim_t    m,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   y, inc_t incy
+             uplo_t   uploa,
+             trans_t  transa,
+             diag_t   diaga,
+             dim_t    m,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+             ctype*   y, inc_t incy
      );
 ```
 Solve the linear system
@@ -1248,16 +1247,16 @@ Level-3 operations perform various level-3 BLAS-like operations.
 ```c
 void bli_?gemm
      (
-       trans_t  transa,
-       trans_t  transb,
-       dim_t    m,
-       dim_t    n,
-       dim_t    k,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb,
-       ctype*   beta,
-       ctype*   c, inc_t rsc, inc_t csc
+             trans_t  transa,
+             trans_t  transb,
+             dim_t    m,
+             dim_t    n,
+             dim_t    k,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+       const ctype*   b, inc_t rsb, inc_t csb,
+       const ctype*   beta,
+             ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1272,16 +1271,16 @@ where C is an _m x n_ matrix, `transa(A)` is an _m x k_ matrix, and `transb(B)`
 ```c
 void bli_?gemmt
      (
-       uplo_t   uploc,
-       trans_t  transa,
-       trans_t  transb,
-       dim_t    m,
-       dim_t    k,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb,
-       ctype*   beta,
-       ctype*   c, inc_t rsc, inc_t csc
+             uplo_t   uploc,
+             trans_t  transa,
+             trans_t  transb,
+             dim_t    m,
+             dim_t    k,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+       const ctype*   b, inc_t rsb, inc_t csb,
+       const ctype*   beta,
+             ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1296,17 +1295,17 @@ where C is an _m x m_ matrix, `transa(A)` is an _m x k_ matrix, and `transb(B)`
 ```c
 void bli_?hemm
      (
-       side_t   sidea,
-       uplo_t   uploa,
-       conj_t   conja,
-       trans_t  transb,
-       dim_t    m,
-       dim_t    n,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb,
-       ctype*   beta,
-       ctype*   c, inc_t rsc, inc_t csc
+             side_t   sidea,
+             uplo_t   uploa,
+             conj_t   conja,
+             trans_t  transb,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+       const ctype*   b, inc_t rsb, inc_t csb,
+       const ctype*   beta,
+             ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1325,14 +1324,14 @@ if `sidea` is `BLIS_RIGHT`, where `C` and `B` are _m x n_ matrices and `A` is a
 ```c
 void bli_?herk
      (
-       uplo_t   uploc,
-       trans_t  transa,
-       dim_t    m,
-       dim_t    k,
-       rtype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       rtype*   beta,
-       ctype*   c, inc_t rsc, inc_t csc
+             uplo_t   uploc,
+             trans_t  transa,
+             dim_t    m,
+             dim_t    k,
+       const rtype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+       const rtype*   beta,
+             ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1349,16 +1348,16 @@ where C is an _m x m_ Hermitian matrix stored in the lower or upper triangle as
 ```c
 void bli_?her2k
      (
-       uplo_t   uploc,
-       trans_t  transa,
-       trans_t  transb,
-       dim_t    m,
-       dim_t    k,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb,
-       rtype*   beta,
-       ctype*   c, inc_t rsc, inc_t csc
+             uplo_t   uploc,
+             trans_t  transa,
+             trans_t  transb,
+             dim_t    m,
+             dim_t    k,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+       const ctype*   b, inc_t rsb, inc_t csb,
+       const rtype*   beta,
+             ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1375,17 +1374,17 @@ where C is an _m x m_ Hermitian matrix stored in the lower or upper triangle as
 ```c
 void bli_?symm
      (
-       side_t   sidea,
-       uplo_t   uploa,
-       conj_t   conja,
-       trans_t  transb,
-       dim_t    m,
-       dim_t    n,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb,
-       ctype*   beta,
-       ctype*   c, inc_t rsc, inc_t csc
+             side_t   sidea,
+             uplo_t   uploa,
+             conj_t   conja,
+             trans_t  transb,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+       const ctype*   b, inc_t rsb, inc_t csb,
+       const ctype*   beta,
+             ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1404,14 +1403,14 @@ if `sidea` is `BLIS_RIGHT`, where `C` and `B` are _m x n_ matrices and `A` is a
 ```c
 void bli_?syrk
      (
-       uplo_t   uploc,
-       trans_t  transa,
-       dim_t    m,
-       dim_t    k,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   beta,
-       ctype*   c, inc_t rsc, inc_t csc
+             uplo_t   uploc,
+             trans_t  transa,
+             dim_t    m,
+             dim_t    k,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+       const ctype*   beta,
+             ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1426,16 +1425,16 @@ where C is an _m x m_ symmetric matrix stored in the lower or upper triangle as
 ```c
 void bli_?syr2k
      (
-       uplo_t   uploc,
-       trans_t  transa,
-       trans_t  transb,
-       dim_t    m,
-       dim_t    k,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb,
-       ctype*   beta,
-       ctype*   c, inc_t rsc, inc_t csc
+             uplo_t   uploc,
+             trans_t  transa,
+             trans_t  transb,
+             dim_t    m,
+             dim_t    k,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+       const ctype*   b, inc_t rsb, inc_t csb,
+       const ctype*   beta,
+             ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1450,15 +1449,15 @@ where C is an _m x m_ symmetric matrix stored in the lower or upper triangle as
 ```c
 void bli_?trmm
      (
-       side_t   sidea,
-       uplo_t   uploa,
-       trans_t  transa,
-       diag_t   diaga,
-       dim_t    m,
-       dim_t    n,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb
+             side_t   sidea,
+             uplo_t   uploa,
+             trans_t  transa,
+             diag_t   diaga,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+             ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 Perform
@@ -1477,18 +1476,18 @@ if `sidea` is `BLIS_RIGHT`, where `B` is an _m x n_ matrix and `A` is a triangul
 ```c
 void bli_?trmm3
      (
-       side_t   sidea,
-       uplo_t   uploa,
-       trans_t  transa,
-       diag_t   diaga,
-       trans_t  transb,
-       dim_t    m,
-       dim_t    n,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb,
-       ctype*   beta,
-       ctype*   c, inc_t rsc, inc_t csc
+             side_t   sidea,
+             uplo_t   uploa,
+             trans_t  transa,
+             diag_t   diaga,
+             trans_t  transb,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+       const ctype*   b, inc_t rsb, inc_t csb,
+       const ctype*   beta,
+             ctype*   c, inc_t rsc, inc_t csc
      );
 ```
 Perform
@@ -1507,15 +1506,15 @@ if `sidea` is `BLIS_RIGHT`, where `C` and `transb(B)` are _m x n_ matrices and `
 ```c
 void bli_?trsm
      (
-       side_t   sidea,
-       uplo_t   uploa,
-       trans_t  transa,
-       diag_t   diaga,
-       dim_t    m,
-       dim_t    n,
-       ctype*   alpha,
-       ctype*   a, inc_t rsa, inc_t csa,
-       ctype*   b, inc_t rsb, inc_t csb
+             side_t   sidea,
+             uplo_t   uploa,
+             trans_t  transa,
+             diag_t   diaga,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   alpha,
+       const ctype*   a, inc_t rsa, inc_t csa,
+             ctype*   b, inc_t rsb, inc_t csb
      );
 ```
 Solve the linear system with multiple right-hand sides
@@ -1539,9 +1538,9 @@ if `sidea` is `BLIS_RIGHT`, where `X` and `B` are an _m x n_ matrices and `A` is
 ```c
 void bli_?asumv
      (
-       dim_t   n,
-       ctype*  x, inc_t incx,
-       rtype*  asum
+             dim_t   n,
+       const ctype*  x, inc_t incx,
+             rtype*  asum
      );
 ```
 Compute the sum of the absolute values of the fundamental elements of vector `x`. The resulting sum is stored to `asum`.
@@ -1557,13 +1556,13 @@ Compute the sum of the absolute values of the fundamental elements of vector `x`
 ```c
 void bli_?norm[1fi]m
      (
-       doff_t  diagoffa,
-       doff_t  diaga,
-       uplo_t  uploa,
-       dim_t   m,
-       dim_t   n,
-       ctype*  a, inc_t rs_a, inc_t cs_a,
-       rtype*  norm
+             doff_t  diagoffa,
+             diag_t  diaga,
+             uplo_t  uploa,
+             dim_t   m,
+             dim_t   n,
+       const ctype*  a, inc_t rs_a, inc_t cs_a,
+             rtype*  norm
      );
 ```
 Compute the one-norm (`bli_?norm1m()`), Frobenius norm (`bli_?normfm()`), or infinity norm (`bli_?normim()`) of the elements in an _m x n_ matrix `A`. If `uploa` is `BLIS_LOWER` or `BLIS_UPPER` then `A` is assumed to be lower or upper triangular, respectively, with the main diagonal located at offset `diagoffa`. The resulting norm is stored to `norm`.
@@ -1578,9 +1577,9 @@ Compute the one-norm (`bli_?norm1m()`), Frobenius norm (`bli_?normfm()`), or inf
 ```c
 void bli_?norm[1fi]v
      (
-       dim_t   n,
-       ctype*  x, inc_t incx,
-       rtype*  norm
+             dim_t   n,
+       const ctype*  x, inc_t incx,
+             rtype*  norm
      );
 ```
 Compute the one-norm (`bli_?norm1v()`), Frobenius norm (`bli_?normfv()`), or infinity norm (`bli_?normiv()`) of the elements in a vector `x` of length _n_. The resulting norm is stored to `norm`.
@@ -1632,12 +1631,12 @@ Make an _m x m_ matrix `A` explicitly triangular by preserving the triangle spec
 ```c
 void bli_?fprintv
      (
-       FILE*   file,
-       char*   s1,
-       dim_t   m,
-       ctype*  x, inc_t incx,
-       char*   format,
-       char*   s2
+             FILE*   file,
+       const char*   s1,
+             dim_t   m,
+       const ctype*  x, inc_t incx,
+       const char*   format,
+       const char*   s2
      );
 ```
 Print a vector `x` of length _m_ to file stream `file`, where `file` is a file pointer returned by the standard C library function `fopen()`. The caller may also pass in a global file pointer such as `stdout` or `stderr`. The strings `s1` and `s2` are printed immediately before and after the output (respectively), and the format specifier `format` is used to format the individual elements. For valid format specifiers, please see documentation for the standard C library function `printf()`.
@@ -1650,13 +1649,13 @@ Print a vector `x` of length _m_ to file stream `file`, where `file` is a file p
 ```c
 void bli_?fprintm
      (
-       FILE*   file,
-       char*   s1,
-       dim_t   m,
-       dim_t   n,
-       ctype*  a, inc_t rs_a, inc_t cs_a,
-       char*   format,
-       char*   s2
+             FILE*   file,
+       const char*   s1,
+             dim_t   m,
+             dim_t   n,
+       const ctype*  a, inc_t rs_a, inc_t cs_a,
+       const char*   format,
+       const char*   s2
      );
 ```
 Print an _m x n_ matrix `A` to file stream `file`, where `file` is a file pointer returned by the standard C library function `fopen()`. The caller may also pass in a global file pointer such as `stdout` or `stderr`. The strings `s1` and `s2` are printed immediately before and after the output (respectively), and the format specifier `format` is used to format the individual elements. For valid format specifiers, please see documentation for the standard C library function `printf()`.
@@ -1669,11 +1668,11 @@ Print an _m x n_ matrix `A` to file stream `file`, where `file` is a file pointe
 ```c
 void bli_?printv
      (
-       char*   s1,
-       dim_t   m,
-       ctype*  x, inc_t incx,
-       char*   format,
-       char*   s2
+       const char*   s1,
+             dim_t   m,
+       const ctype*  x, inc_t incx,
+       const char*   format,
+       const char*   s2
      );
 ```
 Print a vector `x` of length _m_ to standard output. This function call is equivalent to calling `bli_?fprintv()` with `stdout` as the file pointer.
@@ -1684,12 +1683,12 @@ Print a vector `x` of length _m_ to standard output. This function call is equiv
 ```c
 void bli_?printm
      (
-       char*   s1,
-       dim_t   m,
-       dim_t   n,
-       ctype*  a, inc_t rs_a, inc_t cs_a,
-       char*   format,
-       char*   s2
+       const char*   s1,
+             dim_t   m,
+             dim_t   n,
+       const ctype*  a, inc_t rs_a, inc_t cs_a,
+       const char*   format,
+       const char*   s2
      );
 ```
 Print an _m x n_ matrix `a` to standard output. This function call is equivalent to calling `bli_?fprintm()` with `stdout` as the file pointer.
@@ -1731,10 +1730,10 @@ Set the elements of an _m x n_ matrix `A` to random values on the interval `[-1,
 ```c
 void bli_?sumsqv
      (
-       dim_t   n,
-       ctype*  x, inc_t incx,
-       rtype*  scale,
-       rtype*  sumsq
+             dim_t   n,
+       const ctype*  x, inc_t incx,
+             rtype*  scale,
+             rtype*  sumsq
      );
 ```
 Compute the sum of the squares of the elements in a vector `x` of length _n_. The result is computed in scaled form, and in such a way that it may be used repeatedly to accumulate the sum of the squares of several vectors.
@@ -1753,9 +1752,9 @@ where, on entry, `scale` and `sumsq` contain `scale_old` and `sumsq_old`, respec
 ```c
 void bli_?getsc
      (
-       ctype*   chi,
-       double*  zeta_r,
-       double*  zeta_i
+       const ctype*   chi,
+             double*  zeta_r,
+             double*  zeta_i
      );
 ```
 Copy the real and imaginary values from the scalar object `chi` to `zeta_r` and `zeta_i`. If `chi` is stored as a real type, then `zeta_i` is set to zero. (If `chi` is stored in single precision, the corresponding elements are typecast/promoted during the copy.)
@@ -1766,10 +1765,10 @@ Copy the real and imaginary values from the scalar object `chi` to `zeta_r` and
 ```c
 err_t bli_?getijv
      (
-       dim_t    i,
-       ctype*   x, inc_t incx,
-       double*  ar,
-       double*  ai
+             dim_t    i,
+       const void*    x, inc_t incx,
+             double*  ar,
+             double*  ai
      );
 ```
 Copy the real and imaginary values at the `i`th element of vector `x` to `ar` and `ai`. For real domain invocations, only `ar` is overwritten and `ai` is left unchanged. (If `x` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.)
@@ -1781,11 +1780,11 @@ Note that the object-based analogue of [getijv](BLISObjectAPI.md#getijv) does bo
 ```c
 err_t bli_?getijm
      (
-       dim_t    i,
-       dim_t    j,
-       ctype*   b, inc_t rs_b, inc_t cs_b,
-       double*  ar,
-       double*  ai
+             dim_t    i,
+             dim_t    j,
+       const void*    b, inc_t rs_b, inc_t cs_b,
+             double*  ar,
+             double*  ai
      );
 ```
 Copy the real and imaginary values at the (`i`,`j`) element of object `b` to `ar` and `ai`. For real domain invocations, only `ar` is overwritten and `ai` is left unchanged. (If `b` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.)
@@ -1813,7 +1812,7 @@ err_t bli_?setijv
        double  ar,
        double  ai,
        dim_t   i,
-       ctype*  x, inc_t incx
+       void*   x, inc_t incx
      );
 ```
 Copy real and imaginary values `ar` and `ai` to the `i`th element of vector object `x`. For real domain invocations, only `ar` is copied and `ai` is ignored. (If `x` contains elements stored in single precision, the corresponding elements are typecast/demoted during the copy.)
@@ -1829,7 +1828,7 @@ err_t bli_?setijm
        double  ai,
        dim_t   i,
        dim_t   j,
-       ctype*  b, inc_t rs_b, inc_t cs_b
+       void*   b, inc_t rs_b, inc_t cs_b
      );
 ```
 Copy real and imaginary values `ar` and `ai` to the (`i`,`j`) element of object `b`. For real domain invocations, only `ar` is copied and `ai` is ignored. (If `b` contains elements stored in single precision, the corresponding elements are typecast/demoted during the copy.)
@@ -1841,10 +1840,10 @@ Note that the object-based analogue of [setijm](BLISObjectAPI.md#setijm) does bo
 ```c
 void bli_?eqsc
      (
-       conj_t  conjchi,
-       ctype*  chi,
-       ctype*  psi,
-       bool*   is_eq
+             conj_t  conjchi,
+       const ctype*  chi,
+       const ctype*  psi,
+             bool*   is_eq
      );
 ```
 Perform an element-wise comparison between scalars `chi` and `psi` and store the boolean result in the `bool` pointed to by `is_eq`.
@@ -1856,11 +1855,11 @@ If `conjchi` indicates a conjugation, `chi` will be implicitly conjugated for pu
 ```c
 void bli_?eqv
      (
-       conj_t  conjx,
-       dim_t   n,
-       ctype*  x, inc_t incx,
-       ctype*  y, inc_t incy,
-       bool*   is_eq
+             conj_t  conjx,
+             dim_t   n,
+       const ctype*  x, inc_t incx,
+       const ctype*  y, inc_t incy,
+             bool*   is_eq
      );
 ```
 Perform an element-wise comparison between length _n_ vectors `x` and `y` and store the boolean result in the `bool` pointed to by `is_eq`.
@@ -1872,15 +1871,15 @@ If `conjx` indicates a conjugation, `x` will be implicitly conjugated for purpos
 ```c
 void bli_?eqm
      (
-       doff_t   diagoffa,
-       diag_t   diaga,
-       uplo_t   uploa,
-       trans_t  transa,
-       dim_t    m,
-       dim_t    n,
-       ctype*   a, inc_t rs_a, inc_t cs_a,
-       ctype*   b, inc_t rs_b, inc_t cs_b,
-       bool*    is_eq
+             doff_t   diagoffa,
+             diag_t   diaga,
+             uplo_t   uploa,
+             trans_t  transa,
+             dim_t    m,
+             dim_t    n,
+       const ctype*   a, inc_t rs_a, inc_t cs_a,
+       const ctype*   b, inc_t rs_b, inc_t cs_b,
+             bool*    is_eq
      );
 ```
 Perform an element-wise comparison between matrices `A` and `B` and store the boolean result in the `bool` pointed to by `is_eq`.
@@ -1903,14 +1902,16 @@ If `transa` indicates a conjugation and/or transposition, then `A` will be conju
 ```c
 void bli_?gemm_*
      (
-       dim_t                k,
-       ctype*      restrict alpha,
-       ctype*      restrict a1,
-       ctype*      restrict b1,
-       ctype*      restrict beta,
-       ctype*      restrict c11, inc_t rsc, inc_t csc,
-       auxinfo_t*  restrict data,
-       cntx_t*     restrict cntx
+             dim_t       m,
+             dim_t       n,
+             dim_t       k,
+       const ctype*      alpha,
+       const ctype*      a1,
+       const ctype*      b1,
+       const ctype*      beta,
+             ctype*      c11, inc_t rsc, inc_t csc,
+       const auxinfo_t*  data,
+       const cntx_t*     cntx
      );
 ```
 Perform
@@ -1928,20 +1929,20 @@ Please see the [Kernel Guide](KernelsHowTo.md) for more information on the `gemm
 ```c
 void bli_?trsm_l_*
      (
-       ctype*      restrict a11,
-       ctype*      restrict b11,
-       ctype*      restrict c11, inc_t rsc, inc_t csc
-       auxinfo_t*  restrict data,
-       cntx_t*     restrict cntx
+       const ctype*      a11,
+             ctype*      b11,
+             ctype*      c11, inc_t rsc, inc_t csc,
+       const auxinfo_t*  data,
+       const cntx_t*     cntx
      );
 
 void bli_?trsm_u_*
      (
-       ctype*      restrict a11,
-       ctype*      restrict b11,
-       ctype*      restrict c11, inc_t rsc, inc_t csc
-       auxinfo_t*  restrict data,
-       cntx_t*     restrict cntx
+       const ctype*      a11,
+             ctype*      b11,
+             ctype*      c11, inc_t rsc, inc_t csc,
+       const auxinfo_t*  data,
+       const cntx_t*     cntx
      );
 ```
 Perform
@@ -1959,28 +1960,32 @@ Please see the [Kernel Guide](KernelsHowTo.md) for more information on the `trsm
 ```c
 void bli_?gemmtrsm_l_*
      (
-       dim_t                k,
-       ctype*      restrict alpha,
-       ctype*      restrict a10,
-       ctype*      restrict a11,
-       ctype*      restrict b01,
-       ctype*      restrict b11,
-       ctype*      restrict c11, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*  restrict data,
-       cntx_t*     restrict cntx
+             dim_t       m,
+             dim_t       n,
+             dim_t       k,
+       const ctype*      alpha,
+       const ctype*      a10,
+       const ctype*      a11,
+       const ctype*      b01,
+             ctype*      b11,
+             ctype*      c11, inc_t rs_c, inc_t cs_c,
+       const auxinfo_t*  data,
+       const cntx_t*     cntx
      );
 
 void bli_?gemmtrsm_u_*
      (
-       dim_t                k,
-       ctype*      restrict alpha,
-       ctype*      restrict a12,
-       ctype*      restrict a11,
-       ctype*      restrict b21,
-       ctype*      restrict b11,
-       ctype*      restrict c11, inc_t rs_c, inc_t cs_c,
-       auxinfo_t*  restrict data,
-       cntx_t*     restrict cntx
+             dim_t       m,
+             dim_t       n,
+             dim_t       k,
+       const ctype*      alpha,
+       const ctype*      a12,
+       const ctype*      a11,
+       const ctype*      b21,
+             ctype*      b11,
+             ctype*      c11, inc_t rs_c, inc_t cs_c,
+       const auxinfo_t*  data,
+       const cntx_t*     cntx
      );
 ```
 Perform
@@ -2008,7 +2013,7 @@ Please see the [Kernel Guide](KernelsHowTo.md) for more information on the `gemm
 
 BLIS allows applications to query information about how BLIS was configured. The `bli_info_` API provides several categories of query routines. Most values are returned as a `gint_t`, which is a signed integer. The size of this integer can be queried through a special routine that returns the size in a character string:
 ```c
-char* bli_info_get_int_type_size_str( void );
+const char* bli_info_get_int_type_size_str( void );
 ```
 **Note:** All of the `bli_info_` functions are **always** thread-safe, no matter how BLIS was configured.
 
@@ -2016,7 +2021,7 @@ char* bli_info_get_int_type_size_str( void );
 
 The following routine returns the address the full BLIS version string:
 ```c
-char* bli_info_get_version_str( void );
+const char* bli_info_get_version_str( void );
 ```
 
 ## Specific configuration
@@ -2029,7 +2034,7 @@ This is most useful when BLIS is configured with multiple configurations. (When
 
 Once the configuration's ID is known, it can be used to query a string that contains the name of the configuration:
 ```c
-char* bli_arch_string( arch_t id );
+const char* bli_arch_string( arch_t id );
 ```
 
 ## General configuration
@@ -2048,10 +2053,33 @@ gint_t bli_info_get_stack_buf_max_size( void );
 gint_t bli_info_get_stack_buf_align_size( void );
 gint_t bli_info_get_heap_addr_align_size( void );
 gint_t bli_info_get_heap_stride_align_size( void );
-gint_t bli_info_get_pool_addr_align_size( void );
+gint_t bli_info_get_pool_addr_align_size_a( void );
+gint_t bli_info_get_pool_addr_align_size_b( void );
+gint_t bli_info_get_pool_addr_align_size_c( void );
+gint_t bli_info_get_pool_addr_align_size_gen( void );
+gint_t bli_info_get_pool_addr_offset_size_a( void );
+gint_t bli_info_get_pool_addr_offset_size_b( void );
+gint_t bli_info_get_pool_addr_offset_size_c( void );
+gint_t bli_info_get_pool_addr_offset_size_gen( void );
 gint_t bli_info_get_enable_stay_auto_init( void );
 gint_t bli_info_get_enable_blas( void );
+gint_t bli_info_get_enable_cblas( void );
 gint_t bli_info_get_blas_int_type_size( void );
+gint_t bli_info_get_enable_pba_pools( void );
+gint_t bli_info_get_enable_sba_pools( void );
+gint_t bli_info_get_enable_threading( void );
+gint_t bli_info_get_enable_openmp( void );
+gint_t bli_info_get_enable_pthreads( void );
+gint_t bli_info_get_enable_hpx( void );
+gint_t bli_info_get_enable_openmp_as_default( void );
+gint_t bli_info_get_enable_pthreads_as_default( void );
+gint_t bli_info_get_enable_hpx_as_default( void );
+gint_t bli_info_get_thread_jrir_slab( void );
+gint_t bli_info_get_thread_jrir_rr( void );
+gint_t bli_info_get_thread_jrir_tlb( void );
+gint_t bli_info_get_enable_tls( void );
+gint_t bli_info_get_enable_memkind( void );
+gint_t bli_info_get_enable_sandbox( void );
 ```
 
 ## Kernel information
@@ -2061,11 +2089,11 @@ gint_t bli_info_get_blas_int_type_size( void );
 The following routines allow the caller to obtain a string that identifies the implementation type of each microkernel that is currently active (ie: part of the current active configuration, as identified bi `bli_arch_query_id()`).
 
 ```c
-char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt )
-char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt )
-char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt )
-char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt )
-char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt )
+const char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt );
+const char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt );
+const char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt );
+const char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt );
+const char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt );
 ```
 
 Possible implementation (ie: the `ind_t method` argument) types are:
@@ -2083,16 +2111,17 @@ Possible microkernel types (ie: the return values for `bli_info_get_*_ukr_impl_s
 
 The following routines allow the caller to obtain a string that identifies the implementation (`ind_t`) that is currently active (ie: implemented and enabled) for each level-3 operation. Possible implementation types are listed in the section above covering [microkernel implemenation query](BLISTypedAPI.md#microkernel-implementation-type-query).
 ```c
-char* bli_info_get_gemm_impl_string( num_t dt );
-char* bli_info_get_hemm_impl_string( num_t dt );
-char* bli_info_get_herk_impl_string( num_t dt );
-char* bli_info_get_her2k_impl_string( num_t dt );
-char* bli_info_get_symm_impl_string( num_t dt );
-char* bli_info_get_syrk_impl_string( num_t dt );
-char* bli_info_get_syr2k_impl_string( num_t dt );
-char* bli_info_get_trmm_impl_string( num_t dt );
-char* bli_info_get_trmm3_impl_string( num_t dt );
-char* bli_info_get_trsm_impl_string( num_t dt );
+const char* bli_info_get_gemm_impl_string( num_t dt );
+const char* bli_info_get_gemmt_impl_string( num_t dt );
+const char* bli_info_get_hemm_impl_string( num_t dt );
+const char* bli_info_get_herk_impl_string( num_t dt );
+const char* bli_info_get_her2k_impl_string( num_t dt );
+const char* bli_info_get_symm_impl_string( num_t dt );
+const char* bli_info_get_syrk_impl_string( num_t dt );
+const char* bli_info_get_syr2k_impl_string( num_t dt );
+const char* bli_info_get_trmm_impl_string( num_t dt );
+const char* bli_info_get_trmm3_impl_string( num_t dt );
+const char* bli_info_get_trsm_impl_string( num_t dt );
 ```
 
 
@@ -2102,10 +2131,7 @@ char* bli_info_get_trsm_impl_string( num_t dt );
 
 #### clock
 ```c
-double bli_clock
-     (
-       void
-     );
+double bli_clock( void );
 ```
 Return the amount of time that has elapsed since some fixed time in the past. The return values of `bli_clock()` typically feature nanosecond precision, though this is not guaranteed.
 
diff --git a/docs/KernelsHowTo.md b/docs/KernelsHowTo.md
index 30a4dc736..0bbe20984 100644
--- a/docs/KernelsHowTo.md
+++ b/docs/KernelsHowTo.md
@@ -315,20 +315,20 @@ Parameters:
 The diagram below shows the packed micropanel operands and how elements of each would be stored when _MR_ = _NR_ = 4. The hex digits indicate the layout and order (but NOT the numeric contents) of the elements in memory. Note that the storage of `C11` is not shown since it is determined by the row and column strides of `C11`.
 
 ```
-         c11:           a1:                        b1:                  
-         _______        ______________________     _______              
-        |       |      |0 4 8 C               |   |0 1 2 3|             
-    MR  |       |      |1 5 9 D . . .         |   |4 5 6 7|             
-        |       |  +=  |2 6 A E               |   |8 9 A B|             
-        |_______|      |3_7_B_F_______________|   |C D E F|             
-                                                  |   .   |             
-            NR                    k               |   .   | k           
-                                                  |   .   |             
-                                                  |       |             
-                                                  |       |             
-                                                  |_______|             
-                                                                        
-                                                      NR                
+         c11:           a1:                        b1:
+         _______        ______________________     _______
+        |       |      |0 4 8 C               |   |0 1 2 3|
+    MR  |       |      |1 5 9 D . . .         |   |4 5 6 7|
+        |       |  +=  |2 6 A E               |   |8 9 A B|
+        |_______|      |3_7_B_F_______________|   |C D E F|
+                                                  |   .   |
+            NR                    k               |   .   | k
+                                                  |   .   |
+                                                  |       |
+                                                  |       |
+                                                  |_______|
+
+                                                      NR
 ```
 
 #### Implementation Notes for gemm
@@ -573,8 +573,8 @@ Parameters:
 The diagram below shows the packed micropanel operands for `trsm_l` and how elements of each would be stored when _MR_ = _NR_ = 4. (The hex digits indicate the layout and order (but NOT the numeric contents) in memory. Here, matrix `A11` (referenced by `a11`) is **lower triangular**. Matrix `A11` **does contain** elements corresponding to the strictly upper triangle, however, they are not guaranteed to contain zeros and thus these elements should not be referenced.
 
 ```
-                                              NR    
-                                            _______ 
+                                              NR
+                                            _______
                                        b01:|0 1 2 3|
                                            |4 5 6 7|
                                            |8 9 A B|
@@ -587,8 +587,8 @@ The diagram below shows the packed micropanel operands for `trsm_l` and how elem
   MR  |1 5 9 D . . .      |  `.    |       |       |
       |2 6 A E            |    `.  |    MR |       |
       |3_7_B_F____________|______`.|       |_______|
-                                                    
-                k             MR                    
+
+                k             MR
 ```
 
 
@@ -597,8 +597,8 @@ The diagram below shows the packed micropanel operands for `trsm_l` and how elem
 The diagram below shows the packed micropanel operands for `trsm_u` and how elements of each would be stored when _MR_ = _NR_ = 4. (The hex digits indicate the layout and order (but NOT the numeric contents) in memory. Here, matrix `A11` (referenced by `a11`) is **upper triangular**. Matrix `A11` **does contain** elements corresponding to the strictly lower triangle, however, they are not guaranteed to contain zeros and thus these elements should not be referenced.
 
 ```
-       a11:     a12:                          NR    
-       ________ ___________________         _______ 
+       a11:     a12:                          NR
+       ________ ___________________         _______
       |`.      |0 4 8              |   b11:|0 1 2 3|
   MR  |  `.    |1 5 9 . . .        |       |4 5 6 7|
       |    `.  |2 6 A              |    MR |8 9 A B|
@@ -611,7 +611,7 @@ The diagram below shows the packed micropanel operands for `trsm_u` and how elem
      starting with a12 to avoid            |       |
      obscuring triangular structure        |       |
      of a11.                               |_______|
-                                                                            
+
 ```
 
 
@@ -659,7 +659,7 @@ void bli_?axpy2v_<suffix>
        ctype*  restrict y, inc_t incy,
        ctype*  restrict z, inc_t incz,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
@@ -683,7 +683,7 @@ void bli_?dotaxpyv_<suffix>
        ctype*  restrict rho,
        ctype*  restrict z, inc_t incz,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
@@ -707,7 +707,7 @@ void bli_?axpyf_<suffix>
        ctype*  restrict x, inc_t incx,
        ctype*  restrict y, inc_t incy,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
@@ -731,7 +731,7 @@ void bli_?dotxf_<suffix>
        ctype*  restrict beta,
        ctype*  restrict y, inc_t incy,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
@@ -761,7 +761,7 @@ void bli_?dotxaxpyf_<suffix>
        ctype*  restrict y, inc_t incy,
        ctype*  restrict z, inc_t incz,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
@@ -789,7 +789,7 @@ void bli_?addv_<suffix>
        ctype*  restrict x, inc_t incx,
        ctype*  restrict y, inc_t incy,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
@@ -807,7 +807,7 @@ void bli_?amaxv_<suffix>
        ctype*  restrict x, inc_t incx,
        dim_t*  restrict index,
        cntx_t* restrict cntx
-     )
+     );
 ```
 Given a vector of length _n_, this kernel returns the zero-based index `index` of the element of vector `x` that contains the largest absolute value (or, in the complex domain, the largest complex modulus).
 
@@ -825,7 +825,7 @@ void bli_?axpyv_<suffix>
        ctype*  restrict x, inc_t incx,
        ctype*  restrict y, inc_t incy,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
@@ -846,7 +846,7 @@ void bli_?axpbyv_<suffix>
        ctype*  restrict beta,
        ctype*  restrict y, inc_t incy,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
@@ -865,7 +865,7 @@ void bli_?copyv_<suffix>
        ctype*  restrict x, inc_t incx,
        ctype*  restrict y, inc_t incy,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
@@ -886,7 +886,7 @@ void bli_?dotv_<suffix>
        ctype*  restrict y, inc_t incy,
        ctype*  restrict rho,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
@@ -909,7 +909,7 @@ void bli_?dotxv_<suffix>
        ctype*  restrict beta,
        ctype*  restrict rho,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
@@ -926,7 +926,7 @@ void bli_?invertv_<suffix>
        dim_t            n,
        ctype*  restrict x, inc_t incx,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel inverts all elements of an _n_-length vector `x`.
 
@@ -941,7 +941,7 @@ void bli_?invscalv_<suffix>
        ctype*  restrict alpha,
        ctype*  restrict x, inc_t incx,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
@@ -960,7 +960,7 @@ void bli_?scalv_<suffix>
        ctype*  restrict alpha,
        ctype*  restrict x, inc_t incx,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
@@ -980,7 +980,7 @@ void bli_?scal2v_<suffix>
        ctype*  restrict x, inc_t incx,
        ctype*  restrict y, inc_t incy,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
@@ -999,7 +999,7 @@ void bli_?setv_<suffix>
        ctype*  restrict alpha,
        ctype*  restrict x, inc_t incx,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
@@ -1018,7 +1018,7 @@ void bli_?subv_<suffix>
        ctype*  restrict x, inc_t incx,
        ctype*  restrict y, inc_t incy,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
@@ -1036,7 +1036,7 @@ void bli_?swapv_<suffix>
        ctype*  restrict x, inc_t incx,
        ctype*  restrict y, inc_t incy,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel swaps corresponding elements of two _n_-length vectors `x` and `y` stored with strides `incx` and `incy`, respectively.
 
@@ -1052,7 +1052,7 @@ void bli_?xpbyv_<suffix>
        ctype*  restrict beta,
        ctype*  restrict y, inc_t incy,
        cntx_t* restrict cntx
-     )
+     );
 ```
 This kernel performs the following operation:
 ```
diff --git a/docs/Multithreading.md b/docs/Multithreading.md
index 11ca7d1d7..edc9addbb 100644
--- a/docs/Multithreading.md
+++ b/docs/Multithreading.md
@@ -291,6 +291,15 @@ If you want to initialize it as part of the declaration, you may do so via the d
 rntm_t rntm = BLIS_RNTM_INITIALIZER;
 ```
 As of this writing, BLIS treats a default-initialized `rntm_t` as a request for single-threaded execution.
+If your application needs to know the ways of parallelism that were conveyed via environment variables, there is another way to initialize a `rntm_t`: copy the global `rntm_t` object via
+```c
+void bli_rntm_init_from_global( rntm_t* rntm );
+```
+This function may be called as:
+```c
+bli_rntm_init_from_global( &rntm );
+```
+This approach is necessary when running an application with multiple BLIS threads.
 
 **Note**: If you choose to **not** initialize the `rntm_t` object and then pass it into a level-3 operation, **you will almost surely observe undefined behavior!** Please don't do this!
 
diff --git a/docs/Performance.md b/docs/Performance.md
index f4992d1de..2294a46d3 100644
--- a/docs/Performance.md
+++ b/docs/Performance.md
@@ -503,7 +503,7 @@ The `runthese.m` file will contain example invocations of the function.
     * Multithreaded (64 core) execution requested via `export MKL_NUM_THREADS=64`
     * Multithreaded (128 core) execution requested via `export MKL_NUM_THREADS=128`
 * Affinity:
-  * Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0-127"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset. 
+  * Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0-127"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset.
   * All executables were run through `numactl --interleave=all`.
 * Frequency throttling (via `cpupower`):
   * Driver: acpi-cpufreq
diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md
index d12a6df56..1884198e2 100644
--- a/docs/ReleaseNotes.md
+++ b/docs/ReleaseNotes.md
@@ -972,7 +972,7 @@ July 18, 2013
 - Optional (and very much untested) C99 built-in complex type/arithmetic support.
 
 Note that `bli_config.h` has changed since 0.0.8. Added configuration macros are:
-```
+```c
   #define BLIS_ENABLE_C99_COMPLEX
   #define BLIS_ENABLE_BLAS2BLIS_INT64
   #define PASTEF770(name) // ...

From d17c0639e398c35d5a07d939f7ebce016acf3d8a Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Wed, 22 Jan 2025 11:22:59 -0600
Subject: [PATCH 215/230] Add guard in `examples/*/Makefile` to check that
 `common.mk` was actually found successfully. (#842)

Details:
- If the examples are built out-of-tree, then `BLIS_INSTALL_PATH` needs to be set in order to find the header, library, and build system files. Also, if one attempts to build the examples before configuring BLIS, then `common.mk` will be missing.
- Current behavior silently ignores the failed import of `common.mk` which causes various difficult-to-diagnose problems.
- The Android/Bionic detection in common.mk has also been changed to not rely on an external file. This allows examples to be compiled in isolation.

Details:
- When building examples out-of-tree (or potentially other external code using `common.mk`), `DIST_PATH` will not be set and so `common.mk` will not be able to locate `build/detect/android/bionic.h`, causing a compiler error in some cases.
- This has been fixed by including the contents of `bionic.h` in the shell statement executing the compiler check.
- Fixes #840.
---
 build/detect/android/bionic.h | 36 -----------------------------------
 common.mk                     |  5 ++---
 examples/oapi/Makefile        |  8 ++++++++
 examples/tapi/Makefile        | 11 ++++++++++-
 4 files changed, 20 insertions(+), 40 deletions(-)
 delete mode 100644 build/detect/android/bionic.h

diff --git a/build/detect/android/bionic.h b/build/detect/android/bionic.h
deleted file mode 100644
index e9a49610b..000000000
--- a/build/detect/android/bionic.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
-  BLIS
-  An object-based framework for developing high-performance BLAS-like
-  libraries.
-
-  Copyright (C) 2023, The University of Texas at Austin
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are
-  met:
-   - Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the following disclaimer.
-   - Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-   - Neither the name(s) of the copyright holder(s) nor the names of its
-     contributors may be used to endorse or promote products derived
-     from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-/* Detect Bionic on Android */
-#if __BIONIC__
-bionic
-#endif
diff --git a/common.mk b/common.mk
index 5da49eef2..2ad418772 100644
--- a/common.mk
+++ b/common.mk
@@ -1039,9 +1039,8 @@ endif
 
 ifeq ($(OS_NAME),Linux)
 # Exclude -lrt on Android by detecting Bionic.
-# $(CC) -E bionic.h returns a "bionic" substring iff Bionic is detected.
-BIONIC_H_PATH := $(DIST_PATH)/build/detect/android/bionic.h
-BIONIC := $(findstring bionic,$(shell $(CC) -E $(BIONIC_H_PATH)))
+# printf *must* be used here rather than echo -e
+BIONIC := $(findstring bionic,$(shell printf "\#ifdef __BIONIC__\nbionic\n\#endif" | $(CC) -E -))
 ifeq (,$(BIONIC))
 LDFLAGS += -lrt
 endif
diff --git a/examples/oapi/Makefile b/examples/oapi/Makefile
index f12ca227b..69d6fdbbd 100644
--- a/examples/oapi/Makefile
+++ b/examples/oapi/Makefile
@@ -88,6 +88,14 @@ endif
 # Include the common makefile fragment.
 -include $(SHARE_PATH)/common.mk
 
+# Detect whether we actually got the configuration file. If we didn't, then
+# we may be building from an installed version and need BLIS_INSTALL_PATH.
+ifeq ($(strip $(COMMON_MK_INCLUDED)),yes)
+COMMON_MK_PRESENT := yes
+else
+COMMON_MK_PRESENT := no
+$(error common.mk not found, you may need to specify BLIS_INSTALL_PATH)
+endif
 
 
 #
diff --git a/examples/tapi/Makefile b/examples/tapi/Makefile
index 83330d38b..9a8b203f3 100644
--- a/examples/tapi/Makefile
+++ b/examples/tapi/Makefile
@@ -1,6 +1,6 @@
 #
 #
-#  BLIS    
+#  BLIS
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
 #
@@ -76,6 +76,15 @@ endif
 # Include the common makefile fragment.
 -include $(SHARE_PATH)/common.mk
 
+# Detect whether we actually got the configuration file. If we didn't, then
+# we may be building from an installed version and need BLIS_INSTALL_PATH.
+ifeq ($(strip $(COMMON_MK_INCLUDED)),yes)
+COMMON_MK_PRESENT := yes
+else
+COMMON_MK_PRESENT := no
+$(error common.mk not found, you may need to specify BLIS_INSTALL_PATH)
+endif
+
 
 
 #

From 7e8a5891902312a281bce37037eaa06d7d501639 Mon Sep 17 00:00:00 2001
From: Dave Love <dave.love@manchester.ac.uk>
Date: Fri, 24 Jan 2025 21:44:32 +0000
Subject: [PATCH 216/230] Blacklist KNL with GCC 15+ (#844)

Details:
- GCC 15 drops support for Xeon Phi architectures such as KNL.
- This PR blacklists the `knl` configuration for GCC 15+.
---
 configure | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index 676c3b0ad..4df2ed80b 100755
--- a/configure
+++ b/configure
@@ -1772,7 +1772,7 @@ check_compiler()
 	# Specific:
 	#
 	#   skx: icc 15.0.1+, gcc 6.0+, clang 3.9+
-	#   knl: icc 14.0.1+, gcc 5.0+, clang 3.9+
+	#   knl: icc 14.0.1+, gcc 5.0-14, clang 3.9+
 	#   haswell: any
 	#   sandybridge: any
 	#   penryn: any
@@ -1824,7 +1824,7 @@ check_compiler()
 				blacklistcc_add "zen"
 			fi
 		fi
-		if [[ ${cc_major} -lt 5 ]]; then
+		if [[ ${cc_major} -lt 5 ]] || [[ ${cc_major} -gt 14 ]]; then
 			blacklistcc_add "knl"
 		fi
 		if [[ ${cc_major} -lt 6 ]]; then

From a6f2ce9dd53fbe099650d322fa69b21a3be10fb0 Mon Sep 17 00:00:00 2001
From: "M. Zhou" <cdluminate@gmail.com>
Date: Wed, 5 Feb 2025 14:07:01 -0800
Subject: [PATCH 217/230] Alias *gemmt_ as *gemmtr_ to fix lapack 3.12.1
 compatibility. (#849)

Details:
- Alias `?gemmt_` as `?gemmtr_` to fix lapack 3.12.1 compatibility. (Fixes #848)
- Add the `?gemmtr_` and `cblas_?gemmtr` aliases to the symbol list.
- Also alias `cblas_?gemmt` as `cblas_?gemmtr` for lapack 3.12.1 compatibility.
---
 build/libblis-symbols.def                   |  8 ++++++++
 frame/compat/cblas/src/cblas.h              |  4 ++++
 frame/compat/cblas/src/extra/cblas_cgemmt.c |  2 ++
 frame/compat/cblas/src/extra/cblas_dgemmt.c |  2 ++
 frame/compat/cblas/src/extra/cblas_sgemmt.c |  2 ++
 frame/compat/cblas/src/extra/cblas_zgemmt.c |  2 ++
 frame/compat/extra/bla_gemmt.c              | 10 ++++++++--
 frame/compat/extra/bla_gemmt.h              |  1 +
 8 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def
index 190dea77c..36c68fce2 100644
--- a/build/libblis-symbols.def
+++ b/build/libblis-symbols.def
@@ -1248,6 +1248,7 @@ cblas_cgemm
 cblas_cgemm3m
 cblas_cgemm_batch
 cblas_cgemmt
+cblas_cgemmtr
 cblas_cgemv
 cblas_cgerc
 cblas_cgeru
@@ -1284,6 +1285,7 @@ cblas_dgbmv
 cblas_dgemm
 cblas_dgemm_batch
 cblas_dgemmt
+cblas_dgemmtr
 cblas_dgemv
 cblas_dger
 cblas_dnrm2
@@ -1330,6 +1332,7 @@ cblas_sgbmv
 cblas_sgemm
 cblas_sgemm_batch
 cblas_sgemmt
+cblas_sgemmtr
 cblas_sgemv
 cblas_sger
 cblas_snrm2
@@ -1369,6 +1372,7 @@ cblas_zgemm
 cblas_zgemm3m
 cblas_zgemm_batch
 cblas_zgemmt
+cblas_zgemmtr
 cblas_zgemv
 cblas_zgerc
 cblas_zgeru
@@ -1405,6 +1409,7 @@ cgemm3m_
 cgemm_
 cgemm_batch_
 cgemmt_
+cgemmtr_
 cgemv_
 cgerc_
 cgeru_
@@ -1446,6 +1451,7 @@ dgbmv_
 dgemm_
 dgemm_batch_
 dgemmt_
+dgemmtr_
 dgemv_
 dger_
 dnrm2_
@@ -1507,6 +1513,7 @@ sgbmv_
 sgemm_
 sgemm_batch_
 sgemmt_
+sgemmtr_
 sgemv_
 sger_
 snrm2_
@@ -1551,6 +1558,7 @@ zgemm3m_
 zgemm_
 zgemm_batch_
 zgemmt_
+zgemmtr_
 zgemv_
 zgerc_
 zgeru_
diff --git a/frame/compat/cblas/src/cblas.h b/frame/compat/cblas/src/cblas.h
index 97aa38036..32ee4a649 100644
--- a/frame/compat/cblas/src/cblas.h
+++ b/frame/compat/cblas/src/cblas.h
@@ -608,21 +608,25 @@ void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  f77_int N, f77_int K, float alpha, const float *A,
                  f77_int lda, const float *B, f77_int ldb,
                  float beta, float *C, f77_int ldc);
+void BLIS_EXPORT_BLAS cblas_sgemmtr();  // alias to cblas_sgemmt
 void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                  f77_int N, f77_int K, double alpha, const double *A,
                  f77_int lda, const double *B, f77_int ldb,
                  double beta, double *C, f77_int ldc);
+void BLIS_EXPORT_BLAS cblas_dgemmtr();  // alias to cblas_dgemmt
 void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                  f77_int N, f77_int K, const void *alpha, const void *A,
                  f77_int lda, const void *B, f77_int ldb,
                  const void *beta, void *C, f77_int ldc);
+void BLIS_EXPORT_BLAS cblas_cgemmtr();  // alias to cblas_cgemmt
 void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                  f77_int N, f77_int K, const void *alpha, const void *A,
                  f77_int lda, const void *B, f77_int ldb,
                  const void *beta, void *C, f77_int ldc);
+void BLIS_EXPORT_BLAS cblas_zgemmtr();  // alias to cblas_zgemmt
 
 // -- Batch APIs --
 
diff --git a/frame/compat/cblas/src/extra/cblas_cgemmt.c b/frame/compat/cblas/src/extra/cblas_cgemmt.c
index 79d18f041..2dabac6d8 100644
--- a/frame/compat/cblas/src/extra/cblas_cgemmt.c
+++ b/frame/compat/cblas/src/extra/cblas_cgemmt.c
@@ -163,4 +163,6 @@ void cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
    RowMajorStrg = 0;
    return;
 }
+
+void cblas_cgemmtr() __attribute__((alias("cblas_cgemmt")));
 #endif
diff --git a/frame/compat/cblas/src/extra/cblas_dgemmt.c b/frame/compat/cblas/src/extra/cblas_dgemmt.c
index 8677e02b7..74e12c618 100644
--- a/frame/compat/cblas/src/extra/cblas_dgemmt.c
+++ b/frame/compat/cblas/src/extra/cblas_dgemmt.c
@@ -163,4 +163,6 @@ void cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
    RowMajorStrg = 0;
    return;
 }
+
+void cblas_dgemmtr() __attribute__((alias("cblas_dgemmt")));
 #endif
diff --git a/frame/compat/cblas/src/extra/cblas_sgemmt.c b/frame/compat/cblas/src/extra/cblas_sgemmt.c
index abe5ae857..ed3a75e6e 100644
--- a/frame/compat/cblas/src/extra/cblas_sgemmt.c
+++ b/frame/compat/cblas/src/extra/cblas_sgemmt.c
@@ -163,4 +163,6 @@ void cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
    RowMajorStrg = 0;
    return;
 }
+
+void cblas_sgemmtr() __attribute__((alias("cblas_sgemmt")));
 #endif
diff --git a/frame/compat/cblas/src/extra/cblas_zgemmt.c b/frame/compat/cblas/src/extra/cblas_zgemmt.c
index d3d1fa96a..2b6e5ff68 100644
--- a/frame/compat/cblas/src/extra/cblas_zgemmt.c
+++ b/frame/compat/cblas/src/extra/cblas_zgemmt.c
@@ -163,4 +163,6 @@ void cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
    RowMajorStrg = 0;
    return;
 }
+
+void cblas_zgemmtr() __attribute__((alias("cblas_zgemmt")));
 #endif
diff --git a/frame/compat/extra/bla_gemmt.c b/frame/compat/extra/bla_gemmt.c
index 266663df3..73bf25cda 100644
--- a/frame/compat/extra/bla_gemmt.c
+++ b/frame/compat/extra/bla_gemmt.c
@@ -39,6 +39,8 @@
 //
 // Define BLAS-to-BLIS interfaces.
 //
+#define STRINGIFY( name ) #name
+#define EXPAND_AND_STRINGIFY( name ) STRINGIFY( name )
 
 #ifdef BLIS_BLAS3_CALLS_TAPI
 
@@ -118,7 +120,9 @@ void PASTEF77(ch,blasname) \
 \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
-}
+}; \
+void PASTEF77 (ch, blasname ## r )() \
+	__attribute__ ((alias(EXPAND_AND_STRINGIFY(PASTEF77(ch,blasname)))));
 
 #else
 
@@ -221,7 +225,9 @@ void PASTEF77(ch,blasname) \
 \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
-}
+}; \
+void PASTEF77 (ch, blasname ## r )() \
+	__attribute__ ((alias(EXPAND_AND_STRINGIFY(PASTEF77(ch,blasname)))));
 
 #endif
 
diff --git a/frame/compat/extra/bla_gemmt.h b/frame/compat/extra/bla_gemmt.h
index 3bef5a898..9dda6ff30 100644
--- a/frame/compat/extra/bla_gemmt.h
+++ b/frame/compat/extra/bla_gemmt.h
@@ -56,5 +56,6 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
 
 #ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROT_BLAS( gemmt )
+INSERT_GENTPROT_BLAS( gemmtr )
 #endif
 

From 5ad37a860b191f905a7ed895280a8057573ae909 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Wed, 5 Feb 2025 16:10:37 -0600
Subject: [PATCH 218/230] Increase the max size for stack buffers. (#851)

Details:
- See #850 for details on the problem.
- This is a temporary fix which should work for sdcz data types.
- Altra architectures may still not fully work for MP/MD as the stack buffer size is hard-coded.
---
 docs/ConfigurationHowTo.md            | 4 ++--
 frame/base/bli_check.c                | 4 +++-
 frame/include/bli_kernel_macro_defs.h | 7 ++++---
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/docs/ConfigurationHowTo.md b/docs/ConfigurationHowTo.md
index 9217ae9fd..0d4fa4cac 100644
--- a/docs/ConfigurationHowTo.md
+++ b/docs/ConfigurationHowTo.md
@@ -215,9 +215,9 @@ _**SIMD register file.**_ BLIS allows you to specify the _maximum_ number of SIM
 #define BLIS_SIMD_MAX_NUM_REGISTERS  32
 #define BLIS_SIMD_MAX_SIZE           64
 ```
-These macros are used in computing the maximum amount of temporary storage (typically allocated statically, on the function stack) that will be needed to hold a single micro-tile of any datatype (and for any induced method):
+These macros are used in computing the maximum amount of temporary storage (typically allocated statically, on the function stack) that will be needed to hold a single micro-tile of any datatype (and for any induced method or mixed-precision computation):
 ```c
-#define BLIS_STACK_BUF_MAX_SIZE  ( BLIS_SIMD_MAX_NUM_REGISTERS * BLIS_SIMD_MAX_SIZE * 2 )
+#define BLIS_STACK_BUF_MAX_SIZE  ( BLIS_SIMD_MAX_NUM_REGISTERS * BLIS_SIMD_MAX_SIZE * 4 )
 ```
 These temporary buffers are used when handling edge cases (m % _MR_ != 0 || n % _NR_ != 0) within the level-3 macrokernels, and also in the virtual microkernels of various implementations of induced methods for complex matrix multiplication. It is **very important** that these values be set correctly; otherwise, you may experience undefined behavior as stack data is overwritten at run-time. A kernel developer may set `BLIS_SIMD_MAX_NUM_REGISTERS` and `BLIS_SIMD_MAX_SIZE`, which will indirectly affect `BLIS_STACK_BUF_MAX_SIZE`, or he may set `BLIS_STACK_BUF_MAX_SIZE` directly. Notice that the default values are already set to work with modern x86_64 systems.
 
diff --git a/frame/base/bli_check.c b/frame/base/bli_check.c
index e949b6361..39cefd62a 100644
--- a/frame/base/bli_check.c
+++ b/frame/base/bli_check.c
@@ -826,7 +826,9 @@ err_t bli_check_sufficient_stack_buf_size( const cntx_t* cntx )
 	{
 		dim_t mr      = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
 		dim_t nr      = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
-		siz_t dt_size = bli_dt_size( dt );
+		// Always use the size of the largest data type to account for
+		// conversions during mixed-domain/mixed-precision computation.
+		siz_t dt_size = bli_dt_size( BLIS_DCOMPLEX );
 
 		// NOTE: For induced methods, we use the size of the complex datatypes
 		// (rather than the size of the native micro-kernels' datatype) because
diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h
index 8c0f1cb14..f1d906c12 100644
--- a/frame/include/bli_kernel_macro_defs.h
+++ b/frame/include/bli_kernel_macro_defs.h
@@ -183,14 +183,15 @@
 
 // The maximum size in bytes of local stack buffers within macro-kernel
 // functions. These buffers are usually used to store a temporary copy
-// of a single microtile. The reason we multiply by 2 is to handle induced
+// of a single microtile. The reason we multiply by 4 is to handle induced
 // methods, where we use real domain register blocksizes in units of
-// complex elements. Specifically, the macro-kernels will need this larger
+// complex elements, as well as mixed-precision, where data may be
+// converted to a wider type. Specifically, the macro-kernels will need this larger
 // micro-tile footprint, even though the virtual micro-kernels will only
 // ever be writing to half (real or imaginary part) at a time.
 #ifndef BLIS_STACK_BUF_MAX_SIZE
 #define BLIS_STACK_BUF_MAX_SIZE          ( BLIS_SIMD_MAX_NUM_REGISTERS * \
-                                           BLIS_SIMD_MAX_SIZE * 2 )
+                                           BLIS_SIMD_MAX_SIZE * 4 )
 #endif
 
 // Alignment size used to align local stack buffers within macro-kernel

From 028be422e306986674f7b1d96b99153bf2a6477e Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Thu, 6 Feb 2025 23:22:24 -0600
Subject: [PATCH 219/230] Fix problem with clang-14.0.0 and reference `gemm`
 ukr. (#854)

Details:
- clang 14.0.0 apparently makes some invalid assumptions about whether
  or not the AB microtile is initialized in the `gemm` reference
  microkernel. This leads to the "scale by alpha" part doing something
  strange (all sorts of random and even NaN values pop up). I do not
  know why this only manifested for `ztrsm` on `skx` (in
  `zgemm_skx_ref` via `zgemmtrsm_skx_ref`). See #852.
- Aliasing the AB microtile (in the proper datatype) as a pointer to
  a raw character array, and then initializing the character array
  with `= { 0 }` convinces the compiler to do the right thing.
- The problem did not occur in 14.0.6 or 15.0.7, so it may affect only a
  narrow band of versions.
- This commit adds the char array workaround and fixes #852.
---
 ref_kernels/3/bli_gemm_ref.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c
index ab861bcb5..61505eef8 100644
--- a/ref_kernels/3/bli_gemm_ref.c
+++ b/ref_kernels/3/bli_gemm_ref.c
@@ -194,16 +194,15 @@ void PASTEMAC(ch,ch,opname,arch,suf) \
 		return; \
 	} \
 \
-	      ctype ab[ BLIS_STACK_BUF_MAX_SIZE \
-	                / sizeof( ctype ) ] \
-	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const inc_t rs_ab  = nr; \
-	const inc_t cs_ab  = 1; \
-\
-	const inc_t rs_a   = PASTECH(BLIS_BBM_,ch); \
-	const inc_t cs_a   = PASTECH(BLIS_PACKMR_,ch); \
-	const inc_t rs_b   = PASTECH(BLIS_PACKNR_,ch); \
-	const inc_t cs_b   = PASTECH(BLIS_BBN_,ch); \
+	      char   ab_[ BLIS_STACK_BUF_MAX_SIZE ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))) = { 0 }; \
+	      ctype* ab    = (ctype*)ab_; \
+	const inc_t  rs_ab = nr; \
+	const inc_t  cs_ab = 1; \
+\
+	const inc_t  rs_a  = PASTECH(BLIS_BBM_,ch); \
+	const inc_t  cs_a  = PASTECH(BLIS_PACKMR_,ch); \
+	const inc_t  rs_b  = PASTECH(BLIS_PACKNR_,ch); \
+	const inc_t  cs_b  = PASTECH(BLIS_BBN_,ch); \
 \
 \
 	/* Initialize the accumulator elements in ab to zero. */ \

From 40a52dc0289f27f74a43e886ae14bf11738db169 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Sat, 8 Feb 2025 13:47:08 -0600
Subject: [PATCH 220/230] Add CircleCI (#855)

Details:
- This PR adds CircleCI testing in addition to TravisCI and Appveyor.
- All of the same tests as on Travis are run, except that different hardware typically ends up being used (usually Zen on Travis, Xeon Platinum on Circle). This has actually exposed a couple of bugs (see #850 and #852).
- The `travis` directory has been renamed to `ci` as it is now shared.
- Running SDE on CircleCI is a bit problematic because glibc changed how CPUID detection is done. This requires running some architectures with different hardware definition files and forcing a config via `BLIS_ARCH_TYPE`.
---
 .circleci/config.yml                 | 253 +++++++++++++++++++++++++++
 .travis.yml                          |  12 +-
 {travis => ci}/cpuid/excavator.def   |   0
 {travis => ci}/cpuid/haswell.def     |   0
 {travis => ci}/cpuid/penryn.def      |   0
 {travis => ci}/cpuid/piledriver.def  |   0
 {travis => ci}/cpuid/sandybridge.def |   0
 {travis => ci}/cpuid/skx.def         |   0
 {travis => ci}/cpuid/skx1.def        |   0
 {travis => ci}/cpuid/steamroller.def |   0
 {travis => ci}/cpuid/zen.def         |   0
 {travis => ci}/cpuid/zen2.def        |   0
 {travis => ci}/cpuid/zen3.def        |   0
 {travis => ci}/cxx/Makefile          |   0
 {travis => ci}/cxx/cxx-test.cxx      |   0
 {travis => ci}/cxx/cxx-test.sh       |   4 +-
 {travis => ci}/do_riscv.sh           |   0
 {travis => ci}/do_sde.sh             |  35 ++--
 {travis => ci}/do_testsuite.sh       |  15 +-
 travis/patch-ld-so.py                |  16 --
 20 files changed, 293 insertions(+), 42 deletions(-)
 create mode 100644 .circleci/config.yml
 rename {travis => ci}/cpuid/excavator.def (100%)
 rename {travis => ci}/cpuid/haswell.def (100%)
 rename {travis => ci}/cpuid/penryn.def (100%)
 rename {travis => ci}/cpuid/piledriver.def (100%)
 rename {travis => ci}/cpuid/sandybridge.def (100%)
 rename {travis => ci}/cpuid/skx.def (100%)
 rename {travis => ci}/cpuid/skx1.def (100%)
 rename {travis => ci}/cpuid/steamroller.def (100%)
 rename {travis => ci}/cpuid/zen.def (100%)
 rename {travis => ci}/cpuid/zen2.def (100%)
 rename {travis => ci}/cpuid/zen3.def (100%)
 rename {travis => ci}/cxx/Makefile (100%)
 rename {travis => ci}/cxx/cxx-test.cxx (100%)
 rename {travis => ci}/cxx/cxx-test.sh (93%)
 rename {travis => ci}/do_riscv.sh (100%)
 rename {travis => ci}/do_sde.sh (53%)
 rename {travis => ci}/do_testsuite.sh (76%)
 delete mode 100755 travis/patch-ld-so.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 000000000..339fc11cf
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,253 @@
+version: 2.1
+
+branches:
+  only:
+    - master
+    - dev
+    - amd
+
+executors:
+  linux: # Docker using the Base Convenience Image
+    docker:
+      - image: cimg/base:2024.10
+  linuxnew: # Docker using the Base Convenience Image
+    docker:
+      - image: cimg/base:current-22.04
+  macos: &macos-executor # macos executor running Xcode
+    macos:
+      xcode: 14.2.0
+  linuxvm: # executor type
+    machine:
+      image: ubuntu-2204:current
+
+workflows:
+  build:
+    jobs:
+      # Default:
+      # - build:
+      #     os: linux
+      #     CC: gcc
+      #     OOT: 0
+      #     TEST: FAST
+      #     SDE: 0
+      #     THR: none
+      #     CONF: auto
+      #     BLD: ''
+      #     LDFLAGS: ''
+      #     TESTSUITE_WRAPPER: ''
+      #     PACKAGES: ''
+
+      # full testsuite (all tests + mixed datatype (gemm_nn only) + salt + OOT)
+      - build:
+          OOT: 1
+          TEST: ALL
+          SDE: 0
+          CONF: x86_64
+
+      # SDE testing for x86_64
+      - build:
+          # linuxvm must be used because it provides 8G RAM and SDE fails with 4G RAM
+          os: linuxvm
+          OOT: 0
+          TEST: FAST
+          SDE: 1
+          CONF: x86_64
+
+      # openmp build
+      - build:
+          THR: openmp
+
+      # pthreads build
+      - build:
+          THR: pthreads
+
+      # clang build
+      - build:
+          CC: clang
+          CXX: clang++
+          PACKAGES: clang
+
+      # macOS with system compiler (clang)
+      - build:
+          os: macos
+          CC: clang
+          CXX: clang++
+
+      # cortexa15 build and fast testsuite (qemu)
+      - build:
+          CC: arm-linux-gnueabihf-gcc
+          CXX: arm-linux-gnueabihf-g++
+          CONF: cortexa15
+          PACKAGES: 'gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf libc6-dev-armhf-cross qemu-system-arm qemu-user'
+          TESTSUITE_WRAPPER: 'qemu-arm -cpu cortex-a15 -L /usr/arm-linux-gnueabihf/'
+
+      # cortexa57 build and fast testsuite (qemu)
+      - build:
+          CC: aarch64-linux-gnu-gcc
+          CXX: aarch64-linux-gnu-g++
+          CONF: cortexa57
+          PACKAGES: 'gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user'
+          TESTSUITE_WRAPPER: 'qemu-aarch64 -L /usr/aarch64-linux-gnu/'
+
+      # Apple M1 (firestorm) build and fast testsuite (qemu)
+      - build:
+          CC: aarch64-linux-gnu-gcc
+          CXX: aarch64-linux-gnu-g++
+          CONF: firestorm
+          PACKAGES: 'gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user'
+          TESTSUITE_WRAPPER: 'qemu-aarch64 -L /usr/aarch64-linux-gnu/'
+
+      # armsve build and fast testsuite (qemu)
+      - build:
+          CC: aarch64-linux-gnu-gcc-10
+          CXX: aarch64-linux-gnu-g++-10
+          CONF: armsve
+          PACKAGES: 'gcc-10-aarch64-linux-gnu g++-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user'
+          TESTSUITE_WRAPPER: 'qemu-aarch64 -cpu max,sve=true,sve512=true -L /usr/aarch64-linux-gnu/'
+
+      # arm64 build and fast testsuite (qemu)
+      # NOTE: This entry omits the -cpu flag so that while both NEON and SVE kernels
+      # are compiled, only NEON kernels will be tested. (h/t to RuQing Xu)
+      - build:
+          CC: aarch64-linux-gnu-gcc-10
+          CXX: aarch64-linux-gnu-g++-10
+          CONF: arm64
+          PACKAGES: 'gcc-10-aarch64-linux-gnu g++-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user'
+          TESTSUITE_WRAPPER: 'qemu-aarch64 -L /usr/aarch64-linux-gnu/'
+
+      # The RISC-V targets require the qemu version available in jammy or newer.
+      # When CI is upgraded, the packages should be activated and do_riscv.sh
+      # cleaned up.
+      # PACKAGES="qemu-user qemu-user-binfmt"
+      - build:
+          CONF: rv64iv
+          BLD: --disable-shared
+          LDFLAGS: -static
+      - build:
+          CONF: rv32iv
+          BLD: --disable-shared
+          LDFLAGS: -static
+      - build:
+          CONF: sifive_x280
+          BLD: --disable-shared
+          LDFLAGS: -static
+
+jobs:
+  build:
+    parameters:
+      os:
+        type: executor
+        default: linux
+      CC:
+        type: string
+        default: gcc
+      CXX:
+        type: string
+        default: g++
+      OOT:
+        type: integer
+        default: 0
+      TEST:
+        type: string
+        default: FAST
+      SDE:
+        type: integer
+        default: 0
+      THR:
+        type: string
+        default: none
+      CONF:
+        type: string
+        default: auto
+      BLD:
+        type: string
+        default: ''
+      LDFLAGS:
+        type: string
+        default: ''
+      TESTSUITE_WRAPPER:
+        type: string
+        default: ''
+      PACKAGES:
+        type: string
+        default: ''
+    executor: << parameters.os >>
+    steps:
+      - checkout
+
+      - when:
+          condition:
+            not:
+              equal: [ *macos-executor, << parameters.os >> ]
+          steps:
+            - run:
+                name: Installing Dependencies
+                command:
+                  sudo apt-get update && sudo NEEDRESTART_MODE=a apt-get install -y make python3 << parameters.PACKAGES >>
+
+      - run:
+          name: Configuring, Building, Testing
+          command: |
+            export DIST_PATH=.
+            export CC="<< parameters.CC >>"
+            export CXX="<< parameters.CXX >>"
+            export OOT="<< parameters.OOT >>"
+            export CONF="<< parameters.CONF >>"
+            export TEST="<< parameters.TEST >>"
+            export BLD="<< parameters.BLD >>"
+            export LDFLAGS="<< parameters.LDFLAGS >>"
+            export SDE="<< parameters.SDE >>"
+            export THR="<< parameters.THR >>"
+            export TESTSUITE_WRAPPER="<< parameters.TESTSUITE_WRAPPER >>"
+
+            pwd
+            if [ $OOT -eq 1 ]; then export DIST_PATH=`pwd`; mkdir ../oot; cd ../oot; chmod -R a-w $DIST_PATH; fi
+            pwd
+
+            if [ "$CONF" = "rv64iv" ]; then
+              $DIST_PATH/ci/do_riscv.sh "$CONF";
+              export CC=$DIST_PATH/../toolchain/riscv/bin/riscv64-unknown-linux-gnu-gcc;
+              export CXX=$DIST_PATH/../toolchain/riscv/bin/riscv64-unknown-linux-gnu-g++;
+              export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv64 -cpu rv64,vext_spec=v1.0,v=true,vlen=128 -B 0x100000";
+            fi
+            if [ "$CONF" = "rv32iv" ]; then
+              $DIST_PATH/ci/do_riscv.sh "$CONF";
+              export CC=$DIST_PATH/../toolchain/riscv/bin/riscv32-unknown-linux-gnu-gcc;
+              export CXX=$DIST_PATH/../toolchain/riscv/bin/riscv32-unknown-linux-gnu-g++;
+              export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv32 -cpu rv32,vext_spec=v1.0,v=true,vlen=128 -B 0x100000";
+            fi
+            if [ "$CONF" = "sifive_x280" ]; then
+              $DIST_PATH/ci/do_riscv.sh "$CONF";
+              export CC=$DIST_PATH/../toolchain/riscv/bin/clang;
+              export CXX=$DIST_PATH/../toolchain/riscv/bin/clang++;
+              export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv64 -cpu rv64,vext_spec=v1.0,v=true,vlen=512 -B 0x100000";
+            fi
+
+            echo "Configuration:"
+            echo "CC                = $CC"
+            echo "CXX               = $CXX"
+            echo "OOT               = $OOT"
+            echo "CONF              = $CONF"
+            echo "THR               = $THR"
+            echo "TEST              = $TEST"
+            echo "BLD               = $BLD"
+            echo "SDE               = $SDE"
+            echo "DIST_PATH         = $DIST_PATH"
+            echo "LDFLAGS           = $LDFLAGS"
+            echo "TESTSUITE_WRAPPER = $TESTSUITE_WRAPPER"
+
+            $DIST_PATH/configure -p `pwd`/../install -t $THR $BLD CC=$CC $CONF
+            pwd
+            ls -l
+            $CC --version
+            $CC -v
+
+            make -j2
+            make install
+
+            if [ "$BLD" = "" ] && [ "$TESTSUITE_WRAPPER" = "" ] ; then $DIST_PATH/ci/cxx/cxx-test.sh $DIST_PATH $(ls -1 include); fi
+            # Qemu SVE is failing sgemmt in some cases. Skip as this issue is not observed
+            # on real chip (A64fx).
+            if [ "$CONF" = "armsve" ]; then sed -i 's/.*\<gemmt\>.*/0/' $DIST_PATH/testsuite/input.operations.fast; fi
+            if [ "$TEST" != "0" ]; then $DIST_PATH/ci/do_testsuite.sh; fi
+            if [ "$SDE" = "1" ]; then $DIST_PATH/ci/do_sde.sh; fi
diff --git a/.travis.yml b/.travis.yml
index bdfafb6b0..df955764f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -100,19 +100,19 @@ script:
 - if [ $OOT -eq 1 ]; then export DIST_PATH=`pwd`; mkdir ../oot; cd ../oot; chmod -R a-w $DIST_PATH; fi
 - pwd
 - if [ "$CONF" = "rv64iv" ]; then
-    $DIST_PATH/travis/do_riscv.sh "$CONF";
+    $DIST_PATH/ci/do_riscv.sh "$CONF";
     export CC=$DIST_PATH/../toolchain/riscv/bin/riscv64-unknown-linux-gnu-gcc;
     export CXX=$DIST_PATH/../toolchain/riscv/bin/riscv64-unknown-linux-gnu-g++;
     export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv64 -cpu rv64,vext_spec=v1.0,v=true,vlen=128 -B 0x100000";
   fi
 - if [ "$CONF" = "rv32iv" ]; then
-    $DIST_PATH/travis/do_riscv.sh "$CONF";
+    $DIST_PATH/ci/do_riscv.sh "$CONF";
     export CC=$DIST_PATH/../toolchain/riscv/bin/riscv32-unknown-linux-gnu-gcc;
     export CXX=$DIST_PATH/../toolchain/riscv/bin/riscv32-unknown-linux-gnu-g++;
     export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv32 -cpu rv32,vext_spec=v1.0,v=true,vlen=128 -B 0x100000";
   fi
 - if [ "$CONF" = "sifive_x280" ]; then
-    $DIST_PATH/travis/do_riscv.sh "$CONF";
+    $DIST_PATH/ci/do_riscv.sh "$CONF";
     export CC=$DIST_PATH/../toolchain/riscv/bin/clang;
     export CXX=$DIST_PATH/../toolchain/riscv/bin/clang++;
     export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv64 -cpu rv64,vext_spec=v1.0,v=true,vlen=512 -B 0x100000";
@@ -124,9 +124,9 @@ script:
 - $CC -v
 - make -j 2
 - make install
-- if [ "$BLD" = "" ]; then $DIST_PATH/travis/cxx/cxx-test.sh $DIST_PATH $(ls -1 include); fi
+- if [ "$BLD" = "" ]; then $DIST_PATH/ci/cxx/cxx-test.sh $DIST_PATH $(ls -1 include); fi
 # Qemu SVE is failing sgemmt in some cases. Skip as this issue is not observed
 # on real chip (A64fx).
 - if [ "$CONF" = "armsve" ]; then sed -i 's/.*\<gemmt\>.*/0/' $DIST_PATH/testsuite/input.operations.fast; fi
-- if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/travis/do_testsuite.sh; fi
-- if [ "$SDE" = "1" ]; then travis_wait 30 $DIST_PATH/travis/do_sde.sh; fi
+- if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/ci/do_testsuite.sh; fi
+- if [ "$SDE" = "1" ]; then travis_wait 30 $DIST_PATH/ci/do_sde.sh; fi
diff --git a/travis/cpuid/excavator.def b/ci/cpuid/excavator.def
similarity index 100%
rename from travis/cpuid/excavator.def
rename to ci/cpuid/excavator.def
diff --git a/travis/cpuid/haswell.def b/ci/cpuid/haswell.def
similarity index 100%
rename from travis/cpuid/haswell.def
rename to ci/cpuid/haswell.def
diff --git a/travis/cpuid/penryn.def b/ci/cpuid/penryn.def
similarity index 100%
rename from travis/cpuid/penryn.def
rename to ci/cpuid/penryn.def
diff --git a/travis/cpuid/piledriver.def b/ci/cpuid/piledriver.def
similarity index 100%
rename from travis/cpuid/piledriver.def
rename to ci/cpuid/piledriver.def
diff --git a/travis/cpuid/sandybridge.def b/ci/cpuid/sandybridge.def
similarity index 100%
rename from travis/cpuid/sandybridge.def
rename to ci/cpuid/sandybridge.def
diff --git a/travis/cpuid/skx.def b/ci/cpuid/skx.def
similarity index 100%
rename from travis/cpuid/skx.def
rename to ci/cpuid/skx.def
diff --git a/travis/cpuid/skx1.def b/ci/cpuid/skx1.def
similarity index 100%
rename from travis/cpuid/skx1.def
rename to ci/cpuid/skx1.def
diff --git a/travis/cpuid/steamroller.def b/ci/cpuid/steamroller.def
similarity index 100%
rename from travis/cpuid/steamroller.def
rename to ci/cpuid/steamroller.def
diff --git a/travis/cpuid/zen.def b/ci/cpuid/zen.def
similarity index 100%
rename from travis/cpuid/zen.def
rename to ci/cpuid/zen.def
diff --git a/travis/cpuid/zen2.def b/ci/cpuid/zen2.def
similarity index 100%
rename from travis/cpuid/zen2.def
rename to ci/cpuid/zen2.def
diff --git a/travis/cpuid/zen3.def b/ci/cpuid/zen3.def
similarity index 100%
rename from travis/cpuid/zen3.def
rename to ci/cpuid/zen3.def
diff --git a/travis/cxx/Makefile b/ci/cxx/Makefile
similarity index 100%
rename from travis/cxx/Makefile
rename to ci/cxx/Makefile
diff --git a/travis/cxx/cxx-test.cxx b/ci/cxx/cxx-test.cxx
similarity index 100%
rename from travis/cxx/cxx-test.cxx
rename to ci/cxx/cxx-test.cxx
diff --git a/travis/cxx/cxx-test.sh b/ci/cxx/cxx-test.sh
similarity index 93%
rename from travis/cxx/cxx-test.sh
rename to ci/cxx/cxx-test.sh
index c0036611f..52402867d 100755
--- a/travis/cxx/cxx-test.sh
+++ b/ci/cxx/cxx-test.sh
@@ -50,9 +50,9 @@ if [ ! -e $INCLUDE_DIR/blis.h ]; then
     exit 1
 fi
 
-if [ ! -e $SOURCE_DIR/travis/cxx/Makefile ]; then
+if [ ! -e $SOURCE_DIR/ci/cxx/Makefile ]; then
     echo "could not find cxx-test Makefile"
     exit 1
 fi
 
-make -C $SOURCE_DIR/travis/cxx INCLUDE_DIR=$INCLUDE_DIR LIB_DIR=$LIB_DIR BUILD_DIR=$BUILD_DIR
+make -C $SOURCE_DIR/ci/cxx INCLUDE_DIR=$INCLUDE_DIR LIB_DIR=$LIB_DIR BUILD_DIR=$BUILD_DIR
diff --git a/travis/do_riscv.sh b/ci/do_riscv.sh
similarity index 100%
rename from travis/do_riscv.sh
rename to ci/do_riscv.sh
diff --git a/travis/do_sde.sh b/ci/do_sde.sh
similarity index 53%
rename from travis/do_sde.sh
rename to ci/do_sde.sh
index 4f0447778..7f8a927f6 100755
--- a/travis/do_sde.sh
+++ b/ci/do_sde.sh
@@ -31,23 +31,32 @@ tar xvf $SDE_TARBALL
 
 make -j2 testsuite-bin blastest-bin
 
-TMP=`ldd ./test_libblis.x | grep ld | sed 's/^.*=> //'`
-LD_SO=${TMP%% *}
-TMP=`ldd ./test_libblis.x | grep libc | sed 's/^.*=> //'`
-LIBC_SO=${TMP%% *}
-TMP=`ldd ./test_libblis.x | grep libm | sed 's/^.*=> //'`
-LIBM_SO=${TMP%% *}
-for LIB in $LD_SO $LIBC_SO $LIBM_SO; do
-    $DIST_PATH/travis/patch-ld-so.py $LIB .tmp
-    chmod a+x .tmp
-    sudo mv .tmp $LIB
-done
-
 for ARCH in penryn sandybridge haswell skx knl piledriver steamroller excavator zen; do
+    export BLIS_ARCH_TYPE=-1
+
     if [ "$ARCH" = "knl" ]; then
         TESTSUITE_WRAPPER="$SDE -knl --"
+    elif [ "$ARCH" = "sandybridge" ]; then
+        # The sandybridge.def file causes a segfault in SDE on some systems.
+        # Instead, use the CPUID values for haswell, but force BLIS to use the
+        # sandybridge configuration.
+        TESTSUITE_WRAPPER="$SDE -cpuid_in $DIST_PATH/ci/cpuid/haswell.def --"
+        export BLIS_ARCH_TYPE=4
+    elif [ "$ARCH" = "piledriver" ]; then
+        # We used to "patch" ld.so and libm to remove CPUID checks so that glibc
+        # wouldn't try to use instructions not supported by SDE (FMA4). That no
+        # longer works, so test Piledriver/Steamroller/Excavator as haswell
+        # but with the configuration forced via environment variable.
+        TESTSUITE_WRAPPER="$SDE -cpuid_in $DIST_PATH/ci/cpuid/haswell.def --"
+        export BLIS_ARCH_TYPE=11
+    elif [ "$ARCH" = "steamroller" ]; then
+        TESTSUITE_WRAPPER="$SDE -cpuid_in $DIST_PATH/ci/cpuid/haswell.def --"
+        export BLIS_ARCH_TYPE=10
+    elif [ "$ARCH" = "excavator" ]; then
+        TESTSUITE_WRAPPER="$SDE -cpuid_in $DIST_PATH/ci/cpuid/haswell.def --"
+        export BLIS_ARCH_TYPE=9
     else
-        TESTSUITE_WRAPPER="$SDE -cpuid_in $DIST_PATH/travis/cpuid/$ARCH.def --"
+        TESTSUITE_WRAPPER="$SDE -cpuid_in $DIST_PATH/ci/cpuid/$ARCH.def --"
     fi
 
     make TESTSUITE_WRAPPER="$TESTSUITE_WRAPPER" check
diff --git a/travis/do_testsuite.sh b/ci/do_testsuite.sh
similarity index 76%
rename from travis/do_testsuite.sh
rename to ci/do_testsuite.sh
index c21df3a32..aa72d8051 100755
--- a/travis/do_testsuite.sh
+++ b/ci/do_testsuite.sh
@@ -9,27 +9,32 @@ export BLIS_JR_NT=1
 export BLIS_IR_NT=1
 
 if [ "$TEST" = "FAST" -o "$TEST" = "ALL" ]; then
-	make testblis-fast || cat ./output.testsuite
+	make testblis-fast
+	cat ./output.testsuite
 	$DIST_PATH/testsuite/check-blistest.sh ./output.testsuite
 fi
 
 if [ "$TEST" = "MD" -o "$TEST" = "ALL" ]; then
-	make testblis-md || cat ./output.testsuite
+	make testblis-md
+	cat ./output.testsuite
 	$DIST_PATH/testsuite/check-blistest.sh ./output.testsuite
 fi
 
 if [ "$TEST" = "SALT" -o "$TEST" = "ALL" ]; then
 	# Disable multithreading within BLIS.
 	export BLIS_JC_NT=1 BLIS_IC_NT=1 BLIS_JR_NT=1 BLIS_IR_NT=1
-	make testblis-salt || cat ./output.testsuite
+	make testblis-salt
+	cat ./output.testsuite
 	$DIST_PATH/testsuite/check-blistest.sh ./output.testsuite
 fi
 
 if [ "$TEST" = "1" -o "$TEST" = "ALL" ]; then
-	make testblis || cat ./output.testsuite
+	make testblis
+	cat ./output.testsuite
 	$DIST_PATH/testsuite/check-blistest.sh ./output.testsuite
 fi
 
-make testblas || cat ./output.testsuite
+make testblas
+cat ./output.testsuite
 $DIST_PATH/blastest/check-blastest.sh
 
diff --git a/travis/patch-ld-so.py b/travis/patch-ld-so.py
deleted file mode 100755
index 72e580d74..000000000
--- a/travis/patch-ld-so.py
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env python
-
-#
-# Patch ld.so to disable runtime CPUID detection
-# Taken from https://stackoverflow.com/a/44483482
-#
-
-import re
-import sys
-
-infile, outfile = sys.argv[1:]
-d = open(infile, 'rb').read()
-# Match CPUID(eax=0), "xor eax,eax" followed closely by "cpuid"
-o = re.sub(b'(\x31\xc0.{0,32})\x0f\xa2', b'\\1\x66\x90', d)
-#assert d != o
-open(outfile, 'wb').write(o)

From 14047f62d1fc746cbabe112197cfc1afe526a82a Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Wed, 19 Feb 2025 13:31:19 -0600
Subject: [PATCH 221/230] Do not use symbol aliases on macOS. (#856)

Details:
- The BLAS/CBLAS function `?gemmtr` is currently implemented as a symbol alias of the already-existing `?gemmt`. This does not work on macOS/Darwin.
- Instead, use a minimal wrapper function which calls the appropriate existing BLAS/CBLAS function.
- Also clean up the CBLAS prototypes a bit.
---
 frame/compat/cblas/src/cblas.h              | 40 ++++++++----
 frame/compat/cblas/src/extra/cblas_cgemmt.c | 35 +++++++----
 frame/compat/cblas/src/extra/cblas_dgemmt.c | 35 +++++++----
 frame/compat/cblas/src/extra/cblas_sgemmt.c | 35 +++++++----
 frame/compat/cblas/src/extra/cblas_zgemmt.c | 35 +++++++----
 frame/compat/extra/bla_gemmt.c              | 69 ++++++++++++++++++---
 6 files changed, 181 insertions(+), 68 deletions(-)

diff --git a/frame/compat/cblas/src/cblas.h b/frame/compat/cblas/src/cblas.h
index 32ee4a649..0a29c9150 100644
--- a/frame/compat/cblas/src/cblas.h
+++ b/frame/compat/cblas/src/cblas.h
@@ -87,7 +87,7 @@ BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void   *X, f77_int incX);
  * ===========================================================================
  */
 
-/* 
+/*
  * Routines with standard 4 prefixes (s, d, c, z)
  */
 void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX,
@@ -119,7 +119,7 @@ void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X,
                  f77_int incX, void *Y, f77_int incY);
 
 
-/* 
+/*
  * Routines with S and D prefix only
  */
 void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s);
@@ -137,7 +137,7 @@ void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX,
                 double *Y, f77_int incY, const double *P);
 
 
-/* 
+/*
  * Routines with S D C Z CS and ZD prefixes
  */
 void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX);
@@ -153,7 +153,7 @@ void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int inc
  * ===========================================================================
  */
 
-/* 
+/*
  * Routines with standard 4 prefixes (S, D, C, Z)
  */
 void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order,
@@ -289,7 +289,7 @@ void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
                  f77_int N, const void *Ap, void *X, f77_int incX);
 
 
-/* 
+/*
  * Routines with S and D prefixes only
  */
 void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
@@ -351,7 +351,7 @@ void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
                 f77_int incX, const double *Y, f77_int incY, double *A);
 
 
-/* 
+/*
  * Routines with C and Z prefixes only
  */
 void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
@@ -422,7 +422,7 @@ void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
  * ===========================================================================
  */
 
-/* 
+/*
  * Routines with standard 4 prefixes (S, D, C, Z)
  */
 void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
@@ -546,7 +546,7 @@ void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
                  void *B, f77_int ldb);
 
 
-/* 
+/*
  * Routines with prefixes C and Z only
  */
 void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
@@ -608,25 +608,41 @@ void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  f77_int N, f77_int K, float alpha, const float *A,
                  f77_int lda, const float *B, f77_int ldb,
                  float beta, float *C, f77_int ldc);
-void BLIS_EXPORT_BLAS cblas_sgemmtr();  // alias to cblas_sgemmt
+void BLIS_EXPORT_BLAS cblas_sgemmtr(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                 enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
+                 f77_int N, f77_int K, float alpha, const float *A,
+                 f77_int lda, const float *B, f77_int ldb,
+                 float beta, float *C, f77_int ldc);
 void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                  f77_int N, f77_int K, double alpha, const double *A,
                  f77_int lda, const double *B, f77_int ldb,
                  double beta, double *C, f77_int ldc);
-void BLIS_EXPORT_BLAS cblas_dgemmtr();  // alias to cblas_dgemmt
+void BLIS_EXPORT_BLAS cblas_dgemmtr(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                 enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
+                 f77_int N, f77_int K, double alpha, const double *A,
+                 f77_int lda, const double *B, f77_int ldb,
+                 double beta, double *C, f77_int ldc);
 void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                  f77_int N, f77_int K, const void *alpha, const void *A,
                  f77_int lda, const void *B, f77_int ldb,
                  const void *beta, void *C, f77_int ldc);
-void BLIS_EXPORT_BLAS cblas_cgemmtr();  // alias to cblas_cgemmt
+void BLIS_EXPORT_BLAS cblas_cgemmtr(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                 enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
+                 f77_int N, f77_int K, const void *alpha, const void *A,
+                 f77_int lda, const void *B, f77_int ldb,
+                 const void *beta, void *C, f77_int ldc);
 void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                  f77_int N, f77_int K, const void *alpha, const void *A,
                  f77_int lda, const void *B, f77_int ldb,
                  const void *beta, void *C, f77_int ldc);
-void BLIS_EXPORT_BLAS cblas_zgemmtr();  // alias to cblas_zgemmt
+void BLIS_EXPORT_BLAS cblas_zgemmtr(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                 enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
+                 f77_int N, f77_int K, const void *alpha, const void *A,
+                 f77_int lda, const void *B, f77_int ldb,
+                 const void *beta, void *C, f77_int ldc);
 
 // -- Batch APIs --
 
diff --git a/frame/compat/cblas/src/extra/cblas_cgemmt.c b/frame/compat/cblas/src/extra/cblas_cgemmt.c
index 2dabac6d8..eeb3c457f 100644
--- a/frame/compat/cblas/src/extra/cblas_cgemmt.c
+++ b/frame/compat/cblas/src/extra/cblas_cgemmt.c
@@ -48,13 +48,13 @@ void cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                   f77_int lda, const void  *B, f77_int ldb,
                   const void *beta, void  *C, f77_int ldc)
 {
-   char UL, TA, TB;   
+   char UL, TA, TB;
 #ifdef F77_CHAR
    F77_CHAR F77_UL, F77_TA, F77_TB;
 #else
-   #define F77_UL &UL  
-   #define F77_TA &TA  
-   #define F77_TB &TB  
+   #define F77_UL &UL
+   #define F77_TA &TA
+   #define F77_TB &TB
 #endif
 
 #ifdef F77_INT
@@ -78,7 +78,7 @@ void cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
 
       if( Uplo == CblasUpper) UL='U';
       else if ( Uplo == CblasLower ) UL='L';
-      else 
+      else
       {
          cblas_xerbla(2, "cblas_cgemmt","Illegal Uplo setting, %d\n", Uplo);
          CBLAS_CallFromC = 0;
@@ -89,7 +89,7 @@ void cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       if(TransA == CblasTrans) TA='T';
       else if ( TransA == CblasConjTrans ) TA='C';
       else if ( TransA == CblasNoTrans )   TA='N';
-      else 
+      else
       {
          cblas_xerbla(3, "cblas_cgemmt","Illegal TransA setting, %d\n", TransA);
          CBLAS_CallFromC = 0;
@@ -100,7 +100,7 @@ void cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       if(TransB == CblasTrans) TB='T';
       else if ( TransB == CblasConjTrans ) TB='C';
       else if ( TransB == CblasNoTrans )   TB='N';
-      else 
+      else
       {
          cblas_xerbla(4, "cblas_cgemmt","Illegal TransB setting, %d\n", TransB);
          CBLAS_CallFromC = 0;
@@ -121,7 +121,7 @@ void cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       RowMajorStrg = 1;
       if( Uplo == CblasUpper) UL='L';
       else if ( Uplo == CblasLower ) UL='U';
-      else 
+      else
       {
          cblas_xerbla(2, "cblas_cgemmt","Illegal Uplo setting, %d\n", Uplo);
          CBLAS_CallFromC = 0;
@@ -132,7 +132,7 @@ void cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       if(TransA == CblasTrans) TB='T';
       else if ( TransA == CblasConjTrans ) TB='C';
       else if ( TransA == CblasNoTrans )   TB='N';
-      else 
+      else
       {
          cblas_xerbla(3, "cblas_cgemmt","Illegal TransA setting, %d\n", TransA);
          CBLAS_CallFromC = 0;
@@ -142,7 +142,7 @@ void cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       if(TransB == CblasTrans) TA='T';
       else if ( TransB == CblasConjTrans ) TA='C';
       else if ( TransB == CblasNoTrans )   TA='N';
-      else 
+      else
       {
          cblas_xerbla(4, "cblas_cgemmt","Illegal TransB setting, %d\n", TransB);
          CBLAS_CallFromC = 0;
@@ -157,12 +157,23 @@ void cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
 
       F77_cgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, (scomplex*)alpha, (scomplex*)B,
                   &F77_ldb, (scomplex*)A, &F77_lda, (scomplex*)beta, (scomplex*)C, &F77_ldc);
-   } 
+   }
    else  cblas_xerbla(1, "cblas_cgemmt", "Illegal Order setting, %d\n", Order);
    CBLAS_CallFromC = 0;
    RowMajorStrg = 0;
    return;
 }
 
-void cblas_cgemmtr() __attribute__((alias("cblas_cgemmt")));
+void cblas_cgemmtr(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                  enum CBLAS_TRANSPOSE TransA,
+                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K,
+                  const void* alpha, const void  *A,
+                  f77_int lda, const void  *B, f77_int ldb,
+                  const void* beta, void  *C, f77_int ldc)
+#ifdef BLIS_OS_OSX
+{ cblas_cgemmt(Order, Uplo, TransA, TransB, M, K, alpha, A, lda, B, ldb, beta, C, ldc); }
+#else
+__attribute__((alias("cblas_cgemmt")));
+#endif
+
 #endif
diff --git a/frame/compat/cblas/src/extra/cblas_dgemmt.c b/frame/compat/cblas/src/extra/cblas_dgemmt.c
index 74e12c618..b55ecfbf4 100644
--- a/frame/compat/cblas/src/extra/cblas_dgemmt.c
+++ b/frame/compat/cblas/src/extra/cblas_dgemmt.c
@@ -48,13 +48,13 @@ void cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                   f77_int lda, const double  *B, f77_int ldb,
                   double beta, double  *C, f77_int ldc)
 {
-   char UL, TA, TB;   
+   char UL, TA, TB;
 #ifdef F77_CHAR
    F77_CHAR F77_UL, F77_TA, F77_TB;
 #else
-   #define F77_UL &UL  
-   #define F77_TA &TA  
-   #define F77_TB &TB  
+   #define F77_UL &UL
+   #define F77_TA &TA
+   #define F77_TB &TB
 #endif
 
 #ifdef F77_INT
@@ -78,7 +78,7 @@ void cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
 
       if( Uplo == CblasUpper) UL='U';
       else if ( Uplo == CblasLower ) UL='L';
-      else 
+      else
       {
          cblas_xerbla(2, "cblas_dgemmt","Illegal Uplo setting, %d\n", Uplo);
          CBLAS_CallFromC = 0;
@@ -89,7 +89,7 @@ void cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       if(TransA == CblasTrans) TA='T';
       else if ( TransA == CblasConjTrans ) TA='C';
       else if ( TransA == CblasNoTrans )   TA='N';
-      else 
+      else
       {
          cblas_xerbla(3, "cblas_dgemmt","Illegal TransA setting, %d\n", TransA);
          CBLAS_CallFromC = 0;
@@ -100,7 +100,7 @@ void cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       if(TransB == CblasTrans) TB='T';
       else if ( TransB == CblasConjTrans ) TB='C';
       else if ( TransB == CblasNoTrans )   TB='N';
-      else 
+      else
       {
          cblas_xerbla(4, "cblas_dgemmt","Illegal TransB setting, %d\n", TransB);
          CBLAS_CallFromC = 0;
@@ -121,7 +121,7 @@ void cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       RowMajorStrg = 1;
       if( Uplo == CblasUpper) UL='L';
       else if ( Uplo == CblasLower ) UL='U';
-      else 
+      else
       {
          cblas_xerbla(2, "cblas_dgemmt","Illegal Uplo setting, %d\n", Uplo);
          CBLAS_CallFromC = 0;
@@ -132,7 +132,7 @@ void cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       if(TransA == CblasTrans) TB='T';
       else if ( TransA == CblasConjTrans ) TB='C';
       else if ( TransA == CblasNoTrans )   TB='N';
-      else 
+      else
       {
          cblas_xerbla(3, "cblas_dgemmt","Illegal TransA setting, %d\n", TransA);
          CBLAS_CallFromC = 0;
@@ -142,7 +142,7 @@ void cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       if(TransB == CblasTrans) TA='T';
       else if ( TransB == CblasConjTrans ) TA='C';
       else if ( TransB == CblasNoTrans )   TA='N';
-      else 
+      else
       {
          cblas_xerbla(4, "cblas_dgemmt","Illegal TransB setting, %d\n", TransB);
          CBLAS_CallFromC = 0;
@@ -157,12 +157,23 @@ void cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
 
       F77_dgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha, B,
                   &F77_ldb, A, &F77_lda, &beta, C, &F77_ldc);
-   } 
+   }
    else  cblas_xerbla(1, "cblas_dgemmt", "Illegal Order setting, %d\n", Order);
    CBLAS_CallFromC = 0;
    RowMajorStrg = 0;
    return;
 }
 
-void cblas_dgemmtr() __attribute__((alias("cblas_dgemmt")));
+void cblas_dgemmtr(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                  enum CBLAS_TRANSPOSE TransA,
+                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K,
+                  double alpha, const double  *A,
+                  f77_int lda, const double  *B, f77_int ldb,
+                  double beta, double  *C, f77_int ldc)
+#ifdef BLIS_OS_OSX
+{ cblas_dgemmt(Order, Uplo, TransA, TransB, M, K, alpha, A, lda, B, ldb, beta, C, ldc); }
+#else
+__attribute__((alias("cblas_dgemmt")));
+#endif
+
 #endif
diff --git a/frame/compat/cblas/src/extra/cblas_sgemmt.c b/frame/compat/cblas/src/extra/cblas_sgemmt.c
index ed3a75e6e..9ab59d7c7 100644
--- a/frame/compat/cblas/src/extra/cblas_sgemmt.c
+++ b/frame/compat/cblas/src/extra/cblas_sgemmt.c
@@ -48,13 +48,13 @@ void cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                   f77_int lda, const float  *B, f77_int ldb,
                   float beta, float  *C, f77_int ldc)
 {
-   char UL, TA, TB;   
+   char UL, TA, TB;
 #ifdef F77_CHAR
    F77_CHAR F77_UL, F77_TA, F77_TB;
 #else
-   #define F77_UL &UL  
-   #define F77_TA &TA  
-   #define F77_TB &TB  
+   #define F77_UL &UL
+   #define F77_TA &TA
+   #define F77_TB &TB
 #endif
 
 #ifdef F77_INT
@@ -78,7 +78,7 @@ void cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
 
       if( Uplo == CblasUpper) UL='U';
       else if ( Uplo == CblasLower ) UL='L';
-      else 
+      else
       {
          cblas_xerbla(2, "cblas_sgemmt","Illegal Uplo setting, %d\n", Uplo);
          CBLAS_CallFromC = 0;
@@ -89,7 +89,7 @@ void cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       if(TransA == CblasTrans) TA='T';
       else if ( TransA == CblasConjTrans ) TA='C';
       else if ( TransA == CblasNoTrans )   TA='N';
-      else 
+      else
       {
          cblas_xerbla(3, "cblas_sgemmt","Illegal TransA setting, %d\n", TransA);
          CBLAS_CallFromC = 0;
@@ -100,7 +100,7 @@ void cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       if(TransB == CblasTrans) TB='T';
       else if ( TransB == CblasConjTrans ) TB='C';
       else if ( TransB == CblasNoTrans )   TB='N';
-      else 
+      else
       {
          cblas_xerbla(4, "cblas_sgemmt","Illegal TransB setting, %d\n", TransB);
          CBLAS_CallFromC = 0;
@@ -121,7 +121,7 @@ void cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       RowMajorStrg = 1;
       if( Uplo == CblasUpper) UL='L';
       else if ( Uplo == CblasLower ) UL='U';
-      else 
+      else
       {
          cblas_xerbla(2, "cblas_sgemmt","Illegal Uplo setting, %d\n", Uplo);
          CBLAS_CallFromC = 0;
@@ -132,7 +132,7 @@ void cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       if(TransA == CblasTrans) TB='T';
       else if ( TransA == CblasConjTrans ) TB='C';
       else if ( TransA == CblasNoTrans )   TB='N';
-      else 
+      else
       {
          cblas_xerbla(3, "cblas_sgemmt","Illegal TransA setting, %d\n", TransA);
          CBLAS_CallFromC = 0;
@@ -142,7 +142,7 @@ void cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       if(TransB == CblasTrans) TA='T';
       else if ( TransB == CblasConjTrans ) TA='C';
       else if ( TransB == CblasNoTrans )   TA='N';
-      else 
+      else
       {
          cblas_xerbla(4, "cblas_sgemmt","Illegal TransB setting, %d\n", TransB);
          CBLAS_CallFromC = 0;
@@ -157,12 +157,23 @@ void cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
 
       F77_sgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha, B,
                   &F77_ldb, A, &F77_lda, &beta, C, &F77_ldc);
-   } 
+   }
    else  cblas_xerbla(1, "cblas_sgemmt", "Illegal Order setting, %d\n", Order);
    CBLAS_CallFromC = 0;
    RowMajorStrg = 0;
    return;
 }
 
-void cblas_sgemmtr() __attribute__((alias("cblas_sgemmt")));
+void cblas_sgemmtr(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                  enum CBLAS_TRANSPOSE TransA,
+                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K,
+                  float alpha, const float  *A,
+                  f77_int lda, const float  *B, f77_int ldb,
+                  float beta, float  *C, f77_int ldc)
+#ifdef BLIS_OS_OSX
+{ cblas_sgemmt(Order, Uplo, TransA, TransB, M, K, alpha, A, lda, B, ldb, beta, C, ldc); }
+#else
+__attribute__((alias("cblas_sgemmt")));
+#endif
+
 #endif
diff --git a/frame/compat/cblas/src/extra/cblas_zgemmt.c b/frame/compat/cblas/src/extra/cblas_zgemmt.c
index 2b6e5ff68..add97d2e3 100644
--- a/frame/compat/cblas/src/extra/cblas_zgemmt.c
+++ b/frame/compat/cblas/src/extra/cblas_zgemmt.c
@@ -48,13 +48,13 @@ void cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                   f77_int lda, const void  *B, f77_int ldb,
                   const void *beta, void  *C, f77_int ldc)
 {
-   char UL, TA, TB;   
+   char UL, TA, TB;
 #ifdef F77_CHAR
    F77_CHAR F77_UL, F77_TA, F77_TB;
 #else
-   #define F77_UL &UL  
-   #define F77_TA &TA  
-   #define F77_TB &TB  
+   #define F77_UL &UL
+   #define F77_TA &TA
+   #define F77_TB &TB
 #endif
 
 #ifdef F77_INT
@@ -78,7 +78,7 @@ void cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
 
       if( Uplo == CblasUpper) UL='U';
       else if ( Uplo == CblasLower ) UL='L';
-      else 
+      else
       {
          cblas_xerbla(2, "cblas_zgemmt","Illegal Uplo setting, %d\n", Uplo);
          CBLAS_CallFromC = 0;
@@ -89,7 +89,7 @@ void cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       if(TransA == CblasTrans) TA='T';
       else if ( TransA == CblasConjTrans ) TA='C';
       else if ( TransA == CblasNoTrans )   TA='N';
-      else 
+      else
       {
          cblas_xerbla(3, "cblas_zgemmt","Illegal TransA setting, %d\n", TransA);
          CBLAS_CallFromC = 0;
@@ -100,7 +100,7 @@ void cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       if(TransB == CblasTrans) TB='T';
       else if ( TransB == CblasConjTrans ) TB='C';
       else if ( TransB == CblasNoTrans )   TB='N';
-      else 
+      else
       {
          cblas_xerbla(4, "cblas_zgemmt","Illegal TransB setting, %d\n", TransB);
          CBLAS_CallFromC = 0;
@@ -121,7 +121,7 @@ void cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       RowMajorStrg = 1;
       if( Uplo == CblasUpper) UL='L';
       else if ( Uplo == CblasLower ) UL='U';
-      else 
+      else
       {
          cblas_xerbla(2, "cblas_zgemmt","Illegal Uplo setting, %d\n", Uplo);
          CBLAS_CallFromC = 0;
@@ -132,7 +132,7 @@ void cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       if(TransA == CblasTrans) TB='T';
       else if ( TransA == CblasConjTrans ) TB='C';
       else if ( TransA == CblasNoTrans )   TB='N';
-      else 
+      else
       {
          cblas_xerbla(3, "cblas_zgemmt","Illegal TransA setting, %d\n", TransA);
          CBLAS_CallFromC = 0;
@@ -142,7 +142,7 @@ void cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
       if(TransB == CblasTrans) TA='T';
       else if ( TransB == CblasConjTrans ) TA='C';
       else if ( TransB == CblasNoTrans )   TA='N';
-      else 
+      else
       {
          cblas_xerbla(4, "cblas_zgemmt","Illegal TransB setting, %d\n", TransB);
          CBLAS_CallFromC = 0;
@@ -157,12 +157,23 @@ void cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
 
       F77_zgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, (dcomplex*)alpha, (dcomplex*)B,
                   &F77_ldb, (dcomplex*)A, &F77_lda, (dcomplex*)beta, (dcomplex*)C, &F77_ldc);
-   } 
+   }
    else  cblas_xerbla(1, "cblas_zgemmt", "Illegal Order setting, %d\n", Order);
    CBLAS_CallFromC = 0;
    RowMajorStrg = 0;
    return;
 }
 
-void cblas_zgemmtr() __attribute__((alias("cblas_zgemmt")));
+void cblas_zgemmtr(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                  enum CBLAS_TRANSPOSE TransA,
+                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K,
+                  const void* alpha, const void  *A,
+                  f77_int lda, const void  *B, f77_int ldb,
+                  const void* beta, void  *C, f77_int ldc)
+#ifdef BLIS_OS_OSX
+{ cblas_zgemmt(Order, Uplo, TransA, TransB, M, K, alpha, A, lda, B, ldb, beta, C, ldc); }
+#else
+__attribute__((alias("cblas_zgemmt")));
+#endif
+
 #endif
diff --git a/frame/compat/extra/bla_gemmt.c b/frame/compat/extra/bla_gemmt.c
index 73bf25cda..f293b0acf 100644
--- a/frame/compat/extra/bla_gemmt.c
+++ b/frame/compat/extra/bla_gemmt.c
@@ -39,8 +39,6 @@
 //
 // Define BLAS-to-BLIS interfaces.
 //
-#define STRINGIFY( name ) #name
-#define EXPAND_AND_STRINGIFY( name ) STRINGIFY( name )
 
 #ifdef BLIS_BLAS3_CALLS_TAPI
 
@@ -120,9 +118,7 @@ void PASTEF77(ch,blasname) \
 \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
-}; \
-void PASTEF77 (ch, blasname ## r )() \
-	__attribute__ ((alias(EXPAND_AND_STRINGIFY(PASTEF77(ch,blasname)))));
+};
 
 #else
 
@@ -225,9 +221,66 @@ void PASTEF77(ch,blasname) \
 \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
-}; \
-void PASTEF77 (ch, blasname ## r )() \
-	__attribute__ ((alias(EXPAND_AND_STRINGIFY(PASTEF77(ch,blasname)))));
+};
+
+#endif
+
+#ifdef BLIS_ENABLE_BLAS
+INSERT_GENTFUNC_BLAS( gemmt, gemmt )
+#endif
+
+#ifdef BLIS_OS_OSX
+
+#undef  GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname ) \
+\
+void PASTEF77(ch,blasname##r) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	PASTEF77(ch,blasname) \
+	( \
+	  uploc, \
+	  transa, \
+	  transb, \
+	  m, \
+	  k, \
+	  alpha, \
+	  a, lda, \
+	  b, ldb, \
+	  beta, \
+	  c, ldc  \
+	); \
+}
+
+#else
+
+#undef  GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname ) \
+\
+void PASTEF77(ch,blasname##r) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) __attribute__ ((alias(STRINGIFY_INT(PASTEF77(ch,blasname)))));
 
 #endif
 

From 3c71737e426f8d567f1324b82609b4a61db670f8 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Wed, 19 Feb 2025 13:53:39 -0600
Subject: [PATCH 222/230] Update README.md

Details:
- Add status badge for CircleCI.
- [ci skip]
---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 676bd6951..27900ac06 100644
--- a/README.md
+++ b/README.md
@@ -5,8 +5,9 @@ _Recipient of the **[2020 SIAM Activity Group on Supercomputing Best Paper Prize
 
 ![The BLIS cat is sleeping.](http://www.cs.utexas.edu/users/field/blis_cat.png)
 
-[![Build Status](https://api.travis-ci.com/flame/blis.svg?branch=master)](https://app.travis-ci.com/github/flame/blis)
-[![Build Status](https://ci.appveyor.com/api/projects/status/github/flame/blis?branch=master&svg=true)](https://ci.appveyor.com/project/shpc/blis/branch/master)
+[![Build Status (CircleCI)](https://dl.circleci.com/status-badge/img/gh/flame/blis/tree/master.svg?style=svg)](https://dl.circleci.com/status-badge/redirect/gh/flame/blis/tree/master)
+[![Build Status (TravisCI)](https://api.travis-ci.com/flame/blis.svg?branch=master)](https://app.travis-ci.com/github/flame/blis)
+[![Build Status (Appveyor)](https://ci.appveyor.com/api/projects/status/github/flame/blis?branch=master&svg=true)](https://ci.appveyor.com/project/shpc/blis/branch/master)
 
 [<img alt="Discord logo" title="Join us on Discord!" height="32px" src="docs/images/discord.svg" />](docs/Discord.md)
 

From a014a08189d05f45752f7ac23d8d42a24536fb93 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Thu, 27 Feb 2025 13:48:17 -0600
Subject: [PATCH 223/230] Add new level-0 macro layer. (#830)

Details:
- Developed by @fgvanzee and @devinamatthews.
- Level-0 scalar macros have moved from a name-based system (e.g. `bli_dcopys( ... )`) to a macro argument-based system (`bli_tcopys( d,d, ... )`).
- All macros are explicitly mixed-type.
- All input and output operands can have a distinct type (precision and/or domain). Unnecessary computations and spurious NaN/Inf propagation are avoided in mixed-domain cases.
- All macros which do math (i.e. not copy/set/etc.) take an additional argument specifying the computational precision.
- Tile-level macros, 1m, broadcast-B, and other extensions are also included.
- All macros should correctly handle aliasing of input and output operands (this needs to be rigorously checked).
- The macros work generically over the defined types -- new types only need limited support (primarily conversion to other types and basic math).
- For code outside of core BLIS (optimized kernels, sandboxes, etc.), a selection of legacy macros have been added which translate to the new level-0 macros. Behavior is unchanged.
- A standalone, templated C++ testsuite for the level-0 macros has been added. It is currently included as part of the CircleCI tests.
- Const-correctness of level-0 macros is also checked.
---
 .circleci/config.yml                          |  13 +-
 blastest/src/dblat1.c                         |   2 +-
 build/libblis-symbols.def                     |  20 +-
 build/plugin/my_kernel_1_ref.c                |   2 +-
 build/plugin/my_kernel_2_ref.c                |   4 +-
 ci/do_level0.sh                               |  16 +
 common.mk                                     |   9 +-
 .../kernels/1/bli_axpyv_template_noopt_var1.c |  14 +-
 .../kernels/1/bli_dotv_template_noopt_var1.c  |  20 +-
 .../1f/bli_axpy2v_template_noopt_var1.c       |  48 +-
 .../1f/bli_axpyf_template_noopt_var1.c        |  20 +-
 .../1f/bli_dotaxpyv_template_noopt_var1.c     |  56 +-
 .../1f/bli_dotxaxpyf_template_noopt_var1.c    |  64 +-
 .../1f/bli_dotxf_template_noopt_var1.c        |  20 +-
 .../kernels/3/bli_gemm_template_noopt_mxn.c   |  14 +-
 .../kernels/3/bli_trsm_l_template_noopt_mxn.c |  10 +-
 .../kernels/3/bli_trsm_u_template_noopt_mxn.c |  10 +-
 frame/0/bli_l0_tapi.c                         |  40 +-
 frame/0/copysc/bli_copysc.c                   |   4 +-
 frame/1d/bli_l1d_tapi.c                       |   2 +-
 frame/1m/bli_l1m_tapi.c                       |   8 +-
 frame/1m/bli_l1m_unb_var1.c                   |  12 +-
 frame/1m/packm/bli_packm_struc_cxk.c          |   4 +-
 frame/2/bli_l2_tapi.c                         |  16 +-
 frame/2/gemv/bli_gemv_unb_var2.c              |   6 +-
 frame/2/gemv/bli_gemv_unf_var2.c              |   2 +-
 frame/2/ger/bli_ger_unb_var1.c                |   4 +-
 frame/2/ger/bli_ger_unb_var2.c                |   4 +-
 frame/2/hemv/bli_hemv_unb_var1.c              |  12 +-
 frame/2/hemv/bli_hemv_unb_var2.c              |  12 +-
 frame/2/hemv/bli_hemv_unb_var3.c              |  12 +-
 frame/2/hemv/bli_hemv_unb_var4.c              |  12 +-
 frame/2/hemv/bli_hemv_unf_var1.c              |  20 +-
 frame/2/hemv/bli_hemv_unf_var1a.c             |  14 +-
 frame/2/hemv/bli_hemv_unf_var3.c              |  20 +-
 frame/2/hemv/bli_hemv_unf_var3a.c             |  14 +-
 frame/2/her/bli_her_unb_var1.c                |  16 +-
 frame/2/her/bli_her_unb_var2.c                |  16 +-
 frame/2/her2/bli_her2_unb_var1.c              |  26 +-
 frame/2/her2/bli_her2_unb_var2.c              |  26 +-
 frame/2/her2/bli_her2_unb_var3.c              |  26 +-
 frame/2/her2/bli_her2_unb_var4.c              |  26 +-
 frame/2/her2/bli_her2_unf_var1.c              |  26 +-
 frame/2/her2/bli_her2_unf_var4.c              |  26 +-
 frame/2/trmv/bli_trmv_unb_var1.c              |  16 +-
 frame/2/trmv/bli_trmv_unb_var2.c              |  16 +-
 frame/2/trmv/bli_trmv_unf_var1.c              |  28 +-
 frame/2/trmv/bli_trmv_unf_var2.c              |  24 +-
 frame/2/trsv/bli_trsv_unb_var1.c              |  12 +-
 frame/2/trsv/bli_trsv_unb_var2.c              |  12 +-
 frame/2/trsv/bli_trsv_unf_var1.c              |  24 +-
 frame/2/trsv/bli_trsv_unf_var2.c              |  20 +-
 frame/3/bli_l3_sup_var12.c                    |   4 +-
 frame/3/gemmt/bli_gemmt_l_ker_var2.c          |   6 +-
 frame/3/gemmt/bli_gemmt_l_ker_var2b.c         |   6 +-
 frame/3/gemmt/bli_gemmt_u_ker_var2.c          |   6 +-
 frame/3/gemmt/bli_gemmt_u_ker_var2b.c         |   6 +-
 frame/base/bli_machval.c                      |   2 +-
 frame/base/bli_obj.c                          |  38 +-
 frame/base/bli_query.c                        |   4 +-
 frame/base/bli_setgetijm.c                    |   4 +-
 frame/base/bli_setgetijv.c                    |   4 +-
 frame/base/cast/bli_castm.c                   |   8 +-
 frame/base/cast/bli_castnzm.c                 |   8 +-
 frame/base/cast/bli_castv.c                   |   8 +-
 frame/compat/bla_dot.c                        |   4 +-
 frame/compat/bla_her2k.c                      |   8 +-
 frame/compat/bla_herk.c                       |   8 +-
 frame/compat/bla_scal.c                       |   4 +-
 frame/compat/f2c/bla_gbmv.c                   | 144 +--
 frame/compat/f2c/bla_hbmv.c                   | 256 ++---
 frame/compat/f2c/bla_hpmv.c                   | 256 ++---
 frame/compat/f2c/bla_hpr.c                    | 128 +--
 frame/compat/f2c/bla_hpr2.c                   | 224 ++---
 frame/compat/f2c/bla_rot.c                    | 180 ++--
 frame/compat/f2c/bla_rotg.c                   |  44 +-
 frame/compat/f2c/bla_tbmv.c                   | 288 +++---
 frame/compat/f2c/bla_tbsv.c                   | 240 ++---
 frame/compat/f2c/bla_tpmv.c                   | 288 +++---
 frame/compat/f2c/bla_tpsv.c                   | 240 ++---
 frame/compat/f2c/util/bla_c_div.c             |   4 +-
 frame/compat/f2c/util/bla_d_cnjg.c            |   2 +-
 frame/compat/f2c/util/bla_r_cnjg.c            |   2 +-
 frame/compat/f2c/util/bla_z_div.c             |   4 +-
 frame/include/bli_cast_macro_defs.h           | 529 ++++++++++
 frame/include/bli_complex_macro_defs.h        |   2 +
 frame/include/bli_edge_case_macro_defs.h      |   8 +-
 frame/include/bli_genarray_macro_defs.h       |  10 +
 frame/include/bli_gentfunc_macro_defs.h       |  12 +-
 frame/include/bli_gentprot_macro_defs.h       |   9 +
 frame/include/bli_macro_defs.h                |   1 +
 frame/include/bli_misc_macro_defs.h           |  37 +-
 frame/include/bli_scalar_macro_defs.h         | 234 +----
 frame/include/level0/1e/bli_copy1es.h         |  85 --
 frame/include/level0/1e/bli_copyj1es.h        |  85 --
 frame/include/level0/1e/bli_invert1es.h       |  53 -
 frame/include/level0/1e/bli_scal1es.h         |  53 -
 frame/include/level0/1e/bli_scal21es.h        | 235 -----
 frame/include/level0/1e/bli_scal2j1es.h       | 235 -----
 .../level0/1m/bli_invert1ms_mxn_diag.h        | 126 ---
 frame/include/level0/1m/bli_scal1ms_mxn.h     | 124 ---
 frame/include/level0/1m/bli_scal21ms_mxn.h    | 202 ----
 .../include/level0/1m/bli_scal21ms_mxn_diag.h | 126 ---
 .../include/level0/1m/bli_scal21ms_mxn_uplo.h | 296 ------
 frame/include/level0/1m/bli_set1ms_mxn.h      | 194 ----
 frame/include/level0/1m/bli_set1ms_mxn_diag.h | 130 ---
 frame/include/level0/1m/bli_set1ms_mxn_uplo.h | 198 ----
 .../include/level0/1m/bli_seti01ms_mxn_diag.h | 114 ---
 frame/include/level0/1r/bli_scal1rs.h         |  61 --
 frame/include/level0/bb/bli_scal2bbs_mxn.h    | 204 ----
 frame/include/level0/bli_absq2s.h             |  90 --
 frame/include/level0/bli_abval2s.h            |  97 --
 frame/include/level0/bli_add3s.h              | 192 ----
 frame/include/level0/bli_addjs.h              |  88 --
 frame/include/level0/bli_adds.h               |  88 --
 frame/include/level0/bli_adds_mxn.h           | 513 ----------
 frame/include/level0/bli_adds_mxn_uplo.h      | 212 ----
 .../{old/ri3/bli_copyri3s.h => bli_assigns.h} |  35 +-
 frame/include/level0/bli_axmys.h              | 192 ----
 frame/include/level0/bli_axpbyjs.h            | 480 ---------
 frame/include/level0/bli_axpbys.h             | 480 ---------
 frame/include/level0/bli_axpbys_mxn.h         | 129 ---
 frame/include/level0/bli_axpyjs.h             | 192 ----
 frame/include/level0/bli_axpys.h              | 192 ----
 .../level0/{bli_lt.h => bli_complex_terms.h}  |  67 +-
 frame/include/level0/bli_conjs.h              |  57 --
 frame/include/level0/bli_copycjs.h            |  92 --
 frame/include/level0/bli_copyjnzs.h           |  80 --
 frame/include/level0/bli_copyjs.h             |  92 --
 frame/include/level0/bli_copynzs.h            |  78 --
 frame/include/level0/bli_copys.h              |  78 --
 frame/include/level0/bli_copys_mxn.h          | 676 -------------
 .../level0/{bli_inverts.h => bli_declinits.h} |  36 +-
 frame/include/level0/bli_dotjs.h              | 141 ---
 frame/include/level0/bli_dots.h               | 141 ---
 frame/include/level0/bli_eq.h                 | 119 ---
 frame/include/level0/bli_gets.h               |  83 --
 frame/include/level0/bli_invscaljs.h          |  88 --
 frame/include/level0/bli_invscals.h           |  88 --
 frame/include/level0/bli_lte.h                |  71 --
 frame/include/level0/bli_neg2s.h              |  88 --
 frame/include/level0/bli_randnp2s.h           | 175 ----
 frame/include/level0/bli_rands.h              |  74 --
 frame/include/level0/bli_scal2js.h            | 192 ----
 frame/include/level0/bli_scal2s.h             | 191 ----
 frame/include/level0/bli_scal2s_mxn.h         | 122 ---
 frame/include/level0/bli_scalcjs.h            |  88 --
 frame/include/level0/bli_scaljs.h             |  88 --
 frame/include/level0/bli_scals.h              |  88 --
 frame/include/level0/bli_set0s_mxn.h          |  76 --
 frame/include/level0/bli_set1s.h              |  44 -
 frame/include/level0/bli_setis.h              |  76 --
 frame/include/level0/bli_sets.h               |  99 --
 frame/include/level0/bli_sqrt2s.h             |  97 --
 frame/include/level0/bli_subjs.h              |  88 --
 frame/include/level0/bli_subs.h               |  88 --
 frame/include/level0/bli_swaps.h              | 171 ----
 frame/include/level0/bli_tabsq2s.h            | 135 +++
 frame/include/level0/bli_tabval2s.h           | 206 ++++
 frame/include/level0/bli_tadd3s.h             | 199 ++++
 frame/include/level0/bli_tadds.h              | 176 ++++
 frame/include/level0/bli_taxpbys.h            | 278 ++++++
 frame/include/level0/bli_taxpys.h             | 241 +++++
 .../{1r/bli_scal21rs.h => bli_tconjs.h}       |  61 +-
 frame/include/level0/bli_tcopycjs.h           | 129 +++
 frame/include/level0/bli_tcopynzs.h           | 191 ++++
 frame/include/level0/bli_tcopys.h             | 254 +++++
 frame/include/level0/bli_tdots.h              | 120 +++
 frame/include/level0/bli_teqs.h               | 171 ++++
 .../level0/{bli_fprints.h => bli_tfprints.h}  |  89 +-
 .../level0/{bli_setrs.h => bli_tgets.h}       | 103 +-
 frame/include/level0/bli_tinverts.h           | 226 +++++
 frame/include/level0/bli_tinvscals.h          | 264 +++++
 .../{old/ri3/bli_scal2ri3s.h => bli_tneg2s.h} |  88 +-
 .../{1r/bli_scal2j1rs.h => bli_trandnp2s.h}   |  61 +-
 .../ri3/bli_scal2jri3s.h => bli_trands.h}     |  75 +-
 frame/include/level0/bli_tscal2s.h            | 654 +++++++++++++
 frame/include/level0/bli_tscalcjs.h           | 129 +++
 frame/include/level0/bli_tscals.h             | 284 ++++++
 frame/include/level0/bli_tsets.h              | 306 ++++++
 frame/include/level0/bli_tsqrt2s.h            | 197 ++++
 frame/include/level0/bli_tsubs.h              | 161 +++
 frame/include/level0/bli_tswaps.h             | 154 +++
 frame/include/level0/bli_txpbys.h             | 325 +++++++
 frame/include/level0/bli_xpbyjs.h             | 191 ----
 frame/include/level0/bli_xpbys.h              | 191 ----
 frame/include/level0/bli_xpbys_mxn.h          | 830 ----------------
 frame/include/level0/bli_xpbys_mxn_uplo.h     | 300 ------
 frame/include/level0/old/bli_cast.h           | 150 ---
 frame/include/level0/old/bli_castfrom.h       |  33 -
 frame/include/level0/old/bli_castto.h         |  33 -
 frame/include/level0/old/bli_copynzjs.h       | 140 ---
 frame/include/level0/old/bli_copynzs.h        | 139 ---
 frame/include/level0/old/bli_invscalcjs.h     | 159 ---
 frame/include/level0/old/bli_scalcjs.h        | 156 ---
 frame/include/level0/old/bli_set0ris_mxn.h    |  81 --
 frame/include/level0/old/io/bli_scal2ios.h    |  61 --
 frame/include/level0/old/io/bli_scal2jios.h   |  52 -
 frame/include/level0/old/ri3/bli_copyjri3s.h  |  46 -
 .../level0/old/ri3/bli_scal2ri3s_mxn.h        | 183 ----
 .../level0/old/rih/bli_scal2rihs_mxn.h        | 283 ------
 .../level0/old/rih/bli_scal2rihs_mxn_diag.h   | 110 ---
 .../level0/old/rih/bli_scal2rihs_mxn_uplo.h   | 348 -------
 .../level0/old/rih/bli_setrihs_mxn_diag.h     | 110 ---
 frame/include/level0/old/ro/bli_scal2jros.h   |  51 -
 frame/include/level0/old/ro/bli_scal2ros.h    |  62 --
 frame/include/level0/old/rpi/bli_scal2jrpis.h |  53 -
 frame/include/level0/old/rpi/bli_scal2rpis.h  |  66 --
 frame/include/level0/ri/bli_absq2ris.h        |  63 --
 frame/include/level0/ri/bli_add3ris.h         |  63 --
 frame/include/level0/ri/bli_addjris.h         |  46 -
 frame/include/level0/ri/bli_addris.h          |  63 --
 frame/include/level0/ri/bli_axmyris.h         |  75 --
 frame/include/level0/ri/bli_axpbyjris.h       |  91 --
 frame/include/level0/ri/bli_axpbyris.h        |  91 --
 frame/include/level0/ri/bli_axpyjris.h        | 169 ----
 frame/include/level0/ri/bli_axpyris.h         | 169 ----
 frame/include/level0/ri/bli_conjris.h         |  61 --
 frame/include/level0/ri/bli_copycjris.h       |  69 --
 frame/include/level0/ri/bli_copyjris.h        |  66 --
 frame/include/level0/ri/bli_copyris.h         |  84 --
 frame/include/level0/ri/bli_eqris.h           |  76 --
 frame/include/level0/ri/bli_invertris.h       |  70 --
 frame/include/level0/ri/bli_invscaljris.h     |  49 -
 frame/include/level0/ri/bli_invscalris.h      |  84 --
 frame/include/level0/ri/bli_neg2ris.h         |  63 --
 frame/include/level0/ri/bli_scal2jris.h       | 173 ----
 frame/include/level0/ri/bli_scal2ris.h        | 173 ----
 frame/include/level0/ri/bli_scal2ris_mxn.h    | 173 ----
 frame/include/level0/ri/bli_scalcjris.h       |  77 --
 frame/include/level0/ri/bli_scaljris.h        |  49 -
 frame/include/level0/ri/bli_scalris.h         |  79 --
 .../include/level0/ri/bli_scalris_mxn_uplo.h  | 110 ---
 frame/include/level0/ri/bli_set0ris.h         |  46 -
 frame/include/level0/ri/bli_subjris.h         |  46 -
 frame/include/level0/ri/bli_subris.h          |  63 --
 frame/include/level0/ri/bli_swapris.h         |  77 --
 frame/include/level0/ri/bli_xpbyjris.h        | 162 ----
 frame/include/level0/ri/bli_xpbyris.h         | 162 ----
 frame/util/bli_util_check.c                   |  27 +-
 frame/util/bli_util_check.h                   |   4 +-
 frame/util/bli_util_fpa.c                     |  25 +-
 frame/util/bli_util_fpa.h                     |   4 +-
 frame/util/bli_util_ft.h                      |   6 +-
 frame/util/bli_util_oapi.c                    |   6 +-
 frame/util/bli_util_oapi.h                    |   4 +-
 frame/util/bli_util_tapi.c                    |  30 +-
 frame/util/bli_util_tapi.h                    |  12 +-
 frame/util/bli_util_unb_var1.c                | 174 ++--
 ref_kernels/1/bli_addv_ref.c                  |   8 +-
 ref_kernels/1/bli_amaxv_ref.c                 |  36 +-
 ref_kernels/1/bli_axpbyv_ref.c                |  24 +-
 ref_kernels/1/bli_axpyv_ref.c                 |  12 +-
 ref_kernels/1/bli_copyv_ref.c                 |   8 +-
 ref_kernels/1/bli_dotv_ref.c                  |  16 +-
 ref_kernels/1/bli_dotxv_ref.c                 |  22 +-
 ref_kernels/1/bli_invertv_ref.c               |   4 +-
 ref_kernels/1/bli_invscalv_ref.c              |  10 +-
 ref_kernels/1/bli_scal2v_ref.c                |  12 +-
 ref_kernels/1/bli_scalv_ref.c                 |  10 +-
 ref_kernels/1/bli_setv_ref.c                  |  12 +-
 ref_kernels/1/bli_subv_ref.c                  |   8 +-
 ref_kernels/1/bli_swapv_ref.c                 |   4 +-
 ref_kernels/1/bli_xpbyv_ref.c                 |  12 +-
 ref_kernels/1f/bli_axpy2v_ref.c               |  24 +-
 ref_kernels/1f/bli_axpyf_ref.c                |  12 +-
 ref_kernels/1f/bli_dotaxpyv_ref.c             |  28 +-
 ref_kernels/1f/bli_dotxaxpyf_ref.c            |  34 +-
 ref_kernels/1f/bli_dotxf_ref.c                |  18 +-
 ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c   |  74 +-
 ref_kernels/1m/bli_packm_cxc_diag_ref.c       |  40 +-
 ref_kernels/1m/bli_packm_cxc_diag_ro_ref.c    |  27 +-
 ref_kernels/1m/bli_packm_cxk_1er_ref.c        |  28 +-
 ref_kernels/1m/bli_packm_cxk_ref.c            |  59 +-
 ref_kernels/1m/bli_packm_cxk_ro_ref.c         |  13 +-
 ref_kernels/1m/bli_unpackm_cxk_ref.c          |   6 +-
 ref_kernels/3/bli_gemm_ref.c                  |  72 +-
 ref_kernels/3/bli_gemmsup_ref.c               | 116 +--
 ref_kernels/3/bli_gemmtrsm_ref.c              |  10 +-
 ref_kernels/3/bli_trsm_ref.c                  |  24 +-
 ref_kernels/ind/bli_gemm1m_ref.c              |   7 +-
 ref_kernels/ind/bli_gemm_ccr_ref.c            |   7 +-
 ref_kernels/ind/bli_gemm_crr_ref.c            |  39 +-
 ref_kernels/ind/bli_gemm_rcc_ref.c            |   3 +-
 ref_kernels/ind/bli_gemmtrsm1m_ref.c          | 168 ++--
 sandbox/gemmlike/bls_l3_packm_var1.c          |   4 +-
 test/level0/Makefile                          | 153 +++
 .../bli_seti0s.h => test/level0/test_l0.cxx   |  21 +-
 test/level0/test_l0.hpp                       | 914 ++++++++++++++++++
 .../level0/test_tabsq2s.cxx                   |  99 +-
 test/level0/test_tabval2s.cxx                 |  86 ++
 test/level0/test_tadd3s.cxx                   | 189 ++++
 test/level0/test_tadds.cxx                    | 196 ++++
 test/level0/test_taxpbys.cxx                  | 256 +++++
 test/level0/test_taxpys.cxx                   | 181 ++++
 .../level0/test_tconjs.cxx                    |  83 +-
 test/level0/test_tcopycjs.cxx                 | 115 +++
 test/level0/test_tcopynzs.cxx                 | 150 +++
 test/level0/test_tcopys.cxx                   | 301 ++++++
 .../level0/test_tdots.cxx                     |  17 +-
 test/level0/test_teqs.cxx                     | 218 +++++
 .../level0/test_tfprints.cxx                  |  25 +-
 .../level0/test_tgets.cxx                     |  59 +-
 test/level0/test_tinverts.cxx                 | 141 +++
 test/level0/test_tinvscals.cxx                | 146 +++
 .../level0/test_tneg2s.cxx                    | 103 +-
 .../level0/test_trandnp2s.cxx                 |  18 +-
 .../level0/test_trands.cxx                    |  25 +-
 test/level0/test_tscal2s.cxx                  | 535 ++++++++++
 .../level0/test_tscalcjs.cxx                  |  61 +-
 test/level0/test_tscals.cxx                   | 277 ++++++
 test/level0/test_tsets.cxx                    | 361 +++++++
 test/level0/test_tsqrt2s.cxx                  |  86 ++
 test/level0/test_tsubs.cxx                    | 164 ++++
 test/level0/test_tswaps.cxx                   |  98 ++
 test/level0/test_txpbys.cxx                   | 311 ++++++
 testsuite/src/test_amaxv.c                    |  22 +-
 testsuite/src/test_libblis.c                  |   4 +-
 testsuite/src/test_randm.c                    |   8 +-
 testsuite/src/test_setm.c                     |   6 +-
 testsuite/src/test_setv.c                     |  14 +-
 321 files changed, 13361 insertions(+), 18835 deletions(-)
 create mode 100755 ci/do_level0.sh
 create mode 100644 frame/include/bli_cast_macro_defs.h
 delete mode 100644 frame/include/level0/1e/bli_copy1es.h
 delete mode 100644 frame/include/level0/1e/bli_copyj1es.h
 delete mode 100644 frame/include/level0/1e/bli_invert1es.h
 delete mode 100644 frame/include/level0/1e/bli_scal1es.h
 delete mode 100644 frame/include/level0/1e/bli_scal21es.h
 delete mode 100644 frame/include/level0/1e/bli_scal2j1es.h
 delete mode 100644 frame/include/level0/1m/bli_invert1ms_mxn_diag.h
 delete mode 100644 frame/include/level0/1m/bli_scal1ms_mxn.h
 delete mode 100644 frame/include/level0/1m/bli_scal21ms_mxn.h
 delete mode 100644 frame/include/level0/1m/bli_scal21ms_mxn_diag.h
 delete mode 100644 frame/include/level0/1m/bli_scal21ms_mxn_uplo.h
 delete mode 100644 frame/include/level0/1m/bli_set1ms_mxn.h
 delete mode 100644 frame/include/level0/1m/bli_set1ms_mxn_diag.h
 delete mode 100644 frame/include/level0/1m/bli_set1ms_mxn_uplo.h
 delete mode 100644 frame/include/level0/1m/bli_seti01ms_mxn_diag.h
 delete mode 100644 frame/include/level0/1r/bli_scal1rs.h
 delete mode 100644 frame/include/level0/bb/bli_scal2bbs_mxn.h
 delete mode 100644 frame/include/level0/bli_absq2s.h
 delete mode 100644 frame/include/level0/bli_abval2s.h
 delete mode 100644 frame/include/level0/bli_add3s.h
 delete mode 100644 frame/include/level0/bli_addjs.h
 delete mode 100644 frame/include/level0/bli_adds.h
 delete mode 100644 frame/include/level0/bli_adds_mxn.h
 delete mode 100644 frame/include/level0/bli_adds_mxn_uplo.h
 rename frame/include/level0/{old/ri3/bli_copyri3s.h => bli_assigns.h} (77%)
 delete mode 100644 frame/include/level0/bli_axmys.h
 delete mode 100644 frame/include/level0/bli_axpbyjs.h
 delete mode 100644 frame/include/level0/bli_axpbys.h
 delete mode 100644 frame/include/level0/bli_axpbys_mxn.h
 delete mode 100644 frame/include/level0/bli_axpyjs.h
 delete mode 100644 frame/include/level0/bli_axpys.h
 rename frame/include/level0/{bli_lt.h => bli_complex_terms.h} (60%)
 delete mode 100644 frame/include/level0/bli_conjs.h
 delete mode 100644 frame/include/level0/bli_copycjs.h
 delete mode 100644 frame/include/level0/bli_copyjnzs.h
 delete mode 100644 frame/include/level0/bli_copyjs.h
 delete mode 100644 frame/include/level0/bli_copynzs.h
 delete mode 100644 frame/include/level0/bli_copys.h
 delete mode 100644 frame/include/level0/bli_copys_mxn.h
 rename frame/include/level0/{bli_inverts.h => bli_declinits.h} (64%)
 delete mode 100644 frame/include/level0/bli_dotjs.h
 delete mode 100644 frame/include/level0/bli_dots.h
 delete mode 100644 frame/include/level0/bli_eq.h
 delete mode 100644 frame/include/level0/bli_gets.h
 delete mode 100644 frame/include/level0/bli_invscaljs.h
 delete mode 100644 frame/include/level0/bli_invscals.h
 delete mode 100644 frame/include/level0/bli_lte.h
 delete mode 100644 frame/include/level0/bli_neg2s.h
 delete mode 100644 frame/include/level0/bli_randnp2s.h
 delete mode 100644 frame/include/level0/bli_rands.h
 delete mode 100644 frame/include/level0/bli_scal2js.h
 delete mode 100644 frame/include/level0/bli_scal2s.h
 delete mode 100644 frame/include/level0/bli_scal2s_mxn.h
 delete mode 100644 frame/include/level0/bli_scalcjs.h
 delete mode 100644 frame/include/level0/bli_scaljs.h
 delete mode 100644 frame/include/level0/bli_scals.h
 delete mode 100644 frame/include/level0/bli_set0s_mxn.h
 delete mode 100644 frame/include/level0/bli_set1s.h
 delete mode 100644 frame/include/level0/bli_setis.h
 delete mode 100644 frame/include/level0/bli_sets.h
 delete mode 100644 frame/include/level0/bli_sqrt2s.h
 delete mode 100644 frame/include/level0/bli_subjs.h
 delete mode 100644 frame/include/level0/bli_subs.h
 delete mode 100644 frame/include/level0/bli_swaps.h
 create mode 100644 frame/include/level0/bli_tabsq2s.h
 create mode 100644 frame/include/level0/bli_tabval2s.h
 create mode 100644 frame/include/level0/bli_tadd3s.h
 create mode 100644 frame/include/level0/bli_tadds.h
 create mode 100644 frame/include/level0/bli_taxpbys.h
 create mode 100644 frame/include/level0/bli_taxpys.h
 rename frame/include/level0/{1r/bli_scal21rs.h => bli_tconjs.h} (60%)
 create mode 100644 frame/include/level0/bli_tcopycjs.h
 create mode 100644 frame/include/level0/bli_tcopynzs.h
 create mode 100644 frame/include/level0/bli_tcopys.h
 create mode 100644 frame/include/level0/bli_tdots.h
 create mode 100644 frame/include/level0/bli_teqs.h
 rename frame/include/level0/{bli_fprints.h => bli_tfprints.h} (52%)
 rename frame/include/level0/{bli_setrs.h => bli_tgets.h} (57%)
 create mode 100644 frame/include/level0/bli_tinverts.h
 create mode 100644 frame/include/level0/bli_tinvscals.h
 rename frame/include/level0/{old/ri3/bli_scal2ri3s.h => bli_tneg2s.h} (53%)
 rename frame/include/level0/{1r/bli_scal2j1rs.h => bli_trandnp2s.h} (60%)
 rename frame/include/level0/{old/ri3/bli_scal2jri3s.h => bli_trands.h} (60%)
 create mode 100644 frame/include/level0/bli_tscal2s.h
 create mode 100644 frame/include/level0/bli_tscalcjs.h
 create mode 100644 frame/include/level0/bli_tscals.h
 create mode 100644 frame/include/level0/bli_tsets.h
 create mode 100644 frame/include/level0/bli_tsqrt2s.h
 create mode 100644 frame/include/level0/bli_tsubs.h
 create mode 100644 frame/include/level0/bli_tswaps.h
 create mode 100644 frame/include/level0/bli_txpbys.h
 delete mode 100644 frame/include/level0/bli_xpbyjs.h
 delete mode 100644 frame/include/level0/bli_xpbys.h
 delete mode 100644 frame/include/level0/bli_xpbys_mxn.h
 delete mode 100644 frame/include/level0/bli_xpbys_mxn_uplo.h
 delete mode 100644 frame/include/level0/old/bli_cast.h
 delete mode 100644 frame/include/level0/old/bli_castfrom.h
 delete mode 100644 frame/include/level0/old/bli_castto.h
 delete mode 100644 frame/include/level0/old/bli_copynzjs.h
 delete mode 100644 frame/include/level0/old/bli_copynzs.h
 delete mode 100644 frame/include/level0/old/bli_invscalcjs.h
 delete mode 100644 frame/include/level0/old/bli_scalcjs.h
 delete mode 100644 frame/include/level0/old/bli_set0ris_mxn.h
 delete mode 100644 frame/include/level0/old/io/bli_scal2ios.h
 delete mode 100644 frame/include/level0/old/io/bli_scal2jios.h
 delete mode 100644 frame/include/level0/old/ri3/bli_copyjri3s.h
 delete mode 100644 frame/include/level0/old/ri3/bli_scal2ri3s_mxn.h
 delete mode 100644 frame/include/level0/old/rih/bli_scal2rihs_mxn.h
 delete mode 100644 frame/include/level0/old/rih/bli_scal2rihs_mxn_diag.h
 delete mode 100644 frame/include/level0/old/rih/bli_scal2rihs_mxn_uplo.h
 delete mode 100644 frame/include/level0/old/rih/bli_setrihs_mxn_diag.h
 delete mode 100644 frame/include/level0/old/ro/bli_scal2jros.h
 delete mode 100644 frame/include/level0/old/ro/bli_scal2ros.h
 delete mode 100644 frame/include/level0/old/rpi/bli_scal2jrpis.h
 delete mode 100644 frame/include/level0/old/rpi/bli_scal2rpis.h
 delete mode 100644 frame/include/level0/ri/bli_absq2ris.h
 delete mode 100644 frame/include/level0/ri/bli_add3ris.h
 delete mode 100644 frame/include/level0/ri/bli_addjris.h
 delete mode 100644 frame/include/level0/ri/bli_addris.h
 delete mode 100644 frame/include/level0/ri/bli_axmyris.h
 delete mode 100644 frame/include/level0/ri/bli_axpbyjris.h
 delete mode 100644 frame/include/level0/ri/bli_axpbyris.h
 delete mode 100644 frame/include/level0/ri/bli_axpyjris.h
 delete mode 100644 frame/include/level0/ri/bli_axpyris.h
 delete mode 100644 frame/include/level0/ri/bli_conjris.h
 delete mode 100644 frame/include/level0/ri/bli_copycjris.h
 delete mode 100644 frame/include/level0/ri/bli_copyjris.h
 delete mode 100644 frame/include/level0/ri/bli_copyris.h
 delete mode 100644 frame/include/level0/ri/bli_eqris.h
 delete mode 100644 frame/include/level0/ri/bli_invertris.h
 delete mode 100644 frame/include/level0/ri/bli_invscaljris.h
 delete mode 100644 frame/include/level0/ri/bli_invscalris.h
 delete mode 100644 frame/include/level0/ri/bli_neg2ris.h
 delete mode 100644 frame/include/level0/ri/bli_scal2jris.h
 delete mode 100644 frame/include/level0/ri/bli_scal2ris.h
 delete mode 100644 frame/include/level0/ri/bli_scal2ris_mxn.h
 delete mode 100644 frame/include/level0/ri/bli_scalcjris.h
 delete mode 100644 frame/include/level0/ri/bli_scaljris.h
 delete mode 100644 frame/include/level0/ri/bli_scalris.h
 delete mode 100644 frame/include/level0/ri/bli_scalris_mxn_uplo.h
 delete mode 100644 frame/include/level0/ri/bli_set0ris.h
 delete mode 100644 frame/include/level0/ri/bli_subjris.h
 delete mode 100644 frame/include/level0/ri/bli_subris.h
 delete mode 100644 frame/include/level0/ri/bli_swapris.h
 delete mode 100644 frame/include/level0/ri/bli_xpbyjris.h
 delete mode 100644 frame/include/level0/ri/bli_xpbyris.h
 create mode 100644 test/level0/Makefile
 rename frame/include/level0/bli_seti0s.h => test/level0/test_l0.cxx (83%)
 create mode 100644 test/level0/test_l0.hpp
 rename frame/include/level0/ri/bli_sqrt2ris.h => test/level0/test_tabsq2s.cxx (55%)
 create mode 100644 test/level0/test_tabval2s.cxx
 create mode 100644 test/level0/test_tadd3s.cxx
 create mode 100644 test/level0/test_tadds.cxx
 create mode 100644 test/level0/test_taxpbys.cxx
 create mode 100644 test/level0/test_taxpys.cxx
 rename frame/include/level0/bli_set0s_edge.h => test/level0/test_tconjs.cxx (61%)
 create mode 100644 test/level0/test_tcopycjs.cxx
 create mode 100644 test/level0/test_tcopynzs.cxx
 create mode 100644 test/level0/test_tcopys.cxx
 rename frame/include/level0/1r/bli_invert1rs.h => test/level0/test_tdots.cxx (81%)
 create mode 100644 test/level0/test_teqs.cxx
 rename frame/include/level0/1r/bli_copyj1rs.h => test/level0/test_tfprints.cxx (82%)
 rename frame/include/level0/bb/bli_bcastbbs_mxn.h => test/level0/test_tgets.cxx (63%)
 create mode 100644 test/level0/test_tinverts.cxx
 create mode 100644 test/level0/test_tinvscals.cxx
 rename frame/include/level0/ri/bli_abval2ris.h => test/level0/test_tneg2s.cxx (50%)
 rename frame/include/level0/bli_set0s.h => test/level0/test_trandnp2s.cxx (82%)
 rename frame/include/level0/1r/bli_copy1rs.h => test/level0/test_trands.cxx (82%)
 create mode 100644 test/level0/test_tscal2s.cxx
 rename frame/include/level0/bb/bli_set0bbs_mxn.h => test/level0/test_tscalcjs.cxx (60%)
 create mode 100644 test/level0/test_tscals.cxx
 create mode 100644 test/level0/test_tsets.cxx
 create mode 100644 test/level0/test_tsqrt2s.cxx
 create mode 100644 test/level0/test_tsubs.cxx
 create mode 100644 test/level0/test_tswaps.cxx
 create mode 100644 test/level0/test_txpbys.cxx

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 339fc11cf..8c73a0eec 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -30,6 +30,7 @@ workflows:
       #     OOT: 0
       #     TEST: FAST
       #     SDE: 0
+      #     LEVEL0: 0
       #     THR: none
       #     CONF: auto
       #     BLD: ''
@@ -41,20 +42,18 @@ workflows:
       - build:
           OOT: 1
           TEST: ALL
-          SDE: 0
           CONF: x86_64
 
       # SDE testing for x86_64
       - build:
           # linuxvm must be used because it provides 8G RAM and SDE fails with 4G RAM
           os: linuxvm
-          OOT: 0
-          TEST: FAST
           SDE: 1
           CONF: x86_64
 
-      # openmp build
+      # openmp build + LEVEL0
       - build:
+          LEVEL0: 1
           THR: openmp
 
       # pthreads build
@@ -153,6 +152,9 @@ jobs:
       SDE:
         type: integer
         default: 0
+      LEVEL0:
+        type: integer
+        default: 0
       THR:
         type: string
         default: none
@@ -197,6 +199,7 @@ jobs:
             export BLD="<< parameters.BLD >>"
             export LDFLAGS="<< parameters.LDFLAGS >>"
             export SDE="<< parameters.SDE >>"
+            export LEVEL0="<< parameters.LEVEL0 >>"
             export THR="<< parameters.THR >>"
             export TESTSUITE_WRAPPER="<< parameters.TESTSUITE_WRAPPER >>"
 
@@ -232,6 +235,7 @@ jobs:
             echo "TEST              = $TEST"
             echo "BLD               = $BLD"
             echo "SDE               = $SDE"
+            echo "LEVEL0            = $LEVEL0"
             echo "DIST_PATH         = $DIST_PATH"
             echo "LDFLAGS           = $LDFLAGS"
             echo "TESTSUITE_WRAPPER = $TESTSUITE_WRAPPER"
@@ -251,3 +255,4 @@ jobs:
             if [ "$CONF" = "armsve" ]; then sed -i 's/.*\<gemmt\>.*/0/' $DIST_PATH/testsuite/input.operations.fast; fi
             if [ "$TEST" != "0" ]; then $DIST_PATH/ci/do_testsuite.sh; fi
             if [ "$SDE" = "1" ]; then $DIST_PATH/ci/do_sde.sh; fi
+            if [ "$LEVEL0" = "1" ]; then $DIST_PATH/ci/do_level0.sh; fi
diff --git a/blastest/src/dblat1.c b/blastest/src/dblat1.c
index e84867178..ccac12c88 100644
--- a/blastest/src/dblat1.c
+++ b/blastest/src/dblat1.c
@@ -1044,7 +1044,7 @@ static real c_b81 = 0.f;
 
     /* Local variables */
     real sd;
-    extern real s_epsilon_();
+    extern real s_epsilon_(real *);
 
     /* Fortran I/O blocks */
     static cilist io___125 = { 0, 6, 0, fmt_99999, 0 };
diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def
index 36c68fce2..ec68592d7 100644
--- a/build/libblis-symbols.def
+++ b/build/libblis-symbols.def
@@ -121,7 +121,7 @@ bli_cger_ex
 bli_cgetijm
 bli_cgetijv
 bli_cgetsc
-bli_cgtesc
+bli_cgesc
 bli_cgtsc
 bli_check_error_code_helper
 bli_chemm
@@ -149,7 +149,7 @@ bli_cinvscalv
 bli_cinvscalv_ex
 bli_clock
 bli_clock_min_diff
-bli_cltesc
+bli_clesc
 bli_cltsc
 bli_cmachval
 bli_cmkherm
@@ -351,7 +351,7 @@ bli_dger_ex
 bli_dgetijm
 bli_dgetijv
 bli_dgetsc
-bli_dgtesc
+bli_dgesc
 bli_dgtsc
 bli_dhemm
 bli_dhemm_ex
@@ -377,7 +377,7 @@ bli_dinvscalm_ex
 bli_dinvscalv
 bli_dinvscalv_ex
 bli_divsc
-bli_dltesc
+bli_dlesc
 bli_dltsc
 bli_dmachval
 bli_dmkherm
@@ -532,7 +532,7 @@ bli_gks_l3_ukr_impl_type
 bli_gks_query_cntx
 bli_gks_query_ind_cntx
 bli_gks_query_nat_cntx
-bli_gtesc
+bli_gesc
 bli_gtsc
 bli_hemm
 bli_hemm_ex
@@ -630,7 +630,7 @@ bli_iprintm
 bli_iprintv
 bli_isetsc
 bli_l3_thrinfo_create
-bli_ltesc
+bli_lesc
 bli_ltsc
 bli_machval
 bli_malloc_user
@@ -824,7 +824,7 @@ bli_sger_ex
 bli_sgetijm
 bli_sgetijv
 bli_sgetsc
-bli_sgtesc
+bli_sgesc
 bli_sgtsc
 bli_shemm
 bli_shemm_ex
@@ -852,7 +852,7 @@ bli_sinvscalm_ex
 bli_sinvscalv
 bli_sinvscalv_ex
 bli_sleep
-bli_sltesc
+bli_slesc
 bli_sltsc
 bli_smachval
 bli_smkherm
@@ -1097,7 +1097,7 @@ bli_zger_ex
 bli_zgetijm
 bli_zgetijv
 bli_zgetsc
-bli_zgtesc
+bli_zgesc
 bli_zgtsc
 bli_zhemm
 bli_zhemm_ex
@@ -1123,7 +1123,7 @@ bli_zinvscalm_ex
 bli_zinvscalv
 bli_zinvscalv_ex
 bli_zipsc
-bli_zltesc
+bli_zlesc
 bli_zltsc
 bli_zmachval
 bli_zmkherm
diff --git a/build/plugin/my_kernel_1_ref.c b/build/plugin/my_kernel_1_ref.c
index dc3433814..42fa593c1 100644
--- a/build/plugin/my_kernel_1_ref.c
+++ b/build/plugin/my_kernel_1_ref.c
@@ -48,7 +48,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 \
 	for ( dim_t i = 0; i < n; ++i ) \
 	{ \
-		PASTEMAC(ch,copys)( *a, x[ i ] ); \
+		bli_tcopys( ch,ch, *a, x[ i ] ); \
 	} \
 }
 
diff --git a/build/plugin/my_kernel_2_ref.c b/build/plugin/my_kernel_2_ref.c
index 27aa1e96b..0d241b5c1 100644
--- a/build/plugin/my_kernel_2_ref.c
+++ b/build/plugin/my_kernel_2_ref.c
@@ -56,7 +56,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 		{ \
 			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
-				PASTEMAC(ch,seti0s)( a[ i*n + j ] ); \
+				bli_tseti0s( ch, a[ i*n + j ] ); \
 			} \
 		} \
 	} \
@@ -66,7 +66,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 		{ \
 			for ( dim_t j = 0; j < n; ++j ) \
 			{ \
-				PASTEMAC(ch,seti0s)( a[ i + j*m ] ); \
+				bli_tseti0s( ch, a[ i + j*m ] ); \
 			} \
 		} \
 	} \
diff --git a/ci/do_level0.sh b/ci/do_level0.sh
new file mode 100755
index 000000000..792e07590
--- /dev/null
+++ b/ci/do_level0.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Build and run the level-0 tests from test/level0 under the current directory.
+set -e  # stop at the first failing command
+set -x  # echo each command before running it
+# Root of the source tree; may be overridden through the environment.
+: "${SRCDIR:=../..}"
+# No local test/level0: symlink the sources in, using absolute targets so the links resolve.
+if ! [ -d test/level0 ]; then
+    mkdir -p test/level0
+    ln -s "$(cd "$SRCDIR" && pwd)"/test/level0/* test/level0/
+fi
+
+cd test/level0
+make -j2
+# Run the test driver (presumably produced by the make step); its status is the script's status.
+./test_l0.x
diff --git a/common.mk b/common.mk
index 2ad418772..584eb20f4 100644
--- a/common.mk
+++ b/common.mk
@@ -63,7 +63,6 @@ $(eval $(call store-var-for,CC,         $(1)))
 $(eval $(call store-var-for,CC_VENDOR,  $(1)))
 $(eval $(call store-var-for,CPPROCFLAGS,$(1)))
 $(eval $(call store-var-for,CLANGFLAGS, $(1)))
-$(eval $(call store-var-for,CXXLANGFLAGS,$(1)))
 $(eval $(call store-var-for,CMISCFLAGS, $(1)))
 $(eval $(call store-var-for,CPICFLAGS,  $(1)))
 $(eval $(call store-var-for,CWARNFLAGS, $(1)))
@@ -109,8 +108,8 @@ get-noopt-cxxflags-for   = $(strip $(CXXFLAGS_PRESET) \
                                    $(call load-var-for,CWARNFLAGS,$(1)) \
                                    $(call load-var-for,CPICFLAGS,$(1)) \
                                    $(call load-var-for,CMISCFLAGS,$(1)) \
-                                   $(call load-var-for,CXXLANGFLAGS,$(1)) \
                                    $(call load-var-for,CPPROCFLAGS,$(1)) \
+                                   $(CXXLANGFLAGS) \
                                    $(CTHREADFLAGS) \
                                    $(CXXTHREADFLAGS) \
                                    $(CINCFLAGS) \
@@ -776,6 +775,11 @@ endif
 
 CWARNFLAGS :=
 
+# Do not allow functions with implicit definitions to be called
+ifneq ($(CC_VENDOR),ibm)
+CWARNFLAGS += -Werror=implicit-function-declaration
+endif
+
 # Disable unused function warnings and stop compiling on first error for
 # all compilers that accept such options: gcc, clang, and icc.
 ifneq ($(CC_VENDOR),ibm)
@@ -914,7 +918,6 @@ endif
 else
 CXXLANGFLAGS :=
 endif
-$(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CXXLANGFLAGS,$(c))))
 
 # --- C Preprocessor flags ---
 
diff --git a/config/template/kernels/1/bli_axpyv_template_noopt_var1.c b/config/template/kernels/1/bli_axpyv_template_noopt_var1.c
index 8796bab26..cc3c078c1 100644
--- a/config/template/kernels/1/bli_axpyv_template_noopt_var1.c
+++ b/config/template/kernels/1/bli_axpyv_template_noopt_var1.c
@@ -117,7 +117,7 @@ void bli_zaxpyv_template_noopt
 
 	if ( bli_zero_dim1( n ) ) return;
 
-	if ( bli_zeq0( *alpha ) ) return;
+	if ( bli_teq0s( z, *alpha ) ) return;
 
 
 	// If there is anything that would interfere with our use of aligned
@@ -179,7 +179,7 @@ void bli_zaxpyv_template_noopt
 		// Compute front edge cases if x and y were unaligned.
 		for ( i = 0; i < n_pre; ++i )
 		{
-			bli_zaxpys( *alpha, *xp, *yp );
+			bli_taxpys( z,z,z,z, *alpha, *xp, *yp );
 
 			xp += 1; yp += 1;
 		}
@@ -188,7 +188,7 @@ void bli_zaxpyv_template_noopt
 		// yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
 		for ( i = 0; i < n_iter; ++i )
 		{
-			bli_zaxpys( *alpha, *xp, *yp );
+			bli_taxpys( z,z,z,z, *alpha, *xp, *yp );
 
 			xp += n_elem_per_iter;
 			yp += n_elem_per_iter;
@@ -197,7 +197,7 @@ void bli_zaxpyv_template_noopt
 		// Compute tail edge cases, if applicable.
 		for ( i = 0; i < n_left; ++i )
 		{
-			bli_zaxpys( *alpha, *xp, *yp );
+			bli_taxpys( z,z,z,z, *alpha, *xp, *yp );
 
 			xp += 1; yp += 1;
 		}
@@ -207,7 +207,7 @@ void bli_zaxpyv_template_noopt
 		// Compute front edge cases if x and y were unaligned.
 		for ( i = 0; i < n_pre; ++i )
 		{
-			bli_zaxpyjs( *alpha, *xp, *yp );
+			bli_taxpyjs( z,z,z,z, *alpha, *xp, *yp );
 
 			xp += 1; yp += 1;
 		}
@@ -216,7 +216,7 @@ void bli_zaxpyv_template_noopt
 		// yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
 		for ( i = 0; i < n_iter; ++i )
 		{
-			bli_zaxpyjs( *alpha, *xp, *yp );
+			bli_taxpyjs( z,z,z,z, *alpha, *xp, *yp );
 
 			xp += n_elem_per_iter;
 			yp += n_elem_per_iter;
@@ -225,7 +225,7 @@ void bli_zaxpyv_template_noopt
 		// Compute tail edge cases, if applicable.
 		for ( i = 0; i < n_left; ++i )
 		{
-			bli_zaxpyjs( *alpha, *xp, *yp );
+			bli_taxpyjs( z,z,z,z, *alpha, *xp, *yp );
 
 			xp += 1; yp += 1;
 		}
diff --git a/config/template/kernels/1/bli_dotv_template_noopt_var1.c b/config/template/kernels/1/bli_dotv_template_noopt_var1.c
index 90f93b817..c59c0d620 100644
--- a/config/template/kernels/1/bli_dotv_template_noopt_var1.c
+++ b/config/template/kernels/1/bli_dotv_template_noopt_var1.c
@@ -127,7 +127,7 @@ void bli_zdotv_template_noopt
 	// If the vector lengths are zero, set rho to zero and return.
 	if ( bli_zero_dim1( n ) )
 	{
-		bli_zset0s( *rho );
+		bli_tset0s( z, *rho );
 		return;
 	}
 
@@ -185,7 +185,7 @@ void bli_zdotv_template_noopt
 
 
 	// Initialize accumulator to zero.
-	bli_zset0s( dotxy );
+	bli_tset0s( z, dotxy );
 
 
 	conjx_use = conjx;
@@ -204,7 +204,7 @@ void bli_zdotv_template_noopt
 		// Compute front edge cases if x and y were unaligned.
 		for ( i = 0; i < n_pre; ++i )
 		{
-			bli_zdots( *xp, *yp, dotxy );
+			bli_tdots( z,z,z,z, *xp, *yp, dotxy );
 
 			xp += 1; yp += 1;
 		}
@@ -213,7 +213,7 @@ void bli_zdotv_template_noopt
 		// yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
 		for ( i = 0; i < n_iter; ++i )
 		{
-			bli_zdots( *xp, *yp, dotxy );
+			bli_tdots( z,z,z,z, *xp, *yp, dotxy );
 
 			xp += n_elem_per_iter;
 			yp += n_elem_per_iter;
@@ -222,7 +222,7 @@ void bli_zdotv_template_noopt
 		// Compute tail edge cases, if applicable.
 		for ( i = 0; i < n_left; ++i )
 		{
-			bli_zdots( *xp, *yp, dotxy );
+			bli_tdots( z,z,z,z, *xp, *yp, dotxy );
 
 			xp += 1; yp += 1;
 		}
@@ -232,7 +232,7 @@ void bli_zdotv_template_noopt
 		// Compute front edge cases if x and y were unaligned.
 		for ( i = 0; i < n_pre; ++i )
 		{
-			bli_zdotjs( *xp, *yp, dotxy );
+			bli_tdotjs( z,z,z,z, *xp, *yp, dotxy );
 
 			xp += 1; yp += 1;
 		}
@@ -241,7 +241,7 @@ void bli_zdotv_template_noopt
 		// yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
 		for ( i = 0; i < n_iter; ++i )
 		{
-			bli_zdotjs( *xp, *yp, dotxy );
+			bli_tdotjs( z,z,z,z, *xp, *yp, dotxy );
 
 			xp += n_elem_per_iter;
 			yp += n_elem_per_iter;
@@ -250,7 +250,7 @@ void bli_zdotv_template_noopt
 		// Compute tail edge cases, if applicable.
 		for ( i = 0; i < n_left; ++i )
 		{
-			bli_zdotjs( *xp, *yp, dotxy );
+			bli_tdotjs( z,z,z,z, *xp, *yp, dotxy );
 
 			xp += 1; yp += 1;
 		}
@@ -259,8 +259,8 @@ void bli_zdotv_template_noopt
 	// If conjugation on y was requested, we induce it by conjugating
 	// the contents of dotxy.
 	if ( bli_is_conj( conjy ) )
-		bli_zconjs( dotxy );
+		bli_tconjs( z, dotxy );
 
-	bli_zcopys( dotxy, *rho );
+	bli_tcopys( z,z, dotxy, *rho );
 }
 
diff --git a/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c b/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c
index 5a12bf761..649511cf0 100644
--- a/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c
+++ b/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c
@@ -194,8 +194,8 @@ void bli_zaxpy2v_template_noopt
 		// Compute front edge cases if x, y, and z were unaligned.
 		for ( i = 0; i < n_pre; ++i )
 		{
-			bli_zaxpys( *alpha1, *xp, *zp );
-			bli_zaxpys( *alpha2, *yp, *zp );
+			bli_taxpys( z,z,z,z, *alpha1, *xp, *zp );
+			bli_taxpys( z,z,z,z, *alpha2, *yp, *zp );
 
 			xp += 1; yp += 1; zp += 1;
 		}
@@ -207,8 +207,8 @@ void bli_zaxpy2v_template_noopt
 		// to BLIS_SIMD_ALIGN_SIZE.
 		for ( i = 0; i < n_iter; ++i )
 		{
-			bli_zaxpys( *alpha1, *xp, *zp );
-			bli_zaxpys( *alpha2, *yp, *zp );
+			bli_taxpys( z,z,z,z, *alpha1, *xp, *zp );
+			bli_taxpys( z,z,z,z, *alpha2, *yp, *zp );
 
 			xp += n_elem_per_iter;
 			yp += n_elem_per_iter;
@@ -218,8 +218,8 @@ void bli_zaxpy2v_template_noopt
 		// Compute tail edge cases, if applicable.
 		for ( i = 0; i < n_left; ++i )
 		{
-			bli_zaxpys( *alpha1, *xp, *zp );
-			bli_zaxpys( *alpha2, *yp, *zp );
+			bli_taxpys( z,z,z,z, *alpha1, *xp, *zp );
+			bli_taxpys( z,z,z,z, *alpha2, *yp, *zp );
 
 			xp += 1; yp += 1; zp += 1;
 		}
@@ -229,8 +229,8 @@ void bli_zaxpy2v_template_noopt
 		// Compute front edge cases if x, y, and z were unaligned.
 		for ( i = 0; i < n_pre; ++i )
 		{
-			bli_zaxpys(  *alpha1, *xp, *zp );
-			bli_zaxpyjs( *alpha2, *yp, *zp );
+			bli_taxpys( z,z,z,z,  *alpha1, *xp, *zp );
+			bli_taxpyjs( z,z,z,z, *alpha2, *yp, *zp );
 
 			xp += 1; yp += 1; zp += 1;
 		}
@@ -242,8 +242,8 @@ void bli_zaxpy2v_template_noopt
 		// to BLIS_SIMD_ALIGN_SIZE.
 		for ( i = 0; i < n_iter; ++i )
 		{
-			bli_zaxpys(  *alpha1, *xp, *zp );
-			bli_zaxpyjs( *alpha2, *yp, *zp );
+			bli_taxpys( z,z,z,z,  *alpha1, *xp, *zp );
+			bli_taxpyjs( z,z,z,z, *alpha2, *yp, *zp );
 
 			xp += n_elem_per_iter;
 			yp += n_elem_per_iter;
@@ -253,8 +253,8 @@ void bli_zaxpy2v_template_noopt
 		// Compute tail edge cases, if applicable.
 		for ( i = 0; i < n_left; ++i )
 		{
-			bli_zaxpys(  *alpha1, *xp, *zp );
-			bli_zaxpyjs( *alpha2, *yp, *zp );
+			bli_taxpys( z,z,z,z,  *alpha1, *xp, *zp );
+			bli_taxpyjs( z,z,z,z, *alpha2, *yp, *zp );
 
 			xp += 1; yp += 1; zp += 1;
 		}
@@ -264,8 +264,8 @@ void bli_zaxpy2v_template_noopt
 		// Compute front edge cases if x, y, and z were unaligned.
 		for ( i = 0; i < n_pre; ++i )
 		{
-			bli_zaxpyjs( *alpha1, *xp, *zp );
-			bli_zaxpys(  *alpha2, *yp, *zp );
+			bli_taxpyjs( z,z,z,z, *alpha1, *xp, *zp );
+			bli_taxpys( z,z,z,z,  *alpha2, *yp, *zp );
 
 			xp += 1; yp += 1; zp += 1;
 		}
@@ -277,8 +277,8 @@ void bli_zaxpy2v_template_noopt
 		// to BLIS_SIMD_ALIGN_SIZE.
 		for ( i = 0; i < n_iter; ++i )
 		{
-			bli_zaxpyjs( *alpha1, *xp, *zp );
-			bli_zaxpys(  *alpha2, *yp, *zp );
+			bli_taxpyjs( z,z,z,z, *alpha1, *xp, *zp );
+			bli_taxpys( z,z,z,z,  *alpha2, *yp, *zp );
 
 			xp += n_elem_per_iter;
 			yp += n_elem_per_iter;
@@ -288,8 +288,8 @@ void bli_zaxpy2v_template_noopt
 		// Compute tail edge cases, if applicable.
 		for ( i = 0; i < n_left; ++i )
 		{
-			bli_zaxpyjs( *alpha1, *xp, *zp );
-			bli_zaxpys(  *alpha2, *yp, *zp );
+			bli_taxpyjs( z,z,z,z, *alpha1, *xp, *zp );
+			bli_taxpys( z,z,z,z,  *alpha2, *yp, *zp );
 
 			xp += 1; yp += 1; zp += 1;
 		}
@@ -299,8 +299,8 @@ void bli_zaxpy2v_template_noopt
 		// Compute front edge cases if x, y, and z were unaligned.
 		for ( i = 0; i < n_pre; ++i )
 		{
-			bli_zaxpyjs( *alpha1, *xp, *zp );
-			bli_zaxpyjs( *alpha2, *yp, *zp );
+			bli_taxpyjs( z,z,z,z, *alpha1, *xp, *zp );
+			bli_taxpyjs( z,z,z,z, *alpha2, *yp, *zp );
 
 			xp += 1; yp += 1; zp += 1;
 		}
@@ -312,8 +312,8 @@ void bli_zaxpy2v_template_noopt
 		// to BLIS_SIMD_ALIGN_SIZE.
 		for ( i = 0; i < n_iter; ++i )
 		{
-			bli_zaxpyjs( *alpha1, *xp, *zp );
-			bli_zaxpyjs( *alpha2, *yp, *zp );
+			bli_taxpyjs( z,z,z,z, *alpha1, *xp, *zp );
+			bli_taxpyjs( z,z,z,z, *alpha2, *yp, *zp );
 
 			xp += n_elem_per_iter;
 			yp += n_elem_per_iter;
@@ -323,8 +323,8 @@ void bli_zaxpy2v_template_noopt
 		// Compute tail edge cases, if applicable.
 		for ( i = 0; i < n_left; ++i )
 		{
-			bli_zaxpyjs( *alpha1, *xp, *zp );
-			bli_zaxpyjs( *alpha2, *yp, *zp );
+			bli_taxpyjs( z,z,z,z, *alpha1, *xp, *zp );
+			bli_taxpyjs( z,z,z,z, *alpha2, *yp, *zp );
 
 			xp += 1; yp += 1; zp += 1;
 		}
diff --git a/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c b/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c
index f7b492286..834c2fc24 100644
--- a/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c
+++ b/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c
@@ -209,16 +209,16 @@ void bli_zaxpyf_template_noopt
 	{
 		for ( j = 0; j < b_n; ++j )
 		{
-			bli_zcopys( *xp[ j ], alpha_x[ j ] );
-			bli_zscals( *alpha, alpha_x[ j ] );
+			bli_tcopys( z,z, *xp[ j ], alpha_x[ j ] );
+			bli_tscals( z,z,z, *alpha, alpha_x[ j ] );
 		}
 	}
 	else // if ( bli_is_conj( conjx ) )
 	{
 		for ( j = 0; j < b_n; ++j )
 		{
-			bli_zcopyjs( *xp[ j ], alpha_x[ j ] );
-			bli_zscals( *alpha, alpha_x[ j ] );
+			bli_tcopyjs( z,z, *xp[ j ], alpha_x[ j ] );
+			bli_tscals( z,z,z, *alpha, alpha_x[ j ] );
 		}
 	}
 
@@ -231,7 +231,7 @@ void bli_zaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zaxpys( alpha_x[ j ], *ap[ j ], *yp );
+				bli_taxpys( z,z,z,z, alpha_x[ j ], *ap[ j ], *yp );
 
 				ap[ j ] += 1;
 			}
@@ -247,7 +247,7 @@ void bli_zaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zaxpys( alpha_x[ j ], *ap[ j ], *yp );
+				bli_taxpys( z,z,z,z, alpha_x[ j ], *ap[ j ], *yp );
 
 				ap[ j ] += n_elem_per_iter;
 			}
@@ -259,7 +259,7 @@ void bli_zaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zaxpys( alpha_x[ j ], *ap[ j ], *yp );
+				bli_taxpys( z,z,z,z, alpha_x[ j ], *ap[ j ], *yp );
 
 				ap[ j ] += 1;
 			}
@@ -273,7 +273,7 @@ void bli_zaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zaxpyjs( alpha_x[ j ], *ap[ j ], *yp );
+				bli_taxpyjs( z,z,z,z, alpha_x[ j ], *ap[ j ], *yp );
 
 				ap[ j ] += 1;
 			}
@@ -289,7 +289,7 @@ void bli_zaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zaxpyjs( alpha_x[ j ], *ap[ j ], *yp );
+				bli_taxpyjs( z,z,z,z, alpha_x[ j ], *ap[ j ], *yp );
 
 				ap[ j ] += n_elem_per_iter;
 			}
@@ -301,7 +301,7 @@ void bli_zaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zaxpyjs( alpha_x[ j ], *ap[ j ], *yp );
+				bli_taxpyjs( z,z,z,z, alpha_x[ j ], *ap[ j ], *yp );
 
 				ap[ j ] += 1;
 			}
diff --git a/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c b/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c
index 31a3097c0..ae806d50d 100644
--- a/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c
+++ b/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c
@@ -138,7 +138,7 @@ void bli_zdotaxpyv_template_noopt
 	// If the vector lengths are zero, set rho to zero and return.
 	if ( bli_zero_dim1( n ) )
 	{
-		bli_zset0s( *rho );
+		bli_tset0s( z, *rho );
 		return;
 	}
 
@@ -202,7 +202,7 @@ void bli_zdotaxpyv_template_noopt
 
 
 	// Initialize accumulator to zero.
-	bli_zset0s( dotxy );
+	bli_tset0s( z, dotxy );
 
 
 	conjxt_use = conjxt;
@@ -222,8 +222,8 @@ void bli_zdotaxpyv_template_noopt
 		// Compute front edge cases if x, y, and z were unaligned.
 		for ( i = 0; i < n_pre; ++i )
 		{
-			bli_zdots( *xp, *yp, dotxy );
-			bli_zaxpys( *alpha, *xp, *zp );
+			bli_tdots( z,z,z,z, *xp, *yp, dotxy );
+			bli_taxpys( z,z,z,z, *alpha, *xp, *zp );
 
 			xp += 1; yp += 1; zp += 1;
 		}
@@ -235,8 +235,8 @@ void bli_zdotaxpyv_template_noopt
 		// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
 		for ( i = 0; i < n_iter; ++i )
 		{
-			bli_zdots( *xp, *yp, dotxy );
-			bli_zaxpys( *alpha, *xp, *zp );
+			bli_tdots( z,z,z,z, *xp, *yp, dotxy );
+			bli_taxpys( z,z,z,z, *alpha, *xp, *zp );
 
 			xp += n_elem_per_iter;
 			yp += n_elem_per_iter;
@@ -246,8 +246,8 @@ void bli_zdotaxpyv_template_noopt
 		// Compute tail edge cases, if applicable.
 		for ( i = 0; i < n_left; ++i )
 		{
-			bli_zdots( *xp, *yp, dotxy );
-			bli_zaxpys( *alpha, *xp, *zp );
+			bli_tdots( z,z,z,z, *xp, *yp, dotxy );
+			bli_taxpys( z,z,z,z, *alpha, *xp, *zp );
 
 			xp += 1; yp += 1; zp += 1;
 		}
@@ -257,8 +257,8 @@ void bli_zdotaxpyv_template_noopt
 		// Compute front edge cases if x, y, and z were unaligned.
 		for ( i = 0; i < n_pre; ++i )
 		{
-			bli_zdotjs( *xp, *yp, dotxy );
-			bli_zaxpys( *alpha, *xp, *zp );
+			bli_tdotjs( z,z,z,z, *xp, *yp, dotxy );
+			bli_taxpys( z,z,z,z, *alpha, *xp, *zp );
 
 			xp += 1; yp += 1; zp += 1;
 		}
@@ -270,8 +270,8 @@ void bli_zdotaxpyv_template_noopt
 		// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
 		for ( i = 0; i < n_iter; ++i )
 		{
-			bli_zdotjs( *xp, *yp, dotxy );
-			bli_zaxpys( *alpha, *xp, *zp );
+			bli_tdotjs( z,z,z,z, *xp, *yp, dotxy );
+			bli_taxpys( z,z,z,z, *alpha, *xp, *zp );
 
 			xp += n_elem_per_iter;
 			yp += n_elem_per_iter;
@@ -281,8 +281,8 @@ void bli_zdotaxpyv_template_noopt
 		// Compute tail edge cases, if applicable.
 		for ( i = 0; i < n_left; ++i )
 		{
-			bli_zdotjs( *xp, *yp, dotxy );
-			bli_zaxpys( *alpha, *xp, *zp );
+			bli_tdotjs( z,z,z,z, *xp, *yp, dotxy );
+			bli_taxpys( z,z,z,z, *alpha, *xp, *zp );
 
 			xp += 1; yp += 1; zp += 1;
 		}
@@ -292,8 +292,8 @@ void bli_zdotaxpyv_template_noopt
 		// Compute front edge cases if x, y, and z were unaligned.
 		for ( i = 0; i < n_pre; ++i )
 		{
-			bli_zdots( *xp, *yp, dotxy );
-			bli_zaxpyjs( *alpha, *xp, *zp );
+			bli_tdots( z,z,z,z, *xp, *yp, dotxy );
+			bli_taxpyjs( z,z,z,z, *alpha, *xp, *zp );
 
 			xp += 1; yp += 1; zp += 1;
 		}
@@ -305,8 +305,8 @@ void bli_zdotaxpyv_template_noopt
 		// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
 		for ( i = 0; i < n_iter; ++i )
 		{
-			bli_zdots( *xp, *yp, dotxy );
-			bli_zaxpyjs( *alpha, *xp, *zp );
+			bli_tdots( z,z,z,z, *xp, *yp, dotxy );
+			bli_taxpyjs( z,z,z,z, *alpha, *xp, *zp );
 
 			xp += n_elem_per_iter;
 			yp += n_elem_per_iter;
@@ -316,8 +316,8 @@ void bli_zdotaxpyv_template_noopt
 		// Compute tail edge cases, if applicable.
 		for ( i = 0; i < n_left; ++i )
 		{
-			bli_zdots( *xp, *yp, dotxy );
-			bli_zaxpyjs( *alpha, *xp, *zp );
+			bli_tdots( z,z,z,z, *xp, *yp, dotxy );
+			bli_taxpyjs( z,z,z,z, *alpha, *xp, *zp );
 
 			xp += 1; yp += 1; zp += 1;
 		}
@@ -327,8 +327,8 @@ void bli_zdotaxpyv_template_noopt
 		// Compute front edge cases if x, y, and z were unaligned.
 		for ( i = 0; i < n_pre; ++i )
 		{
-			bli_zdotjs( *xp, *yp, dotxy );
-			bli_zaxpyjs( *alpha, *xp, *zp );
+			bli_tdotjs( z,z,z,z, *xp, *yp, dotxy );
+			bli_taxpyjs( z,z,z,z, *alpha, *xp, *zp );
 
 			xp += 1; yp += 1; zp += 1;
 		}
@@ -340,8 +340,8 @@ void bli_zdotaxpyv_template_noopt
 		// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
 		for ( i = 0; i < n_iter; ++i )
 		{
-			bli_zdotjs( *xp, *yp, dotxy );
-			bli_zaxpyjs( *alpha, *xp, *zp );
+			bli_tdotjs( z,z,z,z, *xp, *yp, dotxy );
+			bli_taxpyjs( z,z,z,z, *alpha, *xp, *zp );
 
 			xp += n_elem_per_iter;
 			yp += n_elem_per_iter;
@@ -351,8 +351,8 @@ void bli_zdotaxpyv_template_noopt
 		// Compute tail edge cases, if applicable.
 		for ( i = 0; i < n_left; ++i )
 		{
-			bli_zdotjs( *xp, *yp, dotxy );
-			bli_zaxpyjs( *alpha, *xp, *zp );
+			bli_tdotjs( z,z,z,z, *xp, *yp, dotxy );
+			bli_taxpyjs( z,z,z,z, *alpha, *xp, *zp );
 
 			xp += 1; yp += 1; zp += 1;
 		}
@@ -361,8 +361,8 @@ void bli_zdotaxpyv_template_noopt
 	// If conjugation on y was requested, we induce it by conjugating
 	// the contents of rho.
 	if ( bli_is_conj( conjy ) )
-		bli_zconjs( dotxy );
+		bli_tconjs( z, dotxy );
 
-	bli_zcopys( dotxy, *rho );
+	bli_tcopys( z,z, dotxy, *rho );
 }
 
diff --git a/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c b/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c
index aeb502f35..468647ff2 100644
--- a/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c
+++ b/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c
@@ -238,23 +238,23 @@ void bli_zdotxaxpyf_template_noopt
 	{
 		for ( j = 0; j < b_n; ++j )
 		{
-			bli_zcopys( *xp[ j ], alpha_x[ j ] );
-			bli_zscals( *alpha, alpha_x[ j ] );
+			bli_tcopys( z,z, *xp[ j ], alpha_x[ j ] );
+			bli_tscals( z,z,z, *alpha, alpha_x[ j ] );
 		}
 	}
 	else // if ( bli_is_conj( conjx ) )
 	{
 		for ( j = 0; j < b_n; ++j )
 		{
-			bli_zcopyjs( *xp[ j ], alpha_x[ j ] );
-			bli_zscals( *alpha, alpha_x[ j ] );
+			bli_tcopyjs( z,z, *xp[ j ], alpha_x[ j ] );
+			bli_tscals( z,z,z, *alpha, alpha_x[ j ] );
 		}
 	}
 
 	// Initialize our accumulators to zero.
 	for ( j = 0; j < b_n; ++j )
 	{
-		bli_zset0s( At_w[ j ] );
+		bli_tset0s( z, At_w[ j ] );
 	}
 
 
@@ -278,8 +278,8 @@ void bli_zdotxaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zdots( *ap[ j ], *wp, At_w[ j ] );
-				bli_zdots( *ap[ j ], alpha_x[ j ], *zp );
+				bli_tdots( z,z,z,z, *ap[ j ], *wp, At_w[ j ] );
+				bli_tdots( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp );
 
 				ap[ j ] += 1;
 			}
@@ -295,8 +295,8 @@ void bli_zdotxaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zdots( *ap[ j ], *wp, At_w[ j ] );
-				bli_zdots( *ap[ j ], alpha_x[ j ], *zp );
+				bli_tdots( z,z,z,z, *ap[ j ], *wp, At_w[ j ] );
+				bli_tdots( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp );
 
 				ap[ j ] += n_elem_per_iter;
 			}
@@ -308,8 +308,8 @@ void bli_zdotxaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zdots( *ap[ j ], *wp, At_w[ j ] );
-				bli_zdots( *ap[ j ], alpha_x[ j ], *zp );
+				bli_tdots( z,z,z,z, *ap[ j ], *wp, At_w[ j ] );
+				bli_tdots( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp );
 
 				ap[ j ] += 1;
 			}
@@ -323,8 +323,8 @@ void bli_zdotxaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zdotjs( *ap[ j ], *wp, At_w[ j ] );
-				bli_zdots( *ap[ j ], alpha_x[ j ], *zp );
+				bli_tdotjs( z,z,z,z, *ap[ j ], *wp, At_w[ j ] );
+				bli_tdots( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp );
 
 				ap[ j ] += 1;
 			}
@@ -340,8 +340,8 @@ void bli_zdotxaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zdotjs( *ap[ j ], *wp, At_w[ j ] );
-				bli_zdots( *ap[ j ], alpha_x[ j ], *zp );
+				bli_tdotjs( z,z,z,z, *ap[ j ], *wp, At_w[ j ] );
+				bli_tdots( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp );
 
 				ap[ j ] += n_elem_per_iter;
 			}
@@ -353,8 +353,8 @@ void bli_zdotxaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zdotjs( *ap[ j ], *wp, At_w[ j ] );
-				bli_zdots( *ap[ j ], alpha_x[ j ], *zp );
+				bli_tdotjs( z,z,z,z, *ap[ j ], *wp, At_w[ j ] );
+				bli_tdots( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp );
 
 				ap[ j ] += 1;
 			}
@@ -368,8 +368,8 @@ void bli_zdotxaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zdots( *ap[ j ], *wp, At_w[ j ] );
-				bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp );
+				bli_tdots( z,z,z,z, *ap[ j ], *wp, At_w[ j ] );
+				bli_tdotjs( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp );
 
 				ap[ j ] += 1;
 			}
@@ -385,8 +385,8 @@ void bli_zdotxaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zdots( *ap[ j ], *wp, At_w[ j ] );
-				bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp );
+				bli_tdots( z,z,z,z, *ap[ j ], *wp, At_w[ j ] );
+				bli_tdotjs( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp );
 
 				ap[ j ] += n_elem_per_iter;
 			}
@@ -398,8 +398,8 @@ void bli_zdotxaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zdots( *ap[ j ], *wp, At_w[ j ] );
-				bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp );
+				bli_tdots( z,z,z,z, *ap[ j ], *wp, At_w[ j ] );
+				bli_tdotjs( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp );
 
 				ap[ j ] += 1;
 			}
@@ -413,8 +413,8 @@ void bli_zdotxaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zdotjs( *ap[ j ], *wp, At_w[ j ] );
-				bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp );
+				bli_tdotjs( z,z,z,z, *ap[ j ], *wp, At_w[ j ] );
+				bli_tdotjs( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp );
 
 				ap[ j ] += 1;
 			}
@@ -430,8 +430,8 @@ void bli_zdotxaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zdotjs( *ap[ j ], *wp, At_w[ j ] );
-				bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp );
+				bli_tdotjs( z,z,z,z, *ap[ j ], *wp, At_w[ j ] );
+				bli_tdotjs( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp );
 
 				ap[ j ] += n_elem_per_iter;
 			}
@@ -443,8 +443,8 @@ void bli_zdotxaxpyf_template_noopt
 		{
 			for ( j = 0; j < b_n; ++j )
 			{
-				bli_zdotjs( *ap[ j ], *wp, At_w[ j ] );
-				bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp );
+				bli_tdotjs( z,z,z,z, *ap[ j ], *wp, At_w[ j ] );
+				bli_tdotjs( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp );
 
 				ap[ j ] += 1;
 			}
@@ -459,7 +459,7 @@ void bli_zdotxaxpyf_template_noopt
 	{
 		for ( j = 0; j < b_n; ++j )
 		{
-			bli_zconjs( At_w[ j ] );
+			bli_tconjs( z, At_w[ j ] );
 		}
 	}
 
@@ -467,8 +467,8 @@ void bli_zdotxaxpyf_template_noopt
 	// scaling by beta.
 	for ( j = 0; j < b_n; ++j )
 	{
-		bli_zscals( *beta, *yp[ j ] );
-		bli_zaxpys( *alpha, At_w[ j ], *yp[ j ] );
+		bli_tscals( z,z,z, *beta, *yp[ j ] );
+		bli_taxpys( z,z,z,z, *alpha, At_w[ j ], *yp[ j ] );
 	}
 }
 
diff --git a/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c b/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c
index 650303afe..ac62ff999 100644
--- a/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c
+++ b/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c
@@ -227,7 +227,7 @@ void bli_zdotxf_template_noopt
 	// Initialize our accumulators to zero.
 	for ( i = 0; i < b_n; ++i )
 	{
-		bli_zset0s( Atx[ i ] );
+		bli_tset0s( z, Atx[ i ] );
 	}
 
 
@@ -249,7 +249,7 @@ void bli_zdotxf_template_noopt
 		{
 			for ( i = 0; i < b_n; ++i )
 			{
-				bli_zzzdots( *ap[ i ], *xp, Atx[ i ] );
+				bli_tdots( z,z,z,z, *ap[ i ], *xp, Atx[ i ] );
 
 				ap[ i ] += 1;
 			}
@@ -264,7 +264,7 @@ void bli_zdotxf_template_noopt
 		{
 			for ( i = 0; i < b_n; ++i )
 			{
-				bli_zzzdots( *ap[ i ], *xp, Atx[ i ] );
+				bli_tdots( z,z,z,z, *ap[ i ], *xp, Atx[ i ] );
 
 				ap[ i ] += n_elem_per_iter;
 			}
@@ -276,7 +276,7 @@ void bli_zdotxf_template_noopt
 		{
 			for ( i = 0; i < b_n; ++i )
 			{
-				bli_zzzdots( *ap[ i ], *xp, Atx[ i ] );
+				bli_tdots( z,z,z,z, *ap[ i ], *xp, Atx[ i ] );
 
 				ap[ i ] += 1;
 			}
@@ -290,7 +290,7 @@ void bli_zdotxf_template_noopt
 		{
 			for ( i = 0; i < b_n; ++i )
 			{
-				bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] );
+				bli_tdotjs( z,z,z,z, *ap[ i ], *xp, Atx[ i ] );
 
 				ap[ i ] += 1;
 			}
@@ -305,7 +305,7 @@ void bli_zdotxf_template_noopt
 		{
 			for ( i = 0; i < b_n; ++i )
 			{
-				bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] );
+				bli_tdotjs( z,z,z,z, *ap[ i ], *xp, Atx[ i ] );
 
 				ap[ i ] += n_elem_per_iter;
 			}
@@ -317,7 +317,7 @@ void bli_zdotxf_template_noopt
 		{
 			for ( i = 0; i < b_n; ++i )
 			{
-				bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] );
+				bli_tdotjs( z,z,z,z, *ap[ i ], *xp, Atx[ i ] );
 
 				ap[ i ] += 1;
 			}
@@ -332,7 +332,7 @@ void bli_zdotxf_template_noopt
 	{
 		for ( i = 0; i < b_n; ++i )
 		{
-			bli_zconjs( Atx[ i ] );
+			bli_tconjs( z, Atx[ i ] );
 		}
 	}
 
@@ -341,8 +341,8 @@ void bli_zdotxf_template_noopt
 	// scaling by beta.
 	for ( i = 0; i < b_n; ++i )
 	{
-		bli_zzscals( *beta, *yp[ i ] );
-		bli_zzzaxpys( *alpha, Atx[ i ], *yp[ i ] );
+		bli_tscals( z,z,z, *beta, *yp[ i ] );
+		bli_taxpys( z,z,z,z, *alpha, Atx[ i ], *yp[ i ] );
 	}
 }
 
diff --git a/config/template/kernels/3/bli_gemm_template_noopt_mxn.c b/config/template/kernels/3/bli_gemm_template_noopt_mxn.c
index 190519fa0..97a924b0a 100644
--- a/config/template/kernels/3/bli_gemm_template_noopt_mxn.c
+++ b/config/template/kernels/3/bli_gemm_template_noopt_mxn.c
@@ -98,7 +98,7 @@ void bli_zgemm_template_noopt
 	/* Initialize the accumulator elements in ab to zero. */
 	for ( i = 0; i < mr * nr; ++i )
 	{
-		bli_zset0s( *(ab + i) );
+		bli_tset0s( z, *(ab + i) );
 	}
 
 	/* Perform a series of k rank-1 updates into ab. */
@@ -116,7 +116,7 @@ void bli_zgemm_template_noopt
 			{
 				ai = *(a1 + i);
 
-				bli_zdots( ai, bj, *abij );
+				bli_tdots( z,z,z,z, ai, bj, *abij );
 
 				abij += rs_ab;
 			}
@@ -129,16 +129,17 @@ void bli_zgemm_template_noopt
 	/* Scale each element of ab by alpha. */
 	for ( i = 0; i < mr * nr; ++i )
 	{
-		bli_zscals( *alpha, *(ab + i) );
+		bli_tscals( z,z,z, *alpha, *(ab + i) );
 	}
 
 	/* If beta is zero, overwrite c11 with the scaled result in ab.
 	   Otherwise, scale c11 by beta and then add the scaled result in
 	   ab. */
-	if ( bli_zeq0( *beta ) )
+	if ( bli_teq0s( z, *beta ) )
 	{
 		/* c11 := ab */
-		bli_zcopys_mxn( m,
+		bli_tcopys_mxn( z,z,
+		                m,
 		                n,
 		                ab,  rs_ab, cs_ab,
 		                c11, rs_c,  cs_c );
@@ -146,7 +147,8 @@ void bli_zgemm_template_noopt
 	else
 	{
 		/* c11 := beta * c11 + ab */
-		bli_zxpbys_mxn( m,
+		bli_txpbys_mxn( z,z,z,z,
+		                m,
 		                n,
 		                ab,  rs_ab, cs_ab,
 		                beta,
diff --git a/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c b/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c
index 4e6634dea..2688a7bc5 100644
--- a/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c
+++ b/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c
@@ -116,25 +116,25 @@ void bli_ztrsm_l_template_noopt
 			gamma11 = c11 + (i  )*rs_c + (j  )*cs_c;
 
 			/* chi11 = chi11 - a10t * x01; */
-			bli_zset0s( rho11 );
+			bli_tset0s( z, rho11 );
 			for ( l = 0; l < n_behind; ++l )
 			{
 				alpha10 = a10t + (l  )*cs_a;
 				chi01   = x01  + (l  )*rs_b;
 
-				bli_zaxpys( *alpha10, *chi01, rho11 );
+				bli_taxpys( z,z,z,z, *alpha10, *chi01, rho11 );
 			}
-			bli_zsubs( rho11, *chi11 );
+			bli_tsubs( z,z,z, rho11, *chi11 );
 
 			/* chi11 = chi11 / alpha11; */
 			/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
 			   of alpha11, so we can multiply rather than divide. We store
 			   the inverse of alpha11 intentionally to avoid expensive
 			   division instructions within the micro-kernel. */
-			bli_zscals( *alpha11, *chi11 );
+			bli_tscals( z,z,z, *alpha11, *chi11 );
 
 			/* Output final result to matrix C. */
-			bli_zcopys( *chi11, *gamma11 );
+			bli_tcopys( z,z, *chi11, *gamma11 );
 		}
 	}
 }
diff --git a/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c b/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c
index 42982459a..9d133b037 100644
--- a/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c
+++ b/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c
@@ -116,25 +116,25 @@ void bli_ztrsm_u_template_noopt
 			gamma11 = c11 + (i  )*rs_c + (j  )*cs_c;
 
 			/* chi11 = chi11 - a12t * x21; */
-			bli_zset0s( rho11 );
+			bli_tset0s( z, rho11 );
 			for ( l = 0; l < n_behind; ++l )
 			{
 				alpha12 = a12t + (l  )*cs_a;
 				chi21   = x21  + (l  )*rs_b;
 
-				bli_zaxpys( *alpha12, *chi21, rho11 );
+				bli_taxpys( z,z,z,z, *alpha12, *chi21, rho11 );
 			}
-			bli_zsubs( rho11, *chi11 );
+			bli_tsubs( z,z,z, rho11, *chi11 );
 
 			/* chi11 = chi11 / alpha11; */
 			/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
 			   of alpha11, so we can multiply rather than divide. We store
 			   the inverse of alpha11 intentionally to avoid expensive
 			   division instructions within the micro-kernel. */
-			bli_zscals( *alpha11, *chi11 );
+			bli_tscals( z,z,z, *alpha11, *chi11 );
 
 			/* Output final result to matrix C. */
-			bli_zcopys( *chi11, *gamma11 );
+			bli_tcopys( z,z, *chi11, *gamma11 );
 		}
 	}
 }
diff --git a/frame/0/bli_l0_tapi.c b/frame/0/bli_l0_tapi.c
index ef2a942f3..d972eacde 100644
--- a/frame/0/bli_l0_tapi.c
+++ b/frame/0/bli_l0_tapi.c
@@ -52,8 +52,8 @@ void PASTEMAC(ch,opname) \
 \
 	ctype chi_conj; \
 \
-	PASTEMAC(ch,copycjs)( conjchi, *chi, chi_conj ); \
-	PASTEMAC(ch,kername)( chi_conj, *psi ); \
+	bli_tcopycjs( ch,ch, conjchi, *chi, chi_conj ); \
+	PASTEMAC(t,kername)( ch,ch,ch, chi_conj, *psi ); \
 }
 
 INSERT_GENTFUNC_BASIC( addsc, adds )
@@ -75,9 +75,9 @@ void PASTEMAC(ch,opname) \
 \
 	ctype chi_conj; \
 \
-	PASTEMAC(ch,copycjs)( conjchi, *chi, chi_conj ); \
-	PASTEMAC(ch,kername)( chi_conj ); \
-	PASTEMAC(ch,copys)( chi_conj, *psi ); \
+	bli_tcopycjs( ch,ch, conjchi, *chi, chi_conj ); \
+	PASTEMAC(t,kername)( ch,ch, chi_conj ); \
+	bli_tcopys( ch,ch, chi_conj, *psi ); \
 }
 
 INSERT_GENTFUNC_BASIC( invertsc, inverts )
@@ -95,17 +95,17 @@ void PASTEMAC(ch,opname) \
 { \
 	bli_init_once(); \
 \
-	if ( PASTEMAC(ch,eq0)( *chi ) ) \
+	if ( bli_teq0s( ch, *chi ) ) \
 	{ \
 		/* Overwrite potential Infs and NaNs. */ \
-		PASTEMAC(ch,set0s)( *psi ); \
+		bli_tset0s( ch, *psi ); \
 	} \
 	else \
 	{ \
 		ctype chi_conj; \
 \
-		PASTEMAC(ch,copycjs)( conjchi, *chi, chi_conj ); \
-		PASTEMAC(ch,kername)( chi_conj, *psi ); \
+		bli_tcopycjs( ch,ch, conjchi, *chi, chi_conj ); \
+		PASTEMAC(t,kername)( ch,ch,ch, chi_conj, *psi ); \
 	} \
 }
 
@@ -129,11 +129,11 @@ void PASTEMAC(ch,opname) \
 \
 	( void )absq_i; \
 \
-	PASTEMAC(ch,chr,gets)( *chi, chi_r, chi_i ); \
+	bli_tgets( ch,chr, *chi, chi_r, chi_i ); \
 \
 	/* absq   = chi_r * chi_r + chi_i * chi_i; \
 	   absq_r = 0.0; (thrown away) */ \
-	PASTEMAC(ch,absq2ris)( chi_r, chi_i, *absq, absq_i ); \
+	bli_tabsq2ris( ch,ch,ch, chi_r, chi_i, *absq, absq_i ); \
 \
 	( void )chi_i; \
 }
@@ -153,7 +153,7 @@ void PASTEMAC(ch,opname) \
 	bli_init_once(); \
 \
 	/* norm = sqrt( chi_r * chi_r + chi_i * chi_i ); */ \
-	PASTEMAC(ch,chr,abval2s)( *chi, *norm ); \
+	bli_tabval2s( ch,chr,chr, *chi, *norm ); \
 }
 
 INSERT_GENTFUNCR_BASIC( normfsc )
@@ -171,7 +171,7 @@ void PASTEMAC(ch,opname) \
 	bli_init_once(); \
 \
 	/* NOTE: sqrtsc/sqrt2s differs from normfsc/abval2s in the complex domain. */ \
-	PASTEMAC(ch,sqrt2s)( *chi, *psi ); \
+	bli_tsqrt2s( ch,ch,ch, *chi, *psi ); \
 }
 
 INSERT_GENTFUNC_BASIC( sqrtsc )
@@ -190,7 +190,7 @@ void PASTEMAC(ch,opname) \
 \
 	const ctype_r chi_r = PASTEMAC(ch,real)( *chi ); \
 \
-	PASTEMAC(chr,ch,sqrt2s)( chi_r, *psi ); \
+	bli_tsqrt2s( chr,ch,chr, chi_r, *psi ); \
 }
 
 INSERT_GENTFUNCR_BASIC( sqrtrsc )
@@ -208,7 +208,7 @@ void PASTEMAC(ch,opname) \
 { \
 	bli_init_once(); \
 \
-	PASTEMAC(ch,d,gets)( *chi, *zeta_r, *zeta_i ); \
+	bli_tgets( ch,d, *chi, *zeta_r, *zeta_i ); \
 }
 
 INSERT_GENTFUNC_BASIC( getsc )
@@ -226,7 +226,7 @@ void PASTEMAC(ch,opname) \
 { \
 	bli_init_once(); \
 \
-	PASTEMAC(d,ch,sets)( zeta_r, zeta_i, *chi ); \
+	bli_tsets( d,ch, zeta_r, zeta_i, *chi ); \
 }
 
 INSERT_GENTFUNC_BASIC( setsc )
@@ -244,7 +244,7 @@ void PASTEMAC(ch,opname) \
 { \
 	bli_init_once(); \
 \
-	PASTEMAC(ch,chr,gets)( *chi, *zeta_r, *zeta_i ); \
+	bli_tgets( ch,chr, *chi, *zeta_r, *zeta_i ); \
 }
 
 INSERT_GENTFUNCR_BASIC( unzipsc )
@@ -262,7 +262,7 @@ void PASTEMAC(ch,opname) \
 { \
 	bli_init_once(); \
 \
-	PASTEMAC(chr,ch,sets)( *zeta_r, *zeta_i, *chi ); \
+	bli_tsets( chr,ch, *zeta_r, *zeta_i, *chi ); \
 }
 
 INSERT_GENTFUNCR_BASIC( zipsc )
@@ -278,7 +278,7 @@ void bli_igetsc
 {
 	bli_init_once();
 
-	PASTEMAC(i,d,gets)( *chi, *zeta_r, *zeta_i );
+	bli_tgets( i,d, *chi, *zeta_r, *zeta_i );
 }
 
 void bli_isetsc
@@ -290,6 +290,6 @@ void bli_isetsc
 {
 	bli_init_once();
 
-	PASTEMAC(d,i,sets)( zeta_r, zeta_i, *chi );
+	bli_tsets( d,i, zeta_r, zeta_i, *chi );
 }
 
diff --git a/frame/0/copysc/bli_copysc.c b/frame/0/copysc/bli_copysc.c
index 11e111544..77d06f610 100644
--- a/frame/0/copysc/bli_copysc.c
+++ b/frame/0/copysc/bli_copysc.c
@@ -117,11 +117,11 @@ void PASTEMAC(chx,chy,varname) \
 \
 	if ( bli_is_conj( conjchi ) ) \
 	{ \
-		PASTEMAC(chx,chy,copyjs)( *chi_cast, *psi_cast ); \
+		bli_tcopyjs( chx,chy, *chi_cast, *psi_cast ); \
 	} \
 	else \
 	{ \
-		PASTEMAC(chx,chy,copys)( *chi_cast, *psi_cast ); \
+		bli_tcopys( chx,chy, *chi_cast, *psi_cast ); \
 	} \
 }
 
diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c
index 17e7fcd3b..47737872f 100644
--- a/frame/1d/bli_l1d_tapi.c
+++ b/frame/1d/bli_l1d_tapi.c
@@ -363,7 +363,7 @@ void PASTEMAC(ch,opname,EX_SUF) \
 	{ \
 		ctype* chi11 = x1 + (i  )*incx; \
 \
-		PASTEMAC(ch,setis)( *alpha, *chi11 ); \
+		bli_tsetis( ch,ch, *alpha, *chi11 ); \
 	} */ \
 \
 	/* Acquire the address of the imaginary component of the first element,
diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c
index d17df0eb7..775f45095 100644
--- a/frame/1m/bli_l1m_tapi.c
+++ b/frame/1m/bli_l1m_tapi.c
@@ -208,7 +208,7 @@ void PASTEMAC(ch,opname,EX_SUF) \
 	if ( bli_zero_dim2( m, n ) ) return; \
 \
 	/* If alpha is zero, then the entire operation is a no-op. */ \
-	if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \
+	if ( bli_teq0s( ch, *alpha ) ) return; \
 \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
@@ -288,7 +288,7 @@ void PASTEMAC(ch,opname,EX_SUF) \
 	/* If alpha is zero, then we set the output matrix to zero. This
 	   seemingly minor optimization is important because it will clear
 	   any NaNs and Infs in x that would otherwise propogate. */ \
-	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
+	if ( bli_teq0s( ch, *alpha ) ) \
 	{ \
 \
 		PASTEMAC(ch,setm,BLIS_TAPI_EX_SUF) \
@@ -429,7 +429,7 @@ void PASTEMAC(ch,opname,EX_SUF) \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* If beta is zero, then the operation reduces to copym. */ \
-	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	if ( bli_teq0s( ch, *beta ) ) \
 	{ \
 		PASTEMAC(ch,copym,_unb_var1) \
 		( \
@@ -520,7 +520,7 @@ void PASTEMAC(chx,chy,opname,EX_SUF) \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* If beta is zero, then the operation reduces to copym. */ \
-	if ( PASTEMAC(chy,eq0)( *beta ) ) \
+	if ( bli_teq0s( chy, *beta ) ) \
 	{ \
 		PASTEMAC(chx,chy,castm) \
 		( \
diff --git a/frame/1m/bli_l1m_unb_var1.c b/frame/1m/bli_l1m_unb_var1.c
index 749e372bb..fc790ed77 100644
--- a/frame/1m/bli_l1m_unb_var1.c
+++ b/frame/1m/bli_l1m_unb_var1.c
@@ -532,7 +532,7 @@ void PASTEMAC(chx,chy,opname) \
 	/*conjx = bli_extract_conj( transx );*/ \
 \
 	/* Handle dense and upper/lower storage cases separately. */ \
-	if ( PASTEMAC(chy,eq1)( *beta ) ) \
+	if ( bli_teq1s( chy, *beta ) ) \
 	{ \
 		if ( incx == 1 && incy == 1 ) \
 		{ \
@@ -545,7 +545,7 @@ void PASTEMAC(chx,chy,opname) \
 \
 				for ( dim_t i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC(chx,chy,adds)( x1[i], y1[i] ); \
+					bli_tadds( chx,chy,chy, x1[i], y1[i] ); \
 				} \
 			} \
 		} \
@@ -563,7 +563,7 @@ void PASTEMAC(chx,chy,opname) \
 \
 				for ( dim_t i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC(chx,chy,adds)( *chi1, *psi1 ); \
+					bli_tadds( chx,chy,chy, *chi1, *psi1 ); \
 \
 					chi1 += incx; \
 					psi1 += incy; \
@@ -571,7 +571,7 @@ void PASTEMAC(chx,chy,opname) \
 			} \
 		} \
 	} \
-	else /* ( !PASTEMAC(chy,eq1)( *beta ) ) */ \
+	else /* ( !bli_teq1s( chy, *beta ) ) */ \
 	{ \
 		if ( incx == 1 && incy == 1 ) \
 		{ \
@@ -584,7 +584,7 @@ void PASTEMAC(chx,chy,opname) \
 \
 				for ( dim_t i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC(chx,chy,chy,xpbys)( x1[i], *beta, y1[i] ); \
+					bli_txpbys( chx,chy,chy,chy, x1[i], *beta, y1[i] ); \
 				} \
 			} \
 		} \
@@ -602,7 +602,7 @@ void PASTEMAC(chx,chy,opname) \
 \
 				for ( dim_t i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC(chx,chy,chy,xpbys)( *chi1, *beta, *psi1 ); \
+					bli_txpbys( chx,chy,chy,chy, *chi1, *beta, *psi1 ); \
 \
 					chi1 += incx; \
 					psi1 += incy; \
diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c
index 74f9de8f8..0791de524 100644
--- a/frame/1m/packm/bli_packm_struc_cxk.c
+++ b/frame/1m/packm/bli_packm_struc_cxk.c
@@ -95,8 +95,8 @@ void PASTEMAC(chc,chp,varname) \
 	{ \
 		ctypep_r kappa_r, kappa_i; \
 		( void )kappa_r; \
-		PASTEMAC(chp,gets)( *( ctypep* )kappa, kappa_r, kappa_i ); \
-		if ( PASTEMAC(chp_r,eq0)( kappa_i ) ) \
+		bli_tgets( chp,chp, *( ctypep* )kappa, kappa_r, kappa_i ); \
+		if ( bli_teq0s( chp_r, kappa_i ) ) \
 		{ \
 			/* Treat the matrix as real with doubled strides. */ \
 			dt_c = bli_dt_proj_to_real( dt_c ); \
diff --git a/frame/2/bli_l2_tapi.c b/frame/2/bli_l2_tapi.c
index f6f2a035d..4b9117231 100644
--- a/frame/2/bli_l2_tapi.c
+++ b/frame/2/bli_l2_tapi.c
@@ -74,7 +74,7 @@ void PASTEMAC(ch,opname,EX_SUF) \
 \
 	/* If x has zero elements, or if alpha is zero, scale y by beta and
 	   return early. */ \
-	if ( bli_zero_dim1( n_x ) || PASTEMAC(ch,eq0)( *alpha ) ) \
+	if ( bli_zero_dim1( n_x ) || bli_teq0s( ch, *alpha ) ) \
 	{ \
 		PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 		( \
@@ -144,7 +144,7 @@ void PASTEMAC(ch,opname,EX_SUF) \
 	BLIS_TAPI_EX_DECLS \
 \
 	/* If x or y has zero elements, or if alpha is zero, return early. */ \
-	if ( bli_zero_dim2( m, n ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
+	if ( bli_zero_dim2( m, n ) || bli_teq0s( ch, *alpha ) ) return; \
 \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
@@ -201,7 +201,7 @@ void PASTEMAC(ch,opname,EX_SUF) \
 \
 	/* If x has zero elements, or if alpha is zero, scale y by beta and
 	   return early. */ \
-	if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) \
+	if ( bli_zero_dim1( m ) || bli_teq0s( ch, *alpha ) ) \
 	{ \
 		PASTEMAC(ch,scalv,BLIS_TAPI_EX_SUF) \
 		( \
@@ -273,12 +273,12 @@ void PASTEMAC(ch,opname,EX_SUF) \
 	ctype alpha_local; \
 \
 	/* If x has zero elements, or if alpha is zero, return early. */ \
-	if ( bli_zero_dim1( m ) || PASTEMAC(chr,eq0)( *alpha ) ) return; \
+	if ( bli_zero_dim1( m ) || bli_teq0s( chr, *alpha ) ) return; \
 \
 	/* Make a local copy of alpha, cast into the complex domain. This
 	   allows us to use the same underlying her variants to implement
 	   both her and syr operations. */ \
-	PASTEMAC(chr,ch,copys)( *alpha, alpha_local ); \
+	bli_tcopys( chr,ch, *alpha, alpha_local ); \
 \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
@@ -335,7 +335,7 @@ void PASTEMAC(ch,opname,EX_SUF) \
 	BLIS_TAPI_EX_DECLS \
 \
 	/* If x has zero elements, or if alpha is zero, return early. */ \
-	if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
+	if ( bli_zero_dim1( m ) || bli_teq0s( ch, *alpha ) ) return; \
 \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
@@ -394,7 +394,7 @@ void PASTEMAC(ch,opname,EX_SUF) \
 	BLIS_TAPI_EX_DECLS \
 \
 	/* If x has zero elements, or if alpha is zero, return early. */ \
-	if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
+	if ( bli_zero_dim1( m ) || bli_teq0s( ch, *alpha ) ) return; \
 \
 	/* Obtain a valid context from the gks if necessary. */ \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
@@ -461,7 +461,7 @@ void PASTEMAC(ch,opname,EX_SUF) \
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
 \
 	/* If alpha is zero, set x to zero and return early. */ \
-	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
+	if ( bli_teq0s( ch, *alpha ) ) \
 	{ \
 		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
 		( \
diff --git a/frame/2/gemv/bli_gemv_unb_var2.c b/frame/2/gemv/bli_gemv_unb_var2.c
index f40bb2dab..c512bff7f 100644
--- a/frame/2/gemv/bli_gemv_unb_var2.c
+++ b/frame/2/gemv/bli_gemv_unb_var2.c
@@ -70,7 +70,7 @@ void PASTEMAC(ch,varname) \
 	conja = bli_extract_conj( transa ); \
 \
 	/* If beta is zero, use setv. Otherwise, scale by beta. */ \
-	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	if ( bli_teq0s( ch, *beta ) ) \
 	{ \
 		/* y = 0; */ \
 		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
@@ -107,8 +107,8 @@ void PASTEMAC(ch,varname) \
 		y1   = y + (0  )*incy; \
 \
 		/* y = y + alpha * chi1 * a1; */ \
-		PASTEMAC(ch,copycjs)( conjx, *chi1, alpha_chi1 ); \
-		PASTEMAC(ch,scals)( *alpha, alpha_chi1 ); \
+		bli_tcopycjs( ch,ch, conjx, *chi1, alpha_chi1 ); \
+		bli_tscals( ch,ch,ch, *alpha, alpha_chi1 ); \
 \
 		kfp_av \
 		( \
diff --git a/frame/2/gemv/bli_gemv_unf_var2.c b/frame/2/gemv/bli_gemv_unf_var2.c
index a89017116..f1be39ca5 100644
--- a/frame/2/gemv/bli_gemv_unf_var2.c
+++ b/frame/2/gemv/bli_gemv_unf_var2.c
@@ -70,7 +70,7 @@ void PASTEMAC(ch,varname) \
 	conja = bli_extract_conj( transa ); \
 \
 	/* If beta is zero, use setv. Otherwise, scale by beta. */ \
-	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	if ( bli_teq0s( ch, *beta ) ) \
 	{ \
 		/* y = 0; */ \
 		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
diff --git a/frame/2/ger/bli_ger_unb_var1.c b/frame/2/ger/bli_ger_unb_var1.c
index 24e96822b..0b0761130 100644
--- a/frame/2/ger/bli_ger_unb_var1.c
+++ b/frame/2/ger/bli_ger_unb_var1.c
@@ -68,8 +68,8 @@ void PASTEMAC(ch,varname) \
 		y1   = y + (0  )*incy; \
 \
 		/* a1t = a1t + alpha * chi1 * y; */ \
-		PASTEMAC(ch,copycjs)( conjx, *chi1, alpha_chi1 ); \
-		PASTEMAC(ch,scals)( *alpha, alpha_chi1 ); \
+		bli_tcopycjs( ch,ch, conjx, *chi1, alpha_chi1 ); \
+		bli_tscals( ch,ch,ch, *alpha, alpha_chi1 ); \
 \
 		kfp_av \
 		( \
diff --git a/frame/2/ger/bli_ger_unb_var2.c b/frame/2/ger/bli_ger_unb_var2.c
index fb38e683d..ee40223ca 100644
--- a/frame/2/ger/bli_ger_unb_var2.c
+++ b/frame/2/ger/bli_ger_unb_var2.c
@@ -68,8 +68,8 @@ void PASTEMAC(ch,varname) \
 		psi1 = y + (j  )*incy; \
 \
 		/* a1 = a1 + alpha * psi1 * x; */ \
-		PASTEMAC(ch,copycjs)( conjy, *psi1, alpha_psi1 ); \
-		PASTEMAC(ch,scals)( *alpha, alpha_psi1 ); \
+		bli_tcopycjs( ch,ch, conjy, *psi1, alpha_psi1 ); \
+		bli_tscals( ch,ch,ch, *alpha, alpha_psi1 ); \
 \
 		kfp_av \
 		( \
diff --git a/frame/2/hemv/bli_hemv_unb_var1.c b/frame/2/hemv/bli_hemv_unb_var1.c
index eeffc4292..204c90f0d 100644
--- a/frame/2/hemv/bli_hemv_unb_var1.c
+++ b/frame/2/hemv/bli_hemv_unb_var1.c
@@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	/* If beta is zero, use setv. Otherwise, scale by beta. */ \
-	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	if ( bli_teq0s( ch, *beta ) ) \
 	{ \
 		/* y = 0; */ \
 		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
@@ -134,8 +134,8 @@ void PASTEMAC(ch,varname) \
 		psi1     = y + (i  )*incy; \
 \
 		/* Apply conjx to chi1 and and scale by alpha. */ \
-		PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \
-		PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \
+		bli_tcopycjs( ch,ch, conjx, *chi1, conjx_chi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, *alpha, conjx_chi1, alpha_chi1 ); \
 \
 		/* y0 = y0 + alpha * a10t' * chi1; */ \
 		kfp_av \
@@ -164,12 +164,12 @@ void PASTEMAC(ch,varname) \
 \
 		/* For hemv, explicitly set the imaginary component of alpha11 to
 		   zero. */ \
-		PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
+		bli_tcopycjs( ch,ch, conja, *alpha11, alpha11_temp ); \
 		if ( bli_is_conj( conjh ) ) \
-			PASTEMAC(ch,seti0s)( alpha11_temp ); \
+			bli_tseti0s( ch, alpha11_temp ); \
 \
 		/* psi1 = psi1 + alpha * alpha11 * chi1; */ \
-		PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \
+		bli_taxpys( ch,ch,ch,ch, alpha_chi1, alpha11_temp, *psi1 ); \
 \
 	} \
 }
diff --git a/frame/2/hemv/bli_hemv_unb_var2.c b/frame/2/hemv/bli_hemv_unb_var2.c
index 07de60dcc..c11563c4a 100644
--- a/frame/2/hemv/bli_hemv_unb_var2.c
+++ b/frame/2/hemv/bli_hemv_unb_var2.c
@@ -93,7 +93,7 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	/* If beta is zero, use setv. Otherwise, scale by beta. */ \
-	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	if ( bli_teq0s( ch, *beta ) ) \
 	{ \
 		/* y = 0; */ \
 		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
@@ -136,8 +136,8 @@ void PASTEMAC(ch,varname) \
 		psi1     = y + (i  )*incy; \
 \
         /* Apply conjx to chi1 and and scale by alpha. */ \
-        PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \
-        PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \
+        bli_tcopycjs( ch,ch, conjx, *chi1, conjx_chi1 ); \
+        bli_tscal2s( ch,ch,ch,ch, *alpha, conjx_chi1, alpha_chi1 ); \
 \
 		/* psi1 = psi1 + alpha * a10t * x0; */ \
 		kfp_dv \
@@ -169,12 +169,12 @@ void PASTEMAC(ch,varname) \
 \
 		/* For hemv, explicitly set the imaginary component of alpha11 to
 		   zero. */ \
-		PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
+		bli_tcopycjs( ch,ch, conja, *alpha11, alpha11_temp ); \
 		if ( bli_is_conj( conjh ) ) \
-			PASTEMAC(ch,seti0s)( alpha11_temp ); \
+			bli_tseti0s( ch, alpha11_temp ); \
 \
 		/* psi1 = psi1 + alpha * alpha11 * chi1; */ \
-		PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \
+		bli_taxpys( ch,ch,ch,ch, alpha_chi1, alpha11_temp, *psi1 ); \
 	} \
 }
 
diff --git a/frame/2/hemv/bli_hemv_unb_var3.c b/frame/2/hemv/bli_hemv_unb_var3.c
index 1edd78f82..6654acd5e 100644
--- a/frame/2/hemv/bli_hemv_unb_var3.c
+++ b/frame/2/hemv/bli_hemv_unb_var3.c
@@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	/* If beta is zero, use setv. Otherwise, scale by beta. */ \
-	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	if ( bli_teq0s( ch, *beta ) ) \
 	{ \
 		/* y = 0; */ \
 		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
@@ -134,17 +134,17 @@ void PASTEMAC(ch,varname) \
 		y2       = y + (i+1)*incy; \
 \
 		/* Apply conjx to chi1 and and scale by alpha. */ \
-		PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \
-		PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \
+		bli_tcopycjs( ch,ch, conjx, *chi1, conjx_chi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, *alpha, conjx_chi1, alpha_chi1 ); \
 \
 		/* For hemv, explicitly set the imaginary component of alpha11 to
 		   zero. */ \
-		PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
+		bli_tcopycjs( ch,ch, conja, *alpha11, alpha11_temp ); \
 		if ( bli_is_conj( conjh ) ) \
-			PASTEMAC(ch,seti0s)( alpha11_temp ); \
+			bli_tseti0s( ch, alpha11_temp ); \
 \
 		/* psi1 = psi1 + alpha * alpha11 * chi1; */ \
-		PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \
+		bli_taxpys( ch,ch,ch,ch, alpha_chi1, alpha11_temp, *psi1 ); \
 \
 		/* psi1 = psi1 + alpha * a21' * x2; */ \
 		kfp_dv \
diff --git a/frame/2/hemv/bli_hemv_unb_var4.c b/frame/2/hemv/bli_hemv_unb_var4.c
index 704299ab1..ea2bee6d3 100644
--- a/frame/2/hemv/bli_hemv_unb_var4.c
+++ b/frame/2/hemv/bli_hemv_unb_var4.c
@@ -92,7 +92,7 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	/* If beta is zero, use setv. Otherwise, scale by beta. */ \
-	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	if ( bli_teq0s( ch, *beta ) ) \
 	{ \
 		/* y = 0; */ \
 		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
@@ -135,8 +135,8 @@ void PASTEMAC(ch,varname) \
 		y2       = y + (i+1)*incy; \
 \
 		/* Apply conjx to chi1 and and scale by alpha. */ \
-		PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \
-		PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \
+		bli_tcopycjs( ch,ch, conjx, *chi1, conjx_chi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, *alpha, conjx_chi1, alpha_chi1 ); \
 \
 		/* y0 = y0 + alpha * a10t' * chi1; */ \
 		kfp_av \
@@ -151,12 +151,12 @@ void PASTEMAC(ch,varname) \
 \
 		/* For hemv, explicitly set the imaginary component of alpha11 to
 		   zero. */ \
-		PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
+		bli_tcopycjs( ch,ch, conja, *alpha11, alpha11_temp ); \
 		if ( bli_is_conj( conjh ) ) \
-			PASTEMAC(ch,seti0s)( alpha11_temp ); \
+			bli_tseti0s( ch, alpha11_temp ); \
 \
 		/* psi1 = psi1 + alpha * alpha11 * chi1; */ \
-		PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \
+		bli_taxpys( ch,ch,ch,ch, alpha_chi1, alpha11_temp, *psi1 ); \
 \
 		/* y2 = y2 + alpha * a21 * chi1; */ \
 		kfp_av \
diff --git a/frame/2/hemv/bli_hemv_unf_var1.c b/frame/2/hemv/bli_hemv_unf_var1.c
index bb96d9ae5..45e7ef7b4 100644
--- a/frame/2/hemv/bli_hemv_unf_var1.c
+++ b/frame/2/hemv/bli_hemv_unf_var1.c
@@ -100,7 +100,7 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	/* If beta is zero, use setv. Otherwise, scale by beta. */ \
-	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	if ( bli_teq0s( ch, *beta ) ) \
 	{ \
 		/* y = 0; */ \
 		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
@@ -176,38 +176,38 @@ void PASTEMAC(ch,varname) \
 			y21      = y1  + (k+1)*incy; \
 \
 			/* y01 = y01 + alpha * a10t' * chi11; */ \
-			PASTEMAC(ch,copycjs)( conjx, *chi11, conjx_chi11 ); \
-			PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \
+			bli_tcopycjs( ch,ch, conjx, *chi11, conjx_chi11 ); \
+			bli_tscal2s( ch,ch,ch,ch, *alpha, conjx_chi11, alpha_chi11 ); \
 			if ( bli_is_conj( conj1 ) ) \
 			{ \
 				for ( j = 0; j < f_behind; ++j ) \
-					PASTEMAC(ch,axpyjs)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
+					bli_taxpyjs( ch,ch,ch,ch, alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
 			} \
 			else \
 			{ \
 				for ( j = 0; j < f_behind; ++j ) \
-					PASTEMAC(ch,axpys)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
+					bli_taxpys( ch,ch,ch,ch, alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
 			} \
 \
 			/* For hemv, explicitly set the imaginary component of alpha11 to
 			   zero. */ \
-			PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
+			bli_tcopycjs( ch,ch, conja, *alpha11, alpha11_temp ); \
 			if ( bli_is_conj( conjh ) ) \
-				PASTEMAC(ch,seti0s)( alpha11_temp ); \
+				bli_tseti0s( ch, alpha11_temp ); \
 \
 			/* psi11 = psi11 + alpha * alpha11 * chi11; */ \
-			PASTEMAC(ch,axpys)( alpha_chi11, alpha11_temp, *psi11 ); \
+			bli_taxpys( ch,ch,ch,ch, alpha_chi11, alpha11_temp, *psi11 ); \
 \
 			/* y21 = y21 + alpha * a21 * chi11; */ \
 			if ( bli_is_conj( conj0 ) ) \
 			{ \
 				for ( j = 0; j < f_ahead; ++j ) \
-					PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
+					bli_taxpyjs( ch,ch,ch,ch, alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
 			} \
 			else \
 			{ \
 				for ( j = 0; j < f_ahead; ++j ) \
-					PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
+					bli_taxpys( ch,ch,ch,ch, alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
 			} \
 		} \
 	} \
diff --git a/frame/2/hemv/bli_hemv_unf_var1a.c b/frame/2/hemv/bli_hemv_unf_var1a.c
index f20a6de84..f4ee1988d 100644
--- a/frame/2/hemv/bli_hemv_unf_var1a.c
+++ b/frame/2/hemv/bli_hemv_unf_var1a.c
@@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	/* If beta is zero, use setv. Otherwise, scale by beta. */ \
-	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	if ( bli_teq0s( ch, *beta ) ) \
 	{ \
 		/* y = 0; */ \
 		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
@@ -132,8 +132,8 @@ void PASTEMAC(ch,varname) \
 		psi1     = y + (i  )*incy; \
 \
 		/* Apply conjx to chi1 and and scale by alpha. */ \
-		PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \
-		PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \
+		bli_tcopycjs( ch,ch, conjx, *chi1, conjx_chi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, *alpha, conjx_chi1, alpha_chi1 ); \
 \
 		/* psi1 = psi1 + alpha * a10t * x0;     (dotv) */ \
 		/* y0   = y0   + alpha * a10t' * chi1;  (axpyv) */ \
@@ -150,16 +150,16 @@ void PASTEMAC(ch,varname) \
 		  y0,   incy, \
 		  cntx  \
 		); \
-		PASTEMAC(ch,axpys)( *alpha, rho, *psi1 ); \
+		bli_taxpys( ch,ch,ch,ch, *alpha, rho, *psi1 ); \
 \
 		/* For hemv, explicitly set the imaginary component of alpha11 to
 		   zero. */ \
-		PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
+		bli_tcopycjs( ch,ch, conja, *alpha11, alpha11_temp ); \
 		if ( bli_is_conj( conjh ) ) \
-			PASTEMAC(ch,seti0s)( alpha11_temp ); \
+			bli_tseti0s( ch, alpha11_temp ); \
 \
 		/* psi1 = psi1 + alpha * alpha11 * chi1; */ \
-		PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \
+		bli_taxpys( ch,ch,ch,ch, alpha_chi1, alpha11_temp, *psi1 ); \
 \
 	} \
 }
diff --git a/frame/2/hemv/bli_hemv_unf_var3.c b/frame/2/hemv/bli_hemv_unf_var3.c
index ef25a3562..f0d910429 100644
--- a/frame/2/hemv/bli_hemv_unf_var3.c
+++ b/frame/2/hemv/bli_hemv_unf_var3.c
@@ -100,7 +100,7 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	/* If beta is zero, use setv. Otherwise, scale by beta. */ \
-	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	if ( bli_teq0s( ch, *beta ) ) \
 	{ \
 		/* y = 0; */ \
 		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
@@ -156,38 +156,38 @@ void PASTEMAC(ch,varname) \
 			y21      = y1  + (k+1)*incy; \
 \
 			/* y01 = y01 + alpha * a10t' * chi11; */ \
-			PASTEMAC(ch,copycjs)( conjx, *chi11, conjx_chi11 ); \
-			PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \
+			bli_tcopycjs( ch,ch, conjx, *chi11, conjx_chi11 ); \
+			bli_tscal2s( ch,ch,ch,ch, *alpha, conjx_chi11, alpha_chi11 ); \
 			if ( bli_is_conj( conj0 ) ) \
 			{ \
 				for ( j = 0; j < f_behind; ++j ) \
-					PASTEMAC(ch,axpyjs)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
+					bli_taxpyjs( ch,ch,ch,ch, alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
 			} \
 			else \
 			{ \
 				for ( j = 0; j < f_behind; ++j ) \
-					PASTEMAC(ch,axpys)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
+					bli_taxpys( ch,ch,ch,ch, alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
 			} \
 \
 			/* For hemv, explicitly set the imaginary component of alpha11 to
 			   zero. */ \
-			PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
+			bli_tcopycjs( ch,ch, conja, *alpha11, alpha11_temp ); \
 			if ( bli_is_conj( conjh ) ) \
-				PASTEMAC(ch,seti0s)( alpha11_temp ); \
+				bli_tseti0s( ch, alpha11_temp ); \
 \
 			/* psi11 = psi11 + alpha * alpha11 * chi11; */ \
-			PASTEMAC(ch,axpys)( alpha_chi11, alpha11_temp, *psi11 ); \
+			bli_taxpys( ch,ch,ch,ch, alpha_chi11, alpha11_temp, *psi11 ); \
 \
 			/* y21 = y21 + alpha * a21 * chi11; */ \
 			if ( bli_is_conj( conj1 ) ) \
 			{ \
 				for ( j = 0; j < f_ahead; ++j ) \
-					PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
+					bli_taxpyjs( ch,ch,ch,ch, alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
 			} \
 			else \
 			{ \
 				for ( j = 0; j < f_ahead; ++j ) \
-					PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
+					bli_taxpys( ch,ch,ch,ch, alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
 			} \
 		} \
 \
diff --git a/frame/2/hemv/bli_hemv_unf_var3a.c b/frame/2/hemv/bli_hemv_unf_var3a.c
index 3501a9ac7..8e1ffc3e2 100644
--- a/frame/2/hemv/bli_hemv_unf_var3a.c
+++ b/frame/2/hemv/bli_hemv_unf_var3a.c
@@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	/* If beta is zero, use setv. Otherwise, scale by beta. */ \
-	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	if ( bli_teq0s( ch, *beta ) ) \
 	{ \
 		/* y = 0; */ \
 		PASTEMAC(ch,setv,BLIS_TAPI_EX_SUF) \
@@ -133,16 +133,16 @@ void PASTEMAC(ch,varname) \
 \
 		/* For hemv, explicitly set the imaginary component of alpha11 to
 		   zero. */ \
-		PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
+		bli_tcopycjs( ch,ch, conja, *alpha11, alpha11_temp ); \
 		if ( bli_is_conj( conjh ) ) \
-			PASTEMAC(ch,seti0s)( alpha11_temp ); \
+			bli_tseti0s( ch, alpha11_temp ); \
 \
 		/* Apply conjx to chi1 and and scale by alpha. */ \
-		PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \
-		PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \
+		bli_tcopycjs( ch,ch, conjx, *chi1, conjx_chi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, *alpha, conjx_chi1, alpha_chi1 ); \
 \
 		/* psi1 = psi1 + alpha * alpha11 * chi1; */ \
-		PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \
+		bli_taxpys( ch,ch,ch,ch, alpha_chi1, alpha11_temp, *psi1 ); \
 \
 		/* psi1 = psi1 + alpha * a21' * x2;   (dotv) */ \
 		/* y2   = y2   + alpha * a21 * chi1;  (axpyv) */ \
@@ -159,7 +159,7 @@ void PASTEMAC(ch,varname) \
 		  y2,  incy, \
 		  cntx  \
 		); \
-		PASTEMAC(ch,axpys)( *alpha, rho, *psi1 ); \
+		bli_taxpys( ch,ch,ch,ch, *alpha, rho, *psi1 ); \
 	} \
 }
 
diff --git a/frame/2/her/bli_her_unb_var1.c b/frame/2/her/bli_her_unb_var1.c
index 7f7215c5d..8491b75fc 100644
--- a/frame/2/her/bli_her_unb_var1.c
+++ b/frame/2/her/bli_her_unb_var1.c
@@ -70,10 +70,10 @@ void PASTEMAC(ch,varname) \
 \
 	/* Make a local copy of alpha and zero out the imaginary component if
 	   we are being invoked as her, since her requires alpha to be real. */ \
-	PASTEMAC(ch,copys)( *alpha, alpha_local ); \
+	bli_tcopys( ch,ch, *alpha, alpha_local ); \
 	if ( bli_is_conj( conjh ) ) \
 	{ \
-		PASTEMAC(ch,seti0s)( alpha_local ); \
+		bli_tseti0s( ch, alpha_local ); \
 	} \
 \
 	/* The algorithm will be expressed in terms of the lower triangular case;
@@ -112,15 +112,15 @@ void PASTEMAC(ch,varname) \
 		gamma11  = c + (i  )*rs_ct + (i  )*cs_ct; \
 \
 		/* Apply conjx to chi1. */ \
-		PASTEMAC(ch,copycjs)( conj0, *chi1, conjx0_chi1 ); \
-		PASTEMAC(ch,copycjs)( conj1, *chi1, conjx1_chi1 ); \
+		bli_tcopycjs( ch,ch, conj0, *chi1, conjx0_chi1 ); \
+		bli_tcopycjs( ch,ch, conj1, *chi1, conjx1_chi1 ); \
 \
 		/* Compute scalar for vector subproblem. */ \
-		PASTEMAC(ch,scal2s)( alpha_local, conjx0_chi1, alpha_chi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha_local, conjx0_chi1, alpha_chi1 ); \
 \
 		/* Compute alpha * chi1 * conj(chi1) after chi1 has already been
 		   conjugated, if needed, by conjx. */ \
-		PASTEMAC(ch,scal2s)( alpha_chi1, conjx1_chi1, alpha_chi1_chi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha_chi1, conjx1_chi1, alpha_chi1_chi1 ); \
 \
 		/* c10t = c10t + alpha * chi1 * x0'; */ \
 		kfp_av \
@@ -134,12 +134,12 @@ void PASTEMAC(ch,varname) \
 		); \
 \
 		/* gamma11 = gamma11 + alpha * chi1 * conj(chi1); */ \
-		PASTEMAC(ch,adds)( alpha_chi1_chi1, *gamma11 ); \
+		bli_tadds( ch,ch,ch, alpha_chi1_chi1, *gamma11 ); \
 \
 		/* For her2, explicitly set the imaginary component of gamma11 to
 		   zero. */ \
 		if ( bli_is_conj( conjh ) ) \
-			PASTEMAC(ch,seti0s)( *gamma11 ); \
+			bli_tseti0s( ch, *gamma11 ); \
 	} \
 }
 
diff --git a/frame/2/her/bli_her_unb_var2.c b/frame/2/her/bli_her_unb_var2.c
index 1f071ca42..2693927d6 100644
--- a/frame/2/her/bli_her_unb_var2.c
+++ b/frame/2/her/bli_her_unb_var2.c
@@ -70,10 +70,10 @@ void PASTEMAC(ch,varname) \
 \
 	/* Make a local copy of alpha and zero out the imaginary component if
 	   we are being invoked as her, since her requires alpha to be real. */ \
-	PASTEMAC(ch,copys)( *alpha, alpha_local ); \
+	bli_tcopys( ch,ch, *alpha, alpha_local ); \
 	if ( bli_is_conj( conjh ) ) \
 	{ \
-		PASTEMAC(ch,seti0s)( alpha_local ); \
+		bli_tseti0s( ch, alpha_local ); \
 	} \
 \
 	/* The algorithm will be expressed in terms of the lower triangular case;
@@ -112,15 +112,15 @@ void PASTEMAC(ch,varname) \
 		c21      = c + (i+1)*rs_ct + (i  )*cs_ct; \
 \
 		/* Apply conjx to chi1. */ \
-		PASTEMAC(ch,copycjs)( conj0, *chi1, conjx0_chi1 ); \
-		PASTEMAC(ch,copycjs)( conj1, *chi1, conjx1_chi1 ); \
+		bli_tcopycjs( ch,ch, conj0, *chi1, conjx0_chi1 ); \
+		bli_tcopycjs( ch,ch, conj1, *chi1, conjx1_chi1 ); \
 \
 		/* Compute scalar for vector subproblem. */ \
-		PASTEMAC(ch,scal2s)( alpha_local, conjx0_chi1, alpha_chi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha_local, conjx0_chi1, alpha_chi1 ); \
 \
 		/* Compute alpha * chi1 * conj(chi1) after chi1 has already been
 		   conjugated, if needed, by conjx. */ \
-		PASTEMAC(ch,scal2s)( alpha_chi1, conjx1_chi1, alpha_chi1_chi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha_chi1, conjx1_chi1, alpha_chi1_chi1 ); \
 \
 		/* c21 = c21 + alpha * x2 * conj(chi1); */ \
 		kfp_av \
@@ -134,12 +134,12 @@ void PASTEMAC(ch,varname) \
 		); \
 \
 		/* gamma11 = gamma11 + alpha * chi1 * conj(chi1); */ \
-		PASTEMAC(ch,adds)( alpha_chi1_chi1, *gamma11 ); \
+		bli_tadds( ch,ch,ch, alpha_chi1_chi1, *gamma11 ); \
 \
 		/* For her, explicitly set the imaginary component of gamma11 to
 		   zero. */ \
 		if ( bli_is_conj( conjh ) ) \
-			PASTEMAC(ch,seti0s)( *gamma11 ); \
+			bli_tseti0s( ch, *gamma11 ); \
 	} \
 }
 
diff --git a/frame/2/her2/bli_her2_unb_var1.c b/frame/2/her2/bli_her2_unb_var1.c
index e0bfd7773..6a761c40a 100644
--- a/frame/2/her2/bli_her2_unb_var1.c
+++ b/frame/2/her2/bli_her2_unb_var1.c
@@ -80,8 +80,8 @@ void PASTEMAC(ch,varname) \
 		rs_ct = rs_c; \
 		cs_ct = cs_c; \
 \
-		PASTEMAC(ch,copys)( *alpha, alpha0 ); \
-		PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
+		bli_tcopys( ch,ch, *alpha, alpha0 ); \
+		bli_tcopycjs( ch,ch, conjh, *alpha, alpha1 ); \
 	} \
 	else /* if ( bli_is_upper( uplo ) ) */ \
 	{ \
@@ -93,8 +93,8 @@ void PASTEMAC(ch,varname) \
 		conjx = bli_apply_conj( conjh, conjx ); \
 		conjy = bli_apply_conj( conjh, conjy ); \
 \
-		PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
-		PASTEMAC(ch,copys)( *alpha, alpha1 ); \
+		bli_tcopycjs( ch,ch, conjh, *alpha, alpha0 ); \
+		bli_tcopys( ch,ch, *alpha, alpha1 ); \
 	} \
 \
 	/* Apply conjh (which carries the conjugation component of the Hermitian
@@ -117,17 +117,17 @@ void PASTEMAC(ch,varname) \
 		gamma11  = c + (i  )*rs_ct + (i  )*cs_ct; \
 \
 		/* Apply conjx and/or conjy to chi1 and/or psi1. */ \
-		PASTEMAC(ch,copycjs)( conjx,        *chi1, conjx0_chi1 ); \
-		PASTEMAC(ch,copycjs)( conjy,        *psi1, conjy1_psi1 ); \
-		PASTEMAC(ch,copycjs)( conj0,        *psi1, conjy0_psi1 ); \
+		bli_tcopycjs( ch,ch, conjx,        *chi1, conjx0_chi1 ); \
+		bli_tcopycjs( ch,ch, conjy,        *psi1, conjy1_psi1 ); \
+		bli_tcopycjs( ch,ch, conj0,        *psi1, conjy0_psi1 ); \
 \
 		/* Compute scalars for vector subproblems. */ \
-		PASTEMAC(ch,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); \
-		PASTEMAC(ch,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha0, conjx0_chi1, alpha0_chi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha1, conjy1_psi1, alpha1_psi1 ); \
 \
 		/* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have
 		   already been conjugated, if needed, by conjx and conjy. */ \
-		PASTEMAC(ch,scal2s)( alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \
 \
 		/* c10t = c10t + alpha * chi1 * y0'; */ \
 		kfp_av \
@@ -153,13 +153,13 @@ void PASTEMAC(ch,varname) \
 \
 		/* gamma11 = gamma11 +      alpha  * chi1 * conj(psi1) \
 		                     + conj(alpha) * psi1 * conj(chi1); */ \
-		PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
-		PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
+		bli_tadds( ch,ch,ch, alpha0_chi1_psi1, *gamma11 ); \
+		bli_tadds( ch,ch,ch, alpha0_chi1_psi1, *gamma11 ); \
 \
 		/* For her2, explicitly set the imaginary component of gamma11 to
            zero. */ \
 		if ( bli_is_conj( conjh ) ) \
-			PASTEMAC(ch,seti0s)( *gamma11 ); \
+			bli_tseti0s( ch, *gamma11 ); \
 	} \
 }
 
diff --git a/frame/2/her2/bli_her2_unb_var2.c b/frame/2/her2/bli_her2_unb_var2.c
index 0ab92fb38..9715d5c02 100644
--- a/frame/2/her2/bli_her2_unb_var2.c
+++ b/frame/2/her2/bli_her2_unb_var2.c
@@ -86,8 +86,8 @@ void PASTEMAC(ch,varname) \
 		rs_ct = rs_c; \
 		cs_ct = cs_c; \
 \
-		PASTEMAC(ch,copys)( *alpha, alpha0 ); \
-		PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
+		bli_tcopys( ch,ch, *alpha, alpha0 ); \
+		bli_tcopycjs( ch,ch, conjh, *alpha, alpha1 ); \
 	} \
 	else /* if ( bli_is_upper( uplo ) ) */ \
 	{ \
@@ -99,8 +99,8 @@ void PASTEMAC(ch,varname) \
 		conjx = bli_apply_conj( conjh, conjx ); \
 		conjy = bli_apply_conj( conjh, conjy ); \
 \
-		PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
-		PASTEMAC(ch,copys)( *alpha, alpha1 ); \
+		bli_tcopycjs( ch,ch, conjh, *alpha, alpha0 ); \
+		bli_tcopys( ch,ch, *alpha, alpha1 ); \
 	} \
 \
 	/* Apply conjh (which carries the conjugation component of the Hermitian
@@ -126,17 +126,17 @@ void PASTEMAC(ch,varname) \
 		c21      = c + (i+1)*rs_ct + (i  )*cs_ct; \
 \
 		/* Apply conjx and/or conjy to chi1 and/or psi1. */ \
-		PASTEMAC(ch,copycjs)( conjh_conjy, *psi1, conjy0_psi1 ); \
-		PASTEMAC(ch,copycjs)( conjy,       *psi1, conjy1_psi1 ); \
-		PASTEMAC(ch,copycjs)( conj0,       *chi1, conjx0_chi1 ); \
+		bli_tcopycjs( ch,ch, conjh_conjy, *psi1, conjy0_psi1 ); \
+		bli_tcopycjs( ch,ch, conjy,       *psi1, conjy1_psi1 ); \
+		bli_tcopycjs( ch,ch, conj0,       *chi1, conjx0_chi1 ); \
 \
 		/* Compute scalars for vector subproblems. */ \
-		PASTEMAC(ch,scal2s)( alpha0, conjy0_psi1, alpha0_psi1 ); \
-		PASTEMAC(ch,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha0, conjy0_psi1, alpha0_psi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha1, conjy1_psi1, alpha1_psi1 ); \
 \
 		/* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have
 		   already been conjugated, if needed, by conjx and conjy. */ \
-		PASTEMAC(ch,scal2s)( alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \
 \
 		/* c21 = c21 + alpha * x2 * conj(psi1); */ \
 		kfp_av \
@@ -162,13 +162,13 @@ void PASTEMAC(ch,varname) \
 \
 		/* gamma11 = gamma11 +      alpha  * chi1 * conj(psi1) \
 		                     + conj(alpha) * psi1 * conj(chi1); */ \
-		PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
-		PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
+		bli_tadds( ch,ch,ch, alpha0_chi1_psi1, *gamma11 ); \
+		bli_tadds( ch,ch,ch, alpha0_chi1_psi1, *gamma11 ); \
 \
 		/* For her2, explicitly set the imaginary component of gamma11 to
            zero. */ \
 		if ( bli_is_conj( conjh ) ) \
-			PASTEMAC(ch,seti0s)( *gamma11 ); \
+			bli_tseti0s( ch, *gamma11 ); \
 	} \
 }
 
diff --git a/frame/2/her2/bli_her2_unb_var3.c b/frame/2/her2/bli_her2_unb_var3.c
index dc2630c46..745a387f5 100644
--- a/frame/2/her2/bli_her2_unb_var3.c
+++ b/frame/2/her2/bli_her2_unb_var3.c
@@ -86,8 +86,8 @@ void PASTEMAC(ch,varname) \
 		rs_ct = rs_c; \
 		cs_ct = cs_c; \
 \
-		PASTEMAC(ch,copys)( *alpha, alpha0 ); \
-		PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
+		bli_tcopys( ch,ch, *alpha, alpha0 ); \
+		bli_tcopycjs( ch,ch, conjh, *alpha, alpha1 ); \
 	} \
 	else /* if ( bli_is_upper( uplo ) ) */ \
 	{ \
@@ -99,8 +99,8 @@ void PASTEMAC(ch,varname) \
 		conjx = bli_apply_conj( conjh, conjx ); \
 		conjy = bli_apply_conj( conjh, conjy ); \
 \
-		PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
-		PASTEMAC(ch,copys)( *alpha, alpha1 ); \
+		bli_tcopycjs( ch,ch, conjh, *alpha, alpha0 ); \
+		bli_tcopys( ch,ch, *alpha, alpha1 ); \
 	} \
 \
 	/* Apply conjh (which carries the conjugation component of the Hermitian
@@ -126,17 +126,17 @@ void PASTEMAC(ch,varname) \
 		c21      = c + (i+1)*rs_ct + (i  )*cs_ct; \
 \
 		/* Apply conjx and/or conjy to chi1 and/or psi1. */ \
-		PASTEMAC(ch,copycjs)( conjx,       *chi1, conjx0_chi1 ); \
-		PASTEMAC(ch,copycjs)( conjh_conjx, *chi1, conjx1_chi1 ); \
-		PASTEMAC(ch,copycjs)( conj0,       *psi1, conjy0_psi1 ); \
+		bli_tcopycjs( ch,ch, conjx,       *chi1, conjx0_chi1 ); \
+		bli_tcopycjs( ch,ch, conjh_conjx, *chi1, conjx1_chi1 ); \
+		bli_tcopycjs( ch,ch, conj0,       *psi1, conjy0_psi1 ); \
 \
 		/* Compute scalars for vector subproblems. */ \
-		PASTEMAC(ch,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); \
-		PASTEMAC(ch,scal2s)( alpha1, conjx1_chi1, alpha1_chi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha0, conjx0_chi1, alpha0_chi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha1, conjx1_chi1, alpha1_chi1 ); \
 \
 		/* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have
 		   already been conjugated, if needed, by conjx and conjy. */ \
-		PASTEMAC(ch,scal2s)( alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \
 \
 		/* c10t = c10t + alpha * chi1 * y0'; */ \
 		kfp_av \
@@ -162,13 +162,13 @@ void PASTEMAC(ch,varname) \
 \
 		/* gamma11 = gamma11 +      alpha  * chi1 * conj(psi1) \
 		                     + conj(alpha) * psi1 * conj(chi1); */ \
-		PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
-		PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
+		bli_tadds( ch,ch,ch, alpha0_chi1_psi1, *gamma11 ); \
+		bli_tadds( ch,ch,ch, alpha0_chi1_psi1, *gamma11 ); \
 \
 		/* For her2, explicitly set the imaginary component of gamma11 to
            zero. */ \
 		if ( bli_is_conj( conjh ) ) \
-			PASTEMAC(ch,seti0s)( *gamma11 ); \
+			bli_tseti0s( ch, *gamma11 ); \
 	} \
 }
 
diff --git a/frame/2/her2/bli_her2_unb_var4.c b/frame/2/her2/bli_her2_unb_var4.c
index 59902654d..e93d6a489 100644
--- a/frame/2/her2/bli_her2_unb_var4.c
+++ b/frame/2/her2/bli_her2_unb_var4.c
@@ -86,8 +86,8 @@ void PASTEMAC(ch,varname) \
 		rs_ct = rs_c; \
 		cs_ct = cs_c; \
 \
-		PASTEMAC(ch,copys)( *alpha, alpha0 ); \
-		PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
+		bli_tcopys( ch,ch, *alpha, alpha0 ); \
+		bli_tcopycjs( ch,ch, conjh, *alpha, alpha1 ); \
 	} \
 	else /* if ( bli_is_upper( uplo ) ) */ \
 	{ \
@@ -99,8 +99,8 @@ void PASTEMAC(ch,varname) \
 		conjx = bli_apply_conj( conjh, conjx ); \
 		conjy = bli_apply_conj( conjh, conjy ); \
 \
-		PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
-		PASTEMAC(ch,copys)( *alpha, alpha1 ); \
+		bli_tcopycjs( ch,ch, conjh, *alpha, alpha0 ); \
+		bli_tcopys( ch,ch, *alpha, alpha1 ); \
 	} \
 \
 	/* Apply conjh (which carries the conjugation component of the Hermitian
@@ -125,17 +125,17 @@ void PASTEMAC(ch,varname) \
 		c21      = c + (i+1)*rs_ct + (i  )*cs_ct; \
 \
 		/* Apply conjx and/or conjy to chi1 and/or psi1. */ \
-		PASTEMAC(ch,copycjs)( conjh_conjy, *psi1, conjy0_psi1 ); \
-		PASTEMAC(ch,copycjs)( conjh_conjx, *chi1, conjx1_chi1 ); \
-		PASTEMAC(ch,copycjs)( conj0,       *chi1, conjx0_chi1 ); \
+		bli_tcopycjs( ch,ch, conjh_conjy, *psi1, conjy0_psi1 ); \
+		bli_tcopycjs( ch,ch, conjh_conjx, *chi1, conjx1_chi1 ); \
+		bli_tcopycjs( ch,ch, conj0,       *chi1, conjx0_chi1 ); \
 \
 		/* Compute scalars for vector subproblems. */ \
-		PASTEMAC(ch,scal2s)( alpha0, conjy0_psi1, alpha0_psi1 ); \
-		PASTEMAC(ch,scal2s)( alpha1, conjx1_chi1, alpha1_chi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha0, conjy0_psi1, alpha0_psi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha1, conjx1_chi1, alpha1_chi1 ); \
 \
 		/* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have
 		   already been conjugated, if needed, by conjx and conjy. */ \
-		PASTEMAC(ch,scal2s)( alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \
 \
 		/* c21 = c21 + alpha * x2 * conj(psi1); */ \
 		kfp_av \
@@ -161,13 +161,13 @@ void PASTEMAC(ch,varname) \
 \
 		/* gamma11 = gamma11 +      alpha  * chi1 * conj(psi1) \
 		                     + conj(alpha) * psi1 * conj(chi1); */ \
-		PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
-		PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
+		bli_tadds( ch,ch,ch, alpha0_chi1_psi1, *gamma11 ); \
+		bli_tadds( ch,ch,ch, alpha0_chi1_psi1, *gamma11 ); \
 \
 		/* For her2, explicitly set the imaginary component of gamma11 to
            zero. */ \
 		if ( bli_is_conj( conjh ) ) \
-			PASTEMAC(ch,seti0s)( *gamma11 ); \
+			bli_tseti0s( ch, *gamma11 ); \
 	} \
 }
 
diff --git a/frame/2/her2/bli_her2_unf_var1.c b/frame/2/her2/bli_her2_unf_var1.c
index aa0de6a2f..085ff9003 100644
--- a/frame/2/her2/bli_her2_unf_var1.c
+++ b/frame/2/her2/bli_her2_unf_var1.c
@@ -80,8 +80,8 @@ void PASTEMAC(ch,varname) \
 		rs_ct = rs_c; \
 		cs_ct = cs_c; \
 \
-		PASTEMAC(ch,copys)( *alpha, alpha0 ); \
-		PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
+		bli_tcopys( ch,ch, *alpha, alpha0 ); \
+		bli_tcopycjs( ch,ch, conjh, *alpha, alpha1 ); \
 	} \
 	else /* if ( bli_is_upper( uplo ) ) */ \
 	{ \
@@ -93,8 +93,8 @@ void PASTEMAC(ch,varname) \
 		conjx = bli_apply_conj( conjh, conjx ); \
 		conjy = bli_apply_conj( conjh, conjy ); \
 \
-		PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
-		PASTEMAC(ch,copys)( *alpha, alpha1 ); \
+		bli_tcopycjs( ch,ch, conjh, *alpha, alpha0 ); \
+		bli_tcopys( ch,ch, *alpha, alpha1 ); \
 	} \
 \
 	/* Apply conjh (which carries the conjugation component of the Hermitian
@@ -117,17 +117,17 @@ void PASTEMAC(ch,varname) \
 		gamma11  = c + (i  )*rs_ct + (i  )*cs_ct; \
 \
 		/* Apply conjx and/or conjy to chi1 and/or psi1. */ \
-		PASTEMAC(ch,copycjs)( conjx,        *chi1, conjx0_chi1 ); \
-		PASTEMAC(ch,copycjs)( conjy,        *psi1, conjy1_psi1 ); \
-		PASTEMAC(ch,copycjs)( conj0,        *psi1, conjy0_psi1 ); \
+		bli_tcopycjs( ch,ch, conjx,        *chi1, conjx0_chi1 ); \
+		bli_tcopycjs( ch,ch, conjy,        *psi1, conjy1_psi1 ); \
+		bli_tcopycjs( ch,ch, conj0,        *psi1, conjy0_psi1 ); \
 \
 		/* Compute scalars for vector subproblems. */ \
-		PASTEMAC(ch,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); \
-		PASTEMAC(ch,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha0, conjx0_chi1, alpha0_chi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha1, conjy1_psi1, alpha1_psi1 ); \
 \
 		/* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have
 		   already been conjugated, if needed, by conjx and conjy. */ \
-		PASTEMAC(ch,scal2s)( alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \
 \
 		/* c10t = c10t +      alpha  * chi1 * y0'; */ \
 		/* c10t = c10t + conj(alpha) * psi1 * x0'; */ \
@@ -146,13 +146,13 @@ void PASTEMAC(ch,varname) \
 \
 		/* gamma11 = gamma11 +      alpha  * chi1 * conj(psi1) \
 		                     + conj(alpha) * psi1 * conj(chi1); */ \
-		PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
-		PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
+		bli_tadds( ch,ch,ch, alpha0_chi1_psi1, *gamma11 ); \
+		bli_tadds( ch,ch,ch, alpha0_chi1_psi1, *gamma11 ); \
 \
 		/* For her2, explicitly set the imaginary component of gamma11 to
            zero. */ \
 		if ( bli_is_conj( conjh ) ) \
-			PASTEMAC(ch,seti0s)( *gamma11 ); \
+			bli_tseti0s( ch, *gamma11 ); \
 	} \
 }
 
diff --git a/frame/2/her2/bli_her2_unf_var4.c b/frame/2/her2/bli_her2_unf_var4.c
index 4095e5e65..2430b7673 100644
--- a/frame/2/her2/bli_her2_unf_var4.c
+++ b/frame/2/her2/bli_her2_unf_var4.c
@@ -86,8 +86,8 @@ void PASTEMAC(ch,varname) \
 		rs_ct = rs_c; \
 		cs_ct = cs_c; \
 \
-		PASTEMAC(ch,copys)( *alpha, alpha0 ); \
-		PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
+		bli_tcopys( ch,ch, *alpha, alpha0 ); \
+		bli_tcopycjs( ch,ch, conjh, *alpha, alpha1 ); \
 	} \
 	else /* if ( bli_is_upper( uplo ) ) */ \
 	{ \
@@ -99,8 +99,8 @@ void PASTEMAC(ch,varname) \
 		conjx = bli_apply_conj( conjh, conjx ); \
 		conjy = bli_apply_conj( conjh, conjy ); \
 \
-		PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
-		PASTEMAC(ch,copys)( *alpha, alpha1 ); \
+		bli_tcopycjs( ch,ch, conjh, *alpha, alpha0 ); \
+		bli_tcopys( ch,ch, *alpha, alpha1 ); \
 	} \
 \
 	/* Apply conjh (which carries the conjugation component of the Hermitian
@@ -125,17 +125,17 @@ void PASTEMAC(ch,varname) \
 		c21      = c + (i+1)*rs_ct + (i  )*cs_ct; \
 \
 		/* Apply conjx and/or conjy to chi1 and/or psi1. */ \
-		PASTEMAC(ch,copycjs)( conjh_conjy, *psi1, conjy0_psi1 ); \
-		PASTEMAC(ch,copycjs)( conjh_conjx, *chi1, conjx1_chi1 ); \
-		PASTEMAC(ch,copycjs)( conj0,       *chi1, conjx0_chi1 ); \
+		bli_tcopycjs( ch,ch, conjh_conjy, *psi1, conjy0_psi1 ); \
+		bli_tcopycjs( ch,ch, conjh_conjx, *chi1, conjx1_chi1 ); \
+		bli_tcopycjs( ch,ch, conj0,       *chi1, conjx0_chi1 ); \
 \
 		/* Compute scalars for vector subproblems. */ \
-		PASTEMAC(ch,scal2s)( alpha0, conjy0_psi1, alpha0_psi1 ); \
-		PASTEMAC(ch,scal2s)( alpha1, conjx1_chi1, alpha1_chi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha0, conjy0_psi1, alpha0_psi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha1, conjx1_chi1, alpha1_chi1 ); \
 \
 		/* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have
 		   already been conjugated, if needed, by conjx and conjy. */ \
-		PASTEMAC(ch,scal2s)( alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \
+		bli_tscal2s( ch,ch,ch,ch, alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \
 \
 		/* c21 = c21 +      alpha  * x2 * conj(psi1); */ \
 		/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ \
@@ -154,13 +154,13 @@ void PASTEMAC(ch,varname) \
 \
 		/* gamma11 = gamma11 +      alpha  * chi1 * conj(psi1) \
 		                     + conj(alpha) * psi1 * conj(chi1); */ \
-		PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
-		PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
+		bli_tadds( ch,ch,ch, alpha0_chi1_psi1, *gamma11 ); \
+		bli_tadds( ch,ch,ch, alpha0_chi1_psi1, *gamma11 ); \
 \
 		/* For her2, explicitly set the imaginary component of gamma11 to
            zero. */ \
 		if ( bli_is_conj( conjh ) ) \
-			PASTEMAC(ch,seti0s)( *gamma11 ); \
+			bli_tseti0s( ch, *gamma11 ); \
 	} \
 }
 
diff --git a/frame/2/trmv/bli_trmv_unb_var1.c b/frame/2/trmv/bli_trmv_unb_var1.c
index 36ba911b8..fb80f72ba 100644
--- a/frame/2/trmv/bli_trmv_unb_var1.c
+++ b/frame/2/trmv/bli_trmv_unb_var1.c
@@ -96,10 +96,10 @@ void PASTEMAC(ch,varname) \
 			x2       = x + (i+1)*incx; \
 \
 			/* chi1 = alpha * alpha11 * chi1; */ \
-			PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
+			bli_tcopys( ch,ch, *alpha, alpha_alpha11_conj ); \
 			if ( bli_is_nonunit_diag( diaga ) ) \
-				PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
-			PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi1 ); \
+				bli_tscalcjs( ch,ch,ch, conja, *alpha11, alpha_alpha11_conj ); \
+			bli_tscals( ch,ch,ch, alpha_alpha11_conj, *chi1 ); \
 \
 			/* chi1 = chi1 + alpha * a12t * x2; */ \
 			kfp_dv \
@@ -112,7 +112,7 @@ void PASTEMAC(ch,varname) \
 			  &rho, \
 			  cntx  \
 			); \
-			PASTEMAC(ch,axpys)( *alpha, rho, *chi1 ); \
+			bli_taxpys( ch,ch,ch,ch, *alpha, rho, *chi1 ); \
 		} \
 	} \
 	else /* if ( bli_is_lower( uploa_trans ) ) */ \
@@ -127,10 +127,10 @@ void PASTEMAC(ch,varname) \
 			x0       = x + (0  )*incx; \
 \
 			/* chi1 = alpha * alpha11 * chi1; */ \
-			PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
+			bli_tcopys( ch,ch, *alpha, alpha_alpha11_conj ); \
 			if ( bli_is_nonunit_diag( diaga ) ) \
-				PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
-			PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi1 ); \
+				bli_tscalcjs( ch,ch,ch, conja, *alpha11, alpha_alpha11_conj ); \
+			bli_tscals( ch,ch,ch, alpha_alpha11_conj, *chi1 ); \
 \
 			/* chi1 = chi1 + alpha * a10t * x0; */ \
 			kfp_dv \
@@ -143,7 +143,7 @@ void PASTEMAC(ch,varname) \
 			  &rho, \
 			  cntx  \
 			); \
-			PASTEMAC(ch,axpys)( *alpha, rho, *chi1 ); \
+			bli_taxpys( ch,ch,ch,ch, *alpha, rho, *chi1 ); \
 		} \
 	} \
 }
diff --git a/frame/2/trmv/bli_trmv_unb_var2.c b/frame/2/trmv/bli_trmv_unb_var2.c
index 91b85d685..0b18da5e0 100644
--- a/frame/2/trmv/bli_trmv_unb_var2.c
+++ b/frame/2/trmv/bli_trmv_unb_var2.c
@@ -96,7 +96,7 @@ void PASTEMAC(ch,varname) \
 			x0       = x + (0  )*incx; \
 \
 			/* x0 = x0 + alpha * chi1 * a01; */ \
-			PASTEMAC(ch,scal2s)( *alpha, *chi1, alpha_chi1 ); \
+			bli_tscal2s( ch,ch,ch,ch, *alpha, *chi1, alpha_chi1 ); \
 			kfp_av \
 			( \
 			  conja, \
@@ -108,10 +108,10 @@ void PASTEMAC(ch,varname) \
 			); \
 \
 			/* chi1 = alpha * alpha11 * chi1; */ \
-			PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
+			bli_tcopys( ch,ch, *alpha, alpha_alpha11_conj ); \
 			if ( bli_is_nonunit_diag( diaga ) ) \
-				PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
-			PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi1 ); \
+				bli_tscalcjs( ch,ch,ch, conja, *alpha11, alpha_alpha11_conj ); \
+			bli_tscals( ch,ch,ch, alpha_alpha11_conj, *chi1 ); \
 		} \
 	} \
 	else /* if ( bli_is_lower( uploa_trans ) ) */ \
@@ -126,7 +126,7 @@ void PASTEMAC(ch,varname) \
 			x2       = x + (i+1)*incx; \
 \
 			/* x2 = x2 + alpha * chi1 * a21; */ \
-			PASTEMAC(ch,scal2s)( *alpha, *chi1, alpha_chi1 ); \
+			bli_tscal2s( ch,ch,ch,ch, *alpha, *chi1, alpha_chi1 ); \
 			kfp_av \
 			( \
 			  conja, \
@@ -138,10 +138,10 @@ void PASTEMAC(ch,varname) \
 			); \
 \
 			/* chi1 = alpha * alpha11 * chi1; */ \
-			PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
+			bli_tcopys( ch,ch, *alpha, alpha_alpha11_conj ); \
 			if ( bli_is_nonunit_diag( diaga ) ) \
-				PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
-			PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi1 ); \
+				bli_tscalcjs( ch,ch,ch, conja, *alpha11, alpha_alpha11_conj ); \
+			bli_tscals( ch,ch,ch, alpha_alpha11_conj, *chi1 ); \
 		} \
 	} \
 }
diff --git a/frame/2/trmv/bli_trmv_unf_var1.c b/frame/2/trmv/bli_trmv_unf_var1.c
index 70b4fa7f1..3983ad85a 100644
--- a/frame/2/trmv/bli_trmv_unf_var1.c
+++ b/frame/2/trmv/bli_trmv_unf_var1.c
@@ -116,24 +116,24 @@ void PASTEMAC(ch,varname) \
 				x21      = x1  + (l+1)*incx; \
 \
 				/* chi11 = alpha * alpha11 * chi11; */ \
-				PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
+				bli_tcopys( ch,ch, *alpha, alpha_alpha11_conj ); \
 				if ( bli_is_nonunit_diag( diaga ) ) \
-					PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
-				PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi11 ); \
+					bli_tscalcjs( ch,ch,ch, conja, *alpha11, alpha_alpha11_conj ); \
+				bli_tscals( ch,ch,ch, alpha_alpha11_conj, *chi11 ); \
 \
 				/* chi11 = chi11 + alpha * a12t * x21; */ \
-				PASTEMAC(ch,set0s)( rho1 ); \
+				bli_tset0s( ch, rho1 ); \
 				if ( bli_is_conj( conja ) ) \
 				{ \
 					for ( j = 0; j < f_ahead; ++j ) \
-						PASTEMAC(ch,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
+						bli_tdotjs( ch,ch,ch,ch, *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
 				} \
 				else \
 				{ \
 					for ( j = 0; j < f_ahead; ++j ) \
-						PASTEMAC(ch,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
+						bli_tdots( ch,ch,ch,ch, *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
 				} \
-				PASTEMAC(ch,axpys)( *alpha, rho1, *chi11 ); \
+				bli_taxpys( ch,ch,ch,ch, *alpha, rho1, *chi11 ); \
 			} \
 \
 			/* x1 = x1 + alpha * A12 * x2; */ \
@@ -175,24 +175,24 @@ void PASTEMAC(ch,varname) \
 				x01      = x1  + (0  )*incx; \
 \
 				/* chi11 = alpha * alpha11 * chi11; */ \
-				PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
+				bli_tcopys( ch,ch, *alpha, alpha_alpha11_conj ); \
 				if ( bli_is_nonunit_diag( diaga ) ) \
-					PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
-				PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi11 ); \
+					bli_tscalcjs( ch,ch,ch, conja, *alpha11, alpha_alpha11_conj ); \
+				bli_tscals( ch,ch,ch, alpha_alpha11_conj, *chi11 ); \
 \
 				/* chi11 = chi11 + alpha * a10t * x01; */ \
-				PASTEMAC(ch,set0s)( rho1 ); \
+				bli_tset0s( ch, rho1 ); \
 				if ( bli_is_conj( conja ) ) \
 				{ \
 					for ( j = 0; j < f_ahead; ++j ) \
-						PASTEMAC(ch,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
+						bli_tdotjs( ch,ch,ch,ch, *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
 				} \
 				else \
 				{ \
 					for ( j = 0; j < f_ahead; ++j ) \
-						PASTEMAC(ch,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
+						bli_tdots( ch,ch,ch,ch, *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
 				} \
-				PASTEMAC(ch,axpys)( *alpha, rho1, *chi11 ); \
+				bli_taxpys( ch,ch,ch,ch, *alpha, rho1, *chi11 ); \
 			} \
 \
 			/* x1 = x1 + alpha * A10 * x0; */ \
diff --git a/frame/2/trmv/bli_trmv_unf_var2.c b/frame/2/trmv/bli_trmv_unf_var2.c
index 981a819de..7cfad81a6 100644
--- a/frame/2/trmv/bli_trmv_unf_var2.c
+++ b/frame/2/trmv/bli_trmv_unf_var2.c
@@ -129,23 +129,23 @@ void PASTEMAC(ch,varname) \
 				x01      = x1  + (0  )*incx; \
 \
 				/* x01 = x01 + alpha * chi11 * a01; */ \
-				PASTEMAC(ch,scal2s)( *alpha, *chi11, alpha_chi11 ); \
+				bli_tscal2s( ch,ch,ch,ch, *alpha, *chi11, alpha_chi11 ); \
 				if ( bli_is_conj( conja ) ) \
 				{ \
 					for ( j = 0; j < f_behind; ++j ) \
-						PASTEMAC(ch,axpyjs)( alpha_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \
+						bli_taxpyjs( ch,ch,ch,ch, alpha_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \
 				} \
 				else \
 				{ \
 					for ( j = 0; j < f_behind; ++j ) \
-						PASTEMAC(ch,axpys)( alpha_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \
+						bli_taxpys( ch,ch,ch,ch, alpha_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \
 				} \
 \
 				/* chi11 = alpha * alpha11 * chi11; */ \
-				PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
+				bli_tcopys( ch,ch, *alpha, alpha_alpha11_conj ); \
 				if ( bli_is_nonunit_diag( diaga ) ) \
-					PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
-				PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi11 ); \
+					bli_tscalcjs( ch,ch,ch, conja, *alpha11, alpha_alpha11_conj ); \
+				bli_tscals( ch,ch,ch, alpha_alpha11_conj, *chi11 ); \
 			} \
 		} \
 	} \
@@ -186,23 +186,23 @@ void PASTEMAC(ch,varname) \
 				x21      = x1  + (l+1)*incx; \
 \
 				/* x21 = x21 + alpha * chi11 * a21; */ \
-				PASTEMAC(ch,scal2s)( *alpha, *chi11, alpha_chi11 ); \
+				bli_tscal2s( ch,ch,ch,ch, *alpha, *chi11, alpha_chi11 ); \
 				if ( bli_is_conj( conja ) ) \
 				{ \
 					for ( j = 0; j < f_behind; ++j ) \
-						PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \
+						bli_taxpyjs( ch,ch,ch,ch, alpha_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \
 				} \
 				else \
 				{ \
 					for ( j = 0; j < f_behind; ++j ) \
-						PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \
+						bli_taxpys( ch,ch,ch,ch, alpha_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \
 				} \
 \
 				/* chi11 = alpha * alpha11 * chi11; */ \
-				PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
+				bli_tcopys( ch,ch, *alpha, alpha_alpha11_conj ); \
 				if ( bli_is_nonunit_diag( diaga ) ) \
-					PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
-				PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi11 ); \
+					bli_tscalcjs( ch,ch,ch, conja, *alpha11, alpha_alpha11_conj ); \
+				bli_tscals( ch,ch,ch, alpha_alpha11_conj, *chi11 ); \
 			} \
 		} \
 	} \
diff --git a/frame/2/trsv/bli_trsv_unb_var1.c b/frame/2/trsv/bli_trsv_unb_var1.c
index 99ddce861..6c8057054 100644
--- a/frame/2/trsv/bli_trsv_unb_var1.c
+++ b/frame/2/trsv/bli_trsv_unb_var1.c
@@ -117,13 +117,13 @@ void PASTEMAC(ch,varname) \
 			  &rho, \
 			  cntx  \
 			); \
-			PASTEMAC(ch,subs)( rho, *chi1 ); \
+			bli_tsubs( ch,ch,ch, rho, *chi1 ); \
 \
 			/* chi1 = chi1 / alpha11; */ \
 			if ( bli_is_nonunit_diag( diaga ) ) \
 			{ \
-				PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
-				PASTEMAC(ch,invscals)( alpha11_conj, *chi1 ); \
+				bli_tcopycjs( ch,ch, conja, *alpha11, alpha11_conj ); \
+				bli_tinvscals( ch,ch,ch, alpha11_conj, *chi1 ); \
 			} \
 		} \
 	} \
@@ -149,13 +149,13 @@ void PASTEMAC(ch,varname) \
 			  &rho, \
 			  cntx  \
 			); \
-			PASTEMAC(ch,subs)( rho, *chi1 ); \
+			bli_tsubs( ch,ch,ch, rho, *chi1 ); \
 \
 			/* chi1 = chi1 / alpha11; */ \
 			if ( bli_is_nonunit_diag( diaga ) ) \
 			{ \
-				PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
-				PASTEMAC(ch,invscals)( alpha11_conj, *chi1 ); \
+				bli_tcopycjs( ch,ch, conja, *alpha11, alpha11_conj ); \
+				bli_tinvscals( ch,ch,ch, alpha11_conj, *chi1 ); \
 			} \
 		} \
 	} \
diff --git a/frame/2/trsv/bli_trsv_unb_var2.c b/frame/2/trsv/bli_trsv_unb_var2.c
index aed530c2d..1d3dd7a16 100644
--- a/frame/2/trsv/bli_trsv_unb_var2.c
+++ b/frame/2/trsv/bli_trsv_unb_var2.c
@@ -109,12 +109,12 @@ void PASTEMAC(ch,varname) \
 			/* chi1 = chi1 / alpha11; */ \
 			if ( bli_is_nonunit_diag( diaga ) ) \
 			{ \
-				PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
-				PASTEMAC(ch,invscals)( alpha11_conj, *chi1 ); \
+				bli_tcopycjs( ch,ch, conja, *alpha11, alpha11_conj ); \
+				bli_tinvscals( ch,ch,ch, alpha11_conj, *chi1 ); \
 			} \
 \
 			/* x0 = x0 - chi1 * a01; */ \
-			PASTEMAC(ch,neg2s)( *chi1, minus_chi1 ); \
+			bli_tneg2s( ch,ch, *chi1, minus_chi1 ); \
 			kfp_av \
 			( \
 			  conja, \
@@ -140,12 +140,12 @@ void PASTEMAC(ch,varname) \
 			/* chi1 = chi1 / alpha11; */ \
 			if ( bli_is_nonunit_diag( diaga ) ) \
 			{ \
-				PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
-				PASTEMAC(ch,invscals)( alpha11_conj, *chi1 ); \
+				bli_tcopycjs( ch,ch, conja, *alpha11, alpha11_conj ); \
+				bli_tinvscals( ch,ch,ch, alpha11_conj, *chi1 ); \
 			} \
 \
 			/* x2 = x2 - chi1 * a21; */ \
-			PASTEMAC(ch,neg2s)( *chi1, minus_chi1 ); \
+			bli_tneg2s( ch,ch, *chi1, minus_chi1 ); \
 			kfp_av \
 			( \
 			  conja, \
diff --git a/frame/2/trsv/bli_trsv_unf_var1.c b/frame/2/trsv/bli_trsv_unf_var1.c
index 109184a7c..bf30903a1 100644
--- a/frame/2/trsv/bli_trsv_unf_var1.c
+++ b/frame/2/trsv/bli_trsv_unf_var1.c
@@ -143,24 +143,24 @@ void PASTEMAC(ch,varname) \
 				x21      = x1  + (l+1)*incx; \
 \
 				/* chi11 = chi11 - a12t * x21; */ \
-				PASTEMAC(ch,set0s)( rho1 ); \
+				bli_tset0s( ch, rho1 ); \
 				if ( bli_is_conj( conja ) ) \
 				{ \
 					for ( j = 0; j < f_behind; ++j ) \
-						PASTEMAC(ch,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
+						bli_tdotjs( ch,ch,ch,ch, *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
 				} \
 				else \
 				{ \
 					for ( j = 0; j < f_behind; ++j ) \
-						PASTEMAC(ch,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
+						bli_tdots( ch,ch,ch,ch, *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
 				} \
-				PASTEMAC(ch,subs)( rho1, *chi11 ); \
+				bli_tsubs( ch,ch,ch, rho1, *chi11 ); \
 \
 				/* chi11 = chi11 / alpha11; */ \
 				if ( bli_is_nonunit_diag( diaga ) ) \
 				{ \
-					PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
-					PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \
+					bli_tcopycjs( ch,ch, conja, *alpha11, alpha11_conj ); \
+					bli_tinvscals( ch,ch,ch, alpha11_conj, *chi11 ); \
 				} \
 			} \
 		} \
@@ -203,24 +203,24 @@ void PASTEMAC(ch,varname) \
 				x01      = x1  + (0  )*incx; \
 \
 				/* chi11 = chi11 - a10t * x01; */ \
-				PASTEMAC(ch,set0s)( rho1 ); \
+				bli_tset0s( ch, rho1 ); \
 				if ( bli_is_conj( conja ) ) \
 				{ \
 					for ( j = 0; j < f_behind; ++j ) \
-						PASTEMAC(ch,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
+						bli_tdotjs( ch,ch,ch,ch, *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
 				} \
 				else \
 				{ \
 					for ( j = 0; j < f_behind; ++j ) \
-						PASTEMAC(ch,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
+						bli_tdots( ch,ch,ch,ch, *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
 				} \
-				PASTEMAC(ch,subs)( rho1, *chi11 ); \
+				bli_tsubs( ch,ch,ch, rho1, *chi11 ); \
 \
 				/* chi11 = chi11 / alpha11; */ \
 				if ( bli_is_nonunit_diag( diaga ) ) \
 				{ \
-					PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
-					PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \
+					bli_tcopycjs( ch,ch, conja, *alpha11, alpha11_conj ); \
+					bli_tinvscals( ch,ch,ch, alpha11_conj, *chi11 ); \
 				} \
 			} \
 		} \
diff --git a/frame/2/trsv/bli_trsv_unf_var2.c b/frame/2/trsv/bli_trsv_unf_var2.c
index 5055b9a62..79f459601 100644
--- a/frame/2/trsv/bli_trsv_unf_var2.c
+++ b/frame/2/trsv/bli_trsv_unf_var2.c
@@ -129,21 +129,21 @@ void PASTEMAC(ch,varname) \
 				/* chi11 = chi11 / alpha11; */ \
 				if ( bli_is_nonunit_diag( diaga ) ) \
 				{ \
-					PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
-					PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \
+					bli_tcopycjs( ch,ch, conja, *alpha11, alpha11_conj ); \
+					bli_tinvscals( ch,ch,ch, alpha11_conj, *chi11 ); \
 				} \
 \
 				/* x01 = x01 - chi11 * a01; */ \
-				PASTEMAC(ch,neg2s)( *chi11, minus_chi11 ); \
+				bli_tneg2s( ch,ch, *chi11, minus_chi11 ); \
 				if ( bli_is_conj( conja ) ) \
 				{ \
 					for ( j = 0; j < f_ahead; ++j ) \
-						PASTEMAC(ch,axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \
+						bli_taxpyjs( ch,ch,ch,ch, minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \
 				} \
 				else \
 				{ \
 					for ( j = 0; j < f_ahead; ++j ) \
-						PASTEMAC(ch,axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \
+						bli_taxpys( ch,ch,ch,ch, minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \
 				} \
 			} \
 \
@@ -187,21 +187,21 @@ void PASTEMAC(ch,varname) \
 				/* chi11 = chi11 / alpha11; */ \
 				if ( bli_is_nonunit_diag( diaga ) ) \
 				{ \
-					PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
-					PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \
+					bli_tcopycjs( ch,ch, conja, *alpha11, alpha11_conj ); \
+					bli_tinvscals( ch,ch,ch, alpha11_conj, *chi11 ); \
 				} \
 \
 				/* x21 = x21 - chi11 * a21; */ \
-				PASTEMAC(ch,neg2s)( *chi11, minus_chi11 ); \
+				bli_tneg2s( ch,ch, *chi11, minus_chi11 ); \
 				if ( bli_is_conj( conja ) ) \
 				{ \
 					for ( j = 0; j < f_ahead; ++j ) \
-						PASTEMAC(ch,axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \
+						bli_taxpyjs( ch,ch,ch,ch, minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \
 				} \
 				else \
 				{ \
 					for ( j = 0; j < f_ahead; ++j ) \
-						PASTEMAC(ch,axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \
+						bli_taxpys( ch,ch,ch,ch, minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \
 				} \
 			} \
 \
diff --git a/frame/3/bli_l3_sup_var12.c b/frame/3/bli_l3_sup_var12.c
index 694142416..a0a7c5e42 100644
--- a/frame/3/bli_l3_sup_var12.c
+++ b/frame/3/bli_l3_sup_var12.c
@@ -214,7 +214,7 @@ void PASTEMAC(ch,varname) \
 	if ( bli_zero_dim3( m, n, k ) ) return; \
 \
 	/* If alpha is zero, scale by beta and return. */ \
-	if ( PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
+	if ( bli_teq0s( ch, *(( ctype* )alpha) ) ) \
 	{ \
 		PASTEMAC(ch,scalm) \
 		( \
@@ -561,7 +561,7 @@ void PASTEMAC(ch,varname) \
 	if ( bli_zero_dim3( m, n, k ) ) return; \
 \
 	/* If alpha is zero, scale by beta and return. */ \
-	if ( PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
+	if ( bli_teq0s( ch, *(( ctype* )alpha) ) ) \
 	{ \
 		PASTEMAC(ch,scalm) \
 		( \
diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
index 02ff808e2..f86806eb0 100644
--- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
@@ -62,14 +62,16 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 	const ctype* restrict b_cast = b; \
 	      ctype* restrict y_cast = y; \
 \
-	PASTEMAC(ch,ch,ch,xpbys_mxn_l) \
+	bli_txpbys_mxn_uplo \
 	( \
+	  ch,ch,ch,ch, \
 	  diagoff, \
+	  BLIS_LOWER, \
 	  m, \
 	  n, \
 	  x_cast, rs_x, cs_x, \
 	  b_cast, \
-	  y_cast, rs_y,  cs_y \
+	  y_cast, rs_y, cs_y \
 	); \
 }
 
diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2b.c b/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
index a11f599d3..eaef84d53 100644
--- a/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2b.c
@@ -62,14 +62,16 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 	const ctype* restrict b_cast = b; \
 	      ctype* restrict y_cast = y; \
 \
-	PASTEMAC(ch,ch,ch,xpbys_mxn_l) \
+	bli_txpbys_mxn_uplo \
 	( \
+	  ch,ch,ch,ch, \
 	  diagoff, \
+	  BLIS_LOWER, \
 	  m, \
 	  n, \
 	  x_cast, rs_x, cs_x, \
 	  b_cast, \
-	  y_cast, rs_y,  cs_y \
+	  y_cast, rs_y, cs_y \
 	); \
 }
 
diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
index 34a10914f..082c388ea 100644
--- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
@@ -62,14 +62,16 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 	const ctype* restrict b_cast = b; \
 	      ctype* restrict y_cast = y; \
 \
-	PASTEMAC(ch,ch,ch,xpbys_mxn_u) \
+	bli_txpbys_mxn_uplo \
 	( \
+	  ch,ch,ch,ch, \
 	  diagoff, \
+	  BLIS_UPPER, \
 	  m, \
 	  n, \
 	  x_cast, rs_x, cs_x, \
 	  b_cast, \
-	  y_cast, rs_y,  cs_y \
+	  y_cast, rs_y, cs_y \
 	); \
 }
 
diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2b.c b/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
index aa9f3bc5e..b4ed52e41 100644
--- a/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2b.c
@@ -62,14 +62,16 @@ BLIS_INLINE void PASTEMAC(ch,op) \
 	const ctype* restrict b_cast = b; \
 	      ctype* restrict y_cast = y; \
 \
-	PASTEMAC(ch,ch,ch,xpbys_mxn_u) \
+	bli_txpbys_mxn_uplo \
 	( \
+	  ch,ch,ch,ch, \
 	  diagoff, \
+	  BLIS_UPPER, \
 	  m, \
 	  n, \
 	  x_cast, rs_x, cs_x, \
 	  b_cast, \
-	  y_cast, rs_y,  cs_y \
+	  y_cast, rs_y, cs_y \
 	); \
 }
 
diff --git a/frame/base/bli_machval.c b/frame/base/bli_machval.c
index a4b9223bc..7c42284d6 100644
--- a/frame/base/bli_machval.c
+++ b/frame/base/bli_machval.c
@@ -113,7 +113,7 @@ void PASTEMAC(chv,opname) \
 \
 	/* Copy the requested parameter value to the output buffer, which
 	   may involve a demotion from the complex to real domain. */ \
-	PASTEMAC(chvr,chv,copys)( pvals[ val_i ], *v_cast ); \
+	bli_tcopys( chvr,chv, pvals[ val_i ], *v_cast ); \
 }
 
 INSERT_GENTFUNCR_BASIC( machval, lamch )
diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c
index e4e79d9f4..0d0d3bf66 100644
--- a/frame/base/bli_obj.c
+++ b/frame/base/bli_obj.c
@@ -121,12 +121,12 @@ void bli_obj_create_without_buffer
 	// scenarios. Failing to do this can lead to reading uninitialized
 	// memory just before calling the macrokernel (as the internal scalars
 	// for A and B are merged).
-	//if      ( bli_is_float( dt )    ) { bli_sset1s( *(( float*    )s) ); }
-	//else if ( bli_is_double( dt )   ) { bli_dset1s( *(( double*   )s) ); }
-	if      ( bli_is_float( dt )    ) { bli_cset1s( *(( scomplex* )s) ); }
-	else if ( bli_is_double( dt )   ) { bli_zset1s( *(( dcomplex* )s) ); }
-	else if ( bli_is_scomplex( dt ) ) { bli_cset1s( *(( scomplex* )s) ); }
-	else if ( bli_is_dcomplex( dt ) ) { bli_zset1s( *(( dcomplex* )s) ); }
+	//if      ( bli_is_float( dt )    ) { bli_tset1s( s, *(( float*    )s) ); }
+	//else if ( bli_is_double( dt )   ) { bli_tset1s( d, *(( double*   )s) ); }
+	if      ( bli_is_float( dt )    ) { bli_tset1s( c, *(( scomplex* )s) ); }
+	else if ( bli_is_double( dt )   ) { bli_tset1s( z, *(( dcomplex* )s) ); }
+	else if ( bli_is_scomplex( dt ) ) { bli_tset1s( c, *(( scomplex* )s) ); }
+	else if ( bli_is_dcomplex( dt ) ) { bli_tset1s( z, *(( dcomplex* )s) ); }
 }
 
 void bli_obj_alloc_buffer
@@ -312,10 +312,10 @@ void bli_obj_free
 	//temp_z = bli_obj_buffer_for_const( BLIS_DCOMPLEX, obj );
 	//temp_i = bli_obj_buffer_for_const( BLIS_INT,      obj );
 
-	bli_dssets( value, 0.0, *temp_s );
-	bli_ddsets( value, 0.0, *temp_d );
-	bli_dcsets( value, 0.0, *temp_c );
-	bli_dzsets( value, 0.0, *temp_z );
+	bli_tsets( d,s, value, 0.0, *temp_s );
+	bli_tsets( d,d, value, 0.0, *temp_d );
+	bli_tsets( d,c, value, 0.0, *temp_c );
+	bli_tsets( d,z, value, 0.0, *temp_z );
 
 	*temp_i = ( gint_t ) value;
 }
@@ -347,33 +347,33 @@ void bli_obj_free
 
 	buf_a = bli_obj_buffer_at_off( a );
 
-	bli_zzsets( 0.0, 0.0, value );
+	bli_tsets( z,z, 0.0, 0.0, value );
 
 	if ( bli_obj_is_float( a ) )
 	{
-		bli_szcopys( *(( float*    )buf_a), value );
+		bli_tcopys( s,z, *(( float*    )buf_a), value );
 	}
 	else if ( bli_obj_is_double( a ) )
 	{
-		bli_dzcopys( *(( double*   )buf_a), value );
+		bli_tcopys( d,z, *(( double*   )buf_a), value );
 	}
 	else if ( bli_obj_is_scomplex( a ) )
 	{
-		bli_czcopys( *(( scomplex* )buf_a), value );
+		bli_tcopys( c,z, *(( scomplex* )buf_a), value );
 	}
 	else if ( bli_obj_is_dcomplex( a ) )
 	{
-		bli_zzcopys( *(( dcomplex* )buf_a), value );
+		bli_tcopys( z,z, *(( dcomplex* )buf_a), value );
 	}
 	else
 	{
 		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
 	}
 
-	bli_zscopys( value, *temp_s );
-	bli_zdcopys( value, *temp_d );
-	bli_zccopys( value, *temp_c );
-	bli_zzcopys( value, *temp_z );
+	bli_tcopys( z,s, value, *temp_s );
+	bli_tcopys( z,d, value, *temp_d );
+	bli_tcopys( z,c, value, *temp_c );
+	bli_tcopys( z,z, value, *temp_z );
 
 	*temp_i = ( gint_t ) bli_zreal( value );
 }
diff --git a/frame/base/bli_query.c b/frame/base/bli_query.c
index 140fc2f97..7824e1e48 100644
--- a/frame/base/bli_query.c
+++ b/frame/base/bli_query.c
@@ -77,7 +77,7 @@ bool bli_obj_equals( const obj_t* a, const obj_t* b )
 		else if ( dt == BLIS_DOUBLE )   r_val = bli_deqa( buf_a, buf_b );
 		else if ( dt == BLIS_SCOMPLEX ) r_val = bli_ceqa( buf_a, buf_b );
 		else if ( dt == BLIS_DCOMPLEX ) r_val = bli_zeqa( buf_a, buf_b );
-		else if ( dt == BLIS_INT )      r_val = bli_ieqa( buf_a, buf_b );
+		else if ( dt == BLIS_INT )      r_val = bli_ieq( buf_a, buf_b );
 	}
 
 	return r_val;
@@ -181,7 +181,7 @@ bool bli_obj_imag_is_zero( const obj_t* a )
 		bli_getsc( a, &a_r, &a_i );
 
 		// Compare the imaginary part of a to double-precision zero.
-		if ( !bli_deq0( a_i ) ) r_val = FALSE;
+		if ( !bli_teq0s( d, a_i ) ) r_val = FALSE;
 	}
 
 	return r_val;
diff --git a/frame/base/bli_setgetijm.c b/frame/base/bli_setgetijm.c
index 5a89d258e..9c826af06 100644
--- a/frame/base/bli_setgetijm.c
+++ b/frame/base/bli_setgetijm.c
@@ -101,7 +101,7 @@ void PASTEMAC(ch,opname) \
 \
 	ctype* b_ij = b_cast + (i  )*rs + (j  )*cs; \
 \
-	PASTEMAC(z,ch,sets)( ar, ai, *b_ij ); \
+	bli_tsets( z,ch, ar, ai, *b_ij ); \
 }
 
 INSERT_GENTFUNC_BASIC( setijm )
@@ -175,7 +175,7 @@ void PASTEMAC(ch,opname) \
 \
 	const ctype* b_ij = b_cast + (i  )*rs + (j  )*cs; \
 \
-	PASTEMAC(ch,z,gets)( *b_ij, *ar, *ai ); \
+	bli_tgets( ch,z, *b_ij, *ar, *ai ); \
 }
 
 INSERT_GENTFUNC_BASIC( getijm )
diff --git a/frame/base/bli_setgetijv.c b/frame/base/bli_setgetijv.c
index a6ae2860d..dc50faae4 100644
--- a/frame/base/bli_setgetijv.c
+++ b/frame/base/bli_setgetijv.c
@@ -94,7 +94,7 @@ void PASTEMAC(ch,opname) \
 \
 	ctype* restrict x_i = x_cast + (i  )*incx; \
 \
-	PASTEMAC(z,ch,sets)( ar, ai, *x_i ); \
+	bli_tsets( z,ch, ar, ai, *x_i ); \
 }
 
 INSERT_GENTFUNC_BASIC( setijv )
@@ -161,7 +161,7 @@ void PASTEMAC(ch,opname) \
 \
 	const ctype* restrict x_i = x_cast + (i  )*incx; \
 \
-	PASTEMAC(ch,z,gets)( *x_i, *ar, *ai ); \
+	bli_tgets( ch,z, *x_i, *ar, *ai ); \
 }
 
 INSERT_GENTFUNC_BASIC( getijv )
diff --git a/frame/base/cast/bli_castm.c b/frame/base/cast/bli_castm.c
index 6ae848b4c..12dcbb073 100644
--- a/frame/base/cast/bli_castm.c
+++ b/frame/base/cast/bli_castm.c
@@ -150,7 +150,7 @@ void PASTEMAC(cha,chb,opname) \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC(cha,chb,copyjs)( a1[i], b1[i] ); \
+					bli_tcopyjs( cha,chb, a1[i], b1[i] ); \
 				} \
 			} \
 		} \
@@ -163,7 +163,7 @@ void PASTEMAC(cha,chb,opname) \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC(cha,chb,copyjs)( *a1, *b1 ); \
+					bli_tcopyjs( cha,chb, *a1, *b1 ); \
 \
 					a1 += inca; \
 					b1 += incb; \
@@ -182,7 +182,7 @@ void PASTEMAC(cha,chb,opname) \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC(cha,chb,copys)( a1[i], b1[i] ); \
+					bli_tcopys( cha,chb, a1[i], b1[i] ); \
 				} \
 			} \
 		} \
@@ -195,7 +195,7 @@ void PASTEMAC(cha,chb,opname) \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC(cha,chb,copys)( *a1, *b1 ); \
+					bli_tcopys( cha,chb, *a1, *b1 ); \
 \
 					a1 += inca; \
 					b1 += incb; \
diff --git a/frame/base/cast/bli_castnzm.c b/frame/base/cast/bli_castnzm.c
index 3c2bbcb57..18f209838 100644
--- a/frame/base/cast/bli_castnzm.c
+++ b/frame/base/cast/bli_castnzm.c
@@ -150,7 +150,7 @@ void PASTEMAC(cha,chb,opname) \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC(cha,chb,copyjnzs)( a1[i], b1[i] ); \
+					bli_tcopyjnzs( cha,chb, a1[i], b1[i] ); \
 				} \
 			} \
 		} \
@@ -163,7 +163,7 @@ void PASTEMAC(cha,chb,opname) \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC(cha,chb,copyjnzs)( *a1, *b1 ); \
+					bli_tcopyjnzs( cha,chb, *a1, *b1 ); \
 \
 					a1 += inca; \
 					b1 += incb; \
@@ -182,7 +182,7 @@ void PASTEMAC(cha,chb,opname) \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC(cha,chb,copynzs)( a1[i], b1[i] ); \
+					bli_tcopynzs( cha,chb, a1[i], b1[i] ); \
 				} \
 			} \
 		} \
@@ -195,7 +195,7 @@ void PASTEMAC(cha,chb,opname) \
 \
 				for ( i = 0; i < n_elem; ++i ) \
 				{ \
-					PASTEMAC(cha,chb,copynzs)( *a1, *b1 ); \
+					bli_tcopynzs( cha,chb, *a1, *b1 ); \
 \
 					a1 += inca; \
 					b1 += incb; \
diff --git a/frame/base/cast/bli_castv.c b/frame/base/cast/bli_castv.c
index 468ff9109..c110d4da1 100644
--- a/frame/base/cast/bli_castv.c
+++ b/frame/base/cast/bli_castv.c
@@ -123,14 +123,14 @@ void PASTEMAC(chx,chy,opname) \
 		{ \
 			for ( i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(chx,chy,copyjs)( x1[i], y1[i] ); \
+				bli_tcopyjs( chx,chy, x1[i], y1[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(chx,chy,copyjs)( *x1, *y1 ); \
+				bli_tcopyjs( chx,chy, *x1, *y1 ); \
 \
 				x1 += incx; \
 				y1 += incy; \
@@ -143,14 +143,14 @@ void PASTEMAC(chx,chy,opname) \
 		{ \
 			for ( i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(chx,chy,copys)( x1[i], y1[i] ); \
+				bli_tcopys( chx,chy, x1[i], y1[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(chx,chy,copys)( *x1, *y1 ); \
+				bli_tcopys( chx,chy, *x1, *y1 ); \
 \
 				x1 += incx; \
 				y1 += incy; \
diff --git a/frame/compat/bla_dot.c b/frame/compat/bla_dot.c
index b68af083c..e462a8291 100644
--- a/frame/compat/bla_dot.c
+++ b/frame/compat/bla_dot.c
@@ -207,8 +207,8 @@ double PASTEF77(d,sdot)
 		float* chi1 = x0 + (i  )*incx0;
 		float* psi1 = y0 + (i  )*incy0;
 
-		bli_ddots( (( double )(*chi1)),
-		           (( double )(*psi1)), rho );
+		bli_tdots( d,d,d,d, (( double )(*chi1)),
+		                    (( double )(*psi1)), rho );
 	}
 
 	/* Finalization of BLIS is not required, because initialization was
diff --git a/frame/compat/bla_her2k.c b/frame/compat/bla_her2k.c
index 25e9fb431..f6f58d401 100644
--- a/frame/compat/bla_her2k.c
+++ b/frame/compat/bla_her2k.c
@@ -93,8 +93,8 @@ void PASTEF77(ch,blasname) \
 	   - the rank-2k product is empty (either because alpha is zero or k
 	     is zero) AND matrix C is not scaled. */ \
 	if ( m0 == 0 || \
-	     ( ( PASTEMAC(ch,eq0)( *alpha ) || k0 == 0 ) \
-	       && PASTEMAC(chr,eq1)( *beta ) \
+	     ( ( bli_teq0s( ch, *alpha ) || k0 == 0 ) \
+	       && bli_teq1s( chr, *beta ) \
          ) \
 	   ) \
 	{ \
@@ -186,8 +186,8 @@ void PASTEF77(ch,blasname) \
 	   - the rank-2k product is empty (either because alpha is zero or k
 	     is zero) AND matrix C is not scaled. */ \
 	if ( m0 == 0 || \
-	     ( ( PASTEMAC(ch,eq0)( *alpha ) || k0 == 0 ) \
-	       && PASTEMAC(chr,eq1)( *beta ) \
+	     ( ( bli_teq0s( ch, *alpha ) || k0 == 0 ) \
+	       && bli_teq1s( chr, *beta ) \
          ) \
 	   ) \
 	{ \
diff --git a/frame/compat/bla_herk.c b/frame/compat/bla_herk.c
index a9f01268d..4f447faba 100644
--- a/frame/compat/bla_herk.c
+++ b/frame/compat/bla_herk.c
@@ -91,8 +91,8 @@ void PASTEF77(ch,blasname) \
 	   - the rank-k product is empty (either because alpha is zero or k
 	     is zero) AND matrix C is not scaled. */ \
 	if ( m0 == 0 || \
-	     ( ( PASTEMAC(chr,eq0)( *alpha ) || k0 == 0 ) \
-	       && PASTEMAC(chr,eq1)( *beta ) \
+	     ( ( bli_teq0s( chr, *alpha ) || k0 == 0 ) \
+	       && bli_teq1s( chr, *beta ) \
          ) \
 	   ) \
 	{ \
@@ -178,8 +178,8 @@ void PASTEF77(ch,blasname) \
 	   - the rank-k product is empty (either because alpha is zero or k
 	     is zero) AND matrix C is not scaled. */ \
 	if ( m0 == 0 || \
-	     ( ( PASTEMAC(chr,eq0)( *alpha ) || k0 == 0 ) \
-	       && PASTEMAC(chr,eq1)( *beta ) \
+	     ( ( bli_teq0s( chr, *alpha ) || k0 == 0 ) \
+	       && bli_teq1s( chr, *beta ) \
          ) \
 	   ) \
 	{ \
diff --git a/frame/compat/bla_scal.c b/frame/compat/bla_scal.c
index 0acf7c10d..543515a3f 100644
--- a/frame/compat/bla_scal.c
+++ b/frame/compat/bla_scal.c
@@ -39,7 +39,7 @@
 // Define BLAS-to-BLIS interfaces.
 //
 #undef  GENTFUNCSCAL
-#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \
+#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, cha_real, blasname, blisname ) \
 \
 void PASTEF77(chx,cha,blasname) \
      ( \
@@ -67,7 +67,7 @@ void PASTEF77(chx,cha,blasname) \
 	   that is, we just always sub-optimally implement those cases
 	   by casting alpha to ctype_x (potentially the complex domain) and
 	   using the homogeneous datatype instance according to that type. */ \
-	PASTEMAC(cha,chx,copys)( *alpha, alpha_cast ); \
+	bli_tcopys( cha_real,chx, *alpha, alpha_cast ); \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC(chx,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/compat/f2c/bla_gbmv.c b/frame/compat/f2c/bla_gbmv.c
index 320b49684..0edc7cbe9 100644
--- a/frame/compat/f2c/bla_gbmv.c
+++ b/frame/compat/f2c/bla_gbmv.c
@@ -268,7 +268,7 @@
 		i__1 = leny;
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = i__;
-		    bli_csets( (0.f), (0.f), y[i__2] );
+		    bli_tsets( c,c, (0.f), (0.f), y[i__2] );
 /* L10: */
 		}
 	    } else {
@@ -276,8 +276,8 @@
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = i__;
 		    i__3 = i__;
-		    bli_csets( (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
+		    bli_tsets( c,c, (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
 /* L20: */
 		}
 	    }
@@ -287,7 +287,7 @@
 		i__1 = leny;
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = iy;
-		    bli_csets( (0.f), (0.f), y[i__2] );
+		    bli_tsets( c,c, (0.f), (0.f), y[i__2] );
 		    iy += *incy;
 /* L30: */
 		}
@@ -296,8 +296,8 @@
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = iy;
 		    i__3 = iy;
-		    bli_csets( (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
+		    bli_tsets( c,c, (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
 		    iy += *incy;
 /* L40: */
 		}
@@ -319,8 +319,8 @@
 		i__2 = jx;
 		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) {
 		    i__2 = jx;
-		    bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+		    bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 		    k = kup1 - j;
 /* Computing MAX */
 		    i__2 = 1, i__3 = j - *ku;
@@ -331,9 +331,9 @@
 			i__2 = i__;
 			i__3 = i__;
 			i__5 = k + i__ + j * a_dim1;
-			bli_csets( (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 );
-			bli_csets( (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 );
-			bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
+			bli_tsets( c,c, (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 );
+			bli_tsets( c,c, (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 );
+			bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
 /* L50: */
 		    }
 		}
@@ -346,8 +346,8 @@
 		i__4 = jx;
 		if (bli_creal(x[i__4]) != 0.f || bli_cimag(x[i__4]) != 0.f) {
 		    i__4 = jx;
-		    bli_csets( (bli_creal(*alpha) * bli_creal(x[i__4]) - bli_cimag(*alpha) * bli_cimag(x[i__4])), (bli_creal(*alpha) * bli_cimag(x[i__4]) + bli_cimag(*alpha) * bli_creal(x[i__4])), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+		    bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(x[i__4]) - bli_cimag(*alpha) * bli_cimag(x[i__4])), (bli_creal(*alpha) * bli_cimag(x[i__4]) + bli_cimag(*alpha) * bli_creal(x[i__4])), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 		    iy = ky;
 		    k = kup1 - j;
 /* Computing MAX */
@@ -359,9 +359,9 @@
 			i__4 = iy;
 			i__2 = iy;
 			i__5 = k + i__ + j * a_dim1;
-			bli_csets( (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 );
-			bli_csets( (bli_creal(y[i__2]) + bli_creal(q__2)), (bli_cimag(y[i__2]) + bli_cimag(q__2)), q__1 );
-			bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] );
+			bli_tsets( c,c, (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 );
+			bli_tsets( c,c, (bli_creal(y[i__2]) + bli_creal(q__2)), (bli_cimag(y[i__2]) + bli_cimag(q__2)), q__1 );
+			bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] );
 			iy += *incy;
 /* L70: */
 		    }
@@ -381,7 +381,7 @@
 	if (*incx == 1) {
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
-		bli_csets( (0.f), (0.f), temp );
+		bli_tsets( c,c, (0.f), (0.f), temp );
 		k = kup1 - j;
 		if (noconj) {
 /* Computing MAX */
@@ -392,9 +392,9 @@
 		    for (i__ = f2c_max(i__3,i__4); i__ <= i__2; ++i__) {
 			i__3 = k + i__ + j * a_dim1;
 			i__4 = i__;
-			bli_csets( (bli_creal(a[i__3]) * bli_creal(x[i__4]) - bli_cimag(a[i__3]) * bli_cimag(x[i__4])), (bli_creal(a[i__3]) * bli_cimag(x[i__4]) + bli_cimag(a[i__3]) * bli_creal(x[i__4])), q__2 );
-			bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			bli_tsets( c,c, (bli_creal(a[i__3]) * bli_creal(x[i__4]) - bli_cimag(a[i__3]) * bli_cimag(x[i__4])), (bli_creal(a[i__3]) * bli_cimag(x[i__4]) + bli_cimag(a[i__3]) * bli_creal(x[i__4])), q__2 );
+			bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 /* L90: */
 		    }
 		} else {
@@ -406,24 +406,24 @@
 		    for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) {
 			bla_r_cnjg(&q__3, &a[k + i__ + j * a_dim1]);
 			i__2 = i__;
-			bli_csets( (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 );
-			bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 );
+			bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 /* L100: */
 		    }
 		}
 		i__4 = jy;
 		i__2 = jy;
-		bli_csets( (bli_creal(*alpha) * bli_creal(temp) - bli_cimag(*alpha) * bli_cimag(temp)), (bli_creal(*alpha) * bli_cimag(temp) + bli_cimag(*alpha) * bli_creal(temp)), q__2 );
-		bli_csets( (bli_creal(y[i__2]) + bli_creal(q__2)), (bli_cimag(y[i__2]) + bli_cimag(q__2)), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(temp) - bli_cimag(*alpha) * bli_cimag(temp)), (bli_creal(*alpha) * bli_cimag(temp) + bli_cimag(*alpha) * bli_creal(temp)), q__2 );
+		bli_tsets( c,c, (bli_creal(y[i__2]) + bli_creal(q__2)), (bli_cimag(y[i__2]) + bli_cimag(q__2)), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] );
 		jy += *incy;
 /* L110: */
 	    }
 	} else {
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
-		bli_csets( (0.f), (0.f), temp );
+		bli_tsets( c,c, (0.f), (0.f), temp );
 		ix = kx;
 		k = kup1 - j;
 		if (noconj) {
@@ -435,9 +435,9 @@
 		    for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) {
 			i__4 = k + i__ + j * a_dim1;
 			i__2 = ix;
-			bli_csets( (bli_creal(a[i__4]) * bli_creal(x[i__2]) - bli_cimag(a[i__4]) * bli_cimag(x[i__2])), (bli_creal(a[i__4]) * bli_cimag(x[i__2]) + bli_cimag(a[i__4]) * bli_creal(x[i__2])), q__2 );
-			bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			bli_tsets( c,c, (bli_creal(a[i__4]) * bli_creal(x[i__2]) - bli_cimag(a[i__4]) * bli_cimag(x[i__2])), (bli_creal(a[i__4]) * bli_cimag(x[i__2]) + bli_cimag(a[i__4]) * bli_creal(x[i__2])), q__2 );
+			bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			ix += *incx;
 /* L120: */
 		    }
@@ -450,18 +450,18 @@
 		    for (i__ = f2c_max(i__3,i__4); i__ <= i__2; ++i__) {
 			bla_r_cnjg(&q__3, &a[k + i__ + j * a_dim1]);
 			i__3 = ix;
-			bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
-			bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
+			bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			ix += *incx;
 /* L130: */
 		    }
 		}
 		i__2 = jy;
 		i__3 = jy;
-		bli_csets( (bli_creal(*alpha) * bli_creal(temp) - bli_cimag(*alpha) * bli_cimag(temp)), (bli_creal(*alpha) * bli_cimag(temp) + bli_cimag(*alpha) * bli_creal(temp)), q__2 );
-		bli_csets( (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(temp) - bli_cimag(*alpha) * bli_cimag(temp)), (bli_creal(*alpha) * bli_cimag(temp) + bli_cimag(*alpha) * bli_creal(temp)), q__2 );
+		bli_tsets( c,c, (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
 		jy += *incy;
 		if (j > *ku) {
 		    kx += *incx;
@@ -1421,7 +1421,7 @@
 		i__1 = leny;
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = i__;
-		    bli_zsets( (0.), (0.), y[i__2] );
+		    bli_tsets( z,z, (0.), (0.), y[i__2] );
 /* L10: */
 		}
 	    } else {
@@ -1429,8 +1429,8 @@
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = i__;
 		    i__3 = i__;
-		    bli_zsets( (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
+		    bli_tsets( z,z, (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
 /* L20: */
 		}
 	    }
@@ -1440,7 +1440,7 @@
 		i__1 = leny;
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = iy;
-		    bli_zsets( (0.), (0.), y[i__2] );
+		    bli_tsets( z,z, (0.), (0.), y[i__2] );
 		    iy += *incy;
 /* L30: */
 		}
@@ -1449,8 +1449,8 @@
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = iy;
 		    i__3 = iy;
-		    bli_zsets( (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
+		    bli_tsets( z,z, (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
 		    iy += *incy;
 /* L40: */
 		}
@@ -1472,8 +1472,8 @@
 		i__2 = jx;
 		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) {
 		    i__2 = jx;
-		    bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+		    bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 		    k = kup1 - j;
 /* Computing MAX */
 		    i__2 = 1, i__3 = j - *ku;
@@ -1484,9 +1484,9 @@
 			i__2 = i__;
 			i__3 = i__;
 			i__5 = k + i__ + j * a_dim1;
-			bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 );
-			bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 );
-			bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
+			bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 );
+			bli_tsets( z,z, (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 );
+			bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
 /* L50: */
 		    }
 		}
@@ -1499,8 +1499,8 @@
 		i__4 = jx;
 		if (bli_zreal(x[i__4]) != 0. || bli_zimag(x[i__4]) != 0.) {
 		    i__4 = jx;
-		    bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__4]) - bli_zimag(*alpha) * bli_zimag(x[i__4])), (bli_zreal(*alpha) * bli_zimag(x[i__4]) + bli_zimag(*alpha) * bli_zreal(x[i__4])), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+		    bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(x[i__4]) - bli_zimag(*alpha) * bli_zimag(x[i__4])), (bli_zreal(*alpha) * bli_zimag(x[i__4]) + bli_zimag(*alpha) * bli_zreal(x[i__4])), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 		    iy = ky;
 		    k = kup1 - j;
 /* Computing MAX */
@@ -1512,9 +1512,9 @@
 			i__4 = iy;
 			i__2 = iy;
 			i__5 = k + i__ + j * a_dim1;
-			bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 );
-			bli_zsets( (bli_zreal(y[i__2]) + bli_zreal(z__2)), (bli_zimag(y[i__2]) + bli_zimag(z__2)), z__1 );
-			bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] );
+			bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 );
+			bli_tsets( z,z, (bli_zreal(y[i__2]) + bli_zreal(z__2)), (bli_zimag(y[i__2]) + bli_zimag(z__2)), z__1 );
+			bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] );
 			iy += *incy;
 /* L70: */
 		    }
@@ -1534,7 +1534,7 @@
 	if (*incx == 1) {
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
-		bli_zsets( (0.), (0.), temp );
+		bli_tsets( z,z, (0.), (0.), temp );
 		k = kup1 - j;
 		if (noconj) {
 /* Computing MAX */
@@ -1545,9 +1545,9 @@
 		    for (i__ = f2c_max(i__3,i__4); i__ <= i__2; ++i__) {
 			i__3 = k + i__ + j * a_dim1;
 			i__4 = i__;
-			bli_zsets( (bli_zreal(a[i__3]) * bli_zreal(x[i__4]) - bli_zimag(a[i__3]) * bli_zimag(x[i__4])), (bli_zreal(a[i__3]) * bli_zimag(x[i__4]) + bli_zimag(a[i__3]) * bli_zreal(x[i__4])), z__2 );
-			bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			bli_tsets( z,z, (bli_zreal(a[i__3]) * bli_zreal(x[i__4]) - bli_zimag(a[i__3]) * bli_zimag(x[i__4])), (bli_zreal(a[i__3]) * bli_zimag(x[i__4]) + bli_zimag(a[i__3]) * bli_zreal(x[i__4])), z__2 );
+			bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 /* L90: */
 		    }
 		} else {
@@ -1559,24 +1559,24 @@
 		    for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) {
 			bla_d_cnjg(&z__3, &a[k + i__ + j * a_dim1]);
 			i__2 = i__;
-			bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 );
-			bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 );
+			bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 /* L100: */
 		    }
 		}
 		i__4 = jy;
 		i__2 = jy;
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp) - bli_zimag(*alpha) * bli_zimag(temp)), (bli_zreal(*alpha) * bli_zimag(temp) + bli_zimag(*alpha) * bli_zreal(temp)), z__2 );
-		bli_zsets( (bli_zreal(y[i__2]) + bli_zreal(z__2)), (bli_zimag(y[i__2]) + bli_zimag(z__2)), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(temp) - bli_zimag(*alpha) * bli_zimag(temp)), (bli_zreal(*alpha) * bli_zimag(temp) + bli_zimag(*alpha) * bli_zreal(temp)), z__2 );
+		bli_tsets( z,z, (bli_zreal(y[i__2]) + bli_zreal(z__2)), (bli_zimag(y[i__2]) + bli_zimag(z__2)), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] );
 		jy += *incy;
 /* L110: */
 	    }
 	} else {
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
-		bli_zsets( (0.), (0.), temp );
+		bli_tsets( z,z, (0.), (0.), temp );
 		ix = kx;
 		k = kup1 - j;
 		if (noconj) {
@@ -1588,9 +1588,9 @@
 		    for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) {
 			i__4 = k + i__ + j * a_dim1;
 			i__2 = ix;
-			bli_zsets( (bli_zreal(a[i__4]) * bli_zreal(x[i__2]) - bli_zimag(a[i__4]) * bli_zimag(x[i__2])), (bli_zreal(a[i__4]) * bli_zimag(x[i__2]) + bli_zimag(a[i__4]) * bli_zreal(x[i__2])), z__2 );
-			bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			bli_tsets( z,z, (bli_zreal(a[i__4]) * bli_zreal(x[i__2]) - bli_zimag(a[i__4]) * bli_zimag(x[i__2])), (bli_zreal(a[i__4]) * bli_zimag(x[i__2]) + bli_zimag(a[i__4]) * bli_zreal(x[i__2])), z__2 );
+			bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			ix += *incx;
 /* L120: */
 		    }
@@ -1603,18 +1603,18 @@
 		    for (i__ = f2c_max(i__3,i__4); i__ <= i__2; ++i__) {
 			bla_d_cnjg(&z__3, &a[k + i__ + j * a_dim1]);
 			i__3 = ix;
-			bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
-			bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
+			bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			ix += *incx;
 /* L130: */
 		    }
 		}
 		i__2 = jy;
 		i__3 = jy;
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp) - bli_zimag(*alpha) * bli_zimag(temp)), (bli_zreal(*alpha) * bli_zimag(temp) + bli_zimag(*alpha) * bli_zreal(temp)), z__2 );
-		bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(temp) - bli_zimag(*alpha) * bli_zimag(temp)), (bli_zreal(*alpha) * bli_zimag(temp) + bli_zimag(*alpha) * bli_zreal(temp)), z__2 );
+		bli_tsets( z,z, (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
 		jy += *incy;
 		if (j > *ku) {
 		    kx += *incx;
diff --git a/frame/compat/f2c/bla_hbmv.c b/frame/compat/f2c/bla_hbmv.c
index c20a720f9..6c3f45f5f 100644
--- a/frame/compat/f2c/bla_hbmv.c
+++ b/frame/compat/f2c/bla_hbmv.c
@@ -254,7 +254,7 @@
 		i__1 = *n;
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = i__;
-		    bli_csets( (0.f), (0.f), y[i__2] );
+		    bli_tsets( c,c, (0.f), (0.f), y[i__2] );
 /* L10: */
 		}
 	    } else {
@@ -262,8 +262,8 @@
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = i__;
 		    i__3 = i__;
-		    bli_csets( (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
+		    bli_tsets( c,c, (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
 /* L20: */
 		}
 	    }
@@ -273,7 +273,7 @@
 		i__1 = *n;
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = iy;
-		    bli_csets( (0.f), (0.f), y[i__2] );
+		    bli_tsets( c,c, (0.f), (0.f), y[i__2] );
 		    iy += *incy;
 /* L30: */
 		}
@@ -282,8 +282,8 @@
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = iy;
 		    i__3 = iy;
-		    bli_csets( (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
+		    bli_tsets( c,c, (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
 		    iy += *incy;
 /* L40: */
 		}
@@ -302,9 +302,9 @@
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = j;
-		bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
-		bli_csets( (0.f), (0.f), temp2 );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
+		bli_tsets( c,c, (0.f), (0.f), temp2 );
 		l = kplus1 - j;
 /* Computing MAX */
 		i__2 = 1, i__3 = j - *k;
@@ -313,25 +313,25 @@
 		    i__2 = i__;
 		    i__3 = i__;
 		    i__5 = l + i__ + j * a_dim1;
-		    bli_csets( (bli_creal(temp1) * bli_creal(a[i__5]) - bli_cimag(temp1) * bli_cimag(a[i__5])), (bli_creal(temp1) * bli_cimag(a[i__5]) + bli_cimag(temp1) * bli_creal(a[i__5])), q__2 );
-		    bli_csets( (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
+		    bli_tsets( c,c, (bli_creal(temp1) * bli_creal(a[i__5]) - bli_cimag(temp1) * bli_cimag(a[i__5])), (bli_creal(temp1) * bli_cimag(a[i__5]) + bli_cimag(temp1) * bli_creal(a[i__5])), q__2 );
+		    bli_tsets( c,c, (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
 		    bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
 		    i__2 = i__;
-		    bli_csets( (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 );
-		    bli_csets( (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
+		    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 );
+		    bli_tsets( c,c, (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
 /* L50: */
 		}
 		i__4 = j;
 		i__2 = j;
 		i__3 = kplus1 + j * a_dim1;
 		r__1 = bli_creal(a[i__3]);
-		bli_csets( (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__3 );
-		bli_csets( (bli_creal(y[i__2]) + bli_creal(q__3)), (bli_cimag(y[i__2]) + bli_cimag(q__3)), q__2 );
-		bli_csets( (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__4 );
-		bli_csets( (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] );
+		bli_tsets( c,c, (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__3 );
+		bli_tsets( c,c, (bli_creal(y[i__2]) + bli_creal(q__3)), (bli_cimag(y[i__2]) + bli_cimag(q__3)), q__2 );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__4 );
+		bli_tsets( c,c, (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] );
 /* L60: */
 	    }
 	} else {
@@ -340,9 +340,9 @@
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
 		i__4 = jx;
-		bli_csets( (bli_creal(*alpha) * bli_creal(x[i__4]) - bli_cimag(*alpha) * bli_cimag(x[i__4])), (bli_creal(*alpha) * bli_cimag(x[i__4]) + bli_cimag(*alpha) * bli_creal(x[i__4])), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
-		bli_csets( (0.f), (0.f), temp2 );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(x[i__4]) - bli_cimag(*alpha) * bli_cimag(x[i__4])), (bli_creal(*alpha) * bli_cimag(x[i__4]) + bli_cimag(*alpha) * bli_creal(x[i__4])), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
+		bli_tsets( c,c, (0.f), (0.f), temp2 );
 		ix = kx;
 		iy = ky;
 		l = kplus1 - j;
@@ -353,14 +353,14 @@
 		    i__4 = iy;
 		    i__2 = iy;
 		    i__5 = l + i__ + j * a_dim1;
-		    bli_csets( (bli_creal(temp1) * bli_creal(a[i__5]) - bli_cimag(temp1) * bli_cimag(a[i__5])), (bli_creal(temp1) * bli_cimag(a[i__5]) + bli_cimag(temp1) * bli_creal(a[i__5])), q__2 );
-		    bli_csets( (bli_creal(y[i__2]) + bli_creal(q__2)), (bli_cimag(y[i__2]) + bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] );
+		    bli_tsets( c,c, (bli_creal(temp1) * bli_creal(a[i__5]) - bli_cimag(temp1) * bli_cimag(a[i__5])), (bli_creal(temp1) * bli_cimag(a[i__5]) + bli_cimag(temp1) * bli_creal(a[i__5])), q__2 );
+		    bli_tsets( c,c, (bli_creal(y[i__2]) + bli_creal(q__2)), (bli_cimag(y[i__2]) + bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] );
 		    bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
 		    i__4 = ix;
-		    bli_csets( (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 );
-		    bli_csets( (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
+		    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 );
+		    bli_tsets( c,c, (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
 		    ix += *incx;
 		    iy += *incy;
 /* L70: */
@@ -369,11 +369,11 @@
 		i__4 = jy;
 		i__2 = kplus1 + j * a_dim1;
 		r__1 = bli_creal(a[i__2]);
-		bli_csets( (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__3 );
-		bli_csets( (bli_creal(y[i__4]) + bli_creal(q__3)), (bli_cimag(y[i__4]) + bli_cimag(q__3)), q__2 );
-		bli_csets( (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__4 );
-		bli_csets( (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
+		bli_tsets( c,c, (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__3 );
+		bli_tsets( c,c, (bli_creal(y[i__4]) + bli_creal(q__3)), (bli_cimag(y[i__4]) + bli_cimag(q__3)), q__2 );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__4 );
+		bli_tsets( c,c, (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
 		jx += *incx;
 		jy += *incy;
 		if (j > *k) {
@@ -391,16 +391,16 @@
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
 		i__3 = j;
-		bli_csets( (bli_creal(*alpha) * bli_creal(x[i__3]) - bli_cimag(*alpha) * bli_cimag(x[i__3])), (bli_creal(*alpha) * bli_cimag(x[i__3]) + bli_cimag(*alpha) * bli_creal(x[i__3])), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
-		bli_csets( (0.f), (0.f), temp2 );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(x[i__3]) - bli_cimag(*alpha) * bli_cimag(x[i__3])), (bli_creal(*alpha) * bli_cimag(x[i__3]) + bli_cimag(*alpha) * bli_creal(x[i__3])), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
+		bli_tsets( c,c, (0.f), (0.f), temp2 );
 		i__3 = j;
 		i__4 = j;
 		i__2 = j * a_dim1 + 1;
 		r__1 = bli_creal(a[i__2]);
-		bli_csets( (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__2 );
-		bli_csets( (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
+		bli_tsets( c,c, (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__2 );
+		bli_tsets( c,c, (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
 		l = 1 - j;
 /* Computing MIN */
 		i__4 = *n, i__2 = j + *k;
@@ -409,21 +409,21 @@
 		    i__4 = i__;
 		    i__2 = i__;
 		    i__5 = l + i__ + j * a_dim1;
-		    bli_csets( (bli_creal(temp1) * bli_creal(a[i__5]) - bli_cimag(temp1) * bli_cimag(a[i__5])), (bli_creal(temp1) * bli_cimag(a[i__5]) + bli_cimag(temp1) * bli_creal(a[i__5])), q__2 );
-		    bli_csets( (bli_creal(y[i__2]) + bli_creal(q__2)), (bli_cimag(y[i__2]) + bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] );
+		    bli_tsets( c,c, (bli_creal(temp1) * bli_creal(a[i__5]) - bli_cimag(temp1) * bli_cimag(a[i__5])), (bli_creal(temp1) * bli_cimag(a[i__5]) + bli_cimag(temp1) * bli_creal(a[i__5])), q__2 );
+		    bli_tsets( c,c, (bli_creal(y[i__2]) + bli_creal(q__2)), (bli_cimag(y[i__2]) + bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] );
 		    bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
 		    i__4 = i__;
-		    bli_csets( (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 );
-		    bli_csets( (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
+		    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 );
+		    bli_tsets( c,c, (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
 /* L90: */
 		}
 		i__3 = j;
 		i__4 = j;
-		bli_csets( (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__2 );
-		bli_csets( (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__2 );
+		bli_tsets( c,c, (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
 /* L100: */
 	    }
 	} else {
@@ -432,16 +432,16 @@
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
 		i__3 = jx;
-		bli_csets( (bli_creal(*alpha) * bli_creal(x[i__3]) - bli_cimag(*alpha) * bli_cimag(x[i__3])), (bli_creal(*alpha) * bli_cimag(x[i__3]) + bli_cimag(*alpha) * bli_creal(x[i__3])), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
-		bli_csets( (0.f), (0.f), temp2 );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(x[i__3]) - bli_cimag(*alpha) * bli_cimag(x[i__3])), (bli_creal(*alpha) * bli_cimag(x[i__3]) + bli_cimag(*alpha) * bli_creal(x[i__3])), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
+		bli_tsets( c,c, (0.f), (0.f), temp2 );
 		i__3 = jy;
 		i__4 = jy;
 		i__2 = j * a_dim1 + 1;
 		r__1 = bli_creal(a[i__2]);
-		bli_csets( (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__2 );
-		bli_csets( (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
+		bli_tsets( c,c, (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__2 );
+		bli_tsets( c,c, (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
 		l = 1 - j;
 		ix = jx;
 		iy = jy;
@@ -454,21 +454,21 @@
 		    i__4 = iy;
 		    i__2 = iy;
 		    i__5 = l + i__ + j * a_dim1;
-		    bli_csets( (bli_creal(temp1) * bli_creal(a[i__5]) - bli_cimag(temp1) * bli_cimag(a[i__5])), (bli_creal(temp1) * bli_cimag(a[i__5]) + bli_cimag(temp1) * bli_creal(a[i__5])), q__2 );
-		    bli_csets( (bli_creal(y[i__2]) + bli_creal(q__2)), (bli_cimag(y[i__2]) + bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] );
+		    bli_tsets( c,c, (bli_creal(temp1) * bli_creal(a[i__5]) - bli_cimag(temp1) * bli_cimag(a[i__5])), (bli_creal(temp1) * bli_cimag(a[i__5]) + bli_cimag(temp1) * bli_creal(a[i__5])), q__2 );
+		    bli_tsets( c,c, (bli_creal(y[i__2]) + bli_creal(q__2)), (bli_cimag(y[i__2]) + bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] );
 		    bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
 		    i__4 = ix;
-		    bli_csets( (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 );
-		    bli_csets( (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
+		    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 );
+		    bli_tsets( c,c, (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
 /* L110: */
 		}
 		i__3 = jy;
 		i__4 = jy;
-		bli_csets( (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__2 );
-		bli_csets( (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__2 );
+		bli_tsets( c,c, (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
 		jx += *incx;
 		jy += *incy;
 /* L120: */
@@ -700,7 +700,7 @@
 		i__1 = *n;
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = i__;
-		    bli_zsets( (0.), (0.), y[i__2] );
+		    bli_tsets( z,z, (0.), (0.), y[i__2] );
 /* L10: */
 		}
 	    } else {
@@ -708,8 +708,8 @@
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = i__;
 		    i__3 = i__;
-		    bli_zsets( (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
+		    bli_tsets( z,z, (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
 /* L20: */
 		}
 	    }
@@ -719,7 +719,7 @@
 		i__1 = *n;
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = iy;
-		    bli_zsets( (0.), (0.), y[i__2] );
+		    bli_tsets( z,z, (0.), (0.), y[i__2] );
 		    iy += *incy;
 /* L30: */
 		}
@@ -728,8 +728,8 @@
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = iy;
 		    i__3 = iy;
-		    bli_zsets( (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
+		    bli_tsets( z,z, (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
 		    iy += *incy;
 /* L40: */
 		}
@@ -748,9 +748,9 @@
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = j;
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
-		bli_zsets( (0.), (0.), temp2 );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
+		bli_tsets( z,z, (0.), (0.), temp2 );
 		l = kplus1 - j;
 /* Computing MAX */
 		i__2 = 1, i__3 = j - *k;
@@ -759,25 +759,25 @@
 		    i__2 = i__;
 		    i__3 = i__;
 		    i__5 = l + i__ + j * a_dim1;
-		    bli_zsets( (bli_zreal(temp1) * bli_zreal(a[i__5]) - bli_zimag(temp1) * bli_zimag(a[i__5])), (bli_zreal(temp1) * bli_zimag(a[i__5]) + bli_zimag(temp1) * bli_zreal(a[i__5])), z__2 );
-		    bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
+		    bli_tsets( z,z, (bli_zreal(temp1) * bli_zreal(a[i__5]) - bli_zimag(temp1) * bli_zimag(a[i__5])), (bli_zreal(temp1) * bli_zimag(a[i__5]) + bli_zimag(temp1) * bli_zreal(a[i__5])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
 		    bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
 		    i__2 = i__;
-		    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 );
-		    bli_zsets( (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
+		    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
 /* L50: */
 		}
 		i__4 = j;
 		i__2 = j;
 		i__3 = kplus1 + j * a_dim1;
 		d__1 = bli_zreal(a[i__3]);
-		bli_zsets( (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__3 );
-		bli_zsets( (bli_zreal(y[i__2]) + bli_zreal(z__3)), (bli_zimag(y[i__2]) + bli_zimag(z__3)), z__2 );
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__4 );
-		bli_zsets( (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] );
+		bli_tsets( z,z, (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__3 );
+		bli_tsets( z,z, (bli_zreal(y[i__2]) + bli_zreal(z__3)), (bli_zimag(y[i__2]) + bli_zimag(z__3)), z__2 );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__4 );
+		bli_tsets( z,z, (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] );
 /* L60: */
 	    }
 	} else {
@@ -786,9 +786,9 @@
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
 		i__4 = jx;
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__4]) - bli_zimag(*alpha) * bli_zimag(x[i__4])), (bli_zreal(*alpha) * bli_zimag(x[i__4]) + bli_zimag(*alpha) * bli_zreal(x[i__4])), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
-		bli_zsets( (0.), (0.), temp2 );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(x[i__4]) - bli_zimag(*alpha) * bli_zimag(x[i__4])), (bli_zreal(*alpha) * bli_zimag(x[i__4]) + bli_zimag(*alpha) * bli_zreal(x[i__4])), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
+		bli_tsets( z,z, (0.), (0.), temp2 );
 		ix = kx;
 		iy = ky;
 		l = kplus1 - j;
@@ -799,14 +799,14 @@
 		    i__4 = iy;
 		    i__2 = iy;
 		    i__5 = l + i__ + j * a_dim1;
-		    bli_zsets( (bli_zreal(temp1) * bli_zreal(a[i__5]) - bli_zimag(temp1) * bli_zimag(a[i__5])), (bli_zreal(temp1) * bli_zimag(a[i__5]) + bli_zimag(temp1) * bli_zreal(a[i__5])), z__2 );
-		    bli_zsets( (bli_zreal(y[i__2]) + bli_zreal(z__2)), (bli_zimag(y[i__2]) + bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] );
+		    bli_tsets( z,z, (bli_zreal(temp1) * bli_zreal(a[i__5]) - bli_zimag(temp1) * bli_zimag(a[i__5])), (bli_zreal(temp1) * bli_zimag(a[i__5]) + bli_zimag(temp1) * bli_zreal(a[i__5])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(y[i__2]) + bli_zreal(z__2)), (bli_zimag(y[i__2]) + bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] );
 		    bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
 		    i__4 = ix;
-		    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 );
-		    bli_zsets( (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
+		    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
 		    ix += *incx;
 		    iy += *incy;
 /* L70: */
@@ -815,11 +815,11 @@
 		i__4 = jy;
 		i__2 = kplus1 + j * a_dim1;
 		d__1 = bli_zreal(a[i__2]);
-		bli_zsets( (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__3 );
-		bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__3)), (bli_zimag(y[i__4]) + bli_zimag(z__3)), z__2 );
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__4 );
-		bli_zsets( (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
+		bli_tsets( z,z, (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__3 );
+		bli_tsets( z,z, (bli_zreal(y[i__4]) + bli_zreal(z__3)), (bli_zimag(y[i__4]) + bli_zimag(z__3)), z__2 );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__4 );
+		bli_tsets( z,z, (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
 		jx += *incx;
 		jy += *incy;
 		if (j > *k) {
@@ -837,16 +837,16 @@
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
 		i__3 = j;
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__3]) - bli_zimag(*alpha) * bli_zimag(x[i__3])), (bli_zreal(*alpha) * bli_zimag(x[i__3]) + bli_zimag(*alpha) * bli_zreal(x[i__3])), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
-		bli_zsets( (0.), (0.), temp2 );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(x[i__3]) - bli_zimag(*alpha) * bli_zimag(x[i__3])), (bli_zreal(*alpha) * bli_zimag(x[i__3]) + bli_zimag(*alpha) * bli_zreal(x[i__3])), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
+		bli_tsets( z,z, (0.), (0.), temp2 );
 		i__3 = j;
 		i__4 = j;
 		i__2 = j * a_dim1 + 1;
 		d__1 = bli_zreal(a[i__2]);
-		bli_zsets( (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__2 );
-		bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
+		bli_tsets( z,z, (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__2 );
+		bli_tsets( z,z, (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
 		l = 1 - j;
 /* Computing MIN */
 		i__4 = *n, i__2 = j + *k;
@@ -855,21 +855,21 @@
 		    i__4 = i__;
 		    i__2 = i__;
 		    i__5 = l + i__ + j * a_dim1;
-		    bli_zsets( (bli_zreal(temp1) * bli_zreal(a[i__5]) - bli_zimag(temp1) * bli_zimag(a[i__5])), (bli_zreal(temp1) * bli_zimag(a[i__5]) + bli_zimag(temp1) * bli_zreal(a[i__5])), z__2 );
-		    bli_zsets( (bli_zreal(y[i__2]) + bli_zreal(z__2)), (bli_zimag(y[i__2]) + bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] );
+		    bli_tsets( z,z, (bli_zreal(temp1) * bli_zreal(a[i__5]) - bli_zimag(temp1) * bli_zimag(a[i__5])), (bli_zreal(temp1) * bli_zimag(a[i__5]) + bli_zimag(temp1) * bli_zreal(a[i__5])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(y[i__2]) + bli_zreal(z__2)), (bli_zimag(y[i__2]) + bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] );
 		    bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
 		    i__4 = i__;
-		    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 );
-		    bli_zsets( (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
+		    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
 /* L90: */
 		}
 		i__3 = j;
 		i__4 = j;
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__2 );
-		bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__2 );
+		bli_tsets( z,z, (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
 /* L100: */
 	    }
 	} else {
@@ -878,16 +878,16 @@
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
 		i__3 = jx;
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__3]) - bli_zimag(*alpha) * bli_zimag(x[i__3])), (bli_zreal(*alpha) * bli_zimag(x[i__3]) + bli_zimag(*alpha) * bli_zreal(x[i__3])), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
-		bli_zsets( (0.), (0.), temp2 );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(x[i__3]) - bli_zimag(*alpha) * bli_zimag(x[i__3])), (bli_zreal(*alpha) * bli_zimag(x[i__3]) + bli_zimag(*alpha) * bli_zreal(x[i__3])), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
+		bli_tsets( z,z, (0.), (0.), temp2 );
 		i__3 = jy;
 		i__4 = jy;
 		i__2 = j * a_dim1 + 1;
 		d__1 = bli_zreal(a[i__2]);
-		bli_zsets( (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__2 );
-		bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
+		bli_tsets( z,z, (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__2 );
+		bli_tsets( z,z, (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
 		l = 1 - j;
 		ix = jx;
 		iy = jy;
@@ -900,21 +900,21 @@
 		    i__4 = iy;
 		    i__2 = iy;
 		    i__5 = l + i__ + j * a_dim1;
-		    bli_zsets( (bli_zreal(temp1) * bli_zreal(a[i__5]) - bli_zimag(temp1) * bli_zimag(a[i__5])), (bli_zreal(temp1) * bli_zimag(a[i__5]) + bli_zimag(temp1) * bli_zreal(a[i__5])), z__2 );
-		    bli_zsets( (bli_zreal(y[i__2]) + bli_zreal(z__2)), (bli_zimag(y[i__2]) + bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] );
+		    bli_tsets( z,z, (bli_zreal(temp1) * bli_zreal(a[i__5]) - bli_zimag(temp1) * bli_zimag(a[i__5])), (bli_zreal(temp1) * bli_zimag(a[i__5]) + bli_zimag(temp1) * bli_zreal(a[i__5])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(y[i__2]) + bli_zreal(z__2)), (bli_zimag(y[i__2]) + bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] );
 		    bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
 		    i__4 = ix;
-		    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 );
-		    bli_zsets( (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
+		    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
 /* L110: */
 		}
 		i__3 = jy;
 		i__4 = jy;
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__2 );
-		bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__2 );
+		bli_tsets( z,z, (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
 		jx += *incx;
 		jy += *incy;
 /* L120: */
diff --git a/frame/compat/f2c/bla_hpmv.c b/frame/compat/f2c/bla_hpmv.c
index 743261157..a2b9ab1ac 100644
--- a/frame/compat/f2c/bla_hpmv.c
+++ b/frame/compat/f2c/bla_hpmv.c
@@ -214,7 +214,7 @@
 		i__1 = *n;
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = i__;
-		    bli_csets( (0.f), (0.f), y[i__2] );
+		    bli_tsets( c,c, (0.f), (0.f), y[i__2] );
 /* L10: */
 		}
 	    } else {
@@ -222,8 +222,8 @@
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = i__;
 		    i__3 = i__;
-		    bli_csets( (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
+		    bli_tsets( c,c, (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
 /* L20: */
 		}
 	    }
@@ -233,7 +233,7 @@
 		i__1 = *n;
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = iy;
-		    bli_csets( (0.f), (0.f), y[i__2] );
+		    bli_tsets( c,c, (0.f), (0.f), y[i__2] );
 		    iy += *incy;
 /* L30: */
 		}
@@ -242,8 +242,8 @@
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = iy;
 		    i__3 = iy;
-		    bli_csets( (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
+		    bli_tsets( c,c, (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
 		    iy += *incy;
 /* L40: */
 		}
@@ -262,23 +262,23 @@
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = j;
-		bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
-		bli_csets( (0.f), (0.f), temp2 );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
+		bli_tsets( c,c, (0.f), (0.f), temp2 );
 		k = kk;
 		i__2 = j - 1;
 		for (i__ = 1; i__ <= i__2; ++i__) {
 		    i__3 = i__;
 		    i__4 = i__;
 		    i__5 = k;
-		    bli_csets( (bli_creal(temp1) * bli_creal(ap[i__5]) - bli_cimag(temp1) * bli_cimag(ap[i__5])), (bli_creal(temp1) * bli_cimag(ap[i__5]) + bli_cimag(temp1) * bli_creal(ap[i__5])), q__2 );
-		    bli_csets( (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
+		    bli_tsets( c,c, (bli_creal(temp1) * bli_creal(ap[i__5]) - bli_cimag(temp1) * bli_cimag(ap[i__5])), (bli_creal(temp1) * bli_cimag(ap[i__5]) + bli_cimag(temp1) * bli_creal(ap[i__5])), q__2 );
+		    bli_tsets( c,c, (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
 		    bla_r_cnjg(&q__3, &ap[k]);
 		    i__3 = i__;
-		    bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
-		    bli_csets( (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
+		    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
+		    bli_tsets( c,c, (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
 		    ++k;
 /* L50: */
 		}
@@ -286,11 +286,11 @@
 		i__3 = j;
 		i__4 = kk + j - 1;
 		r__1 = bli_creal(ap[i__4]);
-		bli_csets( (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__3 );
-		bli_csets( (bli_creal(y[i__3]) + bli_creal(q__3)), (bli_cimag(y[i__3]) + bli_cimag(q__3)), q__2 );
-		bli_csets( (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__4 );
-		bli_csets( (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
+		bli_tsets( c,c, (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__3 );
+		bli_tsets( c,c, (bli_creal(y[i__3]) + bli_creal(q__3)), (bli_cimag(y[i__3]) + bli_cimag(q__3)), q__2 );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__4 );
+		bli_tsets( c,c, (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
 		kk += j;
 /* L60: */
 	    }
@@ -300,9 +300,9 @@
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = jx;
-		bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
-		bli_csets( (0.f), (0.f), temp2 );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
+		bli_tsets( c,c, (0.f), (0.f), temp2 );
 		ix = kx;
 		iy = ky;
 		i__2 = kk + j - 2;
@@ -310,14 +310,14 @@
 		    i__3 = iy;
 		    i__4 = iy;
 		    i__5 = k;
-		    bli_csets( (bli_creal(temp1) * bli_creal(ap[i__5]) - bli_cimag(temp1) * bli_cimag(ap[i__5])), (bli_creal(temp1) * bli_cimag(ap[i__5]) + bli_cimag(temp1) * bli_creal(ap[i__5])), q__2 );
-		    bli_csets( (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
+		    bli_tsets( c,c, (bli_creal(temp1) * bli_creal(ap[i__5]) - bli_cimag(temp1) * bli_cimag(ap[i__5])), (bli_creal(temp1) * bli_cimag(ap[i__5]) + bli_cimag(temp1) * bli_creal(ap[i__5])), q__2 );
+		    bli_tsets( c,c, (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
 		    bla_r_cnjg(&q__3, &ap[k]);
 		    i__3 = ix;
-		    bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
-		    bli_csets( (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
+		    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
+		    bli_tsets( c,c, (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
 		    ix += *incx;
 		    iy += *incy;
 /* L70: */
@@ -326,11 +326,11 @@
 		i__3 = jy;
 		i__4 = kk + j - 1;
 		r__1 = bli_creal(ap[i__4]);
-		bli_csets( (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__3 );
-		bli_csets( (bli_creal(y[i__3]) + bli_creal(q__3)), (bli_cimag(y[i__3]) + bli_cimag(q__3)), q__2 );
-		bli_csets( (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__4 );
-		bli_csets( (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
+		bli_tsets( c,c, (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__3 );
+		bli_tsets( c,c, (bli_creal(y[i__3]) + bli_creal(q__3)), (bli_cimag(y[i__3]) + bli_cimag(q__3)), q__2 );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__4 );
+		bli_tsets( c,c, (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
 		jx += *incx;
 		jy += *incy;
 		kk += j;
@@ -345,38 +345,38 @@
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = j;
-		bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
-		bli_csets( (0.f), (0.f), temp2 );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
+		bli_tsets( c,c, (0.f), (0.f), temp2 );
 		i__2 = j;
 		i__3 = j;
 		i__4 = kk;
 		r__1 = bli_creal(ap[i__4]);
-		bli_csets( (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__2 );
-		bli_csets( (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
+		bli_tsets( c,c, (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__2 );
+		bli_tsets( c,c, (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
 		k = kk + 1;
 		i__2 = *n;
 		for (i__ = j + 1; i__ <= i__2; ++i__) {
 		    i__3 = i__;
 		    i__4 = i__;
 		    i__5 = k;
-		    bli_csets( (bli_creal(temp1) * bli_creal(ap[i__5]) - bli_cimag(temp1) * bli_cimag(ap[i__5])), (bli_creal(temp1) * bli_cimag(ap[i__5]) + bli_cimag(temp1) * bli_creal(ap[i__5])), q__2 );
-		    bli_csets( (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
+		    bli_tsets( c,c, (bli_creal(temp1) * bli_creal(ap[i__5]) - bli_cimag(temp1) * bli_cimag(ap[i__5])), (bli_creal(temp1) * bli_cimag(ap[i__5]) + bli_cimag(temp1) * bli_creal(ap[i__5])), q__2 );
+		    bli_tsets( c,c, (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
 		    bla_r_cnjg(&q__3, &ap[k]);
 		    i__3 = i__;
-		    bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
-		    bli_csets( (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
+		    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
+		    bli_tsets( c,c, (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
 		    ++k;
 /* L90: */
 		}
 		i__2 = j;
 		i__3 = j;
-		bli_csets( (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__2 );
-		bli_csets( (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__2 );
+		bli_tsets( c,c, (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
 		kk += *n - j + 1;
 /* L100: */
 	    }
@@ -386,16 +386,16 @@
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = jx;
-		bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
-		bli_csets( (0.f), (0.f), temp2 );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
+		bli_tsets( c,c, (0.f), (0.f), temp2 );
 		i__2 = jy;
 		i__3 = jy;
 		i__4 = kk;
 		r__1 = bli_creal(ap[i__4]);
-		bli_csets( (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__2 );
-		bli_csets( (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
+		bli_tsets( c,c, (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__2 );
+		bli_tsets( c,c, (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
 		ix = jx;
 		iy = jy;
 		i__2 = kk + *n - j;
@@ -405,21 +405,21 @@
 		    i__3 = iy;
 		    i__4 = iy;
 		    i__5 = k;
-		    bli_csets( (bli_creal(temp1) * bli_creal(ap[i__5]) - bli_cimag(temp1) * bli_cimag(ap[i__5])), (bli_creal(temp1) * bli_cimag(ap[i__5]) + bli_cimag(temp1) * bli_creal(ap[i__5])), q__2 );
-		    bli_csets( (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
+		    bli_tsets( c,c, (bli_creal(temp1) * bli_creal(ap[i__5]) - bli_cimag(temp1) * bli_cimag(ap[i__5])), (bli_creal(temp1) * bli_cimag(ap[i__5]) + bli_cimag(temp1) * bli_creal(ap[i__5])), q__2 );
+		    bli_tsets( c,c, (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] );
 		    bla_r_cnjg(&q__3, &ap[k]);
 		    i__3 = ix;
-		    bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
-		    bli_csets( (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
+		    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
+		    bli_tsets( c,c, (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
 /* L110: */
 		}
 		i__2 = jy;
 		i__3 = jy;
-		bli_csets( (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__2 );
-		bli_csets( (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 );
-		bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
+		bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__2 );
+		bli_tsets( c,c, (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 );
+		bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] );
 		jx += *incx;
 		jy += *incy;
 		kk += *n - j + 1;
@@ -612,7 +612,7 @@
 		i__1 = *n;
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = i__;
-		    bli_zsets( (0.), (0.), y[i__2] );
+		    bli_tsets( z,z, (0.), (0.), y[i__2] );
 /* L10: */
 		}
 	    } else {
@@ -620,8 +620,8 @@
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = i__;
 		    i__3 = i__;
-		    bli_zsets( (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
+		    bli_tsets( z,z, (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
 /* L20: */
 		}
 	    }
@@ -631,7 +631,7 @@
 		i__1 = *n;
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = iy;
-		    bli_zsets( (0.), (0.), y[i__2] );
+		    bli_tsets( z,z, (0.), (0.), y[i__2] );
 		    iy += *incy;
 /* L30: */
 		}
@@ -640,8 +640,8 @@
 		for (i__ = 1; i__ <= i__1; ++i__) {
 		    i__2 = iy;
 		    i__3 = iy;
-		    bli_zsets( (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
+		    bli_tsets( z,z, (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
 		    iy += *incy;
 /* L40: */
 		}
@@ -660,23 +660,23 @@
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = j;
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
-		bli_zsets( (0.), (0.), temp2 );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
+		bli_tsets( z,z, (0.), (0.), temp2 );
 		k = kk;
 		i__2 = j - 1;
 		for (i__ = 1; i__ <= i__2; ++i__) {
 		    i__3 = i__;
 		    i__4 = i__;
 		    i__5 = k;
-		    bli_zsets( (bli_zreal(temp1) * bli_zreal(ap[i__5]) - bli_zimag(temp1) * bli_zimag(ap[i__5])), (bli_zreal(temp1) * bli_zimag(ap[i__5]) + bli_zimag(temp1) * bli_zreal(ap[i__5])), z__2 );
-		    bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
+		    bli_tsets( z,z, (bli_zreal(temp1) * bli_zreal(ap[i__5]) - bli_zimag(temp1) * bli_zimag(ap[i__5])), (bli_zreal(temp1) * bli_zimag(ap[i__5]) + bli_zimag(temp1) * bli_zreal(ap[i__5])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
 		    bla_d_cnjg(&z__3, &ap[k]);
 		    i__3 = i__;
-		    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
-		    bli_zsets( (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
+		    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
 		    ++k;
 /* L50: */
 		}
@@ -684,11 +684,11 @@
 		i__3 = j;
 		i__4 = kk + j - 1;
 		d__1 = bli_zreal(ap[i__4]);
-		bli_zsets( (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__3 );
-		bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__3)), (bli_zimag(y[i__3]) + bli_zimag(z__3)), z__2 );
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__4 );
-		bli_zsets( (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
+		bli_tsets( z,z, (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__3 );
+		bli_tsets( z,z, (bli_zreal(y[i__3]) + bli_zreal(z__3)), (bli_zimag(y[i__3]) + bli_zimag(z__3)), z__2 );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__4 );
+		bli_tsets( z,z, (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
 		kk += j;
 /* L60: */
 	    }
@@ -698,9 +698,9 @@
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = jx;
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
-		bli_zsets( (0.), (0.), temp2 );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
+		bli_tsets( z,z, (0.), (0.), temp2 );
 		ix = kx;
 		iy = ky;
 		i__2 = kk + j - 2;
@@ -708,14 +708,14 @@
 		    i__3 = iy;
 		    i__4 = iy;
 		    i__5 = k;
-		    bli_zsets( (bli_zreal(temp1) * bli_zreal(ap[i__5]) - bli_zimag(temp1) * bli_zimag(ap[i__5])), (bli_zreal(temp1) * bli_zimag(ap[i__5]) + bli_zimag(temp1) * bli_zreal(ap[i__5])), z__2 );
-		    bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
+		    bli_tsets( z,z, (bli_zreal(temp1) * bli_zreal(ap[i__5]) - bli_zimag(temp1) * bli_zimag(ap[i__5])), (bli_zreal(temp1) * bli_zimag(ap[i__5]) + bli_zimag(temp1) * bli_zreal(ap[i__5])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
 		    bla_d_cnjg(&z__3, &ap[k]);
 		    i__3 = ix;
-		    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
-		    bli_zsets( (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
+		    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
 		    ix += *incx;
 		    iy += *incy;
 /* L70: */
@@ -724,11 +724,11 @@
 		i__3 = jy;
 		i__4 = kk + j - 1;
 		d__1 = bli_zreal(ap[i__4]);
-		bli_zsets( (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__3 );
-		bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__3)), (bli_zimag(y[i__3]) + bli_zimag(z__3)), z__2 );
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__4 );
-		bli_zsets( (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
+		bli_tsets( z,z, (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__3 );
+		bli_tsets( z,z, (bli_zreal(y[i__3]) + bli_zreal(z__3)), (bli_zimag(y[i__3]) + bli_zimag(z__3)), z__2 );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__4 );
+		bli_tsets( z,z, (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
 		jx += *incx;
 		jy += *incy;
 		kk += j;
@@ -743,38 +743,38 @@
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = j;
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
-		bli_zsets( (0.), (0.), temp2 );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
+		bli_tsets( z,z, (0.), (0.), temp2 );
 		i__2 = j;
 		i__3 = j;
 		i__4 = kk;
 		d__1 = bli_zreal(ap[i__4]);
-		bli_zsets( (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__2 );
-		bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
+		bli_tsets( z,z, (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__2 );
+		bli_tsets( z,z, (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
 		k = kk + 1;
 		i__2 = *n;
 		for (i__ = j + 1; i__ <= i__2; ++i__) {
 		    i__3 = i__;
 		    i__4 = i__;
 		    i__5 = k;
-		    bli_zsets( (bli_zreal(temp1) * bli_zreal(ap[i__5]) - bli_zimag(temp1) * bli_zimag(ap[i__5])), (bli_zreal(temp1) * bli_zimag(ap[i__5]) + bli_zimag(temp1) * bli_zreal(ap[i__5])), z__2 );
-		    bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
+		    bli_tsets( z,z, (bli_zreal(temp1) * bli_zreal(ap[i__5]) - bli_zimag(temp1) * bli_zimag(ap[i__5])), (bli_zreal(temp1) * bli_zimag(ap[i__5]) + bli_zimag(temp1) * bli_zreal(ap[i__5])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
 		    bla_d_cnjg(&z__3, &ap[k]);
 		    i__3 = i__;
-		    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
-		    bli_zsets( (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
+		    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
 		    ++k;
 /* L90: */
 		}
 		i__2 = j;
 		i__3 = j;
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__2 );
-		bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__2 );
+		bli_tsets( z,z, (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
 		kk += *n - j + 1;
 /* L100: */
 	    }
@@ -784,16 +784,16 @@
 	    i__1 = *n;
 	    for (j = 1; j <= i__1; ++j) {
 		i__2 = jx;
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
-		bli_zsets( (0.), (0.), temp2 );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
+		bli_tsets( z,z, (0.), (0.), temp2 );
 		i__2 = jy;
 		i__3 = jy;
 		i__4 = kk;
 		d__1 = bli_zreal(ap[i__4]);
-		bli_zsets( (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__2 );
-		bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
+		bli_tsets( z,z, (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__2 );
+		bli_tsets( z,z, (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
 		ix = jx;
 		iy = jy;
 		i__2 = kk + *n - j;
@@ -803,21 +803,21 @@
 		    i__3 = iy;
 		    i__4 = iy;
 		    i__5 = k;
-		    bli_zsets( (bli_zreal(temp1) * bli_zreal(ap[i__5]) - bli_zimag(temp1) * bli_zimag(ap[i__5])), (bli_zreal(temp1) * bli_zimag(ap[i__5]) + bli_zimag(temp1) * bli_zreal(ap[i__5])), z__2 );
-		    bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
+		    bli_tsets( z,z, (bli_zreal(temp1) * bli_zreal(ap[i__5]) - bli_zimag(temp1) * bli_zimag(ap[i__5])), (bli_zreal(temp1) * bli_zimag(ap[i__5]) + bli_zimag(temp1) * bli_zreal(ap[i__5])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] );
 		    bla_d_cnjg(&z__3, &ap[k]);
 		    i__3 = ix;
-		    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
-		    bli_zsets( (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
+		    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
 /* L110: */
 		}
 		i__2 = jy;
 		i__3 = jy;
-		bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__2 );
-		bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 );
-		bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
+		bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__2 );
+		bli_tsets( z,z, (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 );
+		bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] );
 		jx += *incx;
 		jy += *incy;
 		kk += *n - j + 1;
diff --git a/frame/compat/f2c/bla_hpr.c b/frame/compat/f2c/bla_hpr.c
index 636cefef3..43f2934ed 100644
--- a/frame/compat/f2c/bla_hpr.c
+++ b/frame/compat/f2c/bla_hpr.c
@@ -196,31 +196,31 @@
 		i__2 = j;
 		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) {
 		    bla_r_cnjg(&q__2, &x[j]);
-		    bli_csets( (*alpha * bli_creal(q__2)), (*alpha * bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+		    bli_tsets( c,c, (*alpha * bli_creal(q__2)), (*alpha * bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 		    k = kk;
 		    i__2 = j - 1;
 		    for (i__ = 1; i__ <= i__2; ++i__) {
 			i__3 = k;
 			i__4 = k;
 			i__5 = i__;
-			bli_csets( (bli_creal(x[i__5]) * bli_creal(temp) - bli_cimag(x[i__5]) * bli_cimag(temp)), (bli_creal(x[i__5]) * bli_cimag(temp) + bli_cimag(x[i__5]) * bli_creal(temp)), q__2 );
-			bli_csets( (bli_creal(ap[i__4]) + bli_creal(q__2)), (bli_cimag(ap[i__4]) + bli_cimag(q__2)), q__1 );
-			bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] );
+			bli_tsets( c,c, (bli_creal(x[i__5]) * bli_creal(temp) - bli_cimag(x[i__5]) * bli_cimag(temp)), (bli_creal(x[i__5]) * bli_cimag(temp) + bli_cimag(x[i__5]) * bli_creal(temp)), q__2 );
+			bli_tsets( c,c, (bli_creal(ap[i__4]) + bli_creal(q__2)), (bli_cimag(ap[i__4]) + bli_cimag(q__2)), q__1 );
+			bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] );
 			++k;
 /* L10: */
 		    }
 		    i__2 = kk + j - 1;
 		    i__3 = kk + j - 1;
 		    i__4 = j;
-		    bli_csets( (bli_creal(x[i__4]) * bli_creal(temp) - bli_cimag(x[i__4]) * bli_cimag(temp)), (bli_creal(x[i__4]) * bli_cimag(temp) + bli_cimag(x[i__4]) * bli_creal(temp)), q__1 );
+		    bli_tsets( c,c, (bli_creal(x[i__4]) * bli_creal(temp) - bli_cimag(x[i__4]) * bli_cimag(temp)), (bli_creal(x[i__4]) * bli_cimag(temp) + bli_cimag(x[i__4]) * bli_creal(temp)), q__1 );
 		    r__1 = bli_creal(ap[i__3]) + bli_creal(q__1);
-		    bli_csets( (r__1), (0.f), ap[i__2] );
+		    bli_tsets( c,c, (r__1), (0.f), ap[i__2] );
 		} else {
 		    i__2 = kk + j - 1;
 		    i__3 = kk + j - 1;
 		    r__1 = bli_creal(ap[i__3]);
-		    bli_csets( (r__1), (0.f), ap[i__2] );
+		    bli_tsets( c,c, (r__1), (0.f), ap[i__2] );
 		}
 		kk += j;
 /* L20: */
@@ -232,31 +232,31 @@
 		i__2 = jx;
 		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) {
 		    bla_r_cnjg(&q__2, &x[jx]);
-		    bli_csets( (*alpha * bli_creal(q__2)), (*alpha * bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+		    bli_tsets( c,c, (*alpha * bli_creal(q__2)), (*alpha * bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 		    ix = kx;
 		    i__2 = kk + j - 2;
 		    for (k = kk; k <= i__2; ++k) {
 			i__3 = k;
 			i__4 = k;
 			i__5 = ix;
-			bli_csets( (bli_creal(x[i__5]) * bli_creal(temp) - bli_cimag(x[i__5]) * bli_cimag(temp)), (bli_creal(x[i__5]) * bli_cimag(temp) + bli_cimag(x[i__5]) * bli_creal(temp)), q__2 );
-			bli_csets( (bli_creal(ap[i__4]) + bli_creal(q__2)), (bli_cimag(ap[i__4]) + bli_cimag(q__2)), q__1 );
-			bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] );
+			bli_tsets( c,c, (bli_creal(x[i__5]) * bli_creal(temp) - bli_cimag(x[i__5]) * bli_cimag(temp)), (bli_creal(x[i__5]) * bli_cimag(temp) + bli_cimag(x[i__5]) * bli_creal(temp)), q__2 );
+			bli_tsets( c,c, (bli_creal(ap[i__4]) + bli_creal(q__2)), (bli_cimag(ap[i__4]) + bli_cimag(q__2)), q__1 );
+			bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] );
 			ix += *incx;
 /* L30: */
 		    }
 		    i__2 = kk + j - 1;
 		    i__3 = kk + j - 1;
 		    i__4 = jx;
-		    bli_csets( (bli_creal(x[i__4]) * bli_creal(temp) - bli_cimag(x[i__4]) * bli_cimag(temp)), (bli_creal(x[i__4]) * bli_cimag(temp) + bli_cimag(x[i__4]) * bli_creal(temp)), q__1 );
+		    bli_tsets( c,c, (bli_creal(x[i__4]) * bli_creal(temp) - bli_cimag(x[i__4]) * bli_cimag(temp)), (bli_creal(x[i__4]) * bli_cimag(temp) + bli_cimag(x[i__4]) * bli_creal(temp)), q__1 );
 		    r__1 = bli_creal(ap[i__3]) + bli_creal(q__1);
-		    bli_csets( (r__1), (0.f), ap[i__2] );
+		    bli_tsets( c,c, (r__1), (0.f), ap[i__2] );
 		} else {
 		    i__2 = kk + j - 1;
 		    i__3 = kk + j - 1;
 		    r__1 = bli_creal(ap[i__3]);
-		    bli_csets( (r__1), (0.f), ap[i__2] );
+		    bli_tsets( c,c, (r__1), (0.f), ap[i__2] );
 		}
 		jx += *incx;
 		kk += j;
@@ -273,23 +273,23 @@
 		i__2 = j;
 		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) {
 		    bla_r_cnjg(&q__2, &x[j]);
-		    bli_csets( (*alpha * bli_creal(q__2)), (*alpha * bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+		    bli_tsets( c,c, (*alpha * bli_creal(q__2)), (*alpha * bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 		    i__2 = kk;
 		    i__3 = kk;
 		    i__4 = j;
-		    bli_csets( (bli_creal(temp) * bli_creal(x[i__4]) - bli_cimag(temp) * bli_cimag(x[i__4])), (bli_creal(temp) * bli_cimag(x[i__4]) + bli_cimag(temp) * bli_creal(x[i__4])), q__1 );
+		    bli_tsets( c,c, (bli_creal(temp) * bli_creal(x[i__4]) - bli_cimag(temp) * bli_cimag(x[i__4])), (bli_creal(temp) * bli_cimag(x[i__4]) + bli_cimag(temp) * bli_creal(x[i__4])), q__1 );
 		    r__1 = bli_creal(ap[i__3]) + bli_creal(q__1);
-		    bli_csets( (r__1), (0.f), ap[i__2] );
+		    bli_tsets( c,c, (r__1), (0.f), ap[i__2] );
 		    k = kk + 1;
 		    i__2 = *n;
 		    for (i__ = j + 1; i__ <= i__2; ++i__) {
 			i__3 = k;
 			i__4 = k;
 			i__5 = i__;
-			bli_csets( (bli_creal(x[i__5]) * bli_creal(temp) - bli_cimag(x[i__5]) * bli_cimag(temp)), (bli_creal(x[i__5]) * bli_cimag(temp) + bli_cimag(x[i__5]) * bli_creal(temp)), q__2 );
-			bli_csets( (bli_creal(ap[i__4]) + bli_creal(q__2)), (bli_cimag(ap[i__4]) + bli_cimag(q__2)), q__1 );
-			bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] );
+			bli_tsets( c,c, (bli_creal(x[i__5]) * bli_creal(temp) - bli_cimag(x[i__5]) * bli_cimag(temp)), (bli_creal(x[i__5]) * bli_cimag(temp) + bli_cimag(x[i__5]) * bli_creal(temp)), q__2 );
+			bli_tsets( c,c, (bli_creal(ap[i__4]) + bli_creal(q__2)), (bli_cimag(ap[i__4]) + bli_cimag(q__2)), q__1 );
+			bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] );
 			++k;
 /* L50: */
 		    }
@@ -297,7 +297,7 @@
 		    i__2 = kk;
 		    i__3 = kk;
 		    r__1 = bli_creal(ap[i__3]);
-		    bli_csets( (r__1), (0.f), ap[i__2] );
+		    bli_tsets( c,c, (r__1), (0.f), ap[i__2] );
 		}
 		kk = kk + *n - j + 1;
 /* L60: */
@@ -309,14 +309,14 @@
 		i__2 = jx;
 		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) {
 		    bla_r_cnjg(&q__2, &x[jx]);
-		    bli_csets( (*alpha * bli_creal(q__2)), (*alpha * bli_cimag(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+		    bli_tsets( c,c, (*alpha * bli_creal(q__2)), (*alpha * bli_cimag(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 		    i__2 = kk;
 		    i__3 = kk;
 		    i__4 = jx;
-		    bli_csets( (bli_creal(temp) * bli_creal(x[i__4]) - bli_cimag(temp) * bli_cimag(x[i__4])), (bli_creal(temp) * bli_cimag(x[i__4]) + bli_cimag(temp) * bli_creal(x[i__4])), q__1 );
+		    bli_tsets( c,c, (bli_creal(temp) * bli_creal(x[i__4]) - bli_cimag(temp) * bli_cimag(x[i__4])), (bli_creal(temp) * bli_cimag(x[i__4]) + bli_cimag(temp) * bli_creal(x[i__4])), q__1 );
 		    r__1 = bli_creal(ap[i__3]) + bli_creal(q__1);
-		    bli_csets( (r__1), (0.f), ap[i__2] );
+		    bli_tsets( c,c, (r__1), (0.f), ap[i__2] );
 		    ix = jx;
 		    i__2 = kk + *n - j;
 		    for (k = kk + 1; k <= i__2; ++k) {
@@ -324,16 +324,16 @@
 			i__3 = k;
 			i__4 = k;
 			i__5 = ix;
-			bli_csets( (bli_creal(x[i__5]) * bli_creal(temp) - bli_cimag(x[i__5]) * bli_cimag(temp)), (bli_creal(x[i__5]) * bli_cimag(temp) + bli_cimag(x[i__5]) * bli_creal(temp)), q__2 );
-			bli_csets( (bli_creal(ap[i__4]) + bli_creal(q__2)), (bli_cimag(ap[i__4]) + bli_cimag(q__2)), q__1 );
-			bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] );
+			bli_tsets( c,c, (bli_creal(x[i__5]) * bli_creal(temp) - bli_cimag(x[i__5]) * bli_cimag(temp)), (bli_creal(x[i__5]) * bli_cimag(temp) + bli_cimag(x[i__5]) * bli_creal(temp)), q__2 );
+			bli_tsets( c,c, (bli_creal(ap[i__4]) + bli_creal(q__2)), (bli_cimag(ap[i__4]) + bli_cimag(q__2)), q__1 );
+			bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] );
 /* L70: */
 		    }
 		} else {
 		    i__2 = kk;
 		    i__3 = kk;
 		    r__1 = bli_creal(ap[i__3]);
-		    bli_csets( (r__1), (0.f), ap[i__2] );
+		    bli_tsets( c,c, (r__1), (0.f), ap[i__2] );
 		}
 		jx += *incx;
 		kk = kk + *n - j + 1;
@@ -508,31 +508,31 @@
 		i__2 = j;
 		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) {
 		    bla_d_cnjg(&z__2, &x[j]);
-		    bli_zsets( (*alpha * bli_zreal(z__2)), (*alpha * bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+		    bli_tsets( z,z, (*alpha * bli_zreal(z__2)), (*alpha * bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 		    k = kk;
 		    i__2 = j - 1;
 		    for (i__ = 1; i__ <= i__2; ++i__) {
 			i__3 = k;
 			i__4 = k;
 			i__5 = i__;
-			bli_zsets( (bli_zreal(x[i__5]) * bli_zreal(temp) - bli_zimag(x[i__5]) * bli_zimag(temp)), (bli_zreal(x[i__5]) * bli_zimag(temp) + bli_zimag(x[i__5]) * bli_zreal(temp)), z__2 );
-			bli_zsets( (bli_zreal(ap[i__4]) + bli_zreal(z__2)), (bli_zimag(ap[i__4]) + bli_zimag(z__2)), z__1 );
-			bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] );
+			bli_tsets( z,z, (bli_zreal(x[i__5]) * bli_zreal(temp) - bli_zimag(x[i__5]) * bli_zimag(temp)), (bli_zreal(x[i__5]) * bli_zimag(temp) + bli_zimag(x[i__5]) * bli_zreal(temp)), z__2 );
+			bli_tsets( z,z, (bli_zreal(ap[i__4]) + bli_zreal(z__2)), (bli_zimag(ap[i__4]) + bli_zimag(z__2)), z__1 );
+			bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] );
 			++k;
 /* L10: */
 		    }
 		    i__2 = kk + j - 1;
 		    i__3 = kk + j - 1;
 		    i__4 = j;
-		    bli_zsets( (bli_zreal(x[i__4]) * bli_zreal(temp) - bli_zimag(x[i__4]) * bli_zimag(temp)), (bli_zreal(x[i__4]) * bli_zimag(temp) + bli_zimag(x[i__4]) * bli_zreal(temp)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(x[i__4]) * bli_zreal(temp) - bli_zimag(x[i__4]) * bli_zimag(temp)), (bli_zreal(x[i__4]) * bli_zimag(temp) + bli_zimag(x[i__4]) * bli_zreal(temp)), z__1 );
 		    d__1 = bli_zreal(ap[i__3]) + bli_zreal(z__1);
-		    bli_zsets( (d__1), (0.), ap[i__2] );
+		    bli_tsets( z,z, (d__1), (0.), ap[i__2] );
 		} else {
 		    i__2 = kk + j - 1;
 		    i__3 = kk + j - 1;
 		    d__1 = bli_zreal(ap[i__3]);
-		    bli_zsets( (d__1), (0.), ap[i__2] );
+		    bli_tsets( z,z, (d__1), (0.), ap[i__2] );
 		}
 		kk += j;
 /* L20: */
@@ -544,31 +544,31 @@
 		i__2 = jx;
 		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) {
 		    bla_d_cnjg(&z__2, &x[jx]);
-		    bli_zsets( (*alpha * bli_zreal(z__2)), (*alpha * bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+		    bli_tsets( z,z, (*alpha * bli_zreal(z__2)), (*alpha * bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 		    ix = kx;
 		    i__2 = kk + j - 2;
 		    for (k = kk; k <= i__2; ++k) {
 			i__3 = k;
 			i__4 = k;
 			i__5 = ix;
-			bli_zsets( (bli_zreal(x[i__5]) * bli_zreal(temp) - bli_zimag(x[i__5]) * bli_zimag(temp)), (bli_zreal(x[i__5]) * bli_zimag(temp) + bli_zimag(x[i__5]) * bli_zreal(temp)), z__2 );
-			bli_zsets( (bli_zreal(ap[i__4]) + bli_zreal(z__2)), (bli_zimag(ap[i__4]) + bli_zimag(z__2)), z__1 );
-			bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] );
+			bli_tsets( z,z, (bli_zreal(x[i__5]) * bli_zreal(temp) - bli_zimag(x[i__5]) * bli_zimag(temp)), (bli_zreal(x[i__5]) * bli_zimag(temp) + bli_zimag(x[i__5]) * bli_zreal(temp)), z__2 );
+			bli_tsets( z,z, (bli_zreal(ap[i__4]) + bli_zreal(z__2)), (bli_zimag(ap[i__4]) + bli_zimag(z__2)), z__1 );
+			bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] );
 			ix += *incx;
 /* L30: */
 		    }
 		    i__2 = kk + j - 1;
 		    i__3 = kk + j - 1;
 		    i__4 = jx;
-		    bli_zsets( (bli_zreal(x[i__4]) * bli_zreal(temp) - bli_zimag(x[i__4]) * bli_zimag(temp)), (bli_zreal(x[i__4]) * bli_zimag(temp) + bli_zimag(x[i__4]) * bli_zreal(temp)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(x[i__4]) * bli_zreal(temp) - bli_zimag(x[i__4]) * bli_zimag(temp)), (bli_zreal(x[i__4]) * bli_zimag(temp) + bli_zimag(x[i__4]) * bli_zreal(temp)), z__1 );
 		    d__1 = bli_zreal(ap[i__3]) + bli_zreal(z__1);
-		    bli_zsets( (d__1), (0.), ap[i__2] );
+		    bli_tsets( z,z, (d__1), (0.), ap[i__2] );
 		} else {
 		    i__2 = kk + j - 1;
 		    i__3 = kk + j - 1;
 		    d__1 = bli_zreal(ap[i__3]);
-		    bli_zsets( (d__1), (0.), ap[i__2] );
+		    bli_tsets( z,z, (d__1), (0.), ap[i__2] );
 		}
 		jx += *incx;
 		kk += j;
@@ -585,23 +585,23 @@
 		i__2 = j;
 		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) {
 		    bla_d_cnjg(&z__2, &x[j]);
-		    bli_zsets( (*alpha * bli_zreal(z__2)), (*alpha * bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+		    bli_tsets( z,z, (*alpha * bli_zreal(z__2)), (*alpha * bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 		    i__2 = kk;
 		    i__3 = kk;
 		    i__4 = j;
-		    bli_zsets( (bli_zreal(temp) * bli_zreal(x[i__4]) - bli_zimag(temp) * bli_zimag(x[i__4])), (bli_zreal(temp) * bli_zimag(x[i__4]) + bli_zimag(temp) * bli_zreal(x[i__4])), z__1 );
+		    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(x[i__4]) - bli_zimag(temp) * bli_zimag(x[i__4])), (bli_zreal(temp) * bli_zimag(x[i__4]) + bli_zimag(temp) * bli_zreal(x[i__4])), z__1 );
 		    d__1 = bli_zreal(ap[i__3]) + bli_zreal(z__1);
-		    bli_zsets( (d__1), (0.), ap[i__2] );
+		    bli_tsets( z,z, (d__1), (0.), ap[i__2] );
 		    k = kk + 1;
 		    i__2 = *n;
 		    for (i__ = j + 1; i__ <= i__2; ++i__) {
 			i__3 = k;
 			i__4 = k;
 			i__5 = i__;
-			bli_zsets( (bli_zreal(x[i__5]) * bli_zreal(temp) - bli_zimag(x[i__5]) * bli_zimag(temp)), (bli_zreal(x[i__5]) * bli_zimag(temp) + bli_zimag(x[i__5]) * bli_zreal(temp)), z__2 );
-			bli_zsets( (bli_zreal(ap[i__4]) + bli_zreal(z__2)), (bli_zimag(ap[i__4]) + bli_zimag(z__2)), z__1 );
-			bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] );
+			bli_tsets( z,z, (bli_zreal(x[i__5]) * bli_zreal(temp) - bli_zimag(x[i__5]) * bli_zimag(temp)), (bli_zreal(x[i__5]) * bli_zimag(temp) + bli_zimag(x[i__5]) * bli_zreal(temp)), z__2 );
+			bli_tsets( z,z, (bli_zreal(ap[i__4]) + bli_zreal(z__2)), (bli_zimag(ap[i__4]) + bli_zimag(z__2)), z__1 );
+			bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] );
 			++k;
 /* L50: */
 		    }
@@ -609,7 +609,7 @@
 		    i__2 = kk;
 		    i__3 = kk;
 		    d__1 = bli_zreal(ap[i__3]);
-		    bli_zsets( (d__1), (0.), ap[i__2] );
+		    bli_tsets( z,z, (d__1), (0.), ap[i__2] );
 		}
 		kk = kk + *n - j + 1;
 /* L60: */
@@ -621,14 +621,14 @@
 		i__2 = jx;
 		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) {
 		    bla_d_cnjg(&z__2, &x[jx]);
-		    bli_zsets( (*alpha * bli_zreal(z__2)), (*alpha * bli_zimag(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+		    bli_tsets( z,z, (*alpha * bli_zreal(z__2)), (*alpha * bli_zimag(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 		    i__2 = kk;
 		    i__3 = kk;
 		    i__4 = jx;
-		    bli_zsets( (bli_zreal(temp) * bli_zreal(x[i__4]) - bli_zimag(temp) * bli_zimag(x[i__4])), (bli_zreal(temp) * bli_zimag(x[i__4]) + bli_zimag(temp) * bli_zreal(x[i__4])), z__1 );
+		    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(x[i__4]) - bli_zimag(temp) * bli_zimag(x[i__4])), (bli_zreal(temp) * bli_zimag(x[i__4]) + bli_zimag(temp) * bli_zreal(x[i__4])), z__1 );
 		    d__1 = bli_zreal(ap[i__3]) + bli_zreal(z__1);
-		    bli_zsets( (d__1), (0.), ap[i__2] );
+		    bli_tsets( z,z, (d__1), (0.), ap[i__2] );
 		    ix = jx;
 		    i__2 = kk + *n - j;
 		    for (k = kk + 1; k <= i__2; ++k) {
@@ -636,16 +636,16 @@
 			i__3 = k;
 			i__4 = k;
 			i__5 = ix;
-			bli_zsets( (bli_zreal(x[i__5]) * bli_zreal(temp) - bli_zimag(x[i__5]) * bli_zimag(temp)), (bli_zreal(x[i__5]) * bli_zimag(temp) + bli_zimag(x[i__5]) * bli_zreal(temp)), z__2 );
-			bli_zsets( (bli_zreal(ap[i__4]) + bli_zreal(z__2)), (bli_zimag(ap[i__4]) + bli_zimag(z__2)), z__1 );
-			bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] );
+			bli_tsets( z,z, (bli_zreal(x[i__5]) * bli_zreal(temp) - bli_zimag(x[i__5]) * bli_zimag(temp)), (bli_zreal(x[i__5]) * bli_zimag(temp) + bli_zimag(x[i__5]) * bli_zreal(temp)), z__2 );
+			bli_tsets( z,z, (bli_zreal(ap[i__4]) + bli_zreal(z__2)), (bli_zimag(ap[i__4]) + bli_zimag(z__2)), z__1 );
+			bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] );
 /* L70: */
 		    }
 		} else {
 		    i__2 = kk;
 		    i__3 = kk;
 		    d__1 = bli_zreal(ap[i__3]);
-		    bli_zsets( (d__1), (0.), ap[i__2] );
+		    bli_tsets( z,z, (d__1), (0.), ap[i__2] );
 		}
 		jx += *incx;
 		kk = kk + *n - j + 1;
diff --git a/frame/compat/f2c/bla_hpr2.c b/frame/compat/f2c/bla_hpr2.c
index 98ae6b554..db366fe10 100644
--- a/frame/compat/f2c/bla_hpr2.c
+++ b/frame/compat/f2c/bla_hpr2.c
@@ -222,41 +222,41 @@
 		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f || (bli_creal(y[i__3]) != 0.f
 			|| bli_cimag(y[i__3]) != 0.f)) {
 		    bla_r_cnjg(&q__2, &y[j]);
-		    bli_csets( (bli_creal(*alpha) * bli_creal(q__2) - bli_cimag(*alpha) * bli_cimag(q__2)), (bli_creal(*alpha) * bli_cimag(q__2) + bli_cimag(*alpha) * bli_creal(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
+		    bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(q__2) - bli_cimag(*alpha) * bli_cimag(q__2)), (bli_creal(*alpha) * bli_cimag(q__2) + bli_cimag(*alpha) * bli_creal(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
 		    i__2 = j;
-		    bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__2 );
+		    bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__2 );
 		    bla_r_cnjg(&q__1, &q__2);
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
 		    k = kk;
 		    i__2 = j - 1;
 		    for (i__ = 1; i__ <= i__2; ++i__) {
 			i__3 = k;
 			i__4 = k;
 			i__5 = i__;
-			bli_csets( (bli_creal(x[i__5]) * bli_creal(temp1) - bli_cimag(x[i__5]) * bli_cimag(temp1)), (bli_creal(x[i__5]) * bli_cimag(temp1) + bli_cimag(x[i__5]) * bli_creal(temp1)), q__3 );
-			bli_csets( (bli_creal(ap[i__4]) + bli_creal(q__3)), (bli_cimag(ap[i__4]) + bli_cimag(q__3)), q__2 );
+			bli_tsets( c,c, (bli_creal(x[i__5]) * bli_creal(temp1) - bli_cimag(x[i__5]) * bli_cimag(temp1)), (bli_creal(x[i__5]) * bli_cimag(temp1) + bli_cimag(x[i__5]) * bli_creal(temp1)), q__3 );
+			bli_tsets( c,c, (bli_creal(ap[i__4]) + bli_creal(q__3)), (bli_cimag(ap[i__4]) + bli_cimag(q__3)), q__2 );
 			i__6 = i__;
-			bli_csets( (bli_creal(y[i__6]) * bli_creal(temp2) - bli_cimag(y[i__6]) * bli_cimag(temp2)), (bli_creal(y[i__6]) * bli_cimag(temp2) + bli_cimag(y[i__6]) * bli_creal(temp2)), q__4 );
-			bli_csets( (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 );
-			bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] );
+			bli_tsets( c,c, (bli_creal(y[i__6]) * bli_creal(temp2) - bli_cimag(y[i__6]) * bli_cimag(temp2)), (bli_creal(y[i__6]) * bli_cimag(temp2) + bli_cimag(y[i__6]) * bli_creal(temp2)), q__4 );
+			bli_tsets( c,c, (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 );
+			bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] );
 			++k;
 /* L10: */
 		    }
 		    i__2 = kk + j - 1;
 		    i__3 = kk + j - 1;
 		    i__4 = j;
-		    bli_csets( (bli_creal(x[i__4]) * bli_creal(temp1) - bli_cimag(x[i__4]) * bli_cimag(temp1)), (bli_creal(x[i__4]) * bli_cimag(temp1) + bli_cimag(x[i__4]) * bli_creal(temp1)), q__2 );
+		    bli_tsets( c,c, (bli_creal(x[i__4]) * bli_creal(temp1) - bli_cimag(x[i__4]) * bli_cimag(temp1)), (bli_creal(x[i__4]) * bli_cimag(temp1) + bli_cimag(x[i__4]) * bli_creal(temp1)), q__2 );
 		    i__5 = j;
-		    bli_csets( (bli_creal(y[i__5]) * bli_creal(temp2) - bli_cimag(y[i__5]) * bli_cimag(temp2)), (bli_creal(y[i__5]) * bli_cimag(temp2) + bli_cimag(y[i__5]) * bli_creal(temp2)), q__3 );
-		    bli_csets( (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 );
+		    bli_tsets( c,c, (bli_creal(y[i__5]) * bli_creal(temp2) - bli_cimag(y[i__5]) * bli_cimag(temp2)), (bli_creal(y[i__5]) * bli_cimag(temp2) + bli_cimag(y[i__5]) * bli_creal(temp2)), q__3 );
+		    bli_tsets( c,c, (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 );
 		    r__1 = bli_creal(ap[i__3]) + bli_creal(q__1);
-		    bli_csets( (r__1), (0.f), ap[i__2] );
+		    bli_tsets( c,c, (r__1), (0.f), ap[i__2] );
 		} else {
 		    i__2 = kk + j - 1;
 		    i__3 = kk + j - 1;
 		    r__1 = bli_creal(ap[i__3]);
-		    bli_csets( (r__1), (0.f), ap[i__2] );
+		    bli_tsets( c,c, (r__1), (0.f), ap[i__2] );
 		}
 		kk += j;
 /* L20: */
@@ -269,12 +269,12 @@
 		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f || (bli_creal(y[i__3]) != 0.f
 			|| bli_cimag(y[i__3]) != 0.f)) {
 		    bla_r_cnjg(&q__2, &y[jy]);
-		    bli_csets( (bli_creal(*alpha) * bli_creal(q__2) - bli_cimag(*alpha) * bli_cimag(q__2)), (bli_creal(*alpha) * bli_cimag(q__2) + bli_cimag(*alpha) * bli_creal(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
+		    bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(q__2) - bli_cimag(*alpha) * bli_cimag(q__2)), (bli_creal(*alpha) * bli_cimag(q__2) + bli_cimag(*alpha) * bli_creal(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
 		    i__2 = jx;
-		    bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__2 );
+		    bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__2 );
 		    bla_r_cnjg(&q__1, &q__2);
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
 		    ix = kx;
 		    iy = ky;
 		    i__2 = kk + j - 2;
@@ -282,12 +282,12 @@
 			i__3 = k;
 			i__4 = k;
 			i__5 = ix;
-			bli_csets( (bli_creal(x[i__5]) * bli_creal(temp1) - bli_cimag(x[i__5]) * bli_cimag(temp1)), (bli_creal(x[i__5]) * bli_cimag(temp1) + bli_cimag(x[i__5]) * bli_creal(temp1)), q__3 );
-			bli_csets( (bli_creal(ap[i__4]) + bli_creal(q__3)), (bli_cimag(ap[i__4]) + bli_cimag(q__3)), q__2 );
+			bli_tsets( c,c, (bli_creal(x[i__5]) * bli_creal(temp1) - bli_cimag(x[i__5]) * bli_cimag(temp1)), (bli_creal(x[i__5]) * bli_cimag(temp1) + bli_cimag(x[i__5]) * bli_creal(temp1)), q__3 );
+			bli_tsets( c,c, (bli_creal(ap[i__4]) + bli_creal(q__3)), (bli_cimag(ap[i__4]) + bli_cimag(q__3)), q__2 );
 			i__6 = iy;
-			bli_csets( (bli_creal(y[i__6]) * bli_creal(temp2) - bli_cimag(y[i__6]) * bli_cimag(temp2)), (bli_creal(y[i__6]) * bli_cimag(temp2) + bli_cimag(y[i__6]) * bli_creal(temp2)), q__4 );
-			bli_csets( (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 );
-			bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] );
+			bli_tsets( c,c, (bli_creal(y[i__6]) * bli_creal(temp2) - bli_cimag(y[i__6]) * bli_cimag(temp2)), (bli_creal(y[i__6]) * bli_cimag(temp2) + bli_cimag(y[i__6]) * bli_creal(temp2)), q__4 );
+			bli_tsets( c,c, (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 );
+			bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] );
 			ix += *incx;
 			iy += *incy;
 /* L30: */
@@ -295,17 +295,17 @@
 		    i__2 = kk + j - 1;
 		    i__3 = kk + j - 1;
 		    i__4 = jx;
-		    bli_csets( (bli_creal(x[i__4]) * bli_creal(temp1) - bli_cimag(x[i__4]) * bli_cimag(temp1)), (bli_creal(x[i__4]) * bli_cimag(temp1) + bli_cimag(x[i__4]) * bli_creal(temp1)), q__2 );
+		    bli_tsets( c,c, (bli_creal(x[i__4]) * bli_creal(temp1) - bli_cimag(x[i__4]) * bli_cimag(temp1)), (bli_creal(x[i__4]) * bli_cimag(temp1) + bli_cimag(x[i__4]) * bli_creal(temp1)), q__2 );
 		    i__5 = jy;
-		    bli_csets( (bli_creal(y[i__5]) * bli_creal(temp2) - bli_cimag(y[i__5]) * bli_cimag(temp2)), (bli_creal(y[i__5]) * bli_cimag(temp2) + bli_cimag(y[i__5]) * bli_creal(temp2)), q__3 );
-		    bli_csets( (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 );
+		    bli_tsets( c,c, (bli_creal(y[i__5]) * bli_creal(temp2) - bli_cimag(y[i__5]) * bli_cimag(temp2)), (bli_creal(y[i__5]) * bli_cimag(temp2) + bli_cimag(y[i__5]) * bli_creal(temp2)), q__3 );
+		    bli_tsets( c,c, (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 );
 		    r__1 = bli_creal(ap[i__3]) + bli_creal(q__1);
-		    bli_csets( (r__1), (0.f), ap[i__2] );
+		    bli_tsets( c,c, (r__1), (0.f), ap[i__2] );
 		} else {
 		    i__2 = kk + j - 1;
 		    i__3 = kk + j - 1;
 		    r__1 = bli_creal(ap[i__3]);
-		    bli_csets( (r__1), (0.f), ap[i__2] );
+		    bli_tsets( c,c, (r__1), (0.f), ap[i__2] );
 		}
 		jx += *incx;
 		jy += *incy;
@@ -325,33 +325,33 @@
 		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f || (bli_creal(y[i__3]) != 0.f
 			|| bli_cimag(y[i__3]) != 0.f)) {
 		    bla_r_cnjg(&q__2, &y[j]);
-		    bli_csets( (bli_creal(*alpha) * bli_creal(q__2) - bli_cimag(*alpha) * bli_cimag(q__2)), (bli_creal(*alpha) * bli_cimag(q__2) + bli_cimag(*alpha) * bli_creal(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
+		    bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(q__2) - bli_cimag(*alpha) * bli_cimag(q__2)), (bli_creal(*alpha) * bli_cimag(q__2) + bli_cimag(*alpha) * bli_creal(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
 		    i__2 = j;
-		    bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__2 );
+		    bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__2 );
 		    bla_r_cnjg(&q__1, &q__2);
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
 		    i__2 = kk;
 		    i__3 = kk;
 		    i__4 = j;
-		    bli_csets( (bli_creal(x[i__4]) * bli_creal(temp1) - bli_cimag(x[i__4]) * bli_cimag(temp1)), (bli_creal(x[i__4]) * bli_cimag(temp1) + bli_cimag(x[i__4]) * bli_creal(temp1)), q__2 );
+		    bli_tsets( c,c, (bli_creal(x[i__4]) * bli_creal(temp1) - bli_cimag(x[i__4]) * bli_cimag(temp1)), (bli_creal(x[i__4]) * bli_cimag(temp1) + bli_cimag(x[i__4]) * bli_creal(temp1)), q__2 );
 		    i__5 = j;
-		    bli_csets( (bli_creal(y[i__5]) * bli_creal(temp2) - bli_cimag(y[i__5]) * bli_cimag(temp2)), (bli_creal(y[i__5]) * bli_cimag(temp2) + bli_cimag(y[i__5]) * bli_creal(temp2)), q__3 );
-		    bli_csets( (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 );
+		    bli_tsets( c,c, (bli_creal(y[i__5]) * bli_creal(temp2) - bli_cimag(y[i__5]) * bli_cimag(temp2)), (bli_creal(y[i__5]) * bli_cimag(temp2) + bli_cimag(y[i__5]) * bli_creal(temp2)), q__3 );
+		    bli_tsets( c,c, (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 );
 		    r__1 = bli_creal(ap[i__3]) + bli_creal(q__1);
-		    bli_csets( (r__1), (0.f), ap[i__2] );
+		    bli_tsets( c,c, (r__1), (0.f), ap[i__2] );
 		    k = kk + 1;
 		    i__2 = *n;
 		    for (i__ = j + 1; i__ <= i__2; ++i__) {
 			i__3 = k;
 			i__4 = k;
 			i__5 = i__;
-			bli_csets( (bli_creal(x[i__5]) * bli_creal(temp1) - bli_cimag(x[i__5]) * bli_cimag(temp1)), (bli_creal(x[i__5]) * bli_cimag(temp1) + bli_cimag(x[i__5]) * bli_creal(temp1)), q__3 );
-			bli_csets( (bli_creal(ap[i__4]) + bli_creal(q__3)), (bli_cimag(ap[i__4]) + bli_cimag(q__3)), q__2 );
+			bli_tsets( c,c, (bli_creal(x[i__5]) * bli_creal(temp1) - bli_cimag(x[i__5]) * bli_cimag(temp1)), (bli_creal(x[i__5]) * bli_cimag(temp1) + bli_cimag(x[i__5]) * bli_creal(temp1)), q__3 );
+			bli_tsets( c,c, (bli_creal(ap[i__4]) + bli_creal(q__3)), (bli_cimag(ap[i__4]) + bli_cimag(q__3)), q__2 );
 			i__6 = i__;
-			bli_csets( (bli_creal(y[i__6]) * bli_creal(temp2) - bli_cimag(y[i__6]) * bli_cimag(temp2)), (bli_creal(y[i__6]) * bli_cimag(temp2) + bli_cimag(y[i__6]) * bli_creal(temp2)), q__4 );
-			bli_csets( (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 );
-			bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] );
+			bli_tsets( c,c, (bli_creal(y[i__6]) * bli_creal(temp2) - bli_cimag(y[i__6]) * bli_cimag(temp2)), (bli_creal(y[i__6]) * bli_cimag(temp2) + bli_cimag(y[i__6]) * bli_creal(temp2)), q__4 );
+			bli_tsets( c,c, (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 );
+			bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] );
 			++k;
 /* L50: */
 		    }
@@ -359,7 +359,7 @@
 		    i__2 = kk;
 		    i__3 = kk;
 		    r__1 = bli_creal(ap[i__3]);
-		    bli_csets( (r__1), (0.f), ap[i__2] );
+		    bli_tsets( c,c, (r__1), (0.f), ap[i__2] );
 		}
 		kk = kk + *n - j + 1;
 /* L60: */
@@ -372,21 +372,21 @@
 		if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f || (bli_creal(y[i__3]) != 0.f
 			|| bli_cimag(y[i__3]) != 0.f)) {
 		    bla_r_cnjg(&q__2, &y[jy]);
-		    bli_csets( (bli_creal(*alpha) * bli_creal(q__2) - bli_cimag(*alpha) * bli_cimag(q__2)), (bli_creal(*alpha) * bli_cimag(q__2) + bli_cimag(*alpha) * bli_creal(q__2)), q__1 );
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
+		    bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(q__2) - bli_cimag(*alpha) * bli_cimag(q__2)), (bli_creal(*alpha) * bli_cimag(q__2) + bli_cimag(*alpha) * bli_creal(q__2)), q__1 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp1 );
 		    i__2 = jx;
-		    bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__2 );
+		    bli_tsets( c,c, (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__2 );
 		    bla_r_cnjg(&q__1, &q__2);
-		    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
+		    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp2 );
 		    i__2 = kk;
 		    i__3 = kk;
 		    i__4 = jx;
-		    bli_csets( (bli_creal(x[i__4]) * bli_creal(temp1) - bli_cimag(x[i__4]) * bli_cimag(temp1)), (bli_creal(x[i__4]) * bli_cimag(temp1) + bli_cimag(x[i__4]) * bli_creal(temp1)), q__2 );
+		    bli_tsets( c,c, (bli_creal(x[i__4]) * bli_creal(temp1) - bli_cimag(x[i__4]) * bli_cimag(temp1)), (bli_creal(x[i__4]) * bli_cimag(temp1) + bli_cimag(x[i__4]) * bli_creal(temp1)), q__2 );
 		    i__5 = jy;
-		    bli_csets( (bli_creal(y[i__5]) * bli_creal(temp2) - bli_cimag(y[i__5]) * bli_cimag(temp2)), (bli_creal(y[i__5]) * bli_cimag(temp2) + bli_cimag(y[i__5]) * bli_creal(temp2)), q__3 );
-		    bli_csets( (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 );
+		    bli_tsets( c,c, (bli_creal(y[i__5]) * bli_creal(temp2) - bli_cimag(y[i__5]) * bli_cimag(temp2)), (bli_creal(y[i__5]) * bli_cimag(temp2) + bli_cimag(y[i__5]) * bli_creal(temp2)), q__3 );
+		    bli_tsets( c,c, (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 );
 		    r__1 = bli_creal(ap[i__3]) + bli_creal(q__1);
-		    bli_csets( (r__1), (0.f), ap[i__2] );
+		    bli_tsets( c,c, (r__1), (0.f), ap[i__2] );
 		    ix = jx;
 		    iy = jy;
 		    i__2 = kk + *n - j;
@@ -396,19 +396,19 @@
 			i__3 = k;
 			i__4 = k;
 			i__5 = ix;
-			bli_csets( (bli_creal(x[i__5]) * bli_creal(temp1) - bli_cimag(x[i__5]) * bli_cimag(temp1)), (bli_creal(x[i__5]) * bli_cimag(temp1) + bli_cimag(x[i__5]) * bli_creal(temp1)), q__3 );
-			bli_csets( (bli_creal(ap[i__4]) + bli_creal(q__3)), (bli_cimag(ap[i__4]) + bli_cimag(q__3)), q__2 );
+			bli_tsets( c,c, (bli_creal(x[i__5]) * bli_creal(temp1) - bli_cimag(x[i__5]) * bli_cimag(temp1)), (bli_creal(x[i__5]) * bli_cimag(temp1) + bli_cimag(x[i__5]) * bli_creal(temp1)), q__3 );
+			bli_tsets( c,c, (bli_creal(ap[i__4]) + bli_creal(q__3)), (bli_cimag(ap[i__4]) + bli_cimag(q__3)), q__2 );
 			i__6 = iy;
-			bli_csets( (bli_creal(y[i__6]) * bli_creal(temp2) - bli_cimag(y[i__6]) * bli_cimag(temp2)), (bli_creal(y[i__6]) * bli_cimag(temp2) + bli_cimag(y[i__6]) * bli_creal(temp2)), q__4 );
-			bli_csets( (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 );
-			bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] );
+			bli_tsets( c,c, (bli_creal(y[i__6]) * bli_creal(temp2) - bli_cimag(y[i__6]) * bli_cimag(temp2)), (bli_creal(y[i__6]) * bli_cimag(temp2) + bli_cimag(y[i__6]) * bli_creal(temp2)), q__4 );
+			bli_tsets( c,c, (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 );
+			bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] );
 /* L70: */
 		    }
 		} else {
 		    i__2 = kk;
 		    i__3 = kk;
 		    r__1 = bli_creal(ap[i__3]);
-		    bli_csets( (r__1), (0.f), ap[i__2] );
+		    bli_tsets( c,c, (r__1), (0.f), ap[i__2] );
 		}
 		jx += *incx;
 		jy += *incy;
@@ -610,41 +610,41 @@
 		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0. || (bli_zreal(y[i__3]) != 0. ||
 			bli_zimag(y[i__3]) != 0.)) {
 		    bla_d_cnjg(&z__2, &y[j]);
-		    bli_zsets( (bli_zreal(*alpha) * bli_zreal(z__2) - bli_zimag(*alpha) * bli_zimag(z__2)), (bli_zreal(*alpha) * bli_zimag(z__2) + bli_zimag(*alpha) * bli_zreal(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
+		    bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(z__2) - bli_zimag(*alpha) * bli_zimag(z__2)), (bli_zreal(*alpha) * bli_zimag(z__2) + bli_zimag(*alpha) * bli_zreal(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
 		    i__2 = j;
-		    bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__2 );
 		    bla_d_cnjg(&z__1, &z__2);
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
 		    k = kk;
 		    i__2 = j - 1;
 		    for (i__ = 1; i__ <= i__2; ++i__) {
 			i__3 = k;
 			i__4 = k;
 			i__5 = i__;
-			bli_zsets( (bli_zreal(x[i__5]) * bli_zreal(temp1) - bli_zimag(x[i__5]) * bli_zimag(temp1)), (bli_zreal(x[i__5]) * bli_zimag(temp1) + bli_zimag(x[i__5]) * bli_zreal(temp1)), z__3 );
-			bli_zsets( (bli_zreal(ap[i__4]) + bli_zreal(z__3)), (bli_zimag(ap[i__4]) + bli_zimag(z__3)), z__2 );
+			bli_tsets( z,z, (bli_zreal(x[i__5]) * bli_zreal(temp1) - bli_zimag(x[i__5]) * bli_zimag(temp1)), (bli_zreal(x[i__5]) * bli_zimag(temp1) + bli_zimag(x[i__5]) * bli_zreal(temp1)), z__3 );
+			bli_tsets( z,z, (bli_zreal(ap[i__4]) + bli_zreal(z__3)), (bli_zimag(ap[i__4]) + bli_zimag(z__3)), z__2 );
 			i__6 = i__;
-			bli_zsets( (bli_zreal(y[i__6]) * bli_zreal(temp2) - bli_zimag(y[i__6]) * bli_zimag(temp2)), (bli_zreal(y[i__6]) * bli_zimag(temp2) + bli_zimag(y[i__6]) * bli_zreal(temp2)), z__4 );
-			bli_zsets( (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 );
-			bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] );
+			bli_tsets( z,z, (bli_zreal(y[i__6]) * bli_zreal(temp2) - bli_zimag(y[i__6]) * bli_zimag(temp2)), (bli_zreal(y[i__6]) * bli_zimag(temp2) + bli_zimag(y[i__6]) * bli_zreal(temp2)), z__4 );
+			bli_tsets( z,z, (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 );
+			bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] );
 			++k;
 /* L10: */
 		    }
 		    i__2 = kk + j - 1;
 		    i__3 = kk + j - 1;
 		    i__4 = j;
-		    bli_zsets( (bli_zreal(x[i__4]) * bli_zreal(temp1) - bli_zimag(x[i__4]) * bli_zimag(temp1)), (bli_zreal(x[i__4]) * bli_zimag(temp1) + bli_zimag(x[i__4]) * bli_zreal(temp1)), z__2 );
+		    bli_tsets( z,z, (bli_zreal(x[i__4]) * bli_zreal(temp1) - bli_zimag(x[i__4]) * bli_zimag(temp1)), (bli_zreal(x[i__4]) * bli_zimag(temp1) + bli_zimag(x[i__4]) * bli_zreal(temp1)), z__2 );
 		    i__5 = j;
-		    bli_zsets( (bli_zreal(y[i__5]) * bli_zreal(temp2) - bli_zimag(y[i__5]) * bli_zimag(temp2)), (bli_zreal(y[i__5]) * bli_zimag(temp2) + bli_zimag(y[i__5]) * bli_zreal(temp2)), z__3 );
-		    bli_zsets( (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(y[i__5]) * bli_zreal(temp2) - bli_zimag(y[i__5]) * bli_zimag(temp2)), (bli_zreal(y[i__5]) * bli_zimag(temp2) + bli_zimag(y[i__5]) * bli_zreal(temp2)), z__3 );
+		    bli_tsets( z,z, (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 );
 		    d__1 = bli_zreal(ap[i__3]) + bli_zreal(z__1);
-		    bli_zsets( (d__1), (0.), ap[i__2] );
+		    bli_tsets( z,z, (d__1), (0.), ap[i__2] );
 		} else {
 		    i__2 = kk + j - 1;
 		    i__3 = kk + j - 1;
 		    d__1 = bli_zreal(ap[i__3]);
-		    bli_zsets( (d__1), (0.), ap[i__2] );
+		    bli_tsets( z,z, (d__1), (0.), ap[i__2] );
 		}
 		kk += j;
 /* L20: */
@@ -657,12 +657,12 @@
 		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0. || (bli_zreal(y[i__3]) != 0. ||
 			bli_zimag(y[i__3]) != 0.)) {
 		    bla_d_cnjg(&z__2, &y[jy]);
-		    bli_zsets( (bli_zreal(*alpha) * bli_zreal(z__2) - bli_zimag(*alpha) * bli_zimag(z__2)), (bli_zreal(*alpha) * bli_zimag(z__2) + bli_zimag(*alpha) * bli_zreal(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
+		    bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(z__2) - bli_zimag(*alpha) * bli_zimag(z__2)), (bli_zreal(*alpha) * bli_zimag(z__2) + bli_zimag(*alpha) * bli_zreal(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
 		    i__2 = jx;
-		    bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__2 );
 		    bla_d_cnjg(&z__1, &z__2);
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
 		    ix = kx;
 		    iy = ky;
 		    i__2 = kk + j - 2;
@@ -670,12 +670,12 @@
 			i__3 = k;
 			i__4 = k;
 			i__5 = ix;
-			bli_zsets( (bli_zreal(x[i__5]) * bli_zreal(temp1) - bli_zimag(x[i__5]) * bli_zimag(temp1)), (bli_zreal(x[i__5]) * bli_zimag(temp1) + bli_zimag(x[i__5]) * bli_zreal(temp1)), z__3 );
-			bli_zsets( (bli_zreal(ap[i__4]) + bli_zreal(z__3)), (bli_zimag(ap[i__4]) + bli_zimag(z__3)), z__2 );
+			bli_tsets( z,z, (bli_zreal(x[i__5]) * bli_zreal(temp1) - bli_zimag(x[i__5]) * bli_zimag(temp1)), (bli_zreal(x[i__5]) * bli_zimag(temp1) + bli_zimag(x[i__5]) * bli_zreal(temp1)), z__3 );
+			bli_tsets( z,z, (bli_zreal(ap[i__4]) + bli_zreal(z__3)), (bli_zimag(ap[i__4]) + bli_zimag(z__3)), z__2 );
 			i__6 = iy;
-			bli_zsets( (bli_zreal(y[i__6]) * bli_zreal(temp2) - bli_zimag(y[i__6]) * bli_zimag(temp2)), (bli_zreal(y[i__6]) * bli_zimag(temp2) + bli_zimag(y[i__6]) * bli_zreal(temp2)), z__4 );
-			bli_zsets( (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 );
-			bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] );
+			bli_tsets( z,z, (bli_zreal(y[i__6]) * bli_zreal(temp2) - bli_zimag(y[i__6]) * bli_zimag(temp2)), (bli_zreal(y[i__6]) * bli_zimag(temp2) + bli_zimag(y[i__6]) * bli_zreal(temp2)), z__4 );
+			bli_tsets( z,z, (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 );
+			bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] );
 			ix += *incx;
 			iy += *incy;
 /* L30: */
@@ -683,17 +683,17 @@
 		    i__2 = kk + j - 1;
 		    i__3 = kk + j - 1;
 		    i__4 = jx;
-		    bli_zsets( (bli_zreal(x[i__4]) * bli_zreal(temp1) - bli_zimag(x[i__4]) * bli_zimag(temp1)), (bli_zreal(x[i__4]) * bli_zimag(temp1) + bli_zimag(x[i__4]) * bli_zreal(temp1)), z__2 );
+		    bli_tsets( z,z, (bli_zreal(x[i__4]) * bli_zreal(temp1) - bli_zimag(x[i__4]) * bli_zimag(temp1)), (bli_zreal(x[i__4]) * bli_zimag(temp1) + bli_zimag(x[i__4]) * bli_zreal(temp1)), z__2 );
 		    i__5 = jy;
-		    bli_zsets( (bli_zreal(y[i__5]) * bli_zreal(temp2) - bli_zimag(y[i__5]) * bli_zimag(temp2)), (bli_zreal(y[i__5]) * bli_zimag(temp2) + bli_zimag(y[i__5]) * bli_zreal(temp2)), z__3 );
-		    bli_zsets( (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(y[i__5]) * bli_zreal(temp2) - bli_zimag(y[i__5]) * bli_zimag(temp2)), (bli_zreal(y[i__5]) * bli_zimag(temp2) + bli_zimag(y[i__5]) * bli_zreal(temp2)), z__3 );
+		    bli_tsets( z,z, (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 );
 		    d__1 = bli_zreal(ap[i__3]) + bli_zreal(z__1);
-		    bli_zsets( (d__1), (0.), ap[i__2] );
+		    bli_tsets( z,z, (d__1), (0.), ap[i__2] );
 		} else {
 		    i__2 = kk + j - 1;
 		    i__3 = kk + j - 1;
 		    d__1 = bli_zreal(ap[i__3]);
-		    bli_zsets( (d__1), (0.), ap[i__2] );
+		    bli_tsets( z,z, (d__1), (0.), ap[i__2] );
 		}
 		jx += *incx;
 		jy += *incy;
@@ -713,33 +713,33 @@
 		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0. || (bli_zreal(y[i__3]) != 0. ||
 			bli_zimag(y[i__3]) != 0.)) {
 		    bla_d_cnjg(&z__2, &y[j]);
-		    bli_zsets( (bli_zreal(*alpha) * bli_zreal(z__2) - bli_zimag(*alpha) * bli_zimag(z__2)), (bli_zreal(*alpha) * bli_zimag(z__2) + bli_zimag(*alpha) * bli_zreal(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
+		    bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(z__2) - bli_zimag(*alpha) * bli_zimag(z__2)), (bli_zreal(*alpha) * bli_zimag(z__2) + bli_zimag(*alpha) * bli_zreal(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
 		    i__2 = j;
-		    bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__2 );
 		    bla_d_cnjg(&z__1, &z__2);
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
 		    i__2 = kk;
 		    i__3 = kk;
 		    i__4 = j;
-		    bli_zsets( (bli_zreal(x[i__4]) * bli_zreal(temp1) - bli_zimag(x[i__4]) * bli_zimag(temp1)), (bli_zreal(x[i__4]) * bli_zimag(temp1) + bli_zimag(x[i__4]) * bli_zreal(temp1)), z__2 );
+		    bli_tsets( z,z, (bli_zreal(x[i__4]) * bli_zreal(temp1) - bli_zimag(x[i__4]) * bli_zimag(temp1)), (bli_zreal(x[i__4]) * bli_zimag(temp1) + bli_zimag(x[i__4]) * bli_zreal(temp1)), z__2 );
 		    i__5 = j;
-		    bli_zsets( (bli_zreal(y[i__5]) * bli_zreal(temp2) - bli_zimag(y[i__5]) * bli_zimag(temp2)), (bli_zreal(y[i__5]) * bli_zimag(temp2) + bli_zimag(y[i__5]) * bli_zreal(temp2)), z__3 );
-		    bli_zsets( (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(y[i__5]) * bli_zreal(temp2) - bli_zimag(y[i__5]) * bli_zimag(temp2)), (bli_zreal(y[i__5]) * bli_zimag(temp2) + bli_zimag(y[i__5]) * bli_zreal(temp2)), z__3 );
+		    bli_tsets( z,z, (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 );
 		    d__1 = bli_zreal(ap[i__3]) + bli_zreal(z__1);
-		    bli_zsets( (d__1), (0.), ap[i__2] );
+		    bli_tsets( z,z, (d__1), (0.), ap[i__2] );
 		    k = kk + 1;
 		    i__2 = *n;
 		    for (i__ = j + 1; i__ <= i__2; ++i__) {
 			i__3 = k;
 			i__4 = k;
 			i__5 = i__;
-			bli_zsets( (bli_zreal(x[i__5]) * bli_zreal(temp1) - bli_zimag(x[i__5]) * bli_zimag(temp1)), (bli_zreal(x[i__5]) * bli_zimag(temp1) + bli_zimag(x[i__5]) * bli_zreal(temp1)), z__3 );
-			bli_zsets( (bli_zreal(ap[i__4]) + bli_zreal(z__3)), (bli_zimag(ap[i__4]) + bli_zimag(z__3)), z__2 );
+			bli_tsets( z,z, (bli_zreal(x[i__5]) * bli_zreal(temp1) - bli_zimag(x[i__5]) * bli_zimag(temp1)), (bli_zreal(x[i__5]) * bli_zimag(temp1) + bli_zimag(x[i__5]) * bli_zreal(temp1)), z__3 );
+			bli_tsets( z,z, (bli_zreal(ap[i__4]) + bli_zreal(z__3)), (bli_zimag(ap[i__4]) + bli_zimag(z__3)), z__2 );
 			i__6 = i__;
-			bli_zsets( (bli_zreal(y[i__6]) * bli_zreal(temp2) - bli_zimag(y[i__6]) * bli_zimag(temp2)), (bli_zreal(y[i__6]) * bli_zimag(temp2) + bli_zimag(y[i__6]) * bli_zreal(temp2)), z__4 );
-			bli_zsets( (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 );
-			bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] );
+			bli_tsets( z,z, (bli_zreal(y[i__6]) * bli_zreal(temp2) - bli_zimag(y[i__6]) * bli_zimag(temp2)), (bli_zreal(y[i__6]) * bli_zimag(temp2) + bli_zimag(y[i__6]) * bli_zreal(temp2)), z__4 );
+			bli_tsets( z,z, (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 );
+			bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] );
 			++k;
 /* L50: */
 		    }
@@ -747,7 +747,7 @@
 		    i__2 = kk;
 		    i__3 = kk;
 		    d__1 = bli_zreal(ap[i__3]);
-		    bli_zsets( (d__1), (0.), ap[i__2] );
+		    bli_tsets( z,z, (d__1), (0.), ap[i__2] );
 		}
 		kk = kk + *n - j + 1;
 /* L60: */
@@ -760,21 +760,21 @@
 		if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0. || (bli_zreal(y[i__3]) != 0. ||
 			bli_zimag(y[i__3]) != 0.)) {
 		    bla_d_cnjg(&z__2, &y[jy]);
-		    bli_zsets( (bli_zreal(*alpha) * bli_zreal(z__2) - bli_zimag(*alpha) * bli_zimag(z__2)), (bli_zreal(*alpha) * bli_zimag(z__2) + bli_zimag(*alpha) * bli_zreal(z__2)), z__1 );
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
+		    bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(z__2) - bli_zimag(*alpha) * bli_zimag(z__2)), (bli_zreal(*alpha) * bli_zimag(z__2) + bli_zimag(*alpha) * bli_zreal(z__2)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 );
 		    i__2 = jx;
-		    bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__2 );
+		    bli_tsets( z,z, (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__2 );
 		    bla_d_cnjg(&z__1, &z__2);
-		    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
+		    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 );
 		    i__2 = kk;
 		    i__3 = kk;
 		    i__4 = jx;
-		    bli_zsets( (bli_zreal(x[i__4]) * bli_zreal(temp1) - bli_zimag(x[i__4]) * bli_zimag(temp1)), (bli_zreal(x[i__4]) * bli_zimag(temp1) + bli_zimag(x[i__4]) * bli_zreal(temp1)), z__2 );
+		    bli_tsets( z,z, (bli_zreal(x[i__4]) * bli_zreal(temp1) - bli_zimag(x[i__4]) * bli_zimag(temp1)), (bli_zreal(x[i__4]) * bli_zimag(temp1) + bli_zimag(x[i__4]) * bli_zreal(temp1)), z__2 );
 		    i__5 = jy;
-		    bli_zsets( (bli_zreal(y[i__5]) * bli_zreal(temp2) - bli_zimag(y[i__5]) * bli_zimag(temp2)), (bli_zreal(y[i__5]) * bli_zimag(temp2) + bli_zimag(y[i__5]) * bli_zreal(temp2)), z__3 );
-		    bli_zsets( (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 );
+		    bli_tsets( z,z, (bli_zreal(y[i__5]) * bli_zreal(temp2) - bli_zimag(y[i__5]) * bli_zimag(temp2)), (bli_zreal(y[i__5]) * bli_zimag(temp2) + bli_zimag(y[i__5]) * bli_zreal(temp2)), z__3 );
+		    bli_tsets( z,z, (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 );
 		    d__1 = bli_zreal(ap[i__3]) + bli_zreal(z__1);
-		    bli_zsets( (d__1), (0.), ap[i__2] );
+		    bli_tsets( z,z, (d__1), (0.), ap[i__2] );
 		    ix = jx;
 		    iy = jy;
 		    i__2 = kk + *n - j;
@@ -784,19 +784,19 @@
 			i__3 = k;
 			i__4 = k;
 			i__5 = ix;
-			bli_zsets( (bli_zreal(x[i__5]) * bli_zreal(temp1) - bli_zimag(x[i__5]) * bli_zimag(temp1)), (bli_zreal(x[i__5]) * bli_zimag(temp1) + bli_zimag(x[i__5]) * bli_zreal(temp1)), z__3 );
-			bli_zsets( (bli_zreal(ap[i__4]) + bli_zreal(z__3)), (bli_zimag(ap[i__4]) + bli_zimag(z__3)), z__2 );
+			bli_tsets( z,z, (bli_zreal(x[i__5]) * bli_zreal(temp1) - bli_zimag(x[i__5]) * bli_zimag(temp1)), (bli_zreal(x[i__5]) * bli_zimag(temp1) + bli_zimag(x[i__5]) * bli_zreal(temp1)), z__3 );
+			bli_tsets( z,z, (bli_zreal(ap[i__4]) + bli_zreal(z__3)), (bli_zimag(ap[i__4]) + bli_zimag(z__3)), z__2 );
 			i__6 = iy;
-			bli_zsets( (bli_zreal(y[i__6]) * bli_zreal(temp2) - bli_zimag(y[i__6]) * bli_zimag(temp2)), (bli_zreal(y[i__6]) * bli_zimag(temp2) + bli_zimag(y[i__6]) * bli_zreal(temp2)), z__4 );
-			bli_zsets( (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 );
-			bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] );
+			bli_tsets( z,z, (bli_zreal(y[i__6]) * bli_zreal(temp2) - bli_zimag(y[i__6]) * bli_zimag(temp2)), (bli_zreal(y[i__6]) * bli_zimag(temp2) + bli_zimag(y[i__6]) * bli_zreal(temp2)), z__4 );
+			bli_tsets( z,z, (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 );
+			bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] );
 /* L70: */
 		    }
 		} else {
 		    i__2 = kk;
 		    i__3 = kk;
 		    d__1 = bli_zreal(ap[i__3]);
-		    bli_zsets( (d__1), (0.), ap[i__2] );
+		    bli_tsets( z,z, (d__1), (0.), ap[i__2] );
 		}
 		jx += *incx;
 		jy += *incy;
diff --git a/frame/compat/f2c/bla_rot.c b/frame/compat/f2c/bla_rot.c
index cd41a2fb9..2ba82efd8 100644
--- a/frame/compat/f2c/bla_rot.c
+++ b/frame/compat/f2c/bla_rot.c
@@ -221,20 +221,20 @@
     i__1 = *n;
     for (i__ = 1; i__ <= i__1; ++i__) {
 	i__2 = ix;
-	bli_csets( (*c__ * bli_creal(cx[i__2])), (*c__ * bli_cimag(cx[i__2])), q__2 );
+	bli_tsets( c,c, (*c__ * bli_creal(cx[i__2])), (*c__ * bli_cimag(cx[i__2])), q__2 );
 	i__3 = iy;
-	bli_csets( (*s * bli_creal(cy[i__3])), (*s * bli_cimag(cy[i__3])), q__3 );
-	bli_csets( (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 );
-	bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ctemp );
+	bli_tsets( c,c, (*s * bli_creal(cy[i__3])), (*s * bli_cimag(cy[i__3])), q__3 );
+	bli_tsets( c,c, (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 );
+	bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), ctemp );
 	i__2 = iy;
 	i__3 = iy;
-	bli_csets( (*c__ * bli_creal(cy[i__3])), (*c__ * bli_cimag(cy[i__3])), q__2 );
+	bli_tsets( c,c, (*c__ * bli_creal(cy[i__3])), (*c__ * bli_cimag(cy[i__3])), q__2 );
 	i__4 = ix;
-	bli_csets( (*s * bli_creal(cx[i__4])), (*s * bli_cimag(cx[i__4])), q__3 );
-	bli_csets( (bli_creal(q__2) - bli_creal(q__3)), (bli_cimag(q__2) - bli_cimag(q__3)), q__1 );
-	bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), cy[i__2] );
+	bli_tsets( c,c, (*s * bli_creal(cx[i__4])), (*s * bli_cimag(cx[i__4])), q__3 );
+	bli_tsets( c,c, (bli_creal(q__2) - bli_creal(q__3)), (bli_cimag(q__2) - bli_cimag(q__3)), q__1 );
+	bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), cy[i__2] );
 	i__2 = ix;
-	bli_csets( (bli_creal(ctemp)), (bli_cimag(ctemp)), cx[i__2] );
+	bli_tsets( c,c, (bli_creal(ctemp)), (bli_cimag(ctemp)), cx[i__2] );
 	ix += *incx;
 	iy += *incy;
 /* L10: */
@@ -247,20 +247,20 @@
     i__1 = *n;
     for (i__ = 1; i__ <= i__1; ++i__) {
 	i__2 = i__;
-	bli_csets( (*c__ * bli_creal(cx[i__2])), (*c__ * bli_cimag(cx[i__2])), q__2 );
+	bli_tsets( c,c, (*c__ * bli_creal(cx[i__2])), (*c__ * bli_cimag(cx[i__2])), q__2 );
 	i__3 = i__;
-	bli_csets( (*s * bli_creal(cy[i__3])), (*s * bli_cimag(cy[i__3])), q__3 );
-	bli_csets( (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 );
-	bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ctemp );
+	bli_tsets( c,c, (*s * bli_creal(cy[i__3])), (*s * bli_cimag(cy[i__3])), q__3 );
+	bli_tsets( c,c, (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 );
+	bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), ctemp );
 	i__2 = i__;
 	i__3 = i__;
-	bli_csets( (*c__ * bli_creal(cy[i__3])), (*c__ * bli_cimag(cy[i__3])), q__2 );
+	bli_tsets( c,c, (*c__ * bli_creal(cy[i__3])), (*c__ * bli_cimag(cy[i__3])), q__2 );
 	i__4 = i__;
-	bli_csets( (*s * bli_creal(cx[i__4])), (*s * bli_cimag(cx[i__4])), q__3 );
-	bli_csets( (bli_creal(q__2) - bli_creal(q__3)), (bli_cimag(q__2) - bli_cimag(q__3)), q__1 );
-	bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), cy[i__2] );
+	bli_tsets( c,c, (*s * bli_creal(cx[i__4])), (*s * bli_cimag(cx[i__4])), q__3 );
+	bli_tsets( c,c, (bli_creal(q__2) - bli_creal(q__3)), (bli_cimag(q__2) - bli_cimag(q__3)), q__1 );
+	bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), cy[i__2] );
 	i__2 = i__;
-	bli_csets( (bli_creal(ctemp)), (bli_cimag(ctemp)), cx[i__2] );
+	bli_tsets( c,c, (bli_creal(ctemp)), (bli_cimag(ctemp)), cx[i__2] );
 /* L30: */
     }
     return 0;
@@ -314,20 +314,20 @@
     i__1 = *n;
     for (i__ = 1; i__ <= i__1; ++i__) {
 	i__2 = ix;
-	bli_zsets( (*c__ * bli_zreal(zx[i__2])), (*c__ * bli_zimag(zx[i__2])), z__2 );
+	bli_tsets( z,z, (*c__ * bli_zreal(zx[i__2])), (*c__ * bli_zimag(zx[i__2])), z__2 );
 	i__3 = iy;
-	bli_zsets( (*s * bli_zreal(zy[i__3])), (*s * bli_zimag(zy[i__3])), z__3 );
-	bli_zsets( (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 );
-	bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ztemp );
+	bli_tsets( z,z, (*s * bli_zreal(zy[i__3])), (*s * bli_zimag(zy[i__3])), z__3 );
+	bli_tsets( z,z, (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 );
+	bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), ztemp );
 	i__2 = iy;
 	i__3 = iy;
-	bli_zsets( (*c__ * bli_zreal(zy[i__3])), (*c__ * bli_zimag(zy[i__3])), z__2 );
+	bli_tsets( z,z, (*c__ * bli_zreal(zy[i__3])), (*c__ * bli_zimag(zy[i__3])), z__2 );
 	i__4 = ix;
-	bli_zsets( (*s * bli_zreal(zx[i__4])), (*s * bli_zimag(zx[i__4])), z__3 );
-	bli_zsets( (bli_zreal(z__2) - bli_zreal(z__3)), (bli_zimag(z__2) - bli_zimag(z__3)), z__1 );
-	bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), zy[i__2] );
+	bli_tsets( z,z, (*s * bli_zreal(zx[i__4])), (*s * bli_zimag(zx[i__4])), z__3 );
+	bli_tsets( z,z, (bli_zreal(z__2) - bli_zreal(z__3)), (bli_zimag(z__2) - bli_zimag(z__3)), z__1 );
+	bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), zy[i__2] );
 	i__2 = ix;
-	bli_zsets( (bli_zreal(ztemp)), (bli_zimag(ztemp)), zx[i__2] );
+	bli_tsets( z,z, (bli_zreal(ztemp)), (bli_zimag(ztemp)), zx[i__2] );
 	ix += *incx;
 	iy += *incy;
 /* L10: */
@@ -340,20 +340,20 @@
     i__1 = *n;
     for (i__ = 1; i__ <= i__1; ++i__) {
 	i__2 = i__;
-	bli_zsets( (*c__ * bli_zreal(zx[i__2])), (*c__ * bli_zimag(zx[i__2])), z__2 );
+	bli_tsets( z,z, (*c__ * bli_zreal(zx[i__2])), (*c__ * bli_zimag(zx[i__2])), z__2 );
 	i__3 = i__;
-	bli_zsets( (*s * bli_zreal(zy[i__3])), (*s * bli_zimag(zy[i__3])), z__3 );
-	bli_zsets( (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 );
-	bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ztemp );
+	bli_tsets( z,z, (*s * bli_zreal(zy[i__3])), (*s * bli_zimag(zy[i__3])), z__3 );
+	bli_tsets( z,z, (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 );
+	bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), ztemp );
 	i__2 = i__;
 	i__3 = i__;
-	bli_zsets( (*c__ * bli_zreal(zy[i__3])), (*c__ * bli_zimag(zy[i__3])), z__2 );
+	bli_tsets( z,z, (*c__ * bli_zreal(zy[i__3])), (*c__ * bli_zimag(zy[i__3])), z__2 );
 	i__4 = i__;
-	bli_zsets( (*s * bli_zreal(zx[i__4])), (*s * bli_zimag(zx[i__4])), z__3 );
-	bli_zsets( (bli_zreal(z__2) - bli_zreal(z__3)), (bli_zimag(z__2) - bli_zimag(z__3)), z__1 );
-	bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), zy[i__2] );
+	bli_tsets( z,z, (*s * bli_zreal(zx[i__4])), (*s * bli_zimag(zx[i__4])), z__3 );
+	bli_tsets( z,z, (bli_zreal(z__2) - bli_zreal(z__3)), (bli_zimag(z__2) - bli_zimag(z__3)), z__1 );
+	bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), zy[i__2] );
 	i__2 = i__;
-	bli_zsets( (bli_zreal(ztemp)), (bli_zimag(ztemp)), zx[i__2] );
+	bli_tsets( z,z, (bli_zreal(ztemp)), (bli_zimag(ztemp)), zx[i__2] );
 /* L30: */
     }
     return 0;
@@ -434,62 +434,71 @@
 	cx[i__2].r = stemp.r;
 	cx[i__2].i = stemp.i;
 #else
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  *c__ * bli_creal(cx[i__2]),
 	  *c__ * bli_cimag(cx[i__2]),
 	  q__2
 	);
 	i__3 = iy;
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  bli_creal(*s) * bli_creal(cy[i__3]) - bli_cimag(*s) * bli_cimag(cy[i__3]),
 	  bli_creal(*s) * bli_cimag(cy[i__3]) + bli_cimag(*s) * bli_creal(cy[i__3]),
 	  q__3
 	);
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  bli_creal(q__2) + bli_creal(q__3),
 	  bli_cimag(q__2) + bli_cimag(q__3),
 	  q__1
 	);
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  bli_creal(q__1),
 	  bli_cimag(q__1),
 	  stemp
 	);
 	i__2 = iy;
 	i__3 = iy;
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  *c__ * bli_creal(cy[i__3]),
 	  *c__ * bli_cimag(cy[i__3]),
 	  q__2
 	);
 	bla_r_cnjg(&q__4, s);
 	i__4 = ix;
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  bli_creal(q__4) * bli_creal(cx[i__4]) - bli_cimag(q__4) * bli_cimag(cx[i__4]),
 	  bli_creal(q__4) * bli_cimag(cx[i__4]) + bli_cimag(q__4) * bli_creal(cx[i__4]),
 	  q__3
 	);
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  bli_creal(q__2) - bli_creal(q__3),
 	  bli_cimag(q__2) - bli_cimag(q__3),
 	  q__1
 	);
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  bli_creal(q__1),
 	  bli_cimag(q__1),
 	  cy[i__2]
 	);
 	i__2 = ix;
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  bli_creal(stemp),
 	  bli_cimag(stemp),
 	  cx[i__2]
@@ -533,62 +542,71 @@
 	cx[i__2].r = stemp.r;
 	cx[i__2].i = stemp.i;
 #else
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  *c__ * bli_creal(cx[i__2]),
 	  *c__ * bli_cimag(cx[i__2]),
 	  q__2
 	);
 	i__3 = i__;
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  bli_creal(*s) * bli_creal(cy[i__3]) - bli_cimag(*s) * bli_cimag(cy[i__3]),
 	  bli_creal(*s) * bli_cimag(cy[i__3]) + bli_cimag(*s) * bli_creal(cy[i__3]),
 	  q__3
 	);
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  bli_creal(q__2) + bli_creal(q__3),
 	  bli_cimag(q__2) + bli_cimag(q__3),
 	  q__1
 	);
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  bli_creal(q__1),
 	  bli_cimag(q__1),
 	  stemp
 	);
 	i__2 = i__;
 	i__3 = i__;
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  *c__ * bli_creal(cy[i__3]),
 	  *c__ * bli_cimag(cy[i__3]),
 	  q__2
 	);
 	bla_r_cnjg(&q__4, s);
 	i__4 = i__;
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  bli_creal(q__4) * bli_creal(cx[i__4]) - bli_cimag(q__4) * bli_cimag(cx[i__4]),
 	  bli_creal(q__4) * bli_cimag(cx[i__4]) + bli_cimag(q__4) * bli_creal(cx[i__4]),
 	  q__3
 	);
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  bli_creal(q__2) - bli_creal(q__3),
 	  bli_cimag(q__2) - bli_cimag(q__3),
 	  q__1
 	);
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  bli_creal(q__1),
 	  bli_cimag(q__1),
 	  cy[i__2]
 	);
 	i__2 = i__;
-	bli_csets
+	bli_tsets
 	(
+	  c,c,
 	  bli_creal(stemp),
 	  bli_cimag(stemp),
 	  cx[i__2]
@@ -675,62 +693,71 @@
 	cx[i__2].r = stemp.r;
 	cx[i__2].i = stemp.i;
 #else
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  *c__ * bli_zreal(cx[i__2]),
 	  *c__ * bli_zimag(cx[i__2]),
 	  z__2
 	);
 	i__3 = iy;
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  bli_zreal(*s) * bli_zreal(cy[i__3]) - bli_zimag(*s) * bli_zimag(cy[i__3]),
 	  bli_zreal(*s) * bli_zimag(cy[i__3]) + bli_zimag(*s) * bli_zreal(cy[i__3]),
 	  z__3
 	);
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  bli_zreal(z__2) + bli_zreal(z__3),
 	  bli_zimag(z__2) + bli_zimag(z__3),
 	  z__1
 	);
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  bli_zreal(z__1),
 	  bli_zimag(z__1),
 	  stemp
 	);
 	i__2 = iy;
 	i__3 = iy;
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  *c__ * bli_zreal(cy[i__3]),
 	  *c__ * bli_zimag(cy[i__3]),
 	  z__2
 	);
 	bla_d_cnjg(&z__4, s);
 	i__4 = ix;
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  bli_zreal(z__4) * bli_zreal(cx[i__4]) - bli_zimag(z__4) * bli_zimag(cx[i__4]),
 	  bli_zreal(z__4) * bli_zimag(cx[i__4]) + bli_zimag(z__4) * bli_zreal(cx[i__4]),
 	  z__3
 	);
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  bli_zreal(z__2) - bli_zreal(z__3),
 	  bli_zimag(z__2) - bli_zimag(z__3),
 	  z__1
 	);
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  bli_zreal(z__1),
 	  bli_zimag(z__1),
 	  cy[i__2]
 	);
 	i__2 = ix;
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  bli_zreal(stemp),
 	  bli_zimag(stemp),
 	  cx[i__2]
@@ -774,62 +801,71 @@
 	cx[i__2].r = stemp.r;
 	cx[i__2].i = stemp.i;
 #else
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  *c__ * bli_zreal(cx[i__2]),
 	  *c__ * bli_zimag(cx[i__2]),
 	  z__2
 	);
 	i__3 = i__;
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  bli_zreal(*s) * bli_zreal(cy[i__3]) - bli_zimag(*s) * bli_zimag(cy[i__3]),
 	  bli_zreal(*s) * bli_zimag(cy[i__3]) + bli_zimag(*s) * bli_zreal(cy[i__3]),
 	  z__3
 	);
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  bli_zreal(z__2) + bli_zreal(z__3),
 	  bli_zimag(z__2) + bli_zimag(z__3),
 	  z__1
 	);
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  bli_zreal(z__1),
 	  bli_zimag(z__1),
 	  stemp
 	);
 	i__2 = i__;
 	i__3 = i__;
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  *c__ * bli_zreal(cy[i__3]),
 	  *c__ * bli_zimag(cy[i__3]),
 	  z__2
 	);
 	bla_d_cnjg(&z__4, s);
 	i__4 = i__;
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  bli_zreal(z__4) * bli_zreal(cx[i__4]) - bli_zimag(z__4) * bli_zimag(cx[i__4]),
 	  bli_zreal(z__4) * bli_zimag(cx[i__4]) + bli_zimag(z__4) * bli_zreal(cx[i__4]),
 	  z__3
 	);
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  bli_zreal(z__2) - bli_zreal(z__3),
 	  bli_zimag(z__2) - bli_zimag(z__3),
 	  z__1
 	);
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  bli_zreal(z__1),
 	  bli_zimag(z__1),
 	  cy[i__2]
 	);
 	i__2 = i__;
-	bli_zsets
+	bli_tsets
 	(
+	  z,z,
 	  bli_zreal(stemp),
 	  bli_zimag(stemp),
 	  cx[i__2]
diff --git a/frame/compat/f2c/bla_rotg.c b/frame/compat/f2c/bla_rotg.c
index 1572689f5..48a40d473 100644
--- a/frame/compat/f2c/bla_rotg.c
+++ b/frame/compat/f2c/bla_rotg.c
@@ -180,28 +180,28 @@ static bla_double dc_b4 = 1.;
 	goto L10;
     }
     *c__ = 0.f;
-    bli_csets( 1.f, 0.f, *s );
-    bli_csets( bli_creal(*cb), bli_cimag(*cb), *ca );
+    bli_tsets( c,c, 1.f, 0.f, *s );
+    bli_tsets( c,c, bli_creal(*cb), bli_cimag(*cb), *ca );
     goto L20;
 L10:
     scale = bla_c_abs(ca) + bla_c_abs(cb);
-    bli_csets( (bli_creal(*ca) / scale), (bli_cimag(*ca) / scale), q__1 );
+    bli_tsets( c,c, (bli_creal(*ca) / scale), (bli_cimag(*ca) / scale), q__1 );
 /* Computing 2nd power */
     r__1 = bla_c_abs(&q__1);
-    bli_csets( (bli_creal(*cb) / scale), (bli_cimag(*cb) / scale), q__2 );
+    bli_tsets( c,c, (bli_creal(*cb) / scale), (bli_cimag(*cb) / scale), q__2 );
 /* Computing 2nd power */
     r__2 = bla_c_abs(&q__2);
     norm = scale * sqrt(r__1 * r__1 + r__2 * r__2);
     r__1 = bla_c_abs(ca);
-    bli_csets( (bli_creal(*ca) / r__1), (bli_cimag(*ca) / r__1), q__1 );
-    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), alpha );
+    bli_tsets( c,c, (bli_creal(*ca) / r__1), (bli_cimag(*ca) / r__1), q__1 );
+    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), alpha );
     *c__ = bla_c_abs(ca) / norm;
     bla_r_cnjg(&q__3, cb);
-    bli_csets( (bli_creal(alpha) * bli_creal(q__3) - bli_cimag(alpha) * bli_cimag(q__3)), (bli_creal(alpha) * bli_cimag(q__3) + bli_cimag(alpha) * bli_creal(q__3)), q__2 );
-    bli_csets( (bli_creal(q__2) / norm), (bli_cimag(q__2) / norm), q__1 );
-    bli_csets( bli_creal(q__1), bli_cimag(q__1), *s );
-    bli_csets( (norm * bli_creal(alpha)), (norm * bli_cimag(alpha)), q__1 );
-    bli_csets( bli_creal(q__1), bli_cimag(q__1), *ca );
+    bli_tsets( c,c, (bli_creal(alpha) * bli_creal(q__3) - bli_cimag(alpha) * bli_cimag(q__3)), (bli_creal(alpha) * bli_cimag(q__3) + bli_cimag(alpha) * bli_creal(q__3)), q__2 );
+    bli_tsets( c,c, (bli_creal(q__2) / norm), (bli_cimag(q__2) / norm), q__1 );
+    bli_tsets( c,c, bli_creal(q__1), bli_cimag(q__1), *s );
+    bli_tsets( c,c, (norm * bli_creal(alpha)), (norm * bli_cimag(alpha)), q__1 );
+    bli_tsets( c,c, bli_creal(q__1), bli_cimag(q__1), *ca );
 L20:
     return 0;
 } /* crotg_ */
@@ -232,30 +232,30 @@ static bla_double dc_b4 = 1.;
 	goto L10;
     }
     *c__ = 0.;
-    bli_zsets( 1., 0., *s );
-    bli_zsets( bli_zreal(*cb), bli_zimag(*cb), *ca );
+    bli_tsets( z,z, 1., 0., *s );
+    bli_tsets( z,z, bli_zreal(*cb), bli_zimag(*cb), *ca );
     goto L20;
 L10:
     scale = bla_z_abs(ca) + bla_z_abs(cb);
-    bli_zsets( (scale), (0.), z__2 );
+    bli_tsets( z,z, (scale), (0.), z__2 );
     bla_z_div(&z__1, ca, &z__2);
 /* Computing 2nd power */
     d__1 = bla_z_abs(&z__1);
-    bli_zsets( (scale), (0.), z__4 );
+    bli_tsets( z,z, (scale), (0.), z__4 );
     bla_z_div(&z__3, cb, &z__4);
 /* Computing 2nd power */
     d__2 = bla_z_abs(&z__3);
     norm = scale * sqrt(d__1 * d__1 + d__2 * d__2);
     d__1 = bla_z_abs(ca);
-    bli_zsets( (bli_zreal(*ca) / d__1), (bli_zimag(*ca) / d__1), z__1 );
-    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), alpha );
+    bli_tsets( z,z, (bli_zreal(*ca) / d__1), (bli_zimag(*ca) / d__1), z__1 );
+    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), alpha );
     *c__ = bla_z_abs(ca) / norm;
     bla_d_cnjg(&z__3, cb);
-    bli_zsets( (bli_zreal(alpha) * bli_zreal(z__3) - bli_zimag(alpha) * bli_zimag(z__3)), (bli_zreal(alpha) * bli_zimag(z__3) + bli_zimag(alpha) * bli_zreal(z__3)), z__2 );
-    bli_zsets( (bli_zreal(z__2) / norm), (bli_zimag(z__2) / norm), z__1 );
-    bli_zsets( bli_zreal(z__1), bli_zimag(z__1), *s );
-    bli_zsets( (norm * bli_zreal(alpha)), (norm * bli_zimag(alpha)), z__1 );
-    bli_zsets( bli_zreal(z__1), bli_zimag(z__1), *ca );
+    bli_tsets( z,z, (bli_zreal(alpha) * bli_zreal(z__3) - bli_zimag(alpha) * bli_zimag(z__3)), (bli_zreal(alpha) * bli_zimag(z__3) + bli_zimag(alpha) * bli_zreal(z__3)), z__2 );
+    bli_tsets( z,z, (bli_zreal(z__2) / norm), (bli_zimag(z__2) / norm), z__1 );
+    bli_tsets( z,z, bli_zreal(z__1), bli_zimag(z__1), *s );
+    bli_tsets( z,z, (norm * bli_zreal(alpha)), (norm * bli_zimag(alpha)), z__1 );
+    bli_tsets( z,z, bli_zreal(z__1), bli_zimag(z__1), *ca );
 L20:
     return 0;
 } /* zrotg_ */
diff --git a/frame/compat/f2c/bla_tbmv.c b/frame/compat/f2c/bla_tbmv.c
index 16c149c89..d1d191dd0 100644
--- a/frame/compat/f2c/bla_tbmv.c
+++ b/frame/compat/f2c/bla_tbmv.c
@@ -266,7 +266,7 @@
 		    i__2 = j;
 		    if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) {
 			i__2 = j;
-			bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
+			bli_tsets( c,c, (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
 			l = kplus1 - j;
 /* Computing MAX */
 			i__2 = 1, i__3 = j - *k;
@@ -275,17 +275,17 @@
 			    i__2 = i__;
 			    i__3 = i__;
 			    i__5 = l + i__ + j * a_dim1;
-			    bli_csets( (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 );
-			    bli_csets( (bli_creal(x[i__3]) + bli_creal(q__2)), (bli_cimag(x[i__3]) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 );
+			    bli_tsets( c,c, (bli_creal(x[i__3]) + bli_creal(q__2)), (bli_cimag(x[i__3]) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
 /* L10: */
 			}
 			if (nounit) {
 			    i__4 = j;
 			    i__2 = j;
 			    i__3 = kplus1 + j * a_dim1;
-			    bli_csets( (bli_creal(x[i__2]) * bli_creal(a[i__3]) - bli_cimag(x[i__2]) * bli_cimag(a[i__3])), (bli_creal(x[i__2]) * bli_cimag(a[i__3]) + bli_cimag(x[i__2]) * bli_creal(a[i__3])), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__4] );
+			    bli_tsets( c,c, (bli_creal(x[i__2]) * bli_creal(a[i__3]) - bli_cimag(x[i__2]) * bli_cimag(a[i__3])), (bli_creal(x[i__2]) * bli_cimag(a[i__3]) + bli_cimag(x[i__2]) * bli_creal(a[i__3])), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__4] );
 			}
 		    }
 /* L20: */
@@ -297,7 +297,7 @@
 		    i__4 = jx;
 		    if (bli_creal(x[i__4]) != 0.f || bli_cimag(x[i__4]) != 0.f) {
 			i__4 = jx;
-			bli_csets( (bli_creal(x[i__4])), (bli_cimag(x[i__4])), temp );
+			bli_tsets( c,c, (bli_creal(x[i__4])), (bli_cimag(x[i__4])), temp );
 			ix = kx;
 			l = kplus1 - j;
 /* Computing MAX */
@@ -307,9 +307,9 @@
 			    i__4 = ix;
 			    i__2 = ix;
 			    i__5 = l + i__ + j * a_dim1;
-			    bli_csets( (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 );
-			    bli_csets( (bli_creal(x[i__2]) + bli_creal(q__2)), (bli_cimag(x[i__2]) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__4] );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 );
+			    bli_tsets( c,c, (bli_creal(x[i__2]) + bli_creal(q__2)), (bli_cimag(x[i__2]) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__4] );
 			    ix += *incx;
 /* L30: */
 			}
@@ -317,8 +317,8 @@
 			    i__3 = jx;
 			    i__4 = jx;
 			    i__2 = kplus1 + j * a_dim1;
-			    bli_csets( (bli_creal(x[i__4]) * bli_creal(a[i__2]) - bli_cimag(x[i__4]) * bli_cimag(a[i__2])), (bli_creal(x[i__4]) * bli_cimag(a[i__2]) + bli_cimag(x[i__4]) * bli_creal(a[i__2])), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] );
+			    bli_tsets( c,c, (bli_creal(x[i__4]) * bli_creal(a[i__2]) - bli_cimag(x[i__4]) * bli_cimag(a[i__2])), (bli_creal(x[i__4]) * bli_cimag(a[i__2]) + bli_cimag(x[i__4]) * bli_creal(a[i__2])), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] );
 			}
 		    }
 		    jx += *incx;
@@ -334,7 +334,7 @@
 		    i__1 = j;
 		    if (bli_creal(x[i__1]) != 0.f || bli_cimag(x[i__1]) != 0.f) {
 			i__1 = j;
-			bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
+			bli_tsets( c,c, (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
 			l = 1 - j;
 /* Computing MIN */
 			i__1 = *n, i__3 = j + *k;
@@ -343,17 +343,17 @@
 			    i__1 = i__;
 			    i__3 = i__;
 			    i__2 = l + i__ + j * a_dim1;
-			    bli_csets( (bli_creal(temp) * bli_creal(a[i__2]) - bli_cimag(temp) * bli_cimag(a[i__2])), (bli_creal(temp) * bli_cimag(a[i__2]) + bli_cimag(temp) * bli_creal(a[i__2])), q__2 );
-			    bli_csets( (bli_creal(x[i__3]) + bli_creal(q__2)), (bli_cimag(x[i__3]) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(a[i__2]) - bli_cimag(temp) * bli_cimag(a[i__2])), (bli_creal(temp) * bli_cimag(a[i__2]) + bli_cimag(temp) * bli_creal(a[i__2])), q__2 );
+			    bli_tsets( c,c, (bli_creal(x[i__3]) + bli_creal(q__2)), (bli_cimag(x[i__3]) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] );
 /* L50: */
 			}
 			if (nounit) {
 			    i__4 = j;
 			    i__1 = j;
 			    i__3 = j * a_dim1 + 1;
-			    bli_csets( (bli_creal(x[i__1]) * bli_creal(a[i__3]) - bli_cimag(x[i__1]) * bli_cimag(a[i__3])), (bli_creal(x[i__1]) * bli_cimag(a[i__3]) + bli_cimag(x[i__1]) * bli_creal(a[i__3])), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__4] );
+			    bli_tsets( c,c, (bli_creal(x[i__1]) * bli_creal(a[i__3]) - bli_cimag(x[i__1]) * bli_cimag(a[i__3])), (bli_creal(x[i__1]) * bli_cimag(a[i__3]) + bli_cimag(x[i__1]) * bli_creal(a[i__3])), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__4] );
 			}
 		    }
 /* L60: */
@@ -365,7 +365,7 @@
 		    i__4 = jx;
 		    if (bli_creal(x[i__4]) != 0.f || bli_cimag(x[i__4]) != 0.f) {
 			i__4 = jx;
-			bli_csets( (bli_creal(x[i__4])), (bli_cimag(x[i__4])), temp );
+			bli_tsets( c,c, (bli_creal(x[i__4])), (bli_cimag(x[i__4])), temp );
 			ix = kx;
 			l = 1 - j;
 /* Computing MIN */
@@ -375,9 +375,9 @@
 			    i__4 = ix;
 			    i__1 = ix;
 			    i__2 = l + i__ + j * a_dim1;
-			    bli_csets( (bli_creal(temp) * bli_creal(a[i__2]) - bli_cimag(temp) * bli_cimag(a[i__2])), (bli_creal(temp) * bli_cimag(a[i__2]) + bli_cimag(temp) * bli_creal(a[i__2])), q__2 );
-			    bli_csets( (bli_creal(x[i__1]) + bli_creal(q__2)), (bli_cimag(x[i__1]) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__4] );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(a[i__2]) - bli_cimag(temp) * bli_cimag(a[i__2])), (bli_creal(temp) * bli_cimag(a[i__2]) + bli_cimag(temp) * bli_creal(a[i__2])), q__2 );
+			    bli_tsets( c,c, (bli_creal(x[i__1]) + bli_creal(q__2)), (bli_cimag(x[i__1]) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__4] );
 			    ix -= *incx;
 /* L70: */
 			}
@@ -385,8 +385,8 @@
 			    i__3 = jx;
 			    i__4 = jx;
 			    i__1 = j * a_dim1 + 1;
-			    bli_csets( (bli_creal(x[i__4]) * bli_creal(a[i__1]) - bli_cimag(x[i__4]) * bli_cimag(a[i__1])), (bli_creal(x[i__4]) * bli_cimag(a[i__1]) + bli_cimag(x[i__4]) * bli_creal(a[i__1])), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] );
+			    bli_tsets( c,c, (bli_creal(x[i__4]) * bli_creal(a[i__1]) - bli_cimag(x[i__4]) * bli_cimag(a[i__1])), (bli_creal(x[i__4]) * bli_cimag(a[i__1]) + bli_cimag(x[i__4]) * bli_creal(a[i__1])), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] );
 			}
 		    }
 		    jx -= *incx;
@@ -406,13 +406,13 @@
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
 		    i__3 = j;
-		    bli_csets( (bli_creal(x[i__3])), (bli_cimag(x[i__3])), temp );
+		    bli_tsets( c,c, (bli_creal(x[i__3])), (bli_cimag(x[i__3])), temp );
 		    l = kplus1 - j;
 		    if (noconj) {
 			if (nounit) {
 			    i__3 = kplus1 + j * a_dim1;
-			    bli_csets( (bli_creal(temp) * bli_creal(a[i__3]) - bli_cimag(temp) * bli_cimag(a[i__3])), (bli_creal(temp) * bli_cimag(a[i__3]) + bli_cimag(temp) * bli_creal(a[i__3])), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(a[i__3]) - bli_cimag(temp) * bli_cimag(a[i__3])), (bli_creal(temp) * bli_cimag(a[i__3]) + bli_cimag(temp) * bli_creal(a[i__3])), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 /* Computing MAX */
 			i__4 = 1, i__1 = j - *k;
@@ -420,16 +420,16 @@
 			for (i__ = j - 1; i__ >= i__3; --i__) {
 			    i__4 = l + i__ + j * a_dim1;
 			    i__1 = i__;
-			    bli_csets( (bli_creal(a[i__4]) * bli_creal(x[i__1]) - bli_cimag(a[i__4]) * bli_cimag(x[i__1])), (bli_creal(a[i__4]) * bli_cimag(x[i__1]) + bli_cimag(a[i__4]) * bli_creal(x[i__1])), q__2 );
-			    bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(a[i__4]) * bli_creal(x[i__1]) - bli_cimag(a[i__4]) * bli_cimag(x[i__1])), (bli_creal(a[i__4]) * bli_cimag(x[i__1]) + bli_cimag(a[i__4]) * bli_creal(x[i__1])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 /* L90: */
 			}
 		    } else {
 			if (nounit) {
 			    bla_r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
-			    bli_csets( (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 /* Computing MAX */
 			i__4 = 1, i__1 = j - *k;
@@ -437,14 +437,14 @@
 			for (i__ = j - 1; i__ >= i__3; --i__) {
 			    bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
 			    i__4 = i__;
-			    bli_csets( (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 );
-			    bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 /* L100: */
 			}
 		    }
 		    i__3 = j;
-		    bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__3] );
+		    bli_tsets( c,c, (bli_creal(temp)), (bli_cimag(temp)), x[i__3] );
 /* L110: */
 		}
 	    } else {
@@ -452,15 +452,15 @@
 		jx = kx;
 		for (j = *n; j >= 1; --j) {
 		    i__3 = jx;
-		    bli_csets( (bli_creal(x[i__3])), (bli_cimag(x[i__3])), temp );
+		    bli_tsets( c,c, (bli_creal(x[i__3])), (bli_cimag(x[i__3])), temp );
 		    kx -= *incx;
 		    ix = kx;
 		    l = kplus1 - j;
 		    if (noconj) {
 			if (nounit) {
 			    i__3 = kplus1 + j * a_dim1;
-			    bli_csets( (bli_creal(temp) * bli_creal(a[i__3]) - bli_cimag(temp) * bli_cimag(a[i__3])), (bli_creal(temp) * bli_cimag(a[i__3]) + bli_cimag(temp) * bli_creal(a[i__3])), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(a[i__3]) - bli_cimag(temp) * bli_cimag(a[i__3])), (bli_creal(temp) * bli_cimag(a[i__3]) + bli_cimag(temp) * bli_creal(a[i__3])), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 /* Computing MAX */
 			i__4 = 1, i__1 = j - *k;
@@ -468,17 +468,17 @@
 			for (i__ = j - 1; i__ >= i__3; --i__) {
 			    i__4 = l + i__ + j * a_dim1;
 			    i__1 = ix;
-			    bli_csets( (bli_creal(a[i__4]) * bli_creal(x[i__1]) - bli_cimag(a[i__4]) * bli_cimag(x[i__1])), (bli_creal(a[i__4]) * bli_cimag(x[i__1]) + bli_cimag(a[i__4]) * bli_creal(x[i__1])), q__2 );
-			    bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(a[i__4]) * bli_creal(x[i__1]) - bli_cimag(a[i__4]) * bli_cimag(x[i__1])), (bli_creal(a[i__4]) * bli_cimag(x[i__1]) + bli_cimag(a[i__4]) * bli_creal(x[i__1])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    ix -= *incx;
 /* L120: */
 			}
 		    } else {
 			if (nounit) {
 			    bla_r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
-			    bli_csets( (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 /* Computing MAX */
 			i__4 = 1, i__1 = j - *k;
@@ -486,15 +486,15 @@
 			for (i__ = j - 1; i__ >= i__3; --i__) {
 			    bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
 			    i__4 = ix;
-			    bli_csets( (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 );
-			    bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    ix -= *incx;
 /* L130: */
 			}
 		    }
 		    i__3 = jx;
-		    bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__3] );
+		    bli_tsets( c,c, (bli_creal(temp)), (bli_cimag(temp)), x[i__3] );
 		    jx -= *incx;
 /* L140: */
 		}
@@ -504,13 +504,13 @@
 		i__3 = *n;
 		for (j = 1; j <= i__3; ++j) {
 		    i__4 = j;
-		    bli_csets( (bli_creal(x[i__4])), (bli_cimag(x[i__4])), temp );
+		    bli_tsets( c,c, (bli_creal(x[i__4])), (bli_cimag(x[i__4])), temp );
 		    l = 1 - j;
 		    if (noconj) {
 			if (nounit) {
 			    i__4 = j * a_dim1 + 1;
-			    bli_csets( (bli_creal(temp) * bli_creal(a[i__4]) - bli_cimag(temp) * bli_cimag(a[i__4])), (bli_creal(temp) * bli_cimag(a[i__4]) + bli_cimag(temp) * bli_creal(a[i__4])), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(a[i__4]) - bli_cimag(temp) * bli_cimag(a[i__4])), (bli_creal(temp) * bli_cimag(a[i__4]) + bli_cimag(temp) * bli_creal(a[i__4])), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 /* Computing MIN */
 			i__1 = *n, i__2 = j + *k;
@@ -518,16 +518,16 @@
 			for (i__ = j + 1; i__ <= i__4; ++i__) {
 			    i__1 = l + i__ + j * a_dim1;
 			    i__2 = i__;
-			    bli_csets( (bli_creal(a[i__1]) * bli_creal(x[i__2]) - bli_cimag(a[i__1]) * bli_cimag(x[i__2])), (bli_creal(a[i__1]) * bli_cimag(x[i__2]) + bli_cimag(a[i__1]) * bli_creal(x[i__2])), q__2 );
-			    bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(a[i__1]) * bli_creal(x[i__2]) - bli_cimag(a[i__1]) * bli_cimag(x[i__2])), (bli_creal(a[i__1]) * bli_cimag(x[i__2]) + bli_cimag(a[i__1]) * bli_creal(x[i__2])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 /* L150: */
 			}
 		    } else {
 			if (nounit) {
 			    bla_r_cnjg(&q__2, &a[j * a_dim1 + 1]);
-			    bli_csets( (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 /* Computing MIN */
 			i__1 = *n, i__2 = j + *k;
@@ -535,14 +535,14 @@
 			for (i__ = j + 1; i__ <= i__4; ++i__) {
 			    bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
 			    i__1 = i__;
-			    bli_csets( (bli_creal(q__3) * bli_creal(x[i__1]) - bli_cimag(q__3) * bli_cimag(x[i__1])), (bli_creal(q__3) * bli_cimag(x[i__1]) + bli_cimag(q__3) * bli_creal(x[i__1])), q__2 );
-			    bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__1]) - bli_cimag(q__3) * bli_cimag(x[i__1])), (bli_creal(q__3) * bli_cimag(x[i__1]) + bli_cimag(q__3) * bli_creal(x[i__1])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 /* L160: */
 			}
 		    }
 		    i__4 = j;
-		    bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__4] );
+		    bli_tsets( c,c, (bli_creal(temp)), (bli_cimag(temp)), x[i__4] );
 /* L170: */
 		}
 	    } else {
@@ -550,15 +550,15 @@
 		i__3 = *n;
 		for (j = 1; j <= i__3; ++j) {
 		    i__4 = jx;
-		    bli_csets( (bli_creal(x[i__4])), (bli_cimag(x[i__4])), temp );
+		    bli_tsets( c,c, (bli_creal(x[i__4])), (bli_cimag(x[i__4])), temp );
 		    kx += *incx;
 		    ix = kx;
 		    l = 1 - j;
 		    if (noconj) {
 			if (nounit) {
 			    i__4 = j * a_dim1 + 1;
-			    bli_csets( (bli_creal(temp) * bli_creal(a[i__4]) - bli_cimag(temp) * bli_cimag(a[i__4])), (bli_creal(temp) * bli_cimag(a[i__4]) + bli_cimag(temp) * bli_creal(a[i__4])), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(a[i__4]) - bli_cimag(temp) * bli_cimag(a[i__4])), (bli_creal(temp) * bli_cimag(a[i__4]) + bli_cimag(temp) * bli_creal(a[i__4])), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 /* Computing MIN */
 			i__1 = *n, i__2 = j + *k;
@@ -566,17 +566,17 @@
 			for (i__ = j + 1; i__ <= i__4; ++i__) {
 			    i__1 = l + i__ + j * a_dim1;
 			    i__2 = ix;
-			    bli_csets( (bli_creal(a[i__1]) * bli_creal(x[i__2]) - bli_cimag(a[i__1]) * bli_cimag(x[i__2])), (bli_creal(a[i__1]) * bli_cimag(x[i__2]) + bli_cimag(a[i__1]) * bli_creal(x[i__2])), q__2 );
-			    bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(a[i__1]) * bli_creal(x[i__2]) - bli_cimag(a[i__1]) * bli_cimag(x[i__2])), (bli_creal(a[i__1]) * bli_cimag(x[i__2]) + bli_cimag(a[i__1]) * bli_creal(x[i__2])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    ix += *incx;
 /* L180: */
 			}
 		    } else {
 			if (nounit) {
 			    bla_r_cnjg(&q__2, &a[j * a_dim1 + 1]);
-			    bli_csets( (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 /* Computing MIN */
 			i__1 = *n, i__2 = j + *k;
@@ -584,15 +584,15 @@
 			for (i__ = j + 1; i__ <= i__4; ++i__) {
 			    bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
 			    i__1 = ix;
-			    bli_csets( (bli_creal(q__3) * bli_creal(x[i__1]) - bli_cimag(q__3) * bli_cimag(x[i__1])), (bli_creal(q__3) * bli_cimag(x[i__1]) + bli_cimag(q__3) * bli_creal(x[i__1])), q__2 );
-			    bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__1]) - bli_cimag(q__3) * bli_cimag(x[i__1])), (bli_creal(q__3) * bli_cimag(x[i__1]) + bli_cimag(q__3) * bli_creal(x[i__1])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    ix += *incx;
 /* L190: */
 			}
 		    }
 		    i__4 = jx;
-		    bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__4] );
+		    bli_tsets( c,c, (bli_creal(temp)), (bli_cimag(temp)), x[i__4] );
 		    jx += *incx;
 /* L200: */
 		}
@@ -1658,7 +1658,7 @@
 		    i__2 = j;
 		    if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) {
 			i__2 = j;
-			bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
+			bli_tsets( z,z, (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
 			l = kplus1 - j;
 /* Computing MAX */
 			i__2 = 1, i__3 = j - *k;
@@ -1667,17 +1667,17 @@
 			    i__2 = i__;
 			    i__3 = i__;
 			    i__5 = l + i__ + j * a_dim1;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 );
-			    bli_zsets( (bli_zreal(x[i__3]) + bli_zreal(z__2)), (bli_zimag(x[i__3]) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(x[i__3]) + bli_zreal(z__2)), (bli_zimag(x[i__3]) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
 /* L10: */
 			}
 			if (nounit) {
 			    i__4 = j;
 			    i__2 = j;
 			    i__3 = kplus1 + j * a_dim1;
-			    bli_zsets( (bli_zreal(x[i__2]) * bli_zreal(a[i__3]) - bli_zimag(x[i__2]) * bli_zimag(a[i__3])), (bli_zreal(x[i__2]) * bli_zimag(a[i__3]) + bli_zimag(x[i__2]) * bli_zreal(a[i__3])), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__4] );
+			    bli_tsets( z,z, (bli_zreal(x[i__2]) * bli_zreal(a[i__3]) - bli_zimag(x[i__2]) * bli_zimag(a[i__3])), (bli_zreal(x[i__2]) * bli_zimag(a[i__3]) + bli_zimag(x[i__2]) * bli_zreal(a[i__3])), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__4] );
 			}
 		    }
 /* L20: */
@@ -1689,7 +1689,7 @@
 		    i__4 = jx;
 		    if (bli_zreal(x[i__4]) != 0. || bli_zimag(x[i__4]) != 0.) {
 			i__4 = jx;
-			bli_zsets( (bli_zreal(x[i__4])), (bli_zimag(x[i__4])), temp );
+			bli_tsets( z,z, (bli_zreal(x[i__4])), (bli_zimag(x[i__4])), temp );
 			ix = kx;
 			l = kplus1 - j;
 /* Computing MAX */
@@ -1699,9 +1699,9 @@
 			    i__4 = ix;
 			    i__2 = ix;
 			    i__5 = l + i__ + j * a_dim1;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 );
-			    bli_zsets( (bli_zreal(x[i__2]) + bli_zreal(z__2)), (bli_zimag(x[i__2]) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__4] );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(x[i__2]) + bli_zreal(z__2)), (bli_zimag(x[i__2]) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__4] );
 			    ix += *incx;
 /* L30: */
 			}
@@ -1709,8 +1709,8 @@
 			    i__3 = jx;
 			    i__4 = jx;
 			    i__2 = kplus1 + j * a_dim1;
-			    bli_zsets( (bli_zreal(x[i__4]) * bli_zreal(a[i__2]) - bli_zimag(x[i__4]) * bli_zimag(a[i__2])), (bli_zreal(x[i__4]) * bli_zimag(a[i__2]) + bli_zimag(x[i__4]) * bli_zreal(a[i__2])), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] );
+			    bli_tsets( z,z, (bli_zreal(x[i__4]) * bli_zreal(a[i__2]) - bli_zimag(x[i__4]) * bli_zimag(a[i__2])), (bli_zreal(x[i__4]) * bli_zimag(a[i__2]) + bli_zimag(x[i__4]) * bli_zreal(a[i__2])), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] );
 			}
 		    }
 		    jx += *incx;
@@ -1726,7 +1726,7 @@
 		    i__1 = j;
 		    if (bli_zreal(x[i__1]) != 0. || bli_zimag(x[i__1]) != 0.) {
 			i__1 = j;
-			bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
+			bli_tsets( z,z, (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
 			l = 1 - j;
 /* Computing MIN */
 			i__1 = *n, i__3 = j + *k;
@@ -1735,17 +1735,17 @@
 			    i__1 = i__;
 			    i__3 = i__;
 			    i__2 = l + i__ + j * a_dim1;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__2]) - bli_zimag(temp) * bli_zimag(a[i__2])), (bli_zreal(temp) * bli_zimag(a[i__2]) + bli_zimag(temp) * bli_zreal(a[i__2])), z__2 );
-			    bli_zsets( (bli_zreal(x[i__3]) + bli_zreal(z__2)), (bli_zimag(x[i__3]) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(a[i__2]) - bli_zimag(temp) * bli_zimag(a[i__2])), (bli_zreal(temp) * bli_zimag(a[i__2]) + bli_zimag(temp) * bli_zreal(a[i__2])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(x[i__3]) + bli_zreal(z__2)), (bli_zimag(x[i__3]) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] );
 /* L50: */
 			}
 			if (nounit) {
 			    i__4 = j;
 			    i__1 = j;
 			    i__3 = j * a_dim1 + 1;
-			    bli_zsets( (bli_zreal(x[i__1]) * bli_zreal(a[i__3]) - bli_zimag(x[i__1]) * bli_zimag(a[i__3])), (bli_zreal(x[i__1]) * bli_zimag(a[i__3]) + bli_zimag(x[i__1]) * bli_zreal(a[i__3])), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__4] );
+			    bli_tsets( z,z, (bli_zreal(x[i__1]) * bli_zreal(a[i__3]) - bli_zimag(x[i__1]) * bli_zimag(a[i__3])), (bli_zreal(x[i__1]) * bli_zimag(a[i__3]) + bli_zimag(x[i__1]) * bli_zreal(a[i__3])), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__4] );
 			}
 		    }
 /* L60: */
@@ -1757,7 +1757,7 @@
 		    i__4 = jx;
 		    if (bli_zreal(x[i__4]) != 0. || bli_zimag(x[i__4]) != 0.) {
 			i__4 = jx;
-			bli_zsets( (bli_zreal(x[i__4])), (bli_zimag(x[i__4])), temp );
+			bli_tsets( z,z, (bli_zreal(x[i__4])), (bli_zimag(x[i__4])), temp );
 			ix = kx;
 			l = 1 - j;
 /* Computing MIN */
@@ -1767,9 +1767,9 @@
 			    i__4 = ix;
 			    i__1 = ix;
 			    i__2 = l + i__ + j * a_dim1;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__2]) - bli_zimag(temp) * bli_zimag(a[i__2])), (bli_zreal(temp) * bli_zimag(a[i__2]) + bli_zimag(temp) * bli_zreal(a[i__2])), z__2 );
-			    bli_zsets( (bli_zreal(x[i__1]) + bli_zreal(z__2)), (bli_zimag(x[i__1]) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__4] );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(a[i__2]) - bli_zimag(temp) * bli_zimag(a[i__2])), (bli_zreal(temp) * bli_zimag(a[i__2]) + bli_zimag(temp) * bli_zreal(a[i__2])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(x[i__1]) + bli_zreal(z__2)), (bli_zimag(x[i__1]) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__4] );
 			    ix -= *incx;
 /* L70: */
 			}
@@ -1777,8 +1777,8 @@
 			    i__3 = jx;
 			    i__4 = jx;
 			    i__1 = j * a_dim1 + 1;
-			    bli_zsets( (bli_zreal(x[i__4]) * bli_zreal(a[i__1]) - bli_zimag(x[i__4]) * bli_zimag(a[i__1])), (bli_zreal(x[i__4]) * bli_zimag(a[i__1]) + bli_zimag(x[i__4]) * bli_zreal(a[i__1])), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] );
+			    bli_tsets( z,z, (bli_zreal(x[i__4]) * bli_zreal(a[i__1]) - bli_zimag(x[i__4]) * bli_zimag(a[i__1])), (bli_zreal(x[i__4]) * bli_zimag(a[i__1]) + bli_zimag(x[i__4]) * bli_zreal(a[i__1])), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] );
 			}
 		    }
 		    jx -= *incx;
@@ -1798,13 +1798,13 @@
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
 		    i__3 = j;
-		    bli_zsets( (bli_zreal(x[i__3])), (bli_zimag(x[i__3])), temp );
+		    bli_tsets( z,z, (bli_zreal(x[i__3])), (bli_zimag(x[i__3])), temp );
 		    l = kplus1 - j;
 		    if (noconj) {
 			if (nounit) {
 			    i__3 = kplus1 + j * a_dim1;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__3]) - bli_zimag(temp) * bli_zimag(a[i__3])), (bli_zreal(temp) * bli_zimag(a[i__3]) + bli_zimag(temp) * bli_zreal(a[i__3])), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(a[i__3]) - bli_zimag(temp) * bli_zimag(a[i__3])), (bli_zreal(temp) * bli_zimag(a[i__3]) + bli_zimag(temp) * bli_zreal(a[i__3])), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 /* Computing MAX */
 			i__4 = 1, i__1 = j - *k;
@@ -1812,16 +1812,16 @@
 			for (i__ = j - 1; i__ >= i__3; --i__) {
 			    i__4 = l + i__ + j * a_dim1;
 			    i__1 = i__;
-			    bli_zsets( (bli_zreal(a[i__4]) * bli_zreal(x[i__1]) - bli_zimag(a[i__4]) * bli_zimag(x[i__1])), (bli_zreal(a[i__4]) * bli_zimag(x[i__1]) + bli_zimag(a[i__4]) * bli_zreal(x[i__1])), z__2 );
-			    bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(a[i__4]) * bli_zreal(x[i__1]) - bli_zimag(a[i__4]) * bli_zimag(x[i__1])), (bli_zreal(a[i__4]) * bli_zimag(x[i__1]) + bli_zimag(a[i__4]) * bli_zreal(x[i__1])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 /* L90: */
 			}
 		    } else {
 			if (nounit) {
 			    bla_d_cnjg(&z__2, &a[kplus1 + j * a_dim1]);
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 /* Computing MAX */
 			i__4 = 1, i__1 = j - *k;
@@ -1829,14 +1829,14 @@
 			for (i__ = j - 1; i__ >= i__3; --i__) {
 			    bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
 			    i__4 = i__;
-			    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 );
-			    bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 /* L100: */
 			}
 		    }
 		    i__3 = j;
-		    bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__3] );
+		    bli_tsets( z,z, (bli_zreal(temp)), (bli_zimag(temp)), x[i__3] );
 /* L110: */
 		}
 	    } else {
@@ -1844,15 +1844,15 @@
 		jx = kx;
 		for (j = *n; j >= 1; --j) {
 		    i__3 = jx;
-		    bli_zsets( (bli_zreal(x[i__3])), (bli_zimag(x[i__3])), temp );
+		    bli_tsets( z,z, (bli_zreal(x[i__3])), (bli_zimag(x[i__3])), temp );
 		    kx -= *incx;
 		    ix = kx;
 		    l = kplus1 - j;
 		    if (noconj) {
 			if (nounit) {
 			    i__3 = kplus1 + j * a_dim1;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__3]) - bli_zimag(temp) * bli_zimag(a[i__3])), (bli_zreal(temp) * bli_zimag(a[i__3]) + bli_zimag(temp) * bli_zreal(a[i__3])), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(a[i__3]) - bli_zimag(temp) * bli_zimag(a[i__3])), (bli_zreal(temp) * bli_zimag(a[i__3]) + bli_zimag(temp) * bli_zreal(a[i__3])), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 /* Computing MAX */
 			i__4 = 1, i__1 = j - *k;
@@ -1860,17 +1860,17 @@
 			for (i__ = j - 1; i__ >= i__3; --i__) {
 			    i__4 = l + i__ + j * a_dim1;
 			    i__1 = ix;
-			    bli_zsets( (bli_zreal(a[i__4]) * bli_zreal(x[i__1]) - bli_zimag(a[i__4]) * bli_zimag(x[i__1])), (bli_zreal(a[i__4]) * bli_zimag(x[i__1]) + bli_zimag(a[i__4]) * bli_zreal(x[i__1])), z__2 );
-			    bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(a[i__4]) * bli_zreal(x[i__1]) - bli_zimag(a[i__4]) * bli_zimag(x[i__1])), (bli_zreal(a[i__4]) * bli_zimag(x[i__1]) + bli_zimag(a[i__4]) * bli_zreal(x[i__1])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    ix -= *incx;
 /* L120: */
 			}
 		    } else {
 			if (nounit) {
 			    bla_d_cnjg(&z__2, &a[kplus1 + j * a_dim1]);
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 /* Computing MAX */
 			i__4 = 1, i__1 = j - *k;
@@ -1878,15 +1878,15 @@
 			for (i__ = j - 1; i__ >= i__3; --i__) {
 			    bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
 			    i__4 = ix;
-			    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 );
-			    bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    ix -= *incx;
 /* L130: */
 			}
 		    }
 		    i__3 = jx;
-		    bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__3] );
+		    bli_tsets( z,z, (bli_zreal(temp)), (bli_zimag(temp)), x[i__3] );
 		    jx -= *incx;
 /* L140: */
 		}
@@ -1896,13 +1896,13 @@
 		i__3 = *n;
 		for (j = 1; j <= i__3; ++j) {
 		    i__4 = j;
-		    bli_zsets( (bli_zreal(x[i__4])), (bli_zimag(x[i__4])), temp );
+		    bli_tsets( z,z, (bli_zreal(x[i__4])), (bli_zimag(x[i__4])), temp );
 		    l = 1 - j;
 		    if (noconj) {
 			if (nounit) {
 			    i__4 = j * a_dim1 + 1;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__4]) - bli_zimag(temp) * bli_zimag(a[i__4])), (bli_zreal(temp) * bli_zimag(a[i__4]) + bli_zimag(temp) * bli_zreal(a[i__4])), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(a[i__4]) - bli_zimag(temp) * bli_zimag(a[i__4])), (bli_zreal(temp) * bli_zimag(a[i__4]) + bli_zimag(temp) * bli_zreal(a[i__4])), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 /* Computing MIN */
 			i__1 = *n, i__2 = j + *k;
@@ -1910,16 +1910,16 @@
 			for (i__ = j + 1; i__ <= i__4; ++i__) {
 			    i__1 = l + i__ + j * a_dim1;
 			    i__2 = i__;
-			    bli_zsets( (bli_zreal(a[i__1]) * bli_zreal(x[i__2]) - bli_zimag(a[i__1]) * bli_zimag(x[i__2])), (bli_zreal(a[i__1]) * bli_zimag(x[i__2]) + bli_zimag(a[i__1]) * bli_zreal(x[i__2])), z__2 );
-			    bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(a[i__1]) * bli_zreal(x[i__2]) - bli_zimag(a[i__1]) * bli_zimag(x[i__2])), (bli_zreal(a[i__1]) * bli_zimag(x[i__2]) + bli_zimag(a[i__1]) * bli_zreal(x[i__2])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 /* L150: */
 			}
 		    } else {
 			if (nounit) {
 			    bla_d_cnjg(&z__2, &a[j * a_dim1 + 1]);
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 /* Computing MIN */
 			i__1 = *n, i__2 = j + *k;
@@ -1927,14 +1927,14 @@
 			for (i__ = j + 1; i__ <= i__4; ++i__) {
 			    bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
 			    i__1 = i__;
-			    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__1]) - bli_zimag(z__3) * bli_zimag(x[i__1])), (bli_zreal(z__3) * bli_zimag(x[i__1]) + bli_zimag(z__3) * bli_zreal(x[i__1])), z__2 );
-			    bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__1]) - bli_zimag(z__3) * bli_zimag(x[i__1])), (bli_zreal(z__3) * bli_zimag(x[i__1]) + bli_zimag(z__3) * bli_zreal(x[i__1])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 /* L160: */
 			}
 		    }
 		    i__4 = j;
-		    bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__4] );
+		    bli_tsets( z,z, (bli_zreal(temp)), (bli_zimag(temp)), x[i__4] );
 /* L170: */
 		}
 	    } else {
@@ -1942,15 +1942,15 @@
 		i__3 = *n;
 		for (j = 1; j <= i__3; ++j) {
 		    i__4 = jx;
-		    bli_zsets( (bli_zreal(x[i__4])), (bli_zimag(x[i__4])), temp );
+		    bli_tsets( z,z, (bli_zreal(x[i__4])), (bli_zimag(x[i__4])), temp );
 		    kx += *incx;
 		    ix = kx;
 		    l = 1 - j;
 		    if (noconj) {
 			if (nounit) {
 			    i__4 = j * a_dim1 + 1;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__4]) - bli_zimag(temp) * bli_zimag(a[i__4])), (bli_zreal(temp) * bli_zimag(a[i__4]) + bli_zimag(temp) * bli_zreal(a[i__4])), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(a[i__4]) - bli_zimag(temp) * bli_zimag(a[i__4])), (bli_zreal(temp) * bli_zimag(a[i__4]) + bli_zimag(temp) * bli_zreal(a[i__4])), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 /* Computing MIN */
 			i__1 = *n, i__2 = j + *k;
@@ -1958,17 +1958,17 @@
 			for (i__ = j + 1; i__ <= i__4; ++i__) {
 			    i__1 = l + i__ + j * a_dim1;
 			    i__2 = ix;
-			    bli_zsets( (bli_zreal(a[i__1]) * bli_zreal(x[i__2]) - bli_zimag(a[i__1]) * bli_zimag(x[i__2])), (bli_zreal(a[i__1]) * bli_zimag(x[i__2]) + bli_zimag(a[i__1]) * bli_zreal(x[i__2])), z__2 );
-			    bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(a[i__1]) * bli_zreal(x[i__2]) - bli_zimag(a[i__1]) * bli_zimag(x[i__2])), (bli_zreal(a[i__1]) * bli_zimag(x[i__2]) + bli_zimag(a[i__1]) * bli_zreal(x[i__2])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    ix += *incx;
 /* L180: */
 			}
 		    } else {
 			if (nounit) {
 			    bla_d_cnjg(&z__2, &a[j * a_dim1 + 1]);
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 /* Computing MIN */
 			i__1 = *n, i__2 = j + *k;
@@ -1976,15 +1976,15 @@
 			for (i__ = j + 1; i__ <= i__4; ++i__) {
 			    bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
 			    i__1 = ix;
-			    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__1]) - bli_zimag(z__3) * bli_zimag(x[i__1])), (bli_zreal(z__3) * bli_zimag(x[i__1]) + bli_zimag(z__3) * bli_zreal(x[i__1])), z__2 );
-			    bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__1]) - bli_zimag(z__3) * bli_zimag(x[i__1])), (bli_zreal(z__3) * bli_zimag(x[i__1]) + bli_zimag(z__3) * bli_zreal(x[i__1])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    ix += *incx;
 /* L190: */
 			}
 		    }
 		    i__4 = jx;
-		    bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__4] );
+		    bli_tsets( z,z, (bli_zreal(temp)), (bli_zimag(temp)), x[i__4] );
 		    jx += *incx;
 /* L200: */
 		}
diff --git a/frame/compat/f2c/bla_tbsv.c b/frame/compat/f2c/bla_tbsv.c
index b237556f8..9f58c0adb 100644
--- a/frame/compat/f2c/bla_tbsv.c
+++ b/frame/compat/f2c/bla_tbsv.c
@@ -272,10 +272,10 @@
 			if (nounit) {
 			    i__1 = j;
 			    bla_c_div(&q__1, &x[j], &a[kplus1 + j * a_dim1]);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] );
 			}
 			i__1 = j;
-			bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
+			bli_tsets( c,c, (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
 /* Computing MAX */
 			i__2 = 1, i__3 = j - *k;
 			i__1 = f2c_max(i__2,i__3);
@@ -283,9 +283,9 @@
 			    i__2 = i__;
 			    i__3 = i__;
 			    i__4 = l + i__ + j * a_dim1;
-			    bli_csets( (bli_creal(temp) * bli_creal(a[i__4]) - bli_cimag(temp) * bli_cimag(a[i__4])), (bli_creal(temp) * bli_cimag(a[i__4]) + bli_cimag(temp) * bli_creal(a[i__4])), q__2 );
-			    bli_csets( (bli_creal(x[i__3]) - bli_creal(q__2)), (bli_cimag(x[i__3]) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(a[i__4]) - bli_cimag(temp) * bli_cimag(a[i__4])), (bli_creal(temp) * bli_cimag(a[i__4]) + bli_cimag(temp) * bli_creal(a[i__4])), q__2 );
+			    bli_tsets( c,c, (bli_creal(x[i__3]) - bli_creal(q__2)), (bli_cimag(x[i__3]) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
 /* L10: */
 			}
 		    }
@@ -303,10 +303,10 @@
 			if (nounit) {
 			    i__1 = jx;
 			    bla_c_div(&q__1, &x[jx], &a[kplus1 + j * a_dim1]);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] );
 			}
 			i__1 = jx;
-			bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
+			bli_tsets( c,c, (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
 /* Computing MAX */
 			i__2 = 1, i__3 = j - *k;
 			i__1 = f2c_max(i__2,i__3);
@@ -314,9 +314,9 @@
 			    i__2 = ix;
 			    i__3 = ix;
 			    i__4 = l + i__ + j * a_dim1;
-			    bli_csets( (bli_creal(temp) * bli_creal(a[i__4]) - bli_cimag(temp) * bli_cimag(a[i__4])), (bli_creal(temp) * bli_cimag(a[i__4]) + bli_cimag(temp) * bli_creal(a[i__4])), q__2 );
-			    bli_csets( (bli_creal(x[i__3]) - bli_creal(q__2)), (bli_cimag(x[i__3]) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(a[i__4]) - bli_cimag(temp) * bli_cimag(a[i__4])), (bli_creal(temp) * bli_cimag(a[i__4]) + bli_cimag(temp) * bli_creal(a[i__4])), q__2 );
+			    bli_tsets( c,c, (bli_creal(x[i__3]) - bli_creal(q__2)), (bli_cimag(x[i__3]) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
 			    ix -= *incx;
 /* L30: */
 			}
@@ -335,10 +335,10 @@
 			if (nounit) {
 			    i__2 = j;
 			    bla_c_div(&q__1, &x[j], &a[j * a_dim1 + 1]);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
 			}
 			i__2 = j;
-			bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
+			bli_tsets( c,c, (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
 /* Computing MIN */
 			i__3 = *n, i__4 = j + *k;
 			i__2 = f2c_min(i__3,i__4);
@@ -346,9 +346,9 @@
 			    i__3 = i__;
 			    i__4 = i__;
 			    i__5 = l + i__ + j * a_dim1;
-			    bli_csets( (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 );
-			    bli_csets( (bli_creal(x[i__4]) - bli_creal(q__2)), (bli_cimag(x[i__4]) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 );
+			    bli_tsets( c,c, (bli_creal(x[i__4]) - bli_creal(q__2)), (bli_cimag(x[i__4]) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] );
 /* L50: */
 			}
 		    }
@@ -366,10 +366,10 @@
 			if (nounit) {
 			    i__2 = jx;
 			    bla_c_div(&q__1, &x[jx], &a[j * a_dim1 + 1]);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
 			}
 			i__2 = jx;
-			bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
+			bli_tsets( c,c, (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
 /* Computing MIN */
 			i__3 = *n, i__4 = j + *k;
 			i__2 = f2c_min(i__3,i__4);
@@ -377,9 +377,9 @@
 			    i__3 = ix;
 			    i__4 = ix;
 			    i__5 = l + i__ + j * a_dim1;
-			    bli_csets( (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 );
-			    bli_csets( (bli_creal(x[i__4]) - bli_creal(q__2)), (bli_cimag(x[i__4]) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 );
+			    bli_tsets( c,c, (bli_creal(x[i__4]) - bli_creal(q__2)), (bli_cimag(x[i__4]) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] );
 			    ix += *incx;
 /* L70: */
 			}
@@ -399,7 +399,7 @@
 		i__1 = *n;
 		for (j = 1; j <= i__1; ++j) {
 		    i__2 = j;
-		    bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
+		    bli_tsets( c,c, (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
 		    l = kplus1 - j;
 		    if (noconj) {
 /* Computing MAX */
@@ -408,14 +408,14 @@
 			for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) {
 			    i__2 = l + i__ + j * a_dim1;
 			    i__3 = i__;
-			    bli_csets( (bli_creal(a[i__2]) * bli_creal(x[i__3]) - bli_cimag(a[i__2]) * bli_cimag(x[i__3])), (bli_creal(a[i__2]) * bli_cimag(x[i__3]) + bli_cimag(a[i__2]) * bli_creal(x[i__3])), q__2 );
-			    bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(a[i__2]) * bli_creal(x[i__3]) - bli_cimag(a[i__2]) * bli_cimag(x[i__3])), (bli_creal(a[i__2]) * bli_cimag(x[i__3]) + bli_cimag(a[i__2]) * bli_creal(x[i__3])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 /* L90: */
 			}
 			if (nounit) {
 			    bla_c_div(&q__1, &temp, &a[kplus1 + j * a_dim1]);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 		    } else {
 /* Computing MAX */
@@ -424,19 +424,19 @@
 			for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) {
 			    bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
 			    i__4 = i__;
-			    bli_csets( (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 );
-			    bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 /* L100: */
 			}
 			if (nounit) {
 			    bla_r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
 			    bla_c_div(&q__1, &temp, &q__2);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 		    }
 		    i__3 = j;
-		    bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__3] );
+		    bli_tsets( c,c, (bli_creal(temp)), (bli_cimag(temp)), x[i__3] );
 /* L110: */
 		}
 	    } else {
@@ -444,7 +444,7 @@
 		i__1 = *n;
 		for (j = 1; j <= i__1; ++j) {
 		    i__3 = jx;
-		    bli_csets( (bli_creal(x[i__3])), (bli_cimag(x[i__3])), temp );
+		    bli_tsets( c,c, (bli_creal(x[i__3])), (bli_cimag(x[i__3])), temp );
 		    ix = kx;
 		    l = kplus1 - j;
 		    if (noconj) {
@@ -454,15 +454,15 @@
 			for (i__ = f2c_max(i__3,i__4); i__ <= i__2; ++i__) {
 			    i__3 = l + i__ + j * a_dim1;
 			    i__4 = ix;
-			    bli_csets( (bli_creal(a[i__3]) * bli_creal(x[i__4]) - bli_cimag(a[i__3]) * bli_cimag(x[i__4])), (bli_creal(a[i__3]) * bli_cimag(x[i__4]) + bli_cimag(a[i__3]) * bli_creal(x[i__4])), q__2 );
-			    bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(a[i__3]) * bli_creal(x[i__4]) - bli_cimag(a[i__3]) * bli_cimag(x[i__4])), (bli_creal(a[i__3]) * bli_cimag(x[i__4]) + bli_cimag(a[i__3]) * bli_creal(x[i__4])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    ix += *incx;
 /* L120: */
 			}
 			if (nounit) {
 			    bla_c_div(&q__1, &temp, &a[kplus1 + j * a_dim1]);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 		    } else {
 /* Computing MAX */
@@ -471,20 +471,20 @@
 			for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) {
 			    bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
 			    i__2 = ix;
-			    bli_csets( (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 );
-			    bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    ix += *incx;
 /* L130: */
 			}
 			if (nounit) {
 			    bla_r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
 			    bla_c_div(&q__1, &temp, &q__2);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 		    }
 		    i__4 = jx;
-		    bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__4] );
+		    bli_tsets( c,c, (bli_creal(temp)), (bli_cimag(temp)), x[i__4] );
 		    jx += *incx;
 		    if (j > *k) {
 			kx += *incx;
@@ -496,7 +496,7 @@
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
 		    i__1 = j;
-		    bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
+		    bli_tsets( c,c, (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
 		    l = 1 - j;
 		    if (noconj) {
 /* Computing MIN */
@@ -505,14 +505,14 @@
 			for (i__ = f2c_min(i__1,i__4); i__ >= i__2; --i__) {
 			    i__1 = l + i__ + j * a_dim1;
 			    i__4 = i__;
-			    bli_csets( (bli_creal(a[i__1]) * bli_creal(x[i__4]) - bli_cimag(a[i__1]) * bli_cimag(x[i__4])), (bli_creal(a[i__1]) * bli_cimag(x[i__4]) + bli_cimag(a[i__1]) * bli_creal(x[i__4])), q__2 );
-			    bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(a[i__1]) * bli_creal(x[i__4]) - bli_cimag(a[i__1]) * bli_cimag(x[i__4])), (bli_creal(a[i__1]) * bli_cimag(x[i__4]) + bli_cimag(a[i__1]) * bli_creal(x[i__4])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 /* L150: */
 			}
 			if (nounit) {
 			    bla_c_div(&q__1, &temp, &a[j * a_dim1 + 1]);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 		    } else {
 /* Computing MIN */
@@ -521,19 +521,19 @@
 			for (i__ = f2c_min(i__2,i__1); i__ >= i__4; --i__) {
 			    bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
 			    i__2 = i__;
-			    bli_csets( (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 );
-			    bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 /* L160: */
 			}
 			if (nounit) {
 			    bla_r_cnjg(&q__2, &a[j * a_dim1 + 1]);
 			    bla_c_div(&q__1, &temp, &q__2);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 		    }
 		    i__4 = j;
-		    bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__4] );
+		    bli_tsets( c,c, (bli_creal(temp)), (bli_cimag(temp)), x[i__4] );
 /* L170: */
 		}
 	    } else {
@@ -541,7 +541,7 @@
 		jx = kx;
 		for (j = *n; j >= 1; --j) {
 		    i__4 = jx;
-		    bli_csets( (bli_creal(x[i__4])), (bli_cimag(x[i__4])), temp );
+		    bli_tsets( c,c, (bli_creal(x[i__4])), (bli_cimag(x[i__4])), temp );
 		    ix = kx;
 		    l = 1 - j;
 		    if (noconj) {
@@ -551,15 +551,15 @@
 			for (i__ = f2c_min(i__4,i__2); i__ >= i__1; --i__) {
 			    i__4 = l + i__ + j * a_dim1;
 			    i__2 = ix;
-			    bli_csets( (bli_creal(a[i__4]) * bli_creal(x[i__2]) - bli_cimag(a[i__4]) * bli_cimag(x[i__2])), (bli_creal(a[i__4]) * bli_cimag(x[i__2]) + bli_cimag(a[i__4]) * bli_creal(x[i__2])), q__2 );
-			    bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(a[i__4]) * bli_creal(x[i__2]) - bli_cimag(a[i__4]) * bli_cimag(x[i__2])), (bli_creal(a[i__4]) * bli_cimag(x[i__2]) + bli_cimag(a[i__4]) * bli_creal(x[i__2])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    ix -= *incx;
 /* L180: */
 			}
 			if (nounit) {
 			    bla_c_div(&q__1, &temp, &a[j * a_dim1 + 1]);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 		    } else {
 /* Computing MIN */
@@ -568,20 +568,20 @@
 			for (i__ = f2c_min(i__1,i__4); i__ >= i__2; --i__) {
 			    bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
 			    i__1 = ix;
-			    bli_csets( (bli_creal(q__3) * bli_creal(x[i__1]) - bli_cimag(q__3) * bli_cimag(x[i__1])), (bli_creal(q__3) * bli_cimag(x[i__1]) + bli_cimag(q__3) * bli_creal(x[i__1])), q__2 );
-			    bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__1]) - bli_cimag(q__3) * bli_cimag(x[i__1])), (bli_creal(q__3) * bli_cimag(x[i__1]) + bli_cimag(q__3) * bli_creal(x[i__1])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    ix -= *incx;
 /* L190: */
 			}
 			if (nounit) {
 			    bla_r_cnjg(&q__2, &a[j * a_dim1 + 1]);
 			    bla_c_div(&q__1, &temp, &q__2);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 		    }
 		    i__2 = jx;
-		    bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__2] );
+		    bli_tsets( c,c, (bli_creal(temp)), (bli_cimag(temp)), x[i__2] );
 		    jx -= *incx;
 		    if (*n - j >= *k) {
 			kx -= *incx;
@@ -1665,10 +1665,10 @@
 			if (nounit) {
 			    i__1 = j;
 			    bla_z_div(&z__1, &x[j], &a[kplus1 + j * a_dim1]);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] );
 			}
 			i__1 = j;
-			bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
+			bli_tsets( z,z, (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
 /* Computing MAX */
 			i__2 = 1, i__3 = j - *k;
 			i__1 = f2c_max(i__2,i__3);
@@ -1676,9 +1676,9 @@
 			    i__2 = i__;
 			    i__3 = i__;
 			    i__4 = l + i__ + j * a_dim1;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__4]) - bli_zimag(temp) * bli_zimag(a[i__4])), (bli_zreal(temp) * bli_zimag(a[i__4]) + bli_zimag(temp) * bli_zreal(a[i__4])), z__2 );
-			    bli_zsets( (bli_zreal(x[i__3]) - bli_zreal(z__2)), (bli_zimag(x[i__3]) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(a[i__4]) - bli_zimag(temp) * bli_zimag(a[i__4])), (bli_zreal(temp) * bli_zimag(a[i__4]) + bli_zimag(temp) * bli_zreal(a[i__4])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(x[i__3]) - bli_zreal(z__2)), (bli_zimag(x[i__3]) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
 /* L10: */
 			}
 		    }
@@ -1696,10 +1696,10 @@
 			if (nounit) {
 			    i__1 = jx;
 			    bla_z_div(&z__1, &x[jx], &a[kplus1 + j * a_dim1]);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] );
 			}
 			i__1 = jx;
-			bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
+			bli_tsets( z,z, (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
 /* Computing MAX */
 			i__2 = 1, i__3 = j - *k;
 			i__1 = f2c_max(i__2,i__3);
@@ -1707,9 +1707,9 @@
 			    i__2 = ix;
 			    i__3 = ix;
 			    i__4 = l + i__ + j * a_dim1;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__4]) - bli_zimag(temp) * bli_zimag(a[i__4])), (bli_zreal(temp) * bli_zimag(a[i__4]) + bli_zimag(temp) * bli_zreal(a[i__4])), z__2 );
-			    bli_zsets( (bli_zreal(x[i__3]) - bli_zreal(z__2)), (bli_zimag(x[i__3]) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(a[i__4]) - bli_zimag(temp) * bli_zimag(a[i__4])), (bli_zreal(temp) * bli_zimag(a[i__4]) + bli_zimag(temp) * bli_zreal(a[i__4])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(x[i__3]) - bli_zreal(z__2)), (bli_zimag(x[i__3]) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
 			    ix -= *incx;
 /* L30: */
 			}
@@ -1728,10 +1728,10 @@
 			if (nounit) {
 			    i__2 = j;
 			    bla_z_div(&z__1, &x[j], &a[j * a_dim1 + 1]);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
 			}
 			i__2 = j;
-			bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
+			bli_tsets( z,z, (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
 /* Computing MIN */
 			i__3 = *n, i__4 = j + *k;
 			i__2 = f2c_min(i__3,i__4);
@@ -1739,9 +1739,9 @@
 			    i__3 = i__;
 			    i__4 = i__;
 			    i__5 = l + i__ + j * a_dim1;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 );
-			    bli_zsets( (bli_zreal(x[i__4]) - bli_zreal(z__2)), (bli_zimag(x[i__4]) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(x[i__4]) - bli_zreal(z__2)), (bli_zimag(x[i__4]) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] );
 /* L50: */
 			}
 		    }
@@ -1759,10 +1759,10 @@
 			if (nounit) {
 			    i__2 = jx;
 			    bla_z_div(&z__1, &x[jx], &a[j * a_dim1 + 1]);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
 			}
 			i__2 = jx;
-			bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
+			bli_tsets( z,z, (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
 /* Computing MIN */
 			i__3 = *n, i__4 = j + *k;
 			i__2 = f2c_min(i__3,i__4);
@@ -1770,9 +1770,9 @@
 			    i__3 = ix;
 			    i__4 = ix;
 			    i__5 = l + i__ + j * a_dim1;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 );
-			    bli_zsets( (bli_zreal(x[i__4]) - bli_zreal(z__2)), (bli_zimag(x[i__4]) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(x[i__4]) - bli_zreal(z__2)), (bli_zimag(x[i__4]) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] );
 			    ix += *incx;
 /* L70: */
 			}
@@ -1792,7 +1792,7 @@
 		i__1 = *n;
 		for (j = 1; j <= i__1; ++j) {
 		    i__2 = j;
-		    bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
+		    bli_tsets( z,z, (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
 		    l = kplus1 - j;
 		    if (noconj) {
 /* Computing MAX */
@@ -1801,14 +1801,14 @@
 			for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) {
 			    i__2 = l + i__ + j * a_dim1;
 			    i__3 = i__;
-			    bli_zsets( (bli_zreal(a[i__2]) * bli_zreal(x[i__3]) - bli_zimag(a[i__2]) * bli_zimag(x[i__3])), (bli_zreal(a[i__2]) * bli_zimag(x[i__3]) + bli_zimag(a[i__2]) * bli_zreal(x[i__3])), z__2 );
-			    bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(a[i__2]) * bli_zreal(x[i__3]) - bli_zimag(a[i__2]) * bli_zimag(x[i__3])), (bli_zreal(a[i__2]) * bli_zimag(x[i__3]) + bli_zimag(a[i__2]) * bli_zreal(x[i__3])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 /* L90: */
 			}
 			if (nounit) {
 			    bla_z_div(&z__1, &temp, &a[kplus1 + j * a_dim1]);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 		    } else {
 /* Computing MAX */
@@ -1817,19 +1817,19 @@
 			for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) {
 			    bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
 			    i__4 = i__;
-			    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 );
-			    bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 /* L100: */
 			}
 			if (nounit) {
 			    bla_d_cnjg(&z__2, &a[kplus1 + j * a_dim1]);
 			    bla_z_div(&z__1, &temp, &z__2);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 		    }
 		    i__3 = j;
-		    bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__3] );
+		    bli_tsets( z,z, (bli_zreal(temp)), (bli_zimag(temp)), x[i__3] );
 /* L110: */
 		}
 	    } else {
@@ -1837,7 +1837,7 @@
 		i__1 = *n;
 		for (j = 1; j <= i__1; ++j) {
 		    i__3 = jx;
-		    bli_zsets( (bli_zreal(x[i__3])), (bli_zimag(x[i__3])), temp );
+		    bli_tsets( z,z, (bli_zreal(x[i__3])), (bli_zimag(x[i__3])), temp );
 		    ix = kx;
 		    l = kplus1 - j;
 		    if (noconj) {
@@ -1847,15 +1847,15 @@
 			for (i__ = f2c_max(i__3,i__4); i__ <= i__2; ++i__) {
 			    i__3 = l + i__ + j * a_dim1;
 			    i__4 = ix;
-			    bli_zsets( (bli_zreal(a[i__3]) * bli_zreal(x[i__4]) - bli_zimag(a[i__3]) * bli_zimag(x[i__4])), (bli_zreal(a[i__3]) * bli_zimag(x[i__4]) + bli_zimag(a[i__3]) * bli_zreal(x[i__4])), z__2 );
-			    bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(a[i__3]) * bli_zreal(x[i__4]) - bli_zimag(a[i__3]) * bli_zimag(x[i__4])), (bli_zreal(a[i__3]) * bli_zimag(x[i__4]) + bli_zimag(a[i__3]) * bli_zreal(x[i__4])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    ix += *incx;
 /* L120: */
 			}
 			if (nounit) {
 			    bla_z_div(&z__1, &temp, &a[kplus1 + j * a_dim1]);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 		    } else {
 /* Computing MAX */
@@ -1864,20 +1864,20 @@
 			for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) {
 			    bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
 			    i__2 = ix;
-			    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 );
-			    bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    ix += *incx;
 /* L130: */
 			}
 			if (nounit) {
 			    bla_d_cnjg(&z__2, &a[kplus1 + j * a_dim1]);
 			    bla_z_div(&z__1, &temp, &z__2);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 		    }
 		    i__4 = jx;
-		    bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__4] );
+		    bli_tsets( z,z, (bli_zreal(temp)), (bli_zimag(temp)), x[i__4] );
 		    jx += *incx;
 		    if (j > *k) {
 			kx += *incx;
@@ -1889,7 +1889,7 @@
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
 		    i__1 = j;
-		    bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
+		    bli_tsets( z,z, (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
 		    l = 1 - j;
 		    if (noconj) {
 /* Computing MIN */
@@ -1898,14 +1898,14 @@
 			for (i__ = f2c_min(i__1,i__4); i__ >= i__2; --i__) {
 			    i__1 = l + i__ + j * a_dim1;
 			    i__4 = i__;
-			    bli_zsets( (bli_zreal(a[i__1]) * bli_zreal(x[i__4]) - bli_zimag(a[i__1]) * bli_zimag(x[i__4])), (bli_zreal(a[i__1]) * bli_zimag(x[i__4]) + bli_zimag(a[i__1]) * bli_zreal(x[i__4])), z__2 );
-			    bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(a[i__1]) * bli_zreal(x[i__4]) - bli_zimag(a[i__1]) * bli_zimag(x[i__4])), (bli_zreal(a[i__1]) * bli_zimag(x[i__4]) + bli_zimag(a[i__1]) * bli_zreal(x[i__4])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 /* L150: */
 			}
 			if (nounit) {
 			    bla_z_div(&z__1, &temp, &a[j * a_dim1 + 1]);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 		    } else {
 /* Computing MIN */
@@ -1914,19 +1914,19 @@
 			for (i__ = f2c_min(i__2,i__1); i__ >= i__4; --i__) {
 			    bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
 			    i__2 = i__;
-			    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 );
-			    bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 /* L160: */
 			}
 			if (nounit) {
 			    bla_d_cnjg(&z__2, &a[j * a_dim1 + 1]);
 			    bla_z_div(&z__1, &temp, &z__2);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 		    }
 		    i__4 = j;
-		    bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__4] );
+		    bli_tsets( z,z, (bli_zreal(temp)), (bli_zimag(temp)), x[i__4] );
 /* L170: */
 		}
 	    } else {
@@ -1934,7 +1934,7 @@
 		jx = kx;
 		for (j = *n; j >= 1; --j) {
 		    i__4 = jx;
-		    bli_zsets( (bli_zreal(x[i__4])), (bli_zimag(x[i__4])), temp );
+		    bli_tsets( z,z, (bli_zreal(x[i__4])), (bli_zimag(x[i__4])), temp );
 		    ix = kx;
 		    l = 1 - j;
 		    if (noconj) {
@@ -1944,15 +1944,15 @@
 			for (i__ = f2c_min(i__4,i__2); i__ >= i__1; --i__) {
 			    i__4 = l + i__ + j * a_dim1;
 			    i__2 = ix;
-			    bli_zsets( (bli_zreal(a[i__4]) * bli_zreal(x[i__2]) - bli_zimag(a[i__4]) * bli_zimag(x[i__2])), (bli_zreal(a[i__4]) * bli_zimag(x[i__2]) + bli_zimag(a[i__4]) * bli_zreal(x[i__2])), z__2 );
-			    bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(a[i__4]) * bli_zreal(x[i__2]) - bli_zimag(a[i__4]) * bli_zimag(x[i__2])), (bli_zreal(a[i__4]) * bli_zimag(x[i__2]) + bli_zimag(a[i__4]) * bli_zreal(x[i__2])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    ix -= *incx;
 /* L180: */
 			}
 			if (nounit) {
 			    bla_z_div(&z__1, &temp, &a[j * a_dim1 + 1]);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 		    } else {
 /* Computing MIN */
@@ -1961,20 +1961,20 @@
 			for (i__ = f2c_min(i__1,i__4); i__ >= i__2; --i__) {
 			    bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
 			    i__1 = ix;
-			    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__1]) - bli_zimag(z__3) * bli_zimag(x[i__1])), (bli_zreal(z__3) * bli_zimag(x[i__1]) + bli_zimag(z__3) * bli_zreal(x[i__1])), z__2 );
-			    bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__1]) - bli_zimag(z__3) * bli_zimag(x[i__1])), (bli_zreal(z__3) * bli_zimag(x[i__1]) + bli_zimag(z__3) * bli_zreal(x[i__1])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    ix -= *incx;
 /* L190: */
 			}
 			if (nounit) {
 			    bla_d_cnjg(&z__2, &a[j * a_dim1 + 1]);
 			    bla_z_div(&z__1, &temp, &z__2);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 		    }
 		    i__2 = jx;
-		    bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__2] );
+		    bli_tsets( z,z, (bli_zreal(temp)), (bli_zimag(temp)), x[i__2] );
 		    jx -= *incx;
 		    if (*n - j >= *k) {
 			kx -= *incx;
diff --git a/frame/compat/f2c/bla_tpmv.c b/frame/compat/f2c/bla_tpmv.c
index 853f30156..f4bc0adee 100644
--- a/frame/compat/f2c/bla_tpmv.c
+++ b/frame/compat/f2c/bla_tpmv.c
@@ -220,16 +220,16 @@
 		    i__2 = j;
 		    if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) {
 			i__2 = j;
-			bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
+			bli_tsets( c,c, (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
 			k = kk;
 			i__2 = j - 1;
 			for (i__ = 1; i__ <= i__2; ++i__) {
 			    i__3 = i__;
 			    i__4 = i__;
 			    i__5 = k;
-			    bli_csets( (bli_creal(temp) * bli_creal(ap[i__5]) - bli_cimag(temp) * bli_cimag(ap[i__5])), (bli_creal(temp) * bli_cimag(ap[i__5]) + bli_cimag(temp) * bli_creal(ap[i__5])), q__2 );
-			    bli_csets( (bli_creal(x[i__4]) + bli_creal(q__2)), (bli_cimag(x[i__4]) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(ap[i__5]) - bli_cimag(temp) * bli_cimag(ap[i__5])), (bli_creal(temp) * bli_cimag(ap[i__5]) + bli_cimag(temp) * bli_creal(ap[i__5])), q__2 );
+			    bli_tsets( c,c, (bli_creal(x[i__4]) + bli_creal(q__2)), (bli_cimag(x[i__4]) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] );
 			    ++k;
 /* L10: */
 			}
@@ -237,8 +237,8 @@
 			    i__2 = j;
 			    i__3 = j;
 			    i__4 = kk + j - 1;
-			    bli_csets( (bli_creal(x[i__3]) * bli_creal(ap[i__4]) - bli_cimag(x[i__3]) * bli_cimag(ap[i__4])), (bli_creal(x[i__3]) * bli_cimag(ap[i__4]) + bli_cimag(x[i__3]) * bli_creal(ap[i__4])), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
+			    bli_tsets( c,c, (bli_creal(x[i__3]) * bli_creal(ap[i__4]) - bli_cimag(x[i__3]) * bli_cimag(ap[i__4])), (bli_creal(x[i__3]) * bli_cimag(ap[i__4]) + bli_cimag(x[i__3]) * bli_creal(ap[i__4])), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
 			}
 		    }
 		    kk += j;
@@ -251,16 +251,16 @@
 		    i__2 = jx;
 		    if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) {
 			i__2 = jx;
-			bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
+			bli_tsets( c,c, (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
 			ix = kx;
 			i__2 = kk + j - 2;
 			for (k = kk; k <= i__2; ++k) {
 			    i__3 = ix;
 			    i__4 = ix;
 			    i__5 = k;
-			    bli_csets( (bli_creal(temp) * bli_creal(ap[i__5]) - bli_cimag(temp) * bli_cimag(ap[i__5])), (bli_creal(temp) * bli_cimag(ap[i__5]) + bli_cimag(temp) * bli_creal(ap[i__5])), q__2 );
-			    bli_csets( (bli_creal(x[i__4]) + bli_creal(q__2)), (bli_cimag(x[i__4]) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(ap[i__5]) - bli_cimag(temp) * bli_cimag(ap[i__5])), (bli_creal(temp) * bli_cimag(ap[i__5]) + bli_cimag(temp) * bli_creal(ap[i__5])), q__2 );
+			    bli_tsets( c,c, (bli_creal(x[i__4]) + bli_creal(q__2)), (bli_cimag(x[i__4]) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] );
 			    ix += *incx;
 /* L30: */
 			}
@@ -268,8 +268,8 @@
 			    i__2 = jx;
 			    i__3 = jx;
 			    i__4 = kk + j - 1;
-			    bli_csets( (bli_creal(x[i__3]) * bli_creal(ap[i__4]) - bli_cimag(x[i__3]) * bli_cimag(ap[i__4])), (bli_creal(x[i__3]) * bli_cimag(ap[i__4]) + bli_cimag(x[i__3]) * bli_creal(ap[i__4])), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
+			    bli_tsets( c,c, (bli_creal(x[i__3]) * bli_creal(ap[i__4]) - bli_cimag(x[i__3]) * bli_cimag(ap[i__4])), (bli_creal(x[i__3]) * bli_cimag(ap[i__4]) + bli_cimag(x[i__3]) * bli_creal(ap[i__4])), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
 			}
 		    }
 		    jx += *incx;
@@ -284,16 +284,16 @@
 		    i__1 = j;
 		    if (bli_creal(x[i__1]) != 0.f || bli_cimag(x[i__1]) != 0.f) {
 			i__1 = j;
-			bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
+			bli_tsets( c,c, (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
 			k = kk;
 			i__1 = j + 1;
 			for (i__ = *n; i__ >= i__1; --i__) {
 			    i__2 = i__;
 			    i__3 = i__;
 			    i__4 = k;
-			    bli_csets( (bli_creal(temp) * bli_creal(ap[i__4]) - bli_cimag(temp) * bli_cimag(ap[i__4])), (bli_creal(temp) * bli_cimag(ap[i__4]) + bli_cimag(temp) * bli_creal(ap[i__4])), q__2 );
-			    bli_csets( (bli_creal(x[i__3]) + bli_creal(q__2)), (bli_cimag(x[i__3]) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(ap[i__4]) - bli_cimag(temp) * bli_cimag(ap[i__4])), (bli_creal(temp) * bli_cimag(ap[i__4]) + bli_cimag(temp) * bli_creal(ap[i__4])), q__2 );
+			    bli_tsets( c,c, (bli_creal(x[i__3]) + bli_creal(q__2)), (bli_cimag(x[i__3]) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
 			    --k;
 /* L50: */
 			}
@@ -301,8 +301,8 @@
 			    i__1 = j;
 			    i__2 = j;
 			    i__3 = kk - *n + j;
-			    bli_csets( (bli_creal(x[i__2]) * bli_creal(ap[i__3]) - bli_cimag(x[i__2]) * bli_cimag(ap[i__3])), (bli_creal(x[i__2]) * bli_cimag(ap[i__3]) + bli_cimag(x[i__2]) * bli_creal(ap[i__3])), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] );
+			    bli_tsets( c,c, (bli_creal(x[i__2]) * bli_creal(ap[i__3]) - bli_cimag(x[i__2]) * bli_cimag(ap[i__3])), (bli_creal(x[i__2]) * bli_cimag(ap[i__3]) + bli_cimag(x[i__2]) * bli_creal(ap[i__3])), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] );
 			}
 		    }
 		    kk -= *n - j + 1;
@@ -315,16 +315,16 @@
 		    i__1 = jx;
 		    if (bli_creal(x[i__1]) != 0.f || bli_cimag(x[i__1]) != 0.f) {
 			i__1 = jx;
-			bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
+			bli_tsets( c,c, (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
 			ix = kx;
 			i__1 = kk - (*n - (j + 1));
 			for (k = kk; k >= i__1; --k) {
 			    i__2 = ix;
 			    i__3 = ix;
 			    i__4 = k;
-			    bli_csets( (bli_creal(temp) * bli_creal(ap[i__4]) - bli_cimag(temp) * bli_cimag(ap[i__4])), (bli_creal(temp) * bli_cimag(ap[i__4]) + bli_cimag(temp) * bli_creal(ap[i__4])), q__2 );
-			    bli_csets( (bli_creal(x[i__3]) + bli_creal(q__2)), (bli_cimag(x[i__3]) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(ap[i__4]) - bli_cimag(temp) * bli_cimag(ap[i__4])), (bli_creal(temp) * bli_cimag(ap[i__4]) + bli_cimag(temp) * bli_creal(ap[i__4])), q__2 );
+			    bli_tsets( c,c, (bli_creal(x[i__3]) + bli_creal(q__2)), (bli_cimag(x[i__3]) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
 			    ix -= *incx;
 /* L70: */
 			}
@@ -332,8 +332,8 @@
 			    i__1 = jx;
 			    i__2 = jx;
 			    i__3 = kk - *n + j;
-			    bli_csets( (bli_creal(x[i__2]) * bli_creal(ap[i__3]) - bli_cimag(x[i__2]) * bli_cimag(ap[i__3])), (bli_creal(x[i__2]) * bli_cimag(ap[i__3]) + bli_cimag(x[i__2]) * bli_creal(ap[i__3])), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] );
+			    bli_tsets( c,c, (bli_creal(x[i__2]) * bli_creal(ap[i__3]) - bli_cimag(x[i__2]) * bli_cimag(ap[i__3])), (bli_creal(x[i__2]) * bli_cimag(ap[i__3]) + bli_cimag(x[i__2]) * bli_creal(ap[i__3])), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] );
 			}
 		    }
 		    jx -= *incx;
@@ -351,41 +351,41 @@
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
 		    i__1 = j;
-		    bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
+		    bli_tsets( c,c, (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
 		    k = kk - 1;
 		    if (noconj) {
 			if (nounit) {
 			    i__1 = kk;
-			    bli_csets( (bli_creal(temp) * bli_creal(ap[i__1]) - bli_cimag(temp) * bli_cimag(ap[i__1])), (bli_creal(temp) * bli_cimag(ap[i__1]) + bli_cimag(temp) * bli_creal(ap[i__1])), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(ap[i__1]) - bli_cimag(temp) * bli_cimag(ap[i__1])), (bli_creal(temp) * bli_cimag(ap[i__1]) + bli_cimag(temp) * bli_creal(ap[i__1])), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 			for (i__ = j - 1; i__ >= 1; --i__) {
 			    i__1 = k;
 			    i__2 = i__;
-			    bli_csets( (bli_creal(ap[i__1]) * bli_creal(x[i__2]) - bli_cimag(ap[i__1]) * bli_cimag(x[i__2])), (bli_creal(ap[i__1]) * bli_cimag(x[i__2]) + bli_cimag(ap[i__1]) * bli_creal(x[i__2])), q__2 );
-			    bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(ap[i__1]) * bli_creal(x[i__2]) - bli_cimag(ap[i__1]) * bli_cimag(x[i__2])), (bli_creal(ap[i__1]) * bli_cimag(x[i__2]) + bli_cimag(ap[i__1]) * bli_creal(x[i__2])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    --k;
 /* L90: */
 			}
 		    } else {
 			if (nounit) {
 			    bla_r_cnjg(&q__2, &ap[kk]);
-			    bli_csets( (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 			for (i__ = j - 1; i__ >= 1; --i__) {
 			    bla_r_cnjg(&q__3, &ap[k]);
 			    i__1 = i__;
-			    bli_csets( (bli_creal(q__3) * bli_creal(x[i__1]) - bli_cimag(q__3) * bli_cimag(x[i__1])), (bli_creal(q__3) * bli_cimag(x[i__1]) + bli_cimag(q__3) * bli_creal(x[i__1])), q__2 );
-			    bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__1]) - bli_cimag(q__3) * bli_cimag(x[i__1])), (bli_creal(q__3) * bli_cimag(x[i__1]) + bli_cimag(q__3) * bli_creal(x[i__1])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    --k;
 /* L100: */
 			}
 		    }
 		    i__1 = j;
-		    bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__1] );
+		    bli_tsets( c,c, (bli_creal(temp)), (bli_cimag(temp)), x[i__1] );
 		    kk -= j;
 /* L110: */
 		}
@@ -393,43 +393,43 @@
 		jx = kx + (*n - 1) * *incx;
 		for (j = *n; j >= 1; --j) {
 		    i__1 = jx;
-		    bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
+		    bli_tsets( c,c, (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
 		    ix = jx;
 		    if (noconj) {
 			if (nounit) {
 			    i__1 = kk;
-			    bli_csets( (bli_creal(temp) * bli_creal(ap[i__1]) - bli_cimag(temp) * bli_cimag(ap[i__1])), (bli_creal(temp) * bli_cimag(ap[i__1]) + bli_cimag(temp) * bli_creal(ap[i__1])), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(ap[i__1]) - bli_cimag(temp) * bli_cimag(ap[i__1])), (bli_creal(temp) * bli_cimag(ap[i__1]) + bli_cimag(temp) * bli_creal(ap[i__1])), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 			i__1 = kk - j + 1;
 			for (k = kk - 1; k >= i__1; --k) {
 			    ix -= *incx;
 			    i__2 = k;
 			    i__3 = ix;
-			    bli_csets( (bli_creal(ap[i__2]) * bli_creal(x[i__3]) - bli_cimag(ap[i__2]) * bli_cimag(x[i__3])), (bli_creal(ap[i__2]) * bli_cimag(x[i__3]) + bli_cimag(ap[i__2]) * bli_creal(x[i__3])), q__2 );
-			    bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(ap[i__2]) * bli_creal(x[i__3]) - bli_cimag(ap[i__2]) * bli_cimag(x[i__3])), (bli_creal(ap[i__2]) * bli_cimag(x[i__3]) + bli_cimag(ap[i__2]) * bli_creal(x[i__3])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 /* L120: */
 			}
 		    } else {
 			if (nounit) {
 			    bla_r_cnjg(&q__2, &ap[kk]);
-			    bli_csets( (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 			i__1 = kk - j + 1;
 			for (k = kk - 1; k >= i__1; --k) {
 			    ix -= *incx;
 			    bla_r_cnjg(&q__3, &ap[k]);
 			    i__2 = ix;
-			    bli_csets( (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 );
-			    bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 /* L130: */
 			}
 		    }
 		    i__1 = jx;
-		    bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__1] );
+		    bli_tsets( c,c, (bli_creal(temp)), (bli_cimag(temp)), x[i__1] );
 		    jx -= *incx;
 		    kk -= j;
 /* L140: */
@@ -441,43 +441,43 @@
 		i__1 = *n;
 		for (j = 1; j <= i__1; ++j) {
 		    i__2 = j;
-		    bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
+		    bli_tsets( c,c, (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
 		    k = kk + 1;
 		    if (noconj) {
 			if (nounit) {
 			    i__2 = kk;
-			    bli_csets( (bli_creal(temp) * bli_creal(ap[i__2]) - bli_cimag(temp) * bli_cimag(ap[i__2])), (bli_creal(temp) * bli_cimag(ap[i__2]) + bli_cimag(temp) * bli_creal(ap[i__2])), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(ap[i__2]) - bli_cimag(temp) * bli_cimag(ap[i__2])), (bli_creal(temp) * bli_cimag(ap[i__2]) + bli_cimag(temp) * bli_creal(ap[i__2])), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 			i__2 = *n;
 			for (i__ = j + 1; i__ <= i__2; ++i__) {
 			    i__3 = k;
 			    i__4 = i__;
-			    bli_csets( (bli_creal(ap[i__3]) * bli_creal(x[i__4]) - bli_cimag(ap[i__3]) * bli_cimag(x[i__4])), (bli_creal(ap[i__3]) * bli_cimag(x[i__4]) + bli_cimag(ap[i__3]) * bli_creal(x[i__4])), q__2 );
-			    bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(ap[i__3]) * bli_creal(x[i__4]) - bli_cimag(ap[i__3]) * bli_cimag(x[i__4])), (bli_creal(ap[i__3]) * bli_cimag(x[i__4]) + bli_cimag(ap[i__3]) * bli_creal(x[i__4])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    ++k;
 /* L150: */
 			}
 		    } else {
 			if (nounit) {
 			    bla_r_cnjg(&q__2, &ap[kk]);
-			    bli_csets( (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 			i__2 = *n;
 			for (i__ = j + 1; i__ <= i__2; ++i__) {
 			    bla_r_cnjg(&q__3, &ap[k]);
 			    i__3 = i__;
-			    bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
-			    bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    ++k;
 /* L160: */
 			}
 		    }
 		    i__2 = j;
-		    bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__2] );
+		    bli_tsets( c,c, (bli_creal(temp)), (bli_cimag(temp)), x[i__2] );
 		    kk += *n - j + 1;
 /* L170: */
 		}
@@ -486,43 +486,43 @@
 		i__1 = *n;
 		for (j = 1; j <= i__1; ++j) {
 		    i__2 = jx;
-		    bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
+		    bli_tsets( c,c, (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
 		    ix = jx;
 		    if (noconj) {
 			if (nounit) {
 			    i__2 = kk;
-			    bli_csets( (bli_creal(temp) * bli_creal(ap[i__2]) - bli_cimag(temp) * bli_cimag(ap[i__2])), (bli_creal(temp) * bli_cimag(ap[i__2]) + bli_cimag(temp) * bli_creal(ap[i__2])), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(ap[i__2]) - bli_cimag(temp) * bli_cimag(ap[i__2])), (bli_creal(temp) * bli_cimag(ap[i__2]) + bli_cimag(temp) * bli_creal(ap[i__2])), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 			i__2 = kk + *n - j;
 			for (k = kk + 1; k <= i__2; ++k) {
 			    ix += *incx;
 			    i__3 = k;
 			    i__4 = ix;
-			    bli_csets( (bli_creal(ap[i__3]) * bli_creal(x[i__4]) - bli_cimag(ap[i__3]) * bli_cimag(x[i__4])), (bli_creal(ap[i__3]) * bli_cimag(x[i__4]) + bli_cimag(ap[i__3]) * bli_creal(x[i__4])), q__2 );
-			    bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(ap[i__3]) * bli_creal(x[i__4]) - bli_cimag(ap[i__3]) * bli_cimag(x[i__4])), (bli_creal(ap[i__3]) * bli_cimag(x[i__4]) + bli_cimag(ap[i__3]) * bli_creal(x[i__4])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 /* L180: */
 			}
 		    } else {
 			if (nounit) {
 			    bla_r_cnjg(&q__2, &ap[kk]);
-			    bli_csets( (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 			i__2 = kk + *n - j;
 			for (k = kk + 1; k <= i__2; ++k) {
 			    ix += *incx;
 			    bla_r_cnjg(&q__3, &ap[k]);
 			    i__3 = ix;
-			    bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
-			    bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 /* L190: */
 			}
 		    }
 		    i__2 = jx;
-		    bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__2] );
+		    bli_tsets( c,c, (bli_creal(temp)), (bli_cimag(temp)), x[i__2] );
 		    jx += *incx;
 		    kk += *n - j + 1;
 /* L200: */
@@ -1417,16 +1417,16 @@
 		    i__2 = j;
 		    if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) {
 			i__2 = j;
-			bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
+			bli_tsets( z,z, (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
 			k = kk;
 			i__2 = j - 1;
 			for (i__ = 1; i__ <= i__2; ++i__) {
 			    i__3 = i__;
 			    i__4 = i__;
 			    i__5 = k;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__5]) - bli_zimag(temp) * bli_zimag(ap[i__5])), (bli_zreal(temp) * bli_zimag(ap[i__5]) + bli_zimag(temp) * bli_zreal(ap[i__5])), z__2 );
-			    bli_zsets( (bli_zreal(x[i__4]) + bli_zreal(z__2)), (bli_zimag(x[i__4]) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(ap[i__5]) - bli_zimag(temp) * bli_zimag(ap[i__5])), (bli_zreal(temp) * bli_zimag(ap[i__5]) + bli_zimag(temp) * bli_zreal(ap[i__5])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(x[i__4]) + bli_zreal(z__2)), (bli_zimag(x[i__4]) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] );
 			    ++k;
 /* L10: */
 			}
@@ -1434,8 +1434,8 @@
 			    i__2 = j;
 			    i__3 = j;
 			    i__4 = kk + j - 1;
-			    bli_zsets( (bli_zreal(x[i__3]) * bli_zreal(ap[i__4]) - bli_zimag(x[i__3]) * bli_zimag(ap[i__4])), (bli_zreal(x[i__3]) * bli_zimag(ap[i__4]) + bli_zimag(x[i__3]) * bli_zreal(ap[i__4])), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
+			    bli_tsets( z,z, (bli_zreal(x[i__3]) * bli_zreal(ap[i__4]) - bli_zimag(x[i__3]) * bli_zimag(ap[i__4])), (bli_zreal(x[i__3]) * bli_zimag(ap[i__4]) + bli_zimag(x[i__3]) * bli_zreal(ap[i__4])), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
 			}
 		    }
 		    kk += j;
@@ -1448,16 +1448,16 @@
 		    i__2 = jx;
 		    if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) {
 			i__2 = jx;
-			bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
+			bli_tsets( z,z, (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
 			ix = kx;
 			i__2 = kk + j - 2;
 			for (k = kk; k <= i__2; ++k) {
 			    i__3 = ix;
 			    i__4 = ix;
 			    i__5 = k;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__5]) - bli_zimag(temp) * bli_zimag(ap[i__5])), (bli_zreal(temp) * bli_zimag(ap[i__5]) + bli_zimag(temp) * bli_zreal(ap[i__5])), z__2 );
-			    bli_zsets( (bli_zreal(x[i__4]) + bli_zreal(z__2)), (bli_zimag(x[i__4]) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(ap[i__5]) - bli_zimag(temp) * bli_zimag(ap[i__5])), (bli_zreal(temp) * bli_zimag(ap[i__5]) + bli_zimag(temp) * bli_zreal(ap[i__5])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(x[i__4]) + bli_zreal(z__2)), (bli_zimag(x[i__4]) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] );
 			    ix += *incx;
 /* L30: */
 			}
@@ -1465,8 +1465,8 @@
 			    i__2 = jx;
 			    i__3 = jx;
 			    i__4 = kk + j - 1;
-			    bli_zsets( (bli_zreal(x[i__3]) * bli_zreal(ap[i__4]) - bli_zimag(x[i__3]) * bli_zimag(ap[i__4])), (bli_zreal(x[i__3]) * bli_zimag(ap[i__4]) + bli_zimag(x[i__3]) * bli_zreal(ap[i__4])), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
+			    bli_tsets( z,z, (bli_zreal(x[i__3]) * bli_zreal(ap[i__4]) - bli_zimag(x[i__3]) * bli_zimag(ap[i__4])), (bli_zreal(x[i__3]) * bli_zimag(ap[i__4]) + bli_zimag(x[i__3]) * bli_zreal(ap[i__4])), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
 			}
 		    }
 		    jx += *incx;
@@ -1481,16 +1481,16 @@
 		    i__1 = j;
 		    if (bli_zreal(x[i__1]) != 0. || bli_zimag(x[i__1]) != 0.) {
 			i__1 = j;
-			bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
+			bli_tsets( z,z, (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
 			k = kk;
 			i__1 = j + 1;
 			for (i__ = *n; i__ >= i__1; --i__) {
 			    i__2 = i__;
 			    i__3 = i__;
 			    i__4 = k;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__4]) - bli_zimag(temp) * bli_zimag(ap[i__4])), (bli_zreal(temp) * bli_zimag(ap[i__4]) + bli_zimag(temp) * bli_zreal(ap[i__4])), z__2 );
-			    bli_zsets( (bli_zreal(x[i__3]) + bli_zreal(z__2)), (bli_zimag(x[i__3]) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(ap[i__4]) - bli_zimag(temp) * bli_zimag(ap[i__4])), (bli_zreal(temp) * bli_zimag(ap[i__4]) + bli_zimag(temp) * bli_zreal(ap[i__4])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(x[i__3]) + bli_zreal(z__2)), (bli_zimag(x[i__3]) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
 			    --k;
 /* L50: */
 			}
@@ -1498,8 +1498,8 @@
 			    i__1 = j;
 			    i__2 = j;
 			    i__3 = kk - *n + j;
-			    bli_zsets( (bli_zreal(x[i__2]) * bli_zreal(ap[i__3]) - bli_zimag(x[i__2]) * bli_zimag(ap[i__3])), (bli_zreal(x[i__2]) * bli_zimag(ap[i__3]) + bli_zimag(x[i__2]) * bli_zreal(ap[i__3])), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] );
+			    bli_tsets( z,z, (bli_zreal(x[i__2]) * bli_zreal(ap[i__3]) - bli_zimag(x[i__2]) * bli_zimag(ap[i__3])), (bli_zreal(x[i__2]) * bli_zimag(ap[i__3]) + bli_zimag(x[i__2]) * bli_zreal(ap[i__3])), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] );
 			}
 		    }
 		    kk -= *n - j + 1;
@@ -1512,16 +1512,16 @@
 		    i__1 = jx;
 		    if (bli_zreal(x[i__1]) != 0. || bli_zimag(x[i__1]) != 0.) {
 			i__1 = jx;
-			bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
+			bli_tsets( z,z, (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
 			ix = kx;
 			i__1 = kk - (*n - (j + 1));
 			for (k = kk; k >= i__1; --k) {
 			    i__2 = ix;
 			    i__3 = ix;
 			    i__4 = k;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__4]) - bli_zimag(temp) * bli_zimag(ap[i__4])), (bli_zreal(temp) * bli_zimag(ap[i__4]) + bli_zimag(temp) * bli_zreal(ap[i__4])), z__2 );
-			    bli_zsets( (bli_zreal(x[i__3]) + bli_zreal(z__2)), (bli_zimag(x[i__3]) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(ap[i__4]) - bli_zimag(temp) * bli_zimag(ap[i__4])), (bli_zreal(temp) * bli_zimag(ap[i__4]) + bli_zimag(temp) * bli_zreal(ap[i__4])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(x[i__3]) + bli_zreal(z__2)), (bli_zimag(x[i__3]) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
 			    ix -= *incx;
 /* L70: */
 			}
@@ -1529,8 +1529,8 @@
 			    i__1 = jx;
 			    i__2 = jx;
 			    i__3 = kk - *n + j;
-			    bli_zsets( (bli_zreal(x[i__2]) * bli_zreal(ap[i__3]) - bli_zimag(x[i__2]) * bli_zimag(ap[i__3])), (bli_zreal(x[i__2]) * bli_zimag(ap[i__3]) + bli_zimag(x[i__2]) * bli_zreal(ap[i__3])), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] );
+			    bli_tsets( z,z, (bli_zreal(x[i__2]) * bli_zreal(ap[i__3]) - bli_zimag(x[i__2]) * bli_zimag(ap[i__3])), (bli_zreal(x[i__2]) * bli_zimag(ap[i__3]) + bli_zimag(x[i__2]) * bli_zreal(ap[i__3])), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] );
 			}
 		    }
 		    jx -= *incx;
@@ -1548,41 +1548,41 @@
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
 		    i__1 = j;
-		    bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
+		    bli_tsets( z,z, (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
 		    k = kk - 1;
 		    if (noconj) {
 			if (nounit) {
 			    i__1 = kk;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__1]) - bli_zimag(temp) * bli_zimag(ap[i__1])), (bli_zreal(temp) * bli_zimag(ap[i__1]) + bli_zimag(temp) * bli_zreal(ap[i__1])), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(ap[i__1]) - bli_zimag(temp) * bli_zimag(ap[i__1])), (bli_zreal(temp) * bli_zimag(ap[i__1]) + bli_zimag(temp) * bli_zreal(ap[i__1])), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 			for (i__ = j - 1; i__ >= 1; --i__) {
 			    i__1 = k;
 			    i__2 = i__;
-			    bli_zsets( (bli_zreal(ap[i__1]) * bli_zreal(x[i__2]) - bli_zimag(ap[i__1]) * bli_zimag(x[i__2])), (bli_zreal(ap[i__1]) * bli_zimag(x[i__2]) + bli_zimag(ap[i__1]) * bli_zreal(x[i__2])), z__2 );
-			    bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(ap[i__1]) * bli_zreal(x[i__2]) - bli_zimag(ap[i__1]) * bli_zimag(x[i__2])), (bli_zreal(ap[i__1]) * bli_zimag(x[i__2]) + bli_zimag(ap[i__1]) * bli_zreal(x[i__2])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    --k;
 /* L90: */
 			}
 		    } else {
 			if (nounit) {
 			    bla_d_cnjg(&z__2, &ap[kk]);
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 			for (i__ = j - 1; i__ >= 1; --i__) {
 			    bla_d_cnjg(&z__3, &ap[k]);
 			    i__1 = i__;
-			    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__1]) - bli_zimag(z__3) * bli_zimag(x[i__1])), (bli_zreal(z__3) * bli_zimag(x[i__1]) + bli_zimag(z__3) * bli_zreal(x[i__1])), z__2 );
-			    bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__1]) - bli_zimag(z__3) * bli_zimag(x[i__1])), (bli_zreal(z__3) * bli_zimag(x[i__1]) + bli_zimag(z__3) * bli_zreal(x[i__1])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    --k;
 /* L100: */
 			}
 		    }
 		    i__1 = j;
-		    bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__1] );
+		    bli_tsets( z,z, (bli_zreal(temp)), (bli_zimag(temp)), x[i__1] );
 		    kk -= j;
 /* L110: */
 		}
@@ -1590,43 +1590,43 @@
 		jx = kx + (*n - 1) * *incx;
 		for (j = *n; j >= 1; --j) {
 		    i__1 = jx;
-		    bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
+		    bli_tsets( z,z, (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
 		    ix = jx;
 		    if (noconj) {
 			if (nounit) {
 			    i__1 = kk;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__1]) - bli_zimag(temp) * bli_zimag(ap[i__1])), (bli_zreal(temp) * bli_zimag(ap[i__1]) + bli_zimag(temp) * bli_zreal(ap[i__1])), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(ap[i__1]) - bli_zimag(temp) * bli_zimag(ap[i__1])), (bli_zreal(temp) * bli_zimag(ap[i__1]) + bli_zimag(temp) * bli_zreal(ap[i__1])), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 			i__1 = kk - j + 1;
 			for (k = kk - 1; k >= i__1; --k) {
 			    ix -= *incx;
 			    i__2 = k;
 			    i__3 = ix;
-			    bli_zsets( (bli_zreal(ap[i__2]) * bli_zreal(x[i__3]) - bli_zimag(ap[i__2]) * bli_zimag(x[i__3])), (bli_zreal(ap[i__2]) * bli_zimag(x[i__3]) + bli_zimag(ap[i__2]) * bli_zreal(x[i__3])), z__2 );
-			    bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(ap[i__2]) * bli_zreal(x[i__3]) - bli_zimag(ap[i__2]) * bli_zimag(x[i__3])), (bli_zreal(ap[i__2]) * bli_zimag(x[i__3]) + bli_zimag(ap[i__2]) * bli_zreal(x[i__3])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 /* L120: */
 			}
 		    } else {
 			if (nounit) {
 			    bla_d_cnjg(&z__2, &ap[kk]);
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 			i__1 = kk - j + 1;
 			for (k = kk - 1; k >= i__1; --k) {
 			    ix -= *incx;
 			    bla_d_cnjg(&z__3, &ap[k]);
 			    i__2 = ix;
-			    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 );
-			    bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 /* L130: */
 			}
 		    }
 		    i__1 = jx;
-		    bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__1] );
+		    bli_tsets( z,z, (bli_zreal(temp)), (bli_zimag(temp)), x[i__1] );
 		    jx -= *incx;
 		    kk -= j;
 /* L140: */
@@ -1638,43 +1638,43 @@
 		i__1 = *n;
 		for (j = 1; j <= i__1; ++j) {
 		    i__2 = j;
-		    bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
+		    bli_tsets( z,z, (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
 		    k = kk + 1;
 		    if (noconj) {
 			if (nounit) {
 			    i__2 = kk;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__2]) - bli_zimag(temp) * bli_zimag(ap[i__2])), (bli_zreal(temp) * bli_zimag(ap[i__2]) + bli_zimag(temp) * bli_zreal(ap[i__2])), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(ap[i__2]) - bli_zimag(temp) * bli_zimag(ap[i__2])), (bli_zreal(temp) * bli_zimag(ap[i__2]) + bli_zimag(temp) * bli_zreal(ap[i__2])), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 			i__2 = *n;
 			for (i__ = j + 1; i__ <= i__2; ++i__) {
 			    i__3 = k;
 			    i__4 = i__;
-			    bli_zsets( (bli_zreal(ap[i__3]) * bli_zreal(x[i__4]) - bli_zimag(ap[i__3]) * bli_zimag(x[i__4])), (bli_zreal(ap[i__3]) * bli_zimag(x[i__4]) + bli_zimag(ap[i__3]) * bli_zreal(x[i__4])), z__2 );
-			    bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(ap[i__3]) * bli_zreal(x[i__4]) - bli_zimag(ap[i__3]) * bli_zimag(x[i__4])), (bli_zreal(ap[i__3]) * bli_zimag(x[i__4]) + bli_zimag(ap[i__3]) * bli_zreal(x[i__4])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    ++k;
 /* L150: */
 			}
 		    } else {
 			if (nounit) {
 			    bla_d_cnjg(&z__2, &ap[kk]);
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 			i__2 = *n;
 			for (i__ = j + 1; i__ <= i__2; ++i__) {
 			    bla_d_cnjg(&z__3, &ap[k]);
 			    i__3 = i__;
-			    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
-			    bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    ++k;
 /* L160: */
 			}
 		    }
 		    i__2 = j;
-		    bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__2] );
+		    bli_tsets( z,z, (bli_zreal(temp)), (bli_zimag(temp)), x[i__2] );
 		    kk += *n - j + 1;
 /* L170: */
 		}
@@ -1683,43 +1683,43 @@
 		i__1 = *n;
 		for (j = 1; j <= i__1; ++j) {
 		    i__2 = jx;
-		    bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
+		    bli_tsets( z,z, (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
 		    ix = jx;
 		    if (noconj) {
 			if (nounit) {
 			    i__2 = kk;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__2]) - bli_zimag(temp) * bli_zimag(ap[i__2])), (bli_zreal(temp) * bli_zimag(ap[i__2]) + bli_zimag(temp) * bli_zreal(ap[i__2])), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(ap[i__2]) - bli_zimag(temp) * bli_zimag(ap[i__2])), (bli_zreal(temp) * bli_zimag(ap[i__2]) + bli_zimag(temp) * bli_zreal(ap[i__2])), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 			i__2 = kk + *n - j;
 			for (k = kk + 1; k <= i__2; ++k) {
 			    ix += *incx;
 			    i__3 = k;
 			    i__4 = ix;
-			    bli_zsets( (bli_zreal(ap[i__3]) * bli_zreal(x[i__4]) - bli_zimag(ap[i__3]) * bli_zimag(x[i__4])), (bli_zreal(ap[i__3]) * bli_zimag(x[i__4]) + bli_zimag(ap[i__3]) * bli_zreal(x[i__4])), z__2 );
-			    bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(ap[i__3]) * bli_zreal(x[i__4]) - bli_zimag(ap[i__3]) * bli_zimag(x[i__4])), (bli_zreal(ap[i__3]) * bli_zimag(x[i__4]) + bli_zimag(ap[i__3]) * bli_zreal(x[i__4])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 /* L180: */
 			}
 		    } else {
 			if (nounit) {
 			    bla_d_cnjg(&z__2, &ap[kk]);
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 			i__2 = kk + *n - j;
 			for (k = kk + 1; k <= i__2; ++k) {
 			    ix += *incx;
 			    bla_d_cnjg(&z__3, &ap[k]);
 			    i__3 = ix;
-			    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
-			    bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 /* L190: */
 			}
 		    }
 		    i__2 = jx;
-		    bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__2] );
+		    bli_tsets( z,z, (bli_zreal(temp)), (bli_zimag(temp)), x[i__2] );
 		    jx += *incx;
 		    kk += *n - j + 1;
 /* L200: */
diff --git a/frame/compat/f2c/bla_tpsv.c b/frame/compat/f2c/bla_tpsv.c
index 6a4a5ab6c..a85cc0dba 100644
--- a/frame/compat/f2c/bla_tpsv.c
+++ b/frame/compat/f2c/bla_tpsv.c
@@ -224,18 +224,18 @@
 			if (nounit) {
 			    i__1 = j;
 			    bla_c_div(&q__1, &x[j], &ap[kk]);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] );
 			}
 			i__1 = j;
-			bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
+			bli_tsets( c,c, (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
 			k = kk - 1;
 			for (i__ = j - 1; i__ >= 1; --i__) {
 			    i__1 = i__;
 			    i__2 = i__;
 			    i__3 = k;
-			    bli_csets( (bli_creal(temp) * bli_creal(ap[i__3]) - bli_cimag(temp) * bli_cimag(ap[i__3])), (bli_creal(temp) * bli_cimag(ap[i__3]) + bli_cimag(temp) * bli_creal(ap[i__3])), q__2 );
-			    bli_csets( (bli_creal(x[i__2]) - bli_creal(q__2)), (bli_cimag(x[i__2]) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(ap[i__3]) - bli_cimag(temp) * bli_cimag(ap[i__3])), (bli_creal(temp) * bli_cimag(ap[i__3]) + bli_cimag(temp) * bli_creal(ap[i__3])), q__2 );
+			    bli_tsets( c,c, (bli_creal(x[i__2]) - bli_creal(q__2)), (bli_cimag(x[i__2]) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] );
 			    --k;
 /* L10: */
 			}
@@ -251,10 +251,10 @@
 			if (nounit) {
 			    i__1 = jx;
 			    bla_c_div(&q__1, &x[jx], &ap[kk]);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] );
 			}
 			i__1 = jx;
-			bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
+			bli_tsets( c,c, (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
 			ix = jx;
 			i__1 = kk - j + 1;
 			for (k = kk - 1; k >= i__1; --k) {
@@ -262,9 +262,9 @@
 			    i__2 = ix;
 			    i__3 = ix;
 			    i__4 = k;
-			    bli_csets( (bli_creal(temp) * bli_creal(ap[i__4]) - bli_cimag(temp) * bli_cimag(ap[i__4])), (bli_creal(temp) * bli_cimag(ap[i__4]) + bli_cimag(temp) * bli_creal(ap[i__4])), q__2 );
-			    bli_csets( (bli_creal(x[i__3]) - bli_creal(q__2)), (bli_cimag(x[i__3]) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(ap[i__4]) - bli_cimag(temp) * bli_cimag(ap[i__4])), (bli_creal(temp) * bli_cimag(ap[i__4]) + bli_cimag(temp) * bli_creal(ap[i__4])), q__2 );
+			    bli_tsets( c,c, (bli_creal(x[i__3]) - bli_creal(q__2)), (bli_cimag(x[i__3]) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
 /* L30: */
 			}
 		    }
@@ -283,19 +283,19 @@
 			if (nounit) {
 			    i__2 = j;
 			    bla_c_div(&q__1, &x[j], &ap[kk]);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
 			}
 			i__2 = j;
-			bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
+			bli_tsets( c,c, (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
 			k = kk + 1;
 			i__2 = *n;
 			for (i__ = j + 1; i__ <= i__2; ++i__) {
 			    i__3 = i__;
 			    i__4 = i__;
 			    i__5 = k;
-			    bli_csets( (bli_creal(temp) * bli_creal(ap[i__5]) - bli_cimag(temp) * bli_cimag(ap[i__5])), (bli_creal(temp) * bli_cimag(ap[i__5]) + bli_cimag(temp) * bli_creal(ap[i__5])), q__2 );
-			    bli_csets( (bli_creal(x[i__4]) - bli_creal(q__2)), (bli_cimag(x[i__4]) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(ap[i__5]) - bli_cimag(temp) * bli_cimag(ap[i__5])), (bli_creal(temp) * bli_cimag(ap[i__5]) + bli_cimag(temp) * bli_creal(ap[i__5])), q__2 );
+			    bli_tsets( c,c, (bli_creal(x[i__4]) - bli_creal(q__2)), (bli_cimag(x[i__4]) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] );
 			    ++k;
 /* L50: */
 			}
@@ -312,10 +312,10 @@
 			if (nounit) {
 			    i__2 = jx;
 			    bla_c_div(&q__1, &x[jx], &ap[kk]);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] );
 			}
 			i__2 = jx;
-			bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
+			bli_tsets( c,c, (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
 			ix = jx;
 			i__2 = kk + *n - j;
 			for (k = kk + 1; k <= i__2; ++k) {
@@ -323,9 +323,9 @@
 			    i__3 = ix;
 			    i__4 = ix;
 			    i__5 = k;
-			    bli_csets( (bli_creal(temp) * bli_creal(ap[i__5]) - bli_cimag(temp) * bli_cimag(ap[i__5])), (bli_creal(temp) * bli_cimag(ap[i__5]) + bli_cimag(temp) * bli_creal(ap[i__5])), q__2 );
-			    bli_csets( (bli_creal(x[i__4]) - bli_creal(q__2)), (bli_cimag(x[i__4]) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] );
+			    bli_tsets( c,c, (bli_creal(temp) * bli_creal(ap[i__5]) - bli_cimag(temp) * bli_cimag(ap[i__5])), (bli_creal(temp) * bli_cimag(ap[i__5]) + bli_cimag(temp) * bli_creal(ap[i__5])), q__2 );
+			    bli_tsets( c,c, (bli_creal(x[i__4]) - bli_creal(q__2)), (bli_cimag(x[i__4]) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] );
 /* L70: */
 			}
 		    }
@@ -345,42 +345,42 @@
 		i__1 = *n;
 		for (j = 1; j <= i__1; ++j) {
 		    i__2 = j;
-		    bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
+		    bli_tsets( c,c, (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
 		    k = kk;
 		    if (noconj) {
 			i__2 = j - 1;
 			for (i__ = 1; i__ <= i__2; ++i__) {
 			    i__3 = k;
 			    i__4 = i__;
-			    bli_csets( (bli_creal(ap[i__3]) * bli_creal(x[i__4]) - bli_cimag(ap[i__3]) * bli_cimag(x[i__4])), (bli_creal(ap[i__3]) * bli_cimag(x[i__4]) + bli_cimag(ap[i__3]) * bli_creal(x[i__4])), q__2 );
-			    bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(ap[i__3]) * bli_creal(x[i__4]) - bli_cimag(ap[i__3]) * bli_cimag(x[i__4])), (bli_creal(ap[i__3]) * bli_cimag(x[i__4]) + bli_cimag(ap[i__3]) * bli_creal(x[i__4])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    ++k;
 /* L90: */
 			}
 			if (nounit) {
 			    bla_c_div(&q__1, &temp, &ap[kk + j - 1]);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 		    } else {
 			i__2 = j - 1;
 			for (i__ = 1; i__ <= i__2; ++i__) {
 			    bla_r_cnjg(&q__3, &ap[k]);
 			    i__3 = i__;
-			    bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
-			    bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    ++k;
 /* L100: */
 			}
 			if (nounit) {
 			    bla_r_cnjg(&q__2, &ap[kk + j - 1]);
 			    bla_c_div(&q__1, &temp, &q__2);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 		    }
 		    i__2 = j;
-		    bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__2] );
+		    bli_tsets( c,c, (bli_creal(temp)), (bli_cimag(temp)), x[i__2] );
 		    kk += j;
 /* L110: */
 		}
@@ -389,42 +389,42 @@
 		i__1 = *n;
 		for (j = 1; j <= i__1; ++j) {
 		    i__2 = jx;
-		    bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
+		    bli_tsets( c,c, (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp );
 		    ix = kx;
 		    if (noconj) {
 			i__2 = kk + j - 2;
 			for (k = kk; k <= i__2; ++k) {
 			    i__3 = k;
 			    i__4 = ix;
-			    bli_csets( (bli_creal(ap[i__3]) * bli_creal(x[i__4]) - bli_cimag(ap[i__3]) * bli_cimag(x[i__4])), (bli_creal(ap[i__3]) * bli_cimag(x[i__4]) + bli_cimag(ap[i__3]) * bli_creal(x[i__4])), q__2 );
-			    bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(ap[i__3]) * bli_creal(x[i__4]) - bli_cimag(ap[i__3]) * bli_cimag(x[i__4])), (bli_creal(ap[i__3]) * bli_cimag(x[i__4]) + bli_cimag(ap[i__3]) * bli_creal(x[i__4])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    ix += *incx;
 /* L120: */
 			}
 			if (nounit) {
 			    bla_c_div(&q__1, &temp, &ap[kk + j - 1]);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 		    } else {
 			i__2 = kk + j - 2;
 			for (k = kk; k <= i__2; ++k) {
 			    bla_r_cnjg(&q__3, &ap[k]);
 			    i__3 = ix;
-			    bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
-			    bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    ix += *incx;
 /* L130: */
 			}
 			if (nounit) {
 			    bla_r_cnjg(&q__2, &ap[kk + j - 1]);
 			    bla_c_div(&q__1, &temp, &q__2);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 		    }
 		    i__2 = jx;
-		    bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__2] );
+		    bli_tsets( c,c, (bli_creal(temp)), (bli_cimag(temp)), x[i__2] );
 		    jx += *incx;
 		    kk += j;
 /* L140: */
@@ -435,42 +435,42 @@
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
 		    i__1 = j;
-		    bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
+		    bli_tsets( c,c, (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
 		    k = kk;
 		    if (noconj) {
 			i__1 = j + 1;
 			for (i__ = *n; i__ >= i__1; --i__) {
 			    i__2 = k;
 			    i__3 = i__;
-			    bli_csets( (bli_creal(ap[i__2]) * bli_creal(x[i__3]) - bli_cimag(ap[i__2]) * bli_cimag(x[i__3])), (bli_creal(ap[i__2]) * bli_cimag(x[i__3]) + bli_cimag(ap[i__2]) * bli_creal(x[i__3])), q__2 );
-			    bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(ap[i__2]) * bli_creal(x[i__3]) - bli_cimag(ap[i__2]) * bli_cimag(x[i__3])), (bli_creal(ap[i__2]) * bli_cimag(x[i__3]) + bli_cimag(ap[i__2]) * bli_creal(x[i__3])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    --k;
 /* L150: */
 			}
 			if (nounit) {
 			    bla_c_div(&q__1, &temp, &ap[kk - *n + j]);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 		    } else {
 			i__1 = j + 1;
 			for (i__ = *n; i__ >= i__1; --i__) {
 			    bla_r_cnjg(&q__3, &ap[k]);
 			    i__2 = i__;
-			    bli_csets( (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 );
-			    bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    --k;
 /* L160: */
 			}
 			if (nounit) {
 			    bla_r_cnjg(&q__2, &ap[kk - *n + j]);
 			    bla_c_div(&q__1, &temp, &q__2);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 		    }
 		    i__1 = j;
-		    bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__1] );
+		    bli_tsets( c,c, (bli_creal(temp)), (bli_cimag(temp)), x[i__1] );
 		    kk -= *n - j + 1;
 /* L170: */
 		}
@@ -479,42 +479,42 @@
 		jx = kx;
 		for (j = *n; j >= 1; --j) {
 		    i__1 = jx;
-		    bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
+		    bli_tsets( c,c, (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp );
 		    ix = kx;
 		    if (noconj) {
 			i__1 = kk - (*n - (j + 1));
 			for (k = kk; k >= i__1; --k) {
 			    i__2 = k;
 			    i__3 = ix;
-			    bli_csets( (bli_creal(ap[i__2]) * bli_creal(x[i__3]) - bli_cimag(ap[i__2]) * bli_cimag(x[i__3])), (bli_creal(ap[i__2]) * bli_cimag(x[i__3]) + bli_cimag(ap[i__2]) * bli_creal(x[i__3])), q__2 );
-			    bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(ap[i__2]) * bli_creal(x[i__3]) - bli_cimag(ap[i__2]) * bli_cimag(x[i__3])), (bli_creal(ap[i__2]) * bli_cimag(x[i__3]) + bli_cimag(ap[i__2]) * bli_creal(x[i__3])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    ix -= *incx;
 /* L180: */
 			}
 			if (nounit) {
 			    bla_c_div(&q__1, &temp, &ap[kk - *n + j]);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 		    } else {
 			i__1 = kk - (*n - (j + 1));
 			for (k = kk; k >= i__1; --k) {
 			    bla_r_cnjg(&q__3, &ap[k]);
 			    i__2 = ix;
-			    bli_csets( (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 );
-			    bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 );
+			    bli_tsets( c,c, (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			    ix -= *incx;
 /* L190: */
 			}
 			if (nounit) {
 			    bla_r_cnjg(&q__2, &ap[kk - *n + j]);
 			    bla_c_div(&q__1, &temp, &q__2);
-			    bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp );
+			    bli_tsets( c,c, (bli_creal(q__1)), (bli_cimag(q__1)), temp );
 			}
 		    }
 		    i__1 = jx;
-		    bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__1] );
+		    bli_tsets( c,c, (bli_creal(temp)), (bli_cimag(temp)), x[i__1] );
 		    jx -= *incx;
 		    kk -= *n - j + 1;
 /* L200: */
@@ -1420,18 +1420,18 @@
 			if (nounit) {
 			    i__1 = j;
 			    bla_z_div(&z__1, &x[j], &ap[kk]);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] );
 			}
 			i__1 = j;
-			bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
+			bli_tsets( z,z, (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
 			k = kk - 1;
 			for (i__ = j - 1; i__ >= 1; --i__) {
 			    i__1 = i__;
 			    i__2 = i__;
 			    i__3 = k;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__3]) - bli_zimag(temp) * bli_zimag(ap[i__3])), (bli_zreal(temp) * bli_zimag(ap[i__3]) + bli_zimag(temp) * bli_zreal(ap[i__3])), z__2 );
-			    bli_zsets( (bli_zreal(x[i__2]) - bli_zreal(z__2)), (bli_zimag(x[i__2]) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(ap[i__3]) - bli_zimag(temp) * bli_zimag(ap[i__3])), (bli_zreal(temp) * bli_zimag(ap[i__3]) + bli_zimag(temp) * bli_zreal(ap[i__3])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(x[i__2]) - bli_zreal(z__2)), (bli_zimag(x[i__2]) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] );
 			    --k;
 /* L10: */
 			}
@@ -1447,10 +1447,10 @@
 			if (nounit) {
 			    i__1 = jx;
 			    bla_z_div(&z__1, &x[jx], &ap[kk]);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] );
 			}
 			i__1 = jx;
-			bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
+			bli_tsets( z,z, (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
 			ix = jx;
 			i__1 = kk - j + 1;
 			for (k = kk - 1; k >= i__1; --k) {
@@ -1458,9 +1458,9 @@
 			    i__2 = ix;
 			    i__3 = ix;
 			    i__4 = k;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__4]) - bli_zimag(temp) * bli_zimag(ap[i__4])), (bli_zreal(temp) * bli_zimag(ap[i__4]) + bli_zimag(temp) * bli_zreal(ap[i__4])), z__2 );
-			    bli_zsets( (bli_zreal(x[i__3]) - bli_zreal(z__2)), (bli_zimag(x[i__3]) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(ap[i__4]) - bli_zimag(temp) * bli_zimag(ap[i__4])), (bli_zreal(temp) * bli_zimag(ap[i__4]) + bli_zimag(temp) * bli_zreal(ap[i__4])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(x[i__3]) - bli_zreal(z__2)), (bli_zimag(x[i__3]) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
 /* L30: */
 			}
 		    }
@@ -1479,19 +1479,19 @@
 			if (nounit) {
 			    i__2 = j;
 			    bla_z_div(&z__1, &x[j], &ap[kk]);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
 			}
 			i__2 = j;
-			bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
+			bli_tsets( z,z, (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
 			k = kk + 1;
 			i__2 = *n;
 			for (i__ = j + 1; i__ <= i__2; ++i__) {
 			    i__3 = i__;
 			    i__4 = i__;
 			    i__5 = k;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__5]) - bli_zimag(temp) * bli_zimag(ap[i__5])), (bli_zreal(temp) * bli_zimag(ap[i__5]) + bli_zimag(temp) * bli_zreal(ap[i__5])), z__2 );
-			    bli_zsets( (bli_zreal(x[i__4]) - bli_zreal(z__2)), (bli_zimag(x[i__4]) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(ap[i__5]) - bli_zimag(temp) * bli_zimag(ap[i__5])), (bli_zreal(temp) * bli_zimag(ap[i__5]) + bli_zimag(temp) * bli_zreal(ap[i__5])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(x[i__4]) - bli_zreal(z__2)), (bli_zimag(x[i__4]) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] );
 			    ++k;
 /* L50: */
 			}
@@ -1508,10 +1508,10 @@
 			if (nounit) {
 			    i__2 = jx;
 			    bla_z_div(&z__1, &x[jx], &ap[kk]);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] );
 			}
 			i__2 = jx;
-			bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
+			bli_tsets( z,z, (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
 			ix = jx;
 			i__2 = kk + *n - j;
 			for (k = kk + 1; k <= i__2; ++k) {
@@ -1519,9 +1519,9 @@
 			    i__3 = ix;
 			    i__4 = ix;
 			    i__5 = k;
-			    bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__5]) - bli_zimag(temp) * bli_zimag(ap[i__5])), (bli_zreal(temp) * bli_zimag(ap[i__5]) + bli_zimag(temp) * bli_zreal(ap[i__5])), z__2 );
-			    bli_zsets( (bli_zreal(x[i__4]) - bli_zreal(z__2)), (bli_zimag(x[i__4]) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] );
+			    bli_tsets( z,z, (bli_zreal(temp) * bli_zreal(ap[i__5]) - bli_zimag(temp) * bli_zimag(ap[i__5])), (bli_zreal(temp) * bli_zimag(ap[i__5]) + bli_zimag(temp) * bli_zreal(ap[i__5])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(x[i__4]) - bli_zreal(z__2)), (bli_zimag(x[i__4]) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] );
 /* L70: */
 			}
 		    }
@@ -1541,42 +1541,42 @@
 		i__1 = *n;
 		for (j = 1; j <= i__1; ++j) {
 		    i__2 = j;
-		    bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
+		    bli_tsets( z,z, (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
 		    k = kk;
 		    if (noconj) {
 			i__2 = j - 1;
 			for (i__ = 1; i__ <= i__2; ++i__) {
 			    i__3 = k;
 			    i__4 = i__;
-			    bli_zsets( (bli_zreal(ap[i__3]) * bli_zreal(x[i__4]) - bli_zimag(ap[i__3]) * bli_zimag(x[i__4])), (bli_zreal(ap[i__3]) * bli_zimag(x[i__4]) + bli_zimag(ap[i__3]) * bli_zreal(x[i__4])), z__2 );
-			    bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(ap[i__3]) * bli_zreal(x[i__4]) - bli_zimag(ap[i__3]) * bli_zimag(x[i__4])), (bli_zreal(ap[i__3]) * bli_zimag(x[i__4]) + bli_zimag(ap[i__3]) * bli_zreal(x[i__4])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    ++k;
 /* L90: */
 			}
 			if (nounit) {
 			    bla_z_div(&z__1, &temp, &ap[kk + j - 1]);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 		    } else {
 			i__2 = j - 1;
 			for (i__ = 1; i__ <= i__2; ++i__) {
 			    bla_d_cnjg(&z__3, &ap[k]);
 			    i__3 = i__;
-			    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
-			    bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    ++k;
 /* L100: */
 			}
 			if (nounit) {
 			    bla_d_cnjg(&z__2, &ap[kk + j - 1]);
 			    bla_z_div(&z__1, &temp, &z__2);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 		    }
 		    i__2 = j;
-		    bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__2] );
+		    bli_tsets( z,z, (bli_zreal(temp)), (bli_zimag(temp)), x[i__2] );
 		    kk += j;
 /* L110: */
 		}
@@ -1585,42 +1585,42 @@
 		i__1 = *n;
 		for (j = 1; j <= i__1; ++j) {
 		    i__2 = jx;
-		    bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
+		    bli_tsets( z,z, (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp );
 		    ix = kx;
 		    if (noconj) {
 			i__2 = kk + j - 2;
 			for (k = kk; k <= i__2; ++k) {
 			    i__3 = k;
 			    i__4 = ix;
-			    bli_zsets( (bli_zreal(ap[i__3]) * bli_zreal(x[i__4]) - bli_zimag(ap[i__3]) * bli_zimag(x[i__4])), (bli_zreal(ap[i__3]) * bli_zimag(x[i__4]) + bli_zimag(ap[i__3]) * bli_zreal(x[i__4])), z__2 );
-			    bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(ap[i__3]) * bli_zreal(x[i__4]) - bli_zimag(ap[i__3]) * bli_zimag(x[i__4])), (bli_zreal(ap[i__3]) * bli_zimag(x[i__4]) + bli_zimag(ap[i__3]) * bli_zreal(x[i__4])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    ix += *incx;
 /* L120: */
 			}
 			if (nounit) {
 			    bla_z_div(&z__1, &temp, &ap[kk + j - 1]);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 		    } else {
 			i__2 = kk + j - 2;
 			for (k = kk; k <= i__2; ++k) {
 			    bla_d_cnjg(&z__3, &ap[k]);
 			    i__3 = ix;
-			    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
-			    bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    ix += *incx;
 /* L130: */
 			}
 			if (nounit) {
 			    bla_d_cnjg(&z__2, &ap[kk + j - 1]);
 			    bla_z_div(&z__1, &temp, &z__2);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 		    }
 		    i__2 = jx;
-		    bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__2] );
+		    bli_tsets( z,z, (bli_zreal(temp)), (bli_zimag(temp)), x[i__2] );
 		    jx += *incx;
 		    kk += j;
 /* L140: */
@@ -1631,42 +1631,42 @@
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
 		    i__1 = j;
-		    bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
+		    bli_tsets( z,z, (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
 		    k = kk;
 		    if (noconj) {
 			i__1 = j + 1;
 			for (i__ = *n; i__ >= i__1; --i__) {
 			    i__2 = k;
 			    i__3 = i__;
-			    bli_zsets( (bli_zreal(ap[i__2]) * bli_zreal(x[i__3]) - bli_zimag(ap[i__2]) * bli_zimag(x[i__3])), (bli_zreal(ap[i__2]) * bli_zimag(x[i__3]) + bli_zimag(ap[i__2]) * bli_zreal(x[i__3])), z__2 );
-			    bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(ap[i__2]) * bli_zreal(x[i__3]) - bli_zimag(ap[i__2]) * bli_zimag(x[i__3])), (bli_zreal(ap[i__2]) * bli_zimag(x[i__3]) + bli_zimag(ap[i__2]) * bli_zreal(x[i__3])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    --k;
 /* L150: */
 			}
 			if (nounit) {
 			    bla_z_div(&z__1, &temp, &ap[kk - *n + j]);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 		    } else {
 			i__1 = j + 1;
 			for (i__ = *n; i__ >= i__1; --i__) {
 			    bla_d_cnjg(&z__3, &ap[k]);
 			    i__2 = i__;
-			    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 );
-			    bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    --k;
 /* L160: */
 			}
 			if (nounit) {
 			    bla_d_cnjg(&z__2, &ap[kk - *n + j]);
 			    bla_z_div(&z__1, &temp, &z__2);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 		    }
 		    i__1 = j;
-		    bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__1] );
+		    bli_tsets( z,z, (bli_zreal(temp)), (bli_zimag(temp)), x[i__1] );
 		    kk -= *n - j + 1;
 /* L170: */
 		}
@@ -1675,42 +1675,42 @@
 		jx = kx;
 		for (j = *n; j >= 1; --j) {
 		    i__1 = jx;
-		    bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
+		    bli_tsets( z,z, (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp );
 		    ix = kx;
 		    if (noconj) {
 			i__1 = kk - (*n - (j + 1));
 			for (k = kk; k >= i__1; --k) {
 			    i__2 = k;
 			    i__3 = ix;
-			    bli_zsets( (bli_zreal(ap[i__2]) * bli_zreal(x[i__3]) - bli_zimag(ap[i__2]) * bli_zimag(x[i__3])), (bli_zreal(ap[i__2]) * bli_zimag(x[i__3]) + bli_zimag(ap[i__2]) * bli_zreal(x[i__3])), z__2 );
-			    bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(ap[i__2]) * bli_zreal(x[i__3]) - bli_zimag(ap[i__2]) * bli_zimag(x[i__3])), (bli_zreal(ap[i__2]) * bli_zimag(x[i__3]) + bli_zimag(ap[i__2]) * bli_zreal(x[i__3])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    ix -= *incx;
 /* L180: */
 			}
 			if (nounit) {
 			    bla_z_div(&z__1, &temp, &ap[kk - *n + j]);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 		    } else {
 			i__1 = kk - (*n - (j + 1));
 			for (k = kk; k >= i__1; --k) {
 			    bla_d_cnjg(&z__3, &ap[k]);
 			    i__2 = ix;
-			    bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 );
-			    bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 );
+			    bli_tsets( z,z, (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			    ix -= *incx;
 /* L190: */
 			}
 			if (nounit) {
 			    bla_d_cnjg(&z__2, &ap[kk - *n + j]);
 			    bla_z_div(&z__1, &temp, &z__2);
-			    bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
+			    bli_tsets( z,z, (bli_zreal(z__1)), (bli_zimag(z__1)), temp );
 			}
 		    }
 		    i__1 = jx;
-		    bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__1] );
+		    bli_tsets( z,z, (bli_zreal(temp)), (bli_zimag(temp)), x[i__1] );
 		    jx -= *incx;
 		    kk -= *n - j + 1;
 /* L200: */
diff --git a/frame/compat/f2c/util/bla_c_div.c b/frame/compat/f2c/util/bla_c_div.c
index 975f49b0a..c991f7bc1 100644
--- a/frame/compat/f2c/util/bla_c_div.c
+++ b/frame/compat/f2c/util/bla_c_div.c
@@ -38,8 +38,8 @@
 
 void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp)
 {
-	bli_ccopys( *ap, *cp );
-	bli_cinvscals( *bp, *cp );
+	bli_tcopys( c,c, *ap, *cp );
+	bli_tinvscals( c,c,c, *bp, *cp );
 }
 
 #endif
diff --git a/frame/compat/f2c/util/bla_d_cnjg.c b/frame/compat/f2c/util/bla_d_cnjg.c
index 43dc9758c..f6df81cf2 100644
--- a/frame/compat/f2c/util/bla_d_cnjg.c
+++ b/frame/compat/f2c/util/bla_d_cnjg.c
@@ -38,7 +38,7 @@
 
 void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src)
 {
-	bli_zcopyjs( *src, *dest );
+	bli_tcopyjs( z,z, *src, *dest );
 }
 
 #endif
diff --git a/frame/compat/f2c/util/bla_r_cnjg.c b/frame/compat/f2c/util/bla_r_cnjg.c
index 42b25d575..497dcfa38 100644
--- a/frame/compat/f2c/util/bla_r_cnjg.c
+++ b/frame/compat/f2c/util/bla_r_cnjg.c
@@ -38,7 +38,7 @@
 
 void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src)
 {
-	bli_ccopyjs( *src, *dest );
+	bli_tcopyjs( c,c, *src, *dest );
 }
 
 #endif
diff --git a/frame/compat/f2c/util/bla_z_div.c b/frame/compat/f2c/util/bla_z_div.c
index 3d36a8ac8..80cf30fbd 100644
--- a/frame/compat/f2c/util/bla_z_div.c
+++ b/frame/compat/f2c/util/bla_z_div.c
@@ -38,8 +38,8 @@
 
 void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp)
 {
-	bli_zcopys( *ap, *cp );
-	bli_zinvscals( *bp, *cp );
+	bli_tcopys( z,z, *ap, *cp );
+	bli_tinvscals( z,z,z, *bp, *cp );
 }
 
 #endif
diff --git a/frame/include/bli_cast_macro_defs.h b/frame/include/bli_cast_macro_defs.h
new file mode 100644
index 000000000..3033276ff
--- /dev/null
+++ b/frame/include/bli_cast_macro_defs.h
@@ -0,0 +1,529 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_CAST_MACRO_DEFS_H
+#define BLIS_CAST_MACRO_DEFS_H
+
+// -- Typecast { bfloat16 | float | double } to bfloat16 -----------------------
+
+#ifdef BFLOAT
+BLIS_INLINE bfloat bli_bbcast( bfloat b )
+{
+	return b;
+}
+#endif
+
+#ifdef BFLOAT
+BLIS_INLINE bfloat bli_sbcast( float s )
+{
+	bfloat b;
+
+	// View the float as a char array so individual bytes can be addressed.
+	char* s_ch = ( char* )&s;
+
+	// Copy bytes 2-3 of the float into a local bfloat16; bytes 0-1 are dropped without rounding. NOTE(review): bytes 2-3 are the upper half only on little-endian hosts -- confirm.
+	memcpy( &b, &s_ch[2], 2 );
+
+	return b;
+}
+#endif
+
+#ifdef BFLOAT
+BLIS_INLINE bfloat bli_dbcast( double d )
+{
+	bfloat b;
+
+	// Typecast double input argument to a local float.
+	float s = ( float )d;
+
+	// View the float as a char array so individual bytes can be addressed.
+	char* s_ch = ( char* )&s;
+
+	// Copy bytes 2-3 of the float into a local bfloat16; bytes 0-1 are dropped without rounding. NOTE(review): bytes 2-3 are the upper half only on little-endian hosts -- confirm.
+	memcpy( &b, &s_ch[2], 2 );
+
+	return b;
+}
+#endif
+
+// -- Typecast { bfloat16 | float | double | int } to float --------------------------
+
+#ifdef BFLOAT
+BLIS_INLINE float bli_bscast( bfloat b )
+{
+	// Start from 0.0F so the bytes not overwritten below keep that value's bits.
+	float s = 0.0F;
+
+	// View the float as a char array so individual bytes can be addressed.
+	char* s_ch = ( char* )&s;
+
+	// Copy the bfloat16 into bytes 2-3 of the float; bytes 0-1 are left as initialized. NOTE(review): bytes 2-3 are the upper half only on little-endian hosts -- confirm.
+	memcpy( &s_ch[2], &b, 2 );
+
+	return s;
+}
+#endif
+
+BLIS_INLINE float bli_sscast( float s )
+{
+	return s;
+}
+
+BLIS_INLINE float bli_dscast( double d )
+{
+	return ( float )d;
+}
+
+BLIS_INLINE float bli_iscast( dim_t i )
+{
+	return ( float )i;
+}
+
+// -- Typecast { bfloat16 | float | double | int } to double -------------------------
+
+#ifdef BFLOAT
+BLIS_INLINE double bli_bdcast( bfloat b )
+{
+	// Start from 0.0F so the bytes not overwritten below keep that value's bits.
+	float s = 0.0F;
+
+	// View the float as a char array so individual bytes can be addressed.
+	char* s_ch = ( char* )&s;
+
+	// Copy the bfloat16 into bytes 2-3 of the float; bytes 0-1 are left as initialized. NOTE(review): bytes 2-3 are the upper half only on little-endian hosts -- confirm.
+	memcpy( &s_ch[2], &b, 2 );
+
+	return ( double )s;
+}
+#endif
+
+BLIS_INLINE double bli_sdcast( float s )
+{
+	return ( double )s;
+}
+
+//#if 1
+BLIS_INLINE double bli_ddcast( double d )
+{
+	return d;
+}
+//#else
+//#define bli_ddcast( d )  ( d )
+//#endif
+
+BLIS_INLINE double bli_idcast( dim_t i )
+{
+	return ( double )i;
+}
+
+// -- Typecast { float | double | int } to int -------------------------
+
+BLIS_INLINE dim_t bli_sicast( float s )
+{
+	return ( dim_t )s;
+}
+
+BLIS_INLINE dim_t bli_dicast( double d )
+{
+	return ( dim_t )d;
+}
+
+BLIS_INLINE dim_t bli_iicast( dim_t i )
+{
+	return i;
+}
+
+#if 0
+// -- Fused real/imag accessor + typecast --------------------------------------
+
+// Generate static functions that fuse two operations:
+// - accessing the real and imaginary components of all datatypes (real
+//   and complex)
+// - typecasting a real (or imaginary) component to any real datatype
+// Examples:
+// static float  bli_dreals( double   a ) { return bli_dscast( bli_dreal( a ) ); }
+// static double bli_sreald( float    a ) { return bli_sdcast( bli_sreal( a ) ); }
+// static float  bli_creals( scomplex a ) { return bli_sscast( bli_creal( a ) ); }
+// static double bli_cimagd( scomplex a ) { return bli_sdcast( bli_cimag( a ) ); }
+
+#undef  GENTFUNC
+#define GENTFUNC( chi, cho ) \
+\
+BLIS_INLINE PASTEMAC(cho,ctype) PASTEMAC2(chi,real,cho)( PASTEMAC(chi,ctype) a ) \
+{ \
+	return PASTEMAC2(chi,cho,cast)( PASTEMAC(chi,real)( a ) ); \
+} \
+BLIS_INLINE PASTEMAC(cho,ctype) PASTEMAC2(chi,imag,cho)( PASTEMAC(chi,ctype) a ) \
+{ \
+	return PASTEMAC2(chi,cho,cast)( PASTEMAC(chi,imag)( a ) ); \
+}
+
+// NOTE: We only have to generate functions that output to types [bsd] because
+// these macros only need to output real types. The composition that allows
+// complex types will be handled by the consumers to these bli_?[real|imag]?()
+// functions.
+
+// [bsdkcz][bsd]
+
+GENTFUNC( b, b )
+GENTFUNC( s, b )
+GENTFUNC( d, b )
+GENTFUNC( k, b )
+GENTFUNC( c, b )
+GENTFUNC( z, b )
+
+GENTFUNC( b, s )
+GENTFUNC( s, s )
+GENTFUNC( d, s )
+GENTFUNC( k, s )
+GENTFUNC( c, s )
+GENTFUNC( z, s )
+
+GENTFUNC( b, d )
+GENTFUNC( s, d )
+GENTFUNC( d, d )
+GENTFUNC( k, d )
+GENTFUNC( c, d )
+GENTFUNC( z, d )
+#endif
+
+// bli_xytcast() macros are only used in the definitions of level0 scalar
+// macros. There, we use a different name from the actual cast functions--
+// which are named using the format bli_xycast()--so that we can optionally
+// replace them as part of the optimization below without disturbing any
+// other uses of bli_xycast() that should not be changed.
+
+#define bli_bbtcast  bli_bbcast
+#define bli_sbtcast  bli_sbcast
+#define bli_dbtcast  bli_dbcast
+#define bli_kbtcast  bli_kbcast
+#define bli_cbtcast  bli_cbcast
+#define bli_zbtcast  bli_zbcast
+
+#define bli_bstcast  bli_bscast
+#define bli_sstcast  bli_sscast
+#define bli_dstcast  bli_dscast
+#define bli_kstcast  bli_kscast
+#define bli_cstcast  bli_cscast
+#define bli_zstcast  bli_zscast
+#define bli_istcast  bli_iscast
+
+#define bli_bdtcast  bli_bdcast
+#define bli_sdtcast  bli_sdcast
+#define bli_ddtcast  bli_ddcast
+#define bli_kdtcast  bli_kdcast
+#define bli_cdtcast  bli_cdcast
+#define bli_zdtcast  bli_zdcast
+#define bli_idtcast  bli_idcast
+
+#define bli_sitcast  bli_sicast
+#define bli_ditcast  bli_dicast
+#define bli_iitcast  bli_iicast
+
+// An optimization. In situations where computations would normally occur
+// in bfloat, redundant typecasting may occur. For example, in the case of
+// performing ssbbaxpy (a and x stored in type s; y stored in type b;
+// compute in b), a and x would normally be typecast to b so that all
+// operands are in the computation precision (namely, bfloat), but since
+// our reference implementation implements bfloat flops in terms of float
+// flops, all operands would need to be typecast back to s anyway just so
+// the computation can take place. This means that a and x were truncated
+// down to bfloat (and thus lost precision) somewhat unnecessarily. Instead,
+// what could happen is that a and x remain in s, y is typecast to s,
+// computation would take place in s, and then the result is truncated to
+// bfloat on output to y. These macros substitute certain static function
+// calls to be the equivalent calls that would cast to float instead of
+// bfloat.
+#ifdef BLIS_OPTIMIZE_BFLOAT_AS_FLOAT
+
+#undef  bli_bbcast
+#define bli_bbcast  bli_bscast
+#undef  bli_sbcast
+#define bli_sbcast  bli_sscast
+#undef  bli_dbcast
+#define bli_dbcast  bli_dscast
+#undef  bli_kbcast
+#define bli_kbcast  bli_kscast
+#undef  bli_cbcast
+#define bli_cbcast  bli_cscast
+#undef  bli_zbcast
+#define bli_zbcast  bli_zscast
+
+#endif
+
+
+// -- Basic constants (per precision) ------------------------------------------
+
+#ifdef BLIS_OPTIMIZE_BFLOAT_AS_FLOAT
+
+#define bli_btwo                bli_stwo
+#define bli_bone                bli_sone
+#define bli_bzero               bli_szero
+#define bli_bmone               bli_smone
+#define bli_bmtwo               bli_smtwo
+
+#else
+
+#define bli_btwo    bli_sbcast( bli_stwo )
+#define bli_bone    bli_sbcast( bli_sone )
+#define bli_bzero   bli_sbcast( bli_szero )
+#define bli_bmone   bli_sbcast( bli_smone )
+#define bli_bmtwo   bli_sbcast( bli_smtwo )
+
+#endif
+
+#define bli_stwo    2.0F
+#define bli_sone    1.0F
+#define bli_szero   0.0F
+#define bli_smone  -1.0F
+#define bli_smtwo  -2.0F
+
+#define bli_dtwo    2.0
+#define bli_done    1.0
+#define bli_dzero   0.0
+#define bli_dmone  -1.0
+#define bli_dmtwo  -2.0
+
+// -- Basic arithmetic operations (per precision) ------------------------------
+
+#ifdef BLIS_OPTIMIZE_BFLOAT_AS_FLOAT
+
+#define bli_bmul( a, b )                  bli_smul(             a,             b  )
+#define bli_bdiv( a, b )                  bli_sdiv(             a,             b  )
+#define bli_badd( a, b )                  bli_sadd(             a,             b  )
+#define bli_bsub( a, b )                  bli_ssub(             a,             b  )
+#define bli_bneg( a )                     bli_sneg(             a                 )
+#define bli_bsqrt( a )                    bli_ssqrt(            a                 )
+#define bli_bhypot( a, b )                bli_shypot(            a,             b  )
+
+#else
+
+#define bli_bmul( a, b )      bli_sbcast( bli_smul(  bli_bscast(a), bli_bscast(b) ) )
+#define bli_bdiv( a, b )      bli_sbcast( bli_sdiv(  bli_bscast(a), bli_bscast(b) ) )
+#define bli_badd( a, b )      bli_sbcast( bli_sadd(  bli_bscast(a), bli_bscast(b) ) )
+#define bli_bsub( a, b )      bli_sbcast( bli_ssub(  bli_bscast(a), bli_bscast(b) ) )
+#define bli_bneg( a )         bli_sbcast( bli_sneg(  bli_bscast(a)                ) )
+#define bli_bsqrt( a )        bli_sbcast( bli_ssqrt( bli_bscast(a)                ) )
+#define bli_bhypot( a, b )    bli_sbcast( bli_shypot( bli_bscast(a), bli_bscast(b) ) )
+
+#endif
+
+#define bli_smul( a, b )       ( (a) * (b) )
+#define bli_sdiv( a, b )       ( (a) / (b) )
+#define bli_sadd( a, b )       ( (a) + (b) )
+#define bli_ssub( a, b )       ( (a) - (b) )
+#define bli_sneg( a )          ( -(a) )
+#define bli_ssqrt( a )         sqrtf(a)
+#define bli_shypot( a, b )     hypotf(a,b)
+
+#define bli_dmul( a, b )       ( (a) * (b) )
+#define bli_ddiv( a, b )       ( (a) / (b) )
+#define bli_dadd( a, b )       ( (a) + (b) )
+#define bli_dsub( a, b )       ( (a) - (b) )
+#define bli_dneg( a )          ( -(a) )
+#define bli_dsqrt( a )         sqrt(a)
+#define bli_dhypot( a, b )     hypot(a,b)
+
+// -- Basic compare operations (per precision) ---------------------------------
+
+#ifdef BLIS_OPTIMIZE_BFLOAT_AS_FLOAT
+
+#define bli_beq( a, b )                  bli_seq(            a,             b  )
+#define bli_blt( a, b )                  bli_slt(            a,             b  )
+#define bli_ble( a, b )                  bli_sle(            a,             b  )
+#define bli_bgt( a, b )                  bli_sgt(            a,             b  )
+#define bli_bge( a, b )                  bli_sge(            a,             b  )
+
+#else
+
+#define bli_beq( a, b )      bli_sbcast( bli_seq( bli_bscast(a), bli_bscast(b) ) ) // NOTE(review): the comparison result is passed through bli_sbcast -- confirm this is intended
+#define bli_blt( a, b )      bli_sbcast( bli_slt( bli_bscast(a), bli_bscast(b) ) )
+#define bli_ble( a, b )      bli_sbcast( bli_sle( bli_bscast(a), bli_bscast(b) ) )
+#define bli_bgt( a, b )      bli_sbcast( bli_sgt( bli_bscast(a), bli_bscast(b) ) )
+#define bli_bge( a, b )      bli_sbcast( bli_sge( bli_bscast(a), bli_bscast(b) ) )
+
+#endif
+
+#define bli_seq( a, b )  ( (a) == (b) )  // s (float) compares; arguments are parenthesized
+#define bli_slt( a, b )  ( (a) <  (b) )
+#define bli_sle( a, b )  ( (a) <= (b) )
+#define bli_sgt( a, b )  ( (a) >  (b) )
+#define bli_sge( a, b )  ( (a) >= (b) )
+
+#define bli_deq( a, b )  ( (a) == (b) )  // d (double) compares; arguments are parenthesized
+#define bli_dlt( a, b )  ( (a) <  (b) )
+#define bli_dle( a, b )  ( (a) <= (b) )
+#define bli_dgt( a, b )  ( (a) >  (b) )
+#define bli_dge( a, b )  ( (a) >= (b) )
+
+#define bli_ieq( a, b )  ( (a) == (b) )  // i (integer) compares; arguments are parenthesized
+#define bli_ilt( a, b )  ( (a) <  (b) )
+#define bli_ile( a, b )  ( (a) <= (b) )
+#define bli_igt( a, b )  ( (a) >  (b) )
+#define bli_ige( a, b )  ( (a) >= (b) )
+
+// -- Min/max/abs/etc. operations (per precision) ------------------------------
+
+#ifdef BLIS_OPTIMIZE_BFLOAT_AS_FLOAT
+
+#define bli_bmin( a, b )                   bli_smin(               a,             b  )  // this branch forwards to the s-precision macros without casting
+#define bli_bmax( a, b )                   bli_smax(               a,             b  )
+#define bli_babs( a )                      bli_sabs(               a                 )
+#define bli_bminabs( a, b )                bli_sminabs(            a,             b  )
+#define bli_bmaxabs( a, b )                bli_smaxabs(            a,             b  )
+#define bli_bcopysign( a, b )            ( bli_slt(            b , bli_szero ) \
+                                           ? bli_sneg( bli_sabs(            a  ) ) \
+                                           :           bli_sabs(            a  )   )
+
+#else
+
+#define bli_bmin( a, b )       bli_sbcast(    bli_smin( bli_bscast(a), bli_bscast(b) ) )
+#define bli_bmax( a, b )       bli_sbcast(    bli_smax( bli_bscast(a), bli_bscast(b) ) )
+#define bli_babs( a )          bli_sbcast(    bli_sabs( bli_bscast(a)                ) )
+#define bli_bminabs( a, b )    bli_sbcast( bli_sminabs( bli_bscast(a), bli_bscast(b) ) )
+#define bli_bmaxabs( a, b )    bli_sbcast( bli_smaxabs( bli_bscast(a), bli_bscast(b) ) )
+#define bli_bcopysign( a, b )  bli_sbcast( bli_slt( bli_bscast(b), bli_szero ) \
+                                           ? bli_sneg( bli_sabs( bli_bscast(a) ) ) \
+                                           :           bli_sabs( bli_bscast(a) )   )
+
+#endif
+
+#define bli_smin( a, b )       ( bli_slt( (a), (b) ) ? (a) : (b) )  // NOTE: a and b are evaluated more than once
+#define bli_smax( a, b )       ( bli_sgt( (a), (b) ) ? (a) : (b) )  // NOTE: a and b are evaluated more than once
+//#define bli_sabs( a )          ( bli_slt( a, PASTEMAC(s,zero) ) ? -(a) : a )
+#define bli_sabs( a )          ( fabsf(a) )
+#define bli_sminabs( a, b )    bli_smin( bli_sabs( a ), bli_sabs( b ) )
+#define bli_smaxabs( a, b )    bli_smax( bli_sabs( a ), bli_sabs( b ) )
+#define bli_scopysign( a, b )  ( copysignf( a, b ) )
+
+#define bli_dmin( a, b )       ( bli_dlt( (a), (b) ) ? (a) : (b) )  // NOTE: a and b are evaluated more than once
+#define bli_dmax( a, b )       ( bli_dgt( (a), (b) ) ? (a) : (b) )  // NOTE: a and b are evaluated more than once
+//#define bli_dabs( a )          ( bli_dlt( a, PASTEMAC(d,zero) ) ? -(a) : a )
+#define bli_dabs( a )          ( fabs(a) )
+#define bli_dminabs( a, b )    bli_dmin( bli_dabs( a ), bli_dabs( b ) )
+#define bli_dmaxabs( a, b )    bli_dmax( bli_dabs( a ), bli_dabs( b ) )
+#define bli_dcopysign( a, b )  ( copysign( a, b ) )
+
+// -- Infinity/NaN check (per precision) ---------------------------------------
+
+#ifdef BLIS_OPTIMIZE_BFLOAT_AS_FLOAT
+
+#define bli_bisinf( a )        bli_sisinf(            a  )
+#define bli_bisnan( a )        bli_sisnan(            a  )
+
+#else
+
+#define bli_bisinf( a )        bli_sisinf( bli_bscast(a) )
+#define bli_bisnan( a )        bli_sisnan( bli_bscast(a) )
+
+#endif
+
+#define bli_sisinf( a )        isinf( a )
+#define bli_sisnan( a )        isnan( a )
+
+#define bli_disinf( a )        isinf( a )
+#define bli_disnan( a )        isnan( a )
+
+// -- Randomization operations (per precision) ---------------------------------
+
+#define bli_brand              bli_dbcast( bli_rand() )
+#define bli_srand              bli_dscast( bli_rand() )
+#define bli_drand              bli_ddcast( bli_rand() )
+
+// Return a pseudo-random double drawn from the interval [-1.0,1.0].
+BLIS_INLINE double bli_rand( void )
+{
+	const double half_max = ( double ) RAND_MAX / 2.0;
+
+	return ( ( double ) rand() / half_max ) - 1.0;
+}
+
+#define bli_brandnp2           bli_dbcast( bli_randnp2s() )
+#define bli_srandnp2           bli_dscast( bli_randnp2s() )
+#define bli_drandnp2           bli_ddcast( bli_randnp2s() )
+
+// Return 0.0 or a signed power of two from a narrow range, as a double.
+BLIS_INLINE double bli_randnp2s( void )
+{
+	const double m_max  = 6.0;
+	const double m_max2 = m_max + 2.0;
+	double       t;
+	double       r_val;
+
+	// Compute a narrow-range power of two.
+	//
+	// For the purposes of commentary, we'll assume that m_max = 4 (the
+	// code itself uses m_max = 6.0). m_max determines the smallest power
+	// of two, 2^-m_max, that can be generated.
+
+	do
+	{
+		// Generate a random real number t on the interval: [0.0, 6.0].
+		t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2;
+
+		// Transform the interval into the set of integers, {0,1,2,3,4,5}.
+		// Note that 6 is prohibited by the loop guard below.
+		t = floor( t );
+	}
+	// If t is ever equal to m_max2, we re-randomize. The guard against
+	// m_max2 < t is for sanity and shouldn't happen, unless perhaps there
+	// is weirdness in the typecasting to double when computing t above.
+	while ( m_max2 <= t );
+
+	// Map values of t == 0 to a final value of 0.
+	if ( t == 0.0 ) r_val = 0.0;
+	else
+	{
+		// This case handles values of t = {1,2,3,4,5}.
+
+		// Compute r_val = 2^s where s = -(t-1) = {-4,-3,-2,-1,0}.
+		r_val = pow( 2.0, -(t - 1.0) );
+
+		// Draw a second random value; only its sign is used, to pick
+		// the sign of the final result.
+		const double s_val = PASTEMAC(d,rand);
+
+		// If our sign value is negative, our random power of two will
+		// be negative.
+		if ( s_val < 0.0 ) r_val = -r_val;
+	}
+
+	// r_val = 0, or +/-{2^0, ..., 2^-4} (through 2^-6 with m_max = 6.0).
+	return r_val;
+}
+
+
+
+#endif
+
diff --git a/frame/include/bli_complex_macro_defs.h b/frame/include/bli_complex_macro_defs.h
index f9e22ef0a..6b2e4a27e 100644
--- a/frame/include/bli_complex_macro_defs.h
+++ b/frame/include/bli_complex_macro_defs.h
@@ -43,6 +43,8 @@
 #define bli_simag( x )  ( 0.0F )
 #define bli_dreal( x )  ( x )
 #define bli_dimag( x )  ( 0.0 )
+#define bli_ireal( x )  ( x )
+#define bli_iimag( x )  ( 0 )
 
 
 #if defined(__cplusplus) && defined(BLIS_ENABLE_STD_COMPLEX)
diff --git a/frame/include/bli_edge_case_macro_defs.h b/frame/include/bli_edge_case_macro_defs.h
index ad72e7514..5bac3081a 100644
--- a/frame/include/bli_edge_case_macro_defs.h
+++ b/frame/include/bli_edge_case_macro_defs.h
@@ -56,7 +56,7 @@
 #define GEMM_UKR_SETUP_CT_POST(ch) \
 \
 	PASTEMAC(ch,ctype) _zero; \
-	PASTEMAC(ch,set0s)( _zero ); \
+	bli_tset0s( ch, _zero ); \
 	\
 	if ( _use_ct ) \
 	{ \
@@ -117,8 +117,9 @@
 	   microtile. */ \
 	if ( _use_ct ) \
 	{ \
-		PASTEMAC(ch,xpbys_mxn) \
+		bli_txpbys_mxn \
 		( \
+		  ch,ch,ch,ch, \
 		  m, n, \
 		  _ct, _rs_ct, _cs_ct, \
 		  _beta, \
@@ -204,8 +205,9 @@
 	   output microtile. Used by trsm. */ \
 	if ( _use_ct ) \
 	{ \
-		PASTEMAC(ch,copys_mxn) \
+		PASTEMAC(t,copys_mxn) \
 		( \
+		  ch,ch, \
 		  m, n, \
 		  _ct, _rs_ct, _cs_ct, \
 		  _c,  _rs_c,  _cs_c \
diff --git a/frame/include/bli_genarray_macro_defs.h b/frame/include/bli_genarray_macro_defs.h
index 4ec89d948..52e59b658 100644
--- a/frame/include/bli_genarray_macro_defs.h
+++ b/frame/include/bli_genarray_macro_defs.h
@@ -50,6 +50,16 @@ static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \
 	( tname )PASTEMAC(z,opname)  \
 }
 
+#define GENARRAYRO_FPA(tname,opname) \
+\
+static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \
+{ \
+	( tname )PASTEMAC(s,opname), \
+	NULL, /* c slot is empty; assumes the s,c,d,z order of GENARRAY_FPA */ \
+	( tname )PASTEMAC(d,opname), \
+	NULL  /* z slot is empty: real-domain-only table */ \
+}
+
 // -- "Smart" one-operand macro (with integer support) --
 
 #define GENARRAY_FPA_I(tname,opname) \
diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h
index 8074bb441..aeafd3755 100644
--- a/frame/include/bli_gentfunc_macro_defs.h
+++ b/frame/include/bli_gentfunc_macro_defs.h
@@ -130,12 +130,12 @@ GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )
 
 #define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
 \
-GENTFUNCSCAL( float,    float,    s,  , blasname, blisname ) \
-GENTFUNCSCAL( double,   double,   d,  , blasname, blisname ) \
-GENTFUNCSCAL( scomplex, scomplex, c,  , blasname, blisname ) \
-GENTFUNCSCAL( dcomplex, dcomplex, z,  , blasname, blisname ) \
-GENTFUNCSCAL( scomplex, float,    c, s, blasname, blisname ) \
-GENTFUNCSCAL( dcomplex, double,   z, d, blasname, blisname )
+GENTFUNCSCAL( float,    float,    s,  , s, blasname, blisname ) \
+GENTFUNCSCAL( double,   double,   d,  , d, blasname, blisname ) \
+GENTFUNCSCAL( scomplex, scomplex, c,  , c, blasname, blisname ) \
+GENTFUNCSCAL( dcomplex, dcomplex, z,  , z, blasname, blisname ) \
+GENTFUNCSCAL( scomplex, float,    c, s, s, blasname, blisname ) \
+GENTFUNCSCAL( dcomplex, double,   z, d, d, blasname, blisname )
 
 
diff --git a/frame/include/bli_gentprot_macro_defs.h b/frame/include/bli_gentprot_macro_defs.h
index e733e4800..e661e5235 100644
--- a/frame/include/bli_gentprot_macro_defs.h
+++ b/frame/include/bli_gentprot_macro_defs.h
@@ -165,6 +165,15 @@ GENTPROTR( dcomplex, double, z, d, __VA_ARGS__ )
 
 
+// -- Basic one-operand macro with real domain only (instantiates s and d) --
+
+#define INSERT_GENTPROTRO_BASIC( ... ) \
+\
+GENTPROTRO( float,  s, __VA_ARGS__ ) \
+GENTPROTRO( double, d, __VA_ARGS__ )
+
+
+
 // -- Basic one-operand macro with complex domain only and real projection --
 
 #define INSERT_GENTPROTCO_BASIC( ... ) \
diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h
index 8af3f5a26..cd4d619e3 100644
--- a/frame/include/bli_macro_defs.h
+++ b/frame/include/bli_macro_defs.h
@@ -97,6 +97,7 @@
 #include "bli_gentconf_macro_defs.h"
 
 #include "bli_misc_macro_defs.h"
+#include "bli_cast_macro_defs.h"
 #include "bli_edge_case_macro_defs.h"
 #include "bli_param_macro_defs.h"
 #include "bli_complex_macro_defs.h"
diff --git a/frame/include/bli_misc_macro_defs.h b/frame/include/bli_misc_macro_defs.h
index f30880344..c58acc0d1 100644
--- a/frame/include/bli_misc_macro_defs.h
+++ b/frame/include/bli_misc_macro_defs.h
@@ -84,14 +84,6 @@ BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult )
 	       );
 }
 
-// isnan, isinf
-// NOTE: These must remain macros, since isinf() and isnan() are macros
-// (defined in math.h) that likely depend on the type of the argument 'a'
-// below.
-
-#define bli_isinf( a )  isinf( a )
-#define bli_isnan( a )  isnan( a )
-
 // is_odd, is_even
 
 BLIS_INLINE bool bli_is_odd( gint_t a )
@@ -130,27 +122,50 @@ BLIS_INLINE void bli_toggle_bool( bool* b )
 	else              *b = TRUE;
 }
 
-// return datatype for char
+// return datatype for datatype char
 
 #define bli_stype ( BLIS_FLOAT    )
 #define bli_dtype ( BLIS_DOUBLE   )
 #define bli_ctype ( BLIS_SCOMPLEX )
 #define bli_ztype ( BLIS_DCOMPLEX )
 
-// return C type for char
+// return C type for datatype char
 
 #define bli_sctype  float
 #define bli_dctype  double
 #define bli_cctype  scomplex
 #define bli_zctype  dcomplex
 
-// return real proj of C type for char
+// return C type for domain and precision chars
+
+#define bli_rsctype float
+#define bli_rdctype double
+#define bli_csctype scomplex
+#define bli_cdctype dcomplex
+
+// return real proj of C type for datatype char
 
 #define bli_sctyper  float
 #define bli_dctyper  double
 #define bli_cctyper  float
 #define bli_zctyper  double
 
+// return precision component of dt char
+
+#define bli_sprec  s
+#define bli_dprec  d
+#define bli_cprec  s
+#define bli_zprec  d
+#define bli_iprec  i
+
+// return domain component of dt char
+
+#define bli_sdom   r
+#define bli_ddom   r
+#define bli_cdom   c
+#define bli_zdom   c
+#define bli_idom   r
+
 // return whether or not two types are the same
 
 #define bli_sssame 1
diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h
index 2eea517fd..2d4798a10 100644
--- a/frame/include/bli_scalar_macro_defs.h
+++ b/frame/include/bli_scalar_macro_defs.h
@@ -36,6 +36,10 @@
 #define BLIS_SCALAR_MACRO_DEFS_H
 
 
+#include "bli_assigns.h"
+#include "bli_complex_terms.h"
+#include "bli_constants.h"
+#include "bli_declinits.h"
 
 // -- Assignment/Accessor macros --
 
@@ -44,210 +48,42 @@
 // whether fields of a struct are set directly or whether native C99
 // assignment is used).
 
-#include "bli_sets.h"    // sets both real and imaginary components
-
-// NOTE: These macros are not used by other scalar macros, but they are
-// related to those defined in bli_sets.h, and so we #include them here.
-
-#include "bli_setrs.h"   // sets real component only
-#include "bli_setis.h"   // sets imaginary component only
+#include "bli_tsets.h"    // sets both real and imaginary components
 
 // NOTE: This macro also needs to be defined early on since it determines
 // how real and imaginary components are accessed (ie: whether the fields
 // of a struct are read directly or whether native C99 functions are used.)
 
-#include "bli_gets.h"
-
-
-// -- Scalar constant initialization macros --
-
-#include "bli_constants.h"
-
-
-// -- Separated scalar macros (separated real/imaginary values) --
-
-#include "bli_absq2ris.h"
-
-#include "bli_abval2ris.h"
-
-#include "bli_addris.h"
-#include "bli_addjris.h"
-
-#include "bli_add3ris.h"
-
-#include "bli_axpbyris.h"
-#include "bli_axpbyjris.h"
-
-#include "bli_axpyris.h"
-#include "bli_axpyjris.h"
-
-#include "bli_axmyris.h"
-
-#include "bli_conjris.h"
-
-#include "bli_copyris.h"
-#include "bli_copyjris.h"
-#include "bli_copycjris.h"
-
-#include "bli_eqris.h"
-
-#include "bli_invertris.h"
-
-#include "bli_invscalris.h"
-#include "bli_invscaljris.h"
-
-#include "bli_neg2ris.h"
-
-#include "bli_scalris.h"
-#include "bli_scaljris.h"
-#include "bli_scalcjris.h"
-
-#include "bli_scal2ris.h"
-#include "bli_scal2jris.h"
-
-#include "bli_set0ris.h"
-
-#include "bli_sqrt2ris.h"
-
-#include "bli_subris.h"
-#include "bli_subjris.h"
-
-#include "bli_swapris.h"
-
-#include "bli_xpbyris.h"
-#include "bli_xpbyjris.h"
-
-// Inlined scalar macros in loops
-#include "bli_scal2ris_mxn.h"
-#include "bli_scalris_mxn_uplo.h"
-
-
-// -- Conventional scalar macros (paired real/imaginary values) --
-
-#include "bli_absq2s.h"
-
-#include "bli_abval2s.h"
-
-#include "bli_adds.h"
-#include "bli_addjs.h"
-
-#include "bli_add3s.h"
-
-#include "bli_axpbys.h"
-#include "bli_axpbyjs.h"
-
-#include "bli_axpys.h"
-#include "bli_axpyjs.h"
-
-#include "bli_axmys.h"
-
-#include "bli_conjs.h"
-
-#include "bli_copys.h"
-#include "bli_copyjs.h"
-#include "bli_copycjs.h"
-
-#include "bli_copynzs.h"
-#include "bli_copyjnzs.h"
-
-#include "bli_dots.h"
-#include "bli_dotjs.h"
-
-#include "bli_eq.h"
-#include "bli_lt.h"
-#include "bli_lte.h"
-
-#include "bli_fprints.h"
-
-#include "bli_inverts.h"
-
-#include "bli_invscals.h"
-#include "bli_invscaljs.h"
-
-#include "bli_neg2s.h"
-
-#include "bli_rands.h"
-#include "bli_randnp2s.h"
-
-#include "bli_scals.h"
-#include "bli_scaljs.h"
-#include "bli_scalcjs.h"
-
-#include "bli_scal2s.h"
-#include "bli_scal2js.h"
-
-#include "bli_set0s.h"
-
-#include "bli_set1s.h"
-
-#include "bli_seti0s.h"
-
-#include "bli_sqrt2s.h"
-
-#include "bli_subs.h"
-#include "bli_subjs.h"
-
-#include "bli_swaps.h"
-
-#include "bli_xpbys.h"
-#include "bli_xpbyjs.h"
-
-// Inlined scalar macros in loops
-#include "bli_adds_mxn.h"
-#include "bli_adds_mxn_uplo.h"
-#include "bli_set0s_mxn.h"
-#include "bli_set0s_edge.h"
-#include "bli_copys_mxn.h"
-#include "bli_scal2s_mxn.h"
-
-#include "bli_axpbys_mxn.h"
-#include "bli_xpbys_mxn.h"
-#include "bli_xpbys_mxn_uplo.h"
-
-// -- "broadcast B" scalar macros --
-
-#include "bli_bcastbbs_mxn.h"
-#include "bli_scal2bbs_mxn.h"
-#include "bli_set0bbs_mxn.h"
-
-
-// -- 1m-specific scalar macros --
-
-// 1e
-#include "bli_copy1es.h"
-#include "bli_copyj1es.h"
-
-#include "bli_invert1es.h"
-
-#include "bli_scal1es.h"
-
-#include "bli_scal21es.h"
-#include "bli_scal2j1es.h"
-
-// 1r
-#include "bli_copy1rs.h"
-#include "bli_copyj1rs.h"
-
-#include "bli_invert1rs.h"
-
-#include "bli_scal1rs.h"
-
-#include "bli_scal21rs.h"
-#include "bli_scal2j1rs.h"
-
-// 1m (1e or 1r)
-#include "bli_invert1ms_mxn_diag.h"
-
-#include "bli_scal1ms_mxn.h"
-
-#include "bli_scal21ms_mxn.h"
-#include "bli_scal21ms_mxn_diag.h"
-#include "bli_scal21ms_mxn_uplo.h"
-
-#include "bli_set1ms_mxn.h"
-#include "bli_set1ms_mxn_diag.h"
-#include "bli_set1ms_mxn_uplo.h"
-#include "bli_seti01ms_mxn_diag.h"
+#include "bli_tgets.h"
+
+// -- Scalar macros --
+
+#include "bli_tabsq2s.h"
+#include "bli_tabval2s.h"
+#include "bli_tadd3s.h"
+#include "bli_tadds.h"
+#include "bli_taxpbys.h"
+#include "bli_taxpys.h"
+#include "bli_tconjs.h"
+#include "bli_tcopycjs.h"
+#include "bli_tcopynzs.h"
+#include "bli_tcopys.h"
+#include "bli_tdots.h"
+#include "bli_teqs.h"
+#include "bli_tfprints.h"
+#include "bli_tinverts.h"
+#include "bli_tinvscals.h"
+#include "bli_tneg2s.h"
+#include "bli_trandnp2s.h"
+#include "bli_trands.h"
+#include "bli_tscalcjs.h"
+#include "bli_tscal2s.h"
+#include "bli_tscals.h"
+#include "bli_tsets.h"
+#include "bli_tsqrt2s.h"
+#include "bli_tsubs.h"
+#include "bli_tswaps.h"
+#include "bli_txpbys.h"
 
 
 #endif
diff --git a/frame/include/level0/1e/bli_copy1es.h b/frame/include/level0/1e/bli_copy1es.h
deleted file mode 100644
index 7dc6a493a..000000000
--- a/frame/include/level0/1e/bli_copy1es.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_COPY1ES_H
-#define BLIS_COPY1ES_H
-
-// copy1es
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-#define bli_sscopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_dscopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_cscopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_zscopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-
-#define bli_sdcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_ddcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_cdcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_zdcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-
-#define bli_sccopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_dccopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_cccopy1es( a, bri, bir ) \
-{ \
-	bli_cccopyris(  bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
-	bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
-}
-#define bli_zccopy1es( a, bri, bir ) \
-{ \
-	bli_zccopyris(  bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
-	bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
-}
-
-#define bli_szcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_dzcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_czcopy1es( a, bri, bir ) \
-{ \
-	bli_czcopyris(  bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
-	bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
-}
-#define bli_zzcopy1es( a, bri, bir ) \
-{ \
-	bli_zzcopyris(  bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
-	bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
-}
-
-
-#define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir )
-#define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir )
-
-#endif
-
diff --git a/frame/include/level0/1e/bli_copyj1es.h b/frame/include/level0/1e/bli_copyj1es.h
deleted file mode 100644
index 25bb19d5b..000000000
--- a/frame/include/level0/1e/bli_copyj1es.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_COPYJ1ES_H
-#define BLIS_COPYJ1ES_H
-
-// copyj1es
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-#define bli_sscopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_dscopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_cscopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_zscopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-
-#define bli_sdcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_ddcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_cdcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_zdcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-
-#define bli_sccopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_dccopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_cccopyj1es( a, bri, bir ) \
-{ \
-	bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
-	bli_cccopyris( bli_cimag(a),  bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
-}
-#define bli_zccopyj1es( a, bri, bir ) \
-{ \
-	bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
-	bli_zccopyris( bli_zimag(a),  bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
-}
-
-#define bli_szcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_dzcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; }
-#define bli_czcopyj1es( a, bri, bir ) \
-{ \
-	bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
-	bli_czcopyris( bli_cimag(a),  bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
-}
-#define bli_zzcopyj1es( a, bri, bir ) \
-{ \
-	bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
-	bli_zzcopyris( bli_zimag(a),  bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
-}
-
-
-#define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir )
-#define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir )
-
-#endif
-
diff --git a/frame/include/level0/1e/bli_invert1es.h b/frame/include/level0/1e/bli_invert1es.h
deleted file mode 100644
index b45c3ca1f..000000000
--- a/frame/include/level0/1e/bli_invert1es.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_INVERT1ES_H
-#define BLIS_INVERT1ES_H
-
-// invert1es
-
-#define bli_cinvert1es( bri, bir ) \
-{ \
-	bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \
-	bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \
-}
-
-#define bli_zinvert1es( bri, bir ) \
-{ \
-	bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \
-	bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \
-}
-
-#endif
-
diff --git a/frame/include/level0/1e/bli_scal1es.h b/frame/include/level0/1e/bli_scal1es.h
deleted file mode 100644
index 485a8ae64..000000000
--- a/frame/include/level0/1e/bli_scal1es.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL1ES_H
-#define BLIS_SCAL1ES_H
-
-// scal1es
-
-#define bli_cscal1es( a, yri, yir ) \
-{ \
-	bli_cscalris(  bli_creal(a),   bli_cimag(a),   bli_creal(yri), bli_cimag(yri) ); \
-	bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \
-}
-
-#define bli_zscal1es( a, yri, yir ) \
-{ \
-	bli_zscalris(  bli_zreal(a),   bli_zimag(a),   bli_zreal(yri), bli_zimag(yri) ); \
-	bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \
-}
-
-#endif
-
diff --git a/frame/include/level0/1e/bli_scal21es.h b/frame/include/level0/1e/bli_scal21es.h
deleted file mode 100644
index 1cce97399..000000000
--- a/frame/include/level0/1e/bli_scal21es.h
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL21ES_H
-#define BLIS_SCAL21ES_H
-
-// scal21es
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of y.
-
-// -- (axy) = (??s) ------------------------------------------------------------
-
-#define bli_sssscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_sdsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_scsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_szsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-
-#define bli_dssscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_ddsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_dcsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_dzsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-
-#define bli_cssscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_cdsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_ccsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_czsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-
-#define bli_zssscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_zdsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_zcsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_zzsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-
-// -- (axy) = (??d) ------------------------------------------------------------
-
-#define bli_ssdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_sddscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_scdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_szdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-
-#define bli_dsdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_dddscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_dcdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_dzdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-
-#define bli_csdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_cddscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_ccdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_czdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-
-#define bli_zsdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_zddscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_zcdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_zzdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_sdcscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_sccscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_creal(x), bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-#define bli_szcscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_zreal(x), bli_zimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-
-#define bli_dscscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_ddcscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_dccscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_creal(x), bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-#define bli_dzcscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_zreal(x), bli_zimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-
-#define bli_cscscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_sreal(x), bli_simag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-#define bli_cdcscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_dreal(x), bli_dimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-#define bli_cccscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_creal(x), bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-#define bli_czcscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_zreal(x), bli_zimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-
-#define bli_zscscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_sreal(x), bli_simag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-#define bli_zdcscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_dreal(x), bli_dimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-#define bli_zccscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_creal(x), bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-#define bli_zzcscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_zreal(x), bli_zimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_sdzscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_sczscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-#define bli_szzscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-
-#define bli_dszscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_ddzscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_dczscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-#define bli_dzzscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-
-#define bli_cszscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-#define bli_cdzscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-#define bli_cczscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-#define bli_czzscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-
-#define bli_zszscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-#define bli_zdzscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-#define bli_zczscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-#define bli_zzzscal21es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-
-
-
-#define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir )
-#define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir )
-
-#endif
-
diff --git a/frame/include/level0/1e/bli_scal2j1es.h b/frame/include/level0/1e/bli_scal2j1es.h
deleted file mode 100644
index d868f6fb7..000000000
--- a/frame/include/level0/1e/bli_scal2j1es.h
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2J1ES_H
-#define BLIS_SCAL2J1ES_H
-
-// scal2j1es
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of y.
-
-// -- (axy) = (??s) ------------------------------------------------------------
-
-#define bli_sssscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_sdsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_scsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_szsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-
-#define bli_dssscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_ddsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_dcsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_dzsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-
-#define bli_cssscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_cdsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_ccsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_czsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-
-#define bli_zssscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_zdsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_zcsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_zzsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-
-// -- (axy) = (??d) ------------------------------------------------------------
-
-#define bli_ssdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_sddscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_scdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_szdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-
-#define bli_dsdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_dddscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_dcdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_dzdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-
-#define bli_csdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_cddscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_ccdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_czdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-
-#define bli_zsdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_zddscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_zcdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_zzdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_sdcscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_sccscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x),  bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-#define bli_szcscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x),  bli_zreal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-
-#define bli_dscscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_ddcscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_dccscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x),  bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-#define bli_dzcscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x),  bli_zreal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-
-#define bli_cscscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x),  bli_sreal(x), bli_creal(yir), bli_zimag(yir) ); \
-}
-#define bli_cdcscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x),  bli_dreal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-#define bli_cccscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x),  bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-#define bli_czcscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x),  bli_zreal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-
-#define bli_zscscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x),  bli_sreal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-#define bli_zdcscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x),  bli_dreal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-#define bli_zccscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x),  bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-#define bli_zzcscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_creal(yri), bli_cimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x),  bli_zreal(x), bli_creal(yir), bli_cimag(yir) ); \
-}
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_sdzscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_sczscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x),  bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-#define bli_szzscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x),  bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-
-#define bli_dszscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_ddzscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; }
-#define bli_dczscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x),  bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-#define bli_dzzscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x),  bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-
-#define bli_cszscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x),  bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-#define bli_cdzscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x),  bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-#define bli_cczscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x),  bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-#define bli_czzscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x),  bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-
-#define bli_zszscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x),  bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-#define bli_zdzscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x),  bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-#define bli_zczscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x),  bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-#define bli_zzzscal2j1es( a, x, yri, yir ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x),  bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
-}
-
-
-
-#define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir )
-#define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir )
-
-#endif
-
diff --git a/frame/include/level0/1m/bli_invert1ms_mxn_diag.h b/frame/include/level0/1m/bli_invert1ms_mxn_diag.h
deleted file mode 100644
index dfdeb2293..000000000
--- a/frame/include/level0/1m/bli_invert1ms_mxn_diag.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_INVERT1MS_MXN_DIAG_H
-#define BLIS_INVERT1MS_MXN_DIAG_H
-
-// invert1ms_mxn_diag
-
-#define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \
-{ \
-	dim_t min_m_n = bli_min( m, n ); \
-	dim_t i; \
-\
-	/* Handle 1e and 1r separately. */ \
-	if ( bli_is_1e_packed( schema ) ) \
-	{ \
-		scomplex* restrict y_off_ri = y + (offm  )*rs_y \
-		                                + (offn  )*cs_y; \
-		scomplex* restrict y_off_ir = y + (offm  )*rs_y \
-		                                + (offn  )*cs_y + ld_y/2; \
-\
-		for ( i = 0; i < min_m_n; ++i ) \
-		{ \
-			bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \
-			                *(y_off_ir + i*rs_y + i*cs_y) ); \
-		} \
-	} \
-	else /* if ( bli_is_1r_packed( schema ) ) */ \
-	{ \
-		inc_t rs_y2 = rs_y; \
-		inc_t cs_y2 = cs_y; \
-\
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */ \
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
-\
-		float*    restrict y_cast  = ( float* )y; \
-		float*    restrict y_off_r = y_cast + (offm  )*rs_y2 \
-		                                    + (offn  )*cs_y2; \
-		float*    restrict y_off_i = y_cast + (offm  )*rs_y2 \
-		                                    + (offn  )*cs_y2 + ld_y; \
-\
-		for ( i = 0; i < min_m_n; ++i ) \
-		{ \
-			bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \
-			                *(y_off_i + i*rs_y2 + i*cs_y2) ); \
-		} \
-	} \
-}
-
-#define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \
-{ \
-	dim_t min_m_n = bli_min( m, n ); \
-	dim_t i; \
-\
-	/* Handle 1e and 1r separately. */ \
-	if ( bli_is_1e_packed( schema ) ) \
-	{ \
-		dcomplex* restrict y_off_ri = y + (offm  )*rs_y \
-		                                + (offn  )*cs_y; \
-		dcomplex* restrict y_off_ir = y + (offm  )*rs_y \
-		                                + (offn  )*cs_y + ld_y/2; \
-\
-		for ( i = 0; i < min_m_n; ++i ) \
-		{ \
-			bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \
-			                *(y_off_ir + i*rs_y + i*cs_y) ); \
-		} \
-	} \
-	else /* if ( bli_is_1r_packed( schema ) ) */ \
-	{ \
-		inc_t rs_y2 = rs_y; \
-		inc_t cs_y2 = cs_y; \
-\
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */ \
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
-\
-		double*   restrict y_cast  = ( double* )y; \
-		double*   restrict y_off_r = y_cast + (offm  )*rs_y2 \
-		                                    + (offn  )*cs_y2; \
-		double*   restrict y_off_i = y_cast + (offm  )*rs_y2 \
-		                                    + (offn  )*cs_y2 + ld_y; \
-\
-		for ( i = 0; i < min_m_n; ++i ) \
-		{ \
-			bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \
-			                *(y_off_i + i*rs_y2 + i*cs_y2) ); \
-		} \
-	} \
-}
-
-#endif
diff --git a/frame/include/level0/1m/bli_scal1ms_mxn.h b/frame/include/level0/1m/bli_scal1ms_mxn.h
deleted file mode 100644
index 7d845576d..000000000
--- a/frame/include/level0/1m/bli_scal1ms_mxn.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL1MS_MXN_H
-#define BLIS_SCAL1MS_MXN_H
-
-// scal1ms_mxn
-
-#define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \
-{ \
-	dim_t i, j; \
-\
-	/* Handle 1e and 1r separately. */ \
-	if ( bli_is_1e_packed( schema ) ) \
-	{ \
-		scomplex* restrict y_ri = y; \
-		scomplex* restrict y_ir = y + ld_y/2; \
-\
-		for ( j = 0; j < n; ++j ) \
-		for ( i = 0; i < m; ++i ) \
-		{ \
-			bli_cscal1es( *(a), \
-			              *(y_ri + i*rs_y + j*cs_y), \
-			              *(y_ir + i*rs_y + j*cs_y) ); \
-		} \
-	} \
-	else /* if ( bli_is_1r_packed( schema ) ) */ \
-	{ \
-		inc_t rs_y2 = rs_y; \
-		inc_t cs_y2 = cs_y; \
-\
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */ \
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
-\
-		float*    restrict y_cast = ( float* )y; \
-		float*    restrict y_r    = y_cast; \
-		float*    restrict y_i    = y_cast + ld_y; \
-\
-		for ( j = 0; j < n; ++j ) \
-		for ( i = 0; i < m; ++i ) \
-		{ \
-			bli_cscal1rs( *(a), \
-			              *(y_r + i*rs_y2 + j*cs_y2), \
-			              *(y_i + i*rs_y2 + j*cs_y2) ); \
-		} \
-	} \
-}
-
-#define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \
-{ \
-	dim_t i, j; \
-\
-	/* Handle 1e and 1r separately. */ \
-	if ( bli_is_1e_packed( schema ) ) \
-	{ \
-		dcomplex* restrict y_ri = y; \
-		dcomplex* restrict y_ir = y + ld_y/2; \
-\
-		for ( j = 0; j < n; ++j ) \
-		for ( i = 0; i < m; ++i ) \
-		{ \
-			bli_zscal1es( *(a), \
-			              *(y_ri + i*rs_y + j*cs_y), \
-			              *(y_ir + i*rs_y + j*cs_y) ); \
-		} \
-	} \
-	else /* if ( bli_is_1r_packed( schema ) ) */ \
-	{ \
-		inc_t rs_y2 = rs_y; \
-		inc_t cs_y2 = cs_y; \
-\
-		/* Scale the non-unit stride by two for the 1r loop,
-		   which steps in units of real (not complex) values. */ \
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
-\
-		double*   restrict y_cast = ( double* )y; \
-		double*   restrict y_r    = y_cast; \
-		double*   restrict y_i    = y_cast + ld_y; \
-\
-		for ( j = 0; j < n; ++j ) \
-		for ( i = 0; i < m; ++i ) \
-		{ \
-			bli_zscal1rs( *(a), \
-			              *(y_r + i*rs_y2 + j*cs_y2), \
-			              *(y_i + i*rs_y2 + j*cs_y2) ); \
-		} \
-	} \
-}
-
-#endif
diff --git a/frame/include/level0/1m/bli_scal21ms_mxn.h b/frame/include/level0/1m/bli_scal21ms_mxn.h
deleted file mode 100644
index 9a824fbd5..000000000
--- a/frame/include/level0/1m/bli_scal21ms_mxn.h
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL21MS_MXN_H
-#define BLIS_SCAL21MS_MXN_H
-
-// scal21ms_mxn
-
-BLIS_INLINE void bli_cscal21ms_mxn
-     (
-       const pack_t       schema,
-       const conj_t       conjx,
-       const dim_t        m,
-       const dim_t        n,
-       scomplex* restrict alpha,
-       scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y
-     )
-{
-	dim_t i, j;
-
-	/* Handle 1e and 1r separately. */
-	if ( bli_is_1e_packed( schema ) )
-	{
-		scomplex* restrict y_ri = y;
-		scomplex* restrict y_ir = y + ld_y/2;
-
-		if ( bli_is_conj( conjx ) )
-		{
-			for ( j = 0; j < n; ++j )
-			for ( i = 0; i < m; ++i )
-			{
-				bli_cscal2j1es( *(alpha),
-				                *(x    + i*rs_x + j*cs_x),
-				                *(y_ri + i*rs_y + j*cs_y),
-				                *(y_ir + i*rs_y + j*cs_y) );
-			}
-		}
-		else /* if ( bli_is_noconj( conjx ) ) */
-		{
-			for ( j = 0; j < n; ++j )
-			for ( i = 0; i < m; ++i )
-			{
-				bli_cscal21es( *(alpha),
-				               *(x    + i*rs_x + j*cs_x),
-				               *(y_ri + i*rs_y + j*cs_y),
-				               *(y_ir + i*rs_y + j*cs_y) );
-			}
-		}
-	}
-	else /* if ( bli_is_1r_packed( schema ) ) */
-	{
-		inc_t rs_y2 = rs_y;
-		inc_t cs_y2 = cs_y;
-
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; }
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; }
-
-		float*  restrict y_cast = ( float* )y;
-		float*  restrict y_r    = y_cast;
-		float*  restrict y_i    = y_cast + ld_y;
-
-		if ( bli_is_conj( conjx ) )
-		{
-			for ( j = 0; j < n; ++j )
-			for ( i = 0; i < m; ++i )
-			{
-				bli_cscal2j1rs( *(alpha),
-				                *(x   + i*rs_x  + j*cs_x ),
-				                *(y_r + i*rs_y2 + j*cs_y2),
-				                *(y_i + i*rs_y2 + j*cs_y2) );
-			}
-		}
-		else /* if ( bli_is_noconj( conjx ) ) */
-		{
-			for ( j = 0; j < n; ++j )
-			for ( i = 0; i < m; ++i )
-			{
-				bli_cscal21rs( *(alpha),
-				               *(x   + i*rs_x  + j*cs_x ),
-				               *(y_r + i*rs_y2 + j*cs_y2),
-				               *(y_i + i*rs_y2 + j*cs_y2) );
-			}
-		}
-	}
-}
-
-BLIS_INLINE void bli_zscal21ms_mxn
-     (
-       const pack_t       schema,
-       const conj_t       conjx,
-       const dim_t        m,
-       const dim_t        n,
-       dcomplex* restrict alpha,
-       dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y
-     )
-{
-	dim_t i, j;
-
-	/* Handle 1e and 1r separately. */
-	if ( bli_is_1e_packed( schema ) )
-	{
-		dcomplex* restrict y_ri = y;
-		dcomplex* restrict y_ir = y + ld_y/2;
-
-		if ( bli_is_conj( conjx ) )
-		{
-			for ( j = 0; j < n; ++j )
-			for ( i = 0; i < m; ++i )
-			{
-				bli_zscal2j1es( *(alpha),
-				                *(x    + i*rs_x + j*cs_x),
-				                *(y_ri + i*rs_y + j*cs_y),
-				                *(y_ir + i*rs_y + j*cs_y) );
-			}
-		}
-		else /* if ( bli_is_noconj( conjx ) ) */
-		{
-			for ( j = 0; j < n; ++j )
-			for ( i = 0; i < m; ++i )
-			{
-				bli_zscal21es( *(alpha),
-				               *(x    + i*rs_x + j*cs_x),
-				               *(y_ri + i*rs_y + j*cs_y),
-				               *(y_ir + i*rs_y + j*cs_y) );
-			}
-		}
-	}
-	else /* if ( bli_is_1r_packed( schema ) ) */
-	{
-		inc_t rs_y2 = rs_y;
-		inc_t cs_y2 = cs_y;
-
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; }
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; }
-
-		double* restrict y_cast = ( double* )y;
-		double* restrict y_r    = y_cast;
-		double* restrict y_i    = y_cast + ld_y;
-
-		if ( bli_is_conj( conjx ) )
-		{
-			for ( j = 0; j < n; ++j )
-			for ( i = 0; i < m; ++i )
-			{
-				bli_zscal2j1rs( *(alpha),
-				                *(x   + i*rs_x  + j*cs_x ),
-				                *(y_r + i*rs_y2 + j*cs_y2),
-				                *(y_i + i*rs_y2 + j*cs_y2) );
-			}
-		}
-		else /* if ( bli_is_noconj( conjx ) ) */
-		{
-			for ( j = 0; j < n; ++j )
-			for ( i = 0; i < m; ++i )
-			{
-				bli_zscal21rs( *(alpha),
-				               *(x   + i*rs_x  + j*cs_x ),
-				               *(y_r + i*rs_y2 + j*cs_y2),
-				               *(y_i + i*rs_y2 + j*cs_y2) );
-			}
-		}
-	}
-}
-
-#endif
diff --git a/frame/include/level0/1m/bli_scal21ms_mxn_diag.h b/frame/include/level0/1m/bli_scal21ms_mxn_diag.h
deleted file mode 100644
index 21074338d..000000000
--- a/frame/include/level0/1m/bli_scal21ms_mxn_diag.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL21MS_MXN_DIAG_H
-#define BLIS_SCAL21MS_MXN_DIAG_H
-
-// scal21ms_mxn_diag
-
-#define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \
-{ \
-	dim_t min_m_n = bli_min( m, n ); \
-	dim_t i; \
-\
-	/* Handle 1e and 1r separately. */ \
-	if ( bli_is_1e_packed( schema ) ) \
-	{ \
-		scomplex* restrict y_off_ri = y; \
-		scomplex* restrict y_off_ir = y + ld_y/2; \
-\
-		for ( i = 0; i < min_m_n; ++i ) \
-		{ \
-			bli_cscscal21es( *(a), \
-			                 *(x        + i*rs_x + i*cs_x), \
-			                 *(y_off_ri + i*rs_y + i*cs_y), \
-			                 *(y_off_ir + i*rs_y + i*cs_y) ); \
-		} \
-	} \
-	else /* if ( bli_is_1r_packed( schema ) ) */ \
-	{ \
-		inc_t rs_y2 = rs_y; \
-		inc_t cs_y2 = cs_y; \
-\
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */ \
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
-\
-		float*    restrict y_cast  = ( float* )y; \
-		float*    restrict y_off_r = y_cast; \
-		float*    restrict y_off_i = y_cast + ld_y; \
-\
-		for ( i = 0; i < min_m_n; ++i ) \
-		{ \
-			bli_cscscal21rs( *(a), \
-			                 *(x       + i*rs_x  + i*cs_x ), \
-			                 *(y_off_r + i*rs_y2 + i*cs_y2), \
-			                 *(y_off_i + i*rs_y2 + i*cs_y2) ); \
-		} \
-	} \
-}
-
-#define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \
-{ \
-	dim_t min_m_n = bli_min( m, n ); \
-	dim_t i; \
-\
-	/* Handle 1e and 1r separately. */ \
-	if ( bli_is_1e_packed( schema ) ) \
-	{ \
-		dcomplex* restrict y_off_ri = y; \
-		dcomplex* restrict y_off_ir = y + ld_y/2; \
-\
-		for ( i = 0; i < min_m_n; ++i ) \
-		{ \
-			bli_zdzscal21es( *(a), \
-			                 *(x        + i*rs_x + i*cs_x), \
-			                 *(y_off_ri + i*rs_y + i*cs_y), \
-			                 *(y_off_ir + i*rs_y + i*cs_y) ); \
-		} \
-	} \
-	else /* if ( bli_is_1r_packed( schema ) ) */ \
-	{ \
-		inc_t rs_y2 = rs_y; \
-		inc_t cs_y2 = cs_y; \
-\
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */ \
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
-\
-		double*   restrict y_cast  = ( double* )y; \
-		double*   restrict y_off_r = y_cast; \
-		double*   restrict y_off_i = y_cast + ld_y; \
-\
-		for ( i = 0; i < min_m_n; ++i ) \
-		{ \
-			bli_zdzscal21rs( *(a), \
-			                 *(x       + i*rs_x  + i*cs_x ), \
-			                 *(y_off_r + i*rs_y2 + i*cs_y2), \
-			                 *(y_off_i + i*rs_y2 + i*cs_y2) ); \
-		} \
-	} \
-}
-
-#endif
diff --git a/frame/include/level0/1m/bli_scal21ms_mxn_uplo.h b/frame/include/level0/1m/bli_scal21ms_mxn_uplo.h
deleted file mode 100644
index a41d3e57f..000000000
--- a/frame/include/level0/1m/bli_scal21ms_mxn_uplo.h
+++ /dev/null
@@ -1,296 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL21MS_MXN_UPLO_H
-#define BLIS_SCAL21MS_MXN_UPLO_H
-
-// scal21ms_mxn_uplo
-
-#define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \
-{ \
-	dim_t i, j; \
-\
-	/* Handle 1e and 1r separately. */ \
-	if ( bli_is_1e_packed( schema ) ) \
-	{ \
-		scomplex* restrict y_ri = y; \
-		scomplex* restrict y_ir = y + ld_y/2; \
-\
-		if ( bli_is_lower( uplo ) ) \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( j = 0; j < m; ++j ) \
-				for ( i = j; i < m; ++i ) \
-				{ \
-					bli_cscal2j1es( *(a), \
-					                *(x    + i*rs_x + j*cs_x), \
-					                *(y_ri + i*rs_y + j*cs_y), \
-					                *(y_ir + i*rs_y + j*cs_y) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( j = 0; j < m; ++j ) \
-				for ( i = j; i < m; ++i ) \
-				{ \
-					bli_cscal21es( *(a), \
-					               *(x    + i*rs_x + j*cs_x), \
-					               *(y_ri + i*rs_y + j*cs_y), \
-					               *(y_ir + i*rs_y + j*cs_y) ); \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_upper( uplo ) ) */ \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( j = 0; j < m; ++j ) \
-				for ( i = 0; i < j + 1; ++i ) \
-				{ \
-					bli_cscal2j1es( *(a), \
-					                *(x    + i*rs_x + j*cs_x), \
-					                *(y_ri + i*rs_y + j*cs_y), \
-					                *(y_ir + i*rs_y + j*cs_y) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( j = 0; j < m; ++j ) \
-				for ( i = 0; i < j + 1; ++i ) \
-				{ \
-					bli_cscal21es( *(a), \
-					               *(x    + i*rs_x + j*cs_x), \
-					               *(y_ri + i*rs_y + j*cs_y), \
-					               *(y_ir + i*rs_y + j*cs_y) ); \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( bli_is_1r_packed( schema ) ) */ \
-	{ \
-		inc_t rs_y2 = rs_y; \
-		inc_t cs_y2 = cs_y; \
-\
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */ \
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
-\
-		float*    restrict y_cast = ( float* )y; \
-		float*    restrict y_r    = y_cast; \
-		float*    restrict y_i    = y_cast + ld_y; \
-\
-		if ( bli_is_lower( uplo ) ) \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( j = 0; j < m; ++j ) \
-				for ( i = j; i < m; ++i ) \
-				{ \
-					bli_cscal2j1rs( *(a), \
-					                *(x   + i*rs_x  + j*cs_x ), \
-					                *(y_r + i*rs_y2 + j*cs_y2), \
-					                *(y_i + i*rs_y2 + j*cs_y2) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( j = 0; j < m; ++j ) \
-				for ( i = j; i < m; ++i ) \
-				{ \
-					bli_cscal21rs( *(a), \
-					               *(x   + i*rs_x  + j*cs_x ), \
-					               *(y_r + i*rs_y2 + j*cs_y2), \
-					               *(y_i + i*rs_y2 + j*cs_y2) ); \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_upper( uplo ) ) */ \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( j = 0; j < m; ++j ) \
-				for ( i = 0; i < j + 1; ++i ) \
-				{ \
-					bli_cscal2j1rs( *(a), \
-					                *(x   + i*rs_x  + j*cs_x ), \
-					                *(y_r + i*rs_y2 + j*cs_y2), \
-					                *(y_i + i*rs_y2 + j*cs_y2) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( j = 0; j < m; ++j ) \
-				for ( i = 0; i < j + 1; ++i ) \
-				{ \
-					bli_cscal21rs( *(a), \
-					               *(x   + i*rs_x  + j*cs_x ), \
-					               *(y_r + i*rs_y2 + j*cs_y2), \
-					               *(y_i + i*rs_y2 + j*cs_y2) ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-#define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \
-{ \
-	dim_t i, j; \
-\
-	/* Handle 1e and 1r separately. */ \
-	if ( bli_is_1e_packed( schema ) ) \
-	{ \
-		dcomplex* restrict y_ri = y; \
-		dcomplex* restrict y_ir = y + ld_y/2; \
-\
-		if ( bli_is_lower( uplo ) ) \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( j = 0; j < m; ++j ) \
-				for ( i = j; i < m; ++i ) \
-				{ \
-					bli_zscal2j1es( *(a), \
-					                *(x    + i*rs_x + j*cs_x), \
-					                *(y_ri + i*rs_y + j*cs_y), \
-					                *(y_ir + i*rs_y + j*cs_y) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( j = 0; j < m; ++j ) \
-				for ( i = j; i < m; ++i ) \
-				{ \
-					bli_zscal21es( *(a), \
-					               *(x    + i*rs_x + j*cs_x), \
-					               *(y_ri + i*rs_y + j*cs_y), \
-					               *(y_ir + i*rs_y + j*cs_y) ); \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_upper( uplo ) ) */ \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( j = 0; j < m; ++j ) \
-				for ( i = 0; i < j + 1; ++i ) \
-				{ \
-					bli_zscal2j1es( *(a), \
-					                *(x    + i*rs_x + j*cs_x), \
-					                *(y_ri + i*rs_y + j*cs_y), \
-					                *(y_ir + i*rs_y + j*cs_y) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( j = 0; j < m; ++j ) \
-				for ( i = 0; i < j + 1; ++i ) \
-				{ \
-					bli_zscal21es( *(a), \
-					               *(x    + i*rs_x + j*cs_x), \
-					               *(y_ri + i*rs_y + j*cs_y), \
-					               *(y_ir + i*rs_y + j*cs_y) ); \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( bli_is_1r_packed( schema ) ) */ \
-	{ \
-		inc_t rs_y2 = rs_y; \
-		inc_t cs_y2 = cs_y; \
-\
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */ \
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
-\
-		double*   restrict y_cast = ( double* )y; \
-		double*   restrict y_r    = y_cast; \
-		double*   restrict y_i    = y_cast + ld_y; \
-\
-		if ( bli_is_lower( uplo ) ) \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( j = 0; j < m; ++j ) \
-				for ( i = j; i < m; ++i ) \
-				{ \
-					bli_zscal2j1rs( *(a), \
-					                *(x   + i*rs_x  + j*cs_x ), \
-					                *(y_r + i*rs_y2 + j*cs_y2), \
-					                *(y_i + i*rs_y2 + j*cs_y2) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( j = 0; j < m; ++j ) \
-				for ( i = j; i < m; ++i ) \
-				{ \
-					bli_zscal21rs( *(a), \
-					               *(x   + i*rs_x  + j*cs_x ), \
-					               *(y_r + i*rs_y2 + j*cs_y2), \
-					               *(y_i + i*rs_y2 + j*cs_y2) ); \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_upper( uplo ) ) */ \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( j = 0; j < m; ++j ) \
-				for ( i = 0; i < j + 1; ++i ) \
-				{ \
-					bli_zscal2j1rs( *(a), \
-					                *(x   + i*rs_x  + j*cs_x ), \
-					                *(y_r + i*rs_y2 + j*cs_y2), \
-					                *(y_i + i*rs_y2 + j*cs_y2) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( j = 0; j < m; ++j ) \
-				for ( i = 0; i < j + 1; ++i ) \
-				{ \
-					bli_zscal21rs( *(a), \
-					               *(x   + i*rs_x  + j*cs_x ), \
-					               *(y_r + i*rs_y2 + j*cs_y2), \
-					               *(y_i + i*rs_y2 + j*cs_y2) ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-#endif
diff --git a/frame/include/level0/1m/bli_set1ms_mxn.h b/frame/include/level0/1m/bli_set1ms_mxn.h
deleted file mode 100644
index f7d492c23..000000000
--- a/frame/include/level0/1m/bli_set1ms_mxn.h
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SET1MS_MXN_H
-#define BLIS_SET1MS_MXN_H
-
-// set1ms_mxn
-
-#define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \
-{ \
-	/* Include real domain version to facilitate macro-izing mixed-datatype
-	   components of packm. */ \
-}
-
-#define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \
-{ \
-	/* Include real domain version to facilitate macro-izing mixed-datatype
-	   components of packm. */ \
-}
-
-BLIS_INLINE void bli_cset1ms_mxn
-     (
-       const pack_t       schema,
-       const dim_t        offm,
-       const dim_t        offn,
-       const dim_t        m,
-       const dim_t        n,
-       scomplex* restrict alpha,
-       scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y
-     )
-{
-	inc_t offm_local = offm;
-	inc_t offn_local = offn;
-	dim_t m_local    = m;
-	dim_t n_local    = n;
-	inc_t rs_y1      = rs_y;
-	inc_t cs_y1      = cs_y;
-	inc_t rs_y2      = rs_y;
-	inc_t cs_y2      = cs_y;
-	dim_t i, j;
-
-	/* Optimization: The loops walk through y with unit stride if y is
-	   column-stored. If y is row-stored, swap the dimensions and strides
-	   to preserve unit stride movement. */
-	if ( cs_y == 1 )
-	{
-		bli_swap_incs( &offm_local, &offn_local );
-		bli_swap_dims( &m_local, &n_local );
-		bli_swap_incs( &rs_y1, &cs_y1 );
-		bli_swap_incs( &rs_y2, &cs_y2 );
-	}
-
-	/* Handle 1e and 1r separately. */
-	if ( bli_is_1e_packed( schema ) )
-	{
-		scomplex* restrict y_off_ri = y + (offm_local  )*rs_y1
-		                                + (offn_local  )*cs_y1;
-		scomplex* restrict y_off_ir = y + (offm_local  )*rs_y1
-		                                + (offn_local  )*cs_y1 + ld_y/2;
-
-		for ( j = 0; j < n_local; ++j )
-		for ( i = 0; i < m_local; ++i )
-		{
-			bli_ccopy1es( *(alpha),
-			              *(y_off_ri + i*rs_y1 + j*cs_y1),
-			              *(y_off_ir + i*rs_y1 + j*cs_y1) );
-		}
-	}
-	else /* if ( bli_is_1r_packed( schema ) ) */
-	{
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; }
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; }
-
-		float*    restrict y_cast  = ( float* )y;
-		float*    restrict y_off_r = y_cast + (offm_local  )*rs_y2
-		                                    + (offn_local  )*cs_y2;
-		float*    restrict y_off_i = y_cast + (offm_local  )*rs_y2
-		                                    + (offn_local  )*cs_y2 + ld_y;
-
-		for ( j = 0; j < n_local; ++j )
-		for ( i = 0; i < m_local; ++i )
-		{
-			bli_ccopy1rs( *(alpha),
-			              *(y_off_r + i*rs_y2 + j*cs_y2),
-			              *(y_off_i + i*rs_y2 + j*cs_y2) );
-		}
-	}
-}
-
-BLIS_INLINE void bli_zset1ms_mxn
-     (
-       const pack_t       schema,
-       const dim_t        offm,
-       const dim_t        offn,
-       const dim_t        m,
-       const dim_t        n,
-       dcomplex* restrict alpha,
-       dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y
-     )
-{
-	inc_t offm_local = offm;
-	inc_t offn_local = offn;
-	dim_t m_local    = m;
-	dim_t n_local    = n;
-	inc_t rs_y1      = rs_y;
-	inc_t cs_y1      = cs_y;
-	inc_t rs_y2      = rs_y;
-	inc_t cs_y2      = cs_y;
-	dim_t i, j;
-
-	/* Optimization: The loops walk through y with unit stride if y is
-	   column-stored. If y is row-stored, swap the dimensions and strides
-	   to preserve unit stride movement. */
-	if ( cs_y == 1 )
-	{
-		bli_swap_incs( &offm_local, &offn_local );
-		bli_swap_dims( &m_local, &n_local );
-		bli_swap_incs( &rs_y1, &cs_y1 );
-		bli_swap_incs( &rs_y2, &cs_y2 );
-	}
-
-	/* Handle 1e and 1r separately. */
-	if ( bli_is_1e_packed( schema ) )
-	{
-		dcomplex* restrict y_off_ri = y + (offm_local  )*rs_y1
-		                                + (offn_local  )*cs_y1;
-		dcomplex* restrict y_off_ir = y + (offm_local  )*rs_y1
-		                                + (offn_local  )*cs_y1 + ld_y/2;
-
-		for ( j = 0; j < n_local; ++j )
-		for ( i = 0; i < m_local; ++i )
-		{
-			bli_zcopy1es( *(alpha),
-			              *(y_off_ri + i*rs_y1 + j*cs_y1),
-			              *(y_off_ir + i*rs_y1 + j*cs_y1) );
-		}
-	}
-	else /* if ( bli_is_1r_packed( schema ) ) */
-	{
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; }
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; }
-
-		double*   restrict y_cast  = ( double* )y;
-		double*   restrict y_off_r = y_cast + (offm_local  )*rs_y2
-		                                    + (offn_local  )*cs_y2;
-		double*   restrict y_off_i = y_cast + (offm_local  )*rs_y2
-		                                    + (offn_local  )*cs_y2 + ld_y;
-
-		for ( j = 0; j < n_local; ++j )
-		for ( i = 0; i < m_local; ++i )
-		{
-			bli_zcopy1rs( *(alpha),
-			              *(y_off_r + i*rs_y2 + j*cs_y2),
-			              *(y_off_i + i*rs_y2 + j*cs_y2) );
-		}
-	}
-}
-
-#endif
diff --git a/frame/include/level0/1m/bli_set1ms_mxn_diag.h b/frame/include/level0/1m/bli_set1ms_mxn_diag.h
deleted file mode 100644
index 856e47bce..000000000
--- a/frame/include/level0/1m/bli_set1ms_mxn_diag.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SET1MS_MXN_DIAG_H
-#define BLIS_SET1MS_MXN_DIAG_H
-
-// set1ms_mxn_diag
-
-#define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \
-{ \
-	dim_t min_m_n = bli_min( m, n ); \
-	dim_t i; \
-\
-	/* Handle 1e and 1r separately. */ \
-	if ( bli_is_1e_packed( schema ) ) \
-	{ \
-		scomplex* restrict y_off_ri = y + (offm  )*rs_y \
-		                                + (offn  )*cs_y; \
-		scomplex* restrict y_off_ir = y + (offm  )*rs_y \
-		                                + (offn  )*cs_y + ld_y/2; \
-\
-		for ( i = 0; i < min_m_n; ++i ) \
-		{ \
-			bli_ccopy1es( *(a), \
-			              *(y_off_ri + i*rs_y + i*cs_y), \
-			              *(y_off_ir + i*rs_y + i*cs_y) ); \
-		} \
-	} \
-	else /* if ( bli_is_1r_packed( schema ) ) */ \
-	{ \
-		inc_t rs_y2 = rs_y; \
-		inc_t cs_y2 = cs_y; \
-\
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */ \
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
-\
-		float*    restrict y_cast  = ( float* )y; \
-		float*    restrict y_off_r = y_cast + (offm  )*rs_y2 \
-		                                    + (offn  )*cs_y2; \
-		float*    restrict y_off_i = y_cast + (offm  )*rs_y2 \
-		                                    + (offn  )*cs_y2 + ld_y; \
-\
-		for ( i = 0; i < min_m_n; ++i ) \
-		{ \
-			bli_ccopy1rs( *(a), \
-			              *(y_off_r + i*rs_y2 + i*cs_y2), \
-			              *(y_off_i + i*rs_y2 + i*cs_y2) ); \
-		} \
-	} \
-}
-
-#define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \
-{ \
-	dim_t min_m_n = bli_min( m, n ); \
-	dim_t i; \
-\
-	/* Handle 1e and 1r separately. */ \
-	if ( bli_is_1e_packed( schema ) ) \
-	{ \
-		dcomplex* restrict y_off_ri = y + (offm  )*rs_y \
-		                                + (offn  )*cs_y; \
-		dcomplex* restrict y_off_ir = y + (offm  )*rs_y \
-		                                + (offn  )*cs_y + ld_y/2; \
-\
-		for ( i = 0; i < min_m_n; ++i ) \
-		{ \
-			bli_zcopy1es( *(a), \
-			              *(y_off_ri + i*rs_y + i*cs_y), \
-			              *(y_off_ir + i*rs_y + i*cs_y) ); \
-		} \
-	} \
-	else /* if ( bli_is_1r_packed( schema ) ) */ \
-	{ \
-		inc_t rs_y2 = rs_y; \
-		inc_t cs_y2 = cs_y; \
-\
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */ \
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
-\
-		double*   restrict y_cast  = ( double* )y; \
-		double*   restrict y_off_r = y_cast + (offm  )*rs_y2 \
-		                                    + (offn  )*cs_y2; \
-		double*   restrict y_off_i = y_cast + (offm  )*rs_y2 \
-		                                    + (offn  )*cs_y2 + ld_y; \
-\
-		for ( i = 0; i < min_m_n; ++i ) \
-		{ \
-			bli_zcopy1rs( *(a), \
-			              *(y_off_r + i*rs_y2 + i*cs_y2), \
-			              *(y_off_i + i*rs_y2 + i*cs_y2) ); \
-		} \
-	} \
-}
-
-#endif
diff --git a/frame/include/level0/1m/bli_set1ms_mxn_uplo.h b/frame/include/level0/1m/bli_set1ms_mxn_uplo.h
deleted file mode 100644
index d672b9174..000000000
--- a/frame/include/level0/1m/bli_set1ms_mxn_uplo.h
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SET1MS_MXN_UPLO_H
-#define BLIS_SET1MS_MXN_UPLO_H
-
-// set1ms_mxn_uplo
-
-#define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \
-{ \
-	doff_t diagoff_abs = bli_abs( diagoff ); \
-	inc_t  offdiag_inc; \
-	dim_t  i, j; \
-\
-	/* Handle 1e and 1r separately. */ \
-	if ( bli_is_1e_packed( schema ) ) \
-	{ \
-		/* Set the off-diagonal increment. */ \
-		if         ( diagoff > 0 )    offdiag_inc = cs_y; \
-		else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y; \
-\
-		scomplex* restrict y0   = y + (diagoff_abs  )*offdiag_inc; \
-		scomplex* restrict y_ri = y0; \
-		scomplex* restrict y_ir = y0 + ld_y/2; \
-\
-		if ( bli_is_lower( uplo ) ) \
-		{ \
-			for ( j = 0; j < n; ++j ) \
-			for ( i = j; i < m; ++i ) \
-			{ \
-				bli_ccopy1es( *(a), \
-				              *(y_ri + i*rs_y + j*cs_y), \
-				              *(y_ir + i*rs_y + j*cs_y) ); \
-			} \
-		} \
-		else /* if ( bli_is_upper( uplo ) ) */ \
-		{ \
-			for ( j = 0; j < n; ++j ) \
-			for ( i = 0; i < j + 1; ++i ) \
-			{ \
-				bli_ccopy1es( *(a), \
-				              *(y_ri + i*rs_y + j*cs_y), \
-				              *(y_ir + i*rs_y + j*cs_y) ); \
-			} \
-		} \
-	} \
-	else /* if ( bli_is_1r_packed( schema ) ) */ \
-	{ \
-		inc_t rs_y2 = rs_y; \
-		inc_t cs_y2 = cs_y; \
-\
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */ \
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
-\
-		/* Set the off-diagonal increment. */ \
-		if         ( diagoff > 0 )    offdiag_inc = cs_y2; \
-		else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y2; \
-\
-		float*    restrict y0  = ( float* )y + (diagoff_abs  )*offdiag_inc; \
-		float*    restrict y_r = y0; \
-		float*    restrict y_i = y0 + ld_y; \
-\
-		if ( bli_is_lower( uplo ) ) \
-		{ \
-			for ( j = 0; j < n; ++j ) \
-			for ( i = j; i < m; ++i ) \
-			{ \
-				bli_ccopy1rs( *(a), \
-				              *(y_r + i*rs_y2 + j*cs_y2), \
-				              *(y_i + i*rs_y2 + j*cs_y2) ); \
-			} \
-		} \
-		else /* if ( bli_is_upper( uplo ) ) */ \
-		{ \
-			for ( j = 0; j < n; ++j ) \
-			for ( i = 0; i < j + 1; ++i ) \
-			{ \
-				bli_ccopy1rs( *(a), \
-				              *(y_r + i*rs_y2 + j*cs_y2), \
-				              *(y_i + i*rs_y2 + j*cs_y2) ); \
-			} \
-		} \
-	} \
-}
-
-#define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \
-{ \
-	doff_t diagoff_abs = bli_abs( diagoff ); \
-	inc_t  offdiag_inc; \
-	dim_t  i, j; \
-\
-	/* Handle 1e and 1r separately. */ \
-	if ( bli_is_1e_packed( schema ) ) \
-	{ \
-		/* Set the off-diagonal increment. */ \
-		if         ( diagoff > 0 )    offdiag_inc = cs_y; \
-		else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y; \
-\
-		dcomplex* restrict y0   = y + (diagoff_abs  )*offdiag_inc; \
-		dcomplex* restrict y_ri = y0; \
-		dcomplex* restrict y_ir = y0 + ld_y/2; \
-\
-		if ( bli_is_lower( uplo ) ) \
-		{ \
-			for ( j = 0; j < n; ++j ) \
-			for ( i = j; i < m; ++i ) \
-			{ \
-				bli_zcopy1es( *(a), \
-				              *(y_ri + i*rs_y + j*cs_y), \
-				              *(y_ir + i*rs_y + j*cs_y) ); \
-			} \
-		} \
-		else /* if ( bli_is_upper( uplo ) ) */ \
-		{ \
-			for ( j = 0; j < n; ++j ) \
-			for ( i = 0; i < j + 1; ++i ) \
-			{ \
-				bli_zcopy1es( *(a), \
-				              *(y_ri + i*rs_y + j*cs_y), \
-				              *(y_ir + i*rs_y + j*cs_y) ); \
-			} \
-		} \
-	} \
-	else /* if ( bli_is_1r_packed( schema ) ) */ \
-	{ \
-		inc_t rs_y2 = rs_y; \
-		inc_t cs_y2 = cs_y; \
-\
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */ \
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
-\
-		/* Set the off-diagonal increment. */ \
-		if         ( diagoff > 0 )    offdiag_inc = cs_y2; \
-		else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y2; \
-\
-		double*   restrict y0  = ( double* )y + (diagoff_abs  )*offdiag_inc; \
-		double*   restrict y_r = y0; \
-		double*   restrict y_i = y0 + ld_y; \
-\
-		if ( bli_is_lower( uplo ) ) \
-		{ \
-			for ( j = 0; j < n; ++j ) \
-			for ( i = j; i < m; ++i ) \
-			{ \
-				bli_zcopy1rs( *(a), \
-				              *(y_r + i*rs_y2 + j*cs_y2), \
-				              *(y_i + i*rs_y2 + j*cs_y2) ); \
-			} \
-		} \
-		else /* if ( bli_is_upper( uplo ) ) */ \
-		{ \
-			for ( j = 0; j < n; ++j ) \
-			for ( i = 0; i < j + 1; ++i ) \
-			{ \
-				bli_zcopy1rs( *(a), \
-				              *(y_r + i*rs_y2 + j*cs_y2), \
-				              *(y_i + i*rs_y2 + j*cs_y2) ); \
-			} \
-		} \
-	} \
-}
-
-#endif
diff --git a/frame/include/level0/1m/bli_seti01ms_mxn_diag.h b/frame/include/level0/1m/bli_seti01ms_mxn_diag.h
deleted file mode 100644
index dd8bf7a3b..000000000
--- a/frame/include/level0/1m/bli_seti01ms_mxn_diag.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SETI01MS_MXN_DIAG_H
-#define BLIS_SETI01MS_MXN_DIAG_H
-
-// seti01ms_mxn_diag
-
-#define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \
-{ \
-	dim_t min_m_n = bli_min( m, n ); \
-	dim_t i; \
-\
-	/* Handle 1e and 1r separately. */ \
-	if ( bli_is_1e_packed( schema ) ) \
-	{ \
-		scomplex* restrict y_off_ri = y; \
-		scomplex* restrict y_off_ir = y + ld_y/2; \
-\
-		for ( i = 0; i < min_m_n; ++i ) \
-		{ \
-			bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \
-			bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \
-		} \
-	} \
-	else /* if ( bli_is_1r_packed( schema ) ) */ \
-	{ \
-		inc_t rs_y2 = rs_y; \
-		inc_t cs_y2 = cs_y; \
-\
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */ \
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
-\
-		float*    restrict y_cast  = ( float* )y; \
-		float*    restrict y_off_i = y_cast + ld_y; \
-\
-		for ( i = 0; i < min_m_n; ++i ) \
-		{ \
-			bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \
-		} \
-	} \
-}
-
-#define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \
-{ \
-	dim_t min_m_n = bli_min( m, n ); \
-	dim_t i; \
-\
-	/* Handle 1e and 1r separately. */ \
-	if ( bli_is_1e_packed( schema ) ) \
-	{ \
-		dcomplex* restrict y_off_ri = y; \
-		dcomplex* restrict y_off_ir = y + ld_y/2; \
-\
-		for ( i = 0; i < min_m_n; ++i ) \
-		{ \
-			bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \
-			bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \
-		} \
-	} \
-	else /* if ( bli_is_1r_packed( schema ) ) */ \
-	{ \
-		inc_t rs_y2 = rs_y; \
-		inc_t cs_y2 = cs_y; \
-\
-		/* Scale the non-unit stride by two for the 1r loop, which steps
-		   in units of real (not complex) values. */ \
-		if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
-		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
-\
-		double*   restrict y_cast  = ( double* )y; \
-		double*   restrict y_off_i = y_cast + ld_y; \
-\
-		for ( i = 0; i < min_m_n; ++i ) \
-		{ \
-			bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \
-		} \
-	} \
-}
-
-#endif
diff --git a/frame/include/level0/1r/bli_scal1rs.h b/frame/include/level0/1r/bli_scal1rs.h
deleted file mode 100644
index f75c589d0..000000000
--- a/frame/include/level0/1r/bli_scal1rs.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL1RS_H
-#define BLIS_SCAL1RS_H
-
-// scal1rs
-
-#define bli_cscal1rs( a, yr, yi ) \
-{ \
-	bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \
-}
-
-#define bli_zscal1rs( a, yr, yi ) \
-{ \
-	bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \
-}
-
-#define bli_scscal1rs( a, yr, yi ) \
-{ \
-	bli_scscalris( bli_sreal(a), bli_simag(a), yr, yi ); \
-}
-
-#define bli_dzscal1rs( a, yr, yi ) \
-{ \
-	bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \
-}
-
-#endif
-
diff --git a/frame/include/level0/bb/bli_scal2bbs_mxn.h b/frame/include/level0/bb/bli_scal2bbs_mxn.h
deleted file mode 100644
index d6f95f97f..000000000
--- a/frame/include/level0/bb/bli_scal2bbs_mxn.h
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2BBS_MXN_H
-#define BLIS_SCAL2BBS_MXN_H
-
-// scal2bbs_mxn
-
-#undef  GENTFUNCRO
-#define GENTFUNCRO( ctype, ch, opname ) \
-\
-BLIS_INLINE void PASTEMAC(ch,opname) \
-     ( \
-       const conj_t          conjx, \
-       const dim_t           m, \
-       const dim_t           n, \
-       const ctype* restrict alpha, \
-       const ctype* restrict x, const inc_t incx, const inc_t ldx, \
-             ctype* restrict y, const inc_t incy, const inc_t ldy  \
-     ) \
-{ \
-	/* Assume that the duplication factor is the row stride of y. */ \
-	const dim_t d    = incy; \
-	const dim_t ds_y = 1; \
-\
-	if ( bli_is_conj( conjx ) ) \
-	{ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			const ctype* restrict xj = x + j*ldx; \
-			      ctype* restrict yj = y + j*ldy; \
-\
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				const ctype* restrict xij = xj + i*incx; \
-				      ctype* restrict yij = yj + i*incy; \
-\
-				PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \
-\
-				for ( dim_t p = 1; p < d; ++p ) \
-				{ \
-					ctype* restrict yijd = yij + p*ds_y; \
-\
-					PASTEMAC(ch,copys)( *yij, *yijd ); \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( bli_is_noconj( conjx ) ) */ \
-	{ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			const ctype* restrict xj = x + j*ldx; \
-			      ctype* restrict yj = y + j*ldy; \
-\
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				const ctype* restrict xij = xj + i*incx; \
-				      ctype* restrict yij = yj + i*incy; \
-\
-				PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \
-\
-				for ( dim_t p = 1; p < d; ++p ) \
-				{ \
-					ctype* restrict yijd = yij + p*ds_y; \
-\
-					PASTEMAC(ch,copys)( *yij, *yijd ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNCRO_BASIC( scal2bbs_mxn )
-
-
-#undef  GENTFUNCCO
-#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \
-\
-BLIS_INLINE void PASTEMAC(ch,opname) \
-     ( \
-       const conj_t          conjx, \
-       const dim_t           m, \
-       const dim_t           n, \
-       const ctype* restrict alpha, \
-       const ctype* restrict x, const inc_t incx, const inc_t ldx, \
-             ctype* restrict y, const inc_t incy, const inc_t ldy  \
-     ) \
-{ \
-	/* Assume that the duplication factor is the row stride of y. */ \
-	const dim_t       d          = incy; \
-	const dim_t       ds_y       = 1; \
-\
-	const inc_t       incx2      = 2 * incx; \
-	const inc_t       ldx2       = 2 * ldx; \
-\
-	const inc_t       incy2      = 2 * incy; \
-	const inc_t       ldy2       = 2 * ldy; \
-\
-	ctype_r* restrict alpha_r    = ( ctype_r* )alpha; \
-	ctype_r* restrict alpha_i    = ( ctype_r* )alpha + 1; \
-	ctype_r* restrict chi_r      = ( ctype_r* )x; \
-	ctype_r* restrict chi_i      = ( ctype_r* )x + 1; \
-	ctype_r* restrict psi_r      = ( ctype_r* )y; \
-	ctype_r* restrict psi_i      = ( ctype_r* )y + 1*d; \
-\
-	if ( bli_is_conj( conjx ) ) \
-	{ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			ctype_r* restrict chij_r = chi_r + j*ldx2; \
-			ctype_r* restrict chij_i = chi_i + j*ldx2; \
-			ctype_r* restrict psij_r = psi_r + j*ldy2; \
-			ctype_r* restrict psij_i = psi_i + j*ldy2; \
-\
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				ctype_r* restrict chiij_r = chij_r + i*incx2; \
-				ctype_r* restrict chiij_i = chij_i + i*incx2; \
-				ctype_r* restrict psiij_r = psij_r + i*incy2; \
-				ctype_r* restrict psiij_i = psij_i + i*incy2; \
-\
-				PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \
-				                        *chiij_r, *chiij_i, \
-				                        *psiij_r, *psiij_i ); \
-\
-				for ( dim_t p = 1; p < d; ++p ) \
-				{ \
-					ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
-					ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
-\
-					PASTEMAC(ch,copyris)( *psiij_r,  *psiij_i, \
-					                      *psiijd_r, *psiijd_i ); \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( bli_is_noconj( conjx ) ) */ \
-	{ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			ctype_r* restrict chij_r = chi_r + j*ldx2; \
-			ctype_r* restrict chij_i = chi_i + j*ldx2; \
-			ctype_r* restrict psij_r = psi_r + j*ldy2; \
-			ctype_r* restrict psij_i = psi_i + j*ldy2; \
-\
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				ctype_r* restrict chiij_r = chij_r + i*incx2; \
-				ctype_r* restrict chiij_i = chij_i + i*incx2; \
-				ctype_r* restrict psiij_r = psij_r + i*incy2; \
-				ctype_r* restrict psiij_i = psij_i + i*incy2; \
-\
-				PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \
-				                       *chiij_r, *chiij_i, \
-				                       *psiij_r, *psiij_i ); \
-\
-				for ( dim_t p = 1; p < d; ++p ) \
-				{ \
-					ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
-					ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
-\
-					PASTEMAC(ch,copyris)( *psiij_r,  *psiij_i, \
-					                      *psiijd_r, *psiijd_i ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNCCO( scal2bbs_mxn )
-
-#endif
diff --git a/frame/include/level0/bli_absq2s.h b/frame/include/level0/bli_absq2s.h
deleted file mode 100644
index dee2bea5f..000000000
--- a/frame/include/level0/bli_absq2s.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_ABSQR2_H
-#define BLIS_ABSQR2_H
-
-// absq2s
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of a.
-
-#define bli_ssabsq2s( x, a )              bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F         )
-#define bli_dsabsq2s( x, a )              bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F         )
-#define bli_csabsq2s( x, a ) { float ti;  bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti           ); ( void )ti; }
-#define bli_zsabsq2s( x, a ) { float ti;  bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti           ); ( void )ti; }
-
-#define bli_sdabsq2s( x, a )              bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0          )
-#define bli_ddabsq2s( x, a )              bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0          )
-#define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti           ); ( void )ti; }
-#define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti           ); ( void )ti; }
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scabsq2s( x, a )              bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
-#define bli_dcabsq2s( x, a )              bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
-#define bli_ccabsq2s( x, a )              bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
-#define bli_zcabsq2s( x, a )              bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )
-
-#define bli_szabsq2s( x, a )              bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
-#define bli_dzabsq2s( x, a )              bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
-#define bli_czabsq2s( x, a )              bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
-#define bli_zzabsq2s( x, a )              bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scabsq2s( x, a )  bli_scsets(          (x) *          (x), 0.0, (a) )
-#define bli_dcabsq2s( x, a )  bli_dcsets(          (x) *          (x), 0.0, (a) )
-#define bli_ccabsq2s( x, a )  bli_ccsets( bli_creal(x) * bli_creal(x) + \
-                                          bli_cimag(x) * bli_cimag(x), 0.0, (a) )
-#define bli_zcabsq2s( x, a )  bli_zcsets( bli_zreal(x) * bli_zreal(x) + \
-                                          bli_zimag(x) * bli_zimag(x), 0.0, (a) )
-
-#define bli_szabsq2s( x, a )  bli_szsets(          (x) *          (x), 0.0, (a) )
-#define bli_dzabsq2s( x, a )  bli_dzsets(          (x) *          (x), 0.0, (a) )
-#define bli_czabsq2s( x, a )  bli_czsets( bli_creal(x) * bli_creal(x) + \
-                                          bli_cimag(x) * bli_cimag(x), 0.0, (a) )
-#define bli_zzabsq2s( x, a )  bli_zzsets( bli_zreal(x) * bli_zreal(x) + \
-                                          bli_zimag(x) * bli_zimag(x), 0.0, (a) )
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-#define bli_sabsq2s( x, a )  bli_ssabsq2s( x, a )
-#define bli_dabsq2s( x, a )  bli_ddabsq2s( x, a )
-#define bli_cabsq2s( x, a )  bli_ccabsq2s( x, a )
-#define bli_zabsq2s( x, a )  bli_zzabsq2s( x, a )
-
-
-#endif
diff --git a/frame/include/level0/bli_abval2s.h b/frame/include/level0/bli_abval2s.h
deleted file mode 100644
index 63df867df..000000000
--- a/frame/include/level0/bli_abval2s.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_ABVAL2S_H
-#define BLIS_ABVAL2S_H
-
-// abval2s
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of a.
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_ssabval2s( x, a )              bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F         )
-#define bli_dsabval2s( x, a )              bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F         )
-#define bli_csabval2s( x, a ) { float ti;  bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti           ); ( void )ti; }
-#define bli_zsabval2s( x, a ) { float ti;  bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti           ); ( void )ti; }
-
-#define bli_sdabval2s( x, a )              bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0          )
-#define bli_ddabval2s( x, a )              bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0          )
-#define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti           ); ( void )ti; }
-#define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti           ); ( void )ti; }
-
-#define bli_scabval2s( x, a )              bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
-#define bli_dcabval2s( x, a )              bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
-#define bli_ccabval2s( x, a )              bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
-#define bli_zcabval2s( x, a )              bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )
-
-#define bli_szabval2s( x, a )              bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
-#define bli_dzabval2s( x, a )              bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
-#define bli_czabval2s( x, a )              bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
-#define bli_zzabval2s( x, a )              bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_ssabval2s( x, a )  bli_sssets( fabsf(x), 0.0, (a) )
-#define bli_dsabval2s( x, a )  bli_dssets( fabs (x), 0.0, (a) )
-#define bli_csabval2s( x, a )  bli_cssets( cabsf(x), 0.0, (a) )
-#define bli_zsabval2s( x, a )  bli_zssets( cabs (x), 0.0, (a) )
-
-#define bli_sdabval2s( x, a )  bli_sdsets( fabsf(x), 0.0, (a) )
-#define bli_ddabval2s( x, a )  bli_ddsets( fabs (x), 0.0, (a) )
-#define bli_cdabval2s( x, a )  bli_cdsets( cabsf(x), 0.0, (a) )
-#define bli_zdabval2s( x, a )  bli_zdsets( cabs (x), 0.0, (a) )
-
-#define bli_scabval2s( x, a )  bli_scsets( fabsf(x), 0.0, (a) )
-#define bli_dcabval2s( x, a )  bli_dcsets( fabs (x), 0.0, (a) )
-#define bli_ccabval2s( x, a )  bli_ccsets( cabsf(x), 0.0, (a) )
-#define bli_zcabval2s( x, a )  bli_zcsets( cabs (x), 0.0, (a) )
-
-#define bli_szabval2s( x, a )  bli_szsets( fabsf(x), 0.0, (a) )
-#define bli_dzabval2s( x, a )  bli_dzsets( fabs (x), 0.0, (a) )
-#define bli_czabval2s( x, a )  bli_czsets( cabsf(x), 0.0, (a) )
-#define bli_zzabval2s( x, a )  bli_zzsets( cabs (x), 0.0, (a) )
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_sabval2s( x, a )  bli_ssabval2s( x, a )
-#define bli_dabval2s( x, a )  bli_ddabval2s( x, a )
-#define bli_cabval2s( x, a )  bli_ccabval2s( x, a )
-#define bli_zabval2s( x, a )  bli_zzabval2s( x, a )
-
-
-#endif
diff --git a/frame/include/level0/bli_add3s.h b/frame/include/level0/bli_add3s.h
deleted file mode 100644
index f922a791b..000000000
--- a/frame/include/level0/bli_add3s.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_ADD3S_H
-#define BLIS_ADD3S_H
-
-// add3s
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of b.
-// - The third char encodes the type of c.
-
-
-// -- (axy) = (??s) ------------------------------------------------------------
-
-#define bli_sssadd3s( a, b, c )  bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) )
-#define bli_dssadd3s( a, b, c )  bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) )
-#define bli_cssadd3s( a, b, c )  bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) )
-#define bli_zssadd3s( a, b, c )  bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) )
-
-#define bli_sdsadd3s( a, b, c )  bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) )
-#define bli_ddsadd3s( a, b, c )  bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) )
-#define bli_cdsadd3s( a, b, c )  bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) )
-#define bli_zdsadd3s( a, b, c )  bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) )
-
-#define bli_scsadd3s( a, b, c )  bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) )
-#define bli_dcsadd3s( a, b, c )  bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) )
-#define bli_ccsadd3s( a, b, c )  bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) )
-#define bli_zcsadd3s( a, b, c )  bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) )
-
-#define bli_szsadd3s( a, b, c )  bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) )
-#define bli_dzsadd3s( a, b, c )  bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) )
-#define bli_czsadd3s( a, b, c )  bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) )
-#define bli_zzsadd3s( a, b, c )  bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) )
-
-// -- (axy) = (??d) ------------------------------------------------------------
-
-#define bli_ssdadd3s( a, b, c )  bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) )
-#define bli_dsdadd3s( a, b, c )  bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) )
-#define bli_csdadd3s( a, b, c )  bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) )
-#define bli_zsdadd3s( a, b, c )  bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) )
-
-#define bli_sddadd3s( a, b, c )  bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )
-#define bli_dddadd3s( a, b, c )  bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )
-#define bli_cddadd3s( a, b, c )  bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )
-#define bli_zddadd3s( a, b, c )  bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )
-
-#define bli_scdadd3s( a, b, c )  bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
-#define bli_dcdadd3s( a, b, c )  bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
-#define bli_ccdadd3s( a, b, c )  bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
-#define bli_zcdadd3s( a, b, c )  bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
-
-#define bli_szdadd3s( a, b, c )  bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
-#define bli_dzdadd3s( a, b, c )  bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
-#define bli_czdadd3s( a, b, c )  bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
-#define bli_zzdadd3s( a, b, c )  bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscadd3s( a, b, c )  bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
-#define bli_dscadd3s( a, b, c )  bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
-#define bli_cscadd3s( a, b, c )  bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
-#define bli_zscadd3s( a, b, c )  bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
-
-#define bli_sdcadd3s( a, b, c )  bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
-#define bli_ddcadd3s( a, b, c )  bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
-#define bli_cdcadd3s( a, b, c )  bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
-#define bli_zdcadd3s( a, b, c )  bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
-
-#define bli_sccadd3s( a, b, c )  bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
-#define bli_dccadd3s( a, b, c )  bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
-#define bli_cccadd3s( a, b, c )  bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
-#define bli_zccadd3s( a, b, c )  bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
-
-#define bli_szcadd3s( a, b, c )  bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
-#define bli_dzcadd3s( a, b, c )  bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
-#define bli_czcadd3s( a, b, c )  bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
-#define bli_zzcadd3s( a, b, c )  bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszadd3s( a, b, c )  bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
-#define bli_dszadd3s( a, b, c )  bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
-#define bli_cszadd3s( a, b, c )  bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
-#define bli_zszadd3s( a, b, c )  bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
-
-#define bli_sdzadd3s( a, b, c )  bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
-#define bli_ddzadd3s( a, b, c )  bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
-#define bli_cdzadd3s( a, b, c )  bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
-#define bli_zdzadd3s( a, b, c )  bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
-
-#define bli_sczadd3s( a, b, c )  bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
-#define bli_dczadd3s( a, b, c )  bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
-#define bli_cczadd3s( a, b, c )  bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
-#define bli_zczadd3s( a, b, c )  bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
-
-#define bli_szzadd3s( a, b, c )  bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
-#define bli_dzzadd3s( a, b, c )  bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
-#define bli_czzadd3s( a, b, c )  bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
-#define bli_zzzadd3s( a, b, c )  bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_dscadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_cscadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_zscadd3s( a, b, c )  { (c) = (a) + (b); }
-
-#define bli_sdcadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_ddcadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_cdcadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_zdcadd3s( a, b, c )  { (c) = (a) + (b); }
-
-#define bli_sccadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_dccadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_cccadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_zccadd3s( a, b, c )  { (c) = (a) + (b); }
-
-#define bli_szcadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_dzcadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_czcadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_zzcadd3s( a, b, c )  { (c) = (a) + (b); }
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_dszadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_cszadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_zszadd3s( a, b, c )  { (c) = (a) + (b); }
-
-#define bli_sdzadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_ddzadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_cdzadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_zdzadd3s( a, b, c )  { (c) = (a) + (b); }
-
-#define bli_sczadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_dczadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_cczadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_zczadd3s( a, b, c )  { (c) = (a) + (b); }
-
-#define bli_szzadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_dzzadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_czzadd3s( a, b, c )  { (c) = (a) + (b); }
-#define bli_zzzadd3s( a, b, c )  { (c) = (a) + (b); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_sadd3s( a, b, c )  bli_sssadd3s( a, b, c )
-#define bli_dadd3s( a, b, c )  bli_dddadd3s( a, b, c )
-#define bli_cadd3s( a, b, c )  bli_cccadd3s( a, b, c )
-#define bli_zadd3s( a, b, c )  bli_zzzadd3s( a, b, c )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_addjs.h b/frame/include/level0/bli_addjs.h
deleted file mode 100644
index abe570c53..000000000
--- a/frame/include/level0/bli_addjs.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_ADDJS_H
-#define BLIS_ADDJS_H
-
-// addjs
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of y.
-
-#define bli_ssaddjs( a, y )  bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
-#define bli_dsaddjs( a, y )  bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
-#define bli_csaddjs( a, y )  bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
-#define bli_zsaddjs( a, y )  bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdaddjs( a, y )  bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_ddaddjs( a, y )  bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_cdaddjs( a, y )  bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_zdaddjs( a, y )  bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scaddjs( a, y )  bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
-#define bli_dcaddjs( a, y )  bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
-#define bli_ccaddjs( a, y )  bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
-#define bli_zcaddjs( a, y )  bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
-
-#define bli_szaddjs( a, y )  bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_dzaddjs( a, y )  bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_czaddjs( a, y )  bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_zzaddjs( a, y )  bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scaddjs( a, y )  { (y) +=      (a); }
-#define bli_dcaddjs( a, y )  { (y) +=      (a); }
-#define bli_ccaddjs( a, y )  { (y) += conjf(a); }
-#define bli_zcaddjs( a, y )  { (y) += conj (a); }
-
-#define bli_szaddjs( a, y )  { (y) +=      (a); }
-#define bli_dzaddjs( a, y )  { (y) +=      (a); }
-#define bli_czaddjs( a, y )  { (y) += conjf(a); }
-#define bli_zzaddjs( a, y )  { (y) += conj (a); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_saddjs( a, y )  bli_ssaddjs( a, y )
-#define bli_daddjs( a, y )  bli_ddaddjs( a, y )
-#define bli_caddjs( a, y )  bli_ccaddjs( a, y )
-#define bli_zaddjs( a, y )  bli_zzaddjs( a, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_adds.h b/frame/include/level0/bli_adds.h
deleted file mode 100644
index 340f2c3e0..000000000
--- a/frame/include/level0/bli_adds.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_ADDS_H
-#define BLIS_ADDS_H
-
-// adds
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of y.
-
-#define bli_ssadds( a, y )  bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
-#define bli_dsadds( a, y )  bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
-#define bli_csadds( a, y )  bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
-#define bli_zsadds( a, y )  bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdadds( a, y )  bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_ddadds( a, y )  bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_cdadds( a, y )  bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_zdadds( a, y )  bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scadds( a, y )  bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
-#define bli_dcadds( a, y )  bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
-#define bli_ccadds( a, y )  bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
-#define bli_zcadds( a, y )  bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
-
-#define bli_szadds( a, y )  bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_dzadds( a, y )  bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_czadds( a, y )  bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_zzadds( a, y )  bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scadds( a, y )  { (y) += (a); }
-#define bli_dcadds( a, y )  { (y) += (a); }
-#define bli_ccadds( a, y )  { (y) += (a); }
-#define bli_zcadds( a, y )  { (y) += (a); }
-
-#define bli_szadds( a, y )  { (y) += (a); }
-#define bli_dzadds( a, y )  { (y) += (a); }
-#define bli_czadds( a, y )  { (y) += (a); }
-#define bli_zzadds( a, y )  { (y) += (a); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_sadds( a, y )  bli_ssadds( a, y )
-#define bli_dadds( a, y )  bli_ddadds( a, y )
-#define bli_cadds( a, y )  bli_ccadds( a, y )
-#define bli_zadds( a, y )  bli_zzadds( a, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_adds_mxn.h b/frame/include/level0/bli_adds_mxn.h
deleted file mode 100644
index 8a92a17a6..000000000
--- a/frame/include/level0/bli_adds_mxn.h
+++ /dev/null
@@ -1,513 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_ADDS_MXN_H
-#define BLIS_ADDS_MXN_H
-
-// adds_mxn
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-
-// xy = ?s
-
-BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          float*    restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_ssadds( *(x + ii + jj*cs_x),
-		            *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_ssadds( *(x + ii*rs_x + jj),
-		            *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_ssadds( *(x + ii*rs_x + jj*cs_x),
-		            *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          float*    restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dsadds( *(x + ii + jj*cs_x),
-		            *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_dsadds( *(x + ii*rs_x + jj),
-		            *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dsadds( *(x + ii*rs_x + jj*cs_x),
-		            *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          float*    restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_csadds( *(x + ii + jj*cs_x),
-		            *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_csadds( *(x + ii*rs_x + jj),
-		            *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_csadds( *(x + ii*rs_x + jj*cs_x),
-		            *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          float*    restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zsadds( *(x + ii + jj*cs_x),
-		            *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_zsadds( *(x + ii*rs_x + jj),
-		            *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zsadds( *(x + ii*rs_x + jj*cs_x),
-		            *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-
-// xy = ?d
-
-BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          double*   restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_sdadds( *(x + ii + jj*cs_x),
-		            *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_sdadds( *(x + ii*rs_x + jj),
-		            *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_sdadds( *(x + ii*rs_x + jj*cs_x),
-		            *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          double*   restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_ddadds( *(x + ii + jj*cs_x),
-		            *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_ddadds( *(x + ii*rs_x + jj),
-		            *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_ddadds( *(x + ii*rs_x + jj*cs_x),
-		            *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          double*   restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_cdadds( *(x + ii + jj*cs_x),
-		            *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_cdadds( *(x + ii*rs_x + jj),
-		            *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_cdadds( *(x + ii*rs_x + jj*cs_x),
-		            *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          double*   restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zdadds( *(x + ii + jj*cs_x),
-		            *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_zdadds( *(x + ii*rs_x + jj),
-		            *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zdadds( *(x + ii*rs_x + jj*cs_x),
-		            *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-
-// xy = ?c
-
-BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_scadds( *(x + ii + jj*cs_x),
-		            *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_scadds( *(x + ii*rs_x + jj),
-		            *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_scadds( *(x + ii*rs_x + jj*cs_x),
-		            *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dcadds( *(x + ii + jj*cs_x),
-		            *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_dcadds( *(x + ii*rs_x + jj),
-		            *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dcadds( *(x + ii*rs_x + jj*cs_x),
-		            *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_ccadds( *(x + ii + jj*cs_x),
-		            *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_ccadds( *(x + ii*rs_x + jj),
-		            *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_ccadds( *(x + ii*rs_x + jj*cs_x),
-		            *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zcadds( *(x + ii + jj*cs_x),
-		            *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_zcadds( *(x + ii*rs_x + jj),
-		            *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zcadds( *(x + ii*rs_x + jj*cs_x),
-		            *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-
-// xy = ?z
-
-BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_szadds( *(x + ii + jj*cs_x),
-		            *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_szadds( *(x + ii*rs_x + jj),
-		            *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_szadds( *(x + ii*rs_x + jj*cs_x),
-		            *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dzadds( *(x + ii + jj*cs_x),
-		            *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_dzadds( *(x + ii*rs_x + jj),
-		            *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dzadds( *(x + ii*rs_x + jj*cs_x),
-		            *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_czadds( *(x + ii + jj*cs_x),
-		            *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_czadds( *(x + ii*rs_x + jj),
-		            *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_czadds( *(x + ii*rs_x + jj*cs_x),
-		            *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                          dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zzadds( *(x + ii + jj*cs_x),
-		            *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_zzadds( *(x + ii*rs_x + jj),
-		            *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zzadds( *(x + ii*rs_x + jj*cs_x),
-		            *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-
-
-
-BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                         float*    restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-	bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-}
-BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                         double*   restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-	bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-}
-BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                         scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-	bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-}
-BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-                                                         dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-	bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-}
-
-
-#endif
diff --git a/frame/include/level0/bli_adds_mxn_uplo.h b/frame/include/level0/bli_adds_mxn_uplo.h
deleted file mode 100644
index 29f0ee038..000000000
--- a/frame/include/level0/bli_adds_mxn_uplo.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_ADDS_MXN_UPLO_H
-#define BLIS_ADDS_MXN_UPLO_H
-
-// adds_mxn_u
-
-#define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	{ \
-		for ( _i = 0; _i < m; ++_i ) \
-		{ \
-			if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
-			{ \
-				bli_ssadds( *(x + _i*rs_x + _j*cs_x), \
-				            *(y + _i*rs_y + _j*cs_y) ); \
-			} \
-		} \
-	} \
-}
-
-#define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	{ \
-		for ( _i = 0; _i < m; ++_i ) \
-		{ \
-			if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
-			{ \
-				bli_ddadds( *(x + _i*rs_x + _j*cs_x), \
-				            *(y + _i*rs_y + _j*cs_y) ); \
-			} \
-		} \
-	} \
-}
-
-#define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	{ \
-		for ( _i = 0; _i < m; ++_i ) \
-		{ \
-			if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
-			{ \
-				bli_ccadds( *(x + _i*rs_x + _j*cs_x), \
-				            *(y + _i*rs_y + _j*cs_y) ); \
-			} \
-		} \
-	} \
-}
-
-#define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	{ \
-		for ( _i = 0; _i < m; ++_i ) \
-		{ \
-			if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
-			{ \
-				bli_zzadds( *(x + _i*rs_x + _j*cs_x), \
-				            *(y + _i*rs_y + _j*cs_y) ); \
-			} \
-		} \
-	} \
-}
-
-// adds_mxn_l
-
-#define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	{ \
-		for ( _i = 0; _i < m; ++_i ) \
-		{ \
-			if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
-			{ \
-				bli_ssadds( *(x + _i*rs_x + _j*cs_x), \
-				            *(y + _i*rs_y + _j*cs_y) ); \
-			} \
-		} \
-	} \
-}
-
-#define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	{ \
-		for ( _i = 0; _i < m; ++_i ) \
-		{ \
-			if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
-			{ \
-				bli_ddadds( *(x + _i*rs_x + _j*cs_x), \
-				            *(y + _i*rs_y + _j*cs_y) ); \
-			} \
-		} \
-	} \
-}
-
-#define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	{ \
-		for ( _i = 0; _i < m; ++_i ) \
-		{ \
-			if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
-			{ \
-				bli_ccadds( *(x + _i*rs_x + _j*cs_x), \
-				            *(y + _i*rs_y + _j*cs_y) ); \
-			} \
-		} \
-	} \
-}
-
-#define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	{ \
-		for ( _i = 0; _i < m; ++_i ) \
-		{ \
-			if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
-			{ \
-				bli_zzadds( *(x + _i*rs_x + _j*cs_x), \
-				            *(y + _i*rs_y + _j*cs_y) ); \
-			} \
-		} \
-	} \
-}
-
-
-#define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
-}
-#define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
-}
-#define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
-}
-#define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
-}
-#define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
-}
-#define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
-}
-#define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
-}
-#define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
-}
-
-#endif
diff --git a/frame/include/level0/old/ri3/bli_copyri3s.h b/frame/include/level0/bli_assigns.h
similarity index 77%
rename from frame/include/level0/old/ri3/bli_copyri3s.h
rename to frame/include/level0/bli_assigns.h
index 86ec79b0a..97a79b299 100644
--- a/frame/include/level0/old/ri3/bli_copyri3s.h
+++ b/frame/include/level0/bli_assigns.h
@@ -32,34 +32,21 @@
 
 */
 
-#ifndef BLIS_COPYRI3S_H
-#define BLIS_COPYRI3S_H
+#ifndef BLIS_ASSIGNS_H
+#define BLIS_ASSIGNS_H
 
-// copyri3s
+// assigns
 
-#define bli_scopyri3s( ar, ai, br, bi, bri ) \
-{ \
-	(br) = (ar); \
-}
+// Notes:
+// - The first char encodes the domain of output y.
+// - These macros are used to avoid updates on an output's imaginary part
+//   when that output exists only in the real domain (i.e. has no imaginary
+//   part to begin with).
 
-#define bli_dcopyri3s( ar, ai, br, bi, bri ) \
-{ \
-	(br) = (ar); \
-}
+#define bli_rassigns( xr, xi, yr, yi ) { yr = xr;          }
+#define bli_cassigns( xr, xi, yr, yi ) { yr = xr; yi = xi; }
+#define bli_jassigns( xr, xi, yr, yi ) {          yi = xi; }
 
-#define bli_ccopyri3s( ar, ai, br, bi, bri ) \
-{ \
-	(br)  = (ar); \
-	(bi)  = (ai); \
-	(bri) = (ar) + (ai); \
-}
-
-#define bli_zcopyri3s( ar, ai, br, bi, bri ) \
-{ \
-	(br)  = (ar); \
-	(bi)  = (ai); \
-	(bri) = (ar) + (ai); \
-}
 
 #endif
 
diff --git a/frame/include/level0/bli_axmys.h b/frame/include/level0/bli_axmys.h
deleted file mode 100644
index 7b0410caa..000000000
--- a/frame/include/level0/bli_axmys.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_AXMYS_H
-#define BLIS_AXMYS_H
-
-// axmys
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of y.
-
-
-// -- (axy) = (??s) ------------------------------------------------------------
-
-#define bli_sssaxmys( a, x, y )  bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dssaxmys( a, x, y )  bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_cssaxmys( a, x, y )  bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zssaxmys( a, x, y )  bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdsaxmys( a, x, y )  bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_ddsaxmys( a, x, y )  bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_cdsaxmys( a, x, y )  bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zdsaxmys( a, x, y )  bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_scsaxmys( a, x, y )  bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dcsaxmys( a, x, y )  bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_ccsaxmys( a, x, y )  bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zcsaxmys( a, x, y )  bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_szsaxmys( a, x, y )  bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dzsaxmys( a, x, y )  bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_czsaxmys( a, x, y )  bli_saxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zzsaxmys( a, x, y )  bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-
-// -- (axy) = (??d) ------------------------------------------------------------
-
-#define bli_ssdaxmys( a, x, y )  bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dsdaxmys( a, x, y )  bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_csdaxmys( a, x, y )  bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zsdaxmys( a, x, y )  bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-
-#define bli_sddaxmys( a, x, y )  bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dddaxmys( a, x, y )  bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_cddaxmys( a, x, y )  bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zddaxmys( a, x, y )  bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-
-#define bli_scdaxmys( a, x, y )  bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dcdaxmys( a, x, y )  bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_ccdaxmys( a, x, y )  bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zcdaxmys( a, x, y )  bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-
-#define bli_szdaxmys( a, x, y )  bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dzdaxmys( a, x, y )  bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_czdaxmys( a, x, y )  bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zzdaxmys( a, x, y )  bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscaxmys( a, x, y )  bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dscaxmys( a, x, y )  bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cscaxmys( a, x, y )  bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zscaxmys( a, x, y )  bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_sdcaxmys( a, x, y )  bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_ddcaxmys( a, x, y )  bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cdcaxmys( a, x, y )  bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zdcaxmys( a, x, y )  bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_sccaxmys( a, x, y )  bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dccaxmys( a, x, y )  bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cccaxmys( a, x, y )   bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zccaxmys( a, x, y )   bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_szcaxmys( a, x, y )  bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dzcaxmys( a, x, y )  bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_czcaxmys( a, x, y )   bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zzcaxmys( a, x, y )   bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszaxmys( a, x, y )  bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dszaxmys( a, x, y )  bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_cszaxmys( a, x, y )  bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zszaxmys( a, x, y )  bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-
-#define bli_sdzaxmys( a, x, y )  bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_ddzaxmys( a, x, y )  bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_cdzaxmys( a, x, y )  bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zdzaxmys( a, x, y )  bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-
-#define bli_sczaxmys( a, x, y )  bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dczaxmys( a, x, y )  bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_cczaxmys( a, x, y )   bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zczaxmys( a, x, y )   bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-
-#define bli_szzaxmys( a, x, y )  bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dzzaxmys( a, x, y )  bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_czzaxmys( a, x, y )   bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zzzaxmys( a, x, y )   bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_dscaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_cscaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_zscaxmys( a, x, y )  { (y) -= (a) * (x); }
-
-#define bli_sdcaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_ddcaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_cdcaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_zdcaxmys( a, x, y )  { (y) -= (a) * (x); }
-
-#define bli_sccaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_dccaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_cccaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_zccaxmys( a, x, y )  { (y) -= (a) * (x); }
-
-#define bli_szcaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_dzcaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_czcaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_zzcaxmys( a, x, y )  { (y) -= (a) * (x); }
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_dszaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_cszaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_zszaxmys( a, x, y )  { (y) -= (a) * (x); }
-
-#define bli_sdzaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_ddzaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_cdzaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_zdzaxmys( a, x, y )  { (y) -= (a) * (x); }
-
-#define bli_sczaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_dczaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_cczaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_zczaxmys( a, x, y )  { (y) -= (a) * (x); }
-
-#define bli_szzaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_dzzaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_czzaxmys( a, x, y )  { (y) -= (a) * (x); }
-#define bli_zzzaxmys( a, x, y )  { (y) -= (a) * (x); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_saxmys( a, x, y )  bli_sssaxmys( a, x, y )
-#define bli_daxmys( a, x, y )  bli_dddaxmys( a, x, y )
-#define bli_caxmys( a, x, y )  bli_cccaxmys( a, x, y )
-#define bli_zaxmys( a, x, y )  bli_zzzaxmys( a, x, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_axpbyjs.h b/frame/include/level0/bli_axpbyjs.h
deleted file mode 100644
index 8f229f1d9..000000000
--- a/frame/include/level0/bli_axpbyjs.h
+++ /dev/null
@@ -1,480 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_AXPBYJS_H
-#define BLIS_AXPBYJS_H
-
-// axpbyjs
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of b.
-// - The fourth char encodes the type of y.
-
-// -- (axby) = (???s) ----------------------------------------------------------
-
-#define bli_ssssaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dsssaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_csssaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zsssaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_sdssaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_ddssaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cdssaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zdssaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_scssaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dcssaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_ccssaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zcssaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_szssaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dzssaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_czssaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zzssaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-
-#define bli_ssdsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dsdsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_csdsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zsdsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_sddsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dddsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cddsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zddsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_scdsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dcdsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_ccdsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zcdsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_szdsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dzdsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_czdsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zzdsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-
-#define bli_sscsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dscsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cscsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zscsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_sdcsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_ddcsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cdcsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zdcsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_sccsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dccsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cccsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zccsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_szcsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dzcsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_czcsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zzcsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-
-#define bli_sszsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dszsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cszsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zszsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_sdzsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_ddzsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cdzsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zdzsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_sczsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dczsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cczsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zczsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_szzsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dzzsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_czzsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zzzsaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-
-// -- (axby) = (???d) ----------------------------------------------------------
-
-#define bli_sssdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dssdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cssdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zssdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_sdsdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_ddsdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cdsdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zdsdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_scsdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dcsdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_ccsdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zcsdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_szsdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dzsdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_czsdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zzsdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-
-#define bli_ssddaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dsddaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_csddaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zsddaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_sdddaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_ddddaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cdddaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zdddaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_scddaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dcddaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_ccddaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zcddaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_szddaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dzddaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_czddaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zzddaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-
-#define bli_sscdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dscdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cscdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zscdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_sdcdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_ddcdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cdcdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zdcdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_sccdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dccdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cccdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zccdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_szcdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dzcdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_czcdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zzcdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-
-#define bli_sszdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dszdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cszdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zszdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_sdzdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_ddzdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cdzdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zdzdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_sczdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dczdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cczdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zczdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_szzdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dzzdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_czzdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zzzdaxpbyjs( a, x, b, y )  bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-// -- (axby) = (???c) ----------------------------------------------------------
-
-#define bli_ssscaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dsscaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_csscaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zsscaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_sdscaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_ddscaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cdscaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zdscaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_scscaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dcscaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_ccscaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zcscaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_szscaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dzscaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_czscaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zzscaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-
-#define bli_ssdcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dsdcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_csdcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zsdcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_sddcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dddcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cddcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zddcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_scdcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dcdcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_ccdcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zcdcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_szdcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dzdcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_czdcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zzdcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-
-#define bli_ssccaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dsccaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_csccaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zsccaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_sdccaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_ddccaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cdccaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zdccaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_scccaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dcccaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_ccccaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zcccaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_szccaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dzccaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_czccaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zzccaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-
-#define bli_sszcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dszcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cszcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zszcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_sdzcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_ddzcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cdzcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zdzcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_sczcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dczcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cczcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zczcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_szzcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dzzcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_czzcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zzzcaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-
-// -- (axby) = (???z) ----------------------------------------------------------
-
-#define bli_ssszaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dsszaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_csszaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zsszaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_sdszaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_ddszaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cdszaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zdszaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_scszaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dcszaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_ccszaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zcszaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_szszaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dzszaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_czszaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zzszaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-
-#define bli_ssdzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dsdzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_csdzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zsdzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_sddzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dddzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cddzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zddzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_scdzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dcdzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_ccdzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zcdzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_szdzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dzdzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_czdzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zzdzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-
-#define bli_ssczaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dsczaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_csczaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zsczaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_sdczaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_ddczaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cdczaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zdczaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_scczaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dcczaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_ccczaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zcczaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_szczaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dzczaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_czczaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zzczaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-
-#define bli_sszzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dszzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cszzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zszzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_sdzzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_ddzzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cdzzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zdzzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_sczzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dczzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cczzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zczzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_szzzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dzzzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_czzzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zzzzaxpbyjs( a, x, b, y )  bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-// -- (axby) = (???c) ----------------------------------------------------------
-
-#define bli_ssscaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_dsscaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_csscaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_zsscaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_sdscaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_ddscaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_cdscaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_zdscaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_scscaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_dcscaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_ccscaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_zcscaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_szscaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_dzscaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_czscaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_zzscaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-
-#define bli_ssdcaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_dsdcaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_csdcaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_zsdcaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_sddcaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_dddcaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_cddcaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_zddcaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_scdcaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_dcdcaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_ccdcaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_zcdcaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_szdcaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_dzdcaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_czdcaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_zzdcaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-
-#define bli_ssccaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_dsccaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_csccaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_zsccaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_sdccaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_ddccaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_cdccaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_zdccaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_scccaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_dcccaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_ccccaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_zcccaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_szccaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_dzccaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_czccaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_zzccaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-
-#define bli_sszcaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_dszcaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_cszcaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_zszcaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_sdzcaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_ddzcaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_cdzcaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_zdzcaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_sczcaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_dczcaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_cczcaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_zczcaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_szzcaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_dzzcaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_czzcaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_zzzcaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-
-// -- (axby) = (???z) ----------------------------------------------------------
-
-#define bli_ssszaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_dsszaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_csszaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_zsszaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_sdszaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_ddszaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_cdszaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_zdszaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_scszaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_dcszaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_ccszaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_zcszaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_szszaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_dzszaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_czszaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_zzszaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-
-#define bli_ssdzaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_dsdzaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_csdzaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_zsdzaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_sddzaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_dddzaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_cddzaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_zddzaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_scdzaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_dcdzaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_ccdzaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_zcdzaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_szdzaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_dzdzaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_czdzaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_zzdzaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-
-#define bli_ssczaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_dsczaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_csczaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_zsczaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_sdczaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_ddczaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_cdczaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_zdczaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_scczaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_dcczaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_ccczaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_zcczaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_szczaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_dzczaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_czczaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_zzczaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-
-#define bli_sszzaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_dszzaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_cszzaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_zszzaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_sdzzaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_ddzzaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_cdzzaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_zdzzaxpbyjs( a, x, b, y )  { (y) = (a) *      (x) + (b) * (y); }
-#define bli_sczzaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_dczzaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_cczzaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_zczzaxpbyjs( a, x, b, y )  { (y) = (a) * conjf(x) + (b) * (y); }
-#define bli_szzzaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_dzzzaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_czzzaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-#define bli_zzzzaxpbyjs( a, x, b, y )  { (y) = (a) *  conj(x) + (b) * (y); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_saxpbyjs( a, x, b, y )  bli_ssssaxpbyjs( a, x, b, y )
-#define bli_daxpbyjs( a, x, b, y )  bli_ddddaxpbyjs( a, x, b, y )
-#define bli_caxpbyjs( a, x, b, y )  bli_ccccaxpbyjs( a, x, b, y )
-#define bli_zaxpbyjs( a, x, b, y )  bli_zzzzaxpbyjs( a, x, b, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_axpbys.h b/frame/include/level0/bli_axpbys.h
deleted file mode 100644
index 411408932..000000000
--- a/frame/include/level0/bli_axpbys.h
+++ /dev/null
@@ -1,480 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_AXPBYS_H
-#define BLIS_AXPBYS_H
-
-// axpbys
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of b.
-// - The fourth char encodes the type of y.
-
-// -- (axby) = (???s) ----------------------------------------------------------
-
-#define bli_ssssaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dsssaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_csssaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zsssaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_sdssaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_ddssaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cdssaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zdssaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_scssaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dcssaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_ccssaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zcssaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_szssaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dzssaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_czssaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zzssaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-
-#define bli_ssdsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dsdsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_csdsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zsdsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_sddsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dddsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cddsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zddsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_scdsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dcdsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_ccdsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zcdsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_szdsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dzdsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_czdsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zzdsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-
-#define bli_sscsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dscsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cscsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zscsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_sdcsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_ddcsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cdcsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zdcsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_sccsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dccsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cccsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zccsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_szcsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dzcsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_czcsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zzcsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-
-#define bli_sszsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dszsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cszsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zszsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_sdzsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_ddzsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cdzsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zdzsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_sczsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dczsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cczsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zczsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_szzsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dzzsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_czzsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zzzsaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-
-// -- (axby) = (???d) ----------------------------------------------------------
-
-#define bli_sssdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dssdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cssdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zssdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_sdsdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_ddsdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cdsdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zdsdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_scsdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dcsdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_ccsdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zcsdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_szsdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dzsdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_czsdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zzsdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-
-#define bli_ssddaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dsddaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_csddaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zsddaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_sdddaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_ddddaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cdddaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zdddaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_scddaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dcddaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_ccddaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zcddaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_szddaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dzddaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_czddaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zzddaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-
-#define bli_sscdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dscdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cscdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zscdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_sdcdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_ddcdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cdcdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zdcdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_sccdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dccdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cccdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zccdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_szcdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dzcdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_czcdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zzcdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-
-#define bli_sszdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dszdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cszdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zszdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_sdzdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_ddzdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cdzdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zdzdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_sczdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dczdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cczdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zczdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_szzdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dzzdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_czzdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zzzdaxpbys( a, x, b, y )  bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-// -- (axby) = (???c) ----------------------------------------------------------
-
-#define bli_ssscaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dsscaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_csscaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zsscaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_sdscaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_ddscaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cdscaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zdscaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_scscaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dcscaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_ccscaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zcscaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_szscaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dzscaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_czscaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zzscaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-
-#define bli_ssdcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dsdcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_csdcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zsdcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_sddcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dddcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cddcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zddcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_scdcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dcdcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_ccdcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zcdcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_szdcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dzdcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_czdcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zzdcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-
-#define bli_ssccaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dsccaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_csccaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zsccaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_sdccaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_ddccaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cdccaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zdccaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_scccaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dcccaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_ccccaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zcccaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_szccaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dzccaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_czccaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zzccaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-
-#define bli_sszcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dszcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cszcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zszcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_sdzcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_ddzcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cdzcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zdzcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_sczcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dczcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cczcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zczcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_szzcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dzzcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_czzcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zzzcaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-
-// -- (axby) = (???z) ----------------------------------------------------------
-
-#define bli_ssszaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dsszaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_csszaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zsszaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_sdszaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_ddszaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cdszaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zdszaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_scszaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dcszaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_ccszaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zcszaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_szszaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dzszaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_czszaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zzszaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-
-#define bli_ssdzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dsdzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_csdzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zsdzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_sddzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dddzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cddzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zddzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_scdzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dcdzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_ccdzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zcdzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_szdzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dzdzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_czdzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zzdzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-
-#define bli_ssczaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dsczaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_csczaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zsczaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_sdczaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_ddczaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cdczaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zdczaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_scczaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dcczaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_ccczaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zcczaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_szczaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dzczaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_czczaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zzczaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-
-#define bli_sszzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dszzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cszzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zszzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_sdzzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_ddzzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cdzzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zdzzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_sczzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dczzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cczzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zczzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_szzzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dzzzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_czzzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zzzzaxpbys( a, x, b, y )  bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-// -- (axby) = (???c) ----------------------------------------------------------
-
-#define bli_ssscaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dsscaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_csscaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zsscaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_sdscaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_ddscaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_cdscaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zdscaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_scscaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dcscaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_ccscaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zcscaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_szscaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dzscaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_czscaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zzscaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-
-#define bli_ssdcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dsdcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_csdcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zsdcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_sddcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dddcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_cddcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zddcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_scdcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dcdcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_ccdcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zcdcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_szdcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dzdcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_czdcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zzdcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-
-#define bli_ssccaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dsccaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_csccaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zsccaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_sdccaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_ddccaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_cdccaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zdccaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_scccaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dcccaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_ccccaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zcccaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_szccaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dzccaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_czccaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zzccaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-
-#define bli_sszcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dszcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_cszcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zszcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_sdzcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_ddzcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_cdzcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zdzcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_sczcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dczcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_cczcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zczcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_szzcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dzzcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_czzcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zzzcaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-
-// -- (axby) = (???z) ----------------------------------------------------------
-
-#define bli_ssszaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dsszaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_csszaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zsszaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_sdszaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_ddszaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_cdszaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zdszaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_scszaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dcszaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_ccszaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zcszaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_szszaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dzszaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_czszaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zzszaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-
-#define bli_ssdzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dsdzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_csdzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zsdzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_sddzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dddzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_cddzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zddzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_scdzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dcdzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_ccdzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zcdzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_szdzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dzdzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_czdzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zzdzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-
-#define bli_ssczaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dsczaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_csczaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zsczaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_sdczaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_ddczaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_cdczaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zdczaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_scczaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dcczaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_ccczaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zcczaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_szczaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dzczaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_czczaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zzczaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-
-#define bli_sszzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dszzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_cszzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zszzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_sdzzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_ddzzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_cdzzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zdzzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_sczzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dczzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_cczzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zczzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_szzzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_dzzzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_czzzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-#define bli_zzzzaxpbys( a, x, b, y )  { (y) = (a) * (x) + (b) * (y); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_saxpbys( a, x, b, y )  bli_ssssaxpbys( a, x, b, y )
-#define bli_daxpbys( a, x, b, y )  bli_ddddaxpbys( a, x, b, y )
-#define bli_caxpbys( a, x, b, y )  bli_ccccaxpbys( a, x, b, y )
-#define bli_zaxpbys( a, x, b, y )  bli_zzzzaxpbys( a, x, b, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_axpbys_mxn.h b/frame/include/level0/bli_axpbys_mxn.h
deleted file mode 100644
index 494c5d445..000000000
--- a/frame/include/level0/bli_axpbys_mxn.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_AXPBYS_MXN_H
-#define BLIS_AXPBYS_MXN_H
-
-// axpbys_mxn
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of b.
-// - The fourth char encodes the type of y.
-// - We only implement cases where typeof(a) == type(x) && typeof(b) == typeof(y).
-
-#undef  BLIS_ENABLE_CR_CASES
-#define BLIS_ENABLE_CR_CASES 0
-
-// -- bli_????axpbys_mxn --
-
-#undef  GENTFUNC2
-#define GENTFUNC2( ctypex, ctypey, chx, chy, opname, kername ) \
-\
-BLIS_INLINE void PASTEMAC(chx,chx,chy,chy,opname) \
-     ( \
-       const dim_t   m, \
-       const dim_t   n, \
-       const ctypex* alpha, \
-       const ctypex* x, inc_t rs_x, inc_t cs_x, \
-       const ctypey* beta, \
-             ctypey* y, inc_t rs_y, inc_t cs_y  \
-     ) \
-{ \
-	/* If beta is zero, overwrite y with alpha*x (in case y has infs or NaNs). */ \
-	if ( PASTEMAC(chy,eq0)( *beta ) ) \
-	{ \
-		PASTEMAC(chx,chx,chy,scal2s_mxn)( BLIS_NO_CONJUGATE, m, n, alpha, x, rs_x, cs_x, y, rs_y, cs_y ); \
-		return; \
-	} \
-\
-	if      ( BLIS_ENABLE_CR_CASES && rs_x == 1 && rs_y == 1 ) \
-	{ \
-		for ( dim_t jj = 0; jj < n; ++jj ) \
-		for ( dim_t ii = 0; ii < m; ++ii ) \
-		PASTEMAC(chx,chx,chy,chy,kername) \
-		( \
-		  *alpha, *(x + ii + jj*cs_x), \
-		  *beta,  *(y + ii + jj*cs_y) \
-		); \
-	} \
-	else if ( BLIS_ENABLE_CR_CASES && cs_x == 1 && cs_y == 1 ) \
-	{ \
-		for ( dim_t ii = 0; ii < m; ++ii ) \
-		for ( dim_t jj = 0; jj < n; ++jj ) \
-		PASTEMAC(chx,chx,chy,chy,kername) \
-		( \
-		  *alpha, *(x + ii*rs_x + jj), \
-		  *beta,  *(y + ii*rs_y + jj) \
-		); \
-	} \
-	else \
-	{ \
-		for ( dim_t jj = 0; jj < n; ++jj ) \
-		for ( dim_t ii = 0; ii < m; ++ii ) \
-		PASTEMAC(chx,chx,chy,chy,kername) \
-		( \
-		  *alpha, *(x + ii*rs_x + jj*cs_x), \
-		  *beta,  *(y + ii*rs_y + jj*cs_y) \
-		); \
-	} \
-}
-
-INSERT_GENTFUNC2_BASIC ( axpbys_mxn, axpbys )
-INSERT_GENTFUNC2_MIX_DP( axpbys_mxn, axpbys )
-
-
-// -- bli_?axpbys_mxn --
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-BLIS_INLINE void PASTEMAC(ch,opname) \
-     ( \
-       const dim_t  m, \
-       const dim_t  n, \
-       const ctype* alpha, \
-       const ctype* x, inc_t rs_x, inc_t cs_x, \
-       const ctype* beta, \
-             ctype* y, inc_t rs_y, inc_t cs_y  \
-     ) \
-{ \
-    PASTEMAC(ch,ch,ch,ch,opname)( m, n, alpha, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
-}
-
-INSERT_GENTFUNC_BASIC( axpbys_mxn )
-
-
-#endif
diff --git a/frame/include/level0/bli_axpyjs.h b/frame/include/level0/bli_axpyjs.h
deleted file mode 100644
index 4b2c640a4..000000000
--- a/frame/include/level0/bli_axpyjs.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_AXPYJS_H
-#define BLIS_AXPYJS_H
-
-// axpyjs
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of y.
-
-
-// -- (axy) = (??s) ------------------------------------------------------------
-
-#define bli_sssaxpyjs( a, x, y )  bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dssaxpyjs( a, x, y )  bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_cssaxpyjs( a, x, y )  bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zssaxpyjs( a, x, y )  bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdsaxpyjs( a, x, y )  bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_ddsaxpyjs( a, x, y )  bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_cdsaxpyjs( a, x, y )  bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zdsaxpyjs( a, x, y )  bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_scsaxpyjs( a, x, y )  bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dcsaxpyjs( a, x, y )  bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_ccsaxpyjs( a, x, y )  bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zcsaxpyjs( a, x, y )  bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_szsaxpyjs( a, x, y )  bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dzsaxpyjs( a, x, y )  bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_czsaxpyjs( a, x, y )  bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zzsaxpyjs( a, x, y )  bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-
-// -- (axy) = (??d) ------------------------------------------------------------
-
-#define bli_ssdaxpyjs( a, x, y )  bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dsdaxpyjs( a, x, y )  bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_csdaxpyjs( a, x, y )  bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zsdaxpyjs( a, x, y )  bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-
-#define bli_sddaxpyjs( a, x, y )  bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dddaxpyjs( a, x, y )  bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_cddaxpyjs( a, x, y )  bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zddaxpyjs( a, x, y )  bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-
-#define bli_scdaxpyjs( a, x, y )  bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dcdaxpyjs( a, x, y )  bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_ccdaxpyjs( a, x, y )  bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zcdaxpyjs( a, x, y )  bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-
-#define bli_szdaxpyjs( a, x, y )  bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dzdaxpyjs( a, x, y )  bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_czdaxpyjs( a, x, y )  bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zzdaxpyjs( a, x, y )  bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscaxpyjs( a, x, y )  bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dscaxpyjs( a, x, y )  bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cscaxpyjs( a, x, y )  bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zscaxpyjs( a, x, y )  bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_sdcaxpyjs( a, x, y )  bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_ddcaxpyjs( a, x, y )  bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cdcaxpyjs( a, x, y )  bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zdcaxpyjs( a, x, y )  bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_sccaxpyjs( a, x, y )  bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dccaxpyjs( a, x, y )  bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cccaxpyjs( a, x, y )   bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zccaxpyjs( a, x, y )   bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_szcaxpyjs( a, x, y )  bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dzcaxpyjs( a, x, y )  bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_czcaxpyjs( a, x, y )   bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zzcaxpyjs( a, x, y )   bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszaxpyjs( a, x, y )  bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dszaxpyjs( a, x, y )  bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_cszaxpyjs( a, x, y )  bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zszaxpyjs( a, x, y )  bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-
-#define bli_sdzaxpyjs( a, x, y )  bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_ddzaxpyjs( a, x, y )  bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_cdzaxpyjs( a, x, y )  bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zdzaxpyjs( a, x, y )  bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-
-#define bli_sczaxpyjs( a, x, y )  bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dczaxpyjs( a, x, y )  bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_cczaxpyjs( a, x, y )   bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zczaxpyjs( a, x, y )   bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-
-#define bli_szzaxpyjs( a, x, y )  bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dzzaxpyjs( a, x, y )  bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_czzaxpyjs( a, x, y )   bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zzzaxpyjs( a, x, y )   bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscaxpyjs( a, x, y )  { (y) += (a) * (x); }
-#define bli_dscaxpyjs( a, x, y )  { (y) += (a) * (x); }
-#define bli_cscaxpyjs( a, x, y )  { (y) += (a) * (x); }
-#define bli_zscaxpyjs( a, x, y )  { (y) += (a) * (x); }
-
-#define bli_sdcaxpyjs( a, x, y )  { (y) += (a) * (x); }
-#define bli_ddcaxpyjs( a, x, y )  { (y) += (a) * (x); }
-#define bli_cdcaxpyjs( a, x, y )  { (y) += (a) * (x); }
-#define bli_zdcaxpyjs( a, x, y )  { (y) += (a) * (x); }
-
-#define bli_sccaxpyjs( a, x, y )  { (y) += (a) * conjf(x); }
-#define bli_dccaxpyjs( a, x, y )  { (y) += (a) * conjf(x); }
-#define bli_cccaxpyjs( a, x, y )  { (y) += (a) * conjf(x); }
-#define bli_zccaxpyjs( a, x, y )  { (y) += (a) * conjf(x); }
-
-#define bli_szcaxpyjs( a, x, y )  { (y) += (a) * conj(x); }
-#define bli_dzcaxpyjs( a, x, y )  { (y) += (a) * conj(x); }
-#define bli_czcaxpyjs( a, x, y )  { (y) += (a) * conj(x); }
-#define bli_zzcaxpyjs( a, x, y )  { (y) += (a) * conj(x); }
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszaxpyjs( a, x, y )  { (y) += (a) * (x); }
-#define bli_dszaxpyjs( a, x, y )  { (y) += (a) * (x); }
-#define bli_cszaxpyjs( a, x, y )  { (y) += (a) * (x); }
-#define bli_zszaxpyjs( a, x, y )  { (y) += (a) * (x); }
-
-#define bli_sdzaxpyjs( a, x, y )  { (y) += (a) * (x); }
-#define bli_ddzaxpyjs( a, x, y )  { (y) += (a) * (x); }
-#define bli_cdzaxpyjs( a, x, y )  { (y) += (a) * (x); }
-#define bli_zdzaxpyjs( a, x, y )  { (y) += (a) * (x); }
-
-#define bli_sczaxpyjs( a, x, y )  { (y) += (a) * conjf(x); }
-#define bli_dczaxpyjs( a, x, y )  { (y) += (a) * conjf(x); }
-#define bli_cczaxpyjs( a, x, y )  { (y) += (a) * conjf(x); }
-#define bli_zczaxpyjs( a, x, y )  { (y) += (a) * conjf(x); }
-
-#define bli_szzaxpyjs( a, x, y )  { (y) += (a) * conj(x); }
-#define bli_dzzaxpyjs( a, x, y )  { (y) += (a) * conj(x); }
-#define bli_czzaxpyjs( a, x, y )  { (y) += (a) * conj(x); }
-#define bli_zzzaxpyjs( a, x, y )  { (y) += (a) * conj(x); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_saxpyjs( a, x, y )  bli_sssaxpyjs( a, x, y )
-#define bli_daxpyjs( a, x, y )  bli_dddaxpyjs( a, x, y )
-#define bli_caxpyjs( a, x, y )  bli_cccaxpyjs( a, x, y )
-#define bli_zaxpyjs( a, x, y )  bli_zzzaxpyjs( a, x, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_axpys.h b/frame/include/level0/bli_axpys.h
deleted file mode 100644
index fb6871b4b..000000000
--- a/frame/include/level0/bli_axpys.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_AXPYS_H
-#define BLIS_AXPYS_H
-
-// axpys
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of y.
-
-
-// -- (axy) = (??s) ------------------------------------------------------------
-
-#define bli_sssaxpys( a, x, y )  bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dssaxpys( a, x, y )  bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_cssaxpys( a, x, y )  bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zssaxpys( a, x, y )  bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdsaxpys( a, x, y )  bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_ddsaxpys( a, x, y )  bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_cdsaxpys( a, x, y )  bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zdsaxpys( a, x, y )  bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_scsaxpys( a, x, y )  bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dcsaxpys( a, x, y )  bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_ccsaxpys( a, x, y )  bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zcsaxpys( a, x, y )  bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_szsaxpys( a, x, y )  bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dzsaxpys( a, x, y )  bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_czsaxpys( a, x, y )  bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zzsaxpys( a, x, y )  bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-
-// -- (axy) = (??d) ------------------------------------------------------------
-
-#define bli_ssdaxpys( a, x, y )  bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dsdaxpys( a, x, y )  bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_csdaxpys( a, x, y )  bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zsdaxpys( a, x, y )  bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-
-#define bli_sddaxpys( a, x, y )  bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dddaxpys( a, x, y )  bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_cddaxpys( a, x, y )  bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zddaxpys( a, x, y )  bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-
-#define bli_scdaxpys( a, x, y )  bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dcdaxpys( a, x, y )  bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_ccdaxpys( a, x, y )  bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zcdaxpys( a, x, y )  bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-
-#define bli_szdaxpys( a, x, y )  bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dzdaxpys( a, x, y )  bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_czdaxpys( a, x, y )  bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zzdaxpys( a, x, y )  bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscaxpys( a, x, y )  bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dscaxpys( a, x, y )  bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cscaxpys( a, x, y )  bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zscaxpys( a, x, y )  bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_sdcaxpys( a, x, y )  bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_ddcaxpys( a, x, y )  bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cdcaxpys( a, x, y )  bli_caxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zdcaxpys( a, x, y )  bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_sccaxpys( a, x, y )  bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dccaxpys( a, x, y )  bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cccaxpys( a, x, y )   bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zccaxpys( a, x, y )   bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_szcaxpys( a, x, y )  bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dzcaxpys( a, x, y )  bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_czcaxpys( a, x, y )   bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zzcaxpys( a, x, y )   bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszaxpys( a, x, y )  bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dszaxpys( a, x, y )  bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_cszaxpys( a, x, y )  bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zszaxpys( a, x, y )  bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-
-#define bli_sdzaxpys( a, x, y )  bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_ddzaxpys( a, x, y )  bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_cdzaxpys( a, x, y )  bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zdzaxpys( a, x, y )  bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-
-#define bli_sczaxpys( a, x, y )  bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dczaxpys( a, x, y )  bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_cczaxpys( a, x, y )   bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zczaxpys( a, x, y )   bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-
-#define bli_szzaxpys( a, x, y )  bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dzzaxpys( a, x, y )  bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_czzaxpys( a, x, y )   bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zzzaxpys( a, x, y )   bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_dscaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_cscaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_zscaxpys( a, x, y )  { (y) += (a) * (x); }
-
-#define bli_sdcaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_ddcaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_cdcaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_zdcaxpys( a, x, y )  { (y) += (a) * (x); }
-
-#define bli_sccaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_dccaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_cccaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_zccaxpys( a, x, y )  { (y) += (a) * (x); }
-
-#define bli_szcaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_dzcaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_czcaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_zzcaxpys( a, x, y )  { (y) += (a) * (x); }
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_dszaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_cszaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_zszaxpys( a, x, y )  { (y) += (a) * (x); }
-
-#define bli_sdzaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_ddzaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_cdzaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_zdzaxpys( a, x, y )  { (y) += (a) * (x); }
-
-#define bli_sczaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_dczaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_cczaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_zczaxpys( a, x, y )  { (y) += (a) * (x); }
-
-#define bli_szzaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_dzzaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_czzaxpys( a, x, y )  { (y) += (a) * (x); }
-#define bli_zzzaxpys( a, x, y )  { (y) += (a) * (x); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_saxpys( a, x, y )  bli_sssaxpys( a, x, y )
-#define bli_daxpys( a, x, y )  bli_dddaxpys( a, x, y )
-#define bli_caxpys( a, x, y )  bli_cccaxpys( a, x, y )
-#define bli_zaxpys( a, x, y )  bli_zzzaxpys( a, x, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_lt.h b/frame/include/level0/bli_complex_terms.h
similarity index 60%
rename from frame/include/level0/bli_lt.h
rename to frame/include/level0/bli_complex_terms.h
index b7c68ddaa..0cf05c30c 100644
--- a/frame/include/level0/bli_lt.h
+++ b/frame/include/level0/bli_complex_terms.h
@@ -32,39 +32,40 @@
 
 */
 
-#ifndef BLIS_LT_H
-#define BLIS_LT_H
-
-
-// lt (passed by value)
-
-#define bli_slt( a, b )  (          (a) <          (b) )
-#define bli_dlt( a, b )  (          (a) <          (b) )
-#define bli_clt( a, b )  ( bli_creal(a) < bli_creal(b) )
-#define bli_zlt( a, b )  ( bli_zreal(a) < bli_zreal(b) )
-#define bli_ilt( a, b )  (          (a) <          (b) )
-
-// lt0
-
-#define bli_slt0( a )  (          (a) < 0.0F )
-#define bli_dlt0( a )  (          (a) < 0.0  )
-#define bli_clt0( a )  ( bli_creal(a) < 0.0F )
-#define bli_zlt0( a )  ( bli_zreal(a) < 0.0  )
-
-// gt (passed by value)
-
-#define bli_sgt( a, b )  (          (a) >          (b) )
-#define bli_dgt( a, b )  (          (a) >          (b) )
-#define bli_cgt( a, b )  ( bli_creal(a) > bli_creal(b) )
-#define bli_zgt( a, b )  ( bli_zreal(a) > bli_zreal(b) )
-#define bli_igt( a, b )  (          (a) >          (b) )
-
-// gt0
-
-#define bli_sgt0( a )  (          (a) > 0.0F )
-#define bli_dgt0( a )  (          (a) > 0.0  )
-#define bli_cgt0( a )  ( bli_creal(a) > 0.0F )
-#define bli_zgt0( a )  ( bli_zreal(a) > 0.0  )
+#ifndef BLIS_COMPLEX_TERMS_H
+#define BLIS_COMPLEX_TERMS_H
+
+
+// -- Complex term-zeroing macros ----------------------------------------------
+
+// Note:
+// - pab is the precision of the A_[ri] * B_[ri] product. It is only used in
+//   certain cases where we need to decide which precision of zero to substitute
+//   into the calling expression.
+
+// ar * br term
+#define bli_rrtermrr( pab, ab )  ab
+#define bli_rctermrr( pab, ab )  ab
+#define bli_crtermrr( pab, ab )  ab
+#define bli_cctermrr( pab, ab )  ab
+
+// ai * bi term
+#define bli_rrtermii( pab, ab )  PASTEMAC(pab,zero)
+#define bli_rctermii( pab, ab )  PASTEMAC(pab,zero)
+#define bli_crtermii( pab, ab )  PASTEMAC(pab,zero)
+#define bli_cctermii( pab, ab )  ab
+
+// ai * br term
+#define bli_rrtermir( pab, ab )  PASTEMAC(pab,zero)
+#define bli_rctermir( pab, ab )  PASTEMAC(pab,zero)
+#define bli_crtermir( pab, ab )  ab
+#define bli_cctermir( pab, ab )  ab
+
+// ar * bi term
+#define bli_rrtermri( pab, ab )  PASTEMAC(pab,zero)
+#define bli_rctermri( pab, ab )  ab
+#define bli_crtermri( pab, ab )  PASTEMAC(pab,zero)
+#define bli_cctermri( pab, ab )  ab
 
 
diff --git a/frame/include/level0/bli_conjs.h b/frame/include/level0/bli_conjs.h
deleted file mode 100644
index 241148825..000000000
--- a/frame/include/level0/bli_conjs.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_CONJS_H
-#define BLIS_CONJS_H
-
-// conjs
-
-#define bli_sconjs( x )  bli_sconjris( bli_sreal(x), bli_simag(x) )
-#define bli_dconjs( x )  bli_dconjris( bli_dreal(x), bli_dimag(x) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_cconjs( x )  bli_cconjris( bli_creal(x), bli_cimag(x) )
-#define bli_zconjs( x )  bli_zconjris( bli_zreal(x), bli_zimag(x) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_cconjs( x )  { (x) = conjf(x); }
-#define bli_zconjs( x )  { (x) = conj (x); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#endif
-
diff --git a/frame/include/level0/bli_copycjs.h b/frame/include/level0/bli_copycjs.h
deleted file mode 100644
index f6056ad1e..000000000
--- a/frame/include/level0/bli_copycjs.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_COPYCJS_H
-#define BLIS_COPYCJS_H
-
-// copycjs
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-#define bli_sscopycjs( conjx, x, y )  bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dscopycjs( conjx, x, y )  bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_cscopycjs( conjx, x, y )  bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zscopycjs( conjx, x, y )  bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdcopycjs( conjx, x, y )  bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_ddcopycjs( conjx, x, y )  bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_cdcopycjs( conjx, x, y )  bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zdcopycjs( conjx, x, y )  bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_sccopycjs( conjx, x, y )  bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dccopycjs( conjx, x, y )  bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cccopycjs( conjx, x, y )  bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zccopycjs( conjx, x, y )  bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_szcopycjs( conjx, x, y )  bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dzcopycjs( conjx, x, y )  bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_czcopycjs( conjx, x, y )  bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zzcopycjs( conjx, x, y )  bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_sccopycjs( conjx, x, y )  { (y) = (x); }
-#define bli_dccopycjs( conjx, x, y )  { (y) = (x); }
-#define bli_cccopycjs( conjx, x, y )  { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
-#define bli_zccopycjs( conjx, x, y )  { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); }
-
-#define bli_szcopycjs( conjx, x, y )  { (y) = (x); }
-#define bli_dzcopycjs( conjx, x, y )  { (y) = (x); }
-#define bli_czcopycjs( conjx, x, y )  { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
-#define bli_zzcopycjs( conjx, x, y )  { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_iicopycjs( conjx, x, y )  { (y) = ( gint_t ) (x); }
-
-
-#define bli_scopycjs( conjx, x, y )  bli_sscopycjs( conjx, x, y )
-#define bli_dcopycjs( conjx, x, y )  bli_ddcopycjs( conjx, x, y )
-#define bli_ccopycjs( conjx, x, y )  bli_cccopycjs( conjx, x, y )
-#define bli_zcopycjs( conjx, x, y )  bli_zzcopycjs( conjx, x, y )
-#define bli_icopycjs( conjx, x, y )  bli_iicopycjs( conjx, x, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_copyjnzs.h b/frame/include/level0/bli_copyjnzs.h
deleted file mode 100644
index 048525a18..000000000
--- a/frame/include/level0/bli_copyjnzs.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_COPYJNZS_H
-#define BLIS_COPYJNZS_H
-
-// copyjnzs
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-#define bli_sscopyjnzs( x, y )  bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dscopyjnzs( x, y )  bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_cscopyjnzs( x, y )  bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zscopyjnzs( x, y )  bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdcopyjnzs( x, y )  bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_ddcopyjnzs( x, y )  bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_cdcopyjnzs( x, y )  bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zdcopyjnzs( x, y )  bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-
-// NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we
-// don't touch the imaginary part of y.
-#define bli_sccopyjnzs( x, y )  bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dccopyjnzs( x, y )  bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cccopyjnzs( x, y )  bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zccopyjnzs( x, y )  bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-
-// NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we
-// don't touch the imaginary part of y.
-#define bli_szcopyjnzs( x, y )  bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dzcopyjnzs( x, y )  bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_czcopyjnzs( x, y )  bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zzcopyjnzs( x, y )  bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-
-
-#define bli_iicopyjnzs( x, y )  { (y) = ( gint_t ) (x); }
-
-
-#define bli_scopyjnzs( x, y )  bli_sscopyjnzs( x, y )
-#define bli_dcopyjnzs( x, y )  bli_ddcopyjnzs( x, y )
-#define bli_ccopyjnzs( x, y )  bli_cccopyjnzs( x, y )
-#define bli_zcopyjnzs( x, y )  bli_zzcopyjnzs( x, y )
-#define bli_icopyjnzs( x, y )  bli_iicopyjnzs( x, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_copyjs.h b/frame/include/level0/bli_copyjs.h
deleted file mode 100644
index 7292e82fb..000000000
--- a/frame/include/level0/bli_copyjs.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_COPYJS_H
-#define BLIS_COPYJS_H
-
-// copyjs
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-#define bli_sscopyjs( x, y )  bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dscopyjs( x, y )  bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_cscopyjs( x, y )  bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zscopyjs( x, y )  bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdcopyjs( x, y )  bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_ddcopyjs( x, y )  bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_cdcopyjs( x, y )  bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zdcopyjs( x, y )  bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_sccopyjs( x, y )  bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dccopyjs( x, y )  bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cccopyjs( x, y )  bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zccopyjs( x, y )  bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_szcopyjs( x, y )  bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dzcopyjs( x, y )  bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_czcopyjs( x, y )  bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zzcopyjs( x, y )  bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_sccopyjs( x, y )  { (y) =      (x); }
-#define bli_dccopyjs( x, y )  { (y) =      (x); }
-#define bli_cccopyjs( x, y )  { (y) = conjf(x); }
-#define bli_zccopyjs( x, y )  { (y) = conj (x); }
-
-#define bli_szcopyjs( x, y )  { (y) =      (x); }
-#define bli_dzcopyjs( x, y )  { (y) =      (x); }
-#define bli_czcopyjs( x, y )  { (y) = conjf(x); }
-#define bli_zzcopyjs( x, y )  { (y) = conj (x); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_iicopyjs( x, y )  { (y) = ( gint_t ) (x); }
-
-
-#define bli_scopyjs( x, y )  bli_sscopyjs( x, y )
-#define bli_dcopyjs( x, y )  bli_ddcopyjs( x, y )
-#define bli_ccopyjs( x, y )  bli_cccopyjs( x, y )
-#define bli_zcopyjs( x, y )  bli_zzcopyjs( x, y )
-#define bli_icopyjs( x, y )  bli_iicopyjs( x, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_copynzs.h b/frame/include/level0/bli_copynzs.h
deleted file mode 100644
index aa5d78651..000000000
--- a/frame/include/level0/bli_copynzs.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_COPYNZS_H
-#define BLIS_COPYNZS_H
-
-// copynzs
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-#define bli_sscopynzs( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dscopynzs( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_cscopynzs( x, y )  bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zscopynzs( x, y )  bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdcopynzs( x, y )  bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_ddcopynzs( x, y )  bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_cdcopynzs( x, y )  bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zdcopynzs( x, y )  bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-
-// NOTE: Use of scopyris() is so we don't touch the imaginary part of y.
-#define bli_sccopynzs( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dccopynzs( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cccopynzs( x, y )  bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zccopynzs( x, y )  bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-
-// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y.
-#define bli_szcopynzs( x, y )  bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dzcopynzs( x, y )  bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_czcopynzs( x, y )  bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zzcopynzs( x, y )  bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-
-
-#define bli_iicopynzs( x, y )  { (y) = ( gint_t ) (x); }
-
-
-#define bli_scopynzs( x, y )  bli_sscopynzs( x, y )
-#define bli_dcopynzs( x, y )  bli_ddcopynzs( x, y )
-#define bli_ccopynzs( x, y )  bli_cccopynzs( x, y )
-#define bli_zcopynzs( x, y )  bli_zzcopynzs( x, y )
-#define bli_icopynzs( x, y )  bli_iicopynzs( x, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_copys.h b/frame/include/level0/bli_copys.h
deleted file mode 100644
index fee51f4dc..000000000
--- a/frame/include/level0/bli_copys.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_COPYS_H
-#define BLIS_COPYS_H
-
-// copys
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-#define bli_sscopys( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dscopys( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_cscopys( x, y )  bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zscopys( x, y )  bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdcopys( x, y )  bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_ddcopys( x, y )  bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_cdcopys( x, y )  bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zdcopys( x, y )  bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-
-// NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero.
-#define bli_sccopys( x, y )  bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dccopys( x, y )  bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cccopys( x, y )  bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zccopys( x, y )  bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-
-// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero.
-#define bli_szcopys( x, y )  bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dzcopys( x, y )  bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_czcopys( x, y )  bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zzcopys( x, y )  bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-
-
-#define bli_iicopys( x, y )  { (y) = ( gint_t ) (x); }
-
-
-#define bli_scopys( x, y )  bli_sscopys( x, y )
-#define bli_dcopys( x, y )  bli_ddcopys( x, y )
-#define bli_ccopys( x, y )  bli_cccopys( x, y )
-#define bli_zcopys( x, y )  bli_zzcopys( x, y )
-#define bli_icopys( x, y )  bli_iicopys( x, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_copys_mxn.h b/frame/include/level0/bli_copys_mxn.h
deleted file mode 100644
index 4b729376a..000000000
--- a/frame/include/level0/bli_copys_mxn.h
+++ /dev/null
@@ -1,676 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_COPYS_MXN_H
-#define BLIS_COPYS_MXN_H
-
-// copys_mxn
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-#undef  BLIS_ENABLE_CR_CASES
-#define BLIS_ENABLE_CR_CASES 0
-
-// -- bli_??copys_mxn --
-
-#undef  GENTFUNC2
-#define GENTFUNC2( ctypex, ctypey, chx, chy, opname, kername ) \
-\
-BLIS_INLINE void PASTEMAC(chx,chy,opname) \
-     ( \
-       const dim_t   m, \
-       const dim_t   n, \
-       const ctypex* x, inc_t rs_x, inc_t cs_x, \
-             ctypey* y, inc_t rs_y, inc_t cs_y  \
-     ) \
-{ \
-	if      ( BLIS_ENABLE_CR_CASES && rs_x == 1 && rs_y == 1 ) \
-	{ \
-		for ( dim_t jj = 0; jj < n; ++jj ) \
-		for ( dim_t ii = 0; ii < m; ++ii ) \
-		PASTEMAC(chx,chy,kername)( *(x + ii + jj*cs_x), \
-		                            *(y + ii + jj*cs_y) ); \
-	} \
-	else if ( BLIS_ENABLE_CR_CASES && cs_x == 1 && cs_y == 1 ) \
-	{ \
-		for ( dim_t ii = 0; ii < m; ++ii ) \
-		for ( dim_t jj = 0; jj < n; ++jj ) \
-		PASTEMAC(chx,chy,kername)( *(x + ii*rs_x + jj), \
-		                            *(y + ii*rs_y + jj) ); \
-	} \
-	else \
-	{ \
-		for ( dim_t jj = 0; jj < n; ++jj ) \
-		for ( dim_t ii = 0; ii < m; ++ii ) \
-		PASTEMAC(chx,chy,kername)( *(x + ii*rs_x + jj*cs_x), \
-		                            *(y + ii*rs_y + jj*cs_y) ); \
-	} \
-}
-
-INSERT_GENTFUNC2_BASIC ( copys_mxn, copys )
-INSERT_GENTFUNC2_MIX_DP( copys_mxn, copys )
-
-
-// -- bli_?copys_mxn --
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-BLIS_INLINE void PASTEMAC(ch,opname) \
-     ( \
-       const dim_t  m, \
-       const dim_t  n, \
-       const ctype* x, inc_t rs_x, inc_t cs_x, \
-             ctype* y, inc_t rs_y, inc_t cs_y  \
-     ) \
-{ \
-	PASTEMAC(ch,ch,opname)( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
-}
-
-INSERT_GENTFUNC_BASIC( copys_mxn )
-
-
-
-
-#if 0
-
-// xy = ?s
-
-BLIS_INLINE void bli_sscopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-             float*    restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_sscopys( *(x + ii + jj*cs_x),
-		             *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_sscopys( *(x + ii*rs_x + jj),
-		             *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_sscopys( *(x + ii*rs_x + jj*cs_x),
-		             *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_dscopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-             float*    restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dscopys( *(x + ii + jj*cs_x),
-		             *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_dscopys( *(x + ii*rs_x + jj),
-		             *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dscopys( *(x + ii*rs_x + jj*cs_x),
-		             *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_cscopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-             float*    restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_cscopys( *(x + ii + jj*cs_x),
-		             *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_cscopys( *(x + ii*rs_x + jj),
-		             *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_cscopys( *(x + ii*rs_x + jj*cs_x),
-		             *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_zscopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-             float*    restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zscopys( *(x + ii + jj*cs_x),
-		             *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_zscopys( *(x + ii*rs_x + jj),
-		             *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zscopys( *(x + ii*rs_x + jj*cs_x),
-		             *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-
-// xy = ?d
-
-BLIS_INLINE void bli_sdcopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-             double*   restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_sdcopys( *(x + ii + jj*cs_x),
-		             *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_sdcopys( *(x + ii*rs_x + jj),
-		             *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_sdcopys( *(x + ii*rs_x + jj*cs_x),
-		             *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_ddcopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-             double*   restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_ddcopys( *(x + ii + jj*cs_x),
-		             *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_ddcopys( *(x + ii*rs_x + jj),
-		             *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_ddcopys( *(x + ii*rs_x + jj*cs_x),
-		             *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_cdcopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-             double*   restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_cdcopys( *(x + ii + jj*cs_x),
-		             *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_cdcopys( *(x + ii*rs_x + jj),
-		             *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_cdcopys( *(x + ii*rs_x + jj*cs_x),
-		             *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_zdcopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-             double*   restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zdcopys( *(x + ii + jj*cs_x),
-		             *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_zdcopys( *(x + ii*rs_x + jj),
-		             *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zdcopys( *(x + ii*rs_x + jj*cs_x),
-		             *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-
-// xy = ?c
-
-BLIS_INLINE void bli_sccopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_sccopys( *(x + ii + jj*cs_x),
-		             *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_sccopys( *(x + ii*rs_x + jj),
-		             *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_sccopys( *(x + ii*rs_x + jj*cs_x),
-		             *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_dccopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dccopys( *(x + ii + jj*cs_x),
-		             *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_dccopys( *(x + ii*rs_x + jj),
-		             *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dccopys( *(x + ii*rs_x + jj*cs_x),
-		             *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_cccopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_cccopys( *(x + ii + jj*cs_x),
-		             *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_cccopys( *(x + ii*rs_x + jj),
-		             *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_cccopys( *(x + ii*rs_x + jj*cs_x),
-		             *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_zccopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zccopys( *(x + ii + jj*cs_x),
-		             *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_zccopys( *(x + ii*rs_x + jj),
-		             *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zccopys( *(x + ii*rs_x + jj*cs_x),
-		             *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-
-// xy = ?c
-
-BLIS_INLINE void bli_szcopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_szcopys( *(x + ii + jj*cs_x),
-		             *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_szcopys( *(x + ii*rs_x + jj),
-		             *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_szcopys( *(x + ii*rs_x + jj*cs_x),
-		             *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_dzcopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dzcopys( *(x + ii + jj*cs_x),
-		             *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_dzcopys( *(x + ii*rs_x + jj),
-		             *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dzcopys( *(x + ii*rs_x + jj*cs_x),
-		             *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_czcopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_czcopys( *(x + ii + jj*cs_x),
-		             *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_czcopys( *(x + ii*rs_x + jj),
-		             *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_czcopys( *(x + ii*rs_x + jj*cs_x),
-		             *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_zzcopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zzcopys( *(x + ii + jj*cs_x),
-		             *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_zzcopys( *(x + ii*rs_x + jj),
-		             *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zzcopys( *(x + ii*rs_x + jj*cs_x),
-		             *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-
-BLIS_INLINE void bli_scopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-             float*    restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-}
-BLIS_INLINE void bli_dcopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-             double*   restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-}
-BLIS_INLINE void bli_ccopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-}
-BLIS_INLINE void bli_zcopys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-}
-#endif
-
-#endif
diff --git a/frame/include/level0/bli_inverts.h b/frame/include/level0/bli_declinits.h
similarity index 64%
rename from frame/include/level0/bli_inverts.h
rename to frame/include/level0/bli_declinits.h
index 092e5ab4e..0461cd1cd 100644
--- a/frame/include/level0/bli_inverts.h
+++ b/frame/include/level0/bli_declinits.h
@@ -32,28 +32,26 @@
 
 */
 
-#ifndef BLIS_INVERTS_H
-#define BLIS_INVERTS_H
+#ifndef BLIS_DECLINITS_H
+#define BLIS_DECLINITS_H
 
-// inverts
+// declinits
 
 // Notes:
-// - The first char encodes the type of x.
-
-#define bli_sinverts( x )  bli_sinvertris( bli_sreal(x), bli_simag(x) )
-#define bli_dinverts( x )  bli_dinvertris( bli_dreal(x), bli_dimag(x) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_cinverts( x )  bli_cinvertris( bli_creal(x), bli_cimag(x) )
-#define bli_zinverts( x )  bli_zinvertris( bli_zreal(x), bli_zimag(x) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_cinverts( x )  { (x) = 1.0F / (x); }
-#define bli_zinverts( x )  { (x) = 1.0  / (x); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
+// - The first char encodes the domain of output yr + yi.
+// - The pxy precision character encodes the precision of x AND y (they
+//   are assumed to be the same).
+// - These macros are used to declare AND initialize variables corresponding
+//   to the real and imaginary parts of (presumably) temporary variables.
+//   If the domain is real, only the real part is declared and initialized.
+
+#define bli_rdeclinits( pxy, xr, xi, yr, yi ) PASTEMAC(pxy,ctype) yr = xr; (void)yr;
+#define bli_cdeclinits( pxy, xr, xi, yr, yi ) PASTEMAC(pxy,ctype) yr = xr; (void)yr; \
+                                              PASTEMAC(pxy,ctype) yi = xi; (void)yi;
+
+// An extra definition for situations where we only need a real value declared
+// and initialized (e.g. when explicitly implementing in the complex domain).
+#define bli_rodeclinits( pxy, xr, yr ) bli_rdeclinits( pxy, xr, /*xi*/, yr, /*yi*/ )
 
 
 #endif
diff --git a/frame/include/level0/bli_dotjs.h b/frame/include/level0/bli_dotjs.h
deleted file mode 100644
index e03f0e2a7..000000000
--- a/frame/include/level0/bli_dotjs.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_DOTJS_H
-#define BLIS_DOTJS_H
-
-// dotjs
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-// - The third char encodes the type of rho.
-// - x is used in conjugated form.
-
-
-#define bli_sssdotjs( x, y, a )  bli_sssaxpyjs( y, x, a )
-#define bli_dssdotjs( x, y, a )  bli_sdsaxpyjs( y, x, a )
-#define bli_cssdotjs( x, y, a )  bli_scsaxpyjs( y, x, a )
-#define bli_zssdotjs( x, y, a )  bli_szsaxpyjs( y, x, a )
-
-#define bli_sdsdotjs( x, y, a )  bli_dssaxpyjs( y, x, a )
-#define bli_ddsdotjs( x, y, a )  bli_ddsaxpyjs( y, x, a )
-#define bli_cdsdotjs( x, y, a )  bli_dcsaxpyjs( y, x, a )
-#define bli_zdsdotjs( x, y, a )  bli_dzsaxpyjs( y, x, a )
-
-#define bli_scsdotjs( x, y, a )  bli_cssaxpyjs( y, x, a )
-#define bli_dcsdotjs( x, y, a )  bli_cdsaxpyjs( y, x, a )
-#define bli_ccsdotjs( x, y, a )  bli_ccsaxpyjs( y, x, a )
-#define bli_zcsdotjs( x, y, a )  bli_czsaxpyjs( y, x, a )
-
-#define bli_szsdotjs( x, y, a )  bli_zssaxpyjs( y, x, a )
-#define bli_dzsdotjs( x, y, a )  bli_zdsaxpyjs( y, x, a )
-#define bli_czsdotjs( x, y, a )  bli_zcsaxpyjs( y, x, a )
-#define bli_zzsdotjs( x, y, a )  bli_zzsaxpyjs( y, x, a )
-
-
-#define bli_ssddotjs( x, y, a )  bli_ssdaxpyjs( y, x, a )
-#define bli_dsddotjs( x, y, a )  bli_sddaxpyjs( y, x, a )
-#define bli_csddotjs( x, y, a )  bli_scdaxpyjs( y, x, a )
-#define bli_zsddotjs( x, y, a )  bli_szdaxpyjs( y, x, a )
-
-#define bli_sdddotjs( x, y, a )  bli_dsdaxpyjs( y, x, a )
-#define bli_ddddotjs( x, y, a )  bli_dddaxpyjs( y, x, a )
-#define bli_cdddotjs( x, y, a )  bli_dcdaxpyjs( y, x, a )
-#define bli_zdddotjs( x, y, a )  bli_dzdaxpyjs( y, x, a )
-
-#define bli_scddotjs( x, y, a )  bli_csdaxpyjs( y, x, a )
-#define bli_dcddotjs( x, y, a )  bli_cddaxpyjs( y, x, a )
-#define bli_ccddotjs( x, y, a )  bli_ccdaxpyjs( y, x, a )
-#define bli_zcddotjs( x, y, a )  bli_czdaxpyjs( y, x, a )
-
-#define bli_szddotjs( x, y, a )  bli_zsdaxpyjs( y, x, a )
-#define bli_dzddotjs( x, y, a )  bli_zddaxpyjs( y, x, a )
-#define bli_czddotjs( x, y, a )  bli_zcdaxpyjs( y, x, a )
-#define bli_zzddotjs( x, y, a )  bli_zzdaxpyjs( y, x, a )
-
-
-#define bli_sscdotjs( x, y, a )  bli_sscaxpyjs( y, x, a )
-#define bli_dscdotjs( x, y, a )  bli_sdcaxpyjs( y, x, a )
-#define bli_cscdotjs( x, y, a )  bli_sccaxpyjs( y, x, a )
-#define bli_zscdotjs( x, y, a )  bli_szcaxpyjs( y, x, a )
-
-#define bli_sdcdotjs( x, y, a )  bli_dscaxpyjs( y, x, a )
-#define bli_ddcdotjs( x, y, a )  bli_ddcaxpyjs( y, x, a )
-#define bli_cdcdotjs( x, y, a )  bli_dccaxpyjs( y, x, a )
-#define bli_zdcdotjs( x, y, a )  bli_dzcaxpyjs( y, x, a )
-
-#define bli_sccdotjs( x, y, a )  bli_cscaxpyjs( y, x, a )
-#define bli_dccdotjs( x, y, a )  bli_cdcaxpyjs( y, x, a )
-#define bli_cccdotjs( x, y, a )  bli_cccaxpyjs( y, x, a )
-#define bli_zccdotjs( x, y, a )  bli_czcaxpyjs( y, x, a )
-
-#define bli_szcdotjs( x, y, a )  bli_zscaxpyjs( y, x, a )
-#define bli_dzcdotjs( x, y, a )  bli_zdcaxpyjs( y, x, a )
-#define bli_czcdotjs( x, y, a )  bli_zccaxpyjs( y, x, a )
-#define bli_zzcdotjs( x, y, a )  bli_zzcaxpyjs( y, x, a )
-
-
-#define bli_sszdotjs( x, y, a )  bli_sszaxpyjs( y, x, a )
-#define bli_dszdotjs( x, y, a )  bli_sdzaxpyjs( y, x, a )
-#define bli_cszdotjs( x, y, a )  bli_sczaxpyjs( y, x, a )
-#define bli_zszdotjs( x, y, a )  bli_szzaxpyjs( y, x, a )
-
-#define bli_sdzdotjs( x, y, a )  bli_dszaxpyjs( y, x, a )
-#define bli_ddzdotjs( x, y, a )  bli_ddzaxpyjs( y, x, a )
-#define bli_cdzdotjs( x, y, a )  bli_dczaxpyjs( y, x, a )
-#define bli_zdzdotjs( x, y, a )  bli_dzzaxpyjs( y, x, a )
-
-#define bli_sczdotjs( x, y, a )  bli_cszaxpyjs( y, x, a )
-#define bli_dczdotjs( x, y, a )  bli_cdzaxpyjs( y, x, a )
-#define bli_cczdotjs( x, y, a )  bli_cczaxpyjs( y, x, a )
-#define bli_zczdotjs( x, y, a )  bli_czzaxpyjs( y, x, a )
-
-#define bli_szzdotjs( x, y, a )  bli_zszaxpyjs( y, x, a )
-#define bli_dzzdotjs( x, y, a )  bli_zdzaxpyjs( y, x, a )
-#define bli_czzdotjs( x, y, a )  bli_zczaxpyjs( y, x, a )
-#define bli_zzzdotjs( x, y, a )  bli_zzzaxpyjs( y, x, a )
-
-
-
-
-
-#define bli_sdotjs( x, y, a )  bli_sssdotjs( x, y, a )
-#define bli_ddotjs( x, y, a )  bli_ddddotjs( x, y, a )
-#define bli_cdotjs( x, y, a )  bli_cccdotjs( x, y, a )
-#define bli_zdotjs( x, y, a )  bli_zzzdotjs( x, y, a )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_dots.h b/frame/include/level0/bli_dots.h
deleted file mode 100644
index f565ba529..000000000
--- a/frame/include/level0/bli_dots.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_DOTS_H
-#define BLIS_DOTS_H
-
-// dots
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-// - The third char encodes the type of rho.
-
-
-#define bli_sssdots( x, y, a )  bli_sssaxpys( x, y, a )
-#define bli_dssdots( x, y, a )  bli_dssaxpys( x, y, a )
-#define bli_cssdots( x, y, a )  bli_cssaxpys( x, y, a )
-#define bli_zssdots( x, y, a )  bli_zssaxpys( x, y, a )
-
-#define bli_sdsdots( x, y, a )  bli_sdsaxpys( x, y, a )
-#define bli_ddsdots( x, y, a )  bli_ddsaxpys( x, y, a )
-#define bli_cdsdots( x, y, a )  bli_cdsaxpys( x, y, a )
-#define bli_zdsdots( x, y, a )  bli_zdsaxpys( x, y, a )
-
-#define bli_scsdots( x, y, a )  bli_scsaxpys( x, y, a )
-#define bli_dcsdots( x, y, a )  bli_dcsaxpys( x, y, a )
-#define bli_ccsdots( x, y, a )  bli_ccsaxpys( x, y, a )
-#define bli_zcsdots( x, y, a )  bli_zcsaxpys( x, y, a )
-
-#define bli_szsdots( x, y, a )  bli_szsaxpys( x, y, a )
-#define bli_dzsdots( x, y, a )  bli_dzsaxpys( x, y, a )
-#define bli_czsdots( x, y, a )  bli_czsaxpys( x, y, a )
-#define bli_zzsdots( x, y, a )  bli_zzsaxpys( x, y, a )
-
-
-
-#define bli_ssddots( x, y, a )  bli_ssdaxpys( x, y, a )
-#define bli_dsddots( x, y, a )  bli_dsdaxpys( x, y, a )
-#define bli_csddots( x, y, a )  bli_csdaxpys( x, y, a )
-#define bli_zsddots( x, y, a )  bli_zsdaxpys( x, y, a )
-
-#define bli_sdddots( x, y, a )  bli_sddaxpys( x, y, a )
-#define bli_ddddots( x, y, a )  bli_dddaxpys( x, y, a )
-#define bli_cdddots( x, y, a )  bli_cddaxpys( x, y, a )
-#define bli_zdddots( x, y, a )  bli_zddaxpys( x, y, a )
-
-#define bli_scddots( x, y, a )  bli_scdaxpys( x, y, a )
-#define bli_dcddots( x, y, a )  bli_dcdaxpys( x, y, a )
-#define bli_ccddots( x, y, a )  bli_ccdaxpys( x, y, a )
-#define bli_zcddots( x, y, a )  bli_zcdaxpys( x, y, a )
-
-#define bli_szddots( x, y, a )  bli_szdaxpys( x, y, a )
-#define bli_dzddots( x, y, a )  bli_dzdaxpys( x, y, a )
-#define bli_czddots( x, y, a )  bli_czdaxpys( x, y, a )
-#define bli_zzddots( x, y, a )  bli_zzdaxpys( x, y, a )
-
-
-
-#define bli_sscdots( x, y, a )  bli_sscaxpys( x, y, a )
-#define bli_dscdots( x, y, a )  bli_dscaxpys( x, y, a )
-#define bli_cscdots( x, y, a )  bli_cscaxpys( x, y, a )
-#define bli_zscdots( x, y, a )  bli_zscaxpys( x, y, a )
-
-#define bli_sdcdots( x, y, a )  bli_sdcaxpys( x, y, a )
-#define bli_ddcdots( x, y, a )  bli_ddcaxpys( x, y, a )
-#define bli_cdcdots( x, y, a )  bli_cdcaxpys( x, y, a )
-#define bli_zdcdots( x, y, a )  bli_zdcaxpys( x, y, a )
-
-#define bli_sccdots( x, y, a )  bli_sccaxpys( x, y, a )
-#define bli_dccdots( x, y, a )  bli_dccaxpys( x, y, a )
-#define bli_cccdots( x, y, a )  bli_cccaxpys( x, y, a )
-#define bli_zccdots( x, y, a )  bli_zccaxpys( x, y, a )
-
-#define bli_szcdots( x, y, a )  bli_szcaxpys( x, y, a )
-#define bli_dzcdots( x, y, a )  bli_dzcaxpys( x, y, a )
-#define bli_czcdots( x, y, a )  bli_czcaxpys( x, y, a )
-#define bli_zzcdots( x, y, a )  bli_zzcaxpys( x, y, a )
-
-
-
-#define bli_sszdots( x, y, a )  bli_sszaxpys( x, y, a )
-#define bli_dszdots( x, y, a )  bli_dszaxpys( x, y, a )
-#define bli_cszdots( x, y, a )  bli_cszaxpys( x, y, a )
-#define bli_zszdots( x, y, a )  bli_zszaxpys( x, y, a )
-
-#define bli_sdzdots( x, y, a )  bli_sdzaxpys( x, y, a )
-#define bli_ddzdots( x, y, a )  bli_ddzaxpys( x, y, a )
-#define bli_cdzdots( x, y, a )  bli_cdzaxpys( x, y, a )
-#define bli_zdzdots( x, y, a )  bli_zdzaxpys( x, y, a )
-
-#define bli_sczdots( x, y, a )  bli_sczaxpys( x, y, a )
-#define bli_dczdots( x, y, a )  bli_dczaxpys( x, y, a )
-#define bli_cczdots( x, y, a )  bli_cczaxpys( x, y, a )
-#define bli_zczdots( x, y, a )  bli_zczaxpys( x, y, a )
-
-#define bli_szzdots( x, y, a )  bli_szzaxpys( x, y, a )
-#define bli_dzzdots( x, y, a )  bli_dzzaxpys( x, y, a )
-#define bli_czzdots( x, y, a )  bli_czzaxpys( x, y, a )
-#define bli_zzzdots( x, y, a )  bli_zzzaxpys( x, y, a )
-
-
-
-#define bli_sdots( x, y, a )  bli_sssdots( x, y, a )
-#define bli_ddots( x, y, a )  bli_ddddots( x, y, a )
-#define bli_cdots( x, y, a )  bli_cccdots( x, y, a )
-#define bli_zdots( x, y, a )  bli_zzzdots( x, y, a )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_eq.h b/frame/include/level0/bli_eq.h
deleted file mode 100644
index 691542b08..000000000
--- a/frame/include/level0/bli_eq.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_EQ_H
-#define BLIS_EQ_H
-
-
-// eq (passed by value)
-
-#define bli_seq( a, b )  ( (a) == (b) )
-#define bli_deq( a, b )  ( (a) == (b) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_ceq( a, b )  ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) )
-#define bli_zeq( a, b )  ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_ceq( a, b )  ( (a) == (b) )
-#define bli_zeq( a, b )  ( (a) == (b) )
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-#define bli_ieq( a, b )  ( (a) == (b) )
-
-
-
-// eqtori (passed by value)
-
-#define bli_seqtori( a, br, bi )  ( (a) == (br) )
-#define bli_deqtori( a, br, bi )  ( (a) == (br) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_ceqtori( a, br, bi )  ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) )
-#define bli_zeqtori( a, br, bi )  ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_ceqtori( a, br, bi )  ( (a) == (br) + (bi) * (I) )
-#define bli_zeqtori( a, br, bi )  ( (a) == (br) + (bi) * (I) )
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-
-// eqa (passed by address)
-
-#define bli_seqa( a, b )  bli_seq( *(( float*    )(a)), *(( float*    )(b)) )
-#define bli_deqa( a, b )  bli_deq( *(( double*   )(a)), *(( double*   )(b)) )
-#define bli_ceqa( a, b )  bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) )
-#define bli_zeqa( a, b )  bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) )
-#define bli_ieqa( a, b )  bli_ieq( *(( gint_t*   )(a)), *(( gint_t*   )(b)) )
-
-
-
-// eq1
-
-#define bli_seq1( a )  bli_seqtori( (a), 1.0F, 0.0F )
-#define bli_deq1( a )  bli_deqtori( (a), 1.0,  0.0  )
-#define bli_ceq1( a )  bli_ceqtori( (a), 1.0F, 0.0F )
-#define bli_zeq1( a )  bli_zeqtori( (a), 1.0,  0.0  )
-#define bli_ieq1( a )  bli_ieq    ( (a), 1          )
-
-
-
-// eq0
-
-#define bli_seq0( a )  bli_seqtori( (a), 0.0F, 0.0F )
-#define bli_deq0( a )  bli_deqtori( (a), 0.0,  0.0  )
-#define bli_ceq0( a )  bli_ceqtori( (a), 0.0F, 0.0F )
-#define bli_zeq0( a )  bli_zeqtori( (a), 0.0,  0.0  )
-#define bli_ieq0( a )  bli_ieq    ( (a), 0          )
-
-
-
-// eqm1
-
-#define bli_seqm1( a )  bli_seqtori( (a), -1.0F, 0.0F )
-#define bli_deqm1( a )  bli_deqtori( (a), -1.0,  0.0  )
-#define bli_ceqm1( a )  bli_ceqtori( (a), -1.0F, 0.0F )
-#define bli_zeqm1( a )  bli_zeqtori( (a), -1.0,  0.0  )
-#define bli_ieqm1( a )  bli_ieq    ( (a), -1          )
-
-
-
-#endif
diff --git a/frame/include/level0/bli_gets.h b/frame/include/level0/bli_gets.h
deleted file mode 100644
index 985ecf4cb..000000000
--- a/frame/include/level0/bli_gets.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_GETS_H
-#define BLIS_GETS_H
-
-// gets
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-
-#define bli_ssgets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
-#define bli_dsgets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
-#define bli_csgets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
-#define bli_zsgets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
-#define bli_isgets( x, yr, yi )  { (yr) = ( float )(x); (yi) = 0.0F; }
-
-#define bli_sdgets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
-#define bli_ddgets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
-#define bli_cdgets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
-#define bli_zdgets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
-#define bli_idgets( x, yr, yi )  { (yr) = ( double )(x); (yi) = 0.0; }
-
-#define bli_scgets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
-#define bli_dcgets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
-#define bli_ccgets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
-#define bli_zcgets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
-#define bli_icgets( x, yr, yi )  { (yr) = ( float )(x); (yi) = 0.0F; }
-
-#define bli_szgets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
-#define bli_dzgets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
-#define bli_czgets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
-#define bli_zzgets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
-#define bli_izgets( x, yr, yi )  { (yr) = ( double )(x); (yi) = 0.0; }
-
-#define bli_sigets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = 0; }
-#define bli_digets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = 0; }
-#define bli_cigets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = 0; }
-#define bli_zigets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = 0; }
-#define bli_iigets( x, yr, yi )  { (yr) =          (x); (yi) = 0; }
-
-
-#define bli_sgets( x, yr, yi )  bli_ssgets( x, yr, yi )
-#define bli_dgets( x, yr, yi )  bli_ddgets( x, yr, yi )
-#define bli_cgets( x, yr, yi )  bli_csgets( x, yr, yi )
-#define bli_zgets( x, yr, yi )  bli_zdgets( x, yr, yi )
-#define bli_igets( x, yr, yi )  bli_idgets( x, yr, yi )
-
-
-#endif
diff --git a/frame/include/level0/bli_invscaljs.h b/frame/include/level0/bli_invscaljs.h
deleted file mode 100644
index 2c26bdc7f..000000000
--- a/frame/include/level0/bli_invscaljs.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_INVSCALJS_H
-#define BLIS_INVSCALJS_H
-
-// invscaljs
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of y.
-
-#define bli_ssinvscaljs( a, y )  bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
-#define bli_dsinvscaljs( a, y )  bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
-#define bli_csinvscaljs( a, y )  bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
-#define bli_zsinvscaljs( a, y )  bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdinvscaljs( a, y )  bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_ddinvscaljs( a, y )  bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_cdinvscaljs( a, y )  bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_zdinvscaljs( a, y )  bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scinvscaljs( a, y )  bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
-#define bli_dcinvscaljs( a, y )  bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
-#define bli_ccinvscaljs( a, y )   bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
-#define bli_zcinvscaljs( a, y )   bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
-
-#define bli_szinvscaljs( a, y )  bli_dzinvscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_dzinvscaljs( a, y )  bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_czinvscaljs( a, y )   bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_zzinvscaljs( a, y )   bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scinvscaljs( a, y )  { (y) /=      (a); }
-#define bli_dcinvscaljs( a, y )  { (y) /=      (a); }
-#define bli_ccinvscaljs( a, y )  { (y) /= conjf(a); }
-#define bli_zcinvscaljs( a, y )  { (y) /= conj (a); }
-
-#define bli_szinvscaljs( a, y )  { (y) /=      (a); }
-#define bli_dzinvscaljs( a, y )  { (y) /=      (a); }
-#define bli_czinvscaljs( a, y )  { (y) /= conjf(a); }
-#define bli_zzinvscaljs( a, y )  { (y) /= conj (a); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_sinvscaljs( a, y )  bli_ssinvscaljs( a, y )
-#define bli_dinvscaljs( a, y )  bli_ddinvscaljs( a, y )
-#define bli_cinvscaljs( a, y )  bli_ccinvscaljs( a, y )
-#define bli_zinvscaljs( a, y )  bli_zzinvscaljs( a, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_invscals.h b/frame/include/level0/bli_invscals.h
deleted file mode 100644
index 558298f0d..000000000
--- a/frame/include/level0/bli_invscals.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_INVSCALS_H
-#define BLIS_INVSCALS_H
-
-// invscals
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of y.
-
-#define bli_ssinvscals( a, y )  bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
-#define bli_dsinvscals( a, y )  bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
-#define bli_csinvscals( a, y )  bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
-#define bli_zsinvscals( a, y )  bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdinvscals( a, y )  bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_ddinvscals( a, y )  bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_cdinvscals( a, y )  bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_zdinvscals( a, y )  bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scinvscals( a, y )  bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
-#define bli_dcinvscals( a, y )  bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
-#define bli_ccinvscals( a, y )   bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
-#define bli_zcinvscals( a, y )   bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
-
-#define bli_szinvscals( a, y )  bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_dzinvscals( a, y )  bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_czinvscals( a, y )   bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_zzinvscals( a, y )   bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scinvscals( a, y )  { (y) /= (a); }
-#define bli_dcinvscals( a, y )  { (y) /= (a); }
-#define bli_ccinvscals( a, y )  { (y) /= (a); }
-#define bli_zcinvscals( a, y )  { (y) /= (a); }
-
-#define bli_szinvscals( a, y )  { (y) /= (a); }
-#define bli_dzinvscals( a, y )  { (y) /= (a); }
-#define bli_czinvscals( a, y )  { (y) /= (a); }
-#define bli_zzinvscals( a, y )  { (y) /= (a); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_sinvscals( a, y )  bli_ssinvscals( a, y )
-#define bli_dinvscals( a, y )  bli_ddinvscals( a, y )
-#define bli_cinvscals( a, y )  bli_ccinvscals( a, y )
-#define bli_zinvscals( a, y )  bli_zzinvscals( a, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_lte.h b/frame/include/level0/bli_lte.h
deleted file mode 100644
index ab87ff800..000000000
--- a/frame/include/level0/bli_lte.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_LTE_H
-#define BLIS_LTE_H
-
-
-// lte (passed by value)
-
-#define bli_slte( a, b )  (          (a) <=          (b) )
-#define bli_dlte( a, b )  (          (a) <=          (b) )
-#define bli_clte( a, b )  ( bli_creal(a) <= bli_creal(b) )
-#define bli_zlte( a, b )  ( bli_zreal(a) <= bli_zreal(b) )
-#define bli_ilte( a, b )  (          (a) <=          (b) )
-
-// lte0
-
-#define bli_slte0( a )  (          (a) <= 0.0F )
-#define bli_dlte0( a )  (          (a) <= 0.0  )
-#define bli_clte0( a )  ( bli_creal(a) <= 0.0F )
-#define bli_zlte0( a )  ( bli_zreal(a) <= 0.0  )
-
-// gte (passed by value)
-
-#define bli_sgte( a, b )  (          (a) >=          (b) )
-#define bli_dgte( a, b )  (          (a) >=          (b) )
-#define bli_cgte( a, b )  ( bli_creal(a) >= bli_creal(b) )
-#define bli_zgte( a, b )  ( bli_zreal(a) >= bli_zreal(b) )
-#define bli_igte( a, b )  (          (a) >=          (b) )
-
-// gte0
-
-#define bli_sgte0( a )  (          (a) >= 0.0F )
-#define bli_dgte0( a )  (          (a) >= 0.0  )
-#define bli_cgte0( a )  ( bli_creal(a) >= 0.0F )
-#define bli_zgte0( a )  ( bli_zreal(a) >= 0.0  )
-
-
-
-#endif
diff --git a/frame/include/level0/bli_neg2s.h b/frame/include/level0/bli_neg2s.h
deleted file mode 100644
index 2f505d50d..000000000
--- a/frame/include/level0/bli_neg2s.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_NEG2S_H
-#define BLIS_NEG2S_H
-
-// neg2s
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-#define bli_ssneg2s( x, y )  bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dsneg2s( x, y )  bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_csneg2s( x, y )  bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zsneg2s( x, y )  bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdneg2s( x, y )  bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_ddneg2s( x, y )  bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_cdneg2s( x, y )  bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zdneg2s( x, y )  bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scneg2s( x, y )  bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dcneg2s( x, y )  bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_ccneg2s( x, y )  bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zcneg2s( x, y )  bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_szneg2s( x, y )  bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dzneg2s( x, y )  bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_czneg2s( x, y )  bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zzneg2s( x, y )  bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scneg2s( x, y )  { (y) = -(x); }
-#define bli_dcneg2s( x, y )  { (y) = -(x); }
-#define bli_ccneg2s( x, y )  { (y) = -(x); }
-#define bli_zcneg2s( x, y )  { (y) = -(x); }
-
-#define bli_szneg2s( x, y )  { (y) = -(x); }
-#define bli_dzneg2s( x, y )  { (y) = -(x); }
-#define bli_czneg2s( x, y )  { (y) = -(x); }
-#define bli_zzneg2s( x, y )  { (y) = -(x); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_sneg2s( x, y )  bli_ssneg2s( x, y )
-#define bli_dneg2s( x, y )  bli_ddneg2s( x, y )
-#define bli_cneg2s( x, y )  bli_ccneg2s( x, y )
-#define bli_zneg2s( x, y )  bli_zzneg2s( x, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_randnp2s.h b/frame/include/level0/bli_randnp2s.h
deleted file mode 100644
index 7904f72aa..000000000
--- a/frame/include/level0/bli_randnp2s.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_RANDNP2S_H
-#define BLIS_RANDNP2S_H
-
-// randnp2s
-
-
-#define bli_srandnp2s( a ) \
-{ \
-	bli_drandnp2s( a ); \
-}
-
-#if 0
-#define bli_drandnp2s_prev( a ) \
-{ \
-	const double m_max  = 3.0; \
-	const double m_max2 = m_max + 2.0; \
-	double       t; \
-	double       r_val; \
-\
-	/* Compute a narrow-range power of two.
-
-	   For the purposes of commentary, we'll assume that m_max = 4. This
-	   represents the largest power of two we will use to generate the
-	   random numbers. */ \
-\
-	/* Generate a random real number t on the interval: [0.0, 6.0]. */ \
-	t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \
-\
-	/* Modify t to guarantee that is never equal to the upper bound of
-	   the interval (in this case, 6.0). */ \
-	if ( t == m_max2 ) t = t - 1.0; \
-\
-	/* Transform the interval into the set of integers, {0,1,2,3,4,5}. */ \
-	t = floor( t ); \
-\
-	/* Map values of t == 0 to a final value of 0. */ \
-	if ( t == 0.0 ) r_val = 0.0; \
-	else \
-	{ \
-		/* This case handles values of t = {1,2,3,4,5}. */ \
-\
-		double s_exp, s_val; \
-\
-		/* Compute two random numbers to determine the signs of the
-		   exponent and the end result. */ \
-		PASTEMAC(d,rands)( s_exp ); \
-		PASTEMAC(d,rands)( s_val ); \
-\
-		/* Compute r_val = 2^s where s = +/-(t-1) = {-4,-3,-2,-1,0,1,2,3,4}. */ \
-		if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \
-		else               r_val = pow( 2.0,   t - 1.0  ); \
-\
-		/* If our sign value is negative, our random power of two will
-		   be negative. */ \
-		if ( s_val < 0.0 ) r_val = -r_val; \
-	} \
-\
-	/* Normalize by the largest possible positive value. */ \
-	r_val = r_val / pow( 2.0, m_max ); \
-\
-	/* r_val = 0, or +/-{2^-4, 2^-3, 2^-2, 2^-1, 2^0, 2^1, 2^2, 2^3, 2^4}. */ \
-	/* NOTE: For single-precision macros, this assignment results in typecast
-	   down to float. */ \
-	a = r_val; \
-}
-#endif
-
-#define bli_drandnp2s( a ) \
-{ \
-	const double m_max  = 6.0; \
-	const double m_max2 = m_max + 2.0; \
-	double       t; \
-	double       r_val; \
-\
-	/* Compute a narrow-range power of two.
-
-	   For the purposes of commentary, we'll assume that m_max = 4. This
-	   represents the largest power of two we will use to generate the
-	   random numbers. */ \
-\
-	do \
-	{ \
-		/* Generate a random real number t on the interval: [0.0, 6.0]. */ \
-		t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \
-\
-		/* Transform the interval into the set of integers, {0,1,2,3,4,5}.
-		   Note that 6 is prohibited by the loop guard below. */ \
-		t = floor( t ); \
-	} \
-	/* If t is ever equal to m_max2, we re-randomize. The guard against
-	   m_max2 < t is for sanity and shouldn't happen, unless perhaps there
-	   is weirdness in the typecasting to double when computing t above. */ \
-	while ( m_max2 <= t ); \
-\
-	/* Map values of t == 0 to a final value of 0. */ \
-	if ( t == 0.0 ) r_val = 0.0; \
-	else \
-	{ \
-		/* This case handles values of t = {1,2,3,4,5}. */ \
-\
-		double s_val; \
-\
-		/* Compute r_val = 2^s where s = -(t-1) = {-4,-3,-2,-1,0}. */ \
-		r_val = pow( 2.0, -(t - 1.0) ); \
-\
-		/* Compute a random number to determine the sign of the final
-		   result. */ \
-		PASTEMAC(d,rands)( s_val ); \
-\
-		/* If our sign value is negative, our random power of two will
-		   be negative. */ \
-		if ( s_val < 0.0 ) r_val = -r_val; \
-	} \
-\
-	/* r_val = 0, or +/-{2^0, 2^-1, 2^-2, 2^-3, 2^-4}. */ \
-	/* NOTE: For single-precision macros, this assignment results in typecast
-	   down to float. */ \
-	a = r_val; \
-}
-#define bli_crandnp2s( a ) \
-{ \
-	float  ar, ai; \
-\
-	bli_srandnp2s( ar ); \
-	bli_srandnp2s( ai ); \
-\
-	bli_csets( ar, ai, (a) ); \
-}
-#define bli_zrandnp2s( a ) \
-{ \
-	double ar, ai; \
-\
-	bli_drandnp2s( ar ); \
-	bli_drandnp2s( ai ); \
-\
-	bli_zsets( ar, ai, (a) ); \
-}
-
-
-#endif
-
diff --git a/frame/include/level0/bli_rands.h b/frame/include/level0/bli_rands.h
deleted file mode 100644
index b377a6153..000000000
--- a/frame/include/level0/bli_rands.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_RANDS_H
-#define BLIS_RANDS_H
-
-// rands
-
-
-#define bli_srands( a ) \
-{ \
-	(a) = ( float  ) (   ( double ) rand() / \
-	                   ( ( double ) RAND_MAX / 2.0 ) \
-	                 ) - 1.0F; \
-}
-#define bli_drands( a ) \
-{ \
-	(a) = ( double ) (   ( double ) rand() / \
-	                   ( ( double ) RAND_MAX / 2.0 ) \
-	                 ) - 1.0; \
-}
-#define bli_crands( a ) \
-{ \
-	float  ar, ai; \
-\
-	bli_srands( ar ); \
-	bli_srands( ai ); \
-\
-	bli_csets( ar, ai, (a) ); \
-}
-#define bli_zrands( a ) \
-{ \
-	double ar, ai; \
-\
-	bli_drands( ar ); \
-	bli_drands( ai ); \
-\
-	bli_zsets( ar, ai, (a) ); \
-}
-
-
-#endif
-
diff --git a/frame/include/level0/bli_scal2js.h b/frame/include/level0/bli_scal2js.h
deleted file mode 100644
index e8f3acc27..000000000
--- a/frame/include/level0/bli_scal2js.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2JS_H
-#define BLIS_SCAL2JS_H
-
-// scal2js
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of y.
-
-
-// -- (axy) = (??s) ------------------------------------------------------------
-
-#define bli_sssscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dssscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_cssscal2js( a, x, y )  bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zssscal2js( a, x, y )  bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdsscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_ddsscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_cdsscal2js( a, x, y )  bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zdsscal2js( a, x, y )  bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_scsscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dcsscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_ccsscal2js( a, x, y )  bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zcsscal2js( a, x, y )  bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_szsscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dzsscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_czsscal2js( a, x, y )  bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zzsscal2js( a, x, y )  bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-
-// -- (axy) = (??d) ------------------------------------------------------------
-
-#define bli_ssdscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dsdscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_csdscal2js( a, x, y )  bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zsdscal2js( a, x, y )  bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-
-#define bli_sddscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dddscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_cddscal2js( a, x, y )  bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zddscal2js( a, x, y )  bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-
-#define bli_scdscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dcdscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_ccdscal2js( a, x, y )  bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zcdscal2js( a, x, y )  bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-
-#define bli_szdscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dzdscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_czdscal2js( a, x, y )  bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zzdscal2js( a, x, y )  bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dscscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cscscal2js( a, x, y )  bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zscscal2js( a, x, y )  bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_sdcscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_ddcscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cdcscal2js( a, x, y )  bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zdcscal2js( a, x, y )  bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_sccscal2js( a, x, y )  bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dccscal2js( a, x, y )  bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cccscal2js( a, x, y )  bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zccscal2js( a, x, y )  bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_szcscal2js( a, x, y )  bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dzcscal2js( a, x, y )  bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_czcscal2js( a, x, y )  bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zzcscal2js( a, x, y )  bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dszscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_cszscal2js( a, x, y )  bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zszscal2js( a, x, y )  bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-
-#define bli_sdzscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_ddzscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_cdzscal2js( a, x, y )  bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zdzscal2js( a, x, y )  bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-
-#define bli_sczscal2js( a, x, y )  bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dczscal2js( a, x, y )  bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_cczscal2js( a, x, y )  bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zczscal2js( a, x, y )  bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-
-#define bli_szzscal2js( a, x, y )  bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dzzscal2js( a, x, y )  bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_czzscal2js( a, x, y )  bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zzzscal2js( a, x, y )  bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscscal2js( a, x, y )  { (y) = (a) * (x); }
-#define bli_dscscal2js( a, x, y )  { (y) = (a) * (x); }
-#define bli_cscscal2js( a, x, y )  { (y) = (a) * (x); }
-#define bli_zscscal2js( a, x, y )  { (y) = (a) * (x); }
-
-#define bli_sdcscal2js( a, x, y )  { (y) = (a) * (x); }
-#define bli_ddcscal2js( a, x, y )  { (y) = (a) * (x); }
-#define bli_cdcscal2js( a, x, y )  { (y) = (a) * (x); }
-#define bli_zdcscal2js( a, x, y )  { (y) = (a) * (x); }
-
-#define bli_sccscal2js( a, x, y )  { (y) = (a) * conjf(x); }
-#define bli_dccscal2js( a, x, y )  { (y) = (a) * conjf(x); }
-#define bli_cccscal2js( a, x, y )  { (y) = (a) * conjf(x); }
-#define bli_zccscal2js( a, x, y )  { (y) = (a) * conjf(x); }
-
-#define bli_szcscal2js( a, x, y )  { (y) = (a) * conj(x); }
-#define bli_dzcscal2js( a, x, y )  { (y) = (a) * conj(x); }
-#define bli_czcscal2js( a, x, y )  { (y) = (a) * conj(x); }
-#define bli_zzcscal2js( a, x, y )  { (y) = (a) * conj(x); }
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszscal2js( a, x, y )  { (y) = (a) * (x); }
-#define bli_dszscal2js( a, x, y )  { (y) = (a) * (x); }
-#define bli_cszscal2js( a, x, y )  { (y) = (a) * (x); }
-#define bli_zszscal2js( a, x, y )  { (y) = (a) * (x); }
-
-#define bli_sdzscal2js( a, x, y )  { (y) = (a) * (x); }
-#define bli_ddzscal2js( a, x, y )  { (y) = (a) * (x); }
-#define bli_cdzscal2js( a, x, y )  { (y) = (a) * (x); }
-#define bli_zdzscal2js( a, x, y )  { (y) = (a) * (x); }
-
-#define bli_sczscal2js( a, x, y )  { (y) = (a) * conjf(x); }
-#define bli_dczscal2js( a, x, y )  { (y) = (a) * conjf(x); }
-#define bli_cczscal2js( a, x, y )  { (y) = (a) * conjf(x); }
-#define bli_zczscal2js( a, x, y )  { (y) = (a) * conjf(x); }
-
-#define bli_szzscal2js( a, x, y )  { (y) = (a) * conj(x); }
-#define bli_dzzscal2js( a, x, y )  { (y) = (a) * conj(x); }
-#define bli_czzscal2js( a, x, y )  { (y) = (a) * conj(x); }
-#define bli_zzzscal2js( a, x, y )  { (y) = (a) * conj(x); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_sscal2js( a, x, y )  bli_sssscal2js( a, x, y )
-#define bli_dscal2js( a, x, y )  bli_dddscal2js( a, x, y )
-#define bli_cscal2js( a, x, y )  bli_cccscal2js( a, x, y )
-#define bli_zscal2js( a, x, y )  bli_zzzscal2js( a, x, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_scal2s.h b/frame/include/level0/bli_scal2s.h
deleted file mode 100644
index d963595c0..000000000
--- a/frame/include/level0/bli_scal2s.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2S_H
-#define BLIS_SCAL2S_H
-
-// scal2s
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of y.
-
-// -- (axy) = (??s) ------------------------------------------------------------
-
-#define bli_sssscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dssscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_cssscal2s( a, x, y )  bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zssscal2s( a, x, y )  bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdsscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_ddsscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_cdsscal2s( a, x, y )  bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zdsscal2s( a, x, y )  bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_scsscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dcsscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_ccsscal2s( a, x, y )  bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zcsscal2s( a, x, y )  bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_szsscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dzsscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_czsscal2s( a, x, y )  bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zzsscal2s( a, x, y )  bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-
-// -- (axy) = (??d) ------------------------------------------------------------
-
-#define bli_ssdscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dsdscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_csdscal2s( a, x, y )  bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zsdscal2s( a, x, y )  bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-
-#define bli_sddscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dddscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_cddscal2s( a, x, y )  bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zddscal2s( a, x, y )  bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-
-#define bli_scdscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dcdscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_ccdscal2s( a, x, y )  bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zcdscal2s( a, x, y )  bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-
-#define bli_szdscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_dzdscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_czdscal2s( a, x, y )  bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zzdscal2s( a, x, y )  bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dscscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cscscal2s( a, x, y )  bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zscscal2s( a, x, y )  bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_sdcscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_ddcscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cdcscal2s( a, x, y )  bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zdcscal2s( a, x, y )  bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_sccscal2s( a, x, y )  bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dccscal2s( a, x, y )  bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_cccscal2s( a, x, y )  bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zccscal2s( a, x, y )  bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_szcscal2s( a, x, y )  bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dzcscal2s( a, x, y )  bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_czcscal2s( a, x, y )  bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zzcscal2s( a, x, y )  bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dszscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_cszscal2s( a, x, y )  bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zszscal2s( a, x, y )  bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-
-#define bli_sdzscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_ddzscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_cdzscal2s( a, x, y )  bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zdzscal2s( a, x, y )  bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-
-#define bli_sczscal2s( a, x, y )  bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dczscal2s( a, x, y )  bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_cczscal2s( a, x, y )  bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zczscal2s( a, x, y )  bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-
-#define bli_szzscal2s( a, x, y )  bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dzzscal2s( a, x, y )  bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_czzscal2s( a, x, y )  bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zzzscal2s( a, x, y )  bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_dscscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_cscscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_zscscal2s( a, x, y )  { (y) = (a) * (x); }
-
-#define bli_sdcscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_ddcscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_cdcscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_zdcscal2s( a, x, y )  { (y) = (a) * (x); }
-
-#define bli_sccscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_dccscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_cccscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_zccscal2s( a, x, y )  { (y) = (a) * (x); }
-
-#define bli_szcscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_dzcscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_czcscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_zzcscal2s( a, x, y )  { (y) = (a) * (x); }
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_dszscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_cszscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_zszscal2s( a, x, y )  { (y) = (a) * (x); }
-
-#define bli_sdzscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_ddzscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_cdzscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_zdzscal2s( a, x, y )  { (y) = (a) * (x); }
-
-#define bli_sczscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_dczscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_cczscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_zczscal2s( a, x, y )  { (y) = (a) * (x); }
-
-#define bli_szzscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_dzzscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_czzscal2s( a, x, y )  { (y) = (a) * (x); }
-#define bli_zzzscal2s( a, x, y )  { (y) = (a) * (x); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_sscal2s( a, x, y )  bli_sssscal2s( a, x, y )
-#define bli_dscal2s( a, x, y )  bli_dddscal2s( a, x, y )
-#define bli_cscal2s( a, x, y )  bli_cccscal2s( a, x, y )
-#define bli_zscal2s( a, x, y )  bli_zzzscal2s( a, x, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_scal2s_mxn.h b/frame/include/level0/bli_scal2s_mxn.h
deleted file mode 100644
index fdfea4dd9..000000000
--- a/frame/include/level0/bli_scal2s_mxn.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2S_MXN_H
-#define BLIS_SCAL2S_MXN_H
-
-// scal2s_mxn
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of y.
-// - We only implement cases where typeof(a) == type(x).
-
-#undef  BLIS_ENABLE_CR_CASES
-#define BLIS_ENABLE_CR_CASES 0
-
-// -- bli_???scal2s_mxn --
-
-#undef  GENTFUNC2
-#define GENTFUNC2( ctypex, ctypey, chx, chy, opname, kername ) \
-\
-BLIS_INLINE void PASTEMAC(chx,chx,chy,opname) \
-     ( \
-       const conj_t  conjx, \
-       const dim_t   m, \
-       const dim_t   n, \
-       const ctypex* alpha, \
-       const ctypex* x, inc_t rs_x, inc_t cs_x, \
-             ctypey* y, inc_t rs_y, inc_t cs_y  \
-     ) \
-{ \
-	if ( bli_is_conj( conjx ) ) \
-	{ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			const ctypex* restrict xj = x + j*cs_x; \
-			      ctypey* restrict yj = y + j*cs_y; \
-\
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				const ctypex* restrict xij = xj + i*rs_x; \
-				      ctypey* restrict yij = yj + i*rs_y; \
-\
-				PASTEMAC(chx,chx,chy,scal2js)( *alpha, *xij, *yij ); \
-			} \
-		} \
-	} \
-	else /* if ( bli_is_noconj( conjx ) ) */ \
-	{ \
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			const ctypex* restrict xj = x + j*cs_x; \
-			      ctypey* restrict yj = y + j*cs_y; \
-\
-			for ( dim_t i = 0; i < m; ++i ) \
-			{ \
-				const ctypex* restrict xij = xj + i*rs_x; \
-				      ctypey* restrict yij = yj + i*rs_y; \
-\
-				PASTEMAC(chx,chx,chy,scal2s)( *alpha, *xij, *yij ); \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC2_BASIC ( scal2s_mxn, scal2s )
-INSERT_GENTFUNC2_MIX_DP( scal2s_mxn, scal2s )
-
-
-// -- bli_?scal2s_mxn --
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-BLIS_INLINE void PASTEMAC(ch,opname) \
-     ( \
-       const conj_t conjx, \
-       const dim_t  m, \
-       const dim_t  n, \
-       const ctype* alpha, \
-       const ctype* x, inc_t rs_x, inc_t cs_x, \
-             ctype* y, inc_t rs_y, inc_t cs_y  \
-     ) \
-{ \
-	PASTEMAC(ch,ch,ch,opname)( conjx, m, n, alpha, x, rs_x, cs_x, y, rs_y, cs_y ); \
-}
-
-INSERT_GENTFUNC_BASIC( scal2s_mxn )
-
-#endif
diff --git a/frame/include/level0/bli_scalcjs.h b/frame/include/level0/bli_scalcjs.h
deleted file mode 100644
index 5619415cd..000000000
--- a/frame/include/level0/bli_scalcjs.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCALCJS_H
-#define BLIS_SCALCJS_H
-
-// scalcjs
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-#define bli_ssscalcjs( conjx, x, y )  bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
-#define bli_dsscalcjs( conjx, x, y )  bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_csscalcjs( conjx, x, y )  bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
-#define bli_zsscalcjs( conjx, x, y )  bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdscalcjs( conjx, x, y )  bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_ddscalcjs( conjx, x, y )  bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_cdscalcjs( conjx, x, y )  bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
-#define bli_zdscalcjs( conjx, x, y )  bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scscalcjs( conjx, x, y )  bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
-#define bli_dcscalcjs( conjx, x, y )  bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_ccscalcjs( conjx, x, y )   bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
-#define bli_zcscalcjs( conjx, x, y )   bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
-
-#define bli_szscalcjs( conjx, x, y )  bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_dzscalcjs( conjx, x, y )  bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_czscalcjs( conjx, x, y )   bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
-#define bli_zzscalcjs( conjx, x, y )   bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scscalcjs( conjx, x, y )  { (y) *= (x); }
-#define bli_dcscalcjs( conjx, x, y )  { (y) *= (x); }
-#define bli_ccscalcjs( conjx, x, y )  { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
-#define bli_zcscalcjs( conjx, x, y )  { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }
-
-#define bli_szscalcjs( conjx, x, y )  { (y) *= (x); }
-#define bli_dzscalcjs( conjx, x, y )  { (y) *= (x); }
-#define bli_czscalcjs( conjx, x, y )  { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
-#define bli_zzscalcjs( conjx, x, y )  { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_sscalcjs( conjx, x, y )  bli_ssscalcjs( conjx, x, y )
-#define bli_dscalcjs( conjx, x, y )  bli_ddscalcjs( conjx, x, y )
-#define bli_cscalcjs( conjx, x, y )  bli_ccscalcjs( conjx, x, y )
-#define bli_zscalcjs( conjx, x, y )  bli_zzscalcjs( conjx, x, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_scaljs.h b/frame/include/level0/bli_scaljs.h
deleted file mode 100644
index 8fb2d2922..000000000
--- a/frame/include/level0/bli_scaljs.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCALJS_H
-#define BLIS_SCALJS_H
-
-// scaljs
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of y.
-
-#define bli_ssscaljs( a, y )  bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
-#define bli_dsscaljs( a, y )  bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
-#define bli_csscaljs( a, y )  bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
-#define bli_zsscaljs( a, y )  bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdscaljs( a, y )  bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_ddscaljs( a, y )  bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_cdscaljs( a, y )  bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_zdscaljs( a, y )  bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scscaljs( a, y )  bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
-#define bli_dcscaljs( a, y )  bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
-#define bli_ccscaljs( a, y )   bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
-#define bli_zcscaljs( a, y )   bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
-
-#define bli_szscaljs( a, y )  bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_dzscaljs( a, y )  bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_czscaljs( a, y )   bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_zzscaljs( a, y )   bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scscaljs( a, y )  { (y) *=      (a); }
-#define bli_dcscaljs( a, y )  { (y) *=      (a); }
-#define bli_ccscaljs( a, y )  { (y) *= conjf(a); }
-#define bli_zcscaljs( a, y )  { (y) *= conj (a); }
-
-#define bli_szscaljs( a, y )  { (y) *=      (a); }
-#define bli_dzscaljs( a, y )  { (y) *=      (a); }
-#define bli_czscaljs( a, y )  { (y) *= conjf(a); }
-#define bli_zzscaljs( a, y )  { (y) *= conj (a); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_sscaljs( a, y )  bli_ssscaljs( a, y )
-#define bli_dscaljs( a, y )  bli_ddscaljs( a, y )
-#define bli_cscaljs( a, y )  bli_ccscaljs( a, y )
-#define bli_zscaljs( a, y )  bli_zzscaljs( a, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_scals.h b/frame/include/level0/bli_scals.h
deleted file mode 100644
index fc45c5e76..000000000
--- a/frame/include/level0/bli_scals.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCALS_H
-#define BLIS_SCALS_H
-
-// scals
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of y.
-
-#define bli_ssscals( a, y )  bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
-#define bli_dsscals( a, y )  bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
-#define bli_csscals( a, y )  bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
-#define bli_zsscals( a, y )  bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdscals( a, y )  bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_ddscals( a, y )  bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_cdscals( a, y )  bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_zdscals( a, y )  bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scscals( a, y )  bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
-#define bli_dcscals( a, y )  bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
-#define bli_ccscals( a, y )   bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
-#define bli_zcscals( a, y )   bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
-
-#define bli_szscals( a, y )  bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_dzscals( a, y )  bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_czscals( a, y )   bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_zzscals( a, y )   bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scscals( a, y )  { (y) *= (a); }
-#define bli_dcscals( a, y )  { (y) *= (a); }
-#define bli_ccscals( a, y )  { (y) *= (a); }
-#define bli_zcscals( a, y )  { (y) *= (a); }
-
-#define bli_szscals( a, y )  { (y) *= (a); }
-#define bli_dzscals( a, y )  { (y) *= (a); }
-#define bli_czscals( a, y )  { (y) *= (a); }
-#define bli_zzscals( a, y )  { (y) *= (a); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_sscals( a, y )  bli_ssscals( a, y )
-#define bli_dscals( a, y )  bli_ddscals( a, y )
-#define bli_cscals( a, y )  bli_ccscals( a, y )
-#define bli_zscals( a, y )  bli_zzscals( a, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_set0s_mxn.h b/frame/include/level0/bli_set0s_mxn.h
deleted file mode 100644
index ed2f9b159..000000000
--- a/frame/include/level0/bli_set0s_mxn.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SET0S_MXN_H
-#define BLIS_SET0S_MXN_H
-
-// set0s_mxn
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n,
-                            float*    restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-	for ( dim_t j = 0; j < n; ++j )
-	for ( dim_t i = 0; i < m; ++i )
-	bli_sset0s( *(y + i*rs_y + j*cs_y) );
-}
-
-BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n,
-                            double*   restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-	for ( dim_t j = 0; j < n; ++j )
-	for ( dim_t i = 0; i < m; ++i )
-	bli_dset0s( *(y + i*rs_y + j*cs_y) );
-}
-
-BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n,
-                            scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-	for ( dim_t j = 0; j < n; ++j )
-	for ( dim_t i = 0; i < m; ++i )
-	bli_cset0s( *(y + i*rs_y + j*cs_y) );
-}
-
-BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n,
-                            dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
-{
-	for ( dim_t j = 0; j < n; ++j )
-	for ( dim_t i = 0; i < m; ++i )
-	bli_zset0s( *(y + i*rs_y + j*cs_y) );
-}
-
-#endif
diff --git a/frame/include/level0/bli_set1s.h b/frame/include/level0/bli_set1s.h
deleted file mode 100644
index 98a58bc10..000000000
--- a/frame/include/level0/bli_set1s.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SET1S_H
-#define BLIS_SET1S_H
-
-#define bli_sset1s( a )  bli_ssets( 1.0F, 0.0F, (a) )
-#define bli_dset1s( a )  bli_dsets( 1.0 , 0.0 , (a) )
-#define bli_cset1s( a )  bli_csets( 1.0F, 0.0F, (a) )
-#define bli_zset1s( a )  bli_zsets( 1.0 , 0.0 , (a) )
-
-#endif
-
diff --git a/frame/include/level0/bli_setis.h b/frame/include/level0/bli_setis.h
deleted file mode 100644
index 4f508e493..000000000
--- a/frame/include/level0/bli_setis.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SETIS_H
-#define BLIS_SETIS_H
-
-// setis
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-#define bli_sssetis( xi, y )  { ; }
-#define bli_dssetis( xi, y )  { ; }
-
-#define bli_sdsetis( xi, y )  { ; }
-#define bli_ddsetis( xi, y )  { ; }
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scsetis( xi, y )  { bli_cimag(y) = (xi); }
-#define bli_dcsetis( xi, y )  { bli_cimag(y) = (xi); }
-
-#define bli_szsetis( xi, y )  { bli_zimag(y) = (xi); }
-#define bli_dzsetis( xi, y )  { bli_zimag(y) = (xi); }
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }
-#define bli_dcsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }
-
-#define bli_szsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }
-#define bli_dzsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_ssetis( xi, y )  bli_sssetis( xi, y )
-#define bli_dsetis( xi, y )  bli_ddsetis( xi, y )
-#define bli_csetis( xi, y )  bli_scsetis( xi, y )
-#define bli_zsetis( xi, y )  bli_dzsetis( xi, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_sets.h b/frame/include/level0/bli_sets.h
deleted file mode 100644
index 758fc29d6..000000000
--- a/frame/include/level0/bli_sets.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SETS_H
-#define BLIS_SETS_H
-
-// sets
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-#define bli_sssets( xr, xi, y )  { (y) = (xr); }
-#define bli_dssets( xr, xi, y )  { (y) = (xr); }
-#define bli_cssets( xr, xi, y )  { (y) = (xr); }
-#define bli_zssets( xr, xi, y )  { (y) = (xr); }
-#define bli_issets( xr, xi, y )  { (y) = (xr); }
-
-#define bli_sdsets( xr, xi, y )  { (y) = (xr); }
-#define bli_ddsets( xr, xi, y )  { (y) = (xr); }
-#define bli_cdsets( xr, xi, y )  { (y) = (xr); }
-#define bli_zdsets( xr, xi, y )  { (y) = (xr); }
-#define bli_idsets( xr, xi, y )  { (y) = (xr); }
-
-#ifndef BLIS_ENABLE_C99_COMPLEX 
-
-#define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); }
-#define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); }
-#define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); }
-#define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); }
-#define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); }
-
-#define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); }
-#define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); }
-#define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); }
-#define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); }
-#define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); }
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scsets( xr, xi, y )  { (y) = (xr) + (xi) * (I); }
-#define bli_dcsets( xr, xi, y )  { (y) = (xr) + (xi) * (I); }
-#define bli_ccsets( xr, xi, y )  { (y) = (xr) + (xi) * (I); }
-#define bli_zcsets( xr, xi, y )  { (y) = (xr) + (xi) * (I); }
-
-#define bli_szsets( xr, xi, y )  { (y) = (xr) + (xi) * (I); }
-#define bli_dzsets( xr, xi, y )  { (y) = (xr) + (xi) * (I); }
-#define bli_czsets( xr, xi, y )  { (y) = (xr) + (xi) * (I); }
-#define bli_zzsets( xr, xi, y )  { (y) = (xr) + (xi) * (I); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-#define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); }
-#define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); }
-#define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); }
-#define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); }
-#define bli_iisets( xr, xi, y ) { (y) =          (xr); }
-
-
-#define bli_ssets( xr, xi, y )  bli_sssets( xr, xi, y )
-#define bli_dsets( xr, xi, y )  bli_ddsets( xr, xi, y )
-#define bli_csets( xr, xi, y )  bli_scsets( xr, xi, y )
-#define bli_zsets( xr, xi, y )  bli_dzsets( xr, xi, y )
-#define bli_isets( xr, xi, y )  bli_disets( xr, xi, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_sqrt2s.h b/frame/include/level0/bli_sqrt2s.h
deleted file mode 100644
index 66ae9fe18..000000000
--- a/frame/include/level0/bli_sqrt2s.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SQRT2S_H
-#define BLIS_SQRT2S_H
-
-// sqrt2s
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of a.
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_sssqrt2s( x, a )  bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) )
-#define bli_dssqrt2s( x, a )  bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) )
-#define bli_cssqrt2s( x, a )  bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) )
-#define bli_zssqrt2s( x, a )  bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) )
-
-#define bli_sdsqrt2s( x, a )  bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) )
-#define bli_ddsqrt2s( x, a )  bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) )
-#define bli_cdsqrt2s( x, a )  bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) )
-#define bli_zdsqrt2s( x, a )  bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) )
-
-#define bli_scsqrt2s( x, a )  bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
-#define bli_dcsqrt2s( x, a )  bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
-#define bli_ccsqrt2s( x, a )   bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
-#define bli_zcsqrt2s( x, a )   bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )
-
-#define bli_szsqrt2s( x, a )  bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
-#define bli_dzsqrt2s( x, a )  bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
-#define bli_czsqrt2s( x, a )   bli_zsqrt2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
-#define bli_zzsqrt2s( x, a )   bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_sssqrt2s( x, a )  { (a) = ( float    )            sqrtf( (x) )  ; }
-#define bli_dssqrt2s( x, a )  { (a) = ( float    )            sqrt ( (x) )  ; }
-#define bli_cssqrt2s( x, a )  { (a) = ( float    )bli_creal( csqrtf( (x) ) ); }
-#define bli_zssqrt2s( x, a )  { (a) = ( float    )bli_zreal( csqrt ( (x) ) ); }
-
-#define bli_sdsqrt2s( x, a )  { (a) = ( double   )            sqrtf( (x) )  ; }
-#define bli_ddsqrt2s( x, a )  { (a) = ( double   )            sqrt ( (x) )  ; }
-#define bli_cdsqrt2s( x, a )  { (a) = ( double   )bli_creal( csqrtf( (x) ) ); }
-#define bli_zdsqrt2s( x, a )  { (a) = ( double   )bli_zreal( csqrt ( (x) ) ); }
-
-#define bli_scsqrt2s( x, a )  { (a) = ( scomplex )            sqrtf( (x) )  ; }
-#define bli_dcsqrt2s( x, a )  { (a) = ( scomplex )            sqrt ( (x) )  ; }
-#define bli_ccsqrt2s( x, a )  { (a) = ( scomplex )           csqrtf( (x) )  ; }
-#define bli_zcsqrt2s( x, a )  { (a) = ( scomplex )           csqrt ( (x) )  ; }
-
-#define bli_szsqrt2s( x, a )  { (a) = ( dcomplex )            sqrtf( (x) )  ; }
-#define bli_dzsqrt2s( x, a )  { (a) = ( dcomplex )            sqrt ( (x) )  ; }
-#define bli_czsqrt2s( x, a )  { (a) = ( dcomplex )           csqrtf( (x) )  ; }
-#define bli_zzsqrt2s( x, a )  { (a) = ( dcomplex )           csqrt ( (x) )  ; }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_ssqrt2s( x, a )  bli_sssqrt2s( x, a )
-#define bli_dsqrt2s( x, a )  bli_ddsqrt2s( x, a )
-#define bli_csqrt2s( x, a )  bli_ccsqrt2s( x, a )
-#define bli_zsqrt2s( x, a )  bli_zzsqrt2s( x, a )
-
-
-#endif
diff --git a/frame/include/level0/bli_subjs.h b/frame/include/level0/bli_subjs.h
deleted file mode 100644
index f453fa21f..000000000
--- a/frame/include/level0/bli_subjs.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SUBJS_H
-#define BLIS_SUBJS_H
-
-// subjs
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of y.
-
-#define bli_sssubjs( a, y )  bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
-#define bli_dssubjs( a, y )  bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
-#define bli_cssubjs( a, y )  bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
-#define bli_zssubjs( a, y )  bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdsubjs( a, y )  bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_ddsubjs( a, y )  bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_cdsubjs( a, y )  bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_zdsubjs( a, y )  bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scsubjs( a, y )  bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
-#define bli_dcsubjs( a, y )  bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
-#define bli_ccsubjs( a, y )  bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
-#define bli_zcsubjs( a, y )  bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
-
-#define bli_szsubjs( a, y )  bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_dzsubjs( a, y )  bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_czsubjs( a, y )  bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_zzsubjs( a, y )  bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scsubjs( a, y )  { (y) -=      (a); }
-#define bli_dcsubjs( a, y )  { (y) -=      (a); }
-#define bli_ccsubjs( a, y )  { (y) -= conjf(a); }
-#define bli_zcsubjs( a, y )  { (y) -= conj (a); }
-
-#define bli_szsubjs( a, y )  { (y) -=      (a); }
-#define bli_dzsubjs( a, y )  { (y) -=      (a); }
-#define bli_czsubjs( a, y )  { (y) -= conjf(a); }
-#define bli_zzsubjs( a, y )  { (y) -= conj (a); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_ssubjs( a, y )  bli_sssubjs( a, y )
-#define bli_dsubjs( a, y )  bli_ddsubjs( a, y )
-#define bli_csubjs( a, y )  bli_ccsubjs( a, y )
-#define bli_zsubjs( a, y )  bli_zzsubjs( a, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_subs.h b/frame/include/level0/bli_subs.h
deleted file mode 100644
index 2c9a79dab..000000000
--- a/frame/include/level0/bli_subs.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SUBS_H
-#define BLIS_SUBS_H
-
-// subs
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of y.
-
-#define bli_sssubs( a, y )  bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
-#define bli_dssubs( a, y )  bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
-#define bli_cssubs( a, y )  bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
-#define bli_zssubs( a, y )  bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdsubs( a, y )  bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_ddsubs( a, y )  bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_cdsubs( a, y )  bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
-#define bli_zdsubs( a, y )  bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scsubs( a, y )  bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
-#define bli_dcsubs( a, y )  bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
-#define bli_ccsubs( a, y )  bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
-#define bli_zcsubs( a, y )  bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
-
-#define bli_szsubs( a, y )  bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_dzsubs( a, y )  bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_czsubs( a, y )  bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
-#define bli_zzsubs( a, y )  bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scsubs( a, y )  { (y) -= (a); }
-#define bli_dcsubs( a, y )  { (y) -= (a); }
-#define bli_ccsubs( a, y )  { (y) -= (a); }
-#define bli_zcsubs( a, y )  { (y) -= (a); }
-
-#define bli_szsubs( a, y )  { (y) -= (a); }
-#define bli_dzsubs( a, y )  { (y) -= (a); }
-#define bli_czsubs( a, y )  { (y) -= (a); }
-#define bli_zzsubs( a, y )  { (y) -= (a); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_ssubs( a, y )  bli_sssubs( a, y )
-#define bli_dsubs( a, y )  bli_ddsubs( a, y )
-#define bli_csubs( a, y )  bli_ccsubs( a, y )
-#define bli_zsubs( a, y )  bli_zzsubs( a, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_swaps.h b/frame/include/level0/bli_swaps.h
deleted file mode 100644
index fe18d94fd..000000000
--- a/frame/include/level0/bli_swaps.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SWAPS_H
-#define BLIS_SWAPS_H
-
-// swaps
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-
-#define bli_ssswaps( x, y ) \
-{ \
-	float    w; \
-	bli_sscopys( (y), (w) ); \
-	bli_sscopys( (x), (y) ); \
-	bli_sscopys( (w), (x) ); \
-}
-#define bli_dsswaps( x, y ) \
-{ \
-	double   w; \
-	bli_sdcopys( (y), (w) ); \
-	bli_dscopys( (x), (y) ); \
-	bli_ddcopys( (w), (x) ); \
-}
-#define bli_csswaps( x, y ) \
-{ \
-	scomplex w; \
-	bli_sccopys( (y), (w) ); \
-	bli_cscopys( (x), (y) ); \
-	bli_cccopys( (w), (x) ); \
-}
-#define bli_zsswaps( x, y ) \
-{ \
-	dcomplex w; \
-	bli_szcopys( (y), (w) ); \
-	bli_zscopys( (x), (y) ); \
-	bli_zzcopys( (w), (x) ); \
-}
-
-
-#define bli_sdswaps( x, y ) \
-{ \
-	float    w; \
-	bli_dscopys( (y), (w) ); \
-	bli_sdcopys( (x), (y) ); \
-	bli_sscopys( (w), (x) ); \
-}
-#define bli_ddswaps( x, y ) \
-{ \
-	double   w; \
-	bli_ddcopys( (y), (w) ); \
-	bli_ddcopys( (x), (y) ); \
-	bli_ddcopys( (w), (x) ); \
-}
-#define bli_cdswaps( x, y ) \
-{ \
-	scomplex w; \
-	bli_dccopys( (y), (w) ); \
-	bli_cdcopys( (x), (y) ); \
-	bli_cccopys( (w), (x) ); \
-}
-#define bli_zdswaps( x, y ) \
-{ \
-	dcomplex w; \
-	bli_dzcopys( (y), (w) ); \
-	bli_zdcopys( (x), (y) ); \
-	bli_zzcopys( (w), (x) ); \
-}
-
-
-#define bli_scswaps( x, y ) \
-{ \
-	float    w; \
-	bli_cscopys( (y), (w) ); \
-	bli_sccopys( (x), (y) ); \
-	bli_sscopys( (w), (x) ); \
-}
-#define bli_dcswaps( x, y ) \
-{ \
-	double   w; \
-	bli_cdcopys( (y), (w) ); \
-	bli_dccopys( (x), (y) ); \
-	bli_ddcopys( (w), (x) ); \
-}
-#define bli_ccswaps( x, y ) \
-{ \
-	scomplex w; \
-	bli_cccopys( (y), (w) ); \
-	bli_cccopys( (x), (y) ); \
-	bli_cccopys( (w), (x) ); \
-}
-#define bli_zcswaps( x, y ) \
-{ \
-	dcomplex w; \
-	bli_czcopys( (y), (w) ); \
-	bli_zccopys( (x), (y) ); \
-	bli_zzcopys( (w), (x) ); \
-}
-
-
-#define bli_szswaps( x, y ) \
-{ \
-	float    w; \
-	bli_zscopys( (y), (w) ); \
-	bli_szcopys( (x), (y) ); \
-	bli_sscopys( (w), (x) ); \
-}
-#define bli_dzswaps( x, y ) \
-{ \
-	double   w; \
-	bli_zdcopys( (y), (w) ); \
-	bli_dzcopys( (x), (y) ); \
-	bli_ddcopys( (w), (x) ); \
-}
-#define bli_czswaps( x, y ) \
-{ \
-	scomplex w; \
-	bli_zccopys( (y), (w) ); \
-	bli_czcopys( (x), (y) ); \
-	bli_cccopys( (w), (x) ); \
-}
-#define bli_zzswaps( x, y ) \
-{ \
-	dcomplex w; \
-	bli_zzcopys( (y), (w) ); \
-	bli_zzcopys( (x), (y) ); \
-	bli_zzcopys( (w), (x) ); \
-}
-
-
-#define bli_sswaps( x, y )  bli_ssswaps( x, y )
-#define bli_dswaps( x, y )  bli_ddswaps( x, y )
-#define bli_cswaps( x, y )  bli_ccswaps( x, y )
-#define bli_zswaps( x, y )  bli_zzswaps( x, y )
-
-
-#endif
diff --git a/frame/include/level0/bli_tabsq2s.h b/frame/include/level0/bli_tabsq2s.h
new file mode 100644
index 000000000..72b5f8382
--- /dev/null
+++ b/frame/include/level0/bli_tabsq2s.h
@@ -0,0 +1,135 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TABSQ2S_H
+#define BLIS_TABSQ2S_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (yr) = (xr) * (xr) + (xi) * (xi);
+// (yi) = 0;
+
+#define bli_tabsq2ims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(dy,assigns) \
+	( \
+	  PASTEMAC(chc,py,tcast)( \
+	    PASTEMAC(chc,add)( \
+	      PASTEMAC(dx,dx,termrr)( \
+	        chc, \
+	        PASTEMAC(chc,mul)( \
+	          PASTEMAC(px,chc,tcast)(xr), \
+	          PASTEMAC(px,chc,tcast)(xr) \
+	        )  \
+	      ), \
+	      PASTEMAC(dx,dx,termii)( \
+	        chc, \
+	        PASTEMAC(chc,mul)( \
+	          PASTEMAC(px,chc,tcast)(xi), \
+	          PASTEMAC(px,chc,tcast)(xi) \
+	        ) \
+	      ) \
+	    ) \
+	  ),\
+	  PASTEMAC(py,zero), \
+	  yr, \
+	  yi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tabsq2s
+#define bli_tabsq2s( chx, chy, chc, x, y ) \
+        bli_tabsq2ims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y), \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Exposed real/imaginary --
+
+// tabsq2ris
+#define bli_tabsq2ris( chx, chy, chc, xr, xi, yr, yi ) \
+        bli_tabsq2ims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi, \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- Notes --------------------------------------------------------------------
+
+// -- Domain cases --
+
+//   r       r
+// (yr) := (xr) * (xr);
+// (yi) xx   0 ;
+
+//   r       c
+// (yr) := (xr) * (xr) + (xi) * (xi);
+// (yi) xx   0 ;
+
+//   c       r
+// (yr) := (xr) * (xr);
+// (yi) :=   0 ;
+
+//   c       c
+// (yr) := (xr) * (xr) + (xi) * (xi);
+// (yi) :=   0 ;
+
+#endif
+
diff --git a/frame/include/level0/bli_tabval2s.h b/frame/include/level0/bli_tabval2s.h
new file mode 100644
index 000000000..abcd66ba7
--- /dev/null
+++ b/frame/include/level0/bli_tabval2s.h
@@ -0,0 +1,206 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TABVAL2S_H
+#define BLIS_TABVAL2S_H
+
+// -- Implementation macro -----------------------------------------------------
+
+#define bli_tabval2ims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(dx,abval2ims) \
+	( \
+	  dx, px, xr, xi, \
+	  dy, py, yr, yi, \
+	  chc  \
+	); \
+}
+
+// -- real-to-real domain implementation --
+// -- real-to-complex domain implementation --
+
+// (yr) = abs( xr );
+// (yi) = 0;
+
+#define bli_rabval2ims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(dy,assigns) \
+	( \
+	  PASTEMAC(chc,py,tcast)( \
+	    PASTEMAC(chc,abs)( \
+	      PASTEMAC(px,chc,tcast)(xr) \
+	    ) \
+	  ), \
+	  PASTEMAC(py,zero), \
+	  yr, \
+	  yi \
+	) \
+}
+
+// -- complex-to-real domain implementation --
+// -- complex-to-complex domain implementation --
+
+// NOTE: Instead of defining abval2 in terms of bli_?hypot(), we use an
+// alternate definition that can avoid overflow in the final result due
+// to overflow in the intermediate results (e.g. xr * xr and xi * xi).
+
+// xmaxr = maxabs( xr, xi );
+// if ( xmaxr == 0.0 && !isnan( xr ) && !isnan( xi ) ) mag = 0.0;
+// else mag = sqrt( xmaxr ) *
+//            sqrt( ( xr / xmaxr ) * xr +
+//                  ( xi / xmaxr ) * xi );
+// yr = mag;
+// yi = 0.0;
+
+#define bli_cabval2ims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(ro,declinits) \
+	( \
+	  px, \
+	  PASTEMAC(px,maxabs)(xr,xi), \
+	  xmaxr  \
+	) \
+	PASTEMAC(dy,assigns) \
+	( \
+	  ( PASTEMAC(teq0s)(px,xmaxr) && \
+	    !PASTEMAC(px,isnan)(xi) && \
+	    !PASTEMAC(px,isnan)(xr) \
+	    ? PASTEMAC(py,zero) \
+	    : PASTEMAC(chc,py,tcast)( \
+	        PASTEMAC(chc,mul)( \
+	          PASTEMAC(chc,sqrt)( \
+	            PASTEMAC(px,chc,tcast)(xmaxr) \
+	          ), \
+	          PASTEMAC(chc,sqrt)( \
+	            PASTEMAC(chc,add)( \
+	              PASTEMAC(chc,mul)( \
+	                PASTEMAC(px,chc,tcast)(xr), \
+	                PASTEMAC(chc,div)( \
+	                  PASTEMAC(px,chc,tcast)(xr), \
+	                  PASTEMAC(px,chc,tcast)(xmaxr) \
+	                ) \
+	              ), \
+	              PASTEMAC(chc,mul)( \
+	                PASTEMAC(px,chc,tcast)(xi), \
+	                PASTEMAC(chc,div)( \
+	                  PASTEMAC(px,chc,tcast)(xi), \
+	                  PASTEMAC(px,chc,tcast)(xmaxr) \
+                    ) \
+	              ) \
+	            ) \
+	          ) \
+	        ) \
+	      ) \
+	  ), \
+	  PASTEMAC(py,zero), \
+	  yr, \
+	  yi \
+	) \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tabval2s
+#define bli_tabval2s( chx, chy, chc, x, y ) \
+        bli_tabval2ims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y), \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Exposed real/imaginary --
+
+// tabval2ris
+#define bli_tabval2ris( chx, chy, chc, xr, xi, yr, yi ) \
+        bli_tabval2ims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi, \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- Notes --------------------------------------------------------------------
+
+// -- Domain cases --
+
+//   r       r
+// (yr) := abs(xr);
+// (yi) xx   0 ;
+
+//   r       c
+// (yr) := sqrt(xmaxr) * sqrt( ( xr / xmaxr ) * xr + ( xi / xmaxr ) * xi );
+// (yi) xx   0 ;
+
+//   c       r
+// (yr) := abs(xr);
+// (yi) :=   0 ;
+
+//   c       c
+// (yr) := sqrt(xmaxr) * sqrt( ( xr / xmaxr ) * xr + ( xi / xmaxr ) * xi );
+// (yi) :=   0 ;
+
+#endif
+
diff --git a/frame/include/level0/bli_tadd3s.h b/frame/include/level0/bli_tadd3s.h
new file mode 100644
index 000000000..43e090fba
--- /dev/null
+++ b/frame/include/level0/bli_tadd3s.h
@@ -0,0 +1,199 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TADD3S_H
+#define BLIS_TADD3S_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (zr) = (yr) + (xr);
+// (zi) = (yi) + (xi);
+
+#define bli_tadd3ims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi, \
+          dz, pz, zr, zi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(dz,assigns) \
+	( \
+	  PASTEMAC(chc,pz,tcast)( \
+	    PASTEMAC(chc,add)( \
+	      PASTEMAC(py,chc,tcast)(yr), \
+	      PASTEMAC(px,chc,tcast)(xr)  \
+	    ) \
+	  ),\
+	  PASTEMAC(chc,pz,tcast)( \
+	    PASTEMAC(chc,add)( \
+	      PASTEMAC(py,chc,tcast)(yi), \
+	      PASTEMAC(px,chc,tcast)(xi) \
+	    ) \
+	  ), \
+	  zr, \
+	  zi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tadd3s
+#define bli_tadd3s( chx, chy, chz, chc, x, y, z ) \
+        bli_tadd3ims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y), \
+          PASTEMAC(chz,dom),  \
+          PASTEMAC(chz,prec), \
+          PASTEMAC(chz,real)(z), \
+          PASTEMAC(chz,imag)(z), \
+          PASTEMAC(chc,prec)  \
+        )
+
+#undef GENTFUNC
+#define GENTFUNC( ctypex, chx, ctypey, chy, ctypez, chz, ctypec, chc, opname ) \
+UNIT_TEST(chx,chy,chz,chc,opname) \
+( \
+	for ( auto x : test_values<ctypex>() ) \
+	for ( auto y : test_values<ctypey>() ) \
+	{ \
+		auto z0 = convert<ctypez>( convert_prec<ctypec>( x ) + \
+		                           convert_prec<ctypec>( y ) ); \
+\
+		INFO( "x: " << x ); \
+		INFO( "y: " << y ); \
+\
+		ctypez z; \
+		bli_tadd3s( chx,chy,chz,chc, x, y, z ); \
+\
+		INFO( "z (C++):  " << z0 ); \
+		INFO( "z (BLIS): " << z ); \
+\
+		check<ctypec>( z, z0 ); \
+	} \
+)
+
+// tadd3js
+#define bli_tadd3js( chx, chy, chz, chc, x, y, z ) \
+        bli_tadd3ims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+            PASTEMAC(chx,imag)(x)  \
+          ), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y), \
+          PASTEMAC(chz,dom),  \
+          PASTEMAC(chz,prec), \
+          PASTEMAC(chz,real)(z), \
+          PASTEMAC(chz,imag)(z), \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Exposed real/imaginary --
+
+// tadd3ris
+#define bli_tadd3ris( chx, chy, chz, chc, xr, xi, yr, yi, zr, zi ) \
+        bli_tadd3ims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi, \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi, \
+          PASTEMAC(chz,dom),  \
+          PASTEMAC(chz,prec), \
+                     zr, \
+                     zi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// tadd3jris
+#define bli_tadd3jris( chx, chy, chz, chc, xr, xi, yr, yi, zr, zi ) \
+        bli_tadd3ims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+                     xi ), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi, \
+          PASTEMAC(chz,dom),  \
+          PASTEMAC(chz,prec), \
+                     zr, \
+                     zi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Notes --------------------------------------------------------------------
+
+// -- Domain cases --
+
+//   r       r
+// (yr) += (xr);
+// (yi) xx   0 ;
+
+//   r       c
+// (yr) += (xr);
+// (yi) xx (xi);
+
+//   c       r
+// (yr) += (xr);
+// (yi) +=   0 ;
+
+//   c       c
+// (yr) += (xr);
+// (yi) += (xi);
+
+#endif
+
diff --git a/frame/include/level0/bli_tadds.h b/frame/include/level0/bli_tadds.h
new file mode 100644
index 000000000..091708c1c
--- /dev/null
+++ b/frame/include/level0/bli_tadds.h
@@ -0,0 +1,176 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TADDS_H
+#define BLIS_TADDS_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (yr) = (yr) + (xr);
+// (yi) = (yi) + (xi);
+// Each operand goes through a (p?,chc) tcast, the chc add, then a (chc,py) tcast; PASTEMAC(dy,assigns) receives the sums and yr/yi.
+#define bli_taddims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(dy,assigns) \
+	( \
+	  PASTEMAC(chc,py,tcast)( \
+	    PASTEMAC(chc,add)( \
+	      PASTEMAC(py,chc,tcast)(yr), \
+	      PASTEMAC(px,chc,tcast)(xr)  \
+	    ) \
+	  ),\
+	  PASTEMAC(chc,py,tcast)( \
+	    PASTEMAC(chc,add)( \
+	      PASTEMAC(py,chc,tcast)(yi), \
+	      PASTEMAC(px,chc,tcast)(xi) \
+	    ) \
+	  ), \
+	  yr, \
+	  yi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tadds: y := y + x on whole scalars x (type chx) and y (type chy); extracts real/imag parts and passes chc's precision to bli_taddims.
+#define bli_tadds( chx, chy, chc, x, y ) \
+        bli_taddims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y), \
+          PASTEMAC(chc,prec)  \
+        )
+
+// taddjs: like bli_tadds, except the imaginary part of x is negated (in chx's precision) before forwarding, i.e. conj(x) is added.
+#define bli_taddjs( chx, chy, chc, x, y ) \
+        bli_taddims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+            PASTEMAC(chx,imag)(x)  \
+          ), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y), \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Exposed real/imaginary --
+
+// taddris: same forwarding to bli_taddims as bli_tadds, but the caller supplies the real/imag parts (xr, xi, yr, yi) directly.
+#define bli_taddris( chx, chy, chc, xr, xi, yr, yi ) \
+        bli_taddims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi, \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// taddjris: like bli_taddris, except xi is negated (in chx's precision) before being forwarded to bli_taddims.
+#define bli_taddjris( chx, chy, chc, xr, xi, yr, yi ) \
+        bli_taddims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+                     xi ), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- mxn --
+
+// tadds_mxn: applies bli_tadds to each element (ii,jj) of m x n operands x and y, addressed as base + ii*rs + jj*cs; the jj loop is outermost.
+#define bli_tadds_mxn( chx, chy, chc, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
+{ \
+	for ( dim_t jj = 0; jj < (n); ++jj ) \
+	for ( dim_t ii = 0; ii < (m); ++ii ) \
+	{ \
+		PASTEMAC(chx,ctype)* restrict xij = ( PASTEMAC(chx,ctype)* )(x) + ii*(rs_x) + jj*(cs_x); \
+		PASTEMAC(chy,ctype)* restrict yij = ( PASTEMAC(chy,ctype)* )(y) + ii*(rs_y) + jj*(cs_y); \
+\
+		bli_tadds( chx,chy,chc, *xij, *yij ); \
+	} \
+}
+
+// -- Notes --------------------------------------------------------------------
+
+// -- Domain cases --
+
+//   r       r
+// (yr) += (xr);
+// (yi) xx   0 ;
+
+//   r       c
+// (yr) += (xr);
+// (yi) xx (xi);
+
+//   c       r
+// (yr) += (xr);
+// (yi) +=   0 ;
+
+//   c       c
+// (yr) += (xr);
+// (yi) += (xi);
+
+#endif
+
diff --git a/frame/include/level0/bli_taxpbys.h b/frame/include/level0/bli_taxpbys.h
new file mode 100644
index 000000000..6fbfc3230
--- /dev/null
+++ b/frame/include/level0/bli_taxpbys.h
@@ -0,0 +1,278 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TAXPBYS_H
+#define BLIS_TAXPBYS_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (yorigr) := (yr)
+// (yorigi) := (yi)
+// (yr) := (ar) * (xr) - (ai) * (xi) + (br) * (yorigr) - (bi) * (yorigi);
+// (yi) := (ai) * (xr) + (ar) * (xi) + (bi) * (yorigr) + (br) * (yorigi);
+
+#define bli_taxpbyims( \
+          \
+          da, pa, ar, ai, \
+          dx, px, xr, xi, \
+          db, pb, br, bi, \
+          dy, py, yr, yi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(c,declinits) /* both results go to tr/ti first (presumably declared here), so yr/yi are read before being overwritten */ \
+	( \
+	  py, \
+	  PASTEMAC(chc,py,tcast)( /* real part: (a*x).r + (b*y).r */ \
+	    PASTEMAC(chc,add)( \
+	      PASTEMAC(chc,sub)( \
+	        PASTEMAC(da,dx,termrr)( \
+	          chc, \
+	          PASTEMAC(chc,mul)( \
+	            PASTEMAC(pa,chc,tcast)(ar), \
+	            PASTEMAC(px,chc,tcast)(xr) \
+	          )  \
+	        ), \
+	        PASTEMAC(da,dx,termii)( \
+	          chc, \
+	          PASTEMAC(chc,mul)( \
+	            PASTEMAC(pa,chc,tcast)(ai), \
+	            PASTEMAC(px,chc,tcast)(xi) \
+	          )  \
+	        )  \
+	      ), \
+	      PASTEMAC(chc,sub)( \
+	        PASTEMAC(db,dy,termrr)( \
+	          chc, \
+	          PASTEMAC(chc,mul)( \
+	            PASTEMAC(pb,chc,tcast)(br), \
+	            PASTEMAC(py,chc,tcast)(yr) \
+	          )  \
+	        ), \
+	        PASTEMAC(db,dy,termii)( \
+	          chc, \
+	          PASTEMAC(chc,mul)( \
+	            PASTEMAC(pb,chc,tcast)(bi), \
+	            PASTEMAC(py,chc,tcast)(yi) \
+	          ) \
+	        ) \
+	      ) \
+	    ) \
+	  ),\
+	  PASTEMAC(chc,py,tcast)( /* imaginary part: (a*x).i + (b*y).i */ \
+	    PASTEMAC(chc,add)( \
+	      PASTEMAC(chc,add)( \
+	        PASTEMAC(da,dx,termir)( \
+	          chc, \
+	          PASTEMAC(chc,mul)( \
+	            PASTEMAC(pa,chc,tcast)(ai), \
+	            PASTEMAC(px,chc,tcast)(xr) \
+	          )  \
+	        ), \
+	        PASTEMAC(da,dx,termri)( \
+	          chc, \
+	          PASTEMAC(chc,mul)( \
+	            PASTEMAC(pa,chc,tcast)(ar), \
+	            PASTEMAC(px,chc,tcast)(xi) \
+	          ) \
+	        ) \
+	      ), \
+	      PASTEMAC(chc,add)( \
+	        PASTEMAC(db,dy,termir)( \
+	          chc, \
+	          PASTEMAC(chc,mul)( \
+	            PASTEMAC(pb,chc,tcast)(bi), \
+	            PASTEMAC(py,chc,tcast)(yr) \
+	          )  \
+	        ), \
+	        PASTEMAC(db,dy,termri)( \
+	          chc, \
+	          PASTEMAC(chc,mul)( \
+	            PASTEMAC(pb,chc,tcast)(br), \
+	            PASTEMAC(py,chc,tcast)(yi) \
+	          ) \
+	        ) \
+	      ) \
+	    ) \
+	  ), \
+	  tr, \
+	  ti \
+	); \
+	PASTEMAC(dy,assigns) /* hand tr/ti to the assigns variant selected by y's domain */ \
+	( \
+	  tr, \
+	  ti, \
+	  yr, \
+	  yi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// taxpbys: y := a * x + b * y on whole scalars of types cha/chx/chb/chy; extracts real/imag parts and passes chc's precision to bli_taxpbyims.
+#define bli_taxpbys( cha, chx, chb, chy, chc, a, x, b, y ) \
+        bli_taxpbyims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), \
+          PASTEMAC(cha,imag)(a), \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chb,dom),  \
+          PASTEMAC(chb,prec), \
+          PASTEMAC(chb,real)(b), \
+          PASTEMAC(chb,imag)(b), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y), \
+          PASTEMAC(chc,prec)  \
+        )
+
+// taxpbyjs: like bli_taxpbys, except the imaginary part of x is negated (in chx's precision) before forwarding, i.e. conj(x) is used.
+#define bli_taxpbyjs( cha, chx, chb, chy, chc, a, x, b, y ) \
+        bli_taxpbyims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), \
+          PASTEMAC(cha,imag)(a), \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+            PASTEMAC(chx,imag)(x)  \
+          ), \
+          PASTEMAC(chb,dom),  \
+          PASTEMAC(chb,prec), \
+          PASTEMAC(chb,real)(b), \
+          PASTEMAC(chb,imag)(b), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y), \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Exposed real/imaginary --
+
+// taxpbyris: same forwarding to bli_taxpbyims as bli_taxpbys, but the caller supplies the real/imag parts of a, x, b, y directly.
+#define bli_taxpbyris( cha, chx, chb, chy, chc, ar, ai, xr, xi, br, bi, yr, yi ) \
+        bli_taxpbyims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+                     ar, \
+                     ai, \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi, \
+          PASTEMAC(chb,dom),  \
+          PASTEMAC(chb,prec), \
+                     br, \
+                     bi, \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// taxpbyjris: like bli_taxpbyris, except xi is negated (in chx's precision) before being forwarded to bli_taxpbyims.
+#define bli_taxpbyjris( cha, chx, chb, chy, chc, ar, ai, xr, xi, br, bi, yr, yi ) \
+        bli_taxpbyims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+                     ar, \
+                     ai, \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+                     xi ), \
+          PASTEMAC(chb,dom),  \
+          PASTEMAC(chb,prec), \
+                     br, \
+                     bi, \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- mxn --
+
+// taxpbys_mxn: applies bli_taxpbys with *alpha and *beta to each element of m x n operands x and y (base + ii*rs + jj*cs); m and n may be arbitrary expressions.
+#define bli_taxpbys_mxn( cha, chx, chb, chy, chc, m, n, alpha, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
+{ \
+\
+	/* If beta is zero, overwrite y via bli_tscal2s_mxn instead of reading it (in case y has infs or NaNs). */ \
+	if ( bli_teq0s( chb, *(beta) ) ) \
+	{ \
+		bli_tscal2s_mxn( cha, chx, chy, chc, BLIS_NO_CONJUGATE, m, n, alpha, x, rs_x, cs_x, y, rs_y, cs_y ); \
+	} \
+	else \
+	{ \
+		for ( dim_t jj = 0; jj < (n); ++jj ) \
+		for ( dim_t ii = 0; ii < (m); ++ii ) \
+		{ \
+			PASTEMAC(chx,ctype)* restrict xij = ( PASTEMAC(chx,ctype)* )(x) + ii*(rs_x) + jj*(cs_x); \
+			PASTEMAC(chy,ctype)* restrict yij = ( PASTEMAC(chy,ctype)* )(y) + ii*(rs_y) + jj*(cs_y); \
+\
+			bli_taxpbys( cha,chx,chb,chy,chc, *(alpha), *xij, *(beta), *yij ); \
+		} \
+	} \
+}
+
+// -- Legacy macros ------------------------------------------------------------
+// Same-type shorthands for bli_taxpbys: all four operands share one type character; chc is s for s/c and d for d/z.
+#define bli_saxpbys( a, x, b, y ) bli_taxpbys( s,s,s,s,s, a, x, b, y )
+#define bli_daxpbys( a, x, b, y ) bli_taxpbys( d,d,d,d,d, a, x, b, y )
+#define bli_caxpbys( a, x, b, y ) bli_taxpbys( c,c,c,c,s, a, x, b, y )
+#define bli_zaxpbys( a, x, b, y ) bli_taxpbys( z,z,z,z,d, a, x, b, y )
+
+// -- Notes --------------------------------------------------------------------
+
+#endif
+
diff --git a/frame/include/level0/bli_taxpys.h b/frame/include/level0/bli_taxpys.h
new file mode 100644
index 000000000..df81cc102
--- /dev/null
+++ b/frame/include/level0/bli_taxpys.h
@@ -0,0 +1,241 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TAXPYS_H
+#define BLIS_TAXPYS_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (tr) := (yr) + (ar) * (xr) - (ai) * (xi);
+// (ti) := (yi) + (ai) * (xr) + (ar) * (xi);
+// (yr) := (tr);
+// (yi) := (ti);
+
+#define bli_taxpyims( \
+          \
+          da, pa, ar, ai, \
+          dx, px, xr, xi, \
+          dy, py, yr, yi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(c,declinits) /* results go to tr/ti first (presumably declared here) */ \
+	( \
+	  py, \
+	  PASTEMAC(chc,py,tcast)( /* real part: yr + (a*x).r */ \
+	    PASTEMAC(chc,add)( \
+	      PASTEMAC(py,chc,tcast)(yr), \
+	      PASTEMAC(chc,sub)( \
+	        PASTEMAC(da,dx,termrr)( \
+	          chc, \
+	          PASTEMAC(chc,mul)( \
+	            PASTEMAC(pa,chc,tcast)(ar), \
+	            PASTEMAC(px,chc,tcast)(xr) \
+	          )  \
+	        ), \
+	        PASTEMAC(da,dx,termii)( \
+	          chc, \
+	          PASTEMAC(chc,mul)( \
+	            PASTEMAC(pa,chc,tcast)(ai), \
+	            PASTEMAC(px,chc,tcast)(xi) \
+	          ) \
+	        ) \
+	      ) \
+	    ) \
+	  ),\
+	  PASTEMAC(chc,py,tcast)( /* imaginary part: yi + (a*x).i */ \
+	    PASTEMAC(chc,add)( \
+	      PASTEMAC(py,chc,tcast)(yi), \
+	      PASTEMAC(chc,add)( \
+	        PASTEMAC(da,dx,termir)( \
+	          chc, \
+	          PASTEMAC(chc,mul)( \
+	            PASTEMAC(pa,chc,tcast)(ai), \
+	            PASTEMAC(px,chc,tcast)(xr) \
+	          )  \
+	        ), \
+	        PASTEMAC(da,dx,termri)( \
+	          chc, \
+	          PASTEMAC(chc,mul)( \
+	            PASTEMAC(pa,chc,tcast)(ar), \
+	            PASTEMAC(px,chc,tcast)(xi) \
+	          ) \
+	        ) \
+	      ) \
+	    ) \
+	  ), \
+	  tr, \
+	  ti \
+	); \
+	PASTEMAC(dy,assigns) /* hand tr/ti to the assigns variant selected by y's domain */ \
+	( \
+	  tr, \
+	  ti, \
+	  yr, \
+	  yi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// taxpys: y := y + a * x on whole scalars of types cha/chx/chy; extracts real/imag parts and passes chc's precision to bli_taxpyims.
+#define bli_taxpys( cha, chx, chy, chc, a, x, y ) \
+        bli_taxpyims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), \
+          PASTEMAC(cha,imag)(a), \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y), \
+          PASTEMAC(chc,prec)  \
+        )
+
+// taxpyjs: like bli_taxpys, except the imaginary part of x is negated (in chx's precision) before forwarding, i.e. conj(x) is used.
+#define bli_taxpyjs( cha, chx, chy, chc, a, x, y ) \
+        bli_taxpyims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), \
+          PASTEMAC(cha,imag)(a), \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+            PASTEMAC(chx,imag)(x)  \
+          ), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y), \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Exposed real/imaginary --
+
+// taxpyris: same forwarding to bli_taxpyims as bli_taxpys, but the caller supplies the real/imag parts of a, x, y directly.
+#define bli_taxpyris( cha, chx, chy, chc, ar, ai, xr, xi, yr, yi ) \
+        bli_taxpyims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+                     ar, \
+                     ai, \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi, \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// taxpyjris: like bli_taxpyris, except xi is negated (in chx's precision) before being forwarded to bli_taxpyims.
+#define bli_taxpyjris( cha, chx, chy, chc, ar, ai, xr, xi, yr, yi ) \
+        bli_taxpyims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+                     ar, \
+                     ai, \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+                     xi ), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- Legacy macros ------------------------------------------------------------
+// Same-type shorthands for bli_taxpys: a, x, and y share one type character; chc is s for s/c and d for d/z.
+#define bli_saxpys( a, x, y ) bli_taxpys( s,s,s,s, a, x, y )
+#define bli_daxpys( a, x, y ) bli_taxpys( d,d,d,d, a, x, y )
+#define bli_caxpys( a, x, y ) bli_taxpys( c,c,c,s, a, x, y )
+#define bli_zaxpys( a, x, y ) bli_taxpys( z,z,z,d, a, x, y )
+
+// -- Notes --------------------------------------------------------------------
+
+// -- Domain cases --
+
+//   r       r      r
+// (yr) += (ar) * (xr) -   0  *   0 ;
+// (yi) xx   0  * (xr) + (ar) *   0 ;
+
+//   r       r      c
+// (yr) += (ar) * (xr) -   0  * (xi);
+// (yi) xx   0  * (xr) + (ar) * (xi);
+
+//   r       c      r
+// (yr) += (ar) * (xr) - (ai) *   0 ;
+// (yi) xx (ai) * (xr) + (ar) *   0 ;
+
+//   r       c      c
+// (yr) += (ar) * (xr) - (ai) * (xi);
+// (yi) xx (ai) * (xr) + (ar) * (xi);
+
+//   c       r      r
+// (yr) += (ar) * (xr) -   0  *   0 ;
+// (yi) +=   0  * (xr) + (ar) *   0 ;
+
+//   c       r      c
+// (yr) += (ar) * (xr) -   0  * (xi);
+// (yi) +=   0  * (xr) + (ar) * (xi);
+
+//   c       c      r
+// (yr) += (ar) * (xr) - (ai) *   0 ;
+// (yi) += (ai) * (xr) + (ar) *   0 ;
+
+//   c       c      c
+// (yr) += (ar) * (xr) - (ai) * (xi);
+// (yi) += (ai) * (xr) + (ar) * (xi);
+
+#endif
+
diff --git a/frame/include/level0/1r/bli_scal21rs.h b/frame/include/level0/bli_tconjs.h
similarity index 60%
rename from frame/include/level0/1r/bli_scal21rs.h
rename to frame/include/level0/bli_tconjs.h
index 2f0590a62..5706f6886 100644
--- a/frame/include/level0/1r/bli_scal21rs.h
+++ b/frame/include/level0/bli_tconjs.h
@@ -32,34 +32,57 @@
 
 */
 
-#ifndef BLIS_SCAL21RS_H
-#define BLIS_SCAL21RS_H
+#ifndef BLIS_TCONJS_H
+#define BLIS_TCONJS_H
 
-// scal21rs
+// -- Implementation macro -----------------------------------------------------
 
-#define bli_cscscal21rs( a, x, yr, yi ) \
-{ \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \
-}
+// (xr) :=  (xr)
+// (xi) := -(xi)
 
-#define bli_cccscal21rs( a, x, yr, yi ) \
+#define bli_tconjims( \
+          \
+          dx, px, xr, xi \
+        ) \
 { \
-	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \
+	PASTEMAC(dx,assigns) \
+	( \
+	                   xr, \
+	  PASTEMAC(px,neg)(xi), /* conjugation: xr is passed back unchanged, xi is negated in px */ \
+	                   xr, \
+	                   xi \
+	); \
 }
 
-#define bli_zdzscal21rs( a, x, yr, yi ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \
-}
+// -- API macros ---------------------------------------------------------------
 
-#define bli_zzzscal21rs( a, x, yr, yi ) \
-{ \
-	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \
-}
+// -- Consolidated --
+
+// tconjs: in-place conjugation of a whole scalar x of type chx; extracts its real/imag parts and forwards them to bli_tconjims.
+#define bli_tconjs( chx, x ) \
+        bli_tconjims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x) \
+        )
+
+// -- Exposed real/imaginary --
+
+// tconjris: same forwarding to bli_tconjims as bli_tconjs, but the caller supplies the real/imag parts (xr, xi) directly.
+#define bli_tconjris( chx, xr, xi ) \
+        bli_tconjims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi \
+        )
 
+// -- Higher-level static functions --------------------------------------------
 
-#define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi )
-#define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi )
+// -- Notes --------------------------------------------------------------------
 
 #endif
 
diff --git a/frame/include/level0/bli_tcopycjs.h b/frame/include/level0/bli_tcopycjs.h
new file mode 100644
index 000000000..74f843833
--- /dev/null
+++ b/frame/include/level0/bli_tcopycjs.h
@@ -0,0 +1,129 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TCOPYCJS_H
+#define BLIS_TCOPYCJS_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (yr) := (xr);
+// (yi) := ( is_conj( conj ) ? -(xi) : (xi) );  (xi is negated in px, its own precision, and only then tcast to py)
+
+#define bli_tcopycjims( \
+          \
+          conj, \
+          dx, px, xr, xi, \
+          dy, py, yr, yi \
+        ) \
+{ \
+	PASTEMAC(dy,assigns) \
+	( \
+	  PASTEMAC(px,py,tcast)(xr), \
+	  ( bli_is_conj( conj ) ? PASTEMAC(px,py,tcast)( \
+	                            PASTEMAC(px,neg)(xi) \
+	                          ) \
+	                        :   PASTEMAC(px,py,tcast)(xi) \
+	  ), \
+	  yr, \
+	  yi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tcopycjs: copies whole scalar x (type chx) into y (type chy), conjugating when bli_is_conj( conj ) holds; forwards to bli_tcopycjims.
+#define bli_tcopycjs( chx, chy, conj, x, y ) \
+        bli_tcopycjims \
+        ( \
+          conj, \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y) \
+        )
+
+// -- Exposed real/imaginary --
+
+// tcopycjris: same forwarding to bli_tcopycjims as bli_tcopycjs, but the caller supplies the real/imag parts (xr, xi, yr, yi) directly.
+#define bli_tcopycjris( chx, chy, conj, xr, xi, yr, yi ) \
+        bli_tcopycjims \
+        ( \
+          conj, \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi, \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- Legacy macros ------------------------------------------------------------
+// Same-type shorthands for bli_tcopycjs: x and y share one type character (s, d, c, or z).
+#define bli_scopycjs( conj, x, y ) bli_tcopycjs( s,s, conj, x, y )
+#define bli_dcopycjs( conj, x, y ) bli_tcopycjs( d,d, conj, x, y )
+#define bli_ccopycjs( conj, x, y ) bli_tcopycjs( c,c, conj, x, y )
+#define bli_zcopycjs( conj, x, y ) bli_tcopycjs( z,z, conj, x, y )
+
+// -- Notes --------------------------------------------------------------------
+
+// -- Domain cases --
+
+//   r       r
+// (yr) := (xr);
+// (yi) xx   0 ;
+
+//   r       c
+// (yr) := (xr);
+// (yi) xx (xi);
+
+//   c       r
+// (yr) := (xr);
+// (yi) :=   0 ;
+
+//   c       c
+// (yr) := (xr);
+// (yi) := (xi);
+
+#endif
+
diff --git a/frame/include/level0/bli_tcopynzs.h b/frame/include/level0/bli_tcopynzs.h
new file mode 100644
index 000000000..b777da9d4
--- /dev/null
+++ b/frame/include/level0/bli_tcopynzs.h
@@ -0,0 +1,191 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TCOPYNZS_H
+#define BLIS_TCOPYNZS_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (yr) := (xr);
+// if ( is_complex( x ) )
+//     (yi) := (xi);
+// Dispatches on the (dx,dy) domain pair to one of the bli_{rr,rc,cr,cc}copynzims variants, passing all arguments through.
+#define bli_tcopynzims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi \
+        ) \
+{ \
+	PASTEMAC(dx,dy,copynzims) \
+	( \
+	  dx, px, xr, xi, \
+	  dy, py, yr, yi \
+	); \
+}
+
+// -- real-to-real domain implementation --
+// Ignores dx/dy and always uses PASTEMAC(r,assigns); per the domain notes in this file only yr is meant to be written.
+#define bli_rrcopynzims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi \
+        ) \
+{ \
+	PASTEMAC(r,assigns) \
+	( \
+	  PASTEMAC(px,py,tcast)(xr), \
+	  PASTEMAC(px,py,tcast)(xi), \
+	  yr, \
+	  yi \
+	); \
+}
+
+// -- complex-to-real domain implementation --
+// -- real-to-complex domain implementation --
+
+// NOTE: Normally, the real-to-complex case would take place in the complex
+// domain (in that an implicit zero would be copied to y.imag), but since
+// this is copynz, we avoid updating the imaginary parts of complex y when
+// x is real. Thus, real-to-complex ends up getting implemented the same as
+// real-to-real (and complex-to-real).
+// (The two-letter prefix is <dx><dy>, matching the PASTEMAC(dx,dy,copynzims) dispatch in bli_tcopynzims.)
+#define bli_rccopynzims bli_rrcopynzims
+#define bli_crcopynzims bli_rrcopynzims
+
+// -- complex-to-complex domain implementation --
+// Both operands complex: uses PASTEMAC(c,assigns), so the tcast imaginary part is passed on for storing as well as the real part.
+#define bli_cccopynzims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi \
+        ) \
+{ \
+	PASTEMAC(c,assigns) \
+	( \
+	  PASTEMAC(px,py,tcast)(xr), \
+	  PASTEMAC(px,py,tcast)(xi), \
+	  yr, \
+	  yi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tcopynzs: copies whole scalar x (type chx) into y (type chy) via bli_tcopynzims, which leaves y's imaginary part alone unless both are complex.
+#define bli_tcopynzs( chx, chy, x, y ) \
+        bli_tcopynzims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y) \
+        )
+
+// tcopyjnzs: like bli_tcopynzs, except the imaginary part of x is negated (in chx's precision) before forwarding, i.e. conj(x) is copied.
+#define bli_tcopyjnzs( chx, chy, x, y ) \
+        bli_tcopynzims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+            PASTEMAC(chx,imag)(x)  \
+          ), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y) \
+        )
+
+// -- Exposed real/imaginary --
+
+// tcopynzris: same forwarding to bli_tcopynzims as bli_tcopynzs, but the caller supplies the real/imag parts (xr, xi, yr, yi) directly.
+#define bli_tcopynzris( chx, chy, xr, xi, yr, yi ) \
+        bli_tcopynzims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi, \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi \
+        )
+
+// tcopyjnzris: like bli_tcopynzris, except xi is negated (in chx's precision) before being forwarded to bli_tcopynzims.
+#define bli_tcopyjnzris( chx, chy, xr, xi, yr, yi ) \
+        bli_tcopynzims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+                     xi ), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- Notes --------------------------------------------------------------------
+
+// -- Domain cases --
+
+//   r       r
+// (yr) := (xr);
+// (yi) xx   0 ;
+
+//   r       c
+// (yr) := (xr);
+// (yi) xx (xi);
+
+//   c       r
+// (yr) := (xr);
+// (yi) xx   0 ;    // NOTE: This is what copynzs does differently from copys.
+
+//   c       c
+// (yr) := (xr);
+// (yi) := (xi);
+
+#endif
+
diff --git a/frame/include/level0/bli_tcopys.h b/frame/include/level0/bli_tcopys.h
new file mode 100644
index 000000000..70722a3d3
--- /dev/null
+++ b/frame/include/level0/bli_tcopys.h
@@ -0,0 +1,254 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TCOPYS_H
+#define BLIS_TCOPYS_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (yr) := (xr);
+// (yi) := (xi);
+
+#define bli_tcopyims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi \
+        ) \
+{ \
+	PASTEMAC(dy,assigns) \
+	( \
+	  PASTEMAC(px,py,tcast)(xr), \
+	  PASTEMAC(px,py,tcast)(xi), \
+	  yr, \
+	  yi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tcopys
+#define bli_tcopys( chx, chy, x, y ) \
+        bli_tcopyims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y) \
+        )
+
+// tcopyjs
+#define bli_tcopyjs( chx, chy, x, y ) \
+        bli_tcopyims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+            PASTEMAC(chx,imag)(x)  \
+          ), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y) \
+        )
+
+// -- Exposed real/imaginary --
+
+// tcopyris
+#define bli_tcopyris( chx, chy, xr, xi, yr, yi ) \
+        bli_tcopyims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi, \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi \
+        )
+
+// tcopyjris
+#define bli_tcopyjris( chx, chy, xr, xi, yr, yi ) \
+        bli_tcopyims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+                     xi ), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi \
+        )
+
+// -- 1e / 1r --
+
+// tcopy1es: yri := ( xr, xi ); yir := ( -xi, xr )
+#define bli_tcopy1es( chx, chy, x, yri, yir ) \
+        bli_tcopyims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(yri), \
+          PASTEMAC(chy,imag)(yri) \
+        ); \
+        bli_tcopyims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+            PASTEMAC(chx,imag)(x)  \
+          ), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(yir), \
+          PASTEMAC(chy,imag)(yir) \
+        )
+
+// tcopyj1es: yri := ( xr, -xi ); yir := ( xi, xr )
+#define bli_tcopyj1es( chx, chy, x, yri, yir ) \
+        bli_tcopyims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+            PASTEMAC(chx,imag)(x)  \
+          ), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(yri), \
+          PASTEMAC(chy,imag)(yri) \
+        ); \
+        bli_tcopyims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(yir), \
+          PASTEMAC(chy,imag)(yir) \
+        )
+
+// tcopy1rs
+#define bli_tcopy1rs( chx, chy, x, yr, yi ) \
+        bli_tcopyims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          yr, \
+          yi \
+        )
+
+// tcopyj1rs
+#define bli_tcopyj1rs( chx, chy, x, yr, yi ) \
+        bli_tcopyims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+            PASTEMAC(chx,imag)(x)  \
+          ), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          yr, \
+          yi \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- mxn --
+
+#define bli_tcopys_mxn( chx, chy, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
+{ \
+	for ( dim_t jj = 0; jj < (n); ++jj ) \
+	for ( dim_t ii = 0; ii < (m); ++ii ) \
+	{ \
+		PASTEMAC(chx,ctype)* restrict xij = ( PASTEMAC(chx,ctype)* )(x) + ii*(rs_x) + jj*(cs_x); \
+		PASTEMAC(chy,ctype)* restrict yij = ( PASTEMAC(chy,ctype)* )(y) + ii*(rs_y) + jj*(cs_y); \
+\
+		bli_tcopys( chx,chy, *xij, *yij ); \
+	} \
+}
+
+// -- Legacy macros ------------------------------------------------------------
+
+#define bli_scopys( x, y ) bli_tcopys( s,s, x, y )
+#define bli_dcopys( x, y ) bli_tcopys( d,d, x, y )
+#define bli_ccopys( x, y ) bli_tcopys( c,c, x, y )
+#define bli_zcopys( x, y ) bli_tcopys( z,z, x, y )
+#define bli_icopys( x, y ) bli_tcopys( i,i, x, y )
+
+// -- Notes --------------------------------------------------------------------
+
+// -- Domain cases --
+
+//   r       r
+// (yr) := (xr);
+// (yi) xx   0 ;
+
+//   r       c
+// (yr) := (xr);
+// (yi) xx (xi);
+
+//   c       r
+// (yr) := (xr);
+// (yi) :=   0 ;
+
+//   c       c
+// (yr) := (xr);
+// (yi) := (xi);
+
+#endif
+
diff --git a/frame/include/level0/bli_tdots.h b/frame/include/level0/bli_tdots.h
new file mode 100644
index 000000000..b83c4f3b2
--- /dev/null
+++ b/frame/include/level0/bli_tdots.h
@@ -0,0 +1,120 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TDOTS_H
+#define BLIS_TDOTS_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (cr) += (ar) * (br) - (ai) * (bi);
+// (ci) += (ai) * (br) + (ar) * (bi);
+
+#define bli_tdotims bli_taxpyims
+
+// -- API macros ---------------------------------------------------------------
+
+// NOTE: When defining the tdots macros, we can recycle taxpys since they both
+// perform c += a * b. However, when invoking taxpys, the first two operands
+// passed in must be swapped because in BLIS axpy is set up to conjugate its
+// second operand (ie: the second operand to the a*x product) while dot
+// is set up to conjugate its first operand (ie: the first operand to the x*y
+// product).
+
+// -- Consolidated --
+
+// tdots
+#define  bli_tdots( chx,chy,cha,chc, x, y, a ) \
+        bli_taxpys( chy,chx,cha,chc, y, x, a )
+
+// tdotjs
+#define  bli_tdotjs( chx,chy,cha,chc, x, y, a ) \
+        bli_taxpyjs( chy,chx,cha,chc, y, x, a )
+
+// -- Exposed real/imaginary --
+
+// tdotris (forwards to taxpyris with the x and y operand pairs swapped)
+#define  bli_tdotris( chx,chy,cha,chc, xr, xi, yr, yi, ar, ai ) \
+        bli_taxpyris( chy,chx,cha,chc, yr, yi, xr, xi, ar, ai )
+
+// tdotjris (forwards to taxpyjris with the x and y operand pairs swapped)
+#define  bli_tdotjris( chx,chy,cha,chc, xr, xi, yr, yi, ar, ai ) \
+        bli_taxpyjris( chy,chx,cha,chc, yr, yi, xr, xi, ar, ai )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- Legacy macros ------------------------------------------------------------
+
+#define bli_sdots( x, y, a ) bli_tdots( s,s,s,s, x, y, a )
+#define bli_ddots( x, y, a ) bli_tdots( d,d,d,d, x, y, a )
+#define bli_cdots( x, y, a ) bli_tdots( c,c,c,s, x, y, a )
+#define bli_zdots( x, y, a ) bli_tdots( z,z,z,d, x, y, a )
+
+// -- Notes --------------------------------------------------------------------
+
+// -- Domain cases --
+
+//   r       r      r
+// (yr) += (ar) * (xr) -   0  *   0 ;
+// (yi) xx   0  * (xr) + (ar) *   0 ;
+
+//   r       r      c
+// (yr) += (ar) * (xr) -   0  * (xi);
+// (yi) xx   0  * (xr) + (ar) * (xi);
+
+//   r       c      r
+// (yr) += (ar) * (xr) - (ai) *   0 ;
+// (yi) xx (ai) * (xr) + (ar) *   0 ;
+
+//   r       c      c
+// (yr) += (ar) * (xr) - (ai) * (xi);
+// (yi) xx (ai) * (xr) + (ar) * (xi);
+
+//   c       r      r
+// (yr) += (ar) * (xr) -   0  *   0 ;
+// (yi) +=   0  * (xr) + (ar) *   0 ;
+
+//   c       r      c
+// (yr) += (ar) * (xr) -   0  * (xi);
+// (yi) +=   0  * (xr) + (ar) * (xi);
+
+//   c       c      r
+// (yr) += (ar) * (xr) - (ai) *   0 ;
+// (yi) += (ai) * (xr) + (ar) *   0 ;
+
+//   c       c      c
+// (yr) += (ar) * (xr) - (ai) * (xi);
+// (yi) += (ai) * (xr) + (ar) * (xi);
+
+#endif
+
diff --git a/frame/include/level0/bli_teqs.h b/frame/include/level0/bli_teqs.h
new file mode 100644
index 000000000..d474bffea
--- /dev/null
+++ b/frame/include/level0/bli_teqs.h
@@ -0,0 +1,171 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TEQS_H
+#define BLIS_TEQS_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (xr) == (yr) && (xi) == (yi)
+
+#define bli_teqims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi, \
+          chc  \
+        ) \
+    ( PASTEMAC(PASTEMAC(chc,prec),eq)( PASTEMAC(px,chc,tcast)(xr), \
+                                       PASTEMAC(py,chc,tcast)(yr) ) && \
+      PASTEMAC(PASTEMAC(chc,prec),eq)( PASTEMAC(px,chc,tcast)(xi), \
+                                       PASTEMAC(py,chc,tcast)(yi) ) )
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// teqs
+#define bli_teqs( chx, chy, chc, x, y ) \
+        bli_teqims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y), \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Exposed real/imaginary --
+
+// teqris
+#define bli_teqris( chx, chy, chc, xr, xi, yr, yi ) \
+        bli_teqims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi, \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Convenience macros -------------------------------------------------------
+
+// -- Exposed real/imaginary --
+
+#define bli_teq1ris( chx, xr, xi ) \
+        bli_teqris \
+        ( \
+          chx, chx, chx, \
+                     xr, \
+                     xi, \
+          PASTEMAC(PASTEMAC(chx,prec),one), \
+          PASTEMAC(PASTEMAC(chx,prec),zero) \
+        )
+
+#define bli_teq0ris( chx, xr, xi ) \
+        bli_teqris \
+        ( \
+          chx, chx, chx, \
+                     xr, \
+                     xi, \
+          PASTEMAC(PASTEMAC(chx,prec),zero), \
+          PASTEMAC(PASTEMAC(chx,prec),zero) \
+        )
+
+#define bli_teqm1ris( chx, xr, xi ) \
+        bli_teqris \
+        ( \
+          chx, chx, chx, \
+                     xr, \
+                     xi, \
+          PASTEMAC(PASTEMAC(chx,prec),mone), \
+          PASTEMAC(PASTEMAC(chx,prec),zero) \
+        )
+
+// -- Consolidated --
+
+#define bli_teq1s( chx, x ) \
+        bli_teq1ris \
+        ( \
+          chx, \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x) \
+        )
+
+#define bli_teq0s( chx, x ) \
+        bli_teq0ris \
+        ( \
+          chx, \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x) \
+        )
+
+#define bli_teqm1s( chx, x ) \
+        bli_teqm1ris \
+        ( \
+          chx, \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x)  \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- Legacy macros ------------------------------------------------------------
+
+#define bli_seqs( x, y ) bli_teqs( s,s,s, x, y )
+#define bli_deqs( x, y ) bli_teqs( d,d,d, x, y )
+#define bli_ceqs( x, y ) bli_teqs( c,c,c, x, y )
+#define bli_zeqs( x, y ) bli_teqs( z,z,z, x, y )
+
+#define bli_seq1( x ) bli_teq1s( s, x )
+#define bli_deq1( x ) bli_teq1s( d, x )
+#define bli_ceq1( x ) bli_teq1s( c, x )
+#define bli_zeq1( x ) bli_teq1s( z, x )
+
+#define bli_seq0( x ) bli_teq0s( s, x )
+#define bli_deq0( x ) bli_teq0s( d, x )
+#define bli_ceq0( x ) bli_teq0s( c, x )
+#define bli_zeq0( x ) bli_teq0s( z, x )
+
+// -- Notes --------------------------------------------------------------------
+
+#endif
+
diff --git a/frame/include/level0/bli_fprints.h b/frame/include/level0/bli_tfprints.h
similarity index 52%
rename from frame/include/level0/bli_fprints.h
rename to frame/include/level0/bli_tfprints.h
index c52cddfc9..2616d1e3f 100644
--- a/frame/include/level0/bli_fprints.h
+++ b/frame/include/level0/bli_tfprints.h
@@ -32,37 +32,82 @@
 
 */
 
-#ifndef BLIS_FPRINTS_H
-#define BLIS_FPRINTS_H
+#ifndef BLIS_TFPRINTS_H
+#define BLIS_TFPRINTS_H
 
-// prints
+// -- Implementation macro -----------------------------------------------------
 
-#define bli_sfprints( file, spec, x ) \
-{ \
-	fprintf( file, spec, (x) ); \
-}
-#define bli_dfprints( file, spec, x ) \
+// -- real domain implementation --
+
+#define bli_rfprintims( \
+          \
+          file, spec, \
+          dx, px, xr, xi \
+        ) \
 { \
-	fprintf( file, spec, (x) ); \
+	fprintf( file, spec, xr ); \
 }
-#define bli_cfprints( file, spec, x ) \
-{ \
-	fprintf( file, spec, bli_creal(x) ); \
-	fprintf( file, " + " ); \
-	fprintf( file, spec, bli_cimag(x) ); \
-	fprintf( file, " " ); \
-}
-#define bli_zfprints( file, spec, x ) \
+
+// -- complex domain implementation --
+
+#define bli_cfprintims( \
+          \
+          file, spec, \
+          dx, px, xr, xi \
+        ) \
 { \
-	fprintf( file, spec, bli_zreal(x) ); \
+	fprintf( file, spec, xr ); \
 	fprintf( file, " + " ); \
-	fprintf( file, spec, bli_zimag(x) ); \
-	fprintf( file, " " ); \
+	fprintf( file, spec, xi ); \
+	fprintf( file, "i" ); \
 }
-#define bli_ifprints( file, spec, x ) \
+
+// -- general implementation --
+
+#define bli_tfprintims( \
+          \
+          file, spec, \
+          dx, px, xr, xi \
+        ) \
 { \
-	fprintf( file, spec, (x) ); \
+	PASTEMAC(dx,fprintims) \
+	( \
+	  file, spec, \
+	  dx, px, xr, xi \
+	); \
 }
 
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tfprints
+#define bli_tfprints( chx, file, spec, x ) \
+        bli_tfprintims \
+        ( \
+          file, spec, \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x) \
+        )
+
+// -- Exposed real/imaginary --
+
+// tfprintris
+#define bli_tfprintris( chx, file, spec, xr, xi ) \
+        bli_tfprintims \
+        ( \
+          file, spec, \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- Notes --------------------------------------------------------------------
 
 #endif
+
diff --git a/frame/include/level0/bli_setrs.h b/frame/include/level0/bli_tgets.h
similarity index 57%
rename from frame/include/level0/bli_setrs.h
rename to frame/include/level0/bli_tgets.h
index 6a5b4a3f8..d052c8f23 100644
--- a/frame/include/level0/bli_setrs.h
+++ b/frame/include/level0/bli_tgets.h
@@ -32,45 +32,70 @@
 
 */
 
-#ifndef BLIS_SETRS_H
-#define BLIS_SETRS_H
-
-// setrs
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-#define bli_sssetrs( xr, y )  { (y) = (xr); }
-#define bli_dssetrs( xr, y )  { (y) = (xr); }
-
-#define bli_sdsetrs( xr, y )  { (y) = (xr); }
-#define bli_ddsetrs( xr, y )  { (y) = (xr); }
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scsetrs( xr, y )  { bli_creal(y) = (xr); }
-#define bli_dcsetrs( xr, y )  { bli_creal(y) = (xr); }
-
-#define bli_szsetrs( xr, y )  { bli_zreal(y) = (xr); }
-#define bli_dzsetrs( xr, y )  { bli_zreal(y) = (xr); }
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_scsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }
-#define bli_dcsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }
-
-#define bli_szsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }
-#define bli_dzsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_ssetrs( xr, y )  bli_sssetrs( xr, y )
-#define bli_dsetrs( xr, y )  bli_ddsetrs( xr, y )
-#define bli_csetrs( xr, y )  bli_scsetrs( xr, y )
-#define bli_zsetrs( xr, y )  bli_dzsetrs( xr, y )
-
+#ifndef BLIS_TGETS_H
+#define BLIS_TGETS_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (yr) := (xr)
+// (yi) := (xi)
+
+// NOTE: always assign the imaginary component, even for real y
+
+#define bli_tgetims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi \
+        ) \
+{ \
+	PASTEMAC(c,assigns) \
+	( \
+	  PASTEMAC(px,py,tcast)(xr), \
+	  PASTEMAC(px,py,tcast)(xi), \
+	  yr, \
+	  yi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Hybrid --
+
+// tgets
+#define bli_tgets( chx, chy, x, yr, yi ) \
+        bli_tgetims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- Notes --------------------------------------------------------------------
+
+// -- Domain cases --
+
+//   r       r
+// (yr) := (xr);
+// (yi) :=   0 ;
+
+//   r       c
+// (yr) := (xr);
+// (yi) := (xi);
+
+//   c       r
+// (yr) := (xr);
+// (yi) :=   0 ;
+
+//   c       c
+// (yr) := (xr);
+// (yi) := (xi);
 
 #endif
 
diff --git a/frame/include/level0/bli_tinverts.h b/frame/include/level0/bli_tinverts.h
new file mode 100644
index 000000000..701a33599
--- /dev/null
+++ b/frame/include/level0/bli_tinverts.h
@@ -0,0 +1,226 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TINVERTS_H
+#define BLIS_TINVERTS_H
+
+// -- Implementation macro -----------------------------------------------------
+
+#define bli_tinvertims( \
+          \
+          dx, px, xr, xi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(dx,invertims) \
+	( \
+	  dx, px, xr, xi, \
+	  chc  \
+	); \
+}
+
+// -- real domain implementation --
+
+// (xr) = 1.0 / (xr);
+
+#define bli_rinvertims( \
+          \
+          dx, px, xr, xi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(r,assigns) \
+	( \
+	  PASTEMAC(chc,px,tcast)( \
+	    PASTEMAC(chc,div)( \
+	      PASTEMAC(chc,one), \
+	      PASTEMAC(px,chc,tcast)(xr)  \
+	    ) \
+	  ),\
+	  PASTEMAC(px,zero), \
+	  xr, \
+	  xi \
+	); \
+}
+
+// -- complex domain implementation --
+
+// sr    = maxabs( xr, xi );
+// xsr   = xr / sr;
+// xsi   = xi / sr;
+// tempr = xr * xsr + xi * xsi
+// xr    =  xsr / tempr;
+// xi    = -xsi / tempr;
+
+#define bli_cinvertims( \
+          \
+          dx, px, xr, xi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(ro,declinits) \
+	( \
+	  chc, \
+	  PASTEMAC(chc,maxabs)( \
+	    PASTEMAC(px,chc,tcast)(xr), \
+	    PASTEMAC(px,chc,tcast)(xi)  \
+	  ), \
+	  sr  \
+	) \
+	PASTEMAC(c,declinits) \
+	( \
+	  chc, \
+	  PASTEMAC(chc,div)( \
+	    PASTEMAC(px,chc,tcast)(xr), \
+	    sr  \
+	  ), \
+	  PASTEMAC(chc,div)( \
+	    PASTEMAC(px,chc,tcast)(xi), \
+	    sr  \
+	  ), \
+	  xsr, \
+	  xsi \
+	) \
+	PASTEMAC(ro,declinits) \
+	( \
+	  chc, \
+	  PASTEMAC(chc,add)( \
+	    PASTEMAC(chc,mul)( \
+	      PASTEMAC(px,chc,tcast)(xr), \
+	      xsr  \
+	    ), \
+	    PASTEMAC(chc,mul)( \
+	      PASTEMAC(px,chc,tcast)(xi), \
+	      xsi  \
+	    ) \
+	  ), \
+	  tempr  \
+	) \
+	PASTEMAC(c,assigns) \
+	( \
+	  PASTEMAC(chc,px,tcast)( \
+	    PASTEMAC(chc,div)( \
+	      xsr, \
+	      tempr  \
+	    ) \
+	  ),\
+	  PASTEMAC(chc,px,tcast)( \
+	    PASTEMAC(chc,div)( \
+	      PASTEMAC(PASTEMAC(chc,prec),neg)(xsi), \
+	      tempr  \
+	    ) \
+	  ),\
+	  xr, \
+	  xi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tinverts
+#define bli_tinverts( chx, chc, x ) \
+        bli_tinvertims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Exposed real/imaginary --
+
+// tinvertris
+#define bli_tinvertris( chx, chc, xr, xi ) \
+        bli_tinvertims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- 1e / 1r --
+
+// tinvert1es
+#define bli_tinvert1es( chx, chc, xri, xir ) \
+        bli_tinvertims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(xri), \
+          PASTEMAC(chx,imag)(xri), \
+          PASTEMAC(chc,prec)  \
+        ); \
+        bli_tcopyims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+            PASTEMAC(chx,imag)(xri)  \
+          ), \
+          PASTEMAC(chx,real)(xri), \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(xir), \
+          PASTEMAC(chx,imag)(xir) \
+        )
+
+// tinvert1rs
+#define bli_tinvert1rs( chx, chc, xr, xi ) \
+        bli_tinvertims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          xr, \
+          xi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- Legacy macros ------------------------------------------------------------
+
+#define bli_sinverts( x ) bli_tinverts( s,s, x )
+#define bli_dinverts( x ) bli_tinverts( d,d, x )
+#define bli_cinverts( x ) bli_tinverts( c,c, x )
+#define bli_zinverts( x ) bli_tinverts( z,z, x )
+
+// -- Notes --------------------------------------------------------------------
+
+#endif
+
diff --git a/frame/include/level0/bli_tinvscals.h b/frame/include/level0/bli_tinvscals.h
new file mode 100644
index 000000000..cdf0f3196
--- /dev/null
+++ b/frame/include/level0/bli_tinvscals.h
@@ -0,0 +1,264 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TINVSCALS_H
+#define BLIS_TINVSCALS_H
+
+// -- Implementation macro -----------------------------------------------------
+
+#define bli_tinvscalims( \
+          \
+          da, pa, ar, ai, \
+          dx, px, xr, xi, \
+          chc  \
+        ) \
+{ /* pastes da and dx into the callee name; this file defines the rr/rc/cr/cc variants */ \
+	PASTEMAC(da,dx,invscalims) \
+	( \
+	  da, pa, ar, ai, \
+	  dx, px, xr, xi, \
+	  chc  \
+	); \
+}
+
+// -- real-real domain implementation --
+// -- real-complex domain implementation --
+
+// (xr) = (xr) / (ar);   each operand goes through a tcast to chc, the quotient through a tcast to px
+// (xi) = (xi) / (ar);   ai is never referenced here (only the real part of a is used)
+
+#define bli_rrinvscalims bli_rcinvscalims
+
+#define bli_rcinvscalims( \
+          \
+          da, pa, ar, ai, \
+          dx, px, xr, xi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(dx,assigns) \
+	( \
+	  PASTEMAC(chc,px,tcast)( \
+	    PASTEMAC(chc,div)( \
+	      PASTEMAC(px,chc,tcast)(xr),  \
+	      PASTEMAC(pa,chc,tcast)(ar)  \
+	    ) \
+	  ),\
+	  PASTEMAC(chc,px,tcast)( \
+	    PASTEMAC(chc,div)( \
+	      PASTEMAC(px,chc,tcast)(xi),  \
+	      PASTEMAC(pa,chc,tcast)(ar)  \
+	    ) \
+	  ),\
+	  xr, \
+	  xi \
+	); \
+}
+
+// -- complex-real domain implementation --
+// -- complex-complex domain implementation --
+
+// sr    = maxabs( ar, ai );            (a is divided by sr before the denominator is formed)
+// asr   = ar / sr;
+// asi   = ai / sr;
+// xrt   = xr;                          (xr taken through tcast to chc; used in both results)
+// tempr = ar * asr + ai * asi;
+// xr    = ( asr * xrt + asi * xi  ) / tempr;
+// xi    = ( asr * xi  - asi * xrt ) / tempr;
+
+#define bli_crinvscalims bli_ccinvscalims
+
+#define bli_ccinvscalims( \
+          \
+          da, pa, ar, ai, \
+          dx, px, xr, xi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(ro,declinits) \
+	( \
+	  chc, \
+	  PASTEMAC(chc,maxabs)( \
+	    PASTEMAC(pa,chc,tcast)(ar), \
+	    PASTEMAC(pa,chc,tcast)(ai)  \
+	  ), \
+	  sr  \
+	) \
+	PASTEMAC(c,declinits) \
+	( \
+	  chc, \
+	  PASTEMAC(chc,div)( \
+	    PASTEMAC(pa,chc,tcast)(ar), \
+	    sr  \
+	  ), \
+	  PASTEMAC(chc,div)( \
+	    PASTEMAC(pa,chc,tcast)(ai), \
+	    sr  \
+	  ), \
+	  asr, \
+	  asi \
+	) \
+	PASTEMAC(ro,declinits) \
+	( \
+	  chc, \
+	  PASTEMAC(px,chc,tcast)(xr), \
+	  xrt  \
+	) \
+	PASTEMAC(ro,declinits) \
+	( \
+	  chc, \
+	  PASTEMAC(chc,add)( \
+	    PASTEMAC(chc,mul)( \
+	      PASTEMAC(pa,chc,tcast)(ar), \
+	      asr  \
+	    ), \
+	    PASTEMAC(chc,mul)( \
+	      PASTEMAC(pa,chc,tcast)(ai), \
+	      asi  \
+	    ) \
+	  ), \
+	  tempr  \
+	) \
+	PASTEMAC(dx,assigns) \
+	( \
+	  PASTEMAC(chc,px,tcast)( \
+	    PASTEMAC(chc,div)( \
+	      PASTEMAC(chc,add)( \
+	        PASTEMAC(chc,mul)( \
+	          asr, \
+              xrt  \
+            ), \
+	        PASTEMAC(chc,mul)( \
+	          asi, \
+	          PASTEMAC(px,chc,tcast)(xi)  \
+            )  \
+          ), \
+	      tempr  \
+	    ) \
+	  ),\
+	  PASTEMAC(chc,px,tcast)( \
+	    PASTEMAC(chc,div)( \
+	      PASTEMAC(chc,sub)( \
+	        PASTEMAC(chc,mul)( \
+	          asr, \
+	          PASTEMAC(px,chc,tcast)(xi)  \
+            ), \
+	        PASTEMAC(chc,mul)( \
+	          asi, \
+              xrt  \
+            )  \
+          ), \
+	      tempr  \
+	    ) \
+	  ),\
+	  xr, \
+	  xi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tinvscals: x := x / a
+#define bli_tinvscals( cha, chx, chc, a, x ) \
+        bli_tinvscalims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), \
+          PASTEMAC(cha,imag)(a), \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chc,prec)  \
+        )
+
+// tinvscaljs: x := x / conj(a) (imag(a) is negated before the call)
+#define bli_tinvscaljs( cha, chx, chc, a, x ) \
+        bli_tinvscalims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), \
+          PASTEMAC(PASTEMAC(cha,prec),neg)( \
+            PASTEMAC(cha,imag)(a)  \
+          ), \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Exposed real/imaginary --
+
+// tinvscalris: x := x / a, with a and x given as separate real/imag parts
+#define bli_tinvscalris( cha, chx, chc, ar, ai, xr, xi ) \
+        bli_tinvscalims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+                     ar, \
+                     ai, \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// tinvscaljris: x := x / conj(a) on separate real/imag parts (ai is negated before the call)
+#define bli_tinvscaljris( cha, chx, chc, ar, ai, xr, xi ) \
+        bli_tinvscalims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+                     ar, \
+          PASTEMAC(PASTEMAC(cha,prec),neg)( \
+                     ai ), \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- Notes --------------------------------------------------------------------
+
+#endif
+
diff --git a/frame/include/level0/old/ri3/bli_scal2ri3s.h b/frame/include/level0/bli_tneg2s.h
similarity index 53%
rename from frame/include/level0/old/ri3/bli_scal2ri3s.h
rename to frame/include/level0/bli_tneg2s.h
index 72f3911cc..8c118f2cf 100644
--- a/frame/include/level0/old/ri3/bli_scal2ri3s.h
+++ b/frame/include/level0/bli_tneg2s.h
@@ -32,48 +32,70 @@
 
 */
 
-#ifndef BLIS_SCAL2RI3S_H
-#define BLIS_SCAL2RI3S_H
+#ifndef BLIS_TNEG2S_H
+#define BLIS_TNEG2S_H
 
-// scal2ri3s
+// -- Implementation macro -----------------------------------------------------
 
-#define bli_sscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \
-{ \
-	(yr) = (ar) * (xr); \
-}
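+// (yr) = -(xr);   each part of x goes through PASTEMAC(px,py,tcast) first,
+// (yi) = -(xi);   then PASTEMAC(py,neg) is applied to the result.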
 
-#define bli_dscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \
+#define bli_tneg2ims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi \
+        ) \
 { \
-	(yr) = (ar) * (xr); \
+	PASTEMAC(dy,assigns) \
+	( \
+	  PASTEMAC(py,neg)( \
+	    PASTEMAC(px,py,tcast)(xr) \
+	  ), \
+	  PASTEMAC(py,neg)( \
+	    PASTEMAC(px,py,tcast)(xi) \
+	  ), \
+	  yr, \
+	  yi \
+	); \
 }
 
-#define bli_cscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \
-{ \
-	(yr) = (ar) * (xr) - (ai) * (xi); \
-	(yi) = (ai) * (xr) + (ar) * (xi); \
-	(yri) = (yr) + (yi); \
-}
+// -- API macros ---------------------------------------------------------------
 
-#define bli_zscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \
-{ \
-	(yr) = (ar) * (xr) - (ai) * (xi); \
-	(yi) = (ai) * (xr) + (ar) * (xi); \
-	(yri) = (yr) + (yi); \
-}
+// -- Consolidated --
 
-#define bli_scscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \
-{ \
-	(yr) = (ar) * (xr); \
-	(yi) = (ar) * (xi); \
-	(yri) = (yr) + (yi); \
-}
+// tneg2s: y := -x
+#define bli_tneg2s( chx, chy, x, y ) \
+        bli_tneg2ims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y) \
+        )
 
-#define bli_dzscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \
-{ \
-	(yr) = (ar) * (xr); \
-	(yi) = (ar) * (xi); \
-	(yri) = (yr) + (yi); \
-}
+// -- Exposed real/imaginary --
+
+// tneg2ris: y := -x, with x and y given as separate real/imag parts
+#define bli_tneg2ris( chx, chy, xr, xi, yr, yi ) \
+        bli_tneg2ims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi, \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- Notes --------------------------------------------------------------------
 
 #endif
 
diff --git a/frame/include/level0/1r/bli_scal2j1rs.h b/frame/include/level0/bli_trandnp2s.h
similarity index 60%
rename from frame/include/level0/1r/bli_scal2j1rs.h
rename to frame/include/level0/bli_trandnp2s.h
index 9f75b55e6..af321d3d0 100644
--- a/frame/include/level0/1r/bli_scal2j1rs.h
+++ b/frame/include/level0/bli_trandnp2s.h
@@ -32,34 +32,57 @@
 
 */
 
-#ifndef BLIS_SCAL2J1RS_H
-#define BLIS_SCAL2J1RS_H
+#ifndef BLIS_TRANDNP2S_H
+#define BLIS_TRANDNP2S_H
 
-// scal2j1rs
+// -- Implementation macro -----------------------------------------------------
 
-#define bli_cscscal2j1rs( a, x, yr, yi ) \
-{ \
-	bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \
-}
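+// (xr) = randnp2();   PASTEMAC(px,randnp2) is pasted without call parentheses --
+// (xi) = randnp2();   NOTE(review): presumably an expression-like macro; confirm.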
 
-#define bli_cccscal2j1rs( a, x, yr, yi ) \
+#define bli_trandnp2ims( \
+          \
+          dx, px, xr, xi \
+        ) \
 { \
-	bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \
+	PASTEMAC(dx,assigns) \
+	( \
+	  PASTEMAC(px,randnp2),\
+	  PASTEMAC(px,randnp2),\
+	            xr, \
+	            xi \
+	); \
 }
 
-#define bli_zdzscal2j1rs( a, x, yr, yi ) \
-{ \
-	bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \
-}
+// -- API macros ---------------------------------------------------------------
 
-#define bli_zzzscal2j1rs( a, x, yr, yi ) \
-{ \
-	bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \
-}
+// -- Consolidated --
+
+// trandnp2s: hands real(x) and imag(x) to bli_trandnp2ims, which supplies a randnp2 value for each
+#define bli_trandnp2s( chx, x ) \
+        bli_trandnp2ims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x) \
+        )
+
+// -- Exposed real/imaginary --
+
+// trandnp2ris: bli_trandnp2ims on caller-supplied real/imag parts (xr, xi)
+#define bli_trandnp2ris( chx, xr, xi ) \
+        bli_trandnp2ims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi \
+        )
 
+// -- Higher-level static functions --------------------------------------------
 
-#define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi )
-#define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi )
+// -- Notes --------------------------------------------------------------------
 
 #endif
 
diff --git a/frame/include/level0/old/ri3/bli_scal2jri3s.h b/frame/include/level0/bli_trands.h
similarity index 60%
rename from frame/include/level0/old/ri3/bli_scal2jri3s.h
rename to frame/include/level0/bli_trands.h
index 08be57c1d..8b03d0bfd 100644
--- a/frame/include/level0/old/ri3/bli_scal2jri3s.h
+++ b/frame/include/level0/bli_trands.h
@@ -32,48 +32,57 @@
 
 */
 
-#ifndef BLIS_SCAL2JRI3S_H
-#define BLIS_SCAL2JRI3S_H
+#ifndef BLIS_TRANDS_H
+#define BLIS_TRANDS_H
 
-// scal2jri3s
+// -- Implementation macro -----------------------------------------------------
 
-#define bli_sscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \
-{ \
-	(yr) = (ar) * (xr); \
-}
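+// (xr) = rand();   PASTEMAC(px,rand) is pasted without call parentheses --
+// (xi) = rand();   NOTE(review): presumably an expression-like macro; confirm.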
 
-#define bli_dscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \
+#define bli_trandims( \
+          \
+          dx, px, xr, xi \
+        ) \
 { \
-	(yr) = (ar) * (xr); \
+	PASTEMAC(dx,assigns) \
+	( \
+	  PASTEMAC(px,rand),\
+	  PASTEMAC(px,rand),\
+	            xr, \
+	            xi \
+	); \
 }
 
-#define bli_cscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \
-{ \
-	(yr) = (ar) * (xr) + (ai) * (xi); \
-	(yi) = (ai) * (xr) - (ar) * (xi); \
-	(yri) = (yr) + (yi); \
-}
+// -- API macros ---------------------------------------------------------------
 
-#define bli_zscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \
-{ \
-	(yr) = (ar) * (xr) + (ai) * (xi); \
-	(yi) = (ai) * (xr) - (ar) * (xi); \
-	(yri) = (yr) + (yi); \
-}
+// -- Consolidated --
 
-#define bli_scscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \
-{ \
-	(yr) = (ar) *  (xr); \
-	(yi) = (ar) * -(xi); \
-	(yri) = (yr) + (yi); \
-}
+// trands: hands real(x) and imag(x) to bli_trandims, which supplies a rand value for each
+#define bli_trands( chx, x ) \
+        bli_trandims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x) \
+        )
 
-#define bli_dzscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \
-{ \
-	(yr) = (ar) *  (xr); \
-	(yi) = (ar) * -(xi); \
-	(yri) = (yr) + (yi); \
-}
+// -- Exposed real/imaginary --
+
+// trandris: bli_trandims on caller-supplied real/imag parts (xr, xi)
+#define bli_trandris( chx, xr, xi ) \
+        bli_trandims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- Notes --------------------------------------------------------------------
 
 #endif
 
diff --git a/frame/include/level0/bli_tscal2s.h b/frame/include/level0/bli_tscal2s.h
new file mode 100644
index 000000000..6243b030d
--- /dev/null
+++ b/frame/include/level0/bli_tscal2s.h
@@ -0,0 +1,654 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TSCAL2S_H
+#define BLIS_TSCAL2S_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (tr) := (ar) * (xr) - (ai) * (xi);   products are formed in chc; each one is wrapped in a
+// (ti) := (ai) * (xr) + (ar) * (xi);   (da,dx) term{rr,ii,ir,ri} macro before the sub/add
+// (yr) := (tr);                        tr, ti are temporaries declared with precision py
+// (yi) := (ti);
+
+#define bli_tscal2ims( \
+          \
+          da, pa, ar, ai, \
+          dx, px, xr, xi, \
+          dy, py, yr, yi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(c,declinits) \
+	( \
+	  py, \
+	  PASTEMAC(chc,py,tcast)( \
+	    PASTEMAC(chc,sub)( \
+	      PASTEMAC(da,dx,termrr)( \
+	        chc, \
+	        PASTEMAC(chc,mul)( \
+	          PASTEMAC(pa,chc,tcast)(ar), \
+	          PASTEMAC(px,chc,tcast)(xr) \
+	        )  \
+	      ), \
+	      PASTEMAC(da,dx,termii)( \
+	        chc, \
+	        PASTEMAC(chc,mul)( \
+	          PASTEMAC(pa,chc,tcast)(ai), \
+	          PASTEMAC(px,chc,tcast)(xi) \
+	        ) \
+	      ) \
+	    ) \
+	  ), \
+	  PASTEMAC(chc,py,tcast)( \
+	    PASTEMAC(chc,add)( \
+	      PASTEMAC(da,dx,termir)( \
+	        chc, \
+	        PASTEMAC(chc,mul)( \
+	          PASTEMAC(pa,chc,tcast)(ai), \
+	          PASTEMAC(px,chc,tcast)(xr) \
+	        )  \
+	      ), \
+	      PASTEMAC(da,dx,termri)( \
+	        chc, \
+	        PASTEMAC(chc,mul)( \
+	          PASTEMAC(pa,chc,tcast)(ar), \
+	          PASTEMAC(px,chc,tcast)(xi) \
+	        ) \
+	      ) \
+	    ) \
+	  ), \
+	  tr, \
+	  ti \
+	); \
+	PASTEMAC(dy,assigns) \
+	( \
+	  tr, \
+	  ti, \
+	  yr, \
+	  yi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tscal2s: y := a * x
+#define bli_tscal2s( cha, chx, chy, chc, a, x, y ) \
+        bli_tscal2ims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), \
+          PASTEMAC(cha,imag)(a), \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y), \
+          PASTEMAC(chc,prec)  \
+        )
+
+// tscal2js: y := a * conj(x) (imag(x) is negated before the call)
+#define bli_tscal2js( cha, chx, chy, chc, a, x, y ) \
+        bli_tscal2ims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), \
+          PASTEMAC(cha,imag)(a), \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+            PASTEMAC(chx,imag)(x)  \
+          ), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y), \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Exposed real/imaginary --
+
+// tscal2ris: y := a * x, with a, x and y given as separate real/imag parts
+#define bli_tscal2ris( cha, chx, chy, chc, ar, ai, xr, xi, yr, yi ) \
+        bli_tscal2ims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+                     ar, \
+                     ai, \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi, \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// tscal2jris: y := a * conj(x) on separate real/imag parts (xi is negated before the call)
+#define bli_tscal2jris( cha, chx, chy, chc, ar, ai, xr, xi, yr, yi ) \
+        bli_tscal2ims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+                     ar, \
+                     ai, \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+                     xi ), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- 1e / 1r --
+
+// tscal21es: yri := a * x, then (-imag(yri), real(yri)) and yir go to bli_tcopyims; one braced block.
+#define bli_tscal21es( cha, chx, chy, chc, a, x, yri, yir ) \
+      { bli_tscal2ims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), \
+          PASTEMAC(cha,imag)(a), \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(yri), \
+          PASTEMAC(chy,imag)(yri), \
+          PASTEMAC(chc,prec)  \
+        ); \
+        bli_tcopyims \
+        ( \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(PASTEMAC(chy,prec),neg)( \
+            PASTEMAC(chy,imag)(yri)  \
+          ), \
+          PASTEMAC(chy,real)(yri), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(yir), \
+          PASTEMAC(chy,imag)(yir) \
+        ); }
+
+// tscal2j1es: yri := a * conj(x), then (-imag(yri), real(yri)) and yir go to bli_tcopyims; one braced block.
+#define bli_tscal2j1es( cha, chx, chy, chc, a, x, yri, yir ) \
+      { bli_tscal2ims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), \
+          PASTEMAC(cha,imag)(a), \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+            PASTEMAC(chx,imag)(x)  \
+          ), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(yri), \
+          PASTEMAC(chy,imag)(yri), \
+          PASTEMAC(chc,prec)  \
+        ); \
+        bli_tcopyims \
+        ( \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(PASTEMAC(chy,prec),neg)( \
+            PASTEMAC(chy,imag)(yri)  \
+          ), \
+          PASTEMAC(chy,real)(yri), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(yir), \
+          PASTEMAC(chy,imag)(yir) \
+        ); }
+
+// tscal21rs: y := a * x, with the result going to caller-supplied separate parts (yr, yi)
+#define bli_tscal21rs( cha, chx, chy, chc, a, x, yr, yi ) \
+        bli_tscal2ims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), \
+          PASTEMAC(cha,imag)(a), \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          yr, \
+          yi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// tscal2j1rs: y := a * conj(x), with the result going to caller-supplied separate parts (yr, yi)
+#define bli_tscal2j1rs( cha, chx, chy, chc, a, x, yr, yi ) \
+        bli_tscal2ims \
+        ( \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), \
+          PASTEMAC(cha,imag)(a), \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( \
+            PASTEMAC(chx,imag)(x)  \
+          ), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          yr, \
+          yi, \
+          PASTEMAC(chc,prec)  \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- mxn --
+
+// scal2bbs_mxn
+
+#define bli_tscal2bbs_mxn_r( \
+                             cha,chx,chy,chc, \
+                             ctypea, ctypea_r, \
+                             ctypex, ctypex_r, \
+                             ctypey, ctypey_r, \
+                             conjx, \
+                             m, \
+                             n, \
+                             alpha, \
+                             x, incx, ldx, \
+                             y, incy, ldy  \
+                           ) \
+{ \
+	/* Assume that the duplication factor is the row stride of y. */ \
+	const dim_t d    = incy; \
+	const dim_t ds_y = 1; \
+	/* Each y(i,j) is computed once, then copied into the d-1 elements that follow it. */ \
+	if ( bli_is_conj( conjx ) ) \
+	{ \
+		for ( dim_t j = 0; j < (n); ++j ) \
+		{ \
+			ctypex* restrict xj = (ctypex*)(x) + j*(ldx); \
+			ctypey* restrict yj = (ctypey*)(y) + j*(ldy); \
+	\
+			for ( dim_t i = 0; i < (m); ++i ) \
+			{ \
+				ctypex* restrict xij = xj + i*(incx); \
+				ctypey* restrict yij = yj + i*(incy); \
+	/* y(i,j) := alpha * conj( x(i,j) ) */ \
+				bli_tscal2js( cha,chx,chy,chc, *(const ctypea* restrict)(alpha), *xij, *yij ); \
+	\
+				for ( dim_t p = 1; p < d; ++p ) \
+				{ \
+					ctypey* restrict yijd = yij + p*ds_y; \
+	\
+					bli_tcopys( chy,chy, *yij, *yijd ); \
+				} \
+			} \
+		} \
+	} \
+	else \
+	{ \
+		for ( dim_t j = 0; j < (n); ++j ) \
+		{ \
+			ctypex* restrict xj = (ctypex*)(x) + j*(ldx); \
+			ctypey* restrict yj = (ctypey*)(y) + j*(ldy); \
+	\
+			for ( dim_t i = 0; i < (m); ++i ) \
+			{ \
+				ctypex* restrict xij = xj + i*(incx); \
+				ctypey* restrict yij = yj + i*(incy); \
+	/* y(i,j) := alpha * x(i,j) */ \
+				bli_tscal2s( cha,chx,chy,chc, *(const ctypea* restrict)(alpha), *xij, *yij ); \
+	\
+				for ( dim_t p = 1; p < d; ++p ) \
+				{ \
+					ctypey* restrict yijd = yij + p*ds_y; \
+	\
+					bli_tcopys( chy,chy, *yij, *yijd ); \
+				} \
+			} \
+		} \
+	} \
+}
+
+#define bli_tscal2bbs_mxn_c( \
+                             cha,chx,chy,chc, \
+                             ctypea, ctypea_r, \
+                             ctypex, ctypex_r, \
+                             ctypey, ctypey_r, \
+                             conjx, \
+                             m, \
+                             n, \
+                             alpha, \
+                             x, incx, ldx, \
+                             y, incy, ldy  \
+                           ) \
+{ \
+	/* Assume that the duplication factor is the row stride of y. */ \
+	const dim_t       d          = incy; \
+	const dim_t       ds_y       = 1; \
+	/* Strides of x and y are re-expressed in units of their real types (sizeof ratio). */ \
+	const inc_t       incx2      = sizeof(ctypex) / sizeof(ctypex_r) * (incx); \
+	const inc_t       ldx2       = sizeof(ctypex) / sizeof(ctypex_r) * (ldx); \
+\
+	const inc_t       incy2      = sizeof(ctypey) / sizeof(ctypey_r) * (incy); \
+	const inc_t       ldy2       = sizeof(ctypey) / sizeof(ctypey_r) * (ldy); \
+	/* The imaginary pointer into y (psi_i) starts d real elements after the real one (psi_r). */ \
+	ctypea_r* restrict alpha_r    = ( ctypea_r* )(alpha); \
+	ctypea_r* restrict alpha_i    = ( ctypea_r* )(alpha) + 1; (void)alpha_i; \
+	ctypex_r* restrict chi_r      = ( ctypex_r* )(x); \
+	ctypex_r* restrict chi_i      = ( ctypex_r* )(x) + 1; (void)chi_i; \
+	ctypey_r* restrict psi_r      = ( ctypey_r* )(y); \
+	ctypey_r* restrict psi_i      = ( ctypey_r* )(y) + 1*d; (void)psi_i; \
+\
+	if ( bli_is_conj( conjx ) ) \
+	{ \
+		for ( dim_t j = 0; j < (n); ++j ) \
+		{ \
+			ctypex_r* restrict chij_r = chi_r + j*ldx2; \
+			ctypex_r* restrict chij_i = chi_i + j*ldx2; \
+			ctypey_r* restrict psij_r = psi_r + j*ldy2; \
+			ctypey_r* restrict psij_i = psi_i + j*ldy2; \
+\
+			for ( dim_t i = 0; i < (m); ++i ) \
+			{ \
+				ctypex_r* restrict chiij_r = chij_r + i*incx2; \
+				ctypex_r* restrict chiij_i = chij_i + i*incx2; (void)chiij_i; \
+				ctypey_r* restrict psiij_r = psij_r + i*incy2; \
+				ctypey_r* restrict psiij_i = psij_i + i*incy2; (void)psiij_i; \
+\
+				bli_tscal2jris( cha,chx,chy,chc, \
+				                *alpha_r, *alpha_i, \
+				                *chiij_r, *chiij_i, \
+				                *psiij_r, *psiij_i ); \
+\
+				for ( dim_t p = 1; p < d; ++p ) \
+				{ \
+					ctypey_r* restrict psiijd_r = psiij_r + p*ds_y; \
+					ctypey_r* restrict psiijd_i = psiij_i + p*ds_y; (void)psiijd_i; \
+\
+					bli_tcopyris( chy,chy, *psiij_r,  *psiij_i, \
+					                       *psiijd_r, *psiijd_i ); \
+				} \
+			} \
+		} \
+	} \
+	else /* if ( bli_is_noconj( conjx ) ) */ \
+	{ \
+		for ( dim_t j = 0; j < (n); ++j ) \
+		{ \
+			ctypex_r* restrict chij_r = chi_r + j*ldx2; \
+			ctypex_r* restrict chij_i = chi_i + j*ldx2; \
+			ctypey_r* restrict psij_r = psi_r + j*ldy2; \
+			ctypey_r* restrict psij_i = psi_i + j*ldy2; \
+\
+			for ( dim_t i = 0; i < (m); ++i ) \
+			{ \
+				ctypex_r* restrict chiij_r = chij_r + i*incx2; \
+				ctypex_r* restrict chiij_i = chij_i + i*incx2; (void)chiij_i; \
+				ctypey_r* restrict psiij_r = psij_r + i*incy2; \
+				ctypey_r* restrict psiij_i = psij_i + i*incy2; (void)psiij_i; \
+\
+				bli_tscal2ris( cha,chx,chy,chc, \
+				               *alpha_r, *alpha_i, \
+				               *chiij_r, *chiij_i, \
+				               *psiij_r, *psiij_i ); \
+\
+				for ( dim_t p = 1; p < d; ++p ) \
+				{ \
+					ctypey_r* restrict psiijd_r = psiij_r + p*ds_y; \
+					ctypey_r* restrict psiijd_i = psiij_i + p*ds_y; (void)psiijd_i; \
+\
+					bli_tcopyris( chy,chy, *psiij_r,  *psiij_i, \
+					                       *psiijd_r, *psiijd_i ); \
+				} \
+			} \
+		} \
+	} \
+}
+
+#define bli_tscal2bbs_mxn( \
+                           cha,chx,chy,chc, \
+                           conjx, \
+                           m, \
+                           n, \
+                           alpha, \
+                           x, incx, ldx, \
+                           y, incy, ldy  \
+                         ) \
+PASTECH(bli_tscal2bbs_mxn_,PASTEMAC(chy,dom)) /* variant suffix is pasted from the domain of chy (_r and _c exist) */ \
+( \
+  cha,chx,chy,chc, \
+  PASTEMAC(cha,ctype),PASTEMAC(cha,ctyper), \
+  PASTEMAC(chx,ctype),PASTEMAC(chx,ctyper), \
+  PASTEMAC(chy,ctype),PASTEMAC(chy,ctyper), \
+  conjx, \
+  m, \
+  n, \
+  alpha, \
+  x, incx, ldx, \
+  y, incy, ldy \
+)
+
+#define bli_tscal2s_mxn( cha, chx, chy, chc, conjx, m, n, alpha, x, rs_x, cs_x, y, rs_y, cs_y ) \
+{ \
+	if ( bli_is_conj( conjx ) ) \
+	{ \
+		for ( dim_t jj = 0; jj < (n); ++jj ) \
+		for ( dim_t ii = 0; ii < (m); ++ii ) \
+		{ \
+			PASTEMAC(chx,ctype)* restrict xij = ( PASTEMAC(chx,ctype)* )(x) + ii*(rs_x) + jj*(cs_x); \
+			PASTEMAC(chy,ctype)* restrict yij = ( PASTEMAC(chy,ctype)* )(y) + ii*(rs_y) + jj*(cs_y); \
+			/* y(ii,jj) := alpha * conj( x(ii,jj) ) */ \
+			bli_tscal2js( cha,chx,chy,chc, *(alpha), *xij, *yij ); \
+		} \
+	} \
+	else \
+	{ \
+		for ( dim_t jj = 0; jj < (n); ++jj ) \
+		for ( dim_t ii = 0; ii < (m); ++ii ) \
+		{ \
+			PASTEMAC(chx,ctype)* restrict xij = ( PASTEMAC(chx,ctype)* )(x) + ii*(rs_x) + jj*(cs_x); \
+			PASTEMAC(chy,ctype)* restrict yij = ( PASTEMAC(chy,ctype)* )(y) + ii*(rs_y) + jj*(cs_y); \
+			/* y(ii,jj) := alpha * x(ii,jj) */ \
+			bli_tscal2s( cha,chx,chy,chc, *(alpha), *xij, *yij ); \
+		} \
+	} \
+}
+
+#define bli_tscal2ris_mxn( cha, chx, chy, chc, conjx, m, n, alpha, x, rs_x, cs_x, y, rs_y, cs_y, is_y ) \
+{ \
+	PASTEMAC(cha,ctyper)* restrict alpha_r = ( PASTEMAC(cha,ctyper)* )(alpha);     (void)alpha_r; \
+	PASTEMAC(cha,ctyper)* restrict alpha_i = ( PASTEMAC(cha,ctyper)* )(alpha) + 1; (void)alpha_i; \
+	PASTEMAC(chx,ctyper)* restrict x_r     = ( PASTEMAC(chx,ctyper)* )(x); \
+	PASTEMAC(chx,ctyper)* restrict x_i     = ( PASTEMAC(chx,ctyper)* )(x) + 1; \
+	PASTEMAC(chy,ctyper)* restrict y_r     = ( PASTEMAC(chy,ctyper)* )(y); \
+	PASTEMAC(chy,ctyper)* restrict y_i     = ( PASTEMAC(chy,ctyper)* )(y) + (is_y); /* imag parts of y start is_y real elements after y_r */ \
+	const dim_t incx2                      = 2*(rs_x); \
+	const dim_t ldx2                       = 2*(cs_x); \
+	/* NOTE(review): the x strides are doubled unconditionally, i.e. x is assumed to be complex -- confirm. */ \
+	if ( bli_is_conj( conjx ) ) \
+	{ \
+		for ( dim_t jj = 0; jj < (n); ++jj ) \
+		for ( dim_t ii = 0; ii < (m); ++ii ) \
+		{ \
+			PASTEMAC(chx,ctyper)* restrict chi11_r = x_r + ii*incx2  + jj*ldx2;   (void)chi11_r; \
+			PASTEMAC(chx,ctyper)* restrict chi11_i = x_i + ii*incx2  + jj*ldx2;   (void)chi11_i; \
+			PASTEMAC(chy,ctyper)* restrict psi11_r = y_r + ii*(rs_y) + jj*(cs_y); (void)psi11_r; \
+			PASTEMAC(chy,ctyper)* restrict psi11_i = y_i + ii*(rs_y) + jj*(cs_y); (void)psi11_i; \
+\
+			bli_tscal2jris \
+			( \
+			  cha,chx,chy,chc, \
+			  *alpha_r, *alpha_i, \
+			  *chi11_r, *chi11_i, \
+			  *psi11_r, *psi11_i  \
+			); \
+		} \
+	} \
+	else \
+	{ \
+		for ( dim_t jj = 0; jj < (n); ++jj ) \
+		for ( dim_t ii = 0; ii < (m); ++ii ) \
+		{ \
+			PASTEMAC(chx,ctyper)* restrict chi11_r = x_r + ii*incx2  + jj*ldx2;   (void)chi11_r; \
+			PASTEMAC(chx,ctyper)* restrict chi11_i = x_i + ii*incx2  + jj*ldx2;   (void)chi11_i; \
+			PASTEMAC(chy,ctyper)* restrict psi11_r = y_r + ii*(rs_y) + jj*(cs_y); (void)psi11_r; \
+			PASTEMAC(chy,ctyper)* restrict psi11_i = y_i + ii*(rs_y) + jj*(cs_y); (void)psi11_i; \
+\
+			bli_tscal2ris \
+			( \
+			  cha,chx,chy,chc, \
+			  *alpha_r, *alpha_i, \
+			  *chi11_r, *chi11_i, \
+			  *psi11_r, *psi11_i  \
+			); \
+		} \
+	} \
+}
+
+// -- Legacy mxn macros --------------------------------------------------------
+
+// Legacy single-type name: bli_tscal2bbs_mxn with type char s for alpha, x, y and the computation.
+#define bli_sscal2bbs_mxn( conjx, m, n, alpha, x, incx, ldx, y, incy, ldy ) \
+  bli_tscal2bbs_mxn( \
+                     s,s,s,s, \
+                     conjx, \
+                     m, n, \
+                     alpha, \
+                     x, incx, ldx, \
+                     y, incy, ldy  \
+                   )
+
+// Legacy single-type name: bli_tscal2bbs_mxn with type char d for alpha, x, y and the computation.
+#define bli_dscal2bbs_mxn( conjx, m, n, alpha, x, incx, ldx, y, incy, ldy ) \
+  bli_tscal2bbs_mxn( \
+                     d,d,d,d, \
+                     conjx, \
+                     m, n, \
+                     alpha, \
+                     x, incx, ldx, \
+                     y, incy, ldy  \
+                   )
+
+// Legacy single-type name: bli_tscal2bbs_mxn with type char c for alpha, x, y and the computation.
+#define bli_cscal2bbs_mxn( conjx, m, n, alpha, x, incx, ldx, y, incy, ldy ) \
+  bli_tscal2bbs_mxn( \
+                     c,c,c,c, \
+                     conjx, \
+                     m, n, \
+                     alpha, \
+                     x, incx, ldx, \
+                     y, incy, ldy  \
+                   )
+
+// Legacy single-type name: bli_tscal2bbs_mxn with type char z for alpha, x, y and the computation.
+#define bli_zscal2bbs_mxn( conjx, m, n, alpha, x, incx, ldx, y, incy, ldy ) \
+  bli_tscal2bbs_mxn( \
+                     z,z,z,z, \
+                     conjx, \
+                     m, n, \
+                     alpha, \
+                     x, incx, ldx, \
+                     y, incy, ldy  \
+                   )
+
+// -- Legacy macros ------------------------------------------------------------
+
+#define bli_sscal2s( a, x, y ) bli_tscal2s( s,s,s,s, a, x, y ) // cha = chx = chy = s; chc = s
+#define bli_dscal2s( a, x, y ) bli_tscal2s( d,d,d,d, a, x, y ) // cha = chx = chy = d; chc = d
+#define bli_cscal2s( a, x, y ) bli_tscal2s( c,c,c,s, a, x, y ) // cha = chx = chy = c; chc = s
+#define bli_zscal2s( a, x, y ) bli_tscal2s( z,z,z,d, a, x, y ) // cha = chx = chy = z; chc = d
+
+// -- Notes --------------------------------------------------------------------
+
+// -- Domain cases (columns: y, a, x; "xx" marks a (yi) that is presumably not stored because y is real) --
+
+//   r       r      r
+// (yr) := (ar) * (xr) -   0  *   0 ;
+// (yi) xx   0  * (xr) + (ar) *   0 ;
+
+//   r       r      c
+// (yr) := (ar) * (xr) -   0  * (xi);
+// (yi) xx   0  * (xr) + (ar) * (xi);
+
+//   r       c      r
+// (yr) := (ar) * (xr) - (ai) *   0 ;
+// (yi) xx (ai) * (xr) + (ar) *   0 ;
+
+//   r       c      c
+// (yr) := (ar) * (xr) - (ai) * (xi);
+// (yi) xx (ai) * (xr) + (ar) * (xi);
+
+//   c       r      r
+// (yr) := (ar) * (xr) -   0  *   0 ;
+// (yi) :=   0  * (xr) + (ar) *   0 ;
+
+//   c       r      c
+// (yr) := (ar) * (xr) -   0  * (xi);
+// (yi) :=   0  * (xr) + (ar) * (xi);
+
+//   c       c      r
+// (yr) := (ar) * (xr) - (ai) *   0 ;
+// (yi) := (ai) * (xr) + (ar) *   0 ;
+
+//   c       c      c
+// (yr) := (ar) * (xr) - (ai) * (xi);
+// (yi) := (ai) * (xr) + (ar) * (xi);
+
+#endif
+
diff --git a/frame/include/level0/bli_tscalcjs.h b/frame/include/level0/bli_tscalcjs.h
new file mode 100644
index 000000000..8f2efaa0b
--- /dev/null
+++ b/frame/include/level0/bli_tscalcjs.h
@@ -0,0 +1,129 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TSCALCJS_H
+#define BLIS_TSCALCJS_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (tr) := (ar) * (xr) - ( is_conj( conj ) ? -(ai) : (ai) ) * (xi);
+// (ti) := ( is_conj( conj ) ? -(ai) : (ai) ) * (xr) + (ar) * (xi);
+// (xr) := (tr);
+// (xi) := (ti);
+// da/dx are the domains and pa/px the precisions of a and x; operands are cast to chc before multiplying and tr/ti are cast back to px.
+#define bli_tscalcjims( \
+        \
+          conj, \
+          da, pa, ar, ai, \
+          dx, px, xr, xi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(c,declinits) \
+	( \
+	  chc, \
+	  PASTEMAC(chc,sub)( \
+	    PASTEMAC(da,dx,termrr)( \
+	      chc, \
+	      PASTEMAC(chc,mul)( \
+	        PASTEMAC(pa,chc,tcast)(ar), \
+	        PASTEMAC(px,chc,tcast)(xr) \
+	      )  \
+	    ), \
+	    PASTEMAC(da,dx,termii)( \
+	      chc, \
+	      PASTEMAC(chc,mul)( \
+	        ( bli_is_conj( conj ) ? PASTEMAC(chc,neg)( \
+	                                  PASTEMAC(pa,chc,tcast)(ai) \
+	                                ) \
+	                              :   PASTEMAC(pa,chc,tcast)(ai) \
+	        ), \
+	        PASTEMAC(px,chc,tcast)(xi) \
+	      ) \
+	    ) \
+	  ),\
+	  PASTEMAC(chc,add)( \
+	    PASTEMAC(da,dx,termir)( \
+	      chc, \
+	      PASTEMAC(chc,mul)( \
+	        ( bli_is_conj( conj ) ? PASTEMAC(chc,neg)( \
+	                                  PASTEMAC(pa,chc,tcast)(ai) \
+	                                ) \
+	                              :   PASTEMAC(pa,chc,tcast)(ai) \
+	        ), \
+	        PASTEMAC(px,chc,tcast)(xr) \
+	      )  \
+	    ), \
+	    PASTEMAC(da,dx,termri)( \
+	      chc, \
+	      PASTEMAC(chc,mul)( \
+	        PASTEMAC(pa,chc,tcast)(ar), \
+	        PASTEMAC(px,chc,tcast)(xi) \
+	      ) \
+	    ) \
+	  ), \
+	  tr, \
+	  ti \
+	) \
+	PASTEMAC(dx,assigns) \
+	( \
+	  PASTEMAC(chc,px,tcast)(tr), \
+	  PASTEMAC(chc,px,tcast)(ti), \
+	  xr, \
+	  xi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tscalcjs: x := ( is_conj( conj ) ? conj(a) : a ) * x, with the arithmetic carried out in the precision of chc.
+#define bli_tscalcjs( cha, chx, chc, conj, a, x ) \
+        bli_tscalcjims \
+        ( \
+          conj, \
+          PASTEMAC(cha,dom),  \
+          PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), \
+          PASTEMAC(cha,imag)(a), \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chc,prec)  \
+        )
+
+#endif
+
diff --git a/frame/include/level0/bli_tscals.h b/frame/include/level0/bli_tscals.h
new file mode 100644
index 000000000..30292d6f7
--- /dev/null
+++ b/frame/include/level0/bli_tscals.h
@@ -0,0 +1,284 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TSCALS_H
+#define BLIS_TSCALS_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (tr) := (ar) * (xr) - (ai) * (xi);
+// (ti) := (ai) * (xr) + (ar) * (xi);
+// (xr) := (tr);
+// (xi) := (ti);
+// da/dx are the domains and pa/px the precisions of a and x; operands are cast to chc before multiplying and tr/ti are cast back to px.
+#define bli_tscalims( \
+        \
+          da, pa, ar, ai, \
+          dx, px, xr, xi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(c,declinits) \
+	( \
+	  chc, \
+	  PASTEMAC(chc,sub)( \
+	    PASTEMAC(da,dx,termrr)( \
+	      chc, \
+	      PASTEMAC(chc,mul)( \
+	        PASTEMAC(pa,chc,tcast)(ar), \
+	        PASTEMAC(px,chc,tcast)(xr) \
+	      )  \
+	    ), \
+	    PASTEMAC(da,dx,termii)( \
+	      chc, \
+	      PASTEMAC(chc,mul)( \
+	        PASTEMAC(pa,chc,tcast)(ai), \
+	        PASTEMAC(px,chc,tcast)(xi) \
+	      ) \
+	    ) \
+	  ),\
+	  PASTEMAC(chc,add)( \
+	    PASTEMAC(da,dx,termir)( \
+	      chc, \
+	      PASTEMAC(chc,mul)( \
+	        PASTEMAC(pa,chc,tcast)(ai), \
+	        PASTEMAC(px,chc,tcast)(xr) \
+	      )  \
+	    ), \
+	    PASTEMAC(da,dx,termri)( \
+	      chc, \
+	      PASTEMAC(chc,mul)( \
+	        PASTEMAC(pa,chc,tcast)(ar), \
+	        PASTEMAC(px,chc,tcast)(xi) \
+	      ) \
+	    ) \
+	  ), \
+	  tr, \
+	  ti \
+	) \
+	PASTEMAC(dx,assigns) \
+	( \
+	  PASTEMAC(chc,px,tcast)(tr), \
+	  PASTEMAC(chc,px,tcast)(ti), \
+	  xr, \
+	  xi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tscals
+//
+// x := a * x. The domain, precision, real part and imaginary part of a and
+// of x are obtained from their type characters (cha, chx) and forwarded to
+// bli_tscalims; chc selects the precision of the intermediate arithmetic.
+//
+#define bli_tscals( cha, chx, chc, a, x ) \
+        bli_tscalims( \
+          PASTEMAC(cha,dom), PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), PASTEMAC(cha,imag)(a), \
+          PASTEMAC(chx,dom), PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chc,prec) \
+        )
+
+// tscaljs
+//
+// x := conj(a) * x. The arguments forwarded to bli_tscalims are the same as
+// for bli_tscals, except that the imaginary part of a is first negated with
+// the neg macro of a's own precision.
+//
+#define bli_tscaljs( cha, chx, chc, a, x ) \
+        bli_tscalims( \
+          PASTEMAC(cha,dom), PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), \
+          PASTEMAC(PASTEMAC(cha,prec),neg)( PASTEMAC(cha,imag)(a) ), \
+          PASTEMAC(chx,dom), PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chc,prec) \
+        )
+
+// -- Exposed real/imaginary --
+
+// tscalris
+//
+// x := a * x with both operands given as separate real and imaginary parts
+// (ar, ai and xr, xi); the type characters only supply the domains and
+// precisions, and chc the precision of the intermediate arithmetic.
+//
+#define bli_tscalris( cha, chx, chc, ar, ai, xr, xi ) \
+        bli_tscalims( \
+          PASTEMAC(cha,dom), PASTEMAC(cha,prec), \
+          ar, ai, \
+          PASTEMAC(chx,dom), PASTEMAC(chx,prec), \
+          xr, xi, \
+          PASTEMAC(chc,prec) \
+        )
+
+// tscaljris
+//
+// x := conj(a) * x with both operands given as separate real and imaginary
+// parts; ai is negated with the neg macro of a's own precision before it is
+// forwarded to bli_tscalims.
+//
+#define bli_tscaljris( cha, chx, chc, ar, ai, xr, xi ) \
+        bli_tscalims( \
+          PASTEMAC(cha,dom), PASTEMAC(cha,prec), \
+          ar, \
+          PASTEMAC(PASTEMAC(cha,prec),neg)( ai ), \
+          PASTEMAC(chx,dom), PASTEMAC(chx,prec), \
+          xr, xi, \
+          PASTEMAC(chc,prec) \
+        )
+
+// -- 1e / 1r --
+
+// tscal1es
+//
+// xri := a * xri, followed by xir := ( -imag(xri), real(xri) ), i.e. the
+// components of the freshly scaled xri are swapped and the new real part
+// is negated.
+// Both statements are wrapped in a single brace block so that the macro
+// expands to one compound statement and is not split when it is used as
+// the body of an unbraced if/for/while.
+#define bli_tscal1es( cha, chx, chc, a, xri, xir ) \
+{ \
+        bli_tscalims \
+        ( \
+          PASTEMAC(cha,dom), PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), PASTEMAC(cha,imag)(a), \
+          PASTEMAC(chx,dom), PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(xri), PASTEMAC(chx,imag)(xri), \
+          PASTEMAC(chc,prec)  \
+        ); \
+        bli_tcopyims \
+        ( \
+          PASTEMAC(chx,dom), PASTEMAC(chx,prec), \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( PASTEMAC(chx,imag)(xri) ), \
+          PASTEMAC(chx,real)(xri), \
+          PASTEMAC(chx,dom), PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(xir), PASTEMAC(chx,imag)(xir) \
+        ); \
+}
+
+// tscal1rs
+//
+// x := a * x where a is passed as a whole scalar of type cha and x as
+// separate real and imaginary parts (xr, xi) whose domain and precision
+// come from chx.
+//
+#define bli_tscal1rs( cha, chx, chc, a, xr, xi ) \
+        bli_tscalims( \
+          PASTEMAC(cha,dom), PASTEMAC(cha,prec), \
+          PASTEMAC(cha,real)(a), PASTEMAC(cha,imag)(a), \
+          PASTEMAC(chx,dom), PASTEMAC(chx,prec), \
+          xr, xi, \
+          PASTEMAC(chc,prec) \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- mxn_uplo --
+
+// Scale by *(ar) + i*(ai) each element (ii,jj) of the m x n matrix (xr,xi; strides rs_x,cs_x) with jj - ii >= diagoff
+// (uplo upper) or jj - ii <= diagoff (otherwise). m, n and diagoff are parenthesized so expression arguments expand safely.
+#define bli_tscalris_mxn_uplo( cha, chx, chc, uplo, diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \
+{ \
+	if ( bli_is_upper( uplo ) ) \
+	{ \
+		for ( dim_t jj = 0; jj < (n); ++jj ) \
+		for ( dim_t ii = 0; ii < (m); ++ii ) \
+		{ \
+			if ( (doff_t)jj - (doff_t)ii >= (diagoff) ) \
+			{ \
+				PASTEMAC(chx,ctyper)* restrict xij_r = (xr) + ii*(rs_x) + jj*(cs_x); \
+				PASTEMAC(chx,ctyper)* restrict xij_i = (xi) + ii*(rs_x) + jj*(cs_x); \
+				(void)xij_i; \
+				bli_tscalris( cha,chx,chc, *(ar), *(ai), *xij_r, *xij_i ); \
+			} \
+		} \
+	} \
+	else \
+	{ \
+		for ( dim_t jj = 0; jj < (n); ++jj ) \
+		for ( dim_t ii = 0; ii < (m); ++ii ) \
+		{ \
+			if ( (doff_t)jj - (doff_t)ii <= (diagoff) ) \
+			{ \
+				PASTEMAC(chx,ctyper)* restrict xij_r = (xr) + ii*(rs_x) + jj*(cs_x); \
+				PASTEMAC(chx,ctyper)* restrict xij_i = (xi) + ii*(rs_x) + jj*(cs_x); \
+				(void)xij_i; \
+				bli_tscalris( cha,chx,chc, *(ar), *(ai), *xij_r, *xij_i ); \
+			} \
+		} \
+	} \
+}
+
+// -- Legacy macros ------------------------------------------------------------
+// Same-type shorthands for bli_tscals( cha, chx, chc, a, x ); the ss/dd/cc/zz names expand exactly like the s/d/c/z ones.
+#define bli_sscals( a, x ) bli_tscals( s,s,s, a, x )
+#define bli_dscals( a, x ) bli_tscals( d,d,d, a, x )
+#define bli_cscals( a, x ) bli_tscals( c,c,s, a, x )
+#define bli_zscals( a, x ) bli_tscals( z,z,d, a, x )
+
+#define bli_ssscals( a, x ) bli_tscals( s,s,s, a, x )
+#define bli_ddscals( a, x ) bli_tscals( d,d,d, a, x )
+#define bli_ccscals( a, x ) bli_tscals( c,c,s, a, x )
+#define bli_zzscals( a, x ) bli_tscals( z,z,d, a, x )
+
+// -- Notes --------------------------------------------------------------------
+
+// -- Domain cases --
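+// Columns give the domain (r = real, c = complex) of x and a; "xx" marks an imaginary part that is not assigned because x is real.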
+//   r       r
+// (xr) := (ar) * (xr) -   0  *   0 ;
+// (xi) xx   0  * (xr) + (ar) *   0 ;
+
+//   r       c
+// (xr) := (ar) * (xr) - (ai) *   0 ;
+// (xi) xx (ai) * (xr) + (ar) *   0 ;
+
+//   c       r
+// (xr) := (ar) * (xr) -   0  * (xi);
+// (xi) :=   0  * (xr) + (ar) * (xi);
+
+//   c       c
+// (xr) := (ar) * (xr) - (ai) * (xi);
+// (xi) := (ai) * (xr) + (ar) * (xi);
+
+#endif
+
diff --git a/frame/include/level0/bli_tsets.h b/frame/include/level0/bli_tsets.h
new file mode 100644
index 000000000..05a86b3f7
--- /dev/null
+++ b/frame/include/level0/bli_tsets.h
@@ -0,0 +1,306 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, Southern Methodist University
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TSETS_H
+#define BLIS_TSETS_H
+
+// -- Implementation macros ----------------------------------------------------
+
+// bli_tsetims: y := x.
+//
+// xr and xi are cast from precision px to precision py and then stored into
+// yr and yi by the assigns macro of y's domain (dy). The dx argument is
+// accepted but not referenced in the body.
+//
+#define bli_tsetims( dx, px, xr, xi, dy, py, yr, yi ) \
+{ \
+	PASTEMAC(dy,assigns)( \
+	  PASTEMAC(px,py,tcast)(xr), PASTEMAC(px,py,tcast)(xi), \
+	  yr, yi \
+	); \
+}
+
+// bli_tsetrims: real(y) := xr.
+//
+// xr is cast from precision px to py and passed as the new real part; the
+// current yi is passed back as the imaginary part, so the imaginary
+// component of y keeps its value.
+//
+#define bli_tsetrims( px, xr, dy, py, yr, yi ) \
+{ \
+	PASTEMAC(dy,assigns)( \
+	  PASTEMAC(px,py,tcast)(xr), yi, \
+	  yr, yi \
+	); \
+}
+
+// bli_tsetiims: imag(y) := xi.
+//
+// xi is cast from precision px to py and passed as the new imaginary part;
+// the current yr is passed back as the real part, so the real component of
+// y keeps its value.
+//
+#define bli_tsetiims( px, xi, dy, py, yr, yi ) \
+{ \
+	PASTEMAC(dy,assigns)( \
+	  yr, PASTEMAC(px,py,tcast)(xi), \
+	  yr, yi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tsets
+//
+// y := xr + i*xi, where xr and xi are separate values described by type
+// character chx and y is a whole scalar of type chy whose parts are
+// selected with the real/imag macros of chy.
+//
+#define bli_tsets( chx,chy, xr, xi, y ) \
+        bli_tsetims( \
+          PASTEMAC(chx,dom), PASTEMAC(chx,prec), \
+          xr, xi, \
+          PASTEMAC(chy,dom), PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), PASTEMAC(chy,imag)(y) \
+        )
+
+// tsetrs
+//
+// real(y) := xr; the imaginary part of y is left as it is.
+//
+#define bli_tsetrs( chx,chy, xr, y ) \
+        bli_tsetrims( \
+          PASTEMAC(chx,prec), xr, \
+          PASTEMAC(chy,dom), PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y) \
+        )
+
+// tsetis
+//
+// imag(y) := xi; the real part of y is left as it is.
+//
+#define bli_tsetis( chx,chy, xi, y ) \
+        bli_tsetiims( \
+          PASTEMAC(chx,prec), xi, \
+          PASTEMAC(chy,dom), PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y) \
+        )
+
+// -- Exposed real/imaginary --
+
+// tsetris
+//
+// (yr, yi) := (xr, xi) with both operands given as separate real and
+// imaginary parts; the type characters only supply the domains and
+// precisions.
+//
+#define bli_tsetris( chx,chy, xr, xi, yr, yi ) \
+        bli_tsetims( \
+          PASTEMAC(chx,dom), PASTEMAC(chx,prec), \
+          xr, xi, \
+          PASTEMAC(chy,dom), PASTEMAC(chy,prec), \
+          yr, yi \
+        )
+
+// -- Set to constant --
+
+// tset0s
+//
+// y := 0. The real and the imaginary part handed to bli_tsets are both the
+// "zero" constant of y's own precision; chy serves as source and as
+// destination type.
+//
+#define bli_tset0s( chy, y ) \
+    bli_tsets( \
+      /* source and destination type */ \
+      chy,chy, \
+      /* real part */ \
+      PASTEMAC(PASTEMAC(chy,prec),zero), \
+      /* imaginary part */ \
+      PASTEMAC(PASTEMAC(chy,prec),zero), \
+      /* destination scalar */ \
+      y \
+    )
+
+// tset1s
+//
+// y := 1. The real part handed to bli_tsets is the "one" constant and the
+// imaginary part the "zero" constant of y's own precision; chy serves as
+// source and as destination type.
+//
+#define bli_tset1s( chy, y ) \
+    bli_tsets( \
+      /* source and destination type */ \
+      chy,chy, \
+      /* real part */ \
+      PASTEMAC(PASTEMAC(chy,prec),one), \
+      /* imaginary part */ \
+      PASTEMAC(PASTEMAC(chy,prec),zero), \
+      /* destination scalar */ \
+      y \
+    )
+
+// tsetr0s: real(y) := zero constant of y's precision, via bli_tsetrs (which passes imag(y) back unchanged).
+#define bli_tsetr0s( chy, y ) \
+    bli_tsetrs( chy,chy, PASTEMAC(PASTEMAC(chy,prec),zero), y )
+
+// tseti0s: imag(y) := zero constant of y's precision, via bli_tsetis (which passes real(y) back unchanged).
+#define bli_tseti0s( chy, y ) \
+    bli_tsetis( chy,chy, PASTEMAC(PASTEMAC(chy,prec),zero), y )
+
+// tset0ris
+//
+// (yr, yi) := (0, 0) for a scalar given as separate real and imaginary
+// parts; both values handed to bli_tsetris are the "zero" constant of
+// chy's precision.
+//
+#define bli_tset0ris( chy, yr, yi ) \
+    bli_tsetris( \
+      /* source and destination type */ \
+      chy,chy, \
+      /* real part */ \
+      PASTEMAC(PASTEMAC(chy,prec),zero), \
+      /* imaginary part */ \
+      PASTEMAC(PASTEMAC(chy,prec),zero), \
+      /* destination parts */ \
+      yr, \
+      yi \
+    )
+
+// -- Micro-tile --
+
+// tset0s_mxn: apply bli_tset0s to every element y[ _i*rs_y + _j*cs_y ] for 0 <= _i < m, 0 <= _j < n.
+#define bli_tset0s_mxn( chy, m, n, y, rs_y, cs_y ) \
+{ \
+	for ( dim_t _j = 0; _j < (n); ++_j ) \
+	for ( dim_t _i = 0; _i < (m); ++_i ) \
+	bli_tset0s( chy, *((y) + _i*(rs_y) + _j*(cs_y)) ); \
+}
+
+// tset0bbs_mxn: for each (_i,_j) of an m x n matrix, zero the incy consecutive chy elements starting at y + _i*incy + _j*ldy.
+#define bli_tset0bbs_mxn( chy, m, n, y, incy, ldy ) \
+{ \
+	/* Assume that the duplication factor is the row stride of y. */ \
+	const dim_t _d    = incy; \
+	const dim_t _ds_y = 1; \
+\
+	for ( dim_t _j = 0; _j < (n); ++_j ) \
+	{ \
+		PASTEMAC(chy,ctype)* restrict _yj = (PASTEMAC(chy,ctype)*)(y) + _j*(ldy); \
+\
+		for ( dim_t _i = 0; _i < (m); ++_i ) \
+		{ \
+			PASTEMAC(chy,ctype)* restrict _yij = _yj + _i*(incy); \
+\
+			for ( dim_t _p = 0; _p < _d; ++_p ) \
+			{ \
+				PASTEMAC(chy,ctype)* restrict _yijd = _yij + _p*_ds_y; \
+\
+				bli_tset0s( chy, *_yijd ); \
+			} \
+		} \
+	} \
+}
+
+// tbcastbbs_mxn: per element, copy the first real slot (and the slot incy reals later, passed as the imaginary part) into the next incy-1 slots.
+#define bli_tbcastbbs_mxn( chy, m, n, y, incy, ldy ) \
+{ \
+	/* Assume that the duplication factor is the row stride of y. */ \
+	const dim_t _d    = incy; \
+	const dim_t _ds_y = 1; \
+\
+	for ( dim_t _j = 0; _j < (n); ++_j ) \
+	{ \
+		PASTEMAC(chy,ctype)* restrict _yj = (PASTEMAC(chy,ctype)*)(y) + _j*(ldy); \
+\
+		for ( dim_t _i = 0; _i < (m); ++_i ) \
+		{ \
+			PASTEMAC(chy,ctyper)* restrict _yij_r = (PASTEMAC(chy,ctyper)*)( _yj + _i*(incy) ); \
+			PASTEMAC(chy,ctyper)* restrict _yij_i = _yij_r + (incy); \
+\
+			for ( dim_t _p = 1; _p < _d; ++_p ) \
+			{ \
+				PASTEMAC(chy,ctyper)* restrict _yijd_r = _yij_r + _p*_ds_y; \
+				PASTEMAC(chy,ctyper)* restrict _yijd_i = _yij_i + _p*_ds_y; (void)_yijd_i; \
+\
+				bli_tcopyris( chy,chy, *_yij_r, *_yij_i, *_yijd_r, *_yijd_i ); \
+			} \
+		} \
+	} \
+}
+// tset0s_edge: zero the edge of an m x n panel p (row stride 1, column stride ldp) that lies outside its leading i x j part.
+#define bli_tset0s_edge( chp, i, m, j, n, p, ldp ) \
+{ \
+	if ( (i) < (m) ) \
+	{ \
+		bli_tset0s_mxn \
+		( \
+		  chp, \
+		  (m) - (i), \
+		  j, \
+		  (p) + (i)*1, 1, ldp \
+		); \
+	} \
+	/* Rows i..m-1 of columns 0..j-1 are done above; columns j..n-1 (all m rows) follow. */ \
+	if ( (j) < (n) ) \
+	{ \
+		bli_tset0s_mxn \
+		( \
+		  chp, \
+		  m, \
+		  (n) - (j), \
+		  (p) + (j)*(ldp), 1, ldp \
+		); \
+	} \
+}
+
+// -- Legacy macros ------------------------------------------------------------
+// Same-type shorthands for bli_tset0s and bli_tset0s_edge; they belong inside the BLIS_TSETS_H include guard, which is closed below.
+#define bli_sset0s( x ) bli_tset0s( s, x )
+#define bli_dset0s( x ) bli_tset0s( d, x )
+#define bli_cset0s( x ) bli_tset0s( c, x )
+#define bli_zset0s( x ) bli_tset0s( z, x )
+
+#define bli_sset0s_edge( i, m, j, n, p, ldp ) bli_tset0s_edge( s, i, m, j, n, (float   *)(p), ldp )
+#define bli_dset0s_edge( i, m, j, n, p, ldp ) bli_tset0s_edge( d, i, m, j, n, (double  *)(p), ldp )
+#define bli_cset0s_edge( i, m, j, n, p, ldp ) bli_tset0s_edge( c, i, m, j, n, (scomplex*)(p), ldp )
+#define bli_zset0s_edge( i, m, j, n, p, ldp ) bli_tset0s_edge( z, i, m, j, n, (dcomplex*)(p), ldp )
+
+// -- Notes --------------------------------------------------------------------
+
+#endif
diff --git a/frame/include/level0/bli_tsqrt2s.h b/frame/include/level0/bli_tsqrt2s.h
new file mode 100644
index 000000000..a66384e3f
--- /dev/null
+++ b/frame/include/level0/bli_tsqrt2s.h
@@ -0,0 +1,197 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TSQRT2S_H
+#define BLIS_TSQRT2S_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// bli_tsqrt2ims: y := sqrt( x ).
+//
+// Dispatches on the domain of x by pasting dx onto "sqrtims" (this header
+// defines bli_rsqrtims and bli_csqrtims as the targets) and forwards all
+// arguments unchanged.
+//
+#define bli_tsqrt2ims( dx, px, xr, xi, dy, py, yr, yi, chc ) \
+{ \
+	PASTEMAC(dx,sqrtims)( \
+	  dx, px, xr, xi, \
+	  dy, py, yr, yi, \
+	  chc \
+	); \
+}
+
+// -- real domain implementation --
+
+// yr = sqrt( xr );
+// xr is cast from px to chc, the chc sqrt is applied and the result is cast to py; yi receives the py zero constant. dx and xi are not referenced.
+#define bli_rsqrtims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(dy,assigns) \
+	( \
+	  PASTEMAC(chc,py,tcast)( \
+	    PASTEMAC(chc,sqrt)( \
+	      PASTEMAC(px,chc,tcast)( xr )  \
+	    ) \
+	  ), \
+	  PASTEMAC(py,zero), \
+	  yr, \
+	  yi \
+	); \
+}
+
+// -- complex domain implementation --
+
+// mag = hypot( xr, xi );
+// tr  = sqrt( ( mag + xr ) / 2.0 );
+// ti  = sqrt( ( mag - xr ) / 2.0 );
+// yr  =           tr;
+// yi  = copysign( ti, xi );
+// All arithmetic is done in precision chc: xr and xi are cast from px wherever they are used (copysign included) and tr/ti are cast to py.
+#define bli_csqrtims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(ro,declinits) \
+	( \
+	  chc, \
+	  PASTEMAC(chc,hypot)( \
+	    PASTEMAC(px,chc,tcast)( xr ), \
+	    PASTEMAC(px,chc,tcast)( xi )  \
+	  ), \
+	  mag  \
+	) \
+	PASTEMAC(c,declinits) \
+	( \
+	  chc, \
+	  PASTEMAC(chc,sqrt)( \
+	    PASTEMAC(chc,div)( \
+	      PASTEMAC(chc,add)( \
+	        mag, \
+	        PASTEMAC(px,chc,tcast)( xr ) \
+	      ), \
+	      PASTEMAC(chc,two) \
+	    ) \
+	  ), \
+	  PASTEMAC(chc,sqrt)( \
+	    PASTEMAC(chc,div)( \
+	      PASTEMAC(chc,sub)( \
+	        mag, \
+	        PASTEMAC(px,chc,tcast)( xr ) \
+	      ), \
+	      PASTEMAC(chc,two) \
+	    ) \
+	  ), \
+	  tr, \
+	  ti \
+	) \
+	PASTEMAC(dy,assigns) \
+	( \
+	  PASTEMAC(chc,py,tcast)( tr ), \
+	  PASTEMAC(chc,py,tcast)( \
+	    PASTEMAC(chc,copysign)( ti, PASTEMAC(px,chc,tcast)( xi ) ) \
+	  ), \
+	  yr, \
+	  yi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tsqrt2s
+//
+// y := sqrt( x ) for whole scalars x (type chx) and y (type chy); chc
+// selects the precision in which the square root is evaluated.
+//
+#define bli_tsqrt2s( chx, chy, chc, x, y ) \
+        bli_tsqrt2ims( \
+          PASTEMAC(chx,dom), PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom), PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y), \
+          PASTEMAC(chc,prec) \
+        )
+
+// -- Exposed real/imaginary --
+
+// tsqrt2ris
+//
+// (yr, yi) := sqrt( (xr, xi) ) with both operands supplied as separate
+// real and imaginary parts; chc selects the working precision.
+//
+#define bli_tsqrt2ris( chx, chy, chc, xr, xi, yr, yi ) \
+        bli_tsqrt2ims( \
+          PASTEMAC(chx,dom), PASTEMAC(chx,prec), \
+          xr, \
+          xi, \
+          PASTEMAC(chy,dom), PASTEMAC(chy,prec), \
+          yr, yi, \
+          PASTEMAC(chc,prec) \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- Notes --------------------------------------------------------------------
+
+// -- Domain cases --
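+// Columns give the domain (r = real, c = complex) of y and x; "xx" marks an imaginary part that is not assigned because y is real.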
+//   r       r
+// (yr) := sqrt(xr);
+// (yi) xx       0 ;
+
+//   r       c
+// (yr) := sqrt(xr ...);
+// (yi) xx copysign(xi ...);
+
+//   c       r
+// (yr) := sqrt(xr);
+// (yi) :=       0 ;
+
+//   c       c
+// (yr) := sqrt(xr ... );
+// (yi) := copysign(xi ...);
+
+#endif
+
diff --git a/frame/include/level0/bli_tsubs.h b/frame/include/level0/bli_tsubs.h
new file mode 100644
index 000000000..7563bc561
--- /dev/null
+++ b/frame/include/level0/bli_tsubs.h
@@ -0,0 +1,161 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TSUBS_H
+#define BLIS_TSUBS_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (yr) = (yr) - (xr);
+// (yi) = (yi) - (xi);
+// Both operands are cast to precision chc (y from py, x from px), subtracted there, and the differences are cast back to py. dx is not referenced.
+#define bli_tsubims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi, \
+          chc  \
+        ) \
+{ \
+	PASTEMAC(dy,assigns) \
+	( \
+	  PASTEMAC(chc,py,tcast)( \
+	    PASTEMAC(chc,sub)( \
+	      PASTEMAC(py,chc,tcast)(yr), \
+	      PASTEMAC(px,chc,tcast)(xr)  \
+	    ) \
+	  ),\
+	  PASTEMAC(chc,py,tcast)( \
+	    PASTEMAC(chc,sub)( \
+	      PASTEMAC(py,chc,tcast)(yi), \
+	      PASTEMAC(px,chc,tcast)(xi) \
+	    ) \
+	  ), \
+	  yr, \
+	  yi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tsubs
+//
+// y := y - x for whole scalars x (type chx) and y (type chy); chc selects
+// the precision in which the subtraction is carried out.
+//
+#define bli_tsubs( chx, chy, chc, x, y ) \
+        bli_tsubims( \
+          PASTEMAC(chx,dom), PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom), PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y), \
+          PASTEMAC(chc,prec) \
+        )
+
+// tsubjs
+//
+// y := y - conj(x). Same as bli_tsubs except that the imaginary part of x
+// is negated, using the neg macro of x's own precision, before it is
+// forwarded to bli_tsubims.
+//
+#define bli_tsubjs( chx, chy, chc, x, y ) \
+        bli_tsubims( \
+          PASTEMAC(chx,dom), PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( PASTEMAC(chx,imag)(x) ), \
+          PASTEMAC(chy,dom), PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y), \
+          PASTEMAC(chc,prec) \
+        )
+
+// -- Exposed real/imaginary --
+
+// tsubris
+//
+// (yr, yi) := (yr, yi) - (xr, xi) with both operands supplied as separate
+// real and imaginary parts; chc selects the working precision.
+//
+#define bli_tsubris( chx, chy, chc, xr, xi, yr, yi ) \
+        bli_tsubims( \
+          PASTEMAC(chx,dom), PASTEMAC(chx,prec), \
+          xr, \
+          xi, \
+          PASTEMAC(chy,dom), PASTEMAC(chy,prec), \
+          yr, yi, \
+          PASTEMAC(chc,prec) \
+        )
+
+// tsubjris
+//
+// (yr, yi) := (yr, yi) - conj( (xr, xi) ) with both operands supplied as
+// separate real and imaginary parts; xi is negated with the neg macro of
+// x's own precision before it is forwarded to bli_tsubims.
+//
+#define bli_tsubjris( chx, chy, chc, xr, xi, yr, yi ) \
+        bli_tsubims( \
+          PASTEMAC(chx,dom), PASTEMAC(chx,prec), \
+          xr, \
+          PASTEMAC(PASTEMAC(chx,prec),neg)( xi ), \
+          PASTEMAC(chy,dom), PASTEMAC(chy,prec), \
+          yr, yi, \
+          PASTEMAC(chc,prec) \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- Notes --------------------------------------------------------------------
+
+// -- Domain cases --
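+// Columns give the domain (r = real, c = complex) of y and x; "xx" marks an imaginary part that is not assigned because y is real.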
+//   r       r
+// (yr) -= (xr);
+// (yi) xx   0 ;
+
+//   r       c
+// (yr) -= (xr);
+// (yi) xx (xi);
+
+//   c       r
+// (yr) -= (xr);
+// (yi) -=   0 ;
+
+//   c       c
+// (yr) -= (xr);
+// (yi) -= (xi);
+
+#endif
+
diff --git a/frame/include/level0/bli_tswaps.h b/frame/include/level0/bli_tswaps.h
new file mode 100644
index 000000000..fd7f4dfeb
--- /dev/null
+++ b/frame/include/level0/bli_tswaps.h
@@ -0,0 +1,154 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TSWAPS_H
+#define BLIS_TSWAPS_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (tr) := (yr);
+// (ti) := (yi);
+// (yr) := (xr);
+// (yi) := (xi);
+// (xr) := (tr);
+// (xi) := (ti);
+
+#define bli_tswapims( \
+          \
+          dx, px, xr, xi, \
+          dy, py, yr, yi \
+        ) \
+{ \
+	/* tr/ti save y before it is overwritten. declinits must be keyed on dx (or 'c')
+	   so that for chy chx = r c, ti is set to zero (it is later assigned to xi).
+	   For c r, declinits leaves ti unset, but the dx assigns does not use it. */ \
+	PASTEMAC(dx,declinits)( py, yr, yi, tr, ti ) \
+	PASTEMAC(dy,assigns) \
+	( \
+	  PASTEMAC(px,py,tcast)(xr),\
+	  PASTEMAC(px,py,tcast)(xi), \
+	  yr, \
+	  yi \
+	); \
+	PASTEMAC(dx,assigns) \
+	( \
+	  PASTEMAC(py,px,tcast)(tr),\
+	  PASTEMAC(py,px,tcast)(ti), \
+	  xr, \
+	  xi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// tswaps: exchange the values of x (type chx) and y (type chy) by forwarding their parts to bli_tswapims.
+#define bli_tswaps( chx, chy, x, y ) \
+        bli_tswapims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+          PASTEMAC(chx,real)(x), \
+          PASTEMAC(chx,imag)(x), \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+          PASTEMAC(chy,real)(y), \
+          PASTEMAC(chy,imag)(y) \
+        )
+
+// -- Exposed real/imaginary --
+
+// tswapris: exchange x and y, with the real and imaginary parts of each passed as separate arguments.
+#define bli_tswapris( chx, chy, xr, xi, yr, yi ) \
+        bli_tswapims \
+        ( \
+          PASTEMAC(chx,dom),  \
+          PASTEMAC(chx,prec), \
+                     xr, \
+                     xi, \
+          PASTEMAC(chy,dom),  \
+          PASTEMAC(chy,prec), \
+                     yr, \
+                     yi \
+        )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- Legacy macros ------------------------------------------------------------
+
+// Same-type swaps for the s, d, c, and z datatypes, each expressed via bli_tswaps.
+#define bli_sswaps( x, y ) bli_tswaps( s,s, x, y )
+#define bli_dswaps( x, y ) bli_tswaps( d,d, x, y )
+#define bli_cswaps( x, y ) bli_tswaps( c,c, x, y )
+#define bli_zswaps( x, y ) bli_tswaps( z,z, x, y )
+
+// -- Notes --------------------------------------------------------------------
+
+// -- Domain cases --
+
+// chy chx: r  r
+// (tr) := (yr);
+// (ti) xx   0 ;
+// (yr) := (xr);
+// (yi) xx (xi);
+// (xr) := (tr);
+// (xi) xx (ti);
+
+// chy chx: r  c
+// (tr) := (yr);
+// (ti) :=   0 ;
+// (yr) := (xr);
+// (yi) xx (xi);
+// (xr) := (tr);
+// (xi) := (ti);
+
+// chy chx: c  r
+// (tr) := (yr);
+// (ti) xx (yi);
+// (yr) := (xr);
+// (yi) :=   0 ;
+// (xr) := (tr);
+// (xi) xx (ti);
+
+// chy chx: c  c
+// (tr) := (yr);
+// (ti) := (yi);
+// (yr) := (xr);
+// (yi) := (xi);
+// (xr) := (tr);
+// (xi) := (ti);
+
+#endif
+
diff --git a/frame/include/level0/bli_txpbys.h b/frame/include/level0/bli_txpbys.h
new file mode 100644
index 000000000..122ddc465
--- /dev/null
+++ b/frame/include/level0/bli_txpbys.h
@@ -0,0 +1,325 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_TXPBYS_H
+#define BLIS_TXPBYS_H
+
+// -- Implementation macro -----------------------------------------------------
+
+// (yr) := (xr) + (br) * (yr) - (bi) * (yi);
+// (yi) := (xi) + (bi) * (yr) + (br) * (yi);
+// Sums/products use precision chc and are cast to py; both results land in tr/ti before y is assigned.
+#define bli_txpbyims( \
+	      \
+	      dx, px, xr, xi, \
+	      db, pb, br, bi, \
+	      dy, py, yr, yi, \
+	      chc  \
+	    ) \
+{ \
+	PASTEMAC(c,declinits) \
+	( \
+	  py, \
+	  PASTEMAC(chc,py,tcast)( \
+	    PASTEMAC(chc,add)( \
+	      PASTEMAC(px,chc,tcast)(xr), \
+	      PASTEMAC(chc,sub)( \
+	        PASTEMAC(db,dy,termrr)( \
+	          chc, \
+	          PASTEMAC(chc,mul)( \
+	            PASTEMAC(pb,chc,tcast)(br), \
+	            PASTEMAC(py,chc,tcast)(yr) \
+	          )  \
+	        ), \
+	        PASTEMAC(db,dy,termii)( \
+	          chc, \
+	          PASTEMAC(chc,mul)( \
+	            PASTEMAC(pb,chc,tcast)(bi), \
+	            PASTEMAC(py,chc,tcast)(yi) \
+	          ) \
+	        ) \
+	      ) \
+	    ) \
+	  ),\
+	  PASTEMAC(chc,py,tcast)( \
+	    PASTEMAC(chc,add)( \
+	      PASTEMAC(px,chc,tcast)(xi), \
+	      PASTEMAC(chc,add)( \
+	        PASTEMAC(db,dy,termir)( \
+	          chc, \
+	          PASTEMAC(chc,mul)( \
+	            PASTEMAC(pb,chc,tcast)(bi), \
+	            PASTEMAC(py,chc,tcast)(yr) \
+	          )  \
+	        ), \
+	        PASTEMAC(db,dy,termri)( \
+	          chc, \
+	          PASTEMAC(chc,mul)( \
+	            PASTEMAC(pb,chc,tcast)(br), \
+	            PASTEMAC(py,chc,tcast)(yi) \
+	          ) \
+	        ) \
+	      ) \
+	    ) \
+	  ), \
+	  tr, \
+	  ti \
+	); \
+	PASTEMAC(dy,assigns) \
+	( \
+	  tr, \
+	  ti, \
+	  yr, \
+	  yi \
+	); \
+}
+
+// -- API macros ---------------------------------------------------------------
+
+// -- Consolidated --
+
+// txpbys: y := x + b * y, where chx/chb/chy are the types of x, b, y and chc supplies the computation precision.
+#define bli_txpbys( chx, chb, chy, chc, x, b, y ) \
+	    bli_txpbyims \
+	    ( \
+	      PASTEMAC(chx,dom),  \
+	      PASTEMAC(chx,prec), \
+	      PASTEMAC(chx,real)(x), \
+	      PASTEMAC(chx,imag)(x), \
+	      PASTEMAC(chb,dom),  \
+	      PASTEMAC(chb,prec), \
+	      PASTEMAC(chb,real)(b), \
+	      PASTEMAC(chb,imag)(b), \
+	      PASTEMAC(chy,dom),  \
+	      PASTEMAC(chy,prec), \
+	      PASTEMAC(chy,real)(y), \
+	      PASTEMAC(chy,imag)(y), \
+	      PASTEMAC(chc,prec)  \
+	    )
+
+// txpbyjs: y := conj(x) + b * y; the imaginary part of x is negated (in x's precision) before forwarding.
+#define bli_txpbyjs( chx, chb, chy, chc, x, b, y ) \
+	    bli_txpbyims \
+	    ( \
+	      PASTEMAC(chx,dom),  \
+	      PASTEMAC(chx,prec), \
+	      PASTEMAC(chx,real)(x), \
+	      PASTEMAC(PASTEMAC(chx,prec),neg)( \
+	        PASTEMAC(chx,imag)(x)  \
+	      ), \
+	      PASTEMAC(chb,dom),  \
+	      PASTEMAC(chb,prec), \
+	      PASTEMAC(chb,real)(b), \
+	      PASTEMAC(chb,imag)(b), \
+	      PASTEMAC(chy,dom),  \
+	      PASTEMAC(chy,prec), \
+	      PASTEMAC(chy,real)(y), \
+	      PASTEMAC(chy,imag)(y), \
+	      PASTEMAC(chc,prec)  \
+	    )
+
+// -- Exposed real/imaginary --
+
+// txpbyris: y := x + b * y, with the real and imaginary parts of x, b, and y passed as separate arguments.
+#define bli_txpbyris( chx, chb, chy, chc, xr, xi, br, bi, yr, yi ) \
+	    bli_txpbyims \
+	    ( \
+	      PASTEMAC(chx,dom),  \
+	      PASTEMAC(chx,prec), \
+	                 xr, \
+	                 xi, \
+	      PASTEMAC(chb,dom),  \
+	      PASTEMAC(chb,prec), \
+	                 br, \
+	                 bi, \
+	      PASTEMAC(chy,dom),  \
+	      PASTEMAC(chy,prec), \
+	                 yr, \
+	                 yi, \
+	      PASTEMAC(chc,prec)  \
+	    )
+
+// txpbyjris: y := conj(x) + b * y, with real/imaginary parts passed separately; xi is negated in x's precision.
+#define bli_txpbyjris( chx, chb, chy, chc, xr, xi, br, bi, yr, yi ) \
+	    bli_txpbyims \
+	    ( \
+	      PASTEMAC(chx,dom),  \
+	      PASTEMAC(chx,prec), \
+	                 xr, \
+	      PASTEMAC(PASTEMAC(chx,prec),neg)( \
+	                 xi ), \
+	      PASTEMAC(chb,dom),  \
+	      PASTEMAC(chb,prec), \
+	                 br, \
+	                 bi, \
+	      PASTEMAC(chy,dom),  \
+	      PASTEMAC(chy,prec), \
+	                 yr, \
+	                 yi, \
+	      PASTEMAC(chc,prec)  \
+	    )
+
+// -- Higher-level static functions --------------------------------------------
+
+// -- mxn --
+
+// txpbys_mxn: y := x + beta * y over an m x n matrix with the given row/column strides; beta is passed by pointer.
+#define bli_txpbys_mxn( chx, chb, chy, chc, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
+{ \
+\
+	/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
+	if ( bli_teq0s( chb, *(beta) ) ) \
+	{ \
+		bli_tcopys_mxn( chx, chy, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
+	} \
+	else \
+	{ \
+		for ( dim_t jj = 0; jj < n; ++jj ) \
+		for ( dim_t ii = 0; ii < m; ++ii ) \
+		{ \
+			PASTEMAC(chx,ctype)* restrict xij = ( PASTEMAC(chx,ctype)* )(x) + ii*(rs_x) + jj*(cs_x); \
+			PASTEMAC(chy,ctype)* restrict yij = ( PASTEMAC(chy,ctype)* )(y) + ii*(rs_y) + jj*(cs_y); \
+\
+			bli_txpbys( chx,chb,chy,chc, *xij, *(beta), *yij ); \
+		} \
+	} \
+}
+
+// txpbys_mxn_uplo: as txpbys_mxn, but only touches elements (ii,jj) with jj-ii >= diagoff (upper) or jj-ii <= diagoff (otherwise).
+#define bli_txpbys_mxn_uplo( chx, chb, chy, chc, diagoff, uplo, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
+{ \
+	if ( bli_is_upper( uplo ) ) \
+	{ \
+		/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
+		if ( bli_teq0s( chb, *(beta) ) ) \
+		{ \
+			for ( dim_t jj = 0; jj < n; ++jj ) \
+			for ( dim_t ii = 0; ii < m; ++ii ) \
+			{ \
+				if ( (doff_t)jj - (doff_t)ii >= (diagoff) ) \
+				{ \
+					PASTEMAC(chx,ctype)* restrict xij = ( PASTEMAC(chx,ctype)* )(x) + ii*(rs_x) + jj*(cs_x); \
+					PASTEMAC(chy,ctype)* restrict yij = ( PASTEMAC(chy,ctype)* )(y) + ii*(rs_y) + jj*(cs_y); \
+\
+					bli_tcopys( chx,chy, *xij, *yij ); \
+				} \
+			} \
+		} \
+		else \
+		{ \
+			for ( dim_t jj = 0; jj < n; ++jj ) \
+			for ( dim_t ii = 0; ii < m; ++ii ) \
+			{ \
+				if ( (doff_t)jj - (doff_t)ii >= (diagoff) ) \
+				{ \
+					PASTEMAC(chx,ctype)* restrict xij = ( PASTEMAC(chx,ctype)* )(x) + ii*(rs_x) + jj*(cs_x); \
+					PASTEMAC(chy,ctype)* restrict yij = ( PASTEMAC(chy,ctype)* )(y) + ii*(rs_y) + jj*(cs_y); \
+\
+					bli_txpbys( chx,chb,chy,chc, *xij, *(beta), *yij ); \
+				} \
+			} \
+		} \
+	} \
+	else /* if ( bli_is_lower( uplo ) ) */ \
+	{ \
+		/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
+		if ( bli_teq0s( chb, *(beta) ) ) \
+		{ \
+			for ( dim_t jj = 0; jj < n; ++jj ) \
+			for ( dim_t ii = 0; ii < m; ++ii ) \
+			{ \
+				if ( (doff_t)jj - (doff_t)ii <= (diagoff) ) \
+				{ \
+					PASTEMAC(chx,ctype)* restrict xij = ( PASTEMAC(chx,ctype)* )(x) + ii*(rs_x) + jj*(cs_x); \
+					PASTEMAC(chy,ctype)* restrict yij = ( PASTEMAC(chy,ctype)* )(y) + ii*(rs_y) + jj*(cs_y); \
+\
+					bli_tcopys( chx,chy, *xij, *yij ); \
+				} \
+			} \
+		} \
+		else \
+		{ \
+			for ( dim_t jj = 0; jj < n; ++jj ) \
+			for ( dim_t ii = 0; ii < m; ++ii ) \
+			{ \
+				if ( (doff_t)jj - (doff_t)ii <= (diagoff) ) \
+				{ \
+					PASTEMAC(chx,ctype)* restrict xij = ( PASTEMAC(chx,ctype)* )(x) + ii*(rs_x) + jj*(cs_x); \
+					PASTEMAC(chy,ctype)* restrict yij = ( PASTEMAC(chy,ctype)* )(y) + ii*(rs_y) + jj*(cs_y); \
+\
+					bli_txpbys( chx,chb,chy,chc, *xij, *(beta), *yij ); \
+				} \
+			} \
+		} \
+	} \
+}
+
+// -- Notes --------------------------------------------------------------------
+
+// -- Domain cases --
+
+//   r       r      r
+// (yr) := (xr) + (br) * (yr) -   0  *   0 ;
+// (yi) xx   0  +   0  * (yr) + (br) *   0 ;
+
+//   r       r      c
+// (yr) := (xr) + (br) * (yr) - (bi) *   0 ;
+// (yi) xx   0  + (bi) * (yr) + (br) *   0 ;
+
+//   r       c      r
+// (yr) := (xr) + (br) * (yr) -   0  *   0 ;
+// (yi) xx (xi) +   0  * (yr) + (br) *   0 ;
+
+//   r       c      c
+// (yr) := (xr) + (br) * (yr) - (bi) *   0 ;
+// (yi) xx (xi) + (bi) * (yr) + (br) *   0 ;
+
+//   c       r      r
+// (yr) := (xr) + (br) * (yr) -   0  * (yi);
+// (yi) :=   0  +   0  * (yr) + (br) * (yi);
+
+//   c       r      c
+// (yr) := (xr) + (br) * (yr) - (bi) * (yi);
+// (yi) :=   0  + (bi) * (yr) + (br) * (yi);
+
+//   c       c      r
+// (yr) := (xr) + (br) * (yr) -   0  * (yi);
+// (yi) := (xi) +   0  * (yr) + (br) * (yi);
+
+//   c       c      c
+// (yr) := (xr) + (br) * (yr) - (bi) * (yi);
+// (yi) := (xi) + (bi) * (yr) + (br) * (yi);
+
+#endif
+
diff --git a/frame/include/level0/bli_xpbyjs.h b/frame/include/level0/bli_xpbyjs.h
deleted file mode 100644
index 312623737..000000000
--- a/frame/include/level0/bli_xpbyjs.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_XPBYJS_H
-#define BLIS_XPBYJS_H
-
-// xpbyjs
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of b.
-// - The third char encodes the type of y.
-
-// -- (xby) = (??s) ------------------------------------------------------------
-
-#define bli_sssxpbyjs( x, b, y )  bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dssxpbyjs( x, b, y )  bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cssxpbyjs( x, b, y )  bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zssxpbyjs( x, b, y )  bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdsxpbyjs( x, b, y )  bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_ddsxpbyjs( x, b, y )  bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cdsxpbyjs( x, b, y )  bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zdsxpbyjs( x, b, y )  bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-
-#define bli_scsxpbyjs( x, b, y )  bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dcsxpbyjs( x, b, y )  bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_ccsxpbyjs( x, b, y )  bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zcsxpbyjs( x, b, y )  bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-
-#define bli_szsxpbyjs( x, b, y )  bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dzsxpbyjs( x, b, y )  bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_czsxpbyjs( x, b, y )  bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zzsxpbyjs( x, b, y )  bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-
-// -- (xby) = (??d) ------------------------------------------------------------
-
-#define bli_ssdxpbyjs( x, b, y )  bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dsdxpbyjs( x, b, y )  bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_csdxpbyjs( x, b, y )  bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zsdxpbyjs( x, b, y )  bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-
-#define bli_sddxpbyjs( x, b, y )  bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dddxpbyjs( x, b, y )  bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cddxpbyjs( x, b, y )  bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zddxpbyjs( x, b, y )  bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-
-#define bli_scdxpbyjs( x, b, y )  bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dcdxpbyjs( x, b, y )  bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_ccdxpbyjs( x, b, y )  bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zcdxpbyjs( x, b, y )  bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-
-#define bli_szdxpbyjs( x, b, y )  bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dzdxpbyjs( x, b, y )  bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_czdxpbyjs( x, b, y )  bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zzdxpbyjs( x, b, y )  bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-// -- (xby) = (??c) ------------------------------------------------------------
-
-#define bli_sscxpbyjs( x, b, y )  bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dscxpbyjs( x, b, y )  bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cscxpbyjs( x, b, y )  bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zscxpbyjs( x, b, y )  bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-
-#define bli_sdcxpbyjs( x, b, y )  bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_ddcxpbyjs( x, b, y )  bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cdcxpbyjs( x, b, y )  bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zdcxpbyjs( x, b, y )  bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-
-#define bli_sccxpbyjs( x, b, y )  bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dccxpbyjs( x, b, y )  bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cccxpbyjs( x, b, y )  bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zccxpbyjs( x, b, y )  bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-
-#define bli_szcxpbyjs( x, b, y )  bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dzcxpbyjs( x, b, y )  bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_czcxpbyjs( x, b, y )  bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zzcxpbyjs( x, b, y )  bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-
-// -- (xby) = (??z) ------------------------------------------------------------
-
-#define bli_sszxpbyjs( x, b, y )  bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dszxpbyjs( x, b, y )  bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cszxpbyjs( x, b, y )  bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zszxpbyjs( x, b, y )  bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-
-#define bli_sdzxpbyjs( x, b, y )  bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_ddzxpbyjs( x, b, y )  bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cdzxpbyjs( x, b, y )  bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zdzxpbyjs( x, b, y )  bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-
-#define bli_sczxpbyjs( x, b, y )  bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dczxpbyjs( x, b, y )  bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cczxpbyjs( x, b, y )  bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zczxpbyjs( x, b, y )  bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-
-#define bli_szzxpbyjs( x, b, y )  bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dzzxpbyjs( x, b, y )  bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_czzxpbyjs( x, b, y )  bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zzzxpbyjs( x, b, y )  bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-// -- (xby) = (??c) ------------------------------------------------------------
-
-#define bli_sscxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_dscxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_cscxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_zscxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-
-#define bli_sdcxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_ddcxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_cdcxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_zdcxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-
-#define bli_sccxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_dccxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_cccxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_zccxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-
-#define bli_szcxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_dzcxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_czcxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_zzcxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-
-// -- (xby) = (??z) ------------------------------------------------------------
-
-#define bli_sszxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_dszxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_cszxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_zszxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-
-#define bli_sdzxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_ddzxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_cdzxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_zdzxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-
-#define bli_sczxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_dczxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_cczxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_zczxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-
-#define bli_szzxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_dzzxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_czzxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_zzzxpbyjs( x, b, y )  { (y) = (x) + (b) * (y); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_sxpbyjs( x, b, y )  bli_sssxpbyjs( x, b, y )
-#define bli_dxpbyjs( x, b, y )  bli_dddxpbyjs( x, b, y )
-#define bli_cxpbyjs( x, b, y )  bli_cccxpbyjs( x, b, y )
-#define bli_zxpbyjs( x, b, y )  bli_zzzxpbyjs( x, b, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_xpbys.h b/frame/include/level0/bli_xpbys.h
deleted file mode 100644
index f57fec76c..000000000
--- a/frame/include/level0/bli_xpbys.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_XPBYS_H
-#define BLIS_XPBYS_H
-
-// xpbys
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of b.
-// - The third char encodes the type of y.
-
-// -- (xby) = (??s) ------------------------------------------------------------
-
-#define bli_sssxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dssxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cssxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zssxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
-
-#define bli_sdsxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_ddsxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_cdsxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zdsxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
-
-#define bli_scsxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dcsxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_ccsxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zcsxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
-
-#define bli_szsxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_dzsxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_czsxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-#define bli_zzsxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
-
-// -- (xby) = (??d) ------------------------------------------------------------
-
-#define bli_ssdxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dsdxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_csdxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zsdxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
-
-#define bli_sddxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dddxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_cddxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zddxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
-
-#define bli_scdxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dcdxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_ccdxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zcdxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
-
-#define bli_szdxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_dzdxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_czdxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-#define bli_zzdxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-// -- (xby) = (??c) ------------------------------------------------------------
-
-#define bli_sscxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dscxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cscxpbys( x, b, y )  bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zscxpbys( x, b, y )  bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
-
-#define bli_sdcxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_ddcxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cdcxpbys( x, b, y )  bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zdcxpbys( x, b, y )  bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
-
-#define bli_sccxpbys( x, b, y )  bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dccxpbys( x, b, y )  bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_cccxpbys( x, b, y )  bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zccxpbys( x, b, y )  bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
-
-#define bli_szcxpbys( x, b, y )  bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_dzcxpbys( x, b, y )  bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_czcxpbys( x, b, y )  bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-#define bli_zzcxpbys( x, b, y )  bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
-
-// -- (xby) = (??z) ------------------------------------------------------------
-
-#define bli_sszxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dszxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cszxpbys( x, b, y )  bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zszxpbys( x, b, y )  bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
-
-#define bli_sdzxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_ddzxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cdzxpbys( x, b, y )  bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zdzxpbys( x, b, y )  bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
-
-#define bli_sczxpbys( x, b, y )  bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dczxpbys( x, b, y )  bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_cczxpbys( x, b, y )  bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zczxpbys( x, b, y )  bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
-
-#define bli_szzxpbys( x, b, y )  bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_dzzxpbys( x, b, y )  bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_czzxpbys( x, b, y )  bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-#define bli_zzzxpbys( x, b, y )  bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-// -- (xby) = (??c) ------------------------------------------------------------
-
-#define bli_sscxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_dscxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_cscxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_zscxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-
-#define bli_sdcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_ddcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_cdcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_zdcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-
-#define bli_sccxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_dccxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_cccxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_zccxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-
-#define bli_szcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_dzcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_czcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_zzcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-
-// -- (xby) = (??z) ------------------------------------------------------------
-
-#define bli_sszxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_dszxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_cszxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_zszxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-
-#define bli_sdzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_ddzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_cdzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_zdzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-
-#define bli_sczxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_dczxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_cczxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_zczxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-
-#define bli_szzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_dzzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_czzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-#define bli_zzzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_sxpbys( x, b, y )  bli_sssxpbys( x, b, y )
-#define bli_dxpbys( x, b, y )  bli_dddxpbys( x, b, y )
-#define bli_cxpbys( x, b, y )  bli_cccxpbys( x, b, y )
-#define bli_zxpbys( x, b, y )  bli_zzzxpbys( x, b, y )
-
-
-#endif
-
diff --git a/frame/include/level0/bli_xpbys_mxn.h b/frame/include/level0/bli_xpbys_mxn.h
deleted file mode 100644
index d3174289f..000000000
--- a/frame/include/level0/bli_xpbys_mxn.h
+++ /dev/null
@@ -1,830 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_XPBYS_MXN_H
-#define BLIS_XPBYS_MXN_H
-
-// xpbys_mxn
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of b.
-// - The third char encodes the type of y.
-// - We only implement cases where typeof(b) == typeof(y).
-
-#undef  BLIS_ENABLE_CR_CASES
-#define BLIS_ENABLE_CR_CASES 0
-
-// -- bli_???xpbys_mxn --
-
-#undef  GENTFUNC2
-#define GENTFUNC2( ctypex, ctypey, chx, chy, opname, kername ) \
-\
-BLIS_INLINE void PASTEMAC(chx,chy,chy,opname) \
-     ( \
-       const dim_t   m, \
-       const dim_t   n, \
-       const ctypex* x, inc_t rs_x, inc_t cs_x, \
-       const ctypey* beta, \
-             ctypey* y, inc_t rs_y, inc_t cs_y  \
-     ) \
-{ \
-	/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
-	if ( PASTEMAC(chy,eq0)( *beta ) ) \
-	{ \
-		PASTEMAC(chx,chy,copys_mxn)( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
-		return; \
-	} \
-\
-	if      ( BLIS_ENABLE_CR_CASES && rs_x == 1 && rs_y == 1 ) \
-	{ \
-		for ( dim_t jj = 0; jj < n; ++jj ) \
-		for ( dim_t ii = 0; ii < m; ++ii ) \
-		PASTEMAC(chx,chy,chy,kername) \
-		( \
-		  *(x + ii + jj*cs_x), *beta, \
-		  *(y + ii + jj*cs_y) \
-		); \
-	} \
-	else if ( BLIS_ENABLE_CR_CASES && cs_x == 1 && cs_y == 1 ) \
-	{ \
-		for ( dim_t ii = 0; ii < m; ++ii ) \
-		for ( dim_t jj = 0; jj < n; ++jj ) \
-		PASTEMAC(chx,chy,chy,kername) \
-		( \
-		  *(x + ii*rs_x + jj), *beta, \
-		  *(y + ii*rs_y + jj) \
-		); \
-	} \
-	else \
-	{ \
-		for ( dim_t jj = 0; jj < n; ++jj ) \
-		for ( dim_t ii = 0; ii < m; ++ii ) \
-		PASTEMAC(chx,chy,chy,kername) \
-		( \
-		  *(x + ii*rs_x + jj*cs_x), *beta, \
-		  *(y + ii*rs_y + jj*cs_y) \
-		); \
-	} \
-}
-
-INSERT_GENTFUNC2_BASIC ( xpbys_mxn, xpbys )
-INSERT_GENTFUNC2_MIX_DP( xpbys_mxn, xpbys )
-
-
-// -- bli_?xpbys_mxn --
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-BLIS_INLINE void PASTEMAC(ch,opname) \
-     ( \
-       const dim_t  m, \
-       const dim_t  n, \
-       const ctype* x, inc_t rs_x, inc_t cs_x, \
-       const ctype* beta, \
-             ctype* y, inc_t rs_y, inc_t cs_y  \
-     ) \
-{ \
-    PASTEMAC(ch,ch,ch,opname)( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
-}
-
-INSERT_GENTFUNC_BASIC( xpbys_mxn )
-
-
-
-#if 0
-// -- (xby) = (?ss) ------------------------------------------------------------
-
-BLIS_INLINE void bli_sssxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-       const float*    restrict beta,
-             float*    restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
-	if ( bli_seq0( *beta ) )
-	{
-		bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-		return;
-	}
-
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_sssxpbys( *(x + ii + jj*cs_x), *beta,
-		              *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_sssxpbys( *(x + ii*rs_x + jj), *beta,
-		              *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
-		              *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_dssxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-       const float*    restrict beta,
-             float*    restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
-	if ( bli_seq0( *beta ) )
-	{
-		bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-		return;
-	}
-
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dssxpbys( *(x + ii + jj*cs_x), *beta,
-		              *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_dssxpbys( *(x + ii*rs_x + jj), *beta,
-		              *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
-		              *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_cssxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       const float*    restrict beta,
-             float*    restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
-	if ( bli_seq0( *beta ) )
-	{
-		bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-		return;
-	}
-
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_cssxpbys( *(x + ii + jj*cs_x), *beta,
-		              *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_cssxpbys( *(x + ii*rs_x + jj), *beta,
-		              *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
-		              *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_zssxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       const float*    restrict beta,
-             float*    restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
-	if ( bli_seq0( *beta ) )
-	{
-		bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-		return;
-	}
-
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zssxpbys( *(x + ii + jj*cs_x), *beta,
-		              *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_zssxpbys( *(x + ii*rs_x + jj), *beta,
-		              *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
-		              *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-
-// -- (xby) = (?dd) ------------------------------------------------------------
-
-BLIS_INLINE void bli_sddxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-       const double*   restrict beta,
-             double*   restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
-	if ( bli_deq0( *beta ) )
-	{
-		bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-		return;
-	}
-
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_sddxpbys( *(x + ii + jj*cs_x), *beta,
-		              *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_sddxpbys( *(x + ii*rs_x + jj), *beta,
-		              *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
-		              *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_dddxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-       const double*   restrict beta,
-             double*   restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
-	if ( bli_deq0( *beta ) )
-	{
-		bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-		return;
-	}
-
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dddxpbys( *(x + ii + jj*cs_x), *beta,
-		              *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_dddxpbys( *(x + ii*rs_x + jj), *beta,
-		              *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
-		              *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_cddxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       const double*   restrict beta,
-             double*   restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
-	if ( bli_deq0( *beta ) )
-	{
-		bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-		return;
-	}
-
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_cddxpbys( *(x + ii + jj*cs_x), *beta,
-		              *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_cddxpbys( *(x + ii*rs_x + jj), *beta,
-		              *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
-		              *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_zddxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       const double*   restrict beta,
-             double*   restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
-	if ( bli_deq0( *beta ) )
-	{
-		bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-		return;
-	}
-
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zddxpbys( *(x + ii + jj*cs_x), *beta,
-		              *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_zddxpbys( *(x + ii*rs_x + jj), *beta,
-		              *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
-		              *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-
-// -- (xby) = (?cc) ------------------------------------------------------------
-
-BLIS_INLINE void bli_sccxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-       const scomplex* restrict beta,
-             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
-	if ( bli_ceq0( *beta ) )
-	{
-		bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-		return;
-	}
-
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_sccxpbys( *(x + ii + jj*cs_x), *beta,
-		              *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_sccxpbys( *(x + ii*rs_x + jj), *beta,
-		              *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
-		              *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_dccxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-       const scomplex* restrict beta,
-             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
-	if ( bli_ceq0( *beta ) )
-	{
-		bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-		return;
-	}
-
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dccxpbys( *(x + ii + jj*cs_x), *beta,
-		              *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_dccxpbys( *(x + ii*rs_x + jj), *beta,
-		              *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
-		              *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_cccxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       const scomplex* restrict beta,
-             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
-	if ( bli_ceq0( *beta ) )
-	{
-		bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-		return;
-	}
-
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_cccxpbys( *(x + ii + jj*cs_x), *beta,
-		              *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_cccxpbys( *(x + ii*rs_x + jj), *beta,
-		              *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
-		              *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_zccxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       const scomplex* restrict beta,
-             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
-	if ( bli_ceq0( *beta ) )
-	{
-		bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-		return;
-	}
-
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zccxpbys( *(x + ii + jj*cs_x), *beta,
-		              *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_zccxpbys( *(x + ii*rs_x + jj), *beta,
-		              *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
-		              *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-
-// -- (xby) = (?zz) ------------------------------------------------------------
-
-BLIS_INLINE void bli_szzxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-       const dcomplex* restrict beta,
-             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
-	if ( bli_zeq0( *beta ) )
-	{
-		bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-		return;
-	}
-
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_szzxpbys( *(x + ii + jj*cs_x), *beta,
-		              *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_szzxpbys( *(x + ii*rs_x + jj), *beta,
-		              *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
-		              *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_dzzxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-       const dcomplex* restrict beta,
-             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
-	if ( bli_zeq0( *beta ) )
-	{
-		bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-		return;
-	}
-
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dzzxpbys( *(x + ii + jj*cs_x), *beta,
-		              *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_dzzxpbys( *(x + ii*rs_x + jj), *beta,
-		              *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
-		              *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_czzxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       const dcomplex* restrict beta,
-             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
-	if ( bli_zeq0( *beta ) )
-	{
-		bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-		return;
-	}
-
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_czzxpbys( *(x + ii + jj*cs_x), *beta,
-		              *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_czzxpbys( *(x + ii*rs_x + jj), *beta,
-		              *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
-		              *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-BLIS_INLINE void bli_zzzxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       const dcomplex* restrict beta,
-             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
-	if ( bli_zeq0( *beta ) )
-	{
-		bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
-		return;
-	}
-
-#ifdef BLIS_ENABLE_CR_CASES
-	if ( rs_x == 1 && rs_y == 1 )
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zzzxpbys( *(x + ii + jj*cs_x), *beta,
-		              *(y + ii + jj*cs_y) );
-	}
-	else if ( cs_x == 1 && cs_y == 1 )
-	{
-		for ( dim_t ii = 0; ii < m; ++ii )
-		for ( dim_t jj = 0; jj < n; ++jj )
-		bli_zzzxpbys( *(x + ii*rs_x + jj), *beta,
-		              *(y + ii*rs_y + jj) );
-	}
-	else
-#endif
-	{
-		for ( dim_t jj = 0; jj < n; ++jj )
-		for ( dim_t ii = 0; ii < m; ++ii )
-		bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
-		              *(y + ii*rs_y + jj*cs_y) );
-	}
-}
-
-
-
-
-BLIS_INLINE void bli_sxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const float*    restrict x, const inc_t rs_x, const inc_t cs_x,
-       const float*    restrict beta,
-             float*    restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
-}
-BLIS_INLINE void bli_dxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const double*   restrict x, const inc_t rs_x, const inc_t cs_x,
-       const double*   restrict beta,
-             double*   restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
-}
-BLIS_INLINE void bli_cxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       const scomplex* restrict beta,
-             scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
-}
-BLIS_INLINE void bli_zxpbys_mxn
-     (
-       const dim_t m,
-       const dim_t n,
-       const dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       const dcomplex* restrict beta,
-             dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
-}
-#endif
-
-
-#endif
diff --git a/frame/include/level0/bli_xpbys_mxn_uplo.h b/frame/include/level0/bli_xpbys_mxn_uplo.h
deleted file mode 100644
index 1c50a8cf4..000000000
--- a/frame/include/level0/bli_xpbys_mxn_uplo.h
+++ /dev/null
@@ -1,300 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_XPBYS_MXN_UPLO_H
-#define BLIS_XPBYS_MXN_UPLO_H
-
-// xpbys_mxn_u
-
-#define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
-	if ( bli_seq0( *beta ) ) \
-	{ \
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
-		{ \
-			bli_sscopys( *(x + _i*rs_x + _j*cs_x), \
-			             *(y + _i*rs_y + _j*cs_y) ); \
-		} \
-	} \
-	else \
-	{ \
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
-		{ \
-			bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \
-			              *(beta), \
-			              *(y + _i*rs_y + _j*cs_y) ); \
-		} \
-	} \
-}
-
-#define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
-	if ( bli_deq0( *beta ) ) \
-	{ \
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
-		{ \
-			bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \
-			             *(y + _i*rs_y + _j*cs_y) ); \
-		} \
-	} \
-	else \
-	{ \
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
-		{ \
-			bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \
-			              *(beta), \
-			              *(y + _i*rs_y + _j*cs_y) ); \
-		} \
-	} \
-}
-
-#define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
-	if ( bli_ceq0( *beta ) ) \
-	{ \
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
-		{ \
-			bli_cccopys( *(x + _i*rs_x + _j*cs_x), \
-			             *(y + _i*rs_y + _j*cs_y) ); \
-		} \
-	} \
-	else \
-	{ \
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
-		{ \
-			bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \
-			              *(beta), \
-			              *(y + _i*rs_y + _j*cs_y) ); \
-		} \
-	} \
-}
-
-#define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
-	if ( bli_zeq0( *beta ) ) \
-	{ \
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
-		{ \
-			bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \
-			             *(y + _i*rs_y + _j*cs_y) ); \
-		} \
-	} \
-	else \
-	{ \
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
-		{ \
-			bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \
-			              *(beta), \
-			              *(y + _i*rs_y + _j*cs_y) ); \
-		} \
-	} \
-}
-
-// xpbys_mxn_l
-
-#define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
-	if ( bli_seq0( *beta ) ) \
-	{ \
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
-		{ \
-			bli_sscopys( *(x + _i*rs_x + _j*cs_x), \
-			             *(y + _i*rs_y + _j*cs_y) ); \
-		} \
-	} \
-	else \
-	{ \
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
-		{ \
-			bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \
-			              *(beta), \
-			              *(y + _i*rs_y + _j*cs_y) ); \
-		} \
-	} \
-}
-
-#define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
-	if ( bli_deq0( *beta ) ) \
-	{ \
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
-		{ \
-			bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \
-			             *(y + _i*rs_y + _j*cs_y) ); \
-		} \
-	} \
-	else \
-	{ \
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
-		{ \
-			bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \
-			              *(beta), \
-			              *(y + _i*rs_y + _j*cs_y) ); \
-		} \
-	} \
-}
-
-#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
-	if ( bli_ceq0( *beta ) ) \
-	{ \
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
-		{ \
-			bli_cccopys( *(x + _i*rs_x + _j*cs_x), \
-			             *(y + _i*rs_y + _j*cs_y) ); \
-		} \
-	} \
-	else \
-	{ \
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
-		{ \
-			bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \
-			              *(beta), \
-			              *(y + _i*rs_y + _j*cs_y) ); \
-		} \
-	} \
-}
-
-#define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
-	if ( bli_zeq0( *beta ) ) \
-	{ \
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
-		{ \
-			bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \
-			             *(y + _i*rs_y + _j*cs_y) ); \
-		} \
-	} \
-	else \
-	{ \
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
-		{ \
-			bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \
-			              *(beta), \
-			              *(y + _i*rs_y + _j*cs_y) ); \
-		} \
-	} \
-}
-
-
-#define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{\
-	bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
-}
-#define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{\
-	bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
-}
-#define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{\
-	bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
-}
-#define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{\
-	bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
-}
-#define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{\
-	bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
-}
-#define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{\
-	bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
-}
-#define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{\
-	bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
-}
-#define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{\
-	bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
-}
-
-#endif
diff --git a/frame/include/level0/old/bli_cast.h b/frame/include/level0/old/bli_cast.h
deleted file mode 100644
index f54b9cd96..000000000
--- a/frame/include/level0/old/bli_cast.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_CAST_H
-#define BLIS_CAST_H
-
-// cast
-
-// Notes:
-// - The first char encodes the type of *ap.
-// - The second char encodes the type of b.
-
-
-#define bli_sscast( ap, b ) \
-{ \
-	(b) = ( float  )             *(( float*    )(ap)); \
-}
-#define bli_dscast( ap, b ) \
-{ \
-	(b) = ( float  )             *(( double*   )(ap)); \
-}
-#define bli_cscast( ap, b ) \
-{ \
-	(b) = ( float  )  bli_creal( *(( scomplex* )(ap)) ); \
-}
-#define bli_zscast( ap, b ) \
-{ \
-	(b) = ( float  )  bli_zreal( *(( dcomplex* )(ap)) ); \
-}
-
-
-#define bli_sdcast( ap, b ) \
-{ \
-	(b) = ( double )             *(( float*    )(ap)); \
-}
-#define bli_ddcast( ap, b ) \
-{ \
-	(b) = ( double )             *(( double*   )(ap)); \
-}
-#define bli_cdcast( ap, b ) \
-{ \
-	(b) = ( double )  bli_creal( *(( scomplex* )(ap)) ); \
-}
-#define bli_zdcast( ap, b ) \
-{ \
-	(b) = ( double )  bli_zreal( *(( dcomplex* )(ap)) ); \
-}
-
-
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_sccast( ap, b ) \
-{ \
-	bli_scsets( bli_sreal( *(( float*    )(ap)) ), \
-	                                          0.0, (b) ); \
-}
-#define bli_dccast( ap, b ) \
-{ \
-	bli_dcsets( bli_dreal( *(( double*   )(ap)) ), \
-	                                          0.0, (b) ); \
-}
-#define bli_cccast( ap, b ) \
-{ \
-	bli_ccsets( bli_creal( *(( scomplex* )(ap)) ), \
-	            bli_cimag( *(( scomplex* )(ap)) ), (b) ); \
-}
-#define bli_zccast( ap, b ) \
-{ \
-	bli_zcsets( bli_zreal( *(( dcomplex* )(ap)) ), \
-	            bli_zimag( *(( dcomplex* )(ap)) ), (b) ); \
-}
-
-
-#define bli_szcast( ap, b ) \
-{ \
-	bli_szsets( bli_sreal( *(( float*    )(ap)) ), \
-	                                          0.0, (b) ); \
-}
-#define bli_dzcast( ap, b ) \
-{ \
-	bli_dzsets( bli_dreal( *(( double*   )(ap)) ), \
-	                                          0.0, (b) ); \
-}
-#define bli_czcast( ap, b ) \
-{ \
-	bli_czsets( bli_creal( *(( scomplex* )(ap)) ), \
-	            bli_cimag( *(( scomplex* )(ap)) ), (b) ); \
-}
-#define bli_zzcast( ap, b ) \
-{ \
-	bli_zzsets( bli_zreal( *(( dcomplex* )(ap)) ), \
-	            bli_zimag( *(( dcomplex* )(ap)) ), (b) ); \
-}
-
-
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_sccast( ap, b )  { (b) = ( scomplex ) *(( float*    )(ap)); }
-#define bli_dccast( ap, b )  { (b) = ( scomplex ) *(( double*   )(ap)); }
-#define bli_cccast( ap, b )  { (b) = ( scomplex ) *(( scomplex* )(ap)); }
-#define bli_zccast( ap, b )  { (b) = ( scomplex ) *(( dcomplex* )(ap)); }
-
-#define bli_szcast( ap, b )  { (b) = ( dcomplex ) *(( float*    )(ap)); }
-#define bli_dzcast( ap, b )  { (b) = ( dcomplex ) *(( double*   )(ap)); }
-#define bli_czcast( ap, b )  { (b) = ( dcomplex ) *(( scomplex* )(ap)); }
-#define bli_zzcast( ap, b )  { (b) = ( dcomplex ) *(( dcomplex* )(ap)); }
-
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
-
-#define bli_scast( ap, b )  bli_sscast( ap, b )
-#define bli_dcast( ap, b )  bli_ddcast( ap, b )
-#define bli_ccast( ap, b )  bli_cccast( ap, b )
-#define bli_zcast( ap, b )  bli_zzcast( ap, b )
-
-#endif
diff --git a/frame/include/level0/old/bli_castfrom.h b/frame/include/level0/old/bli_castfrom.h
deleted file mode 100644
index 52e6a98b8..000000000
--- a/frame/include/level0/old/bli_castfrom.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
diff --git a/frame/include/level0/old/bli_castto.h b/frame/include/level0/old/bli_castto.h
deleted file mode 100644
index 52e6a98b8..000000000
--- a/frame/include/level0/old/bli_castto.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
diff --git a/frame/include/level0/old/bli_copynzjs.h b/frame/include/level0/old/bli_copynzjs.h
deleted file mode 100644
index ce82ee1c7..000000000
--- a/frame/include/level0/old/bli_copynzjs.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_COPYNZJS_H
-#define BLIS_COPYNZJS_H
-
-// copynzjs
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-// - x is copied in conjugated form.
-
-#define bli_sscopynzjs( x, y ) \
-{ \
-	(y) = ( float  ) (x); \
-}
-#define bli_dscopynzjs( x, y ) \
-{ \
-	(y) = ( float  ) (x); \
-}
-#define bli_cscopynzjs( x, y ) \
-{ \
-	(y) = ( float  ) (x).real; \
-}
-#define bli_zscopynzjs( x, y ) \
-{ \
-	(y) = ( float  ) (x).real; \
-}
-
-#define bli_sdcopynzjs( x, y ) \
-{ \
-	(y) = ( double ) (x); \
-}
-#define bli_ddcopynzjs( x, y ) \
-{ \
-	(y) = ( double ) (x); \
-}
-#define bli_cdcopynzjs( x, y ) \
-{ \
-	(y) = ( double ) (x).real; \
-}
-#define bli_zdcopynzjs( x, y ) \
-{ \
-	(y) = ( double ) (x).real; \
-}
-
-#define bli_sccopynzjs( x, y ) \
-{ \
-	(y).real = ( float  ) (x); \
-	/* (y).imag = 0.0F; (SKIP COPYING OF ZERO) */ \
-}
-#define bli_dccopynzjs( x, y ) \
-{ \
-	(y).real = ( float  ) (x); \
-	/* (y).imag = 0.0F; (SKIP COPYING OF ZERO) */ \
-}
-#define bli_cccopynzjs( x, y ) \
-{ \
-	(y).real = ( float  )  (x).real; \
-	(y).imag = ( float  ) -(x).imag; \
-}
-#define bli_zccopynzjs( x, y ) \
-{ \
-	(y).real = ( float  )  (x).real; \
-	(y).imag = ( float  ) -(x).imag; \
-}
-
-#define bli_szcopynzjs( x, y ) \
-{ \
-	(y).real = ( double ) (x); \
-	/* (y).imag = 0.0; (SKIP COPYING OF ZERO) */ \
-}
-#define bli_dzcopynzjs( x, y ) \
-{ \
-	(y).real = ( double ) (x); \
-	/* (y).imag = 0.0; (SKIP COPYING OF ZERO) */ \
-}
-#define bli_czcopynzjs( x, y ) \
-{ \
-	(y).real = ( double )  (x).real; \
-	(y).imag = ( double ) -(x).imag; \
-}
-#define bli_zzcopynzjs( x, y ) \
-{ \
-	(y).real = ( double )  (x).real; \
-	(y).imag = ( double ) -(x).imag; \
-}
-
-
-#define bli_scopynzjs( x, y ) \
-{ \
-	bli_sscopynzjs( x, y ); \
-}
-#define bli_dcopynzjs( x, y ) \
-{ \
-	bli_ddcopynzjs( x, y ); \
-}
-#define bli_ccopynzjs( x, y ) \
-{ \
-	bli_cccopynzjs( x, y ); \
-}
-#define bli_zcopynzjs( x, y ) \
-{ \
-	bli_zzcopynzjs( x, y ); \
-}
-
-
-#endif
diff --git a/frame/include/level0/old/bli_copynzs.h b/frame/include/level0/old/bli_copynzs.h
deleted file mode 100644
index a8fe71a6c..000000000
--- a/frame/include/level0/old/bli_copynzs.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_COPYNZS_H
-#define BLIS_COPYNZS_H
-
-// copynzs
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-#define bli_sscopynzs( x, y ) \
-{ \
-	(y) = ( float  ) (x); \
-}
-#define bli_dscopynzs( x, y ) \
-{ \
-	(y) = ( float  ) (x); \
-}
-#define bli_cscopynzs( x, y ) \
-{ \
-	(y) = ( float  ) (x).real; \
-}
-#define bli_zscopynzs( x, y ) \
-{ \
-	(y) = ( float  ) (x).real; \
-}
-
-#define bli_sdcopynzs( x, y ) \
-{ \
-	(y) = ( double ) (x); \
-}
-#define bli_ddcopynzs( x, y ) \
-{ \
-	(y) = ( double ) (x); \
-}
-#define bli_cdcopynzs( x, y ) \
-{ \
-	(y) = ( double ) (x).real; \
-}
-#define bli_zdcopynzs( x, y ) \
-{ \
-	(y) = ( double ) (x).real; \
-}
-
-#define bli_sccopynzs( x, y ) \
-{ \
-	(y).real = ( float  ) (x); \
-	/* (y).imag = 0.0F; (SKIP COPYING OF ZERO) */ \
-}
-#define bli_dccopynzs( x, y ) \
-{ \
-	(y).real = ( float  ) (x); \
-	/* (y).imag = 0.0F (SKIP COPYING OF ZERO) */; \
-}
-#define bli_cccopynzs( x, y ) \
-{ \
-	(y).real = ( float  ) (x).real; \
-	(y).imag = ( float  ) (x).imag; \
-}
-#define bli_zccopynzs( x, y ) \
-{ \
-	(y).real = ( float  ) (x).real; \
-	(y).imag = ( float  ) (x).imag; \
-}
-
-#define bli_szcopynzs( x, y ) \
-{ \
-	(y).real = ( double ) (x); \
-	/* (y).imag = 0.0; (SKIP COPYING OF ZERO) */ \
-}
-#define bli_dzcopynzs( x, y ) \
-{ \
-	(y).real = ( double ) (x); \
-	/* (y).imag = 0.0; (SKIP COPYING OF ZERO) */ \
-}
-#define bli_czcopynzs( x, y ) \
-{ \
-	(y).real = ( double ) (x).real; \
-	(y).imag = ( double ) (x).imag; \
-}
-#define bli_zzcopynzs( x, y ) \
-{ \
-	(y).real = ( double ) (x).real; \
-	(y).imag = ( double ) (x).imag; \
-}
-
-
-#define bli_scopynzs( x, y ) \
-{ \
-	bli_sscopynzs( x, y ); \
-}
-#define bli_dcopynzs( x, y ) \
-{ \
-	bli_ddcopynzs( x, y ); \
-}
-#define bli_ccopynzs( x, y ) \
-{ \
-	bli_cccopynzs( x, y ); \
-}
-#define bli_zcopynzs( x, y ) \
-{ \
-	bli_zzcopynzs( x, y ); \
-}
-
-
-#endif
diff --git a/frame/include/level0/old/bli_invscalcjs.h b/frame/include/level0/old/bli_invscalcjs.h
deleted file mode 100644
index 983d7622c..000000000
--- a/frame/include/level0/old/bli_invscalcjs.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_INVSCALCJS_H
-#define BLIS_INVSCALCJS_H
-
-// invscalcjs
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-
-#define bli_ssinvscalcjs( conj, a, x ) \
-{ \
-	(x) /= ( float  ) (a); \
-}
-#define bli_dsinvscalcjs( conj, a, x ) \
-{ \
-	(x) /= ( float  ) (a); \
-}
-#define bli_csinvscalcjs( conj, a, x ) \
-{ \
-	(x) /= ( float  ) (a).real; \
-}
-#define bli_zsinvscalcjs( conj, a, x ) \
-{ \
-	(x) /= ( float  ) (a).real; \
-}
-
-#define bli_sdinvscalcjs( conj, a, x ) \
-{ \
-	(x) /= ( double ) (a); \
-}
-#define bli_ddinvscalcjs( conj, a, x ) \
-{ \
-	(x) /= ( double ) (a); \
-}
-#define bli_cdinvscalcjs( conj, a, x ) \
-{ \
-	(x) /= ( double ) (a).real; \
-}
-#define bli_zdinvscalcjs( conj, a, x ) \
-{ \
-	(x) /= ( double ) (a).real; \
-}
-
-#define bli_scinvscalcjs( conj, a, x ) \
-{ \
-	(x).real /= ( float  ) (a); \
-	(x).imag /= ( float  ) (a); \
-}
-#define bli_dcinvscalcjs( conj, a, x ) \
-{ \
-	(x).real /= ( float  ) (a); \
-	(x).imag /= ( float  ) (a); \
-}
-#define bli_ccinvscalcjs( conj, a, x ) \
-{ \
-	float  aimag = ( bli_is_conj( conj ) ? ( float  ) -(a).imag : \
-	                                       ( float  )  (a).imag ); \
-	float  temp =              ( float  ) (a).real * (a).real + ( float  ) aimag * (a).imag; \
-	float  xr   = ( float  ) ( ( float  ) (a).real * (x).real + ( float  ) aimag * (x).imag ) / temp; \
-	float  xi   = ( float  ) ( ( float  ) (a).real * (x).imag - ( float  ) aimag * (x).real ) / temp; \
-	(x).real    = xr; \
-	(x).imag    = xi; \
-}
-#define bli_zcinvscalcjs( conj, a, x ) \
-{ \
-	float  aimag = ( bli_is_conj( conj ) ? ( float  ) -(a).imag : \
-	                                       ( float  )  (a).imag ); \
-	float  temp =              ( float  ) (a).real * (a).real + ( float  ) aimag * (a).imag; \
-	float  xr   = ( float  ) ( ( float  ) (a).real * (x).real + ( float  ) aimag * (x).imag ) / temp; \
-	float  xi   = ( float  ) ( ( float  ) (a).real * (x).imag - ( float  ) aimag * (x).real ) / temp; \
-	(x).real    = xr; \
-	(x).imag    = xi; \
-}
-
-#define bli_szinvscalcjs( conj, a, x ) \
-{ \
-	(x).real /= ( double ) (a); \
-	(x).imag /= ( double ) (a); \
-}
-#define bli_dzinvscalcjs( conj, a, x ) \
-{ \
-	(x).real /= ( double ) (a); \
-	(x).imag /= ( double ) (a); \
-}
-#define bli_czinvscalcjs( conj, a, x ) \
-{ \
-	double aimag = ( bli_is_conj( conj ) ? ( double ) -(a).imag : \
-	                                       ( double )  (a).imag ); \
-	double temp =              ( double ) (a).real * (a).real + ( double ) aimag * (a).imag; \
-	double xr   = ( double ) ( ( double ) (a).real * (x).real + ( double ) aimag * (x).imag ) / temp; \
-	double xi   = ( double ) ( ( double ) (a).real * (x).imag - ( double ) aimag * (x).real ) / temp; \
-	(x).real    = xr; \
-	(x).imag    = xi; \
-}
-#define bli_zzinvscalcjs( conj, a, x ) \
-{ \
-	double aimag = ( bli_is_conj( conj ) ? ( double ) -(a).imag : \
-	                                       ( double )  (a).imag ); \
-	double temp =              ( double ) (a).real * (a).real + ( double ) aimag * (a).imag; \
-	double xr   = ( double ) ( ( double ) (a).real * (x).real + ( double ) aimag * (x).imag ) / temp; \
-	double xi   = ( double ) ( ( double ) (a).real * (x).imag - ( double ) aimag * (x).real ) / temp; \
-	(x).real    = xr; \
-	(x).imag    = xi; \
-}
-
-
-#define bli_sinvscalcjs( conj, a, x ) \
-{ \
-	bli_ssinvscalcjs( conj, a, x ); \
-}
-#define bli_dinvscalcjs( conj, a, x ) \
-{ \
-	bli_ddinvscalcjs( conj, a, x ); \
-}
-#define bli_cinvscalcjs( conj, a, x ) \
-{ \
-	bli_ccinvscalcjs( conj, a, x ); \
-}
-#define bli_zinvscalcjs( conj, a, x ) \
-{ \
-	bli_zzinvscalcjs( conj, a, x ); \
-}
-
-
-#endif
diff --git a/frame/include/level0/old/bli_scalcjs.h b/frame/include/level0/old/bli_scalcjs.h
deleted file mode 100644
index 3072ca7e2..000000000
--- a/frame/include/level0/old/bli_scalcjs.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCALCJS_H
-#define BLIS_SCALCJS_H
-
-// scalcjs
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - a is (conditionally) used in conjugated form.
-
-#define bli_ssscalcjs( conj, a, x ) \
-{ \
-	(x) *= ( float  ) (a); \
-}
-#define bli_dsscalcjs( conj, a, x ) \
-{ \
-	(x) *= ( float  ) (a); \
-}
-#define bli_csscalcjs( conj, a, x ) \
-{ \
-	(x) *= ( float  ) (a).real; \
-}
-#define bli_zsscalcjs( conj, a, x ) \
-{ \
-	(x) *= ( float  ) (a).real; \
-}
-
-#define bli_sdscalcjs( conj, a, x ) \
-{ \
-	(x) *= ( double ) (a); \
-}
-#define bli_ddscalcjs( conj, a, x ) \
-{ \
-	(x) *= ( double ) (a); \
-}
-#define bli_cdscalcjs( conj, a, x ) \
-{ \
-	(x) *= ( double ) (a).real; \
-}
-#define bli_zdscalcjs( conj, a, x ) \
-{ \
-	(x) *= ( double ) (a).real; \
-}
-
-#define bli_scscalcjs( conj, a, x ) \
-{ \
-	(x).real *= ( float  ) (a); \
-	(x).imag *= ( float  ) (a); \
-}
-#define bli_dcscalcjs( conj, a, x ) \
-{ \
-	(x).real *= ( float  ) (a); \
-	(x).imag *= ( float  ) (a); \
-}
-#define bli_ccscalcjs( conj, a, x ) \
-{ \
-	float  aimag = ( bli_is_conj( conj ) ? ( float  ) -(a).imag : \
-	                                       ( float  )  (a).imag ); \
-	float  tempr = ( float  ) (a).real * (x).real - ( float  ) aimag * (x).imag; \
-	float  tempi = ( float  ) (a).real * (x).imag + ( float  ) aimag * (x).real; \
-	(x).real = tempr; \
-	(x).imag = tempi; \
-}
-#define bli_zcscalcjs( conj, a, x ) \
-{ \
-	float  aimag = ( bli_is_conj( conj ) ? ( float  ) -(a).imag : \
-	                                       ( float  )  (a).imag ); \
-	float  tempr = ( float  ) (a).real * (x).real - ( float  ) aimag * (x).imag; \
-	float  tempi = ( float  ) (a).real * (x).imag + ( float  ) aimag * (x).real; \
-	(x).real = tempr; \
-	(x).imag = tempi; \
-}
-
-#define bli_szscalcjs( conj, a, x ) \
-{ \
-	(x).real *= ( double ) (a); \
-	(x).imag *= ( double ) (a); \
-}
-#define bli_dzscalcjs( conj, a, x ) \
-{ \
-	(x).real *= ( double ) (a); \
-	(x).imag *= ( double ) (a); \
-}
-#define bli_czscalcjs( conj, a, x ) \
-{ \
-	double aimag = ( bli_is_conj( conj ) ? ( double ) -(a).imag : \
-	                                       ( double )  (a).imag ); \
-	double tempr = ( double ) (a).real * (x).real - ( double ) aimag * (x).imag; \
-	double tempi = ( double ) (a).real * (x).imag + ( double ) aimag * (x).real; \
-	(x).real = tempr; \
-	(x).imag = tempi; \
-}
-#define bli_zzscalcjs( conj, a, x ) \
-{ \
-	double aimag = ( bli_is_conj( conj ) ? ( double ) -(a).imag : \
-	                                       ( double )  (a).imag ); \
-	double tempr = ( double ) (a).real * (x).real - ( double ) aimag * (x).imag; \
-	double tempi = ( double ) (a).real * (x).imag + ( double ) aimag * (x).real; \
-	(x).real = tempr; \
-	(x).imag = tempi; \
-}
-
-
-#define bli_sscalcjs( conj, a, x ) \
-{ \
-	bli_ssscalcjs( conj, a, x ); \
-}
-#define bli_dscalcjs( conj, a, x ) \
-{ \
-	bli_ddscalcjs( conj, a, x ); \
-}
-#define bli_cscalcjs( conj, a, x ) \
-{ \
-	bli_ccscalcjs( conj, a, x ); \
-}
-#define bli_zscalcjs( conj, a, x ) \
-{ \
-	bli_zzscalcjs( conj, a, x ); \
-}
-
-
-#endif
diff --git a/frame/include/level0/old/bli_set0ris_mxn.h b/frame/include/level0/old/bli_set0ris_mxn.h
deleted file mode 100644
index 212ef4742..000000000
--- a/frame/include/level0/old/bli_set0ris_mxn.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SET0RIS_MXN_H
-#define BLIS_SET0RIS_MXN_H
-
-// set0ris_mxn
-
-#define bli_sset0ris_mxn( m, n, ar, ai, rs_a, cs_a ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	for ( _i = 0; _i < m; ++_i ) \
-	bli_sset0ris( *(ar + _i*rs_a + _j*cs_a), \
-	              *(ai + _i*rs_a + _j*cs_a) ); \
-}
-
-#define bli_dset0ris_mxn( m, n, ar, ai, rs_a, cs_a ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	for ( _i = 0; _i < m; ++_i ) \
-	bli_dset0ris( *(ar + _i*rs_a + _j*cs_a), \
-	              *(ai + _i*rs_a + _j*cs_a) ); \
-}
-
-#define bli_cset0ris_mxn( m, n, ar, ai, rs_a, cs_a ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	for ( _i = 0; _i < m; ++_i ) \
-	bli_cset0ris( *(ar + _i*rs_a + _j*cs_a), \
-	              *(ai + _i*rs_a + _j*cs_a) ); \
-}
-
-#define bli_zset0ris_mxn( m, n, ar, ai, rs_a, cs_a ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	for ( _i = 0; _i < m; ++_i ) \
-	bli_zset0ris( *(ar + _i*rs_a + _j*cs_a), \
-	              *(ai + _i*rs_a + _j*cs_a) ); \
-}
-
-
-#endif
diff --git a/frame/include/level0/old/io/bli_scal2ios.h b/frame/include/level0/old/io/bli_scal2ios.h
deleted file mode 100644
index 6b2415171..000000000
--- a/frame/include/level0/old/io/bli_scal2ios.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyiight
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyiight
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2IOS_H
-#define BLIS_SCAL2IOS_H
-
-// scal2ios
-
-#define bli_cscal2ios( a, x, yi ) \
-{ \
-	(yi) = bli_cimag(a) * bli_creal(x) + bli_creal(a) * bli_cimag(x); \
-}
-
-#define bli_zscal2ios( a, x, yi ) \
-{ \
-	(yi) = bli_zimag(a) * bli_zreal(x) + bli_zreal(a) * bli_zimag(x); \
-}
-
-#define bli_scscal2ios( a, x, yi ) \
-{ \
-	(yi) = bli_creal(a) * bli_cimag(x); \
-}
-
-#define bli_dzscal2ios( a, x, yi ) \
-{ \
-	(yi) = bli_zreal(a) * bli_zimag(x); \
-}
-
-#endif
-
diff --git a/frame/include/level0/old/io/bli_scal2jios.h b/frame/include/level0/old/io/bli_scal2jios.h
deleted file mode 100644
index 941b6044c..000000000
--- a/frame/include/level0/old/io/bli_scal2jios.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyiight
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyiight
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2JIOS_H
-#define BLIS_SCAL2JIOS_H
-
-// scal2jios
-
-#define bli_cscal2jios( a, x, yi ) \
-{ \
-	(yi) = bli_cimag(a) * bli_creal(x) - bli_creal(a) * bli_cimag(x); \
-}
-
-#define bli_zscal2jios( a, x, yi ) \
-{ \
-	(yi) = bli_zimag(a) * bli_zreal(x) - bli_zreal(a) * bli_zimag(x); \
-}
-
-
-#endif
-
diff --git a/frame/include/level0/old/ri3/bli_copyjri3s.h b/frame/include/level0/old/ri3/bli_copyjri3s.h
deleted file mode 100644
index 6be9e3619..000000000
--- a/frame/include/level0/old/ri3/bli_copyjri3s.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_COPYJRI3S_H
-#define BLIS_COPYJRI3S_H
-
-// copyjri3s
-
-#define bli_scopyjri3s( ar, ai, br, bi, bri )  bli_scopyri3s( (ar), -(ai), (br), (bi), (bri) )
-#define bli_dcopyjri3s( ar, ai, br, bi, bri )  bli_dcopyri3s( (ar), -(ai), (br), (bi), (bri) )
-#define bli_ccopyjri3s( ar, ai, br, bi, bri )  bli_ccopyri3s( (ar), -(ai), (br), (bi), (bri) )
-#define bli_zcopyjri3s( ar, ai, br, bi, bri )  bli_zcopyri3s( (ar), -(ai), (br), (bi), (bri) )
-
-#endif
-
diff --git a/frame/include/level0/old/ri3/bli_scal2ri3s_mxn.h b/frame/include/level0/old/ri3/bli_scal2ri3s_mxn.h
deleted file mode 100644
index 2316f0738..000000000
--- a/frame/include/level0/old/ri3/bli_scal2ri3s_mxn.h
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2RI3S_MXN_H
-#define BLIS_SCAL2RI3S_MXN_H
-
-// scal2ri3s_mxn
-
-BLIS_INLINE void bli_cscal2ri3s_mxn
-     (
-       const conj_t       conjx,
-       const dim_t        m,
-       const dim_t        n,
-       scomplex* restrict alpha,
-       scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y
-     )
-{
-	float*  restrict alpha_r = ( float*  )alpha; \
-	float*  restrict alpha_i = ( float*  )alpha + 1; \
-	float*  restrict x_r     = ( float*  )x; \
-	float*  restrict x_i     = ( float*  )x + 1; \
-	float*  restrict y_r     = ( float*  )y; \
-	float*  restrict y_i     = ( float*  )y +   is_y; \
-	float*  restrict y_rpi   = ( float*  )y + 2*is_y; \
-	const dim_t      incx2   = 2*rs_x; \
-	const dim_t      ldx2    = 2*cs_x; \
-
-	/* Treat the micro-panel as panel_dim x panel_len and column-stored
-	   (unit row stride). */ \
-
-	if ( bli_is_conj( conjx ) )
-	{
-		for ( dim_t j = 0; j < n; ++j )
-		for ( dim_t i = 0; i < m; ++i )
-		{
-			float*  restrict chi11_r   = x_r   + (i  )*incx2 + (j  )*ldx2;
-			float*  restrict chi11_i   = x_i   + (i  )*incx2 + (j  )*ldx2;
-			float*  restrict psi11_r   = y_r   + (i  )*1     + (j  )*cs_y;
-			float*  restrict psi11_i   = y_i   + (i  )*1     + (j  )*cs_y;
-			float*  restrict psi11_rpi = y_rpi + (i  )*1     + (j  )*cs_y;
-
-			bli_cscal2jri3s
-			(
-			  *alpha_r,
-			  *alpha_i,
-			  *chi11_r,
-			  *chi11_i,
-			  *psi11_r,
-			  *psi11_i,
-			  *psi11_rpi
-			);
-		}
-	}
-	else /* if ( bli_is_noconj( conjx ) ) */
-	{
-		for ( dim_t j = 0; j < n; ++j )
-		for ( dim_t i = 0; i < m; ++i )
-		{
-			float*  restrict chi11_r   = x_r   + (i  )*incx2 + (j  )*ldx2;
-			float*  restrict chi11_i   = x_i   + (i  )*incx2 + (j  )*ldx2;
-			float*  restrict psi11_r   = y_r   + (i  )*1     + (j  )*cs_y;
-			float*  restrict psi11_i   = y_i   + (i  )*1     + (j  )*cs_y;
-			float*  restrict psi11_rpi = y_rpi + (i  )*1     + (j  )*cs_y;
-
-			bli_cscal2ri3s
-			(
-			  *alpha_r,
-			  *alpha_i,
-			  *chi11_r,
-			  *chi11_i,
-			  *psi11_r,
-			  *psi11_i,
-			  *psi11_rpi
-			);
-		}
-	}
-}
-
-BLIS_INLINE void bli_zscal2ri3s_mxn
-     (
-       const conj_t       conjx,
-       const dim_t        m,
-       const dim_t        n,
-       dcomplex* restrict alpha,
-       dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y
-     )
-{
-	double* restrict alpha_r = ( double* )alpha; \
-	double* restrict alpha_i = ( double* )alpha + 1; \
-	double* restrict x_r     = ( double* )x; \
-	double* restrict x_i     = ( double* )x + 1; \
-	double* restrict y_r     = ( double* )y; \
-	double* restrict y_i     = ( double* )y +   is_y; \
-	double* restrict y_rpi   = ( double* )y + 2*is_y; \
-	const dim_t      incx2   = 2*rs_x; \
-	const dim_t      ldx2    = 2*cs_x; \
-
-	/* Treat the micro-panel as panel_dim x panel_len and column-stored
-	   (unit row stride). */ \
-
-	if ( bli_is_conj( conjx ) )
-	{
-		for ( dim_t j = 0; j < n; ++j )
-		for ( dim_t i = 0; i < m; ++i )
-		{
-			double* restrict chi11_r   = x_r   + (i  )*incx2 + (j  )*ldx2;
-			double* restrict chi11_i   = x_i   + (i  )*incx2 + (j  )*ldx2;
-			double* restrict psi11_r   = y_r   + (i  )*1     + (j  )*cs_y;
-			double* restrict psi11_i   = y_i   + (i  )*1     + (j  )*cs_y;
-			double* restrict psi11_rpi = y_rpi + (i  )*1     + (j  )*cs_y;
-
-			bli_zscal2jri3s
-			(
-			  *alpha_r,
-			  *alpha_i,
-			  *chi11_r,
-			  *chi11_i,
-			  *psi11_r,
-			  *psi11_i,
-			  *psi11_rpi
-			);
-		}
-	}
-	else /* if ( bli_is_noconj( conjx ) ) */
-	{
-		for ( dim_t j = 0; j < n; ++j )
-		for ( dim_t i = 0; i < m; ++i )
-		{
-			double* restrict chi11_r   = x_r   + (i  )*incx2 + (j  )*ldx2;
-			double* restrict chi11_i   = x_i   + (i  )*incx2 + (j  )*ldx2;
-			double* restrict psi11_r   = y_r   + (i  )*1     + (j  )*cs_y;
-			double* restrict psi11_i   = y_i   + (i  )*1     + (j  )*cs_y;
-			double* restrict psi11_rpi = y_rpi + (i  )*1     + (j  )*cs_y;
-
-			bli_zscal2ri3s
-			(
-			  *alpha_r,
-			  *alpha_i,
-			  *chi11_r,
-			  *chi11_i,
-			  *psi11_r,
-			  *psi11_i,
-			  *psi11_rpi
-			);
-		}
-	}
-}
-
-
-#endif
diff --git a/frame/include/level0/old/rih/bli_scal2rihs_mxn.h b/frame/include/level0/old/rih/bli_scal2rihs_mxn.h
deleted file mode 100644
index ca117b85d..000000000
--- a/frame/include/level0/old/rih/bli_scal2rihs_mxn.h
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2RIHS_MXN_H
-#define BLIS_SCAL2RIHS_MXN_H
-
-// scal2rihs_mxn
-
-BLIS_INLINE void bli_cscal2rihs_mxn
-     (
-       const pack_t       schema,
-       const conj_t       conjx,
-       const dim_t        m,
-       const dim_t        n,
-       scomplex* restrict alpha,
-       scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       scomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	scomplex* restrict x_r =            x;
-	float*    restrict y_r = ( float*  )y;
-
-	if ( bli_is_ro_packed( schema ) )
-	{
-		if ( bli_is_conj( conjx ) )
-		{
-			for ( dim_t j = 0; j < n; ++j )
-			for ( dim_t i = 0; i < m; ++i )
-			{
-				scomplex* restrict chi11   = x_r + (i  )*rs_x + (j  )*cs_x;
-				float*    restrict psi11_r = y_r + (i  )*rs_y + (j  )*cs_y;
-
-				bli_cscal2jros
-				(
-				  *alpha,
-				  *chi11,
-				  *psi11_r 
-				);
-			}
-		}
-		else /* if ( bli_is_noconj( conjx ) ) */
-		{
-			for ( dim_t j = 0; j < n; ++j )
-			for ( dim_t i = 0; i < m; ++i )
-			{
-				scomplex* restrict chi11   = x_r + (i  )*rs_x + (j  )*cs_x;
-				float*    restrict psi11_r = y_r + (i  )*rs_y + (j  )*cs_y;
-
-				bli_cscal2ros
-				(
-				  *alpha,
-				  *chi11,
-				  *psi11_r 
-				);
-			}
-		}
-	}
-	else if ( bli_is_io_packed( schema ) )
-	{
-		if ( bli_is_conj( conjx ) )
-		{
-			for ( dim_t j = 0; j < n; ++j )
-			for ( dim_t i = 0; i < m; ++i )
-			{
-				scomplex* restrict chi11   = x_r + (i  )*rs_x + (j  )*cs_x;
-				float*    restrict psi11_r = y_r + (i  )*rs_y + (j  )*cs_y;
-
-				bli_cscal2jios
-				(
-				  *alpha,
-				  *chi11,
-				  *psi11_r 
-				);
-			}
-		}
-		else /* if ( bli_is_noconj( conjx ) ) */
-		{
-			for ( dim_t j = 0; j < n; ++j )
-			for ( dim_t i = 0; i < m; ++i )
-			{
-				scomplex* restrict chi11   = x_r + (i  )*rs_x + (j  )*cs_x;
-				float*    restrict psi11_r = y_r + (i  )*rs_y + (j  )*cs_y;
-
-				bli_cscal2ios
-				(
-				  *alpha,
-				  *chi11,
-				  *psi11_r 
-				);
-			}
-		}
-	}
-	else /* if ( bli_is_rpi_packed( schema ) ) */
-	{
-		if ( bli_is_conj( conjx ) )
-		{
-			for ( dim_t j = 0; j < n; ++j )
-			for ( dim_t i = 0; i < m; ++i )
-			{
-				scomplex* restrict chi11   = x_r + (i  )*rs_x + (j  )*cs_x;
-				float*    restrict psi11_r = y_r + (i  )*rs_y + (j  )*cs_y;
-
-				bli_cscal2jrpis
-				(
-				  *alpha,
-				  *chi11,
-				  *psi11_r 
-				);
-			}
-		}
-		else /* if ( bli_is_noconj( conjx ) ) */
-		{
-			for ( dim_t j = 0; j < n; ++j )
-			for ( dim_t i = 0; i < m; ++i )
-			{
-				scomplex* restrict chi11   = x_r + (i  )*rs_x + (j  )*cs_x;
-				float*    restrict psi11_r = y_r + (i  )*rs_y + (j  )*cs_y;
-
-				bli_cscal2rpis
-				(
-				  *alpha,
-				  *chi11,
-				  *psi11_r 
-				);
-			}
-		}
-	}
-}
-
-BLIS_INLINE void bli_zscal2rihs_mxn
-     (
-       const pack_t       schema,
-       const conj_t       conjx,
-       const dim_t        m,
-       const dim_t        n,
-       dcomplex* restrict alpha,
-       dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y
-     )
-{
-	dcomplex* restrict x_r =            x;
-	double*   restrict y_r = ( double* )y;
-
-	if ( bli_is_ro_packed( schema ) )
-	{
-		if ( bli_is_conj( conjx ) )
-		{
-			for ( dim_t j = 0; j < n; ++j )
-			for ( dim_t i = 0; i < m; ++i )
-			{
-				dcomplex* restrict chi11   = x_r + (i  )*rs_x + (j  )*cs_x;
-				double*   restrict psi11_r = y_r + (i  )*rs_y + (j  )*cs_y;
-
-				bli_zscal2jros
-				(
-				  *alpha,
-				  *chi11,
-				  *psi11_r 
-				);
-			}
-		}
-		else /* if ( bli_is_noconj( conjx ) ) */
-		{
-			for ( dim_t j = 0; j < n; ++j )
-			for ( dim_t i = 0; i < m; ++i )
-			{
-				dcomplex* restrict chi11   = x_r + (i  )*rs_x + (j  )*cs_x;
-				double*   restrict psi11_r = y_r + (i  )*rs_y + (j  )*cs_y;
-
-				bli_zscal2ros
-				(
-				  *alpha,
-				  *chi11,
-				  *psi11_r 
-				);
-			}
-		}
-	}
-	else if ( bli_is_io_packed( schema ) )
-	{
-		if ( bli_is_conj( conjx ) )
-		{
-			for ( dim_t j = 0; j < n; ++j )
-			for ( dim_t i = 0; i < m; ++i )
-			{
-				dcomplex* restrict chi11   = x_r + (i  )*rs_x + (j  )*cs_x;
-				double*   restrict psi11_r = y_r + (i  )*rs_y + (j  )*cs_y;
-
-				bli_zscal2jios
-				(
-				  *alpha,
-				  *chi11,
-				  *psi11_r 
-				);
-			}
-		}
-		else /* if ( bli_is_noconj( conjx ) ) */
-		{
-			for ( dim_t j = 0; j < n; ++j )
-			for ( dim_t i = 0; i < m; ++i )
-			{
-				dcomplex* restrict chi11   = x_r + (i  )*rs_x + (j  )*cs_x;
-				double*   restrict psi11_r = y_r + (i  )*rs_y + (j  )*cs_y;
-
-				bli_zscal2ios
-				(
-				  *alpha,
-				  *chi11,
-				  *psi11_r 
-				);
-			}
-		}
-	}
-	else /* if ( bli_is_rpi_packed( schema ) ) */
-	{
-		if ( bli_is_conj( conjx ) )
-		{
-			for ( dim_t j = 0; j < n; ++j )
-			for ( dim_t i = 0; i < m; ++i )
-			{
-				dcomplex* restrict chi11   = x_r + (i  )*rs_x + (j  )*cs_x;
-				double*   restrict psi11_r = y_r + (i  )*rs_y + (j  )*cs_y;
-
-				bli_zscal2jrpis
-				(
-				  *alpha,
-				  *chi11,
-				  *psi11_r 
-				);
-			}
-		}
-		else /* if ( bli_is_noconj( conjx ) ) */
-		{
-			for ( dim_t j = 0; j < n; ++j )
-			for ( dim_t i = 0; i < m; ++i )
-			{
-				dcomplex* restrict chi11   = x_r + (i  )*rs_x + (j  )*cs_x;
-				double*   restrict psi11_r = y_r + (i  )*rs_y + (j  )*cs_y;
-
-				bli_zscal2rpis
-				(
-				  *alpha,
-				  *chi11,
-				  *psi11_r 
-				);
-			}
-		}
-	}
-}
-
-
-#endif
diff --git a/frame/include/level0/old/rih/bli_scal2rihs_mxn_diag.h b/frame/include/level0/old/rih/bli_scal2rihs_mxn_diag.h
deleted file mode 100644
index 79897755e..000000000
--- a/frame/include/level0/old/rih/bli_scal2rihs_mxn_diag.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2RIHS_MXN_DIAG_H
-#define BLIS_SCAL2RIHS_MXN_DIAG_H
-
-// scal2rihs_mxn_diag
-
-#define bli_cscscal2rihs_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \
-{ \
-	dim_t min_m_n = bli_min( m, n ); \
-	dim_t _i; \
-\
-	/* Handle ro, io, and rpi separately. */ \
-	if ( bli_is_ro_packed( schema ) ) \
-	{ \
-		for ( _i = 0; _i < min_m_n; ++_i ) \
-		{ \
-			bli_scscal2ros( *(x   + _i*rs_x + _i*cs_x), \
-			                *(a), \
-			                *(y_r + _i*rs_y + _i*cs_y) ); \
-		} \
-	} \
-	else if ( bli_is_io_packed( schema ) ) \
-	{ \
-		for ( _i = 0; _i < min_m_n; ++_i ) \
-		{ \
-			bli_scscal2ios( *(x   + _i*rs_x + _i*cs_x), \
-			                *(a), \
-			                *(y_r + _i*rs_y + _i*cs_y) ); \
-		} \
-	} \
-	else /* if ( bli_is_rpi_packed( schema ) ) */ \
-	{ \
-		for ( _i = 0; _i < min_m_n; ++_i ) \
-		{ \
-			bli_scscal2rpis( *(x   + _i*rs_x + _i*cs_x), \
-			                 *(a), \
-			                 *(y_r + _i*rs_y + _i*cs_y) ); \
-		} \
-	} \
-}
-
-#define bli_zdzscal2rihs_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \
-{ \
-	dim_t min_m_n = bli_min( m, n ); \
-	dim_t _i; \
-\
-	/* Handle ro, io, and rpi separately. */ \
-	if ( bli_is_ro_packed( schema ) ) \
-	{ \
-		for ( _i = 0; _i < min_m_n; ++_i ) \
-		{ \
-			bli_dzscal2ros( *(x   + _i*rs_x + _i*cs_x), \
-			                *(a), \
-			                *(y_r + _i*rs_y + _i*cs_y) ); \
-		} \
-	} \
-	else if ( bli_is_io_packed( schema ) ) \
-	{ \
-		for ( _i = 0; _i < min_m_n; ++_i ) \
-		{ \
-			bli_dzscal2ios( *(x   + _i*rs_x + _i*cs_x), \
-			                *(a), \
-			                *(y_r + _i*rs_y + _i*cs_y) ); \
-		} \
-	} \
-	else /* if ( bli_is_rpi_packed( schema ) ) */ \
-	{ \
-		for ( _i = 0; _i < min_m_n; ++_i ) \
-		{ \
-			bli_dzscal2rpis( *(x   + _i*rs_x + _i*cs_x), \
-			                 *(a), \
-			                 *(y_r + _i*rs_y + _i*cs_y) ); \
-		} \
-	} \
-}
-
-#endif
diff --git a/frame/include/level0/old/rih/bli_scal2rihs_mxn_uplo.h b/frame/include/level0/old/rih/bli_scal2rihs_mxn_uplo.h
deleted file mode 100644
index 6c26fadd4..000000000
--- a/frame/include/level0/old/rih/bli_scal2rihs_mxn_uplo.h
+++ /dev/null
@@ -1,348 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2RIHS_MXN_UPLO_H
-#define BLIS_SCAL2RIHS_MXN_UPLO_H
-
-// scal2rihs_mxn_uplo
-
-#define bli_cscal2rihs_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	/* Handle ro, io, and rpi separately. */ \
-	if ( bli_is_ro_packed( schema ) ) \
-	{ \
-		if ( bli_is_lower( uplo ) ) \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = _j; _i < m; ++_i ) \
-				{ \
-					bli_cscal2jros( *(a), \
-					                *(x   + _i*rs_x + _j*cs_x), \
-					                *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = _j; _i < m; ++_i ) \
-				{ \
-					bli_cscal2ros( *(a), \
-					               *(x   + _i*rs_x + _j*cs_x), \
-					               *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_upper( uplo ) ) */ \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = 0; _i < _j + 1; ++_i ) \
-				{ \
-					bli_cscal2jros( *(a), \
-					                *(x   + _i*rs_x + _j*cs_x), \
-					                *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = 0; _i < _j + 1; ++_i ) \
-				{ \
-					bli_cscal2ros( *(a), \
-					               *(x   + _i*rs_x + _j*cs_x), \
-					               *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-		} \
-	} \
-	else if ( bli_is_io_packed( schema ) ) \
-	{ \
-		if ( bli_is_lower( uplo ) ) \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = _j; _i < m; ++_i ) \
-				{ \
-					bli_cscal2jios( *(a), \
-					                *(x   + _i*rs_x + _j*cs_x), \
-					                *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = _j; _i < m; ++_i ) \
-				{ \
-					bli_cscal2ios( *(a), \
-					               *(x   + _i*rs_x + _j*cs_x), \
-					               *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_upper( uplo ) ) */ \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = 0; _i < _j + 1; ++_i ) \
-				{ \
-					bli_cscal2jios( *(a), \
-					                *(x   + _i*rs_x + _j*cs_x), \
-					                *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = 0; _i < _j + 1; ++_i ) \
-				{ \
-					bli_cscal2ios( *(a), \
-					               *(x   + _i*rs_x + _j*cs_x), \
-					               *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( bli_is_rpi_packed( schema ) ) */ \
-	{ \
-		if ( bli_is_lower( uplo ) ) \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = _j; _i < m; ++_i ) \
-				{ \
-					bli_cscal2jrpis( *(a), \
-					                 *(x   + _i*rs_x + _j*cs_x), \
-					                 *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = _j; _i < m; ++_i ) \
-				{ \
-					bli_cscal2rpis( *(a), \
-					                *(x   + _i*rs_x + _j*cs_x), \
-					                *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_upper( uplo ) ) */ \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = 0; _i < _j + 1; ++_i ) \
-				{ \
-					bli_cscal2jrpis( *(a), \
-					                 *(x   + _i*rs_x + _j*cs_x), \
-					                 *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = 0; _i < _j + 1; ++_i ) \
-				{ \
-					bli_cscal2rpis( *(a), \
-					                *(x   + _i*rs_x + _j*cs_x), \
-					                *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-#define bli_zscal2rihs_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	/* Handle ro, io, and rpi separately. */ \
-	if ( bli_is_ro_packed( schema ) ) \
-	{ \
-		if ( bli_is_lower( uplo ) ) \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = _j; _i < m; ++_i ) \
-				{ \
-					bli_zscal2jros( *(a), \
-					                *(x   + _i*rs_x + _j*cs_x), \
-					                *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = _j; _i < m; ++_i ) \
-				{ \
-					bli_zscal2ros( *(a), \
-					               *(x   + _i*rs_x + _j*cs_x), \
-					               *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_upper( uplo ) ) */ \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = 0; _i < _j + 1; ++_i ) \
-				{ \
-					bli_zscal2jros( *(a), \
-					                *(x   + _i*rs_x + _j*cs_x), \
-					                *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = 0; _i < _j + 1; ++_i ) \
-				{ \
-					bli_zscal2ros( *(a), \
-					               *(x   + _i*rs_x + _j*cs_x), \
-					               *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-		} \
-	} \
-	else if ( bli_is_io_packed( schema ) ) \
-	{ \
-		if ( bli_is_lower( uplo ) ) \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = _j; _i < m; ++_i ) \
-				{ \
-					bli_zscal2jios( *(a), \
-					                *(x   + _i*rs_x + _j*cs_x), \
-					                *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = _j; _i < m; ++_i ) \
-				{ \
-					bli_zscal2ios( *(a), \
-					               *(x   + _i*rs_x + _j*cs_x), \
-					               *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_upper( uplo ) ) */ \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = 0; _i < _j + 1; ++_i ) \
-				{ \
-					bli_zscal2jios( *(a), \
-					                *(x   + _i*rs_x + _j*cs_x), \
-					                *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = 0; _i < _j + 1; ++_i ) \
-				{ \
-					bli_zscal2ios( *(a), \
-					               *(x   + _i*rs_x + _j*cs_x), \
-					               *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-		} \
-	} \
-	else /* if ( bli_is_rpi_packed( schema ) ) */ \
-	{ \
-		if ( bli_is_lower( uplo ) ) \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = _j; _i < m; ++_i ) \
-				{ \
-					bli_zscal2jrpis( *(a), \
-					                 *(x   + _i*rs_x + _j*cs_x), \
-					                 *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = _j; _i < m; ++_i ) \
-				{ \
-					bli_zscal2rpis( *(a), \
-					                *(x   + _i*rs_x + _j*cs_x), \
-					                *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-		} \
-		else /* if ( bli_is_upper( uplo ) ) */ \
-		{ \
-			if ( bli_is_conj( conjx ) ) \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = 0; _i < _j + 1; ++_i ) \
-				{ \
-					bli_zscal2jrpis( *(a), \
-					                 *(x   + _i*rs_x + _j*cs_x), \
-					                 *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-			else /* if ( bli_is_noconj( conjx ) ) */ \
-			{ \
-				for ( _j = 0; _j < m; ++_j ) \
-				for ( _i = 0; _i < _j + 1; ++_i ) \
-				{ \
-					bli_zscal2rpis( *(a), \
-					                *(x   + _i*rs_x + _j*cs_x), \
-					                *(y_r + _i*rs_y + _j*cs_y) ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-#endif
diff --git a/frame/include/level0/old/rih/bli_setrihs_mxn_diag.h b/frame/include/level0/old/rih/bli_setrihs_mxn_diag.h
deleted file mode 100644
index 33584deb8..000000000
--- a/frame/include/level0/old/rih/bli_setrihs_mxn_diag.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SETRIHS_MXN_DIAG_H
-#define BLIS_SETRIHS_MXN_DIAG_H
-
-// setrihs_mxn_diag
-
-#define bli_csetrihs_mxn_diag( schema, m, n, a, y_r, rs_y, cs_y ) \
-{ \
-	const float  a_r     = bli_zreal( *a ); \
-	const float  a_i     = bli_zimag( *a ); \
-	dim_t        min_m_n = bli_min( m, n ); \
-	dim_t        _i; \
-\
-	/* Handle ro, io, and rpi separately. */ \
-	if ( bli_is_ro_packed( schema ) ) \
-	{ \
-		for ( _i = 0; _i < min_m_n; ++_i ) \
-		{ \
-			bli_scopys(  (a_r), \
-			            *(y_r + _i*rs_y + _i*cs_y) ); \
-		} \
-	} \
-	else if ( bli_is_io_packed( schema ) ) \
-	{ \
-		for ( _i = 0; _i < min_m_n; ++_i ) \
-		{ \
-			bli_scopys(  (a_i), \
-			            *(y_r + _i*rs_y + _i*cs_y) ); \
-		} \
-	} \
-	else /* if ( bli_is_rpi_packed( schema ) ) */ \
-	{ \
-		for ( _i = 0; _i < min_m_n; ++_i ) \
-		{ \
-			bli_sadd3s(  (a_r), \
-			             (a_i), \
-			            *(y_r + _i*rs_y + _i*cs_y) ); \
-		} \
-	} \
-}
-
-#define bli_zsetrihs_mxn_diag( schema, m, n, a, y_r, rs_y, cs_y ) \
-{ \
-	const double a_r     = bli_zreal( *a ); \
-	const double a_i     = bli_zimag( *a ); \
-	dim_t        min_m_n = bli_min( m, n ); \
-	dim_t        _i; \
-\
-	/* Handle ro, io, and rpi separately. */ \
-	if ( bli_is_ro_packed( schema ) ) \
-	{ \
-		for ( _i = 0; _i < min_m_n; ++_i ) \
-		{ \
-			bli_dcopys(  (a_r), \
-			            *(y_r + _i*rs_y + _i*cs_y) ); \
-		} \
-	} \
-	else if ( bli_is_io_packed( schema ) ) \
-	{ \
-		for ( _i = 0; _i < min_m_n; ++_i ) \
-		{ \
-			bli_dcopys(  (a_i), \
-			            *(y_r + _i*rs_y + _i*cs_y) ); \
-		} \
-	} \
-	else /* if ( bli_is_rpi_packed( schema ) ) */ \
-	{ \
-		for ( _i = 0; _i < min_m_n; ++_i ) \
-		{ \
-			bli_dadd3s(  (a_r), \
-			             (a_i), \
-			            *(y_r + _i*rs_y + _i*cs_y) ); \
-		} \
-	} \
-}
-
-#endif
diff --git a/frame/include/level0/old/ro/bli_scal2jros.h b/frame/include/level0/old/ro/bli_scal2jros.h
deleted file mode 100644
index be7b43fb0..000000000
--- a/frame/include/level0/old/ro/bli_scal2jros.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2JROS_H
-#define BLIS_SCAL2JROS_H
-
-// scal2jros
-
-#define bli_cscal2jros( a, x, yr ) \
-{ \
-	(yr) = bli_creal(a) * bli_creal(x) + bli_cimag(a) * bli_cimag(x); \
-}
-
-#define bli_zscal2jros( a, x, yr ) \
-{ \
-	(yr) = bli_zreal(a) * bli_zreal(x) + bli_zimag(a) * bli_zimag(x); \
-}
-
-#endif
-
diff --git a/frame/include/level0/old/ro/bli_scal2ros.h b/frame/include/level0/old/ro/bli_scal2ros.h
deleted file mode 100644
index 5f68de5ab..000000000
--- a/frame/include/level0/old/ro/bli_scal2ros.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2ROS_H
-#define BLIS_SCAL2ROS_H
-
-// scal2ros
-
-#define bli_cscal2ros( a, x, yr ) \
-{ \
-	(yr) = bli_creal(a) * bli_creal(x) - bli_cimag(a) * bli_cimag(x); \
-}
-
-#define bli_zscal2ros( a, x, yr ) \
-{ \
-	(yr) = bli_zreal(a) * bli_zreal(x) - bli_zimag(a) * bli_zimag(x); \
-}
-
-#define bli_scscal2ros( a, x, yr ) \
-{ \
-	(yr) = bli_creal(a) * bli_creal(x); \
-}
-
-#define bli_dzscal2ros( a, x, yr ) \
-{ \
-	(yr) = bli_zreal(a) * bli_zreal(x); \
-}
-
-
-#endif
-
diff --git a/frame/include/level0/old/rpi/bli_scal2jrpis.h b/frame/include/level0/old/rpi/bli_scal2jrpis.h
deleted file mode 100644
index 718baa425..000000000
--- a/frame/include/level0/old/rpi/bli_scal2jrpis.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyrpiight
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyrpiight
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2JRPIS_H
-#define BLIS_SCAL2JRPIS_H
-
-// scal2jrpis
-
-#define bli_cscal2jrpis( a, x, yrpi ) \
-{ \
-	(yrpi) = (bli_creal(a)+bli_cimag(a)) * bli_creal(x) + \
-	         (bli_cimag(a)-bli_creal(a)) * bli_cimag(x); \
-}
-
-#define bli_zscal2jrpis( a, x, yrpi ) \
-{ \
-	(yrpi) = (bli_zreal(a)+bli_zimag(a)) * bli_zreal(x) + \
-	         (bli_zimag(a)-bli_zreal(a)) * bli_zimag(x); \
-}
-
-#endif
-
diff --git a/frame/include/level0/old/rpi/bli_scal2rpis.h b/frame/include/level0/old/rpi/bli_scal2rpis.h
deleted file mode 100644
index 159cdc335..000000000
--- a/frame/include/level0/old/rpi/bli_scal2rpis.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2RPIS_H
-#define BLIS_SCAL2RPIS_H
-
-// scal2rpis
-
-#define bli_cscal2rpis( a, x, yrpi ) \
-{ \
-	(yrpi) = (bli_creal(a)+bli_cimag(a)) * bli_creal(x) + \
-	         (bli_creal(a)-bli_cimag(a)) * bli_cimag(x); \
-}
-
-#define bli_zscal2rpis( a, x, yrpi ) \
-{ \
-	(yrpi) = (bli_zreal(a)+bli_zimag(a)) * bli_zreal(x) + \
-	         (bli_zreal(a)-bli_zimag(a)) * bli_zimag(x); \
-}
-
-#define bli_scscal2rpis( a, x, yrpi ) \
-{ \
-	(yrpi) = bli_creal(a) * bli_creal(x) + \
-	         bli_creal(a) * bli_cimag(x); \
-}
-
-#define bli_dzscal2rpis( a, x, yrpi ) \
-{ \
-	(yrpi) = bli_zreal(a) * bli_zreal(x) + \
-	         bli_zreal(a) * bli_zimag(x); \
-}
-
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_absq2ris.h b/frame/include/level0/ri/bli_absq2ris.h
deleted file mode 100644
index 6698a51a1..000000000
--- a/frame/include/level0/ri/bli_absq2ris.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_ABSQ2RIS_H
-#define BLIS_ABSQ2RIS_H
-
-// absq2ris
-
-#define bli_sabsq2ris( ar, ai, br, bi ) \
-{ \
-	(br) = (ar) * (ar); \
-}
-
-#define bli_dabsq2ris( ar, ai, br, bi ) \
-{ \
-	(br) = (ar) * (ar); \
-}
-
-#define bli_cabsq2ris( ar, ai, br, bi ) \
-{ \
-	(br) = (ar) * (ar) + (ai) * (ai); \
-	(bi) = 0.0F; \
-}
-
-#define bli_zabsq2ris( ar, ai, br, bi ) \
-{ \
-	(br) = (ar) * (ar) + (ai) * (ai); \
-	(bi) = 0.0; \
-}
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_add3ris.h b/frame/include/level0/ri/bli_add3ris.h
deleted file mode 100644
index 8c686568c..000000000
--- a/frame/include/level0/ri/bli_add3ris.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_ADD3RIS_H
-#define BLIS_ADD3RIS_H
-
-// add3ris
-
-#define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \
-{ \
-	(cr) = (ar) + (br); \
-}
-
-#define bli_dadd3ris( ar, ai, br, bi, cr, ci ) \
-{ \
-	(cr) = (ar) + (br); \
-}
-
-#define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \
-{ \
-	(cr) = (ar) + (br); \
-	(ci) = (ai) + (bi); \
-}
-
-#define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \
-{ \
-	(cr) = (ar) + (br); \
-	(ci) = (ai) + (bi); \
-}
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_addjris.h b/frame/include/level0/ri/bli_addjris.h
deleted file mode 100644
index df1802744..000000000
--- a/frame/include/level0/ri/bli_addjris.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_ADDJRIS_H
-#define BLIS_ADDJRIS_H
-
-// addjris
-
-#define bli_saddjris( ar, ai, xr, xi )  bli_saddris( (ar), -(ai), (xr), (xi) )
-#define bli_daddjris( ar, ai, xr, xi )  bli_daddris( (ar), -(ai), (xr), (xi) )
-#define bli_caddjris( ar, ai, xr, xi )  bli_caddris( (ar), -(ai), (xr), (xi) )
-#define bli_zaddjris( ar, ai, xr, xi )  bli_zaddris( (ar), -(ai), (xr), (xi) )
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_addris.h b/frame/include/level0/ri/bli_addris.h
deleted file mode 100644
index 2efadd36f..000000000
--- a/frame/include/level0/ri/bli_addris.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_ADDRIS_H
-#define BLIS_ADDRIS_H
-
-// addris
-
-#define bli_saddris( ar, ai, xr, xi ) \
-{ \
-	(xr) = (xr) + (ar); \
-}
-
-#define bli_daddris( ar, ai, xr, xi ) \
-{ \
-	(xr) = (xr) + (ar); \
-}
-
-#define bli_caddris( ar, ai, xr, xi ) \
-{ \
-	(xr) = (xr) + (ar); \
-	(xi) = (xi) + (ai); \
-}
-
-#define bli_zaddris( ar, ai, xr, xi ) \
-{ \
-	(xr) = (xr) + (ar); \
-	(xi) = (xi) + (ai); \
-}
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_axmyris.h b/frame/include/level0/ri/bli_axmyris.h
deleted file mode 100644
index f8d899237..000000000
--- a/frame/include/level0/ri/bli_axmyris.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_AXMYRIS_H
-#define BLIS_AXMYRIS_H
-
-// axmyris
-
-#define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) -= (ar) * (xr); \
-}
-
-#define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) -= (ar) * (xr); \
-}
-
-#define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) -= (ar) * (xr) - (ai) * (xi); \
-	(yi) -= (ai) * (xr) + (ar) * (xi); \
-}
-
-#define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) -= (ar) * (xr) - (ai) * (xi); \
-	(yi) -= (ai) * (xr) + (ar) * (xi); \
-}
-
-#define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) -= (ar) * (xr); \
-	(yi) -= (ar) * (xi); \
-}
-
-#define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) -= (ar) * (xr); \
-	(yi) -= (ar) * (xi); \
-}
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_axpbyjris.h b/frame/include/level0/ri/bli_axpbyjris.h
deleted file mode 100644
index 8dc2a5597..000000000
--- a/frame/include/level0/ri/bli_axpbyjris.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_AXPBYJRIS_H
-#define BLIS_AXPBYJRIS_H
-
-// axpbyjris
-
-#define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \
-{ \
-    (yr) = (ar) * (xr) + (br) * (yr); \
-}
-
-#define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \
-{ \
-    const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \
-    const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \
-    (yr) = yt_r; \
-    (yi) = yt_i; \
-}
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of b.
-// - The fourth char encodes the type of y.
-
-// -- (axby) = (??ss) ----------------------------------------------------------
-
-#define bli_ssssxpbyjris  bli_rxxpbyjris
-#define bli_dsssxpbyjris  bli_rxxpbyjris
-#define bli_csssxpbyjris  bli_rxxpbyjris
-#define bli_zsssxpbyjris  bli_rxxpbyjris
-
-#define bli_sdssxpbyjris  bli_rxxpbyjris
-#define bli_ddssxpbyjris  bli_rxxpbyjris
-#define bli_cdssxpbyjris  bli_rxxpbyjris
-#define bli_zdssxpbyjris  bli_rxxpbyjris
-
-#define bli_scssxpbyjris  bli_rxxpbyjris
-#define bli_dcssxpbyjris  bli_rxxpbyjris
-#define bli_ccssxpbyjris  bli_rxxpbyjris
-#define bli_zcssxpbyjris  bli_rxxpbyjris
-
-#define bli_szssxpbyjris  bli_rxxpbyjris
-#define bli_dzssxpbyjris  bli_rxxpbyjris
-#define bli_czssxpbyjris  bli_rxxpbyjris
-#define bli_zzssxpbyjris  bli_rxxpbyjris
-
-// NOTE: This series needs to be finished for all other char values for (by), but
-// not until something in BLIS actually needs mixed-datatype axpbyjris.
-
-
-#define bli_saxpbyjris    bli_ssssaxpbyjris
-#define bli_daxpbyjris    bli_ddddaxpbyjris
-#define bli_caxpbyjris    bli_ccccaxpbyjris
-#define bli_zaxpbyjris    bli_zzzzaxpbyjris
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_axpbyris.h b/frame/include/level0/ri/bli_axpbyris.h
deleted file mode 100644
index 1344749e0..000000000
--- a/frame/include/level0/ri/bli_axpbyris.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_AXPBYRIS_H
-#define BLIS_AXPBYRIS_H
-
-// axpbyris
-
-#define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \
-{ \
-    (yr) = (ar) * (xr) + (br) * (yr); \
-}
-
-#define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \
-{ \
-    const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \
-    const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \
-    (yr) = yt_r; \
-    (yi) = yt_i; \
-}
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of b.
-// - The fourth char encodes the type of y.
-
-// -- (axby) = (??ss) ----------------------------------------------------------
-
-#define bli_ssssxpbyris  bli_rxxpbyris
-#define bli_dsssxpbyris  bli_rxxpbyris
-#define bli_csssxpbyris  bli_rxxpbyris
-#define bli_zsssxpbyris  bli_rxxpbyris
-
-#define bli_sdssxpbyris  bli_rxxpbyris
-#define bli_ddssxpbyris  bli_rxxpbyris
-#define bli_cdssxpbyris  bli_rxxpbyris
-#define bli_zdssxpbyris  bli_rxxpbyris
-
-#define bli_scssxpbyris  bli_rxxpbyris
-#define bli_dcssxpbyris  bli_rxxpbyris
-#define bli_ccssxpbyris  bli_rxxpbyris
-#define bli_zcssxpbyris  bli_rxxpbyris
-
-#define bli_szssxpbyris  bli_rxxpbyris
-#define bli_dzssxpbyris  bli_rxxpbyris
-#define bli_czssxpbyris  bli_rxxpbyris
-#define bli_zzssxpbyris  bli_rxxpbyris
-
-// NOTE: This series needs to be finished for all other char values for (by), but
-// not until something in BLIS actually needs mixed-datatype axpbyris.
-
-
-#define bli_saxpbyris    bli_ssssaxpbyris
-#define bli_daxpbyris    bli_ddddaxpbyris
-#define bli_caxpbyris    bli_ccccaxpbyris
-#define bli_zaxpbyris    bli_zzzzaxpbyris
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_axpyjris.h b/frame/include/level0/ri/bli_axpyjris.h
deleted file mode 100644
index 4525591b4..000000000
--- a/frame/include/level0/ri/bli_axpyjris.h
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_AXPYJRIS_H
-#define BLIS_AXPYJRIS_H
-
-// axpyjris
-
-#define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) += (ar) * (xr); \
-}
-
-#define bli_cxaxpyjris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) += (ar) * (xr) + (ai) * (xi); \
-	(yi) += (ai) * (xr) - (ar) * (xi); \
-}
-
-#define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) += (ar) * (xr) + (ai) * (xi); \
-}
-
-#define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) += (ar) *  (xr); \
-	(yi) += (ar) * -(xi); \
-}
-
-#define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) += (ar) * (xr); \
-	(yi) += (ai) * (xr); \
-}
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of y.
-
-// -- (axy) = (??s) ------------------------------------------------------------
-
-#define bli_sssaxpyjris  bli_rxaxpyjris
-#define bli_dssaxpyjris  bli_rxaxpyjris
-#define bli_cssaxpyjris  bli_rxaxpyjris
-#define bli_zssaxpyjris  bli_rxaxpyjris
-
-#define bli_sdsaxpyjris  bli_rxaxpyjris
-#define bli_ddsaxpyjris  bli_rxaxpyjris
-#define bli_cdsaxpyjris  bli_rxaxpyjris
-#define bli_zdsaxpyjris  bli_rxaxpyjris
-
-#define bli_scsaxpyjris  bli_rxaxpyjris
-#define bli_dcsaxpyjris  bli_rxaxpyjris
-#define bli_ccsaxpyjris  bli_roaxpyjris
-#define bli_zcsaxpyjris  bli_roaxpyjris
-
-#define bli_szsaxpyjris  bli_rxaxpyjris
-#define bli_dzsaxpyjris  bli_rxaxpyjris
-#define bli_czsaxpyjris  bli_roaxpyjris
-#define bli_zzsaxpyjris  bli_roaxpyjris
-
-// -- (axy) = (??d) ------------------------------------------------------------
-
-#define bli_ssdaxpyjris  bli_rxaxpyjris
-#define bli_dsdaxpyjris  bli_rxaxpyjris
-#define bli_csdaxpyjris  bli_rxaxpyjris
-#define bli_zsdaxpyjris  bli_rxaxpyjris
-
-#define bli_sddaxpyjris  bli_rxaxpyjris
-#define bli_dddaxpyjris  bli_rxaxpyjris
-#define bli_cddaxpyjris  bli_rxaxpyjris
-#define bli_zddaxpyjris  bli_rxaxpyjris
-
-#define bli_scdaxpyjris  bli_rxaxpyjris
-#define bli_dcdaxpyjris  bli_rxaxpyjris
-#define bli_ccdaxpyjris  bli_roaxpyjris
-#define bli_zcdaxpyjris  bli_roaxpyjris
-
-#define bli_szdaxpyjris  bli_rxaxpyjris
-#define bli_dzdaxpyjris  bli_rxaxpyjris
-#define bli_czdaxpyjris  bli_roaxpyjris
-#define bli_zzdaxpyjris  bli_roaxpyjris
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscaxpyjris  bli_rxaxpyjris
-#define bli_dscaxpyjris  bli_rxaxpyjris
-#define bli_cscaxpyjris  bli_rcaxpyjris
-#define bli_zscaxpyjris  bli_rcaxpyjris
-
-#define bli_sdcaxpyjris  bli_rxaxpyjris
-#define bli_ddcaxpyjris  bli_rxaxpyjris
-#define bli_cdcaxpyjris  bli_rcaxpyjris
-#define bli_zdcaxpyjris  bli_rcaxpyjris
-
-#define bli_sccaxpyjris  bli_craxpyjris
-#define bli_dccaxpyjris  bli_craxpyjris
-#define bli_cccaxpyjris  bli_cxaxpyjris
-#define bli_zccaxpyjris  bli_cxaxpyjris
-
-#define bli_szcaxpyjris  bli_craxpyjris
-#define bli_dzcaxpyjris  bli_craxpyjris
-#define bli_czcaxpyjris  bli_cxaxpyjris
-#define bli_zzcaxpyjris  bli_cxaxpyjris
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszaxpyjris  bli_rxaxpyjris
-#define bli_dszaxpyjris  bli_rxaxpyjris
-#define bli_cszaxpyjris  bli_rcaxpyjris
-#define bli_zszaxpyjris  bli_rcaxpyjris
-
-#define bli_sdzaxpyjris  bli_rxaxpyjris
-#define bli_ddzaxpyjris  bli_rxaxpyjris
-#define bli_cdzaxpyjris  bli_rcaxpyjris
-#define bli_zdzaxpyjris  bli_rcaxpyjris
-
-#define bli_sczaxpyjris  bli_craxpyjris
-#define bli_dczaxpyjris  bli_craxpyjris
-#define bli_cczaxpyjris  bli_cxaxpyjris
-#define bli_zczaxpyjris  bli_cxaxpyjris
-
-#define bli_szzaxpyjris  bli_craxpyjris
-#define bli_dzzaxpyjris  bli_craxpyjris
-#define bli_czzaxpyjris  bli_cxaxpyjris
-#define bli_zzzaxpyjris  bli_cxaxpyjris
-
-
-
-#define bli_saxpyjris    bli_sssaxpyjris
-#define bli_daxpyjris    bli_dddaxpyjris
-#define bli_caxpyjris    bli_cccaxpyjris
-#define bli_zaxpyjris    bli_zzzaxpyjris
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_axpyris.h b/frame/include/level0/ri/bli_axpyris.h
deleted file mode 100644
index 515e5e790..000000000
--- a/frame/include/level0/ri/bli_axpyris.h
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_AXPYRIS_H
-#define BLIS_AXPYRIS_H
-
-// axpyris
-
-#define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) += (ar) * (xr); \
-}
-
-#define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) += (ar) * (xr) - (ai) * (xi); \
-	(yi) += (ai) * (xr) + (ar) * (xi); \
-}
-
-#define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) += (ar) * (xr) - (ai) * (xi); \
-}
-
-#define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) += (ar) * (xr); \
-	(yi) += (ar) * (xi); \
-}
-
-#define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) += (ar) * (xr); \
-	(yi) += (ai) * (xr); \
-}
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of y.
-
-// -- (axy) = (??s) ------------------------------------------------------------
-
-#define bli_sssaxpyris  bli_rxaxpyris
-#define bli_dssaxpyris  bli_rxaxpyris
-#define bli_cssaxpyris  bli_rxaxpyris
-#define bli_zssaxpyris  bli_rxaxpyris
-
-#define bli_sdsaxpyris  bli_rxaxpyris
-#define bli_ddsaxpyris  bli_rxaxpyris
-#define bli_cdsaxpyris  bli_rxaxpyris
-#define bli_zdsaxpyris  bli_rxaxpyris
-
-#define bli_scsaxpyris  bli_rxaxpyris
-#define bli_dcsaxpyris  bli_rxaxpyris
-#define bli_ccsaxpyris  bli_roaxpyris
-#define bli_zcsaxpyris  bli_roaxpyris
-
-#define bli_szsaxpyris  bli_rxaxpyris
-#define bli_dzsaxpyris  bli_rxaxpyris
-#define bli_czsaxpyris  bli_roaxpyris
-#define bli_zzsaxpyris  bli_roaxpyris
-
-// -- (axy) = (??d) ------------------------------------------------------------
-
-#define bli_ssdaxpyris  bli_rxaxpyris
-#define bli_dsdaxpyris  bli_rxaxpyris
-#define bli_csdaxpyris  bli_rxaxpyris
-#define bli_zsdaxpyris  bli_rxaxpyris
-
-#define bli_sddaxpyris  bli_rxaxpyris
-#define bli_dddaxpyris  bli_rxaxpyris
-#define bli_cddaxpyris  bli_rxaxpyris
-#define bli_zddaxpyris  bli_rxaxpyris
-
-#define bli_scdaxpyris  bli_rxaxpyris
-#define bli_dcdaxpyris  bli_rxaxpyris
-#define bli_ccdaxpyris  bli_roaxpyris
-#define bli_zcdaxpyris  bli_roaxpyris
-
-#define bli_szdaxpyris  bli_rxaxpyris
-#define bli_dzdaxpyris  bli_rxaxpyris
-#define bli_czdaxpyris  bli_roaxpyris
-#define bli_zzdaxpyris  bli_roaxpyris
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscaxpyris  bli_rxaxpyris
-#define bli_dscaxpyris  bli_rxaxpyris
-#define bli_cscaxpyris  bli_rcaxpyris
-#define bli_zscaxpyris  bli_rcaxpyris
-
-#define bli_sdcaxpyris  bli_rxaxpyris
-#define bli_ddcaxpyris  bli_rxaxpyris
-#define bli_cdcaxpyris  bli_rcaxpyris
-#define bli_zdcaxpyris  bli_rcaxpyris
-
-#define bli_sccaxpyris  bli_craxpyris
-#define bli_dccaxpyris  bli_craxpyris
-#define bli_cccaxpyris  bli_cxaxpyris
-#define bli_zccaxpyris  bli_cxaxpyris
-
-#define bli_szcaxpyris  bli_craxpyris
-#define bli_dzcaxpyris  bli_craxpyris
-#define bli_czcaxpyris  bli_cxaxpyris
-#define bli_zzcaxpyris  bli_cxaxpyris
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszaxpyris  bli_rxaxpyris
-#define bli_dszaxpyris  bli_rxaxpyris
-#define bli_cszaxpyris  bli_rcaxpyris
-#define bli_zszaxpyris  bli_rcaxpyris
-
-#define bli_sdzaxpyris  bli_rxaxpyris
-#define bli_ddzaxpyris  bli_rxaxpyris
-#define bli_cdzaxpyris  bli_rcaxpyris
-#define bli_zdzaxpyris  bli_rcaxpyris
-
-#define bli_sczaxpyris  bli_craxpyris
-#define bli_dczaxpyris  bli_craxpyris
-#define bli_cczaxpyris  bli_cxaxpyris
-#define bli_zczaxpyris  bli_cxaxpyris
-
-#define bli_szzaxpyris  bli_craxpyris
-#define bli_dzzaxpyris  bli_craxpyris
-#define bli_czzaxpyris  bli_cxaxpyris
-#define bli_zzzaxpyris  bli_cxaxpyris
-
-
-
-#define bli_saxpyris    bli_sssaxpyris
-#define bli_daxpyris    bli_dddaxpyris
-#define bli_caxpyris    bli_cccaxpyris
-#define bli_zaxpyris    bli_zzzaxpyris
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_conjris.h b/frame/include/level0/ri/bli_conjris.h
deleted file mode 100644
index c4a917011..000000000
--- a/frame/include/level0/ri/bli_conjris.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_CONJRIS_H
-#define BLIS_CONJRIS_H
-
-// conjris
-
-#define bli_sconjris( xr, xi ) \
-{ \
-	; \
-}
-
-#define bli_dconjris( xr, xi ) \
-{ \
-	; \
-}
-
-#define bli_cconjris( xr, xi ) \
-{ \
-	(xi) = -(xi); \
-}
-
-#define bli_zconjris( xr, xi ) \
-{ \
-	(xi) = -(xi); \
-}
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_copycjris.h b/frame/include/level0/ri/bli_copycjris.h
deleted file mode 100644
index c83232370..000000000
--- a/frame/include/level0/ri/bli_copycjris.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_COPYCJRIS_H
-#define BLIS_COPYCJRIS_H
-
-// copycjris
-
-#define bli_scopycjris( conj, xr, xi, yr, yi ) \
-{ \
-	bli_scopyris( (xr), (xi), (yr), (yi) ); \
-}
-
-#define bli_dcopycjris( conj, xr, xi, yr, yi ) \
-{ \
-	bli_dcopyris( (xr), (xi), (yr), (yi) ); \
-}
-
-#define bli_ccopycjris( conj, xr, xi, yr, yi ) \
-{ \
-	(yr) =                          (xr); \
-	(yi) = ( bli_is_conj( conj ) ? -(xi) \
-	                             :  (xi) ); \
-}
-
-#define bli_zcopycjris( conj, xr, xi, yr, yi ) \
-{ \
-	(yr) =                          (xr); \
-	(yi) = ( bli_is_conj( conj ) ? -(xi) \
-	                             :  (xi) ); \
-}
-
-#define bli_icopycjris( conj, xr, xi, yr, yi ) \
-{ \
-	bli_icopyris( (xr), (xi), (yr), (yi) ); \
-}
-
-#endif
diff --git a/frame/include/level0/ri/bli_copyjris.h b/frame/include/level0/ri/bli_copyjris.h
deleted file mode 100644
index 86fd70542..000000000
--- a/frame/include/level0/ri/bli_copyjris.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_COPYJRIS_H
-#define BLIS_COPYJRIS_H
-
-// copyjris
-
-#define bli_scopyjris( ar, ai, br, bi )  bli_scopyris( (ar), -(ai), (br), (bi) )
-#define bli_dcopyjris( ar, ai, br, bi )  bli_dcopyris( (ar), -(ai), (br), (bi) )
-#define bli_ccopyjris( ar, ai, br, bi )  bli_ccopyris( (ar), -(ai), (br), (bi) )
-#define bli_zcopyjris( ar, ai, br, bi )  bli_zcopyris( (ar), -(ai), (br), (bi) )
-
-#define bli_sscopyjris( ar, ai, br, bi )  bli_scopyjris( ar, 0.0F, br, bi )
-#define bli_dscopyjris( ar, ai, br, bi )  bli_scopyjris( ar, 0.0,  br, bi )
-#define bli_cscopyjris( ar, ai, br, bi )  bli_scopyjris( ar, ai,   br, bi )
-#define bli_zscopyjris( ar, ai, br, bi )  bli_scopyjris( ar, ai,   br, bi )
-
-#define bli_sdcopyjris( ar, ai, br, bi )  bli_dcopyjris( ar, 0.0F, br, bi )
-#define bli_ddcopyjris( ar, ai, br, bi )  bli_dcopyjris( ar, 0.0,  br, bi )
-#define bli_cdcopyjris( ar, ai, br, bi )  bli_dcopyjris( ar, ai,   br, bi )
-#define bli_zdcopyjris( ar, ai, br, bi )  bli_dcopyjris( ar, ai,   br, bi )
-
-#define bli_sccopyjris( ar, ai, br, bi )  bli_ccopyjris( ar, 0.0F, br, bi )
-#define bli_dccopyjris( ar, ai, br, bi )  bli_ccopyjris( ar, 0.0,  br, bi )
-#define bli_cccopyjris( ar, ai, br, bi )  bli_ccopyjris( ar, ai,   br, bi )
-#define bli_zccopyjris( ar, ai, br, bi )  bli_ccopyjris( ar, ai,   br, bi )
-
-#define bli_szcopyjris( ar, ai, br, bi )  bli_zcopyjris( ar, 0.0F, br, bi )
-#define bli_dzcopyjris( ar, ai, br, bi )  bli_zcopyjris( ar, 0.0,  br, bi )
-#define bli_czcopyjris( ar, ai, br, bi )  bli_zcopyjris( ar, ai,   br, bi )
-#define bli_zzcopyjris( ar, ai, br, bi )  bli_zcopyjris( ar, ai,   br, bi )
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_copyris.h b/frame/include/level0/ri/bli_copyris.h
deleted file mode 100644
index cd971587d..000000000
--- a/frame/include/level0/ri/bli_copyris.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_COPYRIS_H
-#define BLIS_COPYRIS_H
-
-// copyris
-
-#define bli_scopyris( ar, ai, br, bi ) \
-{ \
-	(br) = (ar); \
-    ( void )ai; ( void )bi; \
-}
-
-#define bli_dcopyris( ar, ai, br, bi ) \
-{ \
-	(br) = (ar); \
-    ( void )ai; ( void )bi; \
-}
-
-#define bli_ccopyris( ar, ai, br, bi ) \
-{ \
-	(br) = (ar); \
-	(bi) = (ai); \
-}
-
-#define bli_zcopyris( ar, ai, br, bi ) \
-{ \
-	(br) = (ar); \
-	(bi) = (ai); \
-}
-
-#define bli_sscopyris( ar, ai, br, bi )  { bli_scopyris( ar, 0.0F, br, bi ); ( void )ai; }
-#define bli_dscopyris( ar, ai, br, bi )  { bli_scopyris( ar, 0.0,  br, bi ); ( void )ai; }
-#define bli_cscopyris( ar, ai, br, bi )  bli_scopyris( ar, ai,   br, bi )
-#define bli_zscopyris( ar, ai, br, bi )  bli_scopyris( ar, ai,   br, bi )
-
-#define bli_sdcopyris( ar, ai, br, bi )  { bli_dcopyris( ar, 0.0F, br, bi ); ( void )ai; }
-#define bli_ddcopyris( ar, ai, br, bi )  { bli_dcopyris( ar, 0.0,  br, bi ); ( void )ai; }
-#define bli_cdcopyris( ar, ai, br, bi )  bli_dcopyris( ar, ai,   br, bi )
-#define bli_zdcopyris( ar, ai, br, bi )  bli_dcopyris( ar, ai,   br, bi )
-
-#define bli_sccopyris( ar, ai, br, bi )  { bli_ccopyris( ar, 0.0F, br, bi ); ( void )ai; }
-#define bli_dccopyris( ar, ai, br, bi )  { bli_ccopyris( ar, 0.0,  br, bi ); ( void )ai; }
-#define bli_cccopyris( ar, ai, br, bi )  bli_ccopyris( ar, ai,   br, bi )
-#define bli_zccopyris( ar, ai, br, bi )  bli_ccopyris( ar, ai,   br, bi )
-
-#define bli_szcopyris( ar, ai, br, bi )  { bli_zcopyris( ar, 0.0F, br, bi ); ( void )ai; }
-#define bli_dzcopyris( ar, ai, br, bi )  { bli_zcopyris( ar, 0.0,  br, bi ); ( void )ai; }
-#define bli_czcopyris( ar, ai, br, bi )  bli_zcopyris( ar, ai,   br, bi )
-#define bli_zzcopyris( ar, ai, br, bi )  bli_zcopyris( ar, ai,   br, bi )
-
-#endif
diff --git a/frame/include/level0/ri/bli_eqris.h b/frame/include/level0/ri/bli_eqris.h
deleted file mode 100644
index 4f8458316..000000000
--- a/frame/include/level0/ri/bli_eqris.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_EQRIS_H
-#define BLIS_EQRIS_H
-
-
-// eqris (passed by value)
-
-#define bli_seqris( ar, ai, br, bi )  ( (ar) == (br) )
-#define bli_deqris( ar, ai, br, bi )  ( (ar) == (br) )
-#define bli_ceqris( ar, ai, br, bi )  ( (ar) == (br) && (ai) == (bi) )
-#define bli_zeqris( ar, ai, br, bi )  ( (ar) == (br) && (ai) == (bi) )
-#define bli_ieqris( ar, ai, br, bi )  ( (ar) == (br) )
-
-
-// eq1ris
-
-#define bli_seq1ris( ar, ai )  bli_seqris( (ar), (ai), 1.0F, 0.0F )
-#define bli_deq1ris( ar, ai )  bli_deqris( (ar), (ai), 1.0,  0.0  )
-#define bli_ceq1ris( ar, ai )  bli_ceqris( (ar), (ai), 1.0F, 0.0F )
-#define bli_zeq1ris( ar, ai )  bli_zeqris( (ar), (ai), 1.0,  0.0  )
-#define bli_ieq1ris( ar, ai )  bli_ieqris( (ar), (ai), 1,    0    )
-
-
-// eq0ris
-
-#define bli_seq0ris( ar, ai )  bli_seqris( (ar), (ai), 0.0F, 0.0F )
-#define bli_deq0ris( ar, ai )  bli_deqris( (ar), (ai), 0.0,  0.0  )
-#define bli_ceq0ris( ar, ai )  bli_ceqris( (ar), (ai), 0.0F, 0.0F )
-#define bli_zeq0ris( ar, ai )  bli_zeqris( (ar), (ai), 0.0,  0.0  )
-#define bli_ieq0ris( ar, ai )  bli_ieqris( (ar), (ai), 0,    0    )
-
-
-// eqm1ris
-
-#define bli_seqm1ris( ar, ai )  bli_seqris( (ar), (ai), -1.0F, 0.0F )
-#define bli_deqm1ris( ar, ai )  bli_deqris( (ar), (ai), -1.0,  0.0  )
-#define bli_ceqm1ris( ar, ai )  bli_ceqris( (ar), (ai), -1.0F, 0.0F )
-#define bli_zeqm1ris( ar, ai )  bli_zeqris( (ar), (ai), -1.0,  0.0  )
-#define bli_ieqm1ris( ar, ai )  bli_ieqris( (ar), (ai), -1,    0    )
-
-
-
-#endif
diff --git a/frame/include/level0/ri/bli_invertris.h b/frame/include/level0/ri/bli_invertris.h
deleted file mode 100644
index aacf40ee3..000000000
--- a/frame/include/level0/ri/bli_invertris.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_INVERTRIS_H
-#define BLIS_INVERTRIS_H
-
-// invertris
-
-#define bli_sinvertris( xr, xi ) \
-{ \
-	(xr) = 1.0F / (xr); \
-}
-
-#define bli_dinvertris( xr, xi ) \
-{ \
-	(xr) = 1.0  / (xr); \
-}
-
-#define bli_cinvertris( xr, xi ) \
-{ \
-	float  s    = bli_fmaxabs( (xr), (xi) ); \
-	float  xr_s = (xr) / s; \
-	float  xi_s = (xi) / s; \
-	float  temp = ( xr_s * (xr) + xi_s * (xi) ); \
-	(xr)        =  xr_s / temp; \
-	(xi)        = -xi_s / temp; \
-}
-
-#define bli_zinvertris( xr, xi ) \
-{ \
-	double s    = bli_fmaxabs( (xr), (xi) ); \
-	double xr_s = (xr) / s; \
-	double xi_s = (xi) / s; \
-	double temp = ( xr_s * (xr) + xi_s * (xi) ); \
-	(xr)        =  xr_s / temp; \
-	(xi)        = -xi_s / temp; \
-}
-
-#endif
diff --git a/frame/include/level0/ri/bli_invscaljris.h b/frame/include/level0/ri/bli_invscaljris.h
deleted file mode 100644
index 43d98cd78..000000000
--- a/frame/include/level0/ri/bli_invscaljris.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_INVSCALJRIS_H
-#define BLIS_INVSCALJRIS_H
-
-// invscaljris
-
-#define bli_sinvscaljris( ar, ai, xr, xi )  bli_sinvscalris( (ar), -(ai), (xr), (xi) )
-#define bli_dinvscaljris( ar, ai, xr, xi )  bli_dinvscalris( (ar), -(ai), (xr), (xi) )
-#define bli_cinvscaljris( ar, ai, xr, xi )  bli_cinvscalris( (ar), -(ai), (xr), (xi) )
-#define bli_zinvscaljris( ar, ai, xr, xi )  bli_zinvscalris( (ar), -(ai), (xr), (xi) )
-
-#define bli_scinvscaljris( ar, ai, xr, xi )  bli_scinvscalris( (ar), -(ai), (xr), (xi) )
-#define bli_dzinvscaljris( ar, ai, xr, xi )  bli_dzinvscalris( (ar), -(ai), (xr), (xi) )
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_invscalris.h b/frame/include/level0/ri/bli_invscalris.h
deleted file mode 100644
index 1f846ee78..000000000
--- a/frame/include/level0/ri/bli_invscalris.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_INVSCALRIS_H
-#define BLIS_INVSCALRIS_H
-
-// invscalris
-
-#define bli_sinvscalris( ar, ai, xr, xi ) \
-{ \
-	(xr) /= (ar); \
-}
-
-#define bli_dinvscalris( ar, ai, xr, xi ) \
-{ \
-	(xr) /= (ar); \
-}
-
-#define bli_cinvscalris( ar, ai, xr, xi ) \
-{ \
-	float  s     = bli_fmaxabs( (ar), (ai) ); \
-	float  ar_s  = (ar) / s; \
-	float  ai_s  = (ai) / s; \
-	float  xrt   = (xr); \
-	float  temp  = ( ar_s * (ar) + ai_s * (ai) ); \
-	(xr)         = ( (xrt) * ar_s + (xi)  * ai_s ) / temp; \
-	(xi)         = ( (xi)  * ar_s - (xrt) * ai_s ) / temp; \
-}
-
-#define bli_zinvscalris( ar, ai, xr, xi ) \
-{ \
-	double s     = bli_fmaxabs( (ar), (ai) ); \
-	double ar_s  = (ar) / s; \
-	double ai_s  = (ai) / s; \
-	double xrt   = (xr); \
-	double temp  = ( ar_s * (ar) + ai_s * (ai) ); \
-	(xr)         = ( (xrt) * ar_s + (xi)  * ai_s ) / temp; \
-	(xi)         = ( (xi)  * ar_s - (xrt) * ai_s ) / temp; \
-}
-
-#define bli_scinvscalris( ar, ai, xr, xi ) \
-{ \
-	(xr) /= (ar); \
-	(xi) /= (ar); \
-}
-
-#define bli_dzinvscalris( ar, ai, xr, xi ) \
-{ \
-	(xr) /= (ar); \
-	(xi) /= (ar); \
-}
-
-#endif
diff --git a/frame/include/level0/ri/bli_neg2ris.h b/frame/include/level0/ri/bli_neg2ris.h
deleted file mode 100644
index 860b144cf..000000000
--- a/frame/include/level0/ri/bli_neg2ris.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_NEG2RIS_H
-#define BLIS_NEG2RIS_H
-
-// neg2ris
-
-#define bli_sneg2ris( ar, ai, br, bi ) \
-{ \
-	(br) = -(ar); \
-}
-
-#define bli_dneg2ris( ar, ai, br, bi ) \
-{ \
-	(br) = -(ar); \
-}
-
-#define bli_cneg2ris( ar, ai, br, bi ) \
-{ \
-	(br) = -(ar); \
-	(bi) = -(ai); \
-}
-
-#define bli_zneg2ris( ar, ai, br, bi ) \
-{ \
-	(br) = -(ar); \
-	(bi) = -(ai); \
-}
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_scal2jris.h b/frame/include/level0/ri/bli_scal2jris.h
deleted file mode 100644
index f3b71ed2e..000000000
--- a/frame/include/level0/ri/bli_scal2jris.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2JRIS_H
-#define BLIS_SCAL2JRIS_H
-
-// scal2jris
-
-#define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) = (ar) * (xr); \
-    ( void )ai; ( void )xi; ( void )yi; \
-}
-
-#define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) = (ar) * (xr) + (ai) * (xi); \
-	(yi) = (ai) * (xr) - (ar) * (xi); \
-}
-
-#define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) = (ar) * (xr) + (ai) * (xi); \
-    ( void )yi; \
-}
-
-#define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) = (ar) *  (xr); \
-	(yi) = (ar) * -(xi); \
-    ( void )ai; \
-}
-
-#define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) = (ar) * (xr); \
-	(yi) = (ai) * (xr); \
-    ( void )xi; \
-}
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of y.
-
-// -- (axy) = (??s) ------------------------------------------------------------
-
-#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-
-#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-
-#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
-
-#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
-
-// -- (axy) = (??d) ------------------------------------------------------------
-
-#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-
-#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-
-#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
-
-#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
-
-#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
-
-#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
-
-#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
-
-#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
-
-#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
-
-#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
-
-
-
-#define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi )
-#define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi )
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_scal2ris.h b/frame/include/level0/ri/bli_scal2ris.h
deleted file mode 100644
index e30fd9789..000000000
--- a/frame/include/level0/ri/bli_scal2ris.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2RIS_H
-#define BLIS_SCAL2RIS_H
-
-// scal2ris
-
-#define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) = (ar) * (xr); \
-    ( void )ai; ( void )xi; ( void )yi; \
-}
-
-#define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) = (ar) * (xr) - (ai) * (xi); \
-	(yi) = (ai) * (xr) + (ar) * (xi); \
-}
-
-#define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) = (ar) * (xr) - (ai) * (xi); \
-    ( void )yi; \
-}
-
-#define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) = (ar) * (xr); \
-	(yi) = (ar) * (xi); \
-    ( void )ai; \
-}
-
-#define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \
-{ \
-	(yr) = (ar) * (xr); \
-	(yi) = (ai) * (xr); \
-    ( void )xi; \
-}
-
-// Notes:
-// - The first char encodes the type of a.
-// - The second char encodes the type of x.
-// - The third char encodes the type of y.
-
-// -- (axy) = (??s) ------------------------------------------------------------
-
-#define bli_sssscal2ris  bli_rxscal2ris
-#define bli_dssscal2ris  bli_rxscal2ris
-#define bli_cssscal2ris  bli_rxscal2ris
-#define bli_zssscal2ris  bli_rxscal2ris
-
-#define bli_sdsscal2ris  bli_rxscal2ris
-#define bli_ddsscal2ris  bli_rxscal2ris
-#define bli_cdsscal2ris  bli_rxscal2ris
-#define bli_zdsscal2ris  bli_rxscal2ris
-
-#define bli_scsscal2ris  bli_rxscal2ris
-#define bli_dcsscal2ris  bli_rxscal2ris
-#define bli_ccsscal2ris  bli_roscal2ris
-#define bli_zcsscal2ris  bli_roscal2ris
-
-#define bli_szsscal2ris  bli_rxscal2ris
-#define bli_dzsscal2ris  bli_rxscal2ris
-#define bli_czsscal2ris  bli_roscal2ris
-#define bli_zzsscal2ris  bli_roscal2ris
-
-// -- (axy) = (??d) ------------------------------------------------------------
-
-#define bli_ssdscal2ris  bli_rxscal2ris
-#define bli_dsdscal2ris  bli_rxscal2ris
-#define bli_csdscal2ris  bli_rxscal2ris
-#define bli_zsdscal2ris  bli_rxscal2ris
-
-#define bli_sddscal2ris  bli_rxscal2ris
-#define bli_dddscal2ris  bli_rxscal2ris
-#define bli_cddscal2ris  bli_rxscal2ris
-#define bli_zddscal2ris  bli_rxscal2ris
-
-#define bli_scdscal2ris  bli_rxscal2ris
-#define bli_dcdscal2ris  bli_rxscal2ris
-#define bli_ccdscal2ris  bli_roscal2ris
-#define bli_zcdscal2ris  bli_roscal2ris
-
-#define bli_szdscal2ris  bli_rxscal2ris
-#define bli_dzdscal2ris  bli_rxscal2ris
-#define bli_czdscal2ris  bli_roscal2ris
-#define bli_zzdscal2ris  bli_roscal2ris
-
-// -- (axy) = (??c) ------------------------------------------------------------
-
-#define bli_sscscal2ris  bli_rxscal2ris
-#define bli_dscscal2ris  bli_rxscal2ris
-#define bli_cscscal2ris  bli_rcscal2ris
-#define bli_zscscal2ris  bli_rcscal2ris
-
-#define bli_sdcscal2ris  bli_rxscal2ris
-#define bli_ddcscal2ris  bli_rxscal2ris
-#define bli_cdcscal2ris  bli_rcscal2ris
-#define bli_zdcscal2ris  bli_rcscal2ris
-
-#define bli_sccscal2ris  bli_crscal2ris
-#define bli_dccscal2ris  bli_crscal2ris
-#define bli_cccscal2ris  bli_cxscal2ris
-#define bli_zccscal2ris  bli_cxscal2ris
-
-#define bli_szcscal2ris  bli_crscal2ris
-#define bli_dzcscal2ris  bli_crscal2ris
-#define bli_czcscal2ris  bli_cxscal2ris
-#define bli_zzcscal2ris  bli_cxscal2ris
-
-// -- (axy) = (??z) ------------------------------------------------------------
-
-#define bli_sszscal2ris  bli_rxscal2ris
-#define bli_dszscal2ris  bli_rxscal2ris
-#define bli_cszscal2ris  bli_rcscal2ris
-#define bli_zszscal2ris  bli_rcscal2ris
-
-#define bli_sdzscal2ris  bli_rxscal2ris
-#define bli_ddzscal2ris  bli_rxscal2ris
-#define bli_cdzscal2ris  bli_rcscal2ris
-#define bli_zdzscal2ris  bli_rcscal2ris
-
-#define bli_sczscal2ris  bli_crscal2ris
-#define bli_dczscal2ris  bli_crscal2ris
-#define bli_cczscal2ris  bli_cxscal2ris
-#define bli_zczscal2ris  bli_cxscal2ris
-
-#define bli_szzscal2ris  bli_crscal2ris
-#define bli_dzzscal2ris  bli_crscal2ris
-#define bli_czzscal2ris  bli_cxscal2ris
-#define bli_zzzscal2ris  bli_cxscal2ris
-
-
-
-#define bli_sscal2ris    bli_sssscal2ris
-#define bli_dscal2ris    bli_dddscal2ris
-#define bli_cscal2ris    bli_cccscal2ris
-#define bli_zscal2ris    bli_zzzscal2ris
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_scal2ris_mxn.h b/frame/include/level0/ri/bli_scal2ris_mxn.h
deleted file mode 100644
index 85b242146..000000000
--- a/frame/include/level0/ri/bli_scal2ris_mxn.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCAL2RIS_MXN_H
-#define BLIS_SCAL2RIS_MXN_H
-
-// scal2ris_mxn
-
-BLIS_INLINE void bli_cscal2ris_mxn
-     (
-       const conj_t       conjx,
-       const dim_t        m,
-       const dim_t        n,
-       scomplex* restrict alpha,
-       scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y
-     )
-{
-	float*  restrict alpha_r = ( float*  )alpha; \
-	float*  restrict alpha_i = ( float*  )alpha + 1; \
-	float*  restrict x_r     = ( float*  )x; \
-	float*  restrict x_i     = ( float*  )x + 1; \
-	float*  restrict y_r     = ( float*  )y; \
-	float*  restrict y_i     = ( float*  )y + is_y; \
-	const dim_t      incx2   = 2*rs_x; \
-	const dim_t      ldx2    = 2*cs_x; \
-
-	/* Treat the micro-panel as panel_dim x panel_len and column-stored
-	   (unit row stride). */ \
-
-	if ( bli_is_conj( conjx ) )
-	{
-		for ( dim_t j = 0; j < n; ++j )
-		for ( dim_t i = 0; i < m; ++i )
-		{
-			float*  restrict chi11_r = x_r + (i  )*incx2 + (j  )*ldx2;
-			float*  restrict chi11_i = x_i + (i  )*incx2 + (j  )*ldx2;
-			float*  restrict psi11_r = y_r + (i  )*1     + (j  )*cs_y;
-			float*  restrict psi11_i = y_i + (i  )*1     + (j  )*cs_y;
-
-			bli_cscal2jris
-			(
-			  *alpha_r,
-			  *alpha_i,
-			  *chi11_r,
-			  *chi11_i,
-			  *psi11_r,
-			  *psi11_i
-			);
-		}
-	}
-	else /* if ( bli_is_noconj( conjx ) ) */
-	{
-		for ( dim_t j = 0; j < n; ++j )
-		for ( dim_t i = 0; i < m; ++i )
-		{
-			float*  restrict chi11_r = x_r + (i  )*incx2 + (j  )*ldx2;
-			float*  restrict chi11_i = x_i + (i  )*incx2 + (j  )*ldx2;
-			float*  restrict psi11_r = y_r + (i  )*1     + (j  )*cs_y;
-			float*  restrict psi11_i = y_i + (i  )*1     + (j  )*cs_y;
-
-			bli_cscal2ris
-			(
-			  *alpha_r,
-			  *alpha_i,
-			  *chi11_r,
-			  *chi11_i,
-			  *psi11_r,
-			  *psi11_i
-			);
-		}
-	}
-}
-
-BLIS_INLINE void bli_zscal2ris_mxn
-     (
-       const conj_t       conjx,
-       const dim_t        m,
-       const dim_t        n,
-       dcomplex* restrict alpha,
-       dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
-       dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y
-     )
-{
-	double* restrict alpha_r = ( double* )alpha; \
-	double* restrict alpha_i = ( double* )alpha + 1; \
-	double* restrict x_r     = ( double* )x; \
-	double* restrict x_i     = ( double* )x + 1; \
-	double* restrict y_r     = ( double* )y; \
-	double* restrict y_i     = ( double* )y + is_y; \
-	const dim_t      incx2   = 2*rs_x; \
-	const dim_t      ldx2    = 2*cs_x; \
-
-	/* Treat the micro-panel as panel_dim x panel_len and column-stored
-	   (unit row stride). */ \
-
-	if ( bli_is_conj( conjx ) )
-	{
-		for ( dim_t j = 0; j < n; ++j )
-		for ( dim_t i = 0; i < m; ++i )
-		{
-			double* restrict chi11_r = x_r + (i  )*incx2 + (j  )*ldx2;
-			double* restrict chi11_i = x_i + (i  )*incx2 + (j  )*ldx2;
-			double* restrict psi11_r = y_r + (i  )*1     + (j  )*cs_y;
-			double* restrict psi11_i = y_i + (i  )*1     + (j  )*cs_y;
-
-			bli_zscal2jris
-			(
-			  *alpha_r,
-			  *alpha_i,
-			  *chi11_r,
-			  *chi11_i,
-			  *psi11_r,
-			  *psi11_i
-			);
-		}
-	}
-	else /* if ( bli_is_noconj( conjx ) ) */
-	{
-		for ( dim_t j = 0; j < n; ++j )
-		for ( dim_t i = 0; i < m; ++i )
-		{
-			double* restrict chi11_r = x_r + (i  )*incx2 + (j  )*ldx2;
-			double* restrict chi11_i = x_i + (i  )*incx2 + (j  )*ldx2;
-			double* restrict psi11_r = y_r + (i  )*1     + (j  )*cs_y;
-			double* restrict psi11_i = y_i + (i  )*1     + (j  )*cs_y;
-
-			bli_zscal2ris
-			(
-			  *alpha_r,
-			  *alpha_i,
-			  *chi11_r,
-			  *chi11_i,
-			  *psi11_r,
-			  *psi11_i
-			);
-		}
-	}
-}
-
-
-#endif
diff --git a/frame/include/level0/ri/bli_scalcjris.h b/frame/include/level0/ri/bli_scalcjris.h
deleted file mode 100644
index 8050a924b..000000000
--- a/frame/include/level0/ri/bli_scalcjris.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCALCJRIS_H
-#define BLIS_SCALCJRIS_H
-
-// scalcjris
-
-#define bli_sscalcjris( conj, ar, ai, xr, xi ) \
-{ \
-	bli_sscalris( (ar), (ai), (xr), (xi) ); \
-}
-
-#define bli_dscalcjris( conj, ar, ai, xr, xi ) \
-{ \
-	bli_dscalris( (ar), (ai), (xr), (xi) ); \
-}
-
-#define bli_cscalcjris( conj, ar, ai, xr, xi ) \
-{ \
-	if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \
-	else                       { bli_cscalris(  (ar), (ai), (xr), (xi) ); } \
-}
-
-#define bli_zscalcjris( conj, ar, ai, xr, xi ) \
-{ \
-	if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \
-	else                       { bli_zscalris(  (ar), (ai), (xr), (xi) ); } \
-}
-
-#define bli_iscalcjris( conj, ar, ai, xr, xi ) \
-{ \
-	bli_iscalris( (ar), (xi), (xr), (xi) ); \
-}
-
-#define bli_scscalcjris( conj, ar, ai, xr, xi ) \
-{ \
-	bli_scscalris( (ar), (ai), (xr), (xi) ); \
-}
-
-#define bli_dzscalcjris( conj, ar, ai, xr, xi ) \
-{ \
-	bli_dzscalris( (ar), (ai), (xr), (xi) ); \
-}
-
-#endif
diff --git a/frame/include/level0/ri/bli_scaljris.h b/frame/include/level0/ri/bli_scaljris.h
deleted file mode 100644
index 29722c1b5..000000000
--- a/frame/include/level0/ri/bli_scaljris.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCALJRIS_H
-#define BLIS_SCALJRIS_H
-
-// scaljris
-
-#define bli_sscaljris( ar, ai, xr, xi )  bli_sscalris( (ar), -(ai), (xr), (xi) )
-#define bli_dscaljris( ar, ai, xr, xi )  bli_dscalris( (ar), -(ai), (xr), (xi) )
-#define bli_cscaljris( ar, ai, xr, xi )  bli_cscalris( (ar), -(ai), (xr), (xi) )
-#define bli_zscaljris( ar, ai, xr, xi )  bli_zscalris( (ar), -(ai), (xr), (xi) )
-
-#define bli_scscaljris( ar, ai, xr, xi )  bli_scscalris( (ar), -(ai), (xr), (xi) )
-#define bli_dzscaljris( ar, ai, xr, xi )  bli_dzscalris( (ar), -(ai), (xr), (xi) )
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_scalris.h b/frame/include/level0/ri/bli_scalris.h
deleted file mode 100644
index e5eeb19ba..000000000
--- a/frame/include/level0/ri/bli_scalris.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCALRIS_H
-#define BLIS_SCALRIS_H
-
-// scalris
-
-#define bli_sscalris( ar, ai, xr, xi ) \
-{ \
-	(xr)      = (ar) * (xr); \
-}
-
-#define bli_dscalris( ar, ai, xr, xi ) \
-{ \
-	(xr)      = (ar) * (xr); \
-}
-
-#define bli_cscalris( ar, ai, xr, xi ) \
-{ \
-	float  yr = (ar) * (xr) - (ai) * (xi); \
-	float  yi = (ai) * (xr) + (ar) * (xi); \
-	(xr) = yr; \
-	(xi) = yi; \
-}
-
-#define bli_zscalris( ar, ai, xr, xi ) \
-{ \
-	double yr = (ar) * (xr) - (ai) * (xi); \
-	double yi = (ai) * (xr) + (ar) * (xi); \
-	(xr) = yr; \
-	(xi) = yi; \
-}
-
-#define bli_scscalris( ar, ai, xr, xi ) \
-{ \
-	(xr)      = (ar) * (xr); \
-	(xi)      = (ar) * (xi); \
-}
-
-#define bli_dzscalris( ar, ai, xr, xi ) \
-{ \
-	(xr)      = (ar) * (xr); \
-	(xi)      = (ar) * (xi); \
-}
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_scalris_mxn_uplo.h b/frame/include/level0/ri/bli_scalris_mxn_uplo.h
deleted file mode 100644
index ed2b7d18e..000000000
--- a/frame/include/level0/ri/bli_scalris_mxn_uplo.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SCALRIS_MXN_UPLO_H
-#define BLIS_SCALRIS_MXN_UPLO_H
-
-// scalris_mxn_u
-
-#define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	for ( _i = 0; _i < m; ++_i ) \
-	{ \
-		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
-		{ \
-			bli_cscalris( *(ar), \
-			              *(ai), \
-			              *((xr) + _i*rs_x + _j*cs_x), \
-			              *((xi) + _i*rs_x + _j*cs_x) ); \
-		} \
-	} \
-}
-
-#define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	for ( _i = 0; _i < m; ++_i ) \
-	{ \
-		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
-		{ \
-			bli_zscalris( *(ar), \
-			              *(ai), \
-			              *((xr) + _i*rs_x + _j*cs_x), \
-			              *((xi) + _i*rs_x + _j*cs_x) ); \
-		} \
-	} \
-}
-
-// scalris_mxn_l
-
-#define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	for ( _i = 0; _i < m; ++_i ) \
-	{ \
-		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
-		{ \
-			bli_cscalris( *(ar), \
-			              *(ai), \
-			              *((xr) + _i*rs_x + _j*cs_x), \
-			              *((xi) + _i*rs_x + _j*cs_x) ); \
-		} \
-	} \
-}
-
-#define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	for ( _i = 0; _i < m; ++_i ) \
-	{ \
-		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
-		{ \
-			bli_zscalris( *(ar), \
-			              *(ai), \
-			              *((xr) + _i*rs_x + _j*cs_x), \
-			              *((xi) + _i*rs_x + _j*cs_x) ); \
-		} \
-	} \
-}
-
-#endif
diff --git a/frame/include/level0/ri/bli_set0ris.h b/frame/include/level0/ri/bli_set0ris.h
deleted file mode 100644
index a4e0ed47f..000000000
--- a/frame/include/level0/ri/bli_set0ris.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SET0RIS_H
-#define BLIS_SET0RIS_H
-
-// set0ris
-
-#define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi )
-#define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi )
-#define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi )
-#define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi )
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_subjris.h b/frame/include/level0/ri/bli_subjris.h
deleted file mode 100644
index a35fdbc72..000000000
--- a/frame/include/level0/ri/bli_subjris.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SUBJRIS_H
-#define BLIS_SUBJRIS_H
-
-// subjris
-
-#define bli_ssubjris( ar, ai, xr, xi )  bli_ssubris( (ar), -(ai), (xr), (xi) )
-#define bli_dsubjris( ar, ai, xr, xi )  bli_dsubris( (ar), -(ai), (xr), (xi) )
-#define bli_csubjris( ar, ai, xr, xi )  bli_csubris( (ar), -(ai), (xr), (xi) )
-#define bli_zsubjris( ar, ai, xr, xi )  bli_zsubris( (ar), -(ai), (xr), (xi) )
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_subris.h b/frame/include/level0/ri/bli_subris.h
deleted file mode 100644
index 4c340604d..000000000
--- a/frame/include/level0/ri/bli_subris.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SUBRIS_H
-#define BLIS_SUBRIS_H
-
-// subris
-
-#define bli_ssubris( ar, ai, xr, xi ) \
-{ \
-	(xr) = (xr) - (ar); \
-}
-
-#define bli_dsubris( ar, ai, xr, xi ) \
-{ \
-	(xr) = (xr) - (ar); \
-}
-
-#define bli_csubris( ar, ai, xr, xi ) \
-{ \
-	(xr) = (xr) - (ar); \
-	(xi) = (xi) - (ai); \
-}
-
-#define bli_zsubris( ar, ai, xr, xi ) \
-{ \
-	(xr) = (xr) - (ar); \
-	(xi) = (xi) - (ai); \
-}
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_swapris.h b/frame/include/level0/ri/bli_swapris.h
deleted file mode 100644
index 5b080fa76..000000000
--- a/frame/include/level0/ri/bli_swapris.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_SWAPRIS_H
-#define BLIS_SWAPRIS_H
-
-// swapris
-
-#define bli_sswapris( ar, ai, br, bi ) \
-{ \
-	float tr, ti; \
-\
-	bli_scopyris( (br), (bi), (tr), (ti) ); \
-	bli_scopyris( (ar), (ai), (br), (bi) ); \
-	bli_scopyris( (tr), (ti), (ar), (ai) ); \
-}
-
-#define bli_dswapris( ar, ai, br, bi ) \
-{ \
-	double tr, ti; \
-\
-	bli_dcopyris( (br), (bi), (tr), (ti) ); \
-	bli_dcopyris( (ar), (ai), (br), (bi) ); \
-	bli_dcopyris( (tr), (ti), (ar), (ai) ); \
-}
-
-#define bli_cswapris( ar, ai, br, bi ) \
-{ \
-	scomplex tr, ti; \
-\
-	bli_ccopyris( (br), (bi), (tr), (ti) ); \
-	bli_ccopyris( (ar), (ai), (br), (bi) ); \
-	bli_ccopyris( (tr), (ti), (ar), (ai) ); \
-}
-
-#define bli_zswapris( ar, ai, br, bi ) \
-{ \
-	dcomplex tr, ti; \
-\
-	bli_zcopyris( (br), (bi), (tr), (ti) ); \
-	bli_zcopyris( (ar), (ai), (br), (bi) ); \
-	bli_zcopyris( (tr), (ti), (ar), (ai) ); \
-}
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_xpbyjris.h b/frame/include/level0/ri/bli_xpbyjris.h
deleted file mode 100644
index e441a2b45..000000000
--- a/frame/include/level0/ri/bli_xpbyjris.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_XPBYJRIS_H
-#define BLIS_XPBYJRIS_H
-
-// xpbyjris
-
-#define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \
-{ \
-	(yr) = (xr) + (br) * (yr); \
-}
-
-#define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \
-{ \
-	const __typeof__(yr) yt_r =  (xr) + (br) * (yr) - (bi) * (yi); \
-	const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \
-	(yr) = yt_r; \
-	(yi) = yt_i; \
-}
-
-#define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \
-{ \
-	const __typeof__(yr) yt_r =  (xr) + (br) * (yr); \
-	const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \
-	(yr) = yt_r; \
-	(yi) = yt_i; \
-}
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of b.
-// - The third char encodes the type of y.
-
-// -- (xby) = (??s) ------------------------------------------------------------
-
-#define bli_sssxpbyjris  bli_rxxpbyjris
-#define bli_dssxpbyjris  bli_rxxpbyjris
-#define bli_cssxpbyjris  bli_rxxpbyjris
-#define bli_zssxpbyjris  bli_rxxpbyjris
-
-#define bli_sdsxpbyjris  bli_rxxpbyjris
-#define bli_ddsxpbyjris  bli_rxxpbyjris
-#define bli_cdsxpbyjris  bli_rxxpbyjris
-#define bli_zdsxpbyjris  bli_rxxpbyjris
-
-#define bli_scsxpbyjris  bli_rxxpbyjris
-#define bli_dcsxpbyjris  bli_rxxpbyjris
-#define bli_ccsxpbyjris  bli_rxxpbyjris
-#define bli_zcsxpbyjris  bli_rxxpbyjris
-
-#define bli_szsxpbyjris  bli_rxxpbyjris
-#define bli_dzsxpbyjris  bli_rxxpbyjris
-#define bli_czsxpbyjris  bli_rxxpbyjris
-#define bli_zzsxpbyjris  bli_rxxpbyjris
-
-// -- (xby) = (??d) ------------------------------------------------------------
-
-#define bli_ssdxpbyjris  bli_rxxpbyjris
-#define bli_dsdxpbyjris  bli_rxxpbyjris
-#define bli_csdxpbyjris  bli_rxxpbyjris
-#define bli_zsdxpbyjris  bli_rxxpbyjris
-
-#define bli_sddxpbyjris  bli_rxxpbyjris
-#define bli_dddxpbyjris  bli_rxxpbyjris
-#define bli_cddxpbyjris  bli_rxxpbyjris
-#define bli_zddxpbyjris  bli_rxxpbyjris
-
-#define bli_scdxpbyjris  bli_rxxpbyjris
-#define bli_dcdxpbyjris  bli_rxxpbyjris
-#define bli_ccdxpbyjris  bli_rxxpbyjris
-#define bli_zcdxpbyjris  bli_rxxpbyjris
-
-#define bli_szdxpbyjris  bli_rxxpbyjris
-#define bli_dzdxpbyjris  bli_rxxpbyjris
-#define bli_czdxpbyjris  bli_rxxpbyjris
-#define bli_zzdxpbyjris  bli_rxxpbyjris
-
-// -- (xby) = (??c) ------------------------------------------------------------
-
-#define bli_sscxpbyjris  bli_rxxpbyjris
-#define bli_dscxpbyjris  bli_rxxpbyjris
-#define bli_cscxpbyjris  bli_crxpbyjris
-#define bli_zscxpbyjris  bli_crxpbyjris
-
-#define bli_sdcxpbyjris  bli_rxxpbyjris
-#define bli_ddcxpbyjris  bli_rxxpbyjris
-#define bli_cdcxpbyjris  bli_crxpbyjris
-#define bli_zdcxpbyjris  bli_crxpbyjris
-
-#define bli_sccxpbyjris  bli_cxxpbyjris
-#define bli_dccxpbyjris  bli_cxxpbyjris
-#define bli_cccxpbyjris  bli_cxxpbyjris
-#define bli_zccxpbyjris  bli_cxxpbyjris
-
-#define bli_szcxpbyjris  bli_cxxpbyjris
-#define bli_dzcxpbyjris  bli_cxxpbyjris
-#define bli_czcxpbyjris  bli_cxxpbyjris
-#define bli_zzcxpbyjris  bli_cxxpbyjris
-
-// -- (xby) = (??z) ------------------------------------------------------------
-
-#define bli_sszxpbyjris  bli_rxxpbyjris
-#define bli_dszxpbyjris  bli_rxxpbyjris
-#define bli_cszxpbyjris  bli_crxpbyjris
-#define bli_zszxpbyjris  bli_crxpbyjris
-
-#define bli_sdzxpbyjris  bli_rxxpbyjris
-#define bli_ddzxpbyjris  bli_rxxpbyjris
-#define bli_cdzxpbyjris  bli_crxpbyjris
-#define bli_zdzxpbyjris  bli_crxpbyjris
-
-#define bli_sczxpbyjris  bli_cxxpbyjris
-#define bli_dczxpbyjris  bli_cxxpbyjris
-#define bli_cczxpbyjris  bli_cxxpbyjris
-#define bli_zczxpbyjris  bli_cxxpbyjris
-
-#define bli_szzxpbyjris  bli_cxxpbyjris
-#define bli_dzzxpbyjris  bli_cxxpbyjris
-#define bli_czzxpbyjris  bli_cxxpbyjris
-#define bli_zzzxpbyjris  bli_cxxpbyjris
-
-
-
-#define bli_sxpbyjris    bli_sssxpbyjris
-#define bli_dxpbyjris    bli_dddxpbyjris
-#define bli_cxpbyjris    bli_cccxpbyjris
-#define bli_zxpbyjris    bli_zzzxpbyjris
-
-#endif
-
diff --git a/frame/include/level0/ri/bli_xpbyris.h b/frame/include/level0/ri/bli_xpbyris.h
deleted file mode 100644
index 4d693de92..000000000
--- a/frame/include/level0/ri/bli_xpbyris.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_XPBYRIS_H
-#define BLIS_XPBYRIS_H
-
-// xpbyris
-
-#define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \
-{ \
-	(yr) = (xr) + (br) * (yr); \
-}
-
-#define bli_cxxpbyris( xr, xi, br, bi, yr, yi ) \
-{ \
-	const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \
-	const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \
-	(yr) = yt_r; \
-	(yi) = yt_i; \
-}
-
-#define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \
-{ \
-	const __typeof__(yr) yt_r = (xr) + (br) * (yr); \
-	const __typeof__(yi) yt_i = (xi) + (br) * (yi); \
-	(yr) = yt_r; \
-	(yi) = yt_i; \
-}
-
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of b.
-// - The third char encodes the type of y.
-
-// -- (xby) = (??s) ------------------------------------------------------------
-
-#define bli_sssxpbyris  bli_rxxpbyris
-#define bli_dssxpbyris  bli_rxxpbyris
-#define bli_cssxpbyris  bli_rxxpbyris
-#define bli_zssxpbyris  bli_rxxpbyris
-
-#define bli_sdsxpbyris  bli_rxxpbyris
-#define bli_ddsxpbyris  bli_rxxpbyris
-#define bli_cdsxpbyris  bli_rxxpbyris
-#define bli_zdsxpbyris  bli_rxxpbyris
-
-#define bli_scsxpbyris  bli_rxxpbyris
-#define bli_dcsxpbyris  bli_rxxpbyris
-#define bli_ccsxpbyris  bli_rxxpbyris
-#define bli_zcsxpbyris  bli_rxxpbyris
-
-#define bli_szsxpbyris  bli_rxxpbyris
-#define bli_dzsxpbyris  bli_rxxpbyris
-#define bli_czsxpbyris  bli_rxxpbyris
-#define bli_zzsxpbyris  bli_rxxpbyris
-
-// -- (xby) = (??d) ------------------------------------------------------------
-
-#define bli_ssdxpbyris  bli_rxxpbyris
-#define bli_dsdxpbyris  bli_rxxpbyris
-#define bli_csdxpbyris  bli_rxxpbyris
-#define bli_zsdxpbyris  bli_rxxpbyris
-
-#define bli_sddxpbyris  bli_rxxpbyris
-#define bli_dddxpbyris  bli_rxxpbyris
-#define bli_cddxpbyris  bli_rxxpbyris
-#define bli_zddxpbyris  bli_rxxpbyris
-
-#define bli_scdxpbyris  bli_rxxpbyris
-#define bli_dcdxpbyris  bli_rxxpbyris
-#define bli_ccdxpbyris  bli_rxxpbyris
-#define bli_zcdxpbyris  bli_rxxpbyris
-
-#define bli_szdxpbyris  bli_rxxpbyris
-#define bli_dzdxpbyris  bli_rxxpbyris
-#define bli_czdxpbyris  bli_rxxpbyris
-#define bli_zzdxpbyris  bli_rxxpbyris
-
-// -- (xby) = (??c) ------------------------------------------------------------
-
-#define bli_sscxpbyris  bli_rxxpbyris
-#define bli_dscxpbyris  bli_rxxpbyris
-#define bli_cscxpbyris  bli_crxpbyris
-#define bli_zscxpbyris  bli_crxpbyris
-
-#define bli_sdcxpbyris  bli_rxxpbyris
-#define bli_ddcxpbyris  bli_rxxpbyris
-#define bli_cdcxpbyris  bli_crxpbyris
-#define bli_zdcxpbyris  bli_crxpbyris
-
-#define bli_sccxpbyris  bli_cxxpbyris
-#define bli_dccxpbyris  bli_cxxpbyris
-#define bli_cccxpbyris  bli_cxxpbyris
-#define bli_zccxpbyris  bli_cxxpbyris
-
-#define bli_szcxpbyris  bli_cxxpbyris
-#define bli_dzcxpbyris  bli_cxxpbyris
-#define bli_czcxpbyris  bli_cxxpbyris
-#define bli_zzcxpbyris  bli_cxxpbyris
-
-// -- (xby) = (??z) ------------------------------------------------------------
-
-#define bli_sszxpbyris  bli_rxxpbyris
-#define bli_dszxpbyris  bli_rxxpbyris
-#define bli_cszxpbyris  bli_crxpbyris
-#define bli_zszxpbyris  bli_crxpbyris
-
-#define bli_sdzxpbyris  bli_rxxpbyris
-#define bli_ddzxpbyris  bli_rxxpbyris
-#define bli_cdzxpbyris  bli_crxpbyris
-#define bli_zdzxpbyris  bli_crxpbyris
-
-#define bli_sczxpbyris  bli_cxxpbyris
-#define bli_dczxpbyris  bli_cxxpbyris
-#define bli_cczxpbyris  bli_cxxpbyris
-#define bli_zczxpbyris  bli_cxxpbyris
-
-#define bli_szzxpbyris  bli_cxxpbyris
-#define bli_dzzxpbyris  bli_cxxpbyris
-#define bli_czzxpbyris  bli_cxxpbyris
-#define bli_zzzxpbyris  bli_cxxpbyris
-
-
-
-#define bli_sxpbyris    bli_sssxpbyris
-#define bli_dxpbyris    bli_dddxpbyris
-#define bli_cxpbyris    bli_cccxpbyris
-#define bli_zxpbyris    bli_zzzxpbyris
-
-#endif
-
diff --git a/frame/util/bli_util_check.c b/frame/util/bli_util_check.c
index 3fafb4e50..370619971 100644
--- a/frame/util/bli_util_check.c
+++ b/frame/util/bli_util_check.c
@@ -151,10 +151,33 @@ void PASTEMAC(opname,_check) \
 }
 
 GENFRONT( eqsc )
+
+
+#undef  GENFRONT
+#define GENFRONT( opname ) \
+/* Defines the PASTEMAC(opname,_check) argument checker for one operation name. */ \
+void PASTEMAC(opname,_check) \
+     ( \
+       const obj_t* chi, \
+       const obj_t* psi, \
+       const bool*  is  \
+     ) \
+{ \
+	bli_l0_xxbsc_check( chi, psi, is ); \
+/* After the shared check above, the datatype of each operand is checked in turn. */ \
+	err_t e_val; \
+/* The datatype of chi must pass bli_check_real_datatype(). */ \
+	e_val = bli_check_real_datatype( bli_obj_dt( chi ) ); \
+	bli_check_error_code( e_val ); \
+/* The datatype of psi must pass bli_check_real_datatype() as well. */ \
+	e_val = bli_check_real_datatype( bli_obj_dt( psi ) ); \
+	bli_check_error_code( e_val ); \
+}
+
 GENFRONT( ltsc )
-GENFRONT( ltesc )
+GENFRONT( lesc )
 GENFRONT( gtsc )
-GENFRONT( gtesc )
+GENFRONT( gesc )
 
 
 #undef  GENFRONT
diff --git a/frame/util/bli_util_check.h b/frame/util/bli_util_check.h
index 26986b52c..3d91a7c96 100644
--- a/frame/util/bli_util_check.h
+++ b/frame/util/bli_util_check.h
@@ -130,9 +130,9 @@ void PASTEMAC(opname,_check) \
 
 GENTPROT( eqsc )
 GENTPROT( ltsc )
-GENTPROT( ltesc )
+GENTPROT( lesc )
 GENTPROT( gtsc )
-GENTPROT( gtesc )
+GENTPROT( gesc )
 
 
 #undef  GENPROT
diff --git a/frame/util/bli_util_fpa.c b/frame/util/bli_util_fpa.c
index bbba052c6..c577ef0cd 100644
--- a/frame/util/bli_util_fpa.c
+++ b/frame/util/bli_util_fpa.c
@@ -89,12 +89,29 @@ PASTEMAC(opname,_qfp)( num_t dt ) \
 GENFRONT( eqsc )
 GENFRONT( eqv )
 GENFRONT( eqm )
-GENFRONT( ltsc )
-GENFRONT( ltesc )
-GENFRONT( gtsc )
-GENFRONT( gtesc )
 GENFRONT( fprintv )
 GENFRONT( fprintm )
 //GENFRONT( printv )
 //GENFRONT( printm )
 
+#undef  GENFRONT
+#define GENFRONT( opname ) \
+/* For one operation name: invokes GENARRAYRO_FPA and defines the PASTEMAC(opname,_qfp) query function. */ \
+/*
+GENARRAY_FPA( void_fp, opname ); \
+*/ \
+/* NOTE(review): presumably GENARRAYRO_FPA defines the PASTECH(opname,_fpa) array read below -- confirm against its definition. */ \
+GENARRAYRO_FPA( PASTECH(opname,_vft), \
+                PASTECH(opname) ); \
+/* Query function: returns the PASTECH(opname,_fpa) entry indexed by the datatype dt. */ \
+PASTECH(opname,_vft) \
+PASTEMAC(opname,_qfp)( num_t dt ) \
+{ \
+	return PASTECH(opname,_fpa)[ dt ]; \
+}
+
+GENFRONT( ltsc )
+GENFRONT( lesc )
+GENFRONT( gtsc )
+GENFRONT( gesc )
+
diff --git a/frame/util/bli_util_fpa.h b/frame/util/bli_util_fpa.h
index 5ee0f4adb..ee4ce2bd2 100644
--- a/frame/util/bli_util_fpa.h
+++ b/frame/util/bli_util_fpa.h
@@ -70,9 +70,9 @@ GENPROT( eqsc )
 GENPROT( eqv )
 GENPROT( eqm )
 GENPROT( ltsc )
-GENPROT( ltesc )
+GENPROT( lesc )
 GENPROT( gtsc )
-GENPROT( gtesc )
+GENPROT( gesc )
 GENPROT( fprintv )
 GENPROT( fprintm )
 //GENPROT( printv )
diff --git a/frame/util/bli_util_ft.h b/frame/util/bli_util_ft.h
index 2bb1943d7..c4af5be8b 100644
--- a/frame/util/bli_util_ft.h
+++ b/frame/util/bli_util_ft.h
@@ -248,7 +248,7 @@ typedef void (*PASTECH(ch,opname,tsuf)) \
 
 INSERT_GENTDEF( eqm )
 
-// ltsc, ltesc, gtsc, gtesc
+// ltsc, lesc, gtsc, gesc
 
 #undef  GENTDEF
 #define GENTDEF( ctype, ch, opname, tsuf ) \
@@ -261,9 +261,9 @@ typedef void (*PASTECH(ch,opname,tsuf)) \
      );
 
 INSERT_GENTDEF( ltsc )
-INSERT_GENTDEF( ltesc )
+INSERT_GENTDEF( lesc )
 INSERT_GENTDEF( gtsc )
-INSERT_GENTDEF( gtesc )
+INSERT_GENTDEF( gesc )
 
 #endif // #ifdef BLIS_OAPI_BASIC
 
diff --git a/frame/util/bli_util_oapi.c b/frame/util/bli_util_oapi.c
index 4810b6f00..f6a814421 100644
--- a/frame/util/bli_util_oapi.c
+++ b/frame/util/bli_util_oapi.c
@@ -395,7 +395,7 @@ void PASTEMAC(opname) \
 	/* Integer objects are handled separately. */ \
 	if ( bli_is_int( dt ) ) \
 	{ \
-		*is_eq = bli_ieqa( buf_chi, buf_psi ); \
+		*is_eq = bli_ieq( buf_chi, buf_psi ); \
 		return; \
 	} \
 \
@@ -571,9 +571,9 @@ void PASTEMAC(opname) \
 }
 
 GENFRONT( ltsc )
-GENFRONT( ltesc )
+GENFRONT( lesc )
 GENFRONT( gtsc )
-GENFRONT( gtesc )
+GENFRONT( gesc )
 
 
 #undef  GENFRONT
diff --git a/frame/util/bli_util_oapi.h b/frame/util/bli_util_oapi.h
index 2a1d700d8..cc159d8a7 100644
--- a/frame/util/bli_util_oapi.h
+++ b/frame/util/bli_util_oapi.h
@@ -154,9 +154,9 @@ GENPROT( eqsc )
 GENPROT( eqv )
 GENPROT( eqm )
 GENPROT( ltsc )
-GENPROT( ltesc )
+GENPROT( lesc )
 GENPROT( gtsc )
-GENPROT( gtesc )
+GENPROT( gesc )
 
 
 #undef  GENPROT
diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c
index c3521f244..5997828da 100644
--- a/frame/util/bli_util_tapi.c
+++ b/frame/util/bli_util_tapi.c
@@ -59,7 +59,7 @@ void PASTEMAC(ch,opname,EX_SUF) \
 	   zero and return early. */ \
 	if ( bli_zero_dim1( n ) ) \
 	{ \
-		PASTEMAC(chr,set0s)( *asum ); \
+		bli_tset0s( chr, *asum ); \
 		return; \
 	} \
 \
@@ -138,7 +138,7 @@ void PASTEMAC(ch,opname,EX_SUF) \
 	   early. */ \
 	if ( bli_zero_dim1( n ) ) \
 	{ \
-		PASTEMAC(chr,set0s)( *norm ); \
+		bli_tset0s( chr, *norm ); \
 		return; \
 	} \
 \
@@ -185,7 +185,7 @@ void PASTEMAC(ch,opname,EX_SUF) \
 	   early. */ \
 	if ( bli_zero_dim2( m, n ) ) \
 	{ \
-		PASTEMAC(chr,set0s)( *norm ); \
+		bli_tset0s( chr, *norm ); \
 		return; \
 	} \
 \
@@ -236,10 +236,10 @@ void PASTEMAC(ch,opname,EX_SUF) \
 	ctype_r norm; \
 \
 	/* Set the norm to zero. */ \
-	PASTEMAC(chr,set0s)( norm ); \
+	bli_tset0s( chr, norm ); \
 \
 	/* Iterate at least once, but continue iterating until the norm is not zero. */ \
-	while ( PASTEMAC(chr,eq0)( norm ) ) \
+	while ( bli_teq0s( chr, norm ) ) \
 	{ \
 		/* Invoke the helper variant, which loops over the appropriate kernel
 		   to implement the current operation. */ \
@@ -295,10 +295,10 @@ void PASTEMAC(ch,opname,EX_SUF) \
 	ctype_r norm; \
 \
 	/* Set the norm to zero. */ \
-	PASTEMAC(chr,set0s)( norm ); \
+	bli_tset0s( chr, norm ); \
 \
 	/* Iterate at least once, but continue iterating until the norm is not zero. */ \
-	while ( PASTEMAC(chr,eq0)( norm ) ) \
+	while ( bli_teq0s( chr, norm ) ) \
 	{ \
 		/* Invoke the helper variant, which loops over the appropriate kernel
 		   to implement the current operation. */ \
@@ -393,9 +393,9 @@ void PASTEMAC(ch,opname) \
 \
 	ctype chi_conj; \
 \
-	PASTEMAC(ch,copycjs)( conjchi, *chi, chi_conj ); \
+	bli_tcopycjs( ch,ch, conjchi, *chi, chi_conj ); \
 \
-	*is_eq = PASTEMAC(ch,eq)( chi_conj, *psi ); \
+	*is_eq = PASTEMAC(t,eqs)( ch,ch,ch, chi_conj, *psi ); \
 }
 
 INSERT_GENTFUNC_BASIC( eqsc )
@@ -475,8 +475,8 @@ void PASTEMAC(ch,opname) \
 INSERT_GENTFUNC_BASIC( eqm )
 
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, kername ) \
+#undef  GENTFUNCRO
+#define GENTFUNCRO( ctype, ch, opname, kername ) \
 \
 void PASTEMAC(ch,opname) \
      ( \
@@ -490,10 +490,10 @@ void PASTEMAC(ch,opname) \
 	*is = PASTEMAC(ch,kername)( *chi, *psi ); \
 }
 
-INSERT_GENTFUNC_BASIC( ltsc,  lt )
-INSERT_GENTFUNC_BASIC( ltesc, lte )
-INSERT_GENTFUNC_BASIC( gtsc,  gt )
-INSERT_GENTFUNC_BASIC( gtesc, gte )
+INSERT_GENTFUNCRO_BASIC( ltsc, lt )
+INSERT_GENTFUNCRO_BASIC( lesc, le )
+INSERT_GENTFUNCRO_BASIC( gtsc, gt )
+INSERT_GENTFUNCRO_BASIC( gesc, ge )
 
 
 #undef  GENTFUNC
diff --git a/frame/util/bli_util_tapi.h b/frame/util/bli_util_tapi.h
index 715b22a26..1b9db64bb 100644
--- a/frame/util/bli_util_tapi.h
+++ b/frame/util/bli_util_tapi.h
@@ -202,8 +202,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
 INSERT_GENTPROT_BASIC( eqm )
 
 
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
+#undef  GENTPROTRO
+#define GENTPROTRO( ctype, ch, opname ) \
 \
 BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
      ( \
@@ -212,10 +212,10 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
              bool*  is  \
      );
 
-INSERT_GENTPROT_BASIC( ltsc )
-INSERT_GENTPROT_BASIC( ltesc )
-INSERT_GENTPROT_BASIC( gtsc )
-INSERT_GENTPROT_BASIC( gtesc )
+INSERT_GENTPROTRO_BASIC( ltsc )
+INSERT_GENTPROTRO_BASIC( lesc )
+INSERT_GENTPROTRO_BASIC( gtsc )
+INSERT_GENTPROTRO_BASIC( gesc )
 
 
 #undef  GENTPROT
diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c
index b3767e6a8..0546d6786 100644
--- a/frame/util/bli_util_unb_var1.c
+++ b/frame/util/bli_util_unb_var1.c
@@ -59,26 +59,26 @@ void PASTEMAC(ch,varname) \
 	dim_t   i; \
 \
 	/* Initialize the absolute sum accumulator to zero. */ \
-	PASTEMAC(chr,set0s)( absum ); \
+	bli_tset0s( chr, absum ); \
 \
 	for ( i = 0; i < n; ++i ) \
 	{ \
 		chi1 = x + (i  )*incx; \
 \
 		/* Get the real and imaginary components of chi1. */ \
-		PASTEMAC(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
+		bli_tgets( ch,chr, *chi1, chi1_r, chi1_i ); \
 \
 		/* Replace chi1_r and chi1_i with their absolute values. */ \
 		chi1_r = bli_fabs( chi1_r ); \
 		chi1_i = bli_fabs( chi1_i ); \
 \
 		/* Accumulate the real and imaginary components into absum. */ \
-		PASTEMAC(chr,adds)( chi1_r, absum ); \
-		PASTEMAC(chr,adds)( chi1_i, absum ); \
+		bli_tadds( chr,chr,chr, chi1_r, absum ); \
+		bli_tadds( chr,chr,chr, chi1_i, absum ); \
 	} \
 \
 	/* Store the final value of absum to the output variable. */ \
-	PASTEMAC(chr,copys)( absum, *asum ); \
+	bli_tcopys( chr,chr, absum, *asum ); \
 }
 
 INSERT_GENTFUNCR_BASIC( asumv_unb_var1 )
@@ -245,21 +245,21 @@ void PASTEMAC(ch,varname) \
 	dim_t   i; \
 \
 	/* Initialize the absolute sum accumulator to zero. */ \
-	PASTEMAC(chr,set0s)( absum ); \
+	bli_tset0s( chr, absum ); \
 \
 	for ( i = 0; i < n; ++i ) \
 	{ \
 		chi1 = x + (i  )*incx; \
 \
 		/* Compute the absolute value (or complex magnitude) of chi1. */ \
-		PASTEMAC(ch,chr,abval2s)( *chi1, abs_chi1 ); \
+		bli_tabval2s( ch,chr,chr, *chi1, abs_chi1 ); \
 \
 		/* Accumulate the absolute value of chi1 into absum. */ \
-		PASTEMAC(chr,adds)( abs_chi1, absum ); \
+		bli_tadds( chr,chr,chr, abs_chi1, absum ); \
 	} \
 \
 	/* Store final value of absum to the output variable. */ \
-	PASTEMAC(chr,copys)( absum, *norm ); \
+	bli_tcopys( chr,chr, absum, *norm ); \
 }
 
 INSERT_GENTFUNCR_BASIC( norm1v_unb_var1 )
@@ -284,8 +284,8 @@ void PASTEMAC(ch,varname) \
 	ctype_r  sqrt_sumsq; \
 \
 	/* Initialize scale and sumsq to begin the summation. */ \
-	PASTEMAC(chr,copys)( *zero, scale ); \
-	PASTEMAC(chr,copys)( *one,  sumsq ); \
+	bli_tcopys( chr,chr, *zero, scale ); \
+	bli_tcopys( chr,chr, *one,  sumsq ); \
 \
 	/* Compute the sum of the squares of the vector. */ \
 	PASTEMAC(ch,kername) \
@@ -299,11 +299,11 @@ void PASTEMAC(ch,varname) \
 	); \
 \
 	/* Compute: norm = scale * sqrt( sumsq ) */ \
-	PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
-	PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
+	bli_tsqrt2s( chr,chr,chr, sumsq, sqrt_sumsq ); \
+	bli_tscals( chr,chr,chr, scale, sqrt_sumsq ); \
 \
 	/* Store the final value to the output variable. */ \
-	PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
+	bli_tcopys( chr,chr, sqrt_sumsq, *norm ); \
 }
 
 //INSERT_GENTFUNCR_BASIC( normfv_unb_var1, sumsqv_unb_var1 )
@@ -337,8 +337,8 @@ void PASTEMAC(ch,varname) \
 	ctype_r  sqrt_sumsq; \
 \
 	/* Initialize scale and sumsq to begin the summation. */ \
-	PASTEMAC(chr,copys)( *zero, scale ); \
-	PASTEMAC(chr,copys)( *one,  sumsq ); \
+	bli_tcopys( chr,chr, *zero, scale ); \
+	bli_tcopys( chr,chr, *one,  sumsq ); \
 \
 	/* An optimization: first try to use dotv to compute the sum of
 	   the squares of the vector. If no floating-point exceptions
@@ -368,13 +368,13 @@ void PASTEMAC(ch,varname) \
 		  rntm  \
 		); \
 \
-		PASTEMAC(ch,chr,copys)( sumsqc, sumsq ); \
+		bli_tcopys( ch,chr, sumsqc, sumsq ); \
 \
 		f_exp_raised = fetestexcept( FE_OVERFLOW | FE_INVALID );\
 \
 		if ( !f_exp_raised ) \
 		{ \
-		    PASTEMAC(chr,sqrt2s)( sumsq, *norm ); \
+		    bli_tsqrt2s( chr,chr,chr, sumsq, *norm ); \
 		    return; \
 		} \
 	} \
@@ -391,11 +391,11 @@ void PASTEMAC(ch,varname) \
 	); \
 \
 	/* Compute: norm = scale * sqrt( sumsq ) */ \
-	PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
-	PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
+	bli_tsqrt2s( chr,chr,chr, sumsq, sqrt_sumsq ); \
+	bli_tscals( chr,chr,chr, scale, sqrt_sumsq ); \
 \
 	/* Store the final value to the output variable. */ \
-	PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
+	bli_tcopys( chr,chr, sqrt_sumsq, *norm ); \
 }
 #else
 #define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
@@ -416,8 +416,8 @@ void PASTEMAC(ch,varname) \
 	ctype_r  sqrt_sumsq; \
 \
 	/* Initialize scale and sumsq to begin the summation. */ \
-	PASTEMAC(chr,copys)( *zero, scale ); \
-	PASTEMAC(chr,copys)( *one,  sumsq ); \
+	bli_tcopys( chr,chr, *zero, scale ); \
+	bli_tcopys( chr,chr, *one,  sumsq ); \
 \
 	/* Compute the sum of the squares of the vector. */ \
 \
@@ -432,11 +432,11 @@ void PASTEMAC(ch,varname) \
 	); \
 \
 	/* Compute: norm = scale * sqrt( sumsq ) */ \
-	PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
-	PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
+	bli_tsqrt2s( chr,chr,chr, sumsq, sqrt_sumsq ); \
+	bli_tscals( chr,chr,chr, scale, sqrt_sumsq ); \
 \
 	/* Store the final value to the output variable. */ \
-	PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
+	bli_tcopys( chr,chr, sqrt_sumsq, *norm ); \
 }
 #endif
 GENTFUNCR( float,   float,  s, s, normfv_unb_var1, sumsqv_unb_var1 )
@@ -461,28 +461,28 @@ void PASTEMAC(ch,varname) \
 	dim_t   i; \
 \
 	/* Initialize the maximum absolute value to zero. */ \
-	PASTEMAC(chr,set0s)( abs_chi1_max ); \
+	bli_tset0s( chr, abs_chi1_max ); \
 \
 	for ( i = 0; i < n; ++i ) \
 	{ \
 		chi1 = x + (i  )*incx; \
 \
 		/* Compute the absolute value (or complex magnitude) of chi1. */ \
-		PASTEMAC(ch,chr,abval2s)( *chi1, abs_chi1 ); \
+		bli_tabval2s( ch,chr,chr, *chi1, abs_chi1 ); \
 \
 		/* If the absolute value of the current element exceeds that of
 		   the previous largest, save it and its index. If NaN is
 		   encountered, then treat it the same as if it were a valid
 		   value that was larger than any previously seen. This
 		   behavior mimics that of LAPACK's ?lange(). */ \
-		if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
+		if ( abs_chi1_max < abs_chi1 || PASTEMAC(chr,isnan)( abs_chi1 ) ) \
 		{ \
-			PASTEMAC(chr,copys)( abs_chi1, abs_chi1_max ); \
+			bli_tcopys( chr,chr, abs_chi1, abs_chi1_max ); \
 		} \
 	} \
 \
 	/* Store the final value to the output variable. */ \
-	PASTEMAC(chr,copys)( abs_chi1_max, *norm ); \
+	bli_tcopys( chr,chr, abs_chi1_max, *norm ); \
 }
 
 INSERT_GENTFUNCR_BASIC( normiv_unb_var1 )
@@ -520,12 +520,12 @@ void PASTEMAC(ch,varname) \
 	dim_t   ij0, n_shift; \
 \
 	/* Initialize the maximum absolute column sum to zero. */ \
-	PASTEMAC(chr,set0s)( absum_max ); \
+	bli_tset0s( chr, absum_max ); \
 \
 	/* If either dimension is zero, return with absum_max equal to zero. */ \
 	if ( bli_zero_dim2( m, n ) ) \
 	{ \
-		PASTEMAC(chr,copys)( absum_max, *norm ); \
+		bli_tcopys( chr,chr, absum_max, *norm ); \
 		return; \
 	} \
 \
@@ -541,7 +541,7 @@ void PASTEMAC(ch,varname) \
 	/* If the matrix is zeros, return with absum_max equal to zero. */ \
 	if ( bli_is_zeros( uplox_eff ) ) \
 	{ \
-		PASTEMAC(chr,copys)( absum_max, *norm ); \
+		bli_tcopys( chr,chr, absum_max, *norm ); \
 		return; \
 	} \
 \
@@ -567,9 +567,9 @@ void PASTEMAC(ch,varname) \
 \
 			/* If absum_j is greater than the previous maximum value,
 			   then save it. */ \
-			if ( absum_max < absum_j || bli_isnan( absum_j ) ) \
+			if ( absum_max < absum_j || PASTEMAC(chr,isnan)( absum_j ) ) \
 			{ \
-				PASTEMAC(chr,copys)( absum_j, absum_max ); \
+				bli_tcopys( chr,chr, absum_j, absum_max ); \
 			} \
 		} \
 	} \
@@ -598,14 +598,14 @@ void PASTEMAC(ch,varname) \
 \
 				/* Handle the diagonal element separately in case it's
 				   unit. */ \
-				PASTEMAC(ch,chr,abval2s)( *chi1, abval_chi1 ); \
-				PASTEMAC(chr,adds)( abval_chi1, absum_j ); \
+				bli_tabval2s( ch,chr,chr, *chi1, abval_chi1 ); \
+				bli_tadds( chr,chr,chr, abval_chi1, absum_j ); \
 \
 				/* If absum_j is greater than the previous maximum value,
 				   then save it. */ \
-				if ( absum_max < absum_j || bli_isnan( absum_j ) ) \
+				if ( absum_max < absum_j || PASTEMAC(chr,isnan)( absum_j ) ) \
 				{ \
-					PASTEMAC(chr,copys)( absum_j, absum_max ); \
+					bli_tcopys( chr,chr, absum_j, absum_max ); \
 				} \
 			} \
 		} \
@@ -633,21 +633,21 @@ void PASTEMAC(ch,varname) \
 \
 				/* Handle the diagonal element separately in case it's
 				   unit. */ \
-				PASTEMAC(ch,chr,abval2s)( *chi1, abval_chi1 ); \
-				PASTEMAC(chr,adds)( abval_chi1, absum_j ); \
+				bli_tabval2s( ch,chr,chr, *chi1, abval_chi1 ); \
+				bli_tadds( chr,chr,chr, abval_chi1, absum_j ); \
 \
 				/* If absum_j is greater than the previous maximum value,
 				   then save it. */ \
-				if ( absum_max < absum_j || bli_isnan( absum_j ) ) \
+				if ( absum_max < absum_j || PASTEMAC(chr,isnan)( absum_j ) ) \
 				{ \
-					PASTEMAC(chr,copys)( absum_j, absum_max ); \
+					bli_tcopys( chr,chr, absum_j, absum_max ); \
 				} \
 			} \
 		} \
 	} \
 \
 	/* Store final value of absum_max to the output variable. */ \
-	PASTEMAC(chr,copys)( absum_max, *norm ); \
+	bli_tcopys( chr,chr, absum_max, *norm ); \
 }
 
 INSERT_GENTFUNCR_BASIC( norm1m_unb_var1, norm1v_unb_var1 )
@@ -688,7 +688,7 @@ void PASTEMAC(ch,varname) \
 	/* Return a norm of zero if either dimension is zero. */ \
 	if ( bli_zero_dim2( m, n ) ) \
 	{ \
-		PASTEMAC(chr,set0s)( *norm ); \
+		bli_tset0s( chr, *norm ); \
 		return; \
 	} \
 \
@@ -705,13 +705,13 @@ void PASTEMAC(ch,varname) \
 	/* Check the effective uplo; if it's zeros, then our norm is zero. */ \
 	if ( bli_is_zeros( uplox_eff ) ) \
 	{ \
-		PASTEMAC(chr,set0s)( *norm ); \
+		bli_tset0s( chr, *norm ); \
 		return; \
 	} \
 \
 	/* Initialize scale and sumsq to begin the summation. */ \
-	PASTEMAC(chr,copys)( *zero_r, scale ); \
-	PASTEMAC(chr,copys)( *one_r,  sumsq ); \
+	bli_tcopys( chr,chr, *zero_r, scale ); \
+	bli_tcopys( chr,chr, *one_r,  sumsq ); \
 \
 	/* Handle dense and upper/lower storage cases separately. */ \
 	if ( bli_is_dense( uplox_eff ) ) \
@@ -810,11 +810,11 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	/* Compute: norm = scale * sqrt( sumsq ) */ \
-	PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
-	PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
+	bli_tsqrt2s( chr,chr,chr, sumsq, sqrt_sumsq ); \
+	bli_tscals( chr,chr,chr, scale, sqrt_sumsq ); \
 \
 	/* Store the final value to the output variable. */ \
-	PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
+	bli_tcopys( chr,chr, sqrt_sumsq, *norm ); \
 }
 
 INSERT_GENTFUNCR_BASIC( normfm_unb_var1, sumsqv_unb_var1 )
@@ -880,7 +880,7 @@ void PASTEMAC(ch,varname) \
 \
 	for ( i = 0; i < n; ++i ) \
 	{ \
-		PASTEMAC(ch,randmac)( *chi1 ); \
+		PASTEMAC(t,randmac)( ch, *chi1 ); \
 \
 		chi1 += incx; \
 	} \
@@ -954,9 +954,9 @@ void PASTEMAC(ch,varname) \
 	{ \
 		max_m_n = bli_max( m, n ); \
 \
-		PASTEMAC(d,ch,sets)( max_m_n, 0.0, omega ); \
-		PASTEMAC(ch,copys)( *one, beta ); \
-		PASTEMAC(ch,invscals)( omega, beta ); \
+		bli_tsets( d,ch, max_m_n, 0.0, omega ); \
+		bli_tcopys( ch,ch, *one, beta ); \
+		bli_tinvscals( ch,ch,ch, omega, beta ); \
 \
 		if ( bli_is_upper( uplox_eff ) ) \
 		{ \
@@ -981,8 +981,8 @@ void PASTEMAC(ch,varname) \
 				( void )chi1; \
 				/* We want positive diagonal elements between 1 and 2. */ \
 /*
-				PASTEMAC(ch,abval2s)( *chi1, *chi1 ); \
-				PASTEMAC(ch,adds)( *one, *chi1 ); \
+				bli_tabval2s( ch,ch,ch, *chi1, *chi1 ); \
+				bli_tadds( ch,ch,ch, *one, *chi1 ); \
 */ \
 \
 				/* Scale the super-diagonal elements by 1/max(m,n). */ \
@@ -1022,8 +1022,8 @@ void PASTEMAC(ch,varname) \
 				( void )chi1; \
 				/* We want positive diagonal elements between 1 and 2. */ \
 /*
-				PASTEMAC(ch,abval2s)( *chi1, *chi1 ); \
-				PASTEMAC(ch,adds)( *one, *chi1 ); \
+				bli_tabval2s( ch,ch,ch, *chi1, *chi1 ); \
+				bli_tadds( ch,ch,ch, *one, *chi1 ); \
 */ \
 \
 				/* Scale the sub-diagonal elements by 1/max(m,n). */ \
@@ -1075,50 +1075,50 @@ void PASTEMAC(ch,varname) \
 	   the Frobenius norm in netlib LAPACK's ?lassq(). */ \
 \
 	/* Copy scale and sumsq to local variables. */ \
-	PASTEMAC(chr,copys)( *scale, scale_r ); \
-	PASTEMAC(chr,copys)( *sumsq, sumsq_r ); \
+	bli_tcopys( chr,chr, *scale, scale_r ); \
+	bli_tcopys( chr,chr, *sumsq, sumsq_r ); \
 \
 	chi1 = x; \
 \
 	for ( i = 0; i < n; ++i ) \
 	{ \
 		/* Get the real and imaginary components of chi1. */ \
-		PASTEMAC(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
+		bli_tgets( ch,chr, *chi1, chi1_r, chi1_i ); \
 \
 		abs_chi1_r = bli_fabs( chi1_r ); \
 		abs_chi1_i = bli_fabs( chi1_i ); \
 \
-		if ( bli_isnan( abs_chi1_r ) ) \
+		if ( PASTEMAC(chr,isnan)( abs_chi1_r ) ) \
 		{ \
 			sumsq_r = abs_chi1_r; \
 			scale_r = one_r; \
 		} \
 \
-		if ( bli_isnan( abs_chi1_i ) ) \
+		if ( PASTEMAC(chr,isnan)( abs_chi1_i ) ) \
 		{ \
 			sumsq_r = abs_chi1_i; \
 			scale_r = one_r; \
 		} \
 \
-		if ( bli_isnan( sumsq_r ) ) \
+		if ( PASTEMAC(chr,isnan)( sumsq_r ) ) \
 		{ \
 			chi1 += incx; \
 			continue; \
 		} \
 \
-		if ( bli_isinf( abs_chi1_r ) ) \
+		if ( PASTEMAC(chr,isinf)( abs_chi1_r ) ) \
 		{ \
 			sumsq_r = abs_chi1_r; \
 			scale_r = one_r; \
 		} \
 \
-		if ( bli_isinf( abs_chi1_i ) ) \
+		if ( PASTEMAC(chr,isinf)( abs_chi1_i ) ) \
 		{ \
 			sumsq_r = abs_chi1_i; \
 			scale_r = one_r; \
 		} \
 \
-		if ( bli_isinf( sumsq_r ) ) \
+		if ( PASTEMAC(chr,isinf)( sumsq_r ) ) \
 		{ \
 			chi1 += incx; \
 			continue; \
@@ -1134,7 +1134,7 @@ void PASTEMAC(ch,varname) \
 				          sumsq_r * ( scale_r / abs_chi1_r ) * \
 				                    ( scale_r / abs_chi1_r );  \
 \
-				PASTEMAC(chr,copys)( abs_chi1_r, scale_r ); \
+				bli_tcopys( chr,chr, abs_chi1_r, scale_r ); \
 			} \
 			else \
 			{ \
@@ -1153,7 +1153,7 @@ void PASTEMAC(ch,varname) \
 				          sumsq_r * ( scale_r / abs_chi1_i ) * \
 				                    ( scale_r / abs_chi1_i );  \
 \
-				PASTEMAC(chr,copys)( abs_chi1_i, scale_r ); \
+				bli_tcopys( chr,chr, abs_chi1_i, scale_r ); \
 			} \
 			else \
 			{ \
@@ -1166,8 +1166,8 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	/* Store final values of scale and sumsq to output variables. */ \
-	PASTEMAC(chr,copys)( scale_r, *scale ); \
-	PASTEMAC(chr,copys)( sumsq_r, *sumsq ); \
+	bli_tcopys( chr,chr, scale_r, *scale ); \
+	bli_tcopys( chr,chr, sumsq_r, *sumsq ); \
 }
 
 INSERT_GENTFUNCR_BASIC( sumsqv_unb_var1 )
@@ -1192,10 +1192,10 @@ bool PASTEMAC(ch,opname) \
 \
 		ctype chi1c; \
 \
-		if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *chi1, chi1c ); } \
-		else                        { PASTEMAC(ch,copys)( *chi1, chi1c ); } \
+		if ( bli_is_conj( conjx ) ) { bli_tcopyjs( ch,ch, *chi1, chi1c ); } \
+		else                        { bli_tcopys( ch,ch, *chi1, chi1c ); } \
 \
-		if ( !PASTEMAC(ch,eq)( chi1c, *psi1 ) ) \
+		if ( !PASTEMAC(t,eqs)( ch,ch,ch, chi1c, *psi1 ) ) \
 			return FALSE; \
 	} \
 \
@@ -1264,10 +1264,10 @@ bool PASTEMAC(ch,opname) \
 				ctype* y11 = y1 + (i  )*incy; \
 				ctype  x11c; \
 \
-				if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \
-				else                        { PASTEMAC(ch,copys)( *x11, x11c ); } \
+				if ( bli_is_conj( conjx ) ) { bli_tcopyjs( ch,ch, *x11, x11c ); } \
+				else                        { bli_tcopys( ch,ch, *x11, x11c ); } \
 \
-				if ( !PASTEMAC(ch,eq)( x11c, *y11 ) ) \
+				if ( !PASTEMAC(t,eqs)( ch,ch,ch, x11c, *y11 ) ) \
 					return FALSE; \
 			} \
 		} \
@@ -1289,10 +1289,10 @@ bool PASTEMAC(ch,opname) \
 					ctype* y11 = y1 + (i  )*incy; \
 					ctype  x11c; \
 \
-					if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \
-					else                        { PASTEMAC(ch,copys)( *x11, x11c ); } \
+					if ( bli_is_conj( conjx ) ) { bli_tcopyjs( ch,ch, *x11, x11c ); } \
+					else                        { bli_tcopys( ch,ch, *x11, x11c ); } \
 \
-					if ( !PASTEMAC(ch,eq)( x11c, *y11 ) ) \
+					if ( !PASTEMAC(t,eqs)( ch,ch,ch, x11c, *y11 ) ) \
 						return FALSE; \
 				} \
 			} \
@@ -1313,10 +1313,10 @@ bool PASTEMAC(ch,opname) \
 					ctype* y11 = y1 + (i  )*incy; \
 					ctype  x11c; \
 \
-					if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \
-					else                        { PASTEMAC(ch,copys)( *x11, x11c ); } \
+					if ( bli_is_conj( conjx ) ) { bli_tcopyjs( ch,ch, *x11, x11c ); } \
+					else                        { bli_tcopys( ch,ch, *x11, x11c ); } \
 \
-					if ( !PASTEMAC(ch,eq)( x11c, *y11 ) ) \
+					if ( !PASTEMAC(t,eqs)( ch,ch,ch, x11c, *y11 ) ) \
 						return FALSE; \
 				} \
 			} \
@@ -1352,7 +1352,7 @@ void PASTEMAC(ch,opname) \
 \
 	for ( dim_t i = 0; i < n; ++i ) \
 	{ \
-		PASTEMAC(ch,fprints)( file, format, *chi1 ); \
+		bli_tfprints( ch, file, format, *chi1 ); \
 		fprintf( file, "\n" ); \
 \
 		chi1 += incx; \
@@ -1390,7 +1390,7 @@ void PASTEMAC(ch,opname) \
 		{ \
 			const ctype* chi1 = (( ctype* ) x) + i*rs_x + j*cs_x; \
 \
-			PASTEMAC(ch,fprints)( file, format, *chi1 ); \
+			bli_tfprints( ch, file, format, *chi1 ); \
 			fprintf( file, " " ); \
 		} \
 \
diff --git a/ref_kernels/1/bli_addv_ref.c b/ref_kernels/1/bli_addv_ref.c
index c0ef4bda7..7b30aca1a 100644
--- a/ref_kernels/1/bli_addv_ref.c
+++ b/ref_kernels/1/bli_addv_ref.c
@@ -58,14 +58,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,addjs)( x[i], y[i] ); \
+				bli_taddjs( ch,ch,ch, x[i], y[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,addjs)( *x, *y ); \
+				bli_taddjs( ch,ch,ch, *x, *y ); \
 \
 				x += incx; \
 				y += incy; \
@@ -79,14 +79,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,adds)( x[i], y[i] ); \
+				bli_tadds( ch,ch,ch, x[i], y[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,adds)( *x, *y ); \
+				bli_tadds( ch,ch,ch, *x, *y ); \
 \
 				x += incx; \
 				y += incy; \
diff --git a/ref_kernels/1/bli_amaxv_ref.c b/ref_kernels/1/bli_amaxv_ref.c
index 87ef63225..47c012a2a 100644
--- a/ref_kernels/1/bli_amaxv_ref.c
+++ b/ref_kernels/1/bli_amaxv_ref.c
@@ -64,17 +64,17 @@ void PASTEMAC(ch,opname,arch,suf) \
 	   the behavior of netlib BLAS's i?amax() routines. */ \
 	if ( bli_zero_dim1( n ) ) \
 	{ \
-		PASTEMAC(i,copys)( *zero_i, *index ); \
+		bli_tcopys( i,i, *zero_i, *index ); \
 		return; \
 	} \
 \
 	/* Initialize the index of the maximum absolute value to zero. */ \
-	PASTEMAC(i,copys)( *zero_i, i_max_l ); \
+	bli_tcopys( i,i, *zero_i, i_max_l ); \
 \
 	/* Initialize the maximum absolute value search candidate with
 	   -1, which is guaranteed to be less than all values we will
 	   compute. */ \
-	PASTEMAC(chr,copys)( *minus_one, abs_chi1_max ); \
+	bli_tcopys( chr,chr, *minus_one, abs_chi1_max ); \
 \
 	if ( incx == 1 ) \
 	{ \
@@ -83,23 +83,23 @@ void PASTEMAC(ch,opname,arch,suf) \
 		for ( dim_t i = 0; i < n; ++i ) \
 		{ \
 			/* Get the real and imaginary components of chi1. */ \
-			PASTEMAC(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
+			bli_tgets( ch,chr, *chi1, chi1_r, chi1_i ); \
 \
 			/* Replace chi1_r and chi1_i with their absolute values. */ \
-			PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \
-			PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \
+			bli_tabval2s( chr,chr,chr, chi1_r, chi1_r ); \
+			bli_tabval2s( chr,chr,chr, chi1_i, chi1_i ); \
 \
 			/* Add the real and imaginary absolute values together. */ \
-			PASTEMAC(chr,set0s)( abs_chi1 ); \
-			PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \
-			PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \
+			bli_tset0s( chr, abs_chi1 ); \
+			bli_tadds( chr,chr,chr, chi1_r, abs_chi1 ); \
+			bli_tadds( chr,chr,chr, chi1_i, abs_chi1 ); \
 \
 			/* If the absolute value of the current element exceeds that of
 			   the previous largest, save it and its index. If NaN is
 			   encountered, then treat it the same as if it were a valid
 			   value that was smaller than any previously seen. This
 			   behavior mimics that of LAPACK's ?lange(). */ \
-			if ( abs_chi1_max < abs_chi1 || ( bli_isnan( abs_chi1 ) && !bli_isnan( abs_chi1_max ) ) ) \
+			if ( abs_chi1_max < abs_chi1 || ( PASTEMAC(chr,isnan)( abs_chi1 ) && !PASTEMAC(chr,isnan)( abs_chi1_max ) ) ) \
 			{ \
 				abs_chi1_max = abs_chi1; \
 				i_max_l      = i; \
@@ -115,23 +115,23 @@ void PASTEMAC(ch,opname,arch,suf) \
 			const ctype* restrict chi1 = x + (i  )*incx; \
 \
 			/* Get the real and imaginary components of chi1. */ \
-			PASTEMAC(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
+			bli_tgets( ch,chr, *chi1, chi1_r, chi1_i ); \
 \
 			/* Replace chi1_r and chi1_i with their absolute values. */ \
-			PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \
-			PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \
+			bli_tabval2s( chr,chr,chr, chi1_r, chi1_r ); \
+			bli_tabval2s( chr,chr,chr, chi1_i, chi1_i ); \
 \
 			/* Add the real and imaginary absolute values together. */ \
-			PASTEMAC(chr,set0s)( abs_chi1 ); \
-			PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \
-			PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \
+			bli_tset0s( chr, abs_chi1 ); \
+			bli_tadds( chr,chr,chr, chi1_r, abs_chi1 ); \
+			bli_tadds( chr,chr,chr, chi1_i, abs_chi1 ); \
 \
 			/* If the absolute value of the current element exceeds that of
 			   the previous largest, save it and its index. If NaN is
 			   encountered, then treat it the same as if it were a valid
 			   value that was smaller than any previously seen. This
 			   behavior mimics that of LAPACK's ?lange(). */ \
-			if ( abs_chi1_max < abs_chi1 || ( bli_isnan( abs_chi1 ) && !bli_isnan( abs_chi1_max ) ) ) \
+			if ( abs_chi1_max < abs_chi1 || ( PASTEMAC(chr,isnan)( abs_chi1 ) && !PASTEMAC(chr,isnan)( abs_chi1_max ) ) ) \
 			{ \
 				abs_chi1_max = abs_chi1; \
 				i_max_l      = i; \
@@ -140,7 +140,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 	} \
 \
 	/* Store the final index to the output variable. */ \
-	PASTEMAC(i,copys)( i_max_l, *index ); \
+	bli_tcopys( i,i, i_max_l, *index ); \
 }
 
 INSERT_GENTFUNCR_BASIC( amaxv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
diff --git a/ref_kernels/1/bli_axpbyv_ref.c b/ref_kernels/1/bli_axpbyv_ref.c
index 1c265c819..d23be2018 100644
--- a/ref_kernels/1/bli_axpbyv_ref.c
+++ b/ref_kernels/1/bli_axpbyv_ref.c
@@ -55,9 +55,9 @@ void PASTEMAC(ch,opname,arch,suf) \
 	const ctype* beta  = beta0; \
 	      ctype* y     = y0; \
 \
-	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
+	if ( bli_teq0s( ch, *alpha ) ) \
 	{ \
-		if ( PASTEMAC(ch,eq0)( *beta ) ) \
+		if ( bli_teq0s( ch, *beta ) ) \
 		{ \
 			/* If alpha is zero and beta is zero, set to zero. */ \
 \
@@ -77,7 +77,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 			); \
 			return; \
 		} \
-		else if ( PASTEMAC(ch,eq1)( *beta ) ) \
+		else if ( bli_teq1s( ch, *beta ) ) \
 		{ \
 			/* If alpha is zero and beta is one, return. */ \
 			return; \
@@ -102,9 +102,9 @@ void PASTEMAC(ch,opname,arch,suf) \
 		} \
 \
 	} \
-	else if ( PASTEMAC(ch,eq1)( *alpha ) ) \
+	else if ( bli_teq1s( ch, *alpha ) ) \
 	{ \
-		if ( PASTEMAC(ch,eq0)( *beta ) ) \
+		if ( bli_teq0s( ch, *beta ) ) \
 		{ \
 			/* If alpha is one and beta is zero, use copyv. */ \
 \
@@ -122,7 +122,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 			); \
 			return; \
 		} \
-		else if ( PASTEMAC(ch,eq1)( *beta ) ) \
+		else if ( bli_teq1s( ch, *beta ) ) \
 		{ \
 			/* If alpha is one and beta is one, use addv. */ \
 \
@@ -162,7 +162,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 	} \
 	else \
 	{ \
-		if ( PASTEMAC(ch,eq0)( *beta ) ) \
+		if ( bli_teq0s( ch, *beta ) ) \
 		{ \
 			/* If alpha is something else and beta is zero, use scal2v. */ \
 \
@@ -181,7 +181,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 			); \
 			return; \
 		} \
-		else if ( PASTEMAC(ch,eq1)( *beta ) ) \
+		else if ( bli_teq1s( ch, *beta ) ) \
 		{ \
 			/* If alpha is something else and beta is one, use axpyv. */ \
 \
@@ -211,14 +211,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,axpbyjs)( *alpha, x[i], *beta, y[i] ); \
+				bli_taxpbyjs( ch,ch,ch,ch,ch, *alpha, x[i], *beta, y[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,axpbyjs)( *alpha, *x, *beta, *y ); \
+				bli_taxpbyjs( ch,ch,ch,ch,ch, *alpha, *x, *beta, *y ); \
 \
 				x += incx; \
 				y += incy; \
@@ -232,14 +232,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,axpbys)( *alpha, x[i], *beta, y[i] ); \
+				bli_taxpbys( ch,ch,ch,ch,ch, *alpha, x[i], *beta, y[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,axpbys)( *alpha, *x, *beta, *y ); \
+				bli_taxpbys( ch,ch,ch,ch,ch, *alpha, *x, *beta, *y ); \
 \
 				x += incx; \
 				y += incy; \
diff --git a/ref_kernels/1/bli_axpyv_ref.c b/ref_kernels/1/bli_axpyv_ref.c
index f9ca0fb9d..03f75f585 100644
--- a/ref_kernels/1/bli_axpyv_ref.c
+++ b/ref_kernels/1/bli_axpyv_ref.c
@@ -54,10 +54,10 @@ void PASTEMAC(ch,opname,arch,suf) \
 	      ctype* y     = y0; \
 \
 	/* If alpha is zero, return. */ \
-	if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \
+	if ( bli_teq0s( ch, *alpha ) ) return; \
 \
 	/* If alpha is one, use addv. */ \
-	if ( PASTEMAC(ch,eq1)( *alpha ) ) \
+	if ( bli_teq1s( ch, *alpha ) ) \
 	{ \
 		/* Query the context for the kernel function pointer. */ \
 		const num_t dt     = PASTEMAC(ch,type); \
@@ -81,14 +81,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,axpyjs)( *alpha, x[i], y[i] ); \
+				bli_taxpyjs( ch,ch,ch,ch, *alpha, x[i], y[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,axpyjs)( *alpha, *x, *y ); \
+				bli_taxpyjs( ch,ch,ch,ch, *alpha, *x, *y ); \
 \
 				x += incx; \
 				y += incy; \
@@ -102,14 +102,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,axpys)( *alpha, x[i], y[i] ); \
+				bli_taxpys( ch,ch,ch,ch, *alpha, x[i], y[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,axpys)( *alpha, *x, *y ); \
+				bli_taxpys( ch,ch,ch,ch, *alpha, *x, *y ); \
 \
 				x += incx; \
 				y += incy; \
diff --git a/ref_kernels/1/bli_copyv_ref.c b/ref_kernels/1/bli_copyv_ref.c
index 0f35f5167..634ab2caa 100644
--- a/ref_kernels/1/bli_copyv_ref.c
+++ b/ref_kernels/1/bli_copyv_ref.c
@@ -58,14 +58,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,copyjs)( x[i], y[i] ); \
+				bli_tcopyjs( ch,ch, x[i], y[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,copyjs)( *x, *y ); \
+				bli_tcopyjs( ch,ch, *x, *y ); \
 \
 				x += incx; \
 				y += incy; \
@@ -79,14 +79,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,copys)( x[i], y[i] ); \
+				bli_tcopys( ch,ch, x[i], y[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,copys)( *x, *y ); \
+				bli_tcopys( ch,ch, *x, *y ); \
 \
 				x += incx; \
 				y += incy; \
diff --git a/ref_kernels/1/bli_dotv_ref.c b/ref_kernels/1/bli_dotv_ref.c
index 18a195ca2..1fd07461c 100644
--- a/ref_kernels/1/bli_dotv_ref.c
+++ b/ref_kernels/1/bli_dotv_ref.c
@@ -56,11 +56,11 @@ void PASTEMAC(ch,opname,arch,suf) \
 \
 	if ( bli_zero_dim1( n ) ) \
 	{ \
-		PASTEMAC(ch,set0s)( *rho ); \
+		bli_tset0s( ch, *rho ); \
 		return; \
 	} \
 \
-	PASTEMAC(ch,set0s)( dotxy ); \
+	bli_tset0s( ch, dotxy ); \
 \
 	conj_t conjx_use = conjx; \
 \
@@ -77,14 +77,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,dotjs)( x[i], y[i], dotxy ); \
+				bli_tdotjs( ch,ch,ch,ch, x[i], y[i], dotxy ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,dotjs)( *x, *y, dotxy ); \
+				bli_tdotjs( ch,ch,ch,ch, *x, *y, dotxy ); \
 \
 				x += incx; \
 				y += incy; \
@@ -98,14 +98,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,dots)( x[i], y[i], dotxy ); \
+				bli_tdots( ch,ch,ch,ch, x[i], y[i], dotxy ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,dots)( *x, *y, dotxy ); \
+				bli_tdots( ch,ch,ch,ch, *x, *y, dotxy ); \
 \
 				x += incx; \
 				y += incy; \
@@ -114,9 +114,9 @@ void PASTEMAC(ch,opname,arch,suf) \
 	} \
 \
 	if ( bli_is_conj( conjy ) ) \
-		PASTEMAC(ch,conjs)( dotxy ); \
+		bli_tconjs( ch, dotxy ); \
 \
-	PASTEMAC(ch,copys)( dotxy, *rho ); \
+	bli_tcopys( ch,ch, dotxy, *rho ); \
 }
 
 INSERT_GENTFUNC_BASIC( dotv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
diff --git a/ref_kernels/1/bli_dotxv_ref.c b/ref_kernels/1/bli_dotxv_ref.c
index 8fe116001..41961bb9b 100644
--- a/ref_kernels/1/bli_dotxv_ref.c
+++ b/ref_kernels/1/bli_dotxv_ref.c
@@ -59,19 +59,19 @@ void PASTEMAC(ch,opname,arch,suf) \
 	ctype dotxy; \
 \
 	/* If beta is zero, clear rho. Otherwise, scale by beta. */ \
-	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	if ( bli_teq0s( ch, *beta ) ) \
 	{ \
-		PASTEMAC(ch,set0s)( *rho ); \
+		bli_tset0s( ch, *rho ); \
 	} \
 	else \
 	{ \
-		PASTEMAC(ch,scals)( *beta, *rho ); \
+		bli_tscals( ch,ch,ch, *beta, *rho ); \
 	} \
 \
 	/* If the vectors are empty or if alpha is zero, return early. */ \
-	if ( bli_zero_dim1( n ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
+	if ( bli_zero_dim1( n ) || bli_teq0s( ch, *alpha ) ) return; \
 \
-	PASTEMAC(ch,set0s)( dotxy ); \
+	bli_tset0s( ch, dotxy ); \
 \
 	/* If y must be conjugated, we do so indirectly by first toggling the
 	   effective conjugation of x and then conjugating the resulting dot
@@ -88,14 +88,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,dotjs)( x[i], y[i], dotxy ); \
+				bli_tdotjs( ch,ch,ch,ch, x[i], y[i], dotxy ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,dotjs)( *x, *y, dotxy ); \
+				bli_tdotjs( ch,ch,ch,ch, *x, *y, dotxy ); \
 \
 				x += incx; \
 				y += incy; \
@@ -109,14 +109,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,dots)( x[i], y[i], dotxy ); \
+				bli_tdots( ch,ch,ch,ch, x[i], y[i], dotxy ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,dots)( *x, *y, dotxy ); \
+				bli_tdots( ch,ch,ch,ch, *x, *y, dotxy ); \
 \
 				x += incx; \
 				y += incy; \
@@ -125,9 +125,9 @@ void PASTEMAC(ch,opname,arch,suf) \
 	} \
 \
 	if ( bli_is_conj( conjy ) ) \
-		PASTEMAC(ch,conjs)( dotxy ); \
+		bli_tconjs( ch, dotxy ); \
 \
-	PASTEMAC(ch,axpys)( *alpha, dotxy, *rho ); \
+	bli_taxpys( ch,ch,ch,ch, *alpha, dotxy, *rho ); \
 }
 
 INSERT_GENTFUNC_BASIC( dotxv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
diff --git a/ref_kernels/1/bli_invertv_ref.c b/ref_kernels/1/bli_invertv_ref.c
index 1cea1c61a..a70fcfd99 100644
--- a/ref_kernels/1/bli_invertv_ref.c
+++ b/ref_kernels/1/bli_invertv_ref.c
@@ -53,14 +53,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 		PRAGMA_SIMD \
 		for ( dim_t i = 0; i < n; ++i ) \
 		{ \
-			PASTEMAC(ch,inverts)( x[i] ); \
+			bli_tinverts( ch,ch, x[i] ); \
 		} \
 	} \
 	else \
 	{ \
 		for ( dim_t i = 0; i < n; ++i ) \
 		{ \
-			PASTEMAC(ch,inverts)( *x ); \
+			bli_tinverts( ch,ch, *x ); \
 \
 			x += incx; \
 		} \
diff --git a/ref_kernels/1/bli_invscalv_ref.c b/ref_kernels/1/bli_invscalv_ref.c
index 914c89174..3eea27127 100644
--- a/ref_kernels/1/bli_invscalv_ref.c
+++ b/ref_kernels/1/bli_invscalv_ref.c
@@ -52,28 +52,28 @@ void PASTEMAC(ch,opname,arch,suf) \
 	      ctype* x     = x0; \
 \
 	/* If alpha is one, return. */ \
-	if ( PASTEMAC(ch,eq1)( *alpha ) ) return; \
+	if ( bli_teq1s( ch, *alpha ) ) return; \
 \
 	/* If alpha is zero, inv(alpha) is undefined. Bad user! Return early. */ \
-	if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \
+	if ( bli_teq0s( ch, *alpha ) ) return; \
 \
 	ctype alpha_conj; \
 \
-	PASTEMAC(ch,copycjs)( conjalpha, *alpha, alpha_conj ); \
+	bli_tcopycjs( ch,ch, conjalpha, *alpha, alpha_conj ); \
 \
 	if ( incx == 1 ) \
 	{ \
 		PRAGMA_SIMD \
 		for ( dim_t i = 0; i < n; ++i ) \
 		{ \
-			PASTEMAC(ch,invscals)( alpha_conj, x[i] ); \
+			bli_tinvscals( ch,ch,ch, alpha_conj, x[i] ); \
 		} \
 	} \
 	else \
 	{ \
 		for ( dim_t i = 0; i < n; ++i ) \
 		{ \
-			PASTEMAC(ch,invscals)( alpha_conj, *x ); \
+			bli_tinvscals( ch,ch,ch, alpha_conj, *x ); \
 \
 			x += incx; \
 		} \
diff --git a/ref_kernels/1/bli_scal2v_ref.c b/ref_kernels/1/bli_scal2v_ref.c
index 4b96f5659..242272565 100644
--- a/ref_kernels/1/bli_scal2v_ref.c
+++ b/ref_kernels/1/bli_scal2v_ref.c
@@ -53,7 +53,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 	const ctype* x     = x0; \
 	      ctype* y     = y0; \
 \
-	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
+	if ( bli_teq0s( ch, *alpha ) ) \
 	{ \
 		/* If alpha is zero, use setv. */ \
 \
@@ -73,7 +73,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 		); \
 		return; \
 	} \
-	else if ( PASTEMAC(ch,eq1)( *alpha ) ) \
+	else if ( bli_teq1s( ch, *alpha ) ) \
 	{ \
 		/* If alpha is one, use copyv. */ \
 \
@@ -99,14 +99,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,scal2js)( *alpha, x[i], y[i] ); \
+				bli_tscal2js( ch,ch,ch,ch, *alpha, x[i], y[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,scal2js)( *alpha, *x, *y ); \
+				bli_tscal2js( ch,ch,ch,ch, *alpha, *x, *y ); \
 \
 				x += incx; \
 				y += incy; \
@@ -120,14 +120,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,scal2s)( *alpha, x[i], y[i] ); \
+				bli_tscal2s( ch,ch,ch,ch, *alpha, x[i], y[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,scal2s)( *alpha, *x, *y ); \
+				bli_tscal2s( ch,ch,ch,ch, *alpha, *x, *y ); \
 \
 				x += incx; \
 				y += incy; \
diff --git a/ref_kernels/1/bli_scalv_ref.c b/ref_kernels/1/bli_scalv_ref.c
index 8e9a1ec98..5d8d39636 100644
--- a/ref_kernels/1/bli_scalv_ref.c
+++ b/ref_kernels/1/bli_scalv_ref.c
@@ -52,10 +52,10 @@ void PASTEMAC(ch,opname,arch,suf) \
 	      ctype* x     = x0; \
 \
 	/* If alpha is one, return. */ \
-	if ( PASTEMAC(ch,eq1)( *alpha ) ) return; \
+	if ( bli_teq1s( ch, *alpha ) ) return; \
 \
 	/* If alpha is zero, use setv. */ \
-	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
+	if ( bli_teq0s( ch, *alpha ) ) \
 	{ \
 		const ctype* zero = PASTEMAC(ch,0); \
 \
@@ -76,21 +76,21 @@ void PASTEMAC(ch,opname,arch,suf) \
 \
 	ctype alpha_conj; \
 \
-	PASTEMAC(ch,copycjs)( conjalpha, *alpha, alpha_conj ); \
+	bli_tcopycjs( ch,ch, conjalpha, *alpha, alpha_conj ); \
 \
 	if ( incx == 1 ) \
 	{ \
 		PRAGMA_SIMD \
 		for ( dim_t i = 0; i < n; ++i ) \
 		{ \
-			PASTEMAC(ch,scals)( alpha_conj, x[i] ); \
+			bli_tscals( ch,ch,ch, alpha_conj, x[i] ); \
 		} \
 	} \
 	else \
 	{ \
 		for ( dim_t i = 0; i < n; ++i ) \
 		{ \
-			PASTEMAC(ch,scals)( alpha_conj, *x ); \
+			bli_tscals( ch,ch,ch, alpha_conj, *x ); \
 \
 			x += incx; \
 		} \
diff --git a/ref_kernels/1/bli_setv_ref.c b/ref_kernels/1/bli_setv_ref.c
index 8d945f618..197a0b73d 100644
--- a/ref_kernels/1/bli_setv_ref.c
+++ b/ref_kernels/1/bli_setv_ref.c
@@ -51,21 +51,21 @@ void PASTEMAC(ch,opname,arch,suf) \
 	const ctype* alpha = alpha0; \
 	      ctype* x     = x0; \
 \
-	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
+	if ( bli_teq0s( ch, *alpha ) ) \
 	{ \
 		if ( incx == 1 ) \
 		{ \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,set0s)( x[i] ); \
+				bli_tset0s( ch, x[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,set0s)( *x ); \
+				bli_tset0s( ch, *x ); \
 \
 				x += incx; \
 			} \
@@ -75,21 +75,21 @@ void PASTEMAC(ch,opname,arch,suf) \
 	{ \
 		ctype alpha_conj; \
 \
-		PASTEMAC(ch,copycjs)( conjalpha, *alpha, alpha_conj ); \
+		bli_tcopycjs( ch,ch, conjalpha, *alpha, alpha_conj ); \
 \
 		if ( incx == 1 ) \
 		{ \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,copys)( alpha_conj, x[i] ); \
+				bli_tcopys( ch,ch, alpha_conj, x[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,copys)( alpha_conj, *x ); \
+				bli_tcopys( ch,ch, alpha_conj, *x ); \
 \
 				x += incx; \
 			} \
diff --git a/ref_kernels/1/bli_subv_ref.c b/ref_kernels/1/bli_subv_ref.c
index d43d96033..b60a0c800 100644
--- a/ref_kernels/1/bli_subv_ref.c
+++ b/ref_kernels/1/bli_subv_ref.c
@@ -58,14 +58,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,subjs)( x[i], y[i] ); \
+				bli_tsubjs( ch,ch,ch, x[i], y[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,subjs)( *x, *y ); \
+				bli_tsubjs( ch,ch,ch, *x, *y ); \
 \
 				x += incx; \
 				y += incy; \
@@ -79,14 +79,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,subs)( x[i], y[i] ); \
+				bli_tsubs( ch,ch,ch, x[i], y[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,subs)( *x, *y ); \
+				bli_tsubs( ch,ch,ch, *x, *y ); \
 \
 				x += incx; \
 				y += incy; \
diff --git a/ref_kernels/1/bli_swapv_ref.c b/ref_kernels/1/bli_swapv_ref.c
index f01d0d09f..1739464c0 100644
--- a/ref_kernels/1/bli_swapv_ref.c
+++ b/ref_kernels/1/bli_swapv_ref.c
@@ -55,14 +55,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 		PRAGMA_SIMD \
 		for ( dim_t i = 0; i < n; ++i ) \
 		{ \
-			PASTEMAC(ch,swaps)( x[i], y[i] ); \
+			bli_tswaps( ch,ch, x[i], y[i] ); \
 		} \
 	} \
 	else \
 	{ \
 		for ( dim_t i = 0; i < n; ++i ) \
 		{ \
-			PASTEMAC(ch,swaps)( *x, *y ); \
+			bli_tswaps( ch,ch, *x, *y ); \
 \
 			x += incx; \
 			y += incy; \
diff --git a/ref_kernels/1/bli_xpbyv_ref.c b/ref_kernels/1/bli_xpbyv_ref.c
index 02c0cd14d..b7b584fb0 100644
--- a/ref_kernels/1/bli_xpbyv_ref.c
+++ b/ref_kernels/1/bli_xpbyv_ref.c
@@ -54,7 +54,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 	      ctype* y    = y0; \
 \
 	/* If beta is zero, use copyv. */ \
-	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	if ( bli_teq0s( ch, *beta ) ) \
 	{ \
 		/* Query the context for the kernel function pointer. */ \
 		const num_t  dt      = PASTEMAC(ch,type); \
@@ -71,7 +71,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 		return; \
 	} \
 	/* If alpha is one, use addv. */ \
-	else if ( PASTEMAC(ch,eq1)( *beta ) ) \
+	else if ( bli_teq1s( ch, *beta ) ) \
 	{ \
 		/* Query the context for the kernel function pointer. */ \
 		const num_t dt     = PASTEMAC(ch,type); \
@@ -95,14 +95,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,xpbyjs)( x[i], *beta, y[i] ); \
+				bli_txpbyjs( ch,ch,ch,ch, x[i], *beta, y[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,xpbyjs)( *x, *beta, *y ); \
+				bli_txpbyjs( ch,ch,ch,ch, *x, *beta, *y ); \
 \
 				x += incx; \
 				y += incy; \
@@ -116,14 +116,14 @@ void PASTEMAC(ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,xpbys)( x[i], *beta, y[i] ); \
+				bli_txpbys( ch,ch,ch,ch, x[i], *beta, y[i] ); \
 			} \
 		} \
 		else \
 		{ \
 			for ( dim_t i = 0; i < n; ++i ) \
 			{ \
-				PASTEMAC(ch,xpbys)( *x, *beta, *y ); \
+				bli_txpbys( ch,ch,ch,ch, *x, *beta, *y ); \
 \
 				x += incx; \
 				y += incy; \
diff --git a/ref_kernels/1f/bli_axpy2v_ref.c b/ref_kernels/1f/bli_axpy2v_ref.c
index 8b5b2cbbb..9f46577b0 100644
--- a/ref_kernels/1f/bli_axpy2v_ref.c
+++ b/ref_kernels/1f/bli_axpy2v_ref.c
@@ -70,8 +70,8 @@ void PASTEMAC(ch,opname,arch,suf) \
 				PRAGMA_SIMD \
 				for ( dim_t i = 0; i < n; ++i ) \
 				{ \
-					PASTEMAC(ch,axpys)( *alphax, x[i], z[i] ); \
-					PASTEMAC(ch,axpys)( *alphay, y[i], z[i] ); \
+					bli_taxpys( ch,ch,ch,ch, *alphax, x[i], z[i] ); \
+					bli_taxpys( ch,ch,ch,ch, *alphay, y[i], z[i] ); \
 				} \
 			} \
 			else /* if ( bli_is_conj( conjy ) ) */ \
@@ -79,9 +79,9 @@ void PASTEMAC(ch,opname,arch,suf) \
 				PRAGMA_SIMD \
 				for ( dim_t i = 0; i < n; ++i ) \
 				{ \
-					PASTEMAC(ch,axpys)( *alphax, x[i], z[i] ); \
-					PASTEMAC(ch,copyjs)( y[i], psic ); \
-					PASTEMAC(ch,axpys)( *alphay, psic, z[i] ); \
+					bli_taxpys( ch,ch,ch,ch, *alphax, x[i], z[i] ); \
+					bli_tcopyjs( ch,ch, y[i], psic ); \
+					bli_taxpys( ch,ch,ch,ch, *alphay, psic, z[i] ); \
 				} \
 			} \
 		} \
@@ -92,9 +92,9 @@ void PASTEMAC(ch,opname,arch,suf) \
 				PRAGMA_SIMD \
 				for ( dim_t i = 0; i < n; ++i ) \
 				{ \
-					PASTEMAC(ch,copyjs)( x[i], chic ); \
-					PASTEMAC(ch,axpys)( *alphax, chic, z[i] ); \
-					PASTEMAC(ch,axpys)( *alphay, y[i], z[i] ); \
+					bli_tcopyjs( ch,ch, x[i], chic ); \
+					bli_taxpys( ch,ch,ch,ch, *alphax, chic, z[i] ); \
+					bli_taxpys( ch,ch,ch,ch, *alphay, y[i], z[i] ); \
 				} \
 			} \
 			else /* if ( bli_is_conj( conjy ) ) */ \
@@ -102,10 +102,10 @@ void PASTEMAC(ch,opname,arch,suf) \
 				PRAGMA_SIMD \
 				for ( dim_t i = 0; i < n; ++i ) \
 				{ \
-					PASTEMAC(ch,copyjs)( x[i], chic ); \
-					PASTEMAC(ch,axpys)( *alphax, chic, z[i] ); \
-					PASTEMAC(ch,copyjs)( y[i], psic ); \
-					PASTEMAC(ch,axpys)( *alphay, psic, z[i] ); \
+					bli_tcopyjs( ch,ch, x[i], chic ); \
+					bli_taxpys( ch,ch,ch,ch, *alphax, chic, z[i] ); \
+					bli_tcopyjs( ch,ch, y[i], psic ); \
+					bli_taxpys( ch,ch,ch,ch, *alphay, psic, z[i] ); \
 				} \
 			} \
 		} \
diff --git a/ref_kernels/1f/bli_axpyf_ref.c b/ref_kernels/1f/bli_axpyf_ref.c
index 233c64fc2..f86f5f46d 100644
--- a/ref_kernels/1f/bli_axpyf_ref.c
+++ b/ref_kernels/1f/bli_axpyf_ref.c
@@ -67,13 +67,13 @@ void PASTEMAC(ch,opname,arch,suf) \
 		{ \
 			PRAGMA_SIMD \
 			for ( dim_t j = 0; j < ff; ++j ) \
-				PASTEMAC(ch,scal2js)( *alpha, x[j], ax[j] ); \
+				bli_tscal2js( ch,ch,ch,ch, *alpha, x[j], ax[j] ); \
 		} \
 		else \
 		{ \
 			PRAGMA_SIMD \
 			for ( dim_t j = 0; j < ff; ++j ) \
-				PASTEMAC(ch,scal2s)( *alpha, x[j], ax[j] ); \
+				bli_tscal2s( ch,ch,ch,ch, *alpha, x[j], ax[j] ); \
 		} \
 \
 		/* Accumulate ff separate axpyv's into y. */ \
@@ -83,7 +83,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 			for ( dim_t i = 0; i < m; ++i ) \
 			for ( dim_t j = 0; j < ff; ++j ) \
 			{ \
-				PASTEMAC(ch,axpys)( ax[j], a[i + j*lda], y[i] ); \
+				bli_taxpys( ch,ch,ch,ch, ax[j], a[i + j*lda], y[i] ); \
 			} \
 		} \
 		else \
@@ -92,7 +92,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 			for ( dim_t i = 0; i < m; ++i ) \
 			for ( dim_t j = 0; j < ff; ++j ) \
 			{ \
-				PASTEMAC(ch,axpyjs)( ax[j], a[i + j*lda], y[i] ); \
+				bli_taxpyjs( ch,ch,ch,ch, ax[j], a[i + j*lda], y[i] ); \
 			} \
 		} \
 	} \
@@ -110,8 +110,8 @@ void PASTEMAC(ch,opname,arch,suf) \
 \
 			ctype alpha_chi1; \
 \
-			PASTEMAC(ch,copycjs)( conjx, *chi1, alpha_chi1 ); \
-			PASTEMAC(ch,scals)( *alpha, alpha_chi1 ); \
+			bli_tcopycjs( ch,ch, conjx, *chi1, alpha_chi1 ); \
+			bli_tscals( ch,ch,ch, *alpha, alpha_chi1 ); \
 \
 			kfp_av \
 			( \
diff --git a/ref_kernels/1f/bli_dotaxpyv_ref.c b/ref_kernels/1f/bli_dotaxpyv_ref.c
index fe558ba3a..20b1b0e87 100644
--- a/ref_kernels/1f/bli_dotaxpyv_ref.c
+++ b/ref_kernels/1f/bli_dotaxpyv_ref.c
@@ -67,7 +67,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 			conj_t conjxt_use = conjxt; \
 			ctype  dotxy; \
 \
-			PASTEMAC(ch,set0s)( dotxy ); \
+			bli_tset0s( ch, dotxy ); \
 \
 			if ( bli_is_conj( conjy ) ) \
 				bli_toggle_conj( &conjxt_use ); \
@@ -77,8 +77,8 @@ void PASTEMAC(ch,opname,arch,suf) \
 				PRAGMA_SIMD \
 				for ( dim_t i = 0; i < m; ++i ) \
 				{ \
-					PASTEMAC(ch,dots)( x[i], y[i], dotxy ); \
-					PASTEMAC(ch,axpys)( *alpha, x[i], z[i] ); \
+					bli_tdots( ch,ch,ch,ch, x[i], y[i], dotxy ); \
+					bli_taxpys( ch,ch,ch,ch, *alpha, x[i], z[i] ); \
 				} \
 			} \
 			else /* bli_is_conj( conjxt_use ) ) */ \
@@ -86,22 +86,22 @@ void PASTEMAC(ch,opname,arch,suf) \
 				PRAGMA_SIMD \
 				for ( dim_t i = 0; i < m; ++i ) \
 				{ \
-					PASTEMAC(ch,dotjs)( x[i], y[i], dotxy ); \
-					PASTEMAC(ch,axpys)( *alpha, x[i], z[i] ); \
+					bli_tdotjs( ch,ch,ch,ch, x[i], y[i], dotxy ); \
+					bli_taxpys( ch,ch,ch,ch, *alpha, x[i], z[i] ); \
 				} \
 			} \
 \
 			if ( bli_is_conj( conjy ) ) \
-				PASTEMAC(ch,conjs)( dotxy ); \
+				bli_tconjs( ch, dotxy ); \
 \
-			PASTEMAC(ch,copys)( dotxy, *rho ); \
+			bli_tcopys( ch,ch, dotxy, *rho ); \
 		} \
 		else /* bli_is_conj( conjx ) ) */ \
 		{ \
 			conj_t conjxt_use = conjxt; \
 			ctype  dotxy; \
 \
-			PASTEMAC(ch,set0s)( dotxy ); \
+			bli_tset0s( ch, dotxy ); \
 \
 			if ( bli_is_conj( conjy ) ) \
 				bli_toggle_conj( &conjxt_use ); \
@@ -111,8 +111,8 @@ void PASTEMAC(ch,opname,arch,suf) \
 				PRAGMA_SIMD \
 				for ( dim_t i = 0; i < m; ++i ) \
 				{ \
-					PASTEMAC(ch,dots)( x[i], y[i], dotxy ); \
-					PASTEMAC(ch,axpyjs)( *alpha, x[i], z[i] ); \
+					bli_tdots( ch,ch,ch,ch, x[i], y[i], dotxy ); \
+					bli_taxpyjs( ch,ch,ch,ch, *alpha, x[i], z[i] ); \
 				} \
 			} \
 			else /* bli_is_conj( conjxt_use ) ) */ \
@@ -120,15 +120,15 @@ void PASTEMAC(ch,opname,arch,suf) \
 				PRAGMA_SIMD \
 				for ( dim_t i = 0; i < m; ++i ) \
 				{ \
-					PASTEMAC(ch,dotjs)( x[i], y[i], dotxy ); \
-					PASTEMAC(ch,axpyjs)( *alpha, x[i], z[i] ); \
+					bli_tdotjs( ch,ch,ch,ch, x[i], y[i], dotxy ); \
+					bli_taxpyjs( ch,ch,ch,ch, *alpha, x[i], z[i] ); \
 				} \
 			} \
 \
 			if ( bli_is_conj( conjy ) ) \
-				PASTEMAC(ch,conjs)( dotxy ); \
+				bli_tconjs( ch, dotxy ); \
 \
-			PASTEMAC(ch,copys)( dotxy, *rho ); \
+			bli_tcopys( ch,ch, dotxy, *rho ); \
 		} \
 	} \
 	else \
diff --git a/ref_kernels/1f/bli_dotxaxpyf_ref.c b/ref_kernels/1f/bli_dotxaxpyf_ref.c
index 6cfa5168c..83d4be89c 100644
--- a/ref_kernels/1f/bli_dotxaxpyf_ref.c
+++ b/ref_kernels/1f/bli_dotxaxpyf_ref.c
@@ -75,33 +75,33 @@ void PASTEMAC(ch,opname,arch,suf) \
 		ctype ax[ ff ]; \
 \
 		/* If beta is zero, clear y. Otherwise, scale by beta. */ \
-		if ( PASTEMAC(ch,eq0)( *beta ) ) \
+		if ( bli_teq0s( ch, *beta ) ) \
 		{ \
-			for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,set0s)( y[i] ); \
+			for ( dim_t i = 0; i < ff; ++i ) bli_tset0s( ch, y[i] ); \
 		} \
 		else \
 		{ \
-			for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,scals)( *beta, y[i] ); \
+			for ( dim_t i = 0; i < ff; ++i ) bli_tscals( ch,ch,ch, *beta, y[i] ); \
 		} \
 \
 		/* If the vectors are empty or if alpha is zero, return early. */ \
-		if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
+		if ( bli_zero_dim1( m ) || bli_teq0s( ch, *alpha ) ) return; \
 \
 		/* Initialize r vector to 0. */ \
-		for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,set0s)( r[i] ); \
+		for ( dim_t i = 0; i < ff; ++i ) bli_tset0s( ch, r[i] ); \
 \
 		/* Scale x by alpha, storing to a temporary array ax. */ \
 		if ( bli_is_conj( conjx ) ) \
 		{ \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < ff; ++i ) \
-				PASTEMAC(ch,scal2js)( *alpha, x[i], ax[i] ); \
+				bli_tscal2js( ch,ch,ch,ch, *alpha, x[i], ax[i] ); \
 		} \
 		else \
 		{ \
 			PRAGMA_SIMD \
 			for ( dim_t i = 0; i < ff; ++i ) \
-				PASTEMAC(ch,scal2s)( *alpha, x[i], ax[i] ); \
+				bli_tscal2s( ch,ch,ch,ch, *alpha, x[i], ax[i] ); \
 		} \
 \
 		/* If a must be conjugated, we do so indirectly by first toggling the
@@ -120,8 +120,8 @@ void PASTEMAC(ch,opname,arch,suf) \
 				for ( dim_t p = 0; p < m; ++p ) \
 				for ( dim_t i = 0; i < ff; ++i ) \
 				{ \
-					PASTEMAC(ch,axpys)( a[p + i*lda], w[p], r[i] ); \
-					PASTEMAC(ch,axpys)( ax[i], a[p + i*lda], z[p] ); \
+					bli_taxpys( ch,ch,ch,ch, a[p + i*lda], w[p], r[i] ); \
+					bli_taxpys( ch,ch,ch,ch, ax[i], a[p + i*lda], z[p] ); \
 				} \
 			} \
 			else \
@@ -130,8 +130,8 @@ void PASTEMAC(ch,opname,arch,suf) \
 				for ( dim_t p = 0; p < m; ++p ) \
 				for ( dim_t i = 0; i < ff; ++i ) \
 				{ \
-					PASTEMAC(ch,axpys)( a[p + i*lda], w[p], r[i] ); \
-					PASTEMAC(ch,axpyjs)( ax[i], a[p + i*lda], z[p] ); \
+					bli_taxpys( ch,ch,ch,ch, a[p + i*lda], w[p], r[i] ); \
+					bli_taxpyjs( ch,ch,ch,ch, ax[i], a[p + i*lda], z[p] ); \
 				} \
 			} \
 		} \
@@ -143,8 +143,8 @@ void PASTEMAC(ch,opname,arch,suf) \
 				for ( dim_t p = 0; p < m; ++p ) \
 				for ( dim_t i = 0; i < ff; ++i ) \
 				{ \
-					PASTEMAC(ch,axpyjs)( a[p + i*lda], w[p], r[i] ); \
-					PASTEMAC(ch,axpys)( ax[i], a[p + i*lda], z[p] ); \
+					bli_taxpyjs( ch,ch,ch,ch, a[p + i*lda], w[p], r[i] ); \
+					bli_taxpys( ch,ch,ch,ch, ax[i], a[p + i*lda], z[p] ); \
 				} \
 			} \
 			else \
@@ -153,18 +153,18 @@ void PASTEMAC(ch,opname,arch,suf) \
 				for ( dim_t p = 0; p < m; ++p ) \
 				for ( dim_t i = 0; i < ff; ++i ) \
 				{ \
-					PASTEMAC(ch,axpyjs)( a[p + i*lda], w[p], r[i] ); \
-					PASTEMAC(ch,axpyjs)( ax[i], a[p + i*lda], z[p] ); \
+					bli_taxpyjs( ch,ch,ch,ch, a[p + i*lda], w[p], r[i] ); \
+					bli_taxpyjs( ch,ch,ch,ch, ax[i], a[p + i*lda], z[p] ); \
 				} \
 			} \
 		} \
 \
 		if ( bli_is_conj( conjat ) ) \
-			for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,conjs)( r[i] ); \
+			for ( dim_t i = 0; i < ff; ++i ) bli_tconjs( ch, r[i] ); \
 \
 		for ( dim_t i = 0; i < ff; ++i ) \
 		{ \
-			PASTEMAC(ch,axpys)( *alpha, r[i], y[i] ); \
+			bli_taxpys( ch,ch,ch,ch, *alpha, r[i], y[i] ); \
 		} \
 	} \
 	else \
diff --git a/ref_kernels/1f/bli_dotxf_ref.c b/ref_kernels/1f/bli_dotxf_ref.c
index 0f4cda2b8..a3545c97e 100644
--- a/ref_kernels/1f/bli_dotxf_ref.c
+++ b/ref_kernels/1f/bli_dotxf_ref.c
@@ -63,20 +63,20 @@ void PASTEMAC(ch,opname,arch,suf) \
 		ctype r[ ff ]; \
 \
 		/* If beta is zero, clear y. Otherwise, scale by beta. */ \
-		if ( PASTEMAC(ch,eq0)( *beta ) ) \
+		if ( bli_teq0s( ch, *beta ) ) \
 		{ \
-			for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,set0s)( y[i] ); \
+			for ( dim_t i = 0; i < ff; ++i ) bli_tset0s( ch, y[i] ); \
 		} \
 		else \
 		{ \
-			for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,scals)( *beta, y[i] ); \
+			for ( dim_t i = 0; i < ff; ++i ) bli_tscals( ch,ch,ch, *beta, y[i] ); \
 		} \
 \
 		/* If the vectors are empty or if alpha is zero, return early. */ \
-		if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
+		if ( bli_zero_dim1( m ) || bli_teq0s( ch, *alpha ) ) return; \
 \
 		/* Initialize r vector to 0. */ \
-		for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,set0s)( r[i] ); \
+		for ( dim_t i = 0; i < ff; ++i ) bli_tset0s( ch, r[i] ); \
 \
 		/* If a must be conjugated, we do so indirectly by first toggling the
 		   effective conjugation of x and then conjugating the resulting dot
@@ -92,7 +92,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 			for ( dim_t p = 0; p < m; ++p ) \
 			for ( dim_t i = 0; i < ff; ++i ) \
 			{ \
-				PASTEMAC(ch,axpys)( a[p + i*lda], x[p], r[i] ); \
+				bli_taxpys( ch,ch,ch,ch, a[p + i*lda], x[p], r[i] ); \
 			} \
 		} \
 		else \
@@ -101,16 +101,16 @@ void PASTEMAC(ch,opname,arch,suf) \
 			for ( dim_t p = 0; p < m; ++p ) \
 			for ( dim_t i = 0; i < ff; ++i ) \
 			{ \
-				PASTEMAC(ch,axpyjs)( a[p + i*lda], x[p], r[i] ); \
+				bli_taxpyjs( ch,ch,ch,ch, a[p + i*lda], x[p], r[i] ); \
 			} \
 		} \
 \
 		if ( bli_is_conj( conjat ) ) \
-			for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,conjs)( r[i] ); \
+			for ( dim_t i = 0; i < ff; ++i ) bli_tconjs( ch, r[i] ); \
 \
 		for ( dim_t i = 0; i < ff; ++i ) \
 		{ \
-			PASTEMAC(ch,axpys)( *alpha, r[i], y[i] ); \
+			bli_taxpys( ch,ch,ch,ch, *alpha, r[i], y[i] ); \
 		} \
 	} \
 	else \
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
index fad987c4b..ec52e5271 100644
--- a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
+++ b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
@@ -37,43 +37,45 @@
 
 #define PACKM_SET_1E( chp_r, val_r, val_i, mnk ) \
 do { \
-	PASTEMAC(chp_r,copys)(  val_r, *(pi1_ri + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \
-	PASTEMAC(chp_r,copys)(  val_i, *(pi1_ri + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \
-	PASTEMAC(chp_r,copys)( -val_i, *(pi1_ir + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \
-	PASTEMAC(chp_r,copys)(  val_r, *(pi1_ir + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \
+	bli_tcopys( chp_r,chp_r,  val_r, *(pi1_ri + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \
+	bli_tcopys( chp_r,chp_r,  val_i, *(pi1_ri + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \
+	bli_tcopys( chp_r,chp_r, -val_i, *(pi1_ir + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \
+	bli_tcopys( chp_r,chp_r,  val_r, *(pi1_ir + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \
 } while (0)
 
 
 #define PACKM_SET_1R( chp_r, val_r, val_i, mnk ) \
 do { \
-	PASTEMAC(chp_r,copys)( val_r, *(pi1_r + mnk*cdim_bcast + d + mnk*ldp2) ); \
-	PASTEMAC(chp_r,copys)( val_i, *(pi1_i + mnk*cdim_bcast + d + mnk*ldp2) ); \
+	bli_tcopys( chp_r,chp_r, val_r, *(pi1_r + mnk*cdim_bcast + d + mnk*ldp2) ); \
+	bli_tcopys( chp_r,chp_r, val_i, *(pi1_i + mnk*cdim_bcast + d + mnk*ldp2) ); \
 } while (0)
 
 
 #define PACKM_SCAL_1E( ctypep_r, cha, chp, mn, k, op ) \
 do { \
-	ctypep_r alpha_r, alpha_i, ka_r, ka_i; \
-	PASTEMAC(cha,chp,copyris)( *(alpha1 +  mn       *inca2       + 0 + k*lda2), \
-	                            *(alpha1 +  mn       *inca2       + 1 + k*lda2), \
-	                            alpha_r, alpha_i ); \
-	PASTEMAC(chp,op)( kappa_r, kappa_i, alpha_r, alpha_i, ka_r, ka_i ); \
-	PASTEMAC(chp,copyris)(  ka_r, ka_i, *(pi1_ri + (mn*2 + 0)*cdim_bcast  + d + k*ldp2), \
-	                                    *(pi1_ri + (mn*2 + 1)*cdim_bcast  + d + k*ldp2) ); \
-	PASTEMAC(chp,copyris)( -ka_i, ka_r, *(pi1_ir + (mn*2 + 0)*cdim_bcast  + d + k*ldp2), \
-	                                    *(pi1_ir + (mn*2 + 1)*cdim_bcast  + d + k*ldp2) ); \
+	ctypep_r ka_r, ka_i; \
+	PASTEMAC(t,op)( chp,cha,chp,chp, \
+	                kappa_r, kappa_i, \
+	                *(alpha1 +  mn*inca2 + 0 + k*lda2), \
+	                *(alpha1 +  mn*inca2 + 1 + k*lda2), \
+	                ka_r, ka_i ); \
+	bli_tcopyris( chp,chp,  ka_r, ka_i, *(pi1_ri + (mn*2 + 0)*cdim_bcast + d + k*ldp2), \
+	                                    *(pi1_ri + (mn*2 + 1)*cdim_bcast + d + k*ldp2) ); \
+	bli_tcopyris( chp,chp, -ka_i, ka_r, *(pi1_ir + (mn*2 + 0)*cdim_bcast + d + k*ldp2), \
+	                                    *(pi1_ir + (mn*2 + 1)*cdim_bcast + d + k*ldp2) ); \
 } while (0)
 
 
 #define PACKM_SCAL_1R( ctypep_r, cha, chp, mn, k, op ) \
 do { \
-	ctypep_r alpha_r, alpha_i, ka_r, ka_i; \
-	PASTEMAC(cha,chp,copyris)( *(alpha1 +  mn       *inca2       + 0 + k*lda2), \
-	                            *(alpha1 +  mn       *inca2       + 1 + k*lda2), \
-	                            alpha_r, alpha_i ); \
-	PASTEMAC(chp,op)( kappa_r, kappa_i, alpha_r, alpha_i, ka_r, ka_i ); \
-	PASTEMAC(chp,copyris)( ka_r, ka_i, *(pi1_r  + mn*cdim_bcast  + d + k*ldp2), \
-	                                   *(pi1_i  + mn*cdim_bcast  + d + k*ldp2) ); \
+	ctypep_r ka_r, ka_i; \
+	PASTEMAC(t,op)( chp,cha,chp,chp, \
+	                kappa_r, kappa_i, \
+	                *(alpha1 +  mn*inca2 + 0 + k*lda2), \
+	                *(alpha1 +  mn*inca2 + 1 + k*lda2), \
+	                ka_r, ka_i ); \
+	bli_tcopyris( chp,chp, ka_r, ka_i, *(pi1_r + mn*cdim_bcast + d + k*ldp2), \
+	                                   *(pi1_i + mn*cdim_bcast + d + k*ldp2) ); \
 } while (0)
 
 
@@ -154,8 +156,9 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 	if ( bli_is_1e_packed( schema ) ) \
 	{ \
 		/* start by zeroing out the whole block */ \
-		PASTEMAC(chp_r,set0s_mxn) \
+		bli_tset0s_mxn \
 		( \
+		  chp_r, \
 		  2*cdim_max, \
 		  2*n_max, \
 		  ( ctypep_r* )p, 1, ldp  \
@@ -214,11 +217,11 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
 			{ \
 				ctypep_r alpha_r; \
-				PASTEMAC(cha_r,chp_r,copys)( *(alpha1 + mnk*(inca2 + lda2)), alpha_r ); \
-				PASTEMAC(chp_r,scal2s)(  kappa_r, alpha_r, *(pi1_ri + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \
-				PASTEMAC(chp_r,scal2s)(  kappa_i, alpha_r, *(pi1_ri + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \
-				PASTEMAC(chp_r,scal2s)( -kappa_i, alpha_r, *(pi1_ir + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \
-				PASTEMAC(chp_r,scal2s)(  kappa_r, alpha_r, *(pi1_ir + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \
+				bli_tcopys( cha_r,chp_r, *(alpha1 + mnk*(inca2 + lda2)), alpha_r ); \
+				bli_tscal2s( chp_r,chp_r,chp_r,chp_r,  kappa_r, alpha_r, *(pi1_ri + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \
+				bli_tscal2s( chp_r,chp_r,chp_r,chp_r,  kappa_i, alpha_r, *(pi1_ri + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \
+				bli_tscal2s( chp_r,chp_r,chp_r,chp_r, -kappa_i, alpha_r, *(pi1_ir + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \
+				bli_tscal2s( chp_r,chp_r,chp_r,chp_r,  kappa_r, alpha_r, *(pi1_ir + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \
 			} \
 		} \
 		else if ( bli_is_conj( conja )) \
@@ -240,9 +243,9 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
 			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
 			{ \
-				PASTEMAC(chp,invertris)( *(pi1_ri + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2), \
+				bli_tinvertris( chp,chp, *(pi1_ri + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2), \
 				                         *(pi1_ri + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \
-				PASTEMAC(chp,copyjris)( *(pi1_ri + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2), \
+				bli_tcopyjris( chp,chp, *(pi1_ri + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2), \
 				                        *(pi1_ri + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2), \
 				                        *(pi1_ir + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2), \
 				                        *(pi1_ir + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \
@@ -257,8 +260,9 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 	else /* bli_is_1r_packed( schema ) */ \
 	{ \
 		/* start by zeroing out the whole block */ \
-		PASTEMAC(chp_r,set0s_mxn) \
+		bli_tset0s_mxn \
 		( \
+		  chp_r, \
 		  cdim_max, \
 		  2*n_max, \
 		  ( ctypep_r* )p, 1, ldp  \
@@ -317,9 +321,9 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
 			{ \
 				ctypep_r alpha_r; \
-				PASTEMAC(cha_r,chp_r,copys)( *(alpha1 + mnk*(inca2 + lda2)), alpha_r ); \
-				PASTEMAC(chp_r,scal2s)( kappa_r, alpha_r, *(pi1_r + mnk*(cdim_bcast + ldp2) + d) ); \
-				PASTEMAC(chp_r,scal2s)( kappa_i, alpha_r, *(pi1_i + mnk*(cdim_bcast + ldp2) + d) ); \
+				bli_tcopys( cha_r,chp_r, *(alpha1 + mnk*(inca2 + lda2)), alpha_r ); \
+				bli_tscal2s( chp_r,chp_r,chp_r,chp_r, kappa_r, alpha_r, *(pi1_r + mnk*(cdim_bcast + ldp2) + d) ); \
+				bli_tscal2s( chp_r,chp_r,chp_r,chp_r, kappa_i, alpha_r, *(pi1_i + mnk*(cdim_bcast + ldp2) + d) ); \
 			} \
 		} \
 		else if ( bli_is_conj( conja ) ) \
@@ -340,7 +344,7 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 		{ \
 			for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
 			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
-				PASTEMAC(chp,invertris)( *(pi1_r + mnk*(cdim_bcast + ldp2) + d), \
+				bli_tinvertris( chp,chp, *(pi1_r + mnk*(cdim_bcast + ldp2) + d), \
 				                         *(pi1_i + mnk*(cdim_bcast + ldp2) + d) ); \
 		} \
 \
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ref.c
index 635bb9900..82976ebde 100644
--- a/ref_kernels/1m/bli_packm_cxc_diag_ref.c
+++ b/ref_kernels/1m/bli_packm_cxc_diag_ref.c
@@ -42,11 +42,10 @@ do \
 	for ( dim_t k = 0; k < cdim; k++ ) \
 	for ( dim_t mn = mn_min; mn < mn_max; mn++ ) \
 	{ \
-		ctypep alpha_cast, kappa_alpha; \
-		PASTEMAC(cha,chp,copys)( *(alpha1 + mn*inca + k*lda), alpha_cast ); \
-		PASTEMAC(chp,op)( kappa_cast, alpha_cast, kappa_alpha ); \
+		ctypep kappa_alpha; \
+		PASTEMAC(t,op)( chp,cha,chp,chp, kappa_cast, *(alpha1 + mn*inca + k*lda), kappa_alpha ); \
 		for ( dim_t d = 0; d < dfac; d++ ) \
-			PASTEMAC(chp,copys)( kappa_alpha, *(pi1 + mn*dfac + d + k*ldp) ); \
+			bli_tcopys( chp,chp, kappa_alpha, *(pi1 + mn*dfac + d + k*ldp) ); \
 	} \
 } while(0)
 
@@ -81,11 +80,12 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
      ) \
 { \
 	/* start by zeroing out the whole block */ \
-	PASTEMAC(chp,set0s_mxn) \
+	bli_tset0s_mxn \
 	( \
+	  chp, \
 	  cdim_max, \
 	  n_max, \
-	  p, 1, ldp  \
+	  ( ctypep* )p, 1, ldp  \
 	); \
 \
 	      ctypep           kappa_cast = *( ctypep* )kappa; \
@@ -134,40 +134,38 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 	{ \
 		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
 		for ( dim_t d = 0; d < cdim_bcast; ++d ) \
-			PASTEMAC(chp,copys)( kappa_cast, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
+			bli_tcopys( chp,chp, kappa_cast, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
 	} \
 	else if ( bli_is_hermitian( struca ) ) \
 	{ \
 		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
 		{ \
 			ctypep alpha_cast, kappa_alpha; \
-			PASTEMAC(cha,chp,copys)( *(alpha1 + mnk*(inca + lda)), alpha_cast ); \
-			PASTEMAC(chp,seti0s)( alpha_cast ); \
-			PASTEMAC(chp,scal2s)( kappa_cast, alpha_cast, kappa_alpha ); \
+			bli_tcopys( cha,chp, *(alpha1 + mnk*(inca + lda)), alpha_cast ); \
+			bli_tseti0s( chp, alpha_cast ); \
+			bli_tscal2s( chp,chp,chp,chp, kappa_cast, alpha_cast, kappa_alpha ); \
 			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
-				PASTEMAC(chp,copys)( kappa_alpha, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
+				bli_tcopys( chp,chp, kappa_alpha, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
 		} \
 	} \
 	else if ( bli_is_conj( conja )) \
 	{ \
 		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
 		{ \
-			ctypep alpha_cast, kappa_alpha; \
-			PASTEMAC(cha,chp,copys)( *(alpha1 + mnk*(inca + lda)), alpha_cast ); \
-			PASTEMAC(chp,scal2js)( kappa_cast, alpha_cast, kappa_alpha ); \
+			ctypep kappa_alpha; \
+			bli_tscal2js( chp,cha,chp,chp, kappa_cast, *(alpha1 + mnk*(inca + lda)), kappa_alpha ); \
 			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
-				PASTEMAC(chp,copys)( kappa_alpha, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
+				bli_tcopys( chp,chp, kappa_alpha, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
 		} \
 	} \
 	else \
 	{ \
 		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
 		{ \
-			ctypep alpha_cast, kappa_alpha; \
-			PASTEMAC(cha,chp,copys)( *(alpha1 + mnk*(inca + lda)), alpha_cast ); \
-			PASTEMAC(chp,scal2s)( kappa_cast, alpha_cast, kappa_alpha ); \
+			ctypep kappa_alpha; \
+			bli_tscal2s( chp,cha,chp,chp, kappa_cast, *(alpha1 + mnk*(inca + lda)), kappa_alpha ); \
 			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
-				PASTEMAC(chp,copys)( kappa_alpha, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
+				bli_tcopys( chp,chp, kappa_alpha, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
 		} \
 	} \
 \
@@ -176,13 +174,13 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 	{ \
 		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
 		for ( dim_t d = 0; d < cdim_bcast; ++d ) \
-			PASTEMAC(chp,inverts)( *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
+			bli_tinverts( chp,chp, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
 	} \
 \
 	/* if this an edge case in both directions, extend the diagonal with ones */ \
 	for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \
 	for ( dim_t d = 0; d < cdim_bcast; ++d ) \
-		PASTEMAC(chp,set1s)( *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
+		bli_tset1s( chp, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
 }
 
 INSERT_GENTFUNC2_BASIC( packm_diag, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ro_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ro_ref.c
index bb6fe939e..a520158fc 100644
--- a/ref_kernels/1m/bli_packm_cxc_diag_ro_ref.c
+++ b/ref_kernels/1m/bli_packm_cxc_diag_ro_ref.c
@@ -37,18 +37,19 @@
 
 #define PACKM_SET_RO( chp_r, val, mnk ) \
 do { \
-	PASTEMAC(chp_r,copys)( val, *(pi1_r + mnk*cdim_bcast + d + mnk*ldp) ); \
+	bli_tcopys( chp_r,chp_r, val, *(pi1_r + mnk*cdim_bcast + d + mnk*ldp) ); \
 } while (0)
 
 
 #define PACKM_SCAL_RO( ctypep_r, cha, chp, chp_r, mn, k, op ) \
 do { \
-	ctypep_r alpha_r, alpha_i, ka_r, ka_i; (void)ka_i; \
-	PASTEMAC(cha,chp,copyris)( *(alpha1 +  mn       *inca2       + 0 + k*lda2), \
-	                            *(alpha1 +  mn       *inca2       + 1 + k*lda2), \
-	                            alpha_r, alpha_i ); \
-	PASTEMAC(chp,op)( kappa_r, kappa_i, alpha_r, alpha_i, ka_r, ka_i ); \
-	PASTEMAC(chp_r,copys)( ka_r, *(pi1_r  + mn*cdim_bcast  + d + k*ldp) ); \
+	ctypep_r ka_r, ka_i; (void)ka_i; \
+	PASTEMAC(t,op)( chp,cha,chp,chp, \
+	                kappa_r, kappa_i, \
+	                *(alpha1 + mn*inca2 + 0 + k*lda2), \
+	                *(alpha1 + mn*inca2 + 1 + k*lda2), \
+	                ka_r, ka_i ); \
+	bli_tcopys( chp_r,chp_r, ka_r, *(pi1_r + mn*cdim_bcast + d + k*ldp) ); \
 } while (0)
 
 
@@ -104,8 +105,9 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 	const ctypea_r* restrict alpha1  = ( const ctypea_r* )a; \
 \
 	/* start by zeroing out the whole block */ \
-	PASTEMAC(chp_r,set0s_mxn) \
+	bli_tset0s_mxn \
 	( \
+	  chp_r, \
 	  cdim_max, \
 	  n_max, \
 	  ( ctypep_r* )p, 1, ldp  \
@@ -161,11 +163,10 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 	{ \
 		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
 		for ( dim_t d = 0; d < cdim_bcast; ++d ) \
-		{ \
-			ctypep_r alpha_r; \
-			PASTEMAC(cha_r,chp_r,copys)( *(alpha1 + mnk*(inca2 + lda2)), alpha_r ); \
-			PASTEMAC(chp_r,scal2s)( kappa_r, alpha_r, *(pi1_r + mnk*(cdim_bcast + ldp) + d) ); \
-		} \
+			bli_tscal2s( chp_r,cha_r,chp_r,chp_r, \
+			             kappa_r, \
+			             *(alpha1 + mnk*(inca2 + lda2)), \
+			             *(pi1_r + mnk*(cdim_bcast + ldp) + d) ); \
 	} \
 	else if ( bli_is_conj( conja ) ) \
 	{ \
diff --git a/ref_kernels/1m/bli_packm_cxk_1er_ref.c b/ref_kernels/1m/bli_packm_cxk_1er_ref.c
index 5115628cd..e4350384d 100644
--- a/ref_kernels/1m/bli_packm_cxk_1er_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_1er_ref.c
@@ -44,13 +44,15 @@ do \
 		pragma \
 		for ( dim_t mn = 0; mn < cdim; ++mn ) \
 		{ \
-			ctypep_r alpha_r, alpha_i, ka_r, ka_i; \
-			PASTEMAC(cha,chp,copyris)( *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), alpha_r, alpha_i ); \
-			PASTEMAC(chp,op)( kappa_r, kappa_i, alpha_r, alpha_i, ka_r, ka_i ); \
+			ctypep_r ka_r, ka_i; \
+			PASTEMAC(t,op)( chp,cha,chp,chp, \
+			                kappa_r, kappa_i, \
+			                *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \
+			                ka_r, ka_i ); \
 			for ( dim_t d = 0; d < dfac; ++d ) \
 			{ \
-				PASTEMAC(chp,copyris)(  ka_r, ka_i, *(pi1_ri + (mn*2 + 0)*dfac + d), *(pi1_ri + (mn*2 + 1)*dfac + d) ); \
-				PASTEMAC(chp,copyris)( -ka_i, ka_r, *(pi1_ir + (mn*2 + 0)*dfac + d), *(pi1_ir + (mn*2 + 1)*dfac + d) ); \
+				bli_tcopyris( chp,chp,  ka_r, ka_i, *(pi1_ri + (mn*2 + 0)*dfac + d), *(pi1_ri + (mn*2 + 1)*dfac + d) ); \
+				bli_tcopyris( chp,chp, -ka_i, ka_r, *(pi1_ir + (mn*2 + 0)*dfac + d), *(pi1_ir + (mn*2 + 1)*dfac + d) ); \
 			} \
 		} \
 \
@@ -70,11 +72,13 @@ do \
 		pragma \
 		for ( dim_t mn = 0; mn < cdim; ++mn ) \
 		{ \
-			ctypep_r alpha_r, alpha_i, ka_r, ka_i; \
-			PASTEMAC(cha,chp,copyris)( *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), alpha_r, alpha_i ); \
-			PASTEMAC(chp,op)( kappa_r, kappa_i, alpha_r, alpha_i, ka_r, ka_i ); \
+			ctypep_r ka_r, ka_i; \
+			PASTEMAC(t,op)( chp,cha,chp,chp, \
+			                kappa_r, kappa_i, \
+			                *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \
+			                ka_r, ka_i ); \
 			for ( dim_t d = 0; d < dfac; ++d ) \
-				PASTEMAC(chp,copyris)( ka_r, ka_i, *(pi1_r + mn*dfac + d), *(pi1_i + mn*dfac + d) ); \
+				bli_tcopyris( chp,chp, ka_r, ka_i, *(pi1_r + mn*dfac + d), *(pi1_i + mn*dfac + d) ); \
 		} \
 \
 		alpha1 += lda2; \
@@ -153,8 +157,9 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 			else                        PACKM_1E_BODY( ctypep_r, cha, chp, , cdim, cdim_bcast, inca2, scal2ris ); \
 		} \
 \
-		PASTEMAC(chp_r,set0s_edge) \
+		bli_tset0s_edge \
 		( \
+		  chp_r, \
 		  cdim2*cdim_bcast, 2*cdim_max*cdim_bcast, \
 		  2*n, 2*n_max, \
 		  ( ctypep_r* )p, ldp  \
@@ -204,8 +209,9 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 			else                        PACKM_1R_BODY( ctypep_r, cha, chp, , cdim, cdim_bcast, inca2, scal2ris ); \
 		} \
 \
-		PASTEMAC(chp_r,set0s_edge) \
+		bli_tset0s_edge \
 		( \
+		  chp_r, \
 		  cdim*cdim_bcast, cdim_max*cdim_bcast, \
 		  2*n, 2*n_max, \
 		  ( ctypep_r* )p, ldp  \
diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c
index 5cca515ae..6b70ddda7 100644
--- a/ref_kernels/1m/bli_packm_cxk_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_ref.c
@@ -34,8 +34,17 @@
 
 #include "blis.h"
 
+// GCC 11 and older appear to have a bug where a _Pragma inside a
+// multi-line macro is erroneously hoisted to the start of the macro
+// body (e.g. just before "do"), so PRAGMA_SIMD is disabled for them.
+#ifdef __GNUC__
+#if __GNUC__ < 12
+#undef PRAGMA_SIMD
+#define PRAGMA_SIMD
+#endif
+#endif
 
-#define PACKM_BODY( ctypea, ctypep, cha, chp, pragma, cdim, dfac, inca, op ) \
+#define PACKM_BODY_r( ctypea, ctypep, cha, chp, pragma, cdim, dfac, inca, op ) \
 \
 do \
 { \
@@ -44,11 +53,10 @@ do \
 		pragma \
 		for ( dim_t mn = 0; mn < cdim; mn++ ) \
 		{ \
-			ctypep alpha_cast, kappa_alpha; \
-			PASTEMAC(cha,chp,copys)( *(alpha1 + mn*inca), alpha_cast ); \
-			PASTEMAC(chp,op)( kappa_cast, alpha_cast, kappa_alpha ); \
+			ctypep kappa_alpha; \
+			PASTEMAC(t,op)( chp,cha,chp,chp, kappa_cast, *(alpha1 + mn*inca), kappa_alpha ); \
 			for ( dim_t d = 0; d < dfac; d++ ) \
-				PASTEMAC(chp,copys)( kappa_alpha, *(pi1 + mn*dfac + d) ); \
+				bli_tcopys( chp,chp, kappa_alpha, *(pi1 + mn*dfac + d) ); \
 		} \
 \
 		alpha1 += lda; \
@@ -57,6 +65,42 @@ do \
 } while(0)
 
 
+#define PACKM_BODY_c_( ctypea, ctypep, ctypep_r, cha, chp, chp_r, pragma, cdim, dfac, inca, op ) \
+\
+do \
+{ \
+	for ( dim_t k = n; k != 0; --k ) \
+	{ \
+		pragma \
+		for ( dim_t mn = 0; mn < cdim; mn++ ) \
+		{ \
+			ctypep kappa_alpha; \
+			PASTEMAC(t,op)( chp,cha,chp,chp, kappa_cast, *(alpha1 + mn*inca), kappa_alpha ); \
+			ctypep_r kar, kai; \
+			bli_tgets( chp,chp, kappa_alpha, kar, kai ); \
+			ctypep_r* pi1r = (ctypep_r*)pi1; \
+			ctypep_r* pi1i = (ctypep_r*)pi1 + dfac; \
+			for ( dim_t d = 0; d < dfac; d++ ) \
+			{ \
+				bli_tcopys( chp_r,chp_r, kar, *(pi1r + mn*dfac*2 + d) ); \
+				bli_tcopys( chp_r,chp_r, kai, *(pi1i + mn*dfac*2 + d) ); \
+			} \
+		} \
+\
+		alpha1 += lda; \
+		pi1    += ldp; \
+	} \
+} while(0)
+
+
+#define PACKM_BODY_c( ctypea, ctypep, cha, chp, pragma, cdim, dfac, inca, op ) \
+PACKM_BODY_c_( ctypea, ctypep, PASTEMAC(chp,ctyper), cha, chp, PASTEMAC(chp,prec), pragma, cdim, dfac, inca, op )
+
+
+#define PACKM_BODY( ctypea, ctypep, cha, chp, pragma, cdim, dfac, inca, op ) \
+PASTECH(PACKM_BODY_,PASTEMAC(chp,dom))( ctypea, ctypep, cha, chp, pragma, cdim, dfac, inca, op )
+
+
 #undef  GENTFUNC2
 #define GENTFUNC2( ctypea, ctypep, cha, chp, opname, arch, suf ) \
 \
@@ -117,11 +161,12 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 		else                        PACKM_BODY( ctypea, ctypep, cha, chp, , cdim, cdim_bcast, inca, scal2s ); \
 	} \
 \
-	PASTEMAC(chp,set0s_edge) \
+	bli_tset0s_edge \
 	( \
+	  chp, \
 	  cdim*cdim_bcast, cdim_max*cdim_bcast, \
 	  n, n_max, \
-	  p, ldp  \
+	  ( ctypep* )p, ldp  \
 	); \
 }
 
diff --git a/ref_kernels/1m/bli_packm_cxk_ro_ref.c b/ref_kernels/1m/bli_packm_cxk_ro_ref.c
index a8165351d..549e18d2e 100644
--- a/ref_kernels/1m/bli_packm_cxk_ro_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_ro_ref.c
@@ -44,12 +44,14 @@ do \
 		pragma \
 		for ( dim_t mn = 0; mn < cdim; ++mn ) \
 		{ \
-			ctypep_r alpha_r, alpha_i, ka_r, ka_i; \
+			ctypep_r ka_r, ka_i; \
 			( void )ka_i; \
-			PASTEMAC(cha,chp,copyris)( *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), alpha_r, alpha_i ); \
-			PASTEMAC(chp,op)( kappa_r, kappa_i, alpha_r, alpha_i, ka_r, ka_i ); \
+			PASTEMAC(t,op)( chp,cha,chp,chp, \
+			                kappa_r, kappa_i, \
+			                *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \
+			                ka_r, ka_i ); \
 			for ( dim_t d = 0; d < dfac; ++d ) \
-				PASTEMAC(chp_r,copys)( ka_r, *(pi1_r + mn*dfac + d) ); \
+				bli_tcopys( chp_r,chp_r, ka_r, *(pi1_r + mn*dfac + d) ); \
 		} \
 \
 		alpha1 += lda2; \
@@ -122,8 +124,9 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 		else                        PACKM_RO_BODY( ctypep_r, cha, chp, chp_r, , cdim, cdim_bcast, inca2, scal2ris ); \
 	} \
 \
-	PASTEMAC(chp_r,set0s_edge) \
+	bli_tset0s_edge \
 	( \
+	  chp_r, \
 	  cdim*cdim_bcast, cdim_max*cdim_bcast, \
 	  n, n_max, \
 	  ( ctypep_r* )p, ldp  \
diff --git a/ref_kernels/1m/bli_unpackm_cxk_ref.c b/ref_kernels/1m/bli_unpackm_cxk_ref.c
index 071f5c4ab..2cf7149e3 100644
--- a/ref_kernels/1m/bli_unpackm_cxk_ref.c
+++ b/ref_kernels/1m/bli_unpackm_cxk_ref.c
@@ -43,11 +43,7 @@ do \
 	{ \
 		pragma \
 		for ( dim_t mn = 0; mn < cdim; mn++ ) \
-		{ \
-			ctypep kappa_pi; \
-			PASTEMAC(chp,op)( *kappa_cast, *(pi1 + mn*dfac), kappa_pi ); \
-			PASTEMAC(chp,cha,copys)( kappa_pi, *(alpha1 + mn*inca) ); \
-		} \
+			PASTEMAC(t,op)( chp,chp,cha,chp, *kappa_cast, *(pi1 + mn*dfac), *(alpha1 + mn*inca) ); \
 \
 		alpha1 += lda; \
 		pi1    += ldp; \
diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c
index 61505eef8..119aa7b59 100644
--- a/ref_kernels/3/bli_gemm_ref.c
+++ b/ref_kernels/3/bli_gemm_ref.c
@@ -37,8 +37,8 @@
 // Completely generic gemm ukr implementation which checks MR/NR at
 // runtime. Very slow, but has to be used in certain cases.
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf ) \
+#undef  GENTFUNCR
+#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, arch, suf ) \
 \
 static void PASTEMAC(ch,ch,opname,arch,suf) \
      ( \
@@ -80,7 +80,7 @@ static void PASTEMAC(ch,ch,opname,arch,suf) \
 	/* Initialize the accumulator elements in ab to zero. */ \
 	for ( dim_t i = 0; i < m * n; ++i ) \
 	{ \
-		PASTEMAC(ch,set0s)( *(ab + i) ); \
+		bli_tset0s( ch, *(ab + i) ); \
 	} \
 \
 	/* Perform a series of k rank-1 updates into ab. */ \
@@ -92,13 +92,19 @@ static void PASTEMAC(ch,ch,opname,arch,suf) \
 		   are typically fully unrolled. */ \
 		for ( dim_t j = 0; j < n; ++j ) \
 		{ \
-			ctype bj = *(b + j*cs_b); \
+			ctype bj; \
+			const ctype_r* b_r = (const ctype_r*)(b + j*cs_b); \
+			const ctype_r* b_i = b_r + cs_b; (void)b_i; \
+			bli_tsets( ch,ch, *b_r, *b_i, bj ); \
 \
 			for ( dim_t i = 0; i < m; ++i ) \
 			{ \
-				ctype ai = *(a + i*rs_a); \
+				ctype ai; \
+				const ctype_r* a_r = (const ctype_r*)(a + i*rs_a); \
+				const ctype_r* a_i = a_r + rs_a; (void)a_i; \
+				bli_tsets( ch,ch, *a_r, *a_i, ai ); \
 \
-				PASTEMAC(ch,dots)( ai, bj, *abij ); \
+				bli_tdots( ch,ch,ch,ch, ai, bj, *abij ); \
 \
 				abij += rs_ab; \
 			} \
@@ -111,15 +117,17 @@ static void PASTEMAC(ch,ch,opname,arch,suf) \
 	/* Scale the result in ab by alpha. */ \
 	for ( dim_t i = 0; i < m * n; ++i ) \
 	{ \
-		PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \
+		bli_tscals( ch,ch,ch, *alpha, *(ab + i) ); \
 	} \
 \
 	/* If beta is zero, overwrite c with the scaled result in ab. Otherwise,
 	   scale by beta and then add the scaled redult in ab. */ \
-	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	if ( bli_teq0s( ch, *beta ) ) \
 	{ \
-		PASTEMAC(ch,copys_mxn) \
+		bli_tcopys_mxn \
 		( \
+		  ch, \
+		  ch, \
 		  m, \
 		  n, \
 		  ab, rs_ab, cs_ab, \
@@ -128,8 +136,12 @@ static void PASTEMAC(ch,ch,opname,arch,suf) \
 	} \
 	else \
 	{ \
-		PASTEMAC(ch,xpbys_mxn) \
+		bli_txpbys_mxn \
 		( \
+		  ch, \
+		  ch, \
+		  ch, \
+		  ch, \
 		  m, \
 		  n, \
 		  ab, rs_ab, cs_ab, \
@@ -139,7 +151,7 @@ static void PASTEMAC(ch,ch,opname,arch,suf) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC( gemm_gen, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+INSERT_GENTFUNCR_BASIC( gemm_gen, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
 // An implementation that attempts to facilitate emission of vectorized
 // instructions via constant loop bounds + #pragma omp simd directives.
@@ -171,12 +183,17 @@ void PASTEMAC(ch,ch,opname,arch,suf) \
 \
 	const dim_t mr = PASTECH(BLIS_MR_,ch); \
 	const dim_t nr = PASTECH(BLIS_NR_,ch); \
+\
+	const inc_t rs_a   = PASTECH(BLIS_BBM_,ch); \
+	const inc_t cs_a   = PASTECH(BLIS_PACKMR_,ch); \
+	const inc_t rs_b   = PASTECH(BLIS_PACKNR_,ch); \
+	const inc_t cs_b   = PASTECH(BLIS_BBN_,ch); \
 \
 	/* If either BLIS_MR_? or BLIS_NR_? was left undefined by the subconfig,
 	   the compiler can't fully unroll the MR and NR loop iterations below,
 	   which means there's no benefit to using this kernel over a general-
 	   purpose implementation instead. */ \
-	if ( mr == -1 || nr == -1 ) \
+	if ( mr == -1 || nr == -1 || rs_a != 1 || cs_b != 1 ) \
 	{ \
 		PASTEMAC(ch,ch,gemm_gen,arch,suf) \
 		( \
@@ -198,18 +215,13 @@ void PASTEMAC(ch,ch,opname,arch,suf) \
 	      ctype* ab    = (ctype*)ab_; \
 	const inc_t  rs_ab = nr; \
 	const inc_t  cs_ab = 1; \
-\
-	const inc_t  rs_a  = PASTECH(BLIS_BBM_,ch); \
-	const inc_t  cs_a  = PASTECH(BLIS_PACKMR_,ch); \
-	const inc_t  rs_b  = PASTECH(BLIS_PACKNR_,ch); \
-	const inc_t  cs_b  = PASTECH(BLIS_BBN_,ch); \
 \
 \
 	/* Initialize the accumulator elements in ab to zero. */ \
 	PRAGMA_SIMD \
 	for ( dim_t i = 0; i < mr * nr; ++i ) \
 	{ \
-		PASTEMAC(ch,set0s)( ab[ i ] ); \
+		bli_tset0s( ch, ab[ i ] ); \
 	} \
 \
 	/* Perform a series of k rank-1 updates into ab. */ \
@@ -220,8 +232,9 @@ void PASTEMAC(ch,ch,opname,arch,suf) \
 			PRAGMA_SIMD \
 			for ( dim_t j = 0; j < nr; ++j ) \
 			{ \
-				PASTEMAC(ch,dots) \
+				bli_tdots \
 				( \
+				  ch,ch,ch,ch, \
 				  a[ i*rs_a ], \
 				  b[ j*cs_b ], \
 				  ab[ i*rs_ab + j*cs_ab ]  \
@@ -237,7 +250,7 @@ void PASTEMAC(ch,ch,opname,arch,suf) \
 	PRAGMA_SIMD \
 	for ( dim_t i = 0; i < mr * nr; ++i ) \
 	{ \
-		PASTEMAC(ch,scals)( *alpha, ab[ i ] ); \
+		bli_tscals( ch,ch,ch, *alpha, ab[ i ] ); \
 	} \
 \
 	/* Output/accumulate intermediate result ab based on the storage
@@ -246,12 +259,13 @@ void PASTEMAC(ch,ch,opname,arch,suf) \
 	{ \
 		/* C is row-stored. */ \
 \
-		if ( PASTEMAC(ch,eq0)( *beta ) ) \
+		if ( bli_teq0s( ch, *beta ) ) \
 		{ \
 			for ( dim_t i = 0; i < m; ++i ) \
 			for ( dim_t j = 0; j < n; ++j ) \
-			PASTEMAC(ch,copys) \
+			bli_tcopys \
 			( \
+			  ch,ch, \
 			  ab[ i*rs_ab + j*cs_ab ], \
 			  c [ i*rs_c  + j*1     ]  \
 			); \
@@ -260,8 +274,9 @@ void PASTEMAC(ch,ch,opname,arch,suf) \
 		{ \
 			for ( dim_t i = 0; i < m; ++i ) \
 			for ( dim_t j = 0; j < n; ++j ) \
-			PASTEMAC(ch,xpbys) \
+			bli_txpbys \
 			( \
+			  ch,ch,ch,ch, \
 			  ab[ i*rs_ab + j*cs_ab ], \
 			  *beta, \
 			  c [ i*rs_c  + j*1     ]  \
@@ -272,12 +287,13 @@ void PASTEMAC(ch,ch,opname,arch,suf) \
 	{ \
 		/* C is column-stored or general-stored. */ \
 \
-		if ( PASTEMAC(ch,eq0)( *beta ) ) \
+		if (bli_teq0s( ch, *beta ) ) \
 		{ \
 			for ( dim_t j = 0; j < n; ++j ) \
 			for ( dim_t i = 0; i < m; ++i ) \
-			PASTEMAC(ch,copys) \
+			bli_tcopys \
 			( \
+			  ch,ch, \
 			  ab[ i*rs_ab + j*cs_ab ], \
 			  c [ i*rs_c  + j*cs_c  ]  \
 			); \
@@ -286,8 +302,9 @@ void PASTEMAC(ch,ch,opname,arch,suf) \
 		{ \
 			for ( dim_t j = 0; j < n; ++j ) \
 			for ( dim_t i = 0; i < m; ++i ) \
-			PASTEMAC(ch,xpbys) \
+			bli_txpbys \
 			( \
+			  ch,ch,ch,ch, \
 			  ab[ i*rs_ab + j*cs_ab ], \
 			  *beta, \
 			  c [ i*rs_c  + j*cs_c  ]  \
@@ -354,8 +371,9 @@ void PASTEMAC(chab,chc,opname,arch,suf) \
 	  cntx  \
 	); \
 \
-	PASTEMAC(chab,chc,chc,xpbys_mxn) \
+	bli_txpbys_mxn \
 	( \
+	  chab,chc,chc,chc, \
 	  m, n, \
 	  ct, rs_ct, cs_ct, \
 	  beta, \
diff --git a/ref_kernels/3/bli_gemmsup_ref.c b/ref_kernels/3/bli_gemmsup_ref.c
index 9dab9e092..7f2fa50dd 100644
--- a/ref_kernels/3/bli_gemmsup_ref.c
+++ b/ref_kernels/3/bli_gemmsup_ref.c
@@ -80,7 +80,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 				const ctype* restrict bj  = &b [ j*cs_b ]; \
 				      ctype           ab; \
 \
-				PASTEMAC(ch,set0s)( ab ); \
+				bli_tset0s( ch, ab ); \
 \
 				/* Perform a dot product to update the (i,j) element of c. */ \
 				for ( dim_t l = 0; l < k; ++l ) \
@@ -88,23 +88,23 @@ void PASTEMAC(ch,opname,arch,suf) \
 					const ctype* restrict aij = &ai[ l*cs_a ]; \
 					const ctype* restrict bij = &bj[ l*rs_b ]; \
 \
-					PASTEMAC(ch,dots)( *aij, *bij, ab ); \
+					bli_tdots( ch,ch,ch,ch, *aij, *bij, ab ); \
 				} \
 \
 				/* If beta is one, add ab into c. If beta is zero, overwrite c
 				   with the result in ab. Otherwise, scale by beta and accumulate
 				   ab to c. */ \
-				if ( PASTEMAC(ch,eq1)( *beta ) ) \
+				if ( bli_teq1s( ch, *beta ) ) \
 				{ \
-					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+					bli_taxpys( ch,ch,ch,ch, *alpha, ab, *cij ); \
 				} \
-				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+				else if ( bli_teq0s( ch, *beta ) ) \
 				{ \
-					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+					bli_tscal2s( ch,ch,ch,ch, *alpha, ab, *cij ); \
 				} \
 				else \
 				{ \
-					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+					bli_taxpbys( ch,ch,ch,ch,ch, *alpha, ab, *beta, *cij ); \
 				} \
 			} \
 		} \
@@ -123,7 +123,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 				const ctype* restrict bj  = &b [ j*cs_b ]; \
 				      ctype           ab; \
 \
-				PASTEMAC(ch,set0s)( ab ); \
+				bli_tset0s( ch, ab ); \
 \
 				/* Perform a dot product to update the (i,j) element of c. */ \
 				for ( dim_t l = 0; l < k; ++l ) \
@@ -131,23 +131,23 @@ void PASTEMAC(ch,opname,arch,suf) \
 					const ctype* restrict aij = &ai[ l*cs_a ]; \
 					const ctype* restrict bij = &bj[ l*rs_b ]; \
 \
-					PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
+					bli_taxpyjs( ch,ch,ch,ch, *aij, *bij, ab ); \
 				} \
 \
 				/* If beta is one, add ab into c. If beta is zero, overwrite c
 				   with the result in ab. Otherwise, scale by beta and accumulate
 				   ab to c. */ \
-				if ( PASTEMAC(ch,eq1)( *beta ) ) \
+				if ( bli_teq1s( ch, *beta ) ) \
 				{ \
-					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+					bli_taxpys( ch,ch,ch,ch, *alpha, ab, *cij ); \
 				} \
-				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+				else if ( bli_teq0s( ch, *beta ) ) \
 				{ \
-					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+					bli_tscal2s( ch,ch,ch,ch, *alpha, ab, *cij ); \
 				} \
 				else \
 				{ \
-					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+					bli_taxpbys( ch,ch,ch,ch,ch, *alpha, ab, *beta, *cij ); \
 				} \
 			} \
 		} \
@@ -166,7 +166,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 				const ctype* restrict bj  = &b [ j*cs_b ]; \
 				      ctype           ab; \
 \
-				PASTEMAC(ch,set0s)( ab ); \
+				bli_tset0s( ch, ab ); \
 \
 				/* Perform a dot product to update the (i,j) element of c. */ \
 				for ( dim_t l = 0; l < k; ++l ) \
@@ -174,23 +174,23 @@ void PASTEMAC(ch,opname,arch,suf) \
 					const ctype* restrict aij = &ai[ l*cs_a ]; \
 					const ctype* restrict bij = &bj[ l*rs_b ]; \
 \
-					PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
+					bli_tdotjs( ch,ch,ch,ch, *aij, *bij, ab ); \
 				} \
 \
 				/* If beta is one, add ab into c. If beta is zero, overwrite c
 				   with the result in ab. Otherwise, scale by beta and accumulate
 				   ab to c. */ \
-				if ( PASTEMAC(ch,eq1)( *beta ) ) \
+				if ( bli_teq1s( ch, *beta ) ) \
 				{ \
-					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+					bli_taxpys( ch,ch,ch,ch, *alpha, ab, *cij ); \
 				} \
-				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+				else if ( bli_teq0s( ch, *beta ) ) \
 				{ \
-					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+					bli_tscal2s( ch,ch,ch,ch, *alpha, ab, *cij ); \
 				} \
 				else \
 				{ \
-					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+					bli_taxpbys( ch,ch,ch,ch,ch, *alpha, ab, *beta, *cij ); \
 				} \
 			} \
 		} \
@@ -209,7 +209,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 				const ctype* restrict bj  = &b [ j*cs_b ]; \
 				      ctype           ab; \
 \
-				PASTEMAC(ch,set0s)( ab ); \
+				bli_tset0s( ch, ab ); \
 \
 				/* Perform a dot product to update the (i,j) element of c. */ \
 				for ( dim_t l = 0; l < k; ++l ) \
@@ -217,26 +217,26 @@ void PASTEMAC(ch,opname,arch,suf) \
 					const ctype* restrict aij = &ai[ l*cs_a ]; \
 					const ctype* restrict bij = &bj[ l*rs_b ]; \
 \
-					PASTEMAC(ch,dots)( *aij, *bij, ab ); \
+					bli_tdots( ch,ch,ch,ch, *aij, *bij, ab ); \
 				} \
 \
 				/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
-				PASTEMAC(ch,conjs)( ab ); \
+				bli_tconjs( ch, ab ); \
 \
 				/* If beta is one, add ab into c. If beta is zero, overwrite c
 				   with the result in ab. Otherwise, scale by beta and accumulate
 				   ab to c. */ \
-				if ( PASTEMAC(ch,eq1)( *beta ) ) \
+				if ( bli_teq1s( ch, *beta ) ) \
 				{ \
-					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+					bli_taxpys( ch,ch,ch,ch, *alpha, ab, *cij ); \
 				} \
-				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+				else if ( bli_teq0s( ch, *beta ) ) \
 				{ \
-					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+					bli_tscal2s( ch,ch,ch,ch, *alpha, ab, *cij ); \
 				} \
 				else \
 				{ \
-					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+					bli_taxpbys( ch,ch,ch,ch,ch, *alpha, ab, *beta, *cij ); \
 				} \
 			} \
 		} \
@@ -291,7 +291,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 				const ctype* restrict ai  = &a [ i*rs_a ]; \
 				      ctype           ab; \
 \
-				PASTEMAC(ch,set0s)( ab ); \
+				bli_tset0s( ch, ab ); \
 \
 				/* Perform a dot product to update the (i,j) element of c. */ \
 				for ( dim_t l = 0; l < k; ++l ) \
@@ -299,23 +299,23 @@ void PASTEMAC(ch,opname,arch,suf) \
 					const ctype* restrict aij = &ai[ l*cs_a ]; \
 					const ctype* restrict bij = &bj[ l*rs_b ]; \
 \
-					PASTEMAC(ch,dots)( *aij, *bij, ab ); \
+					bli_tdots( ch,ch,ch,ch, *aij, *bij, ab ); \
 				} \
 \
 				/* If beta is one, add ab into c. If beta is zero, overwrite c
 				   with the result in ab. Otherwise, scale by beta and accumulate
 				   ab to c. */ \
-				if ( PASTEMAC(ch,eq1)( *beta ) ) \
+				if ( bli_teq1s( ch, *beta ) ) \
 				{ \
-					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+					bli_taxpys( ch,ch,ch,ch, *alpha, ab, *cij ); \
 				} \
-				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+				else if ( bli_teq0s( ch, *beta ) ) \
 				{ \
-					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+					bli_tscal2s( ch,ch,ch,ch, *alpha, ab, *cij ); \
 				} \
 				else \
 				{ \
-					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+					bli_taxpbys( ch,ch,ch,ch,ch, *alpha, ab, *beta, *cij ); \
 				} \
 			} \
 		} \
@@ -334,7 +334,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 				const ctype* restrict ai  = &a [ i*rs_a ]; \
 				      ctype           ab; \
 \
-				PASTEMAC(ch,set0s)( ab ); \
+				bli_tset0s( ch, ab ); \
 \
 				/* Perform a dot product to update the (i,j) element of c. */ \
 				for ( dim_t l = 0; l < k; ++l ) \
@@ -342,23 +342,23 @@ void PASTEMAC(ch,opname,arch,suf) \
 					const ctype* restrict aij = &ai[ l*cs_a ]; \
 					const ctype* restrict bij = &bj[ l*rs_b ]; \
 \
-					PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
+					bli_taxpyjs( ch,ch,ch,ch, *aij, *bij, ab ); \
 				} \
 \
 				/* If beta is one, add ab into c. If beta is zero, overwrite c
 				   with the result in ab. Otherwise, scale by beta and accumulate
 				   ab to c. */ \
-				if ( PASTEMAC(ch,eq1)( *beta ) ) \
+				if ( bli_teq1s( ch, *beta ) ) \
 				{ \
-					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+					bli_taxpys( ch,ch,ch,ch, *alpha, ab, *cij ); \
 				} \
-				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+				else if ( bli_teq0s( ch, *beta ) ) \
 				{ \
-					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+					bli_tscal2s( ch,ch,ch,ch, *alpha, ab, *cij ); \
 				} \
 				else \
 				{ \
-					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+					bli_taxpbys( ch,ch,ch,ch,ch, *alpha, ab, *beta, *cij ); \
 				} \
 			} \
 		} \
@@ -377,7 +377,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 				const ctype* restrict ai  = &a [ i*rs_a ]; \
 				      ctype           ab; \
 \
-				PASTEMAC(ch,set0s)( ab ); \
+				bli_tset0s( ch, ab ); \
 \
 				/* Perform a dot product to update the (i,j) element of c. */ \
 				for ( dim_t l = 0; l < k; ++l ) \
@@ -385,23 +385,23 @@ void PASTEMAC(ch,opname,arch,suf) \
 					const ctype* restrict aij = &ai[ l*cs_a ]; \
 					const ctype* restrict bij = &bj[ l*rs_b ]; \
 \
-					PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
+					bli_tdotjs( ch,ch,ch,ch, *aij, *bij, ab ); \
 				} \
 \
 				/* If beta is one, add ab into c. If beta is zero, overwrite c
 				   with the result in ab. Otherwise, scale by beta and accumulate
 				   ab to c. */ \
-				if ( PASTEMAC(ch,eq1)( *beta ) ) \
+				if ( bli_teq1s( ch, *beta ) ) \
 				{ \
-					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+					bli_taxpys( ch,ch,ch,ch, *alpha, ab, *cij ); \
 				} \
-				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+				else if ( bli_teq0s( ch, *beta ) ) \
 				{ \
-					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+					bli_tscal2s( ch,ch,ch,ch, *alpha, ab, *cij ); \
 				} \
 				else \
 				{ \
-					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+					bli_taxpbys( ch,ch,ch,ch,ch, *alpha, ab, *beta, *cij ); \
 				} \
 			} \
 		} \
@@ -420,7 +420,7 @@ void PASTEMAC(ch,opname,arch,suf) \
 				const ctype* restrict ai  = &a [ i*rs_a ]; \
 				      ctype           ab; \
 \
-				PASTEMAC(ch,set0s)( ab ); \
+				bli_tset0s( ch, ab ); \
 \
 				/* Perform a dot product to update the (i,j) element of c. */ \
 				for ( dim_t l = 0; l < k; ++l ) \
@@ -428,26 +428,26 @@ void PASTEMAC(ch,opname,arch,suf) \
 					const ctype* restrict aij = &ai[ l*cs_a ]; \
 					const ctype* restrict bij = &bj[ l*rs_b ]; \
 \
-					PASTEMAC(ch,dots)( *aij, *bij, ab ); \
+					bli_tdots( ch,ch,ch,ch, *aij, *bij, ab ); \
 				} \
 \
 				/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
-				PASTEMAC(ch,conjs)( ab ); \
+				bli_tconjs( ch, ab ); \
 \
 				/* If beta is one, add ab into c. If beta is zero, overwrite c
 				   with the result in ab. Otherwise, scale by beta and accumulate
 				   ab to c. */ \
-				if ( PASTEMAC(ch,eq1)( *beta ) ) \
+				if ( bli_teq1s( ch, *beta ) ) \
 				{ \
-					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+					bli_taxpys( ch,ch,ch,ch, *alpha, ab, *cij ); \
 				} \
-				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+				else if ( bli_teq0s( ch, *beta ) ) \
 				{ \
-					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+					bli_tscal2s( ch,ch,ch,ch, *alpha, ab, *cij ); \
 				} \
 				else \
 				{ \
-					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+					bli_taxpbys( ch,ch,ch,ch,ch, *alpha, ab, *beta, *cij ); \
 				} \
 			} \
 		} \
diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c
index e1b00e358..0d496feb7 100644
--- a/ref_kernels/3/bli_gemmtrsm_ref.c
+++ b/ref_kernels/3/bli_gemmtrsm_ref.c
@@ -138,11 +138,12 @@ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after gemm", mr, 2*nr, \
 \
 	/* Broadcast the elements of the updated b11 submatrix to their
 	   duplicated neighbors. */ \
-	PASTEMAC(ch,bcastbbs_mxn) \
+	bli_tbcastbbs_mxn \
 	( \
-	  m, \
+	  ch, \
 	  n, \
-	  b11, rs_b, cs_b  \
+	  m, \
+	  b11, cs_b, rs_b  \
 	); \
 \
 	/* b11 = inv(a11) * b11;
@@ -162,8 +163,9 @@ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after trsm", mr, 2*nr, \
 \
 	if ( use_ct ) \
 	{ \
-		PASTEMAC(ch,copys_mxn) \
+		bli_tcopys_mxn \
 		( \
+		  ch,ch, \
 		  m, n, \
 		  ct,  rs_ct, cs_ct, \
 		  c11, rs_c,  cs_c  \
diff --git a/ref_kernels/3/bli_trsm_ref.c b/ref_kernels/3/bli_trsm_ref.c
index c5f833359..cbd64899d 100644
--- a/ref_kernels/3/bli_trsm_ref.c
+++ b/ref_kernels/3/bli_trsm_ref.c
@@ -91,29 +91,29 @@ void PASTEMAC(ch,opname,arch,suf) \
 			ctype           rho11; \
 \
 			/* beta11 = beta11 - a10t * b01; */ \
-			PASTEMAC(ch,set0s)( rho11 ); \
+			bli_tset0s( ch, rho11 ); \
 			for ( dim_t l = 0; l < n_behind; ++l ) \
 			{ \
 				const ctype* restrict alpha10 = a10t + (l  )*cs_a; \
 				      ctype* restrict beta01  = b01  + (l  )*rs_b; \
 \
-				PASTEMAC(ch,axpys)( *alpha10, *beta01, rho11 ); \
+				bli_taxpys( ch,ch,ch,ch, *alpha10, *beta01, rho11 ); \
 			} \
-			PASTEMAC(ch,subs)( rho11, beta11c ); \
+			bli_tsubs( ch,ch,ch, rho11, beta11c ); \
 \
 			/* beta11 = beta11 / alpha11; */ \
 			/* NOTE: When preinversion is enabled, the INVERSE of alpha11
 			   (1.0/alpha11) is stored during packing instead alpha11 so we
 			   can multiply rather than divide. When preinversion is disabled,
 			   alpha11 is stored and division happens below explicitly. */ \
-			PASTEMAC(ch,scals)( *alpha11, beta11c ); \
+			PASTEMAC(t,diagop)( ch,ch,ch, *alpha11, beta11c ); \
 \
 			/* Output final result to matrix c. */ \
-			PASTEMAC(ch,copys)( beta11c, *gamma11 ); \
+			bli_tcopys( ch,ch, beta11c, *gamma11 ); \
 \
 			/* Store the local value back to b11. */ \
 			for ( dim_t d = 0; d < cs_b; ++d ) \
-				PASTEMAC(ch,copys)( beta11c, *(beta11 + d) ); \
+				bli_tcopys( ch,ch, beta11c, *(beta11 + d) ); \
 		} \
 	} \
 }
@@ -179,29 +179,29 @@ void PASTEMAC(ch,opname,arch,suf) \
 			ctype           rho11; \
 \
 			/* beta11 = beta11 - a12t * b21; */ \
-			PASTEMAC(ch,set0s)( rho11 ); \
+			bli_tset0s( ch, rho11 ); \
 			for ( dim_t l = 0; l < n_behind; ++l ) \
 			{ \
 				const ctype* restrict alpha12 = a12t + (l  )*cs_a; \
 				      ctype* restrict beta21  = b21  + (l  )*rs_b; \
 \
-				PASTEMAC(ch,axpys)( *alpha12, *beta21, rho11 ); \
+				bli_taxpys( ch,ch,ch,ch, *alpha12, *beta21, rho11 ); \
 			} \
-			PASTEMAC(ch,subs)( rho11, beta11c ); \
+			bli_tsubs( ch,ch,ch, rho11, beta11c ); \
 \
 			/* beta11 = beta11 / alpha11; */ \
 			/* NOTE: When preinversion is enabled, the INVERSE of alpha11
 			   (1.0/alpha11) is stored during packing instead alpha11 so we
 			   can multiply rather than divide. When preinversion is disabled,
 			   alpha11 is stored and division happens below explicitly. */ \
-			PASTEMAC(ch,diagop)( *alpha11, beta11c ); \
+			PASTEMAC(t,diagop)( ch,ch,ch, *alpha11, beta11c ); \
 \
 			/* Output final result to matrix c. */ \
-			PASTEMAC(ch,copys)( beta11c, *gamma11 ); \
+			bli_tcopys( ch,ch, beta11c, *gamma11 ); \
 \
 			/* Store the local value back to b11. */ \
 			for ( dim_t d = 0; d < cs_b; ++d ) \
-				PASTEMAC(ch,copys)( beta11c, *(beta11 + d) ); \
+				bli_tcopys( ch,ch, beta11c, *(beta11 + d) ); \
 		} \
 	} \
 }
diff --git a/ref_kernels/ind/bli_gemm1m_ref.c b/ref_kernels/ind/bli_gemm1m_ref.c
index 30904ecdb..63269c859 100644
--- a/ref_kernels/ind/bli_gemm1m_ref.c
+++ b/ref_kernels/ind/bli_gemm1m_ref.c
@@ -90,8 +90,8 @@ void PASTEMAC(chabr,chcr,opname,arch,suf) \
 	auxinfo_t auxinfo_r = *auxinfo; \
 	bli_auxinfo_set_params( params_r, &auxinfo_r ); \
 \
-	if ( !PASTEMAC(chabr,eq0)( *alpha_i ) || \
-	     !PASTEMAC(chcr,eq0)( *beta_i ) || \
+	if ( !bli_teq0s( chabr, *alpha_i ) || \
+	     !bli_teq0s( chcr, *beta_i ) || \
 	     !bli_is_preferentially_stored( rs_c, cs_c, row_pref ) || \
 	     !PASTEMAC(chabr,chcr,same) ) \
 	{ \
@@ -134,8 +134,9 @@ void PASTEMAC(chabr,chcr,opname,arch,suf) \
 		  cntx  \
 		); \
 \
-		PASTEMAC(chab,chab,chc,chc,axpbys_mxn) \
+		bli_taxpbys_mxn \
 		( \
+		  chab,chab,chc,chc,chc, \
 		  m, n, \
 		  alpha, \
 		  ct, rs_ct, cs_ct, \
diff --git a/ref_kernels/ind/bli_gemm_ccr_ref.c b/ref_kernels/ind/bli_gemm_ccr_ref.c
index f86faebc8..484cf678f 100644
--- a/ref_kernels/ind/bli_gemm_ccr_ref.c
+++ b/ref_kernels/ind/bli_gemm_ccr_ref.c
@@ -89,8 +89,8 @@ void PASTEMAC(chabr,chcr,opname,arch,suf) \
 	auxinfo_t auxinfo_r = *auxinfo; \
 	bli_auxinfo_set_params( params_r, &auxinfo_r ); \
 \
-	if ( !PASTEMAC(chabr,eq0)( *alpha_i ) || \
-	     !PASTEMAC(chcr,eq0)( *beta_i ) || \
+	if ( !bli_teq0s( chabr, *alpha_i ) || \
+	     !bli_teq0s( chcr, *beta_i ) || \
 	     !bli_is_preferentially_stored( rs_c, cs_c, row_pref ) || \
 	     !PASTEMAC(chab,chc,same) ) \
 	{ \
@@ -128,8 +128,9 @@ void PASTEMAC(chabr,chcr,opname,arch,suf) \
 		  cntx  \
 		); \
 \
-		PASTEMAC(chab,chab,chc,chc,axpbys_mxn) \
+		bli_taxpbys_mxn \
 		( \
+		  chab,chab,chc,chc,chc, \
 		  m, n, \
 		  alpha, \
 		  ct, rs_ct, cs_ct, \
diff --git a/ref_kernels/ind/bli_gemm_crr_ref.c b/ref_kernels/ind/bli_gemm_crr_ref.c
index c729e95d7..243e9e988 100644
--- a/ref_kernels/ind/bli_gemm_crr_ref.c
+++ b/ref_kernels/ind/bli_gemm_crr_ref.c
@@ -99,35 +99,16 @@ void PASTEMAC(chabr,chcr,opname,arch,suf) \
 	  cntx  \
 	); \
 \
-	ctype_abr ar, ai; \
-	PASTEMAC(chab,gets)( *alpha, ar, ai ); \
-\
-	if ( PASTEMAC(chc,eq0)( *beta ) ) \
-	{ \
-		for ( dim_t jj = 0; jj < n; ++jj ) \
-		for ( dim_t ii = 0; ii < m; ++ii ) \
-		{ \
-			ctype_abr axr, axi; \
-			ctype_ab ax; \
-			PASTEMAC(chabr,scal2s)( ar, *(ct + ii*rs_ct + jj*cs_ct), axr ); \
-			PASTEMAC(chabr,scal2s)( ai, *(ct + ii*rs_ct + jj*cs_ct), axi ); \
-			PASTEMAC(chab,sets)( axr, axi, ax ); \
-			PASTEMAC(chab,chc,copys)( ax, *(c + ii*rs_c + jj*cs_c) ); \
-		} \
-	} \
-	else \
-	{ \
-		for ( dim_t jj = 0; jj < n; ++jj ) \
-		for ( dim_t ii = 0; ii < m; ++ii ) \
-		{ \
-			ctype_abr axr, axi; \
-			ctype_ab ax; \
-			PASTEMAC(chabr,scal2s)( ar, *(ct + ii*rs_ct + jj*cs_ct), axr ); \
-			PASTEMAC(chabr,scal2s)( ai, *(ct + ii*rs_ct + jj*cs_ct), axi ); \
-			PASTEMAC(chab,sets)( axr, axi, ax ); \
-			PASTEMAC(chab,chc,chc,xpbys)( ax, *beta, *(c + ii*rs_c + jj*cs_c) ); \
-		} \
-	} \
+	bli_taxpbys_mxn \
+	( \
+	  chab,chabr,chc,chc,chc, \
+	  m, \
+	  n, \
+	  alpha, \
+	  ct, rs_ct, cs_ct, \
+	  beta, \
+	  c, rs_c, cs_c \
+	); \
 }
 
 INSERT_GENTFUNC2RO( gemm_crr, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
diff --git a/ref_kernels/ind/bli_gemm_rcc_ref.c b/ref_kernels/ind/bli_gemm_rcc_ref.c
index 67859cd4a..97424bfde 100644
--- a/ref_kernels/ind/bli_gemm_rcc_ref.c
+++ b/ref_kernels/ind/bli_gemm_rcc_ref.c
@@ -99,8 +99,9 @@ void PASTEMAC(chab,chc,opname,arch,suf) \
 		  cntx  \
 		); \
 \
-		PASTEMAC(chab,chc,chc,xpbys_mxn) \
+		bli_txpbys_mxn \
 		( \
+		  chab,chc,chc,chc, \
 		  m, n, \
 		  ct, rs_ct, cs_ct, \
 		  beta, \
diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
index 54f9900c3..10a167570 100644
--- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c
+++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c
@@ -93,7 +93,7 @@ static void PASTEMAC(chr,opname,arch,suf) \
 				ctype_r           rho11_i; \
 \
 				/* beta11 = beta11 - a10t * b01; */ \
-				PASTEMAC(ch,set0ris)( rho11_r, \
+				bli_tset0ris( ch, rho11_r, \
 				                      rho11_i ); \
 				for ( dim_t l = 0; l < n_behind; ++l ) \
 				{ \
@@ -102,36 +102,36 @@ static void PASTEMAC(chr,opname,arch,suf) \
 					ctype_r* restrict beta01_r  = b01_ri  + (l  )*rs_b2 + 0*cs_b; \
 					ctype_r* restrict beta01_i  = b01_ri  + (l  )*rs_b2 + 1*cs_b; \
 \
-					PASTEMAC(ch,axpyris)( *alpha10_r, \
-					                      *alpha10_i, \
-					                      *beta01_r, \
-					                      *beta01_i, \
-					                      rho11_r, \
-					                      rho11_i ); \
+					bli_taxpyris( ch,ch,ch,ch, *alpha10_r, \
+					                           *alpha10_i, \
+					                           *beta01_r, \
+					                           *beta01_i, \
+					                           rho11_r, \
+					                           rho11_i ); \
 				} \
-				PASTEMAC(ch,subris)( rho11_r, \
-				                     rho11_i, \
-				                     beta11c_r, \
-				                     beta11c_i ); \
+				bli_tsubris( ch,ch,ch, rho11_r, \
+				                       rho11_i, \
+				                       beta11c_r, \
+				                       beta11c_i ); \
 \
 				/* beta11 = beta11 / alpha11; */ \
 				/* NOTE: When preinversion is enabled, the INVERSE of alpha11
 				   (1.0/alpha11) is stored during packing instead alpha11 so we
 				   can multiply rather than divide. When preinversion is disabled,
 				   alpha11 is stored and division happens below explicitly. */ \
-				PASTEMAC(ch,diagop)( *alpha11_r, \
-				                     *alpha11_i, \
-				                     beta11c_r, \
-				                     beta11c_i ); \
+				PASTEMAC(t,diagop)( ch,ch,ch, *alpha11_r, \
+				                              *alpha11_i, \
+				                              beta11c_r, \
+				                              beta11c_i ); \
 \
 				/* Output final result to matrix c. */ \
-				PASTEMAC(ch,sets)(  beta11c_r, beta11c_i, *gamma11 ); \
+				bli_tsets( ch,ch,  beta11c_r, beta11c_i, *gamma11 ); \
 \
 				/* Store the local values back to b11. */ \
 				for ( dim_t d = 0; d < cs_b; ++d ) \
 				{ \
-					PASTEMAC(ch,copyris)(  beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \
-					PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \
+					bli_tcopyris( ch,ch,  beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \
+					bli_tcopyris( ch,ch, -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \
                 } \
 			} \
 		} \
@@ -176,8 +176,8 @@ static void PASTEMAC(chr,opname,arch,suf) \
 				ctype_r           rho11_i; \
 \
 				/* beta11 = beta11 - a10t * b01; */ \
-				PASTEMAC(ch,set0ris)( rho11_r, \
-				                      rho11_i ); \
+				bli_tset0ris( ch, rho11_r, \
+				                  rho11_i ); \
 				for ( dim_t l = 0; l < n_behind; ++l ) \
 				{ \
 					ctype*   restrict alpha10_ri = a10t_ri + (l  )*cs_a; \
@@ -186,38 +186,38 @@ static void PASTEMAC(chr,opname,arch,suf) \
 					ctype_r* restrict beta01_r   = b01_r   + (l  )*rs_b2; \
 					ctype_r* restrict beta01_i   = b01_i   + (l  )*rs_b2; \
 \
-					PASTEMAC(ch,axpyris)( *alpha10_r, \
-					                      *alpha10_i, \
-					                      *beta01_r, \
-					                      *beta01_i, \
-					                      rho11_r, \
-					                      rho11_i ); \
+					bli_taxpyris( ch,ch,ch,ch, *alpha10_r, \
+					                           *alpha10_i, \
+					                           *beta01_r, \
+					                           *beta01_i, \
+					                           rho11_r, \
+					                           rho11_i ); \
 				} \
-				PASTEMAC(ch,subris)( rho11_r, \
-				                     rho11_i, \
-				                     beta11c_r, \
-				                     beta11c_i ); \
+				bli_tsubris( ch,ch,ch, rho11_r, \
+				                       rho11_i, \
+				                       beta11c_r, \
+				                       beta11c_i ); \
 \
 				/* beta11 = beta11 / alpha11; */ \
 				/* NOTE: When preinversion is enabled, the INVERSE of alpha11
 				   (1.0/alpha11) is stored during packing instead alpha11 so we
 				   can multiply rather than divide. When preinversion is disabled,
 				   alpha11 is stored and division happens below explicitly. */ \
-				PASTEMAC(ch,diagop)( *alpha11_r, \
-				                     *alpha11_i, \
-				                     beta11c_r, \
-				                     beta11c_i ); \
+				PASTEMAC(t,diagop)( ch,ch,ch, *alpha11_r, \
+				                              *alpha11_i, \
+				                              beta11c_r, \
+				                              beta11c_i ); \
 \
 				/* Output final result to matrix c. */ \
-				PASTEMAC(ch,sets)( beta11c_r, \
-				                   beta11c_i, *gamma11 ); \
+				bli_tsets( ch,ch, beta11c_r, \
+				                  beta11c_i, *gamma11 ); \
 \
 				/* Store the local values back to b11. */ \
 				for ( dim_t d = 0; d < cs_b; ++d ) \
-					PASTEMAC(ch,copyris)( beta11c_r, \
-					                      beta11c_i, \
-					                      *(beta11_r + d), \
-					                      *(beta11_i + d) ); \
+					bli_tcopyris( ch,ch, beta11c_r, \
+					                     beta11c_i, \
+					                     *(beta11_r + d), \
+					                     *(beta11_i + d) ); \
 			} \
 		} \
 	} \
@@ -288,7 +288,7 @@ static void PASTEMAC(chr,opname,arch,suf) \
 				ctype_r           rho11_i; \
 \
 				/* beta11 = beta11 - a10t * b01; */ \
-				PASTEMAC(ch,set0ris)( rho11_r, \
+				bli_tset0ris( ch, rho11_r, \
 				                      rho11_i ); \
 				for ( dim_t l = 0; l < n_behind; ++l ) \
 				{ \
@@ -297,36 +297,36 @@ static void PASTEMAC(chr,opname,arch,suf) \
 					ctype_r* restrict beta21_r  = b21_ri + (l  )*rs_b2 + 0*cs_b; \
 					ctype_r* restrict beta21_i  = b21_ri + (l  )*rs_b2 + 1*cs_b; \
 \
-					PASTEMAC(ch,axpyris)( *alpha12_r, \
-					                      *alpha12_i, \
-					                      *beta21_r, \
-					                      *beta21_i, \
-					                      rho11_r, \
-					                      rho11_i ); \
+					bli_taxpyris( ch,ch,ch,ch, *alpha12_r, \
+					                           *alpha12_i, \
+					                           *beta21_r, \
+					                           *beta21_i, \
+					                           rho11_r, \
+					                           rho11_i ); \
 				} \
-				PASTEMAC(ch,subris)( rho11_r, \
-				                     rho11_i, \
-				                     beta11c_r, \
-				                     beta11c_i ); \
+				bli_tsubris( ch,ch,ch, rho11_r, \
+				                       rho11_i, \
+				                       beta11c_r, \
+				                       beta11c_i ); \
 \
 				/* beta11 = beta11 / alpha11; */ \
 				/* NOTE: When preinversion is enabled, the INVERSE of alpha11
 				   (1.0/alpha11) is stored during packing instead alpha11 so we
 				   can multiply rather than divide. When preinversion is disabled,
 				   alpha11 is stored and division happens below explicitly. */ \
-				PASTEMAC(ch,diagop)( *alpha11_r, \
-				                     *alpha11_i, \
-				                     beta11c_r, \
-				                     beta11c_i ); \
+				PASTEMAC(t,diagop)( ch,ch,ch, *alpha11_r, \
+				                              *alpha11_i, \
+				                              beta11c_r, \
+				                              beta11c_i ); \
 \
 				/* Output final result to matrix c. */ \
-				PASTEMAC(ch,sets)(  beta11c_r, beta11c_i, *gamma11 ); \
+				bli_tsets( ch,ch, beta11c_r, beta11c_i, *gamma11 ); \
 \
 				/* Store the local values back to b11. */ \
 				for ( dim_t d = 0; d < cs_b; ++d ) \
 				{ \
-					PASTEMAC(ch,copyris)(  beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \
-					PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \
+					bli_tcopyris( ch,ch,  beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \
+					bli_tcopyris( ch,ch, -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \
                 } \
 			} \
 		} \
@@ -371,7 +371,7 @@ static void PASTEMAC(chr,opname,arch,suf) \
 				ctype_r           rho11_i; \
 \
 				/* beta11 = beta11 - a10t * b01; */ \
-				PASTEMAC(ch,set0ris)( rho11_r, \
+				bli_tset0ris( ch, rho11_r, \
 				                      rho11_i ); \
 				for ( dim_t l = 0; l < n_behind; ++l ) \
 				{ \
@@ -381,38 +381,38 @@ static void PASTEMAC(chr,opname,arch,suf) \
 					ctype_r* restrict beta21_r   = b21_r   + (l  )*rs_b2; \
 					ctype_r* restrict beta21_i   = b21_i   + (l  )*rs_b2; \
 \
-					PASTEMAC(ch,axpyris)( *alpha12_r, \
-					                      *alpha12_i, \
-					                      *beta21_r, \
-					                      *beta21_i, \
-					                      rho11_r, \
-					                      rho11_i ); \
+					bli_taxpyris( ch,ch,ch,ch, *alpha12_r, \
+					                           *alpha12_i, \
+					                           *beta21_r, \
+					                           *beta21_i, \
+					                           rho11_r, \
+					                           rho11_i ); \
 				} \
-				PASTEMAC(ch,subris)( rho11_r, \
-				                     rho11_i, \
-				                     beta11c_r, \
-				                     beta11c_i ); \
+				bli_tsubris( ch,ch,ch, rho11_r, \
+				                       rho11_i, \
+				                       beta11c_r, \
+				                       beta11c_i ); \
 \
 				/* beta11 = beta11 / alpha11; */ \
 				/* NOTE: When preinversion is enabled, the INVERSE of alpha11
 				   (1.0/alpha11) is stored during packing instead alpha11 so we
 				   can multiply rather than divide. When preinversion is disabled,
 				   alpha11 is stored and division happens below explicitly. */ \
-				PASTEMAC(ch,diagop)( *alpha11_r, \
-				                     *alpha11_i, \
-				                     beta11c_r, \
-				                     beta11c_i ); \
+				PASTEMAC(t,diagop)( ch,ch,ch, *alpha11_r, \
+				                              *alpha11_i, \
+				                              beta11c_r, \
+				                              beta11c_i ); \
 \
 				/* Output final result to matrix c. */ \
-				PASTEMAC(ch,sets)( beta11c_r, \
-				                   beta11c_i, *gamma11 ); \
+				bli_tsets( ch,ch, beta11c_r, \
+				                  beta11c_i, *gamma11 ); \
 \
 				/* Store the local values back to b11. */ \
 				for ( dim_t d = 0; d < cs_b; ++d ) \
-					PASTEMAC(ch,copyris)( beta11c_r, \
-					                      beta11c_i, \
-					                      *(beta11_r + d), \
-					                      *(beta11_i + d) ); \
+					bli_tcopyris( ch,ch, beta11c_r, \
+					                     beta11c_i, \
+					                     *(beta11_r + d), \
+					                     *(beta11_i + d) ); \
 			} \
 		} \
 	} \
@@ -540,8 +540,9 @@ void PASTEMAC(chr,opname,arch,suf) \
 			ctype_r* restrict beta11_ir_r = b11_ir + i*rs_b2 + j*cs_b2 + 0*cs_b + d; \
 			ctype_r* restrict beta11_ir_i = b11_ir + i*rs_b2 + j*cs_b2 + 1*cs_b + d; \
 \
-			PASTEMAC(ch,chr,ch,xpbyris) \
+			bli_txpbyris \
 			( \
+			  ch,chr,ch,chr, \
 			  *beta11t_r, \
 			  *beta11t_i, \
 			  alpha_r, \
@@ -550,8 +551,8 @@ void PASTEMAC(chr,opname,arch,suf) \
 			  *beta11_ri_i  \
 			); \
 \
-			PASTEMAC(ch,copyris)( -*beta11_ri_i, *beta11_ri_r, \
-			                       *beta11_ir_r, *beta11_ir_i ); \
+			bli_tcopyris( ch,ch, -*beta11_ri_i, *beta11_ri_r, \
+			                      *beta11_ir_r, *beta11_ir_i ); \
 		} \
 	} \
 	else /* if ( bli_is_1r_packed( schema_b ) ) */ \
@@ -575,8 +576,9 @@ void PASTEMAC(chr,opname,arch,suf) \
 			ctype_r* restrict beta11_r  = b11_r + i*rs_b2 + j*cs_b2 + d; \
 			ctype_r* restrict beta11_i  = b11_i + i*rs_b2 + j*cs_b2 + d; \
 \
-			PASTEMAC(ch,chr,ch,xpbyris) \
+			bli_txpbyris \
 			( \
+			  ch,chr,ch,chr, \
 			  *beta11t_r, \
 			  *beta11t_i, \
 			  alpha_r, \
diff --git a/sandbox/gemmlike/bls_l3_packm_var1.c b/sandbox/gemmlike/bls_l3_packm_var1.c
index ab656a31a..b35a7bdc3 100644
--- a/sandbox/gemmlike/bls_l3_packm_var1.c
+++ b/sandbox/gemmlike/bls_l3_packm_var1.c
@@ -139,7 +139,7 @@ void bls_packm_var1
 			//  			ctype* cli = c_cast + (l  )*ldc + (i  )*incc;
 			//  			ctype* pli = p_cast + (l  )*ldp + (i  )*1;
 			//
-			//  			PASTEMAC(ch,axpyjs)( kappa_cast, *cli, *pli );
+			//  			bli_taxpyjs( ch,ch,ch,ch, kappa_cast, *cli, *pli );
 			//  		}
 			//  	}
 			//  }
@@ -152,7 +152,7 @@ void bls_packm_var1
 			//  			ctype* cli = c_cast + (l  )*ldc + (i  )*incc;
 			//  			ctype* pli = p_cast + (l  )*ldp + (i  )*1;
 			//
-			//  			PASTEMAC(ch,axpys)( kappa_cast, *cli, *pli );
+			//  			bli_taxpys( ch,ch,ch,ch, kappa_cast, *cli, *pli );
 			//  		}
 			//  	}
 			//  }
diff --git a/test/level0/Makefile b/test/level0/Makefile
new file mode 100644
index 000000000..a9168cccb
--- /dev/null
+++ b/test/level0/Makefile
@@ -0,0 +1,153 @@
+#!/bin/bash
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+#
+# Makefile
+#
+# Field G. Van Zee
+#
+# Makefile for standalone BLIS test drivers.
+#
+
+#
+# --- Makefile PHONY target definitions ----------------------------------------
+#
+
+.PHONY: all \
+        test \
+        clean cleanx  # 'test' only aliases test_l0.x and never names a real file
+
+
+#
+# --- Determine makefile fragment location -------------------------------------
+#
+
+# Comments:
+# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given.
+# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in
+#   the second case because CONFIG_NAME is not yet set.
+ifneq ($(strip $(BLIS_INSTALL_PATH)),)
+LIB_PATH   := $(BLIS_INSTALL_PATH)/lib
+INC_PATH   := $(BLIS_INSTALL_PATH)/include/blis
+SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis
+else
+DIST_PATH   := ../..
+CONFIG_NAME := $(shell grep -E "CONFIG_NAME *:=" $(DIST_PATH)/config.mk | sed 's/.*:= *//')
+LIB_PATH     = $(DIST_PATH)/lib/$(CONFIG_NAME)
+INC_PATH     = $(DIST_PATH)/include/$(CONFIG_NAME)
+SHARE_PATH  := $(DIST_PATH)
+endif
+
+
+#
+# --- Include common makefile definitions --------------------------------------
+#
+
+# Include the common makefile fragment.
+-include $(SHARE_PATH)/common.mk
+
+
+#
+# --- General build definitions ------------------------------------------------
+#
+
+TEST_SRC_PATH  := .
+TEST_OBJ_PATH  := .
+
+# Gather all local object files.
+SRC_SUFFIXES   := c cxx cpp
+TEST_OBJS      := $(foreach suf, \
+                            $(SRC_SUFFIXES), \
+                            $(sort $(patsubst $(TEST_SRC_PATH)/%.$(suf), \
+                                              $(TEST_OBJ_PATH)/%.o, \
+                                              $(wildcard $(TEST_SRC_PATH)/*.$(suf)))))
+
+# Override the values of CINCFLAGS and CXXINCFLAGS so that the flags returned
+# by get-frame-cflags-for() and get-frame-cxxflags-for() below are not
+# cluttered up with include paths needed only while building BLIS.
+CINCFLAGS      := -I$(INC_PATH)
+CXXINCFLAGS    := -I$(INC_PATH)
+CXXLANGFLAGS   := -std=c++17
+
+# Use the CFLAGS for the configuration family.
+CFLAGS         := $(call get-frame-cflags-for,$(CONFIG_NAME))
+CXXFLAGS       := $(call get-frame-cxxflags-for,$(CONFIG_NAME))
+
+# Add installed and local header paths to CFLAGS
+CFLAGS         += -I$(TEST_SRC_PATH)
+CXXFLAGS       += -I$(TEST_SRC_PATH) -DENABLE_INFO
+
+HDR_SUFFIXES   := h hpp
+HEADERS        := $(foreach suf, $(HDR_SUFFIXES), $(wildcard $(TEST_SRC_PATH)/*.$(suf))) $(INC_PATH)/blis.h
+
+
+#
+# --- Targets/rules ------------------------------------------------------------
+#
+
+all: test
+
+test: test_l0.x
+
+
+# -- Source file rules --
+
+test_%.o: test_%.c Makefile $(HEADERS)
+	$(CC) $(CFLAGS) -c $< -o $@
+
+test_%.o: test_%.cpp Makefile $(HEADERS)
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+test_%.o: test_%.cxx Makefile $(HEADERS)
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+
+# -- Executable file rules --
+
+# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS
+# on the link command line in case BLIS was configured with the BLAS
+# compatibility layer. This prevents BLIS from inadvertently getting called
+# for the BLAS routines we are trying to test with.
+
+test_l0.x: $(TEST_OBJS) $(LIBBLIS_LINK)
+	$(CXX) $(TEST_OBJS) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
+
+
+# -- Clean rules --
+
+clean: cleanx
+
+cleanx:
+	- $(RM_F) *.o *.x
+
diff --git a/frame/include/level0/bli_seti0s.h b/test/level0/test_l0.cxx
similarity index 83%
rename from frame/include/level0/bli_seti0s.h
rename to test/level0/test_l0.cxx
index 229d6b474..fd55c43cf 100644
--- a/frame/include/level0/bli_seti0s.h
+++ b/test/level0/test_l0.cxx
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -14,7 +15,7 @@
     - Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
+    - Neither the name of The University of Texas nor the names of its
       contributors may be used to endorse or promote products derived
       from this software without specific prior written permission.
 
@@ -32,13 +33,15 @@
 
 */
 
-#ifndef BLIS_SETI0S_H
-#define BLIS_SETI0S_H
+#include "test_l0.hpp"
 
-#define bli_sseti0s( a )  bli_ssetis( 0.0F, (a) )
-#define bli_dseti0s( a )  bli_dsetis( 0.0 , (a) )
-#define bli_cseti0s( a )  bli_csetis( 0.0F, (a) )
-#define bli_zseti0s( a )  bli_zsetis( 0.0 , (a) )
-
-#endif
+unit_test_registrar& get_unit_test_registrar()
+{
+	static unit_test_registrar registrar;
+	return registrar;
+}
 
+int main()
+{
+	return !!get_unit_test_registrar().run_tests();
+}
diff --git a/test/level0/test_l0.hpp b/test/level0/test_l0.hpp
new file mode 100644
index 000000000..710c592c4
--- /dev/null
+++ b/test/level0/test_l0.hpp
@@ -0,0 +1,914 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, Southern Methodist University
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_UNIT_TESTING_H
+#define BLIS_UNIT_TESTING_H
+
+#include <exception>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <unistd.h>
+#include <cassert>
+#include <signal.h>
+#include <type_traits>
+#include <functional>
+
+#include "blis.h"
+
+using unit_test_t = std::function<void()>;
+
+struct variable_printer_base
+{
+    virtual ~variable_printer_base() {}
+
+    virtual void print() const = 0;
+};
+
+struct unit_test_failure : std::exception {};
+
+struct unit_test_registrar // Owns the registered tests and the stack of currently live INFO printers.
+{
+    std::vector<unit_test_t> tests;                 // test bodies, in registration order
+    std::vector<const variable_printer_base*> vars; // printers added/removed via push_var()/pop_var()
+    // ANSI "red" escape sequence when stdout is a terminal, otherwise an empty string.
+    static const char* red()
+    {
+        #ifdef BLIS_OS_WINDOWS
+        static std::string s = _isatty(_fileno(stdout)) ? "\033[0;31m" : "";
+        #else
+        static std::string s = isatty(fileno(stdout)) ? "\033[0;31m" : ""; // \033 == ESC; "\e" is a compiler extension
+        #endif
+        return s.c_str();
+    }
+    // ANSI "green" escape sequence when stdout is a terminal, otherwise an empty string.
+    static const char* green()
+    {
+        #ifdef BLIS_OS_WINDOWS
+        static std::string s = _isatty(_fileno(stdout)) ? "\033[0;32m" : "";
+        #else
+        static std::string s = isatty(fileno(stdout)) ? "\033[0;32m" : "";
+        #endif
+        return s.c_str();
+    }
+    // ANSI "reset attributes" escape sequence when stdout is a terminal, otherwise an empty string.
+    static const char* normal()
+    {
+        #ifdef BLIS_OS_WINDOWS
+        static std::string s = _isatty(_fileno(stdout)) ? "\033[0m" : "";
+        #else
+        static std::string s = isatty(fileno(stdout)) ? "\033[0m" : "";
+        #endif
+        return s.c_str();
+    }
+    // Append a test and return its zero-based index in the registration order.
+    size_t register_test(unit_test_t test)
+    {
+        tests.push_back(test);
+        return tests.size()-1;
+    }
+    // Run every registered test and print a summary; returns true if at least one test failed.
+    bool run_tests()
+    {
+        auto failed = 0;
+        auto total = 0;
+
+        for (auto& test : tests)
+        {
+            try
+            {
+                test();
+            }
+            catch (const unit_test_failure&) // thrown by fail(); any other exception propagates
+            {
+                failed++;
+            }
+
+            total++;
+        }
+        const auto denom = total ? total : 1; // keeps the percentages finite when no tests are registered
+        printf("\n");
+        printf("Total tests: %d\n", total);
+        printf("%sPassed: %d (%.1f%%)%s\n", green(), total-failed, 100.0*(total-failed)/denom, normal());
+        if (failed)
+            printf("%sFailed: %d (%.1f%%)%s\n\n", red(), failed, 100.0*failed/denom, normal());
+
+        return failed != 0;
+    }
+    // Make a printer visible to fail(); printers are expected to be removed in reverse order.
+    void push_var(const variable_printer_base* var)
+    {
+        vars.push_back(var);
+    }
+    // Remove the most recently pushed printer; the assert enforces strict LIFO use.
+    void pop_var(const variable_printer_base* var)
+    {
+        assert(vars.back() == var);
+        vars.pop_back();
+    }
+    // Report a failed condition: print all live printers and the condition text, then throw.
+    [[noreturn]]
+    void fail(const char* cond)
+    {
+        printf("%sFAILURE%s\n\n", red(), normal());
+
+        for (auto& var : vars)
+            var->print();
+
+        printf("\nAssertion failed: %s\n\n", cond);
+        #ifdef SIGTRAP /* not defined by every <signal.h>, e.g. on Windows */
+        signal(SIGTRAP, [](int) {}); // no-op handler, so raising the signal does not terminate the run
+        raise(SIGTRAP);              // NOTE(review): presumably a stop point for an attached debugger -- confirm
+        #endif
+        throw unit_test_failure();
+    }
+};
+
+unit_test_registrar& get_unit_test_registrar();
+
+inline size_t register_unit_test(unit_test_t test)
+{
+    return get_unit_test_registrar().register_test(test);
+}
+
+template <typename T>
+struct variable_printer : variable_printer_base
+{
+    const char* message{};
+    T var{};
+
+    variable_printer()
+    {
+        get_unit_test_registrar().push_var(this);
+    }
+
+    virtual ~variable_printer() override
+    {
+        get_unit_test_registrar().pop_var(this);
+    }
+
+    variable_printer& operator<<(const char* m)
+    {
+        message = m;
+        return *this;
+    }
+
+    variable_printer& operator<<(const T& v)
+    {
+        var = v;
+        return *this;
+    }
+
+    virtual void print() const final override
+    {
+        std::cout << message << var << std::endl;
+    }
+};
+
+template <> // Printer used when an INFO(...) expression streams only C strings (no value of another type).
+struct variable_printer<void> : variable_printer_base
+{
+    std::string message; // every streamed piece, concatenated in order
+
+    variable_printer()
+    {
+        get_unit_test_registrar().push_var(this); // printed by fail() for as long as this object lives
+    }
+
+    virtual ~variable_printer() override
+    {
+        get_unit_test_registrar().pop_var(this);
+    }
+    // Append instead of overwrite: INFO("label: " << "text") streams two C strings into this object.
+    variable_printer& operator<<(const char* m)
+    {
+        message += m;
+        return *this;
+    }
+
+    virtual void print() const final override
+    {
+        std::cout << message << std::endl;
+    }
+};
+
+template <typename T>
+struct variable_printer_helper
+{
+    using type = variable_printer<T>;
+
+    template <typename U>
+    variable_printer_helper<U> operator<<(U) const;
+
+    variable_printer_helper<void> operator<<(const char*) const;
+};
+
+#define VARIABLE_PRINTER(...) typename decltype(variable_printer_helper<void>{} << __VA_ARGS__)::type
+
+#define VAR_NAME_(line) variable_printer_##line
+#define VAR_NAME(line) VAR_NAME_(line)
+
+#define INFO_(id, ...) \
+VARIABLE_PRINTER(__VA_ARGS__) VAR_NAME(id); \
+VAR_NAME(id) << __VA_ARGS__;
+
+#ifdef ENABLE_INFO
+#define INFO(...) INFO_(__COUNTER__, __VA_ARGS__)
+#else
+#define INFO(...)
+#endif
+
+#define TEST_NAME_(line,name) unit_test_##name##_##line
+#define TEST_NAME(line,name) TEST_NAME_(line,name)
+
+#define TEST_ID_(line,name) unit_test_id_##name##_##line
+#define TEST_ID(line,name) TEST_ID_(line,name)
+
+#define TEST_CASE_(id,name) \
+extern "C" void TEST_NAME(id,name)(); \
+auto TEST_ID(id,name) = register_unit_test(TEST_NAME(id,name)); \
+void TEST_NAME(id,name)()
+#define TEST_CASE(name) TEST_CASE_(__LINE__,name)
+
+#define REQUIRE(cond) \
+do { \
+    if ( !__builtin_expect( !!(cond), 1 ) ) \
+    { \
+        get_unit_test_registrar().fail( #cond ); \
+    } \
+} while (0)
+
+#define FAIL(...) \
+do { \
+    INFO(__VA_ARGS__); \
+    REQUIRE(false); \
+} while (0)
+
+class Approx // A double that compares equal to any value within an absolute margin of it.
+{
+    protected:
+        double target_;
+        double margin_ = 0; // absolute tolerance; the default of 0 demands exact equality
+
+    public:
+        Approx(double target) : target_(target) {}
+        // Set the absolute tolerance; returns *this so the call can be chained.
+        Approx& margin(double value)
+        {
+            margin_ = value;
+            return *this;
+        }
+        // True when |other - target_| <= margin_.
+        bool operator==(double other) const
+        {
+            return std::abs(other - target_) <= margin_;
+        }
+        // Mirror image of the member operator, so `value == Approx(...)` also works.
+        friend bool operator==(double lhs, const Approx& rhs)
+        {
+            return rhs == lhs;
+        }
+};
+
+#define UNIT_TEST1( ch1, opname ) \
+TEST_CASE(ch1##opname) \
+{ \
+    INFO("Type character 1: " << #ch1); \
+    printf("Testing: %s...", STRINGIFY_INT(ch1##opname));
+
+#define UNIT_TEST2( ch1, ch2, opname ) \
+TEST_CASE(ch1##ch2##opname) \
+{ \
+    INFO("Type character 1: " << #ch1); \
+    INFO("Type character 2: " << #ch2); \
+    printf("Testing: %s...", STRINGIFY_INT(ch1##ch2##opname));
+
+#define UNIT_TEST3( ch1, ch2, ch3, opname ) \
+TEST_CASE(ch1##ch2##ch3##opname) \
+{ \
+    INFO("Type character 1: " << #ch1); \
+    INFO("Type character 2: " << #ch2); \
+    INFO("Type character 3: " << #ch3); \
+    printf("Testing: %s...", STRINGIFY_INT(ch1##ch2##ch3##opname));
+
+#define UNIT_TEST4( ch1, ch2, ch3, ch4, opname ) \
+TEST_CASE(ch1##ch2##ch3##ch4##opname) \
+{ \
+    INFO("Type character 1: " << #ch1); \
+    INFO("Type character 2: " << #ch2); \
+    INFO("Type character 3: " << #ch3); \
+    INFO("Type character 4: " << #ch4); \
+    printf("Testing: %s...", STRINGIFY_INT(ch1##ch2##ch3##ch4##opname));
+
+#define UNIT_TEST5( ch1, ch2, ch3, ch4, ch5, opname ) \
+TEST_CASE(ch1##ch2##ch3##ch4##ch5##opname) \
+{ \
+    INFO("Type character 1: " << #ch1); \
+    INFO("Type character 2: " << #ch2); \
+    INFO("Type character 3: " << #ch3); \
+    INFO("Type character 4: " << #ch4); \
+    INFO("Type character 5: " << #ch5); \
+    printf("Testing: %s...", STRINGIFY_INT(ch1##ch2##ch3##ch4##ch5##opname));
+
+#define UNIT_TEST_BODY( ... ) \
+    __VA_ARGS__; \
+    printf("%sPASS%s\n", unit_test_registrar::green(), unit_test_registrar::normal()); \
+}
+
+#define UNIT_TEST_SELECTOR_( ARG1, ARG2, ARG3, ARG4, ARG5, ARG6, ARG7, ... ) ARG7
+
+#define UNIT_TEST_SELECTOR( ... ) \
+UNIT_TEST_SELECTOR_( __VA_ARGS__, \
+                     UNIT_TEST5, \
+                     UNIT_TEST4, \
+                     UNIT_TEST3, \
+                     UNIT_TEST2, \
+                     UNIT_TEST1)
+
+#define UNIT_TEST( ... ) UNIT_TEST_SELECTOR(__VA_ARGS__)(__VA_ARGS__) UNIT_TEST_BODY
+
+enum // Bit flags for the `mask` argument of test_values(); 0.439 is always part of the result.
+{
+    BLIS_TEST_ZERO      = 0x01, // also include 0
+    BLIS_TEST_NEGATIVE  = 0x02, // also include -0.563 (and -INFINITY when BLIS_TEST_INFINITY is set)
+    BLIS_TEST_INFINITY  = 0x04, // also include INFINITY
+    BLIS_TEST_NAN       = 0x08, // also include NAN
+    BLIS_TEST_DEFAULT   = ~BLIS_TEST_INFINITY // every bit except INFINITY: ZERO, NEGATIVE and NAN are all on
+};
+
+template <typename T>
+struct is_complex : std::false_type {};
+
+template <>
+struct is_complex<scomplex> : std::true_type {};
+
+template <>
+struct is_complex<dcomplex> : std::true_type {};
+
+template <typename T>
+struct is_real : std::integral_constant<bool,!is_complex<T>::value> {};
+
+template <typename T> struct make_complex;
+
+template <> struct make_complex<float   > { using type = scomplex; };
+template <> struct make_complex<double  > { using type = dcomplex; };
+template <> struct make_complex<scomplex> { using type = scomplex; };
+template <> struct make_complex<dcomplex> { using type = dcomplex; };
+
+template <typename T>
+using make_complex_t = typename make_complex<T>::type;
+
+template <typename T> struct make_real;
+
+template <> struct make_real<float   > { using type = float; };
+template <> struct make_real<double  > { using type = double; };
+template <> struct make_real<scomplex> { using type = float; };
+template <> struct make_real<dcomplex> { using type = double; };
+
+template <typename T>
+using make_real_t = typename make_real<T>::type;
+
+template <typename T, bool Cond>
+struct make_complex_if : std::conditional<Cond,make_complex_t<T>,make_real_t<T>> {};
+
+template <typename T, bool Cond>
+using make_complex_if_t = typename make_complex_if<T,Cond>::type;
+
+template <typename T>
+struct real_imag_part
+{
+    real_imag_part& operator=(T) { return *this; }
+
+    operator T() const { return T(); }
+};
+
+template <typename T>
+std::enable_if_t<std::is_arithmetic<typename std::remove_cv<T>::type>::value,T&> real(T& x) { return x; }
+
+template <typename T>
+std::enable_if_t<std::is_arithmetic<T>::value,real_imag_part<T>> imag(T) { return {}; }
+
+inline float& real(scomplex& x) { return x.real; }
+
+inline float& imag(scomplex& x) { return x.imag; }
+
+inline double& real(dcomplex& x) { return x.real; }
+
+inline double& imag(dcomplex& x) { return x.imag; }
+
+inline const float& real(const scomplex& x) { return x.real; }
+
+inline const float& imag(const scomplex& x) { return x.imag; }
+
+inline const double& real(const dcomplex& x) { return x.real; }
+
+inline const double& imag(const dcomplex& x) { return x.imag; }
+
+template <typename T>
+std::enable_if_t<std::is_arithmetic<T>::value,T> norm(T x) { return x*x; }
+
+inline float norm(const scomplex& x) { return x.real*x.real + x.imag*x.imag; }
+
+inline double norm(const dcomplex& x) { return x.real*x.real + x.imag*x.imag; }
+
+template <typename T>
+std::enable_if_t<std::is_arithmetic<T>::value,T> absolute(T x) { return std::abs(x); }
+
+inline float absolute(const scomplex& x) { return std::hypot(x.real, x.imag); }
+
+inline double absolute(const dcomplex& x) { return std::hypot(x.real, x.imag); }
+
+template <typename T>
+std::enable_if_t<std::is_arithmetic<T>::value,T> square_root(T x) { return std::sqrt(x); }
+
+template <typename T, typename U, typename=void>
+struct convert_impl;
+
+template <typename T, typename U>
+struct convert_impl<T, U, std::enable_if_t<is_real<T>::value && is_real<U>::value>>
+{
+    void operator()(T x, U& y) const { y = x; }
+};
+
+template <typename T, typename U>
+struct convert_impl<T, U, std::enable_if_t<is_real<T>::value && is_complex<U>::value>>
+{
+    void operator()(T x, U& y) const { y.real = x; y.imag = 0; }
+};
+
+template <typename T, typename U>
+struct convert_impl<T, U, std::enable_if_t<is_complex<T>::value && is_real<U>::value>>
+{
+    void operator()(T x, U& y) const { y = x.real; }
+};
+
+template <typename T, typename U>
+struct convert_impl<T, U, std::enable_if_t<is_complex<T>::value && is_complex<U>::value>>
+{
+    void operator()(T x, U& y) const { y.real = x.real; y.imag = x.imag; }
+};
+
+template <typename U, typename T>
+U convert(T x)
+{
+    U y;
+    convert_impl<T,U>{}(x,y);
+    return y;
+}
+
+template <typename U, typename T>
+auto convert_prec(T x) -> make_complex_if_t<U,is_complex<T>::value>
+{
+    return convert<make_complex_if_t<U,is_complex<T>::value>>(x);
+}
+
+#define COMPLEX_MATH_OPS(rtype, ctype) \
+\
+inline bool operator==(rtype x, ctype y) \
+{ \
+    return x == y.real && y.imag == 0; \
+} \
+\
+inline bool operator==(ctype x, rtype y) \
+{ \
+    return y == x.real && x.imag == 0; \
+} \
+\
+inline bool operator==(ctype x, ctype y) \
+{ \
+    return x.real == y.real && \
+           x.imag == y.imag; \
+ } \
+ \
+inline ctype operator-(ctype x) \
+{ \
+    return {-x.real, -x.imag}; \
+} \
+\
+inline ctype operator+(rtype x, ctype y) \
+{ \
+    return {x+y.real, y.imag}; \
+} \
+\
+inline ctype operator+(ctype x, rtype y) \
+{ \
+    return {y+x.real, x.imag}; \
+} \
+\
+inline ctype operator+(ctype x, ctype y) \
+{ \
+    return {x.real+y.real, x.imag+y.imag}; \
+} \
+\
+inline ctype operator-(rtype x, ctype y) \
+{ \
+    return {x-y.real, -y.imag}; \
+} \
+\
+inline ctype operator-(ctype x, rtype y) \
+{ \
+    return {x.real-y, x.imag}; \
+} \
+\
+inline ctype operator-(ctype x, ctype y) \
+{ \
+    return {x.real-y.real, x.imag-y.imag}; \
+} \
+\
+inline ctype operator*(rtype x, ctype y) \
+{ \
+    return {x*y.real, x*y.imag}; \
+} \
+\
+inline ctype operator*(ctype x, rtype y) \
+{ \
+    return {y*x.real, y*x.imag}; \
+} \
+\
+inline ctype operator*(ctype x, ctype y) \
+{ \
+    return {x.real*y.real - x.imag*y.imag, \
+            x.real*y.imag + x.imag*y.real}; \
+} \
+\
+inline ctype operator/(rtype x, ctype y) \
+{ \
+    auto scale = std::max(std::abs(y.real), std::abs(y.imag)); \
+    auto n = std::ilogb(scale); \
+    auto yrs = std::scalbn(y.real, -n); \
+    auto yis = std::scalbn(y.imag, -n); \
+    auto denom = y.real*yrs + y.imag*yis; \
+    return {x*yrs/denom, -x*yis/denom}; \
+} \
+\
+inline ctype operator/(ctype x, rtype y) \
+{ \
+    return {x.real/y, x.imag/y}; \
+} \
+\
+inline ctype operator/(ctype x, ctype y) \
+{ \
+    auto scale = std::max(std::abs(y.real), std::abs(y.imag)); \
+    auto n = std::ilogb(scale); \
+    auto yrs = std::scalbn(y.real, -n); \
+    auto yis = std::scalbn(y.imag, -n); \
+    auto denom = y.real*yrs + y.imag*yis; \
+    return {(x.real*yrs + x.imag*yis)/denom, \
+            (x.imag*yrs - x.real*yis)/denom}; \
+}
+
+COMPLEX_MATH_OPS(float,  scomplex);
+COMPLEX_MATH_OPS(double, dcomplex);
+
+template <typename T>
+std::enable_if_t<is_real<T>::value,T> conj(T x) { return x; }
+
+template <typename T>
+std::enable_if_t<is_complex<T>::value,T> conj(const T& x) { return {x.real, -x.imag}; }
+
+template <typename T>
+std::enable_if_t<is_complex<T>::value,T> swapri(const T& x) { return {x.imag, x.real}; }
+
+inline bool bli_isnan( float x ) { return bli_sisnan( x ); }
+
+inline bool bli_isnan( double x ) { return bli_disnan( x ); }
+
+inline bool bli_isinf( float x ) { return bli_sisinf( x ); }
+
+inline bool bli_isinf( double x ) { return bli_disinf( x ); }
+
+template <typename C, typename T>
+std::enable_if_t<is_real<T>::value> check(T x, T y)
+{
+    auto tol = 8*std::numeric_limits<make_real_t<C>>::epsilon();
+    INFO("x: " << x);
+    INFO("y: " << y);
+    INFO("|x-y|: " << std::abs(x-y));
+    INFO("eps: " << tol);
+    if ( bli_isnan( x ) || bli_isnan( y ) )
+        REQUIRE( bli_isnan( x ) == bli_isnan( y ) );
+    else if ( bli_isinf( x ) || bli_isinf( y ) )
+        REQUIRE( x == y );
+    else
+        REQUIRE( x == Approx(y).margin(tol) );
+}
+
+template <typename C, typename T>
+std::enable_if_t<is_complex<T>::value> check(const T& x, const T& y)
+{
+    {
+        INFO("Real part:");
+        check<C>( x.real, y.real );
+    }
+    {
+        INFO("Imag part:");
+        check<C>( x.imag, y.imag );
+    }
+}
+
+template <typename T>
+std::enable_if_t<is_real<T>::value,std::vector<T>> test_values(int mask = BLIS_TEST_DEFAULT)
+{
+    std::vector<T> vals{0.439};
+
+    if (mask & BLIS_TEST_NEGATIVE)
+        vals.push_back(-0.563);
+
+    if (mask & BLIS_TEST_ZERO)
+        vals.push_back(0);
+
+    if (mask & BLIS_TEST_INFINITY)
+    {
+        vals.push_back(INFINITY);
+        if (mask & BLIS_TEST_NEGATIVE)
+            vals.push_back(-INFINITY);
+    }
+
+    if (mask & BLIS_TEST_NAN)
+        vals.push_back(NAN);
+
+    return vals;
+}
+
+template <typename T>
+std::enable_if_t<is_complex<T>::value,std::vector<T>> test_values(int mask = BLIS_TEST_DEFAULT)
+{
+    auto real_vals = test_values<make_real_t<T>>(mask);
+    std::vector<T> vals;
+    for (auto& r : real_vals)
+    for (auto& i : real_vals)
+        vals.push_back({r, i});
+    return vals;
+}
+
+template <typename T>
+std::enable_if_t<is_complex<T>::value,std::ostream&> operator<<(std::ostream& os, const T& val)
+{
+    return os << '(' << val.real << ", " << val.imag << ')';
+}
+
+template <size_t M, size_t N, typename T>
+std::array<std::array<T,N>,M> tile(const T& val = T())
+{
+    std::array<std::array<T,N>,M> ret;
+    for (size_t i = 0;i < M;i++)
+    for (size_t j = 0;j < N;j++)
+        ret[i][j] = val;
+    return ret;
+}
+
+template <size_t M, size_t N, typename T>
+std::array<std::array<T,N>,M> conj(const std::array<std::array<T,N>,M>& x)
+{
+    std::array<std::array<T,N>,M> ret;
+    for (size_t i = 0;i < M;i++)
+    for (size_t j = 0;j < N;j++)
+        ret[i][j] = conj(x[i][j]);
+    return ret;
+}
+
+template <size_t M, size_t N, typename T>
+std::array<std::array<make_real_t<T>,N>,M> real(const std::array<std::array<T,N>,M>& x)
+{
+    std::array<std::array<make_real_t<T>,N>,M> ret;
+    for (size_t i = 0;i < M;i++)
+    for (size_t j = 0;j < N;j++)
+        ret[i][j] = real(x[i][j]);
+    return ret;
+}
+
+template <size_t M, size_t N, typename T>
+std::array<std::array<make_real_t<T>,N>,M> imag(const std::array<std::array<T,N>,M>& x)
+{
+    std::array<std::array<make_real_t<T>,N>,M> ret;
+    for (size_t i = 0;i < M;i++)
+    for (size_t j = 0;j < N;j++)
+        ret[i][j] = imag(x[i][j]);
+    return ret;
+}
+
+template <size_t D, size_t M, size_t N, typename T>
+std::enable_if_t<!is_complex<T>::value,std::array<std::array<T,D*N>,M>>
+bcast(const std::array<std::array<T,N>,M>& x)
+{
+    std::array<std::array<T,D*N>,M> ret;
+    for (size_t d = 0;d < D;d++)
+    for (size_t i = 0;i < M;i++)
+    for (size_t j = 0;j < N;j++)
+        ret[i][d + j*D] = x[i][j];
+    return ret;
+}
+
+template <size_t D, size_t M, size_t N, typename T> // complex overload: each column is replicated D times
+std::enable_if_t<is_complex<T>::value,std::array<std::array<T,D*N>,M>>
+bcast(const std::array<std::array<T,N>,M>& x)
+{
+    std::array<std::array<make_real_t<T>,2*D*N>,M> ret_r; // each row laid out as 2*D*N real scalars
+    std::array<std::array<T,D*N>,M> ret;
+    for (size_t d = 0;d < D;d++)
+    for (size_t i = 0;i < M;i++)
+    for (size_t j = 0;j < N;j++)
+    {
+        ret_r[i][d + 0*D + j*2*D] = real(x[i][j]); // element j: D consecutive copies of the real part ...
+        ret_r[i][d + 1*D + j*2*D] = imag(x[i][j]); // ... immediately followed by D copies of the imaginary part
+    }
+    for (size_t i = 0;i < M;i++)
+    for (size_t j = 0;j < D*N;j++)
+    {
+        real(ret[i][j]) = ret_r[i][j*2+0]; // repack adjacent real scalars pairwise into T; for D > 1
+        imag(ret[i][j]) = ret_r[i][j*2+1]; // a T therefore need not hold one element's (real, imag) pair
+    }
+    return ret;
+}
+
+struct dense_cond // predicate that selects every element; used as the cond argument of axpbys_mxn
+{
+    bool operator()(dim_t, dim_t) const { return true; } // indices are ignored
+};
+
+constexpr dense_cond dense{}; // explicit {}: default-initializing a const object of a class without a user-provided default ctor is ill-formed prior to the CWG 253 resolution
+
+struct is_below // index predicate parameterized by a diagonal offset
+{
+    doff_t diagoff; // diagonal offset the predicate compares against
+
+    is_below(doff_t d) : diagoff(d) {}
+
+    bool operator()(dim_t i, dim_t j) const { return j-i <= diagoff; } // true when j-i <= diagoff (inclusive of the diagonal itself)
+};
+
+struct is_above // index predicate parameterized by a diagonal offset
+{
+    doff_t diagoff; // diagonal offset the predicate compares against
+
+    is_above(doff_t d) : diagoff(d) {}
+
+    bool operator()(dim_t i, dim_t j) const { return j-i >= diagoff; } // true when j-i >= diagoff (inclusive of the diagonal itself)
+};
+
+template <typename C, typename T, size_t M, size_t N> // matrix overload: compares two M x N nested arrays element by element
+void check(const std::array<std::array<T,N>,M>& x,
+           const std::array<std::array<T,N>,M>& y)
+{
+    for (size_t i = 0;i < M;i++)
+    for (size_t j = 0;j < N;j++)
+    {
+        INFO("i = " << i); // record the element position so a failing comparison reports where it happened
+        INFO("j = " << j);
+        check<C>(x[i][j], y[i][j]); // per-element check with the same C; NOTE(review): C is presumably the computation-precision type - scalar check is defined outside this chunk, confirm
+    }
+}
+
+template <typename C, int Transpose, typename A, typename X, typename B, typename Y, size_t M, size_t N> // reference y := a*x + b*y over the elements selected by cond
+void axpbys_mxn(const A& a, const std::array<std::array<X,N>,M>& x,
+                const B& b,       std::array<std::array<Y,N>,M>& y, const std::function<bool(size_t,size_t)>& cond)
+{
+    for (size_t i = 0;i < M;i++)
+    for (size_t j = 0;j < N;j++)
+    if (Transpose == BLIS_NO_TRANSPOSE ? cond(i, j) : cond(j, i)) // any other Transpose value hands the predicate swapped indices; elements failing cond are left untouched
+    {
+        if (real(b) == 0 && imag(b) == 0) // b == 0: overwrite y[i][j] without reading its previous value
+            y[i][j] = convert<Y>(convert_prec<C>(a) *
+                                 convert_prec<C>(x[i][j]));
+        else // general case: every operand goes through convert_prec<C> before the arithmetic, result converted to Y
+            y[i][j] = convert<Y>((convert_prec<C>(a) *
+                                  convert_prec<C>(x[i][j])) +
+                                 (convert_prec<C>(b) *
+                                  convert_prec<C>(y[i][j])));
+    }
+}
+
+namespace std // NOTE(review): presumably placed in std so argument-dependent lookup finds it for std::array operands (e.g. from INFO streaming) - confirm; the standard does not sanction adding such overloads to std
+{
+
+template <typename T, size_t M, size_t N>
+std::ostream& operator<<(std::ostream& os, const std::array<std::array<T,N>,M>& x)
+{
+    for (size_t i = 0;i < M;i++)
+    for (size_t j = 0;j < N;j++)
+        os << '[' << i << "][" << j << "]: " << x[i][j] << std::endl; // one "[i][j]: value" line per element, flushed each time by std::endl
+    return os;
+}
+
+} // namespace std
+
+#define BLIS_FOR_ALL_TYPES0(macro, ...) macro(__VA_ARGS__);
+
+#define BLIS_FOR_TYPES_1R(...) \
+BLIS_FOR_ALL_TYPES0(__VA_ARGS__, float, s) \
+BLIS_FOR_ALL_TYPES0(__VA_ARGS__, double, d)
+
+#define BLIS_FOR_TYPES_1C(...) \
+BLIS_FOR_ALL_TYPES0(__VA_ARGS__, scomplex, c) \
+BLIS_FOR_ALL_TYPES0(__VA_ARGS__, dcomplex, z)
+
+#define BLIS_FOR_TYPES_1RC(...) \
+BLIS_FOR_TYPES_1R(__VA_ARGS__) \
+BLIS_FOR_TYPES_1C(__VA_ARGS__)
+
+#define BLIS_FOR_ALL_TYPES1(type, ...) PASTECH(BLIS_FOR_TYPES_1, type)(__VA_ARGS__)
+
+#define BLIS_FOR_TYPES_2R(...) \
+BLIS_FOR_ALL_TYPES1(__VA_ARGS__, float, s) \
+BLIS_FOR_ALL_TYPES1(__VA_ARGS__, double, d)
+
+#define BLIS_FOR_TYPES_2C(...) \
+BLIS_FOR_ALL_TYPES1(__VA_ARGS__, scomplex, c) \
+BLIS_FOR_ALL_TYPES1(__VA_ARGS__, dcomplex, z)
+
+#define BLIS_FOR_TYPES_2RC(...) \
+BLIS_FOR_TYPES_2R(__VA_ARGS__) \
+BLIS_FOR_TYPES_2C(__VA_ARGS__)
+
+#define BLIS_FOR_ALL_TYPES2(type, ...) PASTECH(BLIS_FOR_TYPES_2, type)(__VA_ARGS__)
+
+#define BLIS_FOR_TYPES_3R(...) \
+BLIS_FOR_ALL_TYPES2(__VA_ARGS__, float, s) \
+BLIS_FOR_ALL_TYPES2(__VA_ARGS__, double, d)
+
+#define BLIS_FOR_TYPES_3C(...) \
+BLIS_FOR_ALL_TYPES2(__VA_ARGS__, scomplex, c) \
+BLIS_FOR_ALL_TYPES2(__VA_ARGS__, dcomplex, z)
+
+#define BLIS_FOR_TYPES_3RC(...) \
+BLIS_FOR_TYPES_3R(__VA_ARGS__) \
+BLIS_FOR_TYPES_3C(__VA_ARGS__)
+
+#define BLIS_FOR_ALL_TYPES3(type, ...) PASTECH(BLIS_FOR_TYPES_3, type)(__VA_ARGS__)
+
+#define BLIS_FOR_TYPES_4R(...) \
+BLIS_FOR_ALL_TYPES3(__VA_ARGS__, float, s) \
+BLIS_FOR_ALL_TYPES3(__VA_ARGS__, double, d)
+
+#define BLIS_FOR_TYPES_4C(...) \
+BLIS_FOR_ALL_TYPES3(__VA_ARGS__, scomplex, c) \
+BLIS_FOR_ALL_TYPES3(__VA_ARGS__, dcomplex, z)
+
+#define BLIS_FOR_TYPES_4RC(...) \
+BLIS_FOR_TYPES_4R(__VA_ARGS__) \
+BLIS_FOR_TYPES_4C(__VA_ARGS__)
+
+#define BLIS_FOR_ALL_TYPES4(type, ...) PASTECH(BLIS_FOR_TYPES_4, type)(__VA_ARGS__)
+
+#define BLIS_FOR_TYPES_5R(...) \
+BLIS_FOR_ALL_TYPES4(__VA_ARGS__, float, s) \
+BLIS_FOR_ALL_TYPES4(__VA_ARGS__, double, d)
+
+#define BLIS_FOR_TYPES_5C(...) \
+BLIS_FOR_ALL_TYPES4(__VA_ARGS__, scomplex, c) \
+BLIS_FOR_ALL_TYPES4(__VA_ARGS__, dcomplex, z)
+
+#define BLIS_FOR_TYPES_5RC(...) \
+BLIS_FOR_TYPES_5R(__VA_ARGS__) \
+BLIS_FOR_TYPES_5C(__VA_ARGS__)
+
+#define BLIS_FOR_ALL_TYPES5(type, ...) PASTECH(BLIS_FOR_TYPES_5, type)(__VA_ARGS__)
+
+#define INSERT_GENTFUNC_MIX1(t1, opname) \
+BLIS_FOR_ALL_TYPES1(t1, GENTFUNC, opname)
+
+#define INSERT_GENTFUNC_MIX2(t1, t2, opname) \
+BLIS_FOR_ALL_TYPES2(t1, t2, GENTFUNC, opname)
+
+#define INSERT_GENTFUNC_MIX3(t1, t2, t3, opname) \
+BLIS_FOR_ALL_TYPES3(t1, t2, t3, GENTFUNC, opname)
+
+#define INSERT_GENTFUNC_MIX4(t1, t2, t3, t4, opname) \
+BLIS_FOR_ALL_TYPES4(t1, t2, t3, t4, GENTFUNC, opname)
+
+#define INSERT_GENTFUNC_MIX5(t1, t2, t3, t4, t5, opname) \
+BLIS_FOR_ALL_TYPES5(t1, t2, t3, t4, t5, GENTFUNC, opname)
+
+#endif
+
diff --git a/frame/include/level0/ri/bli_sqrt2ris.h b/test/level0/test_tabsq2s.cxx
similarity index 55%
rename from frame/include/level0/ri/bli_sqrt2ris.h
rename to test/level0/test_tabsq2s.cxx
index 06fbe7289..59fcad91f 100644
--- a/frame/include/level0/ri/bli_sqrt2ris.h
+++ b/test/level0/test_tabsq2s.cxx
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -14,7 +15,7 @@
     - Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
+    - Neither the name of The University of Texas nor the names of its
       contributors may be used to endorse or promote products derived
       from this software without specific prior written permission.
 
@@ -32,64 +33,54 @@
 
 */
 
-#ifndef BLIS_SQRT2RIS_H
-#define BLIS_SQRT2RIS_H
+#include "test_l0.hpp"
 
-// sqrt2ris
+/******************************************************************************
+ *
+ * absq2s
+ *
+ *****************************************************************************/
 
-#define bli_ssqrt2ris( xr, xi, ar, ai ) \
-{ \
-	(ar)       = sqrtf( (xr) ); \
-}
-
-#define bli_dsqrt2ris( xr, xi, ar, ai ) \
-{ \
-	(ar)       = sqrt( (xr) ); \
-}
-
-#define bli_csqrt2ris( xr, xi, ar, ai ) \
-{ \
-	float  s   = bli_fmaxabs( (xr), (xi) ); \
-	float  mag; \
-	if ( s == 0.0F ) mag = 0.0F; \
-	else \
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
 	{ \
-		mag = sqrtf( s ) * \
-		      sqrtf( ( (xr) / s ) * (xr) + \
-		             ( (xi) / s ) * (xi) ); \
-	} \
+		auto y0 = convert<ctypey>( norm( convert_prec<ctypec>( x ) ) ); \
 \
-	(ar)       = sqrtf( ( mag + (xr) ) / 2.0F ); \
-	(ai)       = sqrtf( ( mag - (xi) ) / 2.0F ); \
-}
-
-#define bli_zsqrt2ris( xr, xi, ar, ai ) \
-{ \
-	double s   = bli_fmaxabs( (xr), (xi) ); \
-	double mag; \
-	if ( s == 0.0 ) mag = 0.0; \
-	else \
-	{ \
-		mag = sqrt( s ) * \
-		      sqrt( ( (xr) / s ) * (xr) + \
-		            ( (xi) / s ) * (xi) ); \
-	} \
+		ctypey y; \
+		bli_tabsq2s( chx,chy,chc, x, y ); \
 \
-	(ar)       = sqrt( ( mag + (xr) ) / 2.0 ); \
-	(ai)       = sqrt( ( mag - (xi) ) / 2.0 ); \
-}
-
-#define bli_scsqrt2ris( xr, xi, ar, ai ) \
-{ \
-	(ar)       = sqrtf( (xr) ); \
-	(ai)       = 0.0F; \
-}
+		INFO( "x:        " << x ); \
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
 
-#define bli_dzsqrt2ris( xr, xi, ar, ai ) \
-{ \
-	(ar)       = sqrt( (xr) ); \
-	(ai)       = 0.0; \
-}
+INSERT_GENTFUNC_MIX3( RC, RC, R, absq2s )
 
-#endif
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( norm( convert_prec<ctypec>( x ) ) ); \
+\
+		ctypey y; \
+		bli_tabsq2ris( chx,chy,chc, \
+		               real( x ), imag( x ), \
+		               real( y ), imag( y ) ); \
+\
+		INFO( "x:        " << x ); \
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
 
+INSERT_GENTFUNC_MIX3( RC, RC, R, absq2ris )
diff --git a/test/level0/test_tabval2s.cxx b/test/level0/test_tabval2s.cxx
new file mode 100644
index 000000000..0209131c6
--- /dev/null
+++ b/test/level0/test_tabval2s.cxx
@@ -0,0 +1,86 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * abval2s
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( absolute( convert_prec<ctypec>( x ) ) ); \
+\
+		ctypey y; \
+		bli_tabval2s( chx,chy,chc, x, y ); \
+\
+		INFO( "x:        " << x ); \
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, abval2s )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( absolute( convert_prec<ctypec>( x ) ) ); \
+\
+		ctypey y; \
+		bli_tabval2ris( chx,chy,chc, \
+		                real( x ), imag( x ), \
+		                real( y ), imag( y ) ); \
+\
+		INFO( "x:        " << x ); \
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, abval2ris )
diff --git a/test/level0/test_tadd3s.cxx b/test/level0/test_tadd3s.cxx
new file mode 100644
index 000000000..f4c18ad5b
--- /dev/null
+++ b/test/level0/test_tadd3s.cxx
@@ -0,0 +1,189 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * add3s
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypez, chz, ctypec, chc ) \
+UNIT_TEST(chx,chy,chz,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		auto z0 = convert<ctypez>( convert_prec<ctypec>( x ) + \
+		                           convert_prec<ctypec>( y ) ); \
+\
+		INFO( "x: " << x ); \
+		INFO( "y: " << y ); \
+\
+		ctypez z; \
+		bli_tadd3s( chx,chy,chz,chc, x, y, z ); \
+\
+		INFO( "z (C++):  " << z0 ); \
+		INFO( "z (BLIS): " << z ); \
+\
+		check<ctypec>( z, z0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4(RC, RC, RC, C, add3s);
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) /* in-place variant: no chz parameter exists here, the output aliases y (first loop) or x (second loop) */ \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( x ) + \
+		                           convert_prec<ctypec>( y ) ); \
+\
+		INFO( "x: " << x ); \
+		INFO( "y: " << y ); \
+\
+		bli_tadd3s( chx,chy,chy,chc, x, y, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+\
+	for (       auto x : test_values<ctypex>() ) \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		auto x0 = convert<ctypex>( convert_prec<ctypec>( x ) + \
+		                           convert_prec<ctypec>( y ) ); \
+\
+		INFO( "x: " << x ); \
+		INFO( "y: " << y ); \
+\
+		bli_tadd3s( chx,chy,chx,chc, x, y, x ); \
+\
+		INFO( "x (C++):  " << x0 ); \
+		INFO( "x (BLIS): " << x ); \
+\
+		check<ctypec>( x, x0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3(RC, RC, C, add3s_inplace);
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypez, chz, ctypec, chc ) \
+UNIT_TEST(chx,chy,chz,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		auto z0 = convert<ctypez>( conj( convert_prec<ctypec>( x ) ) + \
+		                                 convert_prec<ctypec>( y ) ); \
+\
+		INFO( "x: " << x ); \
+		INFO( "y: " << y ); \
+\
+		ctypez z; \
+		bli_tadd3js( chx,chy,chz,chc, x, y, z ); \
+\
+		INFO( "z (C++):  " << z0 ); \
+		INFO( "z (BLIS): " << z ); \
+\
+		check<ctypec>( z, z0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4(RC, RC, RC, C, add3js);
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypez, chz, ctypec, chc ) \
+UNIT_TEST(chx,chy,chz,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		auto z0 = convert<ctypez>( convert_prec<ctypec>( x ) + \
+		                           convert_prec<ctypec>( y ) ); \
+\
+		INFO( "x: " << x ); \
+		INFO( "y: " << y ); \
+\
+		ctypez z; \
+		bli_tadd3ris( chx,chy,chz,chc, \
+		              real( x ), imag( x ), \
+		              real( y ), imag( y ), \
+		              real( z ), imag( z ) ); \
+\
+		INFO( "z (C++):  " << z0 ); \
+		INFO( "z (BLIS): " << z ); \
+\
+		check<ctypec>( z, z0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4(RC, RC, RC, C, add3ris);
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypez, chz, ctypec, chc ) \
+UNIT_TEST(chx,chy,chz,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		auto z0 = convert<ctypez>( conj( convert_prec<ctypec>( x ) ) + \
+		                                 convert_prec<ctypec>( y ) ); \
+\
+		INFO( "x: " << x ); \
+		INFO( "y: " << y ); \
+\
+		ctypez z; \
+		bli_tadd3jris( chx,chy,chz,chc, \
+		               real( x ), imag( x ), \
+		               real( y ), imag( y ), \
+		               real( z ), imag( z ) ); \
+\
+		INFO( "z (C++):  " << z0 ); \
+		INFO( "z (BLIS): " << z ); \
+\
+		check<ctypec>( z, z0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4(RC, RC, RC, C, add3jris);
diff --git a/test/level0/test_tadds.cxx b/test/level0/test_tadds.cxx
new file mode 100644
index 000000000..a2a9bdc10
--- /dev/null
+++ b/test/level0/test_tadds.cxx
@@ -0,0 +1,196 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * adds
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( x ) + \
+		                           convert_prec<ctypec>( y ) ); \
+\
+		INFO( "x:        " << x ); \
+		INFO( "y (orig): " << y ); \
+\
+		bli_tadds( chx,chy,chc, x, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3(RC, RC, C, adds);
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( conj( convert_prec<ctypec>( x ) ) + \
+		                                 convert_prec<ctypec>( y ) ); \
+\
+		INFO( "x:        " << x ); \
+		INFO( "y (orig): " << y ); \
+\
+		bli_taddjs( chx,chy,chc, x, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, addjs )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( x ) + \
+		                           convert_prec<ctypec>( y ) ); \
+\
+		INFO( "x:        " << x ); \
+		INFO( "y (orig): " << y ); \
+\
+		bli_taddris( chx,chy,chc, \
+		             real( x ), imag( x ), \
+		             real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, addris )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( conj( convert_prec<ctypec>( x ) ) + \
+		                                 convert_prec<ctypec>( y ) ); \
+\
+		INFO( "x:        " << x ); \
+		INFO( "y (orig): " << y ); \
+\
+		bli_taddjris( chx,chy,chc, \
+		              real( x ), imag( x ), \
+		              real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, addjris )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	constexpr auto M = 4; \
+	constexpr auto N = 4; \
+\
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto ymn = tile<M,N>( y ); \
+\
+		INFO( "row-major" ); \
+\
+		auto ymn0 = ymn; \
+		axpbys_mxn<ctypec,BLIS_NO_TRANSPOSE>( 1.0, xmn, 1.0, ymn0, dense ); \
+\
+		INFO( "x:\n" << xmn ); \
+		INFO( "y (init):\n" << ymn ); \
+\
+		bli_tadds_mxn( chx,chy,chc, M, N, &xmn[0][0], N, 1, &ymn[0][0], N, 1 ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypec>( ymn, ymn0 ); \
+	} \
+\
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto ymn = tile<M,N>( y ); \
+\
+		INFO( "column-major" ); \
+\
+		auto ymn0 = ymn; \
+		axpbys_mxn<ctypec,BLIS_TRANSPOSE>( 1.0, xmn, 1.0, ymn0, dense ); \
+\
+		INFO( "x:\n" << xmn ); \
+		INFO( "y (init):\n" << ymn ); \
+\
+		bli_tadds_mxn( chx,chy,chc, N, M, &xmn[0][0], 1, N, &ymn[0][0], 1, N ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypec>( ymn, ymn0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, adds_mxn )
diff --git a/test/level0/test_taxpbys.cxx b/test/level0/test_taxpbys.cxx
new file mode 100644
index 000000000..410e71724
--- /dev/null
+++ b/test/level0/test_taxpbys.cxx
@@ -0,0 +1,256 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * axpbys
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypeb, chb, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chb,chy,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto b : test_values<ctypeb>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( a ) * \
+		                           convert_prec<ctypec>( x ) + \
+		                           convert_prec<ctypec>( b ) * \
+		                           convert_prec<ctypec>( y ) ); \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+		INFO( "b:        " << b ); \
+		INFO( "y (init): " << y ); \
+\
+		bli_taxpbys( cha,chx,chb,chy,chc, a, x, b, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX5( RC, RC, RC, RC, R, axpbys )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypeb, chb, ctypec, chc ) \
+UNIT_TEST(cha,chx,chb,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for (       auto x : test_values<ctypex>() ) \
+	for ( const auto b : test_values<ctypeb>() ) \
+	{ \
+		auto x0 = convert<ctypex>( convert_prec<ctypec>( a ) * \
+		                           convert_prec<ctypec>( x ) + \
+		                           convert_prec<ctypec>( b ) * \
+		                           convert_prec<ctypec>( x ) ); \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+		INFO( "b:        " << b ); \
+\
+		bli_taxpbys( cha,chx,chb,chx,chc, a, x, b, x ); \
+\
+		INFO( "x (C++):  " << x0 ); \
+		INFO( "x (BLIS): " << x ); \
+\
+		check<ctypec>( x, x0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, axpbys_inplace )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypeb, chb, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chb,chy,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto b : test_values<ctypeb>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>(       convert_prec<ctypec>( a ) * \
+		                           conj( convert_prec<ctypec>( x ) ) + \
+		                                 convert_prec<ctypec>( b ) * \
+		                                 convert_prec<ctypec>( y ) ); \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+		INFO( "b:        " << b ); \
+		INFO( "y (init): " << y ); \
+\
+		bli_taxpbyjs( cha,chx,chb,chy,chc, a, x, b, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX5( RC, RC, RC, RC, R, axpbyjs )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypeb, chb, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chb,chy,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto b : test_values<ctypeb>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( a ) * \
+		                           convert_prec<ctypec>( x ) + \
+		                           convert_prec<ctypec>( b ) * \
+		                           convert_prec<ctypec>( y ) ); \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+		INFO( "b:        " << b ); \
+		INFO( "y (init): " << y ); \
+\
+		bli_taxpbyris( cha,chx,chb,chy,chc, \
+		               real( a ), imag( a ), \
+		               real( x ), imag( x ), \
+		               real( b ), imag( b ), \
+		               real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX5( RC, RC, RC, RC, R, axpbyris )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypeb, chb, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chb,chy,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto b : test_values<ctypeb>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>(       convert_prec<ctypec>( a ) * \
+		                           conj( convert_prec<ctypec>( x ) ) + \
+		                                 convert_prec<ctypec>( b ) * \
+		                                 convert_prec<ctypec>( y ) ); \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+		INFO( "b:        " << b ); \
+		INFO( "y (init): " << y ); \
+\
+		bli_taxpbyjris( cha,chx,chb,chy,chc, \
+		                real( a ), imag( a ), \
+		                real( x ), imag( x ), \
+		                real( b ), imag( b ), \
+		                real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX5( RC, RC, RC, RC, R, axpbyjris )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypeb, chb, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chb,chy,chc,opname) \
+( \
+	constexpr auto M = 4; \
+	constexpr auto N = 4; \
+\
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto b : test_values<ctypeb>() ) \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto ymn = tile<M,N>( y ); \
+\
+		INFO( "row-major" ); \
+\
+		auto ymn0 = ymn; \
+		axpbys_mxn<ctypec,BLIS_NO_TRANSPOSE>( a, xmn, b, ymn0, dense ); \
+\
+		INFO( "x:\n" << xmn ); \
+		INFO( "y (init):\n" << ymn ); \
+\
+		bli_taxpbys_mxn( cha,chx,chb,chy,chc, M, N, &a, &xmn[0][0], N, 1, &b, &ymn[0][0], N, 1 ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypec>( ymn, ymn0 ); \
+	} \
+\
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto b : test_values<ctypeb>() ) \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto ymn = tile<M,N>( y ); \
+\
+		INFO( "column-major" ); \
+\
+		auto ymn0 = ymn; \
+		axpbys_mxn<ctypec,BLIS_TRANSPOSE>( a, xmn, b, ymn0, dense ); \
+\
+		INFO( "x:\n" << xmn ); \
+		INFO( "y (init):\n" << ymn ); \
+\
+		bli_taxpbys_mxn( cha,chx,chb,chy,chc, N, M, &a, &xmn[0][0], 1, N, &b, &ymn[0][0], 1, N ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypec>( ymn, ymn0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX5( RC, RC, RC, RC, R, axpbys_mxn )
diff --git a/test/level0/test_taxpys.cxx b/test/level0/test_taxpys.cxx
new file mode 100644
index 000000000..87bc23824
--- /dev/null
+++ b/test/level0/test_taxpys.cxx
@@ -0,0 +1,181 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * axpys
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC // Next GENTFUNC: axpys test, comparing bli_taxpys( a, x, y ) against a*x + y.
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chy,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( a ) * \
+		                           convert_prec<ctypec>( x ) + \
+		                           convert_prec<ctypec>( y ) ); \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+		INFO( "y (init): " << y ); \
+		/* Operation under test; y0 above is the reference built from convert_prec<ctypec> operands. */ \
+		bli_taxpys( cha,chx,chy,chc, a, x, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, axpys )
+
+#undef GENTFUNC // Next GENTFUNC: axpys with x used as both input and output, compared against a*x + x.
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(cha,chx,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for (       auto x : test_values<ctypex>() ) \
+	{ \
+		auto x0 = convert<ctypex>( convert_prec<ctypec>( a ) * \
+		                           convert_prec<ctypec>( x ) + \
+		                           convert_prec<ctypec>( x ) ); \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+		/* The same variable x is passed in both the x and y positions (chx is used for both type chars). */ \
+		bli_taxpys( cha,chx,chx,chc, a, x, x ); \
+\
+		INFO( "x (C++):  " << x0 ); \
+		INFO( "x (BLIS): " << x ); \
+\
+		check<ctypec>( x, x0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, axpys_inplace )
+
+#undef GENTFUNC // Next GENTFUNC: axpyjs test, comparing bli_taxpyjs( a, x, y ) against a*conj(x) + y.
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chy,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>(       convert_prec<ctypec>( a ) * \
+		                           conj( convert_prec<ctypec>( x ) ) + \
+		                                 convert_prec<ctypec>( y ) ); \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+		INFO( "y (init): " << y ); \
+		/* Operation under test; the reference y0 applies conj() to x only, after conversion via convert_prec<ctypec>. */ \
+		bli_taxpyjs( cha,chx,chy,chc, a, x, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, axpyjs )
+
+#undef GENTFUNC // Next GENTFUNC: axpyris test; same reference as axpys (a*x + y), but operands are passed as real/imag parts.
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chy,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( a ) * \
+		                           convert_prec<ctypec>( x ) + \
+		                           convert_prec<ctypec>( y ) ); \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+		INFO( "y (init): " << y ); \
+		/* Operation under test; each of a, x, y is supplied as a ( real(.), imag(.) ) argument pair. */ \
+		bli_taxpyris( cha,chx,chy,chc, \
+		              real( a ), imag( a ), \
+		              real( x ), imag( x ), \
+		              real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, axpyris )
+
+#undef GENTFUNC // Next GENTFUNC: axpyjris test; reference is a*conj(x) + y, operands are passed as real/imag parts.
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chy,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>(       convert_prec<ctypec>( a ) * \
+		                           conj( convert_prec<ctypec>( x ) ) + \
+		                                 convert_prec<ctypec>( y ) ); \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+		INFO( "y (init): " << y ); \
+		/* Operation under test; each of a, x, y is supplied as a ( real(.), imag(.) ) argument pair. */ \
+		bli_taxpyjris( cha,chx,chy,chc, \
+		               real( a ), imag( a ), \
+		               real( x ), imag( x ), \
+		               real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, axpyjris )
diff --git a/frame/include/level0/bli_set0s_edge.h b/test/level0/test_tconjs.cxx
similarity index 61%
rename from frame/include/level0/bli_set0s_edge.h
rename to test/level0/test_tconjs.cxx
index ca57685fc..6083297f8 100644
--- a/frame/include/level0/bli_set0s_edge.h
+++ b/test/level0/test_tconjs.cxx
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -14,7 +15,7 @@
     - Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
+    - Neither the name of The University of Texas nor the names of its
       contributors may be used to endorse or promote products derived
       from this software without specific prior written permission.
 
@@ -32,48 +33,52 @@
 
 */
 
-#ifndef BLIS_SET0S_EDGE_H
-#define BLIS_SET0S_EDGE_H
+#include "test_l0.hpp"
 
-// set0s_mxn
+/******************************************************************************
+ *
+ * conjs
+ *
+ *****************************************************************************/
 
-// Notes:
-// - The first char encodes the type of x.
-// - The second char encodes the type of y.
-
-#define GENTFUNC(ctype,ch,op) \
-\
-BLIS_INLINE void PASTEMAC(ch,op) \
-     ( \
-       const dim_t     i, \
-       const dim_t     m, \
-       const dim_t     j, \
-       const dim_t     n, \
-       ctype* restrict p, \
-       const inc_t     ldp \
-     ) \
-{ \
-	if ( i < m ) \
+#undef GENTFUNC // Next GENTFUNC: conjs test, comparing bli_tconjs( y ) against conj( y ).
+#define GENTFUNC( opname, ctypey, chy ) \
+UNIT_TEST(chy,opname) \
+( \
+	for ( auto y : test_values<ctypey>() ) \
 	{ \
-		PASTEMAC(ch,set0s_mxn) \
-		( \
-		  m - i, \
-		  j, \
-		  p + i*1, 1, ldp \
-		); \
-	} \
+		auto y0 = conj( y ); \
 \
-	if ( j < n ) \
-	{ \
-		PASTEMAC(ch,set0s_mxn) \
-		( \
-		  m, \
-		  n - j, \
-		  p + j*ldp, 1, ldp \
-		); \
+		INFO( "y (init): " << y ); \
+		/* Operation under test; y is the only operand and is compared with y0 afterwards. */ \
+		bli_tconjs( chy, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
 	} \
-}
+)
 
-INSERT_GENTFUNC_BASIC(set0s_edge)
+INSERT_GENTFUNC_MIX1( RC, conjs )
+
+#undef GENTFUNC // Next GENTFUNC: conjris test; same reference as conjs, with y passed as real/imag parts.
+#define GENTFUNC( opname, ctypey, chy ) \
+UNIT_TEST(chy,opname) \
+( \
+	for ( auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = conj( y ); \
+\
+		INFO( "y (init): " << y ); \
+		/* Operation under test; y is supplied as the pair ( real( y ), imag( y ) ). */ \
+		bli_tconjris( chy, real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
 
-#endif
+INSERT_GENTFUNC_MIX1( RC, conjris )
diff --git a/test/level0/test_tcopycjs.cxx b/test/level0/test_tcopycjs.cxx
new file mode 100644
index 000000000..b152109b4
--- /dev/null
+++ b/test/level0/test_tcopycjs.cxx
@@ -0,0 +1,115 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * copycjs
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC // Next GENTFUNC: copycjs test; y must equal x, or conj( x ) when conjx is BLIS_CONJUGATE, converted to ctypey.
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto conjx : { BLIS_CONJUGATE, BLIS_NO_CONJUGATE } ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( bli_is_conj( conjx ) ? conj( x ) : x ); \
+\
+		INFO( "conjx:    " << bli_is_conj( conjx ) ); \
+		INFO( "x:        " << x ); \
+		/* y is declared without an initializer; the call under test is expected to write it. */ \
+		ctypey y; \
+		bli_tcopycjs( chx,chy, conjx, x, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, copycjs )
+
+#undef GENTFUNC // Next GENTFUNC: copycjs with x as both source and destination; x must become conj( x ) or stay x per conjx.
+#define GENTFUNC( opname, ctypex, chx ) \
+UNIT_TEST(chx,opname) \
+( \
+	for ( const auto conjx : { BLIS_CONJUGATE, BLIS_NO_CONJUGATE } ) \
+	for (       auto x : test_values<ctypex>() ) \
+	{ \
+		auto x0 = convert<ctypex>( bli_is_conj( conjx ) ? conj( x ) : x ); \
+\
+		INFO( "conjx:    " << bli_is_conj( conjx ) ); \
+		INFO( "x:        " << x ); \
+		/* The same variable x is passed in both the source and destination positions. */ \
+		bli_tcopycjs( chx,chx, conjx, x, x ); \
+\
+		INFO( "x (C++):  " << x0 ); \
+		INFO( "x (BLIS): " << x ); \
+\
+		check<ctypex>( x, x0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, copycjs_inplace )
+
+#undef GENTFUNC // Next GENTFUNC: copycjris test; same reference as copycjs, with x and y passed as real/imag parts.
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto conjx : { BLIS_CONJUGATE, BLIS_NO_CONJUGATE } ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( bli_is_conj( conjx ) ? conj( x ) : x ); \
+\
+		INFO( "conjx:    " << bli_is_conj( conjx ) ); \
+		INFO( "x:        " << x); \
+		/* y is declared without an initializer; the call under test is expected to write both parts. */ \
+		ctypey y; \
+		bli_tcopycjris( chx,chy, conjx, \
+		                real( x ), imag( x ), \
+		                real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, copycjris )
diff --git a/test/level0/test_tcopynzs.cxx b/test/level0/test_tcopynzs.cxx
new file mode 100644
index 000000000..d321025ff
--- /dev/null
+++ b/test/level0/test_tcopynzs.cxx
@@ -0,0 +1,150 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * copynzs
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC // Next GENTFUNC: copynzs test; the imaginary part of y is only expected to change when ctypex is complex.
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = y; \
+		real( y0 ) = convert_prec<ctypey>( real( x ) ); \
+		if ( is_complex<ctypex>::value ) \
+			imag( y0 ) = convert_prec<ctypey>( imag( x ) ); \
+		/* Reference y0 starts as a copy of y, so for non-complex ctypex its imaginary part stays as in y. */ \
+		INFO( "x:        " << x ); \
+		INFO( "y (orig): " << y ); \
+\
+		bli_tcopynzs( chx,chy, x, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, copynzs )
+
+#undef GENTFUNC // Next GENTFUNC: copyjnzs test; like copynzs, but the imaginary part written from a complex x is negated.
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = y; \
+		real( y0 ) = convert_prec<ctypey>( real( x ) ); \
+		if ( is_complex<ctypex>::value ) \
+			imag( y0 ) = convert_prec<ctypey>( -imag( x ) ); \
+		/* Reference y0 starts as a copy of y, so for non-complex ctypex its imaginary part stays as in y. */ \
+		INFO( "x:        " << x ); \
+		INFO( "y (orig): " << y ); \
+\
+		bli_tcopyjnzs( chx,chy, x, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, copyjnzs )
+
+#undef GENTFUNC // Next GENTFUNC: copynzris test; same reference as copynzs, with x and y passed as real/imag parts.
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = y; \
+		real( y0 ) = convert_prec<ctypey>( real( x ) ); \
+		if ( is_complex<ctypex>::value ) \
+			imag( y0 ) = convert_prec<ctypey>( imag( x ) ); \
+		/* Reference y0 starts as a copy of y, so for non-complex ctypex its imaginary part stays as in y. */ \
+		INFO( "x:        " << x ); \
+		INFO( "y (orig): " << y ); \
+\
+		bli_tcopynzris( chx,chy, \
+		                real( x ), imag( x ), \
+		                real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, copynzris )
+
+#undef GENTFUNC // Next GENTFUNC: copyjnzris test; same reference as copyjnzs, with x and y passed as real/imag parts.
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = y; \
+		real( y0 ) = convert_prec<ctypey>( real( x ) ); \
+		if ( is_complex<ctypex>::value ) \
+			imag( y0 ) = convert_prec<ctypey>( -imag( x ) ); \
+		/* Reference y0 starts as a copy of y, so for non-complex ctypex its imaginary part stays as in y. */ \
+		INFO( "x:        " << x ); \
+		INFO( "y (orig): " << y ); \
+\
+		bli_tcopyjnzris( chx,chy, \
+		                 real( x ), imag( x ), \
+		                 real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, copyjnzris )
diff --git a/test/level0/test_tcopys.cxx b/test/level0/test_tcopys.cxx
new file mode 100644
index 000000000..cede1f341
--- /dev/null
+++ b/test/level0/test_tcopys.cxx
@@ -0,0 +1,301 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * copys
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC // Next GENTFUNC: copys test, comparing bli_tcopys( x, y ) against convert<ctypey>( x ).
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( x ); \
+\
+		INFO( "x:        " << x ); \
+		/* y is declared without an initializer; the call under test is expected to write it. */ \
+		ctypey y; \
+		bli_tcopys( chx,chy, x, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, copys )
+
+#undef GENTFUNC // Next GENTFUNC: copyjs test, comparing bli_tcopyjs( x, y ) against convert<ctypey>( conj( x ) ).
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( conj( x ) ); \
+\
+		INFO( "x:        " << x ); \
+		/* y is declared without an initializer; the call under test is expected to write it. */ \
+		ctypey y; \
+		bli_tcopyjs( chx,chy, x, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, copyjs )
+
+#undef GENTFUNC // Next GENTFUNC: copyjs with x as both source and destination; x must become conj( x ).
+#define GENTFUNC( opname, ctypex, chx ) \
+UNIT_TEST(chx,opname) \
+( \
+	for ( auto x : test_values<ctypex>() ) \
+	{ \
+		auto x0 = convert<ctypex>( conj( x ) ); \
+\
+		INFO( "x:        " << x ); \
+		/* The same variable x is passed in both the source and destination positions. */ \
+		bli_tcopyjs( chx,chx, x, x ); \
+\
+		INFO( "x (C++):  " << x0 ); \
+		INFO( "x (BLIS): " << x ); \
+\
+		check<ctypex>( x, x0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, copyjs_inplace )
+
+#undef GENTFUNC // Next GENTFUNC: copyris test; same reference as copys, with x and y passed as real/imag parts.
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( x ); \
+\
+		INFO( "x:        " << x ); \
+		/* y is declared without an initializer; the call under test is expected to write both parts. */ \
+		ctypey y; \
+		bli_tcopyris( chx,chy, \
+		              real( x ), imag( x ), \
+		              real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, copyris )
+
+#undef GENTFUNC // Next GENTFUNC: copyjris test; same reference as copyjs, with x and y passed as real/imag parts.
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( conj( x ) ); \
+\
+		INFO( "x:        " << x ); \
+		/* y is declared without an initializer; the call under test is expected to write both parts. */ \
+		ctypey y; \
+		bli_tcopyjris( chx,chy, \
+		               real( x ), imag( x ), \
+		               real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, copyjris )
+
+#undef GENTFUNC // Next GENTFUNC: copy1es test; expects two outputs, yri == x and yir == swapri( conj( x ) ), both as ctypey.
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto yri0 = convert<ctypey>( x ); \
+		auto yir0 = convert<ctypey>( swapri( conj( x ) ) ); \
+\
+		INFO( "x:        " << x ); \
+		/* Both outputs are declared without initializers; the call under test is expected to write them. */ \
+		ctypey yri, yir; \
+		bli_tcopy1es( chx,chy, x, yri, yir ); \
+\
+		INFO( "yri (C++):  " << yri0 ); \
+		INFO( "yir (C++):  " << yir0 ); \
+		INFO( "yri (BLIS): " << yri ); \
+		INFO( "yir (BLIS): " << yir ); \
+\
+		check<ctypey>( yri, yri0 ); \
+		check<ctypey>( yir, yir0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( C, C, copy1es )
+
+#undef GENTFUNC // Next GENTFUNC: copyj1es test; expects two outputs, yri == conj( x ) and yir == swapri( x ), both as ctypey.
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto yri0 = convert<ctypey>( conj( x ) ); \
+		auto yir0 = convert<ctypey>( swapri( x ) ); \
+\
+		INFO( "x:        " << x ); \
+		/* Both outputs are declared without initializers; the call under test is expected to write them. */ \
+		ctypey yri, yir; \
+		bli_tcopyj1es( chx,chy, x, yri, yir ); \
+\
+		INFO( "yri (C++):  " << yri0 ); \
+		INFO( "yir (C++):  " << yir0 ); \
+		INFO( "yri (BLIS): " << yri ); \
+		INFO( "yir (BLIS): " << yir ); \
+\
+		check<ctypey>( yri, yri0 ); \
+		check<ctypey>( yir, yir0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( C, C, copyj1es )
+
+#undef GENTFUNC // Next GENTFUNC: copy1rs test; x is passed whole, y as real/imag parts, expected result convert<ctypey>( x ).
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( x ); \
+\
+		INFO( "x:        " << x ); \
+		/* y is declared without an initializer; the call under test is expected to write both parts. */ \
+		ctypey y; \
+		bli_tcopy1rs( chx,chy, x, real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( C, C, copy1rs )
+
+#undef GENTFUNC // Next GENTFUNC: copyj1rs test; x is passed whole, y as real/imag parts, expected result convert<ctypey>( conj( x ) ).
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( conj( x ) ); \
+\
+		INFO( "x:        " << x ); \
+		/* y is declared without an initializer; the call under test is expected to write both parts. */ \
+		ctypey y; \
+		bli_tcopyj1rs( chx,chy, x, real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( C, C, copyj1rs )
+
+#undef GENTFUNC // Next GENTFUNC: copys_mxn test on M x N = 4 x 4 tiles, run once with strides (N,1) and once with strides (1,N).
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	constexpr auto M = 4; \
+	constexpr auto N = 4; \
+	/* Pass 1 ("row-major"): x and y are passed with strides (N,1); reference uses BLIS_NO_TRANSPOSE. */ \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto ymn = tile<M,N,ctypey>(); \
+\
+		INFO( "row-major" ); \
+		/* The reference copy is expressed as axpbys_mxn with a = 1.0 and b = 0.0 on a copy of ymn. */ \
+		auto ymn0 = ymn; \
+		axpbys_mxn<ctypey,BLIS_NO_TRANSPOSE>( 1.0, xmn, 0.0, ymn0, dense ); \
+\
+		INFO( "x:\n" << xmn ); \
+\
+		bli_tcopys_mxn( chx,chy, M, N, &xmn[0][0], N, 1, &ymn[0][0], N, 1 ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypey>( ymn, ymn0 ); \
+	} \
+	/* Pass 2 ("column-major"): same data with strides (1,N); reference uses BLIS_TRANSPOSE. */ \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto ymn = tile<M,N,ctypey>(); \
+\
+		INFO( "column-major" ); \
+		/* The reference copy is expressed as axpbys_mxn with a = 1.0 and b = 0.0 on a copy of ymn. */ \
+		auto ymn0 = ymn; \
+		axpbys_mxn<ctypey,BLIS_TRANSPOSE>( 1.0, xmn, 0.0, ymn0, dense ); \
+\
+		INFO( "x:\n" << xmn ); \
+\
+		bli_tcopys_mxn( chx,chy, N, M, &xmn[0][0], 1, N, &ymn[0][0], 1, N ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypey>( ymn, ymn0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, copys_mxn )
diff --git a/frame/include/level0/1r/bli_invert1rs.h b/test/level0/test_tdots.cxx
similarity index 81%
rename from frame/include/level0/1r/bli_invert1rs.h
rename to test/level0/test_tdots.cxx
index 16f7283fd..d55605e1b 100644
--- a/frame/include/level0/1r/bli_invert1rs.h
+++ b/test/level0/test_tdots.cxx
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -14,7 +15,7 @@
     - Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
+    - Neither the name of The University of Texas nor the names of its
       contributors may be used to endorse or promote products derived
       from this software without specific prior written permission.
 
@@ -32,12 +33,12 @@
 
 */
 
-#ifndef BLIS_INVERT1RS_H
-#define BLIS_INVERT1RS_H
+#include "test_l0.hpp"
 
-// invert1rs
+/******************************************************************************
+ *
+ * dots
+ *
+ *****************************************************************************/
 
-#define bli_cinvert1rs( xr, xi )  bli_cinvertris( xr, xi )
-#define bli_zinvert1rs( xr, xi )  bli_zinvertris( xr, xi )
-
-#endif
+// No tests, dot(x, y, a) == axpy(y, x, a)
diff --git a/test/level0/test_teqs.cxx b/test/level0/test_teqs.cxx
new file mode 100644
index 000000000..b07df602c
--- /dev/null
+++ b/test/level0/test_teqs.cxx
@@ -0,0 +1,218 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * eqs
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC // Next GENTFUNC: eqs test; bli_teqs( x, y ) must agree with == on operands converted via convert_prec<ctypec>.
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		auto expected = convert_prec<ctypec>( x ) == \
+		                convert_prec<ctypec>( y ); \
+\
+		INFO( "x: " << x ); \
+		INFO( "y: " << y ); \
+		/* Result of the BLIS comparison for the same pair; it is compared with expected via == below. */ \
+		auto found = bli_teqs( chx,chy,chc, x, y ); \
+\
+		INFO( "expected: " << expected ); \
+		INFO( "found   : " << found ); \
+\
+		REQUIRE( expected == found ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, eqs )
+
+#undef GENTFUNC // Next GENTFUNC: eqris test; same reference as eqs, with x and y passed as real/imag parts.
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		auto expected = convert_prec<ctypec>( x ) == \
+		                convert_prec<ctypec>( y ); \
+\
+		INFO( "x: " << x ); \
+		INFO( "y: " << y ); \
+		/* Result of the BLIS comparison for the same pair; it is compared with expected via == below. */ \
+		auto found = bli_teqris( chx,chy,chc, \
+		                         real(x), imag(x), \
+		                         real(y), imag(y) ); \
+\
+		INFO( "expected: " << expected ); \
+		INFO( "found   : " << found ); \
+\
+		REQUIRE( expected == found ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, eqris )
+
+#undef GENTFUNC // Next GENTFUNC: eq1ris test; bli_teq1ris on the real/imag parts of x must agree with x == 1.0.
+#define GENTFUNC( opname, ctypex, chx ) \
+UNIT_TEST(chx,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto expected = x == convert_prec<ctypex>( 1.0 ); \
+\
+		INFO( "x: " << x ); \
+		/* Result of the BLIS predicate for the same x; it is compared with expected via == below. */ \
+		auto found = bli_teq1ris( chx, real( x ), imag( x ) ); \
+\
+		INFO( "expected: " << expected ); \
+		INFO( "found   : " << found ); \
+\
+		REQUIRE( expected == found ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, eq1ris )
+
+#undef GENTFUNC // Next GENTFUNC: eq0ris test; bli_teq0ris on the real/imag parts of x must agree with x == 0.0.
+#define GENTFUNC( opname, ctypex, chx ) \
+UNIT_TEST(chx,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto expected = x == convert_prec<ctypex>( 0.0 ); \
+\
+		INFO( "x: " << x ); \
+		/* Result of the BLIS predicate for the same x; it is compared with expected via == below. */ \
+		auto found = bli_teq0ris( chx, real( x ), imag( x ) ); \
+\
+		INFO( "expected: " << expected ); \
+		INFO( "found   : " << found ); \
+\
+		REQUIRE( expected == found ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, eq0ris )
+
+#undef GENTFUNC // Next GENTFUNC: eqm1ris test; bli_teqm1ris on the real/imag parts of x must agree with x == -1.0.
+#define GENTFUNC( opname, ctypex, chx ) \
+UNIT_TEST(chx,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto expected = x == convert_prec<ctypex>( -1.0 ); \
+\
+		INFO( "x: " << x ); \
+		/* Result of the BLIS predicate for the same x; it is compared with expected via == below. */ \
+		auto found = bli_teqm1ris( chx, real( x ), imag( x ) ); \
+\
+		INFO( "expected: " << expected ); \
+		INFO( "found   : " << found ); \
+\
+		REQUIRE( expected == found ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, eqm1ris )
+
+#undef GENTFUNC // Next GENTFUNC: eq1s test; bli_teq1s( x ) must agree with x == 1.0.
+#define GENTFUNC( opname, ctypex, chx ) \
+UNIT_TEST(chx,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto expected = x == convert_prec<ctypex>( 1.0 ); \
+\
+		INFO( "x: " << x ); \
+		/* Result of the BLIS predicate for the same x; it is compared with expected via == below. */ \
+		auto found = bli_teq1s( chx, x ); \
+\
+		INFO( "expected: " << expected ); \
+		INFO( "found   : " << found ); \
+\
+		REQUIRE( expected == found ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, eq1s )
+
+#undef GENTFUNC // Next GENTFUNC: eq0s test; bli_teq0s( x ) must agree with x == 0.0.
+#define GENTFUNC( opname, ctypex, chx ) \
+UNIT_TEST(chx,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto expected = x == convert_prec<ctypex>( 0.0 ); \
+\
+		INFO( "x: " << x ); \
+		/* Result of the BLIS predicate for the same x; it is compared with expected via == below. */ \
+		auto found = bli_teq0s( chx, x ); \
+\
+		INFO( "expected: " << expected ); \
+		INFO( "found   : " << found ); \
+\
+		REQUIRE( expected == found ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, eq0s )
+
+#undef GENTFUNC // Next GENTFUNC: eqm1s test; bli_teqm1s( x ) must agree with x == -1.0.
+#define GENTFUNC( opname, ctypex, chx ) \
+UNIT_TEST(chx,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto expected = x == convert_prec<ctypex>( -1.0 ); \
+\
+		INFO( "x: " << x ); \
+		/* Result of the BLIS predicate for the same x; it is compared with expected via == below. */ \
+		auto found = bli_teqm1s( chx, x ); \
+\
+		INFO( "expected: " << expected ); \
+		INFO( "found   : " << found ); \
+\
+		REQUIRE( expected == found ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, eqm1s )
diff --git a/frame/include/level0/1r/bli_copyj1rs.h b/test/level0/test_tfprints.cxx
similarity index 82%
rename from frame/include/level0/1r/bli_copyj1rs.h
rename to test/level0/test_tfprints.cxx
index d7cdff305..cc98f836a 100644
--- a/frame/include/level0/1r/bli_copyj1rs.h
+++ b/test/level0/test_tfprints.cxx
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -14,7 +15,7 @@
     - Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
+    - Neither the name of The University of Texas nor the names of its
       contributors may be used to endorse or promote products derived
       from this software without specific prior written permission.
 
@@ -32,20 +33,12 @@
 
 */
 
-#ifndef BLIS_COPYJ1RS_H
-#define BLIS_COPYJ1RS_H
+#include "test_l0.hpp"
 
-// copyj1rs
-
-#define bli_ccopyj1rs( a, br, bi ) \
-{ \
-	bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \
-}
-
-#define bli_zcopyj1rs( a, br, bi ) \
-{ \
-	bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \
-}
-
-#endif
+/******************************************************************************
+ *
+ * fprints
+ *
+ *****************************************************************************/
 
+// No tests
diff --git a/frame/include/level0/bb/bli_bcastbbs_mxn.h b/test/level0/test_tgets.cxx
similarity index 63%
rename from frame/include/level0/bb/bli_bcastbbs_mxn.h
rename to test/level0/test_tgets.cxx
index d060b767b..a4c0486f5 100644
--- a/frame/include/level0/bb/bli_bcastbbs_mxn.h
+++ b/test/level0/test_tgets.cxx
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -14,7 +15,7 @@
     - Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
+    - Neither the name of The University of Texas nor the names of its
       contributors may be used to endorse or promote products derived
       from this software without specific prior written permission.
 
@@ -32,43 +33,39 @@
 
 */
 
-#ifndef BLIS_BCASTBBS_MXN_H
-#define BLIS_BCASTBBS_MXN_H
+#include "test_l0.hpp"
 
-// bcastbbs_mxn
+/******************************************************************************
+ *
+ * gets
+ *
+ *****************************************************************************/
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
+#undef GENTFUNC // gets: the yr/yi outputs of bli_tgets must match real() and imag() of convert<ctypeyc>( x ).
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
 \
-BLIS_INLINE void PASTEMAC(ch,opname) \
-     ( \
-       const dim_t        m, \
-       const dim_t        n, \
-       ctype*    restrict y, const inc_t incy, const inc_t ldy  \
-     ) \
-{ \
-	/* Assume that the duplication factor is the column stride of y. */ \
-	const dim_t d    = ldy; \
-	const dim_t ds_y = 1; \
+	using ctypeyr = make_real_t<ctypey>; \
+	using ctypeyc = make_complex_t<ctypey>; \
 \
-	for ( dim_t i = 0; i < m; ++i ) \
+	for ( const auto x : test_values<ctypex>() ) \
 	{ \
-		ctype* restrict yi = y + i*incy; \
+		auto y0 = convert<ctypeyc>( x ); /* reference value, converted to ctypeyc */ \
 \
-		for ( dim_t j = 0; j < n; ++j ) \
-		{ \
-			ctype* restrict yij = yi + j*ldy; \
+		INFO( "x:        " << x ); \
 \
-			for ( dim_t p = 1; p < d; ++p ) \
-			{ \
-				ctype* restrict yijd = yij + p*ds_y; \
+		ctypeyr yr, yi; /* both outputs are declared with the ctypeyr alias */ \
+		bli_tgets( chx,chy, x, yr, yi ); \
 \
-				PASTEMAC(ch,copys)( *yij, *yijd ); \
-			} \
-		} \
+		INFO( "yr (C++):  " << real( y0 ) ); \
+		INFO( "yi (C++):  " << imag( y0 ) ); \
+		INFO( "yr (BLIS): " << yr ); \
+		INFO( "yi (BLIS): " << yi ); \
+\
+		check<ctypey>( yr, real( y0 ) ); /* NOTE(review): comparison rule lives in check<> - confirm tolerance */ \
+		check<ctypey>( yi, imag( y0 ) ); \
 	} \
-}
-
-INSERT_GENTFUNC_BASIC( bcastbbs_mxn )
+)
 
-#endif
+INSERT_GENTFUNC_MIX2( RC, RC, gets ) // NOTE(review): presumably expands GENTFUNC above for each (x, y) type pair selected by RC, RC - confirm.
diff --git a/test/level0/test_tinverts.cxx b/test/level0/test_tinverts.cxx
new file mode 100644
index 000000000..180189eed
--- /dev/null
+++ b/test/level0/test_tinverts.cxx
@@ -0,0 +1,141 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * inverts
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC // inverts: bli_tinverts applied to a copy of x is checked against 1.0 / x formed via convert_prec<ctypec>.
+#define GENTFUNC( opname, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(chx,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypex>( convert_prec<ctypec>( 1.0 ) / \
+		                           convert_prec<ctypec>( x ) ); /* reference quotient, converted back to ctypex */ \
+\
+		INFO( "x:        " << x ); \
+\
+		ctypex y = x; /* x is const, so the operation works on this copy */ \
+		bli_tinverts( chx,chc, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, R, inverts ) // NOTE(review): presumably expands GENTFUNC above for each (x, c) type pair selected by RC, R - confirm.
+
+#undef GENTFUNC // invertris: like the inverts test, but y is handed to bli_tinvertris as real( y ) and imag( y ).
+#define GENTFUNC( opname, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(chx,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypex>( convert_prec<ctypec>( 1.0 ) / \
+		                           convert_prec<ctypec>( x ) ); /* reference quotient, converted back to ctypex */ \
+\
+		INFO( "x:        " << x ); \
+\
+		ctypex y = x; /* x is const, so the operation works on this copy */ \
+		bli_tinvertris( chx,chc, real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, R, invertris ) // NOTE(review): presumably expands GENTFUNC above for each (x, c) type pair selected by RC, R - confirm.
+
+#undef GENTFUNC // invert1es: bli_tinvert1es updates the pair (xri, xir); both are checked against values derived from 1.0 / x.
+#define GENTFUNC( opname, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(chx,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto xri = x; \
+		auto xir = swapri( conj( x ) ); /* companion operand: conj, then swapri, applied to x */ \
+\
+		auto xri0 = convert<ctypex>( convert_prec<ctypec>( 1.0 ) / \
+		                             convert_prec<ctypec>( x ) ); \
+		auto xir0 = swapri( conj( xri0 ) ); /* expected companion, derived from xri0 the same way */ \
+\
+		INFO( "x:          " << x ); \
+		INFO( "xri (orig): " << xri ); \
+		INFO( "xir (orig): " << xir ); \
+\
+		bli_tinvert1es( chx,chc, xri, xir ); \
+\
+		INFO( "xri (C++):  " << xri0 ); \
+		INFO( "xir (C++):  " << xir0 ); \
+		INFO( "xri (BLIS): " << xri ); \
+		INFO( "xir (BLIS): " << xir ); \
+\
+		check<ctypec>( xri, xri0 ); \
+		check<ctypec>( xir, xir0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( C, R, invert1es ) // NOTE(review): presumably expands GENTFUNC above for each (x, c) type pair selected by C, R - confirm.
+
+#undef GENTFUNC // invert1rs: bli_tinvert1rs receives real( x ) and imag( x ); afterwards x is checked against 1.0 / x.
+#define GENTFUNC( opname, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(chx,chc,opname) \
+( \
+	for ( auto x : test_values<ctypex>() ) /* non-const copy: x itself is the operand that gets updated */ \
+	{ \
+		auto x0 = convert<ctypex>( convert_prec<ctypec>( 1.0 ) / \
+		                           convert_prec<ctypec>( x ) ); /* reference taken before x is modified */ \
+\
+		INFO( "x:        " << x ); \
+\
+		bli_tinvert1rs( chx,chc, real( x ), imag( x ) ); \
+\
+		INFO( "x (C++):  " << x0 ); \
+		INFO( "x (BLIS): " << x ); \
+\
+		check<ctypec>( x, x0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( C, R, invert1rs ) // NOTE(review): presumably expands GENTFUNC above for each (x, c) type pair selected by C, R - confirm.
diff --git a/test/level0/test_tinvscals.cxx b/test/level0/test_tinvscals.cxx
new file mode 100644
index 000000000..7631e4f5a
--- /dev/null
+++ b/test/level0/test_tinvscals.cxx
@@ -0,0 +1,146 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * invscals
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC // invscals: bli_tinvscals( a, y ) with y = x is checked against x / a formed via convert_prec<ctypec>.
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(cha,chx,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) /* every a is paired with every x */ \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypex>( convert_prec<ctypec>( x ) / \
+		                           convert_prec<ctypec>( a ) ); /* reference quotient, converted back to ctypex */ \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+\
+		ctypex y = x; /* x is const, so the operation works on this copy */ \
+		bli_tinvscals( cha,chx,chc, a, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, invscals ) // NOTE(review): presumably expands GENTFUNC above for each (a, x, c) type triple selected by RC, RC, R - confirm.
+
+#undef GENTFUNC // invscaljs: bli_tinvscaljs( a, y ) with y = x is checked against x / conj( a ).
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(cha,chx,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) /* every a is paired with every x */ \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypex>( convert_prec<ctypec>( x ) / \
+		                           convert_prec<ctypec>( conj( a ) ) ); /* divisor is conj( a ) in the reference */ \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+\
+		ctypex y = x; /* x is const, so the operation works on this copy */ \
+		bli_tinvscaljs( cha,chx,chc, a, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, invscaljs ) // NOTE(review): presumably expands GENTFUNC above for each (a, x, c) type triple selected by RC, RC, R - confirm.
+
+#undef GENTFUNC // invscalris: like the invscals test, but a and y reach bli_tinvscalris as separate real()/imag() arguments.
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(cha,chx,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) /* every a is paired with every x */ \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypex>( convert_prec<ctypec>( x ) / \
+		                           convert_prec<ctypec>( a ) ); /* reference quotient, converted back to ctypex */ \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+\
+		ctypex y = x; /* x is const, so the operation works on this copy */ \
+		bli_tinvscalris( cha,chx,chc, \
+		                 real( a ), imag( a ), \
+		                 real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, invscalris ) // NOTE(review): presumably expands GENTFUNC above for each (a, x, c) type triple selected by RC, RC, R - confirm.
+
+#undef GENTFUNC // invscaljris: like the invscaljs test, but a and y reach bli_tinvscaljris as separate real()/imag() arguments.
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(cha,chx,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) /* every a is paired with every x */ \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypex>( convert_prec<ctypec>( x ) / \
+		                           convert_prec<ctypec>( conj( a ) ) ); /* divisor is conj( a ) in the reference */ \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+\
+		ctypex y = x; /* x is const, so the operation works on this copy */ \
+		bli_tinvscaljris( cha,chx,chc, \
+		                  real( a ), imag( a ), \
+		                  real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, invscaljris ) // NOTE(review): presumably expands GENTFUNC above for each (a, x, c) type triple selected by RC, RC, R - confirm.
diff --git a/frame/include/level0/ri/bli_abval2ris.h b/test/level0/test_tneg2s.cxx
similarity index 50%
rename from frame/include/level0/ri/bli_abval2ris.h
rename to test/level0/test_tneg2s.cxx
index fc0ca2c3e..b0c65d7aa 100644
--- a/frame/include/level0/ri/bli_abval2ris.h
+++ b/test/level0/test_tneg2s.cxx
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -14,7 +15,7 @@
     - Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
+    - Neither the name of The University of Texas nor the names of its
       contributors may be used to endorse or promote products derived
       from this software without specific prior written permission.
 
@@ -32,49 +33,77 @@
 
 */
 
-#ifndef BLIS_ABVAL2RIS_H
-#define BLIS_ABVAL2RIS_H
+#include "test_l0.hpp"
 
-// abval2ris
+/******************************************************************************
+ *
+ * neg2s
+ *
+ *****************************************************************************/
 
-#define bli_sabval2ris( xr, xi, ar, ai ) \
-{ \
-	(ar)       = fabsf(xr); \
-}
+#undef GENTFUNC // neg2s: the y written by bli_tneg2s( x, y ) is checked against convert<ctypey>( -x ).
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( -x ); /* reference: negate in C++, then convert to ctypey */ \
+\
+		INFO( "x:        " << x ); \
+\
+		ctypey y; /* left uninitialised; the call below is expected to assign it */ \
+		bli_tneg2s( chx,chy, x, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
 
-#define bli_dabval2ris( xr, xi, ar, ai ) \
-{ \
-	(ar)       = fabs(xr); \
-}
+INSERT_GENTFUNC_MIX2( RC, RC, neg2s ) // NOTE(review): presumably expands GENTFUNC above for each (x, y) type pair selected by RC, RC - confirm.
 
-#define bli_cabval2ris( xr, xi, ar, ai ) \
-{ \
-	float  s   = bli_fmaxabs( (xr), (xi) ); \
-	float  mag; \
-	if ( s == 0.0F ) mag = 0.0F; \
-	else \
+#undef GENTFUNC // neg2s_inplace: calls bli_tneg2s with the same variable as input and output, then checks it against -x.
+#define GENTFUNC( opname, ctypex, chx ) \
+UNIT_TEST(chx,opname) \
+( \
+	for ( auto x : test_values<ctypex>() ) /* non-const copy: x is both source and destination */ \
 	{ \
-		mag = sqrtf( s ) * \
-		      sqrtf( ( (xr) / s ) * (xr) + \
-		             ( (xi) / s ) * (xi) ); \
+		auto x0 = -x; /* reference taken before x is overwritten */ \
+\
+		INFO( "x:        " << x ); \
+\
+		bli_tneg2s( chx,chx, x, x ); \
+\
+		INFO( "x (C++):  " << x0 ); \
+		INFO( "x (BLIS): " << x ); \
+\
+		check<ctypex>( x, x0 ); \
 	} \
-	(ar)       = mag; \
-	(ai)       = 0.0F; \
-}
+)
+
+INSERT_GENTFUNC_MIX1( RC, neg2s_inplace ) // NOTE(review): presumably expands GENTFUNC above for each type selected by RC - confirm.
 
-#define bli_zabval2ris( xr, xi, ar, ai ) \
-{ \
-	double s   = bli_fmaxabs( (xr), (xi) ); \
-	double mag; \
-	if ( s == 0.0 ) mag = 0.0; \
-	else \
+#undef GENTFUNC // neg2ris: like the neg2s test, but x and y reach bli_tneg2ris as separate real()/imag() arguments.
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
 	{ \
-		mag = sqrt( s ) * \
-		      sqrt( ( (xr) / s ) * (xr) + \
-		            ( (xi) / s ) * (xi) ); \
+		auto y0 = convert<ctypey>( -x ); /* reference: negate in C++, then convert to ctypey */ \
+\
+		INFO( "x:        " << x ); \
+\
+		ctypey y; /* left uninitialised; the call below is expected to assign it */ \
+		bli_tneg2ris( chx,chy, \
+		              real( x ), imag( x ), \
+		              real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
 	} \
-	(ar)       = mag; \
-	(ai)       = 0.0; \
-}
+)
 
-#endif
+INSERT_GENTFUNC_MIX2( RC, RC, neg2ris ) // NOTE(review): presumably expands GENTFUNC above for each (x, y) type pair selected by RC, RC - confirm.
diff --git a/frame/include/level0/bli_set0s.h b/test/level0/test_trandnp2s.cxx
similarity index 82%
rename from frame/include/level0/bli_set0s.h
rename to test/level0/test_trandnp2s.cxx
index 92aab787d..238282ed1 100644
--- a/frame/include/level0/bli_set0s.h
+++ b/test/level0/test_trandnp2s.cxx
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -14,7 +15,7 @@
     - Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
+    - Neither the name of The University of Texas nor the names of its
       contributors may be used to endorse or promote products derived
       from this software without specific prior written permission.
 
@@ -32,13 +33,12 @@
 
 */
 
-#ifndef BLIS_SET0S_H
-#define BLIS_SET0S_H
+#include "test_l0.hpp"
 
-#define bli_sset0s( a )  bli_ssets( 0.0F, 0.0F, (a) )
-#define bli_dset0s( a )  bli_dsets( 0.0 , 0.0 , (a) )
-#define bli_cset0s( a )  bli_csets( 0.0F, 0.0F, (a) )
-#define bli_zset0s( a )  bli_zsets( 0.0 , 0.0 , (a) )
-
-#endif
+/******************************************************************************
+ *
+ * randnp2s
+ *
+ *****************************************************************************/
 
+// No tests
diff --git a/frame/include/level0/1r/bli_copy1rs.h b/test/level0/test_trands.cxx
similarity index 82%
rename from frame/include/level0/1r/bli_copy1rs.h
rename to test/level0/test_trands.cxx
index 0211497f7..034d02e8b 100644
--- a/frame/include/level0/1r/bli_copy1rs.h
+++ b/test/level0/test_trands.cxx
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -14,7 +15,7 @@
     - Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
+    - Neither the name of The University of Texas nor the names of its
       contributors may be used to endorse or promote products derived
       from this software without specific prior written permission.
 
@@ -32,20 +33,12 @@
 
 */
 
-#ifndef BLIS_COPY1RS_H
-#define BLIS_COPY1RS_H
+#include "test_l0.hpp"
 
-// copy1rs
-
-#define bli_ccopy1rs( a, br, bi ) \
-{ \
-	bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \
-}
-
-#define bli_zcopy1rs( a, br, bi ) \
-{ \
-	bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \
-}
-
-#endif
+/******************************************************************************
+ *
+ * rands
+ *
+ *****************************************************************************/
 
+// No tests
diff --git a/test/level0/test_tscal2s.cxx b/test/level0/test_tscal2s.cxx
new file mode 100644
index 000000000..c0ae8ea99
--- /dev/null
+++ b/test/level0/test_tscal2s.cxx
@@ -0,0 +1,535 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * scal2s
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC // scal2s: the y written by bli_tscal2s( a, x, y ) is checked against a * x formed via convert_prec<ctypec>.
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chy,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) /* every a is paired with every x */ \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( a ) * \
+		                           convert_prec<ctypec>( x ) ); /* reference product, converted to ctypey */ \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+\
+		ctypey y; /* left uninitialised; the call below is expected to assign it */ \
+		bli_tscal2s( cha,chx,chy,chc, a, x, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, scal2s ) // NOTE(review): presumably expands GENTFUNC above for each (a, x, y, c) type tuple selected by RC, RC, RC, R - confirm.
+
+#undef GENTFUNC // scal2s_inplace: calls bli_tscal2s with x as both input and output, then checks x against a * x.
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(cha,chx,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for (       auto x : test_values<ctypex>() ) /* non-const copy: x is both source and destination */ \
+	{ \
+		auto x0 = convert<ctypex>( convert_prec<ctypec>( a ) * \
+		                           convert_prec<ctypec>( x ) ); /* reference taken before x is overwritten */ \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+\
+		bli_tscal2s( cha,chx,chx,chc, a, x, x ); \
+\
+		INFO( "x (C++):  " << x0 ); \
+		INFO( "x (BLIS): " << x ); \
+\
+		check<ctypec>( x, x0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, scal2s_inplace ) // NOTE(review): presumably expands GENTFUNC above for each (a, x, c) type triple selected by RC, RC, R - confirm.
+
+#undef GENTFUNC // scal2js: the y written by bli_tscal2js( a, x, y ) is checked against a * conj( x ).
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chy,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) /* every a is paired with every x */ \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( a ) * \
+		                           convert_prec<ctypec>( conj( x ) ) ); /* x is conjugated in the reference */ \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+\
+		ctypey y; /* left uninitialised; the call below is expected to assign it */ \
+		bli_tscal2js( cha,chx,chy,chc, a, x, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, scal2js ) // NOTE(review): presumably expands GENTFUNC above for each (a, x, y, c) type tuple selected by RC, RC, RC, R - confirm.
+
+#undef GENTFUNC // scal2ris: like the scal2s test, but a, x and y reach bli_tscal2ris as separate real()/imag() arguments.
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chy,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) /* every a is paired with every x */ \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( a ) * \
+		                           convert_prec<ctypec>( x ) ); /* reference product, converted to ctypey */ \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+\
+		ctypey y; /* left uninitialised; the call below is expected to assign it */ \
+		bli_tscal2ris( cha,chx,chy,chc, \
+		               real( a ), imag( a ), \
+		               real( x ), imag( x ), \
+		               real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, scal2ris ) // NOTE(review): presumably expands GENTFUNC above for each (a, x, y, c) type tuple selected by RC, RC, RC, R - confirm.
+
+#undef GENTFUNC // scal2jris: like the scal2js test, but a, x and y reach bli_tscal2jris as separate real()/imag() arguments.
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chy,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) /* every a is paired with every x */ \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( a ) * \
+		                           convert_prec<ctypec>( conj( x ) ) ); /* x is conjugated in the reference */ \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+\
+		ctypey y; /* left uninitialised; the call below is expected to assign it */ \
+		bli_tscal2jris( cha,chx,chy,chc, \
+		                real( a ), imag( a ), \
+		                real( x ), imag( x ), \
+		                real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, scal2jris ) // NOTE(review): presumably expands GENTFUNC above for each (a, x, y, c) type tuple selected by RC, RC, RC, R - confirm.
+
+#undef GENTFUNC // scal21es: bli_tscal21es writes the pair (yri, yir); yri is checked against a * x and yir against swapri( conj( a * x ) ).
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chy,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) /* every a is paired with every x */ \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto yri0 = convert<ctypey>( convert_prec<ctypec>( a ) * \
+		                             convert_prec<ctypec>( x ) ); \
+		auto yir0 = swapri( conj( yri0 ) ); /* expected companion, derived from yri0 */ \
+\
+		INFO( "a:          " << a ); \
+		INFO( "x:          " << x ); \
+\
+		ctypey yri, yir; /* left uninitialised; the call below is expected to assign both */ \
+		bli_tscal21es( cha,chx,chy,chc, a, x, yri, yir ); \
+\
+		INFO( "yri (C++):  " << yri0 ); \
+		INFO( "yir (C++):  " << yir0 ); \
+		INFO( "yri (BLIS): " << yri ); \
+		INFO( "yir (BLIS): " << yir ); \
+\
+		check<ctypec>( yri, yri0 ); \
+		check<ctypec>( yir, yir0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, C, C, R, scal21es ) // NOTE(review): presumably expands GENTFUNC above for each (a, x, y, c) type tuple selected by RC, C, C, R - confirm.
+
+#undef GENTFUNC // scal2j1es: like the scal21es test, but the reference product uses conj( x ).
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chy,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) /* every a is paired with every x */ \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto yri0 = convert<ctypey>( convert_prec<ctypec>( a ) * \
+		                             convert_prec<ctypec>( conj( x ) ) ); \
+		auto yir0 = swapri( conj( yri0 ) ); /* expected companion, derived from yri0 */ \
+\
+		INFO( "a:          " << a ); \
+		INFO( "x:          " << x ); \
+\
+		ctypey yri, yir; /* left uninitialised; the call below is expected to assign both */ \
+		bli_tscal2j1es( cha,chx,chy,chc, a, x, yri, yir ); \
+\
+		INFO( "yri (C++):  " << yri0 ); \
+		INFO( "yir (C++):  " << yir0 ); \
+		INFO( "yri (BLIS): " << yri ); \
+		INFO( "yir (BLIS): " << yir ); \
+\
+		check<ctypec>( yri, yri0 ); \
+		check<ctypec>( yir, yir0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, C, C, R, scal2j1es ) // NOTE(review): presumably expands GENTFUNC above for each (a, x, y, c) type tuple selected by RC, C, C, R - confirm.
+
+#undef GENTFUNC // scal21rs: bli_tscal21rs takes a and x whole but y as real( y ), imag( y ); y is checked against a * x.
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chy,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) /* every a is paired with every x */ \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( a ) * \
+		                           convert_prec<ctypec>( x ) ); /* reference product, converted to ctypey */ \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+\
+		ctypey y; /* left uninitialised; the call below is expected to assign it */ \
+		bli_tscal21rs( cha,chx,chy,chc, a, x, real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, C, C, R, scal21rs ) // NOTE(review): presumably expands GENTFUNC above for each (a, x, y, c) type tuple selected by RC, C, C, R - confirm.
+
+#undef GENTFUNC // scal2j1rs: like the scal21rs test, but the reference product uses conj( x ).
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chy,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) /* every a is paired with every x */ \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( a ) * \
+		                           convert_prec<ctypec>( conj( x ) ) ); /* x is conjugated in the reference */ \
+\
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+\
+		ctypey y; /* left uninitialised; the call below is expected to assign it */ \
+		bli_tscal2j1rs( cha,chx,chy,chc, a, x, real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, C, C, R, scal2j1rs ) // NOTE(review): presumably expands GENTFUNC above for each (a, x, y, c) type tuple selected by RC, C, C, R - confirm.
+
+#undef GENTFUNC // scal2bbs_mxn: each type combination is expanded three times through GENTFUNC0, with D = 1, 2 and 5.
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+GENTFUNC0( opname, 1, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+GENTFUNC0( opname, 2, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+GENTFUNC0( opname, 5, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc )
+
+#undef GENTFUNC0 // One test per D; the test name is PASTECH(opname,_,D).
+#define GENTFUNC0( opname, D, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chy,chc,PASTECH(opname,_,D)) \
+( \
+	constexpr auto M = 4; \
+	constexpr auto N = 4; \
+\
+	for ( const auto conjx : { BLIS_CONJUGATE, BLIS_NO_CONJUGATE } ) /* both conjugation settings are exercised */ \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto ymn00 = tile<M,N,ctypey>(); /* receives the reference a * x before bcast */ \
+		      auto ymn = tile<M,D*N,ctypey>(); /* BLIS output; template extents are M and D*N */ \
+\
+		INFO("column-major"); \
+\
+		axpbys_mxn<ctypec,BLIS_TRANSPOSE>( a, bli_is_conj( conjx ) ? conj( xmn ) : xmn, 0.0, ymn00, dense ); \
+		auto ymn0 = bcast<D>( ymn00 ); /* NOTE(review): presumably repeats each reference element D times - confirm */ \
+\
+		INFO( "conjx: " << bli_is_conj( conjx ) ); \
+		INFO( "a:     " << a ); \
+		INFO( "x:\n" << xmn ); \
+\
+		bli_tscal2bbs_mxn( cha,chx,chy,chc, conjx, N, M, &a, &xmn[0][0], 1, N, &ymn[0][0], D, D*N ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypec>( ymn, ymn0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, scal2bbs_mxn ) // NOTE(review): presumably expands GENTFUNC above for each (a, x, y, c) type tuple selected by RC, RC, RC, R - confirm.
+
+#undef GENTFUNC // scal2s_mxn: bli_tscal2s_mxn on 4x4 tiles is checked against axpbys_mxn, once per stride layout (two loops below).
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chy,chc,opname) \
+( \
+	constexpr auto M = 4; \
+	constexpr auto N = 4; \
+\
+	for ( const auto conjx : { BLIS_CONJUGATE, BLIS_NO_CONJUGATE } ) /* first pass: strides (N, 1) */ \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto ymn = tile<M,N,ctypey>(); \
+\
+		INFO( "row-major" ); \
+\
+		auto ymn0 = ymn; /* reference starts as a copy of the initial y tile */ \
+		axpbys_mxn<ctypec,BLIS_NO_TRANSPOSE>( a, bli_is_conj( conjx ) ? conj( xmn ) : xmn, 0.0, ymn0, dense ); \
+\
+		INFO( "conjx: " << bli_is_conj( conjx ) ); \
+		INFO( "a:     " << a ); \
+		INFO( "x:\n" << xmn ); \
+\
+		bli_tscal2s_mxn( cha,chx,chy,chc, conjx, M, N, &a, &xmn[0][0], N, 1, &ymn[0][0], N, 1 ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypec>( ymn, ymn0 ); \
+	} \
+\
+	for ( const auto conjx : { BLIS_CONJUGATE, BLIS_NO_CONJUGATE } ) /* second pass: dimensions passed as N, M with strides (1, N) */ \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto ymn = tile<M,N,ctypey>(); \
+\
+		INFO("column-major"); \
+\
+		auto ymn0 = ymn; /* reference starts as a copy of the initial y tile */ \
+		axpbys_mxn<ctypec,BLIS_TRANSPOSE>( a, bli_is_conj( conjx ) ? conj( xmn ) : xmn, 0.0, ymn0, dense ); \
+\
+		INFO( "conjx: " << bli_is_conj( conjx ) ); \
+		INFO( "a:     " << a ); \
+		INFO( "x:\n" << xmn ); \
+\
+		bli_tscal2s_mxn( cha,chx,chy,chc, conjx, N, M, &a, &xmn[0][0], 1, N, &ymn[0][0], 1, N ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypec>( ymn, ymn0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, scal2s_mxn ) // NOTE(review): presumably expands GENTFUNC above for each (a, x, y, c) type tuple selected by RC, RC, RC, R - confirm.
+
+#undef GENTFUNC /* scal2ris_mxn, "together" case: the output y is a single tile of ctypey elements. */
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chy,chc,opname) \
+( \
+	constexpr auto M = 4; \
+	constexpr auto N = 4; \
+	/* Pass 1: the "row-major" case. */ \
+	for ( const auto conjx : { BLIS_CONJUGATE, BLIS_NO_CONJUGATE } ) \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto ymn = tile<M,N,ctypey>(); \
+\
+		INFO( "row-major" ); \
+		/* Expected result ymn0, built by the C++ helper axpbys_mxn. */ \
+		auto ymn0 = ymn; \
+		axpbys_mxn<ctypec,BLIS_NO_TRANSPOSE>( a, bli_is_conj( conjx ) \
+		                                         ? conj( xmn ) \
+		                                         : xmn, \
+		                                      0.0, ymn0, dense ); \
+\
+		INFO( "conjx: " << bli_is_conj( conjx ) ); \
+		INFO( "a:     " << a ); \
+		INFO( "x:\n" << xmn ); \
+		/* NOTE(review): y strides (2*N, 2) and the trailing 1 look like real-element units with imag = real + 1 -- confirm. */ \
+		bli_tscal2ris_mxn( cha,chx,chy,chc, conjx, \
+		                   M, N, &a, \
+		                   &xmn[0][0], N, 1, \
+		                   &ymn[0][0], 2*N, 2, 1 ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypec>( ymn, ymn0 ); \
+	} \
+	/* Pass 2: the "column-major" case, with dimensions swapped to (N, M) and the strides exchanged. */ \
+	for ( const auto conjx : { BLIS_CONJUGATE, BLIS_NO_CONJUGATE } ) \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto ymn = tile<M,N,ctypey>(); \
+\
+		INFO( "column-major" ); \
+\
+		auto ymn0 = ymn; \
+		axpbys_mxn<ctypec,BLIS_TRANSPOSE>( a, bli_is_conj( conjx ) \
+		                                      ? conj( xmn ) \
+		                                      : xmn, \
+		                                   0.0, ymn0, dense ); \
+\
+		INFO( "conjx: " << bli_is_conj( conjx ) ); \
+		INFO( "a:     " << a ); \
+		INFO( "x:\n" << xmn ); \
+\
+		bli_tscal2ris_mxn( cha,chx,chy,chc, \
+		                   conjx, N, M, &a, \
+		                   &xmn[0][0], 1, N, \
+		                   &ymn[0][0], 2, 2*N, 1 ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypec>( ymn, ymn0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, C, C, R, scal2ris_mxn_together )
+
+#undef GENTFUNC /* scal2ris_mxn, "separate" case: real and imaginary outputs go to two distinct tiles. */
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(cha,chx,chy,chc,opname) \
+( \
+	constexpr auto M = 4; \
+	constexpr auto N = 4; \
+	/* ctypeyr is the element type of the two split output tiles yrmn and yimn. */ \
+	using ctypeyr = make_real_t<ctypey>; \
+\
+	for ( const auto conjx : { BLIS_CONJUGATE, BLIS_NO_CONJUGATE } ) \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto yrmn = tile<M,N,ctypeyr>(); \
+		      auto yimn = tile<M,N,ctypeyr>(); \
+\
+		INFO( "row-major" ); \
+		/* Expected result is computed as one ctypey tile, then split with real()/imag(). */ \
+		auto ymn0 = tile<M,N,ctypey>(); \
+		axpbys_mxn<ctypec,BLIS_NO_TRANSPOSE>( a, bli_is_conj( conjx ) \
+		                                         ? conj( xmn ) \
+		                                         : xmn, \
+		                                      0.0, ymn0, dense ); \
+		auto yrmn0 = real( ymn0 ); \
+		auto yimn0 = imag( ymn0 ); \
+\
+		INFO( "conjx: " << bli_is_conj( conjx ) ); \
+		INFO( "a:     " << a ); \
+		INFO( "x:\n" << xmn ); \
+		/* The last argument is the pointer distance from the real tile to the imaginary tile. */ \
+		bli_tscal2ris_mxn( cha,chx,chy,chc, \
+		                   conjx, M, N, &a, \
+		                   &xmn[0][0], N, 1, \
+		                   &yrmn[0][0], N, 1, \
+		                   &yimn[0][0] - &yrmn[0][0] ); \
+\
+		INFO( "yr (C++):\n" << yrmn0 ); \
+		INFO( "yi (C++):\n" << yimn0 ); \
+		INFO( "yr (BLIS):\n" << yrmn ); \
+		INFO( "yi (BLIS):\n" << yimn ); \
+\
+		check<ctypec>( yrmn, yrmn0 ); \
+		check<ctypec>( yimn, yimn0 ); \
+	} \
+	/* Pass 2: the "column-major" case, with dimensions swapped to (N, M) and strides (1, N). */ \
+	for ( const auto conjx : { BLIS_CONJUGATE, BLIS_NO_CONJUGATE } ) \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto yrmn = tile<M,N,ctypeyr>(); \
+		      auto yimn = tile<M,N,ctypeyr>(); \
+\
+		INFO( "column-major" ); \
+\
+		auto ymn0 = tile<M,N,ctypey>(); \
+		axpbys_mxn<ctypec,BLIS_TRANSPOSE>( a, bli_is_conj( conjx ) \
+		                                      ? conj( xmn ) \
+		                                      : xmn, \
+		                                   0.0, ymn0, dense ); \
+		auto yrmn0 = real( ymn0 ); \
+		auto yimn0 = imag( ymn0 ); \
+\
+		INFO( "conjx: " << bli_is_conj( conjx ) ); \
+		INFO( "a:     " << a ); \
+		INFO( "x:\n" << xmn ); \
+\
+		bli_tscal2ris_mxn( cha,chx,chy,chc, \
+		                   conjx, N, M, &a, \
+		                   &xmn[0][0], 1, N, \
+		                   &yrmn[0][0], 1, N, \
+		                   &yimn[0][0] - &yrmn[0][0] ); \
+\
+		INFO( "yr (C++):\n" << yrmn0 ); \
+		INFO( "yi (C++):\n" << yimn0 ); \
+		INFO( "yr (BLIS):\n" << yrmn ); \
+		INFO( "yi (BLIS):\n" << yimn ); \
+\
+		check<ctypec>( yrmn, yrmn0 ); \
+		check<ctypec>( yimn, yimn0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, C, C, R, scal2ris_mxn_separate )
diff --git a/frame/include/level0/bb/bli_set0bbs_mxn.h b/test/level0/test_tscalcjs.cxx
similarity index 60%
rename from frame/include/level0/bb/bli_set0bbs_mxn.h
rename to test/level0/test_tscalcjs.cxx
index f05121819..4351aa13d 100644
--- a/frame/include/level0/bb/bli_set0bbs_mxn.h
+++ b/test/level0/test_tscalcjs.cxx
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -14,7 +15,7 @@
     - Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
+    - Neither the name of The University of Texas nor the names of its
       contributors may be used to endorse or promote products derived
       from this software without specific prior written permission.
 
@@ -32,43 +33,37 @@
 
 */
 
-#ifndef BLIS_SET0BBS_MXN_H
-#define BLIS_SET0BBS_MXN_H
+#include "test_l0.hpp"
 
-// set0bbs_mxn
+/******************************************************************************
+ *
+ * scalcjs
+ *
+ *****************************************************************************/
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname ) \
-\
-BLIS_INLINE void PASTEMAC(ch,opname) \
-     ( \
-       const dim_t        m, \
-       const dim_t        n, \
-       ctype*    restrict y, const inc_t incy, const inc_t ldy  \
-     ) \
-{ \
-	/* Assume that the duplication factor is the row stride of y. */ \
-	const dim_t d    = incy; \
-	const dim_t ds_y = 1; \
-\
-	for ( dim_t j = 0; j < n; ++j ) \
+#undef GENTFUNC /* scalcjs: in-place y := a * y, where conjx selects whether a is conjugated first. */
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(cha,chx,chc,opname) \
+( \
+	for ( const auto conjx : { BLIS_CONJUGATE, BLIS_NO_CONJUGATE } ) \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
 	{ \
-		ctype* restrict yj = y + j*ldy; \
+		auto y0 = convert<ctypex>( convert_prec<ctypec>( bli_is_conj( conjx ) ? conj( a ) : a ) * \
+		                           convert_prec<ctypec>( x ) ); \
 \
-		for ( dim_t i = 0; i < m; ++i ) \
-		{ \
-			ctype* restrict yij = yj + i*incy; \
+		INFO( "conjx:    " << conjx ); \
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
 \
-			for ( dim_t p = 0; p < d; ++p ) \
-			{ \
-				ctype* restrict yijd = yij + p*ds_y; \
+		ctypex y = x; \
+		bli_tscalcjs( cha,chx,chc, conjx, a, y ); \
 \
-				PASTEMAC(ch,set0s)( *yijd ); \
-			} \
-		} \
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+		/* y started as a copy of x, so it must now equal the reference product y0. */ \
+		check<ctypec>( y, y0 ); \
 	} \
-}
-
-INSERT_GENTFUNC_BASIC( set0bbs_mxn )
+)
 
-#endif
+INSERT_GENTFUNC_MIX3( RC, RC, R, scalcjs )
diff --git a/test/level0/test_tscals.cxx b/test/level0/test_tscals.cxx
new file mode 100644
index 000000000..ffa0e7a2f
--- /dev/null
+++ b/test/level0/test_tscals.cxx
@@ -0,0 +1,277 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * scals
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC /* scals: in-place scaling y := a * y via bli_tscals. */
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(cha,chx,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypex>( convert_prec<ctypec>( a ) * \
+		                           convert_prec<ctypec>( x ) ); \
+		/* y0 above is the expected value; both operands pass through convert_prec<ctypec> first. */ \
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+		/* y starts as a copy of x and is handed to the macro under test. */ \
+		ctypex y = x; \
+		bli_tscals( cha,chx,chc, a, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, scals )
+
+#undef GENTFUNC /* scaljs: in-place scaling of y by the conjugate of a via bli_tscaljs. */
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(cha,chx,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypex>( convert_prec<ctypec>( conj( a ) ) * \
+		                           convert_prec<ctypec>( x ) ); \
+		/* y0 above is the expected value conj( a ) * x. */ \
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+		/* y starts as a copy of x and is handed to the macro under test. */ \
+		ctypex y = x; \
+		bli_tscaljs( cha,chx,chc, a, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, scaljs )
+
+#undef GENTFUNC /* scalris: y := a * y with a and y passed as separate real and imaginary parts. */
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(cha,chx,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypex>( convert_prec<ctypec>( a ) * \
+		                           convert_prec<ctypec>( x ) ); \
+		/* y0 above is the expected value a * x. */ \
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+		/* y starts as a copy of x; its parts are passed via real( y ) and imag( y ). */ \
+		ctypex y = x; \
+		bli_tscalris( cha,chx,chc, \
+		              real( a ), imag( a ), \
+		              real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, scalris )
+
+#undef GENTFUNC /* scaljris: y := conj( a ) * y with a and y passed as separate real and imaginary parts. */
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(cha,chx,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypex>( convert_prec<ctypec>( conj( a ) ) * \
+		                           convert_prec<ctypec>( x ) ); \
+		/* y0 above is the expected value conj( a ) * x. */ \
+		INFO( "a:        " << a ); \
+		INFO( "x:        " << x ); \
+		/* y starts as a copy of x; its parts are passed via real( y ) and imag( y ). */ \
+		ctypex y = x; \
+		bli_tscaljris( cha,chx,chc, \
+		               real( a ), imag( a ), \
+		               real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, scaljris )
+
+#undef GENTFUNC /* scal1es: bli_tscal1es is given x in two forms, xri and xir, and both are checked. */
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(cha,chx,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto xri = x; \
+		auto xir = swapri( conj( x ) ); \
+		/* Expected: xri0 = a * x, and xir0 is derived from xri0 exactly as xir is derived from x. */ \
+		auto xri0 = convert<ctypex>( convert_prec<ctypec>( a ) * \
+		                             convert_prec<ctypec>( x ) ); \
+		auto xir0 = swapri( conj( xri0 ) ); \
+\
+		INFO( "a:          " << a ); \
+		INFO( "x:          " << x ); \
+		INFO( "xri (orig): " << xri ); \
+		INFO( "xir (orig): " << xir ); \
+		/* Macro under test receives both xri and xir; each is compared with its expected value below. */ \
+		bli_tscal1es( cha,chx,chc, a, xri, xir ); \
+\
+		INFO( "xri (C++):  " << xri0 ); \
+		INFO( "xir (C++):  " << xir0 ); \
+		INFO( "xri (BLIS): " << xri ); \
+		INFO( "xir (BLIS): " << xir ); \
+\
+		check<ctypec>( xri, xri0 ); \
+		check<ctypec>( xir, xir0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, C, R, scal1es )
+
+#undef GENTFUNC /* scal1rs: scale x in place through its separately passed real and imaginary parts. */
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(cha,chx,chc,opname) \
+( \
+	for ( const auto a : test_values<ctypea>() ) \
+	for (       auto x : test_values<ctypex>() ) \
+	{ \
+		auto x0 = convert<ctypex>( convert_prec<ctypec>( a ) * \
+		                           convert_prec<ctypec>( x ) ); \
+		/* x0 above is the expected value a * x; x is a non-const loop copy so it can be modified. */ \
+		INFO( "a:        " << a ); \
+		INFO( "x (orig): " << x ); \
+		/* Macro under test receives real( x ) and imag( x ); x is compared with x0 afterwards. */ \
+		bli_tscal1rs( cha,chx,chc, a, real( x ), imag( x ) ); \
+\
+		INFO( "x (C++):  " << x0 ); \
+		INFO( "x (BLIS): " << x ); \
+\
+		check<ctypec>( x, x0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, C, R, scal1rs )
+
+#undef GENTFUNC /* scalris_mxn_uplo: bli_tscalris_mxn_uplo vs. a reference restricted by an uplo/diagoff predicate. */
+#define GENTFUNC( opname, ctypea, cha, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(cha,chx,chc,opname) \
+( \
+	constexpr auto M = 4; \
+	constexpr auto N = 4; \
+	/* Pass 1: the "row-major" case, over both uplo values and diagonal offsets -1, 0, 1. */ \
+	for ( const uplo_t uplo : { BLIS_UPPER, BLIS_LOWER } ) \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto diagoff : { -1, 0, 1 } ) \
+	{ \
+		auto xmn = tile<M,N>( x ); \
+\
+		INFO( "row-major" ); \
+		/* func is the element predicate given to the reference: is_above for BLIS_UPPER, else is_below. */ \
+		std::function<bool(size_t,size_t)> func = is_below( diagoff ); \
+		if ( uplo == BLIS_UPPER ) func = is_above( diagoff ); \
+\
+		auto xmn0 = xmn; \
+		axpbys_mxn<ctypec,BLIS_NO_TRANSPOSE>( a, xmn, 0.0, xmn0, func ); \
+\
+		INFO( "upper:   " << ( uplo == BLIS_UPPER ) ); \
+		INFO( "diagoff: " << diagoff ); \
+		INFO( "a: " << a ); \
+		INFO( "x (init):\n" << xmn ); \
+		/* NOTE(review): '&real( . )+1' is presumably the imaginary part; for real types it would be one-past-the-end -- confirm it is never dereferenced. */ \
+		bli_tscalris_mxn_uplo( cha,chx,chc, uplo, diagoff, M, N, \
+		                       &real( a ), &real( a )+1, \
+		                       &real( xmn[0][0] ), &real( xmn[0][0] )+1, \
+		                       &real( xmn[1][0] ) - &real( xmn[0][0] ), \
+		                       &real( xmn[0][1] ) - &real( xmn[0][0] ) ); \
+\
+		INFO( "x (C++):\n" << xmn0 ); \
+		INFO( "x (BLIS):\n" << xmn ); \
+\
+		check<ctypec>( xmn, xmn0 ); \
+	} \
+	/* Pass 2: the "column-major" case; the reference is built with BLIS_TRANSPOSE. */ \
+	for ( const uplo_t uplo : { BLIS_UPPER, BLIS_LOWER } ) \
+	for ( const auto a : test_values<ctypea>() ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto diagoff : { -1, 0, 1 } ) \
+	{ \
+		auto xmn = tile<M,N>( x ); \
+\
+		INFO( "column-major" ); \
+\
+		std::function<bool(size_t,size_t)> func = is_below( diagoff ); \
+		if ( uplo == BLIS_UPPER ) func = is_above( diagoff ); \
+\
+		auto xmn0 = xmn; \
+		axpbys_mxn<ctypec,BLIS_TRANSPOSE>( a, xmn, 0.0, xmn0, func ); \
+\
+		INFO( "upper:   " << ( uplo == BLIS_UPPER ) ); \
+		INFO( "diagoff: " << diagoff ); \
+		INFO( "a: " << a ); \
+		INFO( "x (init):\n" << xmn ); \
+		/* Same call as in pass 1, but with (N, M) and the two pointer-difference strides exchanged. */ \
+		bli_tscalris_mxn_uplo( cha,chx,chc, uplo, diagoff, N, M, \
+		                       &real( a ), &real( a )+1, \
+		                       &real( xmn[0][0] ), &real( xmn[0][0] )+1, \
+		                       &real( xmn[0][1] ) - &real( xmn[0][0] ), \
+		                       &real( xmn[1][0] ) - &real( xmn[0][0] ) ); \
+\
+		INFO( "x (C++):\n" << xmn0 ); \
+		INFO( "x (BLIS):\n" << xmn ); \
+\
+		check<ctypec>( xmn, xmn0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, scalris_mxn_uplo )
diff --git a/test/level0/test_tsets.cxx b/test/level0/test_tsets.cxx
new file mode 100644
index 000000000..445e648e3
--- /dev/null
+++ b/test/level0/test_tsets.cxx
@@ -0,0 +1,361 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * sets
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC /* sets: bli_tsets assigns y from the real and imaginary parts of x. */
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( x ); \
+		/* y0 above is the expected value: x converted to ctypey. */ \
+		INFO( "x:        " << x ); \
+		/* y has no initializer; the check below relies on bli_tsets assigning it. */ \
+		ctypey y; \
+		bli_tsets( chx,chy, real( x ), imag( x ), y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, sets )
+
+#undef GENTFUNC /* setrs: bli_tsetrs is expected to replace only the real part of y with real( x ). */
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = y; \
+		real( y0 ) = convert_prec<ctypey>( real( x ) ); \
+		/* y0 above: a copy of the initial y whose real part has been overwritten. */ \
+		INFO( "x:        " << x ); \
+		/* y is a non-const loop copy so the macro under test can modify it. */ \
+		bli_tsetrs( chx,chy, real( x ), y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, setrs )
+
+#undef GENTFUNC /* setis: bli_tsetis is expected to replace only the imaginary part of y with imag( x ). */
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = y; \
+		imag( y0 ) = convert_prec<ctypey>( imag( x ) ); \
+		/* y0 above: a copy of the initial y whose imaginary part has been overwritten. */ \
+		INFO( "x:        " << x ); \
+		/* y is a non-const loop copy so the macro under test can modify it. */ \
+		bli_tsetis( chx,chy, imag( x ), y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, setis )
+
+#undef GENTFUNC /* setris: bli_tsetris assigns y from x, with both passed as separate real/imaginary parts. */
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( x ); \
+		/* y0 above is the expected value: x converted to ctypey. */ \
+		INFO( "x:        " << x ); \
+		/* y has no initializer; the check below relies on bli_tsetris assigning it. */ \
+		ctypey y; \
+		bli_tsetris( chx,chy, \
+		             real( x ), imag( x ), \
+		             real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, setris )
+
+#undef GENTFUNC /* set0s: after bli_tset0s, y must equal 0.0 converted to ctypey. */
+#define GENTFUNC( opname, ctypey, chy ) \
+UNIT_TEST(chy,opname) \
+( \
+	for ( auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( 0.0 ); \
+		/* y is a non-const loop copy holding an arbitrary test value before the call. */ \
+		INFO( "y (init): " << y ); \
+\
+		bli_tset0s( chy, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, set0s )
+
+#undef GENTFUNC /* set1s: after bli_tset1s, y must equal 1.0 converted to ctypey. */
+#define GENTFUNC( opname, ctypey, chy ) \
+UNIT_TEST(chy,opname) \
+( \
+	for ( auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( 1.0 ); \
+		/* y is a non-const loop copy holding an arbitrary test value before the call. */ \
+		INFO( "y (init): " << y ); \
+\
+		bli_tset1s( chy, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, set1s )
+
+#undef GENTFUNC /* setr0s: bli_tsetr0s is expected to zero the real part of y and leave the rest as it was. */
+#define GENTFUNC( opname, ctypey, chy ) \
+UNIT_TEST(chy,opname) \
+( \
+	for ( auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = y; \
+		real( y0 ) = convert_prec<ctypey>( 0.0 ); \
+		/* y0 above: a copy of the initial y whose real part has been set to zero. */ \
+		INFO( "y (init): " << y ); \
+\
+		bli_tsetr0s( chy, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, setr0s )
+
+#undef GENTFUNC /* seti0s: bli_tseti0s is expected to zero the imaginary part of y and leave the rest as it was. */
+#define GENTFUNC( opname, ctypey, chy ) \
+UNIT_TEST(chy,opname) \
+( \
+	for ( auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = y; \
+		imag( y0 ) = convert_prec<ctypey>( 0.0 ); \
+		/* y0 above: a copy of the initial y whose imaginary part has been set to zero. */ \
+		INFO( "y (init): " << y ); \
+\
+		bli_tseti0s( chy, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, seti0s )
+
+#undef GENTFUNC /* set0ris: after bli_tset0ris on real( y ) and imag( y ), y must equal 0.0 converted to ctypey. */
+#define GENTFUNC( opname, ctypey, chy ) \
+UNIT_TEST(chy,opname) \
+( \
+	for ( auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( 0.0 ); \
+		/* y is a non-const loop copy; its parts are passed separately to the macro under test. */ \
+		bli_tset0ris( chy, real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, set0ris )
+
+#undef GENTFUNC /* set0s_mxn: bli_tset0s_mxn must turn a tile filled with y into an all-zero tile. */
+#define GENTFUNC( opname, ctypey, chy ) \
+UNIT_TEST(chy,opname) \
+( \
+	constexpr auto M = 4; \
+	constexpr auto N = 4; \
+	/* Pass 1: dimensions (M, N) with strides (N, 1). */ \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		auto ymn = tile<M,N>( y ); \
+\
+		INFO( "row-major" ); \
+		/* Expected result: a tile built from 0.0 converted to ctypey. */ \
+		auto ymn0 = tile<M,N>( convert<ctypey>( 0.0 ) ); \
+\
+		INFO( "y (init):\n" << ymn); \
+\
+		bli_tset0s_mxn( chy, M, N, &ymn[0][0], N, 1 ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypey>( ymn, ymn0 ); \
+	} \
+	/* Pass 2: dimensions swapped to (N, M) with strides (1, N). */ \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		auto ymn = tile<M,N>( y ); \
+\
+		INFO( "column-major" ); \
+\
+		auto ymn0 = tile<M,N>( convert<ctypey>( 0.0 ) ); \
+\
+		INFO( "y (init):\n" << ymn ); \
+\
+		bli_tset0s_mxn( chy, N, M, &ymn[0][0], 1, N ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypey>( ymn, ymn0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, set0s_mxn )
+
+#undef GENTFUNC /* set0bbs_mxn: one test per D in {1, 2, 5}; GENTFUNC0 (defined below) is expanded only at the INSERT line. */
+#define GENTFUNC( opname, ctypey, chy ) \
+GENTFUNC0( opname, 1, ctypey, chy ) \
+GENTFUNC0( opname, 2, ctypey, chy ) \
+GENTFUNC0( opname, 5, ctypey, chy )
+
+#undef GENTFUNC0
+#define GENTFUNC0( opname, D, ctypey, chy ) \
+UNIT_TEST(chy,PASTECH(opname,_,D)) \
+( \
+	constexpr auto M = 4; \
+	constexpr auto N = 4; \
+	/* The tile is M x D*N; D is also passed to BLIS as the first stride of y (presumably the broadcast factor -- confirm). */ \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		auto ymn = tile<M,D*N>( y ); \
+\
+		INFO( "column-major" ); \
+		/* Expected result: every element of the M x D*N tile is zero. */ \
+		auto ymn0 = tile<M,D*N>( convert<ctypey>( 0.0 ) ); \
+\
+		INFO( "y (init):\n" << ymn ); \
+\
+		bli_tset0bbs_mxn( chy, N, M, &ymn[0][0], D, D*N ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypey>( ymn, ymn0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, set0bbs_mxn )
+
+#undef GENTFUNC /* set0s_edge: eight (M, N) combinations, including zero extents; GENTFUNC0 is defined below. */
+#define GENTFUNC( opname, ctypey, chy ) \
+GENTFUNC0( opname, 10, 10, ctypey, chy ) \
+GENTFUNC0( opname, 10, 4, ctypey, chy ) \
+GENTFUNC0( opname, 4, 10, ctypey, chy ) \
+GENTFUNC0( opname, 10, 0, ctypey, chy ) \
+GENTFUNC0( opname, 0, 10, ctypey, chy ) \
+GENTFUNC0( opname, 4, 0, ctypey, chy ) \
+GENTFUNC0( opname, 0, 4, ctypey, chy ) \
+GENTFUNC0( opname, 0, 0, ctypey, chy )
+
+#undef GENTFUNC0
+#define GENTFUNC0( opname, M, N, ctypey, chy ) \
+UNIT_TEST(chy,PASTECH(opname,_,M,_,N)) \
+( \
+	constexpr auto M0 = 10; \
+	constexpr auto N0 = 10; \
+	/* The full tile is always M0 x N0; M and N come from the GENTFUNC0 arguments. */ \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		auto ymn = tile<M0,N0>( y ); \
+\
+		INFO( "column-major" ); \
+		/* Expected: an all-zero M0 x N0 tile whose leading M x N corner keeps the initial value y. */ \
+		auto ymn0 = tile<M0,N0>( convert<ctypey>( 0.0 ) ); \
+		for ( auto i = 0; i < M; i++ ) \
+		for ( auto j = 0; j < N; j++ ) \
+			ymn0[i][j] = y; \
+\
+		INFO( "y (init):\n" << ymn ); \
+		/* NOTE(review): arguments are (N, N0, M, M0, ptr, N0) -- presumably (used, total) extents per dimension plus a leading dimension; confirm. */ \
+		bli_tset0s_edge( chy, N, N0, M, M0, &ymn[0][0], N0 ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypey>( ymn, ymn0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, set0s_edge )
diff --git a/test/level0/test_tsqrt2s.cxx b/test/level0/test_tsqrt2s.cxx
new file mode 100644
index 000000000..840531477
--- /dev/null
+++ b/test/level0/test_tsqrt2s.cxx
@@ -0,0 +1,86 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * sqrt2s
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC /* sqrt2s: bli_tsqrt2s writes into y a value checked against square_root( x ). */
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( square_root( convert_prec<ctypec>( x ) ) ); \
+		/* y has no initializer; the check below relies on bli_tsqrt2s assigning it. */ \
+		ctypey y; \
+		bli_tsqrt2s( chx,chy,chc, x, y ); \
+\
+		INFO( "x:        " << x ); \
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( R, R, R, sqrt2s )
+
+#undef GENTFUNC /* sqrt2ris: same check as for square_root( x ), with x and y passed as real/imaginary parts. */
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	{ \
+		auto y0 = convert<ctypey>( square_root( convert_prec<ctypec>( x ) ) ); \
+		/* y has no initializer; the check below relies on bli_tsqrt2ris assigning it. */ \
+		ctypey y; \
+		bli_tsqrt2ris( chx,chy,chc, \
+		               real( x ), imag( x ), \
+		               real( y ), imag( y ) ); \
+\
+		INFO( "x:        " << x ); \
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+\
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( R, R, R, sqrt2ris )
diff --git a/test/level0/test_tsubs.cxx b/test/level0/test_tsubs.cxx
new file mode 100644
index 000000000..3613c3245
--- /dev/null
+++ b/test/level0/test_tsubs.cxx
@@ -0,0 +1,164 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * subs
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( y ) - \
+		                           convert_prec<ctypec>( x ) ); \
+		/* y0 is the C++ reference for y - x; log the inputs before y is passed to bli_tsubs. */ \
+		INFO( "x:        " << x ); \
+		INFO( "y (init): " << y ); \
+\
+		bli_tsubs( chx,chy,chc, x, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+		/* Hand the BLIS result and the C++ reference to check<ctypec>(). */ \
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, subs )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypec, chc ) \
+UNIT_TEST(chx,chx,chc,opname) \
+( \
+	for ( auto x : test_values<ctypex>() ) \
+	{ \
+		auto x0 = convert<ctypex>( convert_prec<ctypec>( x ) - \
+		                           convert_prec<ctypec>( x ) ); \
+		/* x0 is the C++ reference for x - x; the BLIS call below passes x as both operands. */ \
+		INFO( "x:        " << x ); \
+\
+		bli_tsubs( chx,chx,chc, x, x ); \
+\
+		INFO( "x (C++):  " << x0 ); \
+		INFO( "x (BLIS): " << x ); \
+		/* Hand the BLIS result and the C++ reference to check<ctypec>(). */ \
+		check<ctypec>( x, x0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, R, subs_inplace )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>(       convert_prec<ctypec>( y ) - \
+		                           conj( convert_prec<ctypec>( x ) ) ); \
+		/* y0 is the C++ reference for y - conj(x); log the inputs before y is passed to bli_tsubjs. */ \
+		INFO( "x:        " << x ); \
+		INFO( "y (init): " << y ); \
+\
+		bli_tsubjs( chx,chy,chc, x, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+		/* Hand the BLIS result and the C++ reference to check<ctypec>(). */ \
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, subjs )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( y ) - \
+		                           convert_prec<ctypec>( x ) ); \
+		/* y0 is the C++ reference for y - x; the BLIS macro below takes separate real/imag parts. */ \
+		INFO( "x:        " << x ); \
+		INFO( "y (init): " << y ); \
+\
+		bli_tsubris( chx,chy,chc, \
+		             real( x ), imag( x ), \
+		             real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+		/* Hand the BLIS result and the C++ reference to check<ctypec>(). */ \
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, subris )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>(       convert_prec<ctypec>( y ) - \
+		                           conj( convert_prec<ctypec>( x ) ) ); \
+		/* y0 is the C++ reference for y - conj(x); the BLIS macro below takes separate real/imag parts. */ \
+		INFO( "x:        " << x ); \
+		INFO( "y (init): " << y ); \
+\
+		bli_tsubjris( chx,chy,chc, \
+		              real( x ), imag( x ), \
+		              real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+		/* Hand the BLIS result and the C++ reference to check<ctypec>(). */ \
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, subjris )
diff --git a/test/level0/test_tswaps.cxx b/test/level0/test_tswaps.cxx
new file mode 100644
index 000000000..9a0ff8103
--- /dev/null
+++ b/test/level0/test_tswaps.cxx
@@ -0,0 +1,98 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * swaps
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( auto x : test_values<ctypex>() ) \
+	for ( auto y : test_values<ctypey>() ) \
+	{ \
+		auto x0 = convert<ctypex>( y ); \
+		auto y0 = convert<ctypey>( x ); \
+		/* x0/y0 are the expected post-swap values: each input converted to the other operand's type. */ \
+		INFO( "x (init): " << x ); \
+		INFO( "y (init): " << y ); \
+\
+		bli_tswaps( chx,chy, x, y ); \
+\
+		INFO( "x (C++):  " << x0 ); \
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "x (BLIS): " << x ); \
+		INFO( "y (BLIS): " << y ); \
+		/* Each operand is checked in its own type against its expected value. */ \
+		check<ctypex>( x, x0 ); \
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, swaps )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypey, chy ) \
+UNIT_TEST(chx,chy,opname) \
+( \
+	for ( auto x : test_values<ctypex>() ) \
+	for ( auto y : test_values<ctypey>() ) \
+	{ \
+		auto x0 = convert<ctypex>( y ); \
+		auto y0 = convert<ctypey>( x ); \
+		/* x0/y0 are the expected post-swap values: each input converted to the other operand's type. */ \
+		INFO( "x (init): " << x ); \
+		INFO( "y (init): " << y ); \
+\
+		bli_tswapris( chx,chy, \
+		              real( x ), imag( x ), \
+		              real( y ), imag( y ) ); \
+\
+		INFO( "x (C++):  " << x0 ); \
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "x (BLIS): " << x ); \
+		INFO( "y (BLIS): " << y ); \
+		/* Each operand is checked in its own type against its expected value. */ \
+		check<ctypex>( x, x0 ); \
+		check<ctypey>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX2( RC, RC, swapris )
diff --git a/test/level0/test_txpbys.cxx b/test/level0/test_txpbys.cxx
new file mode 100644
index 000000000..1d0dbfb43
--- /dev/null
+++ b/test/level0/test_txpbys.cxx
@@ -0,0 +1,311 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "test_l0.hpp"
+
+/******************************************************************************
+ *
+ * xpbys
+ *
+ *****************************************************************************/
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypeb, chb, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chb,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto b : test_values<ctypeb>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( x ) + \
+		                           convert_prec<ctypec>( b ) * \
+		                           convert_prec<ctypec>( y ) ); \
+		/* y0 is the C++ reference for x + b * y; log the inputs before y is passed to bli_txpbys. */ \
+		INFO( "x:        " << x ); \
+		INFO( "b:        " << b ); \
+		INFO( "y (init): " << y ); \
+\
+		bli_txpbys( chx,chb,chy,chc, x, b, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+		/* Hand the BLIS result and the C++ reference to check<ctypec>(). */ \
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, xpbys )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypeb, chb, ctypec, chc ) \
+UNIT_TEST(chx,chb,chx,chc,opname) \
+( \
+	for (       auto x : test_values<ctypex>() ) \
+	for ( const auto b : test_values<ctypeb>() ) \
+	{ \
+		auto x0 = convert<ctypex>( convert_prec<ctypec>( x ) + \
+		                           convert_prec<ctypec>( b ) * \
+		                           convert_prec<ctypec>( x ) ); \
+		/* x0 is the C++ reference for x + b * x; the BLIS call below passes x as both x and y. */ \
+		INFO( "x:        " << x ); \
+		INFO( "b:        " << b ); \
+\
+		bli_txpbys( chx,chb,chx,chc, x, b, x ); \
+\
+		INFO( "x (C++):  " << x0 ); \
+		INFO( "x (BLIS): " << x ); \
+		/* Hand the BLIS result and the C++ reference to check<ctypec>(). */ \
+		check<ctypec>( x, x0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX3( RC, RC, R, xpbys_inplace )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypeb, chb, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chb,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto b : test_values<ctypeb>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( conj( convert_prec<ctypec>( x ) ) + \
+		                                 convert_prec<ctypec>( b ) * \
+		                                 convert_prec<ctypec>( y ) ); \
+		/* y0 is the C++ reference for conj(x) + b * y; log the inputs before y is passed to bli_txpbyjs. */ \
+		INFO( "x:        " << x ); \
+		INFO( "b:        " << b ); \
+		INFO( "y (init): " << y ); \
+\
+		bli_txpbyjs( chx,chb,chy,chc, x, b, y ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+		/* Hand the BLIS result and the C++ reference to check<ctypec>(). */ \
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, xpbyjs )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypeb, chb, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chb,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto b : test_values<ctypeb>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( convert_prec<ctypec>( x ) + \
+		                           convert_prec<ctypec>( b ) * \
+		                           convert_prec<ctypec>( y ) ); \
+		/* y0 is the C++ reference for x + b * y; the BLIS macro below takes separate real/imag parts. */ \
+		INFO( "x:        " << x ); \
+		INFO( "b:        " << b ); \
+		INFO( "y (init): " << y ); \
+\
+		bli_txpbyris( chx,chb,chy,chc, \
+		              real( x ), imag( x ), \
+		              real( b ), imag( b ), \
+		              real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+		/* Hand the BLIS result and the C++ reference to check<ctypec>(). */ \
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, xpbyris )
+
+// txpbyjris
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypeb, chb, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chb,chy,chc,opname) \
+( \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto b : test_values<ctypeb>() ) \
+	for (       auto y : test_values<ctypey>() ) \
+	{ \
+		auto y0 = convert<ctypey>( conj( convert_prec<ctypec>( x ) ) + \
+		                                 convert_prec<ctypec>( b ) * \
+		                                 convert_prec<ctypec>( y ) ); \
+		/* y0 is the C++ reference for conj(x) + b * y; the BLIS macro below takes separate real/imag parts. */ \
+		INFO( "x:        " << x ); \
+		INFO( "b:        " << b ); \
+		INFO( "y (init): " << y ); \
+\
+		bli_txpbyjris( chx,chb,chy,chc, \
+		               real( x ), imag( x ), \
+		               real( b ), imag( b ), \
+		               real( y ), imag( y ) ); \
+\
+		INFO( "y (C++):  " << y0 ); \
+		INFO( "y (BLIS): " << y ); \
+		/* Hand the BLIS result and the C++ reference to check<ctypec>(). */ \
+		check<ctypec>( y, y0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, xpbyjris )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypeb, chb, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chb,chy,chc,opname) \
+( \
+	constexpr auto M = 4; \
+	constexpr auto N = 4; \
+	/* First pass: bli_txpbys_mxn is called with strides (N, 1) on tile<M,N>() operands. */ \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto b : test_values<ctypeb>() ) \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto ymn = tile<M,N>( y ); \
+\
+		INFO( "row-major" ); \
+		/* ymn0 is a copy of ymn updated by the C++ helper axpbys_mxn; it serves as the reference. */ \
+		auto ymn0 = ymn; \
+		axpbys_mxn<ctypec,BLIS_NO_TRANSPOSE>( 1.0, xmn, b, ymn0, dense ); \
+\
+		INFO( "x:\n" << xmn ); \
+		INFO( "b: " << b ); \
+		INFO( "y (init):\n" << ymn ); \
+\
+		bli_txpbys_mxn( chx,chb,chy,chc, M, N, &xmn[0][0], N, 1, &b, &ymn[0][0], N, 1 ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+		/* Hand the BLIS result and the C++ reference to check<ctypec>(). */ \
+		check<ctypec>( ymn, ymn0 ); \
+	} \
+	/* Second pass: same check with strides (1, N), swapped dimensions, and a BLIS_TRANSPOSE reference. */ \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto b : test_values<ctypeb>() ) \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto ymn = tile<M,N>( y ); \
+\
+		INFO( "column-major" ); \
+\
+		auto ymn0 = ymn; \
+		axpbys_mxn<ctypec,BLIS_TRANSPOSE>( 1.0, xmn, b, ymn0, dense ); \
+\
+		INFO( "x:\n" << xmn ); \
+		INFO( "b: " << b ); \
+		INFO( "y (init):\n" << ymn ); \
+\
+		bli_txpbys_mxn( chx,chb,chy,chc, N, M, &xmn[0][0], 1, N, &b, &ymn[0][0], 1, N ); \
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+		/* Hand the BLIS result and the C++ reference to check<ctypec>(). */ \
+		check<ctypec>( ymn, ymn0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, xpbys_mxn )
+
+#undef GENTFUNC
+#define GENTFUNC( opname, ctypex, chx, ctypeb, chb, ctypey, chy, ctypec, chc ) \
+UNIT_TEST(chx,chb,chy,chc,opname) \
+( \
+	constexpr auto M = 4; \
+	constexpr auto N = 4; \
+	/* First pass: strides (N, 1), iterating both uplo values and diagonal offsets -1, 0, 1. */ \
+	for ( const uplo_t uplo : { BLIS_UPPER, BLIS_LOWER } ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto b : test_values<ctypeb>() ) \
+	for ( const auto y : test_values<ctypey>() ) \
+	for ( const auto diagoff : { -1, 0, 1 } ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto ymn = tile<M,N>( y ); \
+\
+		INFO( "row-major" ); \
+		/* Element predicate given to the reference helper: is_above for BLIS_UPPER, otherwise is_below. */ \
+		std::function<bool(size_t,size_t)> func = is_below( diagoff ); \
+		if ( uplo == BLIS_UPPER ) func = is_above( diagoff ); \
+\
+		auto ymn0 = ymn; \
+		axpbys_mxn<ctypec,BLIS_NO_TRANSPOSE>( 1.0, xmn, b, ymn0, func ); \
+\
+		INFO( "upper:   " << ( uplo == BLIS_UPPER ) ); \
+		INFO( "diagoff: " << diagoff ); \
+		INFO( "x:\n" << xmn ); \
+		INFO( "b: " << b ); \
+		INFO( "y (init):\n" << ymn ); \
+\
+		bli_txpbys_mxn_uplo( chx,chb,chy,chc, diagoff, uplo, M, N, &xmn[0][0], N, 1, &b, &ymn[0][0], N, 1 ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+		/* Hand the BLIS result and the C++ reference to check<ctypec>(). */ \
+		check<ctypec>( ymn, ymn0 ); \
+	} \
+	/* Second pass: same check with strides (1, N), swapped dimensions, and a BLIS_TRANSPOSE reference. */ \
+	for ( const uplo_t uplo : { BLIS_UPPER, BLIS_LOWER } ) \
+	for ( const auto x : test_values<ctypex>() ) \
+	for ( const auto b : test_values<ctypeb>() ) \
+	for ( const auto y : test_values<ctypey>() ) \
+	for ( const auto diagoff : { -1, 0, 1 } ) \
+	{ \
+		const auto xmn = tile<M,N>( x ); \
+		      auto ymn = tile<M,N>( y ); \
+\
+		INFO( "column-major" ); \
+\
+		std::function<bool(size_t,size_t)> func = is_below( diagoff ); \
+		if ( uplo == BLIS_UPPER ) func = is_above( diagoff ); \
+\
+		auto ymn0 = ymn; \
+		axpbys_mxn<ctypec,BLIS_TRANSPOSE>( 1.0, xmn, b, ymn0, func ); \
+\
+		INFO( "upper:   " << ( uplo == BLIS_UPPER ) ); \
+		INFO( "diagoff: " << diagoff ); \
+		INFO( "x:\n" << xmn ); \
+		INFO( "b: " << b ); \
+		INFO( "y (init):\n" << ymn ); \
+\
+		bli_txpbys_mxn_uplo( chx,chb,chy,chc, diagoff, uplo, N, M, &xmn[0][0], 1, N, &b, &ymn[0][0], 1, N ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+		/* Hand the BLIS result and the C++ reference to check<ctypec>(). */ \
+		check<ctypec>( ymn, ymn0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX4( RC, RC, RC, R, xpbys_mxn_uplo )
diff --git a/testsuite/src/test_amaxv.c b/testsuite/src/test_amaxv.c
index 6d2588851..3bed62a8a 100644
--- a/testsuite/src/test_amaxv.c
+++ b/testsuite/src/test_amaxv.c
@@ -413,17 +413,17 @@ void PASTEMAC(ch,varname) \
 	   the behavior of netlib BLAS's i?amax() routines. */ \
 	if ( bli_zero_dim1( n ) ) \
 	{ \
-		PASTEMAC(i,copys)( *zero_i, *index ); \
+		bli_tcopys( i,i, *zero_i, *index ); \
 		return; \
 	} \
 \
 	/* Initialize the index of the maximum absolute value to zero. */ \
-	PASTEMAC(i,copys)( *zero_i, index_l ); \
+	bli_tcopys( i,i, *zero_i, index_l ); \
 \
 	/* Initialize the maximum absolute value search candidate with
 	   -1, which is guaranteed to be less than all values we will
 	   compute. */ \
-	PASTEMAC(chr,copys)( *minus_one, abs_chi1_max ); \
+	bli_tcopys( chr,chr, *minus_one, abs_chi1_max ); \
 \
 	{ \
 		for ( i = 0; i < n; ++i ) \
@@ -431,23 +431,23 @@ void PASTEMAC(ch,varname) \
 			ctype* chi1 = x + (i  )*incx; \
 \
 			/* Get the real and imaginary components of chi1. */ \
-			PASTEMAC(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
+			bli_tgets( ch,chr, *chi1, chi1_r, chi1_i ); \
 \
 			/* Replace chi1_r and chi1_i with their absolute values. */ \
-			PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \
-			PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \
+			bli_tabval2s( chr,chr,chr, chi1_r, chi1_r ); \
+			bli_tabval2s( chr,chr,chr, chi1_i, chi1_i ); \
 \
 			/* Add the real and imaginary absolute values together. */ \
-			PASTEMAC(chr,set0s)( abs_chi1 ); \
-			PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \
-			PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \
+			bli_tset0s( chr, abs_chi1 ); \
+			bli_tadds( chr,chr,chr, chi1_r, abs_chi1 ); \
+			bli_tadds( chr,chr,chr, chi1_i, abs_chi1 ); \
 \
 			/* If the absolute value of the current element exceeds that of
 			   the previous largest, save it and its index. If NaN is
 			   encountered, then treat it the same as if it were a valid
 			   value that was smaller than any previously seen. This
 			   behavior mimics that of LAPACK's ?lange(). */ \
-			if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
+			if ( abs_chi1_max < abs_chi1 || PASTEMAC(chr,isnan)( abs_chi1 ) ) \
 			{ \
 				abs_chi1_max = abs_chi1; \
 				index_l       = i; \
@@ -456,7 +456,7 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 	/* Store the final index to the output variable. */ \
-	PASTEMAC(i,copys)( index_l, *index ); \
+	bli_tcopys( i,i, index_l, *index ); \
 }
 
 INSERT_GENTFUNCR_BASIC( amaxv_test )
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index aed0cd817..35bac9d49 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -1345,10 +1345,10 @@ char* libblis_test_get_string_for_result( double    resid,
 	char* r_val;
 
 	// Before checking against the thresholds, make sure the residual is
-	// neither NaN nor Inf. (Note that bli_isnan() and bli_isinf() are
+	// neither NaN nor Inf. (Note that bli_disnan() and bli_disinf() are
 	// both simply wrappers to the isnan() and isinf() macros defined
 	// defined in math.h.)
-	if ( bli_isnan( resid ) || bli_isinf( resid ) )
+	if ( bli_disnan( resid ) || bli_disinf( resid ) )
 	{
 		r_val = libblis_test_fail_string;
 	}
diff --git a/testsuite/src/test_randm.c b/testsuite/src/test_randm.c
index 8742695c6..fafd0761d 100644
--- a/testsuite/src/test_randm.c
+++ b/testsuite/src/test_randm.c
@@ -325,7 +325,7 @@ void PASTEMAC(ch,varname)( \
 	ctype_r  sum; \
 	dim_t    i, j; \
 \
-	PASTEMAC(chr,set0s)( sum ); \
+	bli_tset0s( chr, sum ); \
 \
 	for ( j = 0; j < n; j++ ) \
 	{ \
@@ -333,12 +333,12 @@ void PASTEMAC(ch,varname)( \
 		{ \
 			ctype* chi1 = x_cast + (i  )*rs_x + (j  )*cs_x; \
 \
-			PASTEMAC(ch,chr,abval2s)( *chi1, abs_chi1 ); \
-			PASTEMAC(chr,chr,adds)( abs_chi1, sum ); \
+			bli_tabval2s( ch,chr,chr, *chi1, abs_chi1 ); \
+			bli_tadds( chr,chr,chr, abs_chi1, sum ); \
 		} \
 	} \
 \
-	PASTEMAC(chr,chr,copys)( sum, *sum_x_cast ); \
+	bli_tcopys( chr,chr, sum, *sum_x_cast ); \
 }
 
 INSERT_GENTFUNCR_BASIC( absumm )
diff --git a/testsuite/src/test_setm.c b/testsuite/src/test_setm.c
index 80cebd64e..966b69735 100644
--- a/testsuite/src/test_setm.c
+++ b/testsuite/src/test_setm.c
@@ -181,7 +181,7 @@ void libblis_test_setm_experiment
 	// Randomize x.
 	libblis_test_mobj_randomize( params, FALSE, &x );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		time = bli_clock();
@@ -295,7 +295,7 @@ void libblis_test_setm_check
 			{
 				chi1 = buf_x_cast + (i  )*rs_x + (j  )*cs_x;
 
-				if ( !bli_ceq( *chi1, *beta_cast ) ) { *resid = 1.0; return; }
+				if ( !bli_teqs( c,c,c, *chi1, *beta_cast ) ) { *resid = 1.0; return; }
 			}
 		}
 	}
@@ -311,7 +311,7 @@ void libblis_test_setm_check
 			{
 				chi1 = buf_x_cast + (i  )*rs_x + (j  )*cs_x;
 
-				if ( !bli_zeq( *chi1, *beta_cast ) ) { *resid = 1.0; return; }
+				if ( !bli_teqs( z,z,z, *chi1, *beta_cast ) ) { *resid = 1.0; return; }
 			}
 		}
 	}
diff --git a/testsuite/src/test_setv.c b/testsuite/src/test_setv.c
index 10f0348c7..1983093d7 100644
--- a/testsuite/src/test_setv.c
+++ b/testsuite/src/test_setv.c
@@ -179,7 +179,7 @@ void libblis_test_setv_experiment
 	// Randomize x.
 	libblis_test_vobj_randomize( params, FALSE, &x );
 
-	// Repeat the experiment n_repeats times and record results. 
+	// Repeat the experiment n_repeats times and record results.
 	for ( i = 0; i < n_repeats; ++i )
 	{
 		time = bli_clock();
@@ -255,7 +255,7 @@ void libblis_test_setv_check
 		for ( i = 0; i < m_x; ++i )
 		{
 			if ( !bli_seq( *chi1, *beta_cast ) ) { *resid = 1.0; return; }
-			
+
 			chi1 += inc_x;
 		}
 	}
@@ -267,7 +267,7 @@ void libblis_test_setv_check
 		for ( i = 0; i < m_x; ++i )
 		{
 			if ( !bli_deq( *chi1, *beta_cast ) ) { *resid = 1.0; return; }
-			
+
 			chi1 += inc_x;
 		}
 	}
@@ -278,8 +278,8 @@ void libblis_test_setv_check
 
 		for ( i = 0; i < m_x; ++i )
 		{
-			if ( !bli_ceq( *chi1, *beta_cast ) ) { *resid = 1.0; return; }
-			
+			if ( !bli_teqs( c,c,c, *chi1, *beta_cast ) ) { *resid = 1.0; return; }
+
 			chi1 += inc_x;
 		}
 	}
@@ -290,8 +290,8 @@ void libblis_test_setv_check
 
 		for ( i = 0; i < m_x; ++i )
 		{
-			if ( !bli_zeq( *chi1, *beta_cast ) ) { *resid = 1.0; return; }
-			
+			if ( !bli_teqs( z,z,z, *chi1, *beta_cast ) ) { *resid = 1.0; return; }
+
 			chi1 += inc_x;
 		}
 	}

From 97084c75acd0ed104efc5da4dac0fb38a4a044f1 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Sun, 2 Mar 2025 08:50:37 -0600
Subject: [PATCH 224/230] Fix problem in `bli_obj_imag_part`. (#861)

Details:
- When adjusting the buffer to point to the first imaginary element, the function `bli_obj_buffer_at_off` was used, which includes any currently set offsets, but then `bli_obj_set_buffer` was used, which sets the buffer address *before* offsets are applied.
- Now a matching `bli_obj_buffer` call is used to avoid any offsets.
---
 frame/include/bli_obj_macro_defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h
index 79b74a2c8..f48d17ba7 100644
--- a/frame/include/bli_obj_macro_defs.h
+++ b/frame/include/bli_obj_macro_defs.h
@@ -1281,7 +1281,7 @@ BLIS_INLINE void bli_obj_imag_part( const obj_t* c, obj_t* i )
 
 		// Update the buffer.
 		inc_t is_c = bli_obj_imag_stride( c );
-		char* p    = ( char* )bli_obj_buffer_at_off( c );
+		char* p    = ( char* )bli_obj_buffer( c );
 		bli_obj_set_buffer( p + is_c * es_c/2, i );
 	}
 }

From 37e52a613a6fec3fe1cde0ca018498a16b28a5dc Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Sun, 2 Mar 2025 08:56:54 -0600
Subject: [PATCH 225/230] Fix check for SVE instructions which caused problems
 on Windows. (#859)

* Fix check for SVE instructions which caused problems on Windows.

Details:
- The context initialization for `armsve` was using the HWCAP functionality of Linux to check if SVE instructions are actually available, since these are used to determine the register blocksizes. Naturally, this causes problems on Windows.
- Instead, use functions from `bli_cpuid.c` to check for SVE. On Windows, no check is actually done and SVE is never detected.
- In the case that the user specifically requests the `armsve` config on Windows, only enable this check for the whole `arm64` family and just assume SVE is available otherwise.

* Blacklist armsve on Windows.
---
 config/armsve/bli_cntx_init_armsve.c | 21 ++++++++++++---------
 configure                            |  2 +-
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c
index 179a886ab..b1246bcd7 100644
--- a/config/armsve/bli_cntx_init_armsve.c
+++ b/config/armsve/bli_cntx_init_armsve.c
@@ -33,22 +33,25 @@
 */
 
 #include "blis.h"
-#include <sys/auxv.h>
-
-#ifndef HWCAP_SVE
-#define HWCAP_SVE (1 << 22)
-#endif
 
 void bli_cntx_init_armsve( cntx_t* cntx )
 {
-	if (!(getauxval( AT_HWCAP ) & HWCAP_SVE))
+	// Set default kernel blocksizes and functions.
+	bli_cntx_init_armsve_ref( cntx );
+
+	// If we are autodetecting the correct aarch64 config, then we have to make sure
+	// that SVE instructions are actually available since these are used in determining
+	// the register blocksizes.
+	#ifdef BLIS_FAMILY_ARM64
+	uint32_t family, model, features = 0;
+	bli_cpuid_query( &family, &model, &features );
+
+	if ( ! bli_cpuid_has_features( features, FEATURE_SVE ) )
 		return;
+	#endif
 
 	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
 
-	// Set default kernel blocksizes and functions.
-	bli_cntx_init_armsve_ref( cntx );
-
 	// -------------------------------------------------------------------------
 
 	// Block size.
diff --git a/configure b/configure
index 4df2ed80b..c217c4da8 100755
--- a/configure
+++ b/configure
@@ -2142,7 +2142,7 @@ check_assembler()
 
 check_os()
 {
-	if [[ "$(uname -s)" == "Darwin" && "$(uname -m)" == "arm64" ]]; then
+	if [[ ( "$(uname -s)" == "Darwin" || "$is_win" == "yes" ) && "$(uname -m)" == "arm64" ]]; then
 		blacklistos_add "armsve"
 	fi
 }

From 50054a6a7c0561d22720254ab6a9be1199ac10ab Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Sun, 2 Mar 2025 09:08:35 -0600
Subject: [PATCH 226/230] Adjust CI testing (#860)

Details:
- Add tests for the `generic` config, including forcing broadcast-A,B which uses a different reference kernel. This uncovered a number of bugs, especially in `trsm`/`gemmtrsm` reference kernels, as well as diagonal packing.
- Move threaded builds into main build and run `make check` once for each enabled backend.
- Fix unused variable warnings in level-0 macros.
- Fix `bli_tbastbbs_mxn` and add `bli_tcompressbbs_mxn`. The latter was missing from the reference `gemmtrsm` microkernel and is needed since the B11 block is accumulated to but, for complex datatypes, the effective imaginary stride is non-unit if B is broadcast packed.
- Run all BLAS tests single-threaded.
---
 .circleci/config.yml                        |  24 +--
 ci/do_sde.sh                                |   5 +-
 ci/do_testsuite.sh                          |   9 ++
 frame/1m/packm/bli_packm_init.c             |   6 +-
 frame/include/level0/bli_assigns.h          |   6 +-
 frame/include/level0/bli_complex_terms.h    |  14 +-
 frame/include/level0/bli_declinits.h        |   5 +-
 frame/include/level0/bli_tsets.h            |  40 +++--
 ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c |   4 +-
 ref_kernels/1m/bli_packm_cxc_diag_ref.c     | 156 +++++++++++++-------
 ref_kernels/1m/bli_packm_cxk_ref.c          |  38 +----
 ref_kernels/3/bli_gemmtrsm_ref.c            |  15 ++
 ref_kernels/3/bli_trsm_ref.c                | 100 +++++++------
 test/level0/test_tsets.cxx                  |  81 ++++++++++
 testsuite/src/test_gemm_ukr.c               |   6 +-
 testsuite/src/test_gemmtrsm_ukr.c           |  73 +++------
 testsuite/src/test_libblis.c                |   4 +-
 testsuite/src/test_libblis.h                |   2 +-
 testsuite/src/test_trsm_ukr.c               |   2 +
 19 files changed, 372 insertions(+), 218 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 8c73a0eec..ad2bf582a 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -42,33 +42,33 @@ workflows:
       - build:
           OOT: 1
           TEST: ALL
+          THR: openmp,pthreads
           CONF: x86_64
 
       # SDE testing for x86_64
+      # Also test LEVEL0 here because g++ uses tons of memory for test_taxpbys.cxx
       - build:
           # linuxvm must be used because it provides 8G RAM and SDE fails with 4G RAM
           os: linuxvm
           SDE: 1
-          CONF: x86_64
-
-      # openmp build + LEVEL0
-      - build:
           LEVEL0: 1
-          THR: openmp
+          CONF: x86_64
 
-      # pthreads build
+      # test generic kernels
       - build:
-          THR: pthreads
+          CONF: generic_broadcast
 
       # clang build
       - build:
           CC: clang
+          THR: openmp,pthreads
           CXX: clang++
-          PACKAGES: clang
+          PACKAGES: clang libomp-dev
 
       # macOS with system compiler (clang)
       - build:
           os: macos
+          THR: pthreads
           CC: clang
           CXX: clang++
 
@@ -226,6 +226,11 @@ jobs:
               export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv64 -cpu rv64,vext_spec=v1.0,v=true,vlen=512 -B 0x100000";
             fi
 
+            if [ "$CONF" = "generic_broadcast" ]; then
+              export CONF=generic
+              export CFLAGS="-DBLIS_BBM_s=2 -DBLIS_BBM_d=2 -DBLIS_BBM_c=2 -DBLIS_BBM_z=2 -DBLIS_BBN_s=4 -DBLIS_BBN_d=4 -DBLIS_BBN_c=4 -DBLIS_BBN_z=4"
+            fi
+
             echo "Configuration:"
             echo "CC                = $CC"
             echo "CXX               = $CXX"
@@ -237,6 +242,7 @@ jobs:
             echo "SDE               = $SDE"
             echo "LEVEL0            = $LEVEL0"
             echo "DIST_PATH         = $DIST_PATH"
+            echo "CFLAGS            = $CFLAGS"
             echo "LDFLAGS           = $LDFLAGS"
             echo "TESTSUITE_WRAPPER = $TESTSUITE_WRAPPER"
 
@@ -246,7 +252,7 @@ jobs:
             $CC --version
             $CC -v
 
-            make -j2
+            make V=1 -j2
             make install
 
             if [ "$BLD" = "" ] && [ "$TESTSUITE_WRAPPER" = "" ] ; then $DIST_PATH/ci/cxx/cxx-test.sh $DIST_PATH $(ls -1 include); fi
diff --git a/ci/do_sde.sh b/ci/do_sde.sh
index 7f8a927f6..05a664b66 100755
--- a/ci/do_sde.sh
+++ b/ci/do_sde.sh
@@ -31,7 +31,7 @@ tar xvf $SDE_TARBALL
 
 make -j2 testsuite-bin blastest-bin
 
-for ARCH in penryn sandybridge haswell skx knl piledriver steamroller excavator zen; do
+for ARCH in penryn sandybridge haswell skx knl piledriver steamroller excavator zen generic; do
     export BLIS_ARCH_TYPE=-1
 
     if [ "$ARCH" = "knl" ]; then
@@ -55,6 +55,9 @@ for ARCH in penryn sandybridge haswell skx knl piledriver steamroller excavator
     elif [ "$ARCH" = "excavator" ]; then
         TESTSUITE_WRAPPER="$SDE -cpuid_in $DIST_PATH/ci/cpuid/haswell.def --"
         export BLIS_ARCH_TYPE=9
+    elif [ "$ARCH" = "generic" ]; then
+        TESTSUITE_WRAPPER="$SDE -cpuid_in $DIST_PATH/ci/cpuid/haswell.def --"
+        export BLIS_ARCH_TYPE=33
     else
         TESTSUITE_WRAPPER="$SDE -cpuid_in $DIST_PATH/ci/cpuid/$ARCH.def --"
     fi
diff --git a/ci/do_testsuite.sh b/ci/do_testsuite.sh
index aa72d8051..9ecd09236 100755
--- a/ci/do_testsuite.sh
+++ b/ci/do_testsuite.sh
@@ -7,11 +7,19 @@ export BLIS_JC_NT=1
 export BLIS_IC_NT=2
 export BLIS_JR_NT=1
 export BLIS_IR_NT=1
+export BLIS_THREAD_IMPL="single"
 
 if [ "$TEST" = "FAST" -o "$TEST" = "ALL" ]; then
 	make testblis-fast
 	cat ./output.testsuite
 	$DIST_PATH/testsuite/check-blistest.sh ./output.testsuite
+
+	for impl in $(echo $THR | sed 's/none//' | tr , ' '); do
+		export BLIS_THREAD_IMPL="$impl"
+		make testblis-fast
+		cat ./output.testsuite
+		$DIST_PATH/testsuite/check-blistest.sh ./output.testsuite
+	done
 fi
 
 if [ "$TEST" = "MD" -o "$TEST" = "ALL" ]; then
@@ -34,6 +42,7 @@ if [ "$TEST" = "1" -o "$TEST" = "ALL" ]; then
 	$DIST_PATH/testsuite/check-blistest.sh ./output.testsuite
 fi
 
+export BLIS_THREAD_IMPL="single"
 make testblas
 cat ./output.testsuite
 $DIST_PATH/blastest/check-blastest.sh
diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c
index fa19a6df5..d6b81b927 100644
--- a/frame/1m/packm/bli_packm_init.c
+++ b/frame/1m/packm/bli_packm_init.c
@@ -69,6 +69,7 @@ siz_t bli_packm_init
 	dim_t   bmult_m_def  = bli_packm_def_cntl_bmult_m_def( cntl );
 	dim_t   bmult_m_pack = bli_packm_def_cntl_bmult_m_pack( cntl );
 	dim_t   bmult_n_def  = bli_packm_def_cntl_bmult_n_def( cntl );
+	dim_t   bcast_p      = bli_packm_def_cntl_bmult_m_bcast( cntl );
 
 	// Typecast the internal scalar value to the target datatype.
 	// Note that if the typecasting is needed, this must happen BEFORE we
@@ -136,8 +137,9 @@ siz_t bli_packm_init
 	inc_t cs_p = bmult_m_pack;
 
 	// The "row stride" of a row-micropanel packed object is interpreted
-	// as the row stride WITHIN a micropanel. Thus, it is unit.
-	inc_t rs_p = 1;
+	// as the row stride WITHIN a micropanel. Thus, it is unit unless elements
+	// are duplicated (broadcast).
+	inc_t rs_p = bcast_p;
 
 	// The "panel stride" of a micropanel packed object is interpreted as
 	// the distance between the (0,0) element of panel k and the (0,0)
diff --git a/frame/include/level0/bli_assigns.h b/frame/include/level0/bli_assigns.h
index 97a79b299..f4247746f 100644
--- a/frame/include/level0/bli_assigns.h
+++ b/frame/include/level0/bli_assigns.h
@@ -43,9 +43,9 @@
 //   when that output exists only in the real domain (i.e. has no imaginary
 //   part to begin with).
 
-#define bli_rassigns( xr, xi, yr, yi ) { yr = xr;          }
-#define bli_cassigns( xr, xi, yr, yi ) { yr = xr; yi = xi; }
-#define bli_jassigns( xr, xi, yr, yi ) {          yi = xi; }
+#define bli_rassigns( xr, xi, yr, yi ) { yr = xr;          (void)(xi); (void)(yi); }
+#define bli_cassigns( xr, xi, yr, yi ) { yr = xr; yi = xi;                         }
+#define bli_jassigns( xr, xi, yr, yi ) {          yi = xi; (void)(xr); (void)(yr); }
 
 
 #endif
diff --git a/frame/include/level0/bli_complex_terms.h b/frame/include/level0/bli_complex_terms.h
index 0cf05c30c..281bff696 100644
--- a/frame/include/level0/bli_complex_terms.h
+++ b/frame/include/level0/bli_complex_terms.h
@@ -50,21 +50,21 @@
 #define bli_cctermrr( pab, ab )  ab
 
 // ai * bi term
-#define bli_rrtermii( pab, ab )  PASTEMAC(pab,zero)
-#define bli_rctermii( pab, ab )  PASTEMAC(pab,zero)
-#define bli_crtermii( pab, ab )  PASTEMAC(pab,zero)
+#define bli_rrtermii( pab, ab )  ( (void)(ab), PASTEMAC(pab,zero) )
+#define bli_rctermii( pab, ab )  ( (void)(ab), PASTEMAC(pab,zero) )
+#define bli_crtermii( pab, ab )  ( (void)(ab), PASTEMAC(pab,zero) )
 #define bli_cctermii( pab, ab )  ab
 
 // ai * br term
-#define bli_rrtermir( pab, ab )  PASTEMAC(pab,zero)
-#define bli_rctermir( pab, ab )  PASTEMAC(pab,zero)
+#define bli_rrtermir( pab, ab )  ( (void)(ab), PASTEMAC(pab,zero) )
+#define bli_rctermir( pab, ab )  ( (void)(ab), PASTEMAC(pab,zero) )
 #define bli_crtermir( pab, ab )  ab
 #define bli_cctermir( pab, ab )  ab
 
 // ar * bi term
-#define bli_rrtermri( pab, ab )  PASTEMAC(pab,zero)
+#define bli_rrtermri( pab, ab )  ( (void)(ab), PASTEMAC(pab,zero) )
 #define bli_rctermri( pab, ab )  ab
-#define bli_crtermri( pab, ab )  PASTEMAC(pab,zero)
+#define bli_crtermri( pab, ab )  ( (void)(ab), PASTEMAC(pab,zero) )
 #define bli_cctermri( pab, ab )  ab
 
 
diff --git a/frame/include/level0/bli_declinits.h b/frame/include/level0/bli_declinits.h
index 0461cd1cd..d95f9c7ae 100644
--- a/frame/include/level0/bli_declinits.h
+++ b/frame/include/level0/bli_declinits.h
@@ -45,13 +45,14 @@
 //   to the real and imaginary parts of (presumably) temporary variables.
 //   If the domain is real, only the real part is declared and initialized.
 
-#define bli_rdeclinits( pxy, xr, xi, yr, yi ) PASTEMAC(pxy,ctype) yr = xr; (void)yr;
+#define bli_rdeclinits( pxy, xr, xi, yr, yi ) PASTEMAC(pxy,ctype) yr = xr; (void)yr; \
+                                              PASTEMAC(pxy,ctype) yi;      (void)yi;
 #define bli_cdeclinits( pxy, xr, xi, yr, yi ) PASTEMAC(pxy,ctype) yr = xr; (void)yr; \
                                               PASTEMAC(pxy,ctype) yi = xi; (void)yi;
 
 // An extra definition for situations where we only need a real value declared
 // and initialized (e.g. when explicitly implementing in the complex domain).
-#define bli_rodeclinits( pxy, xr, yr ) bli_rdeclinits( pxy, xr, /*xi*/, yr, /*yi*/ )
+#define bli_rodeclinits( pxy, xr, yr ) PASTEMAC(pxy,ctype) yr = xr; (void)yr;
 
 
 #endif
diff --git a/frame/include/level0/bli_tsets.h b/frame/include/level0/bli_tsets.h
index 05a86b3f7..3e3323332 100644
--- a/frame/include/level0/bli_tsets.h
+++ b/frame/include/level0/bli_tsets.h
@@ -241,29 +241,51 @@
 #define bli_tbcastbbs_mxn( chy, m, n, y, incy, ldy ) \
 { \
 	/* Assume that the duplication factor is the row stride of y. */ \
-	const dim_t _d    = incy; \
-	const dim_t _ds_y = 1; \
+	const dim_t _d = incy; \
 \
 	for ( dim_t _j = 0; _j < (n); ++_j ) \
 	{ \
-		PASTEMAC(chy,ctype)* restrict _yj = (PASTEMAC(chy,ctype)*)(y) + _j*(ldy); \
+		PASTEMAC(chy,ctype)* _yj = (PASTEMAC(chy,ctype)*)(y) + _j*(ldy); \
 \
 		for ( dim_t _i = 0; _i < (m); ++_i ) \
 		{ \
-			PASTEMAC(chy,ctyper)* restrict _yij_r = (PASTEMAC(chy,ctyper)*)( _yj + _i*(incy) ); \
-			PASTEMAC(chy,ctyper)* restrict _yij_i = _yij_r + (incy); \
+			PASTEMAC(chy,ctype)* _yij = _yj + _i*(incy); \
+			PASTEMAC(chy,ctyper) _yij_r, _yij_i; \
 \
-			for ( dim_t _p = 1; _p < _d; ++_p ) \
+			bli_tgets( chy,chy, *_yij, _yij_r, _yij_i ); \
+\
+			for ( dim_t _p = 0; _p < _d; ++_p ) \
 			{ \
-				PASTEMAC(chy,ctyper)* restrict _yijd_r = _yij_r + _p*_ds_y; \
-				PASTEMAC(chy,ctyper)* restrict _yijd_i = _yij_i + _p*_ds_y; (void)_yijd_i; \
+				PASTEMAC(chy,ctyper)* _yijd_r = (PASTEMAC(chy,ctyper)*)_yij      + _p; \
+				PASTEMAC(chy,ctyper)* _yijd_i = (PASTEMAC(chy,ctyper)*)_yij + _d + _p; \
 \
-				bli_tcopyris( chy,chy, *_yij_r, *_yij_i, *_yijd_r, *_yijd_i ); \
+				bli_tcopyris( chy,chy, _yij_r, _yij_i, *_yijd_r, *_yijd_i ); \
 			} \
 		} \
 	} \
 }
 
+// compressbbs_mxn
+#define bli_tcompressbbs_mxn( chy, m, n, y, incy, ldy ) \
+{ \
+	/* Assume that the duplication factor is the row stride of y. */ \
+	const dim_t _d = incy; \
+\
+	for ( dim_t _j = 0; _j < (n); ++_j ) \
+	{ \
+		PASTEMAC(chy,ctype)* _yj = (PASTEMAC(chy,ctype)*)(y) + _j*(ldy); \
+\
+		for ( dim_t _i = 0; _i < (m); ++_i ) \
+		{ \
+			PASTEMAC(chy,ctype)* _yij = _yj + _i*(incy); \
+			PASTEMAC(chy,ctyper)* _yij_r = (PASTEMAC(chy,ctyper)*)_yij; \
+			PASTEMAC(chy,ctyper)* _yij_i = (PASTEMAC(chy,ctyper)*)_yij + _d; \
+\
+			bli_tsets( chy,chy, *_yij_r, *_yij_i, *_yij ); \
+		} \
+	} \
+}
+
 #define bli_tset0s_edge( chp, i, m, j, n, p, ldp ) \
 { \
 	if ( (i) < (m) ) \
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
index ec52e5271..30179b2e6 100644
--- a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
+++ b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c
@@ -159,7 +159,7 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 		bli_tset0s_mxn \
 		( \
 		  chp_r, \
-		  2*cdim_max, \
+		  2*cdim_max*cdim_bcast, \
 		  2*n_max, \
 		  ( ctypep_r* )p, 1, ldp  \
 		); \
@@ -263,7 +263,7 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 		bli_tset0s_mxn \
 		( \
 		  chp_r, \
-		  cdim_max, \
+		  cdim_max*cdim_bcast, \
 		  2*n_max, \
 		  ( ctypep_r* )p, 1, ldp  \
 		); \
diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ref.c
index 82976ebde..ea384fe16 100644
--- a/ref_kernels/1m/bli_packm_cxc_diag_ref.c
+++ b/ref_kernels/1m/bli_packm_cxc_diag_ref.c
@@ -35,7 +35,98 @@
 #include "blis.h"
 
 
-#define PACKM_DIAG_BODY( ctypea, ctypep, cha, chp, mn_min, mn_max, dfac, inca, lda, op ) \
+#define PACKM_DIAG_DIAG_( ctypea, ctypep, ctypep_r, cha, chp, chp_r, dfac, inca, lda ) \
+\
+do \
+{ \
+	if ( bli_is_unit_diag( diaga ) ) \
+	{ \
+		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+		{ \
+			ctypep_r kar, kai; \
+			bli_tgets( chp,chp, kappa_cast, kar, kai ); \
+			ctypep_r* pi1r = (ctypep_r*)( pi1 + mnk*(dfac + ldp) ); \
+			ctypep_r* pi1i = (ctypep_r*)( pi1 + mnk*(dfac + ldp) ) + dfac; \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+				bli_tcopyris( chp,chp, kar, kai, *(pi1r + d), *(pi1i + d) ); \
+		} \
+	} \
+	else if ( bli_is_hermitian( struca ) ) \
+	{ \
+		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+		{ \
+			ctypep alpha_cast, kappa_alpha; \
+			bli_tcopys( cha,chp, *(alpha1 + mnk*(inca + lda)), alpha_cast ); \
+			bli_tseti0s( chp, alpha_cast ); \
+			bli_tscal2s( chp,chp,chp,chp, kappa_cast, alpha_cast, kappa_alpha ); \
+			ctypep_r kar, kai; \
+			bli_tgets( chp,chp, kappa_alpha, kar, kai ); \
+			ctypep_r* pi1r = (ctypep_r*)( pi1 + mnk*(dfac + ldp) ); \
+			ctypep_r* pi1i = (ctypep_r*)( pi1 + mnk*(dfac + ldp) ) + dfac; \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+				bli_tcopyris( chp,chp, kar, kai, *(pi1r + d), *(pi1i + d) ); \
+		} \
+	} \
+	else if ( bli_is_conj( conja )) \
+	{ \
+		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+		{ \
+			ctypep kappa_alpha; \
+			bli_tscal2js( chp,cha,chp,chp, kappa_cast, *(alpha1 + mnk*(inca + lda)), kappa_alpha ); \
+			ctypep_r kar, kai; \
+			bli_tgets( chp,chp, kappa_alpha, kar, kai ); \
+			ctypep_r* pi1r = (ctypep_r*)( pi1 + mnk*(dfac + ldp) ); \
+			ctypep_r* pi1i = (ctypep_r*)( pi1 + mnk*(dfac + ldp) ) + dfac; \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+				bli_tcopyris( chp,chp, kar, kai, *(pi1r + d), *(pi1i + d) ); \
+		} \
+	} \
+	else \
+	{ \
+		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+		{ \
+			ctypep kappa_alpha; \
+			bli_tscal2s( chp,cha,chp,chp, kappa_cast, *(alpha1 + mnk*(inca + lda)), kappa_alpha ); \
+			ctypep_r kar, kai; \
+			bli_tgets( chp,chp, kappa_alpha, kar, kai ); \
+			ctypep_r* pi1r = (ctypep_r*)( pi1 + mnk*(dfac + ldp) ); \
+			ctypep_r* pi1i = (ctypep_r*)( pi1 + mnk*(dfac + ldp) ) + dfac; \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+				bli_tcopyris( chp,chp, kar, kai, *(pi1r + d), *(pi1i + d) ); \
+		} \
+	} \
+	\
+	/* invert the diagonal if requested */ \
+	if ( invdiag ) \
+	{ \
+		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
+		{ \
+			ctypep_r* pi1r = (ctypep_r*)( pi1 + mnk*(dfac + ldp) ); \
+			ctypep_r* pi1i = (ctypep_r*)( pi1 + mnk*(dfac + ldp) ) + dfac; \
+			for ( dim_t d = 0; d < dfac; ++d ) \
+				bli_tinvertris( chp,chp, *(pi1r + d), *(pi1i + d) ); \
+		} \
+	} \
+	\
+	/* if this is an edge case in both directions, extend the diagonal with ones */ \
+	for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \
+	{ \
+		ctypep_r* pi1r = (ctypep_r*)( pi1 + mnk*(dfac + ldp) ); \
+		ctypep_r* pi1i = (ctypep_r*)( pi1 + mnk*(dfac + ldp) ) + dfac; \
+		ctypep_r oner, onei; \
+		bli_tset1s( chp_r, oner ); \
+		bli_tset0s( chp_r, onei ); \
+		for ( dim_t d = 0; d < dfac; ++d ) \
+			bli_tcopyris( chp,chp, oner, onei, *(pi1r + d), *(pi1i + d) ); \
+	} \
+} while (0)
+
+
+#define PACKM_DIAG_DIAG( ctypea, ctypep, cha, chp, dfac, inca, lda ) \
+PACKM_DIAG_DIAG_( ctypea, ctypep, PASTEMAC(chp,ctyper), cha, chp, PASTEMAC(chp,prec), dfac, inca, lda )
+
+
+#define PACKM_DIAG_BODY_( ctypea, ctypep, ctypep_r, cha, chp, chp_r, mn_min, mn_max, dfac, inca, lda, op ) \
 \
 do \
 { \
@@ -44,12 +135,19 @@ do \
 	{ \
 		ctypep kappa_alpha; \
 		PASTEMAC(t,op)( chp,cha,chp,chp, kappa_cast, *(alpha1 + mn*inca + k*lda), kappa_alpha ); \
+		ctypep_r kar, kai; \
+		bli_tgets( chp,chp, kappa_alpha, kar, kai ); \
+		ctypep_r* pi1r = (ctypep_r*)( pi1 + mn*dfac + k*ldp ); \
+		ctypep_r* pi1i = (ctypep_r*)( pi1 + mn*dfac + k*ldp ) + dfac; \
 		for ( dim_t d = 0; d < dfac; d++ ) \
-			bli_tcopys( chp,chp, kappa_alpha, *(pi1 + mn*dfac + d + k*ldp) ); \
+			bli_tcopyris( chp,chp, kar, kai, *(pi1r + d), *(pi1i + d) ); \
 	} \
 } while(0)
 
 
+#define PACKM_DIAG_BODY( ctypea, ctypep, cha, chp, mn_min, mn_max, dfac, inca, lda, op ) \
+PACKM_DIAG_BODY_( ctypea, ctypep, PASTEMAC(chp,ctyper), cha, chp, PASTEMAC(chp,prec), mn_min, mn_max, dfac, inca, lda, op )
+
 #define PACKM_DIAG_BODY_L( ctypea, ctypep, cha, chp, op ) \
 	PACKM_DIAG_BODY( ctypea, ctypep, cha, chp, k+1, cdim, cdim_bcast, inca_l, lda_l, op )
 
@@ -83,7 +181,7 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 	bli_tset0s_mxn \
 	( \
 	  chp, \
-	  cdim_max, \
+	  cdim_max*cdim_bcast, \
 	  n_max, \
 	  ( ctypep* )p, 1, ldp  \
 	); \
@@ -130,57 +228,7 @@ void PASTEMAC(cha,chp,opname,arch,suf) \
 	} \
 \
 	/* write the diagonal */ \
-	if ( bli_is_unit_diag( diaga ) ) \
-	{ \
-		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-		for ( dim_t d = 0; d < cdim_bcast; ++d ) \
-			bli_tcopys( chp,chp, kappa_cast, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
-	} \
-	else if ( bli_is_hermitian( struca ) ) \
-	{ \
-		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-		{ \
-			ctypep alpha_cast, kappa_alpha; \
-			bli_tcopys( cha,chp, *(alpha1 + mnk*(inca + lda)), alpha_cast ); \
-			bli_tseti0s( chp, alpha_cast ); \
-			bli_tscal2s( chp,chp,chp,chp, kappa_cast, alpha_cast, kappa_alpha ); \
-			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
-				bli_tcopys( chp,chp, kappa_alpha, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
-		} \
-	} \
-	else if ( bli_is_conj( conja )) \
-	{ \
-		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-		{ \
-			ctypep kappa_alpha; \
-			bli_tscal2js( chp,cha,chp,chp, kappa_cast, *(alpha1 + mnk*(inca + lda)), kappa_alpha ); \
-			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
-				bli_tcopys( chp,chp, kappa_alpha, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
-		} \
-	} \
-	else \
-	{ \
-		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-		{ \
-			ctypep kappa_alpha; \
-			bli_tscal2s( chp,cha,chp,chp, kappa_cast, *(alpha1 + mnk*(inca + lda)), kappa_alpha ); \
-			for ( dim_t d = 0; d < cdim_bcast; ++d ) \
-				bli_tcopys( chp,chp, kappa_alpha, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
-		} \
-	} \
-\
-	/* invert the diagonal if requested */ \
-	if ( invdiag ) \
-	{ \
-		for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \
-		for ( dim_t d = 0; d < cdim_bcast; ++d ) \
-			bli_tinverts( chp,chp, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
-	} \
-\
-	/* if this an edge case in both directions, extend the diagonal with ones */ \
-	for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \
-	for ( dim_t d = 0; d < cdim_bcast; ++d ) \
-		bli_tset1s( chp, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \
+	PACKM_DIAG_DIAG( ctypea, ctypep, cha, chp, cdim_bcast, inca, lda ); \
 }
 
 INSERT_GENTFUNC2_BASIC( packm_diag, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c
index 6b70ddda7..bd41a44d6 100644
--- a/ref_kernels/1m/bli_packm_cxk_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_ref.c
@@ -44,28 +44,7 @@
 #endif
 #endif
 
-#define PACKM_BODY_r( ctypea, ctypep, cha, chp, pragma, cdim, dfac, inca, op ) \
-\
-do \
-{ \
-	for ( dim_t k = n; k != 0; --k ) \
-	{ \
-		pragma \
-		for ( dim_t mn = 0; mn < cdim; mn++ ) \
-		{ \
-			ctypep kappa_alpha; \
-			PASTEMAC(t,op)( chp,cha,chp,chp, kappa_cast, *(alpha1 + mn*inca), kappa_alpha ); \
-			for ( dim_t d = 0; d < dfac; d++ ) \
-				bli_tcopys( chp,chp, kappa_alpha, *(pi1 + mn*dfac + d) ); \
-		} \
-\
-		alpha1 += lda; \
-		pi1    += ldp; \
-	} \
-} while(0)
-
-
-#define PACKM_BODY_c_( ctypea, ctypep, ctypep_r, cha, chp, chp_r, pragma, cdim, dfac, inca, op ) \
+#define PACKM_BODY_( ctypea, ctypep, ctypep_r, cha, chp, chp_r, pragma, cdim, dfac, inca, op ) \
 \
 do \
 { \
@@ -78,13 +57,10 @@ do \
 			PASTEMAC(t,op)( chp,cha,chp,chp, kappa_cast, *(alpha1 + mn*inca), kappa_alpha ); \
 			ctypep_r kar, kai; \
 			bli_tgets( chp,chp, kappa_alpha, kar, kai ); \
-			ctypep_r* pi1r = (ctypep_r*)pi1; \
-			ctypep_r* pi1i = (ctypep_r*)pi1 + dfac; \
+			ctypep_r* pi1r = (ctypep_r*)( pi1 + mn*dfac ); \
+			ctypep_r* pi1i = pi1r + dfac; \
 			for ( dim_t d = 0; d < dfac; d++ ) \
-			{ \
-				bli_tcopys( chp_r,chp_r, kar, *(pi1r + mn*dfac*2 + d) ); \
-				bli_tcopys( chp_r,chp_r, kai, *(pi1i + mn*dfac*2 + d) ); \
-			} \
+				bli_tcopyris( chp,chp, kar, kai, *(pi1r + d), *(pi1i + d) ); \
 		} \
 \
 		alpha1 += lda; \
@@ -93,12 +69,8 @@ do \
 } while(0)
 
 
-#define PACKM_BODY_c( ctypea, ctypep, cha, chp, pragma, cdim, dfac, inca, op ) \
-PACKM_BODY_c_( ctypea, ctypep, PASTEMAC(chp,ctyper), cha, chp, PASTEMAC(chp,prec), pragma, cdim, dfac, inca, op )
-
-
 #define PACKM_BODY( ctypea, ctypep, cha, chp, pragma, cdim, dfac, inca, op ) \
-PASTECH(PACKM_BODY_,PASTEMAC(chp,dom))( ctypea, ctypep, cha, chp, pragma, cdim, dfac, inca, op )
+PACKM_BODY_( ctypea, ctypep, PASTEMAC(chp,ctyper), cha, chp, PASTEMAC(chp,prec), pragma, cdim, dfac, inca, op )
 
 
 #undef  GENTFUNC2
diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c
index 0d496feb7..5073f4470 100644
--- a/ref_kernels/3/bli_gemmtrsm_ref.c
+++ b/ref_kernels/3/bli_gemmtrsm_ref.c
@@ -115,6 +115,19 @@ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \
 		rs_c_use = rs_ct; \
 		cs_c_use = cs_ct; \
 	} \
+\
+	/* If b11 is stored in broadcasted format, the real and imaginary components
+	   will be too widely separated (an imaginary stride > 1). Recompress b11 so
+	   that the imaginary stride is 1 as expected (the duplicated elements aren't needed
+	   here so they are left untouched). */ \
+	if ( cs_b > 1 ) \
+	bli_tcompressbbs_mxn \
+	( \
+	  ch, \
+	  n, \
+	  m, \
+	  b11, cs_b, rs_b  \
+	); \
 \
 	/* lower: b11 = alpha * b11 - a10 * b01; */ \
 	/* upper: b11 = alpha * b11 - a12 * b21; */ \
@@ -138,6 +151,7 @@ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after gemm", mr, 2*nr, \
 \
 	/* Broadcast the elements of the updated b11 submatrix to their
 	   duplicated neighbors. */ \
+	if ( cs_b > 1 ) \
 	bli_tbcastbbs_mxn \
 	( \
 	  ch, \
@@ -156,6 +170,7 @@ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after gemm", mr, 2*nr, \
 	  data, \
 	  cntx  \
 	); \
+	\
 /*
 PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after trsm", mr, 2*nr, \
                      (double*)b11, rs_b, 1, "%5.2f", "" ); \
diff --git a/ref_kernels/3/bli_trsm_ref.c b/ref_kernels/3/bli_trsm_ref.c
index cbd64899d..49573383f 100644
--- a/ref_kernels/3/bli_trsm_ref.c
+++ b/ref_kernels/3/bli_trsm_ref.c
@@ -37,8 +37,8 @@
 // An implementation that indexes through B with the assumption that all
 // elements were broadcast (duplicated) by a factor of NP/NR.
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \
+#undef  GENTFUNCR
+#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, arch, suf, diagop ) \
 \
 void PASTEMAC(ch,opname,arch,suf) \
      ( \
@@ -75,58 +75,66 @@ void PASTEMAC(ch,opname,arch,suf) \
 		dim_t i        = iter; \
 		dim_t n_behind = i; \
 \
-		const ctype* restrict alpha11 = a + (i  )*rs_a + (i  )*cs_a; \
-		const ctype* restrict a10t    = a + (i  )*rs_a + (0  )*cs_a; \
-		      ctype* restrict B0      = b + (0  )*rs_b + (0  )*cs_b; \
-		      ctype* restrict b1      = b + (i  )*rs_b + (0  )*cs_b; \
+		const ctype_r* restrict alpha11r = ( ctype_r* )( a + (i  )*rs_a + (i  )*cs_a ); \
+		const ctype_r* restrict alpha11i = alpha11r + rs_a; \
+		const ctype*   restrict a10t     = a + (i  )*rs_a + (0  )*cs_a; \
+		      ctype*   restrict B0       = b + (0  )*rs_b + (0  )*cs_b; \
+		      ctype*   restrict b1       = b + (i  )*rs_b + (0  )*cs_b; \
 \
 		/* b1 = b1 - a10t * B0; */ \
 		/* b1 = b1 / alpha11; */ \
 		for ( dim_t j = 0; j < n; ++j ) \
 		{ \
-			ctype* restrict b01     = B0 + (0  )*rs_b + (j  )*cs_b; \
-			ctype* restrict beta11  = b1 + (0  )*rs_b + (j  )*cs_b; \
-			ctype* restrict gamma11 = c  + (i  )*rs_c + (j  )*cs_c; \
-			ctype           beta11c = *beta11; \
-			ctype           rho11; \
+			ctype*   restrict b01      = B0 + (0  )*rs_b + (j  )*cs_b; \
+			ctype_r* restrict beta11r  = ( ctype_r* )( b1 + (0  )*rs_b + (j  )*cs_b ); \
+			ctype_r* restrict beta11i  = beta11r + cs_b; \
+			ctype_r* restrict gamma11r = ( ctype_r* )( c  + (i  )*rs_c + (j  )*cs_c ); \
+			ctype_r* restrict gamma11i = gamma11r + 1; \
+			ctype_r           beta11cr = *beta11r; \
+			ctype_r           beta11ci = *beta11i; \
+			ctype_r           rho11r; \
+			ctype_r           rho11i; \
 \
 			/* beta11 = beta11 - a10t * b01; */ \
-			bli_tset0s( ch, rho11 ); \
+			bli_tset0s( chr, rho11r ); \
+			bli_tset0s( chr, rho11i ); \
 			for ( dim_t l = 0; l < n_behind; ++l ) \
 			{ \
-				const ctype* restrict alpha10 = a10t + (l  )*cs_a; \
-				      ctype* restrict beta01  = b01  + (l  )*rs_b; \
+				const ctype_r* restrict alpha10r = ( ctype_r* )( a10t + (l  )*cs_a ); \
+				const ctype_r* restrict alpha10i = alpha10r + rs_a; \
+				      ctype_r* restrict beta01r  = ( ctype_r* )( b01  + (l  )*rs_b ); \
+				      ctype_r* restrict beta01i  = beta01r + cs_b; \
 \
-				bli_taxpys( ch,ch,ch,ch, *alpha10, *beta01, rho11 ); \
+				bli_taxpyris( ch,ch,ch,ch, *alpha10r, *alpha10i, *beta01r, *beta01i, rho11r, rho11i ); \
 			} \
-			bli_tsubs( ch,ch,ch, rho11, beta11c ); \
+			bli_tsubris( ch,ch,ch, rho11r, rho11i, beta11cr, beta11ci ); \
 \
 			/* beta11 = beta11 / alpha11; */ \
 			/* NOTE: When preinversion is enabled, the INVERSE of alpha11
 			   (1.0/alpha11) is stored during packing instead alpha11 so we
 			   can multiply rather than divide. When preinversion is disabled,
 			   alpha11 is stored and division happens below explicitly. */ \
-			PASTEMAC(t,diagop)( ch,ch,ch, *alpha11, beta11c ); \
+			PASTEMAC(t,diagop)( ch,ch,ch, *alpha11r, *alpha11i, beta11cr, beta11ci ); \
 \
 			/* Output final result to matrix c. */ \
-			bli_tcopys( ch,ch, beta11c, *gamma11 ); \
+			bli_tcopyris( ch,ch, beta11cr, beta11ci, *gamma11r, *gamma11i ); \
 \
 			/* Store the local value back to b11. */ \
 			for ( dim_t d = 0; d < cs_b; ++d ) \
-				bli_tcopys( ch,ch, beta11c, *(beta11 + d) ); \
+				bli_tcopyris( ch,ch, beta11cr, beta11ci, *(beta11r + d), *(beta11i + d) ); \
 		} \
 	} \
 }
 
 #ifdef BLIS_ENABLE_TRSM_PREINVERSION
-INSERT_GENTFUNC_BASIC( trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals )
+INSERT_GENTFUNCR_BASIC( trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scalris )
 #else
-INSERT_GENTFUNC_BASIC( trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals )
+INSERT_GENTFUNCR_BASIC( trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalris )
 #endif
 
 
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \
+#undef  GENTFUNCR
+#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, arch, suf, diagop ) \
 \
 void PASTEMAC(ch,opname,arch,suf) \
      ( \
@@ -163,52 +171,60 @@ void PASTEMAC(ch,opname,arch,suf) \
 		dim_t i        = m - iter - 1; \
 		dim_t n_behind = iter; \
 \
-		const ctype* restrict alpha11 = a + (i  )*rs_a + (i  )*cs_a; \
-		const ctype* restrict a12t    = a + (i  )*rs_a + (i+1)*cs_a; \
-		      ctype* restrict b1      = b + (i  )*rs_b + (0  )*cs_b; \
-		      ctype* restrict B2      = b + (i+1)*rs_b + (0  )*cs_b; \
+		const ctype_r* restrict alpha11r = ( ctype_r* )( a + (i  )*rs_a + (i  )*cs_a ); \
+		const ctype_r* restrict alpha11i = alpha11r + rs_a; \
+		const ctype*   restrict a12t     = a + (i  )*rs_a + (i+1)*cs_a; \
+		      ctype*   restrict b1       = b + (i  )*rs_b + (0  )*cs_b; \
+		      ctype*   restrict B2       = b + (i+1)*rs_b + (0  )*cs_b; \
 \
 		/* b1 = b1 - a12t * B2; */ \
 		/* b1 = b1 / alpha11; */ \
 		for ( dim_t j = 0; j < n; ++j ) \
 		{ \
-			ctype* restrict beta11  = b1 + (0  )*rs_b + (j  )*cs_b; \
-			ctype* restrict b21     = B2 + (0  )*rs_b + (j  )*cs_b; \
-			ctype* restrict gamma11 = c  + (i  )*rs_c + (j  )*cs_c; \
-			ctype           beta11c = *beta11; \
-			ctype           rho11; \
+			ctype_r* restrict beta11r  = ( ctype_r* )( b1 + (0  )*rs_b + (j  )*cs_b ); \
+			ctype_r* restrict beta11i  = beta11r + cs_b; \
+			ctype*   restrict b21      = B2 + (0  )*rs_b + (j  )*cs_b; \
+			ctype_r* restrict gamma11r = ( ctype_r* )( c  + (i  )*rs_c + (j  )*cs_c ); \
+			ctype_r* restrict gamma11i = gamma11r + 1; \
+			ctype_r           beta11cr = *beta11r; \
+			ctype_r           beta11ci = *beta11i; \
+			ctype_r           rho11r; \
+			ctype_r           rho11i; \
 \
 			/* beta11 = beta11 - a12t * b21; */ \
-			bli_tset0s( ch, rho11 ); \
+			bli_tset0s( chr, rho11r ); \
+			bli_tset0s( chr, rho11i ); \
 			for ( dim_t l = 0; l < n_behind; ++l ) \
 			{ \
-				const ctype* restrict alpha12 = a12t + (l  )*cs_a; \
-				      ctype* restrict beta21  = b21  + (l  )*rs_b; \
+				const ctype_r* restrict alpha12r = ( ctype_r* )( a12t + (l  )*cs_a ); \
+				const ctype_r* restrict alpha12i = alpha12r + rs_a; \
+				      ctype_r* restrict beta21r  = ( ctype_r* )( b21  + (l  )*rs_b ); \
+				      ctype_r* restrict beta21i  = beta21r + cs_b; \
 \
-				bli_taxpys( ch,ch,ch,ch, *alpha12, *beta21, rho11 ); \
+				bli_taxpyris( ch,ch,ch,ch, *alpha12r, *alpha12i, *beta21r, *beta21i, rho11r, rho11i ); \
 			} \
-			bli_tsubs( ch,ch,ch, rho11, beta11c ); \
+			bli_tsubris( ch,ch,ch, rho11r, rho11i, beta11cr, beta11ci ); \
 \
 			/* beta11 = beta11 / alpha11; */ \
 			/* NOTE: When preinversion is enabled, the INVERSE of alpha11
 			   (1.0/alpha11) is stored during packing instead alpha11 so we
 			   can multiply rather than divide. When preinversion is disabled,
 			   alpha11 is stored and division happens below explicitly. */ \
-			PASTEMAC(t,diagop)( ch,ch,ch, *alpha11, beta11c ); \
+			PASTEMAC(t,diagop)( ch,ch,ch, *alpha11r, *alpha11i, beta11cr, beta11ci ); \
 \
 			/* Output final result to matrix c. */ \
-			bli_tcopys( ch,ch, beta11c, *gamma11 ); \
+			bli_tcopyris( ch,ch, beta11cr, beta11ci, *gamma11r, *gamma11i ); \
 \
 			/* Store the local value back to b11. */ \
 			for ( dim_t d = 0; d < cs_b; ++d ) \
-				bli_tcopys( ch,ch, beta11c, *(beta11 + d) ); \
+				bli_tcopyris( ch,ch, beta11cr, beta11ci, *(beta11r + d), *(beta11i + d) ); \
 		} \
 	} \
 }
 
 #ifdef BLIS_ENABLE_TRSM_PREINVERSION
-INSERT_GENTFUNC_BASIC( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals )
+INSERT_GENTFUNCR_BASIC( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scalris )
 #else
-INSERT_GENTFUNC_BASIC( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals )
+INSERT_GENTFUNCR_BASIC( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalris )
 #endif
 
diff --git a/test/level0/test_tsets.cxx b/test/level0/test_tsets.cxx
index 445e648e3..51fddd9b4 100644
--- a/test/level0/test_tsets.cxx
+++ b/test/level0/test_tsets.cxx
@@ -318,6 +318,87 @@ UNIT_TEST(chy,PASTECH(opname,_,D)) \
 
 INSERT_GENTFUNC_MIX1( RC, set0bbs_mxn )
 
+#undef GENTFUNC0
+#define GENTFUNC0( opname, D, ctypey, chy ) \
+UNIT_TEST(chy,PASTECH(opname,_,D)) \
+( \
+	constexpr auto M = 4; \
+	constexpr auto N = 4; \
+  \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		auto ymn = tile<M,D*N>( y ); \
+\
+		INFO( "column-major" ); \
+\
+		auto ymn0 = tile<M,D*N>( y ); \
+\
+		if constexpr ( is_complex<ctypey>::value ) \
+		{ \
+			for ( auto i = 0;i < M;i++ ) \
+			for ( auto j = 0;j < N;j++ ) \
+			{ \
+				auto ymnij = &real( ymn0[i][j*D] ); \
+				for ( auto d = 0;d < D;d++ ) \
+				{ \
+					ymnij[  d] = real( y ); \
+					ymnij[D+d] = imag( y ); \
+				} \
+			} \
+		} \
+\
+		for ( auto i = 0;i < M;i++ ) \
+		for ( auto j = 0;j < N;j++ ) \
+		for ( auto d = 1;d < D;d++ ) \
+			ymn[i][j*D + d] = ctypey{}; \
+\
+		INFO( "y (init):\n" << ymn ); \
+\
+		bli_tbcastbbs_mxn( chy, N, M, &ymn[0][0], D, D*N ); \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypey>( ymn, ymn0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, bcastbbs_mxn )
+
+#undef GENTFUNC0
+#define GENTFUNC0( opname, D, ctypey, chy ) \
+UNIT_TEST(chy,PASTECH(opname,_,D)) \
+( \
+	constexpr auto M = 4; \
+	constexpr auto N = 4; \
+  \
+	for ( const auto y : test_values<ctypey>() ) \
+	{ \
+		auto ymn = tile<M,D*N>( y ); \
+\
+		INFO( "column-major" ); \
+\
+		auto ymn0 = tile<M,D*N>( y ); \
+\
+		INFO( "y (init):\n" << ymn ); \
+\
+		bli_tbcastbbs_mxn( chy, N, M, &ymn[0][0], D, D*N ); \
+		bli_tcompressbbs_mxn( chy, N, M, &ymn[0][0], D, D*N ); \
+\
+		for ( auto i = 0;i < M;i++ ) \
+		for ( auto j = 0;j < N;j++ ) \
+		for ( auto d = 1;d < D;d++ ) \
+			ymn[i][j*D+d] = ymn0[i][j*D+d]; \
+\
+		INFO( "y (C++):\n" << ymn0 ); \
+		INFO( "y (BLIS):\n" << ymn ); \
+\
+		check<ctypey>( ymn, ymn0 ); \
+	} \
+)
+
+INSERT_GENTFUNC_MIX1( RC, compressbbs_mxn )
+
 #undef GENTFUNC
 #define GENTFUNC( opname, ctypey, chy ) \
 GENTFUNC0( opname, 10, 10, ctypey, chy ) \
diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c
index 288249b59..8b7989921 100644
--- a/testsuite/src/test_gemm_ukr.c
+++ b/testsuite/src/test_gemm_ukr.c
@@ -190,8 +190,8 @@ void libblis_test_gemm_ukr_experiment
 	k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
 	// Fix m and n to MR and NR, respectively.
-	m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx );
-	n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx );
+	m   = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx );
+	n   = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx );
 
 	// Store the register blocksizes so that the driver can retrieve the
 	// values later when printing results.
@@ -240,6 +240,7 @@ void libblis_test_gemm_ukr_experiment
 	(
 	  BLIS_MR,
 	  BLIS_KR,
+	  BLIS_BBM,
 	  BLIS_NO_INVERT_DIAG,
 	  BLIS_PACKED_PANELS,
 	  BLIS_BUFFER_FOR_A_BLOCK,
@@ -250,6 +251,7 @@ void libblis_test_gemm_ukr_experiment
 	(
 	  BLIS_NR,
 	  BLIS_KR,
+	  BLIS_BBN,
 	  BLIS_NO_INVERT_DIAG,
 	  BLIS_PACKED_PANELS,
 	  BLIS_BUFFER_FOR_B_PANEL,
diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c
index 26aad890d..341a3af30 100644
--- a/testsuite/src/test_gemmtrsm_ukr.c
+++ b/testsuite/src/test_gemmtrsm_ukr.c
@@ -90,7 +90,6 @@ void libblis_test_gemmtrsm_ukr_check
        obj_t*         bx1,
        obj_t*         b11,
        obj_t*         c11,
-       obj_t*         c11_save,
        double*        resid
      );
 
@@ -188,7 +187,6 @@ void libblis_test_gemmtrsm_ukr_experiment
 	num_t        datatype;
 
 	dim_t        m, n, k;
-	inc_t        ldap, ldbp;
 
 	char         sc_a = 'c';
 	char         sc_b = 'r';
@@ -198,7 +196,7 @@ void libblis_test_gemmtrsm_ukr_experiment
 
 	obj_t        alpha;
 	obj_t        a_big, a, b;
-	obj_t        b11, c11;
+	obj_t        a11, a1x, b11, bx1, c11;
 	obj_t        ap, bp;
 	obj_t        a1xp, a11p, bx1p, b11p;
 	obj_t        c11_save;
@@ -219,11 +217,6 @@ void libblis_test_gemmtrsm_ukr_experiment
 	m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx );
 	n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx );
 
-	// Also query PACKMR and PACKNR as the leading dimensions to ap and bp,
-	// respectively.
-	ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx );
-	ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx );
-
 	// Store the register blocksizes so that the driver can retrieve the
 	// values later when printing results.
 	op->dim_aux[0] = m;
@@ -271,14 +264,22 @@ void libblis_test_gemmtrsm_ukr_experiment
 	if ( bli_obj_is_lower( &a_big ) )
 	{
 		bli_acquire_mpart_t2b( BLIS_SUBPART1, k, m, &a_big, &a );
+		bli_acquire_mpart_l2r( BLIS_SUBPART0, k, m, &a, &a1x );
+		bli_acquire_mpart_l2r( BLIS_SUBPART1, k, m, &a, &a11 );
+		bli_acquire_mpart_t2b( BLIS_SUBPART0, k, m, &b, &bx1 );
 		bli_acquire_mpart_t2b( BLIS_SUBPART1, k, m, &b, &b11 );
 	}
 	else
 	{
 		bli_acquire_mpart_t2b( BLIS_SUBPART1, 0, m, &a_big, &a );
+		bli_acquire_mpart_l2r( BLIS_SUBPART1, 0, m, &a, &a11 );
+		bli_acquire_mpart_l2r( BLIS_SUBPART2, 0, m, &a, &a1x );
 		bli_acquire_mpart_t2b( BLIS_SUBPART1, 0, m, &b, &b11 );
+		bli_acquire_mpart_t2b( BLIS_SUBPART2, 0, m, &b, &bx1 );
 	}
 
+	bli_obj_set_struc( BLIS_GENERAL, &a1x );
+
 	// Copy B11 to C11, and save.
 	bli_copym( &b11, &c11 );
 	bli_copym( &c11, &c11_save );
@@ -289,6 +290,7 @@ void libblis_test_gemmtrsm_ukr_experiment
 	(
 	  BLIS_MR,
 	  BLIS_MR,
+	  BLIS_BBM,
 	  BLIS_INVERT_DIAG,
 	  BLIS_PACKED_PANELS,
 	  BLIS_BUFFER_FOR_A_BLOCK,
@@ -324,6 +326,7 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 		(
 		  BLIS_NR,
 		  BLIS_MR,
+		  BLIS_BBN,
 		  BLIS_NO_INVERT_DIAG,
 		  BLIS_PACKED_PANELS,
 		  BLIS_BUFFER_FOR_B_PANEL,
@@ -366,31 +369,9 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 	*perf = ( 2.0 * m * n * k + 1.0 * m * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
 	if ( bli_obj_is_complex( &b ) ) *perf *= 4.0;
 
-	// A hack to support subconfigs such as power9, which duplicate/broadcast
-	// more than one stored element per logical element in the packed copy of
-	// B. We assume that the ratio ldbp/n gives us the duplication factor used
-	// within B while the ratio ldap/m gives us the duplication factor used
-	// within A (not entirely a safe assumption, though I think it holds for
-	// all gemm ukernels currently supported within BLIS). This duplication
-	// factor must be used as the column stride of B (or the row stride of A)
-	// in order for the bli_gemmv() operation (called within the
-	// libblis_test_gemmtrsm_ukr_check()) to operate properly.
-	if ( ldbp / n > 1 )
-	{
-		const dim_t bfac = ldbp / n;
-		bli_obj_set_col_stride( bfac, &b11p );
-		bli_obj_set_col_stride( bfac, &bx1p );
-	}
-	if ( ldap / m > 1 )
-	{
-		const dim_t bfac = ldap / m;
-		bli_obj_set_row_stride( bfac, &a11p );
-		bli_obj_set_row_stride( bfac, &a1xp );
-	}
-
 	// Perform checks.
 	libblis_test_gemmtrsm_ukr_check( params, side, &alpha,
-	                                 &a1xp, &a11p, &bx1p, &b11p, &c11, &c11_save, resid );
+	                                 &a1x, &a11, &bx1, &b11, &c11, resid );
 
 	// Zero out performance and residual if output matrix is empty.
 	//libblis_test_check_empty_problem( &c11, perf, resid );
@@ -446,15 +427,14 @@ void libblis_test_gemmtrsm_ukr_check
        obj_t*         bx1,
        obj_t*         b11,
        obj_t*         c11,
-       obj_t*         c11_orig,
        double*        resid
      )
 {
-	num_t  dt      = bli_obj_dt( b11 );
-	num_t  dt_real = bli_obj_dt_proj_to_real( b11 );
+	num_t  dt      = bli_obj_dt( c11 );
+	num_t  dt_real = bli_obj_dt_proj_to_real( c11 );
 
-	dim_t  m       = bli_obj_length( b11 );
-	dim_t  n       = bli_obj_width( b11 );
+	dim_t  m       = bli_obj_length( c11 );
+	dim_t  n       = bli_obj_width( c11 );
 	dim_t  k       = bli_obj_width( a1x );
 
 	obj_t  norm;
@@ -464,13 +444,12 @@ void libblis_test_gemmtrsm_ukr_check
 
 	//
 	// Pre-conditions:
-	// - a1x, a11, bx1, c11_orig are randomized; a11 is triangular.
-	// - contents of b11 == contents of c11.
+	// - a1x, a11, bx1, b11 are randomized; a11 is triangular.
 	// - side == BLIS_LEFT.
 	//
 	// Under these conditions, we assume that the implementation for
 	//
-	//   B := inv(A11) * ( alpha * B11 - A1x * Bx1 )       (side = left)
+	//   C11 := inv(A11) * ( alpha * B11 - A1x * Bx1 )       (side = left)
 	//
 	// is functioning correctly if
 	//
@@ -478,11 +457,11 @@ void libblis_test_gemmtrsm_ukr_check
 	//
 	// is negligible, where
 	//
-	//   v = B11 * t
+	//   v = C11 * t
 	//
-	//   z = ( inv(A11) * ( alpha * B11_orig - A1x * Bx1 ) ) * t
-	//     = inv(A11) * ( alpha * B11_orig * t - A1x * Bx1 * t )
-	//     = inv(A11) * ( alpha * B11_orig * t - A1x * w )
+	//   z = ( inv(A11) * ( alpha * B11 - A1x * Bx1 ) ) * t
+	//     = inv(A11) * ( alpha * B11 * t - A1x * Bx1 * t )
+	//     = inv(A11) * ( alpha * B11 * t - A1x * w )
 	//
 
 	bli_obj_scalar_init_detached( dt_real, &norm );
@@ -502,20 +481,16 @@ void libblis_test_gemmtrsm_ukr_check
 
 	libblis_test_vobj_randomize( params, TRUE, &t );
 
-	bli_gemv( &BLIS_ONE, b11, &t, &BLIS_ZERO, &v );
+	bli_gemv( &BLIS_ONE, c11, &t, &BLIS_ZERO, &v );
 
 #if 0
 bli_printm( "a11", a11, "%5.2f", "" );
 #endif
 
-	// Restore the diagonal of a11 to its original, un-inverted state
-	// (needed for trsv).
-	bli_invertd( a11 );
-
 	if ( bli_is_left( side ) )
 	{
 		bli_gemv( &BLIS_ONE, bx1, &t, &BLIS_ZERO, &w );
-		bli_gemv( alpha, c11_orig, &t, &BLIS_ZERO, &z );
+		bli_gemv( alpha, b11, &t, &BLIS_ZERO, &z );
 		bli_gemv( &BLIS_MINUS_ONE, a1x, &w, &BLIS_ONE, &z );
 		bli_trsv( &BLIS_ONE, a11, &z );
 	}
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index 35bac9d49..d8711c9b8 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -2418,7 +2418,7 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c
 }
 
 
-thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx )
+thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, bszid_t bcast_id_m, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx )
 {
 	static packm_ker_ft GENARRAY2_MIXP(packm_struc_cxk,packm_struc_cxk);
 
@@ -2441,7 +2441,7 @@ thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, inv
 	  packm_struc_cxk[ dt ][ dt ],
 	  bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx ),
 	  bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx ),
-	  1,
+	  bli_cntx_get_blksz_def_dt( dt, bcast_id_m, cntx ),
 	  1,
 	  1,
 	  bli_cntx_get_blksz_def_dt( dt, bmult_id_n, cntx ),
diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h
index 7c1e52805..a64b1ccb6 100644
--- a/testsuite/src/test_libblis.h
+++ b/testsuite/src/test_libblis.h
@@ -421,7 +421,7 @@ void fill_string_with_n_spaces( char* str, unsigned int n_spaces );
 // --- Create object ---
 
 void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, char storage, dim_t m, dim_t n, obj_t* a );
-thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx );
+thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, bszid_t bcast_id_m, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx );
 void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x );
 
 // --- Randomize/initialize object ---
diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c
index 27d488810..0b753ca57 100644
--- a/testsuite/src/test_trsm_ukr.c
+++ b/testsuite/src/test_trsm_ukr.c
@@ -238,6 +238,7 @@ void libblis_test_trsm_ukr_experiment
 	(
 	  BLIS_MR,
 	  BLIS_MR,
+	  BLIS_BBM,
 	  BLIS_INVERT_DIAG,
 	  BLIS_PACKED_PANELS,
 	  BLIS_BUFFER_FOR_A_BLOCK,
@@ -270,6 +271,7 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 		(
 		  BLIS_NR,
 		  BLIS_MR,
+		  BLIS_BBN,
 		  BLIS_NO_INVERT_DIAG,
 		  BLIS_PACKED_PANELS,
 		  BLIS_BUFFER_FOR_B_PANEL,

From 53d21cb478801d8e978082da2889e5e67d4221c9 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Wed, 2 Apr 2025 12:03:43 -0500
Subject: [PATCH 227/230] Fix for plugins without explicit optimized kernels.

---
 configure | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/configure b/configure
index c217c4da8..edbca9ffc 100755
--- a/configure
+++ b/configure
@@ -2668,26 +2668,28 @@ build_and_check_configurations()
 		fi
 	done
 
+	if [ ! "${script_name}" = "configure-plugin" ]; then
 
-	echo "${script_name}: checking sub-configurations' requisite kernels:"
+		echo "${script_name}: checking sub-configurations' requisite kernels:"
 
-	# Also, let's verify that the requisite kernel sets associated with
-	# the config name all correspond to directories that exist.
-	for kernel in ${kernel_list}; do
+		# Also, let's verify that the requisite kernel sets associated with
+		# the config name all correspond to directories that exist.
+		for kernel in ${kernel_list}; do
 
-		echo -n "${script_name}:   '${kernel}' kernels..."
+			echo -n "${script_name}:   '${kernel}' kernels..."
 
-		# Confirm that the current kernel sub-directory exists.
-		if [ ! -d "${kernels_dirpath}/${kernel}" ]; then
-			echo "do NOT exist!"
-			echo "${script_name}: "
-			echo "${script_name}: *** Cannot continue with nonexistent kernel '${kernel}'. ***"
-			echo "${script_name}: "
-			exit 1;
-		else
-			echo "exist."
-		fi
-	done
+			# Confirm that the current kernel sub-directory exists.
+			if [ ! -d "${kernels_dirpath}/${kernel}" ]; then
+				echo "do NOT exist!"
+				echo "${script_name}: "
+				echo "${script_name}: *** Cannot continue with nonexistent kernel '${kernel}'. ***"
+				echo "${script_name}: "
+				exit 1;
+			else
+				echo "exist."
+			fi
+		done
+	fi
 }
 
 check_build_tools()

From 5d9e110a2aa58b6e5d131db9131bae0143f22f9f Mon Sep 17 00:00:00 2001
From: Minh Quan Ho <1337056+hominhquan@users.noreply.github.com>
Date: Mon, 7 Apr 2025 21:21:45 +0200
Subject: [PATCH 228/230] Examples: replace all 4.1f printm format by 4.3f
 (#865)

Details:
- This avoids possible misinterpretation of computation results printed on stdout (thanks Mason McBride for reporting it in #864).
- Also force space for positive numbers to help with alignment.
---
 examples/oapi/00obj_basic.c    |  4 +--
 examples/oapi/04level0.c       | 26 ++++++++---------
 examples/oapi/05level1v.c      | 34 +++++++++++-----------
 examples/oapi/06level1m.c      | 38 ++++++++++++-------------
 examples/oapi/07level1m_diag.c | 28 +++++++++---------
 examples/oapi/08level2.c       | 44 ++++++++++++++--------------
 examples/oapi/09level3.c       | 44 ++++++++++++++--------------
 examples/oapi/10util.c         | 52 +++++++++++++++++-----------------
 examples/oapi/11gemm_md.c      | 32 ++++++++++-----------
 examples/tapi/00level1v.c      | 34 +++++++++++-----------
 examples/tapi/01level1m.c      | 38 ++++++++++++-------------
 examples/tapi/02level1m_diag.c | 28 +++++++++---------
 examples/tapi/03level2.c       | 44 ++++++++++++++--------------
 examples/tapi/04level3.c       | 44 ++++++++++++++--------------
 examples/tapi/05util.c         | 52 +++++++++++++++++-----------------
 15 files changed, 271 insertions(+), 271 deletions(-)

diff --git a/examples/oapi/00obj_basic.c b/examples/oapi/00obj_basic.c
index f75a1cd60..126147a2e 100644
--- a/examples/oapi/00obj_basic.c
+++ b/examples/oapi/00obj_basic.c
@@ -187,7 +187,7 @@ int main( int argc, char** argv )
 	// still need to make sure that the specifier makes sense for the data
 	// being printed. For example, you shouldn't use "%d" when printing
 	// elements of type 'float'.
-	bli_printm( "matrix 'a9' contents:", &a9, "%4.1f", "" );
+	bli_printm( "matrix 'a9' contents:", &a9, "% 4.3f", "" );
 
 
 	//
@@ -200,7 +200,7 @@ int main( int argc, char** argv )
 	// When printing complex matrices, the same format specifier gets used
 	// for both the real and imaginary parts.
 	bli_randm( &a11 );
-	bli_printm( "matrix 'a11' contents (complex):", &a11, "%4.1f", "" );
+	bli_printm( "matrix 'a11' contents (complex):", &a11, "% 4.3f", "" );
 
 
 	//
diff --git a/examples/oapi/04level0.c b/examples/oapi/04level0.c
index 72fe98200..50d99dacd 100644
--- a/examples/oapi/04level0.c
+++ b/examples/oapi/04level0.c
@@ -105,10 +105,10 @@ int main( int argc, char** argv )
 
 	// BLIS does not have a special print function for scalars, but since a
 	// 1x1 is also a vector and a matrix, we can use printv or printm.
-	bli_printm( "alpha:", &alpha, "%4.1f", "" );
-	bli_printm( "beta:", &beta, "%4.1f", "" );
-	bli_printm( "kappa:", &kappa, "%4.1f", "" );
-	bli_printm( "gamma:", &gamma, "%4.1f", "" );
+	bli_printm( "alpha:", &alpha, "% 4.3f", "" );
+	bli_printm( "beta:", &beta, "% 4.3f", "" );
+	bli_printm( "kappa:", &kappa, "% 4.3f", "" );
+	bli_printm( "gamma:", &gamma, "% 4.3f", "" );
 
 
 	//
@@ -121,7 +121,7 @@ int main( int argc, char** argv )
 	// can be used.
 	bli_obj_create_1x1( BLIS_DCOMPLEX, &zeta );
 	bli_setsc( 3.3, -4.4, &zeta );
-	bli_printm( "zeta (complex):", &zeta, "%4.1f", "" );
+	bli_printm( "zeta (complex):", &zeta, "% 4.3f", "" );
 
 
 	//
@@ -133,10 +133,10 @@ int main( int argc, char** argv )
 	// We can copy scalars amongst one another, and we can use the global
 	// scalar constants for input operands.
 	bli_copysc( &beta, &gamma );
-	bli_printm( "gamma (overwritten with beta):", &gamma, "%4.1f", "" );
+	bli_printm( "gamma (overwritten with beta):", &gamma, "% 4.3f", "" );
 
 	bli_copysc( &BLIS_ONE, &gamma );
-	bli_printm( "gamma (overwritten with BLIS_ONE):", &gamma, "%4.1f", "" );
+	bli_printm( "gamma (overwritten with BLIS_ONE):", &gamma, "% 4.3f", "" );
 
 
 	//
@@ -147,24 +147,24 @@ int main( int argc, char** argv )
 
 	// BLIS defines a range of basic floating-point operations on scalars.
 	bli_addsc( &beta, &gamma );
-	bli_printm( "gamma := gamma + beta", &gamma, "%4.1f", "" );
+	bli_printm( "gamma := gamma + beta", &gamma, "% 4.3f", "" );
 
 	bli_subsc( &alpha, &gamma );
-	bli_printm( "gamma := gamma - alpha", &gamma, "%4.1f", "" );
+	bli_printm( "gamma := gamma - alpha", &gamma, "% 4.3f", "" );
 
 	bli_divsc( &kappa, &gamma );
-	bli_printm( "gamma := gamma / kappa", &gamma, "%4.1f", "" );
+	bli_printm( "gamma := gamma / kappa", &gamma, "% 4.3f", "" );
 
 	bli_sqrtsc( &gamma, &gamma );
-	bli_printm( "gamma := sqrt( gamma )", &gamma, "%4.1f", "" );
+	bli_printm( "gamma := sqrt( gamma )", &gamma, "% 4.3f", "" );
 
 	bli_normfsc( &alpha, &alpha );
-	bli_printm( "alpha := normf( alpha ) # normf() = abs() in real domain.", &alpha, "%4.1f", "" );
+	bli_printm( "alpha := normf( alpha ) # normf() = abs() in real domain.", &alpha, "% 4.3f", "" );
 
 	// Note that normfsc() allows complex input objects, but requires that the
 	// output operand (the second operand) be a real object.
 	bli_normfsc( &zeta, &alpha );
-	bli_printm( "alpha := normf( zeta )  # normf() = complex modulus in complex domain.", &alpha, "%4.1f", "" );
+	bli_printm( "alpha := normf( zeta )  # normf() = complex modulus in complex domain.", &alpha, "% 4.3f", "" );
 
 	bli_invertsc( &gamma, &gamma );
 	bli_printm( "gamma := 1.0 / gamma", &gamma, "%4.2f", "" );
diff --git a/examples/oapi/05level1v.c b/examples/oapi/05level1v.c
index 675743624..b76e5df5f 100644
--- a/examples/oapi/05level1v.c
+++ b/examples/oapi/05level1v.c
@@ -79,9 +79,9 @@ int main( int argc, char** argv )
 	bli_setsc( 0.2, 0.0, &beta );
 	bli_setsc( 3.0, 0.0, &gamma );
 
-	bli_printm( "alpha:", &alpha, "%4.1f", "" );
-	bli_printm( "beta:", &beta, "%4.1f", "" );
-	bli_printm( "gamma:", &gamma, "%4.1f", "" );
+	bli_printm( "alpha:", &alpha, "% 4.3f", "" );
+	bli_printm( "beta:", &beta, "% 4.3f", "" );
+	bli_printm( "gamma:", &gamma, "% 4.3f", "" );
 
 	// Vectors can set by "broadcasting" a constant to every element.
 	bli_setv( &BLIS_ONE, &x );
@@ -93,9 +93,9 @@ int main( int argc, char** argv )
 	// orientation of the vector (row or column) when printing, whereas
 	// printv always prints vectors as column vectors regardless of their
 	// they are 1 x n or n x 1.
-	bli_printm( "x := 1.0", &x, "%4.1f", "" );
-	bli_printm( "y := alpha", &y, "%4.1f", "" );
-	bli_printm( "z := 0.0", &z, "%4.1f", "" );
+	bli_printm( "x := 1.0", &x, "% 4.3f", "" );
+	bli_printm( "y := alpha", &y, "% 4.3f", "" );
+	bli_printm( "z := 0.0", &z, "% 4.3f", "" );
 
 
 	//
@@ -107,7 +107,7 @@ int main( int argc, char** argv )
 	// Set a vector to random values.
 	bli_randv( &w );
 
-	bli_printm( "w := randv()", &w, "%4.1f", "" );
+	bli_printm( "w := randv()", &w, "% 4.3f", "" );
 
 
 	//
@@ -118,38 +118,38 @@ int main( int argc, char** argv )
 
 	// Copy a vector.
 	bli_copyv( &w, &a );
-	bli_printm( "a := w", &a, "%4.1f", "" );
+	bli_printm( "a := w", &a, "% 4.3f", "" );
 
 	// Add and subtract vectors.
 	bli_addv( &y, &a );
-	bli_printm( "a := a + y", &a, "%4.1f", "" );
+	bli_printm( "a := a + y", &a, "% 4.3f", "" );
 
 	bli_subv( &w, &a );
-	bli_printm( "a := a - w", &a, "%4.1f", "" );
+	bli_printm( "a := a - w", &a, "% 4.3f", "" );
 
 	// Scale a vector (destructive).
 	bli_scalv( &beta, &a );
-	bli_printm( "a := beta * a", &a, "%4.1f", "" );
+	bli_printm( "a := beta * a", &a, "% 4.3f", "" );
 
 	// Scale a vector (non-destructive).
 	bli_scal2v( &gamma, &a, &z );
-	bli_printm( "z := gamma * a", &z, "%4.1f", "" );
+	bli_printm( "z := gamma * a", &z, "% 4.3f", "" );
 
 	// Scale and accumulate between vectors.
 	bli_axpyv( &alpha, &w, &x );
-	bli_printm( "x := x + alpha * w", &x, "%4.1f", "" );
+	bli_printm( "x := x + alpha * w", &x, "% 4.3f", "" );
 
 	bli_xpbyv( &w, &BLIS_MINUS_ONE, &x );
-	bli_printm( "x := -1.0 * x + w", &x, "%4.1f", "" );
+	bli_printm( "x := -1.0 * x + w", &x, "% 4.3f", "" );
 
 	// Invert a vector element-wise.
 	bli_invertv( &y );
-	bli_printm( "y := 1 / y", &y, "%4.1f", "" );
+	bli_printm( "y := 1 / y", &y, "% 4.3f", "" );
 
 	// Swap two vectors.
 	bli_swapv( &x, &y );
-	bli_printm( "x (after swapping with y)", &x, "%4.1f", "" );
-	bli_printm( "y (after swapping with x)", &y, "%4.1f", "" );
+	bli_printm( "x (after swapping with y)", &x, "% 4.3f", "" );
+	bli_printm( "y (after swapping with x)", &y, "% 4.3f", "" );
 
 
 	//
diff --git a/examples/oapi/06level1m.c b/examples/oapi/06level1m.c
index 9d57d3149..744375cb9 100644
--- a/examples/oapi/06level1m.c
+++ b/examples/oapi/06level1m.c
@@ -76,9 +76,9 @@ int main( int argc, char** argv )
 	bli_setsc( 0.2, 0.0, &beta );
 	bli_setsc( 3.0, 0.0, &gamma );
 
-	bli_printm( "alpha:", &alpha, "%4.1f", "" );
-	bli_printm( "beta:", &beta, "%4.1f", "" );
-	bli_printm( "gamma:", &gamma, "%4.1f", "" );
+	bli_printm( "alpha:", &alpha, "% 4.3f", "" );
+	bli_printm( "beta:", &beta, "% 4.3f", "" );
+	bli_printm( "gamma:", &gamma, "% 4.3f", "" );
 
 	// Matrices, like vectors, can set by "broadcasting" a constant to every
 	// element.
@@ -86,9 +86,9 @@ int main( int argc, char** argv )
 	bli_setm( &alpha, &b );
 	bli_setm( &BLIS_ZERO, &c );
 
-	bli_printm( "a := 1.0", &a, "%4.1f", "" );
-	bli_printm( "b := alpha", &b, "%4.1f", "" );
-	bli_printm( "c := 0.0", &c, "%4.1f", "" );
+	bli_printm( "a := 1.0", &a, "% 4.3f", "" );
+	bli_printm( "b := alpha", &b, "% 4.3f", "" );
+	bli_printm( "c := 0.0", &c, "% 4.3f", "" );
 
 
 	//
@@ -100,7 +100,7 @@ int main( int argc, char** argv )
 	// Set a matrix to random values.
 	bli_randm( &e );
 
-	bli_printm( "e (randomized):", &e, "%4.1f", "" );
+	bli_printm( "e (randomized):", &e, "% 4.3f", "" );
 
 
 	//
@@ -111,26 +111,26 @@ int main( int argc, char** argv )
 
 	// Copy a matrix.
 	bli_copym( &e, &d );
-	bli_printm( "d := e", &d, "%4.1f", "" );
+	bli_printm( "d := e", &d, "% 4.3f", "" );
 
 	// Add and subtract vectors.
 	bli_addm( &a, &d );
-	bli_printm( "d := d + a", &d, "%4.1f", "" );
+	bli_printm( "d := d + a", &d, "% 4.3f", "" );
 
 	bli_subm( &a, &e );
-	bli_printm( "e := e - a", &e, "%4.1f", "" );
+	bli_printm( "e := e - a", &e, "% 4.3f", "" );
 
 	// Scale a matrix (destructive).
 	bli_scalm( &alpha, &e );
-	bli_printm( "e := alpha * e", &e, "%4.1f", "" );
+	bli_printm( "e := alpha * e", &e, "% 4.3f", "" );
 
 	// Scale a matrix (non-destructive).
 	bli_scal2m( &beta, &e, &c );
-	bli_printm( "c := beta * e", &c, "%4.1f", "" );
+	bli_printm( "c := beta * e", &c, "% 4.3f", "" );
 
 	// Scale and accumulate between matrices.
 	bli_axpym( &alpha, &a, &c );
-	bli_printm( "c := c + alpha * a", &c, "%4.1f", "" );
+	bli_printm( "c := c + alpha * a", &c, "% 4.3f", "" );
 
 
 	//
@@ -146,8 +146,8 @@ int main( int argc, char** argv )
 	// Initialize all of 'f' to -1.0 to simulate junk values.
 	bli_setm( &BLIS_MINUS_ONE, &f );
 
-	bli_printm( "e:", &e, "%4.1f", "" );
-	bli_printm( "f (initial value):", &f, "%4.1f", "" );
+	bli_printm( "e:", &e, "% 4.3f", "" );
+	bli_printm( "f (initial value):", &f, "% 4.3f", "" );
 
 	// Since we are going to copy 'e' to 'f', we need to indicate a transpose
 	// on 'e', the input operand. Transposition can be indicated by setting a
@@ -173,7 +173,7 @@ int main( int argc, char** argv )
 	// when marking an operand for transposition, not the destination.
 	bli_copym( &e, &f );
 
-	bli_printm( "f (copied value):", &f, "%4.1f", "" );
+	bli_printm( "f (copied value):", &f, "% 4.3f", "" );
 
 
 	//
@@ -194,8 +194,8 @@ int main( int argc, char** argv )
 	// Initialize all of 'h' to -1.0 to simulate junk values.
 	bli_setm( &BLIS_MINUS_ONE, &h );
 
-	bli_printm( "g:", &g, "%4.1f", "" );
-	bli_printm( "h (initial value):", &h, "%4.1f", "" );
+	bli_printm( "g:", &g, "% 4.3f", "" );
+	bli_printm( "h (initial value):", &h, "% 4.3f", "" );
 
 	// Set both the transpose and conjugation bits.
 	bli_obj_toggle_trans( &g );
@@ -206,7 +206,7 @@ int main( int argc, char** argv )
 	// conjugation.
 	bli_copym( &g, &h );
 
-	bli_printm( "h (copied value):", &h, "%4.1f", "" );
+	bli_printm( "h (copied value):", &h, "% 4.3f", "" );
 
 
 	// Free the objects.
diff --git a/examples/oapi/07level1m_diag.c b/examples/oapi/07level1m_diag.c
index ca009820c..95bc023de 100644
--- a/examples/oapi/07level1m_diag.c
+++ b/examples/oapi/07level1m_diag.c
@@ -73,7 +73,7 @@ int main( int argc, char** argv )
 	// Now set the upper triangle to random values.
 	bli_randm( &a );
 
-	bli_printm( "a: randomize upper part (lower part may contain garbage)", &a, "%4.1f", "" );
+	bli_printm( "a: randomize upper part (lower part may contain garbage)", &a, "% 4.3f", "" );
 
 
 	//
@@ -125,14 +125,14 @@ int main( int argc, char** argv )
 	// triangle of 'bl' to zero).
 	bli_setm( &BLIS_ZERO, &bl );
 
-	bli_printm( "b: randomize upper part; set strictly lower part to 0.0", &b, "%4.1f", "" );
+	bli_printm( "b: randomize upper part; set strictly lower part to 0.0", &b, "% 4.3f", "" );
 
 	// You may not see the effect of setting the strictly lower part to zero,
 	// since those values may already be zero (instead of random junk). So
 	// let's set it to something you'll notice, like -1.0.
 	bli_setm( &BLIS_MINUS_ONE, &bl );
 
-	bli_printm( "b: randomize upper part; set strictly lower part to -1.0", &b, "%4.1f", "" );
+	bli_printm( "b: randomize upper part; set strictly lower part to -1.0", &b, "% 4.3f", "" );
 
 
 	//
@@ -158,7 +158,7 @@ int main( int argc, char** argv )
 	// uninitialized, the strictly upper part could contain junk.
 	bli_copym( &bl, &c );
 
-	bli_printm( "c: copy lower part of b (upper part may contain garbage)", &c, "%4.1f", "" );
+	bli_printm( "c: copy lower part of b (upper part may contain garbage)", &c, "% 4.3f", "" );
 
 	// Notice that the structure and uplo properties of 'c' were set to their
 	// default values, BLIS_GENERAL and BLIS_DENSE, respectively. Thus, it is
@@ -178,7 +178,7 @@ int main( int argc, char** argv )
 	// ignore the "unstored" regions of input operands because they are assumed
 	// to be zero).
 
-	bli_printm( "a: copy lower triangular bl to upper triangular a", &a, "%4.1f", "" );
+	bli_printm( "a: copy lower triangular bl to upper triangular a", &a, "% 4.3f", "" );
 
 
 	//
@@ -198,7 +198,7 @@ int main( int argc, char** argv )
 	// Let's start by setting entire destination matrix to zero.
 	bli_setm( &BLIS_ZERO, &d );
 
-	bli_printm( "d: initial value (all zeros)", &d, "%4.1f", "" );
+	bli_printm( "d: initial value (all zeros)", &d, "% 4.3f", "" );
 
 	// Recall that 'bl' is marked as lower triangular with a diagonal offset
 	// of 0. Also recall that 'bl' is an alias of 'b', which is now fully
@@ -210,7 +210,7 @@ int main( int argc, char** argv )
 	bli_setijm( 3.1, 0.0, 3, 1, &bl );
 	bli_setijm( 3.2, 0.0, 3, 2, &bl );
 
-	bli_printm( "bl: lower triangular bl is aliased to b", &bl, "%4.1f", "" );
+	bli_printm( "bl: lower triangular bl is aliased to b", &bl, "% 4.3f", "" );
 
 	// We want to pluck out the lower triangle and transpose it into the upper
 	// triangle of 'd'.
@@ -221,7 +221,7 @@ int main( int argc, char** argv )
 	// 'd'. It's the source operand that matters, not the destination!)
 	bli_copym( &bl, &d );
 
-	bli_printm( "d: transpose of lower triangular of bl copied to d", &d, "%4.1f", "" );
+	bli_printm( "d: transpose of lower triangular of bl copied to d", &d, "% 4.3f", "" );
 
 
 	//
@@ -242,7 +242,7 @@ int main( int argc, char** argv )
 	// Initialize the entire matrix to -1.0 to simulate junk values.
 	bli_setm( &BLIS_MINUS_ONE, &e );
 
-	bli_printm( "e: initial value (all -1.0)", &e, "%4.1f", "" );
+	bli_printm( "e: initial value (all -1.0)", &e, "% 4.3f", "" );
 
 	// Create an alias to work with.
 	bli_obj_alias_to( &e, &el );
@@ -258,7 +258,7 @@ int main( int argc, char** argv )
 	// Randomize the lower trapezoid.
 	bli_randm( &el );
 
-	bli_printm( "e: after lower trapezoid randomized", &e, "%4.1f", "" );
+	bli_printm( "e: after lower trapezoid randomized", &e, "% 4.3f", "" );
 
 	// Move the diagonal offset of 'el' to 1 and flip the uplo field to
 	// "upper".
@@ -268,7 +268,7 @@ int main( int argc, char** argv )
 	// Set the upper triangle to zero.
 	bli_setm( &BLIS_ZERO, &el );
 
-	bli_printm( "e: after upper triangle set to zero", &e, "%4.1f", "" );
+	bli_printm( "e: after upper triangle set to zero", &e, "% 4.3f", "" );
 
 
 	//
@@ -288,7 +288,7 @@ int main( int argc, char** argv )
 	// Initialize the entire matrix to -1.0 to simulate junk values.
 	bli_setm( &BLIS_MINUS_ONE, &h );
 
-	bli_printm( "h: initial value (all -1.0)", &h, "%4.1f", "" );
+	bli_printm( "h: initial value (all -1.0)", &h, "% 4.3f", "" );
 
 	// Set the diagonal offset of 'h' to -1.
 	bli_obj_set_diag_offset( -1, &h );
@@ -300,7 +300,7 @@ int main( int argc, char** argv )
 	// Randomize the elements on and above the first subdiagonal.
 	bli_randm( &h );
 
-	bli_printm( "h: after randomizing above first subdiagonal", &h, "%4.1f", "" );
+	bli_printm( "h: after randomizing above first subdiagonal", &h, "% 4.3f", "" );
 
 	// Create an alias to work with.
 	bli_obj_alias_to( &h, &hl );
@@ -313,7 +313,7 @@ int main( int argc, char** argv )
 	// the second subdiagonal) to zero.
 	bli_setm( &BLIS_ZERO, &hl );
 
-	bli_printm( "h: after setting elements below first subdiagonal to zero", &h, "%4.1f", "" );
+	bli_printm( "h: after setting elements below first subdiagonal to zero", &h, "% 4.3f", "" );
 
 
 	// Free the objects.
diff --git a/examples/oapi/08level2.c b/examples/oapi/08level2.c
index 09e61722d..e74fe64f1 100644
--- a/examples/oapi/08level2.c
+++ b/examples/oapi/08level2.c
@@ -74,14 +74,14 @@ int main( int argc, char** argv )
 	// Initialize 'a' to 1.0.
 	bli_setm( &BLIS_ONE, &a );
 
-	bli_printm( "x: set to random values", &x, "%4.1f", "" );
-	bli_printm( "y: set to -1.0", &y, "%4.1f", "" );
-	bli_printm( "a: initial value", &a, "%4.1f", "" );
+	bli_printm( "x: set to random values", &x, "% 4.3f", "" );
+	bli_printm( "y: set to -1.0", &y, "% 4.3f", "" );
+	bli_printm( "a: initial value", &a, "% 4.3f", "" );
 
 	// a := a + alpha * x * y, where 'a' is general.
 	bli_ger( alpha, &x, &y, &a );
 
-	bli_printm( "a: after ger", &a, "%4.1f", "" );
+	bli_printm( "a: after ger", &a, "% 4.3f", "" );
 
 	// Free the objects.
 	bli_obj_free( &a );
@@ -122,14 +122,14 @@ int main( int argc, char** argv )
 	// Randomize 'a'.
 	bli_randm( &a );
 
-	bli_printm( "a: randomized", &a, "%4.1f", "" );
-	bli_printm( "x: set to 1.0", &x, "%4.1f", "" );
-	bli_printm( "y: initial value", &y, "%4.1f", "" );
+	bli_printm( "a: randomized", &a, "% 4.3f", "" );
+	bli_printm( "x: set to 1.0", &x, "% 4.3f", "" );
+	bli_printm( "y: initial value", &y, "% 4.3f", "" );
 
 	// y := beta * y + alpha * a * x, where 'a' is general.
 	bli_gemv( alpha, &a, &x, beta, &y );
 
-	bli_printm( "y: after gemv", &y, "%4.1f", "" );
+	bli_printm( "y: after gemv", &y, "% 4.3f", "" );
 
 	// Free the objects.
 	bli_obj_free( &a );
@@ -165,13 +165,13 @@ int main( int argc, char** argv )
 	bli_obj_set_uplo( BLIS_LOWER, &a );
 	bli_randm( &a );
 
-	bli_printm( "x: set to random values", &x, "%4.1f", "" );
-	bli_printm( "a: initial value (zeros in upper triangle)", &a, "%4.1f", "" );
+	bli_printm( "x: set to random values", &x, "% 4.3f", "" );
+	bli_printm( "a: initial value (zeros in upper triangle)", &a, "% 4.3f", "" );
 
 	// a := a + alpha * x * x^T, where 'a' is symmetric and lower-stored.
 	bli_syr( alpha, &x, &a );
 
-	bli_printm( "a: after syr", &a, "%4.1f", "" );
+	bli_printm( "a: after syr", &a, "% 4.3f", "" );
 
 	// Free the objects.
 	bli_obj_free( &a );
@@ -209,14 +209,14 @@ int main( int argc, char** argv )
 	bli_obj_set_uplo( BLIS_UPPER, &a );
 	bli_randm( &a );
 
-	bli_printm( "a: randomized (zeros in lower triangle)", &a, "%4.1f", "" );
-	bli_printm( "x: set to 1.0", &x, "%4.1f", "" );
-	bli_printm( "y: initial value", &y, "%4.1f", "" );
+	bli_printm( "a: randomized (zeros in lower triangle)", &a, "% 4.3f", "" );
+	bli_printm( "x: set to 1.0", &x, "% 4.3f", "" );
+	bli_printm( "y: initial value", &y, "% 4.3f", "" );
 
 	// y := beta * y + alpha * a * x, where 'a' is symmetric and upper-stored.
 	bli_symv( alpha, &a, &x, beta, &y );
 
-	bli_printm( "y: after symv", &y, "%4.1f", "" );
+	bli_printm( "y: after symv", &y, "% 4.3f", "" );
 
 	// Free the objects.
 	bli_obj_free( &a );
@@ -253,13 +253,13 @@ int main( int argc, char** argv )
 	bli_obj_set_diag( BLIS_NONUNIT_DIAG, &a );
 	bli_randm( &a );
 
-	bli_printm( "a: randomized (zeros in upper triangle)", &a, "%4.1f", "" );
-	bli_printm( "x: initial value", &x, "%4.1f", "" );
+	bli_printm( "a: randomized (zeros in upper triangle)", &a, "% 4.3f", "" );
+	bli_printm( "x: initial value", &x, "% 4.3f", "" );
 
 	// x := alpha * a * x, where 'a' is triangular and lower-stored.
 	bli_trmv( alpha, &a, &x );
 
-	bli_printm( "x: after trmv", &x, "%4.1f", "" );
+	bli_printm( "x: after trmv", &x, "% 4.3f", "" );
 
 	// Free the objects.
 	bli_obj_free( &a );
@@ -301,21 +301,21 @@ int main( int argc, char** argv )
 	// that the matrix is not singular (singular matrices have no inverse).
 	bli_shiftd( &BLIS_TWO, &a );
 
-	bli_printm( "a: randomized (zeros in upper triangle)", &a, "%4.1f", "" );
-	bli_printm( "b: initial value", &b, "%4.1f", "" );
+	bli_printm( "a: randomized (zeros in upper triangle)", &a, "% 4.3f", "" );
+	bli_printm( "b: initial value", &b, "% 4.3f", "" );
 
 	// solve a * x = alpha * b, where 'a' is triangular and lower-stored, and
 	// overwrite b with the solution vector x.
 	bli_trsv( alpha, &a, &b );
 
-	bli_printm( "b: after trsv", &b, "%4.1f", "" );
+	bli_printm( "b: after trsv", &b, "% 4.3f", "" );
 
 	// We can confirm the solution by comparing the product of a and x to the
 	// original value of b.
 	bli_copyv( &b, &y );
 	bli_trmv( alpha, &a, &y );
 
-	bli_printm( "y: should equal initial value of b", &y, "%4.1f", "" );
+	bli_printm( "y: should equal initial value of b", &y, "% 4.3f", "" );
 
 	// Free the objects.
 	bli_obj_free( &a );
diff --git a/examples/oapi/09level3.c b/examples/oapi/09level3.c
index 27ec78c52..853014518 100644
--- a/examples/oapi/09level3.c
+++ b/examples/oapi/09level3.c
@@ -74,14 +74,14 @@ int main( int argc, char** argv )
 	bli_setm( &BLIS_ONE, &b );
 	bli_setm( &BLIS_ZERO, &c );
 
-	bli_printm( "a: randomized", &a, "%4.1f", "" );
-	bli_printm( "b: set to 1.0", &b, "%4.1f", "" );
-	bli_printm( "c: initial value", &c, "%4.1f", "" );
+	bli_printm( "a: randomized", &a, "% 4.3f", "" );
+	bli_printm( "b: set to 1.0", &b, "% 4.3f", "" );
+	bli_printm( "c: initial value", &c, "% 4.3f", "" );
 
 	// c := beta * c + alpha * a * b, where 'a', 'b', and 'c' are general.
 	bli_gemm( alpha, &a, &b, beta, &c );
 
-	bli_printm( "c: after gemm", &c, "%4.1f", "" );
+	bli_printm( "c: after gemm", &c, "% 4.3f", "" );
 
 	// Free the objects.
 	bli_obj_free( &a );
@@ -115,14 +115,14 @@ int main( int argc, char** argv )
 	// Set the transpose bit in 'a'.
 	bli_obj_toggle_trans( &a );
 
-	bli_printm( "a: randomized", &a, "%4.1f", "" );
-	bli_printm( "b: set to 1.0", &b, "%4.1f", "" );
-	bli_printm( "c: initial value", &c, "%4.1f", "" );
+	bli_printm( "a: randomized", &a, "% 4.3f", "" );
+	bli_printm( "b: set to 1.0", &b, "% 4.3f", "" );
+	bli_printm( "c: initial value", &c, "% 4.3f", "" );
 
 	// c := beta * c + alpha * a^T * b, where 'a', 'b', and 'c' are general.
 	bli_gemm( alpha, &a, &b, beta, &c );
 
-	bli_printm( "c: after gemm", &c, "%4.1f", "" );
+	bli_printm( "c: after gemm", &c, "% 4.3f", "" );
 
 	// Free the objects.
 	bli_obj_free( &a );
@@ -155,13 +155,13 @@ int main( int argc, char** argv )
 	bli_obj_set_uplo( BLIS_LOWER, &c );
 	bli_randm( &c );
 
-	bli_printm( "a: set to random values", &a, "%4.1f", "" );
-	bli_printm( "c: initial value (zeros in upper triangle)", &c, "%4.1f", "" );
+	bli_printm( "a: set to random values", &a, "% 4.3f", "" );
+	bli_printm( "c: initial value (zeros in upper triangle)", &c, "% 4.3f", "" );
 
 	// c := c + alpha * a * a^T, where 'c' is symmetric and lower-stored.
 	bli_syrk( alpha, &a, beta, &c );
 
-	bli_printm( "c: after syrk", &c, "%4.1f", "" );
+	bli_printm( "c: after syrk", &c, "% 4.3f", "" );
 
 	// Free the objects.
 	bli_obj_free( &c );
@@ -202,16 +202,16 @@ int main( int argc, char** argv )
 	bli_obj_set_uplo( BLIS_UPPER, &a );
 	bli_randm( &a );
 
-	bli_printm( "a: randomized (zeros in lower triangle)", &a, "%4.1f", "" );
-	bli_printm( "b: set to 1.0", &b, "%4.1f", "" );
-	bli_printm( "c: initial value", &c, "%4.1f", "" );
+	bli_printm( "a: randomized (zeros in lower triangle)", &a, "% 4.3f", "" );
+	bli_printm( "b: set to 1.0", &b, "% 4.3f", "" );
+	bli_printm( "c: initial value", &c, "% 4.3f", "" );
 
 	// c := beta * c + alpha * a * b, where 'a' is symmetric and upper-stored.
 	// Note that the first 'side' operand indicates the side from which matrix
 	// 'a' is multiplied into 'b'.
 	bli_symm( side, alpha, &a, &b, beta, &c );
 
-	bli_printm( "c: after symm", &c, "%4.1f", "" );
+	bli_printm( "c: after symm", &c, "% 4.3f", "" );
 
 	// Free the objects.
 	bli_obj_free( &a );
@@ -251,13 +251,13 @@ int main( int argc, char** argv )
 	bli_obj_set_diag( BLIS_NONUNIT_DIAG, &a );
 	bli_randm( &a );
 
-	bli_printm( "a: randomized (zeros in upper triangle)", &a, "%4.1f", "" );
-	bli_printm( "b: initial value", &b, "%4.1f", "" );
+	bli_printm( "a: randomized (zeros in upper triangle)", &a, "% 4.3f", "" );
+	bli_printm( "b: initial value", &b, "% 4.3f", "" );
 
 	// b := alpha * a * b, where 'a' is triangular and lower-stored.
 	bli_trmm( side, alpha, &a, &b );
 
-	bli_printm( "x: after trmm", &b, "%4.1f", "" );
+	bli_printm( "x: after trmm", &b, "% 4.3f", "" );
 
 	// Free the objects.
 	bli_obj_free( &a );
@@ -303,21 +303,21 @@ int main( int argc, char** argv )
 	// that the matrix is not singular (singular matrices have no inverse).
 	bli_shiftd( &BLIS_TWO, &a );
 
-	bli_printm( "a: randomized (zeros in upper triangle)", &a, "%4.1f", "" );
-	bli_printm( "b: initial value", &b, "%4.1f", "" );
+	bli_printm( "a: randomized (zeros in upper triangle)", &a, "% 4.3f", "" );
+	bli_printm( "b: initial value", &b, "% 4.3f", "" );
 
 	// solve a * x = alpha * b, where 'a' is triangular and lower-stored, and
 	// overwrite b with the solution matrix x.
 	bli_trsm( side, alpha, &a, &b );
 
-	bli_printm( "b: after trsm", &b, "%4.1f", "" );
+	bli_printm( "b: after trsm", &b, "% 4.3f", "" );
 
 	// We can confirm the solution by comparing the product of a and x to the
 	// original value of b.
 	bli_copym( &b, &c );
 	bli_trmm( side, alpha, &a, &c );
 
-	bli_printm( "c: should equal initial value of b", &c, "%4.1f", "" );
+	bli_printm( "c: should equal initial value of b", &c, "% 4.3f", "" );
 
 	// Free the objects.
 	bli_obj_free( &a );
diff --git a/examples/oapi/10util.c b/examples/oapi/10util.c
index 2b9226f7b..fc32209f4 100644
--- a/examples/oapi/10util.c
+++ b/examples/oapi/10util.c
@@ -73,18 +73,18 @@ int main( int argc, char** argv )
 	bli_randv( &x );
 	bli_randv( &y );
 
-	bli_printm( "x:", &x, "%4.1f", "" );
+	bli_printm( "x:", &x, "% 4.3f", "" );
 
 	// Compute the one, infinity, and frobenius norms of 'x'.
 	bli_norm1v( &x, &norm1 );
 	bli_normiv( &x, &normi );
 	bli_normfv( &x, &normf );
 
-	bli_printm( "x: 1-norm:", &norm1, "%4.1f", "" );
-	bli_printm( "x: infinity norm:", &normi, "%4.1f", "" );
-	bli_printm( "x: frobenius norm:", &normf, "%4.1f", "" );
+	bli_printm( "x: 1-norm:", &norm1, "% 4.3f", "" );
+	bli_printm( "x: infinity norm:", &normi, "% 4.3f", "" );
+	bli_printm( "x: frobenius norm:", &normf, "% 4.3f", "" );
 
-	bli_printm( "y:", &y, "%4.1f", "" );
+	bli_printm( "y:", &y, "% 4.3f", "" );
 
 	// Compute the one, infinity, and frobenius norms of 'y'. Note that we
 	// can reuse the same scalars from before for computing norms of
@@ -93,9 +93,9 @@ int main( int argc, char** argv )
 	bli_normiv( &y, &normi );
 	bli_normfv( &y, &normf );
 
-	bli_printm( "y: 1-norm:", &norm1, "%4.1f", "" );
-	bli_printm( "y: infinity norm:", &normi, "%4.1f", "" );
-	bli_printm( "y: frobenius norm:", &normf, "%4.1f", "" );
+	bli_printm( "y: 1-norm:", &norm1, "% 4.3f", "" );
+	bli_printm( "y: infinity norm:", &normi, "% 4.3f", "" );
+	bli_printm( "y: frobenius norm:", &normf, "% 4.3f", "" );
 
 
 	//
@@ -113,27 +113,27 @@ int main( int argc, char** argv )
 	bli_randm( &a );
 	bli_randm( &b );
 
-	bli_printm( "a:", &a, "%4.1f", "" );
+	bli_printm( "a:", &a, "% 4.3f", "" );
 
 	// Compute the one, infinity, and frobenius norms of 'a'.
 	bli_norm1m( &a, &norm1 );
 	bli_normim( &a, &normi );
 	bli_normfm( &a, &normf );
 
-	bli_printm( "a: 1-norm:", &norm1, "%4.1f", "" );
-	bli_printm( "a: infinity norm:", &normi, "%4.1f", "" );
-	bli_printm( "a: frobenius norm:", &normf, "%4.1f", "" );
+	bli_printm( "a: 1-norm:", &norm1, "% 4.3f", "" );
+	bli_printm( "a: infinity norm:", &normi, "% 4.3f", "" );
+	bli_printm( "a: frobenius norm:", &normf, "% 4.3f", "" );
 
-	bli_printm( "b:", &b, "%4.1f", "" );
+	bli_printm( "b:", &b, "% 4.3f", "" );
 
 	// Compute the one-norm of 'b'.
 	bli_norm1m( &b, &norm1 );
 	bli_normim( &b, &normi );
 	bli_normfm( &b, &normf );
 
-	bli_printm( "b: 1-norm:", &norm1, "%4.1f", "" );
-	bli_printm( "b: infinity norm:", &normi, "%4.1f", "" );
-	bli_printm( "b: frobenius norm:", &normf, "%4.1f", "" );
+	bli_printm( "b: 1-norm:", &norm1, "% 4.3f", "" );
+	bli_printm( "b: infinity norm:", &normi, "% 4.3f", "" );
+	bli_printm( "b: frobenius norm:", &normf, "% 4.3f", "" );
 
 
 	//
@@ -157,13 +157,13 @@ int main( int argc, char** argv )
 	// Randomize the lower triangle of 'c'.
 	bli_randm( &c );
 
-	bli_printm( "c (initial state):", &c, "%4.1f", "" );
+	bli_printm( "c (initial state):", &c, "% 4.3f", "" );
 
 	// mksymm on a real matrix transposes the stored triangle into the
 	// unstored triangle, making the matrix densely symmetric.
 	bli_mksymm( &c );
 
-	bli_printm( "c (after mksymm on lower triangle):", &c, "%4.1f", "" );
+	bli_printm( "c (after mksymm on lower triangle):", &c, "% 4.3f", "" );
 
 	// Digression: Most people think only of complex matrices as being able
 	// to be complex. However, in BLIS, we define Hermitian operations on
@@ -180,13 +180,13 @@ int main( int argc, char** argv )
 	// Randomize the lower triangle of 'd'.
 	bli_randm( &d );
 
-	bli_printm( "d (initial state):", &d, "%4.1f", "" );
+	bli_printm( "d (initial state):", &d, "% 4.3f", "" );
 
 	// mkherm on a real matrix behaves the same as mksymm, as there are no
 	// imaginary elements to conjugate.
 	bli_mkherm( &d );
 
-	bli_printm( "d (after mkherm on lower triangle):", &d, "%4.1f", "" );
+	bli_printm( "d (after mkherm on lower triangle):", &d, "% 4.3f", "" );
 
 
 	//
@@ -210,13 +210,13 @@ int main( int argc, char** argv )
 	// Randomize the upper triangle of 'e'.
 	bli_randm( &e );
 
-	bli_printm( "e (initial state):", &e, "%4.1f", "" );
+	bli_printm( "e (initial state):", &e, "% 4.3f", "" );
 
 	// mksymm on a complex matrix transposes the stored triangle into the
 	// unstored triangle.
 	bli_mksymm( &e );
 
-	bli_printm( "e (after mksymm):", &e, "%4.1f", "" );
+	bli_printm( "e (after mksymm):", &e, "% 4.3f", "" );
 
 	// Initialize all of 'f' to -1.0 to simulate junk values.
 	bli_setm( &BLIS_MINUS_ONE, &f );
@@ -228,13 +228,13 @@ int main( int argc, char** argv )
 	// Randomize the upper triangle of 'f'.
 	bli_randm( &f );
 
-	bli_printm( "f (initial state):", &f, "%4.1f", "" );
+	bli_printm( "f (initial state):", &f, "% 4.3f", "" );
 
 	// mkherm on a complex matrix transposes and conjugates the stored
 	// triangle into the unstored triangle.
 	bli_mkherm( &f );
 
-	bli_printm( "f (after mkherm):", &f, "%4.1f", "" );
+	bli_printm( "f (after mkherm):", &f, "% 4.3f", "" );
 
 
 	//
@@ -257,14 +257,14 @@ int main( int argc, char** argv )
 	// Randomize the lower triangle of 'g'.
 	bli_randm( &g );
 
-	bli_printm( "g (initial state):", &g, "%4.1f", "" );
+	bli_printm( "g (initial state):", &g, "% 4.3f", "" );
 
 	// mktrim does not explicitly copy any data, since presumably the stored
 	// triangle already contains the data of interest. However, mktrim does
 	// explicitly writes zeros to the unstored region.
 	bli_mktrim( &g );
 
-	bli_printm( "g (after mktrim):", &g, "%4.1f", "" );
+	bli_printm( "g (after mktrim):", &g, "% 4.3f", "" );
 
 
 	// Free the objects.
diff --git a/examples/oapi/11gemm_md.c b/examples/oapi/11gemm_md.c
index 8ae40c1f4..feaf308df 100644
--- a/examples/oapi/11gemm_md.c
+++ b/examples/oapi/11gemm_md.c
@@ -80,15 +80,15 @@ int main( int argc, char** argv )
 	bli_randm( &b );
 	bli_setm( &BLIS_ZERO, &c );
 
-	bli_printm( "a (double real):    randomized", &a, "%4.1f", "" );
-	bli_printm( "b (double complex): randomized", &b, "%4.1f", "" );
-	bli_printm( "c (double complex): initial value", &c, "%4.1f", "" );
+	bli_printm( "a (double real):    randomized", &a, "% 4.3f", "" );
+	bli_printm( "b (double complex): randomized", &b, "% 4.3f", "" );
+	bli_printm( "c (double complex): initial value", &c, "% 4.3f", "" );
 
 	// c := beta * c + alpha * a * b, where 'a' is real, and 'b' and 'c' are
 	// complex.
 	bli_gemm( alpha, &a, &b, beta, &c );
 
-	bli_printm( "c (double complex): after gemm", &c, "%4.1f", "" );
+	bli_printm( "c (double complex): after gemm", &c, "% 4.3f", "" );
 
 	// Free the objects.
 	bli_obj_free( &a );
@@ -129,16 +129,16 @@ int main( int argc, char** argv )
 	bli_randm( &b );
 	bli_setm( &BLIS_ZERO, &c );
 
-	bli_printm( "a (single real): randomized", &a, "%4.1f", "" );
-	bli_printm( "b (single real): randomized", &b, "%4.1f", "" );
-	bli_printm( "c (double real): initial value", &c, "%4.1f", "" );
+	bli_printm( "a (single real): randomized", &a, "% 4.3f", "" );
+	bli_printm( "b (single real): randomized", &b, "% 4.3f", "" );
+	bli_printm( "c (double real): initial value", &c, "% 4.3f", "" );
 
 	// c := beta * c + alpha * a * b, where 'a' and 'b' are single-precision
 	// real, 'c' is double-precision real, and the matrix product is performed
 	// in double-precision arithmetic.
 	bli_gemm( alpha, &a, &b, beta, &c );
 
-	bli_printm( "c (double real): after gemm (exec prec = double precision)", &c, "%4.1f", "" );
+	bli_printm( "c (double real): after gemm (exec prec = double precision)", &c, "% 4.3f", "" );
 
 	// Free the objects.
 	bli_obj_free( &a );
@@ -172,16 +172,16 @@ int main( int argc, char** argv )
 	bli_randm( &b );
 	bli_setm( &BLIS_ZERO, &c );
 
-	bli_printm( "a (single real): randomized", &a, "%4.1f", "" );
-	bli_printm( "b (double complex): randomized", &b, "%4.1f", "" );
-	bli_printm( "c (single complex): initial value", &c, "%4.1f", "" );
+	bli_printm( "a (single real): randomized", &a, "% 4.3f", "" );
+	bli_printm( "b (double complex): randomized", &b, "% 4.3f", "" );
+	bli_printm( "c (single complex): initial value", &c, "% 4.3f", "" );
 
 	// c := beta * c + alpha * a * b, where 'a' is single-precision real, 'b'
 	// is double-precision complex, 'c' is single-precision complex, and the
 	// matrix product is performed in single-precision arithmetic.
 	bli_gemm( alpha, &a, &b, beta, &c );
 
-	bli_printm( "c (single complex): after gemm (exec prec = single precision)", &c, "%4.1f", "" );
+	bli_printm( "c (single complex): after gemm (exec prec = single precision)", &c, "% 4.3f", "" );
 
 	// Free the objects.
 	bli_obj_free( &a );
@@ -204,12 +204,12 @@ int main( int argc, char** argv )
 	// Initialize a real matrix A.
 	bli_randm( &a );
 
-	bli_printm( "a (double real): randomized", &a, "%4.1f", "" );
+	bli_printm( "a (double real): randomized", &a, "% 4.3f", "" );
 
 	// Project real matrix A to the complex domain (in B).
 	bli_projm( &a, &b );
 
-	bli_printm( "b (double complex): projected from 'a'", &b, "%4.1f", "" );
+	bli_printm( "b (double complex): projected from 'a'", &b, "% 4.3f", "" );
 
 	// Notice how the imaginary components in B are zero since any real
 	// matrix implicitly has imaginary values that are equal to zero.
@@ -219,12 +219,12 @@ int main( int argc, char** argv )
 	// Initialize the complex matrix B.
 	bli_randm( &b );
 
-	bli_printm( "b (double complex): randomized", &b, "%4.1f", "" );
+	bli_printm( "b (double complex): randomized", &b, "% 4.3f", "" );
 
 	// Project complex matrix B to the real domain (in A).
 	bli_projm( &b, &a );
 
-	bli_printm( "a (double real): projected from 'b'", &a, "%4.1f", "" );
+	bli_printm( "a (double real): projected from 'b'", &a, "% 4.3f", "" );
 
 	// Notice how the imaginary components are lost in the projection from
 	// the complex domain to the real domain.
diff --git a/examples/tapi/00level1v.c b/examples/tapi/00level1v.c
index e27450714..4100ade17 100644
--- a/examples/tapi/00level1v.c
+++ b/examples/tapi/00level1v.c
@@ -82,9 +82,9 @@ int main( int argc, char** argv )
 	beta  = 0.2;
 	gamma = 3.0;
 
-	printf( "alpha:\n%4.1f\n\n", alpha );
-	printf( "beta:\n%4.1f\n\n", beta );
-	printf( "gamma:\n%4.1f\n\n", gamma );
+	printf( "alpha:\n% 4.3f\n\n", alpha );
+	printf( "beta:\n% 4.3f\n\n", beta );
+	printf( "gamma:\n% 4.3f\n\n", gamma );
 	printf( "\n" );
 
 	bli_dsetv( BLIS_NO_CONJUGATE, n, &one, x, 1 );
@@ -96,9 +96,9 @@ int main( int argc, char** argv )
 	// orientation of the vector (row or column) when printing, whereas
 	// printv always prints vectors as column vectors regardless of their
 	// they are 1 x n or n x 1.
-	bli_dprintm( "x := 1.0", m, n, x, rs, cs, "%4.1f", "" );
-	bli_dprintm( "y := alpha", m, n, y, rs, cs, "%4.1f", "" );
-	bli_dprintm( "z := 0.0", m, n, z, rs, cs, "%4.1f", "" );
+	bli_dprintm( "x := 1.0", m, n, x, rs, cs, "% 4.3f", "" );
+	bli_dprintm( "y := alpha", m, n, y, rs, cs, "% 4.3f", "" );
+	bli_dprintm( "z := 0.0", m, n, z, rs, cs, "% 4.3f", "" );
 
 
 	//
@@ -110,7 +110,7 @@ int main( int argc, char** argv )
 	// Set a vector to random values.
 	bli_drandv( n, w, 1 );
 
-	bli_dprintm( "x := randv()", m, n, w, rs, cs, "%4.1f", "" );
+	bli_dprintm( "x := randv()", m, n, w, rs, cs, "% 4.3f", "" );
 
 
 	//
@@ -121,38 +121,38 @@ int main( int argc, char** argv )
 
 	// Copy a vector.
 	bli_dcopyv( BLIS_NO_CONJUGATE, n, w, 1, a, 1 );
-	bli_dprintm( "a := w", m, n, a, rs, cs, "%4.1f", "" );
+	bli_dprintm( "a := w", m, n, a, rs, cs, "% 4.3f", "" );
 
 	// Add and subtract vectors.
 	bli_daddv( BLIS_NO_CONJUGATE, n, y, 1, a, 1 );
-	bli_dprintm( "a := a + y", m, n, a, rs, cs, "%4.1f", "" );
+	bli_dprintm( "a := a + y", m, n, a, rs, cs, "% 4.3f", "" );
 
 	bli_dsubv( BLIS_NO_CONJUGATE, n, w, 1, a, 1 );
-	bli_dprintm( "a := a + w", m, n, a, rs, cs, "%4.1f", "" );
+	bli_dprintm( "a := a + w", m, n, a, rs, cs, "% 4.3f", "" );
 
 	// Scale a vector (destructive).
 	bli_dscalv( BLIS_NO_CONJUGATE, n, &beta, a, 1 );
-	bli_dprintm( "a := beta * a", m, n, a, rs, cs, "%4.1f", "" );
+	bli_dprintm( "a := beta * a", m, n, a, rs, cs, "% 4.3f", "" );
 
 	// Scale a vector (non-destructive).
 	bli_dscal2v( BLIS_NO_CONJUGATE, n, &gamma, a, 1, z, 1 );
-	bli_dprintm( "z := gamma * a", m, n, z, rs, cs, "%4.1f", "" );
+	bli_dprintm( "z := gamma * a", m, n, z, rs, cs, "% 4.3f", "" );
 
 	// Scale and accumulate between vectors.
 	bli_daxpyv( BLIS_NO_CONJUGATE, n, &alpha, w, 1, x, 1 );
-	bli_dprintm( "x := x + alpha * w", m, n, x, rs, cs, "%4.1f", "" );
+	bli_dprintm( "x := x + alpha * w", m, n, x, rs, cs, "% 4.3f", "" );
 
 	bli_dxpbyv( BLIS_NO_CONJUGATE, n, w, 1, &minus_one, x, 1 );
-	bli_dprintm( "x := -1.0 * x + w", m, n, x, rs, cs, "%4.1f", "" );
+	bli_dprintm( "x := -1.0 * x + w", m, n, x, rs, cs, "% 4.3f", "" );
 
 	// Invert a vector element-wise.
 	bli_dinvertv( n, y, 1 );
-	bli_dprintm( "y := 1 / y", m, n, y, rs, cs, "%4.1f", "" );
+	bli_dprintm( "y := 1 / y", m, n, y, rs, cs, "% 4.3f", "" );
 
 	// Swap two vectors.
 	bli_dswapv( n, x, 1, y, 1 );
-	bli_dprintm( "x (after swapping with y)", m, n, x, rs, cs, "%4.1f", "" );
-	bli_dprintm( "y (after swapping with x)", m, n, y, rs, cs, "%4.1f", "" );
+	bli_dprintm( "x (after swapping with y)", m, n, x, rs, cs, "% 4.3f", "" );
+	bli_dprintm( "y (after swapping with x)", m, n, y, rs, cs, "% 4.3f", "" );
 
 
 	//
diff --git a/examples/tapi/01level1m.c b/examples/tapi/01level1m.c
index d3a5d8287..38b17e270 100644
--- a/examples/tapi/01level1m.c
+++ b/examples/tapi/01level1m.c
@@ -83,9 +83,9 @@ int main( int argc, char** argv )
 	beta  = 0.2;
 	gamma = 3.0;
 
-	printf( "alpha:\n%4.1f\n\n", alpha );
-	printf( "beta:\n%4.1f\n\n", beta );
-	printf( "gamma:\n%4.1f\n\n", gamma );
+	printf( "alpha:\n% 4.3f\n\n", alpha );
+	printf( "beta:\n% 4.3f\n\n", beta );
+	printf( "gamma:\n% 4.3f\n\n", gamma );
 	printf( "\n" );
 
 	// Matrices, like vectors, can set by "broadcasting" a constant to every
@@ -99,9 +99,9 @@ int main( int argc, char** argv )
 	bli_dsetm( BLIS_NO_CONJUGATE, 0, BLIS_NONUNIT_DIAG, BLIS_DENSE,
 	           m, n, &zero, c, rs, cs );
 
-	bli_dprintm( "a := 1.0", m, n, a, rs, cs, "%4.1f", "" );
-	bli_dprintm( "b := alpha", m, n, b, rs, cs, "%4.1f", "" );
-	bli_dprintm( "c := 0.0", m, n, c, rs, cs, "%4.1f", "" );
+	bli_dprintm( "a := 1.0", m, n, a, rs, cs, "% 4.3f", "" );
+	bli_dprintm( "b := alpha", m, n, b, rs, cs, "% 4.3f", "" );
+	bli_dprintm( "c := 0.0", m, n, c, rs, cs, "% 4.3f", "" );
 
 
 	//
@@ -112,7 +112,7 @@ int main( int argc, char** argv )
 
 	bli_drandm( 0, BLIS_DENSE, m, n, e, rs, cs );
 
-	bli_dprintm( "e (randomized):", m, n, e, rs, cs, "%4.1f", "" );
+	bli_dprintm( "e (randomized):", m, n, e, rs, cs, "% 4.3f", "" );
 
 
 	//
@@ -124,31 +124,31 @@ int main( int argc, char** argv )
 	// Copy a matrix.
 	bli_dcopym( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, BLIS_NO_TRANSPOSE,
 	            m, n, e, rs, cs, d, rs, cs );
-	bli_dprintm( "d := e", m, n, d, rs, cs, "%4.1f", "" );
+	bli_dprintm( "d := e", m, n, d, rs, cs, "% 4.3f", "" );
 
 	// Add and subtract vectors.
 	bli_daddm( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, BLIS_NO_TRANSPOSE,
 	           m, n, a, rs, cs, d, rs, cs );
-	bli_dprintm( "d := d + a", m, n, d, rs, cs, "%4.1f", "" );
+	bli_dprintm( "d := d + a", m, n, d, rs, cs, "% 4.3f", "" );
 
 	bli_dsubm( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, BLIS_NO_TRANSPOSE,
 	           m, n, a, rs, cs, e, rs, cs );
-	bli_dprintm( "e := e - a", m, n, e, rs, cs, "%4.1f", "" );
+	bli_dprintm( "e := e - a", m, n, e, rs, cs, "% 4.3f", "" );
 
 	// Scale a matrix (destructive).
 	bli_dscalm( BLIS_NO_CONJUGATE, 0, BLIS_NONUNIT_DIAG, BLIS_DENSE,
 	            m, n, &alpha, e, rs, cs );
-	bli_dprintm( "e := alpha * e", m, n, e, rs, cs, "%4.1f", "" );
+	bli_dprintm( "e := alpha * e", m, n, e, rs, cs, "% 4.3f", "" );
 
 	// Scale a matrix (non-destructive).
 	bli_dscal2m( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, BLIS_NO_TRANSPOSE,
 	             m, n, &beta, e, rs, cs, c, rs, cs );
-	bli_dprintm( "c := beta * e", m, n, c, rs, cs, "%4.1f", "" );
+	bli_dprintm( "c := beta * e", m, n, c, rs, cs, "% 4.3f", "" );
 
 	// Scale and accumulate between matrices.
 	bli_daxpym( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, BLIS_NO_TRANSPOSE,
 	            m, n, &alpha, a, rs, cs, c, rs, cs );
-	bli_dprintm( "c := alpha * a", m, n, c, rs, cs, "%4.1f", "" );
+	bli_dprintm( "c := alpha * a", m, n, c, rs, cs, "% 4.3f", "" );
 
 
 	//
@@ -166,8 +166,8 @@ int main( int argc, char** argv )
 	bli_dsetm( BLIS_NO_CONJUGATE, 0, BLIS_NONUNIT_DIAG, BLIS_DENSE,
 	           n, m, &minus_one, f, rsf, csf );
 
-	bli_dprintm( "e:", m, n, e, rs, cs, "%4.1f", "" );
-	bli_dprintm( "f (initial value):", n, m, f, rsf, csf, "%4.1f", "" );
+	bli_dprintm( "e:", m, n, e, rs, cs, "% 4.3f", "" );
+	bli_dprintm( "f (initial value):", n, m, f, rsf, csf, "% 4.3f", "" );
 
 
 	// Copy 'e' to 'f', transposing 'e' in the process. Notice that we haven't
@@ -176,7 +176,7 @@ int main( int argc, char** argv )
 	bli_dcopym( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, BLIS_TRANSPOSE,
 	            n, m, e, rs, cs, f, rsf, csf );
 
-	bli_dprintm( "f (copied value):", n, m, f, rsf, csf, "%4.1f", "" );
+	bli_dprintm( "f (copied value):", n, m, f, rsf, csf, "% 4.3f", "" );
 
 
 	//
@@ -193,13 +193,13 @@ int main( int argc, char** argv )
 	bli_zsetm( BLIS_NO_CONJUGATE, 0, BLIS_NONUNIT_DIAG, BLIS_DENSE,
 	           n, m, &minus_one_z, h, rsf, csf );
 
-	bli_zprintm( "g:", m, n, g, rs, cs, "%4.1f", "" );
-	bli_zprintm( "h (initial value):", n, m, h, rsf, csf, "%4.1f", "" );
+	bli_zprintm( "g:", m, n, g, rs, cs, "% 4.3f", "" );
+	bli_zprintm( "h (initial value):", n, m, h, rsf, csf, "% 4.3f", "" );
 
 	bli_zcopym( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, BLIS_CONJ_TRANSPOSE,
 	            n, m, g, rs, cs, h, rsf, csf );
 
-	bli_zprintm( "h (copied value):", n, m, h, rsf, csf, "%4.1f", "" );
+	bli_zprintm( "h (copied value):", n, m, h, rsf, csf, "% 4.3f", "" );
 
 
 	// Free the memory obtained via malloc().
diff --git a/examples/tapi/02level1m_diag.c b/examples/tapi/02level1m_diag.c
index 0b2009863..a888e10e6 100644
--- a/examples/tapi/02level1m_diag.c
+++ b/examples/tapi/02level1m_diag.c
@@ -70,7 +70,7 @@ int main( int argc, char** argv )
 	bli_drandm( 0, BLIS_UPPER, m, n, a, rs, cs );
 
 	bli_dprintm( "a: randomize upper part (lower part may contain garbage)",
-	             m, n, a, rs, cs, "%4.1f", "" );
+	             m, n, a, rs, cs, "% 4.3f", "" );
 
 
 	//
@@ -93,7 +93,7 @@ int main( int argc, char** argv )
 	           m, n, &zero, b, rs, cs );
 
 	bli_dprintm( "b: randomize upper part; set strictly lower part to 0.0)",
-	             m, n, b, rs, cs, "%4.1f", "" );
+	             m, n, b, rs, cs, "% 4.3f", "" );
 
 	// You may not see the effect of setting the strictly lower part to zero,
 	// since those values may already be zero (instead of random junk). So
@@ -102,7 +102,7 @@ int main( int argc, char** argv )
 	           m, n, &minus_one, b, rs, cs );
 
 	bli_dprintm( "b: randomize upper part; set strictly lower part to -1.0)",
-	             m, n, b, rs, cs, "%4.1f", "" );
+	             m, n, b, rs, cs, "% 4.3f", "" );
 
 
 	//
@@ -120,13 +120,13 @@ int main( int argc, char** argv )
 	            m, n, b, rs, cs, c, rs, cs );
 
 	bli_dprintm( "c: copy lower part of b (upper part may contain garbage)",
-	             m, n, c, rs, cs, "%4.1f", "" );
+	             m, n, c, rs, cs, "% 4.3f", "" );
 
 	bli_dcopym( 0, BLIS_NONUNIT_DIAG, BLIS_LOWER, BLIS_NO_TRANSPOSE,
 	            m, n, b, rs, cs, a, rs, cs );
 
 	bli_dprintm( "a: copy lower triangle of b to upper triangular a",
-	             m, n, a, rs, cs, "%4.1f", "" );
+	             m, n, a, rs, cs, "% 4.3f", "" );
 
 
 	//
@@ -145,7 +145,7 @@ int main( int argc, char** argv )
 	           m, n, &zero, d, rs, cs );
 
 	bli_dprintm( "d: initial value (all zeros)",
-	             m, n, d, rs, cs, "%4.1f", "" );
+	             m, n, d, rs, cs, "% 4.3f", "" );
 
 	// Let's change a few values of b manually so we can later see the full
 	// effect of the transposition.
@@ -156,13 +156,13 @@ int main( int argc, char** argv )
 	bli_dsetijm( 3.2, 0.0, 3, 2, b, rs, cs );
 
 	bli_dprintm( "b:",
-	             m, n, b, rs, cs, "%4.1f", "" );
+	             m, n, b, rs, cs, "% 4.3f", "" );
 
 	bli_dcopym( 0, BLIS_NONUNIT_DIAG, BLIS_LOWER, BLIS_TRANSPOSE,
 	            m, n, b, rs, cs, d, rs, cs );
 
 	bli_dprintm( "d: transpose of lower triangle of b copied to d",
-	             m, n, d, rs, cs, "%4.1f", "" );
+	             m, n, d, rs, cs, "% 4.3f", "" );
 
 
 	//
@@ -182,20 +182,20 @@ int main( int argc, char** argv )
 	           m, n, &minus_one, e, rs, cs );
 
 	bli_dprintm( "e: initial value (all -1.0)",
-	             m, n, e, rs, cs, "%4.1f", "" );
+	             m, n, e, rs, cs, "% 4.3f", "" );
 
 	// Randomize the lower trapezoid.
 	bli_drandm( 0, BLIS_LOWER, m, n, e, rs, cs );
 
 	bli_dprintm( "e: after lower trapezoid randomized",
-	             m, n, e, rs, cs, "%4.1f", "" );
+	             m, n, e, rs, cs, "% 4.3f", "" );
 
 	// Set the upper triangle to zero.
 	bli_dsetm( BLIS_NO_CONJUGATE, 1, BLIS_NONUNIT_DIAG, BLIS_UPPER,
 	           m, n, &zero, e, rs, cs );
 
 	bli_dprintm( "e: after upper triangle set to zero",
-	             m, n, e, rs, cs, "%4.1f", "" );
+	             m, n, e, rs, cs, "% 4.3f", "" );
 
 
 	//
@@ -214,13 +214,13 @@ int main( int argc, char** argv )
 	           m, n, &minus_one, h, rs, cs );
 
 	bli_dprintm( "h: initial value (all -1.0)",
-	             m, n, h, rs, cs, "%4.1f", "" );
+	             m, n, h, rs, cs, "% 4.3f", "" );
 
 	// Randomize the elements on and above the first subdiagonal.
 	bli_drandm( -1, BLIS_UPPER, m, n, h, rs, cs );
 
 	bli_dprintm( "h: after randomizing above first subdiagonal",
-	             m, n, h, rs, cs, "%4.1f", "" );
+	             m, n, h, rs, cs, "% 4.3f", "" );
 
 	// Set the region strictly below the first subdiagonal (on or below
 	// the second subdiagonal) to zero.
@@ -228,7 +228,7 @@ int main( int argc, char** argv )
 	           m, n, &zero, h, rs, cs );
 
 	bli_dprintm( "h: after setting elements below first subdiagonal to zero",
-	             m, n, h, rs, cs, "%4.1f", "" );
+	             m, n, h, rs, cs, "% 4.3f", "" );
 
 
 	// Free the memory obtained via malloc().
diff --git a/examples/tapi/03level2.c b/examples/tapi/03level2.c
index 8e8139ec5..914fcbaf0 100644
--- a/examples/tapi/03level2.c
+++ b/examples/tapi/03level2.c
@@ -80,15 +80,15 @@ int main( int argc, char** argv )
 	bli_dsetm( BLIS_NO_CONJUGATE, 0, BLIS_NONUNIT_DIAG, BLIS_DENSE,
 	           m, n, &one, a, rs, cs );
 
-	bli_dprintm( "x: set to random values", m, 1, x, 1, m, "%4.1f", "" );
-	bli_dprintm( "y: set to -1.0", 1, n, y, n, 1, "%4.1f", "" );
-	bli_dprintm( "a: intial value", m, n, a, rs, cs, "%4.1f", "" );
+	bli_dprintm( "x: set to random values", m, 1, x, 1, m, "% 4.3f", "" );
+	bli_dprintm( "y: set to -1.0", 1, n, y, n, 1, "% 4.3f", "" );
+	bli_dprintm( "a: intial value", m, n, a, rs, cs, "% 4.3f", "" );
 
 	// a := a + alpha * x * y, where 'a' is general.
 	bli_dger( BLIS_NO_CONJUGATE, BLIS_NO_CONJUGATE,
 	          m, n, &alpha, x, 1, y, 1, a, rs, cs );
 
-	bli_dprintm( "a: after ger", m, n, a, rs, cs, "%4.1f", "" );
+	bli_dprintm( "a: after ger", m, n, a, rs, cs, "% 4.3f", "" );
 
 	// Free the memory obtained via malloc().
 	free( a );
@@ -119,15 +119,15 @@ int main( int argc, char** argv )
 	// Randomize 'a'.
 	bli_drandm( 0, BLIS_DENSE, m, n, a, rs, cs );
 
-	bli_dprintm( "a: randomized", m, n, a, rs, cs, "%4.1f", "" );
-	bli_dprintm( "x: set to 1.0", 1, n, x, n, 1, "%4.1f", "" );
-	bli_dprintm( "y: intial value", 1, m, y, m, 1, "%4.1f", "" );
+	bli_dprintm( "a: randomized", m, n, a, rs, cs, "% 4.3f", "" );
+	bli_dprintm( "x: set to 1.0", 1, n, x, n, 1, "% 4.3f", "" );
+	bli_dprintm( "y: intial value", 1, m, y, m, 1, "% 4.3f", "" );
 
 	// y := beta * y + alpha * a * x, where 'a' is general.
 	bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE,
 	           m, n, &alpha, a, rs, cs, x, 1, &beta, y, 1 );
 
-	bli_dprintm( "y: after gemv", 1, m, y, m, 1, "%4.1f", "" );
+	bli_dprintm( "y: after gemv", 1, m, y, m, 1, "% 4.3f", "" );
 
 	// Free the memory obtained via malloc().
 	free( a );
@@ -160,13 +160,13 @@ int main( int argc, char** argv )
 	// Randomize the lower triangle of 'a'.
 	bli_drandm( 0, BLIS_LOWER, m, m, a, rs, cs );
 
-	bli_dprintm( "x: set to random values", 1, m, x, m, 1, "%4.1f", "" );
-	bli_dprintm( "a: initial value (zeros in upper triangle)", m, m, a, 1, m, "%4.1f", "" );
+	bli_dprintm( "x: set to random values", 1, m, x, m, 1, "% 4.3f", "" );
+	bli_dprintm( "a: initial value (zeros in upper triangle)", m, m, a, 1, m, "% 4.3f", "" );
 
 	// a := a + alpha * x * x^T, where 'a' is symmetric and lower-stored.
 	bli_dsyr( BLIS_LOWER, BLIS_NO_CONJUGATE, m, &alpha, x, 1, a, rs, cs );
 
-	bli_dprintm( "a: after syr", m, m, a, 1, m, "%4.1f", "" );
+	bli_dprintm( "a: after syr", m, m, a, 1, m, "% 4.3f", "" );
 
 	// Free the memory obtained via malloc().
 	free( a );
@@ -201,15 +201,15 @@ int main( int argc, char** argv )
 	// Randomize 'a'.
 	bli_drandm( 0, BLIS_UPPER, m, m, a, rs, cs );
 
-	bli_dprintm( "a: randomized (zeros in lower triangle)", m, m, a, rs, cs, "%4.1f", "" );
-	bli_dprintm( "x: set to 1.0", 1, m, x, m, 1, "%4.1f", "" );
-	bli_dprintm( "y: intial value", 1, m, y, m, 1, "%4.1f", "" );
+	bli_dprintm( "a: randomized (zeros in lower triangle)", m, m, a, rs, cs, "% 4.3f", "" );
+	bli_dprintm( "x: set to 1.0", 1, m, x, m, 1, "% 4.3f", "" );
+	bli_dprintm( "y: intial value", 1, m, y, m, 1, "% 4.3f", "" );
 
 	// y := beta * y + alpha * a * x, where 'a' is symmetric and upper-stored.
 	bli_dsymv( BLIS_UPPER, BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE,
 	           m, &alpha, a, rs, cs, x, 1, &beta, y, 1 );
 
-	bli_dprintm( "y: after symv", 1, m, y, m, 1, "%4.1f", "" );
+	bli_dprintm( "y: after symv", 1, m, y, m, 1, "% 4.3f", "" );
 
 	// Free the memory obtained via malloc().
 	free( a );
@@ -242,14 +242,14 @@ int main( int argc, char** argv )
 	// Randomize 'a'.
 	bli_drandm( 0, BLIS_LOWER, m, m, a, rs, cs );
 
-	bli_dprintm( "a: randomized (zeros in upper triangle)", m, m, a, rs, cs, "%4.1f", "" );
-	bli_dprintm( "x: intial value", 1, m, x, m, 1, "%4.1f", "" );
+	bli_dprintm( "a: randomized (zeros in upper triangle)", m, m, a, rs, cs, "% 4.3f", "" );
+	bli_dprintm( "x: intial value", 1, m, x, m, 1, "% 4.3f", "" );
 
 	// x := alpha * a * x, where 'a' is triangular and lower-stored.
 	bli_dtrmv( BLIS_LOWER, BLIS_NO_TRANSPOSE, BLIS_NONUNIT_DIAG,
 	           m, &alpha, a, rs, cs, x, 1 );
 
-	bli_dprintm( "x: after trmv", 1, m, x, m, 1, "%4.1f", "" );
+	bli_dprintm( "x: after trmv", 1, m, x, m, 1, "% 4.3f", "" );
 
 	// Free the memory obtained via malloc().
 	free( a );
@@ -287,14 +287,14 @@ int main( int argc, char** argv )
 	// that the matrix is not singular (singular matrices have no inverse).
 	bli_dshiftd( 0, m, m, &two, a, rs, cs );
 
-	bli_dprintm( "a: randomized (zeros in upper triangle)", m, m, a, rs, cs, "%4.1f", "" );
-	bli_dprintm( "b: intial value", 1, m, b, m, 1, "%4.1f", "" );
+	bli_dprintm( "a: randomized (zeros in upper triangle)", m, m, a, rs, cs, "% 4.3f", "" );
+	bli_dprintm( "b: intial value", 1, m, b, m, 1, "% 4.3f", "" );
 
 	// x := alpha * a * x, where 'a' is triangular and lower-stored.
 	bli_dtrsv( BLIS_LOWER, BLIS_NO_TRANSPOSE, BLIS_NONUNIT_DIAG,
 	           m, &alpha, a, rs, cs, x, 1 );
 
-	bli_dprintm( "b: after trsv", 1, m, b, m, 1, "%4.1f", "" );
+	bli_dprintm( "b: after trsv", 1, m, b, m, 1, "% 4.3f", "" );
 
 	// We can confirm the solution by comparing the product of a and x to the
 	// original value of b.
@@ -302,7 +302,7 @@ int main( int argc, char** argv )
 	bli_dtrmv( BLIS_LOWER, BLIS_NO_TRANSPOSE, BLIS_NONUNIT_DIAG,
 	           m, &alpha, a, rs, cs, y, 1 );
 
-	bli_dprintm( "y: should equal initial value of b", 1, m, y, m, 1, "%4.1f", "" );
+	bli_dprintm( "y: should equal initial value of b", 1, m, y, m, 1, "% 4.3f", "" );
 
 	// Free the memory obtained via malloc().
 	free( a );
diff --git a/examples/tapi/04level3.c b/examples/tapi/04level3.c
index 066c866a6..10dd9ae2f 100644
--- a/examples/tapi/04level3.c
+++ b/examples/tapi/04level3.c
@@ -84,16 +84,16 @@ int main( int argc, char** argv )
 	bli_dsetm( BLIS_NO_CONJUGATE, 0, BLIS_NONUNIT_DIAG, BLIS_DENSE,
                m, n, &zero, c, rsc, csc );
 
-	bli_dprintm( "a: randomized", m, k, a, rsa, csa, "%4.1f", "" );
-	bli_dprintm( "b: set to 1.0", k, n, b, rsb, csb, "%4.1f", "" );
-	bli_dprintm( "c: initial value", m, n, c, rsc, csc, "%4.1f", "" );
+	bli_dprintm( "a: randomized", m, k, a, rsa, csa, "% 4.3f", "" );
+	bli_dprintm( "b: set to 1.0", k, n, b, rsb, csb, "% 4.3f", "" );
+	bli_dprintm( "c: initial value", m, n, c, rsc, csc, "% 4.3f", "" );
 
 	// c := beta * c + alpha * a * b, where 'a', 'b', and 'c' are general.
 	bli_dgemm( BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE,
 	           m, n, k, &alpha, a, rsa, csa, b, rsb, csb,
 	                     &beta, c, rsc, csc );
 
-	bli_dprintm( "c: after gemm", m, n, c, rsc, csc, "%4.1f", "" );
+	bli_dprintm( "c: after gemm", m, n, c, rsc, csc, "% 4.3f", "" );
 
 	// Free the memory obtained via malloc().
 	free( a );
@@ -128,16 +128,16 @@ int main( int argc, char** argv )
 	bli_dsetm( BLIS_NO_CONJUGATE, 0, BLIS_NONUNIT_DIAG, BLIS_DENSE,
                m, n, &zero, c, rsc, csc );
 
-	bli_dprintm( "a: randomized", k, m, a, rsa, csa, "%4.1f", "" );
-	bli_dprintm( "b: set to 1.0", k, n, b, rsb, csb, "%4.1f", "" );
-	bli_dprintm( "c: initial value", m, n, c, rsc, csc, "%4.1f", "" );
+	bli_dprintm( "a: randomized", k, m, a, rsa, csa, "% 4.3f", "" );
+	bli_dprintm( "b: set to 1.0", k, n, b, rsb, csb, "% 4.3f", "" );
+	bli_dprintm( "c: initial value", m, n, c, rsc, csc, "% 4.3f", "" );
 
 	// c := beta * c + alpha * a^T * b, where 'a', 'b', and 'c' are general.
 	bli_dgemm( BLIS_TRANSPOSE, BLIS_NO_TRANSPOSE,
 	           m, n, k, &alpha, a, rsa, csa, b, rsb, csb,
 	                     &beta, c, rsc, csc );
 
-	bli_dprintm( "c: after gemm", m, n, c, rsc, csc, "%4.1f", "" );
+	bli_dprintm( "c: after gemm", m, n, c, rsc, csc, "% 4.3f", "" );
 
 	// Free the memory obtained via malloc().
 	free( a );
@@ -169,15 +169,15 @@ int main( int argc, char** argv )
 	// Randomize the lower triangle of 'c'.
 	bli_drandm( 0, BLIS_LOWER, m, n, c, rsc, csc );
 
-	bli_dprintm( "a: set to random values", m, k, a, rsa, csa, "%4.1f", "" );
-	bli_dprintm( "c: initial value (zeros in upper triangle)", m, m, c, rsc, csc, "%4.1f", "" );
+	bli_dprintm( "a: set to random values", m, k, a, rsa, csa, "% 4.3f", "" );
+	bli_dprintm( "c: initial value (zeros in upper triangle)", m, m, c, rsc, csc, "% 4.3f", "" );
 
 	// c := c + alpha * a * a^T, where 'c' is symmetric and lower-stored.
 	bli_dsyrk( BLIS_LOWER, BLIS_NO_TRANSPOSE,
 	           m, k, &alpha, a, rsa, csa,
 	                  &beta, c, rsc, csc );
 
-	bli_dprintm( "c: after syrk", m, m, c, rsc, csc, "%4.1f", "" );
+	bli_dprintm( "c: after syrk", m, m, c, rsc, csc, "% 4.3f", "" );
 
 	// Free the memory obtained via malloc().
 	free( a );
@@ -217,16 +217,16 @@ int main( int argc, char** argv )
 	// Randomize the upper triangle of 'a'.
 	bli_drandm( 0, BLIS_UPPER, m, m, a, rsa, csa );
 
-	bli_dprintm( "a: randomized (zeros in lower triangle)", m, m, a, rsa, csa, "%4.1f", "" );
-	bli_dprintm( "b: set to 1.0", m, n, b, rsb, csb, "%4.1f", "" );
-	bli_dprintm( "c: initial value", m, n, c, rsc, csc, "%4.1f", "" );
+	bli_dprintm( "a: randomized (zeros in lower triangle)", m, m, a, rsa, csa, "% 4.3f", "" );
+	bli_dprintm( "b: set to 1.0", m, n, b, rsb, csb, "% 4.3f", "" );
+	bli_dprintm( "c: initial value", m, n, c, rsc, csc, "% 4.3f", "" );
 
 	// c := beta * c + alpha * a * b, where 'a' is symmetric and upper-stored.
 	bli_dsymm( BLIS_LEFT, BLIS_UPPER, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE,
 	           m, n, &alpha, a, rsa, csa, b, rsb, csb,
 	                  &beta, c, rsc, csc );
 
-	bli_dprintm( "c: after symm", m, n, c, rsc, csc, "%4.1f", "" );
+	bli_dprintm( "c: after symm", m, n, c, rsc, csc, "% 4.3f", "" );
 
 	// Free the memory obtained via malloc().
 	free( a );
@@ -262,14 +262,14 @@ int main( int argc, char** argv )
 	// Randomize the lower triangle of 'a'.
 	bli_drandm( 0, BLIS_LOWER, m, m, a, rsa, csa );
 
-	bli_dprintm( "a: randomized (zeros in upper triangle)", m, m, a, rsa, csa, "%4.1f", "" );
-	bli_dprintm( "b: initial value", m, n, b, rsb, csb, "%4.1f", "" );
+	bli_dprintm( "a: randomized (zeros in upper triangle)", m, m, a, rsa, csa, "% 4.3f", "" );
+	bli_dprintm( "b: initial value", m, n, b, rsb, csb, "% 4.3f", "" );
 
 	// b := alpha * a * b, where 'a' is triangular and lower-stored.
 	bli_dtrmm( BLIS_LEFT, BLIS_LOWER, BLIS_NONUNIT_DIAG, BLIS_NO_TRANSPOSE,
 	           m, n, &alpha, a, rsa, csa, b, rsb, csb );
 
-	bli_dprintm( "b: after trmm", m, n, b, rsb, csb, "%4.1f", "" );
+	bli_dprintm( "b: after trmm", m, n, b, rsb, csb, "% 4.3f", "" );
 
 	// Free the memory obtained via malloc().
 	free( a );
@@ -312,15 +312,15 @@ int main( int argc, char** argv )
 	// that the matrix is not singular (singular matrices have no inverse).
 	bli_dshiftd( 0, m, m, &two, a, rsa, csa );
 
-	bli_dprintm( "a: randomized (zeros in upper triangle)", m, m, a, rsa, csa, "%4.1f", "" );
-	bli_dprintm( "b: initial value", m, n, b, rsb, csb, "%4.1f", "" );
+	bli_dprintm( "a: randomized (zeros in upper triangle)", m, m, a, rsa, csa, "% 4.3f", "" );
+	bli_dprintm( "b: initial value", m, n, b, rsb, csb, "% 4.3f", "" );
 
 	// solve a * x = alpha * b, where 'a' is triangular and lower-stored, and
 	// overwrite b with the solution matrix x.
 	bli_dtrsm( BLIS_LEFT, BLIS_LOWER, BLIS_NONUNIT_DIAG, BLIS_NO_TRANSPOSE,
 	           m, n, &alpha, a, rsa, csa, b, rsb, csb );
 
-	bli_dprintm( "b: after trmm", m, n, b, rsb, csb, "%4.1f", "" );
+	bli_dprintm( "b: after trmm", m, n, b, rsb, csb, "% 4.3f", "" );
 
 	// We can confirm the solution by comparing the product of a and x to the
 	// original value of b.
@@ -329,7 +329,7 @@ int main( int argc, char** argv )
 	bli_dtrmm( BLIS_LEFT, BLIS_LOWER, BLIS_NONUNIT_DIAG, BLIS_NO_TRANSPOSE,
 	           m, n, &alpha, a, rsa, csa, c, rsc, csc );
 
-	bli_dprintm( "c: should equal initial value of b", m, n, c, rsc, csc, "%4.1f", "" );
+	bli_dprintm( "c: should equal initial value of b", m, n, c, rsc, csc, "% 4.3f", "" );
 
 	// Free the memory obtained via malloc().
 	free( a );
diff --git a/examples/tapi/05util.c b/examples/tapi/05util.c
index 9dc103c79..8aadbb50f 100644
--- a/examples/tapi/05util.c
+++ b/examples/tapi/05util.c
@@ -76,7 +76,7 @@ int main( int argc, char** argv )
 	bli_drandv( n, x, 1 );
 	bli_zrandv( n, y, 1 );
 
-	bli_dprintm( "x", m, n, x, rs, cs, "%4.1f", "" );
+	bli_dprintm( "x", m, n, x, rs, cs, "% 4.3f", "" );
 
 	// Compute the one, infinity, and frobenius norms of 'x'. Note that when
 	// computing the norm alpha of a vector 'x', the datatype of alpha must be
@@ -85,11 +85,11 @@ int main( int argc, char** argv )
 	bli_dnormiv( n, x, 1, &normi );
 	bli_dnormfv( n, x, 1, &normf );
 
-	bli_dprintm( "x: 1-norm:", 1, 1, &norm1, rs, cs, "%4.1f", "" );
-	bli_dprintm( "x: infinity norm:", 1, 1, &normi, rs, cs, "%4.1f", "" );
-	bli_dprintm( "x: frobenius norm:", 1, 1, &normf, rs, cs, "%4.1f", "" );
+	bli_dprintm( "x: 1-norm:", 1, 1, &norm1, rs, cs, "% 4.3f", "" );
+	bli_dprintm( "x: infinity norm:", 1, 1, &normi, rs, cs, "% 4.3f", "" );
+	bli_dprintm( "x: frobenius norm:", 1, 1, &normf, rs, cs, "% 4.3f", "" );
 
-	bli_zprintm( "y", m, n, y, rs, cs, "%4.1f", "" );
+	bli_zprintm( "y", m, n, y, rs, cs, "% 4.3f", "" );
 
 	// Compute the one, infinity, and frobenius norms of 'y'. Note that we
 	// can reuse the same scalars from before for computing norms of
@@ -98,9 +98,9 @@ int main( int argc, char** argv )
 	bli_znormiv( n, y, 1, &normi );
 	bli_znormfv( n, y, 1, &normf );
 
-	bli_dprintm( "y: 1-norm:", 1, 1, &norm1, 1, 1, "%4.1f", "" );
-	bli_dprintm( "y: infinity norm:", 1, 1, &normi, 1, 1, "%4.1f", "" );
-	bli_dprintm( "y: frobenius norm:", 1, 1, &normf, 1, 1, "%4.1f", "" );
+	bli_dprintm( "y: 1-norm:", 1, 1, &norm1, 1, 1, "% 4.3f", "" );
+	bli_dprintm( "y: infinity norm:", 1, 1, &normi, 1, 1, "% 4.3f", "" );
+	bli_dprintm( "y: frobenius norm:", 1, 1, &normf, 1, 1, "% 4.3f", "" );
 
 
 	//
@@ -118,7 +118,7 @@ int main( int argc, char** argv )
 	bli_drandm( 0, BLIS_DENSE, m, n, a, rs, cs );
 	bli_zrandm( 0, BLIS_DENSE, m, n, b, rs, cs );
 
-	bli_dprintm( "a:", m, n, a, rs, cs, "%4.1f", "" );
+	bli_dprintm( "a:", m, n, a, rs, cs, "% 4.3f", "" );
 
 	// Compute the one-norm of 'a'.
 	bli_dnorm1m( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE,
@@ -128,11 +128,11 @@ int main( int argc, char** argv )
 	bli_dnormfm( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE,
 	             m, n, a, rs, cs, &normf );
 
-	bli_dprintm( "a: 1-norm:", 1, 1, &norm1, 1, 1, "%4.1f", "" );
-	bli_dprintm( "a: infinity norm:", 1, 1, &normi, 1, 1, "%4.1f", "" );
-	bli_dprintm( "a: frobenius norm:", 1, 1, &normf, 1, 1, "%4.1f", "" );
+	bli_dprintm( "a: 1-norm:", 1, 1, &norm1, 1, 1, "% 4.3f", "" );
+	bli_dprintm( "a: infinity norm:", 1, 1, &normi, 1, 1, "% 4.3f", "" );
+	bli_dprintm( "a: frobenius norm:", 1, 1, &normf, 1, 1, "% 4.3f", "" );
 
-	bli_zprintm( "b:", m, n, b, rs, cs, "%4.1f", "" );
+	bli_zprintm( "b:", m, n, b, rs, cs, "% 4.3f", "" );
 
 	// Compute the one-norm of 'b'.
 	bli_znorm1m( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE,
@@ -142,9 +142,9 @@ int main( int argc, char** argv )
 	bli_znormfm( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE,
 	             m, n, b, rs, cs, &normf );
 
-	bli_dprintm( "a: 1-norm:", 1, 1, &norm1, 1, 1, "%4.1f", "" );
-	bli_dprintm( "a: infinity norm:", 1, 1, &normi, 1, 1, "%4.1f", "" );
-	bli_dprintm( "a: frobenius norm:", 1, 1, &normf, 1, 1, "%4.1f", "" );
+	bli_dprintm( "a: 1-norm:", 1, 1, &norm1, 1, 1, "% 4.3f", "" );
+	bli_dprintm( "a: infinity norm:", 1, 1, &normi, 1, 1, "% 4.3f", "" );
+	bli_dprintm( "a: frobenius norm:", 1, 1, &normf, 1, 1, "% 4.3f", "" );
 
 
 	//
@@ -165,13 +165,13 @@ int main( int argc, char** argv )
 	// Randomize the lower triangle of 'c'.
 	bli_drandm( 0, BLIS_LOWER, m, m, c, rs, cs );
 
-	bli_dprintm( "c (initial state):", m, m, c, rs, cs, "%4.1f", "" );
+	bli_dprintm( "c (initial state):", m, m, c, rs, cs, "% 4.3f", "" );
 
 	// mksymm on a real matrix transposes the stored triangle into the
 	// unstored triangle, making the matrix densely symmetric.
 	bli_dmksymm( BLIS_LOWER, m, c, rs, cs );
 
-	bli_dprintm( "c (after mksymm on lower triangle):", m, m, c, rs, cs, "%4.1f", "" );
+	bli_dprintm( "c (after mksymm on lower triangle):", m, m, c, rs, cs, "% 4.3f", "" );
 
 	// Digression: Most people think only of complex matrices as being able
 	// to be complex. However, in BLIS, we define Hermitian operations on
@@ -186,13 +186,13 @@ int main( int argc, char** argv )
 	// Randomize the lower triangle of 'd'.
 	bli_drandm( 0, BLIS_LOWER, m, m, d, rs, cs );
 
-	bli_dprintm( "d (initial state):", m, m, d, rs, cs, "%4.1f", "" );
+	bli_dprintm( "d (initial state):", m, m, d, rs, cs, "% 4.3f", "" );
 
 	// mkherm on a real matrix behaves the same as mksymm, as there are no
 	// imaginary elements to conjugate.
 	bli_dmkherm( BLIS_LOWER, m, d, rs, cs );
 
-	bli_dprintm( "c (after mkherm on lower triangle):", m, m, d, rs, cs, "%4.1f", "" );
+	bli_dprintm( "c (after mkherm on lower triangle):", m, m, d, rs, cs, "% 4.3f", "" );
 
 
 	//
@@ -213,13 +213,13 @@ int main( int argc, char** argv )
 	// Randomize the upper triangle of 'e'.
 	bli_zrandm( 0, BLIS_UPPER, m, m, e, rs, cs );
 
-	bli_zprintm( "e (initial state):", m, m, e, rs, cs, "%4.1f", "" );
+	bli_zprintm( "e (initial state):", m, m, e, rs, cs, "% 4.3f", "" );
 
 	// mksymm on a complex matrix transposes the stored triangle into the
 	// unstored triangle.
 	bli_zmksymm( BLIS_UPPER, m, e, rs, cs );
 
-	bli_zprintm( "e (after mksymm on lower triangle):", m, m, e, rs, cs, "%4.1f", "" );
+	bli_zprintm( "e (after mksymm on lower triangle):", m, m, e, rs, cs, "% 4.3f", "" );
 
 	// Initialize all of 'f' to -1.0 to simulate junk values.
 	bli_zsetm( BLIS_NO_CONJUGATE, 0, BLIS_NONUNIT_DIAG, BLIS_DENSE,
@@ -228,13 +228,13 @@ int main( int argc, char** argv )
 	// Randomize the upper triangle of 'd'.
 	bli_zrandm( 0, BLIS_UPPER, m, m, f, rs, cs );
 
-	bli_zprintm( "f (initial state):", m, m, f, rs, cs, "%4.1f", "" );
+	bli_zprintm( "f (initial state):", m, m, f, rs, cs, "% 4.3f", "" );
 
 	// mkherm on a real matrix behaves the same as mksymm, as there are no
 	// imaginary elements to conjugate.
 	bli_zmkherm( BLIS_UPPER, m, f, rs, cs );
 
-	bli_zprintm( "f (after mkherm on lower triangle):", m, m, f, rs, cs, "%4.1f", "" );
+	bli_zprintm( "f (after mkherm on lower triangle):", m, m, f, rs, cs, "% 4.3f", "" );
 
 
 	//
@@ -254,14 +254,14 @@ int main( int argc, char** argv )
 	// Randomize the lower triangle of 'g'.
 	bli_drandm( 0, BLIS_LOWER, m, m, g, rs, cs );
 
-	bli_dprintm( "g (initial state):", m, m, g, rs, cs, "%4.1f", "" );
+	bli_dprintm( "g (initial state):", m, m, g, rs, cs, "% 4.3f", "" );
 
 	// mktrim does not explicitly copy any data, since presumably the stored
 	// triangle already contains the data of interest. However, mktrim does
 	// explicitly writes zeros to the unstored region.
 	bli_dmktrim( BLIS_LOWER, m, g, rs, cs );
 
-	bli_dprintm( "g (after mktrim):", m, m, g, rs, cs, "%4.1f", "" );
+	bli_dprintm( "g (after mktrim):", m, m, g, rs, cs, "% 4.3f", "" );
 
 
 	// Free the memory obtained via malloc().

From ec5b57289feaea755ff2eb4ab39511f3dd5879d6 Mon Sep 17 00:00:00 2001
From: Atsushi Tatsuma <yoshoku@outlook.com>
Date: Fri, 2 May 2025 00:09:31 +0900
Subject: [PATCH 229/230] Fix to prevent is_win flag setting with clang on
 macOS (#867)

Details:
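- In some cases, macOS was improperly detected as Windows because clang's builtin preprocessor definitions include target-conditional macros (e.g. `#define TARGET_OS_WIN32 0`, alongside `#define TARGET_OS_WINDOWS 0`) whose names contain the substring `_WIN32`, which the previous unanchored `grep -q _WIN32` matched.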
- Update the detection to look specifically for a `#define _WIN32` line, which detects Windows more robustly.
---
 configure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure b/configure
index edbca9ffc..a22054e75 100755
--- a/configure
+++ b/configure
@@ -2739,7 +2739,7 @@ check_build_tools()
 	                  "C compiler" "yes" found_cc
 
 	# Also check the compiler to see if we are (cross-)compiling for Windows
-	if "${found_cc}" -dM -E - < /dev/null 2> /dev/null | grep -q _WIN32; then
+	if "${found_cc}" -dM -E - < /dev/null 2> /dev/null | grep -qE "#define\s+_WIN32"; then
 		is_win=yes
 	fi
 	is_msvc=no

From 5097c599b58aecb7f990cc7bd7a5dad688a48df8 Mon Sep 17 00:00:00 2001
From: Devin Matthews <damatthews@smu.edu>
Date: Thu, 1 May 2025 10:11:59 -0500
Subject: [PATCH 230/230] Update CREDITS

[ci skip]
---
 CREDITS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CREDITS b/CREDITS
index 27877fff8..218d6fec7 100644
--- a/CREDITS
+++ b/CREDITS
@@ -124,6 +124,7 @@ but many others have contributed code, ideas, and feedback, including
   Vladimir Sukarev
   Harihara Sudhan S        @ihariharasudhan           (AMD)
   Chengguo Sun             @chengguosun
+  Atsushi Tatsuma          @yoshoku
   Christopher Taylor       @ct-clmsn                  (Tactical Computing Labs)
   Santanu Thangaraj                                   (AMD)
   Nicholai Tukanov         @nicholaiTukanov           (The University of Texas at Austin)